diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/cache.json b/cache.json new file mode 100644 index 0000000..b5f23de --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-03-25T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.08651v2","updated":"2024-03-25T14:41:07Z","published":"2024-03-13T16:06:07Z","title":"HAIFIT: Human-Centered AI for Fashion Image Translation","summary":" In the realm of fashion design, sketches serve as the canvas for expressing\nan artist's distinctive drawing style and creative vision, capturing intricate\ndetails like stroke variations and texture nuances. The advent of\nsketch-to-image cross-modal translation technology has notably aided designers.\nHowever, existing methods often compromise these sketch details during image\ngeneration, resulting in images that deviate from the designer's intended\nconcept. This limitation hampers the ability to offer designers a precise\npreview of the final output. To overcome this challenge, we introduce HAIFIT, a\nnovel approach that transforms sketches into high-fidelity, lifelike clothing\nimages by integrating multi-scale features and capturing extensive feature map\ndependencies from diverse perspectives. Through extensive qualitative and\nquantitative evaluations conducted on our self-collected dataset, our method\ndemonstrates superior performance compared to existing methods in generating\nphotorealistic clothing images. Our method excels in preserving the distinctive\nstyle and intricate details essential for fashion design applications.\n","authors":["Jianan Jiang","Xinglin Li","Weiren Yu","Di Wu"],"pdf_url":"https://arxiv.org/pdf/2403.08651v2.pdf","comment":"8 pages,8 figures"},{"id":"http://arxiv.org/abs/2402.19463v2","updated":"2024-03-25T14:27:03Z","published":"2024-02-29T18:54:53Z","title":"SeMoLi: What Moves Together Belongs Together","summary":" We tackle semi-supervised object detection based on motion cues. Recent\nresults suggest that heuristic-based clustering methods in conjunction with\nobject trackers can be used to pseudo-label instances of moving objects and use\nthese as supervisory signals to train 3D object detectors in Lidar data without\nmanual supervision. We re-think this approach and suggest that both, object\ndetection, as well as motion-inspired pseudo-labeling, can be tackled in a\ndata-driven manner. We leverage recent advances in scene flow estimation to\nobtain point trajectories from which we extract long-term, class-agnostic\nmotion patterns. Revisiting correlation clustering in the context of message\npassing networks, we learn to group those motion patterns to cluster points to\nobject instances. By estimating the full extent of the objects, we obtain\nper-scan 3D bounding boxes that we use to supervise a Lidar object detection\nnetwork. 
Our method not only outperforms prior heuristic-based approaches (57.5\nAP, +14 improvement over prior work), more importantly, we show we can\npseudo-label and train object detectors across datasets.\n","authors":["Jenny Seidenschwarz","Aljoša Ošep","Francesco Ferroni","Simon Lucey","Laura Leal-Taixé"],"pdf_url":"https://arxiv.org/pdf/2402.19463v2.pdf","comment":"Accepted to CVPR 2024!"},{"id":"http://arxiv.org/abs/2403.16803v1","updated":"2024-03-25T14:21:49Z","published":"2024-03-25T14:21:49Z","title":"Exploiting Priors from 3D Diffusion Models for RGB-Based One-Shot View\n Planning","summary":" Object reconstruction is relevant for many autonomous robotic tasks that\nrequire interaction with the environment. A key challenge in such scenarios is\nplanning view configurations to collect informative measurements for\nreconstructing an initially unknown object. One-shot view planning enables\nefficient data collection by predicting view configurations and planning the\nglobally shortest path connecting all views at once. However, geometric priors\nabout the object are required to conduct one-shot view planning. In this work,\nwe propose a novel one-shot view planning approach that utilizes the powerful\n3D generation capabilities of diffusion models as priors. By incorporating such\ngeometric priors into our pipeline, we achieve effective one-shot view planning\nstarting with only a single RGB image of the object to be reconstructed. Our\nplanning experiments in simulation and real-world setups indicate that our\napproach balances well between object reconstruction quality and movement cost.\n","authors":["Sicong Pan","Liren Jin","Xuying Huang","Cyrill Stachniss","Marija Popović","Maren Bennewitz"],"pdf_url":"https://arxiv.org/pdf/2403.16803v1.pdf","comment":"Sicong Pan and Liren Jin have equal contribution. Submitted to IROS\n 2024"},{"id":"http://arxiv.org/abs/2403.16794v1","updated":"2024-03-25T14:13:09Z","published":"2024-03-25T14:13:09Z","title":"CurbNet: Curb Detection Framework Based on LiDAR Point Cloud\n Segmentation","summary":" Curb detection is an important function in intelligent driving and can be\nused to determine drivable areas of the road. However, curbs are difficult to\ndetect due to the complex road environment. This paper introduces CurbNet, a\nnovel framework for curb detection, leveraging point cloud segmentation.\nAddressing the dearth of comprehensive curb datasets and the absence of 3D\nannotations, we have developed the 3D-Curb dataset, encompassing 7,100 frames,\nwhich represents the largest and most categorically diverse collection of curb\npoint clouds currently available. Recognizing that curbs are primarily\ncharacterized by height variations, our approach harnesses spatially-rich 3D\npoint clouds for training. To tackle the challenges presented by the uneven\ndistribution of curb features on the xy-plane and their reliance on z-axis\nhigh-frequency features, we introduce the multi-scale and channel attention\n(MSCA) module, a bespoke solution designed to optimize detection performance.\nMoreover, we propose an adaptive weighted loss function group, specifically\nformulated to counteract the imbalance in the distribution of curb point clouds\nrelative to other categories. Our extensive experimentation on 2 major datasets\nhas yielded results that surpass existing benchmarks set by leading curb\ndetection and point cloud segmentation models. 
By integrating multi-clustering\nand curve fitting techniques in our post-processing stage, we have\nsubstantially reduced noise in curb detection, thereby enhancing precision to\n0.8744. Notably, CurbNet has achieved an exceptional average metrics of over\n0.95 at a tolerance of just 0.15m, thereby establishing a new benchmark.\nFurthermore, corroborative real-world experiments and dataset analyzes mutually\nvalidate each other, solidifying CurbNet's superior detection proficiency and\nits robust generalizability.\n","authors":["Guoyang Zhao","Fulong Ma","Yuxuan Liu","Weiqing Qi","Ming Liu"],"pdf_url":"https://arxiv.org/pdf/2403.16794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16788v1","updated":"2024-03-25T14:02:33Z","published":"2024-03-25T14:02:33Z","title":"HPL-ESS: Hybrid Pseudo-Labeling for Unsupervised Event-based Semantic\n Segmentation","summary":" Event-based semantic segmentation has gained popularity due to its capability\nto deal with scenarios under high-speed motion and extreme lighting conditions,\nwhich cannot be addressed by conventional RGB cameras. Since it is hard to\nannotate event data, previous approaches rely on event-to-image reconstruction\nto obtain pseudo labels for training. However, this will inevitably introduce\nnoise, and learning from noisy pseudo labels, especially when generated from a\nsingle source, may reinforce the errors. This drawback is also called\nconfirmation bias in pseudo-labeling. In this paper, we propose a novel hybrid\npseudo-labeling framework for unsupervised event-based semantic segmentation,\nHPL-ESS, to alleviate the influence of noisy pseudo labels. In particular, we\nfirst employ a plain unsupervised domain adaptation framework as our baseline,\nwhich can generate a set of pseudo labels through self-training. Then, we\nincorporate offline event-to-image reconstruction into the framework, and\nobtain another set of pseudo labels by predicting segmentation maps on the\nreconstructed images. A noisy label learning strategy is designed to mix the\ntwo sets of pseudo labels and enhance the quality. Moreover, we propose a soft\nprototypical alignment module to further improve the consistency of target\ndomain features. Extensive experiments show that our proposed method\noutperforms existing state-of-the-art methods by a large margin on the\nDSEC-Semantic dataset (+5.88% accuracy, +10.32% mIoU), which even surpasses\nseveral supervised methods.\n","authors":["Linglin Jing","Yiming Ding","Yunpeng Gao","Zhigang Wang","Xu Yan","Dong Wang","Gerald Schaefer","Hui Fang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2403.16788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16782v1","updated":"2024-03-25T13:57:45Z","published":"2024-03-25T13:57:45Z","title":"The Anatomy of Adversarial Attacks: Concept-based XAI Dissection","summary":" Adversarial attacks (AAs) pose a significant threat to the reliability and\nrobustness of deep neural networks. While the impact of these attacks on model\npredictions has been extensively studied, their effect on the learned\nrepresentations and concepts within these models remains largely unexplored. In\nthis work, we perform an in-depth analysis of the influence of AAs on the\nconcepts learned by convolutional neural networks (CNNs) using eXplainable\nartificial intelligence (XAI) techniques. Through an extensive set of\nexperiments across various network architectures and targeted AA techniques, we\nunveil several key findings. 
First, AAs induce substantial alterations in the\nconcept composition within the feature space, introducing new concepts or\nmodifying existing ones. Second, the adversarial perturbation itself can be\nlinearly decomposed into a set of latent vector components, with a subset of\nthese being responsible for the attack's success. Notably, we discover that\nthese components are target-specific, i.e., are similar for a given target\nclass throughout different AA techniques and starting classes. Our findings\nprovide valuable insights into the nature of AAs and their impact on learned\nrepresentations, paving the way for the development of more robust and\ninterpretable deep learning models, as well as effective defenses against\nadversarial threats.\n","authors":["Georgii Mikriukov","Gesina Schwalbe","Franz Motzkus","Korinna Bade"],"pdf_url":"https://arxiv.org/pdf/2403.16782v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16776v1","updated":"2024-03-25T13:52:48Z","published":"2024-03-25T13:52:48Z","title":"Diff-Def: Diffusion-Generated Deformation Fields for Conditional Atlases","summary":" Anatomical atlases are widely used for population analysis. Conditional\natlases target a particular sub-population defined via certain conditions (e.g.\ndemographics or pathologies) and allow for the investigation of fine-grained\nanatomical differences - such as morphological changes correlated with age.\nExisting approaches use either registration-based methods that are unable to\nhandle large anatomical variations or generative models, which can suffer from\ntraining instabilities and hallucinations. To overcome these limitations, we\nuse latent diffusion models to generate deformation fields, which transform a\ngeneral population atlas into one representing a specific sub-population. By\ngenerating a deformation field and registering the conditional atlas to a\nneighbourhood of images, we ensure structural plausibility and avoid\nhallucinations, which can occur during direct image synthesis. We compare our\nmethod to several state-of-the-art atlas generation methods in experiments\nusing 5000 brain as well as whole-body MR images from UK Biobank. Our method\ngenerates highly realistic atlases with smooth transformations and high\nanatomical fidelity, outperforming the baselines.\n","authors":["Sophie Starck","Vasiliki Sideri-Lampretsa","Bernhard Kainz","Martin Menten","Tamara Mueller","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2403.16776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14897v2","updated":"2024-03-25T13:46:03Z","published":"2024-03-22T01:02:09Z","title":"Geometric Generative Models based on Morphological Equivariant PDEs and\n GANs","summary":" Content and image generation consist in creating or generating data from\nnoisy information by extracting specific features such as texture, edges, and\nother thin image structures. We are interested here in generative models, and\ntwo main problems are addressed. Firstly, the improvements of specific feature\nextraction while accounting at multiscale levels intrinsic geometric features;\nand secondly, the equivariance of the network to reduce its complexity and\nprovide a geometric interpretability. To proceed, we propose a geometric\ngenerative model based on an equivariant partial differential equation (PDE)\nfor group convolution neural networks (G-CNNs), so called PDE-G-CNNs, built on\nmorphology operators and generative adversarial networks (GANs). 
Equivariant\nmorphological PDE layers are composed of multiscale dilations and erosions\nformulated in Riemannian manifolds, while group symmetries are defined on a Lie\ngroup. We take advantage of the Lie group structure to properly integrate the\nequivariance in layers, and are able to use the Riemannian metric to solve the\nmultiscale morphological operations. Each point of the Lie group is associated\nwith a unique point in the manifold, which helps us derive a metric on the\nRiemannian manifold from a tensor field invariant under the Lie group so that\nthe induced metric has the same symmetries. The proposed geometric\nmorphological GAN (GM-GAN) is obtained by using the proposed morphological\nequivariant convolutions in PDE-G-CNNs to bring nonlinearity in classical CNNs.\nGM-GAN is evaluated on MNIST data and compared with GANs. Preliminary results\nshow that GM-GAN model outperforms classical GAN.\n","authors":["El Hadji S. Diop","Thierno Fall","Alioune Mbengue","Mohamed Daoudi"],"pdf_url":"https://arxiv.org/pdf/2403.14897v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16700v2","updated":"2024-03-25T13:33:51Z","published":"2024-01-30T03:00:25Z","title":"Towards Precise 3D Human Pose Estimation with Multi-Perspective\n Spatial-Temporal Relational Transformers","summary":" 3D human pose estimation captures the human joint points in three-dimensional\nspace while keeping the depth information and physical structure. That is\nessential for applications that require precise pose information, such as\nhuman-computer interaction, scene understanding, and rehabilitation training.\nDue to the challenges in data collection, mainstream datasets of 3D human pose\nestimation are primarily composed of multi-view video data collected in\nlaboratory environments, which contains rich spatial-temporal correlation\ninformation besides the image frame content. Given the remarkable\nself-attention mechanism of transformers, capable of capturing the\nspatial-temporal correlation from multi-view video datasets, we propose a\nmulti-stage framework for 3D sequence-to-sequence (seq2seq) human pose\ndetection. Firstly, the spatial module represents the human pose feature by\nintra-image content, while the frame-image relation module extracts temporal\nrelationships and 3D spatial positional relationship features between the\nmulti-perspective images. Secondly, the self-attention mechanism is adopted to\neliminate the interference from non-human body parts and reduce computing\nresources. Our method is evaluated on Human3.6M, a popular 3D human pose\ndetection dataset. Experimental results demonstrate that our approach achieves\nstate-of-the-art performance on this dataset. The source code will be available\nat https://github.com/WUJINHUAN/3D-human-pose.\n","authors":["Jianbin Jiao","Xina Cheng","Weijie Chen","Xiaoting Yin","Hao Shi","Kailun Yang"],"pdf_url":"https://arxiv.org/pdf/2401.16700v2.pdf","comment":"Accepted to IJCNN 2024. The source code will be available at\n https://github.com/WUJINHUAN/3D-human-pose"},{"id":"http://arxiv.org/abs/2402.04599v2","updated":"2024-03-25T13:30:37Z","published":"2024-02-07T05:47:31Z","title":"Meet JEANIE: a Similarity Measure for 3D Skeleton Sequences via\n Temporal-Viewpoint Alignment","summary":" Video sequences exhibit significant nuisance variations (undesired effects)\nof speed of actions, temporal locations, and subjects' poses, leading to\ntemporal-viewpoint misalignment when comparing two sets of frames or evaluating\nthe similarity of two sequences. 
Thus, we propose Joint tEmporal and cAmera\nviewpoiNt alIgnmEnt (JEANIE) for sequence pairs. In particular, we focus on 3D\nskeleton sequences whose camera and subjects' poses can be easily manipulated\nin 3D. We evaluate JEANIE on skeletal Few-shot Action Recognition (FSAR), where\nmatching well temporal blocks (temporal chunks that make up a sequence) of\nsupport-query sequence pairs (by factoring out nuisance variations) is\nessential due to limited samples of novel classes. Given a query sequence, we\ncreate its several views by simulating several camera locations. For a support\nsequence, we match it with view-simulated query sequences, as in the popular\nDynamic Time Warping (DTW). Specifically, each support temporal block can be\nmatched to the query temporal block with the same or adjacent (next) temporal\nindex, and adjacent camera views to achieve joint local temporal-viewpoint\nwarping. JEANIE selects the smallest distance among matching paths with\ndifferent temporal-viewpoint warping patterns, an advantage over DTW which only\nperforms temporal alignment. We also propose an unsupervised FSAR akin to\nclustering of sequences with JEANIE as a distance measure. JEANIE achieves\nstate-of-the-art results on NTU-60, NTU-120, Kinetics-skeleton and UWA3D\nMultiview Activity II on supervised and unsupervised FSAR, and their\nmeta-learning inspired fusion.\n","authors":["Lei Wang","Jun Liu","Liang Zheng","Tom Gedeon","Piotr Koniusz"],"pdf_url":"https://arxiv.org/pdf/2402.04599v2.pdf","comment":"Accepted by the International Journal of Computer Vision (IJCV). An\n extension of our ACCV'22 paper [arXiv:arXiv:2210.16820] which was\n distinguished by the Sang Uk Lee Best Student Paper Award"},{"id":"http://arxiv.org/abs/2403.06764v2","updated":"2024-03-25T13:29:30Z","published":"2024-03-11T14:35:32Z","title":"An Image is Worth 1/2 Tokens After Layer 2: Plug-and-Play Inference\n Acceleration for Large Vision-Language Models","summary":" In this study, we identify the inefficient attention phenomena in Large\nVision-Language Models (LVLMs), notably within prominent models like LLaVA-1.5,\nQwenVL-Chat and Video-LLaVA. We find out that the attention computation over\nvisual tokens is of extreme inefficiency in the deep layers of popular LVLMs,\nsuggesting a need for a sparser approach compared to textual data handling. To\nthis end, we introduce FastV, a versatile plug-and-play method designed to\noptimize computational efficiency by learning adaptive attention patterns in\nearly layers and pruning visual tokens in subsequent ones. Our evaluations\ndemonstrate FastV's ability to dramatically reduce computational costs (e.g., a\n45 reduction in FLOPs for LLaVA-1.5-13B) without sacrificing performance in a\nwide range of image and video understanding tasks. The computational efficiency\nand performance trade-off of FastV are highly customizable and\npareto-efficient. It can compress the FLOPs of a 13B-parameter model to achieve\na lower budget than that of a 7B-parameter model, while still maintaining\nsuperior performance. We believe FastV has practical values for deployment of\nLVLMs in edge devices and commercial models. 
Code is released at\nhttps://github.com/pkunlp-icler/FastV.\n","authors":["Liang Chen","Haozhe Zhao","Tianyu Liu","Shuai Bai","Junyang Lin","Chang Zhou","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2403.06764v2.pdf","comment":"21 papes, 8 figures, code is released at\n https://github.com/pkunlp-icler/FastV"},{"id":"http://arxiv.org/abs/2402.15648v2","updated":"2024-03-25T13:27:26Z","published":"2024-02-23T23:15:54Z","title":"MambaIR: A Simple Baseline for Image Restoration with State-Space Model","summary":" Recent years have seen significant advancements in image restoration, largely\nattributed to the development of modern deep neural networks, such as CNNs and\nTransformers. However, existing restoration backbones often face the dilemma\nbetween global receptive fields and efficient computation, hindering their\napplication in practice. Recently, the Selective Structured State Space Model,\nespecially the improved version Mamba, has shown great potential for long-range\ndependency modeling with linear complexity, which offers a way to resolve the\nabove dilemma. However, the standard Mamba still faces certain challenges in\nlow-level vision such as local pixel forgetting and channel redundancy. In this\nwork, we introduce a simple but effective baseline, named MambaIR, which\nintroduces both local enhancement and channel attention to improve the vanilla\nMamba. In this way, our MambaIR takes advantage of the local pixel similarity\nand reduces the channel redundancy. Extensive experiments demonstrate the\nsuperiority of our method, for example, MambaIR outperforms SwinIR by up to\n0.45dB on image SR, using similar computational cost but with a global\nreceptive field. Code is available at \\url{https://github.com/csguoh/MambaIR}.\n","authors":["Hang Guo","Jinmin Li","Tao Dai","Zhihao Ouyang","Xudong Ren","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2402.15648v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2312.11897v2","updated":"2024-03-25T13:15:22Z","published":"2023-12-19T06:42:47Z","title":"Text-Conditioned Resampler For Long Form Video Understanding","summary":" In this paper we present a text-conditioned video resampler (TCR) module that\nuses a pre-trained and frozen visual encoder and large language model (LLM) to\nprocess long video sequences for a task. TCR localises relevant visual features\nfrom the video given a text condition and provides them to a LLM to generate a\ntext response. Due to its lightweight design and use of cross-attention, TCR\ncan process more than 100 frames at a time with plain attention and without\noptimised implementations. 
We make the following contributions: (i) we design a\ntransformer-based sampling architecture that can process long videos\nconditioned on a task, together with a training method that enables it to\nbridge pre-trained visual and language models; (ii) we identify tasks that\ncould benefit from longer video perception; and (iii) we empirically validate\nits efficacy on a wide variety of evaluation tasks including NextQA, EgoSchema,\nand the EGO4D-LTA challenge.\n","authors":["Bruno Korbar","Yongqin Xian","Alessio Tonioni","Andrew Zisserman","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2312.11897v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16736v1","updated":"2024-03-25T13:09:40Z","published":"2024-03-25T13:09:40Z","title":"Creating a Digital Twin of Spinal Surgery: A Proof of Concept","summary":" Surgery digitalization is the process of creating a virtual replica of\nreal-world surgery, also referred to as a surgical digital twin (SDT). It has\nsignificant applications in various fields such as education and training,\nsurgical planning, and automation of surgical tasks. Given their detailed\nrepresentations of surgical procedures, SDTs are an ideal foundation for\nmachine learning methods, enabling automatic generation of training data. In\nrobotic surgery, SDTs can provide realistic virtual environments in which\nrobots may learn through trial and error. In this paper, we present a proof of\nconcept (PoC) for surgery digitalization that is applied to an ex-vivo spinal\nsurgery performed in realistic conditions. The proposed digitalization focuses\non the acquisition and modelling of the geometry and appearance of the entire\nsurgical scene. We employ five RGB-D cameras for dynamic 3D reconstruction of\nthe surgeon, a high-end camera for 3D reconstruction of the anatomy, an\ninfrared stereo camera for surgical instrument tracking, and a laser scanner\nfor 3D reconstruction of the operating room and data fusion. We justify the\nproposed methodology, discuss the challenges faced and further extensions of\nour prototype. While our PoC partially relies on manual data curation, its high\nquality and great potential motivate the development of automated methods for\nthe creation of SDTs. The quality of our SDT can be assessed in a rendered\nvideo available at https://youtu.be/LqVaWGgaTMY .\n","authors":["Jonas Hein","Frederic Giraud","Lilian Calvet","Alexander Schwarz","Nicola Alessandro Cavalcanti","Sergey Prokudin","Mazda Farshad","Siyu Tang","Marc Pollefeys","Fabio Carrillo","Philipp Fürnstahl"],"pdf_url":"https://arxiv.org/pdf/2403.16736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00374v3","updated":"2024-03-25T13:01:27Z","published":"2023-12-31T02:25:41Z","title":"EMAGE: Towards Unified Holistic Co-Speech Gesture Generation via\n Expressive Masked Audio Gesture Modeling","summary":" We propose EMAGE, a framework to generate full-body human gestures from audio\nand masked gestures, encompassing facial, local body, hands, and global\nmovements. To achieve this, we first introduce BEAT2 (BEAT-SMPLX-FLAME), a new\nmesh-level holistic co-speech dataset. BEAT2 combines MoShed SMPLX body with\nFLAME head parameters and further refines the modeling of head, neck, and\nfinger movements, offering a community-standardized, high-quality 3D motion\ncaptured dataset. EMAGE leverages masked body gesture priors during training to\nboost inference performance. 
It involves a Masked Audio Gesture Transformer,\nfacilitating joint training on audio-to-gesture generation and masked gesture\nreconstruction to effectively encode audio and body gesture hints. Encoded body\nhints from masked gestures are then separately employed to generate facial and\nbody movements. Moreover, EMAGE adaptively merges speech features from the\naudio's rhythm and content and utilizes four compositional VQ-VAEs to enhance\nthe results' fidelity and diversity. Experiments demonstrate that EMAGE\ngenerates holistic gestures with state-of-the-art performance and is flexible\nin accepting predefined spatial-temporal gesture inputs, generating complete,\naudio-synchronized results. Our code and dataset are available at\nhttps://pantomatrix.github.io/EMAGE/\n","authors":["Haiyang Liu","Zihao Zhu","Giorgio Becherini","Yichen Peng","Mingyang Su","You Zhou","Xuefei Zhe","Naoya Iwamoto","Bo Zheng","Michael J. Black"],"pdf_url":"https://arxiv.org/pdf/2401.00374v3.pdf","comment":"CVPR Camera Ready; Project Page: https://pantomatrix.github.io/EMAGE/"},{"id":"http://arxiv.org/abs/2402.07310v2","updated":"2024-03-25T12:58:45Z","published":"2024-02-11T21:16:42Z","title":"BioNeRF: Biologically Plausible Neural Radiance Fields for View\n Synthesis","summary":" This paper presents BioNeRF, a biologically plausible architecture that\nmodels scenes in a 3D representation and synthesizes new views through radiance\nfields. Since NeRF relies on the network weights to store the scene's\n3-dimensional representation, BioNeRF implements a cognitive-inspired mechanism\nthat fuses inputs from multiple sources into a memory-like structure, improving\nthe storing capacity and extracting more intrinsic and correlated information.\nBioNeRF also mimics a behavior observed in pyramidal cells concerning\ncontextual information, in which the memory is provided as the context and\ncombined with the inputs of two subsequent neural models, one responsible for\nproducing the volumetric densities and the other the colors used to render the\nscene. Experimental results show that BioNeRF outperforms state-of-the-art\nresults concerning a quality measure that encodes human perception in two\ndatasets: real-world images and synthetic data.\n","authors":["Leandro A. Passos","Douglas Rodrigues","Danilo Jodas","Kelton A. P. Costa","Ahsan Adeel","João Paulo Papa"],"pdf_url":"https://arxiv.org/pdf/2402.07310v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02969v2","updated":"2024-03-25T12:45:03Z","published":"2024-03-05T13:45:46Z","title":"Multi-modal Instruction Tuned LLMs with Fine-grained Visual Perception","summary":" Multimodal Large Language Model (MLLMs) leverages Large Language Models as a\ncognitive framework for diverse visual-language tasks. Recent efforts have been\nmade to equip MLLMs with visual perceiving and grounding capabilities. However,\nthere still remains a gap in providing fine-grained pixel-level perceptions and\nextending interactions beyond text-specific inputs. In this work, we propose\n{\\bf{AnyRef}}, a general MLLM model that can generate pixel-wise object\nperceptions and natural language descriptions from multi-modality references,\nsuch as texts, boxes, images, or audio. This innovation empowers users with\ngreater flexibility to engage with the model beyond textual and regional\nprompts, without modality-specific designs. 
Through our proposed refocusing\nmechanism, the generated grounding output is guided to better focus on the\nreferenced object, implicitly incorporating additional pixel-level supervision.\nThis simple modification utilizes attention scores generated during the\ninference of LLM, eliminating the need for extra computations while exhibiting\nperformance enhancements in both grounding masks and referring expressions.\nWith only publicly available training data, our model achieves state-of-the-art\nresults across multiple benchmarks, including diverse modality referring\nsegmentation and region-level referring expression generation.\n","authors":["Junwen He","Yifan Wang","Lijun Wang","Huchuan Lu","Jun-Yan He","Jin-Peng Lan","Bin Luo","Xuansong Xie"],"pdf_url":"https://arxiv.org/pdf/2403.02969v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16697v1","updated":"2024-03-25T12:31:01Z","published":"2024-03-25T12:31:01Z","title":"DPStyler: Dynamic PromptStyler for Source-Free Domain Generalization","summary":" Source-Free Domain Generalization (SFDG) aims to develop a model that works\nfor unseen target domains without relying on any source domain. Recent work,\nPromptStyler, employs text prompts to simulate different distribution shifts in\nthe joint vision-language space, allowing the model to generalize effectively\nto unseen domains without using any images. However, 1) PromptStyler's style\ngeneration strategy has limitations, as all style patterns are fixed after the\nfirst training phase. This leads to the training set in the second training\nphase being restricted to a limited set of styles. Additionally, 2) the frozen\ntext encoder in PromptStyler result in the encoder's output varying with the\nstyle of the input text prompts, making it difficult for the model to learn\ndomain-invariant features. In this paper, we introduce Dynamic PromptStyler\n(DPStyler), comprising Style Generation and Style Removal modules to address\nthese issues. The Style Generation module refreshes all styles at every\ntraining epoch, while the Style Removal module eliminates variations in the\nencoder's output features caused by input styles. Moreover, since the Style\nGeneration module, responsible for generating style word vectors using random\nsampling or style mixing, makes the model sensitive to input text prompts, we\nintroduce a model ensemble method to mitigate this sensitivity. Extensive\nexperiments demonstrate that our framework outperforms state-of-the-art methods\non benchmark datasets.\n","authors":["Yunlong Tang","Yuxuan Wan","Lei Qi","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2403.16697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16695v1","updated":"2024-03-25T12:26:32Z","published":"2024-03-25T12:26:32Z","title":"Assessing the Performance of Deep Learning for Automated Gleason Grading\n in Prostate Cancer","summary":" Prostate cancer is a dominant health concern calling for advanced diagnostic\ntools. Utilizing digital pathology and artificial intelligence, this study\nexplores the potential of 11 deep neural network architectures for automated\nGleason grading in prostate carcinoma focusing on comparing traditional and\nrecent architectures. A standardized image classification pipeline, based on\nthe AUCMEDI framework, facilitated robust evaluation using an in-house dataset\nconsisting of 34,264 annotated tissue tiles. The results indicated varying\nsensitivity across architectures, with ConvNeXt demonstrating the strongest\nperformance. 
Notably, newer architectures achieved superior performance, even\nthough with challenges in differentiating closely related Gleason grades. The\nConvNeXt model was capable of learning a balance between complexity and\ngeneralizability. Overall, this study lays the groundwork for enhanced Gleason\ngrading systems, potentially improving diagnostic efficiency for prostate\ncancer.\n","authors":["Dominik Müller","Philip Meyer","Lukas Rentschler","Robin Manz","Daniel Hieber","Jonas Bäcker","Samantha Cramer","Christoph Wengenmayr","Bruno Märkl","Ralf Huss","Frank Kramer","Iñaki Soto-Rey","Johannes Raffler"],"pdf_url":"https://arxiv.org/pdf/2403.16695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16689v1","updated":"2024-03-25T12:23:39Z","published":"2024-03-25T12:23:39Z","title":"Synapse: Learning Preferential Concepts from Visual Demonstrations","summary":" This paper addresses the problem of preference learning, which aims to learn\nuser-specific preferences (e.g., \"good parking spot\", \"convenient drop-off\nlocation\") from visual input. Despite its similarity to learning factual\nconcepts (e.g., \"red cube\"), preference learning is a fundamentally harder\nproblem due to its subjective nature and the paucity of person-specific\ntraining data. We address this problem using a new framework called Synapse,\nwhich is a neuro-symbolic approach designed to efficiently learn preferential\nconcepts from limited demonstrations. Synapse represents preferences as\nneuro-symbolic programs in a domain-specific language (DSL) that operates over\nimages, and leverages a novel combination of visual parsing, large language\nmodels, and program synthesis to learn programs representing individual\npreferences. We evaluate Synapse through extensive experimentation including a\nuser case study focusing on mobility-related concepts in mobile robotics and\nautonomous driving. Our evaluation demonstrates that Synapse significantly\noutperforms existing baselines as well as its own ablations. The code and other\ndetails can be found on the project website https://amrl.cs.utexas.edu/synapse .\n","authors":["Sadanand Modak","Noah Patton","Isil Dillig","Joydeep Biswas"],"pdf_url":"https://arxiv.org/pdf/2403.16689v1.pdf","comment":"23 pages, 7 figures; Preprint"},{"id":"http://arxiv.org/abs/2403.16678v1","updated":"2024-03-25T12:15:42Z","published":"2024-03-25T12:15:42Z","title":"DeepGleason: a System for Automated Gleason Grading of Prostate Cancer\n using Deep Neural Networks","summary":" Advances in digital pathology and artificial intelligence (AI) offer\npromising opportunities for clinical decision support and enhancing diagnostic\nworkflows. Previous studies already demonstrated AI's potential for automated\nGleason grading, but lack state-of-the-art methodology and model reusability.\nTo address this issue, we propose DeepGleason: an open-source deep neural\nnetwork based image classification system for automated Gleason grading using\nwhole-slide histopathology images from prostate tissue sections. Implemented\nwith the standardized AUCMEDI framework, our tool employs a tile-wise\nclassification approach utilizing fine-tuned image preprocessing techniques in\ncombination with a ConvNeXt architecture which was compared to various\nstate-of-the-art architectures. The neural network model was trained and\nvalidated on an in-house dataset of 34,264 annotated tiles from 369 prostate\ncarcinoma slides. 
We demonstrated that DeepGleason is capable of highly\naccurate and reliable Gleason grading with a macro-averaged F1-score of 0.806,\nAUC of 0.991, and Accuracy of 0.974. The internal architecture comparison\nrevealed that the ConvNeXt model was superior performance-wise on our dataset\nto established and other modern architectures like transformers. Furthermore,\nwe were able to outperform the current state-of-the-art in tile-wise\nfine-classification with a sensitivity and specificity of 0.94 and 0.98 for\nbenign vs malignant detection as well as of 0.91 and 0.75 for Gleason 3 vs\nGleason 4 & 5 classification, respectively. Our tool contributes to the wider\nadoption of AI-based Gleason grading within the research community and paves\nthe way for broader clinical application of deep learning models in digital\npathology. DeepGleason is open-source and publicly available for research\napplication in the following Git repository:\nhttps://github.com/frankkramer-lab/DeepGleason.\n","authors":["Dominik Müller","Philip Meyer","Lukas Rentschler","Robin Manz","Jonas Bäcker","Samantha Cramer","Christoph Wengenmayr","Bruno Märkl","Ralf Huss","Iñaki Soto-Rey","Johannes Raffler"],"pdf_url":"https://arxiv.org/pdf/2403.16678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16677v1","updated":"2024-03-25T12:14:48Z","published":"2024-03-25T12:14:48Z","title":"FOOL: Addressing the Downlink Bottleneck in Satellite Computing with\n Neural Feature Compression","summary":" Nanosatellite constellations equipped with sensors capturing large geographic\nregions provide unprecedented opportunities for Earth observation. As\nconstellation sizes increase, network contention poses a downlink bottleneck.\nOrbital Edge Computing (OEC) leverages limited onboard compute resources to\nreduce transfer costs by processing the raw captures at the source. However,\ncurrent solutions have limited practicability due to reliance on crude\nfiltering methods or over-prioritizing particular downstream tasks.\n This work presents FOOL, an OEC-native and task-agnostic feature compression\nmethod that preserves prediction performance. FOOL partitions high-resolution\nsatellite imagery to maximize throughput. Further, it embeds context and\nleverages inter-tile dependencies to lower transfer costs with negligible\noverhead. While FOOL is a feature compressor, it can recover images with\ncompetitive scores on perceptual quality measures at lower bitrates. We\nextensively evaluate transfer cost reduction by including the peculiarity of\nintermittently available network connections in low earth orbit. 
Lastly, we\ntest the feasibility of our system for standardized nanosatellite form factors.\nWe demonstrate that FOOL permits downlinking over 100x the data volume without\nrelying on prior information on the downstream tasks.\n","authors":["Alireza Furutanpey","Qiyang Zhang","Philipp Raith","Tobias Pfandzelter","Shangguang Wang","Schahram Dustdar"],"pdf_url":"https://arxiv.org/pdf/2403.16677v1.pdf","comment":"18 pages, double column, 19 figures, 7 tables, Initial Submission to\n IEEE Transactions on Mobile Computing"},{"id":"http://arxiv.org/abs/2403.16669v1","updated":"2024-03-25T12:07:24Z","published":"2024-03-25T12:07:24Z","title":"Domain Adaptive Detection of MAVs: A Benchmark and Noise Suppression\n Network","summary":" Visual detection of Micro Air Vehicles (MAVs) has attracted increasing\nattention in recent years due to its important application in various tasks.\nThe existing methods for MAV detection assume that the training set and testing\nset have the same distribution. As a result, when deployed in new domains, the\ndetectors would have a significant performance degradation due to domain\ndiscrepancy. In this paper, we study the problem of cross-domain MAV detection.\nThe contributions of this paper are threefold. 1) We propose a\nMulti-MAV-Multi-Domain (M3D) dataset consisting of both simulation and\nrealistic images. Compared to other existing datasets, the proposed one is more\ncomprehensive in the sense that it covers rich scenes, diverse MAV types, and\nvarious viewing angles. A new benchmark for cross-domain MAV detection is\nproposed based on the proposed dataset. 2) We propose a Noise Suppression\nNetwork (NSN) based on the framework of pseudo-labeling and a large-to-small\ntraining procedure. To reduce the challenging pseudo-label noises, two novel\nmodules are designed in this network. The first is a prior-based curriculum\nlearning module for allocating adaptive thresholds for pseudo labels with\ndifferent difficulties. The second is a masked copy-paste augmentation module\nfor pasting truly-labeled MAVs on unlabeled target images and thus decreasing\npseudo-label noises. 3) Extensive experimental results verify the superior\nperformance of the proposed method compared to the state-of-the-art ones. In\nparticular, it achieves mAP of 46.9%(+5.8%), 50.5%(+3.7%), and 61.5%(+11.3%) on\nthe tasks of simulation-to-real adaptation, cross-scene adaptation, and\ncross-camera adaptation, respectively.\n","authors":["Yin Zhang","Jinhong Deng","Peidong Liu","Wen Li","Shiyu Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.16669v1.pdf","comment":"17 pages, 11 figures. Accepted by IEEE Transactions on Automation\n Science and Engineering"},{"id":"http://arxiv.org/abs/2308.10299v3","updated":"2024-03-25T12:04:41Z","published":"2023-08-20T15:38:40Z","title":"Boosting Adversarial Transferability by Block Shuffle and Rotation","summary":" Adversarial examples mislead deep neural networks with imperceptible\nperturbations and have brought significant threats to deep learning. An\nimportant aspect is their transferability, which refers to their ability to\ndeceive other models, thus enabling attacks in the black-box setting. Though\nvarious methods have been proposed to boost transferability, the performance\nstill falls short compared with white-box attacks. In this work, we observe\nthat existing input transformation based attacks, one of the mainstream\ntransfer-based attacks, result in different attention heatmaps on various\nmodels, which might limit the transferability. 
We also find that breaking the\nintrinsic relation of the image can disrupt the attention heatmap of the\noriginal image. Based on this finding, we propose a novel input transformation\nbased attack called block shuffle and rotation (BSR). Specifically, BSR splits\nthe input image into several blocks, then randomly shuffles and rotates these\nblocks to construct a set of new images for gradient calculation. Empirical\nevaluations on the ImageNet dataset demonstrate that BSR could achieve\nsignificantly better transferability than the existing input transformation\nbased methods under single-model and ensemble-model settings. Combining BSR\nwith the current input transformation method can further improve the\ntransferability, which significantly outperforms the state-of-the-art methods.\nCode is available at https://github.com/Trustworthy-AI-Group/BSR\n","authors":["Kunyu Wang","Xuanran He","Wenxuan Wang","Xiaosen Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10299v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2311.16515v2","updated":"2024-03-25T12:01:59Z","published":"2023-11-25T14:24:49Z","title":"Word4Per: Zero-shot Composed Person Retrieval","summary":" Searching for specific person has great social benefits and security value,\nand it often involves a combination of visual and textual information.\nConventional person retrieval methods, whether image-based or text-based,\nusually fall short in effectively harnessing both types of information, leading\nto the loss of accuracy. In this paper, a whole new task called Composed Person\nRetrieval (CPR) is proposed to jointly utilize both image and text information\nfor target person retrieval. However, the supervised CPR requires very costly\nmanual annotation dataset, while there are currently no available resources. To\nmitigate this issue, we firstly introduce the Zero-shot Composed Person\nRetrieval (ZS-CPR), which leverages existing domain-related data to resolve the\nCPR problem without expensive annotations. Secondly, to learn ZS-CPR model, we\npropose a two-stage learning framework, Word4Per, where a lightweight Textual\nInversion Network (TINet) and a text-based person retrieval model based on\nfine-tuned Contrastive Language-Image Pre-training (CLIP) network are learned\nwithout utilizing any CPR data. Thirdly, a finely annotated Image-Text Composed\nPerson Retrieval (ITCPR) dataset is built as the benchmark to assess the\nperformance of the proposed Word4Per framework. Extensive experiments under\nboth Rank-1 and mAP demonstrate the effectiveness of Word4Per for the ZS-CPR\ntask, surpassing the comparative methods by over 10\\%. The code and ITCPR\ndataset will be publicly available at\nhttps://github.com/Delong-liu-bupt/Word4Per.\n","authors":["Delong Liu","Haiwen Li","Zhicheng Zhao","Fei Su","Yuan Dong"],"pdf_url":"https://arxiv.org/pdf/2311.16515v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05305v2","updated":"2024-03-25T11:48:27Z","published":"2024-02-07T22:50:47Z","title":"Knowledge Distillation for Road Detection based on cross-model\n Semi-Supervised Learning","summary":" The advancement of knowledge distillation has played a crucial role in\nenabling the transfer of knowledge from larger teacher models to smaller and\nmore efficient student models, and is particularly beneficial for online and\nresource-constrained applications. The effectiveness of the student model\nheavily relies on the quality of the distilled knowledge received from the\nteacher. 
Given the accessibility of unlabelled remote sensing data,\nsemi-supervised learning has become a prevalent strategy for enhancing model\nperformance. However, relying solely on semi-supervised learning with smaller\nmodels may be insufficient due to their limited capacity for feature\nextraction. This limitation restricts their ability to exploit training data.\nTo address this issue, we propose an integrated approach that combines\nknowledge distillation and semi-supervised learning methods. This hybrid\napproach leverages the robust capabilities of large models to effectively\nutilise large unlabelled data whilst subsequently providing the small student\nmodel with rich and informative features for enhancement. The proposed\nsemi-supervised learning-based knowledge distillation (SSLKD) approach\ndemonstrates a notable improvement in the performance of the student model, in\nthe application of road segmentation, surpassing the effectiveness of\ntraditional semi-supervised learning methods.\n","authors":["Wanli Ma","Oktay Karakus","Paul L. Rosin"],"pdf_url":"https://arxiv.org/pdf/2402.05305v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02935v2","updated":"2024-03-25T11:45:58Z","published":"2023-08-05T18:32:49Z","title":"Unveiling the Blind Spots: A Critical Examination of Fairness in\n Autonomous Driving Systems","summary":" Autonomous driving systems have extended the spectrum of Web of Things for\nintelligent vehicles and have become an important component of the Web\necosystem. Similar to traditional Web-based applications, fairness is an\nessential aspect for ensuring the high quality of autonomous driving systems,\nparticularly in the context of pedestrian detectors within them. However, there\nis an absence in the literature of a comprehensive assessment of the fairness\nof current Deep Learning (DL)-based pedestrian detectors. To fill the gap, we\nevaluate eight widely-explored DL-based pedestrian detectors across demographic\ngroups on large-scale real-world datasets. To enable a thorough fairness\nevaluation, we provide extensive annotations for the datasets, resulting in\n8,311 images with 16,070 gender labels, 20,115 age labels, and 3,513 skin tone\nlabels. Our findings reveal significant fairness issues related to age. The\nundetected proportions for adults are 20.14% lower compared to children.\nFurthermore, we explore how various driving scenarios affect the fairness of\npedestrian detectors. We find that the bias may exacerbate for children and\nfemales towards low brightness and low contrast.\n","authors":["Xinyue Li","Zhenpeng Chen","Jie M. Zhang","Federica Sarro","Ying Zhang","Xuanzhe Liu"],"pdf_url":"https://arxiv.org/pdf/2308.02935v2.pdf","comment":"Update the models evaluated and the experimental results"},{"id":"http://arxiv.org/abs/2310.06744v2","updated":"2024-03-25T11:35:55Z","published":"2023-10-10T16:14:20Z","title":"HiFi-123: Towards High-fidelity One Image to 3D Content Generation","summary":" Recent advances in diffusion models have enabled 3D generation from a single\nimage. However, current methods often produce suboptimal results for novel\nviews, with blurred textures and deviations from the reference image, limiting\ntheir practical applications. In this paper, we introduce HiFi-123, a method\ndesigned for high-fidelity and multi-view consistent 3D generation. 
Our\ncontributions are twofold: First, we propose a Reference-Guided Novel View\nEnhancement (RGNV) technique that significantly improves the fidelity of\ndiffusion-based zero-shot novel view synthesis methods. Second, capitalizing on\nthe RGNV, we present a novel Reference-Guided State Distillation (RGSD) loss.\nWhen incorporated into the optimization-based image-to-3D pipeline, our method\nsignificantly improves 3D generation quality, achieving state-of-the-art\nperformance. Comprehensive evaluations demonstrate the effectiveness of our\napproach over existing methods, both qualitatively and quantitatively. Video\nresults are available on the project page.\n","authors":["Wangbo Yu","Li Yuan","Yan-Pei Cao","Xiangjun Gao","Xiaoyu Li","Wenbo Hu","Long Quan","Ying Shan","Yonghong Tian"],"pdf_url":"https://arxiv.org/pdf/2310.06744v2.pdf","comment":"Project Page: https://drexubery.github.io/HiFi-123/"},{"id":"http://arxiv.org/abs/2403.16646v1","updated":"2024-03-25T11:32:05Z","published":"2024-03-25T11:32:05Z","title":"Clustering Propagation for Universal Medical Image Segmentation","summary":" Prominent solutions for medical image segmentation are typically tailored for\nautomatic or interactive setups, posing challenges in facilitating progress\nachieved in one task to another.$_{\\!}$ This$_{\\!}$ also$_{\\!}$\nnecessitates$_{\\!}$ separate$_{\\!}$ models for each task, duplicating both\ntraining time and parameters.$_{\\!}$ To$_{\\!}$ address$_{\\!}$ above$_{\\!}$\nissues,$_{\\!}$ we$_{\\!}$ introduce$_{\\!}$ S2VNet,$_{\\!}$ a$_{\\!}$\nuniversal$_{\\!}$ framework$_{\\!}$ that$_{\\!}$ leverages$_{\\!}$\nSlice-to-Volume$_{\\!}$ propagation$_{\\!}$ to$_{\\!}$ unify automatic/interactive\nsegmentation within a single model and one training session. Inspired by\nclustering-based segmentation techniques, S2VNet makes full use of the\nslice-wise structure of volumetric data by initializing cluster centers from\nthe cluster$_{\\!}$ results$_{\\!}$ of$_{\\!}$ previous$_{\\!}$ slice.$_{\\!}$ This\nenables knowledge acquired from prior slices to assist in the segmentation of\nthe current slice, further efficiently bridging the communication between\nremote slices using mere 2D networks. Moreover, such a framework readily\naccommodates interactive segmentation with no architectural change, simply by\ninitializing centroids from user inputs. S2VNet distinguishes itself by swift\ninference speeds and reduced memory consumption compared to prevailing 3D\nsolutions. It can also handle multi-class interactions with each of them\nserving to initialize different centroids. Experiments on three benchmarks\ndemonstrate S2VNet surpasses task-specified solutions on both\nautomatic/interactive setups.\n","authors":["Yuhang Ding","Liulei Li","Wenguan Wang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2403.16646v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.16643v1","updated":"2024-03-25T11:29:19Z","published":"2024-03-25T11:29:19Z","title":"Self-Adaptive Reality-Guided Diffusion for Artifact-Free\n Super-Resolution","summary":" Artifact-free super-resolution (SR) aims to translate low-resolution images\ninto their high-resolution counterparts with a strict integrity of the original\ncontent, eliminating any distortions or synthetic details. While traditional\ndiffusion-based SR techniques have demonstrated remarkable abilities to enhance\nimage detail, they are prone to artifact introduction during iterative\nprocedures. 
Such artifacts, ranging from trivial noise to unauthentic textures,\ndeviate from the true structure of the source image, thus challenging the\nintegrity of the super-resolution process. In this work, we propose\nSelf-Adaptive Reality-Guided Diffusion (SARGD), a training-free method that\ndelves into the latent space to effectively identify and mitigate the\npropagation of artifacts. Our SARGD begins by using an artifact detector to\nidentify implausible pixels, creating a binary mask that highlights artifacts.\nFollowing this, the Reality Guidance Refinement (RGR) process refines artifacts\nby integrating this mask with realistic latent representations, improving\nalignment with the original image. Nonetheless, initial realistic-latent\nrepresentations from lower-quality images result in over-smoothing in the final\noutput. To address this, we introduce a Self-Adaptive Guidance (SAG) mechanism.\nIt dynamically computes a reality score, enhancing the sharpness of the\nrealistic latent. These alternating mechanisms collectively achieve\nartifact-free super-resolution. Extensive experiments demonstrate the\nsuperiority of our method, delivering detailed artifact-free high-resolution\nimages while reducing sampling steps by 2X. We release our code at\nhttps://github.com/ProAirVerse/Self-Adaptive-Guidance-Diffusion.git.\n","authors":["Qingping Zheng","Ling Zheng","Yuanfan Guo","Ying Li","Songcen Xu","Jiankang Deng","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.16643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16640v1","updated":"2024-03-25T11:28:52Z","published":"2024-03-25T11:28:52Z","title":"Multi-Scale Texture Loss for CT denoising with GANs","summary":" Generative Adversarial Networks (GANs) have proved as a powerful framework\nfor denoising applications in medical imaging. However, GAN-based denoising\nalgorithms still suffer from limitations in capturing complex relationships\nwithin the images. In this regard, the loss function plays a crucial role in\nguiding the image generation process, encompassing how much a synthetic image\ndiffers from a real image. To grasp highly complex and non-linear textural\nrelationships in the training process, this work presents a loss function that\nleverages the intrinsic multi-scale nature of the Gray-Level-Co-occurrence\nMatrix (GLCM). Although the recent advances in deep learning have demonstrated\nsuperior performance in classification and detection tasks, we hypothesize that\nits information content can be valuable when integrated into GANs' training. To\nthis end, we propose a differentiable implementation of the GLCM suited for\ngradient-based optimization. Our approach also introduces a self-attention\nlayer that dynamically aggregates the multi-scale texture information extracted\nfrom the images. We validate our approach by carrying out extensive experiments\nin the context of low-dose CT denoising, a challenging application that aims to\nenhance the quality of noisy CT scans. We utilize three publicly available\ndatasets, including one simulated and two real datasets. The results are\npromising as compared to other well-established loss functions, being also\nconsistent across three different GAN architectures. 
The code is available at:\nhttps://github.com/FrancescoDiFeola/DenoTextureLoss\n","authors":["Francesco Di Feola","Lorenzo Tronchin","Valerio Guarrasi","Paolo Soda"],"pdf_url":"https://arxiv.org/pdf/2403.16640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16638v1","updated":"2024-03-25T11:26:18Z","published":"2024-03-25T11:26:18Z","title":"AI-Generated Video Detection via Spatio-Temporal Anomaly Learning","summary":" The advancement of generation models has led to the emergence of highly\nrealistic artificial intelligence (AI)-generated videos. Malicious users can\neasily create non-existent videos to spread false information. This letter\nproposes an effective AI-generated video detection (AIGVDet) scheme by\ncapturing the forensic traces with a two-branch spatio-temporal convolutional\nneural network (CNN). Specifically, two ResNet sub-detectors are learned\nseparately for identifying the anomalies in spatical and optical flow domains,\nrespectively. Results of such sub-detectors are fused to further enhance the\ndiscrimination ability. A large-scale generated video dataset (GVD) is\nconstructed as a benchmark for model training and evaluation. Extensive\nexperimental results verify the high generalization and robustness of our\nAIGVDet scheme. Code and dataset will be available at\nhttps://github.com/multimediaFor/AIGVDet.\n","authors":["Jianfa Bai","Man Lin","Gang Cao"],"pdf_url":"https://arxiv.org/pdf/2403.16638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16476v4","updated":"2024-03-25T11:24:45Z","published":"2023-12-27T08:50:01Z","title":"SVGDreamer: Text Guided SVG Generation with Diffusion Model","summary":" Recently, text-guided scalable vector graphics (SVGs) synthesis has shown\npromise in domains such as iconography and sketch. However, existing\ntext-to-SVG generation methods lack editability and struggle with visual\nquality and result diversity. To address these limitations, we propose a novel\ntext-guided vector graphics synthesis method called SVGDreamer. SVGDreamer\nincorporates a semantic-driven image vectorization (SIVE) process that enables\nthe decomposition of synthesis into foreground objects and background, thereby\nenhancing editability. Specifically, the SIVE process introduce attention-based\nprimitive control and an attention-mask loss function for effective control and\nmanipulation of individual elements. Additionally, we propose a Vectorized\nParticle-based Score Distillation (VPSD) approach to tackle the challenges of\nshape over-smoothing, color over-saturation, limited diversity in results, and\nslow convergence in existing text-to-SVG generation methods. VPSD models SVGs\nas distributions of control points and colors to counteract over-smoothing and\nover-saturation. Furthermore, VPSD leverages a reward model to reweight vector\nparticles, which improves aesthetic appeal and accelerates convergence.\nExtensive experiments have been conducted to validate the effectiveness of\nSVGDreamer, demonstrating its superiority over baseline methods in terms of\neditability, visual quality, and diversity. The code and demo of SVGDreamer can\nbe found at https://ximinng.github.io/SVGDreamer-project/\n","authors":["Ximing Xing","Haitao Zhou","Chuang Wang","Jing Zhang","Dong Xu","Qian Yu"],"pdf_url":"https://arxiv.org/pdf/2312.16476v4.pdf","comment":"Accepted by CVPR 2024. 
project link:\n https://ximinng.github.io/SVGDreamer-project/"},{"id":"http://arxiv.org/abs/2403.16635v1","updated":"2024-03-25T11:24:02Z","published":"2024-03-25T11:24:02Z","title":"V2X-PC: Vehicle-to-everything Collaborative Perception via Point Cluster","summary":" The objective of the collaborative vehicle-to-everything perception task is\nto enhance the individual vehicle's perception capability through message\ncommunication among neighboring traffic agents. Previous methods focus on\nachieving optimal performance within bandwidth limitations and typically adopt\nBEV maps as the basic collaborative message units. However, we demonstrate that\ncollaboration with dense representations is plagued by object feature\ndestruction during message packing, inefficient message aggregation for\nlong-range collaboration, and implicit structure representation communication.\nTo tackle these issues, we introduce a brand new message unit, namely the point\ncluster, designed to represent the scene sparsely with a combination of\nlow-level structure information and high-level semantic information. The point\ncluster inherently preserves object information while packing messages, with\nweak relevance to the collaboration range, and supports explicit structure\nmodeling. Building upon this representation, we propose a novel framework\nV2X-PC for collaborative perception. This framework includes a Point Cluster\nPacking (PCP) module to keep object features and manage bandwidth through the\nmanipulation of cluster point numbers. As for effective message aggregation, we\npropose a Point Cluster Aggregation (PCA) module to match and merge point\nclusters associated with the same object. To further handle time latency and\npose errors encountered in real-world scenarios, we propose parameter-free\nsolutions that can adapt to different noise levels without finetuning.\nExperiments on two widely recognized collaborative perception benchmarks\nshowcase the superior performance of our method compared to the previous\nstate-of-the-art approaches relying on BEV maps.\n","authors":["Si Liu","Zihan Ding","Jiahui Fu","Hongyu Li","Siheng Chen","Shifeng Zhang","Xu Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.16635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16627v1","updated":"2024-03-25T11:16:23Z","published":"2024-03-25T11:16:23Z","title":"SDXS: Real-Time One-Step Latent Diffusion Models with Image Conditions","summary":" Recent advancements in diffusion models have positioned them at the forefront\nof image generation. Despite their superior performance, diffusion models are\nnot without drawbacks; they are characterized by complex architectures and\nsubstantial computational demands, resulting in significant latency due to\ntheir iterative sampling process. To mitigate these limitations, we introduce a\ndual approach involving model miniaturization and a reduction in sampling\nsteps, aimed at significantly decreasing model latency. Our methodology\nleverages knowledge distillation to streamline the U-Net and image decoder\narchitectures, and introduces an innovative one-step DM training technique that\nutilizes feature matching and score distillation. We present two models,\nSDXS-512 and SDXS-1024, achieving inference speeds of approximately 100 FPS\n(30x faster than SD v1.5) and 30 FPS (60x faster than SDXL) on a single GPU,\nrespectively. 
Moreover, our training approach offers promising applications in\nimage-conditioned control, facilitating efficient image-to-image translation.\n","authors":["Yuda Song","Zehao Sun","Xuanwu Yin"],"pdf_url":"https://arxiv.org/pdf/2403.16627v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17744v2","updated":"2024-03-25T11:04:17Z","published":"2023-11-29T15:49:31Z","title":"Variational Bayes image restoration with compressive autoencoders","summary":" Regularization of inverse problems is of paramount importance in\ncomputational imaging. The ability of neural networks to learn efficient image\nrepresentations has been recently exploited to design powerful data-driven\nregularizers. While state-of-the-art plug-and-play methods rely on an implicit\nregularization provided by neural denoisers, alternative Bayesian approaches\nconsider Maximum A Posteriori (MAP) estimation in the latent space of a\ngenerative model, thus with an explicit regularization. However,\nstate-of-the-art deep generative models require a huge amount of training data\ncompared to denoisers. Besides, their complexity hampers the optimization\ninvolved in latent MAP derivation. In this work, we first propose to use\ncompressive autoencoders instead. These networks, which can be seen as\nvariational autoencoders with a flexible latent prior, are smaller and easier\nto train than state-of-the-art generative models. As a second contribution, we\nintroduce the Variational Bayes Latent Estimation (VBLE) algorithm, which\nperforms latent estimation within the framework of variational inference.\nThanks to a simple yet efficient parameterization of the variational posterior,\nVBLE allows for fast and easy (approximate) posterior sampling. Experimental\nresults on image datasets BSD and FFHQ demonstrate that VBLE reaches similar\nperformance to state-of-the-art plug-and-play methods, while being able to\nquantify uncertainties faster than other existing posterior sampling\ntechniques.\n","authors":["Maud Biquard","Marie Chabert","Thomas Oberlin"],"pdf_url":"https://arxiv.org/pdf/2311.17744v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12198v2","updated":"2024-03-25T11:04:04Z","published":"2023-12-19T14:34:36Z","title":"Mask Grounding for Referring Image Segmentation","summary":" Referring Image Segmentation (RIS) is a challenging task that requires an\nalgorithm to segment objects referred to by free-form language expressions.\nDespite significant progress in recent years, most state-of-the-art (SOTA)\nmethods still suffer from a considerable language-image modality gap at the pixel\nand word level. These methods generally 1) rely on sentence-level language\nfeatures for language-image alignment and 2) lack explicit training supervision\nfor fine-grained visual grounding. Consequently, they exhibit weak object-level\ncorrespondence between visual and language features. Without well-grounded\nfeatures, prior methods struggle to understand complex expressions that require\nstrong reasoning over relationships among multiple objects, especially when\ndealing with rarely used or ambiguous clauses. To tackle this challenge, we\nintroduce a novel Mask Grounding auxiliary task that significantly improves\nvisual grounding within language features, by explicitly teaching the model to\nlearn fine-grained correspondence between masked textual tokens and their\nmatching visual objects. Mask Grounding can be directly used on prior RIS\nmethods and consistently brings improvements. 
Furthermore, to holistically\naddress the modality gap, we also design a cross-modal alignment loss and an\naccompanying alignment module. These additions work synergistically with Mask\nGrounding. With all these techniques, our comprehensive approach culminates in\nMagNet (Mask-grounded Network), an architecture that significantly outperforms\nprior arts on three key benchmarks (RefCOCO, RefCOCO+ and G-Ref), demonstrating\nour method's effectiveness in addressing current limitations of RIS algorithms.\nOur code and pre-trained weights will be released.\n","authors":["Yong Xien Chng","Henry Zheng","Yizeng Han","Xuchong Qiu","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2312.12198v2.pdf","comment":"Accepted by CVPR2024; Project page:\n https://yxchng.github.io/projects/mask-grounding"},{"id":"http://arxiv.org/abs/2403.16612v1","updated":"2024-03-25T10:42:48Z","published":"2024-03-25T10:42:48Z","title":"Calibrating Bayesian UNet++ for Sub-Seasonal Forecasting","summary":" Seasonal forecasting is a crucial task when it comes to detecting the extreme\nheat and colds that occur due to climate change. Confidence in the predictions\nshould be reliable since a small increase in the temperatures in a year has a\nbig impact on the world. Calibration of the neural networks provides a way to\nensure our confidence in the predictions. However, calibrating regression\nmodels is an under-researched topic, especially in forecasters. We calibrate a\nUNet++ based architecture, which was shown to outperform physics-based models\nin temperature anomalies. We show that with a slight trade-off between\nprediction error and calibration error, it is possible to get more reliable and\nsharper forecasts. We believe that calibration should be an important part of\nsafety-critical machine learning applications such as weather forecasters.\n","authors":["Busra Asan","Abdullah Akgul","Alper Unal","Melih Kandemir","Gozde Unal"],"pdf_url":"https://arxiv.org/pdf/2403.16612v1.pdf","comment":"Accepted as a workshop paper at \"ICLR 2024 Tackling Climate Change\n with Machine Learning\""},{"id":"http://arxiv.org/abs/2403.16607v1","updated":"2024-03-25T10:38:17Z","published":"2024-03-25T10:38:17Z","title":"Enhancing Industrial Transfer Learning with Style Filter: Cost Reduction\n and Defect-Focus","summary":" Addressing the challenge of data scarcity in industrial domains, transfer\nlearning emerges as a pivotal paradigm. This work introduces Style Filter, a\ntailored methodology for industrial contexts. By selectively filtering source\ndomain data before knowledge transfer, Style Filter reduces the quantity of\ndata while maintaining or even enhancing the performance of transfer learning\nstrategy. 
Offering label-free operation, minimal reliance on prior knowledge,\nindependence from specific models, and re-utilization, Style Filter is\nevaluated on authentic industrial datasets, highlighting its effectiveness when\nemployed before conventional transfer strategies in the deep learning domain.\nThe results underscore the effectiveness of Style Filter in real-world\nindustrial applications.\n","authors":["Chen Li","Ruijie Ma","Xiang Qian","Xiaohao Wang","Xinghui Li"],"pdf_url":"https://arxiv.org/pdf/2403.16607v1.pdf","comment":"17 pages, 11 figures,4 tables"},{"id":"http://arxiv.org/abs/2403.16605v1","updated":"2024-03-25T10:30:22Z","published":"2024-03-25T10:30:22Z","title":"SatSynth: Augmenting Image-Mask Pairs through Diffusion Models for\n Aerial Semantic Segmentation","summary":" In recent years, semantic segmentation has become a pivotal tool in\nprocessing and interpreting satellite imagery. Yet, a prevalent limitation of\nsupervised learning techniques remains the need for extensive manual\nannotations by experts. In this work, we explore the potential of generative\nimage diffusion to address the scarcity of annotated data in earth observation\ntasks. The main idea is to learn the joint data manifold of images and labels,\nleveraging recent advancements in denoising diffusion probabilistic models. To\nthe best of our knowledge, we are the first to generate both images and\ncorresponding masks for satellite segmentation. We find that the obtained pairs\nnot only display high quality in fine-scale features but also ensure a wide\nsampling diversity. Both aspects are crucial for earth observation data, where\nsemantic classes can vary severely in scale and occurrence frequency. We employ\nthe novel data instances for downstream segmentation, as a form of data\naugmentation. In our experiments, we provide comparisons to prior works based\non discriminative diffusion models or GANs. We demonstrate that integrating\ngenerated samples yields significant quantitative improvements for satellite\nsemantic segmentation -- both compared to baselines and when training only on\nthe original data.\n","authors":["Aysim Toker","Marvin Eisenberger","Daniel Cremers","Laura Leal-Taixé"],"pdf_url":"https://arxiv.org/pdf/2403.16605v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2403.16594v1","updated":"2024-03-25T10:13:52Z","published":"2024-03-25T10:13:52Z","title":"EDUE: Expert Disagreement-Guided One-Pass Uncertainty Estimation for\n Medical Image Segmentation","summary":" Deploying deep learning (DL) models in medical applications relies on\npredictive performance and other critical factors, such as conveying\ntrustworthy predictive uncertainty. Uncertainty estimation (UE) methods provide\npotential solutions for evaluating prediction reliability and improving the\nmodel confidence calibration. Despite increasing interest in UE, challenges\npersist, such as the need for explicit methods to capture aleatoric uncertainty\nand align uncertainty estimates with real-life disagreements among domain\nexperts. This paper proposes an Expert Disagreement-Guided Uncertainty\nEstimation (EDUE) for medical image segmentation. By leveraging variability in\nground-truth annotations from multiple raters, we guide the model during\ntraining and incorporate random sampling-based strategies to enhance\ncalibration confidence. 
Our method achieves 55% and 23% improvement in\ncorrelation on average with expert disagreements at the image and pixel levels,\nrespectively, better calibration, and competitive segmentation performance\ncompared to the state-of-the-art deep ensembles, requiring only a single\nforward pass.\n","authors":["Kudaibergen Abutalip","Numan Saeed","Ikboljon Sobirov","Vincent Andrearczyk","Adrien Depeursinge","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2403.16594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14828v2","updated":"2024-03-25T10:12:46Z","published":"2024-03-21T20:43:10Z","title":"Multimodal-Conditioned Latent Diffusion Models for Fashion Image Editing","summary":" Fashion illustration is a crucial medium for designers to convey their\ncreative vision and transform design concepts into tangible representations\nthat showcase the interplay between clothing and the human body. In the context\nof fashion design, computer vision techniques have the potential to enhance and\nstreamline the design process. Departing from prior research primarily focused\non virtual try-on, this paper tackles the task of multimodal-conditioned\nfashion image editing. Our approach aims to generate human-centric fashion\nimages guided by multimodal prompts, including text, human body poses, garment\nsketches, and fabric textures. To address this problem, we propose extending\nlatent diffusion models to incorporate these multiple modalities and modifying\nthe structure of the denoising network, taking multimodal prompts as input. To\ncondition the proposed architecture on fabric textures, we employ textual\ninversion techniques and let diverse cross-attention layers of the denoising\nnetwork attend to textual and texture information, thus incorporating different\ngranularity conditioning details. Given the lack of datasets for the task, we\nextend two existing fashion datasets, Dress Code and VITON-HD, with multimodal\nannotations. Experimental evaluations demonstrate the effectiveness of our\nproposed approach in terms of realism and coherence concerning the provided\nmultimodal inputs.\n","authors":["Alberto Baldrati","Davide Morelli","Marcella Cornia","Marco Bertini","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2403.14828v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16582v1","updated":"2024-03-25T09:49:42Z","published":"2024-03-25T09:49:42Z","title":"In the Search for Optimal Multi-view Learning Models for Crop\n Classification with Global Remote Sensing Data","summary":" Crop classification is of critical importance due to its role in studying\ncrop pattern changes, resource management, and carbon sequestration. When\nemploying data-driven techniques for its prediction, utilizing various temporal\ndata sources is necessary. Deep learning models have proven to be effective for\nthis task by mapping time series data to high-level representation for\nprediction. However, they face substantial challenges when dealing with\nmultiple input patterns. The literature offers limited guidance for Multi-View\nLearning (MVL) scenarios, as it has primarily focused on exploring fusion\nstrategies with specific encoders and validating them in local regions. In\ncontrast, we investigate the impact of simultaneous selection of the fusion\nstrategy and the encoder architecture evaluated on a global-scale cropland and\ncrop-type classifications. 
We use a range of five fusion strategies (Input,\nFeature, Decision, Ensemble, Hybrid) and five temporal encoder architectures\n(LSTM, GRU, TempCNN, TAE, L-TAE) as possible MVL model configurations. The\nvalidation is on the CropHarvest dataset that provides optical, radar, and\nweather time series, and topographic information as input data. We found that\nin scenarios with a limited number of labeled samples, a single configuration\nis insufficient for all cases. Instead, a specialized combination,\nincluding encoder and fusion strategy, should be meticulously sought. To\nstreamline this search process, we suggest initially identifying the optimal\nencoder architecture tailored for a particular fusion strategy, and then\ndetermining the most suitable fusion strategy for the classification task. We\nprovide a technical framework for researchers exploring crop classification or\nrelated tasks through an MVL approach.\n","authors":["Francisco Mena","Diego Arenas","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2403.16582v1.pdf","comment":"submitted to journal"},{"id":"http://arxiv.org/abs/2403.16578v1","updated":"2024-03-25T09:43:56Z","published":"2024-03-25T09:43:56Z","title":"SegICL: A Universal In-context Learning Framework for Enhanced\n Segmentation in Medical Imaging","summary":" Medical image segmentation models adapting to new tasks in a training-free\nmanner through in-context learning is an exciting advancement. Universal\nsegmentation models aim to generalize across the diverse modalities of medical\nimages, yet their effectiveness often diminishes when applied to\nout-of-distribution (OOD) data modalities and tasks, requiring intricate\nfine-tuning of the model for optimal performance. To address this challenge, we\nintroduce SegICL, a novel approach leveraging In-Context Learning (ICL) for\nimage segmentation. Unlike existing methods, SegICL has the capability to\nemploy text-guided segmentation and conduct in-context learning with a small\nset of image-mask pairs, eliminating the need for training the model from\nscratch or fine-tuning for OOD tasks (including OOD modality and dataset).\nExtensive experimental validation of SegICL demonstrates a positive correlation\nbetween the number of prompt samples and segmentation performance on OOD\nmodalities and tasks. This indicates that SegICL effectively addresses new\nsegmentation tasks based on contextual information. Additionally, SegICL also\nexhibits comparable segmentation performance to mainstream models on OOD and\nin-distribution tasks. Our code will be released soon.\n","authors":["Lingdong Shen","Fangxin Shang","Yehui Yang","Xiaoshuang Huang","Shining Xiang"],"pdf_url":"https://arxiv.org/pdf/2403.16578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10615v2","updated":"2024-03-25T09:42:13Z","published":"2024-03-15T18:26:33Z","title":"LightIt: Illumination Modeling and Control for Diffusion Models","summary":" We introduce LightIt, a method for explicit illumination control for image\ngeneration. Recent generative methods lack lighting control, which is crucial\nto numerous artistic aspects of image generation such as setting the overall\nmood or cinematic appearance. To overcome these limitations, we propose to\ncondition the generation on shading and normal maps. We model the lighting with\nsingle bounce shading, which includes cast shadows. 
We first train a shading\nestimation module to generate a dataset of real-world image and shading pairs.\nThen, we train a control network using the estimated shading and normals as\ninput. Our method demonstrates high-quality image generation and lighting\ncontrol in numerous scenes. Additionally, we use our generated dataset to train\nan identity-preserving relighting model, conditioned on an image and a target\nshading. Our method is the first that enables the generation of images with\ncontrollable, consistent lighting and performs on par with specialized\nrelighting state-of-the-art methods.\n","authors":["Peter Kocsis","Julien Philip","Kalyan Sunkavalli","Matthias Nießner","Yannick Hold-Geoffroy"],"pdf_url":"https://arxiv.org/pdf/2403.10615v2.pdf","comment":"Project page: https://peter-kocsis.github.io/LightIt/ Video:\n https://youtu.be/cCfSBD5aPLI"},{"id":"http://arxiv.org/abs/2403.15353v2","updated":"2024-03-25T09:36:42Z","published":"2024-03-22T17:08:03Z","title":"Fully automated workflow for the design of patient-specific orthopaedic\n implants: application to total knee arthroplasty","summary":" Arthroplasty is commonly performed to treat joint osteoarthritis, reducing\npain and improving mobility. While arthroplasty has seen several technical\nimprovements, a significant share of patients are still unsatisfied with their\nsurgery. Personalised arthroplasty improves surgical outcomes; however, current\nsolutions require delays, making them difficult to integrate into clinical routine.\nWe propose a fully automated workflow to design patient-specific implants,\npresented for total knee arthroplasty, currently the most widely performed arthroplasty\nin the world.\n The proposed pipeline first uses artificial neural networks to segment the\nproximal and distal extremities of the femur and tibia. Then the full bones are\nreconstructed using augmented statistical shape models, combining shape and\nlandmarks information. Finally, 77 morphological parameters are computed to\ndesign patient-specific implants. The developed workflow has been trained using\n91 CT scans of the lower limb and evaluated on 41 CT scans manually segmented, in\nterms of accuracy and execution time.\n The workflow accuracy was $0.4\\pm0.2mm$ for the segmentation, $1.2\\pm0.4mm$\nfor the full bones reconstruction, and $2.8\\pm2.2mm$ for the anatomical\nlandmarks determination. The custom implants fitted the patients' anatomy with\n$0.6\\pm0.2mm$ accuracy. The whole process from segmentation to implants' design\nlasted about 5 minutes.\n The proposed workflow allows for a fast and reliable personalisation of knee\nimplants, directly from the patient CT image without requiring any manual\nintervention. It establishes a patient-specific pre-operative planning for TKA\nin a very short time, making it easily available for all patients. 
Combined with\nefficient implant manufacturing techniques, this solution could help meet the\ngrowing demand for arthroplasties while reducing complications and improving\npatients' satisfaction.\n","authors":["Aziliz Guezou-Philippe","Arnaud Clavé","Ehouarn Maguet","Ludivine Maintier","Charles Garraud","Jean-Rassaire Fouefack","Valérie Burdin","Eric Stindel","Guillaume Dardenne"],"pdf_url":"https://arxiv.org/pdf/2403.15353v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16569v1","updated":"2024-03-25T09:36:10Z","published":"2024-03-25T09:36:10Z","title":"Revealing Vulnerabilities of Neural Networks in Parameter Learning and\n Defense Against Explanation-Aware Backdoors","summary":" Explainable Artificial Intelligence (XAI) strategies play a crucial part in\nincreasing the understanding and trustworthiness of neural networks.\nNonetheless, these techniques could potentially generate misleading\nexplanations. Blinding attacks can drastically alter a machine learning\nalgorithm's prediction and explanation, providing misleading information by\nadding visually unnoticeable artifacts into the input, while maintaining the\nmodel's accuracy. This poses a serious challenge in ensuring the reliability of\nXAI methods. To address this challenge,\nwe leverage statistical analysis to highlight the changes in the weights of\na CNN following blinding attacks. We introduce a method specifically designed\nto limit the effectiveness of such attacks during the evaluation phase,\navoiding the need for extra training. The method we suggest defends against\nmost modern explanation-aware adversarial attacks, achieving an approximate\ndecrease of ~99\\% in the Attack Success Rate (ASR) and a ~91\\% reduction in the\nMean Square Error (MSE) between the original explanation and the defended\n(post-attack) explanation across three unique types of attacks.\n","authors":["Md Abdul Kadir","GowthamKrishna Addluri","Daniel Sonntag"],"pdf_url":"https://arxiv.org/pdf/2403.16569v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16558v1","updated":"2024-03-25T09:17:15Z","published":"2024-03-25T09:17:15Z","title":"Elysium: Exploring Object-level Perception in Videos via MLLM","summary":" Multi-modal Large Language Models (MLLMs) have demonstrated their ability to\nperceive objects in still images, but their application in video-related tasks,\nsuch as object tracking, remains understudied. This lack of exploration is\nprimarily due to two key challenges. Firstly, extensive pretraining on\nlarge-scale video datasets is required to equip MLLMs with the capability to\nperceive objects across multiple frames and understand inter-frame\nrelationships. Secondly, processing a large number of frames within the context\nwindow of Large Language Models (LLMs) can impose a significant computational\nburden. To address the first challenge, we introduce ElysiumTrack-1M, a\nlarge-scale video dataset paired with novel tasks: Referring Single Object\nTracking (RSOT) and Video Referring Expression Generation (Video-REG).\nElysiumTrack-1M contains 1.27 million annotated video frames with corresponding\nobject boxes and descriptions. Leveraging this dataset, we conduct training of\nMLLMs and propose a token-compression model T-Selector to tackle the second\nchallenge. 
Our proposed approach, Elysium: Exploring Object-level Perception in\nVideos via MLLM, is an end-to-end trainable MLLM that makes the first attempt\nto conduct object-level tasks in videos without requiring any additional\nplug-in or expert models.\n","authors":["Han Wang","Yanjie Wang","Yongjie Ye","Yuxiang Nie","Can Huang"],"pdf_url":"https://arxiv.org/pdf/2403.16558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16552v1","updated":"2024-03-25T08:57:27Z","published":"2024-03-25T08:57:27Z","title":"QKFormer: Hierarchical Spiking Transformer using Q-K Attention","summary":" Spiking Transformers, which integrate Spiking Neural Networks (SNNs) with\nTransformer architectures, have attracted significant attention due to their\npotential for energy efficiency and high performance. However, existing models\nin this domain still suffer from suboptimal performance. We introduce several\ninnovations to improve the performance: i) We propose a novel spike-form Q-K\nattention mechanism, tailored for SNNs, which efficiently models the importance\nof token or channel dimensions through binary vectors with linear complexity.\nii) We incorporate the hierarchical structure, which significantly benefits the\nperformance of both the brain and artificial neural networks, into spiking\ntransformers to obtain multi-scale spiking representation. iii) We design a\nversatile and powerful patch embedding module with a deformed shortcut\nspecifically for spiking transformers. Together, we develop QKFormer, a\nhierarchical spiking transformer based on Q-K attention with direct training.\nQKFormer shows significantly superior performance over existing\nstate-of-the-art SNN models on various mainstream datasets. Notably, with\ncomparable size to Spikformer (66.34 M, 74.81%), QKFormer (64.96 M) achieves a\ngroundbreaking top-1 accuracy of 85.65% on ImageNet-1k, substantially\noutperforming Spikformer by 10.84%. To the best of our knowledge, this is the first\ntime that directly trained SNNs have exceeded 85% accuracy on ImageNet-1K. The\ncode and models are publicly available at\nhttps://github.com/zhouchenlin2096/QKFormer\n","authors":["Chenlin Zhou","Han Zhang","Zhaokun Zhou","Liutao Yu","Liwei Huang","Xiaopeng Fan","Li Yuan","Zhengyu Ma","Huihui Zhou","Yonghong Tian"],"pdf_url":"https://arxiv.org/pdf/2403.16552v1.pdf","comment":"10 pages, code: https://github.com/zhouchenlin2096/QKFormer"},{"id":"http://arxiv.org/abs/2403.11854v2","updated":"2024-03-25T08:54:33Z","published":"2024-03-18T15:03:56Z","title":"denoiSplit: a method for joint image splitting and unsupervised\n denoising","summary":" In this work we present denoiSplit, a method to tackle a new analysis task,\ni.e. the challenge of joint semantic image splitting and unsupervised\ndenoising. This dual approach has important applications in fluorescence\nmicroscopy, where semantic image splitting is often required but noise\ngenerally hinders the downstream analysis of image content. Image splitting\ninvolves dissecting an image into its distinguishable semantic structures. We\nshow that the current state-of-the-art method for this task struggles in the\npresence of image noise, inadvertently also distributing the noise across the\npredicted outputs. The method we present here can deal with image noise by\nintegrating an unsupervised denoising sub-task. This integration results in\nimproved semantic image unmixing, even in the presence of notable and realistic\nlevels of imaging noise. 
A key innovation in denoiSplit is the use of\nspecifically formulated noise models and the suitable adjustment of\nKL-divergence loss for the high-dimensional hierarchical latent space we are\ntraining. We showcase the performance of denoiSplit across 4 tasks on\nreal-world microscopy images. Additionally, we perform qualitative and\nquantitative evaluations and compare results to existing benchmarks,\ndemonstrating the effectiveness of using denoiSplit: a single Variational\nSplitting Encoder-Decoder (VSE) Network using two suitable noise models to\njointly perform semantic splitting and denoising.\n","authors":["Ashesh Ashesh","Florian Jug"],"pdf_url":"https://arxiv.org/pdf/2403.11854v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02970v5","updated":"2024-03-25T08:50:42Z","published":"2023-04-06T09:54:06Z","title":"Unraveling Instance Associations: A Closer Look for Audio-Visual\n Segmentation","summary":" Audio-visual segmentation (AVS) is a challenging task that involves\naccurately segmenting sounding objects based on audio-visual cues. The\neffectiveness of audio-visual learning critically depends on achieving accurate\ncross-modal alignment between sound and visual objects. Successful audio-visual\nlearning requires two essential components: 1) a challenging dataset with\nhigh-quality pixel-level multi-class annotated images associated with audio\nfiles, and 2) a model that can establish strong links between audio information\nand its corresponding visual object. However, these requirements are only\npartially addressed by current methods, with training sets containing biased\naudio-visual data, and models that generalise poorly beyond this biased\ntraining set. In this work, we propose a new cost-effective strategy to build\nchallenging and relatively unbiased high-quality audio-visual segmentation\nbenchmarks. We also propose a new informative sample mining method for\naudio-visual supervised contrastive learning to leverage discriminative\ncontrastive samples to enforce cross-modal understanding. We show empirical\nresults that demonstrate the effectiveness of our benchmark. Furthermore,\nexperiments conducted on existing AVS datasets and on our new benchmark show\nthat our method achieves state-of-the-art (SOTA) segmentation accuracy.\n","authors":["Yuanhong Chen","Yuyuan Liu","Hu Wang","Fengbei Liu","Chong Wang","Helen Frazer","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2304.02970v5.pdf","comment":"Code is available at https://github.com/cyh-0/CAVP"},{"id":"http://arxiv.org/abs/2403.06904v2","updated":"2024-03-25T08:45:37Z","published":"2024-03-11T16:56:37Z","title":"FocusCLIP: Multimodal Subject-Level Guidance for Zero-Shot Transfer in\n Human-Centric Tasks","summary":" We propose FocusCLIP, integrating subject-level guidance--a specialized\nmechanism for target-specific supervision--into the CLIP framework for improved\nzero-shot transfer on human-centric tasks. Our novel contributions enhance CLIP\non both the vision and text sides. On the vision side, we incorporate ROI\nheatmaps emulating human visual attention mechanisms to emphasize\nsubject-relevant image regions. On the text side, we introduce human pose\ndescriptions to provide rich contextual information. For human-centric tasks,\nFocusCLIP is trained with images from the MPII Human Pose dataset. The proposed\napproach surpassed CLIP by an average of 8.61% across five previously unseen\ndatasets covering three human-centric tasks. 
FocusCLIP achieved an average\naccuracy of 33.65% compared to 25.04% by CLIP. We observed a 3.98% improvement\nin activity recognition, a 14.78% improvement in age classification, and a\n7.06% improvement in emotion recognition. Moreover, using our proposed\nsingle-shot LLM prompting strategy, we release a high-quality MPII Pose\nDescriptions dataset to encourage further research in multimodal learning for\nhuman-centric tasks. Furthermore, we also demonstrate the effectiveness of our\nsubject-level supervision on non-human-centric tasks. FocusCLIP shows a 2.47%\nimprovement over CLIP in zero-shot bird classification using the CUB dataset.\nOur findings emphasize the potential of integrating subject-level guidance with\ngeneral pretraining methods for enhanced downstream performance.\n","authors":["Muhammad Saif Ullah Khan","Muhammad Ferjad Naeem","Federico Tombari","Luc Van Gool","Didier Stricker","Muhammad Zeshan Afzal"],"pdf_url":"https://arxiv.org/pdf/2403.06904v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00247v4","updated":"2024-03-25T08:34:15Z","published":"2023-08-01T03:00:36Z","title":"Unleashing the Power of Self-Supervised Image Denoising: A Comprehensive\n Review","summary":" The advent of deep learning has brought a revolutionary transformation to\nimage denoising techniques. However, the persistent challenge of acquiring\nnoise-clean pairs for supervised methods in real-world scenarios remains\nformidable, necessitating the exploration of more practical self-supervised\nimage denoising. This paper focuses on self-supervised image denoising methods\nthat offer effective solutions to address this challenge. Our comprehensive\nreview thoroughly analyzes the latest advancements in self-supervised image\ndenoising approaches, categorizing them into three distinct classes: General\nmethods, Blind Spot Network (BSN)-based methods, and Transformer-based methods.\nFor each class, we provide a concise theoretical analysis along with their\npractical applications. To assess the effectiveness of these methods, we\npresent both quantitative and qualitative experimental results on various\ndatasets, utilizing classical algorithms as benchmarks. Additionally, we\ncritically discuss the current limitations of these methods and propose\npromising directions for future research. By offering a detailed overview of\nrecent developments in self-supervised image denoising, this review serves as\nan invaluable resource for researchers and practitioners in the field,\nfacilitating a deeper understanding of this emerging domain and inspiring\nfurther advancements.\n","authors":["Dan Zhang","Fangfang Zhou","Felix Albu","Yuanzhou Wei","Xiao Yang","Yuan Gu","Qiang Li"],"pdf_url":"https://arxiv.org/pdf/2308.00247v4.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2403.16539v1","updated":"2024-03-25T08:31:14Z","published":"2024-03-25T08:31:14Z","title":"DOrA: 3D Visual Grounding with Order-Aware Referring","summary":" 3D visual grounding aims to identify the target object within a 3D point\ncloud scene referred to by a natural language description. While previous works\nattempt to exploit the verbo-visual relation with proposed cross-modal\ntransformers, unstructured natural utterances and scattered objects might lead\nto undesirable performances. In this paper, we introduce DOrA, a novel 3D\nvisual grounding framework with Order-Aware referring. DOrA is designed to\nleverage Large Language Models (LLMs) to parse language description, suggesting\na referential order of anchor objects. 
Such ordered anchor objects allow DOrA\nto update visual features and locate the target object during the grounding\nprocess. Experimental results on the NR3D and ScanRefer datasets demonstrate\nour superiority in both low-resource and full-data scenarios. In particular,\nDOrA surpasses current state-of-the-art frameworks by 9.3% and 7.8% grounding\naccuracy under 1% data and 10% data settings, respectively.\n","authors":["Tung-Yu Wu","Sheng-Yu Huang","Yu-Chiang Frank Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08262v4","updated":"2024-03-25T08:29:52Z","published":"2024-03-13T05:25:49Z","title":"BiTT: Bi-directional Texture Reconstruction of Interacting Two Hands\n from a Single Image","summary":" Creating personalized hand avatars is important to offer a realistic\nexperience to users on AR / VR platforms. While most prior studies focused on\nreconstructing 3D hand shapes, some recent work has tackled the reconstruction\nof hand textures on top of shapes. However, these methods are often limited to\ncapturing pixels on the visible side of a hand, requiring diverse views of the\nhand in a video or multiple images as input. In this paper, we propose a novel\nmethod, BiTT(Bi-directional Texture reconstruction of Two hands), which is the\nfirst end-to-end trainable method for relightable, pose-free texture\nreconstruction of two interacting hands taking only a single RGB image, by\nthree novel components: 1) bi-directional (left $\\leftrightarrow$ right)\ntexture reconstruction using the texture symmetry of left / right hands, 2)\nutilizing a texture parametric model for hand texture recovery, and 3) the\noverall coarse-to-fine stage pipeline for reconstructing personalized texture\nof two interacting hands. BiTT first estimates the scene light condition and\nalbedo image from an input image, then reconstructs the texture of both hands\nthrough the texture parametric model and bi-directional texture reconstructor.\nIn experiments using InterHand2.6M and RGB2Hands datasets, our method\nsignificantly outperforms state-of-the-art hand texture reconstruction methods\nquantitatively and qualitatively. The code is available at\nhttps://github.com/yunminjin2/BiTT\n","authors":["Minje Kim","Tae-Kyun Kim"],"pdf_url":"https://arxiv.org/pdf/2403.08262v4.pdf","comment":"Accepted by CVPR 2024, Project Page:\n https://yunminjin2.github.io/projects/bitt/"},{"id":"http://arxiv.org/abs/2403.16536v1","updated":"2024-03-25T08:26:42Z","published":"2024-03-25T08:26:42Z","title":"VMRNN: Integrating Vision Mamba and LSTM for Efficient and Accurate\n Spatiotemporal Forecasting","summary":" Combining CNNs or ViTs, with RNNs for spatiotemporal forecasting, has yielded\nunparalleled results in predicting temporal and spatial dynamics. However,\nmodeling extensive global information remains a formidable challenge; CNNs are\nlimited by their narrow receptive fields, and ViTs struggle with the intensive\ncomputational demands of their attention mechanisms. The emergence of recent\nMamba-based architectures has been met with enthusiasm for their exceptional\nlong-sequence modeling capabilities, surpassing established vision models in\nefficiency and accuracy, which motivates us to develop an innovative\narchitecture tailored for spatiotemporal forecasting. In this paper, we propose\nthe VMRNN cell, a new recurrent unit that integrates the strengths of Vision\nMamba blocks with LSTM. 
We construct a network centered on VMRNN cells to\ntackle spatiotemporal prediction tasks effectively. Our extensive evaluations\nshow that our proposed approach secures competitive results on a variety of\ntasks while maintaining a smaller model size. Our code is available at\nhttps://github.com/yyyujintang/VMRNN-PyTorch.\n","authors":["Yujin Tang","Peijie Dong","Zhenheng Tang","Xiaowen Chu","Junwei Liang"],"pdf_url":"https://arxiv.org/pdf/2403.16536v1.pdf","comment":"11 pages, 7 figures. arXiv admin note: text overlap with\n arXiv:2308.09891 by other authors"},{"id":"http://arxiv.org/abs/2403.16530v1","updated":"2024-03-25T08:16:06Z","published":"2024-03-25T08:16:06Z","title":"An Intermediate Fusion ViT Enables Efficient Text-Image Alignment in\n Diffusion Models","summary":" Diffusion models have been widely used for conditional data cross-modal\ngeneration tasks such as text-to-image and text-to-video. However,\nstate-of-the-art models still fail to align the generated visual concepts with\nhigh-level semantics in a language such as object count, spatial relationship,\netc. We approach this problem from a multimodal data fusion perspective and\ninvestigate how different fusion strategies can affect vision-language\nalignment. We discover that compared to the widely used early fusion of\nconditioning text in a pretrained image feature space, a specially designed\nintermediate fusion can: (i) boost text-to-image alignment with improved\ngeneration quality and (ii) improve training and inference efficiency by\nreducing low-rank text-to-image attention calculations. We perform experiments\nusing a text-to-image generation task on the MS-COCO dataset. We compare our\nintermediate fusion mechanism with the classic early fusion mechanism on two\ncommon conditioning methods on a U-shaped ViT backbone. Our intermediate fusion\nmodel achieves a higher CLIP Score and lower FID, with 20% reduced FLOPs, and\n50% increased training speed compared to a strong U-ViT baseline with an early\nfusion.\n","authors":["Zizhao Hu","Shaochong Jia","Mohammad Rostami"],"pdf_url":"https://arxiv.org/pdf/2403.16530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16528v1","updated":"2024-03-25T08:14:22Z","published":"2024-03-25T08:14:22Z","title":"Open-Set Recognition in the Age of Vision-Language Models","summary":" Are vision-language models (VLMs) open-set models because they are trained on\ninternet-scale datasets? We answer this question with a clear no - VLMs\nintroduce closed-set assumptions via their finite query set, making them\nvulnerable to open-set conditions. We systematically evaluate VLMs for open-set\nrecognition and find they frequently misclassify objects not contained in their\nquery set, leading to alarmingly low precision when tuned for high recall and\nvice versa. We show that naively increasing the size of the query set to\ncontain more and more classes does not mitigate this problem, but instead\ncauses diminishing task performance and open-set performance. 
We establish a\nrevised definition of the open-set problem for the age of VLMs, define a new\nbenchmark and evaluation protocol to facilitate standardised evaluation and\nresearch in this important area, and evaluate promising baseline approaches\nbased on predictive uncertainty and dedicated negative embeddings on a range of\nVLM classifiers and object detectors.\n","authors":["Dimity Miller","Niko Sünderhauf","Alex Kenna","Keita Mason"],"pdf_url":"https://arxiv.org/pdf/2403.16528v1.pdf","comment":"31 pages, under review"},{"id":"http://arxiv.org/abs/2403.16526v1","updated":"2024-03-25T08:09:22Z","published":"2024-03-25T08:09:22Z","title":"ModeTv2: GPU-accelerated Motion Decomposition Transformer for Pairwise\n Optimization in Medical Image Registration","summary":" Deformable image registration plays a crucial role in medical imaging, aiding\nin disease diagnosis and image-guided interventions. Traditional iterative\nmethods are slow, while deep learning (DL) accelerates solutions but faces\nusability and precision challenges. This study introduces a pyramid network\nwith the enhanced motion decomposition Transformer (ModeTv2) operator,\nshowcasing superior pairwise optimization (PO) akin to traditional methods. We\nre-implement ModeT operator with CUDA extensions to enhance its computational\nefficiency. We further propose RegHead module which refines deformation fields,\nimproves the realism of deformation and reduces parameters. By adopting the PO,\nthe proposed network balances accuracy, efficiency, and generalizability.\nExtensive experiments on two public brain MRI datasets and one abdominal CT\ndataset demonstrate the network's suitability for PO, providing a DL model with\nenhanced usability and interpretability. The code is publicly available.\n","authors":["Haiqiao Wang","Zhuoyuan Wang","Dong Ni","Yi Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08863v3","updated":"2024-03-25T08:05:16Z","published":"2023-11-15T10:49:15Z","title":"Toulouse Hyperspectral Data Set: a benchmark data set to assess\n semi-supervised spectral representation learning and pixel-wise\n classification techniques","summary":" Airborne hyperspectral images can be used to map the land cover in large\nurban areas, thanks to their very high spatial and spectral resolutions on a\nwide spectral domain. While the spectral dimension of hyperspectral images is\nhighly informative of the chemical composition of the land surface, the use of\nstate-of-the-art machine learning algorithms to map the land cover has been\ndramatically limited by the availability of training data. To cope with the\nscarcity of annotations, semi-supervised and self-supervised techniques have\nlately raised a lot of interest in the community. Yet, the publicly available\nhyperspectral data sets commonly used to benchmark machine learning models are\nnot totally suited to evaluate their generalization performances due to one or\nseveral of the following properties: a limited geographical coverage (which\ndoes not reflect the spectral diversity in metropolitan areas), a small number\nof land cover classes and a lack of appropriate standard train / test splits\nfor semi-supervised and self-supervised learning. 
Therefore, we release in this\npaper the Toulouse Hyperspectral Data Set that stands out from other data sets\nin the above-mentioned respects in order to meet key issues in spectral\nrepresentation learning and classification over large-scale hyperspectral\nimages with very few labeled pixels. Besides, we discuss and experiment with\nself-supervised techniques for spectral representation learning, including the\nMasked Autoencoder, and establish a baseline for pixel-wise classification\nachieving 85% overall accuracy and 77% F1 score. The Toulouse Hyperspectral\nData Set and our code are publicly available at\nhttps://www.toulouse-hyperspectral-data-set.com and\nhttps://www.github.com/Romain3Ch216/tlse-experiments, respectively.\n","authors":["Romain Thoreau","Laurent Risser","Véronique Achard","Béatrice Berthelot","Xavier Briottet"],"pdf_url":"https://arxiv.org/pdf/2311.08863v3.pdf","comment":"17 pages, 13 figures"},{"id":"http://arxiv.org/abs/2403.16520v1","updated":"2024-03-25T08:02:41Z","published":"2024-03-25T08:02:41Z","title":"CMViM: Contrastive Masked Vim Autoencoder for 3D Multi-modal\n Representation Learning for AD classification","summary":" Alzheimer's disease (AD) is an incurable neurodegenerative condition leading\nto cognitive and functional deterioration. Given the lack of a cure, prompt and\nprecise AD diagnosis is vital, a complex process dependent on multiple factors\nand multi-modal data. While successful efforts have been made to integrate\nmulti-modal representation learning into medical datasets, scant attention has\nbeen given to 3D medical images. In this paper, we propose Contrastive Masked\nVim Autoencoder (CMViM), the first efficient representation learning method\ntailored for 3D multi-modal data. Our proposed framework is built on a masked\nVim autoencoder to learn a unified multi-modal representation and\nlong-range dependencies contained in 3D medical images. We also introduce an\nintra-modal contrastive learning module to enhance the capability of the\nmulti-modal Vim encoder for modeling the discriminative features in the same\nmodality, and an inter-modal contrastive learning module to alleviate\nmisaligned representation among modalities. Our framework consists of two main\nsteps: 1) incorporate the Vision Mamba (Vim) into the mask autoencoder to\nreconstruct 3D masked multi-modal data efficiently. 2) align the multi-modal\nrepresentations with contrastive learning mechanisms from both intra-modal and\ninter-modal aspects. Our framework is pre-trained and validated on the ADNI2 dataset\nand evaluated on the downstream task of AD classification. The proposed CMViM\nyields a 2.7\\% AUC performance improvement compared with other state-of-the-art\nmethods.\n","authors":["Guangqian Yang","Kangrui Du","Zhihan Yang","Ye Du","Yongping Zheng","Shujun Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16520v1.pdf","comment":"11 pages, 1 figure"},{"id":"http://arxiv.org/abs/2403.16516v1","updated":"2024-03-25T08:00:43Z","published":"2024-03-25T08:00:43Z","title":"Visually Guided Generative Text-Layout Pre-training for Document\n Intelligence","summary":" Prior studies show that pre-training techniques can boost the performance of\nvisual document understanding (VDU), which typically requires models to gain\nabilities to perceive and reason both document texts and layouts (e.g.,\nlocations of texts and table-cells). To this end, we propose visually guided\ngenerative text-layout pre-training, named ViTLP. 
Given a document image, the\nmodel optimizes hierarchical language and layout modeling objectives to\ngenerate the interleaved text and layout sequence. In addition, to address the\nlimitation of Transformers in processing long documents, we introduce a\nstraightforward yet effective multi-segment generative pre-training scheme,\nenabling ViTLP to process word-intensive documents of any length. ViTLP can\nfunction as a native OCR model to localize and recognize texts of document\nimages. Besides, ViTLP can be effectively applied to various downstream VDU\ntasks. Extensive experiments show that ViTLP achieves competitive performance\nover existing baselines on benchmark VDU tasks, including information\nextraction, document classification, and document question answering.\n","authors":["Zhiming Mao","Haoli Bai","Lu Hou","Jiansheng Wei","Xin Jiang","Qun Liu","Kam-Fai Wong"],"pdf_url":"https://arxiv.org/pdf/2403.16516v1.pdf","comment":"Accepted to NAACL 2024 main conference. The first version of this\n paper was submitted to OpenReview\n (https://openreview.net/forum?id=ARtBIBAmNR) in June 2023"},{"id":"http://arxiv.org/abs/2403.16513v1","updated":"2024-03-25T07:58:58Z","published":"2024-03-25T07:58:58Z","title":"Let Real Images be as a Judger, Spotting Fake Images Synthesized with\n Generative Models","summary":" In the last few years, generative models have shown their powerful\ncapabilities in synthesizing realistic images in both quality and diversity\n(i.e., facial images and natural subjects). Unfortunately, the artifact\npatterns in fake images synthesized by different generative models are\ninconsistent, leading to the failure of previous research that relied on\nspotting subtle differences between real and fake. In our preliminary\nexperiments, we find that the artifacts in fake images always change with the\ndevelopment of the generative model, while natural images exhibit stable\nstatistical properties. In this paper, we employ natural traces shared only by\nreal images as an additional predictive target in the detector. Specifically,\nthe natural traces are learned from wild real images and we introduce\nextended supervised contrastive learning to bring them closer to real images\nand further away from fake ones. This motivates the detector to make decisions\nbased on the proximity of images to the natural traces. To conduct a\ncomprehensive experiment, we built a high-quality and diverse dataset that\nincludes generative models comprising 6 GAN and 6 diffusion models, to evaluate\nthe effectiveness in generalizing to unknown forgery techniques and robustness in\nsurviving different transformations. Experimental results show that our\nproposed method achieves 96.1% mAP, significantly outperforming the baselines.\nExtensive experiments conducted on the widely recognized platform Midjourney\nreveal that our proposed method achieves an accuracy exceeding 78.4%,\nunderscoring its practicality for real-world application deployment. 
The source\ncode and partial self-built dataset are available in the supplementary material.\n","authors":["Ziyou Liang","Run Wang","Weifeng Liu","Yuyang Zhang","Wenyuan Yang","Lina Wang","Xingkai Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16510v1","updated":"2024-03-25T07:54:18Z","published":"2024-03-25T07:54:18Z","title":"Make-Your-Anchor: A Diffusion-based 2D Avatar Generation Framework","summary":" Despite the remarkable progress of talking-head-based avatar-creating\nsolutions, directly generating anchor-style videos with full-body motions\nremains challenging. In this study, we propose Make-Your-Anchor, a novel system\nnecessitating only a one-minute video clip of an individual for training,\nsubsequently enabling the automatic generation of anchor-style videos with\nprecise torso and hand movements. Specifically, we finetune a proposed\nstructure-guided diffusion model on input video to render 3D mesh conditions\ninto human appearances. We adopt a two-stage training strategy for the\ndiffusion model, effectively binding movements with specific appearances. To\nproduce arbitrarily long videos, we extend the 2D U-Net in the frame-wise\ndiffusion model to a 3D style without additional training cost, and a simple\nyet effective batch-overlapped temporal denoising module is proposed to bypass\nthe constraints on video length during inference. Finally, a novel\nidentity-specific face enhancement module is introduced to improve the visual\nquality of facial regions in the output videos. Comparative experiments\ndemonstrate the effectiveness and superiority of the system in terms of visual\nquality, temporal coherence, and identity preservation, outperforming SOTA\ndiffusion/non-diffusion methods. Project page:\n\\url{https://github.com/ICTMCG/Make-Your-Anchor}.\n","authors":["Ziyao Huang","Fan Tang","Yong Zhang","Xiaodong Cun","Juan Cao","Jintao Li","Tong-Yee Lee"],"pdf_url":"https://arxiv.org/pdf/2403.16510v1.pdf","comment":"accepted at CVPR2024"},{"id":"http://arxiv.org/abs/2305.01309v2","updated":"2024-03-25T07:53:54Z","published":"2023-05-02T10:35:20Z","title":"Geometric Prior Based Deep Human Point Cloud Geometry Compression","summary":" The emergence of digital avatars has driven an exponential increase in the\ndemand for human point clouds with realistic and intricate details. The\ncompression of such data becomes challenging with overwhelming data amounts\ncomprising millions of points. Herein, we leverage the human geometric prior in\ngeometry redundancy removal of point clouds, greatly promoting the compression\nperformance. More specifically, the prior provides topological constraints as\ngeometry initialization, allowing adaptive adjustments with a compact parameter\nset that could be represented with only a few bits. Therefore, we can envisage\nhigh-resolution human point clouds as a combination of geometric priors and\nstructural deviations. The priors could first be derived with an aligned point\ncloud, and subsequently the difference of features is compressed into a compact\nlatent code. The proposed framework can operate in a plug-and-play fashion with\nexisting learning-based point cloud compression methods. 
Extensive experimental\nresults show that our approach significantly improves the compression\nperformance without deteriorating the quality, demonstrating its promise in a\nvariety of applications.\n","authors":["Xinju Wu","Pingping Zhang","Meng Wang","Peilin Chen","Shiqi Wang","Sam Kwong"],"pdf_url":"https://arxiv.org/pdf/2305.01309v2.pdf","comment":"Accepted by TCSVT 2024"},{"id":"http://arxiv.org/abs/2311.17315v3","updated":"2024-03-25T07:51:14Z","published":"2023-11-29T02:10:31Z","title":"Explaining CLIP's performance disparities on data from blind/low vision\n users","summary":" Large multi-modal models (LMMs) hold the potential to usher in a new era of\nautomated visual assistance for people who are blind or low vision (BLV). Yet,\nthese models have not been systematically evaluated on data captured by BLV\nusers. We address this by empirically assessing CLIP, a widely-used LMM likely\nto underpin many assistive technologies. Testing 25 CLIP variants in a\nzero-shot classification task, we find that their accuracy is 15 percentage\npoints lower on average for images captured by BLV users than web-crawled\nimages. This disparity stems from CLIP's sensitivities to 1) image content\n(e.g. not recognizing disability objects as well as other objects); 2) image\nquality (e.g. not being robust to lighting variation); and 3) text content\n(e.g. not recognizing objects described by tactile adjectives as well as visual\nones). We delve deeper with a textual analysis of three common pre-training\ndatasets: LAION-400M, LAION-2B and DataComp-1B, showing that disability content\nis rarely mentioned. We then provide three examples that illustrate how the\nperformance disparities extend to three downstream models underpinned by CLIP:\nOWL-ViT, CLIPSeg and DALL-E2. We find that few-shot learning with as few as 5\nimages can mitigate CLIP's quality-of-service disparities for BLV users in some\nscenarios, which we discuss alongside a set of other possible mitigations.\n","authors":["Daniela Massiceti","Camilla Longden","Agnieszka Słowik","Samuel Wills","Martin Grayson","Cecily Morrison"],"pdf_url":"https://arxiv.org/pdf/2311.17315v3.pdf","comment":"Accepted at 2024 IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR)"},{"id":"http://arxiv.org/abs/2403.16502v1","updated":"2024-03-25T07:35:28Z","published":"2024-03-25T07:35:28Z","title":"Medical Image Registration and Its Application in Retinal Images: A\n Review","summary":" Medical image registration is vital for disease diagnosis and treatment with\nits ability to merge diverse information of images, which may be captured under\ndifferent times, angles, or modalities. Although several surveys have reviewed\nthe development of medical image registration, these surveys have not\nsystematically summarized methodologies of existing medical image registration\nmethods. To this end, we provide a comprehensive review of these methods from\ntraditional and deep learning-based directions, aiming to help audiences\nunderstand the development of medical image registration quickly. In\nparticular, we review recent advances in retinal image registration at the end\nof each section, which has not attracted much attention. 
Additionally, we also\ndiscuss the current challenges of retinal image registration and provide\ninsights and prospects for future research.\n","authors":["Qiushi Nie","Xiaoqing Zhang","Yan Hu","Mingdao Gong","Jiang Liu"],"pdf_url":"https://arxiv.org/pdf/2403.16502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16499v1","updated":"2024-03-25T07:34:06Z","published":"2024-03-25T07:34:06Z","title":"Self-Supervised Learning for Medical Image Data with Anatomy-Oriented\n Imaging Planes","summary":" Self-supervised learning has emerged as a powerful tool for pretraining deep\nnetworks on unlabeled data, prior to transfer learning of target tasks with\nlimited annotation. The relevance between the pretraining pretext and target\ntasks is crucial to the success of transfer learning. Various pretext tasks\nhave been proposed to utilize properties of medical image data (e.g., three\ndimensionality), which are more relevant to medical image analysis than generic\nones for natural images. However, previous work rarely paid attention to data\nwith anatomy-oriented imaging planes, e.g., standard cardiac magnetic resonance\nimaging views. As these imaging planes are defined according to the anatomy of\nthe imaged organ, pretext tasks effectively exploiting this information can\npretrain the networks to gain knowledge on the organ of interest. In this work,\nwe propose two complementary pretext tasks for this group of medical image data\nbased on the spatial relationship of the imaging planes. The first is to learn\nthe relative orientation between the imaging planes and implemented as\nregressing their intersecting lines. The second exploits parallel imaging\nplanes to regress their relative slice locations within a stack. Both pretext\ntasks are conceptually straightforward and easy to implement, and can be\ncombined in multitask learning for better representation learning. Thorough\nexperiments on two anatomical structures (heart and knee) and representative\ntarget tasks (semantic segmentation and classification) demonstrate that the\nproposed pretext tasks are effective in pretraining deep networks for\nremarkably boosted performance on the target tasks, and superior to other\nrecent approaches.\n","authors":["Tianwei Zhang","Dong Wei","Mengmeng Zhua","Shi Gu","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.16499v1.pdf","comment":"Medical Image Analysis"},{"id":"http://arxiv.org/abs/2403.16497v1","updated":"2024-03-25T07:29:18Z","published":"2024-03-25T07:29:18Z","title":"PathoTune: Adapting Visual Foundation Model to Pathological Specialists","summary":" As natural image understanding moves towards the pretrain-finetune era,\nresearch in pathology imaging is concurrently evolving. Despite the predominant\nfocus on pretraining pathological foundation models, how to adapt foundation\nmodels to downstream tasks is little explored. For downstream adaptation, we\npropose the existence of two domain gaps, i.e., the Foundation-Task Gap and the\nTask-Instance Gap. To mitigate these gaps, we introduce PathoTune, a framework\ndesigned to efficiently adapt pathological or even visual foundation models to\npathology-specific tasks via multi-modal prompt tuning. The proposed framework\nleverages Task-specific Visual Prompts and Task-specific Textual Prompts to\nidentify task-relevant features, along with Instance-specific Visual Prompts\nfor encoding single pathological image features. 
Results across multiple\ndatasets at both patch-level and WSI-level demonstrate its superior performance\nover single-modality prompt tuning approaches. Significantly, PathoTune\nfacilitates the direct adaptation of natural visual foundation models to\npathological tasks, drastically outperforming pathological foundation models\nwith simple linear probing. The code will be available upon acceptance.\n","authors":["Jiaxuan Lu","Fang Yan","Xiaofan Zhang","Yue Gao","Shaoting Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.16497v1.pdf","comment":"Submitted to MICCAI 2024"},{"id":"http://arxiv.org/abs/2403.16494v1","updated":"2024-03-25T07:22:22Z","published":"2024-03-25T07:22:22Z","title":"CT-Bound: Fast Boundary Estimation From Noisy Images Via Hybrid\n Convolution and Transformer Neural Networks","summary":" We present CT-Bound, a fast boundary estimation method for noisy images using\na hybrid Convolution and Transformer neural network. The proposed architecture\ndecomposes boundary estimation into two tasks: local detection and global\nregularization of image boundaries. It first estimates a parametric\nrepresentation of boundary structures only using the input image within a small\nreceptive field and then refines the boundary structure in the parameter domain\nwithout accessing the input image. Because of this, a part of the network can\nbe easily trained using naive, synthetic images and still generalized to real\nimages, and the entire architecture is computationally efficient as the\nboundary refinement is non-iterative and not in the image domain. Compared with\nthe previous highest accuracy methods, our experiment shows that CT-Bound is\n100 times faster, producing comparably accurate, high-quality boundary and\ncolor maps. We also demonstrate that CT-Bound can produce boundary and color\nmaps on real captured images without extra fine-tuning and real-time boundary\nmap and color map videos at ten frames per second.\n","authors":["Wei Xu","Junjie Luo","Qi Guo"],"pdf_url":"https://arxiv.org/pdf/2403.16494v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.16481v1","updated":"2024-03-25T07:07:50Z","published":"2024-03-25T07:07:50Z","title":"REFRAME: Reflective Surface Real-Time Rendering for Mobile Devices","summary":" This work tackles the challenging task of achieving real-time novel view\nsynthesis on various scenes, including highly reflective objects and unbounded\noutdoor scenes. Existing real-time rendering methods, especially those based on\nmeshes, often have subpar performance in modeling surfaces with rich\nview-dependent appearances. Our key idea lies in leveraging meshes for\nrendering acceleration while incorporating a novel approach to parameterize\nview-dependent information. We decompose the color into diffuse and specular,\nand model the specular color in the reflected direction based on a neural\nenvironment map. 
Our experiments demonstrate that our method achieves\ncomparable reconstruction quality for highly reflective surfaces compared to\nstate-of-the-art offline methods, while also efficiently enabling real-time\nrendering on edge devices such as smartphones.\n","authors":["Chaojie Ji","Yufeng Li","Yiyi Liao"],"pdf_url":"https://arxiv.org/pdf/2403.16481v1.pdf","comment":"Project Page:https://xdimlab.github.io/REFRAME/"},{"id":"http://arxiv.org/abs/2403.06606v2","updated":"2024-03-25T06:57:57Z","published":"2024-03-11T10:50:53Z","title":"Distributionally Generative Augmentation for Fair Facial Attribute\n Classification","summary":" Facial Attribute Classification (FAC) holds substantial promise in widespread\napplications. However, FAC models trained by traditional methodologies can be\nunfair by exhibiting accuracy inconsistencies across varied data\nsubpopulations. This unfairness is largely attributed to bias in data, where\nsome spurious attributes (e.g., Male) statistically correlate with the target\nattribute (e.g., Smiling). Most of existing fairness-aware methods rely on the\nlabels of spurious attributes, which may be unavailable in practice. This work\nproposes a novel, generation-based two-stage framework to train a fair FAC\nmodel on biased data without additional annotation. Initially, we identify the\npotential spurious attributes based on generative models. Notably, it enhances\ninterpretability by explicitly showing the spurious attributes in image space.\nFollowing this, for each image, we first edit the spurious attributes with a\nrandom degree sampled from a uniform distribution, while keeping target\nattribute unchanged. Then we train a fair FAC model by fostering model\ninvariance to these augmentation. Extensive experiments on three common\ndatasets demonstrate the effectiveness of our method in promoting fairness in\nFAC without compromising accuracy. Codes are in\nhttps://github.com/heqianpei/DiGA.\n","authors":["Fengda Zhang","Qianpei He","Kun Kuang","Jiashuo Liu","Long Chen","Chao Wu","Jun Xiao","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.06606v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.10066v2","updated":"2024-03-25T06:27:57Z","published":"2024-03-15T07:16:07Z","title":"Contrastive Pre-Training with Multi-View Fusion for No-Reference Point\n Cloud Quality Assessment","summary":" No-reference point cloud quality assessment (NR-PCQA) aims to automatically\nevaluate the perceptual quality of distorted point clouds without available\nreference, which have achieved tremendous improvements due to the utilization\nof deep neural networks. However, learning-based NR-PCQA methods suffer from\nthe scarcity of labeled data and usually perform suboptimally in terms of\ngeneralization. To solve the problem, we propose a novel contrastive\npre-training framework tailored for PCQA (CoPA), which enables the pre-trained\nmodel to learn quality-aware representations from unlabeled data. To obtain\nanchors in the representation space, we project point clouds with different\ndistortions into images and randomly mix their local patches to form mixed\nimages with multiple distortions. Utilizing the generated anchors, we constrain\nthe pre-training process via a quality-aware contrastive loss following the\nphilosophy that perceptual quality is closely related to both content and\ndistortion. 
Furthermore, in the model fine-tuning stage, we propose a\nsemantic-guided multi-view fusion module to effectively integrate the features\nof projected images from multiple perspectives. Extensive experiments show that\nour method outperforms the state-of-the-art PCQA methods on popular benchmarks.\nFurther investigations demonstrate that CoPA can also benefit existing\nlearning-based PCQA models.\n","authors":["Ziyu Shan","Yujie Zhang","Qi Yang","Haichen Yang","Yiling Xu","Jenq-Neng Hwang","Xiaozhong Xu","Shan Liu"],"pdf_url":"https://arxiv.org/pdf/2403.10066v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16450v1","updated":"2024-03-25T06:22:27Z","published":"2024-03-25T06:22:27Z","title":"Camera-aware Label Refinement for Unsupervised Person Re-identification","summary":" Unsupervised person re-identification aims to retrieve images of a specified\nperson without identity labels. Many recent unsupervised Re-ID approaches adopt\nclustering-based methods to measure cross-camera feature similarity to roughly\ndivide images into clusters. They ignore the feature distribution discrepancy\ninduced by camera domain gap, resulting in the unavoidable performance\ndegradation. Camera information is usually available, and the feature\ndistribution in the single camera usually focuses more on the appearance of the\nindividual and has less intra-identity variance. Inspired by the observation,\nwe introduce a \\textbf{C}amera-\\textbf{A}ware \\textbf{L}abel\n\\textbf{R}efinement~(CALR) framework that reduces camera discrepancy by\nclustering intra-camera similarity. Specifically, we employ intra-camera\ntraining to obtain reliable local pseudo labels within each camera, and then\nrefine global labels generated by inter-camera clustering and train the\ndiscriminative model using more reliable global pseudo labels in a self-paced\nmanner. Meanwhile, we develop a camera-alignment module to align feature\ndistributions under different cameras, which could help deal with the camera\nvariance further. Extensive experiments validate the superiority of our\nproposed method over state-of-the-art approaches. The code is accessible at\nhttps://github.com/leeBooMla/CALR.\n","authors":["Pengna Li","Kangyi Wu","Wenli Huang","Sanping Zhou","Jinjun Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16450v1.pdf","comment":"submitted to IEEE TMM"},{"id":"http://arxiv.org/abs/2312.02480v2","updated":"2024-03-25T06:22:09Z","published":"2023-12-05T04:13:31Z","title":"Differentiable Point-based Inverse Rendering","summary":" We present differentiable point-based inverse rendering, DPIR, an\nanalysis-by-synthesis method that processes images captured under diverse\nilluminations to estimate shape and spatially-varying BRDF. To this end, we\nadopt point-based rendering, eliminating the need for multiple samplings per\nray, typical of volumetric rendering, thus significantly enhancing the speed of\ninverse rendering. To realize this idea, we devise a hybrid point-volumetric\nrepresentation for geometry and a regularized basis-BRDF representation for\nreflectance. The hybrid geometric representation enables fast rendering through\npoint-based splatting while retaining the geometric details and stability\ninherent to SDF-based representations. The regularized basis-BRDF mitigates the\nill-posedness of inverse rendering stemming from limited light-view angular\nsamples. We also propose an efficient shadow detection method using point-based\nshadow map rendering. 
Our extensive evaluations demonstrate that DPIR\noutperforms prior works in terms of reconstruction accuracy, computational\nefficiency, and memory footprint. Furthermore, our explicit point-based\nrepresentation and rendering enables intuitive geometry and reflectance\nediting.\n","authors":["Hoon-Gyu Chung","Seokjun Choi","Seung-Hwan Baek"],"pdf_url":"https://arxiv.org/pdf/2312.02480v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16442v1","updated":"2024-03-25T06:05:50Z","published":"2024-03-25T06:05:50Z","title":"If CLIP Could Talk: Understanding Vision-Language Model Representations\n Through Their Preferred Concept Descriptions","summary":" Recent works often assume that Vision-Language Model (VLM) representations\nare based on visual attributes like shape. However, it is unclear to what\nextent VLMs prioritize this information to represent concepts. We propose\nExtract and Explore (EX2), a novel approach to characterize important textual\nfeatures for VLMs. EX2 uses reinforcement learning to align a large language\nmodel with VLM preferences and generates descriptions that incorporate the\nimportant features for the VLM. Then, we inspect the descriptions to identify\nthe features that contribute to VLM representations. We find that spurious\ndescriptions have a major role in VLM representations despite providing no\nhelpful information, e.g., Click to enlarge photo of CONCEPT. More importantly,\namong informative descriptions, VLMs rely significantly on non-visual\nattributes like habitat to represent visual concepts. Also, our analysis\nreveals that different VLMs prioritize different attributes in their\nrepresentations. Overall, we show that VLMs do not simply match images to scene\ndescriptions and that non-visual or even spurious descriptions significantly\ninfluence their representations.\n","authors":["Reza Esfandiarpoor","Cristina Menghini","Stephen H. Bach"],"pdf_url":"https://arxiv.org/pdf/2403.16442v1.pdf","comment":"Code: https://github.com/BatsResearch/ex2"},{"id":"http://arxiv.org/abs/2310.14566v5","updated":"2024-03-25T06:05:24Z","published":"2023-10-23T04:49:09Z","title":"HallusionBench: An Advanced Diagnostic Suite for Entangled Language\n Hallucination and Visual Illusion in Large Vision-Language Models","summary":" We introduce HallusionBench, a comprehensive benchmark designed for the\nevaluation of image-context reasoning. This benchmark presents significant\nchallenges to advanced large visual-language models (LVLMs), such as\nGPT-4V(Vision), Gemini Pro Vision, Claude 3, and LLaVA-1.5, by emphasizing\nnuanced understanding and interpretation of visual data. The benchmark\ncomprises 346 images paired with 1129 questions, all meticulously crafted by\nhuman experts. We introduce a novel structure for these visual questions\ndesigned to establish control groups. This structure enables us to conduct a\nquantitative analysis of the models' response tendencies, logical consistency,\nand various failure modes. In our evaluation on HallusionBench, we benchmarked\n15 different models, highlighting a 31.42% question-pair accuracy achieved by\nthe state-of-the-art GPT-4V. Notably, all other evaluated models achieve\naccuracy below 16%. Moreover, our analysis not only highlights the observed\nfailure modes, including language hallucination and visual illusion, but also\ndeepens an understanding of these pitfalls. Our comprehensive case studies\nwithin HallusionBench shed light on the challenges of hallucination and\nillusion in LVLMs. 
Based on these insights, we suggest potential pathways for\ntheir future improvement. The benchmark and codebase can be accessed at\nhttps://github.com/tianyi-lab/HallusionBench.\n","authors":["Tianrui Guan","Fuxiao Liu","Xiyang Wu","Ruiqi Xian","Zongxia Li","Xiaoyu Liu","Xijun Wang","Lichang Chen","Furong Huang","Yaser Yacoob","Dinesh Manocha","Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.14566v5.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16440v1","updated":"2024-03-25T06:02:05Z","published":"2024-03-25T06:02:05Z","title":"RCBEVDet: Radar-camera Fusion in Bird's Eye View for 3D Object Detection","summary":" Three-dimensional object detection is one of the key tasks in autonomous\ndriving. To reduce costs in practice, low-cost multi-view cameras for 3D object\ndetection are proposed to replace the expensive LiDAR sensors. However, relying\nsolely on cameras makes it difficult to achieve highly accurate and robust 3D object\ndetection. An effective solution to this issue is combining multi-view cameras\nwith the economical millimeter-wave radar sensor to achieve more reliable\nmulti-modal 3D object detection. In this paper, we introduce RCBEVDet, a\nradar-camera fusion 3D object detection method in the bird's eye view (BEV).\nSpecifically, we first design RadarBEVNet for radar BEV feature extraction.\nRadarBEVNet consists of a dual-stream radar backbone and a Radar Cross-Section\n(RCS) aware BEV encoder. In the dual-stream radar backbone, a point-based\nencoder and a transformer-based encoder are proposed to extract radar features,\nwith an injection and extraction module to facilitate communication between the\ntwo encoders. The RCS-aware BEV encoder takes RCS as the object size prior to\nscattering the point feature in BEV. Besides, we present the Cross-Attention\nMulti-layer Fusion module to automatically align the multi-modal BEV feature\nfrom radar and camera with the deformable attention mechanism, and then fuse\nthe feature with channel and spatial fusion layers. Experimental results show\nthat RCBEVDet achieves new state-of-the-art radar-camera fusion results on\nnuScenes and view-of-delft (VoD) 3D object detection benchmarks. Furthermore,\nRCBEVDet achieves better 3D detection results than all real-time camera-only\nand radar-camera 3D object detectors with a faster inference speed at 21~28\nFPS. The source code will be released at https://github.com/VDIGPKU/RCBEVDet.\n","authors":["Zhiwei Lin","Zhe Liu","Zhongyu Xia","Xinhao Wang","Yongtao Wang","Shengxiang Qi","Yang Dong","Nan Dong","Le Zhang","Ce Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.16440v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.16439v1","updated":"2024-03-25T05:58:33Z","published":"2024-03-25T05:58:33Z","title":"Producing and Leveraging Online Map Uncertainty in Trajectory Prediction","summary":" High-definition (HD) maps have played an integral role in the development of\nmodern autonomous vehicle (AV) stacks, albeit with high associated labeling and\nmaintenance costs. As a result, many recent works have proposed methods for\nestimating HD maps online from sensor data, enabling AVs to operate outside of\npreviously-mapped regions. However, current online map estimation approaches\nare developed in isolation from their downstream tasks, complicating their\nintegration in AV stacks. In particular, they do not produce uncertainty or\nconfidence estimates. 
In this work, we extend multiple state-of-the-art online\nmap estimation methods to additionally estimate uncertainty and show how this\nenables more tightly integrating online mapping with trajectory forecasting. In\ndoing so, we find that incorporating uncertainty yields up to 50% faster\ntraining convergence and up to 15% better prediction performance on the\nreal-world nuScenes driving dataset.\n","authors":["Xunjiang Gu","Guanyu Song","Igor Gilitschenski","Marco Pavone","Boris Ivanovic"],"pdf_url":"https://arxiv.org/pdf/2403.16439v1.pdf","comment":"14 pages, 14 figures, 6 tables. CVPR 2024"},{"id":"http://arxiv.org/abs/2403.07371v2","updated":"2024-03-25T05:48:28Z","published":"2024-03-12T07:15:29Z","title":"Time-Efficient and Identity-Consistent Virtual Try-On Using A Variant of\n Altered Diffusion Models","summary":" This study discusses the critical issues of Virtual Try-On in contemporary\ne-commerce and the prospective metaverse, emphasizing the challenges of\npreserving intricate texture details and distinctive features of the target\nperson and the clothes in various scenarios, such as clothing texture and\nidentity characteristics like tattoos or accessories. In addition to the\nfidelity of the synthesized images, the efficiency of the synthesis process\npresents a significant hurdle. Various existing approaches are explored,\nhighlighting the limitations and unresolved aspects, e.g., identity information\nomission, uncontrollable artifacts, and low synthesis speed. It then proposes a\nnovel diffusion-based solution that addresses garment texture preservation and\nuser identity retention during virtual try-on. The proposed network comprises\ntwo primary modules - a warping module aligning clothing with individual\nfeatures and a try-on module refining the attire and generating missing parts\nintegrated with a mask-aware post-processing technique ensuring the integrity\nof the individual's identity. It demonstrates impressive results, surpassing\nthe state-of-the-art in speed by nearly 20 times during inference, with\nsuperior fidelity in qualitative assessments. Quantitative evaluations confirm\ncomparable performance with the recent SOTA method on the VITON-HD and\nDresscode datasets.\n","authors":["Phuong Dam","Jihoon Jeong","Anh Tran","Daeyoung Kim"],"pdf_url":"https://arxiv.org/pdf/2403.07371v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16438v1","updated":"2024-03-25T05:46:06Z","published":"2024-03-25T05:46:06Z","title":"Real-time Neuron Segmentation for Voltage Imaging","summary":" In voltage imaging, where the membrane potentials of individual neurons are\nrecorded at hundreds to thousands of frames per second using fluorescence\nmicroscopy, data processing presents a challenge. Even a fraction of a minute\nof recording with a limited image size yields gigabytes of video data\nconsisting of tens of thousands of frames, which can be time-consuming to\nprocess. Moreover, millisecond-level short exposures lead to noisy video\nframes, obscuring neuron footprints especially in deep-brain samples where\nnoisy signals are buried in background fluorescence. To address this challenge,\nwe propose a fast neuron segmentation method able to detect multiple,\npotentially overlapping, spiking neurons from noisy video frames, and implement\na data processing pipeline incorporating the proposed segmentation method along\nwith GPU-accelerated motion correction. 
By testing on existing datasets as well\nas on new datasets we introduce, we show that our pipeline extracts neuron\nfootprints that agree well with human annotation even from cluttered datasets,\nand demonstrate real-time processing of voltage imaging data on a single\ndesktop computer for the first time.\n","authors":["Yosuke Bando","Ramdas Pillai","Atsushi Kajita","Farhan Abdul Hakeem","Yves Quemener","Hua-an Tseng","Kiryl D. Piatkevich","Changyang Linghu","Xue Han","Edward S. Boyden"],"pdf_url":"https://arxiv.org/pdf/2403.16438v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06199v4","updated":"2024-03-25T05:36:56Z","published":"2024-03-10T12:43:27Z","title":"Mipha: A Comprehensive Overhaul of Multimodal Assistant with Small\n Language Models","summary":" Multimodal Large Language Models (MLLMs) have showcased impressive skills in\ntasks related to visual understanding and reasoning. Yet, their widespread\napplication faces obstacles due to the high computational demands during both\nthe training and inference phases, restricting their use to a limited audience\nwithin the research and user communities. In this paper, we investigate the\ndesign aspects of Multimodal Small Language Models (MSLMs) and propose an\nefficient multimodal assistant named Mipha, which is designed to create synergy\namong various aspects: visual representation, language models, and optimization\nstrategies. We show that without increasing the volume of training data, our\nMipha-3B outperforms the state-of-the-art large MLLMs, especially\nLLaVA-1.5-13B, on multiple benchmarks. Through detailed discussion, we provide\ninsights and guidelines for developing strong MSLMs that rival the capabilities\nof MLLMs. Our code is available at https://github.com/zhuyiche/llava-phi.\n","authors":["Minjie Zhu","Yichen Zhu","Xin Liu","Ning Liu","Zhiyuan Xu","Chaomin Shen","Yaxin Peng","Zhicai Ou","Feifei Feng","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2403.06199v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18287v2","updated":"2024-03-25T05:34:58Z","published":"2023-11-30T06:45:52Z","title":"Dispersed Structured Light for Hyperspectral 3D Imaging","summary":" Hyperspectral 3D imaging aims to acquire both depth and spectral information\nof a scene. However, existing methods are either prohibitively expensive and\nbulky or compromise on spectral and depth accuracy. In this work, we present\nDispersed Structured Light (DSL), a cost-effective and compact method for\naccurate hyperspectral 3D imaging. DSL modifies a traditional projector-camera\nsystem by placing a sub-millimeter thick diffraction grating film in front of the\nprojector. The grating disperses structured light based on light wavelength. To\nutilize the dispersed structured light, we devise a model for dispersive\nprojection image formation and a per-pixel hyperspectral 3D reconstruction\nmethod. We validate DSL by instantiating a compact experimental prototype. DSL\nachieves spectral accuracy of 18.8nm full-width half-maximum (FWHM) and depth\nerror of 1mm. We demonstrate that DSL outperforms prior work on practical\nhyperspectral 3D imaging. 
DSL promises accurate and practical hyperspectral 3D\nimaging for diverse application domains, including computer vision and\ngraphics, cultural heritage, geology, and biology.\n","authors":["Suhyun Shin","Seokjun Choi","Felix Heide","Seung-Hwan Baek"],"pdf_url":"https://arxiv.org/pdf/2311.18287v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16431v1","updated":"2024-03-25T05:22:34Z","published":"2024-03-25T05:22:34Z","title":"DOCTR: Disentangled Object-Centric Transformer for Point Scene\n Understanding","summary":" Point scene understanding is a challenging task to process real-world scene\npoint cloud, which aims at segmenting each object, estimating its pose, and\nreconstructing its mesh simultaneously. Recent state-of-the-art method first\nsegments each object and then processes them independently with multiple stages\nfor the different sub-tasks. This leads to a complex pipeline to optimize and\nmakes it hard to leverage the relationship constraints between multiple\nobjects. In this work, we propose a novel Disentangled Object-Centric\nTRansformer (DOCTR) that explores object-centric representation to facilitate\nlearning with multiple objects for the multiple sub-tasks in a unified manner.\nEach object is represented as a query, and a Transformer decoder is adapted to\niteratively optimize all the queries involving their relationship. In\nparticular, we introduce a semantic-geometry disentangled query (SGDQ) design\nthat enables the query features to attend separately to semantic information\nand geometric information relevant to the corresponding sub-tasks. A hybrid\nbipartite matching module is employed to well use the supervisions from all the\nsub-tasks during training. Qualitative and quantitative experimental results\ndemonstrate that our method achieves state-of-the-art performance on the\nchallenging ScanNet dataset. Code is available at\nhttps://github.com/SAITPublic/DOCTR.\n","authors":["Xiaoxuan Yu","Hao Wang","Weiming Li","Qiang Wang","Soonyong Cho","Younghun Sung"],"pdf_url":"https://arxiv.org/pdf/2403.16431v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13964v3","updated":"2024-03-25T05:18:04Z","published":"2023-12-21T15:51:12Z","title":"PIA: Your Personalized Image Animator via Plug-and-Play Modules in\n Text-to-Image Models","summary":" Recent advancements in personalized text-to-image (T2I) models have\nrevolutionized content creation, empowering non-experts to generate stunning\nimages with unique styles. While promising, adding realistic motions into these\npersonalized images by text poses significant challenges in preserving distinct\nstyles, high-fidelity details, and achieving motion controllability by text. In\nthis paper, we present PIA, a Personalized Image Animator that excels in\naligning with condition images, achieving motion controllability by text, and\nthe compatibility with various personalized T2I models without specific tuning.\nTo achieve these goals, PIA builds upon a base T2I model with well-trained\ntemporal alignment layers, allowing for the seamless transformation of any\npersonalized T2I model into an image animation model. A key component of PIA is\nthe introduction of the condition module, which utilizes the condition frame\nand inter-frame affinity as input to transfer appearance information guided by\nthe affinity hint for individual frame synthesis in the latent space. 
This\ndesign mitigates the challenges of appearance-related image alignment within\nand allows for a stronger focus on aligning with motion-related guidance.\n","authors":["Yiming Zhang","Zhening Xing","Yanhong Zeng","Youqing Fang","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2312.13964v3.pdf","comment":"Project page: https://pi-animator.github.io/"},{"id":"http://arxiv.org/abs/2403.16428v1","updated":"2024-03-25T05:12:21Z","published":"2024-03-25T05:12:21Z","title":"Benchmarks and Challenges in Pose Estimation for Egocentric Hand\n Interactions with Objects","summary":" We interact with the world with our hands and see it through our own\n(egocentric) perspective. A holistic 3D understanding of such interactions from\negocentric views is important for tasks in robotics, AR/VR, action recognition\nand motion generation. Accurately reconstructing such interactions in 3D is\nchallenging due to heavy occlusion, viewpoint bias, camera distortion, and\nmotion blur from the head movement. To this end, we designed the HANDS23\nchallenge based on the AssemblyHands and ARCTIC datasets with carefully\ndesigned training and testing splits. Based on the results of the top submitted\nmethods and more recent baselines on the leaderboards, we perform a thorough\nanalysis on 3D hand(-object) reconstruction tasks. Our analysis demonstrates\nthe effectiveness of addressing distortion specific to egocentric cameras,\nadopting high-capacity transformers to learn complex hand-object interactions,\nand fusing predictions from different views. Our study further reveals\nchallenging scenarios intractable with state-of-the-art methods, such as fast\nhand motion, object reconstruction from narrow egocentric views, and close\ncontact between two hands and objects. Our efforts will enrich the community's\nknowledge foundation and facilitate future hand studies on egocentric\nhand-object interactions.\n","authors":["Zicong Fan","Takehiko Ohkawa","Linlin Yang","Nie Lin","Zhishan Zhou","Shihao Zhou","Jiajun Liang","Zhong Gao","Xuanyang Zhang","Xue Zhang","Fei Li","Liu Zheng","Feng Lu","Karim Abou Zeid","Bastian Leibe","Jeongwan On","Seungryul Baek","Aditya Prakash","Saurabh Gupta","Kun He","Yoichi Sato","Otmar Hilliges","Hyung Jin Chang","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2403.16428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16425v1","updated":"2024-03-25T05:10:34Z","published":"2024-03-25T05:10:34Z","title":"Enhancing Visual Place Recognition via Fast and Slow Adaptive Biasing in\n Event Cameras","summary":" Event cameras are increasingly popular in robotics due to their beneficial\nfeatures, such as low latency, energy efficiency, and high dynamic range.\nNevertheless, their downstream task performance is greatly influenced by the\noptimization of bias parameters. These parameters, for instance, regulate the\nnecessary change in light intensity to trigger an event, which in turn depends\non factors such as the environment lighting and camera motion. This paper\nintroduces feedback control algorithms that automatically tune the bias\nparameters through two interacting methods: 1) An immediate, on-the-fly fast\nadaptation of the refractory period, which sets the minimum interval between\nconsecutive events, and 2) if the event rate exceeds the specified bounds even\nafter changing the refractory period repeatedly, the controller adapts the\npixel bandwidth and event thresholds, which stabilizes after a short period of\nnoise events across all pixels (slow adaptation). 
Our evaluation focuses on the\nvisual place recognition task, where incoming query images are compared to a\ngiven reference database. We conducted comprehensive evaluations of our\nalgorithms' adaptive feedback control in real-time. To do so, we collected the\nQCR-Fast-and-Slow dataset that contains DAVIS346 event camera streams from 366\nrepeated traversals of a Scout Mini robot navigating through a 100 meter long\nindoor lab setting (totaling over 35km distance traveled) in varying brightness\nconditions with ground truth location information. Our proposed feedback\ncontrollers result in superior performance when compared to the standard bias\nsettings and prior feedback control methods. Our findings also detail the\nimpact of bias adjustments on task performance and feature ablation studies on\nthe fast and slow adaptation mechanisms.\n","authors":["Gokul B. Nair","Michael Milford","Tobias Fischer"],"pdf_url":"https://arxiv.org/pdf/2403.16425v1.pdf","comment":"8 pages, 9 figures, paper under review"},{"id":"http://arxiv.org/abs/2312.03009v2","updated":"2024-03-25T05:04:04Z","published":"2023-12-04T19:01:19Z","title":"I-PHYRE: Interactive Physical Reasoning","summary":" Current evaluation protocols predominantly assess physical reasoning in\nstationary scenes, creating a gap in evaluating agents' abilities to interact\nwith dynamic events. While contemporary methods allow agents to modify initial\nscene configurations and observe consequences, they lack the capability to\ninteract with events in real time. To address this, we introduce I-PHYRE, a\nframework that challenges agents to simultaneously exhibit intuitive physical\nreasoning, multi-step planning, and in-situ intervention. Here, intuitive\nphysical reasoning refers to a quick, approximate understanding of physics to\naddress complex problems; multi-step denotes the need for extensive sequence\nplanning in I-PHYRE, considering each intervention can significantly alter\nsubsequent choices; and in-situ implies the necessity for timely object\nmanipulation within a scene, where minor timing deviations can result in task\nfailure. We formulate four game splits to scrutinize agents' learning and\ngeneralization of essential principles of interactive physical reasoning,\nfostering learning through interaction with representative scenarios. Our\nexploration involves three planning strategies and examines several supervised\nand reinforcement agents' zero-shot generalization proficiency on I-PHYRE. The\noutcomes highlight a notable gap between existing learning algorithms and human\nperformance, emphasizing the imperative for more research in enhancing agents\nwith interactive physical reasoning capabilities. The environment and baselines\nwill be made publicly available.\n","authors":["Shiqian Li","Kewen Wu","Chi Zhang","Yixin Zhu"],"pdf_url":"https://arxiv.org/pdf/2312.03009v2.pdf","comment":"21 pages, ICLR 2024"},{"id":"http://arxiv.org/abs/2403.16422v1","updated":"2024-03-25T04:54:49Z","published":"2024-03-25T04:54:49Z","title":"Refining Text-to-Image Generation: Towards Accurate Training-Free\n Glyph-Enhanced Image Generation","summary":" Over the past few years, Text-to-Image (T2I) generation approaches based on\ndiffusion models have gained significant attention. However, vanilla diffusion\nmodels often suffer from spelling inaccuracies in the text displayed within the\ngenerated images. The capability to generate visual text is crucial, offering\nboth academic interest and a wide range of practical applications. 
To produce\naccurate visual text images, state-of-the-art techniques adopt a\nglyph-controlled image generation approach, consisting of a text layout\ngenerator followed by an image generator that is conditioned on the generated\ntext layout. Nevertheless, our study reveals that these models still face three\nprimary challenges, prompting us to develop a testbed to facilitate future\nresearch. We introduce a benchmark, LenCom-Eval, specifically designed for\ntesting models' capability in generating images with Lengthy and Complex visual\ntext. Subsequently, we introduce a training-free framework to enhance the\ntwo-stage generation approaches. We examine the effectiveness of our approach\non both LenCom-Eval and MARIO-Eval benchmarks and demonstrate notable\nimprovements across a range of evaluation metrics, including CLIPScore, OCR\nprecision, recall, F1 score, accuracy, and edit distance scores. For instance,\nour proposed framework improves the backbone model, TextDiffuser, by more than\n23\\% and 13.5\\% in terms of OCR word F1 on LenCom-Eval and MARIO-Eval,\nrespectively. Our work makes a unique contribution to the field by focusing on\ngenerating images with long and rare text sequences, a niche previously\nunexplored by existing literature\n","authors":["Sanyam Lakhanpal","Shivang Chopra","Vinija Jain","Aman Chadha","Man Luo"],"pdf_url":"https://arxiv.org/pdf/2403.16422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03173v3","updated":"2024-03-25T04:42:22Z","published":"2024-03-05T18:08:29Z","title":"Solving the bongard-logo problem by modeling a probabilistic model","summary":" Abstract reasoning problems challenge the perceptual and cognitive abilities\nof AI algorithms, demanding deeper pattern discernment and inductive reasoning\nbeyond explicit image features. This study introduces PMoC, a tailored\nprobability model for the Bongard-Logo problem, achieving high reasoning\naccuracy by constructing independent probability models. Additionally, we\npresent Pose-Transformer, an enhanced Transformer-Encoder designed for complex\nabstract reasoning tasks, including Bongard-Logo, RAVEN, I-RAVEN, and PGM.\nPose-Transformer incorporates positional information learning, inspired by\ncapsule networks' pose matrices, enhancing its focus on local positional\nrelationships in image data processing. When integrated with PMoC, it further\nimproves reasoning accuracy. Our approach effectively addresses reasoning\ndifficulties associated with abstract entities' positional changes,\noutperforming previous models on the OIG, D3$\\times$3 subsets of RAVEN, and PGM\ndatabases. This research contributes to advancing AI's capabilities in abstract\nreasoning and cognitive pattern recognition.\n","authors":["Ruizhuo Song","Beiming Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.03173v3.pdf","comment":"14 pages, 11 figures, 3 tables"},{"id":"http://arxiv.org/abs/2403.03190v4","updated":"2024-03-25T04:40:39Z","published":"2024-03-05T18:29:17Z","title":"Triple-CFN: Restructuring Conceptual Spaces for Enhancing Abstract\n Reasoning process","summary":" Abstract reasoning problems pose significant challenges to artificial\nintelligence algorithms, demanding cognitive capabilities beyond those required\nfor perception tasks. This study introduces the Triple-CFN approach to tackle\nthe Bongard-Logo problem, achieving notable reasoning accuracy by implicitly\nreorganizing the concept space of conflicting instances. 
Additionally, the\nTriple-CFN paradigm proves effective for the RPM problem with necessary\nmodifications, yielding competitive results. To further enhance performance on\nthe RPM issue, we develop the Meta Triple-CFN network, which explicitly\nstructures the problem space while maintaining interpretability on progressive\npatterns. The success of Meta Triple-CFN is attributed to its paradigm of\nmodeling the conceptual space, equivalent to normalizing reasoning information.\nBased on this ideology, we introduce the Re-space layer, enhancing the\nperformance of both Meta Triple-CFN and Triple-CFN. This paper aims to\ncontribute to advancements in machine intelligence by exploring innovative\nnetwork designs for addressing abstract reasoning problems, paving the way for\nfurther breakthroughs in this domain.\n","authors":["Ruizhuo Song","Beiming Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.03190v4.pdf","comment":"14 pages, 14 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.03452v4","updated":"2024-03-25T04:38:42Z","published":"2024-03-06T04:36:43Z","title":"D4C glove-train: solving the RPM and Bongard-logo problem by\n distributing and Circumscribing concepts","summary":" This paper achieves noteworthy progress in the realm of abstract reasoning,\nparticularly in addressing Raven's Progressive Matrices (RPM) and Bongard-Logo\nchallenges. Initially, we introduce Lico-Net, a novel baseline model that\nresolves RPM problems with remarkable accuracy. Leveraging this foundation, we\nadvance with the D3C approach, which advocates representing the underlying\nconcepts in abstract reasoning problems through distributions. This perspective\nenhances the performance of both Lico-Net and a baseline model excelling in\nBongard-Logo tasks. To bolster the computational efficiency of D3C, we present\nthe D3C-cos variant, offering a streamlined yet precise solution. Furthermore,\nwe propose the D2C method, redefining conceptual boundaries within these\ndomains and bridging the divide between high-level abstractions and their\nlower-dimensional counterparts. Finally, we extend our methodology to D4C,\nemploying adversarial techniques to refine conceptual boundaries further and\ndemonstrate substantial improvements in both RPM and Bongard-Logo challenges.\nOverall, our contributions present a fresh outlook and practical advancements\nin the field of abstract reasoning.\n","authors":["Ruizhuo Song","Beiming Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.03452v4.pdf","comment":"18 pages, 19 figures, 6 tables"},{"id":"http://arxiv.org/abs/2403.16412v1","updated":"2024-03-25T04:14:07Z","published":"2024-03-25T04:14:07Z","title":"Unsupervised Template-assisted Point Cloud Shape Correspondence Network","summary":" Unsupervised point cloud shape correspondence aims to establish point-wise\ncorrespondences between source and target point clouds. Existing methods obtain\ncorrespondences directly by computing point-wise feature similarity between\npoint clouds. However, non-rigid objects possess strong deformability and\nunusual shapes, making it a longstanding challenge to directly establish\ncorrespondences between point clouds with unconventional shapes. To address\nthis challenge, we propose an unsupervised Template-Assisted point cloud shape\ncorrespondence Network, termed TANet, including a template generation module\nand a template assistance module. The proposed TANet enjoys several merits.\nFirstly, the template generation module establishes a set of learnable\ntemplates with explicit structures. 
Secondly, we introduce a template\nassistance module that extensively leverages the generated templates to\nestablish more accurate shape correspondences from multiple perspectives.\nExtensive experiments on four human and animal datasets demonstrate that TANet\nachieves favorable performance against state-of-the-art methods.\n","authors":["Jiacheng Deng","Jiahao Lu","Tianzhu Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.16412v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2403.16410v1","updated":"2024-03-25T04:05:23Z","published":"2024-03-25T04:05:23Z","title":"Spike-NeRF: Neural Radiance Field Based On Spike Camera","summary":" As a neuromorphic sensor with high temporal resolution, spike cameras offer\nnotable advantages over traditional cameras in high-speed vision applications\nsuch as high-speed optical estimation, depth estimation, and object tracking.\nInspired by the success of the spike camera, we propose Spike-NeRF, the first\nNeural Radiance Field derived from spike data, to achieve 3D reconstruction and\nnovel viewpoint synthesis of high-speed scenes. Instead of the multi-view\nimages captured at the same time as in NeRF, the inputs of Spike-NeRF are continuous spike\nstreams captured by a moving spike camera within a very short time. To reconstruct\na correct and stable 3D scene from high-frequency but unstable spike data, we\ndevised spike masks along with a distinctive loss function. We evaluate our\nmethod qualitatively and numerically on several challenging synthetic scenes\ngenerated by Blender with the spike camera simulator. Our results demonstrate\nthat Spike-NeRF produces more visually appealing results than the existing\nmethods and the baseline we proposed in high-speed scenes. Our code and data\nwill be released soon.\n","authors":["Yijia Guo","Yuanxi Bai","Liwen Hu","Mianzhi Liu","Ziyi Guo","Lei Ma","Tiejun Huang"],"pdf_url":"https://arxiv.org/pdf/2403.16410v1.pdf","comment":"This paper is accepted by ICME2024"},{"id":"http://arxiv.org/abs/2403.16407v1","updated":"2024-03-25T03:47:53Z","published":"2024-03-25T03:47:53Z","title":"A Survey on Long Video Generation: Challenges, Methods, and Prospects","summary":" Video generation is a rapidly advancing research area, garnering significant\nattention due to its broad range of applications. One critical aspect of this\nfield is the generation of long-duration videos, which presents unique\nchallenges and opportunities. This paper presents the first survey of recent\nadvancements in long video generation and summarises them into two key\nparadigms: divide and conquer, and temporal autoregressive.\n We delve into the common models employed in each paradigm, including aspects\nof network design and conditioning techniques. Furthermore, we offer a\ncomprehensive overview and classification of the datasets and evaluation\nmetrics which are crucial for advancing long video generation research.\nConcluding with a summary of existing studies, we also discuss the emerging\nchallenges and future directions in this dynamic field. 
We hope that this\nsurvey will serve as an essential reference for researchers and practitioners\nin the realm of long video generation.\n","authors":["Chengxuan Li","Di Huang","Zeyu Lu","Yang Xiao","Qingqi Pei","Lei Bai"],"pdf_url":"https://arxiv.org/pdf/2403.16407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16405v1","updated":"2024-03-25T03:44:36Z","published":"2024-03-25T03:44:36Z","title":"Ensemble Adversarial Defense via Integration of Multiple Dispersed Low\n Curvature Models","summary":" The integration of an ensemble of deep learning models has been extensively\nexplored to enhance defense against adversarial attacks. The diversity among\nsub-models increases the attack cost required to deceive the majority of the\nensemble, thereby improving the adversarial robustness. While existing\napproaches mainly center on increasing diversity in feature representations or\ndispersion of first-order gradients with respect to input, the limited\ncorrelation between these diversity metrics and adversarial robustness\nconstrains the performance of ensemble adversarial defense. In this work, we\naim to enhance ensemble diversity by reducing attack transferability. We\nidentify second-order gradients, which depict the loss curvature, as a key\nfactor in adversarial robustness. Computing the Hessian matrix involved in\nsecond-order gradients is computationally expensive. To address this, we\napproximate the Hessian-vector product using differential approximation. Given\nthat low curvature provides better robustness, our ensemble model was designed\nto consider the influence of curvature among different sub-models. We introduce\na novel regularizer to train multiple more-diverse low-curvature network\nmodels. Extensive experiments across various datasets demonstrate that our\nensemble model exhibits superior robustness against a range of attacks,\nunderscoring the effectiveness of our approach.\n","authors":["Kaikang Zhao","Xi Chen","Wei Huang","Liuxin Ding","Xianglong Kong","Fan Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.16405v1.pdf","comment":"Accepted to The 2024 International Joint Conference on Neural\n Networks (IJCNN)"},{"id":"http://arxiv.org/abs/2304.06928v2","updated":"2024-03-25T03:40:19Z","published":"2023-04-14T05:25:52Z","title":"CiPR: An Efficient Framework with Cross-instance Positive Relations for\n Generalized Category Discovery","summary":" We tackle the issue of generalized category discovery (GCD). GCD considers\nthe open-world problem of automatically clustering a partially labelled\ndataset, in which the unlabelled data may contain instances from both novel\ncategories and labelled classes. In this paper, we address the GCD problem with\nan unknown category number for the unlabelled data. We propose a framework,\nnamed CiPR, to bootstrap the representation by exploiting Cross-instance\nPositive Relations in the partially labelled data for contrastive learning,\nwhich have been neglected in existing methods. To obtain reliable\ncross-instance relations to facilitate representation learning, we introduce a\nsemi-supervised hierarchical clustering algorithm, named selective neighbor\nclustering (SNC), which can produce a clustering hierarchy directly from the\nconnected components of a graph constructed from selective neighbors. 
We\nfurther present a method to estimate the unknown class number using SNC with a\njoint reference score that considers clustering indexes of both labelled and\nunlabelled data, and extend SNC to allow label assignment for the unlabelled\ninstances with a given class number. We thoroughly evaluate our framework on\npublic generic image recognition datasets and challenging fine-grained\ndatasets, and establish a new state-of-the-art. Code:\nhttps://github.com/haoosz/CiPR\n","authors":["Shaozhe Hao","Kai Han","Kwan-Yee K. Wong"],"pdf_url":"https://arxiv.org/pdf/2304.06928v2.pdf","comment":"Accepted to TMLR. Code: https://github.com/haoosz/CiPR"},{"id":"http://arxiv.org/abs/2311.13614v2","updated":"2024-03-25T03:39:45Z","published":"2023-11-22T04:52:58Z","title":"HalluciDoctor: Mitigating Hallucinatory Toxicity in Visual Instruction\n Data","summary":" Multi-modal Large Language Models (MLLMs) tuned on machine-generated\ninstruction-following data have demonstrated remarkable performance in various\nmulti-modal understanding and generation tasks. However, the hallucinations\ninherent in machine-generated data, which could lead to hallucinatory outputs\nin MLLMs, remain under-explored. This work aims to investigate various\nhallucinations (i.e., object, relation, attribute hallucinations) and mitigate\nthose hallucinatory toxicities in large-scale machine-generated visual\ninstruction datasets. Drawing on the human ability to identify factual errors,\nwe present a novel hallucination detection and elimination framework,\nHalluciDoctor, based on the cross-checking paradigm. We use our framework to\nidentify and eliminate hallucinations in the training data automatically.\nInterestingly, HalluciDoctor also indicates that spurious correlations arising\nfrom long-tail object co-occurrences contribute to hallucinations. Based on\nthat, we execute counterfactual visual instruction expansion to balance data\ndistribution, thereby enhancing MLLMs' resistance to hallucinations.\nComprehensive experiments on hallucination evaluation benchmarks show that our\nmethod successfully mitigates 44.6% hallucinations relatively and maintains\ncompetitive performance compared to LLaVA. The data and code for this paper are\npublicly available. \\url{https://github.com/Yuqifan1117/HalluciDoctor}.\n","authors":["Qifan Yu","Juncheng Li","Longhui Wei","Liang Pang","Wentao Ye","Bosheng Qin","Siliang Tang","Qi Tian","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2311.13614v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16400v1","updated":"2024-03-25T03:30:37Z","published":"2024-03-25T03:30:37Z","title":"ASDF: Assembly State Detection Utilizing Late Fusion by Integrating 6D\n Pose Estimation","summary":" In medical and industrial domains, providing guidance for assembly processes\nis critical to ensure efficiency and safety. Errors in assembly can lead to\nsignificant consequences such as extended surgery times, and prolonged\nmanufacturing or maintenance times in industry. Assembly scenarios can benefit\nfrom in-situ AR visualization to provide guidance, reduce assembly times and\nminimize errors. To enable in-situ visualization 6D pose estimation can be\nleveraged. Existing 6D pose estimation techniques primarily focus on individual\nobjects and static captures. However, assembly scenarios have various dynamics\nincluding occlusion during assembly and dynamics in the assembly objects\nappearance. 
Existing work, combining object detection/6D pose estimation and\nassembly state detection focuses either on pure deep learning-based approaches,\nor limit the assembly state detection to building blocks. To address the\nchallenges of 6D pose estimation in combination with assembly state detection,\nour approach ASDF builds upon the strengths of YOLOv8, a real-time capable\nobject detection framework. We extend this framework, refine the object pose\nand fuse pose knowledge with network-detected pose information. Utilizing our\nlate fusion in our Pose2State module results in refined 6D pose estimation and\nassembly state detection. By combining both pose and state information, our\nPose2State module predicts the final assembly state with precision. Our\nevaluation on our ASDF dataset shows that our Pose2State module leads to an\nimproved assembly state detection and that the improvement of the assembly\nstate further leads to a more robust 6D pose estimation. Moreover, on the GBOT\ndataset, we outperform the pure deep learning-based network, and even\noutperform the hybrid and pure tracking-based approaches.\n","authors":["Hannah Schieber","Shiyu Li","Niklas Corell","Philipp Beckerle","Julian Kreimeier","Daniel Roth"],"pdf_url":"https://arxiv.org/pdf/2403.16400v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17460v3","updated":"2024-03-25T03:21:39Z","published":"2023-11-29T09:02:07Z","title":"W-HMR: Human Mesh Recovery in World Space with Weak-supervised Camera\n Calibration and Orientation Correction","summary":" For a long time, in reconstructing 3D human bodies from monocular images,\nmost methods opted to simplify the task by minimizing the influence of the\ncamera. Using a coarse focal length setting results in the reconstructed bodies\nnot aligning well with distorted images. Ignoring camera rotation leads to an\nunrealistic reconstructed body pose in world space. Consequently, the\napplication scenarios of existing methods are confined to controlled\nenvironments. When confronted with complex and diverse in-the-wild images, they\nstruggle to achieve accurate and reasonable reconstruction in world space. To\naddress the above issues, we propose W-HMR, which decouples global body\nrecovery into camera calibration, local body recovery, and global body\norientation correction. We design the first weak-supervised camera calibration\nmethod for body distortion, eliminating dependence on focal length labels and\nachieving finer mesh-image alignment. We propose a novel orientation correction\nmodule to allow the reconstructed human body to remain normal in world space.\nDecoupling body orientation and body pose enables our model to consider the\naccuracy in camera coordinate and the reasonableness in world coordinate\nsimultaneously, expanding the range of applications. As a result, W-HMR\nachieves high-quality reconstruction in dual coordinate systems, particularly\nin challenging scenes. Codes and demos have been released on the project page\nhttps://yw0208.github.io/w-hmr/.\n","authors":["Wei Yao","Hongwen Zhang","Yunlian Sun","Jinhui Tang"],"pdf_url":"https://arxiv.org/pdf/2311.17460v3.pdf","comment":"Project Page: https://yw0208.github.io/w-hmr/"},{"id":"http://arxiv.org/abs/2403.16395v1","updated":"2024-03-25T03:18:58Z","published":"2024-03-25T03:18:58Z","title":"Multi-attention Associate Prediction Network for Visual Tracking","summary":" Classification-regression prediction networks have realized impressive\nsuccess in several modern deep trackers. 
However, there is an inherent\ndifference between classification and regression tasks, so they have diverse\neven opposite demands for feature matching. Existed models always ignore the\nkey issue and only employ a unified matching block in two task branches,\ndecaying the decision quality. Besides, these models also struggle with\ndecision misalignment situation. In this paper, we propose a multi-attention\nassociate prediction network (MAPNet) to tackle the above problems. Concretely,\ntwo novel matchers, i.e., category-aware matcher and spatial-aware matcher, are\nfirst designed for feature comparison by integrating self, cross, channel or\nspatial attentions organically. They are capable of fully capturing the\ncategory-related semantics for classification and the local spatial contexts\nfor regression, respectively. Then, we present a dual alignment module to\nenhance the correspondences between two branches, which is useful to find the\noptimal tracking solution. Finally, we describe a Siamese tracker built upon\nthe proposed prediction network, which achieves the leading performance on five\ntracking benchmarks, consisting of LaSOT, TrackingNet, GOT-10k, TNL2k and\nUAV123, and surpasses other state-of-the-art approaches.\n","authors":["Xinglong Sun","Haijiang Sun","Shan Jiang","Jiacheng Wang","Xilai Wei","Zhonghe Hu"],"pdf_url":"https://arxiv.org/pdf/2403.16395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16387v1","updated":"2024-03-25T03:06:45Z","published":"2024-03-25T03:06:45Z","title":"Text-IF: Leveraging Semantic Text Guidance for Degradation-Aware and\n Interactive Image Fusion","summary":" Image fusion aims to combine information from different source images to\ncreate a comprehensively representative image. Existing fusion methods are\ntypically helpless in dealing with degradations in low-quality source images\nand non-interactive to multiple subjective and objective needs. To solve them,\nwe introduce a novel approach that leverages semantic text guidance image\nfusion model for degradation-aware and interactive image fusion task, termed as\nText-IF. It innovatively extends the classical image fusion to the text guided\nimage fusion along with the ability to harmoniously address the degradation and\ninteraction issues during fusion. Through the text semantic encoder and\nsemantic interaction fusion decoder, Text-IF is accessible to the all-in-one\ninfrared and visible image degradation-aware processing and the interactive\nflexible fusion outcomes. In this way, Text-IF achieves not only multi-modal\nimage fusion, but also multi-modal information fusion. Extensive experiments\nprove that our proposed text guided image fusion strategy has obvious\nadvantages over SOTA methods in the image fusion performance and degradation\ntreatment. The code is available at https://github.com/XunpengYi/Text-IF.\n","authors":["Xunpeng Yi","Han Xu","Hao Zhang","Linfeng Tang","Jiayi Ma"],"pdf_url":"https://arxiv.org/pdf/2403.16387v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.09065v3","updated":"2024-03-25T03:04:44Z","published":"2024-03-14T03:12:02Z","title":"When Semantic Segmentation Meets Frequency Aliasing","summary":" Despite recent advancements in semantic segmentation, where and what pixels\nare hard to segment remains largely unexplored. Existing research only\nseparates an image into easy and hard regions and empirically observes the\nlatter are associated with object boundaries. 
In this paper, we conduct a\ncomprehensive analysis of hard pixel errors, categorizing them into three\ntypes: false responses, merging mistakes, and displacements. Our findings\nreveal a quantitative association between hard pixels and aliasing, which is\ndistortion caused by the overlapping of frequency components in the Fourier\ndomain during downsampling. To identify the frequencies responsible for\naliasing, we propose using the equivalent sampling rate to calculate the\nNyquist frequency, which marks the threshold for aliasing. Then, we introduce\nthe aliasing score as a metric to quantify the extent of aliasing. While\npositively correlated with the proposed aliasing score, three types of hard\npixels exhibit different patterns. Here, we propose two novel de-aliasing\nfilter (DAF) and frequency mixing (FreqMix) modules to alleviate aliasing\ndegradation by accurately removing or adjusting frequencies higher than the\nNyquist frequency. The DAF precisely removes the frequencies responsible for\naliasing before downsampling, while the FreqMix dynamically selects\nhigh-frequency components within the encoder block. Experimental results\ndemonstrate consistent improvements in semantic segmentation and low-light\ninstance segmentation tasks. The code is available at:\nhttps://github.com/Linwei-Chen/Seg-Aliasing.\n","authors":["Linwei Chen","Lin Gu","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2403.09065v3.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2403.16386v1","updated":"2024-03-25T03:02:51Z","published":"2024-03-25T03:02:51Z","title":"Dia-LLaMA: Towards Large Language Model-driven CT Report Generation","summary":" Medical report generation has achieved remarkable advancements yet has still\nbeen faced with several challenges. First, the inherent imbalance in the\ndistribution of normal and abnormal cases may lead models to exhibit a biased\nfocus on normal samples, resulting in unreliable diagnoses. Second, the\nfrequent occurrence of common template sentences in the reports may overwhelm\nthe critical abnormal information. Moreover, existing works focus on 2D chest\nX-rays, leaving CT report generation underexplored due to the high-dimensional\nnature of CT images and the limited availability of CT-report pairs. Recently,\nLLM has shown a great ability to generate reliable answers with appropriate\nprompts, which shed light on addressing the aforementioned challenges. In this\npaper, we propose Dia-LLaMA, a framework to adapt the LLaMA2-7B for CT report\ngeneration by incorporating diagnostic information as guidance prompts.\nConsidering the high dimension of CT, we leverage a pre-trained ViT3D with\nperceiver to extract the visual information. To tailor the LLM for report\ngeneration and emphasize abnormality, we extract additional diagnostic\ninformation by referring to a disease prototype memory bank, which is updated\nduring training to capture common disease representations. Furthermore, we\nintroduce disease-aware attention to enable the model to adjust attention for\ndifferent diseases. Experiments on the chest CT dataset demonstrated that our\nproposed method outperformed previous methods and achieved state-of-the-art on\nboth clinical efficacy performance and natural language generation metrics. 
The\ncode will be made publically available.\n","authors":["Zhixuan Chen","Luyang Luo","Yequan Bie","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2403.16386v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2403.16385v1","updated":"2024-03-25T03:02:27Z","published":"2024-03-25T03:02:27Z","title":"Synthesize Step-by-Step: Tools, Templates and LLMs as Data Generators\n for Reasoning-Based Chart VQA","summary":" Understanding data visualizations like charts and plots requires reasoning\nabout both visual elements and numerics. Although strong in extractive\nquestions, current chart visual question answering (chart VQA) models suffer on\ncomplex reasoning questions. In this work, we address the lack of reasoning\nability by data augmentation. We leverage Large Language Models (LLMs), which\nhave shown to have strong reasoning ability, as an automatic data annotator\nthat generates question-answer annotations for chart images. The key innovation\nin our method lies in the Synthesize Step-by-Step strategy: our LLM-based data\ngenerator learns to decompose the complex question into step-by-step\nsub-questions (rationales), which are then used to derive the final answer\nusing external tools, i.e. Python. This step-wise generation procedure is\ntrained on synthetic data generated using a template-based QA generation\npipeline. Experimental results highlight the significance of the proposed\nstep-by-step generation. By training with the LLM-augmented data (LAMENDA), we\nsignificantly enhance the chart VQA models, achieving the state-of-the-art\naccuracy on the ChartQA and PlotQA datasets. In particular, our approach\nimproves the accuracy of the previous state-of-the-art approach from 38% to 54%\non the human-written questions in the ChartQA dataset, which needs strong\nreasoning. We hope our work underscores the potential of synthetic data and\nencourages further exploration of data augmentation using LLMs for\nreasoning-heavy tasks.\n","authors":["Li Zhuowan","Jasani Bhavan","Tang Peng","Ghadar Shabnam"],"pdf_url":"https://arxiv.org/pdf/2403.16385v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16384v1","updated":"2024-03-25T03:01:53Z","published":"2024-03-25T03:01:53Z","title":"Residual Dense Swin Transformer for Continuous Depth-Independent\n Ultrasound Imaging","summary":" Ultrasound imaging is crucial for evaluating organ morphology and function,\nyet depth adjustment can degrade image quality and field-of-view, presenting a\ndepth-dependent dilemma. Traditional interpolation-based zoom-in techniques\noften sacrifice detail and introduce artifacts. Motivated by the potential of\narbitrary-scale super-resolution to naturally address these inherent\nchallenges, we present the Residual Dense Swin Transformer Network (RDSTN),\ndesigned to capture the non-local characteristics and long-range dependencies\nintrinsic to ultrasound images. It comprises a linear embedding module for\nfeature enhancement, an encoder with shifted-window attention for modeling\nnon-locality, and an MLP decoder for continuous detail reconstruction. This\nstrategy streamlines balancing image quality and field-of-view, which offers\nsuperior textures over traditional methods. Experimentally, RDSTN outperforms\nexisting approaches while requiring fewer parameters. 
In conclusion, RDSTN\nshows promising potential for ultrasound image enhancement by overcoming the\nlimitations of conventional interpolation-based methods and achieving\ndepth-independent imaging.\n","authors":["Jintong Hu","Hui Che","Zishuo Li","Wenming Yang"],"pdf_url":"https://arxiv.org/pdf/2403.16384v1.pdf","comment":"Accepted by ICASSP2024, https://ieeexplore.ieee.org/document/10447712"},{"id":"http://arxiv.org/abs/2403.16379v1","updated":"2024-03-25T02:53:32Z","published":"2024-03-25T02:53:32Z","title":"FlashEval: Towards Fast and Accurate Evaluation of Text-to-image\n Diffusion Generative Models","summary":" In recent years, there has been significant progress in the development of\ntext-to-image generative models. Evaluating the quality of the generative\nmodels is one essential step in the development process. Unfortunately, the\nevaluation process could consume a significant amount of computational\nresources, making the required periodic evaluation of model performance (e.g.,\nmonitoring training progress) impractical. Therefore, we seek to improve the\nevaluation efficiency by selecting the representative subset of the text-image\ndataset. We systematically investigate the design choices, including the\nselection criteria (textural features or image-based metrics) and the selection\ngranularity (prompt-level or set-level). We find that the insights from prior\nwork on subset selection for training data do not generalize to this problem,\nand we propose FlashEval, an iterative search algorithm tailored to evaluation\ndata selection. We demonstrate the effectiveness of FlashEval on ranking\ndiffusion models with various configurations, including architectures,\nquantization levels, and sampler schedules on COCO and DiffusionDB datasets.\nOur searched 50-item subset could achieve comparable evaluation quality to the\nrandomly sampled 500-item subset for COCO annotations on unseen models,\nachieving a 10x evaluation speedup. We release the condensed subset of these\ncommonly used datasets to help facilitate diffusion algorithm design and\nevaluation, and open-source FlashEval as a tool for condensing future datasets,\naccessible at https://github.com/thu-nics/FlashEval.\n","authors":["Lin Zhao","Tianchen Zhao","Zinan Lin","Xuefei Ning","Guohao Dai","Huazhong Yang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16379v1.pdf","comment":"The paper is accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.15082v2","updated":"2024-03-25T02:50:07Z","published":"2024-03-22T10:06:31Z","title":"Cell Variational Information Bottleneck Network","summary":" In this work, we propose Cell Variational Information Bottleneck Network\n(cellVIB), a convolutional neural network using information bottleneck\nmechanism, which can be combined with the latest feedforward network\narchitecture in an end-to-end training method. Our Cell Variational Information\nBottleneck Network is constructed by stacking VIB cells, which generate feature\nmaps with uncertainty. As layers going deeper, the regularization effect will\ngradually increase, instead of directly adding excessive regular constraints to\nthe output layer of the model as in Deep VIB. Under each VIB cell, the\nfeedforward process learns an independent mean term and an standard deviation\nterm, and predicts the Gaussian distribution based on them. The feedback\nprocess is based on reparameterization trick for effective training. 
This work\nperforms an extensive analysis on MNIST dataset to verify the effectiveness of\neach VIB cells, and provides an insightful analysis on how the VIB cells affect\nmutual information. Experiments conducted on CIFAR-10 also prove that our\ncellVIB is robust against noisy labels during training and against corrupted\nimages during testing. Then, we validate our method on PACS dataset, whose\nresults show that the VIB cells can significantly improve the generalization\nperformance of the basic model. Finally, in a more complex representation\nlearning task, face recognition, our network structure has also achieved very\ncompetitive results.\n","authors":["Zhonghua Zhai","Chen Ju","Jinsong Lan","Shuai Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.15082v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16376v1","updated":"2024-03-25T02:46:57Z","published":"2024-03-25T02:46:57Z","title":"Elite360D: Towards Efficient 360 Depth Estimation via Semantic- and\n Distance-Aware Bi-Projection Fusion","summary":" 360 depth estimation has recently received great attention for 3D\nreconstruction owing to its omnidirectional field of view (FoV). Recent\napproaches are predominantly focused on cross-projection fusion with\ngeometry-based re-projection: they fuse 360 images with equirectangular\nprojection (ERP) and another projection type, e.g., cubemap projection to\nestimate depth with the ERP format. However, these methods suffer from 1)\nlimited local receptive fields, making it hardly possible to capture large FoV\nscenes, and 2) prohibitive computational cost, caused by the complex\ncross-projection fusion module design. In this paper, we propose Elite360D, a\nnovel framework that inputs the ERP image and icosahedron projection (ICOSAP)\npoint set, which is undistorted and spatially continuous. Elite360D is superior\nin its capacity in learning a representation from a local-with-global\nperspective. With a flexible ERP image encoder, it includes an ICOSAP point\nencoder, and a Bi-projection Bi-attention Fusion (B2F) module (totally ~1M\nparameters). Specifically, the ERP image encoder can take various perspective\nimage-trained backbones (e.g., ResNet, Transformer) to extract local features.\nThe point encoder extracts the global features from the ICOSAP. Then, the B2F\nmodule captures the semantic- and distance-aware dependencies between each\npixel of the ERP feature and the entire ICOSAP feature set. Without specific\nbackbone design and obvious computational cost increase, Elite360D outperforms\nthe prior arts on several benchmark datasets.\n","authors":["Hao Ai","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16376v1.pdf","comment":"8 pages, accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.09506v2","updated":"2024-03-25T02:45:35Z","published":"2024-03-14T15:53:04Z","title":"Don't Judge by the Look: Towards Motion Coherent Video Representation","summary":" Current training pipelines in object recognition neglect Hue Jittering when\ndoing data augmentation as it not only brings appearance changes that are\ndetrimental to classification, but also the implementation is inefficient in\npractice. In this study, we investigate the effect of hue variance in the\ncontext of video understanding and find this variance to be beneficial since\nstatic appearances are less important in videos that contain motion\ninformation. 
Based on this observation, we propose a data augmentation method\nfor video understanding, named Motion Coherent Augmentation (MCA), that\nintroduces appearance variation in videos and implicitly encourages the model\nto prioritize motion patterns, rather than static appearances. Concretely, we\npropose an operation SwapMix to efficiently modify the appearance of video\nsamples, and introduce Variation Alignment (VA) to resolve the distribution\nshift caused by SwapMix, enforcing the model to learn appearance invariant\nrepresentations. Comprehensive empirical evaluation across various\narchitectures and different datasets solidly validates the effectiveness and\ngeneralization ability of MCA, and the application of VA in other augmentation\nmethods. Code is available at https://github.com/BeSpontaneous/MCA-pytorch.\n","authors":["Yitian Zhang","Yue Bai","Huan Wang","Yizhou Wang","Yun Fu"],"pdf_url":"https://arxiv.org/pdf/2403.09506v2.pdf","comment":"Accepted by ICLR2024"},{"id":"http://arxiv.org/abs/2403.16370v1","updated":"2024-03-25T02:30:32Z","published":"2024-03-25T02:30:32Z","title":"GoodSAM: Bridging Domain and Capacity Gaps via Segment Anything Model\n for Distortion-aware Panoramic Semantic Segmentation","summary":" This paper tackles a novel yet challenging problem: how to transfer knowledge\nfrom the emerging Segment Anything Model (SAM) -- which reveals impressive\nzero-shot instance segmentation capacity -- to learn a compact panoramic\nsemantic segmentation model, i.e., student, without requiring any labeled data.\nThis poses considerable challenges due to SAM's inability to provide semantic\nlabels and the large capacity gap between SAM and the student. To this end, we\npropose a novel framework, called GoodSAM, that introduces a teacher assistant\n(TA) to provide semantic information, integrated with SAM to generate ensemble\nlogits to achieve knowledge transfer. Specifically, we propose a\nDistortion-Aware Rectification (DAR) module that first addresses the distortion\nproblem of panoramic images by imposing prediction-level consistency and\nboundary enhancement. This subtly enhances TA's prediction capacity on\npanoramic images. DAR then incorporates a cross-task complementary fusion block\nto adaptively merge the predictions of SAM and TA to obtain more reliable\nensemble logits. Moreover, we introduce a Multi-level Knowledge Adaptation\n(MKA) module to efficiently transfer the multi-level feature knowledge from TA\nand ensemble logits to learn a compact student model. Extensive experiments on\ntwo benchmarks show that our GoodSAM achieves a remarkable +3.75\\% mIoU\nimprovement over the state-of-the-art (SOTA) domain adaptation methods. Also,\nour most lightweight model achieves comparable performance to the SOTA methods\nwith only 3.7M parameters.\n","authors":["Weiming Zhang","Yexin Liu","Xu Zheng","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16370v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16368v1","updated":"2024-03-25T02:17:20Z","published":"2024-03-25T02:17:20Z","title":"Distilling Semantic Priors from SAM to Efficient Image Restoration\n Models","summary":" In image restoration (IR), leveraging semantic priors from segmentation\nmodels has been a common approach to improve performance. The recent segment\nanything model (SAM) has emerged as a powerful tool for extracting advanced\nsemantic priors to enhance IR tasks. However, the computational cost of SAM is\nprohibitive for IR, compared to existing smaller IR models. 
The incorporation\nof SAM for extracting semantic priors considerably hampers the model inference\nefficiency. To address this issue, we propose a general framework to distill\nSAM's semantic knowledge to boost exiting IR models without interfering with\ntheir inference process. Specifically, our proposed framework consists of the\nsemantic priors fusion (SPF) scheme and the semantic priors distillation (SPD)\nscheme. SPF fuses two kinds of information between the restored image predicted\nby the original IR model and the semantic mask predicted by SAM for the refined\nrestored image. SPD leverages a self-distillation manner to distill the fused\nsemantic priors to boost the performance of original IR models. Additionally,\nwe design a semantic-guided relation (SGR) module for SPD, which ensures\nsemantic feature representation space consistency to fully distill the priors.\nWe demonstrate the effectiveness of our framework across multiple IR models and\ntasks, including deraining, deblurring, and denoising.\n","authors":["Quan Zhang","Xiaoyu Liu","Wei Li","Hanting Chen","Junchao Liu","Jie Hu","Zhiwei Xiong","Chun Yuan","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15048v2","updated":"2024-03-25T02:08:01Z","published":"2024-03-22T09:13:09Z","title":"Cartoon Hallucinations Detection: Pose-aware In Context Visual Learning","summary":" Large-scale Text-to-Image (TTI) models have become a common approach for\ngenerating training data in various generative fields. However, visual\nhallucinations, which contain perceptually critical defects, remain a concern,\nespecially in non-photorealistic styles like cartoon characters. We propose a\nnovel visual hallucination detection system for cartoon character images\ngenerated by TTI models. Our approach leverages pose-aware in-context visual\nlearning (PA-ICVL) with Vision-Language Models (VLMs), utilizing both RGB\nimages and pose information. By incorporating pose guidance from a fine-tuned\npose estimator, we enable VLMs to make more accurate decisions. Experimental\nresults demonstrate significant improvements in identifying visual\nhallucinations compared to baseline methods relying solely on RGB images. This\nresearch advances TTI models by mitigating visual hallucinations, expanding\ntheir potential in non-photorealistic domains.\n","authors":["Bumsoo Kim","Wonseop Shin","Kyuchul Lee","Sanghyun Seo"],"pdf_url":"https://arxiv.org/pdf/2403.15048v2.pdf","comment":"11 pages, 12 figures, 1 table, Project page:\n https://gh-bumsookim.github.io/Cartoon-Hallucinations-Detection/"},{"id":"http://arxiv.org/abs/2403.16365v1","updated":"2024-03-25T02:03:38Z","published":"2024-03-25T02:03:38Z","title":"Generating Potent Poisons and Backdoors from Scratch with Guided\n Diffusion","summary":" Modern neural networks are often trained on massive datasets that are web\nscraped with minimal human inspection. As a result of this insecure curation\npipeline, an adversary can poison or backdoor the resulting model by uploading\nmalicious data to the internet and waiting for a victim to scrape and train on\nit. Existing approaches for creating poisons and backdoors start with randomly\nsampled clean data, called base samples, and then modify those samples to craft\npoisons. However, some base samples may be significantly more amenable to\npoisoning than others. As a result, we may be able to craft more potent poisons\nby carefully choosing the base samples. 
In this work, we use guided diffusion\nto synthesize base samples from scratch that lead to significantly more potent\npoisons and backdoors than previous state-of-the-art attacks. Our Guided\nDiffusion Poisoning (GDP) base samples can be combined with any downstream\npoisoning or backdoor attack to boost its effectiveness. Our implementation\ncode is publicly available at: https://github.com/hsouri/GDP .\n","authors":["Hossein Souri","Arpit Bansal","Hamid Kazemi","Liam Fowl","Aniruddha Saha","Jonas Geiping","Andrew Gordon Wilson","Rama Chellappa","Tom Goldstein","Micah Goldblum"],"pdf_url":"https://arxiv.org/pdf/2403.16365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17516v3","updated":"2024-03-25T01:55:03Z","published":"2023-11-29T10:39:53Z","title":"MMA-Diffusion: MultiModal Attack on Diffusion Models","summary":" In recent years, Text-to-Image (T2I) models have seen remarkable\nadvancements, gaining widespread adoption. However, this progress has\ninadvertently opened avenues for potential misuse, particularly in generating\ninappropriate or Not-Safe-For-Work (NSFW) content. Our work introduces\nMMA-Diffusion, a framework that presents a significant and realistic threat to\nthe security of T2I models by effectively circumventing current defensive\nmeasures in both open-source models and commercial online services. Unlike\nprevious approaches, MMA-Diffusion leverages both textual and visual modalities\nto bypass safeguards like prompt filters and post-hoc safety checkers, thus\nexposing and highlighting the vulnerabilities in existing defense mechanisms.\n","authors":["Yijun Yang","Ruiyuan Gao","Xiaosen Wang","Tsung-Yi Ho","Nan Xu","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2311.17516v3.pdf","comment":"CVPR 2024. Code is available at\n https://github.com/yangyijune/MMA-Diffusion"},{"id":"http://arxiv.org/abs/2403.16361v1","updated":"2024-03-25T01:54:57Z","published":"2024-03-25T01:54:57Z","title":"RSTAR: Rotational Streak Artifact Reduction in 4D CBCT using Separable\n and Circular Convolutions","summary":" Four-dimensional cone-beam computed tomography (4D CBCT) provides\nrespiration-resolved images and can be used for image-guided radiation therapy.\nHowever, the ability to reveal respiratory motion comes at the cost of image\nartifacts. As raw projection data are sorted into multiple respiratory phases,\nthere is a limited number of cone-beam projections available for image\nreconstruction. Consequently, the 4D CBCT images are covered by severe streak\nartifacts. Although several deep learning-based methods have been proposed to\naddress this issue, most algorithms employ ordinary network models, neglecting\nthe intrinsic structural prior within 4D CBCT images. In this paper, we first\nexplore the origin and appearance of streak artifacts in 4D CBCT\nimages.Specifically, we find that streak artifacts exhibit a periodic\nrotational motion along with the patient's respiration. This unique motion\npattern inspires us to distinguish the artifacts from the desired anatomical\nstructures in the spatiotemporal domain. Thereafter, we propose a\nspatiotemporal neural network named RSTAR-Net with separable and circular\nconvolutions for Rotational Streak Artifact Reduction. The specially designed\nmodel effectively encodes dynamic image features, facilitating the recovery of\n4D CBCT images. Moreover, RSTAR-Net is also lightweight and computationally\nefficient. 
Extensive experiments substantiate the effectiveness of our proposed\nmethod, and RSTAR-Net shows superior performance to comparison methods.\n","authors":["Ziheng Deng","Hua Chen","Haibo Hu","Zhiyong Xu","Tianling Lyu","Yan Xi","Yang Chen","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.16361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09911v2","updated":"2024-03-25T01:54:41Z","published":"2023-08-19T05:34:13Z","title":"Noisy-Correspondence Learning for Text-to-Image Person Re-identification","summary":" Text-to-image person re-identification (TIReID) is a compelling topic in the\ncross-modal community, which aims to retrieve the target person based on a\ntextual query. Although numerous TIReID methods have been proposed and achieved\npromising performance, they implicitly assume the training image-text pairs are\ncorrectly aligned, which is not always the case in real-world scenarios. In\npractice, the image-text pairs inevitably exist under-correlated or even\nfalse-correlated, a.k.a noisy correspondence (NC), due to the low quality of\nthe images and annotation errors. To address this problem, we propose a novel\nRobust Dual Embedding method (RDE) that can learn robust visual-semantic\nassociations even with NC. Specifically, RDE consists of two main components:\n1) A Confident Consensus Division (CCD) module that leverages the dual-grained\ndecisions of dual embedding modules to obtain a consensus set of clean training\ndata, which enables the model to learn correct and reliable visual-semantic\nassociations. 2) A Triplet Alignment Loss (TAL) relaxes the conventional\nTriplet Ranking loss with the hardest negative samples to a log-exponential\nupper bound over all negative ones, thus preventing the model collapse under NC\nand can also focus on hard-negative samples for promising performance. We\nconduct extensive experiments on three public benchmarks, namely CUHK-PEDES,\nICFG-PEDES, and RSTPReID, to evaluate the performance and robustness of our\nRDE. Our method achieves state-of-the-art results both with and without\nsynthetic noisy correspondences on all three datasets. Code is available at\nhttps://github.com/QinYang79/RDE.\n","authors":["Yang Qin","Yingke Chen","Dezhong Peng","Xi Peng","Joey Tianyi Zhou","Peng Hu"],"pdf_url":"https://arxiv.org/pdf/2308.09911v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16358v1","updated":"2024-03-25T01:44:34Z","published":"2024-03-25T01:44:34Z","title":"ChebMixer: Efficient Graph Representation Learning with MLP Mixer","summary":" Graph neural networks have achieved remarkable success in learning graph\nrepresentations, especially graph Transformer, which has recently shown\nsuperior performance on various graph mining tasks. However, graph Transformer\ngenerally treats nodes as tokens, which results in quadratic complexity\nregarding the number of nodes during self-attention computation. The graph MLP\nMixer addresses this challenge by using the efficient MLP Mixer technique from\ncomputer vision. However, the time-consuming process of extracting graph tokens\nlimits its performance. In this paper, we present a novel architecture named\nChebMixer, a newly graph MLP Mixer that uses fast Chebyshev polynomials-based\nspectral filtering to extract a sequence of tokens. Firstly, we produce\nmultiscale representations of graph nodes via fast Chebyshev polynomial-based\nspectral filtering. 
Next, we consider each node's multiscale representations as\na sequence of tokens and refine the node representation with an effective MLP\nMixer. Finally, we aggregate the multiscale representations of nodes through\nChebyshev interpolation. Owing to the powerful representation capabilities and\nfast computational properties of MLP Mixer, we can quickly extract more\ninformative node representations to improve the performance of downstream\ntasks. The experimental results prove our significant improvements in a variety\nof scenarios ranging from graph node classification to medical image\nsegmentation.\n","authors":["Xiaoyan Kui","Haonan Yan","Qinsong Li","Liming Chen","Beiji Zou"],"pdf_url":"https://arxiv.org/pdf/2403.16358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11614v2","updated":"2024-03-25T01:23:07Z","published":"2024-03-18T09:44:44Z","title":"CRS-Diff: Controllable Generative Remote Sensing Foundation Model","summary":" The emergence of diffusion models has revolutionized the field of image\ngeneration, providing new methods for creating high-quality, high-resolution\nimages across various applications. However, the potential of these models for\ngenerating domain-specific images, particularly remote sensing (RS) images,\nremains largely untapped. RS images that are notable for their high resolution,\nextensive coverage, and rich information content, bring new challenges that\ngeneral diffusion models may not adequately address. This paper proposes\nCRS-Diff, a pioneering diffusion modeling framework specifically tailored for\ngenerating remote sensing imagery, leveraging the inherent advantages of\ndiffusion models while integrating advanced control mechanisms to ensure that\nthe imagery is not only visually clear but also enriched with geographic and\ntemporal information. The model integrates global and local control inputs,\nenabling precise combinations of generation conditions to refine the generation\nprocess. A comprehensive evaluation of CRS-Diff has demonstrated its superior\ncapability to generate RS imagery both in a single condition and multiple\nconditions compared with previous methods in terms of image quality and\ndiversity.\n","authors":["Datao Tang","Xiangyong Cao","Xingsong Hou","Zhongyuan Jiang","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2403.11614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17034v2","updated":"2024-03-25T01:21:18Z","published":"2023-11-28T18:45:13Z","title":"Telling Left from Right: Identifying Geometry-Aware Semantic\n Correspondence","summary":" While pre-trained large-scale vision models have shown significant promise\nfor semantic correspondence, their features often struggle to grasp the\ngeometry and orientation of instances. This paper identifies the importance of\nbeing geometry-aware for semantic correspondence and reveals a limitation of\nthe features of current foundation models under simple post-processing. We show\nthat incorporating this information can markedly enhance semantic\ncorrespondence performance with simple but effective solutions in both\nzero-shot and supervised settings. We also construct a new challenging\nbenchmark for semantic correspondence built from an existing animal pose\nestimation dataset, for both pre-training validating models. Our method\nachieves a PCK@0.10 score of 65.4 (zero-shot) and 85.6 (supervised) on the\nchallenging SPair-71k dataset, outperforming the state of the art by 5.5p and\n11.0p absolute gains, respectively. 
Our code and datasets are publicly\navailable at: https://telling-left-from-right.github.io/.\n","authors":["Junyi Zhang","Charles Herrmann","Junhwa Hur","Eric Chen","Varun Jampani","Deqing Sun","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2311.17034v2.pdf","comment":"Accepted by CVPR 24, project page:\n https://telling-left-from-right.github.io/"},{"id":"http://arxiv.org/abs/2403.14743v2","updated":"2024-03-25T01:18:37Z","published":"2024-03-21T18:00:00Z","title":"VURF: A General-purpose Reasoning and Self-refinement Framework for\n Video Understanding","summary":" Recent studies have demonstrated the effectiveness of Large Language Models\n(LLMs) as reasoning modules that can deconstruct complex tasks into more\nmanageable sub-tasks, particularly when applied to visual reasoning tasks for\nimages. In contrast, this paper introduces a Video Understanding and Reasoning\nFramework (VURF) based on the reasoning power of LLMs. Ours is a novel approach\nto extend the utility of LLMs in the context of video tasks, leveraging their\ncapacity to generalize from minimal input and output demonstrations within a\ncontextual framework. By presenting LLMs with pairs of instructions and their\ncorresponding high-level programs, we harness their contextual learning\ncapabilities to generate executable visual programs for video understanding. To\nenhance program's accuracy and robustness, we implement two important\nstrategies. Firstly, we employ a feedback-generation approach, powered by\nGPT-3.5, to rectify errors in programs utilizing unsupported functions.\nSecondly, taking motivation from recent works on self refinement of LLM\noutputs, we introduce an iterative procedure for improving the quality of the\nin-context examples by aligning the initial outputs to the outputs that would\nhave been generated had the LLM not been bound by the structure of the\nin-context examples. Our results on several video-specific tasks, including\nvisual QA, video anticipation, pose estimation and multi-video QA illustrate\nthe efficacy of these enhancements in improving the performance of visual\nprogramming approaches for video tasks.\n","authors":["Ahmad Mahmood","Ashmal Vayani","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.14743v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10119v2","updated":"2024-03-25T01:08:14Z","published":"2024-03-15T09:08:27Z","title":"URS-NeRF: Unordered Rolling Shutter Bundle Adjustment for Neural\n Radiance Fields","summary":" We propose a novel rolling shutter bundle adjustment method for neural\nradiance fields (NeRF), which utilizes the unordered rolling shutter (RS)\nimages to obtain the implicit 3D representation. Existing NeRF methods suffer\nfrom low-quality images and inaccurate initial camera poses due to the RS\neffect in the image, whereas, the previous method that incorporates the RS into\nNeRF requires strict sequential data input, limiting its widespread\napplicability. In constant, our method recovers the physical formation of RS\nimages by estimating camera poses and velocities, thereby removing the input\nconstraints on sequential data. Moreover, we adopt a coarse-to-fine training\nstrategy, in which the RS epipolar constraints of the pairwise frames in the\nscene graph are used to detect the camera poses that fall into local minima.\nThe poses detected as outliers are corrected by the interpolation method with\nneighboring poses. 
The experimental results validate the effectiveness of our\nmethod over state-of-the-art works and demonstrate that the reconstruction of\n3D representations is not constrained by the requirement of video sequence\ninput.\n","authors":["Bo Xu","Ziao Liu","Mengqi Guo","Jiancheng Li","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2403.10119v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.05453v2","updated":"2024-03-25T01:00:38Z","published":"2021-06-10T01:45:32Z","title":"Improving White-box Robustness of Pre-processing Defenses via Joint\n Adversarial Training","summary":" Deep neural networks (DNNs) are vulnerable to adversarial noise. A range of\nadversarial defense techniques have been proposed to mitigate the interference\nof adversarial noise, among which the input pre-processing methods are scalable\nand show great potential to safeguard DNNs. However, pre-processing methods may\nsuffer from the robustness degradation effect, in which the defense reduces\nrather than improving the adversarial robustness of a target model in a\nwhite-box setting. A potential cause of this negative effect is that\nadversarial training examples are static and independent to the pre-processing\nmodel. To solve this problem, we investigate the influence of full adversarial\nexamples which are crafted against the full model, and find they indeed have a\npositive impact on the robustness of defenses. Furthermore, we find that simply\nchanging the adversarial training examples in pre-processing methods does not\ncompletely alleviate the robustness degradation effect. This is due to the\nadversarial risk of the pre-processed model being neglected, which is another\ncause of the robustness degradation effect. Motivated by above analyses, we\npropose a method called Joint Adversarial Training based Pre-processing (JATP)\ndefense. Specifically, we formulate a feature similarity based adversarial risk\nfor the pre-processing model by using full adversarial examples found in a\nfeature space. Unlike standard adversarial training, we only update the\npre-processing model, which prompts us to introduce a pixel-wise loss to\nimprove its cross-model transferability. We then conduct a joint adversarial\ntraining on the pre-processing model to minimize this overall risk. Empirical\nresults show that our method could effectively mitigate the robustness\ndegradation effect across different target models in comparison to previous\nstate-of-the-art approaches.\n","authors":["Dawei Zhou","Nannan Wang","Xinbo Gao","Bo Han","Jun Yu","Xiaoyu Wang","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2106.05453v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16350v1","updated":"2024-03-25T00:59:35Z","published":"2024-03-25T00:59:35Z","title":"3D-EffiViTCaps: 3D Efficient Vision Transformer with Capsule for Medical\n Image Segmentation","summary":" Medical image segmentation (MIS) aims to finely segment various organs. It\nrequires grasping global information from both parts and the entire image for\nbetter segmenting, and clinically there are often certain requirements for\nsegmentation efficiency. Convolutional neural networks (CNNs) have made\nconsiderable achievements in MIS. However, they are difficult to fully collect\nglobal context information and their pooling layer may cause information loss.\nCapsule networks, which combine the benefits of CNNs while taking into account\nadditional information such as relative location that CNNs do not, have lately\ndemonstrated some advantages in MIS. 
Vision Transformer (ViT) employs\ntransformers in visual tasks. Transformer based on attention mechanism has\nexcellent global inductive modeling capabilities and is expected to capture\nlongrange information. Moreover, there have been resent studies on making ViT\nmore lightweight to minimize model complexity and increase efficiency. In this\npaper, we propose a U-shaped 3D encoder-decoder network named 3D-EffiViTCaps,\nwhich combines 3D capsule blocks with 3D EfficientViT blocks for MIS. Our\nencoder uses capsule blocks and EfficientViT blocks to jointly capture local\nand global semantic information more effectively and efficiently with less\ninformation loss, while the decoder employs CNN blocks and EfficientViT blocks\nto catch ffner details for segmentation. We conduct experiments on various\ndatasets, including iSeg-2017, Hippocampus and Cardiac to verify the\nperformance and efficiency of 3D-EffiViTCaps, which performs better than\nprevious 3D CNN-based, 3D Capsule-based and 3D Transformer-based models. We\nfurther implement a series of ablation experiments on the main blocks. Our code\nis available at: https://github.com/HidNeuron/3D-EffiViTCaps.\n","authors":["Dongwei Gan","Ming Chang","Juan Chen"],"pdf_url":"https://arxiv.org/pdf/2403.16350v1.pdf","comment":"15 pages, 4 figures, submitted to ICPR2024"},{"id":"http://arxiv.org/abs/2301.06626v2","updated":"2024-03-25T00:45:30Z","published":"2023-01-16T22:30:53Z","title":"Masked Vector Quantization","summary":" Generative models with discrete latent representations have recently\ndemonstrated an impressive ability to learn complex high-dimensional data\ndistributions. However, their performance relies on a long sequence of tokens\nper instance and a large number of codebook entries, resulting in long sampling\ntimes and considerable computation to fit the categorical posterior. To address\nthese issues, we propose the Masked Vector Quantization (MVQ) framework which\nincreases the representational capacity of each code vector by learning mask\nconfigurations via a stochastic winner-takes-all training regime called\nMultiple Hypothese Dropout (MH-Dropout). On ImageNet 64$\\times$64, MVQ reduces\nFID in existing vector quantization architectures by up to $68\\%$ at 2 tokens\nper instance and $57\\%$ at 5 tokens. These improvements widen as codebook\nentries is reduced and allows for $7\\textit{--}45\\times$ speed-up in token\nsampling during inference. As an additional benefit, we find that smaller\nlatent spaces lead to MVQ identifying transferable visual representations where\nmultiple can be smoothly combined.\n","authors":["David D. Nguyen","David Leibowitz","Surya Nepal","Salil S. Kanhere"],"pdf_url":"https://arxiv.org/pdf/2301.06626v2.pdf","comment":"A newer version of this manuscript was archived under 2312.11735"},{"id":"http://arxiv.org/abs/2403.16338v1","updated":"2024-03-25T00:24:10Z","published":"2024-03-25T00:24:10Z","title":"Impact of Video Compression Artifacts on Fisheye Camera Visual\n Perception Tasks","summary":" Autonomous driving systems require extensive data collection schemes to cover\nthe diverse scenarios needed for building a robust and safe system. The data\nvolumes are in the order of Exabytes and have to be stored for a long period of\ntime (i.e., more than 10 years of the vehicle's life cycle). Lossless\ncompression doesn't provide sufficient compression ratios, hence, lossy video\ncompression has been explored. 
It is essential to prove that lossy video\ncompression artifacts do not impact the performance of the perception\nalgorithms. However, there is limited work in this area to provide a solid\nconclusion. In particular, there is no such work for fisheye cameras, which\nhave high radial distortion and where compression may have higher artifacts.\nFisheye cameras are commonly used in automotive systems for 3D object detection\ntask. In this work, we provide the first analysis of the impact of standard\nvideo compression codecs on wide FOV fisheye camera images. We demonstrate that\nthe achievable compression with negligible impact depends on the dataset and\ntemporal prediction of the video codec. We propose a radial distortion-aware\nzonal metric to evaluate the performance of artifacts in fisheye images. In\naddition, we present a novel method for estimating affine mode parameters of\nthe latest VVC codec, and suggest some areas for improvement in video codecs\nfor the application to fisheye imagery.\n","authors":["Madhumitha Sakthi","Louis Kerofsky","Varun Ravi Kumar","Senthil Yogamani"],"pdf_url":"https://arxiv.org/pdf/2403.16338v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16335v1","updated":"2024-03-25T00:17:43Z","published":"2024-03-25T00:17:43Z","title":"MEDDAP: Medical Dataset Enhancement via Diversified Augmentation\n Pipeline","summary":" The effectiveness of Deep Neural Networks (DNNs) heavily relies on the\nabundance and accuracy of available training data. However, collecting and\nannotating data on a large scale is often both costly and time-intensive,\nparticularly in medical cases where practitioners are already occupied with\ntheir duties. Moreover, ensuring that the model remains robust across various\nscenarios of image capture is crucial in medical domains, especially when\ndealing with ultrasound images that vary based on the settings of different\ndevices and the manual operation of the transducer. To address this challenge,\nwe introduce a novel pipeline called MEDDAP, which leverages Stable Diffusion\n(SD) models to augment existing small datasets by automatically generating new\ninformative labeled samples. Pretrained checkpoints for SD are typically based\non natural images, and training them for medical images requires significant\nGPU resources due to their heavy parameters. To overcome this challenge, we\nintroduce USLoRA (Ultrasound Low-Rank Adaptation), a novel fine-tuning method\ntailored specifically for ultrasound applications. USLoRA allows for selective\nfine-tuning of weights within SD, requiring fewer than 0.1\\% of parameters\ncompared to fully fine-tuning only the UNet portion of SD. To enhance dataset\ndiversity, we incorporate different adjectives into the generation process\nprompts, thereby desensitizing the classifiers to intensity changes across\ndifferent images. This approach is inspired by clinicians' decision-making\nprocesses regarding breast tumors, where tumor shape often plays a more crucial\nrole than intensity. In conclusion, our pipeline not only outperforms\nclassifiers trained on the original dataset but also demonstrates superior\nperformance when encountering unseen datasets. 
The source code is available at\nhttps://github.com/yasamin-med/MEDDAP.\n","authors":["Yasamin Medghalchi","Niloufar Zakariaei","Arman Rahmim","Ilker Hacihaliloglu"],"pdf_url":"https://arxiv.org/pdf/2403.16335v1.pdf","comment":"submitted to miccai 2024 submitted to miccai 2024 Submitted to\n MICCAI-2024"},{"id":"http://arxiv.org/abs/2403.15388v2","updated":"2024-03-25T17:59:55Z","published":"2024-03-22T17:59:52Z","title":"LLaVA-PruMerge: Adaptive Token Reduction for Efficient Large Multimodal\n Models","summary":" Large Multimodal Models (LMMs) have shown significant reasoning capabilities\nby connecting a visual encoder and a large language model. LMMs typically use a\nfixed amount of visual tokens, such as the penultimate layer features in the\nCLIP visual encoder, as the prefix content. Recent LMMs incorporate more\ncomplex visual inputs, such as high-resolution images and videos, which\nincrease the number of visual tokens significantly. However, due to the design\nof the Transformer architecture, computational costs associated with these\nmodels tend to increase quadratically with the number of input tokens. To\ntackle this problem, we explore a token reduction mechanism and find, similar\nto prior work, that many visual tokens are spatially redundant. Based on this,\nwe propose PruMerge, a novel adaptive visual token reduction approach, which\nlargely reduces the number of visual tokens while maintaining comparable model\nperformance. We first select the unpruned visual tokens based on their\nsimilarity to class tokens and spatial tokens. We then cluster the pruned\ntokens based on key similarity and merge the clustered tokens with the unpruned\ntokens to supplement their information. Empirically, when applied to LLaVA-1.5,\nour approach can compress the visual tokens by 18 times on average, and achieve\ncomparable performance across diverse visual question-answering and reasoning\ntasks. Code and checkpoints are at https://llava-prumerge.github.io/.\n","authors":["Yuzhang Shang","Mu Cai","Bingxin Xu","Yong Jae Lee","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2403.15388v2.pdf","comment":"Project page: https://llava-prumerge.github.io/"},{"id":"http://arxiv.org/abs/2403.15317v2","updated":"2024-03-25T16:45:41Z","published":"2024-03-22T16:11:29Z","title":"Point-DETR3D: Leveraging Imagery Data with Spatial Point Prior for\n Weakly Semi-supervised 3D Object Detection","summary":" Training high-accuracy 3D detectors necessitates massive labeled 3D\nannotations with 7 degree-of-freedom, which is laborious and time-consuming.\nTherefore, the form of point annotations is proposed to offer significant\nprospects for practical applications in 3D detection, which is not only more\naccessible and less expensive but also provides strong spatial information for\nobject localization. In this paper, we empirically discover that it is\nnon-trivial to merely adapt Point-DETR to its 3D form, encountering two main\nbottlenecks: 1) it fails to encode strong 3D prior into the model, and 2) it\ngenerates low-quality pseudo labels in distant regions due to the extreme\nsparsity of LiDAR points. 
To overcome these challenges, we introduce\nPoint-DETR3D, a teacher-student framework for weakly semi-supervised 3D\ndetection, designed to fully capitalize on point-wise supervision within a\nconstrained instance-wise annotation budget.Different from Point-DETR which\nencodes 3D positional information solely through a point encoder, we propose an\nexplicit positional query initialization strategy to enhance the positional\nprior. Considering the low quality of pseudo labels at distant regions produced\nby the teacher model, we enhance the detector's perception by incorporating\ndense imagery data through a novel Cross-Modal Deformable RoI Fusion\n(D-RoI).Moreover, an innovative point-guided self-supervised learning technique\nis proposed to allow for fully exploiting point priors, even in student\nmodels.Extensive experiments on representative nuScenes dataset demonstrate our\nPoint-DETR3D obtains significant improvements compared to previous works.\nNotably, with only 5% of labeled data, Point-DETR3D achieves over 90%\nperformance of its fully supervised counterpart.\n","authors":["Hongzhi Gao","Zheng Chen","Zehui Chen","Lin Chen","Jiaming Liu","Shanghang Zhang","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.15317v2.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2403.15011v2","updated":"2024-03-25T14:50:47Z","published":"2024-03-22T07:49:55Z","title":"Cell Tracking according to Biological Needs -- Strong Mitosis-aware\n Random-finite Sets Tracker with Aleatoric Uncertainty","summary":" Cell tracking and segmentation assist biologists in extracting insights from\nlarge-scale microscopy time-lapse data. Driven by local accuracy metrics,\ncurrent tracking approaches often suffer from a lack of long-term consistency.\nTo address this issue, we introduce an uncertainty estimation technique for\nneural tracking-by-regression frameworks and incorporate it into our novel\nextended Poisson multi-Bernoulli mixture tracker. Our uncertainty estimation\nidentifies uncertain associations within high-performing tracking-by-regression\nmethods using problem-specific test-time augmentations. Leveraging this\nuncertainty, along with a novel mitosis-aware assignment problem formulation,\nour tracker resolves false associations and mitosis detections stemming from\nlong-term conflicts. We evaluate our approach on nine competitive datasets and\ndemonstrate that it outperforms the current state-of-the-art on biologically\nrelevant metrics substantially, achieving improvements by a factor of\napproximately $5.75$. Furthermore, we uncover new insights into the behavior of\ntracking-by-regression uncertainty.\n","authors":["Timo Kaiser","Maximilian Schier","Bodo Rosenhahn"],"pdf_url":"https://arxiv.org/pdf/2403.15011v2.pdf","comment":"23 pages, 10 figures, 5 tables"},{"id":"http://arxiv.org/abs/2312.02365v2","updated":"2024-03-25T23:52:15Z","published":"2023-12-04T21:46:39Z","title":"MEDPSeg: Hierarchical polymorphic multitask learning for the\n segmentation of ground-glass opacities, consolidation, and pulmonary\n structures on computed tomography","summary":" The COVID-19 pandemic response highlighted the potential of deep learning\nmethods in facilitating the diagnosis, prognosis and understanding of lung\ndiseases through automated segmentation of pulmonary structures and lesions in\nchest computed tomography (CT). 
Automated separation of lung lesion into\nground-glass opacity (GGO) and consolidation is hindered due to the\nlabor-intensive and subjective nature of this task, resulting in scarce\navailability of ground truth for supervised learning. To tackle this problem,\nwe propose MEDPSeg. MEDPSeg learns from heterogeneous chest CT targets through\nhierarchical polymorphic multitask learning (HPML). HPML explores the\nhierarchical nature of GGO and consolidation, lung lesions, and the lungs, with\nfurther benefits achieved through multitasking airway and pulmonary artery\nsegmentation. Over 6000 volumetric CT scans from different partially labeled\nsources were used for training and testing. Experiments show PML enabling new\nstate-of-the-art performance for GGO and consolidation segmentation tasks. In\naddition, MEDPSeg simultaneously performs segmentation of the lung parenchyma,\nairways, pulmonary artery, and lung lesions, all in a single forward\nprediction, with performance comparable to state-of-the-art methods specialized\nin each of those targets. Finally, we provide an open-source implementation\nwith a graphical user interface at https://github.com/MICLab-Unicamp/medpseg.\n","authors":["Diedre S. Carmo","Jean A. Ribeiro","Alejandro P. Comellas","Joseph M. Reinhardt","Sarah E. Gerard","Letícia Rittner","Roberto A. Lotufo"],"pdf_url":"https://arxiv.org/pdf/2312.02365v2.pdf","comment":"This manuscript is under review and might change in the future"},{"id":"http://arxiv.org/abs/2312.00412v2","updated":"2024-03-25T23:40:29Z","published":"2023-12-01T08:22:34Z","title":"SCHEME: Scalable Channer Mixer for Vision Transformers","summary":" Vision Transformers have received significant attention due to their\nimpressive performance in many vision tasks. While the token mixer or attention\nblock has been studied in great detail, the channel mixer or feature mixing\nblock (FFN or MLP) has not been explored in depth albeit it accounts for a bulk\nof the parameters and computation in a model. In this work, we study whether\nsparse feature mixing can replace the dense connections and confirm this with a\nblock diagonal MLP structure that improves the accuracy by supporting larger\nexpansion ratios. To improve the feature clusters formed by this structure and\nthereby further improve the accuracy, a lightweight, parameter-free, channel\ncovariance attention (CCA) mechanism is introduced as a parallel branch during\ntraining. This design of CCA enables gradual feature mixing across channel\ngroups during training whose contribution decays to zero as the training\nprogresses to convergence. This allows the CCA block to be discarded during\ninference, thus enabling enhanced performance with no additional computational\ncost. The resulting $\\textit{Scalable CHannEl MixEr}$ (SCHEME) can be plugged\ninto any ViT architecture to obtain a gamut of models with different trade-offs\nbetween complexity and performance by controlling the block diagonal structure\nsize in the MLP. This is shown by the introduction of a new family of\nSCHEMEformer models that is shown to establish new Pareto frontiers for\naccuracy vs FLOPS, accuracy vs model size, and accuracy vs throughput,\nespecially for fast transformers of small model size. 
For example, the\nSCHEMEformer establishes a new SOTA of 79.7% accuracy for ViTs using pure\nattention mixers on ImageNet-1K at 1.77G FLOPs.\n","authors":["Deepak Sridhar","Yunsheng Li","Nuno Vasconcelos"],"pdf_url":"https://arxiv.org/pdf/2312.00412v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2310.10971v2","updated":"2024-03-25T23:14:28Z","published":"2023-10-17T03:35:27Z","title":"Context-Aware Meta-Learning","summary":" Large Language Models like ChatGPT demonstrate a remarkable capacity to learn\nnew concepts during inference without any fine-tuning. However, visual models\ntrained to detect new objects during inference have been unable to replicate\nthis ability, and instead either perform poorly or require meta-training and/or\nfine-tuning on similar objects. In this work, we propose a meta-learning\nalgorithm that emulates Large Language Models by learning new visual concepts\nduring inference without fine-tuning. Our approach leverages a frozen\npre-trained feature extractor, and analogous to in-context learning, recasts\nvisual meta-learning as sequence modeling over datapoints with known labels and\na test datapoint with an unknown label. On 8 out of 11 meta-learning\nbenchmarks, our approach -- without meta-training or fine-tuning -- exceeds or\nmatches the state-of-the-art algorithm, P>M>F, which is meta-trained on these\nbenchmarks. Our code is available at https://github.com/cfifty/CAML.\n","authors":["Christopher Fifty","Dennis Duan","Ronald G. Junkins","Ehsan Amid","Jure Leskovec","Christopher Re","Sebastian Thrun"],"pdf_url":"https://arxiv.org/pdf/2310.10971v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.17255v1","updated":"2024-03-25T23:03:51Z","published":"2024-03-25T23:03:51Z","title":"Decoding the visual attention of pathologists to reveal their level of\n expertise","summary":" We present a method for classifying the expertise of a pathologist based on\nhow they allocated their attention during a cancer reading. We engage this\ndecoding task by developing a novel method for predicting the attention of\npathologists as they read whole-slide Images (WSIs) of prostate and make cancer\ngrade classifications. Our ground truth measure of a pathologists' attention is\nthe x, y and z (magnification) movement of their viewport as they navigated\nthrough WSIs during readings, and to date we have the attention behavior of 43\npathologists reading 123 WSIs. These data revealed that specialists have higher\nagreement in both their attention and cancer grades compared to general\npathologists and residents, suggesting that sufficient information may exist in\ntheir attention behavior to classify their expertise level. To attempt this, we\ntrained a transformer-based model to predict the visual attention heatmaps of\nresident, general, and specialist (GU) pathologists during Gleason grading.\nBased solely on a pathologist's attention during a reading, our model was able\nto predict their level of expertise with 75.3%, 56.1%, and 77.2% accuracy,\nrespectively, better than chance and baseline models. Our model therefore\nenables a pathologist's expertise level to be easily and objectively evaluated,\nimportant for pathology training and competency assessment. Tools developed\nfrom our model could also be used to help pathology trainees learn how to read\nWSIs like an expert.\n","authors":["Souradeep Chakraborty","Dana Perez","Paul Friedman","Natallia Sheuka","Constantin Friedman","Oksana Yaskiv","Rajarsi Gupta","Gregory J. Zelinsky","Joel H. 
Saltz","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2403.17255v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17237v1","updated":"2024-03-25T22:34:05Z","published":"2024-03-25T22:34:05Z","title":"DreamPolisher: Towards High-Quality Text-to-3D Generation via Geometric\n Diffusion","summary":" We present DreamPolisher, a novel Gaussian Splatting based method with\ngeometric guidance, tailored to learn cross-view consistency and intricate\ndetail from textual descriptions. While recent progress on text-to-3D\ngeneration methods have been promising, prevailing methods often fail to ensure\nview-consistency and textural richness. This problem becomes particularly\nnoticeable for methods that work with text input alone. To address this, we\npropose a two-stage Gaussian Splatting based approach that enforces geometric\nconsistency among views. Initially, a coarse 3D generation undergoes refinement\nvia geometric optimization. Subsequently, we use a ControlNet driven refiner\ncoupled with the geometric consistency term to improve both texture fidelity\nand overall consistency of the generated 3D asset. Empirical evaluations across\ndiverse textual prompts spanning various object categories demonstrate the\nefficacy of DreamPolisher in generating consistent and realistic 3D objects,\naligning closely with the semantics of the textual instructions.\n","authors":["Yuanze Lin","Ronald Clark","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2403.17237v1.pdf","comment":"Project webpage: https://yuanze-lin.me/DreamPolisher_page/"},{"id":"http://arxiv.org/abs/2312.12735v2","updated":"2024-03-25T22:25:35Z","published":"2023-12-20T03:16:34Z","title":"MetaSegNet: Metadata-collaborative Vision-Language Representation\n Learning for Semantic Segmentation of Remote Sensing Images","summary":" Semantic segmentation of remote sensing images plays a vital role in a wide\nrange of Earth Observation (EO) applications, such as land use land cover\nmapping, environment monitoring, and sustainable development. Driven by rapid\ndevelopments in Artificial Intelligence (AI), deep learning (DL) has emerged as\nthe mainstream tool for semantic segmentation and has achieved many\nbreakthroughs in the field of remote sensing. However, the existing DL-based\nmethods mainly focus on unimodal visual data while ignoring the rich multimodal\ninformation involved in the real world, usually demonstrating weak reliability\nand generlization. Inspired by the success of Vision Transformers and large\nlanguage models, we propose a novel metadata-collaborative multimodal\nsegmentation network (MetaSegNet) that applies vision-language representation\nlearning for semantic segmentation of remote sensing images. Unlike the common\nmodel structure that only uses unimodal visual data, we extract the key\ncharacteristic (e.g. the climate zone) from freely available remote sensing\nimage metadata and transfer it into knowledge-based text prompts via the\ngeneric ChatGPT. Then, we construct an image encoder, a text encoder and a\ncross-modal attention fusion subnetwork to extract the image and text feature\nand apply image-text interaction. 
Benefiting from such a design, the proposed\nMetaSegNet demonstrates superior generalization and achieves competitive\naccuracy with the state-of-the-art semantic segmentation methods on the\nlarge-scale OpenEarthMap dataset (68.6% mIoU) and Potsdam dataset (93.3% mean\nF1 score) as well as LoveDA dataset (52.2% mIoU).\n","authors":["Libo Wang","Sijun Dong","Ying Chen","Xiaoliang Meng","Shenghui Fang","Ayman Habib","Songlin Fei"],"pdf_url":"https://arxiv.org/pdf/2312.12735v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15101v3","updated":"2024-03-25T22:13:44Z","published":"2023-12-22T22:46:48Z","title":"Fix-Con: Automatic Fault Localization and Repair of Deep Learning Model\n Conversions between Frameworks","summary":" Converting deep learning models between frameworks is a common step to\nmaximize model compatibility across devices and leverage optimization features\nthat may be exclusively provided in one deep learning framework. However, this\nconversion process may be riddled with bugs, making the converted models either\nundeployable or problematic, considerably degrading their prediction\ncorrectness.\n In this paper we propose an automated approach for fault localization and\nrepair, Fix-Con, during model conversion between deep learning frameworks.\nFix-Con is capable of detecting and fixing faults introduced in model input,\nparameters, hyperparameters, and the model graph during conversion.\n Fix-Con uses a set of fault types (mined from surveying conversion issues\nreported in code repositories and forums) to localize potential\nconversion faults in the converted target model and then repair them\nappropriately, e.g., replacing the parameters of the target model with those\nfrom the source model. This is done iteratively for every image in the dataset,\ncomparing output label differences between the source model and the converted\ntarget model until all differences are resolved. We evaluate the effectiveness\nof Fix-Con in fixing model conversion bugs of three widely used image\nrecognition models converted across four different deep learning frameworks.\nOverall, Fix-Con was able to fix $462$ out of $755$ detected conversion faults,\neither completely repairing or significantly improving the performance of $14$\nout of the $15$ erroneous conversion cases.\n","authors":["Nikolaos Louloudakis","Perry Gibson","José Cano","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2312.15101v3.pdf","comment":"12 pages, 4 figures, 3 tables, 1 algorithm"},{"id":"http://arxiv.org/abs/1905.10711v5","updated":"2024-03-25T22:10:45Z","published":"2019-05-26T01:58:28Z","title":"DISN: Deep Implicit Surface Network for High-quality Single-view 3D\n Reconstruction","summary":" Reconstructing 3D shapes from single-view images has been a long-standing\nresearch problem. In this paper, we present DISN, a Deep Implicit Surface\nNetwork which can generate a high-quality detail-rich 3D mesh from a 2D image\nby predicting the underlying signed distance fields. In addition to utilizing\nglobal image features, DISN predicts the projected location for each 3D point\non the 2D image, and extracts local features from the image feature maps.\nCombining global and local features significantly improves the accuracy of the\nsigned distance field prediction, especially for the detail-rich areas. To the\nbest of our knowledge, DISN is the first method that constantly captures\ndetails such as holes and thin structures present in 3D shapes from single-view\nimages. 
DISN achieves the state-of-the-art single-view reconstruction\nperformance on a variety of shape categories reconstructed from both synthetic\nand real images. Code is available at https://github.com/xharlie/DISN The\nsupplementary can be found at\nhttps://xharlie.github.io/images/neurips_2019_supp.pdf\n","authors":["Qiangeng Xu","Weiyue Wang","Duygu Ceylan","Radomir Mech","Ulrich Neumann"],"pdf_url":"https://arxiv.org/pdf/1905.10711v5.pdf","comment":"This project was in part supported by the gift funding to the\n University of Southern California from Adobe Research"},{"id":"http://arxiv.org/abs/2403.17223v1","updated":"2024-03-25T21:53:36Z","published":"2024-03-25T21:53:36Z","title":"Co-Occurring of Object Detection and Identification towards unlabeled\n object discovery","summary":" In this paper, we propose a novel deep learning based approach for\nidentifying co-occurring objects in conjunction with base objects in multilabel\nobject categories. Nowadays, with the advancement in computer vision based\ntechniques we need to know about co-occurring objects with respect to base\nobject for various purposes. The pipeline of the proposed work is composed of\ntwo stages: in the first stage of the proposed model we detect all the bounding\nboxes present in the image and their corresponding labels, then in the second\nstage we perform co-occurrence matrix analysis. In co-occurrence matrix\nanalysis, we set base classes based on the maximum occurrences of the labels\nand build association rules and generate frequent patterns. These frequent\npatterns will show base classes and their corresponding co-occurring classes.\nWe performed our experiments on two publicly available datasets: Pascal VOC and\nMS-COCO. The experimental results on public benchmark dataset is reported in\nSec 4. Further we extend this work by considering all frequently objects as\nunlabeled and what if they are occluded as well.\n","authors":["Binay Kumar Singh","Niels Da Vitoria Lobo"],"pdf_url":"https://arxiv.org/pdf/2403.17223v1.pdf","comment":"6 pages, 2 figures,"},{"id":"http://arxiv.org/abs/2403.17217v1","updated":"2024-03-25T21:46:53Z","published":"2024-03-25T21:46:53Z","title":"DiffusionAct: Controllable Diffusion Autoencoder for One-shot Face\n Reenactment","summary":" Video-driven neural face reenactment aims to synthesize realistic facial\nimages that successfully preserve the identity and appearance of a source face,\nwhile transferring the target head pose and facial expressions. Existing\nGAN-based methods suffer from either distortions and visual artifacts or poor\nreconstruction quality, i.e., the background and several important appearance\ndetails, such as hair style/color, glasses and accessories, are not faithfully\nreconstructed. Recent advances in Diffusion Probabilistic Models (DPMs) enable\nthe generation of high-quality realistic images. To this end, in this paper we\npresent DiffusionAct, a novel method that leverages the photo-realistic image\ngeneration of diffusion models to perform neural face reenactment.\nSpecifically, we propose to control the semantic space of a Diffusion\nAutoencoder (DiffAE), in order to edit the facial pose of the input images,\ndefined as the head pose orientation and the facial expressions. Our method\nallows one-shot, self, and cross-subject reenactment, without requiring\nsubject-specific fine-tuning. 
We compare against state-of-the-art GAN-,\nStyleGAN2-, and diffusion-based methods, showing better or on-par reenactment\nperformance.\n","authors":["Stella Bounareli","Christos Tzelepis","Vasileios Argyriou","Ioannis Patras","Georgios Tzimiropoulos"],"pdf_url":"https://arxiv.org/pdf/2403.17217v1.pdf","comment":"Project page: https://stelabou.github.io/diffusionact/"},{"id":"http://arxiv.org/abs/2403.17213v1","updated":"2024-03-25T21:40:44Z","published":"2024-03-25T21:40:44Z","title":"AnimateMe: 4D Facial Expressions via Diffusion Models","summary":" The field of photorealistic 3D avatar reconstruction and generation has\ngarnered significant attention in recent years; however, animating such avatars\nremains challenging. Recent advances in diffusion models have notably enhanced\nthe capabilities of generative models in 2D animation. In this work, we\ndirectly utilize these models within the 3D domain to achieve controllable and\nhigh-fidelity 4D facial animation. By integrating the strengths of diffusion\nprocesses and geometric deep learning, we employ Graph Neural Networks (GNNs)\nas denoising diffusion models in a novel approach, formulating the diffusion\nprocess directly on the mesh space and enabling the generation of 3D facial\nexpressions. This facilitates the generation of facial deformations through a\nmesh-diffusion-based model. Additionally, to ensure temporal coherence in our\nanimations, we propose a consistent noise sampling method. Under a series of\nboth quantitative and qualitative experiments, we showcase that the proposed\nmethod outperforms prior work in 4D expression synthesis by generating\nhigh-fidelity extreme expressions. Furthermore, we applied our method to\ntextured 4D facial expression generation, implementing a straightforward\nextension that involves training on a large-scale textured 4D facial expression\ndatabase.\n","authors":["Dimitrios Gerogiannis","Foivos Paraperas Papantoniou","Rolandos Alexandros Potamias","Alexandros Lattas","Stylianos Moschoglou","Stylianos Ploumpis","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2403.17213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06157v5","updated":"2024-03-25T21:23:11Z","published":"2023-06-10T23:50:02Z","title":"Fault Localization for Buggy Deep Learning Framework Conversions in\n Image Recognition","summary":" When deploying Deep Neural Networks (DNNs), developers often convert models\nfrom one deep learning framework to another (e.g., TensorFlow to PyTorch).\nHowever, this process is error-prone and can impact target model accuracy. To\nidentify the extent of such impact, we perform and briefly present a\ndifferential analysis against three DNNs widely used for image recognition\n(MobileNetV2, ResNet101, and InceptionV3) converted across four well-known deep\nlearning frameworks (PyTorch, Keras, TensorFlow (TF), and TFLite), which\nrevealed numerous model crashes and output label discrepancies of up to 100%.\nTo mitigate such errors, we present a novel approach towards fault localization\nand repair of buggy deep learning framework conversions, focusing on\npre-trained image recognition models. Our technique consists of four stages of\nanalysis: 1) conversion tools, 2) model parameters, 3) model hyperparameters,\nand 4) graph representation. In addition, we propose various strategies towards\nfault repair of the faults detected. 
We implement our technique on top of the\nApache TVM deep learning compiler, and we test it by conducting a preliminary\nfault localization analysis for the conversion of InceptionV3 from TF to\nTFLite. Our approach detected a fault in a common DNN converter tool, which\nintroduced precision errors in weights, reducing model accuracy. After our\nfault localization, we repaired the issue, reducing our conversion error to\nzero.\n","authors":["Nikolaos Louloudakis","Perry Gibson","José Cano","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2306.06157v5.pdf","comment":"5 pages, 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2403.17192v1","updated":"2024-03-25T21:08:26Z","published":"2024-03-25T21:08:26Z","title":"Strategies to Improve Real-World Applicability of Laparoscopic Anatomy\n Segmentation Models","summary":" Accurate identification and localization of anatomical structures of varying\nsize and appearance in laparoscopic imaging are necessary to leverage the\npotential of computer vision techniques for surgical decision support.\nSegmentation performance of such models is traditionally reported using metrics\nof overlap such as IoU. However, imbalanced and unrealistic representation of\nclasses in the training data and suboptimal selection of reported metrics have\nthe potential to skew nominal segmentation performance and thereby ultimately\nlimit clinical translation. In this work, we systematically analyze the impact\nof class characteristics (i.e., organ size differences), training and test data\ncomposition (i.e., representation of positive and negative examples), and\nmodeling parameters (i.e., foreground-to-background class weight) on eight\nsegmentation metrics: accuracy, precision, recall, IoU, F1 score, specificity,\nHausdorff Distance, and Average Symmetric Surface Distance. Based on our\nfindings, we propose two simple yet effective strategies to improve real-world\napplicability of image segmentation models in laparoscopic surgical data: (1)\ninclusion of negative examples in the training process and (2) adaptation of\nforeground-background weights in segmentation models to maximize model\nperformance with respect to specific metrics of interest, depending on the\nclinical use case.\n","authors":["Fiona R. Kolbinger","Jiangpeng He","Jinge Ma","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.17192v1.pdf","comment":"13 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2306.06208v5","updated":"2024-03-25T21:08:25Z","published":"2023-06-05T23:07:01Z","title":"DeltaNN: Assessing the Impact of Computational Environment Parameters on\n the Performance of Image Recognition Models","summary":" Image recognition tasks typically use deep learning and require enormous\nprocessing power, thus relying on hardware accelerators like GPUs and TPUs for\nfast, timely processing. Failure in real-time image recognition tasks can occur\ndue to sub-optimal mapping on hardware accelerators during model deployment,\nwhich may lead to timing uncertainty and erroneous behavior. Mapping on\nhardware accelerators is done using multiple software components like deep\nlearning frameworks, compilers, and device libraries, that we refer to as the\ncomputational environment. 
Owing to the increased use of image recognition\ntasks in safety-critical applications like autonomous driving and medical\nimaging, it is imperative to assess their robustness to changes in the\ncomputational environment, as the impact of parameters like deep learning\nframeworks, compiler optimizations, and hardware devices on model performance\nand correctness is not yet well understood.\n In this paper we present a differential testing framework, DeltaNN, that\nallows us to assess the impact of different computational environment\nparameters on the performance of image recognition models during deployment,\npost training. DeltaNN generates different implementations of a given image\nrecognition model for variations in environment parameters, namely, deep\nlearning frameworks, compiler optimizations and hardware devices and analyzes\ndifferences in model performance as a result. Using DeltaNN, we conduct an\nempirical study of robustness analysis of three popular image recognition\nmodels using the ImageNet dataset. We report the impact in terms of\nmisclassifications and inference time differences across different settings. In\ntotal, we observed up to 100% output label differences across deep learning\nframeworks, and up to 81% unexpected performance degradation in terms of\ninference time, when applying compiler optimizations.\n","authors":["Nikolaos Louloudakis","Perry Gibson","José Cano","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2306.06208v5.pdf","comment":"11 pages, 10 figures, 2 tables"},{"id":"http://arxiv.org/abs/2403.17188v1","updated":"2024-03-25T21:01:29Z","published":"2024-03-25T21:01:29Z","title":"LOTUS: Evasive and Resilient Backdoor Attacks through Sub-Partitioning","summary":" Backdoor attack poses a significant security threat to Deep Learning\napplications. Existing attacks are often not evasive to established backdoor\ndetection techniques. This susceptibility primarily stems from the fact that\nthese attacks typically leverage a universal trigger pattern or transformation\nfunction, such that the trigger can cause misclassification for any input. In\nresponse to this, recent papers have introduced attacks using sample-specific\ninvisible triggers crafted through special transformation functions. While\nthese approaches manage to evade detection to some extent, they reveal\nvulnerability to existing backdoor mitigation techniques. To address and\nenhance both evasiveness and resilience, we introduce a novel backdoor attack\nLOTUS. Specifically, it leverages a secret function to separate samples in the\nvictim class into a set of partitions and applies unique triggers to different\npartitions. Furthermore, LOTUS incorporates an effective trigger focusing\nmechanism, ensuring only the trigger corresponding to the partition can induce\nthe backdoor behavior. Extensive experimental results show that LOTUS can\nachieve high attack success rate across 4 datasets and 7 model structures, and\neffectively evading 13 backdoor detection and mitigation techniques. 
The code\nis available at https://github.com/Megum1/LOTUS.\n","authors":["Siyuan Cheng","Guanhong Tao","Yingqi Liu","Guangyu Shen","Shengwei An","Shiwei Feng","Xiangzhe Xu","Kaiyuan Zhang","Shiqing Ma","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.17188v1.pdf","comment":"IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR\n 2024)"},{"id":"http://arxiv.org/abs/2403.17177v1","updated":"2024-03-25T20:44:01Z","published":"2024-03-25T20:44:01Z","title":"Brain Stroke Segmentation Using Deep Learning Models: A Comparative\n Study","summary":" Stroke segmentation plays a crucial role in the diagnosis and treatment of\nstroke patients by providing spatial information about affected brain regions\nand the extent of damage. Segmenting stroke lesions accurately is a challenging\ntask, given that conventional manual techniques are time consuming and prone to\nerrors. Recently, advanced deep models have been introduced for general medical\nimage segmentation, demonstrating promising results that surpass many state of\nthe art networks when evaluated on specific datasets. With the advent of the\nvision Transformers, several models have been introduced based on them, while\nothers have aimed to design better modules based on traditional convolutional\nlayers to extract long-range dependencies like Transformers. The question of\nwhether such high-level designs are necessary for all segmentation cases to\nachieve the best results remains unanswered. In this study, we selected four\ntypes of deep models that were recently proposed and evaluated their\nperformance for stroke segmentation: a pure Transformer-based architecture\n(DAE-Former), two advanced CNN-based models (LKA and DLKA) with attention\nmechanisms in their design, an advanced hybrid model that incorporates CNNs\nwith Transformers (FCT), and the well- known self-adaptive nnUNet framework\nwith its configuration based on given data. We examined their performance on\ntwo publicly available datasets, and found that the nnUNet achieved the best\nresults with the simplest design among all. Revealing the robustness issue of\nTransformers to such variabilities serves as a potential reason for their\nweaker performance. Furthermore, nnUNet's success underscores the significant\nimpact of preprocessing and postprocessing techniques in enhancing segmentation\nresults, surpassing the focus solely on architectural designs\n","authors":["Ahmed Soliman","Yousif Yousif","Ahmed Ibrahim","Yalda Zafari-Ghadim","Essam A. Rashed","Mohamed Mabrok"],"pdf_url":"https://arxiv.org/pdf/2403.17177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17176v1","updated":"2024-03-25T20:43:48Z","published":"2024-03-25T20:43:48Z","title":"Histogram Layers for Neural Engineered Features","summary":" In the computer vision literature, many effective histogram-based features\nhave been developed. These engineered features include local binary patterns\nand edge histogram descriptors among others and they have been shown to be\ninformative features for a variety of computer vision tasks. In this paper, we\nexplore whether these features can be learned through histogram layers embedded\nin a neural network and, therefore, be leveraged within deep learning\nframeworks. 
By using histogram features, local statistics of the feature maps\nfrom the convolutional neural networks can be used to better represent the data.\nWe present neural versions of local binary pattern and edge histogram\ndescriptors that jointly improve the feature representation and perform image\nclassification. Experiments are presented on benchmark and real-world datasets.\n","authors":["Joshua Peeples","Salim Al Kharsa","Luke Saleh","Alina Zare"],"pdf_url":"https://arxiv.org/pdf/2403.17176v1.pdf","comment":"11 pages, 7 figures, submitted for review"},{"id":"http://arxiv.org/abs/2403.17175v1","updated":"2024-03-25T20:43:23Z","published":"2024-03-25T20:43:23Z","title":"Engagement Measurement Based on Facial Landmarks and Spatial-Temporal\n Graph Convolutional Networks","summary":" Engagement in virtual learning is crucial for a variety of factors including\nlearner satisfaction, performance, and compliance with learning programs, but\nmeasuring it is a challenging task. There is therefore considerable interest in\nutilizing artificial intelligence and affective computing to measure engagement\nin natural settings as well as on a large scale. This paper introduces a novel,\nprivacy-preserving method for engagement measurement from videos. It uses\nfacial landmarks, which carry no personally identifiable information, extracted\nfrom videos via the MediaPipe deep learning solution. The extracted facial\nlandmarks are fed to a Spatial-Temporal Graph Convolutional Network (ST-GCN) to\noutput the engagement level of the learner in the video. To integrate the\nordinal nature of the engagement variable into the training process, ST-GCNs\nundergo training in a novel ordinal learning framework based on transfer\nlearning. Experimental results on two video student engagement measurement\ndatasets show the superiority of the proposed method compared to previous\nmethods with improved state-of-the-art on the EngageNet dataset with a 3.1%\nimprovement in four-class engagement level classification accuracy and on the\nOnline Student Engagement dataset with a 1.5% improvement in binary engagement\nclassification accuracy. The relatively lightweight ST-GCN and its integration\nwith the real-time MediaPipe deep learning solution make the proposed approach\ncapable of being deployed on virtual learning platforms and measuring\nengagement in real time.\n","authors":["Ali Abedi","Shehroz S. Khan"],"pdf_url":"https://arxiv.org/pdf/2403.17175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17173v1","updated":"2024-03-25T20:39:58Z","published":"2024-03-25T20:39:58Z","title":"Task2Box: Box Embeddings for Modeling Asymmetric Task Relationships","summary":" Modeling and visualizing relationships between tasks or datasets is an\nimportant step towards solving various meta-tasks such as dataset discovery,\nmulti-tasking, and transfer learning. However, many relationships, such as\ncontainment and transferability, are naturally asymmetric and current\napproaches for representation and visualization (e.g., t-SNE) do not readily\nsupport this. We propose Task2Box, an approach to represent tasks using box\nembeddings -- axis-aligned hyperrectangles in low dimensional spaces -- that\ncan capture asymmetric relationships between them through volumetric overlaps.\nWe show that Task2Box accurately predicts unseen hierarchical relationships\nbetween nodes in ImageNet and iNaturalist datasets, as well as transferability\nbetween tasks in the Taskonomy benchmark. 
We also show that box embeddings\nestimated from task representations (e.g., CLIP, Task2Vec, or attribute based)\ncan be used to predict relationships between unseen tasks more accurately than\nclassifiers trained on the same representations, as well as handcrafted\nasymmetric distances (e.g., KL divergence). This suggests that low-dimensional\nbox embeddings can effectively capture these task relationships and have the\nadded advantage of being interpretable. We use the approach to visualize\nrelationships among publicly available image classification datasets on popular\ndataset hosting platform called Hugging Face.\n","authors":["Rangel Daroya","Aaron Sun","Subhransu Maji"],"pdf_url":"https://arxiv.org/pdf/2403.17173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09857v2","updated":"2024-03-25T20:08:07Z","published":"2024-03-14T20:34:53Z","title":"Few-Shot Class Incremental Learning with Attention-Aware Self-Adaptive\n Prompt","summary":" Few-Shot Class-Incremental Learning (FSCIL) models aim to incrementally learn\nnew classes with scarce samples while preserving knowledge of old ones.\nExisting FSCIL methods usually fine-tune the entire backbone, leading to\noverfitting and hindering the potential to learn new classes. On the other\nhand, recent prompt-based CIL approaches alleviate forgetting by training\nprompts with sufficient data in each task. In this work, we propose a novel\nframework named Attention-aware Self-adaptive Prompt (ASP). ASP encourages\ntask-invariant prompts to capture shared knowledge by reducing specific\ninformation from the attention aspect. Additionally, self-adaptive\ntask-specific prompts in ASP provide specific information and transfer\nknowledge from old classes to new classes with an Information Bottleneck\nlearning objective. In summary, ASP prevents overfitting on base task and does\nnot require enormous data in few-shot incremental tasks. Extensive experiments\non three benchmark datasets validate that ASP consistently outperforms\nstate-of-the-art FSCIL and prompt-based CIL methods in terms of both learning\nnew classes and mitigating forgetting.\n","authors":["Chenxi Liu","Zhenyi Wang","Tianyi Xiong","Ruibo Chen","Yihan Wu","Junfeng Guo","Heng Huang"],"pdf_url":"https://arxiv.org/pdf/2403.09857v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07021v2","updated":"2024-03-25T19:46:25Z","published":"2023-10-10T21:16:29Z","title":"Pre-Trained Masked Image Model for Mobile Robot Navigation","summary":" 2D top-down maps are commonly used for the navigation and exploration of\nmobile robots through unknown areas. Typically, the robot builds the navigation\nmaps incrementally from local observations using onboard sensors. Recent works\nhave shown that predicting the structural patterns in the environment through\nlearning-based approaches can greatly enhance task efficiency. While many such\nworks build task-specific networks using limited datasets, we show that the\nexisting foundational vision networks can accomplish the same without any\nfine-tuning. Specifically, we use Masked Autoencoders, pre-trained on street\nimages, to present novel applications for field-of-view expansion, single-agent\ntopological exploration, and multi-agent exploration for indoor mapping, across\ndifferent input modalities. Our work motivates the use of foundational vision\nmodels for generalized structure prediction-driven applications, especially in\nthe dearth of training data. 
For more qualitative results see\nhttps://raaslab.org/projects/MIM4Robots.\n","authors":["Vishnu Dutt Sharma","Anukriti Singh","Pratap Tokekar"],"pdf_url":"https://arxiv.org/pdf/2310.07021v2.pdf","comment":"Accepted at ICRA 2024"},{"id":"http://arxiv.org/abs/2403.17128v1","updated":"2024-03-25T19:13:12Z","published":"2024-03-25T19:13:12Z","title":"Benchmarking Video Frame Interpolation","summary":" Video frame interpolation, the task of synthesizing new frames in between two\nor more given ones, is becoming an increasingly popular research target.\nHowever, the current evaluation of frame interpolation techniques is not ideal.\nDue to the plethora of test datasets available and inconsistent computation of\nerror metrics, a coherent and fair comparison across papers is very\nchallenging. Furthermore, new test sets have been proposed as part of method\npapers so they are unable to provide the in-depth evaluation of a dedicated\nbenchmarking paper. Another severe downside is that these test sets violate the\nassumption of linearity when given two input frames, making it impossible to\nsolve without an oracle. We hence strongly believe that the community would\ngreatly benefit from a benchmarking paper, which is what we propose.\nSpecifically, we present a benchmark which establishes consistent error metrics\nby utilizing a submission website that computes them, provides insights by\nanalyzing the interpolation quality with respect to various per-pixel\nattributes such as the motion magnitude, contains a carefully designed test set\nadhering to the assumption of linearity by utilizing synthetic data, and\nevaluates the computational efficiency in a coherent manner.\n","authors":["Simon Kiefhaber","Simon Niklaus","Feng Liu","Simone Schaub-Meyer"],"pdf_url":"https://arxiv.org/pdf/2403.17128v1.pdf","comment":"http://sniklaus.com/vfibench"},{"id":"http://arxiv.org/abs/2311.17286v2","updated":"2024-03-25T19:05:04Z","published":"2023-11-29T00:09:45Z","title":"LEOD: Label-Efficient Object Detection for Event Cameras","summary":" Object detection with event cameras benefits from the sensor's low latency\nand high dynamic range. However, it is costly to fully label event streams for\nsupervised training due to their high temporal resolution. To reduce this cost,\nwe present LEOD, the first method for label-efficient event-based detection.\nOur approach unifies weakly- and semi-supervised object detection with a\nself-training mechanism. We first utilize a detector pre-trained on limited\nlabels to produce pseudo ground truth on unlabeled events. Then, the detector\nis re-trained with both real and generated labels. Leveraging the temporal\nconsistency of events, we run bi-directional inference and apply tracking-based\npost-processing to enhance the quality of pseudo labels. To stabilize training\nagainst label noise, we further design a soft anchor assignment strategy. We\nintroduce new experimental protocols to evaluate the task of label-efficient\nevent-based detection on Gen1 and 1Mpx datasets. LEOD consistently outperforms\nsupervised baselines across various labeling ratios. For example, on Gen1, it\nimproves mAP by 8.6% and 7.8% for RVT-S trained with 1% and 2% labels. On 1Mpx,\nRVT-S with 10% labels even surpasses its fully-supervised counterpart using\n100% labels. LEOD maintains its effectiveness even when all labeled data are\navailable, reaching new state-of-the-art results. Finally, we show that our\nmethod readily scales to improve larger detectors as well. 
Code is released at\nhttps://github.com/Wuziyi616/LEOD\n","authors":["Ziyi Wu","Mathias Gehrig","Qing Lyu","Xudong Liu","Igor Gilitschenski"],"pdf_url":"https://arxiv.org/pdf/2311.17286v2.pdf","comment":"CVPR 2024. Code: https://github.com/Wuziyi616/LEOD"},{"id":"http://arxiv.org/abs/2311.16682v2","updated":"2024-03-25T18:54:18Z","published":"2023-11-28T10:53:55Z","title":"ContextSeg: Sketch Semantic Segmentation by Querying the Context with\n Attention","summary":" Sketch semantic segmentation is a well-explored and pivotal problem in\ncomputer vision involving the assignment of pre-defined part labels to\nindividual strokes. This paper presents ContextSeg - a simple yet highly\neffective approach to tackling this problem with two stages. In the first\nstage, to better encode the shape and positional information of strokes, we\npropose to predict an extra dense distance field in an autoencoder network to\nreinforce structural information learning. In the second stage, we treat an\nentire stroke as a single entity and label a group of strokes within the same\nsemantic part using an auto-regressive Transformer with the default attention\nmechanism. By group-based labeling, our method can fully leverage the context\ninformation when making decisions for the remaining groups of strokes. Our\nmethod achieves the best segmentation accuracy compared with state-of-the-art\napproaches on two representative datasets and has been extensively evaluated\ndemonstrating its superior performance. Additionally, we offer insights into\nsolving part imbalance in training data and the preliminary experiment on\ncross-category training, which can inspire future research in this field.\n","authors":["Jiawei Wang","Changjian Li"],"pdf_url":"https://arxiv.org/pdf/2311.16682v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12151v2","updated":"2024-03-25T18:50:06Z","published":"2024-03-18T18:08:44Z","title":"Fusing Domain-Specific Content from Large Language Models into Knowledge\n Graphs for Enhanced Zero Shot Object State Classification","summary":" Domain-specific knowledge can significantly contribute to addressing a wide\nvariety of vision tasks. However, the generation of such knowledge entails\nconsiderable human labor and time costs. This study investigates the potential\nof Large Language Models (LLMs) in generating and providing domain-specific\ninformation through semantic embeddings. To achieve this, an LLM is integrated\ninto a pipeline that utilizes Knowledge Graphs and pre-trained semantic vectors\nin the context of the Vision-based Zero-shot Object State Classification task.\nWe thoroughly examine the behavior of the LLM through an extensive ablation\nstudy. Our findings reveal that the integration of LLM-based embeddings, in\ncombination with general-purpose pre-trained embeddings, leads to substantial\nperformance improvements. 
Drawing insights from this ablation study, we conduct\na comparative analysis against competing models, thereby highlighting the\nstate-of-the-art performance achieved by the proposed approach.\n","authors":["Filippos Gouidis","Katerina Papantoniou","Konstantinos Papoutsakis Theodore Patkos","Antonis Argyros","Dimitris Plexousakis"],"pdf_url":"https://arxiv.org/pdf/2403.12151v2.pdf","comment":"Accepted at the AAAI-MAKE 24"},{"id":"http://arxiv.org/abs/2312.12730v2","updated":"2024-03-25T18:49:52Z","published":"2023-12-20T02:58:25Z","title":"A Closer Look at the Few-Shot Adaptation of Large Vision-Language Models","summary":" Efficient transfer learning (ETL) is receiving increasing attention to adapt\nlarge pre-trained language-vision models on downstream tasks with a few labeled\nsamples. While significant progress has been made, we reveal that\nstate-of-the-art ETL approaches exhibit strong performance only in\nnarrowly-defined experimental setups, and with a careful adjustment of\nhyperparameters based on a large corpus of labeled samples. In particular, we\nmake two interesting, and surprising empirical observations. First, to\noutperform a simple Linear Probing baseline, these methods require to optimize\ntheir hyper-parameters on each target task. And second, they typically\nunderperform -- sometimes dramatically -- standard zero-shot predictions in the\npresence of distributional drifts. Motivated by the unrealistic assumptions\nmade in the existing literature, i.e., access to a large validation set and\ncase-specific grid-search for optimal hyperparameters, we propose a novel\napproach that meets the requirements of real-world scenarios. More concretely,\nwe introduce a CLass-Adaptive linear Probe (CLAP) objective, whose balancing\nterm is optimized via an adaptation of the general Augmented Lagrangian method\ntailored to this context. We comprehensively evaluate CLAP on a broad span of\ndatasets and scenarios, demonstrating that it consistently outperforms SoTA\napproaches, while yet being a much more efficient alternative.\n","authors":["Julio Silva-Rodríguez","Sina Hajimiri","Ismail Ben Ayed","Jose Dolz"],"pdf_url":"https://arxiv.org/pdf/2312.12730v2.pdf","comment":"CVPR 2024. Code: https://github.com/jusiro/CLAP"},{"id":"http://arxiv.org/abs/2403.17103v1","updated":"2024-03-25T18:41:43Z","published":"2024-03-25T18:41:43Z","title":"Animal Avatars: Reconstructing Animatable 3D Animals from Casual Videos","summary":" We present a method to build animatable dog avatars from monocular videos.\nThis is challenging as animals display a range of (unpredictable) non-rigid\nmovements and have a variety of appearance details (e.g., fur, spots, tails).\nWe develop an approach that links the video frames via a 4D solution that\njointly solves for animal's pose variation, and its appearance (in a canonical\npose). To this end, we significantly improve the quality of template-based\nshape fitting by endowing the SMAL parametric model with Continuous Surface\nEmbeddings, which brings image-to-mesh reprojection constaints that are denser,\nand thus stronger, than the previously used sparse semantic keypoint\ncorrespondences. To model appearance, we propose an implicit duplex-mesh\ntexture that is defined in the canonical pose, but can be deformed using SMAL\npose coefficients and later rendered to enforce a photometric compatibility\nwith the input video frames. 
On the challenging CoP3D and APTv2 datasets, we\ndemonstrate superior results (both in terms of pose estimates and predicted\nappearance) to existing template-free (RAC) and template-based approaches\n(BARC, BITE).\n","authors":["Remy Sabathier","Niloy J. Mitra","David Novotny"],"pdf_url":"https://arxiv.org/pdf/2403.17103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10367v2","updated":"2024-03-25T18:33:01Z","published":"2024-03-15T14:59:21Z","title":"Testing MediaPipe Holistic for Linguistic Analysis of Nonmanual Markers\n in Sign Languages","summary":" Advances in Deep Learning have made possible reliable landmark tracking of\nhuman bodies and faces that can be used for a variety of tasks. We test a\nrecent Computer Vision solution, MediaPipe Holistic (MPH), to find out if its\ntracking of the facial features is reliable enough for a linguistic analysis of\ndata from sign languages, and compare it to an older solution (OpenFace, OF).\nWe use an existing data set of sentences in Kazakh-Russian Sign Language and a\nnewly created small data set of videos with head tilts and eyebrow movements.\nWe find that MPH does not perform well enough for linguistic analysis of\neyebrow movement - but in a different way from OF, which is also performing\npoorly without correction. We reiterate a previous proposal to train additional\ncorrection models to overcome these limitations.\n","authors":["Anna Kuznetsova","Vadim Kimmelman"],"pdf_url":"https://arxiv.org/pdf/2403.10367v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17094v1","updated":"2024-03-25T18:32:41Z","published":"2024-03-25T18:32:41Z","title":"SynFog: A Photo-realistic Synthetic Fog Dataset based on End-to-end\n Imaging Simulation for Advancing Real-World Defogging in Autonomous Driving","summary":" To advance research in learning-based defogging algorithms, various synthetic\nfog datasets have been developed. However, existing datasets created using the\nAtmospheric Scattering Model (ASM) or real-time rendering engines often\nstruggle to produce photo-realistic foggy images that accurately mimic the\nactual imaging process. This limitation hinders the effective generalization of\nmodels from synthetic to real data. In this paper, we introduce an end-to-end\nsimulation pipeline designed to generate photo-realistic foggy images. This\npipeline comprehensively considers the entire physically-based foggy scene\nimaging process, closely aligning with real-world image capture methods. Based\non this pipeline, we present a new synthetic fog dataset named SynFog, which\nfeatures both sky light and active lighting conditions, as well as three levels\nof fog density. Experimental results demonstrate that models trained on SynFog\nexhibit superior performance in visual perception and detection accuracy\ncompared to others when applied to real-world foggy images.\n","authors":["Yiming Xie","Henglu Wei","Zhenyi Liu","Xiaoyu Wang","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2403.17094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17084v1","updated":"2024-03-25T18:18:12Z","published":"2024-03-25T18:18:12Z","title":"A Comparative Analysis of Visual Odometry in Virtual and Real-World\n Railways Environments","summary":" Perception tasks play a crucial role in the development of automated\noperations and systems across multiple application fields. 
In the railway\ntransportation domain, these tasks can improve the safety, reliability, and\nefficiency of various operations, including train localization, signal\nrecognition, and track discrimination. However, collecting considerable and\nprecisely labeled datasets for testing such novel algorithms poses extreme\nchallenges in the railway environment due to the severe restrictions in\naccessing the infrastructures and the practical difficulties associated with\nproperly equipping trains with the required sensors, such as cameras and\nLiDARs. The remarkable innovations of graphic engine tools offer new solutions\nto craft realistic synthetic datasets. To illustrate the advantages of\nemploying graphic simulation for early-stage testing of perception tasks in the\nrailway domain, this paper presents a comparative analysis of the performance\nof a SLAM algorithm applied both in a virtual synthetic environment and a\nreal-world scenario. The analysis leverages virtual railway environments\ncreated with the latest version of Unreal Engine, facilitating data collection\nand allowing the examination of challenging scenarios, including\nlow-visibility, dangerous operational modes, and complex environments. The\nresults highlight the feasibility and potential of graphic simulation to\nadvance perception tasks in the railway domain.\n","authors":["Gianluca D'Amico","Mauro Marinoni","Giorgio Buttazzo"],"pdf_url":"https://arxiv.org/pdf/2403.17084v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17083v1","updated":"2024-03-25T18:16:34Z","published":"2024-03-25T18:16:34Z","title":"A Study in Dataset Pruning for Image Super-Resolution","summary":" In image Super-Resolution (SR), relying on large datasets for training is a\ndouble-edged sword. While offering rich training material, they also demand\nsubstantial computational and storage resources. In this work, we analyze\ndataset pruning as a solution to these challenges. We introduce a novel\napproach that reduces a dataset to a core-set of training samples, selected\nbased on their loss values as determined by a simple pre-trained SR model. By\nfocusing the training on just 50% of the original dataset, specifically on the\nsamples characterized by the highest loss values, we achieve results comparable\nto or even surpassing those obtained from training on the entire dataset.\nInterestingly, our analysis reveals that the top 5% of samples with the highest\nloss values negatively affect the training process. Excluding these samples and\nadjusting the selection to favor easier samples further enhances training\noutcomes. Our work opens new perspectives to the untapped potential of dataset\npruning in image SR. It suggests that careful selection of training data based\non loss-value metrics can lead to better SR models, challenging the\nconventional wisdom that more data inevitably leads to better performance.\n","authors":["Brian B. Moser","Federico Raue","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2403.17083v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16703v3","updated":"2024-03-25T18:03:41Z","published":"2023-11-28T11:27:48Z","title":"CADTalk: An Algorithm and Benchmark for Semantic Commenting of CAD\n Programs","summary":" CAD programs are a popular way to compactly encode shapes as a sequence of\noperations that are easy to parametrically modify. However, without sufficient\nsemantic comments and structure, such programs can be challenging to\nunderstand, let alone modify. 
We introduce the problem of semantic commenting\nCAD programs, wherein the goal is to segment the input program into code blocks\ncorresponding to semantically meaningful shape parts and assign a semantic\nlabel to each block. We solve the problem by combining program parsing with\nvisual-semantic analysis afforded by recent advances in foundational language\nand vision models. Specifically, by executing the input programs, we create\nshapes, which we use to generate conditional photorealistic images to make use\nof semantic annotators for such images. We then distill the information across\nthe images and link back to the original programs to semantically comment on\nthem. Additionally, we collected and annotated a benchmark dataset, CADTalk,\nconsisting of 5,288 machine-made programs and 45 human-made programs with\nground truth semantic comments. We extensively evaluated our approach, compared\nit to a GPT-based baseline, and an open-set shape segmentation baseline, and\nreported an 83.24% accuracy on the new CADTalk dataset. Code and data:\nhttps://enigma-li.github.io/CADTalk/.\n","authors":["Haocheng Yuan","Jing Xu","Hao Pan","Adrien Bousseau","Niloy J. Mitra","Changjian Li"],"pdf_url":"https://arxiv.org/pdf/2311.16703v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17064v1","updated":"2024-03-25T18:00:42Z","published":"2024-03-25T18:00:42Z","title":"Continuous, Subject-Specific Attribute Control in T2I Models by\n Identifying Semantic Directions","summary":" In recent years, advances in text-to-image (T2I) diffusion models have\nsubstantially elevated the quality of their generated images. However,\nachieving fine-grained control over attributes remains a challenge due to the\nlimitations of natural language prompts (such as no continuous set of\nintermediate descriptions existing between ``person'' and ``old person''). Even\nthough many methods were introduced that augment the model or generation\nprocess to enable such control, methods that do not require a fixed reference\nimage are limited to either enabling global fine-grained attribute expression\ncontrol or coarse attribute expression control localized to specific subjects,\nnot both simultaneously. We show that there exist directions in the commonly\nused token-level CLIP text embeddings that enable fine-grained subject-specific\ncontrol of high-level attributes in text-to-image models. Based on this\nobservation, we introduce one efficient optimization-free and one robust\noptimization-based method to identify these directions for specific attributes\nfrom contrastive text prompts. We demonstrate that these directions can be used\nto augment the prompt text input with fine-grained control over attributes of\nspecific subjects in a compositional manner (control over multiple attributes\nof a single subject) without having to adapt the diffusion model. Project page:\nhttps://compvis.github.io/attribute-control. 
Code is available at\nhttps://github.com/CompVis/attribute-control.\n","authors":["Stefan Andreas Baumann","Felix Krause","Michael Neumayr","Nick Stracke","Vincent Tao Hu","Björn Ommer"],"pdf_url":"https://arxiv.org/pdf/2403.17064v1.pdf","comment":"Project page: https://compvis.github.io/attribute-control"},{"id":"http://arxiv.org/abs/2403.17010v1","updated":"2024-03-25T17:59:59Z","published":"2024-03-25T17:59:59Z","title":"Calib3D: Calibrating Model Preferences for Reliable 3D Scene\n Understanding","summary":" Safety-critical 3D scene understanding tasks necessitate not only accurate\nbut also confident predictions from 3D perception models. This study introduces\nCalib3D, a pioneering effort to benchmark and scrutinize the reliability of 3D\nscene understanding models from an uncertainty estimation viewpoint. We\ncomprehensively evaluate 28 state-of-the-art models across 10 diverse 3D\ndatasets, uncovering insightful phenomena that cope with both the aleatoric and\nepistemic uncertainties in 3D scene understanding. We discover that despite\nachieving impressive levels of accuracy, existing models frequently fail to\nprovide reliable uncertainty estimates -- a pitfall that critically undermines\ntheir applicability in safety-sensitive contexts. Through extensive analysis of\nkey factors such as network capacity, LiDAR representations, rasterization\nresolutions, and 3D data augmentation techniques, we correlate these aspects\ndirectly with the model calibration efficacy. Furthermore, we introduce DeptS,\na novel depth-aware scaling approach aimed at enhancing 3D model calibration.\nExtensive experiments across a wide range of configurations validate the\nsuperiority of our method. We hope this work could serve as a cornerstone for\nfostering reliable 3D scene understanding. Code and benchmark toolkits are\npublicly available.\n","authors":["Lingdong Kong","Xiang Xu","Jun Cen","Wenwei Zhang","Liang Pan","Kai Chen","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2403.17010v1.pdf","comment":"Preprint; 37 pages, 8 figures, 11 tables; Code at\n https://github.com/ldkong1205/Calib3D"},{"id":"http://arxiv.org/abs/2403.17009v1","updated":"2024-03-25T17:59:58Z","published":"2024-03-25T17:59:58Z","title":"Optimizing LiDAR Placements for Robust Driving Perception in Adverse\n Conditions","summary":" The robustness of driving perception systems under unprecedented conditions\nis crucial for safety-critical usages. Latest advancements have prompted\nincreasing interests towards multi-LiDAR perception. However, prevailing\ndriving datasets predominantly utilize single-LiDAR systems and collect data\ndevoid of adverse conditions, failing to capture the complexities of real-world\nenvironments accurately. Addressing these gaps, we proposed Place3D, a\nfull-cycle pipeline that encompasses LiDAR placement optimization, data\ngeneration, and downstream evaluations. Our framework makes three appealing\ncontributions. 1) To identify the most effective configurations for multi-LiDAR\nsystems, we introduce a Surrogate Metric of the Semantic Occupancy Grids\n(M-SOG) to evaluate LiDAR placement quality. 2) Leveraging the M-SOG metric, we\npropose a novel optimization strategy to refine multi-LiDAR placements. 3)\nCentered around the theme of multi-condition multi-LiDAR perception, we collect\na 364,000-frame dataset from both clean and adverse conditions. Extensive\nexperiments demonstrate that LiDAR placements optimized using our approach\noutperform various baselines. 
We showcase exceptional robustness in both 3D\nobject detection and LiDAR semantic segmentation tasks, under diverse adverse\nweather and sensor failure conditions. Code and benchmark toolkit are publicly\navailable.\n","authors":["Ye Li","Lingdong Kong","Hanjiang Hu","Xiaohao Xu","Xiaonan Huang"],"pdf_url":"https://arxiv.org/pdf/2403.17009v1.pdf","comment":"Preprint; 40 pages, 11 figures, 15 tables; Code at\n https://github.com/ywyeli/Place3D"},{"id":"http://arxiv.org/abs/2403.17008v1","updated":"2024-03-25T17:59:57Z","published":"2024-03-25T17:59:57Z","title":"FlashFace: Human Image Personalization with High-fidelity Identity\n Preservation","summary":" This work presents FlashFace, a practical tool with which users can easily\npersonalize their own photos on the fly by providing one or a few reference\nface images and a text prompt. Our approach is distinguishable from existing\nhuman photo customization methods by higher-fidelity identity preservation and\nbetter instruction following, benefiting from two subtle designs. First, we\nencode the face identity into a series of feature maps instead of one image\ntoken as in prior arts, allowing the model to retain more details of the\nreference faces (e.g., scars, tattoos, and face shape ). Second, we introduce a\ndisentangled integration strategy to balance the text and image guidance during\nthe text-to-image generation process, alleviating the conflict between the\nreference faces and the text prompts (e.g., personalizing an adult into a\n\"child\" or an \"elder\"). Extensive experimental results demonstrate the\neffectiveness of our method on various applications, including human image\npersonalization, face swapping under language prompts, making virtual\ncharacters into real people, etc. Project Page:\nhttps://jshilong.github.io/flashface-page.\n","authors":["Shilong Zhang","Lianghua Huang","Xi Chen","Yifei Zhang","Zhi-Fan Wu","Yutong Feng","Wei Wang","Yujun Shen","Yu Liu","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2403.17008v1.pdf","comment":"Project Page:https://jshilong.github.io/flashface-page"},{"id":"http://arxiv.org/abs/2403.17007v1","updated":"2024-03-25T17:59:42Z","published":"2024-03-25T17:59:42Z","title":"DreamLIP: Language-Image Pre-training with Long Captions","summary":" Language-image pre-training largely relies on how precisely and thoroughly a\ntext describes its paired image. In practice, however, the contents of an image\ncan be so rich that well describing them requires lengthy captions (e.g., with\n10 sentences), which are usually missing in existing datasets. Consequently,\nthere are currently no clear evidences on whether and how language-image\npre-training could benefit from long captions. To figure this out, we first\nre-caption 30M images with detailed descriptions using a pre-trained\nMulti-modality Large Language Model (MLLM), and then study the usage of the\nresulting captions under a contrastive learning framework. We observe that,\neach sentence within a long caption is very likely to describe the image\npartially (e.g., an object). Motivated by this, we propose to dynamically\nsample sub-captions from the text label to construct multiple positive pairs,\nand introduce a grouping loss to match the embeddings of each sub-caption with\nits corresponding local image patches in a self-supervised manner. 
Experimental\nresults on a wide rage of downstream tasks demonstrate the consistent\nsuperiority of our method, termed DreamLIP, over previous alternatives,\nhighlighting its fine-grained representational capacity. It is noteworthy that,\non the tasks of image-text retrieval and semantic segmentation, our model\ntrained with 30M image-text pairs achieves on par or even better performance\nthan CLIP trained with 400M pairs. Project page is available at\nhttps://zyf0619sjtu.github.io/dream-lip.\n","authors":["Kecheng Zheng","Yifei Zhang","Wei Wu","Fan Lu","Shuailei Ma","Xin Jin","Wei Chen","Yujun Shen"],"pdf_url":"https://arxiv.org/pdf/2403.17007v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17006v1","updated":"2024-03-25T17:59:41Z","published":"2024-03-25T17:59:41Z","title":"Invertible Diffusion Models for Compressed Sensing","summary":" While deep neural networks (NN) significantly advance image compressed\nsensing (CS) by improving reconstruction quality, the necessity of training\ncurrent CS NNs from scratch constrains their effectiveness and hampers rapid\ndeployment. Although recent methods utilize pre-trained diffusion models for\nimage reconstruction, they struggle with slow inference and restricted\nadaptability to CS. To tackle these challenges, this paper proposes Invertible\nDiffusion Models (IDM), a novel efficient, end-to-end diffusion-based CS\nmethod. IDM repurposes a large-scale diffusion sampling process as a\nreconstruction model, and finetunes it end-to-end to recover original images\ndirectly from CS measurements, moving beyond the traditional paradigm of\none-step noise estimation learning. To enable such memory-intensive end-to-end\nfinetuning, we propose a novel two-level invertible design to transform both\n(1) the multi-step sampling process and (2) the noise estimation U-Net in each\nstep into invertible networks. As a result, most intermediate features are\ncleared during training to reduce up to 93.8% GPU memory. In addition, we\ndevelop a set of lightweight modules to inject measurements into noise\nestimator to further facilitate reconstruction. Experiments demonstrate that\nIDM outperforms existing state-of-the-art CS networks by up to 2.64dB in PSNR.\nCompared to the recent diffusion model-based approach DDNM, our IDM achieves up\nto 10.09dB PSNR gain and 14.54 times faster inference.\n","authors":["Bin Chen","Zhenyu Zhang","Weiqi Li","Chen Zhao","Jiwen Yu","Shijie Zhao","Jie Chen","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.17006v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17005v1","updated":"2024-03-25T17:59:40Z","published":"2024-03-25T17:59:40Z","title":"TRIP: Temporal Residual Learning with Image Noise Prior for\n Image-to-Video Diffusion Models","summary":" Recent advances in text-to-video generation have demonstrated the utility of\npowerful diffusion models. Nevertheless, the problem is not trivial when\nshaping diffusion models to animate static image (i.e., image-to-video\ngeneration). The difficulty originates from the aspect that the diffusion\nprocess of subsequent animated frames should not only preserve the faithful\nalignment with the given image but also pursue temporal coherence among\nadjacent frames. To alleviate this, we present TRIP, a new recipe of\nimage-to-video diffusion paradigm that pivots on image noise prior derived from\nstatic image to jointly trigger inter-frame relational reasoning and ease the\ncoherent temporal modeling via temporal residual learning. 
Technically, the\nimage noise prior is first attained through one-step backward diffusion process\nbased on both static image and noised video latent codes. Next, TRIP executes a\nresidual-like dual-path scheme for noise prediction: 1) a shortcut path that\ndirectly takes image noise prior as the reference noise of each frame to\namplify the alignment between the first frame and subsequent frames; 2) a\nresidual path that employs 3D-UNet over noised video and static image latent\ncodes to enable inter-frame relational reasoning, thereby easing the learning\nof the residual noise for each frame. Furthermore, both reference and residual\nnoise of each frame are dynamically merged via attention mechanism for final\nvideo generation. Extensive experiments on WebVid-10M, DTDB and MSR-VTT\ndatasets demonstrate the effectiveness of our TRIP for image-to-video\ngeneration. Please see our project page at https://trip-i2v.github.io/TRIP/.\n","authors":["Zhongwei Zhang","Fuchen Long","Yingwei Pan","Zhaofan Qiu","Ting Yao","Yang Cao","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2403.17005v1.pdf","comment":"CVPR 2024; Project page: https://trip-i2v.github.io/TRIP/"},{"id":"http://arxiv.org/abs/2403.17004v1","updated":"2024-03-25T17:59:35Z","published":"2024-03-25T17:59:35Z","title":"SD-DiT: Unleashing the Power of Self-supervised Discrimination in\n Diffusion Transformer","summary":" Diffusion Transformer (DiT) has emerged as the new trend of generative\ndiffusion models on image generation. In view of extremely slow convergence in\ntypical DiT, recent breakthroughs have been driven by mask strategy that\nsignificantly improves the training efficiency of DiT with additional\nintra-image contextual learning. Despite this progress, mask strategy still\nsuffers from two inherent limitations: (a) training-inference discrepancy and\n(b) fuzzy relations between mask reconstruction & generative diffusion process,\nresulting in sub-optimal training of DiT. In this work, we address these\nlimitations by novelly unleashing the self-supervised discrimination knowledge\nto boost DiT training. Technically, we frame our DiT in a teacher-student\nmanner. The teacher-student discriminative pairs are built on the diffusion\nnoises along the same Probability Flow Ordinary Differential Equation (PF-ODE).\nInstead of applying mask reconstruction loss over both DiT encoder and decoder,\nwe decouple DiT encoder and decoder to separately tackle discriminative and\ngenerative objectives. In particular, by encoding discriminative pairs with\nstudent and teacher DiT encoders, a new discriminative loss is designed to\nencourage the inter-image alignment in the self-supervised embedding space.\nAfter that, student samples are fed into student DiT decoder to perform the\ntypical generative diffusion task. 
Extensive experiments are conducted on\nImageNet dataset, and our method achieves a competitive balance between\ntraining cost and generative capacity.\n","authors":["Rui Zhu","Yingwei Pan","Yehao Li","Ting Yao","Zhenglong Sun","Tao Mei","Chang Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2403.17004v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17001v1","updated":"2024-03-25T17:59:31Z","published":"2024-03-25T17:59:31Z","title":"VP3D: Unleashing 2D Visual Prompt for Text-to-3D Generation","summary":" Recent innovations on text-to-3D generation have featured Score Distillation\nSampling (SDS), which enables the zero-shot learning of implicit 3D models\n(NeRF) by directly distilling prior knowledge from 2D diffusion models.\nHowever, current SDS-based models still struggle with intricate text prompts\nand commonly result in distorted 3D models with unrealistic textures or\ncross-view inconsistency issues. In this work, we introduce a novel Visual\nPrompt-guided text-to-3D diffusion model (VP3D) that explicitly unleashes the\nvisual appearance knowledge in 2D visual prompt to boost text-to-3D generation.\nInstead of solely supervising SDS with text prompt, VP3D first capitalizes on\n2D diffusion model to generate a high-quality image from input text, which\nsubsequently acts as visual prompt to strengthen SDS optimization with explicit\nvisual appearance. Meanwhile, we couple the SDS optimization with additional\ndifferentiable reward function that encourages rendering images of 3D models to\nbetter visually align with 2D visual prompt and semantically match with text\nprompt. Through extensive experiments, we show that the 2D Visual Prompt in our\nVP3D significantly eases the learning of visual appearance of 3D models and\nthus leads to higher visual fidelity with more detailed textures. It is also\nappealing in view that when replacing the self-generating visual prompt with a\ngiven reference image, VP3D is able to trigger a new task of stylized\ntext-to-3D generation. Our project page is available at\nhttps://vp3d-cvpr24.github.io.\n","authors":["Yang Chen","Yingwei Pan","Haibo Yang","Ting Yao","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2403.17001v1.pdf","comment":"CVPR 2024; Project page: https://vp3d-cvpr24.github.io"},{"id":"http://arxiv.org/abs/2403.17000v1","updated":"2024-03-25T17:59:26Z","published":"2024-03-25T17:59:26Z","title":"Learning Spatial Adaptation and Temporal Coherence in Diffusion Models\n for Video Super-Resolution","summary":" Diffusion models are just at a tipping point for image super-resolution task.\nNevertheless, it is not trivial to capitalize on diffusion models for video\nsuper-resolution which necessitates not only the preservation of visual\nappearance from low-resolution to high-resolution videos, but also the temporal\nconsistency across video frames. In this paper, we propose a novel approach,\npursuing Spatial Adaptation and Temporal Coherence (SATeCo), for video\nsuper-resolution. SATeCo pivots on learning spatial-temporal guidance from\nlow-resolution videos to calibrate both latent-space high-resolution video\ndenoising and pixel-space video reconstruction. Technically, SATeCo freezes all\nthe parameters of the pre-trained UNet and VAE, and only optimizes two\ndeliberately-designed spatial feature adaptation (SFA) and temporal feature\nalignment (TFA) modules, in the decoder of UNet and VAE. 
SFA modulates frame\nfeatures via adaptively estimating affine parameters for each pixel,\nguaranteeing pixel-wise guidance for high-resolution frame synthesis. TFA\ndelves into feature interaction within a 3D local window (tubelet) through\nself-attention, and executes cross-attention between tubelet and its\nlow-resolution counterpart to guide temporal feature alignment. Extensive\nexperiments conducted on the REDS4 and Vid4 datasets demonstrate the\neffectiveness of our approach.\n","authors":["Zhikai Chen","Fuchen Long","Zhaofan Qiu","Ting Yao","Wengang Zhou","Jiebo Luo","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2403.17000v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16999v1","updated":"2024-03-25T17:59:23Z","published":"2024-03-25T17:59:23Z","title":"Visual CoT: Unleashing Chain-of-Thought Reasoning in Multi-Modal\n Language Models","summary":" This paper presents Visual CoT, a novel pipeline that leverages the reasoning\ncapabilities of multi-modal large language models (MLLMs) by incorporating\nvisual Chain-of-Thought (CoT) reasoning. While MLLMs have shown promise in\nvarious visual tasks, they often lack interpretability and struggle with\ncomplex visual inputs. To address these challenges, we propose a multi-turn\nprocessing pipeline that dynamically focuses on visual inputs and provides\ninterpretable thoughts. We collect and introduce the Visual CoT dataset\ncomprising 373k question-answer pairs, annotated with intermediate bounding\nboxes highlighting key regions essential for answering the questions.\nImportantly, the introduced benchmark is capable of evaluating MLLMs in\nscenarios requiring specific local region identification. Extensive experiments\ndemonstrate the effectiveness of our framework and shed light on better\ninference strategies. The Visual CoT dataset, benchmark, and pre-trained models\nare available to foster further research in this direction.\n","authors":["Hao Shao","Shengju Qian","Han Xiao","Guanglu Song","Zhuofan Zong","Letian Wang","Yu Liu","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2403.16999v1.pdf","comment":"Code: https://github.com/deepcs233/Visual-CoT"},{"id":"http://arxiv.org/abs/2403.16998v1","updated":"2024-03-25T17:59:09Z","published":"2024-03-25T17:59:09Z","title":"Understanding Long Videos in One Multimodal Language Model Pass","summary":" Large Language Models (LLMs), known to contain a strong awareness of world\nknowledge, have allowed recent approaches to achieve excellent performance on\nLong-Video Understanding benchmarks, but at high inference costs. In this work,\nwe first propose Likelihood Selection, a simple technique that unlocks faster\ninference in autoregressive LLMs for multiple-choice tasks common in long-video\nbenchmarks. In addition to faster inference, we discover the resulting models\nto yield surprisingly good accuracy on long-video tasks, even with no video\nspecific information. Building on this, we inject video-specific object-centric\ninformation extracted from off-the-shelf pre-trained models and utilize natural\nlanguage as a medium for information fusion. Our resulting Multimodal Video\nUnderstanding (MVU) framework demonstrates state-of-the-art performance across\nlong-video and fine-grained action recognition benchmarks. Code available at:\nhttps://github.com/kahnchana/mvu\n","authors":["Kanchana Ranasinghe","Xiang Li","Kumara Kahatapitiya","Michael S. 
Ryoo"],"pdf_url":"https://arxiv.org/pdf/2403.16998v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2403.16997v1","updated":"2024-03-25T17:59:03Z","published":"2024-03-25T17:59:03Z","title":"Composed Video Retrieval via Enriched Context and Discriminative\n Embeddings","summary":" Composed video retrieval (CoVR) is a challenging problem in computer vision\nwhich has recently highlighted the integration of modification text with visual\nqueries for more sophisticated video search in large databases. Existing works\npredominantly rely on visual queries combined with modification text to\ndistinguish relevant videos. However, such a strategy struggles to fully\npreserve the rich query-specific context in retrieved target videos and only\nrepresents the target video using visual embedding. We introduce a novel CoVR\nframework that leverages detailed language descriptions to explicitly encode\nquery-specific contextual information and learns discriminative embeddings of\nvision only, text only and vision-text for better alignment to accurately\nretrieve matched target videos. Our proposed framework can be flexibly employed\nfor both composed video (CoVR) and image (CoIR) retrieval tasks. Experiments on\nthree datasets show that our approach obtains state-of-the-art performance for\nboth CovR and zero-shot CoIR tasks, achieving gains as high as around 7% in\nterms of recall@K=1 score. Our code, models, detailed language descriptions for\nWebViD-CoVR dataset are available at\n\\url{https://github.com/OmkarThawakar/composed-video-retrieval}\n","authors":["Omkar Thawakar","Muzammal Naseer","Rao Muhammad Anwer","Salman Khan","Michael Felsberg","Mubarak Shah","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.16997v1.pdf","comment":"CVPR-2024"},{"id":"http://arxiv.org/abs/2403.16996v1","updated":"2024-03-25T17:59:01Z","published":"2024-03-25T17:59:01Z","title":"DriveCoT: Integrating Chain-of-Thought Reasoning with End-to-End Driving","summary":" End-to-end driving has made significant progress in recent years,\ndemonstrating benefits such as system simplicity and competitive driving\nperformance under both open-loop and closed-loop settings. Nevertheless, the\nlack of interpretability and controllability in its driving decisions hinders\nreal-world deployment for end-to-end driving systems. In this paper, we collect\na comprehensive end-to-end driving dataset named DriveCoT, leveraging the CARLA\nsimulator. It contains sensor data, control decisions, and chain-of-thought\nlabels to indicate the reasoning process. We utilize the challenging driving\nscenarios from the CARLA leaderboard 2.0, which involve high-speed driving and\nlane-changing, and propose a rule-based expert policy to control the vehicle\nand generate ground truth labels for its reasoning process across different\ndriving aspects and the final decisions. This dataset can serve as an open-loop\nend-to-end driving benchmark, enabling the evaluation of accuracy in various\nchain-of-thought aspects and the final decision. In addition, we propose a\nbaseline model called DriveCoT-Agent, trained on our dataset, to generate\nchain-of-thought predictions and final decisions. 
The trained model exhibits\nstrong performance in both open-loop and closed-loop evaluations, demonstrating\nthe effectiveness of our proposed dataset.\n","authors":["Tianqi Wang","Enze Xie","Ruihang Chu","Zhenguo Li","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2403.16996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14456v6","updated":"2024-03-25T17:58:59Z","published":"2022-11-26T02:15:35Z","title":"TetraSphere: A Neural Descriptor for O(3)-Invariant Point Cloud Analysis","summary":" In many practical applications, 3D point cloud analysis requires rotation\ninvariance. In this paper, we present a learnable descriptor invariant under 3D\nrotations and reflections, i.e., the O(3) actions, utilizing the recently\nintroduced steerable 3D spherical neurons and vector neurons. Specifically, we\npropose an embedding of the 3D spherical neurons into 4D vector neurons, which\nleverages end-to-end training of the model. In our approach, we perform\nTetraTransform--an equivariant embedding of the 3D input into 4D, constructed\nfrom the steerable neurons--and extract deeper O(3)-equivariant features using\nvector neurons. This integration of the TetraTransform into the VN-DGCNN\nframework, termed TetraSphere, negligibly increases the number of parameters by\nless than 0.0002%. TetraSphere sets a new state-of-the-art performance\nclassifying randomly rotated real-world object scans of the challenging subsets\nof ScanObjectNN. Additionally, TetraSphere outperforms all equivariant methods\non randomly rotated synthetic data: classifying objects from ModelNet40 and\nsegmenting parts of the ShapeNet shapes. Thus, our results reveal the practical\nvalue of steerable 3D spherical neurons for learning in 3D Euclidean space. The\ncode is available at https://github.com/pavlo-melnyk/tetrasphere.\n","authors":["Pavlo Melnyk","Andreas Robinson","Michael Felsberg","Mårten Wadenbäck"],"pdf_url":"https://arxiv.org/pdf/2211.14456v6.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16994v1","updated":"2024-03-25T17:56:41Z","published":"2024-03-25T17:56:41Z","title":"Mapping Image Transformations Onto Pixel Processor Arrays","summary":" Pixel Processor Arrays (PPA) present a new vision sensor/processor\narchitecture consisting of a SIMD array of processor elements, each capable of\nlight capture, storage, processing and local communication. Such a device\nallows visual data to be efficiently stored and manipulated directly upon the\nfocal plane, but also demands the invention of new approaches and algorithms,\nsuitable for the massively-parallel fine-grain processor arrays. In this paper\nwe demonstrate how various image transformations, including shearing, rotation\nand scaling, can be performed directly upon a PPA. The implementation details\nare presented using the SCAMP-5 vision chip, that contains a 256x256\npixel-parallel array. Our approaches for performing the image transformations\nefficiently exploit the parallel computation in a cellular processor array,\nminimizing the number of SIMD instructions required. These fundamental image\ntransformations are vital building blocks for many visual tasks. 
This paper\naims to serve as a reference for future PPA research while demonstrating the\nflexibility of PPA architectures.\n","authors":["Laurie Bose","Piotr Dudek"],"pdf_url":"https://arxiv.org/pdf/2403.16994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16993v1","updated":"2024-03-25T17:55:52Z","published":"2024-03-25T17:55:52Z","title":"Comp4D: LLM-Guided Compositional 4D Scene Generation","summary":" Recent advancements in diffusion models for 2D and 3D content creation have\nsparked a surge of interest in generating 4D content. However, the scarcity of\n3D scene datasets constrains current methodologies to primarily object-centric\ngeneration. To overcome this limitation, we present Comp4D, a novel framework\nfor Compositional 4D Generation. Unlike conventional methods that generate a\nsingular 4D representation of the entire scene, Comp4D innovatively constructs\neach 4D object within the scene separately. Utilizing Large Language Models\n(LLMs), the framework begins by decomposing an input text prompt into distinct\nentities and maps out their trajectories. It then constructs the compositional\n4D scene by accurately positioning these objects along their designated paths.\nTo refine the scene, our method employs a compositional score distillation\ntechnique guided by the pre-defined trajectories, utilizing pre-trained\ndiffusion models across text-to-image, text-to-video, and text-to-3D domains.\nExtensive experiments demonstrate our outstanding 4D content creation\ncapability compared to prior arts, showcasing superior visual quality, motion\nfidelity, and enhanced object interactions.\n","authors":["Dejia Xu","Hanwen Liang","Neel P. Bhatt","Hezhen Hu","Hanxue Liang","Konstantinos N. Plataniotis","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16993v1.pdf","comment":"Project page: https://vita-group.github.io/Comp4D/"},{"id":"http://arxiv.org/abs/2403.16990v1","updated":"2024-03-25T17:52:07Z","published":"2024-03-25T17:52:07Z","title":"Be Yourself: Bounded Attention for Multi-Subject Text-to-Image\n Generation","summary":" Text-to-image diffusion models have an unprecedented ability to generate\ndiverse and high-quality images. However, they often struggle to faithfully\ncapture the intended semantics of complex input prompts that include multiple\nsubjects. Recently, numerous layout-to-image extensions have been introduced to\nimprove user control, aiming to localize subjects represented by specific\ntokens. Yet, these methods often produce semantically inaccurate images,\nespecially when dealing with multiple semantically or visually similar\nsubjects. In this work, we study and analyze the causes of these limitations.\nOur exploration reveals that the primary issue stems from inadvertent semantic\nleakage between subjects in the denoising process. This leakage is attributed\nto the diffusion model's attention layers, which tend to blend the visual\nfeatures of different subjects. To address these issues, we introduce Bounded\nAttention, a training-free method for bounding the information flow in the\nsampling process. Bounded Attention prevents detrimental leakage among subjects\nand enables guiding the generation to promote each subject's individuality,\neven with complex multi-subject conditioning. 
Through extensive\nexperimentation, we demonstrate that our method empowers the generation of\nmultiple subjects that better align with given prompts and layouts.\n","authors":["Omer Dahary","Or Patashnik","Kfir Aberman","Daniel Cohen-Or"],"pdf_url":"https://arxiv.org/pdf/2403.16990v1.pdf","comment":"Project page: https://omer11a.github.io/bounded-attention/"},{"id":"http://arxiv.org/abs/2311.15773v3","updated":"2024-03-25T17:41:23Z","published":"2023-11-27T12:48:33Z","title":"Check, Locate, Rectify: A Training-Free Layout Calibration System for\n Text-to-Image Generation","summary":" Diffusion models have recently achieved remarkable progress in generating\nrealistic images. However, challenges remain in accurately understanding and\nsynthesizing the layout requirements in the textual prompts. To align the\ngenerated image with layout instructions, we present a training-free layout\ncalibration system SimM that intervenes in the generative process on the fly\nduring inference time. Specifically, following a \"check-locate-rectify\"\npipeline, the system first analyses the prompt to generate the target layout\nand compares it with the intermediate outputs to automatically detect errors.\nThen, by moving the located activations and making intra- and inter-map\nadjustments, the rectification process can be performed with negligible\ncomputational overhead. To evaluate SimM over a range of layout requirements,\nwe present a benchmark SimMBench that compensates for the lack of superlative\nspatial relations in existing datasets. And both quantitative and qualitative\nresults demonstrate the effectiveness of the proposed SimM in calibrating the\nlayout inconsistencies. Our project page is at https://simm-t2i.github.io/SimM.\n","authors":["Biao Gong","Siteng Huang","Yutong Feng","Shiwei Zhang","Yuyuan Li","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15773v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16974v1","updated":"2024-03-25T17:40:32Z","published":"2024-03-25T17:40:32Z","title":"Self-STORM: Deep Unrolled Self-Supervised Learning for Super-Resolution\n Microscopy","summary":" The use of fluorescent molecules to create long sequences of low-density,\ndiffraction-limited images enables highly-precise molecule localization.\nHowever, this methodology requires lengthy imaging times, which limits the\nability to view dynamic interactions of live cells on short time scales. Many\ntechniques have been developed to reduce the number of frames needed for\nlocalization, from classic iterative optimization to deep neural networks.\nParticularly, deep algorithm unrolling utilizes both the structure of iterative\nsparse recovery algorithms and the performance gains of supervised deep\nlearning. However, the robustness of this approach is highly dependant on\nhaving sufficient training data. In this paper we introduce deep unrolled\nself-supervised learning, which alleviates the need for such data by training a\nsequence-specific, model-based autoencoder that learns only from given\nmeasurements. Our proposed method exceeds the performance of its supervised\ncounterparts, thus allowing for robust, dynamic imaging well below the\ndiffraction limit without any labeled training samples. Furthermore, the\nsuggested model-based autoencoder scheme can be utilized to enhance\ngeneralization in any sparse recovery framework, without the need for external\ntraining data.\n","authors":["Yair Ben Sahel","Yonina C. 
Eldar"],"pdf_url":"https://arxiv.org/pdf/2403.16974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12027v2","updated":"2024-03-25T17:39:10Z","published":"2024-03-18T17:57:09Z","title":"From Pixels to Insights: A Survey on Automatic Chart Understanding in\n the Era of Large Foundation Models","summary":" Data visualization in the form of charts plays a pivotal role in data\nanalysis, offering critical insights and aiding in informed decision-making.\nAutomatic chart understanding has witnessed significant advancements with the\nrise of large foundation models in recent years. Foundation models, such as\nlarge language models, have revolutionized various natural language processing\ntasks and are increasingly being applied to chart understanding tasks. This\nsurvey paper provides a comprehensive overview of the recent developments,\nchallenges, and future directions in chart understanding within the context of\nthese foundation models. We review fundamental building blocks crucial for\nstudying chart understanding tasks. Additionally, we explore various tasks and\ntheir evaluation metrics and sources of both charts and textual inputs. Various\nmodeling strategies are then examined, encompassing both classification-based\nand generation-based approaches, along with tool augmentation techniques that\nenhance chart understanding performance. Furthermore, we discuss the\nstate-of-the-art performance of each task and discuss how we can improve the\nperformance. Challenges and future directions are addressed, highlighting the\nimportance of several topics, such as domain-specific charts, lack of efforts\nin developing evaluation metrics, and agent-oriented settings. This survey\npaper serves as a comprehensive resource for researchers and practitioners in\nthe fields of natural language processing, computer vision, and data analysis,\nproviding valuable insights and directions for future research in chart\nunderstanding leveraging large foundation models. The studies mentioned in this\npaper, along with emerging new research, will be continually updated at:\nhttps://github.com/khuangaf/Awesome-Chart-Understanding.\n","authors":["Kung-Hsiang Huang","Hou Pong Chan","Yi R. Fung","Haoyi Qiu","Mingyang Zhou","Shafiq Joty","Shih-Fu Chang","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2403.12027v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16970v1","updated":"2024-03-25T17:31:12Z","published":"2024-03-25T17:31:12Z","title":"Joint chest X-ray diagnosis and clinical visual attention prediction\n with multi-stage cooperative learning: enhancing interpretability","summary":" As deep learning has become the state-of-the-art for computer-assisted\ndiagnosis, interpretability of the automatic decisions is crucial for clinical\ndeployment. While various methods were proposed in this domain, visual\nattention maps of clinicians during radiological screening offer a unique asset\nto provide important insights and can potentially enhance the quality of\ncomputer-assisted diagnosis. With this paper, we introduce a novel\ndeep-learning framework for joint disease diagnosis and prediction of\ncorresponding visual saliency maps for chest X-ray scans. Specifically, we\ndesigned a novel dual-encoder multi-task UNet, which leverages both a\nDenseNet201 backbone and a Residual and Squeeze-and-Excitation block-based\nencoder to extract diverse features for saliency map prediction, and a\nmulti-scale feature-fusion classifier to perform disease classification. 
To\ntackle the issue of asynchronous training schedules of individual tasks in\nmulti-task learning, we proposed a multi-stage cooperative learning strategy,\nwith contrastive learning for feature encoder pretraining to boost performance.\nExperiments show that our proposed method outperformed existing techniques for\nchest X-ray diagnosis and the quality of visual saliency map prediction.\n","authors":["Zirui Qiu","Hassan Rivaz","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.16970v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16967v1","updated":"2024-03-25T17:26:08Z","published":"2024-03-25T17:26:08Z","title":"Visual Whole-Body Control for Legged Loco-Manipulation","summary":" We study the problem of mobile manipulation using legged robots equipped with\nan arm, namely legged loco-manipulation. The robot legs, while usually utilized\nfor mobility, offer an opportunity to amplify the manipulation capabilities by\nconducting whole-body control. That is, the robot can control the legs and the\narm at the same time to extend its workspace. We propose a framework that can\nconduct the whole-body control autonomously with visual observations. Our\napproach, namely \\ourFull~(\\our), is composed of a low-level policy using all\ndegrees of freedom to track the end-effector manipulator position and a\nhigh-level policy proposing the end-effector position based on visual inputs.\nWe train both levels of policies in simulation and perform Sim2Real transfer\nfor real robot deployment. We perform extensive experiments and show\nsignificant improvements over baselines in picking up diverse objects in\ndifferent configurations (heights, locations, orientations) and environments.\nProject page: https://wholebody-b1.github.io\n","authors":["Minghuan Liu","Zixuan Chen","Xuxin Cheng","Yandong Ji","Ruihan Yang","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16967v1.pdf","comment":"The first two authors contribute equally. Project page:\n https://wholebody-b1.github.io"},{"id":"http://arxiv.org/abs/2403.16964v1","updated":"2024-03-25T17:22:11Z","published":"2024-03-25T17:22:11Z","title":"GSDF: 3DGS Meets SDF for Improved Rendering and Reconstruction","summary":" Presenting a 3D scene from multiview images remains a core and long-standing\nchallenge in computer vision and computer graphics. Two main requirements lie\nin rendering and reconstruction. Notably, SOTA rendering quality is usually\nachieved with neural volumetric rendering techniques, which rely on aggregated\npoint/primitive-wise color and neglect the underlying scene geometry. Learning\nof neural implicit surfaces is sparked from the success of neural rendering.\nCurrent works either constrain the distribution of density fields or the shape\nof primitives, resulting in degraded rendering quality and flaws on the learned\nscene surfaces. The efficacy of such methods is limited by the inherent\nconstraints of the chosen neural representation, which struggles to capture\nfine surface details, especially for larger, more intricate scenes. To address\nthese issues, we introduce GSDF, a novel dual-branch architecture that combines\nthe benefits of a flexible and efficient 3D Gaussian Splatting (3DGS)\nrepresentation with neural Signed Distance Fields (SDF). The core idea is to\nleverage and enhance the strengths of each branch while alleviating their\nlimitation through mutual guidance and joint supervision. 
We show on diverse\nscenes that our design unlocks the potential for more accurate and detailed\nsurface reconstructions, and at the meantime benefits 3DGS rendering with\nstructures that are more aligned with the underlying geometry.\n","authors":["Mulin Yu","Tao Lu","Linning Xu","Lihan Jiang","Yuanbo Xiangli","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2403.16964v1.pdf","comment":"Project page: https://city-super.github.io/GSDF"},{"id":"http://arxiv.org/abs/2403.16958v1","updated":"2024-03-25T17:17:45Z","published":"2024-03-25T17:17:45Z","title":"TwinLiteNetPlus: A Stronger Model for Real-time Drivable Area and Lane\n Segmentation","summary":" Semantic segmentation is crucial for autonomous driving, particularly for\nDrivable Area and Lane Segmentation, ensuring safety and navigation. To address\nthe high computational costs of current state-of-the-art (SOTA) models, this\npaper introduces TwinLiteNetPlus (TwinLiteNet$^+$), a model adept at balancing\nefficiency and accuracy. TwinLiteNet$^+$ incorporates standard and depth-wise\nseparable dilated convolutions, reducing complexity while maintaining high\naccuracy. It is available in four configurations, from the robust 1.94\nmillion-parameter TwinLiteNet$^+_{\\text{Large}}$ to the ultra-compact\n34K-parameter TwinLiteNet$^+_{\\text{Nano}}$. Notably,\nTwinLiteNet$^+_{\\text{Large}}$ attains a 92.9\\% mIoU for Drivable Area\nSegmentation and a 34.2\\% IoU for Lane Segmentation. These results notably\noutperform those of current SOTA models while requiring a computational cost\nthat is approximately 11 times lower in terms of Floating Point Operations\n(FLOPs) compared to the existing SOTA model. Extensively tested on various\nembedded devices, TwinLiteNet$^+$ demonstrates promising latency and power\nefficiency, underscoring its suitability for real-world autonomous vehicle\napplications.\n","authors":["Quang-Huy Che","Duc-Tri Le","Minh-Quan Pham","Vinh-Tiep Nguyen","Duc-Khai Lam"],"pdf_url":"https://arxiv.org/pdf/2403.16958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15841v3","updated":"2024-03-25T17:17:31Z","published":"2023-11-27T14:07:13Z","title":"Learning Disentangled Identifiers for Action-Customized Text-to-Image\n Generation","summary":" This study focuses on a novel task in text-to-image (T2I) generation, namely\naction customization. The objective of this task is to learn the co-existing\naction from limited data and generalize it to unseen humans or even animals.\nExperimental results show that existing subject-driven customization methods\nfail to learn the representative characteristics of actions and struggle in\ndecoupling actions from context features, including appearance. To overcome the\npreference for low-level features and the entanglement of high-level features,\nwe propose an inversion-based method Action-Disentangled Identifier (ADI) to\nlearn action-specific identifiers from the exemplar images. ADI first expands\nthe semantic conditioning space by introducing layer-wise identifier tokens,\nthereby increasing the representational richness while distributing the\ninversion across different features. Then, to block the inversion of\naction-agnostic features, ADI extracts the gradient invariance from the\nconstructed sample triples and masks the updates of irrelevant channels. To\ncomprehensively evaluate the task, we present an ActionBench that includes a\nvariety of actions, each accompanied by meticulously selected samples. 
Both\nquantitative and qualitative results show that our ADI outperforms existing\nbaselines in action-customized T2I generation. Our project page is at\nhttps://adi-t2i.github.io/ADI.\n","authors":["Siteng Huang","Biao Gong","Yutong Feng","Xi Chen","Yuqian Fu","Yu Liu","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15841v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16954v1","updated":"2024-03-25T17:16:27Z","published":"2024-03-25T17:16:27Z","title":"Isolated Diffusion: Optimizing Multi-Concept Text-to-Image Generation\n Training-Freely with Isolated Diffusion Guidance","summary":" Large-scale text-to-image diffusion models have achieved great success in\nsynthesizing high-quality and diverse images given target text prompts. Despite\nthe revolutionary image generation ability, current state-of-the-art models\nstill struggle to deal with multi-concept generation accurately in many cases.\nThis phenomenon is known as ``concept bleeding\" and displays as the unexpected\noverlapping or merging of various concepts. This paper presents a general\napproach for text-to-image diffusion models to address the mutual interference\nbetween different subjects and their attachments in complex scenes, pursuing\nbetter text-image consistency. The core idea is to isolate the synthesizing\nprocesses of different concepts. We propose to bind each attachment to\ncorresponding subjects separately with split text prompts. Besides, we\nintroduce a revision method to fix the concept bleeding problem in\nmulti-subject synthesis. We first depend on pre-trained object detection and\nsegmentation models to obtain the layouts of subjects. Then we isolate and\nresynthesize each subject individually with corresponding text prompts to avoid\nmutual interference. Overall, we achieve a training-free strategy, named\nIsolated Diffusion, to optimize multi-concept text-to-image synthesis. It is\ncompatible with the latest Stable Diffusion XL (SDXL) and prior Stable\nDiffusion (SD) models. We compare our approach with alternative methods using a\nvariety of multi-concept text prompts and demonstrate its effectiveness with\nclear advantages in text-image consistency and user study.\n","authors":["Jingyuan Zhu","Huimin Ma","Jiansheng Chen","Jian Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.16954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16937v1","updated":"2024-03-25T17:01:34Z","published":"2024-03-25T17:01:34Z","title":"Hyperspherical Classification with Dynamic Label-to-Prototype Assignment","summary":" Aiming to enhance the utilization of metric space by the parametric softmax\nclassifier, recent studies suggest replacing it with a non-parametric\nalternative. Although a non-parametric classifier may provide better metric\nspace utilization, it introduces the challenge of capturing inter-class\nrelationships. A shared characteristic among prior non-parametric classifiers\nis the static assignment of labels to prototypes during the training, ie, each\nprototype consistently represents a class throughout the training course.\nOrthogonal to previous works, we present a simple yet effective method to\noptimize the category assigned to each prototype (label-to-prototype\nassignment) during the training. To this aim, we formalize the problem as a\ntwo-step optimization objective over network parameters and label-to-prototype\nassignment mapping. We solve this optimization using a sequential combination\nof gradient descent and Bipartide matching. 
We demonstrate the benefits of the\nproposed approach by conducting experiments on balanced and long-tail\nclassification problems using different backbone network architectures. In\nparticular, our method outperforms its competitors by 1.22\\% accuracy on\nCIFAR-100, and 2.15\\% on ImageNet-200 using a metric space dimension half of\nthe size of its competitors. Code:\nhttps://github.com/msed-Ebrahimi/DL2PA_CVPR24\n","authors":["Mohammad Saeed Ebrahimi Saadabadi","Ali Dabouei","Sahar Rahimi Malakshan","Nasser M. Nasrabad"],"pdf_url":"https://arxiv.org/pdf/2403.16937v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2401.08399v2","updated":"2024-03-25T16:50:43Z","published":"2024-01-16T14:41:42Z","title":"TACO: Benchmarking Generalizable Bimanual Tool-ACtion-Object\n Understanding","summary":" Humans commonly work with multiple objects in daily life and can intuitively\ntransfer manipulation skills to novel objects by understanding object\nfunctional regularities. However, existing technical approaches for analyzing\nand synthesizing hand-object manipulation are mostly limited to handling a\nsingle hand and object due to the lack of data support. To address this, we\nconstruct TACO, an extensive bimanual hand-object-interaction dataset spanning\na large variety of tool-action-object compositions for daily human activities.\nTACO contains 2.5K motion sequences paired with third-person and egocentric\nviews, precise hand-object 3D meshes, and action labels. To rapidly expand the\ndata scale, we present a fully automatic data acquisition pipeline combining\nmulti-view sensing with an optical motion capture system. With the vast\nresearch fields provided by TACO, we benchmark three generalizable\nhand-object-interaction tasks: compositional action recognition, generalizable\nhand-object motion forecasting, and cooperative grasp synthesis. Extensive\nexperiments reveal new insights, challenges, and opportunities for advancing\nthe studies of generalizable hand-object motion analysis and synthesis. Our\ndata and code are available at https://taco2024.github.io.\n","authors":["Yun Liu","Haolin Yang","Xu Si","Ling Liu","Zipeng Li","Yuxiang Zhang","Yebin Liu","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2401.08399v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16921v1","updated":"2024-03-25T16:39:15Z","published":"2024-03-25T16:39:15Z","title":"PropTest: Automatic Property Testing for Improved Visual Programming","summary":" Visual Programming has emerged as an alternative to end-to-end black-box\nvisual reasoning models. This type of methods leverage Large Language Models\n(LLMs) to decompose a problem and generate the source code for an executable\ncomputer program. This strategy has the advantage of offering an interpretable\nreasoning path and does not require finetuning a model with task-specific data.\nWe propose PropTest, a general strategy that improves visual programming by\nfurther using an LLM to generate code that tests for visual properties in an\ninitial round of proposed solutions. Particularly, our method tests for\ndata-type consistency, as well as syntactic and semantic properties in the\ngenerated solutions. Our proposed solution outperforms baselines and achieves\ncomparable results to state-of-the-art methods while using smaller and publicly\navailable LLMs (CodeLlama-7B and WizardCoder-15B). 
This is demonstrated across\ndifferent benchmarks on visual question answering and referring expression\ncomprehension, showing the efficacy of our approach in enhancing the\nperformance and generalization of visual reasoning tasks. Specifically,\nPropTest improves ViperGPT by obtaining 48.66% accuracy (+8.3%) on the A-OKVQA\nbenchmark and 52.8% (+3.3%) on the RefCOCO+ benchmark using CodeLlama-7B.\n","authors":["Jaywon Koo","Ziyan Yang","Paola Cascante-Bonilla","Baishakhi Ray","Vicente Ordonez"],"pdf_url":"https://arxiv.org/pdf/2403.16921v1.pdf","comment":"Project Page: https://jaywonkoo17.github.io/PropTest/"},{"id":"http://arxiv.org/abs/2311.11138v2","updated":"2024-03-25T16:10:20Z","published":"2023-11-18T18:18:33Z","title":"Estimating Uncertainty in Landslide Segmentation Models","summary":" Landslides are a recurring, widespread hazard. Preparation and mitigation\nefforts can be aided by a high-quality, large-scale dataset that covers global\nat-risk areas. Such a dataset currently does not exist and is impossible to\nconstruct manually. Recent automated efforts focus on deep learning models for\nlandslide segmentation (pixel labeling) from satellite imagery. However, it is\nalso important to characterize the uncertainty or confidence levels of such\nsegmentations. Accurate and robust uncertainty estimates can enable low-cost\n(in terms of manual labor) oversight of auto-generated landslide databases to\nresolve errors, identify hard negative examples, and increase the size of\nlabeled training data. In this paper, we evaluate several methods for assessing\npixel-level uncertainty of the segmentation. Three methods that do not require\narchitectural changes were compared, including Pre-Threshold activations,\nMonte-Carlo Dropout and Test-Time Augmentation -- a method that measures the\nrobustness of predictions in the face of data augmentation. Experimentally, the\nquality of the latter method was consistently higher than the others across a\nvariety of models and metrics in our dataset.\n","authors":["Savinay Nagendra","Chaopeng Shen","Daniel Kifer"],"pdf_url":"https://arxiv.org/pdf/2311.11138v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16897v1","updated":"2024-03-25T16:08:04Z","published":"2024-03-25T16:08:04Z","title":"Make-It-Vivid: Dressing Your Animatable Biped Cartoon Characters from\n Text","summary":" Creating and animating 3D biped cartoon characters is crucial and valuable in\nvarious applications. Compared with geometry, the diverse texture design plays\nan important role in making 3D biped cartoon characters vivid and charming.\nTherefore, we focus on automatic texture design for cartoon characters based on\ninput instructions. This is challenging for domain-specific requirements and a\nlack of high-quality data. To address this challenge, we propose Make-It-Vivid,\nthe first attempt to enable high-quality texture generation from text in UV\nspace. We prepare a detailed text-texture paired data for 3D characters by\nusing vision-question-answering agents. Then we customize a pretrained\ntext-to-image model to generate texture map with template structure while\npreserving the natural 2D image knowledge. Furthermore, to enhance fine-grained\ndetails, we propose a novel adversarial learning scheme to shorten the domain\ngap between original dataset and realistic texture domain. Extensive\nexperiments show that our approach outperforms current texture generation\nmethods, resulting in efficient character texturing and faithful generation\nwith prompts. 
Besides, we showcase various applications such as out of domain\ngeneration and texture stylization. We also provide an efficient generation\nsystem for automatic text-guided textured character generation and animation.\n","authors":["Junshu Tang","Yanhong Zeng","Ke Fan","Xuheng Wang","Bo Dai","Kai Chen","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2403.16897v1.pdf","comment":"Project page: https://make-it-vivid.github.io/"},{"id":"http://arxiv.org/abs/2312.10035v2","updated":"2024-03-25T16:00:01Z","published":"2023-12-15T18:59:59Z","title":"Point Transformer V3: Simpler, Faster, Stronger","summary":" This paper is not motivated to seek innovation within the attention\nmechanism. Instead, it focuses on overcoming the existing trade-offs between\naccuracy and efficiency within the context of point cloud processing,\nleveraging the power of scale. Drawing inspiration from recent advances in 3D\nlarge-scale representation learning, we recognize that model performance is\nmore influenced by scale than by intricate design. Therefore, we present Point\nTransformer V3 (PTv3), which prioritizes simplicity and efficiency over the\naccuracy of certain mechanisms that are minor to the overall performance after\nscaling, such as replacing the precise neighbor search by KNN with an efficient\nserialized neighbor mapping of point clouds organized with specific patterns.\nThis principle enables significant scaling, expanding the receptive field from\n16 to 1024 points while remaining efficient (a 3x increase in processing speed\nand a 10x improvement in memory efficiency compared with its predecessor,\nPTv2). PTv3 attains state-of-the-art results on over 20 downstream tasks that\nspan both indoor and outdoor scenarios. Further enhanced with multi-dataset\njoint training, PTv3 pushes these results to a higher level.\n","authors":["Xiaoyang Wu","Li Jiang","Peng-Shuai Wang","Zhijian Liu","Xihui Liu","Yu Qiao","Wanli Ouyang","Tong He","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.10035v2.pdf","comment":"CVPR 2024, code available at Pointcept\n (https://github.com/Pointcept/PointTransformerV3)"},{"id":"http://arxiv.org/abs/2403.17042v1","updated":"2024-03-25T15:58:26Z","published":"2024-03-25T15:58:26Z","title":"Provably Robust Score-Based Diffusion Posterior Sampling for\n Plug-and-Play Image Reconstruction","summary":" In a great number of tasks in science and engineering, the goal is to infer\nan unknown image from a small number of measurements collected from a known\nforward model describing certain sensing or imaging modality. Due to resource\nconstraints, this task is often extremely ill-posed, which necessitates the\nadoption of expressive prior information to regularize the solution space.\nScore-based diffusion models, due to its impressive empirical success, have\nemerged as an appealing candidate of an expressive prior in image\nreconstruction. In order to accommodate diverse tasks at once, it is of great\ninterest to develop efficient, consistent and robust algorithms that\nincorporate {\\em unconditional} score functions of an image prior distribution\nin conjunction with flexible choices of forward models.\n This work develops an algorithmic framework for employing score-based\ndiffusion models as an expressive data prior in general nonlinear inverse\nproblems. 
Motivated by the plug-and-play framework in the imaging community, we\nintroduce a diffusion plug-and-play method (\\textsf{DPnP}) that alternatively\ncalls two samplers, a proximal consistency sampler based solely on the\nlikelihood function of the forward model, and a denoising diffusion sampler\nbased solely on the score functions of the image prior. The key insight is that\ndenoising under white Gaussian noise can be solved {\\em rigorously} via both\nstochastic (i.e., DDPM-type) and deterministic (i.e., DDIM-type) samplers using\nthe unconditional score functions. We establish both asymptotic and\nnon-asymptotic performance guarantees of \\textsf{DPnP}, and provide numerical\nexperiments to illustrate its promise in solving both linear and nonlinear\nimage reconstruction tasks. To the best of our knowledge, \\textsf{DPnP} is the\nfirst provably-robust posterior sampling method for nonlinear inverse problems\nusing unconditional diffusion priors.\n","authors":["Xingyu Xu","Yuejie Chi"],"pdf_url":"https://arxiv.org/pdf/2403.17042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16888v1","updated":"2024-03-25T15:56:51Z","published":"2024-03-25T15:56:51Z","title":"Towards Balanced RGB-TSDF Fusion for Consistent Semantic Scene\n Completion by 3D RGB Feature Completion and a Classwise Entropy Loss Function","summary":" Semantic Scene Completion (SSC) aims to jointly infer semantics and\noccupancies of 3D scenes. Truncated Signed Distance Function (TSDF), a 3D\nencoding of depth, has been a common input for SSC. Furthermore, RGB-TSDF\nfusion, seems promising since these two modalities provide color and geometry\ninformation, respectively. Nevertheless, RGB-TSDF fusion has been considered\nnontrivial and commonly-used naive addition will result in inconsistent\nresults. We argue that the inconsistency comes from the sparsity of RGB\nfeatures upon projecting into 3D space, while TSDF features are dense, leading\nto imbalanced feature maps when summed up. To address this RGB-TSDF\ndistribution difference, we propose a two-stage network with a 3D RGB feature\ncompletion module that completes RGB features with meaningful values for\noccluded areas. Moreover, we propose an effective classwise entropy loss\nfunction to punish inconsistency. Extensive experiments on public datasets\nverify that our method achieves state-of-the-art performance among methods that\ndo not adopt extra data.\n","authors":["Laiyan Ding","Panwen Hu","Jie Li","Rui Huang"],"pdf_url":"https://arxiv.org/pdf/2403.16888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16885v1","updated":"2024-03-25T15:56:17Z","published":"2024-03-25T15:56:17Z","title":"CVT-xRF: Contrastive In-Voxel Transformer for 3D Consistent Radiance\n Fields from Sparse Inputs","summary":" Neural Radiance Fields (NeRF) have shown impressive capabilities for\nphotorealistic novel view synthesis when trained on dense inputs. However, when\ntrained on sparse inputs, NeRF typically encounters issues of incorrect density\nor color predictions, mainly due to insufficient coverage of the scene causing\npartial and sparse supervision, thus leading to significant performance\ndegradation. While existing works mainly consider ray-level consistency to\nconstruct 2D learning regularization based on rendered color, depth, or\nsemantics on image planes, in this paper we propose a novel approach that\nmodels 3D spatial field consistency to improve NeRF's performance with sparse\ninputs. 
Specifically, we first adopt a voxel-based ray sampling strategy to\nensure that the sampled rays intersect with a certain voxel in 3D space. We\nthen randomly sample additional points within the voxel and apply a Transformer\nto infer the properties of other points on each ray, which are then\nincorporated into the volume rendering. By backpropagating through the\nrendering loss, we enhance the consistency among neighboring points.\nAdditionally, we propose to use a contrastive loss on the encoder output of the\nTransformer to further improve consistency within each voxel. Experiments\ndemonstrate that our method yields significant improvement over different\nradiance fields in the sparse inputs setting, and achieves comparable\nperformance with current works.\n","authors":["Yingji Zhong","Lanqing Hong","Zhenguo Li","Dan Xu"],"pdf_url":"https://arxiv.org/pdf/2403.16885v1.pdf","comment":"The paper is accepted by CVPR 2024. Project page is available at\n https://zhongyingji.github.io/CVT-xRF"},{"id":"http://arxiv.org/abs/2403.16862v1","updated":"2024-03-25T15:26:32Z","published":"2024-03-25T15:26:32Z","title":"INPC: Implicit Neural Point Clouds for Radiance Field Rendering","summary":" We introduce a new approach for reconstruction and novel-view synthesis of\nunbounded real-world scenes. In contrast to previous methods using either\nvolumetric fields, grid-based models, or discrete point cloud proxies, we\npropose a hybrid scene representation, which implicitly encodes a point cloud\nin a continuous octree-based probability field and a multi-resolution hash\ngrid. In doing so, we combine the benefits of both worlds by retaining\nfavorable behavior during optimization: Our novel implicit point cloud\nrepresentation and differentiable bilinear rasterizer enable fast rendering\nwhile preserving fine geometric detail without depending on initial priors like\nstructure-from-motion point clouds. Our method achieves state-of-the-art image\nquality on several common benchmark datasets. Furthermore, we achieve fast\ninference at interactive frame rates, and can extract explicit point clouds to\nfurther enhance performance.\n","authors":["Florian Hahlbohm","Linus Franke","Moritz Kappel","Susana Castillo","Marc Stamminger","Marcus Magnor"],"pdf_url":"https://arxiv.org/pdf/2403.16862v1.pdf","comment":"Project page: https://fhahlbohm.github.io/inpc/"},{"id":"http://arxiv.org/abs/2304.11959v2","updated":"2024-03-25T15:15:41Z","published":"2023-04-24T09:53:21Z","title":"A Forward and Backward Compatible Framework for Few-shot\n Class-incremental Pill Recognition","summary":" Automatic Pill Recognition (APR) systems are crucial for enhancing hospital\nefficiency, assisting visually impaired individuals, and preventing\ncross-infection. However, most existing deep learning-based pill recognition\nsystems can only perform classification on classes with sufficient training\ndata. In practice, the high cost of data annotation and the continuous increase\nin new pill classes necessitate the development of a few-shot class-incremental\npill recognition system. This paper introduces the first few-shot\nclass-incremental pill recognition framework, named Discriminative and\nBidirectional Compatible Few-Shot Class-Incremental Learning (DBC-FSCIL). It\nencompasses forward-compatible and backward-compatible learning components. In\nforward-compatible learning, we propose an innovative virtual class synthesis\nstrategy and a Center-Triplet (CT) loss to enhance discriminative feature\nlearning. 
These virtual classes serve as placeholders in the feature space for\nfuture class updates, providing diverse semantic knowledge for model training.\nFor backward-compatible learning, we develop a strategy to synthesize reliable\npseudo-features of old classes using uncertainty quantification, facilitating\nData Replay (DR) and Knowledge Distillation (KD). This approach allows for the\nflexible synthesis of features and effectively reduces additional storage\nrequirements for samples and models. Additionally, we construct a new pill\nimage dataset for FSCIL and assess various mainstream FSCIL methods,\nestablishing new benchmarks. Our experimental results demonstrate that our\nframework surpasses existing State-of-the-art (SOTA) methods. The code is\navailable at https://github.com/zhang-jinghua/DBC-FSCIL.\n","authors":["Jinghua Zhang","Li Liu","Kai Gao","Dewen Hu"],"pdf_url":"https://arxiv.org/pdf/2304.11959v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16848v1","updated":"2024-03-25T15:09:54Z","published":"2024-03-25T15:09:54Z","title":"Multiple Object Tracking as ID Prediction","summary":" In Multiple Object Tracking (MOT), tracking-by-detection methods have stood\nthe test for a long time, which split the process into two parts according to\nthe definition: object detection and association. They leverage robust\nsingle-frame detectors and treat object association as a post-processing step\nthrough hand-crafted heuristic algorithms and surrogate tasks. However, the\nnature of heuristic techniques prevents end-to-end exploitation of training\ndata, leading to increasingly cumbersome and challenging manual modification\nwhile facing complicated or novel scenarios. In this paper, we regard this\nobject association task as an End-to-End in-context ID prediction problem and\npropose a streamlined baseline called MOTIP. Specifically, we form the target\nembeddings into historical trajectory information while considering the\ncorresponding IDs as in-context prompts, then directly predict the ID labels\nfor the objects in the current frame. Thanks to this end-to-end process, MOTIP\ncan learn tracking capabilities straight from training data, freeing itself\nfrom burdensome hand-crafted algorithms. Without bells and whistles, our method\nachieves impressive state-of-the-art performance in complex scenarios like\nDanceTrack and SportsMOT, and it performs competitively with other\ntransformer-based methods on MOT17. We believe that MOTIP demonstrates\nremarkable potential and can serve as a starting point for future research. The\ncode is available at https://github.com/MCG-NJU/MOTIP.\n","authors":["Ruopeng Gao","Yijun Zhang","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16848v1.pdf","comment":"71.4 HOTA on DanceTrack (with CrowdHuman), 67.5/70.0 HOTA on\n DanceTrack built upon Deformable DETR and DAB-Deformable DETR respectively\n (without additional data). The code repository will be created within several\n days"},{"id":"http://arxiv.org/abs/2403.16834v1","updated":"2024-03-25T14:57:29Z","published":"2024-03-25T14:57:29Z","title":"From Two Stream to One Stream: Efficient RGB-T Tracking via Mutual\n Prompt Learning and Knowledge Distillation","summary":" Due to the complementary nature of visible light and thermal infrared\nmodalities, object tracking based on the fusion of visible light images and\nthermal images (referred to as RGB-T tracking) has received increasing\nattention from researchers in recent years. 
How to achieve more comprehensive\nfusion of information from the two modalities at a lower cost has been an issue\nthat researchers have been exploring. Inspired by visual prompt learning, we\ndesigned a novel two-stream RGB-T tracking architecture based on cross-modal\nmutual prompt learning, and used this model as a teacher to guide a one-stream\nstudent model for rapid learning through knowledge distillation techniques.\nExtensive experiments have shown that, compared to similar RGB-T trackers, our\ndesigned teacher model achieved the highest precision rate, while the student\nmodel, with comparable precision rate to the teacher model, realized an\ninference speed more than three times faster than the teacher model. (Codes will\nbe available if accepted.)\n","authors":["Yang Luo","Xiqing Guo","Hao Li"],"pdf_url":"https://arxiv.org/pdf/2403.16834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16831v1","updated":"2024-03-25T14:57:18Z","published":"2024-03-25T14:57:18Z","title":"UrbanVLP: A Multi-Granularity Vision-Language Pre-Trained Foundation\n Model for Urban Indicator Prediction","summary":" Urban indicator prediction aims to infer socio-economic metrics in diverse\nurban landscapes using data-driven methods. However, prevalent pre-trained\nmodels, particularly those reliant on satellite imagery, face dual challenges.\nFirstly, concentrating solely on macro-level patterns from satellite data may\nintroduce bias, lacking nuanced details at micro levels, such as architectural\ndetails at a place. Secondly, the lack of interpretability in pre-trained\nmodels limits their utility in providing transparent evidence for urban\nplanning. In response to these issues, we devise a novel Vision-Language\nPre-Trained Model (UrbanVLP) in this paper. Our UrbanVLP seamlessly integrates\nmulti-granularity information from both macro (satellite) and micro\n(street-view) levels, overcoming the limitations of prior pre-trained models.\nMoreover, it introduces automatic text generation and calibration, elevating\ninterpretability in downstream applications by producing high-quality text\ndescriptions of urban imagery. Rigorous experiments conducted across six\nsocio-economic tasks underscore UrbanVLP's superior performance. We also deploy\na web platform to verify its practicality.\n","authors":["Xixuan Hao","Wei Chen","Yibo Yan","Siru Zhong","Kun Wang","Qingsong Wen","Yuxuan Liang"],"pdf_url":"https://arxiv.org/pdf/2403.16831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15529v2","updated":"2024-03-25T14:52:44Z","published":"2023-11-27T04:22:48Z","title":"Efficient Dataset Distillation via Minimax Diffusion","summary":" Dataset distillation reduces the storage and computational consumption of\ntraining a network by generating a small surrogate dataset that encapsulates\nrich information of the original large-scale one. However, previous\ndistillation methods heavily rely on the sample-wise iterative optimization\nscheme. As the images-per-class (IPC) setting or image resolution grows larger,\nthe necessary computation will demand overwhelming time and resources. In this\nwork, we intend to incorporate generative diffusion techniques for computing\nthe surrogate dataset. Observing that key factors for constructing an effective\nsurrogate dataset are representativeness and diversity, we design additional\nminimax criteria in the generative training to enhance these facets for the\ngenerated images of diffusion models. 
We present a theoretical model of the\nprocess as hierarchical diffusion control demonstrating the flexibility of the\ndiffusion process to target these criteria without jeopardizing the\nfaithfulness of the sample to the desired distribution. The proposed method\nachieves state-of-the-art validation performance while demanding much less\ncomputational resources. Under the 100-IPC setting on ImageWoof, our method\nrequires less than one-twentieth the distillation time of previous methods, yet\nyields even better performance. Source code and generated data are available in\nhttps://github.com/vimar-gu/MinimaxDiffusion.\n","authors":["Jianyang Gu","Saeed Vahidian","Vyacheslav Kungurtsev","Haonan Wang","Wei Jiang","Yang You","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2311.15529v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2402.13848v2","updated":"2024-03-25T14:45:53Z","published":"2024-02-21T14:50:24Z","title":"Zero-BEV: Zero-shot Projection of Any First-Person Modality to BEV Maps","summary":" Bird's-eye view (BEV) maps are an important geometrically structured\nrepresentation widely used in robotics, in particular self-driving vehicles and\nterrestrial robots. Existing algorithms either require depth information for\nthe geometric projection, which is not always reliably available, or are\ntrained end-to-end in a fully supervised way to map visual first-person\nobservations to BEV representation, and are therefore restricted to the output\nmodality they have been trained for. In contrast, we propose a new model\ncapable of performing zero-shot projections of any modality available in a\nfirst person view to the corresponding BEV map. This is achieved by\ndisentangling the geometric inverse perspective projection from the modality\ntransformation, eg. RGB to occupancy. The method is general and we showcase\nexperiments projecting to BEV three different modalities: semantic\nsegmentation, motion vectors and object bounding boxes detected in first\nperson. We experimentally show that the model outperforms competing methods, in\nparticular the widely used baseline resorting to monocular depth estimation.\n","authors":["Gianluca Monaci","Leonid Antsfeld","Boris Chidlovskii","Christian Wolf"],"pdf_url":"https://arxiv.org/pdf/2402.13848v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09913v2","updated":"2024-03-25T14:09:09Z","published":"2023-12-15T16:23:42Z","title":"LAENeRF: Local Appearance Editing for Neural Radiance Fields","summary":" Due to the omnipresence of Neural Radiance Fields (NeRFs), the interest\ntowards editable implicit 3D representations has surged over the last years.\nHowever, editing implicit or hybrid representations as used for NeRFs is\ndifficult due to the entanglement of appearance and geometry encoded in the\nmodel parameters. Despite these challenges, recent research has shown first\npromising steps towards photorealistic and non-photorealistic appearance edits.\nThe main open issues of related work include limited interactivity, a lack of\nsupport for local edits and large memory requirements, rendering them less\nuseful in practice. We address these limitations with LAENeRF, a unified\nframework for photorealistic and non-photorealistic appearance editing of\nNeRFs. To tackle local editing, we leverage a voxel grid as starting point for\nregion selection. 
We learn a mapping from expected ray terminations to final\noutput color, which can optionally be supervised by a style loss, resulting in\na framework which can perform photorealistic and non-photorealistic appearance\nediting of selected regions. Relying on a single point per ray for our mapping,\nwe limit memory requirements and enable fast optimization. To guarantee\ninteractivity, we compose the output color using a set of learned, modifiable\nbase colors, composed with additive layer mixing. Compared to concurrent work,\nLAENeRF enables recoloring and stylization while keeping processing time low.\nFurthermore, we demonstrate that our approach surpasses baseline methods both\nquantitatively and qualitatively.\n","authors":["Lukas Radl","Michael Steiner","Andreas Kurz","Markus Steinberger"],"pdf_url":"https://arxiv.org/pdf/2312.09913v2.pdf","comment":"Accepted to CVPR 2024! Project website:\n https://r4dl.github.io/LAENeRF/"},{"id":"http://arxiv.org/abs/2307.04570v3","updated":"2024-03-25T13:31:33Z","published":"2023-07-10T14:02:31Z","title":"A Call to Reflect on Evaluation Practices for Age Estimation:\n Comparative Analysis of the State-of-the-Art and a Unified Benchmark","summary":" Comparing different age estimation methods poses a challenge due to the\nunreliability of published results stemming from inconsistencies in the\nbenchmarking process. Previous studies have reported continuous performance\nimprovements over the past decade using specialized methods; however, our\nfindings challenge these claims. This paper identifies two trivial, yet\npersistent issues with the currently used evaluation protocol and describes how\nto resolve them. We offer an extensive comparative analysis for\nstate-of-the-art facial age estimation methods. Surprisingly, we find that the\nperformance differences between the methods are negligible compared to the\neffect of other factors, such as facial alignment, facial coverage, image\nresolution, model architecture, or the amount of data used for pretraining. We\nuse the gained insights to propose using FaRL as the backbone model and\ndemonstrate its effectiveness on all public datasets. We make the source code\nand exact data splits public on GitHub.\n","authors":["Jakub Paplham","Vojtech Franc"],"pdf_url":"https://arxiv.org/pdf/2307.04570v3.pdf","comment":"CVPR 2024 Camera-Ready"},{"id":"http://arxiv.org/abs/2403.16707v1","updated":"2024-03-25T12:44:52Z","published":"2024-03-25T12:44:52Z","title":"One-Shot Domain Incremental Learning","summary":" Domain incremental learning (DIL) has been discussed in previous studies on\ndeep neural network models for classification. In DIL, we assume that samples\non new domains are observed over time. The models must classify inputs on all\ndomains. In practice, however, we may encounter a situation where we need to\nperform DIL under the constraint that the samples on the new domain are\nobserved only infrequently. Therefore, in this study, we consider the extreme\ncase where we have only one sample from the new domain, which we call one-shot\nDIL. We first empirically show that existing DIL methods do not work well in\none-shot DIL. We have analyzed the reason for this failure through various\ninvestigations. 
According to our analysis, we clarify that the difficulty of\none-shot DIL is caused by the statistics in the batch normalization layers.\nTherefore, we propose a technique regarding these statistics and demonstrate\nthe effectiveness of our technique through experiments on open datasets.\n","authors":["Yasushi Esaki","Satoshi Koide","Takuro Kutsuna"],"pdf_url":"https://arxiv.org/pdf/2403.16707v1.pdf","comment":"accepted at IEEE International Joint Conference on Neural Networks\n (IJCNN) 2024"},{"id":"http://arxiv.org/abs/2303.17245v4","updated":"2024-03-25T12:20:02Z","published":"2023-03-30T09:22:17Z","title":"Investigating and Mitigating the Side Effects of Noisy Views for\n Self-Supervised Clustering Algorithms in Practical Multi-View Scenarios","summary":" Multi-view clustering (MVC) aims at exploring category structures among\nmulti-view data in self-supervised manners. Multiple views provide more\ninformation than single views and thus existing MVC methods can achieve\nsatisfactory performance. However, their performance might seriously degenerate\nwhen the views are noisy in practical multi-view scenarios. In this paper, we\nformally investigate the drawback of noisy views and then propose a\ntheoretically grounded deep MVC method (namely MVCAN) to address this issue.\nSpecifically, we propose a novel MVC objective that enables un-shared\nparameters and inconsistent clustering predictions across multiple views to\nreduce the side effects of noisy views. Furthermore, a two-level multi-view\niterative optimization is designed to generate robust learning targets for\nrefining individual views' representation learning. Theoretical analysis\nreveals that MVCAN works by achieving the multi-view consistency,\ncomplementarity, and noise robustness. Finally, experiments on extensive public\ndatasets demonstrate that MVCAN outperforms state-of-the-art methods and is\nrobust against the existence of noisy views.\n","authors":["Jie Xu","Yazhou Ren","Xiaolong Wang","Lei Feng","Zheng Zhang","Gang Niu","Xiaofeng Zhu"],"pdf_url":"https://arxiv.org/pdf/2303.17245v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.14332v3","updated":"2024-03-25T09:53:30Z","published":"2022-05-28T04:45:07Z","title":"V4D: Voxel for 4D Novel View Synthesis","summary":" Neural radiance fields have made a remarkable breakthrough in the novel view\nsynthesis task at the 3D static scene. However, for the 4D circumstance (e.g.,\ndynamic scene), the performance of the existing method is still limited by the\ncapacity of the neural network, typically in a multilayer perceptron network\n(MLP). In this paper, we utilize 3D Voxel to model the 4D neural radiance\nfield, short as V4D, where the 3D voxel has two formats. The first one is to\nregularly model the 3D space and then use the sampled local 3D feature with the\ntime index to model the density field and the texture field by a tiny MLP. The\nsecond one is in look-up tables (LUTs) format that is for the pixel-level\nrefinement, where the pseudo-surface produced by the volume rendering is\nutilized as the guidance information to learn a 2D pixel-level refinement\nmapping. The proposed LUTs-based refinement module achieves the performance\ngain with little computational cost and could serve as the plug-and-play module\nin the novel view synthesis task. Moreover, we propose a more effective\nconditional positional encoding toward the 4D data that achieves performance\ngain with negligible computational burdens. 
Extensive experiments demonstrate\nthat the proposed method achieves state-of-the-art performance at a low\ncomputational cost.\n","authors":["Wanshui Gan","Hongbin Xu","Yi Huang","Shifeng Chen","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2205.14332v3.pdf","comment":"Code released. Accepted by IEEE TVCG 2023"},{"id":"http://arxiv.org/abs/2403.16469v1","updated":"2024-03-25T06:50:25Z","published":"2024-03-25T06:50:25Z","title":"Learning from Reduced Labels for Long-Tailed Data","summary":" Long-tailed data is prevalent in real-world classification tasks and heavily\nrelies on supervised information, which makes the annotation process\nexceptionally labor-intensive and time-consuming. Unfortunately, despite being\na common approach to mitigate labeling costs, existing weakly supervised\nlearning methods struggle to adequately preserve supervised information for\ntail samples, resulting in a decline in accuracy for the tail classes. To\nalleviate this problem, we introduce a novel weakly supervised labeling setting\ncalled Reduced Label. The proposed labeling setting not only avoids the decline\nof supervised information for the tail samples, but also decreases the labeling\ncosts associated with long-tailed data. Additionally, we propose an\nstraightforward and highly efficient unbiased framework with strong theoretical\nguarantees to learn from these Reduced Labels. Extensive experiments conducted\non benchmark datasets including ImageNet validate the effectiveness of our\napproach, surpassing the performance of state-of-the-art weakly supervised\nmethods.\n","authors":["Meng Wei","Zhongnian Li","Yong Zhou","Xinzheng Xu"],"pdf_url":"https://arxiv.org/pdf/2403.16469v1.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2403.17719v1","updated":"2024-03-25T05:21:26Z","published":"2024-03-25T05:21:26Z","title":"Resolution Limit of Single-Photon LiDAR","summary":" Single-photon Light Detection and Ranging (LiDAR) systems are often equipped\nwith an array of detectors for improved spatial resolution and sensing speed.\nHowever, given a fixed amount of flux produced by the laser transmitter across\nthe scene, the per-pixel Signal-to-Noise Ratio (SNR) will decrease when more\npixels are packed in a unit space. This presents a fundamental trade-off\nbetween the spatial resolution of the sensor array and the SNR received at each\npixel. Theoretical characterization of this fundamental limit is explored. By\nderiving the photon arrival statistics and introducing a series of new\napproximation techniques, the Mean Squared Error (MSE) of the\nmaximum-likelihood estimator of the time delay is derived. The theoretical\npredictions align well with simulations and real data.\n","authors":["Stanley H. Chan","Hashan K. Weerasooriya","Weijian Zhang","Pamela Abshire","Istvan Gyongy","Robert K. Henderson"],"pdf_url":"https://arxiv.org/pdf/2403.17719v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16374v1","updated":"2024-03-25T02:38:34Z","published":"2024-03-25T02:38:34Z","title":"ProIn: Learning to Predict Trajectory Based on Progressive Interactions\n for Autonomous Driving","summary":" Accurate motion prediction of pedestrians, cyclists, and other surrounding\nvehicles (all called agents) is very important for autonomous driving. Most\nexisting works capture map information through an one-stage interaction with\nmap by vector-based attention, to provide map constraints for social\ninteraction and multi-modal differentiation. 
However, these methods have to\nencode all required map rules into the focal agent's feature, so as to retain\nall possible intentions' paths while at the same time adapting to potential\nsocial interaction. In this work, a progressive interaction network is proposed\nto enable the agent's feature to progressively focus on relevant maps, in order\nto better learn agents' feature representation capturing the relevant map\nconstraints. The network progressively encodes the complex influence of map\nconstraints into the agent's feature through graph convolutions at the\nfollowing three stages: after the historical trajectory encoder, after social\ninteraction, and after multi-modal differentiation. In addition, a weight\nallocation mechanism is proposed for multi-modal training, so that each mode\ncan obtain learning opportunities from a single-mode ground truth. Experiments\nhave validated the superiority of progressive interactions over the existing\none-stage interaction, and demonstrate the effectiveness of each component.\nEncouraging results were obtained on the challenging benchmarks.\n","authors":["Yinke Dong","Haifeng Yuan","Hongkun Liu","Wei Jing","Fangzhen Li","Hongmin Liu","Bin Fan"],"pdf_url":"https://arxiv.org/pdf/2403.16374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17177v1","updated":"2024-03-25T20:44:01Z","published":"2024-03-25T20:44:01Z","title":"Brain Stroke Segmentation Using Deep Learning Models: A Comparative\n Study","summary":" Stroke segmentation plays a crucial role in the diagnosis and treatment of\nstroke patients by providing spatial information about affected brain regions\nand the extent of damage. Segmenting stroke lesions accurately is a challenging\ntask, given that conventional manual techniques are time-consuming and prone to\nerrors. Recently, advanced deep models have been introduced for general medical\nimage segmentation, demonstrating promising results that surpass many\nstate-of-the-art networks when evaluated on specific datasets. With the advent of\nvision Transformers, several models have been introduced based on them, while\nothers have aimed to design better modules based on traditional convolutional\nlayers to extract long-range dependencies like Transformers. The question of\nwhether such high-level designs are necessary for all segmentation cases to\nachieve the best results remains unanswered. In this study, we selected four\ntypes of deep models that were recently proposed and evaluated their\nperformance for stroke segmentation: a pure Transformer-based architecture\n(DAE-Former), two advanced CNN-based models (LKA and DLKA) with attention\nmechanisms in their design, an advanced hybrid model that incorporates CNNs\nwith Transformers (FCT), and the well-known self-adaptive nnUNet framework with\nits configuration based on given data. We examined their performance on two\npublicly available datasets, and found that the nnUNet achieved the best\nresults with the simplest design among all. Revealing the robustness issue of\nTransformers to such variabilities serves as a potential reason for their\nweaker performance. Furthermore, nnUNet's success underscores the significant\nimpact of preprocessing and postprocessing techniques in enhancing segmentation\nresults, surpassing the focus solely on architectural designs.\n","authors":["Ahmed Soliman","Yousif Yousif","Ahmed Ibrahim","Yalda Zafari-Ghadim","Essam A. 
Rashed","Mohamed Mabrok"],"pdf_url":"https://arxiv.org/pdf/2403.17177v1.pdf","comment":null}]},"2024-03-24T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2310.15168v3","updated":"2024-03-24T23:32:50Z","published":"2023-10-23T17:59:52Z","title":"Ghost on the Shell: An Expressive Representation of General 3D Shapes","summary":" The creation of photorealistic virtual worlds requires the accurate modeling\nof 3D surface geometry for a wide range of objects. For this, meshes are\nappealing since they 1) enable fast physics-based rendering with realistic\nmaterial and lighting, 2) support physical simulation, and 3) are\nmemory-efficient for modern graphics pipelines. Recent work on reconstructing\nand statistically modeling 3D shape, however, has critiqued meshes as being\ntopologically inflexible. To capture a wide range of object shapes, any 3D\nrepresentation must be able to model solid, watertight, shapes as well as thin,\nopen, surfaces. Recent work has focused on the former, and methods for\nreconstructing open surfaces do not support fast reconstruction with material\nand lighting or unconditional generative modelling. Inspired by the observation\nthat open surfaces can be seen as islands floating on watertight surfaces, we\nparameterize open surfaces by defining a manifold signed distance field on\nwatertight templates. With this parameterization, we further develop a\ngrid-based and differentiable representation that parameterizes both watertight\nand non-watertight meshes of arbitrary topology. Our new representation, called\nGhost-on-the-Shell (G-Shell), enables two important applications:\ndifferentiable rasterization-based reconstruction from multiview images and\ngenerative modelling of non-watertight meshes. We empirically demonstrate that\nG-Shell achieves state-of-the-art performance on non-watertight mesh\nreconstruction and generation tasks, while also performing effectively for\nwatertight meshes.\n","authors":["Zhen Liu","Yao Feng","Yuliang Xiu","Weiyang Liu","Liam Paull","Michael J. Black","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2310.15168v3.pdf","comment":"ICLR 2024 Oral (v3: 30 pages, 19 figures, Project Page:\n https://gshell3d.github.io/)"},{"id":"http://arxiv.org/abs/2311.01623v3","updated":"2024-03-24T23:13:06Z","published":"2023-11-03T16:58:10Z","title":"VQPy: An Object-Oriented Approach to Modern Video Analytics","summary":" Video analytics is widely used in contemporary systems and services. At the\nforefront of video analytics are video queries that users develop to find\nobjects of particular interest. Building upon the insight that video objects\n(e.g., human, animals, cars, etc.), the center of video analytics, are similar\nin spirit to objects modeled by traditional object-oriented languages, we\npropose to develop an object-oriented approach to video analytics. This\napproach, named VQPy, consists of a frontend$\\unicode{x2015}$a Python variant\nwith constructs that make it easy for users to express video objects and their\ninteractions$\\unicode{x2015}$as well as an extensible backend that can\nautomatically construct and optimize pipelines based on video objects. 
We have\nimplemented and open-sourced VQPy, which has been productized in Cisco as part\nof its DeepVision framework.\n","authors":["Shan Yu","Zhenting Zhu","Yu Chen","Hanchen Xu","Pengzhan Zhao","Yang Wang","Arthi Padmanabhan","Hugo Latapie","Harry Xu"],"pdf_url":"https://arxiv.org/pdf/2311.01623v3.pdf","comment":"MLSys'24"},{"id":"http://arxiv.org/abs/2403.16318v1","updated":"2024-03-24T22:53:16Z","published":"2024-03-24T22:53:16Z","title":"AutoInst: Automatic Instance-Based Segmentation of LiDAR 3D Scans","summary":" Recently, progress in acquisition equipment such as LiDAR sensors has enabled\nsensing increasingly spacious outdoor 3D environments. Making sense of such 3D\nacquisitions requires fine-grained scene understanding, such as constructing\ninstance-based 3D scene segmentations. Commonly, a neural network is trained\nfor this task; however, this requires access to a large, densely annotated\ndataset, which is widely known to be challenging to obtain. To address this\nissue, in this work we propose to predict instance segmentations for 3D scenes\nin an unsupervised way, without relying on ground-truth annotations. To this\nend, we construct a learning framework consisting of two components: (1) a\npseudo-annotation scheme for generating initial unsupervised pseudo-labels; and\n(2) a self-training algorithm for instance segmentation to fit robust, accurate\ninstances from initial noisy proposals. To enable generating 3D instance mask\nproposals, we construct a weighted proxy-graph by connecting 3D points with\nedges integrating multi-modal image- and point-based self-supervised features,\nand perform graph-cuts to isolate individual pseudo-instances. We then build on\na state-of-the-art point-based architecture and train a 3D instance\nsegmentation model, resulting in significant refinement of initial proposals.\nTo scale to arbitrary complexity 3D scenes, we design our algorithm to operate\non local 3D point chunks and construct a merging step to generate scene-level\ninstance segmentations. Experiments on the challenging SemanticKITTI benchmark\ndemonstrate the potential of our approach, where it attains 13.3% higher\nAverage Precision and 9.1% higher F1 score compared to the best-performing\nbaseline. The code will be made publicly available at\nhttps://github.com/artonson/autoinst.\n","authors":["Cedric Perauer","Laurenz Adrian Heidrich","Haifan Zhang","Matthias Nießner","Anastasiia Kornilova","Alexey Artemov"],"pdf_url":"https://arxiv.org/pdf/2403.16318v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2403.03881v2","updated":"2024-03-24T21:38:49Z","published":"2024-03-06T17:41:41Z","title":"Latent Dataset Distillation with Diffusion Models","summary":" The efficacy of machine learning has traditionally relied on the availability\nof increasingly larger datasets. However, large datasets pose storage\nchallenges and contain non-influential samples, which could be ignored during\ntraining without impacting the final accuracy of the model. In response to\nthese limitations, the concept of distilling the information on a dataset into\na condensed set of (synthetic) samples, namely a distilled dataset, emerged.\nOne crucial aspect is the selected architecture (usually ConvNet) for linking\nthe original and synthetic datasets. However, the final accuracy is lower if\nthe employed model architecture differs from the model used during\ndistillation. Another challenge is the generation of high-resolution images,\ne.g., 128x128 and higher. 
In this paper, we propose Latent Dataset Distillation\nwith Diffusion Models (LD3M) that combine diffusion in latent space with\ndataset distillation to tackle both challenges. LD3M incorporates a novel\ndiffusion process tailored for dataset distillation, which improves the\ngradient norms for learning synthetic images. By adjusting the number of\ndiffusion steps, LD3M also offers a straightforward way of controlling the\ntrade-off between speed and accuracy. We evaluate our approach in several\nImageNet subsets and for high-resolution images (128x128 and 256x256). As a\nresult, LD3M consistently outperforms state-of-the-art distillation techniques\nby up to 4.8 p.p. and 4.2 p.p. for 1 and 10 images per class, respectively.\n","authors":["Brian B. Moser","Federico Raue","Sebastian Palacio","Stanislav Frolov","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2403.03881v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16292v1","updated":"2024-03-24T20:48:36Z","published":"2024-03-24T20:48:36Z","title":"latentSplat: Autoencoding Variational Gaussians for Fast Generalizable\n 3D Reconstruction","summary":" We present latentSplat, a method to predict semantic Gaussians in a 3D latent\nspace that can be splatted and decoded by a light-weight generative 2D\narchitecture. Existing methods for generalizable 3D reconstruction either do\nnot enable fast inference of high resolution novel views due to slow volume\nrendering, or are limited to interpolation of close input views, even in\nsimpler settings with a single central object, where 360-degree generalization\nis possible. In this work, we combine a regression-based approach with a\ngenerative model, moving towards both of these capabilities within the same\nmethod, trained purely on readily available real video data. The core of our\nmethod are variational 3D Gaussians, a representation that efficiently encodes\nvarying uncertainty within a latent space consisting of 3D feature Gaussians.\nFrom these Gaussians, specific instances can be sampled and rendered via\nefficient Gaussian splatting and a fast, generative decoder network. We show\nthat latentSplat outperforms previous works in reconstruction quality and\ngeneralization, while being fast and scalable to high-resolution data.\n","authors":["Christopher Wewer","Kevin Raj","Eddy Ilg","Bernt Schiele","Jan Eric Lenssen"],"pdf_url":"https://arxiv.org/pdf/2403.16292v1.pdf","comment":"Project website: https://geometric-rl.mpi-inf.mpg.de/latentsplat/"},{"id":"http://arxiv.org/abs/2403.16286v1","updated":"2024-03-24T20:31:42Z","published":"2024-03-24T20:31:42Z","title":"HemoSet: The First Blood Segmentation Dataset for Automation of\n Hemostasis Management","summary":" Hemorrhaging occurs in surgeries of all types, forcing surgeons to quickly\nadapt to the visual interference that results from blood rapidly filling the\nsurgical field. Introducing automation into the crucial surgical task of\nhemostasis management would offload mental and physical tasks from the surgeon\nand surgical assistants while simultaneously increasing the efficiency and\nsafety of the operation. The first step in automation of hemostasis management\nis detection of blood in the surgical field. To propel the development of blood\ndetection algorithms in surgeries, we present HemoSet, the first blood\nsegmentation dataset based on bleeding during a live animal robotic surgery.\nOur dataset features vessel hemorrhage scenarios where turbulent flow leads to\nabnormal pooling geometries in surgical fields. 
These pools are formed in\nconditions endemic to surgical procedures -- uneven heterogeneous tissue, under\nglossy lighting conditions and rapid tool movement. We benchmark several\nstate-of-the-art segmentation models and provide insight into the difficulties\nspecific to blood detection. We intend for HemoSet to spur development of\nautonomous blood suction tools by providing a platform for training and\nrefining blood segmentation models, addressing the precision needed for such\nrobotics.\n","authors":["Albert J. Miao Shan Lin","Jingpei Lu","Florian Richter","Benjamin Ostrander","Emily K. Funk","Ryan K. Orosco","Michael C. Yip"],"pdf_url":"https://arxiv.org/pdf/2403.16286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04926v2","updated":"2024-03-24T20:25:03Z","published":"2024-03-07T22:21:08Z","title":"BAGS: Blur Agnostic Gaussian Splatting through Multi-Scale Kernel\n Modeling","summary":" Recent efforts in using 3D Gaussians for scene reconstruction and novel view\nsynthesis can achieve impressive results on curated benchmarks; however, images\ncaptured in real life are often blurry. In this work, we analyze the robustness\nof Gaussian-Splatting-based methods against various image blur, such as motion\nblur, defocus blur, downscaling blur, \\etc. Under these degradations,\nGaussian-Splatting-based methods tend to overfit and produce worse results than\nNeural-Radiance-Field-based methods. To address this issue, we propose Blur\nAgnostic Gaussian Splatting (BAGS). BAGS introduces additional 2D modeling\ncapacities such that a 3D-consistent and high quality scene can be\nreconstructed despite image-wise blur. Specifically, we model blur by\nestimating per-pixel convolution kernels from a Blur Proposal Network (BPN).\nBPN is designed to consider spatial, color, and depth variations of the scene\nto maximize modeling capacity. Additionally, BPN also proposes a\nquality-assessing mask, which indicates regions where blur occur. Finally, we\nintroduce a coarse-to-fine kernel optimization scheme; this optimization scheme\nis fast and avoids sub-optimal solutions due to a sparse point cloud\ninitialization, which often occurs when we apply Structure-from-Motion on\nblurry images. We demonstrate that BAGS achieves photorealistic renderings\nunder various challenging blur conditions and imaging geometry, while\nsignificantly improving upon existing approaches.\n","authors":["Cheng Peng","Yutao Tang","Yifan Zhou","Nengyu Wang","Xijun Liu","Deming Li","Rama Chellappa"],"pdf_url":"https://arxiv.org/pdf/2403.04926v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16276v1","updated":"2024-03-24T19:50:49Z","published":"2024-03-24T19:50:49Z","title":"AVicuna: Audio-Visual LLM with Interleaver and Context-Boundary\n Alignment for Temporal Referential Dialogue","summary":" In everyday communication, humans frequently use speech and gestures to refer\nto specific areas or objects, a process known as Referential Dialogue (RD).\nWhile prior studies have investigated RD through Large Language Models (LLMs)\nor Large Multimodal Models (LMMs) in static contexts, the exploration of\nTemporal Referential Dialogue (TRD) within audio-visual media remains limited.\nTwo primary challenges hinder progress in this field: (1) the absence of\ncomprehensive, untrimmed audio-visual video datasets with precise temporal\nannotations, and (2) the need for methods to integrate complex temporal\nauditory and visual cues effectively. 
To address these challenges, we introduce\na novel framework to generate PU-VALOR, an extensive audio-visual dataset\ncomprising over 114,000 untrimmed videos with accurate temporal demarcations.\nWe also present AVicuna, featuring an Audio-Visual Tokens Interleaver (AVTI)\nthat ensures the temporal alignment of audio-visual information. Additionally,\nwe develop the A5-222K dataset, encompassing more than 200,000 audio-text\npairings, to facilitate the audio and text alignments. Our experiments\ndemonstrate that AVicuna can effectively handle TRD in audio-visual videos and\nachieve state-of-the-art performance on various audio-visual video\nunderstanding tasks, particularly in untrimmed videos. We further investigate\nthe optimal audio-interleaving rate for interleaved audio-visual inputs, which\nmaximizes performance on the Audio-Visual Event Dense Localization task.\n","authors":["Yunlong Tang","Daiki Shimada","Jing Bi","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.16276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.00915v3","updated":"2024-03-24T19:39:00Z","published":"2022-09-02T09:50:31Z","title":"Detection of diabetic retinopathy using longitudinal self-supervised\n learning","summary":" Longitudinal imaging is able to capture both static anatomical structures and\ndynamic changes in disease progression towards earlier and better\npatient-specific pathology management. However, conventional approaches for\ndetecting diabetic retinopathy (DR) rarely take advantage of longitudinal\ninformation to improve DR analysis. In this work, we investigate the benefit of\nexploiting self-supervised learning with a longitudinal nature for DR diagnosis\npurposes. We compare different longitudinal self-supervised learning (LSSL)\nmethods to model the disease progression from longitudinal retinal color fundus\nphotographs (CFP) to detect early DR severity changes using a pair of\nconsecutive exams. The experiments were conducted on a longitudinal DR\nscreening dataset with or without those trained encoders (LSSL) acting as a\nlongitudinal pretext task. Results achieve an AUC of 0.875 for the baseline\n(model trained from scratch) and an AUC of 0.96 (95% CI: 0.9593-0.9655 DeLong\ntest) with a p-value < 2.2e-16 on early fusion using a simple ResNet alike\narchitecture with frozen LSSL weights, suggesting that the LSSL latent space\nenables to encode the dynamic of DR progression.\n","authors":["Rachid Zeghlache","Pierre-Henri Conze","Mostafa El Habib Daho","Ramin Tadayoni","Pascal Massin","Béatrice Cochener","Gwenolé Quellec","Mathieu Lamard"],"pdf_url":"https://arxiv.org/pdf/2209.00915v3.pdf","comment":"Accepted preprint for presentation at MICCAI-OMIA"},{"id":"http://arxiv.org/abs/2403.16272v1","updated":"2024-03-24T19:34:33Z","published":"2024-03-24T19:34:33Z","title":"L-MAE: Longitudinal masked auto-encoder with time and severity-aware\n encoding for diabetic retinopathy progression prediction","summary":" Pre-training strategies based on self-supervised learning (SSL) have proven\nto be effective pretext tasks for many downstream tasks in computer vision. Due\nto the significant disparity between medical and natural images, the\napplication of typical SSL is not straightforward in medical imaging.\nAdditionally, those pretext tasks often lack context, which is critical for\ncomputer-aided clinical decision support. In this paper, we developed a\nlongitudinal masked auto-encoder (MAE) based on the well-known\nTransformer-based MAE. 
In particular, we explored the importance of time-aware\nposition embedding as well as disease progression-aware masking. Taking into\naccount the time between examinations instead of just scheduling them offers\nthe benefit of capturing temporal changes and trends. The masking strategy, for\nits part, evolves during follow-up to better capture pathological changes,\nensuring a more accurate assessment of disease progression. Using OPHDIAT, a\nlarge follow-up screening dataset targeting diabetic retinopathy (DR), we\nevaluated the pre-trained weights on a longitudinal task, which is to predict\nthe severity label of the next visit within 3 years based on the past time\nseries examinations. Our results demonstrated the relevancy of both time-aware\nposition embedding and masking strategies based on disease progression\nknowledge. Compared to popular baseline models and standard longitudinal\nTransformers, these simple yet effective extensions significantly enhance the\npredictive ability of deep classification models.\n","authors":["Rachid Zeghlache","Pierre-Henri Conze","Mostafa El Habib Daho","Yihao Li","Alireza Rezaei","Hugo Le Boité","Ramin Tadayoni","Pascal Massin","Béatrice Cochener","Ikram Brahim","Gwenolé Quellec","Mathieu Lamard"],"pdf_url":"https://arxiv.org/pdf/2403.16272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16271v1","updated":"2024-03-24T19:32:39Z","published":"2024-03-24T19:32:39Z","title":"Object Detectors in the Open Environment:Challenges, Solutions, and\n Outlook","summary":" With the emergence of foundation models, deep learning-based object detectors\nhave shown practical usability in closed set scenarios. However, for real-world\ntasks, object detectors often operate in open environments, where crucial\nfactors (\\eg, data distribution, objective) that influence model learning are\noften changing. The dynamic and intricate nature of the open environment poses\nnovel and formidable challenges to object detectors. Unfortunately, current\nresearch on object detectors in open environments lacks a comprehensive\nanalysis of their distinctive characteristics, challenges, and corresponding\nsolutions, which hinders their secure deployment in critical real-world\nscenarios. This paper aims to bridge this gap by conducting a comprehensive\nreview and analysis of object detectors in open environments. We initially\nidentified limitations of key structural components within the existing\ndetection pipeline and propose the open environment object detector challenge\nframework that includes four quadrants (\\ie, out-of-domain, out-of-category,\nrobust learning, and incremental learning) based on the dimensions of the data\n/ target changes. For each quadrant of challenges in the proposed framework, we\npresent a detailed description and systematic analysis of the overarching goals\nand core difficulties, systematically review the corresponding solutions, and\nbenchmark their performance over multiple widely adopted datasets. In addition,\nwe engage in a discussion of open problems and potential avenues for future\nresearch. 
This paper aims to provide a fresh, comprehensive, and systematic\nunderstanding of the challenges and solutions associated with open-environment\nobject detectors, thus catalyzing the development of more solid applications in\nreal-world scenarios.\n","authors":["Siyuan Liang","Wei Wang","Ruoyu Chen","Aishan Liu","Boxi Wu","Ee-Chien Chang","Xiaochun Cao","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.16271v1.pdf","comment":"32 pages, 17 figures"},{"id":"http://arxiv.org/abs/2403.16270v1","updated":"2024-03-24T19:22:15Z","published":"2024-03-24T19:22:15Z","title":"Constricting Normal Latent Space for Anomaly Detection with Normal-only\n Training Data","summary":" In order to devise an anomaly detection model using only normal training\ndata, an autoencoder (AE) is typically trained to reconstruct the data. As a\nresult, the AE can extract normal representations in its latent space. During\ntest time, since AE is not trained using real anomalies, it is expected to\npoorly reconstruct the anomalous data. However, several researchers have\nobserved that it is not the case. In this work, we propose to limit the\nreconstruction capability of AE by introducing a novel latent constriction\nloss, which is added to the existing reconstruction loss. By using our method,\nno extra computational cost is added to the AE during test time. Evaluations\nusing three video anomaly detection benchmark datasets, i.e., Ped2, Avenue, and\nShanghaiTech, demonstrate the effectiveness of our method in limiting the\nreconstruction capability of AE, which leads to a better anomaly detection\nmodel.\n","authors":["Marcella Astrid","Muhammad Zaigham Zaheer","Seung-Ik Lee"],"pdf_url":"https://arxiv.org/pdf/2403.16270v1.pdf","comment":"ICLR Workshop 2024 (PML4LRS)"},{"id":"http://arxiv.org/abs/2303.12054v3","updated":"2024-03-24T19:16:21Z","published":"2023-03-21T17:45:38Z","title":"Influencer Backdoor Attack on Semantic Segmentation","summary":" When a small number of poisoned samples are injected into the training\ndataset of a deep neural network, the network can be induced to exhibit\nmalicious behavior during inferences, which poses potential threats to\nreal-world applications. While they have been intensively studied in\nclassification, backdoor attacks on semantic segmentation have been largely\noverlooked. Unlike classification, semantic segmentation aims to classify every\npixel within a given image. In this work, we explore backdoor attacks on\nsegmentation models to misclassify all pixels of a victim class by injecting a\nspecific trigger on non-victim pixels during inferences, which is dubbed\nInfluencer Backdoor Attack (IBA). IBA is expected to maintain the\nclassification accuracy of non-victim pixels and mislead classifications of all\nvictim pixels in every single inference and could be easily applied to\nreal-world scenes. Based on the context aggregation ability of segmentation\nmodels, we proposed a simple, yet effective, Nearest-Neighbor trigger injection\nstrategy. We also introduce an innovative Pixel Random Labeling strategy which\nmaintains optimal performance even when the trigger is placed far from the\nvictim pixels. 
Our extensive experiments reveal that current segmentation\nmodels do suffer from backdoor attacks, demonstrate IBA real-world\napplicability, and show that our proposed techniques can further increase\nattack performance.\n","authors":["Haoheng Lan","Jindong Gu","Philip Torr","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2303.12054v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16263v1","updated":"2024-03-24T18:53:57Z","published":"2024-03-24T18:53:57Z","title":"Emotion Recognition from the perspective of Activity Recognition","summary":" Applications of an efficient emotion recognition system can be found in\nseveral domains such as medicine, driver fatigue surveillance, social robotics,\nand human-computer interaction. Appraising human emotional states, behaviors,\nand reactions displayed in real-world settings can be accomplished using latent\ncontinuous dimensions. Continuous dimensional models of human affect, such as\nthose based on valence and arousal are more accurate in describing a broad\nrange of spontaneous everyday emotions than more traditional models of discrete\nstereotypical emotion categories (e.g. happiness, surprise). Most of the prior\nwork on estimating valence and arousal considers laboratory settings and acted\ndata. But, for emotion recognition systems to be deployed and integrated into\nreal-world mobile and computing devices, we need to consider data collected in\nthe world. Action recognition is a domain of Computer Vision that involves\ncapturing complementary information on appearance from still frames and motion\nbetween frames. In this paper, we treat emotion recognition from the\nperspective of action recognition by exploring the application of deep learning\narchitectures specifically designed for action recognition, for continuous\naffect recognition. We propose a novel three-stream end-to-end deep learning\nregression pipeline with an attention mechanism, which is an ensemble design\nbased on sub-modules of multiple state-of-the-art action recognition systems.\nThe pipeline constitutes a novel data pre-processing approach with a spatial\nself-attention mechanism to extract keyframes. The optical flow of\nhigh-attention regions of the face is extracted to capture temporal context.\nAFEW-VA in-the-wild dataset has been used to conduct comparative experiments.\nQuantitative analysis shows that the proposed model outperforms multiple\nstandard baselines of both emotion recognition and action recognition models.\n","authors":["Savinay Nagendra","Prapti Panigrahi"],"pdf_url":"https://arxiv.org/pdf/2403.16263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16260v1","updated":"2024-03-24T18:43:04Z","published":"2024-03-24T18:43:04Z","title":"Out-of-Distribution Detection via Deep Multi-Comprehension Ensemble","summary":" Recent research underscores the pivotal role of the Out-of-Distribution (OOD)\nfeature representation field scale in determining the efficacy of models in OOD\ndetection. Consequently, the adoption of model ensembles has emerged as a\nprominent strategy to augment this feature representation field, capitalizing\non anticipated model diversity.\n However, our introduction of novel qualitative and quantitative model\nensemble evaluation methods, specifically Loss Basin/Barrier Visualization and\nthe Self-Coupling Index, reveals a critical drawback in existing ensemble\nmethods. 
We find that these methods incorporate weights that are\naffine-transformable, exhibiting limited variability and thus failing to\nachieve the desired diversity in feature representation.\n To address this limitation, we elevate the dimensions of traditional model\nensembles, incorporating various factors such as different weight\ninitializations, data holdout, etc., into distinct supervision tasks. This\ninnovative approach, termed Multi-Comprehension (MC) Ensemble, leverages\ndiverse training tasks to generate distinct comprehensions of the data and\nlabels, thereby extending the feature representation field.\n Our experimental results demonstrate the superior performance of the MC\nEnsemble strategy in OOD detection compared to both the naive Deep Ensemble\nmethod and a standalone model of comparable size. This underscores the\neffectiveness of our proposed approach in enhancing the model's capability to\ndetect instances outside its training distribution.\n","authors":["Chenhui Xu","Fuxun Yu","Zirui Xu","Nathan Inkawhich","Xiang Chen"],"pdf_url":"https://arxiv.org/pdf/2403.16260v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16258v1","updated":"2024-03-24T18:33:16Z","published":"2024-03-24T18:33:16Z","title":"Laplacian-guided Entropy Model in Neural Codec with Blur-dissipated\n Synthesis","summary":" While replacing Gaussian decoders with a conditional diffusion model enhances\nthe perceptual quality of reconstructions in neural image compression, their\nlack of inductive bias for image data restricts their ability to achieve\nstate-of-the-art perceptual levels. To address this limitation, we adopt a\nnon-isotropic diffusion model at the decoder side. This model imposes an\ninductive bias aimed at distinguishing between frequency contents, thereby\nfacilitating the generation of high-quality images. Moreover, our framework is\nequipped with a novel entropy model that accurately models the probability\ndistribution of latent representation by exploiting spatio-channel correlations\nin latent space, while accelerating the entropy decoding step. This\nchannel-wise entropy model leverages both local and global spatial contexts\nwithin each channel chunk. The global spatial context is built upon the\nTransformer, which is specifically designed for image compression tasks. The\ndesigned Transformer employs a Laplacian-shaped positional encoding, the\nlearnable parameters of which are adaptively adjusted for each channel cluster.\nOur experiments demonstrate that our proposed framework yields better\nperceptual quality compared to cutting-edge generative-based codecs, and the\nproposed entropy model contributes to notable bitrate savings.\n","authors":["Atefeh Khoshkhahtinat","Ali Zafari","Piyush M. Mehta","Nasser M. Nasrabadi"],"pdf_url":"https://arxiv.org/pdf/2403.16258v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.16257v1","updated":"2024-03-24T18:33:15Z","published":"2024-03-24T18:33:15Z","title":"Unlearning Backdoor Threats: Enhancing Backdoor Defense in Multimodal\n Contrastive Learning via Local Token Unlearning","summary":" Multimodal contrastive learning has emerged as a powerful paradigm for\nbuilding high-quality features using the complementary strengths of various\ndata modalities. However, the open nature of such systems inadvertently\nincreases the possibility of backdoor attacks. 
These attacks subtly embed\nmalicious behaviors within the model during training, which can be activated by\nspecific triggers in the inference phase, posing significant security risks.\nDespite existing countermeasures through fine-tuning that reduce the adverse\nimpacts of such attacks, these defenses often degrade the clean accuracy and\nnecessitate the construction of extensive clean training pairs. In this paper,\nwe explore the possibility of a less-cost defense from the perspective of model\nunlearning, that is, whether the model can be made to quickly \\textbf{u}nlearn\n\\textbf{b}ackdoor \\textbf{t}hreats (UBT) by constructing a small set of\npoisoned samples. Specifically, we strengthen the backdoor shortcuts to\ndiscover suspicious samples through overfitting training prioritized by weak\nsimilarity samples. Building on the initial identification of suspicious\nsamples, we introduce an innovative token-based localized forgetting training\nregime. This technique specifically targets the poisoned aspects of the model,\napplying a focused effort to unlearn the backdoor associations and trying not\nto damage the integrity of the overall model. Experimental results show that\nour method not only ensures a minimal success rate for attacks, but also\npreserves the model's high clean accuracy.\n","authors":["Siyuan Liang","Kuanrong Liu","Jiajun Gong","Jiawei Liang","Yuan Xun","Ee-Chien Chang","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2403.16257v1.pdf","comment":"6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2403.06912v3","updated":"2024-03-24T18:10:11Z","published":"2024-03-11T17:02:11Z","title":"DNGaussian: Optimizing Sparse-View 3D Gaussian Radiance Fields with\n Global-Local Depth Normalization","summary":" Radiance fields have demonstrated impressive performance in synthesizing\nnovel views from sparse input views, yet prevailing methods suffer from high\ntraining costs and slow inference speed. This paper introduces DNGaussian, a\ndepth-regularized framework based on 3D Gaussian radiance fields, offering\nreal-time and high-quality few-shot novel view synthesis at low costs. Our\nmotivation stems from the highly efficient representation and surprising\nquality of the recent 3D Gaussian Splatting, despite it will encounter a\ngeometry degradation when input views decrease. In the Gaussian radiance\nfields, we find this degradation in scene geometry primarily lined to the\npositioning of Gaussian primitives and can be mitigated by depth constraint.\nConsequently, we propose a Hard and Soft Depth Regularization to restore\naccurate scene geometry under coarse monocular depth supervision while\nmaintaining a fine-grained color appearance. To further refine detailed\ngeometry reshaping, we introduce Global-Local Depth Normalization, enhancing\nthe focus on small local depth changes. Extensive experiments on LLFF, DTU, and\nBlender datasets demonstrate that DNGaussian outperforms state-of-the-art\nmethods, achieving comparable or better results with significantly reduced\nmemory cost, a $25 \\times$ reduction in training time, and over $3000 \\times$\nfaster rendering speed.\n","authors":["Jiahe Li","Jiawei Zhang","Xiao Bai","Jin Zheng","Xin Ning","Jun Zhou","Lin Gu"],"pdf_url":"https://arxiv.org/pdf/2403.06912v3.pdf","comment":"Accepted at CVPR 2024. 
Project page:\n https://fictionarry.github.io/DNGaussian/"},{"id":"http://arxiv.org/abs/2306.12547v2","updated":"2024-03-24T18:00:57Z","published":"2023-06-21T20:21:15Z","title":"DGC-GNN: Leveraging Geometry and Color Cues for Visual Descriptor-Free\n 2D-3D Matching","summary":" Matching 2D keypoints in an image to a sparse 3D point cloud of the scene\nwithout requiring visual descriptors has garnered increased interest due to its\nlow memory requirements, inherent privacy preservation, and reduced need for\nexpensive 3D model maintenance compared to visual descriptor-based methods.\nHowever, existing algorithms often compromise on performance, resulting in a\nsignificant deterioration compared to their descriptor-based counterparts. In\nthis paper, we introduce DGC-GNN, a novel algorithm that employs a\nglobal-to-local Graph Neural Network (GNN) that progressively exploits\ngeometric and color cues to represent keypoints, thereby improving matching\naccuracy. Our procedure encodes both Euclidean and angular relations at a\ncoarse level, forming the geometric embedding to guide the point matching. We\nevaluate DGC-GNN on both indoor and outdoor datasets, demonstrating that it not\nonly doubles the accuracy of the state-of-the-art visual descriptor-free\nalgorithm but also substantially narrows the performance gap between\ndescriptor-based and descriptor-free methods.\n","authors":["Shuzhe Wang","Juho Kannala","Daniel Barath"],"pdf_url":"https://arxiv.org/pdf/2306.12547v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16246v1","updated":"2024-03-24T17:33:22Z","published":"2024-03-24T17:33:22Z","title":"Partially Blinded Unlearning: Class Unlearning for Deep Networks a\n Bayesian Perspective","summary":" In order to adhere to regulatory standards governing individual data privacy\nand safety, machine learning models must systematically eliminate information\nderived from specific subsets of a user's training data that can no longer be\nutilized. The emerging discipline of Machine Unlearning has arisen as a pivotal\narea of research, facilitating the process of selectively discarding\ninformation designated to specific sets or classes of data from a pre-trained\nmodel, thereby eliminating the necessity for extensive retraining from scratch.\nThe principal aim of this study is to formulate a methodology tailored for the\npurposeful elimination of information linked to a specific class of data from a\npre-trained classification network. This intentional removal is crafted to\ndegrade the model's performance specifically concerning the unlearned data\nclass while concurrently minimizing any detrimental impacts on the model's\nperformance in other classes. To achieve this goal, we frame the class\nunlearning problem from a Bayesian perspective, which yields a loss function\nthat minimizes the log-likelihood associated with the unlearned data with a\nstability regularization in parameter space. This stability regularization\nincorporates Mohalanobis distance with respect to the Fisher Information matrix\nand $l_2$ distance from the pre-trained model parameters. Our novel approach,\ntermed \\textbf{Partially-Blinded Unlearning (PBU)}, surpasses existing\nstate-of-the-art class unlearning methods, demonstrating superior\neffectiveness. Notably, PBU achieves this efficacy without requiring awareness\nof the entire training dataset but only to the unlearned data points, marking a\ndistinctive feature of its performance.\n","authors":["Subhodip Panda","Shashwat Sourav","Prathosh A. 
P"],"pdf_url":"https://arxiv.org/pdf/2403.16246v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04364v2","updated":"2024-03-24T17:22:35Z","published":"2023-12-07T15:35:42Z","title":"DemoCaricature: Democratising Caricature Generation with a Rough Sketch","summary":" In this paper, we democratise caricature generation, empowering individuals\nto effortlessly craft personalised caricatures with just a photo and a\nconceptual sketch. Our objective is to strike a delicate balance between\nabstraction and identity, while preserving the creativity and subjectivity\ninherent in a sketch. To achieve this, we present Explicit Rank-1 Model Editing\nalongside single-image personalisation, selectively applying nuanced edits to\ncross-attention layers for a seamless merge of identity and style.\nAdditionally, we propose Random Mask Reconstruction to enhance robustness,\ndirecting the model to focus on distinctive identity and style features.\nCrucially, our aim is not to replace artists but to eliminate accessibility\nbarriers, allowing enthusiasts to engage in the artistry.\n","authors":["Dar-Yen Chen","Ayan Kumar Bhunia","Subhadeep Koley","Aneeshan Sain","Pinaki Nath Chowdhury","Yi-Zhe Song"],"pdf_url":"https://arxiv.org/pdf/2312.04364v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16244v1","updated":"2024-03-24T17:21:32Z","published":"2024-03-24T17:21:32Z","title":"On the Equivalency, Substitutability, and Flexibility of Synthetic Data","summary":" We study, from an empirical standpoint, the efficacy of synthetic data in\nreal-world scenarios. Leveraging synthetic data for training perception models\nhas become a key strategy embraced by the community due to its efficiency,\nscalability, perfect annotations, and low costs. Despite proven advantages, few\nstudies put their stress on how to efficiently generate synthetic datasets to\nsolve real-world problems and to what extent synthetic data can reduce the\neffort for real-world data collection. To answer the questions, we\nsystematically investigate several interesting properties of synthetic data --\nthe equivalency of synthetic data to real-world data, the substitutability of\nsynthetic data for real data, and the flexibility of synthetic data generators\nto close up domain gaps. Leveraging the M3Act synthetic data generator, we\nconduct experiments on DanceTrack and MOT17. Our results suggest that synthetic\ndata not only enhances model performance but also demonstrates substitutability\nfor real data, with 60% to 80% replacement without performance loss. In\naddition, our study of the impact of synthetic data distributions on downstream\nperformance reveals the importance of flexible data generators in narrowing\ndomain gaps for improved model adaptability.\n","authors":["Che-Jui Chang","Danrui Li","Seonghyeon Moon","Mubbasir Kapadia"],"pdf_url":"https://arxiv.org/pdf/2403.16244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12188v2","updated":"2024-03-24T17:19:14Z","published":"2023-09-21T15:54:33Z","title":"SG-Bot: Object Rearrangement via Coarse-to-Fine Robotic Imagination on\n Scene Graphs","summary":" Object rearrangement is pivotal in robotic-environment interactions,\nrepresenting a significant capability in embodied AI. In this paper, we present\nSG-Bot, a novel rearrangement framework that utilizes a coarse-to-fine scheme\nwith a scene graph as the scene representation. 
Unlike previous methods that\nrely on either known goal priors or zero-shot large models, SG-Bot exemplifies\nlightweight, real-time, and user-controllable characteristics, seamlessly\nblending the consideration of commonsense knowledge with automatic generation\ncapabilities. SG-Bot employs a three-fold procedure--observation, imagination,\nand execution--to adeptly address the task. Initially, objects are discerned\nand extracted from a cluttered scene during the observation. These objects are\nfirst coarsely organized and depicted within a scene graph, guided by either\ncommonsense or user-defined criteria. Then, this scene graph subsequently\ninforms a generative model, which forms a fine-grained goal scene considering\nthe shape information from the initial scene and object semantics. Finally, for\nexecution, the initial and envisioned goal scenes are matched to formulate\nrobotic action policies. Experimental results demonstrate that SG-Bot\noutperforms competitors by a large margin.\n","authors":["Guangyao Zhai","Xiaoni Cai","Dianye Huang","Yan Di","Fabian Manhardt","Federico Tombari","Nassir Navab","Benjamin Busam"],"pdf_url":"https://arxiv.org/pdf/2309.12188v2.pdf","comment":"ICRA 2024 accepted. Project website:\n https://sites.google.com/view/sg-bot"},{"id":"http://arxiv.org/abs/2403.14119v2","updated":"2024-03-24T17:16:53Z","published":"2024-03-21T04:08:29Z","title":"C-TPT: Calibrated Test-Time Prompt Tuning for Vision-Language Models via\n Text Feature Dispersion","summary":" In deep learning, test-time adaptation has gained attention as a method for\nmodel fine-tuning without the need for labeled data. A prime exemplification is\nthe recently proposed test-time prompt tuning for large-scale vision-language\nmodels such as CLIP. Unfortunately, these prompts have been mainly developed to\nimprove accuracy, overlooking the importance of calibration, which is a crucial\naspect for quantifying prediction uncertainty. However, traditional calibration\nmethods rely on substantial amounts of labeled data, making them impractical\nfor test-time scenarios. To this end, this paper explores calibration during\ntest-time prompt tuning by leveraging the inherent properties of CLIP. Through\na series of observations, we find that the prompt choice significantly affects\nthe calibration in CLIP, where the prompts leading to higher text feature\ndispersion result in better-calibrated predictions. Introducing the Average\nText Feature Dispersion (ATFD), we establish its relationship with calibration\nerror and present a novel method, Calibrated Test-time Prompt Tuning (C-TPT),\nfor optimizing prompts during test-time with enhanced calibration. Through\nextensive experiments on different CLIP architectures and datasets, we show\nthat C-TPT can effectively improve the calibration of test-time prompt tuning\nwithout needing labeled data. The code is publicly accessible at\nhttps://github.com/hee-suk-yoon/C-TPT.\n","authors":["Hee Suk Yoon","Eunseop Yoon","Joshua Tian Jin Tee","Mark Hasegawa-Johnson","Yingzhen Li","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2403.14119v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.16242v1","updated":"2024-03-24T17:13:46Z","published":"2024-03-24T17:13:46Z","title":"Adversarially Masked Video Consistency for Unsupervised Domain\n Adaptation","summary":" We study the problem of unsupervised domain adaptation for egocentric videos.\nWe propose a transformer-based model to learn class-discriminative and\ndomain-invariant feature representations. 
It consists of two novel designs. The\nfirst module is called Generative Adversarial Domain Alignment Network with the\naim of learning domain-invariant representations. It simultaneously learns a\nmask generator and a domain-invariant encoder in an adversarial way. The\ndomain-invariant encoder is trained to minimize the distance between the source\nand target domain. The masking generator, conversely, aims at producing\nchallenging masks by maximizing the domain distance. The second is a Masked\nConsistency Learning module to learn class-discriminative representations. It\nenforces the prediction consistency between the masked target videos and their\nfull forms. To better evaluate the effectiveness of domain adaptation methods,\nwe construct a more challenging benchmark for egocentric videos, U-Ego4D. Our\nmethod achieves state-of-the-art performance on the Epic-Kitchen and the\nproposed U-Ego4D benchmark.\n","authors":["Xiaoyu Zhu","Junwei Liang","Po-Yao Huang","Alex Hauptmann"],"pdf_url":"https://arxiv.org/pdf/2403.16242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16240v1","updated":"2024-03-24T17:12:13Z","published":"2024-03-24T17:12:13Z","title":"Low Rank Groupwise Deformations for Motion Tracking in Cardiac Cine MRI","summary":" Diffeomorphic image registration is a commonly used method to deform one\nimage to resemble another. While warping a single image to another is useful,\nit can be advantageous to warp multiple images simultaneously, such as in\ntracking the motion of the heart across a sequence of images. In this paper,\nour objective is to propose a novel method capable of registering a group or\nsequence of images to a target image, resulting in registered images that\nappear identical and therefore have a low rank. Moreover, we aim for these\nregistered images to closely resemble the target image. Through experimental\nevidence, we will demonstrate our method's superior efficacy in producing\nlow-rank groupwise deformations compared to other state-of-the-art approaches.\n","authors":["Sean Rendell","Jinming Duan"],"pdf_url":"https://arxiv.org/pdf/2403.16240v1.pdf","comment":"A thesis submitted to the University of Birmingham for MSc Degree"},{"id":"http://arxiv.org/abs/2403.16227v1","updated":"2024-03-24T16:41:50Z","published":"2024-03-24T16:41:50Z","title":"Dual-modal Prior Semantic Guided Infrared and Visible Image Fusion for\n Intelligent Transportation System","summary":" Infrared and visible image fusion (IVF) plays an important role in\nintelligent transportation system (ITS). The early works predominantly focus on\nboosting the visual appeal of the fused result, and only several recent\napproaches have tried to combine the high-level vision task with IVF. However,\nthey prioritize the design of cascaded structure to seek unified suitable\nfeatures and fit different tasks. Thus, they tend to typically bias toward to\nreconstructing raw pixels without considering the significance of semantic\nfeatures. Therefore, we propose a novel prior semantic guided image fusion\nmethod based on the dual-modality strategy, improving the performance of IVF in\nITS. Specifically, to explore the independent significant semantic of each\nmodality, we first design two parallel semantic segmentation branches with a\nrefined feature adaptive-modulation (RFaM) mechanism. RFaM can perceive the\nfeatures that are semantically distinct enough in each semantic segmentation\nbranch. 
Then, two pilot experiments based on the two branches are conducted to\ncapture the significant prior semantic of two images, which then is applied to\nguide the fusion task in the integration of semantic segmentation branches and\nfusion branches. In addition, to aggregate both high-level semantics and\nimpressive visual effects, we further investigate the frequency response of the\nprior semantics, and propose a multi-level representation-adaptive fusion\n(MRaF) module to explicitly integrate the low-frequent prior semantic with the\nhigh-frequent details. Extensive experiments on two public datasets demonstrate\nthe superiority of our method over the state-of-the-art image fusion\napproaches, in terms of either the visual appeal or the high-level semantics.\n","authors":["Jing Li","Lu Bai","Bin Yang","Chang Li","Lingfei Ma","Lixin Cui","Edwin R. Hancock"],"pdf_url":"https://arxiv.org/pdf/2403.16227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09457v2","updated":"2024-03-24T16:37:04Z","published":"2023-10-14T00:32:11Z","title":"UCM-Net: A Lightweight and Efficient Solution for Skin Lesion\n Segmentation using MLP and CNN","summary":" Skin cancer is a significant public health problem, and computer-aided\ndiagnosis can help to prevent and treat it. A crucial step for computer-aided\ndiagnosis is accurately segmenting skin lesions in images, which allows for\nlesion detection, classification, and analysis. However, this task is\nchallenging due to the diverse characteristics of lesions, such as appearance,\nshape, size, color, texture, and location, as well as image quality issues like\nnoise, artifacts, and occlusions. Deep learning models have recently been\napplied to skin lesion segmentation, but they have high parameter counts and\ncomputational demands, making them unsuitable for mobile health applications.\nTo address this challenge, we propose UCM-Net, a novel, efficient, and\nlightweight solution that integrates Multi-Layer Perceptions (MLP) and\nConvolutional Neural Networks (CNN). Unlike conventional UNet architectures,\nour UCMNet-Block reduces parameter overhead and enhances UCM-Net's learning\ncapabilities, leading to robust segmentation performance. We validate UCM-Net's\ncompetitiveness through extensive experiments on PH2, isic2017 and isic2018\ndatasets. Remarkably, UCM-Net has less than 50KB parameters and less than 0.05\nGiga-Operations Per Second (GLOPs), setting a new possible standard for\nefficiency in skin lesion segmentation. The source code will be publicly\navailable.\n","authors":["Chunyu Yuan","Dongfang Zhao","Sos S. Agaian"],"pdf_url":"https://arxiv.org/pdf/2310.09457v2.pdf","comment":"17 pages, under review"},{"id":"http://arxiv.org/abs/2403.16224v1","updated":"2024-03-24T16:34:47Z","published":"2024-03-24T16:34:47Z","title":"Inverse Rendering of Glossy Objects via the Neural Plenoptic Function\n and Radiance Fields","summary":" Inverse rendering aims at recovering both geometry and materials of objects.\nIt provides a more compatible reconstruction for conventional rendering\nengines, compared with the neural radiance fields (NeRFs). On the other hand,\nexisting NeRF-based inverse rendering methods cannot handle glossy objects with\nlocal light interactions well, as they typically oversimplify the illumination\nas a 2D environmental map, which assumes infinite lights only. 
Observing the\nsuperiority of NeRFs in recovering radiance fields, we propose a novel 5D\nNeural Plenoptic Function (NeP) based on NeRFs and ray tracing, such that more\naccurate lighting-object interactions can be formulated via the rendering\nequation. We also design a material-aware cone sampling strategy to efficiently\nintegrate lights inside the BRDF lobes with the help of pre-filtered radiance\nfields. Our method has two stages: the geometry of the target object and the\npre-filtered environmental radiance fields are reconstructed in the first\nstage, and materials of the target object are estimated in the second stage\nwith the proposed NeP and material-aware cone sampling strategy. Extensive\nexperiments on the proposed real-world and synthetic datasets demonstrate that\nour method can reconstruct high-fidelity geometry/materials of challenging\nglossy objects with complex lighting interactions from nearby objects. Project\nwebpage: https://whyy.site/paper/nep\n","authors":["Haoyuan Wang","Wenbo Hu","Lei Zhu","Rynson W. H. Lau"],"pdf_url":"https://arxiv.org/pdf/2403.16224v1.pdf","comment":"CVPR 2024 paper. Project webpage https://whyy.site/paper/nep"},{"id":"http://arxiv.org/abs/2403.16221v1","updated":"2024-03-24T16:29:50Z","published":"2024-03-24T16:29:50Z","title":"Exemplar-Free Class Incremental Learning via Incremental Representation","summary":" Exemplar-Free Class Incremental Learning (efCIL) aims to continuously\nincorporate the knowledge from new classes while retaining previously learned\ninformation, without storing any old-class exemplars (i.e., samples). For this\npurpose, various efCIL methods have been proposed over the past few years,\ngenerally with elaborately constructed old pseudo-features, increasing the\ndifficulty of model development and interpretation. In contrast, we propose a\n\\textbf{simple Incremental Representation (IR) framework} for efCIL without\nconstructing old pseudo-features. IR utilizes dataset augmentation to cover a\nsuitable feature space and prevents the model from forgetting by using a single\nL2 space maintenance loss. We discard the transient classifier trained on each\none of the sequence tasks and instead replace it with a 1-near-neighbor\nclassifier for inference, ensuring the representation is incrementally updated\nduring CIL. Extensive experiments demonstrate that our proposed IR achieves\ncomparable performance while significantly preventing the model from forgetting\non CIFAR100, TinyImageNet, and ImageNetSubset datasets.\n","authors":["Libo Huang","Zhulin An","Yan Zeng","Chuanguang Yang","Xinqiang Yu","Yongjun Xu"],"pdf_url":"https://arxiv.org/pdf/2403.16221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16212v1","updated":"2024-03-24T16:11:27Z","published":"2024-03-24T16:11:27Z","title":"Leveraging Deep Learning and Xception Architecture for High-Accuracy MRI\n Classification in Alzheimer Diagnosis","summary":" Exploring the application of deep learning technologies in the field of\nmedical diagnostics, Magnetic Resonance Imaging (MRI) provides a unique\nperspective for observing and diagnosing complex neurodegenerative diseases\nsuch as Alzheimer Disease (AD). With advancements in deep learning,\nparticularly in Convolutional Neural Networks (CNNs) and the Xception network\narchitecture, we are now able to analyze and classify vast amounts of MRI data\nwith unprecedented accuracy. 
The progress of this technology not only enhances\nour understanding of brain structural changes but also opens up new avenues for\nmonitoring disease progression through non-invasive means and potentially\nallows for precise diagnosis in the early stages of the disease.\n This study aims to classify MRI images using deep learning models to identify\ndifferent stages of Alzheimer Disease through a series of innovative data\nprocessing and model construction steps. Our experimental results show that the\ndeep learning framework based on the Xception model achieved a 99.6% accuracy\nrate in the multi-class MRI image classification task, demonstrating its\npotential application value in assistive diagnosis. Future research will focus\non expanding the dataset, improving model interpretability, and clinical\nvalidation to further promote the application of deep learning technology in\nthe medical field, with the hope of bringing earlier diagnosis and more\npersonalized treatment plans to Alzheimer Disease patients.\n","authors":["Shaojie Li","Haichen Qu","Xinqi Dong","Bo Dang","Hengyi Zang","Yulu Gong"],"pdf_url":"https://arxiv.org/pdf/2403.16212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16210v1","updated":"2024-03-24T16:09:21Z","published":"2024-03-24T16:09:21Z","title":"Frankenstein: Generating Semantic-Compositional 3D Scenes in One\n Tri-Plane","summary":" We present Frankenstein, a diffusion-based framework that can generate\nsemantic-compositional 3D scenes in a single pass. Unlike existing methods that\noutput a single, unified 3D shape, Frankenstein simultaneously generates\nmultiple separated shapes, each corresponding to a semantically meaningful\npart. The 3D scene information is encoded in one single tri-plane tensor, from\nwhich multiple Singed Distance Function (SDF) fields can be decoded to\nrepresent the compositional shapes. During training, an auto-encoder compresses\ntri-planes into a latent space, and then the denoising diffusion process is\nemployed to approximate the distribution of the compositional scenes.\nFrankenstein demonstrates promising results in generating room interiors as\nwell as human avatars with automatically separated parts. The generated scenes\nfacilitate many downstream applications, such as part-wise re-texturing, object\nrearrangement in the room or avatar cloth re-targeting.\n","authors":["Han Yan","Yang Li","Zhennan Wu","Shenzhou Chen","Weixuan Sun","Taizhang Shang","Weizhe Liu","Tian Chen","Xiaqiang Dai","Chao Ma","Hongdong Li","Pan Ji"],"pdf_url":"https://arxiv.org/pdf/2403.16210v1.pdf","comment":"Video: https://youtu.be/lRn-HqyCrLI"},{"id":"http://arxiv.org/abs/2403.16209v1","updated":"2024-03-24T16:08:10Z","published":"2024-03-24T16:08:10Z","title":"Image Captioning in news report scenario","summary":" Image captioning strives to generate pertinent captions for specified images,\nsituating itself at the crossroads of Computer Vision (CV) and Natural Language\nProcessing (NLP). This endeavor is of paramount importance with far-reaching\napplications in recommendation systems, news outlets, social media, and beyond.\nParticularly within the realm of news reporting, captions are expected to\nencompass detailed information, such as the identities of celebrities captured\nin the images. However, much of the existing body of work primarily centers\naround understanding scenes and actions. 
In this paper, we explore the realm of\nimage captioning specifically tailored for celebrity photographs, illustrating\nits broad potential for enhancing news industry practices. This exploration\naims to augment automated news content generation, thereby facilitating a more\nnuanced dissemination of information. Our endeavor shows a broader horizon,\nenriching the narrative in news reporting through a more intuitive image\ncaptioning framework.\n","authors":["Tianrui Liu","Qi Cai","Changxin Xu","Zhanxin Zhou","Jize Xiong","Yuxin Qiao","Tsungwei Yang"],"pdf_url":"https://arxiv.org/pdf/2403.16209v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.16207v1","updated":"2024-03-24T16:03:27Z","published":"2024-03-24T16:03:27Z","title":"Skull-to-Face: Anatomy-Guided 3D Facial Reconstruction and Editing","summary":" Deducing the 3D face from a skull is an essential but challenging task in\nforensic science and archaeology. Existing methods for automated facial\nreconstruction yield inaccurate results, suffering from the non-determinative\nnature of the problem that a skull with a sparse set of tissue depth cannot\nfully determine the skinned face. Additionally, their texture-less results\nrequire further post-processing stages to achieve a photo-realistic appearance.\nThis paper proposes an end-to-end 3D face reconstruction and exploration tool,\nproviding textured 3D faces for reference. With the help of state-of-the-art\ntext-to-image diffusion models and image-based facial reconstruction\ntechniques, we generate an initial reference 3D face, whose biological profile\naligns with the given skull. We then adapt these initial faces to meet the\nstatistical expectations of extruded anatomical landmarks on the skull through\nan optimization process. The joint statistical distribution of tissue depths is\nlearned on a small set of anatomical landmarks on the skull. To support further\nadjustment, we propose an efficient face adaptation tool to assist users in\ntuning tissue depths, either globally or at local regions, while observing\nplausible visual feedback. Experiments conducted on a real skull-face dataset\ndemonstrated the effectiveness of our proposed pipeline in terms of\nreconstruction accuracy, diversity, and stability.\n","authors":["Yongqing Liang","Congyi Zhang","Junli Zhao","Wenping Wang","Xin Li"],"pdf_url":"https://arxiv.org/pdf/2403.16207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16205v1","updated":"2024-03-24T15:58:48Z","published":"2024-03-24T15:58:48Z","title":"Blur2Blur: Blur Conversion for Unsupervised Image Deblurring on Unknown\n Domains","summary":" This paper presents an innovative framework designed to train an image\ndeblurring algorithm tailored to a specific camera device. This algorithm works\nby transforming a blurry input image, which is challenging to deblur, into\nanother blurry image that is more amenable to deblurring. The transformation\nprocess, from one blurry state to another, leverages unpaired data consisting\nof sharp and blurry images captured by the target camera device. Learning this\nblur-to-blur transformation is inherently simpler than direct blur-to-sharp\nconversion, as it primarily involves modifying blur patterns rather than the\nintricate task of reconstructing fine image details. The efficacy of the\nproposed approach has been demonstrated through comprehensive experiments on\nvarious benchmarks, where it significantly outperforms state-of-the-art methods\nboth quantitatively and qualitatively. 
Our code and data are available at\nhttps://zero1778.github.io/blur2blur/\n","authors":["Bang-Dang Pham","Phong Tran","Anh Tran","Cuong Pham","Rang Nguyen","Minh Hoai"],"pdf_url":"https://arxiv.org/pdf/2403.16205v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16202v1","updated":"2024-03-24T15:51:17Z","published":"2024-03-24T15:51:17Z","title":"FH-SSTNet: Forehead Creases based User Verification using Spatio-Spatial\n Temporal Network","summary":" Biometric authentication, which utilizes contactless features, such as\nforehead patterns, has become increasingly important for identity verification\nand access management. The proposed method is based on learning a 3D\nspatio-spatial temporal convolution to create detailed pictures of forehead\npatterns. We introduce a new CNN model called the Forehead Spatio-Spatial\nTemporal Network (FH-SSTNet), which utilizes a 3D CNN architecture with triplet\nloss to capture distinguishing features. We enhance the model's discrimination\ncapability using Arcloss in the network's head. Experimentation on the Forehead\nCreases version 1 (FH-V1) dataset, containing 247 unique subjects, demonstrates\nthe superior performance of FH-SSTNet compared to existing methods and\npre-trained CNNs like ResNet50, especially for forehead-based user\nverification. The results demonstrate the superior performance of FH-SSTNet for\nforehead-based user verification, confirming its effectiveness in identity\nauthentication.\n","authors":["Geetanjali Sharma","Gaurav Jaswal","Aditya Nigam","Raghavendra Ramachandra"],"pdf_url":"https://arxiv.org/pdf/2403.16202v1.pdf","comment":"6 pages, 5 Figure, IWBF conference"},{"id":"http://arxiv.org/abs/2403.16201v1","updated":"2024-03-24T15:48:29Z","published":"2024-03-24T15:48:29Z","title":"From Discrete to Continuous: Deep Fair Clustering With Transferable\n Representations","summary":" We consider the problem of deep fair clustering, which partitions data into\nclusters via the representations extracted by deep neural networks while hiding\nsensitive data attributes. To achieve fairness, existing methods present a\nvariety of fairness-related objective functions based on the group fairness\ncriterion. However, these works typically assume that the sensitive attributes\nare discrete and do not work for continuous sensitive variables, such as the\nproportion of the female population in an area. Besides, the potential of the\nrepresentations learned from clustering tasks to improve performance on other\ntasks is ignored by existing works. In light of these limitations, we propose a\nflexible deep fair clustering method that can handle discrete and continuous\nsensitive attributes simultaneously. Specifically, we design an information\nbottleneck style objective function to learn fair and clustering-friendly\nrepresentations. Furthermore, we explore for the first time the transferability\nof the extracted representations to other downstream tasks. Unlike existing\nworks, we impose fairness at the representation level, which could guarantee\nfairness for the transferred task regardless of clustering results. 
To verify\nthe effectiveness of the proposed method, we perform extensive experiments on\ndatasets with discrete and continuous sensitive attributes, demonstrating the\nadvantage of our method in comparison with state-of-the-art methods.\n","authors":["Xiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.16201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16198v1","updated":"2024-03-24T15:39:52Z","published":"2024-03-24T15:39:52Z","title":"Diffusion Model is a Good Pose Estimator from 3D RF-Vision","summary":" Human pose estimation (HPE) from Radio Frequency vision (RF-vision) performs\nhuman sensing using RF signals that penetrate obstacles without revealing\nprivacy (e.g., facial information). Recently, mmWave radar has emerged as a\npromising RF-vision sensor, providing radar point clouds by processing RF\nsignals. However, the mmWave radar has a limited resolution with severe noise,\nleading to inaccurate and inconsistent human pose estimation. This work\nproposes mmDiff, a novel diffusion-based pose estimator tailored for noisy\nradar data. Our approach aims to provide reliable guidance as conditions to\ndiffusion models. Two key challenges are addressed by mmDiff: (1)\nmiss-detection of parts of human bodies, which is addressed by a module that\nisolates feature extraction from different body parts, and (2) signal\ninconsistency due to environmental interference, which is tackled by\nincorporating prior knowledge of body structure and motion. Several modules are\ndesigned to achieve these goals, whose features work as the conditions for the\nsubsequent diffusion model, eliminating the miss-detection and instability of\nHPE based on RF-vision. Extensive experiments demonstrate that mmDiff\noutperforms existing methods significantly, achieving state-of-the-art\nperformances on public datasets.\n","authors":["Junqiao Fan","Jianfei Yang","Yuecong Xu","Lihua Xie"],"pdf_url":"https://arxiv.org/pdf/2403.16198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13356v2","updated":"2024-03-24T15:30:46Z","published":"2023-08-25T13:05:06Z","title":"CEIMVEN: An Approach of Cutting Edge Implementation of Modified Versions\n of EfficientNet (V1-V2) Architecture for Breast Cancer Detection and\n Classification from Ultrasound Images","summary":" Undoubtedly breast cancer identifies itself as one of the most widespread and\nterrifying cancers across the globe. Millions of women are getting affected\neach year from it. Breast cancer remains the major one for being the reason of\nlargest number of demise of women. In the recent time of research, Medical\nImage Computing and Processing has been playing a significant role for\ndetecting and classifying breast cancers from ultrasound images and mammograms,\nalong with the celestial touch of deep neural networks. In this research, we\nfocused mostly on our rigorous implementations and iterative result analysis of\ndifferent cutting-edge modified versions of EfficientNet architectures namely\nEfficientNet-V1 (b0-b7) and EfficientNet-V2 (b0-b3) with ultrasound image,\nnamed as CEIMVEN. We utilized transfer learning approach here for using the\npre-trained models of EfficientNet versions. We activated the hyper-parameter\ntuning procedures, added fully connected layers, discarded the unprecedented\noutliers and recorded the accuracy results from our custom modified\nEfficientNet architectures. 
Our deep learning model training approach was\nrelated to both identifying the cancer affected areas with region of interest\n(ROI) techniques and multiple classifications (benign, malignant and normal).\nThe approximate testing accuracies we got from the modified versions of\nEfficientNet-V1 (b0- 99.15%, b1- 98.58%, b2- 98.43%, b3- 98.01%, b4- 98.86%,\nb5- 97.72%, b6- 97.72%, b7- 98.72%) and EfficientNet-V2 (b0- 99.29%, b1-\n99.01%, b2- 98.72%, b3- 99.43%) are showing very bright future and strong\npotentials of deep learning approach for the successful detection and\nclassification of breast cancers from the ultrasound images at a very early\nstage. The code for this research is available here:\nhttps://github.com/ac005sheekar/CEIMVEN-Cutting-Edge-Implementation-of-Modified-EfficientNet-V1-V2-for-BreastCancer-Detection.\n","authors":["Sheekar Banerjee","Md. Kamrul Hasan Monir"],"pdf_url":"https://arxiv.org/pdf/2308.13356v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17417v2","updated":"2024-03-24T15:26:11Z","published":"2024-02-27T11:17:46Z","title":"CARZero: Cross-Attention Alignment for Radiology Zero-Shot\n Classification","summary":" The advancement of Zero-Shot Learning in the medical domain has been driven\nforward by using pre-trained models on large-scale image-text pairs, focusing\non image-text alignment. However, existing methods primarily rely on cosine\nsimilarity for alignment, which may not fully capture the complex relationship\nbetween medical images and reports. To address this gap, we introduce a novel\napproach called Cross-Attention Alignment for Radiology Zero-Shot\nClassification (CARZero). Our approach innovatively leverages cross-attention\nmechanisms to process image and report features, creating a Similarity\nRepresentation that more accurately reflects the intricate relationships in\nmedical semantics. This representation is then linearly projected to form an\nimage-text similarity matrix for cross-modality alignment. Additionally,\nrecognizing the pivotal role of prompt selection in zero-shot learning, CARZero\nincorporates a Large Language Model-based prompt alignment strategy. This\nstrategy standardizes diverse diagnostic expressions into a unified format for\nboth training and inference phases, overcoming the challenges of manual prompt\ndesign. Our approach is simple yet effective, demonstrating state-of-the-art\nperformance in zero-shot classification on five official chest radiograph\ndiagnostic test sets, including remarkable results on datasets with long-tail\ndistributions of rare diseases. This achievement is attributed to our new\nimage-text alignment strategy, which effectively addresses the complex\nrelationship between medical images and reports. Code and models are available\nat https://github.com/laihaoran/CARZero.\n","authors":["Haoran Lai","Qingsong Yao","Zihang Jiang","Rongsheng Wang","Zhiyang He","Xiaodong Tao","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.17417v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16194v1","updated":"2024-03-24T15:24:04Z","published":"2024-03-24T15:24:04Z","title":"Pose-Guided Self-Training with Two-Stage Clustering for Unsupervised\n Landmark Discovery","summary":" Unsupervised landmarks discovery (ULD) for an object category is a\nchallenging computer vision problem. In pursuit of developing a robust ULD\nframework, we explore the potential of a recent paradigm of self-supervised\nlearning algorithms, known as diffusion models. 
Some recent works have shown\nthat these models implicitly contain important correspondence cues. Towards\nharnessing the potential of diffusion models for the ULD task, we make the\nfollowing core contributions. First, we propose a ZeroShot ULD baseline based\non simple clustering of random pixel locations with nearest neighbour matching.\nIt delivers better results than existing ULD methods. Second, motivated by the\nZeroShot performance, we develop a ULD algorithm based on diffusion features\nusing self-training and clustering which also outperforms prior methods by\nnotable margins. Third, we introduce a new proxy task based on generating\nlatent pose codes and also propose a two-stage clustering mechanism to\nfacilitate effective pseudo-labeling, resulting in a significant performance\nimprovement. Overall, our approach consistently outperforms state-of-the-art\nmethods on four challenging benchmarks AFLW, MAFL, CatHeads and LS3D by\nsignificant margins.\n","authors":["Siddharth Tourani","Ahmed Alwheibi","Arif Mahmood","Muhammad Haris Khan"],"pdf_url":"https://arxiv.org/pdf/2403.16194v1.pdf","comment":"Accepted in CVPR 2024"},{"id":"http://arxiv.org/abs/2112.06502v2","updated":"2024-03-24T15:12:20Z","published":"2021-12-13T09:24:45Z","title":"DGL-GAN: Discriminator Guided Learning for GAN Compression","summary":" Generative Adversarial Networks (GANs) with high computation costs, e.g.,\nBigGAN and StyleGAN2, have achieved remarkable results in synthesizing\nhigh-resolution images from random noise. Reducing the computation cost of GANs\nwhile keeping generating photo-realistic images is a challenging field. In this\nwork, we propose a novel yet simple {\\bf D}iscriminator {\\bf G}uided {\\bf\nL}earning approach for compressing vanilla {\\bf GAN}, dubbed {\\bf DGL-GAN}.\nMotivated by the phenomenon that the teacher discriminator may contain some\nmeaningful information about both real images and fake images, we merely\ntransfer the knowledge from the teacher discriminator via the adversarial\ninteraction between the teacher discriminator and the student generator. We\napply DGL-GAN to compress the two most representative large-scale vanilla GANs,\ni.e., StyleGAN2 and BigGAN. Experiments show that DGL-GAN achieves\nstate-of-the-art (SOTA) results on both StyleGAN2 and BigGAN. Moreover, DGL-GAN\nis also effective in boosting the performance of original uncompressed GANs.\nOriginal uncompressed StyleGAN2 boosted with DGL-GAN achieves FID 2.65 on FFHQ,\nwhich achieves a new state-of-the-art performance. Code and models are\navailable at \\url{https://github.com/yuesongtian/DGL-GAN}\n","authors":["Yuesong Tian","Li Shen","Xiang Tian","Dacheng Tao","Zhifeng Li","Wei Liu","Yaowu Chen"],"pdf_url":"https://arxiv.org/pdf/2112.06502v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14729v3","updated":"2024-03-24T15:11:38Z","published":"2023-10-23T09:05:18Z","title":"MAS: Multi-view Ancestral Sampling for 3D motion generation using 2D\n diffusion","summary":" We introduce Multi-view Ancestral Sampling (MAS), a method for 3D motion\ngeneration, using 2D diffusion models that were trained on motions obtained\nfrom in-the-wild videos. As such, MAS opens opportunities to exciting and\ndiverse fields of motion previously under-explored as 3D data is scarce and\nhard to collect. MAS works by simultaneously denoising multiple 2D motion\nsequences representing different views of the same 3D motion. 
It ensures\nconsistency across all views at each diffusion step by combining the individual\ngenerations into a unified 3D sequence, and projecting it back to the original\nviews. We demonstrate MAS on 2D pose data acquired from videos depicting\nprofessional basketball maneuvers, rhythmic gymnastic performances featuring a\nball apparatus, and horse races. In each of these domains, 3D motion capture is\narduous, and yet, MAS generates diverse and realistic 3D sequences. Unlike the\nScore Distillation approach, which optimizes each sample by repeatedly applying\nsmall fixes, our method uses a sampling process that was constructed for the\ndiffusion framework. As we demonstrate, MAS avoids common issues such as\nout-of-domain sampling and mode-collapse. https://guytevet.github.io/mas-page/\n","authors":["Roy Kapon","Guy Tevet","Daniel Cohen-Or","Amit H. Bermano"],"pdf_url":"https://arxiv.org/pdf/2310.14729v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16188v1","updated":"2024-03-24T15:10:22Z","published":"2024-03-24T15:10:22Z","title":"Cross-domain Multi-modal Few-shot Object Detection via Rich Text","summary":" Cross-modal feature extraction and integration have led to steady performance\nimprovements in few-shot learning tasks due to generating richer features.\nHowever, existing multi-modal object detection (MM-OD) methods degrade when\nfacing significant domain-shift and are sample insufficient. We hypothesize\nthat rich text information could more effectively help the model to build a\nknowledge relationship between the vision instance and its language description\nand can help mitigate domain shift. Specifically, we study the Cross-Domain\nfew-shot generalization of MM-OD (CDMM-FSOD) and propose a meta-learning based\nmulti-modal few-shot object detection method that utilizes rich text semantic\ninformation as an auxiliary modality to achieve domain adaptation in the\ncontext of FSOD. Our proposed network contains (i) a multi-modal feature\naggregation module that aligns the vision and language support feature\nembeddings and (ii) a rich text semantic rectify module that utilizes\nbidirectional text feature generation to reinforce multi-modal feature\nalignment and thus to enhance the model's language understanding capability. We\nevaluate our model on common standard cross-domain object detection datasets\nand demonstrate that our approach considerably outperforms existing FSOD\nmethods.\n","authors":["Zeyu Shangguan","Daniel Seita","Mohammad Rostami"],"pdf_url":"https://arxiv.org/pdf/2403.16188v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16184v1","updated":"2024-03-24T15:02:24Z","published":"2024-03-24T15:02:24Z","title":"Improving Scene Graph Generation with Relation Words' Debiasing in\n Vision-Language Models","summary":" Scene Graph Generation (SGG) provides basic language representation of visual\nscenes, requiring models to grasp complex and diverse semantics between various\nobjects. However, this complexity and diversity in SGG also leads to\nunderrepresentation, where part of test triplets are rare or even unseen during\ntraining, resulting in imprecise predictions. To tackle this, we propose using\nthe SGG models with pretrained vision-language models (VLMs) to enhance\nrepresentation. However, due to the gap between the pretraining and SGG,\ndirectly ensembling the pretrained VLMs leads to severe biases across relation\nwords. 
Thus, we introduce LM Estimation to approximate the words' distribution\nunderlies in the pretraining language sets, and then use the distribution for\ndebiasing. After that, we ensemble VLMs with SGG models to enhance\nrepresentation. Considering that each model may represent better at different\nsamples, we use a certainty-aware indicator to score each sample and\ndynamically adjust the ensemble weights. Our method effectively addresses the\nwords biases, enhances SGG's representation, and achieve markable performance\nenhancements. It is training-free and integrates well with existing SGG models.\n","authors":["Yuxuan Wang","Xiaoyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2403.16184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16182v1","updated":"2024-03-24T15:00:44Z","published":"2024-03-24T15:00:44Z","title":"EgoExoLearn: A Dataset for Bridging Asynchronous Ego- and Exo-centric\n View of Procedural Activities in Real World","summary":" Being able to map the activities of others into one's own point of view is\none fundamental human skill even from a very early age. Taking a step toward\nunderstanding this human ability, we introduce EgoExoLearn, a large-scale\ndataset that emulates the human demonstration following process, in which\nindividuals record egocentric videos as they execute tasks guided by\ndemonstration videos. Focusing on the potential applications in daily\nassistance and professional support, EgoExoLearn contains egocentric and\ndemonstration video data spanning 120 hours captured in daily life scenarios\nand specialized laboratories. Along with the videos we record high-quality gaze\ndata and provide detailed multimodal annotations, formulating a playground for\nmodeling the human ability to bridge asynchronous procedural actions from\ndifferent viewpoints. To this end, we present benchmarks such as cross-view\nassociation, cross-view action planning, and cross-view referenced skill\nassessment, along with detailed analysis. We expect EgoExoLearn can serve as an\nimportant resource for bridging the actions across views, thus paving the way\nfor creating AI agents capable of seamlessly learning by observing humans in\nthe real world. Code and data can be found at:\nhttps://github.com/OpenGVLab/EgoExoLearn\n","authors":["Yifei Huang","Guo Chen","Jilan Xu","Mingfang Zhang","Lijin Yang","Baoqi Pei","Hongjie Zhang","Lu Dong","Yali Wang","Limin Wang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2403.16182v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.01696v2","updated":"2024-03-24T14:50:42Z","published":"2023-12-04T07:35:02Z","title":"BEVNeXt: Reviving Dense BEV Frameworks for 3D Object Detection","summary":" Recently, the rise of query-based Transformer decoders is reshaping\ncamera-based 3D object detection. These query-based decoders are surpassing the\ntraditional dense BEV (Bird's Eye View)-based methods. However, we argue that\ndense BEV frameworks remain important due to their outstanding abilities in\ndepth estimation and object localization, depicting 3D scenes accurately and\ncomprehensively. This paper aims to address the drawbacks of the existing dense\nBEV-based 3D object detectors by introducing our proposed enhanced components,\nincluding a CRF-modulated depth estimation module enforcing object-level\nconsistencies, a long-term temporal aggregation module with extended receptive\nfields, and a two-stage object decoder combining perspective techniques with\nCRF-modulated depth embedding. 
These enhancements lead to a \"modernized\" dense\nBEV framework dubbed BEVNeXt. On the nuScenes benchmark, BEVNeXt outperforms\nboth BEV-based and query-based frameworks under various settings, achieving a\nstate-of-the-art result of 64.2 NDS on the nuScenes test set. Code will be\navailable at \\url{https://github.com/woxihuanjiangguo/BEVNeXt}.\n","authors":["Zhenxin Li","Shiyi Lan","Jose M. Alvarez","Zuxuan Wu"],"pdf_url":"https://arxiv.org/pdf/2312.01696v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16175v1","updated":"2024-03-24T14:35:06Z","published":"2024-03-24T14:35:06Z","title":"Enhancing MRI-Based Classification of Alzheimer's Disease with\n Explainable 3D Hybrid Compact Convolutional Transformers","summary":" Alzheimer's disease (AD), characterized by progressive cognitive decline and\nmemory loss, presents a formidable global health challenge, underscoring the\ncritical importance of early and precise diagnosis for timely interventions and\nenhanced patient outcomes. While MRI scans provide valuable insights into brain\nstructures, traditional analysis methods often struggle to discern intricate 3D\npatterns crucial for AD identification. Addressing this challenge, we introduce\nan alternative end-to-end deep learning model, the 3D Hybrid Compact\nConvolutional Transformers 3D (HCCT). By synergistically combining\nconvolutional neural networks (CNNs) and vision transformers (ViTs), the 3D\nHCCT adeptly captures both local features and long-range relationships within\n3D MRI scans. Extensive evaluations on prominent AD benchmark dataset, ADNI,\ndemonstrate the 3D HCCT's superior performance, surpassing state of the art CNN\nand transformer-based methods in classification accuracy. Its robust\ngeneralization capability and interpretability marks a significant stride in AD\nclassification from 3D MRI scans, promising more accurate and reliable\ndiagnoses for improved patient care and superior clinical outcomes.\n","authors":["Arindam Majee","Avisek Gupta","Sourav Raha","Swagatam Das"],"pdf_url":"https://arxiv.org/pdf/2403.16175v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.16172v1","updated":"2024-03-24T14:29:41Z","published":"2024-03-24T14:29:41Z","title":"Fusion of Minutia Cylinder Codes and Minutia Patch Embeddings for Latent\n Fingerprint Recognition","summary":" Latent fingerprints are one of the most widely used forensic evidence by law\nenforcement agencies. However, latent recognition performance is far from the\nexemplary performance of sensor fingerprint recognition due to deformations and\nartifacts within these images. In this study, we propose a fusion based local\nmatching approach towards latent fingerprint recognition. Recent latent\nrecognition studies typically relied on local descriptor generation methods, in\nwhich either handcrafted minutiae features or deep neural network features are\nextracted around a minutia of interest, in the latent recognition process.\nProposed approach would integrate these handcrafted features with a recently\nproposed deep neural network embedding features in a multi-stage fusion\napproach to significantly improve latent recognition results. 
Effectiveness of\nthe proposed approach has been shown on several public and private data sets.\nAs demonstrated in our experimental results, proposed method improves rank-1\nidentification accuracy by considerably for real-world datasets when compared\nto either the single usage of these features or existing state-of-the-art\nmethods in the literature.\n","authors":["Yusuf Artan","Bensu Alkan Semiz"],"pdf_url":"https://arxiv.org/pdf/2403.16172v1.pdf","comment":"9 pages,7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2403.16169v1","updated":"2024-03-24T14:24:13Z","published":"2024-03-24T14:24:13Z","title":"Gaze-guided Hand-Object Interaction Synthesis: Benchmark and Method","summary":" Gaze plays a crucial role in revealing human attention and intention,\nshedding light on the cognitive processes behind human actions. The integration\nof gaze guidance with the dynamics of hand-object interactions boosts the\naccuracy of human motion prediction. However, the lack of datasets that capture\nthe intricate relationship and consistency among gaze, hand, and object\nmovements remains a substantial hurdle. In this paper, we introduce the first\nGaze-guided Hand-Object Interaction dataset, GazeHOI, and present a novel task\nfor synthesizing gaze-guided hand-object interactions. Our dataset, GazeHOI,\nfeatures simultaneous 3D modeling of gaze, hand, and object interactions,\ncomprising 479 sequences with an average duration of 19.1 seconds, 812\nsub-sequences, and 33 objects of various sizes. We propose a hierarchical\nframework centered on a gaze-guided hand-object interaction diffusion model,\nnamed GHO-Diffusion. In the pre-diffusion phase, we separate gaze conditions\ninto spatial-temporal features and goal pose conditions at different levels of\ninformation granularity. During the diffusion phase, two gaze-conditioned\ndiffusion models are stacked to simplify the complex synthesis of hand-object\nmotions. Here, the object motion diffusion model generates sequences of object\nmotions based on gaze conditions, while the hand motion diffusion model\nproduces hand motions based on the generated object motion. To improve\nfine-grained goal pose alignment, we introduce a Spherical Gaussian constraint\nto guide the denoising step. In the subsequent post-diffusion phase, we\noptimize the generated hand motions using contact consistency. Our extensive\nexperiments highlight the uniqueness of our dataset and the effectiveness of\nour approach.\n","authors":["Jie Tian","Lingxiao Yang","Ran Ji","Yuexin Ma","Lan Xu","Jingyi Yu","Ye Shi","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08924v2","updated":"2024-03-24T14:23:59Z","published":"2023-12-14T13:31:01Z","title":"Training-free Zero-shot Composed Image Retrieval with Local Concept\n Reranking","summary":" Composed image retrieval attempts to retrieve an image of interest from\ngallery images through a composed query of a reference image and its\ncorresponding modified text. It has recently attracted attention due to the\ncollaboration of information-rich images and concise language to precisely\nexpress the requirements of target images. Most current composed image\nretrieval methods follow a supervised learning approach to training on a costly\ntriplet dataset composed of a reference image, modified text, and a\ncorresponding target image. 
To avoid difficult to-obtain labeled triplet\ntraining data, zero-shot composed image retrieval (ZS-CIR) has been introduced,\nwhich aims to retrieve the target image by learning from image-text pairs\n(self-supervised triplets), without the need for human-labeled triplets.\nHowever, this self-supervised triplet learning approach is computationally less\neffective and less understandable as it assumes the interaction between image\nand text is conducted with implicit query embedding without explicit semantical\ninterpretation. In this work, we present a new training-free zero-shot composed\nimage retrieval method which translates the query into explicit\nhuman-understandable text. This helps improve model learning efficiency to\nenhance the generalization capacity of foundation models. Further, we introduce\na Local Concept Re-ranking (LCR) mechanism to focus on discriminative local\ninformation extracted from the modified instructions. Extensive experiments on\nfour ZS-CIR benchmarks show that our method achieves comparable performances to\nthat of the state of-the-art triplet training based methods, but significantly\noutperforms other training-free methods on the open domain datasets (CIRR,\nCIRCO and COCO), as well as the fashion domain dataset (FashionIQ).\n","authors":["Shitong Sun","Fanghua Ye","Shaogang Gong"],"pdf_url":"https://arxiv.org/pdf/2312.08924v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2403.16167v1","updated":"2024-03-24T14:21:06Z","published":"2024-03-24T14:21:06Z","title":"Exploiting Semantic Reconstruction to Mitigate Hallucinations in\n Vision-Language Models","summary":" Hallucinations in vision-language models pose a significant challenge to\ntheir reliability, particularly in the generation of long captions. Current\nmethods fall short of accurately identifying and mitigating these\nhallucinations. To address this issue, we introduce ESREAL, a novel\nunsupervised learning framework designed to suppress the generation of\nhallucinations through accurate localization and penalization of hallucinated\ntokens. Initially, ESREAL creates a reconstructed image based on the generated\ncaption and aligns its corresponding regions with those of the original image.\nThis semantic reconstruction aids in identifying both the presence and type of\ntoken-level hallucinations within the generated caption. Subsequently, ESREAL\ncomputes token-level hallucination scores by assessing the semantic similarity\nof aligned regions based on the type of hallucination. Finally, ESREAL employs\na proximal policy optimization algorithm, where it selectively penalizes\nhallucinated tokens according to their token-level hallucination scores. Our\nframework notably reduces hallucinations in LLaVA, InstructBLIP, and mPLUG-Owl2\nby 32.81%, 27.08%, and 7.46% on the CHAIR metric. 
This improvement is achieved\nsolely through signals derived from the image itself, without the need for any\nimage-text pairs.\n","authors":["Minchan Kim","Minyeong Kim","Junik Bae","Suhwan Choi","Sungkyung Kim","Buru Chang"],"pdf_url":"https://arxiv.org/pdf/2403.16167v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16161v1","updated":"2024-03-24T14:02:25Z","published":"2024-03-24T14:02:25Z","title":"Towards Online Real-Time Memory-based Video Inpainting Transformers","summary":" Video inpainting tasks have seen significant improvements in recent years\nwith the rise of deep neural networks and, in particular, vision transformers.\nAlthough these models show promising reconstruction quality and temporal\nconsistency, they are still unsuitable for live videos, one of the last steps\nto make them completely convincing and usable. The main limitations are that\nthese state-of-the-art models inpaint using the whole video (offline\nprocessing) and show an insufficient frame rate. In our approach, we propose a\nframework to adapt existing inpainting transformers to these constraints by\nmemorizing and refining redundant computations while maintaining a decent\ninpainting quality. Using this framework with some of the most recent\ninpainting models, we show great online results with a consistent throughput\nabove 20 frames per second. The code and pretrained models will be made\navailable upon acceptance.\n","authors":["Guillaume Thiry","Hao Tang","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2403.16161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04784v2","updated":"2024-03-24T13:56:31Z","published":"2023-12-08T01:53:06Z","title":"Reality's Canvas, Language's Brush: Crafting 3D Avatars from Monocular\n Video","summary":" Recent advancements in 3D avatar generation excel with multi-view supervision\nfor photorealistic models. However, monocular counterparts lag in quality\ndespite broader applicability. We propose ReCaLaB to close this gap. ReCaLaB is\na fully-differentiable pipeline that learns high-fidelity 3D human avatars from\njust a single RGB video. A pose-conditioned deformable NeRF is optimized to\nvolumetrically represent a human subject in canonical T-pose. The canonical\nrepresentation is then leveraged to efficiently associate neural textures using\n2D-3D correspondences. This enables the separation of diffused color generation\nand lighting correction branches that jointly compose an RGB prediction. The\ndesign allows to control intermediate results for human pose, body shape,\ntexture, and lighting with text prompts. An image-conditioned diffusion model\nthereby helps to animate appearance and pose of the 3D avatar to create video\nsequences with previously unseen human motion. Extensive experiments show that\nReCaLaB outperforms previous monocular approaches in terms of image quality for\nimage synthesis tasks. Moreover, natural language offers an intuitive user\ninterface for creative manipulation of 3D human avatars.\n","authors":["Yuchen Rao","Eduardo Perez Pellitero","Benjamin Busam","Yiren Zhou","Jifei Song"],"pdf_url":"https://arxiv.org/pdf/2312.04784v2.pdf","comment":"Video link: https://youtu.be/Oz83z1es2J4"},{"id":"http://arxiv.org/abs/2403.13352v2","updated":"2024-03-24T13:45:42Z","published":"2024-03-20T07:31:07Z","title":"AGFSync: Leveraging AI-Generated Feedback for Preference Optimization in\n Text-to-Image Generation","summary":" Text-to-Image (T2I) diffusion models have achieved remarkable success in\nimage generation. 
Despite their progress, challenges remain in both\nprompt-following ability, image quality and lack of high-quality datasets,\nwhich are essential for refining these models. As acquiring labeled data is\ncostly, we introduce AGFSync, a framework that enhances T2I diffusion models\nthrough Direct Preference Optimization (DPO) in a fully AI-driven approach.\nAGFSync utilizes Vision-Language Models (VLM) to assess image quality across\nstyle, coherence, and aesthetics, generating feedback data within an AI-driven\nloop. By applying AGFSync to leading T2I models such as SD v1.4, v1.5, and\nSDXL, our extensive experiments on the TIFA dataset demonstrate notable\nimprovements in VQA scores, aesthetic evaluations, and performance on the HPSv2\nbenchmark, consistently outperforming the base models. AGFSync's method of\nrefining T2I diffusion models paves the way for scalable alignment techniques.\n","authors":["Jingkun An","Yinghao Zhu","Zongjian Li","Haoran Feng","Bohua Chen","Yemin Shi","Chengwei Pan"],"pdf_url":"https://arxiv.org/pdf/2403.13352v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16146v1","updated":"2024-03-24T13:36:23Z","published":"2024-03-24T13:36:23Z","title":"Realtime Robust Shape Estimation of Deformable Linear Object","summary":" Realtime shape estimation of continuum objects and manipulators is essential\nfor developing accurate planning and control paradigms. The existing methods\nthat create dense point clouds from camera images, and/or use distinguishable\nmarkers on a deformable body have limitations in realtime tracking of large\ncontinuum objects/manipulators. The physical occlusion of markers can often\ncompromise accurate shape estimation. We propose a robust method to estimate\nthe shape of linear deformable objects in realtime using scattered and\nunordered key points. By utilizing a robust probability-based labeling\nalgorithm, our approach identifies the true order of the detected key points\nand then reconstructs the shape using piecewise spline interpolation. The\napproach only relies on knowing the number of the key points and the interval\nbetween two neighboring points. We demonstrate the robustness of the method\nwhen key points are partially occluded. The proposed method is also integrated\ninto a simulation in Unity for tracking the shape of a cable with a length of\n1m and a radius of 5mm. The simulation results show that our proposed approach\nachieves an average length error of 1.07% over the continuum's centerline and\nan average cross-section error of 2.11mm. The real-world experiments of\ntracking and estimating a heavy-load cable prove that the proposed approach is\nrobust under occlusion and complex entanglement scenarios.\n","authors":["Jiaming Zhang","Zhaomeng Zhang","Yihao Liu","Yaqian Chen","Amir Kheradmand","Mehran Armand"],"pdf_url":"https://arxiv.org/pdf/2403.16146v1.pdf","comment":"This paper has been accepted to IEEE ICRA 2024 as a contributed paper"},{"id":"http://arxiv.org/abs/2403.16143v1","updated":"2024-03-24T13:31:31Z","published":"2024-03-24T13:31:31Z","title":"CFAT: Unleashing TriangularWindows for Image Super-resolution","summary":" Transformer-based models have revolutionized the field of image\nsuper-resolution (SR) by harnessing their inherent ability to capture complex\ncontextual features. The overlapping rectangular shifted window technique used\nin transformer architecture nowadays is a common practice in super-resolution\nmodels to improve the quality and robustness of image upscaling. 
However, it\nsuffers from distortion at the boundaries and has limited unique shifting\nmodes. To overcome these weaknesses, we propose a non-overlapping triangular\nwindow technique that synchronously works with the rectangular one to mitigate\nboundary-level distortion and allows the model to access more unique sifting\nmodes. In this paper, we propose a Composite Fusion Attention Transformer\n(CFAT) that incorporates triangular-rectangular window-based local attention\nwith a channel-based global attention technique in image super-resolution. As a\nresult, CFAT enables attention mechanisms to be activated on more image pixels\nand captures long-range, multi-scale features to improve SR performance. The\nextensive experimental results and ablation study demonstrate the effectiveness\nof CFAT in the SR domain. Our proposed model shows a significant 0.7 dB\nperformance improvement over other state-of-the-art SR architectures.\n","authors":["Abhisek Ray","Gaurav Kumar","Maheshkumar H. Kolekar"],"pdf_url":"https://arxiv.org/pdf/2403.16143v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16141v1","updated":"2024-03-24T13:27:49Z","published":"2024-03-24T13:27:49Z","title":"Entity-NeRF: Detecting and Removing Moving Entities in Urban Scenes","summary":" Recent advancements in the study of Neural Radiance Fields (NeRF) for dynamic\nscenes often involve explicit modeling of scene dynamics. However, this\napproach faces challenges in modeling scene dynamics in urban environments,\nwhere moving objects of various categories and scales are present. In such\nsettings, it becomes crucial to effectively eliminate moving objects to\naccurately reconstruct static backgrounds. Our research introduces an\ninnovative method, termed here as Entity-NeRF, which combines the strengths of\nknowledge-based and statistical strategies. This approach utilizes entity-wise\nstatistics, leveraging entity segmentation and stationary entity classification\nthrough thing/stuff segmentation. To assess our methodology, we created an\nurban scene dataset masked with moving objects. Our comprehensive experiments\ndemonstrate that Entity-NeRF notably outperforms existing techniques in\nremoving moving objects and reconstructing static urban backgrounds, both\nquantitatively and qualitatively.\n","authors":["Takashi Otonari","Satoshi Ikehata","Kiyoharu Aizawa"],"pdf_url":"https://arxiv.org/pdf/2403.16141v1.pdf","comment":"Accepted by IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR 2024), Project website:\n https://otonari726.github.io/entitynerf/"},{"id":"http://arxiv.org/abs/2403.16131v1","updated":"2024-03-24T13:01:57Z","published":"2024-03-24T13:01:57Z","title":"Salience DETR: Enhancing Detection Transformer with Hierarchical\n Salience Filtering Refinement","summary":" DETR-like methods have significantly increased detection performance in an\nend-to-end manner. The mainstream two-stage frameworks of them perform dense\nself-attention and select a fraction of queries for sparse cross-attention,\nwhich is proven effective for improving performance but also introduces a heavy\ncomputational burden and high dependence on stable query selection. This paper\ndemonstrates that suboptimal two-stage selection strategies result in scale\nbias and redundancy due to the mismatch between selected queries and objects in\ntwo-stage initialization. 
To address these issues, we propose hierarchical\nsalience filtering refinement, which performs transformer encoding only on\nfiltered discriminative queries, for a better trade-off between computational\nefficiency and precision. The filtering process overcomes scale bias through a\nnovel scale-independent salience supervision. To compensate for the semantic\nmisalignment among queries, we introduce elaborate query refinement modules for\nstable two-stage initialization. Based on above improvements, the proposed\nSalience DETR achieves significant improvements of +4.0% AP, +0.2% AP, +4.4% AP\non three challenging task-specific detection datasets, as well as 49.2% AP on\nCOCO 2017 with less FLOPs. The code is available at\nhttps://github.com/xiuqhou/Salience-DETR.\n","authors":["Xiuquan Hou","Meiqin Liu","Senlin Zhang","Ping Wei","Badong Chen"],"pdf_url":"https://arxiv.org/pdf/2403.16131v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.09334v2","updated":"2024-03-24T13:00:54Z","published":"2024-03-14T12:22:54Z","title":"Video Editing via Factorized Diffusion Distillation","summary":" We introduce Emu Video Edit (EVE), a model that establishes a new\nstate-of-the art in video editing without relying on any supervised video\nediting data. To develop EVE we separately train an image editing adapter and a\nvideo generation adapter, and attach both to the same text-to-image model.\nThen, to align the adapters towards video editing we introduce a new\nunsupervised distillation procedure, Factorized Diffusion Distillation. This\nprocedure distills knowledge from one or more teachers simultaneously, without\nany supervised data. We utilize this procedure to teach EVE to edit videos by\njointly distilling knowledge to (i) precisely edit each individual frame from\nthe image editing adapter, and (ii) ensure temporal consistency among the\nedited frames using the video generation adapter. Finally, to demonstrate the\npotential of our approach in unlocking other capabilities, we align additional\ncombinations of adapters\n","authors":["Uriel Singer","Amit Zohar","Yuval Kirstain","Shelly Sheynin","Adam Polyak","Devi Parikh","Yaniv Taigman"],"pdf_url":"https://arxiv.org/pdf/2403.09334v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16128v1","updated":"2024-03-24T12:55:50Z","published":"2024-03-24T12:55:50Z","title":"Enhancing Video Transformers for Action Understanding with VLM-aided\n Training","summary":" Owing to their ability to extract relevant spatio-temporal video embeddings,\nVision Transformers (ViTs) are currently the best performing models in video\naction understanding. However, their generalization over domains or datasets is\nsomewhat limited. In contrast, Visual Language Models (VLMs) have demonstrated\nexceptional generalization performance, but are currently unable to process\nvideos. Consequently, they cannot extract spatio-temporal patterns that are\ncrucial for action understanding. In this paper, we propose the Four-tiered\nPrompts (FTP) framework that takes advantage of the complementary strengths of\nViTs and VLMs. We retain ViTs' strong spatio-temporal representation ability\nbut improve the visual encodings to be more comprehensive and general by\naligning them with VLM outputs. The FTP framework adds four feature processors\nthat focus on specific aspects of human action in videos: action category,\naction components, action description, and context information. 
The VLMs are\nonly employed during training, and inference incurs a minimal computation cost.\nOur approach consistently yields state-of-the-art performance. For instance, we\nachieve remarkable top-1 accuracy of 93.8% on Kinetics-400 and 83.4% on\nSomething-Something V2, surpassing VideoMAEv2 by 2.8% and 2.6%, respectively.\n","authors":["Hui Lu","Hu Jian","Ronald Poppe","Albert Ali Salah"],"pdf_url":"https://arxiv.org/pdf/2403.16128v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01099v2","updated":"2024-03-24T12:55:31Z","published":"2023-10-02T11:17:19Z","title":"HyMNet: a Multimodal Deep Learning System for Hypertension\n Classification using Fundus Photographs and Cardiometabolic Risk Factors","summary":" In recent years, deep learning has shown promise in predicting hypertension\n(HTN) from fundus images. However, most prior research has primarily focused on\nanalyzing a single type of data, which may not capture the full complexity of\nHTN risk. To address this limitation, this study introduces a multimodal deep\nlearning (MMDL) system, dubbed HyMNet, which combines fundus images and\ncardiometabolic risk factors, specifically age and gender, to improve\nhypertension detection capabilities. Our MMDL system uses RETFound, a\nfoundation model pre-trained on 1.6 million retinal images, for the fundus path\nand a fully connected neural network for the age and gender path. The two paths\nare jointly trained by concatenating the feature vectors from each path that\nare then fed into a fusion network. The system was trained on 5,016 retinal\nimages from 1,243 individuals collected from the Saudi Ministry of National\nGuard Health Affairs. The results show that the multimodal model that\nintegrates fundus images along with age and gender outperforms the unimodal\nsystem trained solely on fundus photographs, with an F1 score of 0.771 [0.747,\n0.796], and 0.745 [0.719, 0.772] for hypertension detection, respectively.\nAdditionally, we studied the effect underlying diabetes mellitus has on the\nmodel's predictive ability, concluding that diabetes is used as a confounding\nvariable for distinguishing hypertensive cases. Our code and model weights are\npublicly available at https://github.com/MohammedSB/HyMNet.\n","authors":["Mohammed Baharoon","Hessa Almatar","Reema Alduhayan","Tariq Aldebasi","Badr Alahmadi","Yahya Bokhari","Mohammed Alawad","Ahmed Almazroa","Abdulrhman Aljouie"],"pdf_url":"https://arxiv.org/pdf/2310.01099v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12466v2","updated":"2024-03-24T12:42:25Z","published":"2024-03-19T05:50:48Z","title":"Few-shot Object Localization","summary":" Existing object localization methods are tailored to locate a specific class\nof objects, relying on abundant labeled data for model optimization. However,\nin numerous real-world scenarios, acquiring large labeled data can be arduous,\nsignificantly constraining the broader application of localization models. To\nbridge this research gap, this paper proposes the novel task of Few-Shot Object\nLocalization (FSOL), which seeks to achieve precise localization with limited\nsamples available. This task achieves generalized object localization by\nleveraging a small number of labeled support samples to query the positional\ninformation of objects within corresponding images. To advance this research\nfield, we propose an innovative high-performance baseline model. 
Our model\nintegrates a dual-path feature augmentation module to enhance shape association\nand gradient differences between supports and query images, alongside a self\nquery module designed to explore the association between feature maps and query\nimages. Experimental results demonstrate a significant performance improvement\nof our approach in the FSOL task, establishing an efficient benchmark for\nfurther research. All codes and data are available at\nhttps://github.com/Ryh1218/FSOL.\n","authors":["Yunhan Ren","Bo Li","Chengyang Zhang","Yong Zhang","Baocai Yin"],"pdf_url":"https://arxiv.org/pdf/2403.12466v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16124v1","updated":"2024-03-24T12:41:58Z","published":"2024-03-24T12:41:58Z","title":"Enhancing Visual Continual Learning with Language-Guided Supervision","summary":" Continual learning (CL) aims to empower models to learn new tasks without\nforgetting previously acquired knowledge. Most prior works concentrate on the\ntechniques of architectures, replay data, regularization, \\etc. However, the\ncategory name of each class is largely neglected. Existing methods commonly\nutilize the one-hot labels and randomly initialize the classifier head. We\nargue that the scarce semantic information conveyed by the one-hot labels\nhampers the effective knowledge transfer across tasks. In this paper, we\nrevisit the role of the classifier head within the CL paradigm and replace the\nclassifier with semantic knowledge from pretrained language models (PLMs).\nSpecifically, we use PLMs to generate semantic targets for each class, which\nare frozen and serve as supervision signals during training. Such targets fully\nconsider the semantic correlation between all classes across tasks. Empirical\nstudies show that our approach mitigates forgetting by alleviating\nrepresentation drifting and facilitating knowledge transfer across tasks. The\nproposed method is simple to implement and can seamlessly be plugged into\nexisting methods with negligible adjustments. Extensive experiments based on\neleven mainstream baselines demonstrate the effectiveness and generalizability\nof our approach to various protocols. For example, under the class-incremental\nlearning setting on ImageNet-100, our method significantly improves the Top-1\naccuracy by 3.2\\% to 6.1\\% while reducing the forgetting rate by 2.6\\% to\n13.1\\%.\n","authors":["Bolin Ni","Hongbo Zhao","Chenghao Zhang","Ke Hu","Gaofeng Meng","Zhaoxiang Zhang","Shiming Xiang"],"pdf_url":"https://arxiv.org/pdf/2403.16124v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2401.05010v2","updated":"2024-03-24T12:32:06Z","published":"2024-01-10T08:56:02Z","title":"Less is More: A Closer Look at Semantic-based Few-Shot Learning","summary":" Few-shot Learning aims to learn and distinguish new categories with a very\nlimited number of available images, presenting a significant challenge in the\nrealm of deep learning. Recent researchers have sought to leverage the\nadditional textual or linguistic information of these rare categories with a\npre-trained language model to facilitate learning, thus partially alleviating\nthe problem of insufficient supervision signals. However, the full potential of\nthe textual information and pre-trained language model have been underestimated\nin the few-shot learning till now, resulting in limited performance\nenhancements. 
To address this, we propose a simple but effective framework for\nfew-shot learning tasks, specifically designed to exploit the textual\ninformation and language model. In more detail, we explicitly exploit the\nzero-shot capability of the pre-trained language model with the learnable\nprompt. And we just add the visual feature with the textual feature for\ninference directly without the intricate designed fusion modules in previous\nworks. Additionally, we apply the self-ensemble and distillation to further\nenhance these components. Our extensive experiments conducted across four\nwidely used few-shot datasets demonstrate that our simple framework achieves\nimpressive results. Particularly noteworthy is its outstanding performance in\nthe 1-shot learning task, surpassing state-of-the-art methods by an average of\n3.0\\% in classification accuracy. \\footnote{We will make the source codes of\nthe proposed framework publicly available upon acceptance. }.\n","authors":["Chunpeng Zhou","Haishuai Wang","Xilu Yuan","Zhi Yu","Jiajun Bu"],"pdf_url":"https://arxiv.org/pdf/2401.05010v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16116v1","updated":"2024-03-24T12:15:28Z","published":"2024-03-24T12:15:28Z","title":"Self-Supervised Multi-Frame Neural Scene Flow","summary":" Neural Scene Flow Prior (NSFP) and Fast Neural Scene Flow (FNSF) have shown\nremarkable adaptability in the context of large out-of-distribution autonomous\ndriving. Despite their success, the underlying reasons for their astonishing\ngeneralization capabilities remain unclear. Our research addresses this gap by\nexamining the generalization capabilities of NSFP through the lens of uniform\nstability, revealing that its performance is inversely proportional to the\nnumber of input point clouds. This finding sheds light on NSFP's effectiveness\nin handling large-scale point cloud scene flow estimation tasks. Motivated by\nsuch theoretical insights, we further explore the improvement of scene flow\nestimation by leveraging historical point clouds across multiple frames, which\ninherently increases the number of point clouds. Consequently, we propose a\nsimple and effective method for multi-frame point cloud scene flow estimation,\nalong with a theoretical evaluation of its generalization abilities. Our\nanalysis confirms that the proposed method maintains a limited generalization\nerror, suggesting that adding multiple frames to the scene flow optimization\nprocess does not detract from its generalizability. Extensive experimental\nresults on large-scale autonomous driving Waymo Open and Argoverse lidar\ndatasets demonstrate that the proposed method achieves state-of-the-art\nperformance.\n","authors":["Dongrui Liu","Daqi Liu","Xueqian Li","Sihao Lin","Hongwei xie","Bing Wang","Xiaojun Chang","Lei Chu"],"pdf_url":"https://arxiv.org/pdf/2403.16116v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16112v1","updated":"2024-03-24T12:05:23Z","published":"2024-03-24T12:05:23Z","title":"Opportunities and challenges in the application of large artificial\n intelligence models in radiology","summary":" Influenced by ChatGPT, artificial intelligence (AI) large models have\nwitnessed a global upsurge in large model research and development. As people\nenjoy the convenience by this AI large model, more and more large models in\nsubdivided fields are gradually being proposed, especially large models in\nradiology imaging field. 
This article first introduces the development history\nof large models, technical details, workflow, working principles of multimodal\nlarge models and working principles of video generation large models. Secondly,\nwe summarize the latest research progress of AI large models in radiology\neducation, radiology report generation, applications of unimodal and multimodal\nradiology. Finally, this paper also summarizes some of the challenges of large\nAI models in radiology, with the aim of better promoting the rapid revolution\nin the field of radiography.\n","authors":["Liangrui Pan","Zhenyu Zhao","Ying Lu","Kewei Tang","Liyong Fu","Qingchun Liang","Shaoliang Peng"],"pdf_url":"https://arxiv.org/pdf/2403.16112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18411v2","updated":"2024-03-24T12:04:11Z","published":"2024-02-28T15:31:45Z","title":"Unsupervised Cross-Domain Image Retrieval via Prototypical Optimal\n Transport","summary":" Unsupervised cross-domain image retrieval (UCIR) aims to retrieve images\nsharing the same category across diverse domains without relying on labeled\ndata. Prior approaches have typically decomposed the UCIR problem into two\ndistinct tasks: intra-domain representation learning and cross-domain feature\nalignment. However, these segregated strategies overlook the potential\nsynergies between these tasks. This paper introduces ProtoOT, a novel Optimal\nTransport formulation explicitly tailored for UCIR, which integrates\nintra-domain feature representation learning and cross-domain alignment into a\nunified framework. ProtoOT leverages the strengths of the K-means clustering\nmethod to effectively manage distribution imbalances inherent in UCIR. By\nutilizing K-means for generating initial prototypes and approximating class\nmarginal distributions, we modify the constraints in Optimal Transport\naccordingly, significantly enhancing its performance in UCIR scenarios.\nFurthermore, we incorporate contrastive learning into the ProtoOT framework to\nfurther improve representation learning. This encourages local semantic\nconsistency among features with similar semantics, while also explicitly\nenforcing separation between features and unmatched prototypes, thereby\nenhancing global discriminativeness. ProtoOT surpasses existing\nstate-of-the-art methods by a notable margin across benchmark datasets.\nNotably, on DomainNet, ProtoOT achieves an average P@200 enhancement of 24.44%,\nand on Office-Home, it demonstrates a P@15 improvement of 12.12%. Code is\navailable at https://github.com/HCVLAB/ProtoOT.\n","authors":["Bin Li","Ye Shi","Qian Yu","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2402.18411v2.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2403.16111v1","updated":"2024-03-24T12:04:06Z","published":"2024-03-24T12:04:06Z","title":"EVA: Zero-shot Accurate Attributes and Multi-Object Video Editing","summary":" Current diffusion-based video editing primarily focuses on local editing\n(\\textit{e.g.,} object/background editing) or global style editing by utilizing\nvarious dense correspondences. However, these methods often fail to accurately\nedit the foreground and background simultaneously while preserving the original\nlayout. We find that the crux of the issue stems from the imprecise\ndistribution of attention weights across designated regions, including\ninaccurate text-to-attribute control and attention leakage. 
To tackle this\nissue, we introduce EVA, a \\textbf{zero-shot} and \\textbf{multi-attribute}\nvideo editing framework tailored for human-centric videos with complex motions.\nWe incorporate a Spatial-Temporal Layout-Guided Attention mechanism that\nleverages the intrinsic positive and negative correspondences of cross-frame\ndiffusion features. To avoid attention leakage, we utilize these\ncorrespondences to boost the attention scores of tokens within the same\nattribute across all video frames while limiting interactions between tokens of\ndifferent attributes in the self-attention layer. For precise text-to-attribute\nmanipulation, we use discrete text embeddings focused on specific layout areas\nwithin the cross-attention layer. Benefiting from the precise attention weight\ndistribution, EVA can be easily generalized to multi-object editing scenarios\nand achieves accurate identity mapping. Extensive experiments demonstrate EVA\nachieves state-of-the-art results in real-world scenarios. Full results are\nprovided at https://knightyxp.github.io/EVA/\n","authors":["Xiangpeng Yang","Linchao Zhu","Hehe Fan","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2403.16111v1.pdf","comment":"Project page: https://knightyxp.github.io/EVA"},{"id":"http://arxiv.org/abs/2403.16095v1","updated":"2024-03-24T11:19:59Z","published":"2024-03-24T11:19:59Z","title":"CG-SLAM: Efficient Dense RGB-D SLAM in a Consistent Uncertainty-aware 3D\n Gaussian Field","summary":" Recently neural radiance fields (NeRF) have been widely exploited as 3D\nrepresentations for dense simultaneous localization and mapping (SLAM). Despite\ntheir notable successes in surface modeling and novel view synthesis, existing\nNeRF-based methods are hindered by their computationally intensive and\ntime-consuming volume rendering pipeline. This paper presents an efficient\ndense RGB-D SLAM system, i.e., CG-SLAM, based on a novel uncertainty-aware 3D\nGaussian field with high consistency and geometric stability. Through an\nin-depth analysis of Gaussian Splatting, we propose several techniques to\nconstruct a consistent and stable 3D Gaussian field suitable for tracking and\nmapping. Additionally, a novel depth uncertainty model is proposed to ensure\nthe selection of valuable Gaussian primitives during optimization, thereby\nimproving tracking efficiency and accuracy. Experiments on various datasets\ndemonstrate that CG-SLAM achieves superior tracking and mapping performance\nwith a notable tracking speed of up to 15 Hz. We will make our source code\npublicly available. Project page: https://zju3dv.github.io/cg-slam.\n","authors":["Jiarui Hu","Xianhao Chen","Boyin Feng","Guanglin Li","Liangjing Yang","Hujun Bao","Guofeng Zhang","Zhaopeng Cui"],"pdf_url":"https://arxiv.org/pdf/2403.16095v1.pdf","comment":"Project Page: https://zju3dv.github.io/cg-slam"},{"id":"http://arxiv.org/abs/2403.16092v1","updated":"2024-03-24T11:09:41Z","published":"2024-03-24T11:09:41Z","title":"Are NeRFs ready for autonomous driving? Towards closing the\n real-to-simulation gap","summary":" Neural Radiance Fields (NeRFs) have emerged as promising tools for advancing\nautonomous driving (AD) research, offering scalable closed-loop simulation and\ndata augmentation capabilities. However, to trust the results achieved in\nsimulation, one needs to ensure that AD systems perceive real and rendered data\nin the same way. Although the performance of rendering methods is increasing,\nmany scenarios will remain inherently challenging to reconstruct faithfully. 
To\nthis end, we propose a novel perspective for addressing the real-to-simulated\ndata gap. Rather than solely focusing on improving rendering fidelity, we\nexplore simple yet effective methods to enhance perception model robustness to\nNeRF artifacts without compromising performance on real data. Moreover, we\nconduct the first large-scale investigation into the real-to-simulated data gap\nin an AD setting using a state-of-the-art neural rendering technique.\nSpecifically, we evaluate object detectors and an online mapping model on real\nand simulated data, and study the effects of different pre-training strategies.\nOur results show notable improvements in model robustness to simulated data,\neven improving real-world performance in some cases. Last, we delve into the\ncorrelation between the real-to-simulated gap and image reconstruction metrics,\nidentifying FID and LPIPS as strong indicators.\n","authors":["Carl Lindström","Georg Hess","Adam Lilja","Maryam Fatemi","Lars Hammarstrand","Christoffer Petersson","Lennart Svensson"],"pdf_url":"https://arxiv.org/pdf/2403.16092v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03059v6","updated":"2024-03-24T10:29:46Z","published":"2023-10-04T16:49:36Z","title":"Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models","summary":" The popularity of pre-trained large models has revolutionized downstream\ntasks across diverse fields, such as language, vision, and multi-modality. To\nminimize the adaption cost for downstream tasks, many Parameter-Efficient\nFine-Tuning (PEFT) techniques are proposed for language and 2D image\npre-trained models. However, the specialized PEFT method for 3D pre-trained\nmodels is still under-explored. To this end, we introduce Point-PEFT, a novel\nframework for adapting point cloud pre-trained models with minimal learnable\nparameters. Specifically, for a pre-trained 3D model, we freeze most of its\nparameters, and only tune the newly added PEFT modules on downstream tasks,\nwhich consist of a Point-prior Prompt and a Geometry-aware Adapter. The\nPoint-prior Prompt adopts a set of learnable prompt tokens, for which we\npropose to construct a memory bank with domain-specific knowledge, and utilize\na parameter-free attention to enhance the prompt tokens. The Geometry-aware\nAdapter aims to aggregate point cloud features within spatial neighborhoods to\ncapture fine-grained geometric information through local interactions.\nExtensive experiments indicate that our Point-PEFT can achieve better\nperformance than the full fine-tuning on various downstream tasks, while using\nonly 5% of the trainable parameters, demonstrating the efficiency and\neffectiveness of our approach. Code is released at\nhttps://github.com/Ivan-Tang-3D/Point-PEFT.\n","authors":["Yiwen Tang","Ray Zhang","Zoey Guo","Dong Wang","Zhigang Wang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2310.03059v6.pdf","comment":"The specialized PEFT framework for 3D pre-trained models, which\n achieves competitive performance to full fine-tuning, and significantly\n reduces the computational resources. Project page:\n https://github.com/Ivan-Tang-3D/Point-PEFT"},{"id":"http://arxiv.org/abs/2402.12928v4","updated":"2024-03-24T10:06:59Z","published":"2024-02-20T11:28:50Z","title":"A Literature Review of Literature Reviews in Pattern Analysis and\n Machine Intelligence","summary":" By consolidating scattered knowledge, the literature review provides a\ncomprehensive understanding of the investigated topic. 
However, reading,\nconducting, or peer-reviewing review papers generally demands a significant\ninvestment of time and effort from researchers. To improve efficiency, this\npaper aims to provide a thorough review of reviews in the PAMI field from\ndiverse perspectives. First, this paper proposes several article-level,\nfield-normalized, and large language model-empowered bibliometric indicators to\nevaluate reviews. To facilitate this, a meta-data database dubbed RiPAMI, and a\ntopic dataset are constructed. Second, based on these indicators, the study\npresents comparative analyses of representative reviews, unveiling the\ncharacteristics of publications across various fields, periods, and journals.\nThe newly emerging AI-generated literature reviews are also appraised, and the\nobserved differences suggest that most AI-generated reviews still lag behind\nhuman-authored reviews in multiple aspects. Third, we briefly provide a\nsubjective evaluation of representative PAMI reviews and introduce a paper\nstructure-based typology of literature reviews. This typology may improve the\nclarity and effectiveness for scholars in reading and writing reviews, while\nalso serving as a guide for AI systems in generating well-organized reviews.\nFinally, this work offers insights into the current challenges of literature\nreviews and envisions future directions for their development.\n","authors":["Penghai Zhao","Xin Zhang","Ming-Ming Cheng","Jian Yang","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2402.12928v4.pdf","comment":"IEEE version v1. [February 19, 2024] IEEE version v2 with typos\n fixed. [February 23, 2024] IEEE version v3 with errors fixed. [February 29,\n 2024] IEEE version v4 with improved quaility. [February 29, 2024]"},{"id":"http://arxiv.org/abs/2403.16080v1","updated":"2024-03-24T10:06:40Z","published":"2024-03-24T10:06:40Z","title":"PKU-DyMVHumans: A Multi-View Video Benchmark for High-Fidelity Dynamic\n Human Modeling","summary":" High-quality human reconstruction and photo-realistic rendering of a dynamic\nscene is a long-standing problem in computer vision and graphics. Despite\nconsiderable efforts invested in developing various capture systems and\nreconstruction algorithms, recent advancements still struggle with loose or\noversized clothing and overly complex poses. In part, this is due to the\nchallenges of acquiring high-quality human datasets. To facilitate the\ndevelopment of these fields, in this paper, we present PKU-DyMVHumans, a\nversatile human-centric dataset for high-fidelity reconstruction and rendering\nof dynamic human scenarios from dense multi-view videos. It comprises 8.2\nmillion frames captured by more than 56 synchronized cameras across diverse\nscenarios. These sequences comprise 32 human subjects across 45 different\nscenarios, each with a high-detailed appearance and realistic human motion.\nInspired by recent advancements in neural radiance field (NeRF)-based scene\nrepresentations, we carefully set up an off-the-shelf framework that is easy to\nprovide those state-of-the-art NeRF-based implementations and benchmark on\nPKU-DyMVHumans dataset. It is paving the way for various applications like\nfine-grained foreground/background decomposition, high-quality human\nreconstruction and photo-realistic novel view synthesis of a dynamic scene.\nExtensive studies are performed on the benchmark, demonstrating new\nobservations and challenges that emerge from using such high-fidelity dynamic\ndata. 
The dataset is available at: https://pku-dymvhumans.github.io.\n","authors":["Xiaoyun Zheng","Liwei Liao","Xufeng Li","Jianbo Jiao","Rongjie Wang","Feng Gao","Shiqi Wang","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13438v3","updated":"2024-03-24T10:06:25Z","published":"2024-03-18T17:38:29Z","title":"See, Imagine, Plan: Discovering and Hallucinating Tasks from a Single\n Image","summary":" Humans can not only recognize and understand the world in its current state\nbut also envision future scenarios that extend beyond immediate perception. To\nresemble this profound human capacity, we introduce zero-shot task\nhallucination -- given a single RGB image of any scene comprising unknown\nenvironments and objects, our model can identify potential tasks and imagine\ntheir execution in a vivid narrative, realized as a video. We develop a modular\npipeline that progressively enhances scene decomposition, comprehension, and\nreconstruction, incorporating VLM for dynamic interaction and 3D motion\nplanning for object trajectories. Our model can discover diverse tasks, with\nthe generated task videos demonstrating realistic and compelling visual\noutcomes that are understandable by both machines and humans. Project Page:\nhttps://dannymcy.github.io/zeroshot_task_hallucination/\n","authors":["Chenyang Ma","Kai Lu","Ta-Ying Cheng","Niki Trigoni","Andrew Markham"],"pdf_url":"https://arxiv.org/pdf/2403.13438v3.pdf","comment":"Project Page: https://dannymcy.github.io/zeroshot_task_hallucination/"},{"id":"http://arxiv.org/abs/2310.19258v2","updated":"2024-03-24T09:32:51Z","published":"2023-10-30T04:04:02Z","title":"Improving Online Source-free Domain Adaptation for Object Detection by\n Unsupervised Data Acquisition","summary":" Effective object detection in mobile robots is challenged by deployment in\ndiverse and unfamiliar environments. Online Source-Free Domain Adaptation\n(O-SFDA) offers model adaptation using a stream of unlabeled data from a target\ndomain in online manner. However, not all captured frames contain information\nthat is beneficial for adaptation, particularly when there is a strong class\nimbalance. This paper introduces a novel approach to enhance O-SFDA for\nadaptive object detection in mobile robots via unsupervised data acquisition.\nOur methodology prioritizes the most informative unlabeled frames for inclusion\nin the online training process. Empirical evaluation on a real-world dataset\nreveals that our method outperforms existing state-of-the-art O-SFDA\ntechniques, demonstrating the viability of unsupervised data acquisition for\nimproving adaptive object detection in mobile robots.\n","authors":["Xiangyu Shi","Yanyuan Qiao","Qi Wu","Lingqiao Liu","Feras Dayoub"],"pdf_url":"https://arxiv.org/pdf/2310.19258v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07347v2","updated":"2024-03-24T09:22:13Z","published":"2024-03-12T06:07:29Z","title":"Frequency Decoupling for Motion Magnification via Multi-Level Isomorphic\n Architecture","summary":" Video Motion Magnification (VMM) aims to reveal subtle and imperceptible\nmotion information of objects in the macroscopic world. Prior methods directly\nmodel the motion field from the Eulerian perspective by Representation Learning\nthat separates shape and texture or Multi-domain Learning from phase\nfluctuations. 
Inspired by the frequency spectrum, we observe that the\nlow-frequency components with stable energy always possess spatial structure\nand less noise, making them suitable for modeling the subtle motion field. To\nthis end, we present FD4MM, a new paradigm of Frequency Decoupling for Motion\nMagnification with a Multi-level Isomorphic Architecture to capture multi-level\nhigh-frequency details and a stable low-frequency structure (motion field) in\nvideo space. Since high-frequency details and subtle motions are susceptible to\ninformation degradation due to their inherent subtlety and unavoidable external\ninterference from noise, we carefully design Sparse High/Low-pass Filters to\nenhance the integrity of details and motion structures, and a Sparse Frequency\nMixer to promote seamless recoupling. Besides, we innovatively design a\ncontrastive regularization for this task to strengthen the model's ability to\ndiscriminate irrelevant features, reducing undesired motion magnification.\nExtensive experiments on both Real-world and Synthetic Datasets show that our\nFD4MM outperforms SOTA methods. Meanwhile, FD4MM reduces FLOPs by 1.63$\\times$\nand boosts inference speed by 1.68$\\times$ than the latest method. Our code is\navailable at https://github.com/Jiafei127/FD4MM.\n","authors":["Fei Wang","Dan Guo","Kun Li","Zhun Zhong","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.07347v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.16071v1","updated":"2024-03-24T09:18:21Z","published":"2024-03-24T09:18:21Z","title":"Landmark-Guided Cross-Speaker Lip Reading with Mutual Information\n Regularization","summary":" Lip reading, the process of interpreting silent speech from visual lip\nmovements, has gained rising attention for its wide range of realistic\napplications. Deep learning approaches greatly improve current lip reading\nsystems. However, lip reading in cross-speaker scenarios where the speaker\nidentity changes, poses a challenging problem due to inter-speaker variability.\nA well-trained lip reading system may perform poorly when handling a brand new\nspeaker. To learn a speaker-robust lip reading model, a key insight is to\nreduce visual variations across speakers, avoiding the model overfitting to\nspecific speakers. In this work, in view of both input visual clues and latent\nrepresentations based on a hybrid CTC/attention architecture, we propose to\nexploit the lip landmark-guided fine-grained visual clues instead of\nfrequently-used mouth-cropped images as input features, diminishing\nspeaker-specific appearance characteristics. Furthermore, a max-min mutual\ninformation regularization approach is proposed to capture speaker-insensitive\nlatent representations. Experimental evaluations on public lip reading datasets\ndemonstrate the effectiveness of the proposed approach under the intra-speaker\nand inter-speaker conditions.\n","authors":["Linzhi Wu","Xingyu Zhang","Yakun Zhang","Changyan Zheng","Tiejun Liu","Liang Xie","Ye Yan","Erwei Yin"],"pdf_url":"https://arxiv.org/pdf/2403.16071v1.pdf","comment":"To appear in LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2403.16067v1","updated":"2024-03-24T08:34:08Z","published":"2024-03-24T08:34:08Z","title":"Robust Diffusion Models for Adversarial Purification","summary":" Diffusion models (DMs) based adversarial purification (AP) has shown to be\nthe most powerful alternative to adversarial training (AT). 
However, these\nmethods neglect the fact that pre-trained diffusion models themselves are not\nrobust to adversarial attacks as well. Additionally, the diffusion process can\neasily destroy semantic information and generate a high quality image but\ntotally different from the original input image after the reverse process,\nleading to degraded standard accuracy. To overcome these issues, a natural idea\nis to harness adversarial training strategy to retrain or fine-tune the\npre-trained diffusion model, which is computationally prohibitive. We propose a\nnovel robust reverse process with adversarial guidance, which is independent of\ngiven pre-trained DMs and avoids retraining or fine-tuning the DMs. This robust\nguidance can not only ensure to generate purified examples retaining more\nsemantic content but also mitigate the accuracy-robustness trade-off of DMs for\nthe first time, which also provides DM-based AP an efficient adaptive ability\nto new attacks. Extensive experiments are conducted to demonstrate that our\nmethod achieves the state-of-the-art results and exhibits generalization\nagainst different attacks.\n","authors":["Guang Lin","Zerui Tao","Jianhai Zhang","Toshihisa Tanaka","Qibin Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.16067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14137v2","updated":"2024-03-24T07:58:01Z","published":"2024-03-21T05:13:12Z","title":"SynerMix: Synergistic Mixup Solution for Enhanced Intra-Class Cohesion\n and Inter-Class Separability in Image Classification","summary":" To address the issues of MixUp and its variants (e.g., Manifold MixUp) in\nimage classification tasks-namely, their neglect of mixing within the same\nclass (intra-class mixup) and their inadequacy in enhancing intra-class\ncohesion through their mixing operations-we propose a novel mixup method named\nSynerMix-Intra and, building upon this, introduce a synergistic mixup solution\nnamed SynerMix. SynerMix-Intra specifically targets intra-class mixup to\nbolster intra-class cohesion, a feature not addressed by current mixup methods.\nFor each mini-batch, it leverages feature representations of unaugmented\noriginal images from each class to generate a synthesized feature\nrepresentation through random linear interpolation. All synthesized\nrepresentations are then fed into the classification and loss layers to\ncalculate an average classification loss that significantly enhances\nintra-class cohesion. Furthermore, SynerMix combines SynerMix-Intra with an\nexisting mixup approach (e.g., MixUp, Manifold MixUp), which primarily focuses\non inter-class mixup and has the benefit of enhancing inter-class separability.\nIn doing so, it integrates both inter- and intra-class mixup in a balanced way\nwhile concurrently improving intra-class cohesion and inter-class separability.\nExperimental results on six datasets show that SynerMix achieves a 0.1% to\n3.43% higher accuracy than the best of either MixUp or SynerMix-Intra alone,\naveraging a 1.16% gain. It also surpasses the top-performer of either Manifold\nMixUp or SynerMix-Intra by 0.12% to 5.16%, with an average gain of 1.11%. Given\nthat SynerMix is model-agnostic, it holds significant potential for application\nin other domains where mixup methods have shown promise, such as speech and\ntext classification. 
Our code is publicly available at:\nhttps://github.com/wxitxy/synermix.git.\n","authors":["Ye Xu","Ya Gao","Xiaorong Qiu","Yang Chen","Ying Ji"],"pdf_url":"https://arxiv.org/pdf/2403.14137v2.pdf","comment":"25 pages,12 figures"},{"id":"http://arxiv.org/abs/2403.16051v1","updated":"2024-03-24T07:36:38Z","published":"2024-03-24T07:36:38Z","title":"Segment Anything Model for Road Network Graph Extraction","summary":" We propose SAM-Road, an adaptation of the Segment Anything Model (SAM) for\nextracting large-scale, vectorized road network graphs from satellite imagery.\nTo predict graph geometry, we formulate it as a dense semantic segmentation\ntask, leveraging the inherent strengths of SAM. The image encoder of SAM is\nfine-tuned to produce probability masks for roads and intersections, from which\nthe graph vertices are extracted via simple non-maximum suppression. To predict\ngraph topology, we designed a lightweight transformer-based graph neural\nnetwork, which leverages the SAM image embeddings to estimate the edge\nexistence probabilities between vertices. Our approach directly predicts the\ngraph vertices and edges for large regions without expensive and complex\npost-processing heuristics, and is capable of building complete road network\ngraphs spanning multiple square kilometers in a matter of seconds. With its\nsimple, straightforward, and minimalist design, SAM-Road achieves comparable\naccuracy with the state-of-the-art method RNGDet++, while being 40 times faster\non the City-scale dataset. We thus demonstrate the power of a foundational\nvision model when applied to a graph learning task. The code is available at\nhttps://github.com/htcr/sam_road.\n","authors":["Congrui Hetang","Haoru Xue","Cindy Le","Tianwei Yue","Wenping Wang","Yihui He"],"pdf_url":"https://arxiv.org/pdf/2403.16051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16050v1","updated":"2024-03-24T07:33:08Z","published":"2024-03-24T07:33:08Z","title":"A General and Efficient Federated Split Learning with Pre-trained Image\n Transformers for Heterogeneous Data","summary":" Federated Split Learning (FSL) is a promising distributed learning paradigm\nin practice, which gathers the strengths of both Federated Learning (FL) and\nSplit Learning (SL) paradigms, to ensure model privacy while diminishing the\nresource overhead of each client, especially on large transformer models in a\nresource-constrained environment, e.g., Internet of Things (IoT). However,\nalmost all works merely investigate the performance with simple neural network\nmodels in FSL. Despite the minor efforts focusing on incorporating Vision\nTransformers (ViT) as model architectures, they train ViT from scratch, thereby\nleading to enormous training overhead in each device with limited resources.\nTherefore, in this paper, we harness Pre-trained Image Transformers (PITs) as\nthe initial model, coined FES-PIT, to accelerate the training process and\nimprove model robustness. Furthermore, we propose FES-PTZO to hinder the\ngradient inversion attack, especially having the capability compatible with\nblack-box scenarios, where the gradient information is unavailable. Concretely,\nFES-PTZO approximates the server gradient by utilizing a zeroth-order (ZO)\noptimization, which replaces the backward propagation with just one forward\nprocess. Empirically, we are the first to provide a systematic evaluation of\nFSL methods with PITs in real-world datasets, different partial device\nparticipations, and heterogeneous data splits. 
Our experiments verify the\neffectiveness of our algorithms.\n","authors":["Yifan Shi","Yuhui Zhang","Ziyue Huang","Xiaofeng Yang","Li Shen","Wei Chen","Xueqian Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16048v1","updated":"2024-03-24T07:29:04Z","published":"2024-03-24T07:29:04Z","title":"Edit3K: Universal Representation Learning for Video Editing Components","summary":" This paper focuses on understanding the predominant video creation pipeline,\ni.e., compositional video editing with six main types of editing components,\nincluding video effects, animation, transition, filter, sticker, and text. In\ncontrast to existing visual representation learning of visual materials (i.e.,\nimages/videos), we aim to learn visual representations of editing\nactions/components that are generally applied on raw materials. We start by\nproposing the first large-scale dataset for editing components of video\ncreation, which covers about $3,094$ editing components with $618,800$ videos.\nEach video in our dataset is rendered by various image/video materials with a\nsingle editing component, which supports atomic visual understanding of\ndifferent editing components. It can also benefit several downstream tasks,\ne.g., editing component recommendation, editing component\nrecognition/retrieval, etc. Existing visual representation methods perform\npoorly because it is difficult to disentangle the visual appearance of editing\ncomponents from raw materials. To that end, we benchmark popular alternative\nsolutions and propose a novel method that learns to attend to the appearance of\nediting components regardless of raw materials. Our method achieves favorable\nresults on editing component retrieval/recognition compared to the alternative\nsolutions. A user study is also conducted to show that our representations\ncluster visually similar editing components better than other alternatives.\nFurthermore, our learned representations used to transition recommendation\ntasks achieve state-of-the-art results on the AutoTransition dataset. The code\nand dataset will be released for academic use.\n","authors":["Xin Gu","Libo Zhang","Fan Chen","Longyin Wen","Yufei Wang","Tiejian Luo","Sijie Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.16048v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01024v2","updated":"2024-03-24T07:10:27Z","published":"2023-07-03T13:55:44Z","title":"SAM-DA: UAV Tracks Anything at Night with SAM-Powered Domain Adaptation","summary":" Domain adaptation (DA) has demonstrated significant promise for real-time\nnighttime unmanned aerial vehicle (UAV) tracking. However, the state-of-the-art\n(SOTA) DA still lacks the potential object with accurate pixel-level location\nand boundary to generate the high-quality target domain training sample. This\nkey issue constrains the transfer learning of the real-time daytime SOTA\ntrackers for challenging nighttime UAV tracking. Recently, the notable Segment\nAnything Model (SAM) has achieved a remarkable zero-shot generalization ability\nto discover abundant potential objects due to its huge data-driven training\napproach. To solve the aforementioned issue, this work proposes a novel\nSAM-powered DA framework for real-time nighttime UAV tracking, i.e., SAM-DA.\nSpecifically, an innovative SAM-powered target domain training sample swelling\nis designed to determine enormous high-quality target domain training samples\nfrom every single raw nighttime image. 
This novel one-to-many generation\nsignificantly expands the high-quality target domain training sample for DA.\nComprehensive experiments on extensive nighttime UAV videos prove the\nrobustness and domain adaptability of SAM-DA for nighttime UAV tracking.\nEspecially, compared to the SOTA DA, SAM-DA can achieve better performance with\nfewer raw nighttime images, i.e., the fewer-better training. This economized\ntraining approach facilitates the quick validation and deployment of algorithms\nfor UAVs. The code is available at https://github.com/vision4robotics/SAM-DA.\n","authors":["Changhong Fu","Liangliang Yao","Haobo Zuo","Guangze Zheng","Jia Pan"],"pdf_url":"https://arxiv.org/pdf/2307.01024v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16043v1","updated":"2024-03-24T07:04:08Z","published":"2024-03-24T07:04:08Z","title":"Semantic Is Enough: Only Semantic Information For NeRF Reconstruction","summary":" Recent research that combines implicit 3D representation with semantic\ninformation, like Semantic-NeRF, has proven that NeRF model could perform\nexcellently in rendering 3D structures with semantic labels. This research aims\nto extend the Semantic Neural Radiance Fields (Semantic-NeRF) model by focusing\nsolely on semantic output and removing the RGB output component. We reformulate\nthe model and its training procedure to leverage only the cross-entropy loss\nbetween the model semantic output and the ground truth semantic images,\nremoving the colour data traditionally used in the original Semantic-NeRF\napproach. We then conduct a series of identical experiments using the original\nand the modified Semantic-NeRF model. Our primary objective is to obverse the\nimpact of this modification on the model performance by Semantic-NeRF, focusing\non tasks such as scene understanding, object detection, and segmentation. The\nresults offer valuable insights into the new way of rendering the scenes and\nprovide an avenue for further research and development in semantic-focused 3D\nscene understanding.\n","authors":["Ruibo Wang","Song Zhang","Ping Huang","Donghai Zhang","Wei Yan"],"pdf_url":"https://arxiv.org/pdf/2403.16043v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11557v2","updated":"2024-03-24T07:01:37Z","published":"2023-12-17T09:05:47Z","title":"SAI3D: Segment Any Instance in 3D Scenes","summary":" Advancements in 3D instance segmentation have traditionally been tethered to\nthe availability of annotated datasets, limiting their application to a narrow\nspectrum of object categories. Recent efforts have sought to harness\nvision-language models like CLIP for open-set semantic reasoning, yet these\nmethods struggle to distinguish between objects of the same categories and rely\non specific prompts that are not universally applicable. In this paper, we\nintroduce SAI3D, a novel zero-shot 3D instance segmentation approach that\nsynergistically leverages geometric priors and semantic cues derived from\nSegment Anything Model (SAM). Our method partitions a 3D scene into geometric\nprimitives, which are then progressively merged into 3D instance segmentations\nthat are consistent with the multi-view SAM masks. Moreover, we design a\nhierarchical region-growing algorithm with a dynamic thresholding mechanism,\nwhich largely improves the robustness of finegrained 3D scene parsing.Empirical\nevaluations on ScanNet, Matterport3D and the more challenging ScanNet++\ndatasets demonstrate the superiority of our approach. 
Notably, SAI3D\noutperforms existing open-vocabulary baselines and even surpasses\nfully-supervised methods in class-agnostic segmentation on ScanNet++. Our\nproject page is at https://yd-yin.github.io/SAI3D.\n","authors":["Yingda Yin","Yuzheng Liu","Yang Xiao","Daniel Cohen-Or","Jingwei Huang","Baoquan Chen"],"pdf_url":"https://arxiv.org/pdf/2312.11557v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16034v1","updated":"2024-03-24T06:30:02Z","published":"2024-03-24T06:30:02Z","title":"V2X-Real: a Largs-Scale Dataset for Vehicle-to-Everything Cooperative\n Perception","summary":" Recent advancements in Vehicle-to-Everything (V2X) technologies have enabled\nautonomous vehicles to share sensing information to see through occlusions,\ngreatly boosting the perception capability. However, there are no real-world\ndatasets to facilitate the real V2X cooperative perception research -- existing\ndatasets either only support Vehicle-to-Infrastructure cooperation or\nVehicle-to-Vehicle cooperation. In this paper, we propose a dataset that has a\nmixture of multiple vehicles and smart infrastructure simultaneously to\nfacilitate the V2X cooperative perception development with multi-modality\nsensing data. Our V2X-Real is collected using two connected automated vehicles\nand two smart infrastructures, which are all equipped with multi-modal sensors\nincluding LiDAR sensors and multi-view cameras. The whole dataset contains 33K\nLiDAR frames and 171K camera data with over 1.2M annotated bounding boxes of 10\ncategories in very challenging urban scenarios. According to the collaboration\nmode and ego perspective, we derive four types of datasets for Vehicle-Centric,\nInfrastructure-Centric, Vehicle-to-Vehicle, and\nInfrastructure-to-Infrastructure cooperative perception. Comprehensive\nmulti-class multi-agent benchmarks of SOTA cooperative perception methods are\nprovided. The V2X-Real dataset and benchmark codes will be released.\n","authors":["Hao Xiang","Zhaoliang Zheng","Xin Xia","Runsheng Xu","Letian Gao","Zewei Zhou","Xu Han","Xinkai Ji","Mingxi Li","Zonglin Meng","Li Jin","Mingyue Lei","Zhaoyang Ma","Zihang He","Haoxuan Ma","Yunshuang Yuan","Yingqian Zhao","Jiaqi Ma"],"pdf_url":"https://arxiv.org/pdf/2403.16034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16028v1","updated":"2024-03-24T06:10:22Z","published":"2024-03-24T06:10:22Z","title":"Exploring the Impact of Dataset Bias on Dataset Distillation","summary":" Dataset Distillation (DD) is a promising technique to synthesize a smaller\ndataset that preserves essential information from the original dataset. This\nsynthetic dataset can serve as a substitute for the original large-scale one,\nand help alleviate the training workload. However, current DD methods typically\noperate under the assumption that the dataset is unbiased, overlooking\npotential bias issues within the dataset itself. To fill in this blank, we\nsystematically investigate the influence of dataset bias on DD. To the best of\nour knowledge, this is the first exploration in the DD domain. 
Given that there\nare no suitable biased datasets for DD, we first construct two biased datasets,\nCMNIST-DD and CCIFAR10-DD, to establish a foundation for subsequent analysis.\nThen we utilize existing DD methods to generate synthetic datasets on CMNIST-DD\nand CCIFAR10-DD, and evaluate their performance following the standard process.\nExperiments demonstrate that biases present in the original dataset\nsignificantly impact the performance of the synthetic dataset in most cases,\nwhich highlights the necessity of identifying and mitigating biases in the\noriginal datasets during DD. Finally, we reformulate DD within the context of a\nbiased dataset. Our code along with biased datasets are available at\nhttps://github.com/yaolu-zjut/Biased-DD.\n","authors":["Yao Lu","Jianyang Gu","Xuguang Chen","Saeed Vahidian","Qi Xuan"],"pdf_url":"https://arxiv.org/pdf/2403.16028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16024v1","updated":"2024-03-24T05:57:00Z","published":"2024-03-24T05:57:00Z","title":"A Unified Module for Accelerating STABLE-DIFFUSION: LCM-LORA","summary":" This paper presents a comprehensive study on the unified module for\naccelerating stable-diffusion processes, specifically focusing on the lcm-lora\nmodule. Stable-diffusion processes play a crucial role in various scientific\nand engineering domains, and their acceleration is of paramount importance for\nefficient computational performance. The standard iterative procedures for\nsolving fixed-source discrete ordinates problems often exhibit slow\nconvergence, particularly in optically thick scenarios. To address this\nchallenge, unconditionally stable diffusion-acceleration methods have been\ndeveloped, aiming to enhance the computational efficiency of transport\nequations and discrete ordinates problems. This study delves into the\ntheoretical foundations and numerical results of unconditionally stable\ndiffusion synthetic acceleration methods, providing insights into their\nstability and performance for model discrete ordinates problems. Furthermore,\nthe paper explores recent advancements in diffusion model acceleration,\nincluding on device acceleration of large diffusion models via gpu aware\noptimizations, highlighting the potential for significantly improved inference\nlatency. The results and analyses in this study provide important insights into\nstable diffusion processes and have important ramifications for the creation\nand application of acceleration methods specifically, the lcm-lora module in a\nvariety of computing environments.\n","authors":["Ayush Thakur","Rashmi Vashisth"],"pdf_url":"https://arxiv.org/pdf/2403.16024v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16023v1","updated":"2024-03-24T05:55:39Z","published":"2024-03-24T05:55:39Z","title":"RPMArt: Towards Robust Perception and Manipulation for Articulated\n Objects","summary":" Articulated objects are commonly found in daily life. It is essential that\nrobots can exhibit robust perception and manipulation skills for articulated\nobjects in real-world robotic applications. However, existing methods for\narticulated objects insufficiently address noise in point clouds and struggle\nto bridge the gap between simulation and reality, thus limiting the practical\ndeployment in real-world scenarios. 
To tackle these challenges, we propose a\nframework towards Robust Perception and Manipulation for Articulated Objects\n(RPMArt), which learns to estimate the articulation parameters and manipulate\nthe articulation part from the noisy point cloud. Our primary contribution is a\nRobust Articulation Network (RoArtNet) that is able to predict both joint\nparameters and affordable points robustly by local feature learning and point\ntuple voting. Moreover, we introduce an articulation-aware classification\nscheme to enhance its ability for sim-to-real transfer. Finally, with the\nestimated affordable point and articulation joint constraint, the robot can\ngenerate robust actions to manipulate articulated objects. After learning only\nfrom synthetic data, RPMArt is able to transfer zero-shot to real-world\narticulated objects. Experimental results confirm our approach's effectiveness,\nwith our framework achieving state-of-the-art performance in both noise-added\nsimulation and real-world environments. The code and data will be open-sourced\nfor reproduction. More results are published on the project website at\nhttps://r-pmart.github.io .\n","authors":["Junbo Wang","Wenhai Liu","Qiaojun Yu","Yang You","Liu Liu","Weiming Wang","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2403.16023v1.pdf","comment":"8 pages, 7 figures, submitted to 2024 IEEE/RSJ International\n Conference on Intelligent Robots and Systems (IROS 2024), project website at\n https://r-pmart.github.io"},{"id":"http://arxiv.org/abs/2403.16020v1","updated":"2024-03-24T05:50:00Z","published":"2024-03-24T05:50:00Z","title":"PaPr: Training-Free One-Step Patch Pruning with Lightweight ConvNets for\n Faster Inference","summary":" As deep neural networks evolve from convolutional neural networks (ConvNets)\nto advanced vision transformers (ViTs), there is an increased need to eliminate\nredundant data for faster processing without compromising accuracy. Previous\nmethods are often architecture-specific or necessitate re-training, restricting\ntheir applicability with frequent model updates. To solve this, we first\nintroduce a novel property of lightweight ConvNets: their ability to identify\nkey discriminative patch regions in images, irrespective of model's final\naccuracy or size. We demonstrate that fully-connected layers are the primary\nbottleneck for ConvNets performance, and their suppression with simple weight\nrecalibration markedly enhances discriminative patch localization performance.\nUsing this insight, we introduce PaPr, a method for substantially pruning\nredundant patches with minimal accuracy loss using lightweight ConvNets across\na variety of deep learning architectures, including ViTs, ConvNets, and hybrid\ntransformers, without any re-training. Moreover, the simple early-stage\none-step patch pruning with PaPr enhances existing patch reduction methods.\nThrough extensive testing on diverse architectures, PaPr achieves significantly\nhigher accuracy over state-of-the-art patch reduction methods with similar FLOP\ncount reduction. 
More specifically, PaPr reduces about 70% of redundant patches\nin videos with less than 0.8% drop in accuracy, and up to 3.7x FLOPs reduction,\nwhich is a 15% more reduction with 2.5% higher accuracy.\n","authors":["Tanvir Mahmud","Burhaneddin Yaman","Chun-Hao Liu","Diana Marculescu"],"pdf_url":"https://arxiv.org/pdf/2403.16020v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.05997v3","updated":"2024-03-24T05:46:10Z","published":"2023-01-15T02:04:02Z","title":"Exploiting Auxiliary Caption for Video Grounding","summary":" Video grounding aims to locate a moment of interest matching the given query\nsentence from an untrimmed video. Previous works ignore the {sparsity dilemma}\nin video annotations, which fails to provide the context information between\npotential events and query sentences in the dataset. In this paper, we contend\nthat exploiting easily available captions which describe general actions, i.e.,\nauxiliary captions defined in our paper, will significantly boost the\nperformance. To this end, we propose an Auxiliary Caption Network (ACNet) for\nvideo grounding. Specifically, we first introduce dense video captioning to\ngenerate dense captions and then obtain auxiliary captions by Non-Auxiliary\nCaption Suppression (NACS). To capture the potential information in auxiliary\ncaptions, we propose Caption Guided Attention (CGA) project the semantic\nrelations between auxiliary captions and query sentences into temporal space\nand fuse them into visual representations. Considering the gap between\nauxiliary captions and ground truth, we propose Asymmetric Cross-modal\nContrastive Learning (ACCL) for constructing more negative pairs to maximize\ncross-modal mutual information. Extensive experiments on three public datasets\n(i.e., ActivityNet Captions, TACoS and ActivityNet-CG) demonstrate that our\nmethod significantly outperforms state-of-the-art methods.\n","authors":["Hongxiang Li","Meng Cao","Xuxin Cheng","Zhihong Zhu","Yaowei Li","Yuexian Zou"],"pdf_url":"https://arxiv.org/pdf/2301.05997v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16016v1","updated":"2024-03-24T05:26:55Z","published":"2024-03-24T05:26:55Z","title":"Fill in the ____ (a Diffusion-based Image Inpainting Pipeline)","summary":" Image inpainting is the process of taking an image and generating lost or\nintentionally occluded portions. Inpainting has countless applications\nincluding restoring previously damaged pictures, restoring the quality of\nimages that have been degraded due to compression, and removing unwanted\nobjects/text. Modern inpainting techniques have shown remarkable ability in\ngenerating sensible completions for images with mask occlusions. In our paper,\nan overview of the progress of inpainting techniques will be provided, along\nwith identifying current leading approaches, focusing on their strengths and\nweaknesses. A critical gap in these existing models will be addressed, focusing\non the ability to prompt and control what exactly is generated. We will\nadditionally justify why we think this is the natural next progressive step\nthat inpainting models must take, and provide multiple approaches to\nimplementing this functionality. 
Finally, we will evaluate the results of our\napproaches by qualitatively checking whether they generate high-quality images\nthat correctly inpaint regions with the objects that they are instructed to\nproduce.\n","authors":["Eyoel Gebre","Krishna Saxena","Timothy Tran"],"pdf_url":"https://arxiv.org/pdf/2403.16016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13897v2","updated":"2024-03-24T05:20:15Z","published":"2023-08-26T14:50:24Z","title":"InsertNeRF: Instilling Generalizability into NeRF with HyperNet Modules","summary":" Generalizing Neural Radiance Fields (NeRF) to new scenes is a significant\nchallenge that existing approaches struggle to address without extensive\nmodifications to vanilla NeRF framework. We introduce InsertNeRF, a method for\nINStilling gEneRalizabiliTy into NeRF. By utilizing multiple plug-and-play\nHyperNet modules, InsertNeRF dynamically tailors NeRF's weights to specific\nreference scenes, transforming multi-scale sampling-aware features into\nscene-specific representations. This novel design allows for more accurate and\nefficient representations of complex appearances and geometries. Experiments\nshow that this method not only achieves superior generalization performance but\nalso provides a flexible pathway for integration with other NeRF-like systems,\neven in sparse input settings. Code will be available\nhttps://github.com/bbbbby-99/InsertNeRF.\n","authors":["Yanqi Bao","Tianyu Ding","Jing Huo","Wenbin Li","Yuxin Li","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2308.13897v2.pdf","comment":"This work was accepted at ICLR 2024"},{"id":"http://arxiv.org/abs/2403.16009v1","updated":"2024-03-24T04:39:40Z","published":"2024-03-24T04:39:40Z","title":"SM2C: Boost the Semi-supervised Segmentation for Medical Image by using\n Meta Pseudo Labels and Mixed Images","summary":" Recently, machine learning-based semantic segmentation algorithms have\ndemonstrated their potential to accurately segment regions and contours in\nmedical images, allowing the precise location of anatomical structures and\nabnormalities. Although medical images are difficult to acquire and annotate,\nsemi-supervised learning methods are efficient in dealing with the scarcity of\nlabeled data. However, overfitting is almost inevitable due to the limited\nimages for training. Furthermore, the intricate shapes of organs and lesions in\nmedical images introduce additional complexity in different cases, preventing\nnetworks from acquiring a strong ability to generalize. To this end, we\nintroduce a novel method called Scaling-up Mix with Multi-Class (SM2C). This\nmethod uses three strategies - scaling-up image size, multi-class mixing, and\nobject shape jittering - to improve the ability to learn semantic features\nwithin medical images. By diversifying the shape of the segmentation objects\nand enriching the semantic information within each sample, the SM2C\ndemonstrates its potential, especially in the training of unlabelled data.\nExtensive experiments demonstrate the effectiveness of the SM2C on three\nbenchmark medical image segmentation datasets. 
The proposed framework shows\nsignificant improvements over state-of-the-art counterparts.\n","authors":["Yifei Wang","Chuhong Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.16009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16005v1","updated":"2024-03-24T04:23:56Z","published":"2024-03-24T04:23:56Z","title":"Knowledge-Enhanced Dual-stream Zero-shot Composed Image Retrieval","summary":" We study the zero-shot Composed Image Retrieval (ZS-CIR) task, which is to\nretrieve the target image given a reference image and a description without\ntraining on the triplet datasets. Previous works generate pseudo-word tokens by\nprojecting the reference image features to the text embedding space. However,\nthey focus on the global visual representation, ignoring the representation of\ndetailed attributes, e.g., color, object number and layout. To address this\nchallenge, we propose a Knowledge-Enhanced Dual-stream zero-shot composed image\nretrieval framework (KEDs). KEDs implicitly models the attributes of the\nreference images by incorporating a database. The database enriches the\npseudo-word tokens by providing relevant images and captions, emphasizing\nshared attribute information in various aspects. In this way, KEDs recognizes\nthe reference image from diverse perspectives. Moreover, KEDs adopts an extra\nstream that aligns pseudo-word tokens with textual concepts, leveraging\npseudo-triplets mined from image-text pairs. The pseudo-word tokens generated\nin this stream are explicitly aligned with fine-grained semantics in the text\nembedding space. Extensive experiments on widely used benchmarks, i.e.\nImageNet-R, COCO object, Fashion-IQ and CIRR, show that KEDs outperforms\nprevious zero-shot composed image retrieval methods.\n","authors":["Yucheng Suo","Fan Ma","Linchao Zhu","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2403.16005v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16003v1","updated":"2024-03-24T04:22:37Z","published":"2024-03-24T04:22:37Z","title":"Diverse Representation Embedding for Lifelong Person Re-Identification","summary":" Lifelong Person Re-Identification (LReID) aims to continuously learn from\nsuccessive data streams, matching individuals across multiple cameras. The key\nchallenge for LReID is how to effectively preserve old knowledge while learning\nnew information incrementally. Task-level domain gaps and limited old task\ndatasets are key factors leading to catastrophic forgetting in ReLD, which are\noverlooked in existing methods. To alleviate this problem, we propose a novel\nDiverse Representation Embedding (DRE) framework for LReID. The proposed DRE\npreserves old knowledge while adapting to new information based on\ninstance-level and task-level layout. Concretely, an Adaptive Constraint Module\n(ACM) is proposed to implement integration and push away operations between\nmultiple representations, obtaining dense embedding subspace for each instance\nto improve matching ability on limited old task datasets. Based on the\nprocessed diverse representation, we interact knowledge between the adjustment\nmodel and the learner model through Knowledge Update (KU) and Knowledge\nPreservation (KP) strategies at the task-level layout, which reduce the\ntask-wise domain gap on both old and new tasks, and exploit diverse\nrepresentation of each instance in limited datasets from old tasks, improving\nmodel performance for extended periods. 
Extensive experiments were conducted on\neleven Re-ID datasets, including five seen datasets for training in order-1 and\norder-2 orders and six unseen datasets for inference. Compared to\nstate-of-the-art methods, our method achieves significantly improved\nperformance in holistic, large-scale, and occluded datasets.\n","authors":["Shiben Liu","Huijie Fan","Qiang Wang","Xiai Chen","Zhi Han","Yandong Tang"],"pdf_url":"https://arxiv.org/pdf/2403.16003v1.pdf","comment":"11 pages,7 Tables,3 Figures"},{"id":"http://arxiv.org/abs/2403.16002v1","updated":"2024-03-24T04:15:50Z","published":"2024-03-24T04:15:50Z","title":"SDSTrack: Self-Distillation Symmetric Adapter Learning for Multi-Modal\n Visual Object Tracking","summary":" Multimodal Visual Object Tracking (VOT) has recently gained significant\nattention due to its robustness. Early research focused on fully fine-tuning\nRGB-based trackers, which was inefficient and lacked generalized representation\ndue to the scarcity of multimodal data. Therefore, recent studies have utilized\nprompt tuning to transfer pre-trained RGB-based trackers to multimodal data.\nHowever, the modality gap limits pre-trained knowledge recall, and the\ndominance of the RGB modality persists, preventing the full utilization of\ninformation from other modalities. To address these issues, we propose a novel\nsymmetric multimodal tracking framework called SDSTrack. We introduce\nlightweight adaptation for efficient fine-tuning, which directly transfers the\nfeature extraction ability from RGB to other domains with a small number of\ntrainable parameters and integrates multimodal features in a balanced,\nsymmetric manner. Furthermore, we design a complementary masked patch\ndistillation strategy to enhance the robustness of trackers in complex\nenvironments, such as extreme weather, poor imaging, and sensor failure.\nExtensive experiments demonstrate that SDSTrack outperforms state-of-the-art\nmethods in various multimodal tracking scenarios, including RGB+Depth,\nRGB+Thermal, and RGB+Event tracking, and exhibits impressive results in extreme\nconditions. Our source code is available at https://github.com/hoqolo/SDSTrack.\n","authors":["Xiaojun Hou","Jiazheng Xing","Yijie Qian","Yaowei Guo","Shuo Xin","Junhao Chen","Kai Tang","Mengmeng Wang","Zhengkai Jiang","Liang Liu","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2403.16002v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.12494v2","updated":"2024-03-24T03:17:24Z","published":"2024-03-19T07:02:08Z","title":"Task-Customized Mixture of Adapters for General Image Fusion","summary":" General image fusion aims at integrating important information from\nmulti-source images. However, due to the significant cross-task gap, the\nrespective fusion mechanism varies considerably in practice, resulting in\nlimited performance across subtasks. To handle this problem, we propose a novel\ntask-customized mixture of adapters (TC-MoA) for general image fusion,\nadaptively prompting various fusion tasks in a unified model. We borrow the\ninsight from the mixture of experts (MoE), taking the experts as efficient\ntuning adapters to prompt a pre-trained foundation model. These adapters are\nshared across different tasks and constrained by mutual information\nregularization, ensuring compatibility with different tasks while\ncomplementarity for multi-source images. 
The task-specific routing networks\ncustomize these adapters to extract task-specific information from different\nsources with dynamic dominant intensity, performing adaptive visual feature\nprompt fusion. Notably, our TC-MoA controls the dominant intensity bias for\ndifferent fusion tasks, successfully unifying multiple fusion tasks in a single\nmodel. Extensive experiments show that TC-MoA outperforms the competing\napproaches in learning commonalities while retaining compatibility for general\nimage fusion (multi-modal, multi-exposure, and multi-focus), and also\ndemonstrating striking controllability on more generalization experiments. The\ncode is available at https://github.com/YangSun22/TC-MoA .\n","authors":["Pengfei Zhu","Yang Sun","Bing Cao","Qinghua Hu"],"pdf_url":"https://arxiv.org/pdf/2403.12494v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.15994v1","updated":"2024-03-24T03:10:39Z","published":"2024-03-24T03:10:39Z","title":"Multi-Scale Spatio-Temporal Graph Convolutional Network for Facial\n Expression Spotting","summary":" Facial expression spotting is a significant but challenging task in facial\nexpression analysis. The accuracy of expression spotting is affected not only\nby irrelevant facial movements but also by the difficulty of perceiving subtle\nmotions in micro-expressions. In this paper, we propose a Multi-Scale\nSpatio-Temporal Graph Convolutional Network (SpoT-GCN) for facial expression\nspotting. To extract more robust motion features, we track both short- and\nlong-term motion of facial muscles in compact sliding windows whose window\nlength adapts to the temporal receptive field of the network. This strategy,\ntermed the receptive field adaptive sliding window strategy, effectively\nmagnifies the motion features while alleviating the problem of severe head\nmovement. The subtle motion features are then converted to a facial graph\nrepresentation, whose spatio-temporal graph patterns are learned by a graph\nconvolutional network. This network learns both local and global features from\nmultiple scales of facial graph structures using our proposed facial local\ngraph pooling (FLGP). Furthermore, we introduce supervised contrastive learning\nto enhance the discriminative capability of our model for difficult-to-classify\nframes. The experimental results on the SAMM-LV and CAS(ME)^2 datasets\ndemonstrate that our method achieves state-of-the-art performance, particularly\nin micro-expression spotting. Ablation studies further verify the effectiveness\nof our proposed modules.\n","authors":["Yicheng Deng","Hideaki Hayashi","Hajime Nagahara"],"pdf_url":"https://arxiv.org/pdf/2403.15994v1.pdf","comment":"Accepted by FG2024"},{"id":"http://arxiv.org/abs/2403.15992v1","updated":"2024-03-24T03:10:07Z","published":"2024-03-24T03:10:07Z","title":"BIMCV-R: A Landmark Dataset for 3D CT Text-Image Retrieval","summary":" The burgeoning integration of 3D medical imaging into healthcare has led to a\nsubstantial increase in the workload of medical professionals. To assist\nclinicians in their diagnostic processes and alleviate their workload, the\ndevelopment of a robust system for retrieving similar case studies presents a\nviable solution. While the concept holds great promise, the field of 3D medical\ntext-image retrieval is currently limited by the absence of robust evaluation\nbenchmarks and curated datasets. 
To remedy this, our study presents a\ngroundbreaking dataset, BIMCV-R (This dataset will be released upon\nacceptance.), which includes an extensive collection of 8,069 3D CT volumes,\nencompassing over 2 million slices, paired with their respective radiological\nreports. Expanding upon the foundational work of our dataset, we craft a\nretrieval strategy, MedFinder. This approach employs a dual-stream network\narchitecture, harnessing the potential of large language models to advance the\nfield of medical image retrieval beyond existing text-image retrieval\nsolutions. It marks our preliminary step towards developing a system capable of\nfacilitating text-to-image, image-to-text, and keyword-based retrieval tasks.\n","authors":["Yinda Chen","Che Liu","Xiaoyu Liu","Rossella Arcucci","Zhiwei Xiong"],"pdf_url":"https://arxiv.org/pdf/2403.15992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02155v2","updated":"2024-03-24T02:57:28Z","published":"2023-12-04T18:59:55Z","title":"GPS-Gaussian: Generalizable Pixel-wise 3D Gaussian Splatting for\n Real-time Human Novel View Synthesis","summary":" We present a new approach, termed GPS-Gaussian, for synthesizing novel views\nof a character in a real-time manner. The proposed method enables 2K-resolution\nrendering under a sparse-view camera setting. Unlike the original Gaussian\nSplatting or neural implicit rendering methods that necessitate per-subject\noptimizations, we introduce Gaussian parameter maps defined on the source views\nand regress directly Gaussian Splatting properties for instant novel view\nsynthesis without any fine-tuning or optimization. To this end, we train our\nGaussian parameter regression module on a large amount of human scan data,\njointly with a depth estimation module to lift 2D parameter maps to 3D space.\nThe proposed framework is fully differentiable and experiments on several\ndatasets demonstrate that our method outperforms state-of-the-art methods while\nachieving an exceeding rendering speed.\n","authors":["Shunyuan Zheng","Boyao Zhou","Ruizhi Shao","Boning Liu","Shengping Zhang","Liqiang Nie","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2312.02155v2.pdf","comment":"Accepted by CVPR 2024. Project page:\n https://shunyuanzheng.github.io/GPS-Gaussian"},{"id":"http://arxiv.org/abs/2403.15990v1","updated":"2024-03-24T02:55:45Z","published":"2024-03-24T02:55:45Z","title":"Mars Spectrometry 2: Gas Chromatography -- Second place solution","summary":" The Mars Spectrometry 2: Gas Chromatography challenge was sponsored by NASA\nand run on the DrivenData competition platform in 2022. This report describes\nthe solution which achieved the second-best score on the competition's test\ndataset. The solution utilized two-dimensional, image-like representations of\nthe competition's chromatography data samples. A number of different\nConvolutional Neural Network models were trained and ensembled for the final\nsubmission.\n","authors":["Dmitry A. Konovalov"],"pdf_url":"https://arxiv.org/pdf/2403.15990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05656v3","updated":"2024-03-24T02:43:55Z","published":"2023-03-10T02:15:58Z","title":"EHRDiff: Exploring Realistic EHR Synthesis with Diffusion Models","summary":" Electronic health records (EHR) contain a wealth of biomedical information,\nserving as valuable resources for the development of precision medicine\nsystems. 
However, privacy concerns have resulted in limited access to\nhigh-quality and large-scale EHR data for researchers, impeding progress in\nmethodological development. Recent research has delved into synthesizing\nrealistic EHR data through generative modeling techniques, where a majority of\nproposed methods relied on generative adversarial networks (GAN) and their\nvariants for EHR synthesis. Despite GAN-based methods attaining\nstate-of-the-art performance in generating EHR data, these approaches are\ndifficult to train and prone to mode collapse. Recently introduced in\ngenerative modeling, diffusion models have established cutting-edge performance\nin image generation, but their efficacy in EHR data synthesis remains largely\nunexplored. In this study, we investigate the potential of diffusion models for\nEHR data synthesis and introduce a novel method, EHRDiff. Through extensive\nexperiments, EHRDiff establishes new state-of-the-art quality for synthetic EHR\ndata, protecting private information in the meanwhile.\n","authors":["Hongyi Yuan","Songchi Zhou","Sheng Yu"],"pdf_url":"https://arxiv.org/pdf/2303.05656v3.pdf","comment":"Accepted by TMLR, preprint of camera-ready version"},{"id":"http://arxiv.org/abs/2403.15981v1","updated":"2024-03-24T02:15:14Z","published":"2024-03-24T02:15:14Z","title":"Exploring Accurate 3D Phenotyping in Greenhouse through Neural Radiance\n Fields","summary":" Accurate collection of plant phenotyping is critical to optimising\nsustainable farming practices in precision agriculture. Traditional phenotyping\nin controlled laboratory environments, while valuable, falls short in\nunderstanding plant growth under real-world conditions. Emerging sensor and\ndigital technologies offer a promising approach for direct phenotyping of\nplants in farm environments. This study investigates a learning-based\nphenotyping method using the Neural Radiance Field to achieve accurate in-situ\nphenotyping of pepper plants in greenhouse environments. To quantitatively\nevaluate the performance of this method, traditional point cloud registration\non 3D scanning data is implemented for comparison. Experimental result shows\nthat NeRF(Neural Radiance Fields) achieves competitive accuracy compared to the\n3D scanning methods. The mean distance error between the scanner-based method\nand the NeRF-based method is 0.865mm. This study shows that the learning-based\nNeRF method achieves similar accuracy to 3D scanning-based methods but with\nimproved scalability and robustness.\n","authors":["unhong Zhao","Wei Ying","Yaoqiang Pan","Zhenfeng Yi","Chao Chen","Kewei Hu","Hanwen Kang"],"pdf_url":"https://arxiv.org/pdf/2403.15981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03695v2","updated":"2024-03-24T02:03:55Z","published":"2024-01-08T06:53:33Z","title":"A Large-Scale Empirical Study on Improving the Fairness of Image\n Classification Models","summary":" Fairness has been a critical issue that affects the adoption of deep learning\nmodels in real practice. To improve model fairness, many existing methods have\nbeen proposed and evaluated to be effective in their own contexts. However,\nthere is still no systematic evaluation among them for a comprehensive\ncomparison under the same context, which makes it hard to understand the\nperformance distinction among them, hindering the research progress and\npractical adoption of them. 
To fill this gap, this paper endeavours to conduct\nthe first large-scale empirical study to comprehensively compare the\nperformance of existing state-of-the-art fairness improving techniques.\nSpecifically, we target the widely-used application scenario of image\nclassification, and utilized three different datasets and five commonly-used\nperformance metrics to assess in total 13 methods from diverse categories. Our\nfindings reveal substantial variations in the performance of each method across\ndifferent datasets and sensitive attributes, indicating over-fitting on\nspecific datasets by many existing methods. Furthermore, different fairness\nevaluation metrics, due to their distinct focuses, yield significantly\ndifferent assessment results. Overall, we observe that pre-processing methods\nand in-processing methods outperform post-processing methods, with\npre-processing methods exhibiting the best performance. Our empirical study\noffers comprehensive recommendations for enhancing fairness in deep learning\nmodels. We approach the problem from multiple dimensions, aiming to provide a\nuniform evaluation platform and inspire researchers to explore more effective\nfairness solutions via a set of implications.\n","authors":["Junjie Yang","Jiajun Jiang","Zeyu Sun","Junjie Chen"],"pdf_url":"https://arxiv.org/pdf/2401.03695v2.pdf","comment":"Accepted by the 33rd ACM SIGSOFT International Symposium on Software\n Testing and Analysis (ISSTA 2024). Please include ISSTA in any citations"},{"id":"http://arxiv.org/abs/2012.04132v4","updated":"2024-03-24T01:23:11Z","published":"2020-12-08T00:37:35Z","title":"A Number Sense as an Emergent Property of the Manipulating Brain","summary":" The ability to understand and manipulate numbers and quantities emerges\nduring childhood, but the mechanism through which humans acquire and develop\nthis ability is still poorly understood. We explore this question through a\nmodel, assuming that the learner is able to pick up and place small objects\nfrom, and to, locations of its choosing, and will spontaneously engage in such\nundirected manipulation. We further assume that the learner's visual system\nwill monitor the changing arrangements of objects in the scene and will learn\nto predict the effects of each action by comparing perception with a\nsupervisory signal from the motor system. We model perception using standard\ndeep networks for feature extraction and classification, and gradient descent\nlearning. Our main finding is that, from learning the task of action\nprediction, an unexpected image representation emerges exhibiting regularities\nthat foreshadow the perception and representation of numbers and quantity.\nThese include distinct categories for zero and the first few natural numbers, a\nstrict ordering of the numbers, and a one-dimensional signal that correlates\nwith numerical quantity. As a result, our model acquires the ability to\nestimate numerosity, i.e. the number of objects in the scene, as well as\nsubitization, i.e. the ability to recognize at a glance the exact number of\nobjects in small scenes. Remarkably, subitization and numerosity estimation\nextrapolate to scenes containing many objects, far beyond the three objects\nused during training. We conclude that important aspects of a facility with\nnumbers and quantities may be learned with supervision from a simple\npre-training task. 
Our observations suggest that cross-modal learning is a\npowerful learning mechanism that may be harnessed in artificial intelligence.\n","authors":["Neehar Kondapaneni","Pietro Perona"],"pdf_url":"https://arxiv.org/pdf/2012.04132v4.pdf","comment":"16 pages, 5 figures, 15 supplemental figures"},{"id":"http://arxiv.org/abs/2403.15977v1","updated":"2024-03-24T01:20:08Z","published":"2024-03-24T01:20:08Z","title":"Towards Two-Stream Foveation-based Active Vision Learning","summary":" Deep neural network (DNN) based machine perception frameworks process the\nentire input in a one-shot manner to provide answers to both \"what object is\nbeing observed\" and \"where it is located\". In contrast, the \"two-stream\nhypothesis\" from neuroscience explains the neural processing in the human\nvisual cortex as an active vision system that utilizes two separate regions of\nthe brain to answer the what and the where questions. In this work, we propose\na machine learning framework inspired by the \"two-stream hypothesis\" and\nexplore the potential benefits that it offers. Specifically, the proposed\nframework models the following mechanisms: 1) ventral (what) stream focusing on\nthe input regions perceived by the fovea part of an eye (foveation), 2) dorsal\n(where) stream providing visual guidance, and 3) iterative processing of the\ntwo streams to calibrate visual focus and process the sequence of focused image\npatches. The training of the proposed framework is accomplished by label-based\nDNN training for the ventral stream model and reinforcement learning for the\ndorsal stream model. We show that the two-stream foveation-based learning is\napplicable to the challenging task of weakly-supervised object localization\n(WSOL), where the training data is limited to the object class or its\nattributes. The framework is capable of both predicting the properties of an\nobject and successfully localizing it by predicting its bounding box. We also\nshow that, due to the independent nature of the two streams, the dorsal model\ncan be applied on its own to unseen images to localize objects from different\ndatasets.\n","authors":["Timur Ibrayev","Amitangshu Mukherjee","Sai Aparna Aketi","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2403.15977v1.pdf","comment":"18 pages, 14 figures, Under consideration at IEEE Transactions on\n Cognitive and Developmental Systems"},{"id":"http://arxiv.org/abs/2403.15974v1","updated":"2024-03-24T00:46:40Z","published":"2024-03-24T00:46:40Z","title":"CBGT-Net: A Neuromimetic Architecture for Robust Classification of\n Streaming Data","summary":" This paper describes CBGT-Net, a neural network model inspired by the\ncortico-basal ganglia-thalamic (CBGT) circuits found in mammalian brains.\nUnlike traditional neural network models, which either generate an output for\neach provided input, or an output after a fixed sequence of inputs, the\nCBGT-Net learns to produce an output after a sufficient criteria for evidence\nis achieved from a stream of observed data. For each observation, the CBGT-Net\ngenerates a vector that explicitly represents the amount of evidence the\nobservation provides for each potential decision, accumulates the evidence over\ntime, and generates a decision when the accumulated evidence exceeds a\npre-defined threshold. We evaluate the proposed model on two image\nclassification tasks, where models need to predict image categories based on a\nstream of small patches extracted from the image. 
We show that the CBGT-Net\nprovides improved accuracy and robustness compared to models trained to\nclassify from a single patch, and models leveraging an LSTM layer to classify\nfrom a fixed sequence length of patches.\n","authors":["Shreya Sharma","Dana Hughes","Katia Sycara"],"pdf_url":"https://arxiv.org/pdf/2403.15974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11868v2","updated":"2024-03-24T00:11:08Z","published":"2023-10-18T10:36:34Z","title":"To Generate or Not? Safety-Driven Unlearned Diffusion Models Are Still\n Easy To Generate Unsafe Images ... For Now","summary":" The recent advances in diffusion models (DMs) have revolutionized the\ngeneration of realistic and complex images. However, these models also\nintroduce potential safety hazards, such as producing harmful content and\ninfringing data copyrights. Despite the development of safety-driven unlearning\ntechniques to counteract these challenges, doubts about their efficacy persist.\nTo tackle this issue, we introduce an evaluation framework that leverages\nadversarial prompts to discern the trustworthiness of these safety-driven DMs\nafter they have undergone the process of unlearning harmful concepts.\nSpecifically, we investigated the adversarial robustness of DMs, assessed by\nadversarial prompts, when eliminating unwanted concepts, styles, and objects.\nWe develop an effective and efficient adversarial prompt generation approach\nfor DMs, termed UnlearnDiffAtk. This method capitalizes on the intrinsic\nclassification abilities of DMs to simplify the creation of adversarial\nprompts, thereby eliminating the need for auxiliary classification or diffusion\nmodels.Through extensive benchmarking, we evaluate the robustness of five\nwidely-used safety-driven unlearned DMs (i.e., DMs after unlearning undesirable\nconcepts, styles, or objects) across a variety of tasks. Our results\ndemonstrate the effectiveness and efficiency merits of UnlearnDiffAtk over the\nstate-of-the-art adversarial prompt generation method and reveal the lack of\nrobustness of current safety-driven unlearning techniques when applied to DMs.\nCodes are available at https://github.com/OPTML-Group/Diffusion-MU-Attack.\nWARNING: This paper contains model outputs that may be offensive in nature.\n","authors":["Yimeng Zhang","Jinghan Jia","Xin Chen","Aochuan Chen","Yihua Zhang","Jiancheng Liu","Ke Ding","Sijia Liu"],"pdf_url":"https://arxiv.org/pdf/2310.11868v2.pdf","comment":"Codes are available at\n https://github.com/OPTML-Group/Diffusion-MU-Attack"},{"id":"http://arxiv.org/abs/1908.01978v2","updated":"2024-03-24T17:47:06Z","published":"2019-08-06T06:44:43Z","title":"Multi-view Deep Subspace Clustering Networks","summary":" Multi-view subspace clustering aims to discover the inherent structure of\ndata by fusing multiple views of complementary information. Most existing\nmethods first extract multiple types of handcrafted features and then learn a\njoint affinity matrix for clustering. The disadvantage of this approach lies in\ntwo aspects: 1) multi-view relations are not embedded into feature learning,\nand 2) the end-to-end learning manner of deep learning is not suitable for\nmulti-view clustering. Even when deep features have been extracted, it is a\nnontrivial problem to choose a proper backbone for clustering on different\ndatasets. To address these issues, we propose the Multi-view Deep Subspace\nClustering Networks (MvDSCN), which learns a multi-view self-representation\nmatrix in an end-to-end manner. 
The MvDSCN consists of two sub-networks, \\ie, a\ndiversity network (Dnet) and a universality network (Unet). A latent space is\nbuilt using deep convolutional autoencoders, and a self-representation matrix\nis learned in the latent space using a fully connected layer. Dnet learns\nview-specific self-representation matrices, whereas Unet learns a common\nself-representation matrix for all views. To exploit the complementarity of\nmulti-view representations, the Hilbert--Schmidt independence criterion (HSIC)\nis introduced as a diversity regularizer that captures the nonlinear,\nhigh-order inter-view relations. Because different views share the same label\nspace, the self-representation matrices of each view are aligned to the common\none by universality regularization. The MvDSCN also unifies multiple backbones\nto boost clustering performance and avoid the need for model selection.\nExperiments demonstrate the superiority of the MvDSCN.\n","authors":["Pengfei Zhu","Xinjie Yao","Yu Wang","Binyuan Hui","Dawei Du","Qinghua Hu"],"pdf_url":"https://arxiv.org/pdf/1908.01978v2.pdf","comment":"Accepted by T-CYB"}]},"2024-03-26T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.17937v1","updated":"2024-03-26T17:59:58Z","published":"2024-03-26T17:59:58Z","title":"Efficient Video Object Segmentation via Modulated Cross-Attention Memory","summary":" Recently, transformer-based approaches have shown promising results for\nsemi-supervised video object segmentation. However, these approaches typically\nstruggle on long videos due to increased GPU memory demands, as they frequently\nexpand the memory bank every few frames. We propose a transformer-based\napproach, named MAVOS, that introduces an optimized and dynamic long-term\nmodulated cross-attention (MCA) memory to model temporal smoothness without\nrequiring frequent memory expansion. The proposed MCA effectively encodes both\nlocal and global features at various levels of granularity while efficiently\nmaintaining consistent speed regardless of the video length. Extensive\nexperiments on multiple benchmarks, LVOS, Long-Time Video, and DAVIS 2017,\ndemonstrate the effectiveness of our proposed contributions leading to\nreal-time inference and markedly reduced memory demands without any degradation\nin segmentation accuracy on long videos. Compared to the best existing\ntransformer-based approach, our MAVOS increases the speed by 7.6x, while\nsignificantly reducing the GPU memory by 87% with comparable segmentation\nperformance on short and long video datasets. Notably on the LVOS dataset, our\nMAVOS achieves a J&F score of 63.3% while operating at 37 frames per second\n(FPS) on a single V100 GPU. Our code and models will be publicly available at:\nhttps://github.com/Amshaker/MAVOS.\n","authors":["Abdelrahman Shaker","Syed Talal Wasim","Martin Danelljan","Salman Khan","Ming-Hsuan Yang","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.17937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17936v1","updated":"2024-03-26T17:59:52Z","published":"2024-03-26T17:59:52Z","title":"ConvoFusion: Multi-Modal Conversational Diffusion for Co-Speech Gesture\n Synthesis","summary":" Gestures play a key role in human communication. Recent methods for co-speech\ngesture generation, while managing to generate beat-aligned motions, struggle\ngenerating gestures that are semantically aligned with the utterance. 
Compared\nto beat gestures that align naturally to the audio signal, semantically\ncoherent gestures require modeling the complex interactions between the\nlanguage and human motion, and can be controlled by focusing on certain words.\nTherefore, we present ConvoFusion, a diffusion-based approach for multi-modal\ngesture synthesis, which can not only generate gestures based on multi-modal\nspeech inputs, but can also facilitate controllability in gesture synthesis.\nOur method proposes two guidance objectives that allow the users to modulate\nthe impact of different conditioning modalities (e.g. audio vs text) as well as\nto choose certain words to be emphasized during gesturing. Our method is\nversatile in that it can be trained either for generating monologue gestures or\neven the conversational gestures. To further advance the research on\nmulti-party interactive gestures, the DnD Group Gesture dataset is released,\nwhich contains 6 hours of gesture data showing 5 people interacting with one\nanother. We compare our method with several recent works and demonstrate\neffectiveness of our method on a variety of tasks. We urge the reader to watch\nour supplementary video at our website.\n","authors":["Muhammad Hamza Mughal","Rishabh Dabral","Ikhsanul Habibie","Lucia Donatelli","Marc Habermann","Christian Theobalt"],"pdf_url":"https://arxiv.org/pdf/2403.17936v1.pdf","comment":"CVPR 2024. Project Page:\n https://vcai.mpi-inf.mpg.de/projects/ConvoFusion/"},{"id":"http://arxiv.org/abs/2403.17935v1","updated":"2024-03-26T17:59:24Z","published":"2024-03-26T17:59:24Z","title":"OmniVid: A Generative Framework for Universal Video Understanding","summary":" The core of video understanding tasks, such as recognition, captioning, and\ntracking, is to automatically detect objects or actions in a video and analyze\ntheir temporal evolution. Despite sharing a common goal, different tasks often\nrely on distinct model architectures and annotation formats. In contrast,\nnatural language processing benefits from a unified output space, i.e., text\nsequences, which simplifies the training of powerful foundational language\nmodels, such as GPT-3, with extensive training corpora. Inspired by this, we\nseek to unify the output space of video understanding tasks by using languages\nas labels and additionally introducing time and box tokens. In this way, a\nvariety of video tasks could be formulated as video-grounded token generation.\nThis enables us to address various types of video tasks, including\nclassification (such as action recognition), captioning (covering clip\ncaptioning, video question answering, and dense video captioning), and\nlocalization tasks (such as visual object tracking) within a fully shared\nencoder-decoder architecture, following a generative framework. Through\ncomprehensive experiments, we demonstrate such a simple and straightforward\nidea is quite effective and can achieve state-of-the-art or competitive results\non seven video benchmarks, providing a novel perspective for more universal\nvideo understanding. 
Code is available at https://github.com/wangjk666/OmniVid.\n","authors":["Junke Wang","Dongdong Chen","Chong Luo","Bo He","Lu Yuan","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2403.17935v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17934v1","updated":"2024-03-26T17:59:23Z","published":"2024-03-26T17:59:23Z","title":"AiOS: All-in-One-Stage Expressive Human Pose and Shape Estimation","summary":" Expressive human pose and shape estimation (a.k.a. 3D whole-body mesh\nrecovery) involves the human body, hand, and expression estimation. Most\nexisting methods have tackled this task in a two-stage manner, first detecting\nthe human body part with an off-the-shelf detection model and inferring the\ndifferent human body parts individually. Despite the impressive results\nachieved, these methods suffer from 1) loss of valuable contextual information\nvia cropping, 2) introducing distractions, and 3) lacking inter-association\namong different persons and body parts, inevitably causing performance\ndegradation, especially for crowded scenes. To address these issues, we\nintroduce a novel all-in-one-stage framework, AiOS, for multiple expressive\nhuman pose and shape recovery without an additional human detection step.\nSpecifically, our method is built upon DETR, which treats multi-person\nwhole-body mesh recovery task as a progressive set prediction problem with\nvarious sequential detection. We devise the decoder tokens and extend them to\nour task. Specifically, we first employ a human token to probe a human location\nin the image and encode global features for each instance, which provides a\ncoarse location for the later transformer block. Then, we introduce a\njoint-related token to probe the human joint in the image and encoder a\nfine-grained local feature, which collaborates with the global feature to\nregress the whole-body mesh. This straightforward but effective model\noutperforms previous state-of-the-art methods by a 9% reduction in NMVE on\nAGORA, a 30% reduction in PVE on EHF, a 10% reduction in PVE on ARCTIC, and a\n3% reduction in PVE on EgoBody.\n","authors":["Qingping Sun","Yanjun Wang","Ailing Zeng","Wanqi Yin","Chen Wei","Wenjia Wang","Haiyi Mei","Chi Sing Leung","Ziwei Liu","Lei Yang","Zhongang Cai"],"pdf_url":"https://arxiv.org/pdf/2403.17934v1.pdf","comment":"Homepage: https://ttxskk.github.io/AiOS/"},{"id":"http://arxiv.org/abs/2403.17933v1","updated":"2024-03-26T17:58:29Z","published":"2024-03-26T17:58:29Z","title":"SLEDGE: Synthesizing Simulation Environments for Driving Agents with\n Generative Models","summary":" SLEDGE is the first generative simulator for vehicle motion planning trained\non real-world driving logs. Its core component is a learned model that is able\nto generate agent bounding boxes and lane graphs. The model's outputs serve as\nan initial state for traffic simulation. The unique properties of the entities\nto be generated for SLEDGE, such as their connectivity and variable count per\nscene, render the naive application of most modern generative models to this\ntask non-trivial. Therefore, together with a systematic study of existing lane\ngraph representations, we introduce a novel raster-to-vector autoencoder\n(RVAE). It encodes agents and the lane graph into distinct channels in a\nrasterized latent map. This facilitates both lane-conditioned agent generation\nand combined generation of lanes and agents with a Diffusion Transformer. 
Using\ngenerated entities in SLEDGE enables greater control over the simulation, e.g.\nupsampling turns or increasing traffic density. Further, SLEDGE can support\n500m long routes, a capability not found in existing data-driven simulators\nlike nuPlan. It presents new challenges for planning algorithms, evidenced by\nfailure rates of over 40% for PDM, the winner of the 2023 nuPlan challenge,\nwhen tested on hard routes and dense traffic generated by our model. Compared\nto nuPlan, SLEDGE requires 500$\\times$ less storage to set up (<4GB), making it\na more accessible option and helping with democratizing future research in this\nfield.\n","authors":["Kashyap Chitta","Daniel Dauner","Andreas Geiger"],"pdf_url":"https://arxiv.org/pdf/2403.17933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17931v1","updated":"2024-03-26T17:58:22Z","published":"2024-03-26T17:58:22Z","title":"Track Everything Everywhere Fast and Robustly","summary":" We propose a novel test-time optimization approach for efficiently and\nrobustly tracking any pixel at any time in a video. The latest state-of-the-art\noptimization-based tracking technique, OmniMotion, requires a prohibitively\nlong optimization time, rendering it impractical for downstream applications.\nOmniMotion is sensitive to the choice of random seeds, leading to unstable\nconvergence. To improve efficiency and robustness, we introduce a novel\ninvertible deformation network, CaDeX++, which factorizes the function\nrepresentation into a local spatial-temporal feature grid and enhances the\nexpressivity of the coupling blocks with non-linear functions. While CaDeX++\nincorporates a stronger geometric bias within its architectural design, it also\ntakes advantage of the inductive bias provided by the vision foundation models.\nOur system utilizes monocular depth estimation to represent scene geometry and\nenhances the objective by incorporating DINOv2 long-term semantics to regulate\nthe optimization process. Our experiments demonstrate a substantial improvement\nin training speed (more than \\textbf{10 times} faster), robustness, and\naccuracy in tracking over the SoTA optimization-based method OmniMotion.\n","authors":["Yunzhou Song","Jiahui Lei","Ziyun Wang","Lingjie Liu","Kostas Daniilidis"],"pdf_url":"https://arxiv.org/pdf/2403.17931v1.pdf","comment":"project page: https://timsong412.github.io/FastOmniTrack/"},{"id":"http://arxiv.org/abs/2403.17929v1","updated":"2024-03-26T17:58:07Z","published":"2024-03-26T17:58:07Z","title":"Towards Explaining Hypercomplex Neural Networks","summary":" Hypercomplex neural networks are gaining increasing interest in the deep\nlearning community. The attention directed towards hypercomplex models\noriginates from several aspects, spanning from purely theoretical and\nmathematical characteristics to the practical advantage of lightweight models\nover conventional networks, and their unique properties to capture both global\nand local relations. In particular, a branch of these architectures,\nparameterized hypercomplex neural networks (PHNNs), has also gained popularity\ndue to their versatility across a multitude of application domains.\nNonetheless, only few attempts have been made to explain or interpret their\nintricacies. In this paper, we propose inherently interpretable PHNNs and\nquaternion-like networks, thus without the need for any post-hoc method. To\nachieve this, we define a type of cosine-similarity transform within the\nparameterized hypercomplex domain. 
This PHB-cos transform induces weight\nalignment with relevant input features and allows to reduce the model into a\nsingle linear transform, rendering it directly interpretable. In this work, we\nstart to draw insights into how this unique branch of neural models operates.\nWe observe that hypercomplex networks exhibit a tendency to concentrate on the\nshape around the main object of interest, in addition to the shape of the\nobject itself. We provide a thorough analysis, studying single neurons of\ndifferent layers and comparing them against how real-valued networks learn. The\ncode of the paper is available at https://github.com/ispamm/HxAI.\n","authors":["Eleonora Lopez","Eleonora Grassucci","Debora Capriotti","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2403.17929v1.pdf","comment":"The paper has been accepted at IEEE WCCI 2024"},{"id":"http://arxiv.org/abs/2403.17926v1","updated":"2024-03-26T17:57:20Z","published":"2024-03-26T17:57:20Z","title":"FastCAR: Fast Classification And Regression Multi-Task Learning via Task\n Consolidation for Modelling a Continuous Property Variable of Object Classes","summary":" FastCAR is a novel task consolidation approach in Multi-Task Learning (MTL)\nfor a classification and a regression task, despite task heterogeneity with\nonly subtle correlation. It addresses object classification and continuous\nproperty variable regression, a crucial use case in science and engineering.\nFastCAR involves a labeling transformation approach that can be used with a\nsingle-task regression network architecture. FastCAR outperforms traditional\nMTL model families, parametrized in the landscape of architecture and loss\nweighting schemes, when learning of both tasks are collectively considered\n(classification accuracy of 99.54%, regression mean absolute percentage error\nof 2.3%). The experiments performed used an Advanced Steel Property dataset\ncontributed by us. The dataset comprises 4536 images of 224x224 pixels,\nannotated with object classes and hardness properties that take continuous\nvalues. With the labeling transformation and single-task regression network\narchitecture, FastCAR achieves reduced latency and time efficiency.\n","authors":["Anoop Kini","Andreas Jansche","Timo Bernthaler","Gerhard Schneider"],"pdf_url":"https://arxiv.org/pdf/2403.17926v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17924v1","updated":"2024-03-26T17:57:05Z","published":"2024-03-26T17:57:05Z","title":"AID: Attention Interpolation of Text-to-Image Diffusion","summary":" Conditional diffusion models can create unseen images in various settings,\naiding image interpolation. Interpolation in latent spaces is well-studied, but\ninterpolation with specific conditions like text or poses is less understood.\nSimple approaches, such as linear interpolation in the space of conditions,\noften result in images that lack consistency, smoothness, and fidelity. To that\nend, we introduce a novel training-free technique named Attention Interpolation\nvia Diffusion (AID). Our key contributions include 1) proposing an inner/outer\ninterpolated attention layer; 2) fusing the interpolated attention with\nself-attention to boost fidelity; and 3) applying beta distribution to\nselection to increase smoothness. We also present a variant, Prompt-guided\nAttention Interpolation via Diffusion (PAID), that considers interpolation as a\ncondition-dependent generative process. 
This method enables the creation of new\nimages with greater consistency, smoothness, and efficiency, and offers control\nover the exact path of interpolation. Our approach demonstrates effectiveness\nfor conceptual and spatial interpolation. Code and demo are available at\nhttps://github.com/QY-H00/attention-interpolation-diffusion.\n","authors":["Qiyuan He","Jinghao Wang","Ziwei Liu","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2403.17924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17920v1","updated":"2024-03-26T17:55:11Z","published":"2024-03-26T17:55:11Z","title":"TC4D: Trajectory-Conditioned Text-to-4D Generation","summary":" Recent techniques for text-to-4D generation synthesize dynamic 3D scenes\nusing supervision from pre-trained text-to-video models. However, existing\nrepresentations for motion, such as deformation models or time-dependent neural\nrepresentations, are limited in the amount of motion they can generate-they\ncannot synthesize motion extending far beyond the bounding box used for volume\nrendering. The lack of a more flexible motion model contributes to the gap in\nrealism between 4D generation methods and recent, near-photorealistic video\ngeneration models. Here, we propose TC4D: trajectory-conditioned text-to-4D\ngeneration, which factors motion into global and local components. We represent\nthe global motion of a scene's bounding box using rigid transformation along a\ntrajectory parameterized by a spline. We learn local deformations that conform\nto the global trajectory using supervision from a text-to-video model. Our\napproach enables the synthesis of scenes animated along arbitrary trajectories,\ncompositional scene generation, and significant improvements to the realism and\namount of generated motion, which we evaluate qualitatively and through a user\nstudy. Video results can be viewed on our website:\nhttps://sherwinbahmani.github.io/tc4d.\n","authors":["Sherwin Bahmani","Xian Liu","Yifan Wang","Ivan Skorokhodov","Victor Rong","Ziwei Liu","Xihui Liu","Jeong Joon Park","Sergey Tulyakov","Gordon Wetzstein","Andrea Tagliasacchi","David B. Lindell"],"pdf_url":"https://arxiv.org/pdf/2403.17920v1.pdf","comment":"Project Page: https://sherwinbahmani.github.io/tc4d"},{"id":"http://arxiv.org/abs/2403.17916v1","updated":"2024-03-26T17:53:27Z","published":"2024-03-26T17:53:27Z","title":"CMP: Cooperative Motion Prediction with Multi-Agent Communication","summary":" The confluence of the advancement of Autonomous Vehicles (AVs) and the\nmaturity of Vehicle-to-Everything (V2X) communication has enabled the\ncapability of cooperative connected and automated vehicles (CAVs). Building on\ntop of cooperative perception, this paper explores the feasibility and\neffectiveness of cooperative motion prediction. Our method, CMP, takes LiDAR\nsignals as input to enhance tracking and prediction capabilities. Unlike\nprevious work that focuses separately on either cooperative perception or\nmotion prediction, our framework, to the best of our knowledge, is the first to\naddress the unified problem where CAVs share information in both perception and\nprediction modules. Incorporated into our design is the unique capability to\ntolerate realistic V2X bandwidth limitations and transmission delays, while\ndealing with bulky perception representations. We also propose a prediction\naggregation module, which unifies the predictions obtained by different CAVs\nand generates the final prediction. 
Through extensive experiments and ablation\nstudies, we demonstrate the effectiveness of our method in cooperative\nperception, tracking, and motion prediction tasks. In particular, CMP reduces\nthe average prediction error by 17.2\\% with fewer missing detections compared\nwith the no cooperation setting. Our work marks a significant step forward in\nthe cooperative capabilities of CAVs, showcasing enhanced performance in\ncomplex scenarios.\n","authors":["Zhuoyuan Wu","Yuping Wang","Hengbo Ma","Zhaowei Li","Hang Qiu","Jiachen Li"],"pdf_url":"https://arxiv.org/pdf/2403.17916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17915v1","updated":"2024-03-26T17:52:23Z","published":"2024-03-26T17:52:23Z","title":"Leveraging Near-Field Lighting for Monocular Depth Estimation from\n Endoscopy Videos","summary":" Monocular depth estimation in endoscopy videos can enable assistive and\nrobotic surgery to obtain better coverage of the organ and detection of various\nhealth issues. Despite promising progress on mainstream, natural image depth\nestimation, techniques perform poorly on endoscopy images due to a lack of\nstrong geometric features and challenging illumination effects. In this paper,\nwe utilize the photometric cues, i.e., the light emitted from an endoscope and\nreflected by the surface, to improve monocular depth estimation. We first\ncreate two novel loss functions with supervised and self-supervised variants\nthat utilize a per-pixel shading representation. We then propose a novel depth\nrefinement network (PPSNet) that leverages the same per-pixel shading\nrepresentation. Finally, we introduce teacher-student transfer learning to\nproduce better depth maps from both synthetic data with supervision and\nclinical data with self-supervision. We achieve state-of-the-art results on the\nC3VD dataset while estimating high-quality depth maps from clinical data. Our\ncode, pre-trained models, and supplementary materials can be found on our\nproject page: https://ppsnet.github.io/\n","authors":["Akshay Paruchuri","Samuel Ehrenstein","Shuxian Wang","Inbar Fried","Stephen M. Pizer","Marc Niethammer","Roni Sengupta"],"pdf_url":"https://arxiv.org/pdf/2403.17915v1.pdf","comment":"26 pages, 7 tables, 7 figures"},{"id":"http://arxiv.org/abs/2403.17909v1","updated":"2024-03-26T17:46:25Z","published":"2024-03-26T17:46:25Z","title":"ELGC-Net: Efficient Local-Global Context Aggregation for Remote Sensing\n Change Detection","summary":" Deep learning has shown remarkable success in remote sensing change detection\n(CD), aiming to identify semantic change regions between co-registered\nsatellite image pairs acquired at distinct time stamps. However, existing\nconvolutional neural network and transformer-based frameworks often struggle to\naccurately segment semantic change regions. Moreover, transformers-based\nmethods with standard self-attention suffer from quadratic computational\ncomplexity with respect to the image resolution, making them less practical for\nCD tasks with limited training data. To address these issues, we propose an\nefficient change detection framework, ELGC-Net, which leverages rich contextual\ninformation to precisely estimate change regions while reducing the model size.\nOur ELGC-Net comprises a Siamese encoder, fusion modules, and a decoder. 
The\nfocus of our design is the introduction of an Efficient Local-Global Context\nAggregator module within the encoder, capturing enhanced global context and\nlocal spatial information through a novel pooled-transpose (PT) attention and\ndepthwise convolution, respectively. The PT attention employs pooling\noperations for robust feature extraction and minimizes computational cost with\ntransposed attention. Extensive experiments on three challenging CD datasets\ndemonstrate that ELGC-Net outperforms existing methods. Compared to the recent\ntransformer-based CD approach (ChangeFormer), ELGC-Net achieves a 1.4% gain in\nintersection over union metric on the LEVIR-CD dataset, while significantly\nreducing trainable parameters. Our proposed ELGC-Net sets a new\nstate-of-the-art performance in remote sensing change detection benchmarks.\nFinally, we also introduce ELGC-Net-LW, a lighter variant with significantly\nreduced computational complexity, suitable for resource-constrained settings,\nwhile achieving comparable performance. Project url\nhttps://github.com/techmn/elgcnet.\n","authors":["Mubashir Noman","Mustansar Fiaz","Hisham Cholakkal","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.17909v1.pdf","comment":"accepted at IEEE TGRS"},{"id":"http://arxiv.org/abs/2403.17905v1","updated":"2024-03-26T17:45:06Z","published":"2024-03-26T17:45:06Z","title":"Scalable Non-Cartesian Magnetic Resonance Imaging with R2D2","summary":" We propose a new approach for non-Cartesian magnetic resonance image\nreconstruction. While unrolled architectures provide robustness via\ndata-consistency layers, embedding measurement operators in Deep Neural Network\n(DNN) can become impractical at large scale. Alternative Plug-and-Play (PnP)\napproaches, where the denoising DNNs are blind to the measurement setting, are\nnot affected by this limitation and have also proven effective, but their\nhighly iterative nature also affects scalability. To address this scalability\nchallenge, we leverage the \"Residual-to-Residual DNN series for high-Dynamic\nrange imaging (R2D2)\" approach recently introduced in astronomical imaging.\nR2D2's reconstruction is formed as a series of residual images, iteratively\nestimated as outputs of DNNs taking the previous iteration's image estimate and\nassociated data residual as inputs. The method can be interpreted as a learned\nversion of the Matching Pursuit algorithm. We demonstrate R2D2 in simulation,\nconsidering radial k-space sampling acquisition sequences. 
Our preliminary\nresults suggest that R2D2 achieves: (i) suboptimal performance compared to its\nunrolled incarnation R2D2-Net, which is however non-scalable due to the\nnecessary embedding of NUFFT-based data-consistency layers; (ii) superior\nreconstruction quality to a scalable version of R2D2-Net embedding an FFT-based\napproximation for data consistency; (iii) superior reconstruction quality to\nPnP, while only requiring few iterations.\n","authors":["Chen Yiwei","Tang Chao","Aghabiglou Amir","Chu Chung San","Wiaux Yves"],"pdf_url":"https://arxiv.org/pdf/2403.17905v1.pdf","comment":"submitted to IEEE EUSIPCO 2024"},{"id":"http://arxiv.org/abs/2403.17902v1","updated":"2024-03-26T17:43:15Z","published":"2024-03-26T17:43:15Z","title":"Serpent: Scalable and Efficient Image Restoration via Multi-scale\n Structured State Space Models","summary":" The landscape of computational building blocks of efficient image restoration\narchitectures is dominated by a combination of convolutional processing and\nvarious attention mechanisms. However, convolutional filters are inherently\nlocal and therefore struggle at modeling long-range dependencies in images. On\nthe other hand, attention excels at capturing global interactions between\narbitrary image regions, however at a quadratic cost in image dimension. In\nthis work, we propose Serpent, an architecture that leverages recent advances\nin state space models (SSMs) in its core computational block. SSMs, originally\nintroduced for sequence modeling, can maintain a global receptive field with a\nfavorable linear scaling in input size. Our preliminary results demonstrate\nthat Serpent can achieve reconstruction quality on par with state-of-the-art\ntechniques, while requiring orders of magnitude less compute (up to $150$ fold\nreduction in FLOPS) and a factor of up to $5\\times$ less GPU memory while\nmaintaining a compact model size.\n","authors":["Mohammad Shahab Sepehri","Zalan Fabian","Mahdi Soltanolkotabi"],"pdf_url":"https://arxiv.org/pdf/2403.17902v1.pdf","comment":"7 pages, 5 figures, preliminary workshop submission of a\n comprehensive work to be released soon"},{"id":"http://arxiv.org/abs/2307.16897v2","updated":"2024-03-26T17:40:47Z","published":"2023-07-31T17:59:48Z","title":"DiVa-360: The Dynamic Visual Dataset for Immersive Neural Fields","summary":" Advances in neural fields are enabling high-fidelity capture of the shape and\nappearance of dynamic 3D scenes. However, their capabilities lag behind those\noffered by conventional representations such as 2D videos because of\nalgorithmic challenges and the lack of large-scale multi-view real-world\ndatasets. We address the dataset limitation with DiVa-360, a real-world 360\ndynamic visual dataset that contains synchronized high-resolution and\nlong-duration multi-view video sequences of table-scale scenes captured using a\ncustomized low-cost system with 53 cameras. It contains 21 object-centric\nsequences categorized by different motion types, 25 intricate hand-object\ninteraction sequences, and 8 long-duration sequences for a total of 17.4 M\nimage frames. In addition, we provide foreground-background segmentation masks,\nsynchronized audio, and text descriptions. 
We benchmark the state-of-the-art\ndynamic neural field methods on DiVa-360 and provide insights about existing\nmethods and future challenges on long-duration neural field capture.\n","authors":["Cheng-You Lu","Peisen Zhou","Angela Xing","Chandradeep Pokhariya","Arnab Dey","Ishaan Shah","Rugved Mavidipalli","Dylan Hu","Andrew Comport","Kefan Chen","Srinath Sridhar"],"pdf_url":"https://arxiv.org/pdf/2307.16897v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17898v1","updated":"2024-03-26T17:39:36Z","published":"2024-03-26T17:39:36Z","title":"Octree-GS: Towards Consistent Real-time Rendering with LOD-Structured 3D\n Gaussians","summary":" The recent 3D Gaussian splatting (3D-GS) has shown remarkable rendering\nfidelity and efficiency compared to NeRF-based neural scene representations.\nWhile demonstrating the potential for real-time rendering, 3D-GS encounters\nrendering bottlenecks in large scenes with complex details due to an excessive\nnumber of Gaussian primitives located within the viewing frustum. This\nlimitation is particularly noticeable in zoom-out views and can lead to\ninconsistent rendering speeds in scenes with varying details. Moreover, it\noften struggles to capture the corresponding level of details at different\nscales with its heuristic density control operation. Inspired by the\nLevel-of-Detail (LOD) techniques, we introduce Octree-GS, featuring an\nLOD-structured 3D Gaussian approach supporting level-of-detail decomposition\nfor scene representation that contributes to the final rendering results. Our\nmodel dynamically selects the appropriate level from the set of\nmulti-resolution anchor points, ensuring consistent rendering performance with\nadaptive LOD adjustments while maintaining high-fidelity rendering results.\n","authors":["Kerui Ren","Lihan Jiang","Tao Lu","Mulin Yu","Linning Xu","Zhangkai Ni","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2403.17898v1.pdf","comment":"Project page: https://city-super.github.io/octree-gs/"},{"id":"http://arxiv.org/abs/2403.17893v1","updated":"2024-03-26T17:29:26Z","published":"2024-03-26T17:29:26Z","title":"A Survey on 3D Egocentric Human Pose Estimation","summary":" Egocentric human pose estimation aims to estimate human body poses and\ndevelop body representations from a first-person camera perspective. It has\ngained vast popularity in recent years because of its wide range of\napplications in sectors like XR-technologies, human-computer interaction, and\nfitness tracking. However, to the best of our knowledge, there is no systematic\nliterature review based on the proposed solutions regarding egocentric 3D human\npose estimation. To that end, the aim of this survey paper is to provide an\nextensive overview of the current state of egocentric pose estimation research.\nIn this paper, we categorize and discuss the popular datasets and the different\npose estimation models, highlighting the strengths and weaknesses of different\nmethods by comparative analysis. 
This survey can be a valuable resource for\nboth researchers and practitioners in the field, offering insights into key\nconcepts and cutting-edge solutions in egocentric pose estimation, its\nwide-ranging applications, as well as the open problems with future scope.\n","authors":["Md Mushfiqur Azam","Kevin Desai"],"pdf_url":"https://arxiv.org/pdf/2403.17893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17888v1","updated":"2024-03-26T17:21:24Z","published":"2024-03-26T17:21:24Z","title":"2D Gaussian Splatting for Geometrically Accurate Radiance Fields","summary":" 3D Gaussian Splatting (3DGS) has recently revolutionized radiance field\nreconstruction, achieving high quality novel view synthesis and fast rendering\nspeed without baking. However, 3DGS fails to accurately represent surfaces due\nto the multi-view inconsistent nature of 3D Gaussians. We present 2D Gaussian\nSplatting (2DGS), a novel approach to model and reconstruct geometrically\naccurate radiance fields from multi-view images. Our key idea is to collapse\nthe 3D volume into a set of 2D oriented planar Gaussian disks. Unlike 3D\nGaussians, 2D Gaussians provide view-consistent geometry while modeling\nsurfaces intrinsically. To accurately recover thin surfaces and achieve stable\noptimization, we introduce a perspective-accurate 2D splatting process\nutilizing ray-splat intersection and rasterization. Additionally, we\nincorporate depth distortion and normal consistency terms to further enhance\nthe quality of the reconstructions. We demonstrate that our differentiable\nrenderer allows for noise-free and detailed geometry reconstruction while\nmaintaining competitive appearance quality, fast training speed, and real-time\nrendering. Our code will be made publicly available.\n","authors":["Binbin Huang","Zehao Yu","Anpei Chen","Andreas Geiger","Shenghua Gao"],"pdf_url":"https://arxiv.org/pdf/2403.17888v1.pdf","comment":"12 pages, 12 figures"},{"id":"http://arxiv.org/abs/2403.17884v1","updated":"2024-03-26T17:16:04Z","published":"2024-03-26T17:16:04Z","title":"Sen2Fire: A Challenging Benchmark Dataset for Wildfire Detection using\n Sentinel Data","summary":" Utilizing satellite imagery for wildfire detection presents substantial\npotential for practical applications. To advance the development of machine\nlearning algorithms in this domain, our study introduces the \\textit{Sen2Fire}\ndataset--a challenging satellite remote sensing dataset tailored for wildfire\ndetection. This dataset is curated from Sentinel-2 multi-spectral data and\nSentinel-5P aerosol product, comprising a total of 2466 image patches. Each\npatch has a size of 512$\\times$512 pixels with 13 bands. Given the distinctive\nsensitivities of various wavebands to wildfire responses, our research focuses\non optimizing wildfire detection by evaluating different wavebands and\nemploying a combination of spectral indices, such as normalized burn ratio\n(NBR) and normalized difference vegetation index (NDVI). The results suggest\nthat, in contrast to using all bands for wildfire detection, selecting specific\nband combinations yields superior performance. Additionally, our study\nunderscores the positive impact of integrating Sentinel-5 aerosol data for\nwildfire detection. 
The code and dataset are available online\n(https://zenodo.org/records/10881058).\n","authors":["Yonghao Xu","Amanda Berg","Leif Haglund"],"pdf_url":"https://arxiv.org/pdf/2403.17884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02640v3","updated":"2024-03-26T17:14:14Z","published":"2024-03-05T04:08:19Z","title":"HoloVIC: Large-scale Dataset and Benchmark for Multi-Sensor Holographic\n Intersection and Vehicle-Infrastructure Cooperative","summary":" Vehicle-to-everything (V2X) has been a popular topic in the field of Autonomous\nDriving in recent years, and vehicle-infrastructure cooperation (VIC) has become one\nof the most important research areas. The complexity of traffic conditions,\nsuch as blind spots and occlusion, greatly limits the perception\ncapabilities of single-view roadside sensing systems. To further enhance the\naccuracy of roadside perception and provide better information to the vehicle\nside, in this paper, we constructed holographic intersections with various\nlayouts to build a large-scale multi-sensor holographic vehicle-infrastructure\ncooperation dataset, called HoloVIC. Our dataset includes 3 different types of\nsensors (Camera, Lidar, Fisheye) and employs 4 sensor-layouts based on the\ndifferent intersections. Each intersection is equipped with 6-18 sensors to\ncapture synchronous data while autonomous vehicles pass through these\nintersections to collect VIC data. HoloVIC contains a total of 100k+\nsynchronous frames from different sensors. Additionally, we annotated 3D\nbounding boxes based on Camera, Fisheye, and Lidar. We also associate the IDs\nof the same objects across different devices and consecutive frames in\nsequence. Based on HoloVIC, we formulated four tasks to facilitate the\ndevelopment of related research. We also provide benchmarks for these tasks.\n","authors":["Cong Ma","Lei Qiao","Chengkai Zhu","Kai Liu","Zelong Kong","Qing Li","Xueqi Zhou","Yuheng Kan","Wei Wu"],"pdf_url":"https://arxiv.org/pdf/2403.02640v3.pdf","comment":"Accepted to CVPR 2024, Benchmark Website: https://holovic.net"},{"id":"http://arxiv.org/abs/2403.17883v1","updated":"2024-03-26T17:13:17Z","published":"2024-03-26T17:13:17Z","title":"Superior and Pragmatic Talking Face Generation with Teacher-Student\n Framework","summary":" Talking face generation technology creates talking videos from arbitrary\nappearance and motion signals, with the \"arbitrary\" offering ease of use but\nalso introducing challenges in practical applications. Existing methods work\nwell with standard inputs but suffer serious performance degradation with\nintricate real-world ones. Moreover, efficiency is also an important concern in\ndeployment. To comprehensively address these issues, we introduce SuperFace, a\nteacher-student framework that balances quality, robustness, cost and\neditability. We first propose a simple but effective teacher model capable of\nhandling inputs of varying qualities to generate high-quality results. Building\non this, we devise an efficient distillation strategy to acquire an\nidentity-specific student model that maintains quality with significantly\nreduced computational load. 
Our experiments validate that SuperFace offers a\nmore comprehensive solution than existing methods for the four mentioned\nobjectives, especially in reducing FLOPs by 99\\% with the student model.\nSuperFace can be driven by both video and audio and allows for localized facial\nattribute editing.\n","authors":["Chao Liang","Jianwen Jiang","Tianyun Zhong","Gaojie Lin","Zhengkun Rong","Jiaqi Yang","Yongming Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.17883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17881v1","updated":"2024-03-26T17:12:34Z","published":"2024-03-26T17:12:34Z","title":"Deepfake Generation and Detection: A Benchmark and Survey","summary":" In addition to the advancements in deepfake generation, corresponding\ndetection technologies need to continuously evolve to regulate the potential\nmisuse of deepfakes, such as for privacy invasion and phishing attacks. This\nsurvey comprehensively reviews the latest developments in deepfake generation\nand detection, summarizing and analyzing the current state of the art in this\nrapidly evolving field. We first unify task definitions, comprehensively\nintroduce datasets and metrics, and discuss the development of generation and\ndetection technology frameworks. Then, we discuss the development of several\nrelated sub-fields and focus on researching four mainstream deepfake fields:\npopular face swap, face reenactment, talking face generation, and facial\nattribute editing, as well as forgery detection. Subsequently, we\ncomprehensively benchmark representative methods on popular datasets for each\nfield, fully evaluating the latest and influential works published in top\nconferences/journals. Finally, we analyze the challenges and future research\ndirections of the discussed fields. We closely follow the latest developments\nin https://github.com/flyingby/Awesome-Deepfake-Generation-and-Detection.\n","authors":["Gan Pei","Jiangning Zhang","Menghan Hu","Guangtao Zhai","Chengjie Wang","Zhenyu Zhang","Jian Yang","Chunhua Shen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.17881v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17879v1","updated":"2024-03-26T17:11:51Z","published":"2024-03-26T17:11:51Z","title":"Low-Latency Neural Stereo Streaming","summary":" The rise of new video modalities like virtual reality or autonomous driving\nhas increased the demand for efficient multi-view video compression methods,\nboth in terms of rate-distortion (R-D) performance and in terms of delay and\nruntime. While most recent stereo video compression approaches have shown\npromising performance, they compress left and right views sequentially, leading\nto poor parallelization and runtime performance. This work presents Low-Latency\nneural codec for Stereo video Streaming (LLSS), a novel parallel stereo video\ncoding method designed for fast and efficient low-latency stereo video\nstreaming. Instead of using sequential cross-view motion compensation like\nexisting methods, LLSS introduces a bidirectional feature shifting module to\ndirectly exploit mutual information among views and encode them effectively\nwith a joint cross-view prior model for entropy coding. 
Thanks to this design,\nLLSS processes left and right views in parallel, minimizing latency; all while\nsubstantially improving R-D performance compared to both existing neural and\nconventional codecs.\n","authors":["Qiqi Hou","Farzad Farhadzadeh","Amir Said","Guillaume Sautiere","Hoang Le"],"pdf_url":"https://arxiv.org/pdf/2403.17879v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.17870v1","updated":"2024-03-26T16:57:55Z","published":"2024-03-26T16:57:55Z","title":"Boosting Diffusion Models with Moving Average Sampling in Frequency\n Domain","summary":" Diffusion models have recently brought a powerful revolution in image\ngeneration. Despite showing impressive generative capabilities, most of these\nmodels rely on the current sample to denoise the next one, possibly resulting\nin denoising instability. In this paper, we reinterpret the iterative denoising\nprocess as model optimization and leverage a moving average mechanism to\nensemble all the prior samples. Instead of simply applying moving average to\nthe denoised samples at different timesteps, we first map the denoised samples\nto data space and then perform moving average to avoid distribution shift\nacross timesteps. In view that diffusion models evolve the recovery from\nlow-frequency components to high-frequency details, we further decompose the\nsamples into different frequency components and execute moving average\nseparately on each component. We name the complete approach \"Moving Average\nSampling in Frequency domain (MASF)\". MASF could be seamlessly integrated into\nmainstream pre-trained diffusion models and sampling schedules. Extensive\nexperiments on both unconditional and conditional diffusion models demonstrate\nthat our MASF leads to superior performances compared to the baselines, with\nalmost negligible additional complexity cost.\n","authors":["Yurui Qian","Qi Cai","Yingwei Pan","Yehao Li","Ting Yao","Qibin Sun","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2403.17870v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17869v1","updated":"2024-03-26T16:57:33Z","published":"2024-03-26T16:57:33Z","title":"To Supervise or Not to Supervise: Understanding and Addressing the Key\n Challenges of 3D Transfer Learning","summary":" Transfer learning has long been a key factor in the advancement of many\nfields including 2D image analysis. Unfortunately, its applicability in 3D data\nprocessing has been relatively limited. While several approaches for 3D\ntransfer learning have been proposed in recent literature, with contrastive\nlearning gaining particular prominence, most existing methods in this domain\nhave only been studied and evaluated in limited scenarios. Most importantly,\nthere is currently a lack of principled understanding of both when and why 3D\ntransfer learning methods are applicable. Remarkably, even the applicability of\nstandard supervised pre-training is poorly understood. In this work, we conduct\nthe first in-depth quantitative and qualitative investigation of supervised and\ncontrastive pre-training strategies and their utility in downstream 3D tasks.\nWe demonstrate that layer-wise analysis of learned features provides\nsignificant insight into the downstream utility of trained networks. Informed\nby this analysis, we propose a simple geometric regularization strategy, which\nimproves the transferability of supervised pre-training. 
Our work thus sheds\nlight on both the specific challenges of 3D transfer learning and the\nstrategies to overcome them.\n","authors":["Souhail Hadgi","Lei Li","Maks Ovsjanikov"],"pdf_url":"https://arxiv.org/pdf/2403.17869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17846v1","updated":"2024-03-26T16:36:43Z","published":"2024-03-26T16:36:43Z","title":"Hierarchical Open-Vocabulary 3D Scene Graphs for Language-Grounded Robot\n Navigation","summary":" Recent open-vocabulary robot mapping methods enrich dense geometric maps with\npre-trained visual-language features. While these maps allow for the prediction\nof point-wise saliency maps when queried for a certain language concept,\nlarge-scale environments and abstract queries beyond the object level still\npose a considerable hurdle, ultimately limiting language-grounded robotic\nnavigation. In this work, we present HOV-SG, a hierarchical open-vocabulary 3D\nscene graph mapping approach for language-grounded robot navigation. Leveraging\nopen-vocabulary vision foundation models, we first obtain state-of-the-art\nopen-vocabulary segment-level maps in 3D and subsequently construct a 3D scene\ngraph hierarchy consisting of floor, room, and object concepts, each enriched\nwith open-vocabulary features. Our approach is able to represent multi-story\nbuildings and allows robotic traversal of them using a cross-floor Voronoi\ngraph. HOV-SG is evaluated on three distinct datasets and surpasses previous\nbaselines in open-vocabulary semantic accuracy on the object, room, and floor\nlevel while producing a 75% reduction in representation size compared to dense\nopen-vocabulary maps. In order to prove the efficacy and generalization\ncapabilities of HOV-SG, we showcase successful long-horizon\nlanguage-conditioned robot navigation within real-world multi-story\nenvironments. We provide code and trial video data at http://hovsg.github.io/.\n","authors":["Abdelrhman Werby","Chenguang Huang","Martin Büchner","Abhinav Valada","Wolfram Burgard"],"pdf_url":"https://arxiv.org/pdf/2403.17846v1.pdf","comment":"Code and video are available at http://hovsg.github.io/"},{"id":"http://arxiv.org/abs/2401.06003v2","updated":"2024-03-26T16:30:20Z","published":"2024-01-11T16:06:36Z","title":"TRIPS: Trilinear Point Splatting for Real-Time Radiance Field Rendering","summary":" Point-based radiance field rendering has demonstrated impressive results for\nnovel view synthesis, offering a compelling blend of rendering quality and\ncomputational efficiency. However, even the latest approaches in this domain are\nnot without their shortcomings. 3D Gaussian Splatting [Kerbl and Kopanas et al.\n2023] struggles when tasked with rendering highly detailed scenes, due to\nblurring and cloudy artifacts. On the other hand, ADOP [R\\\"uckert et al. 2022]\ncan accommodate crisper images, but the neural reconstruction network decreases\nperformance, grapples with temporal instability, and is unable to\neffectively address large gaps in the point cloud.\n In this paper, we present TRIPS (Trilinear Point Splatting), an approach that\ncombines ideas from both Gaussian Splatting and ADOP. The fundamental concept\nbehind our novel technique involves rasterizing points into a screen-space\nimage pyramid, with the selection of the pyramid layer determined by the\nprojected point size. This approach allows rendering arbitrarily large points\nusing a single trilinear write. 
A lightweight neural network is then used to\nreconstruct a hole-free image including detail beyond splat resolution.\nImportantly, our render pipeline is entirely differentiable, allowing for\nautomatic optimization of both point sizes and positions.\n Our evaluation demonstrates that TRIPS surpasses existing state-of-the-art\nmethods in terms of rendering quality while maintaining a real-time frame rate\nof 60 frames per second on readily available hardware. This performance extends\nto challenging scenarios, such as scenes featuring intricate geometry,\nexpansive landscapes, and auto-exposed footage.\n The project page is located at: https://lfranke.github.io/trips/\n","authors":["Linus Franke","Darius Rückert","Laura Fink","Marc Stamminger"],"pdf_url":"https://arxiv.org/pdf/2401.06003v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17839v1","updated":"2024-03-26T16:27:37Z","published":"2024-03-26T16:27:37Z","title":"ReMamber: Referring Image Segmentation with Mamba Twister","summary":" Referring Image Segmentation (RIS) leveraging transformers has achieved great\nsuccess in the interpretation of complex visual-language tasks. However, the\nquadratic computation cost makes it resource-consuming in capturing long-range\nvisual-language dependencies. Fortunately, Mamba addresses this with efficient\nlinear complexity in processing. However, directly applying Mamba to\nmulti-modal interactions presents challenges, primarily due to inadequate\nchannel interactions for the effective fusion of multi-modal data. In this\npaper, we propose ReMamber, a novel RIS architecture that integrates the power\nof Mamba with a multi-modal Mamba Twister block. The Mamba Twister explicitly\nmodels image-text interaction, and fuses textual and visual features through\nits unique channel and spatial twisting mechanism. We achieve\nstate-of-the-art results on three challenging benchmarks. Moreover, we conduct thorough\nanalyses of ReMamber and discuss other fusion designs using Mamba. These\nprovide valuable perspectives for future research.\n","authors":["Yuhuan Yang","Chaofan Ma","Jiangchao Yao","Zhun Zhong","Ya Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.17839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17837v1","updated":"2024-03-26T16:24:42Z","published":"2024-03-26T16:24:42Z","title":"GTA-HDR: A Large-Scale Synthetic Dataset for HDR Image Reconstruction","summary":" High Dynamic Range (HDR) content (i.e., images and videos) has a broad range\nof applications. However, capturing HDR content from real-world scenes is\nexpensive and time-consuming. Therefore, the challenging task of\nreconstructing visually accurate HDR images from their Low Dynamic Range (LDR)\ncounterparts is gaining attention in the vision research community. A major\nchallenge in this research problem is the lack of datasets that capture\ndiverse scene conditions (e.g., lighting, shadows, weather, locations,\nlandscapes, objects, humans, buildings) and various image features (e.g.,\ncolor, contrast, saturation, hue, luminance, brightness, radiance). To address\nthis gap, in this paper, we introduce GTA-HDR, a large-scale synthetic dataset\nof photo-realistic HDR images sampled from the GTA-V video game. We perform a\nthorough evaluation of the proposed dataset, which demonstrates significant\nqualitative and quantitative improvements in state-of-the-art HDR image\nreconstruction methods. 
Furthermore, we demonstrate the effectiveness of the\nproposed dataset and its impact on additional computer vision tasks including\n3D human pose estimation, human body part segmentation, and holistic scene\nsegmentation. The dataset, data collection pipeline, and evaluation code are\navailable at: https://github.com/HrishavBakulBarua/GTA-HDR.\n","authors":["Hrishav Bakul Barua","Kalin Stefanov","KokSheik Wong","Abhinav Dhall","Ganesh Krishnasamy"],"pdf_url":"https://arxiv.org/pdf/2403.17837v1.pdf","comment":"Submitted to IEEE"},{"id":"http://arxiv.org/abs/2403.17834v1","updated":"2024-03-26T16:19:56Z","published":"2024-03-26T16:19:56Z","title":"A foundation model utilizing chest CT volumes and radiology reports for\n supervised-level zero-shot detection of abnormalities","summary":" A major challenge in computational research in 3D medical imaging is the lack\nof comprehensive datasets. Addressing this issue, our study introduces CT-RATE,\nthe first 3D medical imaging dataset that pairs images with textual reports.\nCT-RATE consists of 25,692 non-contrast chest CT volumes, expanded to 50,188\nthrough various reconstructions, from 21,304 unique patients, along with\ncorresponding radiology text reports. Leveraging CT-RATE, we developed CT-CLIP,\na CT-focused contrastive language-image pre-training framework. As a versatile,\nself-supervised model, CT-CLIP is designed for broad application and does not\nrequire task-specific training. Remarkably, CT-CLIP outperforms\nstate-of-the-art, fully supervised methods in multi-abnormality detection\nacross all key metrics, thus eliminating the need for manual annotation. We\nalso demonstrate its utility in case retrieval, whether using imagery or\ntextual queries, thereby advancing knowledge dissemination. The open-source\nrelease of CT-RATE and CT-CLIP marks a significant advancement in medical AI,\nenhancing 3D imaging analysis and fostering innovation in healthcare.\n","authors":["Ibrahim Ethem Hamamci","Sezgin Er","Furkan Almas","Ayse Gulnihan Simsek","Sevval Nil Esirgun","Irem Dogan","Muhammed Furkan Dasdelen","Bastian Wittmann","Enis Simsar","Mehmet Simsar","Emine Bensu Erdemir","Abdullah Alanbay","Anjany Sekuboyina","Berkan Lafci","Mehmet K. Ozdemir","Bjoern Menze"],"pdf_url":"https://arxiv.org/pdf/2403.17834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.13969v3","updated":"2024-03-26T16:13:26Z","published":"2021-08-31T16:51:00Z","title":"Semi-Supervised Crowd Counting from Unlabeled Data","summary":" Automatic crowd behavior analysis can effectively support daily\ntransportation statistics and planning, which in turn aids smart city\nconstruction. As one of its key tasks, crowd counting has drawn\nincreasing attention. Recent works achieved promising performance but relied on\nthe supervised paradigm with expensive crowd annotations. To alleviate the\nannotation cost in real-world transportation scenarios, in this work we\nproposed a semi-supervised learning framework $S^{4}\\textit{Crowd}$, which can\nleverage both unlabeled/labeled data for robust crowd counting. In the\nunsupervised pathway, two \\textit{self-supervised losses} were proposed to\nsimulate crowd variations such as scale and illumination, based on which\npseudo labels carrying supervised information were generated and gradually refined. 
We\nalso proposed a crowd-driven recurrent unit \\textit{Gated-Crowd-Recurrent-Unit\n(GCRU)}, which can preserve discriminant crowd information by extracting\nsecond-order statistics, yielding pseudo labels with improved quality. A joint\nloss including both unsupervised/supervised information was proposed, and a\ndynamic weighting strategy was employed to balance the importance of the\nunsupervised loss and supervised loss at different training stages. We\nconducted extensive experiments on four popular crowd counting datasets in\nsemi-supervised settings. Experimental results supported the effectiveness of\neach proposed component in our $S^{4}$Crowd framework. Our method achieved\ncompetitive performance in semi-supervised learning approaches on these crowd\ncounting datasets.\n","authors":["Haoran Duan","Fan Wan","Rui Sun","Zeyu Wang","Varun Ojha","Yu Guan","Hubert P. H. Shum","Bingzhang Hu","Yang Long"],"pdf_url":"https://arxiv.org/pdf/2108.13969v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17830v1","updated":"2024-03-26T16:10:21Z","published":"2024-03-26T16:10:21Z","title":"Assessment of Multimodal Large Language Models in Alignment with Human\n Values","summary":" Large Language Models (LLMs) aim to serve as versatile assistants aligned\nwith human values, as defined by the principles of being helpful, honest, and\nharmless (hhh). However, in terms of Multimodal Large Language Models (MLLMs),\ndespite their commendable performance in perception and reasoning tasks, their\nalignment with human values remains largely unexplored, given the complexity of\ndefining hhh dimensions in the visual world and the difficulty in collecting\nrelevant data that accurately mirrors real-world situations. To address this\ngap, we introduce Ch3Ef, a Compreh3ensive Evaluation dataset and strategy for\nassessing alignment with human expectations. Ch3Ef dataset contains 1002\nhuman-annotated data samples, covering 12 domains and 46 tasks based on the hhh\nprinciple. We also present a unified evaluation strategy supporting assessment\nacross various scenarios and different perspectives. Based on the evaluation\nresults, we summarize over 10 key findings that deepen the understanding of\nMLLM capabilities, limitations, and the dynamic relationships between\nevaluation levels, guiding future advancements in the field.\n","authors":["Zhelun Shi","Zhipin Wang","Hongxing Fan","Zaibin Zhang","Lijun Li","Yongting Zhang","Zhenfei Yin","Lu Sheng","Yu Qiao","Jing Shao"],"pdf_url":"https://arxiv.org/pdf/2403.17830v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2311.02692"},{"id":"http://arxiv.org/abs/2403.17827v1","updated":"2024-03-26T16:06:42Z","published":"2024-03-26T16:06:42Z","title":"DiffH2O: Diffusion-Based Synthesis of Hand-Object Interactions from\n Textual Descriptions","summary":" Generating natural hand-object interactions in 3D is challenging as the\nresulting hand and object motions are expected to be physically plausible and\nsemantically meaningful. Furthermore, generalization to unseen objects is\nhindered by the limited scale of available hand-object interaction datasets. We\npropose DiffH2O, a novel method to synthesize realistic, one or two-handed\nobject interactions from provided text prompts and geometry of the object. The\nmethod introduces three techniques that enable effective learning from limited\ndata. First, we decompose the task into a grasping stage and a text-based\ninteraction stage and use separate diffusion models for each. 
In the grasping\nstage, the model only generates hand motions, whereas in the interaction phase\nboth hand and object poses are synthesized. Second, we propose a compact\nrepresentation that tightly couples hand and object poses. Third, we propose\ntwo different guidance schemes to allow more control of the generated motions:\ngrasp guidance and detailed textual guidance. Grasp guidance takes a single\ntarget grasping pose and guides the diffusion model to reach this grasp at the\nend of the grasping stage, which provides control over the grasping pose. Given\na grasping motion from this stage, multiple different actions can be prompted\nin the interaction phase. For textual guidance, we contribute comprehensive\ntext descriptions to the GRAB dataset and show that they enable our method to\nhave more fine-grained control over hand-object interactions. Our quantitative\nand qualitative evaluation demonstrates that the proposed method outperforms\nbaseline methods and leads to natural hand-object motions. Moreover, we\ndemonstrate the practicality of our framework by utilizing a hand pose estimate\nfrom an off-the-shelf pose estimator for guidance, and then sampling multiple\ndifferent actions in the interaction stage.\n","authors":["Sammy Christen","Shreyas Hampali","Fadime Sener","Edoardo Remelli","Tomas Hodan","Eric Sauser","Shugao Ma","Bugra Tekin"],"pdf_url":"https://arxiv.org/pdf/2403.17827v1.pdf","comment":"Project Page: https://diffh2o.github.io/"},{"id":"http://arxiv.org/abs/2403.17823v1","updated":"2024-03-26T16:04:19Z","published":"2024-03-26T16:04:19Z","title":"Efficient Image Pre-Training with Siamese Cropped Masked Autoencoders","summary":" Self-supervised pre-training of image encoders is omnipresent in the\nliterature, particularly following the introduction of Masked autoencoders\n(MAE). Current efforts attempt to learn object-centric representations from\nmotion in videos. In particular, SiamMAE recently introduced a Siamese network,\ntraining a shared-weight encoder from two frames of a video with a high\nasymmetric masking ratio (95%). In this work, we propose CropMAE, an\nalternative approach to the Siamese pre-training introduced by SiamMAE. Our\nmethod specifically differs by exclusively considering pairs of cropped images\nsourced from the same image but cropped differently, deviating from the\nconventional pairs of frames extracted from a video. CropMAE therefore\nalleviates the need for video datasets, while maintaining competitive\nperformances and drastically reducing pre-training time. Furthermore, we\ndemonstrate that CropMAE learns similar object-centric representations without\nexplicit motion, showing that current self-supervised learning methods do not\nlearn objects from motion, but rather thanks to the Siamese architecture.\nFinally, CropMAE achieves the highest masking ratio to date (98.5%), enabling\nthe reconstruction of images using only two visible patches. 
Our code is\navailable at https://github.com/alexandre-eymael/CropMAE.\n","authors":["Alexandre Eymaël","Renaud Vandeghen","Anthony Cioppa","Silvio Giancola","Bernard Ghanem","Marc Van Droogenbroeck"],"pdf_url":"https://arxiv.org/pdf/2403.17823v1.pdf","comment":"19 pages, 6 figures, 3 tables, 1 page of supplementary material"},{"id":"http://arxiv.org/abs/2403.17822v1","updated":"2024-03-26T16:00:31Z","published":"2024-03-26T16:00:31Z","title":"DN-Splatter: Depth and Normal Priors for Gaussian Splatting and Meshing","summary":" 3D Gaussian splatting, a novel differentiable rendering technique, has\nachieved state-of-the-art novel view synthesis results with high rendering\nspeeds and relatively low training times. However, its performance on scenes\ncommonly seen in indoor datasets is poor due to the lack of geometric\nconstraints during optimization. We extend 3D Gaussian splatting with depth and\nnormal cues to tackle challenging indoor datasets and showcase techniques for\nefficient mesh extraction, an important downstream application. Specifically,\nwe regularize the optimization procedure with depth information, enforce local\nsmoothness of nearby Gaussians, and use the geometry of the 3D Gaussians\nsupervised by normal cues to achieve better alignment with the true scene\ngeometry. We improve depth estimation and novel view synthesis results over\nbaselines and show how this simple yet effective regularization technique can\nbe used to directly extract meshes from the Gaussian representation yielding\nmore physically accurate reconstructions on indoor scenes. Our code will be\nreleased in https://github.com/maturk/dn-splatter.\n","authors":["Matias Turkulainen","Xuqian Ren","Iaroslav Melekhov","Otto Seiskari","Esa Rahtu","Juho Kannala"],"pdf_url":"https://arxiv.org/pdf/2403.17822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15964v2","updated":"2024-03-26T15:58:26Z","published":"2023-11-27T16:07:37Z","title":"Efficient Pre-training for Localized Instruction Generation of Videos","summary":" Procedural videos show step-by-step demonstrations of tasks like recipe\npreparation. Understanding such videos is challenging, involving the precise\nlocalization of steps and the generation of textual instructions. Manually\nannotating steps and writing instructions is costly, which limits the size of\ncurrent datasets and hinders effective learning. Leveraging large but noisy\nvideo-transcript datasets for pre-training can boost performance, but demands\nsignificant computational resources. Furthermore, transcripts contain\nirrelevant content and exhibit style variation compared to instructions written\nby human annotators. To mitigate both issues, we propose a technique,\nSieve-&-Swap, to automatically curate a smaller dataset: (i) Sieve filters\nirrelevant transcripts and (ii) Swap enhances the quality of the text\ninstruction by automatically replacing the transcripts with human-written\ninstructions from a text-only recipe dataset. The curated dataset, three orders\nof magnitude smaller than current web-scale datasets, enables efficient\ntraining of large-scale models with competitive performance. We complement our\nSieve-\\&-Swap approach with a Procedure Transformer (ProcX) for end-to-end step\nlocalization and instruction generation for procedural videos. 
When this model\nis pre-trained on our curated dataset, it achieves state-of-the-art performance\nin zero-shot and finetuning settings on YouCook2 and Tasty, while using a\nfraction of the computational resources.\n","authors":["Anil Batra","Davide Moltisanti","Laura Sevilla-Lara","Marcus Rohrbach","Frank Keller"],"pdf_url":"https://arxiv.org/pdf/2311.15964v2.pdf","comment":"This version has some missing experiments and elaborative technical\n details"},{"id":"http://arxiv.org/abs/2403.17808v1","updated":"2024-03-26T15:45:29Z","published":"2024-03-26T15:45:29Z","title":"Annotated Biomedical Video Generation using Denoising Diffusion\n Probabilistic Models and Flow Fields","summary":" The segmentation and tracking of living cells play a vital role within the\nbiomedical domain, particularly in cancer research, drug development, and\ndevelopmental biology. These are usually tedious and time-consuming tasks that\nare traditionally done by biomedical experts. Recently, to automatize these\nprocesses, deep learning based segmentation and tracking methods have been\nproposed. These methods require large-scale datasets and their full potential\nis constrained by the scarcity of annotated data in the biomedical imaging\ndomain. To address this limitation, we propose Biomedical Video Diffusion Model\n(BVDM), capable of generating realistic-looking synthetic microscopy videos.\nTrained only on a single real video, BVDM can generate videos of arbitrary\nlength with pixel-level annotations that can be used for training data-hungry\nmodels. It is composed of a denoising diffusion probabilistic model (DDPM)\ngenerating high-fidelity synthetic cell microscopy images and a flow prediction\nmodel (FPM) predicting the non-rigid transformation between consecutive video\nframes. During inference, initially, the DDPM imposes realistic cell textures\non synthetic cell masks which are generated based on real data statistics. The\nflow prediction model predicts the flow field between consecutive masks and\napplies that to the DDPM output from the previous time frame to create the next\none while keeping temporal consistency. BVDM outperforms state-of-the-art\nsynthetic live cell microscopy video generation models. Furthermore, we\ndemonstrate that a sufficiently large synthetic dataset enhances the\nperformance of cell segmentation and tracking models compared to using a\nlimited amount of available real data.\n","authors":["Rüveyda Yilmaz","Dennis Eschweiler","Johannes Stegmaier"],"pdf_url":"https://arxiv.org/pdf/2403.17808v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17804v1","updated":"2024-03-26T15:42:01Z","published":"2024-03-26T15:42:01Z","title":"Improving Text-to-Image Consistency via Automatic Prompt Optimization","summary":" Impressive advances in text-to-image (T2I) generative models have yielded a\nplethora of high performing models which are able to generate aesthetically\nappealing, photorealistic images. Despite the progress, these models still\nstruggle to produce images that are consistent with the input prompt,\noftentimes failing to capture object quantities, relations and attributes\nproperly. Existing solutions to improve prompt-image consistency suffer from\nthe following challenges: (1) they oftentimes require model fine-tuning, (2)\nthey only focus on nearby prompt samples, and (3) they are affected by\nunfavorable trade-offs among image quality, representation diversity, and\nprompt-image consistency. 
In this paper, we address these challenges and\nintroduce a T2I optimization-by-prompting framework, OPT2I, which leverages a\nlarge language model (LLM) to improve prompt-image consistency in T2I models.\nOur framework starts from a user prompt and iteratively generates revised\nprompts with the goal of maximizing a consistency score. Our extensive\nvalidation on two datasets, MSCOCO and PartiPrompts, shows that OPT2I can boost\nthe initial consistency score by up to 24.9% in terms of DSG score while\npreserving the FID and increasing the recall between generated and real data.\nOur work paves the way toward building more reliable and robust T2I systems by\nharnessing the power of LLMs.\n","authors":["Oscar Mañas","Pietro Astolfi","Melissa Hall","Candace Ross","Jack Urbanek","Adina Williams","Aishwarya Agrawal","Adriana Romero-Soriano","Michal Drozdzal"],"pdf_url":"https://arxiv.org/pdf/2403.17804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00454v3","updated":"2024-03-26T15:41:17Z","published":"2023-09-30T18:13:41Z","title":"SimLVSeg: Simplifying Left Ventricular Segmentation in 2D+Time\n Echocardiograms with Self- and Weakly-Supervised Learning","summary":" Echocardiography has become an indispensable clinical imaging modality for\ngeneral heart health assessment. From calculating biomarkers such as ejection\nfraction to the probability of a patient's heart failure, accurate segmentation\nof the heart structures allows doctors to assess the heart's condition and\ndevise treatments with greater precision and accuracy. However, achieving\naccurate and reliable left ventricle segmentation is time-consuming and\nchallenging due to different reasons. Hence, clinicians often rely on\nsegmenting the left ventricular (LV) in two specific echocardiogram frames to\nmake a diagnosis. This limited coverage in manual LV segmentation poses a\nchallenge for developing automatic LV segmentation with high temporal\nconsistency, as the resulting dataset is typically annotated sparsely. In\nresponse to this challenge, this work introduces SimLVSeg, a novel paradigm\nthat enables video-based networks for consistent LV segmentation from sparsely\nannotated echocardiogram videos. SimLVSeg consists of self-supervised\npre-training with temporal masking, followed by weakly supervised learning\ntailored for LV segmentation from sparse annotations. We demonstrate how\nSimLVSeg outperforms the state-of-the-art solutions by achieving a 93.32%\n(95%CI 93.21-93.43%) dice score on the largest 2D+time echocardiography dataset\n(EchoNet-Dynamic) while being more efficient. SimLVSeg is compatible with two\ntypes of video segmentation networks: 2D super image and 3D segmentation. To\nshow the effectiveness of our approach, we provide extensive ablation studies,\nincluding pre-training settings and various deep learning backbones. We further\nconduct an out-of-distribution test to showcase SimLVSeg's generalizability on\nunseen distribution (CAMUS dataset). 
The code is publicly available at\nhttps://github.com/fadamsyah/SimLVSeg.\n","authors":["Fadillah Maani","Asim Ukaye","Nada Saadi","Numan Saeed","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2310.00454v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08639v2","updated":"2024-03-26T15:40:20Z","published":"2024-03-13T15:51:23Z","title":"HIMap: HybrId Representation Learning for End-to-end Vectorized HD Map\n Construction","summary":" Vectorized High-Definition (HD) map construction requires predictions of the\ncategory and point coordinates of map elements (e.g. road boundary, lane\ndivider, pedestrian crossing, etc.). State-of-the-art methods are mainly based\non point-level representation learning for regressing accurate point\ncoordinates. However, this pipeline has limitations in obtaining element-level\ninformation and handling element-level failures, e.g. erroneous element shape\nor entanglement between elements. To tackle the above issues, we propose a\nsimple yet effective HybrId framework named HIMap to sufficiently learn and\ninteract both point-level and element-level information. Concretely, we\nintroduce a hybrid representation called HIQuery to represent all map elements,\nand propose a point-element interactor to interactively extract and encode the\nhybrid information of elements, e.g. point position and element shape, into the\nHIQuery. Additionally, we present a point-element consistency constraint to\nenhance the consistency between the point-level and element-level information.\nFinally, the output point-element integrated HIQuery can be directly converted\ninto map elements' class, point coordinates, and mask. We conduct extensive\nexperiments and consistently outperform previous methods on both nuScenes and\nArgoverse2 datasets. Notably, our method achieves $77.8$ mAP on the nuScenes\ndataset, remarkably superior to previous SOTAs by $8.3$ mAP at least.\n","authors":["Yi Zhou","Hui Zhang","Jiaqian Yu","Yifan Yang","Sangil Jung","Seung-In Park","ByungIn Yoo"],"pdf_url":"https://arxiv.org/pdf/2403.08639v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17801v1","updated":"2024-03-26T15:40:05Z","published":"2024-03-26T15:40:05Z","title":"Towards 3D Vision with Low-Cost Single-Photon Cameras","summary":" We present a method for reconstructing 3D shape of arbitrary Lambertian\nobjects based on measurements by miniature, energy-efficient, low-cost\nsingle-photon cameras. These cameras, operating as time resolved image sensors,\nilluminate the scene with a very fast pulse of diffuse light and record the\nshape of that pulse as it returns back from the scene at a high temporal\nresolution. We propose to model this image formation process, account for its\nnon-idealities, and adapt neural rendering to reconstruct 3D geometry from a\nset of spatially distributed sensors with known poses. We show that our\napproach can successfully recover complex 3D shapes from simulated data. We\nfurther demonstrate 3D object reconstruction from real-world captures,\nutilizing measurements from a commodity proximity sensor. 
Our work draws a\nconnection between image-based modeling and active range scanning and is a step\ntowards 3D vision with single-photon cameras.\n","authors":["Fangzhou Mu","Carter Sifferman","Sacha Jungerman","Yiquan Li","Mark Han","Michael Gleicher","Mohit Gupta","Yin Li"],"pdf_url":"https://arxiv.org/pdf/2403.17801v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17787v1","updated":"2024-03-26T15:20:49Z","published":"2024-03-26T15:20:49Z","title":"Evaluating the Efficacy of Prompt-Engineered Large Multimodal Models\n Versus Fine-Tuned Vision Transformers in Image-Based Security Applications","summary":" The success of Large Language Models (LLMs) has led to a parallel rise in the\ndevelopment of Large Multimodal Models (LMMs), such as Gemini-pro, which have\nbegun to transform a variety of applications. These sophisticated multimodal\nmodels are designed to interpret and analyze complex data, integrating both\ntextual and visual information on a scale previously unattainable, opening new\navenues for a range of applications. This paper investigates the applicability\nand effectiveness of prompt-engineered Gemini-pro LMMs versus fine-tuned Vision\nTransformer (ViT) models in addressing critical security challenges. We focus\non two distinct tasks: a visually evident task of detecting simple triggers,\nsuch as small squares in images, indicative of potential backdoors, and a\nnon-visually evident task of malware classification through visual\nrepresentations. Our results highlight a significant divergence in performance,\nwith Gemini-pro falling short in accuracy and reliability when compared to\nfine-tuned ViT models. The ViT models, on the other hand, demonstrate\nexceptional accuracy, achieving near-perfect performance on both tasks. This\nstudy not only showcases the strengths and limitations of prompt-engineered\nLMMs in cybersecurity applications but also emphasizes the unmatched efficacy\nof fine-tuned ViT models for precise and dependable tasks.\n","authors":["Fouad Trad","Ali Chehab"],"pdf_url":"https://arxiv.org/pdf/2403.17787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17782v1","updated":"2024-03-26T15:15:15Z","published":"2024-03-26T15:15:15Z","title":"GenesisTex: Adapting Image Denoising Diffusion to Texture Space","summary":" We present GenesisTex, a novel method for synthesizing textures for 3D\ngeometries from text descriptions. GenesisTex adapts the pretrained image\ndiffusion model to texture space by texture space sampling. Specifically, we\nmaintain a latent texture map for each viewpoint, which is updated with\npredicted noise on the rendering of the corresponding viewpoint. The sampled\nlatent texture maps are then decoded into a final texture map. During the\nsampling process, we focus on both global and local consistency across multiple\nviewpoints: global consistency is achieved through the integration of style\nconsistency mechanisms within the noise prediction network, and low-level\nconsistency is achieved by dynamically aligning latent textures. Finally, we\napply reference-based inpainting and img2img on denser views for texture\nrefinement. 
Our approach overcomes the limitations of slow optimization in\ndistillation-based methods and instability in inpainting-based methods.\nExperiments on meshes from various sources demonstrate that our method\nsurpasses the baseline methods quantitatively and qualitatively.\n","authors":["Chenjian Gao","Boyan Jiang","Xinghui Li","Yingpeng Zhang","Qian Yu"],"pdf_url":"https://arxiv.org/pdf/2403.17782v1.pdf","comment":"12 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.16167v2","updated":"2024-03-26T15:14:25Z","published":"2024-03-24T14:21:06Z","title":"Exploiting Semantic Reconstruction to Mitigate Hallucinations in\n Vision-Language Models","summary":" Hallucinations in vision-language models pose a significant challenge to\ntheir reliability, particularly in the generation of long captions. Current\nmethods fall short of accurately identifying and mitigating these\nhallucinations. To address this issue, we introduce ESREAL, a novel\nunsupervised learning framework designed to suppress the generation of\nhallucinations through accurate localization and penalization of hallucinated\ntokens. Initially, ESREAL creates a reconstructed image based on the generated\ncaption and aligns its corresponding regions with those of the original image.\nThis semantic reconstruction aids in identifying both the presence and type of\ntoken-level hallucinations within the generated caption. Subsequently, ESREAL\ncomputes token-level hallucination scores by assessing the semantic similarity\nof aligned regions based on the type of hallucination. Finally, ESREAL employs\na proximal policy optimization algorithm, where it selectively penalizes\nhallucinated tokens according to their token-level hallucination scores. Our\nframework notably reduces hallucinations in LLaVA, InstructBLIP, and mPLUG-Owl2\nby 32.81%, 27.08%, and 7.46% on the CHAIR metric. This improvement is achieved\nsolely through signals derived from the image itself, without the need for any\nimage-text pairs.\n","authors":["Minchan Kim","Minyeong Kim","Junik Bae","Suhwan Choi","Sungkyung Kim","Buru Chang"],"pdf_url":"https://arxiv.org/pdf/2403.16167v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12225v2","updated":"2024-03-26T15:06:00Z","published":"2024-02-19T15:33:09Z","title":"Pushing Auto-regressive Models for 3D Shape Generation at Capacity and\n Scalability","summary":" Auto-regressive models have achieved impressive results in 2D image\ngeneration by modeling joint distributions in grid space. In this paper, we\nextend auto-regressive models to 3D domains, and seek a stronger ability of 3D\nshape generation by improving auto-regressive models at capacity and\nscalability simultaneously. Firstly, we leverage an ensemble of publicly\navailable 3D datasets to facilitate the training of large-scale models. It\nconsists of a comprehensive collection of approximately 900,000 objects, with\nmultiple properties of meshes, points, voxels, rendered images, and text\ncaptions. This diverse labeled dataset, termed Objaverse-Mix, empowers our\nmodel to learn from a wide range of object variations. However, directly\napplying 3D auto-regression encounters critical challenges of high\ncomputational demands on volumetric grids and ambiguous auto-regressive order\nalong grid dimensions, resulting in inferior quality of 3D shapes. To this end,\nwe then present a novel framework Argus3D in terms of capacity. 
Concretely, our\napproach introduces discrete representation learning based on a latent vector\ninstead of volumetric grids, which not only reduces computational costs but\nalso preserves essential geometric details by learning the joint distributions\nin a more tractable order. The capacity of conditional generation can thus be\nrealized by simply concatenating various conditioning inputs to the latent\nvector, such as point clouds, categories, images, and texts. In addition,\nthanks to the simplicity of our model architecture, we naturally scale up our\napproach to a larger model with an impressive 3.6 billion parameters, further\nenhancing the quality of versatile 3D generation. Extensive experiments on four\ngeneration tasks demonstrate that Argus3D can synthesize diverse and faithful\nshapes across multiple categories, achieving remarkable performance.\n","authors":["Xuelin Qian","Yu Wang","Simian Luo","Yinda Zhang","Ying Tai","Zhenyu Zhang","Chengjie Wang","Xiangyang Xue","Bo Zhao","Tiejun Huang","Yunsheng Wu","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2402.12225v2.pdf","comment":"Project page: https://argus-3d.github.io/ . Datasets:\n https://huggingface.co/datasets/BAAI/Objaverse-MIX. arXiv admin note:\n substantial text overlap with arXiv:2303.14700"},{"id":"http://arxiv.org/abs/2403.17770v1","updated":"2024-03-26T14:59:11Z","published":"2024-03-26T14:59:11Z","title":"CT Synthesis with Conditional Diffusion Models for Abdominal Lymph Node\n Segmentation","summary":" Despite the significant success achieved by deep learning methods in medical\nimage segmentation, researchers still struggle in the computer-aided diagnosis\nof abdominal lymph nodes due to the complex abdominal environment, small and\nindistinguishable lesions, and limited annotated data. To address these\nproblems, we present a pipeline that integrates the conditional diffusion model\nfor lymph node generation and the nnU-Net model for lymph node segmentation to\nimprove the segmentation performance of abdominal lymph nodes through\nsynthesizing a diversity of realistic abdominal lymph node data. We propose\nLN-DDPM, a conditional denoising diffusion probabilistic model (DDPM) for lymph\nnode (LN) generation. LN-DDPM utilizes lymph node masks and anatomical\nstructure masks as model conditions. These conditions work in two conditioning\nmechanisms: global structure conditioning and local detail conditioning, to\ndistinguish between lymph nodes and their surroundings and better capture lymph\nnode characteristics. The obtained paired abdominal lymph node images and masks\nare used for the downstream segmentation task. 
Experimental results on the\nabdominal lymph node datasets demonstrate that LN-DDPM outperforms other\ngenerative methods in the abdominal lymph node image synthesis and better\nassists the downstream abdominal lymph node segmentation task.\n","authors":["Yongrui Yu","Hanyu Chen","Zitian Zhang","Qiong Xiao","Wenhui Lei","Linrui Dai","Yu Fu","Hui Tan","Guan Wang","Peng Gao","Xiaofan Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.17770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17057v2","updated":"2024-03-26T14:54:04Z","published":"2023-11-28T18:59:52Z","title":"ReMoS: 3D Motion-Conditioned Reaction Synthesis for Two-Person\n Interactions","summary":" Current approaches for 3D human motion synthesis generate high-quality\nanimations of digital humans performing a wide variety of actions and gestures.\nHowever, a notable technological gap exists in addressing the complex dynamics\nof multi-human interactions within this paradigm. In this work, we present\nReMoS, a denoising diffusion-based model that synthesizes full-body reactive\nmotion of a person in a two-person interaction scenario. Assuming the motion of\none person is given, we employ a combined spatio-temporal cross-attention\nmechanism to synthesize the reactive body and hand motion of the second person,\nthereby completing the interactions between the two. We demonstrate ReMoS\nacross challenging two-person scenarios such as pair-dancing, Ninjutsu,\nkickboxing, and acrobatics, where one person's movements have complex and\ndiverse influences on the other. We also contribute the ReMoCap dataset for\ntwo-person interactions containing full-body and finger motions. We evaluate\nReMoS through multiple quantitative metrics, qualitative visualizations, and a\nuser study, and also indicate usability in interactive motion editing\napplications.\n","authors":["Anindita Ghosh","Rishabh Dabral","Vladislav Golyanik","Christian Theobalt","Philipp Slusallek"],"pdf_url":"https://arxiv.org/pdf/2311.17057v2.pdf","comment":"17 pages, 7 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.17765v1","updated":"2024-03-26T14:53:24Z","published":"2024-03-26T14:53:24Z","title":"MUTE-SLAM: Real-Time Neural SLAM with Multiple Tri-Plane Hash\n Representations","summary":" We introduce MUTE-SLAM, a real-time neural RGB-D SLAM system employing\nmultiple tri-plane hash-encodings for efficient scene representation. MUTE-SLAM\neffectively tracks camera positions and incrementally builds a scalable\nmulti-map representation for both small and large indoor environments. It\ndynamically allocates sub-maps for newly observed local regions, enabling\nconstraint-free mapping without prior scene information. Unlike traditional\ngrid-based methods, we use three orthogonal axis-aligned planes for\nhash-encoding scene properties, significantly reducing hash collisions and the\nnumber of trainable parameters. This hybrid approach not only speeds up\nconvergence but also enhances the fidelity of surface reconstruction.\nFurthermore, our optimization strategy concurrently optimizes all sub-maps\nintersecting with the current camera frustum, ensuring global consistency.\nExtensive testing on both real-world and synthetic datasets has shown that\nMUTE-SLAM delivers state-of-the-art surface reconstruction quality and\ncompetitive tracking performance across diverse indoor settings. 
The code will\nbe made public upon acceptance of the paper.\n","authors":["Yifan Yan","Ruomin He","Zhenghua Liu"],"pdf_url":"https://arxiv.org/pdf/2403.17765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15585v2","updated":"2024-03-26T14:51:57Z","published":"2024-03-22T19:19:51Z","title":"MedPromptX: Grounded Multimodal Prompting for Chest X-ray Diagnosis","summary":" Chest X-ray images are commonly used for predicting acute and chronic\ncardiopulmonary conditions, but efforts to integrate them with structured\nclinical data face challenges due to incomplete electronic health records\n(EHR). This paper introduces \\textbf{MedPromptX}, the first model to integrate\nmultimodal large language models (MLLMs), few-shot prompting (FP) and visual\ngrounding (VG) to combine imagery with EHR data for chest X-ray diagnosis. A\npre-trained MLLM is utilized to complement the missing EHR information,\nproviding a comprehensive understanding of patients' medical history.\nAdditionally, FP reduces the necessity for extensive training of MLLMs while\neffectively tackling the issue of hallucination. Nevertheless, the process of\ndetermining the optimal number of few-shot examples and selecting high-quality\ncandidates can be burdensome, yet it profoundly influences model performance.\nHence, we propose a new technique that dynamically refines few-shot data for\nreal-time adjustment to new patient scenarios. Moreover, VG aids in focusing\nthe model's attention on relevant regions of interest in X-ray images,\nenhancing the identification of abnormalities. We release MedPromptX-VQA, a new\nin-context visual question answering dataset encompassing interleaved image and\nEHR data derived from MIMIC-IV and MIMIC-CXR databases. Results demonstrate the\nSOTA performance of MedPromptX, achieving an 11% improvement in F1-score\ncompared to the baselines. Code and data are available at\nhttps://github.com/BioMedIA-MBZUAI/MedPromptX\n","authors":["Mai A. Shaaban","Adnan Khan","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2403.15585v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17761v1","updated":"2024-03-26T14:51:53Z","published":"2024-03-26T14:51:53Z","title":"Makeup Prior Models for 3D Facial Makeup Estimation and Applications","summary":" In this work, we introduce two types of makeup prior models to extend\nexisting 3D face prior models: PCA-based and StyleGAN2-based priors. The\nPCA-based prior model is a linear model that is easy to construct and is\ncomputationally efficient. However, it retains only low-frequency information.\nConversely, the StyleGAN2-based model can represent high-frequency information\nwith relatively higher computational cost than the PCA-based model. Although\nthere is a trade-off between the two models, both are applicable to 3D facial\nmakeup estimation and related applications. By leveraging makeup prior models\nand designing a makeup consistency module, we effectively address the\nchallenges that previous methods faced in robustly estimating makeup,\nparticularly in the context of handling self-occluded faces. In experiments, we\ndemonstrate that our approach reduces computational costs by several orders of\nmagnitude, achieving speeds up to 180 times faster. 
In addition, by improving\nthe accuracy of the estimated makeup, we confirm that our methods are highly\nadvantageous for various 3D facial makeup applications such as 3D makeup face\nreconstruction, user-friendly makeup editing, makeup transfer, and\ninterpolation.\n","authors":["Xingchao Yang","Takafumi Taketomi","Yuki Endo","Yoshihiro Kanamori"],"pdf_url":"https://arxiv.org/pdf/2403.17761v1.pdf","comment":"CVPR2024. Project: https://yangxingchao.github.io/makeup-priors-page"},{"id":"http://arxiv.org/abs/2403.17757v1","updated":"2024-03-26T14:49:22Z","published":"2024-03-26T14:49:22Z","title":"Noise2Noise Denoising of CRISM Hyperspectral Data","summary":" Hyperspectral data acquired by the Compact Reconnaissance Imaging\nSpectrometer for Mars (CRISM) have allowed for unparalleled mapping of the\nsurface mineralogy of Mars. Due to sensor degradation over time, a significant\nportion of the recently acquired data is considered unusable. Here a new\ndata-driven model architecture, Noise2Noise4Mars (N2N4M), is introduced to\nremove noise from CRISM images. Our model is self-supervised and does not\nrequire zero-noise target data, making it well suited for use in Planetary\nScience applications where high quality labelled data is scarce. We demonstrate\nits strong performance on synthetic-noise data and CRISM images, and its impact\non downstream classification performance, outperforming benchmark methods on\nmost metrics. This allows for detailed analysis for critical sites of interest\non the Martian surface, including proposed lander sites.\n","authors":["Robert Platt","Rossella Arcucci","Cédric John"],"pdf_url":"https://arxiv.org/pdf/2403.17757v1.pdf","comment":"5 pages, 3 figures. Accepted as a conference paper at the ICLR 2024\n ML4RS Workshop"},{"id":"http://arxiv.org/abs/2403.17755v1","updated":"2024-03-26T14:44:51Z","published":"2024-03-26T14:44:51Z","title":"DataCook: Crafting Anti-Adversarial Examples for Healthcare Data\n Copyright Protection","summary":" In the realm of healthcare, the challenges of copyright protection and\nunauthorized third-party misuse are increasingly significant. Traditional\nmethods for data copyright protection are applied prior to data distribution,\nimplying that models trained on these data become uncontrollable. This paper\nintroduces a novel approach, named DataCook, designed to safeguard the\ncopyright of healthcare data during the deployment phase. DataCook operates by\n\"cooking\" the raw data before distribution, enabling the development of models\nthat perform normally on this processed data. However, during the deployment\nphase, the original test data must be also \"cooked\" through DataCook to ensure\nnormal model performance. This process grants copyright holders control over\nauthorization during the deployment phase. The mechanism behind DataCook is by\ncrafting anti-adversarial examples (AntiAdv), which are designed to enhance\nmodel confidence, as opposed to standard adversarial examples (Adv) that aim to\nconfuse models. Similar to Adv, AntiAdv introduces imperceptible perturbations,\nensuring that the data processed by DataCook remains easily understandable. We\nconducted extensive experiments on MedMNIST datasets, encompassing both 2D/3D\ndata and the high-resolution variants. The outcomes indicate that DataCook\neffectively meets its objectives, preventing models trained on AntiAdv from\nanalyzing unauthorized data effectively, without compromising the validity and\naccuracy of the data in legitimate scenarios. 
Code and data are available at\nhttps://github.com/MedMNIST/DataCook.\n","authors":["Sihan Shang","Jiancheng Yang","Zhenglong Sun","Pascal Fua"],"pdf_url":"https://arxiv.org/pdf/2403.17755v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06247v2","updated":"2024-03-26T14:42:21Z","published":"2024-03-10T16:11:17Z","title":"Text-Guided Variational Image Generation for Industrial Anomaly\n Detection and Segmentation","summary":" We propose a text-guided variational image generation method to address the\nchallenge of getting clean data for anomaly detection in industrial\nmanufacturing. Our method utilizes text information about the target object,\nlearned from extensive text library documents, to generate non-defective data\nimages resembling the input image. The proposed framework ensures that the\ngenerated non-defective images align with anticipated distributions derived\nfrom textual and image-based knowledge, ensuring stability and generality.\nExperimental results demonstrate the effectiveness of our approach, surpassing\nprevious methods even with limited non-defective data. Our approach is\nvalidated through generalization tests across four baseline models and three\ndistinct datasets. We present an additional analysis to enhance the\neffectiveness of anomaly detection models by utilizing the generated images.\n","authors":["Mingyu Lee","Jongwon Choi"],"pdf_url":"https://arxiv.org/pdf/2403.06247v2.pdf","comment":"18 pages, Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17749v1","updated":"2024-03-26T14:40:17Z","published":"2024-03-26T14:40:17Z","title":"Multi-Task Dense Prediction via Mixture of Low-Rank Experts","summary":" Previous multi-task dense prediction methods based on the Mixture of Experts\n(MoE) have received great performance but they neglect the importance of\nexplicitly modeling the global relations among all tasks. In this paper, we\npresent a novel decoder-focused method for multi-task dense prediction, called\nMixture-of-Low-Rank-Experts (MLoRE). To model the global task relationships,\nMLoRE adds a generic convolution path to the original MoE structure, where each\ntask feature can go through this path for explicit parameter sharing.\nFurthermore, to control the parameters and computational cost brought by the\nincrease in the number of experts, we take inspiration from LoRA and propose to\nleverage the low-rank format of a vanilla convolution in the expert network.\nSince the low-rank experts have fewer parameters and can be dynamically\nparameterized into the generic convolution, the parameters and computational\ncost do not change much with the increase of experts. Benefiting from this\ndesign, we increase the number of experts and its reception field to enlarge\nthe representation capacity, facilitating multiple dense tasks learning in a\nunified network. Extensive experiments on the PASCAL-Context and NYUD-v2\nbenchmarks show that our MLoRE achieves superior performance compared to\nprevious state-of-the-art methods on all metrics. 
Our code is available at\nhttps://github.com/YuqiYang213/MLoRE.\n","authors":["Yuqi Yang","Peng-Tao Jiang","Qibin Hou","Hao Zhang","Jinwei Chen","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2403.17749v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.08270v2","updated":"2024-03-26T14:39:43Z","published":"2024-03-13T05:46:36Z","title":"Identity-aware Dual-constraint Network for Cloth-Changing Person\n Re-identification","summary":" Cloth-Changing Person Re-Identification (CC-ReID) aims to accurately identify\nthe target person in more realistic surveillance scenarios, where pedestrians\nusually change their clothing. Despite great progress, limited cloth-changing\ntraining samples in existing CC-ReID datasets still prevent the model from\nadequately learning cloth-irrelevant features. In addition, due to the absence\nof explicit supervision to keep the model constantly focused on\ncloth-irrelevant areas, existing methods are still hampered by the disruption\nof clothing variations. To solve the above issues, we propose an Identity-aware\nDual-constraint Network (IDNet) for the CC-ReID task. Specifically, to help the\nmodel extract cloth-irrelevant clues, we propose a Clothes Diversity\nAugmentation (CDA), which generates more realistic cloth-changing samples by\nenriching the clothing color while preserving the texture. In addition, a\nMulti-scale Constraint Block (MCB) is designed, which extracts fine-grained\nidentity-related features and effectively transfers cloth-irrelevant knowledge.\nMoreover, a Counterfactual-guided Attention Module (CAM) is presented, which\nlearns cloth-irrelevant features from channel and space dimensions and utilizes\nthe counterfactual intervention for supervising the attention map to highlight\nidentity-related regions. Finally, a Semantic Alignment Constraint (SAC) is\ndesigned to facilitate high-level semantic feature interaction. Comprehensive\nexperiments on four CC-ReID datasets indicate that our method outperforms prior\nstate-of-the-art approaches.\n","authors":["Peini Guo","Mengyuan Liu","Hong Liu","Ruijia Fan","Guoquan Wang","Bin He"],"pdf_url":"https://arxiv.org/pdf/2403.08270v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02129v4","updated":"2024-03-26T14:38:23Z","published":"2023-10-03T15:10:46Z","title":"Unveiling the Pitfalls of Knowledge Editing for Large Language Models","summary":" As the cost associated with fine-tuning Large Language Models (LLMs)\ncontinues to rise, recent research efforts have pivoted towards developing\nmethodologies to edit implicit knowledge embedded within LLMs. Yet, there's\nstill a dark cloud lingering overhead -- will knowledge editing trigger\nbutterfly effect? since it is still unclear whether knowledge editing might\nintroduce side effects that pose potential risks or not. This paper pioneers\nthe investigation into the potential pitfalls associated with knowledge editing\nfor LLMs. To achieve this, we introduce new benchmark datasets and propose\ninnovative evaluation metrics. Our results underline two pivotal concerns: (1)\nKnowledge Conflict: Editing groups of facts that logically clash can magnify\nthe inherent inconsistencies in LLMs-a facet neglected by previous methods. 
(2)\nKnowledge Distortion: Altering parameters with the aim of editing factual\nknowledge can irrevocably warp the innate knowledge structure of LLMs.\nExperimental results vividly demonstrate that knowledge editing might\ninadvertently cast a shadow of unintended consequences on LLMs, which warrant\nattention and efforts for future works. Code and data are available at\nhttps://github.com/zjunlp/PitfallsKnowledgeEditing.\n","authors":["Zhoubo Li","Ningyu Zhang","Yunzhi Yao","Mengru Wang","Xi Chen","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02129v4.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.17734v1","updated":"2024-03-26T14:21:49Z","published":"2024-03-26T14:21:49Z","title":"Paired Diffusion: Generation of related, synthetic PET-CT-Segmentation\n scans using Linked Denoising Diffusion Probabilistic Models","summary":" The rapid advancement of Artificial Intelligence (AI) in biomedical imaging\nand radiotherapy is hindered by the limited availability of large imaging data\nrepositories. With recent research and improvements in denoising diffusion\nprobabilistic models (DDPM), high quality synthetic medical scans are now\npossible. Despite this, there is currently no way of generating multiple\nrelated images, such as a corresponding ground truth which can be used to train\nmodels, so synthetic scans are often manually annotated before use. This\nresearch introduces a novel architecture that is able to generate multiple,\nrelated PET-CT-tumour mask pairs using paired networks and conditional\nencoders. Our approach includes innovative, time step-controlled mechanisms and\na `noise-seeding' strategy to improve DDPM sampling consistency. While our\nmodel requires a modified perceptual loss function to ensure accurate feature\nalignment we show generation of clearly aligned synthetic images and\nimprovement in segmentation accuracy with generated images.\n","authors":["Rowan Bradbury","Katherine A. Vallis","Bartlomiej W. Papiez"],"pdf_url":"https://arxiv.org/pdf/2403.17734v1.pdf","comment":"to be published in IEEE International Symposium on Biomedical Imaging\n 2024"},{"id":"http://arxiv.org/abs/2403.17727v1","updated":"2024-03-26T14:16:56Z","published":"2024-03-26T14:16:56Z","title":"FastPerson: Enhancing Video Learning through Effective Video\n Summarization that Preserves Linguistic and Visual Contexts","summary":" Quickly understanding lengthy lecture videos is essential for learners with\nlimited time and interest in various topics to improve their learning\nefficiency. To this end, video summarization has been actively researched to\nenable users to view only important scenes from a video. However, these studies\nfocus on either the visual or audio information of a video and extract\nimportant segments in the video. Therefore, there is a risk of missing\nimportant information when both the teacher's speech and visual information on\nthe blackboard or slides are important, such as in a lecture video. To tackle\nthis issue, we propose FastPerson, a video summarization approach that\nconsiders both the visual and auditory information in lecture videos.\nFastPerson creates summary videos by utilizing audio transcriptions along with\non-screen images and text, minimizing the risk of overlooking crucial\ninformation for learners. Further, it provides a feature that allows learners\nto switch between the summary and original videos for each chapter of the\nvideo, enabling them to adjust the pace of learning based on their interests\nand level of understanding. 
We conducted an evaluation with 40 participants to\nassess the effectiveness of our method and confirmed that it reduced viewing\ntime by 53\\% at the same level of comprehension as that when using traditional\nvideo playback methods.\n","authors":["Kazuki Kawamura","Jun Rekimoto"],"pdf_url":"https://arxiv.org/pdf/2403.17727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17464v2","updated":"2024-03-26T14:15:25Z","published":"2024-02-27T12:42:06Z","title":"Generative 3D Part Assembly via Part-Whole-Hierarchy Message Passing","summary":" Generative 3D part assembly involves understanding part relationships and\npredicting their 6-DoF poses for assembling a realistic 3D shape. Prior work\noften focus on the geometry of individual parts, neglecting part-whole\nhierarchies of objects. Leveraging two key observations: 1) super-part poses\nprovide strong hints about part poses, and 2) predicting super-part poses is\neasier due to fewer superparts, we propose a part-whole-hierarchy message\npassing network for efficient 3D part assembly. We first introduce super-parts\nby grouping geometrically similar parts without any semantic labels. Then we\nemploy a part-whole hierarchical encoder, wherein a super-part encoder predicts\nlatent super-part poses based on input parts. Subsequently, we transform the\npoint cloud using the latent poses, feeding it to the part encoder for\naggregating super-part information and reasoning about part relationships to\npredict all part poses. In training, only ground-truth part poses are required.\nDuring inference, the predicted latent poses of super-parts enhance\ninterpretability. Experimental results on the PartNet dataset show that our\nmethod achieves state-of-the-art performance in part and connectivity accuracy\nand enables an interpretable hierarchical part assembly.\n","authors":["Bi'an Du","Xiang Gao","Wei Hu","Renjie Liao"],"pdf_url":"https://arxiv.org/pdf/2402.17464v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17725v1","updated":"2024-03-26T14:13:44Z","published":"2024-03-26T14:13:44Z","title":"Deep Learning for Segmentation of Cracks in High-Resolution Images of\n Steel Bridges","summary":" Automating the current bridge visual inspection practices using drones and\nimage processing techniques is a prominent way to make these inspections more\neffective, robust, and less expensive. In this paper, we investigate the\ndevelopment of a novel deep-learning method for the detection of fatigue cracks\nin high-resolution images of steel bridges. First, we present a novel and\nchallenging dataset comprising of images of cracks in steel bridges. Secondly,\nwe integrate the ConvNext neural network with a previous state- of-the-art\nencoder-decoder network for crack segmentation. We study and report, the\neffects of the use of background patches on the network performance when\napplied to high-resolution images of cracks in steel bridges. 
Finally, we\nintroduce a loss function that allows the use of more background patches for\nthe training process, which yields a significant reduction in false positive\nrates.\n","authors":["Andrii Kompanets","Gautam Pai","Remco Duits","Davide Leonetti","Bert Snijder"],"pdf_url":"https://arxiv.org/pdf/2403.17725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17712v1","updated":"2024-03-26T13:58:47Z","published":"2024-03-26T13:58:47Z","title":"Invisible Gas Detection: An RGB-Thermal Cross Attention Network and A\n New Benchmark","summary":" The widespread use of various chemical gases in industrial processes\nnecessitates effective measures to prevent their leakage during transportation\nand storage, given their high toxicity. Thermal infrared-based computer vision\ndetection techniques provide a straightforward approach to identify gas leakage\nareas. However, the development of high-quality algorithms has been challenging\ndue to the low texture in thermal images and the lack of open-source datasets.\nIn this paper, we present the RGB-Thermal Cross Attention Network (RT-CAN),\nwhich employs an RGB-assisted two-stream network architecture to integrate\ntexture information from RGB images and gas area information from thermal\nimages. Additionally, to facilitate the research of invisible gas detection, we\nintroduce Gas-DB, an extensive open-source gas detection database including\nabout 1.3K well-annotated RGB-thermal images with eight variant collection\nscenes. Experimental results demonstrate that our method successfully leverages\nthe advantages of both modalities, achieving state-of-the-art (SOTA)\nperformance among RGB-thermal methods, surpassing single-stream SOTA models in\nterms of accuracy, Intersection of Union (IoU), and F2 metrics by 4.86%, 5.65%,\nand 4.88%, respectively. The code and data will be made available soon.\n","authors":["Jue Wang","Yuxiang Lin","Qi Zhao","Dong Luo","Shuaibao Chen","Wei Chen","Xiaojiang Peng"],"pdf_url":"https://arxiv.org/pdf/2403.17712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15094v2","updated":"2024-03-26T13:57:26Z","published":"2023-05-24T12:22:23Z","title":"InNeRF360: Text-Guided 3D-Consistent Object Inpainting on 360-degree\n Neural Radiance Fields","summary":" We propose InNeRF360, an automatic system that accurately removes\ntext-specified objects from 360-degree Neural Radiance Fields (NeRF). The\nchallenge is to effectively remove objects while inpainting perceptually\nconsistent content for the missing regions, which is particularly demanding for\nexisting NeRF models due to their implicit volumetric representation. Moreover,\nunbounded scenes are more prone to floater artifacts in the inpainted region\nthan frontal-facing scenes, as the change of object appearance and background\nacross views is more sensitive to inaccurate segmentations and inconsistent\ninpainting. With a trained NeRF and a text description, our method efficiently\nremoves specified objects and inpaints visually consistent content without\nartifacts. We apply depth-space warping to enforce consistency across multiview\ntext-encoded segmentations, and then refine the inpainted NeRF model using\nperceptual priors and 3D diffusion-based geometric priors to ensure visual\nplausibility. Through extensive experiments in segmentation and inpainting on\n360-degree and frontal-facing NeRFs, we show that our approach is effective and\nenhances NeRF's editability. 
Project page: https://ivrl.github.io/InNeRF360.\n","authors":["Dongqing Wang","Tong Zhang","Alaa Abboud","Sabine Süsstrunk"],"pdf_url":"https://arxiv.org/pdf/2305.15094v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17709v1","updated":"2024-03-26T13:56:34Z","published":"2024-03-26T13:56:34Z","title":"Groupwise Query Specialization and Quality-Aware Multi-Assignment for\n Transformer-based Visual Relationship Detection","summary":" Visual Relationship Detection (VRD) has seen significant advancements with\nTransformer-based architectures recently. However, we identify two key\nlimitations in a conventional label assignment for training Transformer-based\nVRD models, which is a process of mapping a ground-truth (GT) to a prediction.\nUnder the conventional assignment, an unspecialized query is trained since a\nquery is expected to detect every relation, which makes it difficult for a\nquery to specialize in specific relations. Furthermore, a query is also\ninsufficiently trained since a GT is assigned only to a single prediction,\ntherefore near-correct or even correct predictions are suppressed by being\nassigned no relation as a GT. To address these issues, we propose Groupwise\nQuery Specialization and Quality-Aware Multi-Assignment (SpeaQ). Groupwise\nQuery Specialization trains a specialized query by dividing queries and\nrelations into disjoint groups and directing a query in a specific query group\nsolely toward relations in the corresponding relation group. Quality-Aware\nMulti-Assignment further facilitates the training by assigning a GT to multiple\npredictions that are significantly close to a GT in terms of a subject, an\nobject, and the relation in between. Experimental results and analyses show\nthat SpeaQ effectively trains specialized queries, which better utilize the\ncapacity of a model, resulting in consistent performance gains with zero\nadditional inference cost across multiple VRD models and benchmarks. Code is\navailable at https://github.com/mlvlab/SpeaQ.\n","authors":["Jongha Kim","Jihwan Park","Jinyoung Park","Jinyoung Kim","Sehyung Kim","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2403.17709v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.16014v2","updated":"2024-03-26T13:55:40Z","published":"2023-12-26T11:49:23Z","title":"Passive Non-Line-of-Sight Imaging with Light Transport Modulation","summary":" Passive non-line-of-sight (NLOS) imaging has witnessed rapid development in\nrecent years, due to its ability to image objects that are out of sight. The\nlight transport condition plays an important role in this task since changing\nthe conditions will lead to different imaging models. Existing learning-based\nNLOS methods usually train independent models for different light transport\nconditions, which is computationally inefficient and impairs the practicality\nof the models. In this work, we propose NLOS-LTM, a novel passive NLOS imaging\nmethod that effectively handles multiple light transport conditions with a\nsingle network. We achieve this by inferring a latent light transport\nrepresentation from the projection image and using this representation to\nmodulate the network that reconstructs the hidden image from the projection\nimage. We train a light transport encoder together with a vector quantizer to\nobtain the light transport representation. To further regulate this\nrepresentation, we jointly learn both the reconstruction network and the\nreprojection network during training. 
A set of light transport modulation\nblocks is used to modulate the two jointly trained networks in a multi-scale\nway. Extensive experiments on a large-scale passive NLOS dataset demonstrate\nthe superiority of the proposed method. The code is available at\nhttps://github.com/JerryOctopus/NLOS-LTM.\n","authors":["Jiarui Zhang","Ruixu Geng","Xiaolong Du","Yan Chen","Houqiang Li","Yang Hu"],"pdf_url":"https://arxiv.org/pdf/2312.16014v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17708v1","updated":"2024-03-26T13:54:52Z","published":"2024-03-26T13:54:52Z","title":"Panonut360: A Head and Eye Tracking Dataset for Panoramic Video","summary":" With the rapid development and widespread application of VR/AR technology,\nmaximizing the quality of immersive panoramic video services that match users'\npersonal preferences and habits has become a long-standing challenge.\nUnderstanding the saliency region where users focus, based on data collected\nwith HMDs, can promote multimedia encoding, transmission, and quality\nassessment. At the same time, large-scale datasets are essential for\nresearchers and developers to explore short/long-term user behavior patterns\nand train AI models related to panoramic videos. However, existing panoramic\nvideo datasets often include low-frequency user head or eye movement data\nthrough short-term videos only, lacking sufficient data for analyzing users'\nField of View (FoV) and generating video saliency regions.\n Driven by these practical factors, in this paper, we present a head and eye\ntracking dataset involving 50 users (25 males and 25 females) watching 15\npanoramic videos. The dataset provides details on the viewport and gaze\nattention locations of users. Besides, we present some statistics samples\nextracted from the dataset. For example, the deviation between head and eye\nmovements challenges the widely held assumption that gaze attention decreases\nfrom the center of the FoV following a Gaussian distribution. Our analysis\nreveals a consistent downward offset in gaze fixations relative to the FoV in\nexperimental settings involving multiple users and videos. That's why we name\nthe dataset Panonut, a saliency weighting shaped like a donut. Finally, we also\nprovide a script that generates saliency distributions based on given head or\neye coordinates and pre-generated saliency distribution map sets of each video\nfrom the collected eye tracking data.\n The dataset is available on website: https://dianvrlab.github.io/Panonut360/.\n","authors":["Yutong Xu","Junhao Du","Jiahe Wang","Yuwei Ning","Sihan Zhou Yang Cao"],"pdf_url":"https://arxiv.org/pdf/2403.17708v1.pdf","comment":"7 pages,ACM MMSys'24 accepted"},{"id":"http://arxiv.org/abs/2403.17702v1","updated":"2024-03-26T13:40:52Z","published":"2024-03-26T13:40:52Z","title":"The Solution for the CVPR 2023 1st foundation model challenge-Track2","summary":" In this paper, we propose a solution for cross-modal transportation\nretrieval. Due to the cross-domain problem of traffic images, we divide the\nproblem into two sub-tasks of pedestrian retrieval and vehicle retrieval\nthrough a simple strategy. In pedestrian retrieval tasks, we use IRRA as the\nbase model and specifically design an Attribute Classification to mine the\nknowledge implied by attribute labels. More importantly, We use the strategy of\nInclusion Relation Matching to make the image-text pairs with inclusion\nrelation have similar representation in the feature space. For the vehicle\nretrieval task, we use BLIP as the base model. 
Since aligning the color\nattributes of vehicles is challenging, we introduce attribute-based object\ndetection techniques to add color patch blocks to vehicle images for color data\naugmentation. This serves as strong prior information, helping the model\nperform the image-text alignment. At the same time, we incorporate labeled\nattributes into the image-text alignment loss to learn fine-grained alignment\nand prevent similar images and texts from being incorrectly separated. Our\napproach ranked first in the final B-board test with a score of 70.9.\n","authors":["Haonan Xu","Yurui Huang","Sishun Pan","Zhihao Guan","Yi Xu","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2403.17702v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17701v1","updated":"2024-03-26T13:40:18Z","published":"2024-03-26T13:40:18Z","title":"Rotate to Scan: UNet-like Mamba with Triplet SSM Module for Medical\n Image Segmentation","summary":" Image segmentation holds a vital position in the realms of diagnosis and\ntreatment within the medical domain. Traditional convolutional neural networks\n(CNNs) and Transformer models have made significant advancements in this realm,\nbut they still encounter challenges because of limited receptive field or high\ncomputing complexity. Recently, State Space Models (SSMs), particularly Mamba\nand its variants, have demonstrated notable performance in the field of vision.\nHowever, their feature extraction methods may not be sufficiently effective and\nretain some redundant structures, leaving room for parameter reduction.\nMotivated by previous spatial and channel attention methods, we propose Triplet\nMamba-UNet. The method leverages residual VSS Blocks to extract intensive\ncontextual features, while Triplet SSM is employed to fuse features across\nspatial and channel dimensions. We conducted experiments on ISIC17, ISIC18,\nCVC-300, CVC-ClinicDB, Kvasir-SEG, CVC-ColonDB, and Kvasir-Instrument datasets,\ndemonstrating the superior segmentation performance of our proposed TM-UNet.\nAdditionally, compared to the previous VM-UNet, our model achieves a one-third\nreduction in parameters.\n","authors":["Hao Tang","Lianglun Cheng","Guoheng Huang","Zhengguang Tan","Junhao Lu","Kaihong Wu"],"pdf_url":"https://arxiv.org/pdf/2403.17701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17695v1","updated":"2024-03-26T13:35:10Z","published":"2024-03-26T13:35:10Z","title":"PlainMamba: Improving Non-Hierarchical Mamba in Visual Recognition","summary":" We present PlainMamba: a simple non-hierarchical state space model (SSM)\ndesigned for general visual recognition. The recent Mamba model has shown how\nSSMs can be highly competitive with other architectures on sequential data and\ninitial attempts have been made to apply it to images. In this paper, we\nfurther adapt the selective scanning process of Mamba to the visual domain,\nenhancing its ability to learn features from two-dimensional images by (i) a\ncontinuous 2D scanning process that improves spatial continuity by ensuring\nadjacency of tokens in the scanning sequence, and (ii) direction-aware updating\nwhich enables the model to discern the spatial relations of tokens by encoding\ndirectional information. Our architecture is designed to be easy to use and\neasy to scale, formed by stacking identical PlainMamba blocks, resulting in a\nmodel with constant width throughout all layers. The architecture is further\nsimplified by removing the need for special tokens. 
We evaluate PlainMamba on a\nvariety of visual recognition tasks including image classification, semantic\nsegmentation, object detection, and instance segmentation. Our method achieves\nperformance gains over previous non-hierarchical models and is competitive with\nhierarchical alternatives. For tasks requiring high-resolution inputs, in\nparticular, PlainMamba requires much less computing while maintaining high\nperformance. Code and models are available at\nhttps://github.com/ChenhongyiYang/PlainMamba\n","authors":["Chenhongyi Yang","Zehui Chen","Miguel Espinosa","Linus Ericsson","Zhenyu Wang","Jiaming Liu","Elliot J. Crowley"],"pdf_url":"https://arxiv.org/pdf/2403.17695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17694v1","updated":"2024-03-26T13:35:02Z","published":"2024-03-26T13:35:02Z","title":"AniPortrait: Audio-Driven Synthesis of Photorealistic Portrait Animation","summary":" In this study, we propose AniPortrait, a novel framework for generating\nhigh-quality animation driven by audio and a reference portrait image. Our\nmethodology is divided into two stages. Initially, we extract 3D intermediate\nrepresentations from audio and project them into a sequence of 2D facial\nlandmarks. Subsequently, we employ a robust diffusion model, coupled with a\nmotion module, to convert the landmark sequence into photorealistic and\ntemporally consistent portrait animation. Experimental results demonstrate the\nsuperiority of AniPortrait in terms of facial naturalness, pose diversity, and\nvisual quality, thereby offering an enhanced perceptual experience. Moreover,\nour methodology exhibits considerable potential in terms of flexibility and\ncontrollability, which can be effectively applied in areas such as facial\nmotion editing or face reenactment. We release code and model weights at\nhttps://github.com/scutzzj/AniPortrait\n","authors":["Huawei Wei","Zejun Yang","Zhisheng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.17694v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17692v1","updated":"2024-03-26T13:33:16Z","published":"2024-03-26T13:33:16Z","title":"Manifold-Guided Lyapunov Control with Diffusion Models","summary":" This paper presents a novel approach to generating stabilizing controllers\nfor a large class of dynamical systems using diffusion models. The core\nobjective is to develop stabilizing control functions by identifying the\nclosest asymptotically stable vector field relative to a predetermined manifold\nand adjusting the control function based on this finding. To achieve this, we\nemploy a diffusion model trained on pairs consisting of asymptotically stable\nvector fields and their corresponding Lyapunov functions. 
Our numerical results\ndemonstrate that this pre-trained model can achieve stabilization over\npreviously unseen systems efficiently and rapidly, showcasing the potential of\nour approach in fast zero-shot control and generalizability.\n","authors":["Amartya Mukherjee","Thanin Quartz","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2403.17692v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2403.17691v1","updated":"2024-03-26T13:32:32Z","published":"2024-03-26T13:32:32Z","title":"Not All Similarities Are Created Equal: Leveraging Data-Driven Biases to\n Inform GenAI Copyright Disputes","summary":" The advent of Generative Artificial Intelligence (GenAI) models, including\nGitHub Copilot, OpenAI GPT, and Stable Diffusion, has revolutionized content\ncreation, enabling non-professionals to produce high-quality content across\nvarious domains. This transformative technology has led to a surge of synthetic\ncontent and sparked legal disputes over copyright infringement. To address\nthese challenges, this paper introduces a novel approach that leverages the\nlearning capacity of GenAI models for copyright legal analysis, demonstrated\nwith GPT2 and Stable Diffusion models. Copyright law distinguishes between\noriginal expressions and generic ones (Sc\\`enes \\`a faire), protecting the\nformer and permitting reproduction of the latter. However, this distinction has\nhistorically been challenging to make consistently, leading to over-protection\nof copyrighted works. GenAI offers an unprecedented opportunity to enhance this\nlegal analysis by revealing shared patterns in preexisting works. We propose a\ndata-driven approach to identify the genericity of works created by GenAI,\nemploying \"data-driven bias\" to assess the genericity of expressive\ncompositions. This approach aids in copyright scope determination by utilizing\nthe capabilities of GenAI to identify and prioritize expressive elements and\nrank them according to their frequency in the model's dataset. The potential\nimplications of measuring expressive genericity for copyright law are profound.\nSuch scoring could assist courts in determining copyright scope during\nlitigation, inform the registration practices of Copyright Offices, allowing\nregistration of only highly original synthetic works, and help copyright owners\nsignal the value of their works and facilitate fairer licensing deals. More\ngenerally, this approach offers valuable insights to policymakers grappling\nwith adapting copyright law to the challenges posed by the era of GenAI.\n","authors":["Uri Hacohen","Adi Haviv","Shahar Sarfaty","Bruria Friedman","Niva Elkin-Koren","Roi Livni","Amit H Bermano"],"pdf_url":"https://arxiv.org/pdf/2403.17691v1.pdf","comment":"Presented at ACM CSLAW 2024"},{"id":"http://arxiv.org/abs/2311.16081v2","updated":"2024-03-26T13:32:06Z","published":"2023-11-27T18:52:09Z","title":"ViT-Lens: Towards Omni-modal Representations","summary":" Aiming to advance AI agents, large foundation models significantly improve\nreasoning and instruction execution, yet the current focus on vision and\nlanguage neglects the potential of perceiving diverse modalities in open-world\nenvironments. However, the success of data-driven vision and language models is\ncostly or even infeasible to be reproduced for rare modalities. In this paper,\nwe present ViT-Lens-2 that facilitates efficient omni-modal representation\nlearning by perceiving novel modalities with a pretrained ViT and aligning them\nto a pre-defined space. 
Specifically, the modality-specific lens is tuned to\nproject any-modal signals to an intermediate embedding space, which are then\nprocessed by a strong ViT with pre-trained visual knowledge. The encoded\nrepresentations are optimized toward aligning with the modal-independent space,\npre-defined by off-the-shelf foundation models. ViT-Lens-2 provides a unified\nsolution for representation learning of increasing modalities with two\nappealing advantages: (i) Unlocking the great potential of pretrained ViTs to\nnovel modalities effectively with efficient data regime; (ii) Enabling emergent\ndownstream capabilities through modality alignment and shared ViT parameters.\nWe tailor ViT-Lens-2 to learn representations for 3D point cloud, depth, audio,\ntactile and EEG, and set new state-of-the-art results across various\nunderstanding tasks, such as zero-shot classification. By seamlessly\nintegrating ViT-Lens-2 into Multimodal Foundation Models, we enable\nAny-modality to Text and Image Generation in a zero-shot manner. Code and\nmodels are available at https://github.com/TencentARC/ViT-Lens.\n","authors":["Weixian Lei","Yixiao Ge","Kun Yi","Jianfeng Zhang","Difei Gao","Dylan Sun","Yuying Ge","Ying Shan","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2311.16081v2.pdf","comment":"This work is a follow-up of arXiv:2308.10185. Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2403.11708v3","updated":"2024-03-26T13:21:52Z","published":"2024-03-18T12:12:45Z","title":"Implicit Discriminative Knowledge Learning for Visible-Infrared Person\n Re-Identification","summary":" Visible-Infrared Person Re-identification (VI-ReID) is a challenging\ncross-modal pedestrian retrieval task, due to significant intra-class\nvariations and cross-modal discrepancies among different cameras. Existing\nworks mainly focus on embedding images of different modalities into a unified\nspace to mine modality-shared features. They only seek distinctive information\nwithin these shared features, while ignoring the identity-aware useful\ninformation that is implicit in the modality-specific features. To address this\nissue, we propose a novel Implicit Discriminative Knowledge Learning (IDKL)\nnetwork to uncover and leverage the implicit discriminative information\ncontained within the modality-specific. First, we extract modality-specific and\nmodality-shared features using a novel dual-stream network. Then, the\nmodality-specific features undergo purification to reduce their modality style\ndiscrepancies while preserving identity-aware discriminative knowledge.\nSubsequently, this kind of implicit knowledge is distilled into the\nmodality-shared feature to enhance its distinctiveness. Finally, an alignment\nloss is proposed to minimize modality discrepancy on enhanced modality-shared\nfeatures. Extensive experiments on multiple public datasets demonstrate the\nsuperiority of IDKL network over the state-of-the-art methods. Code is\navailable at https://github.com/1KK077/IDKL.\n","authors":["Kaijie Ren","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11708v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2311.17094v2","updated":"2024-03-26T13:21:43Z","published":"2023-11-28T06:17:49Z","title":"In Search of a Data Transformation That Accelerates Neural Field\n Training","summary":" Neural field is an emerging paradigm in data representation that trains a\nneural network to approximate the given signal. 
A key obstacle that prevents\nits widespread adoption is the encoding speed-generating neural fields requires\nan overfitting of a neural network, which can take a significant number of SGD\nsteps to reach the desired fidelity level. In this paper, we delve into the\nimpacts of data transformations on the speed of neural field training,\nspecifically focusing on how permuting pixel locations affect the convergence\nspeed of SGD. Counterintuitively, we find that randomly permuting the pixel\nlocations can considerably accelerate the training. To explain this phenomenon,\nwe examine the neural field training through the lens of PSNR curves, loss\nlandscapes, and error patterns. Our analyses suggest that the random pixel\npermutations remove the easy-to-fit patterns, which facilitate easy\noptimization in the early stage but hinder capturing fine details of the\nsignal.\n","authors":["Junwon Seo","Sangyoon Lee","Kwang In Kim","Jaeho Lee"],"pdf_url":"https://arxiv.org/pdf/2311.17094v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.02512v2","updated":"2024-03-26T13:21:28Z","published":"2023-12-05T05:36:44Z","title":"AV2AV: Direct Audio-Visual Speech to Audio-Visual Speech Translation\n with Unified Audio-Visual Speech Representation","summary":" This paper proposes a novel direct Audio-Visual Speech to Audio-Visual Speech\nTranslation (AV2AV) framework, where the input and output of the system are\nmultimodal (i.e., audio and visual speech). With the proposed AV2AV, two key\nadvantages can be brought: 1) We can perform real-like conversations with\nindividuals worldwide in a virtual meeting by utilizing our own primary\nlanguages. In contrast to Speech-to-Speech Translation (A2A), which solely\ntranslates between audio modalities, the proposed AV2AV directly translates\nbetween audio-visual speech. This capability enhances the dialogue experience\nby presenting synchronized lip movements along with the translated speech. 2)\nWe can improve the robustness of the spoken language translation system. By\nemploying the complementary information of audio-visual speech, the system can\neffectively translate spoken language even in the presence of acoustic noise,\nshowcasing robust performance. To mitigate the problem of the absence of a\nparallel AV2AV translation dataset, we propose to train our spoken language\ntranslation system with the audio-only dataset of A2A. This is done by learning\nunified audio-visual speech representations through self-supervised learning in\nadvance to train the translation system. Moreover, we propose an AV-Renderer\nthat can generate raw audio and video in parallel. It is designed with\nzero-shot speaker modeling, thus the speaker in source audio-visual speech can\nbe maintained at the target translated audio-visual speech. The effectiveness\nof AV2AV is evaluated with extensive experiments in a many-to-many language\ntranslation setting. Demo page is available on\nhttps://choijeongsoo.github.io/av2av.\n","authors":["Jeongsoo Choi","Se Jin Park","Minsu Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2312.02512v2.pdf","comment":"CVPR 2024. 
Code & Demo: https://choijeongsoo.github.io/av2av"},{"id":"http://arxiv.org/abs/2304.10417v3","updated":"2024-03-26T13:16:02Z","published":"2023-04-20T16:01:55Z","title":"SINC: Spatial Composition of 3D Human Motions for Simultaneous Action\n Generation","summary":" Our goal is to synthesize 3D human motions given textual inputs describing\nsimultaneous actions, for example 'waving hand' while 'walking' at the same\ntime. We refer to generating such simultaneous movements as performing 'spatial\ncompositions'. In contrast to temporal compositions that seek to transition\nfrom one action to another, spatial compositing requires understanding which\nbody parts are involved in which action, to be able to move them\nsimultaneously. Motivated by the observation that the correspondence between\nactions and body parts is encoded in powerful language models, we extract this\nknowledge by prompting GPT-3 with text such as \"what are the body parts\ninvolved in the action ?\", while also providing the parts list and\nfew-shot examples. Given this action-part mapping, we combine body parts from\ntwo motions together and establish the first automated method to spatially\ncompose two actions. However, training data with compositional actions is\nalways limited by the combinatorics. Hence, we further create synthetic data\nwith this approach, and use it to train a new state-of-the-art text-to-motion\ngeneration model, called SINC (\"SImultaneous actioN Compositions for 3D human\nmotions\"). In our experiments, we show that training with such GPT-guided synthetic\ndata improves spatial composition generation over baselines. Our code is\npublicly available at https://sinc.is.tue.mpg.de/.\n","authors":["Nikos Athanasiou","Mathis Petrovich","Michael J. Black","Gül Varol"],"pdf_url":"https://arxiv.org/pdf/2304.10417v3.pdf","comment":"Teaser Fixed"},{"id":"http://arxiv.org/abs/2403.14135v2","updated":"2024-03-26T13:15:12Z","published":"2024-03-21T05:10:26Z","title":"Powerful Lossy Compression for Noisy Images","summary":" Image compression and denoising represent fundamental challenges in image\nprocessing with many real-world applications. To address practical demands,\ncurrent solutions can be categorized into two main strategies: 1) sequential\nmethod; and 2) joint method. However, sequential methods have the disadvantage\nof error accumulation as there is information loss between multiple individual\nmodels. Recently, the academic community began to make some attempts to tackle\nthis problem through end-to-end joint methods. Most of them ignore that\ndifferent regions of noisy images have different characteristics. To solve\nthese problems, in this paper, our proposed signal-to-noise ratio~(SNR) aware\njoint solution exploits local and non-local features for image compression and\ndenoising simultaneously. We design an end-to-end trainable network, which\nincludes the main encoder branch, the guidance branch, and the signal-to-noise\nratio~(SNR) aware branch. 
We conducted extensive experiments on both synthetic\nand real-world datasets, demonstrating that our joint solution outperforms\nexisting state-of-the-art methods.\n","authors":["Shilv Cai","Xiaoguo Liang","Shuning Cao","Luxin Yan","Sheng Zhong","Liqun Chen","Xu Zou"],"pdf_url":"https://arxiv.org/pdf/2403.14135v2.pdf","comment":"Accepted by ICME 2024"},{"id":"http://arxiv.org/abs/2308.10185v2","updated":"2024-03-26T13:11:07Z","published":"2023-08-20T07:26:51Z","title":"ViT-Lens: Initiating Omni-Modal Exploration through 3D Insights","summary":" Though the success of CLIP-based training recipes in vision-language models,\ntheir scalability to more modalities (e.g., 3D, audio, etc.) is limited to\nlarge-scale data, which is expensive or even inapplicable for rare modalities.\nIn this paper, we present ViT-Lens that facilitates efficient omni-modal\nrepresentation learning by perceiving novel modalities with a pretrained ViT\nand aligning to a pre-defined space. Specifically, the modality-specific lens\nis tuned to project multimodal signals to the shared embedding space, which are\nthen processed by a strong ViT that carries pre-trained image knowledge. The\nencoded multimodal representations are optimized toward aligning with the\nmodal-independent space, pre-defined by off-the-shelf foundation models. A\nwell-trained lens with a ViT backbone has the potential to serve as one of\nthese foundation models, supervising the learning of subsequent modalities.\nViT-Lens provides a unified solution for representation learning of increasing\nmodalities with two appealing benefits: (i) Exploiting the pretrained ViT\nacross tasks and domains effectively with efficient data regime; (ii) Emergent\ndownstream capabilities of novel modalities are demonstrated due to the\nmodality alignment space. We evaluate ViT-Lens in the context of 3D as an\ninitial verification. In zero-shot 3D classification, ViT-Lens achieves\nsubstantial improvements over previous state-of-the-art, showing 52.0% accuracy\non Objaverse-LVIS, 87.4% on ModelNet40, and 60.6% on ScanObjectNN. Furthermore,\nwe enable zero-shot 3D question-answering by simply integrating the trained 3D\nlens into the InstructBLIP model without any adaptation. We will release the\nresults of ViT-Lens on more modalities in the near future.\n","authors":["Weixian Lei","Yixiao Ge","Jianfeng Zhang","Dylan Sun","Kun Yi","Ying Shan","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2308.10185v2.pdf","comment":"19 pages, 4 figures and 9 tables"},{"id":"http://arxiv.org/abs/2403.17678v1","updated":"2024-03-26T13:05:49Z","published":"2024-03-26T13:05:49Z","title":"Hierarchical Light Transformer Ensembles for Multimodal Trajectory\n Forecasting","summary":" Accurate trajectory forecasting is crucial for the performance of various\nsystems, such as advanced driver-assistance systems and self-driving vehicles.\nThese forecasts allow to anticipate events leading to collisions and,\ntherefore, to mitigate them. Deep Neural Networks have excelled in motion\nforecasting, but issues like overconfidence and uncertainty quantification\npersist. Deep Ensembles address these concerns, yet applying them to multimodal\ndistributions remains challenging. In this paper, we propose a novel approach\nnamed Hierarchical Light Transformer Ensembles (HLT-Ens), aimed at efficiently\ntraining an ensemble of Transformer architectures using a novel hierarchical\nloss function. 
HLT-Ens leverages grouped fully connected layers, inspired by\ngrouped convolution techniques, to capture multimodal distributions,\neffectively. Through extensive experimentation, we demonstrate that HLT-Ens\nachieves state-of-the-art performance levels, offering a promising avenue for\nimproving trajectory forecasting techniques.\n","authors":["Adrien Lafage","Mathieu Barbier","Gianni Franchi","David Filliat"],"pdf_url":"https://arxiv.org/pdf/2403.17678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17672v1","updated":"2024-03-26T13:02:38Z","published":"2024-03-26T13:02:38Z","title":"Predicting Perceived Gloss: Do Weak Labels Suffice?","summary":" Estimating perceptual attributes of materials directly from images is a\nchallenging task due to their complex, not fully-understood interactions with\nexternal factors, such as geometry and lighting. Supervised deep learning\nmodels have recently been shown to outperform traditional approaches, but rely\non large datasets of human-annotated images for accurate perception\npredictions. Obtaining reliable annotations is a costly endeavor, aggravated by\nthe limited ability of these models to generalise to different aspects of\nappearance. In this work, we show how a much smaller set of human annotations\n(\"strong labels\") can be effectively augmented with automatically derived \"weak\nlabels\" in the context of learning a low-dimensional image-computable gloss\nmetric. We evaluate three alternative weak labels for predicting human gloss\nperception from limited annotated data. Incorporating weak labels enhances our\ngloss prediction beyond the current state of the art. Moreover, it enables a\nsubstantial reduction in human annotation costs without sacrificing accuracy,\nwhether working with rendered images or real photographs.\n","authors":["Julia Guerrero-Viu","J. Daniel Subias","Ana Serrano","Katherine R. Storrs","Roland W. Fleming","Belen Masia","Diego Gutierrez"],"pdf_url":"https://arxiv.org/pdf/2403.17672v1.pdf","comment":"Computer Graphics Forum (Eurographics 2024)"},{"id":"http://arxiv.org/abs/2310.01819v3","updated":"2024-03-26T12:59:39Z","published":"2023-10-03T06:16:38Z","title":"TP2O: Creative Text Pair-to-Object Generation using Balance\n Swap-Sampling","summary":" Generating creative combinatorial objects from two seemingly unrelated object\ntexts is a challenging task in text-to-image synthesis, often hindered by a\nfocus on emulating existing data distributions. In this paper, we develop a\nstraightforward yet highly effective method, called \\textbf{balance\nswap-sampling}. First, we propose a swapping mechanism that generates a novel\ncombinatorial object image set by randomly exchanging intrinsic elements of two\ntext embeddings through a cutting-edge diffusion model. Second, we introduce a\nbalance swapping region to efficiently sample a small subset from the newly\ngenerated image set by balancing CLIP distances between the new images and\ntheir original generations, increasing the likelihood of accepting the\nhigh-quality combinations. Last, we employ a segmentation method to compare\nCLIP distances among the segmented components, ultimately selecting the most\npromising object from the sampled subset. Extensive experiments demonstrate\nthat our approach outperforms recent SOTA T2I methods. 
Surprisingly, our\nresults even rival those of human artists, such as frog-broccoli.\n","authors":["Jun Li","Zedong Zhang","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2310.01819v3.pdf","comment":"Project page: https://tp2o.github.io/anon/"},{"id":"http://arxiv.org/abs/2312.00869v2","updated":"2024-03-26T12:56:55Z","published":"2023-12-01T19:00:17Z","title":"Segment and Caption Anything","summary":" We propose a method to efficiently equip the Segment Anything Model (SAM)\nwith the ability to generate regional captions. SAM presents strong\ngeneralizability to segment anything while falling short in semantic understanding.\nBy introducing a lightweight query-based feature mixer, we align the\nregion-specific features with the embedding space of language models for later\ncaption generation. As the number of trainable parameters is small (typically\nin the order of tens of millions), it costs less computation, less memory\nusage, and less communication bandwidth, resulting in both fast and scalable\ntraining. To address the scarcity problem of regional caption data, we propose\nto first pre-train our model on object detection and segmentation tasks. We\ncall this step weak supervision pretraining since the pre-training data only\ncontains category names instead of full-sentence descriptions. The weak\nsupervision pretraining allows us to leverage many publicly available object\ndetection and segmentation datasets. We conduct extensive experiments to\ndemonstrate the superiority of our method and validate each design choice. This\nwork serves as a stepping stone towards scaling up regional captioning data and\nsheds light on exploring efficient ways to augment SAM with regional semantics.\nThe project page, along with the associated code, can be accessed via\nhttps://xk-huang.github.io/segment-caption-anything/.\n","authors":["Xiaoke Huang","Jianfeng Wang","Yansong Tang","Zheng Zhang","Han Hu","Jiwen Lu","Lijuan Wang","Zicheng Liu"],"pdf_url":"https://arxiv.org/pdf/2312.00869v2.pdf","comment":"The project page, along with the associated code, can be accessed via\n https://xk-huang.github.io/segment-caption-anything/; Update author\n information; Accepted by CVPR 24"},{"id":"http://arxiv.org/abs/2403.17664v1","updated":"2024-03-26T12:53:10Z","published":"2024-03-26T12:53:10Z","title":"DiffFAE: Advancing High-fidelity One-shot Facial Appearance Editing with\n Space-sensitive Customization and Semantic Preservation","summary":" Facial Appearance Editing (FAE) aims to modify physical attributes, such as\npose, expression and lighting, of human facial images while preserving\nattributes like identity and background, showing great importance in\nphotograph. In spite of the great progress in this area, current researches\ngenerally meet three challenges: low generation fidelity, poor attribute\npreservation, and inefficient inference. To overcome above challenges, this\npaper presents DiffFAE, a one-stage and highly-efficient diffusion-based\nframework tailored for high-fidelity FAE. For high-fidelity query attributes\ntransfer, we adopt Space-sensitive Physical Customization (SPC), which ensures\nthe fidelity and generalization ability by utilizing rendering texture derived\nfrom 3D Morphable Model (3DMM). In order to preserve source attributes, we\nintroduce the Region-responsive Semantic Composition (RSC). 
This module is\nguided to learn decoupled source-regarding features, thereby better preserving\nthe identity and alleviating artifacts from non-facial attributes such as hair,\nclothes, and background. We further introduce a consistency regularization for\nour pipeline to enhance editing controllability by leveraging prior knowledge\nin the attention matrices of diffusion model. Extensive experiments demonstrate\nthe superiority of DiffFAE over existing methods, achieving state-of-the-art\nperformance in facial appearance editing.\n","authors":["Qilin Wang","Jiangning Zhang","Chengming Xu","Weijian Cao","Ying Tai","Yue Han","Yanhao Ge","Hong Gu","Chengjie Wang","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2403.17664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14149v4","updated":"2024-03-26T12:47:12Z","published":"2023-12-21T18:59:06Z","title":"TagAlign: Improving Vision-Language Alignment with Multi-Tag\n Classification","summary":" The crux of learning vision-language models is to extract semantically\naligned information from visual and linguistic data. Existing attempts usually\nface the problem of coarse alignment, e.g., the vision encoder struggles in\nlocalizing an attribute-specified object. In this work, we propose an\nembarrassingly simple approach to better align image and text features with no\nneed of additional data formats other than image-text pairs. Concretely, given\nan image and its paired text, we manage to parse objects (e.g., cat) and\nattributes (e.g., black) from the description, which are highly likely to exist\nin the image. It is noteworthy that the parsing pipeline is fully automatic and\nthus enjoys good scalability. With these parsed semantics as supervision\nsignals, we can complement the commonly used image-text contrastive loss with\nthe multi-tag classification loss. Extensive experimental results on a broad\nsuite of semantic segmentation datasets substantiate the average 5.2\\%\nimprovement of our framework over existing alternatives. Furthermore, the\nvisualization results indicate that attribute supervision makes vision-language\nmodels accurately localize attribute-specified objects. Project page can be\nfound at https://qinying-liu.github.io/Tag-Align.\n","authors":["Qinying Liu","Wei Wu","Kecheng Zheng","Zhan Tong","Jiawei Liu","Yu Liu","Wei Chen","Zilei Wang","Yujun Shen"],"pdf_url":"https://arxiv.org/pdf/2312.14149v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03246v5","updated":"2024-03-26T12:35:03Z","published":"2024-02-05T18:03:53Z","title":"SGS-SLAM: Semantic Gaussian Splatting For Neural Dense SLAM","summary":" We present SGS-SLAM, the first semantic visual SLAM system based on Gaussian\nSplatting. It incorporates appearance, geometry, and semantic features through\nmulti-channel optimization, addressing the oversmoothing limitations of neural\nimplicit SLAM systems in high-quality rendering, scene understanding, and\nobject-level geometry. We introduce a unique semantic feature loss that\neffectively compensates for the shortcomings of traditional depth and color\nlosses in object optimization. 
Through a semantic-guided keyframe selection\nstrategy, we prevent erroneous reconstructions caused by cumulative errors.\nExtensive experiments demonstrate that SGS-SLAM delivers state-of-the-art\nperformance in camera pose estimation, map reconstruction, precise semantic\nsegmentation, and object-level geometric accuracy, while ensuring real-time\nrendering capabilities.\n","authors":["Mingrui Li","Shuhong Liu","Heng Zhou","Guohao Zhu","Na Cheng","Tianchen Deng","Hongyu Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03246v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17651v1","updated":"2024-03-26T12:31:58Z","published":"2024-03-26T12:31:58Z","title":"Exploring Dynamic Transformer for Efficient Object Tracking","summary":" The speed-precision trade-off is a critical problem for visual object\ntracking which usually requires low latency and deployment on constrained\nresources. Existing solutions for efficient tracking mainly focus on adopting\nlight-weight backbones or modules, which nevertheless come at the cost of a\nsacrifice in precision. In this paper, inspired by dynamic network routing, we\npropose DyTrack, a dynamic transformer framework for efficient tracking.\nReal-world tracking scenarios exhibit diverse levels of complexity. We argue\nthat a simple network is sufficient for easy frames in video sequences, while\nmore computation could be assigned to difficult ones. DyTrack automatically\nlearns to configure proper reasoning routes for various inputs, gaining better\nutilization of the available computational budget. Thus, it can achieve higher\nperformance with the same running speed. We formulate instance-specific\ntracking as a sequential decision problem and attach terminating branches to\nintermediate layers of the entire model. Especially, to fully utilize the\ncomputations, we introduce the feature recycling mechanism to reuse the outputs\nof predecessors. Furthermore, a target-aware self-distillation strategy is\ndesigned to enhance the discriminating capabilities of early predictions by\neffectively mimicking the representation pattern of the deep model. Extensive\nexperiments on multiple benchmarks demonstrate that DyTrack achieves promising\nspeed-precision trade-offs with only a single model. For instance, DyTrack\nobtains 64.9% AUC on LaSOT with a speed of 256 fps.\n","authors":["Jiawen Zhu","Xin Chen","Haiwen Diao","Shuai Li","Jun-Yan He","Chenyang Li","Bin Luo","Dong Wang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2403.17651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02109v2","updated":"2024-03-26T12:28:02Z","published":"2023-12-04T18:39:00Z","title":"ArtAdapter: Text-to-Image Style Transfer using Multi-Level Style Encoder\n and Explicit Adaptation","summary":" This work introduces ArtAdapter, a transformative text-to-image (T2I) style\ntransfer framework that transcends traditional limitations of color,\nbrushstrokes, and object shape, capturing high-level style elements such as\ncomposition and distinctive artistic expression. The integration of a\nmulti-level style encoder with our proposed explicit adaptation mechanism\nenables ArtAdapter to achieve unprecedented fidelity in style transfer,\nensuring close alignment with textual descriptions. 
Additionally, the\nincorporation of an Auxiliary Content Adapter (ACA) effectively separates\ncontent from style, alleviating the borrowing of content from style references.\nMoreover, our novel fast finetuning approach could further enhance zero-shot\nstyle representation while mitigating the risk of overfitting. Comprehensive\nevaluations confirm that ArtAdapter surpasses current state-of-the-art methods.\n","authors":["Dar-Yen Chen","Hamish Tennent","Ching-Wen Hsu"],"pdf_url":"https://arxiv.org/pdf/2312.02109v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17639v1","updated":"2024-03-26T12:21:47Z","published":"2024-03-26T12:21:47Z","title":"High-Resolution Image Translation Model Based on Grayscale Redefinition","summary":" Image-to-image translation is a technique that focuses on transferring images\nfrom one domain to another while maintaining the essential content\nrepresentations. In recent years, image-to-image translation has gained\nsignificant attention and achieved remarkable advancements due to its diverse\napplications in computer vision and image processing tasks. In this work, we\npropose an innovative method for image translation between different domains.\nFor high-resolution image translation tasks, we use a grayscale adjustment\nmethod to achieve pixel-level translation. For other tasks, we utilize the\nPix2PixHD model with a coarse-to-fine generator, multi-scale discriminator, and\nimproved loss to enhance the image translation performance. On the other hand,\nto tackle the issue of sparse training data, we adopt model weight\ninitialization from other task to optimize the performance of the current task.\n","authors":["Xixian Wu","Dian Chao","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2403.17639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17638v1","updated":"2024-03-26T12:17:46Z","published":"2024-03-26T12:17:46Z","title":"Learning with Unreliability: Fast Few-shot Voxel Radiance Fields with\n Relative Geometric Consistency","summary":" We propose a voxel-based optimization framework, ReVoRF, for few-shot\nradiance fields that strategically address the unreliability in pseudo novel\nview synthesis. Our method pivots on the insight that relative depth\nrelationships within neighboring regions are more reliable than the absolute\ncolor values in disoccluded areas. Consequently, we devise a bilateral\ngeometric consistency loss that carefully navigates the trade-off between color\nfidelity and geometric accuracy in the context of depth consistency for\nuncertain regions. Moreover, we present a reliability-guided learning strategy\nto discern and utilize the variable quality across synthesized views,\ncomplemented by a reliability-aware voxel smoothing algorithm that smoothens\nthe transition between reliable and unreliable data patches. Our approach\nallows for a more nuanced use of all available data, promoting enhanced\nlearning from regions previously considered unsuitable for high-quality\nreconstruction. Extensive experiments across diverse datasets reveal that our\napproach attains significant gains in efficiency and accuracy, delivering\nrendering speeds of 3 FPS, 7 mins to train a $360^\\circ$ scene, and a 5\\%\nimprovement in PSNR over existing few-shot methods. 
Code is available at\nhttps://github.com/HKCLynn/ReVoRF.\n","authors":["Yingjie Xu","Bangzhen Liu","Hao Tang","Bailin Deng","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2403.17638v1.pdf","comment":"CVPR 2024 final version"},{"id":"http://arxiv.org/abs/2403.15010v2","updated":"2024-03-26T12:16:14Z","published":"2024-03-22T07:47:13Z","title":"Clean-image Backdoor Attacks","summary":" To gather a significant quantity of annotated training data for\nhigh-performance image classification models, numerous companies opt to enlist\nthird-party providers to label their unlabeled data. This practice is widely\nregarded as secure, even in cases where some annotated errors occur, as the\nimpact of these minor inaccuracies on the final performance of the models is\nnegligible and existing backdoor attacks require attacker's ability to poison\nthe training images. Nevertheless, in this paper, we propose clean-image\nbackdoor attacks which uncover that backdoors can still be injected via a\nfraction of incorrect labels without modifying the training images.\nSpecifically, in our attacks, the attacker first seeks a trigger feature to\ndivide the training images into two parts: those with the feature and those\nwithout it. Subsequently, the attacker falsifies the labels of the former part\nto a backdoor class. The backdoor will be finally implanted into the target\nmodel after it is trained on the poisoned data. During the inference phase, the\nattacker can activate the backdoor in two ways: slightly modifying the input\nimage to obtain the trigger feature, or taking an image that naturally has the\ntrigger feature as input. We conduct extensive experiments to demonstrate the\neffectiveness and practicality of our attacks. According to the experimental\nresults, we conclude that our attacks seriously jeopardize the fairness and\nrobustness of image classification models, and it is necessary to be vigilant\nabout the incorrect labels in outsourced labeling.\n","authors":["Dazhong Rong","Guoyao Yu","Shuheng Shen","Xinyi Fu","Peng Qian","Jianhai Chen","Qinming He","Xing Fu","Weiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.15010v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06683v2","updated":"2024-03-26T12:10:13Z","published":"2024-03-11T12:57:51Z","title":"Transferring Relative Monocular Depth to Surgical Vision with Temporal\n Consistency","summary":" Relative monocular depth, inferring depth up to shift and scale from a single\nimage, is an active research topic. Recent deep learning models, trained on\nlarge and varied meta-datasets, now provide excellent performance in the domain\nof natural images. However, few datasets exist which provide ground truth depth\nfor endoscopic images, making training such models from scratch unfeasible.\nThis work investigates the transfer of these models into the surgical domain,\nand presents an effective and simple way to improve on standard supervision\nthrough the use of temporal consistency self-supervision. We show temporal\nconsistency significantly improves supervised training alone when transferring\nto the low-data regime of endoscopy, and outperforms the prevalent\nself-supervision technique for this task. In addition we show our method\ndrastically outperforms the state-of-the-art method from within the domain of\nendoscopy. 
We also release our code, model and ensembled meta-dataset,\nMeta-MED, establishing a strong benchmark for future work.\n","authors":["Charlie Budd","Tom Vercauteren"],"pdf_url":"https://arxiv.org/pdf/2403.06683v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17633v1","updated":"2024-03-26T12:08:14Z","published":"2024-03-26T12:08:14Z","title":"UADA3D: Unsupervised Adversarial Domain Adaptation for 3D Object\n Detection with Sparse LiDAR and Large Domain Gaps","summary":" In this study, we address a gap in existing unsupervised domain adaptation\napproaches on LiDAR-based 3D object detection, which have predominantly\nconcentrated on adapting between established, high-density autonomous driving\ndatasets. We focus on sparser point clouds, capturing scenarios from different\nperspectives: not just from vehicles on the road but also from mobile robots on\nsidewalks, which encounter significantly different environmental conditions and\nsensor configurations. We introduce Unsupervised Adversarial Domain Adaptation\nfor 3D Object Detection (UADA3D). UADA3D does not depend on pre-trained source\nmodels or teacher-student architectures. Instead, it uses an adversarial\napproach to directly learn domain-invariant features. We demonstrate its\nefficacy in various adaptation scenarios, showing significant improvements in\nboth self-driving car and mobile robot domains. Our code is open-source and\nwill be available soon.\n","authors":["Maciej K Wozniak","Mattias Hansson","Marko Thiel","Patric Jensfelt"],"pdf_url":"https://arxiv.org/pdf/2403.17633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17631v1","updated":"2024-03-26T12:08:04Z","published":"2024-03-26T12:08:04Z","title":"AniArtAvatar: Animatable 3D Art Avatar from a Single Image","summary":" We present a novel approach for generating animatable 3D-aware art avatars\nfrom a single image, with controllable facial expressions, head poses, and\nshoulder movements. Unlike previous reenactment methods, our approach utilizes\na view-conditioned 2D diffusion model to synthesize multi-view images from a\nsingle art portrait with a neutral expression. With the generated colors and\nnormals, we synthesize a static avatar using an SDF-based neural surface. For\navatar animation, we extract control points, transfer the motion with these\npoints, and deform the implicit canonical space. Firstly, we render the front\nimage of the avatar, extract the 2D landmarks, and project them to the 3D space\nusing a trained SDF network. We extract 3D driving landmarks using 3DMM and\ntransfer the motion to the avatar landmarks. To animate the avatar pose, we\nmanually set the body height and bound the head and torso of an avatar with two\ncages. The head and torso can be animated by transforming the two cages. Our\napproach is a one-shot pipeline that can be applied to various styles.\nExperiments demonstrate that our method can generate high-quality 3D art\navatars with desired control over different motions.\n","authors":["Shaoxu Li"],"pdf_url":"https://arxiv.org/pdf/2403.17631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01598v3","updated":"2024-03-26T11:54:40Z","published":"2023-06-02T15:09:19Z","title":"Towards Source-free Domain Adaptive Semantic Segmentation via\n Importance-aware and Prototype-contrast Learning","summary":" Domain adaptive semantic segmentation enables robust pixel-wise understanding\nin real-world driving scenes. 
Source-free domain adaptation, as a more\npractical technique, addresses the concerns of data privacy and storage\nlimitations in typical unsupervised domain adaptation methods, making it\nespecially relevant in the context of intelligent vehicles. It utilizes a\nwell-trained source model and unlabeled target data to achieve adaptation in\nthe target domain. However, in the absence of source data and target labels,\ncurrent solutions cannot sufficiently reduce the impact of domain shift and\nfully leverage the information from the target data. In this paper, we propose\nan end-to-end source-free domain adaptation semantic segmentation method via\nImportance-Aware and Prototype-Contrast (IAPC) learning. The proposed IAPC\nframework effectively extracts domain-invariant knowledge from the well-trained\nsource model and learns domain-specific knowledge from the unlabeled target\ndomain. Specifically, considering the problem of domain shift in the prediction\nof the target domain by the source model, we put forward an importance-aware\nmechanism for the biased target prediction probability distribution to extract\ndomain-invariant knowledge from the source model. We further introduce a\nprototype-contrast strategy, which includes a prototype-symmetric cross-entropy\nloss and a prototype-enhanced cross-entropy loss, to learn target intra-domain\nknowledge without relying on labels. A comprehensive variety of experiments on\ntwo domain adaptive semantic segmentation benchmarks demonstrates that the\nproposed end-to-end IAPC solution outperforms existing state-of-the-art\nmethods. The source code is publicly available at\nhttps://github.com/yihong-97/Source-free-IAPC.\n","authors":["Yihong Cao","Hui Zhang","Xiao Lu","Zheng Xiao","Kailun Yang","Yaonan Wang"],"pdf_url":"https://arxiv.org/pdf/2306.01598v3.pdf","comment":"Accepted to IEEE Transactions on Intelligent Vehicles (T-IV). The\n source code is publicly available at\n https://github.com/yihong-97/Source-free-IAPC"},{"id":"http://arxiv.org/abs/2310.17569v2","updated":"2024-03-26T11:52:23Z","published":"2023-10-26T16:58:01Z","title":"SD4Match: Learning to Prompt Stable Diffusion Model for Semantic\n Matching","summary":" In this paper, we address the challenge of matching semantically similar\nkeypoints across image pairs. Existing research indicates that the intermediate\noutput of the UNet within the Stable Diffusion (SD) can serve as robust image\nfeature maps for such a matching task. We demonstrate that by employing a basic\nprompt tuning technique, the inherent potential of Stable Diffusion can be\nharnessed, resulting in a significant enhancement in accuracy over previous\napproaches. We further introduce a novel conditional prompting module that\nconditions the prompt on the local details of the input image pairs, leading to\na further improvement in performance. We designate our approach as SD4Match,\nshort for Stable Diffusion for Semantic Matching. Comprehensive evaluations of\nSD4Match on the PF-Pascal, PF-Willow, and SPair-71k datasets show that it sets\nnew benchmarks in accuracy across all these datasets. Particularly, SD4Match\noutperforms the previous state-of-the-art by a margin of 12 percentage points\non the challenging SPair-71k dataset.\n","authors":["Xinghui Li","Jingyi Lu","Kai Han","Victor Prisacariu"],"pdf_url":"https://arxiv.org/pdf/2310.17569v2.pdf","comment":"Accepted to CVPR 2024. 
Project website:\n https://sd4match.active.vision/"},{"id":"http://arxiv.org/abs/2403.17615v1","updated":"2024-03-26T11:48:37Z","published":"2024-03-26T11:48:37Z","title":"Grad-CAMO: Learning Interpretable Single-Cell Morphological Profiles\n from 3D Cell Painting Images","summary":" Despite their black-box nature, deep learning models are extensively used in\nimage-based drug discovery to extract feature vectors from single cells in\nmicroscopy images. To better understand how these networks perform\nrepresentation learning, we employ visual explainability techniques (e.g.,\nGrad-CAM). Our analyses reveal several mechanisms by which supervised models\ncheat, exploiting biologically irrelevant pixels when extracting morphological\nfeatures from images, such as noise in the background. This raises doubts\nregarding the fidelity of learned single-cell representations and their\nrelevance when investigating downstream biological questions. To address this\nmisalignment between researcher expectations and machine behavior, we introduce\nGrad-CAMO, a novel single-cell interpretability score for supervised feature\nextractors. Grad-CAMO measures the proportion of a model's attention that is\nconcentrated on the cell of interest versus the background. This metric can be\nassessed per-cell or averaged across a validation set, offering a tool to audit\nindividual features vectors or guide the improved design of deep learning\narchitectures. Importantly, Grad-CAMO seamlessly integrates into existing\nworkflows, requiring no dataset or model modifications, and is compatible with\nboth 2D and 3D Cell Painting data. Additional results are available at\nhttps://github.com/eigenvivek/Grad-CAMO.\n","authors":["Vivek Gopalakrishnan","Jingzhe Ma","Zhiyong Xie"],"pdf_url":"https://arxiv.org/pdf/2403.17615v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17610v1","updated":"2024-03-26T11:43:05Z","published":"2024-03-26T11:43:05Z","title":"MMVP: A Multimodal MoCap Dataset with Vision and Pressure Sensors","summary":" Foot contact is an important cue not only for human motion capture but also\nfor motion understanding and physically plausible motion generation. However,\nmost of the foot-contact annotations in existing datasets are estimated by\npurely visual matching and distance thresholding, which results in low accuracy\nand coarse granularity. Even though existing multimodal datasets\nsynergistically capture plantar pressure (foot contact) and visual signals,\nthey are specifically designed for small-range and slow motion such as Taiji\nQuan and Yoga. Therefore, there is still a lack of a vision-pressure multimodal\ndataset with large-range and fast human motion, as well as accurate and dense\nfoot-contact annotation. To fill this gap, we propose a Multimodal MoCap\nDataset with Vision and Pressure sensors, named MMVP. MMVP provides accurate\nand dense plantar pressure signals synchronized with RGBD observations, which\nis especially useful for both plausible shape estimation, robust pose fitting\nwithout foot drifting, and accurate global translation tracking. To validate\nthe dataset, we propose an RGBD-P SMPL fitting method and also a\nmonocular-video-based baseline framework, VP-MoCap, for human motion capture.\nExperiments demonstrate that our RGBD-P SMPL Fitting results significantly\noutperform pure visual motion capture. Moreover, VP-MoCap outperforms SOTA\nmethods in foot-contact and global translation estimation accuracy. 
We believe\nthe configuration of the dataset and the baseline frameworks will stimulate the\nresearch in this direction and also provide a good reference for MoCap\napplications in various domains. Project page:\nhttps://haolyuan.github.io/MMVP-Dataset/.\n","authors":["He Zhang","Shenghao Ren","Haolei Yuan","Jianhui Zhao","Fan Li","Shuangpeng Sun","Zhenghao Liang","Tao Yu","Qiu Shen","Xun Cao"],"pdf_url":"https://arxiv.org/pdf/2403.17610v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2403.17608v1","updated":"2024-03-26T11:39:00Z","published":"2024-03-26T11:39:00Z","title":"Fake or JPEG? Revealing Common Biases in Generated Image Detection\n Datasets","summary":" The widespread adoption of generative image models has highlighted the urgent\nneed to detect artificial content, which is a crucial step in combating\nwidespread manipulation and misinformation. Consequently, numerous detectors\nand associated datasets have emerged. However, many of these datasets\ninadvertently introduce undesirable biases, thereby impacting the effectiveness\nand evaluation of detectors. In this paper, we emphasize that many datasets for\nAI-generated image detection contain biases related to JPEG compression and\nimage size. Using the GenImage dataset, we demonstrate that detectors indeed\nlearn from these undesired factors. Furthermore, we show that removing the\nnamed biases substantially increases robustness to JPEG compression and\nsignificantly alters the cross-generator performance of evaluated detectors.\nSpecifically, it leads to more than 11 percentage points increase in\ncross-generator performance for ResNet50 and Swin-T detectors on the GenImage\ndataset, achieving state-of-the-art results.\n We provide the dataset and source codes of this paper on the anonymous\nwebsite: https://www.unbiased-genimage.org\n","authors":["Patrick Grommelt","Louis Weiss","Franz-Josef Pfreundt","Janis Keuper"],"pdf_url":"https://arxiv.org/pdf/2403.17608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04701v3","updated":"2024-03-26T11:26:17Z","published":"2024-03-07T17:48:48Z","title":"ObjectCompose: Evaluating Resilience of Vision-Based Models on\n Object-to-Background Compositional Changes","summary":" Given the large-scale multi-modal training of recent vision-based models and\ntheir generalization capabilities, understanding the extent of their robustness\nis critical for their real-world deployment. In this work, we evaluate the\nresilience of current vision-based models against diverse object-to-background\ncontext variations. The majority of robustness evaluation methods have\nintroduced synthetic datasets to induce changes to object characteristics\n(viewpoints, scale, color) or utilized image transformation techniques\n(adversarial changes, common corruptions) on real images to simulate shifts in\ndistributions. Recent works have explored leveraging large language models and\ndiffusion models to generate changes in the background. However, these methods\neither lack in offering control over the changes to be made or distort the\nobject semantics, making them unsuitable for the task. Our method, on the other\nhand, can induce diverse object-to-background changes while preserving the\noriginal semantics and appearance of the object. To achieve this goal, we\nharness the generative capabilities of text-to-image, image-to-text, and\nimage-to-segment models to automatically generate a broad spectrum of\nobject-to-background changes. 
We induce both natural and adversarial background\nchanges by either modifying the textual prompts or optimizing the latents and\ntextual embedding of text-to-image models. We produce various versions of\nstandard vision datasets (ImageNet, COCO), incorporating either diverse and\nrealistic backgrounds into the images or introducing color, texture, and\nadversarial changes in the background. We conduct extensive experiment to\nanalyze the robustness of vision-based models against object-to-background\ncontext variations across diverse tasks. Code\nhttps://github.com/Muhammad-Huzaifaa/ObjectCompose.git\n","authors":["Hashmat Shadab Malik","Muhammad Huzaifa","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.04701v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13518v2","updated":"2024-03-26T11:16:47Z","published":"2024-03-20T11:38:30Z","title":"Motion Generation from Fine-grained Textual Descriptions","summary":" The task of text2motion is to generate human motion sequences from given\ntextual descriptions, where the model explores diverse mappings from natural\nlanguage instructions to human body movements. While most existing works are\nconfined to coarse-grained motion descriptions, e.g., \"A man squats.\",\nfine-grained descriptions specifying movements of relevant body parts are\nbarely explored. Models trained with coarse-grained texts may not be able to\nlearn mappings from fine-grained motion-related words to motion primitives,\nresulting in the failure to generate motions from unseen descriptions. In this\npaper, we build a large-scale language-motion dataset specializing in\nfine-grained textual descriptions, FineHumanML3D, by feeding GPT-3.5-turbo with\nstep-by-step instructions with pseudo-code compulsory checks. Accordingly, we\ndesign a new text2motion model, FineMotionDiffuse, making full use of\nfine-grained textual information. Our quantitative evaluation shows that\nFineMotionDiffuse trained on FineHumanML3D improves FID by a large margin of\n0.38, compared with competitive baselines. According to the qualitative\nevaluation and case study, our model outperforms MotionDiffuse in generating\nspatially or chronologically composite motions, by learning the implicit\nmappings from fine-grained descriptions to the corresponding basic motions. We\nrelease our data at https://github.com/KunhangL/finemotiondiffuse.\n","authors":["Kunhang Li","Yansong Feng"],"pdf_url":"https://arxiv.org/pdf/2403.13518v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15905v2","updated":"2024-03-26T11:11:49Z","published":"2024-03-23T18:19:02Z","title":"Towards Low-Energy Adaptive Personalization for Resource-Constrained\n Devices","summary":" The personalization of machine learning (ML) models to address data drift is\na significant challenge in the context of Internet of Things (IoT)\napplications. Presently, most approaches focus on fine-tuning either the full\nbase model or its last few layers to adapt to new data, while often neglecting\nenergy costs. However, various types of data drift exist, and fine-tuning the\nfull base model or the last few layers may not result in optimal performance in\ncertain scenarios. We propose Target Block Fine-Tuning (TBFT), a low-energy\nadaptive personalization framework designed for resource-constrained devices.\nWe categorize data drift and personalization into three types: input-level,\nfeature-level, and output-level. 
For each type, we fine-tune different blocks\nof the model to achieve optimal performance with reduced energy costs.\nSpecifically, input-, feature-, and output-level correspond to fine-tuning the\nfront, middle, and rear blocks of the model. We evaluate TBFT on a ResNet\nmodel, three datasets, three different training sizes, and a Raspberry Pi.\nCompared with the $Block Avg$, where each block is fine-tuned individually and\ntheir performance improvements are averaged, TBFT exhibits an improvement in\nmodel accuracy by an average of 15.30% whilst saving 41.57% energy consumption\non average compared with full fine-tuning.\n","authors":["Yushan Huang","Josh Millar","Yuxuan Long","Yuchen Zhao","Hamed Hadaddi"],"pdf_url":"https://arxiv.org/pdf/2403.15905v2.pdf","comment":"Accepetd to The 4th Workshop on Machine Learning and Systems\n (EuroMLSys '24)"},{"id":"http://arxiv.org/abs/2403.07576v2","updated":"2024-03-26T10:55:51Z","published":"2024-03-12T12:05:43Z","title":"FPT: Fine-grained Prompt Tuning for Parameter and Memory Efficient Fine\n Tuning in High-resolution Medical Image Classification","summary":" Parameter-efficient fine-tuning (PEFT) is proposed as a cost-effective way to\ntransfer pre-trained models to downstream tasks, avoiding the high cost of\nupdating entire large-scale pre-trained models (LPMs). In this work, we present\nFine-grained Prompt Tuning (FPT), a novel PEFT method for medical image\nclassification. FPT significantly reduces memory consumption compared to other\nPEFT methods, especially in high-resolution contexts. To achieve this, we first\nfreeze the weights of the LPM and construct a learnable lightweight side\nnetwork. The frozen LPM takes high-resolution images as input to extract\nfine-grained features, while the side network is fed low-resolution images to\nreduce memory usage. To allow the side network to access pre-trained knowledge,\nwe introduce fine-grained prompts that summarize information from the LPM\nthrough a fusion module. Important tokens selection and preloading techniques\nare employed to further reduce training cost and memory requirements. We\nevaluate FPT on four medical datasets with varying sizes, modalities, and\ncomplexities. Experimental results demonstrate that FPT achieves comparable\nperformance to fine-tuning the entire LPM while using only 1.8% of the\nlearnable parameters and 13% of the memory costs of an encoder ViT-B model with\na 512 x 512 input resolution.\n","authors":["Yijin Huang","Pujin Cheng","Roger Tam","Xiaoying Tang"],"pdf_url":"https://arxiv.org/pdf/2403.07576v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17589v1","updated":"2024-03-26T10:54:07Z","published":"2024-03-26T10:54:07Z","title":"Dual Memory Networks: A Versatile Adaptation Approach for\n Vision-Language Models","summary":" With the emergence of pre-trained vision-language models like CLIP, how to\nadapt them to various downstream classification tasks has garnered significant\nattention in recent research. The adaptation strategies can be typically\ncategorized into three paradigms: zero-shot adaptation, few-shot adaptation,\nand the recently-proposed training-free few-shot adaptation. Most existing\napproaches are tailored for a specific setting and can only cater to one or two\nof these paradigms. In this paper, we introduce a versatile adaptation approach\nthat can effectively work under all three settings. 
Specifically, we propose\nthe dual memory networks that comprise dynamic and static memory components.\nThe static memory caches training data knowledge, enabling training-free\nfew-shot adaptation, while the dynamic memory preserves historical test\nfeatures online during the testing process, allowing for the exploration of\nadditional data insights beyond the training set. This novel capability\nenhances model performance in the few-shot setting and enables model usability\nin the absence of training data. The two memory networks employ the same\nflexible memory interactive strategy, which can operate in a training-free mode\nand can be further enhanced by incorporating learnable projection layers. Our\napproach is tested across 11 datasets under the three task settings.\nRemarkably, in the zero-shot scenario, it outperforms existing methods by over\n3\\% and even shows superior results against methods utilizing external training\ndata. Additionally, our method exhibits robust performance against natural\ndistribution shifts. Codes are available at \\url{https://github.com/YBZh/DMN}.\n","authors":["Yabin Zhang","Wenjie Zhu","Hui Tang","Zhiyuan Ma","Kaiyang Zhou","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.17589v1.pdf","comment":"CVPR2024; Codes are available at \\url{https://github.com/YBZh/DMN}"},{"id":"http://arxiv.org/abs/2311.13385v3","updated":"2024-03-26T10:21:46Z","published":"2023-11-22T13:27:36Z","title":"SegVol: Universal and Interactive Volumetric Medical Image Segmentation","summary":" Precise image segmentation provides clinical study with instructive\ninformation. Despite the remarkable progress achieved in medical image\nsegmentation, there is still an absence of 3D foundation segmentation model\nthat can segment a wide range of anatomical categories with easy user\ninteraction. In this paper, we propose a 3D foundation segmentation model,\nnamed SegVol, supporting universal and interactive volumetric medical image\nsegmentation. By scaling up training data to 90K unlabeled Computed Tomography\n(CT) volumes and 6K labeled CT volumes, this foundation model supports the\nsegmentation of over 200 anatomical categories using semantic and spatial\nprompts. Extensive experiments on 10 internal validation tasks and 18 external\nvalidation tasks verify that SegVol outperforms the state of the art by a large\nmargin. Through its capacity to provide precise volumetric segmentation across\nvarious anatomical categories, SegVol has the potential to accelerate\nadvancements in medical imaging diagnosis and facilitate treatment\noptimization. The model and code are publicly available at:\nhttps://github.com/BAAI-DCAI/SegVol.\n","authors":["Yuxin Du","Fan Bai","Tiejun Huang","Bo Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.13385v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03611v2","updated":"2024-03-26T10:13:11Z","published":"2023-12-06T16:55:53Z","title":"DreamComposer: Controllable 3D Object Generation via Multi-View\n Conditions","summary":" Utilizing pre-trained 2D large-scale generative models, recent works are\ncapable of generating high-quality novel views from a single in-the-wild image.\nHowever, due to the lack of information from multiple views, these works\nencounter difficulties in generating controllable novel views. 
In this paper,\nwe present DreamComposer, a flexible and scalable framework that can enhance\nexisting view-aware diffusion models by injecting multi-view conditions.\nSpecifically, DreamComposer first uses a view-aware 3D lifting module to obtain\n3D representations of an object from multiple views. Then, it renders the\nlatent features of the target view from 3D representations with the multi-view\nfeature fusion module. Finally the target view features extracted from\nmulti-view inputs are injected into a pre-trained diffusion model. Experiments\nshow that DreamComposer is compatible with state-of-the-art diffusion models\nfor zero-shot novel view synthesis, further enhancing them to generate\nhigh-fidelity novel view images with multi-view conditions, ready for\ncontrollable 3D object reconstruction and various other applications.\n","authors":["Yunhan Yang","Yukun Huang","Xiaoyang Wu","Yuan-Chen Guo","Song-Hai Zhang","Hengshuang Zhao","Tong He","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2312.03611v2.pdf","comment":"Project Page: https://yhyang-myron.github.io/DreamComposer/"},{"id":"http://arxiv.org/abs/2312.08879v2","updated":"2024-03-26T10:04:11Z","published":"2023-12-12T11:00:39Z","title":"Regularizing Self-supervised 3D Scene Flows with Surface Awareness and\n Cyclic Consistency","summary":" Learning without supervision how to predict 3D scene flows from point clouds\nis essential to many perception systems. We propose a novel learning framework\nfor this task which improves the necessary regularization. Relying on the\nassumption that scene elements are mostly rigid, current smoothness losses are\nbuilt on the definition of ``rigid clusters\" in the input point clouds. The\ndefinition of these clusters is challenging and has a significant impact on the\nquality of predicted flows. We introduce two new consistency losses that\nenlarge clusters while preventing them from spreading over distinct objects. In\nparticular, we enforce \\emph{temporal} consistency with a forward-backward\ncyclic loss and \\emph{spatial} consistency by considering surface orientation\nsimilarity in addition to spatial proximity. The proposed losses are\nmodel-independent and can thus be used in a plug-and-play fashion to\nsignificantly improve the performance of existing models, as demonstrated on\ntwo most widely used architectures. We also showcase the effectiveness and\ngeneralization capability of our framework on four standard sensor-unique\ndriving datasets, achieving state-of-the-art performance in 3D scene flow\nestimation. Our codes are available on https://github.com/ctu-vras/sac-flow.\n","authors":["Patrik Vacek","David Hurych","Karel Zimmermann","Patrick Perez","Tomas Svoboda"],"pdf_url":"https://arxiv.org/pdf/2312.08879v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17550v1","updated":"2024-03-26T09:58:06Z","published":"2024-03-26T09:58:06Z","title":"DeepMIF: Deep Monotonic Implicit Fields for Large-Scale LiDAR 3D Mapping","summary":" Recently, significant progress has been achieved in sensing real large-scale\noutdoor 3D environments, particularly by using modern acquisition equipment\nsuch as LiDAR sensors. Unfortunately, they are fundamentally limited in their\nability to produce dense, complete 3D scenes. To address this issue, recent\nlearning-based methods integrate neural implicit representations and\noptimizable feature grids to approximate surfaces of 3D scenes. 
However,\nnaively fitting samples along raw LiDAR rays leads to noisy 3D mapping results\ndue to the nature of sparse, conflicting LiDAR measurements. Instead, in this\nwork we depart from fitting LiDAR data exactly, instead letting the network\noptimize a non-metric monotonic implicit field defined in 3D space. To fit our\nfield, we design a learning system integrating a monotonicity loss that enables\noptimizing neural monotonic fields and leverages recent progress in large-scale\n3D mapping. Our algorithm achieves high-quality dense 3D mapping performance as\ncaptured by multiple quantitative and perceptual measures and visual results\nobtained for Mai City, Newer College, and KITTI benchmarks. The code of our\napproach will be made publicly available.\n","authors":["Kutay Yılmaz","Matthias Nießner","Anastasiia Kornilova","Alexey Artemov"],"pdf_url":"https://arxiv.org/pdf/2403.17550v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.17549v1","updated":"2024-03-26T09:55:49Z","published":"2024-03-26T09:55:49Z","title":"Practical Applications of Advanced Cloud Services and Generative AI\n Systems in Medical Image Analysis","summary":" The medical field is one of the important fields in the application of\nartificial intelligence technology. With the explosive growth and\ndiversification of medical data, as well as the continuous improvement of\nmedical needs and challenges, artificial intelligence technology is playing an\nincreasingly important role in the medical field. Artificial intelligence\ntechnologies represented by computer vision, natural language processing, and\nmachine learning have been widely penetrated into diverse scenarios such as\nmedical imaging, health management, medical information, and drug research and\ndevelopment, and have become an important driving force for improving the level\nand quality of medical services. The article explores the transformative\npotential of generative AI in medical imaging, emphasizing its ability to\ngenerate synthetic data, enhance images, aid in anomaly detection, and\nfacilitate image-to-image translation. Despite challenges like model\ncomplexity, the applications of generative models in healthcare, including\nMed-PaLM 2 technology, show promising results. By addressing limitations in\ndataset size and diversity, these models contribute to more accurate diagnoses\nand improved patient outcomes. However, ethical considerations and\ncollaboration among stakeholders are essential for responsible implementation.\nThrough experiments leveraging GANs to augment brain tumor MRI datasets, the\nstudy demonstrates how generative AI can enhance image quality and diversity,\nultimately advancing medical diagnostics and patient care.\n","authors":["Jingyu Xu","Binbin Wu","Jiaxin Huang","Yulu Gong","Yifan Zhang","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2403.17549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17545v1","updated":"2024-03-26T09:49:35Z","published":"2024-03-26T09:49:35Z","title":"A Gaze-grounded Visual Question Answering Dataset for Clarifying\n Ambiguous Japanese Questions","summary":" Situated conversations, which refer to visual information as visual question\nanswering (VQA), often contain ambiguities caused by reliance on directive\ninformation. This problem is exacerbated because some languages, such as\nJapanese, often omit subjective or objective terms. 
Such ambiguities in\nquestions are often clarified by the contexts in conversational situations,\nsuch as joint attention with a user or user gaze information. In this study, we\npropose the Gaze-grounded VQA dataset (GazeVQA) that clarifies ambiguous\nquestions using gaze information by focusing on a clarification process\ncomplemented by gaze information. We also propose a method that utilizes gaze\ntarget estimation results to improve the accuracy of GazeVQA tasks. Our\nexperimental results showed that the proposed method improved the performance\nin some cases of a VQA system on GazeVQA and identified some typical problems\nof GazeVQA tasks that need to be improved.\n","authors":["Shun Inadumi","Seiya Kawano","Akishige Yuguchi","Yasutomo Kawanishi","Koichiro Yoshino"],"pdf_url":"https://arxiv.org/pdf/2403.17545v1.pdf","comment":"LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2403.17541v1","updated":"2024-03-26T09:44:34Z","published":"2024-03-26T09:44:34Z","title":"WordRobe: Text-Guided Generation of Textured 3D Garments","summary":" In this paper, we tackle a new and challenging problem of text-driven\ngeneration of 3D garments with high-quality textures. We propose \"WordRobe\", a\nnovel framework for the generation of unposed & textured 3D garment meshes from\nuser-friendly text prompts. We achieve this by first learning a latent\nrepresentation of 3D garments using a novel coarse-to-fine training strategy\nand a loss for latent disentanglement, promoting better latent interpolation.\nSubsequently, we align the garment latent space to the CLIP embedding space in\na weakly supervised manner, enabling text-driven 3D garment generation and\nediting. For appearance modeling, we leverage the zero-shot generation\ncapability of ControlNet to synthesize view-consistent texture maps in a single\nfeed-forward inference step, thereby drastically decreasing the generation time\nas compared to existing methods. We demonstrate superior performance over\ncurrent SOTAs for learning 3D garment latent space, garment interpolation, and\ntext-driven texture synthesis, supported by quantitative evaluation and\nqualitative user study. The unposed 3D garment meshes generated using WordRobe\ncan be directly fed to standard cloth simulation & animation pipelines without\nany post-processing.\n","authors":["Astitva Srivastava","Pranav Manu","Amit Raj","Varun Jampani","Avinash Sharma"],"pdf_url":"https://arxiv.org/pdf/2403.17541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17537v1","updated":"2024-03-26T09:42:28Z","published":"2024-03-26T09:42:28Z","title":"NeRF-HuGS: Improved Neural Radiance Fields in Non-static Scenes Using\n Heuristics-Guided Segmentation","summary":" Neural Radiance Field (NeRF) has been widely recognized for its excellence in\nnovel view synthesis and 3D scene reconstruction. However, their effectiveness\nis inherently tied to the assumption of static scenes, rendering them\nsusceptible to undesirable artifacts when confronted with transient distractors\nsuch as moving objects or shadows. In this work, we propose a novel paradigm,\nnamely \"Heuristics-Guided Segmentation\" (HuGS), which significantly enhances\nthe separation of static scenes from transient distractors by harmoniously\ncombining the strengths of hand-crafted heuristics and state-of-the-art\nsegmentation models, thus significantly transcending the limitations of\nprevious solutions. 
Furthermore, we delve into the meticulous design of\nheuristics, introducing a seamless fusion of Structure-from-Motion (SfM)-based\nheuristics and color residual heuristics, catering to a diverse range of\ntexture profiles. Extensive experiments demonstrate the superiority and\nrobustness of our method in mitigating transient distractors for NeRFs trained\nin non-static scenes. Project page: https://cnhaox.github.io/NeRF-HuGS/.\n","authors":["Jiahao Chen","Yipeng Qin","Lingjie Liu","Jiangbo Lu","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2403.17537v1.pdf","comment":"To appear in CVPR2024"},{"id":"http://arxiv.org/abs/2403.17530v1","updated":"2024-03-26T09:36:20Z","published":"2024-03-26T09:36:20Z","title":"Boosting Few-Shot Learning with Disentangled Self-Supervised Learning\n and Meta-Learning for Medical Image Classification","summary":" Background and objective: Employing deep learning models in critical domains\nsuch as medical imaging poses challenges associated with the limited\navailability of training data. We present a strategy for improving the\nperformance and generalization capabilities of models trained in low-data\nregimes. Methods: The proposed method starts with a pre-training phase, where\nfeatures learned in a self-supervised learning setting are disentangled to\nimprove the robustness of the representations for downstream tasks. We then\nintroduce a meta-fine-tuning step, leveraging related classes between\nmeta-training and meta-testing phases but varying the granularity level. This\napproach aims to enhance the model's generalization capabilities by exposing it\nto more challenging classification tasks during meta-training and evaluating it\non easier tasks but holding greater clinical relevance during meta-testing. We\ndemonstrate the effectiveness of the proposed approach through a series of\nexperiments exploring several backbones, as well as diverse pre-training and\nfine-tuning schemes, on two distinct medical tasks, i.e., classification of\nprostate cancer aggressiveness from MRI data and classification of breast\ncancer malignity from microscopic images. Results: Our results indicate that\nthe proposed approach consistently yields superior performance w.r.t. ablation\nexperiments, maintaining competitiveness even when a distribution shift between\ntraining and evaluation data occurs. Conclusion: Extensive experiments\ndemonstrate the effectiveness and wide applicability of the proposed approach.\nWe hope that this work will add another solution to the arsenal of addressing\nlearning issues in data-scarce imaging domains.\n","authors":["Eva Pachetti","Sotirios A. Tsaftaris","Sara Colantonio"],"pdf_url":"https://arxiv.org/pdf/2403.17530v1.pdf","comment":"20 pages, 4 figures, 4 tables. Submitted to Elsevier on 25 March 2024"},{"id":"http://arxiv.org/abs/2207.12730v2","updated":"2024-03-26T09:35:03Z","published":"2022-07-26T08:34:17Z","title":"P2ANet: A Dataset and Benchmark for Dense Action Detection from Table\n Tennis Match Broadcasting Videos","summary":" While deep learning has been widely used for video analytics, such as video\nclassification and action detection, dense action detection with fast-moving\nsubjects from sports videos is still challenging. 
In this work, we release yet\nanother sports video benchmark \\TheName{} for \\emph{\\underline{P}}ing\n\\emph{\\underline{P}}ong-\\emph{\\underline{A}}ction detection, which consists of\n2,721 video clips collected from the broadcasting videos of professional table\ntennis matches in World Table Tennis Championships and Olympiads. We work with\na crew of table tennis professionals and referees on a specially designed\nannotation toolbox to obtain fine-grained action labels (in 14 classes) for\nevery ping-pong action that appeared in the dataset, and formulate two sets of\naction detection problems -- \\emph{action localization} and \\emph{action\nrecognition}. We evaluate a number of commonly-seen action recognition (e.g.,\nTSM, TSN, Video SwinTransformer, and Slowfast) and action localization models\n(e.g., BSN, BSN++, BMN, TCANet), using \\TheName{} for both problems, under\nvarious settings. These models can only achieve 48\\% area under the AR-AN curve\nfor localization and 82\\% top-one accuracy for recognition since the ping-pong\nactions are dense with fast-moving subjects but broadcasting videos run at\nonly 25 FPS. The results confirm that \\TheName{} is still a challenging task\nand can be used as a special benchmark for dense action detection from videos.\n","authors":["Jiang Bian","Xuhong Li","Tao Wang","Qingzhong Wang","Jun Huang","Chen Liu","Jun Zhao","Feixiang Lu","Dejing Dou","Haoyi Xiong"],"pdf_url":"https://arxiv.org/pdf/2207.12730v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12378v2","updated":"2024-03-26T09:31:28Z","published":"2023-09-21T11:47:01Z","title":"Unsupervised Semantic Segmentation Through Depth-Guided Feature\n Correlation and Sampling","summary":" Traditionally, training neural networks to perform semantic segmentation\nrequired expensive human-made annotations. But more recently, advances in the\nfield of unsupervised learning have made significant progress on this issue and\ntowards closing the gap to supervised algorithms. To achieve this, semantic\nknowledge is distilled by learning to correlate randomly sampled features from\nimages across an entire dataset. In this work, we build upon these advances by\nincorporating information about the structure of the scene into the training\nprocess through the use of depth information. We achieve this by (1) learning\ndepth-feature correlation by spatially correlating the feature maps with the\ndepth maps to induce knowledge about the structure of the scene and (2)\nimplementing farthest-point sampling to more effectively select relevant\nfeatures by utilizing 3D sampling techniques on depth information of the scene.\nFinally, we demonstrate the effectiveness of our technical contributions\nthrough extensive experimentation and present significant improvements in\nperformance across multiple benchmark datasets.\n","authors":["Leon Sick","Dominik Engel","Pedro Hermosilla","Timo Ropinski"],"pdf_url":"https://arxiv.org/pdf/2309.12378v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17525v1","updated":"2024-03-26T09:26:12Z","published":"2024-03-26T09:26:12Z","title":"Equipping Sketch Patches with Context-Aware Positional Encoding for\n Graphic Sketch Representation","summary":" The drawing order of a sketch records how it is created stroke-by-stroke by a\nhuman being. 
For graphic sketch representation learning, recent studies have\ninjected sketch drawing orders into graph edge construction by linking each\npatch to another in accordance to a temporal-based nearest neighboring\nstrategy. However, such constructed graph edges may be unreliable, since a\nsketch could have variants of drawings. In this paper, we propose a\nvariant-drawing-protected method by equipping sketch patches with context-aware\npositional encoding (PE) to make better use of drawing orders for learning\ngraphic sketch representation. Instead of injecting sketch drawings into graph\nedges, we embed these sequential information into graph nodes only. More\nspecifically, each patch embedding is equipped with a sinusoidal absolute PE to\nhighlight the sequential position in the drawing order. And its neighboring\npatches, ranked by the values of self-attention scores between patch\nembeddings, are equipped with learnable relative PEs to restore the contextual\npositions within a neighborhood. During message aggregation via graph\nconvolutional networks, a node receives both semantic contents from patch\nembeddings and contextual patterns from PEs by its neighbors, arriving at\ndrawing-order-enhanced sketch representations. Experimental results indicate\nthat our method significantly improves sketch healing and controllable sketch\nsynthesis.\n","authors":["Sicong Zang","Zhijun Fang"],"pdf_url":"https://arxiv.org/pdf/2403.17525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17520v1","updated":"2024-03-26T09:22:37Z","published":"2024-03-26T09:22:37Z","title":"Boosting Adversarial Training via Fisher-Rao Norm-based Regularization","summary":" Adversarial training is extensively utilized to improve the adversarial\nrobustness of deep neural networks. Yet, mitigating the degradation of standard\ngeneralization performance in adversarial-trained models remains an open\nproblem. This paper attempts to resolve this issue through the lens of model\ncomplexity. First, We leverage the Fisher-Rao norm, a geometrically invariant\nmetric for model complexity, to establish the non-trivial bounds of the\nCross-Entropy Loss-based Rademacher complexity for a ReLU-activated Multi-Layer\nPerceptron. Then we generalize a complexity-related variable, which is\nsensitive to the changes in model width and the trade-off factors in\nadversarial training. Moreover, intensive empirical evidence validates that\nthis variable highly correlates with the generalization gap of Cross-Entropy\nloss between adversarial-trained and standard-trained models, especially during\nthe initial and final phases of the training process. Building upon this\nobservation, we propose a novel regularization framework, called Logit-Oriented\nAdversarial Training (LOAT), which can mitigate the trade-off between\nrobustness and accuracy while imposing only a negligible increase in\ncomputational overhead. Our extensive experiments demonstrate that the proposed\nregularization strategy can boost the performance of the prevalent adversarial\ntraining algorithms, including PGD-AT, TRADES, TRADES (LSE), MART, and DM-AT,\nacross various network architectures. 
Our code will be available at\nhttps://github.com/TrustAI/LOAT.\n","authors":["Xiangyu Yin","Wenjie Ruan"],"pdf_url":"https://arxiv.org/pdf/2403.17520v1.pdf","comment":"This paper has been accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2312.04529v2","updated":"2024-03-26T09:21:29Z","published":"2023-12-07T18:50:00Z","title":"Diffusion Reflectance Map: Single-Image Stochastic Inverse Rendering of\n Illumination and Reflectance","summary":" Reflectance bounds the frequency spectrum of illumination in the object\nappearance. In this paper, we introduce the first stochastic inverse rendering\nmethod, which recovers the attenuated frequency spectrum of an illumination\njointly with the reflectance of an object of known geometry from a single\nimage. Our key idea is to solve this blind inverse problem in the reflectance\nmap, an appearance representation invariant to the underlying geometry, by\nlearning to reverse the image formation with a novel diffusion model which we\nrefer to as the Diffusion Reflectance Map Network (DRMNet). Given an observed\nreflectance map converted and completed from the single input image, DRMNet\ngenerates a reflectance map corresponding to a perfect mirror sphere while\njointly estimating the reflectance. The forward process can be understood as\ngradually filtering a natural illumination with lower and lower frequency\nreflectance and additive Gaussian noise. DRMNet learns to invert this process\nwith two subnetworks, IllNet and RefNet, which work in concert towards this\njoint estimation. The network is trained on an extensive synthetic dataset and\nis demonstrated to generalize to real images, showing state-of-the-art accuracy\non established datasets.\n","authors":["Yuto Enyo","Ko Nishino"],"pdf_url":"https://arxiv.org/pdf/2312.04529v2.pdf","comment":"to be published in CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17512v1","updated":"2024-03-26T09:13:06Z","published":"2024-03-26T09:13:06Z","title":"Random-coupled Neural Network","summary":" Improving the efficiency of current neural networks and modeling them after\nbiological neural systems have become popular research directions in recent\nyears. Pulse-coupled neural network (PCNN) is a widely applied model for\nimitating the computation characteristics of the human brain in computer vision\nand neural network fields. However, differences between the PCNN and biological\nneural systems remain: limited neural connection, high computational cost, and\nlack of stochastic property. In this study, random-coupled neural network\n(RCNN) is proposed. It overcomes these difficulties in PCNN's neuromorphic\ncomputing via a random inactivation process. This process randomly closes some\nneural connections in the RCNN model, realized by the random inactivation\nweight matrix of link input. This relieves the computational burden of PCNN,\nmaking it affordable to achieve vast neural connections. Furthermore, the image\nand video processing mechanisms of RCNN are researched. It encodes constant\nstimuli as periodic spike trains and periodic stimuli as chaotic spike trains,\nthe same as biological neural information encoding characteristics. Finally,\nthe RCNN is applied to image segmentation, fusion, and pulse shape\ndiscrimination subtasks. 
It is demonstrated to be robust, efficient, and highly\nanti-noised, with outstanding performance in all applications mentioned above.\n","authors":["Haoran Liu","Mingzhe Liu","Peng Li","Jiahui Wu","Xin Jiang","Zhuo Zuo","Bingqi Liu"],"pdf_url":"https://arxiv.org/pdf/2403.17512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13660v2","updated":"2024-03-26T09:09:15Z","published":"2024-03-20T15:08:57Z","title":"ProMamba: Prompt-Mamba for polyp segmentation","summary":" Detecting polyps through colonoscopy is an important task in medical image\nsegmentation, which provides significant assistance and reference value for\nclinical surgery. However, accurate segmentation of polyps is a challenging\ntask due to two main reasons. Firstly, polyps exhibit various shapes and\ncolors. Secondly, the boundaries between polyps and their normal surroundings\nare often unclear. Additionally, significant differences between different\ndatasets lead to limited generalization capabilities of existing methods. To\naddress these issues, we propose a segmentation model based on Prompt-Mamba,\nwhich incorporates the latest Vision-Mamba and prompt technologies. Compared to\nprevious models trained on the same dataset, our model not only maintains high\nsegmentation accuracy on the validation part of the same dataset but also\ndemonstrates superior accuracy on unseen datasets, exhibiting excellent\ngeneralization capabilities. Notably, we are the first to apply the\nVision-Mamba architecture to polyp segmentation and the first to utilize prompt\ntechnology in a polyp segmentation model. Our model efficiently accomplishes\nsegmentation tasks, surpassing previous state-of-the-art methods by an average\nof 5% across six datasets. Furthermore, we have developed multiple versions of\nour model with scaled parameter counts, achieving better performance than\nprevious models even with fewer parameters. Our code and trained weights will\nbe released soon.\n","authors":["Jianhao Xie","Ruofan Liao","Ziang Zhang","Sida Yi","Yuesheng Zhu","Guibo Luo"],"pdf_url":"https://arxiv.org/pdf/2403.13660v2.pdf","comment":"10 pages, 2 figures,3 tabels"},{"id":"http://arxiv.org/abs/2403.17503v1","updated":"2024-03-26T09:04:18Z","published":"2024-03-26T09:04:18Z","title":"DS-AL: A Dual-Stream Analytic Learning for Exemplar-Free\n Class-Incremental Learning","summary":" Class-incremental learning (CIL) under an exemplar-free constraint has\npresented a significant challenge. Existing methods adhering to this constraint\nare prone to catastrophic forgetting, far more so than replay-based techniques\nthat retain access to past samples. In this paper, to solve the exemplar-free\nCIL problem, we propose a Dual-Stream Analytic Learning (DS-AL) approach. The\nDS-AL contains a main stream offering an analytical (i.e., closed-form) linear\nsolution, and a compensation stream improving the inherent under-fitting\nlimitation due to adopting linear mapping. The main stream redefines the CIL\nproblem into a Concatenated Recursive Least Squares (C-RLS) task, allowing an\nequivalence between the CIL and its joint-learning counterpart. The\ncompensation stream is governed by a Dual-Activation Compensation (DAC) module.\nThis module re-activates the embedding with a different activation function\nfrom the main stream one, and seeks fitting compensation by projecting the\nembedding to the null space of the main stream's linear mapping. 
Empirical\nresults demonstrate that the DS-AL, despite being an exemplar-free technique,\ndelivers performance comparable with or better than that of replay-based\nmethods across various datasets, including CIFAR-100, ImageNet-100 and\nImageNet-Full. Additionally, the C-RLS' equivalent property allows the DS-AL to\nexecute CIL in a phase-invariant manner. This is evidenced by a\nnever-before-seen 500-phase CIL ImageNet task, which performs on a level\nidentical to a 5-phase one. Our codes are available at\nhttps://github.com/ZHUANGHP/Analytic-continual-learning.\n","authors":["Huiping Zhuang","Run He","Kai Tong","Ziqian Zeng","Cen Chen","Zhiping Lin"],"pdf_url":"https://arxiv.org/pdf/2403.17503v1.pdf","comment":"Accepted in AAAI 2024"},{"id":"http://arxiv.org/abs/2403.17502v1","updated":"2024-03-26T09:03:40Z","published":"2024-03-26T09:03:40Z","title":"SeNM-VAE: Semi-Supervised Noise Modeling with Hierarchical Variational\n Autoencoder","summary":" The data bottleneck has emerged as a fundamental challenge in learning based\nimage restoration methods. Researchers have attempted to generate synthesized\ntraining data using paired or unpaired samples to address this challenge. This\nstudy proposes SeNM-VAE, a semi-supervised noise modeling method that leverages\nboth paired and unpaired datasets to generate realistic degraded data. Our\napproach is based on modeling the conditional distribution of degraded and\nclean images with a specially designed graphical model. Under the variational\ninference framework, we develop an objective function for handling both paired\nand unpaired data. We employ our method to generate paired training samples for\nreal-world image denoising and super-resolution tasks. Our approach excels in\nthe quality of synthetic degraded images compared to other unpaired and paired\nnoise modeling methods. Furthermore, our approach demonstrates remarkable\nperformance in downstream image restoration tasks, even with limited paired\ndata. With more paired data, our method achieves the best performance on the\nSIDD dataset.\n","authors":["Dihan Zheng","Yihang Zou","Xiaowen Zhang","Chenglong Bao"],"pdf_url":"https://arxiv.org/pdf/2403.17502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17497v1","updated":"2024-03-26T08:58:28Z","published":"2024-03-26T08:58:28Z","title":"Sharing the Cost of Success: A Game for Evaluating and Learning\n Collaborative Multi-Agent Instruction Giving and Following Policies","summary":" In collaborative goal-oriented settings, the participants are not only\ninterested in achieving a successful outcome, but do also implicitly negotiate\nthe effort they put into the interaction (by adapting to each other). In this\nwork, we propose a challenging interactive reference game that requires two\nplayers to coordinate on vision and language observations. The learning signal\nin this game is a score (given after playing) that takes into account the\nachieved goal and the players' assumed efforts during the interaction. We show\nthat a standard Proximal Policy Optimization (PPO) setup achieves a high\nsuccess rate when bootstrapped with heuristic partner behaviors that implement\ninsights from the analysis of human-human interactions. And we find that a\npairing of neural partners indeed reduces the measured joint effort when\nplaying together repeatedly. 
However, we observe that in comparison to a\nreasonable heuristic pairing there is still room for improvement -- which\ninvites further research in the direction of cost-sharing in collaborative\ninteractions.\n","authors":["Philipp Sadler","Sherzod Hakimov","David Schlangen"],"pdf_url":"https://arxiv.org/pdf/2403.17497v1.pdf","comment":"9 pages, Accepted at LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2310.05370v2","updated":"2024-03-26T08:54:49Z","published":"2023-10-09T02:59:21Z","title":"SocialCircle: Learning the Angle-based Social Interaction Representation\n for Pedestrian Trajectory Prediction","summary":" Analyzing and forecasting trajectories of agents like pedestrians and cars in\ncomplex scenes has become more and more significant in many intelligent systems\nand applications. The diversity and uncertainty in socially interactive\nbehaviors among a rich variety of agents make this task more challenging than\nother deterministic computer vision tasks. Researchers have made a lot of\nefforts to quantify the effects of these interactions on future trajectories\nthrough different mathematical models and network structures, but this problem\nhas not been well solved. Inspired by marine animals that localize the\npositions of their companions underwater through echoes, we build a new\nangle-based trainable social interaction representation, named SocialCircle, for\ncontinuously reflecting the context of social interactions at different angular\norientations relative to the target agent. We validate the effect of the\nproposed SocialCircle by training it along with several newly released\ntrajectory prediction models, and experiments show that the SocialCircle not\nonly quantitatively improves the prediction performance, but also qualitatively\nhelps better simulate social interactions when forecasting pedestrian\ntrajectories in a way that is consistent with human intuitions.\n","authors":["Conghao Wong","Beihao Xia","Ziqian Zou","Yulong Wang","Xinge You"],"pdf_url":"https://arxiv.org/pdf/2310.05370v2.pdf","comment":"CVPR 2024 accepted"},{"id":"http://arxiv.org/abs/2403.17496v1","updated":"2024-03-26T08:53:25Z","published":"2024-03-26T08:53:25Z","title":"Dr.Hair: Reconstructing Scalp-Connected Hair Strands without\n Pre-training via Differentiable Rendering of Line Segments","summary":" In the film and gaming industries, achieving a realistic hair appearance\ntypically involves the use of strands originating from the scalp. However,\nreconstructing these strands from observed surface images of hair presents\nsignificant challenges. The difficulty in acquiring Ground Truth (GT) data has\nled state-of-the-art learning-based methods to rely on pre-training with\nmanually prepared synthetic CG data. This process is not only labor-intensive\nand costly but also introduces complications due to the domain gap when\ncompared to real-world data. In this study, we propose an optimization-based\napproach that eliminates the need for pre-training. Our method represents hair\nstrands as line segments growing from the scalp and optimizes them using a\nnovel differentiable rendering algorithm. To robustly optimize a substantial\nnumber of slender explicit geometries, we introduce 3D orientation estimation\nutilizing global optimization, strand initialization based on Laplace's\nequation, and reparameterization that leverages geometric connectivity and\nspatial proximity. 
Unlike existing optimization-based methods, our method is\ncapable of reconstructing internal hair flow in an absolute direction. Our\nmethod exhibits robust and accurate inverse rendering, surpassing the quality\nof existing methods and significantly improving processing speed.\n","authors":["Yusuke Takimoto","Hikari Takehara","Hiroyuki Sato","Zihao Zhu","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.17496v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.13039v2","updated":"2024-03-26T08:52:05Z","published":"2024-03-19T16:21:47Z","title":"Emotic Masked Autoencoder with Attention Fusion for Facial Expression\n Recognition","summary":" Facial Expression Recognition (FER) is a critical task within computer vision\nwith diverse applications across various domains. Addressing the challenge of\nlimited FER datasets, which hampers the generalization capability of expression\nrecognition models, is imperative for enhancing performance. Our paper presents\nan innovative approach integrating the MAE-Face self-supervised learning (SSL)\nmethod and Fusion Attention mechanism for expression classification,\nparticularly showcased in the 6th Affective Behavior Analysis in-the-wild\n(ABAW) competition. Additionally,\nwe propose preprocessing techniques to emphasize essential facial features,\nthereby enhancing model performance on both training and validation sets,\nnotably demonstrated on the Aff-wild2 dataset.\n","authors":["Bach Nguyen-Xuan","Thien Nguyen-Hoang","Nhu Tai-Do"],"pdf_url":"https://arxiv.org/pdf/2403.13039v2.pdf","comment":"6 pages; added references for section 1; corrected typo for email\n author"},{"id":"http://arxiv.org/abs/2403.13653v2","updated":"2024-03-26T08:45:09Z","published":"2024-03-20T14:58:40Z","title":"Learning User Embeddings from Human Gaze for Personalised Saliency\n Prediction","summary":" Reusable embeddings of user behaviour have shown significant performance\nimprovements for the personalised saliency prediction task. However, prior\nworks require explicit user characteristics and preferences as input, which are\noften difficult to obtain. We present a novel method to extract user embeddings\nfrom pairs of natural images and corresponding saliency maps generated from a\nsmall amount of user-specific eye tracking data. At the core of our method is a\nSiamese convolutional neural encoder that learns the user embeddings by\ncontrasting the image and personal saliency map pairs of different users.\nEvaluations on two public saliency datasets show that the generated embeddings\nhave high discriminative power, are effective at refining universal saliency\nmaps to the individual users, and generalise well across users and images.\nFinally, based on our model's ability to encode individual user\ncharacteristics, our work points towards other applications that can benefit\nfrom reusable embeddings of gaze behaviour.\n","authors":["Florian Strohm","Mihai Bâce","Andreas Bulling"],"pdf_url":"https://arxiv.org/pdf/2403.13653v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17726v2","updated":"2024-03-26T08:38:52Z","published":"2024-02-27T17:58:09Z","title":"VRP-SAM: SAM with Visual Reference Prompt","summary":" In this paper, we propose a novel Visual Reference Prompt (VRP) encoder that\nempowers the Segment Anything Model (SAM) to utilize annotated reference images\nas prompts for segmentation, creating the VRP-SAM model. 
In essence, VRP-SAM\ncan utilize annotated reference images to comprehend specific objects and\nperform segmentation of specific objects in the target image. Note that the\nVRP encoder can support a variety of annotation formats for reference images,\nincluding \\textbf{point}, \\textbf{box}, \\textbf{scribble}, and \\textbf{mask}.\nVRP-SAM achieves a breakthrough within the SAM framework by extending its\nversatility and applicability while preserving SAM's inherent strengths, thus\nenhancing user-friendliness. To enhance the generalization ability of VRP-SAM,\nthe VRP encoder adopts a meta-learning strategy. To validate the effectiveness\nof VRP-SAM, we conducted extensive empirical studies on the Pascal and COCO\ndatasets. Remarkably, VRP-SAM achieved state-of-the-art performance in visual\nreference segmentation with minimal learnable parameters. Furthermore, VRP-SAM\ndemonstrates strong generalization capabilities, allowing it to perform\nsegmentation of unseen objects and enabling cross-domain segmentation. The\nsource code and models will be available at\n\\url{https://github.com/syp2ysy/VRP-SAM}\n","authors":["Yanpeng Sun","Jiahui Chen","Shan Zhang","Xinyu Zhang","Qiang Chen","Gang Zhang","Errui Ding","Jingdong Wang","Zechao Li"],"pdf_url":"https://arxiv.org/pdf/2402.17726v2.pdf","comment":"Accepted by CVPR 2024; The camera-ready version"},{"id":"http://arxiv.org/abs/2403.13972v2","updated":"2024-03-26T08:34:16Z","published":"2024-03-20T20:47:53Z","title":"SeFFeC: Semantic Facial Feature Control for Fine-grained Face Editing","summary":" We propose Semantic Facial Feature Control (SeFFeC) - a novel method for\nfine-grained face shape editing. Our method enables the manipulation of\nhuman-understandable, semantic face features, such as nose length or mouth\nwidth, which are defined by different groups of facial landmarks. In contrast\nto existing methods, the use of facial landmarks enables precise measurement of\nthe facial features, which then enables training SeFFeC without any manually\nannotated labels. SeFFeC consists of a transformer-based encoder network that\ntakes a latent vector of a pre-trained generative model and a facial feature\nembedding as input, and learns to modify the latent vector to perform the\ndesired face edit operation. To ensure that the desired feature measurement is\nchanged towards the target value without altering uncorrelated features, we\nintroduced a novel semantic face feature loss. Qualitative and quantitative\nresults show that SeFFeC enables precise and fine-grained control of 23 facial\nfeatures, some of which could not previously be controlled by other methods,\nwithout requiring manual annotations. Unlike existing methods, SeFFeC also\nprovides deterministic control over the exact values of the facial features and\nmore localised and disentangled face edits.\n","authors":["Florian Strohm","Mihai Bâce","Markus Kaltenecker","Andreas Bulling"],"pdf_url":"https://arxiv.org/pdf/2403.13972v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17477v1","updated":"2024-03-26T08:13:02Z","published":"2024-03-26T08:13:02Z","title":"DiffGaze: A Diffusion Model for Continuous Gaze Sequence Generation on\n 360° Images","summary":" We present DiffGaze, a novel method for generating realistic and diverse\ncontinuous human gaze sequences on 360{\\deg} images based on a conditional\nscore-based denoising diffusion model. 
Generating human gaze on 360{\\deg}\nimages is important for various human-computer interaction and computer\ngraphics applications, e.g. for creating large-scale eye tracking datasets or\nfor realistic animation of virtual humans. However, existing methods are\nlimited to predicting discrete fixation sequences or aggregated saliency maps,\nthereby neglecting crucial parts of natural gaze behaviour. Our method uses\nfeatures extracted from 360{\\deg} images as the condition and uses two transformers\nto model the temporal and spatial dependencies of continuous human gaze. We\nevaluate DiffGaze on two 360{\\deg} image benchmarks for gaze sequence\ngeneration as well as scanpath prediction and saliency prediction. Our\nevaluations show that DiffGaze outperforms state-of-the-art methods on all\ntasks on both benchmarks. We also report a 21-participant user study showing\nthat our method generates gaze sequences that are indistinguishable from real\nhuman sequences.\n","authors":["Chuhan Jiao","Yao Wang","Guanhua Zhang","Mihai Bâce","Zhiming Hu","Andreas Bulling"],"pdf_url":"https://arxiv.org/pdf/2403.17477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.12036v3","updated":"2024-03-26T08:09:43Z","published":"2022-11-22T06:19:17Z","title":"Dual Prototype Attention for Unsupervised Video Object Segmentation","summary":" Unsupervised video object segmentation (VOS) aims to detect and segment the\nmost salient object in videos. The primary techniques used in unsupervised VOS\nare 1) the collaboration of appearance and motion information; and 2) temporal\nfusion between different frames. This paper proposes two novel prototype-based\nattention mechanisms, inter-modality attention (IMA) and inter-frame attention\n(IFA), to incorporate these techniques via dense propagation across different\nmodalities and frames. IMA densely integrates context information from\ndifferent modalities based on a mutual refinement. IFA injects global context\nof a video to the query frame, enabling a full utilization of useful properties\nfrom multiple frames. Experimental results on public benchmark datasets\ndemonstrate that our proposed approach outperforms all existing methods by a\nsubstantial margin. The proposed two components are also thoroughly validated\nvia an ablation study.\n","authors":["Suhwan Cho","Minhyeok Lee","Seunghoon Lee","Dogyoon Lee","Heeseung Choi","Ig-Jae Kim","Sangyoun Lee"],"pdf_url":"https://arxiv.org/pdf/2211.12036v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2311.09974v2","updated":"2024-03-26T08:04:00Z","published":"2023-11-16T15:47:49Z","title":"From Pretext to Purpose: Batch-Adaptive Self-Supervised Learning","summary":" In recent years, self-supervised contrastive learning has emerged as a\ndistinguished paradigm in the artificial intelligence landscape. It facilitates\nunsupervised feature learning through contrastive delineations at the instance\nlevel. However, crafting an effective self-supervised paradigm remains a\npivotal challenge within this field. This paper delves into two crucial factors\nimpacting self-supervised contrastive learning: batch size and pretext tasks, and\nfrom a data processing standpoint, proposes an adaptive technique of batch\nfusion. The proposed method, via dimensionality reduction and reconstruction of\nbatch data, enables formerly isolated individual data to partake in intra-batch\ncommunication through the Embedding Layer. Moreover, it adaptively amplifies\nthe self-supervised feature encoding capability as the training progresses. 
We\nconducted a linear classification test of this method based on the classic\ncontrastive learning framework on ImageNet-1k. The empirical findings\nillustrate that our approach achieves state-of-the-art performance under\nequitable comparisons. Benefiting from its \"plug-and-play\" characteristics, we\nfurther explored other contrastive learning methods. On the ImageNet-100,\ncompared to the original performance, the top1 has seen a maximum increase of\n1.25%. We suggest that the proposed method may contribute to the advancement of\ndata-driven self-supervised learning research, bringing a fresh perspective to\nthis community.\n","authors":["Jiansong Zhang","Linlin Shen","Peizhong Liu"],"pdf_url":"https://arxiv.org/pdf/2311.09974v2.pdf","comment":"14 pages, 2 figures, the code of this paper will be released soon"},{"id":"http://arxiv.org/abs/2311.16926v4","updated":"2024-03-26T07:55:24Z","published":"2023-11-28T16:31:27Z","title":"LLaFS: When Large Language Models Meet Few-Shot Segmentation","summary":" This paper proposes LLaFS, the first attempt to leverage large language\nmodels (LLMs) in few-shot segmentation. In contrast to the conventional\nfew-shot segmentation methods that only rely on the limited and biased\ninformation from the annotated support images, LLaFS leverages the vast prior\nknowledge gained by LLM as an effective supplement and directly uses the LLM to\nsegment images in a few-shot manner. To enable the text-based LLM to handle\nimage-related tasks, we carefully design an input instruction that allows the\nLLM to produce segmentation results represented as polygons, and propose a\nregion-attribute table to simulate the human visual mechanism and provide\nmulti-modal guidance. We also synthesize pseudo samples and use curriculum\nlearning for pretraining to augment data and achieve better optimization. LLaFS\nachieves state-of-the-art results on multiple datasets, showing the potential\nof using LLMs for few-shot computer vision tasks.\n","authors":["Lanyun Zhu","Tianrun Chen","Deyi Ji","Jieping Ye","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2311.16926v4.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2403.17465v1","updated":"2024-03-26T07:55:16Z","published":"2024-03-26T07:55:16Z","title":"LaRE^2: Latent Reconstruction Error Based Method for Diffusion-Generated\n Image Detection","summary":" The evolution of Diffusion Models has dramatically improved image generation\nquality, making it increasingly difficult to differentiate between real and\ngenerated images. This development, while impressive, also raises significant\nprivacy and security concerns. In response to this, we propose a novel Latent\nREconstruction error guided feature REfinement method (LaRE^2) for detecting\nthe diffusion-generated images. We come up with the Latent Reconstruction Error\n(LaRE), the first reconstruction-error based feature in the latent space for\ngenerated image detection. LaRE surpasses existing methods in terms of feature\nextraction efficiency while preserving crucial cues required to differentiate\nbetween the real and the fake. To exploit LaRE, we propose an Error-Guided\nfeature REfinement module (EGRE), which can refine the image feature guided by\nLaRE to enhance the discriminativeness of the feature. Our EGRE utilizes an\nalign-then-refine mechanism, which effectively refines the image feature for\ngenerated-image detection from both spatial and channel perspectives. 
Extensive\nexperiments on the large-scale GenImage benchmark demonstrate the superiority\nof our LaRE^2, which surpasses the best SoTA method by up to 11.9%/12.1%\naverage ACC/AP across 8 different image generators. LaRE also surpasses\nexisting methods in terms of feature extraction cost, delivering an impressive\nspeed enhancement of 8 times.\n","authors":["Yunpeng Luo","Junlong Du","Ke Yan","Shouhong Ding"],"pdf_url":"https://arxiv.org/pdf/2403.17465v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17460v1","updated":"2024-03-26T07:48:49Z","published":"2024-03-26T07:48:49Z","title":"Building Bridges across Spatial and Temporal Resolutions:\n Reference-Based Super-Resolution via Change Priors and Conditional Diffusion\n Model","summary":" Reference-based super-resolution (RefSR) has the potential to build bridges\nacross spatial and temporal resolutions of remote sensing images. However,\nexisting RefSR methods are limited by the faithfulness of content\nreconstruction and the effectiveness of texture transfer in large scaling\nfactors. Conditional diffusion models have opened up new opportunities for\ngenerating realistic high-resolution images, but effectively utilizing\nreference images within these models remains an area for further exploration.\nFurthermore, content fidelity is difficult to guarantee in areas without\nrelevant reference information. To solve these issues, we propose a\nchange-aware diffusion model named Ref-Diff for RefSR, using the land cover\nchange priors to guide the denoising process explicitly. Specifically, we\ninject the priors into the denoising model to improve the utilization of\nreference information in unchanged areas and regulate the reconstruction of\nsemantically relevant content in changed areas. With this powerful guidance, we\ndecouple the semantics-guided denoising and reference texture-guided denoising\nprocesses to improve the model performance. Extensive experiments demonstrate\nthe superior effectiveness and robustness of the proposed method compared with\nstate-of-the-art RefSR methods in both quantitative and qualitative\nevaluations. The code and data are available at\nhttps://github.com/dongrunmin/RefDiff.\n","authors":["Runmin Dong","Shuai Yuan","Bin Luo","Mengxuan Chen","Jinxiao Zhang","Lixian Zhang","Weijia Li","Juepeng Zheng","Haohuan Fu"],"pdf_url":"https://arxiv.org/pdf/2403.17460v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.14027v2","updated":"2024-03-26T07:47:20Z","published":"2024-03-20T22:52:34Z","title":"EcoSense: Energy-Efficient Intelligent Sensing for In-Shore Ship\n Detection through Edge-Cloud Collaboration","summary":" Detecting marine objects inshore presents challenges owing to algorithmic\nintricacies and complexities in system deployment. We propose a\ndifficulty-aware edge-cloud collaborative sensing system that splits the task\ninto object localization and fine-grained classification. Objects are\nclassified either at the edge or within the cloud, based on their estimated\ndifficulty. The framework comprises a low-power device-tailored front-end model\nfor object localization, classification, and difficulty estimation, along with\na transformer-graph convolutional network-based back-end model for fine-grained\nclassification. Our system demonstrates superior performance (mAP@0.5 +4.3%)\non widely used marine object detection datasets, significantly reducing both\ndata transmission volume (by 95.43%) and energy consumption (by 72.7%) at the\nsystem level. 
We validate the proposed system across various embedded system\nplatforms and in real-world scenarios involving drone deployment.\n","authors":["Wenjun Huang","Hanning Chen","Yang Ni","Arghavan Rezvani","Sanggeon Yun","Sungheon Jeon","Eric Pedley","Mohsen Imani"],"pdf_url":"https://arxiv.org/pdf/2403.14027v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.03180v5","updated":"2024-03-26T07:44:45Z","published":"2021-06-06T17:01:13Z","title":"Vision Transformers with Hierarchical Attention","summary":" This paper tackles the high computational/space complexity associated with\nMulti-Head Self-Attention (MHSA) in vanilla vision transformers. To this end,\nwe propose Hierarchical MHSA (H-MHSA), a novel approach that computes\nself-attention in a hierarchical fashion. Specifically, we first divide the\ninput image into patches as commonly done, and each patch is viewed as a token.\nThen, the proposed H-MHSA learns token relationships within local patches,\nserving as local relationship modeling. Then, the small patches are merged into\nlarger ones, and H-MHSA models the global dependencies for the small number of\nthe merged tokens. At last, the local and global attentive features are\naggregated to obtain features with powerful representation capacity. Since we\nonly calculate attention for a limited number of tokens at each step, the\ncomputational load is reduced dramatically. Hence, H-MHSA can efficiently model\nglobal relationships among tokens without sacrificing fine-grained information.\nWith the H-MHSA module incorporated, we build a family of\nHierarchical-Attention-based Transformer Networks, namely HAT-Net. To\ndemonstrate the superiority of HAT-Net in scene understanding, we conduct\nextensive experiments on fundamental vision tasks, including image\nclassification, semantic segmentation, object detection, and instance\nsegmentation. Therefore, HAT-Net provides a new perspective for vision\ntransformers. Code and pretrained models are available at\nhttps://github.com/yun-liu/HAT-Net.\n","authors":["Yun Liu","Yu-Huan Wu","Guolei Sun","Le Zhang","Ajad Chhatkuli","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2106.03180v5.pdf","comment":"Machine Intelligence Research (MIR), DOI: 10.1007/s11633-024-1393-8"},{"id":"http://arxiv.org/abs/2308.07728v5","updated":"2024-03-26T07:43:08Z","published":"2023-08-15T12:08:43Z","title":"Domain-Aware Fine-Tuning: Enhancing Neural Network Adaptability","summary":" Fine-tuning pre-trained neural network models has become a widely adopted\napproach across various domains. However, it can lead to the distortion of\npre-trained feature extractors that already possess strong generalization\ncapabilities. Mitigating feature distortion during adaptation to new target\ndomains is crucial. Recent studies have shown promising results in handling\nfeature distortion by aligning the head layer on in-distribution datasets\nbefore performing fine-tuning. Nonetheless, a significant limitation arises\nfrom the treatment of batch normalization layers during fine-tuning, leading to\nsuboptimal performance. In this paper, we propose Domain-Aware Fine-Tuning\n(DAFT), a novel approach that incorporates batch normalization conversion and\nthe integration of linear probing and fine-tuning. Our batch normalization\nconversion method effectively mitigates feature distortion by reducing\nmodifications to the neural network during fine-tuning. 
Additionally, we\nintroduce the integration of linear probing and fine-tuning to optimize the\nhead layer with gradual adaptation of the feature extractor. By leveraging\nbatch normalization layers and integrating linear probing and fine-tuning, our\nDAFT significantly mitigates feature distortion and achieves improved model\nperformance on both in-distribution and out-of-distribution datasets. Extensive\nexperiments demonstrate that our method outperforms other baseline methods,\ndemonstrating its effectiveness in not only improving performance but also\nmitigating feature distortion.\n","authors":["Seokhyeon Ha","Sunbeom Jung","Jungwoo Lee"],"pdf_url":"https://arxiv.org/pdf/2308.07728v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17447v1","updated":"2024-03-26T07:26:00Z","published":"2024-03-26T07:26:00Z","title":"Chain of Compression: A Systematic Approach to Combinationally Compress\n Convolutional Neural Networks","summary":" Convolutional neural networks (CNNs) have achieved significant popularity,\nbut their computational and memory intensity poses challenges for\nresource-constrained computing systems, particularly with the prerequisite of\nreal-time performance. To release this burden, model compression has become an\nimportant research focus. Many approaches like quantization, pruning, early\nexit, and knowledge distillation have demonstrated the effect of reducing\nredundancy in neural networks. Upon closer examination, it becomes apparent\nthat each approach capitalizes on its unique features to compress the neural\nnetwork, and they can also exhibit complementary behavior when combined. To\nexplore the interactions and reap the benefits from the complementary features,\nwe propose the Chain of Compression, which works on the combinational sequence\nto apply these common techniques to compress the neural network. Validated on\nthe image-based regression and classification networks across different data\nsets, our proposed Chain of Compression can significantly compress the\ncomputation cost by 100-1000 times with ignorable accuracy loss compared with\nthe baseline model.\n","authors":["Yingtao Shen","Minqing Sun","Jie Zhao","An Zou"],"pdf_url":"https://arxiv.org/pdf/2403.17447v1.pdf","comment":"10 pages, 15 figures"},{"id":"http://arxiv.org/abs/2306.07632v3","updated":"2024-03-26T07:00:27Z","published":"2023-06-13T09:02:57Z","title":"NeuS-PIR: Learning Relightable Neural Surface using Pre-Integrated\n Rendering","summary":" This paper presents a method, namely NeuS-PIR, for recovering relightable\nneural surfaces using pre-integrated rendering from multi-view images or video.\nUnlike methods based on NeRF and discrete meshes, our method utilizes implicit\nneural surface representation to reconstruct high-quality geometry, which\nfacilitates the factorization of the radiance field into two components: a\nspatially varying material field and an all-frequency lighting representation.\nThis factorization, jointly optimized using an adapted differentiable\npre-integrated rendering framework with material encoding regularization, in\nturn addresses the ambiguity of geometry reconstruction and leads to better\ndisentanglement and refinement of each scene property. Additionally, we\nintroduced a method to distil indirect illumination fields from the learned\nrepresentations, further recovering the complex illumination effect like\ninter-reflection. 
Consequently, our method enables advanced applications such\nas relighting, which can be seamlessly integrated with modern graphics engines.\nQualitative and quantitative experiments have shown that NeuS-PIR outperforms\nexisting methods across various tasks on both synthetic and real datasets.\nSource code is available at https://github.com/Sheldonmao/NeuSPIR\n","authors":["Shi Mao","Chenming Wu","Zhelun Shen","Yifan Wang","Dayan Wu","Liangjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.07632v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17432v1","updated":"2024-03-26T06:57:50Z","published":"2024-03-26T06:57:50Z","title":"Integrating Mamba Sequence Model and Hierarchical Upsampling Network for\n Accurate Semantic Segmentation of Multiple Sclerosis Legion","summary":" Integrating components from convolutional neural networks and state space\nmodels in medical image segmentation presents a compelling approach to enhance\naccuracy and efficiency. We introduce Mamba HUNet, a novel architecture\ntailored for robust and efficient segmentation tasks. Leveraging strengths from\nMamba UNet and the lighter version of Hierarchical Upsampling Network (HUNet),\nMamba HUNet combines convolutional neural networks' local feature extraction\npower with state space models' long-range dependency modeling capabilities. We\nfirst converted HUNet into a lighter version, maintaining performance parity,\nand then integrated this lighter HUNet into Mamba HUNet, further enhancing its\nefficiency. The architecture partitions input grayscale images into patches,\ntransforming them into 1D sequences for processing efficiency akin to Vision\nTransformers and Mamba models. Through Visual State Space blocks and patch\nmerging layers, hierarchical features are extracted while preserving spatial\ninformation. Experimental results on publicly available Magnetic Resonance\nImaging scans, notably in Multiple Sclerosis lesion segmentation, demonstrate\nMamba HUNet's effectiveness across diverse segmentation tasks. The model's\nrobustness and flexibility underscore its potential in handling complex\nanatomical structures. These findings establish Mamba HUNet as a promising\nsolution in advancing medical image segmentation, with implications for\nimproving clinical decision making processes.\n","authors":["Kazi Shahriar Sanjid","Md. Tanzim Hossain","Md. Shakib Shahariar Junayed","Dr. Mohammad Monir Uddin"],"pdf_url":"https://arxiv.org/pdf/2403.17432v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2403.17423v1","updated":"2024-03-26T06:40:03Z","published":"2024-03-26T06:40:03Z","title":"Test-time Adaptation Meets Image Enhancement: Improving Accuracy via\n Uncertainty-aware Logit Switching","summary":" Deep neural networks have achieved remarkable success in a variety of\ncomputer vision applications. However, accuracy degrades\nwhen the data distribution shifts between training and testing. As a solution\nto this problem, Test-time Adaptation~(TTA) has been well studied because of\nits practicality. Although TTA methods increase accuracy under distribution\nshift by updating the model at test time, using high-uncertainty predictions is\nknown to degrade accuracy. Since the input image is the root of the\ndistribution shift, we incorporate a new perspective on enhancing the input\nimage into TTA methods to reduce the prediction's uncertainty. We hypothesize\nthat enhancing the input image reduces the prediction's uncertainty and increases\nthe accuracy of TTA methods. 
On the basis of our hypothesis, we propose a novel\nmethod: Test-time Enhancer and Classifier Adaptation~(TECA). In TECA, the\nclassification model is combined with the image enhancement model that\ntransforms input images into recognition-friendly ones, and these models are\nupdated by existing TTA methods. Furthermore, we found that the prediction from\nthe enhanced image does not always have lower uncertainty than the prediction\nfrom the original image. Thus, we propose logit switching, which compares the\nuncertainty measure of these predictions and outputs the lower one. In our\nexperiments, we evaluate TECA with various TTA methods and show that TECA\nreduces prediction's uncertainty and increases accuracy of TTA methods despite\nhaving no hyperparameters and little parameter overhead.\n","authors":["Shohei Enomoto","Naoya Hasegawa","Kazuki Adachi","Taku Sasaki","Shin'ya Yamaguchi","Satoshi Suzuki","Takeharu Eda"],"pdf_url":"https://arxiv.org/pdf/2403.17423v1.pdf","comment":"Accepted to IJCNN2024"},{"id":"http://arxiv.org/abs/2403.16169v2","updated":"2024-03-26T06:39:30Z","published":"2024-03-24T14:24:13Z","title":"Gaze-guided Hand-Object Interaction Synthesis: Benchmark and Method","summary":" Gaze plays a crucial role in revealing human attention and intention,\nshedding light on the cognitive processes behind human actions. The integration\nof gaze guidance with the dynamics of hand-object interactions boosts the\naccuracy of human motion prediction. However, the lack of datasets that capture\nthe intricate relationship and consistency among gaze, hand, and object\nmovements remains a substantial hurdle. In this paper, we introduce the first\nGaze-guided Hand-Object Interaction dataset, GazeHOI, and present a novel task\nfor synthesizing gaze-guided hand-object interactions. Our dataset, GazeHOI,\nfeatures simultaneous 3D modeling of gaze, hand, and object interactions,\ncomprising 479 sequences with an average duration of 19.1 seconds, 812\nsub-sequences, and 33 objects of various sizes. We propose a hierarchical\nframework centered on a gaze-guided hand-object interaction diffusion model,\nnamed GHO-Diffusion. In the pre-diffusion phase, we separate gaze conditions\ninto spatial-temporal features and goal pose conditions at different levels of\ninformation granularity. During the diffusion phase, two gaze-conditioned\ndiffusion models are stacked to simplify the complex synthesis of hand-object\nmotions. Here, the object motion diffusion model generates sequences of object\nmotions based on gaze conditions, while the hand motion diffusion model\nproduces hand motions based on the generated object motion. To improve\nfine-grained goal pose alignment, we introduce a Spherical Gaussian constraint\nto guide the denoising step. In the subsequent post-diffusion phase, we\noptimize the generated hand motions using contact consistency. Our extensive\nexperiments highlight the uniqueness of our dataset and the effectiveness of\nour approach.\n","authors":["Jie Tian","Lingxiao Yang","Ran Ji","Yuexin Ma","Lan Xu","Jingyi Yu","Ye Shi","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16169v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17422v1","updated":"2024-03-26T06:35:55Z","published":"2024-03-26T06:35:55Z","title":"InterHandGen: Two-Hand Interaction Generation via Cascaded Reverse\n Diffusion","summary":" We present InterHandGen, a novel framework that learns the generative prior\nof two-hand interaction. 
Sampling from our model yields plausible and diverse\ntwo-hand shapes in close interaction with or without an object. Our prior can\nbe incorporated into any optimization or learning methods to reduce ambiguity\nin an ill-posed setup. Our key observation is that directly modeling the joint\ndistribution of multiple instances imposes high learning complexity due to its\ncombinatorial nature. Thus, we propose to decompose the modeling of joint\ndistribution into the modeling of factored unconditional and conditional single\ninstance distribution. In particular, we introduce a diffusion model that\nlearns the single-hand distribution unconditional and conditional to another\nhand via conditioning dropout. For sampling, we combine anti-penetration and\nclassifier-free guidance to enable plausible generation. Furthermore, we\nestablish the rigorous evaluation protocol of two-hand synthesis, where our\nmethod significantly outperforms baseline generative models in terms of\nplausibility and diversity. We also demonstrate that our diffusion prior can\nboost the performance of two-hand reconstruction from monocular in-the-wild\nimages, achieving new state-of-the-art accuracy.\n","authors":["Jihyun Lee","Shunsuke Saito","Giljoo Nam","Minhyuk Sung","Tae-Kyun Kim"],"pdf_url":"https://arxiv.org/pdf/2403.17422v1.pdf","comment":"Accepted to CVPR 2024, project page:\n https://jyunlee.github.io/projects/interhandgen/"},{"id":"http://arxiv.org/abs/2403.17420v1","updated":"2024-03-26T06:27:50Z","published":"2024-03-26T06:27:50Z","title":"Learning to Visually Localize Sound Sources from Mixtures without Prior\n Source Knowledge","summary":" The goal of the multi-sound source localization task is to localize sound\nsources from the mixture individually. While recent multi-sound source\nlocalization methods have shown improved performance, they face challenges due\nto their reliance on prior information about the number of objects to be\nseparated. In this paper, to overcome this limitation, we present a novel\nmulti-sound source localization method that can perform localization without\nprior knowledge of the number of sound sources. To achieve this goal, we\npropose an iterative object identification (IOI) module, which can recognize\nsound-making objects in an iterative manner. After finding the regions of\nsound-making objects, we devise object similarity-aware clustering (OSC) loss\nto guide the IOI module to effectively combine regions of the same object but\nalso distinguish between different objects and backgrounds. It enables our\nmethod to perform accurate localization of sound-making objects without any\nprior knowledge. Extensive experimental results on the MUSIC and VGGSound\nbenchmarks show the significant performance improvements of the proposed method\nover the existing methods for both single and multi-source. Our code is\navailable at: https://github.com/VisualAIKHU/NoPrior_MultiSSL\n","authors":["Dongjin Kim","Sung Jin Um","Sangmin Lee","Jung Uk Kim"],"pdf_url":"https://arxiv.org/pdf/2403.17420v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2312.09551v2","updated":"2024-03-26T06:05:17Z","published":"2023-12-15T06:04:42Z","title":"Learning-based Axial Video Motion Magnification","summary":" Video motion magnification amplifies invisible small motions to be\nperceptible, which provides humans with a spatially dense and holistic\nunderstanding of small motions in the scene of interest. 
This is based on the\npremise that magnifying small motions enhances the legibility of motions. In\nthe real world, however, vibrating objects often possess convoluted systems\nthat have complex natural frequencies, modes, and directions. Existing motion\nmagnification often fails to improve legibility since the intricate motions\nstill retain complex characteristics even after being magnified, which may\ndistract us from analyzing them. In this work, we focus on improving legibility\nby proposing a new concept, axial motion magnification, which magnifies\ndecomposed motions along the user-specified direction. Axial motion\nmagnification can be applied to various applications where motions of specific\naxes are critical, by providing simplified and easily readable motion\ninformation. To achieve this, we propose a novel Motion Separation Module that\nenables to disentangle and magnify the motion representation along axes of\ninterest. Furthermore, we build a new synthetic training dataset for the axial\nmotion magnification task. Our proposed method improves the legibility of\nresulting motions along certain axes by adding a new feature: user\ncontrollability. Axial motion magnification is a more generalized concept;\nthus, our method can be directly adapted to the generic motion magnification\nand achieves favorable performance against competing methods.\n","authors":["Kwon Byung-Ki","Oh Hyun-Bin","Kim Jun-Seong","Hyunwoo Ha","Tae-Hyun Oh"],"pdf_url":"https://arxiv.org/pdf/2312.09551v2.pdf","comment":"main paper: 12 pages, supplementary: 10 pages, 20 figures, 1 table"},{"id":"http://arxiv.org/abs/2403.17409v1","updated":"2024-03-26T06:04:50Z","published":"2024-03-26T06:04:50Z","title":"Neural Clustering based Visual Representation Learning","summary":" We investigate a fundamental aspect of machine vision: the measurement of\nfeatures, by revisiting clustering, one of the most classic approaches in\nmachine learning and data analysis. Existing visual feature extractors,\nincluding ConvNets, ViTs, and MLPs, represent an image as rectangular regions.\nThough prevalent, such a grid-style paradigm is built upon engineering practice\nand lacks explicit modeling of data distribution. In this work, we propose\nfeature extraction with clustering (FEC), a conceptually elegant yet\nsurprisingly ad-hoc interpretable neural clustering framework, which views\nfeature extraction as a process of selecting representatives from data and thus\nautomatically captures the underlying data distribution. Given an image, FEC\nalternates between grouping pixels into individual clusters to abstract\nrepresentatives and updating the deep features of pixels with current\nrepresentatives. Such an iterative working mechanism is implemented in the form\nof several neural layers and the final representatives can be used for\ndownstream tasks. The cluster assignments across layers, which can be viewed\nand inspected by humans, make the forward process of FEC fully transparent and\nempower it with promising ad-hoc interpretability. Extensive experiments on\nvarious visual recognition models and tasks verify the effectiveness,\ngenerality, and interpretability of FEC. We expect this work will provoke a\nrethink of the current de facto grid-style paradigm.\n","authors":["Guikun Chen","Xia Li","Yi Yang","Wenguan Wang"],"pdf_url":"https://arxiv.org/pdf/2403.17409v1.pdf","comment":"CVPR 2024. 
Code: https://github.com/guikunchen/FEC/"},{"id":"http://arxiv.org/abs/2403.17390v1","updated":"2024-03-26T05:19:15Z","published":"2024-03-26T05:19:15Z","title":"SSF3D: Strict Semi-Supervised 3D Object Detection with Switching Filter","summary":" SSF3D modified the semi-supervised 3D object detection (SS3DOD) framework,\nwhich designed specifically for point cloud data. Leveraging the\ncharacteristics of non-coincidence and weak correlation of target objects in\npoint cloud, we adopt a strategy of retaining only the truth-determining pseudo\nlabels and trimming the other fuzzy labels with points, instead of pursuing a\nbalance between the quantity and quality of pseudo labels. Besides, we notice\nthat changing the filter will make the model meet different distributed\ntargets, which is beneficial to break the training bottleneck. Two mechanism\nare introduced to achieve above ideas: strict threshold and filter switching.\nThe experiments are conducted to analyze the effectiveness of above approaches\nand their impact on the overall performance of the system. Evaluating on the\nKITTI dataset, SSF3D exhibits superior performance compared to the current\nstate-of-the-art methods. The code will be released here.\n","authors":["Songbur Wong"],"pdf_url":"https://arxiv.org/pdf/2403.17390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17387v1","updated":"2024-03-26T05:12:18Z","published":"2024-03-26T05:12:18Z","title":"Decoupled Pseudo-labeling for Semi-Supervised Monocular 3D Object\n Detection","summary":" We delve into pseudo-labeling for semi-supervised monocular 3D object\ndetection (SSM3OD) and discover two primary issues: a misalignment between the\nprediction quality of 3D and 2D attributes and the tendency of depth\nsupervision derived from pseudo-labels to be noisy, leading to significant\noptimization conflicts with other reliable forms of supervision. We introduce a\nnovel decoupled pseudo-labeling (DPL) approach for SSM3OD. Our approach\nfeatures a Decoupled Pseudo-label Generation (DPG) module, designed to\nefficiently generate pseudo-labels by separately processing 2D and 3D\nattributes. This module incorporates a unique homography-based method for\nidentifying dependable pseudo-labels in BEV space, specifically for 3D\nattributes. Additionally, we present a DepthGradient Projection (DGP) module to\nmitigate optimization conflicts caused by noisy depth supervision of\npseudo-labels, effectively decoupling the depth gradient and removing\nconflicting gradients. This dual decoupling strategy-at both the pseudo-label\ngeneration and gradient levels-significantly improves the utilization of\npseudo-labels in SSM3OD. Our comprehensive experiments on the KITTI benchmark\ndemonstrate the superiority of our method over existing approaches.\n","authors":["Jiacheng Zhang","Jiaming Li","Xiangru Lin","Wei Zhang","Xiao Tan","Junyu Han","Errui Ding","Jingdong Wang","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2403.17387v1.pdf","comment":"To appear in CVPR2024"},{"id":"http://arxiv.org/abs/2403.17377v1","updated":"2024-03-26T04:49:11Z","published":"2024-03-26T04:49:11Z","title":"Self-Rectifying Diffusion Sampling with Perturbed-Attention Guidance","summary":" Recent studies have demonstrated that diffusion models are capable of\ngenerating high-quality samples, but their quality heavily depends on sampling\nguidance techniques, such as classifier guidance (CG) and classifier-free\nguidance (CFG). 
These techniques are often not applicable in unconditional\ngeneration or in various downstream tasks such as image restoration. In this\npaper, we propose a novel sampling guidance, called Perturbed-Attention\nGuidance (PAG), which improves diffusion sample quality across both\nunconditional and conditional settings, achieving this without requiring\nadditional training or the integration of external modules. PAG is designed to\nprogressively enhance the structure of samples throughout the denoising\nprocess. It involves generating intermediate samples with degraded structure by\nsubstituting selected self-attention maps in diffusion U-Net with an identity\nmatrix, by considering the self-attention mechanisms' ability to capture\nstructural information, and guiding the denoising process away from these\ndegraded samples. In both ADM and Stable Diffusion, PAG surprisingly improves\nsample quality in conditional and even unconditional scenarios. Moreover, PAG\nsignificantly improves the baseline performance in various downstream tasks\nwhere existing guidances such as CG or CFG cannot be fully utilized, including\nControlNet with empty prompts and image restoration such as inpainting and\ndeblurring.\n","authors":["Donghoon Ahn","Hyoungwon Cho","Jaewon Min","Wooseok Jang","Jungwoo Kim","SeonHwa Kim","Hyun Hee Park","Kyong Hwan Jin","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2403.17377v1.pdf","comment":"Project page is available at\n https://ku-cvlab.github.io/Perturbed-Attention-Guidance"},{"id":"http://arxiv.org/abs/2403.17373v1","updated":"2024-03-26T04:27:56Z","published":"2024-03-26T04:27:56Z","title":"AIDE: An Automatic Data Engine for Object Detection in Autonomous\n Driving","summary":" Autonomous vehicle (AV) systems rely on robust perception models as a\ncornerstone of safety assurance. However, objects encountered on the road\nexhibit a long-tailed distribution, with rare or unseen categories posing\nchallenges to a deployed perception model. This necessitates an expensive\nprocess of continuously curating and annotating data with significant human\neffort. We propose to leverage recent advances in vision-language and large\nlanguage models to design an Automatic Data Engine (AIDE) that automatically\nidentifies issues, efficiently curates data, improves the model through\nauto-labeling, and verifies the model through generation of diverse scenarios.\nThis process operates iteratively, allowing for continuous self-improvement of\nthe model. We further establish a benchmark for open-world detection on AV\ndatasets to comprehensively evaluate various learning paradigms, demonstrating\nour method's superior performance at a reduced cost.\n","authors":["Mingfu Liang","Jong-Chyi Su","Samuel Schulter","Sparsh Garg","Shiyu Zhao","Ying Wu","Manmohan Chandraker"],"pdf_url":"https://arxiv.org/pdf/2403.17373v1.pdf","comment":"Accepted by CVPR-2024"},{"id":"http://arxiv.org/abs/2403.07636v2","updated":"2024-03-26T04:26:21Z","published":"2024-03-12T13:18:22Z","title":"Decomposing Disease Descriptions for Enhanced Pathology Detection: A\n Multi-Aspect Vision-Language Pre-training Framework","summary":" Medical vision language pre-training (VLP) has emerged as a frontier of\nresearch, enabling zero-shot pathological recognition by comparing the query\nimage with the textual descriptions for each disease. Due to the complex\nsemantics of biomedical texts, current methods struggle to align medical images\nwith key pathological findings in unstructured reports. 
This leads to the\nmisalignment with the target disease's textual representation. In this paper,\nwe introduce a novel VLP framework designed to dissect disease descriptions\ninto their fundamental aspects, leveraging prior knowledge about the visual\nmanifestations of pathologies. This is achieved by consulting a large language\nmodel and medical experts. Integrating a Transformer module, our approach\naligns an input image with the diverse elements of a disease, generating\naspect-centric image representations. By consolidating the matches from each\naspect, we improve the compatibility between an image and its associated\ndisease. Additionally, capitalizing on the aspect-oriented representations, we\npresent a dual-head Transformer tailored to process known and unknown diseases,\noptimizing the comprehensive detection efficacy. Conducting experiments on\nseven downstream datasets, ours improves the accuracy of recent methods by up\nto 8.56% and 17.0% for seen and unseen categories, respectively. Our code is\nreleased at https://github.com/HieuPhan33/MAVL.\n","authors":["Vu Minh Hieu Phan","Yutong Xie","Yuankai Qi","Lingqiao Liu","Liyang Liu","Bowen Zhang","Zhibin Liao","Qi Wu","Minh-Son To","Johan W. Verjans"],"pdf_url":"https://arxiv.org/pdf/2403.07636v2.pdf","comment":"Accepted at CVPR2024. Pre-print before final camera-ready version"},{"id":"http://arxiv.org/abs/2403.10518v2","updated":"2024-03-26T04:24:13Z","published":"2024-03-15T17:59:33Z","title":"Lodge: A Coarse to Fine Diffusion Network for Long Dance Generation\n Guided by the Characteristic Dance Primitives","summary":" We propose Lodge, a network capable of generating extremely long dance\nsequences conditioned on given music. We design Lodge as a two-stage coarse to\nfine diffusion architecture, and propose the characteristic dance primitives\nthat possess significant expressiveness as intermediate representations between\ntwo diffusion models. The first stage is global diffusion, which focuses on\ncomprehending the coarse-level music-dance correlation and production\ncharacteristic dance primitives. In contrast, the second-stage is the local\ndiffusion, which parallelly generates detailed motion sequences under the\nguidance of the dance primitives and choreographic rules. In addition, we\npropose a Foot Refine Block to optimize the contact between the feet and the\nground, enhancing the physical realism of the motion. Our approach can\nparallelly generate dance sequences of extremely long length, striking a\nbalance between global choreographic patterns and local motion quality and\nexpressiveness. Extensive experiments validate the efficacy of our method.\n","authors":["Ronghui Li","YuXiang Zhang","Yachao Zhang","Hongwen Zhang","Jie Guo","Yan Zhang","Yebin Liu","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2403.10518v2.pdf","comment":"Accepted by CVPR2024, Project page:\n https://li-ronghui.github.io/lodge"},{"id":"http://arxiv.org/abs/2403.16209v2","updated":"2024-03-26T04:22:02Z","published":"2024-03-24T16:08:10Z","title":"Image Captioning in news report scenario","summary":" Image captioning strives to generate pertinent captions for specified images,\nsituating itself at the crossroads of Computer Vision (CV) and Natural Language\nProcessing (NLP). 
This endeavor is of paramount importance with far-reaching\napplications in recommendation systems, news outlets, social media, and beyond.\nParticularly within the realm of news reporting, captions are expected to\nencompass detailed information, such as the identities of celebrities captured\nin the images. However, much of the existing body of work primarily centers\naround understanding scenes and actions. In this paper, we explore the realm of\nimage captioning specifically tailored for celebrity photographs, illustrating\nits broad potential for enhancing news industry practices. This exploration\naims to augment automated news content generation, thereby facilitating a more\nnuanced dissemination of information. Our endeavor shows a broader horizon,\nenriching the narrative in news reporting through a more intuitive image\ncaptioning framework.\n","authors":["Tianrui Liu","Qi Cai","Changxin Xu","Bo Hong","Jize Xiong","Yuxin Qiao","Tsungwei Yang"],"pdf_url":"https://arxiv.org/pdf/2403.16209v2.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2311.12342v3","updated":"2024-03-26T04:17:42Z","published":"2023-11-21T04:28:12Z","title":"LoCo: Locally Constrained Training-Free Layout-to-Image Synthesis","summary":" Recent text-to-image diffusion models have reached an unprecedented level in\ngenerating high-quality images. However, their exclusive reliance on textual\nprompts often falls short in precise control of image compositions. In this\npaper, we propose LoCo, a training-free approach for layout-to-image Synthesis\nthat excels in producing high-quality images aligned with both textual prompts\nand layout instructions. Specifically, we introduce a Localized Attention\nConstraint (LAC), leveraging semantic affinity between pixels in self-attention\nmaps to create precise representations of desired objects and effectively\nensure the accurate placement of objects in designated regions. We further\npropose a Padding Token Constraint (PTC) to leverage the semantic information\nembedded in previously neglected padding tokens, improving the consistency\nbetween object appearance and layout instructions. LoCo seamlessly integrates\ninto existing text-to-image and layout-to-image models, enhancing their\nperformance in spatial control and addressing semantic failures observed in\nprior methods. Extensive experiments showcase the superiority of our approach,\nsurpassing existing state-of-the-art training-free layout-to-image methods both\nqualitatively and quantitatively across multiple benchmarks.\n","authors":["Peiang Zhao","Han Li","Ruiyang Jin","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.12342v3.pdf","comment":"Demo: https://huggingface.co/spaces/Pusheen/LoCo; Project page:\n https://momopusheen.github.io/LoCo/"},{"id":"http://arxiv.org/abs/2309.07322v2","updated":"2024-03-26T04:17:40Z","published":"2023-09-13T21:21:50Z","title":"$\\texttt{NePhi}$: Neural Deformation Fields for Approximately\n Diffeomorphic Medical Image Registration","summary":" This work proposes NePhi, a generalizable neural deformation model which\nresults in approximately diffeomorphic transformations. In contrast to the\npredominant voxel-based transformation fields used in learning-based\nregistration approaches, NePhi represents deformations functionally, leading to\ngreat flexibility within the design space of memory consumption during training\nand inference, inference time, registration accuracy, as well as transformation\nregularity. 
Specifically, NePhi 1) requires less memory compared to voxel-based\nlearning approaches, 2) improves inference speed by predicting latent codes,\ncompared to current existing neural deformation based registration approaches\nthat \\emph{only} rely on optimization, 3) improves accuracy via instance\noptimization, and 4) shows excellent deformation regularity which is highly\ndesirable for medical image registration. We demonstrate the performance of\nNePhi on a 2D synthetic dataset as well as for real 3D lung registration. Our\nresults show that NePhi can match the accuracy of voxel-based representations\nin a single-resolution registration setting. For multi-resolution registration,\nour method matches the accuracy of current SOTA learning-based registration\napproaches with instance optimization while reducing memory requirements by a\nfactor of five.\n","authors":["Lin Tian","Hastings Greer","Raúl San José Estépar","Soumyadip Sengupta","Marc Niethammer"],"pdf_url":"https://arxiv.org/pdf/2309.07322v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19330v2","updated":"2024-03-26T04:15:53Z","published":"2024-02-29T16:33:12Z","title":"A Novel Approach to Industrial Defect Generation through Blended Latent\n Diffusion Model with Online Adaptation","summary":" Effectively addressing the challenge of industrial Anomaly Detection (AD)\nnecessitates an ample supply of defective samples, a constraint often hindered\nby their scarcity in industrial contexts. This paper introduces a novel\nalgorithm designed to augment defective samples, thereby enhancing AD\nperformance. The proposed method tailors the blended latent diffusion model for\ndefect sample generation, employing a diffusion model to generate defective\nsamples in the latent space. A feature editing process, controlled by a\n``trimap\" mask and text prompts, refines the generated samples. The image\ngeneration inference process is structured into three stages: a free diffusion\nstage, an editing diffusion stage, and an online decoder adaptation stage. This\nsophisticated inference strategy yields high-quality synthetic defective\nsamples with diverse pattern variations, leading to significantly improved AD\naccuracies based on the augmented training set. Specifically, on the widely\nrecognized MVTec AD dataset, the proposed method elevates the state-of-the-art\n(SOTA) performance of AD with augmented data by 1.5%, 1.9%, and 3.1% for AD\nmetrics AP, IAP, and IAP90, respectively. The implementation code of this work\ncan be found at the GitHub repository\nhttps://github.com/GrandpaXun242/AdaBLDM.git\n","authors":["Hanxi Li","Zhengxun Zhang","Hao Chen","Lin Wu","Bo Li","Deyin Liu","Mingwen Wang"],"pdf_url":"https://arxiv.org/pdf/2402.19330v2.pdf","comment":"13 pages,7 figures"},{"id":"http://arxiv.org/abs/2403.15931v2","updated":"2024-03-26T04:15:02Z","published":"2024-03-23T20:30:28Z","title":"X-Portrait: Expressive Portrait Animation with Hierarchical Motion\n Attention","summary":" We propose X-Portrait, an innovative conditional diffusion model tailored for\ngenerating expressive and temporally coherent portrait animation. Specifically,\ngiven a single portrait as appearance reference, we aim to animate it with\nmotion derived from a driving video, capturing both highly dynamic and subtle\nfacial expressions along with wide-range head movements. 
As its core, we\nleverage the generative prior of a pre-trained diffusion model as the rendering\nbackbone, while achieve fine-grained head pose and expression control with\nnovel controlling signals within the framework of ControlNet. In contrast to\nconventional coarse explicit controls such as facial landmarks, our motion\ncontrol module is learned to interpret the dynamics directly from the original\ndriving RGB inputs. The motion accuracy is further enhanced with a patch-based\nlocal control module that effectively enhance the motion attention to\nsmall-scale nuances like eyeball positions. Notably, to mitigate the identity\nleakage from the driving signals, we train our motion control modules with\nscaling-augmented cross-identity images, ensuring maximized disentanglement\nfrom the appearance reference modules. Experimental results demonstrate the\nuniversal effectiveness of X-Portrait across a diverse range of facial\nportraits and expressive driving sequences, and showcase its proficiency in\ngenerating captivating portrait animations with consistently maintained\nidentity characteristics.\n","authors":["You Xie","Hongyi Xu","Guoxian Song","Chao Wang","Yichun Shi","Linjie Luo"],"pdf_url":"https://arxiv.org/pdf/2403.15931v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17369v1","updated":"2024-03-26T04:09:08Z","published":"2024-03-26T04:09:08Z","title":"CoDA: Instructive Chain-of-Domain Adaptation with Severity-Aware Visual\n Prompt Tuning","summary":" Unsupervised Domain Adaptation (UDA) aims to adapt models from labeled source\ndomains to unlabeled target domains. When adapting to adverse scenes, existing\nUDA methods fail to perform well due to the lack of instructions, leading their\nmodels to overlook discrepancies within all adverse scenes. To tackle this, we\npropose CoDA which instructs models to distinguish, focus, and learn from these\ndiscrepancies at scene and image levels. Specifically, CoDA consists of a\nChain-of-Domain (CoD) strategy and a Severity-Aware Visual Prompt Tuning\n(SAVPT) mechanism. CoD focuses on scene-level instructions to divide all\nadverse scenes into easy and hard scenes, guiding models to adapt from source\nto easy domains with easy scene images, and then to hard domains with hard\nscene images, thereby laying a solid foundation for whole adaptations. Building\nupon this foundation, we employ SAVPT to dive into more detailed image-level\ninstructions to boost performance. SAVPT features a novel metric Severity that\ndivides all adverse scene images into low-severity and high-severity images.\nThen Severity directs visual prompts and adapters, instructing models to\nconcentrate on unified severity features instead of scene-specific features,\nwithout adding complexity to the model architecture. CoDA achieves SOTA\nperformances on widely-used benchmarks under all adverse scenes. Notably, CoDA\noutperforms the existing ones by 4.6%, and 10.3% mIoU on the Foggy Driving, and\nFoggy Zurich benchmarks, respectively. 
Our code is available at\nhttps://github.com/Cuzyoung/CoDA\n","authors":["Ziyang Gong","Fuhao Li","Yupeng Deng","Deblina Bhattacharjee","Xiangwei Zhu","Zhenming Ji"],"pdf_url":"https://arxiv.org/pdf/2403.17369v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16536v2","updated":"2024-03-26T03:56:34Z","published":"2024-03-25T08:26:42Z","title":"VMRNN: Integrating Vision Mamba and LSTM for Efficient and Accurate\n Spatiotemporal Forecasting","summary":" Combining CNNs or ViTs, with RNNs for spatiotemporal forecasting, has yielded\nunparalleled results in predicting temporal and spatial dynamics. However,\nmodeling extensive global information remains a formidable challenge; CNNs are\nlimited by their narrow receptive fields, and ViTs struggle with the intensive\ncomputational demands of their attention mechanisms. The emergence of recent\nMamba-based architectures has been met with enthusiasm for their exceptional\nlong-sequence modeling capabilities, surpassing established vision models in\nefficiency and accuracy, which motivates us to develop an innovative\narchitecture tailored for spatiotemporal forecasting. In this paper, we propose\nthe VMRNN cell, a new recurrent unit that integrates the strengths of Vision\nMamba blocks with LSTM. We construct a network centered on VMRNN cells to\ntackle spatiotemporal prediction tasks effectively. Our extensive evaluations\nshow that our proposed approach secures competitive results on a variety of\ntasks while maintaining a smaller model size. Our code is available at\nhttps://github.com/yyyujintang/VMRNN-PyTorch.\n","authors":["Yujin Tang","Peijie Dong","Zhenheng Tang","Xiaowen Chu","Junwei Liang"],"pdf_url":"https://arxiv.org/pdf/2403.16536v2.pdf","comment":"11 pages, 7 figures, 6 tables"},{"id":"http://arxiv.org/abs/2403.17360v1","updated":"2024-03-26T03:53:00Z","published":"2024-03-26T03:53:00Z","title":"Activity-Biometrics: Person Identification from Daily Activities","summary":" In this work, we study a novel problem which focuses on person identification\nwhile performing daily activities. Learning biometric features from RGB videos\nis challenging due to spatio-temporal complexity and presence of appearance\nbiases such as clothing color and background. We propose ABNet, a novel\nframework which leverages disentanglement of biometric and non-biometric\nfeatures to perform effective person identification from daily activities.\nABNet relies on a bias-less teacher to learn biometric features from RGB videos\nand explicitly disentangle non-biometric features with the help of biometric\ndistortion. In addition, ABNet also exploits activity prior for biometrics\nwhich is enabled by joint biometric and activity learning. We perform\ncomprehensive evaluation of the proposed approach across five different\ndatasets which are derived from existing activity recognition benchmarks.\nFurthermore, we extensively compare ABNet with existing works in person\nidentification and demonstrate its effectiveness for activity-based biometrics\nacross all five datasets. 
The code and dataset can be accessed at:\n\\url{https://github.com/sacrcv/Activity-Biometrics/}\n","authors":["Shehreen Azad","Yogesh Singh Rawat"],"pdf_url":"https://arxiv.org/pdf/2403.17360v1.pdf","comment":"CVPR 2024 Main conference"},{"id":"http://arxiv.org/abs/2312.06734v2","updated":"2024-03-26T03:52:48Z","published":"2023-12-11T11:26:32Z","title":"DiffCast: A Unified Framework via Residual Diffusion for Precipitation\n Nowcasting","summary":" Precipitation nowcasting is an important spatio-temporal prediction task to\npredict the radar echoes sequences based on current observations, which can\nserve both meteorological science and smart city applications. Due to the\nchaotic evolution nature of the precipitation systems, it is a very challenging\nproblem. Previous studies address the problem either from the perspectives of\ndeterministic modeling or probabilistic modeling. However, their predictions\nsuffer from the blurry, high-value echoes fading away and position inaccurate\nissues. The root reason of these issues is that the chaotic evolutionary\nprecipitation systems are not appropriately modeled. Inspired by the nature of\nthe systems, we propose to decompose and model them from the perspective of\nglobal deterministic motion and local stochastic variations with residual\nmechanism. A unified and flexible framework that can equip any type of\nspatio-temporal models is proposed based on residual diffusion, which\neffectively tackles the shortcomings of previous methods. Extensive\nexperimental results on four publicly available radar datasets demonstrate the\neffectiveness and superiority of the proposed framework, compared to\nstate-of-the-art techniques. Our code is publicly available at\nhttps://github.com/DeminYu98/DiffCast.\n","authors":["Demin Yu","Xutao Li","Yunming Ye","Baoquan Zhang","Chuyao Luo","Kuai Dai","Rui Wang","Xunlai Chen"],"pdf_url":"https://arxiv.org/pdf/2312.06734v2.pdf","comment":"CVPR 2024; https://github.com/DeminYu98/DiffCast"},{"id":"http://arxiv.org/abs/2303.02490v2","updated":"2024-03-26T03:41:26Z","published":"2023-03-04T20:08:57Z","title":"Diffusion Models Generate Images Like Painters: an Analytical Theory of\n Outline First, Details Later","summary":" How do diffusion generative models convert pure noise into meaningful images?\nIn a variety of pretrained diffusion models (including conditional latent space\nmodels like Stable Diffusion), we observe that the reverse diffusion process\nthat underlies image generation has the following properties: (i) individual\ntrajectories tend to be low-dimensional and resemble 2D `rotations'; (ii)\nhigh-variance scene features like layout tend to emerge earlier, while\nlow-variance details tend to emerge later; and (iii) early perturbations tend\nto have a greater impact on image content than later perturbations. To\nunderstand these phenomena, we derive and study a closed-form solution to the\nprobability flow ODE for a Gaussian distribution, which shows that the reverse\ndiffusion state rotates towards a gradually-specified target on the image\nmanifold. It also shows that generation involves first committing to an\noutline, and then to finer and finer details. We find that this solution\naccurately describes the initial phase of image generation for pretrained\nmodels, and can in principle be used to make image generation more efficient by\nskipping reverse diffusion steps. Finally, we use our solution to characterize\nthe image manifold in Stable Diffusion. 
Our viewpoint reveals an unexpected\nsimilarity between generation by GANs and diffusion and provides a conceptual\nlink between diffusion and image retrieval.\n","authors":["Binxu Wang","John J. Vastola"],"pdf_url":"https://arxiv.org/pdf/2303.02490v2.pdf","comment":"44 pages, 28 figures. A briefer version was presented at NeurIPS23\n Workshop on Diffusion Models [arXiv:2311.10892]"},{"id":"http://arxiv.org/abs/1902.00615v3","updated":"2024-03-26T03:40:54Z","published":"2019-02-02T01:52:53Z","title":"Confidence-Triggered Detection: Accelerating Real-time\n Tracking-by-detection Systems","summary":" Real-time object tracking necessitates a delicate balance between speed and\naccuracy, a challenge exacerbated by the computational demands of deep learning\nmethods. In this paper, we propose Confidence-Triggered Detection (CTD), an\ninnovative approach that strategically bypasses object detection for frames\nclosely resembling intermediate states, leveraging tracker confidence scores.\nCTD not only enhances tracking speed but also preserves accuracy, surpassing\nexisting tracking algorithms. Through extensive evaluation across various\ntracker confidence thresholds, we identify an optimal trade-off between\ntracking speed and accuracy, providing crucial insights for parameter\nfine-tuning and enhancing CTD's practicality in real-world scenarios. Our\nexperiments across diverse detection models underscore the robustness and\nversatility of the CTD framework, demonstrating its potential to enable\nreal-time tracking in resource-constrained environments.\n","authors":["Zhicheng Ding","Zhixin Lai","Siyang Li","Edward Wong"],"pdf_url":"https://arxiv.org/pdf/1902.00615v3.pdf","comment":"9 pages, 5 figures, 1 table"},{"id":"http://arxiv.org/abs/2403.17346v1","updated":"2024-03-26T03:10:45Z","published":"2024-03-26T03:10:45Z","title":"TRAM: Global Trajectory and Motion of 3D Humans from in-the-wild Videos","summary":" We propose TRAM, a two-stage method to reconstruct a human's global\ntrajectory and motion from in-the-wild videos. TRAM robustifies SLAM to recover\nthe camera motion in the presence of dynamic humans and uses the scene\nbackground to derive the motion scale. Using the recovered camera as a\nmetric-scale reference frame, we introduce a video transformer model (VIMO) to\nregress the kinematic body motion of a human. By composing the two motions, we\nachieve accurate recovery of 3D humans in the world space, reducing global\nmotion errors by 60% from prior work. https://yufu-wang.github.io/tram4d/\n","authors":["Yufu Wang","Ziyun Wang","Lingjie Liu","Kostas Daniilidis"],"pdf_url":"https://arxiv.org/pdf/2403.17346v1.pdf","comment":"The project website: https://yufu-wang.github.io/tram4d/"},{"id":"http://arxiv.org/abs/2303.15230v2","updated":"2024-03-26T03:07:56Z","published":"2023-03-27T14:10:26Z","title":"Troika: Multi-Path Cross-Modal Traction for Compositional Zero-Shot\n Learning","summary":" Recent compositional zero-shot learning (CZSL) methods adapt pre-trained\nvision-language models (VLMs) by constructing trainable prompts only for\ncomposed state-object pairs. Relying on learning the joint representation of\nseen compositions, these methods ignore the explicit modeling of the state and\nobject, thus limiting the exploitation of pre-trained knowledge and\ngeneralization to unseen compositions. 
With a particular focus on the\nuniversality of the solution, in this work, we propose a novel paradigm for\nCZSL models that establishes three identification branches (i.e., Multi-Path)\nto jointly model the state, object, and composition. The presented Troika is\nour implementation that aligns the branch-specific prompt representations with\ndecomposed visual features. To calibrate the bias between semantically similar\nmulti-modal representations, we further devise a Cross-Modal Traction module\ninto Troika that shifts the prompt representation towards the current visual\ncontent. We conduct extensive experiments on three popular benchmarks, where\nour method significantly outperforms existing methods in both closed-world and\nopen-world settings. The code will be available at\nhttps://github.com/bighuang624/Troika.\n","authors":["Siteng Huang","Biao Gong","Yutong Feng","Min Zhang","Yiliang Lv","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2303.15230v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17343v1","updated":"2024-03-26T03:05:20Z","published":"2024-03-26T03:05:20Z","title":"Language Models are Free Boosters for Biomedical Imaging Tasks","summary":" In this study, we uncover the unexpected efficacy of residual-based large\nlanguage models (LLMs) as part of encoders for biomedical imaging tasks, a\ndomain traditionally devoid of language or textual data. The approach diverges\nfrom established methodologies by utilizing a frozen transformer block,\nextracted from pre-trained LLMs, as an innovative encoder layer for the direct\nprocessing of visual tokens. This strategy represents a significant departure\nfrom the standard multi-modal vision-language frameworks, which typically hinge\non language-driven prompts and inputs. We found that these LLMs could boost\nperformance across a spectrum of biomedical imaging applications, including\nboth 2D and 3D visual classification tasks, serving as plug-and-play boosters.\nMore interestingly, as a byproduct, we found that the proposed framework\nachieved superior performance, setting new state-of-the-art results on\nextensive, standardized datasets in MedMNIST-2D and 3D. Through this work, we\naim to open new avenues for employing LLMs in biomedical imaging and enriching\nthe understanding of their potential in this specialized domain.\n","authors":["Zhixin Lai","Jing Wu","Suiyao Chen","Yucheng Zhou","Anna Hovakimyan","Naira Hovakimyan"],"pdf_url":"https://arxiv.org/pdf/2403.17343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17342v1","updated":"2024-03-26T03:03:50Z","published":"2024-03-26T03:03:50Z","title":"The Solution for the ICCV 2023 1st Scientific Figure Captioning\n Challenge","summary":" In this paper, we propose a solution for improving the quality of captions\ngenerated for figures in papers. We adopt the approach of summarizing the\ntextual content in the paper to generate image captions. Throughout our study,\nwe encounter discrepancies in the OCR information provided in the official\ndataset. To rectify this, we employ the PaddleOCR toolkit to extract OCR\ninformation from all images. Moreover, we observe that certain textual content\nin the official paper pertains to images that are not relevant for captioning,\nthereby introducing noise during caption generation. To mitigate this issue, we\nleverage LLaMA to extract image-specific information by querying the textual\ncontent based on image mentions, effectively filtering out extraneous\ninformation. 
Additionally, we recognize a discrepancy between the primary use\nof maximum likelihood estimation during text generation and the evaluation\nmetrics such as ROUGE employed to assess the quality of generated captions. To\nbridge this gap, we integrate the BRIO model framework, enabling a more\ncoherent alignment between the generation and evaluation processes. Our\napproach ranked first in the final test with a score of 4.49.\n","authors":["Dian Chao","Xin Song","Shupeng Zhong","Boyuan Wang","Xiangyu Wu","Chen Zhu","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2403.17342v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14518v2","updated":"2024-03-26T02:45:29Z","published":"2023-12-22T08:31:11Z","title":"Joint Learning Neuronal Skeleton and Brain Circuit Topology with\n Permutation Invariant Encoders for Neuron Classification","summary":" Determining the types of neurons within a nervous system plays a significant\nrole in the analysis of brain connectomics and the investigation of\nneurological diseases. However, the efficiency of utilizing anatomical,\nphysiological, or molecular characteristics of neurons is relatively low and\ncostly. With the advancements in electron microscopy imaging and analysis\ntechniques for brain tissue, we are able to obtain whole-brain connectome\nconsisting neuronal high-resolution morphology and connectivity information.\nHowever, few models are built based on such data for automated neuron\nclassification. In this paper, we propose NeuNet, a framework that combines\nmorphological information of neurons obtained from skeleton and topological\ninformation between neurons obtained from neural circuit. Specifically, NeuNet\nconsists of three components, namely Skeleton Encoder, Connectome Encoder, and\nReadout Layer. Skeleton Encoder integrates the local information of neurons in\na bottom-up manner, with a one-dimensional convolution in neural skeleton's\npoint data; Connectome Encoder uses a graph neural network to capture the\ntopological information of neural circuit; finally, Readout Layer fuses the\nabove two information and outputs classification results. We reprocess and\nrelease two new datasets for neuron classification task from volume electron\nmicroscopy(VEM) images of human brain cortex and Drosophila brain. Experiments\non these two datasets demonstrated the effectiveness of our model with accuracy\nof 0.9169 and 0.9363, respectively. Code and data are available at:\nhttps://github.com/WHUminghui/NeuNet.\n","authors":["Minghui Liao","Guojia Wan","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2312.14518v2.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2403.02981v2","updated":"2024-03-26T02:39:15Z","published":"2024-03-05T13:59:21Z","title":"Doubly Abductive Counterfactual Inference for Text-based Image Editing","summary":" We study text-based image editing (TBIE) of a single image by counterfactual\ninference because it is an elegant formulation to precisely address the\nrequirement: the edited image should retain the fidelity of the original one.\nThrough the lens of the formulation, we find that the crux of TBIE is that\nexisting techniques hardly achieve a good trade-off between editability and\nfidelity, mainly due to the overfitting of the single-image fine-tuning. To\nthis end, we propose a Doubly Abductive Counterfactual inference framework\n(DAC). We first parameterize an exogenous variable as a UNet LoRA, whose\nabduction can encode all the image details. 
Second, we abduct another exogenous\nvariable parameterized by a text encoder LoRA, which recovers the lost\neditability caused by the overfitted first abduction. Thanks to the second\nabduction, which exclusively encodes the visual transition from post-edit to\npre-edit, its inversion -- subtracting the LoRA -- effectively reverts pre-edit\nback to post-edit, thereby accomplishing the edit. Through extensive\nexperiments, our DAC achieves a good trade-off between editability and\nfidelity. Thus, we can support a wide spectrum of user editing intents,\nincluding addition, removal, manipulation, replacement, style transfer, and\nfacial change, which are extensively validated in both qualitative and\nquantitative evaluations. Codes are in https://github.com/xuesong39/DAC.\n","authors":["Xue Song","Jiequan Cui","Hanwang Zhang","Jingjing Chen","Richang Hong","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2403.02981v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17334v1","updated":"2024-03-26T02:34:48Z","published":"2024-03-26T02:34:48Z","title":"OVER-NAV: Elevating Iterative Vision-and-Language Navigation with\n Open-Vocabulary Detection and StructurEd Representation","summary":" Recent advances in Iterative Vision-and-Language Navigation (IVLN) introduce\na more meaningful and practical paradigm of VLN by maintaining the agent's\nmemory across tours of scenes. Although the long-term memory aligns better with\nthe persistent nature of the VLN task, it poses more challenges on how to\nutilize the highly unstructured navigation memory with extremely sparse\nsupervision. Towards this end, we propose OVER-NAV, which aims to go over and\nbeyond the current arts of IVLN techniques. In particular, we propose to\nincorporate LLMs and open-vocabulary detectors to distill key information and\nestablish correspondence between multi-modal signals. Such a mechanism\nintroduces reliable cross-modal supervision and enables on-the-fly\ngeneralization to unseen scenes without the need of extra annotation and\nre-training. To fully exploit the interpreted navigation data, we further\nintroduce a structured representation, coded Omnigraph, to effectively\nintegrate multi-modal information along the tour. Accompanied with a novel\nomnigraph fusion mechanism, OVER-NAV is able to extract the most relevant\nknowledge from omnigraph for a more accurate navigating action. In addition,\nOVER-NAV seamlessly supports both discrete and continuous environments under a\nunified framework. We demonstrate the superiority of OVER-NAV in extensive\nexperiments.\n","authors":["Ganlong Zhao","Guanbin Li","Weikai Chen","Yizhou Yu"],"pdf_url":"https://arxiv.org/pdf/2403.17334v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17332v1","updated":"2024-03-26T02:32:52Z","published":"2024-03-26T02:32:52Z","title":"Labeling subtypes in a Parkinson's Cohort using Multifeatures in MRI -\n Integrating Grey and White Matter Information","summary":" Thresholding of networks has long posed a challenge in brain connectivity\nanalysis. Weighted networks are typically binarized using threshold measures to\nfacilitate network analysis. Previous studies on MRI-based brain networks have\npredominantly utilized density or sparsity-based thresholding techniques,\noptimized within specific ranges derived from network metrics such as path\nlength, clustering coefficient, and small-world index. 
Thus, determination of a\nsingle threshold value for facilitating comparative analysis of networks\nremains elusive. To address this, our study introduces Mutual K-Nearest\nNeighbor (MKNN)-based thresholding for brain network analysis. Here, nearest\nneighbor selection is based on the highest correlation between features of\nbrain regions. Construction of brain networks was accomplished by computing\nPearson correlations between grey matter volume and white matter volume for\neach pair of brain regions. Structural MRI data from 180 Parkinsons patients\nand 70 controls from the NIMHANS, India were analyzed. Subtypes within\nParkinsons disease were identified based on grey and white matter volume\natrophy using source-based morphometric decomposition. The loading coefficients\nwere correlated with clinical features to discern clinical relationship with\nthe deciphered subtypes. Our data-mining approach revealed: Subtype A (N = 51,\nintermediate type), Subtype B (N = 57, mild-severe type with mild motor\nsymptoms), and Subtype AB (N = 36, most-severe type with predominance in motor\nimpairment). Subtype-specific weighted matrices were binarized using MKNN-based\nthresholding for brain network analysis. Permutation tests on network metrics\nof resulting bipartite graphs demonstrated significant group differences in\nbetweenness centrality and participation coefficient. The identified hubs were\nspecific to each subtype, with some hubs conserved across different subtypes.\n","authors":["Tanmayee Samantaray","Jitender Saini","Pramod Kumar Pal","Bithiah Grace Jaganathan","Vijaya V Saradhi","Gupta CN"],"pdf_url":"https://arxiv.org/pdf/2403.17332v1.pdf","comment":"31 pages, 10 figures, 3 tables"},{"id":"http://arxiv.org/abs/2403.17330v1","updated":"2024-03-26T02:28:49Z","published":"2024-03-26T02:28:49Z","title":"Staircase Localization for Autonomous Exploration in Urban Environments","summary":" A staircase localization method is proposed for robots to explore urban\nenvironments autonomously. The proposed method employs a modular design in the\nform of a cascade pipeline consisting of three modules of stair detection, line\nsegment detection, and stair localization modules. The stair detection module\nutilizes an object detection algorithm based on deep learning to generate a\nregion of interest (ROI). From the ROI, line segment features are extracted\nusing a deep line segment detection algorithm. The extracted line segments are\nused to localize a staircase in terms of position, orientation, and stair\ndirection. The stair detection and localization are performed only with a\nsingle RGB-D camera. Each component of the proposed pipeline does not need to\nbe designed particularly for staircases, which makes it easy to maintain the\nwhole pipeline and replace each component with state-of-the-art deep learning\ndetection techniques. 
The results of real-world experiments show that the\nproposed method can perform accurate stair detection and localization during\nautonomous exploration for various structured and unstructured upstairs and\ndownstairs with shadows, dirt, and occlusions by artificial and natural\nobjects.\n","authors":["Jinrae Kim","Sunggoo Jung","Sung-Kyun Kim","Youdan Kim","Ali-akbar Agha-mohammadi"],"pdf_url":"https://arxiv.org/pdf/2403.17330v1.pdf","comment":"9 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.16080v2","updated":"2024-03-26T02:25:58Z","published":"2024-03-24T10:06:40Z","title":"PKU-DyMVHumans: A Multi-View Video Benchmark for High-Fidelity Dynamic\n Human Modeling","summary":" High-quality human reconstruction and photo-realistic rendering of a dynamic\nscene is a long-standing problem in computer vision and graphics. Despite\nconsiderable efforts invested in developing various capture systems and\nreconstruction algorithms, recent advancements still struggle with loose or\noversized clothing and overly complex poses. In part, this is due to the\nchallenges of acquiring high-quality human datasets. To facilitate the\ndevelopment of these fields, in this paper, we present PKU-DyMVHumans, a\nversatile human-centric dataset for high-fidelity reconstruction and rendering\nof dynamic human scenarios from dense multi-view videos. It comprises 8.2\nmillion frames captured by more than 56 synchronized cameras across diverse\nscenarios. These sequences comprise 32 human subjects across 45 different\nscenarios, each with a high-detailed appearance and realistic human motion.\nInspired by recent advancements in neural radiance field (NeRF)-based scene\nrepresentations, we carefully set up an off-the-shelf framework that is easy to\nprovide those state-of-the-art NeRF-based implementations and benchmark on\nPKU-DyMVHumans dataset. It is paving the way for various applications like\nfine-grained foreground/background decomposition, high-quality human\nreconstruction and photo-realistic novel view synthesis of a dynamic scene.\nExtensive studies are performed on the benchmark, demonstrating new\nobservations and challenges that emerge from using such high-fidelity dynamic\ndata. The dataset is available at: https://pku-dymvhumans.github.io.\n","authors":["Xiaoyun Zheng","Liwei Liao","Xufeng Li","Jianbo Jiao","Rongjie Wang","Feng Gao","Shiqi Wang","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16080v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17327v1","updated":"2024-03-26T02:21:36Z","published":"2024-03-26T02:21:36Z","title":"Accuracy enhancement method for speech emotion recognition from\n spectrogram using temporal frequency correlation and positional information\n learning through knowledge transfer","summary":" In this paper, we propose a method to improve the accuracy of speech emotion\nrecognition (SER) by using vision transformer (ViT) to attend to the\ncorrelation of frequency (y-axis) with time (x-axis) in spectrogram and\ntransferring positional information between ViT through knowledge transfer. The\nproposed method has the following originality i) We use vertically segmented\npatches of log-Mel spectrogram to analyze the correlation of frequencies over\ntime. This type of patch allows us to correlate the most relevant frequencies\nfor a particular emotion with the time they were uttered. ii) We propose the\nuse of image coordinate encoding, an absolute positional encoding suitable for\nViT. 
By normalizing the x, y coordinates of the image to -1 to 1 and\nconcatenating them to the image, we can effectively provide valid absolute\npositional information for ViT. iii) Through feature map matching, the locality\nand location information of the teacher network is effectively transmitted to\nthe student network. Teacher network is a ViT that contains locality of\nconvolutional stem and absolute position information through image coordinate\nencoding, and student network is a structure that lacks positional encoding in\nthe basic ViT structure. In feature map matching stage, we train through the\nmean absolute error (L1 loss) to minimize the difference between the feature\nmaps of the two networks. To validate the proposed method, three emotion\ndatasets (SAVEE, EmoDB, and CREMA-D) consisting of speech were converted into\nlog-Mel spectrograms for comparison experiments. The experimental results show\nthat the proposed method significantly outperforms the state-of-the-art methods\nin terms of weighted accuracy while requiring significantly fewer floating\npoint operations (FLOPs). Overall, the proposed method offers an promising\nsolution for SER by providing improved efficiency and performance.\n","authors":["Jeong-Yoon Kim","Seung-Ho Lee"],"pdf_url":"https://arxiv.org/pdf/2403.17327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10474v3","updated":"2024-03-26T01:11:52Z","published":"2023-05-17T17:59:16Z","title":"Preserve Your Own Correlation: A Noise Prior for Video Diffusion Models","summary":" Despite tremendous progress in generating high-quality images using diffusion\nmodels, synthesizing a sequence of animated frames that are both photorealistic\nand temporally coherent is still in its infancy. While off-the-shelf\nbillion-scale datasets for image generation are available, collecting similar\nvideo data of the same scale is still challenging. Also, training a video\ndiffusion model is computationally much more expensive than its image\ncounterpart. In this work, we explore finetuning a pretrained image diffusion\nmodel with video data as a practical solution for the video synthesis task. We\nfind that naively extending the image noise prior to video noise prior in video\ndiffusion leads to sub-optimal performance. Our carefully designed video noise\nprior leads to substantially better performance. Extensive experimental\nvalidation shows that our model, Preserve Your Own Correlation (PYoCo), attains\nSOTA zero-shot text-to-video results on the UCF-101 and MSR-VTT benchmarks. It\nalso achieves SOTA video generation quality on the small-scale UCF-101\nbenchmark with a $10\\times$ smaller model using significantly less computation\nthan the prior art.\n","authors":["Songwei Ge","Seungjun Nah","Guilin Liu","Tyler Poon","Andrew Tao","Bryan Catanzaro","David Jacobs","Jia-Bin Huang","Ming-Yu Liu","Yogesh Balaji"],"pdf_url":"https://arxiv.org/pdf/2305.10474v3.pdf","comment":"ICCV 2023. Project webpage:\n https://research.nvidia.com/labs/dir/pyoco"},{"id":"http://arxiv.org/abs/2403.17301v1","updated":"2024-03-26T01:06:47Z","published":"2024-03-26T01:06:47Z","title":"Physical 3D Adversarial Attacks against Monocular Depth Estimation in\n Autonomous Driving","summary":" Deep learning-based monocular depth estimation (MDE), extensively applied in\nautonomous driving, is known to be vulnerable to adversarial attacks. 
Previous\nphysical attacks against MDE models rely on 2D adversarial patches, so they\nonly affect a small, localized region in the MDE map but fail under various\nviewpoints. To address these limitations, we propose 3D Depth Fool\n(3D$^2$Fool), the first 3D texture-based adversarial attack against MDE models.\n3D$^2$Fool is specifically optimized to generate 3D adversarial textures\nagnostic to model types of vehicles and to have improved robustness in bad\nweather conditions, such as rain and fog. Experimental results validate the\nsuperior performance of our 3D$^2$Fool across various scenarios, including\nvehicles, MDE models, weather conditions, and viewpoints. Real-world\nexperiments with printed 3D textures on physical vehicle models further\ndemonstrate that our 3D$^2$Fool can cause an MDE error of over 10 meters.\n","authors":["Junhao Zheng","Chenhao Lin","Jiahao Sun","Zhengyu Zhao","Qian Li","Chao Shen"],"pdf_url":"https://arxiv.org/pdf/2403.17301v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17293v1","updated":"2024-03-26T00:41:54Z","published":"2024-03-26T00:41:54Z","title":"Tracing and segmentation of molecular patterns in 3-dimensional\n cryo-et/em density maps through algorithmic image processing and deep\n learning-based techniques","summary":" Understanding the structures of biological macromolecules is highly important\nas they are closely associated with cellular functionalities. Comprehending the\nprecise organization of actin filaments is crucial because they form the dynamic\ncytoskeleton, which offers structural support to cells and connects the cell's\ninterior with its surroundings. However, determining the precise organization\nof actin filaments is challenging due to the poor quality of cryo-electron\ntomography (cryo-ET) images, which suffer from low signal-to-noise (SNR) ratios\nand the presence of a missing wedge, as well as diverse shape characteristics of\nactin filaments. To address these formidable challenges, the primary component\nof this dissertation focuses on developing sophisticated computational\ntechniques for tracing actin filaments. In particular, three novel\nmethodologies have been developed: i) BundleTrac, for tracing bundle-like actin\nfilaments found in Stereocilium, ii) Spaghetti Tracer, for tracing filaments\nthat move individually with loosely cohesive movements, and iii) Struwwel\nTracer, for tracing randomly orientated actin filaments in the actin network.\nThe second component of the dissertation introduces a convolutional neural\nnetwork (CNN) based segmentation model to determine the location of protein\nsecondary structures, such as helices and beta-sheets, in medium-resolution\n(5-10 Angstrom) 3-dimensional cryo-electron microscopy (cryo-EM) images. This\nmethodology later evolved into a tool named DeepSSETracer. The final component\nof the dissertation presents a novel algorithm, cylindrical fit measure, to\nestimate image structure match at helix regions in medium-resolution cryo-EM\nimages. 
Overall, my dissertation has made significant contributions to\naddressing critical research challenges in structural biology by introducing\nvarious computational methods and tools.\n","authors":["Salim Sazzed"],"pdf_url":"https://arxiv.org/pdf/2403.17293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18158v1","updated":"2024-03-26T23:47:17Z","published":"2024-03-26T23:47:17Z","title":"The Effects of Short Video-Sharing Services on Video Copy Detection","summary":" The short video-sharing services that allow users to post 10-30 second videos\n(e.g., YouTube Shorts and TikTok) have attracted a lot of attention in recent\nyears. However, conventional video copy detection (VCD) methods mainly focus on\ngeneral video-sharing services (e.g., YouTube and Bilibili), and the effects of\nshort video-sharing services on video copy detection are still unclear.\nConsidering that illegally copied videos in short video-sharing services have\nservice-distinctive characteristics, especially in their time lengths, the pros\nand cons of VCD in these services need to be analyzed. In this paper,\nwe examine the effects of short video-sharing services on VCD by constructing a\ndataset that has short video-sharing service characteristics. Our novel dataset\nis automatically constructed from the publicly available dataset to have\nreference videos and fixed short-time-length query videos, and such automation\nprocedures assure the reproducibility and data privacy preservation of this\npaper. From the experimental results focusing on segment-level and video-level\nsituations, we observe three effects: \"Segment-level VCD in short\nvideo-sharing services is more difficult than that in general video-sharing\nservices\", \"Video-level VCD in short video-sharing services is easier than\nthat in general video-sharing services\", and \"The video alignment component mainly\nsuppresses the detection performance in short video-sharing services\".\n","authors":["Rintaro Yanagi","Yamato Okamoto","Shuhei Yokoo","Shin'ichi Satoh"],"pdf_url":"https://arxiv.org/pdf/2403.18158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18151v1","updated":"2024-03-26T23:32:29Z","published":"2024-03-26T23:32:29Z","title":"Automated Report Generation for Lung Cytological Images Using a CNN\n Vision Classifier and Multiple-Transformer Text Decoders: Preliminary Study","summary":" Cytology plays a crucial role in lung cancer diagnosis. Pulmonary cytology\ninvolves cell morphological characterization in the specimen and reporting the\ncorresponding findings, which are extremely burdensome tasks. In this study, we\npropose a report-generation technique for lung cytology images. In total, 71\nbenign and 135 malignant pulmonary cytology specimens were collected. Patch\nimages were extracted from the captured specimen images, and the findings were\nassigned to each image as a dataset for report generation. The proposed method\nconsists of a vision model and a text decoder. In the former, a convolutional\nneural network (CNN) is used to classify a given image as benign or malignant,\nand the features related to the image are extracted from the intermediate\nlayer. Independent text decoders for benign and malignant cells are prepared\nfor text generation, and the text decoder switches according to the CNN\nclassification results. The text decoder is configured using a Transformer that\nuses the features obtained from the CNN for report generation. 
Based on the\nevaluation results, the sensitivity and specificity were 100% and 96.4%,\nrespectively, for automated benign and malignant case classification, and the\nsaliency map indicated characteristic benign and malignant areas. The grammar\nand style of the generated texts were confirmed as correct and in better\nagreement with gold standard compared to existing LLM-based image-captioning\nmethods and single-text-decoder ablation model. These results indicate that the\nproposed method is useful for pulmonary cytology classification and reporting.\n","authors":["Atsushi Teramoto","Ayano Michiba","Yuka Kiriyama","Tetsuya Tsukamoto","Kazuyoshi Imaizumi","Hiroshi Fujita"],"pdf_url":"https://arxiv.org/pdf/2403.18151v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2403.16335v2","updated":"2024-03-26T23:29:49Z","published":"2024-03-25T00:17:43Z","title":"MEDDAP: Medical Dataset Enhancement via Diversified Augmentation\n Pipeline","summary":" The effectiveness of Deep Neural Networks (DNNs) heavily relies on the\nabundance and accuracy of available training data. However, collecting and\nannotating data on a large scale is often both costly and time-intensive,\nparticularly in medical cases where practitioners are already occupied with\ntheir duties. Moreover, ensuring that the model remains robust across various\nscenarios of image capture is crucial in medical domains, especially when\ndealing with ultrasound images that vary based on the settings of different\ndevices and the manual operation of the transducer. To address this challenge,\nwe introduce a novel pipeline called MEDDAP, which leverages Stable Diffusion\n(SD) models to augment existing small datasets by automatically generating new\ninformative labeled samples. Pretrained checkpoints for SD are typically based\non natural images, and training them for medical images requires significant\nGPU resources due to their heavy parameters. To overcome this challenge, we\nintroduce USLoRA (Ultrasound Low-Rank Adaptation), a novel fine-tuning method\ntailored specifically for ultrasound applications. USLoRA allows for selective\nfine-tuning of weights within SD, requiring fewer than 0.1\\% of parameters\ncompared to fully fine-tuning only the UNet portion of SD. To enhance dataset\ndiversity, we incorporate different adjectives into the generation process\nprompts, thereby desensitizing the classifiers to intensity changes across\ndifferent images. This approach is inspired by clinicians' decision-making\nprocesses regarding breast tumors, where tumor shape often plays a more crucial\nrole than intensity. In conclusion, our pipeline not only outperforms\nclassifiers trained on the original dataset but also demonstrates superior\nperformance when encountering unseen datasets. 
The source code is available at\nhttps://github.com/yasamin-med/MEDDAP.\n","authors":["Yasamin Medghalchi","Niloufar Zakariaei","Arman Rahmim","Ilker Hacihaliloglu"],"pdf_url":"https://arxiv.org/pdf/2403.16335v2.pdf","comment":"Submitted to MICCAI 2024"},{"id":"http://arxiv.org/abs/2308.02396v2","updated":"2024-03-26T23:17:24Z","published":"2023-07-24T17:09:40Z","title":"HOOD: Real-Time Human Presence and Out-of-Distribution Detection Using\n FMCW Radar","summary":" Detecting human presence indoors with millimeter-wave frequency-modulated\ncontinuous-wave (FMCW) radar faces challenges from both moving and stationary\nclutter. This work proposes a robust and real-time capable human presence and\nout-of-distribution (OOD) detection method using 60 GHz short-range FMCW radar.\nHOOD solves the human presence and OOD detection problems simultaneously in a\nsingle pipeline. Our solution relies on a reconstruction-based architecture and\nworks with radar macro and micro range-Doppler images (RDIs). HOOD aims to\naccurately detect the presence of humans in the presence or absence of moving\nand stationary disturbers. Since HOOD is also an OOD detector, it aims to\ndetect moving or stationary clutters as OOD in humans' absence and predicts the\ncurrent scene's output as \"no presence.\" HOOD performs well in diverse\nscenarios, demonstrating its effectiveness across different human activities\nand situations. On our dataset collected with a 60 GHz short-range FMCW radar,\nwe achieve an average AUROC of 94.36%. Additionally, our extensive evaluations\nand experiments demonstrate that HOOD outperforms state-of-the-art (SOTA) OOD\ndetection methods in terms of common OOD detection metrics. Importantly, HOOD\nalso perfectly fits on Raspberry Pi 3B+ with an ARM Cortex-A53 CPU, which\nshowcases its versatility across different hardware environments. Videos of our\nhuman presence detection experiments are available at:\nhttps://muskahya.github.io/HOOD\n","authors":["Sabri Mustafa Kahya","Muhammet Sami Yavuz","Eckehard Steinbach"],"pdf_url":"https://arxiv.org/pdf/2308.02396v2.pdf","comment":"10 pages, 2 figures, project page: https://muskahya.github.io/HOOD"},{"id":"http://arxiv.org/abs/2403.18144v1","updated":"2024-03-26T23:05:24Z","published":"2024-03-26T23:05:24Z","title":"Leak and Learn: An Attacker's Cookbook to Train Using Leaked Data from\n Federated Learning","summary":" Federated learning is a decentralized learning paradigm introduced to\npreserve privacy of client data. Despite this, prior work has shown that an\nattacker at the server can still reconstruct the private training data using\nonly the client updates. These attacks are known as data reconstruction attacks\nand fall into two major categories: gradient inversion (GI) and linear layer\nleakage attacks (LLL). However, despite demonstrating the effectiveness of\nthese attacks in breaching privacy, prior work has not investigated the\nusefulness of the reconstructed data for downstream tasks. In this work, we\nexplore data reconstruction attacks through the lens of training and improving\nmodels with leaked data. We demonstrate the effectiveness of both GI and LLL\nattacks in maliciously training models using the leaked data more accurately\nthan a benign federated learning strategy. Counter-intuitively, this bump in\ntraining quality can occur despite limited reconstruction quality or a small\ntotal number of leaked images. 
Finally, we show the limitations of these\nattacks for downstream training, individually for GI attacks and for LLL\nattacks.\n","authors":["Joshua C. Zhao","Ahaan Dabholkar","Atul Sharma","Saurabh Bagchi"],"pdf_url":"https://arxiv.org/pdf/2403.18144v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2303.09618v2","updated":"2024-03-26T22:59:52Z","published":"2023-03-16T19:47:41Z","title":"HIVE: Harnessing Human Feedback for Instructional Visual Editing","summary":" Incorporating human feedback has been shown to be crucial to align text\ngenerated by large language models to human preferences. We hypothesize that\nstate-of-the-art instructional image editing models, where outputs are\ngenerated based on an input image and an editing instruction, could similarly\nbenefit from human feedback, as their outputs may not adhere to the correct\ninstructions and preferences of users. In this paper, we present a novel\nframework to harness human feedback for instructional visual editing (HIVE).\nSpecifically, we collect human feedback on the edited images and learn a reward\nfunction to capture the underlying user preferences. We then introduce scalable\ndiffusion model fine-tuning methods that can incorporate human preferences\nbased on the estimated reward. In addition, to mitigate the bias introduced by\ndata limitations, we contribute a new 1M training dataset, a 3.6K reward\ndataset for reward learning, and a 1K evaluation dataset to boost the\nperformance of instructional image editing. We conduct extensive empirical\nexperiments quantitatively and qualitatively, showing that HIVE is favored over\nprevious state-of-the-art instructional image editing approaches by a large\nmargin.\n","authors":["Shu Zhang","Xinyi Yang","Yihao Feng","Can Qin","Chia-Chih Chen","Ning Yu","Zeyuan Chen","Huan Wang","Silvio Savarese","Stefano Ermon","Caiming Xiong","Ran Xu"],"pdf_url":"https://arxiv.org/pdf/2303.09618v2.pdf","comment":"In CVPR, 2024"},{"id":"http://arxiv.org/abs/2403.18139v1","updated":"2024-03-26T22:50:36Z","published":"2024-03-26T22:50:36Z","title":"Pseudo-MRI-Guided PET Image Reconstruction Method Based on a Diffusion\n Probabilistic Model","summary":" Anatomically guided PET reconstruction using MRI information has been shown\nto have the potential to improve PET image quality. However, these improvements\nare limited to PET scans with paired MRI information. In this work we employed\na diffusion probabilistic model (DPM) to infer T1-weighted-MRI (deep-MRI)\nimages from FDG-PET brain images. We then use the DPM-generated T1w-MRI to\nguide the PET reconstruction. The model was trained with brain FDG scans, and\ntested in datasets containing multiple levels of counts. Deep-MRI images\nappeared somewhat more degraded than the acquired MRI images. Regarding PET image\nquality, volume of interest analysis in different brain regions showed that\nPET images reconstructed using both the acquired and the deep-MRI images\nimproved image quality compared to OSEM. The same conclusions were found when analysing\nthe decimated datasets. 
A subjective evaluation performed by two physicians\nconfirmed that OSEM scored consistently worse than the MRI-guided PET images,\nand that no significant differences were observed between the MRI-guided PET images.\nThis proof of concept shows that it is possible to infer DPM-based MRI imagery\nto guide the PET reconstruction, enabling the possibility of changing\nreconstruction parameters such as the strength of the prior on anatomically\nguided PET reconstruction in the absence of MRI.\n","authors":["Weijie Gan","Huidong Xie","Carl von Gall","Günther Platsch","Michael T. Jurkiewicz","Andrea Andrade","Udunna C. Anazodo","Ulugbek S. Kamilov","Hongyu An","Jorge Cabello"],"pdf_url":"https://arxiv.org/pdf/2403.18139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01973v3","updated":"2024-03-26T22:46:10Z","published":"2023-04-04T17:31:15Z","title":"ERM++: An Improved Baseline for Domain Generalization","summary":" Domain Generalization (DG) measures a classifier's ability to generalize to\nnew distributions of data it was not trained on. Recent work has shown that a\nhyperparameter-tuned Empirical Risk Minimization (ERM) training procedure, that\nis, simply minimizing the empirical risk on the source domains, can outperform\nmost existing DG methods. ERM has achieved such strong results while only\ntuning hyper-parameters such as learning rate, weight decay, batch size, and\ndropout. However, there are additional hyperparameters which further limit\noverfitting and catastrophic forgetting. We therefore focus on tuning\npreviously untuned hyper-parameters, including training amount, initialization,\nand additional regularizers. We call the resulting stronger baseline ERM++.\nERM++ improves the performance of DG by over 5% compared to prior ERM baselines\non a standard benchmark of 5 datasets with a ResNet-50 and over 15% with a\nViT-B/16, and outperforms all SOTA methods on DomainBed with both\narchitectures. We also explore the relationship between DG performance and\nsimilarity to pre-training data, and find that similarity to pre-training data\ndistributions is an important driver of performance, but that ERM++ with\nstronger initializations can deliver strong performance even on dissimilar\ndatasets. Code is released at https://github.com/piotr-teterwak/erm_plusplus.\n","authors":["Piotr Teterwak","Kuniaki Saito","Theodoros Tsiligkaridis","Kate Saenko","Bryan A. Plummer"],"pdf_url":"https://arxiv.org/pdf/2304.01973v3.pdf","comment":"An improved baseline for Domain Generalization"},{"id":"http://arxiv.org/abs/2403.13680v2","updated":"2024-03-26T22:45:20Z","published":"2024-03-20T15:38:53Z","title":"Step-Calibrated Diffusion for Biomedical Optical Image Restoration","summary":" High-quality, high-resolution medical imaging is essential for clinical care.\nRaman-based biomedical optical imaging uses non-ionizing infrared radiation to\nevaluate human tissues in real time and is used for early cancer detection,\nbrain tumor diagnosis, and intraoperative tissue analysis. Unfortunately,\noptical imaging is vulnerable to image degradation due to laser scattering and\nabsorption, which can result in diagnostic errors and misguided treatment.\nRestoration of optical images is a challenging computer vision task because the\nsources of image degradation are multi-factorial, stochastic, and\ntissue-dependent, preventing a straightforward method to obtain paired\nlow-quality/high-quality data. 
Here, we present Restorative Step-Calibrated\nDiffusion (RSCD), an unpaired image restoration method that views the image\nrestoration problem as completing the finishing steps of a diffusion-based\nimage generation task. RSCD uses a step calibrator model to dynamically\ndetermine the severity of image degradation and the number of steps required to\ncomplete the reverse diffusion process for image restoration. RSCD outperforms\nother widely used unpaired image restoration methods on both image quality and\nperceptual evaluation metrics for restoring optical images. Medical imaging\nexperts consistently prefer images restored using RSCD in blinded comparison\nexperiments and report minimal to no hallucinations. Finally, we show that RSCD\nimproves performance on downstream clinical imaging tasks, including automated\nbrain tumor diagnosis and deep tissue imaging. Our code is available at\nhttps://github.com/MLNeurosurg/restorative_step-calibrated_diffusion.\n","authors":["Yiwei Lyu","Sung Jik Cha","Cheng Jiang","Asadur Chowdury","Xinhai Hou","Edward Harake","Akhil Kondepudi","Christian Freudiger","Honglak Lee","Todd C. Hollon"],"pdf_url":"https://arxiv.org/pdf/2403.13680v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18134v1","updated":"2024-03-26T22:31:05Z","published":"2024-03-26T22:31:05Z","title":"Integrative Graph-Transformer Framework for Histopathology Whole Slide\n Image Representation and Classification","summary":" In digital pathology, the multiple instance learning (MIL) strategy is widely\nused in the weakly supervised histopathology whole slide image (WSI)\nclassification task where giga-pixel WSIs are only labeled at the slide level.\nHowever, existing attention-based MIL approaches often overlook contextual\ninformation and intrinsic spatial relationships between neighboring tissue\ntiles, while graph-based MIL frameworks have limited power to recognize the\nlong-range dependencies. In this paper, we introduce the integrative\ngraph-transformer framework that simultaneously captures the context-aware\nrelational features and global WSI representations through a novel Graph\nTransformer Integration (GTI) block. Specifically, each GTI block consists of a\nGraph Convolutional Network (GCN) layer modeling neighboring relations at the\nlocal instance level and an efficient global attention model capturing\ncomprehensive global information from extensive feature embeddings. Extensive\nexperiments on three publicly available WSI datasets: TCGA-NSCLC, TCGA-RCC and\nBRIGHT, demonstrate the superiority of our approach over current\nstate-of-the-art MIL methods, achieving an improvement of 1.0% to 2.6% in\naccuracy and 0.7%-1.6% in AUROC.\n","authors":["Zhan Shi","Jingwei Zhang","Jun Kong","Fusheng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18134v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18132v1","updated":"2024-03-26T22:26:39Z","published":"2024-03-26T22:26:39Z","title":"Recommendation of data-free class-incremental learning algorithms by\n simulating future data","summary":" Class-incremental learning deals with sequential data streams composed of\nbatches of classes. Various algorithms have been proposed to address the\nchallenging case where samples from past classes cannot be stored. However,\nselecting an appropriate algorithm for a user-defined setting is an open\nproblem, as the relative performance of these algorithms depends on the\nincremental settings. 
To solve this problem, we introduce an algorithm\nrecommendation method that simulates the future data stream. Given an initial\nset of classes, it leverages generative models to simulate future classes from\nthe same visual domain. We evaluate recent algorithms on the simulated stream\nand recommend the one which performs best in the user-defined incremental\nsetting. We illustrate the effectiveness of our method on three large datasets\nusing six algorithms and six incremental settings. Our method outperforms\ncompetitive baselines, and performance is close to that of an oracle choosing\nthe best algorithm in each setting. This work contributes to facilitate the\npractical deployment of incremental learning.\n","authors":["Eva Feillet","Adrian Popescu","Céline Hudelot"],"pdf_url":"https://arxiv.org/pdf/2403.18132v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16967v2","updated":"2024-03-26T22:00:27Z","published":"2024-03-25T17:26:08Z","title":"Visual Whole-Body Control for Legged Loco-Manipulation","summary":" We study the problem of mobile manipulation using legged robots equipped with\nan arm, namely legged loco-manipulation. The robot legs, while usually utilized\nfor mobility, offer an opportunity to amplify the manipulation capabilities by\nconducting whole-body control. That is, the robot can control the legs and the\narm at the same time to extend its workspace. We propose a framework that can\nconduct the whole-body control autonomously with visual observations. Our\napproach, namely Visual Whole-Body Control(VBC), is composed of a low-level\npolicy using all degrees of freedom to track the end-effector manipulator\nposition and a high-level policy proposing the end-effector position based on\nvisual inputs. We train both levels of policies in simulation and perform\nSim2Real transfer for real robot deployment. We perform extensive experiments\nand show significant improvements over baselines in picking up diverse objects\nin different configurations (heights, locations, orientations) and\nenvironments. Project page: https://wholebody-b1.github.io\n","authors":["Minghuan Liu","Zixuan Chen","Xuxin Cheng","Yandong Ji","Ruihan Yang","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16967v2.pdf","comment":"The first two authors contribute equally. Project page:\n https://wholebody-b1.github.io"},{"id":"http://arxiv.org/abs/2312.01629v2","updated":"2024-03-26T21:58:28Z","published":"2023-12-04T05:13:59Z","title":"CLAMP: Contrastive LAnguage Model Prompt-tuning","summary":" Large language models (LLMs) have emerged as powerful general-purpose\ninterfaces for many machine learning problems. Recent work has adapted LLMs to\ngenerative visual tasks like image captioning, visual question answering, and\nvisual chat, using a relatively small amount of instruction-tuning data. In\nthis paper, we explore whether modern LLMs can also be adapted to classifying\nan image into a set of categories. First, we evaluate multimodal LLMs that are\ntuned for generative tasks on zero-shot image classification and find that\ntheir performance is far below that of specialized models like CLIP. We then\npropose an approach for light fine-tuning of LLMs using the same contrastive\nimage-caption matching objective as CLIP. 
Our results show that LLMs can,\nindeed, achieve good image classification performance when adapted this way.\nOur approach beats state-of-the-art mLLMs by 13% and slightly outperforms\ncontrastive learning with a custom text model, while also retaining the LLM's\ngenerative abilities. LLM initialization appears to particularly help\nclassification in domains under-represented in the visual pre-training data.\n","authors":["Piotr Teterwak","Ximeng Sun","Bryan A. Plummer","Kate Saenko","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2312.01629v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18118v1","updated":"2024-03-26T21:48:27Z","published":"2024-03-26T21:48:27Z","title":"EgoLifter: Open-world 3D Segmentation for Egocentric Perception","summary":" In this paper we present EgoLifter, a novel system that can automatically\nsegment scenes captured from egocentric sensors into a complete decomposition\nof individual 3D objects. The system is specifically designed for egocentric\ndata where scenes contain hundreds of objects captured from natural\n(non-scanning) motion. EgoLifter adopts 3D Gaussians as the underlying\nrepresentation of 3D scenes and objects and uses segmentation masks from the\nSegment Anything Model (SAM) as weak supervision to learn flexible and\npromptable definitions of object instances free of any specific object\ntaxonomy. To handle the challenge of dynamic objects in ego-centric videos, we\ndesign a transient prediction module that learns to filter out dynamic objects\nin the 3D reconstruction. The result is a fully automatic pipeline that is able\nto reconstruct 3D object instances as collections of 3D Gaussians that\ncollectively compose the entire scene. We created a new benchmark on the Aria\nDigital Twin dataset that quantitatively demonstrates its state-of-the-art\nperformance in open-world 3D segmentation from natural egocentric input. We run\nEgoLifter on various egocentric activity datasets which shows the promise of\nthe method for 3D egocentric perception at scale.\n","authors":["Qiao Gu","Zhaoyang Lv","Duncan Frost","Simon Green","Julian Straub","Chris Sweeney"],"pdf_url":"https://arxiv.org/pdf/2403.18118v1.pdf","comment":"Preprint. Project page: https://egolifter.github.io/"},{"id":"http://arxiv.org/abs/2403.18117v1","updated":"2024-03-26T21:47:24Z","published":"2024-03-26T21:47:24Z","title":"TDIP: Tunable Deep Image Processing, a Real Time Melt Pool Monitoring\n Solution","summary":" In the era of Industry 4.0, Additive Manufacturing (AM), particularly metal\nAM, has emerged as a significant contributor due to its innovative and\ncost-effective approach to fabricate highly intricate geometries. Despite its\npotential, this industry still lacks real-time capable process monitoring\nalgorithms. Recent advancements in this field suggest that Melt Pool (MP)\nsignatures during the fabrication process contain crucial information about\nprocess dynamics and quality. To obtain this information, various sensory\napproaches, such as high-speed cameras-based vision modules are employed for\nonline fabrication monitoring. However, many conventional in-depth analyses\nstill cannot process all the recorded data simultaneously. Although\nconventional Image Processing (ImP) solutions provide a targeted tunable\napproach, they pose a trade-off between convergence certainty and convergence\nspeed. As a result, conventional methods are not suitable for a dynamically\nchanging application like MP monitoring. 
Therefore, this article proposes the\nimplementation of a Tunable Deep Image Processing (TDIP) method to address the\ndata-rich monitoring needs in real-time. The proposed model is first trained to\nreplicate an ImP algorithm with tunable features and methodology. The TDIP\nmodel is then further improved to account for MP geometries and fabrication\nquality based on the vision input and process parameters. The TDIP model\nachieved over 94% estimation accuracy with more than 96% R2 score for quality,\ngeometry, and MP signature estimation and isolation. The TDIP model can process\n500 images per second, while conventional methods take a few minutes per\nimage. This significant processing time reduction enables the integration of\nvision-based monitoring in real-time for processes and quality estimation.\n","authors":["Javid Akhavan","Youmna Mahmoud","Ke Xu","Jiaqi Lyu","Souran Manoochehri"],"pdf_url":"https://arxiv.org/pdf/2403.18117v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18116v1","updated":"2024-03-26T21:45:29Z","published":"2024-03-26T21:45:29Z","title":"QuakeSet: A Dataset and Low-Resource Models to Monitor Earthquakes\n through Sentinel-1","summary":" Earthquake monitoring is necessary to promptly identify the affected areas,\nthe severity of the events, and, finally, to estimate damages and plan the\nactions needed for the restoration process. The use of seismic stations to\nmonitor the strength and origin of earthquakes is limited when dealing with\nremote areas (we cannot have global capillary coverage). Identification and\nanalysis of all affected areas is mandatory to support areas not monitored by\ntraditional stations. Using social media images in crisis management has proven\neffective in various situations. However, they are still limited by the\npossibility of using communication infrastructures in case of an earthquake and\nby the presence of people in the area. Moreover, social media images and\nmessages cannot be used to estimate the actual severity of earthquakes and\ntheir characteristics effectively. The employment of satellites to monitor\nchanges around the globe grants the possibility of exploiting instrumentation\nthat is not limited by the visible spectrum, the presence of land\ninfrastructures, and people in the affected areas. In this work, we propose a\nnew dataset composed of images taken from Sentinel-1 and a new series of tasks\nto help monitor earthquakes from a new detailed view. Coupled with the data, we\nprovide a series of traditional machine learning and deep learning models as\nbaselines to assess the effectiveness of ML-based models in earthquake\nanalysis.\n","authors":["Daniele Rege Cambrin","Paolo Garza"],"pdf_url":"https://arxiv.org/pdf/2403.18116v1.pdf","comment":"Accepted at ISCRAM 2024"},{"id":"http://arxiv.org/abs/2311.02749v3","updated":"2024-03-26T21:42:34Z","published":"2023-11-05T19:59:36Z","title":"Fast Point Cloud to Mesh Reconstruction for Deformable Object Tracking","summary":" The world around us is full of soft objects we perceive and deform with\ndexterous hand movements. For a robotic hand to control soft objects, it has to\nacquire online state feedback of the deforming object. While RGB-D cameras can\ncollect occluded point clouds at a rate of 30Hz, this does not represent a\ncontinuously trackable object surface. 
Hence, in this work, we developed a\nmethod that takes as input a template mesh which is the mesh of an object in\nits non-deformed state and a deformed point cloud of the same object, and then\nshapes the template mesh such that it matches the deformed point cloud. The\nreconstruction of meshes from point clouds has long been studied in the field\nof Computer graphics under 3D reconstruction and 4D reconstruction, however,\nboth lack the speed and generalizability needed for robotics applications. Our\nmodel is designed using a point cloud auto-encoder and a Real-NVP architecture.\nOur trained model can perform mesh reconstruction and tracking at a rate of\n58Hz on a template mesh of 3000 vertices and a deformed point cloud of 5000\npoints and is generalizable to the deformations of six different object\ncategories which are assumed to be made of soft material in our experiments\n(scissors, hammer, foam brick, cleanser bottle, orange, and dice). The object\nmeshes are taken from the YCB benchmark dataset. An instance of a downstream\napplication can be the control algorithm for a robotic hand that requires\nonline feedback from the state of the manipulated object which would allow\nonline grasp adaptation in a closed-loop manner. Furthermore, the tracking\ncapacity of our method can help in the system identification of deforming\nobjects in a marker-free approach. In future work, we will extend our trained\nmodel to generalize beyond six object categories and additionally to real-world\ndeforming point clouds.\n","authors":["Elham Amin Mansour","Hehui Zheng","Robert K. Katzschmann"],"pdf_url":"https://arxiv.org/pdf/2311.02749v3.pdf","comment":"8 pages with appendix,16 figures"},{"id":"http://arxiv.org/abs/2403.18114v1","updated":"2024-03-26T21:37:25Z","published":"2024-03-26T21:37:25Z","title":"Segment Any Medical Model Extended","summary":" The Segment Anything Model (SAM) has drawn significant attention from\nresearchers who work on medical image segmentation because of its\ngeneralizability. However, researchers have found that SAM may have limited\nperformance on medical images compared to state-of-the-art non-foundation\nmodels. Regardless, the community sees potential in extending, fine-tuning,\nmodifying, and evaluating SAM for analysis of medical imaging. An increasing\nnumber of works have been published focusing on the mentioned four directions,\nwhere variants of SAM are proposed. To this end, a unified platform helps push\nthe boundary of the foundation model for medical images, facilitating the use,\nmodification, and validation of SAM and its variants in medical image\nsegmentation. In this work, we introduce SAMM Extended (SAMME), a platform that\nintegrates new SAM variant models, adopts faster communication protocols,\naccommodates new interactive modes, and allows for fine-tuning of subcomponents\nof the models. 
These features can expand the potential of foundation models\nlike SAM, and the results can be translated to applications such as\nimage-guided therapy, mixed reality interaction, robotic navigation, and data\naugmentation.\n","authors":["Yihao Liu","Jiaming Zhang","Andres Diaz-Pinto","Haowei Li","Alejandro Martin-Gomez","Amir Kheradmand","Mehran Armand"],"pdf_url":"https://arxiv.org/pdf/2403.18114v1.pdf","comment":"The content of the manuscript has been presented in SPIE Medical\n Imaging 2024, and had been accepted to appear in the proceedings of the\n conference"},{"id":"http://arxiv.org/abs/2312.02126v2","updated":"2024-03-26T21:20:57Z","published":"2023-12-04T18:53:24Z","title":"SplaTAM: Splat, Track & Map 3D Gaussians for Dense RGB-D SLAM","summary":" Dense simultaneous localization and mapping (SLAM) is crucial for robotics\nand augmented reality applications. However, current methods are often hampered\nby the non-volumetric or implicit way they represent a scene. This work\nintroduces SplaTAM, an approach that, for the first time, leverages explicit\nvolumetric representations, i.e., 3D Gaussians, to enable high-fidelity\nreconstruction from a single unposed RGB-D camera, surpassing the capabilities\nof existing methods. SplaTAM employs a simple online tracking and mapping\nsystem tailored to the underlying Gaussian representation. It utilizes a\nsilhouette mask to elegantly capture the presence of scene density. This\ncombination enables several benefits over prior representations, including fast\nrendering and dense optimization, quickly determining if areas have been\npreviously mapped, and structured map expansion by adding more Gaussians.\nExtensive experiments show that SplaTAM achieves up to 2x superior performance\nin camera pose estimation, map construction, and novel-view synthesis over\nexisting methods, paving the way for more immersive high-fidelity SLAM\napplications.\n","authors":["Nikhil Keetha","Jay Karhade","Krishna Murthy Jatavallabhula","Gengshan Yang","Sebastian Scherer","Deva Ramanan","Jonathon Luiten"],"pdf_url":"https://arxiv.org/pdf/2312.02126v2.pdf","comment":"CVPR 2024. Website: https://spla-tam.github.io/"},{"id":"http://arxiv.org/abs/2403.18104v1","updated":"2024-03-26T21:04:18Z","published":"2024-03-26T21:04:18Z","title":"Mathematical Foundation and Corrections for Full Range Head Pose\n Estimation","summary":" Numerous works concerning head pose estimation (HPE) offer algorithms or\nproposed neural network-based approaches for extracting Euler angles from\neither facial key points or directly from images of the head region. However,\nmany works failed to provide clear definitions of the coordinate systems and\nEuler or Tait-Bryan angles orders in use. It is a well-known fact that rotation\nmatrices depend on coordinate systems, and yaw, roll, and pitch angles are\nsensitive to their application order. Without precise definitions, it becomes\nchallenging to validate the correctness of the output head pose and drawing\nroutines employed in prior works. In this paper, we thoroughly examined the\nEuler angles defined in the 300W-LP dataset, head pose estimation such as\n3DDFA-v2, 6D-RepNet, WHENet, etc, and the validity of their drawing routines of\nthe Euler angles. When necessary, we infer their coordinate system and sequence\nof yaw, roll, pitch from provided code. 
This paper presents (1) code and\nalgorithms for inferring coordinate system from provided source code, code for\nEuler angle application order and extracting precise rotation matrices and the\nEuler angles, (2) code and algorithms for converting poses from one rotation\nsystem to another, (3) novel formulae for 2D augmentations of the rotation\nmatrices, and (4) derivations and code for the correct drawing routines for\nrotation matrices and poses. This paper also addresses the feasibility of\ndefining rotations with right-handed coordinate system in Wikipedia and SciPy,\nwhich makes the Euler angle extraction much easier for full-range head pose\nresearch.\n","authors":["Huei-Chung Hu","Xuyang Wu","Yuan Wang","Yi Fang","Hsin-Tai Wu"],"pdf_url":"https://arxiv.org/pdf/2403.18104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18103v1","updated":"2024-03-26T21:01:41Z","published":"2024-03-26T21:01:41Z","title":"Tutorial on Diffusion Models for Imaging and Vision","summary":" The astonishing growth of generative tools in recent years has empowered many\nexciting applications in text-to-image generation and text-to-video generation.\nThe underlying principle behind these generative tools is the concept of\ndiffusion, a particular sampling mechanism that has overcome some shortcomings\nthat were deemed difficult in the previous approaches. The goal of this\ntutorial is to discuss the essential ideas underlying the diffusion models. The\ntarget audience of this tutorial includes undergraduate and graduate students\nwho are interested in doing research on diffusion models or applying these\nmodels to solve other problems.\n","authors":["Stanley H. Chan"],"pdf_url":"https://arxiv.org/pdf/2403.18103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18096v1","updated":"2024-03-26T20:41:35Z","published":"2024-03-26T20:41:35Z","title":"Efficient Multi-Band Temporal Video Filter for Reducing Human-Robot\n Interaction","summary":" Although mobile robots have on-board sensors to perform navigation, their\nefficiency in completing paths can be enhanced by planning to avoid human\ninteraction. Infrastructure cameras can capture human activity continuously for\nthe purpose of compiling activity analytics to choose efficient times and\nroutes. We describe a cascade temporal filtering method to efficiently extract\nshort- and long-term activity in two time dimensions, isochronal and\nchronological, for use in global path planning and local navigation\nrespectively. The temporal filter has application either independently, or, if\nobject recognition is also required, it can be used as a pre-filter to perform\nactivity-gating of the more computationally expensive neural network\nprocessing. For a testbed 32-camera network, we show how this hybrid approach\ncan achieve over 8 times improvement in frames per second throughput and 6.5\ntimes reduction of system power use. We also show how the cost map of static\nobjects in the ROS robot software development framework is augmented with\ndynamic regions determined from the temporal filter.\n","authors":["Lawrence O'Gorman"],"pdf_url":"https://arxiv.org/pdf/2403.18096v1.pdf","comment":"15 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2403.18094v1","updated":"2024-03-26T20:30:55Z","published":"2024-03-26T20:30:55Z","title":"A Personalized Video-Based Hand Taxonomy: Application for Individuals\n with Spinal Cord Injury","summary":" Hand function is critical for our interactions and quality of life. 
Spinal\ncord injuries (SCI) can impair hand function, reducing independence. A\ncomprehensive evaluation of function in home and community settings requires a\nhand grasp taxonomy for individuals with impaired hand function. Developing\nsuch a taxonomy is challenging due to unrepresented grasp types in standard\ntaxonomies, uneven data distribution across injury levels, and limited data.\nThis study aims to automatically identify the dominant distinct hand grasps in\negocentric video using semantic clustering. Egocentric video recordings\ncollected in the homes of 19 individuals with cervical SCI were used to cluster\ngrasping actions with semantic significance. A deep learning model integrating\nposture and appearance data was employed to create a personalized hand\ntaxonomy. Quantitative analysis reveals a cluster purity of 67.6% +- 24.2%\nwith 18.0% +- 21.8% redundancy. Qualitative assessment revealed meaningful\nclusters in video content. This methodology provides a flexible and effective\nstrategy to analyze hand function in the wild. It offers researchers and\nclinicians an efficient tool for evaluating hand function, aiding sensitive\nassessments and tailored intervention plans.\n","authors":["Mehdy Dousty","David J. Fleet","José Zariffa"],"pdf_url":"https://arxiv.org/pdf/2403.18094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18092v1","updated":"2024-03-26T20:23:48Z","published":"2024-03-26T20:23:48Z","title":"OCAI: Improving Optical Flow Estimation by Occlusion and Consistency\n Aware Interpolation","summary":" The scarcity of ground-truth labels poses one major challenge in developing\noptical flow estimation models that are both generalizable and robust. While\ncurrent methods rely on data augmentation, they have yet to fully exploit the\nrich information available in labeled video sequences. We propose OCAI, a\nmethod that supports robust frame interpolation by generating intermediate\nvideo frames alongside optical flows in between. Utilizing a forward warping\napproach, OCAI employs occlusion awareness to resolve ambiguities in pixel\nvalues and fills in missing values by leveraging the forward-backward\nconsistency of optical flows. Additionally, we introduce a teacher-student\nstyle semi-supervised learning method on top of the interpolated frames. Using\na pair of unlabeled frames and the teacher model's predicted optical flow, we\ngenerate interpolated frames and flows to train a student model. The teacher's\nweights are maintained using Exponential Moving Averaging of the student. Our\nevaluations demonstrate perceptually superior interpolation quality and\nenhanced optical flow accuracy on established benchmarks such as Sintel and\nKITTI.\n","authors":["Jisoo Jeong","Hong Cai","Risheek Garrepalli","Jamie Menjay Lin","Munawar Hayat","Fatih Porikli"],"pdf_url":"https://arxiv.org/pdf/2403.18092v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18080v1","updated":"2024-03-26T20:02:48Z","published":"2024-03-26T20:02:48Z","title":"EgoPoseFormer: A Simple Baseline for Egocentric 3D Human Pose Estimation","summary":" We present EgoPoseFormer, a simple yet effective transformer-based model for\nstereo egocentric human pose estimation. The main challenge in egocentric pose\nestimation is overcoming joint invisibility, which is caused by self-occlusion\nor a limited field of view (FOV) of head-mounted cameras. 
Our approach\novercomes this challenge by incorporating a two-stage pose estimation paradigm:\nin the first stage, our model leverages the global information to estimate each\njoint's coarse location, then in the second stage, it employs a DETR style\ntransformer to refine the coarse locations by exploiting fine-grained stereo\nvisual features. In addition, we present a deformable stereo operation to\nenable our transformer to effectively process multi-view features, which\nenables it to accurately localize each joint in the 3D world. We evaluate our\nmethod on the stereo UnrealEgo dataset and show it significantly outperforms\nprevious approaches while being computationally efficient: it improves MPJPE by\n27.4mm (45% improvement) with only 7.9% model parameters and 13.1% FLOPs\ncompared to the state-of-the-art. Surprisingly, with proper training\ntechniques, we find that even our first-stage pose proposal network can achieve\nsuperior performance compared to previous arts. We also show that our method\ncan be seamlessly extended to monocular settings, which achieves\nstate-of-the-art performance on the SceneEgo dataset, improving MPJPE by 25.5mm\n(21% improvement) compared to the best existing method with only 60.7% model\nparameters and 36.4% FLOPs.\n","authors":["Chenhongyi Yang","Anastasia Tkach","Shreyas Hampali","Linguang Zhang","Elliot J. Crowley","Cem Keskin"],"pdf_url":"https://arxiv.org/pdf/2403.18080v1.pdf","comment":"Tech Report"},{"id":"http://arxiv.org/abs/2403.18074v1","updated":"2024-03-26T19:54:21Z","published":"2024-03-26T19:54:21Z","title":"Every Shot Counts: Using Exemplars for Repetition Counting in Videos","summary":" Video repetition counting infers the number of repetitions of recurring\nactions or motion within a video. We propose an exemplar-based approach that\ndiscovers visual correspondence of video exemplars across repetitions within\ntarget videos. Our proposed Every Shot Counts (ESCounts) model is an\nattention-based encoder-decoder that encodes videos of varying lengths\nalongside exemplars from the same and different videos. In training, ESCounts\nregresses locations of high correspondence to the exemplars within the video.\nIn tandem, our method learns a latent that encodes representations of general\nrepetitive motions, which we use for exemplar-free, zero-shot inference.\nExtensive experiments over commonly used datasets (RepCount, Countix, and\nUCFRep) showcase ESCounts obtaining state-of-the-art performance across all\nthree datasets. On RepCount, ESCounts increases the off-by-one from 0.39 to\n0.56 and decreases the mean absolute error from 0.38 to 0.21. Detailed\nablations further demonstrate the effectiveness of our method.\n","authors":["Saptarshi Sinha","Alexandros Stergiou","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2403.18074v1.pdf","comment":"Project website: https://sinhasaptarshi.github.io/escounts"},{"id":"http://arxiv.org/abs/2403.18067v1","updated":"2024-03-26T19:36:50Z","published":"2024-03-26T19:36:50Z","title":"State of the art applications of deep learning within tracking and\n detecting marine debris: A survey","summary":" Deep learning techniques have been explored within the marine litter problem\nfor approximately 20 years but the majority of the research has developed\nrapidly in the last five years. We provide an in-depth, up to date, summary and\nanalysis of 28 of the most recent and significant contributions of deep\nlearning in marine debris. 
From cross referencing the research paper results,\nthe YOLO family significantly outperforms all other methods of object detection\nbut there are many respected contributions to this field that have\ncategorically agreed that a comprehensive database of underwater debris is not\ncurrently available for machine learning. Using a small dataset curated and\nlabelled by us, we tested YOLOv5 on a binary classification task and found the\naccuracy was low and the rate of false positives was high; highlighting the\nimportance of a comprehensive database. We conclude this survey with over 40\nfuture research recommendations and open challenges.\n","authors":["Zoe Moorton","Dr. Zeyneb Kurt","Dr. Wai Lok Woo"],"pdf_url":"https://arxiv.org/pdf/2403.18067v1.pdf","comment":"Review paper, 60 pages including references, 1 figure, 3 tables, 1\n supplementary data"},{"id":"http://arxiv.org/abs/2403.18063v1","updated":"2024-03-26T19:29:21Z","published":"2024-03-26T19:29:21Z","title":"Spectral Convolutional Transformer: Harmonizing Real vs. Complex\n Multi-View Spectral Operators for Vision Transformer","summary":" Transformers used in vision have been investigated through diverse\narchitectures - ViT, PVT, and Swin. These have worked to improve the attention\nmechanism and make it more efficient. Differently, the need for including local\ninformation was felt, leading to incorporating convolutions in transformers\nsuch as CPVT and CvT. Global information is captured using a complex Fourier\nbasis to achieve global token mixing through various methods, such as AFNO,\nGFNet, and Spectformer. We advocate combining three diverse views of data -\nlocal, global, and long-range dependence. We also investigate the simplest\nglobal representation using only the real domain spectral representation -\nobtained through the Hartley transform. We use a convolutional operator in the\ninitial layers to capture local information. Through these two contributions,\nwe are able to optimize and obtain a spectral convolution transformer (SCT)\nthat provides improved performance over the state-of-the-art methods while\nreducing the number of parameters. Through extensive experiments, we show that\nSCT-C-small gives state-of-the-art performance on the ImageNet dataset and\nreaches 84.5\\% top-1 accuracy, while SCT-C-Large reaches 85.9\\% and SCT-C-Huge\nreaches 86.4\\%. We evaluate SCT on transfer learning on datasets such as\nCIFAR-10, CIFAR-100, Oxford Flower, and Stanford Car. We also evaluate SCT on\ndownstream tasks i.e. instance segmentation on the MSCOCO dataset. The project\npage is available on this webpage.\\url{https://github.com/badripatro/sct}\n","authors":["Badri N. Patro","Vinay P. Namboodiri","Vijay S. Agneeswaran"],"pdf_url":"https://arxiv.org/pdf/2403.18063v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08344v2","updated":"2024-03-26T19:25:53Z","published":"2023-12-13T18:28:09Z","title":"FoundationPose: Unified 6D Pose Estimation and Tracking of Novel Objects","summary":" We present FoundationPose, a unified foundation model for 6D object pose\nestimation and tracking, supporting both model-based and model-free setups. Our\napproach can be instantly applied at test-time to a novel object without\nfine-tuning, as long as its CAD model is given, or a small number of reference\nimages are captured. 
We bridge the gap between these two setups with a neural\nimplicit representation that allows for effective novel view synthesis, keeping\nthe downstream pose estimation modules invariant under the same unified\nframework. Strong generalizability is achieved via large-scale synthetic\ntraining, aided by a large language model (LLM), a novel transformer-based\narchitecture, and contrastive learning formulation. Extensive evaluation on\nmultiple public datasets involving challenging scenarios and objects indicate\nour unified approach outperforms existing methods specialized for each task by\na large margin. In addition, it even achieves comparable results to\ninstance-level methods despite the reduced assumptions. Project page:\nhttps://nvlabs.github.io/FoundationPose/\n","authors":["Bowen Wen","Wei Yang","Jan Kautz","Stan Birchfield"],"pdf_url":"https://arxiv.org/pdf/2312.08344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18040v1","updated":"2024-03-26T18:52:48Z","published":"2024-03-26T18:52:48Z","title":"Global Point Cloud Registration Network for Large Transformations","summary":" Three-dimensional data registration is an established yet challenging problem\nthat is key in many different applications, such as mapping the environment for\nautonomous vehicles, and modeling objects and people for avatar creation, among\nmany others. Registration refers to the process of mapping multiple data into\nthe same coordinate system by means of matching correspondences and\ntransformation estimation. Novel proposals exploit the benefits of deep\nlearning architectures for this purpose, as they learn the best features for\nthe data, providing better matches and hence results. However, the state of the\nart is usually focused on cases of relatively small transformations, although\nin certain applications and in a real and practical environment, large\ntransformations are very common. In this paper, we present ReLaTo (Registration\nfor Large Transformations), an architecture that faces the cases where large\ntransformations happen while maintaining good performance for local\ntransformations. This proposal uses a novel Softmax pooling layer to find\ncorrespondences in a bilateral consensus manner between two point sets,\nsampling the most confident matches. These matches are used to estimate a\ncoarse and global registration using weighted Singular Value Decomposition\n(SVD). A target-guided denoising step is then applied to both the obtained\nmatches and latent features, estimating the final fine registration considering\nthe local geometry. All these steps are carried out following an end-to-end\napproach, which has been shown to improve 10 state-of-the-art registration\nmethods in two datasets commonly used for this task (ModelNet40 and KITTI),\nespecially in the case of large transformations.\n","authors":["Hanz Cuevas-Velasquez","Alejandro Galán-Cuenca","Antonio Javier Gallego","Marcelo Saval-Calvo","Robert B. Fisher"],"pdf_url":"https://arxiv.org/pdf/2403.18040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18038v1","updated":"2024-03-26T18:49:56Z","published":"2024-03-26T18:49:56Z","title":"TGGLinesPlus: A robust topological graph-guided computer vision\n algorithm for line detection from images","summary":" Line detection is a classic and essential problem in image processing,\ncomputer vision and machine intelligence. 
Line detection has many important\napplications, including image vectorization (e.g., document recognition and art\ndesign), indoor mapping, and important societal challenges (e.g., sea ice\nfracture line extraction from satellite imagery). Many line detection\nalgorithms and methods have been developed, but robust and intuitive methods\nare still lacking. In this paper, we proposed and implemented a topological\ngraph-guided algorithm, named TGGLinesPlus, for line detection. Our experiments\non images from a wide range of domains have demonstrated the flexibility of our\nTGGLinesPlus algorithm. We also benchmarked our algorithm with five classic and\nstate-of-the-art line detection methods and the results demonstrate the\nrobustness of TGGLinesPlus. We hope our open-source implementation of\nTGGLinesPlus will inspire and pave the way for many applications where spatial\nscience matters.\n","authors":["Liping Yang","Joshua Driscol","Ming Gong","Shujie Wang","Catherine G. Potts"],"pdf_url":"https://arxiv.org/pdf/2403.18038v1.pdf","comment":"Our TGGLinesPlus Python implementation is open source. 27 pages, 8\n figures and 4 tables"},{"id":"http://arxiv.org/abs/2403.18036v1","updated":"2024-03-26T18:41:07Z","published":"2024-03-26T18:41:07Z","title":"Move as You Say, Interact as You Can: Language-guided Human Motion\n Generation with Scene Affordance","summary":" Despite significant advancements in text-to-motion synthesis, generating\nlanguage-guided human motion within 3D environments poses substantial\nchallenges. These challenges stem primarily from (i) the absence of powerful\ngenerative models capable of jointly modeling natural language, 3D scenes, and\nhuman motion, and (ii) the generative models' intensive data requirements\ncontrasted with the scarcity of comprehensive, high-quality,\nlanguage-scene-motion datasets. To tackle these issues, we introduce a novel\ntwo-stage framework that employs scene affordance as an intermediate\nrepresentation, effectively linking 3D scene grounding and conditional motion\ngeneration. Our framework comprises an Affordance Diffusion Model (ADM) for\npredicting explicit affordance map and an Affordance-to-Motion Diffusion Model\n(AMDM) for generating plausible human motions. By leveraging scene affordance\nmaps, our method overcomes the difficulty in generating human motion under\nmultimodal condition signals, especially when training with limited data\nlacking extensive language-scene-motion pairs. Our extensive experiments\ndemonstrate that our approach consistently outperforms all baselines on\nestablished benchmarks, including HumanML3D and HUMANISE. 
Additionally, we\nvalidate our model's exceptional generalization capabilities on a specially\ncurated evaluation set featuring previously unseen descriptions and scenes.\n","authors":["Zan Wang","Yixin Chen","Baoxiong Jia","Puhao Li","Jinlu Zhang","Jingze Zhang","Tengyu Liu","Yixin Zhu","Wei Liang","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2403.18036v1.pdf","comment":"CVPR 2024; 16 pages"},{"id":"http://arxiv.org/abs/2403.18035v1","updated":"2024-03-26T18:40:36Z","published":"2024-03-26T18:40:36Z","title":"Bidirectional Consistency Models","summary":" Diffusion models (DMs) are capable of generating remarkably high-quality\nsamples by iteratively denoising a random vector, a process that corresponds to\nmoving along the probability flow ordinary differential equation (PF ODE).\nInterestingly, DMs can also invert an input image to noise by moving backward\nalong the PF ODE, a key operation for downstream tasks such as interpolation\nand image editing. However, the iterative nature of this process restricts its\nspeed, hindering its broader application. Recently, Consistency Models (CMs)\nhave emerged to address this challenge by approximating the integral of the PF\nODE, thereby bypassing the need to iterate. Yet, the absence of an explicit ODE\nsolver complicates the inversion process. To resolve this, we introduce the\nBidirectional Consistency Model (BCM), which learns a single neural network\nthat enables both forward and backward traversal along the PF ODE, efficiently\nunifying generation and inversion tasks within one framework. Notably, our\nproposed method enables one-step generation and inversion while also allowing\nthe use of additional steps to enhance generation quality or reduce\nreconstruction error. Furthermore, by leveraging our model's bidirectional\nconsistency, we introduce a sampling strategy that can enhance FID while\npreserving the generated image content. We further showcase our model's\ncapabilities in several downstream tasks, such as interpolation and inpainting,\nand present demonstrations of potential applications, including blind\nrestoration of compressed images and defending black-box adversarial attacks.\n","authors":["Liangchen Li","Jiajun He"],"pdf_url":"https://arxiv.org/pdf/2403.18035v1.pdf","comment":"40 pages, 25 figures"},{"id":"http://arxiv.org/abs/2403.18033v1","updated":"2024-03-26T18:39:38Z","published":"2024-03-26T18:39:38Z","title":"SpectralWaste Dataset: Multimodal Data for Waste Sorting Automation","summary":" The increase in non-biodegradable waste is a worldwide concern. Recycling\nfacilities play a crucial role, but their automation is hindered by the complex\ncharacteristics of waste recycling lines like clutter or object deformation. In\naddition, the lack of publicly available labeled data for these environments\nmakes developing robust perception systems challenging. Our work explores the\nbenefits of multimodal perception for object segmentation in real waste\nmanagement scenarios. First, we present SpectralWaste, the first dataset\ncollected from an operational plastic waste sorting facility that provides\nsynchronized hyperspectral and conventional RGB images. This dataset contains\nlabels for several categories of objects that commonly appear in sorting plants\nand need to be detected and separated from the main trash flow for several\nreasons, such as security in the management line or reuse. 
Additionally, we\npropose a pipeline employing different object segmentation architectures and\nevaluate the alternatives on our dataset, conducting an extensive analysis for\nboth multimodal and unimodal alternatives. Our evaluation pays special\nattention to efficiency and suitability for real-time processing and\ndemonstrates how HSI can bring a boost to RGB-only perception in these\nrealistic industrial settings without much computational overhead.\n","authors":["Sara Casao","Fernando Peña","Alberto Sabater","Rosa Castillón","Darío Suárez","Eduardo Montijano","Ana C. Murillo"],"pdf_url":"https://arxiv.org/pdf/2403.18033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18028v1","updated":"2024-03-26T18:29:39Z","published":"2024-03-26T18:29:39Z","title":"Predicting species occurrence patterns from partial observations","summary":" To address the interlinked biodiversity and climate crises, we need an\nunderstanding of where species occur and how these patterns are changing.\nHowever, observational data on most species remains very limited, and the\namount of data available varies greatly between taxonomic groups. We introduce\nthe problem of predicting species occurrence patterns given (a) satellite\nimagery, and (b) known information on the occurrence of other species. To\nevaluate algorithms on this task, we introduce SatButterfly, a dataset of\nsatellite images, environmental data and observational data for butterflies,\nwhich is designed to pair with the existing SatBird dataset of bird\nobservational data. To address this task, we propose a general model, R-Tran,\nfor predicting species occurrence patterns that enables the use of partial\nobservational data wherever found. We find that R-Tran outperforms other\nmethods in predicting species encounter rates with partial information both\nwithin a taxon (birds) and across taxa (birds and butterflies). Our approach\nopens new perspectives to leveraging insights from species with abundant data\nto other species with scarce data, by modelling the ecosystems in which they\nco-occur.\n","authors":["Hager Radi Abdelwahed","Mélisande Teng","David Rolnick"],"pdf_url":"https://arxiv.org/pdf/2403.18028v1.pdf","comment":"Tackling Climate Change with Machine Learning workshop at ICLR 2024"},{"id":"http://arxiv.org/abs/2312.09138v2","updated":"2024-03-26T18:16:26Z","published":"2023-12-14T17:09:57Z","title":"Living Scenes: Multi-object Relocalization and Reconstruction in\n Changing 3D Environments","summary":" Research into dynamic 3D scene understanding has primarily focused on\nshort-term change tracking from dense observations, while little attention has\nbeen paid to long-term changes with sparse observations. We address this gap\nwith MoRE, a novel approach for multi-object relocalization and reconstruction\nin evolving environments. We view these environments as \"living scenes\" and\nconsider the problem of transforming scans taken at different points in time\ninto a 3D reconstruction of the object instances, whose accuracy and\ncompleteness increase over time. At the core of our method lies an\nSE(3)-equivariant representation in a single encoder-decoder network, trained\non synthetic data. This representation enables us to seamlessly tackle instance\nmatching, registration, and reconstruction. We also introduce a joint\noptimization algorithm that facilitates the accumulation of point clouds\noriginating from the same instance across multiple scans taken at different\npoints in time. 
We validate our method on synthetic and real-world data and\ndemonstrate state-of-the-art performance in both end-to-end performance and\nindividual subtasks.\n","authors":["Liyuan Zhu","Shengyu Huang","Konrad Schindler","Iro Armeni"],"pdf_url":"https://arxiv.org/pdf/2312.09138v2.pdf","comment":"CVPR 2024 camera-ready"},{"id":"http://arxiv.org/abs/2403.16271v2","updated":"2024-03-26T18:11:28Z","published":"2024-03-24T19:32:39Z","title":"Object Detectors in the Open Environment: Challenges, Solutions, and\n Outlook","summary":" With the emergence of foundation models, deep learning-based object detectors\nhave shown practical usability in closed set scenarios. However, for real-world\ntasks, object detectors often operate in open environments, where crucial\nfactors (e.g., data distribution, objective) that influence model learning are\noften changing. The dynamic and intricate nature of the open environment poses\nnovel and formidable challenges to object detectors. Unfortunately, current\nresearch on object detectors in open environments lacks a comprehensive\nanalysis of their distinctive characteristics, challenges, and corresponding\nsolutions, which hinders their secure deployment in critical real-world\nscenarios. This paper aims to bridge this gap by conducting a comprehensive\nreview and analysis of object detectors in open environments. We initially\nidentified limitations of key structural components within the existing\ndetection pipeline and propose the open environment object detector challenge\nframework that includes four quadrants (i.e., out-of-domain, out-of-category,\nrobust learning, and incremental learning) based on the dimensions of the data\n/ target changes. For each quadrant of challenges in the proposed framework, we\npresent a detailed description and systematic analysis of the overarching goals\nand core difficulties, systematically review the corresponding solutions, and\nbenchmark their performance over multiple widely adopted datasets. In addition,\nwe engage in a discussion of open problems and potential avenues for future\nresearch. This paper aims to provide a fresh, comprehensive, and systematic\nunderstanding of the challenges and solutions associated with open-environment\nobject detectors, thus catalyzing the development of more solid applications in\nreal-world scenarios. A project related to this survey can be found at\nhttps://github.com/LiangSiyuan21/OEOD_Survey.\n","authors":["Siyuan Liang","Wei Wang","Ruoyu Chen","Aishan Liu","Boxi Wu","Ee-Chien Chang","Xiaochun Cao","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.16271v2.pdf","comment":"32 pages, 17 figures"},{"id":"http://arxiv.org/abs/2312.07472v4","updated":"2024-03-26T18:08:05Z","published":"2023-12-12T17:55:45Z","title":"MP5: A Multi-modal Open-ended Embodied System in Minecraft via Active\n Perception","summary":" It is a long-lasting goal to design an embodied system that can solve\nlong-horizon open-world tasks in human-like ways. However, existing approaches\nusually struggle with compound difficulties caused by the logic-aware\ndecomposition and context-aware execution of these tasks. To this end, we\nintroduce MP5, an open-ended multimodal embodied system built upon the\nchallenging Minecraft simulator, which can decompose feasible sub-objectives,\ndesign sophisticated situation-aware plans, and perform embodied action\ncontrol, with frequent communication with a goal-conditioned active perception\nscheme. 
Specifically, MP5 is developed on top of recent advances in Multimodal\nLarge Language Models (MLLMs), and the system is modulated into functional\nmodules that can be scheduled and collaborated to ultimately solve pre-defined\ncontext- and process-dependent tasks. Extensive experiments prove that MP5 can\nachieve a 22% success rate on difficult process-dependent tasks and a 91%\nsuccess rate on tasks that heavily depend on the context. Moreover, MP5\nexhibits a remarkable ability to address many open-ended tasks that are\nentirely novel.\n","authors":["Yiran Qin","Enshen Zhou","Qichang Liu","Zhenfei Yin","Lu Sheng","Ruimao Zhang","Yu Qiao","Jing Shao"],"pdf_url":"https://arxiv.org/pdf/2312.07472v4.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2306.16772v5","updated":"2024-03-26T18:04:33Z","published":"2023-06-29T08:13:57Z","title":"Learning from Synthetic Human Group Activities","summary":" The study of complex human interactions and group activities has become a\nfocal point in human-centric computer vision. However, progress in related\ntasks is often hindered by the challenges of obtaining large-scale labeled\ndatasets from real-world scenarios. To address the limitation, we introduce\nM3Act, a synthetic data generator for multi-view multi-group multi-person human\natomic actions and group activities. Powered by Unity Engine, M3Act features\nmultiple semantic groups, highly diverse and photorealistic images, and a\ncomprehensive set of annotations, which facilitates the learning of\nhuman-centered tasks across single-person, multi-person, and multi-group\nconditions. We demonstrate the advantages of M3Act across three core\nexperiments. The results suggest our synthetic dataset can significantly\nimprove the performance of several downstream methods and replace real-world\ndatasets to reduce cost. Notably, M3Act improves the state-of-the-art MOTRv2 on\nDanceTrack dataset, leading to a hop on the leaderboard from 10th to 2nd place.\nMoreover, M3Act opens new research for controllable 3D group activity\ngeneration. We define multiple metrics and propose a competitive baseline for\nthe novel task. Our code and data are available at our project page:\nhttp://cjerry1243.github.io/M3Act.\n","authors":["Che-Jui Chang","Danrui Li","Deep Patel","Parth Goel","Honglu Zhou","Seonghyeon Moon","Samuel S. Sohn","Sejong Yoon","Vladimir Pavlovic","Mubbasir Kapadia"],"pdf_url":"https://arxiv.org/pdf/2306.16772v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17998v1","updated":"2024-03-26T17:59:52Z","published":"2024-03-26T17:59:52Z","title":"Text Is MASS: Modeling as Stochastic Embedding for Text-Video Retrieval","summary":" The increasing prevalence of video clips has sparked growing interest in\ntext-video retrieval. Recent advances focus on establishing a joint embedding\nspace for text and video, relying on consistent embedding representations to\ncompute similarity. However, the text content in existing datasets is generally\nshort and concise, making it hard to fully describe the redundant semantics of\na video. Correspondingly, a single text embedding may be less expressive to\ncapture the video embedding and empower the retrieval. In this study, we\npropose a new stochastic text modeling method T-MASS, i.e., text is modeled as\na stochastic embedding, to enrich text embedding with a flexible and resilient\nsemantic range, yielding a text mass. 
To be specific, we introduce a\nsimilarity-aware radius module to adapt the scale of the text mass upon the\ngiven text-video pairs. Plus, we design and develop a support text\nregularization to further control the text mass during the training. The\ninference pipeline is also tailored to fully exploit the text mass for accurate\nretrieval. Empirical evidence suggests that T-MASS not only effectively\nattracts relevant text-video pairs while distancing irrelevant ones, but also\nenables the determination of precise text embeddings for relevant pairs. Our\nexperimental results show a substantial improvement of T-MASS over baseline (3%\nto 6.3% by R@1). Also, T-MASS achieves state-of-the-art performance on five\nbenchmark datasets, including MSRVTT, LSMDC, DiDeMo, VATEX, and Charades.\n","authors":["Jiamian Wang","Guohao Sun","Pichao Wang","Dongfang Liu","Sohail Dianat","Majid Rabbani","Raghuveer Rao","Zhiqiang Tao"],"pdf_url":"https://arxiv.org/pdf/2403.17998v1.pdf","comment":"Accepted by CVPR 2024, code and model are available at\n https://github.com/Jiamian-Wang/T-MASS-text-video-retrieval"},{"id":"http://arxiv.org/abs/2403.17837v1","updated":"2024-03-26T16:24:42Z","published":"2024-03-26T16:24:42Z","title":"GTA-HDR: A Large-Scale Synthetic Dataset for HDR Image Reconstruction","summary":" High Dynamic Range (HDR) content (i.e., images and videos) has a broad range\nof applications. However, capturing HDR content from real-world scenes is\nexpensive and time-consuming. Therefore, the challenging task of reconstructing\nvisually accurate HDR images from their Low Dynamic Range (LDR) counterparts is\ngaining attention in the vision research community. A major challenge in this\nresearch problem is the lack of datasets, which capture diverse scene\nconditions (e.g., lighting, shadows, weather, locations, landscapes, objects,\nhumans, buildings) and various image features (e.g., color, contrast,\nsaturation, hue, luminance, brightness, radiance). To address this gap, in this\npaper, we introduce GTA-HDR, a large-scale synthetic dataset of photo-realistic\nHDR images sampled from the GTA-V video game. We perform thorough evaluation of\nthe proposed dataset, which demonstrates significant qualitative and\nquantitative improvements of the state-of-the-art HDR image reconstruction\nmethods. Furthermore, we demonstrate the effectiveness of the proposed dataset\nand its impact on additional computer vision tasks including 3D human pose\nestimation, human body part segmentation, and holistic scene segmentation. The\ndataset, data collection pipeline, and evaluation code are available at:\nhttps://github.com/HrishavBakulBarua/GTA-HDR.\n","authors":["Hrishav Bakul Barua","Kalin Stefanov","KokSheik Wong","Abhinav Dhall","Ganesh Krishnasamy"],"pdf_url":"https://arxiv.org/pdf/2403.17837v1.pdf","comment":"Submitted to IEEE"},{"id":"http://arxiv.org/abs/2403.17757v1","updated":"2024-03-26T14:49:22Z","published":"2024-03-26T14:49:22Z","title":"Noise2Noise Denoising of CRISM Hyperspectral Data","summary":" Hyperspectral data acquired by the Compact Reconnaissance Imaging\nSpectrometer for Mars (CRISM) have allowed for unparalleled mapping of the\nsurface mineralogy of Mars. Due to sensor degradation over time, a significant\nportion of the recently acquired data is considered unusable. Here a new\ndata-driven model architecture, Noise2Noise4Mars (N2N4M), is introduced to\nremove noise from CRISM images. 
Our model is self-supervised and does not\nrequire zero-noise target data, making it well suited for use in Planetary\nScience applications where high quality labelled data is scarce. We demonstrate\nits strong performance on synthetic-noise data and CRISM images, and its impact\non downstream classification performance, outperforming benchmark methods on\nmost metrics. This allows for detailed analysis for critical sites of interest\non the Martian surface, including proposed lander sites.\n","authors":["Robert Platt","Rossella Arcucci","Cédric M. John"],"pdf_url":"https://arxiv.org/pdf/2403.17757v1.pdf","comment":"5 pages, 3 figures. Accepted as a conference paper at the ICLR 2024\n ML4RS Workshop"},{"id":"http://arxiv.org/abs/2403.17995v1","updated":"2024-03-26T14:47:05Z","published":"2024-03-26T14:47:05Z","title":"Semi-Supervised Image Captioning Considering Wasserstein Graph Matching","summary":" Image captioning can automatically generate captions for the given images,\nand the key challenge is to learn a mapping function from visual features to\nnatural language features. Existing approaches are mostly supervised ones,\ni.e., each image has a corresponding sentence in the training set. However,\nconsidering that describing images always requires a huge of manpower, we\nusually have limited amount of described images (i.e., image-text pairs) and a\nlarge number of undescribed images in real-world applications. Thereby, a\ndilemma is the \"Semi-Supervised Image Captioning\". To solve this problem, we\npropose a novel Semi-Supervised Image Captioning method considering Wasserstein\nGraph Matching (SSIC-WGM), which turns to adopt the raw image inputs to\nsupervise the generated sentences. Different from traditional single modal\nsemi-supervised methods, the difficulty of semi-supervised cross-modal learning\nlies in constructing intermediately comparable information among heterogeneous\nmodalities. In this paper, SSIC-WGM adopts the successful scene graphs as\nintermediate information, and constrains the generated sentences from two\naspects: 1) inter-modal consistency. SSIC-WGM constructs the scene graphs of\nthe raw image and generated sentence respectively, then employs the wasserstein\ndistance to better measure the similarity between region embeddings of\ndifferent graphs. 2) intra-modal consistency. SSIC-WGM takes the data\naugmentation techniques for the raw images, then constrains the consistency\namong augmented images and generated sentences. Consequently, SSIC-WGM combines\nthe cross-modal pseudo supervision and structure invariant measure for\nefficiently using the undescribed images, and learns more reasonable mapping\nfunction.\n","authors":["Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2403.17995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17725v1","updated":"2024-03-26T14:13:44Z","published":"2024-03-26T14:13:44Z","title":"Deep Learning for Segmentation of Cracks in High-Resolution Images of\n Steel Bridges","summary":" Automating the current bridge visual inspection practices using drones and\nimage processing techniques is a prominent way to make these inspections more\neffective, robust, and less expensive. In this paper, we investigate the\ndevelopment of a novel deep-learning method for the detection of fatigue cracks\nin high-resolution images of steel bridges. First, we present a novel and\nchallenging dataset comprising of images of cracks in steel bridges. 
Secondly,\nwe integrate the ConvNext neural network with a previous state-of-the-art\nencoder-decoder network for crack segmentation. We study and report, the\neffects of the use of background patches on the network performance when\napplied to high-resolution images of cracks in steel bridges. Finally, we\nintroduce a loss function that allows the use of more background patches for\nthe training process, which yields a significant reduction in false positive\nrates.\n","authors":["Andrii Kompanets","Gautam Pai","Remco Duits","Davide Leonetti","Bert Snijder"],"pdf_url":"https://arxiv.org/pdf/2403.17725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17994v1","updated":"2024-03-26T13:50:39Z","published":"2024-03-26T13:50:39Z","title":"Solution for Point Tracking Task of ICCV 1st Perception Test Challenge\n 2023","summary":" This report proposes an improved method for the Tracking Any Point (TAP)\ntask, which tracks any physical surface through a video. Several existing\napproaches have explored the TAP by considering the temporal relationships to\nobtain smooth point motion trajectories, however, they still suffer from the\ncumulative error caused by temporal prediction. To address this issue, we\npropose a simple yet effective approach called TAP with confident static points\n(TAPIR+), which focuses on rectifying the tracking of the static point in the\nvideos shot by a static camera. To clarify, our approach contains two key\ncomponents: (1) Multi-granularity Camera Motion Detection, which could identify\nthe video sequence by the static camera shot. (2) CMR-based point trajectory\nprediction with one moving object segmentation approach to isolate the static\npoint from the moving object. Our approach ranked first in the final test with\na score of 0.46.\n","authors":["Hongpeng Pan","Yang Yang","Zhongtian Fu","Yuxuan Zhang","Shian Du","Yi Xu","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2403.17994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17332v1","updated":"2024-03-26T02:32:52Z","published":"2024-03-26T02:32:52Z","title":"Labeling subtypes in a Parkinson's Cohort using Multifeatures in MRI --\n Integrating Grey and White Matter Information","summary":" Thresholding of networks has long posed a challenge in brain connectivity\nanalysis. Weighted networks are typically binarized using threshold measures to\nfacilitate network analysis. Previous studies on MRI-based brain networks have\npredominantly utilized density or sparsity-based thresholding techniques,\noptimized within specific ranges derived from network metrics such as path\nlength, clustering coefficient, and small-world index. Thus, determination of a\nsingle threshold value for facilitating comparative analysis of networks\nremains elusive. To address this, our study introduces Mutual K-Nearest\nNeighbor (MKNN)-based thresholding for brain network analysis. Here, nearest\nneighbor selection is based on the highest correlation between features of\nbrain regions. Construction of brain networks was accomplished by computing\nPearson correlations between grey matter volume and white matter volume for\neach pair of brain regions. Structural MRI data from 180 Parkinsons patients\nand 70 controls from the NIMHANS, India were analyzed. Subtypes within\nParkinsons disease were identified based on grey and white matter volume\natrophy using source-based morphometric decomposition. The loading coefficients\nwere correlated with clinical features to discern clinical relationship with\nthe deciphered subtypes. 
Our data-mining approach revealed: Subtype A (N = 51,\nintermediate type), Subtype B (N = 57, mild-severe type with mild motor\nsymptoms), and Subtype AB (N = 36, most-severe type with predominance in motor\nimpairment). Subtype-specific weighted matrices were binarized using MKNN-based\nthresholding for brain network analysis. Permutation tests on network metrics\nof resulting bipartite graphs demonstrated significant group differences in\nbetweenness centrality and participation coefficient. The identified hubs were\nspecific to each subtype, with some hubs conserved across different subtypes.\n","authors":["Tanmayee Samantaray","Jitender Saini","Pramod Kumar Pal","Bithiah Grace Jaganathan","Vijaya V Saradhi","Gupta CN"],"pdf_url":"https://arxiv.org/pdf/2403.17332v1.pdf","comment":"31 pages, 10 figures, 3 tables"},{"id":"http://arxiv.org/abs/2403.18873v1","updated":"2024-03-26T14:42:46Z","published":"2024-03-26T14:42:46Z","title":"Predicting risk of cardiovascular disease using retinal OCT imaging","summary":" We investigated the potential of optical coherence tomography (OCT) as an\nadditional imaging technique to predict future cardiovascular disease (CVD). We\nutilised a self-supervised deep learning approach based on Variational\nAutoencoders (VAE) to learn low-dimensional representations of high-dimensional\n3D OCT images and to capture distinct characteristics of different retinal\nlayers within the OCT image. A Random Forest (RF) classifier was subsequently\ntrained using the learned latent features and participant demographic and\nclinical data, to differentiate between patients at risk of CVD events (MI or\nstroke) and non-CVD cases. Our predictive model, trained on multimodal data,\nwas assessed based on its ability to correctly identify individuals likely to\nsuffer from a CVD event(MI or stroke), within a 5-year interval after image\nacquisition. Our self-supervised VAE feature selection and multimodal Random\nForest classifier differentiate between patients at risk of future CVD events\nand the control group with an AUC of 0.75, outperforming the clinically\nestablished QRISK3 score (AUC= 0.597). The choroidal layer visible in OCT\nimages was identified as an important predictor of future CVD events using a\nnovel approach to model explanability. Retinal OCT imaging provides a\ncost-effective and non-invasive alternative to predict the risk of\ncardiovascular disease and is readily accessible in optometry practices and\nhospitals.\n","authors":["Cynthia Maldonado-Garcia","Rodrigo Bonazzola","Enzo Ferrante","Thomas H Julian","Panagiotis I Sergouniotis","Nishant Ravikumara","Alejandro F Frangi"],"pdf_url":"https://arxiv.org/pdf/2403.18873v1.pdf","comment":"18 pages for main manuscript, 7 figures, 2 pages for appendix and\n preprint for a journal"},{"id":"http://arxiv.org/abs/2403.18871v1","updated":"2024-03-26T11:40:06Z","published":"2024-03-26T11:40:06Z","title":"Clinical Domain Knowledge-Derived Template Improves Post Hoc AI\n Explanations in Pneumothorax Classification","summary":" Background: Pneumothorax is an acute thoracic disease caused by abnormal air\ncollection between the lungs and chest wall. To address the opaqueness often\nassociated with deep learning (DL) models, explainable artificial intelligence\n(XAI) methods have been introduced to outline regions related to pneumothorax\ndiagnoses made by DL models. However, these explanations sometimes diverge from\nactual lesion areas, highlighting the need for further improvement. 
Method: We\npropose a template-guided approach to incorporate the clinical knowledge of\npneumothorax into model explanations generated by XAI methods, thereby\nenhancing the quality of these explanations. Utilizing one lesion delineation\ncreated by radiologists, our approach first generates a template that\nrepresents potential areas of pneumothorax occurrence. This template is then\nsuperimposed on model explanations to filter out extraneous explanations that\nfall outside the template's boundaries. To validate its efficacy, we carried\nout a comparative analysis of three XAI methods with and without our template\nguidance when explaining two DL models in two real-world datasets. Results: The\nproposed approach consistently improved baseline XAI methods across twelve\nbenchmark scenarios built on three XAI methods, two DL models, and two\ndatasets. The average incremental percentages, calculated by the performance\nimprovements over the baseline performance, were 97.8% in Intersection over\nUnion (IoU) and 94.1% in Dice Similarity Coefficient (DSC) when comparing model\nexplanations and ground-truth lesion areas. Conclusions: In the context of\npneumothorax diagnoses, we proposed a template-guided approach for improving AI\nexplanations. We anticipate that our template guidance will forge a fresh\napproach to elucidating AI models by integrating clinical domain expertise.\n","authors":["Han Yuan","Chuan Hong","Pengtao Jiang","Gangming Zhao","Nguyen Tuan Anh Tran","Xinxing Xu","Yet Yen Yan","Nan Liu"],"pdf_url":"https://arxiv.org/pdf/2403.18871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18870v1","updated":"2024-03-26T11:23:08Z","published":"2024-03-26T11:23:08Z","title":"SugarcaneNet2024: An Optimized Weighted Average Ensemble Approach of\n LASSO Regularized Pre-trained Models for Sugarcane Disease Classification","summary":" Sugarcane, a key crop for the world's sugar industry, is prone to several\ndiseases that have a substantial negative influence on both its yield and\nquality. To effectively manage and implement preventative initiatives, diseases\nmust be detected promptly and accurately. In this study, we present a unique\nmodel called sugarcaneNet2024 that outperforms previous methods for\nautomatically and quickly detecting sugarcane disease through leaf image\nprocessing. Our proposed model consolidates an optimized weighted average\nensemble of seven customized and LASSO-regularized pre-trained models,\nparticularly InceptionV3, InceptionResNetV2, DenseNet201, DenseNet169,\nXception, and ResNet152V2. Initially, we added three more dense layers with\n0.0001 LASSO regularization, three 30% dropout layers, and three batch\nnormalizations with renorm enabled at the bottom of these pre-trained models to\nimprove the performance. The accuracy of sugarcane leaf disease classification\nwas greatly increased by this addition. Following this, several comparative\nstudies between the average ensemble and individual models were carried out,\nindicating that the ensemble technique performed better. The average ensemble\nof all modified pre-trained models produced outstanding outcomes: 100%, 99%,\n99%, and 99.45% for f1 score, precision, recall, and accuracy, respectively.\nPerformance was further enhanced by the implementation of an optimized weighted\naverage ensemble technique incorporated with grid search. 
This optimized\nsugarcaneNet2024 model performed the best for detecting sugarcane diseases,\nhaving achieved accuracy, precision, recall, and F1 score of 99.67%, 100%,\n100%, and 100% , respectively.\n","authors":["Md. Simul Hasan Talukder","Sharmin Akter","Abdullah Hafez Nur"],"pdf_url":"https://arxiv.org/pdf/2403.18870v1.pdf","comment":"32 pages, 11 Figures, 13 Tables"}]},"2024-03-27T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.18821v1","updated":"2024-03-27T17:59:56Z","published":"2024-03-27T17:59:56Z","title":"Real Acoustic Fields: An Audio-Visual Room Acoustics Dataset and\n Benchmark","summary":" We present a new dataset called Real Acoustic Fields (RAF) that captures real\nacoustic room data from multiple modalities. The dataset includes high-quality\nand densely captured room impulse response data paired with multi-view images,\nand precise 6DoF pose tracking data for sound emitters and listeners in the\nrooms. We used this dataset to evaluate existing methods for novel-view\nacoustic synthesis and impulse response generation which previously relied on\nsynthetic data. In our evaluation, we thoroughly assessed existing audio and\naudio-visual models against multiple criteria and proposed settings to enhance\ntheir performance on real-world data. We also conducted experiments to\ninvestigate the impact of incorporating visual data (i.e., images and depth)\ninto neural acoustic field models. Additionally, we demonstrated the\neffectiveness of a simple sim2real approach, where a model is pre-trained with\nsimulated data and fine-tuned with sparse real-world data, resulting in\nsignificant improvements in the few-shot learning approach. RAF is the first\ndataset to provide densely captured room acoustic data, making it an ideal\nresource for researchers working on audio and audio-visual neural acoustic\nfield modeling techniques. Demos and datasets are available on our project\npage: https://facebookresearch.github.io/real-acoustic-fields/\n","authors":["Ziyang Chen","Israel D. Gebru","Christian Richardt","Anurag Kumar","William Laney","Andrew Owens","Alexander Richard"],"pdf_url":"https://arxiv.org/pdf/2403.18821v1.pdf","comment":"Accepted to CVPR 2024. Project site:\n https://facebookresearch.github.io/real-acoustic-fields/"},{"id":"http://arxiv.org/abs/2403.18820v1","updated":"2024-03-27T17:59:54Z","published":"2024-03-27T17:59:54Z","title":"MetaCap: Meta-learning Priors from Multi-View Imagery for Sparse-view\n Human Performance Capture and Rendering","summary":" Faithful human performance capture and free-view rendering from sparse RGB\nobservations is a long-standing problem in Vision and Graphics. The main\nchallenges are the lack of observations and the inherent ambiguities of the\nsetting, e.g. occlusions and depth ambiguity. As a result, radiance fields,\nwhich have shown great promise in capturing high-frequency appearance and\ngeometry details in dense setups, perform poorly when na\\\"ively supervising\nthem on sparse camera views, as the field simply overfits to the sparse-view\ninputs. To address this, we propose MetaCap, a method for efficient and\nhigh-quality geometry recovery and novel view synthesis given very sparse or\neven a single view of the human. Our key idea is to meta-learn the radiance\nfield weights solely from potentially sparse multi-view videos, which can serve\nas a prior when fine-tuning them on sparse imagery depicting the human. 
This\nprior provides a good network weight initialization, thereby effectively\naddressing ambiguities in sparse-view capture. Due to the articulated structure\nof the human body and motion-induced surface deformations, learning such a\nprior is non-trivial. Therefore, we propose to meta-learn the field weights in\na pose-canonicalized space, which reduces the spatial feature range and makes\nfeature learning more effective. Consequently, one can fine-tune our field\nparameters to quickly generalize to unseen poses, novel illumination conditions\nas well as novel and sparse (even monocular) camera views. For evaluating our\nmethod under different scenarios, we collect a new dataset, WildDynaCap, which\ncontains subjects captured in, both, a dense camera dome and in-the-wild sparse\ncamera rigs, and demonstrate superior results compared to recent\nstate-of-the-art methods on both public and WildDynaCap dataset.\n","authors":["Guoxing Sun","Rishabh Dabral","Pascal Fua","Christian Theobalt","Marc Habermann"],"pdf_url":"https://arxiv.org/pdf/2403.18820v1.pdf","comment":"Project page: https://vcai.mpi-inf.mpg.de/projects/MetaCap/"},{"id":"http://arxiv.org/abs/2403.18819v1","updated":"2024-03-27T17:59:53Z","published":"2024-03-27T17:59:53Z","title":"Benchmarking Object Detectors with COCO: A New Path Forward","summary":" The Common Objects in Context (COCO) dataset has been instrumental in\nbenchmarking object detectors over the past decade. Like every dataset, COCO\ncontains subtle errors and imperfections stemming from its annotation\nprocedure. With the advent of high-performing models, we ask whether these\nerrors of COCO are hindering its utility in reliably benchmarking further\nprogress. In search for an answer, we inspect thousands of masks from COCO\n(2017 version) and uncover different types of errors such as imprecise mask\nboundaries, non-exhaustively annotated instances, and mislabeled masks. Due to\nthe prevalence of COCO, we choose to correct these errors to maintain\ncontinuity with prior research. We develop COCO-ReM (Refined Masks), a cleaner\nset of annotations with visibly better mask quality than COCO-2017. We evaluate\nfifty object detectors and find that models that predict visually sharper masks\nscore higher on COCO-ReM, affirming that they were being incorrectly penalized\ndue to errors in COCO-2017. Moreover, our models trained using COCO-ReM\nconverge faster and score higher than their larger variants trained using\nCOCO-2017, highlighting the importance of data quality in improving object\ndetectors. With these findings, we advocate using COCO-ReM for future object\ndetection research. Our dataset is available at https://cocorem.xyz\n","authors":["Shweta Singh","Aayan Yadav","Jitesh Jain","Humphrey Shi","Justin Johnson","Karan Desai"],"pdf_url":"https://arxiv.org/pdf/2403.18819v1.pdf","comment":"Technical report. Dataset website: https://cocorem.xyz and code:\n https://github.com/kdexd/coco-rem"},{"id":"http://arxiv.org/abs/2403.18818v1","updated":"2024-03-27T17:59:52Z","published":"2024-03-27T17:59:52Z","title":"ObjectDrop: Bootstrapping Counterfactuals for Photorealistic Object\n Removal and Insertion","summary":" Diffusion models have revolutionized image editing but often generate images\nthat violate physical laws, particularly the effects of objects on the scene,\ne.g., occlusions, shadows, and reflections. By analyzing the limitations of\nself-supervised approaches, we propose a practical solution centered on a\n\\q{counterfactual} dataset. 
Our method involves capturing a scene before and\nafter removing a single object, while minimizing other changes. By fine-tuning\na diffusion model on this dataset, we are able to not only remove objects but\nalso their effects on the scene. However, we find that applying this approach\nfor photorealistic object insertion requires an impractically large dataset. To\ntackle this challenge, we propose bootstrap supervision; leveraging our object\nremoval model trained on a small counterfactual dataset, we synthetically\nexpand this dataset considerably. Our approach significantly outperforms prior\nmethods in photorealistic object removal and insertion, particularly at\nmodeling the effects of objects on the scene.\n","authors":["Daniel Winter","Matan Cohen","Shlomi Fruchter","Yael Pritch","Alex Rav-Acha","Yedid Hoshen"],"pdf_url":"https://arxiv.org/pdf/2403.18818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18816v1","updated":"2024-03-27T17:59:33Z","published":"2024-03-27T17:59:33Z","title":"Garment3DGen: 3D Garment Stylization and Texture Generation","summary":" We introduce Garment3DGen a new method to synthesize 3D garment assets from a\nbase mesh given a single input image as guidance. Our proposed approach allows\nusers to generate 3D textured clothes based on both real and synthetic images,\nsuch as those generated by text prompts. The generated assets can be directly\ndraped and simulated on human bodies. First, we leverage the recent progress of\nimage to 3D diffusion methods to generate 3D garment geometries. However, since\nthese geometries cannot be utilized directly for downstream tasks, we propose\nto use them as pseudo ground-truth and set up a mesh deformation optimization\nprocedure that deforms a base template mesh to match the generated 3D target.\nSecond, we introduce carefully designed losses that allow the input base mesh\nto freely deform towards the desired target, yet preserve mesh quality and\ntopology such that they can be simulated. Finally, a texture estimation module\ngenerates high-fidelity texture maps that are globally and locally consistent\nand faithfully capture the input guidance, allowing us to render the generated\n3D assets. With Garment3DGen users can generate the textured 3D garment of\ntheir choice without the need of artist intervention. One can provide a textual\nprompt describing the garment they desire to generate a simulation-ready 3D\nasset. We present a plethora of quantitative and qualitative comparisons on\nvarious assets both real and generated and provide use-cases of how one can\ngenerate simulation-ready 3D garments.\n","authors":["Nikolaos Sarafianos","Tuur Stuyck","Xiaoyu Xiang","Yilei Li","Jovan Popovic","Rakesh Ranjan"],"pdf_url":"https://arxiv.org/pdf/2403.18816v1.pdf","comment":"Project Page: https://nsarafianos.github.io/garment3dgen"},{"id":"http://arxiv.org/abs/2403.18814v1","updated":"2024-03-27T17:59:04Z","published":"2024-03-27T17:59:04Z","title":"Mini-Gemini: Mining the Potential of Multi-modality Vision Language\n Models","summary":" In this work, we introduce Mini-Gemini, a simple and effective framework\nenhancing multi-modality Vision Language Models (VLMs). Despite the\nadvancements in VLMs facilitating basic visual dialog and reasoning, a\nperformance gap persists compared to advanced models like GPT-4 and Gemini. 
We\ntry to narrow the gap by mining the potential of VLMs for better performance\nand any-to-any workflow from three aspects, i.e., high-resolution visual\ntokens, high-quality data, and VLM-guided generation. To enhance visual tokens,\nwe propose to utilize an additional visual encoder for high-resolution\nrefinement without increasing the visual token count. We further construct a\nhigh-quality dataset that promotes precise image comprehension and\nreasoning-based generation, expanding the operational scope of current VLMs. In\ngeneral, Mini-Gemini further mines the potential of VLMs and empowers current\nframeworks with image understanding, reasoning, and generation simultaneously.\nMini-Gemini supports a series of dense and MoE Large Language Models (LLMs)\nfrom 2B to 34B. It is demonstrated to achieve leading performance in several\nzero-shot benchmarks and even surpasses the developed private models. Code and\nmodels are available at https://github.com/dvlab-research/MiniGemini.\n","authors":["Yanwei Li","Yuechen Zhang","Chengyao Wang","Zhisheng Zhong","Yixin Chen","Ruihang Chu","Shaoteng Liu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2403.18814v1.pdf","comment":"Code and models are available at\n https://github.com/dvlab-research/MiniGemini"},{"id":"http://arxiv.org/abs/2403.18811v1","updated":"2024-03-27T17:57:02Z","published":"2024-03-27T17:57:02Z","title":"Duolando: Follower GPT with Off-Policy Reinforcement Learning for Dance\n Accompaniment","summary":" We introduce a novel task within the field of 3D dance generation, termed\ndance accompaniment, which necessitates the generation of responsive movements\nfrom a dance partner, the \"follower\", synchronized with the lead dancer's\nmovements and the underlying musical rhythm. Unlike existing solo or group\ndance generation tasks, a duet dance scenario entails a heightened degree of\ninteraction between the two participants, requiring delicate coordination in\nboth pose and position. To support this task, we first build a large-scale and\ndiverse duet interactive dance dataset, DD100, by recording about 117 minutes\nof professional dancers' performances. To address the challenges inherent in\nthis task, we propose a GPT-based model, Duolando, which autoregressively\npredicts the subsequent tokenized motion conditioned on the coordinated\ninformation of the music, the leader's and the follower's movements. To further\nenhance the GPT's capabilities of generating stable results on unseen\nconditions (music and leader motions), we devise an off-policy reinforcement\nlearning strategy that allows the model to explore viable trajectories from\nout-of-distribution samplings, guided by human-defined rewards. Based on the\ncollected dataset and proposed method, we establish a benchmark with several\ncarefully designed metrics.\n","authors":["Li Siyao","Tianpei Gu","Zhitao Yang","Zhengyu Lin","Ziwei Liu","Henghui Ding","Lei Yang","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2403.18811v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.18807v1","updated":"2024-03-27T17:53:30Z","published":"2024-03-27T17:53:30Z","title":"ECoDepth: Effective Conditioning of Diffusion Models for Monocular Depth\n Estimation","summary":" In the absence of parallax cues, a learning-based single image depth\nestimation (SIDE) model relies heavily on shading and contextual cues in the\nimage. While this simplicity is attractive, it is necessary to train such\nmodels on large and varied datasets, which are difficult to capture. 
It has\nbeen shown that using embeddings from pre-trained foundational models, such as\nCLIP, improves zero shot transfer in several applications. Taking inspiration\nfrom this, in our paper we explore the use of global image priors generated\nfrom a pre-trained ViT model to provide more detailed contextual information.\nWe argue that the embedding vector from a ViT model, pre-trained on a large\ndataset, captures greater relevant information for SIDE than the usual route of\ngenerating pseudo image captions, followed by CLIP based text embeddings. Based\non this idea, we propose a new SIDE model using a diffusion backbone which is\nconditioned on ViT embeddings. Our proposed design establishes a new\nstate-of-the-art (SOTA) for SIDE on NYUv2 dataset, achieving Abs Rel error of\n0.059(14% improvement) compared to 0.069 by the current SOTA (VPD). And on\nKITTI dataset, achieving Sq Rel error of 0.139 (2% improvement) compared to\n0.142 by the current SOTA (GEDepth). For zero-shot transfer with a model\ntrained on NYUv2, we report mean relative improvement of (20%, 23%, 81%, 25%)\nover NeWCRFs on (Sun-RGBD, iBims1, DIODE, HyperSim) datasets, compared to (16%,\n18%, 45%, 9%) by ZoeDepth. The code is available at\nhttps://github.com/Aradhye2002/EcoDepth.\n","authors":["Suraj Patni","Aradhye Agarwal","Chetan Arora"],"pdf_url":"https://arxiv.org/pdf/2403.18807v1.pdf","comment":"Accepted at IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR) 2024"},{"id":"http://arxiv.org/abs/2311.10319v4","updated":"2024-03-27T17:41:50Z","published":"2023-11-17T04:04:29Z","title":"Shifting to Machine Supervision: Annotation-Efficient Semi and\n Self-Supervised Learning for Automatic Medical Image Segmentation and\n Classification","summary":" Advancements in clinical treatment are increasingly constrained by the\nlimitations of supervised learning techniques, which depend heavily on large\nvolumes of annotated data. The annotation process is not only costly but also\ndemands substantial time from clinical specialists. Addressing this issue, we\nintroduce the S4MI (Self-Supervision and Semi-Supervision for Medical Imaging)\npipeline, a novel approach that leverages advancements in self-supervised and\nsemi-supervised learning. These techniques engage in auxiliary tasks that do\nnot require labeling, thus simplifying the scaling of machine supervision\ncompared to fully-supervised methods. Our study benchmarks these techniques on\nthree distinct medical imaging datasets to evaluate their effectiveness in\nclassification and segmentation tasks. Notably, we observed that self\nsupervised learning significantly surpassed the performance of supervised\nmethods in the classification of all evaluated datasets. Remarkably, the\nsemi-supervised approach demonstrated superior outcomes in segmentation,\noutperforming fully-supervised methods while using 50% fewer labels across all\ndatasets. In line with our commitment to contributing to the scientific\ncommunity, we have made the S4MI code openly accessible, allowing for broader\napplication and further development of these methods.\n","authors":["Pranav Singh","Raviteja Chukkapalli","Shravan Chaudhari","Luoyao Chen","Mei Chen","Jinqian Pan","Craig Smuda","Jacopo Cirrone"],"pdf_url":"https://arxiv.org/pdf/2311.10319v4.pdf","comment":"Seventeen pages (incl. 
references), five figures, and one table.\n (Under Review)"},{"id":"http://arxiv.org/abs/2403.18795v1","updated":"2024-03-27T17:40:14Z","published":"2024-03-27T17:40:14Z","title":"Gamba: Marry Gaussian Splatting with Mamba for single view 3D\n reconstruction","summary":" We tackle the challenge of efficiently reconstructing a 3D asset from a\nsingle image with growing demands for automated 3D content creation pipelines.\nPrevious methods primarily rely on Score Distillation Sampling (SDS) and Neural\nRadiance Fields (NeRF). Despite their significant success, these approaches\nencounter practical limitations due to lengthy optimization and considerable\nmemory usage. In this report, we introduce Gamba, an end-to-end amortized 3D\nreconstruction model from single-view images, emphasizing two main insights:\n(1) 3D representation: leveraging a large number of 3D Gaussians for an\nefficient 3D Gaussian splatting process; (2) Backbone design: introducing a\nMamba-based sequential network that facilitates context-dependent reasoning and\nlinear scalability with the sequence (token) length, accommodating a\nsubstantial number of Gaussians. Gamba incorporates significant advancements in\ndata preprocessing, regularization design, and training methodologies. We\nassessed Gamba against existing optimization-based and feed-forward 3D\ngeneration approaches using the real-world scanned OmniObject3D dataset. Here,\nGamba demonstrates competitive generation capabilities, both qualitatively and\nquantitatively, while achieving remarkable speed, approximately 0.6 second on a\nsingle NVIDIA A100 GPU.\n","authors":["Qiuhong Shen","Xuanyu Yi","Zike Wu","Pan Zhou","Hanwang Zhang","Shuicheng Yan","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18791v1","updated":"2024-03-27T17:35:24Z","published":"2024-03-27T17:35:24Z","title":"Object Pose Estimation via the Aggregation of Diffusion Features","summary":" Estimating the pose of objects from images is a crucial task of 3D scene\nunderstanding, and recent approaches have shown promising results on very large\nbenchmarks. However, these methods experience a significant performance drop\nwhen dealing with unseen objects. We believe that it results from the limited\ngeneralizability of image features. To address this problem, we have an\nin-depth analysis on the features of diffusion models, e.g. Stable Diffusion,\nwhich hold substantial potential for modeling unseen objects. Based on this\nanalysis, we then innovatively introduce these diffusion features for object\npose estimation. To achieve this, we propose three distinct architectures that\ncan effectively capture and aggregate diffusion features of different\ngranularity, greatly improving the generalizability of object pose estimation.\nOur approach outperforms the state-of-the-art methods by a considerable margin\non three popular benchmark datasets, LM, O-LM, and T-LESS. In particular, our\nmethod achieves higher accuracy than the previous best arts on unseen objects:\n98.2% vs. 93.5% on Unseen LM, 85.9% vs. 76.3% on Unseen O-LM, showing the\nstrong generalizability of our method. 
Our code is released at\nhttps://github.com/Tianfu18/diff-feats-pose.\n","authors":["Tianfu Wang","Guosheng Hu","Hongguang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18791v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2403.18784v1","updated":"2024-03-27T17:32:04Z","published":"2024-03-27T17:32:04Z","title":"SplatFace: Gaussian Splat Face Reconstruction Leveraging an Optimizable\n Surface","summary":" We present SplatFace, a novel Gaussian splatting framework designed for 3D\nhuman face reconstruction without reliance on accurate pre-determined geometry.\nOur method is designed to simultaneously deliver both high-quality novel view\nrendering and accurate 3D mesh reconstructions. We incorporate a generic 3D\nMorphable Model (3DMM) to provide a surface geometric structure, making it\npossible to reconstruct faces with a limited set of input images. We introduce\na joint optimization strategy that refines both the Gaussians and the morphable\nsurface through a synergistic non-rigid alignment process. A novel distance\nmetric, splat-to-surface, is proposed to improve alignment by considering both\nthe Gaussian position and covariance. The surface information is also utilized\nto incorporate a world-space densification process, resulting in superior\nreconstruction quality. Our experimental analysis demonstrates that the\nproposed method is competitive with both other Gaussian splatting techniques in\nnovel view synthesis and other 3D reconstruction methods in producing 3D face\nmeshes with high geometric precision.\n","authors":["Jiahao Luo","Jing Liu","James Davis"],"pdf_url":"https://arxiv.org/pdf/2403.18784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18775v1","updated":"2024-03-27T17:23:39Z","published":"2024-03-27T17:23:39Z","title":"ImageNet-D: Benchmarking Neural Network Robustness on Diffusion\n Synthetic Object","summary":" We establish rigorous benchmarks for visual perception robustness. Synthetic\nimages such as ImageNet-C, ImageNet-9, and Stylized ImageNet provide specific\ntype of evaluation over synthetic corruptions, backgrounds, and textures, yet\nthose robustness benchmarks are restricted in specified variations and have low\nsynthetic quality. In this work, we introduce generative model as a data source\nfor synthesizing hard images that benchmark deep models' robustness. Leveraging\ndiffusion models, we are able to generate images with more diversified\nbackgrounds, textures, and materials than any prior work, where we term this\nbenchmark as ImageNet-D. Experimental results show that ImageNet-D results in a\nsignificant accuracy drop to a range of vision models, from the standard ResNet\nvisual classifier to the latest foundation models like CLIP and MiniGPT-4,\nsignificantly reducing their accuracy by up to 60\\%. Our work suggests that\ndiffusion models can be an effective source to test vision models. 
The code and\ndataset are available at https://github.com/chenshuang-zhang/imagenet_d.\n","authors":["Chenshuang Zhang","Fei Pan","Junmo Kim","In So Kweon","Chengzhi Mao"],"pdf_url":"https://arxiv.org/pdf/2403.18775v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2312.01220v2","updated":"2024-03-27T17:23:16Z","published":"2023-12-02T20:11:48Z","title":"Boosting Object Detection with Zero-Shot Day-Night Domain Adaptation","summary":" Detecting objects in low-light scenarios presents a persistent challenge, as\ndetectors trained on well-lit data exhibit significant performance degradation\non low-light data due to low visibility. Previous methods mitigate this issue\nby exploring image enhancement or object detection techniques with real\nlow-light image datasets. However, the progress is impeded by the inherent\ndifficulties about collecting and annotating low-light images. To address this\nchallenge, we propose to boost low-light object detection with zero-shot\nday-night domain adaptation, which aims to generalize a detector from well-lit\nscenarios to low-light ones without requiring real low-light data. Revisiting\nRetinex theory in the low-level vision, we first design a reflectance\nrepresentation learning module to learn Retinex-based illumination invariance\nin images with a carefully designed illumination invariance reinforcement\nstrategy. Next, an interchange-redecomposition-coherence procedure is\nintroduced to improve over the vanilla Retinex image decomposition process by\nperforming two sequential image decompositions and introducing a\nredecomposition cohering loss. Extensive experiments on ExDark, DARK FACE, and\nCODaN datasets show strong low-light generalizability of our method. Our code\nis available at https://github.com/ZPDu/DAI-Net.\n","authors":["Zhipeng Du","Miaojing Shi","Jiankang Deng"],"pdf_url":"https://arxiv.org/pdf/2312.01220v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.06054v4","updated":"2024-03-27T17:06:10Z","published":"2024-03-10T00:47:05Z","title":"Decoupled Data Consistency with Diffusion Purification for Image\n Restoration","summary":" Diffusion models have recently gained traction as a powerful class of deep\ngenerative priors, excelling in a wide range of image restoration tasks due to\ntheir exceptional ability to model data distributions. To solve image\nrestoration problems, many existing techniques achieve data consistency by\nincorporating additional likelihood gradient steps into the reverse sampling\nprocess of diffusion models. However, the additional gradient steps pose a\nchallenge for real-world practical applications as they incur a large\ncomputational overhead, thereby increasing inference time. They also present\nadditional difficulties when using accelerated diffusion model samplers, as the\nnumber of data consistency steps is limited by the number of reverse sampling\nsteps. In this work, we propose a novel diffusion-based image restoration\nsolver that addresses these issues by decoupling the reverse process from the\ndata consistency steps. Our method involves alternating between a\nreconstruction phase to maintain data consistency and a refinement phase that\nenforces the prior via diffusion purification. Our approach demonstrates\nversatility, making it highly adaptable for efficient problem-solving in latent\nspace. Additionally, it reduces the necessity for numerous sampling steps\nthrough the integration of consistency models. 
The efficacy of our approach is\nvalidated through comprehensive experiments across various image restoration\ntasks, including image denoising, deblurring, inpainting, and super-resolution.\n","authors":["Xiang Li","Soo Min Kwon","Ismail R. Alkhouri","Saiprasad Ravishankar","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2403.06054v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18762v1","updated":"2024-03-27T17:01:10Z","published":"2024-03-27T17:01:10Z","title":"ModaLink: Unifying Modalities for Efficient Image-to-PointCloud Place\n Recognition","summary":" Place recognition is an important task for robots and autonomous cars to\nlocalize themselves and close loops in pre-built maps. While single-modal\nsensor-based methods have shown satisfactory performance, cross-modal place\nrecognition that retrieving images from a point-cloud database remains a\nchallenging problem. Current cross-modal methods transform images into 3D\npoints using depth estimation for modality conversion, which are usually\ncomputationally intensive and need expensive labeled data for depth\nsupervision. In this work, we introduce a fast and lightweight framework to\nencode images and point clouds into place-distinctive descriptors. We propose\nan effective Field of View (FoV) transformation module to convert point clouds\ninto an analogous modality as images. This module eliminates the necessity for\ndepth estimation and helps subsequent modules achieve real-time performance. We\nfurther design a non-negative factorization-based encoder to extract mutually\nconsistent semantic features between point clouds and images. This encoder\nyields more distinctive global descriptors for retrieval. Experimental results\non the KITTI dataset show that our proposed methods achieve state-of-the-art\nperformance while running in real time. Additional evaluation on the HAOMO\ndataset covering a 17 km trajectory further shows the practical generalization\ncapabilities. We have released the implementation of our methods as open source\nat: https://github.com/haomo-ai/ModaLink.git.\n","authors":["Weidong Xie","Lun Luo","Nanfei Ye","Yi Ren","Shaoyi Du","Minhang Wang","Jintao Xu","Rui Ai","Weihao Gu","Xieyuanli Chen"],"pdf_url":"https://arxiv.org/pdf/2403.18762v1.pdf","comment":"8 pages, 11 figures, conference"},{"id":"http://arxiv.org/abs/2403.18756v1","updated":"2024-03-27T16:56:14Z","published":"2024-03-27T16:56:14Z","title":"Detection of subclinical atherosclerosis by image-based deep learning on\n chest x-ray","summary":" Aims. To develop a deep-learning based system for recognition of subclinical\natherosclerosis on a plain frontal chest x-ray. Methods and Results. A\ndeep-learning algorithm to predict coronary artery calcium (CAC) score (the\nAI-CAC model) was developed on 460 chest x-ray (80% training cohort, 20%\ninternal validation cohort) of primary prevention patients (58.4% male, median\nage 63 [51-74] years) with available paired chest x-ray and chest computed\ntomography (CT) indicated for any clinical reason and performed within 3\nmonths. The CAC score calculated on chest CT was used as ground truth. The\nmodel was validated on an temporally-independent cohort of 90 patients from the\nsame institution (external validation). 
The diagnostic accuracy of the AI-CAC\nmodel assessed by the area under the curve (AUC) was the primary outcome.\nOverall, median AI-CAC score was 35 (0-388) and 28.9% patients had no AI-CAC.\nAUC of the AI-CAC model to identify a CAC>0 was 0.90 in the internal validation\ncohort and 0.77 in the external validation cohort. Sensitivity was consistently\nabove 92% in both cohorts. In the overall cohort (n=540), among patients with\nAI-CAC=0, a single ASCVD event occurred, after 4.3 years. Patients with\nAI-CAC>0 had significantly higher Kaplan Meier estimates for ASCVD events\n(13.5% vs. 3.4%, log-rank=0.013). Conclusion. The AI-CAC model seems to\naccurately detect subclinical atherosclerosis on chest x-ray with elevated\nsensitivity, and to predict ASCVD events with elevated negative predictive\nvalue. Adoption of the AI-CAC model to refine CV risk stratification or as an\nopportunistic screening tool requires prospective evaluation.\n","authors":["Guglielmo Gallone","Francesco Iodice","Alberto Presta","Davide Tore","Ovidio de Filippo","Michele Visciano","Carlo Alberto Barbano","Alessandro Serafini","Paola Gorrini","Alessandro Bruno","Walter Grosso Marra","James Hughes","Mario Iannaccone","Paolo Fonio","Attilio Fiandrotti","Alessandro Depaoli","Marco Grangetto","Gaetano Maria de Ferrari","Fabrizio D'Ascenzo"],"pdf_url":"https://arxiv.org/pdf/2403.18756v1.pdf","comment":"Submitted to European Heart Journal - Cardiovascular Imaging Added\n also the additional material 44 pages (30 main paper, 14 additional\n material), 14 figures (5 main manuscript, 9 additional material)"},{"id":"http://arxiv.org/abs/2303.09817v2","updated":"2024-03-27T16:52:59Z","published":"2023-03-17T07:53:18Z","title":"Interpretable machine learning for time-to-event prediction in medicine\n and healthcare","summary":" Time-to-event prediction, e.g. cancer survival analysis or hospital length of\nstay, is a highly prominent machine learning task in medical and healthcare\napplications. However, only a few interpretable machine learning methods comply\nwith its challenges. To facilitate a comprehensive explanatory analysis of\nsurvival models, we formally introduce time-dependent feature effects and\nglobal feature importance explanations. We show how post-hoc interpretation\nmethods allow for finding biases in AI systems predicting length of stay using\na novel multi-modal dataset created from 1235 X-ray images with textual\nradiology reports annotated by human experts. Moreover, we evaluate cancer\nsurvival models beyond predictive performance to include the importance of\nmulti-omics feature groups based on a large-scale benchmark comprising 11\ndatasets from The Cancer Genome Atlas (TCGA). Model developers can use the\nproposed methods to debug and improve machine learning algorithms, while\nphysicians can discover disease biomarkers and assess their significance. 
We\nhope the contributed open data and code resources facilitate future work in the\nemerging research direction of explainable survival analysis.\n","authors":["Hubert Baniecki","Bartlomiej Sobieski","Patryk Szatkowski","Przemyslaw Bombinski","Przemyslaw Biecek"],"pdf_url":"https://arxiv.org/pdf/2303.09817v2.pdf","comment":"An extended version of an AIME 2023 paper submitted to Artificial\n Intelligence in Medicine"},{"id":"http://arxiv.org/abs/2403.14623v2","updated":"2024-03-27T16:49:35Z","published":"2024-03-21T17:59:41Z","title":"Simplified Diffusion Schrödinger Bridge","summary":" This paper introduces a novel theoretical simplification of the Diffusion\nSchr\\\"odinger Bridge (DSB) that facilitates its unification with Score-based\nGenerative Models (SGMs), addressing the limitations of DSB in complex data\ngeneration and enabling faster convergence and enhanced performance. By\nemploying SGMs as an initial solution for DSB, our approach capitalizes on the\nstrengths of both frameworks, ensuring a more efficient training process and\nimproving the performance of SGM. We also propose a reparameterization\ntechnique that, despite theoretical approximations, practically improves the\nnetwork's fitting capabilities. Our extensive experimental evaluations confirm\nthe effectiveness of the simplified DSB, demonstrating its significant\nimprovements. We believe the contributions of this work pave the way for\nadvanced generative modeling. The code is available at\nhttps://github.com/checkcrab/SDSB.\n","authors":["Zhicong Tang","Tiankai Hang","Shuyang Gu","Dong Chen","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2403.14623v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11107v2","updated":"2024-03-27T16:48:34Z","published":"2024-03-17T06:21:21Z","title":"Self-supervised co-salient object detection via feature correspondence\n at multiple scales","summary":" Our paper introduces a novel two-stage self-supervised approach for detecting\nco-occurring salient objects (CoSOD) in image groups without requiring\nsegmentation annotations. Unlike existing unsupervised methods that rely solely\non patch-level information (e.g. clustering patch descriptors) or on\ncomputation heavy off-the-shelf components for CoSOD, our lightweight model\nleverages feature correspondences at both patch and region levels,\nsignificantly improving prediction performance. In the first stage, we train a\nself-supervised network that detects co-salient regions by computing local\npatch-level feature correspondences across images. We obtain the segmentation\npredictions using confidence-based adaptive thresholding. In the next stage, we\nrefine these intermediate segmentations by eliminating the detected regions\n(within each image) whose averaged feature representations are dissimilar to\nthe foreground feature representation averaged across all the cross-attention\nmaps (from the previous stage). Extensive experiments on three CoSOD benchmark\ndatasets show that our self-supervised model outperforms the corresponding\nstate-of-the-art models by a huge margin (e.g. on the CoCA dataset, our model\nhas a 13.7% F-measure gain over the SOTA unsupervised CoSOD model). 
Notably,\nour self-supervised model also outperforms several recent fully supervised\nCoSOD models on the three test datasets (e.g., on the CoCA dataset, our model\nhas a 4.6% F-measure gain over a recent supervised CoSOD model).\n","authors":["Souradeep Chakraborty","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2403.11107v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18734v1","updated":"2024-03-27T16:22:45Z","published":"2024-03-27T16:22:45Z","title":"A vascular synthetic model for improved aneurysm segmentation and\n detection via Deep Neural Networks","summary":" We hereby present a full synthetic model, able to mimic the various\nconstituents of the cerebral vascular tree: the cerebral arteries, the\nbifurcations and the intracranial aneurysms. By building this model, our goal\nwas to provide a substantial dataset of brain arteries which could be used by a\n3D Convolutional Neural Network (CNN) to either segment or detect/recognize\nvarious vascular diseases (such as artery dissection/thrombosis) or even some\nportions of the cerebral vasculature, such as the bifurcations or aneurysms. In\nthis study, we will particularly focus on Intra-Cranial Aneurysm (ICA)\ndetection and segmentation. The cerebral aneurysms most often occur on a\nparticular structure of the vascular tree named the Circle of Willis. Various\nstudies have been conducted to detect and monitor the ICAs and those based on\nDeep Learning (DL) achieve the best performances. Specifically, in this work,\nwe propose a full synthetic 3D model able to mimic the brain vasculature as\nacquired by Magnetic Resonance Angiography (MRA), and more particularly the\nTime Of Flight (TOF) principle. Among the various MRI modalities, the MRA-TOF\nallows to have a relatively good rendering of the blood vessels and is\nnon-invasive (no contrast liquid injection). Our model has been designed to\nsimultaneously mimic the arteries geometry, the ICA shape and the background\nnoise. The geometry of the vascular tree is modeled thanks to an interpolation\nwith 3D Spline functions, and the statistical properties of the background MRI\nnoise is collected from MRA acquisitions and reproduced within the model. In\nthis work, we thoroughly describe the synthetic vasculature model, we build up\na neural network designed for ICA segmentation and detection, and finally, we\ncarry out an in-depth evaluation of the performance gap gained thanks to the\nsynthetic model data augmentation.\n","authors":["Rafic Nader","Florent Autrusseau","Vincent L'Allinec","Romain Bourcier"],"pdf_url":"https://arxiv.org/pdf/2403.18734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18731v1","updated":"2024-03-27T16:21:24Z","published":"2024-03-27T16:21:24Z","title":"Enhancing Manufacturing Quality Prediction Models through the\n Integration of Explainability Methods","summary":" This research presents a method that utilizes explainability techniques to\namplify the performance of machine learning (ML) models in forecasting the\nquality of milling processes, as demonstrated in this paper through a\nmanufacturing use case. The methodology entails the initial training of ML\nmodels, followed by a fine-tuning phase where irrelevant features identified\nthrough explainability methods are eliminated. This procedural refinement\nresults in performance enhancements, paving the way for potential reductions in\nmanufacturing costs and a better understanding of the trained ML models. 
This\nstudy highlights the usefulness of explainability techniques in both explaining\nand optimizing predictive models in the manufacturing realm.\n","authors":["Dennis Gross","Helge Spieker","Arnaud Gotlieb","Ricardo Knoblauch"],"pdf_url":"https://arxiv.org/pdf/2403.18731v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18730v1","updated":"2024-03-27T16:20:55Z","published":"2024-03-27T16:20:55Z","title":"Towards Image Ambient Lighting Normalization","summary":" Lighting normalization is a crucial but underexplored restoration task with\nbroad applications. However, existing works often simplify this task within the\ncontext of shadow removal, limiting the light sources to one and\noversimplifying the scene, thus excluding complex self-shadows and restricting\nsurface classes to smooth ones. Although promising, such simplifications hinder\ngeneralizability to more realistic settings encountered in daily use. In this\npaper, we propose a new challenging task termed Ambient Lighting Normalization\n(ALN), which enables the study of interactions between shadows, unifying image\nrestoration and shadow removal in a broader context. To address the lack of\nappropriate datasets for ALN, we introduce the large-scale high-resolution\ndataset Ambient6K, comprising samples obtained from multiple light sources and\nincluding self-shadows resulting from complex geometries, which is the first of\nits kind. For benchmarking, we select various mainstream methods and rigorously\nevaluate them on Ambient6K. Additionally, we propose IFBlend, a novel strong\nbaseline that maximizes Image-Frequency joint entropy to selectively restore\nlocal areas under different lighting conditions, without relying on shadow\nlocalization priors. Experiments show that IFBlend achieves SOTA scores on\nAmbient6K and exhibits competitive performance on conventional shadow removal\nbenchmarks compared to shadow-specific models with mask priors. The dataset,\nbenchmark, and code are available at https://github.com/fvasluianu97/IFBlend.\n","authors":["Florin-Alexandru Vasluianu","Tim Seizinger","Zongwei Wu","Rakesh Ranjan","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2403.18730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09992v3","updated":"2024-03-27T16:20:52Z","published":"2023-03-17T14:07:55Z","title":"LION: Implicit Vision Prompt Tuning","summary":" Despite recent competitive performance across a range of vision tasks, vision\nTransformers still suffer from heavy computational costs. Recently, vision\nprompt learning has provided an economical solution to this problem without\nfine-tuning the whole large-scale models. However, the efficiency of existing\nmodels is still far from satisfactory due to the insertion of extensive prompt\nblocks and tricky prompt designs. In this paper, we propose an efficient vision\nmodel named impLicit vIsion prOmpt tuNing (LION), which is motivated by deep\nimplicit models with stable memory costs for various complex tasks. In\nparticular, we merely insert two equilibrium implicit layers at the two ends of the\npre-trained main backbone, with the parameters in the backbone frozen. Moreover, we\nprune the parameters in these two layers according to the lottery ticket hypothesis. The\nperformance obtained by our LION is promising on a wide range of datasets. In\nparticular, our LION reduces the number of training parameters by up to 11.5% while\nobtaining higher performance compared with the state-of-the-art baseline VPT,\nespecially under challenging scenes. 
Furthermore, we find that our proposed\nLION had a good generalization performance, making it an easy way to boost\ntransfer learning in the future.\n","authors":["Haixin Wang","Jianlong Chang","Xiao Luo","Jinan Sun","Zhouchen Lin","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2303.09992v3.pdf","comment":"Accepted by AAAI2024; 9 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2403.18717v1","updated":"2024-03-27T16:06:37Z","published":"2024-03-27T16:06:37Z","title":"Semi-Supervised Learning for Deep Causal Generative Models","summary":" Developing models that can answer questions of the form \"How would $x$ change\nif $y$ had been $z$?\" is fundamental for advancing medical image analysis.\nTraining causal generative models that address such counterfactual questions,\nthough, currently requires that all relevant variables have been observed and\nthat corresponding labels are available in training data. However, clinical\ndata may not have complete records for all patients and state of the art causal\ngenerative models are unable to take full advantage of this. We thus develop,\nfor the first time, a semi-supervised deep causal generative model that\nexploits the causal relationships between variables to maximise the use of all\navailable data. We explore this in the setting where each sample is either\nfully labelled or fully unlabelled, as well as the more clinically realistic\ncase of having different labels missing for each sample. We leverage techniques\nfrom causal inference to infer missing values and subsequently generate\nrealistic counterfactuals, even for samples with incomplete labels.\n","authors":["Yasin Ibrahim","Hermione Warr","Konstantinos Kamnitsas"],"pdf_url":"https://arxiv.org/pdf/2403.18717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18715v1","updated":"2024-03-27T16:04:47Z","published":"2024-03-27T16:04:47Z","title":"Mitigating Hallucinations in Large Vision-Language Models with\n Instruction Contrastive Decoding","summary":" Large Vision-Language Models (LVLMs) are increasingly adept at generating\ncontextually detailed and coherent responses from visual inputs. However, their\napplication in multimodal decision-making and open-ended generation is hindered\nby a notable rate of hallucinations, where generated text inaccurately\nrepresents the visual contents. To address this issue, this paper introduces\nthe Instruction Contrastive Decoding (ICD) method, a novel approach designed to\nreduce hallucinations during LVLM inference. Our method is inspired by our\nobservation that what we call disturbance instructions significantly exacerbate\nhallucinations in multimodal fusion modules. ICD contrasts distributions from\nstandard and instruction disturbance, thereby increasing alignment uncertainty\nand effectively subtracting hallucinated concepts from the original\ndistribution. Through comprehensive experiments on discriminative benchmarks\n(POPE and MME) and a generative benchmark (LLaVa-Bench), we demonstrate that\nICD significantly mitigates both object-level and attribute-level\nhallucinations. 
Moreover, our method not only addresses hallucinations but also\nsignificantly enhances the general perception and recognition capabilities of\nLVLMs.\n","authors":["Xintong Wang","Jingheng Pan","Liang Ding","Chris Biemann"],"pdf_url":"https://arxiv.org/pdf/2403.18715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18714v1","updated":"2024-03-27T16:02:00Z","published":"2024-03-27T16:02:00Z","title":"Bringing Textual Prompt to AI-Generated Image Quality Assessment","summary":" AI-Generated Images (AGIs) have inherent multimodal nature. Unlike\ntraditional image quality assessment (IQA) on natural scenarios, AGIs quality\nassessment (AGIQA) takes the correspondence of image and its textual prompt\ninto consideration. This is coupled in the ground truth score, which confuses\nthe unimodal IQA methods. To solve this problem, we introduce IP-IQA (AGIs\nQuality Assessment via Image and Prompt), a multimodal framework for AGIQA via\ncorresponding image and prompt incorporation. Specifically, we propose a novel\nincremental pretraining task named Image2Prompt for better understanding of\nAGIs and their corresponding textual prompts. An effective and efficient\nimage-prompt fusion module, along with a novel special [QA] token, are also\napplied. Both are plug-and-play and beneficial for the cooperation of image and\nits corresponding prompt. Experiments demonstrate that our IP-IQA achieves the\nstate-of-the-art on AGIQA-1k and AGIQA-3k datasets. Code will be available.\n","authors":["Bowen Qu","Haohui Li","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2403.18714v1.pdf","comment":"6 pages, 3 figures, accepted by ICME2024"},{"id":"http://arxiv.org/abs/2403.18711v1","updated":"2024-03-27T15:58:25Z","published":"2024-03-27T15:58:25Z","title":"SAT-NGP : Unleashing Neural Graphics Primitives for Fast Relightable\n Transient-Free 3D reconstruction from Satellite Imagery","summary":" Current stereo-vision pipelines produce high accuracy 3D reconstruction when\nusing multiple pairs or triplets of satellite images. However, these pipelines\nare sensitive to the changes between images that can occur as a result of\nmulti-date acquisitions. Such variations are mainly due to variable shadows,\nreflexions and transient objects (cars, vegetation). To take such changes into\naccount, Neural Radiance Fields (NeRF) have recently been applied to multi-date\nsatellite imagery. However, Neural methods are very compute-intensive, taking\ndozens of hours to learn, compared with minutes for standard stereo-vision\npipelines. Following the ideas of Instant Neural Graphics Primitives we propose\nto use an efficient sampling strategy and multi-resolution hash encoding to\naccelerate the learning. Our model, Satellite Neural Graphics Primitives\n(SAT-NGP) decreases the learning time to 15 minutes while maintaining the\nquality of the 3D reconstruction.\n","authors":["Camille Billouard","Dawa Derksen","Emmanuelle Sarrazin","Bruno Vallet"],"pdf_url":"https://arxiv.org/pdf/2403.18711v1.pdf","comment":"5 pages, 3 figures, 1 table; Accepted to International Geoscience and\n Remote Sensing Symposium (IGARSS) 2024; Code available at\n https://github.com/Ellimac0/SAT-NGP"},{"id":"http://arxiv.org/abs/2403.18708v1","updated":"2024-03-27T15:56:42Z","published":"2024-03-27T15:56:42Z","title":"Dense Vision Transformer Compression with Few Samples","summary":" Few-shot model compression aims to compress a large model into a more compact\none with only a tiny training set (even without labels). 
Block-level pruning\nhas recently emerged as a leading technique in achieving high accuracy and low\nlatency in few-shot CNN compression. But, few-shot compression for Vision\nTransformers (ViT) remains largely unexplored, which presents a new challenge.\nIn particular, the issue of sparse compression exists in traditional CNN\nfew-shot methods, which can only produce very few compressed models of\ndifferent model sizes. This paper proposes a novel framework for few-shot ViT\ncompression named DC-ViT. Instead of dropping the entire block, DC-ViT\nselectively eliminates the attention module while retaining and reusing\nportions of the MLP module. DC-ViT enables dense compression, which outputs\nnumerous compressed models that densely populate the range of model complexity.\nDC-ViT outperforms state-of-the-art few-shot compression methods by a\nsignificant margin of 10 percentage points, along with lower latency in the\ncompression of ViT and its variants.\n","authors":["Hanxiao Zhang","Yifan Zhou","Guo-Hua Wang","Jianxin Wu"],"pdf_url":"https://arxiv.org/pdf/2403.18708v1.pdf","comment":"Accepted to CVPR 2024. Note: Jianxin Wu is a contributing author for\n the arXiv version of this paper but is not listed as an author in the CVPR\n version due to his role as Program Chair"},{"id":"http://arxiv.org/abs/2401.15120v2","updated":"2024-03-27T15:49:52Z","published":"2024-01-26T03:44:58Z","title":"Incorporating simulated spatial context information improves the\n effectiveness of contrastive learning models","summary":" Visual learning often occurs in a specific context, where an agent acquires\nskills through exploration and tracking of its location in a consistent\nenvironment. The historical spatial context of the agent provides a similarity\nsignal for self-supervised contrastive learning. We present a unique approach,\ntermed Environmental Spatial Similarity (ESS), that complements existing\ncontrastive learning methods. Using images from simulated, photorealistic\nenvironments as an experimental setting, we demonstrate that ESS outperforms\ntraditional instance discrimination approaches. Moreover, sampling additional\ndata from the same environment substantially improves accuracy and provides new\naugmentations. ESS allows remarkable proficiency in room classification and\nspatial prediction tasks, especially in unfamiliar environments. This learning\nparadigm has the potential to enable rapid visual learning in agents operating\nin new environments with unique visual characteristics. Potentially\ntransformative applications span from robotics to space exploration. Our proof\nof concept demonstrates improved efficiency over methods that rely on\nextensive, disconnected datasets.\n","authors":["Lizhen Zhu","James Z. Wang","Wonseuk Lee","Brad Wyble"],"pdf_url":"https://arxiv.org/pdf/2401.15120v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12091v3","updated":"2024-03-27T15:44:25Z","published":"2023-03-21T09:07:15Z","title":"Adaptive Negative Evidential Deep Learning for Open-set Semi-supervised\n Learning","summary":" Semi-supervised learning (SSL) methods assume that labeled data, unlabeled\ndata and test data are from the same distribution. Open-set semi-supervised\nlearning (Open-set SSL) considers a more practical scenario, where unlabeled\ndata and test data contain new categories (outliers) not observed in labeled\ndata (inliers). 
Most previous works focused on outlier detection via binary\nclassifiers, which suffer from insufficient scalability and inability to\ndistinguish different types of uncertainty. In this paper, we propose a novel\nframework, Adaptive Negative Evidential Deep Learning (ANEDL), to tackle these\nlimitations. Concretely, we first introduce evidential deep learning (EDL) as\nan outlier detector to quantify different types of uncertainty, and design\ndifferent uncertainty metrics for self-training and inference. Furthermore, we\npropose a novel adaptive negative optimization strategy, making EDL more\ntailored to the unlabeled dataset containing both inliers and outliers. As\ndemonstrated empirically, our proposed method outperforms existing\nstate-of-the-art methods across four datasets.\n","authors":["Yang Yu","Danruo Deng","Furui Liu","Yueming Jin","Qi Dou","Guangyong Chen","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2303.12091v3.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2403.18690v1","updated":"2024-03-27T15:41:23Z","published":"2024-03-27T15:41:23Z","title":"Annolid: Annotate, Segment, and Track Anything You Need","summary":" Annolid is a deep learning-based software package designed for the\nsegmentation, labeling, and tracking of research targets within video files,\nfocusing primarily on animal behavior analysis. Based on state-of-the-art\ninstance segmentation methods, Annolid now harnesses the Cutie video object\nsegmentation model to achieve resilient, markerless tracking of multiple\nanimals from single annotated frames, even in environments in which they may be\npartially or entirely concealed by environmental features or by one another.\nOur integration of Segment Anything and Grounding-DINO strategies additionally\nenables the automatic masking and segmentation of recognizable animals and\nobjects by text command, removing the need for manual annotation. Annolid's\ncomprehensive approach to object segmentation flexibly accommodates a broad\nspectrum of behavior analysis applications, enabling the classification of\ndiverse behavioral states such as freezing, digging, pup huddling, and social\ninteractions in addition to the tracking of animals and their body parts.\n","authors":["Chen Yang","Thomas A. Cleland"],"pdf_url":"https://arxiv.org/pdf/2403.18690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08479v2","updated":"2024-03-27T15:38:27Z","published":"2023-12-13T19:38:50Z","title":"Vision Transformer-Based Deep Learning for Histologic Classification of\n Endometrial Cancer","summary":" Endometrial cancer is the fourth most common cancer in females in the United\nStates, with a lifetime risk of developing this disease of approximately\n2.8%. Precise histologic evaluation and molecular classification of\nendometrial cancer are important for effective patient management and\ndetermining the best treatment modalities. This study introduces EndoNet, which\nuses convolutional neural networks for extracting histologic features and a\nvision transformer for aggregating these features and classifying slides based\non their visual characteristics into high- and low-grade. The model was\ntrained on 929 digitized hematoxylin and eosin-stained whole-slide images of\nendometrial cancer from hysterectomy cases at Dartmouth-Health. It classifies\nthese slides into low-grade (Endometroid Grades 1 and 2) and high-grade\n(endometroid carcinoma FIGO grade 3, uterine serous carcinoma, carcinosarcoma)\ncategories. 
EndoNet was evaluated on an internal test set of 110 patients and\nan external test set of 100 patients from the public TCGA database. The model\nachieved a weighted average F1-score of 0.91 (95% CI: 0.86-0.95) and an AUC of\n0.95 (95% CI: 0.89-0.99) on the internal test, and 0.86 (95% CI: 0.80-0.94) for\nF1-score and 0.86 (95% CI: 0.75-0.93) for AUC on the external test. Pending\nfurther validation, EndoNet has the potential to support pathologists without\nthe need of manual annotations in classifying the grades of gynecologic\npathology tumors.\n","authors":["Manu Goyal","Laura J. Tafe","James X. Feng","Kristen E. Muller","Liesbeth Hondelink","Jessica L. Bentz","Saeed Hassanpour"],"pdf_url":"https://arxiv.org/pdf/2312.08479v2.pdf","comment":"4 Tables and 3 Figures"},{"id":"http://arxiv.org/abs/2308.06098v2","updated":"2024-03-27T15:26:44Z","published":"2023-08-11T12:18:53Z","title":"Automated Construction of Time-Space Diagrams for Traffic Analysis Using\n Street-View Video Sequence","summary":" Time-space diagrams are essential tools for analyzing traffic patterns and\noptimizing transportation infrastructure and traffic management strategies.\nTraditional data collection methods for these diagrams have limitations in\nterms of temporal and spatial coverage. Recent advancements in camera\ntechnology have overcome these limitations and provided extensive urban data.\nIn this study, we propose an innovative approach to constructing time-space\ndiagrams by utilizing street-view video sequences captured by cameras mounted\non moving vehicles. Using the state-of-the-art YOLOv5, StrongSORT, and\nphotogrammetry techniques for distance calculation, we can infer vehicle\ntrajectories from the video data and generate time-space diagrams. To evaluate\nthe effectiveness of our proposed method, we utilized datasets from the KITTI\ncomputer vision benchmark suite. The evaluation results demonstrate that our\napproach can generate trajectories from video data, although there are some\nerrors that can be mitigated by improving the performance of the detector,\ntracker, and distance calculation components. In conclusion, the utilization of\nstreet-view video sequences captured by cameras mounted on moving vehicles,\ncombined with state-of-the-art computer vision techniques, has immense\npotential for constructing comprehensive time-space diagrams. These diagrams\noffer valuable insights into traffic patterns and contribute to the design of\ntransportation infrastructure and traffic management strategies.\n","authors":["Tanay Rastogi","Mårten Björkman"],"pdf_url":"https://arxiv.org/pdf/2308.06098v2.pdf","comment":"The paper is published in 2023 IEEE 26th International Conference on\n Intelligent Transportation Systems (ITSC)"},{"id":"http://arxiv.org/abs/2403.18674v1","updated":"2024-03-27T15:17:10Z","published":"2024-03-27T15:17:10Z","title":"Deep Learning for Robust and Explainable Models in Computer Vision","summary":" Recent breakthroughs in machine and deep learning (ML and DL) research have\nprovided excellent tools for leveraging enormous amounts of data and optimizing\nhuge models with millions of parameters to obtain accurate networks for image\nprocessing. These developments open up tremendous opportunities for using\nartificial intelligence (AI) in the automation and human assisted AI industry.\nHowever, as more and more models are deployed and used in practice, many\nchallenges have emerged. 
This thesis presents various approaches that address\nrobustness and explainability challenges for using ML and DL in practice.\n Robustness and reliability are the critical components of any model before\ncertification and deployment in practice. Deep convolutional neural networks\n(CNNs) exhibit vulnerability to transformations of their inputs, such as\nrotation and scaling, or intentional manipulations as described in the\nadversarial attack literature. In addition, building trust in AI-based models\nrequires a better understanding of current models and developing methods that\nare more explainable and interpretable a priori.\n This thesis presents developments in computer vision models' robustness and\nexplainability. Furthermore, this thesis offers an example of using vision\nmodels' feature response visualization (models' interpretations) to improve\nrobustness despite interpretability and robustness being seemingly unrelated in\nthe related research. Besides methodological developments for robust and\nexplainable vision models, a key message of this thesis is introducing model\ninterpretation techniques as a tool for understanding vision models and\nimproving their design and robustness. In addition to the theoretical\ndevelopments, this thesis demonstrates several applications of ML and DL in\ndifferent contexts, such as medical imaging and affective computing.\n","authors":["Mohammadreza Amirian"],"pdf_url":"https://arxiv.org/pdf/2403.18674v1.pdf","comment":"150 pages, 37 figures, 12 tables"},{"id":"http://arxiv.org/abs/2311.15803v3","updated":"2024-03-27T15:05:19Z","published":"2023-11-27T13:25:47Z","title":"SOAC: Spatio-Temporal Overlap-Aware Multi-Sensor Calibration using\n Neural Radiance Fields","summary":" In rapidly-evolving domains such as autonomous driving, the use of multiple\nsensors with different modalities is crucial to ensure high operational\nprecision and stability. To correctly exploit the provided information by each\nsensor in a single common frame, it is essential for these sensors to be\naccurately calibrated. In this paper, we leverage the ability of Neural\nRadiance Fields (NeRF) to represent different sensors modalities in a common\nvolumetric representation to achieve robust and accurate spatio-temporal sensor\ncalibration. By designing a partitioning approach based on the visible part of\nthe scene for each sensor, we formulate the calibration problem using only the\noverlapping areas. This strategy results in a more robust and accurate\ncalibration that is less prone to failure. We demonstrate that our approach\nworks on outdoor urban scenes by validating it on multiple established driving\ndatasets. Results show that our method is able to get better accuracy and\nrobustness compared to existing methods.\n","authors":["Quentin Herau","Nathan Piasco","Moussab Bennehar","Luis Roldão","Dzmitry Tsishkou","Cyrille Migniot","Pascal Vasseur","Cédric Demonceaux"],"pdf_url":"https://arxiv.org/pdf/2311.15803v3.pdf","comment":"Accepted at CVPR 2024. Project page: https://qherau.github.io/SOAC/"},{"id":"http://arxiv.org/abs/2403.18660v1","updated":"2024-03-27T15:03:38Z","published":"2024-03-27T15:03:38Z","title":"InstructBrush: Learning Attention-based Instruction Optimization for\n Image Editing","summary":" In recent years, instruction-based image editing methods have garnered\nsignificant attention in image editing. 
However, despite encompassing a wide\nrange of editing priors, these methods are helpless when handling editing tasks\nthat are challenging to accurately describe through language. We propose\nInstructBrush, an inversion method for instruction-based image editing methods\nto bridge this gap. It extracts editing effects from exemplar image pairs as\nediting instructions, which are further applied for image editing. Two key\ntechniques are introduced into InstructBrush, Attention-based Instruction\nOptimization and Transformation-oriented Instruction Initialization, to address\nthe limitations of the previous method in terms of inversion effects and\ninstruction generalization. To explore the ability of instruction inversion\nmethods to guide image editing in open scenarios, we establish a\nTransformationOriented Paired Benchmark (TOP-Bench), which contains a rich set\nof scenes and editing types. The creation of this benchmark paves the way for\nfurther exploration of instruction inversion. Quantitatively and qualitatively,\nour approach achieves superior performance in editing and is more semantically\nconsistent with the target editing effects.\n","authors":["Ruoyu Zhao","Qingnan Fan","Fei Kou","Shuai Qin","Hong Gu","Wei Wu","Pengcheng Xu","Mingrui Zhu","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2403.18660v1.pdf","comment":"Project Page: https://royzhao926.github.io/InstructBrush/"},{"id":"http://arxiv.org/abs/2311.12386v3","updated":"2024-03-27T15:01:44Z","published":"2023-11-21T06:55:21Z","title":"Point, Segment and Count: A Generalized Framework for Object Counting","summary":" Class-agnostic object counting aims to count all objects in an image with\nrespect to example boxes or class names, \\emph{a.k.a} few-shot and zero-shot\ncounting. In this paper, we propose a generalized framework for both few-shot\nand zero-shot object counting based on detection. Our framework combines the\nsuperior advantages of two foundation models without compromising their\nzero-shot capability: (\\textbf{i}) SAM to segment all possible objects as mask\nproposals, and (\\textbf{ii}) CLIP to classify proposals to obtain accurate\nobject counts. However, this strategy meets the obstacles of efficiency\noverhead and the small crowded objects that cannot be localized and\ndistinguished. To address these issues, our framework, termed PseCo, follows\nthree steps: point, segment, and count. Specifically, we first propose a\nclass-agnostic object localization to provide accurate but least point prompts\nfor SAM, which consequently not only reduces computation costs but also avoids\nmissing small objects. Furthermore, we propose a generalized object\nclassification that leverages CLIP image/text embeddings as the classifier,\nfollowing a hierarchical knowledge distillation to obtain discriminative\nclassifications among hierarchical mask proposals. Extensive experimental\nresults on FSC-147, COCO, and LVIS demonstrate that PseCo achieves\nstate-of-the-art performance in both few-shot/zero-shot object\ncounting/detection. Code: https://github.com/Hzzone/PseCo\n","authors":["Zhizhong Huang","Mingliang Dai","Yi Zhang","Junping Zhang","Hongming Shan"],"pdf_url":"https://arxiv.org/pdf/2311.12386v3.pdf","comment":"Accepted by CVPR 2024. 
Camera ready"},{"id":"http://arxiv.org/abs/2311.17532v3","updated":"2024-03-27T15:01:22Z","published":"2023-11-29T11:10:40Z","title":"Weakly-Supervised Emotion Transition Learning for Diverse 3D Co-speech\n Gesture Generation","summary":" Generating vivid and emotional 3D co-speech gestures is crucial for virtual\navatar animation in human-machine interaction applications. While the existing\nmethods enable generating the gestures to follow a single emotion label, they\noverlook that long gesture sequence modeling with emotion transition is more\npractical in real scenes. In addition, the lack of large-scale available\ndatasets with emotional transition speech and corresponding 3D human gestures\nalso limits the addressing of this task. To fulfill this goal, we first\nincorporate the ChatGPT-4 and an audio inpainting approach to construct the\nhigh-fidelity emotion transition human speeches. Considering obtaining the\nrealistic 3D pose annotations corresponding to the dynamically inpainted\nemotion transition audio is extremely difficult, we propose a novel weakly\nsupervised training strategy to encourage authority gesture transitions.\nSpecifically, to enhance the coordination of transition gestures w.r.t\ndifferent emotional ones, we model the temporal association representation\nbetween two different emotional gesture sequences as style guidance and infuse\nit into the transition generation. We further devise an emotion mixture\nmechanism that provides weak supervision based on a learnable mixed emotion\nlabel for transition gestures. Last, we present a keyframe sampler to supply\neffective initial posture cues in long sequences, enabling us to generate\ndiverse gestures. Extensive experiments demonstrate that our method outperforms\nthe state-of-the-art models constructed by adapting single emotion-conditioned\ncounterparts on our newly defined emotion transition task and datasets. Our\ncode and dataset will be released on the project page:\nhttps://xingqunqi-lab.github.io/Emo-Transition-Gesture/.\n","authors":["Xingqun Qi","Jiahao Pan","Peng Li","Ruibin Yuan","Xiaowei Chi","Mengfei Li","Wenhan Luo","Wei Xue","Shanghang Zhang","Qifeng Liu","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2311.17532v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18649v1","updated":"2024-03-27T14:56:44Z","published":"2024-03-27T14:56:44Z","title":"Addressing Data Annotation Challenges in Multiple Sensors: A Solution\n for Scania Collected Datasets","summary":" Data annotation in autonomous vehicles is a critical step in the development\nof Deep Neural Network (DNN) based models or the performance evaluation of the\nperception system. This often takes the form of adding 3D bounding boxes on\ntime-sequential and registered series of point-sets captured from active\nsensors like Light Detection and Ranging (LiDAR) and Radio Detection and\nRanging (RADAR). When annotating multiple active sensors, there is a need to\nmotion compensate and translate the points to a consistent coordinate frame and\ntimestamp respectively. However, highly dynamic objects pose a unique\nchallenge, as they can appear at different timestamps in each sensor's data.\nWithout knowing the speed of the objects, their position appears to be\ndifferent in different sensor outputs. Thus, even after motion compensation,\nhighly dynamic objects are not matched from multiple sensors in the same frame,\nand human annotators struggle to add unique bounding boxes that capture all\nobjects. 
This article focuses on addressing this challenge, primarily within\nthe context of Scania collected datasets. The proposed solution takes a track\nof an annotated object as input and uses the Moving Horizon Estimation (MHE) to\nrobustly estimate its speed. The estimated speed profile is utilized to correct\nthe position of the annotated box and add boxes to object clusters missed by\nthe original annotation.\n","authors":["Ajinkya Khoche","Aron Asefaw","Alejandro Gonzalez","Bogdan Timus","Sina Sharif Mansouri","Patric Jensfelt"],"pdf_url":"https://arxiv.org/pdf/2403.18649v1.pdf","comment":"Accepted to European Control Conference 2024"},{"id":"http://arxiv.org/abs/2403.18637v1","updated":"2024-03-27T14:42:08Z","published":"2024-03-27T14:42:08Z","title":"Transformers-based architectures for stroke segmentation: A review","summary":" Stroke remains a significant global health concern, necessitating precise and\nefficient diagnostic tools for timely intervention and improved patient\noutcomes. The emergence of deep learning methodologies has transformed the\nlandscape of medical image analysis. Recently, Transformers, initially designed\nfor natural language processing, have exhibited remarkable capabilities in\nvarious computer vision applications, including medical image analysis. This\ncomprehensive review aims to provide an in-depth exploration of the\ncutting-edge Transformer-based architectures applied in the context of stroke\nsegmentation. It commences with an exploration of stroke pathology, imaging\nmodalities, and the challenges associated with accurate diagnosis and\nsegmentation. Subsequently, the review delves into the fundamental ideas of\nTransformers, offering detailed insights into their architectural intricacies\nand the underlying mechanisms that empower them to effectively capture complex\nspatial information within medical images. The existing literature is\nsystematically categorized and analyzed, discussing various approaches that\nleverage Transformers for stroke segmentation. A critical assessment is\nprovided, highlighting the strengths and limitations of these methods,\nincluding considerations of performance and computational efficiency.\nAdditionally, this review explores potential avenues for future research and\ndevelopment.\n","authors":["Yalda Zafari-Ghadim","Essam A. Rashed","Mohamed Mabrok"],"pdf_url":"https://arxiv.org/pdf/2403.18637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.11041v3","updated":"2024-03-27T14:29:27Z","published":"2022-04-23T10:19:58Z","title":"Learning by Erasing: Conditional Entropy based Transferable\n Out-Of-Distribution Detection","summary":" Out-of-distribution (OOD) detection is essential to handle the distribution\nshifts between training and test scenarios. For a new in-distribution (ID)\ndataset, existing methods require retraining to capture the dataset-specific\nfeature representation or data distribution. In this paper, we propose a deep\ngenerative model (DGM) based transferable OOD detection method that does not\nrequire retraining on a new ID dataset. We design an image erasing strategy\nto equip each ID dataset with an exclusive conditional entropy distribution, which\ndetermines the discrepancy of the DGM's posterior uncertainty distribution on\ndifferent ID datasets. Owing to the powerful representation capacity of\nconvolutional neural networks, the proposed model trained on a complex dataset\ncan capture the above discrepancy between ID datasets without retraining and\nthus achieve transferable OOD detection. 
We validate the proposed method on\nfive datasets and verify that ours achieves comparable performance to the\nstate-of-the-art group-based OOD detection methods that need to be retrained to\ndeploy on new ID datasets. Our code is available at\nhttps://github.com/oOHCIOo/CETOOD.\n","authors":["Meng Xing","Zhiyong Feng","Yong Su","Changjae Oh"],"pdf_url":"https://arxiv.org/pdf/2204.11041v3.pdf","comment":"update new experimental results"},{"id":"http://arxiv.org/abs/2403.18605v1","updated":"2024-03-27T14:24:30Z","published":"2024-03-27T14:24:30Z","title":"FlexEdit: Flexible and Controllable Diffusion-based Object-centric Image\n Editing","summary":" Our work addresses limitations seen in previous approaches for object-centric\nediting problems, such as unrealistic results due to shape discrepancies and\nlimited control in object replacement or insertion. To this end, we introduce\nFlexEdit, a flexible and controllable editing framework for objects where we\niteratively adjust latents at each denoising step using our FlexEdit block.\nInitially, we optimize latents at test time to align with specified object\nconstraints. Then, our framework employs an adaptive mask, automatically\nextracted during denoising, to protect the background while seamlessly blending\nnew content into the target image. We demonstrate the versatility of FlexEdit\nin various object editing tasks and curate an evaluation test suite with\nsamples from both real and synthetic images, along with novel evaluation\nmetrics designed for object-centric editing. We conduct extensive experiments\non different editing scenarios, demonstrating the superiority of our editing\nframework over recent advanced text-guided image editing methods. Our project\npage is published at https://flex-edit.github.io/.\n","authors":["Trong-Tung Nguyen","Duc-Anh Nguyen","Anh Tran","Cuong Pham"],"pdf_url":"https://arxiv.org/pdf/2403.18605v1.pdf","comment":"Our project page: https://flex-edit.github.io/"},{"id":"http://arxiv.org/abs/2403.18600v1","updated":"2024-03-27T14:22:40Z","published":"2024-03-27T14:22:40Z","title":"RAP: Retrieval-Augmented Planner for Adaptive Procedure Planning in\n Instructional Videos","summary":" Procedure Planning in instructional videos entails generating a sequence of\naction steps based on visual observations of the initial and target states.\nDespite the rapid progress in this task, there remain several critical\nchallenges to be solved: (1) Adaptive procedures: Prior works hold an\nunrealistic assumption that the number of action steps is known and fixed,\nleading to non-generalizable models in real-world scenarios where the sequence\nlength varies. (2) Temporal relation: Understanding the step temporal relation\nknowledge is essential in producing reasonable and executable plans. (3)\nAnnotation cost: Annotating instructional videos with step-level labels (i.e.,\ntimestamp) or sequence-level labels (i.e., action category) is demanding and\nlabor-intensive, limiting its generalizability to large-scale datasets. In this\nwork, we propose a new and practical setting, called adaptive procedure\nplanning in instructional videos, where the procedure length is not fixed or\npre-determined. To address these challenges, we introduce the Retrieval-Augmented\nPlanner (RAP) model. Specifically, for adaptive procedures, RAP adaptively\ndetermines the conclusion of actions using an auto-regressive model\narchitecture. 
For temporal relation, RAP establishes an external memory module\nto explicitly retrieve the most relevant state-action pairs from the training\nvideos and revises the generated procedures. To tackle high annotation cost,\nRAP utilizes a weakly-supervised learning manner to expand the training dataset\nto other task-relevant, unannotated videos by generating pseudo labels for\naction steps. Experiments on CrossTask and COIN benchmarks show the superiority\nof RAP over traditional fixed-length models, establishing it as a strong\nbaseline solution for adaptive procedure planning.\n","authors":["Ali Zare","Yulei Niu","Hammad Ayyubi","Shih-fu Chang"],"pdf_url":"https://arxiv.org/pdf/2403.18600v1.pdf","comment":"23 pages, 6 figures, 12 tables"},{"id":"http://arxiv.org/abs/2403.18593v1","updated":"2024-03-27T14:18:09Z","published":"2024-03-27T14:18:09Z","title":"Homogeneous Tokenizer Matters: Homogeneous Visual Tokenizer for Remote\n Sensing Image Understanding","summary":" The tokenizer, as one of the fundamental components of large models, has long\nbeen overlooked or even misunderstood in visual tasks. One key factor of the\ngreat comprehension power of the large language model is that natural language\ntokenizers utilize meaningful words or subwords as the basic elements of\nlanguage. In contrast, mainstream visual tokenizers, represented by patch-based\nmethods such as Patch Embed, rely on meaningless rectangular patches as basic\nelements of vision, which cannot serve as effectively as words or subwords in\nlanguage. Starting from the essence of the tokenizer, we defined semantically\nindependent regions (SIRs) for vision. We designed a simple HOmogeneous visual\ntOKenizer: HOOK. HOOK mainly consists of two modules: the Object Perception\nModule (OPM) and the Object Vectorization Module (OVM). To achieve homogeneity,\nthe OPM splits the image into 4*4 pixel seeds and then utilizes the attention\nmechanism to perceive SIRs. The OVM employs cross-attention to merge seeds\nwithin the same SIR. To achieve adaptability, the OVM defines a variable number\nof learnable vectors as cross-attention queries, allowing for the adjustment of\ntoken quantity. We conducted experiments on the NWPU-RESISC45, WHU-RS19\nclassification dataset, and GID5 segmentation dataset for sparse and dense\ntasks. The results demonstrate that the visual tokens obtained by HOOK\ncorrespond to individual objects, which demonstrates homogeneity. HOOK\noutperformed Patch Embed by 6\\% and 10\\% in the two tasks and achieved\nstate-of-the-art performance compared to the baselines used for comparison.\nCompared to Patch Embed, which requires more than one hundred tokens for one\nimage, HOOK requires only 6 and 8 tokens for sparse and dense tasks,\nrespectively, resulting in efficiency improvements of 1.5 to 2.8 times. The\ncode is available at https://github.com/GeoX-Lab/Hook.\n","authors":["Run Shao","Zhaoyang Zhang","Chao Tao","Yunsheng Zhang","Chengli Peng","Haifeng Li"],"pdf_url":"https://arxiv.org/pdf/2403.18593v1.pdf","comment":"20 pages, 8 figures, 6 tables"},{"id":"http://arxiv.org/abs/2403.18589v1","updated":"2024-03-27T14:12:56Z","published":"2024-03-27T14:12:56Z","title":"Users prefer Jpegli over same-sized libjpeg-turbo or MozJPEG","summary":" We performed pairwise comparisons by human raters of JPEG images from\nMozJPEG, libjpeg-turbo and our new Jpegli encoder. 
When compressing images at a\nquality similar to libjpeg-turbo quality 95, the Jpegli images were 54% likely\nto be preferred over both libjpeg-turbo and MozJPEG images, but used only 2.8\nbits per pixel compared to libjpeg-turbo and MozJPEG that used 3.8 and 3.5 bits\nper pixel respectively. The raw ratings and source images are publicly\navailable for further analysis and study.\n","authors":["Martin Bruse","Luca Versari","Zoltan Szabadka","Jyrki Alakuijala"],"pdf_url":"https://arxiv.org/pdf/2403.18589v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18587v1","updated":"2024-03-27T14:11:23Z","published":"2024-03-27T14:11:23Z","title":"The Impact of Uniform Inputs on Activation Sparsity and Energy-Latency\n Attacks in Computer Vision","summary":" Resource efficiency plays an important role for machine learning nowadays.\nThe energy and decision latency are two critical aspects to ensure a\nsustainable and practical application. Unfortunately, the energy consumption\nand decision latency are not robust against adversaries. Researchers have\nrecently demonstrated that attackers can compute and submit so-called sponge\nexamples at inference time to increase the energy consumption and decision\nlatency of neural networks. In computer vision, the proposed strategy crafts\ninputs with less activation sparsity which could otherwise be used to\naccelerate the computation. In this paper, we analyze the mechanism how these\nenergy-latency attacks reduce activation sparsity. In particular, we find that\ninput uniformity is a key enabler. A uniform image, that is, an image with\nmostly flat, uniformly colored surfaces, triggers more activations due to a\nspecific interplay of convolution, batch normalization, and ReLU activation.\nBased on these insights, we propose two new simple, yet effective strategies\nfor crafting sponge examples: sampling images from a probability distribution\nand identifying dense, yet inconspicuous inputs in natural datasets. We\nempirically examine our findings in a comprehensive evaluation with multiple\nimage classification models and show that our attack achieves the same sparsity\neffect as prior sponge-example methods, but at a fraction of computation\neffort. We also show that our sponge examples transfer between different neural\nnetworks. Finally, we discuss applications of our findings for the good by\nimproving efficiency by increasing sparsity.\n","authors":["Andreas Müller","Erwin Quiring"],"pdf_url":"https://arxiv.org/pdf/2403.18587v1.pdf","comment":"Accepted at the DLSP 2024"},{"id":"http://arxiv.org/abs/2312.07264v2","updated":"2024-03-27T14:09:10Z","published":"2023-12-12T13:44:53Z","title":"Dual Structure-Aware Image Filterings for Semi-supervised Medical Image\n Segmentation","summary":" Semi-supervised image segmentation has attracted great attention recently.\nThe key is how to leverage unlabeled images in the training process. Most\nmethods maintain consistent predictions of the unlabeled images under\nvariations (e.g., adding noise/perturbations, or creating alternative versions)\nin the image and/or model level. In most image-level variation, medical images\noften have prior structure information, which has not been well explored. 
In\nthis paper, we propose novel dual structure-aware image filterings (DSAIF) as\nthe image-level variations for semi-supervised medical image segmentation.\nMotivated by connected filtering that simplifies image via filtering in\nstructure-aware tree-based image representation, we resort to the dual contrast\ninvariant Max-tree and Min-tree representation. Specifically, we propose a\nnovel connected filtering that removes topologically equivalent nodes (i.e.\nconnected components) having no siblings in the Max/Min-tree. This results in\ntwo filtered images preserving topologically critical structure. Applying the\nproposed DSAIF to mutually supervised networks decreases the consensus of their\nerroneous predictions on unlabeled images. This helps to alleviate the\nconfirmation bias issue of overfitting to noisy pseudo labels of unlabeled\nimages, and thus effectively improves the segmentation performance. Extensive\nexperimental results on three benchmark datasets demonstrate that the proposed\nmethod significantly/consistently outperforms some state-of-the-art methods.\nThe source codes will be publicly available.\n","authors":["Yuliang Gu","Zhichao Sun","Tian Chen","Xin Xiao","Yepeng Liu","Yongchao Xu","Laurent Najman"],"pdf_url":"https://arxiv.org/pdf/2312.07264v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18575v1","updated":"2024-03-27T13:56:08Z","published":"2024-03-27T13:56:08Z","title":"HandBooster: Boosting 3D Hand-Mesh Reconstruction by Conditional\n Synthesis and Sampling of Hand-Object Interactions","summary":" Reconstructing 3D hand mesh robustly from a single image is very challenging,\ndue to the lack of diversity in existing real-world datasets. While data\nsynthesis helps relieve the issue, the syn-to-real gap still hinders its usage.\nIn this work, we present HandBooster, a new approach to uplift the data\ndiversity and boost the 3D hand-mesh reconstruction performance by training a\nconditional generative space on hand-object interactions and purposely sampling\nthe space to synthesize effective data samples. First, we construct versatile\ncontent-aware conditions to guide a diffusion model to produce realistic images\nwith diverse hand appearances, poses, views, and backgrounds; favorably,\naccurate 3D annotations are obtained for free. Then, we design a novel\ncondition creator based on our similarity-aware distribution sampling\nstrategies to deliberately find novel and realistic interaction poses that are\ndistinctive from the training set. Equipped with our method, several baselines\ncan be significantly improved beyond the SOTA on the HO3D and DexYCB\nbenchmarks. Our code will be released on\nhttps://github.com/hxwork/HandBooster_Pytorch.\n","authors":["Hao Xu","Haipeng Li","Yinqiao Wang","Shuaicheng Liu","Chi-Wing Fu"],"pdf_url":"https://arxiv.org/pdf/2403.18575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07636v3","updated":"2024-03-27T13:51:59Z","published":"2024-03-12T13:18:22Z","title":"Decomposing Disease Descriptions for Enhanced Pathology Detection: A\n Multi-Aspect Vision-Language Pre-training Framework","summary":" Medical vision language pre-training (VLP) has emerged as a frontier of\nresearch, enabling zero-shot pathological recognition by comparing the query\nimage with the textual descriptions for each disease. Due to the complex\nsemantics of biomedical texts, current methods struggle to align medical images\nwith key pathological findings in unstructured reports. 
This leads to the\nmisalignment with the target disease's textual representation. In this paper,\nwe introduce a novel VLP framework designed to dissect disease descriptions\ninto their fundamental aspects, leveraging prior knowledge about the visual\nmanifestations of pathologies. This is achieved by consulting a large language\nmodel and medical experts. Integrating a Transformer module, our approach\naligns an input image with the diverse elements of a disease, generating\naspect-centric image representations. By consolidating the matches from each\naspect, we improve the compatibility between an image and its associated\ndisease. Additionally, capitalizing on the aspect-oriented representations, we\npresent a dual-head Transformer tailored to process known and unknown diseases,\noptimizing the comprehensive detection efficacy. Conducting experiments on\nseven downstream datasets, ours improves the accuracy of recent methods by up\nto 8.56% and 17.0% for seen and unseen categories, respectively. Our code is\nreleased at https://github.com/HieuPhan33/MAVL.\n","authors":["Vu Minh Hieu Phan","Yutong Xie","Yuankai Qi","Lingqiao Liu","Liyang Liu","Bowen Zhang","Zhibin Liao","Qi Wu","Minh-Son To","Johan W. Verjans"],"pdf_url":"https://arxiv.org/pdf/2403.07636v3.pdf","comment":"Accepted at CVPR2024. Pre-print before final camera-ready version"},{"id":"http://arxiv.org/abs/2403.18565v1","updated":"2024-03-27T13:46:01Z","published":"2024-03-27T13:46:01Z","title":"Artifact Reduction in 3D and 4D Cone-beam Computed Tomography Images\n with Deep Learning -- A Review","summary":" Deep learning based approaches have been used to improve image quality in\ncone-beam computed tomography (CBCT), a medical imaging technique often used in\napplications such as image-guided radiation therapy, implant dentistry or\northopaedics. In particular, while deep learning methods have been applied to\nreduce various types of CBCT image artifacts arising from motion, metal\nobjects, or low-dose acquisition, a comprehensive review summarizing the\nsuccesses and shortcomings of these approaches, with a primary focus on the\ntype of artifacts rather than the architecture of neural networks, is lacking\nin the literature. In this review, the data generation and simulation\npipelines, and artifact reduction techniques are specifically investigated for\neach type of artifact. We provide an overview of deep learning techniques that\nhave successfully been shown to reduce artifacts in 3D, as well as in\ntime-resolved (4D) CBCT through the use of projection- and/or volume-domain\noptimizations, or by introducing neural networks directly within the CBCT\nreconstruction algorithms. Research gaps are identified to suggest avenues for\nfuture exploration. One of the key findings of this work is an observed trend\ntowards the use of generative models including GANs and score-based or\ndiffusion models, accompanied with the need for more diverse and open training\ndatasets and simulations.\n","authors":["Mohammadreza Amirian","Daniel Barco","Ivo Herzig","Frank-Peter Schilling"],"pdf_url":"https://arxiv.org/pdf/2403.18565v1.pdf","comment":"16 pages, 4 figures, 1 Table, published in IEEE Access Journal"},{"id":"http://arxiv.org/abs/2403.09700v2","updated":"2024-03-27T13:42:25Z","published":"2024-03-05T22:19:21Z","title":"Shapley Values-Powered Framework for Fair Reward Split in Content\n Produced by GenAI","summary":" It is evident that, currently, generative models are surpassed in quality by\nhuman professionals. 
However, with the advancements in Artificial Intelligence,\nthis gap will narrow, leading to scenarios where individuals who have dedicated\nyears of their lives to mastering a skill become obsolete due to their high\ncosts, which are inherently linked to the time they require to complete a task\n-- a task that AI could accomplish in minutes or seconds. To avoid future\nsocial upheavals, we must, even now, contemplate how to fairly assess the\ncontributions of such individuals in training generative models and how to\ncompensate them for the reduction or complete loss of their incomes. In this\nwork, we propose a method to structure collaboration between model developers\nand data providers. To achieve this, we employ Shapley Values to quantify the\ncontribution of artist(s) in an image generated by the Stable Diffusion-v1.5\nmodel and to equitably allocate the reward among them.\n","authors":["Alex Glinsky","Alexey Sokolsky"],"pdf_url":"https://arxiv.org/pdf/2403.09700v2.pdf","comment":"36 pages, 32 figures"},{"id":"http://arxiv.org/abs/2403.18554v1","updated":"2024-03-27T13:33:14Z","published":"2024-03-27T13:33:14Z","title":"CosalPure: Learning Concept from Group Images for Robust Co-Saliency\n Detection","summary":" Co-salient object detection (CoSOD) aims to identify the common and salient\n(usually in the foreground) regions across a given group of images. Although\nachieving significant progress, state-of-the-art CoSODs could be easily\naffected by some adversarial perturbations, leading to substantial accuracy\nreduction. The adversarial perturbations can mislead CoSODs but do not change\nthe high-level semantic information (e.g., concept) of the co-salient objects.\nIn this paper, we propose a novel robustness enhancement framework by first\nlearning the concept of the co-salient objects based on the input group images\nand then leveraging this concept to purify adversarial perturbations, which are\nsubsequently fed to CoSODs for robustness enhancement. Specifically, we propose\nCosalPure containing two modules, i.e., group-image concept learning and\nconcept-guided diffusion purification. For the first module, we adopt a\npre-trained text-to-image diffusion model to learn the concept of co-salient\nobjects within group images where the learned concept is robust to adversarial\nexamples. For the second module, we map the adversarial image to the latent\nspace and then perform diffusion generation by embedding the learned concept\ninto the noise prediction function as an extra condition. Our method can\neffectively alleviate the influence of the SOTA adversarial attack containing\ndifferent adversarial patterns, including exposure and noise. The extensive\nresults demonstrate that our method could enhance the robustness of CoSODs\nsignificantly.\n","authors":["Jiayi Zhu","Qing Guo","Felix Juefei-Xu","Yihao Huang","Yang Liu","Geguang Pu"],"pdf_url":"https://arxiv.org/pdf/2403.18554v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2403.18551v1","updated":"2024-03-27T13:31:39Z","published":"2024-03-27T13:31:39Z","title":"Attention Calibration for Disentangled Text-to-Image Personalization","summary":" Recent thrilling progress in large-scale text-to-image (T2I) models has\nunlocked unprecedented synthesis quality of AI-generated content (AIGC)\nincluding image generation, 3D and video composition. Further, personalized\ntechniques enable appealing customized production of a novel concept given only\nseveral images as reference. 
However, an intriguing problem persists: Is it\npossible to capture multiple, novel concepts from one single reference image?\nIn this paper, we identify that existing approaches fail to preserve visual\nconsistency with the reference image and eliminate cross-influence from\nconcepts. To alleviate this, we propose an attention calibration mechanism to\nimprove the concept-level understanding of the T2I model. Specifically, we\nfirst introduce new learnable modifiers bound with classes to capture\nattributes of multiple concepts. Then, the classes are separated and\nstrengthened following the activation of the cross-attention operation,\nensuring comprehensive and self-contained concepts. Additionally, we suppress\nthe attention activation of different classes to mitigate mutual influence\namong concepts. Together, our proposed method, dubbed DisenDiff, can learn\ndisentangled multiple concepts from one single image and produce novel\ncustomized images with learned concepts. We demonstrate that our method\noutperforms the current state of the art in both qualitative and quantitative\nevaluations. More importantly, our proposed techniques are compatible with LoRA\nand inpainting pipelines, enabling more interactive experiences.\n","authors":["Yanbing Zhang","Mengping Yang","Qin Zhou","Zhe Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18551v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18550v1","updated":"2024-03-27T13:30:48Z","published":"2024-03-27T13:30:48Z","title":"OrCo: Towards Better Generalization via Orthogonality and Contrast for\n Few-Shot Class-Incremental Learning","summary":" Few-Shot Class-Incremental Learning (FSCIL) introduces a paradigm in which\nthe problem space expands with limited data. FSCIL methods inherently face the\nchallenge of catastrophic forgetting as data arrives incrementally, making\nmodels susceptible to overwriting previously acquired knowledge. Moreover,\ngiven the scarcity of labeled samples available at any given time, models may\nbe prone to overfitting and find it challenging to strike a balance between\nextensive pretraining and the limited incremental data. To address these\nchallenges, we propose the OrCo framework built on two core principles:\nfeatures' orthogonality in the representation space, and contrastive learning.\nIn particular, we improve the generalization of the embedding space by\nemploying a combination of supervised and self-supervised contrastive losses\nduring the pretraining phase. Additionally, we introduce OrCo loss to address\nchallenges arising from data limitations during incremental sessions. Through\nfeature space perturbations and orthogonality between classes, the OrCo loss\nmaximizes margins and reserves space for the following incremental data. This,\nin turn, ensures the accommodation of incoming classes in the feature space\nwithout compromising previously acquired knowledge. Our experimental results\nshowcase state-of-the-art performance across three benchmark datasets,\nincluding mini-ImageNet, CIFAR100, and CUB datasets. 
Code is available at\nhttps://github.com/noorahmedds/OrCo\n","authors":["Noor Ahmed","Anna Kukleva","Bernt Schiele"],"pdf_url":"https://arxiv.org/pdf/2403.18550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18548v1","updated":"2024-03-27T13:27:02Z","published":"2024-03-27T13:27:02Z","title":"A Semi-supervised Nighttime Dehazing Baseline with Spatial-Frequency\n Aware and Realistic Brightness Constraint","summary":" Existing research based on deep learning has extensively explored the problem\nof daytime image dehazing. However, few studies have considered the\ncharacteristics of nighttime hazy scenes. There are two distinctions between\nnighttime and daytime haze. First, there may be multiple active colored light\nsources with lower illumination intensity in nighttime scenes, which may cause\nhaze, glow and noise with localized, coupled and frequency inconsistent\ncharacteristics. Second, due to the domain discrepancy between simulated and\nreal-world data, unrealistic brightness may occur when applying a dehazing\nmodel trained on simulated data to real-world data. To address the above two\nissues, we propose a semi-supervised model for real-world nighttime dehazing.\nFirst, the spatial attention and frequency spectrum filtering are implemented\nas a spatial-frequency domain information interaction module to handle the\nfirst issue. Second, a pseudo-label-based retraining strategy and a local\nwindow-based brightness loss for semi-supervised training process is designed\nto suppress haze and glow while achieving realistic brightness. Experiments on\npublic benchmarks validate the effectiveness of the proposed method and its\nsuperiority over state-of-the-art methods. The source code and Supplementary\nMaterials are placed in the https://github.com/Xiaofeng-life/SFSNiD.\n","authors":["Xiaofeng Cong","Jie Gui","Jing Zhang","Junming Hou","Hao Shen"],"pdf_url":"https://arxiv.org/pdf/2403.18548v1.pdf","comment":"This paper is accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.18546v1","updated":"2024-03-27T13:24:58Z","published":"2024-03-27T13:24:58Z","title":"Efficient Heatmap-Guided 6-Dof Grasp Detection in Cluttered Scenes","summary":" Fast and robust object grasping in clutter is a crucial component of\nrobotics. Most current works resort to the whole observed point cloud for 6-Dof\ngrasp generation, ignoring the guidance information excavated from global\nsemantics, thus limiting high-quality grasp generation and real-time\nperformance. In this work, we show that the widely used heatmaps are\nunderestimated in the efficiency of 6-Dof grasp generation. Therefore, we\npropose an effective local grasp generator combined with grasp heatmaps as\nguidance, which infers in a global-to-local semantic-to-point way.\nSpecifically, Gaussian encoding and the grid-based strategy are applied to\npredict grasp heatmaps as guidance to aggregate local points into graspable\nregions and provide global semantic information. Further, a novel non-uniform\nanchor sampling mechanism is designed to improve grasp accuracy and diversity.\nBenefiting from the high-efficiency encoding in the image space and focusing on\npoints in local graspable regions, our framework can perform high-quality grasp\ndetection in real-time and achieve state-of-the-art results. In addition, real\nrobot experiments demonstrate the effectiveness of our method with a success\nrate of 94% and a clutter completion rate of 100%. 
Our code is available at\nhttps://github.com/THU-VCLab/HGGD.\n","authors":["Siang Chen","Wei Tang","Pengwei Xie","Wenming Yang","Guijin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18546v1.pdf","comment":"Extensive results on GraspNet-1B dataset"},{"id":"http://arxiv.org/abs/2310.15081v3","updated":"2024-03-27T13:23:28Z","published":"2023-10-23T16:41:13Z","title":"E4S: Fine-grained Face Swapping via Editing With Regional GAN Inversion","summary":" This paper proposes a novel approach to face swapping from the perspective of\nfine-grained facial editing, dubbed \"editing for swapping\" (E4S). The\ntraditional face swapping methods rely on global feature extraction and fail to\npreserve the detailed source identity. In contrast, we propose a Regional GAN\nInversion (RGI) method, which allows the explicit disentanglement of shape and\ntexture. Specifically, our E4S performs face swapping in the latent space of a\npretrained StyleGAN, where a multi-scale mask-guided encoder is applied to\nproject the texture of each facial component into regional style codes and a\nmask-guided injection module manipulating feature maps with the style codes.\nBased on this disentanglement, face swapping can be simplified as style and\nmask swapping. Besides, due to the large lighting condition gap, transferring\nthe source skin into the target image may lead to disharmony lighting. We\npropose a re-coloring network to make the swapped face maintain the target\nlighting condition while preserving the source skin. Further, to deal with the\npotential mismatch areas during mask exchange, we design a face inpainting\nmodule to refine the face shape. The extensive comparisons with\nstate-of-the-art methods demonstrate that our E4S outperforms existing methods\nin preserving texture, shape, and lighting. Our implementation is available at\nhttps://github.com/e4s2024/E4S2024.\n","authors":["Maomao Li","Ge Yuan","Cairong Wang","Zhian Liu","Yong Zhang","Yongwei Nie","Jue Wang","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2310.15081v3.pdf","comment":"Project Page: https://e4s2024.github.io/ ;. arXiv admin note: text\n overlap with arXiv:2211.14068"},{"id":"http://arxiv.org/abs/2403.18525v1","updated":"2024-03-27T12:59:44Z","published":"2024-03-27T12:59:44Z","title":"Language Plays a Pivotal Role in the Object-Attribute Compositional\n Generalization of CLIP","summary":" Vision-language models, such as CLIP, have shown promising\nOut-of-Distribution (OoD) generalization under various types of distribution\nshifts. Recent studies attempted to investigate the leading cause of this\ncapability. In this work, we follow the same path, but focus on a specific type\nof OoD data - images with novel compositions of attribute-object pairs - and\nstudy whether such models can successfully classify those images into\ncomposition classes. We carefully designed an authentic image test dataset\ncalled ImageNet-AO, consisting of attributes for objects that are unlikely\nencountered in the CLIP training sets. We found that CLIPs trained with large\ndatasets such as OpenAI CLIP, LAION-400M, and LAION-2B show orders-of-magnitude\nimprovement in effective compositional OoD generalization compared to both\nsupervised models and CLIPs trained with smaller datasets, such as CC-12M and\nYFCC-15M. 
Our results provide evidence that the scale and diversity of training\ndata and language supervision play a key role in unlocking the compositional\ngeneralization abilities of vision-language models.\n","authors":["Reza Abbasi","Mohammad Samiei","Mohammad Hossein Rohban","Mahdieh Soleymani Baghshah"],"pdf_url":"https://arxiv.org/pdf/2403.18525v1.pdf","comment":"Oral accepted at OODCV 2023(http://www.ood-cv.org)"},{"id":"http://arxiv.org/abs/2403.18514v1","updated":"2024-03-27T12:44:57Z","published":"2024-03-27T12:44:57Z","title":"CT-3DFlow : Leveraging 3D Normalizing Flows for Unsupervised Detection\n of Pathological Pulmonary CT scans","summary":" Unsupervised pathology detection can be implemented by training a model on\nhealthy data only and measuring the deviation from the training set upon\ninference, for example with CNN-based feature extraction and one-class\nclassifiers, or reconstruction-score-based methods such as AEs, GANs and\nDiffusion models. Normalizing Flows (NF) have the ability to directly learn the\nprobability distribution of training examples through an invertible\narchitecture. We leverage this property in a novel 3D NF-based model named\nCT-3DFlow, specifically tailored for patient-level pulmonary pathology\ndetection in chest CT data. Our model is trained unsupervised on healthy 3D\npulmonary CT patches, and detects deviations from its log-likelihood\ndistribution as anomalies. We aggregate patches-level likelihood values from a\npatient's CT scan to provide a patient-level 'normal'/'abnormal' prediction.\nOut-of-distribution detection performance is evaluated using expert annotations\non a separate chest CT test dataset, outperforming other state-of-the-art\nmethods.\n","authors":["Aissam Djahnine","Alexandre Popoff","Emilien Jupin-Delevaux","Vincent Cottin","Olivier Nempont","Loic Boussel"],"pdf_url":"https://arxiv.org/pdf/2403.18514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04344v3","updated":"2024-03-27T12:44:55Z","published":"2023-06-07T11:18:53Z","title":"ViDA: Homeostatic Visual Domain Adapter for Continual Test Time\n Adaptation","summary":" Since real-world machine systems are running in non-stationary environments,\nContinual Test-Time Adaptation (CTTA) task is proposed to adapt the pre-trained\nmodel to continually changing target domains. Recently, existing methods mainly\nfocus on model-based adaptation, which aims to leverage a self-training manner\nto extract the target domain knowledge. However, pseudo labels can be noisy and\nthe updated model parameters are unreliable under dynamic data distributions,\nleading to error accumulation and catastrophic forgetting in the continual\nadaptation process. To tackle these challenges and maintain the model\nplasticity, we design a Visual Domain Adapter (ViDA) for CTTA, explicitly\nhandling both domain-specific and domain-shared knowledge. Specifically, we\nfirst comprehensively explore the different domain representations of the\nadapters with trainable high-rank or low-rank embedding spaces. Then we inject\nViDAs into the pre-trained model, which leverages high-rank and low-rank\nfeatures to adapt the current domain distribution and maintain the continual\ndomain-shared knowledge, respectively. 
To exploit the low-rank and high-rank\nViDAs more effectively, we further propose a Homeostatic Knowledge Allotment\n(HKA) strategy, which adaptively combines different knowledge from each ViDA.\nExtensive experiments conducted on four widely used benchmarks demonstrate that\nour proposed method achieves state-of-the-art performance in both\nclassification and segmentation CTTA tasks. Note that our method can be\nregarded as a novel transfer paradigm for large-scale models, delivering\npromising results in adaptation to continually changing distributions. Project\npage: https://sites.google.com/view/iclr2024-vida/home.\n","authors":["Jiaming Liu","Senqiao Yang","Peidong Jia","Renrui Zhang","Ming Lu","Yandong Guo","Wei Xue","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.04344v3.pdf","comment":"Accepted by ICLR2024"},{"id":"http://arxiv.org/abs/2403.18512v1","updated":"2024-03-27T12:41:30Z","published":"2024-03-27T12:41:30Z","title":"ParCo: Part-Coordinating Text-to-Motion Synthesis","summary":" We study a challenging task: text-to-motion synthesis, aiming to generate\nmotions that align with textual descriptions and exhibit coordinated movements.\nCurrently, the part-based methods introduce part partition into the motion\nsynthesis process to achieve finer-grained generation. However, these methods\nencounter challenges such as the lack of coordination between different part\nmotions and difficulties for networks to understand part concepts. Moreover,\nintroducing finer-grained part concepts poses computational complexity\nchallenges. In this paper, we propose Part-Coordinating Text-to-Motion\nSynthesis (ParCo), endowed with enhanced capabilities for understanding part\nmotions and communication among different part motion generators, ensuring a\ncoordinated and fine-grained motion synthesis. Specifically, we discretize\nwhole-body motion into multiple part motions to establish the prior concept of\ndifferent parts. Afterward, we employ multiple lightweight generators designed\nto synthesize different part motions and coordinate them through our part\ncoordination module. Our approach demonstrates superior performance on common\nbenchmarks with economic computations, including HumanML3D and KIT-ML,\nproviding substantial evidence of its effectiveness. Code is available at\nhttps://github.com/qrzou/ParCo .\n","authors":["Qiran Zou","Shangyuan Yuan","Shian Du","Yu Wang","Chang Liu","Yi Xu","Jie Chen","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2403.18512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16516v2","updated":"2024-03-27T12:32:31Z","published":"2024-03-25T08:00:43Z","title":"Visually Guided Generative Text-Layout Pre-training for Document\n Intelligence","summary":" Prior study shows that pre-training techniques can boost the performance of\nvisual document understanding (VDU), which typically requires models to gain\nabilities to perceive and reason both document texts and layouts (e.g.,\nlocations of texts and table-cells). To this end, we propose visually guided\ngenerative text-layout pre-training, named ViTLP. Given a document image, the\nmodel optimizes hierarchical language and layout modeling objectives to\ngenerate the interleaved text and layout sequence. In addition, to address the\nlimitation of processing long documents by Transformers, we introduce a\nstraightforward yet effective multi-segment generative pre-training scheme,\nfacilitating ViTLP to process word-intensive documents of any length. 
ViTLP can\nfunction as a native OCR model to localize and recognize texts of document\nimages. Besides, ViTLP can be effectively applied to various downstream VDU\ntasks. Extensive experiments show that ViTLP achieves competitive performance\nover existing baselines on benchmark VDU tasks, including information\nextraction, document classification, and document question answering.\n","authors":["Zhiming Mao","Haoli Bai","Lu Hou","Jiansheng Wei","Xin Jiang","Qun Liu","Kam-Fai Wong"],"pdf_url":"https://arxiv.org/pdf/2403.16516v2.pdf","comment":"Accepted to NAACL 2024 main conference. The first version of this\n paper was submitted to OpenReview\n (https://openreview.net/forum?id=ARtBIBAmNR) in June 2023"},{"id":"http://arxiv.org/abs/2312.06358v2","updated":"2024-03-27T12:24:29Z","published":"2023-12-11T13:05:54Z","title":"Intraoperative 2D/3D Image Registration via Differentiable X-ray\n Rendering","summary":" Surgical decisions are informed by aligning rapid portable 2D intraoperative\nimages (e.g., X-rays) to a high-fidelity 3D preoperative reference scan (e.g.,\nCT). 2D/3D image registration often fails in practice: conventional\noptimization methods are prohibitively slow and susceptible to local minima,\nwhile neural networks trained on small datasets fail on new patients or require\nimpractical landmark supervision. We present DiffPose, a self-supervised\napproach that leverages patient-specific simulation and differentiable\nphysics-based rendering to achieve accurate 2D/3D registration without relying\non manually labeled data. Preoperatively, a CNN is trained to regress the pose\nof a randomly oriented synthetic X-ray rendered from the preoperative CT. The\nCNN then initializes rapid intraoperative test-time optimization that uses the\ndifferentiable X-ray renderer to refine the solution. Our work further proposes\nseveral geometrically principled methods for sampling camera poses from\n$\\mathbf{SE}(3)$, for sparse differentiable rendering, and for driving\nregistration in the tangent space $\\mathfrak{se}(3)$ with geodesic and\nmultiscale locality-sensitive losses. DiffPose achieves sub-millimeter accuracy\nacross surgical datasets at intraoperative speeds, improving upon existing\nunsupervised methods by an order of magnitude and even outperforming supervised\nbaselines. Our code is available at https://github.com/eigenvivek/DiffPose.\n","authors":["Vivek Gopalakrishnan","Neel Dey","Polina Golland"],"pdf_url":"https://arxiv.org/pdf/2312.06358v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18501v1","updated":"2024-03-27T12:24:20Z","published":"2024-03-27T12:24:20Z","title":"HEMIT: H&E to Multiplex-immunohistochemistry Image Translation with\n Dual-Branch Pix2pix Generator","summary":" Computational analysis of multiplexed immunofluorescence histology data is\nemerging as an important method for understanding the tumour micro-environment\nin cancer. This work presents HEMIT, a dataset designed for translating\nHematoxylin and Eosin (H&E) sections to multiplex-immunohistochemistry (mIHC)\nimages, featuring DAPI, CD3, and panCK markers. Distinctively, HEMIT's mIHC\nimages are multi-component and cellular-level aligned with H&E, enriching\nsupervised stain translation tasks. To our knowledge, HEMIT is the first\npublicly available cellular-level aligned dataset that enables H&E to\nmulti-target mIHC image translation. 
This dataset provides the computer vision\ncommunity with a valuable resource to develop novel computational methods which\nhave the potential to gain new insights from H&E slide archives.\n We also propose a new dual-branch generator architecture, using residual\nConvolutional Neural Networks (CNNs) and Swin Transformers which achieves\nbetter translation outcomes than other popular algorithms. When evaluated on\nHEMIT, it outperforms pix2pixHD, pix2pix, U-Net, and ResNet, achieving the\nhighest overall score on key metrics including the Structural Similarity Index\nMeasure (SSIM), Pearson correlation score (R), and Peak signal-to-noise Ratio\n(PSNR). Additionally, downstream analysis has been used to further validate the\nquality of the generated mIHC images. These results set a new benchmark in the\nfield of stain translation tasks.\n","authors":["Chang Bian","Beth Philips","Tim Cootes","Martin Fergie"],"pdf_url":"https://arxiv.org/pdf/2403.18501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04698v3","updated":"2024-03-27T12:24:17Z","published":"2023-11-08T14:10:19Z","title":"Challenging Common Paradigms in Multi-Task Learning","summary":" While multi-task learning (MTL) has gained significant attention in recent\nyears, its underlying mechanisms remain poorly understood. Recent methods did\nnot yield consistent performance improvements over single task learning (STL)\nbaselines, underscoring the importance of gaining more profound insights about\nchallenges specific to MTL. In our study, we challenge paradigms in MTL in the\ncontext of STL: First, the impact of the choice of optimizer has only been\nmildly investigated in MTL. We show the pivotal role of common STL tools such\nas the Adam optimizer in MTL empirically in various experiments. To further\ninvestigate Adam's effectiveness, we theoretically derive a partial loss-scale\ninvariance under mild assumptions. Second, the notion of gradient conflicts has\noften been phrased as a specific problem in MTL. We delve into the role of\ngradient conflicts in MTL and compare it to STL. For angular gradient alignment\nwe find no evidence that this is a unique problem in MTL. We emphasize\ndifferences in gradient magnitude as the main distinguishing factor. Lastly, we\ncompare the transferability of features learned through MTL and STL on common\nimage corruptions, and find light evidence that MTL can lead to superior\ntransferability. Overall, we find surprising similarities between STL and MTL\nsuggesting to consider methods from both fields in a broader context.\n","authors":["Cathrin Elich","Lukas Kirchdorfer","Jan M. Köhler","Lukas Schott"],"pdf_url":"https://arxiv.org/pdf/2311.04698v3.pdf","comment":"-"},{"id":"http://arxiv.org/abs/2403.18495v1","updated":"2024-03-27T12:15:22Z","published":"2024-03-27T12:15:22Z","title":"Direct mineral content prediction from drill core images via transfer\n learning","summary":" Deep subsurface exploration is important for mining, oil and gas industries,\nas well as in the assessment of geological units for the disposal of chemical\nor nuclear waste, or the viability of geothermal energy systems. Typically,\ndetailed examinations of subsurface formations or units are performed on\ncuttings or core materials extracted during drilling campaigns, as well as on\ngeophysical borehole data, which provide detailed information about the\npetrophysical properties of the rocks. 
Depending on the volume of rock samples\nand the analytical program, the laboratory analysis and diagnostics can be very\ntime-consuming. This study investigates the potential of utilizing machine\nlearning, specifically convolutional neural networks (CNN), to assess the\nlithology and mineral content solely from analysis of drill core images, aiming\nto support and expedite the subsurface geological exploration. The paper\noutlines a comprehensive methodology, encompassing data preprocessing, machine\nlearning methods, and transfer learning techniques. The outcome reveals a\nremarkable 96.7% accuracy in the classification of drill core segments into\ndistinct formation classes. Furthermore, a CNN model was trained for the\nevaluation of mineral content using a learning data set from multidimensional\nlog analysis data (silicate, total clay, carbonate). When benchmarked against\nlaboratory XRD measurements on samples from the cores, both the advanced\nmultidimensional log analysis model and the neural network approach developed\nhere provide equally good performance. This work demonstrates that deep\nlearning and particularly transfer learning can support extracting\npetrophysical properties, including mineral content and formation\nclassification, from drill core images, thus offering a road map for enhancing\nmodel performance and data set quality in image-based analysis of drill cores.\n","authors":["Romana Boiger","Sergey V. Churakov","Ignacio Ballester Llagaria","Georg Kosakowski","Raphael Wüst","Nikolaos I. Prasianakis"],"pdf_url":"https://arxiv.org/pdf/2403.18495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02203v5","updated":"2024-03-27T12:12:45Z","published":"2023-07-05T10:54:50Z","title":"Neural Fields for Interactive Visualization of Statistical Dependencies\n in 3D Simulation Ensembles","summary":" We present the first neural network that has learned to compactly represent\nand can efficiently reconstruct the statistical dependencies between the values\nof physical variables at different spatial locations in large 3D simulation\nensembles. Going beyond linear dependencies, we consider mutual information as\na measure of non-linear dependence. We demonstrate learning and reconstruction\nwith a large weather forecast ensemble comprising 1000 members, each storing\nmultiple physical variables at a 250 x 352 x 20 simulation grid. By\ncircumventing compute-intensive statistical estimators at runtime, we\ndemonstrate significantly reduced memory and computation requirements for\nreconstructing the major dependence structures. This enables embedding the\nestimator into a GPU-accelerated direct volume renderer and interactively\nvisualizing all mutual dependencies for a selected domain point.\n","authors":["Fatemeh Farokhmanesh","Kevin Höhlein","Christoph Neuhauser","Tobias Necker","Martin Weissmann","Takemasa Miyoshi","Rüdiger Westermann"],"pdf_url":"https://arxiv.org/pdf/2307.02203v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18493v1","updated":"2024-03-27T12:08:41Z","published":"2024-03-27T12:08:41Z","title":"VersaT2I: Improving Text-to-Image Models with Versatile Reward","summary":" Recent text-to-image (T2I) models have benefited from large-scale and\nhigh-quality data, demonstrating impressive performance. However, these T2I\nmodels still struggle to produce images that are aesthetically pleasing,\ngeometrically accurate, faithful to text, and of good low-level quality. 
We\npresent VersaT2I, a versatile training framework that can boost the performance\nwith multiple rewards of any T2I model. We decompose the quality of the image\ninto several aspects such as aesthetics, text-image alignment, geometry,\nlow-level quality, etc. Then, for every quality aspect, we select high-quality\nimages in this aspect generated by the model as the training set to finetune\nthe T2I model using the Low-Rank Adaptation (LoRA). Furthermore, we introduce a\ngating function to combine multiple quality aspects, which can avoid conflicts\nbetween different quality aspects. Our method is easy to extend and does not\nrequire any manual annotation, reinforcement learning, or model architecture\nchanges. Extensive experiments demonstrate that VersaT2I outperforms the\nbaseline methods across various quality criteria.\n","authors":["Jianshu Guo","Wenhao Chai","Jie Deng","Hsiang-Wei Huang","Tian Ye","Yichen Xu","Jiawei Zhang","Jenq-Neng Hwang","Gaoang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18493v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18490v1","updated":"2024-03-27T12:05:22Z","published":"2024-03-27T12:05:22Z","title":"I2CKD : Intra- and Inter-Class Knowledge Distillation for Semantic\n Segmentation","summary":" This paper proposes a new knowledge distillation method tailored for image\nsemantic segmentation, termed Intra- and Inter-Class Knowledge Distillation\n(I2CKD). The focus of this method is on capturing and transferring knowledge\nbetween the intermediate layers of teacher (cumbersome model) and student\n(compact model). For knowledge extraction, we exploit class prototypes derived\nfrom feature maps. To facilitate knowledge transfer, we employ a triplet loss\nin order to minimize intra-class variances and maximize inter-class variances\nbetween teacher and student prototypes. Consequently, I2CKD enables the student\nto better mimic the feature representation of the teacher for each class,\nthereby enhancing the segmentation performance of the compact network.\nExtensive experiments on three segmentation datasets, i.e., Cityscapes, Pascal\nVOC and CamVid, using various teacher-student network pairs demonstrate the\neffectiveness of the proposed method.\n","authors":["Ayoub Karine","Thibault Napoléon","Maher Jridi"],"pdf_url":"https://arxiv.org/pdf/2403.18490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16943v2","updated":"2024-03-27T11:46:36Z","published":"2023-12-28T10:40:11Z","title":"SAR-Net: Multi-scale Direction-aware SAR Network via Global Information\n Fusion","summary":" Deep learning has driven significant progress in object detection using\nSynthetic Aperture Radar (SAR) imagery. Existing methods, while achieving\npromising results, often struggle to effectively integrate local and global\ninformation, particularly direction-aware features. This paper proposes\nSAR-Net, a novel framework specifically designed for global fusion of\ndirection-aware information in SAR object detection. SAR-Net leverages two key\ninnovations: the Unity Compensation Mechanism (UCM) and the Direction-aware\nAttention Module (DAM). UCM facilitates the establishment of complementary\nrelationships among features across different scales, enabling efficient global\ninformation fusion. Among them, Multi-scale Alignment Module (MAM) and distinct\nMulti-level Fusion Module (MFM) enhance feature integration by capturing both\ntexture detail and semantic information. 
Then, Multi-feature Embedding Module\n(MEM) feeds back global features into the primary branches, further improving\ninformation transmission. Additionally, DAM, through bidirectional attention\npolymerization, captures direction-aware information, effectively eliminating\nbackground interference. Extensive experiments demonstrate the effectiveness of\nSAR-Net, achieving state-of-the-art results on aircraft (SAR-AIRcraft-1.0) and\nship datasets (SSDD, HRSID), confirming its generalization capability and\nrobustness.\n","authors":["Mingxiang Cao","Jie Lei","Weiying Xie","Jiaqing Zhang","Daixun Li","Yunsong Li"],"pdf_url":"https://arxiv.org/pdf/2312.16943v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18476v1","updated":"2024-03-27T11:45:08Z","published":"2024-03-27T11:45:08Z","title":"Modeling uncertainty for Gaussian Splatting","summary":" We present Stochastic Gaussian Splatting (SGS): the first framework for\nuncertainty estimation using Gaussian Splatting (GS). GS recently advanced the\nnovel-view synthesis field by achieving impressive reconstruction quality at a\nfraction of the computational cost of Neural Radiance Fields (NeRF). However,\ncontrary to the latter, it still lacks the ability to provide information about\nthe confidence associated with their outputs. To address this limitation, in\nthis paper, we introduce a Variational Inference-based approach that seamlessly\nintegrates uncertainty prediction into the common rendering pipeline of GS.\nAdditionally, we introduce the Area Under Sparsification Error (AUSE) as a new\nterm in the loss function, enabling optimization of uncertainty estimation\nalongside image reconstruction. Experimental results on the LLFF dataset\ndemonstrate that our method outperforms existing approaches in terms of both\nimage rendering quality and uncertainty estimation accuracy. Overall, our\nframework equips practitioners with valuable insights into the reliability of\nsynthesized views, facilitating safer decision-making in real-world\napplications.\n","authors":["Luca Savant","Diego Valsesia","Enrico Magli"],"pdf_url":"https://arxiv.org/pdf/2403.18476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12028v2","updated":"2024-03-27T11:43:28Z","published":"2023-11-20T18:59:51Z","title":"Hourglass Tokenizer for Efficient Transformer-Based 3D Human Pose\n Estimation","summary":" Transformers have been successfully applied in the field of video-based 3D\nhuman pose estimation. However, the high computational costs of these video\npose transformers (VPTs) make them impractical on resource-constrained devices.\nIn this paper, we present a plug-and-play pruning-and-recovering framework,\ncalled Hourglass Tokenizer (HoT), for efficient transformer-based 3D human pose\nestimation from videos. Our HoT begins with pruning pose tokens of redundant\nframes and ends with recovering full-length tokens, resulting in a few pose\ntokens in the intermediate transformer blocks and thus improving the model\nefficiency. To effectively achieve this, we propose a token pruning cluster\n(TPC) that dynamically selects a few representative tokens with high semantic\ndiversity while eliminating the redundancy of video frames. In addition, we\ndevelop a token recovering attention (TRA) to restore the detailed\nspatio-temporal information based on the selected tokens, thereby expanding the\nnetwork output to the original full-length temporal resolution for fast\ninference. 
Extensive experiments on two benchmark datasets (i.e., Human3.6M and\nMPI-INF-3DHP) demonstrate that our method can achieve both high efficiency and\nestimation accuracy compared to the original VPT models. For instance, applying\nto MotionBERT and MixSTE on Human3.6M, our HoT can save nearly 50% FLOPs\nwithout sacrificing accuracy and nearly 40% FLOPs with only 0.2% accuracy drop,\nrespectively. Code and models are available at\nhttps://github.com/NationalGAILab/HoT.\n","authors":["Wenhao Li","Mengyuan Liu","Hong Liu","Pichao Wang","Jialun Cai","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2311.12028v2.pdf","comment":"Accepted by CVPR 2024, Open Sourced"},{"id":"http://arxiv.org/abs/2403.18471v1","updated":"2024-03-27T11:32:44Z","published":"2024-03-27T11:32:44Z","title":"DiffusionFace: Towards a Comprehensive Dataset for Diffusion-Based Face\n Forgery Analysis","summary":" The rapid progress in deep learning has given rise to hyper-realistic facial\nforgery methods, leading to concerns related to misinformation and security\nrisks. Existing face forgery datasets have limitations in generating\nhigh-quality facial images and addressing the challenges posed by evolving\ngenerative techniques. To combat this, we present DiffusionFace, the first\ndiffusion-based face forgery dataset, covering various forgery categories,\nincluding unconditional and Text Guide facial image generation, Img2Img,\nInpaint, and Diffusion-based facial exchange algorithms. Our DiffusionFace\ndataset stands out with its extensive collection of 11 diffusion models and the\nhigh-quality of the generated images, providing essential metadata and a\nreal-world internet-sourced forgery facial image dataset for evaluation.\nAdditionally, we provide an in-depth analysis of the data and introduce\npractical evaluation protocols to rigorously assess discriminative models'\neffectiveness in detecting counterfeit facial images, aiming to enhance\nsecurity in facial image authentication processes. The dataset is available for\ndownload at \\url{https://github.com/Rapisurazurite/DiffFace}.\n","authors":["Zhongxi Chen","Ke Sun","Ziyin Zhou","Xianming Lin","Xiaoshuai Sun","Liujuan Cao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2403.18471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18469v1","updated":"2024-03-27T11:28:57Z","published":"2024-03-27T11:28:57Z","title":"Density-guided Translator Boosts Synthetic-to-Real Unsupervised Domain\n Adaptive Segmentation of 3D Point Clouds","summary":" 3D synthetic-to-real unsupervised domain adaptive segmentation is crucial to\nannotating new domains. Self-training is a competitive approach for this task,\nbut its performance is limited by different sensor sampling patterns (i.e.,\nvariations in point density) and incomplete training strategies. In this work,\nwe propose a density-guided translator (DGT), which translates point density\nbetween domains, and integrates it into a two-stage self-training pipeline\nnamed DGT-ST. First, in contrast to existing works that simultaneously conduct\ndata generation and feature/output alignment within unstable adversarial\ntraining, we employ the non-learnable DGT to bridge the domain gap at the input\nlevel. Second, to provide a well-initialized model for self-training, we\npropose a category-level adversarial network in stage one that utilizes the\nprototype to prevent negative transfer. 
Finally, by leveraging the designs\nabove, a domain-mixed self-training method with source-aware consistency loss\nis proposed in stage two to narrow the domain gap further. Experiments on two\nsynthetic-to-real segmentation tasks (SynLiDAR $\\rightarrow$ semanticKITTI and\nSynLiDAR $\\rightarrow$ semanticPOSS) demonstrate that DGT-ST outperforms\nstate-of-the-art methods, achieving 9.4$\\%$ and 4.3$\\%$ mIoU improvements,\nrespectively. Code is available at \\url{https://github.com/yuan-zm/DGT-ST}.\n","authors":["Zhimin Yuan","Wankang Zeng","Yanfei Su","Weiquan Liu","Ming Cheng","Yulan Guo","Cheng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18469v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2403.18468v1","updated":"2024-03-27T11:28:32Z","published":"2024-03-27T11:28:32Z","title":"Deep Learning Segmentation and Classification of Red Blood Cells Using a\n Large Multi-Scanner Dataset","summary":" Digital pathology has recently been revolutionized by advancements in\nartificial intelligence, deep learning, and high-performance computing. With\nits advanced tools, digital pathology can help improve and speed up the\ndiagnostic process, reduce human errors, and streamline the reporting step. In\nthis paper, we report a new large red blood cell (RBC) image dataset and\npropose a two-stage deep learning framework for RBC image segmentation and\nclassification. The dataset is a highly diverse dataset of more than 100K RBCs\ncontaining eight different classes. The dataset, which is considerably larger\nthan any publicly available hematopathology dataset, was labeled independently\nby two hematopathologists who also manually created masks for RBC cell\nsegmentation. Subsequently, in the proposed framework, first, a U-Net model was\ntrained to achieve automatic RBC image segmentation. Second, an EfficientNetB0\nmodel was trained to classify RBC images into one of the eight classes using a\ntransfer learning approach with a 5X2 cross-validation scheme. An IoU of 98.03%\nand an average classification accuracy of 96.5% were attained on the test set.\nMoreover, we have performed experimental comparisons against several prominent\nCNN models. These comparisons show the superiority of the proposed model with a\ngood balance between performance and computational cost.\n","authors":["Mohamed Elmanna","Ahmed Elsafty","Yomna Ahmed","Muhammad Rushdi","Ahmed Morsy"],"pdf_url":"https://arxiv.org/pdf/2403.18468v1.pdf","comment":"15 pages, 12 figures, 8 tables"},{"id":"http://arxiv.org/abs/2403.18461v1","updated":"2024-03-27T11:19:34Z","published":"2024-03-27T11:19:34Z","title":"DiffStyler: Diffusion-based Localized Image Style Transfer","summary":" Image style transfer aims to imbue digital imagery with the distinctive\nattributes of style targets, such as colors, brushstrokes, shapes, whilst\nconcurrently preserving the semantic integrity of the content. Despite the\nadvancements in arbitrary style transfer methods, a prevalent challenge remains\nthe delicate equilibrium between content semantics and style attributes. Recent\ndevelopments in large-scale text-to-image diffusion models have heralded\nunprecedented synthesis capabilities, albeit at the expense of relying on\nextensive and often imprecise textual descriptions to delineate artistic\nstyles. 
Addressing these limitations, this paper introduces DiffStyler, a novel\napproach that facilitates efficient and precise arbitrary image style transfer.\nThe core of DiffStyler lies in the utilization of a text-to-image Stable Diffusion model-based\nLoRA to encapsulate the essence of style targets. This approach, coupled with\nstrategic cross-LoRA feature and attention injection, guides the style transfer\nprocess. The foundation of our methodology is rooted in the observation that\nLoRA maintains the spatial feature consistency of UNet, a discovery that\nfurther inspired the development of a mask-wise style transfer technique. This\ntechnique employs masks extracted through a pre-trained FastSAM model,\nutilizing mask prompts to facilitate feature fusion during the denoising\nprocess, thereby enabling localized style transfer that preserves the original\nimage's unaffected regions. Moreover, our approach accommodates multiple style\ntargets through the use of corresponding masks. Through extensive\nexperimentation, we demonstrate that DiffStyler surpasses previous methods in\nachieving a more harmonious balance between content preservation and style\nintegration.\n","authors":["Shaoxu Li"],"pdf_url":"https://arxiv.org/pdf/2403.18461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10522v4","updated":"2024-03-27T11:18:51Z","published":"2023-11-17T13:43:43Z","title":"Enhancing Object Coherence in Layout-to-Image Synthesis","summary":" Layout-to-image synthesis is an emerging technique in conditional image\ngeneration. It aims to generate complex scenes, where users require fine\ncontrol over the layout of the objects in a scene. However, it remains\nchallenging to control the object coherence, including semantic coherence\n(e.g., the cat looks at the flowers or not) and physical coherence (e.g., the\nhand and the racket should not be misaligned). In this paper, we propose a\nnovel diffusion model with effective global semantic fusion (GSF) and\nself-similarity feature enhancement modules to guide the object coherence for\nthis task. For semantic coherence, we argue that the image caption contains\nrich information for defining the semantic relationship within the objects in\nthe images. Instead of simply employing cross-attention between captions and\ngenerated images, which addresses the highly relevant layout restriction and\nsemantic coherence separately and thus leads to unsatisfying results shown in\nour experiments, we develop GSF to fuse the supervision from the layout\nrestriction and semantic coherence requirement and exploit it to guide the\nimage synthesis process. Moreover, to improve the physical coherence, we\ndevelop a Self-similarity Coherence Attention (SCA) module to explicitly\nintegrate local contextual physical coherence into each pixel's generation\nprocess. 
Specifically, we adopt a self-similarity map to encode the coherence\nrestrictions and employ it to extract coherent features from text embedding.\nThrough visualization of our self-similarity map, we explore the essence of\nSCA, revealing that its effectiveness is not only in capturing reliable\nphysical coherence patterns but also in enhancing complex texture generation.\nExtensive experiments demonstrate the superiority of our proposed method in\nboth image generation quality and controllability.\n","authors":["Yibin Wang","Weizhong Zhang","Jianwei Zheng","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2311.10522v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18454v1","updated":"2024-03-27T11:13:20Z","published":"2024-03-27T11:13:20Z","title":"Scaling Vision-and-Language Navigation With Offline RL","summary":" The study of vision-and-language navigation (VLN) has typically relied on\nexpert trajectories, which may not always be available in real-world situations\ndue to the significant effort required to collect them. On the other hand,\nexisting approaches to training VLN agents that go beyond available expert data\ninvolve data augmentations or online exploration which can be tedious and\nrisky. In contrast, it is easy to access large repositories of suboptimal\noffline trajectories. Inspired by research in offline reinforcement learning\n(ORL), we introduce a new problem setup of VLN-ORL which studies VLN using\nsuboptimal demonstration data. We introduce a simple and effective\nreward-conditioned approach that can account for dataset suboptimality for\ntraining VLN agents, as well as benchmarks to evaluate progress and promote\nresearch in this area. We empirically study various noise models for\ncharacterizing dataset suboptimality among other unique challenges in VLN-ORL\nand instantiate it for the VLN$\\circlearrowright$BERT and MTVM architectures in\nthe R2R and RxR environments. Our experiments demonstrate that the proposed\nreward-conditioned approach leads to significant performance improvements, even\nin complex and intricate environments.\n","authors":["Valay Bundele","Mahesh Bhupati","Biplab Banerjee","Aditya Grover"],"pdf_url":"https://arxiv.org/pdf/2403.18454v1.pdf","comment":"Published in Transactions on Machine Learning Research (04/2024)"},{"id":"http://arxiv.org/abs/2403.18452v1","updated":"2024-03-27T11:11:08Z","published":"2024-03-27T11:11:08Z","title":"SingularTrajectory: Universal Trajectory Predictor Using Diffusion Model","summary":" There are five types of trajectory prediction tasks: deterministic,\nstochastic, domain adaptation, momentary observation, and few-shot. These\nassociated tasks are defined by various factors, such as the length of input\npaths, data split and pre-processing methods. Interestingly, even though they\ncommonly take sequential coordinates of observations as input and infer future\npaths in the same coordinates as output, designing specialized architectures\nfor each task is still necessary. For the other task, generality issues can\nlead to sub-optimal performances. In this paper, we propose SingularTrajectory,\na diffusion-based universal trajectory prediction framework to reduce the\nperformance gap across the five tasks. The core of SingularTrajectory is to\nunify a variety of human dynamics representations on the associated tasks. To\ndo this, we first build a Singular space to project all types of motion\npatterns from each task into one embedding space. We next propose an adaptive\nanchor working in the Singular space. 
Unlike traditional fixed anchor methods\nthat sometimes yield unacceptable paths, our adaptive anchor can correct\nanchors that are placed in the wrong location, based on a traversability map.\nFinally, we adopt a diffusion-based predictor to further enhance the prototype\npaths using a cascaded denoising process. Our unified framework ensures\ngenerality across various benchmark settings, such as input modality and\ntrajectory length. Extensive experiments on five public benchmarks demonstrate\nthat SingularTrajectory substantially outperforms existing models, highlighting\nits effectiveness in estimating general dynamics of human movements. Code is\npublicly available at https://github.com/inhwanbae/SingularTrajectory .\n","authors":["Inhwan Bae","Young-Jae Park","Hae-Gon Jeon"],"pdf_url":"https://arxiv.org/pdf/2403.18452v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18447v1","updated":"2024-03-27T11:06:44Z","published":"2024-03-27T11:06:44Z","title":"Can Language Beat Numerical Regression? Language-Based Multimodal\n Trajectory Prediction","summary":" Language models have demonstrated impressive ability in context understanding\nand generative performance. Inspired by the recent success of language\nfoundation models, in this paper, we propose LMTraj (Language-based Multimodal\nTrajectory predictor), which recasts the trajectory prediction task into a sort\nof question-answering problem. Departing from traditional numerical regression\nmodels, which treat the trajectory coordinate sequence as continuous signals,\nwe consider them as discrete signals like text prompts. Specifically, we first\ntransform an input space for the trajectory coordinate into the natural\nlanguage space. Here, the entire time-series trajectories of pedestrians are\nconverted into a text prompt, and scene images are described as text\ninformation through image captioning. The transformed numerical and image data\nare then wrapped into the question-answering template for use in a language\nmodel. Next, to guide the language model in understanding and reasoning about\nhigh-level knowledge, such as scene context and social relationships between\npedestrians, we introduce an auxiliary multi-task question and answering. We\nthen train a numerical tokenizer with the prompt data. We encourage the\ntokenizer to separate the integer and decimal parts well, and leverage it to\ncapture correlations between the consecutive numbers in the language model.\nLastly, we train the language model using the numerical tokenizer and all of\nthe question-answer prompts. Here, we propose a beam-search-based most-likely\nprediction and a temperature-based multimodal prediction to implement both\ndeterministic and stochastic inferences. Applying our LMTraj, we show that the\nlanguage-based model can be a powerful pedestrian trajectory predictor, and\noutperforms existing numerical-based predictor methods. 
Code is publicly\navailable at https://github.com/inhwanbae/LMTrajectory .\n","authors":["Inhwan Bae","Junoh Lee","Hae-Gon Jeon"],"pdf_url":"https://arxiv.org/pdf/2403.18447v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18443v1","updated":"2024-03-27T11:00:33Z","published":"2024-03-27T11:00:33Z","title":"$\\mathrm{F^2Depth}$: Self-supervised Indoor Monocular Depth Estimation\n via Optical Flow Consistency and Feature Map Synthesis","summary":" Self-supervised monocular depth estimation methods have received increasing\nattention due to the benefit of not requiring large, labelled datasets. Such\nself-supervised methods require high-quality salient features and consequently\nsuffer from a severe performance drop in indoor scenes, where the dominant\nlow-textured regions are almost indiscriminative. To\naddress the issue, we propose a self-supervised indoor monocular depth\nestimation framework called $\\mathrm{F^2Depth}$. A self-supervised optical flow\nestimation network is introduced to supervise depth learning. To improve\noptical flow estimation performance in low-textured areas, only some patches of\npoints with more discriminative features are adopted for finetuning based on\nour well-designed patch-based photometric loss. The finetuned optical flow\nestimation network generates high-accuracy optical flow as a supervisory signal\nfor depth estimation. Correspondingly, an optical flow consistency loss is\ndesigned. Multi-scale feature maps produced by the finetuned optical flow\nestimation network are warped to compute a feature map synthesis loss as\nanother supervisory signal for depth learning. Experimental results on the NYU\nDepth V2 dataset demonstrate the effectiveness of the framework and our\nproposed losses. To evaluate the generalization ability of our\n$\\mathrm{F^2Depth}$, we collect a Campus Indoor depth dataset composed of\napproximately 1500 points selected from 99 images in 18 scenes. Zero-shot\ngeneralization experiments on the 7-Scenes dataset and Campus Indoor achieve\n$\\delta_1$ accuracy of 75.8% and 76.0% respectively. The accuracy results show\nthat our model can generalize well to monocular images captured in unknown\nindoor scenes.\n","authors":["Xiaotong Guo","Huijie Zhao","Shuwei Shao","Xudong Li","Baochang Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.18443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.17126v2","updated":"2024-03-27T10:50:54Z","published":"2022-11-30T16:03:24Z","title":"BEVUDA: Multi-geometric Space Alignments for Domain Adaptive BEV 3D\n Object Detection","summary":" Vision-centric bird-eye-view (BEV) perception has shown promising potential\nin autonomous driving. Recent works mainly focus on improving efficiency or\naccuracy but neglect the challenges posed by changing environments, resulting\nin severe degradation of transfer performance. For BEV perception, we identify\nthe significant domain gaps existing in typical real-world cross-domain\nscenarios and comprehensively solve the Domain Adaptation (DA) problem for\nmulti-view 3D object detection. Since BEV perception approaches are complicated\nand contain several components, the domain shift accumulation on multiple\ngeometric spaces (i.e., 2D, 3D Voxel, BEV) makes BEV DA even more challenging.\nIn this paper, we propose a Multi-space Alignment Teacher-Student (MATS) framework\nto ease the domain shift accumulation, which consists of a Depth-Aware Teacher\n(DAT) and a Geometric-space Aligned Student (GAS) model. 
DAT tactfully combines\ntarget lidar and reliable depth prediction to construct depth-aware\ninformation, extracting target domain-specific knowledge in Voxel and BEV\nfeature spaces. It then transfers the sufficient domain knowledge of multiple\nspaces to the student model. In order to jointly alleviate the domain shift,\nGAS projects multi-geometric space features to a shared geometric embedding\nspace and decreases data distribution distance between two domains. To verify\nthe effectiveness of our method, we conduct BEV 3D object detection experiments\non three cross-domain scenarios and achieve state-of-the-art performance.\n","authors":["Jiaming Liu","Rongyu Zhang","Xiaoqi Li","Xiaowei Chi","Zehui Chen","Ming Lu","Yandong Guo","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2211.17126v2.pdf","comment":"Accepted by ICRA2024"},{"id":"http://arxiv.org/abs/2403.18442v1","updated":"2024-03-27T10:50:24Z","published":"2024-03-27T10:50:24Z","title":"Backpropagation-free Network for 3D Test-time Adaptation","summary":" Real-world systems often encounter new data over time, which leads to\nexperiencing target domain shifts. Existing Test-Time Adaptation (TTA) methods\ntend to apply computationally heavy and memory-intensive backpropagation-based\napproaches to handle this. Here, we propose a novel method that uses a\nbackpropagation-free approach for TTA for the specific case of 3D data. Our\nmodel uses a two-stream architecture to maintain knowledge about the source\ndomain as well as complementary target-domain-specific information. The\nbackpropagation-free property of our model helps address the well-known\nforgetting problem and mitigates the error accumulation issue. The proposed\nmethod also eliminates the need for the usually noisy process of\npseudo-labeling and reliance on costly self-supervised training. Moreover, our\nmethod leverages subspace learning, effectively reducing the distribution\nvariance between the two domains. Furthermore, the source-domain-specific and\nthe target-domain-specific streams are aligned using a novel entropy-based\nadaptive fusion strategy. Extensive experiments on popular benchmarks\ndemonstrate the effectiveness of our method. The code will be available at\nhttps://github.com/abie-e/BFTT3D.\n","authors":["Yanshuo Wang","Ali Cheraghian","Zeeshan Hayder","Jie Hong","Sameera Ramasinghe","Shafin Rahman","David Ahmedt-Aristizabal","Xuesong Li","Lars Petersson","Mehrtash Harandi"],"pdf_url":"https://arxiv.org/pdf/2403.18442v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2311.18113v2","updated":"2024-03-27T10:46:59Z","published":"2023-11-29T21:58:41Z","title":"Back to 3D: Few-Shot 3D Keypoint Detection with Back-Projected 2D\n Features","summary":" With the immense growth of dataset sizes and computing resources in recent\nyears, so-called foundation models have become popular in NLP and vision tasks.\nIn this work, we propose to explore foundation models for the task of keypoint\ndetection on 3D shapes. A unique characteristic of keypoint detection is that\nit requires semantic and geometric awareness while demanding high localization\naccuracy. To address this problem, we propose, first, to back-project features\nfrom large pre-trained 2D vision models onto 3D shapes and employ them for this\ntask. We show that we obtain robust 3D features that contain rich semantic\ninformation and analyze multiple candidate features stemming from different 2D\nfoundation models. 
Second, we employ a keypoint candidate optimization module\nwhich aims to match the average observed distribution of keypoints on the shape\nand is guided by the back-projected features. The resulting approach achieves a\nnew state of the art for few-shot keypoint detection on the KeyPointNet\ndataset, almost doubling the performance of the previous best methods.\n","authors":["Thomas Wimmer","Peter Wonka","Maks Ovsjanikov"],"pdf_url":"https://arxiv.org/pdf/2311.18113v2.pdf","comment":"Accepted to CVPR 2024, Project page:\n https://wimmerth.github.io/back-to-3d.html"},{"id":"http://arxiv.org/abs/2401.08742v2","updated":"2024-03-27T10:33:02Z","published":"2024-01-16T18:58:36Z","title":"Fast Dynamic 3D Object Generation from a Single-view Video","summary":" Generating dynamic 3D object from a single-view video is challenging due to\nthe lack of 4D labeled data. Extending image-to-3D pipelines by transferring\noff-the-shelf image generation models such as score distillation sampling,\nexisting methods tend to be slow and expensive to scale due to the need for\nback-propagating the information-limited supervision signals through a large\npretrained model. To address this, we propose an efficient video-to-4D object\ngeneration framework called Efficient4D. It generates high-quality\nspacetime-consistent images under different camera views, and then uses them as\nlabeled data to directly train a novel 4D Gaussian splatting model with\nexplicit point cloud geometry, enabling real-time rendering under continuous\ncamera trajectories. Extensive experiments on synthetic and real videos show\nthat Efficient4D offers a remarkable 20-fold increase in speed when compared to\nprior art alternatives while preserving the quality of novel view synthesis.\nFor example, Efficient4D takes only 6 mins to model a dynamic object, vs 120\nmins by Consistent4D.\n","authors":["Zijie Pan","Zeyu Yang","Xiatian Zhu","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.08742v2.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2403.18425v1","updated":"2024-03-27T10:26:42Z","published":"2024-03-27T10:26:42Z","title":"U-Sketch: An Efficient Approach for Sketch to Image Diffusion Models","summary":" Diffusion models have demonstrated remarkable performance in text-to-image\nsynthesis, producing realistic and high resolution images that faithfully\nadhere to the corresponding text-prompts. Despite their great success, they\nstill fall behind in sketch-to-image synthesis tasks, where in addition to\ntext-prompts, the spatial layout of the generated images has to closely follow\nthe outlines of certain reference sketches. Employing an MLP latent edge\npredictor to guide the spatial layout of the synthesized image by predicting\nedge maps at each denoising step has been recently proposed. Despite yielding\npromising results, the pixel-wise operation of the MLP does not take into\naccount the spatial layout as a whole, and demands numerous denoising\niterations to produce satisfactory images, leading to time inefficiency. To\nthis end, we introduce U-Sketch, a framework featuring a U-Net type latent edge\npredictor, which is capable of efficiently capturing both local and global\nfeatures, as well as spatial correlations between pixels. Moreover, we propose\nthe addition of a sketch simplification network that offers the user the choice\nof preprocessing and simplifying input sketches for enhanced outputs. 
The\nexperimental results, corroborated by user feedback, demonstrate that our\nproposed U-Net latent edge predictor leads to more realistic results, that are\nbetter aligned with the spatial outlines of the reference sketches, while\ndrastically reducing the number of required denoising steps and, consequently,\nthe overall execution time.\n","authors":["Ilias Mitsouras","Eleftherios Tsonis","Paraskevi Tzouveli","Athanasios Voulodimos"],"pdf_url":"https://arxiv.org/pdf/2403.18425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15098v2","updated":"2024-03-27T10:26:23Z","published":"2024-03-22T10:36:50Z","title":"UniTraj: A Unified Framework for Scalable Vehicle Trajectory Prediction","summary":" Vehicle trajectory prediction has increasingly relied on data-driven\nsolutions, but their ability to scale to different data domains and the impact\nof larger dataset sizes on their generalization remain under-explored. While\nthese questions can be studied by employing multiple datasets, it is\nchallenging due to several discrepancies, e.g., in data formats, map\nresolution, and semantic annotation types. To address these challenges, we\nintroduce UniTraj, a comprehensive framework that unifies various datasets,\nmodels, and evaluation criteria, presenting new opportunities for the vehicle\ntrajectory prediction field. In particular, using UniTraj, we conduct extensive\nexperiments and find that model performance significantly drops when\ntransferred to other datasets. However, enlarging data size and diversity can\nsubstantially improve performance, leading to a new state-of-the-art result for\nthe nuScenes dataset. We provide insights into dataset characteristics to\nexplain these findings. The code can be found here:\nhttps://github.com/vita-epfl/UniTraj\n","authors":["Lan Feng","Mohammadhossein Bahari","Kaouther Messaoud Ben Amor","Éloi Zablocki","Matthieu Cord","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2403.15098v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12359v2","updated":"2024-03-27T10:18:04Z","published":"2023-12-19T17:40:27Z","title":"CLIP-DINOiser: Teaching CLIP a few DINO tricks for open-vocabulary\n semantic segmentation","summary":" The popular CLIP model displays impressive zero-shot capabilities thanks to\nits seamless interaction with arbitrary text prompts. However, its lack of\nspatial awareness makes it unsuitable for dense computer vision tasks, e.g.,\nsemantic segmentation, without an additional fine-tuning step that often uses\nannotations and can potentially suppress its original open-vocabulary\nproperties. Meanwhile, self-supervised representation methods have demonstrated\ngood localization properties without human-made annotations nor explicit\nsupervision. In this work, we take the best of both worlds and propose an\nopen-vocabulary semantic segmentation method, which does not require any\nannotations. We propose to locally improve dense MaskCLIP features, which are\ncomputed with a simple modification of CLIP's last pooling layer, by\nintegrating localization priors extracted from self-supervised features. By\ndoing so, we greatly improve the performance of MaskCLIP and produce smooth\noutputs. Moreover, we show that the used self-supervised feature properties can\ndirectly be learnt from CLIP features. 
Our method CLIP-DINOiser needs only a\nsingle forward pass of CLIP and two light convolutional layers at inference, no\nextra supervision nor extra memory and reaches state-of-the-art results on\nchallenging and fine-grained benchmarks such as COCO, Pascal Context,\nCityscapes and ADE20k. The code to reproduce our results is available at\nhttps://github.com/wysoczanska/clip_dinoiser.\n","authors":["Monika Wysoczańska","Oriane Siméoni","Michaël Ramamonjisoa","Andrei Bursuc","Tomasz Trzciński","Patrick Pérez"],"pdf_url":"https://arxiv.org/pdf/2312.12359v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12480v2","updated":"2024-03-27T10:12:32Z","published":"2023-12-19T15:34:52Z","title":"Continual-MAE: Adaptive Distribution Masked Autoencoders for Continual\n Test-Time Adaptation","summary":" Continual Test-Time Adaptation (CTTA) is proposed to migrate a source\npre-trained model to continually changing target distributions, addressing\nreal-world dynamism. Existing CTTA methods mainly rely on entropy minimization\nor teacher-student pseudo-labeling schemes for knowledge extraction in\nunlabeled target domains. However, dynamic data distributions cause\nmiscalibrated predictions and noisy pseudo-labels in existing self-supervised\nlearning methods, hindering the effective mitigation of error accumulation and\ncatastrophic forgetting problems during the continual adaptation process. To\ntackle these issues, we propose a continual self-supervised method, Adaptive\nDistribution Masked Autoencoders (ADMA), which enhances the extraction of\ntarget domain knowledge while mitigating the accumulation of distribution\nshifts. Specifically, we propose a Distribution-aware Masking (DaM) mechanism\nto adaptively sample masked positions, followed by establishing consistency\nconstraints between the masked target samples and the original target samples.\nAdditionally, for masked tokens, we utilize an efficient decoder to reconstruct\na hand-crafted feature descriptor (e.g., Histograms of Oriented Gradients),\nleveraging its invariant properties to boost task-relevant representations.\nThrough conducting extensive experiments on four widely recognized benchmarks,\nour proposed method attains state-of-the-art performance in both classification\nand segmentation CTTA tasks. Our project page:\nhttps://sites.google.com/view/continual-mae/home.\n","authors":["Jiaming Liu","Ran Xu","Senqiao Yang","Renrui Zhang","Qizhe Zhang","Zehui Chen","Yandong Guo","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.12480v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.18417v1","updated":"2024-03-27T10:09:38Z","published":"2024-03-27T10:09:38Z","title":"ECNet: Effective Controllable Text-to-Image Diffusion Models","summary":" The conditional text-to-image diffusion models have garnered significant\nattention in recent years. However, the precision of these models is often\ncompromised mainly for two reasons, ambiguous condition input and inadequate\ncondition guidance over single denoising loss. To address the challenges, we\nintroduce two innovative solutions. Firstly, we propose a Spatial Guidance\nInjector (SGI) which enhances conditional detail by encoding text inputs with\nprecise annotation information. 
This method directly tackles the issue of\nambiguous control inputs by providing clear, annotated guidance to the model.\nSecondly, to overcome the issue of limited conditional supervision, we\nintroduce Diffusion Consistency Loss (DCL), which applies supervision on the\ndenoised latent code at any given time step. This encourages consistency\nbetween the latent code at each time step and the input signal, thereby\nenhancing the robustness and accuracy of the output. The combination of SGI and\nDCL results in our Effective Controllable Network (ECNet), which offers a more\naccurate controllable end-to-end text-to-image generation framework with a more\nprecise conditioning input and stronger controllable supervision. We validate\nour approach through extensive experiments on generation under various\nconditions, such as human body skeletons, facial landmarks, and sketches of\ngeneral objects. The results consistently demonstrate that our method\nsignificantly enhances the controllability and robustness of the generated\nimages, outperforming existing state-of-the-art controllable text-to-image\nmodels.\n","authors":["Sicheng Li","Keqiang Sun","Zhixin Lai","Xiaoshi Wu","Feng Qiu","Haoran Xie","Kazunori Miyata","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2403.18417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06075v2","updated":"2024-03-27T09:51:15Z","published":"2023-09-12T09:12:37Z","title":"A2V: A Semi-Supervised Domain Adaptation Framework for Brain Vessel\n Segmentation via Two-Phase Training Angiography-to-Venography Translation","summary":" We present a semi-supervised domain adaptation framework for brain vessel\nsegmentation from different image modalities. Existing state-of-the-art methods\nfocus on a single modality, despite the wide range of available cerebrovascular\nimaging techniques. This can lead to significant distribution shifts that\nnegatively impact the generalization across modalities. By relying on annotated\nangiographies and a limited number of annotated venographies, our framework\naccomplishes image-to-image translation and semantic segmentation, leveraging a\ndisentangled and semantically rich latent space to represent heterogeneous data\nand perform image-level adaptation from source to target domains. Moreover, we\nreduce the typical complexity of cycle-based architectures and minimize the use\nof adversarial training, which allows us to build an efficient and intuitive\nmodel with stable training. We evaluate our method on magnetic resonance\nangiographies and venographies. While achieving state-of-the-art performance in\nthe source domain, our method attains a Dice score coefficient in the target\ndomain that is only 8.9% lower, highlighting its promising potential for robust\ncerebrovascular image segmentation across different modalities.\n","authors":["Francesco Galati","Daniele Falcetta","Rosa Cortese","Barbara Casolla","Ferran Prados","Ninon Burgos","Maria A. Zuluaga"],"pdf_url":"https://arxiv.org/pdf/2309.06075v2.pdf","comment":"Accepted at the 34th British Machine Vision Conference (BMVC)"},{"id":"http://arxiv.org/abs/2403.18407v1","updated":"2024-03-27T09:49:37Z","published":"2024-03-27T09:49:37Z","title":"A Channel-ensemble Approach: Unbiased and Low-variance Pseudo-labels is\n Critical for Semi-supervised Classification","summary":" Semi-supervised learning (SSL) is a practical challenge in computer vision.\nPseudo-label (PL) methods, e.g., FixMatch and FreeMatch, obtain the State Of\nThe Art (SOTA) performances in SSL. 
These approaches employ a\nthreshold-to-pseudo-label (T2L) process to generate PLs by truncating the\nconfidence scores of unlabeled data predicted by the self-training method.\nHowever, self-trained models typically yield biased and high-variance\npredictions, especially in the scenarios when a little labeled data are\nsupplied. To address this issue, we propose a lightweight channel-based\nensemble method to effectively consolidate multiple inferior PLs into the\ntheoretically guaranteed unbiased and low-variance one. Importantly, our\napproach can be readily extended to any SSL framework, such as FixMatch or\nFreeMatch. Experimental results demonstrate that our method significantly\noutperforms state-of-the-art techniques on CIFAR10/100 in terms of\neffectiveness and efficiency.\n","authors":["Jiaqi Wu","Junbiao Pang","Baochang Zhang","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2403.18407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18406v1","updated":"2024-03-27T09:48:23Z","published":"2024-03-27T09:48:23Z","title":"An Image Grid Can Be Worth a Video: Zero-shot Video Question Answering\n Using a VLM","summary":" Stimulated by the sophisticated reasoning capabilities of recent Large\nLanguage Models (LLMs), a variety of strategies for bridging video modality\nhave been devised. A prominent strategy involves Video Language Models\n(VideoLMs), which train a learnable interface with video data to connect\nadvanced vision encoders with LLMs. Recently, an alternative strategy has\nsurfaced, employing readily available foundation models, such as VideoLMs and\nLLMs, across multiple stages for modality bridging. In this study, we introduce\na simple yet novel strategy where only a single Vision Language Model (VLM) is\nutilized. Our starting point is the plain insight that a video comprises a\nseries of images, or frames, interwoven with temporal information. The essence\nof video comprehension lies in adeptly managing the temporal aspects along with\nthe spatial details of each frame. Initially, we transform a video into a\nsingle composite image by arranging multiple frames in a grid layout. The\nresulting single image is termed as an image grid. This format, while\nmaintaining the appearance of a solitary image, effectively retains temporal\ninformation within the grid structure. Therefore, the image grid approach\nenables direct application of a single high-performance VLM without\nnecessitating any video-data training. Our extensive experimental analysis\nacross ten zero-shot video question answering benchmarks, including five\nopen-ended and five multiple-choice benchmarks, reveals that the proposed Image\nGrid Vision Language Model (IG-VLM) surpasses the existing methods in nine out\nof ten benchmarks.\n","authors":["Wonkyun Kim","Changin Choi","Wonseok Lee","Wonjong Rhee"],"pdf_url":"https://arxiv.org/pdf/2403.18406v1.pdf","comment":"Our code is available at https://github.com/imagegridworth/IG-VLM"},{"id":"http://arxiv.org/abs/2403.05262v2","updated":"2024-03-27T09:43:41Z","published":"2024-03-08T12:35:07Z","title":"Debiasing Multimodal Large Language Models","summary":" In the realms of computer vision and natural language processing, Large\nVision-Language Models (LVLMs) have become indispensable tools, proficient in\ngenerating textual descriptions based on visual inputs. 
Despite their\nadvancements, our investigation reveals a noteworthy bias in the generated\ncontent, where the output is primarily influenced by the underlying Large\nLanguage Models (LLMs) prior rather than the input image. Our empirical\nexperiments underscore the persistence of this bias, as LVLMs often provide\nconfident answers even in the absence of relevant images or given incongruent\nvisual input. To rectify these biases and redirect the model's focus toward\nvision information, we introduce two simple, training-free strategies. Firstly,\nfor tasks such as classification or multi-choice question-answering (QA), we\npropose a ``calibration'' step through affine transformation to adjust the\noutput distribution. This ``Post-Hoc debias'' approach ensures uniform scores\nfor each answer when the image is absent, serving as an effective\nregularization technique to alleviate the influence of LLM priors. For more\nintricate open-ended generation tasks, we extend this method to ``Debias\nsampling'', drawing inspirations from contrastive decoding methods.\nFurthermore, our investigation sheds light on the instability of LVLMs across\nvarious decoding configurations. Through systematic exploration of different\nsettings, we significantly enhance performance, surpassing reported results and\nraising concerns about the fairness of existing evaluations. Comprehensive\nexperiments substantiate the effectiveness of our proposed strategies in\nmitigating biases. These strategies not only prove beneficial in minimizing\nhallucinations but also contribute to the generation of more helpful and\nprecise illustrations.\n","authors":["Yi-Fan Zhang","Weichen Yu","Qingsong Wen","Xue Wang","Zhang Zhang","Liang Wang","Rong Jin","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2403.05262v2.pdf","comment":"38 pages, 17 figures"},{"id":"http://arxiv.org/abs/2401.01647v2","updated":"2024-03-27T09:39:41Z","published":"2024-01-03T09:46:43Z","title":"SIGNeRF: Scene Integrated Generation for Neural Radiance Fields","summary":" Advances in image diffusion models have recently led to notable improvements\nin the generation of high-quality images. In combination with Neural Radiance\nFields (NeRFs), they enabled new opportunities in 3D generation. However, most\ngenerative 3D approaches are object-centric and applying them to editing\nexisting photorealistic scenes is not trivial. We propose SIGNeRF, a novel\napproach for fast and controllable NeRF scene editing and scene-integrated\nobject generation. A new generative update strategy ensures 3D consistency\nacross the edited images, without requiring iterative optimization. We find\nthat depth-conditioned diffusion models inherently possess the capability to\ngenerate 3D consistent views by requesting a grid of images instead of single\nviews. Based on these insights, we introduce a multi-view reference sheet of\nmodified images. Our method updates an image collection consistently based on\nthe reference sheet and refines the original NeRF with the newly generated\nimage set in one go. 
By exploiting the depth conditioning mechanism of the\nimage diffusion model, we gain fine control over the spatial location of the\nedit and enforce shape guidance by a selected region or an external mesh.\n","authors":["Jan-Niklas Dihlmann","Andreas Engelhardt","Hendrik Lensch"],"pdf_url":"https://arxiv.org/pdf/2401.01647v2.pdf","comment":"Project Page: https://signerf.jdihlmann.com"},{"id":"http://arxiv.org/abs/2403.18397v1","updated":"2024-03-27T09:35:56Z","published":"2024-03-27T09:35:56Z","title":"Colour and Brush Stroke Pattern Recognition in Abstract Art using\n Modified Deep Convolutional Generative Adversarial Networks","summary":" Abstract art is an immensely popular and widely discussed form of art that\noften has the ability to depict the emotions of an artist. Many researchers have made\nattempts to study abstract art in the form of edge detection, brush stroke and\nemotion recognition algorithms using machine and deep learning. This paper\ndescribes the study of a wide distribution of abstract paintings using\nGenerative Adversarial Networks (GANs). GANs have the ability to learn and\nreproduce a distribution, enabling researchers and scientists to effectively\nexplore and study the generated image space. However, the challenge lies in\ndeveloping an efficient GAN architecture that overcomes common training\npitfalls. This paper addresses this challenge by introducing a modified-DCGAN\n(mDCGAN) specifically designed for high-quality artwork generation. The\napproach involves a thorough exploration of the modifications made, delving\ninto the intricate workings of DCGANs, optimisation techniques, and\nregularisation methods aimed at improving stability and realism in art\ngeneration, enabling effective study of generated patterns. The proposed mDCGAN\nincorporates meticulous adjustments in layer configurations and architectural\nchoices, offering tailored solutions to the unique demands of art generation\nwhile effectively combating issues like mode collapse and gradient vanishing.\nFurther, this paper explores the generated latent space by performing random\nwalks to understand the vector relationships between brush strokes and colours\nin the abstract art space, and presents a statistical analysis of unstable\noutputs after a certain period of GAN training to assess their significant\ndifferences. These findings validate the effectiveness of the proposed approach,\nemphasising its potential to revolutionise the field of digital art generation\nand the digital art ecosystem.\n","authors":["Srinitish Srinivasan","Varenya Pathak"],"pdf_url":"https://arxiv.org/pdf/2403.18397v1.pdf","comment":"28 pages, 5 tables, 7 figures"},{"id":"http://arxiv.org/abs/2403.11656v2","updated":"2024-03-27T09:34:44Z","published":"2024-03-18T10:53:00Z","title":"LocalStyleFool: Regional Video Style Transfer Attack Using Segment\n Anything Model","summary":" Previous work has shown that well-crafted adversarial perturbations can\nthreaten the security of video recognition systems. Attackers can invade such\nmodels with a low query budget when the perturbations are semantic-invariant,\nsuch as StyleFool. Despite the query efficiency, the naturalness of the minutia\nareas still requires amelioration, since StyleFool applies style transfer to\nall pixels in each frame. To close the gap, we propose LocalStyleFool, an\nimproved black-box video adversarial attack that superimposes regional\nstyle-transfer-based perturbations on videos. 
Benefiting from the popularity\nand scalable usability of the Segment Anything Model (SAM), we first extract\ndifferent regions according to semantic information and then track them through\nthe video stream to maintain temporal consistency. Then, we add\nstyle-transfer-based perturbations to several regions selected based on the\nassociative criterion of transfer-based gradient information and regional area.\nFine adjustment of the perturbations then follows to make the stylized videos adversarial.\nWe demonstrate that LocalStyleFool can improve both intra-frame and inter-frame\nnaturalness through a human-assessed survey, while maintaining a competitive\nfooling rate and query efficiency. Successful experiments on the\nhigh-resolution dataset also showcase that scrupulous segmentation of SAM helps\nto improve the scalability of adversarial attacks under high-resolution data.\n","authors":["Yuxin Cao","Jinghao Li","Xi Xiao","Derui Wang","Minhui Xue","Hao Ge","Wei Liu","Guangwu Hu"],"pdf_url":"https://arxiv.org/pdf/2403.11656v2.pdf","comment":"Accepted to 2024 IEEE Security and Privacy Workshops (SPW)"},{"id":"http://arxiv.org/abs/2403.18388v1","updated":"2024-03-27T09:25:20Z","published":"2024-03-27T09:25:20Z","title":"FTBC: Forward Temporal Bias Correction for Optimizing ANN-SNN Conversion","summary":" Spiking Neural Networks (SNNs) offer a promising avenue for energy-efficient\ncomputing compared with Artificial Neural Networks (ANNs), closely mirroring\nbiological neural processes. However, this potential comes with inherent\nchallenges in directly training SNNs through spatio-temporal backpropagation --\nstemming from the temporal dynamics of spiking neurons and their discrete\nsignal processing -- which necessitates alternative ways of training, most\nnotably through ANN-SNN conversion. In this work, we introduce a lightweight\nForward Temporal Bias Correction (FTBC) technique, aimed at enhancing\nconversion accuracy without the computational overhead. We ground our method on\ntheoretical findings that, through proper temporal bias calibration, the\nexpected error of ANN-SNN conversion can be reduced to zero after each time\nstep. We further propose a heuristic algorithm for finding the temporal bias\nonly in the forward pass, thus eliminating the computational burden of\nbackpropagation, and we evaluate our method on CIFAR-10/100 and ImageNet\ndatasets, achieving a notable increase in accuracy on all datasets. Codes are\nreleased at a GitHub repository.\n","authors":["Xiaofeng Wu","Velibor Bojkovic","Bin Gu","Kun Suo","Kai Zou"],"pdf_url":"https://arxiv.org/pdf/2403.18388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06733v3","updated":"2024-03-27T09:24:56Z","published":"2023-12-11T10:43:28Z","title":"TULIP: Transformer for Upsampling of LiDAR Point Cloud","summary":" LiDAR Upsampling is a challenging task for the perception systems of robots\nand autonomous vehicles, due to the sparse and irregular structure of\nlarge-scale scene contexts. Recent works propose to solve this problem by\nconverting LiDAR data from 3D Euclidean space into an image super-resolution\nproblem in 2D image space. Although their methods can generate high-resolution\nrange images with fine-grained details, the resulting 3D point clouds often\nblur out details and predict invalid points. In this paper, we propose TULIP, a\nnew method to reconstruct high-resolution LiDAR point clouds from\nlow-resolution LiDAR input. 
We also follow a range image-based approach but\nspecifically modify the patch and window geometries of a Swin-Transformer-based\nnetwork to better fit the characteristics of range images. We conducted several\nexperiments on three public real-world and simulated datasets. TULIP\noutperforms state-of-the-art methods in all relevant metrics and generates\nrobust and more realistic point clouds than prior works.\n","authors":["Bin Yang","Patrick Pfreundschuh","Roland Siegwart","Marco Hutter","Peyman Moghadam","Vaishakh Patil"],"pdf_url":"https://arxiv.org/pdf/2312.06733v3.pdf","comment":"The paper was accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.05218v2","updated":"2024-03-27T09:21:42Z","published":"2024-03-08T11:09:46Z","title":"3D Face Reconstruction Using A Spectral-Based Graph Convolution Encoder","summary":" Monocular 3D face reconstruction plays a crucial role in avatar generation,\nwith significant demand in web-related applications such as generating virtual\nfinancial advisors in FinTech. Current reconstruction methods predominantly\nrely on deep learning techniques and employ 2D self-supervision as a means to\nguide model learning. However, these methods encounter challenges in capturing\nthe comprehensive 3D structural information of the face due to the utilization\nof 2D images for model training purposes. To overcome this limitation and\nenhance the reconstruction of 3D structural features, we propose an innovative\napproach that integrates existing 2D features with 3D features to guide the\nmodel learning process. Specifically, we introduce the 3D-ID Loss, which\nleverages the high-dimensional structure features extracted from a\nSpectral-Based Graph Convolution Encoder applied to the facial mesh. This\napproach surpasses the sole reliance on the 3D information provided by the\nfacial mesh vertex coordinates. Our model is trained using 2D-3D data pairs\nfrom a combination of datasets and achieves state-of-the-art performance on the\nNoW benchmark.\n","authors":["Haoxin Xu","Zezheng Zhao","Yuxin Cao","Chunyu Chen","Hao Ge","Ziyao Liu"],"pdf_url":"https://arxiv.org/pdf/2403.05218v2.pdf","comment":"4 pages, 3 figures. Accepted to WWW 2024"},{"id":"http://arxiv.org/abs/2403.18383v1","updated":"2024-03-27T09:21:07Z","published":"2024-03-27T09:21:07Z","title":"Generative Multi-modal Models are Good Class-Incremental Learners","summary":" In class-incremental learning (CIL) scenarios, the phenomenon of catastrophic\nforgetting caused by the classifier's bias towards the current task has long\nposed a significant challenge. It is mainly caused by the characteristic of\ndiscriminative models. With the growing popularity of generative\nmulti-modal models, we explore replacing discriminative models with\ngenerative ones for CIL. However, transitioning from discriminative to\ngenerative models requires addressing two key challenges. The primary challenge\nlies in transferring the generated textual information into the classification\nof distinct categories. Additionally, it requires formulating the task of CIL\nwithin a generative framework. To this end, we propose a novel generative\nmulti-modal model (GMM) framework for class-incremental learning. Our approach\ndirectly generates labels for images using an adapted generative model. After\nobtaining the detailed text, we use a text encoder to extract text features and\nemploy feature matching to determine the most similar label as the\nclassification prediction. 
In the conventional CIL settings, we achieve\nsignificantly better results in long-sequence task scenarios. Under the\nFew-shot CIL setting, we have improved by at least 14\\% accuracy over all the\ncurrent state-of-the-art methods with significantly less forgetting. Our code\nis available at \\url{https://github.com/DoubleClass/GMM}.\n","authors":["Xusheng Cao","Haori Lu","Linlan Huang","Xialei Liu","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2403.18383v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2401.17879v2","updated":"2024-03-27T09:17:14Z","published":"2024-01-31T14:36:49Z","title":"AEROBLADE: Training-Free Detection of Latent Diffusion Images Using\n Autoencoder Reconstruction Error","summary":" With recent text-to-image models, anyone can generate deceptively realistic\nimages with arbitrary contents, fueling the growing threat of visual\ndisinformation. A key enabler for generating high-resolution images with low\ncomputational cost has been the development of latent diffusion models (LDMs).\nIn contrast to conventional diffusion models, LDMs perform the denoising\nprocess in the low-dimensional latent space of a pre-trained autoencoder (AE)\ninstead of the high-dimensional image space. Despite their relevance, the\nforensic analysis of LDMs is still in its infancy. In this work we propose\nAEROBLADE, a novel detection method which exploits an inherent component of\nLDMs: the AE used to transform images between image and latent space. We find\nthat generated images can be more accurately reconstructed by the AE than real\nimages, allowing for a simple detection approach based on the reconstruction\nerror. Most importantly, our method is easy to implement and does not require\nany training, yet nearly matches the performance of detectors that rely on\nextensive training. We empirically demonstrate that AEROBLADE is effective\nagainst state-of-the-art LDMs, including Stable Diffusion and Midjourney.\nBeyond detection, our approach allows for the qualitative analysis of images,\nwhich can be leveraged for identifying inpainted regions. We release our code\nand data at https://github.com/jonasricker/aeroblade .\n","authors":["Jonas Ricker","Denis Lukovnikov","Asja Fischer"],"pdf_url":"https://arxiv.org/pdf/2401.17879v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.00174v2","updated":"2024-03-27T09:13:19Z","published":"2024-02-29T22:58:13Z","title":"A citizen science toolkit to collect human perceptions of urban\n environments using open street view images","summary":" Street View-level Imagery (SVI) is a valuable data source for studies (e.g.,\nenvironmental assessments, green space identification or land cover\nclassification). While commercial SVI is available, such providers commonly\nrestrict copying or reuse in ways necessary for research. Open SVI datasets are\nreadily available from less restrictive sources, such as Mapillary, but due to\nthe heterogeneity of the images, these require substantial preprocessing,\nfiltering, and careful quality checks. We present an efficient method for\nautomated downloading, processing, cropping, and filtering open SVI, to be used\nin a survey of human perceptions of the streets portrayed in these images. We\ndemonstrate our open-source reusable SVI preparation and smartphone-friendly\nperception-survey software with Amsterdam (Netherlands) as the case study.\nUsing a citizen science approach, we collected from 331 people 22,637 ratings\nabout their perceptions for various criteria. 
We have published our software in\na public repository for future re-use and reproducibility.\n","authors":["Matthew Danish","SM Labib","Britta Ricker","Marco Helbich"],"pdf_url":"https://arxiv.org/pdf/2403.00174v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18373v1","updated":"2024-03-27T09:10:01Z","published":"2024-03-27T09:10:01Z","title":"BAM: Box Abstraction Monitors for Real-time OoD Detection in Object\n Detection","summary":" Out-of-distribution (OoD) detection techniques for deep neural networks\n(DNNs) become crucial thanks to their filtering of abnormal inputs, especially\nwhen DNNs are used in safety-critical applications and interact with an open\nand dynamic environment. Nevertheless, integrating OoD detection into\nstate-of-the-art (SOTA) object detection DNNs poses significant challenges,\npartly due to the complexity introduced by the SOTA OoD construction methods,\nwhich require the modification of DNN architecture and the introduction of\ncomplex loss functions. This paper proposes a simple, yet surprisingly\neffective, method that requires neither retraining nor architectural change in\nobject detection DNN, called Box Abstraction-based Monitors (BAM). The novelty\nof BAM stems from using a finite union of convex box abstractions to capture\nthe learned features of objects for in-distribution (ID) data, and an important\nobservation that features from OoD data are more likely to fall outside of\nthese boxes. The union of convex regions within the feature space allows the\nformation of non-convex and interpretable decision boundaries, overcoming the\nlimitations of VOS-like detectors without sacrificing real-time performance.\nExperiments integrating BAM into Faster R-CNN-based object detection DNNs\ndemonstrate a considerably improved performance against SOTA OoD detection\ntechniques.\n","authors":["Changshun Wu","Weicheng He","Chih-Hong Cheng","Xiaowei Huang","Saddek Bensalem"],"pdf_url":"https://arxiv.org/pdf/2403.18373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17905v2","updated":"2024-03-27T09:07:02Z","published":"2024-03-26T17:45:06Z","title":"Scalable Non-Cartesian Magnetic Resonance Imaging with R2D2","summary":" We propose a new approach for non-Cartesian magnetic resonance image\nreconstruction. While unrolled architectures provide robustness via\ndata-consistency layers, embedding measurement operators in Deep Neural Network\n(DNN) can become impractical at large scale. Alternative Plug-and-Play (PnP)\napproaches, where the denoising DNNs are blind to the measurement setting, are\nnot affected by this limitation and have also proven effective, but their\nhighly iterative nature also affects scalability. To address this scalability\nchallenge, we leverage the \"Residual-to-Residual DNN series for high-Dynamic\nrange imaging (R2D2)\" approach recently introduced in astronomical imaging.\nR2D2's reconstruction is formed as a series of residual images, iteratively\nestimated as outputs of DNNs taking the previous iteration's image estimate and\nassociated data residual as inputs. The method can be interpreted as a learned\nversion of the Matching Pursuit algorithm. We demonstrate R2D2 in simulation,\nconsidering radial k-space sampling acquisition sequences. 
Our preliminary\nresults suggest that R2D2 achieves: (i) suboptimal performance compared to its\nunrolled incarnation R2D2-Net, which is however non-scalable due to the\nnecessary embedding of NUFFT-based data-consistency layers; (ii) superior\nreconstruction quality to a scalable version of R2D2-Net embedding an FFT-based\napproximation for data consistency; (iii) superior reconstruction quality to\nPnP, while requiring only a few iterations.\n","authors":["Yiwei Chen","Chao Tang","Amir Aghabiglou","Chung San Chu","Yves Wiaux"],"pdf_url":"https://arxiv.org/pdf/2403.17905v2.pdf","comment":"submitted to IEEE EUSIPCO 2024"},{"id":"http://arxiv.org/abs/2403.18370v1","updated":"2024-03-27T09:06:36Z","published":"2024-03-27T09:06:36Z","title":"Ship in Sight: Diffusion Models for Ship-Image Super Resolution","summary":" In recent years, remarkable advancements have been achieved in the field of\nimage generation, primarily driven by the escalating demand for high-quality\noutcomes across various image generation subtasks, such as inpainting,\ndenoising, and super resolution. A major effort is devoted to exploring the\napplication of super-resolution techniques to enhance the quality of\nlow-resolution images. In this context, our method explores in depth the\nproblem of ship image super resolution, which is crucial for coastal and port\nsurveillance. We investigate the opportunity given by the growing interest in\ntext-to-image diffusion models, taking advantage of the prior knowledge that\nsuch foundation models have already learned. In particular, we present a\ndiffusion-model-based architecture that leverages text conditioning during\ntraining while being class-aware, to best preserve the crucial details of the\nships during the generation of the super-resolved image. Given the specificity\nof this task and the scarce availability of off-the-shelf data, we also\nintroduce a large labeled ship dataset scraped from online ship images, mostly\nfrom the ShipSpotting\\footnote{\\url{www.shipspotting.com}} website. Our method\nachieves more robust results than other deep learning models previously\nemployed for super resolution, as proven by the multiple experiments performed.\nMoreover, we investigate how this model can benefit downstream tasks, such as\nclassification and object detection, thus emphasizing practical implementation\nin a real-world scenario. Experimental results show the flexibility, reliability,\nand impressive performance of the proposed framework over state-of-the-art\nmethods for different tasks. The code is available at:\nhttps://github.com/LuigiSigillo/ShipinSight .\n","authors":["Luigi Sigillo","Riccardo Fosco Gramaccioni","Alessandro Nicolosi","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2403.18370v1.pdf","comment":"Accepted at 2024 International Joint Conference on Neural Networks\n (IJCNN)"},{"id":"http://arxiv.org/abs/2312.10114v2","updated":"2024-03-27T09:00:54Z","published":"2023-12-15T09:49:21Z","title":"FoMo-Bench: a multi-modal, multi-scale and multi-task Forest Monitoring\n Benchmark for remote sensing foundation models","summary":" Forests are an essential part of Earth's ecosystems and natural systems, as\nwell as providing services on which humanity depends, yet they are rapidly\nchanging as a result of land use decisions and climate change. 
Understanding\nand mitigating negative effects requires parsing data on forests at global\nscale from a broad array of sensory modalities, and recently many such problems\nhave been approached using machine learning algorithms for remote sensing. To\ndate, forest-monitoring problems have largely been addressed in isolation.\nInspired by the rise of foundation models for computer vision and remote\nsensing, we here present the first unified Forest Monitoring Benchmark\n(FoMo-Bench). FoMo-Bench consists of 15 diverse datasets encompassing\nsatellite, aerial, and inventory data, covering a variety of geographical\nregions, and including multispectral, red-green-blue, synthetic aperture radar\n(SAR) and LiDAR data with various temporal, spatial and spectral resolutions.\nFoMo-Bench includes multiple types of forest-monitoring tasks, spanning\nclassification, segmentation, and object detection. To further enhance the\ndiversity of tasks and geographies represented in FoMo-Bench, we introduce a\nnovel global dataset, TalloS, combining satellite imagery with ground-based\nannotations for tree species classification, encompassing 1,000+ categories\nacross multiple hierarchical taxonomic levels (species, genus, family).\nFinally, we propose FoMo-Net, a baseline foundation model with the capacity to\nprocess any combination of commonly used spectral bands in remote sensing,\nacross diverse ground sampling distances and geographical locations worldwide.\nThis work aims to inspire research collaborations between machine learning and\nforest biology researchers in exploring scalable multi-modal and multi-task\nmodels for forest monitoring. All code and data will be made publicly\navailable.\n","authors":["Nikolaos Ioannis Bountos","Arthur Ouaknine","David Rolnick"],"pdf_url":"https://arxiv.org/pdf/2312.10114v2.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2402.19473v2","updated":"2024-03-27T09:00:25Z","published":"2024-02-29T18:59:01Z","title":"Retrieval-Augmented Generation for AI-Generated Content: A Survey","summary":" The development of Artificial Intelligence Generated Content (AIGC) has been\nfacilitated by advancements in model algorithms, the increasing scale of\nfoundation models, and the availability of ample high-quality datasets. While\nAIGC has achieved remarkable performance, it still faces several challenges,\nsuch as the difficulty of maintaining up-to-date and long-tail knowledge, the\nrisk of data leakage, and the high costs associated with training and\ninference. Retrieval-Augmented Generation(RAG) has recently emerged as a\nparadigm to address such challenges. In particular, RAG introduces the\ninformation retrieval process, which enhances the generation process by\nretrieving relevant objects from available data stores, leading to higher\naccuracy and better robustness. In this paper, we comprehensively review\nexisting efforts that integrate RAG technique into AIGC scenarios. We first\nclassify RAG foundations according to how the retriever augments the generator,\ndistilling the fundamental abstractions of the augmentation methodologies for\nvarious retrievers and generators. This unified perspective encompasses all RAG\nscenarios, illuminating advancements and pivotal technologies that help with\npotential future progress. 
We also summarize additional enhancements methods\nfor RAG, facilitating effective engineering and implementation of RAG systems.\nThen from another view, we survey on practical applications of RAG across\ndifferent modalities and tasks, offering valuable references for researchers\nand practitioners. Furthermore, we introduce the benchmarks for RAG, discuss\nthe limitations of current RAG systems, and suggest potential directions for\nfuture research.Project Repo: https://github.com/hymie122/RAG-Survey.\n","authors":["Penghao Zhao","Hailin Zhang","Qinhan Yu","Zhengren Wang","Yunteng Geng","Fangcheng Fu","Ling Yang","Wentao Zhang","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2402.19473v2.pdf","comment":"Citing 380 papers, 36 pages, 16 figures. Project:\n https://github.com/hymie122/RAG-Survey"},{"id":"http://arxiv.org/abs/2310.03325v2","updated":"2024-03-27T08:54:35Z","published":"2023-10-05T05:41:21Z","title":"Learning Concept-Based Causal Transition and Symbolic Reasoning for\n Visual Planning","summary":" Visual planning simulates how humans make decisions to achieve desired goals\nin the form of searching for visual causal transitions between an initial\nvisual state and a final visual goal state. It has become increasingly\nimportant in egocentric vision with its advantages in guiding agents to perform\ndaily tasks in complex environments. In this paper, we propose an interpretable\nand generalizable visual planning framework consisting of i) a novel\nSubstitution-based Concept Learner (SCL) that abstracts visual inputs into\ndisentangled concept representations, ii) symbol abstraction and reasoning that\nperforms task planning via the self-learned symbols, and iii) a Visual Causal\nTransition model (ViCT) that grounds visual causal transitions to semantically\nsimilar real-world actions. Given an initial state, we perform goal-conditioned\nvisual planning with a symbolic reasoning method fueled by the learned\nrepresentations and causal transitions to reach the goal state. To verify the\neffectiveness of the proposed model, we collect a large-scale visual planning\ndataset based on AI2-THOR, dubbed as CCTP. Extensive experiments on this\nchallenging dataset demonstrate the superior performance of our method in\nvisual task planning. Empirically, we show that our framework can generalize to\nunseen task trajectories, unseen object categories, and real-world data.\nFurther details of this work are provided at\nhttps://fqyqc.github.io/ConTranPlan/.\n","authors":["Yilue Qian","Peiyu Yu","Ying Nian Wu","Yao Su","Wei Wang","Lifeng Fan"],"pdf_url":"https://arxiv.org/pdf/2310.03325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15837v2","updated":"2024-03-27T08:54:06Z","published":"2024-03-23T13:24:31Z","title":"Centered Masking for Language-Image Pre-Training","summary":" We introduce Gaussian masking for Language-Image Pre-Training (GLIP) a novel,\nstraightforward, and effective technique for masking image patches during\npre-training of a vision-language model. GLIP builds on Fast Language-Image\nPre-Training (FLIP), which randomly masks image patches while training a CLIP\nmodel. GLIP replaces random masking with centered masking, that uses a Gaussian\ndistribution and is inspired by the importance of image patches at the center\nof the image. GLIP retains the same computational savings as FLIP, while\nimproving performance across a range of downstream datasets and tasks, as\ndemonstrated by our experimental results. 
We show the benefits of GLIP to be\neasy to obtain, requiring no delicate tuning of the Gaussian, and also\napplicable to data sets containing images without an obvious center focus.\n","authors":["Mingliang Liang","Martha Larson"],"pdf_url":"https://arxiv.org/pdf/2403.15837v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18361v1","updated":"2024-03-27T08:53:13Z","published":"2024-03-27T08:53:13Z","title":"ViTAR: Vision Transformer with Any Resolution","summary":" his paper tackles a significant challenge faced by Vision Transformers\n(ViTs): their constrained scalability across different image resolutions.\nTypically, ViTs experience a performance decline when processing resolutions\ndifferent from those seen during training. Our work introduces two key\ninnovations to address this issue. Firstly, we propose a novel module for\ndynamic resolution adjustment, designed with a single Transformer block,\nspecifically to achieve highly efficient incremental token integration.\nSecondly, we introduce fuzzy positional encoding in the Vision Transformer to\nprovide consistent positional awareness across multiple resolutions, thereby\npreventing overfitting to any single training resolution. Our resulting model,\nViTAR (Vision Transformer with Any Resolution), demonstrates impressive\nadaptability, achieving 83.3\\% top-1 accuracy at a 1120x1120 resolution and\n80.4\\% accuracy at a 4032x4032 resolution, all while reducing computational\ncosts. ViTAR also shows strong performance in downstream tasks such as instance\nand semantic segmentation and can easily combined with self-supervised learning\ntechniques like Masked AutoEncoder. Our work provides a cost-effective solution\nfor enhancing the resolution scalability of ViTs, paving the way for more\nversatile and efficient high-resolution image processing.\n","authors":["Qihang Fan","Quanzeng You","Xiaotian Han","Yongfei Liu","Yunzhe Tao","Huaibo Huang","Ran He","Hongxia Yang"],"pdf_url":"https://arxiv.org/pdf/2403.18361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18360v1","updated":"2024-03-27T08:52:44Z","published":"2024-03-27T08:52:44Z","title":"Learning CNN on ViT: A Hybrid Model to Explicitly Class-specific\n Boundaries for Domain Adaptation","summary":" Most domain adaptation (DA) methods are based on either a convolutional\nneural networks (CNNs) or a vision transformers (ViTs). They align the\ndistribution differences between domains as encoders without considering their\nunique characteristics. For instance, ViT excels in accuracy due to its\nsuperior ability to capture global representations, while CNN has an advantage\nin capturing local representations. This fact has led us to design a hybrid\nmethod to fully take advantage of both ViT and CNN, called Explicitly\nClass-specific Boundaries (ECB). ECB learns CNN on ViT to combine their\ndistinct strengths. In particular, we leverage ViT's properties to explicitly\nfind class-specific decision boundaries by maximizing the discrepancy between\nthe outputs of the two classifiers to detect target samples far from the source\nsupport. In contrast, the CNN encoder clusters target features based on the\npreviously defined class-specific boundaries by minimizing the discrepancy\nbetween the probabilities of the two classifiers. Finally, ViT and CNN mutually\nexchange knowledge to improve the quality of pseudo labels and reduce the\nknowledge discrepancies of these models. 
Compared to conventional DA methods,\nour ECB achieves superior performance, which verifies its effectiveness in this\nhybrid model. The project website can be found\nhttps://dotrannhattuong.github.io/ECB/website/.\n","authors":["Ba Hung Ngo","Nhat-Tuong Do-Tran","Tuan-Ngoc Nguyen","Hae-Gon Jeon","Tae Jong Choi"],"pdf_url":"https://arxiv.org/pdf/2403.18360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18356v1","updated":"2024-03-27T08:48:47Z","published":"2024-03-27T08:48:47Z","title":"MonoHair: High-Fidelity Hair Modeling from a Monocular Video","summary":" Undoubtedly, high-fidelity 3D hair is crucial for achieving realism, artistic\nexpression, and immersion in computer graphics. While existing 3D hair modeling\nmethods have achieved impressive performance, the challenge of achieving\nhigh-quality hair reconstruction persists: they either require strict capture\nconditions, making practical applications difficult, or heavily rely on learned\nprior data, obscuring fine-grained details in images. To address these\nchallenges, we propose MonoHair,a generic framework to achieve high-fidelity\nhair reconstruction from a monocular video, without specific requirements for\nenvironments. Our approach bifurcates the hair modeling process into two main\nstages: precise exterior reconstruction and interior structure inference. The\nexterior is meticulously crafted using our Patch-based Multi-View Optimization\n(PMVO). This method strategically collects and integrates hair information from\nmultiple views, independent of prior data, to produce a high-fidelity exterior\n3D line map. This map not only captures intricate details but also facilitates\nthe inference of the hair's inner structure. For the interior, we employ a\ndata-driven, multi-view 3D hair reconstruction method. This method utilizes 2D\nstructural renderings derived from the reconstructed exterior, mirroring the\nsynthetic 2D inputs used during training. This alignment effectively bridges\nthe domain gap between our training data and real-world data, thereby enhancing\nthe accuracy and reliability of our interior structure inference. Lastly, we\ngenerate a strand model and resolve the directional ambiguity by our hair\ngrowth algorithm. Our experiments demonstrate that our method exhibits\nrobustness across diverse hairstyles and achieves state-of-the-art performance.\nFor more results, please refer to our project page\nhttps://keyuwu-cs.github.io/MonoHair/.\n","authors":["Keyu Wu","Lingchen Yang","Zhiyi Kuang","Yao Feng","Xutao Han","Yuefan Shen","Hongbo Fu","Kun Zhou","Youyi Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.18356v1.pdf","comment":"Accepted by IEEE CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18351v1","updated":"2024-03-27T08:42:47Z","published":"2024-03-27T08:42:47Z","title":"Generating Diverse Agricultural Data for Vision-Based Farming\n Applications","summary":" We present a specialized procedural model for generating synthetic\nagricultural scenes, focusing on soybean crops, along with various weeds. This\nmodel is capable of simulating distinct growth stages of these plants, diverse\nsoil conditions, and randomized field arrangements under varying lighting\nconditions. The integration of real-world textures and environmental factors\ninto the procedural generation process enhances the photorealism and\napplicability of the synthetic data. 
Our dataset includes 12,000 images with\nsemantic labels, offering a comprehensive resource for computer vision tasks in\nprecision agriculture, such as semantic segmentation for autonomous weed\ncontrol. We validate our model's effectiveness by comparing the synthetic data\nagainst real agricultural images, demonstrating its potential to significantly\naugment training data for machine learning models in agriculture. This approach\nnot only provides a cost-effective solution for generating high-quality,\ndiverse data but also addresses specific needs in agricultural vision tasks\nthat are not fully covered by general-purpose models.\n","authors":["Mikolaj Cieslak","Umabharathi Govindarajan","Alejandro Garcia","Anuradha Chandrashekar","Torsten Hädrich","Aleksander Mendoza-Drosik","Dominik L. Michels","Sören Pirk","Chia-Chun Fu","Wojciech Pałubicki"],"pdf_url":"https://arxiv.org/pdf/2403.18351v1.pdf","comment":"10 pages, 8 figures, 3 tables"},{"id":"http://arxiv.org/abs/2403.18347v1","updated":"2024-03-27T08:38:56Z","published":"2024-03-27T08:38:56Z","title":"A Quantum Fuzzy-based Approach for Real-Time Detection of Solar Coronal\n Holes","summary":" The detection and analysis of the solar coronal holes (CHs) is an important\nfield of study in the domain of solar physics. Mainly, it is required for the\nproper prediction of the geomagnetic storms which directly or indirectly affect\nvarious space and ground-based systems. For the detection of CHs till date, the\nsolar scientist depends on manual hand-drawn approaches. However, with the\nadvancement of image processing technologies, some automated image segmentation\nmethods have been used for the detection of CHs. In-spite of this, fast and\naccurate detection of CHs are till a major issues. Here in this work, a novel\nquantum computing-based fast fuzzy c-mean technique has been developed for fast\ndetection of the CHs region. The task has been carried out in two stages, in\nfirst stage the solar image has been segmented using a quantum computing based\nfast fuzzy c-mean (QCFFCM) and in the later stage the CHs has been extracted\nout from the segmented image based on image morphological operation. In the\nwork, quantum computing has been used to optimize the cost function of the fast\nfuzzy c-mean (FFCM) algorithm, where quantum approximate optimization algorithm\n(QAOA) has been used to optimize the quadratic part of the cost function. The\nproposed method has been tested for 193 \\AA{} SDO/AIA full-disk solar image\ndatasets and has been compared with the existing techniques. The outcome shows\nthe comparable performance of the proposed method with the existing one within\na very lesser time.\n","authors":["Sanmoy Bandyopadhyay","Suman Kundu"],"pdf_url":"https://arxiv.org/pdf/2403.18347v1.pdf","comment":"14 pages, 5 figures, 3 tables"},{"id":"http://arxiv.org/abs/2403.18346v1","updated":"2024-03-27T08:38:49Z","published":"2024-03-27T08:38:49Z","title":"Quantifying and Mitigating Unimodal Biases in Multimodal Large Language\n Models: A Causal Perspective","summary":" Recent advancements in Large Language Models (LLMs) have facilitated the\ndevelopment of Multimodal LLMs (MLLMs). Despite their impressive capabilities,\nMLLMs often suffer from an over-reliance on unimodal biases (e.g., language\nbias and vision bias), leading to incorrect answers in complex multimodal\ntasks. To investigate this issue, we propose a causal framework to interpret\nthe biases in Visual Question Answering (VQA) problems. 
Within our framework,\nwe devise a causal graph to elucidate the predictions of MLLMs on VQA problems,\nand assess the causal effect of biases through an in-depth causal analysis.\nMotivated by the causal graph, we introduce a novel MORE dataset, consisting of\n12,000 VQA instances. This dataset is designed to challenge MLLMs' abilities,\nnecessitating multi-hop reasoning and the surmounting of unimodal biases.\nFurthermore, we propose two strategies to mitigate unimodal biases and enhance\nMLLMs' reasoning capabilities, including a Decompose-Verify-Answer (DeVA)\nframework for limited-access MLLMs and the refinement of open-source MLLMs\nthrough fine-tuning. Extensive quantitative and qualitative experiments offer\nvaluable insights for future research.\n","authors":["Meiqi Chen","Yixin Cao","Yan Zhang","Chaochao Lu"],"pdf_url":"https://arxiv.org/pdf/2403.18346v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18342v1","updated":"2024-03-27T08:32:48Z","published":"2024-03-27T08:32:48Z","title":"Learning Inclusion Matching for Animation Paint Bucket Colorization","summary":" Colorizing line art is a pivotal task in the production of hand-drawn cel\nanimation. This typically involves digital painters using a paint bucket tool\nto manually color each segment enclosed by lines, based on RGB values\npredetermined by a color designer. This frame-by-frame process is both arduous\nand time-intensive. Current automated methods mainly focus on segment matching.\nThis technique migrates colors from a reference to the target frame by aligning\nfeatures within line-enclosed segments across frames. However, issues like\nocclusion and wrinkles in animations often disrupt these direct\ncorrespondences, leading to mismatches. In this work, we introduce a new\nlearning-based inclusion matching pipeline, which directs the network to\ncomprehend the inclusion relationships between segments rather than relying\nsolely on direct visual correspondences. Our method features a two-stage\npipeline that integrates a coarse color warping module with an inclusion\nmatching module, enabling more nuanced and accurate colorization. To facilitate\nthe training of our network, we also develope a unique dataset, referred to as\nPaintBucket-Character. This dataset includes rendered line arts alongside their\ncolorized counterparts, featuring various 3D characters. Extensive experiments\ndemonstrate the effectiveness and superiority of our method over existing\ntechniques.\n","authors":["Yuekun Dai","Shangchen Zhou","Qinyue Li","Chongyi Li","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2403.18342v1.pdf","comment":"accepted to CVPR 2024. Project Page:\n https://ykdai.github.io/projects/InclusionMatching"},{"id":"http://arxiv.org/abs/2403.18339v1","updated":"2024-03-27T08:28:14Z","published":"2024-03-27T08:28:14Z","title":"H2ASeg: Hierarchical Adaptive Interaction and Weighting Network for\n Tumor Segmentation in PET/CT Images","summary":" Positron emission tomography (PET) combined with computed tomography (CT)\nimaging is routinely used in cancer diagnosis and prognosis by providing\ncomplementary information. Automatically segmenting tumors in PET/CT images can\nsignificantly improve examination efficiency. Traditional multi-modal\nsegmentation solutions mainly rely on concatenation operations for modality\nfusion, which fail to effectively model the non-linear dependencies between PET\nand CT modalities. 
Recent studies have investigated various approaches to\noptimize the fusion of modality-specific features for enhancing joint\nrepresentations. However, modality-specific encoders used in these methods\noperate independently, inadequately leveraging the synergistic relationships\ninherent in PET and CT modalities, for example, the complementarity between\nsemantics and structure. To address these issues, we propose a Hierarchical\nAdaptive Interaction and Weighting Network termed H2ASeg to explore the\nintrinsic cross-modal correlations and transfer potential complementary\ninformation. Specifically, we design a Modality-Cooperative Spatial Attention\n(MCSA) module that performs intra- and inter-modal interactions globally and\nlocally. Additionally, a Target-Aware Modality Weighting (TAMW) module is\ndeveloped to highlight tumor-related features within multi-modal features,\nthereby refining tumor segmentation. By embedding these modules across\ndifferent layers, H2ASeg can hierarchically model cross-modal correlations,\nenabling a nuanced understanding of both semantic and structural tumor\nfeatures. Extensive experiments demonstrate the superiority of H2ASeg,\noutperforming state-of-the-art methods on AutoPet-II and Hecktor2022\nbenchmarks. The code is released at https://github.com/G14nTDo4/H2ASeg.\n","authors":["Jinpeng Lu","Jingyun Chen","Linghan Cai","Songhan Jiang","Yongbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.18339v1.pdf","comment":"10 pages,4 figures"},{"id":"http://arxiv.org/abs/2403.17301v2","updated":"2024-03-27T08:23:09Z","published":"2024-03-26T01:06:47Z","title":"Physical 3D Adversarial Attacks against Monocular Depth Estimation in\n Autonomous Driving","summary":" Deep learning-based monocular depth estimation (MDE), extensively applied in\nautonomous driving, is known to be vulnerable to adversarial attacks. Previous\nphysical attacks against MDE models rely on 2D adversarial patches, so they\nonly affect a small, localized region in the MDE map but fail under various\nviewpoints. To address these limitations, we propose 3D Depth Fool\n(3D$^2$Fool), the first 3D texture-based adversarial attack against MDE models.\n3D$^2$Fool is specifically optimized to generate 3D adversarial textures\nagnostic to model types of vehicles and to have improved robustness in bad\nweather conditions, such as rain and fog. Experimental results validate the\nsuperior performance of our 3D$^2$Fool across various scenarios, including\nvehicles, MDE models, weather conditions, and viewpoints. Real-world\nexperiments with printed 3D textures on physical vehicle models further\ndemonstrate that our 3D$^2$Fool can cause an MDE error of over 10 meters.\n","authors":["Junhao Zheng","Chenhao Lin","Jiahao Sun","Zhengyu Zhao","Qian Li","Chao Shen"],"pdf_url":"https://arxiv.org/pdf/2403.17301v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2306.02928v2","updated":"2024-03-27T08:21:17Z","published":"2023-06-05T14:45:38Z","title":"Weakly-Supervised Conditional Embedding for Referred Visual Search","summary":" This paper introduces a new challenge for image similarity search in the\ncontext of fashion, addressing the inherent ambiguity in this domain stemming\nfrom complex images. We present Referred Visual Search (RVS), a task allowing\nusers to define more precisely the desired similarity, following recent\ninterest in the industry. 
We release a new large public dataset,\nLAION-RVS-Fashion, consisting of 272k fashion products with 842k images\nextracted from LAION, designed explicitly for this task. However, unlike\ntraditional visual search methods in the industry, we demonstrate that superior\nperformance can be achieved by bypassing explicit object detection and adopting\nweakly-supervised conditional contrastive learning on image tuples. Our method\nis lightweight and demonstrates robustness, reaching Recall at one superior to\nstrong detection-based baselines against 2M distractors. Code, data and models\nare available at https://www.github.com/Simon-Lepage/CondViT-LRVSF .\n","authors":["Simon Lepage","Jérémie Mary","David Picard"],"pdf_url":"https://arxiv.org/pdf/2306.02928v2.pdf","comment":"28 pages, 13 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.18334v1","updated":"2024-03-27T08:16:33Z","published":"2024-03-27T08:16:33Z","title":"DODA: Diffusion for Object-detection Domain Adaptation in Agriculture","summary":" The diverse and high-quality content generated by recent generative models\ndemonstrates the great potential of using synthetic data to train downstream\nmodels. However, in vision, especially in objection detection, related areas\nare not fully explored, the synthetic images are merely used to balance the\nlong tails of existing datasets, and the accuracy of the generated labels is\nlow, the full potential of generative models has not been exploited. In this\npaper, we propose DODA, a data synthesizer that can generate high-quality\nobject detection data for new domains in agriculture. Specifically, we improve\nthe controllability of layout-to-image through encoding layout as an image,\nthereby improving the quality of labels, and use a visual encoder to provide\nvisual clues for the diffusion model to decouple visual features from the\ndiffusion model, and empowering the model the ability to generate data in new\ndomains. On the Global Wheat Head Detection (GWHD) Dataset, which is the\nlargest dataset in agriculture and contains diverse domains, using the data\nsynthesized by DODA improves the performance of the object detector by\n12.74-17.76 AP$_{50}$ in the domain that was significantly shifted from the\ntraining data.\n","authors":["Shuai Xiang","Pieter M. Blok","James Burridge","Haozhou Wang","Wei Guo"],"pdf_url":"https://arxiv.org/pdf/2403.18334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18330v1","updated":"2024-03-27T08:11:25Z","published":"2024-03-27T08:11:25Z","title":"Tracking-Assisted Object Detection with Event Cameras","summary":" Event-based object detection has recently garnered attention in the computer\nvision community due to the exceptional properties of event cameras, such as\nhigh dynamic range and no motion blur. However, feature asynchronism and\nsparsity cause invisible objects due to no relative motion to the camera,\nposing a significant challenge in the task. Prior works have studied various\nmemory mechanisms to preserve as many features as possible at the current time,\nguided by temporal clues. While these implicit-learned memories retain some\nshort-term information, they still struggle to preserve long-term features\neffectively. In this paper, we consider those invisible objects as\npseudo-occluded objects and aim to reveal their features. 
Firstly, we introduce\nvisibility attribute of objects and contribute an auto-labeling algorithm to\nappend additional visibility labels on an existing event camera dataset.\nSecondly, we exploit tracking strategies for pseudo-occluded objects to\nmaintain their permanence and retain their bounding boxes, even when features\nhave not been available for a very long time. These strategies can be treated\nas an explicit-learned memory guided by the tracking objective to record the\ndisplacements of objects across frames. Lastly, we propose a spatio-temporal\nfeature aggregation module to enrich the latent features and a consistency loss\nto increase the robustness of the overall pipeline. We conduct comprehensive\nexperiments to verify our method's effectiveness where still objects are\nretained but real occluded objects are discarded. The results demonstrate that\n(1) the additional visibility labels can assist in supervised training, and (2)\nour method outperforms state-of-the-art approaches with a significant\nimprovement of 7.9% absolute mAP.\n","authors":["Ting-Kang Yen","Igor Morawski","Shusil Dangi","Kai He","Chung-Yi Lin","Jia-Fong Yeh","Hung-Ting Su","Winston Hsu"],"pdf_url":"https://arxiv.org/pdf/2403.18330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18328v1","updated":"2024-03-27T08:09:04Z","published":"2024-03-27T08:09:04Z","title":"PIPNet3D: Interpretable Detection of Alzheimer in MRI Scans","summary":" Information from neuroimaging examinations (CT, MRI) is increasingly used to\nsupport diagnoses of dementia, e.g., Alzheimer's disease. While current\nclinical practice is mainly based on visual inspection and feature engineering,\nDeep Learning approaches can be used to automate the analysis and to discover\nnew image-biomarkers. Part-prototype neural networks (PP-NN) are an alternative\nto standard blackbox models, and have shown promising results in general\ncomputer vision. PP-NN's base their reasoning on prototypical image regions\nthat are learned fully unsupervised, and combined with a simple-to-understand\ndecision layer. We present PIPNet3D, a PP-NN for volumetric images. We apply\nPIPNet3D to the clinical case study of Alzheimer's Disease diagnosis from\nstructural Magnetic Resonance Imaging (sMRI). We assess the quality of\nprototypes under a systematic evaluation framework, propose new metrics to\nevaluate brain prototypes and perform an evaluation with domain experts. Our\nresults show that PIPNet3D is an interpretable, compact model for Alzheimer's\ndiagnosis with its reasoning well aligned to medical domain knowledge. Notably,\nPIPNet3D achieves the same accuracy as its blackbox counterpart; and removing\nthe remaining clinically irrelevant prototypes from its decision process does\nnot decrease predictive performance.\n","authors":["Lisa Anita De Santi","Jörg Schlötterer","Michael Scheschenja","Joel Wessendorf","Meike Nauta","Vincenzo Positano","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2403.18328v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10030v2","updated":"2024-03-27T07:52:10Z","published":"2024-03-15T05:30:29Z","title":"Multi-criteria Token Fusion with One-step-ahead Attention for Efficient\n Vision Transformers","summary":" Vision Transformer (ViT) has emerged as a prominent backbone for computer\nvision. For more efficient ViTs, recent works lessen the quadratic cost of the\nself-attention layer by pruning or fusing the redundant tokens. 
However, these\nworks faced the speed-accuracy trade-off caused by the loss of information.\nHere, we argue that token fusion needs to consider diverse relations between\ntokens to minimize information loss. In this paper, we propose a Multi-criteria\nToken Fusion (MCTF), that gradually fuses the tokens based on multi-criteria\n(e.g., similarity, informativeness, and size of fused tokens). Further, we\nutilize the one-step-ahead attention, which is the improved approach to capture\nthe informativeness of the tokens. By training the model equipped with MCTF\nusing a token reduction consistency, we achieve the best speed-accuracy\ntrade-off in the image classification (ImageNet1K). Experimental results prove\nthat MCTF consistently surpasses the previous reduction methods with and\nwithout training. Specifically, DeiT-T and DeiT-S with MCTF reduce FLOPs by\nabout 44% while improving the performance (+0.5%, and +0.3%) over the base\nmodel, respectively. We also demonstrate the applicability of MCTF in various\nVision Transformers (e.g., T2T-ViT, LV-ViT), achieving at least 31% speedup\nwithout performance degradation. Code is available at\nhttps://github.com/mlvlab/MCTF.\n","authors":["Sanghyeok Lee","Joonmyung Choi","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2403.10030v2.pdf","comment":"Conference on Computer Vision and Pattern Recognition (CVPR), 2024"},{"id":"http://arxiv.org/abs/2403.18321v1","updated":"2024-03-27T07:50:45Z","published":"2024-03-27T07:50:45Z","title":"Implementation of the Principal Component Analysis onto High-Performance\n Computer Facilities for Hyperspectral Dimensionality Reduction: Results and\n Comparisons","summary":" Dimensionality reduction represents a critical preprocessing step in order to\nincrease the efficiency and the performance of many hyperspectral imaging\nalgorithms. However, dimensionality reduction algorithms, such as the Principal\nComponent Analysis (PCA), suffer from their computationally demanding nature,\nbecoming advisable for their implementation onto high-performance computer\narchitectures for applications under strict latency constraints. This work\npresents the implementation of the PCA algorithm onto two different\nhigh-performance devices, namely, an NVIDIA Graphics Processing Unit (GPU) and\na Kalray manycore, uncovering a highly valuable set of tips and tricks in order\nto take full advantage of the inherent parallelism of these high-performance\ncomputing platforms, and hence, reducing the time that is required to process a\ngiven hyperspectral image. Moreover, the achieved results obtained with\ndifferent hyperspectral images have been compared with the ones that were\nobtained with a field programmable gate array (FPGA)-based implementation of\nthe PCA algorithm that has been recently published, providing, for the first\ntime in the literature, a comprehensive analysis in order to highlight the pros\nand cons of each option.\n","authors":["E. Martel","R. Lazcano","J. Lopez","D. Madroñal","R. Salvador","S. Lopez","E. Juarez","R. Guerra","C. Sanz","R. 
Sarmiento"],"pdf_url":"https://arxiv.org/pdf/2403.18321v1.pdf","comment":"30 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.18318v1","updated":"2024-03-27T07:40:51Z","published":"2024-03-27T07:40:51Z","title":"Uncertainty-Aware SAR ATR: Defending Against Adversarial Attacks via\n Bayesian Neural Networks","summary":" Adversarial attacks have demonstrated the vulnerability of Machine Learning\n(ML) image classifiers in Synthetic Aperture Radar (SAR) Automatic Target\nRecognition (ATR) systems. An adversarial attack can deceive the classifier\ninto making incorrect predictions by perturbing the input SAR images, for\nexample, with a few scatterers attached to the on-ground objects. Therefore, it\nis critical to develop robust SAR ATR systems that can detect potential\nadversarial attacks by leveraging the inherent uncertainty in ML classifiers,\nthereby effectively alerting human decision-makers. In this paper, we propose a\nnovel uncertainty-aware SAR ATR for detecting adversarial attacks.\nSpecifically, we leverage the capability of Bayesian Neural Networks (BNNs) in\nperforming image classification with quantified epistemic uncertainty to\nmeasure the confidence for each input SAR image. By evaluating the uncertainty,\nour method alerts when the input SAR image is likely to be adversarially\ngenerated. Simultaneously, we also generate visual explanations that reveal the\nspecific regions in the SAR image where the adversarial scatterers are likely\nto to be present, thus aiding human decision-making with hints of evidence of\nadversarial attacks. Experiments on the MSTAR dataset demonstrate that our\napproach can identify over 80% adversarial SAR images with fewer than 20% false\nalarms, and our visual explanations can identify up to over 90% of scatterers\nin an adversarial SAR image.\n","authors":["Tian Ye","Rajgopal Kannan","Viktor Prasanna","Carl Busart"],"pdf_url":"https://arxiv.org/pdf/2403.18318v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08251v2","updated":"2024-03-27T07:33:42Z","published":"2022-12-16T02:43:52Z","title":"Task-Adaptive Saliency Guidance for Exemplar-free Class Incremental\n Learning","summary":" Exemplar-free Class Incremental Learning (EFCIL) aims to sequentially learn\ntasks with access only to data from the current one. EFCIL is of interest\nbecause it mitigates concerns about privacy and long-term storage of data,\nwhile at the same time alleviating the problem of catastrophic forgetting in\nincremental learning. In this work, we introduce task-adaptive saliency for\nEFCIL and propose a new framework, which we call Task-Adaptive Saliency\nSupervision (TASS), for mitigating the negative effects of saliency drift\nbetween different tasks. We first apply boundary-guided saliency to maintain\ntask adaptivity and \\textit{plasticity} on model attention. Besides, we\nintroduce task-agnostic low-level signals as auxiliary supervision to increase\nthe \\textit{stability} of model attention. Finally, we introduce a module for\ninjecting and recovering saliency noise to increase the robustness of saliency\npreservation. Our experiments demonstrate that our method can better preserve\nsaliency maps across tasks and achieve state-of-the-art results on the\nCIFAR-100, Tiny-ImageNet, and ImageNet-Subset EFCIL benchmarks. Code is\navailable at \\url{https://github.com/scok30/tass}.\n","authors":["Xialei Liu","Jiang-Tian Zhai","Andrew D. 
Bagdanov","Ke Li","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2212.08251v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2307.09136v2","updated":"2024-03-27T07:16:28Z","published":"2023-07-18T10:34:21Z","title":"The Effects of Mixed Sample Data Augmentation are Class Dependent","summary":" Mixed Sample Data Augmentation (MSDA) techniques, such as Mixup, CutMix, and\nPuzzleMix, have been widely acknowledged for enhancing performance in a variety\nof tasks. A previous study reported the class dependency of traditional data\naugmentation (DA), where certain classes benefit disproportionately compared to\nothers. This paper reveals a class dependent effect of MSDA, where some classes\nexperience improved performance while others experience degraded performance.\nThis research addresses the issue of class dependency in MSDA and proposes an\nalgorithm to mitigate it. The approach involves training on a mixture of MSDA\nand non-MSDA data, which not only mitigates the negative impact on the affected\nclasses, but also improves overall accuracy. Furthermore, we provide in-depth\nanalysis and discussion of why MSDA introduced class dependencies and which\nclasses are most likely to have them.\n","authors":["Haeil Lee","Hansang Lee","Junmo Kim"],"pdf_url":"https://arxiv.org/pdf/2307.09136v2.pdf","comment":"21 pages, 18 figures, Overall Revision"},{"id":"http://arxiv.org/abs/2402.18920v5","updated":"2024-03-27T07:16:21Z","published":"2024-02-29T07:26:23Z","title":"Spectral Meets Spatial: Harmonising 3D Shape Matching and Interpolation","summary":" Although 3D shape matching and interpolation are highly interrelated, they\nare often studied separately and applied sequentially to relate different 3D\nshapes, thus resulting in sub-optimal performance. In this work we present a\nunified framework to predict both point-wise correspondences and shape\ninterpolation between 3D shapes. To this end, we combine the deep functional\nmap framework with classical surface deformation models to map shapes in both\nspectral and spatial domains. On the one hand, by incorporating spatial maps,\nour method obtains more accurate and smooth point-wise correspondences compared\nto previous functional map methods for shape matching. On the other hand, by\nintroducing spectral maps, our method gets rid of commonly used but\ncomputationally expensive geodesic distance constraints that are only valid for\nnear-isometric shape deformations. Furthermore, we propose a novel test-time\nadaptation scheme to capture both pose-dominant and shape-dominant\ndeformations. Using different challenging datasets, we demonstrate that our\nmethod outperforms previous state-of-the-art methods for both shape matching\nand interpolation, even compared to supervised approaches.\n","authors":["Dongliang Cao","Marvin Eisenberger","Nafie El Amrani","Daniel Cremers","Florian Bernard"],"pdf_url":"https://arxiv.org/pdf/2402.18920v5.pdf","comment":"accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2308.13356v3","updated":"2024-03-27T07:12:09Z","published":"2023-08-25T13:05:06Z","title":"CEIMVEN: An Approach of Cutting Edge Implementation of Modified Versions\n of EfficientNet (V1-V2) Architecture for Breast Cancer Detection and\n Classification from Ultrasound Images","summary":" Undoubtedly breast cancer identifies itself as one of the most widespread and\nterrifying cancers across the globe. Millions of women are getting affected\neach year from it. 
Breast cancer remains the major one for being the reason of\nlargest number of demise of women. In the recent time of research, Medical\nImage Computing and Processing has been playing a significant role for\ndetecting and classifying breast cancers from ultrasound images and mammograms,\nalong with the celestial touch of deep neural networks. In this research, we\nfocused mostly on our rigorous implementations and iterative result analysis of\ndifferent cutting-edge modified versions of EfficientNet architectures namely\nEfficientNet-V1 (b0-b7) and EfficientNet-V2 (b0-b3) with ultrasound image,\nnamed as CEIMVEN. We utilized transfer learning approach here for using the\npre-trained models of EfficientNet versions. We activated the hyper-parameter\ntuning procedures, added fully connected layers, discarded the unprecedented\noutliers and recorded the accuracy results from our custom modified\nEfficientNet architectures. Our deep learning model training approach was\nrelated to both identifying the cancer affected areas with region of interest\n(ROI) techniques and multiple classifications (benign, malignant and normal).\nThe approximate testing accuracies we got from the modified versions of\nEfficientNet-V1 (b0- 99.15%, b1- 98.58%, b2- 98.43%, b3- 98.01%, b4- 98.86%,\nb5- 97.72%, b6- 97.72%, b7- 98.72%) and EfficientNet-V2 (b0- 99.29%, b1-\n99.01%, b2- 98.72%, b3- 99.43%) are showing very bright future and strong\npotentials of deep learning approach for the successful detection and\nclassification of breast cancers from the ultrasound images at a very early\nstage. The code for this research is available here:\nhttps://github.com/ac005sheekar/CEIMVEN-Breast.\n","authors":["Sheekar Banerjee","Md. Kamrul Hasan Monir"],"pdf_url":"https://arxiv.org/pdf/2308.13356v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18301v1","updated":"2024-03-27T06:55:23Z","published":"2024-03-27T06:55:23Z","title":"Selective Mixup Fine-Tuning for Optimizing Non-Decomposable Objectives","summary":" The rise in internet usage has led to the generation of massive amounts of\ndata, resulting in the adoption of various supervised and semi-supervised\nmachine learning algorithms, which can effectively utilize the colossal amount\nof data to train models. However, before deploying these models in the real\nworld, these must be strictly evaluated on performance measures like worst-case\nrecall and satisfy constraints such as fairness. We find that current\nstate-of-the-art empirical techniques offer sub-optimal performance on these\npractical, non-decomposable performance objectives. On the other hand, the\ntheoretical techniques necessitate training a new model from scratch for each\nperformance objective. To bridge the gap, we propose SelMix, a selective\nmixup-based inexpensive fine-tuning technique for pre-trained models, to\noptimize for the desired objective. The core idea of our framework is to\ndetermine a sampling distribution to perform a mixup of features between\nsamples from particular classes such that it optimizes the given objective. We\ncomprehensively evaluate our technique against the existing empirical and\ntheoretically principled methods on standard benchmark datasets for imbalanced\nclassification. 
We find that proposed SelMix fine-tuning significantly improves\nthe performance for various practical non-decomposable objectives across\nbenchmarks.\n","authors":["Shrinivas Ramasubramanian","Harsh Rangwani","Sho Takemori","Kunal Samanta","Yuhei Umeda","Venkatesh Babu Radhakrishnan"],"pdf_url":"https://arxiv.org/pdf/2403.18301v1.pdf","comment":"ICLR 2024 SpotLight"},{"id":"http://arxiv.org/abs/2403.07392v3","updated":"2024-03-27T06:44:13Z","published":"2024-03-12T07:59:41Z","title":"ViT-CoMer: Vision Transformer with Convolutional Multi-scale Feature\n Interaction for Dense Predictions","summary":" Although Vision Transformer (ViT) has achieved significant success in\ncomputer vision, it does not perform well in dense prediction tasks due to the\nlack of inner-patch information interaction and the limited diversity of\nfeature scale. Most existing studies are devoted to designing vision-specific\ntransformers to solve the above problems, which introduce additional\npre-training costs. Therefore, we present a plain, pre-training-free, and\nfeature-enhanced ViT backbone with Convolutional Multi-scale feature\ninteraction, named ViT-CoMer, which facilitates bidirectional interaction\nbetween CNN and transformer. Compared to the state-of-the-art, ViT-CoMer has\nthe following advantages: (1) We inject spatial pyramid multi-receptive field\nconvolutional features into the ViT architecture, which effectively alleviates\nthe problems of limited local information interaction and single-feature\nrepresentation in ViT. (2) We propose a simple and efficient CNN-Transformer\nbidirectional fusion interaction module that performs multi-scale fusion across\nhierarchical features, which is beneficial for handling dense prediction tasks.\n(3) We evaluate the performance of ViT-CoMer across various dense prediction\ntasks, different frameworks, and multiple advanced pre-training. Notably, our\nViT-CoMer-L achieves 64.3% AP on COCO val2017 without extra training data, and\n62.1% mIoU on ADE20K val, both of which are comparable to state-of-the-art\nmethods. We hope ViT-CoMer can serve as a new backbone for dense prediction\ntasks to facilitate future research. The code will be released at\nhttps://github.com/Traffic-X/ViT-CoMer.\n","authors":["Chunlong Xia","Xinliang Wang","Feng Lv","Xin Hao","Yifeng Shi"],"pdf_url":"https://arxiv.org/pdf/2403.07392v3.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2403.18294v1","updated":"2024-03-27T06:40:26Z","published":"2024-03-27T06:40:26Z","title":"Multi-scale Unified Network for Image Classification","summary":" Convolutional Neural Networks (CNNs) have advanced significantly in visual\nrepresentation learning and recognition. However, they face notable challenges\nin performance and computational efficiency when dealing with real-world,\nmulti-scale image inputs. Conventional methods rescale all input images into a\nfixed size, wherein a larger fixed size favors performance but rescaling small\nsize images to a larger size incurs digitization noise and increased\ncomputation cost. In this work, we carry out a comprehensive, layer-wise\ninvestigation of CNN models in response to scale variation, based on Centered\nKernel Alignment (CKA) analysis. The observations reveal lower layers are more\nsensitive to input image scale variations than high-level layers. Inspired by\nthis insight, we propose Multi-scale Unified Network (MUSN) consisting of\nmulti-scale subnets, a unified network, and scale-invariant constraint. 
Our\nmethod divides the shallow layers into multi-scale subnets to enable feature\nextraction from multi-scale inputs, and the low-level features are unified in\ndeep layers for extracting high-level semantic features. A scale-invariant\nconstraint is posed to maintain feature consistency across different scales.\nExtensive experiments on ImageNet and other scale-diverse datasets, demonstrate\nthat MSUN achieves significant improvements in both model performance and\ncomputational efficiency. Particularly, MSUN yields an accuracy increase up to\n44.53% and diminishes FLOPs by 7.01-16.13% in multi-scale scenarios.\n","authors":["Wenzhuo Liu","Fei Zhu","Cheng-Lin Liu"],"pdf_url":"https://arxiv.org/pdf/2403.18294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18293v1","updated":"2024-03-27T06:37:51Z","published":"2024-03-27T06:37:51Z","title":"Efficient Test-Time Adaptation of Vision-Language Models","summary":" Test-time adaptation with pre-trained vision-language models has attracted\nincreasing attention for tackling distribution shifts during the test time.\nThough prior studies have achieved very promising performance, they involve\nintensive computation which is severely unaligned with test-time adaptation. We\ndesign TDA, a training-free dynamic adapter that enables effective and\nefficient test-time adaptation with vision-language models. TDA works with a\nlightweight key-value cache that maintains a dynamic queue with few-shot pseudo\nlabels as values and the corresponding test-sample features as keys. Leveraging\nthe key-value cache, TDA allows adapting to test data gradually via progressive\npseudo label refinement which is super-efficient without incurring any\nbackpropagation. In addition, we introduce negative pseudo labeling that\nalleviates the adverse impact of pseudo label noises by assigning pseudo labels\nto certain negative classes when the model is uncertain about its pseudo label\npredictions. Extensive experiments over two benchmarks demonstrate TDA's\nsuperior effectiveness and efficiency as compared with the state-of-the-art.\nThe code has been released in \\url{https://kdiaaa.github.io/tda/}.\n","authors":["Adilbek Karmanov","Dayan Guan","Shijian Lu","Abdulmotaleb El Saddik","Eric Xing"],"pdf_url":"https://arxiv.org/pdf/2403.18293v1.pdf","comment":"Accepted to CVPR 2024. The code has been released in\n \\url{https://kdiaaa.github.io/tda/}"},{"id":"http://arxiv.org/abs/2403.18291v1","updated":"2024-03-27T06:28:19Z","published":"2024-03-27T06:28:19Z","title":"Towards Non-Exemplar Semi-Supervised Class-Incremental Learning","summary":" Deep neural networks perform remarkably well in close-world scenarios.\nHowever, novel classes emerged continually in real applications, making it\nnecessary to learn incrementally. Class-incremental learning (CIL) aims to\ngradually recognize new classes while maintaining the discriminability of old\nones. Existing CIL methods have two limitations: a heavy reliance on preserving\nold data for forgetting mitigation and the need for vast labeled data for\nknowledge adaptation. To overcome these issues, we propose a non-exemplar\nsemi-supervised CIL framework with contrastive learning and semi-supervised\nincremental prototype classifier (Semi-IPC). On the one hand, contrastive\nlearning helps the model learn rich representations, easing the trade-off\nbetween learning representations of new classes and forgetting that of old\nclasses. 
On the other hand, Semi-IPC learns a prototype for each class with\nunsupervised regularization, enabling the model to incrementally learn from\npartially labeled new data while maintaining the knowledge of old classes.\nExperiments on benchmark datasets demonstrate the strong performance of our\nmethod: without storing any old samples and only using less than 1% of labels,\nSemi-IPC outperforms advanced exemplar-based methods. We hope our work offers\nnew insights for future CIL research. The code will be made publicly available.\n","authors":["Wenzhuo Liu","Fei Zhu","Cheng-Lin Liu"],"pdf_url":"https://arxiv.org/pdf/2403.18291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15864v2","updated":"2024-03-27T06:26:09Z","published":"2023-11-27T14:32:33Z","title":"InterControl: Generate Human Motion Interactions by Controlling Every\n Joint","summary":" Text-conditioned human motion synthesis has made remarkable progress with the\nemergence of diffusion models in recent research. However, the majority of\nthese motion diffusion models are primarily designed for a single character and\noverlook multi-human interactions. In our approach, we strive to explore this\nproblem by synthesizing human motion with interactions for a group of\ncharacters of any size. The key aspect of our approach is the adaptation of\nhuman-wise interactions as pairs of human joints that can be either in contact\nor separated by a desired distance. In contrast to existing methods that\nnecessitate training motion generation models on multi-human motion datasets\nwith a fixed number of characters, our approach inherently possesses the\nflexibility to model human interactions involving an arbitrary number of\nindividuals, thereby transcending the limitations imposed by the training data.\nWe introduce a novel controllable motion generation method, InterControl, to\nencourage the synthesized motions maintaining the desired distance between\njoint pairs. It consists of a motion controller and an inverse kinematics\nguidance module that realistically and accurately aligns the joints of\nsynthesized characters to the desired location. Furthermore, we demonstrate\nthat the distance between joint pairs for human-wise interactions can be\ngenerated using an off-the-shelf Large Language Model (LLM). Experimental\nresults highlight the capability of our framework to generate interactions with\nmultiple human characters and its potential to work with off-the-shelf\nphysics-based character simulators.\n","authors":["Zhenzhi Wang","Jingbo Wang","Yixuan Li","Dahua Lin","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2311.15864v2.pdf","comment":"Generate human interactions with only single-person data via joint\n contact pairs, code https://github.com/zhenzhiwang/intercontrol"},{"id":"http://arxiv.org/abs/2403.18282v1","updated":"2024-03-27T06:18:40Z","published":"2024-03-27T06:18:40Z","title":"SGDM: Static-Guided Dynamic Module Make Stronger Visual Models","summary":" The spatial attention mechanism has been widely used to improve object\ndetection performance. However, its operation is currently limited to static\nconvolutions lacking content-adaptive features. This paper innovatively\napproaches from the perspective of dynamic convolution. We propose Razor\nDynamic Convolution (RDConv) to address thetwo flaws in dynamic weight\nconvolution, making it hard to implement in spatial mechanism: 1) it is\ncomputation-heavy; 2) when generating weights, spatial information is\ndisregarded. 
Firstly, by using Razor Operation to generate certain features, we\nvastly reduce the parameters of the entire dynamic convolution operation.\nSecondly, we added a spatial branch inside RDConv to generate convolutional\nkernel parameters with richer spatial information. Embedding dynamic\nconvolution will also bring the problem of sensitivity to high-frequency noise.\nWe propose the Static-Guided Dynamic Module (SGDM) to address this limitation.\nBy using SGDM, we utilize a set of asymmetric static convolution kernel\nparameters to guide the construction of dynamic convolution. We introduce the\nmechanism of shared weights in static convolution to solve the problem of\ndynamic convolution being sensitive to high-frequency noise. Extensive\nexperiments illustrate that multiple different object detection backbones\nequipped with SGDM achieve a highly competitive boost in performance(e.g., +4%\nmAP with YOLOv5n on VOC and +1.7% mAP with YOLOv8n on COCO) with negligible\nparameter increase(i.e., +0.33M on YOLOv5n and +0.19M on YOLOv8n).\n","authors":["Wenjie Xing","Zhenchao Cui","Jing Qi"],"pdf_url":"https://arxiv.org/pdf/2403.18282v1.pdf","comment":"16 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.18281v1","updated":"2024-03-27T06:17:21Z","published":"2024-03-27T06:17:21Z","title":"AIR-HLoc: Adaptive Image Retrieval for Efficient Visual Localisation","summary":" State-of-the-art (SOTA) hierarchical localisation pipelines (HLoc) rely on\nimage retrieval (IR) techniques to establish 2D-3D correspondences by selecting\nthe $k$ most similar images from a reference image database for a given query\nimage. Although higher values of $k$ enhance localisation robustness, the\ncomputational cost for feature matching increases linearly with $k$. In this\npaper, we observe that queries that are the most similar to images in the\ndatabase result in a higher proportion of feature matches and, thus, more\naccurate positioning. Thus, a small number of images is sufficient for queries\nvery similar to images in the reference database. We then propose a novel\napproach, AIR-HLoc, which divides query images into different localisation\ndifficulty levels based on their similarity to the reference image database. We\nconsider an image with high similarity to the reference image as an easy query\nand an image with low similarity as a hard query. Easy queries show a limited\nimprovement in accuracy when increasing $k$. Conversely, higher values of $k$\nsignificantly improve accuracy for hard queries. Given the limited improvement\nin accuracy when increasing $k$ for easy queries and the significant\nimprovement for hard queries, we adapt the value of $k$ to the query's\ndifficulty level. Therefore, AIR-HLoc optimizes processing time by adaptively\nassigning different values of $k$ based on the similarity between the query and\nreference images without losing accuracy. 
Our extensive experiments on the\nCambridge Landmarks, 7Scenes, and Aachen Day-Night-v1.1 datasets demonstrate\nour algorithm's efficacy, reducing 30\\%, 26\\%, and 11\\% in computational\noverhead while maintaining SOTA accuracy compared to HLoc with fixed image\nretrieval.\n","authors":["Changkun Liu","Huajian Huang","Zhengyang Ma","Tristan Braud"],"pdf_url":"https://arxiv.org/pdf/2403.18281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07711v2","updated":"2024-03-27T06:02:38Z","published":"2024-03-12T14:53:56Z","title":"SSM Meets Video Diffusion Models: Efficient Video Generation with\n Structured State Spaces","summary":" Given the remarkable achievements in image generation through diffusion\nmodels, the research community has shown increasing interest in extending these\nmodels to video generation. Recent diffusion models for video generation have\npredominantly utilized attention layers to extract temporal features. However,\nattention layers are limited by their memory consumption, which increases\nquadratically with the length of the sequence. This limitation presents\nsignificant challenges when attempting to generate longer video sequences using\ndiffusion models. To overcome this challenge, we propose leveraging state-space\nmodels (SSMs). SSMs have recently gained attention as viable alternatives due\nto their linear memory consumption relative to sequence length. In the\nexperiments, we first evaluate our SSM-based model with UCF101, a standard\nbenchmark of video generation. In addition, to investigate the potential of\nSSMs for longer video generation, we perform an experiment using the MineRL\nNavigate dataset, varying the number of frames to 64, 200, and 400. In these\nsettings, our SSM-based model can considerably save memory consumption for\nlonger sequences, while maintaining competitive FVD scores to the\nattention-based models. Our codes are available at\nhttps://github.com/shim0114/SSM-Meets-Video-Diffusion-Models.\n","authors":["Yuta Oshima","Shohei Taniguchi","Masahiro Suzuki","Yutaka Matsuo"],"pdf_url":"https://arxiv.org/pdf/2403.07711v2.pdf","comment":"Accepted as workshop paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2303.08231v3","updated":"2024-03-27T06:00:18Z","published":"2023-03-14T20:55:27Z","title":"Rotation-Invariant Transformer for Point Cloud Matching","summary":" The intrinsic rotation invariance lies at the core of matching point clouds\nwith handcrafted descriptors. However, it is widely despised by recent deep\nmatchers that obtain the rotation invariance extrinsically via data\naugmentation. As the finite number of augmented rotations can never span the\ncontinuous SO(3) space, these methods usually show instability when facing\nrotations that are rarely seen. To this end, we introduce RoITr, a\nRotation-Invariant Transformer to cope with the pose variations in the point\ncloud matching task. We contribute both on the local and global levels.\nStarting from the local level, we introduce an attention mechanism embedded\nwith Point Pair Feature (PPF)-based coordinates to describe the pose-invariant\ngeometry, upon which a novel attention-based encoder-decoder architecture is\nconstructed. We further propose a global transformer with rotation-invariant\ncross-frame spatial awareness learned by the self-attention mechanism, which\nsignificantly improves the feature distinctiveness and makes the model robust\nwith respect to the low overlap. 
Experiments are conducted on both the rigid\nand non-rigid public benchmarks, where RoITr outperforms all the\nstate-of-the-art models by a considerable margin in the low-overlapping\nscenarios. Especially when the rotations are enlarged on the challenging\n3DLoMatch benchmark, RoITr surpasses the existing methods by at least 13 and 5\npercentage points in terms of Inlier Ratio and Registration Recall,\nrespectively.\n","authors":["Hao Yu","Zheng Qin","Ji Hou","Mahdi Saleh","Dongsheng Li","Benjamin Busam","Slobodan Ilic"],"pdf_url":"https://arxiv.org/pdf/2303.08231v3.pdf","comment":"Accepted to CVPR 2023"},{"id":"http://arxiv.org/abs/2403.18274v1","updated":"2024-03-27T05:57:45Z","published":"2024-03-27T05:57:45Z","title":"DVLO: Deep Visual-LiDAR Odometry with Local-to-Global Feature Fusion and\n Bi-Directional Structure Alignment","summary":" Information inside visual and LiDAR data is well complementary derived from\nthe fine-grained texture of images and massive geometric information in point\nclouds. However, it remains challenging to explore effective visual-LiDAR\nfusion, mainly due to the intrinsic data structure inconsistency between two\nmodalities: Images are regular and dense, but LiDAR points are unordered and\nsparse. To address the problem, we propose a local-to-global fusion network\nwith bi-directional structure alignment. To obtain locally fused features, we\nproject points onto image plane as cluster centers and cluster image pixels\naround each center. Image pixels are pre-organized as pseudo points for\nimage-to-point structure alignment. Then, we convert points to pseudo images by\ncylindrical projection (point-to-image structure alignment) and perform\nadaptive global feature fusion between point features with local fused\nfeatures. Our method achieves state-of-the-art performance on KITTI odometry\nand FlyingThings3D scene flow datasets compared to both single-modal and\nmulti-modal methods. Codes will be released later.\n","authors":["Jiuming Liu","Dong Zhuo","Zhiheng Feng","Siting Zhu","Chensheng Peng","Zhe Liu","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18274v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18271v1","updated":"2024-03-27T05:55:16Z","published":"2024-03-27T05:55:16Z","title":"Unleashing the Potential of SAM for Medical Adaptation via Hierarchical\n Decoding","summary":" The Segment Anything Model (SAM) has garnered significant attention for its\nversatile segmentation abilities and intuitive prompt-based interface. However,\nits application in medical imaging presents challenges, requiring either\nsubstantial training costs and extensive medical datasets for full model\nfine-tuning or high-quality prompts for optimal performance. This paper\nintroduces H-SAM: a prompt-free adaptation of SAM tailored for efficient\nfine-tuning of medical images via a two-stage hierarchical decoding procedure.\nIn the initial stage, H-SAM employs SAM's original decoder to generate a prior\nprobabilistic mask, guiding a more intricate decoding process in the second\nstage. Specifically, we propose two key designs: 1) A class-balanced,\nmask-guided self-attention mechanism addressing the unbalanced label\ndistribution, enhancing image embedding; 2) A learnable mask cross-attention\nmechanism spatially modulating the interplay among different image regions\nbased on the prior mask. Moreover, the inclusion of a hierarchical pixel\ndecoder in H-SAM enhances its proficiency in capturing fine-grained and\nlocalized details. 
This approach enables SAM to effectively integrate learned\nmedical priors, facilitating enhanced adaptation for medical image segmentation\nwith limited samples. Our H-SAM demonstrates a 4.78% improvement in average\nDice compared to existing prompt-free SAM variants for multi-organ segmentation\nusing only 10% of 2D slices. Notably, without using any unlabeled data, H-SAM\neven outperforms state-of-the-art semi-supervised models relying on extensive\nunlabeled training data across various medical datasets. Our code is available\nat https://github.com/Cccccczh404/H-SAM.\n","authors":["Zhiheng Cheng","Qingyue Wei","Hongru Zhu","Yan Wang","Liangqiong Qu","Wei Shao","Yuyin Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.18271v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18270v1","updated":"2024-03-27T05:52:39Z","published":"2024-03-27T05:52:39Z","title":"Image Deraining via Self-supervised Reinforcement Learning","summary":" The quality of images captured outdoors is often affected by the weather. One\nfactor that interferes with sight is rain, which can obstruct the view of\nobservers and computer vision applications that rely on those images. The work\naims to recover rain images by removing rain streaks via Self-supervised\nReinforcement Learning (RL) for image deraining (SRL-Derain). We locate rain\nstreak pixels from the input rain image via dictionary learning and use\npixel-wise RL agents to take multiple inpainting actions to remove rain\nprogressively. To our knowledge, this work is the first attempt where\nself-supervised RL is applied to image deraining. Experimental results on\nseveral benchmark image-deraining datasets show that the proposed SRL-Derain\nperforms favorably against state-of-the-art few-shot and self-supervised\nderaining and denoising methods.\n","authors":["He-Hao Liao","Yan-Tsung Peng","Wen-Tao Chu","Ping-Chun Hsieh","Chung-Chi Tsai"],"pdf_url":"https://arxiv.org/pdf/2403.18270v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18266v1","updated":"2024-03-27T05:38:48Z","published":"2024-03-27T05:38:48Z","title":"Branch-Tuning: Balancing Stability and Plasticity for Continual\n Self-Supervised Learning","summary":" Self-supervised learning (SSL) has emerged as an effective paradigm for\nderiving general representations from vast amounts of unlabeled data. However,\nas real-world applications continually integrate new content, the high\ncomputational and resource demands of SSL necessitate continual learning rather\nthan complete retraining. This poses a challenge in striking a balance between\nstability and plasticity when adapting to new information. In this paper, we\nemploy Centered Kernel Alignment for quantitatively analyzing model stability\nand plasticity, revealing the critical roles of batch normalization layers for\nstability and convolutional layers for plasticity. Motivated by this, we\npropose Branch-tuning, an efficient and straightforward method that achieves a\nbalance between stability and plasticity in continual SSL. Branch-tuning\nconsists of branch expansion and compression, and can be easily applied to\nvarious SSL methods without the need of modifying the original methods,\nretaining old data or models. We validate our method through incremental\nexperiments on various benchmark datasets, demonstrating its effectiveness and\npractical value in real-world scenarios. We hope our work offers new insights\nfor future continual self-supervised learning research. 
The code will be made\npublicly available.\n","authors":["Wenzhuo Liu","Fei Zhu","Cheng-Lin Liu"],"pdf_url":"https://arxiv.org/pdf/2403.18266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03532v2","updated":"2024-03-27T05:28:55Z","published":"2024-03-06T08:18:02Z","title":"Extend Your Own Correspondences: Unsupervised Distant Point Cloud\n Registration by Progressive Distance Extension","summary":" Registration of point clouds collected from a pair of distant vehicles\nprovides a comprehensive and accurate 3D view of the driving scenario, which is\nvital for driving safety related applications, yet existing literature suffers\nfrom the expensive pose label acquisition and the deficiency to generalize to\nnew data distributions. In this paper, we propose EYOC, an unsupervised distant\npoint cloud registration method that adapts to new point cloud distributions on\nthe fly, requiring no global pose labels. The core idea of EYOC is to train a\nfeature extractor in a progressive fashion, where in each round, the feature\nextractor, trained with near point cloud pairs, can label slightly farther\npoint cloud pairs, enabling self-supervision on such far point cloud pairs.\nThis process continues until the derived extractor can be used to register\ndistant point clouds. Particularly, to enable high-fidelity correspondence\nlabel generation, we devise an effective spatial filtering scheme to select the\nmost representative correspondences to register a point cloud pair, and then\nutilize the aligned point clouds to discover more correct correspondences.\nExperiments show that EYOC can achieve comparable performance with\nstate-of-the-art supervised methods at a lower training cost. Moreover, it\noutwits supervised methods regarding generalization performance on new data\ndistributions.\n","authors":["Quan Liu","Hongzi Zhu","Zhenxi Wang","Yunsong Zhou","Shan Chang","Minyi Guo"],"pdf_url":"https://arxiv.org/pdf/2403.03532v2.pdf","comment":"In Proceedings of the IEEE/CVF Conference on Computer Vision and\n Pattern Recognition (CVPR), 2024"},{"id":"http://arxiv.org/abs/2402.02561v2","updated":"2024-03-27T05:23:40Z","published":"2024-02-04T16:27:37Z","title":"Foundation Model Makes Clustering A Better Initialization For Cold-Start\n Active Learning","summary":" Active learning selects the most informative samples from the unlabelled\ndataset to annotate in the context of a limited annotation budget. While\nnumerous methods have been proposed for subsequent sample selection based on an\ninitialized model, scant attention has been paid to the indispensable phase of\nactive learning: selecting samples for model cold-start initialization. Most of\nthe previous studies resort to random sampling or naive clustering. However,\nrandom sampling is prone to fluctuation, and naive clustering suffers from\nconvergence speed, particularly when dealing with high-dimensional data such as\nimaging data. In this work, we propose to integrate foundation models with\nclustering methods to select samples for cold-start active learning\ninitialization. Foundation models refer to those trained on massive datasets by\nthe self-supervised paradigm and capable of generating informative and\ncompacted embeddings for various downstream tasks. Leveraging these embeddings\nto replace raw features such as pixel values, clustering quickly converges and\nidentifies better initial samples. For a comprehensive comparison, we included\na classic ImageNet-supervised model to acquire embeddings. 
Experiments on two\nclinical tasks of image classification and segmentation demonstrated that\nfoundation model-based clustering efficiently pinpointed informative initial\nsamples, leading to models showcasing enhanced performance than the baseline\nmethods. We envisage that this study provides an effective paradigm for future\ncold-start active learning.\n","authors":["Han Yuan","Chuan Hong"],"pdf_url":"https://arxiv.org/pdf/2402.02561v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17456v3","updated":"2024-03-27T05:22:18Z","published":"2023-11-29T08:56:24Z","title":"DifFlow3D: Toward Robust Uncertainty-Aware Scene Flow Estimation with\n Iterative Diffusion-Based Refinement","summary":" Scene flow estimation, which aims to predict per-point 3D displacements of\ndynamic scenes, is a fundamental task in the computer vision field. However,\nprevious works commonly suffer from unreliable correlation caused by locally\nconstrained searching ranges, and struggle with accumulated inaccuracy arising\nfrom the coarse-to-fine structure. To alleviate these problems, we propose a\nnovel uncertainty-aware scene flow estimation network (DifFlow3D) with the\ndiffusion probabilistic model. Iterative diffusion-based refinement is designed\nto enhance the correlation robustness and resilience to challenging cases, e.g.\ndynamics, noisy inputs, repetitive patterns, etc. To restrain the generation\ndiversity, three key flow-related features are leveraged as conditions in our\ndiffusion model. Furthermore, we also develop an uncertainty estimation module\nwithin diffusion to evaluate the reliability of estimated scene flow. Our\nDifFlow3D achieves state-of-the-art performance, with 24.0% and 29.1% EPE3D\nreduction respectively on FlyingThings3D and KITTI 2015 datasets. Notably, our\nmethod achieves an unprecedented millimeter-level accuracy (0.0078m in EPE3D)\non the KITTI dataset. Additionally, our diffusion-based refinement paradigm can\nbe readily integrated as a plug-and-play module into existing scene flow\nnetworks, significantly increasing their estimation accuracy. Codes are\nreleased at https://github.com/IRMVLab/DifFlow3D.\n","authors":["Jiuming Liu","Guangming Wang","Weicai Ye","Chaokang Jiang","Jinru Han","Zhe Liu","Guofeng Zhang","Dalong Du","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17456v3.pdf","comment":"Camera-ready version of CVPR 2024. Codes are released at\n https://github.com/IRMVLab/DifFlow3D"},{"id":"http://arxiv.org/abs/2403.18260v1","updated":"2024-03-27T05:22:06Z","published":"2024-03-27T05:22:06Z","title":"Toward Interactive Regional Understanding in Vision-Large Language\n Models","summary":" Recent Vision-Language Pre-training (VLP) models have demonstrated\nsignificant advancements. Nevertheless, these models heavily rely on image-text\npairs that capture only coarse and global information of an image, leading to a\nlimitation in their regional understanding ability. In this work, we introduce\n\\textbf{RegionVLM}, equipped with explicit regional modeling capabilities,\nallowing them to understand user-indicated image regions. To achieve this, we\ndesign a simple yet innovative architecture, requiring no modifications to the\nmodel architecture or objective function. Additionally, we leverage a dataset\nthat contains a novel source of information, namely Localized Narratives, which\nhas been overlooked in previous VLP research. 
Our experiments demonstrate that\nour single generalist model not only achieves an interactive dialogue system\nbut also exhibits superior performance on various zero-shot region\nunderstanding tasks, without compromising its ability for global image\nunderstanding.\n","authors":["Jungbeom Lee","Sanghyuk Chun","Sangdoo Yun"],"pdf_url":"https://arxiv.org/pdf/2403.18260v1.pdf","comment":"NAACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2209.02200v3","updated":"2024-03-27T05:16:02Z","published":"2022-09-06T03:42:18Z","title":"Task-wise Sampling Convolutions for Arbitrary-Oriented Object Detection\n in Aerial Images","summary":" Arbitrary-oriented object detection (AOOD) has been widely applied to locate\nand classify objects with diverse orientations in remote sensing images.\nHowever, the inconsistent features for the localization and classification\ntasks in AOOD models may lead to ambiguity and low-quality object predictions,\nwhich constrains the detection performance. In this article, an AOOD method\ncalled task-wise sampling convolutions (TS-Conv) is proposed. TS-Conv\nadaptively samples task-wise features from respective sensitive regions and\nmaps these features together in alignment to guide a dynamic label assignment\nfor better predictions. Specifically, sampling positions of the localization\nconvolution in TS-Conv are supervised by the oriented bounding box (OBB)\nprediction associated with spatial coordinates, while sampling positions and\nconvolutional kernel of the classification convolution are designed to be\nadaptively adjusted according to different orientations for improving the\norientation robustness of features. Furthermore, a dynamic\ntask-consistent-aware label assignment (DTLA) strategy is developed to select\noptimal candidate positions and assign labels dynamically according to ranked\ntask-aware scores obtained from TS-Conv. Extensive experiments on several\npublic datasets covering multiple scenes, multimodal images, and multiple\ncategories of objects demonstrate the effectiveness, scalability, and superior\nperformance of the proposed TS-Conv.\n","authors":["Zhanchao Huang","Wei Li","Xiang-Gen Xia","Hao Wang","Ran Tao"],"pdf_url":"https://arxiv.org/pdf/2209.02200v3.pdf","comment":"15 pages, 13 figures, 11 tables"},{"id":"http://arxiv.org/abs/2403.07359v4","updated":"2024-03-27T05:14:09Z","published":"2024-03-12T06:45:34Z","title":"FSC: Few-point Shape Completion","summary":" While previous studies have demonstrated successful 3D object shape\ncompletion with a sufficient number of points, they often fail in scenarios\nwhen a few points, e.g. tens of points, are observed. Surprisingly, via entropy\nanalysis, we find that even a few points, e.g. 64 points, could retain\nsubstantial information to help recover the 3D shape of the object. To address\nthe challenge of shape completion with very sparse point clouds, we then\npropose Few-point Shape Completion (FSC) model, which contains a novel\ndual-branch feature extractor for handling extremely sparse inputs, coupled\nwith an extensive branch for maximal point utilization with a saliency branch\nfor dynamic importance assignment. This model is further bolstered by a\ntwo-stage revision network that refines both the extracted features and the\ndecoder output, enhancing the detail and authenticity of the completed point\ncloud. Our experiments demonstrate the feasibility of recovering 3D shapes from\na few points. 
The proposed Few-point Shape Completion (FSC) model outperforms\nprevious methods on both few-point inputs and many-point inputs, and shows good\ngeneralizability to different object categories.\n","authors":["Xianzu Wu","Xianfeng Wu","Tianyu Luan","Yajing Bai","Zhongyuan Lai","Junsong Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.07359v4.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18258v1","updated":"2024-03-27T05:10:38Z","published":"2024-03-27T05:10:38Z","title":"Enhancing Generative Class Incremental Learning Performance with Model\n Forgetting Approach","summary":" This study presents a novel approach to Generative Class Incremental Learning\n(GCIL) by introducing the forgetting mechanism, aimed at dynamically managing\nclass information for better adaptation to streaming data. GCIL is one of the\nhot topics in the field of computer vision, and this is considered one of the\ncrucial tasks in society, specifically the continual learning of generative\nmodels. The ability to forget is a crucial brain function that facilitates\ncontinual learning by selectively discarding less relevant information for\nhumans. However, in the field of machine learning models, the concept of\nintentionally forgetting has not been extensively investigated. In this study\nwe aim to bridge this gap by incorporating the forgetting mechanisms into GCIL,\nthereby examining their impact on the models' ability to learn in continual\nlearning. Through our experiments, we have found that integrating the\nforgetting mechanisms significantly enhances the models' performance in\nacquiring new knowledge, underscoring the positive role that strategic\nforgetting plays in the process of continual learning.\n","authors":["Taro Togo","Ren Togo","Keisuke Maeda","Takahiro Ogawa","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2403.18258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18252v1","updated":"2024-03-27T04:49:23Z","published":"2024-03-27T04:49:23Z","title":"Beyond Embeddings: The Promise of Visual Table in Multi-Modal Models","summary":" Visual representation learning has been a cornerstone in computer vision,\nevolving from supervised learning with human-annotated labels to aligning\nimage-text pairs from the Internet. Despite recent advancements in multi-modal\nlarge language models (MLLMs), the visual representations they rely on, such as\nCLIP embeddings, often lack access to external world knowledge critical for\nreal-world visual reasoning. In this work, we propose Visual Table, a novel\nvisual representation tailored for MLLMs. It provides hierarchical text\ndescriptions of holistic visual scenes, consisting of a scene description and\nmultiple object-centric descriptions that encompass categories, attributes, and\nknowledge at instance level. We further develop a scalable generator for visual\ntable generation and train it on small-scale annotations from GPT4V. Extensive\nevaluations demonstrate that, with generated visual tables as additional visual\nrepresentations, our model can consistently outperform the state-of-the-art\n(SOTA) MLLMs across diverse benchmarks. When visual tables serve as standalone\nvisual representations, our model can closely match or even beat the SOTA MLLMs\nthat are built on CLIP visual embeddings. Our code is available at\nhttps://github.com/LaVi-Lab/Visual-Table.\n","authors":["Yiwu Zhong","Zi-Yuan Hu","Michael R. 
Lyu","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18252v1.pdf","comment":"Project page: https://github.com/LaVi-Lab/Visual-Table"},{"id":"http://arxiv.org/abs/2403.18241v1","updated":"2024-03-27T04:09:34Z","published":"2024-03-27T04:09:34Z","title":"NeuSDFusion: A Spatial-Aware Generative Model for 3D Shape Completion,\n Reconstruction, and Generation","summary":" 3D shape generation aims to produce innovative 3D content adhering to\nspecific conditions and constraints. Existing methods often decompose 3D shapes\ninto a sequence of localized components, treating each element in isolation\nwithout considering spatial consistency. As a result, these approaches exhibit\nlimited versatility in 3D data representation and shape generation, hindering\ntheir ability to generate highly diverse 3D shapes that comply with the\nspecified constraints. In this paper, we introduce a novel spatial-aware 3D\nshape generation framework that leverages 2D plane representations for enhanced\n3D shape modeling. To ensure spatial coherence and reduce memory usage, we\nincorporate a hybrid shape representation technique that directly learns a\ncontinuous signed distance field representation of the 3D shape using\northogonal 2D planes. Additionally, we meticulously enforce spatial\ncorrespondences across distinct planes using a transformer-based autoencoder\nstructure, promoting the preservation of spatial relationships in the generated\n3D shapes. This yields an algorithm that consistently outperforms\nstate-of-the-art 3D shape generation methods on various tasks, including\nunconditional shape generation, multi-modal shape completion, single-view\nreconstruction, and text-to-shape synthesis.\n","authors":["Ruikai Cui","Weizhe Liu","Weixuan Sun","Senbo Wang","Taizhang Shang","Yang Li","Xibin Song","Han Yan","Zhennan Wu","Shenzhou Chen","Hongdong Li","Pan Ji"],"pdf_url":"https://arxiv.org/pdf/2403.18241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00374v4","updated":"2024-03-27T04:06:36Z","published":"2023-12-31T02:25:41Z","title":"EMAGE: Towards Unified Holistic Co-Speech Gesture Generation via\n Expressive Masked Audio Gesture Modeling","summary":" We propose EMAGE, a framework to generate full-body human gestures from audio\nand masked gestures, encompassing facial, local body, hands, and global\nmovements. To achieve this, we first introduce BEAT2 (BEAT-SMPLX-FLAME), a new\nmesh-level holistic co-speech dataset. BEAT2 combines MoShed SMPLX body with\nFLAME head parameters and further refines the modeling of head, neck, and\nfinger movements, offering a community-standardized, high-quality 3D motion\ncaptured dataset. EMAGE leverages masked body gesture priors during training to\nboost inference performance. It involves a Masked Audio Gesture Transformer,\nfacilitating joint training on audio-to-gesture generation and masked gesture\nreconstruction to effectively encode audio and body gesture hints. Encoded body\nhints from masked gestures are then separately employed to generate facial and\nbody movements. Moreover, EMAGE adaptively merges speech features from the\naudio's rhythm and content and utilizes four compositional VQ-VAEs to enhance\nthe results' fidelity and diversity. Experiments demonstrate that EMAGE\ngenerates holistic gestures with state-of-the-art performance and is flexible\nin accepting predefined spatial-temporal gesture inputs, generating complete,\naudio-synchronized results. 
Our code and dataset are available at\nhttps://pantomatrix.github.io/EMAGE/\n","authors":["Haiyang Liu","Zihao Zhu","Giorgio Becherini","Yichen Peng","Mingyang Su","You Zhou","Xuefei Zhe","Naoya Iwamoto","Bo Zheng","Michael J. Black"],"pdf_url":"https://arxiv.org/pdf/2401.00374v4.pdf","comment":"Conflict of Interest Disclosure; CVPR Camera Ready; Project Page:\n https://pantomatrix.github.io/EMAGE/"},{"id":"http://arxiv.org/abs/2403.18238v1","updated":"2024-03-27T04:03:55Z","published":"2024-03-27T04:03:55Z","title":"TAFormer: A Unified Target-Aware Transformer for Video and Motion Joint\n Prediction in Aerial Scenes","summary":" As drone technology advances, using unmanned aerial vehicles for aerial\nsurveys has become the dominant trend in modern low-altitude remote sensing.\nThe surge in aerial video data necessitates accurate prediction for future\nscenarios and motion states of the interested target, particularly in\napplications like traffic management and disaster response. Existing video\nprediction methods focus solely on predicting future scenes (video frames),\nsuffering from the neglect of explicitly modeling target's motion states, which\nis crucial for aerial video interpretation. To address this issue, we introduce\na novel task called Target-Aware Aerial Video Prediction, aiming to\nsimultaneously predict future scenes and motion states of the target. Further,\nwe design a model specifically for this task, named TAFormer, which provides a\nunified modeling approach for both video and target motion states.\nSpecifically, we introduce Spatiotemporal Attention (STA), which decouples the\nlearning of video dynamics into spatial static attention and temporal dynamic\nattention, effectively modeling the scene appearance and motion. Additionally,\nwe design an Information Sharing Mechanism (ISM), which elegantly unifies the\nmodeling of video and target motion by facilitating information interaction\nthrough two sets of messenger tokens. Moreover, to alleviate the difficulty of\ndistinguishing targets in blurry predictions, we introduce Target-Sensitive\nGaussian Loss (TSGL), enhancing the model's sensitivity to both target's\nposition and content. Extensive experiments on UAV123VP and VisDroneVP (derived\nfrom single-object tracking datasets) demonstrate the exceptional performance\nof TAFormer in target-aware video prediction, showcasing its adaptability to\nthe additional requirements of aerial video interpretation for target\nawareness.\n","authors":["Liangyu Xu","Wanxuan Lu","Hongfeng Yu","Yongqiang Mao","Hanbo Bi","Chenglong Liu","Xian Sun","Kun Fu"],"pdf_url":"https://arxiv.org/pdf/2403.18238v1.pdf","comment":"17 pages, 9 figures"},{"id":"http://arxiv.org/abs/2311.08100v3","updated":"2024-03-27T04:00:07Z","published":"2023-11-14T11:53:24Z","title":"PPAD: Iterative Interactions of Prediction and Planning for End-to-end\n Autonomous Driving","summary":" We present a new interaction mechanism of prediction and planning for\nend-to-end autonomous driving, called PPAD (Iterative Interaction of Prediction\nand Planning Autonomous Driving), which considers the timestep-wise interaction\nto better integrate prediction and planning. An ego vehicle performs motion\nplanning at each timestep based on the trajectory prediction of surrounding\nagents (e.g., vehicles and pedestrians) and its local road conditions. 
Unlike\nexisting end-to-end autonomous driving frameworks, PPAD models the interactions\namong ego, agents, and the dynamic environment in an autoregressive manner by\ninterleaving the Prediction and Planning processes at every timestep, instead\nof a single sequential process of prediction followed by planning.\nSpecifically, we design ego-to-agent, ego-to-map, and ego-to-BEV interaction\nmechanisms with hierarchical dynamic key objects attention to better model the\ninteractions. The experiments on the nuScenes benchmark show that our approach\noutperforms state-of-the-art methods.\n","authors":["Zhili Chen","Maosheng Ye","Shuangjie Xu","Tongyi Cao","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2311.08100v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01616v2","updated":"2024-03-27T03:56:35Z","published":"2023-12-04T04:14:09Z","title":"SchurVINS: Schur Complement-Based Lightweight Visual Inertial Navigation\n System","summary":" Accuracy and computational efficiency are the most important metrics to\nVisual Inertial Navigation System (VINS). The existing VINS algorithms with\neither high accuracy or low computational complexity, are difficult to provide\nthe high precision localization in resource-constrained devices. To this end,\nwe propose a novel filter-based VINS framework named SchurVINS, which could\nguarantee both high accuracy by building a complete residual model and low\ncomputational complexity with Schur complement. Technically, we first formulate\nthe full residual model where Gradient, Hessian and observation covariance are\nexplicitly modeled. Then Schur complement is employed to decompose the full\nmodel into ego-motion residual model and landmark residual model. Finally,\nExtended Kalman Filter (EKF) update is implemented in these two models with\nhigh efficiency. Experiments on EuRoC and TUM-VI datasets show that our method\nnotably outperforms state-of-the-art (SOTA) methods in both accuracy and\ncomputational complexity. The experimental code of SchurVINS is available at\nhttps://github.com/bytedance/SchurVINS.\n","authors":["Yunfei Fan","Tianyu Zhao","Guidong Wang"],"pdf_url":"https://arxiv.org/pdf/2312.01616v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08106v3","updated":"2024-03-27T03:55:39Z","published":"2023-10-12T08:01:11Z","title":"Generalized Logit Adjustment: Calibrating Fine-tuned Models by Removing\n Label Bias in Foundation Models","summary":" Foundation models like CLIP allow zero-shot transfer on various tasks without\nadditional training data. Yet, the zero-shot performance is less competitive\nthan a fully supervised one. Thus, to enhance the performance, fine-tuning and\nensembling are also commonly adopted to better fit the downstream tasks.\nHowever, we argue that such prior work has overlooked the inherent biases in\nfoundation models. Due to the highly imbalanced Web-scale training set, these\nfoundation models are inevitably skewed toward frequent semantics, and thus the\nsubsequent fine-tuning or ensembling is still biased. In this study, we\nsystematically examine the biases in foundation models and demonstrate the\nefficacy of our proposed Generalized Logit Adjustment (GLA) method. Note that\nbias estimation in foundation models is challenging, as most pre-train data\ncannot be explicitly accessed like in traditional long-tailed classification\ntasks. To this end, GLA has an optimization-based bias estimation approach for\ndebiasing foundation models. 
As our work resolves a fundamental flaw in the\npre-training, the proposed GLA demonstrates significant improvements across a\ndiverse range of tasks: it achieves 1.5 pp accuracy gains on ImageNet, an large\naverage improvement (1.4-4.6 pp) on 11 few-shot datasets, 2.4 pp gains on\nlong-tailed classification. Codes are in \\url{https://github.com/BeierZhu/GLA}.\n","authors":["Beier Zhu","Kaihua Tang","Qianru Sun","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08106v3.pdf","comment":"V2 proposed a more effective method for label distribution\n estimation. V1 fixed a typo in abstract; Accepted by NeurIPS2023"},{"id":"http://arxiv.org/abs/2206.08657v6","updated":"2024-03-27T03:53:23Z","published":"2022-06-17T09:42:35Z","title":"BridgeTower: Building Bridges Between Encoders in Vision-Language\n Representation Learning","summary":" Vision-Language (VL) models with the Two-Tower architecture have dominated\nvisual-language representation learning in recent years. Current VL models\neither use lightweight uni-modal encoders and learn to extract, align and fuse\nboth modalities simultaneously in a deep cross-modal encoder, or feed the\nlast-layer uni-modal representations from the deep pre-trained uni-modal\nencoders into the top cross-modal encoder. Both approaches potentially restrict\nvision-language representation learning and limit model performance. In this\npaper, we propose BridgeTower, which introduces multiple bridge layers that\nbuild a connection between the top layers of uni-modal encoders and each layer\nof the cross-modal encoder. This enables effective bottom-up cross-modal\nalignment and fusion between visual and textual representations of different\nsemantic levels of pre-trained uni-modal encoders in the cross-modal encoder.\nPre-trained with only 4M images, BridgeTower achieves state-of-the-art\nperformance on various downstream vision-language tasks. In particular, on the\nVQAv2 test-std set, BridgeTower achieves an accuracy of 78.73%, outperforming\nthe previous state-of-the-art model METER by 1.09% with the same pre-training\ndata and almost negligible additional parameters and computational costs.\nNotably, when further scaling the model, BridgeTower achieves an accuracy of\n81.15%, surpassing models that are pre-trained on orders-of-magnitude larger\ndatasets. Code and checkpoints are available at\nhttps://github.com/microsoft/BridgeTower.\n","authors":["Xiao Xu","Chenfei Wu","Shachar Rosenman","Vasudev Lal","Wanxiang Che","Nan Duan"],"pdf_url":"https://arxiv.org/pdf/2206.08657v6.pdf","comment":"Accepted by AAAI 2023, Oral"},{"id":"http://arxiv.org/abs/2403.04125v2","updated":"2024-03-27T03:53:14Z","published":"2024-03-07T00:44:21Z","title":"Scalable and Robust Transformer Decoders for Interpretable Image\n Classification with Foundation Models","summary":" Interpretable computer vision models can produce transparent predictions,\nwhere the features of an image are compared with prototypes from a training\ndataset and the similarity between them forms a basis for classification.\nNevertheless these methods are computationally expensive to train, introduce\nadditional complexity and may require domain knowledge to adapt\nhyper-parameters to a new dataset. Inspired by developments in object\ndetection, segmentation and large-scale self-supervised foundation vision\nmodels, we introduce Component Features (ComFe), a novel explainable-by-design\nimage classification approach using a transformer-decoder head and hierarchical\nmixture-modelling. 
With only global image labels and no segmentation or part\nannotations, ComFe can identify consistent image components, such as the head,\nbody, wings and tail of a bird, and the image background, and determine which\nof these features are informative in making a prediction. We demonstrate that\nComFe obtains higher accuracy compared to previous interpretable models across\na range of fine-grained vision benchmarks, without the need to individually\ntune hyper-parameters for each dataset. We also show that ComFe outperforms a\nnon-interpretable linear head across a range of datasets, including ImageNet,\nand improves performance on generalisation and robustness benchmarks.\n","authors":["Evelyn Mannix","Howard Bondell"],"pdf_url":"https://arxiv.org/pdf/2403.04125v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11104v4","updated":"2024-03-27T03:47:20Z","published":"2023-01-26T13:58:46Z","title":"Discovering and Mitigating Visual Biases through Keyword Explanation","summary":" Addressing biases in computer vision models is crucial for real-world AI\ndeployments. However, mitigating visual biases is challenging due to their\nunexplainable nature, often identified indirectly through visualization or\nsample statistics, which necessitates additional human supervision for\ninterpretation. To tackle this issue, we propose the Bias-to-Text (B2T)\nframework, which interprets visual biases as keywords. Specifically, we extract\ncommon keywords from the captions of mispredicted images to identify potential\nbiases in the model. We then validate these keywords by measuring their\nsimilarity to the mispredicted images using a vision-language scoring model.\nThe keyword explanation form of visual bias offers several advantages, such as\na clear group naming for bias discovery and a natural extension for debiasing\nusing these group names. Our experiments demonstrate that B2T can identify\nknown biases, such as gender bias in CelebA, background bias in Waterbirds, and\ndistribution shifts in ImageNet-R/C. Additionally, B2T uncovers novel biases in\nlarger datasets, such as Dollar Street and ImageNet. For example, we discovered\na contextual bias between \"bee\" and \"flower\" in ImageNet. We also highlight\nvarious applications of B2T keywords, including debiased training, CLIP\nprompting, and model comparison.\n","authors":["Younghyun Kim","Sangwoo Mo","Minkyu Kim","Kyungmin Lee","Jaeho Lee","Jinwoo Shin"],"pdf_url":"https://arxiv.org/pdf/2301.11104v4.pdf","comment":"CVPR 2024. First two authors contributed equally"},{"id":"http://arxiv.org/abs/2403.18233v1","updated":"2024-03-27T03:39:57Z","published":"2024-03-27T03:39:57Z","title":"Benchmarking Image Transformers for Prostate Cancer Detection from\n Ultrasound Data","summary":" PURPOSE: Deep learning methods for classifying prostate cancer (PCa) in\nultrasound images typically employ convolutional networks (CNNs) to detect\ncancer in small regions of interest (ROI) along a needle trace region. However,\nthis approach suffers from weak labelling, since the ground-truth\nhistopathology labels do not describe the properties of individual ROIs.\nRecently, multi-scale approaches have sought to mitigate this issue by\ncombining the context awareness of transformers with a CNN feature extractor to\ndetect cancer from multiple ROIs using multiple-instance learning (MIL). 
In\nthis work, we present a detailed study of several image transformer\narchitectures for both ROI-scale and multi-scale classification, and a\ncomparison of the performance of CNNs and transformers for ultrasound-based\nprostate cancer classification. We also design a novel multi-objective learning\nstrategy that combines both ROI and core predictions to further mitigate label\nnoise. METHODS: We evaluate 3 image transformers on ROI-scale cancer\nclassification, then use the strongest model to tune a multi-scale classifier\nwith MIL. We train our MIL models using our novel multi-objective learning\nstrategy and compare our results to existing baselines. RESULTS: We find that\nfor both ROI-scale and multi-scale PCa detection, image transformer backbones\nlag behind their CNN counterparts. This deficit in performance is even more\nnoticeable for larger models. When using multi-objective learning, we can\nimprove performance of MIL, with a 77.9% AUROC, a sensitivity of 75.9%, and a\nspecificity of 66.3%. CONCLUSION: Convolutional networks are better suited for\nmodelling sparse datasets of prostate ultrasounds, producing more robust\nfeatures than transformers in PCa detection. Multi-scale methods remain the\nbest architecture for this task, with multi-objective learning presenting an\neffective way to improve performance.\n","authors":["Mohamed Harmanani","Paul F. R. Wilson","Fahimeh Fooladgar","Amoon Jamzad","Mahdi Gilany","Minh Nguyen Nhat To","Brian Wodlinger","Purang Abolmaesumi","Parvin Mousavi"],"pdf_url":"https://arxiv.org/pdf/2403.18233v1.pdf","comment":"early draft, 7 pages; Accepted to SPIE Medical Imaging 2024"},{"id":"http://arxiv.org/abs/2403.02649v2","updated":"2024-03-27T03:34:00Z","published":"2024-03-05T04:38:13Z","title":"Few-shot Learner Parameterization by Diffusion Time-steps","summary":" Even when using large multi-modal foundation models, few-shot learning is\nstill challenging -- if there is no proper inductive bias, it is nearly\nimpossible to keep the nuanced class attributes while removing the visually\nprominent attributes that spuriously correlate with class labels. To this end,\nwe find an inductive bias that the time-steps of a Diffusion Model (DM) can\nisolate the nuanced class attributes, i.e., as the forward diffusion adds noise\nto an image at each time-step, nuanced attributes are usually lost at an\nearlier time-step than the spurious attributes that are visually prominent.\nBuilding on this, we propose Time-step Few-shot (TiF) learner. We train\nclass-specific low-rank adapters for a text-conditioned DM to make up for the\nlost attributes, such that images can be accurately reconstructed from their\nnoisy ones given a prompt. Hence, at a small time-step, the adapter and prompt\nare essentially a parameterization of only the nuanced class attributes. For a\ntest image, we can use the parameterization to only extract the nuanced class\nattributes for classification. TiF learner significantly outperforms OpenCLIP\nand its adapters on a variety of fine-grained and customized few-shot learning\ntasks. 
Codes are in https://github.com/yue-zhongqi/tif.\n","authors":["Zhongqi Yue","Pan Zhou","Richang Hong","Hanwang Zhang","Qianru Sun"],"pdf_url":"https://arxiv.org/pdf/2403.02649v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18228v1","updated":"2024-03-27T03:31:16Z","published":"2024-03-27T03:31:16Z","title":"Fourier or Wavelet bases as counterpart self-attention in spikformer for\n efficient visual classification","summary":" Energy-efficient spikformer has been proposed by integrating the biologically\nplausible spiking neural network (SNN) and artificial Transformer, whereby the\nSpiking Self-Attention (SSA) is used to achieve both higher accuracy and lower\ncomputational cost. However, it seems that self-attention is not always\nnecessary, especially in sparse spike-form calculation manners. In this paper,\nwe innovatively replace vanilla SSA (using dynamic bases calculating from Query\nand Key) with spike-form Fourier Transform, Wavelet Transform, and their\ncombinations (using fixed triangular or wavelets bases), based on a key\nhypothesis that both of them use a set of basis functions for information\ntransformation. Hence, the Fourier-or-Wavelet-based spikformer (FWformer) is\nproposed and verified in visual classification tasks, including both static\nimage and event-based video datasets. The FWformer can achieve comparable or\neven higher accuracies ($0.4\\%$-$1.5\\%$), higher running speed ($9\\%$-$51\\%$\nfor training and $19\\%$-$70\\%$ for inference), reduced theoretical energy\nconsumption ($20\\%$-$25\\%$), and reduced GPU memory usage ($4\\%$-$26\\%$),\ncompared to the standard spikformer. Our result indicates the continuous\nrefinement of new Transformers, that are inspired either by biological\ndiscovery (spike-form), or information theory (Fourier or Wavelet Transform),\nis promising.\n","authors":["Qingyu Wang","Duzhen Zhang","Tilelin Zhang","Bo Xu"],"pdf_url":"https://arxiv.org/pdf/2403.18228v1.pdf","comment":"18 pages, 2 figures. arXiv admin note: substantial text overlap with\n arXiv:2308.02557"},{"id":"http://arxiv.org/abs/2304.14394v3","updated":"2024-03-27T03:23:12Z","published":"2023-04-27T17:56:29Z","title":"Unified Sequence-to-Sequence Learning for Single- and Multi-Modal Visual\n Object Tracking","summary":" In this paper, we introduce a new sequence-to-sequence learning framework for\nRGB-based and multi-modal object tracking. First, we present SeqTrack for\nRGB-based tracking. It casts visual tracking as a sequence generation task,\nforecasting object bounding boxes in an autoregressive manner. This differs\nfrom previous trackers, which depend on the design of intricate head networks,\nsuch as classification and regression heads. SeqTrack employs a basic\nencoder-decoder transformer architecture. The encoder utilizes a bidirectional\ntransformer for feature extraction, while the decoder generates bounding box\nsequences autoregressively using a causal transformer. The loss function is a\nplain cross-entropy. Second, we introduce SeqTrackv2, a unified\nsequence-to-sequence framework for multi-modal tracking tasks. Expanding upon\nSeqTrack, SeqTrackv2 integrates a unified interface for auxiliary modalities\nand a set of task-prompt tokens to specify the task. This enables it to manage\nmulti-modal tracking tasks using a unified model and parameter set. 
This\nsequence learning paradigm not only simplifies the tracking framework, but also\nshowcases superior performance across 14 challenging benchmarks spanning five\nsingle- and multi-modal tracking tasks. The code and models are available at\nhttps://github.com/chenxin-dlut/SeqTrackv2.\n","authors":["Xin Chen","Ben Kang","Jiawen Zhu","Dong Wang","Houwen Peng","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2304.14394v3.pdf","comment":"This is a new expanded version of our previous CVPR2023 paper\n \"SeqTrack: Sequence to Sequence Learning for Visual Object Tracking.\"\n SeqTrackv2 extends SeqTrack to four multi-modal tracking tasks with a unified\n model and parameter set"},{"id":"http://arxiv.org/abs/2402.17464v3","updated":"2024-03-27T03:13:52Z","published":"2024-02-27T12:42:06Z","title":"Generative 3D Part Assembly via Part-Whole-Hierarchy Message Passing","summary":" Generative 3D part assembly involves understanding part relationships and\npredicting their 6-DoF poses for assembling a realistic 3D shape. Prior work\noften focus on the geometry of individual parts, neglecting part-whole\nhierarchies of objects. Leveraging two key observations: 1) super-part poses\nprovide strong hints about part poses, and 2) predicting super-part poses is\neasier due to fewer superparts, we propose a part-whole-hierarchy message\npassing network for efficient 3D part assembly. We first introduce super-parts\nby grouping geometrically similar parts without any semantic labels. Then we\nemploy a part-whole hierarchical encoder, wherein a super-part encoder predicts\nlatent super-part poses based on input parts. Subsequently, we transform the\npoint cloud using the latent poses, feeding it to the part encoder for\naggregating super-part information and reasoning about part relationships to\npredict all part poses. In training, only ground-truth part poses are required.\nDuring inference, the predicted latent poses of super-parts enhance\ninterpretability. Experimental results on the PartNet dataset show that our\nmethod achieves state-of-the-art performance in part and connectivity accuracy\nand enables an interpretable hierarchical part assembly. Code is available at\nhttps://github.com/pkudba/3DHPA.\n","authors":["Bi'an Du","Xiang Gao","Wei Hu","Renjie Liao"],"pdf_url":"https://arxiv.org/pdf/2402.17464v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16421v2","updated":"2024-03-27T03:07:20Z","published":"2023-09-28T13:12:18Z","title":"Distilling ODE Solvers of Diffusion Models into Smaller Steps","summary":" Abstract Diffusion models have recently gained prominence as a novel category\nof generative models. Despite their success, these models face a notable\ndrawback in terms of slow sampling speeds, requiring a high number of function\nevaluations (NFE) in the order of hundreds or thousands. In response, both\nlearning-free and learning-based sampling strategies have been explored to\nexpedite the sampling process. Learning-free sampling employs various ordinary\ndifferential equation (ODE) solvers based on the formulation of diffusion ODEs.\nHowever, it encounters challenges in faithfully tracking the true sampling\ntrajectory, particularly for small NFE. Conversely, learning-based sampling\nmethods, such as knowledge distillation, demand extensive additional training,\nlimiting their practical applicability. To overcome these limitations, we\nintroduce Distilled-ODE solvers (D-ODE solvers), a straightforward distillation\napproach grounded in ODE solver formulations. 
Our method seamlessly integrates\nthe strengths of both learning-free and learning-based sampling. D-ODE solvers\nare constructed by introducing a single parameter adjustment to existing ODE\nsolvers. Furthermore, we optimize D-ODE solvers with smaller steps using\nknowledge distillation from ODE solvers with larger steps across a batch of\nsamples. Comprehensive experiments demonstrate the superior performance of\nD-ODE solvers compared to existing ODE solvers, including DDIM, PNDM,\nDPM-Solver, DEIS, and EDM, particularly in scenarios with fewer NFE. Notably,\nour method incurs negligible computational overhead compared to previous\ndistillation techniques, facilitating straightforward and rapid integration\nwith existing samplers. Qualitative analysis reveals that D-ODE solvers not\nonly enhance image quality but also faithfully follow the target ODE\ntrajectory.\n","authors":["Sanghwan Kim","Hao Tang","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2309.16421v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04181v2","updated":"2024-03-27T02:51:24Z","published":"2023-10-06T11:53:04Z","title":"DiffPrompter: Differentiable Implicit Visual Prompts for\n Semantic-Segmentation in Adverse Conditions","summary":" Semantic segmentation in adverse weather scenarios is a critical task for\nautonomous driving systems. While foundation models have shown promise, the\nneed for specialized adaptors becomes evident for handling more challenging\nscenarios. We introduce DiffPrompter, a novel differentiable visual and latent\nprompting mechanism aimed at expanding the learning capabilities of existing\nadaptors in foundation models. Our proposed $\\nabla$HFC image processing block\nexcels particularly in adverse weather conditions, where conventional methods\noften fall short. Furthermore, we investigate the advantages of jointly\ntraining visual and latent prompts, demonstrating that this combined approach\nsignificantly enhances performance in out-of-distribution scenarios. Our\ndifferentiable visual prompts leverage parallel and series architectures to\ngenerate prompts, effectively improving object segmentation tasks in adverse\nconditions. Through a comprehensive series of experiments and evaluations, we\nprovide empirical evidence to support the efficacy of our approach. Project\npage at https://diffprompter.github.io.\n","authors":["Sanket Kalwar","Mihir Ungarala","Shruti Jain","Aaron Monis","Krishna Reddy Konda","Sourav Garg","K Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2310.04181v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17343v2","updated":"2024-03-27T02:49:16Z","published":"2024-03-26T03:05:20Z","title":"Language Models are Free Boosters for Biomedical Imaging Tasks","summary":" In this study, we uncover the unexpected efficacy of residual-based large\nlanguage models (LLMs) as part of encoders for biomedical imaging tasks, a\ndomain traditionally devoid of language or textual data. The approach diverges\nfrom established methodologies by utilizing a frozen transformer block,\nextracted from pre-trained LLMs, as an innovative encoder layer for the direct\nprocessing of visual tokens. This strategy represents a significant departure\nfrom the standard multi-modal vision-language frameworks, which typically hinge\non language-driven prompts and inputs. 
We found that these LLMs could boost\nperformance across a spectrum of biomedical imaging applications, including\nboth 2D and 3D visual classification tasks, serving as plug-and-play boosters.\nMore interestingly, as a byproduct, we found that the proposed framework\nachieved superior performance, setting new state-of-the-art results on\nextensive, standardized datasets in MedMNIST-2D and 3D. Through this work, we\naim to open new avenues for employing LLMs in biomedical imaging and enriching\nthe understanding of their potential in this specialized domain.\n","authors":["Zhixin Lai","Jing Wu","Suiyao Chen","Yucheng Zhou","Naira Hovakimyan"],"pdf_url":"https://arxiv.org/pdf/2403.17343v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18211v1","updated":"2024-03-27T02:42:52Z","published":"2024-03-27T02:42:52Z","title":"NeuroPictor: Refining fMRI-to-Image Reconstruction via Multi-individual\n Pretraining and Multi-level Modulation","summary":" Recent fMRI-to-image approaches mainly focused on associating fMRI signals\nwith specific conditions of pre-trained diffusion models. These approaches,\nwhile producing high-quality images, capture only a limited aspect of the\ncomplex information in fMRI signals and offer little detailed control over\nimage creation. In contrast, this paper proposes to directly modulate the\ngeneration process of diffusion models using fMRI signals. Our approach,\nNeuroPictor, divides the fMRI-to-image process into three steps: i) fMRI\ncalibrated-encoding, to tackle multi-individual pre-training for a shared\nlatent space to minimize individual difference and enable the subsequent\ncross-subject training; ii) fMRI-to-image cross-subject pre-training,\nperceptually learning to guide diffusion model with high- and low-level\nconditions across different individuals; iii) fMRI-to-image single-subject\nrefining, similar with step ii but focus on adapting to particular individual.\nNeuroPictor extracts high-level semantic features from fMRI signals that\ncharacterizing the visual stimulus and incrementally fine-tunes the diffusion\nmodel with a low-level manipulation network to provide precise structural\ninstructions. By training with over 60,000 fMRI-image pairs from various\nindividuals, our model enjoys superior fMRI-to-image decoding capacity,\nparticularly in the within-subject setting, as evidenced in benchmark datasets.\nProject page: https://jingyanghuo.github.io/neuropictor/.\n","authors":["Jingyang Huo","Yikai Wang","Xuelin Qian","Yun Wang","Chong Li","Jianfeng Feng","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2403.18211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18208v1","updated":"2024-03-27T02:39:23Z","published":"2024-03-27T02:39:23Z","title":"An Evolutionary Network Architecture Search Framework with Adaptive\n Multimodal Fusion for Hand Gesture Recognition","summary":" Hand gesture recognition (HGR) based on multimodal data has attracted\nconsiderable attention owing to its great potential in applications. Various\nmanually designed multimodal deep networks have performed well in multimodal\nHGR (MHGR), but most of existing algorithms require a lot of expert experience\nand time-consuming manual trials. To address these issues, we propose an\nevolutionary network architecture search framework with the adaptive multimodel\nfusion (AMF-ENAS). 
Specifically, we design an encoding space that\nsimultaneously considers fusion positions and ratios of the multimodal data,\nallowing for the automatic construction of multimodal networks with different\narchitectures through decoding. Additionally, we consider three input streams\ncorresponding to intra-modal surface electromyography (sEMG), intra-modal\naccelerometer (ACC), and inter-modal sEMG-ACC. To automatically adapt to\nvarious datasets, the ENAS framework is designed to automatically search a MHGR\nnetwork with appropriate fusion positions and ratios. To the best of our\nknowledge, this is the first time that ENAS has been utilized in MHGR to tackle\nissues related to the fusion position and ratio of multimodal data.\nExperimental results demonstrate that AMF-ENAS achieves state-of-the-art\nperformance on the Ninapro DB2, DB3, and DB7 datasets.\n","authors":["Yizhang Xia","Shihao Song","Zhanglu Hou","Junwen Xu","Juan Zou","Yuan Liu","Shengxiang Yang"],"pdf_url":"https://arxiv.org/pdf/2403.18208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18207v1","updated":"2024-03-27T02:35:36Z","published":"2024-03-27T02:35:36Z","title":"Road Obstacle Detection based on Unknown Objectness Scores","summary":" The detection of unknown traffic obstacles is vital to ensure safe autonomous\ndriving. The standard object-detection methods cannot identify unknown objects\nthat are not included under predefined categories. This is because\nobject-detection methods are trained to assign a background label to pixels\ncorresponding to the presence of unknown objects. To address this problem, the\npixel-wise anomaly-detection approach has attracted increased research\nattention. Anomaly-detection techniques, such as uncertainty estimation and\nperceptual difference from reconstructed images, make it possible to identify\npixels of unknown objects as out-of-distribution (OoD) samples. However, when\napplied to images with many unknowns and complex components, such as driving\nscenes, these methods often exhibit unstable performance. The purpose of this\nstudy is to achieve stable performance for detecting unknown objects by\nincorporating the object-detection fashions into the pixel-wise anomaly\ndetection methods. To achieve this goal, we adopt a semantic-segmentation\nnetwork with a sigmoid head that simultaneously provides pixel-wise anomaly\nscores and objectness scores. Our experimental results show that the objectness\nscores play an important role in improving the detection performance. Based on\nthese results, we propose a novel anomaly score by integrating these two\nscores, which we term as unknown objectness score. Quantitative evaluations\nshow that the proposed method outperforms state-of-the-art methods when applied\nto the publicly available datasets.\n","authors":["Chihiro Noguchi","Toshiaki Ohgushi","Masao Yamanaka"],"pdf_url":"https://arxiv.org/pdf/2403.18207v1.pdf","comment":"ICRA 2024"},{"id":"http://arxiv.org/abs/2403.10066v3","updated":"2024-03-27T02:25:51Z","published":"2024-03-15T07:16:07Z","title":"Contrastive Pre-Training with Multi-View Fusion for No-Reference Point\n Cloud Quality Assessment","summary":" No-reference point cloud quality assessment (NR-PCQA) aims to automatically\nevaluate the perceptual quality of distorted point clouds without available\nreference, which have achieved tremendous improvements due to the utilization\nof deep neural networks. 
However, learning-based NR-PCQA methods suffer from\nthe scarcity of labeled data and usually perform suboptimally in terms of\ngeneralization. To solve the problem, we propose a novel contrastive\npre-training framework tailored for PCQA (CoPA), which enables the pre-trained\nmodel to learn quality-aware representations from unlabeled data. To obtain\nanchors in the representation space, we project point clouds with different\ndistortions into images and randomly mix their local patches to form mixed\nimages with multiple distortions. Utilizing the generated anchors, we constrain\nthe pre-training process via a quality-aware contrastive loss following the\nphilosophy that perceptual quality is closely related to both content and\ndistortion. Furthermore, in the model fine-tuning stage, we propose a\nsemantic-guided multi-view fusion module to effectively integrate the features\nof projected images from multiple perspectives. Extensive experiments show that\nour method outperforms the state-of-the-art PCQA methods on popular benchmarks.\nFurther investigations demonstrate that CoPA can also benefit existing\nlearning-based PCQA models.\n","authors":["Ziyu Shan","Yujie Zhang","Qi Yang","Haichen Yang","Yiling Xu","Jenq-Neng Hwang","Xiaozhong Xu","Shan Liu"],"pdf_url":"https://arxiv.org/pdf/2403.10066v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18201v1","updated":"2024-03-27T02:24:00Z","published":"2024-03-27T02:24:00Z","title":"Few-shot Online Anomaly Detection and Segmentation","summary":" Detecting anomaly patterns from images is a crucial artificial intelligence\ntechnique in industrial applications. Recent research in this domain has\nemphasized the necessity of a large volume of training data, overlooking the\npractical scenario where, post-deployment of the model, unlabeled data\ncontaining both normal and abnormal samples can be utilized to enhance the\nmodel's performance. Consequently, this paper focuses on addressing the\nchallenging yet practical few-shot online anomaly detection and segmentation\n(FOADS) task. Under the FOADS framework, models are trained on a few-shot\nnormal dataset, followed by inspection and improvement of their capabilities by\nleveraging unlabeled streaming data containing both normal and abnormal samples\nsimultaneously.\n To tackle this issue, we propose modeling the feature distribution of normal\nimages using a Neural Gas network, which offers the flexibility to adapt the\ntopology structure to identify outliers in the data flow. In order to achieve\nimproved performance with limited training samples, we employ multi-scale\nfeature embedding extracted from a CNN pre-trained on ImageNet to obtain a\nrobust representation. Furthermore, we introduce an algorithm that can\nincrementally update parameters without the need to store previous samples.\nComprehensive experimental results demonstrate that our method can achieve\nsubstantial performance under the FOADS setting, while ensuring that the time\ncomplexity remains within an acceptable range on MVTec AD and BTAD datasets.\n","authors":["Shenxing Wei","Xing Wei","Zhiheng Ma","Songlin Dong","Shaochen Zhang","Yihong Gong"],"pdf_url":"https://arxiv.org/pdf/2403.18201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00154v2","updated":"2024-03-27T02:21:03Z","published":"2024-02-29T22:11:20Z","title":"LLMs in Political Science: Heralding a New Era of Visual Analysis","summary":" Interest is increasing among political scientists in leveraging the extensive\ninformation available in images. 
However, the challenge of interpreting these\nimages lies in the need for specialized knowledge in computer vision and access\nto specialized hardware. As a result, image analysis has been limited to a\nrelatively small group within the political science community. This landscape\ncould potentially change thanks to the rise of large language models (LLMs).\nThis paper aims to raise awareness of the feasibility of using Gemini for image\ncontent analysis. A retrospective analysis was conducted on a corpus of 688\nimages. Content reports were elicited from Gemini for each image and then\nmanually evaluated by the authors. We find that Gemini is highly accurate in\nperforming object detection, which is arguably the most common and fundamental\ntask in image analysis for political scientists. Equally important, we show\nthat it is easy to implement as the entire command consists of a single prompt\nin natural language; it is fast to run and should meet the time budget of most\nresearchers; and it is free to use and does not require any specialized\nhardware. In addition, we illustrate how political scientists can leverage\nGemini for other image understanding tasks, including face identification,\nsentiment analysis, and caption generation. Our findings suggest that Gemini\nand other similar LLMs have the potential to drastically stimulate and\naccelerate image research in political science and social sciences more\nbroadly.\n","authors":["Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2403.00154v2.pdf","comment":"7 pages, 3 tables"},{"id":"http://arxiv.org/abs/2403.18198v1","updated":"2024-03-27T02:16:04Z","published":"2024-03-27T02:16:04Z","title":"Generative Medical Segmentation","summary":" Rapid advancements in medical image segmentation performance have been\nsignificantly driven by the development of Convolutional Neural Networks (CNNs)\nand Vision Transformers (ViTs). However, these models introduce high\ncomputational demands and often have limited ability to generalize across\ndiverse medical imaging datasets. In this manuscript, we introduce Generative\nMedical Segmentation (GMS), a novel approach leveraging a generative model for\nimage segmentation. Concretely, GMS employs a robust pre-trained Variational\nAutoencoder (VAE) to derive latent representations of both images and masks,\nfollowed by a mapping model that learns the transition from image to mask in\nthe latent space. This process culminates in generating a precise segmentation\nmask within the image space using the pre-trained VAE decoder. The design of\nGMS leads to fewer learnable parameters in the model, resulting in a reduced\ncomputational burden and enhanced generalization capability. Our extensive\nexperimental analysis across five public datasets in different medical imaging\ndomains demonstrates GMS outperforms existing discriminative segmentation\nmodels and has remarkable domain generalization. Our experiments suggest GMS\ncould set a new benchmark for medical image segmentation, offering a scalable\nand effective solution. 
GMS implementation and model weights are available at\nhttps://github.com/King-HAW/GMS.\n","authors":["Jiayu Huo","Xi Ouyang","Sébastien Ourselin","Rachel Sparks"],"pdf_url":"https://arxiv.org/pdf/2403.18198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18196v1","updated":"2024-03-27T02:13:20Z","published":"2024-03-27T02:13:20Z","title":"Looking Beyond What You See: An Empirical Analysis on Subgroup\n Intersectional Fairness for Multi-label Chest X-ray Classification Using\n Social Determinants of Racial Health Inequities","summary":" There has been significant progress in implementing deep learning models in\ndisease diagnosis using chest X-rays. Despite these advancements, inherent\nbiases in these models can lead to disparities in prediction accuracy across\nprotected groups. In this study, we propose a framework to achieve accurate\ndiagnostic outcomes and ensure fairness across intersectional groups in\nhigh-dimensional chest X-ray multi-label classification. Transcending\ntraditional protected attributes, we consider complex interactions within\nsocial determinants, enabling a more granular benchmark and evaluation of\nfairness. We present a simple and robust method that involves retraining the\nlast classification layer of pre-trained models using a balanced dataset across\ngroups. Additionally, we account for fairness constraints and integrate\nclass-balanced fine-tuning for multi-label settings. The evaluation of our\nmethod on the MIMIC-CXR dataset demonstrates that our framework achieves an\noptimal tradeoff between accuracy and fairness compared to baseline methods.\n","authors":["Dana Moukheiber","Saurabh Mahindre","Lama Moukheiber","Mira Moukheiber","Mingchen Gao"],"pdf_url":"https://arxiv.org/pdf/2403.18196v1.pdf","comment":"ICCV CVAMD 2023"},{"id":"http://arxiv.org/abs/2403.18193v1","updated":"2024-03-27T02:06:25Z","published":"2024-03-27T02:06:25Z","title":"Middle Fusion and Multi-Stage, Multi-Form Prompts for Robust RGB-T\n Tracking","summary":" RGB-T tracking, a vital downstream task of object tracking, has made\nremarkable progress in recent years. Yet, it remains hindered by two major\nchallenges: 1) the trade-off between performance and efficiency; 2) the\nscarcity of training data. To address the latter challenge, some recent methods\nemploy prompts to fine-tune pre-trained RGB tracking models and leverage\nupstream knowledge in a parameter-efficient manner. However, these methods\ninadequately explore modality-independent patterns and disregard the dynamic\nreliability of different modalities in open scenarios. We propose M3PT, a novel\nRGB-T prompt tracking method that leverages middle fusion and multi-modal and\nmulti-stage visual prompts to overcome these challenges. We pioneer the use of\nthe middle fusion framework for RGB-T tracking, which achieves a balance\nbetween performance and efficiency. Furthermore, we incorporate the pre-trained\nRGB tracking model into the framework and utilize multiple flexible prompt\nstrategies to adapt the pre-trained model to the comprehensive exploration of\nuni-modal patterns and the improved modeling of fusion-modal features,\nharnessing the potential of prompt learning in RGB-T tracking. 
Our method\noutperforms the state-of-the-art methods on four challenging benchmarks, while\nattaining 46.1 fps inference speed.\n","authors":["Qiming Wang","Yongqiang Bai","Hongxing Song"],"pdf_url":"https://arxiv.org/pdf/2403.18193v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00211v2","updated":"2024-03-27T01:50:06Z","published":"2024-03-01T01:07:40Z","title":"Trustworthy Self-Attention: Enabling the Network to Focus Only on the\n Most Relevant References","summary":" The prediction of optical flow for occluded points is still a difficult\nproblem that has not yet been solved. Recent methods use self-attention to find\nrelevant non-occluded points as references for estimating the optical flow of\noccluded points based on the assumption of self-similarity. However, they rely\non visual features of a single image and weak constraints, which are not\nsufficient to constrain the trained network to focus on erroneous and weakly\nrelevant reference points. We make full use of online occlusion recognition\ninformation to construct occlusion extended visual features and two strong\nconstraints, allowing the network to learn to focus only on the most relevant\nreferences without requiring occlusion ground truth to participate in the\ntraining of the network. Our method adds very few network parameters to the\noriginal framework, making it very lightweight. Extensive experiments show that\nour model has the greatest cross-dataset generalization. Our method achieves\nmuch greater error reduction, 18.6%, 16.2%, and 20.1% for all points,\nnon-occluded points, and occluded points respectively from the state-of-the-art\nGMA-based method, MATCHFlow(GMA), on Sintel Albedo pass. Furthermore, our model\nachieves state-of-the-art performance on the Sintel benchmarks, ranking \\#1\namong all published methods on Sintel clean pass. The code will be open-source.\n","authors":["Yu Jing","Tan Yujuan","Ren Ao","Liu Duo"],"pdf_url":"https://arxiv.org/pdf/2403.00211v2.pdf","comment":"Correct Figure 1"},{"id":"http://arxiv.org/abs/2403.18187v1","updated":"2024-03-27T01:40:21Z","published":"2024-03-27T01:40:21Z","title":"LayoutFlow: Flow Matching for Layout Generation","summary":" Finding a suitable layout represents a crucial task for diverse applications\nin graphic design. Motivated by simpler and smoother sampling trajectories, we\nexplore the use of Flow Matching as an alternative to current diffusion-based\nlayout generation models. Specifically, we propose LayoutFlow, an efficient\nflow-based model capable of generating high-quality layouts. Instead of\nprogressively denoising the elements of a noisy layout, our method learns to\ngradually move, or flow, the elements of an initial sample until it reaches its\nfinal prediction. In addition, we employ a conditioning scheme that allows us\nto handle various generation tasks with varying degrees of conditioning with a\nsingle model. Empirically, LayoutFlow performs on par with state-of-the-art\nmodels while being significantly faster.\n","authors":["Julian Jorge Andrade Guerreiro","Naoto Inoue","Kento Masui","Mayu Otani","Hideki Nakayama"],"pdf_url":"https://arxiv.org/pdf/2403.18187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09069v2","updated":"2024-03-27T01:32:10Z","published":"2024-03-14T03:21:33Z","title":"Dyadic Interaction Modeling for Social Behavior Generation","summary":" Human-human communication is like a delicate dance where listeners and\nspeakers concurrently interact to maintain conversational dynamics. 
Hence, an\neffective model for generating listener nonverbal behaviors requires\nunderstanding the dyadic context and interaction. In this paper, we present an\neffective framework for creating 3D facial motions in dyadic interactions.\nExisting work consider a listener as a reactive agent with reflexive behaviors\nto the speaker's voice and facial motions. The heart of our framework is Dyadic\nInteraction Modeling (DIM), a pre-training approach that jointly models\nspeakers' and listeners' motions through masking and contrastive learning to\nlearn representations that capture the dyadic context. To enable the generation\nof non-deterministic behaviors, we encode both listener and speaker motions\ninto discrete latent representations, through VQ-VAE. The pre-trained model is\nfurther fine-tuned for motion generation. Extensive experiments demonstrate the\nsuperiority of our framework in generating listener motions, establishing a new\nstate-of-the-art according to the quantitative measures capturing the diversity\nand realism of generated motions. Qualitative results demonstrate the superior\ncapabilities of the proposed approach in generating diverse and realistic\nexpressions, eye blinks and head gestures.\n","authors":["Minh Tran","Di Chang","Maksim Siniukov","Mohammad Soleymani"],"pdf_url":"https://arxiv.org/pdf/2403.09069v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18186v1","updated":"2024-03-27T01:28:36Z","published":"2024-03-27T01:28:36Z","title":"Don't Look into the Dark: Latent Codes for Pluralistic Image Inpainting","summary":" We present a method for large-mask pluralistic image inpainting based on the\ngenerative framework of discrete latent codes. Our method learns latent priors,\ndiscretized as tokens, by only performing computations at the visible locations\nof the image. This is realized by a restrictive partial encoder that predicts\nthe token label for each visible block, a bidirectional transformer that infers\nthe missing labels by only looking at these tokens, and a dedicated synthesis\nnetwork that couples the tokens with the partial image priors to generate\ncoherent and pluralistic complete image even under extreme mask settings.\nExperiments on public benchmarks validate our design choices as the proposed\nmethod outperforms strong baselines in both visual quality and diversity\nmetrics.\n","authors":["Haiwei Chen","Yajie Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.18186v1.pdf","comment":"cvpr 2024"},{"id":"http://arxiv.org/abs/2403.18180v1","updated":"2024-03-27T01:15:05Z","published":"2024-03-27T01:15:05Z","title":"Multi-Layer Dense Attention Decoder for Polyp Segmentation","summary":" Detecting and segmenting polyps is crucial for expediting the diagnosis of\ncolon cancer. This is a challenging task due to the large variations of polyps\nin color, texture, and lighting conditions, along with subtle differences\nbetween the polyp and its surrounding area. Recently, vision Transformers have\nshown robust abilities in modeling global context for polyp segmentation.\nHowever, they face two major limitations: the inability to learn local\nrelations among multi-level layers and inadequate feature aggregation in the\ndecoder. To address these issues, we propose a novel decoder architecture aimed\nat hierarchically aggregating locally enhanced multi-level dense features.\nSpecifically, we introduce a novel module named Dense Attention Gate (DAG),\nwhich adaptively fuses all previous layers' features to establish local feature\nrelations among all layers. 
Furthermore, we propose a novel nested decoder\narchitecture that hierarchically aggregates decoder features, thereby enhancing\nsemantic features. We incorporate our novel dense decoder with the PVT backbone\nnetwork and conduct evaluations on five polyp segmentation datasets: Kvasir,\nCVC-300, CVC-ColonDB, CVC-ClinicDB, and ETIS. Our experiments and comparisons\nwith nine competing segmentation models demonstrate that the proposed\narchitecture achieves state-of-the-art performance and outperforms the previous\nmodels on four datasets. The source code is available at:\nhttps://github.com/krushi1992/Dense-Decoder.\n","authors":["Krushi Patel","Fengjun Li","Guanghui Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18178v1","updated":"2024-03-27T01:12:31Z","published":"2024-03-27T01:12:31Z","title":"Online Embedding Multi-Scale CLIP Features into 3D Maps","summary":" This study introduces a novel approach to online embedding of multi-scale\nCLIP (Contrastive Language-Image Pre-Training) features into 3D maps. By\nharnessing CLIP, this methodology surpasses the constraints of conventional\nvocabulary-limited methods and enables the incorporation of semantic\ninformation into the resultant maps. While recent approaches have explored the\nembedding of multi-modal features in maps, they often impose significant\ncomputational costs, lacking practicality for exploring unfamiliar environments\nin real time. Our approach tackles these challenges by efficiently computing\nand embedding multi-scale CLIP features, thereby facilitating the exploration\nof unfamiliar environments through real-time map generation. Moreover, embedding the\nCLIP features into the resultant maps makes offline retrieval via\nlinguistic queries feasible. In essence, our approach simultaneously achieves\nreal-time object search and mapping of unfamiliar environments. Additionally,\nwe propose a zero-shot object-goal navigation system based on our mapping\napproach, and we validate its efficacy through object-goal navigation, offline\nobject retrieval, and multi-object-goal navigation in both simulated\nenvironments and real robot experiments. The findings demonstrate that our\nmethod not only exhibits swifter performance than state-of-the-art mapping\nmethods but also surpasses them in terms of the success rate of object-goal\nnavigation tasks.\n","authors":["Shun Taguchi","Hideki Deguchi"],"pdf_url":"https://arxiv.org/pdf/2403.18178v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.13729v2","updated":"2024-03-27T00:51:01Z","published":"2024-02-21T11:46:16Z","title":"Hybrid Video Diffusion Models with 2D Triplane and 3D Wavelet\n Representation","summary":" Generating high-quality videos that synthesize desired realistic content is a\nchallenging task due to the intricate high-dimensionality and complexity of\nvideos. Several recent diffusion-based methods have shown comparable\nperformance by compressing videos to a lower-dimensional latent space, using\ntraditional video autoencoder architectures. However, such methods that employ\nstandard frame-wise 2D and 3D convolutions fail to fully exploit the\nspatio-temporal nature of videos. To address this issue, we propose a novel\nhybrid video diffusion model, called HVDM, which can capture spatio-temporal\ndependencies more effectively. 
The HVDM is trained by a hybrid video\nautoencoder which extracts a disentangled representation of the video\nincluding: (i) global context information captured by a 2D projected latent,\n(ii) local volume information captured by 3D convolutions with wavelet\ndecomposition, and (iii) frequency information for improving the video\nreconstruction. Based on this disentangled representation, our hybrid\nautoencoder provides a more comprehensive video latent, enriching the generated\nvideos with fine structures and details. Experiments on video generation\nbenchmarks (UCF101, SkyTimelapse, and TaiChi) demonstrate that the proposed\napproach achieves state-of-the-art video generation quality, showing a wide\nrange of video applications (e.g., long video generation, image-to-video, and\nvideo dynamics control).\n","authors":["Kihong Kim","Haneol Lee","Jihye Park","Seyeon Kim","Kwanghee Lee","Seungryong Kim","Jaejun Yoo"],"pdf_url":"https://arxiv.org/pdf/2402.13729v2.pdf","comment":"17 pages, 13 figures"},{"id":"http://arxiv.org/abs/2401.17098v2","updated":"2024-03-27T00:46:26Z","published":"2024-01-30T15:29:32Z","title":"Deep Learning-Driven Approach for Handwritten Chinese Character\n Classification","summary":" Handwritten character recognition (HCR) is a challenging problem for machine\nlearning researchers. Unlike printed text data, handwritten character datasets\nhave more variation due to human-introduced bias. With numerous unique\ncharacter classes present, some data, such as Logographic Scripts or\nSino-Korean character sequences, bring new complications to the HCR problem.\nThe classification task on such datasets requires the model to learn\nhigh-complexity details of the images that share similar features. With recent\nadvances in computational resource availability and further computer vision\ntheory development, some research teams have effectively addressed the arising\nchallenges. Although known for achieving high accuracy while keeping the number\nof parameters small, many common approaches are still not generalizable and use\ndataset-specific solutions to achieve better results. Due to their complex\nstructure, existing methods frequently fail to gain popularity. This\npaper proposes a highly scalable approach for detailed character image\nclassification by introducing the model architecture, data preprocessing steps,\nand testing design instructions. We also perform experiments to compare the\nperformance of our method with that of existing ones to show the improvements\nachieved.\n","authors":["Boris Kriuk","Fedor Kriuk"],"pdf_url":"https://arxiv.org/pdf/2401.17098v2.pdf","comment":"30 pages, 9 figures, 2 tables, preprint v2"},{"id":"http://arxiv.org/abs/2403.15931v3","updated":"2024-03-27T23:57:47Z","published":"2024-03-23T20:30:28Z","title":"X-Portrait: Expressive Portrait Animation with Hierarchical Motion\n Attention","summary":" We propose X-Portrait, an innovative conditional diffusion model tailored for\ngenerating expressive and temporally coherent portrait animation. Specifically,\ngiven a single portrait as appearance reference, we aim to animate it with\nmotion derived from a driving video, capturing both highly dynamic and subtle\nfacial expressions along with wide-range head movements. At its core, we\nleverage the generative prior of a pre-trained diffusion model as the rendering\nbackbone, while achieving fine-grained head pose and expression control with\nnovel controlling signals within the framework of ControlNet. 
In contrast to\nconventional coarse explicit controls such as facial landmarks, our motion\ncontrol module is learned to interpret the dynamics directly from the original\ndriving RGB inputs. The motion accuracy is further enhanced with a patch-based\nlocal control module that effectively enhance the motion attention to\nsmall-scale nuances like eyeball positions. Notably, to mitigate the identity\nleakage from the driving signals, we train our motion control modules with\nscaling-augmented cross-identity images, ensuring maximized disentanglement\nfrom the appearance reference modules. Experimental results demonstrate the\nuniversal effectiveness of X-Portrait across a diverse range of facial\nportraits and expressive driving sequences, and showcase its proficiency in\ngenerating captivating portrait animations with consistently maintained\nidentity characteristics.\n","authors":["You Xie","Hongyi Xu","Guoxian Song","Chao Wang","Yichun Shi","Linjie Luo"],"pdf_url":"https://arxiv.org/pdf/2403.15931v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12627v2","updated":"2024-03-27T23:54:26Z","published":"2023-06-22T01:33:47Z","title":"Targeted collapse regularized autoencoder for anomaly detection: black\n hole at the center","summary":" Autoencoders have been extensively used in the development of recent anomaly\ndetection techniques. The premise of their application is based on the notion\nthat after training the autoencoder on normal training data, anomalous inputs\nwill exhibit a significant reconstruction error. Consequently, this enables a\nclear differentiation between normal and anomalous samples. In practice,\nhowever, it is observed that autoencoders can generalize beyond the normal\nclass and achieve a small reconstruction error on some of the anomalous\nsamples. To improve the performance, various techniques propose additional\ncomponents and more sophisticated training procedures. In this work, we propose\na remarkably straightforward alternative: instead of adding neural network\ncomponents, involved computations, and cumbersome training, we complement the\nreconstruction loss with a computationally light term that regulates the norm\nof representations in the latent space. The simplicity of our approach\nminimizes the requirement for hyperparameter tuning and customization for new\napplications which, paired with its permissive data modality constraint,\nenhances the potential for successful adoption across a broad range of\napplications. We test the method on various visual and tabular benchmarks and\ndemonstrate that the technique matches and frequently outperforms more complex\nalternatives. We further demonstrate that implementing this idea in the context\nof state-of-the-art methods can further improve their performance. We also\nprovide a theoretical analysis and numerical simulations that help demonstrate\nthe underlying process that unfolds during training and how it helps with\nanomaly detection. 
This mitigates the black-box nature of autoencoder-based\nanomaly detection algorithms and offers an avenue for further investigation of\nadvantages, fail cases, and potential new directions.\n","authors":["Amin Ghafourian","Huanyi Shui","Devesh Upadhyay","Rajesh Gupta","Dimitar Filev","Iman Soltani Bozchalooi"],"pdf_url":"https://arxiv.org/pdf/2306.12627v2.pdf","comment":"18 pages, 4 figures, 8 tables"},{"id":"http://arxiv.org/abs/2311.13099v2","updated":"2024-03-27T23:49:07Z","published":"2023-11-22T01:58:26Z","title":"PIE-NeRF: Physics-based Interactive Elastodynamics with NeRF","summary":" We show that physics-based simulations can be seamlessly integrated with NeRF\nto generate high-quality elastodynamics of real-world objects. Unlike existing\nmethods, we discretize nonlinear hyperelasticity in a meshless way, obviating\nthe necessity for intermediate auxiliary shape proxies like a tetrahedral mesh\nor voxel grid. A quadratic generalized moving least square (Q-GMLS) is employed\nto capture nonlinear dynamics and large deformation on the implicit model. Such\nmeshless integration enables versatile simulations of complex and codimensional\nshapes. We adaptively place the least-square kernels according to the NeRF\ndensity field to significantly reduce the complexity of the nonlinear\nsimulation. As a result, physically realistic animations can be conveniently\nsynthesized using our method for a wide range of hyperelastic materials at an\ninteractive rate. For more information, please visit our project page at\nhttps://fytalon.github.io/pienerf/.\n","authors":["Yutao Feng","Yintong Shang","Xuan Li","Tianjia Shao","Chenfanfu Jiang","Yin Yang"],"pdf_url":"https://arxiv.org/pdf/2311.13099v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17729v2","updated":"2024-03-27T23:33:15Z","published":"2024-02-27T18:01:59Z","title":"Towards Fairness-Aware Adversarial Learning","summary":" Although adversarial training (AT) has proven effective in enhancing the\nmodel's robustness, the recently revealed issue of fairness in robustness has\nnot been well addressed, i.e. the robust accuracy varies significantly among\ndifferent categories. In this paper, instead of uniformly evaluating the\nmodel's average class performance, we delve into the issue of robust fairness,\nby considering the worst-case distribution across various classes. We propose a\nnovel learning paradigm, named Fairness-Aware Adversarial Learning (FAAL). As a\ngeneralization of conventional AT, we re-define the problem of adversarial\ntraining as a min-max-max framework, to ensure both robustness and fairness of\nthe trained model. Specifically, by taking advantage of distributional robust\noptimization, our method aims to find the worst distribution among different\ncategories, and the solution is guaranteed to obtain the upper bound\nperformance with high probability. In particular, FAAL can fine-tune an unfair\nrobust model to be fair within only two epochs, without compromising the\noverall clean and robust accuracies. 
Extensive experiments on various image\ndatasets validate the superior performance and efficiency of the proposed FAAL\ncompared to other state-of-the-art methods.\n","authors":["Yanghao Zhang","Tianle Zhang","Ronghui Mu","Xiaowei Huang","Wenjie Ruan"],"pdf_url":"https://arxiv.org/pdf/2402.17729v2.pdf","comment":"This work will appear in the CVPR 2024 conference proceedings"},{"id":"http://arxiv.org/abs/2203.13883v6","updated":"2024-03-27T23:27:58Z","published":"2022-03-25T19:45:33Z","title":"Multi-modal Misinformation Detection: Approaches, Challenges and\n Opportunities","summary":" As social media platforms are evolving from text-based forums into\nmulti-modal environments, the nature of misinformation in social media is also\ntransforming accordingly. Taking advantage of the fact that visual modalities\nsuch as images and videos are more favorable and attractive to the users and\ntextual contents are sometimes skimmed carelessly, misinformation spreaders\nhave recently targeted contextual connections between the modalities e.g., text\nand image. Hence many researchers have developed automatic techniques for\ndetecting possible cross-modal discordance in web-based content. We analyze,\ncategorize and identify existing approaches in addition to challenges and\nshortcomings they face in order to unearth new research opportunities in the\nfield of multi-modal misinformation detection.\n","authors":["Sara Abdali","Sina shaham","Bhaskar Krishnamachari"],"pdf_url":"https://arxiv.org/pdf/2203.13883v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03160v2","updated":"2024-03-27T22:58:34Z","published":"2023-12-05T22:04:49Z","title":"HybridNeRF: Efficient Neural Rendering via Adaptive Volumetric Surfaces","summary":" Neural radiance fields provide state-of-the-art view synthesis quality but\ntend to be slow to render. One reason is that they make use of volume\nrendering, thus requiring many samples (and model queries) per ray at render\ntime. Although this representation is flexible and easy to optimize, most\nreal-world objects can be modeled more efficiently with surfaces instead of\nvolumes, requiring far fewer samples per ray. This observation has spurred\nconsiderable progress in surface representations such as signed distance\nfunctions, but these may struggle to model semi-opaque and thin structures. We\npropose a method, HybridNeRF, that leverages the strengths of both\nrepresentations by rendering most objects as surfaces while modeling the\n(typically) small fraction of challenging regions volumetrically. We evaluate\nHybridNeRF against the challenging Eyeful Tower dataset along with other\ncommonly used view synthesis datasets. When comparing to state-of-the-art\nbaselines, including recent rasterization-based approaches, we improve error\nrates by 15-30% while achieving real-time framerates (at least 36 FPS) for\nvirtual-reality resolutions (2Kx2K).\n","authors":["Haithem Turki","Vasu Agrawal","Samuel Rota Bulò","Lorenzo Porzi","Peter Kontschieder","Deva Ramanan","Michael Zollhöfer","Christian Richardt"],"pdf_url":"https://arxiv.org/pdf/2312.03160v2.pdf","comment":"CVPR 2024 Project page: https://haithemturki.com/hybrid-nerf/"},{"id":"http://arxiv.org/abs/2403.19046v1","updated":"2024-03-27T22:50:48Z","published":"2024-03-27T22:50:48Z","title":"LITA: Language Instructed Temporal-Localization Assistant","summary":" There has been tremendous progress in multimodal Large Language Models\n(LLMs). 
Recent works have extended these models to video input with promising\ninstruction following capabilities. However, an important missing piece is\ntemporal localization. These models cannot accurately answer the \"When?\"\nquestions. We identify three key aspects that limit their temporal localization\ncapabilities: (i) time representation, (ii) architecture, and (iii) data. We\naddress these shortcomings by proposing Language Instructed\nTemporal-Localization Assistant (LITA) with the following features: (1) We\nintroduce time tokens that encode timestamps relative to the video length to\nbetter represent time in videos. (2) We introduce SlowFast tokens in the\narchitecture to capture temporal information at fine temporal resolution. (3)\nWe emphasize temporal localization data for LITA. In addition to leveraging\nexisting video datasets with timestamps, we propose a new task, Reasoning\nTemporal Localization (RTL), along with the dataset, ActivityNet-RTL, for\nlearning and evaluating this task. Reasoning temporal localization requires\nboth the reasoning and temporal localization of Video LLMs. LITA demonstrates\nstrong performance on this challenging task, nearly doubling the temporal mean\nintersection-over-union (mIoU) of baselines. In addition, we show that our\nemphasis on temporal localization also substantially improves video-based text\ngeneration compared to existing Video LLMs, including a 36% relative\nimprovement of Temporal Understanding. Code is available at:\nhttps://github.com/NVlabs/LITA\n","authors":["De-An Huang","Shijia Liao","Subhashree Radhakrishnan","Hongxu Yin","Pavlo Molchanov","Zhiding Yu","Jan Kautz"],"pdf_url":"https://arxiv.org/pdf/2403.19046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19043v1","updated":"2024-03-27T22:36:02Z","published":"2024-03-27T22:36:02Z","title":"Illicit object detection in X-ray images using Vision Transformers","summary":" Illicit object detection is a critical task performed at various\nhigh-security locations, including airports, train stations, subways, and\nports. The continuous and tedious work of examining thousands of X-ray images\nper hour can be mentally taxing. Thus, Deep Neural Networks (DNNs) can be used\nto automate the X-ray image analysis process, improve efficiency and alleviate\nthe security officers' inspection burden. The neural architectures typically\nutilized in relevant literature are Convolutional Neural Networks (CNNs), with\nVision Transformers (ViTs) rarely employed. In order to address this gap, this\npaper conducts a comprehensive evaluation of relevant ViT architectures on\nillicit item detection in X-ray images. This study utilizes both Transformer\nand hybrid backbones, such as SWIN and NextViT, and detectors, such as DINO and\nRT-DETR. The results demonstrate the remarkable accuracy of the DINO\nTransformer detector in the low-data regime, the impressive real-time\nperformance of YOLOv8, and the effectiveness of the hybrid NextViT backbone.\n","authors":["Jorgen Cani","Ioannis Mademlis","Adamantia Anna Rebolledo Chrysochoou","Georgios Th. 
Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2403.19043v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.04291v2","updated":"2024-03-27T22:33:13Z","published":"2022-02-09T05:57:08Z","title":"L2B: Learning to Bootstrap Robust Models for Combating Label Noise","summary":" Deep neural networks have shown great success in representation learning.\nHowever, when learning with noisy labels (LNL), they can easily overfit and\nfail to generalize to new data. This paper introduces a simple and effective\nmethod, named Learning to Bootstrap (L2B), which enables models to bootstrap\nthemselves using their own predictions without being adversely affected by\nerroneous pseudo-labels. It achieves this by dynamically adjusting the\nimportance weight between real observed and generated labels, as well as\nbetween different samples through meta-learning. Unlike existing instance\nreweighting methods, the key to our method lies in a new, versatile objective\nthat enables implicit relabeling concurrently, leading to significant\nimprovements without incurring additional costs.\n L2B offers several benefits over the baseline methods. It yields more robust\nmodels that are less susceptible to the impact of noisy labels by guiding the\nbootstrapping procedure more effectively. It better exploits the valuable\ninformation contained in corrupted instances by adapting the weights of both\ninstances and labels. Furthermore, L2B is compatible with existing LNL methods\nand delivers competitive results spanning natural and medical imaging tasks\nincluding classification and segmentation under both synthetic and real-world\nnoise. Extensive experiments demonstrate that our method effectively mitigates\nthe challenges of noisy labels, often necessitating few to no validation\nsamples, and is well generalized to other tasks such as image segmentation.\nThis not only positions it as a robust complement to existing LNL techniques\nbut also underscores its practical applicability. The code and models are\navailable at https://github.com/yuyinzhou/l2b.\n","authors":["Yuyin Zhou","Xianhang Li","Fengze Liu","Qingyue Wei","Xuxi Chen","Lequan Yu","Cihang Xie","Matthew P. Lungren","Lei Xing"],"pdf_url":"https://arxiv.org/pdf/2202.04291v2.pdf","comment":"CVPR 2024; code is available at https://github.com/yuyinzhou/l2b"},{"id":"http://arxiv.org/abs/2204.11970v3","updated":"2024-03-27T22:02:30Z","published":"2022-04-25T21:20:27Z","title":"Visual Acuity Prediction on Real-Life Patient Data Using a Machine\n Learning Based Multistage System","summary":" In ophthalmology, intravitreal operative medication therapy (IVOM) is a\nwidespread treatment for diseases related to the age-related macular\ndegeneration (AMD), the diabetic macular edema (DME), as well as the retinal\nvein occlusion (RVO). However, in real-world settings, patients often suffer\nfrom loss of vision on time scales of years despite therapy, whereas the\nprediction of the visual acuity (VA) and the earliest possible detection of\ndeterioration under real-life conditions is challenging due to heterogeneous\nand incomplete data. In this contribution, we present a workflow for the\ndevelopment of a research-compatible data corpus fusing different IT systems of\nthe department of ophthalmology of a German maximum care hospital. The\nextensive data corpus allows predictive statements of the expected progression\nof a patient and his or her VA in each of the three diseases. 
For the disease\nAMD, we found out a significant deterioration of the visual acuity over time.\nWithin our proposed multistage system, we subsequently classify the VA\nprogression into the three groups of therapy \"winners\", \"stabilizers\", and\n\"losers\" (WSL classification scheme). Our OCT biomarker classification using an\nensemble of deep neural networks results in a classification accuracy\n(F1-score) of over 98 %, enabling us to complete incomplete OCT documentations\nwhile allowing us to exploit them for a more precise VA modelling process. Our\nVA prediction requires at least four VA examinations and optionally OCT\nbiomarkers from the same time period to predict the VA progression within a\nforecasted time frame, whereas our prediction is currently restricted to IVOM /\nno therapy. We achieve a final prediction accuracy of 69 % in macro average\nF1-score, while being in the same range as the ophthalmologists with 57.8 and\n50 +- 10.7 % F1-score.\n","authors":["Tobias Schlosser","Frederik Beuth","Trixy Meyer","Arunodhayan Sampath Kumar","Gabriel Stolze","Olga Furashova","Katrin Engelmann","Danny Kowerko"],"pdf_url":"https://arxiv.org/pdf/2204.11970v3.pdf","comment":"Preprint for journal Scientific Reports (Springer)"},{"id":"http://arxiv.org/abs/2403.13171v2","updated":"2024-03-27T21:43:37Z","published":"2024-03-19T21:52:19Z","title":"LUWA Dataset: Learning Lithic Use-Wear Analysis on Microscopic Images","summary":" Lithic Use-Wear Analysis (LUWA) using microscopic images is an underexplored\nvision-for-science research area. It seeks to distinguish the worked material,\nwhich is critical for understanding archaeological artifacts, material\ninteractions, tool functionalities, and dental records. However, this\nchallenging task goes beyond the well-studied image classification problem for\ncommon objects. It is affected by many confounders owing to the complex wear\nmechanism and microscopic imaging, which makes it difficult even for human\nexperts to identify the worked material successfully. In this paper, we\ninvestigate the following three questions on this unique vision task for the\nfirst time:(i) How well can state-of-the-art pre-trained models (like DINOv2)\ngeneralize to the rarely seen domain? (ii) How can few-shot learning be\nexploited for scarce microscopic images? (iii) How do the ambiguous\nmagnification and sensing modality influence the classification accuracy? To\nstudy these, we collaborated with archaeologists and built the first\nopen-source and the largest LUWA dataset containing 23,130 microscopic images\nwith different magnifications and sensing modalities. Extensive experiments\nshow that existing pre-trained models notably outperform human experts but\nstill leave a large gap for improvements. Most importantly, the LUWA dataset\nprovides an underexplored opportunity for vision and learning communities and\ncomplements existing image classification problems on common objects.\n","authors":["Jing Zhang","Irving Fang","Juexiao Zhang","Hao Wu","Akshat Kaushik","Alice Rodriguez","Hanwen Zhao","Zhuo Zheng","Radu Iovita","Chen Feng"],"pdf_url":"https://arxiv.org/pdf/2403.13171v2.pdf","comment":"CVPR"},{"id":"http://arxiv.org/abs/2403.19026v1","updated":"2024-03-27T21:43:12Z","published":"2024-03-27T21:43:12Z","title":"Egocentric Scene-aware Human Trajectory Prediction","summary":" Wearable collaborative robots stand to assist human wearers who need fall\nprevention assistance or wear exoskeletons. 
Such a robot needs to be able to\npredict the ego motion of the wearer based on egocentric vision and the\nsurrounding scene. In this work, we leveraged body-mounted cameras and sensors\nto anticipate the trajectory of human wearers through complex surroundings. To\nfacilitate research in ego-motion prediction, we have collected a comprehensive\nwalking scene navigation dataset centered on the user's perspective. We present\na method to predict human motion conditioning on the surrounding static scene.\nOur method leverages a diffusion model to produce a distribution of potential\nfuture trajectories, taking into account the user's observation of the\nenvironment. We introduce a compact representation to encode the user's visual\nmemory of the surroundings, as well as an efficient sample-generating technique\nto speed up real-time inference of a diffusion model. We ablate our model and\ncompare it to baselines, and results show that our model outperforms existing\nmethods on key metrics of collision avoidance and trajectory mode coverage.\n","authors":["Weizhuo Wang","C. Karen Liu","Monroe Kennedy III"],"pdf_url":"https://arxiv.org/pdf/2403.19026v1.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2403.19022v1","updated":"2024-03-27T21:24:20Z","published":"2024-03-27T21:24:20Z","title":"WALT3D: Generating Realistic Training Data from Time-Lapse Imagery for\n Reconstructing Dynamic Objects under Occlusion","summary":" Current methods for 2D and 3D object understanding struggle with severe\nocclusions in busy urban environments, partly due to the lack of large-scale\nlabeled ground-truth annotations for learning occlusion. In this work, we\nintroduce a novel framework for automatically generating a large, realistic\ndataset of dynamic objects under occlusions using freely available time-lapse\nimagery. By leveraging off-the-shelf 2D (bounding box, segmentation, keypoint)\nand 3D (pose, shape) predictions as pseudo-groundtruth, unoccluded 3D objects\nare identified automatically and composited into the background in a clip-art\nstyle, ensuring realistic appearances and physically accurate occlusion\nconfigurations. The resulting clip-art image with pseudo-groundtruth enables\nefficient training of object reconstruction methods that are robust to\nocclusions. Our method demonstrates significant improvements in both 2D and 3D\nreconstruction, particularly in scenarios with heavily occluded objects like\nvehicles and people in urban scenes.\n","authors":["Khiem Vuong","N. Dinesh Reddy","Robert Tamburo","Srinivasa G. Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2403.19022v1.pdf","comment":"To appear in CVPR 2024"},{"id":"http://arxiv.org/abs/2309.13863v2","updated":"2024-03-27T21:15:27Z","published":"2023-09-25T04:27:06Z","title":"SuPerPM: A Large Deformation-Robust Surgical Perception Framework Based\n on Deep Point Matching Learned from Physical Constrained Simulation Data","summary":" Manipulation of tissue with surgical tools often results in large\ndeformations that current methods in tracking and reconstructing algorithms\nhave not effectively addressed. A major source of tracking errors during large\ndeformations stems from wrong data association between observed sensor\nmeasurements with previously tracked scene. To mitigate this issue, we present\na surgical perception framework, SuPerPM, that leverages learning-based\nnon-rigid point cloud matching for data association, thus accommodating larger\ndeformations. 
The learning models typically require training data with ground\ntruth point cloud correspondences, which is challenging or even impractical to\ncollect in surgical environments. Thus, for tuning the learning model, we\ngather endoscopic data of soft tissue being manipulated by a surgical robot and\nthen establish correspondences between point clouds at different time points to\nserve as ground truth. This was achieved by employing a position-based dynamics\n(PBD) simulation to ensure that the correspondences adhered to physical\nconstraints. The proposed framework is demonstrated on several challenging\nsurgical datasets that are characterized by large deformations, achieving\nsuperior performance over state-of-the-art surgical scene tracking algorithms.\n","authors":["Shan Lin","Albert J. Miao","Ali Alabiad","Fei Liu","Kaiyuan Wang","Jingpei Lu","Florian Richter","Michael C. Yip"],"pdf_url":"https://arxiv.org/pdf/2309.13863v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19002v1","updated":"2024-03-27T20:52:30Z","published":"2024-03-27T20:52:30Z","title":"Robust Active Speaker Detection in Noisy Environments","summary":" This paper addresses the issue of active speaker detection (ASD) in noisy\nenvironments and formulates a robust active speaker detection (rASD) problem.\nExisting ASD approaches leverage both audio and visual modalities, but\nnon-speech sounds in the surrounding environment can negatively impact\nperformance. To overcome this, we propose a novel framework that utilizes\naudio-visual speech separation as guidance to learn noise-free audio features.\nThese features are then utilized in an ASD model, and both tasks are jointly\noptimized in an end-to-end framework. Our proposed framework mitigates residual\nnoise and audio quality reduction issues that can occur in a naive cascaded\ntwo-stage framework that directly uses separated speech for ASD, and enables\nthe two tasks to be optimized simultaneously. To further enhance the robustness\nof the audio features and handle inherent speech noises, we propose a dynamic\nweighted loss approach to train the speech separator. We also collected a\nreal-world noise audio dataset to facilitate investigations. Experiments\ndemonstrate that non-speech audio noises significantly impact ASD models, and\nour proposed approach improves ASD performance in noisy environments. The\nframework is general and can be applied to different ASD approaches to improve\ntheir robustness. Our code, models, and data will be released.\n","authors":["Siva Sai Nagender Vasireddy","Chenxu Zhang","Xiaohu Guo","Yapeng Tian"],"pdf_url":"https://arxiv.org/pdf/2403.19002v1.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.19001v1","updated":"2024-03-27T20:51:02Z","published":"2024-03-27T20:51:02Z","title":"Cross--domain Fiber Cluster Shape Analysis for Language Performance\n Cognitive Score Prediction","summary":" Shape plays an important role in computer graphics, offering informative\nfeatures to convey an object's morphology and functionality. Shape analysis in\nbrain imaging can help interpret structural and functionality correlations of\nthe human brain. In this work, we investigate the shape of the brain's 3D white\nmatter connections and its potential predictive relationship to human cognitive\nfunction. We reconstruct brain connections as sequences of 3D points using\ndiffusion magnetic resonance imaging (dMRI) tractography. 
To describe each\nconnection, we extract 12 shape descriptors in addition to traditional dMRI\nconnectivity and tissue microstructure features. We introduce a novel\nframework, Shape--fused Fiber Cluster Transformer (SFFormer), that leverages a\nmulti-head cross-attention feature fusion module to predict subject-specific\nlanguage performance based on dMRI tractography. We assess the performance of\nthe method on a large dataset including 1065 healthy young adults. The results\ndemonstrate that both the transformer-based SFFormer model and its inter/intra\nfeature fusion with shape, microstructure, and connectivity are informative,\nand together, they improve the prediction of subject-specific language\nperformance scores. Overall, our results indicate that the shape of the brain's\nconnections is predictive of human language function.\n","authors":["Yui Lo","Yuqian Chen","Dongnan Liu","Wan Liu","Leo Zekelman","Fan Zhang","Yogesh Rathi","Nikos Makris","Alexandra J. Golby","Weidong Cai","Lauren J. O'Donnell"],"pdf_url":"https://arxiv.org/pdf/2403.19001v1.pdf","comment":"2 figures, 11 pages"},{"id":"http://arxiv.org/abs/2310.14344v2","updated":"2024-03-27T20:48:37Z","published":"2023-10-22T16:31:01Z","title":"What's in a Prior? Learned Proximal Networks for Inverse Problems","summary":" Proximal operators are ubiquitous in inverse problems, commonly appearing as\npart of algorithmic strategies to regularize problems that are otherwise\nill-posed. Modern deep learning models have been brought to bear for these\ntasks too, as in the framework of plug-and-play or deep unrolling, where they\nloosely resemble proximal operators. Yet, something essential is lost in\nemploying these purely data-driven approaches: there is no guarantee that a\ngeneral deep network represents the proximal operator of any function, nor is\nthere any characterization of the function for which the network might provide\nsome approximate proximal. This not only makes guaranteeing convergence of\niterative schemes challenging but, more fundamentally, complicates the analysis\nof what has been learned by these networks about their training data. Herein we\nprovide a framework to develop learned proximal networks (LPN), prove that they\nprovide exact proximal operators for a data-driven nonconvex regularizer, and\nshow how a new training strategy, dubbed proximal matching, provably promotes\nthe recovery of the log-prior of the true data distribution. Such LPN provide\ngeneral, unsupervised, expressive proximal operators that can be used for\ngeneral inverse problems with convergence guarantees. We illustrate our results\nin a series of cases of increasing complexity, demonstrating that these models\nnot only result in state-of-the-art performance, but provide a window into the\nresulting priors learned from data.\n","authors":["Zhenghan Fang","Sam Buchanan","Jeremias Sulam"],"pdf_url":"https://arxiv.org/pdf/2310.14344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18996v1","updated":"2024-03-27T20:30:01Z","published":"2024-03-27T20:30:01Z","title":"Envisioning MedCLIP: A Deep Dive into Explainability for Medical\n Vision-Language Models","summary":" Explaining Deep Learning models is becoming increasingly important in the\nface of daily emerging multimodal models, particularly in safety-critical\ndomains like medical imaging. However, the lack of detailed investigations into\nthe performance of explainability methods on these models is widening the gap\nbetween their development and safe deployment. 
In this work, we analyze the\nperformance of various explainable AI methods on a vision-language model,\nMedCLIP, to demystify its inner workings. We also provide a simple methodology\nto overcome the shortcomings of these methods. Our work offers a different new\nperspective on the explainability of a recent well-known VLM in the medical\ndomain and our assessment method is generalizable to other current and possible\nfuture VLMs.\n","authors":["Anees Ur Rehman Hashmi","Dwarikanath Mahapatra","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2403.18996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11505v2","updated":"2024-03-27T20:10:05Z","published":"2024-03-18T06:20:49Z","title":"COVID-19 detection from pulmonary CT scans using a novel EfficientNet\n with attention mechanism","summary":" Manual analysis and diagnosis of COVID-19 through the examination of Computed\nTomography (CT) images of the lungs can be time-consuming and result in errors,\nespecially given high volume of patients and numerous images per patient. So,\nwe address the need for automation of this task by developing a new deep\nlearning model-based pipeline. Our motivation was sparked by the CVPR Workshop\non \"Domain Adaptation, Explainability and Fairness in AI for Medical Image\nAnalysis\", more specifically, the \"COVID-19 Diagnosis Competition (DEF-AI-MIA\nCOV19D)\" under the same Workshop. This challenge provides an opportunity to\nassess our proposed pipeline for COVID-19 detection from CT scan images. The\nsame pipeline incorporates the original EfficientNet, but with an added\nAttention Mechanism: EfficientNet-AM. Also, unlike the traditional/past\npipelines, which relied on a pre-processing step, our pipeline takes the raw\nselected input images without any such step, except for an image-selection step\nto simply reduce the number of CT images required for training and/or testing.\nMoreover, our pipeline is computationally efficient, as, for example, it does\nnot incorporate a decoder for segmenting the lungs. It also does not combine\ndifferent backbones nor combine RNN with a backbone, as other pipelines in the\npast did. Nevertheless, our pipeline still outperforms all approaches presented\nby other teams in last year's instance of the same challenge, at least based on\nthe validation subset of the competition dataset.\n","authors":["Ramy Farag","Parth Upadhyay","Yixiang Gao","Jacket Demby","Katherin Garces Montoya","Seyed Mohamad Ali Tousi","Gbenga Omotara","Guilherme DeSouza"],"pdf_url":"https://arxiv.org/pdf/2403.11505v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18985v1","updated":"2024-03-27T20:07:39Z","published":"2024-03-27T20:07:39Z","title":"Robustness and Visual Explanation for Black Box Image, Video, and ECG\n Signal Classification with Reinforcement Learning","summary":" We present a generic Reinforcement Learning (RL) framework optimized for\ncrafting adversarial attacks on different model types spanning from ECG signal\nanalysis (1D), image classification (2D), and video classification (3D). The\nframework focuses on identifying sensitive regions and inducing\nmisclassifications with minimal distortions and various distortion types. The\nnovel RL method outperforms state-of-the-art methods for all three\napplications, proving its efficiency. Our RL approach produces superior\nlocalization masks, enhancing interpretability for image classification and ECG\nanalysis models. 
For applications such as ECG analysis, our platform highlights\ncritical ECG segments for clinicians while ensuring resilience against\nprevalent distortions. This comprehensive tool aims to bolster both resilience\nwith adversarial training and transparency across varied applications and data\ntypes.\n","authors":["Soumyendu Sarkar","Ashwin Ramesh Babu","Sajad Mousavi","Vineet Gundecha","Avisek Naug","Sahand Ghorbanpour"],"pdf_url":"https://arxiv.org/pdf/2403.18985v1.pdf","comment":"AAAI Proceedings reference:\n https://ojs.aaai.org/index.php/AAAI/article/view/30579"},{"id":"http://arxiv.org/abs/1903.06811v3","updated":"2024-03-27T20:03:41Z","published":"2019-03-15T21:35:13Z","title":"Multi-camera calibration with pattern rigs, including for\n non-overlapping cameras: CALICO","summary":" This paper describes CALICO, a method for multi-camera calibration suitable\nfor challenging contexts: stationary and mobile multi-camera systems, cameras\nwithout overlapping fields of view, and non-synchronized cameras. Recent\napproaches are roughly divided into infrastructure- and pattern-based.\nInfrastructure-based approaches use the scene's features to calibrate, while\npattern-based approaches use calibration patterns. Infrastructure-based\napproaches are not suitable for stationary camera systems, and pattern-based\napproaches may constrain camera placement because shared fields of view or\nextremely large patterns are required.\n CALICO is a pattern-based approach, where the multi-calibration problem is\nformulated using rigidity constraints between patterns and cameras. We use a\n{\\it pattern rig}: several patterns rigidly attached to each other or some\nstructure. We express the calibration problem as that of algebraic and\nreprojection error minimization problems. Simulated and real experiments\ndemonstrate the method in a variety of settings. CALICO compared favorably to\nKalibr. Mean reconstruction accuracy error was $\\le 0.71$ mm for real camera\nrigs, and $\\le 1.11$ for simulated camera rigs. Code and data releases are\navailable at \\cite{tabb_amy_2019_3520866} and\n\\url{https://github.com/amy-tabb/calico}.\n","authors":["Amy Tabb","Henry Medeiros","Mitchell J. Feldmann","Thiago T. Santos"],"pdf_url":"https://arxiv.org/pdf/1903.06811v3.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2403.18978v1","updated":"2024-03-27T19:52:55Z","published":"2024-03-27T19:52:55Z","title":"TextCraftor: Your Text Encoder Can be Image Quality Controller","summary":" Diffusion-based text-to-image generative models, e.g., Stable Diffusion, have\nrevolutionized the field of content generation, enabling significant\nadvancements in areas like image editing and video synthesis. Despite their\nformidable capabilities, these models are not without their limitations. It is\nstill challenging to synthesize an image that aligns well with the input text,\nand multiple runs with carefully crafted prompts are required to achieve\nsatisfactory results. To mitigate these limitations, numerous studies have\nendeavored to fine-tune the pre-trained diffusion models, i.e., UNet, utilizing\nvarious technologies. Yet, amidst these efforts, a pivotal question of\ntext-to-image diffusion model training has remained largely unexplored: Is it\npossible and feasible to fine-tune the text encoder to improve the performance\nof text-to-image diffusion models? 
Our findings reveal that, instead of\nreplacing the CLIP text encoder used in Stable Diffusion with other large\nlanguage models, we can enhance it through our proposed fine-tuning approach,\nTextCraftor, leading to substantial improvements in quantitative benchmarks and\nhuman assessments. Interestingly, our technique also empowers controllable\nimage generation through the interpolation of different text encoders\nfine-tuned with various rewards. We also demonstrate that TextCraftor is\northogonal to UNet finetuning, and can be combined to further improve\ngenerative quality.\n","authors":["Yanyu Li","Xian Liu","Anil Kag","Ju Hu","Yerlan Idelbayev","Dhritiman Sagar","Yanzhi Wang","Sergey Tulyakov","Jian Ren"],"pdf_url":"https://arxiv.org/pdf/2403.18978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05995v2","updated":"2024-03-27T18:21:12Z","published":"2023-12-10T20:57:31Z","title":"From Correspondences to Pose: Non-minimal Certifiably Optimal Relative\n Pose without Disambiguation","summary":" Estimating the relative camera pose from $n \\geq 5$ correspondences between\ntwo calibrated views is a fundamental task in computer vision. This process\ntypically involves two stages: 1) estimating the essential matrix between the\nviews, and 2) disambiguating among the four candidate relative poses that\nsatisfy the epipolar geometry. In this paper, we demonstrate a novel approach\nthat, for the first time, bypasses the second stage. Specifically, we show that\nit is possible to directly estimate the correct relative camera pose from\ncorrespondences without needing a post-processing step to enforce the\ncheirality constraint on the correspondences. Building on recent advances in\ncertifiable non-minimal optimization, we frame the relative pose estimation as\na Quadratically Constrained Quadratic Program (QCQP). By applying the\nappropriate constraints, we ensure the estimation of a camera pose that\ncorresponds to a valid 3D geometry and that is globally optimal when certified.\nWe validate our method through exhaustive synthetic and real-world experiments,\nconfirming the efficacy, efficiency and accuracy of the proposed approach. Code\nis available at https://github.com/javrtg/C2P.\n","authors":["Javier Tirado-Garín","Javier Civera"],"pdf_url":"https://arxiv.org/pdf/2312.05995v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2307.04132v3","updated":"2024-03-27T18:17:46Z","published":"2023-07-09T09:04:26Z","title":"Reasoning over the Behaviour of Objects in Video-Clips for Adverb-Type\n Recognition","summary":" In this work, following the intuition that adverbs describing scene-sequences\nare best identified by reasoning over high-level concepts of object-behavior,\nwe propose the design of a new framework that reasons over object-behaviours\nextracted from raw-video-clips to recognize the clip's corresponding\nadverb-types. Importantly, while previous works for general scene\nadverb-recognition assume knowledge of the clips underlying action-types, our\nmethod is directly applicable in the more general problem setting where the\naction-type of a video-clip is unknown. Specifically, we propose a novel\npipeline that extracts human-interpretable object-behaviour-facts from raw\nvideo clips and propose novel symbolic and transformer based reasoning methods\nthat operate over these extracted facts to identify adverb-types. Experiment\nresults demonstrate that our proposed methods perform favourably against the\nprevious state-of-the-art. 
Additionally, to support efforts in symbolic\nvideo-processing, we release two new datasets of object-behaviour-facts\nextracted from raw video clips - the MSR-VTT-ASP and ActivityNet-ASP datasets.\n","authors":["Amrit Diggavi Seshadri","Alessandra Russo"],"pdf_url":"https://arxiv.org/pdf/2307.04132v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18922v1","updated":"2024-03-27T18:13:16Z","published":"2024-03-27T18:13:16Z","title":"Lift3D: Zero-Shot Lifting of Any 2D Vision Model to 3D","summary":" In recent years, there has been an explosion of 2D vision models for numerous\ntasks such as semantic segmentation, style transfer or scene editing, enabled\nby large-scale 2D image datasets. At the same time, there has been renewed\ninterest in 3D scene representations such as neural radiance fields from\nmulti-view images. However, the availability of 3D or multiview data is still\nsubstantially limited compared to 2D image datasets, making extending 2D vision\nmodels to 3D data highly desirable but also very challenging. Indeed, extending\na single 2D vision operator like scene editing to 3D typically requires a\nhighly creative method specialized to that task and often requires per-scene\noptimization. In this paper, we ask the question of whether any 2D vision model\ncan be lifted to make 3D consistent predictions. We answer this question in the\naffirmative; our new Lift3D method trains to predict unseen views on feature\nspaces generated by a few visual models (i.e. DINO and CLIP), but then\ngeneralizes to novel vision operators and tasks, such as style transfer,\nsuper-resolution, open vocabulary segmentation and image colorization; for some\nof these tasks, there is no comparable previous 3D method. In many cases, we\neven outperform state-of-the-art methods specialized for the task in question.\nMoreover, Lift3D is a zero-shot method, in the sense that it requires no\ntask-specific training, nor scene-specific optimization.\n","authors":["Mukund Varma T","Peihao Wang","Zhiwen Fan","Zhangyang Wang","Hao Su","Ravi Ramamoorthi"],"pdf_url":"https://arxiv.org/pdf/2403.18922v1.pdf","comment":"Computer Vision and Pattern Recognition Conference (CVPR), 2024"},{"id":"http://arxiv.org/abs/2403.18921v1","updated":"2024-03-27T18:12:24Z","published":"2024-03-27T18:12:24Z","title":"SMOF: Streaming Modern CNNs on FPGAs with Smart Off-Chip Eviction","summary":" Convolutional Neural Networks (CNNs) have demonstrated their effectiveness in\nnumerous vision tasks. However, their high processing requirements necessitate\nefficient hardware acceleration to meet the application's performance targets.\nIn the space of FPGAs, streaming-based dataflow architectures are often adopted\nby users, as significant performance gains can be achieved through layer-wise\npipelining and reduced off-chip memory access by retaining data on-chip.\nHowever, modern topologies, such as the UNet, YOLO, and X3D models, utilise\nlong skip connections, requiring significant on-chip storage and thus limiting\nthe performance achieved by such system architectures. The paper addresses the\nabove limitation by introducing weight and activation eviction mechanisms to\noff-chip memory along the computational pipeline, taking into account the\navailable compute and memory resources. The proposed mechanism is incorporated\ninto an existing toolflow, expanding the design space by utilising off-chip\nmemory as a buffer. 
This enables the mapping of such modern CNNs to devices\nwith limited on-chip memory, under the streaming architecture design approach.\nSMOF has demonstrated the capacity to deliver competitive and, in some cases,\nstate-of-the-art performance across a spectrum of computer vision tasks,\nachieving up to 10.65 X throughput improvement compared to previous works.\n","authors":["Petros Toupas","Zhewen Yu","Christos-Savvas Bouganis","Dimitrios Tzovaras"],"pdf_url":"https://arxiv.org/pdf/2403.18921v1.pdf","comment":"12 pages, 8 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.18920v1","updated":"2024-03-27T18:09:55Z","published":"2024-03-27T18:09:55Z","title":"CPR: Retrieval Augmented Generation for Copyright Protection","summary":" Retrieval Augmented Generation (RAG) is emerging as a flexible and robust\ntechnique to adapt models to private user data without training, to handle\ncredit attribution, and to allow efficient machine unlearning at scale.\nHowever, RAG techniques for image generation may lead to parts of the retrieved\nsamples being copied in the model's output. To reduce risks of leaking private\ninformation contained in the retrieved set, we introduce Copy-Protected\ngeneration with Retrieval (CPR), a new method for RAG with strong copyright\nprotection guarantees in a mixed-private setting for diffusion models. CPR\nallows conditioning the output of diffusion models on a set of retrieved\nimages, while also guaranteeing that uniquely identifiable information about\nthose examples is not exposed in the generated outputs. In particular, it does\nso by sampling from a mixture of a public (safe) distribution and a private (user)\ndistribution by merging their diffusion scores at inference. We prove that CPR\nsatisfies Near Access Freeness (NAF), which bounds the amount of information an\nattacker may be able to extract from the generated images. We provide two\nalgorithms for copyright protection, CPR-KL and CPR-Choose. Unlike previously\nproposed rejection-sampling-based NAF methods, our methods enable efficient\ncopyright-protected sampling with a single run of backward diffusion. We show\nthat our method can be applied to any pre-trained conditional diffusion model,\nsuch as Stable Diffusion or unCLIP. In particular, we empirically show that\napplying CPR on top of unCLIP improves quality and text-to-image alignment of\nthe generated results (81.4 to 83.17 on the TIFA benchmark), while enabling credit\nattribution, copyright protection, and deterministic, constant-time\nunlearning.\n","authors":["Aditya Golatkar","Alessandro Achille","Luca Zancato","Yu-Xiang Wang","Ashwin Swaminathan","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2403.18920v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18915v1","updated":"2024-03-27T18:08:14Z","published":"2024-03-27T18:08:14Z","title":"PLOT-TAL -- Prompt Learning with Optimal Transport for Few-Shot Temporal\n Action Localization","summary":" This paper introduces a novel approach to temporal action localization (TAL)\nin few-shot learning. 
Our work addresses the inherent limitations of\nconventional single-prompt learning methods that often lead to overfitting due\nto the inability to generalize across varying contexts in real-world videos.\nRecognizing the diversity of camera views, backgrounds, and objects in videos,\nwe propose a multi-prompt learning framework enhanced with optimal transport.\nThis design allows the model to learn a set of diverse prompts for each action,\ncapturing general characteristics more effectively and distributing the\nrepresentation to mitigate the risk of overfitting. Furthermore, by employing\noptimal transport theory, we efficiently align these prompts with action\nfeatures, optimizing for a comprehensive representation that adapts to the\nmultifaceted nature of video data. Our experiments demonstrate significant\nimprovements in action localization accuracy and robustness in few-shot\nsettings on the standard challenging datasets of THUMOS-14 and EpicKitchens100,\nhighlighting the efficacy of our multi-prompt optimal transport approach in\novercoming the challenges of conventional few-shot TAL methods.\n","authors":["Edward Fish","Jon Weinbren","Andrew Gilbert"],"pdf_url":"https://arxiv.org/pdf/2403.18915v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2403.18913v1","updated":"2024-03-27T18:06:31Z","published":"2024-03-27T18:06:31Z","title":"UniDepth: Universal Monocular Metric Depth Estimation","summary":" Accurate monocular metric depth estimation (MMDE) is crucial to solving\ndownstream tasks in 3D perception and modeling. However, the remarkable\naccuracy of recent MMDE methods is confined to their training domains. These\nmethods fail to generalize to unseen domains even in the presence of moderate\ndomain gaps, which hinders their practical applicability. We propose a new\nmodel, UniDepth, capable of reconstructing metric 3D scenes from solely single\nimages across domains. Departing from the existing MMDE methods, UniDepth\ndirectly predicts metric 3D points from the input image at inference time\nwithout any additional information, striving for a universal and flexible MMDE\nsolution. In particular, UniDepth implements a self-promptable camera module\npredicting dense camera representation to condition depth features. Our model\nexploits a pseudo-spherical output representation, which disentangles camera\nand depth representations. In addition, we propose a geometric invariance loss\nthat promotes the invariance of camera-prompted depth features. Thorough\nevaluations on ten datasets in a zero-shot regime consistently demonstrate the\nsuperior performance of UniDepth, even when compared with methods directly\ntrained on the testing domains. Code and models are available at:\nhttps://github.com/lpiccinelli-eth/unidepth\n","authors":["Luigi Piccinelli","Yung-Hsu Yang","Christos Sakaridis","Mattia Segu","Siyuan Li","Luc Van Gool","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2403.18913v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18910v1","updated":"2024-03-27T18:02:49Z","published":"2024-03-27T18:02:49Z","title":"A Geometric Explanation of the Likelihood OOD Detection Paradox","summary":" Likelihood-based deep generative models (DGMs) commonly exhibit a puzzling\nbehaviour: when trained on a relatively complex dataset, they assign higher\nlikelihood values to out-of-distribution (OOD) data from simpler sources.\nAdding to the mystery, OOD samples are never generated by these DGMs despite\nhaving higher likelihoods. 
This two-pronged paradox has yet to be conclusively\nexplained, making likelihood-based OOD detection unreliable. Our primary\nobservation is that high-likelihood regions will not be generated if they\ncontain minimal probability mass. We demonstrate how this seeming contradiction\nof large densities yet low probability mass can occur around data confined to\nlow-dimensional manifolds. We also show that this scenario can be identified\nthrough local intrinsic dimension (LID) estimation, and propose a method for\nOOD detection which pairs the likelihoods and LID estimates obtained from a\npre-trained DGM. Our method can be applied to normalizing flows and score-based\ndiffusion models, and obtains results which match or surpass state-of-the-art\nOOD detection benchmarks using the same DGM backbones. Our code is available at\nhttps://github.com/layer6ai-labs/dgm_ood_detection.\n","authors":["Hamidreza Kamkari","Brendan Leigh Ross","Jesse C. Cresswell","Anthony L. Caterini","Rahul G. Krishnan","Gabriel Loaiza-Ganem"],"pdf_url":"https://arxiv.org/pdf/2403.18910v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18908v1","updated":"2024-03-27T18:02:23Z","published":"2024-03-27T18:02:23Z","title":"Enhancing Multiple Object Tracking Accuracy via Quantum Annealing","summary":" Multiple object tracking (MOT), a key task in image recognition, presents a\npersistent challenge in balancing processing speed and tracking accuracy. This\nstudy introduces a novel approach that leverages quantum annealing (QA) to\nexpedite computation speed, while enhancing tracking accuracy through the\nensembling of object tracking processes. A method to improve the matching\nintegration process is also proposed. By utilizing the sequential nature of\nMOT, this study further augments the tracking method via reverse annealing\n(RA). Experimental validation confirms the maintenance of high accuracy with an\nannealing time of a mere 3 $\\mu$s per tracking process. The proposed method\nholds significant potential for real-time MOT applications, including traffic\nflow measurement for urban traffic light control, collision prediction for\nautonomous robots and vehicles, and management of products mass-produced in\nfactories.\n","authors":["Yasuyuki Ihara"],"pdf_url":"https://arxiv.org/pdf/2403.18908v1.pdf","comment":"19pages, 15 figures"},{"id":"http://arxiv.org/abs/2403.18886v1","updated":"2024-03-27T17:59:21Z","published":"2024-03-27T17:59:21Z","title":"Self-Expansion of Pre-trained Models with Mixture of Adapters for\n Continual Learning","summary":" Continual learning aims to learn from a stream of continuously arriving data\nwith minimum forgetting of previously learned knowledge. While previous works\nhave explored the effectiveness of leveraging the generalizable knowledge from\npre-trained models in continual learning, existing parameter-efficient\nfine-tuning approaches focus on the use of a predetermined or task-wise set of\nadapters or prompts. However, these approaches still suffer from forgetting due\nto task interference on jointly used parameters or restricted flexibility. The\nreliance on a static model architecture may lead to the allocation of excessive\nparameters that are not essential or, conversely, inadequate adaptation for\ndownstream tasks, given that the scale and distribution of incoming data are\nunpredictable in continual learning. 
We propose Self-Expansion of pre-trained\nmodels with Modularized Adaptation (SEMA), a novel fine-tuning approach which\nautomatically decides to reuse or add adapter modules on demand in continual\nlearning, depending on whether drastic distribution shift that could not be\nhandled by existing modules is detected at different representation levels. We\ndesign each adapter module to consist of an adapter and a representation\ndescriptor, specifically, implemented as an autoencoder. The representation\ndescriptor functions as a distributional shift indicator during training and\ntriggers adapter expansion. For better usage of the adapters, an expandable\nweighting router is learned jointly for mixture of adapter outputs. By\ncomparing with vision-transformer-based continual learning adaptation methods,\nwe demonstrate that the proposed framework outperforms the state-of-the-art\nwithout memory rehearsal.\n","authors":["Huiyi Wang","Haodong Lu","Lina Yao","Dong Gong"],"pdf_url":"https://arxiv.org/pdf/2403.18886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18878v1","updated":"2024-03-27T10:46:24Z","published":"2024-03-27T10:46:24Z","title":"AIC-UNet: Anatomy-informed Cascaded UNet for Robust Multi-Organ\n Segmentation","summary":" Imposing key anatomical features, such as the number of organs, their shapes,\nsizes, and relative positions, is crucial for building a robust multi-organ\nsegmentation model. Current attempts to incorporate anatomical features include\nbroadening effective receptive fields (ERF) size with resource- and\ndata-intensive modules such as self-attention or introducing organ-specific\ntopology regularizers, which may not scale to multi-organ segmentation problems\nwhere inter-organ relation also plays a huge role. We introduce a new approach\nto impose anatomical constraints on any existing encoder-decoder segmentation\nmodel by conditioning model prediction with learnable anatomy prior. More\nspecifically, given an abdominal scan, a part of the encoder spatially warps a\nlearnable prior to align with the given input scan using thin plate spline\n(TPS) grid interpolation. The warped prior is then integrated during the\ndecoding phase to guide the model for more anatomy-informed predictions. Code\nis available at\n\\hyperlink{https://anonymous.4open.science/r/AIC-UNet-7048}{https://anonymous.4open.science/r/AIC-UNet-7048}.\n","authors":["Young Seok Jeon","Hongfei Yang","Huazhu Fu","Mengling Feng"],"pdf_url":"https://arxiv.org/pdf/2403.18878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16456v2","updated":"2024-03-27T04:14:59Z","published":"2024-01-29T09:12:23Z","title":"SHViT: Single-Head Vision Transformer with Memory Efficient Macro Design","summary":" Recently, efficient Vision Transformers have shown great performance with low\nlatency on resource-constrained devices. Conventionally, they use 4x4 patch\nembeddings and a 4-stage structure at the macro level, while utilizing\nsophisticated attention with multi-head configuration at the micro level. This\npaper aims to address computational redundancy at all design levels in a\nmemory-efficient manner. We discover that using larger-stride patchify stem not\nonly reduces memory access costs but also achieves competitive performance by\nleveraging token representations with reduced spatial redundancy from the early\nstages. 
Furthermore, our preliminary analyses suggest that attention layers in\nthe early stages can be substituted with convolutions, and several attention\nheads in the latter stages are computationally redundant. To handle this, we\nintroduce a single-head attention module that inherently prevents head\nredundancy and simultaneously boosts accuracy by parallelly combining global\nand local information. Building upon our solutions, we introduce SHViT, a\nSingle-Head Vision Transformer that obtains the state-of-the-art speed-accuracy\ntradeoff. For example, on ImageNet-1k, our SHViT-S4 is 3.3x, 8.1x, and 2.4x\nfaster than MobileViTv2 x1.0 on GPU, CPU, and iPhone12 mobile device,\nrespectively, while being 1.3% more accurate. For object detection and instance\nsegmentation on MS COCO using Mask-RCNN head, our model achieves performance\ncomparable to FastViT-SA12 while exhibiting 3.8x and 2.0x lower backbone\nlatency on GPU and mobile device, respectively.\n","authors":["Seokju Yun","Youngmin Ro"],"pdf_url":"https://arxiv.org/pdf/2401.16456v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19716v1","updated":"2024-03-27T17:41:16Z","published":"2024-03-27T17:41:16Z","title":"Capability-aware Prompt Reformulation Learning for Text-to-Image\n Generation","summary":" Text-to-image generation systems have emerged as revolutionary tools in the\nrealm of artistic creation, offering unprecedented ease in transforming textual\nprompts into visual art. However, the efficacy of these systems is intricately\nlinked to the quality of user-provided prompts, which often poses a challenge\nto users unfamiliar with prompt crafting. This paper addresses this challenge\nby leveraging user reformulation data from interaction logs to develop an\nautomatic prompt reformulation model. Our in-depth analysis of these logs\nreveals that user prompt reformulation is heavily dependent on the individual\nuser's capability, resulting in significant variance in the quality of\nreformulation pairs. To effectively use this data for training, we introduce\nthe Capability-aware Prompt Reformulation (CAPR) framework. CAPR innovatively\nintegrates user capability into the reformulation process through two key\ncomponents: the Conditional Reformulation Model (CRM) and Configurable\nCapability Features (CCF). CRM reformulates prompts according to a specified\nuser capability, as represented by CCF. The CCF, in turn, offers the\nflexibility to tune and guide the CRM's behavior. This enables CAPR to\neffectively learn diverse reformulation strategies across various user\ncapacities and to simulate high-capability user reformulation during inference.\nExtensive experiments on standard text-to-image generation benchmarks showcase\nCAPR's superior performance over existing baselines and its remarkable\nrobustness on unseen systems. Furthermore, comprehensive analyses validate the\neffectiveness of different components. 
CAPR can facilitate user-friendly\ninteraction with text-to-image systems and make advanced artistic creation more\nachievable for a broader range of users.\n","authors":["Jingtao Zhan","Qingyao Ai","Yiqun Liu","Jia Chen","Shaoping Ma"],"pdf_url":"https://arxiv.org/pdf/2403.19716v1.pdf","comment":"Accepted at SIGIR 2024"}]},"2024-03-28T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.19655v1","updated":"2024-03-28T17:59:50Z","published":"2024-03-28T17:59:50Z","title":"GaussianCube: Structuring Gaussian Splatting using Optimal Transport for\n 3D Generative Modeling","summary":" 3D Gaussian Splatting (GS) have achieved considerable improvement over Neural\nRadiance Fields in terms of 3D fitting fidelity and rendering speed. However,\nthis unstructured representation with scattered Gaussians poses a significant\nchallenge for generative modeling. To address the problem, we introduce\nGaussianCube, a structured GS representation that is both powerful and\nefficient for generative modeling. We achieve this by first proposing a\nmodified densification-constrained GS fitting algorithm which can yield\nhigh-quality fitting results using a fixed number of free Gaussians, and then\nre-arranging the Gaussians into a predefined voxel grid via Optimal Transport.\nThe structured grid representation allows us to use standard 3D U-Net as our\nbackbone in diffusion generative modeling without elaborate designs. Extensive\nexperiments conducted on ShapeNet and OmniObject3D show that our model achieves\nstate-of-the-art generation results both qualitatively and quantitatively,\nunderscoring the potential of GaussianCube as a powerful and versatile 3D\nrepresentation.\n","authors":["Bowen Zhang","Yiji Cheng","Jiaolong Yang","Chunyu Wang","Feng Zhao","Yansong Tang","Dong Chen","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2403.19655v1.pdf","comment":"Project Page: https://gaussiancube.github.io/"},{"id":"http://arxiv.org/abs/2403.19654v1","updated":"2024-03-28T17:59:49Z","published":"2024-03-28T17:59:49Z","title":"RSMamba: Remote Sensing Image Classification with State Space Model","summary":" Remote sensing image classification forms the foundation of various\nunderstanding tasks, serving a crucial function in remote sensing image\ninterpretation. The recent advancements of Convolutional Neural Networks (CNNs)\nand Transformers have markedly enhanced classification accuracy. Nonetheless,\nremote sensing scene classification remains a significant challenge, especially\ngiven the complexity and diversity of remote sensing scenarios and the\nvariability of spatiotemporal resolutions. The capacity for whole-image\nunderstanding can provide more precise semantic cues for scene discrimination.\nIn this paper, we introduce RSMamba, a novel architecture for remote sensing\nimage classification. RSMamba is based on the State Space Model (SSM) and\nincorporates an efficient, hardware-aware design known as the Mamba. It\nintegrates the advantages of both a global receptive field and linear modeling\ncomplexity. To overcome the limitation of the vanilla Mamba, which can only\nmodel causal sequences and is not adaptable to two-dimensional image data, we\npropose a dynamic multi-path activation mechanism to augment Mamba's capacity\nto model non-causal data. Notably, RSMamba maintains the inherent modeling\nmechanism of the vanilla Mamba, yet exhibits superior performance across\nmultiple remote sensing image classification datasets. 
This indicates that\nRSMamba holds significant potential to function as the backbone of future\nvisual foundation models. The code will be available at\n\\url{https://github.com/KyanChen/RSMamba}.\n","authors":["Keyan Chen","Bowen Chen","Chenyang Liu","Wenyuan Li","Zhengxia Zou","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2403.19654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19653v1","updated":"2024-03-28T17:59:42Z","published":"2024-03-28T17:59:42Z","title":"Detecting Image Attribution for Text-to-Image Diffusion Models in RGB\n and Beyond","summary":" Modern text-to-image (T2I) diffusion models can generate images with\nremarkable realism and creativity. These advancements have sparked research in\nfake image detection and attribution, yet prior studies have not fully explored\nthe practical and scientific dimensions of this task. In addition to\nattributing images to 12 state-of-the-art T2I generators, we provide extensive\nanalyses on what inference stage hyperparameters and image modifications are\ndiscernible. Our experiments reveal that initialization seeds are highly\ndetectable, along with other subtle variations in the image generation process\nto some extent. We further investigate what visual traces are leveraged in\nimage attribution by perturbing high-frequency details and employing mid-level\nrepresentations of image style and structure. Notably, altering high-frequency\ninformation causes only slight reductions in accuracy, and training an\nattributor on style representations outperforms training on RGB images. Our\nanalyses underscore that fake images are detectable and attributable at various\nlevels of visual granularity than previously explored.\n","authors":["Katherine Xu","Lingzhi Zhang","Jianbo Shi"],"pdf_url":"https://arxiv.org/pdf/2403.19653v1.pdf","comment":"Code available at https://github.com/k8xu/ImageAttribution"},{"id":"http://arxiv.org/abs/2403.19652v1","updated":"2024-03-28T17:59:30Z","published":"2024-03-28T17:59:30Z","title":"InterDreamer: Zero-Shot Text to 3D Dynamic Human-Object Interaction","summary":" Text-conditioned human motion generation has experienced significant\nadvancements with diffusion models trained on extensive motion capture data and\ncorresponding textual annotations. However, extending such success to 3D\ndynamic human-object interaction (HOI) generation faces notable challenges,\nprimarily due to the lack of large-scale interaction data and comprehensive\ndescriptions that align with these interactions. This paper takes the\ninitiative and showcases the potential of generating human-object interactions\nwithout direct training on text-interaction pair data. Our key insight in\nachieving this is that interaction semantics and dynamics can be decoupled.\nBeing unable to learn interaction semantics through supervised training, we\ninstead leverage pre-trained large models, synergizing knowledge from a large\nlanguage model and a text-to-motion model. While such knowledge offers\nhigh-level control over interaction semantics, it cannot grasp the intricacies\nof low-level interaction dynamics. To overcome this issue, we further introduce\na world model designed to comprehend simple physics, modeling how human actions\ninfluence object motion. By integrating these components, our novel framework,\nInterDreamer, is able to generate text-aligned 3D HOI sequences in a zero-shot\nmanner. 
We apply InterDreamer to the BEHAVE and CHAIRS datasets, and our\ncomprehensive experimental analysis demonstrates its capability to generate\nrealistic and coherent interaction sequences that seamlessly align with the\ntext directives.\n","authors":["Sirui Xu","Ziyin Wang","Yu-Xiong Wang","Liang-Yan Gui"],"pdf_url":"https://arxiv.org/pdf/2403.19652v1.pdf","comment":"Project Page: https://sirui-xu.github.io/InterDreamer/"},{"id":"http://arxiv.org/abs/2403.19651v1","updated":"2024-03-28T17:59:20Z","published":"2024-03-28T17:59:20Z","title":"MagicLens: Self-Supervised Image Retrieval with Open-Ended Instructions","summary":" Image retrieval, i.e., finding desired images given a reference image,\ninherently encompasses rich, multi-faceted search intents that are difficult to\ncapture solely using image-based measures. Recent work leverages text\ninstructions to allow users to more freely express their search intents.\nHowever, existing work primarily focuses on image pairs that are visually\nsimilar and/or can be characterized by a small set of pre-defined relations.\nThe core thesis of this paper is that text instructions can enable retrieving\nimages with richer relations beyond visual similarity. To show this, we\nintroduce MagicLens, a series of self-supervised image retrieval models that\nsupport open-ended instructions. MagicLens is built on a key novel insight:\nimage pairs that naturally occur on the same web pages contain a wide range of\nimplicit relations (e.g., inside view of), and we can bring those implicit\nrelations explicit by synthesizing instructions via large multimodal models\n(LMMs) and large language models (LLMs). Trained on 36.7M (query image,\ninstruction, target image) triplets with rich semantic relations mined from the\nweb, MagicLens achieves comparable or better results on eight benchmarks of\nvarious image retrieval tasks than prior state-of-the-art (SOTA) methods.\nRemarkably, it outperforms previous SOTA but with a 50X smaller model size on\nmultiple benchmarks. Additional human analyses on a 1.4M-image unseen corpus\nfurther demonstrate the diversity of search intents supported by MagicLens.\n","authors":["Kai Zhang","Yi Luan","Hexiang Hu","Kenton Lee","Siyuan Qiao","Wenhu Chen","Yu Su","Ming-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2403.19651v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2311.14097v3","updated":"2024-03-28T17:59:06Z","published":"2023-11-23T16:49:06Z","title":"ACT-Diffusion: Efficient Adversarial Consistency Training for One-step\n Diffusion Models","summary":" Though diffusion models excel in image generation, their step-by-step\ndenoising leads to slow generation speeds. Consistency training addresses this\nissue with single-step sampling but often produces lower-quality generations\nand requires high training costs. In this paper, we show that optimizing\nconsistency training loss minimizes the Wasserstein distance between target and\ngenerated distributions. As timestep increases, the upper bound accumulates\nprevious consistency training losses. Therefore, larger batch sizes are needed\nto reduce both current and accumulated losses. We propose Adversarial\nConsistency Training (ACT), which directly minimizes the Jensen-Shannon (JS)\ndivergence between distributions at each timestep using a discriminator.\nTheoretically, ACT enhances generation quality, and convergence. 
By\nincorporating a discriminator into the consistency training framework, our\nmethod achieves improved FID scores on CIFAR10 and ImageNet 64$\\times$64 and\nLSUN Cat 256$\\times$256 datasets, retains zero-shot image inpainting\ncapabilities, and uses less than $1/6$ of the original batch size and fewer\nthan $1/2$ of the model parameters and training steps compared to the baseline\nmethod, this leads to a substantial reduction in resource consumption. Our code\nis available:https://github.com/kong13661/ACT\n","authors":["Fei Kong","Jinhao Duan","Lichao Sun","Hao Cheng","Renjing Xu","Hengtao Shen","Xiaofeng Zhu","Xiaoshuang Shi","Kaidi Xu"],"pdf_url":"https://arxiv.org/pdf/2311.14097v3.pdf","comment":"To appear in CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19649v1","updated":"2024-03-28T17:57:27Z","published":"2024-03-28T17:57:27Z","title":"GraspXL: Generating Grasping Motions for Diverse Objects at Scale","summary":" Human hands possess the dexterity to interact with diverse objects such as\ngrasping specific parts of the objects and/or approaching them from desired\ndirections. More importantly, humans can grasp objects of any shape without\nobject-specific skills. Recent works synthesize grasping motions following\nsingle objectives such as a desired approach heading direction or a grasping\narea. Moreover, they usually rely on expensive 3D hand-object data during\ntraining and inference, which limits their capability to synthesize grasping\nmotions for unseen objects at scale. In this paper, we unify the generation of\nhand-object grasping motions across multiple motion objectives, diverse object\nshapes and dexterous hand morphologies in a policy learning framework GraspXL.\nThe objectives are composed of the graspable area, heading direction during\napproach, wrist rotation, and hand position. Without requiring any 3D\nhand-object interaction data, our policy trained with 58 objects can robustly\nsynthesize diverse grasping motions for more than 500k unseen objects with a\nsuccess rate of 82.2%. At the same time, the policy adheres to objectives,\nwhich enables the generation of diverse grasps per object. Moreover, we show\nthat our framework can be deployed to different dexterous hands and work with\nreconstructed or generated objects. We quantitatively and qualitatively\nevaluate our method to show the efficacy of our approach. Our model and code\nwill be available.\n","authors":["Hui Zhang","Sammy Christen","Zicong Fan","Otmar Hilliges","Jie Song"],"pdf_url":"https://arxiv.org/pdf/2403.19649v1.pdf","comment":"Project Page: https://eth-ait.github.io/graspxl/"},{"id":"http://arxiv.org/abs/2403.19646v1","updated":"2024-03-28T17:55:42Z","published":"2024-03-28T17:55:42Z","title":"Change-Agent: Towards Interactive Comprehensive Change Interpretation\n and Analysis from Change Detection and Change Captioning","summary":" Monitoring changes in the Earth's surface is crucial for understanding\nnatural processes and human impacts, necessitating precise and comprehensive\ninterpretation methodologies. Remote sensing satellite imagery offers a unique\nperspective for monitoring these changes, leading to the emergence of remote\nsensing image change interpretation (RSICI) as a significant research focus.\nCurrent RSICI technology encompasses change detection and change captioning,\neach with its limitations in providing comprehensive interpretation. 
To address\nthis, we propose an interactive Change-Agent, which integrates a multi-level\nchange interpretation (MCI) model as eyes and a large language model (LLM) as\nthe brain. Our Change-Agent can follow user instructions to achieve\ncomprehensive change interpretation and insightful analysis, such as change detection and change captioning, change object\ncounting, change cause analysis, etc. Our proposed MCI model contains two\nbranches of pixel-level change detection and semantic-level change captioning,\nin which multiple BI-temporal Iterative Interaction (BI3) layers utilize Local\nPerception Enhancement (LPE) and the Global Difference Fusion Attention (GDFA)\nmodules to enhance the model's discriminative feature representation\ncapabilities. To train the MCI model, we build the LEVIR-MCI dataset with\nchange masks and captions of bi-temporal images. Extensive experiments\ndemonstrate the effectiveness of the proposed change interpretation model and\nhighlight the promising potential of our Change-Agent in facilitating\ncomprehensive and intelligent interpretation of surface changes. We will make\nour dataset and codebase of the change interpretation model and Change-Agent\npublicly available to facilitate future research at\nhttps://github.com/Chen-Yang-Liu/Change-Agent\n","authors":["Chenyang Liu","Keyan Chen","Haotian Zhang","Zipeng Qi","Zhengxia Zou","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2403.19646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01362v3","updated":"2024-03-28T17:55:39Z","published":"2023-07-03T21:33:40Z","title":"Direct Superpoints Matching for Robust Point Cloud Registration","summary":" Deep neural networks endow the downsampled superpoints with highly\ndiscriminative feature representations. Previous dominant point cloud\nregistration approaches match these feature representations as the first step,\ne.g., using the Sinkhorn algorithm. A RANSAC-like method is then usually\nadopted as a post-processing refinement to filter the outliers. The other dominant\nmethod is to directly predict the superpoint matchings using learned MLP\nlayers. Both of them have drawbacks: RANSAC-based methods are computationally\nintensive and prediction-based methods suffer from outputting non-existent\npoints in the point cloud. In this paper, we propose a straightforward and\neffective baseline to find correspondences of superpoints in a global matching\nmanner. We employ the normalized matching scores as weights for each\ncorrespondence, allowing us to reject the outliers and further weigh the remaining\ninliers when fitting the transformation matrix without relying on the\ncumbersome RANSAC. Moreover, the entire model can be trained in an end-to-end\nfashion, leading to better accuracy. Our simple yet effective baseline shows\ncomparable or even better results than state-of-the-art methods on three\ndatasets including ModelNet, 3DMatch, and KITTI. We do not advocate our\napproach to be \\emph{the} solution for point cloud registration but use the\nresults to emphasize the role of the matching strategy for point cloud\nregistration. 
The code and models are available at\nhttps://github.com/neu-vi/Superpoints_Registration.\n","authors":["Aniket Gupta","Yiming Xie","Hanumant Singh","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.01362v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19645v1","updated":"2024-03-28T17:55:16Z","published":"2024-03-28T17:55:16Z","title":"GANTASTIC: GAN-based Transfer of Interpretable Directions for\n Disentangled Image Editing in Text-to-Image Diffusion Models","summary":" The rapid advancement in image generation models has predominantly been\ndriven by diffusion models, which have demonstrated unparalleled success in\ngenerating high-fidelity, diverse images from textual prompts. Despite their\nsuccess, diffusion models encounter substantial challenges in the domain of\nimage editing, particularly in executing disentangled edits-changes that target\nspecific attributes of an image while leaving irrelevant parts untouched. In\ncontrast, Generative Adversarial Networks (GANs) have been recognized for their\nsuccess in disentangled edits through their interpretable latent spaces. We\nintroduce GANTASTIC, a novel framework that takes existing directions from\npre-trained GAN models-representative of specific, controllable attributes-and\ntransfers these directions into diffusion-based models. This novel approach not\nonly maintains the generative quality and diversity that diffusion models are\nknown for but also significantly enhances their capability to perform precise,\ntargeted image edits, thereby leveraging the best of both worlds.\n","authors":["Yusuf Dalva","Hidir Yesiltepe","Pinar Yanardag"],"pdf_url":"https://arxiv.org/pdf/2403.19645v1.pdf","comment":"Project page: https://gantastic.github.io"},{"id":"http://arxiv.org/abs/2304.09704v2","updated":"2024-03-28T17:53:08Z","published":"2023-04-19T14:49:31Z","title":"Learnable Earth Parser: Discovering 3D Prototypes in Aerial Scans","summary":" We propose an unsupervised method for parsing large 3D scans of real-world\nscenes with easily-interpretable shapes. This work aims to provide a practical\ntool for analyzing 3D scenes in the context of aerial surveying and mapping,\nwithout the need for user annotations. Our approach is based on a probabilistic\nreconstruction model that decomposes an input 3D point cloud into a small set\nof learned prototypical 3D shapes. The resulting reconstruction is visually\ninterpretable and can be used to perform unsupervised instance and low-shot\nsemantic segmentation of complex scenes. We demonstrate the usefulness of our\nmodel on a novel dataset of seven large aerial LiDAR scans from diverse\nreal-world scenarios. Our approach outperforms state-of-the-art unsupervised\nmethods in terms of decomposition accuracy while remaining visually\ninterpretable. Our code and dataset are available at\nhttps://romainloiseau.fr/learnable-earth-parser/\n","authors":["Romain Loiseau","Elliot Vincent","Mathieu Aubry","Loic Landrieu"],"pdf_url":"https://arxiv.org/pdf/2304.09704v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19638v1","updated":"2024-03-28T17:52:24Z","published":"2024-03-28T17:52:24Z","title":"Siamese Vision Transformers are Scalable Audio-visual Learners","summary":" Traditional audio-visual methods rely on independent audio and visual\nbackbones, which is costly and not scalable. In this work, we investigate using\nan audio-visual siamese network (AVSiam) for efficient and scalable\naudio-visual pretraining. 
Our framework uses a single shared vision transformer\nbackbone to process audio and visual inputs, improving its parameter\nefficiency, reducing the GPU memory footprint, and allowing us to scale our\nmethod to larger datasets and model sizes. We pretrain our model using a\ncontrastive audio-visual matching objective with a multi-ratio random masking\nscheme, which enables our model to process larger audio-visual instance\nbatches, helpful for contrastive learning. Unlike prior audio-visual methods,\nour method can robustly handle audio, visual, and audio-visual inputs with a\nsingle shared ViT backbone. Furthermore, despite using the shared backbone for\nboth modalities, AVSiam achieves competitive or even better results than prior\nmethods on AudioSet and VGGSound for audio-visual classification and retrieval.\nOur code is available at https://github.com/GenjiB/AVSiam\n","authors":["Yan-Bo Lin","Gedas Bertasius"],"pdf_url":"https://arxiv.org/pdf/2403.19638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19632v1","updated":"2024-03-28T17:47:31Z","published":"2024-03-28T17:47:31Z","title":"GauStudio: A Modular Framework for 3D Gaussian Splatting and Beyond","summary":" We present GauStudio, a novel modular framework for modeling 3D Gaussian\nSplatting (3DGS) to provide standardized, plug-and-play components for users to\neasily customize and implement a 3DGS pipeline. Supported by our framework, we\npropose a hybrid Gaussian representation with foreground and skyball background\nmodels. Experiments demonstrate this representation reduces artifacts in\nunbounded outdoor scenes and improves novel view synthesis. Finally, we propose\nGaussian Splatting Surface Reconstruction (GauS), a novel render-then-fuse\napproach for high-fidelity mesh reconstruction from 3DGS inputs without\nfine-tuning. Overall, our GauStudio framework, hybrid representation, and GauS\napproach enhance 3DGS modeling and rendering capabilities, enabling\nhigher-quality novel view synthesis and surface reconstruction.\n","authors":["Chongjie Ye","Yinyu Nie","Jiahao Chang","Yuantao Chen","Yihao Zhi","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2403.19632v1.pdf","comment":"Code: https://github.com/GAP-LAB-CUHK-SZ/gaustudio"},{"id":"http://arxiv.org/abs/2403.19622v1","updated":"2024-03-28T17:42:54Z","published":"2024-03-28T17:42:54Z","title":"RH20T-P: A Primitive-Level Robotic Dataset Towards Composable\n Generalization Agents","summary":" The ultimate goals of robotic learning is to acquire a comprehensive and\ngeneralizable robotic system capable of performing both seen skills within the\ntraining distribution and unseen skills in novel environments. Recent progress\nin utilizing language models as high-level planners has demonstrated that the\ncomplexity of tasks can be reduced through decomposing them into\nprimitive-level plans, making it possible to generalize on novel robotic tasks\nin a composable manner. Despite the promising future, the community is not yet\nadequately prepared for composable generalization agents, particularly due to\nthe lack of primitive-level real-world robotic datasets. In this paper, we\npropose a primitive-level robotic dataset, namely RH20T-P, which contains about\n33000 video clips covering 44 diverse and complicated robotic tasks. Each clip\nis manually annotated according to a set of meticulously designed primitive\nskills, facilitating the future development of composable generalization\nagents. 
To validate the effectiveness of RH20T-P, we also construct a potential\nand scalable agent based on RH20T-P, called RA-P. Equipped with two planners\nspecialized in task decomposition and motion planning, RA-P can adapt to novel\nphysical skills through composable generalization. Our website and videos can\nbe found at https://sites.google.com/view/rh20t-primitive/main. Dataset and\ncode will be made available soon.\n","authors":["Zeren Chen","Zhelun Shi","Xiaoya Lu","Lehan He","Sucheng Qian","Hao Shu Fang","Zhenfei Yin","Wanli Ouyang","Jing Shao","Yu Qiao","Cewu Lu","Lu Sheng"],"pdf_url":"https://arxiv.org/pdf/2403.19622v1.pdf","comment":"24 pages, 12 figures, 6 tables"},{"id":"http://arxiv.org/abs/2403.19620v1","updated":"2024-03-28T17:40:15Z","published":"2024-03-28T17:40:15Z","title":"Collaborative Interactive Evolution of Art in the Latent Space of Deep\n Generative Models","summary":" Generative Adversarial Networks (GANs) have shown great success in generating\nhigh quality images and are thus used as one of the main approaches to generate\nart images. However, usually the image generation process involves sampling\nfrom the latent space of the learned art representations, allowing little\ncontrol over the output. In this work, we first employ GANs that are trained to\nproduce creative images using an architecture known as Creative Adversarial\nNetworks (CANs), then, we employ an evolutionary approach to navigate within\nthe latent space of the models to discover images. We use automatic aesthetic\nand collaborative interactive human evaluation metrics to assess the generated\nimages. In the human interactive evaluation case, we propose a collaborative\nevaluation based on the assessments of several participants. Furthermore, we\nalso experiment with an intelligent mutation operator that aims to improve the\nquality of the images through local search based on an aesthetic measure. We\nevaluate the effectiveness of this approach by comparing the results produced\nby the automatic and collaborative interactive evolution. The results show that\nthe proposed approach can generate highly attractive art images when the\nevolution is guided by collaborative human feedback.\n","authors":["Ole Hall","Anil Yaman"],"pdf_url":"https://arxiv.org/pdf/2403.19620v1.pdf","comment":"Preprint. The Version of Record of this contribution is to be\n published in the proceedings of the 13th International Conference on\n Artificial Intelligence in Music, Sound, Art and Design (EvoMUSART) 2024"},{"id":"http://arxiv.org/abs/2304.09224v2","updated":"2024-03-28T17:36:50Z","published":"2023-04-18T18:23:20Z","title":"Quantum machine learning for image classification","summary":" Image classification, a pivotal task in multiple industries, faces\ncomputational challenges due to the burgeoning volume of visual data. This\nresearch addresses these challenges by introducing two quantum machine learning\nmodels that leverage the principles of quantum mechanics for effective\ncomputations. Our first model, a hybrid quantum neural network with parallel\nquantum circuits, enables the execution of computations even in the noisy\nintermediate-scale quantum era, where circuits with a large number of qubits\nare currently infeasible. This model demonstrated a record-breaking\nclassification accuracy of 99.21% on the full MNIST dataset, surpassing the\nperformance of known quantum-classical models, while having eight times fewer\nparameters than its classical counterpart. 
Also, the results of testing this\nhybrid model on a Medical MNIST (classification accuracy over 99%), and on\nCIFAR-10 (classification accuracy over 82%), can serve as evidence of the\ngeneralizability of the model and highlights the efficiency of quantum layers\nin distinguishing common features of input data. Our second model introduces a\nhybrid quantum neural network with a Quanvolutional layer, reducing image\nresolution via a convolution process. The model matches the performance of its\nclassical counterpart, having four times fewer trainable parameters, and\noutperforms a classical model with equal weight parameters. These models\nrepresent advancements in quantum machine learning research and illuminate the\npath towards more accurate image classification systems.\n","authors":["Arsenii Senokosov","Alexandr Sedykh","Asel Sagingalieva","Basil Kyriacou","Alexey Melnikov"],"pdf_url":"https://arxiv.org/pdf/2304.09224v2.pdf","comment":"13 pages, 10 figures, 1 table"},{"id":"http://arxiv.org/abs/2312.07360v2","updated":"2024-03-28T17:35:29Z","published":"2023-12-12T15:30:24Z","title":"Boosting Latent Diffusion with Flow Matching","summary":" Recently, there has been tremendous progress in visual synthesis and the\nunderlying generative models. Here, diffusion models (DMs) stand out\nparticularly, but lately, flow matching (FM) has also garnered considerable\ninterest. While DMs excel in providing diverse images, they suffer from long\ntraining and slow generation. With latent diffusion, these issues are only\npartially alleviated. Conversely, FM offers faster training and inference but\nexhibits less diversity in synthesis. We demonstrate that introducing FM\nbetween the Diffusion model and the convolutional decoder offers\nhigh-resolution image synthesis with reduced computational cost and model size.\nDiffusion can then efficiently provide the necessary generation diversity. FM\ncompensates for the lower resolution, mapping the small latent space to a\nhigh-dimensional one. Subsequently, the convolutional decoder of the LDM maps\nthese latents to high-resolution images. By combining the diversity of DMs, the\nefficiency of FMs, and the effectiveness of convolutional decoders, we achieve\nstate-of-the-art high-resolution image synthesis at $1024^2$ with minimal\ncomputational cost. Importantly, our approach is orthogonal to recent\napproximation and speed-up strategies for the underlying DMs, making it easily\nintegrable into various DM frameworks.\n","authors":["Johannes S. Fischer","Ming Gui","Pingchuan Ma","Nick Stracke","Stefan A. Baumann","Björn Ommer"],"pdf_url":"https://arxiv.org/pdf/2312.07360v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19615v1","updated":"2024-03-28T17:32:58Z","published":"2024-03-28T17:32:58Z","title":"SA-GS: Scale-Adaptive Gaussian Splatting for Training-Free Anti-Aliasing","summary":" In this paper, we present a Scale-adaptive method for Anti-aliasing Gaussian\nSplatting (SA-GS). While the state-of-the-art method Mip-Splatting needs\nmodifying the training procedure of Gaussian splatting, our method functions at\ntest-time and is training-free. Specifically, SA-GS can be applied to any\npretrained Gaussian splatting field as a plugin to significantly improve the\nfield's anti-alising performance. The core technique is to apply 2D\nscale-adaptive filters to each Gaussian during test time. As pointed out by\nMip-Splatting, observing Gaussians at different frequencies leads to mismatches\nbetween the Gaussian scales during training and testing. 
Mip-Splatting resolves\nthis issue using 3D smoothing and 2D Mip filters, which are unfortunately not\naware of testing frequency. In this work, we show that a 2D scale-adaptive\nfilter that is informed of testing frequency can effectively match the Gaussian\nscale, thus making the Gaussian primitive distribution remain consistent across\ndifferent testing frequencies. When scale inconsistency is eliminated, sampling\nrates smaller than the scene frequency result in conventional jaggedness, and\nwe propose to integrate the projected 2D Gaussian within each pixel during\ntesting. This integration is actually a limiting case of super-sampling, which\nsignificantly improves anti-aliasing performance over vanilla Gaussian\nSplatting. Through extensive experiments using various settings and both\nbounded and unbounded scenes, we show SA-GS performs comparably with or better\nthan Mip-Splatting. Note that super-sampling and integration are only effective\nwhen our scale-adaptive filtering is activated. Our codes, data and models are\navailable at https://github.com/zsy1987/SA-GS.\n","authors":["Xiaowei Song","Jv Zheng","Shiran Yuan","Huan-ang Gao","Jingwei Zhao","Xiang He","Weihao Gu","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.19615v1.pdf","comment":"Project page: https://kevinsong729.github.io/project-pages/SA-GS/\n Code: https://github.com/zsy1987/SA-GS"},{"id":"http://arxiv.org/abs/2403.19612v1","updated":"2024-03-28T17:32:01Z","published":"2024-03-28T17:32:01Z","title":"ILPO-NET: Network for the invariant recognition of arbitrary volumetric\n patterns in 3D","summary":" Effective recognition of spatial patterns and learning their hierarchy is\ncrucial in modern spatial data analysis. Volumetric data applications seek\ntechniques ensuring invariance not only to shifts but also to pattern\nrotations. While traditional methods can readily achieve translational\ninvariance, rotational invariance possesses multiple challenges and remains an\nactive area of research. Here, we present ILPO-Net (Invariant to Local Patterns\nOrientation Network), a novel approach that handles arbitrarily shaped patterns\nwith the convolutional operation inherently invariant to local spatial pattern\norientations using the Wigner matrix expansions. Our architecture seamlessly\nintegrates the new convolution operator and, when benchmarked on diverse\nvolumetric datasets such as MedMNIST and CATH, demonstrates superior\nperformance over the baselines with significantly reduced parameter counts - up\nto 1000 times fewer in the case of MedMNIST. Beyond these demonstrations,\nILPO-Net's rotational invariance paves the way for other applications across\nmultiple disciplines. 
Our code is publicly available at\nhttps://gricad-gitlab.univ-grenoble-alpes.fr/GruLab/ILPONet.\n","authors":["Dmitrii Zhemchuzhnikov","Sergei Grudinin"],"pdf_url":"https://arxiv.org/pdf/2403.19612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19611v1","updated":"2024-03-28T17:31:23Z","published":"2024-03-28T17:31:23Z","title":"Nearest Neighbor Classication for Classical Image Upsampling","summary":" Given a set of ordered pixel data in the form of an image, our goal is to\nperform upsampling on the data such that: the resulting resolution is improved\nby some factor, the final result passes the human test, having added new,\nbelievable, and realistic information and detail to the image, the time\ncomplexity for upscaling is relatively close to that of lossy upscaling\nimplementations.\n","authors":["Evan Matthews","Nicolas Prate"],"pdf_url":"https://arxiv.org/pdf/2403.19611v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2403.19607v1","updated":"2024-03-28T17:28:32Z","published":"2024-03-28T17:28:32Z","title":"SAID-NeRF: Segmentation-AIDed NeRF for Depth Completion of Transparent\n Objects","summary":" Acquiring accurate depth information of transparent objects using\noff-the-shelf RGB-D cameras is a well-known challenge in Computer Vision and\nRobotics. Depth estimation/completion methods are typically employed and\ntrained on datasets with quality depth labels acquired from either simulation,\nadditional sensors or specialized data collection setups and known 3d models.\nHowever, acquiring reliable depth information for datasets at scale is not\nstraightforward, limiting training scalability and generalization. Neural\nRadiance Fields (NeRFs) are learning-free approaches and have demonstrated wide\nsuccess in novel view synthesis and shape recovery. However, heuristics and\ncontrolled environments (lights, backgrounds, etc) are often required to\naccurately capture specular surfaces. In this paper, we propose using Visual\nFoundation Models (VFMs) for segmentation in a zero-shot, label-free way to\nguide the NeRF reconstruction process for these objects via the simultaneous\nreconstruction of semantic fields and extensions to increase robustness. Our\nproposed method Segmentation-AIDed NeRF (SAID-NeRF) shows significant\nperformance on depth completion datasets for transparent objects and robotic\ngrasping.\n","authors":["Avinash Ummadisingu","Jongkeum Choi","Koki Yamane","Shimpei Masuda","Naoki Fukaya","Kuniyuki Takahashi"],"pdf_url":"https://arxiv.org/pdf/2403.19607v1.pdf","comment":"8 pages. An accompanying video is available at\n https://www.youtube.com/watch?v=S4NCoUq4bmE"},{"id":"http://arxiv.org/abs/2403.19603v1","updated":"2024-03-28T17:27:44Z","published":"2024-03-28T17:27:44Z","title":"Semantic Map-based Generation of Navigation Instructions","summary":" We are interested in the generation of navigation instructions, either in\ntheir own right or as training material for robotic navigation task. In this\npaper, we propose a new approach to navigation instruction generation by\nframing the problem as an image captioning task using semantic maps as visual\ninput. Conventional approaches employ a sequence of panorama images to generate\nnavigation instructions. 
Semantic maps abstract away from visual details and\nfuse the information in multiple panorama images into a single top-down\nrepresentation, thereby reducing computational complexity to process the input.\nWe present a benchmark dataset for instruction generation using semantic maps,\npropose an initial model and ask human subjects to manually assess the quality\nof generated instructions. Our initial investigations show promise in using\nsemantic maps for instruction generation instead of a sequence of panorama\nimages, but there is vast scope for improvement. We release the code for data\npreparation and model training at https://github.com/chengzu-li/VLGen.\n","authors":["Chengzu Li","Chao Zhang","Simone Teufel","Rama Sanand Doddipatla","Svetlana Stoyanchev"],"pdf_url":"https://arxiv.org/pdf/2403.19603v1.pdf","comment":"5 pages, 2 figures, 3 tables (13 pages, 3 figures, 5 tables including\n references and appendices), accepted at LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2311.11278v2","updated":"2024-03-28T17:25:51Z","published":"2023-11-19T09:41:10Z","title":"Transcending Forgery Specificity with Latent Space Augmentation for\n Generalizable Deepfake Detection","summary":" Deepfake detection faces a critical generalization hurdle, with performance\ndeteriorating when there is a mismatch between the distributions of training\nand testing data. A broadly received explanation is the tendency of these\ndetectors to be overfitted to forgery-specific artifacts, rather than learning\nfeatures that are widely applicable across various forgeries. To address this\nissue, we propose a simple yet effective detector called LSDA\n(\\underline{L}atent \\underline{S}pace \\underline{D}ata\n\\underline{A}ugmentation), which is based on a heuristic idea: representations\nwith a wider variety of forgeries should be able to learn a more generalizable\ndecision boundary, thereby mitigating the overfitting of method-specific\nfeatures (see Fig.~\\ref{fig:toy}). Following this idea, we propose to enlarge\nthe forgery space by constructing and simulating variations within and across\nforgery features in the latent space. This approach encompasses the acquisition\nof enriched, domain-specific features and the facilitation of smoother\ntransitions between different forgery types, effectively bridging domain gaps.\nOur approach culminates in refining a binary classifier that leverages the\ndistilled knowledge from the enhanced features, striving for a generalizable\ndeepfake detector. Comprehensive experiments show that our proposed method is\nsurprisingly effective and transcends state-of-the-art detectors across several\nwidely used benchmarks.\n","authors":["Zhiyuan Yan","Yuhao Luo","Siwei Lyu","Qingshan Liu","Baoyuan Wu"],"pdf_url":"https://arxiv.org/pdf/2311.11278v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19600v1","updated":"2024-03-28T17:23:45Z","published":"2024-03-28T17:23:45Z","title":"Enhance Image Classification via Inter-Class Image Mixup with Diffusion\n Model","summary":" Text-to-image (T2I) generative models have recently emerged as a powerful\ntool, enabling the creation of photo-realistic images and giving rise to a\nmultitude of applications. However, the effective integration of T2I models\ninto fundamental image classification tasks remains an open question. A\nprevalent strategy to bolster image classification performance is through\naugmenting the training set with synthetic images generated by T2I models. 
In\nthis study, we scrutinize the shortcomings of both current generative and\nconventional data augmentation techniques. Our analysis reveals that these\nmethods struggle to produce images that are both faithful (in terms of\nforeground objects) and diverse (in terms of background contexts) for\ndomain-specific concepts. To tackle this challenge, we introduce an innovative\ninter-class data augmentation method known as Diff-Mix\n(https://github.com/Zhicaiwww/Diff-Mix), which enriches the dataset by\nperforming image translations between classes. Our empirical results\ndemonstrate that Diff-Mix achieves a better balance between faithfulness and\ndiversity, leading to a marked improvement in performance across diverse image\nclassification scenarios, including few-shot, conventional, and long-tail\nclassifications for domain-specific datasets.\n","authors":["Zhicai Wang","Longhui Wei","Tan Wang","Heyu Chen","Yanbin Hao","Xiang Wang","Xiangnan He","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2403.19600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17048v2","updated":"2024-03-28T17:23:15Z","published":"2023-11-28T18:55:37Z","title":"Zero-shot Referring Expression Comprehension via Structural Similarity\n Between Images and Captions","summary":" Zero-shot referring expression comprehension aims at localizing bounding\nboxes in an image corresponding to provided textual prompts, which requires:\n(i) a fine-grained disentanglement of complex visual scene and textual context,\nand (ii) a capacity to understand relationships among disentangled entities.\nUnfortunately, existing large vision-language alignment (VLA) models, e.g.,\nCLIP, struggle with both aspects, so they cannot be directly used for this task. To\nmitigate this gap, we leverage large foundation models to disentangle both\nimages and texts into triplets in the format of (subject, predicate, object).\nAfter that, grounding is accomplished by calculating the structural similarity\nmatrix between visual and textual triplets with a VLA model, and subsequently\npropagating it to an instance-level similarity matrix. Furthermore, to equip VLA\nmodels with the ability of relationship understanding, we design a\ntriplet-matching objective to fine-tune the VLA models on a collection of\ncurated datasets containing abundant entity relationships. Experiments\ndemonstrate that our visual grounding performance increases by up to 19.5% over\nthe SOTA zero-shot model on RefCOCO/+/g. On the more challenging Who's Waldo\ndataset, our zero-shot approach achieves comparable accuracy to the fully\nsupervised model. Code is available at\nhttps://github.com/Show-han/Zeroshot_REC.\n","authors":["Zeyu Han","Fangrui Zhu","Qianru Lao","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.17048v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19596v1","updated":"2024-03-28T17:20:39Z","published":"2024-03-28T17:20:39Z","title":"LocCa: Visual Pretraining with Location-aware Captioners","summary":" Image captioning has been shown as an effective pretraining method similar to\ncontrastive pretraining. However, the incorporation of location-aware\ninformation into visual pretraining remains an area with limited research. In\nthis paper, we propose a simple visual pretraining method with location-aware\ncaptioners (LocCa). LocCa uses a simple image captioner task interface to\nteach a model to read out rich information, i.e. bounding box coordinates and\ncaptions, conditioned on the image pixel input. 
Thanks to the multitask\ncapabilities of an encoder-decoder architecture, we show that an image\ncaptioner can easily handle multiple tasks during pretraining. Our experiments\ndemonstrate that LocCa outperforms standard captioners significantly on\nlocalization downstream tasks while maintaining comparable performance on\nholistic tasks.\n","authors":["Bo Wan","Michael Tschannen","Yongqin Xian","Filip Pavetic","Ibrahim Alabdulmohsin","Xiao Wang","André Susano Pinto","Andreas Steiner","Lucas Beyer","Xiaohua Zhai"],"pdf_url":"https://arxiv.org/pdf/2403.19596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19595v1","updated":"2024-03-28T17:19:16Z","published":"2024-03-28T17:19:16Z","title":"Situation Awareness for Driver-Centric Driving Style Adaptation","summary":" There is evidence that the driving style of an autonomous vehicle is\nimportant to increase the acceptance and trust of the passengers. The driving\nsituation has been found to have a significant influence on human driving\nbehavior. However, current driving style models only partially incorporate\ndriving environment information, limiting the alignment between an agent and\nthe given situation. Therefore, we propose a situation-aware driving style\nmodel based on different visual feature encoders pretrained on fleet data, as\nwell as driving behavior predictors, which are adapted to the driving style of\na specific driver. Our experiments show that the proposed method outperforms\nstatic driving styles significantly and forms plausible situation clusters.\nFurthermore, we found that feature encoders pretrained on our dataset lead to\nmore precise driving behavior modeling. In contrast, feature encoders\npretrained supervised and unsupervised on different data sources lead to more\nspecific situation clusters, which can be utilized to constrain and control the\ndriving style adaptation for specific situations. Moreover, in a real-world\nsetting, where driving style adaptation is happening iteratively, we found that\nMLP-based behavior predictors achieve good performance initially but suffer\nfrom catastrophic forgetting. In contrast, behavior predictors based on\nsituation-dependent statistics can learn iteratively from continuous data\nstreams by design. Overall, our experiments show that important information for\ndriving behavior prediction is contained within the visual feature encoder. The\ndataset is publicly available at\nhuggingface.co/datasets/jHaselberger/SADC-Situation-Awareness-for-Driver-Centric-Driving-Style-Adaptation.\n","authors":["Johann Haselberger","Bonifaz Stuhr","Bernhard Schick","Steffen Müller"],"pdf_url":"https://arxiv.org/pdf/2403.19595v1.pdf","comment":"14 pages, 6 figures. This work has been submitted to the IEEE for\n possible publication. Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2403.19593v1","updated":"2024-03-28T17:15:23Z","published":"2024-03-28T17:15:23Z","title":"Frame by Familiar Frame: Understanding Replication in Video Diffusion\n Models","summary":" Building on the momentum of image generation diffusion models, there is an\nincreasing interest in video-based diffusion models. However, video generation\nposes greater challenges due to its higher-dimensional nature, the scarcity of\ntraining data, and the complex spatiotemporal relationships involved. Image\ngeneration models, due to their extensive data requirements, have already\nstrained computational resources to their limits. 
There have been instances of\nthese models reproducing elements from the training samples, leading to\nconcerns and even legal disputes over sample replication. Video diffusion\nmodels, which operate with even more constrained datasets and are tasked with\ngenerating both spatial and temporal content, may be more prone to replicating\nsamples from their training sets. Compounding the issue, these models are often\nevaluated using metrics that inadvertently reward replication. In our paper, we\npresent a systematic investigation into the phenomenon of sample replication in\nvideo diffusion models. We scrutinize various recent diffusion models for video\nsynthesis, assessing their tendency to replicate spatial and temporal content\nin both unconditional and conditional generation scenarios. Our study\nidentifies strategies that are less likely to lead to replication. Furthermore,\nwe propose new evaluation strategies that take replication into account,\noffering a more accurate measure of a model's ability to generate the original\ncontent.\n","authors":["Aimon Rahman","Malsha V. Perera","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2403.19593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05950v2","updated":"2024-03-28T17:14:53Z","published":"2024-03-09T16:05:31Z","title":"Classifying Objects in 3D Point Clouds Using Recurrent Neural Network: A\n GRU LSTM Hybrid Approach","summary":" Accurate classification of objects in 3D point clouds is a significant\nproblem in several applications, such as autonomous navigation and\naugmented/virtual reality scenarios, which has become a research hot spot. In\nthis paper, we present a deep learning strategy for 3D object classification\nin augmented reality. The proposed approach is a combination of the GRU and\nLSTM. LSTM networks learn longer dependencies well, but due to the number of\ngates, they take longer to train; on the other hand, GRU networks have a weaker\nperformance than LSTM, but their training speed is much higher, owing to their\nfewer gates. The proposed approach combines the\nspeed and accuracy of these two networks. The proposed approach\nachieved an accuracy of 0.99 on the 4,499,0641 points dataset, which includes\neight classes (unlabeled, man-made terrain, natural terrain, high vegetation,\nlow vegetation, buildings, hardscape, scanning artifacts, cars). Meanwhile, the\ntraditional machine learning approaches could achieve a maximum accuracy of\n0.9489 in the best case. Keywords: Point Cloud Classification, Virtual Reality,\nHybrid Model, GRULSTM, GRU, LSTM\n","authors":["Ramin Mousa","Mitra Khezli","Mohamadreza Azadi","Vahid Nikoofard","Saba Hesaraki"],"pdf_url":"https://arxiv.org/pdf/2403.05950v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19589v1","updated":"2024-03-28T17:12:55Z","published":"2024-03-28T17:12:55Z","title":"TOD3Cap: Towards 3D Dense Captioning in Outdoor Scenes","summary":" 3D dense captioning stands as a cornerstone in achieving a comprehensive\nunderstanding of 3D scenes through natural language. It has recently witnessed\nremarkable achievements, particularly in indoor settings. 
However, the\nexploration of 3D dense captioning in outdoor scenes is hindered by two major\nchallenges: 1) the \\textbf{domain gap} between indoor and outdoor scenes, such\nas dynamics and sparse visual inputs, makes it difficult to directly adapt\nexisting indoor methods; 2) the \\textbf{lack of data} with comprehensive\nbox-caption pair annotations specifically tailored for outdoor scenes. To this\nend, we introduce the new task of outdoor 3D dense captioning. As input, we\nassume a LiDAR point cloud and a set of RGB images captured by the panoramic\ncamera rig. The expected output is a set of object boxes with captions. To\ntackle this task, we propose the TOD3Cap network, which leverages the BEV\nrepresentation to generate object box proposals and integrates Relation\nQ-Former with LLaMA-Adapter to generate rich captions for these objects. We\nalso introduce the TOD3Cap dataset, the largest one to our knowledge for 3D\ndense captioning in outdoor scenes, which contains 2.3M descriptions of 64.3K\noutdoor objects from 850 scenes. Notably, our TOD3Cap network can effectively\nlocalize and caption 3D objects in outdoor scenes, which outperforms baseline\nmethods by a significant margin (+9.6 CiDEr@0.5IoU). Code, data, and models are\npublicly available at https://github.com/jxbbb/TOD3Cap.\n","authors":["Bu Jin","Yupeng Zheng","Pengfei Li","Weize Li","Yuhang Zheng","Sujie Hu","Xinyu Liu","Jinwei Zhu","Zhijie Yan","Haiyang Sun","Kun Zhan","Peng Jia","Xiaoxiao Long","Yilun Chen","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.19589v1.pdf","comment":"Code, data, and models are publicly available at\n https://github.com/jxbbb/TOD3Cap"},{"id":"http://arxiv.org/abs/2403.19588v1","updated":"2024-03-28T17:12:39Z","published":"2024-03-28T17:12:39Z","title":"DenseNets Reloaded: Paradigm Shift Beyond ResNets and ViTs","summary":" This paper revives Densely Connected Convolutional Networks (DenseNets) and\nreveals the underrated effectiveness over predominant ResNet-style\narchitectures. We believe DenseNets' potential was overlooked due to untouched\ntraining methods and traditional design elements not fully revealing their\ncapabilities. Our pilot study shows dense connections through concatenation are\nstrong, demonstrating that DenseNets can be revitalized to compete with modern\narchitectures. We methodically refine suboptimal components - architectural\nadjustments, block redesign, and improved training recipes towards widening\nDenseNets and boosting memory efficiency while keeping concatenation shortcuts.\nOur models, employing simple architectural elements, ultimately surpass Swin\nTransformer, ConvNeXt, and DeiT-III - key architectures in the residual\nlearning lineage. Furthermore, our models exhibit near state-of-the-art\nperformance on ImageNet-1K, competing with the very recent models and\ndownstream tasks, ADE20k semantic segmentation, and COCO object\ndetection/instance segmentation. Finally, we provide empirical analyses that\nuncover the merits of the concatenation over additive shortcuts, steering a\nrenewed preference towards DenseNet-style designs. 
Our code is available at\nhttps://github.com/naver-ai/rdnet.\n","authors":["Donghyun Kim","Byeongho Heo","Dongyoon Han"],"pdf_url":"https://arxiv.org/pdf/2403.19588v1.pdf","comment":"Code at https://github.com/naver-ai/rdnet"},{"id":"http://arxiv.org/abs/2403.18346v2","updated":"2024-03-28T17:09:36Z","published":"2024-03-27T08:38:49Z","title":"Quantifying and Mitigating Unimodal Biases in Multimodal Large Language\n Models: A Causal Perspective","summary":" Recent advancements in Large Language Models (LLMs) have facilitated the\ndevelopment of Multimodal LLMs (MLLMs). Despite their impressive capabilities,\nMLLMs often suffer from an over-reliance on unimodal biases (e.g., language\nbias and vision bias), leading to incorrect answers in complex multimodal\ntasks. To investigate this issue, we propose a causal framework to interpret\nthe biases in Visual Question Answering (VQA) problems. Within our framework,\nwe devise a causal graph to elucidate the predictions of MLLMs on VQA problems,\nand assess the causal effect of biases through an in-depth causal analysis.\nMotivated by the causal graph, we introduce a novel MORE dataset, consisting of\n12,000 VQA instances. This dataset is designed to challenge MLLMs' abilities,\nnecessitating multi-hop reasoning and the surmounting of unimodal biases.\nFurthermore, we propose two strategies to mitigate unimodal biases and enhance\nMLLMs' reasoning capabilities, including a Decompose-Verify-Answer (DeVA)\nframework for limited-access MLLMs and the refinement of open-source MLLMs\nthrough fine-tuning. Extensive quantitative and qualitative experiments offer\nvaluable insights for future research. Our project page is at\nhttps://opencausalab.github.io/MORE.\n","authors":["Meiqi Chen","Yixin Cao","Yan Zhang","Chaochao Lu"],"pdf_url":"https://arxiv.org/pdf/2403.18346v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19586v1","updated":"2024-03-28T17:08:58Z","published":"2024-03-28T17:08:58Z","title":"TOGS: Gaussian Splatting with Temporal Opacity Offset for Real-Time 4D\n DSA Rendering","summary":" Four-dimensional Digital Subtraction Angiography (4D DSA) is a medical\nimaging technique that provides a series of 2D images captured at different\nstages and angles during the process of contrast agent filling blood vessels.\nIt plays a significant role in the diagnosis of cerebrovascular diseases.\nImproving the rendering quality and speed under sparse sampling is important\nfor observing the status and location of lesions. The current methods exhibit\ninadequate rendering quality in sparse views and suffer from slow rendering\nspeed. To overcome these limitations, we propose TOGS, a Gaussian splatting\nmethod with opacity offset over time, which can effectively improve the\nrendering quality and speed of 4D DSA. We introduce an opacity offset table for\neach Gaussian to model the temporal variations in the radiance of the contrast\nagent. By interpolating the opacity offset table, the opacity variation of the\nGaussian at different time points can be determined. This enables us to render\nthe 2D DSA image at that specific moment. Additionally, we introduced a Smooth\nloss term in the loss function to mitigate overfitting issues that may arise in\nthe model when dealing with sparse view scenarios. During the training phase,\nwe randomly prune Gaussians, thereby reducing the storage overhead of the\nmodel. 
The experimental results demonstrate that compared to previous methods,\nthis model achieves state-of-the-art reconstruction quality under the same\nnumber of training views. Additionally, it enables real-time rendering while\nmaintaining low storage overhead. The code will be publicly available.\n","authors":["Shuai Zhang","Huangxuan Zhao","Zhenghong Zhou","Guanjun Wu","Chuansheng Zheng","Xinggang Wang","Wenyu Liu"],"pdf_url":"https://arxiv.org/pdf/2403.19586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07330v2","updated":"2024-03-28T17:07:38Z","published":"2023-12-12T14:45:45Z","title":"Learned representation-guided diffusion models for large-image\n generation","summary":" To synthesize high-fidelity samples, diffusion models typically require\nauxiliary data to guide the generation process. However, it is impractical to\nprocure the painstaking patch-level annotation effort required in specialized\ndomains like histopathology and satellite imagery; it is often performed by\ndomain experts and involves hundreds of millions of patches. Modern-day\nself-supervised learning (SSL) representations encode rich semantic and visual\ninformation. In this paper, we posit that such representations are expressive\nenough to act as proxies to fine-grained human labels. We introduce a novel\napproach that trains diffusion models conditioned on embeddings from SSL. Our\ndiffusion models successfully project these features back to high-quality\nhistopathology and remote sensing images. In addition, we construct larger\nimages by assembling spatially consistent patches inferred from SSL embeddings,\npreserving long-range dependencies. Augmenting real data by generating\nvariations of real images improves downstream classifier accuracy for\npatch-level and larger, image-scale classification tasks. Our models are\neffective even on datasets not encountered during training, demonstrating their\nrobustness and generalizability. Generating images from learned embeddings is\nagnostic to the source of the embeddings. The SSL embeddings used to generate a\nlarge image can either be extracted from a reference image, or sampled from an\nauxiliary model conditioned on any related modality (e.g. class labels, text,\ngenomic data). As proof of concept, we introduce the text-to-large image\nsynthesis paradigm where we successfully synthesize large pathology and\nsatellite images out of text descriptions.\n","authors":["Alexandros Graikos","Srikar Yellapragada","Minh-Quan Le","Saarthak Kapse","Prateek Prasanna","Joel Saltz","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2312.07330v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17113v2","updated":"2024-03-28T17:07:28Z","published":"2023-11-28T12:05:41Z","title":"Human Gaussian Splatting: Real-time Rendering of Animatable Avatars","summary":" This work addresses the problem of real-time rendering of photorealistic\nhuman body avatars learned from multi-view videos. While the classical\napproaches to model and render virtual humans generally use a textured mesh,\nrecent research has developed neural body representations that achieve\nimpressive visual quality. However, these models are difficult to render in\nreal-time and their quality degrades when the character is animated with body\nposes different than the training observations. We propose an animatable human\nmodel based on 3D Gaussian Splatting, that has recently emerged as a very\nefficient alternative to neural radiance fields. 
The body is represented by a\nset of Gaussian primitives in a canonical space which is deformed with a coarse-to-fine\napproach that combines forward skinning and local non-rigid refinement.\nWe describe how to learn our Human Gaussian Splatting (HuGS) model in an\nend-to-end fashion from multi-view observations, and evaluate it against the\nstate-of-the-art approaches for novel pose synthesis of clothed bodies. Our\nmethod achieves a 1.5 dB PSNR improvement over the state-of-the-art on the THuman4\ndataset while being able to render in real-time (80 fps for 512x512\nresolution).\n","authors":["Arthur Moreau","Jifei Song","Helisa Dhamo","Richard Shaw","Yiren Zhou","Eduardo Pérez-Pellitero"],"pdf_url":"https://arxiv.org/pdf/2311.17113v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19584v1","updated":"2024-03-28T17:07:02Z","published":"2024-03-28T17:07:02Z","title":"Img2Loc: Revisiting Image Geolocalization using Multi-modality\n Foundation Models and Image-based Retrieval-Augmented Generation","summary":" Geolocating precise locations from images presents a challenging problem in\ncomputer vision and information retrieval. Traditional methods typically employ\neither classification, which divides the Earth's surface into grid cells and\nclassifies images accordingly, or retrieval, which identifies locations by\nmatching images with a database of image-location pairs. However,\nclassification-based approaches are limited by the cell size and cannot yield\nprecise predictions, while retrieval-based systems usually suffer from poor\nsearch quality and inadequate coverage of the global landscape at varied scale\nand aggregation levels. To overcome these drawbacks, we present Img2Loc, a\nnovel system that redefines image geolocalization as a text generation task.\nThis is achieved using cutting-edge large multi-modality models like GPT4V or\nLLaVA with retrieval augmented generation. Img2Loc first employs CLIP-based\nrepresentations to generate an image-based coordinate query database. It then\nuniquely combines the query results with the image itself, forming elaborate prompts\ncustomized for LMMs. When tested on benchmark datasets such as Im2GPS3k and\nYFCC4k, Img2Loc not only surpasses the performance of previous state-of-the-art\nmodels but does so without any model training.\n","authors":["Zhongliang Zhou","Jielu Zhang","Zihan Guan","Mengxuan Hu","Ni Lao","Lan Mu","Sheng Li","Gengchen Mai"],"pdf_url":"https://arxiv.org/pdf/2403.19584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18028v2","updated":"2024-03-28T17:06:15Z","published":"2024-03-26T18:29:39Z","title":"Predicting Species Occurrence Patterns from Partial Observations","summary":" To address the interlinked biodiversity and climate crises, we need an\nunderstanding of where species occur and how these patterns are changing.\nHowever, observational data on most species remains very limited, and the\namount of data available varies greatly between taxonomic groups. We introduce\nthe problem of predicting species occurrence patterns given (a) satellite\nimagery, and (b) known information on the occurrence of other species. To\nevaluate algorithms on this task, we introduce SatButterfly, a dataset of\nsatellite images, environmental data and observational data for butterflies,\nwhich is designed to pair with the existing SatBird dataset of bird\nobservational data. 
To address this task, we propose a general model, R-Tran,\nfor predicting species occurrence patterns that enables the use of partial\nobservational data wherever found. We find that R-Tran outperforms other\nmethods in predicting species encounter rates with partial information both\nwithin a taxon (birds) and across taxa (birds and butterflies). Our approach\nopens new perspectives to leveraging insights from species with abundant data\nto other species with scarce data, by modelling the ecosystems in which they\nco-occur.\n","authors":["Hager Radi Abdelwahed","Mélisande Teng","David Rolnick"],"pdf_url":"https://arxiv.org/pdf/2403.18028v2.pdf","comment":"Tackling Climate Change with Machine Learning workshop at ICLR 2024"},{"id":"http://arxiv.org/abs/2403.19580v1","updated":"2024-03-28T17:05:04Z","published":"2024-03-28T17:05:04Z","title":"OV-Uni3DETR: Towards Unified Open-Vocabulary 3D Object Detection via\n Cycle-Modality Propagation","summary":" In the current state of 3D object detection research, the severe scarcity of\nannotated 3D data, substantial disparities across different data modalities,\nand the absence of a unified architecture, have impeded the progress towards\nthe goal of universality. In this paper, we propose \\textbf{OV-Uni3DETR}, a\nunified open-vocabulary 3D detector via cycle-modality propagation. Compared\nwith existing 3D detectors, OV-Uni3DETR offers distinct advantages: 1)\nOpen-vocabulary 3D detection: During training, it leverages various accessible\ndata, especially extensive 2D detection images, to boost training diversity.\nDuring inference, it can detect both seen and unseen classes. 2) Modality\nunifying: It seamlessly accommodates input data from any given modality,\neffectively addressing scenarios involving disparate modalities or missing\nsensor information, thereby supporting test-time modality switching. 3) Scene\nunifying: It provides a unified multi-modal model architecture for diverse\nscenes collected by distinct sensors. Specifically, we propose the\ncycle-modality propagation, aimed at propagating knowledge bridging 2D and 3D\nmodalities, to support the aforementioned functionalities. 2D semantic\nknowledge from large-vocabulary learning guides novel class discovery in the 3D\ndomain, and 3D geometric knowledge provides localization supervision for 2D\ndetection images. OV-Uni3DETR achieves the state-of-the-art performance on\nvarious scenarios, surpassing existing methods by more than 6\\% on average. Its\nperformance using only RGB images is on par with or even surpasses that of\nprevious point cloud based methods. Code and pre-trained models will be\nreleased later.\n","authors":["Zhenyu Wang","Yali Li","Taichi Liu","Hengshuang Zhao","Shengjin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.19580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19579v1","updated":"2024-03-28T17:04:07Z","published":"2024-03-28T17:04:07Z","title":"The Bad Batches: Enhancing Self-Supervised Learning in Image\n Classification Through Representative Batch Curation","summary":" The pursuit of learning robust representations without human supervision is a\nlongstanding challenge. The recent advancements in self-supervised contrastive\nlearning approaches have demonstrated high performance across various\nrepresentation learning challenges. However, current methods depend on the\nrandom transformation of training examples, resulting in some cases of\nunrepresentative positive pairs that can have a large impact on learning. 
This\nlimitation not only impedes the convergence of the learning process but the\nrobustness of the learnt representation as well as requiring larger batch sizes\nto improve robustness to such bad batches. This paper attempts to alleviate the\ninfluence of false positive and false negative pairs by employing pairwise\nsimilarity calculations through the Fr\\'echet ResNet Distance (FRD), thereby\nobtaining robust representations from unlabelled data. The effectiveness of the\nproposed method is substantiated by empirical results, where a linear\nclassifier trained on self-supervised contrastive representations achieved an\nimpressive 87.74\\% top-1 accuracy on STL10 and 99.31\\% on the Flower102\ndataset. These results emphasize the potential of the proposed approach in\npushing the boundaries of the state-of-the-art in self-supervised contrastive\nlearning, particularly for image classification tasks.\n","authors":["Ozgu Goksu","Nicolas Pugeault"],"pdf_url":"https://arxiv.org/pdf/2403.19579v1.pdf","comment":"8 Pages, 4 figures, IEEE WCCI 2024 Conference"},{"id":"http://arxiv.org/abs/2402.19470v2","updated":"2024-03-28T16:52:45Z","published":"2024-02-29T18:57:39Z","title":"Towards Generalizable Tumor Synthesis","summary":" Tumor synthesis enables the creation of artificial tumors in medical images,\nfacilitating the training of AI models for tumor detection and segmentation.\nHowever, success in tumor synthesis hinges on creating visually realistic\ntumors that are generalizable across multiple organs and, furthermore, the\nresulting AI models being capable of detecting real tumors in images sourced\nfrom different domains (e.g., hospitals). This paper made a progressive stride\ntoward generalizable tumor synthesis by leveraging a critical observation:\nearly-stage tumors (< 2cm) tend to have similar imaging characteristics in\ncomputed tomography (CT), whether they originate in the liver, pancreas, or\nkidneys. We have ascertained that generative AI models, e.g., Diffusion Models,\ncan create realistic tumors generalized to a range of organs even when trained\non a limited number of tumor examples from only one organ. Moreover, we have\nshown that AI models trained on these synthetic tumors can be generalized to\ndetect and segment real tumors from CT volumes, encompassing a broad spectrum\nof patient demographics, imaging protocols, and healthcare facilities.\n","authors":["Qi Chen","Xiaoxi Chen","Haorui Song","Zhiwei Xiong","Alan Yuille","Chen Wei","Zongwei Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.19470v2.pdf","comment":"The IEEE / CVF Computer Vision and Pattern Recognition Conference\n (CVPR 2024)"},{"id":"http://arxiv.org/abs/2311.17112v2","updated":"2024-03-28T16:51:18Z","published":"2023-11-28T11:23:34Z","title":"Parameter Efficient Fine-tuning via Cross Block Orchestration for\n Segment Anything Model","summary":" Parameter-efficient fine-tuning (PEFT) is an effective methodology to unleash\nthe potential of large foundation models in novel scenarios with limited\ntraining data. In the computer vision community, PEFT has shown effectiveness\nin image classification, but little research has studied its ability for image\nsegmentation. Fine-tuning segmentation models usually require a heavier\nadjustment of parameters to align the proper projection directions in the\nparameter space for new scenarios. 
This raises a challenge to existing PEFT\nalgorithms, as they often inject a limited number of individual parameters into\neach block, which prevents substantial adjustment of the projection direction\nof the parameter space due to the limitation of Hidden Markov Chain along\nblocks. In this paper, we equip PEFT with a cross-block orchestration mechanism\nto enable the adaptation of the Segment Anything Model (SAM) to various\ndownstream scenarios. We introduce a novel inter-block communication module,\nwhich integrates a learnable relation matrix to facilitate communication among\ndifferent coefficient sets of each PEFT block's parameter space. Moreover, we\npropose an intra-block enhancement module, which introduces a linear projection\nhead whose weights are generated from a hyper-complex layer, further enhancing\nthe impact of the adjustment of projection directions on the entire parameter\nspace. Extensive experiments on diverse benchmarks demonstrate that our\nproposed approach consistently improves the segmentation performance\nsignificantly on novel scenarios with only around 1K additional parameters.\n","authors":["Zelin Peng","Zhengqin Xu","Zhilin Zeng","Lingxi Xie","Qi Tian","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2311.17112v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2312.02137v2","updated":"2024-03-28T16:50:37Z","published":"2023-12-04T18:56:22Z","title":"MANUS: Markerless Grasp Capture using Articulated 3D Gaussians","summary":" Understanding how we grasp objects with our hands has important applications\nin areas like robotics and mixed reality. However, this challenging problem\nrequires accurate modeling of the contact between hands and objects. To capture\ngrasps, existing methods use skeletons, meshes, or parametric models that do\nnot represent hand shape accurately, resulting in inaccurate contacts. We\npresent MANUS, a method for Markerless Hand-Object Grasp Capture using\nArticulated 3D Gaussians. We build a novel articulated 3D Gaussians\nrepresentation that extends 3D Gaussian splatting for high-fidelity\nrepresentation of articulating hands. Since our representation uses Gaussian\nprimitives, it enables us to efficiently and accurately estimate contacts\nbetween the hand and the object. For the most accurate results, our method\nrequires tens of camera views that current datasets do not provide. We\ntherefore build MANUS-Grasps, a new dataset that contains hand-object grasps\nviewed from 50+ cameras across 30+ scenes and 3 subjects, comprising over 7M\nframes. In addition to extensive qualitative results, we also show that our\nmethod outperforms others on a quantitative contact evaluation method that uses\npaint transfer from the object to the hand.\n","authors":["Chandradeep Pokhariya","Ishaan N Shah","Angela Xing","Zekun Li","Kefan Chen","Avinash Sharma","Srinath Sridhar"],"pdf_url":"https://arxiv.org/pdf/2312.02137v2.pdf","comment":"IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR)\n 2024"},{"id":"http://arxiv.org/abs/2312.11598v3","updated":"2024-03-28T16:49:40Z","published":"2023-12-18T18:16:52Z","title":"SkillDiffuser: Interpretable Hierarchical Planning via Skill\n Abstractions in Diffusion-Based Task Execution","summary":" Diffusion models have demonstrated strong potential for robotic trajectory\nplanning. However, generating coherent trajectories from high-level\ninstructions remains challenging, especially for long-range composition tasks\nrequiring multiple sequential skills. 
We propose SkillDiffuser, an end-to-end\nhierarchical planning framework integrating interpretable skill learning with\nconditional diffusion planning to address this problem. At the higher level,\nthe skill abstraction module learns discrete, human-understandable skill\nrepresentations from visual observations and language instructions. These\nlearned skill embeddings are then used to condition the diffusion model to\ngenerate customized latent trajectories aligned with the skills. This allows\ngenerating diverse state trajectories that adhere to the learnable skills. By\nintegrating skill learning with conditional trajectory generation,\nSkillDiffuser produces coherent behavior following abstract instructions across\ndiverse tasks. Experiments on multi-task robotic manipulation benchmarks like\nMeta-World and LOReL demonstrate state-of-the-art performance and\nhuman-interpretable skill representations from SkillDiffuser. More\nvisualization results and information could be found on our website.\n","authors":["Zhixuan Liang","Yao Mu","Hengbo Ma","Masayoshi Tomizuka","Mingyu Ding","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2312.11598v3.pdf","comment":"Accepted by CVPR 2024. Camera ready version. Project page:\n https://skilldiffuser.github.io/"},{"id":"http://arxiv.org/abs/2403.16385v2","updated":"2024-03-28T16:45:44Z","published":"2024-03-25T03:02:27Z","title":"Synthesize Step-by-Step: Tools, Templates and LLMs as Data Generators\n for Reasoning-Based Chart VQA","summary":" Understanding data visualizations like charts and plots requires reasoning\nabout both visual elements and numerics. Although strong in extractive\nquestions, current chart visual question answering (chart VQA) models suffer on\ncomplex reasoning questions. In this work, we address the lack of reasoning\nability by data augmentation. We leverage Large Language Models (LLMs), which\nhave shown to have strong reasoning ability, as an automatic data annotator\nthat generates question-answer annotations for chart images. The key innovation\nin our method lies in the Synthesize Step-by-Step strategy: our LLM-based data\ngenerator learns to decompose the complex question into step-by-step\nsub-questions (rationales), which are then used to derive the final answer\nusing external tools, i.e. Python. This step-wise generation procedure is\ntrained on synthetic data generated using a template-based QA generation\npipeline. Experimental results highlight the significance of the proposed\nstep-by-step generation. By training with the LLM-augmented data (LAMENDA), we\nsignificantly enhance the chart VQA models, achieving the state-of-the-art\naccuracy on the ChartQA and PlotQA datasets. In particular, our approach\nimproves the accuracy of the previous state-of-the-art approach from 38% to 54%\non the human-written questions in the ChartQA dataset, which needs strong\nreasoning. 
We hope our work underscores the potential of synthetic data and\nencourages further exploration of data augmentation using LLMs for\nreasoning-heavy tasks.\n","authors":["Zhuowan Li","Bhavan Jasani","Peng Tang","Shabnam Ghadar"],"pdf_url":"https://arxiv.org/pdf/2403.16385v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19554v1","updated":"2024-03-28T16:38:04Z","published":"2024-03-28T16:38:04Z","title":"Cross-Attention is Not Always Needed: Dynamic Cross-Attention for\n Audio-Visual Dimensional Emotion Recognition","summary":" In video-based emotion recognition, audio and visual modalities are often\nexpected to have a complementary relationship, which is widely explored using\ncross-attention. However, they may also exhibit weak complementary\nrelationships, resulting in poor representations of audio-visual features, thus\ndegrading the performance of the system. To address this issue, we propose\nDynamic Cross-Attention (DCA) that can dynamically select cross-attended or\nunattended features on the fly based on their strong or weak complementary\nrelationship with each other, respectively. Specifically, a simple yet\nefficient gating layer is designed to evaluate the contribution of the\ncross-attention mechanism and choose cross-attended features only when they\nexhibit a strong complementary relationship, otherwise unattended features. We\nevaluate the performance of the proposed approach on the challenging RECOLA and\nAff-Wild2 datasets. We also compare the proposed approach with other variants\nof cross-attention and show that the proposed model consistently improves the\nperformance on both datasets.\n","authors":["R. Gnana Praveen","Jahangir Alam"],"pdf_url":"https://arxiv.org/pdf/2403.19554v1.pdf","comment":"Accepted at IEEE ICME2024"},{"id":"http://arxiv.org/abs/2403.19549v1","updated":"2024-03-28T16:32:06Z","published":"2024-03-28T16:32:06Z","title":"GlORIE-SLAM: Globally Optimized RGB-only Implicit Encoding Point Cloud\n SLAM","summary":" Recent advancements in RGB-only dense Simultaneous Localization and Mapping\n(SLAM) have predominantly utilized grid-based neural implicit encodings and/or\nstruggle to efficiently realize global map and pose consistency. To this end,\nwe propose an efficient RGB-only dense SLAM system using a flexible neural\npoint cloud scene representation that adapts to keyframe poses and depth\nupdates, without needing costly backpropagation. Another critical challenge of\nRGB-only SLAM is the lack of geometric priors. To alleviate this issue, with\nthe aid of a monocular depth estimator, we introduce a novel DSPO layer for\nbundle adjustment which optimizes the pose and depth of keyframes along with\nthe scale of the monocular depth. Finally, our system benefits from loop\nclosure and online global bundle adjustment and performs either better or\ncompetitive to existing dense neural RGB SLAM methods in tracking, mapping and\nrendering accuracy on the Replica, TUM-RGBD and ScanNet datasets. The source\ncode will be made available.\n","authors":["Ganlin Zhang","Erik Sandström","Youmin Zhang","Manthan Patel","Luc Van Gool","Martin R. 
Oswald"],"pdf_url":"https://arxiv.org/pdf/2403.19549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15981v2","updated":"2024-03-28T16:21:30Z","published":"2024-03-24T02:15:14Z","title":"Exploring Accurate 3D Phenotyping in Greenhouse through Neural Radiance\n Fields","summary":" Accurate collection of plant phenotyping is critical to optimising\nsustainable farming practices in precision agriculture. Traditional phenotyping\nin controlled laboratory environments, while valuable, falls short in\nunderstanding plant growth under real-world conditions. Emerging sensor and\ndigital technologies offer a promising approach for direct phenotyping of\nplants in farm environments. This study investigates a learning-based\nphenotyping method using the Neural Radiance Field to achieve accurate in-situ\nphenotyping of pepper plants in greenhouse environments. To quantitatively\nevaluate the performance of this method, traditional point cloud registration\non 3D scanning data is implemented for comparison. Experimental result shows\nthat NeRF(Neural Radiance Fields) achieves competitive accuracy compared to the\n3D scanning methods. The mean distance error between the scanner-based method\nand the NeRF-based method is 0.865mm. This study shows that the learning-based\nNeRF method achieves similar accuracy to 3D scanning-based methods but with\nimproved scalability and robustness.\n","authors":["Junhong Zhao","Wei Ying","Yaoqiang Pan","Zhenfeng Yi","Chao Chen","Kewei Hu","Hanwen Kang"],"pdf_url":"https://arxiv.org/pdf/2403.15981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.08989v3","updated":"2024-03-28T16:17:43Z","published":"2022-04-13T21:20:42Z","title":"Efficient Deep Learning-based Estimation of the Vital Signs on\n Smartphones","summary":" With the increasing use of smartphones in our daily lives, these devices have\nbecome capable of performing many complex tasks. Concerning the need for\ncontinuous monitoring of vital signs, especially for the elderly or those with\ncertain types of diseases, the development of algorithms that can estimate\nvital signs using smartphones has attracted researchers worldwide. In\nparticular, researchers have been exploring ways to estimate vital signs, such\nas heart rate, oxygen saturation levels, and respiratory rate, using algorithms\nthat can be run on smartphones. However, many of these algorithms require\nmultiple pre-processing steps that might introduce some implementation\noverheads or require the design of a couple of hand-crafted stages to obtain an\noptimal result. To address this issue, this research proposes a novel\nend-to-end solution to mobile-based vital sign estimation using deep learning\nthat eliminates the need for pre-processing. By using a fully convolutional\narchitecture, the proposed model has much fewer parameters and less\ncomputational complexity compared to the architectures that use fully-connected\nlayers as the prediction heads. This also reduces the risk of overfitting.\nAdditionally, a public dataset for vital sign estimation, which includes 62\nvideos collected from 35 men and 27 women, is provided. 
Overall, the proposed\nend-to-end approach promises significantly improved efficiency and performance\nfor on-device health monitoring on readily available consumer electronics.\n","authors":["Taha Samavati","Mahdi Farvardin","Aboozar Ghaffari"],"pdf_url":"https://arxiv.org/pdf/2204.08989v3.pdf","comment":"10 pages, 8 figures, 11 tables"},{"id":"http://arxiv.org/abs/2403.19539v1","updated":"2024-03-28T16:13:22Z","published":"2024-03-28T16:13:22Z","title":"De-confounded Data-free Knowledge Distillation for Handling Distribution\n Shifts","summary":" Data-Free Knowledge Distillation (DFKD) is a promising task to train\nhigh-performance small models to enhance actual deployment without relying on\nthe original training data. Existing methods commonly avoid relying on private\ndata by utilizing synthetic or sampled data. However, a long-overlooked issue\nis that the severe distribution shifts between their substitution and original\ndata, which manifests as huge differences in the quality of images and class\nproportions. The harmful shifts are essentially the confounder that\nsignificantly causes performance bottlenecks. To tackle the issue, this paper\nproposes a novel perspective with causal inference to disentangle the student\nmodels from the impact of such shifts. By designing a customized causal graph,\nwe first reveal the causalities among the variables in the DFKD task.\nSubsequently, we propose a Knowledge Distillation Causal Intervention (KDCI)\nframework based on the backdoor adjustment to de-confound the confounder. KDCI\ncan be flexibly combined with most existing state-of-the-art baselines.\nExperiments in combination with six representative DFKD methods demonstrate the\neffectiveness of our KDCI, which can obviously help existing methods under\nalmost all settings, \\textit{e.g.}, improving the baseline by up to 15.54\\%\naccuracy on the CIFAR-100 dataset.\n","authors":["Yuzheng Wang","Dingkang Yang","Zhaoyu Chen","Yang Liu","Siao Liu","Wenqiang Zhang","Lihua Zhang","Lizhe Qi"],"pdf_url":"https://arxiv.org/pdf/2403.19539v1.pdf","comment":"Accepted by CVPR24"},{"id":"http://arxiv.org/abs/2403.19534v1","updated":"2024-03-28T16:07:55Z","published":"2024-03-28T16:07:55Z","title":"Locate, Assign, Refine: Taming Customized Image Inpainting with\n Text-Subject Guidance","summary":" Prior studies have made significant progress in image inpainting guided by\neither text or subject image. However, the research on editing with their\ncombined guidance is still in the early stages. To tackle this challenge, we\npresent LAR-Gen, a novel approach for image inpainting that enables seamless\ninpainting of masked scene images, incorporating both the textual prompts and\nspecified subjects. Our approach adopts a coarse-to-fine manner to ensure\nsubject identity preservation and local semantic coherence. The process\ninvolves (i) Locate: concatenating the noise with masked scene image to achieve\nprecise regional editing, (ii) Assign: employing decoupled cross-attention\nmechanism to accommodate multi-modal guidance, and (iii) Refine: using a novel\nRefineNet to supplement subject details. Additionally, to address the issue of\nscarce training data, we introduce a novel data construction pipeline. This\npipeline extracts substantial pairs of data consisting of local text prompts\nand corresponding visual instances from a vast image dataset, leveraging\npublicly available large models. 
Extensive experiments and varied application\nscenarios demonstrate the superiority of LAR-Gen in terms of both identity\npreservation and text semantic consistency. Project page can be found at\n\\url{https://ali-vilab.github.io/largen-page/}.\n","authors":["Yulin Pan","Chaojie Mao","Zeyinzi Jiang","Zhen Han","Jingfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.19534v1.pdf","comment":"22 pages, 14 figures"},{"id":"http://arxiv.org/abs/2403.19527v1","updated":"2024-03-28T16:02:03Z","published":"2024-03-28T16:02:03Z","title":"Instance-Adaptive and Geometric-Aware Keypoint Learning for\n Category-Level 6D Object Pose Estimation","summary":" Category-level 6D object pose estimation aims to estimate the rotation,\ntranslation and size of unseen instances within specific categories. In this\narea, dense correspondence-based methods have achieved leading performance.\nHowever, they do not explicitly consider the local and global geometric\ninformation of different instances, resulting in poor generalization ability to\nunseen instances with significant shape variations. To deal with this problem,\nwe propose a novel Instance-Adaptive and Geometric-Aware Keypoint Learning\nmethod for category-level 6D object pose estimation (AG-Pose), which includes\ntwo key designs: (1) The first design is an Instance-Adaptive Keypoint\nDetection module, which can adaptively detect a set of sparse keypoints for\nvarious instances to represent their geometric structures. (2) The second\ndesign is a Geometric-Aware Feature Aggregation module, which can efficiently\nintegrate the local and global geometric information into keypoint features.\nThese two modules can work together to establish robust keypoint-level\ncorrespondences for unseen instances, thus enhancing the generalization ability\nof the model.Experimental results on CAMERA25 and REAL275 datasets show that\nthe proposed AG-Pose outperforms state-of-the-art methods by a large margin\nwithout category-specific shape priors.\n","authors":["Xiao Lin","Wenfei Yang","Yuan Gao","Tianzhu Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.19527v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2403.19522v1","updated":"2024-03-28T15:57:20Z","published":"2024-03-28T15:57:20Z","title":"Model Stock: All we need is just a few fine-tuned models","summary":" This paper introduces an efficient fine-tuning method for large pre-trained\nmodels, offering strong in-distribution (ID) and out-of-distribution (OOD)\nperformance. Breaking away from traditional practices that need a multitude of\nfine-tuned models for averaging, our approach employs significantly fewer\nmodels to achieve final weights yet yield superior accuracy. Drawing from key\ninsights in the weight space of fine-tuned weights, we uncover a strong link\nbetween the performance and proximity to the center of weight space. Based on\nthis, we introduce a method that approximates a center-close weight using only\ntwo fine-tuned models, applicable during or after training. Our innovative\nlayer-wise weight averaging technique surpasses state-of-the-art model methods\nsuch as Model Soup, utilizing only two fine-tuned models. This strategy can be\naptly coined Model Stock, highlighting its reliance on selecting a minimal\nnumber of models to draw a more optimized-averaged model. 
We demonstrate the\nefficacy of Model Stock with fine-tuned models based upon pre-trained CLIP\narchitectures, achieving remarkable performance on both ID and OOD tasks on the\nstandard benchmarks, all while barely bringing extra computational demands. Our\ncode and pre-trained models are available at\nhttps://github.com/naver-ai/model-stock.\n","authors":["Dong-Hwan Jang","Sangdoo Yun","Dongyoon Han"],"pdf_url":"https://arxiv.org/pdf/2403.19522v1.pdf","comment":"Code at https://github.com/naver-ai/model-stock"},{"id":"http://arxiv.org/abs/2401.01286v4","updated":"2024-03-28T15:56:55Z","published":"2024-01-02T16:54:58Z","title":"A Comprehensive Study of Knowledge Editing for Large Language Models","summary":" Large Language Models (LLMs) have shown extraordinary capabilities in\nunderstanding and generating text that closely mirrors human communication.\nHowever, a primary limitation lies in the significant computational demands\nduring training, arising from their extensive parameterization. This challenge\nis further intensified by the dynamic nature of the world, necessitating\nfrequent updates to LLMs to correct outdated information or integrate new\nknowledge, thereby ensuring their continued relevance. Note that many\napplications demand continual model adjustments post-training to address\ndeficiencies or undesirable behaviors. There is an increasing interest in\nefficient, lightweight methods for on-the-fly model modifications. To this end,\nrecent years have seen a burgeoning in the techniques of knowledge editing for\nLLMs, which aim to efficiently modify LLMs' behaviors within specific domains\nwhile preserving overall performance across various inputs. In this paper, we\nfirst define the knowledge editing problem and then provide a comprehensive\nreview of cutting-edge approaches. Drawing inspiration from educational and\ncognitive research theories, we propose a unified categorization criterion that\nclassifies knowledge editing methods into three groups: resorting to external\nknowledge, merging knowledge into the model, and editing intrinsic knowledge.\nFurthermore, we introduce a new benchmark, KnowEdit, for a comprehensive\nempirical evaluation of representative knowledge editing approaches.\nAdditionally, we provide an in-depth analysis of knowledge location, which can\ngive a deeper understanding of the knowledge structures inherent within LLMs.\nFinally, we discuss several potential applications of knowledge editing,\noutlining its broad and impactful implications.\n","authors":["Ningyu Zhang","Yunzhi Yao","Bozhong Tian","Peng Wang","Shumin Deng","Mengru Wang","Zekun Xi","Shengyu Mao","Jintian Zhang","Yuansheng Ni","Siyuan Cheng","Ziwen Xu","Xin Xu","Jia-Chen Gu","Yong Jiang","Pengjun Xie","Fei Huang","Lei Liang","Zhiqiang Zhang","Xiaowei Zhu","Jun Zhou","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2401.01286v4.pdf","comment":"Ongoing work; 52 pages, 282 citations; benchmark is available at\n https://huggingface.co/datasets/zjunlp/KnowEdit code is available at\n https://github.com/zjunlp/EasyEdit paper list is available at\n https://github.com/zjunlp/KnowledgeEditingPapers"},{"id":"http://arxiv.org/abs/2309.13610v2","updated":"2024-03-28T15:52:16Z","published":"2023-09-24T11:19:13Z","title":"VisionKG: Unleashing the Power of Visual Datasets via Knowledge Graph","summary":" The availability of vast amounts of visual data with heterogeneous features\nis a key factor for developing, testing, and benchmarking of new computer\nvision (CV) algorithms and architectures. 
Most visual datasets are created and\ncurated for specific tasks or with limited image data distribution for very\nspecific situations, and there is no unified approach to manage and access them\nacross diverse sources, tasks, and taxonomies. This not only creates\nunnecessary overheads when building robust visual recognition systems, but also\nintroduces biases into learning systems and limits the capabilities of\ndata-centric AI. To address these problems, we propose the Vision Knowledge\nGraph (VisionKG), a novel resource that interlinks, organizes and manages\nvisual datasets via knowledge graphs and Semantic Web technologies. It can\nserve as a unified framework facilitating simple access and querying of\nstate-of-the-art visual datasets, regardless of their heterogeneous formats and\ntaxonomies. One of the key differences between our approach and existing\nmethods is that ours is knowledge-based rather than metadata-based. It enhances\nthe enrichment of the semantics at both image and instance levels and offers\nvarious data retrieval and exploratory services via SPARQL. VisionKG currently\ncontains 519 million RDF triples that describe approximately 40 million\nentities, and are accessible at https://vision.semkg.org and through APIs. With\nthe integration of 30 datasets and four popular CV tasks, we demonstrate its\nusefulness across various scenarios when working with CV pipelines.\n","authors":["Jicheng Yuan","Anh Le-Tuan","Manh Nguyen-Duc","Trung-Kien Tran","Manfred Hauswirth","Danh Le-Phuoc"],"pdf_url":"https://arxiv.org/pdf/2309.13610v2.pdf","comment":"Accepted at ESWC 2024"},{"id":"http://arxiv.org/abs/2312.02069v2","updated":"2024-03-28T15:51:05Z","published":"2023-12-04T17:28:35Z","title":"GaussianAvatars: Photorealistic Head Avatars with Rigged 3D Gaussians","summary":" We introduce GaussianAvatars, a new method to create photorealistic head\navatars that are fully controllable in terms of expression, pose, and\nviewpoint. The core idea is a dynamic 3D representation based on 3D Gaussian\nsplats that are rigged to a parametric morphable face model. This combination\nfacilitates photorealistic rendering while allowing for precise animation\ncontrol via the underlying parametric model, e.g., through expression transfer\nfrom a driving sequence or by manually changing the morphable model parameters.\nWe parameterize each splat by a local coordinate frame of a triangle and\noptimize for explicit displacement offset to obtain a more accurate geometric\nrepresentation. During avatar reconstruction, we jointly optimize for the\nmorphable model parameters and Gaussian splat parameters in an end-to-end\nfashion. We demonstrate the animation capabilities of our photorealistic avatar\nin several challenging scenarios. 
For instance, we show reenactments from a\ndriving video, where our method outperforms existing works by a significant\nmargin.\n","authors":["Shenhan Qian","Tobias Kirschstein","Liam Schoneveld","Davide Davoli","Simon Giebenhain","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2312.02069v2.pdf","comment":"Project page: https://shenhanqian.github.io/gaussian-avatars"},{"id":"http://arxiv.org/abs/2308.16682v2","updated":"2024-03-28T15:49:42Z","published":"2023-08-31T12:36:50Z","title":"DiffusionPoser: Real-time Human Motion Reconstruction From Arbitrary\n Sparse Sensors Using Autoregressive Diffusion","summary":" Motion capture from a limited number of body-worn sensors, such as inertial\nmeasurement units (IMUs) and pressure insoles, has important applications in\nhealth, human performance, and entertainment. Recent work has focused on\naccurately reconstructing whole-body motion from a specific sensor\nconfiguration using six IMUs. While a common goal across applications is to use\nthe minimal number of sensors to achieve required accuracy, the optimal\narrangement of the sensors might differ from application to application. We\npropose a single diffusion model, DiffusionPoser, which reconstructs human\nmotion in real-time from an arbitrary combination of sensors, including IMUs\nplaced at specified locations, and, pressure insoles. Unlike existing methods,\nour model grants users the flexibility to determine the number and arrangement\nof sensors tailored to the specific activity of interest, without the need for\nretraining. A novel autoregressive inferencing scheme ensures real-time motion\nreconstruction that closely aligns with measured sensor signals. The generative\nnature of DiffusionPoser ensures realistic behavior, even for\ndegrees-of-freedom not directly measured. Qualitative results can be found on\nour website: https://diffusionposer.github.io/.\n","authors":["Tom Van Wouwe","Seunghwan Lee","Antoine Falisse","Scott Delp","C. Karen Liu"],"pdf_url":"https://arxiv.org/pdf/2308.16682v2.pdf","comment":"accepted at CVPR2024"},{"id":"http://arxiv.org/abs/2403.19517v1","updated":"2024-03-28T15:48:16Z","published":"2024-03-28T15:48:16Z","title":"XScale-NVS: Cross-Scale Novel View Synthesis with Hash Featurized\n Manifold","summary":" We propose XScale-NVS for high-fidelity cross-scale novel view synthesis of\nreal-world large-scale scenes. Existing representations based on explicit\nsurface suffer from discretization resolution or UV distortion, while implicit\nvolumetric representations lack scalability for large scenes due to the\ndispersed weight distribution and surface ambiguity. In light of the above\nchallenges, we introduce hash featurized manifold, a novel hash-based\nfeaturization coupled with a deferred neural rendering framework. This approach\nfully unlocks the expressivity of the representation by explicitly\nconcentrating the hash entries on the 2D manifold, thus effectively\nrepresenting highly detailed contents independent of the discretization\nresolution. We also introduce a novel dataset, namely GigaNVS, to benchmark\ncross-scale, high-resolution novel view synthesis of real-world large-scale\nscenes. Our method significantly outperforms competing baselines on various\nreal-world scenes, yielding an average LPIPS that is 40% lower than prior\nstate-of-the-art on the challenging GigaNVS benchmark. 
Please see our project\npage at: xscalenvs.github.io.\n","authors":["Guangyu Wang","Jinzhi Zhang","Fan Wang","Ruqi Huang","Lu Fang"],"pdf_url":"https://arxiv.org/pdf/2403.19517v1.pdf","comment":"Accepted to CVPR 2024. Project page: xscalenvs.github.io/"},{"id":"http://arxiv.org/abs/2403.19514v1","updated":"2024-03-28T15:45:03Z","published":"2024-03-28T15:45:03Z","title":"CDIMC-net: Cognitive Deep Incomplete Multi-view Clustering Network","summary":" In recent years, incomplete multi-view clustering, which studies the\nchallenging multi-view clustering problem on missing views, has received\ngrowing research interests. Although a series of methods have been proposed to\naddress this issue, the following problems still exist: 1) Almost all of the\nexisting methods are based on shallow models, which makes it difficult to obtain\ndiscriminative common representations. 2) These methods are generally sensitive\nto noise or outliers since the negative samples are treated equally as the\nimportant samples. In this paper, we propose a novel incomplete multi-view\nclustering network, called Cognitive Deep Incomplete Multi-view Clustering\nNetwork (CDIMC-net), to address these issues. Specifically, it captures the\nhigh-level features and local structure of each view by incorporating the\nview-specific deep encoders and graph embedding strategy into a framework.\nMoreover, based on the human cognition, i.e., learning from easy to hard, it\nintroduces a self-paced strategy to select the most confident samples for model\ntraining, which can reduce the negative influence of outliers. Experimental\nresults on several incomplete datasets show that CDIMC-net outperforms the\nstate-of-the-art incomplete multi-view clustering methods.\n","authors":["Jie Wen","Zheng Zhang","Yong Xu","Bob Zhang","Lunke Fei","Guo-Sen Xie"],"pdf_url":"https://arxiv.org/pdf/2403.19514v1.pdf","comment":"Accepted by IJCAI 2020"},{"id":"http://arxiv.org/abs/2403.19508v1","updated":"2024-03-28T15:41:43Z","published":"2024-03-28T15:41:43Z","title":"Debiasing Cardiac Imaging with Controlled Latent Diffusion Models","summary":" The progress in deep learning solutions for disease diagnosis and prognosis\nbased on cardiac magnetic resonance imaging is hindered by highly imbalanced\nand biased training data. To address this issue, we propose a method to\nalleviate imbalances inherent in datasets through the generation of synthetic\ndata based on sensitive attributes such as sex, age, body mass index, and\nhealth condition. We adopt ControlNet based on a denoising diffusion\nprobabilistic model to condition on text assembled from patient metadata and\ncardiac geometry derived from segmentation masks using a large-cohort study,\nspecifically, the UK Biobank. We assess our method by evaluating the realism of\nthe generated images using established quantitative metrics. Furthermore, we\nconduct a downstream classification task aimed at debiasing a classifier by\nrectifying imbalances within underrepresented groups through synthetically\ngenerated samples. Our experiments demonstrate the effectiveness of the\nproposed approach in mitigating dataset imbalances, such as the scarcity of\nyounger patients or individuals with normal BMI level suffering from heart\nfailure. This work represents a major step towards the adoption of synthetic\ndata for the development of fair and generalizable models for medical\nclassification tasks. 
Notably, we conduct all our experiments using a single,\nconsumer-level GPU to highlight the feasibility of our approach within\nresource-constrained environments. Our code is available at\nhttps://github.com/faildeny/debiasing-cardiac-mri.\n","authors":["Grzegorz Skorupko","Richard Osuala","Zuzanna Szafranowska","Kaisar Kushibar","Nay Aung","Steffen E Petersen","Karim Lekadir","Polyxeni Gkontra"],"pdf_url":"https://arxiv.org/pdf/2403.19508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02496v3","updated":"2024-03-28T15:33:42Z","published":"2023-07-04T08:00:31Z","title":"Learning to reconstruct the bubble distribution with conductivity maps\n using Invertible Neural Networks and Error Diffusion","summary":" Electrolysis is crucial for eco-friendly hydrogen production, but gas bubbles\ngenerated during the process hinder reactions, reduce cell efficiency, and\nincrease energy consumption. Additionally, these gas bubbles cause changes in\nthe conductivity inside the cell, resulting in corresponding variations in the\ninduced magnetic field around the cell. Therefore, measuring these gas\nbubble-induced magnetic field fluctuations using external magnetic sensors and\nsolving the inverse problem of Biot-Savart Law allows for estimating the\nconductivity in the cell and, thus, bubble size and location. However,\ndetermining high-resolution conductivity maps from only a few induced magnetic\nfield measurements is an ill-posed inverse problem. To overcome this, we\nexploit Invertible Neural Networks (INNs) to reconstruct the conductivity\nfield. Our qualitative results and quantitative evaluation using random error\ndiffusion show that INN achieves far superior performance compared to Tikhonov\nregularization.\n","authors":["Nishant Kumar","Lukas Krause","Thomas Wondrak","Sven Eckert","Kerstin Eckert","Stefan Gumhold"],"pdf_url":"https://arxiv.org/pdf/2307.02496v3.pdf","comment":"Accepted for Oral presentation at WCIPT11 (11th World Congress on\n Industrial Process Tomography)"},{"id":"http://arxiv.org/abs/2403.19501v1","updated":"2024-03-28T15:31:36Z","published":"2024-03-28T15:31:36Z","title":"RELI11D: A Comprehensive Multimodal Human Motion Dataset and Method","summary":" Comprehensive capturing of human motions requires both accurate captures of\ncomplex poses and precise localization of the human within scenes. Most of the\nHPE datasets and methods primarily rely on RGB, LiDAR, or IMU data. However,\nsolely using these modalities or a combination of them may not be adequate for\nHPE, particularly for complex and fast movements. For holistic human motion\nunderstanding, we present RELI11D, a high-quality multimodal human motion\ndataset that involves LiDAR, IMU system, RGB camera, and Event camera. It records\nthe motions of 10 actors performing 5 sports in 7 scenes, including 3.32 hours\nof synchronized LiDAR point clouds, IMU measurement data, RGB videos and Event\nstreams. Through extensive experiments, we demonstrate that the RELI11D presents\nconsiderable challenges and opportunities as it contains many rapid and complex\nmotions that require precise location. To address the challenge of integrating\ndifferent modalities, we propose LEIR, a multimodal baseline that effectively\nutilizes LiDAR Point Cloud, Event stream, and RGB through our cross-attention\nfusion strategy. We show that LEIR exhibits promising results for rapid motions\nand daily motions and that utilizing the characteristics of multiple modalities\ncan indeed improve HPE performance. 
Both the dataset and source code will be\nreleased publicly to the research community, fostering collaboration and\nenabling further exploration in this field.\n","authors":["Ming Yan","Yan Zhang","Shuqiang Cai","Shuqi Fan","Xincheng Lin","Yudi Dai","Siqi Shen","Chenglu Wen","Lan Xu","Yuexin Ma","Cheng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.19501v1.pdf","comment":"CVPR2024, Project website: http://www.lidarhumanmotion.net/reli11d/"},{"id":"http://arxiv.org/abs/2403.19497v1","updated":"2024-03-28T15:27:34Z","published":"2024-03-28T15:27:34Z","title":"Surface-based parcellation and vertex-wise analysis of ultra\n high-resolution ex vivo 7 tesla MRI in neurodegenerative diseases","summary":" Magnetic resonance imaging (MRI) is the standard modality to understand human\nbrain structure and function in vivo (antemortem). Decades of research in human\nneuroimaging has led to the widespread development of methods and tools to\nprovide automated volume-based segmentations and surface-based parcellations\nwhich help localize brain functions to specialized anatomical regions. Recently\nex vivo (postmortem) imaging of the brain has opened-up avenues to study brain\nstructure at sub-millimeter ultra high-resolution revealing details not\npossible to observe with in vivo MRI. Unfortunately, there has been limited\nmethodological development in ex vivo MRI primarily due to lack of datasets and\nlimited centers with such imaging resources. Therefore, in this work, we\npresent one-of-its-kind dataset of 82 ex vivo T2w whole brain hemispheres MRI\nat 0.3 mm isotropic resolution spanning Alzheimer's disease and related\ndementias. We adapted and developed a fast and easy-to-use automated\nsurface-based pipeline to parcellate, for the first time, ultra high-resolution\nex vivo brain tissue at the native subject space resolution using the\nDesikan-Killiany-Tourville (DKT) brain atlas. This allows us to perform\nvertex-wise analysis in the template space and thereby link morphometry\nmeasures with pathology measurements derived from histology. We will\nopen-source our dataset docker container, Jupyter notebooks for ready-to-use\nout-of-the-box set of tools and command line options to advance ex vivo MRI\nclinical brain imaging research on the project webpage.\n","authors":["Pulkit Khandelwal","Michael Tran Duong","Constanza Fuentes","Amanda Denning","Winifred Trotman","Ranjit Ittyerah","Alejandra Bahena","Theresa Schuck","Marianna Gabrielyan","Karthik Prabhakaran","Daniel Ohm","Gabor Mizsei","John Robinson","Monica Munoz","John Detre","Edward Lee","David Irwin","Corey McMillan","M. Dylan Tisdall","Sandhitsu Das","David Wolk","Paul A. Yushkevich"],"pdf_url":"https://arxiv.org/pdf/2403.19497v1.pdf","comment":"Under review at MICCAI 2024"},{"id":"http://arxiv.org/abs/2403.19495v1","updated":"2024-03-28T15:27:13Z","published":"2024-03-28T15:27:13Z","title":"CoherentGS: Sparse Novel View Synthesis with Coherent 3D Gaussians","summary":" The field of 3D reconstruction from images has rapidly evolved in the past\nfew years, first with the introduction of Neural Radiance Field (NeRF) and more\nrecently with 3D Gaussian Splatting (3DGS). The latter provides a significant\nedge over NeRF in terms of the training and inference speed, as well as the\nreconstruction quality. 
Although 3DGS works well for dense input images, the\nunstructured point-cloud like representation quickly overfits to the more\nchallenging setup of extremely sparse input images (e.g., 3 images), creating a\nrepresentation that appears as a jumble of needles from novel views. To address\nthis issue, we propose regularized optimization and depth-based initialization.\nOur key idea is to introduce a structured Gaussian representation that can be\ncontrolled in 2D image space. We then constrain the Gaussians, in particular\ntheir position, and prevent them from moving independently during optimization.\nSpecifically, we introduce single and multiview constraints through an implicit\nconvolutional decoder and a total variation loss, respectively. With the\ncoherency introduced to the Gaussians, we further constrain the optimization\nthrough a flow-based loss function. To support our regularized optimization, we\npropose an approach to initialize the Gaussians using monocular depth estimates\nat each input view. We demonstrate significant improvements compared to the\nstate-of-the-art sparse-view NeRF-based approaches on a variety of scenes.\n","authors":["Avinash Paliwal","Wei Ye","Jinhui Xiong","Dmytro Kotovenko","Rakesh Ranjan","Vikas Chandra","Nima Khademi Kalantari"],"pdf_url":"https://arxiv.org/pdf/2403.19495v1.pdf","comment":"Project page: https://people.engr.tamu.edu/nimak/Papers/CoherentGS"},{"id":"http://arxiv.org/abs/2403.14472v2","updated":"2024-03-28T15:24:17Z","published":"2024-03-21T15:18:30Z","title":"Detoxifying Large Language Models via Knowledge Editing","summary":" This paper investigates using knowledge editing techniques to detoxify Large\nLanguage Models (LLMs). We construct a benchmark, SafeEdit, which covers nine\nunsafe categories with various powerful attack prompts and equips comprehensive\nmetrics for systematic evaluation. We conduct experiments with several\nknowledge editing approaches, indicating that knowledge editing has the\npotential to efficiently detoxify LLMs with limited impact on general\nperformance. Then, we propose a simple yet effective baseline, dubbed\nDetoxifying with Intraoperative Neural Monitoring (DINM), to diminish the\ntoxicity of LLMs within a few tuning steps via only one instance. We further\nprovide an in-depth analysis of the internal mechanism for various detoxify\napproaches, demonstrating that previous methods like SFT and DPO may merely\nsuppress the activations of toxic parameters, while DINM mitigates the toxicity\nof the toxic parameters to a certain extent, making permanent adjustments. We\nhope that these insights could shed light on future work of developing\ndetoxifying approaches and the underlying knowledge mechanisms of LLMs. Code\nand benchmark are available at https://github.com/zjunlp/EasyEdit.\n","authors":["Mengru Wang","Ningyu Zhang","Ziwen Xu","Zekun Xi","Shumin Deng","Yunzhi Yao","Qishen Zhang","Linyi Yang","Jindong Wang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2403.14472v2.pdf","comment":"Ongoing work. Project website:\n https://zjunlp.github.io/project/SafeEdit Due to the specificity of the\n knowledge editing setting, we revise Tables 1 and 3 to present a fair\n comparison of experimental results. More experimental results will be updated\n soon"},{"id":"http://arxiv.org/abs/2403.17608v2","updated":"2024-03-28T15:24:16Z","published":"2024-03-26T11:39:00Z","title":"Fake or JPEG? 
Revealing Common Biases in Generated Image Detection\n Datasets","summary":" The widespread adoption of generative image models has highlighted the urgent\nneed to detect artificial content, which is a crucial step in combating\nwidespread manipulation and misinformation. Consequently, numerous detectors\nand associated datasets have emerged. However, many of these datasets\ninadvertently introduce undesirable biases, thereby impacting the effectiveness\nand evaluation of detectors. In this paper, we emphasize that many datasets for\nAI-generated image detection contain biases related to JPEG compression and\nimage size. Using the GenImage dataset, we demonstrate that detectors indeed\nlearn from these undesired factors. Furthermore, we show that removing the\nnamed biases substantially increases robustness to JPEG compression and\nsignificantly alters the cross-generator performance of evaluated detectors.\nSpecifically, it leads to more than 11 percentage points increase in\ncross-generator performance for ResNet50 and Swin-T detectors on the GenImage\ndataset, achieving state-of-the-art results.\n We provide the dataset and source codes of this paper on the anonymous\nwebsite: https://www.unbiased-genimage.org\n","authors":["Patrick Grommelt","Louis Weiss","Franz-Josef Pfreundt","Janis Keuper"],"pdf_url":"https://arxiv.org/pdf/2403.17608v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19492v1","updated":"2024-03-28T15:23:52Z","published":"2024-03-28T15:23:52Z","title":"Segmentation tool for images of cracks","summary":" Safety-critical infrastructures, such as bridges, are periodically inspected\nto check for existing damage, such as fatigue cracks and corrosion, and to\nguarantee the safe use of the infrastructure. Visual inspection is the most\nfrequent type of general inspection, despite the fact that its detection\ncapability is rather limited, especially for fatigue cracks. Machine learning\nalgorithms can be used for augmenting the capability of classical visual\ninspection of bridge structures, however, the implementation of such an\nalgorithm requires a massive annotated training dataset, which is\ntime-consuming to produce. This paper proposes a semi-automatic crack\nsegmentation tool that eases the manual segmentation of cracks on images needed\nto create a training dataset for a machine learning algorithm. Also, it can be\nused to measure the geometry of the crack. This tool makes use of an image\nprocessing algorithm, which was initially developed for the analysis of\nvascular systems on retinal images. The algorithm relies on a multi-orientation\nwavelet transform, which is applied to the image to construct the so-called\n\"orientation scores\", i.e. a modified version of the image. Afterwards, the\nfiltered orientation scores are used to formulate an optimal path problem that\nidentifies the crack. The globally optimal path between manually selected crack\nendpoints is computed, using a state-of-the-art geometric tracking method. The\npixel-wise segmentation is done afterwards using the obtained crack path. The\nproposed method outperforms fully automatic methods and shows potential to be\nan adequate alternative to the manual data annotation.\n","authors":["Andrii Kompanets","Remco Duits","Davide Leonetti","Nicky van den Berg","H. 
H."," Snijder"],"pdf_url":"https://arxiv.org/pdf/2403.19492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19490v1","updated":"2024-03-28T15:22:29Z","published":"2024-03-28T15:22:29Z","title":"Jointly Training and Pruning CNNs via Learnable Agent Guidance and\n Alignment","summary":" Structural model pruning is a prominent approach used for reducing the\ncomputational cost of Convolutional Neural Networks (CNNs) before their\ndeployment on resource-constrained devices. Yet, the majority of proposed ideas\nrequire a pretrained model before pruning, which is costly to secure. In this\npaper, we propose a novel structural pruning approach to jointly learn the\nweights and structurally prune architectures of CNN models. The core element of\nour method is a Reinforcement Learning (RL) agent whose actions determine the\npruning ratios of the CNN model's layers, and the resulting model's accuracy\nserves as its reward. We conduct the joint training and pruning by iteratively\ntraining the model's weights and the agent's policy, and we regularize the\nmodel's weights to align with the selected structure by the agent. The evolving\nmodel's weights result in a dynamic reward function for the agent, which\nprevents using prominent episodic RL methods with stationary environment\nassumption for our purpose. We address this challenge by designing a mechanism\nto model the complex changing dynamics of the reward function and provide a\nrepresentation of it to the RL agent. To do so, we take a learnable embedding\nfor each training epoch and employ a recurrent model to calculate a\nrepresentation of the changing environment. We train the recurrent model and\nembeddings using a decoder model to reconstruct observed rewards. Such a design\nempowers our agent to effectively leverage episodic observations along with the\nenvironment representations to learn a proper policy to determine performant\nsub-networks of the CNN model. Our extensive experiments on CIFAR-10 and\nImageNet using ResNets and MobileNets demonstrate the effectiveness of our\nmethod.\n","authors":["Alireza Ganjdanesh","Shangqian Gao","Heng Huang"],"pdf_url":"https://arxiv.org/pdf/2403.19490v1.pdf","comment":"IEEE/CVF Conference on Computer Vision and Pattern Recognition, CVPR\n 2024"},{"id":"http://arxiv.org/abs/2311.16516v4","updated":"2024-03-28T15:15:04Z","published":"2023-11-27T18:20:03Z","title":"Segment Every Out-of-Distribution Object","summary":" Semantic segmentation models, while effective for in-distribution categories,\nface challenges in real-world deployment due to encountering\nout-of-distribution (OoD) objects. Detecting these OoD objects is crucial for\nsafety-critical applications. Existing methods rely on anomaly scores, but\nchoosing a suitable threshold for generating masks presents difficulties and\ncan lead to fragmentation and inaccuracy. This paper introduces a method to\nconvert anomaly \\textbf{S}core \\textbf{T}o segmentation \\textbf{M}ask, called\nS2M, a simple and effective framework for OoD detection in semantic\nsegmentation. Unlike assigning anomaly scores to pixels, S2M directly segments\nthe entire OoD object. 
By transforming anomaly scores into prompts for a\npromptable segmentation model, S2M eliminates the need for threshold selection.\nExtensive experiments demonstrate that S2M outperforms the state-of-the-art by\napproximately 20% in IoU and 40% in mean F1 score, on average, across various\nbenchmarks including Fishyscapes, Segment-Me-If-You-Can, and RoadAnomaly\ndatasets.\n","authors":["Wenjie Zhao","Jia Li","Xin Dong","Yu Xiang","Yunhui Guo"],"pdf_url":"https://arxiv.org/pdf/2311.16516v4.pdf","comment":"20 pages, 14 figures"},{"id":"http://arxiv.org/abs/2403.19474v1","updated":"2024-03-28T15:01:58Z","published":"2024-03-28T15:01:58Z","title":"SG-PGM: Partial Graph Matching Network with Semantic Geometric Fusion\n for 3D Scene Graph Alignment and Its Downstream Tasks","summary":" Scene graphs have been recently introduced into 3D spatial understanding as a\ncomprehensive representation of the scene. The alignment between 3D scene\ngraphs is the first step of many downstream tasks such as scene graph aided\npoint cloud registration, mosaicking, overlap checking, and robot navigation.\nIn this work, we treat 3D scene graph alignment as a partial graph-matching\nproblem and propose to solve it with a graph neural network. We reuse the\ngeometric features learned by a point cloud registration method and associate\nthe clustered point-level geometric features with the node-level semantic\nfeature via our designed feature fusion module. Partial matching is enabled by\nusing a learnable method to select the top-k similar node pairs. Subsequent\ndownstream tasks such as point cloud registration are achieved by running a\npre-trained registration network within the matched regions. We further propose\na point-matching rescoring method, that uses the node-wise alignment of the 3D\nscene graph to reweight the matching candidates from a pre-trained point cloud\nregistration method. It reduces the false point correspondences estimated\nespecially in low-overlapping cases. Experiments show that our method improves\nthe alignment accuracy by 10~20% in low-overlap and random transformation\nscenarios and outperforms the existing work in multiple downstream tasks.\n","authors":["Yaxu Xie","Alain Pagani","Didier Stricker"],"pdf_url":"https://arxiv.org/pdf/2403.19474v1.pdf","comment":"16 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.15905v3","updated":"2024-03-28T15:00:04Z","published":"2024-03-23T18:19:02Z","title":"Towards Low-Energy Adaptive Personalization for Resource-Constrained\n Devices","summary":" The personalization of machine learning (ML) models to address data drift is\na significant challenge in the context of Internet of Things (IoT)\napplications. Presently, most approaches focus on fine-tuning either the full\nbase model or its last few layers to adapt to new data, while often neglecting\nenergy costs. However, various types of data drift exist, and fine-tuning the\nfull base model or the last few layers may not result in optimal performance in\ncertain scenarios. We propose Target Block Fine-Tuning (TBFT), a low-energy\nadaptive personalization framework designed for resource-constrained devices.\nWe categorize data drift and personalization into three types: input-level,\nfeature-level, and output-level. For each type, we fine-tune different blocks\nof the model to achieve optimal performance with reduced energy costs.\nSpecifically, input-, feature-, and output-level correspond to fine-tuning the\nfront, middle, and rear blocks of the model. 
We evaluate TBFT on a ResNet\nmodel, three datasets, three different training sizes, and a Raspberry Pi.\nCompared with the $Block Avg$, where each block is fine-tuned individually and\ntheir performance improvements are averaged, TBFT exhibits an improvement in\nmodel accuracy by an average of 15.30% whilst saving 41.57% energy consumption\non average compared with full fine-tuning.\n","authors":["Yushan Huang","Josh Millar","Yuxuan Long","Yuchen Zhao","Hamed Hadaddi"],"pdf_url":"https://arxiv.org/pdf/2403.15905v3.pdf","comment":"Accepted to The 4th Workshop on Machine Learning and Systems\n (EuroMLSys '24)"},{"id":"http://arxiv.org/abs/2403.19473v1","updated":"2024-03-28T14:59:56Z","published":"2024-03-28T14:59:56Z","title":"Benchmarking Implicit Neural Representation and Geometric Rendering in\n Real-Time RGB-D SLAM","summary":" Implicit neural representation (INR), in combination with geometric\nrendering, has recently been employed in real-time dense RGB-D SLAM. Despite\nactive research endeavors being made, there lacks a unified protocol for fair\nevaluation, impeding the evolution of this area. In this work, we establish, to\nour knowledge, the first open-source benchmark framework to evaluate the\nperformance of a wide spectrum of commonly used INRs and rendering functions\nfor mapping and localization. The goal of our benchmark is to 1) gain an\nintuition of how different INRs and rendering functions impact mapping and\nlocalization and 2) establish a unified evaluation protocol w.r.t. the design\nchoices that may impact the mapping and localization. With the framework, we\nconduct a large suite of experiments, offering various insights in choosing the\nINRs and geometric rendering functions: for example, the dense feature grid\noutperforms other INRs (e.g. tri-plane and hash grid), even when geometric and\ncolor features are jointly encoded for memory efficiency. To extend the\nfindings into the practical scenario, a hybrid encoding strategy is proposed to\nbring the best of the accuracy and completion from the grid-based and\ndecomposition-based INRs. We further propose explicit hybrid encoding for\nhigh-fidelity dense grid mapping to comply with the RGB-D SLAM system that puts\nthe premise on robustness and computation efficiency.\n","authors":["Tongyan Hua","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.19473v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18361v2","updated":"2024-03-28T14:59:44Z","published":"2024-03-27T08:53:13Z","title":"ViTAR: Vision Transformer with Any Resolution","summary":" This paper tackles a significant challenge faced by Vision Transformers\n(ViTs): their constrained scalability across different image resolutions.\nTypically, ViTs experience a performance decline when processing resolutions\ndifferent from those seen during training. Our work introduces two key\ninnovations to address this issue. Firstly, we propose a novel module for\ndynamic resolution adjustment, designed with a single Transformer block,\nspecifically to achieve highly efficient incremental token integration.\nSecondly, we introduce fuzzy positional encoding in the Vision Transformer to\nprovide consistent positional awareness across multiple resolutions, thereby\npreventing overfitting to any single training resolution. 
Our resulting model,\nViTAR (Vision Transformer with Any Resolution), demonstrates impressive\nadaptability, achieving 83.3\\% top-1 accuracy at a 1120x1120 resolution and\n80.4\\% accuracy at a 4032x4032 resolution, all while reducing computational\ncosts. ViTAR also shows strong performance in downstream tasks such as instance\nand semantic segmentation and can easily be combined with self-supervised learning\ntechniques like Masked AutoEncoder. Our work provides a cost-effective solution\nfor enhancing the resolution scalability of ViTs, paving the way for more\nversatile and efficient high-resolution image processing.\n","authors":["Qihang Fan","Quanzeng You","Xiaotian Han","Yongfei Liu","Yunzhe Tao","Huaibo Huang","Ran He","Hongxia Yang"],"pdf_url":"https://arxiv.org/pdf/2403.18361v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17216v2","updated":"2024-03-28T14:58:59Z","published":"2023-11-28T20:40:45Z","title":"Self-Discovering Interpretable Diffusion Latent Directions for\n Responsible Text-to-Image Generation","summary":" Diffusion-based models have gained significant popularity for text-to-image\ngeneration due to their exceptional image-generation capabilities. A risk with\nthese models is the potential generation of inappropriate content, such as\nbiased or harmful images. However, the underlying reasons for generating such\nundesired content from the perspective of the diffusion model's internal\nrepresentation remain unclear. Previous work interprets vectors in an\ninterpretable latent space of diffusion models as semantic concepts. However,\nexisting approaches cannot discover directions for arbitrary concepts, such as\nthose related to inappropriate concepts. In this work, we propose a novel\nself-supervised approach to find interpretable latent directions for a given\nconcept. With the discovered vectors, we further propose a simple approach to\nmitigate inappropriate generation. Extensive experiments have been conducted to\nverify the effectiveness of our mitigation approach, namely, for fair\ngeneration, safe generation, and responsible text-enhancing generation. Project\npage: \\url{https://interpretdiffusion.github.io}.\n","authors":["Hang Li","Chengzhi Shen","Philip Torr","Volker Tresp","Jindong Gu"],"pdf_url":"https://arxiv.org/pdf/2311.17216v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19467v1","updated":"2024-03-28T14:47:32Z","published":"2024-03-28T14:47:32Z","title":"Beyond Talking -- Generating Holistic 3D Human Dyadic Motion for\n Communication","summary":" In this paper, we introduce an innovative task focused on human\ncommunication, aiming to generate 3D holistic human motions for both speakers\nand listeners. Central to our approach is the incorporation of factorization to\ndecouple audio features and the combination of textual semantic information,\nthereby facilitating the creation of more realistic and coordinated movements.\nWe separately train VQ-VAEs with respect to the holistic motions of both\nspeaker and listener. We consider the real-time mutual influence between the\nspeaker and the listener and propose a novel chain-like transformer-based\nauto-regressive model specifically designed to characterize real-world\ncommunication scenarios effectively, which can generate the motions of both the\nspeaker and the listener simultaneously. These designs ensure that the results\nwe generate are both coordinated and diverse. Our approach demonstrates\nstate-of-the-art performance on two benchmark datasets. 
Furthermore, we\nintroduce the HoCo holistic communication dataset, which is a valuable resource\nfor future research. Our HoCo dataset and code will be released for research\npurposes upon acceptance.\n","authors":["Mingze Sun","Chao Xu","Xinyu Jiang","Yang Liu","Baigui Sun","Ruqi Huang"],"pdf_url":"https://arxiv.org/pdf/2403.19467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19456v1","updated":"2024-03-28T14:27:36Z","published":"2024-03-28T14:27:36Z","title":"Break-for-Make: Modular Low-Rank Adaptations for Composable\n Content-Style Customization","summary":" Personalized generation paradigms empower designers to customize visual\nintellectual properties with the help of textual descriptions by tuning or\nadapting pre-trained text-to-image models on a few images. Recent works explore\napproaches for concurrently customizing both content and detailed visual style\nappearance. However, these existing approaches often generate images where the\ncontent and style are entangled. In this study, we reconsider the customization\nof content and style concepts from the perspective of parameter space\nconstruction. Unlike existing methods that utilize a shared parameter space for\ncontent and style, we propose a learning framework that separates the parameter\nspace to facilitate individual learning of content and style, thereby enabling\ndisentangled content and style. To achieve this goal, we introduce \"partly\nlearnable projection\" (PLP) matrices to separate the original adapters into\ndivided sub-parameter spaces. We propose \"break-for-make\" customization\nlearning pipeline based on PLP, which is simple yet effective. We break the\noriginal adapters into \"up projection\" and \"down projection\", train content and\nstyle PLPs individually with the guidance of corresponding textual prompts in\nthe separate adapters, and maintain generalization by employing a\nmulti-correspondence projection learning strategy. Based on the adapters broken\napart for separate training content and style, we then make the entity\nparameter space by reconstructing the content and style PLPs matrices, followed\nby fine-tuning the combined adapter to generate the target object with the\ndesired appearance. Experiments on various styles, including textures,\nmaterials, and artistic style, show that our method outperforms\nstate-of-the-art single/multiple concept learning pipelines in terms of\ncontent-style-prompt alignment.\n","authors":["Yu Xu","Fan Tang","Juan Cao","Yuxin Zhang","Oliver Deussen","Weiming Dong","Jintao Li","Tong-Yee Lee"],"pdf_url":"https://arxiv.org/pdf/2403.19456v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03008v2","updated":"2024-03-28T14:16:09Z","published":"2023-09-06T13:54:31Z","title":"Sparse 3D Reconstruction via Object-Centric Ray Sampling","summary":" We propose a novel method for 3D object reconstruction from a sparse set of\nviews captured from a 360-degree calibrated camera rig. We represent the object\nsurface through a hybrid model that uses both an MLP-based neural\nrepresentation and a triangle mesh. A key contribution in our work is a novel\nobject-centric sampling scheme of the neural representation, where rays are\nshared among all views. This efficiently concentrates and reduces the number of\nsamples used to update the neural model at each iteration. This sampling scheme\nrelies on the mesh representation to ensure also that samples are\nwell-distributed along its normals. The rendering is then performed efficiently\nby a differentiable renderer. 
We demonstrate that this sampling scheme results\nin a more effective training of the neural representation, does not require the\nadditional supervision of segmentation masks, yields state of the art 3D\nreconstructions, and works with sparse views on the Google's Scanned Objects,\nTank and Temples and MVMC Car datasets. Code available at:\nhttps://github.com/llukmancerkezi/ROSTER\n","authors":["Llukman Cerkezi","Paolo Favaro"],"pdf_url":"https://arxiv.org/pdf/2309.03008v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19444v1","updated":"2024-03-28T14:15:13Z","published":"2024-03-28T14:15:13Z","title":"Transparent and Clinically Interpretable AI for Lung Cancer Detection in\n Chest X-Rays","summary":" The rapidly advancing field of Explainable Artificial Intelligence (XAI) aims\nto tackle the issue of trust regarding the use of complex black-box deep\nlearning models in real-world applications. Existing post-hoc XAI techniques\nhave recently been shown to have poor performance on medical data, producing\nunreliable explanations which are infeasible for clinical use. To address this,\nwe propose an ante-hoc approach based on concept bottleneck models which\nintroduces for the first time clinical concepts into the classification\npipeline, allowing the user valuable insight into the decision-making process.\nOn a large public dataset of chest X-rays and associated medical reports, we\nfocus on the binary classification task of lung cancer detection. Our approach\nyields improved classification performance in lung cancer detection when\ncompared to baseline deep learning models (F1 > 0.9), while also generating\nclinically relevant and more reliable explanations than existing techniques. We\nevaluate our approach against post-hoc image XAI techniques LIME and SHAP, as\nwell as CXR-LLaVA, a recent textual XAI tool which operates in the context of\nquestion answering on chest X-rays.\n","authors":["Amy Rafferty","Rishi Ramaesh","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2403.19444v1.pdf","comment":"12 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.09412v2","updated":"2024-03-28T14:10:08Z","published":"2024-03-14T14:03:29Z","title":"OpenGraph: Open-Vocabulary Hierarchical 3D Graph Representation in\n Large-Scale Outdoor Environments","summary":" Environment representations endowed with sophisticated semantics are pivotal\nfor facilitating seamless interaction between robots and humans, enabling them\nto effectively carry out various tasks. Open-vocabulary maps, powered by\nVisual-Language models (VLMs), possess inherent advantages, including zero-shot\nlearning and support for open-set classes. However, existing open-vocabulary\nmaps are primarily designed for small-scale environments, such as desktops or\nrooms, and are typically geared towards limited-area tasks involving robotic\nindoor navigation or in-place manipulation. They face challenges in direct\ngeneralization to outdoor environments characterized by numerous objects and\ncomplex tasks, owing to limitations in both understanding level and map\nstructure. In this work, we propose OpenGraph, the first open-vocabulary\nhierarchical graph representation designed for large-scale outdoor\nenvironments. OpenGraph initially extracts instances and their captions from\nvisual images, enhancing textual reasoning by encoding them. Subsequently, it\nachieves 3D incremental object-centric mapping with feature embedding by\nprojecting images onto LiDAR point clouds. 
Finally, the environment is\nsegmented based on lane graph connectivity to construct a hierarchical graph.\nValidation results from public dataset SemanticKITTI demonstrate that OpenGraph\nachieves the highest segmentation and query accuracy. The source code of\nOpenGraph is publicly available at https://github.com/BIT-DYN/OpenGraph.\n","authors":["Yinan Deng","Jiahui Wang","Jingyu Zhao","Xinyu Tian","Guangyan Chen","Yi Yang","Yufeng Yue"],"pdf_url":"https://arxiv.org/pdf/2403.09412v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19438v1","updated":"2024-03-28T14:07:13Z","published":"2024-03-28T14:07:13Z","title":"SubjectDrive: Scaling Generative Data in Autonomous Driving via Subject\n Control","summary":" Autonomous driving progress relies on large-scale annotated datasets. In this\nwork, we explore the potential of generative models to produce vast quantities\nof freely-labeled data for autonomous driving applications and present\nSubjectDrive, the first model proven to scale generative data production in a\nway that could continuously improve autonomous driving applications. We\ninvestigate the impact of scaling up the quantity of generative data on the\nperformance of downstream perception models and find that enhancing data\ndiversity plays a crucial role in effectively scaling generative data\nproduction. Therefore, we have developed a novel model equipped with a subject\ncontrol mechanism, which allows the generative model to leverage diverse\nexternal data sources for producing varied and useful data. Extensive\nevaluations confirm SubjectDrive's efficacy in generating scalable autonomous\ndriving training data, marking a significant step toward revolutionizing data\nproduction methods in this field.\n","authors":["Binyuan Huang","Yuqing Wen","Yucheng Zhao","Yaosi Hu","Yingfei Liu","Fan Jia","Weixin Mao","Tiancai Wang","Chi Zhang","Chang Wen Chen","Zhenzhong Chen","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.19438v1.pdf","comment":"Project page: https://subjectdrive.github.io/"},{"id":"http://arxiv.org/abs/2403.19435v1","updated":"2024-03-28T14:04:17Z","published":"2024-03-28T14:04:17Z","title":"BAMM: Bidirectional Autoregressive Motion Model","summary":" Generating human motion from text has been dominated by denoising motion\nmodels either through diffusion or generative masking process. However, these\nmodels face great limitations in usability by requiring prior knowledge of the\nmotion length. Conversely, autoregressive motion models address this limitation\nby adaptively predicting motion endpoints, at the cost of degraded generation\nquality and editing capabilities. To address these challenges, we propose\nBidirectional Autoregressive Motion Model (BAMM), a novel text-to-motion\ngeneration framework. BAMM consists of two key components: (1) a motion\ntokenizer that transforms 3D human motion into discrete tokens in latent space,\nand (2) a masked self-attention transformer that autoregressively predicts\nrandomly masked tokens via a hybrid attention masking strategy. By unifying\ngenerative masked modeling and autoregressive modeling, BAMM captures rich and\nbidirectional dependencies among motion tokens, while learning the\nprobabilistic mapping from textual inputs to motion outputs with\ndynamically-adjusted motion sequence length. This feature enables BAMM to\nsimultaneously achieve high-quality motion generation with enhanced usability\nand built-in motion editability. 
Extensive experiments on HumanML3D and KIT-ML\ndatasets demonstrate that BAMM surpasses current state-of-the-art methods in\nboth qualitative and quantitative measures.\n","authors":["Ekkasit Pinyoanuntapong","Muhammad Usama Saleem","Pu Wang","Minwoo Lee","Srijan Das","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2403.19435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19428v1","updated":"2024-03-28T13:58:05Z","published":"2024-03-28T13:58:05Z","title":"Burst Super-Resolution with Diffusion Models for Improving Perceptual\n Quality","summary":" While burst LR images are useful for improving the SR image quality compared\nwith a single LR image, prior SR networks accepting the burst LR images are\ntrained in a deterministic manner, which is known to produce a blurry SR image.\nIn addition, it is difficult to perfectly align the burst LR images, making the\nSR image more blurry. Since such blurry images are perceptually degraded, we\naim to reconstruct the sharp high-fidelity boundaries. Such high-fidelity\nimages can be reconstructed by diffusion models. However, prior SR methods\nusing the diffusion model are not properly optimized for the burst SR task.\nSpecifically, the reverse process starting from a random sample is not\noptimized for image enhancement and restoration methods, including burst SR. In\nour proposed method, on the other hand, burst LR features are used to\nreconstruct the initial burst SR image that is fed into an intermediate step in\nthe diffusion model. This reverse process from the intermediate step 1) skips\ndiffusion steps for reconstructing the global structure of the image and 2)\nfocuses on steps for refining detailed textures. Our experimental results\ndemonstrate that our method can improve the scores of the perceptual quality\nmetrics. Code: https://github.com/placerkyo/BSRD\n","authors":["Kyotaro Tokoro","Kazutoshi Akita","Norimichi Ukita"],"pdf_url":"https://arxiv.org/pdf/2403.19428v1.pdf","comment":"Accepted to IJCNN 2024 (International Joint Conference on Neural\n Networks)"},{"id":"http://arxiv.org/abs/2403.19425v1","updated":"2024-03-28T13:56:26Z","published":"2024-03-28T13:56:26Z","title":"A Robust Ensemble Algorithm for Ischemic Stroke Lesion Segmentation:\n Generalizability and Clinical Utility Beyond the ISLES Challenge","summary":" Diffusion-weighted MRI (DWI) is essential for stroke diagnosis, treatment\ndecisions, and prognosis. However, image and disease variability hinder the\ndevelopment of generalizable AI algorithms with clinical value. We address this\ngap by presenting a novel ensemble algorithm derived from the 2022 Ischemic\nStroke Lesion Segmentation (ISLES) challenge. ISLES'22 provided 400 patient\nscans with ischemic stroke from various medical centers, facilitating the\ndevelopment of a wide range of cutting-edge segmentation algorithms by the\nresearch community. Through collaboration with leading teams, we combined\ntop-performing algorithms into an ensemble model that overcomes the limitations\nof individual solutions. Our ensemble model achieved superior ischemic lesion\ndetection and segmentation accuracy on our internal test set compared to\nindividual algorithms. This accuracy generalized well across diverse image and\ndisease variables. Furthermore, the model excelled in extracting clinical\nbiomarkers. Notably, in a Turing-like test, neuroradiologists consistently\npreferred the algorithm's segmentations over manual expert efforts,\nhighlighting increased comprehensiveness and precision. 
Validation using a\nreal-world external dataset (N=1686) confirmed the model's generalizability.\nThe algorithm's outputs also demonstrated strong correlations with clinical\nscores (admission NIHSS and 90-day mRS) on par with or exceeding expert-derived\nresults, underlining its clinical relevance. This study offers two key\nfindings. First, we present an ensemble algorithm\n(https://github.com/Tabrisrei/ISLES22_Ensemble) that detects and segments\nischemic stroke lesions on DWI across diverse scenarios on par with expert\n(neuro)radiologists. Second, we show the potential for biomedical challenge\noutputs to extend beyond the challenge's initial objectives, demonstrating\ntheir real-world clinical applicability.\n","authors":["Ezequiel de la Rosa","Mauricio Reyes","Sook-Lei Liew","Alexandre Hutton","Roland Wiest","Johannes Kaesmacher","Uta Hanning","Arsany Hakim","Richard Zubal","Waldo Valenzuela","David Robben","Diana M. Sima","Vincenzo Anania","Arne Brys","James A. Meakin","Anne Mickan","Gabriel Broocks","Christian Heitkamp","Shengbo Gao","Kongming Liang","Ziji Zhang","Md Mahfuzur Rahman Siddiquee","Andriy Myronenko","Pooya Ashtari","Sabine Van Huffel","Hyun-su Jeong","Chi-ho Yoon","Chulhong Kim","Jiayu Huo","Sebastien Ourselin","Rachel Sparks","Albert Clèrigues","Arnau Oliver","Xavier Lladó","Liam Chalcroft","Ioannis Pappas","Jeroen Bertels","Ewout Heylen","Juliette Moreau","Nima Hatami","Carole Frindel","Abdul Qayyum","Moona Mazher","Domenec Puig","Shao-Chieh Lin","Chun-Jung Juan","Tianxi Hu","Lyndon Boone","Maged Goubran","Yi-Jui Liu","Susanne Wegener","Florian Kofler","Ivan Ezhov","Suprosanna Shit","Moritz R. Hernandez Petzsche","Bjoern Menze","Jan S. Kirschke","Benedikt Wiestler"],"pdf_url":"https://arxiv.org/pdf/2403.19425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19507v3","updated":"2024-03-28T13:51:37Z","published":"2023-05-31T02:35:41Z","title":"Manifold Constraint Regularization for Remote Sensing Image Generation","summary":" Generative Adversarial Networks (GANs) have shown notable accomplishments in\nremote sensing domain. However, this paper reveals that their performance on\nremote sensing images falls short when compared to their impressive results\nwith natural images. This study identifies a previously overlooked issue: GANs\nexhibit a heightened susceptibility to overfitting on remote sensing images. To\naddress this challenge, this paper analyzes the characteristics of remote\nsensing images and proposes manifold constraint regularization, a novel\napproach that tackles overfitting of GANs on remote sensing images for the\nfirst time. Our method includes a new measure for evaluating the structure of\nthe data manifold. Leveraging this measure, we propose the manifold constraint\nregularization term, which not only alleviates the overfitting problem, but\nalso promotes alignment between the generated and real data manifolds, leading\nto enhanced quality in the generated images. The effectiveness and versatility\nof this method have been corroborated through extensive validation on various\nremote sensing datasets and GAN models. 
The proposed method not only enhances\nthe quality of the generated images, reflected in a 3.13\\% improvement in\nFrechet Inception Distance (FID) score, but also boosts the performance of the\nGANs on downstream tasks, evidenced by a 3.76\\% increase in classification\naccuracy.\n","authors":["Xingzhe Su","Changwen Zheng","Wenwen Qiang","Fengge Wu","Junsuo Zhao","Fuchun Sun","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2305.19507v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08471v2","updated":"2024-03-28T13:47:42Z","published":"2023-10-09T20:18:10Z","title":"WinSyn: A High Resolution Testbed for Synthetic Data","summary":" We present WinSyn, a unique dataset and testbed for creating high-quality\nsynthetic data with procedural modeling techniques. The dataset contains\nhigh-resolution photographs of windows, selected from locations around the\nworld, with 89,318 individual window crops showcasing diverse geometric and\nmaterial characteristics. We evaluate a procedural model by training semantic\nsegmentation networks on both synthetic and real images and then comparing\ntheir performances on a shared test set of real images. Specifically, we\nmeasure the difference in mean Intersection over Union (mIoU) and determine the\neffective number of real images to match synthetic data's training performance.\nWe design a baseline procedural model as a benchmark and provide 21,290\nsynthetically generated images. By tuning the procedural model, key factors are\nidentified which significantly influence the model's fidelity in replicating\nreal-world scenarios. Importantly, we highlight the challenge of procedural\nmodeling using current techniques, especially in their ability to replicate the\nspatial semantics of real-world scenarios. This insight is critical because of\nthe potential of procedural models to bridge to hidden scene aspects such as\ndepth, reflectivity, material properties, and lighting conditions.\n","authors":["Tom Kelly","John Femiani","Peter Wonka"],"pdf_url":"https://arxiv.org/pdf/2310.08471v2.pdf","comment":"cvpr version"},{"id":"http://arxiv.org/abs/2403.19417v1","updated":"2024-03-28T13:47:19Z","published":"2024-03-28T13:47:19Z","title":"OAKINK2: A Dataset of Bimanual Hands-Object Manipulation in Complex Task\n Completion","summary":" We present OAKINK2, a dataset of bimanual object manipulation tasks for\ncomplex daily activities. In pursuit of constructing the complex tasks into a\nstructured representation, OAKINK2 introduces three level of abstraction to\norganize the manipulation tasks: Affordance, Primitive Task, and Complex Task.\nOAKINK2 features on an object-centric perspective for decoding the complex\ntasks, treating them as a sequence of object affordance fulfillment. The first\nlevel, Affordance, outlines the functionalities that objects in the scene can\nafford, the second level, Primitive Task, describes the minimal interaction\nunits that humans interact with the object to achieve its affordance, and the\nthird level, Complex Task, illustrates how Primitive Tasks are composed and\ninterdependent. OAKINK2 dataset provides multi-view image streams and precise\npose annotations for the human body, hands and various interacting objects.\nThis extensive collection supports applications such as interaction\nreconstruction and motion synthesis. Based on the 3-level abstraction of\nOAKINK2, we explore a task-oriented framework for Complex Task Completion\n(CTC). CTC aims to generate a sequence of bimanual manipulation to achieve task\nobjectives. 
Within the CTC framework, we employ Large Language Models (LLMs) to\ndecompose the complex task objectives into sequences of Primitive Tasks and\nhave developed a Motion Fulfillment Model that generates bimanual hand motion\nfor each Primitive Task. OAKINK2 datasets and models are available at\nhttps://oakink.net/v2.\n","authors":["Xinyu Zhan","Lixin Yang","Yifei Zhao","Kangrui Mao","Hanlin Xu","Zenan Lin","Kailin Li","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2403.19417v1.pdf","comment":"To be appeared in CVPR 2024. 26 pages"},{"id":"http://arxiv.org/abs/2403.05369v4","updated":"2024-03-28T13:41:05Z","published":"2024-03-08T15:00:44Z","title":"Frequency-Adaptive Dilated Convolution for Semantic Segmentation","summary":" Dilated convolution, which expands the receptive field by inserting gaps\nbetween its consecutive elements, is widely employed in computer vision. In\nthis study, we propose three strategies to improve individual phases of dilated\nconvolution from the view of spectrum analysis. Departing from the conventional\npractice of fixing a global dilation rate as a hyperparameter, we introduce\nFrequency-Adaptive Dilated Convolution (FADC), which dynamically adjusts\ndilation rates spatially based on local frequency components. Subsequently, we\ndesign two plug-in modules to directly enhance effective bandwidth and\nreceptive field size. The Adaptive Kernel (AdaKern) module decomposes\nconvolution weights into low-frequency and high-frequency components,\ndynamically adjusting the ratio between these components on a per-channel\nbasis. By increasing the high-frequency part of convolution weights, AdaKern\ncaptures more high-frequency components, thereby improving effective bandwidth.\nThe Frequency Selection (FreqSelect) module optimally balances high- and\nlow-frequency components in feature representations through spatially variant\nreweighting. It suppresses high frequencies in the background to encourage FADC\nto learn a larger dilation, thereby increasing the receptive field for an\nexpanded scope. Extensive experiments on segmentation and object detection\nconsistently validate the efficacy of our approach. The code is publicly\navailable at \\url{https://github.com/Linwei-Chen/FADC}.\n","authors":["Linwei Chen","Lin Gu","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2403.05369v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19415v1","updated":"2024-03-28T13:39:55Z","published":"2024-03-28T13:39:55Z","title":"Brain-Shift: Unsupervised Pseudo-Healthy Brain Synthesis for Novel\n Biomarker Extraction in Chronic Subdural Hematoma","summary":" Chronic subdural hematoma (cSDH) is a common neurological condition\ncharacterized by the accumulation of blood between the brain and the dura\nmater. This accumulation of blood can exert pressure on the brain, potentially\nleading to fatal outcomes. Treatment options for cSDH are limited to invasive\nsurgery or non-invasive management. Traditionally, the midline shift,\nhand-measured by experts from an ideal sagittal plane, and the hematoma volume\nhave been the primary metrics for quantifying and analyzing cSDH. However,\nthese approaches do not quantify the local 3D brain deformation caused by cSDH.\nWe propose a novel method using anatomy-aware unsupervised diffeomorphic\npseudo-healthy synthesis to generate brain deformation fields. The deformation\nfields derived from this process are utilized to extract biomarkers that\nquantify the shift in the brain due to cSDH. 
We use CT scans of 121 patients\nfor training and validation of our method and find that our metrics allow the\nidentification of patients who require surgery. Our results indicate that\nautomatically obtained brain deformation fields might contain prognostic value\nfor personalized cSDH treatment. Our implementation is available on:\ngithub.com/Barisimre/brain-morphing\n","authors":["Baris Imre","Elina Thibeau-Sutre","Jorieke Reimer","Kuan Kho","Jelmer M. Wolterink"],"pdf_url":"https://arxiv.org/pdf/2403.19415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19412v1","updated":"2024-03-28T13:36:00Z","published":"2024-03-28T13:36:00Z","title":"A Simple and Effective Point-based Network for Event Camera 6-DOFs Pose\n Relocalization","summary":" Event cameras exhibit remarkable attributes such as high dynamic range,\nasynchronicity, and low latency, making them highly suitable for vision tasks\nthat involve high-speed motion in challenging lighting conditions. These\ncameras implicitly capture movement and depth information in events, making\nthem appealing sensors for Camera Pose Relocalization (CPR) tasks.\nNevertheless, existing CPR networks based on events neglect the pivotal\nfine-grained temporal information in events, resulting in unsatisfactory\nperformance. Moreover, the energy-efficient features are further compromised by\nthe use of excessively complex models, hindering efficient deployment on edge\ndevices. In this paper, we introduce PEPNet, a simple and effective point-based\nnetwork designed to regress six degrees of freedom (6-DOFs) event camera poses.\nWe rethink the relationship between the event camera and CPR tasks, leveraging\nthe raw Point Cloud directly as network input to harness the high-temporal\nresolution and inherent sparsity of events. PEPNet is adept at abstracting the\nspatial and implicit temporal features through hierarchical structure and\nexplicit temporal features by Attentive Bi-directional Long Short-Term Memory\n(A-Bi-LSTM). By employing a carefully crafted lightweight design, PEPNet\ndelivers state-of-the-art (SOTA) performance on both indoor and outdoor\ndatasets with meager computational resources. Specifically, PEPNet attains a\nsignificant 38% and 33% performance improvement on the random split IJRR and\nM3ED datasets, respectively. Moreover, the lightweight design version\nPEPNet$_{tiny}$ accomplishes results comparable to the SOTA while employing a\nmere 0.5% of the parameters.\n","authors":["Hongwei Ren","Jiadong Zhu","Yue Zhou","Haotian FU","Yulong Huang","Bojun Cheng"],"pdf_url":"https://arxiv.org/pdf/2403.19412v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19407v1","updated":"2024-03-28T13:32:49Z","published":"2024-03-28T13:32:49Z","title":"Towards Temporally Consistent Referring Video Object Segmentation","summary":" Referring Video Object Segmentation (R-VOS) methods face challenges in\nmaintaining consistent object segmentation due to temporal context variability\nand the presence of other visually similar objects. We propose an end-to-end\nR-VOS paradigm that explicitly models temporal instance consistency alongside\nthe referring segmentation. Specifically, we introduce a novel hybrid memory\nthat facilitates inter-frame collaboration for robust spatio-temporal matching\nand propagation. 
Features of frames with automatically generated high-quality\nreference masks are propagated to segment the remaining frames based on\nmulti-granularity association to achieve temporally consistent R-VOS.\nFurthermore, we propose a new Mask Consistency Score (MCS) metric to evaluate\nthe temporal consistency of video segmentation. Extensive experiments\ndemonstrate that our approach enhances temporal consistency by a significant\nmargin, leading to top-ranked performance on popular R-VOS benchmarks, i.e.,\nRef-YouTube-VOS (67.1%) and Ref-DAVIS17 (65.6%).\n","authors":["Bo Miao","Mohammed Bennamoun","Yongsheng Gao","Mubarak Shah","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2403.19407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18331v2","updated":"2024-03-28T13:27:33Z","published":"2023-11-30T08:02:49Z","title":"MRFP: Learning Generalizable Semantic Segmentation from Sim-2-Real with\n Multi-Resolution Feature Perturbation","summary":" Deep neural networks have shown exemplary performance on semantic scene\nunderstanding tasks on source domains, but due to the absence of style\ndiversity during training, enhancing performance on unseen target domains using\nonly single source domain data remains a challenging task. Generation of\nsimulated data is a feasible alternative to retrieving large style-diverse\nreal-world datasets as it is a cumbersome and budget-intensive process.\nHowever, the large domain-specfic inconsistencies between simulated and\nreal-world data pose a significant generalization challenge in semantic\nsegmentation. In this work, to alleviate this problem, we propose a novel\nMultiResolution Feature Perturbation (MRFP) technique to randomize\ndomain-specific fine-grained features and perturb style of coarse features. Our\nexperimental results on various urban-scene segmentation datasets clearly\nindicate that, along with the perturbation of style-information, perturbation\nof fine-feature components is paramount to learn domain invariant robust\nfeature maps for semantic segmentation models. MRFP is a simple and\ncomputationally efficient, transferable module with no additional learnable\nparameters or objective functions, that helps state-of-the-art deep neural\nnetworks to learn robust domain invariant features for simulation-to-real\nsemantic segmentation.\n","authors":["Sumanth Udupa","Prajwal Gurunath","Aniruddh Sikdar","Suresh Sundaram"],"pdf_url":"https://arxiv.org/pdf/2311.18331v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2311.11908v3","updated":"2024-03-28T13:16:50Z","published":"2023-11-20T16:40:29Z","title":"Continual Learning: Applications and the Road Forward","summary":" Continual learning is a subfield of machine learning, which aims to allow\nmachine learning models to continuously learn on new data, by accumulating\nknowledge without forgetting what was learned in the past. In this work, we\ntake a step back, and ask: \"Why should one care about continual learning in the\nfirst place?\". We set the stage by examining recent continual learning papers\npublished at four major machine learning conferences, and show that\nmemory-constrained settings dominate the field. Then, we discuss five open\nproblems in machine learning, and even though they might seem unrelated to\ncontinual learning at first sight, we show that continual learning will\ninevitably be part of their solution. These problems are model editing,\npersonalization and specialization, on-device learning, faster (re-)training\nand reinforcement learning. 
Finally, by comparing the desiderata from these\nunsolved problems and the current assumptions in continual learning, we\nhighlight and discuss four future directions for continual learning research.\nWe hope that this work offers an interesting perspective on the future of\ncontinual learning, while displaying its potential value and the paths we have\nto pursue in order to make it successful. This work is the result of the many\ndiscussions the authors had at the Dagstuhl seminar on Deep Continual Learning,\nin March 2023.\n","authors":["Eli Verwimp","Rahaf Aljundi","Shai Ben-David","Matthias Bethge","Andrea Cossu","Alexander Gepperth","Tyler L. Hayes","Eyke Hüllermeier","Christopher Kanan","Dhireesha Kudithipudi","Christoph H. Lampert","Martin Mundt","Razvan Pascanu","Adrian Popescu","Andreas S. Tolias","Joost van de Weijer","Bing Liu","Vincenzo Lomonaco","Tinne Tuytelaars","Gido M. van de Ven"],"pdf_url":"https://arxiv.org/pdf/2311.11908v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19386v1","updated":"2024-03-28T12:51:15Z","published":"2024-03-28T12:51:15Z","title":"PointCloud-Text Matching: Benchmark Datasets and a Baseline","summary":" In this paper, we present and study a new instance-level retrieval task:\nPointCloud-Text Matching~(PTM), which aims to find the exact cross-modal\ninstance that matches a given point-cloud query or text query. PTM could be\napplied to various scenarios, such as indoor/urban-canyon localization and\nscene retrieval. However, there exists no suitable and targeted dataset for PTM\nin practice. Therefore, we construct three new PTM benchmark datasets, namely\n3D2T-SR, 3D2T-NR, and 3D2T-QA. We observe that the data is challenging and with\nnoisy correspondence due to the sparsity, noise, or disorder of point clouds\nand the ambiguity, vagueness, or incompleteness of texts, which make existing\ncross-modal matching methods ineffective for PTM. To tackle these challenges,\nwe propose a PTM baseline, named Robust PointCloud-Text Matching method (RoMa).\nRoMa consists of two modules: a Dual Attention Perception module (DAP) and a\nRobust Negative Contrastive Learning module (RNCL). Specifically, DAP leverages\ntoken-level and feature-level attention to adaptively focus on useful local and\nglobal features, and aggregate them into common representations, thereby\nreducing the adverse impact of noise and ambiguity. To handle noisy\ncorrespondence, RNCL divides negative pairs, which are much less error-prone\nthan positive pairs, into clean and noisy subsets, and assigns them forward and\nreverse optimization directions respectively, thus enhancing robustness against\nnoisy correspondence. We conduct extensive experiments on our benchmarks and\ndemonstrate the superiority of our RoMa.\n","authors":["Yanglin Feng","Yang Qin","Dezhong Peng","Hongyuan Zhu","Xi Peng","Peng Hu"],"pdf_url":"https://arxiv.org/pdf/2403.19386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02051v2","updated":"2024-03-28T12:41:14Z","published":"2023-12-04T17:09:52Z","title":"TimeChat: A Time-sensitive Multimodal Large Language Model for Long\n Video Understanding","summary":" This work proposes TimeChat, a time-sensitive multimodal large language model\nspecifically designed for long video understanding. 
Our model incorporates two\nkey architectural contributions: (1) a timestamp-aware frame encoder that binds\nvisual content with the timestamp of each frame, and (2) a sliding video\nQ-Former that produces a video token sequence of varying lengths to accommodate\nvideos of various durations. Additionally, we construct an instruction-tuning\ndataset, encompassing 6 tasks and a total of 125K instances, to further enhance\nTimeChat's instruction-following performance. Experiment results across various\nvideo understanding tasks, such as dense captioning, temporal grounding, and\nhighlight detection, demonstrate TimeChat's strong zero-shot temporal\nlocalization and reasoning capabilities. For example, it achieves +9.2 F1 score\nand +2.8 CIDEr on YouCook2, +5.8 HIT@1 on QVHighlights, and +27.5 R@1 (IoU=0.5)\non Charades-STA, compared to state-of-the-art video large language models,\nholding the potential to serve as a versatile video assistant for long-form\nvideo comprehension tasks and satisfy realistic user requirements.\n","authors":["Shuhuai Ren","Linli Yao","Shicheng Li","Xu Sun","Lu Hou"],"pdf_url":"https://arxiv.org/pdf/2312.02051v2.pdf","comment":"CVPR 2024 camera-ready version, code is available at\n https://github.com/RenShuhuai-Andy/TimeChat"},{"id":"http://arxiv.org/abs/2403.19376v1","updated":"2024-03-28T12:38:21Z","published":"2024-03-28T12:38:21Z","title":"NIGHT -- Non-Line-of-Sight Imaging from Indirect Time of Flight Data","summary":" The acquisition of objects outside the Line-of-Sight of cameras is a very\nintriguing but also extremely challenging research topic. Recent works showed\nthe feasibility of this idea exploiting transient imaging data produced by\ncustom direct Time of Flight sensors. In this paper, for the first time, we\ntackle this problem using only data from an off-the-shelf indirect Time of\nFlight sensor without any further hardware requirement. We introduced a Deep\nLearning model able to re-frame the surfaces where light bounces happen as a\nvirtual mirror. This modeling makes the task easier to handle and also\nfacilitates the construction of annotated training data. From the obtained data\nit is possible to retrieve the depth information of the hidden scene. We also\nprovide a first-in-its-kind synthetic dataset for the task and demonstrate the\nfeasibility of the proposed idea over it.\n","authors":["Matteo Caligiuri","Adriano Simonetto","Gianluca Agresti","Pietro Zanuttigh"],"pdf_url":"https://arxiv.org/pdf/2403.19376v1.pdf","comment":"Submitted to ECCV 24, 17 pages, 6 figures, 2 tables"},{"id":"http://arxiv.org/abs/2403.19366v1","updated":"2024-03-28T12:28:58Z","published":"2024-03-28T12:28:58Z","title":"Infrared Small Target Detection with Scale and Location Sensitivity","summary":" Recently, infrared small target detection (IRSTD) has been dominated by\ndeep-learning-based methods. However, these methods mainly focus on the design\nof complex model structures to extract discriminative features, leaving the\nloss functions for IRSTD under-explored. For example, the widely used\nIntersection over Union (IoU) and Dice losses lack sensitivity to the scales\nand locations of targets, limiting the detection performance of detectors. In\nthis paper, we focus on boosting detection performance with a more effective\nloss but a simpler model structure. 
Specifically, we first propose a novel\nScale and Location Sensitive (SLS) loss to handle the limitations of existing\nlosses: 1) for scale sensitivity, we compute a weight for the IoU loss based on\ntarget scales to help the detector distinguish targets with different scales:\n2) for location sensitivity, we introduce a penalty term based on the center\npoints of targets to help the detector localize targets more precisely. Then,\nwe design a simple Multi-Scale Head to the plain U-Net (MSHNet). By applying\nSLS loss to each scale of the predictions, our MSHNet outperforms existing\nstate-of-the-art methods by a large margin. In addition, the detection\nperformance of existing detectors can be further improved when trained with our\nSLS loss, demonstrating the effectiveness and generalization of our SLS loss.\nThe code is available at https://github.com/ying-fu/MSHNet.\n","authors":["Qiankun Liu","Rui Liu","Bolun Zheng","Hongkui Wang","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2403.19366v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2304.03198v6","updated":"2024-03-28T12:07:44Z","published":"2023-04-06T16:21:56Z","title":"RFAConv: Innovating Spatial Attention and Standard Convolutional\n Operation","summary":" Spatial attention has been widely used to improve the performance of\nconvolutional neural networks. However, it has certain limitations. In this\npaper, we propose a new perspective on the effectiveness of spatial attention,\nwhich is that the spatial attention mechanism essentially solves the problem of\nconvolutional kernel parameter sharing. However, the information contained in\nthe attention map generated by spatial attention is not sufficient for\nlarge-size convolutional kernels. Therefore, we propose a novel attention\nmechanism called Receptive-Field Attention (RFA). Existing spatial attention,\nsuch as Convolutional Block Attention Module (CBAM) and Coordinated Attention\n(CA) focus only on spatial features, which does not fully address the problem\nof convolutional kernel parameter sharing. In contrast, RFA not only focuses on\nthe receptive-field spatial feature but also provides effective attention\nweights for large-size convolutional kernels. The Receptive-Field Attention\nconvolutional operation (RFAConv), developed by RFA, represents a new approach\nto replace the standard convolution operation. It offers nearly negligible\nincrement of computational cost and parameters, while significantly improving\nnetwork performance. We conducted a series of experiments on ImageNet-1k, COCO,\nand VOC datasets to demonstrate the superiority of our approach. Of particular\nimportance, we believe that it is time to shift focus from spatial features to\nreceptive-field spatial features for current spatial attention mechanisms. In\nthis way, we can further improve network performance and achieve even better\nresults. 
The code and pre-trained models for the relevant tasks can be found at\nhttps://github.com/Liuchen1997/RFAConv.\n","authors":["Xin Zhang","Chen Liu","Degang Yang","Tingting Song","Yichen Ye","Ke Li","Yingze Song"],"pdf_url":"https://arxiv.org/pdf/2304.03198v6.pdf","comment":"12 pages, 11figures"},{"id":"http://arxiv.org/abs/2306.16324v2","updated":"2024-03-28T12:05:23Z","published":"2023-06-28T15:58:53Z","title":"DoseDiff: Distance-aware Diffusion Model for Dose Prediction in\n Radiotherapy","summary":" Treatment planning, which is a critical component of the radiotherapy\nworkflow, is typically carried out by a medical physicist in a time-consuming\ntrial-and-error manner. Previous studies have proposed knowledge-based or\ndeep-learning-based methods for predicting dose distribution maps to assist\nmedical physicists in improving the efficiency of treatment planning. However,\nthese dose prediction methods usually fail to effectively utilize distance\ninformation between surrounding tissues and targets or organs-at-risk (OARs).\nMoreover, they are poor at maintaining the distribution characteristics of ray\npaths in the predicted dose distribution maps, resulting in a loss of valuable\ninformation. In this paper, we propose a distance-aware diffusion model\n(DoseDiff) for precise prediction of dose distribution. We define dose\nprediction as a sequence of denoising steps, wherein the predicted dose\ndistribution map is generated with the conditions of the computed tomography\n(CT) image and signed distance maps (SDMs). The SDMs are obtained by distance\ntransformation from the masks of targets or OARs, which provide the distance\nfrom each pixel in the image to the outline of the targets or OARs. We further\npropose a multi-encoder and multi-scale fusion network (MMFNet) that\nincorporates multi-scale and transformer-based fusion modules to enhance\ninformation fusion between the CT image and SDMs at the feature level. We\nevaluate our model on two in-house datasets and a public dataset, respectively.\nThe results demonstrate that our DoseDiff method outperforms state-of-the-art\ndose prediction methods in terms of both quantitative performance and visual\nquality.\n","authors":["Yiwen Zhang","Chuanpu Li","Liming Zhong","Zeli Chen","Wei Yang","Xuetao Wang"],"pdf_url":"https://arxiv.org/pdf/2306.16324v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19336v1","updated":"2024-03-28T11:52:42Z","published":"2024-03-28T11:52:42Z","title":"IVLMap: Instance-Aware Visual Language Grounding for Consumer Robot\n Navigation","summary":" Vision-and-Language Navigation (VLN) is a challenging task that requires a\nrobot to navigate in photo-realistic environments with human natural language\npromptings. Recent studies aim to handle this task by constructing the semantic\nspatial map representation of the environment, and then leveraging the strong\nability of reasoning in large language models for generalizing code for guiding\nthe robot navigation. However, these methods face limitations in instance-level\nand attribute-level navigation tasks as they cannot distinguish different\ninstances of the same object. To address this challenge, we propose a new\nmethod, namely, Instance-aware Visual Language Map (IVLMap), to empower the\nrobot with instance-level and attribute-level semantic mapping, where it is\nautonomously constructed by fusing the RGBD video data collected from the robot\nagent with special-designed natural language map indexing in the bird's-in-eye\nview. 
Such indexing is instance-level and attribute-level. In particular, when\nintegrated with a large language model, IVLMap demonstrates the capability to\ni) transform natural language into navigation targets with instance and\nattribute information, enabling precise localization, and ii) accomplish\nzero-shot end-to-end navigation tasks based on natural language commands.\nExtensive navigation experiments are conducted. Simulation results illustrate\nthat our method can achieve an average improvement of 14.4\\% in navigation\naccuracy. Code and demo are released at https://ivlmap.github.io/.\n","authors":["Jiacui Huang","Hongtao Zhang","Mingbo Zhao","Zhou Wu"],"pdf_url":"https://arxiv.org/pdf/2403.19336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19334v1","updated":"2024-03-28T11:50:23Z","published":"2024-03-28T11:50:23Z","title":"Test-Time Domain Generalization for Face Anti-Spoofing","summary":" Face Anti-Spoofing (FAS) is pivotal in safeguarding facial recognition\nsystems against presentation attacks. While domain generalization (DG) methods\nhave been developed to enhance FAS performance, they predominantly focus on\nlearning domain-invariant features during training, which may not guarantee\ngeneralizability to unseen data that differs largely from the source\ndistributions. Our insight is that testing data can serve as a valuable\nresource to enhance the generalizability beyond mere evaluation for DG FAS. In\nthis paper, we introduce a novel Test-Time Domain Generalization (TTDG)\nframework for FAS, which leverages the testing data to boost the model's\ngeneralizability. Our method, consisting of Test-Time Style Projection (TTSP)\nand Diverse Style Shifts Simulation (DSSS), effectively projects the unseen\ndata to the seen domain space. In particular, we first introduce the innovative\nTTSP to project the styles of the arbitrarily unseen samples of the testing\ndistribution to the known source space of the training distributions. We then\ndesign the efficient DSSS to synthesize diverse style shifts via learnable\nstyle bases with two specifically designed losses in a hyperspherical feature\nspace. Our method eliminates the need for model updates at the test time and\ncan be seamlessly integrated into not only the CNN but also ViT backbones.\nComprehensive experiments on widely used cross-domain FAS benchmarks\ndemonstrate our method's state-of-the-art performance and effectiveness.\n","authors":["Qianyu Zhou","Ke-Yue Zhang","Taiping Yao","Xuequan Lu","Shouhong Ding","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2403.19334v1.pdf","comment":"Accepted to IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR), 2024"},{"id":"http://arxiv.org/abs/2403.18339v2","updated":"2024-03-28T11:46:25Z","published":"2024-03-27T08:28:14Z","title":"H2ASeg: Hierarchical Adaptive Interaction and Weighting Network for\n Tumor Segmentation in PET/CT Images","summary":" Positron emission tomography (PET) combined with computed tomography (CT)\nimaging is routinely used in cancer diagnosis and prognosis by providing\ncomplementary information. Automatically segmenting tumors in PET/CT images can\nsignificantly improve examination efficiency. Traditional multi-modal\nsegmentation solutions mainly rely on concatenation operations for modality\nfusion, which fail to effectively model the non-linear dependencies between PET\nand CT modalities. 
Recent studies have investigated various approaches to\noptimize the fusion of modality-specific features for enhancing joint\nrepresentations. However, modality-specific encoders used in these methods\noperate independently, inadequately leveraging the synergistic relationships\ninherent in PET and CT modalities, for example, the complementarity between\nsemantics and structure. To address these issues, we propose a Hierarchical\nAdaptive Interaction and Weighting Network termed H2ASeg to explore the\nintrinsic cross-modal correlations and transfer potential complementary\ninformation. Specifically, we design a Modality-Cooperative Spatial Attention\n(MCSA) module that performs intra- and inter-modal interactions globally and\nlocally. Additionally, a Target-Aware Modality Weighting (TAMW) module is\ndeveloped to highlight tumor-related features within multi-modal features,\nthereby refining tumor segmentation. By embedding these modules across\ndifferent layers, H2ASeg can hierarchically model cross-modal correlations,\nenabling a nuanced understanding of both semantic and structural tumor\nfeatures. Extensive experiments demonstrate the superiority of H2ASeg,\noutperforming state-of-the-art methods on AutoPet-II and Hecktor2022\nbenchmarks. The code is released at https://github.com/JinPLu/H2ASeg.\n","authors":["Jinpeng Lu","Jingyun Chen","Linghan Cai","Songhan Jiang","Yongbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.18339v2.pdf","comment":"10 pages,4 figures"},{"id":"http://arxiv.org/abs/2311.15596v2","updated":"2024-03-28T11:35:55Z","published":"2023-11-27T07:44:25Z","title":"EgoThink: Evaluating First-Person Perspective Thinking Capability of\n Vision-Language Models","summary":" Vision-language models (VLMs) have recently shown promising results in\ntraditional downstream tasks. Evaluation studies have emerged to assess their\nabilities, with the majority focusing on the third-person perspective, and only\na few addressing specific tasks from the first-person perspective. However, the\ncapability of VLMs to \"think\" from a first-person perspective, a crucial\nattribute for advancing autonomous agents and robotics, remains largely\nunexplored. To bridge this research gap, we introduce EgoThink, a novel visual\nquestion-answering benchmark that encompasses six core capabilities with twelve\ndetailed dimensions. The benchmark is constructed using selected clips from\negocentric videos, with manually annotated question-answer pairs containing\nfirst-person information. To comprehensively assess VLMs, we evaluate eighteen\npopular VLMs on EgoThink. Moreover, given the open-ended format of the answers,\nwe use GPT-4 as the automatic judge to compute single-answer grading.\nExperimental results indicate that although GPT-4V leads in numerous\ndimensions, all evaluated VLMs still possess considerable potential for\nimprovement in first-person perspective tasks. Meanwhile, enlarging the number\nof trainable parameters has the most significant impact on model performance on\nEgoThink. 
In conclusion, EgoThink serves as a valuable addition to existing\nevaluation benchmarks for VLMs, providing an indispensable resource for future\nresearch in the realm of embodied artificial intelligence and robotics.\n","authors":["Sijie Cheng","Zhicheng Guo","Jingwen Wu","Kechen Fang","Peng Li","Huaping Liu","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15596v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19326v1","updated":"2024-03-28T11:33:02Z","published":"2024-03-28T11:33:02Z","title":"MedBN: Robust Test-Time Adaptation against Malicious Test Samples","summary":" Test-time adaptation (TTA) has emerged as a promising solution to address\nperformance decay due to unforeseen distribution shifts between training and\ntest data. While recent TTA methods excel in adapting to test data variations,\nsuch adaptability exposes a model to vulnerability against malicious examples,\nan aspect that has received limited attention. Previous studies have uncovered\nsecurity vulnerabilities within TTA even when a small proportion of the test\nbatch is maliciously manipulated. In response to the emerging threat, we\npropose median batch normalization (MedBN), leveraging the robustness of the\nmedian for statistics estimation within the batch normalization layer during\ntest-time inference. Our method is algorithm-agnostic, thus allowing seamless\nintegration with existing TTA frameworks. Our experimental results on benchmark\ndatasets, including CIFAR10-C, CIFAR100-C and ImageNet-C, consistently\ndemonstrate that MedBN outperforms existing approaches in maintaining robust\nperformance across different attack scenarios, encompassing both instant and\ncumulative attacks. Through extensive experiments, we show that our approach\nsustains the performance even in the absence of attacks, achieving a practical\nbalance between robustness and performance.\n","authors":["Hyejin Park","Jeongyeon Hwang","Sunung Mun","Sangdon Park","Jungseul Ok"],"pdf_url":"https://arxiv.org/pdf/2403.19326v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19322v1","updated":"2024-03-28T11:26:30Z","published":"2024-03-28T11:26:30Z","title":"Plug-and-Play Grounding of Reasoning in Multimodal Large Language Models","summary":" The surge of Multimodal Large Language Models (MLLMs), given their prominent\nemergent capabilities in instruction following and reasoning, has greatly\nadvanced the field of visual reasoning. However, constrained by their\nnon-lossless image tokenization, most MLLMs fall short of comprehensively\ncapturing details of text and objects, especially in high-resolution images. To\naddress this, we propose P2G, a novel framework for plug-and-play grounding of\nreasoning in MLLMs. Specifically, P2G exploits the tool-usage potential of\nMLLMs to employ expert agents to achieve on-the-fly grounding to critical\nvisual and textual objects of image, thus achieving deliberate reasoning via\nmultimodal prompting. We further create P2GB, a benchmark aimed at assessing\nMLLMs' ability to understand inter-object relationships and text in challenging\nhigh-resolution images. Comprehensive experiments on visual reasoning tasks\ndemonstrate the superiority of P2G. Noteworthy, P2G achieved comparable\nperformance with GPT-4V on P2GB, with a 7B backbone. 
Our work highlights the\npotential of plug-and-play grounding of reasoning and opens up a promising\nalternative beyond model scaling.\n","authors":["Jiaxing Chen","Yuxuan Liu","Dehu Li","Xiang An","Ziyong Feng","Yongle Zhao","Yin Xie"],"pdf_url":"https://arxiv.org/pdf/2403.19322v1.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2403.19319v1","updated":"2024-03-28T11:22:53Z","published":"2024-03-28T11:22:53Z","title":"Mesh2NeRF: Direct Mesh Supervision for Neural Radiance Field\n Representation and Generation","summary":" We present Mesh2NeRF, an approach to derive ground-truth radiance fields from\ntextured meshes for 3D generation tasks. Many 3D generative approaches\nrepresent 3D scenes as radiance fields for training. Their ground-truth\nradiance fields are usually fitted from multi-view renderings from a\nlarge-scale synthetic 3D dataset, which often results in artifacts due to\nocclusions or under-fitting issues. In Mesh2NeRF, we propose an analytic\nsolution to directly obtain ground-truth radiance fields from 3D meshes,\ncharacterizing the density field with an occupancy function featuring a defined\nsurface thickness, and determining view-dependent color through a reflection\nfunction considering both the mesh and environment lighting. Mesh2NeRF extracts\naccurate radiance fields which provides direct supervision for training\ngenerative NeRFs and single scene representation. We validate the effectiveness\nof Mesh2NeRF across various tasks, achieving a noteworthy 3.12dB improvement in\nPSNR for view synthesis in single scene representation on the ABO dataset, a\n0.69 PSNR enhancement in the single-view conditional generation of ShapeNet\nCars, and notably improved mesh extraction from NeRF in the unconditional\ngeneration of Objaverse Mugs.\n","authors":["Yujin Chen","Yinyu Nie","Benjamin Ummenhofer","Reiner Birkl","Michael Paulitsch","Matthias Müller","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2403.19319v1.pdf","comment":"Project page: https://terencecyj.github.io/projects/Mesh2NeRF/ Video:\n https://youtu.be/oufv1N3f7iY"},{"id":"http://arxiv.org/abs/2403.19316v1","updated":"2024-03-28T11:17:00Z","published":"2024-03-28T11:17:00Z","title":"Hypergraph-based Multi-View Action Recognition using Event Cameras","summary":" Action recognition from video data forms a cornerstone with wide-ranging\napplications. Single-view action recognition faces limitations due to its\nreliance on a single viewpoint. In contrast, multi-view approaches capture\ncomplementary information from various viewpoints for improved accuracy.\nRecently, event cameras have emerged as innovative bio-inspired sensors,\nleading to advancements in event-based action recognition. However, existing\nworks predominantly focus on single-view scenarios, leaving a gap in multi-view\nevent data exploitation, particularly in challenges like information deficit\nand semantic misalignment. To bridge this gap, we introduce HyperMV, a\nmulti-view event-based action recognition framework. HyperMV converts discrete\nevent data into frame-like representations and extracts view-related features\nusing a shared convolutional network. By treating segments as vertices and\nconstructing hyperedges using rule-based and KNN-based strategies, a multi-view\nhypergraph neural network that captures relationships across viewpoint and\ntemporal features is established. The vertex attention hypergraph propagation\nis also introduced for enhanced feature fusion. 
To prompt research in this\narea, we present the largest multi-view event-based action dataset\n$\\text{THU}^{\\text{MV-EACT}}\\text{-50}$, comprising 50 actions from 6\nviewpoints, which surpasses existing datasets by over tenfold. Experimental\nresults show that HyperMV significantly outperforms baselines in both\ncross-subject and cross-view scenarios, and also exceeds the state-of-the-arts\nin frame-based multi-view action recognition.\n","authors":["Yue Gao","Jiaxuan Lu","Siqi Li","Yipeng Li","Shaoyi Du"],"pdf_url":"https://arxiv.org/pdf/2403.19316v1.pdf","comment":"Accepted by IEEE Transactions on Pattern Analysis and Machine\n Intelligence (TPAMI 2024)"},{"id":"http://arxiv.org/abs/2403.19314v1","updated":"2024-03-28T11:12:33Z","published":"2024-03-28T11:12:33Z","title":"Total-Decom: Decomposed 3D Scene Reconstruction with Minimal Interaction","summary":" Scene reconstruction from multi-view images is a fundamental problem in\ncomputer vision and graphics. Recent neural implicit surface reconstruction\nmethods have achieved high-quality results; however, editing and manipulating\nthe 3D geometry of reconstructed scenes remains challenging due to the absence\nof naturally decomposed object entities and complex object/background\ncompositions. In this paper, we present Total-Decom, a novel method for\ndecomposed 3D reconstruction with minimal human interaction. Our approach\nseamlessly integrates the Segment Anything Model (SAM) with hybrid\nimplicit-explicit neural surface representations and a mesh-based\nregion-growing technique for accurate 3D object decomposition. Total-Decom\nrequires minimal human annotations while providing users with real-time control\nover the granularity and quality of decomposition. We extensively evaluate our\nmethod on benchmark datasets and demonstrate its potential for downstream\napplications, such as animation and scene editing. The code is available at\n\\href{https://github.com/CVMI-Lab/Total-Decom.git}{https://github.com/CVMI-Lab/Total-Decom.git}.\n","authors":["Xiaoyang Lyu","Chirui Chang","Peng Dai","Yang-tian Sun","Xiaojuang Qi"],"pdf_url":"https://arxiv.org/pdf/2403.19314v1.pdf","comment":"8 pages, 7 figures, accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2211.01579v3","updated":"2024-03-28T10:53:54Z","published":"2022-11-03T04:19:27Z","title":"Data-free Defense of Black Box Models Against Adversarial Attacks","summary":" Several companies often safeguard their trained deep models (i.e., details of\narchitecture, learnt weights, training details etc.) from third-party users by\nexposing them only as black boxes through APIs. Moreover, they may not even\nprovide access to the training data due to proprietary reasons or sensitivity\nconcerns. In this work, we propose a novel defense mechanism for black box\nmodels against adversarial attacks in a data-free set up. We construct\nsynthetic data via generative model and train surrogate network using model\nstealing techniques. To minimize adversarial contamination on perturbed\nsamples, we propose 'wavelet noise remover' (WNR) that performs discrete\nwavelet decomposition on input images and carefully select only a few important\ncoefficients determined by our 'wavelet coefficient selection module' (WCSM).\nTo recover the high-frequency content of the image after noise removal via WNR,\nwe further train a 'regenerator' network with an objective to retrieve the\ncoefficients such that the reconstructed image yields similar to original\npredictions on the surrogate model. 
At test time, WNR combined with trained\nregenerator network is prepended to the black box network, resulting in a high\nboost in adversarial accuracy. Our method improves the adversarial accuracy on\nCIFAR-10 by 38.98% and 32.01% on state-of-the-art Auto Attack compared to\nbaseline, even when the attacker uses surrogate architecture (Alexnet-half and\nAlexnet) similar to the black box architecture (Alexnet) with same model\nstealing strategy as defender. The code is available at\nhttps://github.com/vcl-iisc/data-free-black-box-defense\n","authors":["Gaurav Kumar Nayak","Inder Khatri","Ruchit Rawal","Anirban Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2211.01579v3.pdf","comment":"CVPR Workshop (Under Review)"},{"id":"http://arxiv.org/abs/2403.19306v1","updated":"2024-03-28T10:42:49Z","published":"2024-03-28T10:42:49Z","title":"Sparse Generation: Making Pseudo Labels Sparse for weakly supervision\n with points","summary":" In recent years, research on point weakly supervised object detection (PWSOD)\nmethods in the field of computer vision has attracted people's attention.\nHowever, existing pseudo labels generation methods perform poorly in a small\namount of supervised annotation data and dense object detection tasks. We\nconsider the generation of weakly supervised pseudo labels as the result of\nmodel's sparse output, and propose a method called Sparse Generation to make\npseudo labels sparse. It constructs dense tensors through the relationship\nbetween data and detector model, optimizes three of its parameters, and obtains\na sparse tensor via coordinated calculation, thereby indirectly obtaining\nhigher quality pseudo labels, and solving the model's density problem in the\nsituation of only a small amount of supervised annotation data can be used. On\ntwo broadly used open-source datasets (RSOD, SIMD) and a self-built dataset\n(Bullet-Hole), the experimental results showed that the proposed method has a\nsignificant advantage in terms of overall performance metrics, comparing to\nthat state-of-the-art method.\n","authors":["Tian Ma","Chuyang Shang","Wanzhu Ren","Yuancheng Li","Jiiayi Yang","Jiali Qian"],"pdf_url":"https://arxiv.org/pdf/2403.19306v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19294v1","updated":"2024-03-28T10:31:23Z","published":"2024-03-28T10:31:23Z","title":"FlowDepth: Decoupling Optical Flow for Self-Supervised Monocular Depth\n Estimation","summary":" Self-supervised multi-frame methods have currently achieved promising results\nin depth estimation. However, these methods often suffer from mismatch problems\ndue to the moving objects, which break the static assumption. Additionally,\nunfairness can occur when calculating photometric errors in high-freq or\nlow-texture regions of the images. To address these issues, existing approaches\nuse additional semantic priori black-box networks to separate moving objects\nand improve the model only at the loss level. Therefore, we propose FlowDepth,\nwhere a Dynamic Motion Flow Module (DMFM) decouples the optical flow by a\nmechanism-based approach and warps the dynamic regions thus solving the\nmismatch problem. For the unfairness of photometric errors caused by high-freq\nand low-texture regions, we use Depth-Cue-Aware Blur (DCABlur) and Cost-Volume\nsparsity loss respectively at the input and the loss level to solve the\nproblem. 
Experimental results on the KITTI and Cityscapes datasets show that\nour method outperforms the state-of-the-art methods.\n","authors":["Yiyang Sun","Zhiyuan Xu","Xiaonian Wang","Jing Yao"],"pdf_url":"https://arxiv.org/pdf/2403.19294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19278v1","updated":"2024-03-28T10:02:08Z","published":"2024-03-28T10:02:08Z","title":"CAT: Exploiting Inter-Class Dynamics for Domain Adaptive Object\n Detection","summary":" Domain adaptive object detection aims to adapt detection models to domains\nwhere annotated data is unavailable. Existing methods have been proposed to\naddress the domain gap using the semi-supervised student-teacher framework.\nHowever, a fundamental issue arises from the class imbalance in the labelled\ntraining set, which can result in inaccurate pseudo-labels. The relationship\nbetween classes, especially where one class is a majority and the other\nminority, has a large impact on class bias. We propose Class-Aware Teacher\n(CAT) to address the class bias issue in the domain adaptation setting. In our\nwork, we approximate the class relationships with our Inter-Class Relation\nmodule (ICRm) and exploit it to reduce the bias within the model. In this way,\nwe are able to apply augmentations to highly related classes, both inter- and\nintra-domain, to boost the performance of minority classes while having minimal\nimpact on majority classes. We further reduce the bias by implementing a\nclass-relation weight to our classification loss. Experiments conducted on\nvarious datasets and ablation studies show that our method is able to address\nthe class bias in the domain adaptation setting. On the Cityscapes to Foggy\nCityscapes dataset, we attained a 52.5 mAP, a substantial improvement over the\n51.2 mAP achieved by the state-of-the-art method.\n","authors":["Mikhail Kennerley","Jian-Gang Wang","Bharadwaj Veeravalli","Robby T. Tan"],"pdf_url":"https://arxiv.org/pdf/2403.19278v1.pdf","comment":"Accepted into CVPR 2024"},{"id":"http://arxiv.org/abs/2307.10924v2","updated":"2024-03-28T09:54:38Z","published":"2023-07-20T14:51:28Z","title":"Intrinsic Image Decomposition Using Point Cloud Representation","summary":" The purpose of intrinsic decomposition is to separate an image into its\nalbedo (reflective properties) and shading components (illumination\nproperties). This is challenging because it's an ill-posed problem.\nConventional approaches primarily concentrate on 2D imagery and fail to fully\nexploit the capabilities of 3D data representation. 3D point clouds offer a\nmore comprehensive format for representing scenes, as they combine geometric\nand color information effectively. To this end, in this paper, we introduce\nPoint Intrinsic Net (PoInt-Net), which leverages 3D point cloud data to\nconcurrently estimate albedo and shading maps. The merits of PoInt-Net include\nthe following aspects. First, the model is efficient, achieving consistent\nperformance across point clouds of any size with training only required on\nsmall-scale point clouds. Second, it exhibits remarkable robustness; even when\ntrained exclusively on datasets comprising individual objects, PoInt-Net\ndemonstrates strong generalization to unseen objects and scenes. Third, it\ndelivers superior accuracy over conventional 2D approaches, demonstrating\nenhanced performance across various metrics on different datasets. 
(Code\nReleased)\n","authors":["Xiaoyan Xing","Konrad Groh","Sezer Karaoglu","Theo Gevers"],"pdf_url":"https://arxiv.org/pdf/2307.10924v2.pdf","comment":"Code: https://github.com/xyxingx/PoInt-Net"},{"id":"http://arxiv.org/abs/2403.17633v2","updated":"2024-03-28T09:47:45Z","published":"2024-03-26T12:08:14Z","title":"UADA3D: Unsupervised Adversarial Domain Adaptation for 3D Object\n Detection with Sparse LiDAR and Large Domain Gaps","summary":" In this study, we address a gap in existing unsupervised domain adaptation\napproaches on LiDAR-based 3D object detection, which have predominantly\nconcentrated on adapting between established, high-density autonomous driving\ndatasets. We focus on sparser point clouds, capturing scenarios from different\nperspectives: not just from vehicles on the road but also from mobile robots on\nsidewalks, which encounter significantly different environmental conditions and\nsensor configurations. We introduce Unsupervised Adversarial Domain Adaptation\nfor 3D Object Detection (UADA3D). UADA3D does not depend on pre-trained source\nmodels or teacher-student architectures. Instead, it uses an adversarial\napproach to directly learn domain-invariant features. We demonstrate its\nefficacy in various adaptation scenarios, showing significant improvements in\nboth self-driving car and mobile robot domains. Our code is open-source and\nwill be available soon.\n","authors":["Maciej K Wozniak","Mattias Hansson","Marko Thiel","Patric Jensfelt"],"pdf_url":"https://arxiv.org/pdf/2403.17633v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19265v1","updated":"2024-03-28T09:44:20Z","published":"2024-03-28T09:44:20Z","title":"Neural Fields for 3D Tracking of Anatomy and Surgical Instruments in\n Monocular Laparoscopic Video Clips","summary":" Laparoscopic video tracking primarily focuses on two target types: surgical\ninstruments and anatomy. The former could be used for skill assessment, while\nthe latter is necessary for the projection of virtual overlays. Where\ninstrument and anatomy tracking have often been considered two separate\nproblems, in this paper, we propose a method for joint tracking of all\nstructures simultaneously. Based on a single 2D monocular video clip, we train\na neural field to represent a continuous spatiotemporal scene, used to create\n3D tracks of all surfaces visible in at least one frame. Due to the small size\nof instruments, they generally cover a small part of the image only, resulting\nin decreased tracking accuracy. Therefore, we propose enhanced class weighting\nto improve the instrument tracks. We evaluate tracking on video clips from\nlaparoscopic cholecystectomies, where we find mean tracking accuracies of 92.4%\nfor anatomical structures and 87.4% for instruments. Additionally, we assess\nthe quality of depth maps obtained from the method's scene reconstructions. We\nshow that these pseudo-depths have comparable quality to a state-of-the-art\npre-trained depth estimator. On laparoscopic videos in the SCARED dataset, the\nmethod predicts depth with an MAE of 2.9 mm and a relative error of 9.2%. These\nresults show the feasibility of using neural fields for monocular 3D\nreconstruction of laparoscopic scenes.\n","authors":["Beerend G. A. Gerats","Jelmer M. Wolterink","Seb P. Mol","Ivo A. M. J. 
Broeders"],"pdf_url":"https://arxiv.org/pdf/2403.19265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03795v3","updated":"2024-03-28T09:40:08Z","published":"2023-12-06T14:13:54Z","title":"AnimatableDreamer: Text-Guided Non-rigid 3D Model Generation and\n Reconstruction with Canonical Score Distillation","summary":" Advances in 3D generation have facilitated sequential 3D model generation\n(a.k.a 4D generation), yet its application for animatable objects with large\nmotion remains scarce. Our work proposes AnimatableDreamer, a text-to-4D\ngeneration framework capable of generating diverse categories of non-rigid\nobjects on skeletons extracted from a monocular video. At its core,\nAnimatableDreamer is equipped with our novel optimization design dubbed\nCanonical Score Distillation (CSD), which lifts 2D diffusion for temporal\nconsistent 4D generation. CSD, designed from a score gradient perspective,\ngenerates a canonical model with warp-robustness across different\narticulations. Notably, it also enhances the authenticity of bones and skinning\nby integrating inductive priors from a diffusion model. Furthermore, with\nmulti-view distillation, CSD infers invisible regions, thereby improving the\nfidelity of monocular non-rigid reconstruction. Extensive experiments\ndemonstrate the capability of our method in generating high-flexibility\ntext-guided 3D models from the monocular video, while also showing improved\nreconstruction performance over existing non-rigid reconstruction methods.\n","authors":["Xinzhou Wang","Yikai Wang","Junliang Ye","Zhengyi Wang","Fuchun Sun","Pengkun Liu","Ling Wang","Kai Sun","Xintong Wang","Bin He"],"pdf_url":"https://arxiv.org/pdf/2312.03795v3.pdf","comment":"Project page: https://animatabledreamer.github.io/"},{"id":"http://arxiv.org/abs/2311.15977v2","updated":"2024-03-28T09:31:05Z","published":"2023-11-27T16:23:01Z","title":"Text2Loc: 3D Point Cloud Localization from Natural Language","summary":" We tackle the problem of 3D point cloud localization based on a few natural\nlinguistic descriptions and introduce a novel neural network, Text2Loc, that\nfully interprets the semantic relationship between points and text. Text2Loc\nfollows a coarse-to-fine localization pipeline: text-submap global place\nrecognition, followed by fine localization. In global place recognition,\nrelational dynamics among each textual hint are captured in a hierarchical\ntransformer with max-pooling (HTM), whereas a balance between positive and\nnegative pairs is maintained using text-submap contrastive learning. Moreover,\nwe propose a novel matching-free fine localization method to further refine the\nlocation predictions, which completely removes the need for complicated\ntext-instance matching and is lighter, faster, and more accurate than previous\nmethods. Extensive experiments show that Text2Loc improves the localization\naccuracy by up to $2\\times$ over the state-of-the-art on the KITTI360Pose\ndataset. Our project page is publicly available at\n\\url{https://yan-xia.github.io/projects/text2loc/}.\n","authors":["Yan Xia","Letian Shi","Zifeng Ding","João F. Henriques","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2311.15977v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19254v1","updated":"2024-03-28T09:21:00Z","published":"2024-03-28T09:21:00Z","title":"Imperceptible Protection against Style Imitation from Diffusion Models","summary":" Recent progress in diffusion models has profoundly enhanced the fidelity of\nimage generation. 
However, this has raised concerns about copyright\ninfringements. While prior methods have introduced adversarial perturbations to\nprevent style imitation, most are accompanied by the degradation of artworks'\nvisual quality. Recognizing the importance of maintaining this, we develop a\nvisually improved protection method that preserves its protection capability.\nTo this end, we create a perceptual map to identify areas most sensitive to\nhuman eyes. We then adjust the protection intensity guided by an instance-aware\nrefinement. We also integrate a perceptual constraints bank to further improve\nthe imperceptibility. Results show that our method substantially elevates the\nquality of the protected image without compromising on protection efficacy.\n","authors":["Namhyuk Ahn","Wonhyuk Ahn","KiYoon Yoo","Daesik Kim","Seung-Hun Nam"],"pdf_url":"https://arxiv.org/pdf/2403.19254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08080v2","updated":"2024-03-28T09:20:33Z","published":"2023-10-12T07:10:12Z","title":"RT-SRTS: Angle-Agnostic Real-Time Simultaneous 3D Reconstruction and\n Tumor Segmentation from Single X-Ray Projection","summary":" Radiotherapy is one of the primary treatment methods for tumors, but the\norgan movement caused by respiration limits its accuracy. Recently, 3D imaging\nfrom a single X-ray projection has received extensive attention as a promising\napproach to address this issue. However, current methods can only reconstruct\n3D images without directly locating the tumor and are only validated for\nfixed-angle imaging, which fails to fully meet the requirements of motion\ncontrol in radiotherapy. In this study, a novel imaging method RT-SRTS is\nproposed which integrates 3D imaging and tumor segmentation into one network\nbased on multi-task learning (MTL) and achieves real-time simultaneous 3D\nreconstruction and tumor segmentation from a single X-ray projection at any\nangle. Furthermore, the attention enhanced calibrator (AEC) and\nuncertain-region elaboration (URE) modules have been proposed to aid feature\nextraction and improve segmentation accuracy. The proposed method was evaluated\non fifteen patient cases and compared with three state-of-the-art methods. It\nnot only delivers superior 3D reconstruction but also demonstrates commendable\ntumor segmentation results. Simultaneous reconstruction and segmentation can be\ncompleted in approximately 70 ms, significantly faster than the required time\nthreshold for real-time tumor tracking. The efficacies of both AEC and URE have\nalso been validated in ablation studies. The code of work is available at\nhttps://github.com/ZywooSimple/RT-SRTS.\n","authors":["Miao Zhu","Qiming Fu","Bo Liu","Mengxi Zhang","Bojian Li","Xiaoyan Luo","Fugen Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.08080v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11342v2","updated":"2024-03-28T09:20:19Z","published":"2023-04-22T07:48:17Z","title":"NaviNeRF: NeRF-based 3D Representation Disentanglement by Latent\n Semantic Navigation","summary":" 3D representation disentanglement aims to identify, decompose, and manipulate\nthe underlying explanatory factors of 3D data, which helps AI fundamentally\nunderstand our 3D world. This task is currently under-explored and poses great\nchallenges: (i) the 3D representations are complex and in general contains much\nmore information than 2D image; (ii) many 3D representations are not well\nsuited for gradient-based optimization, let alone disentanglement. 
To address\nthese challenges, we use NeRF as a differentiable 3D representation, and\nintroduce a self-supervised Navigation to identify interpretable semantic\ndirections in the latent space. To our best knowledge, this novel method,\ndubbed NaviNeRF, is the first work to achieve fine-grained 3D disentanglement\nwithout any priors or supervisions. Specifically, NaviNeRF is built upon the\ngenerative NeRF pipeline, and equipped with an Outer Navigation Branch and an\nInner Refinement Branch. They are complementary -- the outer navigation is to\nidentify global-view semantic directions, and the inner refinement dedicates to\nfine-grained attributes. A synergistic loss is further devised to coordinate\ntwo branches. Extensive experiments demonstrate that NaviNeRF has a superior\nfine-grained 3D disentanglement ability than the previous 3D-aware models. Its\nperformance is also comparable to editing-oriented models relying on semantic\nor geometry priors.\n","authors":["Baao Xie","Bohan Li","Zequn Zhang","Junting Dong","Xin Jin","Jingyu Yang","Wenjun Zeng"],"pdf_url":"https://arxiv.org/pdf/2304.11342v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19243v1","updated":"2024-03-28T08:58:20Z","published":"2024-03-28T08:58:20Z","title":"Sine Activated Low-Rank Matrices for Parameter Efficient Learning","summary":" Low-rank decomposition has emerged as a vital tool for enhancing parameter\nefficiency in neural network architectures, gaining traction across diverse\napplications in machine learning. These techniques significantly lower the\nnumber of parameters, striking a balance between compactness and performance.\nHowever, a common challenge has been the compromise between parameter\nefficiency and the accuracy of the model, where reduced parameters often lead\nto diminished accuracy compared to their full-rank counterparts. In this work,\nwe propose a novel theoretical framework that integrates a sinusoidal function\nwithin the low-rank decomposition process. This approach not only preserves the\nbenefits of the parameter efficiency characteristic of low-rank methods but\nalso increases the decomposition's rank, thereby enhancing model accuracy. Our\nmethod proves to be an adaptable enhancement for existing low-rank models, as\nevidenced by its successful application in Vision Transformers (ViT), Large\nLanguage Models (LLMs), Neural Radiance Fields (NeRF), and 3D shape modeling.\nThis demonstrates the wide-ranging potential and efficiency of our proposed\ntechnique.\n","authors":["Yiping Ji","Hemanth Saratchandran","Cameron Gordon","Zeyu Zhang","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2403.19243v1.pdf","comment":"The first two authors contributed equally"},{"id":"http://arxiv.org/abs/2403.19242v1","updated":"2024-03-28T08:54:40Z","published":"2024-03-28T08:54:40Z","title":"RTracker: Recoverable Tracking via PN Tree Structured Memory","summary":" Existing tracking methods mainly focus on learning better target\nrepresentation or developing more robust prediction models to improve tracking\nperformance. While tracking performance has significantly improved, the target\nloss issue occurs frequently due to tracking failures, complete occlusion, or\nout-of-view situations. However, considerably less attention is paid to the\nself-recovery issue of tracking methods, which is crucial for practical\napplications. 
To this end, we propose a recoverable tracking framework,\nRTracker, that uses a tree-structured memory to dynamically associate a tracker\nand a detector to enable self-recovery ability. Specifically, we propose a\nPositive-Negative Tree-structured memory to chronologically store and maintain\npositive and negative target samples. Upon the PN tree memory, we develop\ncorresponding walking rules for determining the state of the target and define\na set of control flows to unite the tracker and the detector in different\ntracking scenarios. Our core idea is to use the support samples of positive and\nnegative target categories to establish a relative distance-based criterion for\na reliable assessment of target loss. The favorable performance in comparison\nagainst the state-of-the-art methods on numerous challenging benchmarks\ndemonstrates the effectiveness of the proposed algorithm.\n","authors":["Yuqing Huang","Xin Li","Zikun Zhou","Yaowei Wang","Zhenyu He","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19242v1.pdf","comment":"accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19238v1","updated":"2024-03-28T08:49:35Z","published":"2024-03-28T08:49:35Z","title":"Taming Lookup Tables for Efficient Image Retouching","summary":" The widespread use of high-definition screens in edge devices, such as\nend-user cameras, smartphones, and televisions, is spurring a significant\ndemand for image enhancement. Existing enhancement models often optimize for\nhigh performance while falling short of reducing hardware inference time and\npower consumption, especially on edge devices with constrained computing and\nstorage resources. To this end, we propose Image Color Enhancement Lookup Table\n(ICELUT) that adopts LUTs for extremely efficient edge inference, without any\nconvolutional neural network (CNN). During training, we leverage pointwise\n(1x1) convolution to extract color information, alongside a split fully\nconnected layer to incorporate global information. Both components are then\nseamlessly converted into LUTs for hardware-agnostic deployment. ICELUT\nachieves near-state-of-the-art performance and remarkably low power\nconsumption. We observe that the pointwise network structure exhibits robust\nscalability, upkeeping the performance even with a heavily downsampled 32x32\ninput image. These enable ICELUT, the first-ever purely LUT-based image\nenhancer, to reach an unprecedented speed of 0.4ms on GPU and 7ms on CPU, at\nleast one order faster than any CNN solution. Codes are available at\nhttps://github.com/Stephen0808/ICELUT.\n","authors":["Sidi Yang","Binxiao Huang","Mingdeng Cao","Yatai Ji","Hanzhong Guo","Ngai Wong","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19654v2","updated":"2024-03-28T08:47:14Z","published":"2023-10-30T15:38:43Z","title":"MCAD: Multi-teacher Cross-modal Alignment Distillation for efficient\n image-text retrieval","summary":" Due to the success of large-scale visual-language pretraining (VLP) models\nand the widespread use of image-text retrieval in industry areas, it is now\ncritically necessary to reduce the model size and streamline their\nmobile-device deployment. Single- and dual-stream model structures are commonly\nused in image-text retrieval with the goal of closing the semantic gap between\ntextual and visual modalities. 
While single-stream models use deep feature\nfusion to achieve more accurate cross-model alignment, dual-stream models are\nbetter at offline indexing and fast inference.We propose a Multi-teacher\nCross-modality Alignment Distillation (MCAD) technique to integrate the\nadvantages of single- and dual-stream models. By incorporating the fused\nsingle-stream features into the image and text features of the dual-stream\nmodel, we formulate new modified teacher similarity distributions and features.\nThen, we conduct both distribution and feature distillation to boost the\ncapability of the student dual-stream model, achieving high retrieval\nperformance without increasing inference complexity.Extensive experiments\ndemonstrate the remarkable performance and high efficiency of MCAD on\nimage-text retrieval tasks. Furthermore, we implement a lightweight CLIP model\non Snapdragon/Dimensity chips with only $\\sim$100M running memory and\n$\\sim$8.0ms search latency, achieving the mobile-device application of VLP\nmodels.\n","authors":["Youbo Lei","Feifei He","Chen Chen","Yingbin Mo","Si Jia Li","Defeng Xie","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2310.19654v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19235v1","updated":"2024-03-28T08:47:02Z","published":"2024-03-28T08:47:02Z","title":"DreamSalon: A Staged Diffusion Framework for Preserving Identity-Context\n in Editable Face Generation","summary":" While large-scale pre-trained text-to-image models can synthesize diverse and\nhigh-quality human-centered images, novel challenges arise with a nuanced task\nof \"identity fine editing\": precisely modifying specific features of a subject\nwhile maintaining its inherent identity and context. Existing personalization\nmethods either require time-consuming optimization or learning additional\nencoders, adept in \"identity re-contextualization\". However, they often\nstruggle with detailed and sensitive tasks like human face editing. To address\nthese challenges, we introduce DreamSalon, a noise-guided, staged-editing\nframework, uniquely focusing on detailed image manipulations and\nidentity-context preservation. By discerning editing and boosting stages via\nthe frequency and gradient of predicted noises, DreamSalon first performs\ndetailed manipulations on specific features in the editing stage, guided by\nhigh-frequency information, and then employs stochastic denoising in the\nboosting stage to improve image quality. For more precise editing, DreamSalon\nsemantically mixes source and target textual prompts, guided by differences in\ntheir embedding covariances, to direct the model's focus on specific\nmanipulation areas. Our experiments demonstrate DreamSalon's ability to\nefficiently and faithfully edit fine details on human faces, outperforming\nexisting methods both qualitatively and quantitatively.\n","authors":["Haonan Lin","Mengmeng Wang","Yan Chen","Wenbin An","Yuzhe Yao","Guang Dai","Qianying Wang","Yong Liu","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2403.19235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19232v1","updated":"2024-03-28T08:44:36Z","published":"2024-03-28T08:44:36Z","title":"AZ-NAS: Assembling Zero-Cost Proxies for Network Architecture Search","summary":" Training-free network architecture search (NAS) aims to discover\nhigh-performing networks with zero-cost proxies, capturing network\ncharacteristics related to the final performance. 
However, network rankings\nestimated by previous training-free NAS methods have shown weak correlations\nwith the performance. To address this issue, we propose AZ-NAS, a novel\napproach that leverages the ensemble of various zero-cost proxies to enhance\nthe correlation between a predicted ranking of networks and the ground truth\nsubstantially in terms of the performance. To achieve this, we introduce four\nnovel zero-cost proxies that are complementary to each other, analyzing\ndistinct traits of architectures in the views of expressivity, progressivity,\ntrainability, and complexity. The proxy scores can be obtained simultaneously\nwithin a single forward and backward pass, making an overall NAS process highly\nefficient. In order to integrate the rankings predicted by our proxies\neffectively, we introduce a non-linear ranking aggregation method that\nhighlights the networks highly-ranked consistently across all the proxies.\nExperimental results conclusively demonstrate the efficacy and efficiency of\nAZ-NAS, outperforming state-of-the-art methods on standard benchmarks, all\nwhile maintaining a reasonable runtime cost.\n","authors":["Junghyup Lee","Bumsub Ham"],"pdf_url":"https://arxiv.org/pdf/2403.19232v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2401.11874v2","updated":"2024-03-28T08:40:08Z","published":"2024-01-22T12:00:37Z","title":"Detect-Order-Construct: A Tree Construction based Approach for\n Hierarchical Document Structure Analysis","summary":" Document structure analysis (aka document layout analysis) is crucial for\nunderstanding the physical layout and logical structure of documents, with\napplications in information retrieval, document summarization, knowledge\nextraction, etc. In this paper, we concentrate on Hierarchical Document\nStructure Analysis (HDSA) to explore hierarchical relationships within\nstructured documents created using authoring software employing hierarchical\nschemas, such as LaTeX, Microsoft Word, and HTML. To comprehensively analyze\nhierarchical document structures, we propose a tree construction based approach\nthat addresses multiple subtasks concurrently, including page object detection\n(Detect), reading order prediction of identified objects (Order), and the\nconstruction of intended hierarchical structure (Construct). We present an\neffective end-to-end solution based on this framework to demonstrate its\nperformance. To assess our approach, we develop a comprehensive benchmark\ncalled Comp-HRDoc, which evaluates the above subtasks simultaneously. Our\nend-to-end system achieves state-of-the-art performance on two large-scale\ndocument layout analysis datasets (PubLayNet and DocLayNet), a high-quality\nhierarchical document structure reconstruction dataset (HRDoc), and our\nComp-HRDoc benchmark. The Comp-HRDoc benchmark will be released to facilitate\nfurther research in this field.\n","authors":["Jiawei Wang","Kai Hu","Zhuoyao Zhong","Lei Sun","Qiang Huo"],"pdf_url":"https://arxiv.org/pdf/2401.11874v2.pdf","comment":"Submitted to Pattern Recognition"},{"id":"http://arxiv.org/abs/2403.19225v1","updated":"2024-03-28T08:39:44Z","published":"2024-03-28T08:39:44Z","title":"Efficient and Effective Weakly-Supervised Action Segmentation via\n Action-Transition-Aware Boundary Alignment","summary":" Weakly-supervised action segmentation is a task of learning to partition a\nlong video into several action segments, where training videos are only\naccompanied by transcripts (ordered list of actions). 
Most of existing methods\nneed to infer pseudo segmentation for training by serial alignment between all\nframes and the transcript, which is time-consuming and hard to be parallelized\nwhile training. In this work, we aim to escape from this inefficient alignment\nwith massive but redundant frames, and instead to directly localize a few\naction transitions for pseudo segmentation generation, where a transition\nrefers to the change from an action segment to its next adjacent one in the\ntranscript. As the true transitions are submerged in noisy boundaries due to\nintra-segment visual variation, we propose a novel Action-Transition-Aware\nBoundary Alignment (ATBA) framework to efficiently and effectively filter out\nnoisy boundaries and detect transitions. In addition, to boost the semantic\nlearning in the case that noise is inevitably present in the pseudo\nsegmentation, we also introduce video-level losses to utilize the trusted\nvideo-level supervision. Extensive experiments show the effectiveness of our\napproach on both performance and training speed.\n","authors":["Angchi Xu","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.19225v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19221v1","updated":"2024-03-28T08:35:46Z","published":"2024-03-28T08:35:46Z","title":"Towards Multimodal Video Paragraph Captioning Models Robust to Missing\n Modality","summary":" Video paragraph captioning (VPC) involves generating detailed narratives for\nlong videos, utilizing supportive modalities such as speech and event\nboundaries. However, the existing models are constrained by the assumption of\nconstant availability of a single auxiliary modality, which is impractical\ngiven the diversity and unpredictable nature of real-world scenarios. To this\nend, we propose a Missing-Resistant framework MR-VPC that effectively harnesses\nall available auxiliary inputs and maintains resilience even in the absence of\ncertain modalities. Under this framework, we propose the Multimodal VPC (MVPC)\narchitecture integrating video, speech, and event boundary inputs in a unified\nmanner to process various auxiliary inputs. Moreover, to fortify the model\nagainst incomplete data, we introduce DropAM, a data augmentation strategy that\nrandomly omits auxiliary inputs, paired with DistillAM, a regularization target\nthat distills knowledge from teacher models trained on modality-complete data,\nenabling efficient learning in modality-deficient environments. Through\nexhaustive experimentation on YouCook2 and ActivityNet Captions, MR-VPC has\nproven to deliver superior performance on modality-complete and\nmodality-missing test data. This work highlights the significance of developing\nresilient VPC models and paves the way for more adaptive, robust multimodal\nvideo understanding.\n","authors":["Sishuo Chen","Lei Li","Shuhuai Ren","Rundong Gao","Yuanxin Liu","Xiaohan Bi","Xu Sun","Lu Hou"],"pdf_url":"https://arxiv.org/pdf/2403.19221v1.pdf","comment":"Code available at https://github.com/lancopku/MR-VPC"},{"id":"http://arxiv.org/abs/2403.19220v1","updated":"2024-03-28T08:34:04Z","published":"2024-03-28T08:34:04Z","title":"GeoAuxNet: Towards Universal 3D Representation Learning for Multi-sensor\n Point Clouds","summary":" Point clouds captured by different sensors such as RGB-D cameras and LiDAR\npossess non-negligible domain gaps. Most existing methods design different\nnetwork architectures and train separately on point clouds from various\nsensors. 
Typically, point-based methods achieve outstanding performances on\neven-distributed dense point clouds from RGB-D cameras, while voxel-based\nmethods are more efficient for large-range sparse LiDAR point clouds. In this\npaper, we propose geometry-to-voxel auxiliary learning to enable voxel\nrepresentations to access point-level geometric information, which supports\nbetter generalisation of the voxel-based backbone with additional\ninterpretations of multi-sensor point clouds. Specifically, we construct\nhierarchical geometry pools generated by a voxel-guided dynamic point network,\nwhich efficiently provide auxiliary fine-grained geometric information adapted\nto different stages of voxel features. We conduct experiments on joint\nmulti-sensor datasets to demonstrate the effectiveness of GeoAuxNet. Enjoying\nelaborate geometric information, our method outperforms other models\ncollectively trained on multi-sensor datasets, and achieve competitive results\nwith the-state-of-art experts on each single dataset.\n","authors":["Shengjun Zhang","Xin Fei","Yueqi Duan"],"pdf_url":"https://arxiv.org/pdf/2403.19220v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2311.13120v3","updated":"2024-03-28T08:30:56Z","published":"2023-11-22T02:46:57Z","title":"Multi-modal In-Context Learning Makes an Ego-evolving Scene Text\n Recognizer","summary":" Scene text recognition (STR) in the wild frequently encounters challenges\nwhen coping with domain variations, font diversity, shape deformations, etc. A\nstraightforward solution is performing model fine-tuning tailored to a specific\nscenario, but it is computationally intensive and requires multiple model\ncopies for various scenarios. Recent studies indicate that large language\nmodels (LLMs) can learn from a few demonstration examples in a training-free\nmanner, termed \"In-Context Learning\" (ICL). Nevertheless, applying LLMs as a\ntext recognizer is unacceptably resource-consuming. Moreover, our pilot\nexperiments on LLMs show that ICL fails in STR, mainly attributed to the\ninsufficient incorporation of contextual information from diverse samples in\nthe training stage. To this end, we introduce E$^2$STR, a STR model trained\nwith context-rich scene text sequences, where the sequences are generated via\nour proposed in-context training strategy. E$^2$STR demonstrates that a\nregular-sized model is sufficient to achieve effective ICL capabilities in STR.\nExtensive experiments show that E$^2$STR exhibits remarkable training-free\nadaptation in various scenarios and outperforms even the fine-tuned\nstate-of-the-art approaches on public benchmarks. The code is released at\nhttps://github.com/bytedance/E2STR .\n","authors":["Zhen Zhao","Jingqun Tang","Chunhui Lin","Binghong Wu","Can Huang","Hao Liu","Xin Tan","Zhizhong Zhang","Yuan Xie"],"pdf_url":"https://arxiv.org/pdf/2311.13120v3.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2402.05608v3","updated":"2024-03-28T08:28:44Z","published":"2024-02-08T12:08:42Z","title":"Scalable Diffusion Models with State Space Backbone","summary":" This paper presents a new exploration into a category of diffusion models\nbuilt upon state space architecture. We endeavor to train diffusion models for\nimage data, wherein the traditional U-Net backbone is supplanted by a state\nspace backbone, functioning on raw patches or latent space. 
Given its notable\nefficacy in accommodating long-range dependencies, Diffusion State Space Models\n(DiS) are distinguished by treating all inputs including time, condition, and\nnoisy image patches as tokens. Our assessment of DiS encompasses both\nunconditional and class-conditional image generation scenarios, revealing that\nDiS exhibits comparable, if not superior, performance to CNN-based or\nTransformer-based U-Net architectures of commensurate size. Furthermore, we\nanalyze the scalability of DiS, gauged by the forward pass complexity\nquantified in Gflops. DiS models with higher Gflops, achieved through\naugmentation of depth/width or augmentation of input tokens, consistently\ndemonstrate lower FID. In addition to demonstrating commendable scalability\ncharacteristics, DiS-H/2 models in latent space achieve performance levels akin\nto prior diffusion models on class-conditional ImageNet benchmarks at the\nresolution of 256$\\times$256 and 512$\\times$512, while significantly reducing\nthe computational burden. The code and models are available at:\nhttps://github.com/feizc/DiS.\n","authors":["Zhengcong Fei","Mingyuan Fan","Changqian Yu","Junshi Huang"],"pdf_url":"https://arxiv.org/pdf/2402.05608v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00096v2","updated":"2024-03-28T08:25:27Z","published":"2023-11-30T13:32:43Z","title":"OST: Refining Text Knowledge with Optimal Spatio-Temporal Descriptor for\n General Video Recognition","summary":" Due to the resource-intensive nature of training vision-language models on\nexpansive video data, a majority of studies have centered on adapting\npre-trained image-language models to the video domain. Dominant pipelines\npropose to tackle the visual discrepancies with additional temporal learners\nwhile overlooking the substantial discrepancy for web-scaled descriptive\nnarratives and concise action category names, leading to less distinct semantic\nspace and potential performance limitations. In this work, we prioritize the\nrefinement of text knowledge to facilitate generalizable video recognition. To\naddress the limitations of the less distinct semantic space of category names,\nwe prompt a large language model (LLM) to augment action class names into\nSpatio-Temporal Descriptors thus bridging the textual discrepancy and serving\nas a knowledge base for general recognition. Moreover, to assign the best\ndescriptors with different video instances, we propose Optimal Descriptor\nSolver, forming the video recognition problem as solving the optimal matching\nflow across frame-level representations and descriptors. Comprehensive\nevaluations in zero-shot, few-shot, and fully supervised video recognition\nhighlight the effectiveness of our approach. Our best model achieves a\nstate-of-the-art zero-shot accuracy of 75.1% on Kinetics-600.\n","authors":["Tongjia Chen","Hongshan Yu","Zhengeng Yang","Zechuan Li","Wei Sun","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2312.00096v2.pdf","comment":"Technical report. Project Page: https://tomchen-ctj.github.io/OST/"},{"id":"http://arxiv.org/abs/2308.12532v6","updated":"2024-03-28T08:23:02Z","published":"2023-08-24T03:43:02Z","title":"FedSOL: Stabilized Orthogonal Learning with Proximal Restrictions in\n Federated Learning","summary":" Federated Learning (FL) aggregates locally trained models from individual\nclients to construct a global model. 
While FL enables learning a model with\ndata privacy, it often suffers from significant performance degradation when\nclients have heterogeneous data distributions. This data heterogeneity causes\nthe model to forget the global knowledge acquired from previously sampled\nclients after being trained on local datasets. Although the introduction of\nproximal objectives in local updates helps to preserve global knowledge, it can\nalso hinder local learning by interfering with local objectives. To address\nthis problem, we propose a novel method, Federated Stabilized Orthogonal\nLearning (FedSOL), which adopts an orthogonal learning strategy to balance the\ntwo conflicting objectives. FedSOL is designed to identify gradients of local\nobjectives that are inherently orthogonal to directions affecting the proximal\nobjective. Specifically, FedSOL targets parameter regions where learning on the\nlocal objective is minimally influenced by proximal weight perturbations. Our\nexperiments demonstrate that FedSOL consistently achieves state-of-the-art\nperformance across various scenarios.\n","authors":["Gihun Lee","Minchan Jeong","Sangmook Kim","Jaehoon Oh","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2308.12532v6.pdf","comment":"The IEEE/CVF Conference on Computer Vision and Pattern Recognition\n 2024 (CVPR 2024)"},{"id":"http://arxiv.org/abs/2403.19213v1","updated":"2024-03-28T08:21:56Z","published":"2024-03-28T08:21:56Z","title":"Learning Multiple Representations with Inconsistency-Guided Detail\n Regularization for Mask-Guided Matting","summary":" Mask-guided matting networks have achieved significant improvements and have\nshown great potential in practical applications in recent years. However,\nsimply learning matting representation from synthetic and\nlack-of-real-world-diversity matting data, these approaches tend to overfit\nlow-level details in wrong regions, lack generalization to objects with complex\nstructures and real-world scenes such as shadows, as well as suffer from\ninterference of background lines or textures. To address these challenges, in\nthis paper, we propose a novel auxiliary learning framework for mask-guided\nmatting models, incorporating three auxiliary tasks: semantic segmentation,\nedge detection, and background line detection besides matting, to learn\ndifferent and effective representations from different types of data and\nannotations. Our framework and model introduce the following key aspects: (1)\nto learn real-world adaptive semantic representation for objects with diverse\nand complex structures under real-world scenes, we introduce extra semantic\nsegmentation and edge detection tasks on more diverse real-world data with\nsegmentation annotations; (2) to avoid overfitting on low-level details, we\npropose a module to utilize the inconsistency between learned segmentation and\nmatting representations to regularize detail refinement; (3) we propose a novel\nbackground line detection task into our auxiliary learning framework, to\nsuppress interference of background lines or textures. In addition, we propose\na high-quality matting benchmark, Plant-Mat, to evaluate matting methods on\ncomplex structures. 
Extensively quantitative and qualitative results show that\nour approach outperforms state-of-the-art mask-guided methods.\n","authors":["Weihao Jiang","Zhaozhi Xie","Yuxiang Lu","Longjie Qi","Jingyong Cai","Hiroyuki Uchiyama","Bin Chen","Yue Ding","Hongtao Lu"],"pdf_url":"https://arxiv.org/pdf/2403.19213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03441v4","updated":"2024-03-28T08:09:07Z","published":"2023-12-06T11:50:14Z","title":"UFineBench: Towards Text-based Person Retrieval with Ultra-fine\n Granularity","summary":" Existing text-based person retrieval datasets often have relatively\ncoarse-grained text annotations. This hinders the model to comprehend the\nfine-grained semantics of query texts in real scenarios. To address this\nproblem, we contribute a new benchmark named \\textbf{UFineBench} for text-based\nperson retrieval with ultra-fine granularity.\n Firstly, we construct a new \\textbf{dataset} named UFine6926. We collect a\nlarge number of person images and manually annotate each image with two\ndetailed textual descriptions, averaging 80.8 words each. The average word\ncount is three to four times that of the previous datasets. In addition of\nstandard in-domain evaluation, we also propose a special \\textbf{evaluation\nparadigm} more representative of real scenarios. It contains a new evaluation\nset with cross domains, cross textual granularity and cross textual styles,\nnamed UFine3C, and a new evaluation metric for accurately measuring retrieval\nability, named mean Similarity Distribution (mSD). Moreover, we propose CFAM, a\nmore efficient \\textbf{algorithm} especially designed for text-based person\nretrieval with ultra fine-grained texts. It achieves fine granularity mining by\nadopting a shared cross-modal granularity decoder and hard negative match\nmechanism.\n With standard in-domain evaluation, CFAM establishes competitive performance\nacross various datasets, especially on our ultra fine-grained UFine6926.\nFurthermore, by evaluating on UFine3C, we demonstrate that training on our\nUFine6926 significantly improves generalization to real scenarios compared with\nother coarse-grained datasets. The dataset and code will be made publicly\navailable at \\url{https://github.com/Zplusdragon/UFineBench}.\n","authors":["Jialong Zuo","Hanyu Zhou","Ying Nie","Feng Zhang","Tianyu Guo","Nong Sang","Yunhe Wang","Changxin Gao"],"pdf_url":"https://arxiv.org/pdf/2312.03441v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19205v1","updated":"2024-03-28T08:06:48Z","published":"2024-03-28T08:06:48Z","title":"From Activation to Initialization: Scaling Insights for Optimizing\n Neural Fields","summary":" In the realm of computer vision, Neural Fields have gained prominence as a\ncontemporary tool harnessing neural networks for signal representation. Despite\nthe remarkable progress in adapting these networks to solve a variety of\nproblems, the field still lacks a comprehensive theoretical framework. This\narticle aims to address this gap by delving into the intricate interplay\nbetween initialization and activation, providing a foundational basis for the\nrobust optimization of Neural Fields. 
Our theoretical insights reveal a\ndeep-seated connection among network initialization, architectural choices, and\nthe optimization process, emphasizing the need for a holistic approach when\ndesigning cutting-edge Neural Fields.\n","authors":["Hemanth Saratchandran","Sameera Ramasinghe","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2403.19205v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11956v3","updated":"2024-03-28T08:04:51Z","published":"2024-03-18T16:52:49Z","title":"Subjective-Aligned Dataset and Metric for Text-to-Video Quality\n Assessment","summary":" With the rapid development of generative models, Artificial\nIntelligence-Generated Contents (AIGC) have exponentially increased in daily\nlives. Among them, Text-to-Video (T2V) generation has received widespread\nattention. Though many T2V models have been released for generating high\nperceptual quality videos, there is still lack of a method to evaluate the\nquality of these videos quantitatively. To solve this issue, we establish the\nlargest-scale Text-to-Video Quality Assessment DataBase (T2VQA-DB) to date. The\ndataset is composed of 10,000 videos generated by 9 different T2V models. We\nalso conduct a subjective study to obtain each video's corresponding mean\nopinion score. Based on T2VQA-DB, we propose a novel transformer-based model\nfor subjective-aligned Text-to-Video Quality Assessment (T2VQA). The model\nextracts features from text-video alignment and video fidelity perspectives,\nthen it leverages the ability of a large language model to give the prediction\nscore. Experimental results show that T2VQA outperforms existing T2V metrics\nand SOTA video quality assessment models. Quantitative analysis indicates that\nT2VQA is capable of giving subjective-align predictions, validating its\neffectiveness. The dataset and code will be released at\nhttps://github.com/QMME/T2VQA.\n","authors":["Tengchuan Kou","Xiaohong Liu","Zicheng Zhang","Chunyi Li","Haoning Wu","Xiongkuo Min","Guangtao Zhai","Ning Liu"],"pdf_url":"https://arxiv.org/pdf/2403.11956v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18807v2","updated":"2024-03-28T08:01:34Z","published":"2024-03-27T17:53:30Z","title":"ECoDepth: Effective Conditioning of Diffusion Models for Monocular Depth\n Estimation","summary":" In the absence of parallax cues, a learning-based single image depth\nestimation (SIDE) model relies heavily on shading and contextual cues in the\nimage. While this simplicity is attractive, it is necessary to train such\nmodels on large and varied datasets, which are difficult to capture. It has\nbeen shown that using embeddings from pre-trained foundational models, such as\nCLIP, improves zero shot transfer in several applications. Taking inspiration\nfrom this, in our paper we explore the use of global image priors generated\nfrom a pre-trained ViT model to provide more detailed contextual information.\nWe argue that the embedding vector from a ViT model, pre-trained on a large\ndataset, captures greater relevant information for SIDE than the usual route of\ngenerating pseudo image captions, followed by CLIP based text embeddings. Based\non this idea, we propose a new SIDE model using a diffusion backbone which is\nconditioned on ViT embeddings. Our proposed design establishes a new\nstate-of-the-art (SOTA) for SIDE on NYUv2 dataset, achieving Abs Rel error of\n0.059(14% improvement) compared to 0.069 by the current SOTA (VPD). 
And on\nKITTI dataset, achieving Sq Rel error of 0.139 (2% improvement) compared to\n0.142 by the current SOTA (GEDepth). For zero-shot transfer with a model\ntrained on NYUv2, we report mean relative improvement of (20%, 23%, 81%, 25%)\nover NeWCRFs on (Sun-RGBD, iBims1, DIODE, HyperSim) datasets, compared to (16%,\n18%, 45%, 9%) by ZoeDepth. The code is available at\nhttps://ecodepth-iitd.github.io\n","authors":["Suraj Patni","Aradhye Agarwal","Chetan Arora"],"pdf_url":"https://arxiv.org/pdf/2403.18807v2.pdf","comment":"IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)\n 2024"},{"id":"http://arxiv.org/abs/2403.19203v1","updated":"2024-03-28T08:00:14Z","published":"2024-03-28T08:00:14Z","title":"Single-Shared Network with Prior-Inspired Loss for Parameter-Efficient\n Multi-Modal Imaging Skin Lesion Classification","summary":" In this study, we introduce a multi-modal approach that efficiently\nintegrates multi-scale clinical and dermoscopy features within a single\nnetwork, thereby substantially reducing model parameters. The proposed method\nincludes three novel fusion schemes.\n Firstly, unlike current methods that usually employ two individual models for\nfor clinical and dermoscopy modalities, we verified that multimodal feature can\nbe learned by sharing the parameters of encoder while leaving the individual\nmodal-specific classifiers.\n Secondly, the shared cross-attention module can replace the individual one to\nefficiently interact between two modalities at multiple layers.\n Thirdly, different from current methods that equally optimize dermoscopy and\nclinical branches, inspired by prior knowledge that dermoscopy images play a\nmore significant role than clinical images, we propose a novel biased loss.\nThis loss guides the single-shared network to prioritize dermoscopy information\nover clinical information, implicitly learning a better joint feature\nrepresentation for the modal-specific task.\n Extensive experiments on a well-recognized Seven-Point Checklist (SPC)\ndataset and a collected dataset demonstrate the effectiveness of our method on\nboth CNN and Transformer structures. Furthermore, our method exhibits\nsuperiority in both accuracy and model parameters compared to currently\nadvanced methods.\n","authors":["Peng Tang","Tobias Lasser"],"pdf_url":"https://arxiv.org/pdf/2403.19203v1.pdf","comment":"This paper have submitted to Journal for review"},{"id":"http://arxiv.org/abs/2403.14760v2","updated":"2024-03-28T07:46:49Z","published":"2024-03-21T18:02:20Z","title":"Can 3D Vision-Language Models Truly Understand Natural Language?","summary":" Rapid advancements in 3D vision-language (3D-VL) tasks have opened up new\navenues for human interaction with embodied agents or robots using natural\nlanguage. Despite this progress, we find a notable limitation: existing 3D-VL\nmodels exhibit sensitivity to the styles of language input, struggling to\nunderstand sentences with the same semantic meaning but written in different\nvariants. This observation raises a critical question: Can 3D vision-language\nmodels truly understand natural language? 
To test the language\nunderstandability of 3D-VL models, we first propose a language robustness task\nfor systematically assessing 3D-VL models across various tasks, benchmarking\ntheir performance when presented with different language style variants.\nImportantly, these variants are commonly encountered in applications requiring\ndirect interaction with humans, such as embodied robotics, given the diversity\nand unpredictability of human language. We propose a 3D Language Robustness\nDataset, designed based on the characteristics of human language, to facilitate\nthe systematic study of robustness. Our comprehensive evaluation uncovers a\nsignificant drop in the performance of all existing models across various 3D-VL\ntasks. Even the state-of-the-art 3D-LLM fails to understand some variants of\nthe same sentences. Further in-depth analysis suggests that the existing models\nhave a fragile and biased fusion module, which stems from the low diversity of\nthe existing dataset. Finally, we propose a training-free module driven by LLM,\nwhich improves language robustness. Datasets and code will be available at\ngithub.\n","authors":["Weipeng Deng","Runyu Ding","Jihan Yang","Jiahui Liu","Yijiang Li","Xiaojuan Qi","Edith Ngai"],"pdf_url":"https://arxiv.org/pdf/2403.14760v2.pdf","comment":"https://github.com/VincentDENGP/3D-LR"},{"id":"http://arxiv.org/abs/2403.19193v1","updated":"2024-03-28T07:43:49Z","published":"2024-03-28T07:43:49Z","title":"Text Data-Centric Image Captioning with Interactive Prompts","summary":" Supervised image captioning approaches have made great progress, but it is\nchallenging to collect high-quality human-annotated image-text data. Recently,\nlarge-scale vision and language models (e.g., CLIP) and large-scale generative\nlanguage models (e.g., GPT-2) have shown strong performances in various tasks,\nwhich also provide some new solutions for image captioning with web paired\ndata, unpaired data or even text-only data. Among them, the mainstream solution\nis to project image embeddings into the text embedding space with the\nassistance of consistent representations between image-text pairs from the CLIP\nmodel. However, the current methods still face several challenges in adapting\nto the diversity of data configurations in a unified solution, accurately\nestimating image-text embedding bias, and correcting unsatisfactory prediction\nresults in the inference stage. This paper proposes a new Text data-centric\napproach with Interactive Prompts for image Captioning, named TIPCap. 1) We\nconsider four different settings which gradually reduce the dependence on\npaired data. 2) We construct a mapping module driven by multivariate Gaussian\ndistribution to mitigate the modality gap, which is applicable to the above\nfour different settings. 3) We propose a prompt interaction module that can\nincorporate optional prompt information before generating captions. 
Extensive\nexperiments show that our TIPCap outperforms other weakly or unsupervised image\ncaptioning methods and achieves a new state-of-the-art performance on two\nwidely used datasets, i.e., MS-COCO and Flickr30K.\n","authors":["Yiyu Wang","Hao Luo","Jungang Xu","Yingfei Sun","Fan Wang"],"pdf_url":"https://arxiv.org/pdf/2403.19193v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15955v2","updated":"2024-03-28T07:30:25Z","published":"2024-03-23T23:22:54Z","title":"Finding needles in a haystack: A Black-Box Approach to Invisible\n Watermark Detection","summary":" In this paper, we propose WaterMark Detection (WMD), the first invisible\nwatermark detection method under a black-box and annotation-free setting. WMD\nis capable of detecting arbitrary watermarks within a given reference dataset\nusing a clean non-watermarked dataset as a reference, without relying on\nspecific decoding methods or prior knowledge of the watermarking techniques. We\ndevelop WMD using foundations of offset learning, where a clean non-watermarked\ndataset enables us to isolate the influence of only watermarked samples in the\nreference dataset. Our comprehensive evaluations demonstrate the effectiveness\nof WMD, significantly outperforming naive detection methods, which only yield\nAUC scores around 0.5. In contrast, WMD consistently achieves impressive\ndetection AUC scores, surpassing 0.9 in most single-watermark datasets and\nexceeding 0.7 in more challenging multi-watermark scenarios across diverse\ndatasets and watermarking methods. As invisible watermarks become increasingly\nprevalent, while specific decoding techniques remain undisclosed, our approach\nprovides a versatile solution and establishes a path toward increasing\naccountability, transparency, and trust in our digital visual content.\n","authors":["Minzhou Pan","Zhengting Wang","Xin Dong","Vikash Sehwag","Lingjuan Lyu","Xue Lin"],"pdf_url":"https://arxiv.org/pdf/2403.15955v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09911v3","updated":"2024-03-28T07:16:11Z","published":"2023-08-19T05:34:13Z","title":"Noisy-Correspondence Learning for Text-to-Image Person Re-identification","summary":" Text-to-image person re-identification (TIReID) is a compelling topic in the\ncross-modal community, which aims to retrieve the target person based on a\ntextual query. Although numerous TIReID methods have been proposed and achieved\npromising performance, they implicitly assume the training image-text pairs are\ncorrectly aligned, which is not always the case in real-world scenarios. In\npractice, the image-text pairs inevitably exist under-correlated or even\nfalse-correlated, a.k.a noisy correspondence (NC), due to the low quality of\nthe images and annotation errors. To address this problem, we propose a novel\nRobust Dual Embedding method (RDE) that can learn robust visual-semantic\nassociations even with NC. Specifically, RDE consists of two main components:\n1) A Confident Consensus Division (CCD) module that leverages the dual-grained\ndecisions of dual embedding modules to obtain a consensus set of clean training\ndata, which enables the model to learn correct and reliable visual-semantic\nassociations. 2) A Triplet Alignment Loss (TAL) relaxes the conventional\nTriplet Ranking loss with the hardest negative samples to a log-exponential\nupper bound over all negative ones, thus preventing the model collapse under NC\nand can also focus on hard-negative samples for promising performance. 
We\nconduct extensive experiments on three public benchmarks, namely CUHK-PEDES,\nICFG-PEDES, and RSTPReID, to evaluate the performance and robustness of our\nRDE. Our method achieves state-of-the-art results both with and without\nsynthetic noisy correspondences on all three datasets. Code is available at\nhttps://github.com/QinYang79/RDE.\n","authors":["Yang Qin","Yingke Chen","Dezhong Peng","Xi Peng","Joey Tianyi Zhou","Peng Hu"],"pdf_url":"https://arxiv.org/pdf/2308.09911v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13102v2","updated":"2024-03-28T07:13:53Z","published":"2023-12-20T15:20:25Z","title":"SpecNeRF: Gaussian Directional Encoding for Specular Reflections","summary":" Neural radiance fields have achieved remarkable performance in modeling the\nappearance of 3D scenes. However, existing approaches still struggle with the\nview-dependent appearance of glossy surfaces, especially under complex lighting\nof indoor environments. Unlike existing methods, which typically assume distant\nlighting like an environment map, we propose a learnable Gaussian directional\nencoding to better model the view-dependent effects under near-field lighting\nconditions. Importantly, our new directional encoding captures the\nspatially-varying nature of near-field lighting and emulates the behavior of\nprefiltered environment maps. As a result, it enables the efficient evaluation\nof preconvolved specular color at any 3D location with varying roughness\ncoefficients. We further introduce a data-driven geometry prior that helps\nalleviate the shape radiance ambiguity in reflection modeling. We show that our\nGaussian directional encoding and geometry prior significantly improve the\nmodeling of challenging specular reflections in neural radiance fields, which\nhelps decompose appearance into more physically meaningful components.\n","authors":["Li Ma","Vasu Agrawal","Haithem Turki","Changil Kim","Chen Gao","Pedro Sander","Michael Zollhöfer","Christian Richardt"],"pdf_url":"https://arxiv.org/pdf/2312.13102v2.pdf","comment":"Accepted to CVPR2024, Project page:\n https://limacv.github.io/SpecNeRF_web/"},{"id":"http://arxiv.org/abs/2403.19177v1","updated":"2024-03-28T07:01:11Z","published":"2024-03-28T07:01:11Z","title":"Rethinking Information Loss in Medical Image Segmentation with\n Various-sized Targets","summary":" Medical image segmentation presents the challenge of segmenting various-size\ntargets, demanding the model to effectively capture both local and global\ninformation. Despite recent efforts using CNNs and ViTs to predict annotations\nof different scales, these approaches often struggle to effectively balance the\ndetection of targets across varying sizes. Simply utilizing local information\nfrom CNNs and global relationships from ViTs without considering potential\nsignificant divergence in latent feature distributions may result in\nsubstantial information loss. To address this issue, in this paper, we will\nintroduce a novel Stagger Network (SNet) and argues that a well-designed fusion\nstructure can mitigate the divergence in latent feature distributions between\nCNNs and ViTs, thereby reducing information loss. Specifically, to emphasize\nboth global dependencies and local focus, we design a Parallel Module to bridge\nthe semantic gap. Meanwhile, we propose the Stagger Module, trying to fuse the\nselected features that are more semantically similar. An Information Recovery\nModule is further adopted to recover complementary information back to the\nnetwork. 
As a key contribution, we theoretically analyze that the proposed\nparallel and stagger strategies would lead to less information loss, thus\ncertifying the SNet's rationale. Experimental results clearly proved that the\nproposed SNet excels comparisons with recent SOTAs in segmenting on the Synapse\ndataset where targets are in various sizes. Besides, it also demonstrates\nsuperiority on the ACDC and the MoNuSeg datasets where targets are with more\nconsistent dimensions.\n","authors":["Tianyi Liu","Zhaorui Tan","Kaizhu Huang","Haochuan Jiang"],"pdf_url":"https://arxiv.org/pdf/2403.19177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16169v3","updated":"2024-03-28T06:56:45Z","published":"2024-03-24T14:24:13Z","title":"Gaze-guided Hand-Object Interaction Synthesis: Benchmark and Method","summary":" Gaze plays a crucial role in revealing human attention and intention,\nshedding light on the cognitive processes behind human actions. The integration\nof gaze guidance with the dynamics of hand-object interactions boosts the\naccuracy of human motion prediction. However, the lack of datasets that capture\nthe intricate relationship and consistency among gaze, hand, and object\nmovements remains a substantial hurdle. In this paper, we introduce the first\nGaze-guided Hand-Object Interaction dataset, GazeHOI, and present a novel task\nfor synthesizing gaze-guided hand-object interactions. Our dataset, GazeHOI,\nfeatures simultaneous 3D modeling of gaze, hand, and object interactions,\ncomprising 479 sequences with an average duration of 19.1 seconds, 812\nsub-sequences, and 33 objects of various sizes. We propose a hierarchical\nframework centered on a gaze-guided hand-object interaction diffusion model,\nnamed GHO-Diffusion. In the pre-diffusion phase, we separate gaze conditions\ninto spatial-temporal features and goal pose conditions at different levels of\ninformation granularity. During the diffusion phase, two gaze-conditioned\ndiffusion models are stacked to simplify the complex synthesis of hand-object\nmotions. Here, the object motion diffusion model generates sequences of object\nmotions based on gaze conditions, while the hand motion diffusion model\nproduces hand motions based on the generated object motion. To improve\nfine-grained goal pose alignment, we introduce a Spherical Gaussian constraint\nto guide the denoising step. In the subsequent post-diffusion phase, we\noptimize the generated hand motions using contact consistency. Our extensive\nexperiments highlight the uniqueness of our dataset and the effectiveness of\nour approach.\n","authors":["Jie Tian","Lingxiao Yang","Ran Ji","Yuexin Ma","Lan Xu","Jingyi Yu","Ye Shi","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16169v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19174v1","updated":"2024-03-28T06:46:45Z","published":"2024-03-28T06:46:45Z","title":"Algorithmic Ways of Seeing: Using Object Detection to Facilitate Art\n Exploration","summary":" This Research through Design paper explores how object detection may be\napplied to a large digital art museum collection to facilitate new ways of\nencountering and experiencing art. We present the design and evaluation of an\ninteractive application called SMKExplore, which allows users to explore a\nmuseum's digital collection of paintings by browsing through objects detected\nin the images, as a novel form of open-ended exploration. We provide three\ncontributions. 
First, we show how an object detection pipeline can be\nintegrated into a design process for visual exploration. Second, we present the\ndesign and development of an app that enables exploration of an art museum's\ncollection. Third, we offer reflections on future possibilities for museums and\nHCI researchers to incorporate object detection techniques into the\ndigitalization of museums.\n","authors":["Louie Søs Meyer","Johanne Engel Aaen","Anitamalina Regitse Tranberg","Peter Kun","Matthias Freiberger","Sebastian Risi","Anders Sundnes Løvlie"],"pdf_url":"https://arxiv.org/pdf/2403.19174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00365v2","updated":"2024-03-28T06:38:55Z","published":"2023-12-31T01:39:38Z","title":"HQ-VAE: Hierarchical Discrete Representation Learning with Variational\n Bayes","summary":" Vector quantization (VQ) is a technique to deterministically learn features\nwith discrete codebook representations. It is commonly performed with a\nvariational autoencoding model, VQ-VAE, which can be further extended to\nhierarchical structures for making high-fidelity reconstructions. However, such\nhierarchical extensions of VQ-VAE often suffer from the codebook/layer collapse\nissue, where the codebook is not efficiently used to express the data, and\nhence degrades reconstruction accuracy. To mitigate this problem, we propose a\nnovel unified framework to stochastically learn hierarchical discrete\nrepresentation on the basis of the variational Bayes framework, called\nhierarchically quantized variational autoencoder (HQ-VAE). HQ-VAE naturally\ngeneralizes the hierarchical variants of VQ-VAE, such as VQ-VAE-2 and\nresidual-quantized VAE (RQ-VAE), and provides them with a Bayesian training\nscheme. Our comprehensive experiments on image datasets show that HQ-VAE\nenhances codebook usage and improves reconstruction performance. We also\nvalidated HQ-VAE in terms of its applicability to a different modality with an\naudio dataset.\n","authors":["Yuhta Takida","Yukara Ikemiya","Takashi Shibuya","Kazuki Shimada","Woosung Choi","Chieh-Hsin Lai","Naoki Murata","Toshimitsu Uesaka","Kengo Uchida","Wei-Hsiang Liao","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2401.00365v2.pdf","comment":"34 pages with 17 figures, accepted for TMLR"},{"id":"http://arxiv.org/abs/2403.19164v1","updated":"2024-03-28T06:22:45Z","published":"2024-03-28T06:22:45Z","title":"RecDiffusion: Rectangling for Image Stitching with Diffusion Models","summary":" Image stitching from different captures often results in non-rectangular\nboundaries, which is often considered unappealing. To solve non-rectangular\nboundaries, current solutions involve cropping, which discards image content,\ninpainting, which can introduce unrelated content, or warping, which can\ndistort non-linear features and introduce artifacts. To overcome these issues,\nwe introduce a novel diffusion-based learning framework, \\textbf{RecDiffusion},\nfor image stitching rectangling. This framework combines Motion Diffusion\nModels (MDM) to generate motion fields, effectively transitioning from the\nstitched image's irregular borders to a geometrically corrected intermediary.\nFollowed by Content Diffusion Models (CDM) for image detail refinement.\nNotably, our sampling process utilizes a weighted map to identify regions\nneeding correction during each iteration of CDM. 
Our RecDiffusion ensures\ngeometric accuracy and overall visual appeal, surpassing all previous methods\nin both quantitative and qualitative measures when evaluated on public\nbenchmarks. Code is released at https://github.com/lhaippp/RecDiffusion.\n","authors":["Tianhao Zhou","Haipeng Li","Ziyi Wang","Ao Luo","Chen-Lin Zhang","Jiajun Li","Bing Zeng","Shuaicheng Liu"],"pdf_url":"https://arxiv.org/pdf/2403.19164v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10522v5","updated":"2024-03-28T06:20:10Z","published":"2023-11-17T13:43:43Z","title":"Enhancing Object Coherence in Layout-to-Image Synthesis","summary":" Layout-to-image synthesis is an emerging technique in conditional image\ngeneration. It aims to generate complex scenes, where users require fine\ncontrol over the layout of the objects in a scene. However, it remains\nchallenging to control the object coherence, including semantic coherence\n(e.g., the cat looks at the flowers or not) and physical coherence (e.g., the\nhand and the racket should not be misaligned). In this paper, we propose a\nnovel diffusion model with effective global semantic fusion (GSF) and\nself-similarity feature enhancement modules to guide the object coherence for\nthis task. For semantic coherence, we argue that the image caption contains\nrich information for defining the semantic relationship within the objects in\nthe images. Instead of simply employing cross-attention between captions and\ngenerated images, which addresses the highly relevant layout restriction and\nsemantic coherence separately and thus leads to unsatisfying results shown in\nour experiments, we develop GSF to fuse the supervision from the layout\nrestriction and semantic coherence requirement and exploit it to guide the\nimage synthesis process. Moreover, to improve the physical coherence, we\ndevelop a Self-similarity Coherence Attention (SCA) module to explicitly\nintegrate local contextual physical coherence into each pixel's generation\nprocess. Specifically, we adopt a self-similarity map to encode the coherence\nrestrictions and employ it to extract coherent features from text embedding.\nThrough visualization of our self-similarity map, we explore the essence of\nSCA, revealing that its effectiveness is not only in capturing reliable\nphysical coherence patterns but also in enhancing complex texture generation.\nExtensive experiments demonstrate the superiority of our proposed method in\nboth image generation quality and controllability.\n","authors":["Yibin Wang","Weizhong Zhang","Jianwei Zheng","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2311.10522v5.pdf","comment":"GitHub: https://github.com/CodeGoat24/EOCNet"},{"id":"http://arxiv.org/abs/2403.19163v1","updated":"2024-03-28T06:18:12Z","published":"2024-03-28T06:18:12Z","title":"D'OH: Decoder-Only random Hypernetworks for Implicit Neural\n Representations","summary":" Deep implicit functions have been found to be an effective tool for\nefficiently encoding all manner of natural signals. Their attractiveness stems\nfrom their ability to compactly represent signals with little to no off-line\ntraining data. Instead, they leverage the implicit bias of deep networks to\ndecouple hidden redundancies within the signal. In this paper, we explore the\nhypothesis that additional compression can be achieved by leveraging the\nredundancies that exist between layers. We propose to use a novel run-time\ndecoder-only hypernetwork - that uses no offline training data - to better\nmodel this cross-layer parameter redundancy. 
Previous applications of\nhyper-networks with deep implicit functions have applied feed-forward\nencoder/decoder frameworks that rely on large offline datasets that do not\ngeneralize beyond the signals they were trained on. We instead present a\nstrategy for the initialization of run-time deep implicit functions for\nsingle-instance signals through a Decoder-Only randomly projected Hypernetwork\n(D'OH). By directly changing the dimension of a latent code to approximate a\ntarget implicit neural architecture, we provide a natural way to vary the\nmemory footprint of neural representations without the costly need for neural\narchitecture search on a space of alternative low-rate structures.\n","authors":["Cameron Gordon","Lachlan Ewen MacDonald","Hemanth Saratchandran","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2403.19163v1.pdf","comment":"29 pages, 17 figures"},{"id":"http://arxiv.org/abs/2403.19160v1","updated":"2024-03-28T06:05:14Z","published":"2024-03-28T06:05:14Z","title":"Within the Dynamic Context: Inertia-aware 3D Human Modeling with Pose\n Sequence","summary":" Neural rendering techniques have significantly advanced 3D human body\nmodeling. However, previous approaches often overlook dynamics induced by\nfactors such as motion inertia, leading to challenges in scenarios like abrupt\nstops after rotation, where the pose remains static while the appearance\nchanges. This limitation arises from reliance on a single pose as conditional\ninput, resulting in ambiguity in mapping one pose to multiple appearances. In\nthis study, we elucidate that variations in human appearance depend not only on\nthe current frame's pose condition but also on past pose states. Therefore, we\nintroduce Dyco, a novel method utilizing the delta pose sequence representation\nfor non-rigid deformations and canonical space to effectively model temporal\nappearance variations. To prevent a decrease in the model's generalization\nability to novel poses, we further propose low-dimensional global context to\nreduce unnecessary inter-body part dependencies and a quantization operation to\nmitigate overfitting of the delta pose sequence by the model. To validate the\neffectiveness of our approach, we collected a novel dataset named I3D-Human,\nwith a focus on capturing temporal changes in clothing appearance under\napproximate poses. Through extensive experiments on both I3D-Human and existing\ndatasets, our approach demonstrates superior qualitative and quantitative\nperformance. In addition, our inertia-aware 3D human method can unprecedentedly\nsimulate appearance changes caused by inertia at different velocities.\n","authors":["Yutong Chen","Yifan Zhan","Zhihang Zhong","Wei Wang","Xiao Sun","Yu Qiao","Yinqiang Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.19160v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16473v3","updated":"2024-03-28T05:47:24Z","published":"2023-11-26T02:35:09Z","title":"GS-IR: 3D Gaussian Splatting for Inverse Rendering","summary":" We propose GS-IR, a novel inverse rendering approach based on 3D Gaussian\nSplatting (GS) that leverages forward mapping volume rendering to achieve\nphotorealistic novel view synthesis and relighting results. 
Unlike previous\nworks that use implicit neural representations and volume rendering (e.g.\nNeRF), which suffer from low expressive power and high computational\ncomplexity, we extend GS, a top-performance representation for novel view\nsynthesis, to estimate scene geometry, surface material, and environment\nillumination from multi-view images captured under unknown lighting conditions.\nThere are two main problems when introducing GS to inverse rendering: 1) GS\ndoes not support producing plausible normal natively; 2) forward mapping (e.g.\nrasterization and splatting) cannot trace the occlusion like backward mapping\n(e.g. ray tracing). To address these challenges, our GS-IR proposes an\nefficient optimization scheme that incorporates a depth-derivation-based\nregularization for normal estimation and a baking-based occlusion to model\nindirect lighting. The flexible and expressive GS representation allows us to\nachieve fast and compact geometry reconstruction, photorealistic novel view\nsynthesis, and effective physically-based rendering. We demonstrate the\nsuperiority of our method over baseline methods through qualitative and\nquantitative evaluations on various challenging scenes.\n","authors":["Zhihao Liang","Qi Zhang","Ying Feng","Ying Shan","Kui Jia"],"pdf_url":"https://arxiv.org/pdf/2311.16473v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19158v1","updated":"2024-03-28T05:44:48Z","published":"2024-03-28T05:44:48Z","title":"Uncertainty-Aware Deep Video Compression with Ensembles","summary":" Deep learning-based video compression is a challenging task, and many\nprevious state-of-the-art learning-based video codecs use optical flows to\nexploit the temporal correlation between successive frames and then compress\nthe residual error. Although these two-stage models are end-to-end optimized,\nthe epistemic uncertainty in the motion estimation and the aleatoric\nuncertainty from the quantization operation lead to errors in the intermediate\nrepresentations and introduce artifacts in the reconstructed frames. This\ninherent flaw limits the potential for higher bit rate savings. To address this\nissue, we propose an uncertainty-aware video compression model that can\neffectively capture the predictive uncertainty with deep ensembles.\nAdditionally, we introduce an ensemble-aware loss to encourage the diversity\namong ensemble members and investigate the benefits of incorporating\nadversarial training in the video compression task. Experimental results on\n1080p sequences show that our model can effectively save bits by more than 20%\ncompared to DVC Pro.\n","authors":["Wufei Ma","Jiahao Li","Bin Li","Yan Lu"],"pdf_url":"https://arxiv.org/pdf/2403.19158v1.pdf","comment":"Published on IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2306.02240v2","updated":"2024-03-28T05:35:46Z","published":"2023-06-04T02:55:25Z","title":"ProTeCt: Prompt Tuning for Taxonomic Open Set Classification","summary":" Visual-language foundation models, like CLIP, learn generalized\nrepresentations that enable zero-shot open-set classification. Few-shot\nadaptation methods, based on prompt tuning, have been shown to further improve\nperformance on downstream datasets. 
However, these methods do not fare well in\nthe taxonomic open set (TOS) setting, where the classifier is asked to make\npredictions from label sets across different levels of semantic granularity.\nFrequently, they infer incorrect labels at coarser taxonomic class levels, even\nwhen the inference at the leaf level (original class labels) is correct. To\naddress this problem, we propose a prompt tuning technique that calibrates the\nhierarchical consistency of model predictions. A set of metrics of hierarchical\nconsistency, the Hierarchical Consistent Accuracy (HCA) and the Mean Treecut\nAccuracy (MTA), are first proposed to evaluate TOS model performance. A new\nPrompt Tuning for Hierarchical Consistency (ProTeCt) technique is then proposed\nto calibrate classification across label set granularities. Results show that\nProTeCt can be combined with existing prompt tuning methods to significantly\nimprove TOS classification without degrading the leaf level classification\nperformance.\n","authors":["Tz-Ying Wu","Chih-Hui Ho","Nuno Vasconcelos"],"pdf_url":"https://arxiv.org/pdf/2306.02240v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.14302v2","updated":"2024-03-28T05:13:43Z","published":"2024-03-21T11:16:42Z","title":"SpikingResformer: Bridging ResNet and Vision Transformer in Spiking\n Neural Networks","summary":" The remarkable success of Vision Transformers in Artificial Neural Networks\n(ANNs) has led to a growing interest in incorporating the self-attention\nmechanism and transformer-based architecture into Spiking Neural Networks\n(SNNs). While existing methods propose spiking self-attention mechanisms that\nare compatible with SNNs, they lack reasonable scaling methods, and the overall\narchitectures proposed by these methods suffer from a bottleneck in effectively\nextracting local features. To address these challenges, we propose a novel\nspiking self-attention mechanism named Dual Spike Self-Attention (DSSA) with a\nreasonable scaling method. Based on DSSA, we propose a novel spiking Vision\nTransformer architecture called SpikingResformer, which combines the\nResNet-based multi-stage architecture with our proposed DSSA to improve both\nperformance and energy efficiency while reducing parameters. Experimental\nresults show that SpikingResformer achieves higher accuracy with fewer\nparameters and lower energy consumption than other spiking Vision Transformer\ncounterparts. Notably, our SpikingResformer-L achieves 79.40% top-1 accuracy on\nImageNet with 4 time-steps, which is the state-of-the-art result in the SNN\nfield.\n","authors":["Xinyu Shi","Zecheng Hao","Zhaofei Yu"],"pdf_url":"https://arxiv.org/pdf/2403.14302v2.pdf","comment":"To be published in the 2024 IEEE/CVF Conference on Computer Vision\n and Pattern Recognition (CVPR)"},{"id":"http://arxiv.org/abs/2403.19150v1","updated":"2024-03-28T05:08:25Z","published":"2024-03-28T05:08:25Z","title":"Towards Understanding Dual BN In Hybrid Adversarial Training","summary":" There is a growing concern about applying batch normalization (BN) in\nadversarial training (AT), especially when the model is trained on both\nadversarial samples and clean samples (termed Hybrid-AT). With the assumption\nthat adversarial and clean samples are from two different domains, a common\npractice in prior works is to adopt Dual BN, where two separate BNs are used for the\nadversarial and clean branches, respectively. 
A popular belief for motivating\nDual BN is that estimating normalization statistics of this mixture\ndistribution is challenging and thus disentangling it for normalization\nachieves stronger robustness. In contrast to this belief, we reveal that\ndisentangling statistics plays a lesser role than disentangling affine parameters\nin model training. This finding aligns with prior work (Rebuffi et al., 2023),\nand we build upon their research for further investigations. We demonstrate\nthat the domain gap between adversarial and clean samples is not very large,\nwhich is counter-intuitive considering the significant influence of adversarial\nperturbation on the model accuracy. We further propose a two-task hypothesis\nwhich serves as the empirical foundation and a unified framework for Hybrid-AT\nimprovement. We also investigate Dual BN in test-time and reveal that affine\nparameters characterize the robustness during inference. Overall, our work\nsheds new light on understanding the mechanism of Dual BN in Hybrid-AT and its\nunderlying justification.\n","authors":["Chenshuang Zhang","Chaoning Zhang","Kang Zhang","Axi Niu","Junmo Kim","In So Kweon"],"pdf_url":"https://arxiv.org/pdf/2403.19150v1.pdf","comment":"Accepted at TMLR"},{"id":"http://arxiv.org/abs/2403.19144v1","updated":"2024-03-28T04:35:42Z","published":"2024-03-28T04:35:42Z","title":"MoDiTalker: Motion-Disentangled Diffusion Model for High-Fidelity\n Talking Head Generation","summary":" Conventional GAN-based models for talking head generation often suffer from\nlimited quality and unstable training. Recent approaches based on diffusion\nmodels aimed to address these limitations and improve fidelity. However, they\nstill face challenges, including extensive sampling times and difficulties in\nmaintaining temporal consistency due to the high stochasticity of diffusion\nmodels. To overcome these challenges, we propose a novel motion-disentangled\ndiffusion model for high-quality talking head generation, dubbed MoDiTalker. We\nintroduce the two modules: audio-to-motion (AToM), designed to generate a\nsynchronized lip motion from audio, and motion-to-video (MToV), designed to\nproduce high-quality head video following the generated motion. AToM excels in\ncapturing subtle lip movements by leveraging an audio attention mechanism. In\naddition, MToV enhances temporal consistency by leveraging an efficient\ntri-plane representation. Our experiments conducted on standard benchmarks\ndemonstrate that our model achieves superior performance compared to existing\nmodels. We also provide comprehensive ablation studies and user study results.\n","authors":["Seyeon Kim","Siyoon Jin","Jihye Park","Kihong Kim","Jiyoung Kim","Jisu Nam","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2403.19144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19140v1","updated":"2024-03-28T04:24:56Z","published":"2024-03-28T04:24:56Z","title":"QNCD: Quantization Noise Correction for Diffusion Models","summary":" Diffusion models have revolutionized image synthesis, setting new benchmarks\nin quality and creativity. However, their widespread adoption is hindered by\nthe intensive computation required during the iterative denoising process.\nPost-training quantization (PTQ) presents a solution to accelerate sampling,\nalbeit at the expense of sample quality, especially in low-bit settings.\nAddressing this, our study introduces a unified Quantization Noise Correction\nScheme (QNCD), aimed at minimizing quantization noise throughout the sampling\nprocess. 
We identify two primary quantization challenges: intra and inter\nquantization noise. Intra quantization noise, mainly exacerbated by embeddings\nin the resblock module, extends activation quantization ranges, increasing\ndisturbances in each single denoising step. Besides, inter quantization noise\nstems from cumulative quantization deviations across the entire denoising\nprocess, altering data distributions step-by-step. QNCD combats these through\nembedding-derived feature smoothing for eliminating intra quantization noise\nand an effective runtime noise estimation module for dynamically filtering\ninter quantization noise. Extensive experiments demonstrate that our method\noutperforms previous quantization methods for diffusion models, achieving\nlossless results in W4A8 and W8A8 quantization settings on ImageNet (LDM-4).\nCode is available at: https://github.com/huanpengchu/QNCD\n","authors":["Huanpeng Chu","Wei Wu","Chengjie Zang","Kun Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.19140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19137v1","updated":"2024-03-28T04:15:58Z","published":"2024-03-28T04:15:58Z","title":"CLAP4CLIP: Continual Learning with Probabilistic Finetuning for\n Vision-Language Models","summary":" Continual learning (CL) aims to help deep neural networks to learn new\nknowledge while retaining what has been learned. Recently, pre-trained\nvision-language models such as CLIP, with powerful generalization ability, have\nbeen gaining traction as practical CL candidates. However, the domain mismatch\nbetween the pre-training and the downstream CL tasks calls for finetuning of\nthe CLIP on the latter. The deterministic nature of the existing finetuning\nmethods makes them overlook the many possible interactions across the\nmodalities and deems them unsafe for high-risk CL tasks requiring reliable\nuncertainty estimation. To address these, our work proposes Continual LeArning\nwith Probabilistic finetuning (CLAP). CLAP develops probabilistic modeling over\ntask-specific modules with visual-guided text features, providing more reliable\nfine-tuning in CL. It further alleviates forgetting by exploiting the rich\npre-trained knowledge of CLIP for weight initialization and distribution\nregularization of task-specific modules. Cooperating with the diverse range of\nexisting prompting methods, CLAP can surpass the predominant deterministic\nfinetuning approaches for CL with CLIP. Lastly, we study the superior\nuncertainty estimation abilities of CLAP for novel data detection and exemplar\nselection within CL setups. Our code is available at\n\\url{https://github.com/srvCodes/clap4clip}.\n","authors":["Saurav Jha","Dong Gong","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2403.19137v1.pdf","comment":"Work under review"},{"id":"http://arxiv.org/abs/2402.19161v2","updated":"2024-03-28T04:07:57Z","published":"2024-02-29T13:45:13Z","title":"MemoNav: Working Memory Model for Visual Navigation","summary":" Image-goal navigation is a challenging task that requires an agent to\nnavigate to a goal indicated by an image in unfamiliar environments. Existing\nmethods utilizing diverse scene memories suffer from inefficient exploration\nsince they use all historical observations for decision-making without\nconsidering the goal-relevant fraction. 
To address this limitation, we present\nMemoNav, a novel memory model for image-goal navigation, which utilizes a\nworking memory-inspired pipeline to improve navigation performance.\nSpecifically, we employ three types of navigation memory. The node features on\na map are stored in the short-term memory (STM), as these features are\ndynamically updated. A forgetting module then retains the informative STM\nfraction to increase efficiency. We also introduce long-term memory (LTM) to\nlearn global scene representations by progressively aggregating STM features.\nSubsequently, a graph attention module encodes the retained STM and the LTM to\ngenerate working memory (WM) which contains the scene features essential for\nefficient navigation. The synergy among these three memory types boosts\nnavigation performance by enabling the agent to learn and leverage\ngoal-relevant scene features within a topological map. Our evaluation on\nmulti-goal tasks demonstrates that MemoNav significantly outperforms previous\nmethods across all difficulty levels in both Gibson and Matterport3D scenes.\nQualitative results further illustrate that MemoNav plans more efficient\nroutes.\n","authors":["Hongxin Li","Zeyu Wang","Xu Yang","Yuran Yang","Shuqi Mei","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.19161v2.pdf","comment":"Accepted to CVPR 2024. Code: https://github.com/ZJULiHongxin/MemoNav"},{"id":"http://arxiv.org/abs/2403.18605v2","updated":"2024-03-28T03:56:07Z","published":"2024-03-27T14:24:30Z","title":"FlexEdit: Flexible and Controllable Diffusion-based Object-centric Image\n Editing","summary":" Our work addresses limitations seen in previous approaches for object-centric\nediting problems, such as unrealistic results due to shape discrepancies and\nlimited control in object replacement or insertion. To this end, we introduce\nFlexEdit, a flexible and controllable editing framework for objects where we\niteratively adjust latents at each denoising step using our FlexEdit block.\nInitially, we optimize latents at test time to align with specified object\nconstraints. Then, our framework employs an adaptive mask, automatically\nextracted during denoising, to protect the background while seamlessly blending\nnew content into the target image. We demonstrate the versatility of FlexEdit\nin various object editing tasks and curate an evaluation test suite with\nsamples from both real and synthetic images, along with novel evaluation\nmetrics designed for object-centric editing. We conduct extensive experiments\non different editing scenarios, demonstrating the superiority of our editing\nframework over recent advanced text-guided image editing methods. Our project\npage is published at https://flex-edit.github.io/.\n","authors":["Trong-Tung Nguyen","Duc-Anh Nguyen","Anh Tran","Cuong Pham"],"pdf_url":"https://arxiv.org/pdf/2403.18605v2.pdf","comment":"Our project page: https://flex-edit.github.io/"},{"id":"http://arxiv.org/abs/2403.19128v1","updated":"2024-03-28T03:51:14Z","published":"2024-03-28T03:51:14Z","title":"OmniParser: A Unified Framework for Text Spotting, Key Information\n Extraction and Table Recognition","summary":" Recently, visually-situated text parsing (VsTP) has experienced notable\nadvancements, driven by the increasing demand for automated document\nunderstanding and the emergence of Generative Large Language Models (LLMs)\ncapable of processing document-based questions. Various methods have been\nproposed to address the challenging problem of VsTP. 
However, due to the\ndiversified targets and heterogeneous schemas, previous works usually design\ntask-specific architectures and objectives for individual tasks, which\ninadvertently leads to modal isolation and complex workflow. In this paper, we\npropose a unified paradigm for parsing visually-situated text across diverse\nscenarios. Specifically, we devise a universal model, called OmniParser, which\ncan simultaneously handle three typical visually-situated text parsing tasks:\ntext spotting, key information extraction, and table recognition. In\nOmniParser, all tasks share the unified encoder-decoder architecture, the\nunified objective: point-conditioned text generation, and the unified input &\noutput representation: prompt & structured sequences. Extensive experiments\ndemonstrate that the proposed OmniParser achieves state-of-the-art (SOTA) or\nhighly competitive performances on 7 datasets for the three visually-situated\ntext parsing tasks, despite its unified, concise design. The code is available\nat https://github.com/AlibabaResearch/AdvancedLiterateMachinery.\n","authors":["Jianqiang Wan","Sibo Song","Wenwen Yu","Yuliang Liu","Wenqing Cheng","Fei Huang","Xiang Bai","Cong Yao","Zhibo Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19128v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2303.05699v4","updated":"2024-03-28T03:48:40Z","published":"2023-03-10T04:49:01Z","title":"Feature Unlearning for Pre-trained GANs and VAEs","summary":" We tackle the problem of feature unlearning from a pre-trained image\ngenerative model: GANs and VAEs. Unlike a common unlearning task where an\nunlearning target is a subset of the training set, we aim to unlearn a specific\nfeature, such as hairstyle from facial images, from the pre-trained generative\nmodels. As the target feature is only presented in a local region of an image,\nunlearning the entire image from the pre-trained model may result in losing\nother details in the remaining region of the image. To specify which features\nto unlearn, we collect randomly generated images that contain the target\nfeatures. We then identify a latent representation corresponding to the target\nfeature and then use the representation to fine-tune the pre-trained model.\nThrough experiments on MNIST, CelebA, and FFHQ datasets, we show that target\nfeatures are successfully removed while keeping the fidelity of the original\nmodels. Further experiments with an adversarial attack show that the unlearned\nmodel is more robust under the presence of malicious parties.\n","authors":["Saemi Moon","Seunghyuk Cho","Dongwoo Kim"],"pdf_url":"https://arxiv.org/pdf/2303.05699v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19124v1","updated":"2024-03-28T03:35:00Z","published":"2024-03-28T03:35:00Z","title":"PoCo: A Self-Supervised Approach via Polar Transformation Based\n Progressive Contrastive Learning for Ophthalmic Disease Diagnosis","summary":" Automatic ophthalmic disease diagnosis on fundus images is important in\nclinical practice. However, due to complex fundus textures and limited\nannotated data, developing an effective automatic method for this problem is\nstill challenging. In this paper, we present a self-supervised method via polar\ntransformation based progressive contrastive learning, called PoCo, for\nophthalmic disease diagnosis. 
Specifically, we novelly inject the polar\ntransformation into contrastive learning to 1) promote contrastive learning\npre-training to be faster and more stable and 2) naturally capture task-free\nand rotation-related textures, which provides insights into disease recognition\non fundus images. Beneficially, simple normal translation-invariant convolution\non transformed images can equivalently replace the complex rotation-invariant\nand sector convolution on raw images. After that, we develop a progressive\ncontrastive learning method to efficiently utilize large unannotated images and\na novel progressive hard negative sampling scheme to gradually reduce the\nnegative sample number for efficient training and performance enhancement.\nExtensive experiments on three public ophthalmic disease datasets show that our\nPoCo achieves state-of-the-art performance with good generalization ability,\nvalidating that our method can reduce annotation efforts and provide reliable\ndiagnosis. Codes are available at \\url{https://github.com/wjh892521292/PoCo}.\n","authors":["Jinhong Wang","Tingting Chen","Jintai Chen","Yixuan Wu","Yuyang Xu","Danny Chen","Haochao Ying","Jian Wu"],"pdf_url":"https://arxiv.org/pdf/2403.19124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03596v2","updated":"2024-03-28T03:26:51Z","published":"2023-12-06T16:35:59Z","title":"MMM: Generative Masked Motion Model","summary":" Recent advances in text-to-motion generation using diffusion and\nautoregressive models have shown promising results. However, these models often\nsuffer from a trade-off between real-time performance, high fidelity, and\nmotion editability. To address this gap, we introduce MMM, a novel yet simple\nmotion generation paradigm based on Masked Motion Model. MMM consists of two\nkey components: (1) a motion tokenizer that transforms 3D human motion into a\nsequence of discrete tokens in latent space, and (2) a conditional masked\nmotion transformer that learns to predict randomly masked motion tokens,\nconditioned on the pre-computed text tokens. By attending to motion and text\ntokens in all directions, MMM explicitly captures inherent dependency among\nmotion tokens and semantic mapping between motion and text tokens. During\ninference, this allows parallel and iterative decoding of multiple motion\ntokens that are highly consistent with fine-grained text descriptions,\ntherefore simultaneously achieving high-fidelity and high-speed motion\ngeneration. In addition, MMM has innate motion editability. By simply placing\nmask tokens in the place that needs editing, MMM automatically fills the gaps\nwhile guaranteeing smooth transitions between editing and non-editing parts.\nExtensive experiments on the HumanML3D and KIT-ML datasets demonstrate that MMM\nsurpasses current leading methods in generating high-quality motion (evidenced\nby superior FID scores of 0.08 and 0.429), while offering advanced editing\nfeatures such as body-part modification, motion in-betweening, and the\nsynthesis of long motion sequences. In addition, MMM is two orders of magnitude\nfaster on a single mid-range GPU than editable motion diffusion models. 
Our\nproject page is available at \\url{https://exitudio.github.io/MMM-page}.\n","authors":["Ekkasit Pinyoanuntapong","Pu Wang","Minwoo Lee","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2312.03596v2.pdf","comment":"accepted to CVPR"},{"id":"http://arxiv.org/abs/2403.16002v2","updated":"2024-03-28T03:22:52Z","published":"2024-03-24T04:15:50Z","title":"SDSTrack: Self-Distillation Symmetric Adapter Learning for Multi-Modal\n Visual Object Tracking","summary":" Multimodal Visual Object Tracking (VOT) has recently gained significant\nattention due to its robustness. Early research focused on fully fine-tuning\nRGB-based trackers, which was inefficient and lacked generalized representation\ndue to the scarcity of multimodal data. Therefore, recent studies have utilized\nprompt tuning to transfer pre-trained RGB-based trackers to multimodal data.\nHowever, the modality gap limits pre-trained knowledge recall, and the\ndominance of the RGB modality persists, preventing the full utilization of\ninformation from other modalities. To address these issues, we propose a novel\nsymmetric multimodal tracking framework called SDSTrack. We introduce\nlightweight adaptation for efficient fine-tuning, which directly transfers the\nfeature extraction ability from RGB to other domains with a small number of\ntrainable parameters and integrates multimodal features in a balanced,\nsymmetric manner. Furthermore, we design a complementary masked patch\ndistillation strategy to enhance the robustness of trackers in complex\nenvironments, such as extreme weather, poor imaging, and sensor failure.\nExtensive experiments demonstrate that SDSTrack outperforms state-of-the-art\nmethods in various multimodal tracking scenarios, including RGB+Depth,\nRGB+Thermal, and RGB+Event tracking, and exhibits impressive results in extreme\nconditions. Our source code is available at https://github.com/hoqolo/SDSTrack.\n","authors":["Xiaojun Hou","Jiazheng Xing","Yijie Qian","Yaowei Guo","Shuo Xin","Junhao Chen","Kai Tang","Mengmeng Wang","Zhengkai Jiang","Liang Liu","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2403.16002v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2304.05684v3","updated":"2024-03-28T03:15:57Z","published":"2023-04-12T08:12:29Z","title":"InterGen: Diffusion-based Multi-human Motion Generation under Complex\n Interactions","summary":" We have recently seen tremendous progress in diffusion advances for\ngenerating realistic human motions. Yet, they largely disregard the multi-human\ninteractions. In this paper, we present InterGen, an effective diffusion-based\napproach that incorporates human-to-human interactions into the motion\ndiffusion process, which enables layman users to customize high-quality\ntwo-person interaction motions, with only text guidance. We first contribute a\nmultimodal dataset, named InterHuman. It consists of about 107M frames for\ndiverse two-person interactions, with accurate skeletal motions and 23,337\nnatural language descriptions. For the algorithm side, we carefully tailor the\nmotion diffusion model to our two-person interaction setting. To handle the\nsymmetry of human identities during interactions, we propose two cooperative\ntransformer-based denoisers that explicitly share weights, with a mutual\nattention mechanism to further connect the two denoising processes. Then, we\npropose a novel representation for motion input in our interaction diffusion\nmodel, which explicitly formulates the global relations between the two\nperformers in the world frame. 
We further introduce two novel regularization\nterms to encode spatial relations, equipped with a corresponding damping scheme\nduring the training of our interaction diffusion model. Extensive experiments\nvalidate the effectiveness and generalizability of InterGen. Notably, it can\ngenerate more diverse and compelling two-person motions than previous methods\nand enables various downstream applications for human interactions.\n","authors":["Han Liang","Wenqian Zhang","Wenxuan Li","Jingyi Yu","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2304.05684v3.pdf","comment":"accepted by IJCV 2024"},{"id":"http://arxiv.org/abs/2403.19111v1","updated":"2024-03-28T03:07:16Z","published":"2024-03-28T03:07:16Z","title":"Patch Spatio-Temporal Relation Prediction for Video Anomaly Detection","summary":" Video Anomaly Detection (VAD), aiming to identify abnormalities within a\nspecific context and timeframe, is crucial for intelligent Video Surveillance\nSystems. While recent deep learning-based VAD models have shown promising\nresults by generating high-resolution frames, they often lack competence in\npreserving detailed spatial and temporal coherence in video frames. To tackle\nthis issue, we propose a self-supervised learning approach for VAD through an\ninter-patch relationship prediction task. Specifically, we introduce a\ntwo-branch vision transformer network designed to capture deep visual features\nof video frames, addressing spatial and temporal dimensions responsible for\nmodeling appearance and motion patterns, respectively. The inter-patch\nrelationship in each dimension is decoupled into inter-patch similarity and the\norder information of each patch. To mitigate memory consumption, we convert the\norder information prediction task into a multi-label learning problem, and the\ninter-patch similarity prediction task into a distance matrix regression\nproblem. Comprehensive experiments demonstrate the effectiveness of our method,\nsurpassing pixel-generation-based methods by a significant margin across three\npublic benchmarks. Additionally, our approach outperforms other self-supervised\nlearning-based methods.\n","authors":["Hao Shen","Lu Shi","Wanru Xu","Yigang Cen","Linna Zhang","Gaoyun An"],"pdf_url":"https://arxiv.org/pdf/2403.19111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19107v1","updated":"2024-03-28T02:51:33Z","published":"2024-03-28T02:51:33Z","title":"Synthetic Medical Imaging Generation with Generative Adversarial\n Networks For Plain Radiographs","summary":" In medical imaging, access to data is commonly limited due to patient privacy\nrestrictions and the issue that it can be difficult to acquire enough data in\nthe case of rare diseases.[1] The purpose of this investigation was to develop\na reusable open-source synthetic image generation pipeline, the GAN Image\nSynthesis Tool (GIST), that is easy to use as well as easy to deploy. The\npipeline helps to improve and standardize AI algorithms in the digital health\nspace by generating high quality synthetic image data that is not linked to\nspecific patients. Its image generation capabilities include the ability to\ngenerate imaging of pathologies or injuries with low incidence rates. This\nimprovement of digital health AI algorithms could improve diagnostic accuracy,\naid in patient care, decrease medicolegal claims, and ultimately decrease the\noverall cost of healthcare. 
The pipeline builds on existing Generative\nAdversarial Networks (GANs) algorithms, and preprocessing and evaluation steps\nwere included for completeness. For this work, we focused on ensuring the\npipeline supports radiography, with a focus on synthetic knee and elbow x-ray\nimages. In designing the pipeline, we evaluated the performance of current GAN\narchitectures, studying the performance on available x-ray data. We show that\nthe pipeline is capable of generating high quality and clinically relevant\nimages based on a lay person's evaluation and the Fr\\'echet Inception Distance\n(FID) metric.\n","authors":["John R. McNulty","Lee Kho","Alexandria L. Case","Charlie Fornaca","Drew Johnston","David Slater","Joshua M. Abzug","Sybil A. Russell"],"pdf_url":"https://arxiv.org/pdf/2403.19107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19104v1","updated":"2024-03-28T02:39:45Z","published":"2024-03-28T02:39:45Z","title":"CRKD: Enhanced Camera-Radar Object Detection with Cross-modality\n Knowledge Distillation","summary":" In the field of 3D object detection for autonomous driving, LiDAR-Camera (LC)\nfusion is the top-performing sensor configuration. Still, LiDAR is relatively\nhigh cost, which hinders adoption of this technology for consumer automobiles.\nAlternatively, camera and radar are commonly deployed on vehicles already on\nthe road today, but performance of Camera-Radar (CR) fusion falls behind LC\nfusion. In this work, we propose Camera-Radar Knowledge Distillation (CRKD) to\nbridge the performance gap between LC and CR detectors with a novel\ncross-modality KD framework. We use the Bird's-Eye-View (BEV) representation as\nthe shared feature space to enable effective knowledge distillation. To\naccommodate the unique cross-modality KD path, we propose four distillation\nlosses to help the student learn crucial features from the teacher model. We\npresent extensive evaluations on the nuScenes dataset to demonstrate the\neffectiveness of the proposed CRKD framework. The project page for CRKD is\nhttps://song-jingyu.github.io/CRKD.\n","authors":["Lingjun Zhao","Jingyu Song","Katherine A. Skinner"],"pdf_url":"https://arxiv.org/pdf/2403.19104v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19103v1","updated":"2024-03-28T02:35:53Z","published":"2024-03-28T02:35:53Z","title":"Automated Black-box Prompt Engineering for Personalized Text-to-Image\n Generation","summary":" Prompt engineering is effective for controlling the output of text-to-image\n(T2I) generative models, but it is also laborious due to the need for manually\ncrafted prompts. This challenge has spurred the development of algorithms for\nautomated prompt generation. However, these methods often struggle with\ntransferability across T2I models, require white-box access to the underlying\nmodel, and produce non-intuitive prompts. In this work, we introduce PRISM, an\nalgorithm that automatically identifies human-interpretable and transferable\nprompts that can effectively generate desired concepts given only black-box\naccess to T2I models. Inspired by large language model (LLM) jailbreaking,\nPRISM leverages the in-context learning ability of LLMs to iteratively refine\nthe candidate prompts distribution for given reference images. 
Our experiments\ndemonstrate the versatility and effectiveness of PRISM in generating accurate\nprompts for objects, styles and images across multiple T2I models, including\nStable Diffusion, DALL-E, and Midjourney.\n","authors":["Yutong He","Alexander Robey","Naoki Murata","Yiding Jiang","Joshua Williams","George J. Pappas","Hamed Hassani","Yuki Mitsufuji","Ruslan Salakhutdinov","J. Zico Kolter"],"pdf_url":"https://arxiv.org/pdf/2403.19103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19101v1","updated":"2024-03-28T02:31:06Z","published":"2024-03-28T02:31:06Z","title":"AAPMT: AGI Assessment Through Prompt and Metric Transformer","summary":" The emergence of text-to-image models marks a significant milestone in the\nevolution of AI-generated images (AGIs), expanding their use in diverse domains\nlike design, entertainment, and more. Despite these breakthroughs, the quality\nof AGIs often remains suboptimal, highlighting the need for effective\nevaluation methods. These methods are crucial for assessing the quality of\nimages relative to their textual descriptions, and they must accurately mirror\nhuman perception. Substantial progress has been achieved in this domain, with\ninnovative techniques such as BLIP and DBCNN contributing significantly.\nHowever, recent studies, including AGIQA-3K, reveal a notable discrepancy\nbetween current methods and state-of-the-art (SOTA) standards. This gap\nemphasizes the necessity for a more sophisticated and precise evaluation\nmetric. In response, our objective is to develop a model that could give\nratings for metrics, which focuses on parameters like perceptual quality,\nauthenticity, and the correspondence between text and image, that more closely\naligns with human perception. In our paper, we introduce a range of effective\nmethods, including prompt designs and the Metric Transformer. The Metric\nTransformer is a novel structure inspired by the complex interrelationships\namong various AGI quality metrics. The code is available at\nhttps://github.com/huskydoge/CS3324-Digital-Image-Processing/tree/main/Assignment1\n","authors":["Benhao Huang"],"pdf_url":"https://arxiv.org/pdf/2403.19101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13302v3","updated":"2024-03-28T02:24:38Z","published":"2023-09-23T08:24:36Z","title":"Gaining the Sparse Rewards by Exploring Lottery Tickets in Spiking\n Neural Network","summary":" Deploying energy-efficient deep learning algorithms on computational-limited\ndevices, such as robots, is still a pressing issue for real-world applications.\nSpiking Neural Networks (SNNs), a novel brain-inspired algorithm, offer a\npromising solution due to their low-latency and low-energy properties over\ntraditional Artificial Neural Networks (ANNs). Despite their advantages, the\ndense structure of deep SNNs can still result in extra energy consumption. The\nLottery Ticket Hypothesis (LTH) posits that within dense neural networks, there\nexist winning Lottery Tickets (LTs), namely sub-networks, that can be obtained\nwithout compromising performance. Inspired by this, this paper delves into the\nspiking-based LTs (SLTs), examining their unique properties and potential for\nextreme efficiency. Then, two significant sparse \\textbf{\\textit{Rewards}} are\ngained through comprehensive explorations and meticulous experiments on SLTs\nacross various dense structures. 
Moreover, a sparse algorithm tailored for\nspiking transformer structure, which incorporates convolution operations into\nthe Patch Embedding Projection (ConvPEP) module, has been proposed to achieve\nMulti-level Sparsity (MultiSp). MultiSp refers to (1) Patch number sparsity;\n(2) ConvPEP weights sparsity and binarization; and (3) ConvPEP activation layer\nbinarization. Extensive experiments demonstrate that our method achieves\nextreme sparsity with only a slight performance decrease, paving the way for\ndeploying energy-efficient neural networks in robotics and beyond.\n","authors":["Hao Cheng","Jiahang Cao","Erjia Xiao","Mengshu Sun","Renjing Xu"],"pdf_url":"https://arxiv.org/pdf/2309.13302v3.pdf","comment":"This paper is under submission"},{"id":"http://arxiv.org/abs/2403.19098v1","updated":"2024-03-28T02:22:28Z","published":"2024-03-28T02:22:28Z","title":"GraphAD: Interaction Scene Graph for End-to-end Autonomous Driving","summary":" Modeling complicated interactions among the ego-vehicle, road agents, and map\nelements has been a crucial part for safety-critical autonomous driving.\nPrevious works on end-to-end autonomous driving rely on the attention mechanism\nfor handling heterogeneous interactions, which fails to capture the geometric\npriors and is also computationally intensive. In this paper, we propose the\nInteraction Scene Graph (ISG) as a unified method to model the interactions\namong the ego-vehicle, road agents, and map elements. With the representation\nof the ISG, the driving agents aggregate essential information from the most\ninfluential elements, including the road agents with potential collisions and\nthe map elements to follow. Since a mass of unnecessary interactions are\nomitted, the more efficient scene-graph-based framework is able to focus on\nindispensable connections and leads to better performance. We evaluate the\nproposed method for end-to-end autonomous driving on the nuScenes dataset.\nCompared with strong baselines, our method significantly outperforms in the\nfull-stack driving tasks, including perception, prediction, and planning. Code\nwill be released at https://github.com/zhangyp15/GraphAD.\n","authors":["Yunpeng Zhang","Deheng Qian","Ding Li","Yifeng Pan","Yong Chen","Zhenbao Liang","Zhiyao Zhang","Shurui Zhang","Hongxu Li","Maolei Fu","Yun Ye","Zhujin Liang","Yi Shan","Dalong Du"],"pdf_url":"https://arxiv.org/pdf/2403.19098v1.pdf","comment":"project page: https://github.com/zhangyp15/GraphAD"},{"id":"http://arxiv.org/abs/2403.19080v1","updated":"2024-03-28T01:05:06Z","published":"2024-03-28T01:05:06Z","title":"MMCert: Provable Defense against Adversarial Attacks to Multi-modal\n Models","summary":" Different from a unimodal model whose input is from a single modality, the\ninput (called multi-modal input) of a multi-modal model is from multiple\nmodalities such as image, 3D points, audio, text, etc. Similar to unimodal\nmodels, many existing studies show that a multi-modal model is also vulnerable\nto adversarial perturbation, where an attacker could add small perturbation to\nall modalities of a multi-modal input such that the multi-modal model makes\nincorrect predictions for it. Existing certified defenses are mostly designed\nfor unimodal models, which achieve sub-optimal certified robustness guarantees\nwhen extended to multi-modal models as shown in our experimental results. In\nour work, we propose MMCert, the first certified defense against adversarial\nattacks to a multi-modal model. 
We derive a lower bound on the performance of\nour MMCert under arbitrary adversarial attacks with bounded perturbations to\nboth modalities (e.g., in the context of auto-driving, we bound the number of\nchanged pixels in both RGB image and depth image). We evaluate our MMCert using\ntwo benchmark datasets: one for the multi-modal road segmentation task and the\nother for the multi-modal emotion recognition task. Moreover, we compare our\nMMCert with a state-of-the-art certified defense extended from unimodal models.\nOur experimental results show that our MMCert outperforms the baseline.\n","authors":["Yanting Wang","Hongye Fu","Wei Zou","Jinyuan Jia"],"pdf_url":"https://arxiv.org/pdf/2403.19080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19079v1","updated":"2024-03-28T01:00:08Z","published":"2024-03-28T01:00:08Z","title":"A Real-Time Framework for Domain-Adaptive Underwater Object Detection\n with Image Enhancement","summary":" In recent years, significant progress has been made in the field of\nunderwater image enhancement (UIE). However, its practical utility for\nhigh-level vision tasks, such as underwater object detection (UOD) in\nAutonomous Underwater Vehicles (AUVs), remains relatively unexplored. It may be\nattributed to several factors: (1) Existing methods typically employ UIE as a\npre-processing step, which inevitably introduces considerable computational\noverhead and latency. (2) The process of enhancing images prior to training\nobject detectors may not necessarily yield performance improvements. (3) The\ncomplex underwater environments can induce significant domain shifts across\ndifferent scenarios, seriously deteriorating the UOD performance. To address\nthese challenges, we introduce EnYOLO, an integrated real-time framework\ndesigned for simultaneous UIE and UOD with domain-adaptation capability.\nSpecifically, both the UIE and UOD task heads share the same network backbone\nand utilize a lightweight design. Furthermore, to ensure balanced training for\nboth tasks, we present a multi-stage training strategy aimed at consistently\nenhancing their performance. Additionally, we propose a novel domain-adaptation\nstrategy to align feature embeddings originating from diverse underwater\nenvironments. Comprehensive experiments demonstrate that our framework not only\nachieves state-of-the-art (SOTA) performance in both UIE and UOD tasks, but\nalso shows superior adaptability when applied to different underwater\nscenarios. Our efficiency analysis further highlights the substantial potential\nof our framework for onboard deployment.\n","authors":["Junjie Wen","Jinqiang Cui","Benyun Zhao","Bingxin Han","Xuchen Liu","Zhi Gao","Ben M. Chen"],"pdf_url":"https://arxiv.org/pdf/2403.19079v1.pdf","comment":"accepted by ICRA24"},{"id":"http://arxiv.org/abs/2403.08059v2","updated":"2024-03-28T00:59:37Z","published":"2024-03-12T20:11:38Z","title":"FluoroSAM: A Language-aligned Foundation Model for X-ray Image\n Segmentation","summary":" Automated X-ray image segmentation would accelerate research and development\nin diagnostic and interventional precision medicine. Prior efforts have\ncontributed task-specific models capable of solving specific image analysis\nproblems, but the utility of these models is restricted to their particular\ntask domain, and expanding to broader use requires additional data, labels, and\nretraining efforts. 
Recently, foundation models (FMs) -- machine learning\nmodels trained on large amounts of highly variable data thus enabling broad\napplicability -- have emerged as promising tools for automated image analysis.\nExisting FMs for medical image analysis focus on scenarios and modalities where\nobjects are clearly defined by visually apparent boundaries, such as surgical\ntool segmentation in endoscopy. X-ray imaging, by contrast, does not generally\noffer such clearly delineated boundaries or structure priors. During X-ray\nimage formation, complex 3D structures are projected in transmission onto the\nimaging plane, resulting in overlapping features of varying opacity and shape.\nTo pave the way toward an FM for comprehensive and automated analysis of\narbitrary medical X-ray images, we develop FluoroSAM, a language-aligned\nvariant of the Segment-Anything Model, trained from scratch on 1.6M synthetic\nX-ray images. FluoroSAM is trained on data including masks for 128 organ types\nand 464 non-anatomical objects, such as tools and implants. In real X-ray\nimages of cadaveric specimens, FluoroSAM is able to segment bony anatomical\nstructures based on text-only prompting with 0.51 and 0.79 DICE with\npoint-based refinement, outperforming competing SAM variants for all\nstructures. FluoroSAM is also capable of zero-shot generalization to segmenting\nclasses beyond the training set thanks to its language alignment, which we\ndemonstrate for full lung segmentation on real chest X-rays.\n","authors":["Benjamin D. Killeen","Liam J. Wang","Han Zhang","Mehran Armand","Russell H. Taylor","Dave Dreizin","Greg Osgood","Mathias Unberath"],"pdf_url":"https://arxiv.org/pdf/2403.08059v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19078v1","updated":"2024-03-28T00:50:02Z","published":"2024-03-28T00:50:02Z","title":"MVEB: Self-Supervised Learning with Multi-View Entropy Bottleneck","summary":" Self-supervised learning aims to learn representation that can be effectively\ngeneralized to downstream tasks. Many self-supervised approaches regard two\nviews of an image as both the input and the self-supervised signals, assuming\nthat either view contains the same task-relevant information and the shared\ninformation is (approximately) sufficient for predicting downstream tasks.\nRecent studies show that discarding superfluous information not shared between\nthe views can improve generalization. Hence, the ideal representation is\nsufficient for downstream tasks and contains minimal superfluous information,\ntermed minimal sufficient representation. One can learn this representation by\nmaximizing the mutual information between the representation and the supervised\nview while eliminating superfluous information. Nevertheless, the computation\nof mutual information is notoriously intractable. In this work, we propose an\nobjective termed multi-view entropy bottleneck (MVEB) to learn minimal\nsufficient representation effectively. MVEB simplifies the minimal sufficient\nlearning to maximizing both the agreement between the embeddings of two views\nand the differential entropy of the embedding distribution. Our experiments\nconfirm that MVEB significantly improves performance. For example, it achieves\ntop-1 accuracy of 76.9\\% on ImageNet with a vanilla ResNet-50 backbone on\nlinear evaluation. 
To the best of our knowledge, this is the new\nstate-of-the-art result with ResNet-50.\n","authors":["Liangjian Wen","Xiasi Wang","Jianzhuang Liu","Zenglin Xu"],"pdf_url":"https://arxiv.org/pdf/2403.19078v1.pdf","comment":"Accepted by TPAMI"},{"id":"http://arxiv.org/abs/2401.03707v2","updated":"2024-03-28T00:43:21Z","published":"2024-01-08T07:34:43Z","title":"FMA-Net: Flow-Guided Dynamic Filtering and Iterative Feature Refinement\n with Multi-Attention for Joint Video Super-Resolution and Deblurring","summary":" We present a joint learning scheme of video super-resolution and deblurring,\ncalled VSRDB, to restore clean high-resolution (HR) videos from blurry\nlow-resolution (LR) ones. This joint restoration problem has drawn much less\nattention compared to single restoration problems. In this paper, we propose a\nnovel flow-guided dynamic filtering (FGDF) and iterative feature refinement\nwith multi-attention (FRMA), which constitutes our VSRDB framework, denoted as\nFMA-Net. Specifically, our proposed FGDF enables precise estimation of both\nspatio-temporally-variant degradation and restoration kernels that are aware of\nmotion trajectories through sophisticated motion representation learning.\nCompared to conventional dynamic filtering, the FGDF enables the FMA-Net to\neffectively handle large motions into the VSRDB. Additionally, the stacked FRMA\nblocks trained with our novel temporal anchor (TA) loss, which temporally\nanchors and sharpens features, refine features in a coarse-to-fine manner\nthrough iterative updates. Extensive experiments demonstrate the superiority of\nthe proposed FMA-Net over state-of-the-art methods in terms of both\nquantitative and qualitative quality. Codes and pre-trained models are\navailable at: https://kaist-viclab.github.io/fmanet-site\n","authors":["Geunhyuk Youk","Jihyong Oh","Munchurl Kim"],"pdf_url":"https://arxiv.org/pdf/2401.03707v2.pdf","comment":"CVPR2024 (camera-ready version). The last two authors are\n co-corresponding authors. Please visit our project page at\n https://kaist-viclab.github.io/fmanet-site"},{"id":"http://arxiv.org/abs/2403.19076v1","updated":"2024-03-28T00:34:56Z","published":"2024-03-28T00:34:56Z","title":"Tiny Machine Learning: Progress and Futures","summary":" Tiny Machine Learning (TinyML) is a new frontier of machine learning. By\nsqueezing deep learning models into billions of IoT devices and\nmicrocontrollers (MCUs), we expand the scope of AI applications and enable\nubiquitous intelligence. However, TinyML is challenging due to hardware\nconstraints: the tiny memory resource makes it difficult to hold deep learning\nmodels designed for cloud and mobile platforms. There is also limited compiler\nand inference engine support for bare-metal devices. Therefore, we need to\nco-design the algorithm and system stack to enable TinyML. In this review, we\nwill first discuss the definition, challenges, and applications of TinyML. We\nthen survey the recent progress in TinyML and deep learning on MCUs. Next, we\nwill introduce MCUNet, showing how we can achieve ImageNet-scale AI\napplications on IoT devices with system-algorithm co-design. We will further\nextend the solution from inference to training and introduce tiny on-device\ntraining techniques. Finally, we present future directions in this area.\nToday's large model might be tomorrow's tiny model. 
The scope of TinyML should\nevolve and adapt over time.\n","authors":["Ji Lin","Ligeng Zhu","Wei-Ming Chen","Wei-Chen Wang","Song Han"],"pdf_url":"https://arxiv.org/pdf/2403.19076v1.pdf","comment":"IEEE Circuits and Systems Magazine (2023). arXiv admin note: text\n overlap with arXiv:2206.15472"},{"id":"http://arxiv.org/abs/2403.19067v1","updated":"2024-03-28T00:14:53Z","published":"2024-03-28T00:14:53Z","title":"Low-Rank Rescaled Vision Transformer Fine-Tuning: A Residual Design\n Approach","summary":" Parameter-efficient fine-tuning for pre-trained Vision Transformers aims to\nadeptly tailor a model to downstream tasks by learning a minimal set of new\nadaptation parameters while preserving the frozen majority of pre-trained\nparameters. Striking a balance between retaining the generalizable\nrepresentation capacity of the pre-trained model and acquiring task-specific\nfeatures poses a key challenge. Currently, there is a lack of focus on guiding\nthis delicate trade-off. In this study, we approach the problem from the\nperspective of Singular Value Decomposition (SVD) of pre-trained parameter\nmatrices, providing insights into the tuning dynamics of existing methods.\nBuilding upon this understanding, we propose a Residual-based Low-Rank\nRescaling (RLRR) fine-tuning strategy. This strategy not only enhances\nflexibility in parameter tuning but also ensures that new parameters do not\ndeviate excessively from the pre-trained model through a residual design.\nExtensive experiments demonstrate that our method achieves competitive\nperformance across various downstream image classification tasks, all while\nmaintaining comparable new parameters. We believe this work takes a step\nforward in offering a unified perspective for interpreting existing methods and\nserves as motivation for the development of new approaches that move closer to\neffectively considering the crucial trade-off mentioned above. Our code is\navailable at\n\\href{https://github.com/zstarN70/RLRR.git}{https://github.com/zstarN70/RLRR.git}.\n","authors":["Wei Dong","Xing Zhang","Bihui Chen","Dawei Yan","Zhijun Lin","Qingsen Yan","Peng Wang","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19066v1","updated":"2024-03-28T00:11:12Z","published":"2024-03-28T00:11:12Z","title":"Generative Quanta Color Imaging","summary":" The astonishing development of single-photon cameras has created an\nunprecedented opportunity for scientific and industrial imaging. However, the\nhigh data throughput generated by these 1-bit sensors creates a significant\nbottleneck for low-power applications. In this paper, we explore the\npossibility of generating a color image from a single binary frame of a\nsingle-photon camera. We evidently find this problem being particularly\ndifficult to standard colorization approaches due to the substantial degree of\nexposure variation. The core innovation of our paper is an exposure synthesis\nmodel framed under a neural ordinary differential equation (Neural ODE) that\nallows us to generate a continuum of exposures from a single observation. This\ninnovation ensures consistent exposure in binary images that colorizers take\non, resulting in notably enhanced colorization. We demonstrate applications of\nthe method in single-image and burst colorization and show superior generative\nperformance over baselines. 
The project website can be found at\nhttps://vishal-s-p.github.io/projects/2023/generative_quanta_color.html.\n","authors":["Vishal Purohit","Junjie Luo","Yiheng Chi","Qi Guo","Stanley H. Chan","Qiang Qiu"],"pdf_url":"https://arxiv.org/pdf/2403.19066v1.pdf","comment":"Accepted at IEEE Conference on Computer Vision and Pattern\n Recognition (CVPR), 2024"},{"id":"http://arxiv.org/abs/2403.19203v1","updated":"2024-03-28T08:00:14Z","published":"2024-03-28T08:00:14Z","title":"Single-Shared Network with Prior-Inspired Loss for Parameter-Efficient\n Multi-Modal Imaging Skin Lesion Classification","summary":" In this study, we introduce a multi-modal approach that efficiently\nintegrates multi-scale clinical and dermoscopy features within a single\nnetwork, thereby substantially reducing model parameters. The proposed method\nincludes three novel fusion schemes. Firstly, unlike current methods that\nusually employ two individual models for clinical and dermoscopy\nmodalities, we verified that multimodal features can be learned by sharing the\nparameters of the encoder while retaining individual modal-specific classifiers.\nSecondly, a shared cross-attention module can replace the individual ones to\nefficiently interact between the two modalities at multiple layers. Thirdly,\ndifferent from current methods that equally optimize dermoscopy and clinical\nbranches, inspired by prior knowledge that dermoscopy images play a more\nsignificant role than clinical images, we propose a novel biased loss. This\nloss guides the single-shared network to prioritize dermoscopy information over\nclinical information, implicitly learning a better joint feature representation\nfor the modal-specific task. Extensive experiments on a well-recognized\nSeven-Point Checklist (SPC) dataset and a collected dataset demonstrate the\neffectiveness of our method on both CNN and Transformer structures.\nFurthermore, our method exhibits superiority in both accuracy and model\nparameters compared to currently advanced methods.\n","authors":["Peng Tang","Tobias Lasser"],"pdf_url":"https://arxiv.org/pdf/2403.19203v1.pdf","comment":"This paper has been submitted to a journal for review"},{"id":"http://arxiv.org/abs/2403.19076v1","updated":"2024-03-28T00:34:56Z","published":"2024-03-28T00:34:56Z","title":"Tiny Machine Learning: Progress and Futures","summary":" Tiny Machine Learning (TinyML) is a new frontier of machine learning. By\nsqueezing deep learning models into billions of IoT devices and\nmicrocontrollers (MCUs), we expand the scope of AI applications and enable\nubiquitous intelligence. However, TinyML is challenging due to hardware\nconstraints: the tiny memory resource makes it difficult to hold deep learning\nmodels designed for cloud and mobile platforms. There is also limited compiler\nand inference engine support for bare-metal devices. Therefore, we need to\nco-design the algorithm and system stack to enable TinyML. In this review, we\nwill first discuss the definition, challenges, and applications of TinyML. We\nthen survey the recent progress in TinyML and deep learning on MCUs. Next, we\nwill introduce MCUNet, showing how we can achieve ImageNet-scale AI\napplications on IoT devices with system-algorithm co-design. We will further\nextend the solution from inference to training and introduce tiny on-device\ntraining techniques. Finally, we present future directions in this area.\nToday's large model might be tomorrow's tiny model. 
The scope of TinyML should\nevolve and adapt over time.\n","authors":["Ji Lin","Ligeng Zhu","Wei-Ming Chen","Wei-Chen Wang","Song Han"],"pdf_url":"https://arxiv.org/pdf/2403.19076v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2206.15472"},{"id":"http://arxiv.org/abs/2403.19885v1","updated":"2024-03-28T23:51:51Z","published":"2024-03-28T23:51:51Z","title":"Towards Long Term SLAM on Thermal Imagery","summary":" Visual SLAM with thermal imagery, and other low contrast visually degraded\nenvironments such as underwater, or in areas dominated by snow and ice, remain\na difficult problem for many state of the art (SOTA) algorithms. In addition to\nchallenging front-end data association, thermal imagery presents an additional\ndifficulty for long term relocalization and map reuse. The relative\ntemperatures of objects in thermal imagery change dramatically from day to\nnight. Feature descriptors typically used for relocalization in SLAM are unable\nto maintain consistency over these diurnal changes. We show that learned\nfeature descriptors can be used within existing Bag of Word based localization\nschemes to dramatically improve place recognition across large temporal gaps in\nthermal imagery. In order to demonstrate the effectiveness of our trained\nvocabulary, we have developed a baseline SLAM system, integrating learned\nfeatures and matching into a classical SLAM algorithm. Our system demonstrates\ngood local tracking on challenging thermal imagery, and relocalization that\novercomes dramatic day to night thermal appearance changes. Our code and\ndatasets are available here:\nhttps://github.com/neufieldrobotics/IRSLAM_Baseline\n","authors":["Colin Keil","Aniket Gupta","Pushyami Kaveti","Hanumant Singh"],"pdf_url":"https://arxiv.org/pdf/2403.19885v1.pdf","comment":"8 pages, 7 figures, Submitted to IROS 2024"},{"id":"http://arxiv.org/abs/2403.19882v1","updated":"2024-03-28T23:31:59Z","published":"2024-03-28T23:31:59Z","title":"Enhancing Efficiency in Vision Transformer Networks: Design Techniques\n and Insights","summary":" Intrigued by the inherent ability of the human visual system to identify\nsalient regions in complex scenes, attention mechanisms have been seamlessly\nintegrated into various Computer Vision (CV) tasks. Building upon this\nparadigm, Vision Transformer (ViT) networks exploit attention mechanisms for\nimproved efficiency. This review navigates the landscape of redesigned\nattention mechanisms within ViTs, aiming to enhance their performance. This\npaper provides a comprehensive exploration of techniques and insights for\ndesigning attention mechanisms, systematically reviewing recent literature in\nthe field of CV. This survey begins with an introduction to the theoretical\nfoundations and fundamental concepts underlying attention mechanisms. We then\npresent a systematic taxonomy of various attention mechanisms within ViTs,\nemploying redesigned approaches. A multi-perspective categorization is proposed\nbased on their application, objectives, and the type of attention applied. The\nanalysis includes an exploration of the novelty, strengths, weaknesses, and an\nin-depth evaluation of the different proposed strategies. 
This culminates in\nthe development of taxonomies that highlight key properties and contributions.\nFinally, we gather the reviewed studies along with their available open-source\nimplementations at our\n\\href{https://github.com/mindflow-institue/Awesome-Attention-Mechanism-in-Medical-Imaging}{GitHub}\\footnote{\\url{https://github.com/xmindflow/Awesome-Attention-Mechanism-in-Medical-Imaging}}.\nWe aim to regularly update it with the most recent relevant papers.\n","authors":["Moein Heidari","Reza Azad","Sina Ghorbani Kolahi","René Arimond","Leon Niggemeier","Alaa Sulaiman","Afshin Bozorgpour","Ehsan Khodapanah Aghdam","Amirhossein Kazerouni","Ilker Hacihaliloglu","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2403.19882v1.pdf","comment":"Submitted to Computational Visual Media Journal"},{"id":"http://arxiv.org/abs/2403.19880v1","updated":"2024-03-28T23:26:45Z","published":"2024-03-28T23:26:45Z","title":"Vision-Language Synthetic Data Enhances Echocardiography Downstream\n Tasks","summary":" High-quality, large-scale data is essential for robust deep learning models\nin medical applications, particularly ultrasound image analysis. Diffusion\nmodels facilitate high-fidelity medical image generation, reducing the costs\nassociated with acquiring and annotating new images. This paper utilizes recent\nvision-language models to produce diverse and realistic synthetic\nechocardiography image data, preserving key features of the original images\nguided by textual and semantic label maps. Specifically, we investigate three\npotential avenues: unconditional generation, generation guided by text, and a\nhybrid approach incorporating both textual and semantic supervision. We show\nthat the rich contextual information present in the synthesized data\npotentially enhances the accuracy and interpretability of downstream tasks,\nsuch as echocardiography segmentation and classification with improved metrics\nand faster convergence. Our implementation with checkpoints, prompts, and the\ncreated synthetic dataset will be publicly available at\n\\href{https://github.com/Pooria90/DiffEcho}{GitHub}.\n","authors":["Pooria Ashrafian","Milad Yazdani","Moein Heidari","Dena Shahriari","Ilker Hacihaliloglu"],"pdf_url":"https://arxiv.org/pdf/2403.19880v1.pdf","comment":"Submitted as a conference paper to MICCAI 2024"},{"id":"http://arxiv.org/abs/2310.01779v3","updated":"2024-03-28T22:27:12Z","published":"2023-10-03T04:01:27Z","title":"HallE-Control: Controlling Object Hallucination in Large Multimodal\n Models","summary":" Current Large Multimodal Models (LMMs) achieve remarkable progress, yet there\nremains significant uncertainty regarding their ability to accurately apprehend\nvisual details, that is, in performing detailed captioning. To address this, we\nintroduce $\\textit{CCEval}$, a GPT-4 assisted evaluation method for detailed\ncaptioning. Interestingly, while LMMs demonstrate minimal object existence\nhallucination in existing VQA benchmarks, our proposed evaluation reveals\ncontinued susceptibility to such hallucinations. In this paper, we make the\nfirst attempt to investigate such hallucination from different aspects,\nincluding image resolution, the language decoder size, and instruction data\namount, quality, granularity. 
Our findings underscore the unwarranted inference\nwhen the language description includes details at a finer object granularity\nthan what the vision module can ground or verify, thus inducing hallucination.\nTo control such hallucinations, we further attribute the reliability of\ncaptioning to contextual knowledge (involving only contextually grounded\nobjects) and parametric knowledge (containing inferred objects by the model).\nThus, we introduce $\\textit{HallE-Control}$, a controllable LMM in terms of\n$\\textbf{Hall}$ucination in object $\\textbf{E}$xistence. HallE-Control can\ncondition the captioning to shift between (i) exclusively depicting contextual\nknowledge for grounded objects and (ii) blending it with parametric knowledge\nto imagine inferred objects. Our method reduces hallucination by 44% compared\nto LLaVA$_{7B}$ and maintains the object coverage.\n","authors":["Bohan Zhai","Shijia Yang","Chenfeng Xu","Sheng Shen","Kurt Keutzer","Chunyuan Li","Manling Li"],"pdf_url":"https://arxiv.org/pdf/2310.01779v3.pdf","comment":"Our code is publicly available at\n https://github.com/bronyayang/HallE_Control"},{"id":"http://arxiv.org/abs/2403.19866v1","updated":"2024-03-28T22:25:05Z","published":"2024-03-28T22:25:05Z","title":"Is Synthetic Image Useful for Transfer Learning? An Investigation into\n Data Generation, Volume, and Utilization","summary":" Synthetic image data generation represents a promising avenue for training\ndeep learning models, particularly in the realm of transfer learning, where\nobtaining real images within a specific domain can be prohibitively expensive\ndue to privacy and intellectual property considerations. This work delves into\nthe generation and utilization of synthetic images derived from text-to-image\ngenerative models in facilitating transfer learning paradigms. Despite the high\nvisual fidelity of the generated images, we observe that their naive\nincorporation into existing real-image datasets does not consistently enhance\nmodel performance due to the inherent distribution gap between synthetic and\nreal images. To address this issue, we introduce a novel two-stage framework\ncalled bridged transfer, which initially employs synthetic images for\nfine-tuning a pre-trained model to improve its transferability and subsequently\nuses real data for rapid adaptation. Alongside, We propose dataset style\ninversion strategy to improve the stylistic alignment between synthetic and\nreal images. Our proposed methods are evaluated across 10 different datasets\nand 5 distinct models, demonstrating consistent improvements, with up to 30%\naccuracy increase on classification tasks. Intriguingly, we note that the\nenhancements were not yet saturated, indicating that the benefits may further\nincrease with an expanded volume of synthetic data.\n","authors":["Yuhang Li","Xin Dong","Chen Chen","Jingtao Li","Yuxin Wen","Michael Spranger","Lingjuan Lyu"],"pdf_url":"https://arxiv.org/pdf/2403.19866v1.pdf","comment":"ICLR24 Score 6865\n https://openreview.net/forum?id=CjPt1AC6w0&referrer=%5Bthe%20profile%20of%20Chen%20Chen%5D(%2Fprofile%3Fid%3D~Chen_Chen20)"},{"id":"http://arxiv.org/abs/2403.19863v1","updated":"2024-03-28T22:17:19Z","published":"2024-03-28T22:17:19Z","title":"DeNetDM: Debiasing by Network Depth Modulation","summary":" When neural networks are trained on biased datasets, they tend to\ninadvertently learn spurious correlations, leading to challenges in achieving\nstrong generalization and robustness. 
Current approaches to address such biases\ntypically involve utilizing bias annotations, reweighting based on pseudo-bias\nlabels, or enhancing diversity within bias-conflicting data points through\naugmentation techniques. We introduce DeNetDM, a novel debiasing method based\non the observation that shallow neural networks prioritize learning core\nattributes, while deeper ones emphasize biases when tasked with acquiring\ndistinct information. Using a training paradigm derived from Product of\nExperts, we create both biased and debiased branches with deep and shallow\narchitectures and then distill knowledge to produce the target debiased model.\nExtensive experiments and analyses demonstrate that our approach outperforms\ncurrent debiasing techniques, achieving a notable improvement of around 5% in\nthree datasets, encompassing both synthetic and real-world data. Remarkably,\nDeNetDM accomplishes this without requiring annotations pertaining to bias\nlabels or bias types, while still delivering performance on par with supervised\ncounterparts. Furthermore, our approach effectively harnesses the diversity of\nbias-conflicting points within the data, surpassing previous methods and\nobviating the need for explicit augmentation-based methods to enhance the\ndiversity of such bias-conflicting points. The source code will be available\nupon acceptance.\n","authors":["Silpa Vadakkeeveetil Sreelatha","Adarsh Kappiyath","Anjan Dutta"],"pdf_url":"https://arxiv.org/pdf/2403.19863v1.pdf","comment":"23 pages including supplementary"},{"id":"http://arxiv.org/abs/2312.10144v3","updated":"2024-03-28T21:32:10Z","published":"2023-12-15T19:00:07Z","title":"Data-Efficient Multimodal Fusion on a Single GPU","summary":" The goal of multimodal alignment is to learn a single latent space that is\nshared between multimodal inputs. The most powerful models in this space have\nbeen trained using massive datasets of paired inputs and large-scale\ncomputational resources, making them prohibitively expensive to train in many\npractical scenarios. We surmise that existing unimodal encoders pre-trained on\nlarge amounts of unimodal data should provide an effective bootstrap to create\nmultimodal models from unimodal ones at much lower costs. We therefore propose\nFuseMix, a multimodal augmentation scheme that operates on the latent spaces of\narbitrary pre-trained unimodal encoders. Using FuseMix for multimodal\nalignment, we achieve competitive performance -- and in certain cases\noutperform state-of-the art methods -- in both image-text and audio-text\nretrieval, with orders of magnitude less compute and data: for example, we\noutperform CLIP on the Flickr30K text-to-image retrieval task with $\\sim \\!\n600\\times$ fewer GPU days and $\\sim \\! 80\\times$ fewer image-text pairs.\nAdditionally, we show how our method can be applied to convert pre-trained\ntext-to-image generative models into audio-to-image ones. Code is available at:\nhttps://github.com/layer6ai-labs/fusemix.\n","authors":["Noël Vouitsis","Zhaoyan Liu","Satya Krishna Gorti","Valentin Villecroze","Jesse C. Cresswell","Guangwei Yu","Gabriel Loaiza-Ganem","Maksims Volkovs"],"pdf_url":"https://arxiv.org/pdf/2312.10144v3.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2402.17951v3","updated":"2024-03-28T21:29:56Z","published":"2024-02-28T00:20:25Z","title":"QN-Mixer: A Quasi-Newton MLP-Mixer Model for Sparse-View CT\n Reconstruction","summary":" Inverse problems span across diverse fields. 
In medical contexts, computed\ntomography (CT) plays a crucial role in reconstructing a patient's internal\nstructure, presenting challenges due to artifacts caused by inherently\nill-posed inverse problems. Previous research advanced image quality via\npost-processing and deep unrolling algorithms but faces challenges, such as\nextended convergence times with ultra-sparse data. Despite enhancements,\nresulting images often show significant artifacts, limiting their effectiveness\nfor real-world diagnostic applications. We aim to explore deep second-order\nunrolling algorithms for solving imaging inverse problems, emphasizing their\nfaster convergence and lower time complexity compared to common first-order\nmethods like gradient descent. In this paper, we introduce QN-Mixer, an\nalgorithm based on the quasi-Newton approach. We use learned parameters through\nthe BFGS algorithm and introduce Incept-Mixer, an efficient neural architecture\nthat serves as a non-local regularization term, capturing long-range\ndependencies within images. To address the computational demands typically\nassociated with quasi-Newton algorithms that require full Hessian matrix\ncomputations, we present a memory-efficient alternative. Our approach\nintelligently downsamples gradient information, significantly reducing\ncomputational requirements while maintaining performance. The approach is\nvalidated through experiments on the sparse-view CT problem, involving various\ndatasets and scanning protocols, and is compared with post-processing and deep\nunrolling state-of-the-art approaches. Our method outperforms existing\napproaches and achieves state-of-the-art performance in terms of SSIM and PSNR,\nall while reducing the number of unrolling iterations required.\n","authors":["Ishak Ayad","Nicolas Larue","Maï K. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2402.17951v3.pdf","comment":"Accepted at CVPR 2024. Project page:\n https://towzeur.github.io/QN-Mixer/"},{"id":"http://arxiv.org/abs/2312.00598v2","updated":"2024-03-28T21:29:55Z","published":"2023-12-01T14:03:30Z","title":"Learning from One Continuous Video Stream","summary":" We introduce a framework for online learning from a single continuous video\nstream -- the way people and animals learn, without mini-batches, data\naugmentation or shuffling. This poses great challenges given the high\ncorrelation between consecutive video frames and there is very little prior\nwork on it. Our framework allows us to do a first deep dive into the topic and\nincludes a collection of streams and tasks composed from two existing video\ndatasets, plus methodology for performance evaluation that considers both\nadaptation and generalization. We employ pixel-to-pixel modelling as a\npractical and flexible way to switch between pre-training and single-stream\nevaluation as well as between arbitrary tasks, without ever requiring changes\nto models and always using the same pixel loss. Equipped with this framework we\nobtained large single-stream learning gains from pre-training with a novel\nfamily of future prediction tasks, found that momentum hurts, and that the pace\nof weight updates matters. 
The combination of these insights leads to matching\nthe performance of IID learning with batch size 1, when using the same\narchitecture and without costly replay buffers.\n","authors":["João Carreira","Michael King","Viorica Pătrăucean","Dilara Gokay","Cătălin Ionescu","Yi Yang","Daniel Zoran","Joseph Heyward","Carl Doersch","Yusuf Aytar","Dima Damen","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2312.00598v2.pdf","comment":"CVPR camera ready version"},{"id":"http://arxiv.org/abs/2403.17343v3","updated":"2024-03-28T21:28:00Z","published":"2024-03-26T03:05:20Z","title":"Residual-based Language Models are Free Boosters for Biomedical Imaging","summary":" In this study, we uncover the unexpected efficacy of residual-based large\nlanguage models (LLMs) as part of encoders for biomedical imaging tasks, a\ndomain traditionally devoid of language or textual data. The approach diverges\nfrom established methodologies by utilizing a frozen transformer block,\nextracted from pre-trained LLMs, as an innovative encoder layer for the direct\nprocessing of visual tokens. This strategy represents a significant departure\nfrom the standard multi-modal vision-language frameworks, which typically hinge\non language-driven prompts and inputs. We found that these LLMs could boost\nperformance across a spectrum of biomedical imaging applications, including\nboth 2D and 3D visual classification tasks, serving as plug-and-play boosters.\nMore interestingly, as a byproduct, we found that the proposed framework\nachieved superior performance, setting new state-of-the-art results on\nextensive, standardized datasets in MedMNIST-2D and 3D. Through this work, we\naim to open new avenues for employing LLMs in biomedical imaging and enriching\nthe understanding of their potential in this specialized domain.\n","authors":["Zhixin Lai","Jing Wu","Suiyao Chen","Yucheng Zhou","Naira Hovakimyan"],"pdf_url":"https://arxiv.org/pdf/2403.17343v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19838v1","updated":"2024-03-28T21:18:33Z","published":"2024-03-28T21:18:33Z","title":"Multi-Frame, Lightweight & Efficient Vision-Language Models for Question\n Answering in Autonomous Driving","summary":" Vision-Language Models (VLMs) and Multi-Modal Language models (MMLMs) have\nbecome prominent in autonomous driving research, as these models can provide\ninterpretable textual reasoning and responses for end-to-end autonomous driving\nsafety tasks using traffic scene images and other data modalities. However,\ncurrent approaches to these systems use expensive large language model (LLM)\nbackbones and image encoders, making such systems unsuitable for real-time\nautonomous driving systems where tight memory constraints exist and fast\ninference time is necessary. To address these previous issues, we develop\nEM-VLM4AD, an efficient, lightweight, multi-frame vision language model which\nperforms Visual Question Answering for autonomous driving. In comparison to\nprevious approaches, EM-VLM4AD requires at least 10 times less memory and\nfloating point operations, while also achieving higher BLEU-4, METEOR, CIDEr,\nand ROGUE scores than the existing baseline on the DriveLM dataset. EM-VLM4AD\nalso exhibits the ability to extract relevant information from traffic views\nrelated to prompts and can answer questions for various autonomous driving\nsubtasks. 
We release our code to train and evaluate our model at\nhttps://github.com/akshaygopalkr/EM-VLM4AD.\n","authors":["Akshay Gopalkrishnan","Ross Greer","Mohan Trivedi"],"pdf_url":"https://arxiv.org/pdf/2403.19838v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2403.19837v1","updated":"2024-03-28T21:15:38Z","published":"2024-03-28T21:15:38Z","title":"Concept-based Analysis of Neural Networks via Vision-Language Models","summary":" Formal analysis of vision-based deep neural networks (DNNs) is highly\ndesirable but it is very challenging due to the difficulty of expressing formal\nspecifications for vision tasks and the lack of efficient verification\nprocedures. In this paper, we propose to leverage emerging multimodal,\nvision-language, foundation models (VLMs) as a lens through which we can reason\nabout vision models. VLMs have been trained on a large body of images\naccompanied by their textual description, and are thus implicitly aware of\nhigh-level, human-understandable concepts describing the images. We describe a\nlogical specification language $\\texttt{Con}_{\\texttt{spec}}$ designed to\nfacilitate writing specifications in terms of these concepts. To define and\nformally check $\\texttt{Con}_{\\texttt{spec}}$ specifications, we leverage a\nVLM, which provides a means to encode and efficiently check natural-language\nproperties of vision models. We demonstrate our techniques on a ResNet-based\nclassifier trained on the RIVAL-10 dataset leveraging CLIP as the multimodal\nmodel.\n","authors":["Ravi Mangal","Nina Narodytska","Divya Gopinath","Boyue Caroline Hu","Anirban Roy","Susmit Jha","Corina Pasareanu"],"pdf_url":"https://arxiv.org/pdf/2403.19837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17166v2","updated":"2024-03-28T20:49:55Z","published":"2023-09-29T11:59:57Z","title":"Advances in Kidney Biopsy Lesion Assessment through Dense Instance\n Segmentation","summary":" Renal biopsies are the gold standard for diagnosis of kidney diseases. Lesion\nscores made by renal pathologists are semi-quantitative and exhibit high\ninter-observer variability. Automating lesion classification within segmented\nanatomical structures can provide decision support in quantification analysis\nand reduce the inter-observer variability. Nevertheless, classifying lesions in\nregions-of-interest (ROIs) is clinically challenging due to (a) a large amount\nof densely packed anatomical objects (up to 1000), (b) class imbalance across\ndifferent compartments (at least 3), (c) significant variation in object scales\n(i.e. sizes and shapes), and (d) the presence of multi-label lesions per\nanatomical structure. Existing models lack the capacity to address these\ncomplexities efficiently and generically. This paper presents \\textbf{a\ngeneralized technical solution} for large-scale, multi-source datasets with\ndiverse lesions. Our approach utilizes two sub-networks: dense instance\nsegmentation and lesion classification. We introduce \\textbf{DiffRegFormer}, an\nend-to-end dense instance segmentation model designed for multi-class,\nmulti-scale objects within ROIs. Combining diffusion models, transformers, and\nRCNNs, DiffRegFormer efficiently recognizes over 500 objects across three\nanatomical classes (glomeruli, tubuli, arteries) within ROIs on a single NVIDIA\nGeForce RTX 3090 GPU. On a dataset of 303 ROIs (from 148 Jones' silver-stained\nrenal WSIs), it outperforms state of art models, achieving AP of 52.1\\%\n(detection) and 46.8\\% (segmentation). 
Our lesion classification sub-network\nachieves 89.2\\% precision and 64.6\\% recall on 21889 object patches (from the\n303 ROIs). Importantly, the model demonstrates direct domain transfer to\nPAS-stained WSIs without fine-tuning.\n","authors":["Zhan Xiong","Junling He","Pieter Valkema","Tri Q. Nguyen","Maarten Naesens","Jesper Kers","Fons J. Verbeek"],"pdf_url":"https://arxiv.org/pdf/2309.17166v2.pdf","comment":"16 pages, 15 figures, 6 tables, Journal"},{"id":"http://arxiv.org/abs/2403.13199v2","updated":"2024-03-28T20:06:38Z","published":"2024-03-19T23:23:35Z","title":"DecentNeRFs: Decentralized Neural Radiance Fields from Crowdsourced\n Images","summary":" Neural radiance fields (NeRFs) show potential for transforming images\ncaptured worldwide into immersive 3D visual experiences. However, most of this\ncaptured visual data remains siloed in our camera rolls as these images contain\npersonal details. Even if made public, the problem of learning 3D\nrepresentations of billions of scenes captured daily in a centralized manner is\ncomputationally intractable. Our approach, DecentNeRF, is the first attempt at\ndecentralized, crowd-sourced NeRFs that require $\\sim 10^4\\times$ less server\ncomputing for a scene than a centralized approach. Instead of sending the raw\ndata, our approach requires users to send a 3D representation, distributing the\nhigh computation cost of training centralized NeRFs between the users. It\nlearns photorealistic scene representations by decomposing users' 3D views into\npersonal and global NeRFs and a novel optimally weighted aggregation of only\nthe latter. We validate the advantage of our approach to learn NeRFs with\nphotorealism and minimal server computation cost on structured synthetic and\nreal-world photo tourism datasets. We further analyze how secure aggregation of\nglobal NeRFs in DecentNeRF minimizes the undesired reconstruction of personal\ncontent by the server.\n","authors":["Zaid Tasneem","Akshat Dave","Abhishek Singh","Kushagra Tiwary","Praneeth Vepakomma","Ashok Veeraraghavan","Ramesh Raskar"],"pdf_url":"https://arxiv.org/pdf/2403.13199v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02402v2","updated":"2024-03-28T20:01:02Z","published":"2023-11-04T13:28:06Z","title":"Hybrid quantum image classification and federated learning for hepatic\n steatosis diagnosis","summary":" In the realm of liver transplantation, accurately determining hepatic\nsteatosis levels is crucial. Recognizing the essential need for improved\ndiagnostic precision, particularly for optimizing diagnosis time by swiftly\nhandling easy-to-solve cases and allowing the expert time to focus on more\ncomplex cases, this study aims to develop cutting-edge algorithms that enhance\nthe classification of liver biopsy images. Additionally, the challenge of\nmaintaining data privacy arises when creating automated algorithmic solutions,\nas sharing patient data between hospitals is restricted, further complicating\nthe development and validation process. This research tackles diagnostic\naccuracy by leveraging novel techniques from the rapidly evolving field of\nquantum machine learning, known for their superior generalization abilities.\nConcurrently, it addresses privacy concerns through the implementation of\nprivacy-conscious collaborative machine learning with federated learning. We\nintroduce a hybrid quantum neural network model that leverages real-world\nclinical data to assess non-alcoholic liver steatosis accurately. 
This model\nachieves an image classification accuracy of 97%, surpassing traditional\nmethods by 1.8%. Moreover, by employing a federated learning approach that\nallows data from different clients to be shared while ensuring privacy, we\nmaintain an accuracy rate exceeding 90%. This initiative marks a significant\nstep towards a scalable, collaborative, efficient, and dependable computational\nframework that aids clinical pathologists in their daily diagnostic tasks.\n","authors":["Luca Lusnig","Asel Sagingalieva","Mikhail Surmach","Tatjana Protasevich","Ovidiu Michiu","Joseph McLoughlin","Christopher Mansell","Graziano de' Petris","Deborah Bonazza","Fabrizio Zanconati","Alexey Melnikov","Fabio Cavalli"],"pdf_url":"https://arxiv.org/pdf/2311.02402v2.pdf","comment":"13 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2403.19811v1","updated":"2024-03-28T19:45:35Z","published":"2024-03-28T19:45:35Z","title":"X-MIC: Cross-Modal Instance Conditioning for Egocentric Action\n Generalization","summary":" Lately, there has been growing interest in adapting vision-language models\n(VLMs) to image and third-person video classification due to their success in\nzero-shot recognition. However, the adaptation of these models to egocentric\nvideos has been largely unexplored. To address this gap, we propose a simple\nyet effective cross-modal adaptation framework, which we call X-MIC. Using a\nvideo adapter, our pipeline learns to align frozen text embeddings to each\negocentric video directly in the shared embedding space. Our novel adapter\narchitecture retains and improves generalization of the pre-trained VLMs by\ndisentangling learnable temporal modeling and frozen visual encoder. This\nresults in an enhanced alignment of text embeddings to each egocentric video,\nleading to a significant improvement in cross-dataset generalization. We\nevaluate our approach on the Epic-Kitchens, Ego4D, and EGTEA datasets for\nfine-grained cross-dataset action generalization, demonstrating the\neffectiveness of our method. Code is available at\nhttps://github.com/annusha/xmic\n","authors":["Anna Kukleva","Fadime Sener","Edoardo Remelli","Bugra Tekin","Eric Sauser","Bernt Schiele","Shugao Ma"],"pdf_url":"https://arxiv.org/pdf/2403.19811v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19787v1","updated":"2024-03-28T19:11:26Z","published":"2024-03-28T19:11:26Z","title":"JIST: Joint Image and Sequence Training for Sequential Visual Place\n Recognition","summary":" Visual Place Recognition aims at recognizing previously visited places by\nrelying on visual clues, and it is used in robotics applications for SLAM and\nlocalization. Since typically a mobile robot has access to a continuous stream\nof frames, this task is naturally cast as a sequence-to-sequence localization\nproblem. Nevertheless, obtaining sequences of labelled data is much more\nexpensive than collecting isolated images, which can be done in an automated\nway with little supervision. As a mitigation to this problem, we propose a\nnovel Joint Image and Sequence Training protocol (JIST) that leverages large\nuncurated sets of images through a multi-task learning framework. With JIST we\nalso introduce SeqGeM, an aggregation layer that revisits the popular GeM\npooling to produce a single robust and compact embedding from a sequence of\nsingle-frame embeddings. 
We show that our model is able to outperform the previous\nstate of the art while being faster, using 8 times smaller descriptors, having\na lighter architecture and allowing it to process sequences of various lengths.\nCode is available at https://github.com/ga1i13o/JIST\n","authors":["Gabriele Berton","Gabriele Trivigno","Barbara Caputo","Carlo Masone"],"pdf_url":"https://arxiv.org/pdf/2403.19787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19786v1","updated":"2024-03-28T19:10:54Z","published":"2024-03-28T19:10:54Z","title":"Zero-shot Prompt-based Video Encoder for Surgical Gesture Recognition","summary":" Purpose: Surgical video is an important data stream for gesture recognition.\nThus, robust visual encoders for such data streams are similarly important.\nMethods: Leveraging the Bridge-Prompt framework, we fine-tune a pre-trained\nvision-text model (CLIP) for gesture recognition in surgical videos. This can\nutilize extensive outside video data such as text, but also make use of label\nmeta-data and weakly supervised contrastive losses. Results: Our experiments\nshow that the prompt-based video encoder outperforms standard encoders in surgical\ngesture recognition tasks. Notably, it displays strong performance in zero-shot\nscenarios, where gestures/tasks that were not provided during the encoder\ntraining phase are included in the prediction phase. Additionally, we measure\nthe benefit of including text descriptions in the feature extractor training\nscheme. Conclusion: Bridge-Prompt and similar pre-trained+fine-tuned video\nencoder models provide strong visual representations for surgical robotics,\nespecially in gesture recognition tasks. Given the diverse range of surgical\ntasks (gestures), the ability of these models to zero-shot transfer without the\nneed for any task (gesture) specific retraining makes them invaluable.\n","authors":["Mingxing Rao","Yinhong Qin","Soheil Kolouri","Jie Ying Wu","Daniel Moyer"],"pdf_url":"https://arxiv.org/pdf/2403.19786v1.pdf","comment":"17 pages, 4 figures, 7 tables, IPCAI 2024"},{"id":"http://arxiv.org/abs/2403.19782v1","updated":"2024-03-28T19:07:26Z","published":"2024-03-28T19:07:26Z","title":"ENet-21: An Optimized light CNN Structure for Lane Detection","summary":" Lane detection for autonomous vehicles is an important concept, yet it is a\nchallenging issue for driver assistance systems in modern vehicles. The\nemergence of deep learning has led to significant progress in self-driving cars.\nConventional deep learning-based methods handle lane detection problems as a\nbinary segmentation task and determine whether a pixel belongs to a line. These\nmethods rely on the assumption of a fixed number of lanes, which does not\nalways work. This study aims to develop an optimal structure for the lane\ndetection problem, offering a promising solution for driver assistance features\nin modern vehicles by utilizing a machine learning method consisting of binary\nsegmentation and Affinity Fields that can manage varying numbers of lanes and\nlane change scenarios. In this approach, the Convolutional Neural Network\n(CNN) is selected as the feature extractor, and the final output is obtained\nthrough clustering of the semantic segmentation and Affinity Field outputs. 
Our\nmethod uses less complex CNN architecture than exi\n","authors":["Seyed Rasoul Hosseini","Mohammad Teshnehlab"],"pdf_url":"https://arxiv.org/pdf/2403.19782v1.pdf","comment":"The paper is under review by Soft Computing journal"},{"id":"http://arxiv.org/abs/2403.19780v1","updated":"2024-03-28T19:06:37Z","published":"2024-03-28T19:06:37Z","title":"Mitigating Motion Blur in Neural Radiance Fields with Events and Frames","summary":" Neural Radiance Fields (NeRFs) have shown great potential in novel view\nsynthesis. However, they struggle to render sharp images when the data used for\ntraining is affected by motion blur. On the other hand, event cameras excel in\ndynamic scenes as they measure brightness changes with microsecond resolution\nand are thus only marginally affected by blur. Recent methods attempt to\nenhance NeRF reconstructions under camera motion by fusing frames and events.\nHowever, they face challenges in recovering accurate color content or constrain\nthe NeRF to a set of predefined camera poses, harming reconstruction quality in\nchallenging conditions. This paper proposes a novel formulation addressing\nthese issues by leveraging both model- and learning-based modules. We\nexplicitly model the blur formation process, exploiting the event double\nintegral as an additional model-based prior. Additionally, we model the\nevent-pixel response using an end-to-end learnable response function, allowing\nour method to adapt to non-idealities in the real event-camera sensor. We show,\non synthetic and real data, that the proposed approach outperforms existing\ndeblur NeRFs that use only frames as well as those that combine frames and\nevents by +6.13dB and +2.48dB, respectively.\n","authors":["Marco Cannici","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2403.19780v1.pdf","comment":"IEEE Conference on Computer Vision and Pattern Recognition (CVPR),\n 2024"},{"id":"http://arxiv.org/abs/2403.19776v1","updated":"2024-03-28T18:58:43Z","published":"2024-03-28T18:58:43Z","title":"CLoRA: A Contrastive Approach to Compose Multiple LoRA Models","summary":" Low-Rank Adaptations (LoRAs) have emerged as a powerful and popular technique\nin the field of image generation, offering a highly effective way to adapt and\nrefine pre-trained deep learning models for specific tasks without the need for\ncomprehensive retraining. By employing pre-trained LoRA models, such as those\nrepresenting a specific cat and a particular dog, the objective is to generate\nan image that faithfully embodies both animals as defined by the LoRAs.\nHowever, the task of seamlessly blending multiple concept LoRAs to capture a\nvariety of concepts in one image proves to be a significant challenge. Common\napproaches often fall short, primarily because the attention mechanisms within\ndifferent LoRA models overlap, leading to scenarios where one concept may be\ncompletely ignored (e.g., omitting the dog) or where concepts are incorrectly\ncombined (e.g., producing an image of two cats instead of one cat and one dog).\nTo overcome these issues, CLoRA addresses them by updating the attention maps\nof multiple LoRA models and leveraging them to create semantic masks that\nfacilitate the fusion of latent representations. Our method enables the\ncreation of composite images that truly reflect the characteristics of each\nLoRA, successfully merging multiple concepts or styles. 
Our comprehensive\nevaluations, both qualitative and quantitative, demonstrate that our approach\noutperforms existing methodologies, marking a significant advancement in the\nfield of image generation with LoRAs. Furthermore, we share our source code,\nbenchmark dataset, and trained LoRA models to promote further research on this\ntopic.\n","authors":["Tuna Han Salih Meral","Enis Simsar","Federico Tombari","Pinar Yanardag"],"pdf_url":"https://arxiv.org/pdf/2403.19776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03629v2","updated":"2024-03-28T18:50:50Z","published":"2023-10-05T16:03:25Z","title":"Wasserstein Distortion: Unifying Fidelity and Realism","summary":" We introduce a distortion measure for images, Wasserstein distortion, that\nsimultaneously generalizes pixel-level fidelity on the one hand and realism or\nperceptual quality on the other. We show how Wasserstein distortion reduces to\na pure fidelity constraint or a pure realism constraint under different\nparameter choices and discuss its metric properties. Pairs of images that are\nclose under Wasserstein distortion illustrate its utility. In particular, we\ngenerate random textures that have high fidelity to a reference texture in one\nlocation of the image and smoothly transition to an independent realization of\nthe texture as one moves away from this point. Wasserstein distortion attempts\nto generalize and unify prior work on texture generation, image realism and\ndistortion, and models of the early human visual system, in the form of an\noptimizable metric in the mathematical sense.\n","authors":["Yang Qiu","Aaron B. Wagner","Johannes Ballé","Lucas Theis"],"pdf_url":"https://arxiv.org/pdf/2310.03629v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19773v1","updated":"2024-03-28T18:50:19Z","published":"2024-03-28T18:50:19Z","title":"ShapeFusion: A 3D diffusion model for localized shape editing","summary":" In the realm of 3D computer vision, parametric models have emerged as a\nground-breaking methodology for the creation of realistic and expressive 3D\navatars. Traditionally, they rely on Principal Component Analysis (PCA), given\nits ability to decompose data to an orthonormal space that maximally captures\nshape variations. However, due to the orthogonality constraints and the global\nnature of PCA's decomposition, these models struggle to perform localized and\ndisentangled editing of 3D shapes, which severely affects their use in\napplications requiring fine control such as face sculpting. In this paper, we\nleverage diffusion models to enable diverse and fully localized edits on 3D\nmeshes, while completely preserving the un-edited regions. We propose an\neffective diffusion masking training strategy that, by design, facilitates\nlocalized manipulation of any shape region, without being limited to predefined\nregions or to sparse sets of predefined control vertices. Following our\nframework, a user can explicitly set their manipulation region of choice and\ndefine an arbitrary set of vertices as handles to edit a 3D mesh. Compared to\nthe current state-of-the-art our method leads to more interpretable shape\nmanipulations than methods relying on latent code state, greater localization\nand generation diversity while offering faster inference than optimization\nbased approaches. 
Project page: https://rolpotamias.github.io/Shapefusion/\n","authors":["Rolandos Alexandros Potamias","Michail Tarasiou Stylianos Ploumpis","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2403.19773v1.pdf","comment":"Project Page: https://rolpotamias.github.io/Shapefusion/"},{"id":"http://arxiv.org/abs/2210.06186v3","updated":"2024-03-28T18:49:33Z","published":"2022-10-12T13:15:54Z","title":"GOTCHA: Real-Time Video Deepfake Detection via Challenge-Response","summary":" With the rise of AI-enabled Real-Time Deepfakes (RTDFs), the integrity of\nonline video interactions has become a growing concern. RTDFs have now made it\nfeasible to replace an imposter's face with their victim in live video\ninteractions. Such advancement in deepfakes also coaxes detection to rise to\nthe same standard. However, existing deepfake detection techniques are\nasynchronous and hence ill-suited for RTDFs. To bridge this gap, we propose a\nchallenge-response approach that establishes authenticity in live settings. We\nfocus on talking-head style video interaction and present a taxonomy of\nchallenges that specifically target inherent limitations of RTDF generation\npipelines. We evaluate representative examples from the taxonomy by collecting\na unique dataset comprising eight challenges, which consistently and visibly\ndegrades the quality of state-of-the-art deepfake generators. These results are\ncorroborated both by humans and a new automated scoring function, leading to\n88.6% and 80.1% AUC, respectively. The findings underscore the promising\npotential of challenge-response systems for explainable and scalable real-time\ndeepfake detection in practical scenarios. We provide access to data and code\nat https://github.com/mittalgovind/GOTCHA-Deepfakes\n","authors":["Govind Mittal","Chinmay Hegde","Nasir Memon"],"pdf_url":"https://arxiv.org/pdf/2210.06186v3.pdf","comment":"20 pages, 19 figures, Code and data released"},{"id":"http://arxiv.org/abs/2403.19768v1","updated":"2024-03-28T18:43:25Z","published":"2024-03-28T18:43:25Z","title":"Using Deep Learning to Increase Eye-Tracking Robustness, Accuracy, and\n Precision in Virtual Reality","summary":" Algorithms for the estimation of gaze direction from mobile and video-based\neye trackers typically involve tracking a feature of the eye that moves through\nthe eye camera image in a way that covaries with the shifting gaze direction,\nsuch as the center or boundaries of the pupil. Tracking these features using\ntraditional computer vision techniques can be difficult due to partial\nocclusion and environmental reflections. Although recent efforts to use machine\nlearning (ML) for pupil tracking have demonstrated superior results when\nevaluated using standard measures of segmentation performance, little is known\nof how these networks may affect the quality of the final gaze estimate. This\nwork provides an objective assessment of the impact of several contemporary\nML-based methods for eye feature tracking when the subsequent gaze estimate is\nproduced using either feature-based or model-based methods. Metrics include the\naccuracy and precision of the gaze estimate, as well as drop-out rate.\n","authors":["Kevin Barkevich","Reynold Bailey","Gabriel J. 
Diaz"],"pdf_url":"https://arxiv.org/pdf/2403.19768v1.pdf","comment":"16 pages, 10 figures, accepted to ETRA 2024 Full Papers"},{"id":"http://arxiv.org/abs/2302.06089v5","updated":"2024-03-28T18:31:28Z","published":"2023-02-13T04:17:47Z","title":"Federated attention consistent learning models for prostate cancer\n diagnosis and Gleason grading","summary":" Artificial intelligence (AI) holds significant promise in transforming\nmedical imaging, enhancing diagnostics, and refining treatment strategies.\nHowever, the reliance on extensive multicenter datasets for training AI models\nposes challenges due to privacy concerns. Federated learning provides a\nsolution by facilitating collaborative model training across multiple centers\nwithout sharing raw data. This study introduces a federated\nattention-consistent learning (FACL) framework to address challenges associated\nwith large-scale pathological images and data heterogeneity. FACL enhances\nmodel generalization by maximizing attention consistency between local clients\nand the server model. To ensure privacy and validate robustness, we\nincorporated differential privacy by introducing noise during parameter\ntransfer. We assessed the effectiveness of FACL in cancer diagnosis and Gleason\ngrading tasks using 19,461 whole-slide images of prostate cancer from multiple\ncenters. In the diagnosis task, FACL achieved an area under the curve (AUC) of\n0.9718, outperforming seven centers with an average AUC of 0.9499 when\ncategories are relatively balanced. For the Gleason grading task, FACL attained\na Kappa score of 0.8463, surpassing the average Kappa score of 0.7379 from six\ncenters. In conclusion, FACL offers a robust, accurate, and cost-effective AI\ntraining model for prostate cancer pathology while maintaining effective data\nsafeguards.\n","authors":["Fei Kong","Xiyue Wang","Jinxi Xiang","Sen Yang","Xinran Wang","Meng Yue","Jun Zhang","Junhan Zhao","Xiao Han","Yuhan Dong","Biyue Zhu","Fang Wang","Yueping Liu"],"pdf_url":"https://arxiv.org/pdf/2302.06089v5.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2311.17693v2","updated":"2024-03-28T18:24:46Z","published":"2023-11-29T15:00:06Z","title":"Toward a Surgeon-in-the-Loop Ophthalmic Robotic Apprentice using\n Reinforcement and Imitation Learning","summary":" Robotic-assisted surgical systems have demonstrated significant potential in\nenhancing surgical precision and minimizing human errors. However, existing\nsystems lack the ability to accommodate the unique preferences and requirements\nof individual surgeons. Additionally, they primarily focus on general surgeries\n(e.g., laparoscopy) and are not suitable for highly precise microsurgeries,\nsuch as ophthalmic procedures. Thus, we propose a simulation-based image-guided\napproach for surgeon-centered autonomous agents that can adapt to the\nindividual surgeon's skill level and preferred surgical techniques during\nophthalmic cataract surgery. Our approach utilizes a simulated environment to\ntrain reinforcement and imitation learning agents guided by image data to\nperform all tasks of the incision phase of cataract surgery. By integrating the\nsurgeon's actions and preferences into the training process with the\nsurgeon-in-the-loop, our approach enables the robot to implicitly learn and\nadapt to the individual surgeon's unique approach through demonstrations. This\nresults in a more intuitive and personalized surgical experience for the\nsurgeon. 
Simultaneously, it ensures consistent performance for the autonomous\nrobotic apprentice. We define and evaluate the effectiveness of our approach\nusing our proposed metrics; and highlight the trade-off between a generic agent\nand a surgeon-centered adapted agent. Moreover, our approach has the potential\nto extend to other ophthalmic surgical procedures, opening the door to a new\ngeneration of surgeon-in-the-loop autonomous surgical robots. We provide an\nopen-source simulation framework for future development and reproducibility.\n","authors":["Amr Gomaa","Bilal Mahdy","Niko Kleer","Antonio Krüger"],"pdf_url":"https://arxiv.org/pdf/2311.17693v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09997v3","updated":"2024-03-28T18:24:20Z","published":"2023-07-19T14:10:55Z","title":"TUNeS: A Temporal U-Net with Self-Attention for Video-based Surgical\n Phase Recognition","summary":" To enable context-aware computer assistance in the operating room of the\nfuture, cognitive systems need to understand automatically which surgical phase\nis being performed by the medical team. The primary source of information for\nsurgical phase recognition is typically video, which presents two challenges:\nextracting meaningful features from the video stream and effectively modeling\ntemporal information in the sequence of visual features. For temporal modeling,\nattention mechanisms have gained popularity due to their ability to capture\nlong-range dependencies. In this paper, we explore design choices for attention\nin existing temporal models for surgical phase recognition and propose a novel\napproach that uses attention more effectively and does not require hand-crafted\nconstraints: TUNeS, an efficient and simple temporal model that incorporates\nself-attention at the core of a convolutional U-Net structure. In addition, we\npropose to train the feature extractor, a standard CNN, together with an LSTM\non preferably long video segments, i.e., with long temporal context. In our\nexperiments, almost all temporal models performed better on top of feature\nextractors that were trained with longer temporal context. On these\ncontextualized features, TUNeS achieves state-of-the-art results on the\nCholec80 and AutoLaparo datasets.\n","authors":["Isabel Funke","Dominik Rivoir","Stefanie Krell","Stefanie Speidel"],"pdf_url":"https://arxiv.org/pdf/2307.09997v3.pdf","comment":"Major revision: comparison to Temporal U-Transformer"},{"id":"http://arxiv.org/abs/2403.19738v1","updated":"2024-03-28T17:54:38Z","published":"2024-03-28T17:54:38Z","title":"MIST: Mitigating Intersectional Bias with Disentangled Cross-Attention\n Editing in Text-to-Image Diffusion Models","summary":" Diffusion-based text-to-image models have rapidly gained popularity for their\nability to generate detailed and realistic images from textual descriptions.\nHowever, these models often reflect the biases present in their training data,\nespecially impacting marginalized groups. While prior efforts to debias\nlanguage models have focused on addressing specific biases, such as racial or\ngender biases, efforts to tackle intersectional bias have been limited.\nIntersectional bias refers to the unique form of bias experienced by\nindividuals at the intersection of multiple social identities. Addressing\nintersectional bias is crucial because it amplifies the negative effects of\ndiscrimination based on race, gender, and other identities. 
In this paper, we\nintroduce a method that addresses intersectional bias in diffusion-based\ntext-to-image models by modifying cross-attention maps in a disentangled\nmanner. Our approach utilizes a pre-trained Stable Diffusion model, eliminates\nthe need for an additional set of reference images, and preserves the original\nquality for unaltered concepts. Comprehensive experiments demonstrate that our\nmethod surpasses existing approaches in mitigating both single and\nintersectional biases across various attributes. We make our source code and\ndebiased models for various attributes available to encourage fairness in\ngenerative models and to support further research.\n","authors":["Hidir Yesiltepe","Kiymet Akdemir","Pinar Yanardag"],"pdf_url":"https://arxiv.org/pdf/2403.19738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09669v3","updated":"2024-03-28T04:45:23Z","published":"2024-01-30T08:18:20Z","title":"STREAM: Spatio-TempoRal Evaluation and Analysis Metric for Video\n Generative Models","summary":" Image generative models have made significant progress in generating\nrealistic and diverse images, supported by comprehensive guidance from various\nevaluation metrics. However, current video generative models struggle to\ngenerate even short video clips, with limited tools that provide insights for\nimprovements. Current video evaluation metrics are simple adaptations of image\nmetrics by switching the embeddings with video embedding networks, which may\nunderestimate the unique characteristics of video. Our analysis reveals that\nthe widely used Frechet Video Distance (FVD) has a stronger emphasis on the\nspatial aspect than the temporal naturalness of video and is inherently\nconstrained by the input size of the embedding networks used, limiting it to 16\nframes. Additionally, it demonstrates considerable instability and diverges\nfrom human evaluations. To address the limitations, we propose STREAM, a new\nvideo evaluation metric uniquely designed to independently evaluate spatial and\ntemporal aspects. This feature allows comprehensive analysis and evaluation of\nvideo generative models from various perspectives, unconstrained by video\nlength. We provide analytical and experimental evidence demonstrating that\nSTREAM provides an effective evaluation tool for both visual and temporal\nquality of videos, offering insights into area of improvement for video\ngenerative models. To the best of our knowledge, STREAM is the first evaluation\nmetric that can separately assess the temporal and spatial aspects of videos.\nOur code is available at https://github.com/pro2nit/STREAM.\n","authors":["Pum Jun Kim","Seojun Kim","Jaejun Yoo"],"pdf_url":"https://arxiv.org/pdf/2403.09669v3.pdf","comment":"Our work is accepted to ICLR 2024"}]},"2024-03-29T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.20331v1","updated":"2024-03-29T17:59:53Z","published":"2024-03-29T17:59:53Z","title":"Unsolvable Problem Detection: Evaluating Trustworthiness of Vision\n Language Models","summary":" This paper introduces a novel and significant challenge for Vision Language\nModels (VLMs), termed Unsolvable Problem Detection (UPD). UPD examines the\nVLM's ability to withhold answers when faced with unsolvable problems in the\ncontext of Visual Question Answering (VQA) tasks. UPD encompasses three\ndistinct settings: Absent Answer Detection (AAD), Incompatible Answer Set\nDetection (IASD), and Incompatible Visual Question Detection (IVQD). 
To deeply\ninvestigate the UPD problem, extensive experiments indicate that most VLMs,\nincluding GPT-4V and LLaVA-Next-34B, struggle with our benchmarks to varying\nextents, highlighting significant room for the improvements. To address UPD, we\nexplore both training-free and training-based solutions, offering new insights\ninto their effectiveness and limitations. We hope our insights, together with\nfuture efforts within the proposed UPD settings, will enhance the broader\nunderstanding and development of more practical and reliable VLMs.\n","authors":["Atsuyuki Miyai","Jingkang Yang","Jingyang Zhang","Yifei Ming","Qing Yu","Go Irie","Yixuan Li","Hai Li","Ziwei Liu","Kiyoharu Aizawa"],"pdf_url":"https://arxiv.org/pdf/2403.20331v1.pdf","comment":"Code: https://github.com/AtsuMiyai/UPD"},{"id":"http://arxiv.org/abs/2403.20330v1","updated":"2024-03-29T17:59:34Z","published":"2024-03-29T17:59:34Z","title":"Are We on the Right Way for Evaluating Large Vision-Language Models?","summary":" Large vision-language models (LVLMs) have recently achieved rapid progress,\nsparking numerous studies to evaluate their multi-modal capabilities. However,\nwe dig into current evaluation works and identify two primary issues: 1) Visual\ncontent is unnecessary for many samples. The answers can be directly inferred\nfrom the questions and options, or the world knowledge embedded in LLMs. This\nphenomenon is prevalent across current benchmarks. For instance, GeminiPro\nachieves 42.9% on the MMMU benchmark without any visual input, and outperforms\nthe random choice baseline across six benchmarks over 20% on average. 2)\nUnintentional data leakage exists in LLM and LVLM training. LLM and LVLM could\nstill answer some visual-necessary questions without visual content, indicating\nthe memorizing of these samples within large-scale training data. For example,\nSphinx-X-MoE gets 43.6% on MMMU without accessing images, surpassing its LLM\nbackbone with 17.9%. Both problems lead to misjudgments of actual multi-modal\ngains and potentially misguide the study of LVLM. To this end, we present\nMMStar, an elite vision-indispensable multi-modal benchmark comprising 1,500\nsamples meticulously selected by humans. MMStar benchmarks 6 core capabilities\nand 18 detailed axes, aiming to evaluate LVLMs' multi-modal capacities with\ncarefully balanced and purified samples. These samples are first roughly\nselected from current benchmarks with an automated pipeline, human review is\nthen involved to ensure each curated sample exhibits visual dependency, minimal\ndata leakage, and requires advanced multi-modal capabilities. Moreover, two\nmetrics are developed to measure data leakage and actual performance gain in\nmulti-modal training. 
We evaluate 16 leading LVLMs on MMStar to assess their\nmulti-modal capabilities, and on 7 benchmarks with the proposed metrics to\ninvestigate their data leakage and actual multi-modal gain.\n","authors":["Lin Chen","Jinsong Li","Xiaoyi Dong","Pan Zhang","Yuhang Zang","Zehui Chen","Haodong Duan","Jiaqi Wang","Yu Qiao","Dahua Lin","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.20330v1.pdf","comment":"Project page: https://mmstar-benchmark.github.io/"},{"id":"http://arxiv.org/abs/2311.17245v5","updated":"2024-03-29T17:58:34Z","published":"2023-11-28T21:39:20Z","title":"LightGaussian: Unbounded 3D Gaussian Compression with 15x Reduction and\n 200+ FPS","summary":" Recent advancements in real-time neural rendering using point-based\ntechniques have paved the way for the widespread adoption of 3D\nrepresentations. However, foundational approaches like 3D Gaussian Splatting\ncome with a substantial storage overhead caused by growing the SfM points to\nmillions, often demanding gigabyte-level disk space for a single unbounded\nscene, posing significant scalability challenges and hindering the splatting\nefficiency.\n To address this challenge, we introduce LightGaussian, a novel method\ndesigned to transform 3D Gaussians into a more efficient and compact format.\nDrawing inspiration from the concept of Network Pruning, LightGaussian\nidentifies Gaussians that are insignificant in contributing to the scene\nreconstruction and adopts a pruning and recovery process, effectively reducing\nredundancy in Gaussian counts while preserving visual effects. Additionally,\nLightGaussian employs distillation and pseudo-view augmentation to distill\nspherical harmonics to a lower degree, allowing knowledge transfer to more\ncompact representations while maintaining reflectance. Furthermore, we propose\na hybrid scheme, VecTree Quantization, to quantize all attributes, resulting in\nlower bitwidth representations with minimal accuracy losses.\n In summary, LightGaussian achieves an averaged compression rate over 15x\nwhile boosting the FPS from 139 to 215, enabling an efficient representation of\ncomplex scenes on Mip-NeRF 360, Tank and Temple datasets.\n Project website: https://lightgaussian.github.io/\n","authors":["Zhiwen Fan","Kevin Wang","Kairun Wen","Zehao Zhu","Dejia Xu","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17245v5.pdf","comment":"16pages, 8figures"},{"id":"http://arxiv.org/abs/2310.11256v2","updated":"2024-03-29T17:50:17Z","published":"2023-10-17T13:22:36Z","title":"Gromov-Wassertein-like Distances in the Gaussian Mixture Models Space","summary":" The Gromov-Wasserstein (GW) distance is frequently used in machine learning\nto compare distributions across distinct metric spaces. Despite its utility, it\nremains computationally intensive, especially for large-scale problems.\nRecently, a novel Wasserstein distance specifically tailored for Gaussian\nmixture models and known as MW (mixture Wasserstein) has been introduced by\nseveral authors. In scenarios where data exhibit clustering, this approach\nsimplifies to a small-scale discrete optimal transport problem, which\ncomplexity depends solely on the number of Gaussian components in the GMMs.\nThis paper aims to extend MW by introducing new Gromov-type distances. These\ndistances are designed to be isometry-invariant in Euclidean spaces and are\napplicable for comparing GMMs across different dimensional spaces. 
Our first\ncontribution is the Mixture Gromov Wasserstein distance (MGW), which can be\nviewed as a Gromovized version of MW. This new distance has a straightforward\ndiscrete formulation, making it highly efficient for estimating distances\nbetween GMMs in practical applications. To facilitate the derivation of a\ntransport plan between GMMs, we present a second distance, the Embedded\nWasserstein distance (EW). This distance turns out to be closely related to\nseveral recent alternatives to Gromov-Wasserstein. We show that EW can be\nadapted to derive a distance as well as optimal transportation plans between\nGMMs. We demonstrate the efficiency of these newly proposed distances on medium\nto large-scale problems, including shape matching and hyperspectral image color\ntransfer.\n","authors":["Antoine Salmona","Julie Delon","Agnès Desolneux"],"pdf_url":"https://arxiv.org/pdf/2310.11256v2.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2310.05737v3","updated":"2024-03-29T17:44:41Z","published":"2023-10-09T14:10:29Z","title":"Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation","summary":" While Large Language Models (LLMs) are the dominant models for generative\ntasks in language, they do not perform as well as diffusion models on image and\nvideo generation. To effectively use LLMs for visual generation, one crucial\ncomponent is the visual tokenizer that maps pixel-space inputs to discrete\ntokens appropriate for LLM learning. In this paper, we introduce MAGVIT-v2, a\nvideo tokenizer designed to generate concise and expressive tokens for both\nvideos and images using a common token vocabulary. Equipped with this new\ntokenizer, we show that LLMs outperform diffusion models on standard image and\nvideo generation benchmarks including ImageNet and Kinetics. In addition, we\ndemonstrate that our tokenizer surpasses the previously top-performing video\ntokenizer on two more tasks: (1) video compression comparable to the\nnext-generation video codec (VCC) according to human evaluations, and (2)\nlearning effective representations for action recognition tasks.\n","authors":["Lijun Yu","José Lezama","Nitesh B. Gundavarapu","Luca Versari","Kihyuk Sohn","David Minnen","Yong Cheng","Vighnesh Birodkar","Agrim Gupta","Xiuye Gu","Alexander G. Hauptmann","Boqing Gong","Ming-Hsuan Yang","Irfan Essa","David A. Ross","Lu Jiang"],"pdf_url":"https://arxiv.org/pdf/2310.05737v3.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.20320v1","updated":"2024-03-29T17:43:58Z","published":"2024-03-29T17:43:58Z","title":"MTLoRA: A Low-Rank Adaptation Approach for Efficient Multi-Task Learning","summary":" Adapting models pre-trained on large-scale datasets to a variety of\ndownstream tasks is a common strategy in deep learning. Consequently,\nparameter-efficient fine-tuning methods have emerged as a promising way to\nadapt pre-trained models to different tasks while training only a minimal\nnumber of parameters. While most of these methods are designed for single-task\nadaptation, parameter-efficient training in Multi-Task Learning (MTL)\narchitectures is still unexplored. In this paper, we introduce MTLoRA, a novel\nframework for parameter-efficient training of MTL models. 
MTLoRA employs\nTask-Agnostic and Task-Specific Low-Rank Adaptation modules, which effectively\ndisentangle the parameter space in MTL fine-tuning, thereby enabling the model\nto adeptly handle both task specialization and interaction within MTL contexts.\nWe applied MTLoRA to hierarchical-transformer-based MTL architectures, adapting\nthem to multiple downstream dense prediction tasks. Our extensive experiments\non the PASCAL dataset show that MTLoRA achieves higher accuracy on downstream\ntasks compared to fully fine-tuning the MTL model while reducing the number of\ntrainable parameters by 3.6x. Furthermore, MTLoRA establishes a Pareto-optimal\ntrade-off between the number of trainable parameters and the accuracy of the\ndownstream tasks, outperforming current state-of-the-art parameter-efficient\ntraining methods in both accuracy and efficiency. Our code is publicly\navailable.\n","authors":["Ahmed Agiza","Marina Neseem","Sherief Reda"],"pdf_url":"https://arxiv.org/pdf/2403.20320v1.pdf","comment":"Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR), 2024"},{"id":"http://arxiv.org/abs/2401.15741v4","updated":"2024-03-29T17:42:21Z","published":"2024-01-28T19:58:19Z","title":"SERNet-Former: Semantic Segmentation by Efficient Residual Network with\n Attention-Boosting Gates and Attention-Fusion Networks","summary":" Improving the efficiency of state-of-the-art methods in semantic segmentation\nrequires overcoming the increasing computational cost as well as issues such as\nfusing semantic information from global and local contexts. Based on the recent\nsuccess and problems that convolutional neural networks (CNNs) encounter in\nsemantic segmentation, this research proposes an encoder-decoder architecture\nwith a unique efficient residual network, Efficient-ResNet. Attention-boosting\ngates (AbGs) and attention-boosting modules (AbMs) are deployed by aiming to\nfuse the equivariant and feature-based semantic information with the equivalent\nsizes of the output of global context of the efficient residual network in the\nencoder. Respectively, the decoder network is developed with the additional\nattention-fusion networks (AfNs) inspired by AbM. AfNs are designed to improve\nthe efficiency in the one-to-one conversion of the semantic information by\ndeploying additional convolution layers in the decoder part. Our network is\ntested on the challenging CamVid and Cityscapes datasets, and the proposed\nmethods reveal significant improvements on the residual networks. To the best\nof our knowledge, the developed network, SERNet-Former, achieves\nstate-of-the-art results (84.62 % mean IoU) on CamVid dataset and challenging\nresults (87.35 % mean IoU) on Cityscapes validation dataset.\n","authors":["Serdar Erisen"],"pdf_url":"https://arxiv.org/pdf/2401.15741v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20318v1","updated":"2024-03-29T17:41:57Z","published":"2024-03-29T17:41:57Z","title":"SeaBird: Segmentation in Bird's View with Dice Loss Improves Monocular\n 3D Detection of Large Objects","summary":" Monocular 3D detectors achieve remarkable performance on cars and smaller\nobjects. However, their performance drops on larger objects, leading to fatal\naccidents. Some attribute the failures to training data scarcity or their\nreceptive field requirements of large objects. In this paper, we highlight this\nunderstudied problem of generalization to large objects. 
We find that modern\nfrontal detectors struggle to generalize to large objects even on nearly\nbalanced datasets. We argue that the cause of failure is the sensitivity of\ndepth regression losses to noise of larger objects. To bridge this gap, we\ncomprehensively investigate regression and dice losses, examining their\nrobustness under varying error levels and object sizes. We mathematically prove\nthat the dice loss leads to superior noise-robustness and model convergence for\nlarge objects compared to regression losses for a simplified case. Leveraging\nour theoretical insights, we propose SeaBird (Segmentation in Bird's View) as\nthe first step towards generalizing to large objects. SeaBird effectively\nintegrates BEV segmentation on foreground objects for 3D detection, with the\nsegmentation head trained with the dice loss. SeaBird achieves SoTA results on\nthe KITTI-360 leaderboard and improves existing detectors on the nuScenes\nleaderboard, particularly for large objects. Code and models at\nhttps://github.com/abhi1kumar/SeaBird\n","authors":["Abhinav Kumar","Yuliang Guo","Xinyu Huang","Liu Ren","Xiaoming Liu"],"pdf_url":"https://arxiv.org/pdf/2403.20318v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.20317v1","updated":"2024-03-29T17:40:37Z","published":"2024-03-29T17:40:37Z","title":"Convolutional Prompting meets Language Models for Continual Learning","summary":" Continual Learning (CL) enables machine learning models to learn from\ncontinuously shifting new training data in absence of data from old tasks.\nRecently, pretrained vision transformers combined with prompt tuning have shown\npromise for overcoming catastrophic forgetting in CL. These approaches rely on\na pool of learnable prompts which can be inefficient in sharing knowledge\nacross tasks leading to inferior performance. In addition, the lack of\nfine-grained layer specific prompts does not allow these to fully express the\nstrength of the prompts for CL. We address these limitations by proposing\nConvPrompt, a novel convolutional prompt creation mechanism that maintains\nlayer-wise shared embeddings, enabling both layer-specific learning and better\nconcept transfer across tasks. The intelligent use of convolution enables us to\nmaintain a low parameter overhead without compromising performance. We further\nleverage Large Language Models to generate fine-grained text descriptions of\neach category which are used to get task similarity and dynamically decide the\nnumber of prompts to be learned. Extensive experiments demonstrate the\nsuperiority of ConvPrompt and improves SOTA by ~3% with significantly less\nparameter overhead. We also perform strong ablation over various modules to\ndisentangle the importance of different components.\n","authors":["Anurag Roy","Riddhiman Moulick","Vinay K. Verma","Saptarshi Ghosh","Abir Das"],"pdf_url":"https://arxiv.org/pdf/2403.20317v1.pdf","comment":"CVPR 2024 Camera Ready"},{"id":"http://arxiv.org/abs/2307.08727v2","updated":"2024-03-29T17:38:00Z","published":"2023-07-17T17:48:06Z","title":"Learning to Count without Annotations","summary":" While recent supervised methods for reference-based object counting continue\nto improve the performance on benchmark datasets, they have to rely on small\ndatasets due to the cost associated with manually annotating dozens of objects\nin images. We propose UnCounTR, a model that can learn this task without\nrequiring any manual annotations. 
To this end, we construct \"Self-Collages\",\nimages with various pasted objects as training samples, that provide a rich\nlearning signal covering arbitrary object types and counts. Our method builds\non existing unsupervised representations and segmentation techniques to\nsuccessfully demonstrate for the first time the ability of reference-based\ncounting without manual supervision. Our experiments show that our method not\nonly outperforms simple baselines and generic models such as FasterRCNN and\nDETR, but also matches the performance of supervised counting models in some\ndomains.\n","authors":["Lukas Knobel","Tengda Han","Yuki M. Asano"],"pdf_url":"https://arxiv.org/pdf/2307.08727v2.pdf","comment":"Accepted at CVPR'24. Code available at\n https://github.com/lukasknobel/SelfCollages"},{"id":"http://arxiv.org/abs/2310.18274v2","updated":"2024-03-29T17:34:40Z","published":"2023-10-27T16:59:51Z","title":"LipSim: A Provably Robust Perceptual Similarity Metric","summary":" Recent years have seen growing interest in developing and applying perceptual\nsimilarity metrics. Research has shown the superiority of perceptual metrics\nover pixel-wise metrics in aligning with human perception and serving as a\nproxy for the human visual system. On the other hand, as perceptual metrics\nrely on neural networks, there is a growing concern regarding their resilience,\ngiven the established vulnerability of neural networks to adversarial attacks.\nIt is indeed logical to infer that perceptual metrics may inherit both the\nstrengths and shortcomings of neural networks. In this work, we demonstrate the\nvulnerability of state-of-the-art perceptual similarity metrics based on an\nensemble of ViT-based feature extractors to adversarial attacks. We then\npropose a framework to train a robust perceptual similarity metric called\nLipSim (Lipschitz Similarity Metric) with provable guarantees. By leveraging\n1-Lipschitz neural networks as the backbone, LipSim provides guarded areas\naround each data point and certificates for all perturbations within an\n$\\ell_2$ ball. Finally, a comprehensive set of experiments shows the\nperformance of LipSim in terms of natural and certified scores and on the image\nretrieval application. The code is available at\nhttps://github.com/SaraGhazanfari/LipSim.\n","authors":["Sara Ghazanfari","Alexandre Araujo","Prashanth Krishnamurthy","Farshad Khorrami","Siddharth Garg"],"pdf_url":"https://arxiv.org/pdf/2310.18274v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20312v1","updated":"2024-03-29T17:33:42Z","published":"2024-03-29T17:33:42Z","title":"Learn \"No\" to Say \"Yes\" Better: Improving Vision-Language Models via\n Negations","summary":" Existing vision-language models (VLMs) treat text descriptions as a unit,\nconfusing individual concepts in a prompt and impairing visual semantic\nmatching and reasoning. An important aspect of reasoning in logic and language\nis negations. This paper highlights the limitations of popular VLMs such as\nCLIP, at understanding the implications of negations, i.e., the effect of the\nword \"not\" in a given prompt. To enable evaluation of VLMs on fluent prompts\nwith negations, we present CC-Neg, a dataset containing 228,246 images, true\ncaptions and their corresponding negated captions. Using CC-Neg along with\nmodifications to the contrastive loss of CLIP, our proposed CoN-CLIP framework,\nhas an improved understanding of negations. 
This training paradigm improves\nCoN-CLIP's ability to encode semantics reliably, resulting in 3.85% average\ngain in top-1 accuracy for zero-shot image classification across 8 datasets.\nFurther, CoN-CLIP outperforms CLIP on challenging compositionality benchmarks\nsuch as SugarCREPE by 4.4%, showcasing emergent compositional understanding of\nobjects, relations, and attributes in text. Overall, our work addresses a\ncrucial limitation of VLMs by introducing a dataset and framework that\nstrengthens semantic associations between images and text, demonstrating\nimproved large-scale foundation models with significantly reduced computational\ncost, promoting efficiency and accessibility.\n","authors":["Jaisidh Singh","Ishaan Shrivastava","Mayank Vatsa","Richa Singh","Aparna Bharati"],"pdf_url":"https://arxiv.org/pdf/2403.20312v1.pdf","comment":"14 pages + 6 figures in main manuscript (excluding references)"},{"id":"http://arxiv.org/abs/2312.01215v2","updated":"2024-03-29T17:30:58Z","published":"2023-12-02T19:49:27Z","title":"RNb-NeuS: Reflectance and Normal-based Multi-View 3D Reconstruction","summary":" This paper introduces a versatile paradigm for integrating multi-view\nreflectance (optional) and normal maps acquired through photometric stereo. Our\napproach employs a pixel-wise joint re-parameterization of reflectance and\nnormal, considering them as a vector of radiances rendered under simulated,\nvarying illumination. This re-parameterization enables the seamless integration\nof reflectance and normal maps as input data in neural volume rendering-based\n3D reconstruction while preserving a single optimization objective. In\ncontrast, recent multi-view photometric stereo (MVPS) methods depend on\nmultiple, potentially conflicting objectives. Despite its apparent simplicity,\nour proposed approach outperforms state-of-the-art approaches in MVPS\nbenchmarks across F-score, Chamfer distance, and mean angular error metrics.\nNotably, it significantly improves the detailed 3D reconstruction of areas with\nhigh curvature or low visibility.\n","authors":["Baptiste Brument","Robin Bruneau","Yvain Quéau","Jean Mélou","François Bernard Lauze"," Jean-Denis","Jean-Denis Durou","Lilian Calvet"],"pdf_url":"https://arxiv.org/pdf/2312.01215v2.pdf","comment":"14 pages, 13 figures, 7 tables. Accepted to CVPR 2024. The project\n page can be accessed via\n https://robinbruneau.github.io/publications/rnb_neus.html. The source code is\n available at https://github.com/bbrument/RNb-NeuS"},{"id":"http://arxiv.org/abs/2403.20309v1","updated":"2024-03-29T17:29:58Z","published":"2024-03-29T17:29:58Z","title":"InstantSplat: Unbounded Sparse-view Pose-free Gaussian Splatting in 40\n Seconds","summary":" While novel view synthesis (NVS) has made substantial progress in 3D computer\nvision, it typically requires an initial estimation of camera intrinsics and\nextrinsics from dense viewpoints. This pre-processing is usually conducted via\na Structure-from-Motion (SfM) pipeline, a procedure that can be slow and\nunreliable, particularly in sparse-view scenarios with insufficient matched\nfeatures for accurate reconstruction. In this work, we integrate the strengths\nof point-based representations (e.g., 3D Gaussian Splatting, 3D-GS) with\nend-to-end dense stereo models (DUSt3R) to tackle the complex yet unresolved\nissues in NVS under unconstrained settings, which encompasses pose-free and\nsparse view challenges. 
Our framework, InstantSplat, unifies dense stereo\npriors with 3D-GS to build 3D Gaussians of large-scale scenes from sparseview &\npose-free images in less than 1 minute. Specifically, InstantSplat comprises a\nCoarse Geometric Initialization (CGI) module that swiftly establishes a\npreliminary scene structure and camera parameters across all training views,\nutilizing globally-aligned 3D point maps derived from a pre-trained dense\nstereo pipeline. This is followed by the Fast 3D-Gaussian Optimization (F-3DGO)\nmodule, which jointly optimizes the 3D Gaussian attributes and the initialized\nposes with pose regularization. Experiments conducted on the large-scale\noutdoor Tanks & Temples datasets demonstrate that InstantSplat significantly\nimproves SSIM (by 32%) while concurrently reducing Absolute Trajectory Error\n(ATE) by 80%. These establish InstantSplat as a viable solution for scenarios\ninvolving posefree and sparse-view conditions. Project page:\ninstantsplat.github.io.\n","authors":["Zhiwen Fan","Wenyan Cong","Kairun Wen","Kevin Wang","Jian Zhang","Xinghao Ding","Danfei Xu","Boris Ivanovic","Marco Pavone","Georgios Pavlakos","Zhangyang Wang","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2403.20309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01362v4","updated":"2024-03-29T17:11:38Z","published":"2023-07-03T21:33:40Z","title":"A Strong Baseline for Point Cloud Registration via Direct Superpoints\n Matching","summary":" Deep neural networks endow the downsampled superpoints with highly\ndiscriminative feature representations. Previous dominant point cloud\nregistration approaches match these feature representations as the first step,\ne.g., using the Sinkhorn algorithm. A RANSAC-like method is then usually\nadopted as a post-processing refinement to filter the outliers. Other dominant\nmethod is to directly predict the superpoint matchings using learned MLP\nlayers. Both of them have drawbacks: RANSAC-based methods are computationally\nintensive and prediction-based methods suffer from outputing non-existing\npoints in the point cloud. In this paper, we propose a straightforward and\neffective baseline to find correspondences of superpoints in a global matching\nmanner. We employ the normalized matching scores as weights for each\ncorrespondence, allowing us to reject the outliers and further weigh the rest\ninliers when fitting the transformation matrix without relying on the\ncumbersome RANSAC. Moreover, the entire model can be trained in an end-to-end\nfashion, leading to better accuracy. Our simple yet effective baseline shows\ncomparable or even better results than state-of-the-art methods on three\ndatasets including ModelNet, 3DMatch, and KITTI. We do not advocate our\napproach to be \\emph{the} solution for point cloud registration but use the\nresults to emphasize the role of matching strategy for point cloud\nregistration. The code and models are available at\nhttps://github.com/neu-vi/Superpoints_Registration.\n","authors":["Aniket Gupta","Yiming Xie","Hanumant Singh","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.01362v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20287v1","updated":"2024-03-29T16:58:13Z","published":"2024-03-29T16:58:13Z","title":"Benchmarking Counterfactual Image Generation","summary":" Counterfactual image generation is pivotal for understanding the causal\nrelations of variables, with applications in interpretability and generation of\nunbiased synthetic data. 
However, evaluating image generation is a\nlong-standing challenge in itself. The need to evaluate counterfactual\ngeneration compounds on this challenge, precisely because counterfactuals, by\ndefinition, are hypothetical scenarios without observable ground truths. In\nthis paper, we present a novel comprehensive framework aimed at benchmarking\ncounterfactual image generation methods. We incorporate metrics that focus on\nevaluating diverse aspects of counterfactuals, such as composition,\neffectiveness, minimality of interventions, and image realism. We assess the\nperformance of three distinct conditional image generation model types, based\non the Structural Causal Model paradigm. Our work is accompanied by a\nuser-friendly Python package which allows to further evaluate and benchmark\nexisting and future counterfactual image generation methods. Our framework is\nextendable to additional SCM and other causal methods, generative models, and\ndatasets.\n","authors":["Thomas Melistas","Nikos Spyrou","Nefeli Gkouti","Pedro Sanchez","Athanasios Vlontzos","Giorgos Papanastasiou","Sotirios A. Tsaftaris"],"pdf_url":"https://arxiv.org/pdf/2403.20287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02560v2","updated":"2024-03-29T16:56:33Z","published":"2023-04-05T16:30:36Z","title":"VicTR: Video-conditioned Text Representations for Activity Recognition","summary":" Vision-Language models (VLMs) have excelled in the image-domain -- especially\nin zero-shot settings -- thanks to the availability of vast pretraining data\n(i.e., paired image-text samples). However for videos, such paired data is not\nas abundant. Therefore, video-VLMs are usually designed by adapting pretrained\nimage-VLMs to the video-domain, instead of training from scratch. All such\nrecipes rely on augmenting visual embeddings with temporal information (i.e.,\nimage $\\rightarrow$ video), often keeping text embeddings unchanged or even\nbeing discarded. In this paper, we argue the contrary, that better video-VLMs\ncan be designed by focusing more on augmenting text, rather than visual\ninformation. More specifically, we introduce Video-conditioned Text\nRepresentations (VicTR): a form of text embeddings optimized w.r.t. visual\nembeddings, creating a more-flexible contrastive latent space. Our model can\nfurther make use of freely-available semantic information, in the form of\nvisually-grounded auxiliary text (e.g. object or scene information). We\nevaluate our model on few-shot, zero-shot (HMDB-51, UCF-101), short-form\n(Kinetics-400) and long-form (Charades) activity recognition benchmarks,\nshowing strong performance among video-VLMs.\n","authors":["Kumara Kahatapitiya","Anurag Arnab","Arsha Nagrani","Michael S. Ryoo"],"pdf_url":"https://arxiv.org/pdf/2304.02560v2.pdf","comment":"To appear at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.15905v4","updated":"2024-03-29T16:53:58Z","published":"2024-03-23T18:19:02Z","title":"Towards Low-Energy Adaptive Personalization for Resource-Constrained\n Devices","summary":" The personalization of machine learning (ML) models to address data drift is\na significant challenge in the context of Internet of Things (IoT)\napplications. Presently, most approaches focus on fine-tuning either the full\nbase model or its last few layers to adapt to new data, while often neglecting\nenergy costs. However, various types of data drift exist, and fine-tuning the\nfull base model or the last few layers may not result in optimal performance in\ncertain scenarios. 
We propose Target Block Fine-Tuning (TBFT), a low-energy\nadaptive personalization framework designed for resource-constrained devices.\nWe categorize data drift and personalization into three types: input-level,\nfeature-level, and output-level. For each type, we fine-tune different blocks\nof the model to achieve optimal performance with reduced energy costs.\nSpecifically, input-, feature-, and output-level correspond to fine-tuning the\nfront, middle, and rear blocks of the model. We evaluate TBFT on a ResNet\nmodel, three datasets, three different training sizes, and a Raspberry Pi.\nCompared with the $Block Avg$, where each block is fine-tuned individually and\ntheir performance improvements are averaged, TBFT exhibits an improvement in\nmodel accuracy by an average of 15.30% whilst saving 41.57% energy consumption\non average compared with full fine-tuning.\n","authors":["Yushan Huang","Josh Millar","Yuxuan Long","Yuchen Zhao","Hamed Haddadi"],"pdf_url":"https://arxiv.org/pdf/2403.15905v4.pdf","comment":"Accepetd to The 4th Workshop on Machine Learning and Systems\n (EuroMLSys '24)"},{"id":"http://arxiv.org/abs/2312.05291v2","updated":"2024-03-29T16:49:59Z","published":"2023-12-08T18:14:21Z","title":"GlitchBench: Can large multimodal models detect video game glitches?","summary":" Large multimodal models (LMMs) have evolved from large language models (LLMs)\nto integrate multiple input modalities, such as visual inputs. This integration\naugments the capacity of LLMs for tasks requiring visual comprehension and\nreasoning. However, the extent and limitations of their enhanced abilities are\nnot fully understood, especially when it comes to real-world tasks. To address\nthis gap, we introduce GlitchBench, a novel benchmark derived from video game\nquality assurance tasks, to test and evaluate the reasoning capabilities of\nLMMs. Our benchmark is curated from a variety of unusual and glitched scenarios\nfrom video games and aims to challenge both the visual and linguistic reasoning\npowers of LMMs in detecting and interpreting out-of-the-ordinary events. We\nevaluate multiple state-of-the-art LMMs, and we show that GlitchBench presents\na new challenge for these models. Code and data are available at:\nhttps://glitchbench.github.io/\n","authors":["Mohammad Reza Taesiri","Tianjun Feng","Anh Nguyen","Cor-Paul Bezemer"],"pdf_url":"https://arxiv.org/pdf/2312.05291v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.04670v2","updated":"2024-03-29T16:39:28Z","published":"2023-12-07T20:11:03Z","title":"Rapid Motor Adaptation for Robotic Manipulator Arms","summary":" Developing generalizable manipulation skills is a core challenge in embodied\nAI. This includes generalization across diverse task configurations,\nencompassing variations in object shape, density, friction coefficient, and\nexternal disturbances such as forces applied to the robot. Rapid Motor\nAdaptation (RMA) offers a promising solution to this challenge. It posits that\nessential hidden variables influencing an agent's task performance, such as\nobject mass and shape, can be effectively inferred from the agent's action and\nproprioceptive history. Drawing inspiration from RMA in locomotion and in-hand\nrotation, we use depth perception to develop agents tailored for rapid motor\nadaptation in a variety of manipulation tasks. 
We evaluated our agents on four\nchallenging tasks from the Maniskill2 benchmark, namely pick-and-place\noperations with hundreds of objects from the YCB and EGAD datasets, peg\ninsertion with precise position and orientation, and operating a variety of\nfaucets and handles, with customized environment variations. Empirical results\ndemonstrate that our agents surpass state-of-the-art methods like automatic\ndomain randomization and vision-based policies, obtaining better generalization\nperformance and sample efficiency.\n","authors":["Yichao Liang","Kevin Ellis","João Henriques"],"pdf_url":"https://arxiv.org/pdf/2312.04670v2.pdf","comment":"Accepted at CVPR 2024. 12 pages"},{"id":"http://arxiv.org/abs/2312.02214v2","updated":"2024-03-29T16:31:44Z","published":"2023-12-03T07:23:53Z","title":"FlashAvatar: High-fidelity Head Avatar with Efficient Gaussian Embedding","summary":" We propose FlashAvatar, a novel and lightweight 3D animatable avatar\nrepresentation that could reconstruct a digital avatar from a short monocular\nvideo sequence in minutes and render high-fidelity photo-realistic images at\n300FPS on a consumer-grade GPU. To achieve this, we maintain a uniform 3D\nGaussian field embedded in the surface of a parametric face model and learn\nextra spatial offset to model non-surface regions and subtle facial details.\nWhile full use of geometric priors can capture high-frequency facial details\nand preserve exaggerated expressions, proper initialization can help reduce the\nnumber of Gaussians, thus enabling super-fast rendering speed. Extensive\nexperimental results demonstrate that FlashAvatar outperforms existing works\nregarding visual quality and personalized details and is almost an order of\nmagnitude faster in rendering speed. Project page:\nhttps://ustc3dv.github.io/FlashAvatar/\n","authors":["Jun Xiang","Xuan Gao","Yudong Guo","Juyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.02214v2.pdf","comment":"Project page: https://ustc3dv.github.io/FlashAvatar/"},{"id":"http://arxiv.org/abs/2403.20275v1","updated":"2024-03-29T16:30:17Z","published":"2024-03-29T16:30:17Z","title":"Snap-it, Tap-it, Splat-it: Tactile-Informed 3D Gaussian Splatting for\n Reconstructing Challenging Surfaces","summary":" Touch and vision go hand in hand, mutually enhancing our ability to\nunderstand the world. From a research perspective, the problem of mixing touch\nand vision is underexplored and presents interesting challenges. To this end,\nwe propose Tactile-Informed 3DGS, a novel approach that incorporates touch data\n(local depth maps) with multi-view vision data to achieve surface\nreconstruction and novel view synthesis. Our method optimises 3D Gaussian\nprimitives to accurately model the object's geometry at points of contact. By\ncreating a framework that decreases the transmittance at touch locations, we\nachieve a refined surface reconstruction, ensuring a uniformly smooth depth\nmap. Touch is particularly useful when considering non-Lambertian objects (e.g.\nshiny or reflective surfaces) since contemporary methods tend to fail to\nreconstruct with fidelity specular highlights. By combining vision and tactile\nsensing, we achieve more accurate geometry reconstructions with fewer images\nthan prior methods. 
We conduct evaluation on objects with glossy and reflective\nsurfaces and demonstrate the effectiveness of our approach, offering\nsignificant improvements in reconstruction quality.\n","authors":["Mauro Comi","Alessio Tonioni","Max Yang","Jonathan Tremblay","Valts Blukis","Yijiong Lin","Nathan F. Lepora","Laurence Aitchison"],"pdf_url":"https://arxiv.org/pdf/2403.20275v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2403.20273v1","updated":"2024-03-29T16:27:40Z","published":"2024-03-29T16:27:40Z","title":"CATSNet: a context-aware network for Height Estimation in a Forested\n Area based on Pol-TomoSAR data","summary":" Tropical forests are a key component of the global carbon cycle. With plans\nfor upcoming space-borne missions like BIOMASS to monitor forestry, several\nairborne missions, including TropiSAR and AfriSAR campaigns, have been\nsuccessfully launched and experimented. Typical Synthetic Aperture Radar\nTomography (TomoSAR) methods involve complex models with low accuracy and high\ncomputation costs. In recent years, deep learning methods have also gained\nattention in the TomoSAR framework, showing interesting performance. Recently,\na solution based on a fully connected Tomographic Neural Network (TSNN) has\ndemonstrated its effectiveness in accurately estimating forest and ground\nheights by exploiting the pixel-wise elements of the covariance matrix derived\nfrom TomoSAR data. This work instead goes beyond the pixel-wise approach to\ndefine a context-aware deep learning-based solution named CATSNet. A\nconvolutional neural network is considered to leverage patch-based information\nand extract features from a neighborhood rather than focus on a single pixel.\nThe training is conducted by considering TomoSAR data as the input and Light\nDetection and Ranging (LiDAR) values as the ground truth. The experimental\nresults show striking advantages in both performance and generalization ability\nby leveraging context information within Multiple Baselines (MB) TomoSAR data\nacross different polarimetric modalities, surpassing existing techniques.\n","authors":["Wenyu Yang","Sergio Vitale","Hossein Aghababaei","Giampaolo Ferraioli","Vito Pascazio","Gilda Schirinzi"],"pdf_url":"https://arxiv.org/pdf/2403.20273v1.pdf","comment":"Submitted to IEEE TGRS, under review"},{"id":"http://arxiv.org/abs/2403.20271v1","updated":"2024-03-29T16:26:20Z","published":"2024-03-29T16:26:20Z","title":"Draw-and-Understand: Leveraging Visual Prompts to Enable MLLMs to\n Comprehend What You Want","summary":" The interaction between humans and artificial intelligence (AI) is a crucial\nfactor that reflects the effectiveness of multimodal large language models\n(MLLMs). However, current MLLMs primarily focus on image-level comprehension\nand limit interaction to textual instructions, thereby constraining their\nflexibility in usage and depth of response. In this paper, we introduce the\nDraw-and-Understand project: a new model, a multi-domain dataset, and a\nchallenging benchmark for visual prompting. Specifically, we propose SPHINX-V,\na new end-to-end trained Multimodal Large Language Model (MLLM) that connects a\nvision encoder, a visual prompt encoder and an LLM for various visual prompts\n(points, bounding boxes, and free-form shape) and language understanding. To\nadvance visual prompting research for MLLMs, we introduce MDVP-Data and\nMDVP-Bench. 
MDVP-Data features a multi-domain dataset containing 1.6M unique\nimage-visual prompt-text instruction-following samples, including natural\nimages, document images, OCR images, mobile screenshots, web screenshots, and\nmulti-panel images. Furthermore, we present MDVP-Bench, a comprehensive and\nchallenging benchmark to assess a model's capability in understanding visual\nprompting instructions. Our experiments demonstrate SPHINX-V's impressive\nmultimodal interaction capabilities through visual prompting, revealing\nsignificant improvements in detailed pixel-level description and\nquestion-answering abilities.\n","authors":["Weifeng Lin","Xinyu Wei","Ruichuan An","Peng Gao","Bocheng Zou","Yulin Luo","Siyuan Huang","Shanghang Zhang","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2403.20271v1.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2403.16970v2","updated":"2024-03-29T16:14:41Z","published":"2024-03-25T17:31:12Z","title":"Joint chest X-ray diagnosis and clinical visual attention prediction\n with multi-stage cooperative learning: enhancing interpretability","summary":" As deep learning has become the state-of-the-art for computer-assisted\ndiagnosis, interpretability of the automatic decisions is crucial for clinical\ndeployment. While various methods were proposed in this domain, visual\nattention maps of clinicians during radiological screening offer a unique asset\nto provide important insights and can potentially enhance the quality of\ncomputer-assisted diagnosis. With this paper, we introduce a novel\ndeep-learning framework for joint disease diagnosis and prediction of\ncorresponding visual saliency maps for chest X-ray scans. Specifically, we\ndesigned a novel dual-encoder multi-task UNet, which leverages both a\nDenseNet201 backbone and a Residual and Squeeze-and-Excitation block-based\nencoder to extract diverse features for saliency map prediction, and a\nmulti-scale feature-fusion classifier to perform disease classification. To\ntackle the issue of asynchronous training schedules of individual tasks in\nmulti-task learning, we proposed a multi-stage cooperative learning strategy,\nwith contrastive learning for feature encoder pretraining to boost performance.\nExperiments show that our proposed method outperformed existing techniques for\nchest X-ray diagnosis and the quality of visual saliency map prediction.\n","authors":["Zirui Qiu","Hassan Rivaz","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.16970v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01482v3","updated":"2024-03-29T16:13:13Z","published":"2024-03-03T11:24:16Z","title":"EAGLE: Eigen Aggregation Learning for Object-Centric Unsupervised\n Semantic Segmentation","summary":" Semantic segmentation has innately relied on extensive pixel-level annotated\ndata, leading to the emergence of unsupervised methodologies. Among them,\nleveraging self-supervised Vision Transformers for unsupervised semantic\nsegmentation (USS) has been making steady progress with expressive deep\nfeatures. Yet, for semantically segmenting images with complex objects, a\npredominant challenge remains: the lack of explicit object-level semantic\nencoding in patch-level features. This technical limitation often leads to\ninadequate segmentation of complex objects with diverse structures. To address\nthis gap, we present a novel approach, EAGLE, which emphasizes object-centric\nrepresentation learning for unsupervised semantic segmentation. 
Specifically,\nwe introduce EiCue, a spectral technique providing semantic and structural cues\nthrough an eigenbasis derived from the semantic similarity matrix of deep image\nfeatures and color affinity from an image. Further, by incorporating our\nobject-centric contrastive loss with EiCue, we guide our model to learn\nobject-level representations with intra- and inter-image object-feature\nconsistency, thereby enhancing semantic accuracy. Extensive experiments on\nCOCO-Stuff, Cityscapes, and Potsdam-3 datasets demonstrate the state-of-the-art\nUSS results of EAGLE with accurate and consistent semantic segmentation across\ncomplex scenes.\n","authors":["Chanyoung Kim","Woojung Han","Dayun Ju","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.01482v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20260v1","updated":"2024-03-29T16:08:59Z","published":"2024-03-29T16:08:59Z","title":"Prototype-based Interpretable Breast Cancer Prediction Models: Analysis\n and Challenges","summary":" Deep learning models have achieved high performance in medical applications,\nhowever, their adoption in clinical practice is hindered due to their black-box\nnature. Self-explainable models, like prototype-based models, can be especially\nbeneficial as they are interpretable by design. However, if the learnt\nprototypes are of low quality then the prototype-based models are as good as\nblack-box. Having high quality prototypes is a pre-requisite for a truly\ninterpretable model. In this work, we propose a prototype evaluation framework\nfor coherence (PEF-C) for quantitatively evaluating the quality of the\nprototypes based on domain knowledge. We show the use of PEF-C in the context\nof breast cancer prediction using mammography. Existing works on\nprototype-based models on breast cancer prediction using mammography have\nfocused on improving the classification performance of prototype-based models\ncompared to black-box models and have evaluated prototype quality through\nanecdotal evidence. We are the first to go beyond anecdotal evidence and\nevaluate the quality of the mammography prototypes systematically using our\nPEF-C. Specifically, we apply three state-of-the-art prototype-based models,\nProtoPNet, BRAIxProtoPNet++ and PIP-Net on mammography images for breast cancer\nprediction and evaluate these models w.r.t. i) classification performance, and\nii) quality of the prototypes, on three public datasets. Our results show that\nprototype-based models are competitive with black-box models in terms of\nclassification performance, and achieve a higher score in detecting ROIs.\nHowever, the quality of the prototypes are not yet sufficient and can be\nimproved in aspects of relevance, purity and learning a variety of prototypes.\nWe call the XAI community to systematically evaluate the quality of the\nprototypes to check their true usability in high stake decisions and improve\nsuch models further.\n","authors":["Shreyasi Pathak","Jörg Schlötterer","Jeroen Veltman","Jeroen Geerdink","Maurice van Keulen","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2403.20260v1.pdf","comment":"21 pages, 5 figures, 3 tables"},{"id":"http://arxiv.org/abs/2403.20254v1","updated":"2024-03-29T16:01:00Z","published":"2024-03-29T16:01:00Z","title":"Benchmarking the Robustness of Temporal Action Detection Models Against\n Temporal Corruptions","summary":" Temporal action detection (TAD) aims to locate action positions and recognize\naction categories in long-term untrimmed videos. 
Although many methods have\nachieved promising results, their robustness has not been thoroughly studied.\nIn practice, we observe that temporal information in videos can be occasionally\ncorrupted, such as missing or blurred frames. Interestingly, existing methods\noften incur a significant performance drop even if only one frame is affected.\nTo formally evaluate the robustness, we establish two temporal corruption\nrobustness benchmarks, namely THUMOS14-C and ActivityNet-v1.3-C. In this paper,\nwe extensively analyze the robustness of seven leading TAD methods and obtain\nsome interesting findings: 1) Existing methods are particularly vulnerable to\ntemporal corruptions, and end-to-end methods are often more susceptible than\nthose with a pre-trained feature extractor; 2) Vulnerability mainly comes from\nlocalization error rather than classification error; 3) When corruptions occur\nin the middle of an action instance, TAD models tend to yield the largest\nperformance drop. Besides building a benchmark, we further develop a simple but\neffective robust training method to defend against temporal corruptions,\nthrough the FrameDrop augmentation and Temporal-Robust Consistency loss.\nRemarkably, our approach not only improves robustness but also yields promising\nimprovements on clean data. We believe that this study will serve as a\nbenchmark for future research in robust video analysis. Source code and models\nare available at https://github.com/Alvin-Zeng/temporal-robustness-benchmark.\n","authors":["Runhao Zeng","Xiaoyong Chen","Jiaming Liang","Huisi Wu","Guangzhong Cao","Yong Guo"],"pdf_url":"https://arxiv.org/pdf/2403.20254v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.20253v1","updated":"2024-03-29T15:59:11Z","published":"2024-03-29T15:59:11Z","title":"MedCLIP-SAM: Bridging Text and Image Towards Universal Medical Image\n Segmentation","summary":" Medical image segmentation of anatomical structures and pathology is crucial\nin modern clinical diagnosis, disease study, and treatment planning. To date,\ngreat progress has been made in deep learning-based segmentation techniques,\nbut most methods still lack data efficiency, generalizability, and\ninteractability. Consequently, the development of new, precise segmentation\nmethods that demand fewer labeled datasets is of utmost importance in medical\nimage analysis. Recently, the emergence of foundation models, such as CLIP and\nSegment-Anything-Model (SAM), with comprehensive cross-domain representation\nopened the door for interactive and universal image segmentation. However,\nexploration of these models for data-efficient medical image segmentation is\nstill limited, but is highly necessary. In this paper, we propose a novel\nframework, called MedCLIP-SAM that combines CLIP and SAM models to generate\nsegmentation of clinical scans using text prompts in both zero-shot and weakly\nsupervised settings. To achieve this, we employed a new Decoupled Hard Negative\nNoise Contrastive Estimation (DHN-NCE) loss to fine-tune the BiomedCLIP model\nand the recent gScoreCAM to generate prompts to obtain segmentation masks from\nSAM in a zero-shot setting. Additionally, we explored the use of zero-shot\nsegmentation labels in a weakly supervised paradigm to improve the segmentation\nquality further. 
By extensively testing three diverse segmentation tasks and\nmedical image modalities (breast tumor ultrasound, brain tumor MRI, and lung\nX-ray), our proposed framework has demonstrated excellent accuracy.\n","authors":["Taha Koleilat","Hojat Asgariandehkordi","Hassan Rivaz","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.20253v1.pdf","comment":"10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2403.20251v1","updated":"2024-03-29T15:57:38Z","published":"2024-03-29T15:57:38Z","title":"Latent Embedding Clustering for Occlusion Robust Head Pose Estimation","summary":" Head pose estimation has become a crucial area of research in computer vision\ngiven its usefulness in a wide range of applications, including robotics,\nsurveillance, or driver attention monitoring. One of the most difficult\nchallenges in this field is managing head occlusions that frequently take place\nin real-world scenarios. In this paper, we propose a novel and efficient\nframework that is robust in real world head occlusion scenarios. In particular,\nwe propose an unsupervised latent embedding clustering with regression and\nclassification components for each pose angle. The model optimizes latent\nfeature representations for occluded and non-occluded images through a\nclustering term while improving fine-grained angle predictions. Experimental\nevaluation on in-the-wild head pose benchmark datasets reveal competitive\nperformance in comparison to state-of-the-art methodologies with the advantage\nof having a significant data reduction. We observe a substantial improvement in\noccluded head pose estimation. Also, an ablation study is conducted to\nascertain the impact of the clustering term within our proposed framework.\n","authors":["José Celestino","Manuel Marques","Jacinto C. Nascimento"],"pdf_url":"https://arxiv.org/pdf/2403.20251v1.pdf","comment":"Accepted at 18th IEEE International Conference on Automatic Face and\n Gesture Recognition (FG'24)"},{"id":"http://arxiv.org/abs/2311.13612v2","updated":"2024-03-29T15:55:48Z","published":"2023-11-21T23:30:01Z","title":"Descriptor and Word Soups: Overcoming the Parameter Efficiency Accuracy\n Tradeoff for Out-of-Distribution Few-shot Learning","summary":" Over the past year, a large body of multimodal research has emerged around\nzero-shot evaluation using GPT descriptors. These studies boost the zero-shot\naccuracy of pretrained VL models with an ensemble of label-specific text\ngenerated by GPT. A recent study, WaffleCLIP, demonstrated that similar\nzero-shot accuracy can be achieved with an ensemble of random descriptors.\nHowever, both zero-shot methods are un-trainable and consequently sub-optimal\nwhen some few-shot out-of-distribution (OOD) training data is available.\nInspired by these prior works, we present two more flexible methods called\ndescriptor and word soups, which do not require an LLM at test time and can\nleverage training data to increase OOD target accuracy. Descriptor soup\ngreedily selects a small set of textual descriptors using generic few-shot\ntraining data, then calculates robust class embeddings using the selected\ndescriptors. Word soup greedily assembles a chain of words in a similar manner.\nCompared to existing few-shot soft prompt tuning methods, word soup requires\nfewer parameters by construction and less GPU memory, since it does not require\nbackpropagation. Both soups outperform current published few-shot methods, even\nwhen combined with SoTA zero-shot methods, on cross-dataset and domain\ngeneralization benchmarks. 
Compared with SoTA prompt and descriptor ensembling\nmethods, such as ProDA and WaffleCLIP, word soup achieves higher OOD accuracy\nwith fewer ensemble members. Please checkout our code:\ngithub.com/Chris210634/word_soups\n","authors":["Christopher Liao","Theodoros Tsiligkaridis","Brian Kulis"],"pdf_url":"https://arxiv.org/pdf/2311.13612v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20249v1","updated":"2024-03-29T15:54:36Z","published":"2024-03-29T15:54:36Z","title":"Relation Rectification in Diffusion Model","summary":" Despite their exceptional generative abilities, large text-to-image diffusion\nmodels, much like skilled but careless artists, often struggle with accurately\ndepicting visual relationships between objects. This issue, as we uncover\nthrough careful analysis, arises from a misaligned text encoder that struggles\nto interpret specific relationships and differentiate the logical order of\nassociated objects. To resolve this, we introduce a novel task termed Relation\nRectification, aiming to refine the model to accurately represent a given\nrelationship it initially fails to generate. To address this, we propose an\ninnovative solution utilizing a Heterogeneous Graph Convolutional Network\n(HGCN). It models the directional relationships between relation terms and\ncorresponding objects within the input prompts. Specifically, we optimize the\nHGCN on a pair of prompts with identical relational words but reversed object\norders, supplemented by a few reference images. The lightweight HGCN adjusts\nthe text embeddings generated by the text encoder, ensuring the accurate\nreflection of the textual relation in the embedding space. Crucially, our\nmethod retains the parameters of the text encoder and diffusion model,\npreserving the model's robust performance on unrelated descriptions. We\nvalidated our approach on a newly curated dataset of diverse relational data,\ndemonstrating both quantitative and qualitative enhancements in generating\nimages with precise visual relations. Project page:\nhttps://wuyinwei-hah.github.io/rrnet.github.io/.\n","authors":["Yinwei Wu","Xingyi Yang","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2403.20249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05269v2","updated":"2024-03-29T15:44:05Z","published":"2023-12-07T19:19:25Z","title":"LifelongMemory: Leveraging LLMs for Answering Queries in Long-form\n Egocentric Videos","summary":" In this paper we introduce LifelongMemory, a new framework for accessing\nlong-form egocentric videographic memory through natural language question\nanswering and retrieval. LifelongMemory generates concise video activity\ndescriptions of the camera wearer and leverages the zero-shot capabilities of\npretrained large language models to perform reasoning over long-form video\ncontext. Furthermore, Lifelong Memory uses a confidence and explanation module\nto produce confident, high-quality, and interpretable answers. Our approach\nachieves state-of-the-art performance on the EgoSchema benchmark for question\nanswering and is highly competitive on the natural language query (NLQ)\nchallenge of Ego4D. 
Code is available at\nhttps://github.com/Agentic-Learning-AI-Lab/lifelong-memory.\n","authors":["Ying Wang","Yanlai Yang","Mengye Ren"],"pdf_url":"https://arxiv.org/pdf/2312.05269v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.04119v2","updated":"2024-03-29T15:27:47Z","published":"2022-12-08T07:29:07Z","title":"DialogCC: An Automated Pipeline for Creating High-Quality Multi-Modal\n Dialogue Dataset","summary":" As sharing images in an instant message is a crucial factor, there has been\nactive research on learning an image-text multi-modal dialogue models. However,\ntraining a well-generalized multi-modal dialogue model remains challenging due\nto the low quality and limited diversity of images per dialogue in existing\nmulti-modal dialogue datasets. In this paper, we propose an automated pipeline\nto construct a multi-modal dialogue dataset, ensuring both dialogue quality and\nimage diversity without requiring minimum human effort. In our pipeline, to\nguarantee the coherence between images and dialogue, we prompt GPT-4 to infer\npotential image-sharing moments - specifically, the utterance, speaker,\nrationale, and image description. Furthermore, we leverage CLIP similarity to\nmaintain consistency between aligned multiple images to the utterance. Through\nthis pipeline, we introduce DialogCC, a high-quality and diverse multi-modal\ndialogue dataset that surpasses existing datasets in terms of quality and\ndiversity in human evaluation. Our comprehensive experiments highlight that\nwhen multi-modal dialogue models are trained using our dataset, their\ngeneralization performance on unseen dialogue datasets is significantly\nenhanced. We make our source code and dataset publicly available.\n","authors":["Young-Jun Lee","Byungsoo Ko","Han-Gyu Kim","Jonghwan Hyeon","Ho-Jin Choi"],"pdf_url":"https://arxiv.org/pdf/2212.04119v2.pdf","comment":"NAACL 2024"},{"id":"http://arxiv.org/abs/2403.20236v1","updated":"2024-03-29T15:26:44Z","published":"2024-03-29T15:26:44Z","title":"Long-Tailed Anomaly Detection with Learnable Class Names","summary":" Anomaly detection (AD) aims to identify defective images and localize their\ndefects (if any). Ideally, AD models should be able to detect defects over many\nimage classes; without relying on hard-coded class names that can be\nuninformative or inconsistent across datasets; learn without anomaly\nsupervision; and be robust to the long-tailed distributions of real-world\napplications. To address these challenges, we formulate the problem of\nlong-tailed AD by introducing several datasets with different levels of class\nimbalance and metrics for performance evaluation. We then propose a novel\nmethod, LTAD, to detect defects from multiple and long-tailed classes, without\nrelying on dataset class names. LTAD combines AD by reconstruction and semantic\nAD modules. AD by reconstruction is implemented with a transformer-based\nreconstruction module. Semantic AD is implemented with a binary classifier,\nwhich relies on learned pseudo class names and a pretrained foundation model.\nThese modules are learned over two phases. Phase 1 learns the pseudo-class\nnames and a variational autoencoder (VAE) for feature synthesis that augments\nthe training data to combat long-tails. Phase 2 then learns the parameters of\nthe reconstruction and classification modules of LTAD. 
Extensive experiments\nusing the proposed long-tailed datasets show that LTAD substantially\noutperforms the state-of-the-art methods for most forms of dataset imbalance.\nThe long-tailed dataset split is available at\nhttps://zenodo.org/records/10854201 .\n","authors":["Chih-Hui Ho","Kuan-Chuan Peng","Nuno Vasconcelos"],"pdf_url":"https://arxiv.org/pdf/2403.20236v1.pdf","comment":"This paper is accepted to CVPR 2024. The supplementary material is\n included. The long-tailed dataset split is available at\n https://zenodo.org/records/10854201"},{"id":"http://arxiv.org/abs/2403.20231v1","updated":"2024-03-29T15:20:34Z","published":"2024-03-29T15:20:34Z","title":"U-VAP: User-specified Visual Appearance Personalization via Decoupled\n Self Augmentation","summary":" Concept personalization methods enable large text-to-image models to learn\nspecific subjects (e.g., objects/poses/3D models) and synthesize renditions in\nnew contexts. Given that the image references are highly biased towards visual\nattributes, state-of-the-art personalization models tend to overfit the whole\nsubject and cannot disentangle visual characteristics in pixel space. In this\nstudy, we proposed a more challenging setting, namely fine-grained visual\nappearance personalization. Different from existing methods, we allow users to\nprovide a sentence describing the desired attributes. A novel decoupled\nself-augmentation strategy is proposed to generate target-related and\nnon-target samples to learn user-specified visual attributes. These augmented\ndata allow for refining the model's understanding of the target attribute while\nmitigating the impact of unrelated attributes. At the inference stage,\nadjustments are conducted on semantic space through the learned target and\nnon-target embeddings to further enhance the disentanglement of target\nattributes. Extensive experiments on various kinds of visual attributes with\nSOTA personalization methods show the ability of the proposed method to mimic\ntarget visual appearance in novel contexts, thus improving the controllability\nand flexibility of personalization.\n","authors":["You Wu","Kean Liu","Xiaoyue Mi","Fan Tang","Juan Cao","Jintao Li"],"pdf_url":"https://arxiv.org/pdf/2403.20231v1.pdf","comment":"14 pages, 13 figures, 2 tables"},{"id":"http://arxiv.org/abs/2303.06346v2","updated":"2024-03-29T15:10:29Z","published":"2023-03-11T08:42:54Z","title":"3DInAction: Understanding Human Actions in 3D Point Clouds","summary":" We propose a novel method for 3D point cloud action recognition.\nUnderstanding human actions in RGB videos has been widely studied in recent\nyears, however, its 3D point cloud counterpart remains under-explored. This is\nmostly due to the inherent limitation of the point cloud data modality -- lack\nof structure, permutation invariance, and varying number of points -- which\nmakes it difficult to learn a spatio-temporal representation. To address this\nlimitation, we propose the 3DinAction pipeline that first estimates patches\nmoving in time (t-patches) as a key building block, alongside a hierarchical\narchitecture that learns an informative spatio-temporal representation. We show\nthat our method achieves improved performance on existing datasets, including\nDFAUST and IKEA ASM. 
Code is publicly available at\nhttps://github.com/sitzikbs/3dincaction.\n","authors":["Yizhak Ben-Shabat","Oren Shrout","Stephen Gould"],"pdf_url":"https://arxiv.org/pdf/2303.06346v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20225v1","updated":"2024-03-29T15:08:37Z","published":"2024-03-29T15:08:37Z","title":"MTMMC: A Large-Scale Real-World Multi-Modal Camera Tracking Benchmark","summary":" Multi-target multi-camera tracking is a crucial task that involves\nidentifying and tracking individuals over time using video streams from\nmultiple cameras. This task has practical applications in various fields, such\nas visual surveillance, crowd behavior analysis, and anomaly detection.\nHowever, due to the difficulty and cost of collecting and labeling data,\nexisting datasets for this task are either synthetically generated or\nartificially constructed within a controlled camera network setting, which\nlimits their ability to model real-world dynamics and generalize to diverse\ncamera configurations. To address this issue, we present MTMMC, a real-world,\nlarge-scale dataset that includes long video sequences captured by 16\nmulti-modal cameras in two different environments - campus and factory - across\nvarious time, weather, and season conditions. This dataset provides a\nchallenging test-bed for studying multi-camera tracking under diverse\nreal-world complexities and includes an additional input modality of spatially\naligned and temporally synchronized RGB and thermal cameras, which enhances the\naccuracy of multi-camera tracking. MTMMC is a super-set of existing datasets,\nbenefiting independent fields such as person detection, re-identification, and\nmultiple object tracking. We provide baselines and new learning setups on this\ndataset and set the reference scores for future studies. The datasets, models,\nand test server will be made publicly available.\n","authors":["Sanghyun Woo","Kwanyong Park","Inkyu Shin","Myungchul Kim","In So Kweon"],"pdf_url":"https://arxiv.org/pdf/2403.20225v1.pdf","comment":"Accepted on CVPR 2024"},{"id":"http://arxiv.org/abs/2312.02216v2","updated":"2024-03-29T14:59:13Z","published":"2023-12-03T10:41:06Z","title":"DragVideo: Interactive Drag-style Video Editing","summary":" Video generation models have shown their superior ability to generate\nphoto-realistic video. However, how to accurately control (or edit) the video\nremains a formidable challenge. The main issues are: 1) how to perform direct\nand accurate user control in editing; 2) how to execute editings like changing\nshape, expression, and layout without unsightly distortion and artifacts to the\nedited content; and 3) how to maintain spatio-temporal consistency of video\nafter editing. To address the above issues, we propose DragVideo, a general\ndrag-style video editing framework. Inspired by DragGAN, DragVideo addresses\nissues 1) and 2) by proposing the drag-style video latent optimization method\nwhich gives desired control by updating noisy video latent according to drag\ninstructions through video-level drag objective function. We amend issue 3) by\nintegrating the video diffusion model with sample-specific LoRA and Mutual\nSelf-Attention in DragVideo to ensure the edited result is spatio-temporally\nconsistent. 
We also present a series of testing examples for drag-style video\nediting and conduct extensive experiments across a wide array of challenging\nediting tasks, such as motion, skeleton editing, etc, underscoring DragVideo\ncan edit video in an intuitive, faithful to the user's intention manner, with\nnearly unnoticeable distortion and artifacts, while maintaining spatio-temporal\nconsistency. While traditional prompt-based video editing fails to do the\nformer two and directly applying image drag editing fails in the last,\nDragVideo's versatility and generality are emphasized. Github link:\nhttps://github.com/RickySkywalker/DragVideo-Official.\n","authors":["Yufan Deng","Ruida Wang","Yuhao Zhang","Yu-Wing Tai","Chi-Keung Tang"],"pdf_url":"https://arxiv.org/pdf/2312.02216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12440v2","updated":"2024-03-29T14:55:50Z","published":"2024-03-19T04:54:59Z","title":"Self-learning Canonical Space for Multi-view 3D Human Pose Estimation","summary":" Multi-view 3D human pose estimation is naturally superior to single view one,\nbenefiting from more comprehensive information provided by images of multiple\nviews. The information includes camera poses, 2D/3D human poses, and 3D\ngeometry. However, the accurate annotation of these information is hard to\nobtain, making it challenging to predict accurate 3D human pose from multi-view\nimages. To deal with this issue, we propose a fully self-supervised framework,\nnamed cascaded multi-view aggregating network (CMANet), to construct a\ncanonical parameter space to holistically integrate and exploit multi-view\ninformation. In our framework, the multi-view information is grouped into two\ncategories: 1) intra-view information , 2) inter-view information. Accordingly,\nCMANet consists of two components: intra-view module (IRV) and inter-view\nmodule (IEV). IRV is used for extracting initial camera pose and 3D human pose\nof each view; IEV is to fuse complementary pose information and cross-view 3D\ngeometry for a final 3D human pose. To facilitate the aggregation of the intra-\nand inter-view, we define a canonical parameter space, depicted by per-view\ncamera pose and human pose and shape parameters ($\\theta$ and $\\beta$) of SMPL\nmodel, and propose a two-stage learning procedure. At first stage, IRV learns\nto estimate camera pose and view-dependent 3D human pose supervised by\nconfident output of an off-the-shelf 2D keypoint detector. At second stage, IRV\nis frozen and IEV further refines the camera pose and optimizes the 3D human\npose by implicitly encoding the cross-view complement and 3D geometry\nconstraint, achieved by jointly fitting predicted multi-view 2D keypoints. The\nproposed framework, modules, and learning strategy are demonstrated to be\neffective by comprehensive experiments and CMANet is superior to\nstate-of-the-art methods in extensive quantitative and qualitative analysis.\n","authors":["Xiaoben Li","Mancheng Meng","Ziyan Wu","Terrence Chen","Fan Yang","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2403.12440v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13612v2","updated":"2024-03-29T23:42:05Z","published":"2023-03-23T18:55:43Z","title":"NOPE: Novel Object Pose Estimation from a Single Image","summary":" The practicality of 3D object pose estimation remains limited for many\napplications due to the need for prior knowledge of a 3D model and a training\nperiod for new objects. 
To address this limitation, we propose an approach that\ntakes a single image of a new object as input and predicts the relative pose of\nthis object in new images without prior knowledge of the object's 3D model and\nwithout requiring training time for new objects and categories. We achieve this\nby training a model to directly predict discriminative embeddings for\nviewpoints surrounding the object. This prediction is done using a simple U-Net\narchitecture with attention and conditioned on the desired pose, which yields\nextremely fast inference. We compare our approach to state-of-the-art methods\nand show it outperforms them both in terms of accuracy and robustness. Our\nsource code is publicly available at https://github.com/nv-nguyen/nope\n","authors":["Van Nguyen Nguyen","Thibault Groueix","Yinlin Hu","Mathieu Salzmann","Vincent Lepetit"],"pdf_url":"https://arxiv.org/pdf/2303.13612v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2402.18528v2","updated":"2024-03-29T23:41:13Z","published":"2024-02-28T18:08:03Z","title":"Gradient Reweighting: Towards Imbalanced Class-Incremental Learning","summary":" Class-Incremental Learning (CIL) trains a model to continually recognize new\nclasses from non-stationary data while retaining learned knowledge. A major\nchallenge of CIL arises when applying to real-world data characterized by\nnon-uniform distribution, which introduces a dual imbalance problem involving\n(i) disparities between stored exemplars of old tasks and new class data\n(inter-phase imbalance), and (ii) severe class imbalances within each\nindividual task (intra-phase imbalance). We show that this dual imbalance issue\ncauses skewed gradient updates with biased weights in FC layers, thus inducing\nover/under-fitting and catastrophic forgetting in CIL. Our method addresses it\nby reweighting the gradients towards balanced optimization and unbiased\nclassifier learning. Additionally, we observe imbalanced forgetting where\nparadoxically the instance-rich classes suffer higher performance degradation\nduring CIL due to a larger amount of training data becoming unavailable in\nsubsequent learning phases. To tackle this, we further introduce a\ndistribution-aware knowledge distillation loss to mitigate forgetting by\naligning output logits proportionally with the distribution of lost training\ndata. We validate our method on CIFAR-100, ImageNetSubset, and Food101 across\nvarious evaluation protocols and demonstrate consistent improvements compared\nto existing works, showing great potential to apply CIL in real-world scenarios\nwith enhanced robustness and effectiveness.\n","authors":["Jiangpeng He","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2402.18528v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2401.10171v2","updated":"2024-03-29T22:47:59Z","published":"2024-01-18T18:01:19Z","title":"SHINOBI: Shape and Illumination using Neural Object Decomposition via\n BRDF Optimization In-the-wild","summary":" We present SHINOBI, an end-to-end framework for the reconstruction of shape,\nmaterial, and illumination from object images captured with varying lighting,\npose, and background. Inverse rendering of an object based on unconstrained\nimage collections is a long-standing challenge in computer vision and graphics\nand requires a joint optimization over shape, radiance, and pose. 
We show that\nan implicit shape representation based on a multi-resolution hash encoding\nenables faster and robust shape reconstruction with joint camera alignment\noptimization that outperforms prior work. Further, to enable the editing of\nillumination and object reflectance (i.e. material) we jointly optimize BRDF\nand illumination together with the object's shape. Our method is class-agnostic\nand works on in-the-wild image collections of objects to produce relightable 3D\nassets for several use cases such as AR/VR, movies, games, etc. Project page:\nhttps://shinobi.aengelhardt.com Video:\nhttps://www.youtube.com/watch?v=iFENQ6AcYd8&feature=youtu.be\n","authors":["Andreas Engelhardt","Amit Raj","Mark Boss","Yunzhi Zhang","Abhishek Kar","Yuanzhen Li","Deqing Sun","Ricardo Martin Brualla","Jonathan T. Barron","Hendrik P. A. Lensch","Varun Jampani"],"pdf_url":"https://arxiv.org/pdf/2401.10171v2.pdf","comment":"Accepted by IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR 2024). Updated supplementary material and acknowledgements"},{"id":"http://arxiv.org/abs/2403.17173v2","updated":"2024-03-29T22:46:03Z","published":"2024-03-25T20:39:58Z","title":"Task2Box: Box Embeddings for Modeling Asymmetric Task Relationships","summary":" Modeling and visualizing relationships between tasks or datasets is an\nimportant step towards solving various meta-tasks such as dataset discovery,\nmulti-tasking, and transfer learning. However, many relationships, such as\ncontainment and transferability, are naturally asymmetric and current\napproaches for representation and visualization (e.g., t-SNE) do not readily\nsupport this. We propose Task2Box, an approach to represent tasks using box\nembeddings -- axis-aligned hyperrectangles in low dimensional spaces -- that\ncan capture asymmetric relationships between them through volumetric overlaps.\nWe show that Task2Box accurately predicts unseen hierarchical relationships\nbetween nodes in ImageNet and iNaturalist datasets, as well as transferability\nbetween tasks in the Taskonomy benchmark. We also show that box embeddings\nestimated from task representations (e.g., CLIP, Task2Vec, or attribute based)\ncan be used to predict relationships between unseen tasks more accurately than\nclassifiers trained on the same representations, as well as handcrafted\nasymmetric distances (e.g., KL divergence). This suggests that low-dimensional\nbox embeddings can effectively capture these task relationships and have the\nadded advantage of being interpretable. We use the approach to visualize\nrelationships among publicly available image classification datasets on popular\ndataset hosting platform called Hugging Face.\n","authors":["Rangel Daroya","Aaron Sun","Subhransu Maji"],"pdf_url":"https://arxiv.org/pdf/2403.17173v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.08237v2","updated":"2024-03-29T22:29:03Z","published":"2023-01-19T18:54:43Z","title":"LoCoNet: Long-Short Context Network for Active Speaker Detection","summary":" Active Speaker Detection (ASD) aims to identify who is speaking in each frame\nof a video. ASD reasons from audio and visual information from two contexts:\nlong-term intra-speaker context and short-term inter-speaker context. Long-term\nintra-speaker context models the temporal dependencies of the same speaker,\nwhile short-term inter-speaker context models the interactions of speakers in\nthe same scene. These two contexts are complementary to each other and can help\ninfer the active speaker. 
Motivated by these observations, we propose LoCoNet,\na simple yet effective Long-Short Context Network that models the long-term\nintra-speaker context and short-term inter-speaker context. We use\nself-attention to model long-term intra-speaker context due to its\neffectiveness in modeling long-range dependencies, and convolutional blocks\nthat capture local patterns to model short-term inter-speaker context.\nExtensive experiments show that LoCoNet achieves state-of-the-art performance\non multiple datasets, achieving an mAP of 95.2%(+1.1%) on AVA-ActiveSpeaker,\n68.1%(+22%) on Columbia dataset, 97.2%(+2.8%) on Talkies dataset and\n59.7%(+8.0%) on Ego4D dataset. Moreover, in challenging cases where multiple\nspeakers are present, or face of active speaker is much smaller than other\nfaces in the same scene, LoCoNet outperforms previous state-of-the-art methods\nby 3.4% on the AVA-ActiveSpeaker dataset. The code will be released at\nhttps://github.com/SJTUwxz/LoCoNet_ASD.\n","authors":["Xizi Wang","Feng Cheng","Gedas Bertasius","David Crandall"],"pdf_url":"https://arxiv.org/pdf/2301.08237v2.pdf","comment":"accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19076v2","updated":"2024-03-29T21:33:39Z","published":"2024-03-28T00:34:56Z","title":"Tiny Machine Learning: Progress and Futures","summary":" Tiny Machine Learning (TinyML) is a new frontier of machine learning. By\nsqueezing deep learning models into billions of IoT devices and\nmicrocontrollers (MCUs), we expand the scope of AI applications and enable\nubiquitous intelligence. However, TinyML is challenging due to hardware\nconstraints: the tiny memory resource makes it difficult to hold deep learning\nmodels designed for cloud and mobile platforms. There is also limited compiler\nand inference engine support for bare-metal devices. Therefore, we need to\nco-design the algorithm and system stack to enable TinyML. In this review, we\nwill first discuss the definition, challenges, and applications of TinyML. We\nthen survey the recent progress in TinyML and deep learning on MCUs. Next, we\nwill introduce MCUNet, showing how we can achieve ImageNet-scale AI\napplications on IoT devices with system-algorithm co-design. We will further\nextend the solution from inference to training and introduce tiny on-device\ntraining techniques. Finally, we present future directions in this area.\nToday's large model might be tomorrow's tiny model. The scope of TinyML should\nevolve and adapt over time.\n","authors":["Ji Lin","Ligeng Zhu","Wei-Ming Chen","Wei-Chen Wang","Song Han"],"pdf_url":"https://arxiv.org/pdf/2403.19076v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2206.15472"},{"id":"http://arxiv.org/abs/2403.08848v2","updated":"2024-03-29T21:22:06Z","published":"2024-03-13T16:57:04Z","title":"FocusMAE: Gallbladder Cancer Detection from Ultrasound Videos with\n Focused Masked Autoencoders","summary":" In recent years, automated Gallbladder Cancer (GBC) detection has gained the\nattention of researchers. Current state-of-the-art (SOTA) methodologies relying\non ultrasound sonography (US) images exhibit limited generalization,\nemphasizing the need for transformative approaches. We observe that individual\nUS frames may lack sufficient information to capture disease manifestation.\nThis study advocates for a paradigm shift towards video-based GBC detection,\nleveraging the inherent advantages of spatiotemporal representations. 
Employing\nthe Masked Autoencoder (MAE) for representation learning, we address\nshortcomings in conventional image-based methods. We propose a novel design\ncalled FocusMAE to systematically bias the selection of masking tokens from\nhigh-information regions, fostering a more refined representation of\nmalignancy. Additionally, we contribute the most extensive US video dataset for\nGBC detection. We also note that, this is the first study on US video-based GBC\ndetection. We validate the proposed methods on the curated dataset, and report\na new state-of-the-art (SOTA) accuracy of 96.4% for the GBC detection problem,\nagainst an accuracy of 84% by current Image-based SOTA - GBCNet, and RadFormer,\nand 94.7% by Video-based SOTA - AdaMAE. We further demonstrate the generality\nof the proposed FocusMAE on a public CT-based Covid detection dataset,\nreporting an improvement in accuracy by 3.3% over current baselines. The source\ncode and pretrained models are available at:\nhttps://gbc-iitd.github.io/focusmae\n","authors":["Soumen Basu","Mayuna Gupta","Chetan Madan","Pankaj Gupta","Chetan Arora"],"pdf_url":"https://arxiv.org/pdf/2403.08848v2.pdf","comment":"To Appear at CVPR 2024"},{"id":"http://arxiv.org/abs/2402.01858v2","updated":"2024-03-29T21:18:37Z","published":"2024-02-02T19:28:33Z","title":"Explaining latent representations of generative models with large\n multimodal models","summary":" Learning interpretable representations of data generative latent factors is\nan important topic for the development of artificial intelligence. With the\nrise of the large multimodal model, it can align images with text to generate\nanswers. In this work, we propose a framework to comprehensively explain each\nlatent variable in the generative models using a large multimodal model. We\nfurther measure the uncertainty of our generated explanations, quantitatively\nevaluate the performance of explanation generation among multiple large\nmultimodal models, and qualitatively visualize the variations of each latent\nvariable to learn the disentanglement effects of different generative models on\nexplanations. Finally, we discuss the explanatory capabilities and limitations\nof state-of-the-art large multimodal models.\n","authors":["Mengdan Zhu","Zhenke Liu","Bo Pan","Abhinav Angirekula","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.01858v2.pdf","comment":"ICLR 2024 Workshop Paper on Reliable and Responsible Foundation\n Models"},{"id":"http://arxiv.org/abs/2311.10696v2","updated":"2024-03-29T20:17:29Z","published":"2023-11-17T18:28:32Z","title":"Versatile Medical Image Segmentation Learned from Multi-Source Datasets\n via Model Self-Disambiguation","summary":" A versatile medical image segmentation model applicable to images acquired\nwith diverse equipment and protocols can facilitate model deployment and\nmaintenance. However, building such a model typically demands a large, diverse,\nand fully annotated dataset, which is challenging to obtain due to the\nlabor-intensive nature of data curation. To address this challenge, we propose\na cost-effective alternative that harnesses multi-source data with only partial\nor sparse segmentation labels for training, substantially reducing the cost of\ndeveloping a versatile model. 
We devise strategies for model\nself-disambiguation, prior knowledge incorporation, and imbalance mitigation to\ntackle challenges associated with inconsistently labeled multi-source data,\nincluding label ambiguity and modality, dataset, and class imbalances.\nExperimental results on a multi-modal dataset compiled from eight different\nsources for abdominal structure segmentation have demonstrated the\neffectiveness and superior performance of our method compared to\nstate-of-the-art alternative approaches. We anticipate that its cost-saving\nfeatures, which optimize the utilization of existing annotated data and reduce\nannotation efforts for new data, will have a significant impact in the field.\n","authors":["Xiaoyang Chen","Hao Zheng","Yuemeng Li","Yuncong Ma","Liang Ma","Hongming Li","Yong Fan"],"pdf_url":"https://arxiv.org/pdf/2311.10696v2.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2201.09929v3","updated":"2024-03-29T20:10:38Z","published":"2022-01-24T19:37:04Z","title":"Euclidean and Affine Curve Reconstruction","summary":" We consider practical aspects of reconstructing planar curves with prescribed\nEuclidean or affine curvatures. These curvatures are invariant under the\nspecial Euclidean group and the equi-affine groups, respectively, and play an\nimportant role in computer vision and shape analysis. We discuss and implement\nalgorithms for such reconstruction, and give estimates on how close\nreconstructed curves are relative to the closeness of their curvatures in\nappropriate metrics. Several illustrative examples are provided.\n","authors":["Jose Agudelo","Brooke Dippold","Ian Klein","Alex Kokot","Eric Geiger","Irina Kogan"],"pdf_url":"https://arxiv.org/pdf/2201.09929v3.pdf","comment":"This paper is a result of an REU project conducted at the North\n Carolina State University in the Summer and Fall 2020. This version has\n several minor corrections"},{"id":"http://arxiv.org/abs/2312.03816v3","updated":"2024-03-29T19:50:38Z","published":"2023-12-06T18:56:14Z","title":"AVID: Any-Length Video Inpainting with Diffusion Model","summary":" Recent advances in diffusion models have successfully enabled text-guided\nimage inpainting. While it seems straightforward to extend such editing\ncapability into the video domain, there have been fewer works regarding\ntext-guided video inpainting. Given a video, a masked region at its initial\nframe, and an editing prompt, it requires a model to do infilling at each frame\nfollowing the editing guidance while keeping the out-of-mask region intact.\nThere are three main challenges in text-guided video inpainting: ($i$) temporal\nconsistency of the edited video, ($ii$) supporting different inpainting types\nat different structural fidelity levels, and ($iii$) dealing with variable\nvideo length. To address these challenges, we introduce Any-Length Video\nInpainting with Diffusion Model, dubbed as AVID. At its core, our model is\nequipped with effective motion modules and adjustable structure guidance, for\nfixed-length video inpainting. Building on top of that, we propose a novel\nTemporal MultiDiffusion sampling pipeline with a middle-frame attention\nguidance mechanism, facilitating the generation of videos with any desired\nduration. 
Our comprehensive experiments show our model can robustly deal with\nvarious inpainting types at different video duration ranges, with high quality.\nMore visualization results are made publicly available at\nhttps://zhang-zx.github.io/AVID/ .\n","authors":["Zhixing Zhang","Bichen Wu","Xiaoyan Wang","Yaqiao Luo","Luxin Zhang","Yinan Zhao","Peter Vajda","Dimitris Metaxas","Licheng Yu"],"pdf_url":"https://arxiv.org/pdf/2312.03816v3.pdf","comment":"Project website: https://zhang-zx.github.io/AVID/"},{"id":"http://arxiv.org/abs/2402.07245v2","updated":"2024-03-29T19:47:50Z","published":"2024-02-11T17:09:21Z","title":"Semi-Mamba-UNet: Pixel-Level Contrastive and Pixel-Level\n Cross-Supervised Visual Mamba-based UNet for Semi-Supervised Medical Image\n Segmentation","summary":" Medical image segmentation is essential in diagnostics, treatment planning,\nand healthcare, with deep learning offering promising advancements. Notably,\nConvolutional Neural Network (CNN) excel in capturing local image features,\nwhereas Vision Transformer (ViT) adeptly model long-range dependencies through\nmulti-head self-attention mechanisms. Despite their strengths, both CNN and ViT\nface challenges in efficiently processing long-range dependencies within\nmedical images, often requiring substantial computational resources. This\nissue, combined with the high cost and limited availability of expert\nannotations, poses significant obstacles to achieving precise segmentation. To\naddress these challenges, this paper introduces the Semi-Mamba-UNet, which\nintegrates a visual mamba-based UNet architecture with a conventional UNet into\na semi-supervised learning (SSL) framework. This innovative SSL approach\nleverages dual networks to jointly generate pseudo labels and cross supervise\neach other, drawing inspiration from consistency regularization techniques.\nFurthermore, we introduce a self-supervised pixel-level contrastive learning\nstrategy, employing a projector pair to further enhance feature learning\ncapabilities. Our comprehensive evaluation on a publicly available MRI cardiac\nsegmentation dataset, comparing against various SSL frameworks with different\nUNet-based segmentation networks, highlights the superior performance of\nSemi-Mamba-UNet. The source code has been made publicly accessible.\n","authors":["Chao Ma","Ziyang Wang"],"pdf_url":"https://arxiv.org/pdf/2402.07245v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12194v2","updated":"2024-03-29T19:37:18Z","published":"2023-11-20T21:20:37Z","title":"DiffAvatar: Simulation-Ready Garment Optimization with Differentiable\n Simulation","summary":" The realism of digital avatars is crucial in enabling telepresence\napplications with self-expression and customization. While physical simulations\ncan produce realistic motions for clothed humans, they require high-quality\ngarment assets with associated physical parameters for cloth simulations.\nHowever, manually creating these assets and calibrating their parameters is\nlabor-intensive and requires specialized expertise. Current methods focus on\nreconstructing geometry, but don't generate complete assets for physics-based\napplications. To address this gap, we propose \\papername,~a novel approach that\nperforms body and garment co-optimization using differentiable simulation. 
By\nintegrating physical simulation into the optimization loop and accounting for\nthe complex nonlinear behavior of cloth and its intricate interaction with the\nbody, our framework recovers body and garment geometry and extracts important\nmaterial parameters in a physically plausible way. Our experiments demonstrate\nthat our approach generates realistic clothing and body shape suitable for\ndownstream applications. We provide additional insights and results on our\nwebpage: https://people.csail.mit.edu/liyifei/publication/diffavatar/\n","authors":["Yifei Li","Hsiao-yu Chen","Egor Larionov","Nikolaos Sarafianos","Wojciech Matusik","Tuur Stuyck"],"pdf_url":"https://arxiv.org/pdf/2311.12194v2.pdf","comment":"CVPR 2024; Project page:\n https://people.csail.mit.edu/liyifei/publication/diffavatar/"},{"id":"http://arxiv.org/abs/2403.11821v2","updated":"2024-03-29T19:27:23Z","published":"2024-03-18T14:24:20Z","title":"Evaluating Text-to-Image Synthesis: Survey and Taxonomy of Image Quality\n Metrics","summary":" Recent advances in text-to-image synthesis enabled through a combination of\nlanguage and vision foundation models have led to a proliferation of the tools\navailable and an increased attention to the field. When conducting\ntext-to-image synthesis, a central goal is to ensure that the content between\ntext and image is aligned. As such, there exist numerous evaluation metrics\nthat aim to mimic human judgement. However, it is often unclear which metric to\nuse for evaluating text-to-image synthesis systems as their evaluation is\nhighly nuanced. In this work, we provide a comprehensive overview of existing\ntext-to-image evaluation metrics. Based on our findings, we propose a new\ntaxonomy for categorizing these metrics. Our taxonomy is grounded in the\nassumption that there are two main quality criteria, namely compositionality\nand generality, which ideally map to human preferences. Ultimately, we derive\nguidelines for practitioners conducting text-to-image evaluation, discuss open\nchallenges of evaluation mechanisms, and surface limitations of current\nmetrics.\n","authors":["Sebastian Hartwig","Dominik Engel","Leon Sick","Hannah Kniesel","Tristan Payer","Poonam Poonam","Michael Glöckler","Alex Bäuerle","Timo Ropinski"],"pdf_url":"https://arxiv.org/pdf/2403.11821v2.pdf","comment":"preprint, 21 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2403.15571v2","updated":"2024-03-29T19:14:36Z","published":"2024-03-22T18:52:10Z","title":"Augmented Reality Warnings in Roadway Work Zones: Evaluating the Effect\n of Modality on Worker Reaction Times","summary":" Given the aging highway infrastructure requiring extensive rebuilding and\nenhancements, and the consequent rise in the number of work zones, there is an\nurgent need to develop advanced safety systems to protect workers. While\nAugmented Reality (AR) holds significant potential for delivering warnings to\nworkers, its integration into roadway work zones remains relatively unexplored.\nThe primary objective of this study is to improve safety measures within\nroadway work zones by conducting an extensive analysis of how different\ncombinations of multimodal AR warnings influence the reaction times of workers.\nThis paper addresses this gap through a series of experiments that aim to\nreplicate the distinctive conditions of roadway work zones, both in real-world\nand virtual reality environments. 
Our approach comprises three key components:\nan advanced AR system prototype, a VR simulation of AR functionality within the\nwork zone environment, and the Wizard of Oz technique to synchronize user\nexperiences across experiments. To assess reaction times, we leverage both the\nsimple reaction time (SRT) technique and an innovative vision-based metric that\nutilizes real-time pose estimation. By conducting five experiments in\ncontrolled outdoor work zones and indoor VR settings, our study provides\nvaluable information on how various multimodal AR warnings impact workers\nreaction times. Furthermore, our findings reveal the disparities in reaction\ntimes between VR simulations and real-world scenarios, thereby gauging VR's\ncapability to mirror the dynamics of roadway work zones. Furthermore, our\nresults substantiate the potential and reliability of vision-based reaction\ntime measurements. These insights resonate well with those derived using the\nSRT technique, underscoring the viability of this approach for tangible\nreal-world uses.\n","authors":["Sepehr Sabeti","Fatemeh Banani Ardecani","Omidreza Shoghli"],"pdf_url":"https://arxiv.org/pdf/2403.15571v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01482v2","updated":"2024-03-29T18:52:59Z","published":"2024-01-03T01:11:16Z","title":"Incorporating Geo-Diverse Knowledge into Prompting for Increased\n Geographical Robustness in Object Recognition","summary":" Existing object recognition models have been shown to lack robustness in\ndiverse geographical scenarios due to domain shifts in design and context.\nClass representations need to be adapted to more accurately reflect an object\nconcept under these shifts. In the absence of training data from target\ngeographies, we hypothesize that geographically diverse descriptive knowledge\nof categories can enhance robustness. For this purpose, we explore the\nfeasibility of probing a large language model for geography-based object\nknowledge, and we examine the effects of integrating knowledge into zero-shot\nand learnable soft prompting with CLIP. Within this exploration, we propose\ngeography knowledge regularization to ensure that soft prompts trained on a\nsource set of geographies generalize to an unseen target set. Accuracy gains\nover prompting baselines on DollarStreet while training only on Europe data are\nup to +2.8/1.2/1.6 on target data from Africa/Asia/Americas, and +4.6 overall\non the hardest classes. Competitive performance is shown vs. few-shot target\ntraining, and analysis is provided to direct future study of geographical\nrobustness.\n","authors":["Kyle Buettner","Sina Malakouti","Xiang Lorraine Li","Adriana Kovashka"],"pdf_url":"https://arxiv.org/pdf/2401.01482v2.pdf","comment":"To appear in IEEE/CVF Computer Vision and Pattern Recognition\n Conference (CVPR), 2024"},{"id":"http://arxiv.org/abs/2403.17801v2","updated":"2024-03-29T18:45:35Z","published":"2024-03-26T15:40:05Z","title":"Towards 3D Vision with Low-Cost Single-Photon Cameras","summary":" We present a method for reconstructing 3D shape of arbitrary Lambertian\nobjects based on measurements by miniature, energy-efficient, low-cost\nsingle-photon cameras. These cameras, operating as time resolved image sensors,\nilluminate the scene with a very fast pulse of diffuse light and record the\nshape of that pulse as it returns back from the scene at a high temporal\nresolution. 
We propose to model this image formation process, account for its\nnon-idealities, and adapt neural rendering to reconstruct 3D geometry from a\nset of spatially distributed sensors with known poses. We show that our\napproach can successfully recover complex 3D shapes from simulated data. We\nfurther demonstrate 3D object reconstruction from real-world captures,\nutilizing measurements from a commodity proximity sensor. Our work draws a\nconnection between image-based modeling and active range scanning and is a step\ntowards 3D vision with single-photon cameras.\n","authors":["Fangzhou Mu","Carter Sifferman","Sacha Jungerman","Yiquan Li","Mark Han","Michael Gleicher","Mohit Gupta","Yin Li"],"pdf_url":"https://arxiv.org/pdf/2403.17801v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00604v2","updated":"2024-03-29T18:33:30Z","published":"2023-12-31T23:04:25Z","title":"SteinDreamer: Variance Reduction for Text-to-3D Score Distillation via\n Stein Identity","summary":" Score distillation has emerged as one of the most prevalent approaches for\ntext-to-3D asset synthesis. Essentially, score distillation updates 3D\nparameters by lifting and back-propagating scores averaged over different\nviews. In this paper, we reveal that the gradient estimation in score\ndistillation is inherent to high variance. Through the lens of variance\nreduction, the effectiveness of SDS and VSD can be interpreted as applications\nof various control variates to the Monte Carlo estimator of the distilled\nscore. Motivated by this rethinking and based on Stein's identity, we propose a\nmore general solution to reduce variance for score distillation, termed Stein\nScore Distillation (SSD). SSD incorporates control variates constructed by\nStein identity, allowing for arbitrary baseline functions. This enables us to\ninclude flexible guidance priors and network architectures to explicitly\noptimize for variance reduction. In our experiments, the overall pipeline,\ndubbed SteinDreamer, is implemented by instantiating the control variate with a\nmonocular depth estimator. The results suggest that SSD can effectively reduce\nthe distillation variance and consistently improve visual quality for both\nobject- and scene-level generation. Moreover, we demonstrate that SteinDreamer\nachieves faster convergence than existing methods due to more stable gradient\nupdates.\n","authors":["Peihao Wang","Zhiwen Fan","Dejia Xu","Dilin Wang","Sreyas Mohan","Forrest Iandola","Rakesh Ranjan","Yilei Li","Qiang Liu","Zhangyang Wang","Vikas Chandra"],"pdf_url":"https://arxiv.org/pdf/2401.00604v2.pdf","comment":"Project page: https://vita-group.github.io/SteinDreamer/"},{"id":"http://arxiv.org/abs/2307.08919v3","updated":"2024-03-29T18:19:36Z","published":"2023-07-18T01:31:47Z","title":"Systematic comparison of semi-supervised and self-supervised learning\n for medical image classification","summary":" In typical medical image classification problems, labeled data is scarce\nwhile unlabeled data is more available. Semi-supervised learning and\nself-supervised learning are two different research directions that can improve\naccuracy by learning from extra unlabeled data. Recent methods from both\ndirections have reported significant gains on traditional benchmarks. Yet past\nbenchmarks do not focus on medical tasks and rarely compare self- and semi-\nmethods together on an equal footing. Furthermore, past benchmarks often handle\nhyperparameter tuning suboptimally. 
First, they may not tune hyperparameters at\nall, leading to underfitting. Second, when tuning does occur, it often\nunrealistically uses a labeled validation set that is much larger than the\ntraining set. Therefore currently published rankings might not always\ncorroborate with their practical utility This study contributes a systematic\nevaluation of self- and semi- methods with a unified experimental protocol\nintended to guide a practitioner with scarce overall labeled data and a limited\ncompute budget. We answer two key questions: Can hyperparameter tuning be\neffective with realistic-sized validation sets? If so, when all methods are\ntuned well, which self- or semi-supervised methods achieve the best accuracy?\nOur study compares 13 representative semi- and self-supervised methods to\nstrong labeled-set-only baselines on 4 medical datasets. From 20000+ GPU hours\nof computation, we provide valuable best practices to resource-constrained\npractitioners: hyperparameter tuning is effective, and the semi-supervised\nmethod known as MixMatch delivers the most reliable gains across 4 datasets.\n","authors":["Zhe Huang","Ruijie Jiang","Shuchin Aeron","Michael C. Hughes"],"pdf_url":"https://arxiv.org/pdf/2307.08919v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18784v2","updated":"2024-03-29T18:16:29Z","published":"2024-03-27T17:32:04Z","title":"SplatFace: Gaussian Splat Face Reconstruction Leveraging an Optimizable\n Surface","summary":" We present SplatFace, a novel Gaussian splatting framework designed for 3D\nhuman face reconstruction without reliance on accurate pre-determined geometry.\nOur method is designed to simultaneously deliver both high-quality novel view\nrendering and accurate 3D mesh reconstructions. We incorporate a generic 3D\nMorphable Model (3DMM) to provide a surface geometric structure, making it\npossible to reconstruct faces with a limited set of input images. We introduce\na joint optimization strategy that refines both the Gaussians and the morphable\nsurface through a synergistic non-rigid alignment process. A novel distance\nmetric, splat-to-surface, is proposed to improve alignment by considering both\nthe Gaussian position and covariance. The surface information is also utilized\nto incorporate a world-space densification process, resulting in superior\nreconstruction quality. Our experimental analysis demonstrates that the\nproposed method is competitive with both other Gaussian splatting techniques in\nnovel view synthesis and other 3D reconstruction methods in producing 3D face\nmeshes with high geometric precision.\n","authors":["Jiahao Luo","Jing Liu","James Davis"],"pdf_url":"https://arxiv.org/pdf/2403.18784v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00909v2","updated":"2024-03-29T18:04:37Z","published":"2023-12-31T22:47:06Z","title":"Taming Mode Collapse in Score Distillation for Text-to-3D Generation","summary":" Despite the remarkable performance of score distillation in text-to-3D\ngeneration, such techniques notoriously suffer from view inconsistency issues,\nalso known as \"Janus\" artifact, where the generated objects fake each view with\nmultiple front faces. Although empirically effective methods have approached\nthis problem via score debiasing or prompt engineering, a more rigorous\nperspective to explain and tackle this problem remains elusive. 
In this paper,\nwe reveal that the existing score distillation-based text-to-3D generation\nframeworks degenerate to maximal likelihood seeking on each view independently\nand thus suffer from the mode collapse problem, manifesting as the Janus\nartifact in practice. To tame mode collapse, we improve score distillation by\nre-establishing the entropy term in the corresponding variational objective,\nwhich is applied to the distribution of rendered images. Maximizing the entropy\nencourages diversity among different views in generated 3D assets, thereby\nmitigating the Janus problem. Based on this new objective, we derive a new\nupdate rule for 3D score distillation, dubbed Entropic Score Distillation\n(ESD). We theoretically reveal that ESD can be simplified and implemented by\njust adopting the classifier-free guidance trick upon variational score\ndistillation. Although embarrassingly straightforward, our extensive\nexperiments successfully demonstrate that ESD can be an effective treatment for\nJanus artifacts in score distillation.\n","authors":["Peihao Wang","Dejia Xu","Zhiwen Fan","Dilin Wang","Sreyas Mohan","Forrest Iandola","Rakesh Ranjan","Yilei Li","Qiang Liu","Zhangyang Wang","Vikas Chandra"],"pdf_url":"https://arxiv.org/pdf/2401.00909v2.pdf","comment":"Project page: https://vita-group.github.io/3D-Mode-Collapse/"},{"id":"http://arxiv.org/abs/2401.06407v2","updated":"2024-03-29T18:02:27Z","published":"2024-01-12T07:04:44Z","title":"UAV-Borne Mapping Algorithms for Low-Altitude and High-Speed Drone\n Applications","summary":" This article presents an analysis of current state-of-the-art sensors and how\nthese sensors work with several mapping algorithms for UAV (Unmanned Aerial\nVehicle) applications, focusing on low-altitude and high-speed scenarios. A new\nexperimental construct is created using highly realistic environments made\npossible by integrating the AirSim simulator with Google 3D maps models using\nthe Cesium Tiles plugin. Experiments are conducted in this high-realism\nsimulated environment to evaluate the performance of three distinct mapping\nalgorithms: (1) Direct Sparse Odometry (DSO), (2) Stereo DSO (SDSO), and (3)\nDSO Lite (DSOL). Experimental results evaluate algorithms based on their\nmeasured geometric accuracy and computational speed. The results provide\nvaluable insights into the strengths and limitations of each algorithm.\nFindings quantify compromises in UAV algorithm selection, allowing researchers\nto find the mapping solution best suited to their application, which often\nrequires a compromise between computational performance and the density and\naccuracy of geometric map estimates. Results indicate that for UAVs with\nrestrictive computing resources, DSOL is the best option. For systems with\npayload capacity and modest compute resources, SDSO is the best option. If only\none camera is available, DSO is the option to choose for applications that\nrequire dense mapping results.\n","authors":["Jincheng Zhang","Artur Wolek","Andrew R. Willis"],"pdf_url":"https://arxiv.org/pdf/2401.06407v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00762v2","updated":"2024-03-29T18:02:03Z","published":"2024-03-01T18:59:03Z","title":"Point Cloud Mamba: Point Cloud Learning via State Space Model","summary":" In this work, for the first time, we demonstrate that Mamba-based point cloud\nmethods can outperform point-based methods. 
Mamba exhibits strong global\nmodeling capabilities and linear computational complexity, making it highly\nattractive for point cloud analysis. To enable more effective processing of 3-D\npoint cloud data by Mamba, we propose a novel Consistent Traverse Serialization\nto convert point clouds into 1-D point sequences while ensuring that\nneighboring points in the sequence are also spatially adjacent. Consistent\nTraverse Serialization yields six variants by permuting the order of x, y, and\nz coordinates, and the synergistic use of these variants aids Mamba in\ncomprehensively observing point cloud data. Furthermore, to assist Mamba in\nhandling point sequences with different orders more effectively, we introduce\npoint prompts to inform Mamba of the sequence's arrangement rules. Finally, we\npropose positional encoding based on spatial coordinate mapping to inject\npositional information into point cloud sequences better. Based on these\nimprovements, we construct a point cloud network named Point Cloud Mamba, which\ncombines local and global modeling. Point Cloud Mamba surpasses the SOTA\npoint-based method PointNeXt and achieves new SOTA performance on the\nScanObjectNN, ModelNet40, and ShapeNetPart datasets.\n","authors":["Tao Zhang","Xiangtai Li","Haobo Yuan","Shunping Ji","Shuicheng Yan"],"pdf_url":"https://arxiv.org/pdf/2403.00762v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20213v1","updated":"2024-03-29T14:50:43Z","published":"2024-03-29T14:50:43Z","title":"H2RSVLM: Towards Helpful and Honest Remote Sensing Large Vision Language\n Model","summary":" The generic large Vision-Language Models (VLMs) is rapidly developing, but\nstill perform poorly in Remote Sensing (RS) domain, which is due to the unique\nand specialized nature of RS imagery and the comparatively limited spatial\nperception of current VLMs. Existing Remote Sensing specific Vision Language\nModels (RSVLMs) still have considerable potential for improvement, primarily\nowing to the lack of large-scale, high-quality RS vision-language datasets. We\nconstructed HqDC-1.4M, the large scale High quality and Detailed Captions for\nRS images, containing 1.4 million image-caption pairs, which not only enhance\nthe RSVLM's understanding of RS images but also significantly improve the\nmodel's spatial perception abilities, such as localization and counting,\nthereby increasing the helpfulness of the RSVLM. Moreover, to address the\ninevitable \"hallucination\" problem in RSVLM, we developed RSSA, the first\ndataset aimed at enhancing the Self-Awareness capability of RSVLMs. By\nincorporating a variety of unanswerable questions into typical RS visual\nquestion-answering tasks, RSSA effectively improves the truthfulness and\nreduces the hallucinations of the model's outputs, thereby enhancing the\nhonesty of the RSVLM. Based on these datasets, we proposed the H2RSVLM, the\nHelpful and Honest Remote Sensing Vision Language Model. H2RSVLM has achieved\noutstanding performance on multiple RS public datasets and is capable of\nrecognizing and refusing to answer the unanswerable questions, effectively\nmitigating the incorrect generations. 
We will release the code, data and model\nweights at https://github.com/opendatalab/H2RSVLM .\n","authors":["Chao Pang","Jiang Wu","Jiayu Li","Yi Liu","Jiaxing Sun","Weijia Li","Xingxing Weng","Shuai Wang","Litong Feng","Gui-Song Xia","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2403.20213v1.pdf","comment":"Equal contribution: Chao Pang, Jiang Wu; Corresponding author:\n Gui-Song Xia, Conghui He"},{"id":"http://arxiv.org/abs/2403.10897v2","updated":"2024-03-29T14:49:11Z","published":"2024-03-16T11:21:24Z","title":"Rethinking Multi-view Representation Learning via Distilled\n Disentangling","summary":" Multi-view representation learning aims to derive robust representations that\nare both view-consistent and view-specific from diverse data sources. This\npaper presents an in-depth analysis of existing approaches in this domain,\nhighlighting a commonly overlooked aspect: the redundancy between\nview-consistent and view-specific representations. To this end, we propose an\ninnovative framework for multi-view representation learning, which incorporates\na technique we term 'distilled disentangling'. Our method introduces the\nconcept of masked cross-view prediction, enabling the extraction of compact,\nhigh-quality view-consistent representations from various sources without\nincurring extra computational overhead. Additionally, we develop a distilled\ndisentangling module that efficiently filters out consistency-related\ninformation from multi-view representations, resulting in purer view-specific\nrepresentations. This approach significantly reduces redundancy between\nview-consistent and view-specific representations, enhancing the overall\nefficiency of the learning process. Our empirical evaluations reveal that\nhigher mask ratios substantially improve the quality of view-consistent\nrepresentations. Moreover, we find that reducing the dimensionality of\nview-consistent representations relative to that of view-specific\nrepresentations further refines the quality of the combined representations.\nOur code is accessible at: https://github.com/Guanzhou-Ke/MRDD.\n","authors":["Guanzhou Ke","Bo Wang","Xiaoli Wang","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2403.10897v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.12931v2","updated":"2024-03-29T14:48:42Z","published":"2024-03-19T17:34:27Z","title":"You Only Sample Once: Taming One-Step Text-To-Image Synthesis by\n Self-Cooperative Diffusion GANs","summary":" We introduce YOSO, a novel generative model designed for rapid, scalable, and\nhigh-fidelity one-step image synthesis. This is achieved by integrating the\ndiffusion process with GANs. Specifically, we smooth the distribution by the\ndenoising generator itself, performing self-cooperative learning. We show that\nour method can serve as a one-step generation model training from scratch with\ncompetitive performance. Moreover, we show that our method can be extended to\nfinetune pre-trained text-to-image diffusion for high-quality one-step\ntext-to-image synthesis even with LoRA fine-tuning. In particular, we provide\nthe first diffusion transformer that can generate images in one step trained on\n512 resolution, with the capability of adapting to 1024 resolution without\nexplicit training. 
Our code is provided at https://github.com/Luo-Yihong/YOSO.\n","authors":["Yihong Luo","Xiaolong Chen","Jing Tang"],"pdf_url":"https://arxiv.org/pdf/2403.12931v2.pdf","comment":"Early version"},{"id":"http://arxiv.org/abs/2403.11371v4","updated":"2024-03-29T14:19:56Z","published":"2024-03-17T23:29:41Z","title":"V2X-DGW: Domain Generalization for Multi-agent Perception under Adverse\n Weather Conditions","summary":" Current LiDAR-based Vehicle-to-Everything (V2X) multi-agent perception\nsystems have shown the significant success on 3D object detection. While these\nmodels perform well in the trained clean weather, they struggle in unseen\nadverse weather conditions with the real-world domain gap. In this paper, we\npropose a domain generalization approach, named V2X-DGW, for LiDAR-based 3D\nobject detection on multi-agent perception system under adverse weather\nconditions. Not only in the clean weather does our research aim to ensure\nfavorable multi-agent performance, but also in the unseen adverse weather\nconditions by learning only on the clean weather data. To advance research in\nthis area, we have simulated the impact of three prevalent adverse weather\nconditions on two widely-used multi-agent datasets, resulting in the creation\nof two novel benchmark datasets: OPV2V-w and V2XSet-w.\n To this end, we first introduce the Adaptive Weather Augmentation (AWA) to\nmimic the unseen adverse weather conditions, and then propose two alignments\nfor generalizable representation learning: Trust-region Weather-invariant\nAlignment (TWA) and Agent-aware Contrastive Alignment (ACA). Extensive\nexperimental results demonstrate that our V2X-DGW achieved improvements in the\nunseen adverse weather conditions.\n","authors":["Baolu Li","Jinlong Li","Xinyu Liu","Runsheng Xu","Zhengzhong Tu","Jiacheng Guo","Xiaopeng Li","Hongkai Yu"],"pdf_url":"https://arxiv.org/pdf/2403.11371v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20195v1","updated":"2024-03-29T14:17:30Z","published":"2024-03-29T14:17:30Z","title":"Enhancing Lithological Mapping with Spatially Constrained Bayesian\n Network (SCB-Net): An Approach for Field Data-Constrained Predictions with\n Uncertainty Evaluation","summary":" Geological maps are an extremely valuable source of information for the Earth\nsciences. They provide insights into mineral exploration, vulnerability to\nnatural hazards, and many other applications. These maps are created using\nnumerical or conceptual models that use geological observations to extrapolate\ndata. Geostatistical techniques have traditionally been used to generate\nreliable predictions that take into account the spatial patterns inherent in\nthe data. However, as the number of auxiliary variables increases, these\nmethods become more labor-intensive. Additionally, traditional machine learning\nmethods often struggle with spatially correlated data and extracting valuable\nnon-linear information from geoscientific datasets. To address these\nlimitations, a new architecture called the Spatially Constrained Bayesian\nNetwork (SCB-Net) has been developed. The SCB-Net aims to effectively exploit\nthe information from auxiliary variables while producing spatially constrained\npredictions. It is made up of two parts, the first part focuses on learning\nunderlying patterns in the auxiliary variables while the second part integrates\nground-truth data and the learned embeddings from the first part. 
Moreover, to\nassess model uncertainty, a technique called Monte Carlo dropout is used as a\nBayesian approximation. The SCB-Net has been applied to two selected areas in\nnorthern Quebec, Canada, and has demonstrated its potential in generating\nfield-data-constrained lithological maps while allowing assessment of\nprediction uncertainty for decision-making. This study highlights the promising\nadvancements of deep neural networks in geostatistics, particularly in handling\ncomplex spatial feature learning tasks, leading to improved spatial information\ntechniques.\n","authors":["Victor Silva dos Santos","Erwan Gloaguen","Shiva Tirdad"],"pdf_url":"https://arxiv.org/pdf/2403.20195v1.pdf","comment":"17 pages, 3559 words, 14 figures"},{"id":"http://arxiv.org/abs/2403.20193v1","updated":"2024-03-29T14:14:22Z","published":"2024-03-29T14:14:22Z","title":"Motion Inversion for Video Customization","summary":" In this research, we present a novel approach to motion customization in\nvideo generation, addressing the widespread gap in the thorough exploration of\nmotion representation within video generative models. Recognizing the unique\nchallenges posed by video's spatiotemporal nature, our method introduces Motion\nEmbeddings, a set of explicit, temporally coherent one-dimensional embeddings\nderived from a given video. These embeddings are designed to integrate\nseamlessly with the temporal transformer modules of video diffusion models,\nmodulating self-attention computations across frames without compromising\nspatial integrity. Our approach offers a compact and efficient solution to\nmotion representation and enables complex manipulations of motion\ncharacteristics through vector arithmetic in the embedding space. Furthermore,\nwe identify the Temporal Discrepancy in video generative models, which refers\nto variations in how different motion modules process temporal relationships\nbetween frames. We leverage this understanding to optimize the integration of\nour motion embeddings. Our contributions include the introduction of a tailored\nmotion embedding for customization tasks, insights into the temporal processing\ndifferences in video models, and a demonstration of the practical advantages\nand effectiveness of our method through extensive experiments.\n","authors":["Luozhou Wang","Guibao Shen","Yixun Liang","Xin Tao","Pengfei Wan","Di Zhang","Yijun Li","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2403.20193v1.pdf","comment":"Project Page:\n \\href{https://wileewang.github.io/MotionInversion/}{https://wileewang.github.io/MotionInversion/}"},{"id":"http://arxiv.org/abs/2403.20186v1","updated":"2024-03-29T14:04:45Z","published":"2024-03-29T14:04:45Z","title":"Sketch-to-Architecture: Generative AI-aided Architectural Design","summary":" Recently, the development of large-scale models has paved the way for various\ninterdisciplinary research, including architecture. By using generative AI, we\npresent a novel workflow that utilizes AI models to generate conceptual\nfloorplans and 3D models from simple sketches, enabling rapid ideation and\ncontrolled generation of architectural renderings based on textual\ndescriptions. Our work demonstrates the potential of generative AI in the\narchitectural design process, pointing towards a new direction of\ncomputer-aided architectural design. 
Our project website is available at:\nhttps://zrealli.github.io/sketch2arc\n","authors":["Pengzhi Li","Baijuan Li","Zhiheng Li"],"pdf_url":"https://arxiv.org/pdf/2403.20186v1.pdf","comment":"Pacific Graphics 2023, accepted as Poster"},{"id":"http://arxiv.org/abs/2403.20183v1","updated":"2024-03-29T13:57:46Z","published":"2024-03-29T13:57:46Z","title":"HARMamba: Efficient Wearable Sensor Human Activity Recognition Based on\n Bidirectional Selective SSM","summary":" Wearable sensor human activity recognition (HAR) is a crucial area of\nresearch in activity sensing. While transformer-based temporal deep learning\nmodels have been extensively studied and implemented, their large number of\nparameters present significant challenges in terms of system computing load and\nmemory usage, rendering them unsuitable for real-time mobile activity\nrecognition applications. Recently, an efficient hardware-aware state space\nmodel (SSM) called Mamba has emerged as a promising alternative. Mamba\ndemonstrates strong potential in long sequence modeling, boasts a simpler\nnetwork architecture, and offers an efficient hardware-aware design. Leveraging\nSSM for activity recognition represents an appealing avenue for exploration. In\nthis study, we introduce HARMamba, which employs a more lightweight selective\nSSM as the foundational model architecture for activity recognition. The goal\nis to address the computational resource constraints encountered in real-time\nactivity recognition scenarios. Our approach involves processing sensor data\nflow by independently learning each channel and segmenting the data into\n\"patches\". The marked sensor sequence's position embedding serves as the input\ntoken for the bidirectional state space model, ultimately leading to activity\ncategorization through the classification head. Compared to established\nactivity recognition frameworks like Transformer-based models, HARMamba\nachieves superior performance while also reducing computational and memory\noverhead. Furthermore, our proposed method has been extensively tested on four\npublic activity datasets: PAMAP2, WISDM, UNIMIB, and UCI, demonstrating\nimpressive performance in activity recognition tasks.\n","authors":["Shuangjian Li","Tao Zhu","Furong Duan","Liming Chen","Huansheng Ning","Yaping Wan"],"pdf_url":"https://arxiv.org/pdf/2403.20183v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01705v2","updated":"2024-03-29T13:53:33Z","published":"2023-04-04T11:01:46Z","title":"Cross-modal tumor segmentation using generative blending augmentation\n and self training","summary":" \\textit{Objectives}: Data scarcity and domain shifts lead to biased training\nsets that do not accurately represent deployment conditions. A related\npractical problem is cross-modal image segmentation, where the objective is to\nsegment unlabelled images using previously labelled datasets from other imaging\nmodalities. \\textit{Methods}: We propose a cross-modal segmentation method\nbased on conventional image synthesis boosted by a new data augmentation\ntechnique called Generative Blending Augmentation (GBA). GBA leverages a SinGAN\nmodel to learn representative generative features from a single training image\nto diversify realistically tumor appearances. This way, we compensate for image\nsynthesis errors, subsequently improving the generalization power of a\ndownstream segmentation model. 
The proposed augmentation is further combined with\nan iterative self-training procedure leveraging pseudo labels at each pass.\n\\textit{Results}: The proposed solution ranked first for vestibular schwannoma\n(VS) segmentation during the validation and test phases of the MICCAI CrossMoDA\n2022 challenge, with best mean Dice similarity and average symmetric surface\ndistance measures. \\textit{Conclusion and significance}: Local contrast\nalteration of tumor appearances and iterative self-training with pseudo labels\nare likely to lead to performance improvements in a variety of segmentation\ncontexts.\n","authors":["Guillaume Sallé","Pierre-Henri Conze","Julien Bert","Nicolas Boussion","Dimitris Visvikis","Vincent Jaouen"],"pdf_url":"https://arxiv.org/pdf/2304.01705v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20173v1","updated":"2024-03-29T13:40:44Z","published":"2024-03-29T13:40:44Z","title":"MCNet: A crowd density estimation network based on integrating\n multiscale attention module","summary":" Since the metro video surveillance system has not been able to\neffectively solve the metro crowd density estimation problem, a Metro Crowd\ndensity estimation Network (called MCNet) is proposed to automatically classify\nthe crowd density level of passengers. Firstly, an Integrating Multi-scale\nAttention (IMA) module is proposed to enhance the ability of the plain\nclassifiers to extract semantic crowd texture features, accommodating the\ncharacteristics of crowd texture features. The innovation of the IMA module\nis to fuse dilated convolution, multiscale feature extraction and an\nattention mechanism to obtain multi-scale crowd feature activation from a\nlarger receptive field with lower computational cost, and to strengthen the\ncrowd activation state of convolutional features in top layers. Secondly, a\nnovel lightweight crowd texture feature extraction network is proposed, which\ncan directly process video frames and automatically extract texture features\nfor crowd density estimation, while its faster image processing speed and fewer\nnetwork parameters make it flexible to be deployed on embedded platforms with\nlimited hardware resources. Finally, this paper integrates the IMA module and the\nlightweight crowd texture feature extraction network to construct MCNet,\nand validates the feasibility of this network on the image classification dataset\nCifar10 and four crowd density datasets (PETS2009, Mall, QUT and SH_METRO) to\nassess whether MCNet can be a suitable solution for crowd density\nestimation in metro video surveillance, where there are image processing\nchallenges such as high density, high occlusion, perspective distortion and\nlimited hardware resources.\n","authors":["Qiang Guo","Rubo Zhang","Di Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.20173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20168v1","updated":"2024-03-29T13:35:37Z","published":"2024-03-29T13:35:37Z","title":"Unsupervised Tumor-Aware Distillation for Multi-Modal Brain Image\n Translation","summary":" Multi-modal brain images from MRI scans are widely used in clinical diagnosis\nto provide complementary information from different modalities. However,\nobtaining fully paired multi-modal images in practice is challenging due to\nvarious factors, such as time, cost, and artifacts, resulting in\nmodality-missing brain images. To address this problem, unsupervised\nmulti-modal brain image translation has been extensively studied. 
Existing\nmethods suffer from the problem of brain tumor deformation during translation,\nas they fail to focus on the tumor areas when translating the whole images. In\nthis paper, we propose an unsupervised tumor-aware distillation teacher-student\nnetwork called UTAD-Net, which is capable of perceiving and translating tumor\nareas precisely. Specifically, our model consists of two parts: a teacher\nnetwork and a student network. The teacher network learns an end-to-end mapping\nfrom source to target modality using unpaired images and corresponding tumor\nmasks first. Then, the translation knowledge is distilled into the student\nnetwork, enabling it to generate more realistic tumor areas and whole images\nwithout masks. Experiments show that our model achieves competitive performance\non both quantitative and qualitative evaluations of image quality compared with\nstate-of-the-art methods. Furthermore, we demonstrate the effectiveness of the\ngenerated images on downstream segmentation tasks. Our code is available at\nhttps://github.com/scut-HC/UTAD-Net.\n","authors":["Chuan Huang","Jia Wei","Rui Li"],"pdf_url":"https://arxiv.org/pdf/2403.20168v1.pdf","comment":"8 pages, 5 figures. It has been provisionally accepted for IJCNN 2024"},{"id":"http://arxiv.org/abs/2304.00746v4","updated":"2024-03-29T13:32:53Z","published":"2023-04-03T06:40:52Z","title":"VGTS: Visually Guided Text Spotting for Novel Categories in Historical\n Manuscripts","summary":" In the field of historical manuscript research, scholars frequently encounter\nnovel symbols in ancient texts, investing considerable effort in their\nidentification and documentation. Although existing object detection methods\nachieve impressive performance on known categories, they struggle to recognize\nnovel symbols without retraining. To address this limitation, we propose a\nVisually Guided Text Spotting (VGTS) approach that accurately spots novel\ncharacters using just one annotated support sample. The core of VGTS is a\nspatial alignment module consisting of a Dual Spatial Attention (DSA) block and\na Geometric Matching (GM) block. The DSA block aims to identify, focus on, and\nlearn discriminative spatial regions in the support and query images, mimicking\nthe human visual spotting process. It first refines the support image by\nanalyzing inter-channel relationships to identify critical areas, and then\nrefines the query image by focusing on informative key points. The GM block, on\nthe other hand, establishes the spatial correspondence between the two images,\nenabling accurate localization of the target character in the query image. To\ntackle the example imbalance problem in low-resource spotting tasks, we develop\na novel torus loss function that enhances the discriminative power of the\nembedding space for distance metric learning. To further validate our approach,\nwe introduce a new dataset featuring ancient Dongba hieroglyphics (DBH)\nassociated with the Naxi minority of China. Extensive experiments on the DBH\ndataset and other public datasets, including EGY, VML-HD, TKH, and NC, show\nthat VGTS consistently surpasses state-of-the-art methods. 
The proposed\nframework exhibits great potential for application in historical manuscript\ntext spotting, enabling scholars to efficiently identify and document novel\nsymbols with minimal annotation effort.\n","authors":["Wenbo Hu","Hongjian Zhan","Xinchen Ma","Cong Liu","Bing Yin","Yue Lu"],"pdf_url":"https://arxiv.org/pdf/2304.00746v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20159v1","updated":"2024-03-29T13:16:05Z","published":"2024-03-29T13:16:05Z","title":"HGS-Mapping: Online Dense Mapping Using Hybrid Gaussian Representation\n in Urban Scenes","summary":" Online dense mapping of urban scenes forms a fundamental cornerstone for\nscene understanding and navigation of autonomous vehicles. Recent advancements\nin mapping methods are mainly based on NeRF, whose rendering speed is too slow\nto meet online requirements. 3D Gaussian Splatting (3DGS), with its rendering\nspeed hundreds of times faster than NeRF, holds greater potential in online\ndense mapping. However, integrating 3DGS into a street-view dense mapping\nframework still faces two challenges, including incomplete reconstruction due\nto the absence of geometric information beyond the LiDAR coverage area and\nextensive computation for reconstruction in large urban scenes. To this end, we\npropose HGS-Mapping, an online dense mapping framework for unbounded large-scale\nscenes. To attain complete reconstruction, our framework introduces Hybrid\nGaussian Representation, which models different parts of the entire scene using\nGaussians with distinct properties. Furthermore, we employ a hybrid Gaussian\ninitialization mechanism and an adaptive update method to achieve high-fidelity\nand rapid reconstruction. To the best of our knowledge, we are the first to\nintegrate Gaussian representation into online dense mapping of urban scenes.\nOur approach achieves SOTA reconstruction accuracy while employing only 66% of the\nnumber of Gaussians, leading to 20% faster reconstruction speed.\n","authors":["Ke Wu","Kaizhao Zhang","Zhiwei Zhang","Shanshuai Yuan","Muer Tie","Julong Wei","Zijun Xu","Jieru Zhao","Zhongxue Gan","Wenchao Ding"],"pdf_url":"https://arxiv.org/pdf/2403.20159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00274v2","updated":"2024-03-29T13:14:59Z","published":"2024-03-01T04:31:56Z","title":"CustomListener: Text-guided Responsive Interaction for User-friendly\n Listening Head Generation","summary":" Listening head generation aims to synthesize a non-verbal responsive listener\nhead by modeling the correlation between the speaker and the listener in\ndynamic conversation. The applications of listener agent generation in virtual\ninteraction have promoted many works achieving diverse and fine-grained\nmotion generation. However, they can only manipulate motions through simple\nemotional labels, and cannot freely control the listener's motions. Since\nlistener agents should have human-like attributes (e.g. identity, personality)\nwhich can be freely customized by users, this limits their realism. In this\npaper, we propose a user-friendly framework called CustomListener to realize\nfree-form text prior guided listener generation. To achieve\nspeaker-listener coordination, we design a Static to Dynamic Portrait module\n(SDP), which interacts with speaker information to transform static text into\na dynamic portrait token with completion rhythm and amplitude information. 
To\nachieve coherence between segments, we design a Past Guided Generation Module\n(PGG) to maintain the consistency of customized listener attributes through the\nmotion prior, and utilize a diffusion-based structure conditioned on the\nportrait token and the motion prior to realize the controllable generation. To\ntrain and evaluate our model, we have constructed two text-annotated listening\nhead datasets based on ViCo and RealTalk, which provide text-video paired\nlabels. Extensive experiments have verified the effectiveness of our model.\n","authors":["Xi Liu","Ying Guo","Cheng Zhen","Tong Li","Yingying Ao","Pengfei Yan"],"pdf_url":"https://arxiv.org/pdf/2403.00274v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2401.06312v4","updated":"2024-03-29T13:10:56Z","published":"2024-01-12T00:49:49Z","title":"Video Super-Resolution Transformer with Masked Inter&Intra-Frame\n Attention","summary":" Recently, Vision Transformer has achieved great success in recovering missing\ndetails in low-resolution sequences, i.e., the video super-resolution (VSR)\ntask. Despite its superiority in VSR accuracy, the heavy computational burden\nas well as the large memory footprint hinder the deployment of\nTransformer-based VSR models on constrained devices. In this paper, we address\nthe above issue by proposing a novel feature-level masked processing framework:\nVSR with Masked Intra and inter frame Attention (MIA-VSR). The core of MIA-VSR\nis leveraging feature-level temporal continuity between adjacent frames to\nreduce redundant computations and make more rational use of previously enhanced\nSR features. Concretely, we propose an intra-frame and inter-frame attention\nblock which takes the respective roles of past features and input features into\nconsideration and only exploits previously enhanced features to provide\nsupplementary information. In addition, an adaptive block-wise mask prediction\nmodule is developed to skip unimportant computations according to feature\nsimilarity between adjacent frames. We conduct detailed ablation studies to\nvalidate our contributions and compare the proposed method with recent\nstate-of-the-art VSR approaches. The experimental results demonstrate that\nMIA-VSR improves the memory and computation efficiency over state-of-the-art\nmethods, without trading off PSNR accuracy. The code is available at\nhttps://github.com/LabShuHangGU/MIA-VSR.\n","authors":["Xingyu Zhou","Leheng Zhang","Xiaorui Zhao","Keze Wang","Leida Li","Shuhang Gu"],"pdf_url":"https://arxiv.org/pdf/2401.06312v4.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2307.09591v2","updated":"2024-03-29T13:04:03Z","published":"2023-07-18T19:56:20Z","title":"Gradient strikes back: How filtering out high frequencies improves\n explanations","summary":" Attribution methods correspond to a class of explainability methods (XAI)\nthat aim to assess how individual inputs contribute to a model's\ndecision-making process. We have identified a significant limitation in one\ntype of attribution methods, known as \"white-box\" methods. Although highly\nefficient, these methods rely on a gradient signal that is often contaminated\nby high-frequency noise. To overcome this limitation, we introduce a new\napproach called \"FORGrad\". This simple method effectively filters out noise\nartifacts by using optimal cut-off frequencies tailored to the unique\ncharacteristics of each model architecture. 
Our findings show that FORGrad\nconsistently enhances the performance of already existing white-box methods,\nenabling them to compete effectively with more accurate yet computationally\ndemanding \"black-box\" methods. We anticipate that our research will foster\nbroader adoption of simpler and more efficient white-box methods for\nexplainability, offering a better balance between faithfulness and\ncomputational efficiency.\n","authors":["Sabine Muzellec","Thomas Fel","Victor Boutin","Léo andéol","Rufin VanRullen","Thomas Serre"],"pdf_url":"https://arxiv.org/pdf/2307.09591v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08396v5","updated":"2024-03-29T12:50:38Z","published":"2023-05-15T07:23:54Z","title":"MaxViT-UNet: Multi-Axis Attention for Medical Image Segmentation","summary":" Since their emergence, Convolutional Neural Networks (CNNs) have made\nsignificant strides in medical image analysis. However, the local nature of the\nconvolution operator may pose a limitation for capturing global and long-range\ninteractions in CNNs. Recently, Transformers have gained popularity in the\ncomputer vision community and also in medical image segmentation due to their\nability to process global features effectively. The scalability issues of the\nself-attention mechanism and lack of the CNN-like inductive bias may have\nlimited their adoption. Therefore, hybrid Vision transformers\n(CNN-Transformer), exploiting the advantages of both Convolution and\nSelf-attention Mechanisms, have gained importance. In this work, we present\nMaxViT-UNet, a new Encoder-Decoder based UNet type hybrid vision transformer\n(CNN-Transformer) for medical image segmentation. The proposed Hybrid Decoder\nis designed to harness the power of both the convolution and self-attention\nmechanisms at each decoding stage with a nominal memory and computational\nburden. The inclusion of multi-axis self-attention, within each decoder stage,\nsignificantly enhances the discriminating capacity between the object and\nbackground regions, thereby helping in improving the segmentation efficiency.\nIn the Hybrid Decoder, a new block is also proposed. The fusion process\ncommences by integrating the upsampled lower-level decoder features, obtained\nthrough transpose convolution, with the skip-connection features derived from\nthe hybrid encoder. Subsequently, the fused features undergo refinement through\nthe utilization of a multi-axis attention mechanism. The proposed decoder block\nis repeated multiple times to segment the nuclei regions progressively.\nExperimental results on MoNuSeg18 and MoNuSAC20 datasets demonstrate the\neffectiveness of the proposed technique.\n","authors":["Abdul Rehman Khan","Asifullah Khan"],"pdf_url":"https://arxiv.org/pdf/2305.08396v5.pdf","comment":"19 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.20153v1","updated":"2024-03-29T12:49:40Z","published":"2024-03-29T12:49:40Z","title":"Talk3D: High-Fidelity Talking Portrait Synthesis via Personalized 3D\n Generative Prior","summary":" Recent methods for audio-driven talking head synthesis often optimize neural\nradiance fields (NeRF) on a monocular talking portrait video, leveraging its\ncapability to render high-fidelity and 3D-consistent novel-view frames.\nHowever, they often struggle to reconstruct complete face geometry due to the\nabsence of comprehensive 3D information in the input monocular videos. 
In this\npaper, we introduce a novel audio-driven talking head synthesis framework,\ncalled Talk3D, that can faithfully reconstruct plausible facial geometries\nby effectively adopting a pre-trained 3D-aware generative prior. Given the\npersonalized 3D generative model, we present a novel audio-guided attention\nU-Net architecture that predicts the dynamic face variations in the NeRF space\ndriven by audio. Furthermore, our model is modulated by audio-unrelated\nconditioning tokens which effectively disentangle variations unrelated to audio\nfeatures. Compared to existing methods, our method excels in generating\nrealistic facial geometries even under extreme head poses. We also conduct\nextensive experiments showing our approach surpasses state-of-the-art\nbenchmarks in terms of both quantitative and qualitative evaluations.\n","authors":["Jaehoon Ko","Kyusun Cho","Joungbin Lee","Heeji Yoon","Sangmin Lee","Sangjun Ahn","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2403.20153v1.pdf","comment":"Project page: https://ku-cvlab.github.io/Talk3D/"},{"id":"http://arxiv.org/abs/2403.12687v2","updated":"2024-03-29T12:45:27Z","published":"2024-03-19T12:45:52Z","title":"Audio-Visual Compound Expression Recognition Method based on Late\n Modality Fusion and Rule-based Decision","summary":" This paper presents the results of the SUN team for the Compound Expressions\nRecognition Challenge of the 6th ABAW Competition. We propose a novel\naudio-visual method for compound expression recognition. Our method relies on\nemotion recognition models that fuse modalities at the emotion probability\nlevel, while decisions regarding the prediction of compound expressions are\nbased on predefined rules. Notably, our method does not use any training data\nspecific to the target task. Thus, the problem is a zero-shot classification\ntask. The method is evaluated in multi-corpus training and cross-corpus\nvalidation setups. Using the proposed method, an F1-score of\n22.01% is achieved on the C-EXPR-DB test subset. Our findings from the challenge\ndemonstrate that the proposed method can potentially form a basis for\ndeveloping intelligent tools for annotating audio-visual data in the context of\nhumans' basic and compound emotions.\n","authors":["Elena Ryumina","Maxim Markitantov","Dmitry Ryumin","Heysem Kaya","Alexey Karpov"],"pdf_url":"https://arxiv.org/pdf/2403.12687v2.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2402.11677v2","updated":"2024-03-29T12:34:34Z","published":"2024-02-18T18:56:13Z","title":"MultiCorrupt: A Multi-Modal Robustness Dataset and Benchmark of\n LiDAR-Camera Fusion for 3D Object Detection","summary":" Multi-modal 3D object detection models for automated driving have\ndemonstrated exceptional performance on computer vision benchmarks like\nnuScenes. However, their reliance on densely sampled LiDAR point clouds and\nmeticulously calibrated sensor arrays poses challenges for real-world\napplications. Issues such as sensor misalignment, miscalibration, and disparate\nsampling frequencies lead to spatial and temporal misalignment in data from\nLiDAR and cameras. Additionally, the integrity of LiDAR and camera data is\noften compromised by adverse environmental conditions such as inclement\nweather, leading to occlusions and noise interference. To address this\nchallenge, we introduce MultiCorrupt, a comprehensive benchmark designed to\nevaluate the robustness of multi-modal 3D object detectors against ten distinct\ntypes of corruptions. 
We evaluate five state-of-the-art multi-modal detectors\non MultiCorrupt and analyze their performance in terms of their resistance\nability. Our results show that existing methods exhibit varying degrees of\nrobustness depending on the type of corruption and their fusion strategy. We\nprovide insights into which multi-modal design choices make such models robust\nagainst certain perturbations. The dataset generation code and benchmark are\nopen-sourced at https://github.com/ika-rwth-aachen/MultiCorrupt.\n","authors":["Till Beemelmanns","Quan Zhang","Lutz Eckstein"],"pdf_url":"https://arxiv.org/pdf/2402.11677v2.pdf","comment":"Code: https://github.com/ika-rwth-aachen/MultiCorrupt"},{"id":"http://arxiv.org/abs/2311.15851v3","updated":"2024-03-29T12:25:45Z","published":"2023-11-27T14:17:41Z","title":"Single-Model and Any-Modality for Video Object Tracking","summary":" In the realm of video object tracking, auxiliary modalities such as depth,\nthermal, or event data have emerged as valuable assets to complement the RGB\ntrackers. In practice, most existing RGB trackers learn a single set of\nparameters to use them across datasets and applications. However, a similar\nsingle-model unification for multi-modality tracking presents several\nchallenges. These challenges stem from the inherent heterogeneity of inputs --\neach with modality-specific representations, the scarcity of multi-modal\ndatasets, and the absence of all the modalities at all times. In this work, we\nintroduce Un-Track, a Unified Tracker of a single set of parameters for any\nmodality. To handle any modality, our method learns their common latent space\nthrough low-rank factorization and reconstruction techniques. More importantly,\nwe use only the RGB-X pairs to learn the common latent space. This unique\nshared representation seamlessly binds all modalities together, enabling\neffective unification and accommodating any missing modality, all within a\nsingle transformer-based architecture. Our Un-Track achieves +8.1 absolute\nF-score gain, on the DepthTrack dataset, by introducing only +2.14 (over 21.50)\nGFLOPs with +6.6M (over 93M) parameters, through a simple yet efficient\nprompting strategy. Extensive comparisons on five benchmark datasets with\ndifferent modalities show that Un-Track surpasses both SOTA unified trackers\nand modality-specific counterparts, validating our effectiveness and\npracticality. The source code is publicly available at\nhttps://github.com/Zongwei97/UnTrack.\n","authors":["Zongwei Wu","Jilai Zheng","Xiangxuan Ren","Florin-Alexandru Vasluianu","Chao Ma","Danda Pani Paudel","Luc Van Gool","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2311.15851v3.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.20142v1","updated":"2024-03-29T12:23:58Z","published":"2024-03-29T12:23:58Z","title":"StegoGAN: Leveraging Steganography for Non-Bijective Image-to-Image\n Translation","summary":" Most image-to-image translation models postulate that a unique correspondence\nexists between the semantic classes of the source and target domains. However,\nthis assumption does not always hold in real-world scenarios due to divergent\ndistributions, different class sets, and asymmetrical information\nrepresentation. As conventional GANs attempt to generate images that match the\ndistribution of the target domain, they may hallucinate spurious instances of\nclasses absent from the source domain, thereby diminishing the usefulness and\nreliability of translated images. 
CycleGAN-based methods are also known to hide\nthe mismatched information in the generated images to bypass cycle consistency\nobjectives, a process known as steganography. In response to the challenge of\nnon-bijective image translation, we introduce StegoGAN, a novel model that\nleverages steganography to prevent spurious features in generated images. Our\napproach enhances the semantic consistency of the translated images without\nrequiring additional postprocessing or supervision. Our experimental\nevaluations demonstrate that StegoGAN outperforms existing GAN-based models\nacross various non-bijective image-to-image translation tasks, both\nqualitatively and quantitatively. Our code and pretrained models are accessible\nat https://github.com/sian-wusidi/StegoGAN.\n","authors":["Sidi Wu","Yizi Chen","Samuel Mermet","Lorenz Hurni","Konrad Schindler","Nicolas Gonthier","Loic Landrieu"],"pdf_url":"https://arxiv.org/pdf/2403.20142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00320v2","updated":"2024-03-29T12:04:52Z","published":"2023-12-30T20:52:20Z","title":"DXAI: Explaining Classification by Image Decomposition","summary":" We propose a new way to explain and to visualize neural network\nclassification through decomposition-based explainable AI (DXAI). Instead of\nproviding an explanation heatmap, our method yields a decomposition of the\nimage into class-agnostic and class-distinct parts, with respect to the data\nand chosen classifier. Following a fundamental signal processing paradigm of\nanalysis and synthesis, the original image is the sum of the decomposed parts.\nWe thus obtain a radically different way of explaining classification. The\nclass-agnostic part is ideally composed of all image features which do not\npossess class information, while the class-distinct part is its complement.\nThis new visualization can be more helpful and informative in certain\nscenarios, especially when the attributes are dense, global and additive in\nnature, for instance, when colors or textures are essential for class\ndistinction. Code is available at https://github.com/dxai2024/dxai.\n","authors":["Elnatan Kadar","Guy Gilboa"],"pdf_url":"https://arxiv.org/pdf/2401.00320v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20126v1","updated":"2024-03-29T11:31:12Z","published":"2024-03-29T11:31:12Z","title":"ECLIPSE: Efficient Continual Learning in Panoptic Segmentation with\n Visual Prompt Tuning","summary":" Panoptic segmentation, combining semantic and instance segmentation, stands\nas a cutting-edge computer vision task. Despite recent progress with deep\nlearning models, the dynamic nature of real-world applications necessitates\ncontinual learning, where models adapt to new classes (plasticity) over time\nwithout forgetting old ones (catastrophic forgetting). Current continual\nsegmentation methods often rely on distillation strategies like knowledge\ndistillation and pseudo-labeling, which are effective but result in increased\ntraining complexity and computational overhead. In this paper, we introduce a\nnovel and efficient method for continual panoptic segmentation based on Visual\nPrompt Tuning, dubbed ECLIPSE. Our approach involves freezing the base model\nparameters and fine-tuning only a small set of prompt embeddings, addressing\nboth catastrophic forgetting and plasticity and significantly reducing the\ntrainable parameters. 
To mitigate inherent challenges such as error propagation\nand semantic drift in continual segmentation, we propose logit manipulation to\neffectively leverage common knowledge across the classes. Experiments on ADE20K\ncontinual panoptic segmentation benchmark demonstrate the superiority of\nECLIPSE, notably its robustness against catastrophic forgetting and its\nreasonable plasticity, achieving a new state-of-the-art. The code is available\nat https://github.com/clovaai/ECLIPSE.\n","authors":["Beomyoung Kim","Joonsang Yu","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.20126v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.00648v2","updated":"2024-03-29T11:30:20Z","published":"2023-12-01T15:20:58Z","title":"SPOT: Self-Training with Patch-Order Permutation for Object-Centric\n Learning with Autoregressive Transformers","summary":" Unsupervised object-centric learning aims to decompose scenes into\ninterpretable object entities, termed slots. Slot-based auto-encoders stand out\nas a prominent method for this task. Within them, crucial aspects include\nguiding the encoder to generate object-specific slots and ensuring the decoder\nutilizes them during reconstruction. This work introduces two novel techniques,\n(i) an attention-based self-training approach, which distills superior\nslot-based attention masks from the decoder to the encoder, enhancing object\nsegmentation, and (ii) an innovative patch-order permutation strategy for\nautoregressive transformers that strengthens the role of slot vectors in\nreconstruction. The effectiveness of these strategies is showcased\nexperimentally. The combined approach significantly surpasses prior slot-based\nautoencoder methods in unsupervised object segmentation, especially with\ncomplex real-world images. We provide the implementation code at\nhttps://github.com/gkakogeorgiou/spot .\n","authors":["Ioannis Kakogeorgiou","Spyros Gidaris","Konstantinos Karantzalos","Nikos Komodakis"],"pdf_url":"https://arxiv.org/pdf/2312.00648v2.pdf","comment":"CVPR 2024. Code: https://github.com/gkakogeorgiou/spot"},{"id":"http://arxiv.org/abs/2401.00616v3","updated":"2024-03-29T11:27:32Z","published":"2024-01-01T00:08:39Z","title":"GD^2-NeRF: Generative Detail Compensation via GAN and Diffusion for\n One-shot Generalizable Neural Radiance Fields","summary":" In this paper, we focus on the One-shot Novel View Synthesis (O-NVS) task\nwhich targets synthesizing photo-realistic novel views given only one reference\nimage per scene. Previous One-shot Generalizable Neural Radiance Fields\n(OG-NeRF) methods solve this task in an inference-time finetuning-free manner,\nyet suffer the blurry issue due to the encoder-only architecture that highly\nrelies on the limited reference image. On the other hand, recent\ndiffusion-based image-to-3d methods show vivid plausible results via distilling\npre-trained 2D diffusion models into a 3D representation, yet require tedious\nper-scene optimization. Targeting these issues, we propose the GD$^2$-NeRF, a\nGenerative Detail compensation framework via GAN and Diffusion that is both\ninference-time finetuning-free and with vivid plausible details. In detail,\nfollowing a coarse-to-fine strategy, GD$^2$-NeRF is mainly composed of a\nOne-stage Parallel Pipeline (OPP) and a 3D-consistent Detail Enhancer\n(Diff3DE). 
At the coarse stage, OPP first efficiently inserts the GAN model\ninto the existing OG-NeRF pipeline to primarily relieve the blurry issue\nwith in-distribution priors captured from the training dataset, achieving a\ngood balance between sharpness (LPIPS, FID) and fidelity (PSNR, SSIM). Then, at\nthe fine stage, Diff3DE further leverages the pre-trained image diffusion\nmodels to complement rich out-of-distribution details while maintaining decent 3D\nconsistency. Extensive experiments on both the synthetic and real-world\ndatasets show that GD$^2$-NeRF noticeably improves the details without\nper-scene finetuning.\n","authors":["Xiao Pan","Zongxin Yang","Shuai Bai","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2401.00616v3.pdf","comment":"Submitted to Journal"},{"id":"http://arxiv.org/abs/2403.20112v1","updated":"2024-03-29T10:49:02Z","published":"2024-03-29T10:49:02Z","title":"Segmentation, Classification and Interpretation of Breast Cancer Medical\n Images using Human-in-the-Loop Machine Learning","summary":" This paper explores the application of Human-in-the-Loop (HITL) strategies in\ntraining machine learning models in the medical domain. In this case, a\ndoctor-in-the-loop approach is proposed to leverage human expertise in dealing\nwith large and complex data. Specifically, the paper deals with the integration\nof genomic data and Whole Slide Imaging (WSI) analysis of breast cancer. Three\ndifferent tasks were developed: segmentation of histopathological images,\nclassification of these images regarding the genomic subtype of the cancer and,\nfinally, interpretation of the machine learning results. The involvement of a\npathologist helped us to develop a better segmentation model and to enhance the\nexplanatory capabilities of the models, but the classification results were\nsuboptimal, highlighting the limitations of this approach: despite involving\nhuman experts, complex domains can still pose challenges, and a HITL approach\nmay not always be effective.\n","authors":["David Vázquez-Lema","Eduardo Mosqueira-Rey","Elena Hernández-Pereira","Carlos Fernández-Lozano","Fernando Seara-Romera","Jorge Pombo-Otero"],"pdf_url":"https://arxiv.org/pdf/2403.20112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20106v1","updated":"2024-03-29T10:40:41Z","published":"2024-03-29T10:40:41Z","title":"Aggregating Local and Global Features via Selective State Spaces Model\n for Efficient Image Deblurring","summary":" Image deblurring is the process of restoring a high-quality image from the\ncorresponding blurred image. Significant progress in this field has been made\npossible by the emergence of various effective deep learning models, including\nCNNs and Transformers. However, these methods often face a dilemma between\neliminating long-range blur degradation perturbations and maintaining\ncomputational efficiency, which hinders their practical application. To address\nthis issue, we propose an efficient image deblurring network that leverages a\nselective structured state space model to aggregate enriched and accurate\nfeatures. Specifically, we design an aggregate local and global block\n(ALGBlock) to capture and fuse both local invariant properties and non-local\ninformation. The ALGBlock consists of two blocks: (1) The local block models\nlocal connectivity using simplified channel attention. (2) The global block\ncaptures long-range dependency features with linear complexity through\nselective structured state spaces. 
Nevertheless, since image details\nare local features of images, we accentuate the local part for restoration by\nrecalibrating the weights when aggregating the two branches for recovery.\nExperimental results demonstrate that the proposed method outperforms\nstate-of-the-art approaches on widely used benchmarks, highlighting its\nsuperior performance.\n","authors":["Hu Gao","Depeng Dang"],"pdf_url":"https://arxiv.org/pdf/2403.20106v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20105v1","updated":"2024-03-29T10:38:25Z","published":"2024-03-29T10:38:25Z","title":"FreeSeg-Diff: Training-Free Open-Vocabulary Segmentation with Diffusion\n Models","summary":" Foundation models have exhibited unprecedented capabilities in tackling many\ndomains and tasks. Models such as CLIP are currently widely used to bridge\ncross-modal representations, and text-to-image diffusion models are arguably\nthe leading models in terms of realistic image generation. Image generative\nmodels are trained on massive datasets that provide them with powerful internal\nspatial representations. In this work, we explore the potential benefits of\nsuch representations, beyond image generation, in particular, for dense visual\nprediction tasks. We focus on the task of image segmentation, which is\ntraditionally solved by training models on closed-vocabulary datasets, with\npixel-level annotations. To avoid the annotation cost or training large\ndiffusion models, we constrain our setup to be zero-shot and training-free. In\na nutshell, our pipeline leverages different and relatively small-sized,\nopen-source foundation models for zero-shot open-vocabulary segmentation. The\npipeline is as follows: the image is passed to both a captioner model (i.e.,\nBLIP) and a diffusion model (i.e., Stable Diffusion Model) to generate a text\ndescription and visual representation, respectively. The features are clustered\nand binarized to obtain class-agnostic masks for each object. These masks are\nthen mapped to a textual class, using the CLIP model to support\nthe open-vocabulary setting. Finally, we add a refinement step that allows us to obtain a more\nprecise segmentation mask. Our approach (dubbed FreeSeg-Diff), which does not\nrely on any training, outperforms many training-based approaches on both the Pascal\nVOC and COCO datasets. In addition, we show very competitive results compared\nto recent weakly-supervised segmentation approaches. We provide\ncomprehensive experiments showing the superiority of diffusion model features\ncompared to other pretrained models. Project page:\nhttps://bcorrad.github.io/freesegdiff/\n","authors":["Barbara Toniella Corradini","Mustafa Shukor","Paul Couairon","Guillaume Couairon","Franco Scarselli","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2403.20105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20101v1","updated":"2024-03-29T10:31:32Z","published":"2024-03-29T10:31:32Z","title":"RealKIE: Five Novel Datasets for Enterprise Key Information Extraction","summary":" We introduce RealKIE, a benchmark of five challenging datasets aimed at\nadvancing key information extraction methods, with an emphasis on enterprise\napplications. The datasets cover a diverse range of documents, including SEC\nS1 Filings, US Non-disclosure Agreements, UK Charity Reports, FCC Invoices, and\nResource Contracts. Each presents unique challenges: poor text serialization,\nsparse annotations in long documents, and complex tabular layouts. 
These\ndatasets provide a realistic testing ground for key information extraction\ntasks like investment analysis and legal data processing.\n In addition to presenting these datasets, we offer an in-depth description of\nthe annotation process, document processing techniques, and baseline modeling\napproaches. This contribution facilitates the development of NLP models capable\nof handling practical challenges and supports further research into information\nextraction technologies applicable to industry-specific problems.\n The annotated data and OCR outputs are available to download at\nhttps://indicodatasolutions.github.io/RealKIE/ code to reproduce the baselines\nwill be available shortly.\n","authors":["Benjamin Townsend","Madison May","Christopher Wells"],"pdf_url":"https://arxiv.org/pdf/2403.20101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11600v2","updated":"2024-03-29T10:18:02Z","published":"2023-11-20T08:27:56Z","title":"Deep Equilibrium Diffusion Restoration with Parallel Sampling","summary":" Diffusion model-based image restoration (IR) aims to use diffusion models to\nrecover high-quality (HQ) images from degraded images, achieving promising\nperformance. Due to the inherent property of diffusion models, most existing\nmethods need long serial sampling chains to restore HQ images step-by-step,\nresulting in expensive sampling time and high computation costs. Moreover, such\nlong sampling chains hinder understanding the relationship between inputs and\nrestoration results since it is hard to compute the gradients in the whole\nchains. In this work, we aim to rethink the diffusion model-based IR models\nthrough a different perspective, i.e., a deep equilibrium (DEQ) fixed point\nsystem, called DeqIR. Specifically, we derive an analytical solution by\nmodeling the entire sampling chain in these IR models as a joint multivariate\nfixed point system. Based on the analytical solution, we can conduct parallel\nsampling and restore HQ images without training. Furthermore, we compute fast\ngradients via DEQ inversion and found that initialization optimization can\nboost image quality and control the generation direction. Extensive experiments\non benchmarks demonstrate the effectiveness of our method on typical IR tasks\nand real-world settings.\n","authors":["Jiezhang Cao","Yue Shi","Kai Zhang","Yulun Zhang","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2311.11600v2.pdf","comment":"CVPR'2024"},{"id":"http://arxiv.org/abs/2311.01025v2","updated":"2024-03-29T10:09:33Z","published":"2023-11-02T06:38:19Z","title":"Integrating Language-Derived Appearance Elements with Visual Cues in\n Pedestrian Detection","summary":" Large language models (LLMs) have shown their capabilities in understanding\ncontextual and semantic information regarding knowledge of instance\nappearances. In this paper, we introduce a novel approach to utilize the\nstrengths of LLMs in understanding contextual appearance variations and to\nleverage this knowledge into a vision model (here, pedestrian detection). While\npedestrian detection is considered one of the crucial tasks directly related to\nour safety (e.g., intelligent driving systems), it is challenging because of\nvarying appearances and poses in diverse scenes. Therefore, we propose to\nformulate language-derived appearance elements and incorporate them with visual\ncues in pedestrian detection. 
To this end, we establish a description corpus\nthat includes numerous narratives describing various appearances of pedestrians\nand other instances. By feeding them through an LLM, we extract appearance\nknowledge sets that contain the representations of appearance variations.\nSubsequently, we perform a task-prompting process to obtain appearance elements\nwhich are guided representative appearance knowledge relevant to a downstream\npedestrian detection task. The obtained knowledge elements are adaptable to\nvarious detection frameworks, so that we can provide plentiful appearance\ninformation by integrating the language-derived appearance elements with visual\ncues within a detector. Through comprehensive experiments with various\npedestrian detectors, we verify the adaptability and effectiveness of our\nmethod showing noticeable performance gains and achieving state-of-the-art\ndetection performance on two public pedestrian detection benchmarks (i.e.,\nCrowdHuman and WiderPedestrian).\n","authors":["Sungjune Park","Hyunjun Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2311.01025v2.pdf","comment":"11 pages, 5 figures, 5 tables"},{"id":"http://arxiv.org/abs/2312.03203v2","updated":"2024-03-29T10:09:30Z","published":"2023-12-06T00:46:30Z","title":"Feature 3DGS: Supercharging 3D Gaussian Splatting to Enable Distilled\n Feature Fields","summary":" 3D scene representations have gained immense popularity in recent years.\nMethods that use Neural Radiance fields are versatile for traditional tasks\nsuch as novel view synthesis. In recent times, some work has emerged that aims\nto extend the functionality of NeRF beyond view synthesis, for semantically\naware tasks such as editing and segmentation using 3D feature field\ndistillation from 2D foundation models. However, these methods have two major\nlimitations: (a) they are limited by the rendering speed of NeRF pipelines, and\n(b) implicitly represented feature fields suffer from continuity artifacts\nreducing feature quality. Recently, 3D Gaussian Splatting has shown\nstate-of-the-art performance on real-time radiance field rendering. In this\nwork, we go one step further: in addition to radiance field rendering, we\nenable 3D Gaussian splatting on arbitrary-dimension semantic features via 2D\nfoundation model distillation. This translation is not straightforward: naively\nincorporating feature fields in the 3DGS framework encounters significant\nchallenges, notably the disparities in spatial resolution and channel\nconsistency between RGB images and feature maps. We propose architectural and\ntraining changes to efficiently avert this problem. Our proposed method is\ngeneral, and our experiments showcase novel view semantic segmentation,\nlanguage-guided editing and segment anything through learning feature fields\nfrom state-of-the-art 2D foundation models such as SAM and CLIP-LSeg. Across\nexperiments, our distillation method is able to provide comparable or better\nresults, while being significantly faster to both train and render.\nAdditionally, to the best of our knowledge, we are the first method to enable\npoint and bounding-box prompting for radiance field manipulation, by leveraging\nthe SAM model. 
Project website at: https://feature-3dgs.github.io/\n","authors":["Shijie Zhou","Haoran Chang","Sicheng Jiang","Zhiwen Fan","Zehao Zhu","Dejia Xu","Pradyumna Chari","Suya You","Zhangyang Wang","Achuta Kadambi"],"pdf_url":"https://arxiv.org/pdf/2312.03203v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20092v1","updated":"2024-03-29T10:05:29Z","published":"2024-03-29T10:05:29Z","title":"Modeling Weather Uncertainty for Multi-weather Co-Presence Estimation","summary":" Images from outdoor scenes may be taken under various weather conditions. It\nis well studied that weather impacts the performance of computer vision\nalgorithms and needs to be handled properly. However, existing algorithms model\nweather conditions as a discrete status and estimate them using multi-label\nclassification. In fact, physically, and specifically in meteorology,\nweather is modeled as a continuous and transitional status. Instead of\ndirectly implementing hard classification as existing multi-weather\nclassification methods do, we consider the physical formulation of\nmulti-weather conditions and model the impact of physics-related parameters on\nlearning from the image appearance. In this paper, we start with a solid revisit\nof the physical definition of weather and how it can be described as a\ncontinuous machine learning and computer vision task. Namely, we propose to\nmodel the weather uncertainty, where the level of probability and co-existence\nof multiple weather conditions are both considered. A Gaussian mixture model is\nused to encapsulate the weather uncertainty and an uncertainty-aware\nmulti-weather learning scheme is proposed based on prior-posterior learning. A\nnovel multi-weather co-presence estimation transformer (MeFormer) is proposed.\nIn addition, a new multi-weather co-presence estimation (MePe) dataset, along\nwith 14 fine-grained weather categories and 16,078 samples, is proposed to\nbenchmark both the conventional multi-label weather classification task and the\nmulti-weather co-presence estimation task. Large-scale experiments show that\nthe proposed method achieves state-of-the-art performance and substantial\ngeneralization capabilities on both the conventional multi-label weather\nclassification task and the proposed multi-weather co-presence estimation task.\nBesides, modeling weather uncertainty also benefits adverse-weather semantic\nsegmentation.\n","authors":["Qi Bi","Shaodi You","Theo Gevers"],"pdf_url":"https://arxiv.org/pdf/2403.20092v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2309.13604v2","updated":"2024-03-29T09:59:34Z","published":"2023-09-24T10:48:20Z","title":"Distribution-Aware Continual Test-Time Adaptation for Semantic\n Segmentation","summary":" Since autonomous driving systems usually face dynamic and ever-changing\nenvironments, continual test-time adaptation (CTTA) has been proposed as a\nstrategy for transferring deployed models to continually changing target\ndomains. However, the pursuit of long-term adaptation often introduces\ncatastrophic forgetting and error accumulation problems, which impede the\npractical implementation of CTTA in the real world. Recently, existing CTTA\nmethods mainly focus on utilizing a majority of parameters to fit target domain\nknowledge through self-training. Unfortunately, these approaches often amplify\nthe challenge of error accumulation due to noisy pseudo-labels, and pose\npractical limitations stemming from the heavy computational costs associated\nwith entire model updates. 
In this paper, we propose a distribution-aware\ntuning (DAT) method to make the semantic segmentation CTTA efficient and\npractical in real-world applications. DAT adaptively selects and updates two\nsmall groups of trainable parameters based on data distribution during the\ncontinual adaptation process, including domain-specific parameters (DSP) and\ntask-relevant parameters (TRP). Specifically, DSP exhibits sensitivity to\noutputs with substantial distribution shifts, effectively mitigating the\nproblem of error accumulation. In contrast, TRP are allocated to positions that\nare responsive to outputs with minor distribution shifts, which are fine-tuned\nto avoid the catastrophic forgetting problem. In addition, since CTTA is a\ntemporal task, we introduce the Parameter Accumulation Update (PAU) strategy to\ncollect the updated DSP and TRP in target domain sequences. We conduct\nextensive experiments on two widely-used semantic segmentation CTTA benchmarks,\nachieving promising performance compared to previous state-of-the-art methods.\n","authors":["Jiayi Ni","Senqiao Yang","Ran Xu","Jiaming Liu","Xiaoqi Li","Wenyu Jiao","Zehui Chen","Yi Liu","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.13604v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20086v1","updated":"2024-03-29T09:46:14Z","published":"2024-03-29T09:46:14Z","title":"Selective Attention-based Modulation for Continual Learning","summary":" We present SAM, a biologically-plausible selective attention-driven\nmodulation approach to enhance classification models in a continual learning\nsetting. Inspired by neurophysiological evidence that the primary visual cortex\ndoes not contribute to object manifold untangling for categorization and that\nprimordial attention biases are still embedded in the modern brain, we propose\nto employ auxiliary saliency prediction features as a modulation signal to\ndrive and stabilize the learning of a sequence of non-i.i.d. classification\ntasks. Experimental results confirm that SAM effectively enhances the\nperformance (in some cases up to about twenty percent points) of\nstate-of-the-art continual learning methods, both in class-incremental and\ntask-incremental settings. Moreover, we show that attention-based modulation\nsuccessfully encourages the learning of features that are more robust to the\npresence of spurious features and to adversarial attacks than baseline methods.\nCode is available at: https://github.com/perceivelab/SAM.\n","authors":["Giovanni Bellitto","Federica Proietto Salanitri","Matteo Pennisi","Matteo Boschini","Angelo Porrello","Simone Calderara","Simone Palazzo","Concetto Spampinato"],"pdf_url":"https://arxiv.org/pdf/2403.20086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20080v1","updated":"2024-03-29T09:22:44Z","published":"2024-03-29T09:22:44Z","title":"Mixed-precision Supernet Training from Vision Foundation Models using\n Low Rank Adapter","summary":" Compression of large and performant vision foundation models (VFMs) into\narbitrary bit-wise operations (BitOPs) allows their deployment on various\nhardware. We propose to fine-tune a VFM to a mixed-precision quantized\nsupernet. The supernet-based neural architecture search (NAS) can be adopted\nfor this purpose, which trains a supernet, and then subnets within arbitrary\nhardware budgets can be extracted. However, existing methods face difficulties\nin optimizing the mixed-precision search space and incurring large memory costs\nduring training. 
To tackle these challenges, first, we study the effective\nsearch space design for fine-tuning a VFM by comparing different operators\n(such as resolution, feature size, width, depth, and bit-widths) in terms of\nperformance and BitOPs reduction. Second, we propose memory-efficient supernet\ntraining using a low-rank adapter (LoRA) and a progressive training strategy.\nThe proposed method is evaluated for the recently proposed VFM, Segment\nAnything Model, fine-tuned on segmentation tasks. The searched model yields\nabout a 95% reduction in BitOPs without incurring performance degradation.\n","authors":["Yuiko Sakuma","Masakazu Yoshimura","Junji Otsuka","Atsushi Irie","Takeshi Ohashi"],"pdf_url":"https://arxiv.org/pdf/2403.20080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20079v1","updated":"2024-03-29T09:20:29Z","published":"2024-03-29T09:20:29Z","title":"SGD: Street View Synthesis with Gaussian Splatting and Diffusion Prior","summary":" Novel View Synthesis (NVS) for street scenes play a critical role in the\nautonomous driving simulation. The current mainstream technique to achieve it\nis neural rendering, such as Neural Radiance Fields (NeRF) and 3D Gaussian\nSplatting (3DGS). Although thrilling progress has been made, when handling\nstreet scenes, current methods struggle to maintain rendering quality at the\nviewpoint that deviates significantly from the training viewpoints. This issue\nstems from the sparse training views captured by a fixed camera on a moving\nvehicle. To tackle this problem, we propose a novel approach that enhances the\ncapacity of 3DGS by leveraging prior from a Diffusion Model along with\ncomplementary multi-modal data. Specifically, we first fine-tune a Diffusion\nModel by adding images from adjacent frames as condition, meanwhile exploiting\ndepth data from LiDAR point clouds to supply additional spatial information.\nThen we apply the Diffusion Model to regularize the 3DGS at unseen views during\ntraining. Experimental results validate the effectiveness of our method\ncompared with current state-of-the-art models, and demonstrate its advance in\nrendering images from broader views.\n","authors":["Zhongrui Yu","Haoran Wang","Jinze Yang","Hanzhang Wang","Zeke Xie","Yunfeng Cai","Jiale Cao","Zhong Ji","Mingming Sun"],"pdf_url":"https://arxiv.org/pdf/2403.20079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20078v1","updated":"2024-03-29T09:19:52Z","published":"2024-03-29T09:19:52Z","title":"Negative Label Guided OOD Detection with Pretrained Vision-Language\n Models","summary":" Out-of-distribution (OOD) detection aims at identifying samples from unknown\nclasses, playing a crucial role in trustworthy models against errors on\nunexpected inputs. Extensive research has been dedicated to exploring OOD\ndetection in the vision modality. Vision-language models (VLMs) can leverage\nboth textual and visual information for various multi-modal applications,\nwhereas few OOD detection methods take into account information from the text\nmodality. In this paper, we propose a novel post hoc OOD detection method,\ncalled NegLabel, which takes a vast number of negative labels from extensive\ncorpus databases. We design a novel scheme for the OOD score collaborated with\nnegative labels. Theoretical analysis helps to understand the mechanism of\nnegative labels. Extensive experiments demonstrate that our method NegLabel\nachieves state-of-the-art performance on various OOD detection benchmarks and\ngeneralizes well on multiple VLM architectures. 
Furthermore, our method\nNegLabel exhibits remarkable robustness against diverse domain shifts. The\ncodes are available at https://github.com/tmlr-group/NegLabel.\n","authors":["Xue Jiang","Feng Liu","Zhen Fang","Hong Chen","Tongliang Liu","Feng Zheng","Bo Han"],"pdf_url":"https://arxiv.org/pdf/2403.20078v1.pdf","comment":"ICLR 2024 Spotlight"},{"id":"http://arxiv.org/abs/2403.20058v1","updated":"2024-03-29T08:47:49Z","published":"2024-03-29T08:47:49Z","title":"Revolutionizing Disease Diagnosis with simultaneous functional PET/MR\n and Deeply Integrated Brain Metabolic, Hemodynamic, and Perfusion Networks","summary":" Simultaneous functional PET/MR (sf-PET/MR) presents a cutting-edge multimodal\nneuroimaging technique. It provides an unprecedented opportunity for\nconcurrently monitoring and integrating multifaceted brain networks built by\nspatiotemporally covaried metabolic activity, neural activity, and cerebral\nblood flow (perfusion). Albeit high scientific/clinical values, short in\nhardware accessibility of PET/MR hinders its applications, let alone modern\nAI-based PET/MR fusion models. Our objective is to develop a clinically\nfeasible AI-based disease diagnosis model trained on comprehensive sf-PET/MR\ndata with the power of, during inferencing, allowing single modality input\n(e.g., PET only) as well as enforcing multimodal-based accuracy. To this end,\nwe propose MX-ARM, a multimodal MiXture-of-experts Alignment and Reconstruction\nModel. It is modality detachable and exchangeable, allocating different\nmulti-layer perceptrons dynamically (\"mixture of experts\") through learnable\nweights to learn respective representations from different modalities. Such\ndesign will not sacrifice model performance in uni-modal situation. To fully\nexploit the inherent complex and nonlinear relation among modalities while\nproducing fine-grained representations for uni-modal inference, we subsequently\nadd a modal alignment module to line up a dominant modality (e.g., PET) with\nrepresentations of auxiliary modalities (MR). We further adopt multimodal\nreconstruction to promote the quality of learned features. Experiments on\nprecious multimodal sf-PET/MR data for Mild Cognitive Impairment diagnosis\nshowcase the efficacy of our model toward clinically feasible precision\nmedicine.\n","authors":["Luoyu Wang","Yitian Tao","Qing Yang","Yan Liang","Siwei Liu","Hongcheng Shi","Dinggang Shen","Han Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.20058v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2209.11964v2","updated":"2024-03-29T08:46:46Z","published":"2022-09-24T08:57:10Z","title":"Strong Transferable Adversarial Attacks via Ensembled Asymptotically\n Normal Distribution Learning","summary":" Strong adversarial examples are crucial for evaluating and enhancing the\nrobustness of deep neural networks. However, the performance of popular attacks\nis usually sensitive, for instance, to minor image transformations, stemming\nfrom limited information -- typically only one input example, a handful of\nwhite-box source models, and undefined defense strategies. Hence, the crafted\nadversarial examples are prone to overfit the source model, which hampers their\ntransferability to unknown architectures. 
In this paper, we propose an approach\nnamed Multiple Asymptotically Normal Distribution Attacks (MultiANDA) which\nexplicitly characterize adversarial perturbations from a learned distribution.\nSpecifically, we approximate the posterior distribution over the perturbations\nby taking advantage of the asymptotic normality property of stochastic gradient\nascent (SGA), then employ the deep ensemble strategy as an effective proxy for\nBayesian marginalization in this process, aiming to estimate a mixture of\nGaussians that facilitates a more thorough exploration of the potential\noptimization space. The approximated posterior essentially describes the\nstationary distribution of SGA iterations, which captures the geometric\ninformation around the local optimum. Thus, MultiANDA allows drawing an\nunlimited number of adversarial perturbations for each input and reliably\nmaintains the transferability. Our proposed method outperforms ten\nstate-of-the-art black-box attacks on deep learning models with or without\ndefenses through extensive experiments on seven normally trained and seven\ndefense models.\n","authors":["Zhengwei Fang","Rui Wang","Tao Huang","Liping Jing"],"pdf_url":"https://arxiv.org/pdf/2209.11964v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16653v2","updated":"2024-03-29T08:39:23Z","published":"2023-09-28T17:55:05Z","title":"DreamGaussian: Generative Gaussian Splatting for Efficient 3D Content\n Creation","summary":" Recent advances in 3D content creation mostly leverage optimization-based 3D\ngeneration via score distillation sampling (SDS). Though promising results have\nbeen exhibited, these methods often suffer from slow per-sample optimization,\nlimiting their practical usage. In this paper, we propose DreamGaussian, a\nnovel 3D content generation framework that achieves both efficiency and quality\nsimultaneously. Our key insight is to design a generative 3D Gaussian Splatting\nmodel with companioned mesh extraction and texture refinement in UV space. In\ncontrast to the occupancy pruning used in Neural Radiance Fields, we\ndemonstrate that the progressive densification of 3D Gaussians converges\nsignificantly faster for 3D generative tasks. To further enhance the texture\nquality and facilitate downstream applications, we introduce an efficient\nalgorithm to convert 3D Gaussians into textured meshes and apply a fine-tuning\nstage to refine the details. Extensive experiments demonstrate the superior\nefficiency and competitive generation quality of our proposed approach.\nNotably, DreamGaussian produces high-quality textured meshes in just 2 minutes\nfrom a single-view image, achieving approximately 10 times acceleration\ncompared to existing methods.\n","authors":["Jiaxiang Tang","Jiawei Ren","Hang Zhou","Ziwei Liu","Gang Zeng"],"pdf_url":"https://arxiv.org/pdf/2309.16653v2.pdf","comment":"Camera-ready version. Project page: https://dreamgaussian.github.io/"},{"id":"http://arxiv.org/abs/2311.14671v2","updated":"2024-03-29T08:36:41Z","published":"2023-11-24T18:59:42Z","title":"SEGIC: Unleashing the Emergent Correspondence for In-Context\n Segmentation","summary":" In-context segmentation aims at segmenting novel images using a few labeled\nexample images, termed as \"in-context examples\", exploring content similarities\nbetween examples and the target. The resulting models can be generalized\nseamlessly to novel segmentation tasks, significantly reducing the labeling and\ntraining costs compared with conventional pipelines. 
However, in-context\nsegmentation is more challenging than classic ones requiring the model to learn\nsegmentation rules conditioned on a few samples. Unlike previous work with\nad-hoc or non-end-to-end designs, we propose SEGIC, an end-to-end\nsegment-in-context framework built upon a single vision foundation model (VFM).\nIn particular, SEGIC leverages the emergent correspondence within VFM to\ncapture dense relationships between target images and in-context samples. As\nsuch, information from in-context samples is then extracted into three types of\ninstructions, i.e. geometric, visual, and meta instructions, serving as\nexplicit conditions for the final mask prediction. SEGIC is a straightforward\nyet effective approach that yields state-of-the-art performance on one-shot\nsegmentation benchmarks. Notably, SEGIC can be easily generalized to diverse\ntasks, including video object segmentation and open-vocabulary segmentation.\nCode will be available at https://github.com/MengLcool/SEGIC.\n","authors":["Lingchen Meng","Shiyi Lan","Hengduo Li","Jose M. Alvarez","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.14671v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20047v1","updated":"2024-03-29T08:33:05Z","published":"2024-03-29T08:33:05Z","title":"Embracing Unknown Step by Step: Towards Reliable Sparse Training in Real\n World","summary":" Sparse training has emerged as a promising method for resource-efficient deep\nneural networks (DNNs) in real-world applications. However, the reliability of\nsparse models remains a crucial concern, particularly in detecting unknown\nout-of-distribution (OOD) data. This study addresses the knowledge gap by\ninvestigating the reliability of sparse training from an OOD perspective and\nreveals that sparse training exacerbates OOD unreliability. The lack of unknown\ninformation and the sparse constraints hinder the effective exploration of\nweight space and accurate differentiation between known and unknown knowledge.\nTo tackle these challenges, we propose a new unknown-aware sparse training\nmethod, which incorporates a loss modification, auto-tuning strategy, and a\nvoting scheme to guide weight space exploration and mitigate confusion between\nknown and unknown information without incurring significant additional costs or\nrequiring access to additional OOD data. Theoretical insights demonstrate how\nour method reduces model confidence when faced with OOD samples. Empirical\nexperiments across multiple datasets, model architectures, and sparsity levels\nvalidate the effectiveness of our method, with improvements of up to\n\\textbf{8.4\\%} in AUROC while maintaining comparable or higher accuracy and\ncalibration. This research enhances the understanding and readiness of sparse\nDNNs for deployment in resource-limited applications. Our code is available on:\n\\url{https://github.com/StevenBoys/MOON}.\n","authors":["Bowen Lei","Dongkuan Xu","Ruqi Zhang","Bani Mallick"],"pdf_url":"https://arxiv.org/pdf/2403.20047v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09866v2","updated":"2024-03-29T08:25:36Z","published":"2023-12-15T15:09:30Z","title":"PLGSLAM: Progressive Neural Scene Represenation with Local to Global\n Bundle Adjustment","summary":" Neural implicit scene representations have recently shown encouraging results\nin dense visual SLAM. However, existing methods produce low-quality scene\nreconstruction and low-accuracy localization performance when scaling up to\nlarge indoor scenes and long sequences. 
These limitations are mainly due to\ntheir single, global radiance field with finite capacity, which does not adapt\nto large scenarios. Their end-to-end pose networks are also not robust enough\nwith the growth of cumulative errors in large scenes. To this end, we introduce\nPLGSLAM, a neural visual SLAM system capable of high-fidelity surface\nreconstruction and robust camera tracking in real-time. To handle large-scale\nindoor scenes, PLGSLAM proposes a progressive scene representation method which\ndynamically allocates new local scene representation trained with frames within\na local sliding window. This allows us to scale up to larger indoor scenes and\nimproves robustness (even under pose drifts). In local scene representation,\nPLGSLAM utilizes tri-planes for local high-frequency features with multi-layer\nperceptron (MLP) networks for the low-frequency feature, achieving smoothness\nand scene completion in unobserved areas. Moreover, we propose local-to-global\nbundle adjustment method with a global keyframe database to address the\nincreased pose drifts on long sequences. Experimental results demonstrate that\nPLGSLAM achieves state-of-the-art scene reconstruction results and tracking\nperformance across various datasets and scenarios (both in small and\nlarge-scale indoor environments).\n","authors":["Tianchen Deng","Guole Shen","Tong Qin","Jianyu Wang","Wentao Zhao","Jingchuan Wang","Danwei Wang","Weidong Chen"],"pdf_url":"https://arxiv.org/pdf/2312.09866v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2304.14178v3","updated":"2024-03-29T08:13:38Z","published":"2023-04-27T13:27:01Z","title":"mPLUG-Owl: Modularization Empowers Large Language Models with\n Multimodality","summary":" Large language models (LLMs) have demonstrated impressive zero-shot abilities\non a variety of open-ended tasks, while recent research has also explored the\nuse of LLMs for multi-modal generation. In this study, we introduce mPLUG-Owl,\na novel training paradigm that equips LLMs with multi-modal abilities through\nmodularized learning of foundation LLM, a visual knowledge module, and a visual\nabstractor module. This approach can support multiple modalities and facilitate\ndiverse unimodal and multimodal abilities through modality collaboration. The\ntraining paradigm of mPLUG-Owl involves a two-stage method for aligning image\nand text, which learns visual knowledge with the assistance of LLM while\nmaintaining and even improving the generation abilities of LLM. In the first\nstage, the visual knowledge module and abstractor module are trained with a\nfrozen LLM module to align the image and text. In the second stage,\nlanguage-only and multi-modal supervised datasets are used to jointly fine-tune\na low-rank adaption (LoRA) module on LLM and the abstractor module by freezing\nthe visual knowledge module. We carefully build a visually-related instruction\nevaluation set OwlEval. Experimental results show that our model outperforms\nexisting multi-modal models, demonstrating mPLUG-Owl's impressive instruction\nand visual understanding ability, multi-turn conversation ability, and\nknowledge reasoning ability. Besides, we observe some unexpected and exciting\nabilities such as multi-image correlation and scene text understanding, which\nmakes it possible to leverage it for harder real scenarios, such as vision-only\ndocument comprehension. Our code, pre-trained model, instruction-tuned models,\nand evaluation set are available at https://github.com/X-PLUG/mPLUG-Owl. 
The\nonline demo is available at https://www.modelscope.cn/studios/damo/mPLUG-Owl.\n","authors":["Qinghao Ye","Haiyang Xu","Guohai Xu","Jiabo Ye","Ming Yan","Yiyang Zhou","Junyang Wang","Anwen Hu","Pengcheng Shi","Yaya Shi","Chenliang Li","Yuanhong Xu","Hehong Chen","Junfeng Tian","Qi Qian","Ji Zhang","Fei Huang","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2304.14178v3.pdf","comment":"Working in Process"},{"id":"http://arxiv.org/abs/2402.18918v2","updated":"2024-03-29T08:06:38Z","published":"2024-02-29T07:20:02Z","title":"SNE-RoadSegV2: Advancing Heterogeneous Feature Fusion and Fallibility\n Awareness for Freespace Detection","summary":" Feature-fusion networks with duplex encoders have proven to be an effective\ntechnique to solve the freespace detection problem. However, despite the\ncompelling results achieved by previous research efforts, the exploration of\nadequate and discriminative heterogeneous feature fusion, as well as the\ndevelopment of fallibility-aware loss functions remains relatively scarce. This\npaper makes several significant contributions to address these limitations: (1)\nIt presents a novel heterogeneous feature fusion block, comprising a holistic\nattention module, a heterogeneous feature contrast descriptor, and an\naffinity-weighted feature recalibrator, enabling a more in-depth exploitation\nof the inherent characteristics of the extracted features, (2) it incorporates\nboth inter-scale and intra-scale skip connections into the decoder architecture\nwhile eliminating redundant ones, leading to both improved accuracy and\ncomputational efficiency, and (3) it introduces two fallibility-aware loss\nfunctions that separately focus on semantic-transition and depth-inconsistent\nregions, collectively contributing to greater supervision during model\ntraining. Our proposed heterogeneous feature fusion network (SNE-RoadSegV2),\nwhich incorporates all these innovative components, demonstrates superior\nperformance in comparison to all other freespace detection algorithms across\nmultiple public datasets. Notably, it ranks the 1st on the official KITTI Road\nbenchmark.\n","authors":["Yi Feng","Yu Ma","Qijun Chen","Ioannis Pitas","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2402.18918v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20035v1","updated":"2024-03-29T08:03:42Z","published":"2024-03-29T08:03:42Z","title":"UltraLight VM-UNet: Parallel Vision Mamba Significantly Reduces\n Parameters for Skin Lesion Segmentation","summary":" Traditionally for improving the segmentation performance of models, most\napproaches prefer to use adding more complex modules. And this is not suitable\nfor the medical field, especially for mobile medical devices, where\ncomputationally loaded models are not suitable for real clinical environments\ndue to computational resource constraints. Recently, state-space models (SSMs),\nrepresented by Mamba, have become a strong competitor to traditional CNNs and\nTransformers. In this paper, we deeply explore the key elements of parameter\ninfluence in Mamba and propose an UltraLight Vision Mamba UNet (UltraLight\nVM-UNet) based on this. Specifically, we propose a method for processing\nfeatures in parallel Vision Mamba, named PVM Layer, which achieves excellent\nperformance with the lowest computational load while keeping the overall number\nof processing channels constant. 
We conducted comparisons and ablation\nexperiments with several state-of-the-art lightweight models on three skin\nlesion public datasets and demonstrated that the UltraLight VM-UNet exhibits\nthe same strong performance competitiveness with parameters of only 0.049M and\nGFLOPs of 0.060. In addition, this study deeply explores the key elements of\nparameter influence in Mamba, which will lay a theoretical foundation for Mamba\nto possibly become a new mainstream module for lightweighting in the future.\nThe code is available from https://github.com/wurenkai/UltraLight-VM-UNet .\n","authors":["Renkai Wu","Yinghao Liu","Pengchen Liang","Qing Chang"],"pdf_url":"https://arxiv.org/pdf/2403.20035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18795v2","updated":"2024-03-29T08:02:14Z","published":"2024-03-27T17:40:14Z","title":"Gamba: Marry Gaussian Splatting with Mamba for single view 3D\n reconstruction","summary":" We tackle the challenge of efficiently reconstructing a 3D asset from a\nsingle image with growing demands for automated 3D content creation pipelines.\nPrevious methods primarily rely on Score Distillation Sampling (SDS) and Neural\nRadiance Fields (NeRF). Despite their significant success, these approaches\nencounter practical limitations due to lengthy optimization and considerable\nmemory usage. In this report, we introduce Gamba, an end-to-end amortized 3D\nreconstruction model from single-view images, emphasizing two main insights:\n(1) 3D representation: leveraging a large number of 3D Gaussians for an\nefficient 3D Gaussian splatting process; (2) Backbone design: introducing a\nMamba-based sequential network that facilitates context-dependent reasoning and\nlinear scalability with the sequence (token) length, accommodating a\nsubstantial number of Gaussians. Gamba incorporates significant advancements in\ndata preprocessing, regularization design, and training methodologies. We\nassessed Gamba against existing optimization-based and feed-forward 3D\ngeneration approaches using the real-world scanned OmniObject3D dataset. Here,\nGamba demonstrates competitive generation capabilities, both qualitatively and\nquantitatively, while achieving remarkable speed, approximately 0.6 second on a\nsingle NVIDIA A100 GPU.\n","authors":["Qiuhong Shen","Xuanyu Yi","Zike Wu","Pan Zhou","Hanwang Zhang","Shuicheng Yan","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18795v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20034v1","updated":"2024-03-29T07:59:37Z","published":"2024-03-29T07:59:37Z","title":"NeSLAM: Neural Implicit Mapping and Self-Supervised Feature Tracking\n With Depth Completion and Denoising","summary":" In recent years, there have been significant advancements in 3D\nreconstruction and dense RGB-D SLAM systems. One notable development is the\napplication of Neural Radiance Fields (NeRF) in these systems, which utilizes\nimplicit neural representation to encode 3D scenes. This extension of NeRF to\nSLAM has shown promising results. However, the depth images obtained from\nconsumer-grade RGB-D sensors are often sparse and noisy, which poses\nsignificant challenges for 3D reconstruction and affects the accuracy of the\nrepresentation of the scene geometry. Moreover, the original hierarchical\nfeature grid with occupancy value is inaccurate for scene geometry\nrepresentation. Furthermore, the existing methods select random pixels for\ncamera tracking, which leads to inaccurate localization and is not robust in\nreal-world indoor environments. 
To this end, we present NeSLAM, an advanced\nframework that achieves accurate and dense depth estimation, robust camera\ntracking, and realistic synthesis of novel views. First, a depth completion and\ndenoising network is designed to provide dense geometry prior and guide the\nneural implicit representation optimization. Second, the occupancy scene\nrepresentation is replaced with Signed Distance Field (SDF) hierarchical scene\nrepresentation for high-quality reconstruction and view synthesis. Furthermore,\nwe also propose a NeRF-based self-supervised feature tracking algorithm for\nrobust real-time tracking. Experiments on various indoor datasets demonstrate\nthe effectiveness and accuracy of the system in reconstruction, tracking\nquality, and novel view synthesis.\n","authors":["Tianchen Deng","Yanbo Wang","Hongle Xie","Hesheng Wang","Jingchuan Wang","Danwei Wang","Weidong Chen"],"pdf_url":"https://arxiv.org/pdf/2403.20034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20032v1","updated":"2024-03-29T07:58:21Z","published":"2024-03-29T07:58:21Z","title":"HO-Gaussian: Hybrid Optimization of 3D Gaussian Splatting for Urban\n Scenes","summary":" The rapid growth of 3D Gaussian Splatting (3DGS) has revolutionized neural\nrendering, enabling real-time production of high-quality renderings. However,\nthe previous 3DGS-based methods have limitations in urban scenes due to\nreliance on initial Structure-from-Motion(SfM) points and difficulties in\nrendering distant, sky and low-texture areas. To overcome these challenges, we\npropose a hybrid optimization method named HO-Gaussian, which combines a\ngrid-based volume with the 3DGS pipeline. HO-Gaussian eliminates the dependency\non SfM point initialization, allowing for rendering of urban scenes, and\nincorporates the Point Densitification to enhance rendering quality in\nproblematic regions during training. Furthermore, we introduce Gaussian\nDirection Encoding as an alternative for spherical harmonics in the rendering\npipeline, which enables view-dependent color representation. To account for\nmulti-camera systems, we introduce neural warping to enhance object consistency\nacross different cameras. Experimental results on widely used autonomous\ndriving datasets demonstrate that HO-Gaussian achieves photo-realistic\nrendering in real-time on multi-camera urban datasets.\n","authors":["Zhuopeng Li","Yilin Zhang","Chenming Wu","Jianke Zhu","Liangjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.20032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20031v1","updated":"2024-03-29T07:53:06Z","published":"2024-03-29T07:53:06Z","title":"A Unified Framework for Human-centric Point Cloud Video Understanding","summary":" Human-centric Point Cloud Video Understanding (PVU) is an emerging field\nfocused on extracting and interpreting human-related features from sequences of\nhuman point clouds, further advancing downstream human-centric tasks and\napplications. Previous works usually focus on tackling one specific task and\nrely on huge labeled data, which has poor generalization capability.\nConsidering that human has specific characteristics, including the structural\nsemantics of human body and the dynamics of human motions, we propose a unified\nframework to make full use of the prior knowledge and explore the inherent\nfeatures in the data itself for generalized human-centric point cloud video\nunderstanding. 
Extensive experiments demonstrate that our method achieves\nstate-of-the-art performance on various human-related tasks, including action\nrecognition and 3D pose estimation. All datasets and code will be released\nsoon.\n","authors":["Yiteng Xu","Kecheng Ye","Xiao Han","Yiming Ren","Xinge Zhu","Yuexin Ma"],"pdf_url":"https://arxiv.org/pdf/2403.20031v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17496v2","updated":"2024-03-29T07:38:21Z","published":"2024-03-26T08:53:25Z","title":"Dr.Hair: Reconstructing Scalp-Connected Hair Strands without\n Pre-training via Differentiable Rendering of Line Segments","summary":" In the film and gaming industries, achieving a realistic hair appearance\ntypically involves the use of strands originating from the scalp. However,\nreconstructing these strands from observed surface images of hair presents\nsignificant challenges. The difficulty in acquiring Ground Truth (GT) data has\nled state-of-the-art learning-based methods to rely on pre-training with\nmanually prepared synthetic CG data. This process is not only labor-intensive\nand costly but also introduces complications due to the domain gap when\ncompared to real-world data. In this study, we propose an optimization-based\napproach that eliminates the need for pre-training. Our method represents hair\nstrands as line segments growing from the scalp and optimizes them using a\nnovel differentiable rendering algorithm. To robustly optimize a substantial\nnumber of slender explicit geometries, we introduce 3D orientation estimation\nutilizing global optimization, strand initialization based on Laplace's\nequation, and reparameterization that leverages geometric connectivity and\nspatial proximity. Unlike existing optimization-based methods, our method is\ncapable of reconstructing internal hair flow in an absolute direction. Our\nmethod exhibits robust and accurate inverse rendering, surpassing the quality\nof existing methods and significantly improving processing speed.\n","authors":["Yusuke Takimoto","Hikari Takehara","Hiroyuki Sato","Zihao Zhu","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.17496v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.20026v1","updated":"2024-03-29T07:28:50Z","published":"2024-03-29T07:28:50Z","title":"FSMR: A Feature Swapping Multi-modal Reasoning Approach with Joint\n Textual and Visual Clues","summary":" Multi-modal reasoning plays a vital role in bridging the gap between textual\nand visual information, enabling a deeper understanding of the context. This\npaper presents the Feature Swapping Multi-modal Reasoning (FSMR) model,\ndesigned to enhance multi-modal reasoning through feature swapping. FSMR\nleverages a pre-trained visual-language model as an encoder, accommodating both\ntext and image inputs for effective feature representation from both\nmodalities. It introduces a unique feature swapping module, enabling the\nexchange of features between identified objects in images and corresponding\nvocabulary words in text, thereby enhancing the model's comprehension of the\ninterplay between images and text. To further bolster its multi-modal alignment\ncapabilities, FSMR incorporates a multi-modal cross-attention mechanism,\nfacilitating the joint modeling of textual and visual information. During\ntraining, we employ image-text matching and cross-entropy losses to ensure\nsemantic consistency between visual and language elements. 
Extensive\nexperiments on the PMR dataset demonstrate FSMR's superiority over\nstate-of-the-art baseline models across various performance metrics.\n","authors":["Shuang Li","Jiahua Wang","Lijie Wen"],"pdf_url":"https://arxiv.org/pdf/2403.20026v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10908v2","updated":"2024-03-29T07:24:23Z","published":"2023-12-18T03:34:07Z","title":"CLOVA: A Closed-Loop Visual Assistant with Tool Usage and Update","summary":" Utilizing large language models (LLMs) to compose off-the-shelf visual tools\nrepresents a promising avenue of research for developing robust visual\nassistants capable of addressing diverse visual tasks. However, these methods\noften overlook the potential for continual learning, typically by freezing the\nutilized tools, thus limiting their adaptation to environments requiring new\nknowledge. To tackle this challenge, we propose CLOVA, a Closed-Loop Visual\nAssistant, which operates within a framework encompassing inference,\nreflection, and learning phases. During the inference phase, LLMs generate\nprograms and execute corresponding tools to complete assigned tasks. In the\nreflection phase, a multimodal global-local reflection scheme analyzes human\nfeedback to determine which tools require updating. Lastly, the learning phase\nemploys three flexible approaches to automatically gather training data and\nintroduces a novel prompt tuning scheme to update the tools, allowing CLOVA to\nefficiently acquire new knowledge. Experimental findings demonstrate that CLOVA\nsurpasses existing tool-usage methods by 5% in visual question answering and\nmultiple-image reasoning, by 10% in knowledge tagging, and by 20% in image\nediting. These results underscore the significance of the continual learning\ncapability in general visual assistants.\n","authors":["Zhi Gao","Yuntao Du","Xintong Zhang","Xiaojian Ma","Wenjuan Han","Song-Chun Zhu","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2312.10908v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.15082v3","updated":"2024-03-29T07:20:42Z","published":"2024-03-22T10:06:31Z","title":"Cell Variational Information Bottleneck Network","summary":" In this work, we propose Cell Variational Information Bottleneck Network\n(cellVIB), a convolutional neural network using information bottleneck\nmechanism, which can be combined with the latest feedforward network\narchitecture in an end-to-end training method. Our Cell Variational Information\nBottleneck Network is constructed by stacking VIB cells, which generate feature\nmaps with uncertainty. As layers going deeper, the regularization effect will\ngradually increase, instead of directly adding excessive regular constraints to\nthe output layer of the model as in Deep VIB. Under each VIB cell, the\nfeedforward process learns an independent mean term and an standard deviation\nterm, and predicts the Gaussian distribution based on them. The feedback\nprocess is based on reparameterization trick for effective training. This work\nperforms an extensive analysis on MNIST dataset to verify the effectiveness of\neach VIB cells, and provides an insightful analysis on how the VIB cells affect\nmutual information. Experiments conducted on CIFAR-10 also prove that our\ncellVIB is robust against noisy labels during training and against corrupted\nimages during testing. Then, we validate our method on PACS dataset, whose\nresults show that the VIB cells can significantly improve the generalization\nperformance of the basic model. 
Finally, in a more complex representation\nlearning task, face recognition, our network structure has also achieved very\ncompetitive results.\n","authors":["Zhonghua Zhai","Chen Ju","Jinsong Lan","Shuai Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.15082v3.pdf","comment":"Found errors in the article, therefore postponing publication for now"},{"id":"http://arxiv.org/abs/2403.20022v1","updated":"2024-03-29T07:16:34Z","published":"2024-03-29T07:16:34Z","title":"Psychometry: An Omnifit Model for Image Reconstruction from Human Brain\n Activity","summary":" Reconstructing the viewed images from human brain activity bridges human and\ncomputer vision through the Brain-Computer Interface. The inherent variability\nin brain function between individuals leads existing literature to focus on\nacquiring separate models for each individual using their respective brain\nsignal data, ignoring commonalities between these data. In this article, we\ndevise Psychometry, an omnifit model for reconstructing images from functional\nMagnetic Resonance Imaging (fMRI) obtained from different subjects. Psychometry\nincorporates an omni mixture-of-experts (Omni MoE) module where all the experts\nwork together to capture the inter-subject commonalities, while each expert\nassociated with subject-specific parameters copes with the individual\ndifferences. Moreover, Psychometry is equipped with a retrieval-enhanced\ninference strategy, termed Ecphory, which aims to enhance the learned fMRI\nrepresentation via retrieving from prestored subject-specific memories. These\ndesigns collectively render Psychometry omnifit and efficient, enabling it to\ncapture both inter-subject commonality and individual specificity across\nsubjects. As a result, the enhanced fMRI representations serve as conditional\nsignals to guide a generation model to reconstruct high-quality and realistic\nimages, establishing Psychometry as state-of-the-art in terms of both\nhigh-level and low-level metrics.\n","authors":["Ruijie Quan","Wenguan Wang","Zhibo Tian","Fan Ma","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2403.20022v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.20018v1","updated":"2024-03-29T07:14:14Z","published":"2024-03-29T07:14:14Z","title":"SCINeRF: Neural Radiance Fields from a Snapshot Compressive Image","summary":" In this paper, we explore the potential of Snapshot Compressive Imaging (SCI)\ntechnique for recovering the underlying 3D scene representation from a single\ntemporal compressed image. SCI is a cost-effective method that enables the\nrecording of high-dimensional data, such as hyperspectral or temporal\ninformation, into a single image using low-cost 2D imaging sensors. To achieve\nthis, a series of specially designed 2D masks are usually employed, which not\nonly reduces storage requirements but also offers potential privacy protection.\nInspired by this, to take one step further, our approach builds upon the\npowerful 3D scene representation capabilities of neural radiance fields (NeRF).\nSpecifically, we formulate the physical imaging process of SCI as part of the\ntraining of NeRF, allowing us to exploit its impressive performance in\ncapturing complex scene structures. To assess the effectiveness of our method,\nwe conduct extensive evaluations using both synthetic data and real data\ncaptured by our SCI system. Extensive experimental results demonstrate that our\nproposed approach surpasses the state-of-the-art methods in terms of image\nreconstruction and novel view image synthesis. 
Moreover, our method also\nexhibits the ability to restore high frame-rate multi-view consistent images by\nleveraging SCI and the rendering capabilities of NeRF. The code is available at\nhttps://github.com/WU-CVGL/SCINeRF.\n","authors":["Yunhao Li","Xiaodong Wang","Ping Wang","Xin Yuan","Peidong Liu"],"pdf_url":"https://arxiv.org/pdf/2403.20018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20013v1","updated":"2024-03-29T06:58:57Z","published":"2024-03-29T06:58:57Z","title":"DerainNeRF: 3D Scene Estimation with Adhesive Waterdrop Removal","summary":" When capturing images through the glass during rainy or snowy weather\nconditions, the resulting images often contain waterdrops adhered on the glass\nsurface, and these waterdrops significantly degrade the image quality and\nperformance of many computer vision algorithms. To tackle these limitations, we\npropose a method to reconstruct the clear 3D scene implicitly from multi-view\nimages degraded by waterdrops. Our method exploits an attention network to\npredict the location of waterdrops and then train a Neural Radiance Fields to\nrecover the 3D scene implicitly. By leveraging the strong scene representation\ncapabilities of NeRF, our method can render high-quality novel-view images with\nwaterdrops removed. Extensive experimental results on both synthetic and real\ndatasets show that our method is able to generate clear 3D scenes and\noutperforms existing state-of-the-art (SOTA) image adhesive waterdrop removal\nmethods.\n","authors":["Yunhao Li","Jing Wu","Lingzhe Zhao","Peidong Liu"],"pdf_url":"https://arxiv.org/pdf/2403.20013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20012v1","updated":"2024-03-29T06:53:52Z","published":"2024-03-29T06:53:52Z","title":"Colorful Cutout: Enhancing Image Data Augmentation with Curriculum\n Learning","summary":" Data augmentation is one of the regularization strategies for the training of\ndeep learning models, which enhances generalizability and prevents overfitting,\nleading to performance improvement. Although researchers have proposed various\ndata augmentation techniques, they often lack consideration for the difficulty\nof augmented data. Recently, another line of research suggests incorporating\nthe concept of curriculum learning with data augmentation in the field of\nnatural language processing. In this study, we adopt curriculum data\naugmentation for image data augmentation and propose colorful cutout, which\ngradually increases the noise and difficulty introduced in the augmented image.\nOur experimental results highlight the possibility of curriculum data\naugmentation for image data. We publicly released our source code to improve\nthe reproducibility of our study.\n","authors":["Juhwan Choi","YoungBin Kim"],"pdf_url":"https://arxiv.org/pdf/2403.20012v1.pdf","comment":"ICLR 2024 Tiny Papers"},{"id":"http://arxiv.org/abs/2403.12236v2","updated":"2024-03-29T06:41:07Z","published":"2024-03-18T20:33:44Z","title":"Improving Generalization via Meta-Learning on Hard Samples","summary":" Learned reweighting (LRW) approaches to supervised learning use an\noptimization criterion to assign weights for training instances, in order to\nmaximize performance on a representative validation dataset. We pose and\nformalize the problem of optimized selection of the validation set used in LRW\ntraining, to improve classifier generalization. 
In particular, we show that\nusing hard-to-classify instances in the validation set has both a theoretical\nconnection to, and strong empirical evidence of generalization. We provide an\nefficient algorithm for training this meta-optimized model, as well as a simple\ntrain-twice heuristic for careful comparative study. We demonstrate that LRW\nwith easy validation data performs consistently worse than LRW with hard\nvalidation data, establishing the validity of our meta-optimization problem.\nOur proposed algorithm outperforms a wide range of baselines on a range of\ndatasets and domain shift challenges (Imagenet-1K, CIFAR-100, Clothing-1M,\nCAMELYON, WILDS, etc.), with ~1% gains using VIT-B on Imagenet. We also show\nthat using naturally hard examples for validation (Imagenet-R / Imagenet-A) in\nLRW training for Imagenet improves performance on both clean and naturally hard\ntest instances by 1-2%. Secondary analyses show that using hard validation data\nin an LRW framework improves margins on test data, hinting at the mechanism\nunderlying our empirical gains. We believe this work opens up new research\ndirections for the meta-optimization of meta-learning in a supervised learning\ncontext.\n","authors":["Nishant Jain","Arun S. Suggala","Pradeep Shenoy"],"pdf_url":"https://arxiv.org/pdf/2403.12236v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.20002v1","updated":"2024-03-29T06:33:13Z","published":"2024-03-29T06:33:13Z","title":"Grounding and Enhancing Grid-based Models for Neural Fields","summary":" Many contemporary studies utilize grid-based models for neural field\nrepresentation, but a systematic analysis of grid-based models is still\nmissing, hindering the improvement of those models. Therefore, this paper\nintroduces a theoretical framework for grid-based models. This framework points\nout that these models' approximation and generalization behaviors are\ndetermined by grid tangent kernels (GTK), which are intrinsic properties of\ngrid-based models. The proposed framework facilitates a consistent and\nsystematic analysis of diverse grid-based models. Furthermore, the introduced\nframework motivates the development of a novel grid-based model named the\nMultiplicative Fourier Adaptive Grid (MulFAGrid). The numerical analysis\ndemonstrates that MulFAGrid exhibits a lower generalization bound than its\npredecessors, indicating its robust generalization performance. Empirical\nstudies reveal that MulFAGrid achieves state-of-the-art performance in various\ntasks, including 2D image fitting, 3D signed distance field (SDF)\nreconstruction, and novel view synthesis, demonstrating superior representation\nability. The project website is available at\nhttps://sites.google.com/view/cvpr24-2034-submission/home.\n","authors":["Zelin Zhao","Fenglei Fan","Wenlong Liao","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2403.20002v1.pdf","comment":"Accepted in CVPR24"},{"id":"http://arxiv.org/abs/2312.13528v2","updated":"2024-03-29T05:57:33Z","published":"2023-12-21T02:01:19Z","title":"DyBluRF: Dynamic Deblurring Neural Radiance Fields for Blurry Monocular\n Video","summary":" Neural Radiance Fields (NeRF), initially developed for static scenes, have\ninspired many video novel view synthesis techniques. However, the challenge for\nvideo view synthesis arises from motion blur, a consequence of object or camera\nmovement during exposure, which hinders the precise synthesis of sharp\nspatio-temporal views. 
In response, we propose a novel dynamic deblurring NeRF\nframework for blurry monocular video, called DyBluRF, consisting of a Base Ray\nInitialization (BRI) stage and a Motion Decomposition-based Deblurring (MDD)\nstage. Our DyBluRF is the first that handles the novel view synthesis for\nblurry monocular video with a novel two-stage framework. In the BRI stage, we\ncoarsely reconstruct dynamic 3D scenes and jointly initialize the base ray,\nwhich is further used to predict latent sharp rays, using the inaccurate camera\npose information from the given blurry frames. In the MDD stage, we introduce a\nnovel Incremental Latent Sharp-rays Prediction (ILSP) approach for the blurry\nmonocular video frames by decomposing the latent sharp rays into global camera\nmotion and local object motion components. We further propose two loss\nfunctions for effective geometry regularization and decomposition of static and\ndynamic scene components without any mask supervision. Experiments show that\nDyBluRF outperforms qualitatively and quantitatively the SOTA methods.\n","authors":["Minh-Quan Viet Bui","Jongmin Park","Jihyong Oh","Munchurl Kim"],"pdf_url":"https://arxiv.org/pdf/2312.13528v2.pdf","comment":"The first two authors contributed equally to this work (equal\n contribution). The last two authors advised equally to this work. Please\n visit our project page at https://kaist-viclab.github.io/dyblurf-site/"},{"id":"http://arxiv.org/abs/2403.19985v1","updated":"2024-03-29T05:39:47Z","published":"2024-03-29T05:39:47Z","title":"Stable Surface Regularization for Fast Few-Shot NeRF","summary":" This paper proposes an algorithm for synthesizing novel views under few-shot\nsetup. The main concept is to develop a stable surface regularization technique\ncalled Annealing Signed Distance Function (ASDF), which anneals the surface in\na coarse-to-fine manner to accelerate convergence speed. We observe that the\nEikonal loss - which is a widely known geometric regularization - requires\ndense training signal to shape different level-sets of SDF, leading to\nlow-fidelity results under few-shot training. In contrast, the proposed surface\nregularization successfully reconstructs scenes and produce high-fidelity\ngeometry with stable training. Our method is further accelerated by utilizing\ngrid representation and monocular geometric priors. Finally, the proposed\napproach is up to 45 times faster than existing few-shot novel view synthesis\nmethods, and it produces comparable results in the ScanNet dataset and\nNeRF-Real dataset.\n","authors":["Byeongin Joung","Byeong-Uk Lee","Jaesung Choe","Ukcheol Shin","Minjun Kang","Taeyeop Lee","In So Kweon","Kuk-Jin Yoon"],"pdf_url":"https://arxiv.org/pdf/2403.19985v1.pdf","comment":"3DV 2024"},{"id":"http://arxiv.org/abs/2403.19983v1","updated":"2024-03-29T05:35:04Z","published":"2024-03-29T05:35:04Z","title":"A multi-stage semi-supervised learning for ankle fracture classification\n on CT images","summary":" Because of the complicated mechanism of ankle injury, it is very difficult to\ndiagnose ankle fracture in clinic. In order to simplify the process of fracture\ndiagnosis, an automatic diagnosis model of ankle fracture was proposed.\nFirstly, a tibia-fibula segmentation network is proposed for the joint\ntibiofibular region of the ankle joint, and the corresponding segmentation\ndataset is established on the basis of fracture data. Secondly, the image\nregistration method is used to register the bone segmentation mask with the\nnormal bone mask. 
Finally, a semi-supervised classifier is constructed to make\nfull use of a large number of unlabeled data to classify ankle fractures.\nExperiments show that the proposed method can segment fractures with fracture\nlines accurately and has better performance than the general method. At the\nsame time, this method is superior to classification network in several\nindexes.\n","authors":["Hongzhi Liu","Guicheng Li","Jiacheng Nie","Hui Tang","Chunfeng Yang","Qianjin Feng","Hailin Xu","Yang Chen"],"pdf_url":"https://arxiv.org/pdf/2403.19983v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19980v1","updated":"2024-03-29T05:23:34Z","published":"2024-03-29T05:23:34Z","title":"A Parallel Attention Network for Cattle Face Recognition","summary":" Cattle face recognition holds paramount significance in domains such as\nanimal husbandry and behavioral research. Despite significant progress in\nconfined environments, applying these accomplishments in wild settings remains\nchallenging. Thus, we create the first large-scale cattle face recognition\ndataset, ICRWE, for wild environments. It encompasses 483 cattle and 9,816\nhigh-resolution image samples. Each sample undergoes annotation for face\nfeatures, light conditions, and face orientation. Furthermore, we introduce a\nnovel parallel attention network, PANet. Comprising several cascaded\nTransformer modules, each module incorporates two parallel Position Attention\nModules (PAM) and Feature Mapping Modules (FMM). PAM focuses on local and\nglobal features at each image position through parallel channel attention, and\nFMM captures intricate feature patterns through non-linear mappings.\nExperimental results indicate that PANet achieves a recognition accuracy of\n88.03% on the ICRWE dataset, establishing itself as the current\nstate-of-the-art approach. The source code is available in the supplementary\nmaterials.\n","authors":["Jiayu Li","Xuechao Zou","Shiying Wang","Ben Chen","Junliang Xing","Pin Tao"],"pdf_url":"https://arxiv.org/pdf/2403.19980v1.pdf","comment":"Accepted by ICME 2024"},{"id":"http://arxiv.org/abs/2403.19979v1","updated":"2024-03-29T05:23:12Z","published":"2024-03-29T05:23:12Z","title":"Semantically-Shifted Incremental Adapter-Tuning is A Continual\n ViTransformer","summary":" Class-incremental learning (CIL) aims to enable models to continuously learn\nnew classes while overcoming catastrophic forgetting. The introduction of\npre-trained models has brought new tuning paradigms to CIL. In this paper, we\nrevisit different parameter-efficient tuning (PET) methods within the context\nof continual learning. We observe that adapter tuning demonstrates superiority\nover prompt-based methods, even without parameter expansion in each learning\nsession. Motivated by this, we propose incrementally tuning the shared adapter\nwithout imposing parameter update constraints, enhancing the learning capacity\nof the backbone. Additionally, we employ feature sampling from stored\nprototypes to retrain a unified classifier, further improving its performance.\nWe estimate the semantic shift of old prototypes without access to past samples\nand update stored prototypes session by session. Our proposed method eliminates\nmodel expansion and avoids retaining any image samples. It surpasses previous\npre-trained model-based CIL methods and demonstrates remarkable continual\nlearning capabilities. 
Experimental results on five CIL benchmarks validate the\neffectiveness of our approach, achieving state-of-the-art (SOTA) performance.\n","authors":["Yuwen Tan","Qinhao Zhou","Xiang Xiang","Ke Wang","Yuchuan Wu","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2403.19979v1.pdf","comment":"To appear at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16558v2","updated":"2024-03-29T05:12:45Z","published":"2024-03-25T09:17:15Z","title":"Elysium: Exploring Object-level Perception in Videos via MLLM","summary":" Multi-modal Large Language Models (MLLMs) have demonstrated their ability to\nperceive objects in still images, but their application in video-related tasks,\nsuch as object tracking, remains understudied. This lack of exploration is\nprimarily due to two key challenges. Firstly, extensive pretraining on\nlarge-scale video datasets is required to equip MLLMs with the capability to\nperceive objects across multiple frames and understand inter-frame\nrelationships. Secondly, processing a large number of frames within the context\nwindow of Large Language Models (LLMs) can impose a significant computational\nburden. To address the first challenge, we introduce ElysiumTrack-1M, a\nlarge-scale video dataset supported for three tasks: Single Object Tracking\n(SOT), Referring Single Object Tracking (RSOT), and Video Referring Expression\nGeneration (Video-REG). ElysiumTrack-1M contains 1.27 million annotated video\nframes with corresponding object boxes and descriptions. Leveraging this\ndataset, we conduct training of MLLMs and propose a token-compression model\nT-Selector to tackle the second challenge. Our proposed approach, Elysium:\nExploring Object-level Perception in Videos via MLLM, is an end-to-end\ntrainable MLLM that attempts to conduct object-level tasks in videos without\nrequiring any additional plug-in or expert models. All codes and datasets are\navailable at https://github.com/Hon-Wong/Elysium.\n","authors":["Han Wang","Yanjie Wang","Yongjie Ye","Yuxiang Nie","Can Huang"],"pdf_url":"https://arxiv.org/pdf/2403.16558v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19976v1","updated":"2024-03-29T04:58:56Z","published":"2024-03-29T04:58:56Z","title":"eTraM: Event-based Traffic Monitoring Dataset","summary":" Event cameras, with their high temporal and dynamic range and minimal memory\nusage, have found applications in various fields. However, their potential in\nstatic traffic monitoring remains largely unexplored. To facilitate this\nexploration, we present eTraM - a first-of-its-kind, fully event-based traffic\nmonitoring dataset. eTraM offers 10 hr of data from different traffic scenarios\nin various lighting and weather conditions, providing a comprehensive overview\nof real-world situations. Providing 2M bounding box annotations, it covers\neight distinct classes of traffic participants, ranging from vehicles to\npedestrians and micro-mobility. eTraM's utility has been assessed using\nstate-of-the-art methods for traffic participant detection, including RVT, RED,\nand YOLOv8. We quantitatively evaluate the ability of event-based models to\ngeneralize on nighttime and unseen scenes. Our findings substantiate the\ncompelling potential of leveraging event cameras for traffic monitoring,\nopening new avenues for research and application. 
eTraM is available at\nhttps://eventbasedvision.github.io/eTraM\n","authors":["Aayush Atul Verma","Bharatesh Chakravarthi","Arpitsinh Vaghela","Hua Wei","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19975v1","updated":"2024-03-29T04:58:33Z","published":"2024-03-29T04:58:33Z","title":"Context-Aware Integration of Language and Visual References for Natural\n Language Tracking","summary":" Tracking by natural language specification (TNL) aims to consistently\nlocalize a target in a video sequence given a linguistic description in the\ninitial frame. Existing methodologies perform language-based and template-based\nmatching for target reasoning separately and merge the matching results from\ntwo sources, which suffer from tracking drift when language and visual\ntemplates miss-align with the dynamic target state and ambiguity in the later\nmerging stage. To tackle the issues, we propose a joint multi-modal tracking\nframework with 1) a prompt modulation module to leverage the complementarity\nbetween temporal visual templates and language expressions, enabling precise\nand context-aware appearance and linguistic cues, and 2) a unified target\ndecoding module to integrate the multi-modal reference cues and executes the\nintegrated queries on the search image to predict the target location in an\nend-to-end manner directly. This design ensures spatio-temporal consistency by\nleveraging historical visual information and introduces an integrated solution,\ngenerating predictions in a single step. Extensive experiments conducted on\nTNL2K, OTB-Lang, LaSOT, and RefCOCOg validate the efficacy of our proposed\napproach. The results demonstrate competitive performance against\nstate-of-the-art methods for both tracking and grounding.\n","authors":["Yanyan Shao","Shuting He","Qi Ye","Yuchao Feng","Wenhan Luo","Jiming Chen"],"pdf_url":"https://arxiv.org/pdf/2403.19975v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2311.17132v2","updated":"2024-03-29T04:55:51Z","published":"2023-11-28T18:03:27Z","title":"TransNeXt: Robust Foveal Visual Perception for Vision Transformers","summary":" Due to the depth degradation effect in residual connections, many efficient\nVision Transformers models that rely on stacking layers for information\nexchange often fail to form sufficient information mixing, leading to unnatural\nvisual perception. To address this issue, in this paper, we propose Aggregated\nAttention, a biomimetic design-based token mixer that simulates biological\nfoveal vision and continuous eye movement while enabling each token on the\nfeature map to have a global perception. Furthermore, we incorporate learnable\ntokens that interact with conventional queries and keys, which further\ndiversifies the generation of affinity matrices beyond merely relying on the\nsimilarity between queries and keys. Our approach does not rely on stacking for\ninformation exchange, thus effectively avoiding depth degradation and achieving\nnatural visual perception. Additionally, we propose Convolutional GLU, a\nchannel mixer that bridges the gap between GLU and SE mechanism, which empowers\neach token to have channel attention based on its nearest neighbor image\nfeatures, enhancing local modeling capability and model robustness. We combine\naggregated attention and convolutional GLU to create a new visual backbone\ncalled TransNeXt. 
Extensive experiments demonstrate that our TransNeXt achieves\nstate-of-the-art performance across multiple model sizes. At a resolution of\n$224^2$, TransNeXt-Tiny attains an ImageNet accuracy of 84.0%, surpassing\nConvNeXt-B with 69% fewer parameters. Our TransNeXt-Base achieves an ImageNet\naccuracy of 86.2% and an ImageNet-A accuracy of 61.6% at a resolution of\n$384^2$, a COCO object detection mAP of 57.1, and an ADE20K semantic\nsegmentation mIoU of 54.7.\n","authors":["Dai Shi"],"pdf_url":"https://arxiv.org/pdf/2311.17132v2.pdf","comment":"CVPR 2024 Camera-ready Version. Project Page:\n https://github.com/DaiShiResearch/TransNeXt"},{"id":"http://arxiv.org/abs/2312.11461v2","updated":"2024-03-29T04:32:57Z","published":"2023-12-18T18:59:12Z","title":"GAvatar: Animatable 3D Gaussian Avatars with Implicit Mesh Learning","summary":" Gaussian splatting has emerged as a powerful 3D representation that harnesses\nthe advantages of both explicit (mesh) and implicit (NeRF) 3D representations.\nIn this paper, we seek to leverage Gaussian splatting to generate realistic\nanimatable avatars from textual descriptions, addressing the limitations (e.g.,\nflexibility and efficiency) imposed by mesh or NeRF-based representations.\nHowever, a naive application of Gaussian splatting cannot generate high-quality\nanimatable avatars and suffers from learning instability; it also cannot\ncapture fine avatar geometries and often leads to degenerate body parts. To\ntackle these problems, we first propose a primitive-based 3D Gaussian\nrepresentation where Gaussians are defined inside pose-driven primitives to\nfacilitate animation. Second, to stabilize and amortize the learning of\nmillions of Gaussians, we propose to use neural implicit fields to predict the\nGaussian attributes (e.g., colors). Finally, to capture fine avatar geometries\nand extract detailed meshes, we propose a novel SDF-based implicit mesh\nlearning approach for 3D Gaussians that regularizes the underlying geometries\nand extracts highly detailed textured meshes. Our proposed method, GAvatar,\nenables the large-scale generation of diverse animatable avatars using only\ntext prompts. GAvatar significantly surpasses existing methods in terms of both\nappearance and geometry quality, and achieves extremely fast rendering (100\nfps) at 1K resolution.\n","authors":["Ye Yuan","Xueting Li","Yangyi Huang","Shalini De Mello","Koki Nagano","Jan Kautz","Umar Iqbal"],"pdf_url":"https://arxiv.org/pdf/2312.11461v2.pdf","comment":"CVPR 2024. Project website: https://nvlabs.github.io/GAvatar"},{"id":"http://arxiv.org/abs/2403.19969v1","updated":"2024-03-29T04:28:06Z","published":"2024-03-29T04:28:06Z","title":"Separate, Dynamic and Differentiable (SMART) Pruner for Block/Output\n Channel Pruning on Computer Vision Tasks","summary":" Deep Neural Network (DNN) pruning has emerged as a key strategy to reduce\nmodel size, improve inference latency, and lower power consumption on DNN\naccelerators. Among various pruning techniques, block and output channel\npruning have shown significant potential in accelerating hardware performance.\nHowever, their accuracy often requires further improvement. 
In response to this\nchallenge, we introduce a separate, dynamic and differentiable (SMART) pruner.\nThis pruner stands out by utilizing a separate, learnable probability mask for\nweight importance ranking, employing a differentiable Top k operator to achieve\ntarget sparsity, and leveraging a dynamic temperature parameter trick to escape\nfrom non-sparse local minima. In our experiments, the SMART pruner consistently\ndemonstrated its superiority over existing pruning methods across a wide range\nof tasks and models on block and output channel pruning. Additionally, we\nextend our testing to Transformer-based models in N:M pruning scenarios, where\nSMART pruner also yields state-of-the-art results, demonstrating its\nadaptability and robustness across various neural network architectures, and\npruning types.\n","authors":["Guanhua Ding","Zexi Ye","Zhen Zhong","Gang Li","David Shao"],"pdf_url":"https://arxiv.org/pdf/2403.19969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10521v3","updated":"2024-03-29T04:21:27Z","published":"2024-03-15T17:59:53Z","title":"P-MapNet: Far-seeing Map Generator Enhanced by both SDMap and HDMap\n Priors","summary":" Autonomous vehicles are gradually entering city roads today, with the help of\nhigh-definition maps (HDMaps). However, the reliance on HDMaps prevents\nautonomous vehicles from stepping into regions without this expensive digital\ninfrastructure. This fact drives many researchers to study online HDMap\ngeneration algorithms, but the performance of these algorithms at far regions\nis still unsatisfying. We present P-MapNet, in which the letter P highlights\nthe fact that we focus on incorporating map priors to improve model\nperformance. Specifically, we exploit priors in both SDMap and HDMap. On one\nhand, we extract weakly aligned SDMap from OpenStreetMap, and encode it as an\nadditional conditioning branch. Despite the misalignment challenge, our\nattention-based architecture adaptively attends to relevant SDMap skeletons and\nsignificantly improves performance. On the other hand, we exploit a masked\nautoencoder to capture the prior distribution of HDMap, which can serve as a\nrefinement module to mitigate occlusions and artifacts. We benchmark on the\nnuScenes and Argoverse2 datasets. Through comprehensive experiments, we show\nthat: (1) our SDMap prior can improve online map generation performance, using\nboth rasterized (by up to $+18.73$ $\\rm mIoU$) and vectorized (by up to $+8.50$\n$\\rm mAP$) output representations. (2) our HDMap prior can improve map\nperceptual metrics by up to $6.34\\%$. (3) P-MapNet can be switched into\ndifferent inference modes that covers different regions of the\naccuracy-efficiency trade-off landscape. (4) P-MapNet is a far-seeing solution\nthat brings larger improvements on longer ranges. Codes and models are publicly\navailable at https://jike5.github.io/P-MapNet.\n","authors":["Zhou Jiang","Zhenxin Zhu","Pengfei Li","Huan-ang Gao","Tianyuan Yuan","Yongliang Shi","Hang Zhao","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.10521v3.pdf","comment":"Code: https://jike5.github.io/P-MapNet"},{"id":"http://arxiv.org/abs/2211.13398v3","updated":"2024-03-29T04:13:49Z","published":"2022-11-24T03:27:00Z","title":"CPPF++: Uncertainty-Aware Sim2Real Object Pose Estimation by Vote\n Aggregation","summary":" Object pose estimation constitutes a critical area within the domain of 3D\nvision. 
While contemporary state-of-the-art methods that leverage real-world\npose annotations have demonstrated commendable performance, the procurement of\nsuch real training data incurs substantial costs. This paper focuses on a\nspecific setting wherein only 3D CAD models are utilized as a priori knowledge,\ndevoid of any background or clutter information. We introduce a novel method,\nCPPF++, designed for sim-to-real pose estimation. This method builds upon the\nfoundational point-pair voting scheme of CPPF, reformulating it through a\nprobabilistic view. To address the challenge posed by vote collision, we\npropose a novel approach that involves modeling the voting uncertainty by\nestimating the probabilistic distribution of each point pair within the\ncanonical space. Furthermore, we augment the contextual information provided by\neach voting unit through the introduction of N-point tuples. To enhance the\nrobustness and accuracy of the model, we incorporate several innovative\nmodules, including noisy pair filtering, online alignment optimization, and a\ntuple feature ensemble. Alongside these methodological advancements, we\nintroduce a new category-level pose estimation dataset, named DiversePose 300.\nEmpirical evidence demonstrates that our method significantly surpasses\nprevious sim-to-real approaches and achieves comparable or superior performance\non novel datasets. Our code is available on https://github.com/qq456cvb/CPPF2.\n","authors":["Yang You","Wenhao He","Jin Liu","Hongkai Xiong","Weiming Wang","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2211.13398v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19967v1","updated":"2024-03-29T04:10:07Z","published":"2024-03-29T04:10:07Z","title":"Rewrite the Stars","summary":" Recent studies have drawn attention to the untapped potential of the \"star\noperation\" (element-wise multiplication) in network design. While intuitive\nexplanations abound, the foundational rationale behind its application remains\nlargely unexplored. Our study attempts to reveal the star operation's ability\nto map inputs into high-dimensional, non-linear feature spaces -- akin to\nkernel tricks -- without widening the network. We further introduce StarNet, a\nsimple yet powerful prototype, demonstrating impressive performance and low\nlatency under compact network structure and efficient budget. Like stars in the\nsky, the star operation appears unremarkable but holds a vast universe of\npotential. Our work encourages further exploration across tasks, with codes\navailable at https://github.com/ma-xu/Rewrite-the-Stars.\n","authors":["Xu Ma","Xiyang Dai","Yue Bai","Yizhou Wang","Yun Fu"],"pdf_url":"https://arxiv.org/pdf/2403.19967v1.pdf","comment":"Accepted by CVPR 2024. Codes are made publically available at\n https://github.com/ma-xu/Rewrite-the-Stars"},{"id":"http://arxiv.org/abs/2311.16714v2","updated":"2024-03-29T04:07:25Z","published":"2023-11-28T11:53:56Z","title":"Embodied Multi-Modal Agent trained by an LLM from a Parallel TextWorld","summary":" While large language models (LLMs) excel in a simulated world of texts, they\nstruggle to interact with the more realistic world without perceptions of other\nmodalities such as visual or audio signals. Although vision-language models\n(VLMs) integrate LLM modules (1) aligned with static image features, and (2)\nmay possess prior knowledge of world dynamics (as demonstrated in the text\nworld), they have not been trained in an embodied visual world and thus cannot\nalign with its dynamics. 
On the other hand, training an embodied agent in a\nnoisy visual world without expert guidance is often challenging and\ninefficient. In this paper, we train a VLM agent living in a visual world using\nan LLM agent excelling in a parallel text world. Specifically, we distill LLM's\nreflection outcomes (improved actions by analyzing mistakes) in a text world's\ntasks to finetune the VLM on the same tasks of the visual world, resulting in\nan Embodied Multi-Modal Agent (EMMA) quickly adapting to the visual world\ndynamics. Such cross-modality imitation learning between the two parallel\nworlds is achieved by a novel DAgger-DPO algorithm, enabling EMMA to generalize\nto a broad scope of new tasks without any further guidance from the LLM expert.\nExtensive evaluations on the ALFWorld benchmark's diverse tasks highlight\nEMMA's superior performance to SOTA VLM-based agents, e.g., 20%-70% improvement\nin the success rate.\n","authors":["Yijun Yang","Tianyi Zhou","Kanxue Li","Dapeng Tao","Lusong Li","Li Shen","Xiaodong He","Jing Jiang","Yuhui Shi"],"pdf_url":"https://arxiv.org/pdf/2311.16714v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19966v1","updated":"2024-03-29T04:02:51Z","published":"2024-03-29T04:02:51Z","title":"Multi-task Magnetic Resonance Imaging Reconstruction using Meta-learning","summary":" Using single-task deep learning methods to reconstruct Magnetic Resonance\nImaging (MRI) data acquired with different imaging sequences is inherently\nchallenging. The trained deep learning model typically lacks generalizability,\nand the dissimilarity among image datasets with different types of contrast\nleads to suboptimal learning performance. This paper proposes a meta-learning\napproach to efficiently learn image features from multiple MR image datasets.\nOur algorithm can perform multi-task learning to simultaneously reconstruct MR\nimages acquired using different imaging sequences with different image\ncontrasts. The experiment results demonstrate the ability of our new\nmeta-learning reconstruction method to successfully reconstruct\nhighly-undersampled k-space data from multiple MRI datasets simultaneously,\noutperforming other compelling reconstruction methods previously developed for\nsingle-task learning.\n","authors":["Wanyu Bian","Albert Jang","Fang Liu"],"pdf_url":"https://arxiv.org/pdf/2403.19966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19964v1","updated":"2024-03-29T03:56:19Z","published":"2024-03-29T03:56:19Z","title":"FairRAG: Fair Human Generation via Fair Retrieval Augmentation","summary":" Existing text-to-image generative models reflect or even amplify societal\nbiases ingrained in their training data. This is especially concerning for\nhuman image generation where models are biased against certain demographic\ngroups. Existing attempts to rectify this issue are hindered by the inherent\nlimitations of the pre-trained models and fail to substantially improve\ndemographic diversity. In this work, we introduce Fair Retrieval Augmented\nGeneration (FairRAG), a novel framework that conditions pre-trained generative\nmodels on reference images retrieved from an external image database to improve\nfairness in human generation. FairRAG enables conditioning through a\nlightweight linear module that projects reference images into the textual\nspace. To enhance fairness, FairRAG applies simple-yet-effective debiasing\nstrategies, providing images from diverse demographic groups during the\ngenerative process. 
Extensive experiments demonstrate that FairRAG outperforms\nexisting methods in terms of demographic diversity, image-text alignment, and\nimage fidelity while incurring minimal computational overhead during inference.\n","authors":["Robik Shrestha","Yang Zou","Qiuyu Chen","Zhiheng Li","Yusheng Xie","Siqi Deng"],"pdf_url":"https://arxiv.org/pdf/2403.19964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19963v1","updated":"2024-03-29T03:48:35Z","published":"2024-03-29T03:48:35Z","title":"Efficient Modulation for Vision Networks","summary":" In this work, we present efficient modulation, a novel design for efficient\nvision networks. We revisit the modulation mechanism, which operates input\nthrough convolutional context modeling and feature projection layers, and fuses\nfeatures via element-wise multiplication and an MLP block. We demonstrate that\nthe modulation mechanism is particularly well suited for efficient networks and\nfurther tailor the modulation design by proposing the efficient modulation\n(EfficientMod) block, which is considered the essential building block for our\nnetworks. Benefiting from the prominent representational ability of modulation\nmechanism and the proposed efficient design, our network can accomplish better\ntrade-offs between accuracy and efficiency and set new state-of-the-art\nperformance in the zoo of efficient networks. When integrating EfficientMod\nwith the vanilla self-attention block, we obtain the hybrid architecture which\nfurther improves the performance without loss of efficiency. We carry out\ncomprehensive experiments to verify EfficientMod's performance. With fewer\nparameters, our EfficientMod-s performs 0.6 top-1 accuracy better than\nEfficientFormerV2-s2 and is 25% faster on GPU, and 2.9 better than\nMobileViTv2-1.0 at the same GPU latency. Additionally, our method presents a\nnotable improvement in downstream tasks, outperforming EfficientFormerV2-s by\n3.6 mIoU on the ADE20K benchmark. Code and checkpoints are available at\nhttps://github.com/ma-xu/EfficientMod.\n","authors":["Xu Ma","Xiyang Dai","Jianwei Yang","Bin Xiao","Yinpeng Chen","Yun Fu","Lu Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.19963v1.pdf","comment":"Accepted by ICLR 2024. Codes are made publically available at\n https://github.com/ma-xu/EfficientMod"},{"id":"http://arxiv.org/abs/2310.05916v4","updated":"2024-03-29T03:40:47Z","published":"2023-10-09T17:59:04Z","title":"Interpreting CLIP's Image Representation via Text-Based Decomposition","summary":" We investigate the CLIP image encoder by analyzing how individual model\ncomponents affect the final representation. We decompose the image\nrepresentation as a sum across individual image patches, model layers, and\nattention heads, and use CLIP's text representation to interpret the summands.\nInterpreting the attention heads, we characterize each head's role by\nautomatically finding text representations that span its output space, which\nreveals property-specific roles for many heads (e.g. location or shape). Next,\ninterpreting the image patches, we uncover an emergent spatial localization\nwithin CLIP. Finally, we use this understanding to remove spurious features\nfrom CLIP and to create a strong zero-shot image segmenter. Our results\nindicate that a scalable understanding of transformer models is attainable and\ncan be used to repair and improve models.\n","authors":["Yossi Gandelsman","Alexei A. 
Efros","Jacob Steinhardt"],"pdf_url":"https://arxiv.org/pdf/2310.05916v4.pdf","comment":"Project page and code:\n https://yossigandelsman.github.io/clip_decomposition/"},{"id":"http://arxiv.org/abs/2403.19949v1","updated":"2024-03-29T03:15:31Z","published":"2024-03-29T03:15:31Z","title":"FairCLIP: Harnessing Fairness in Vision-Language Learning","summary":" Fairness is a critical concern in deep learning, especially in healthcare,\nwhere these models influence diagnoses and treatment decisions. Although\nfairness has been investigated in the vision-only domain, the fairness of\nmedical vision-language (VL) models remains unexplored due to the scarcity of\nmedical VL datasets for studying fairness. To bridge this research gap, we\nintroduce the first fair vision-language medical dataset FairVLMed that\nprovides detailed demographic attributes, ground-truth labels, and clinical\nnotes to facilitate an in-depth examination of fairness within VL foundation\nmodels. Using FairVLMed, we conduct a comprehensive fairness analysis of two\nwidely-used VL models (CLIP and BLIP2), pre-trained on both natural and medical\ndomains, across four different protected attributes. Our results highlight\nsignificant biases in all VL models, with Asian, Male, Non-Hispanic, and\nSpanish being the preferred subgroups across the protected attributes of race,\ngender, ethnicity, and language, respectively. In order to alleviate these\nbiases, we propose FairCLIP, an optimal-transport-based approach that achieves\na favorable trade-off between performance and fairness by reducing the Sinkhorn\ndistance between the overall sample distribution and the distributions\ncorresponding to each demographic group. As the first VL dataset of its kind,\nFairVLMed holds the potential to catalyze advancements in the development of\nmachine learning models that are both ethically aware and clinically effective.\nOur dataset and code are available at\nhttps://ophai.hms.harvard.edu/datasets/fairvlmed10k.\n","authors":["Yan Luo","Min Shi","Muhammad Osama Khan","Muhammad Muneeb Afzal","Hao Huang","Shuaihang Yuan","Yu Tian","Luo Song","Ava Kouhana","Tobias Elze","Yi Fang","Mengyu Wang"],"pdf_url":"https://arxiv.org/pdf/2403.19949v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19944v1","updated":"2024-03-29T02:55:07Z","published":"2024-03-29T02:55:07Z","title":"Binarized Low-light Raw Video Enhancement","summary":" Recently, deep neural networks have achieved excellent performance on\nlow-light raw video enhancement. However, they often come with high\ncomputational complexity and large memory costs, which hinder their\napplications on resource-limited devices. In this paper, we explore the\nfeasibility of applying the extremely compact binary neural network (BNN) to\nlow-light raw video enhancement. Nevertheless, there are two main issues with\nbinarizing video enhancement models. One is how to fuse the temporal\ninformation to improve low-light denoising without complex modules. The other\nis how to narrow the performance gap between binary convolutions with the full\nprecision ones. To address the first issue, we introduce a spatial-temporal\nshift operation, which is easy-to-binarize and effective. The temporal shift\nefficiently aggregates the features of neighbor frames and the spatial shift\nhandles the misalignment caused by the large motion in videos. 
For the second\nissue, we present a distribution-aware binary convolution, which captures the\ndistribution characteristics of real-valued input and incorporates them into\nplain binary convolutions to alleviate the degradation in performance.\nExtensive quantitative and qualitative experiments have shown our\nhigh-efficiency binarized low-light raw video enhancement method can attain a\npromising performance.\n","authors":["Gengchen Zhang","Yulun Zhang","Xin Yuan","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2403.19944v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19935v1","updated":"2024-03-29T02:42:22Z","published":"2024-03-29T02:42:22Z","title":"CP HDR: A feature point detection and description library for LDR and\n HDR images","summary":" In computer vision, characteristics refer to image regions with unique\nproperties, such as corners, edges, textures, or areas with high contrast.\nThese regions can be represented through feature points (FPs). FP detection and\ndescription are fundamental steps to many computer vision tasks. Most FP\ndetection and description methods use low dynamic range (LDR) images,\nsufficient for most applications involving digital images. However, LDR images\nmay have saturated pixels in scenes with extreme light conditions, which\ndegrade FP detection. On the other hand, high dynamic range (HDR) images\nusually present a greater dynamic range but FP detection algorithms do not take\nadvantage of all the information in such images. In this study, we present a\nsystematic review of image detection and description algorithms that use HDR\nimages as input. We developed a library called CP_HDR that implements the\nHarris corner detector, SIFT detector and descriptor, and two modifications of\nthose algorithms specialized in HDR images, called SIFT for HDR (SfHDR) and\nHarris for HDR (HfHDR). Previous studies investigated the use of HDR images in\nFP detection, but we did not find studies investigating the use of HDR images\nin FP description. Using uniformity, repeatability rate, mean average\nprecision, and matching rate metrics, we compared the performance of the CP_HDR\nalgorithms using LDR and HDR images. We observed an increase in the uniformity\nof the distribution of FPs among the high-light, mid-light, and low-light areas\nof the images. The results show that using HDR images as input to detection\nalgorithms improves performance and that SfHDR and HfHDR enhance FP\ndescription.\n","authors":["Artur Santos Nascimento","Valter Guilherme Silva de Souza","Daniel Oliveira Dantas","Beatriz Trinchão Andrade"],"pdf_url":"https://arxiv.org/pdf/2403.19935v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19080v2","updated":"2024-03-29T02:31:10Z","published":"2024-03-28T01:05:06Z","title":"MMCert: Provable Defense against Adversarial Attacks to Multi-modal\n Models","summary":" Different from a unimodal model whose input is from a single modality, the\ninput (called multi-modal input) of a multi-modal model is from multiple\nmodalities such as image, 3D points, audio, text, etc. Similar to unimodal\nmodels, many existing studies show that a multi-modal model is also vulnerable\nto adversarial perturbation, where an attacker could add small perturbation to\nall modalities of a multi-modal input such that the multi-modal model makes\nincorrect predictions for it. 
Existing certified defenses are mostly designed\nfor unimodal models, which achieve sub-optimal certified robustness guarantees\nwhen extended to multi-modal models as shown in our experimental results. In\nour work, we propose MMCert, the first certified defense against adversarial\nattacks to a multi-modal model. We derive a lower bound on the performance of\nour MMCert under arbitrary adversarial attacks with bounded perturbations to\nboth modalities (e.g., in the context of auto-driving, we bound the number of\nchanged pixels in both RGB image and depth image). We evaluate our MMCert using\ntwo benchmark datasets: one for the multi-modal road segmentation task and the\nother for the multi-modal emotion recognition task. Moreover, we compare our\nMMCert with a state-of-the-art certified defense extended from unimodal models.\nOur experimental results show that our MMCert outperforms the baseline.\n","authors":["Yanting Wang","Hongye Fu","Wei Zou","Jinyuan Jia"],"pdf_url":"https://arxiv.org/pdf/2403.19080v2.pdf","comment":"To appear in CVPR'24"},{"id":"http://arxiv.org/abs/2403.19924v1","updated":"2024-03-29T02:22:54Z","published":"2024-03-29T02:22:54Z","title":"SceneTracker: Long-term Scene Flow Estimation Network","summary":" Considering the complementarity of scene flow estimation in the spatial\ndomain's focusing capability and 3D object tracking in the temporal domain's\ncoherence, this study aims to address a comprehensive new task that can\nsimultaneously capture fine-grained and long-term 3D motion in an online\nmanner: long-term scene flow estimation (LSFE). We introduce SceneTracker, a\nnovel learning-based LSFE network that adopts an iterative approach to\napproximate the optimal trajectory. Besides, it dynamically indexes and\nconstructs appearance and depth correlation features simultaneously and employs\nthe Transformer to explore and utilize long-range connections within and\nbetween trajectories. With detailed experiments, SceneTracker shows superior\ncapabilities in handling 3D spatial occlusion and depth noise interference,\nhighly tailored to the LSFE task's needs. The code for SceneTracker is\navailable at https://github.com/wwsource/SceneTracker.\n","authors":["Bo Wang","Jian Li","Yang Yu","Li Liu","Zhenping Sun","Dewen Hu"],"pdf_url":"https://arxiv.org/pdf/2403.19924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17095v2","updated":"2024-03-29T02:18:40Z","published":"2023-11-28T06:42:58Z","title":"Emergent Open-Vocabulary Semantic Segmentation from Off-the-shelf\n Vision-Language Models","summary":" From image-text pairs, large-scale vision-language models (VLMs) learn to\nimplicitly associate image regions with words, which prove effective for tasks\nlike visual question answering. However, leveraging the learned association for\nopen-vocabulary semantic segmentation remains a challenge. In this paper, we\npropose a simple, yet extremely effective, training-free technique,\nPlug-and-Play Open-Vocabulary Semantic Segmentation (PnP-OVSS) for this task.\nPnP-OVSS leverages a VLM with direct text-to-image cross-attention and an\nimage-text matching loss. To balance between over-segmentation and\nunder-segmentation, we introduce Salience Dropout; by iteratively dropping\npatches that the model is most attentive to, we are able to better resolve the\nentire extent of the segmentation mask. \\shortname{} does not require any\nneural network training and performs hyperparameter tuning without the need for\nany segmentation annotations, even for a validation set. 
PnP-OVSS demonstrates\nsubstantial improvements over comparable baselines (+29.4% mIoU on Pascal VOC,\n+13.2% mIoU on Pascal Context, +14.0% mIoU on MS COCO, and +11.4% mIoU on\nADE-20K.) and even outperforms most baselines that conduct additional network\ntraining on top of pretrained VLMs. Our codebase is at\nhttps://github.com/letitiabanana/PnP-OVSS.\n","authors":["Jiayun Luo","Siddhesh Khandelwal","Leonid Sigal","Boyang Li"],"pdf_url":"https://arxiv.org/pdf/2311.17095v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19920v1","updated":"2024-03-29T02:17:09Z","published":"2024-03-29T02:17:09Z","title":"MI-NeRF: Learning a Single Face NeRF from Multiple Identities","summary":" In this work, we introduce a method that learns a single dynamic neural\nradiance field (NeRF) from monocular talking face videos of multiple\nidentities. NeRFs have shown remarkable results in modeling the 4D dynamics and\nappearance of human faces. However, they require per-identity optimization.\nAlthough recent approaches have proposed techniques to reduce the training and\nrendering time, increasing the number of identities can be expensive. We\nintroduce MI-NeRF (multi-identity NeRF), a single unified network that models\ncomplex non-rigid facial motion for multiple identities, using only monocular\nvideos of arbitrary length. The core premise in our method is to learn the\nnon-linear interactions between identity and non-identity specific information\nwith a multiplicative module. By training on multiple videos simultaneously,\nMI-NeRF not only reduces the total training time compared to standard\nsingle-identity NeRFs, but also demonstrates robustness in synthesizing novel\nexpressions for any input identity. We present results for both facial\nexpression transfer and talking face video synthesis. Our method can be further\npersonalized for a target identity given only a short video.\n","authors":["Aggelina Chatziagapi","Grigorios G. Chrysos","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2403.19920v1.pdf","comment":"Project page: https://aggelinacha.github.io/MI-NeRF/"},{"id":"http://arxiv.org/abs/2403.19919v1","updated":"2024-03-29T02:10:38Z","published":"2024-03-29T02:10:38Z","title":"Diff-Reg v1: Diffusion Matching Model for Registration Problem","summary":" Establishing reliable correspondences is essential for registration tasks\nsuch as 3D and 2D3D registration. Existing methods commonly leverage geometric\nor semantic point features to generate potential correspondences. However,\nthese features may face challenges such as large deformation, scale\ninconsistency, and ambiguous matching problems (e.g., symmetry). Additionally,\nmany previous methods, which rely on single-pass prediction, may struggle with\nlocal minima in complex scenarios. To mitigate these challenges, we introduce a\ndiffusion matching model for robust correspondence construction. Our approach\ntreats correspondence estimation as a denoising diffusion process within the\ndoubly stochastic matrix space, which gradually denoises (refines) a doubly\nstochastic matching matrix to the ground-truth one for high-quality\ncorrespondence estimation. It involves a forward diffusion process that\ngradually introduces Gaussian noise into the ground truth matching matrix and a\nreverse denoising process that iteratively refines the noisy matching matrix.\nIn particular, the feature extraction from the backbone occurs only once during\nthe inference phase. 
Our lightweight denoising module utilizes the same feature\nat each reverse sampling step. Evaluation of our method on both 3D and 2D3D\nregistration tasks confirms its effectiveness.\n","authors":["Qianliang Wu","Haobo Jiang","Lei Luo","Jun Li","Yaqing Ding","Jin Xie","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19919v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2401.00436"},{"id":"http://arxiv.org/abs/2403.19915v1","updated":"2024-03-29T02:03:00Z","published":"2024-03-29T02:03:00Z","title":"Using Images as Covariates: Measuring Curb Appeal with Deep Learning","summary":" This paper details an innovative methodology to integrate image data into\ntraditional econometric models. Motivated by forecasting sales prices for\nresidential real estate, we harness the power of deep learning to add\n\"information\" contained in images as covariates. Specifically, images of homes\nwere categorized and encoded using an ensemble of image classifiers (ResNet-50,\nVGG16, MobileNet, and Inception V3). Unique features presented within each\nimage were further encoded through panoptic segmentation. Forecasts from a\nneural network trained on the encoded data results in improved out-of-sample\npredictive power. We also combine these image-based forecasts with standard\nhedonic real estate property and location characteristics, resulting in a\nunified dataset. We show that image-based forecasts increase the accuracy of\nhedonic forecasts when encoded features are regarded as additional covariates.\nWe also attempt to \"explain\" which covariates the image-based forecasts are\nmost highly correlated with. The study exemplifies the benefits of\ninterdisciplinary methodologies, merging machine learning and econometrics to\nharness untapped data sources for more accurate forecasting.\n","authors":["Ardyn Nordstrom","Morgan Nordstrom","Matthew D. Webb"],"pdf_url":"https://arxiv.org/pdf/2403.19915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19912v1","updated":"2024-03-29T01:46:11Z","published":"2024-03-29T01:46:11Z","title":"Automated Identification and Segmentation of Hi Sources in CRAFTS Using\n Deep Learning Method","summary":" We introduce a machine learning-based method for extracting HI sources from\n3D spectral data, and construct a dedicated dataset of HI sources from CRAFTS.\nOur custom dataset provides comprehensive resources for HI source detection.\nUtilizing the 3D-Unet segmentation architecture, our method reliably identifies\nand segments HI sources, achieving notable performance metrics with recall\nrates reaching 91.6% and accuracy levels at 95.7%. These outcomes substantiate\nthe value of our custom dataset and the efficacy of our proposed network in\nidentifying HI source. Our code is publicly available at\nhttps://github.com/fishszh/HISF.\n","authors":["Zihao Song","Huaxi Chen","Donghui Quan","Di Li","Yinghui Zheng","Shulei Ni","Yunchuan Chen","Yun Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.19912v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2402.13729v3","updated":"2024-03-29T01:42:02Z","published":"2024-02-21T11:46:16Z","title":"Hybrid Video Diffusion Models with 2D Triplane and 3D Wavelet\n Representation","summary":" Generating high-quality videos that synthesize desired realistic content is a\nchallenging task due to their intricate high-dimensionality and complexity of\nvideos. 
Several recent diffusion-based methods have shown comparable\nperformance by compressing videos to a lower-dimensional latent space, using\ntraditional video autoencoder architecture. However, such method that employ\nstandard frame-wise 2D and 3D convolution fail to fully exploit the\nspatio-temporal nature of videos. To address this issue, we propose a novel\nhybrid video diffusion model, called HVDM, which can capture spatio-temporal\ndependencies more effectively. The HVDM is trained by a hybrid video\nautoencoder which extracts a disentangled representation of the video\nincluding: (i) a global context information captured by a 2D projected latent\n(ii) a local volume information captured by 3D convolutions with wavelet\ndecomposition (iii) a frequency information for improving the video\nreconstruction. Based on this disentangled representation, our hybrid\nautoencoder provide a more comprehensive video latent enriching the generated\nvideos with fine structures and details. Experiments on video generation\nbenchamarks (UCF101, SkyTimelapse, and TaiChi) demonstrate that the proposed\napproach achieves state-of-the-art video generation quality, showing a wide\nrange of video applications (e.g., long video generation, image-to-video, and\nvideo dynamics control).\n","authors":["Kihong Kim","Haneol Lee","Jihye Park","Seyeon Kim","Kwanghee Lee","Seungryong Kim","Jaejun Yoo"],"pdf_url":"https://arxiv.org/pdf/2402.13729v3.pdf","comment":"17 pages, 13 figures"},{"id":"http://arxiv.org/abs/2311.15153v4","updated":"2024-03-29T01:18:37Z","published":"2023-11-26T01:05:55Z","title":"Predicting Gradient is Better: Exploring Self-Supervised Learning for\n SAR ATR with a Joint-Embedding Predictive Architecture","summary":" The growing Synthetic Aperture Radar (SAR) data has the potential to build a\nfoundation model through Self-Supervised Learning (SSL) methods, which can\nachieve various SAR Automatic Target Recognition (ATR) tasks with pre-training\nin large-scale unlabeled data and fine-tuning in small labeled samples. SSL\naims to construct supervision signals directly from the data, which minimizes\nthe need for expensive expert annotation and maximizes the use of the expanding\ndata pool for a foundational model. This study investigates an effective SSL\nmethod for SAR ATR, which can pave the way for a foundation model in SAR ATR.\nThe primary obstacles faced in SSL for SAR ATR are the small targets in remote\nsensing and speckle noise in SAR images, corresponding to the SSL approach and\nsignals. To overcome these challenges, we present a novel Joint-Embedding\nPredictive Architecture for SAR ATR (SAR-JEPA), which leverages local masked\npatches to predict the multi-scale SAR gradient representations of unseen\ncontext. The key aspect of SAR-JEPA is integrating SAR domain features to\nensure high-quality self-supervised signals as target features. Besides, we\nemploy local masks and multi-scale features to accommodate the various small\ntargets in remote sensing. By fine-tuning and evaluating our framework on three\ntarget recognition datasets (vehicle, ship, and aircraft) with four other\ndatasets as pre-training, we demonstrate its outperformance over other SSL\nmethods and its effectiveness with increasing SAR data. 
This study showcases\nthe potential of SSL for SAR target recognition across diverse targets, scenes,\nand sensors.\n","authors":["Weijie Li","Yang Wei","Tianpeng Liu","Yuenan Hou","Yuxuan Li","Zhen Liu","Yongxiang Liu","Li Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15153v4.pdf","comment":"Our codes at https://github.com/waterdisappear/SAR-JEPA"},{"id":"http://arxiv.org/abs/2403.19905v1","updated":"2024-03-29T01:11:56Z","published":"2024-03-29T01:11:56Z","title":"Classification of Diabetic Retinopathy using Pre-Trained Deep Learning\n Models","summary":" Diabetic Retinopathy (DR) stands as the leading cause of blindness globally,\nparticularly affecting individuals between the ages of 20 and 70. This paper\npresents a Computer-Aided Diagnosis (CAD) system designed for the automatic\nclassification of retinal images into five distinct classes: Normal, Mild,\nModerate, Severe, and Proliferative Diabetic Retinopathy (PDR). The proposed\nsystem leverages Convolutional Neural Networks (CNNs) employing pre-trained\ndeep learning models. Through the application of fine-tuning techniques, our\nmodel is trained on fundus images of diabetic retinopathy with resolutions of\n350x350x3 and 224x224x3. Experimental results obtained on the Kaggle platform,\nutilizing resources comprising 4 CPUs, 17 GB RAM, and 1 GB Disk, demonstrate\nthe efficacy of our approach. The achieved Area Under the Curve (AUC) values\nfor CNN, MobileNet, VGG-16, InceptionV3, and InceptionResNetV2 models are 0.50,\n0.70, 0.53, 0.63, and 0.69, respectively.\n","authors":["Inas Al-Kamachy","Prof. Dr. Reza Hassanpour","Prof. Roya Choupani"],"pdf_url":"https://arxiv.org/pdf/2403.19905v1.pdf","comment":"3 pages, 1 figure, 1 table"},{"id":"http://arxiv.org/abs/2310.10375v2","updated":"2024-03-29T01:08:12Z","published":"2023-10-16T13:16:09Z","title":"GTA: A Geometry-Aware Attention Mechanism for Multi-View Transformers","summary":" As transformers are equivariant to the permutation of input tokens, encoding\nthe positional information of tokens is necessary for many tasks. However,\nsince existing positional encoding schemes have been initially designed for NLP\ntasks, their suitability for vision tasks, which typically exhibit different\nstructural properties in their data, is questionable. We argue that existing\npositional encoding schemes are suboptimal for 3D vision tasks, as they do not\nrespect their underlying 3D geometric structure. Based on this hypothesis, we\npropose a geometry-aware attention mechanism that encodes the geometric\nstructure of tokens as relative transformation determined by the geometric\nrelationship between queries and key-value pairs. By evaluating on multiple\nnovel view synthesis (NVS) datasets in the sparse wide-baseline multi-view\nsetting, we show that our attention, called Geometric Transform Attention\n(GTA), improves learning efficiency and performance of state-of-the-art\ntransformer-based NVS models without any additional learned parameters and only\nminor computational overhead.\n","authors":["Takeru Miyato","Bernhard Jaeger","Max Welling","Andreas Geiger"],"pdf_url":"https://arxiv.org/pdf/2310.10375v2.pdf","comment":"Published as a conference paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2403.19904v1","updated":"2024-03-29T01:07:20Z","published":"2024-03-29T01:07:20Z","title":"Fully Geometric Panoramic Localization","summary":" We introduce a lightweight and accurate localization method that only\nutilizes the geometry of 2D-3D lines. 
Given a pre-captured 3D map, our approach\nlocalizes a panorama image, taking advantage of the holistic 360 view. The\nsystem mitigates potential privacy breaches or domain discrepancies by avoiding\ntrained or hand-crafted visual descriptors. However, as lines alone can be\nambiguous, we express distinctive yet compact spatial contexts from\nrelationships between lines, namely the dominant directions of parallel lines\nand the intersection between non-parallel lines. The resulting representations\nare efficient in processing time and memory compared to conventional visual\ndescriptor-based methods. Given the groups of dominant line directions and\ntheir intersections, we accelerate the search process to test thousands of pose\ncandidates in less than a millisecond without sacrificing accuracy. We\nempirically show that the proposed 2D-3D matching can localize panoramas for\nchallenging scenes with similar structures, dramatic domain shifts or\nillumination changes. Our fully geometric approach does not involve extensive\nparameter tuning or neural network training, making it a practical algorithm\nthat can be readily deployed in the real world. Project page including the code\nis available through this link: https://82magnolia.github.io/fgpl/.\n","authors":["Junho Kim","Jiwon Jeong","Young Min Kim"],"pdf_url":"https://arxiv.org/pdf/2403.19904v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19902v1","updated":"2024-03-29T01:05:23Z","published":"2024-03-29T01:05:23Z","title":"Heterogeneous Network Based Contrastive Learning Method for PolSAR Land\n Cover Classification","summary":" Polarimetric synthetic aperture radar (PolSAR) image interpretation is widely\nused in various fields. Recently, deep learning has made significant progress\nin PolSAR image classification. Supervised learning (SL) requires a large\namount of labeled PolSAR data with high quality to achieve better performance,\nhowever, manually labeled data is insufficient. This causes the SL to fail into\noverfitting and degrades its generalization performance. Furthermore, the\nscattering confusion problem is also a significant challenge that attracts more\nattention. To solve these problems, this article proposes a Heterogeneous\nNetwork based Contrastive Learning method(HCLNet). It aims to learn high-level\nrepresentation from unlabeled PolSAR data for few-shot classification according\nto multi-features and superpixels. Beyond the conventional CL, HCLNet\nintroduces the heterogeneous architecture for the first time to utilize\nheterogeneous PolSAR features better. And it develops two easy-to-use plugins\nto narrow the domain gap between optics and PolSAR, including feature filter\nand superpixel-based instance discrimination, which the former is used to\nenhance the complementarity of multi-features, and the latter is used to\nincrease the diversity of negative samples. Experiments demonstrate the\nsuperiority of HCLNet on three widely used PolSAR benchmark datasets compared\nwith state-of-the-art methods. Ablation studies also verify the importance of\neach component. 
Besides, this work has implications for how to efficiently\nutilize the multi-features of PolSAR data to learn better high-level\nrepresentation in CL and how to construct networks suitable for PolSAR data\nbetter.\n","authors":["Jianfeng Cai","Yue Ma","Zhixi Feng","Shuyuan Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15585v3","updated":"2024-03-29T00:44:18Z","published":"2024-03-22T19:19:51Z","title":"MedPromptX: Grounded Multimodal Prompting for Chest X-ray Diagnosis","summary":" Chest X-ray images are commonly used for predicting acute and chronic\ncardiopulmonary conditions, but efforts to integrate them with structured\nclinical data face challenges due to incomplete electronic health records\n(EHR). This paper introduces MedPromptX, the first model to integrate\nmultimodal large language models (MLLMs), few-shot prompting (FP) and visual\ngrounding (VG) to combine imagery with EHR data for chest X-ray diagnosis. A\npre-trained MLLM is utilized to complement the missing EHR information,\nproviding a comprehensive understanding of patients' medical history.\nAdditionally, FP reduces the necessity for extensive training of MLLMs while\neffectively tackling the issue of hallucination. Nevertheless, the process of\ndetermining the optimal number of few-shot examples and selecting high-quality\ncandidates can be burdensome, yet it profoundly influences model performance.\nHence, we propose a new technique that dynamically refines few-shot data for\nreal-time adjustment to new patient scenarios. Moreover, VG aids in focusing\nthe model's attention on relevant regions of interest in X-ray images,\nenhancing the identification of abnormalities. We release MedPromptX-VQA, a new\nin-context visual question answering dataset encompassing interleaved image and\nEHR data derived from MIMIC-IV and MIMIC-CXR databases. Results demonstrate the\nSOTA performance of MedPromptX, achieving an 11% improvement in F1-score\ncompared to the baselines. Code and data are available at\nhttps://github.com/BioMedIA-MBZUAI/MedPromptX\n","authors":["Mai A. Shaaban","Adnan Khan","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2403.15585v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19897v1","updated":"2024-03-29T00:36:38Z","published":"2024-03-29T00:36:38Z","title":"Disentangling Racial Phenotypes: Fine-Grained Control of Race-related\n Facial Phenotype Characteristics","summary":" Achieving an effective fine-grained appearance variation over 2D facial\nimages, whilst preserving facial identity, is a challenging task due to the\nhigh complexity and entanglement of common 2D facial feature encoding spaces.\nDespite these challenges, such fine-grained control, by way of disentanglement\nis a crucial enabler for data-driven racial bias mitigation strategies across\nmultiple automated facial analysis tasks, as it allows to analyse, characterise\nand synthesise human facial diversity. In this paper, we propose a novel GAN\nframework to enable fine-grained control over individual race-related phenotype\nattributes of the facial images. Our framework factors the latent (feature)\nspace into elements that correspond to race-related facial phenotype\nrepresentations, thereby separating phenotype aspects (e.g. skin, hair colour,\nnose, eye, mouth shapes), which are notoriously difficult to annotate robustly\nin real-world facial data. 
Concurrently, we also introduce a high quality\naugmented, diverse 2D face image dataset drawn from CelebA-HQ for GAN training.\nUnlike prior work, our framework only relies upon 2D imagery and related\nparameters to achieve state-of-the-art individual control over race-related\nphenotype attributes with improved photo-realistic output.\n","authors":["Seyma Yucer","Amir Atapour Abarghouei","Noura Al Moubayed","Toby P. Breckon"],"pdf_url":"https://arxiv.org/pdf/2403.19897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19896v1","updated":"2024-03-29T00:33:37Z","published":"2024-03-29T00:33:37Z","title":"Nonlinearity Enhanced Adaptive Activation Function","summary":" A simply implemented activation function with even cubic nonlinearity is\nintroduced that increases the accuracy of neural networks without substantial\nadditional computational resources. This is partially enabled through an\napparent tradeoff between convergence and accuracy. The activation function\ngeneralizes the standard RELU function by introducing additional degrees of\nfreedom through optimizable parameters that enable the degree of nonlinearity\nto be adjusted. The associated accuracy enhancement is quantified in the\ncontext of the MNIST digit data set through a comparison with standard\ntechniques.\n","authors":["David Yevick"],"pdf_url":"https://arxiv.org/pdf/2403.19896v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19893v1","updated":"2024-03-29T00:28:26Z","published":"2024-03-29T00:28:26Z","title":"PLoc: A New Evaluation Criterion Based on Physical Location for\n Autonomous Driving Datasets","summary":" Autonomous driving has garnered significant attention as a key research area\nwithin artificial intelligence. In the context of autonomous driving scenarios,\nthe varying physical locations of objects correspond to different levels of\ndanger. However, conventional evaluation criteria for automatic driving object\ndetection often overlook the crucial aspect of an object's physical location,\nleading to evaluation results that may not accurately reflect the genuine\nthreat posed by the object to the autonomous driving vehicle. To enhance the\nsafety of autonomous driving, this paper introduces a novel evaluation\ncriterion based on physical location information, termed PLoc. This criterion\ntranscends the limitations of traditional criteria by acknowledging that the\nphysical location of pedestrians in autonomous driving scenarios can provide\nvaluable safety-related information. Furthermore, this paper presents a newly\nre-annotated dataset (ApolloScape-R) derived from ApolloScape. ApolloScape-R\ninvolves the relabeling of pedestrians based on the significance of their\nphysical location. The dataset is utilized to assess the performance of various\nobject detection models under the proposed PLoc criterion. Experimental results\ndemonstrate that the average accuracy of all object detection models in\nidentifying a person situated in the travel lane of an autonomous vehicle is\nlower than that for a person on a sidewalk. 
The dataset is publicly available\nat https://github.com/lnyrlyed/ApolloScape-R.git\n","authors":["Ruining Yang","Yuqi Peng"],"pdf_url":"https://arxiv.org/pdf/2403.19893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19888v1","updated":"2024-03-29T00:05:13Z","published":"2024-03-29T00:05:13Z","title":"MambaMixer: Efficient Selective State Space Models with Dual Token and\n Channel Selection","summary":" Recent advances in deep learning have mainly relied on Transformers due to\ntheir data dependency and ability to learn at scale. The attention module in\nthese architectures, however, exhibits quadratic time and space in input size,\nlimiting their scalability for long-sequence modeling. Despite recent attempts\nto design efficient and effective architecture backbone for multi-dimensional\ndata, such as images and multivariate time series, existing models are either\ndata independent, or fail to allow inter- and intra-dimension communication.\nRecently, State Space Models (SSMs), and more specifically Selective State\nSpace Models, with efficient hardware-aware implementation, have shown\npromising potential for long sequence modeling. Motivated by the success of\nSSMs, we present MambaMixer, a new architecture with data-dependent weights\nthat uses a dual selection mechanism across tokens and channels, called\nSelective Token and Channel Mixer. MambaMixer connects selective mixers using a\nweighted averaging mechanism, allowing layers to have direct access to early\nfeatures. As a proof of concept, we design Vision MambaMixer (ViM2) and Time\nSeries MambaMixer (TSM2) architectures based on the MambaMixer block and\nexplore their performance in various vision and time series forecasting tasks.\nOur results underline the importance of selective mixing across both tokens and\nchannels. In ImageNet classification, object detection, and semantic\nsegmentation tasks, ViM2 achieves competitive performance with well-established\nvision models and outperforms SSM-based vision models. In time series\nforecasting, TSM2 achieves outstanding performance compared to state-of-the-art\nmethods while demonstrating significantly improved computational cost. These\nresults show that while Transformers, cross-channel attention, and MLPs are\nsufficient for good performance in time series forecasting, neither is\nnecessary.\n","authors":["Ali Behrouz","Michele Santacatterina","Ramin Zabih"],"pdf_url":"https://arxiv.org/pdf/2403.19888v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2404.00191v1","updated":"2024-03-29T23:17:49Z","published":"2024-03-29T23:17:49Z","title":"Optimal Blackjack Strategy Recommender: A Comprehensive Study on\n Computer Vision Integration for Enhanced Gameplay","summary":" This research project investigates the application of several computer vision\ntechniques for playing card detection and recognition in the context of the\npopular casino game, blackjack. The primary objective is to develop a robust\nsystem that is capable of detecting and accurately classifying playing cards in\nreal-time, and displaying the optimal move recommendation based on the given\nimage of the current game. The proposed methodology involves using K-Means for\nimage segmentation, card reprojection and feature extraction, training of the\nKNN classifier using a labeled dataset, and integration of the detection system\ninto a Blackjack Basic Strategy recommendation algorithm. 
Further, the study\naims to observe the effectiveness of this approach in detecting various card\ndesigns under different lighting conditions and occlusions. Overall, the\nproject examines the potential benefits of incorporating computer vision\ntechniques, with a specific focus on card detection, into commonly played games\naiming to enhance player decision-making and optimize strategic outcomes. The\nresults obtained from our experimental evaluations with models developed under\nconsiderable time constraints, highlight the potential for practical\nimplementation in real-world casino environments and across other similarly\nstructured games.\n","authors":["Krishnanshu Gupta","Devon Bolt","Ben Hinchliff"],"pdf_url":"https://arxiv.org/pdf/2404.00191v1.pdf","comment":"24 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.00185v1","updated":"2024-03-29T22:51:45Z","published":"2024-03-29T22:51:45Z","title":"On Inherent Adversarial Robustness of Active Vision Systems","summary":" Current Deep Neural Networks are vulnerable to adversarial examples, which\nalter their predictions by adding carefully crafted noise. Since human eyes are\nrobust to such inputs, it is possible that the vulnerability stems from the\nstandard way of processing inputs in one shot by processing every pixel with\nthe same importance. In contrast, neuroscience suggests that the human vision\nsystem can differentiate salient features by (1) switching between multiple\nfixation points (saccades) and (2) processing the surrounding with a\nnon-uniform external resolution (foveation). In this work, we advocate that the\nintegration of such active vision mechanisms into current deep learning systems\ncan offer robustness benefits. Specifically, we empirically demonstrate the\ninherent robustness of two active vision methods - GFNet and FALcon - under a\nblack box threat model. By learning and inferencing based on downsampled\nglimpses obtained from multiple distinct fixation points within an input, we\nshow that these active methods achieve (2-3) times greater robustness compared\nto a standard passive convolutional network under state-of-the-art adversarial\nattacks. More importantly, we provide illustrative and interpretable\nvisualization analysis that demonstrates how performing inference from distinct\nfixation points makes active vision methods less vulnerable to malicious\ninputs.\n","authors":["Amitangshu Mukherjee","Timur Ibrayev","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2404.00185v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00179v1","updated":"2024-03-29T22:24:12Z","published":"2024-03-29T22:24:12Z","title":"Multi-Region Transfer Learning for Segmentation of Crop Field Boundaries\n in Satellite Images with Limited Labels","summary":" The goal of field boundary delineation is to predict the polygonal boundaries\nand interiors of individual crop fields in overhead remotely sensed images\n(e.g., from satellites or drones). Automatic delineation of field boundaries is\na necessary task for many real-world use cases in agriculture, such as\nestimating cultivated area in a region or predicting end-of-season yield in a\nfield. Field boundary delineation can be framed as an instance segmentation\nproblem, but presents unique research challenges compared to traditional\ncomputer vision datasets used for instance segmentation. 
The practical\napplicability of previous work is also limited by the assumption that a\nsufficiently-large labeled dataset is available where field boundary\ndelineation models will be applied, which is not the reality for most regions\n(especially under-resourced regions such as Sub-Saharan Africa). We present an\napproach for segmentation of crop field boundaries in satellite images in\nregions lacking labeled data that uses multi-region transfer learning to adapt\nmodel weights for the target region. We show that our approach outperforms\nexisting methods and that multi-region transfer learning substantially boosts\nperformance for multiple model architectures. Our implementation and datasets\nare publicly available to enable use of the approach by end-users and serve as\na benchmark for future work.\n","authors":["Hannah Kerner","Saketh Sundar","Mathan Satish"],"pdf_url":"https://arxiv.org/pdf/2404.00179v1.pdf","comment":"Accepted for 2023 AAAI Workshop on AI to Accelerate Science and\n Engineering"},{"id":"http://arxiv.org/abs/2404.00172v1","updated":"2024-03-29T22:03:53Z","published":"2024-03-29T22:03:53Z","title":"Universal Bovine Identification via Depth Data and Deep Metric Learning","summary":" This paper proposes and evaluates, for the first time, a top-down (dorsal\nview), depth-only deep learning system for accurately identifying individual\ncattle and provides associated code, datasets, and training weights for\nimmediate reproducibility. An increase in herd size skews the cow-to-human\nratio at the farm and makes the manual monitoring of individuals more\nchallenging. Therefore, real-time cattle identification is essential for the\nfarms and a crucial step towards precision livestock farming. Underpinned by\nour previous work, this paper introduces a deep-metric learning method for\ncattle identification using depth data from an off-the-shelf 3D camera. The\nmethod relies on CNN and MLP backbones that learn well-generalised embedding\nspaces from the body shape to differentiate individuals -- requiring neither\nspecies-specific coat patterns nor close-up muzzle prints for operation. The\nnetwork embeddings are clustered using a simple algorithm such as $k$-NN for\nhighly accurate identification, thus eliminating the need to retrain the\nnetwork for enrolling new individuals. We evaluate two backbone architectures,\nResNet, as previously used to identify Holstein Friesians using RGB images, and\nPointNet, which is specialised to operate on 3D point clouds. We also present\nCowDepth2023, a new dataset containing 21,490 synchronised colour-depth image\npairs of 99 cows, to evaluate the backbones. Both ResNet and PointNet\narchitectures, which consume depth maps and point clouds, respectively, led to\nhigh accuracy that is on par with the coat pattern-based backbone.\n","authors":["Asheesh Sharma","Lucy Randewich","William Andrew","Sion Hannuna","Neill Campbell","Siobhan Mullan","Andrew W. Dowsey","Melvyn Smith","Mark Hansen","Tilo Burghardt"],"pdf_url":"https://arxiv.org/pdf/2404.00172v1.pdf","comment":"LaTeX, 38 pages, 14 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.00168v1","updated":"2024-03-29T21:52:01Z","published":"2024-03-29T21:52:01Z","title":"Multi-Level Neural Scene Graphs for Dynamic Urban Environments","summary":" We estimate the radiance field of large-scale dynamic areas from multiple\nvehicle captures under varying environmental conditions. 
Previous works in this\ndomain are either restricted to static environments, do not scale to more than\na single short video, or struggle to separately represent dynamic object\ninstances. To this end, we present a novel, decomposable radiance field\napproach for dynamic urban environments. We propose a multi-level neural scene\ngraph representation that scales to thousands of images from dozens of\nsequences with hundreds of fast-moving objects. To enable efficient training\nand rendering of our representation, we develop a fast composite ray sampling\nand rendering scheme. To test our approach in urban driving scenarios, we\nintroduce a new, novel view synthesis benchmark. We show that our approach\noutperforms prior art by a significant margin on both established and our\nproposed benchmark while being faster in training and rendering.\n","authors":["Tobias Fischer","Lorenzo Porzi","Samuel Rota Bulò","Marc Pollefeys","Peter Kontschieder"],"pdf_url":"https://arxiv.org/pdf/2404.00168v1.pdf","comment":"CVPR 2024. Project page is available at\n https://tobiasfshr.github.io/pub/ml-nsg/"},{"id":"http://arxiv.org/abs/2404.00166v1","updated":"2024-03-29T21:45:53Z","published":"2024-03-29T21:45:53Z","title":"Uncovering Bias in Large Vision-Language Models with Counterfactuals","summary":" With the advent of Large Language Models (LLMs) possessing increasingly\nimpressive capabilities, a number of Large Vision-Language Models (LVLMs) have\nbeen proposed to augment LLMs with visual inputs. Such models condition\ngenerated text on both an input image and a text prompt, enabling a variety of\nuse cases such as visual question answering and multimodal chat. While prior\nstudies have examined the social biases contained in text generated by LLMs,\nthis topic has been relatively unexplored in LVLMs. Examining social biases in\nLVLMs is particularly challenging due to the confounding contributions of bias\ninduced by information contained across the text and visual modalities. To\naddress this challenging problem, we conduct a large-scale study of text\ngenerated by different LVLMs under counterfactual changes to input images.\nSpecifically, we present LVLMs with identical open-ended text prompts while\nconditioning on images from different counterfactual sets, where each set\ncontains images which are largely identical in their depiction of a common\nsubject (e.g., a doctor), but vary only in terms of intersectional social\nattributes (e.g., race and gender). We comprehensively evaluate the text\nproduced by different LVLMs under this counterfactual generation setting and\nfind that social attributes such as race, gender, and physical characteristics\ndepicted in input images can significantly influence toxicity and the\ngeneration of competency-associated words.\n","authors":["Phillip Howard","Anahita Bhiwandiwalla","Kathleen C. Fraser","Svetlana Kiritchenko"],"pdf_url":"https://arxiv.org/pdf/2404.00166v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00163v1","updated":"2024-03-29T21:40:12Z","published":"2024-03-29T21:40:12Z","title":"CT respiratory motion synthesis using joint supervised and adversarial\n learning","summary":" Objective: Four-dimensional computed tomography (4DCT) imaging consists in\nreconstructing a CT acquisition into multiple phases to track internal organ\nand tumor motion. It is commonly used in radiotherapy treatment planning to\nestablish planning target volumes. 
However, 4DCT increases protocol complexity,\nmay not align with patient breathing during treatment, and lead to higher\nradiation delivery. Approach: In this study, we propose a deep synthesis method\nto generate pseudo respiratory CT phases from static images for motion-aware\ntreatment planning. The model produces patient-specific deformation vector\nfields (DVFs) by conditioning synthesis on external patient surface-based\nestimation, mimicking respiratory monitoring devices. A key methodological\ncontribution is to encourage DVF realism through supervised DVF training while\nusing an adversarial term jointly not only on the warped image but also on the\nmagnitude of the DVF itself. This way, we avoid excessive smoothness typically\nobtained through deep unsupervised learning, and encourage correlations with\nthe respiratory amplitude. Main results: Performance is evaluated using real\n4DCT acquisitions with smaller tumor volumes than previously reported. Results\ndemonstrate for the first time that the generated pseudo-respiratory CT phases\ncan capture organ and tumor motion with similar accuracy to repeated 4DCT scans\nof the same patient. Mean inter-scans tumor center-of-mass distances and Dice\nsimilarity coefficients were $1.97$mm and $0.63$, respectively, for real 4DCT\nphases and $2.35$mm and $0.71$ for synthetic phases, and compares favorably to\na state-of-the-art technique (RMSim).\n","authors":["Yi-Heng Cao","Vincent Bourbonne","François Lucia","Ulrike Schick","Julien Bert","Vincent Jaouen","Dimitris Visvikis"],"pdf_url":"https://arxiv.org/pdf/2404.00163v1.pdf","comment":"to appear in Phys. Med. Biol"},{"id":"http://arxiv.org/abs/2404.00149v1","updated":"2024-03-29T20:43:55Z","published":"2024-03-29T20:43:55Z","title":"VSRD: Instance-Aware Volumetric Silhouette Rendering for Weakly\n Supervised 3D Object Detection","summary":" Monocular 3D object detection poses a significant challenge in 3D scene\nunderstanding due to its inherently ill-posed nature in monocular depth\nestimation. Existing methods heavily rely on supervised learning using abundant\n3D labels, typically obtained through expensive and labor-intensive annotation\non LiDAR point clouds. To tackle this problem, we propose a novel weakly\nsupervised 3D object detection framework named VSRD (Volumetric Silhouette\nRendering for Detection) to train 3D object detectors without any 3D\nsupervision but only weak 2D supervision. VSRD consists of multi-view 3D\nauto-labeling and subsequent training of monocular 3D object detectors using\nthe pseudo labels generated in the auto-labeling stage. In the auto-labeling\nstage, we represent the surface of each instance as a signed distance field\n(SDF) and render its silhouette as an instance mask through our proposed\ninstance-aware volumetric silhouette rendering. To directly optimize the 3D\nbounding boxes through rendering, we decompose the SDF of each instance into\nthe SDF of a cuboid and the residual distance field (RDF) that represents the\nresidual from the cuboid. This mechanism enables us to optimize the 3D bounding\nboxes in an end-to-end manner by comparing the rendered instance masks with the\nground truth instance masks. The optimized 3D bounding boxes serve as effective\ntraining data for 3D object detection. We conduct extensive experiments on the\nKITTI-360 dataset, demonstrating that our method outperforms the existing\nweakly supervised 3D object detection methods. 
The code is available at\nhttps://github.com/skmhrk1209/VSRD.\n","authors":["Zihua Liu","Hiroki Sakuma","Masatoshi Okutomi"],"pdf_url":"https://arxiv.org/pdf/2404.00149v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00146v1","updated":"2024-03-29T20:39:37Z","published":"2024-03-29T20:39:37Z","title":"Fast OMP for Exact Recovery and Sparse Approximation","summary":" Orthogonal Matching Pursuit (OMP) has been a powerful method in sparse signal\nrecovery and approximation. However OMP suffers computational issue when the\nsignal has large number of non-zeros. This paper advances OMP in two fronts: it\noffers a fast algorithm for the orthogonal projection of the input signal at\neach iteration, and a new selection criterion for making the greedy choice,\nwhich reduces the number of iterations it takes to recover the signal. The\nproposed modifications to OMP directly reduce the computational complexity.\nExperiment results show significant improvement over the classical OMP in\ncomputation time. The paper also provided a sufficient condition for exact\nrecovery under the new greedy choice criterion. For general signals that may\nnot have sparse representations, the paper provides a bound for the\napproximation error. The approximation error is at the same order as OMP but is\nobtained within fewer iterations and less time.\n","authors":["Huiyuan Yu","Jia He","Maggie Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.00146v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00144v1","updated":"2024-03-29T20:32:30Z","published":"2024-03-29T20:32:30Z","title":"An Interpretable Cross-Attentive Multi-modal MRI Fusion Framework for\n Schizophrenia Diagnosis","summary":" Both functional and structural magnetic resonance imaging (fMRI and sMRI) are\nwidely used for the diagnosis of mental disorder. However, combining\ncomplementary information from these two modalities is challenging due to their\nheterogeneity. Many existing methods fall short of capturing the interaction\nbetween these modalities, frequently defaulting to a simple combination of\nlatent features. In this paper, we propose a novel Cross-Attentive Multi-modal\nFusion framework (CAMF), which aims to capture both intra-modal and inter-modal\nrelationships between fMRI and sMRI, enhancing multi-modal data representation.\nSpecifically, our CAMF framework employs self-attention modules to identify\ninteractions within each modality while cross-attention modules identify\ninteractions between modalities. Subsequently, our approach optimizes the\nintegration of latent features from both modalities. This approach\nsignificantly improves classification accuracy, as demonstrated by our\nevaluations on two extensive multi-modal brain imaging datasets, where CAMF\nconsistently outperforms existing methods. Furthermore, the gradient-guided\nScore-CAM is applied to interpret critical functional networks and brain\nregions involved in schizophrenia. 
The bio-markers identified by CAMF align\nwith established research, potentially offering new insights into the diagnosis\nand pathological endophenotypes of schizophrenia.\n","authors":["Ziyu Zhou","Anton Orlichenko","Gang Qu","Zening Fu","Vince D Calhoun","Zhengming Ding","Yu-Ping Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00132v1","updated":"2024-03-29T19:58:13Z","published":"2024-03-29T19:58:13Z","title":"FetalDiffusion: Pose-Controllable 3D Fetal MRI Synthesis with\n Conditional Diffusion Model","summary":" The quality of fetal MRI is significantly affected by unpredictable and\nsubstantial fetal motion, leading to the introduction of artifacts even when\nfast acquisition sequences are employed. The development of 3D real-time fetal\npose estimation approaches on volumetric EPI fetal MRI opens up a promising\navenue for fetal motion monitoring and prediction. Challenges arise in fetal\npose estimation due to limited number of real scanned fetal MR training images,\nhindering model generalization when the acquired fetal MRI lacks adequate pose.\n In this study, we introduce FetalDiffusion, a novel approach utilizing a\nconditional diffusion model to generate 3D synthetic fetal MRI with\ncontrollable pose. Additionally, an auxiliary pose-level loss is adopted to\nenhance model performance. Our work demonstrates the success of this proposed\nmodel by producing high-quality synthetic fetal MRI images with accurate and\nrecognizable fetal poses, comparing favorably with in-vivo real fetal MRI.\nFurthermore, we show that the integration of synthetic fetal MR images enhances\nthe fetal pose estimation model's performance, particularly when the number of\navailable real scanned data is limited resulting in 15.4% increase in PCK and\n50.2% reduced in mean error. All experiments are done on a single 32GB V100\nGPU. Our method holds promise for improving real-time tracking models, thereby\naddressing fetal motion issues more effectively.\n","authors":["Molin Zhang","Polina Golland","Patricia Ellen Grant","Elfar Adalsteinsson"],"pdf_url":"https://arxiv.org/pdf/2404.00132v1.pdf","comment":"8 pages, 3 figures, 2 tables, submitted to MICCAI 2024, code\n available if accepted"},{"id":"http://arxiv.org/abs/2404.00130v1","updated":"2024-03-29T19:51:34Z","published":"2024-03-29T19:51:34Z","title":"FISBe: A real-world benchmark dataset for instance segmentation of\n long-range thin filamentous structures","summary":" Instance segmentation of neurons in volumetric light microscopy images of\nnervous systems enables groundbreaking research in neuroscience by facilitating\njoint functional and morphological analyses of neural circuits at cellular\nresolution. Yet said multi-neuron light microscopy data exhibits extremely\nchallenging properties for the task of instance segmentation: Individual\nneurons have long-ranging, thin filamentous and widely branching morphologies,\nmultiple neurons are tightly inter-weaved, and partial volume effects, uneven\nillumination and noise inherent to light microscopy severely impede local\ndisentangling as well as long-range tracing of individual neurons. These\nproperties reflect a current key challenge in machine learning research, namely\nto effectively capture long-range dependencies in the data. While respective\nmethodological research is buzzing, to date methods are typically benchmarked\non synthetic datasets. 
To address this gap, we release the FlyLight Instance\nSegmentation Benchmark (FISBe) dataset, the first publicly available\nmulti-neuron light microscopy dataset with pixel-wise annotations. In addition,\nwe define a set of instance segmentation metrics for benchmarking that we\ndesigned to be meaningful with regard to downstream analyses. Lastly, we\nprovide three baselines to kick off a competition that we envision to both\nadvance the field of machine learning regarding methodology for capturing\nlong-range data dependencies, and facilitate scientific discovery in basic\nneuroscience.\n","authors":["Lisa Mais","Peter Hirsch","Claire Managan","Ramya Kandarpa","Josef Lorenz Rumberger","Annika Reinke","Lena Maier-Hein","Gudrun Ihrke","Dagmar Kainmueller"],"pdf_url":"https://arxiv.org/pdf/2404.00130v1.pdf","comment":"CVPR2024, Project page: https://kainmueller-lab.github.io/fisbe"},{"id":"http://arxiv.org/abs/2404.00122v1","updated":"2024-03-29T19:25:09Z","published":"2024-03-29T19:25:09Z","title":"AgileFormer: Spatially Agile Transformer UNet for Medical Image\n Segmentation","summary":" In the past decades, deep neural networks, particularly convolutional neural\nnetworks, have achieved state-of-the-art performance in a variety of medical\nimage segmentation tasks. Recently, the introduction of the vision transformer\n(ViT) has significantly altered the landscape of deep segmentation models.\nThere has been a growing focus on ViTs, driven by their excellent performance\nand scalability. However, we argue that the current design of the vision\ntransformer-based UNet (ViT-UNet) segmentation models may not effectively\nhandle the heterogeneous appearance (e.g., varying shapes and sizes) of objects\nof interest in medical image segmentation tasks. To tackle this challenge, we\npresent a structured approach to introduce spatially dynamic components to the\nViT-UNet. This adaptation enables the model to effectively capture features of\ntarget objects with diverse appearances. This is achieved by three main\ncomponents: \\textbf{(i)} deformable patch embedding; \\textbf{(ii)} spatially\ndynamic multi-head attention; \\textbf{(iii)} deformable positional encoding.\nThese components were integrated into a novel architecture, termed AgileFormer.\nAgileFormer is a spatially agile ViT-UNet designed for medical image\nsegmentation. Experiments in three segmentation tasks using publicly available\ndatasets demonstrated the effectiveness of the proposed method. The code is\navailable at\n\\href{https://github.com/sotiraslab/AgileFormer}{https://github.com/sotiraslab/AgileFormer}.\n","authors":["Peijie Qiu","Jin Yang","Sayantan Kumar","Soumyendu Sekhar Ghosh","Aristeidis Sotiras"],"pdf_url":"https://arxiv.org/pdf/2404.00122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00114v1","updated":"2024-03-29T19:09:08Z","published":"2024-03-29T19:09:08Z","title":"Deepfake Sentry: Harnessing Ensemble Intelligence for Resilient\n Detection and Generalisation","summary":" Recent advancements in Generative Adversarial Networks (GANs) have enabled\nphotorealistic image generation with high quality. However, the malicious use\nof such generated media has raised concerns regarding visual misinformation.\nAlthough deepfake detection research has demonstrated high accuracy, it is\nvulnerable to advances in generation techniques and adversarial iterations on\ndetection countermeasures. 
To address this, we propose a proactive and\nsustainable deepfake training augmentation solution that introduces artificial\nfingerprints into models. We achieve this by employing an ensemble learning\napproach that incorporates a pool of autoencoders that mimic the effect of the\nartefacts introduced by the deepfake generator models. Experiments on three\ndatasets reveal that our proposed ensemble autoencoder-based data augmentation\nlearning approach offers improvements in terms of generalisation, resistance\nagainst basic data perturbations such as noise, blurring, sharpness\nenhancement, and affine transforms, resilience to commonly used lossy\ncompression algorithms such as JPEG, and enhanced resistance against\nadversarial attacks.\n","authors":["Liviu-Daniel Ştefan","Dan-Cristian Stanciu","Mihai Dogariu","Mihai Gabriel Constantin","Andrei Cosmin Jitaru","Bogdan Ionescu"],"pdf_url":"https://arxiv.org/pdf/2404.00114v1.pdf","comment":"16 pages, 1 figure, U.P.B. Sci. Bull., Series C, Vol. 85, Iss. 4,\n 2023"},{"id":"http://arxiv.org/abs/2202.03583v4","updated":"2024-03-29T18:57:25Z","published":"2022-02-08T00:43:57Z","title":"Multi-Label Classification of Thoracic Diseases using Dense\n Convolutional Network on Chest Radiographs","summary":" Traditional methods of identifying pathologies in X-ray images rely heavily\non skilled human interpretation and are often time-consuming. The advent of\ndeep learning techniques has enabled the development of automated disease\ndiagnosis systems. Still, the performance of such systems is opaque to\nend-users and limited to detecting a single pathology. In this paper, we\npropose a multi-label disease prediction model that allows the detection of\nmore than one pathology at a given test time. We use a dense convolutional\nneural network (DenseNet) for disease diagnosis. Our proposed model achieved\nthe highest AUC score of 0.896 for the condition Cardiomegaly with an accuracy\nof 0.826, while the lowest AUC score was obtained for Nodule, at 0.655 with an\naccuracy of 0.66. To build trust in decision-making, we generated heatmaps on\nX-rays to visualize the regions where the model paid attention to make certain\npredictions. Our proposed automated disease prediction model obtained highly\nconfident high-performance metrics in multi-label disease prediction tasks.\n","authors":["Dipkamal Bhusal","Sanjeeb Prasad Panday"],"pdf_url":"https://arxiv.org/pdf/2202.03583v4.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2404.00107v1","updated":"2024-03-29T18:38:59Z","published":"2024-03-29T18:38:59Z","title":"Robust Ensemble Person Re-Identification via Orthogonal Fusion with\n Occlusion Handling","summary":" Occlusion remains one of the major challenges in person reidentification\n(ReID) as a result of the diversity of poses and the variation of appearances.\nDeveloping novel architectures to improve the robustness of occlusion-aware\nperson Re-ID requires new insights, especially on low-resolution edge cameras.\nWe propose a deep ensemble model that harnesses both CNN and Transformer\narchitectures to generate robust feature representations. To achieve robust\nRe-ID without the need to manually label occluded regions, we propose to take\nan ensemble learning-based approach derived from the analogy between\narbitrarily shaped occluded regions and robust feature representation. Using\nthe orthogonality principle, our developed deep CNN model makes use of masked\nautoencoder (MAE) and global-local feature fusion for robust person\nidentification. 
Furthermore, we present a part occlusion-aware transformer\ncapable of learning feature space that is robust to occluded regions.\nExperimental results are reported on several Re-ID datasets to show the\neffectiveness of our developed ensemble model named orthogonal fusion with\nocclusion handling (OFOH). Compared to competing methods, the proposed OFOH\napproach has achieved competent rank-1 and mAP performance.\n","authors":["Syeda Nyma Ferdous","Xin Li"],"pdf_url":"https://arxiv.org/pdf/2404.00107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00103v1","updated":"2024-03-29T18:23:34Z","published":"2024-03-29T18:23:34Z","title":"PikeLPN: Mitigating Overlooked Inefficiencies of Low-Precision Neural\n Networks","summary":" Low-precision quantization is recognized for its efficacy in neural network\noptimization. Our analysis reveals that non-quantized elementwise operations\nwhich are prevalent in layers such as parameterized activation functions, batch\nnormalization, and quantization scaling dominate the inference cost of\nlow-precision models. These non-quantized elementwise operations are commonly\noverlooked in SOTA efficiency metrics such as Arithmetic Computation Effort\n(ACE). In this paper, we propose ACEv2 - an extended version of ACE which\noffers a better alignment with the inference cost of quantized models and their\nenergy consumption on ML hardware. Moreover, we introduce PikeLPN, a model that\naddresses these efficiency issues by applying quantization to both elementwise\noperations and multiply-accumulate operations. In particular, we present a\nnovel quantization technique for batch normalization layers named QuantNorm\nwhich allows for quantizing the batch normalization parameters without\ncompromising the model performance. Additionally, we propose applying Double\nQuantization where the quantization scaling parameters are quantized.\nFurthermore, we recognize and resolve the issue of distribution mismatch in\nSeparable Convolution layers by introducing Distribution-Heterogeneous\nQuantization which enables quantizing them to low-precision. PikeLPN achieves\nPareto-optimality in efficiency-accuracy trade-off with up to 3X efficiency\nimprovement compared to SOTA low-precision models.\n","authors":["Marina Neseem","Conor McCullough","Randy Hsin","Chas Leichner","Shan Li","In Suk Chong","Andrew G. Howard","Lukasz Lew","Sherief Reda","Ville-Mikko Rautio","Daniele Moro"],"pdf_url":"https://arxiv.org/pdf/2404.00103v1.pdf","comment":"Accepted in CVPR 2024. 10 Figures, 9 Tables"},{"id":"http://arxiv.org/abs/2404.00098v1","updated":"2024-03-29T18:09:11Z","published":"2024-03-29T18:09:11Z","title":"Sparse Views, Near Light: A Practical Paradigm for Uncalibrated\n Point-light Photometric Stereo","summary":" Neural approaches have shown a significant progress on camera-based\nreconstruction. But they require either a fairly dense sampling of the viewing\nsphere, or pre-training on an existing dataset, thereby limiting their\ngeneralizability. In contrast, photometric stereo (PS) approaches have shown\ngreat potential for achieving high-quality reconstruction under sparse\nviewpoints. Yet, they are impractical because they typically require tedious\nlaboratory conditions, are restricted to dark rooms, and often multi-staged,\nmaking them subject to accumulated errors. To address these shortcomings, we\npropose an end-to-end uncalibrated multi-view PS framework for reconstructing\nhigh-resolution shapes acquired from sparse viewpoints in a real-world\nenvironment. 
We relax the dark room assumption, and allow a combination of\nstatic ambient lighting and dynamic near LED lighting, thereby enabling easy\ndata capture outside the lab. Experimental validation confirms that it\noutperforms existing baseline approaches in the regime of sparse viewpoints by\na large margin. This allows to bring high-accuracy 3D reconstruction from the\ndark room to the real world, while maintaining a reasonable data capture\ncomplexity.\n","authors":["Mohammed Brahimi","Bjoern Haefner","Zhenzhang Ye","Bastian Goldluecke","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2404.00098v1.pdf","comment":"Accepted in CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00086v1","updated":"2024-03-29T17:58:50Z","published":"2024-03-29T17:58:50Z","title":"DVIS-DAQ: Improving Video Segmentation via Dynamic Anchor Queries","summary":" Modern video segmentation methods adopt object queries to perform inter-frame\nassociation and demonstrate satisfactory performance in tracking continuously\nappearing objects despite large-scale motion and transient occlusion.\n However, they all underperform on newly emerging and disappearing objects\nthat are common in the real world because they attempt to model object\nemergence and disappearance through feature transitions between background and\nforeground queries that have significant feature gaps. We introduce Dynamic\nAnchor Queries (DAQ) to shorten the transition gap between the anchor and\ntarget queries by dynamically generating anchor queries based on the features\nof potential candidates.\n Furthermore, we introduce a query-level object Emergence and Disappearance\nSimulation (EDS) strategy, which unleashes DAQ's potential without any\nadditional cost.\n Finally, we combine our proposed DAQ and EDS with DVIS~\\cite{zhang2023dvis}\nto obtain DVIS-DAQ.\n Extensive experiments demonstrate that DVIS-DAQ achieves a new\nstate-of-the-art (SOTA) performance on five mainstream video segmentation\nbenchmarks. Code and models are available at\n\\url{https://github.com/SkyworkAI/DAQ-VS}.\n","authors":["Yikang Zhou","Tao Zhang","Shunping JI","Shuicheng Yan","Xiangtai Li"],"pdf_url":"https://arxiv.org/pdf/2404.00086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01330v1","updated":"2024-03-29T15:27:28Z","published":"2024-03-29T15:27:28Z","title":"Holo-VQVAE: VQ-VAE for phase-only holograms","summary":" Holography stands at the forefront of visual technology innovation, offering\nimmersive, three-dimensional visualizations through the manipulation of light\nwave amplitude and phase. Contemporary research in hologram generation has\npredominantly focused on image-to-hologram conversion, producing holograms from\nexisting images. These approaches, while effective, inherently limit the scope\nof innovation and creativity in hologram generation. In response to this\nlimitation, we present Holo-VQVAE, a novel generative framework tailored for\nphase-only holograms (POHs). Holo-VQVAE leverages the architecture of Vector\nQuantized Variational AutoEncoders, enabling it to learn the complex\ndistributions of POHs. Furthermore, it integrates the Angular Spectrum Method\ninto the training process, facilitating learning in the image domain. This\nframework allows for the generation of unseen, diverse holographic content\ndirectly from its intricately learned latent space without requiring\npre-existing images. 
This pioneering work paves the way for groundbreaking\napplications and methodologies in holographic content creation, opening a new\nera in the exploration of holographic content.\n","authors":["Joohyun Park","Hyeongyeop Kang"],"pdf_url":"https://arxiv.org/pdf/2404.01330v1.pdf","comment":null}]},"2024-04-01T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.17701v2","updated":"2024-04-01T02:31:10Z","published":"2024-03-26T13:40:18Z","title":"Rotate to Scan: UNet-like Mamba with Triplet SSM Module for Medical\n Image Segmentation","summary":" Image segmentation holds a vital position in the realms of diagnosis and\ntreatment within the medical domain. Traditional convolutional neural networks\n(CNNs) and Transformer models have made significant advancements in this realm,\nbut they still encounter challenges because of limited receptive field or high\ncomputing complexity. Recently, State Space Models (SSMs), particularly Mamba\nand its variants, have demonstrated notable performance in the field of vision.\nHowever, their feature extraction methods may not be sufficiently effective and\nretain some redundant structures, leaving room for parameter reduction.\nMotivated by previous spatial and channel attention methods, we propose Triplet\nMamba-UNet. The method leverages residual VSS Blocks to extract intensive\ncontextual features, while Triplet SSM is employed to fuse features across\nspatial and channel dimensions. We conducted experiments on ISIC17, ISIC18,\nCVC-300, CVC-ClinicDB, Kvasir-SEG, CVC-ColonDB, and Kvasir-Instrument datasets,\ndemonstrating the superior segmentation performance of our proposed TM-UNet.\nAdditionally, compared to the previous VM-UNet, our model achieves a one-third\nreduction in parameters.\n","authors":["Hao Tang","Lianglun Cheng","Guoheng Huang","Zhengguang Tan","Junhao Lu","Kaihong Wu"],"pdf_url":"https://arxiv.org/pdf/2403.17701v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18933v2","updated":"2024-04-01T02:06:07Z","published":"2024-02-29T08:01:31Z","title":"Modality-Agnostic Structural Image Representation Learning for\n Deformable Multi-Modality Medical Image Registration","summary":" Establishing dense anatomical correspondence across distinct imaging\nmodalities is a foundational yet challenging procedure for numerous medical\nimage analysis studies and image-guided radiotherapy. Existing multi-modality\nimage registration algorithms rely on statistical-based similarity measures or\nlocal structural image representations. However, the former is sensitive to\nlocally varying noise, while the latter is not discriminative enough to cope\nwith complex anatomical structures in multimodal scans, causing ambiguity in\ndetermining the anatomical correspondence across scans with different\nmodalities. In this paper, we propose a modality-agnostic structural\nrepresentation learning method, which leverages Deep Neighbourhood\nSelf-similarity (DNS) and anatomy-aware contrastive learning to learn\ndiscriminative and contrast-invariance deep structural image representations\n(DSIR) without the need for anatomical delineations or pre-aligned training\nimages. We evaluate our method on multiphase CT, abdomen MR-CT, and brain MR\nT1w-T2w registration. Comprehensive results demonstrate that our method is\nsuperior to the conventional local structural representation and\nstatistical-based similarity measures in terms of discriminability and\naccuracy.\n","authors":["Tony C. W. 
Mok","Zi Li","Yunhao Bai","Jianpeng Zhang","Wei Liu","Yan-Jie Zhou","Ke Yan","Dakai Jin","Yu Shi","Xiaoli Yin","Le Lu","Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.18933v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.19898v2","updated":"2024-04-01T01:27:14Z","published":"2024-03-29T00:40:12Z","title":"Structure Matters: Tackling the Semantic Discrepancy in Diffusion Models\n for Image Inpainting","summary":" Denoising diffusion probabilistic models for image inpainting aim to add the\nnoise to the texture of image during the forward process and recover masked\nregions with unmasked ones of the texture via the reverse denoising\nprocess.Despite the meaningful semantics generation,the existing arts suffer\nfrom the semantic discrepancy between masked and unmasked regions, since the\nsemantically dense unmasked texture fails to be completely degraded while the\nmasked regions turn to the pure noise in diffusion process,leading to the large\ndiscrepancy between them. In this paper,we aim to answer how unmasked semantics\nguide texture denoising process;together with how to tackle the semantic\ndiscrepancy,to facilitate the consistent and meaningful semantics generation.\nTo this end,we propose a novel structure-guided diffusion model named\nStrDiffusion,to reformulate the conventional texture denoising process under\nstructure guidance to derive a simplified denoising objective for image\ninpainting,while revealing:1)the semantically sparse structure is beneficial to\ntackle semantic discrepancy in early stage, while dense texture generates\nreasonable semantics in late stage;2)the semantics from unmasked regions\nessentially offer the time-dependent structure guidance for the texture\ndenoising process,benefiting from the time-dependent sparsity of the structure\nsemantics.For the denoising process,a structure-guided neural network is\ntrained to estimate the simplified denoising objective by exploiting the\nconsistency of the denoised structure between masked and unmasked\nregions.Besides,we devise an adaptive resampling strategy as a formal criterion\nas whether structure is competent to guide the texture denoising process,while\nregulate their semantic correlations.Extensive experiments validate the merits\nof StrDiffusion over the state-of-the-arts.Our code is available at\nhttps://github.com/htyjers/StrDiffusion.\n","authors":["Haipeng Liu","Yang Wang","Biao Qian","Meng Wang","Yong Rui"],"pdf_url":"https://arxiv.org/pdf/2403.19898v2.pdf","comment":"15 pages, 10 figures, to appear CVPR 2024"},{"id":"http://arxiv.org/abs/2401.13964v3","updated":"2024-04-01T01:26:12Z","published":"2024-01-25T05:55:03Z","title":"An Extensible Framework for Open Heterogeneous Collaborative Perception","summary":" Collaborative perception aims to mitigate the limitations of single-agent\nperception, such as occlusions, by facilitating data exchange among multiple\nagents. However, most current works consider a homogeneous scenario where all\nagents use identity sensors and perception models. In reality, heterogeneous\nagent types may continually emerge and inevitably face a domain gap when\ncollaborating with existing agents. In this paper, we introduce a new open\nheterogeneous problem: how to accommodate continually emerging new\nheterogeneous agent types into collaborative perception, while ensuring high\nperception performance and low integration cost? 
To address this problem, we\npropose HEterogeneous ALliance (HEAL), a novel extensible collaborative\nperception framework. HEAL first establishes a unified feature space with\ninitial agents via a novel multi-scale foreground-aware Pyramid Fusion network.\nWhen heterogeneous new agents emerge with previously unseen modalities or\nmodels, we align them to the established unified space with an innovative\nbackward alignment. This step only involves individual training on the new\nagent type, thus presenting extremely low training costs and high\nextensibility. To enrich agents' data heterogeneity, we bring OPV2V-H, a new\nlarge-scale dataset with more diverse sensor types. Extensive experiments on\nOPV2V-H and DAIR-V2X datasets show that HEAL surpasses SOTA methods in\nperformance while reducing the training parameters by 91.5% when integrating 3\nnew agent types. We further implement a comprehensive codebase at:\nhttps://github.com/yifanlu0227/HEAL\n","authors":["Yifan Lu","Yue Hu","Yiqi Zhong","Dequan Wang","Yanfeng Wang","Siheng Chen"],"pdf_url":"https://arxiv.org/pdf/2401.13964v3.pdf","comment":"Accepted by ICLR 2024. The code and data are open-sourced at\n https://github.com/yifanlu0227/HEAL"},{"id":"http://arxiv.org/abs/2403.12686v2","updated":"2024-04-01T01:23:16Z","published":"2024-03-19T12:45:18Z","title":"WaterVG: Waterway Visual Grounding based on Text-Guided Vision and\n mmWave Radar","summary":" The perception of waterways based on human intent is significant for\nautonomous navigation and operations of Unmanned Surface Vehicles (USVs) in\nwater environments. Inspired by visual grounding, we introduce WaterVG, the\nfirst visual grounding dataset designed for USV-based waterway perception based\non human prompts. WaterVG encompasses prompts describing multiple targets, with\nannotations at the instance level including bounding boxes and masks. Notably,\nWaterVG includes 11,568 samples with 34,987 referred targets, whose prompts\nintegrates both visual and radar characteristics. The pattern of text-guided\ntwo sensors equips a finer granularity of text prompts with visual and radar\nfeatures of referred targets. Moreover, we propose a low-power visual grounding\nmodel, Potamoi, which is a multi-task model with a well-designed Phased\nHeterogeneous Modality Fusion (PHMF) mode, including Adaptive Radar Weighting\n(ARW) and Multi-Head Slim Cross Attention (MHSCA). Exactly, ARW extracts\nrequired radar features to fuse with vision for prompt alignment. MHSCA is an\nefficient fusion module with a remarkably small parameter count and FLOPs,\nelegantly fusing scenario context captured by two sensors with linguistic\nfeatures, which performs expressively on visual grounding tasks. Comprehensive\nexperiments and evaluations have been conducted on WaterVG, where our Potamoi\narchives state-of-the-art performances compared with counterparts.\n","authors":["Runwei Guan","Liye Jia","Fengyufan Yang","Shanliang Yao","Erick Purwanto","Xiaohui Zhu","Eng Gee Lim","Jeremy Smith","Ka Lok Man","Xuming Hu","Yutao Yue"],"pdf_url":"https://arxiv.org/pdf/2403.12686v2.pdf","comment":"10 pages, 10 figures"},{"id":"http://arxiv.org/abs/2312.15130v2","updated":"2024-04-01T00:22:18Z","published":"2023-12-23T01:38:41Z","title":"PACE: A Large-Scale Dataset with Pose Annotations in Cluttered\n Environments","summary":" Pose estimation is a crucial task in computer vision and robotics, enabling\nthe tracking and manipulation of objects in images or videos. 
While several\ndatasets exist for pose estimation, there is a lack of large-scale datasets\nspecifically focusing on cluttered scenes with occlusions. We introduce PACE\n(Pose Annotations in Cluttered Environments), a large-scale benchmark designed\nto advance the development and evaluation of pose estimation methods in\ncluttered scenarios. PACE consists of 54,945 frames with 257,673 annotations\nacross 300 videos, covering 576 objects from 44 categories and featuring a mix\nof rigid and articulated items in cluttered scenes. To annotate the real-world\ndata efficiently, we developed an innovative annotation system utilizing a\ncalibrated 3-camera setup. We test state-of-the-art algorithms in PACE along\ntwo tracks: pose estimation, and object pose tracking, revealing the\nbenchmark's challenges and research opportunities. Our code and data is\navailable on https://github.com/qq456cvb/PACE.\n","authors":["Yang You","Kai Xiong","Zhening Yang","Zhengxiang Huang","Junwei Zhou","Ruoxi Shi","Zhou Fang","Adam W. Harley","Leonidas Guibas","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2312.15130v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20271v2","updated":"2024-04-01T03:25:30Z","published":"2024-03-29T16:26:20Z","title":"Draw-and-Understand: Leveraging Visual Prompts to Enable MLLMs to\n Comprehend What You Want","summary":" The interaction between humans and artificial intelligence (AI) is a crucial\nfactor that reflects the effectiveness of multimodal large language models\n(MLLMs). However, current MLLMs primarily focus on image-level comprehension\nand limit interaction to textual instructions, thereby constraining their\nflexibility in usage and depth of response. In this paper, we introduce the\nDraw-and-Understand project: a new model, a multi-domain dataset, and a\nchallenging benchmark for visual prompting. Specifically, we propose SPHINX-V,\na new end-to-end trained Multimodal Large Language Model (MLLM) that connects a\nvision encoder, a visual prompt encoder and an LLM for various visual prompts\n(points, bounding boxes, and free-form shape) and language understanding. To\nadvance visual prompting research for MLLMs, we introduce MDVP-Data and\nMDVP-Bench. MDVP-Data features a multi-domain dataset containing 1.6M unique\nimage-visual prompt-text instruction-following samples, including natural\nimages, document images, OCR images, mobile screenshots, web screenshots, and\nmulti-panel images. Furthermore, we present MDVP-Bench, a comprehensive and\nchallenging benchmark to assess a model's capability in understanding visual\nprompting instructions. Our experiments demonstrate SPHINX-V's impressive\nmultimodal interaction capabilities through visual prompting, revealing\nsignificant improvements in detailed pixel-level description and\nquestion-answering abilities.\n","authors":["Weifeng Lin","Xinyu Wei","Ruichuan An","Peng Gao","Bocheng Zou","Yulin Luo","Siyuan Huang","Shanghang Zhang","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2403.20271v2.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2403.19435v3","updated":"2024-04-01T13:02:20Z","published":"2024-03-28T14:04:17Z","title":"BAMM: Bidirectional Autoregressive Motion Model","summary":" Generating human motion from text has been dominated by denoising motion\nmodels either through diffusion or generative masking process. However, these\nmodels face great limitations in usability by requiring prior knowledge of the\nmotion length. 
Conversely, autoregressive motion models address this limitation\nby adaptively predicting motion endpoints, at the cost of degraded generation\nquality and editing capabilities. To address these challenges, we propose\nBidirectional Autoregressive Motion Model (BAMM), a novel text-to-motion\ngeneration framework. BAMM consists of two key components: (1) a motion\ntokenizer that transforms 3D human motion into discrete tokens in latent space,\nand (2) a masked self-attention transformer that autoregressively predicts\nrandomly masked tokens via a hybrid attention masking strategy. By unifying\ngenerative masked modeling and autoregressive modeling, BAMM captures rich and\nbidirectional dependencies among motion tokens, while learning the\nprobabilistic mapping from textual inputs to motion outputs with\ndynamically-adjusted motion sequence length. This feature enables BAMM to\nsimultaneously achieving high-quality motion generation with enhanced usability\nand built-in motion editability. Extensive experiments on HumanML3D and KIT-ML\ndatasets demonstrate that BAMM surpasses current state-of-the-art methods in\nboth qualitative and quantitative measures. Our project page is available at\nhttps://exitudio.github.io/BAMM-page\n","authors":["Ekkasit Pinyoanuntapong","Muhammad Usama Saleem","Pu Wang","Minwoo Lee","Srijan Das","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2403.19435v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19926v2","updated":"2024-04-01T08:52:20Z","published":"2024-03-29T02:26:22Z","title":"Video-Based Human Pose Regression via Decoupled Space-Time Aggregation","summary":" By leveraging temporal dependency in video sequences, multi-frame human pose\nestimation algorithms have demonstrated remarkable results in complicated\nsituations, such as occlusion, motion blur, and video defocus. These algorithms\nare predominantly based on heatmaps, resulting in high computation and storage\nrequirements per frame, which limits their flexibility and real-time\napplication in video scenarios, particularly on edge devices. In this paper, we\ndevelop an efficient and effective video-based human pose regression method,\nwhich bypasses intermediate representations such as heatmaps and instead\ndirectly maps the input to the output joint coordinates. Despite the inherent\nspatial correlation among adjacent joints of the human pose, the temporal\ntrajectory of each individual joint exhibits relative independence. In light of\nthis, we propose a novel Decoupled Space-Time Aggregation network (DSTA) to\nseparately capture the spatial contexts between adjacent joints and the\ntemporal cues of each individual joint, thereby avoiding the conflation of\nspatiotemporal dimensions. Concretely, DSTA learns a dedicated feature token\nfor each joint to facilitate the modeling of their spatiotemporal dependencies.\nWith the proposed joint-wise local-awareness attention mechanism, our method is\ncapable of efficiently and flexibly utilizing the spatial dependency of\nadjacent joints and the temporal dependency of each joint itself. Extensive\nexperiments demonstrate the superiority of our method. 
Compared to previous\nregression-based single-frame human pose estimation methods, DSTA significantly\nenhances performance, achieving an 8.9 mAP improvement on PoseTrack2017.\nFurthermore, our approach either surpasses or is on par with the\nstate-of-the-art heatmap-based multi-frame human pose estimation methods.\nProject page: https://github.com/zgspose/DSTA.\n","authors":["Jijie He","Wenwu Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19926v2.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.17076v3","updated":"2024-04-01T03:17:09Z","published":"2023-11-27T22:23:27Z","title":"Compositional Chain-of-Thought Prompting for Large Multimodal Models","summary":" The combination of strong visual backbones and Large Language Model (LLM)\nreasoning has led to Large Multimodal Models (LMMs) becoming the current\nstandard for a wide range of vision and language (VL) tasks. However, recent\nresearch has shown that even the most advanced LMMs still struggle to capture\naspects of compositional visual reasoning, such as attributes and relationships\nbetween objects. One solution is to utilize scene graphs (SGs)--a formalization\nof objects and their relations and attributes that has been extensively used as\na bridge between the visual and textual domains. Yet, scene graph data requires\nscene graph annotations, which are expensive to collect and thus not easily\nscalable. Moreover, finetuning an LMM based on SG data can lead to catastrophic\nforgetting of the pretraining objective. To overcome this, inspired by\nchain-of-thought methods, we propose Compositional Chain-of-Thought (CCoT), a\nnovel zero-shot Chain-of-Thought prompting method that utilizes SG\nrepresentations in order to extract compositional knowledge from an LMM.\nSpecifically, we first generate an SG using the LMM, and then use that SG in\nthe prompt to produce a response. Through extensive experiments, we find that\nthe proposed CCoT approach not only improves LMM performance on several vision\nand language VL compositional benchmarks but also improves the performance of\nseveral popular LMMs on general multimodal benchmarks, without the need for\nfine-tuning or annotated ground-truth SGs. Code:\nhttps://github.com/chancharikmitra/CCoT\n","authors":["Chancharik Mitra","Brandon Huang","Trevor Darrell","Roei Herzig"],"pdf_url":"https://arxiv.org/pdf/2311.17076v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19797v2","updated":"2024-04-01T02:57:07Z","published":"2024-03-28T19:25:25Z","title":"Efficient 3D Instance Mapping and Localization with Neural Fields","summary":" We tackle the problem of learning an implicit scene representation for 3D\ninstance segmentation from a sequence of posed RGB images. Towards this, we\nintroduce 3DIML, a novel framework that efficiently learns a label field that\nmay be rendered from novel viewpoints to produce view-consistent instance\nsegmentation masks. 3DIML significantly improves upon training and inference\nruntimes of existing implicit scene representation based methods. Opposed to\nprior art that optimizes a neural field in a self-supervised manner, requiring\ncomplicated training procedures and loss function design, 3DIML leverages a\ntwo-phase process. The first phase, InstanceMap, takes as input 2D segmentation\nmasks of the image sequence generated by a frontend instance segmentation\nmodel, and associates corresponding masks across images to 3D labels. 
These\nalmost view-consistent pseudolabel masks are then used in the second phase,\nInstanceLift, to supervise the training of a neural label field, which\ninterpolates regions missed by InstanceMap and resolves ambiguities.\nAdditionally, we introduce InstanceLoc, which enables near realtime\nlocalization of instance masks given a trained label field and an off-the-shelf\nimage segmentation model by fusing outputs from both. We evaluate 3DIML on\nsequences from the Replica and ScanNet datasets and demonstrate 3DIML's\neffectiveness under mild assumptions for the image sequences. We achieve a\nlarge practical speedup over existing implicit scene representation methods\nwith comparable quality, showcasing its potential to facilitate faster and more\neffective 3D scene understanding.\n","authors":["George Tang","Krishna Murthy Jatavallabhula","Antonio Torralba"],"pdf_url":"https://arxiv.org/pdf/2403.19797v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19646v2","updated":"2024-04-01T08:00:56Z","published":"2024-03-28T17:55:42Z","title":"Change-Agent: Towards Interactive Comprehensive Remote Sensing Change\n Interpretation and Analysis","summary":" Monitoring changes in the Earth's surface is crucial for understanding\nnatural processes and human impacts, necessitating precise and comprehensive\ninterpretation methodologies. Remote sensing satellite imagery offers a unique\nperspective for monitoring these changes, leading to the emergence of remote\nsensing image change interpretation (RSICI) as a significant research focus.\nCurrent RSICI technology encompasses change detection and change captioning,\neach with its limitations in providing comprehensive interpretation. To address\nthis, we propose an interactive Change-Agent, which can follow user\ninstructions to achieve comprehensive change interpretation and insightful\nanalysis according to user instructions, such as change detection and change\ncaptioning, change object counting, change cause analysis, etc. The\nChange-Agent integrates a multi-level change interpretation (MCI) model as the\neyes and a large language model (LLM) as the brain. The MCI model contains two\nbranches of pixel-level change detection and semantic-level change captioning,\nin which multiple BI-temporal Iterative Interaction (BI3) layers utilize Local\nPerception Enhancement (LPE) and the Global Difference Fusion Attention (GDFA)\nmodules to enhance the model's discriminative feature representation\ncapabilities. To support the training of the MCI model, we build the LEVIR-MCI\ndataset with a large number of change masks and captions of changes. Extensive\nexperiments demonstrate the effectiveness of the proposed MCI model and\nhighlight the promising potential of our Change-Agent in facilitating\ncomprehensive and intelligent interpretation of surface changes. To facilitate\nfuture research, we will make our dataset and codebase of the MCI model and\nChange-Agent publicly available at\nhttps://github.com/Chen-Yang-Liu/Change-Agent\n","authors":["Chenyang Liu","Keyan Chen","Haotian Zhang","Zipeng Qi","Zhengxia Zou","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2403.19646v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05369v5","updated":"2024-04-01T07:26:06Z","published":"2024-03-08T15:00:44Z","title":"Frequency-Adaptive Dilated Convolution for Semantic Segmentation","summary":" Dilated convolution, which expands the receptive field by inserting gaps\nbetween its consecutive elements, is widely employed in computer vision. 
In\nthis study, we propose three strategies to improve individual phases of dilated\nconvolution from the view of spectrum analysis. Departing from the conventional\npractice of fixing a global dilation rate as a hyperparameter, we introduce\nFrequency-Adaptive Dilated Convolution (FADC), which dynamically adjusts\ndilation rates spatially based on local frequency components. Subsequently, we\ndesign two plug-in modules to directly enhance effective bandwidth and\nreceptive field size. The Adaptive Kernel (AdaKern) module decomposes\nconvolution weights into low-frequency and high-frequency components,\ndynamically adjusting the ratio between these components on a per-channel\nbasis. By increasing the high-frequency part of convolution weights, AdaKern\ncaptures more high-frequency components, thereby improving effective bandwidth.\nThe Frequency Selection (FreqSelect) module optimally balances high- and\nlow-frequency components in feature representations through spatially variant\nreweighting. It suppresses high frequencies in the background to encourage FADC\nto learn a larger dilation, thereby increasing the receptive field for an\nexpanded scope. Extensive experiments on segmentation and object detection\nconsistently validate the efficacy of our approach. The code is publicly\navailable at https://github.com/Linwei-Chen/FADC.\n","authors":["Linwei Chen","Lin Gu","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2403.05369v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10030v3","updated":"2024-04-01T05:22:52Z","published":"2024-03-15T05:30:29Z","title":"Multi-criteria Token Fusion with One-step-ahead Attention for Efficient\n Vision Transformers","summary":" Vision Transformer (ViT) has emerged as a prominent backbone for computer\nvision. For more efficient ViTs, recent works lessen the quadratic cost of the\nself-attention layer by pruning or fusing the redundant tokens. However, these\nworks faced the speed-accuracy trade-off caused by the loss of information.\nHere, we argue that token fusion needs to consider diverse relations between\ntokens to minimize information loss. In this paper, we propose a Multi-criteria\nToken Fusion (MCTF), that gradually fuses the tokens based on multi-criteria\n(e.g., similarity, informativeness, and size of fused tokens). Further, we\nutilize the one-step-ahead attention, which is the improved approach to capture\nthe informativeness of the tokens. By training the model equipped with MCTF\nusing a token reduction consistency, we achieve the best speed-accuracy\ntrade-off in the image classification (ImageNet1K). Experimental results prove\nthat MCTF consistently surpasses the previous reduction methods with and\nwithout training. Specifically, DeiT-T and DeiT-S with MCTF reduce FLOPs by\nabout 44% while improving the performance (+0.5%, and +0.3%) over the base\nmodel, respectively. We also demonstrate the applicability of MCTF in various\nVision Transformers (e.g., T2T-ViT, LV-ViT), achieving at least 31% speedup\nwithout performance degradation. Code is available at\nhttps://github.com/mlvlab/MCTF.\n","authors":["Sanghyeok Lee","Joonmyung Choi","Hyunwoo J. 
Kim"],"pdf_url":"https://arxiv.org/pdf/2403.10030v3.pdf","comment":"Conference on Computer Vision and Pattern Recognition (CVPR), 2024"},{"id":"http://arxiv.org/abs/2403.17639v2","updated":"2024-04-01T14:15:51Z","published":"2024-03-26T12:21:47Z","title":"High-Resolution Image Translation Model Based on Grayscale Redefinition","summary":" Image-to-image translation is a technique that focuses on transferring images\nfrom one domain to another while maintaining the essential content\nrepresentations. In recent years, image-to-image translation has gained\nsignificant attention and achieved remarkable advancements due to its diverse\napplications in computer vision and image processing tasks. In this work, we\npropose an innovative method for image translation between different domains.\nFor high-resolution image translation tasks, we use a grayscale adjustment\nmethod to achieve pixel-level translation. For other tasks, we utilize the\nPix2PixHD model with a coarse-to-fine generator, multi-scale discriminator, and\nimproved loss to enhance the image translation performance. On the other hand,\nto tackle the issue of sparse training data, we adopt model weight\ninitialization from other task to optimize the performance of the current task.\n","authors":["Xixian Wu","Dian Chao","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2403.17639v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15388v3","updated":"2024-04-01T14:08:06Z","published":"2024-03-22T17:59:52Z","title":"LLaVA-PruMerge: Adaptive Token Reduction for Efficient Large Multimodal\n Models","summary":" Large Multimodal Models (LMMs) have shown significant reasoning capabilities\nby connecting a visual encoder and a large language model. LMMs typically use a\nfixed amount of visual tokens, such as the penultimate layer features in the\nCLIP visual encoder, as the prefix content. Recent LMMs incorporate more\ncomplex visual inputs, such as high-resolution images and videos, which\nincrease the number of visual tokens significantly. However, due to the design\nof the Transformer architecture, computational costs associated with these\nmodels tend to increase quadratically with the number of input tokens. To\ntackle this problem, we explore a token reduction mechanism and find, similar\nto prior work, that many visual tokens are spatially redundant. Based on this,\nwe propose PruMerge, a novel adaptive visual token reduction approach, which\nlargely reduces the number of visual tokens while maintaining comparable model\nperformance. We first select the unpruned visual tokens based on their\nsimilarity to class tokens and spatial tokens. We then cluster the pruned\ntokens based on key similarity and merge the clustered tokens with the unpruned\ntokens to supplement their information. Empirically, when applied to LLaVA-1.5,\nour approach can compress the visual tokens by 18 times on average, and achieve\ncomparable performance across diverse visual question-answering and reasoning\ntasks. Code and checkpoints are at https://llava-prumerge.github.io/.\n","authors":["Yuzhang Shang","Mu Cai","Bingxin Xu","Yong Jae Lee","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2403.15388v3.pdf","comment":"Project page: https://llava-prumerge.github.io/"},{"id":"http://arxiv.org/abs/2404.01524v1","updated":"2024-04-01T23:11:15Z","published":"2024-04-01T23:11:15Z","title":"On Train-Test Class Overlap and Detection for Image Retrieval","summary":" How important is it for training and evaluation sets to not have class\noverlap in image retrieval? 
We revisit Google Landmarks v2 clean, the most\npopular training set, by identifying and removing class overlap with Revisited\nOxford and Paris [34], the most popular evaluation set. By comparing the\noriginal and the new RGLDv2-clean on a benchmark of reproduced state-of-the-art\nmethods, our findings are striking. Not only is there a dramatic drop in\nperformance, but it is inconsistent across methods, changing the ranking. What\ndoes it take to focus on objects of interest and ignore background clutter when\nindexing? Do we need to train an object detector and the representation\nseparately? Do we need location supervision? We introduce Single-stage\nDetect-to-Retrieve (CiDeR), an end-to-end, single-stage pipeline to detect\nobjects of interest and extract a global image representation. We outperform\nprevious state-of-the-art on both existing training sets and the new\nRGLDv2-clean. Our dataset is available at\nhttps://github.com/dealicious-inc/RGLDv2-clean.\n","authors":["Chull Hwan Song","Jooyoung Yoon","Taebaek Hwang","Shunghyun Choi","Yeong Hyeon Gu","Yannis Avrithis"],"pdf_url":"https://arxiv.org/pdf/2404.01524v1.pdf","comment":"CVPR2024 Accepted"},{"id":"http://arxiv.org/abs/2404.01518v1","updated":"2024-04-01T22:53:47Z","published":"2024-04-01T22:53:47Z","title":"Temporally Consistent Unbalanced Optimal Transport for Unsupervised\n Action Segmentation","summary":" We propose a novel approach to the action segmentation task for long,\nuntrimmed videos, based on solving an optimal transport problem. By encoding a\ntemporal consistency prior into a Gromov-Wasserstein problem, we are able to\ndecode a temporally consistent segmentation from a noisy affinity/matching cost\nmatrix between video frames and action classes. Unlike previous approaches, our\nmethod does not require knowing the action order for a video to attain temporal\nconsistency. Furthermore, our resulting (fused) Gromov-Wasserstein problem can\nbe efficiently solved on GPUs using a few iterations of projected mirror\ndescent. We demonstrate the effectiveness of our method in an unsupervised\nlearning setting, where our method is used to generate pseudo-labels for\nself-training. We evaluate our segmentation approach and unsupervised learning\npipeline on the Breakfast, 50-Salads, YouTube Instructions and Desktop Assembly\ndatasets, yielding state-of-the-art results for the unsupervised video action\nsegmentation task.\n","authors":["Ming Xu","Stephen Gould"],"pdf_url":"https://arxiv.org/pdf/2404.01518v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19837v2","updated":"2024-04-01T22:34:37Z","published":"2024-03-28T21:15:38Z","title":"Concept-based Analysis of Neural Networks via Vision-Language Models","summary":" The analysis of vision-based deep neural networks (DNNs) is highly desirable\nbut it is very challenging due to the difficulty of expressing formal\nspecifications for vision tasks and the lack of efficient verification\nprocedures. In this paper, we propose to leverage emerging multimodal,\nvision-language, foundation models (VLMs) as a lens through which we can reason\nabout vision models. VLMs have been trained on a large body of images\naccompanied by their textual description, and are thus implicitly aware of\nhigh-level, human-understandable concepts describing the images. We describe a\nlogical specification language $\\texttt{Con}_{\\texttt{spec}}$ designed to\nfacilitate writing specifications in terms of these concepts. 
To define and\nformally check $\\texttt{Con}_{\\texttt{spec}}$ specifications, we build a map\nbetween the internal representations of a given vision model and a VLM, leading\nto an efficient verification procedure of natural-language properties for\nvision models. We demonstrate our techniques on a ResNet-based classifier\ntrained on the RIVAL-10 dataset using CLIP as the multimodal model.\n","authors":["Ravi Mangal","Nina Narodytska","Divya Gopinath","Boyue Caroline Hu","Anirban Roy","Susmit Jha","Corina Pasareanu"],"pdf_url":"https://arxiv.org/pdf/2403.19837v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01509v1","updated":"2024-04-01T22:25:48Z","published":"2024-04-01T22:25:48Z","title":"Can Biases in ImageNet Models Explain Generalization?","summary":" The robust generalization of models to rare, in-distribution (ID) samples\ndrawn from the long tail of the training distribution and to\nout-of-training-distribution (OOD) samples is one of the major challenges of\ncurrent deep learning methods. For image classification, this manifests in the\nexistence of adversarial attacks, the performance drops on distorted images,\nand a lack of generalization to concepts such as sketches. The current\nunderstanding of generalization in neural networks is very limited, but some\nbiases that differentiate models from human vision have been identified and\nmight be causing these limitations. Consequently, several attempts with varying\nsuccess have been made to reduce these biases during training to improve\ngeneralization. We take a step back and sanity-check these attempts. Fixing the\narchitecture to the well-established ResNet-50, we perform a large-scale study\non 48 ImageNet models obtained via different training methods to understand how\nand if these biases - including shape bias, spectral biases, and critical bands\n- interact with generalization. Our extensive study results reveal that\ncontrary to previous findings, these biases are insufficient to accurately\npredict the generalization of a model holistically. We provide access to all\ncheckpoints and evaluation code at\nhttps://github.com/paulgavrikov/biases_vs_generalization\n","authors":["Paul Gavrikov","Janis Keuper"],"pdf_url":"https://arxiv.org/pdf/2404.01509v1.pdf","comment":"Accepted at CVPR2024"},{"id":"http://arxiv.org/abs/2401.02402v2","updated":"2024-04-01T22:21:00Z","published":"2024-01-04T18:39:32Z","title":"3D Open-Vocabulary Panoptic Segmentation with 2D-3D Vision-Language\n Distillation","summary":" 3D panoptic segmentation is a challenging perception task, especially in\nautonomous driving. It aims to predict both semantic and instance annotations\nfor 3D points in a scene. Although prior 3D panoptic segmentation approaches\nhave achieved great performance on closed-set benchmarks, generalizing these\napproaches to unseen things and unseen stuff categories remains an open\nproblem. For unseen object categories, 2D open-vocabulary segmentation has\nachieved promising results that solely rely on frozen CLIP backbones and\nensembling multiple classification outputs. However, we find that simply\nextending these 2D models to 3D does not guarantee good performance due to poor\nper-mask classification quality, especially for novel stuff categories. In this\npaper, we propose the first method to tackle 3D open-vocabulary panoptic\nsegmentation. 
Our model takes advantage of the fusion between learnable LiDAR\nfeatures and dense frozen vision CLIP features, using a single classification\nhead to make predictions for both base and novel classes. To further improve\nthe classification performance on novel classes and leverage the CLIP model, we\npropose two novel loss functions: object-level distillation loss and\nvoxel-level distillation loss. Our experiments on the nuScenes and\nSemanticKITTI datasets show that our method outperforms the strong baseline by\na large margin.\n","authors":["Zihao Xiao","Longlong Jing","Shangxuan Wu","Alex Zihao Zhu","Jingwei Ji","Chiyu Max Jiang","Wei-Chih Hung","Thomas Funkhouser","Weicheng Kuo","Anelia Angelova","Yin Zhou","Shiwei Sheng"],"pdf_url":"https://arxiv.org/pdf/2401.02402v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01501v1","updated":"2024-04-01T21:49:05Z","published":"2024-04-01T21:49:05Z","title":"MosquitoFusion: A Multiclass Dataset for Real-Time Detection of\n Mosquitoes, Swarms, and Breeding Sites Using Deep Learning","summary":" In this paper, we present an integrated approach to real-time mosquito\ndetection using our multiclass dataset (MosquitoFusion) containing 1204 diverse\nimages and leverage cutting-edge technologies, specifically computer vision, to\nautomate the identification of Mosquitoes, Swarms, and Breeding Sites. The\npre-trained YOLOv8 model, trained on this dataset, achieved a mean Average\nPrecision (mAP@50) of 57.1%, with precision at 73.4% and recall at 50.5%. The\nintegration of Geographic Information Systems (GIS) further enriches the depth\nof our analysis, providing valuable insights into spatial patterns. The dataset\nand code are available at https://github.com/faiyazabdullah/MosquitoFusion.\n","authors":["Md. Faiyaz Abdullah Sayeedi","Fahim Hafiz","Md Ashiqur Rahman"],"pdf_url":"https://arxiv.org/pdf/2404.01501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01492v1","updated":"2024-04-01T21:28:50Z","published":"2024-04-01T21:28:50Z","title":"Modality Translation for Object Detection Adaptation Without Forgetting\n Prior Knowledge","summary":" A common practice in deep learning consists of training large neural networks\non massive datasets to perform accurately for different domains and tasks.\nWhile this methodology may work well in numerous application areas, it does not\nreadily apply across modalities due to a larger distribution shift in data captured\nusing different sensors. This paper focuses on the problem of adapting a large\nobject detection model to one or multiple modalities while being efficient. To\ndo so, we propose ModTr as an alternative to the common approach of fine-tuning\nlarge models. ModTr consists of adapting the input with a small transformation\nnetwork trained to minimize the detection loss directly. The original model can\ntherefore work on the translated inputs without any further change or\nfine-tuning to its parameters. Experimental results on translating from IR to\nRGB images on two well-known datasets show that this simple ModTr approach\nprovides detectors that can perform comparably or better than the standard\nfine-tuning without forgetting the original knowledge. This opens the doors to\na more flexible and efficient service-based detection pipeline in which,\ninstead of using a different detector for each modality, a unique and unaltered\nserver is constantly running, where multiple modalities with the corresponding\ntranslations can query it. 
Code: https://github.com/heitorrapela/ModTr.\n","authors":["Heitor Rapela Medeiros","Masih Aminbeidokhti","Fidel Guerrero Pena","David Latortue","Eric Granger","Marco Pedersoli"],"pdf_url":"https://arxiv.org/pdf/2404.01492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01491v1","updated":"2024-04-01T21:23:03Z","published":"2024-04-01T21:23:03Z","title":"SUGAR: Pre-training 3D Visual Representations for Robotics","summary":" Learning generalizable visual representations from Internet data has yielded\npromising results for robotics. Yet, prevailing approaches focus on\npre-training 2D representations, being sub-optimal to deal with occlusions and\naccurately localize objects in complex 3D scenes. Meanwhile, 3D representation\nlearning has been limited to single-object understanding. To address these\nlimitations, we introduce a novel 3D pre-training framework for robotics named\nSUGAR that captures semantic, geometric and affordance properties of objects\nthrough 3D point clouds. We underscore the importance of cluttered scenes in 3D\nrepresentation learning, and automatically construct a multi-object dataset\nbenefiting from cost-free supervision in simulation. SUGAR employs a versatile\ntransformer-based model to jointly address five pre-training tasks, namely\ncross-modal knowledge distillation for semantic learning, masked point modeling\nto understand geometry structures, grasping pose synthesis for object\naffordance, 3D instance segmentation and referring expression grounding to\nanalyze cluttered scenes. We evaluate our learned representation on three\nrobotic-related tasks, namely, zero-shot 3D object recognition, referring\nexpression grounding, and language-driven robotic manipulation. Experimental\nresults show that SUGAR's 3D representation outperforms state-of-the-art 2D and\n3D representations.\n","authors":["Shizhe Chen","Ricardo Garcia","Ivan Laptev","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2404.01491v1.pdf","comment":"Accepted to CVPR 2024. Project webpage:\n https://cshizhe.github.io/projects/robot_sugar.html"},{"id":"http://arxiv.org/abs/2404.01486v1","updated":"2024-04-01T21:11:43Z","published":"2024-04-01T21:11:43Z","title":"QuAD: Query-based Interpretable Neural Motion Planning for Autonomous\n Driving","summary":" A self-driving vehicle must understand its environment to determine the\nappropriate action. Traditional autonomy systems rely on object detection to\nfind the agents in the scene. However, object detection assumes a discrete set\nof objects and loses information about uncertainty, so any errors compound when\npredicting the future behavior of those agents. Alternatively, dense occupancy\ngrid maps have been utilized to understand free-space. However, predicting a\ngrid for the entire scene is wasteful since only certain spatio-temporal\nregions are reachable and relevant to the self-driving vehicle. We present a\nunified, interpretable, and efficient autonomy framework that moves away from\ncascading modules that first perceive, then predict, and finally plan. Instead,\nwe shift the paradigm to have the planner query occupancy at relevant\nspatio-temporal points, restricting the computation to those regions of\ninterest. Exploiting this representation, we evaluate candidate trajectories\naround key factors such as collision avoidance, comfort, and progress for\nsafety and interpretability. 
Our approach achieves better highway driving\nquality than the state-of-the-art in high-fidelity closed-loop simulations.\n","authors":["Sourav Biswas","Sergio Casas","Quinlan Sykora","Ben Agro","Abbas Sadat","Raquel Urtasun"],"pdf_url":"https://arxiv.org/pdf/2404.01486v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01476v1","updated":"2024-04-01T20:58:24Z","published":"2024-04-01T20:58:24Z","title":"TraveLER: A Multi-LMM Agent Framework for Video Question-Answering","summary":" Recently, Large Multimodal Models (LMMs) have made significant progress in\nvideo question-answering using a frame-wise approach by leveraging large-scale,\nimage-based pretraining in a zero-shot manner. While image-based methods for\nvideos have shown impressive performance, a current limitation is that they\noften overlook how key timestamps are selected and cannot adjust when incorrect\ntimestamps are identified. Moreover, they are unable to extract details\nrelevant to the question, instead providing general descriptions of the frame.\nTo overcome this, we design a multi-LMM agent framework that travels along the\nvideo, iteratively collecting relevant information from keyframes through\ninteractive question-asking until there is sufficient information to answer the\nquestion. Specifically, we propose TraveLER, a model that can create a plan to\n\"Traverse\" through the video, ask questions about individual frames to \"Locate\"\nand store key information, and then \"Evaluate\" if there is enough information\nto answer the question. Finally, if there is not enough information, our method\nis able to \"Replan\" based on its collected knowledge. Through extensive\nexperiments, we find that the proposed TraveLER approach improves performance\non several video question-answering benchmarks, such as NExT-QA, STAR, and\nPerception Test, without the need to fine-tune on specific datasets.\n","authors":["Chuyi Shang","Amos You","Sanjay Subramanian","Trevor Darrell","Roei Herzig"],"pdf_url":"https://arxiv.org/pdf/2404.01476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10516v2","updated":"2024-04-01T20:57:45Z","published":"2024-03-15T17:57:06Z","title":"FeatUp: A Model-Agnostic Framework for Features at Any Resolution","summary":" Deep features are a cornerstone of computer vision research, capturing image\nsemantics and enabling the community to solve downstream tasks even in the\nzero- or few-shot regime. However, these features often lack the spatial\nresolution to directly perform dense prediction tasks like segmentation and\ndepth prediction because models aggressively pool information over large areas.\nIn this work, we introduce FeatUp, a task- and model-agnostic framework to\nrestore lost spatial information in deep features. We introduce two variants of\nFeatUp: one that guides features with high-resolution signal in a single\nforward pass, and one that fits an implicit model to a single image to\nreconstruct features at any resolution. Both approaches use a multi-view\nconsistency loss with deep analogies to NeRFs. Our features retain their\noriginal semantics and can be swapped into existing applications to yield\nresolution and performance gains even without re-training. 
We show that FeatUp\nsignificantly outperforms other feature upsampling and image super-resolution\napproaches in class activation map generation, transfer learning for\nsegmentation and depth prediction, and end-to-end training for semantic\nsegmentation.\n","authors":["Stephanie Fu","Mark Hamilton","Laura Brandt","Axel Feldman","Zhoutong Zhang","William T. Freeman"],"pdf_url":"https://arxiv.org/pdf/2403.10516v2.pdf","comment":"Accepted to the International Conference on Learning Representations\n (ICLR) 2024"},{"id":"http://arxiv.org/abs/2404.01464v1","updated":"2024-04-01T20:25:04Z","published":"2024-04-01T20:25:04Z","title":"Data-Efficient Unsupervised Interpolation Without Any Intermediate Frame\n for 4D Medical Images","summary":" 4D medical images, which represent 3D images with temporal information, are\ncrucial in clinical practice for capturing dynamic changes and monitoring\nlong-term disease progression. However, acquiring 4D medical images poses\nchallenges due to factors such as radiation exposure and imaging duration,\nnecessitating a balance between achieving high temporal resolution and\nminimizing adverse effects. Given these circumstances, not only is data\nacquisition challenging, but increasing the frame rate for each dataset also\nproves difficult. To address this challenge, this paper proposes a simple yet\neffective Unsupervised Volumetric Interpolation framework, UVI-Net. This\nframework facilitates temporal interpolation without the need for any\nintermediate frames, distinguishing it from the majority of other existing\nunsupervised methods. Experiments on benchmark datasets demonstrate significant\nimprovements across diverse evaluation metrics compared to unsupervised and\nsupervised baselines. Remarkably, our approach achieves this superior\nperformance even when trained with a dataset as small as one, highlighting its\nexceptional robustness and efficiency in scenarios with sparse supervision.\nThis positions UVI-Net as a compelling alternative for 4D medical imaging,\nparticularly in settings where data availability is limited. The source code is\navailable at https://github.com/jungeun122333/UVI-Net.\n","authors":["JungEun Kim","Hangyul Yoon","Geondo Park","Kyungsu Kim","Eunho Yang"],"pdf_url":"https://arxiv.org/pdf/2404.01464v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2310.16112v2","updated":"2024-04-01T20:18:02Z","published":"2023-10-24T18:26:22Z","title":"Towards long-tailed, multi-label disease classification from chest\n X-ray: Overview of the CXR-LT challenge","summary":" Many real-world image recognition problems, such as diagnostic medical\nimaging exams, are \"long-tailed\" $\\unicode{x2013}$ there are a few common\nfindings followed by many more relatively rare conditions. In chest\nradiography, diagnosis is both a long-tailed and multi-label problem, as\npatients often present with multiple findings simultaneously. While researchers\nhave begun to study the problem of long-tailed learning in medical image\nrecognition, few have studied the interaction of label imbalance and label\nco-occurrence posed by long-tailed, multi-label disease classification. To\nengage with the research community on this emerging topic, we conducted an open\nchallenge, CXR-LT, on long-tailed, multi-label thorax disease classification\nfrom chest X-rays (CXRs). We publicly release a large-scale benchmark dataset\nof over 350,000 CXRs, each labeled with at least one of 26 clinical findings\nfollowing a long-tailed distribution. 
We synthesize common themes of\ntop-performing solutions, providing practical recommendations for long-tailed,\nmulti-label medical image classification. Finally, we use these insights to\npropose a path forward involving vision-language foundation models for few- and\nzero-shot disease classification.\n","authors":["Gregory Holste","Yiliang Zhou","Song Wang","Ajay Jaiswal","Mingquan Lin","Sherry Zhuge","Yuzhe Yang","Dongkyun Kim","Trong-Hieu Nguyen-Mau","Minh-Triet Tran","Jaehyup Jeong","Wongi Park","Jongbin Ryu","Feng Hong","Arsh Verma","Yosuke Yamagishi","Changhyun Kim","Hyeryeong Seo","Myungjoo Kang","Leo Anthony Celi","Zhiyong Lu","Ronald M. Summers","George Shih","Zhangyang Wang","Yifan Peng"],"pdf_url":"https://arxiv.org/pdf/2310.16112v2.pdf","comment":"Update after major revision"},{"id":"http://arxiv.org/abs/2403.02090v2","updated":"2024-04-01T20:03:38Z","published":"2024-03-04T14:46:58Z","title":"Modeling Multimodal Social Interactions: New Challenges and Baselines\n with Densely Aligned Representations","summary":" Understanding social interactions involving both verbal and non-verbal cues\nis essential for effectively interpreting social situations. However, most\nprior works on multimodal social cues focus predominantly on single-person\nbehaviors or rely on holistic visual representations that are not aligned to\nutterances in multi-party environments. Consequently, they are limited in\nmodeling the intricate dynamics of multi-party interactions. In this paper, we\nintroduce three new challenging tasks to model the fine-grained dynamics\nbetween multiple people: speaking target identification, pronoun coreference\nresolution, and mentioned player prediction. We contribute extensive data\nannotations to curate these new challenges in social deduction game settings.\nFurthermore, we propose a novel multimodal baseline that leverages densely\naligned language-visual representations by synchronizing visual features with\ntheir corresponding utterances. This facilitates concurrently capturing verbal\nand non-verbal cues pertinent to social reasoning. Experiments demonstrate the\neffectiveness of the proposed approach with densely aligned multimodal\nrepresentations in modeling fine-grained social interactions. Project website:\nhttps://sangmin-git.github.io/projects/MMSI.\n","authors":["Sangmin Lee","Bolin Lai","Fiona Ryan","Bikram Boote","James M. Rehg"],"pdf_url":"https://arxiv.org/pdf/2403.02090v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01446v1","updated":"2024-04-01T19:33:41Z","published":"2024-04-01T19:33:41Z","title":"Finding Regions of Interest in Whole Slide Images Using Multiple\n Instance Learning","summary":" Whole Slide Images (WSI), obtained by high-resolution digital scanning of\nmicroscope slides at multiple scales, are the cornerstone of modern Digital\nPathology. However, they represent a particular challenge to\nAI-based/AI-mediated analysis because pathology labeling is typically done at\nslide-level, instead of tile-level. It is not just that medical diagnostics is\nrecorded at the specimen level, the detection of oncogene mutation is also\nexperimentally obtained, and recorded by initiatives like The Cancer Genome\nAtlas (TCGA), at the slide level. This configures a dual challenge: a)\naccurately predicting the overall cancer phenotype and b) finding out what\ncellular morphologies are associated with it at the tile level. 
To address\nthese challenges, a weakly supervised Multiple Instance Learning (MIL) approach\nwas explored for two prevalent cancer types, Invasive Breast Carcinoma\n(TCGA-BRCA) and Lung Squamous Cell Carcinoma (TCGA-LUSC). This approach was\nexplored for tumor detection at low magnification levels and TP53 mutations at\nvarious levels. Our results show that a novel additive implementation of MIL\nmatched the performance of the reference implementation (AUC 0.96), and was only\nslightly outperformed by Attention MIL (AUC 0.97). More interestingly from the\nperspective of the molecular pathologist, these different AI architectures\nidentify distinct sensitivities to morphological features (through the\ndetection of Regions of Interest, RoI) at different amplification levels.\nTellingly, TP53 mutation was most sensitive to features at the higher\namplifications where cellular morphology is resolved.\n","authors":["Martim Afonso","Praphulla M. S. Bhawsar","Monjoy Saha","Jonas S. Almeida","Arlindo L. Oliveira"],"pdf_url":"https://arxiv.org/pdf/2404.01446v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01440v1","updated":"2024-04-01T19:23:00Z","published":"2024-04-01T19:23:00Z","title":"Neural Implicit Representation for Building Digital Twins of Unknown\n Articulated Objects","summary":" We address the problem of building digital twins of unknown articulated\nobjects from two RGBD scans of the object at different articulation states. We\ndecompose the problem into two stages, each addressing distinct aspects. Our\nmethod first reconstructs object-level shape at each state, then recovers the\nunderlying articulation model including part segmentation and joint\narticulations that associate the two states. By explicitly modeling point-level\ncorrespondences and exploiting cues from images, 3D reconstructions, and\nkinematics, our method yields more accurate and stable results compared to\nprior work. It also handles more than one movable part and does not rely on any\nobject shape or structure priors. Project page:\nhttps://github.com/NVlabs/DigitalTwinArt\n","authors":["Yijia Weng","Bowen Wen","Jonathan Tremblay","Valts Blukis","Dieter Fox","Leonidas Guibas","Stan Birchfield"],"pdf_url":"https://arxiv.org/pdf/2404.01440v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01438v1","updated":"2024-04-01T19:22:43Z","published":"2024-04-01T19:22:43Z","title":"Generation and Detection of Sign Language Deepfakes - A Linguistic and\n Visual Analysis","summary":" A question in the realm of deepfakes is slowly emerging pertaining to whether\nwe can go beyond facial deepfakes and whether it would be beneficial to\nsociety. Therefore, this research presents a positive application of deepfake\ntechnology in upper body generation, while performing sign-language for the\nDeaf and Hard of Hearing (DHoH) community. The resulting videos are later\nvetted with a sign language expert. This is particularly helpful, given the\nintricate nature of sign language, a scarcity of sign language experts, and\npotential benefits for health and education. The objectives of this work\nencompass constructing a reliable deepfake dataset, evaluating its technical\nand visual credibility through computer vision and natural language processing\nmodels, and assessing the plausibility of the generated content. 
With over 1200\nvideos, featuring both previously seen and unseen individuals for the\ngeneration model, using the help of a sign language expert, we establish a\ndeepfake dataset in sign language that can further be utilized to detect fake\nvideos that may target certain people of determination.\n","authors":["Shahzeb Naeem","Muhammad Riyyan Khan","Usman Tariq","Abhinav Dhall","Carlos Ivan Colon","Hasan Al-Nashash"],"pdf_url":"https://arxiv.org/pdf/2404.01438v1.pdf","comment":"13 pages, 13 figures, Computer Vision and Image Understanding Journal"},{"id":"http://arxiv.org/abs/2404.01437v1","updated":"2024-04-01T19:20:32Z","published":"2024-04-01T19:20:32Z","title":"The Radar Ghost Dataset -- An Evaluation of Ghost Objects in Automotive\n Radar Data","summary":" Radar sensors have a long tradition in advanced driver assistance systems\n(ADAS) and also play a major role in current concepts for autonomous vehicles.\nTheir importance is reasoned by their high robustness against meteorological\neffects, such as rain, snow, or fog, and the radar's ability to measure\nrelative radial velocity differences via the Doppler effect. The cause for\nthese advantages, namely the large wavelength, is also one of the drawbacks of\nradar sensors. Compared to camera or lidar sensor, a lot more surfaces in a\ntypical traffic scenario appear flat relative to the radar's emitted signal.\nThis results in multi-path reflections or so called ghost detections in the\nradar signal. Ghost objects pose a major source for potential false positive\ndetections in a vehicle's perception pipeline. Therefore, it is important to be\nable to segregate multi-path reflections from direct ones. In this article, we\npresent a dataset with detailed manual annotations for different kinds of ghost\ndetections. Moreover, two different approaches for identifying these kinds of\nobjects are evaluated. We hope that our dataset encourages more researchers to\nengage in the fields of multi-path object suppression or exploitation.\n","authors":["Florian Kraus","Nicolas Scheiner","Werner Ritter","Klaus Dietmayer"],"pdf_url":"https://arxiv.org/pdf/2404.01437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14334v2","updated":"2024-04-01T19:16:24Z","published":"2023-05-23T17:58:05Z","title":"Diffusion Hyperfeatures: Searching Through Time and Space for Semantic\n Correspondence","summary":" Diffusion models have been shown to be capable of generating high-quality\nimages, suggesting that they could contain meaningful internal representations.\nUnfortunately, the feature maps that encode a diffusion model's internal\ninformation are spread not only over layers of the network, but also over\ndiffusion timesteps, making it challenging to extract useful descriptors. We\npropose Diffusion Hyperfeatures, a framework for consolidating multi-scale and\nmulti-timestep feature maps into per-pixel feature descriptors that can be used\nfor downstream tasks. These descriptors can be extracted for both synthetic and\nreal images using the generation and inversion processes. We evaluate the\nutility of our Diffusion Hyperfeatures on the task of semantic keypoint\ncorrespondence: our method achieves superior performance on the SPair-71k real\nimage benchmark. We also demonstrate that our method is flexible and\ntransferable: our feature aggregation network trained on the inversion features\nof real image pairs can be used on the generation features of synthetic image\npairs with unseen objects and compositions. 
Our code is available at\nhttps://diffusion-hyperfeatures.github.io.\n","authors":["Grace Luo","Lisa Dunlap","Dong Huk Park","Aleksander Holynski","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2305.14334v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2306.06189v2","updated":"2024-04-01T19:14:25Z","published":"2023-06-09T18:41:37Z","title":"FasterViT: Fast Vision Transformers with Hierarchical Attention","summary":" We design a new family of hybrid CNN-ViT neural networks, named FasterViT,\nwith a focus on high image throughput for computer vision (CV) applications.\nFasterViT combines the benefits of fast local representation learning in CNNs\nand global modeling properties in ViT. Our newly introduced Hierarchical\nAttention (HAT) approach decomposes global self-attention with quadratic\ncomplexity into a multi-level attention with reduced computational costs. We\nbenefit from efficient window-based self-attention. Each window has access to\ndedicated carrier tokens that participate in local and global representation\nlearning. At a high level, global self-attentions enable the efficient\ncross-window communication at lower costs. FasterViT achieves a SOTA\nPareto-front in terms of accuracy and image throughput. We have extensively\nvalidated its effectiveness on various CV tasks including classification,\nobject detection and segmentation. We also show that HAT can be used as a\nplug-and-play module for existing networks and enhance them. We further\ndemonstrate significantly faster and more accurate performance than competitive\ncounterparts for images with high resolution. Code is available at\nhttps://github.com/NVlabs/FasterViT.\n","authors":["Ali Hatamizadeh","Greg Heinrich","Hongxu Yin","Andrew Tao","Jose M. Alvarez","Jan Kautz","Pavlo Molchanov"],"pdf_url":"https://arxiv.org/pdf/2306.06189v2.pdf","comment":"ICLR'24 Accepted Paper"},{"id":"http://arxiv.org/abs/2404.01424v1","updated":"2024-04-01T18:59:13Z","published":"2024-04-01T18:59:13Z","title":"DPMesh: Exploiting Diffusion Prior for Occluded Human Mesh Recovery","summary":" The recovery of occluded human meshes presents challenges for current methods\ndue to the difficulty in extracting effective image features under severe\nocclusion. In this paper, we introduce DPMesh, an innovative framework for\noccluded human mesh recovery that capitalizes on the profound diffusion prior\nabout object structure and spatial relationships embedded in a pre-trained\ntext-to-image diffusion model. Unlike previous methods reliant on conventional\nbackbones for vanilla feature extraction, DPMesh seamlessly integrates the\npre-trained denoising U-Net with potent knowledge as its image backbone and\nperforms a single-step inference to provide occlusion-aware information. To\nenhance the perception capability for occluded poses, DPMesh incorporates\nwell-designed guidance via condition injection, which produces effective\ncontrols from 2D observations for the denoising U-Net. Furthermore, we explore\na dedicated noisy key-point reasoning approach to mitigate disturbances arising\nfrom occlusion and crowded scenarios. This strategy fully unleashes the\nperceptual capability of the diffusion prior, thereby enhancing accuracy.\nExtensive experiments affirm the efficacy of our framework, as we outperform\nstate-of-the-art methods on both occlusion-specific and standard datasets. 
The\npersuasive results underscore its ability to achieve precise and robust 3D\nhuman mesh recovery, particularly in challenging scenarios involving occlusion\nand crowded scenes.\n","authors":["Yixuan Zhu","Ao Li","Yansong Tang","Wenliang Zhao","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2404.01424v1.pdf","comment":"Accepted by IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR) 2024"},{"id":"http://arxiv.org/abs/2312.02139v2","updated":"2024-04-01T18:55:16Z","published":"2023-12-04T18:57:01Z","title":"DiffiT: Diffusion Vision Transformers for Image Generation","summary":" Diffusion models with their powerful expressivity and high sample quality\nhave achieved State-Of-The-Art (SOTA) performance in the generative domain. The\npioneering Vision Transformer (ViT) has also demonstrated strong modeling\ncapabilities and scalability, especially for recognition tasks. In this paper,\nwe study the effectiveness of ViTs in diffusion-based generative learning and\npropose a new model denoted as Diffusion Vision Transformers (DiffiT).\nSpecifically, we propose a methodology for finegrained control of the denoising\nprocess and introduce the Time-dependant Multihead Self Attention (TMSA)\nmechanism. DiffiT is surprisingly effective in generating high-fidelity images\nwith significantly better parameter efficiency. We also propose latent and\nimage space DiffiT models and show SOTA performance on a variety of\nclass-conditional and unconditional synthesis tasks at different resolutions.\nThe Latent DiffiT model achieves a new SOTA FID score of 1.73 on ImageNet-256\ndataset while having 19.85%, 16.88% less parameters than other\nTransformer-based diffusion models such as MDT and DiT, respectively. Code:\nhttps://github.com/NVlabs/DiffiT\n","authors":["Ali Hatamizadeh","Jiaming Song","Guilin Liu","Jan Kautz","Arash Vahdat"],"pdf_url":"https://arxiv.org/pdf/2312.02139v2.pdf","comment":"Revised Tech report"},{"id":"http://arxiv.org/abs/2404.01415v1","updated":"2024-04-01T18:41:30Z","published":"2024-04-01T18:41:30Z","title":"On the Faithfulness of Vision Transformer Explanations","summary":" To interpret Vision Transformers, post-hoc explanations assign salience\nscores to input pixels, providing human-understandable heatmaps. However,\nwhether these interpretations reflect true rationales behind the model's output\nis still underexplored. To address this gap, we study the faithfulness\ncriterion of explanations: the assigned salience scores should represent the\ninfluence of the corresponding input pixels on the model's predictions. To\nevaluate faithfulness, we introduce Salience-guided Faithfulness Coefficient\n(SaCo), a novel evaluation metric leveraging essential information of salience\ndistribution. Specifically, we conduct pair-wise comparisons among distinct\npixel groups and then aggregate the differences in their salience scores,\nresulting in a coefficient that indicates the explanation's degree of\nfaithfulness. Our explorations reveal that current metrics struggle to\ndifferentiate between advanced explanation methods and Random Attribution,\nthereby failing to capture the faithfulness property. In contrast, our proposed\nSaCo offers a reliable faithfulness measurement, establishing a robust metric\nfor interpretations. 
Furthermore, our SaCo demonstrates that the use of\ngradient and multi-layer aggregation can markedly enhance the faithfulness of\nattention-based explanation, shedding light on potential paths for advancing\nVision Transformer explainability.\n","authors":["Junyi Wu","Weitai Kang","Hao Tang","Yuan Hong","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2404.01415v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01409v1","updated":"2024-04-01T18:26:29Z","published":"2024-04-01T18:26:29Z","title":"OVFoodSeg: Elevating Open-Vocabulary Food Image Segmentation via\n Image-Informed Textual Representation","summary":" In the realm of food computing, segmenting ingredients from images poses\nsubstantial challenges due to the large intra-class variance among the same\ningredients, the emergence of new ingredients, and the high annotation costs\nassociated with large food segmentation datasets. Existing approaches primarily\nutilize a closed-vocabulary and static text embeddings setting. These methods\noften fall short in effectively handling the ingredients, particularly new and\ndiverse ones. In response to these limitations, we introduce OVFoodSeg, a\nframework that adopts an open-vocabulary setting and enhances text embeddings\nwith visual context. By integrating vision-language models (VLMs), our approach\nenriches text embedding with image-specific information through two innovative\nmodules, eg, an image-to-text learner FoodLearner and an Image-Informed Text\nEncoder. The training process of OVFoodSeg is divided into two stages: the\npre-training of FoodLearner and the subsequent learning phase for segmentation.\nThe pre-training phase equips FoodLearner with the capability to align visual\ninformation with corresponding textual representations that are specifically\nrelated to food, while the second phase adapts both the FoodLearner and the\nImage-Informed Text Encoder for the segmentation task. By addressing the\ndeficiencies of previous models, OVFoodSeg demonstrates a significant\nimprovement, achieving an 4.9\\% increase in mean Intersection over Union (mIoU)\non the FoodSeg103 dataset, setting a new milestone for food image segmentation.\n","authors":["Xiongwei Wu","Sicheng Yu","Ee-Peng Lim","Chong-Wah Ngo"],"pdf_url":"https://arxiv.org/pdf/2404.01409v1.pdf","comment":"CVPR 2024; 12 pages"},{"id":"http://arxiv.org/abs/2403.18807v3","updated":"2024-04-01T18:26:22Z","published":"2024-03-27T17:53:30Z","title":"ECoDepth: Effective Conditioning of Diffusion Models for Monocular Depth\n Estimation","summary":" In the absence of parallax cues, a learning-based single image depth\nestimation (SIDE) model relies heavily on shading and contextual cues in the\nimage. While this simplicity is attractive, it is necessary to train such\nmodels on large and varied datasets, which are difficult to capture. It has\nbeen shown that using embeddings from pre-trained foundational models, such as\nCLIP, improves zero shot transfer in several applications. Taking inspiration\nfrom this, in our paper we explore the use of global image priors generated\nfrom a pre-trained ViT model to provide more detailed contextual information.\nWe argue that the embedding vector from a ViT model, pre-trained on a large\ndataset, captures greater relevant information for SIDE than the usual route of\ngenerating pseudo image captions, followed by CLIP based text embeddings. Based\non this idea, we propose a new SIDE model using a diffusion backbone which is\nconditioned on ViT embeddings. 
Our proposed design establishes a new\nstate-of-the-art (SOTA) for SIDE on NYUv2 dataset, achieving Abs Rel error of\n0.059(14% improvement) compared to 0.069 by the current SOTA (VPD). And on\nKITTI dataset, achieving Sq Rel error of 0.139 (2% improvement) compared to\n0.142 by the current SOTA (GEDepth). For zero-shot transfer with a model\ntrained on NYUv2, we report mean relative improvement of (20%, 23%, 81%, 25%)\nover NeWCRFs on (Sun-RGBD, iBims1, DIODE, HyperSim) datasets, compared to (16%,\n18%, 45%, 9%) by ZoeDepth. The project page is available at\nhttps://ecodepth-iitd.github.io\n","authors":["Suraj Patni","Aradhye Agarwal","Chetan Arora"],"pdf_url":"https://arxiv.org/pdf/2403.18807v3.pdf","comment":"IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)\n 2024"},{"id":"http://arxiv.org/abs/2404.01402v1","updated":"2024-04-01T18:12:09Z","published":"2024-04-01T18:12:09Z","title":"ContactHandover: Contact-Guided Robot-to-Human Object Handover","summary":" Robot-to-human object handover is an important step in many human robot\ncollaboration tasks. A successful handover requires the robot to maintain a\nstable grasp on the object while making sure the human receives the object in a\nnatural and easy-to-use manner. We propose ContactHandover, a robot to human\nhandover system that consists of two phases: a contact-guided grasping phase\nand an object delivery phase. During the grasping phase, ContactHandover\npredicts both 6-DoF robot grasp poses and a 3D affordance map of human contact\npoints on the object. The robot grasp poses are reranked by penalizing those\nthat block human contact points, and the robot executes the highest ranking\ngrasp. During the delivery phase, the robot end effector pose is computed by\nmaximizing human contact points close to the human while minimizing the human\narm joint torques and displacements. We evaluate our system on 27 diverse\nhousehold objects and show that our system achieves better visibility and\nreachability of human contacts to the receiver compared to several baselines.\nMore results can be found on\nhttps://clairezixiwang.github.io/ContactHandover.github.io\n","authors":["Zixi Wang","Zeyi Liu","Nicolas Ouporov","Shuran Song"],"pdf_url":"https://arxiv.org/pdf/2404.01402v1.pdf","comment":"Project website:\n https://clairezixiwang.github.io/ContactHandover.github.io/"},{"id":"http://arxiv.org/abs/2308.00622v2","updated":"2024-04-01T18:10:28Z","published":"2023-08-01T15:49:40Z","title":"NeRT: Implicit Neural Representations for General Unsupervised\n Turbulence Mitigation","summary":" The atmospheric and water turbulence mitigation problems have emerged as\nchallenging inverse problems in computer vision and optics communities over the\nyears. However, current methods either rely heavily on the quality of the\ntraining dataset or fail to generalize over various scenarios, such as static\nscenes, dynamic scenes, and text reconstructions. We propose a general implicit\nneural representation for unsupervised atmospheric and water turbulence\nmitigation (NeRT). NeRT leverages the implicit neural representations and the\nphysically correct tilt-then-blur turbulence model to reconstruct the clean,\nundistorted image, given only dozens of distorted input images. 
Moreover, we\nshow that NeRT outperforms the state-of-the-art through various qualitative and\nquantitative evaluations of atmospheric and water turbulence datasets.\nFurthermore, we demonstrate the ability of NeRT to eliminate uncontrolled\nturbulence from real-world environments. Lastly, we incorporate NeRT into\ncontinuously captured video sequences and demonstrate $48 \\times$ speedup.\n","authors":["Weiyun Jiang","Yuhao Liu","Vivek Boominathan","Ashok Veeraraghavan"],"pdf_url":"https://arxiv.org/pdf/2308.00622v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19022v2","updated":"2024-04-01T18:09:49Z","published":"2024-03-27T21:24:20Z","title":"WALT3D: Generating Realistic Training Data from Time-Lapse Imagery for\n Reconstructing Dynamic Objects under Occlusion","summary":" Current methods for 2D and 3D object understanding struggle with severe\nocclusions in busy urban environments, partly due to the lack of large-scale\nlabeled ground-truth annotations for learning occlusion. In this work, we\nintroduce a novel framework for automatically generating a large, realistic\ndataset of dynamic objects under occlusions using freely available time-lapse\nimagery. By leveraging off-the-shelf 2D (bounding box, segmentation, keypoint)\nand 3D (pose, shape) predictions as pseudo-groundtruth, unoccluded 3D objects\nare identified automatically and composited into the background in a clip-art\nstyle, ensuring realistic appearances and physically accurate occlusion\nconfigurations. The resulting clip-art image with pseudo-groundtruth enables\nefficient training of object reconstruction methods that are robust to\nocclusions. Our method demonstrates significant improvements in both 2D and 3D\nreconstruction, particularly in scenarios with heavily occluded objects like\nvehicles and people in urban scenes.\n","authors":["Khiem Vuong","N. Dinesh Reddy","Robert Tamburo","Srinivasa G. Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2403.19022v2.pdf","comment":"To appear in CVPR 2024. Homepage: https://www.cs.cmu.edu/~walt3d"},{"id":"http://arxiv.org/abs/2404.01397v1","updated":"2024-04-01T18:08:58Z","published":"2024-04-01T18:08:58Z","title":"Object-conditioned Bag of Instances for Few-Shot Personalized Instance\n Recognition","summary":" Nowadays, users demand for increased personalization of vision systems to\nlocalize and identify personal instances of objects (e.g., my dog rather than\ndog) from a few-shot dataset only. Despite outstanding results of deep networks\non classical label-abundant benchmarks (e.g., those of the latest YOLOv8 model\nfor standard object detection), they struggle to maintain within-class\nvariability to represent different instances rather than object categories\nonly. We construct an Object-conditioned Bag of Instances (OBoI) based on\nmulti-order statistics of extracted features, where generic object detection\nmodels are extended to search and identify personal instances from the OBoI's\nmetric space, without need for backpropagation. By relying on multi-order\nstatistics, OBoI achieves consistent superior accuracy in distinguishing\ndifferent instances. In the results, we achieve 77.1% personal object\nrecognition accuracy in case of 18 personal instances, showing about 12%\nrelative gain over the state of the art.\n","authors":["Umberto Michieli","Jijoong Moon","Daehyun Kim","Mete Ozay"],"pdf_url":"https://arxiv.org/pdf/2404.01397v1.pdf","comment":"ICASSP 2024. Copyright 2024 IEEE. Personal use of this material is\n permitted. 
Permission from IEEE must be obtained for all other uses, in any\n current or future media, including reprinting/republishing this material for\n advertising or promotional purposes, creating new collective works, for\n resale or redistribution to servers or lists, or reuse of any copyrighted\n component of this work in other"},{"id":"http://arxiv.org/abs/2404.01300v1","updated":"2024-04-01T17:59:55Z","published":"2024-04-01T17:59:55Z","title":"NeRF-MAE : Masked AutoEncoders for Self Supervised 3D representation\n Learning for Neural Radiance Fields","summary":" Neural fields excel in computer vision and robotics due to their ability to\nunderstand the 3D visual world such as inferring semantics, geometry, and\ndynamics. Given the capabilities of neural fields in densely representing a 3D\nscene from 2D images, we ask the question: Can we scale their self-supervised\npretraining, specifically using masked autoencoders, to generate effective 3D\nrepresentations from posed RGB images. Owing to the astounding success of\nextending transformers to novel data modalities, we employ standard 3D Vision\nTransformers to suit the unique formulation of NeRFs. We leverage NeRF's\nvolumetric grid as a dense input to the transformer, contrasting it with other\n3D representations such as pointclouds where the information density can be\nuneven, and the representation is irregular. Due to the difficulty of applying\nmasked autoencoders to an implicit representation, such as NeRF, we opt for\nextracting an explicit representation that canonicalizes scenes across domains\nby employing the camera trajectory for sampling. Our goal is made possible by\nmasking random patches from NeRF's radiance and density grid and employing a\nstandard 3D Swin Transformer to reconstruct the masked patches. In doing so,\nthe model can learn the semantic and spatial structure of complete scenes. We\npretrain this representation at scale on our proposed curated posed-RGB data,\ntotaling over 1.6 million images. Once pretrained, the encoder is used for\neffective 3D transfer learning. Our novel self-supervised pretraining for\nNeRFs, NeRF-MAE, scales remarkably well and improves performance on various\nchallenging 3D tasks. Utilizing unlabeled posed 2D data for pretraining,\nNeRF-MAE significantly outperforms self-supervised 3D pretraining and NeRF\nscene understanding baselines on Front3D and ScanNet datasets with an absolute\nperformance improvement of over 20% AP50 and 8% AP25 for 3D object detection.\n","authors":["Muhammad Zubair Irshad","Sergey Zakahrov","Vitor Guizilini","Adrien Gaidon","Zsolt Kira","Rares Ambrus"],"pdf_url":"https://arxiv.org/pdf/2404.01300v1.pdf","comment":"29 pages, 13 figures. Project Page: https://nerf-mae.github.io/"},{"id":"http://arxiv.org/abs/2404.01298v1","updated":"2024-04-01T17:59:53Z","published":"2024-04-01T17:59:53Z","title":"Noise2Image: Noise-Enabled Static Scene Recovery for Event Cameras","summary":" Event cameras capture changes of intensity over time as a stream of 'events'\nand generally cannot measure intensity itself; hence, they are only used for\nimaging dynamic scenes. However, fluctuations due to random photon arrival\ninevitably trigger noise events, even for static scenes. While previous efforts\nhave been focused on filtering out these undesirable noise events to improve\nsignal quality, we find that, in the photon-noise regime, these noise events\nare correlated with the static scene intensity. 
We analyze the noise event\ngeneration and model its relationship to illuminance. Based on this\nunderstanding, we propose a method, called Noise2Image, to leverage the\nilluminance-dependent noise characteristics to recover the static parts of a\nscene, which are otherwise invisible to event cameras. We experimentally\ncollect a dataset of noise events on static scenes to train and validate\nNoise2Image. Our results show that Noise2Image can robustly recover intensity\nimages solely from noise events, providing a novel approach for capturing\nstatic scenes in event cameras, without additional hardware.\n","authors":["Ruiming Cao","Dekel Galor","Amit Kohli","Jacob L Yates","Laura Waller"],"pdf_url":"https://arxiv.org/pdf/2404.01298v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01299v1","updated":"2024-04-01T17:59:53Z","published":"2024-04-01T17:59:53Z","title":"CausalChaos! Dataset for Comprehensive Causal Action Question Answering\n Over Longer Causal Chains Grounded in Dynamic Visual Scenes","summary":" Causal video question answering (QA) has garnered increasing interest, yet\nexisting datasets often lack depth in causal reasoning analysis. To address\nthis gap, we capitalize on the unique properties of cartoons and construct\nCausalChaos!, a novel, challenging causal Why-QA dataset built upon the iconic\n\"Tom and Jerry\" cartoon series. With thoughtful questions and multi-level\nanswers, our dataset contains much longer causal chains embedded in dynamic\ninteractions and visuals, at the same time principles of animation allows\nanimators to create well-defined, unambiguous causal relationships. These\nfactors allow models to solve more challenging, yet well-defined causal\nrelationships. We also introduce hard negative mining, including\nCausalConfusion version. While models perform well, there is much room for\nimprovement, especially, on open-ended answers. We identify more\nadvanced/explicit causal relationship modeling and joint modeling of vision and\nlanguage as the immediate areas for future efforts to focus upon. Along with\nthe other complementary datasets, our new challenging dataset will pave the way\nfor these developments in the field. We will release our dataset, codes, and\nmodels to help future efforts in this domain.\n","authors":["Ting En Lam","Yuhan Chen","Elston Tan","Eric Peh","Ruirui Chen","Paritosh Parmar","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2404.01299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00610v2","updated":"2024-04-01T17:59:52Z","published":"2023-09-01T17:57:02Z","title":"CityDreamer: Compositional Generative Model of Unbounded 3D Cities","summary":" 3D city generation is a desirable yet challenging task, since humans are more\nsensitive to structural distortions in urban environments. Additionally,\ngenerating 3D cities is more complex than 3D natural scenes since buildings, as\nobjects of the same class, exhibit a wider range of appearances compared to the\nrelatively consistent appearance of objects like trees in natural scenes. To\naddress these challenges, we propose \\textbf{CityDreamer}, a compositional\ngenerative model designed specifically for unbounded 3D cities. Our key insight\nis that 3D city generation should be a composition of different types of neural\nfields: 1) various building instances, and 2) background stuff, such as roads\nand green lands. 
Specifically, we adopt the bird's eye view scene\nrepresentation and employ a volumetric renderer for both instance-oriented and\nstuff-oriented neural fields. The generative hash grid and periodic positional\nembedding are tailored as scene parameterization to suit the distinct\ncharacteristics of building instances and background stuff. Furthermore, we\ncontribute a suite of CityGen Datasets, including OSM and GoogleEarth, which\ncomprises a vast amount of real-world city imagery to enhance the realism of\nthe generated 3D cities both in their layouts and appearances. CityDreamer\nachieves state-of-the-art performance not only in generating realistic 3D\ncities but also in localized editing within the generated cities.\n","authors":["Haozhe Xie","Zhaoxi Chen","Fangzhou Hong","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2309.00610v2.pdf","comment":"CVPR 2024. Project page: https://haozhexie.com/project/city-dreamer"},{"id":"http://arxiv.org/abs/2404.01367v1","updated":"2024-04-01T17:59:48Z","published":"2024-04-01T17:59:48Z","title":"Bigger is not Always Better: Scaling Properties of Latent Diffusion\n Models","summary":" We study the scaling properties of latent diffusion models (LDMs) with an\nemphasis on their sampling efficiency. While improved network architecture and\ninference algorithms have been shown to effectively boost sampling efficiency of\ndiffusion models, the role of model size -- a critical determinant of sampling\nefficiency -- has not been thoroughly examined. Through empirical analysis of\nestablished text-to-image diffusion models, we conduct an in-depth\ninvestigation into how model size influences sampling efficiency across varying\nsampling steps. Our findings unveil a surprising trend: when operating under a\ngiven inference budget, smaller models frequently outperform their larger\nequivalents in generating high-quality results. Moreover, we extend our study\nto demonstrate the generalizability of these findings by applying various\ndiffusion samplers, exploring diverse downstream tasks, evaluating\npost-distilled models, as well as comparing performance relative to training\ncompute. These findings open up new pathways for the development of LDM scaling\nstrategies which can be employed to enhance generative capabilities within\nlimited inference budgets.\n","authors":["Kangfu Mei","Zhengzhong Tu","Mauricio Delbracio","Hossein Talebi","Vishal M. Patel","Peyman Milanfar"],"pdf_url":"https://arxiv.org/pdf/2404.01367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01297v1","updated":"2024-04-01T17:59:15Z","published":"2024-04-01T17:59:15Z","title":"Streaming Dense Video Captioning","summary":" An ideal model for dense video captioning -- predicting captions localized\ntemporally in a video -- should be able to handle long input videos, predict\nrich, detailed textual descriptions, and be able to produce outputs before\nprocessing the entire video. Current state-of-the-art models, however, process\na fixed number of downsampled frames, and make a single full prediction after\nseeing the whole video. We propose a streaming dense video captioning model\nthat consists of two novel components: First, we propose a new memory module,\nbased on clustering incoming tokens, which can handle arbitrarily long videos\nas the memory is of a fixed size. Second, we develop a streaming decoding\nalgorithm that enables our model to make predictions before the entire video\nhas been processed. 
Our model achieves this streaming ability, and\nsignificantly improves the state-of-the-art on three dense video captioning\nbenchmarks: ActivityNet, YouCook2 and ViTT. Our code is released at\nhttps://github.com/google-research/scenic.\n","authors":["Xingyi Zhou","Anurag Arnab","Shyamal Buch","Shen Yan","Austin Myers","Xuehan Xiong","Arsha Nagrani","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2404.01297v1.pdf","comment":"CVPR 2024. Code is available at\n https://github.com/google-research/scenic/tree/main/scenic/projects/streaming_dvc"},{"id":"http://arxiv.org/abs/2404.01296v1","updated":"2024-04-01T17:59:11Z","published":"2024-04-01T17:59:11Z","title":"MagicMirror: Fast and High-Quality Avatar Generation with a Constrained\n Search Space","summary":" We introduce a novel framework for 3D human avatar generation and\npersonalization, leveraging text prompts to enhance user engagement and\ncustomization. Central to our approach are key innovations aimed at overcoming\nthe challenges in photo-realistic avatar synthesis. Firstly, we utilize a\nconditional Neural Radiance Fields (NeRF) model, trained on a large-scale\nunannotated multi-view dataset, to create a versatile initial solution space\nthat accelerates and diversifies avatar generation. Secondly, we develop a\ngeometric prior, leveraging the capabilities of Text-to-Image Diffusion Models,\nto ensure superior view invariance and enable direct optimization of avatar\ngeometry. These foundational ideas are complemented by our optimization\npipeline built on Variational Score Distillation (VSD), which mitigates texture\nloss and over-saturation issues. As supported by our extensive experiments,\nthese strategies collectively enable the creation of custom avatars with\nunparalleled visual quality and better adherence to input text prompts. You can\nfind more results and videos in our website:\nhttps://syntec-research.github.io/MagicMirror\n","authors":["Armand Comas-Massagué","Di Qiu","Menglei Chai","Marcel Bühler","Amit Raj","Ruiqi Gao","Qiangeng Xu","Mark Matthews","Paulo Gotardo","Octavia Camps","Sergio Orts-Escolano","Thabo Beeler"],"pdf_url":"https://arxiv.org/pdf/2404.01296v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01294v1","updated":"2024-04-01T17:59:05Z","published":"2024-04-01T17:59:05Z","title":"CosmicMan: A Text-to-Image Foundation Model for Humans","summary":" We present CosmicMan, a text-to-image foundation model specialized for\ngenerating high-fidelity human images. Unlike current general-purpose\nfoundation models that are stuck in the dilemma of inferior quality and\ntext-image misalignment for humans, CosmicMan enables generating\nphoto-realistic human images with meticulous appearance, reasonable structure,\nand precise text-image alignment with detailed dense descriptions. At the heart\nof CosmicMan's success are the new reflections and perspectives on data and\nmodels: (1) We found that data quality and a scalable data production flow are\nessential for the final results from trained models. Hence, we propose a new\ndata production paradigm, Annotate Anyone, which serves as a perpetual data\nflywheel to produce high-quality data with accurate yet cost-effective\nannotations over time. Based on this, we constructed a large-scale dataset,\nCosmicMan-HQ 1.0, with 6 Million high-quality real-world human images in a mean\nresolution of 1488x1255, and attached with precise text annotations deriving\nfrom 115 Million attributes in diverse granularities. 
(2) We argue that a\ntext-to-image foundation model specialized for humans must be pragmatic -- easy\nto integrate into downstream tasks while effective in producing\nhigh-quality human images. Hence, we propose to model the relationship between\ndense text descriptions and image pixels in a decomposed manner, and present the\nDecomposed-Attention-Refocusing (Daring) training framework. It seamlessly\ndecomposes the cross-attention features in existing text-to-image diffusion\nmodels, and enforces attention refocusing without adding extra modules. Through\nDaring, we show that explicitly discretizing continuous text space into several\nbasic groups that align with human body structure is the key to tackling the\nmisalignment problem in a breeze.\n","authors":["Shikai Li","Jianglin Fu","Kaiyuan Liu","Wentao Wang","Kwan-Yee Lin","Wayne Wu"],"pdf_url":"https://arxiv.org/pdf/2404.01294v1.pdf","comment":"Accepted by CVPR 2024. The supplementary material is included.\n Project Page: https://cosmicman-cvpr2024.github.io"},{"id":"http://arxiv.org/abs/2404.01292v1","updated":"2024-04-01T17:58:30Z","published":"2024-04-01T17:58:30Z","title":"Measuring Style Similarity in Diffusion Models","summary":" Generative models are now widely used by graphic designers and artists. Prior\nworks have shown that these models remember and often replicate content from\ntheir training data during generation. Hence as their proliferation increases,\nit has become important to perform a database search to determine whether the\nproperties of the image are attributable to specific training data, every time\nbefore a generated image is used for professional purposes. Existing tools for\nthis purpose focus on retrieving images of similar semantic content. Meanwhile,\nmany artists are concerned with style replication in text-to-image models. We\npresent a framework for understanding and extracting style descriptors from\nimages. Our framework comprises a new dataset curated using the insight that\nstyle is a subjective property of an image that captures complex yet meaningful\ninteractions of factors including but not limited to colors, textures, shapes,\netc. We also propose a method to extract style descriptors that can be used to\nattribute style of a generated image to the images used in the training dataset\nof a text-to-image model. We showcase promising results in various style\nretrieval tasks. We also quantitatively and qualitatively analyze style\nattribution and matching in the Stable Diffusion model. Code and artifacts are\navailable at https://github.com/learn2phoenix/CSD.\n","authors":["Gowthami Somepalli","Anubhav Gupta","Kamal Gupta","Shramay Palta","Micah Goldblum","Jonas Geiping","Abhinav Shrivastava","Tom Goldstein"],"pdf_url":"https://arxiv.org/pdf/2404.01292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01291v1","updated":"2024-04-01T17:58:06Z","published":"2024-04-01T17:58:06Z","title":"Evaluating Text-to-Visual Generation with Image-to-Text Generation","summary":" Despite significant progress in generative AI, comprehensive evaluation\nremains challenging because of the lack of effective metrics and standardized\nbenchmarks. For instance, the widely-used CLIPScore measures the alignment\nbetween a (generated) image and text prompt, but it fails to produce reliable\nscores for complex prompts involving compositions of objects, attributes, and\nrelations. 
One reason is that text encoders of CLIP can notoriously act as a\n\"bag of words\", conflating prompts such as \"the horse is eating the grass\" with\n\"the grass is eating the horse\". To address this, we introduce the VQAScore,\nwhich uses a visual-question-answering (VQA) model to produce an alignment\nscore by computing the probability of a \"Yes\" answer to a simple \"Does this\nfigure show '{text}'?\" question. Though simpler than prior art, VQAScore\ncomputed with off-the-shelf models produces state-of-the-art results across\nmany (8) image-text alignment benchmarks. We also compute VQAScore with an\nin-house model that follows best practices in the literature. For example, we\nuse a bidirectional image-question encoder that allows image embeddings to\ndepend on the question being asked (and vice versa). Our in-house model,\nCLIP-FlanT5, outperforms even the strongest baselines that make use of the\nproprietary GPT-4V. Interestingly, although we train with only images, VQAScore\ncan also align text with video and 3D models. VQAScore allows researchers to\nbenchmark text-to-visual generation using complex texts that capture the\ncompositional structure of real-world prompts. We introduce GenAI-Bench, a more\nchallenging benchmark with 1,600 compositional text prompts that require\nparsing scenes, objects, attributes, relationships, and high-order reasoning\nlike comparison and logic. GenAI-Bench also offers over 15,000 human ratings\nfor leading image and video generation models such as Stable Diffusion, DALL-E\n3, and Gen2.\n","authors":["Zhiqiu Lin","Deepak Pathak","Baiqi Li","Jiayao Li","Xide Xia","Graham Neubig","Pengchuan Zhang","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2404.01291v1.pdf","comment":"We open-source our data, model, and code at:\n https://github.com/linzhiqiu/t2v_metrics ; Project page:\n https://linzhiqiu.github.io/papers/vqascore"},{"id":"http://arxiv.org/abs/2403.13802v2","updated":"2024-04-01T17:58:02Z","published":"2024-03-20T17:59:14Z","title":"ZigMa: A DiT-style Zigzag Mamba Diffusion Model","summary":" The diffusion model has long been plagued by scalability and quadratic\ncomplexity issues, especially within transformer-based structures. In this\nstudy, we aim to leverage the long sequence modeling capability of a\nState-Space Model called Mamba to extend its applicability to visual data\ngeneration. Firstly, we identify a critical oversight in most current\nMamba-based vision methods, namely the lack of consideration for spatial\ncontinuity in the scan scheme of Mamba. Secondly, building upon this insight,\nwe introduce a simple, plug-and-play, zero-parameter method named Zigzag Mamba,\nwhich outperforms Mamba-based baselines and demonstrates improved speed and\nmemory utilization compared to transformer-based baselines. Lastly, we\nintegrate Zigzag Mamba with the Stochastic Interpolant framework to investigate\nthe scalability of the model on large-resolution visual datasets, such as\nFacesHQ $1024\\times 1024$ and UCF101, MultiModal-CelebA-HQ, and MS COCO\n$256\\times 256$ . 
Code will be released at https://taohu.me/zigma/\n","authors":["Vincent Tao Hu","Stefan Andreas Baumann","Ming Gui","Olga Grebenkova","Pingchuan Ma","Johannes Fischer","Björn Ommer"],"pdf_url":"https://arxiv.org/pdf/2403.13802v2.pdf","comment":"Project Page: https://taohu.me/zigma/"},{"id":"http://arxiv.org/abs/2404.01284v1","updated":"2024-04-01T17:55:11Z","published":"2024-04-01T17:55:11Z","title":"Large Motion Model for Unified Multi-Modal Motion Generation","summary":" Human motion generation, a cornerstone technique in animation and video\nproduction, has widespread applications in various tasks like text-to-motion\nand music-to-dance. Previous works focus on developing specialist models\ntailored for each task without scalability. In this work, we present Large\nMotion Model (LMM), a motion-centric, multi-modal framework that unifies\nmainstream motion generation tasks into a generalist model. A unified motion\nmodel is appealing since it can leverage a wide range of motion data to achieve\nbroad generalization beyond a single task. However, it is also challenging due\nto the heterogeneous nature of substantially different motion data and tasks.\nLMM tackles these challenges from three principled aspects: 1) Data: We\nconsolidate datasets with different modalities, formats and tasks into a\ncomprehensive yet unified motion generation dataset, MotionVerse, comprising 10\ntasks, 16 datasets, a total of 320k sequences, and 100 million frames. 2)\nArchitecture: We design an articulated attention mechanism ArtAttention that\nincorporates body part-aware modeling into Diffusion Transformer backbone. 3)\nPre-Training: We propose a novel pre-training strategy for LMM, which employs\nvariable frame rates and masking forms, to better exploit knowledge from\ndiverse training data. Extensive experiments demonstrate that our generalist\nLMM achieves competitive performance across various standard motion generation\ntasks over state-of-the-art specialist models. Notably, LMM exhibits strong\ngeneralization capabilities and emerging properties across many unseen tasks.\nAdditionally, our ablation studies reveal valuable insights about training and\nscaling up large motion models for future research.\n","authors":["Mingyuan Zhang","Daisheng Jin","Chenyang Gu","Fangzhou Hong","Zhongang Cai","Jingfang Huang","Chongzhi Zhang","Xinying Guo","Lei Yang","Ying He","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2404.01284v1.pdf","comment":"Homepage: https://mingyuan-zhang.github.io/projects/LMM.html"},{"id":"http://arxiv.org/abs/2404.01282v1","updated":"2024-04-01T17:54:34Z","published":"2024-04-01T17:54:34Z","title":"LoSA: Long-Short-range Adapter for Scaling End-to-End Temporal Action\n Localization","summary":" Temporal Action Localization (TAL) involves localizing and classifying action\nsnippets in an untrimmed video. The emergence of large video foundation models\nhas led RGB-only video backbones to outperform previous methods needing both\nRGB and optical flow modalities. Leveraging these large models is often limited\nto training only the TAL head due to the prohibitively large GPU memory\nrequired to adapt the video backbone for TAL. To overcome this limitation, we\nintroduce LoSA, the first memory-and-parameter-efficient backbone adapter\ndesigned specifically for TAL to handle untrimmed videos. LoSA specializes for\nTAL by introducing Long-Short-range Adapters that adapt the intermediate layers\nof the video backbone over different temporal ranges. 
These adapters run\nparallel to the video backbone to significantly reduce memory footprint. LoSA\nalso includes Long-Short-range Fusion that strategically combines the output of\nthese adapters from the video backbone layers to enhance the video features\nprovided to the TAL head. Experiments show that LoSA significantly outperforms\nall existing methods on standard TAL benchmarks, THUMOS-14 and\nActivityNet-v1.3, by scaling end-to-end backbone adaptation to\nbillion-parameter-plus models like VideoMAEv2~(ViT-g) and leveraging them\nbeyond head-only transfer learning.\n","authors":["Akshita Gupta","Gaurav Mittal","Ahmed Magooda","Ye Yu","Graham W. Taylor","Mei Chen"],"pdf_url":"https://arxiv.org/pdf/2404.01282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01278v1","updated":"2024-04-01T17:52:17Z","published":"2024-04-01T17:52:17Z","title":"BiPer: Binary Neural Networks using a Periodic Function","summary":" Quantized neural networks employ reduced precision representations for both\nweights and activations. This quantization process significantly reduces the\nmemory requirements and computational complexity of the network. Binary Neural\nNetworks (BNNs) are the extreme quantization case, representing values with\njust one bit. Since the sign function is typically used to map real values to\nbinary values, smooth approximations are introduced to mimic the gradients\nduring error backpropagation. Thus, the mismatch between the forward and\nbackward models corrupts the direction of the gradient, causing training\ninconsistency problems and performance degradation. In contrast to current BNN\napproaches, we propose to employ a binary periodic (BiPer) function during\nbinarization. Specifically, we use a square wave for the forward pass to obtain\nthe binary values and employ the trigonometric sine function with the same\nperiod of the square wave as a differentiable surrogate during the backward\npass. We demonstrate that this approach can control the quantization error by\nusing the frequency of the periodic function and improves network performance.\nExtensive experiments validate the effectiveness of BiPer in benchmark datasets\nand network architectures, with improvements of up to 1% and 0.69% with respect\nto state-of-the-art methods in the classification task over CIFAR-10 and\nImageNet, respectively. Our code is publicly available at\nhttps://github.com/edmav4/BiPer.\n","authors":["Edwin Vargas","Claudia Correa","Carlos Hinojosa","Henry Arguello"],"pdf_url":"https://arxiv.org/pdf/2404.01278v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13549v2","updated":"2024-04-01T17:51:54Z","published":"2023-06-23T15:21:52Z","title":"A Survey on Multimodal Large Language Models","summary":" Recently, Multimodal Large Language Model (MLLM) represented by GPT-4V has\nbeen a new rising research hotspot, which uses powerful Large Language Models\n(LLMs) as a brain to perform multimodal tasks. The surprising emergent\ncapabilities of MLLM, such as writing stories based on images and OCR-free math\nreasoning, are rare in traditional multimodal methods, suggesting a potential\npath to artificial general intelligence. To this end, both academia and\nindustry have endeavored to develop MLLMs that can compete with or even better\nthan GPT-4V, pushing the limit of research at a surprising speed. In this\npaper, we aim to trace and summarize the recent progress of MLLMs. 
First of\nall, we present the basic formulation of MLLM and delineate its related\nconcepts, including architecture, training strategy and data, as well as\nevaluation. Then, we introduce research topics about how MLLMs can be extended\nto support more granularity, modalities, languages, and scenarios. We continue\nwith multimodal hallucination and extended techniques, including Multimodal ICL\n(M-ICL), Multimodal CoT (M-CoT), and LLM-Aided Visual Reasoning (LAVR). To\nconclude the paper, we discuss existing challenges and point out promising\nresearch directions. In light of the fact that the era of MLLM has only just\nbegun, we will keep updating this survey and hope it can inspire more research.\nAn associated GitHub link collecting the latest papers is available at\nhttps://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models.\n","authors":["Shukang Yin","Chaoyou Fu","Sirui Zhao","Ke Li","Xing Sun","Tong Xu","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2306.13549v2.pdf","comment":"Project\n page:https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models"},{"id":"http://arxiv.org/abs/2404.01272v1","updated":"2024-04-01T17:48:15Z","published":"2024-04-01T17:48:15Z","title":"Language Guided Domain Generalized Medical Image Segmentation","summary":" Single source domain generalization (SDG) holds promise for more reliable and\nconsistent image segmentation across real-world clinical settings particularly\nin the medical domain, where data privacy and acquisition cost constraints\noften limit the availability of diverse datasets. Depending solely on visual\nfeatures hampers the model's capacity to adapt effectively to various domains,\nprimarily because of the presence of spurious correlations and domain-specific\ncharacteristics embedded within the image features. Incorporating text features\nalongside visual features is a potential solution to enhance the model's\nunderstanding of the data, as it goes beyond pixel-level information to provide\nvaluable context. Textual cues describing the anatomical structures, their\nappearances, and variations across various imaging modalities can guide the\nmodel in domain adaptation, ultimately contributing to more robust and\nconsistent segmentation. In this paper, we propose an approach that explicitly\nleverages textual information by incorporating a contrastive learning mechanism\nguided by the text encoder features to learn a more robust feature\nrepresentation. We assess the effectiveness of our text-guided contrastive\nfeature alignment technique in various scenarios, including cross-modality,\ncross-sequence, and cross-site settings for different segmentation tasks. Our\napproach achieves favorable performance against existing methods in literature.\nOur code and model weights are available at\nhttps://github.com/ShahinaKK/LG_SDG.git.\n","authors":["Shahina Kunhimon","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2404.01272v1.pdf","comment":"Accepted at ISBI2024"},{"id":"http://arxiv.org/abs/2310.01393v3","updated":"2024-04-01T17:40:59Z","published":"2023-10-02T17:52:24Z","title":"DST-Det: Simple Dynamic Self-Training for Open-Vocabulary Object\n Detection","summary":" Open-vocabulary object detection (OVOD) aims to detect the objects beyond the\nset of classes observed during training. This work introduces a straightforward\nand efficient strategy that utilizes pre-trained vision-language models (VLM),\nlike CLIP, to identify potential novel classes through zero-shot\nclassification. 
Previous methods use a class-agnostic region proposal network\nto detect object proposals and consider the proposals that do not match the\nground truth as background. Unlike these methods, our method selects a\nsubset of proposals that would otherwise be considered as background during the training.\nThen, we treat them as novel classes during training. We refer to this approach\nas the self-training strategy, which enhances recall and accuracy for novel\nclasses without requiring extra annotations, datasets, and re-training.\nCompared to previous pseudo-labeling methods, our approach does not require re-training\nand offline labeling processing, which is more efficient and effective in\none-shot training. Empirical evaluations on three datasets, including LVIS,\nV3Det, and COCO, demonstrate significant improvements over the baseline\nperformance without incurring additional parameters or computational costs\nduring inference. In addition, we also apply our method to various baselines.\nIn particular, compared with the previous method, F-VLM, our method achieves a\n1.7% improvement on the LVIS dataset. Combined with the recent method CLIPSelf,\nour method also achieves 46.7 novel class AP on COCO without introducing extra\ndata for pretraining. We also achieve over 6.5% improvement over the F-VLM\nbaseline in the recent challenging V3Det dataset. We release our code and\nmodels at https://github.com/xushilin1/dst-det.\n","authors":["Shilin Xu","Xiangtai Li","Size Wu","Wenwei Zhang","Yunhai Tong","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2310.01393v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01990v1","updated":"2024-04-01T17:38:25Z","published":"2024-04-01T17:38:25Z","title":"What is Point Supervision Worth in Video Instance Segmentation?","summary":" Video instance segmentation (VIS) is a challenging vision task that aims to\ndetect, segment, and track objects in videos. Conventional VIS methods rely on\ndensely-annotated object masks which are expensive. We reduce the human\nannotations to only one point for each object in a video frame during training,\nand obtain high-quality mask predictions close to fully supervised models. Our\nproposed training method consists of a class-agnostic proposal generation\nmodule to provide rich negative samples and a spatio-temporal point-based\nmatcher to match the object queries with the provided point annotations.\nComprehensive experiments on three VIS benchmarks demonstrate competitive\nperformance of the proposed framework, nearly matching fully supervised\nmethods.\n","authors":["Shuaiyi Huang","De-An Huang","Zhiding Yu","Shiyi Lan","Subhashree Radhakrishnan","Jose M. Alvarez","Abhinav Shrivastava","Anima Anandkumar"],"pdf_url":"https://arxiv.org/pdf/2404.01990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11696v5","updated":"2024-04-01T17:34:34Z","published":"2023-08-22T17:59:30Z","title":"Efficient Benchmarking of Language Models","summary":" The increasing versatility of language models (LMs) has given rise to a new\nclass of benchmarks that comprehensively assess a broad range of capabilities.\nSuch benchmarks are associated with massive computational costs, extending to\nthousands of GPU hours per model. However, the efficiency aspect of these\nevaluation efforts has received little discussion in the literature. In this\nwork, we present the problem of Efficient Benchmarking, namely, intelligently\nreducing the computation costs of LM evaluation without compromising\nreliability. 
Using the HELM benchmark as a test case, we investigate how\ndifferent benchmark design choices affect the computation-reliability\ntrade-off. We propose to evaluate the reliability of such decisions, by using a\nnew measure -- Decision Impact on Reliability, DIoR for short. We find, for\nexample, that a benchmark leader may change by merely removing a low-ranked\nmodel from the benchmark, and observe that a correct benchmark ranking can be\nobtained by considering only a fraction of the evaluation examples. Based on\nour findings, we outline a set of concrete recommendations for efficient\nbenchmark design and utilization practices. To take a step further, we use our\nfindings to propose an evaluation algorithm, that, when applied to the HELM\nbenchmark, leads to dramatic cost savings with minimal loss of benchmark\nreliability, often reducing computation by x100 or more.\n","authors":["Yotam Perlitz","Elron Bandel","Ariel Gera","Ofir Arviv","Liat Ein-Dor","Eyal Shnarch","Noam Slonim","Michal Shmueli-Scheuer","Leshem Choshen"],"pdf_url":"https://arxiv.org/pdf/2308.11696v5.pdf","comment":"Accepted to NAACL main track"},{"id":"http://arxiv.org/abs/2404.01260v1","updated":"2024-04-01T17:30:56Z","published":"2024-04-01T17:30:56Z","title":"Bridging Remote Sensors with Multisensor Geospatial Foundation Models","summary":" In the realm of geospatial analysis, the diversity of remote sensors,\nencompassing both optical and microwave technologies, offers a wealth of\ndistinct observational capabilities. Recognizing this, we present msGFM, a\nmultisensor geospatial foundation model that effectively unifies data from four\nkey sensor modalities. This integration spans an expansive dataset of two\nmillion multisensor images. msGFM is uniquely adept at handling both paired and\nunpaired sensor data. For data originating from identical geolocations, our\nmodel employs an innovative cross-sensor pretraining approach in masked image\nmodeling, enabling the synthesis of joint representations from diverse sensors.\nmsGFM, incorporating four remote sensors, upholds strong performance, forming a\ncomprehensive model adaptable to various sensor types. msGFM has demonstrated\nenhanced proficiency in a range of both single-sensor and multisensor\ndownstream tasks. These include scene classification, segmentation, cloud\nremoval, and pan-sharpening. A key discovery of our research is that\nrepresentations derived from natural images are not always compatible with the\ndistinct characteristics of geospatial remote sensors, underscoring the\nlimitations of existing representations in this field. Our work can serve as a\nguide for developing multisensor geospatial pretraining models, paving the way\nfor more advanced geospatial capabilities.\n","authors":["Boran Han","Shuai Zhang","Xingjian Shi","Markus Reichstein"],"pdf_url":"https://arxiv.org/pdf/2404.01260v1.pdf","comment":"Accepted to CVPR"},{"id":"http://arxiv.org/abs/2310.00031v3","updated":"2024-04-01T17:27:12Z","published":"2023-09-29T05:16:41Z","title":"Text-image Alignment for Diffusion-based Perception","summary":" Diffusion models are generative models with impressive text-to-image\nsynthesis capabilities and have spurred a new wave of creative methods for\nclassical machine learning tasks. However, the best way to harness the\nperceptual knowledge of these generative models for visual tasks is still an\nopen question. Specifically, it is unclear how to use the prompting interface\nwhen applying diffusion backbones to vision tasks. 
We find that automatically\ngenerated captions can improve text-image alignment and significantly enhance a\nmodel's cross-attention maps, leading to better perceptual performance. Our\napproach improves upon the current state-of-the-art (SOTA) in diffusion-based\nsemantic segmentation on ADE20K and the current overall SOTA for depth\nestimation on NYUv2. Furthermore, our method generalizes to the cross-domain\nsetting. We use model personalization and caption modifications to align our\nmodel to the target domain and find improvements over unaligned baselines. Our\ncross-domain object detection model, trained on Pascal VOC, achieves SOTA\nresults on Watercolor2K. Our cross-domain segmentation method, trained on\nCityscapes, achieves SOTA results on Dark Zurich-val and Nighttime Driving.\nProject page: https://www.vision.caltech.edu/tadp/. Code:\nhttps://github.com/damaggu/TADP.\n","authors":["Neehar Kondapaneni","Markus Marks","Manuel Knott","Rogerio Guimaraes","Pietro Perona"],"pdf_url":"https://arxiv.org/pdf/2310.00031v3.pdf","comment":"Project page: https://www.vision.caltech.edu/tadp/, Code page:\n github.com/damaggu/TADP"},{"id":"http://arxiv.org/abs/2212.00210v3","updated":"2024-04-01T17:19:02Z","published":"2022-12-01T01:39:28Z","title":"Shape-Guided Diffusion with Inside-Outside Attention","summary":" We introduce precise object silhouette as a new form of user control in\ntext-to-image diffusion models, which we dub Shape-Guided Diffusion. Our\ntraining-free method uses an Inside-Outside Attention mechanism during the\ninversion and generation process to apply a shape constraint to the cross- and\nself-attention maps. Our mechanism designates which spatial region is the\nobject (inside) vs. background (outside) then associates edits to the correct\nregion. We demonstrate the efficacy of our method on the shape-guided editing\ntask, where the model must replace an object according to a text prompt and\nobject mask. We curate a new ShapePrompts benchmark derived from MS-COCO and\nachieve SOTA results in shape faithfulness without a degradation in text\nalignment or image realism according to both automatic metrics and annotator\nratings. Our data and code will be made available at\nhttps://shape-guided-diffusion.github.io.\n","authors":["Dong Huk Park","Grace Luo","Clayton Toste","Samaneh Azadi","Xihui Liu","Maka Karalashvili","Anna Rohrbach","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2212.00210v3.pdf","comment":"WACV 2024"},{"id":"http://arxiv.org/abs/2404.01249v1","updated":"2024-04-01T17:12:47Z","published":"2024-04-01T17:12:47Z","title":"FireANTs: Adaptive Riemannian Optimization for Multi-Scale Diffeomorphic\n Registration","summary":" Diffeomorphic Image Registration is a critical part of the analysis in\nvarious imaging modalities and downstream tasks like image translation,\nsegmentation, and atlas building. Registration algorithms based on optimization\nhave stood the test of time in terms of accuracy, reliability, and robustness\nacross a wide spectrum of modalities and acquisition settings. However, these\nalgorithms converge slowly, are prohibitively expensive to run, and their usage\nrequires a steep learning curve, limiting their scalability to larger clinical\nand scientific studies. In this paper, we develop multi-scale Adaptive\nRiemannian Optimization algorithms for diffeomorphic image registration. 
We\ndemonstrate compelling improvements on image registration across a spectrum of\nmodalities and anatomies by measuring structural and landmark overlap of the\nregistered image volumes. Our proposed framework leads to a consistent\nimprovement in performance, and a 300x to 2000x speedup over existing\nalgorithms. Our modular library design makes it easy to use and allows\ncustomization via user-defined cost functions.\n","authors":["Rohit Jena","Pratik Chaudhari","James C. Gee"],"pdf_url":"https://arxiv.org/pdf/2404.01249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01248v1","updated":"2024-04-01T17:09:40Z","published":"2024-04-01T17:09:40Z","title":"Scalable Scene Modeling from Perspective Imaging: Physics-based\n Appearance and Geometry Inference","summary":" 3D scene modeling techniques serve as the bedrock of geospatial\nengineering and computer science, driving many applications ranging from\nautomated driving, terrain mapping, and navigation to virtual, augmented, mixed, and\nextended reality (for the gaming and movie industries, etc.). This dissertation\npresents a fraction of contributions that advance 3D scene modeling to its\nstate of the art, in the aspects of both appearance and geometry modeling. In\ncontrast to the prevailing deep learning methods, as a core contribution, this\nthesis aims to develop algorithms that follow first principles, where\nsophisticated physics-based models are introduced alongside simpler\nlearning and inference tasks. The outcomes of these algorithms yield processes\nthat can consume a much larger volume of data for highly accurate reconstruction of\n3D scenes at scale without losing methodological generality, which are not\npossible with contemporary complex-model-based deep learning methods.\nSpecifically, the dissertation introduces three novel methodologies that\naddress the challenges of inferring appearance and geometry through\nphysics-based modeling.\n Overall, the research encapsulated in this dissertation marks a series of\nmethodological triumphs in the processing of complex datasets. By navigating\nthe confluence of deep learning, computational geometry, and photogrammetry,\nthis work lays down a robust framework for future exploration and practical\napplication in the rapidly evolving field of 3D scene reconstruction. The\noutcomes of these studies are evidenced through rigorous experiments and\ncomparisons with existing state-of-the-art methods, demonstrating the efficacy\nand scalability of the proposed approaches.\n","authors":["Shuang Song"],"pdf_url":"https://arxiv.org/pdf/2404.01248v1.pdf","comment":"Ph.D. Dissertation, Geospatial Data Analytics Lab, The Ohio State\n University, 2024. arXiv admin note: text overlap with arXiv:2108.08378"},{"id":"http://arxiv.org/abs/2404.01247v1","updated":"2024-04-01T17:08:50Z","published":"2024-04-01T17:08:50Z","title":"An image speaks a thousand words, but can everyone listen? On\n translating images for cultural relevance","summary":" Given the rise of multimedia content, human translators increasingly focus on\nculturally adapting not only words but also other modalities such as images to\nconvey the same meaning. While several applications stand to benefit from this,\nmachine translation systems remain confined to dealing with language in speech\nand text. In this work, we take a first step towards translating images to make\nthem culturally relevant. First, we build three pipelines comprising\nstate-of-the-art generative models to do the task. 
Next, we build a two-part\nevaluation dataset: i) concept: comprising 600 images that are cross-culturally\ncoherent, focusing on a single concept per image, and ii) application:\ncomprising 100 images curated from real-world applications. We conduct a\nmulti-faceted human evaluation of translated images to assess for cultural\nrelevance and meaning preservation. We find that as of today, image-editing\nmodels fail at this task, but can be improved by leveraging LLMs and retrievers\nin the loop. Best pipelines can only translate 5% of images for some countries\nin the easier concept dataset and no translation is successful for some\ncountries in the application dataset, highlighting the challenging nature of\nthe task. Our code and data are released here:\nhttps://github.com/simran-khanuja/image-transcreation.\n","authors":["Simran Khanuja","Sathyanarayanan Ramamoorthy","Yueqi Song","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2404.01247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.10460v2","updated":"2024-04-01T17:04:01Z","published":"2023-01-25T08:40:34Z","title":"HAL3D: Hierarchical Active Learning for Fine-Grained 3D Part Labeling","summary":" We present the first active learning tool for fine-grained 3D part labeling,\na problem which challenges even the most advanced deep learning (DL) methods\ndue to the significant structural variations among the small and intricate\nparts. For the same reason, the necessary data annotation effort is tremendous,\nmotivating approaches to minimize human involvement. Our labeling tool\niteratively verifies or modifies part labels predicted by a deep neural\nnetwork, with human feedback continually improving the network prediction. To\neffectively reduce human efforts, we develop two novel features in our tool,\nhierarchical and symmetry-aware active labeling. Our human-in-the-loop\napproach, coined HAL3D, achieves 100% accuracy (barring human errors) on any\ntest set with pre-defined hierarchical part labels, with 80% time-saving over\nmanual effort.\n","authors":["Fenggen Yu","Yiming Qian","Francisca Gil-Ureta","Brian Jackson","Eric Bennett","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2301.10460v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2404.01243v1","updated":"2024-04-01T17:03:29Z","published":"2024-04-01T17:03:29Z","title":"A Unified and Interpretable Emotion Representation and Expression\n Generation","summary":" Canonical emotions, such as happy, sad, and fearful, are easy to understand\nand annotate. However, emotions are often compound, e.g. happily surprised, and\ncan be mapped to the action units (AUs) used for expressing emotions, and\ntrivially to the canonical ones. Intuitively, emotions are continuous as\nrepresented by the arousal-valence (AV) model. An interpretable unification of\nthese four modalities - namely, Canonical, Compound, AUs, and AV - is highly\ndesirable for a better representation and understanding of emotions. However,\nsuch a unification remains unknown in the current literature. In this work,\nwe propose an interpretable and unified emotion model, referred to as C2A2. We\nalso develop a method that leverages labels of the non-unified models to\nannotate the novel unified one. Finally, we modify the text-conditional\ndiffusion models to understand continuous numbers, which are then used to\ngenerate continuous expressions using our unified emotion model. 
Through\nquantitative and qualitative experiments, we show that our generated images are\nrich and capture subtle expressions. Our work allows a fine-grained generation\nof expressions in conjunction with other textual inputs and offers a new label\nspace for emotions at the same time.\n","authors":["Reni Paskaleva","Mykyta Holubakha","Andela Ilic","Saman Motamed","Luc Van Gool","Danda Paudel"],"pdf_url":"https://arxiv.org/pdf/2404.01243v1.pdf","comment":"10 pages, 9 figures, 3 tables Accepted at CVPR 2024. Project page:\n https://emotion-diffusion.github.io"},{"id":"http://arxiv.org/abs/2401.09627v4","updated":"2024-04-01T17:03:08Z","published":"2024-01-17T22:34:20Z","title":"SymTC: A Symbiotic Transformer-CNN Net for Instance Segmentation of\n Lumbar Spine MRI","summary":" Intervertebral disc disease, a prevalent ailment, frequently leads to\nintermittent or persistent low back pain, and diagnosing and assessing this\ndisease rely on accurate measurement of vertebral bone and intervertebral disc\ngeometries from lumbar MR images. Deep neural network (DNN) models may assist\nclinicians with more efficient image segmentation of individual instances\n(disks and vertebrae) of the lumbar spine in an automated way, which is termed\ninstance image segmentation. In this work, we proposed SymTC, an innovative\nlumbar spine MR image segmentation model that combines the strengths of\nTransformer and Convolutional Neural Network (CNN). Specifically, we designed a\nparallel dual-path architecture to merge CNN layers and Transformer layers, and\nwe integrated a novel position embedding into the self-attention module of\nTransformer, enhancing the utilization of positional information for more\naccurate segmentation. To further improve model performance, we introduced a\nnew data augmentation technique to create a synthetic yet realistic MR image\ndataset, named SSMSpine, which is made publicly available. We evaluated our\nSymTC and the other 15 existing image segmentation models on our private\nin-house dataset and the public SSMSpine dataset, using two metrics, Dice\nSimilarity Coefficient and 95% Hausdorff Distance. The results show that our\nSymTC has the best performance for segmenting vertebral bones and\nintervertebral discs in lumbar spine MR images. The SymTC code and SSMSpine\ndataset are available at https://github.com/jiasongchen/SymTC.\n","authors":["Jiasong Chen","Linchen Qian","Linhai Ma","Timur Urakov","Weiyong Gu","Liang Liang"],"pdf_url":"https://arxiv.org/pdf/2401.09627v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01240v1","updated":"2024-04-01T16:58:32Z","published":"2024-04-01T16:58:32Z","title":"AURORA: Navigating UI Tarpits via Automated Neural Screen Understanding","summary":" Nearly a decade of research in software engineering has focused on automating\nmobile app testing to help engineers in overcoming the unique challenges\nassociated with the software platform. Much of this work has come in the form\nof Automated Input Generation tools (AIG tools) that dynamically explore app\nscreens. However, such tools have repeatedly been demonstrated to achieve\nlower-than-expected code coverage - particularly on sophisticated proprietary\napps. 
Prior work has illustrated that a primary cause of these coverage\ndeficiencies is related to so-called tarpits, or complex screens that are\ndifficult to navigate.\n In this paper, we take a critical step toward enabling AIG tools to\neffectively navigate tarpits during app exploration through a new form of\nautomated semantic screen understanding. We introduce AURORA, a technique that\nlearns from the visual and textual patterns that exist in mobile app UIs to\nautomatically detect common screen designs and navigate them accordingly. The\nkey idea of AURORA is that there are a finite number of mobile app screen\ndesigns, albeit with subtle variations, such that the general patterns of\ndifferent categories of UI designs can be learned. As such, AURORA employs a\nmulti-modal, neural screen classifier that is able to recognize the most common\ntypes of UI screen designs. After recognizing a given screen, it then applies a\nset of flexible and generalizable heuristics to properly navigate the screen.\nWe evaluated AURORA both on a set of 12 apps with known tarpits from prior\nwork, and on a new set of five of the most popular apps from the Google Play\nstore. Our results indicate that AURORA is able to effectively navigate tarpit\nscreens, outperforming prior approaches that avoid tarpits by 19.6% in terms of\nmethod coverage. The improvements can be attributed to AURORA's UI design\nclassification and heuristic navigation techniques.\n","authors":["Safwat Ali Khan","Wenyu Wang","Yiran Ren","Bin Zhu","Jiangfan Shi","Alyssa McGowan","Wing Lam","Kevin Moran"],"pdf_url":"https://arxiv.org/pdf/2404.01240v1.pdf","comment":"Published at 17th IEEE International Conference on Software Testing,\n Verification and Validation (ICST) 2024, 12 pages"},{"id":"http://arxiv.org/abs/2311.10707v2","updated":"2024-04-01T16:56:13Z","published":"2023-11-17T18:57:40Z","title":"Multimodal Representation Learning by Alternating Unimodal Adaptation","summary":" Multimodal learning, which integrates data from diverse sensory modes, plays\na pivotal role in artificial intelligence. However, existing multimodal\nlearning methods often struggle with challenges where some modalities appear\nmore dominant than others during multimodal learning, resulting in suboptimal\nperformance. To address this challenge, we propose MLA (Multimodal Learning\nwith Alternating Unimodal Adaptation). MLA reframes the conventional joint\nmultimodal learning process by transforming it into an alternating unimodal\nlearning process, thereby minimizing interference between modalities.\nSimultaneously, it captures cross-modal interactions through a shared head,\nwhich undergoes continuous optimization across different modalities. This\noptimization process is controlled by a gradient modification mechanism to\nprevent the shared head from losing previously acquired information. During the\ninference phase, MLA utilizes a test-time uncertainty-based model fusion\nmechanism to integrate multimodal information. Extensive experiments are\nconducted on five diverse datasets, encompassing scenarios with complete\nmodalities and scenarios with missing modalities. These experiments demonstrate\nthe superiority of MLA over competing prior approaches. 
Our code is available\nat\nhttps://github.com/Cecile-hi/Multimodal-Learning-with-Alternating-Unimodal-Adaptation.\n","authors":["Xiaohui Zhang","Jaehong Yoon","Mohit Bansal","Huaxiu Yao"],"pdf_url":"https://arxiv.org/pdf/2311.10707v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2402.12259v2","updated":"2024-04-01T16:55:10Z","published":"2024-02-19T16:15:03Z","title":"Open3DSG: Open-Vocabulary 3D Scene Graphs from Point Clouds with\n Queryable Objects and Open-Set Relationships","summary":" Current approaches for 3D scene graph prediction rely on labeled datasets to\ntrain models for a fixed set of known object classes and relationship\ncategories. We present Open3DSG, an alternative approach to learn 3D scene\ngraph prediction in an open world without requiring labeled scene graph data.\nWe co-embed the features from a 3D scene graph prediction backbone with the\nfeature space of powerful open world 2D vision language foundation models. This\nenables us to predict 3D scene graphs from 3D point clouds in a zero-shot\nmanner by querying object classes from an open vocabulary and predicting the\ninter-object relationships from a grounded LLM with scene graph features and\nqueried object classes as context. Open3DSG is the first 3D point cloud method\nto predict not only explicit open-vocabulary object classes, but also open-set\nrelationships that are not limited to a predefined label set, making it\npossible to express rare as well as specific objects and relationships in the\npredicted 3D scene graph. Our experiments show that Open3DSG is effective at\npredicting arbitrary object classes as well as their complex inter-object\nrelationships describing spatial, supportive, semantic and comparative\nrelationships.\n","authors":["Sebastian Koch","Narunas Vaskevicius","Mirco Colosi","Pedro Hermosilla","Timo Ropinski"],"pdf_url":"https://arxiv.org/pdf/2402.12259v2.pdf","comment":"CVPR 2024. Project page: https://kochsebastian.com/open3dsg"},{"id":"http://arxiv.org/abs/2403.15443v2","updated":"2024-04-01T16:37:08Z","published":"2024-03-17T16:12:50Z","title":"Introducing an ensemble method for the early detection of Alzheimer's\n disease through the analysis of PET scan images","summary":" Alzheimer's disease is a progressive neurodegenerative disorder that\nprimarily affects cognitive functions such as memory, thinking, and behavior.\nIn this disease, there is a critical phase, mild cognitive impairment, that is\nreally important to be diagnosed early since some patients with progressive MCI\nwill develop the disease. This study delves into the challenging task of\nclassifying Alzheimer's disease into four distinct groups: control normal (CN),\nprogressive mild cognitive impairment (pMCI), stable mild cognitive impairment\n(sMCI), and Alzheimer's disease (AD). This classification is based on a\nthorough examination of PET scan images obtained from the ADNI dataset, which\nprovides a thorough understanding of the disease's progression. Several\ndeep-learning and traditional machine-learning models have been used to detect\nAlzheimer's disease. In this paper, three deep-learning models, namely VGG16\nand AlexNet, and a custom Convolutional neural network (CNN) with 8-fold\ncross-validation have been used for classification. Finally, an ensemble\ntechnique is used to improve the overall result of these models. 
The results\nshow that using deep-learning models to tell the difference between MCI\npatients gives an overall average accuracy of 93.13% and an AUC of 94.4%.\n","authors":["Arezoo Borji","Taha-Hossein Hejazi","Abbas Seifi"],"pdf_url":"https://arxiv.org/pdf/2403.15443v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.11143v2","updated":"2024-04-01T16:37:00Z","published":"2022-04-23T21:46:17Z","title":"Supplementing Missing Visions via Dialog for Scene Graph Generations","summary":" Most current AI systems rely on the premise that the input visual data are\nsufficient to achieve competitive performance in various computer vision tasks.\nHowever, the classic task setup rarely considers the challenging, yet common\npractical situations where the complete visual data may be inaccessible due to\nvarious reasons (e.g., restricted view range and occlusions). To this end, we\ninvestigate a computer vision task setting with incomplete visual input data.\nSpecifically, we exploit the Scene Graph Generation (SGG) task with various\nlevels of visual data missingness as input. While insufficient visual input\nintuitively leads to performance drop, we propose to supplement the missing\nvisions via the natural language dialog interactions to better accomplish the\ntask objective. We design a model-agnostic Supplementary Interactive Dialog\n(SI-Dial) framework that can be jointly learned with most existing models,\nendowing the current AI systems with the ability of question-answer\ninteractions in natural language. We demonstrate the feasibility of such a task\nsetting with missing visual input and the effectiveness of our proposed dialog\nmodule as the supplementary information source through extensive experiments\nand analysis, by achieving promising performance improvement over multiple\nbaselines.\n","authors":["Zhenghao Zhao","Ye Zhu","Xiaoguang Zhu","Yuzhang Shang","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2204.11143v2.pdf","comment":"ICASSP 2024"},{"id":"http://arxiv.org/abs/2404.01223v1","updated":"2024-04-01T16:31:04Z","published":"2024-04-01T16:31:04Z","title":"Feature Splatting: Language-Driven Physics-Based Scene Synthesis and\n Editing","summary":" Scene representations using 3D Gaussian primitives have produced excellent\nresults in modeling the appearance of static and dynamic 3D scenes. Many\ngraphics applications, however, demand the ability to manipulate both the\nappearance and the physical properties of objects. We introduce Feature\nSplatting, an approach that unifies physics-based dynamic scene synthesis with\nrich semantics from vision language foundation models that are grounded by\nnatural language. Our first contribution is a way to distill high-quality,\nobject-centric vision-language features into 3D Gaussians, that enables\nsemi-automatic scene decomposition using text queries. Our second contribution\nis a way to synthesize physics-based dynamics from an otherwise static scene\nusing a particle-based simulator, in which material properties are assigned\nautomatically via text queries. We ablate key techniques used in this pipeline,\nto illustrate the challenge and opportunities in using feature-carrying 3D\nGaussians as a unified format for appearance, geometry, material properties and\nsemantics grounded on natural language. 
Project website:\nhttps://feature-splatting.github.io/\n","authors":["Ri-Zhao Qiu","Ge Yang","Weijia Zeng","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.01223v1.pdf","comment":"Project website: https://feature-splatting.github.io/"},{"id":"http://arxiv.org/abs/2404.01220v1","updated":"2024-04-01T16:25:08Z","published":"2024-04-01T16:25:08Z","title":"Entity-Centric Reinforcement Learning for Object Manipulation from\n Pixels","summary":" Manipulating objects is a hallmark of human intelligence, and an important\ntask in domains such as robotics. In principle, Reinforcement Learning (RL)\noffers a general approach to learn object manipulation. In practice, however,\ndomains with more than a few objects are difficult for RL agents due to the\ncurse of dimensionality, especially when learning from raw image observations.\nIn this work we propose a structured approach for visual RL that is suitable\nfor representing multiple objects and their interaction, and use it to learn\ngoal-conditioned manipulation of several objects. Key to our method is the\nability to handle goals with dependencies between the objects (e.g., moving\nobjects in a certain order). We further relate our architecture to the\ngeneralization capability of the trained agent, based on a theoretical result\nfor compositional generalization, and demonstrate agents that learn with 3\nobjects but generalize to similar tasks with over 10 objects. Videos and code\nare available on the project website:\nhttps://sites.google.com/view/entity-centric-rl\n","authors":["Dan Haramati","Tal Daniel","Aviv Tamar"],"pdf_url":"https://arxiv.org/pdf/2404.01220v1.pdf","comment":"ICLR 2024 Spotlight. Videos and code are available on the project\n website: https://sites.google.com/view/entity-centric-rl"},{"id":"http://arxiv.org/abs/2304.12306v3","updated":"2024-04-01T16:18:16Z","published":"2023-04-24T17:56:12Z","title":"Segment Anything in Medical Images","summary":" Medical image segmentation is a critical component in clinical practice,\nfacilitating accurate diagnosis, treatment planning, and disease monitoring.\nHowever, existing methods, often tailored to specific modalities or disease\ntypes, lack generalizability across the diverse spectrum of medical image\nsegmentation tasks. Here we present MedSAM, a foundation model designed for\nbridging this gap by enabling universal medical image segmentation. The model\nis developed on a large-scale medical image dataset with 1,570,263 image-mask\npairs, covering 10 imaging modalities and over 30 cancer types. We conduct a\ncomprehensive evaluation on 86 internal validation tasks and 60 external\nvalidation tasks, demonstrating better accuracy and robustness than\nmodality-wise specialist models. By delivering accurate and efficient\nsegmentation across a wide spectrum of tasks, MedSAM holds significant\npotential to expedite the evolution of diagnostic tools and the personalization\nof treatment plans.\n","authors":["Jun Ma","Yuting He","Feifei Li","Lin Han","Chenyu You","Bo Wang"],"pdf_url":"https://arxiv.org/pdf/2304.12306v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05864v2","updated":"2024-04-01T16:11:58Z","published":"2023-08-10T21:59:23Z","title":"The Multi-modality Cell Segmentation Challenge: Towards Universal\n Solutions","summary":" Cell segmentation is a critical step for quantitative single-cell analysis in\nmicroscopy images. 
Existing cell segmentation methods are often tailored to\nspecific modalities or require manual interventions to specify hyper-parameters\nin different experimental settings. Here, we present a multi-modality cell\nsegmentation benchmark, comprising over 1500 labeled images derived from more\nthan 50 diverse biological experiments. The top participants developed a\nTransformer-based deep-learning algorithm that not only exceeds existing\nmethods but can also be applied to diverse microscopy images across imaging\nplatforms and tissue types without manual parameter adjustments. This benchmark\nand the improved algorithm offer promising avenues for more accurate and\nversatile cell analysis in microscopy imaging.\n","authors":["Jun Ma","Ronald Xie","Shamini Ayyadhury","Cheng Ge","Anubha Gupta","Ritu Gupta","Song Gu","Yao Zhang","Gihun Lee","Joonkee Kim","Wei Lou","Haofeng Li","Eric Upschulte","Timo Dickscheid","José Guilherme de Almeida","Yixin Wang","Lin Han","Xin Yang","Marco Labagnara","Vojislav Gligorovski","Maxime Scheder","Sahand Jamal Rahi","Carly Kempster","Alice Pollitt","Leon Espinosa","Tâm Mignot","Jan Moritz Middeke","Jan-Niklas Eckardt","Wangkai Li","Zhaoyang Li","Xiaochen Cai","Bizhe Bai","Noah F. Greenwald","David Van Valen","Erin Weisbart","Beth A. Cimini","Trevor Cheung","Oscar Brück","Gary D. Bader","Bo Wang"],"pdf_url":"https://arxiv.org/pdf/2308.05864v2.pdf","comment":"NeurIPS22 Cell Segmentation Challenge:\n https://neurips22-cellseg.grand-challenge.org/ . Nature Methods (2024)"},{"id":"http://arxiv.org/abs/2404.01207v1","updated":"2024-04-01T16:09:12Z","published":"2024-04-01T16:09:12Z","title":"Vision-language models for decoding provider attention during neonatal\n resuscitation","summary":" Neonatal resuscitations demand an exceptional level of attentiveness from\nproviders, who must process multiple streams of information simultaneously.\nGaze strongly influences decision making; thus, understanding where a provider\nis looking during neonatal resuscitations could inform provider training,\nenhance real-time decision support, and improve the design of delivery rooms\nand neonatal intensive care units (NICUs). Current approaches to quantifying\nneonatal providers' gaze rely on manual coding or simulations, which limit\nscalability and utility. Here, we introduce an automated, real-time, deep\nlearning approach capable of decoding provider gaze into semantic classes\ndirectly from first-person point-of-view videos recorded during live\nresuscitations. Combining state-of-the-art, real-time segmentation with\nvision-language models (CLIP), our low-shot pipeline attains 91\\%\nclassification accuracy in identifying gaze targets without training. Upon\nfine-tuning, the performance of our gaze-guided vision transformer exceeds 98\\%\naccuracy in gaze classification, approaching human-level precision. This\nsystem, capable of real-time inference, enables objective quantification of\nprovider attention dynamics during live neonatal resuscitation. 
Our approach\noffers a scalable solution that seamlessly integrates with existing\ninfrastructure for data-scarce gaze analysis, thereby offering new\nopportunities for understanding and refining clinical decision making.\n","authors":["Felipe Parodi","Jordan Matelsky","Alejandra Regla-Vargas","Elizabeth Foglia","Charis Lim","Danielle Weinberg","Konrad Kording","Heidi Herrick","Michael Platt"],"pdf_url":"https://arxiv.org/pdf/2404.01207v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.01203v1","updated":"2024-04-01T15:59:32Z","published":"2024-04-01T15:59:32Z","title":"Video Interpolation with Diffusion Models","summary":" We present VIDIM, a generative model for video interpolation, which creates\nshort videos given a start and end frame. In order to achieve high fidelity and\ngenerate motions unseen in the input data, VIDIM uses cascaded diffusion models\nto first generate the target video at low resolution, and then generate the\nhigh-resolution video conditioned on the low-resolution generated video. We\ncompare VIDIM to previous state-of-the-art methods on video interpolation, and\ndemonstrate how such works fail in most settings where the underlying motion is\ncomplex, nonlinear, or ambiguous while VIDIM can easily handle such cases. We\nadditionally demonstrate how classifier-free guidance on the start and end\nframe and conditioning the super-resolution model on the original\nhigh-resolution frames without additional parameters unlocks high-fidelity\nresults. VIDIM is fast to sample from as it jointly denoises all the frames to\nbe generated, requires less than a billion parameters per diffusion model to\nproduce compelling results, and still enjoys scalability and improved quality\nat larger parameter counts.\n","authors":["Siddhant Jain","Daniel Watson","Eric Tabellion","Aleksander Hołyński","Ben Poole","Janne Kontkanen"],"pdf_url":"https://arxiv.org/pdf/2404.01203v1.pdf","comment":"CVPR 2024, Project page at https://vidim-interpolation.github.io/"},{"id":"http://arxiv.org/abs/2404.01197v1","updated":"2024-04-01T15:55:25Z","published":"2024-04-01T15:55:25Z","title":"Getting it Right: Improving Spatial Consistency in Text-to-Image Models","summary":" One of the key shortcomings in current text-to-image (T2I) models is their\ninability to consistently generate images which faithfully follow the spatial\nrelationships specified in the text prompt. In this paper, we offer a\ncomprehensive investigation of this limitation, while also developing datasets\nand methods that achieve state-of-the-art performance. First, we find that\ncurrent vision-language datasets do not represent spatial relationships well\nenough; to alleviate this bottleneck, we create SPRIGHT, the first\nspatially-focused, large scale dataset, by re-captioning 6 million images from\n4 widely used vision datasets. Through a 3-fold evaluation and analysis\npipeline, we find that SPRIGHT largely improves upon existing datasets in\ncapturing spatial relationships. To demonstrate its efficacy, we leverage only\n~0.25% of SPRIGHT and achieve a 22% improvement in generating spatially\naccurate images while also improving the FID and CMMD scores. Secondly, we find\nthat training on images containing a large number of objects results in\nsubstantial improvements in spatial consistency. Notably, we attain\nstate-of-the-art on T2I-CompBench with a spatial score of 0.2133, by\nfine-tuning on <500 images. 
Finally, through a set of controlled experiments\nand ablations, we document multiple findings that we believe will enhance the\nunderstanding of factors that affect spatial consistency in text-to-image\nmodels. We publicly release our dataset and model to foster further research in\nthis area.\n","authors":["Agneet Chatterjee","Gabriela Ben Melech Stan","Estelle Aflalo","Sayak Paul","Dhruba Ghosh","Tejas Gokhale","Ludwig Schmidt","Hannaneh Hajishirzi","Vasudev Lal","Chitta Baral","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2404.01197v1.pdf","comment":"project webpage : https://spright-t2i.github.io/"},{"id":"http://arxiv.org/abs/2404.01194v1","updated":"2024-04-01T15:52:14Z","published":"2024-04-01T15:52:14Z","title":"Adaptive Query Prompting for Multi-Domain Landmark Detection","summary":" Medical landmark detection is crucial in various medical imaging modalities\nand procedures. Although deep learning-based methods have achieve promising\nperformance, they are mostly designed for specific anatomical regions or tasks.\nIn this work, we propose a universal model for multi-domain landmark detection\nby leveraging transformer architecture and developing a prompting component,\nnamed as Adaptive Query Prompting (AQP). Instead of embedding additional\nmodules in the backbone network, we design a separate module to generate\nprompts that can be effectively extended to any other transformer network. In\nour proposed AQP, prompts are learnable parameters maintained in a memory space\ncalled prompt pool. The central idea is to keep the backbone frozen and then\noptimize prompts to instruct the model inference process. Furthermore, we\nemploy a lightweight decoder to decode landmarks from the extracted features,\nnamely Light-MLD. Thanks to the lightweight nature of the decoder and AQP, we\ncan handle multiple datasets by sharing the backbone encoder and then only\nperform partial parameter tuning without incurring much additional cost. It has\nthe potential to be extended to more landmark detection tasks. We conduct\nexperiments on three widely used X-ray datasets for different medical landmark\ndetection tasks. Our proposed Light-MLD coupled with AQP achieves SOTA\nperformance on many metrics even without the use of elaborate structural\ndesigns or complex frameworks.\n","authors":["Qiusen Wei","Guoheng Huang","Xiaochen Yuan","Xuhang Chen","Guo Zhong","Jianwen Huang","Jiajie Huang"],"pdf_url":"https://arxiv.org/pdf/2404.01194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01192v1","updated":"2024-04-01T15:49:50Z","published":"2024-04-01T15:49:50Z","title":"iMD4GC: Incomplete Multimodal Data Integration to Advance Precise\n Treatment Response Prediction and Survival Analysis for Gastric Cancer","summary":" Gastric cancer (GC) is a prevalent malignancy worldwide, ranking as the fifth\nmost common cancer with over 1 million new cases and 700 thousand deaths in\n2020. Locally advanced gastric cancer (LAGC) accounts for approximately\ntwo-thirds of GC diagnoses, and neoadjuvant chemotherapy (NACT) has emerged as\nthe standard treatment for LAGC. However, the effectiveness of NACT varies\nsignificantly among patients, with a considerable subset displaying treatment\nresistance. Ineffective NACT not only leads to adverse effects but also misses\nthe optimal therapeutic window, resulting in lower survival rate. 
However,\nexisting multimodal learning methods assume the availability of all modalities\nfor each patient, which does not align with the reality of clinical practice.\nThe limited availability of modalities for each patient would cause information\nloss, adversely affecting predictive accuracy. In this study, we propose an\nincomplete multimodal data integration framework for GC (iMD4GC) to address the\nchallenges posed by incomplete multimodal data, enabling precise response\nprediction and survival analysis. Specifically, iMD4GC incorporates unimodal\nattention layers for each modality to capture intra-modal information.\nSubsequently, the cross-modal interaction layers explore potential inter-modal\ninteractions and capture complementary information across modalities, thereby\nenabling information compensation for missing modalities. To evaluate iMD4GC,\nwe collected three multimodal datasets for GC study: GastricRes (698 cases) for\nresponse prediction, GastricSur (801 cases) for survival analysis, and\nTCGA-STAD (400 cases) for survival analysis. The scale of our datasets is\nsignificantly larger than previous studies. The iMD4GC achieved impressive\nperformance with an 80.2% AUC on GastricRes, 71.4% C-index on GastricSur, and\n66.1% C-index on TCGA-STAD, significantly surpassing other compared methods.\n","authors":["Fengtao Zhou","Yingxue Xu","Yanfen Cui","Shenyan Zhang","Yun Zhu","Weiyang He","Jiguang Wang","Xin Wang","Ronald Chan","Louis Ho Shing Lau","Chu Han","Dafu Zhang","Zhenhui Li","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.01192v1.pdf","comment":"27 pages, 9 figures, 3 tables (under review)"},{"id":"http://arxiv.org/abs/2311.01017v4","updated":"2024-04-01T15:41:50Z","published":"2023-11-02T06:21:56Z","title":"Copilot4D: Learning Unsupervised World Models for Autonomous Driving via\n Discrete Diffusion","summary":" Learning world models can teach an agent how the world works in an\nunsupervised manner. Even though it can be viewed as a special case of sequence\nmodeling, progress for scaling world models on robotic applications such as\nautonomous driving has been somewhat less rapid than scaling language models\nwith Generative Pre-trained Transformers (GPT). We identify two reasons as\nmajor bottlenecks: dealing with complex and unstructured observation space, and\nhaving a scalable generative model. Consequently, we propose Copilot4D, a novel\nworld modeling approach that first tokenizes sensor observations with VQVAE,\nthen predicts the future via discrete diffusion. To efficiently decode and\ndenoise tokens in parallel, we recast Masked Generative Image Transformer as\ndiscrete diffusion and enhance it with a few simple changes, resulting in\nnotable improvement. When applied to learning world models on point cloud\nobservations, Copilot4D reduces prior SOTA Chamfer distance by more than 65%\nfor 1s prediction, and more than 50% for 3s prediction, across NuScenes, KITTI\nOdometry, and Argoverse2 datasets. 
Our results demonstrate that discrete\ndiffusion on tokenized agent experience can unlock the power of GPT-like\nunsupervised learning for robotics.\n","authors":["Lunjun Zhang","Yuwen Xiong","Ze Yang","Sergio Casas","Rui Hu","Raquel Urtasun"],"pdf_url":"https://arxiv.org/pdf/2311.01017v4.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2404.01179v1","updated":"2024-04-01T15:31:04Z","published":"2024-04-01T15:31:04Z","title":"BEM: Balanced and Entropy-based Mix for Long-Tailed Semi-Supervised\n Learning","summary":" Data mixing methods play a crucial role in semi-supervised learning (SSL),\nbut their application is unexplored in long-tailed semi-supervised learning\n(LTSSL). The primary reason is that the in-batch mixing manner fails to address\nclass imbalance. Furthermore, existing LTSSL methods mainly focus on\nre-balancing data quantity but ignore class-wise uncertainty, which is also\nvital for class balance. For instance, some classes with sufficient samples\nmight still exhibit high uncertainty due to indistinguishable features. To this\nend, this paper introduces the Balanced and Entropy-based Mix (BEM), a\npioneering mixing approach to re-balance the class distribution of both data\nquantity and uncertainty. Specifically, we first propose a class balanced mix\nbank to store data of each class for mixing. This bank samples data based on\nthe estimated quantity distribution, thus re-balancing data quantity. Then, we\npresent an entropy-based learning approach to re-balance class-wise\nuncertainty, including entropy-based sampling strategy, entropy-based selection\nmodule, and entropy-based class balanced loss. Our BEM first leverages data\nmixing for improving LTSSL, and it can also serve as a complement to the\nexisting re-balancing methods. Experimental results show that BEM significantly\nenhances various LTSSL frameworks and achieves state-of-the-art performances\nacross multiple benchmarks.\n","authors":["Hongwei Zheng","Linyuan Zhou","Han Li","Jinming Su","Xiaoming Wei","Xiaoming Xu"],"pdf_url":"https://arxiv.org/pdf/2404.01179v1.pdf","comment":"This paper is accepted to CVPR 2024. The supplementary material is\n included"},{"id":"http://arxiv.org/abs/2404.01174v1","updated":"2024-04-01T15:26:44Z","published":"2024-04-01T15:26:44Z","title":"SpikeMba: Multi-Modal Spiking Saliency Mamba for Temporal Video\n Grounding","summary":" Temporal video grounding (TVG) is a critical task in video content\nunderstanding. Despite significant advancements, existing methods often limit\nin capturing the fine-grained relationships between multimodal inputs and the\nhigh computational costs with processing long video sequences. To address these\nlimitations, we introduce a novel SpikeMba: multi-modal spiking saliency mamba\nfor temporal video grounding. In our work, we integrate the Spiking Neural\nNetworks (SNNs) and state space models (SSMs) to capture the fine-grained\nrelationships of multimodal features effectively. Specifically, we introduce\nthe relevant slots to enhance the model's memory capabilities, enabling a\ndeeper contextual understanding of video sequences. The contextual moment\nreasoner leverages these slots to maintain a balance between contextual\ninformation preservation and semantic relevance exploration. Simultaneously,\nthe spiking saliency detector capitalizes on the unique properties of SNNs to\naccurately locate salient proposals. 
Our experiments demonstrate the\neffectiveness of SpikeMba, which consistently outperforms state-of-the-art\nmethods across mainstream benchmarks.\n","authors":["Wenrui Li","Xiaopeng Hong","Xiaopeng Fan"],"pdf_url":"https://arxiv.org/pdf/2404.01174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01168v1","updated":"2024-04-01T15:16:33Z","published":"2024-04-01T15:16:33Z","title":"Mirror-3DGS: Incorporating Mirror Reflections into 3D Gaussian Splatting","summary":" 3D Gaussian Splatting (3DGS) has marked a significant breakthrough in the\nrealm of 3D scene reconstruction and novel view synthesis. However, 3DGS, much\nlike its predecessor Neural Radiance Fields (NeRF), struggles to accurately\nmodel physical reflections, particularly in mirrors that are ubiquitous in\nreal-world scenes. This oversight mistakenly perceives reflections as separate\nentities that physically exist, resulting in inaccurate reconstructions and\ninconsistent reflective properties across varied viewpoints. To address this\npivotal challenge, we introduce Mirror-3DGS, an innovative rendering framework\ndevised to master the intricacies of mirror geometries and reflections, paving\nthe way for the generation of realistically depicted mirror reflections. By\ningeniously incorporating mirror attributes into the 3DGS and leveraging the\nprinciple of plane mirror imaging, Mirror-3DGS crafts a mirrored viewpoint to\nobserve from behind the mirror, enriching the realism of scene renderings.\nExtensive assessments, spanning both synthetic and real-world scenes, showcase\nour method's ability to render novel views with enhanced fidelity in real-time,\nsurpassing the state-of-the-art Mirror-NeRF specifically within the challenging\nmirror regions. Our code will be made publicly available for reproducible\nresearch.\n","authors":["Jiarui Meng","Haijie Li","Yanmin Wu","Qiankun Gao","Shuzhou Yang","Jian Zhang","Siwei Ma"],"pdf_url":"https://arxiv.org/pdf/2404.01168v1.pdf","comment":"22 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.01160v1","updated":"2024-04-01T15:06:20Z","published":"2024-04-01T15:06:20Z","title":"Diagnosis of Skin Cancer Using VGG16 and VGG19 Based Transfer Learning\n Models","summary":" Today, skin cancer is considered as one of the most dangerous and common\ncancers in the world which demands special attention. Skin cancer may be\ndeveloped in different types; including melanoma, actinic keratosis, basal cell\ncarcinoma, squamous cell carcinoma, and Merkel cell carcinoma. Among them,\nmelanoma is more unpredictable. Melanoma cancer can be diagnosed at early\nstages increasing the possibility of disease treatment. Automatic\nclassification of skin lesions is a challenging task due to diverse forms and\ngrades of the disease, demanding the requirement of novel methods\nimplementation. Deep convolution neural networks (CNN) have shown an excellent\npotential for data and image classification. In this article, we inspect skin\nlesion classification problem using CNN techniques. Remarkably, we present that\nprominent classification accuracy of lesion detection can be obtained by proper\ndesigning and applying of transfer learning framework on pre-trained neural\nnetworks, without any requirement for data enlargement procedures i.e. merging\nVGG16 and VGG19 architectures pre-trained by a generic dataset with modified\nAlexNet network, and then, fine-tuned by a subject-specific dataset containing\ndermatology images. 
The convolution neural network was trained using 2541\nimages and, in particular, dropout was used to prevent the network from\noverfitting. Finally, the validity of the model was checked by applying the\nK-fold cross validation method. The proposed model increased classification\naccuracy by 3% (from 94.2% to 98.18%) in comparison with other methods.\n","authors":["Amir Faghihi","Mohammadreza Fathollahi","Roozbeh Rajabi"],"pdf_url":"https://arxiv.org/pdf/2404.01160v1.pdf","comment":"15 pages, journal"},{"id":"http://arxiv.org/abs/2404.01156v1","updated":"2024-04-01T15:01:38Z","published":"2024-04-01T15:01:38Z","title":"SyncMask: Synchronized Attentional Masking for Fashion-centric\n Vision-Language Pretraining","summary":" Vision-language models (VLMs) have made significant strides in cross-modal\nunderstanding through large-scale paired datasets. However, in fashion domain,\ndatasets often exhibit a disparity between the information conveyed in image\nand text. This issue stems from datasets containing multiple images of a single\nfashion item all paired with one text, leading to cases where some textual\ndetails are not visible in individual images. This mismatch, particularly when\nnon-co-occurring elements are masked, undermines the training of conventional\nVLM objectives like Masked Language Modeling and Masked Image Modeling, thereby\nhindering the model's ability to accurately align fine-grained visual and\ntextual features. Addressing this problem, we propose Synchronized attentional\nMasking (SyncMask), which generate masks that pinpoint the image patches and\nword tokens where the information co-occur in both image and text. This\nsynchronization is accomplished by harnessing cross-attentional features\nobtained from a momentum model, ensuring a precise alignment between the two\nmodalities. Additionally, we enhance grouped batch sampling with semi-hard\nnegatives, effectively mitigating false negative issues in Image-Text Matching\nand Image-Text Contrastive learning objectives within fashion datasets. Our\nexperiments demonstrate the effectiveness of the proposed approach,\noutperforming existing methods in three downstream tasks.\n","authors":["Chull Hwan Song","Taebaek Hwang","Jooyoung Yoon","Shunghyun Choi","Yeong Hyeon Gu"],"pdf_url":"https://arxiv.org/pdf/2404.01156v1.pdf","comment":"CVPR2024 Accepted"},{"id":"http://arxiv.org/abs/2404.01154v1","updated":"2024-04-01T14:59:13Z","published":"2024-04-01T14:59:13Z","title":"Uncovering the Text Embedding in Text-to-Image Diffusion Models","summary":" The correspondence between input text and the generated image exhibits\nopacity, wherein minor textual modifications can induce substantial deviations\nin the generated image. While, text embedding, as the pivotal intermediary\nbetween text and images, remains relatively underexplored. In this paper, we\naddress this research gap by delving into the text embedding space, unleashing\nits capacity for controllable image editing and explicable semantic direction\nattributes within a learning-free framework. Specifically, we identify two\ncritical insights regarding the importance of per-word embedding and their\ncontextual correlations within text embedding, providing instructive principles\nfor learning-free image editing. Additionally, we find that text embedding\ninherently possesses diverse semantic potentials, and further reveal this\nproperty through the lens of singular value decomposition (SVD). 
These\nuncovered properties offer practical utility for image editing and semantic\ndiscovery. More importantly, we expect the in-depth analyses and findings of\nthe text embedding can enhance the understanding of text-to-image diffusion\nmodels.\n","authors":["Hu Yu","Hao Luo","Fan Wang","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.01154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01151v1","updated":"2024-04-01T14:53:36Z","published":"2024-04-01T14:53:36Z","title":"Detect2Interact: Localizing Object Key Field in Visual Question\n Answering (VQA) with LLMs","summary":" Localization plays a crucial role in enhancing the practicality and precision\nof VQA systems. By enabling fine-grained identification and interaction with\nspecific parts of an object, it significantly improves the system's ability to\nprovide contextually relevant and spatially accurate responses, crucial for\napplications in dynamic environments like robotics and augmented reality.\nHowever, traditional systems face challenges in accurately mapping objects\nwithin images to generate nuanced and spatially aware responses. In this work,\nwe introduce \"Detect2Interact\", which addresses these challenges by introducing\nan advanced approach for fine-grained object visual key field detection. First,\nwe use the segment anything model (SAM) to generate detailed spatial maps of\nobjects in images. Next, we use Vision Studio to extract semantic object\ndescriptions. Third, we employ GPT-4's common sense knowledge, bridging the gap\nbetween an object's semantics and its spatial map. As a result, Detect2Interact\nachieves consistent qualitative results on object key field detection across\nextensive test cases and outperforms the existing VQA system with object\ndetection by providing a more reasonable and finer visual representation.\n","authors":["Jialou Wang","Manli Zhu","Yulei Li","Honglei Li","Longzhi Yang","Wai Lok Woo"],"pdf_url":"https://arxiv.org/pdf/2404.01151v1.pdf","comment":"Accepted to IEEE Intelligent Systems"},{"id":"http://arxiv.org/abs/2401.10786v2","updated":"2024-04-01T14:53:00Z","published":"2024-01-19T16:15:37Z","title":"Sat2Scene: 3D Urban Scene Generation from Satellite Images with\n Diffusion","summary":" Directly generating scenes from satellite imagery offers exciting\npossibilities for integration into applications like games and map services.\nHowever, challenges arise from significant view changes and scene scale.\nPrevious efforts mainly focused on image or video generation, lacking\nexploration into the adaptability of scene generation for arbitrary views.\nExisting 3D generation works either operate at the object level or are\ndifficult to utilize the geometry obtained from satellite imagery. To overcome\nthese limitations, we propose a novel architecture for direct 3D scene\ngeneration by introducing diffusion models into 3D sparse representations and\ncombining them with neural rendering techniques. Specifically, our approach\ngenerates texture colors at the point level for a given geometry using a 3D\ndiffusion model first, which is then transformed into a scene representation in\na feed-forward manner. The representation can be utilized to render arbitrary\nviews which would excel in both single-frame quality and inter-frame\nconsistency. 
Experiments in two city-scale datasets show that our model\ndemonstrates proficiency in generating photo-realistic street-view image\nsequences and cross-view urban scenes from satellite imagery.\n","authors":["Zuoyue Li","Zhenqiang Li","Zhaopeng Cui","Marc Pollefeys","Martin R. Oswald"],"pdf_url":"https://arxiv.org/pdf/2401.10786v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01143v1","updated":"2024-04-01T14:42:57Z","published":"2024-04-01T14:42:57Z","title":"Condition-Aware Neural Network for Controlled Image Generation","summary":" We present Condition-Aware Neural Network (CAN), a new method for adding\ncontrol to image generative models. In parallel to prior conditional control\nmethods, CAN controls the image generation process by dynamically manipulating\nthe weight of the neural network. This is achieved by introducing a\ncondition-aware weight generation module that generates conditional weight for\nconvolution/linear layers based on the input condition. We test CAN on\nclass-conditional image generation on ImageNet and text-to-image generation on\nCOCO. CAN consistently delivers significant improvements for diffusion\ntransformer models, including DiT and UViT. In particular, CAN combined with\nEfficientViT (CaT) achieves 2.78 FID on ImageNet 512x512, surpassing DiT-XL/2\nwhile requiring 52x fewer MACs per sampling step.\n","authors":["Han Cai","Muyang Li","Zhuoyang Zhang","Qinsheng Zhang","Ming-Yu Liu","Song Han"],"pdf_url":"https://arxiv.org/pdf/2404.01143v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2211.14049v3","updated":"2024-04-01T14:38:13Z","published":"2022-11-25T12:09:12Z","title":"Task-Oriented Communication for Edge Video Analytics","summary":" With the development of artificial intelligence (AI) techniques and the\nincreasing popularity of camera-equipped devices, many edge video analytics\napplications are emerging, calling for the deployment of computation-intensive\nAI models at the network edge. Edge inference is a promising solution to move\nthe computation-intensive workloads from low-end devices to a powerful edge\nserver for video analytics, but the device-server communications will remain a\nbottleneck due to the limited bandwidth. This paper proposes a task-oriented\ncommunication framework for edge video analytics, where multiple devices\ncollect the visual sensory data and transmit the informative features to an\nedge server for processing. To enable low-latency inference, this framework\nremoves video redundancy in spatial and temporal domains and transmits minimal\ninformation that is essential for the downstream task, rather than\nreconstructing the videos at the edge server. Specifically, it extracts compact\ntask-relevant features based on the deterministic information bottleneck (IB)\nprinciple, which characterizes a tradeoff between the informativeness of the\nfeatures and the communication cost. As the features of consecutive frames are\ntemporally correlated, we propose a temporal entropy model (TEM) to reduce the\nbitrate by taking the previous features as side information in feature\nencoding. To further improve the inference performance, we build a\nspatial-temporal fusion module at the server to integrate features of the\ncurrent and previous frames for joint inference. 
Extensive experiments on video\nanalytics tasks evidence that the proposed framework effectively encodes\ntask-relevant information of video data and achieves a better rate-performance\ntradeoff than existing methods.\n","authors":["Jiawei Shao","Xinjie Zhang","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2211.14049v3.pdf","comment":"This paper was accepted to IEEE Transactions on Wireless\n Communications (TWC)"},{"id":"http://arxiv.org/abs/2404.01139v1","updated":"2024-04-01T14:34:47Z","published":"2024-04-01T14:34:47Z","title":"Structured Initialization for Attention in Vision Transformers","summary":" The training of vision transformer (ViT) networks on small-scale datasets\nposes a significant challenge. By contrast, convolutional neural networks\n(CNNs) have an architectural inductive bias enabling them to perform well on\nsuch problems. In this paper, we argue that the architectural bias inherent to\nCNNs can be reinterpreted as an initialization bias within ViT. This insight is\nsignificant as it empowers ViTs to perform equally well on small-scale problems\nwhile maintaining their flexibility for large-scale applications. Our\ninspiration for this ``structured'' initialization stems from our empirical\nobservation that random impulse filters can achieve comparable performance to\nlearned filters within CNNs. Our approach achieves state-of-the-art performance\nfor data-efficient ViT learning across numerous benchmarks including CIFAR-10,\nCIFAR-100, and SVHN.\n","authors":["Jianqiao Zheng","Xueqian Li","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2404.01139v1.pdf","comment":"20 pages, 5 figures, 8 tables"},{"id":"http://arxiv.org/abs/2404.01133v1","updated":"2024-04-01T14:24:40Z","published":"2024-04-01T14:24:40Z","title":"CityGaussian: Real-time High-quality Large-Scale Scene Rendering with\n Gaussians","summary":" The advancement of real-time 3D scene reconstruction and novel view synthesis\nhas been significantly propelled by 3D Gaussian Splatting (3DGS). However,\neffectively training large-scale 3DGS and rendering it in real-time across\nvarious scales remains challenging. This paper introduces CityGaussian\n(CityGS), which employs a novel divide-and-conquer training approach and\nLevel-of-Detail (LoD) strategy for efficient large-scale 3DGS training and\nrendering. Specifically, the global scene prior and adaptive training data\nselection enables efficient training and seamless fusion. Based on fused\nGaussian primitives, we generate different detail levels through compression,\nand realize fast rendering across various scales through the proposed\nblock-wise detail levels selection and aggregation strategy. Extensive\nexperimental results on large-scale scenes demonstrate that our approach\nattains state-of-theart rendering quality, enabling consistent real-time\nrendering of largescale scenes across vastly different scales. Our project page\nis available at https://dekuliutesla.github.io/citygs/.\n","authors":["Yang Liu","He Guan","Chuanchen Luo","Lue Fan","Junran Peng","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.01133v1.pdf","comment":"Project Page: https://dekuliutesla.github.io/citygs/"},{"id":"http://arxiv.org/abs/2404.01127v1","updated":"2024-04-01T14:06:48Z","published":"2024-04-01T14:06:48Z","title":"Medical Visual Prompting (MVP): A Unified Framework for Versatile and\n High-Quality Medical Image Segmentation","summary":" Accurate segmentation of lesion regions is crucial for clinical diagnosis and\ntreatment across various diseases. 
While deep convolutional networks have\nachieved satisfactory results in medical image segmentation, they face\nchallenges such as loss of lesion shape information due to continuous\nconvolution and downsampling, as well as the high cost of manually labeling\nlesions with varying shapes and sizes. To address these issues, we propose a\nnovel medical visual prompting (MVP) framework that leverages pre-training and\nprompting concepts from natural language processing (NLP). The framework\nutilizes three key components: Super-Pixel Guided Prompting (SPGP) for\nsuperpixelating the input image, Image Embedding Guided Prompting (IEGP) for\nfreezing patch embedding and merging with superpixels to provide visual\nprompts, and Adaptive Attention Mechanism Guided Prompting (AAGP) for\npinpointing prompt content and efficiently adapting all layers. By integrating\nSPGP, IEGP, and AAGP, the MVP enables the segmentation network to better learn\nshape prompting information and facilitates mutual learning across different\ntasks. Extensive experiments conducted on five datasets demonstrate superior\nperformance of this method in various challenging medical image tasks, while\nsimplifying single-task medical segmentation models. This novel framework\noffers improved performance with fewer parameters and holds significant\npotential for accurate segmentation of lesion regions in various medical tasks,\nmaking it clinically valuable.\n","authors":["Yulin Chen","Guoheng Huang","Kai Huang","Zijin Lin","Guo Zhong","Shenghong Luo","Jie Deng","Jian Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.01127v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01123v1","updated":"2024-04-01T13:57:46Z","published":"2024-04-01T13:57:46Z","title":"CLIPtone: Unsupervised Learning for Text-based Image Tone Adjustment","summary":" Recent image tone adjustment (or enhancement) approaches have predominantly\nadopted supervised learning for learning human-centric perceptual assessment.\nHowever, these approaches are constrained by intrinsic challenges of supervised\nlearning. Primarily, the requirement for expertly-curated or retouched images\nescalates the data acquisition expenses. Moreover, their coverage of target\nstyle is confined to stylistic variants inferred from the training data. To\nsurmount the above challenges, we propose an unsupervised learning-based\napproach for text-based image tone adjustment method, CLIPtone, that extends an\nexisting image enhancement method to accommodate natural language descriptions.\nSpecifically, we design a hyper-network to adaptively modulate the pretrained\nparameters of the backbone model based on text description. To assess whether\nthe adjusted image aligns with the text description without ground truth image,\nwe utilize CLIP, which is trained on a vast set of language-image pairs and\nthus encompasses knowledge of human perception. The major advantages of our\napproach are three fold: (i) minimal data collection expenses, (ii) support for\na range of adjustments, and (iii) the ability to handle novel text descriptions\nunseen in training. 
Our approach's efficacy is demonstrated through\ncomprehensive experiments, including a user study.\n","authors":["Hyeongmin Lee","Kyoungkook Kang","Jungseul Ok","Sunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2404.01123v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01121v1","updated":"2024-04-01T13:55:44Z","published":"2024-04-01T13:55:44Z","title":"CMT: Cross Modulation Transformer with Hybrid Loss for Pansharpening","summary":" Pansharpening aims to enhance remote sensing image (RSI) quality by merging\nhigh-resolution panchromatic (PAN) with multispectral (MS) images. However,\nprior techniques struggled to optimally fuse PAN and MS images for enhanced\nspatial and spectral information, due to a lack of a systematic framework\ncapable of effectively coordinating their individual strengths. In response, we\npresent the Cross Modulation Transformer (CMT), a pioneering method that\nmodifies the attention mechanism. This approach utilizes a robust modulation\ntechnique from signal processing, integrating it into the attention mechanism's\ncalculations. It dynamically tunes the weights of the carrier's value (V)\nmatrix according to the modulator's features, thus resolving historical\nchallenges and achieving a seamless integration of spatial and spectral\nattributes. Furthermore, considering that RSI exhibits large-scale features and\nedge details along with local textures, we crafted a hybrid loss function that\ncombines Fourier and wavelet transforms to effectively capture these\ncharacteristics, thereby enhancing both spatial and spectral accuracy in\npansharpening. Extensive experiments demonstrate our framework's superior\nperformance over existing state-of-the-art methods. The code will be publicly\navailable to encourage further research.\n","authors":["Wen-Jie Shu","Hong-Xia Dou","Rui Wen","Xiao Wu","Liang-Jian Deng"],"pdf_url":"https://arxiv.org/pdf/2404.01121v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01120v1","updated":"2024-04-01T13:55:40Z","published":"2024-04-01T13:55:40Z","title":"Motion Blur Decomposition with Cross-shutter Guidance","summary":" Motion blur is a frequently observed image artifact, especially under\ninsufficient illumination where exposure time has to be prolonged so as to\ncollect more photons for a bright enough image. Rather than simply removing\nsuch blurring effects, recent researches have aimed at decomposing a blurry\nimage into multiple sharp images with spatial and temporal coherence. Since\nmotion blur decomposition itself is highly ambiguous, priors from neighbouring\nframes or human annotation are usually needed for motion disambiguation. In\nthis paper, inspired by the complementary exposure characteristics of a global\nshutter (GS) camera and a rolling shutter (RS) camera, we propose to utilize\nthe ordered scanline-wise delay in a rolling shutter image to robustify motion\ndecomposition of a single blurry image. To evaluate this novel dual imaging\nsetting, we construct a triaxial system to collect realistic data, as well as a\ndeep network architecture that explicitly addresses temporal and contextual\ninformation through reciprocal branches for cross-shutter motion blur\ndecomposition. 
Experiment results have verified the effectiveness of our\nproposed algorithm, as well as the validity of our dual imaging setting.\n","authors":["Xiang Ji","Haiyang Jiang","Yinqiang Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.01120v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2312.06505v4","updated":"2024-04-01T13:52:30Z","published":"2023-12-11T16:31:55Z","title":"Grounded Question-Answering in Long Egocentric Videos","summary":" Existing approaches to video understanding, mainly designed for short videos\nfrom a third-person perspective, are limited in their applicability in certain\nfields, such as robotics. In this paper, we delve into open-ended\nquestion-answering (QA) in long, egocentric videos, which allows individuals or\nrobots to inquire about their own past visual experiences. This task presents\nunique challenges, including the complexity of temporally grounding queries\nwithin extensive video content, the high resource demands for precise data\nannotation, and the inherent difficulty of evaluating open-ended answers due to\ntheir ambiguous nature. Our proposed approach tackles these challenges by (i)\nintegrating query grounding and answering within a unified model to reduce\nerror propagation; (ii) employing large language models for efficient and\nscalable data synthesis; and (iii) introducing a close-ended QA task for\nevaluation, to manage answer ambiguity. Extensive experiments demonstrate the\neffectiveness of our method, which also achieves state-of-the-art performance\non the QaEgo4D and Ego4D-NLQ benchmarks. Code, data, and models are available\nat https://github.com/Becomebright/GroundVQA.\n","authors":["Shangzhe Di","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2312.06505v4.pdf","comment":"Accepted to CVPR 2024. Project website at https://dszdsz.cn/GroundVQA"},{"id":"http://arxiv.org/abs/2404.01102v1","updated":"2024-04-01T13:23:04Z","published":"2024-04-01T13:23:04Z","title":"Diffusion based Zero-shot Medical Image-to-Image Translation for Cross\n Modality Segmentation","summary":" Cross-modality image segmentation aims to segment the target modalities using\na method designed in the source modality. Deep generative models can translate\nthe target modality images into the source modality, thus enabling\ncross-modality segmentation. However, a vast body of existing cross-modality\nimage translation methods relies on supervised learning. In this work, we aim\nto address the challenge of zero-shot learning-based image translation tasks\n(extreme scenarios in the target modality is unseen in the training phase). To\nleverage generative learning for zero-shot cross-modality image segmentation,\nwe propose a novel unsupervised image translation method. The framework learns\nto translate the unseen source image to the target modality for image\nsegmentation by leveraging the inherent statistical consistency between\ndifferent modalities for diffusion guidance. Our framework captures identical\ncross-modality features in the statistical domain, offering diffusion guidance\nwithout relying on direct mappings between the source and target domains. This\nadvantage allows our method to adapt to changing source domains without the\nneed for retraining, making it highly practical when sufficient labeled source\ndomain data is not available. 
The proposed framework is validated in zero-shot\ncross-modality image segmentation tasks through empirical comparisons with\ninfluential generative models, including adversarial-based and diffusion-based\nmodels.\n","authors":["Zihao Wang","Yingyu Yang","Yuzhou Chen","Tingting Yuan","Maxime Sermesant","Herve Delingette"],"pdf_url":"https://arxiv.org/pdf/2404.01102v1.pdf","comment":"Neurips 2023 Diffusion Workshop"},{"id":"http://arxiv.org/abs/2404.01101v1","updated":"2024-04-01T13:21:05Z","published":"2024-04-01T13:21:05Z","title":"UFID: A Unified Framework for Input-level Backdoor Detection on\n Diffusion Models","summary":" Diffusion Models are vulnerable to backdoor attacks, where malicious\nattackers inject backdoors by poisoning some parts of the training samples\nduring the training stage. This poses a serious threat to the downstream users,\nwho query the diffusion models through the API or directly download them from\nthe internet. To mitigate the threat of backdoor attacks, there have been a\nplethora of investigations on backdoor detections. However, none of them\ndesigned a specialized backdoor detection method for diffusion models,\nrendering the area much under-explored. Moreover, these prior methods mainly\nfocus on the traditional neural networks in the classification task, which\ncannot be adapted to the backdoor detections on the generative task easily.\nAdditionally, most of the prior methods require white-box access to model\nweights and architectures, or the probability logits as additional information,\nwhich are not always practical. In this paper, we propose a Unified Framework\nfor Input-level backdoor Detection (UFID) on the diffusion models, which is\nmotivated by observations in the diffusion models and further validated with a\ntheoretical causality analysis. Extensive experiments across different datasets\non both conditional and unconditional diffusion models show that our method\nachieves a superb performance on detection effectiveness and run-time\nefficiency. The code is available at\nhttps://github.com/GuanZihan/official_UFID.\n","authors":["Zihan Guan","Mengxuan Hu","Sheng Li","Anil Vullikanti"],"pdf_url":"https://arxiv.org/pdf/2404.01101v1.pdf","comment":"20 pages,18 figures"},{"id":"http://arxiv.org/abs/2402.19231v2","updated":"2024-04-01T13:16:01Z","published":"2024-02-29T15:05:11Z","title":"CricaVPR: Cross-image Correlation-aware Representation Learning for\n Visual Place Recognition","summary":" Over the past decade, most methods in visual place recognition (VPR) have\nused neural networks to produce feature representations. These networks\ntypically produce a global representation of a place image using only this\nimage itself and neglect the cross-image variations (e.g. viewpoint and\nillumination), which limits their robustness in challenging scenes. In this\npaper, we propose a robust global representation method with cross-image\ncorrelation awareness for VPR, named CricaVPR. Our method uses the attention\nmechanism to correlate multiple images within a batch. These images can be\ntaken in the same place with different conditions or viewpoints, or even\ncaptured from different places. Therefore, our method can utilize the\ncross-image variations as a cue to guide the representation learning, which\nensures more robust features are produced. 
To further facilitate the\nrobustness, we propose a multi-scale convolution-enhanced adaptation method to\nadapt pre-trained visual foundation models to the VPR task, which introduces\nthe multi-scale local information to further enhance the cross-image\ncorrelation-aware representation. Experimental results show that our method\noutperforms state-of-the-art methods by a large margin with significantly less\ntraining time. The code is released at https://github.com/Lu-Feng/CricaVPR.\n","authors":["Feng Lu","Xiangyuan Lan","Lijun Zhang","Dongmei Jiang","Yaowei Wang","Chun Yuan"],"pdf_url":"https://arxiv.org/pdf/2402.19231v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2311.17049v2","updated":"2024-04-01T13:06:06Z","published":"2023-11-28T18:55:42Z","title":"MobileCLIP: Fast Image-Text Models through Multi-Modal Reinforced\n Training","summary":" Contrastive pretraining of image-text foundation models, such as CLIP,\ndemonstrated excellent zero-shot performance and improved robustness on a wide\nrange of downstream tasks. However, these models utilize large\ntransformer-based encoders with significant memory and latency overhead which\npose challenges for deployment on mobile devices. In this work, we introduce\nMobileCLIP -- a new family of efficient image-text models optimized for runtime\nperformance along with a novel and efficient training approach, namely\nmulti-modal reinforced training. The proposed training approach leverages\nknowledge transfer from an image captioning model and an ensemble of strong\nCLIP encoders to improve the accuracy of efficient models. Our approach avoids\ntrain-time compute overhead by storing the additional knowledge in a reinforced\ndataset. MobileCLIP sets a new state-of-the-art latency-accuracy tradeoff for\nzero-shot classification and retrieval tasks on several datasets. Our\nMobileCLIP-S2 variant is 2.3$\\times$ faster while more accurate compared to\nprevious best CLIP model based on ViT-B/16. We further demonstrate the\neffectiveness of our multi-modal reinforced training by training a CLIP model\nbased on ViT-B/16 image backbone and achieving +2.9% average performance\nimprovement on 38 evaluation benchmarks compared to the previous best.\nMoreover, we show that the proposed approach achieves 10$\\times$-1000$\\times$\nimproved learning efficiency when compared with non-reinforced CLIP training.\nCode and models are available at https://github.com/apple/ml-mobileclip .\n","authors":["Pavan Kumar Anasosalu Vasu","Hadi Pouransari","Fartash Faghri","Raviteja Vemulapalli","Oncel Tuzel"],"pdf_url":"https://arxiv.org/pdf/2311.17049v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01094v1","updated":"2024-04-01T12:59:49Z","published":"2024-04-01T12:59:49Z","title":"HairFastGAN: Realistic and Robust Hair Transfer with a Fast\n Encoder-Based Approach","summary":" Our paper addresses the complex task of transferring a hairstyle from a\nreference image to an input photo for virtual hair try-on. This task is\nchallenging due to the need to adapt to various photo poses, the sensitivity of\nhairstyles, and the lack of objective metrics. The current state of the art\nhairstyle transfer methods use an optimization process for different parts of\nthe approach, making them inexcusably slow. 
At the same time, faster\nencoder-based models are of very low quality because they either operate in\nStyleGAN's W+ space or use other low-dimensional image generators.\nAdditionally, both approaches have a problem with hairstyle transfer when the\nsource pose is very different from the target pose, because they either don't\nconsider the pose at all or deal with it inefficiently. In our paper, we\npresent the HairFast model, which uniquely solves these problems and achieves\nhigh resolution, near real-time performance, and superior reconstruction\ncompared to optimization problem-based methods. Our solution includes a new\narchitecture operating in the FS latent space of StyleGAN, an enhanced\ninpainting approach, and improved encoders for better alignment, color\ntransfer, and a new encoder for post-processing. The effectiveness of our\napproach is demonstrated on realism metrics after random hairstyle transfer and\nreconstruction when the original hairstyle is transferred. In the most\ndifficult scenario of transferring both shape and color of a hairstyle from\ndifferent images, our method performs in less than a second on the Nvidia V100.\nOur code is available at https://github.com/AIRI-Institute/HairFastGAN.\n","authors":["Maxim Nikolaev","Mikhail Kuznetsov","Dmitry Vetrov","Aibek Alanov"],"pdf_url":"https://arxiv.org/pdf/2404.01094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01089v1","updated":"2024-04-01T12:43:22Z","published":"2024-04-01T12:43:22Z","title":"Texture-Preserving Diffusion Models for High-Fidelity Virtual Try-On","summary":" Image-based virtual try-on is an increasingly important task for online\nshopping. It aims to synthesize images of a specific person wearing a specified\ngarment. Diffusion model-based approaches have recently become popular, as they\nare excellent at image synthesis tasks. However, these approaches usually\nemploy additional image encoders and rely on the cross-attention mechanism for\ntexture transfer from the garment to the person image, which affects the\ntry-on's efficiency and fidelity. To address these issues, we propose an\nTexture-Preserving Diffusion (TPD) model for virtual try-on, which enhances the\nfidelity of the results and introduces no additional image encoders.\nAccordingly, we make contributions from two aspects. First, we propose to\nconcatenate the masked person and reference garment images along the spatial\ndimension and utilize the resulting image as the input for the diffusion\nmodel's denoising UNet. This enables the original self-attention layers\ncontained in the diffusion model to achieve efficient and accurate texture\ntransfer. Second, we propose a novel diffusion-based method that predicts a\nprecise inpainting mask based on the person and reference garment images,\nfurther enhancing the reliability of the try-on results. In addition, we\nintegrate mask prediction and image synthesis into a single compact model. 
The\nexperimental results show that our approach can be applied to various try-on\ntasks, e.g., garment-to-person and person-to-person try-ons, and significantly\noutperforms state-of-the-art methods on popular VITON, VITON-HD databases.\n","authors":["Xu Yang","Changxing Ding","Zhibin Hong","Junhao Huang","Jin Tao","Xiangmin Xu"],"pdf_url":"https://arxiv.org/pdf/2404.01089v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01081v1","updated":"2024-04-01T12:21:56Z","published":"2024-04-01T12:21:56Z","title":"PhysReaction: Physically Plausible Real-Time Humanoid Reaction Synthesis\n via Forward Dynamics Guided 4D Imitation","summary":" Humanoid Reaction Synthesis is pivotal for creating highly interactive and\nempathetic robots that can seamlessly integrate into human environments,\nenhancing the way we live, work, and communicate. However, it is difficult to\nlearn the diverse interaction patterns of multiple humans and generate\nphysically plausible reactions. The kinematics-based approaches face\nchallenges, including issues like floating feet, sliding, penetration, and\nother problems that defy physical plausibility. The existing physics-based\nmethod often relies on kinematics-based methods to generate reference states,\nwhich struggle with the challenges posed by kinematic noise during action\nexecution. Constrained by their reliance on diffusion models, these methods are\nunable to achieve real-time inference. In this work, we propose a Forward\nDynamics Guided 4D Imitation method to generate physically plausible human-like\nreactions. The learned policy is capable of generating physically plausible and\nhuman-like reactions in real-time, significantly improving the speed(x33) and\nquality of reactions compared with the existing method. Our experiments on the\nInterHuman and Chi3D datasets, along with ablation studies, demonstrate the\neffectiveness of our approach.\n","authors":["Yunze Liu","Changxi Chen","Chenjing Ding","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2404.01081v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01079v1","updated":"2024-04-01T12:19:54Z","published":"2024-04-01T12:19:54Z","title":"Stale Diffusion: Hyper-realistic 5D Movie Generation Using Old-school\n Methods","summary":" Two years ago, Stable Diffusion achieved super-human performance at\ngenerating images with super-human numbers of fingers. Following the steady\ndecline of its technical novelty, we propose Stale Diffusion, a method that\nsolidifies and ossifies Stable Diffusion in a maximum-entropy state. Stable\nDiffusion works analogously to a barn (the Stable) from which an infinite set\nof horses have escaped (the Diffusion). As the horses have long left the barn,\nour proposal may be seen as antiquated and irrelevant. Nevertheless, we\nvigorously defend our claim of novelty by identifying as early adopters of the\nSlow Science Movement, which will produce extremely important pearls of wisdom\nin the future. Our speed of contributions can also be seen as a quasi-static\nimplementation of the recent call to pause AI experiments, which we\nwholeheartedly support. As a result of a careful archaeological expedition to\n18-months-old Git commit histories, we found that naturally-accumulating errors\nhave produced a novel entropy-maximising Stale Diffusion method, that can\nproduce sleep-inducing hyper-realistic 5D video that is as good as one's\nimagination.\n","authors":["Joao F. 
Henriques","Dylan Campbell","Tengda Han"],"pdf_url":"https://arxiv.org/pdf/2404.01079v1.pdf","comment":"SIGBOVIK 2024"},{"id":"http://arxiv.org/abs/2404.01074v1","updated":"2024-04-01T12:16:00Z","published":"2024-04-01T12:16:00Z","title":"Prompt Learning for Oriented Power Transmission Tower Detection in\n High-Resolution SAR Images","summary":" Detecting transmission towers from synthetic aperture radar (SAR) images\nremains a challenging task due to the comparatively small size and side-looking\ngeometry, with background clutter interference frequently hindering tower\nidentification. A large number of interfering signals superimposes the return\nsignal from the tower. We found that localizing or prompting positions of power\ntransmission towers is beneficial to address this obstacle. Based on this\nrevelation, this paper introduces prompt learning into the oriented object\ndetector (P2Det) for multimodal information learning. P2Det contains the sparse\nprompt coding and cross-attention between the multimodal data. Specifically,\nthe sparse prompt encoder (SPE) is proposed to represent point locations,\nconverting prompts into sparse embeddings. The image embeddings are generated\nthrough the Transformer layers. Then a two-way fusion module (TWFM) is proposed\nto calculate the cross-attention of the two different embeddings. The\ninteraction of image-level and prompt-level features is utilized to address the\nclutter interference. A shape-adaptive refinement module (SARM) is proposed to\nreduce the effect of aspect ratio. Extensive experiments demonstrated the\neffectiveness of the proposed model on high-resolution SAR images. P2Det\nprovides a novel insight for multimodal object detection due to its competitive\nperformance.\n","authors":["Tianyang Li","Chao Wang","Hong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.01074v1.pdf","comment":"22 pages, 12figures"},{"id":"http://arxiv.org/abs/2404.01065v1","updated":"2024-04-01T11:57:40Z","published":"2024-04-01T11:57:40Z","title":"T-Mamba: Frequency-Enhanced Gated Long-Range Dependency for Tooth 3D\n CBCT Segmentation","summary":" Efficient tooth segmentation in three-dimensional (3D) imaging, critical for\northodontic diagnosis, remains challenging due to noise, low contrast, and\nartifacts in CBCT images. Both convolutional Neural Networks (CNNs) and\ntransformers have emerged as popular architectures for image segmentation.\nHowever, their efficacy in handling long-range dependencies is limited due to\ninherent locality or computational complexity. To address this issue, we\npropose T-Mamba, integrating shared positional encoding and frequency-based\nfeatures into vision mamba, to address limitations in spatial position\npreservation and feature enhancement in frequency domain. Besides, we also\ndesign a gate selection unit to integrate two features in spatial domain and\none feature in frequency domain adaptively. T-Mamba is the first work to\nintroduce frequency-based features into vision mamba. Extensive experiments\ndemonstrate that T-Mamba achieves new SOTA results on the public Tooth CBCT\ndataset and outperforms previous SOTA methods by a large margin, i.e., IoU +\n3.63%, SO + 2.43%, DSC +2.30%, HD -4.39mm, and ASSD -0.37mm. 
The code and\nmodels are publicly available at https://github.com/isbrycee/T-Mamba.\n","authors":["Jing Hao","Lei He","Kuo Feng Hung"],"pdf_url":"https://arxiv.org/pdf/2404.01065v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01064v1","updated":"2024-04-01T11:57:34Z","published":"2024-04-01T11:57:34Z","title":"Roadside Monocular 3D Detection via 2D Detection Prompting","summary":" The problem of roadside monocular 3D detection requires detecting objects of\ninterested classes in a 2D RGB frame and predicting their 3D information such\nas locations in bird's-eye-view (BEV). It has broad applications in traffic\ncontrol, vehicle-vehicle communication, and vehicle-infrastructure cooperative\nperception. To approach this problem, we present a novel and simple method by\nprompting the 3D detector using 2D detections. Our method builds on a key\ninsight that, compared with 3D detectors, a 2D detector is much easier to train\nand performs significantly better w.r.t detections on the 2D image plane. That\nsaid, one can exploit 2D detections of a well-trained 2D detector as prompts to\na 3D detector, being trained in a way of inflating such 2D detections to 3D\ntowards 3D detection. To construct better prompts using the 2D detector, we\nexplore three techniques: (a) concatenating both 2D and 3D detectors' features,\n(b) attentively fusing 2D and 3D detectors' features, and (c) encoding\npredicted 2D boxes x, y, width, height, label and attentively fusing such with\nthe 3D detector's features. Surprisingly, the third performs the best.\nMoreover, we present a yaw tuning tactic and a class-grouping strategy that\nmerges classes based on their functionality; these techniques improve 3D\ndetection performance further. Comprehensive ablation studies and extensive\nexperiments demonstrate that our method resoundingly outperforms prior works,\nachieving the state-of-the-art on two large-scale roadside 3D detection\nbenchmarks.\n","authors":["Yechi Ma","Shuoquan Wei","Churun Zhang","Wei Hua","Yanan Li","Shu Kong"],"pdf_url":"https://arxiv.org/pdf/2404.01064v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03377v2","updated":"2024-04-01T11:55:46Z","published":"2023-06-06T03:37:41Z","title":"TextFormer: A Query-based End-to-End Text Spotter with Mixed Supervision","summary":" End-to-end text spotting is a vital computer vision task that aims to\nintegrate scene text detection and recognition into a unified framework.\nTypical methods heavily rely on Region-of-Interest (RoI) operations to extract\nlocal features and complex post-processing steps to produce final predictions.\nTo address these limitations, we propose TextFormer, a query-based end-to-end\ntext spotter with Transformer architecture. Specifically, using query embedding\nper text instance, TextFormer builds upon an image encoder and a text decoder\nto learn a joint semantic understanding for multi-task modeling. It allows for\nmutual training and optimization of classification, segmentation, and\nrecognition branches, resulting in deeper feature sharing without sacrificing\nflexibility or simplicity. Additionally, we design an Adaptive Global\naGgregation (AGG) module to transfer global features into sequential features\nfor reading arbitrarily-shaped texts, which overcomes the sub-optimization\nproblem of RoI operations. 
Furthermore, potential corpus information is\nutilized from weak annotations to full labels through mixed supervision,\nfurther improving text detection and end-to-end text spotting results.\nExtensive experiments on various bilingual (i.e., English and Chinese)\nbenchmarks demonstrate the superiority of our method. Especially on TDA-ReCTS\ndataset, TextFormer surpasses the state-of-the-art method in terms of 1-NED by\n13.2%.\n","authors":["Yukun Zhai","Xiaoqiang Zhang","Xiameng Qin","Sanyuan Zhao","Xingping Dong","Jianbing Shen"],"pdf_url":"https://arxiv.org/pdf/2306.03377v2.pdf","comment":"Machine Intelligence Research, MIR 2024"},{"id":"http://arxiv.org/abs/2312.08568v2","updated":"2024-04-01T11:49:22Z","published":"2023-12-13T23:41:17Z","title":"NViST: In the Wild New View Synthesis from a Single Image with\n Transformers","summary":" We propose NViST, a transformer-based model for efficient and generalizable\nnovel-view synthesis from a single image for real-world scenes. In contrast to\nmany methods that are trained on synthetic data, object-centred scenarios, or\nin a category-specific manner, NViST is trained on MVImgNet, a large-scale\ndataset of casually-captured real-world videos of hundreds of object categories\nwith diverse backgrounds. NViST transforms image inputs directly into a\nradiance field, conditioned on camera parameters via adaptive layer\nnormalisation. In practice, NViST exploits fine-tuned masked autoencoder (MAE)\nfeatures and translates them to 3D output tokens via cross-attention, while\naddressing occlusions with self-attention. To move away from object-centred\ndatasets and enable full scene synthesis, NViST adopts a 6-DOF camera pose\nmodel and only requires relative pose, dropping the need for canonicalization\nof the training data, which removes a substantial barrier to it being used on\ncasually captured datasets. We show results on unseen objects and categories\nfrom MVImgNet and even generalization to casual phone captures. We conduct\nqualitative and quantitative evaluations on MVImgNet and ShapeNet to show that\nour model represents a step forward towards enabling true in-the-wild\ngeneralizable novel-view synthesis from a single image. Project webpage:\nhttps://wbjang.github.io/nvist_webpage.\n","authors":["Wonbong Jang","Lourdes Agapito"],"pdf_url":"https://arxiv.org/pdf/2312.08568v2.pdf","comment":"CVPR 2024, Project page: https://wbjang.github.io/nvist_webpage"},{"id":"http://arxiv.org/abs/2311.18608v2","updated":"2024-04-01T11:44:25Z","published":"2023-11-30T15:06:10Z","title":"Contrastive Denoising Score for Text-guided Latent Diffusion Image\n Editing","summary":" With the remarkable advent of text-to-image diffusion models, image editing\nmethods have become more diverse and continue to evolve. A promising recent\napproach in this realm is Delta Denoising Score (DDS) - an image editing\ntechnique based on Score Distillation Sampling (SDS) framework that leverages\nthe rich generative prior of text-to-image diffusion models. However, relying\nsolely on the difference between scoring functions is insufficient for\npreserving specific structural elements from the original image, a crucial\naspect of image editing. To address this, here we present an embarrassingly\nsimple yet very powerful modification of DDS, called Contrastive Denoising\nScore (CDS), for latent diffusion models (LDM). 
Inspired by the similarities\nand differences between DDS and the contrastive learning for unpaired\nimage-to-image translation(CUT), we introduce a straightforward approach using\nCUT loss within the DDS framework. Rather than employing auxiliary networks as\nin the original CUT approach, we leverage the intermediate features of LDM,\nspecifically those from the self-attention layers, which possesses rich spatial\ninformation. Our approach enables zero-shot image-to-image translation and\nneural radiance field (NeRF) editing, achieving structural correspondence\nbetween the input and output while maintaining content controllability.\nQualitative results and comparisons demonstrates the effectiveness of our\nproposed method. Project page: https://hyelinnam.github.io/CDS/\n","authors":["Hyelin Nam","Gihyun Kwon","Geon Yeong Park","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2311.18608v2.pdf","comment":"CVPR 2024 (poster); Project page: https://hyelinnam.github.io/CDS/"},{"id":"http://arxiv.org/abs/2312.14457v2","updated":"2024-04-01T11:42:43Z","published":"2023-12-22T06:15:03Z","title":"QUAR-VLA: Vision-Language-Action Model for Quadruped Robots","summary":" The important manifestation of robot intelligence is the ability to naturally\ninteract and autonomously make decisions. Traditional approaches to robot\ncontrol often compartmentalize perception, planning, and decision-making,\nsimplifying system design but limiting the synergy between different\ninformation streams. This compartmentalization poses challenges in achieving\nseamless autonomous reasoning, decision-making, and action execution. To\naddress these limitations, a novel paradigm, named Vision-Language-Action tasks\nfor QUAdruped Robots (QUAR-VLA), has been introduced in this paper. This\napproach tightly integrates visual information and instructions to generate\nexecutable actions, effectively merging perception, planning, and\ndecision-making. The central idea is to elevate the overall intelligence of the\nrobot. Within this framework, a notable challenge lies in aligning fine-grained\ninstructions with visual perception information. This emphasizes the complexity\ninvolved in ensuring that the robot accurately interprets and acts upon\ndetailed instructions in harmony with its visual observations. Consequently, we\npropose QUAdruped Robotic Transformer (QUART), a family of VLA models to\nintegrate visual information and instructions from diverse modalities as input\nand generates executable actions for real-world robots and present QUAdruped\nRobot Dataset (QUARD), a large-scale multi-task dataset including navigation,\ncomplex terrain locomotion, and whole-body manipulation tasks for training\nQUART models. Our extensive evaluation (4000 evaluation trials) shows that our\napproach leads to performant robotic policies and enables QUART to obtain a\nrange of emergent capabilities.\n","authors":["Pengxiang Ding","Han Zhao","Zhitao Wang","Zhenyu Wei","Shangke Lyu","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.14457v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01053v1","updated":"2024-04-01T11:23:38Z","published":"2024-04-01T11:23:38Z","title":"HAHA: Highly Articulated Gaussian Human Avatars with Textured Mesh Prior","summary":" We present HAHA - a novel approach for animatable human avatar generation\nfrom monocular input videos. The proposed method relies on learning the\ntrade-off between the use of Gaussian splatting and a textured mesh for\nefficient and high fidelity rendering. 
We demonstrate its efficiency to animate\nand render full-body human avatars controlled via the SMPL-X parametric model.\nOur model learns to apply Gaussian splatting only in areas of the SMPL-X mesh\nwhere it is necessary, like hair and out-of-mesh clothing. This results in a\nminimal number of Gaussians being used to represent the full avatar, and\nreduced rendering artifacts. This allows us to handle the animation of small\nbody parts such as fingers that are traditionally disregarded. We demonstrate\nthe effectiveness of our approach on two open datasets: SnapshotPeople and\nX-Humans. Our method demonstrates on par reconstruction quality to the\nstate-of-the-art on SnapshotPeople, while using less than a third of Gaussians.\nHAHA outperforms previous state-of-the-art on novel poses from X-Humans both\nquantitatively and qualitatively.\n","authors":["David Svitov","Pietro Morerio","Lourdes Agapito","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2404.01053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01051v1","updated":"2024-04-01T11:12:06Z","published":"2024-04-01T11:12:06Z","title":"Action Detection via an Image Diffusion Process","summary":" Action detection aims to localize the starting and ending points of action\ninstances in untrimmed videos, and predict the classes of those instances. In\nthis paper, we make the observation that the outputs of the action detection\ntask can be formulated as images. Thus, from a novel perspective, we tackle\naction detection via a three-image generation process to generate starting\npoint, ending point and action-class predictions as images via our proposed\nAction Detection Image Diffusion (ADI-Diff) framework. Furthermore, since our\nimages differ from natural images and exhibit special properties, we further\nexplore a Discrete Action-Detection Diffusion Process and a Row-Column\nTransformer design to better handle their processing. Our ADI-Diff framework\nachieves state-of-the-art results on two widely-used datasets.\n","authors":["Lin Geng Foo","Tianjiao Li","Hossein Rahmani","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2404.01051v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01050v1","updated":"2024-04-01T11:09:40Z","published":"2024-04-01T11:09:40Z","title":"Drag Your Noise: Interactive Point-based Editing via Diffusion Semantic\n Propagation","summary":" Point-based interactive editing serves as an essential tool to complement the\ncontrollability of existing generative models. A concurrent work,\nDragDiffusion, updates the diffusion latent map in response to user inputs,\ncausing global latent map alterations. This results in imprecise preservation\nof the original content and unsuccessful editing due to gradient vanishing. In\ncontrast, we present DragNoise, offering robust and accelerated editing without\nretracing the latent map. The core rationale of DragNoise lies in utilizing the\npredicted noise output of each U-Net as a semantic editor. This approach is\ngrounded in two critical observations: firstly, the bottleneck features of\nU-Net inherently possess semantically rich features ideal for interactive\nediting; secondly, high-level semantics, established early in the denoising\nprocess, show minimal variation in subsequent stages. Leveraging these\ninsights, DragNoise edits diffusion semantics in a single denoising step and\nefficiently propagates these changes, ensuring stability and efficiency in\ndiffusion editing. 
Comparative experiments reveal that DragNoise achieves\nsuperior control and semantic retention, reducing the optimization time by over\n50% compared to DragDiffusion. Our codes are available at\nhttps://github.com/haofengl/DragNoise.\n","authors":["Haofeng Liu","Chenshu Xu","Yifei Yang","Lihua Zeng","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2404.01050v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01036v1","updated":"2024-04-01T10:43:50Z","published":"2024-04-01T10:43:50Z","title":"Higher education assessment practice in the era of generative AI tools","summary":" The higher education (HE) sector benefits every nation's economy and society\nat large. However, their contributions are challenged by advanced technologies\nlike generative artificial intelligence (GenAI) tools. In this paper, we\nprovide a comprehensive assessment of GenAI tools towards assessment and\npedagogic practice and, subsequently, discuss the potential impacts. This study\nexperimented using three assessment instruments from data science, data\nanalytics, and construction management disciplines. Our findings are two-fold:\nfirst, the findings revealed that GenAI tools exhibit subject knowledge,\nproblem-solving, analytical, critical thinking, and presentation skills and\nthus can limit learning when used unethically. Secondly, the design of the\nassessment of certain disciplines revealed the limitations of the GenAI tools.\nBased on our findings, we made recommendations on how AI tools can be utilised\nfor teaching and learning in HE.\n","authors":["Bayode Ogunleye","Kudirat Ibilola Zakariyyah","Oluwaseun Ajao","Olakunle Olayinka","Hemlata Sharma"],"pdf_url":"https://arxiv.org/pdf/2404.01036v1.pdf","comment":"11 pages, 7 tables published in the Journal of Applied Learning &\n Teaching"},{"id":"http://arxiv.org/abs/2404.01024v1","updated":"2024-04-01T10:08:23Z","published":"2024-04-01T10:08:23Z","title":"AIGCOIQA2024: Perceptual Quality Assessment of AI Generated\n Omnidirectional Images","summary":" In recent years, the rapid advancement of Artificial Intelligence Generated\nContent (AIGC) has attracted widespread attention. Among the AIGC, AI generated\nomnidirectional images hold significant potential for Virtual Reality (VR) and\nAugmented Reality (AR) applications, hence omnidirectional AIGC techniques have\nalso been widely studied. AI-generated omnidirectional images exhibit unique\ndistortions compared to natural omnidirectional images, however, there is no\ndedicated Image Quality Assessment (IQA) criteria for assessing them. This\nstudy addresses this gap by establishing a large-scale AI generated\nomnidirectional image IQA database named AIGCOIQA2024 and constructing a\ncomprehensive benchmark. We first generate 300 omnidirectional images based on\n5 AIGC models utilizing 25 text prompts. A subjective IQA experiment is\nconducted subsequently to assess human visual preferences from three\nperspectives including quality, comfortability, and correspondence. Finally, we\nconduct a benchmark experiment to evaluate the performance of state-of-the-art\nIQA models on our database. 
The database will be released to facilitate future\nresearch.\n","authors":["Liu Yang","Huiyu Duan","Long Teng","Yucheng Zhu","Xiaohong Liu","Menghan Hu","Xiongkuo Min","Guangtao Zhai","Patrick Le Callet"],"pdf_url":"https://arxiv.org/pdf/2404.01024v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00343v5","updated":"2024-04-01T09:49:27Z","published":"2023-12-01T04:35:47Z","title":"OpenStereo: A Comprehensive Benchmark for Stereo Matching and Strong\n Baseline","summary":" Stereo matching aims to estimate the disparity between matching pixels in a\nstereo image pair, which is of great importance to robotics, autonomous\ndriving, and other computer vision tasks. Despite the development of numerous\nimpressive methods in recent years, replicating their results and determining\nthe most suitable architecture for practical application remains challenging.\nAddressing this gap, our paper introduces a comprehensive benchmark focusing on\npractical applicability rather than solely on performance enhancement.\nSpecifically, we develop a flexible and efficient stereo matching codebase,\ncalled OpenStereo. OpenStereo includes training and inference codes of more\nthan 10 network models, making it, to our knowledge, the most complete stereo\nmatching toolbox available. Based on OpenStereo, we conducted experiments and\nhave achieved or surpassed the performance metrics reported in the original\npaper. Additionally, we carry out an exhaustive analysis and deconstruction of\nrecent developments in stereo matching through comprehensive ablative\nexperiments. These investigations inspired the creation of StereoBase, a strong\nbaseline model. Our StereoBase ranks 1st on SceneFlow, KITTI 2015, 2012\n(Reflective) among published methods and achieves the best performance across\nall metrics. In addition, StereoBase has strong cross-dataset\ngeneralization.Code is available at\n\\url{https://github.com/XiandaGuo/OpenStereo}.\n","authors":["Xianda Guo","Juntao Lu","Chenming Zhang","Yiqi Wang","Yiqun Duan","Tian Yang","Zheng Zhu","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2312.00343v5.pdf","comment":"Code is available at: https://github.com/XiandaGuo/OpenStereo"},{"id":"http://arxiv.org/abs/2308.14418v4","updated":"2024-04-01T09:35:30Z","published":"2023-08-28T08:54:27Z","title":"Multiscale and Multilayer Contrastive Learning for Domain Generalization","summary":" During the past decade, deep neural networks have led to fast-paced progress\nand significant achievements in computer vision problems, for both academia and\nindustry. Yet despite their success, state-of-the-art image classification\napproaches fail to generalize well in previously unseen visual contexts, as\nrequired by many real-world applications. In this paper, we focus on this\ndomain generalization (DG) problem and argue that the generalization ability of\ndeep convolutional neural networks can be improved by taking advantage of\nmulti-layer and multi-scaled representations of the network. We introduce a\nframework that aims at improving domain generalization of image classifiers by\ncombining both low-level and high-level features at multiple scales, enabling\nthe network to implicitly disentangle representations in its latent space and\nlearn domain-invariant attributes of the depicted objects. 
Additionally, to\nfurther facilitate robust representation learning, we propose a novel objective\nfunction, inspired by contrastive learning, which aims at constraining the\nextracted representations to remain invariant under distribution shifts. We\ndemonstrate the effectiveness of our method by evaluating on the domain\ngeneralization datasets of PACS, VLCS, Office-Home and NICO. Through extensive\nexperimentation, we show that our model is able to surpass the performance of\nprevious DG methods and consistently produce competitive and state-of-the-art\nresults in all datasets\n","authors":["Aristotelis Ballas","Christos Diou"],"pdf_url":"https://arxiv.org/pdf/2308.14418v4.pdf","comment":"Manuscript accepted in: IEEE Transactions on Artificial Intelligence\n (March 2024)"},{"id":"http://arxiv.org/abs/2404.01014v1","updated":"2024-04-01T09:34:55Z","published":"2024-04-01T09:34:55Z","title":"Harnessing Large Language Models for Training-free Video Anomaly\n Detection","summary":" Video anomaly detection (VAD) aims to temporally locate abnormal events in a\nvideo. Existing works mostly rely on training deep models to learn the\ndistribution of normality with either video-level supervision, one-class\nsupervision, or in an unsupervised setting. Training-based methods are prone to\nbe domain-specific, thus being costly for practical deployment as any domain\nchange will involve data collection and model training. In this paper, we\nradically depart from previous efforts and propose LAnguage-based VAD (LAVAD),\na method tackling VAD in a novel, training-free paradigm, exploiting the\ncapabilities of pre-trained large language models (LLMs) and existing\nvision-language models (VLMs). We leverage VLM-based captioning models to\ngenerate textual descriptions for each frame of any test video. With the\ntextual scene description, we then devise a prompting mechanism to unlock the\ncapability of LLMs in terms of temporal aggregation and anomaly score\nestimation, turning LLMs into an effective video anomaly detector. We further\nleverage modality-aligned VLMs and propose effective techniques based on\ncross-modal similarity for cleaning noisy captions and refining the LLM-based\nanomaly scores. We evaluate LAVAD on two large datasets featuring real-world\nsurveillance scenarios (UCF-Crime and XD-Violence), showing that it outperforms\nboth unsupervised and one-class methods without requiring any training or data\ncollection.\n","authors":["Luca Zanella","Willi Menapace","Massimiliano Mancini","Yiming Wang","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2404.01014v1.pdf","comment":"CVPR 2024. Project website at https://lucazanella.github.io/lavad/"},{"id":"http://arxiv.org/abs/2404.01013v1","updated":"2024-04-01T09:34:51Z","published":"2024-04-01T09:34:51Z","title":"Teeth-SEG: An Efficient Instance Segmentation Framework for Orthodontic\n Treatment based on Anthropic Prior Knowledge","summary":" Teeth localization, segmentation, and labeling in 2D images have great\npotential in modern dentistry to enhance dental diagnostics, treatment\nplanning, and population-based studies on oral health. However, general\ninstance segmentation frameworks are incompetent due to 1) the subtle\ndifferences between some teeth' shapes (e.g., maxillary first premolar and\nsecond premolar), 2) the teeth's position and shape variation across subjects,\nand 3) the presence of abnormalities in the dentition (e.g., caries and\nedentulism). 
To address these problems, we propose a ViT-based framework named\nTeethSEG, which consists of stacked Multi-Scale Aggregation (MSA) blocks and an\nAnthropic Prior Knowledge (APK) layer. Specifically, to compose the two\nmodules, we design 1) a unique permutation-based upscaler to ensure high\nefficiency while establishing clear segmentation boundaries with 2) multi-head\nself/cross-gating layers to emphasize particular semantics meanwhile\nmaintaining the divergence between token embeddings. Besides, we collect 3) the\nfirst open-sourced intraoral image dataset IO150K, which comprises over 150k\nintraoral photos, and all photos are annotated by orthodontists using a\nhuman-machine hybrid algorithm. Experiments on IO150K demonstrate that our\nTeethSEG outperforms the state-of-the-art segmentation models on dental image\nsegmentation.\n","authors":["Bo Zou","Shaofeng Wang","Hao Liu","Gaoyue Sun","Yajie Wang","FeiFei Zuo","Chengbin Quan","Youjian Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.01013v1.pdf","comment":"This paper has been accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2305.15357v5","updated":"2024-04-01T09:29:49Z","published":"2023-05-24T17:09:54Z","title":"Solving Diffusion ODEs with Optimal Boundary Conditions for Better Image\n Super-Resolution","summary":" Diffusion models, as a kind of powerful generative model, have given\nimpressive results on image super-resolution (SR) tasks. However, due to the\nrandomness introduced in the reverse process of diffusion models, the\nperformances of diffusion-based SR models are fluctuating at every time of\nsampling, especially for samplers with few resampled steps. This inherent\nrandomness of diffusion models results in ineffectiveness and instability,\nmaking it challenging for users to guarantee the quality of SR results.\nHowever, our work takes this randomness as an opportunity: fully analyzing and\nleveraging it leads to the construction of an effective plug-and-play sampling\nmethod that owns the potential to benefit a series of diffusion-based SR\nmethods. More in detail, we propose to steadily sample high-quality SR images\nfrom pre-trained diffusion-based SR models by solving diffusion ordinary\ndifferential equations (diffusion ODEs) with optimal boundary conditions (BCs)\nand analyze the characteristics between the choices of BCs and their\ncorresponding SR results. Our analysis shows the route to obtain an\napproximately optimal BC via an efficient exploration in the whole space. The\nquality of SR results sampled by the proposed method with fewer steps\noutperforms the quality of results sampled by current methods with randomness\nfrom the same pre-trained diffusion-based SR model, which means that our\nsampling method \"boosts\" current diffusion-based SR models without any\nadditional training.\n","authors":["Yiyang Ma","Huan Yang","Wenhan Yang","Jianlong Fu","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2305.15357v5.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2403.11270v2","updated":"2024-04-01T09:11:13Z","published":"2024-03-17T16:48:46Z","title":"Bilateral Propagation Network for Depth Completion","summary":" Depth completion aims to derive a dense depth map from sparse depth\nmeasurements with a synchronized color image. Current state-of-the-art (SOTA)\nmethods are predominantly propagation-based, which work as an iterative\nrefinement on the initial estimated dense depth. 
However, the initial depth\nestimations mostly result from direct applications of convolutional layers on\nthe sparse depth map. In this paper, we present a Bilateral Propagation Network\n(BP-Net), that propagates depth at the earliest stage to avoid directly\nconvolving on sparse data. Specifically, our approach propagates the target\ndepth from nearby depth measurements via a non-linear model, whose coefficients\nare generated through a multi-layer perceptron conditioned on both\n\\emph{radiometric difference} and \\emph{spatial distance}. By integrating\nbilateral propagation with multi-modal fusion and depth refinement in a\nmulti-scale framework, our BP-Net demonstrates outstanding performance on both\nindoor and outdoor scenes. It achieves SOTA on the NYUv2 dataset and ranks 1st\non the KITTI depth completion benchmark at the time of submission. Experimental\nresults not only show the effectiveness of bilateral propagation but also\nemphasize the significance of early-stage propagation in contrast to the\nrefinement stage. Our code and trained models will be available on the project\npage.\n","authors":["Jie Tang","Fei-Peng Tian","Boshi An","Jian Li","Ping Tan"],"pdf_url":"https://arxiv.org/pdf/2403.11270v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2312.04466v2","updated":"2024-04-01T08:56:59Z","published":"2023-12-07T17:39:25Z","title":"Emotional Speech-driven 3D Body Animation via Disentangled Latent\n Diffusion","summary":" Existing methods for synthesizing 3D human gestures from speech have shown\npromising results, but they do not explicitly model the impact of emotions on\nthe generated gestures. Instead, these methods directly output animations from\nspeech without control over the expressed emotion. To address this limitation,\nwe present AMUSE, an emotional speech-driven body animation model based on\nlatent diffusion. Our observation is that content (i.e., gestures related to\nspeech rhythm and word utterances), emotion, and personal style are separable.\nTo account for this, AMUSE maps the driving audio to three disentangled latent\nvectors: one for content, one for emotion, and one for personal style. A latent\ndiffusion model, trained to generate gesture motion sequences, is then\nconditioned on these latent vectors. Once trained, AMUSE synthesizes 3D human\ngestures directly from speech with control over the expressed emotions and\nstyle by combining the content from the driving speech with the emotion and\nstyle of another speech sequence. Randomly sampling the noise of the diffusion\nmodel further generates variations of the gesture with the same emotional\nexpressivity. Qualitative, quantitative, and perceptual evaluations demonstrate\nthat AMUSE outputs realistic gesture sequences. Compared to the state of the\nart, the generated gestures are better synchronized with the speech content,\nand better represent the emotion expressed by the input speech. Our code is\navailable at amuse.is.tue.mpg.de.\n","authors":["Kiran Chhatre","Radek Daněček","Nikos Athanasiou","Giorgio Becherini","Christopher Peters","Michael J. 
Black","Timo Bolkart"],"pdf_url":"https://arxiv.org/pdf/2312.04466v2.pdf","comment":"Conference on Computer Vision and Pattern Recognition (CVPR) 2024.\n Webpage: https://amuse.is.tue.mpg.de/"},{"id":"http://arxiv.org/abs/2404.00994v1","updated":"2024-04-01T08:44:11Z","published":"2024-04-01T08:44:11Z","title":"AMOR: Ambiguous Authorship Order","summary":" As we all know, writing scientific papers together with our beloved\ncolleagues is a truly remarkable experience (partially): endless discussions\nabout the same useless paragraph over and over again, followed by long days and\nlong nights -- both at the same time. What a wonderful ride it is! What a\nbeautiful life we have. But wait, there's one tiny little problem that utterly\nshatters the peace, turning even renowned scientists into bloodthirsty\nmonsters: author order. The reason is that, contrary to widespread opinion,\nit's not the font size that matters, but the way things are ordered. Of course,\nthis is a fairly well-known fact among scientists all across the planet (and\nbeyond) and explains clearly why we regularly have to read about yet another\nescalated paper submission in local police reports.\n In this paper, we take an important step backwards to tackle this issue by\nsolving the so-called author ordering problem (AOP) once and for all.\nSpecifically, we propose AMOR, a system that replaces silly constructs like\nco-first or co-middle authorship with a simple yet easy probabilistic approach\nbased on random shuffling of the author list at viewing time. In addition to\nAOP, we also solve the ambiguous author ordering citation problem} (AAOCP) on\nthe fly. Stop author violence, be human.\n","authors":["Maximilian Weiherer","Andreea Dogaru","Shreya Kapoor","Hannah Schieber","Bernhard Egger"],"pdf_url":"https://arxiv.org/pdf/2404.00994v1.pdf","comment":"SIGBOVIK '24 submission"},{"id":"http://arxiv.org/abs/2404.00992v1","updated":"2024-04-01T08:37:57Z","published":"2024-04-01T08:37:57Z","title":"SGCNeRF: Few-Shot Neural Rendering via Sparse Geometric Consistency\n Guidance","summary":" Neural Radiance Field (NeRF) technology has made significant strides in\ncreating novel viewpoints. However, its effectiveness is hampered when working\nwith sparsely available views, often leading to performance dips due to\noverfitting. FreeNeRF attempts to overcome this limitation by integrating\nimplicit geometry regularization, which incrementally improves both geometry\nand textures. Nonetheless, an initial low positional encoding bandwidth results\nin the exclusion of high-frequency elements. The quest for a holistic approach\nthat simultaneously addresses overfitting and the preservation of\nhigh-frequency details remains ongoing. This study introduces a novel feature\nmatching based sparse geometry regularization module. This module excels in\npinpointing high-frequency keypoints, thereby safeguarding the integrity of\nfine details. Through progressive refinement of geometry and textures across\nNeRF iterations, we unveil an effective few-shot neural rendering architecture,\ndesignated as SGCNeRF, for enhanced novel view synthesis. 
Our experiments\ndemonstrate that SGCNeRF not only achieves superior geometry-consistent\noutcomes but also surpasses FreeNeRF, with improvements of 0.7 dB and 0.6 dB in\nPSNR on the LLFF and DTU datasets, respectively.\n","authors":["Yuru Xiao","Xianming Liu","Deming Zhai","Kui Jiang","Junjun Jiang","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2404.00992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00989v1","updated":"2024-04-01T08:34:42Z","published":"2024-04-01T08:34:42Z","title":"360+x: A Panoptic Multi-modal Scene Understanding Dataset","summary":" Human perception of the world is shaped by a multitude of viewpoints and\nmodalities. While many existing datasets focus on scene understanding from a\ncertain perspective (e.g. egocentric or third-person views), our dataset offers\na panoptic perspective (i.e. multiple viewpoints with multiple data\nmodalities). Specifically, we encapsulate third-person panoramic and front\nviews, as well as egocentric monocular/binocular views with rich modalities\nincluding video, multi-channel audio, directional binaural delay, location data\nand textual scene descriptions within each scene captured, presenting\ncomprehensive observation of the world. Figure 1 offers a glimpse of all 28\nscene categories of our 360+x dataset. To the best of our knowledge, this is\nthe first database that covers multiple viewpoints with multiple data\nmodalities to mimic how daily information is accessed in the real world.\nThrough our benchmark analysis, we presented 5 different scene understanding\ntasks on the proposed 360+x dataset to evaluate the impact and benefit of each\ndata modality and perspective in panoptic scene understanding. We hope this\nunique dataset could broaden the scope of comprehensive scene understanding and\nencourage the community to approach these problems from more diverse\nperspectives.\n","authors":["Hao Chen","Yuqi Hou","Chenyuan Qu","Irene Testini","Xiaohan Hong","Jianbo Jiao"],"pdf_url":"https://arxiv.org/pdf/2404.00989v1.pdf","comment":"To access the public dataset, please visit\n https://x360dataset.github.io"},{"id":"http://arxiv.org/abs/2107.11267v3","updated":"2024-04-01T08:28:33Z","published":"2021-07-23T14:34:57Z","title":"Dense Supervision Propagation for Weakly Supervised Semantic\n Segmentation on 3D Point Clouds","summary":" Semantic segmentation on 3D point clouds is an important task for 3D scene\nunderstanding. While dense labeling on 3D data is expensive and time-consuming,\nonly a few works address weakly supervised semantic point cloud segmentation\nmethods to relieve the labeling cost by learning from simpler and cheaper\nlabels. Meanwhile, there are still huge performance gaps between existing\nweakly supervised methods and state-of-the-art fully supervised methods. In\nthis paper, we train a semantic point cloud segmentation network with only a\nsmall portion of points being labeled. We argue that we can better utilize the\nlimited supervision information as we densely propagate the supervision signal\nfrom the labeled points to other points within and across the input samples.\nSpecifically, we propose a cross-sample feature reallocating module to transfer\nsimilar features and therefore re-route the gradients across two samples with\ncommon classes and an intra-sample feature redistribution module to propagate\nsupervision signals on unlabeled points across and within point cloud samples.\nWe conduct extensive experiments on public datasets S3DIS and ScanNet. 
Our\nweakly supervised method with only 10% and 1% of labels can produce compatible\nresults with the fully supervised counterpart.\n","authors":["Jiacheng Wei","Guosheng Lin","Kim-Hui Yap","Fayao Liu","Tzu-Yi Hung"],"pdf_url":"https://arxiv.org/pdf/2107.11267v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00987v1","updated":"2024-04-01T08:20:18Z","published":"2024-04-01T08:20:18Z","title":"FlexiDreamer: Single Image-to-3D Generation with FlexiCubes","summary":" 3D content generation from text prompts or single images has made remarkable\nprogress in quality and speed recently. One of its dominant paradigms involves\ngenerating consistent multi-view images followed by a sparse-view\nreconstruction. However, due to the challenge of directly deforming the mesh\nrepresentation to approach the target topology, most methodologies learn an\nimplicit representation (such as NeRF) during the sparse-view reconstruction\nand acquire the target mesh by a post-processing extraction. Although the\nimplicit representation can effectively model rich 3D information, its training\ntypically entails a long convergence time. In addition, the post-extraction\noperation from the implicit field also leads to undesirable visual artifacts.\nIn this paper, we propose FlexiDreamer, a novel single image-to-3d generation\nframework that reconstructs the target mesh in an end-to-end manner. By\nleveraging a flexible gradient-based extraction known as FlexiCubes, our method\ncircumvents the defects brought by the post-processing and facilitates a direct\nacquisition of the target mesh. Furthermore, we incorporate a multi-resolution\nhash grid encoding scheme that progressively activates the encoding levels into\nthe implicit field in FlexiCubes to help capture geometric details for per-step\noptimization. Notably, FlexiDreamer recovers a dense 3D structure from a\nsingle-view image in approximately 1 minute on a single NVIDIA A100 GPU,\noutperforming previous methodologies by a large margin.\n","authors":["Ruowen Zhao","Zhengyi Wang","Yikai Wang","Zihan Zhou","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.00987v1.pdf","comment":"project page:https://flexidreamer.github.io"},{"id":"http://arxiv.org/abs/2404.00986v1","updated":"2024-04-01T08:18:38Z","published":"2024-04-01T08:18:38Z","title":"Make Continual Learning Stronger via C-Flat","summary":" Model generalization ability upon incrementally acquiring dynamically\nupdating knowledge from sequentially arriving tasks is crucial to tackle the\nsensitivity-stability dilemma in Continual Learning (CL). Weight loss landscape\nsharpness minimization seeking for flat minima lying in neighborhoods with\nuniform low loss or smooth gradient is proven to be a strong training regime\nimproving model generalization compared with loss minimization based optimizer\nlike SGD. Yet only a few works have discussed this training regime for CL,\nproving that dedicated designed zeroth-order sharpness optimizer can improve CL\nperformance. In this work, we propose a Continual Flatness (C-Flat) method\nfeaturing a flatter loss landscape tailored for CL. C-Flat could be easily\ncalled with only one line of code and is plug-and-play to any CL methods. A\ngeneral framework of C-Flat applied to all CL categories and a thorough\ncomparison with loss minima optimizer and flat minima based CL approaches is\npresented in this paper, showing that our method can boost CL performance in\nalmost all cases. 
Code will be publicly available upon publication.\n","authors":["Ang Bian","Wei Li","Hangjie Yuan","Chengrong Yu","Zixiang Zhao","Mang Wang","Aojun Lu","Tao Feng"],"pdf_url":"https://arxiv.org/pdf/2404.00986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00980v1","updated":"2024-04-01T07:52:05Z","published":"2024-04-01T07:52:05Z","title":"CAMO: Correlation-Aware Mask Optimization with Modulated Reinforcement\n Learning","summary":" Optical proximity correction (OPC) is a vital step to ensure printability in\nmodern VLSI manufacturing. Various OPC approaches based on machine learning\nhave been proposed to pursue performance and efficiency, which are typically\ndata-driven and hardly involve any particular considerations of the OPC\nproblem, leading to potential performance or efficiency bottlenecks. In this\npaper, we propose CAMO, a reinforcement learning-based OPC system that\nspecifically integrates important principles of the OPC problem. CAMO\nexplicitly involves the spatial correlation among the movements of neighboring\nsegments and an OPC-inspired modulation for movement action selection.\nExperiments are conducted on both via layer patterns and metal layer patterns.\nThe results demonstrate that CAMO outperforms state-of-the-art OPC engines from\nboth academia and industry.\n","authors":["Xiaoxiao Liang","Haoyu Yang","Kang Liu","Bei Yu","Yuzhe Ma"],"pdf_url":"https://arxiv.org/pdf/2404.00980v1.pdf","comment":"Accepted by DAC 2024"},{"id":"http://arxiv.org/abs/2404.00979v1","updated":"2024-04-01T07:50:10Z","published":"2024-04-01T07:50:10Z","title":"PDF: A Probability-Driven Framework for Open World 3D Point Cloud\n Semantic Segmentation","summary":" Existing point cloud semantic segmentation networks cannot identify unknown\nclasses and update their knowledge, due to a closed-set and static perspective\nof the real world, which would induce the intelligent agent to make bad\ndecisions. To address this problem, we propose a Probability-Driven Framework\n(PDF) for open world semantic segmentation that includes (i) a lightweight\nU-decoder branch to identify unknown classes by estimating the uncertainties,\n(ii) a flexible pseudo-labeling scheme to supply geometry features along with\nprobability distribution features of unknown classes by generating pseudo\nlabels, and (iii) an incremental knowledge distillation strategy to incorporate\nnovel classes into the existing knowledge base gradually. Our framework enables\nthe model to behave like human beings, which could recognize unknown objects\nand incrementally learn them with the corresponding knowledge. Experimental\nresults on the S3DIS and ScanNetv2 datasets demonstrate that the proposed PDF\noutperforms other methods by a large margin in both important tasks of open\nworld semantic segmentation.\n","authors":["Jinfeng Xu","Siyuan Yang","Xianzhi Li","Yuan Tang","Yixue Hao","Long Hu","Min Chen"],"pdf_url":"https://arxiv.org/pdf/2404.00979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00974v1","updated":"2024-04-01T07:45:42Z","published":"2024-04-01T07:45:42Z","title":"Improving Visual Recognition with Hyperbolical Visual Hierarchy Mapping","summary":" Visual scenes are naturally organized in a hierarchy, where a coarse semantic\nis recursively comprised of several fine details. Exploring such a visual\nhierarchy is crucial to recognize the complex relations of visual elements,\nleading to a comprehensive scene understanding. 
In this paper, we propose a\nVisual Hierarchy Mapper (Hi-Mapper), a novel approach for enhancing the\nstructured understanding of the pre-trained Deep Neural Networks (DNNs).\nHi-Mapper investigates the hierarchical organization of the visual scene by 1)\npre-defining a hierarchy tree through the encapsulation of probability\ndensities; and 2) learning the hierarchical relations in hyperbolic space with\na novel hierarchical contrastive loss. The pre-defined hierarchy tree\nrecursively interacts with the visual features of the pre-trained DNNs through\nhierarchy decomposition and encoding procedures, thereby effectively\nidentifying the visual hierarchy and enhancing the recognition of an entire\nscene. Extensive experiments demonstrate that Hi-Mapper significantly enhances\nthe representation capability of DNNs, leading to an improved performance on\nvarious tasks, including image classification and dense prediction tasks.\n","authors":["Hyeongjun Kwon","Jinhyun Jang","Jin Kim","Kwonyoung Kim","Kwanghoon Sohn"],"pdf_url":"https://arxiv.org/pdf/2404.00974v1.pdf","comment":"This paper is accepted to CVPR 2024. The supplementary material is\n included. The code is available at\n \\url{https://github.com/kwonjunn01/Hi-Mapper}"},{"id":"http://arxiv.org/abs/2404.00973v1","updated":"2024-04-01T07:44:24Z","published":"2024-04-01T07:44:24Z","title":"VideoDistill: Language-aware Vision Distillation for Video Question\n Answering","summary":" Significant advancements in video question answering (VideoQA) have been made\nthanks to thriving large image-language pretraining frameworks. Although these\nimage-language models can efficiently represent both video and language\nbranches, they typically employ a goal-free vision perception process and do\nnot interact vision with language well during the answer generation, thus\nomitting crucial visual cues. In this paper, we are inspired by the human\nrecognition and learning pattern and propose VideoDistill, a framework with\nlanguage-aware (i.e., goal-driven) behavior in both vision perception and\nanswer generation process. VideoDistill generates answers only from\nquestion-related visual embeddings and follows a thinking-observing-answering\napproach that closely resembles human behavior, distinguishing it from previous\nresearch. Specifically, we develop a language-aware gating mechanism to replace\nthe standard cross-attention, avoiding language's direct fusion into visual\nrepresentations. We incorporate this mechanism into two key components of the\nentire framework. The first component is a differentiable sparse sampling\nmodule, which selects frames containing the necessary dynamics and semantics\nrelevant to the questions. The second component is a vision refinement module\nthat merges existing spatial-temporal attention layers to ensure the extraction\nof multi-grained visual semantics associated with the questions. We conduct\nexperimental evaluations on various challenging video question-answering\nbenchmarks, and VideoDistill achieves state-of-the-art performance in both\ngeneral and long-form VideoQA datasets. 
In Addition, we verify that\nVideoDistill can effectively alleviate the utilization of language shortcut\nsolutions in the EgoTaskQA dataset.\n","authors":["Bo Zou","Chao Yang","Yu Qiao","Chengbin Quan","Youjian Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.00973v1.pdf","comment":"This paper is accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2312.02010v3","updated":"2024-04-01T07:21:52Z","published":"2023-12-04T16:32:51Z","title":"Towards Learning a Generalist Model for Embodied Navigation","summary":" Building a generalist agent that can interact with the world is the\nintriguing target of AI systems, thus spurring the research for embodied\nnavigation, where an agent is required to navigate according to instructions or\nrespond to queries. Despite the major progress attained, previous works\nprimarily focus on task-specific agents and lack generalizability to unseen\nscenarios. Recently, LLMs have presented remarkable capabilities across various\nfields, and provided a promising opportunity for embodied navigation. Drawing\non this, we propose the first generalist model for embodied navigation,\nNaviLLM. It adapts LLMs to embodied navigation by introducing schema-based\ninstruction. The schema-based instruction flexibly casts various tasks into\ngeneration problems, thereby unifying a wide range of tasks. This approach\nallows us to integrate diverse data sources from various datasets into the\ntraining, equipping NaviLLM with a wide range of capabilities required by\nembodied navigation. We conduct extensive experiments to evaluate the\nperformance and generalizability of our model. The experimental results\ndemonstrate that our unified model achieves state-of-the-art performance on\nCVDN, SOON, and ScanQA. Specifically, it surpasses the previous\nstats-of-the-art method by a significant margin of 29% in goal progress on\nCVDN. Moreover, our model also demonstrates strong generalizability and\npresents impressive results on unseen tasks, e.g., embodied question answering\nand 3D captioning.\n","authors":["Duo Zheng","Shijia Huang","Lin Zhao","Yiwu Zhong","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02010v3.pdf","comment":"Accepted by CVPR 2024 (14 pages, 3 figures)"},{"id":"http://arxiv.org/abs/2401.05698v2","updated":"2024-04-01T07:19:40Z","published":"2024-01-11T07:00:07Z","title":"HiCMAE: Hierarchical Contrastive Masked Autoencoder for Self-Supervised\n Audio-Visual Emotion Recognition","summary":" Audio-Visual Emotion Recognition (AVER) has garnered increasing attention in\nrecent years for its critical role in creating emotion-ware intelligent\nmachines. Previous efforts in this area are dominated by the supervised\nlearning paradigm. Despite significant progress, supervised learning is meeting\nits bottleneck due to the longstanding data scarcity issue in AVER. Motivated\nby recent advances in self-supervised learning, we propose Hierarchical\nContrastive Masked Autoencoder (HiCMAE), a novel self-supervised framework that\nleverages large-scale self-supervised pre-training on vast unlabeled\naudio-visual data to promote the advancement of AVER. Following prior arts in\nself-supervised audio-visual representation learning, HiCMAE adopts two primary\nforms of self-supervision for pre-training, namely masked data modeling and\ncontrastive learning. 
Unlike them which focus exclusively on top-layer\nrepresentations while neglecting explicit guidance of intermediate layers,\nHiCMAE develops a three-pronged strategy to foster hierarchical audio-visual\nfeature learning and improve the overall quality of learned representations. To\nverify the effectiveness of HiCMAE, we conduct extensive experiments on 9\ndatasets covering both categorical and dimensional AVER tasks. Experimental\nresults show that our method significantly outperforms state-of-the-art\nsupervised and self-supervised audio-visual methods, which indicates that\nHiCMAE is a powerful audio-visual emotion representation learner. Codes and\nmodels will be publicly available at https://github.com/sunlicai/HiCMAE.\n","authors":["Licai Sun","Zheng Lian","Bin Liu","Jianhua Tao"],"pdf_url":"https://arxiv.org/pdf/2401.05698v2.pdf","comment":"Accepted by Information Fusion. The code is available at\n https://github.com/sunlicai/HiCMAE"},{"id":"http://arxiv.org/abs/2404.00964v1","updated":"2024-04-01T07:17:02Z","published":"2024-04-01T07:17:02Z","title":"S2RC-GCN: A Spatial-Spectral Reliable Contrastive Graph Convolutional\n Network for Complex Land Cover Classification Using Hyperspectral Images","summary":" Spatial correlations between different ground objects are an important\nfeature of mining land cover research. Graph Convolutional Networks (GCNs) can\neffectively capture such spatial feature representations and have demonstrated\npromising results in performing hyperspectral imagery (HSI) classification\ntasks of complex land. However, the existing GCN-based HSI classification\nmethods are prone to interference from redundant information when extracting\ncomplex features. To classify complex scenes more effectively, this study\nproposes a novel spatial-spectral reliable contrastive graph convolutional\nclassification framework named S2RC-GCN. Specifically, we fused the spectral\nand spatial features extracted by the 1D- and 2D-encoder, and the 2D-encoder\nincludes an attention model to automatically extract important information. We\nthen leveraged the fused high-level features to construct graphs and fed the\nresulting graphs into the GCNs to determine more effective graph\nrepresentations. Furthermore, a novel reliable contrastive graph convolution\nwas proposed for reliable contrastive learning to learn and fuse robust\nfeatures. Finally, to test the performance of the model on complex object\nclassification, we used imagery taken by Gaofen-5 in the Jiang Xia area to\nconstruct complex land cover datasets. The test results show that compared with\nother models, our model achieved the best results and effectively improved the\nclassification performance of complex remote sensing imagery.\n","authors":["Renxiang Guan","Zihao Li","Chujia Song","Guo Yu","Xianju Li","Ruyi Feng"],"pdf_url":"https://arxiv.org/pdf/2404.00964v1.pdf","comment":"Accepted to IJCNN 2024 (International Joint Conference on Neural\n Networks)"},{"id":"http://arxiv.org/abs/2404.00959v1","updated":"2024-04-01T06:59:56Z","published":"2024-04-01T06:59:56Z","title":"Equivariant Local Reference Frames for Unsupervised Non-rigid Point\n Cloud Shape Correspondence","summary":" Unsupervised non-rigid point cloud shape correspondence underpins a multitude\nof 3D vision tasks, yet itself is non-trivial given the exponential complexity\nstemming from inter-point degree-of-freedom, i.e., pose transformations. 
Based\non the assumption of local rigidity, one solution for reducing complexity is to\ndecompose the overall shape into independent local regions using Local\nReference Frames (LRFs) that are invariant to SE(3) transformations. However,\nthe focus solely on local structure neglects global geometric contexts,\nresulting in less distinctive LRFs that lack crucial semantic information\nnecessary for effective matching. Furthermore, such complexity introduces\nout-of-distribution geometric contexts during inference, thus complicating\ngeneralization. To this end, we introduce 1) EquiShape, a novel structure\ntailored to learn pair-wise LRFs with global structural cues for both spatial\nand semantic consistency, and 2) LRF-Refine, an optimization strategy generally\napplicable to LRF-based methods, aimed at addressing the generalization\nchallenges. Specifically, for EquiShape, we employ cross-talk within separate\nequivariant graph neural networks (Cross-GVP) to build long-range dependencies\nto compensate for the lack of semantic information in local structure modeling,\ndeducing pair-wise independent SE(3)-equivariant LRF vectors for each point.\nFor LRF-Refine, the optimization adjusts LRFs within specific contexts and\nknowledge, enhancing the geometric and semantic generalizability of point\nfeatures. Our overall framework surpasses the state-of-the-art methods by a\nlarge margin on three benchmarks. Code and models will be publicly available.\n","authors":["Ling Wang","Runfa Chen","Yikai Wang","Fuchun Sun","Xinzhou Wang","Sun Kai","Guangyuan Fu","Jianwei Zhang","Wenbing Huang"],"pdf_url":"https://arxiv.org/pdf/2404.00959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06226v2","updated":"2024-04-01T06:57:31Z","published":"2023-12-11T09:14:42Z","title":"Invariant Representation via Decoupling Style and Spurious Features from\n Images","summary":" This paper considers the out-of-distribution (OOD) generalization problem\nunder the setting that both style distribution shift and spurious features\nexist and domain labels are missing. This setting frequently arises in\nreal-world applications and is overlooked because previous approaches mainly\nhandle either of these two factors. The critical challenge is decoupling style\nand spurious features in the absence of domain labels. To address this\nchallenge, we first propose a structural causal model (SCM) for the image\ngeneration process, which captures both style distribution shift and spurious\nfeatures. The proposed SCM enables us to design a new framework called IRSS,\nwhich can gradually separate style distribution and spurious features from\nimages by introducing adversarial neural networks and multi-environment\noptimization, thus achieving OOD generalization. Moreover, it does not require\nadditional supervision (e.g., domain labels) other than the images and their\ncorresponding labels. 
Experiments on benchmark datasets demonstrate that IRSS\noutperforms traditional OOD methods and solves the problem of Invariant risk\nminimization (IRM) degradation, enabling the extraction of invariant features\nunder distribution shift.\n","authors":["Ruimeng Li","Yuanhao Pu","Zhaoyi Li","Hong Xie","Defu Lian"],"pdf_url":"https://arxiv.org/pdf/2312.06226v2.pdf","comment":"10 pages, 12 figures"},{"id":"http://arxiv.org/abs/2403.07494v2","updated":"2024-04-01T06:52:46Z","published":"2024-03-12T10:33:26Z","title":"SemGauss-SLAM: Dense Semantic Gaussian Splatting SLAM","summary":" We propose SemGauss-SLAM, the first semantic SLAM system utilizing 3D\nGaussian representation, that enables accurate 3D semantic mapping, robust\ncamera tracking, and high-quality rendering in real-time. In this system, we\nincorporate semantic feature embedding into 3D Gaussian representation, which\neffectively encodes semantic information within the spatial layout of the\nenvironment for precise semantic scene representation. Furthermore, we propose\nfeature-level loss for updating 3D Gaussian representation, enabling\nhigher-level guidance for 3D Gaussian optimization. In addition, to reduce\ncumulative drift and improve reconstruction accuracy, we introduce\nsemantic-informed bundle adjustment leveraging semantic associations for joint\noptimization of 3D Gaussian representation and camera poses, leading to more\nrobust tracking and consistent mapping. Our SemGauss-SLAM method demonstrates\nsuperior performance over existing dense semantic SLAM methods in terms of\nmapping and tracking accuracy on Replica and ScanNet datasets, while also\nshowing excellent capabilities in novel-view semantic synthesis and 3D semantic\nmapping.\n","authors":["Siting Zhu","Renjie Qin","Guangming Wang","Jiuming Liu","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.07494v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.13061v2","updated":"2024-04-01T06:42:17Z","published":"2023-04-25T18:00:08Z","title":"iMixer: hierarchical Hopfield network implies an invertible, implicit\n and iterative MLP-Mixer","summary":" In the last few years, the success of Transformers in computer vision has\nstimulated the discovery of many alternative models that compete with\nTransformers, such as the MLP-Mixer. Despite their weak inductive bias, these\nmodels have achieved performance comparable to well-studied convolutional\nneural networks. Recent studies on modern Hopfield networks suggest the\ncorrespondence between certain energy-based associative memory models and\nTransformers or MLP-Mixer, and shed some light on the theoretical background of\nthe Transformer-type architectures design. In this paper, we generalize the\ncorrespondence to the recently introduced hierarchical Hopfield network, and\nfind iMixer, a novel generalization of MLP-Mixer model. Unlike ordinary\nfeedforward neural networks, iMixer involves MLP layers that propagate forward\nfrom the output side to the input side. We characterize the module as an\nexample of invertible, implicit, and iterative mixing module. We evaluate the\nmodel performance with various datasets on image classification tasks, and find\nthat iMixer, despite its unique architecture, exhibits stable learning\ncapabilities and achieves performance comparable to or better than the baseline\nvanilla MLP-Mixer. 
The results imply that the correspondence between the\nHopfield networks and the Mixer models serves as a principle for understanding\na broader class of Transformer-like architecture designs.\n","authors":["Toshihiro Ota","Masato Taki"],"pdf_url":"https://arxiv.org/pdf/2304.13061v2.pdf","comment":"19 pages. v2: minor improvements"},{"id":"http://arxiv.org/abs/2403.09055v2","updated":"2024-04-01T06:26:23Z","published":"2024-03-14T02:51:01Z","title":"StreamMultiDiffusion: Real-Time Interactive Generation with Region-Based\n Semantic Control","summary":" The enormous success of diffusion models in text-to-image synthesis has made\nthem promising candidates for the next generation of end-user applications for\nimage generation and editing. Previous works have focused on improving the\nusability of diffusion models by reducing the inference time or increasing user\ninteractivity by allowing new, fine-grained controls such as region-based text\nprompts. However, we empirically find that integrating both branches of works\nis nontrivial, limiting the potential of diffusion models. To solve this\nincompatibility, we present StreamMultiDiffusion, the first real-time\nregion-based text-to-image generation framework. By stabilizing fast inference\ntechniques and restructuring the model into a newly proposed multi-prompt\nstream batch architecture, we achieve $\\times 10$ faster panorama generation\nthan existing solutions, and the generation speed of 1.57 FPS in region-based\ntext-to-image synthesis on a single RTX 2080 Ti GPU. Our solution opens up a\nnew paradigm for interactive image generation named semantic palette, where\nhigh-quality images are generated in real-time from given multiple hand-drawn\nregions, encoding prescribed semantic meanings (e.g., eagle, girl). Our code\nand demo application are available at\nhttps://github.com/ironjr/StreamMultiDiffusion.\n","authors":["Jaerin Lee","Daniel Sungho Jung","Kanggeon Lee","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2403.09055v2.pdf","comment":"29 pages, 16 figures. v2: typos corrected, references added. Project\n page: https://jaerinlee.com/research/StreamMultiDiffusion"},{"id":"http://arxiv.org/abs/2404.00949v1","updated":"2024-04-01T06:22:28Z","published":"2024-04-01T06:22:28Z","title":"Harnessing The Power of Attention For Patch-Based Biomedical Image\n Classification","summary":" Biomedical image analysis can be facilitated by an innovative architecture\nrooted in self-attention mechanisms. The traditional convolutional neural\nnetwork (CNN), characterized by fixed-sized windows, needs help capturing\nintricate spatial and temporal relations at the pixel level. The immutability\nof CNN filter weights post-training further restricts input fluctuations.\nRecognizing these limitations, we propose a new paradigm of attention-based\nmodels instead of convolutions. As an alternative to traditional CNNs, these\nmodels demonstrate robust modelling capabilities and the ability to grasp\ncomprehensive long-range contextual information efficiently. Providing a\nsolution to critical challenges faced by attention-based vision models such as\ninductive bias, weight sharing, receptive field limitations, and data handling\nin high resolution, our work combines non-overlapping (vanilla patching) with\nnovel overlapped Shifted Patching Techniques (S.P.T.s) to induce local context\nthat enhances model generalization. Moreover, we examine the novel Lancoz5\ninterpolation technique, which adapts variable image sizes to higher\nresolutions. 
Experimental evidence validates our model's generalization\neffectiveness, comparing favourably with existing approaches. Attention-based\nmethods are particularly effective with ample data, especially when advanced\ndata augmentation methodologies are integrated to strengthen their robustness.\n","authors":["Gousia Habib","Shaima Qureshi","Malik ishfaq"],"pdf_url":"https://arxiv.org/pdf/2404.00949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00946v1","updated":"2024-04-01T06:10:11Z","published":"2024-04-01T06:10:11Z","title":"Exploring the Efficacy of Group-Normalization in Deep Learning Models\n for Alzheimer's Disease Classification","summary":" Batch Normalization is an important approach to advancing deep learning since\nit allows multiple networks to train simultaneously. A problem arises when\nnormalizing along the batch dimension because B.N.'s error increases\nsignificantly as batch size shrinks, since batch statistics estimates become\ninaccurate. As a result, computer vision tasks like detection, segmentation,\nand video, which require tiny batches based on memory consumption, aren't\nsuitable for using Batch Normalization for larger model training and feature\ntransfer. Here, we explore Group Normalization as an easy alternative to using\nBatch Normalization. Group Normalization is a channel normalization method in\nwhich the channels are divided into groups, and the corresponding mean\nand variance are calculated for each group. Group Normalization computations\nare accurate across a wide range of batch sizes and are independent of batch\nsize. When training ResNet-50 on the large ImageNet database, GN achieves a\nvery low error rate of 10.6% compared to Batch Normalization when a smaller\nbatch size of only 2 is used. For usual batch sizes, the performance of G.N. is\ncomparable to that of Batch Normalization, but at the same time, it outperforms\nother normalization techniques. We implement Group Normalization as a direct\nalternative to B.N. to combat the serious challenges faced by Batch\nNormalization in deep learning models, with comparable or improved\nclassification accuracy. Additionally, Group Normalization can be naturally\ntransferred from the pre-training to the fine-tuning phase.\n","authors":["Gousia Habib","Ishfaq Ahmed Malik","Jameel Ahmad","Imtiaz Ahmed","Shaima Qureshi"],"pdf_url":"https://arxiv.org/pdf/2404.00946v1.pdf","comment":"19 pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.09935v2","updated":"2024-04-01T05:57:55Z","published":"2023-12-15T16:44:38Z","title":"LogoStyleFool: Vitiating Video Recognition Systems via Logo Style\n Transfer","summary":" Video recognition systems are vulnerable to adversarial examples. Recent\nstudies show that style transfer-based and patch-based unrestricted\nperturbations can effectively improve attack efficiency. These attacks,\nhowever, face two main challenges: 1) Adding large stylized perturbations to\nall pixels reduces the naturalness of the video and such perturbations can be\neasily detected. 2) Patch-based video attacks are not extensible to targeted\nattacks due to the limited search space of reinforcement learning that has been\nwidely used in video attacks recently. In this paper, we focus on the video\nblack-box setting and propose a novel attack framework named LogoStyleFool by\nadding a stylized logo to the clean video. We separate the attack into three\nstages: style reference selection, reinforcement-learning-based logo style\ntransfer, and perturbation optimization. 
We solve the first challenge by\nscaling down the perturbation range to a regional logo, while the second\nchallenge is addressed by complementing an optimization stage after\nreinforcement learning. Experimental results substantiate the overall\nsuperiority of LogoStyleFool over three state-of-the-art patch-based attacks in\nterms of attack performance and semantic preservation. Meanwhile, LogoStyleFool\nstill maintains its performance against two existing patch-based defense\nmethods. We believe that our research is beneficial in increasing the attention\nof the security community to such subregional style transfer attacks.\n","authors":["Yuxin Cao","Ziyu Zhao","Xi Xiao","Derui Wang","Minhui Xue","Jin Lu"],"pdf_url":"https://arxiv.org/pdf/2312.09935v2.pdf","comment":"14 pages, 3 figures. Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2203.16000v4","updated":"2024-04-01T05:51:31Z","published":"2022-03-30T02:18:16Z","title":"StyleFool: Fooling Video Classification Systems via Style Transfer","summary":" Video classification systems are vulnerable to adversarial attacks, which can\ncreate severe security problems in video verification. Current black-box\nattacks need a large number of queries to succeed, resulting in high\ncomputational overhead in the process of attack. On the other hand, attacks\nwith restricted perturbations are ineffective against defenses such as\ndenoising or adversarial training. In this paper, we focus on unrestricted\nperturbations and propose StyleFool, a black-box video adversarial attack via\nstyle transfer to fool the video classification system. StyleFool first\nutilizes color theme proximity to select the best style image, which helps\navoid unnatural details in the stylized videos. Meanwhile, the target class\nconfidence is additionally considered in targeted attacks to influence the\noutput distribution of the classifier by moving the stylized video closer to or\neven across the decision boundary. A gradient-free method is then employed to\nfurther optimize the adversarial perturbations. We carry out extensive\nexperiments to evaluate StyleFool on two standard datasets, UCF-101 and\nHMDB-51. The experimental results demonstrate that StyleFool outperforms the\nstate-of-the-art adversarial attacks in terms of both the number of queries and\nthe robustness against existing defenses. Moreover, 50% of the stylized videos\nin untargeted attacks do not need any query since they can already fool the\nvideo classification model. Furthermore, we evaluate the indistinguishability\nthrough a user study to show that the adversarial samples of StyleFool look\nimperceptible to human eyes, despite unrestricted perturbations.\n","authors":["Yuxin Cao","Xi Xiao","Ruoxi Sun","Derui Wang","Minhui Xue","Sheng Wen"],"pdf_url":"https://arxiv.org/pdf/2203.16000v4.pdf","comment":"18 pages, 9 figures. Accepted to S&P 2023"},{"id":"http://arxiv.org/abs/2404.00938v1","updated":"2024-04-01T05:50:56Z","published":"2024-04-01T05:50:56Z","title":"How Can Large Language Models Enable Better Socially Assistive\n Human-Robot Interaction: A Brief Survey","summary":" Socially assistive robots (SARs) have shown great success in providing\npersonalized cognitive-affective support for user populations with special\nneeds such as older adults, children with autism spectrum disorder (ASD), and\nindividuals with mental health challenges. 
The large body of work on SAR\ndemonstrates its potential to provide at-home support that complements\nclinic-based interventions delivered by mental health professionals, making\nthese interventions more effective and accessible. However, there are still\nseveral major technical challenges that hinder SAR-mediated interactions and\ninterventions from reaching human-level social intelligence and efficacy. With\nthe recent advances in large language models (LLMs), there is an increased\npotential for novel applications within the field of SAR that can significantly\nexpand the current capabilities of SARs. However, incorporating LLMs introduces\nnew risks and ethical concerns that have not yet been encountered, and must be\ncarefully addressed to safely deploy these more advanced systems. In this\nwork, we aim to conduct a brief survey on the use of LLMs in SAR technologies,\nand discuss the potentials and risks of applying LLMs to the following three\nmajor technical challenges of SAR: 1) natural language dialog; 2) multimodal\nunderstanding; 3) LLMs as robot policies.\n","authors":["Zhonghao Shi","Ellen Landrum","Amy O' Connell","Mina Kian","Leticia Pinto-Alva","Kaleen Shrestha","Xiaoyuan Zhu","Maja J Matarić"],"pdf_url":"https://arxiv.org/pdf/2404.00938v1.pdf","comment":"2 pages, to be submitted to 2024 AAAI Spring Symposium"},{"id":"http://arxiv.org/abs/2403.01414v2","updated":"2024-04-01T05:44:41Z","published":"2024-03-03T06:58:35Z","title":"Unsigned Orthogonal Distance Fields: An Accurate Neural Implicit\n Representation for Diverse 3D Shapes","summary":" Neural implicit representation of geometric shapes has witnessed considerable\nadvancements in recent years. However, common distance field based implicit\nrepresentations, specifically signed distance field (SDF) for watertight shapes\nor unsigned distance field (UDF) for arbitrary shapes, routinely suffer from\ndegradation of reconstruction accuracy when converting to explicit surface\npoints and meshes. In this paper, we introduce a novel neural implicit\nrepresentation based on unsigned orthogonal distance fields (UODFs). In UODFs,\nthe minimal unsigned distance from any spatial point to the shape surface is\ndefined solely in one orthogonal direction, contrasting with the\nmulti-directional determination made by SDF and UDF. Consequently, every point\nin the 3D UODFs can directly access its closest surface points along three\northogonal directions. This distinctive feature leverages the accurate\nreconstruction of surface points without interpolation errors. We verify the\neffectiveness of UODFs through a range of reconstruction examples, extending\nfrom simple watertight or non-watertight shapes to complex shapes that include\nhollows, internal or assembling structures.\n","authors":["Yujie Lu","Long Wan","Nayu Ding","Yulong Wang","Shuhan Shen","Shen Cai","Lin Gao"],"pdf_url":"https://arxiv.org/pdf/2403.01414v2.pdf","comment":"accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2311.03149v2","updated":"2024-04-01T05:37:19Z","published":"2023-11-06T14:44:34Z","title":"Asymmetric Masked Distillation for Pre-Training Small Foundation Models","summary":" Self-supervised foundation models have shown great potential in computer\nvision thanks to the pre-training paradigm of masked autoencoding. Scale is a\nprimary factor influencing the performance of these foundation models. However,\nthese large foundation models often result in high computational cost. 
This\npaper focuses on pre-training relatively small vision transformer models that\ncould be efficiently adapted to downstream tasks. Specifically, taking\ninspiration from knowledge distillation in model compression, we propose a new\nasymmetric masked distillation (AMD) framework for pre-training relatively\nsmall models with autoencoding. The core of AMD is to devise an asymmetric\nmasking strategy, where the teacher model is enabled to see more context\ninformation with a lower masking ratio, while the student model is still\nequipped with a high masking ratio. We design customized multi-layer feature\nalignment between the teacher encoder and student encoder to regularize the\npre-training of student MAE. To demonstrate the effectiveness and versatility\nof AMD, we apply it to both ImageMAE and VideoMAE for pre-training relatively\nsmall ViT models. AMD achieves 84.6% classification accuracy on IN1K using the\nViT-B model. It also achieves 73.3% classification accuracy using the ViT-B\nmodel on the Something-Something V2 dataset, a 3.7% improvement over the\noriginal ViT-B model from VideoMAE. We also transfer AMD pre-trained models to\ndownstream tasks and obtain consistent performance improvement over the\noriginal masked autoencoding. The code and models are available at\nhttps://github.com/MCG-NJU/AMD.\n","authors":["Zhiyu Zhao","Bingkun Huang","Sen Xing","Gangshan Wu","Yu Qiao","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.03149v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00931v1","updated":"2024-04-01T05:19:50Z","published":"2024-04-01T05:19:50Z","title":"GOV-NeSF: Generalizable Open-Vocabulary Neural Semantic Fields","summary":" Recent advancements in vision-language foundation models have significantly\nenhanced open-vocabulary 3D scene understanding. However, the generalizability\nof existing methods is constrained due to their framework designs and their\nreliance on 3D data. We address this limitation by introducing Generalizable\nOpen-Vocabulary Neural Semantic Fields (GOV-NeSF), a novel approach offering a\ngeneralizable implicit representation of 3D scenes with open-vocabulary\nsemantics. We aggregate the geometry-aware features using a cost volume, and\npropose a Multi-view Joint Fusion module to aggregate multi-view features\nthrough a cross-view attention mechanism, which effectively predicts\nview-specific blending weights for both colors and open-vocabulary features.\nRemarkably, our GOV-NeSF exhibits state-of-the-art performance in both 2D and\n3D open-vocabulary semantic segmentation, eliminating the need for ground truth\nsemantic labels or depth priors, and effectively generalizes across scenes and\ndatasets without fine-tuning.\n","authors":["Yunsong Wang","Hanlin Chen","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2404.00931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01352v1","updated":"2024-04-01T05:12:55Z","published":"2024-04-01T05:12:55Z","title":"VortexViz: Finding Vortex Boundaries by Learning from Particle\n Trajectories","summary":" Vortices are studied in various scientific disciplines, offering insights\ninto fluid flow behavior. Visualizing the boundary of vortices is crucial for\nunderstanding flow phenomena and detecting flow irregularities. This paper\naddresses the challenge of accurately extracting vortex boundaries using deep\nlearning techniques. 
While existing methods primarily train on velocity\ncomponents, we propose a novel approach incorporating particle trajectories\n(streamlines or pathlines) into the learning process. By leveraging the\nregional/local characteristics of the flow field captured by streamlines or\npathlines, our methodology aims to enhance the accuracy of vortex boundary\nextraction.\n","authors":["Akila de Silva","Nicholas Tee","Omkar Ghanekar","Fahim Hasan Khan","Gregory Dusek","James Davis","Alex Pang"],"pdf_url":"https://arxiv.org/pdf/2404.01352v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.00928v1","updated":"2024-04-01T05:12:30Z","published":"2024-04-01T05:12:30Z","title":"Instance-Aware Group Quantization for Vision Transformers","summary":" Post-training quantization (PTQ) is an efficient model compression technique\nthat quantizes a pretrained full-precision model using only a small calibration\nset of unlabeled samples without retraining. PTQ methods for convolutional\nneural networks (CNNs) provide quantization results comparable to\nfull-precision counterparts. Directly applying them to vision transformers\n(ViTs), however, incurs severe performance degradation, mainly due to the\ndifferences in architectures between CNNs and ViTs. In particular, the\ndistribution of activations for each channel varies drastically according to\ninput instances, making PTQ methods for CNNs inappropriate for ViTs. To address\nthis, we introduce instance-aware group quantization for ViTs (IGQ-ViT). To\nthis end, we propose to split the channels of activation maps into multiple\ngroups dynamically for each input instance, such that activations within each\ngroup share similar statistical properties. We also extend our scheme to\nquantize softmax attentions across tokens. In addition, the number of groups\nfor each layer is adjusted to minimize the discrepancies between predictions\nfrom quantized and full-precision models, under a bit-operation (BOP)\nconstraint. We show extensive experimental results on image classification,\nobject detection, and instance segmentation, with various transformer\narchitectures, demonstrating the effectiveness of our approach.\n","authors":["Jaehyeon Moon","Dohyung Kim","Junyong Cheon","Bumsub Ham"],"pdf_url":"https://arxiv.org/pdf/2404.00928v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00925v1","updated":"2024-04-01T05:07:13Z","published":"2024-04-01T05:07:13Z","title":"LLMs are Good Sign Language Translators","summary":" Sign Language Translation (SLT) is a challenging task that aims to translate\nsign videos into spoken language. Inspired by the strong translation\ncapabilities of large language models (LLMs) that are trained on extensive\nmultilingual text corpora, we aim to harness off-the-shelf LLMs to handle SLT.\nIn this paper, we regularize the sign videos to embody linguistic\ncharacteristics of spoken language, and propose a novel SignLLM framework to\ntransform sign videos into a language-like representation for improved\nreadability by off-the-shelf LLMs. SignLLM comprises two key modules: (1) The\nVector-Quantized Visual Sign module converts sign videos into a sequence of\ndiscrete character-level sign tokens, and (2) the Codebook Reconstruction and\nAlignment module converts these character-level tokens into word-level sign\nrepresentations using an optimal transport formulation. A sign-text alignment\nloss further bridges the gap between sign and text tokens, enhancing semantic\ncompatibility. 
We achieve state-of-the-art gloss-free results on two\nwidely-used SLT benchmarks.\n","authors":["Jia Gong","Lin Geng Foo","Yixuan He","Hossein Rahmani","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2404.00925v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00924v1","updated":"2024-04-01T05:01:52Z","published":"2024-04-01T05:01:52Z","title":"BadPart: Unified Black-box Adversarial Patch Attacks against Pixel-wise\n Regression Tasks","summary":" Pixel-wise regression tasks (e.g., monocular depth estimation (MDE) and\noptical flow estimation (OFE)) have been widely involved in our daily life in\napplications like autonomous driving, augmented reality and video composition.\nAlthough certain applications are security-critical or bear societal\nsignificance, the adversarial robustness of such models is not sufficiently\nstudied, especially in the black-box scenario. In this work, we introduce the\nfirst unified black-box adversarial patch attack framework against pixel-wise\nregression tasks, aiming to identify the vulnerabilities of these models under\nquery-based black-box attacks. We propose a novel square-based adversarial\npatch optimization framework and employ probabilistic square sampling and\nscore-based gradient estimation techniques to generate the patch effectively\nand efficiently, overcoming the scalability problem of previous black-box patch\nattacks. Our attack prototype, named BadPart, is evaluated on both MDE and OFE\ntasks, utilizing a total of 7 models. BadPart surpasses 3 baseline methods in\nterms of both attack performance and efficiency. We also apply BadPart on the\nGoogle online service for portrait depth estimation, causing 43.5% relative\ndistance error with 50K queries. State-of-the-art (SOTA) countermeasures cannot\ndefend our attack effectively.\n","authors":["Zhiyuan Cheng","Zhaoyi Liu","Tengda Guo","Shiwei Feng","Dongfang Liu","Mingjie Tang","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.00924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00923v1","updated":"2024-04-01T04:57:41Z","published":"2024-04-01T04:57:41Z","title":"MM3DGS SLAM: Multi-modal 3D Gaussian Splatting for SLAM Using Vision,\n Depth, and Inertial Measurements","summary":" Simultaneous localization and mapping is essential for position tracking and\nscene understanding. 3D Gaussian-based map representations enable\nphotorealistic reconstruction and real-time rendering of scenes using multiple\nposed cameras. We show for the first time that using 3D Gaussians for map\nrepresentation with unposed camera images and inertial measurements can enable\naccurate SLAM. Our method, MM3DGS, addresses the limitations of prior neural\nradiance field-based representations by enabling faster rendering, scale\nawareness, and improved trajectory tracking. Our framework enables\nkeyframe-based mapping and tracking utilizing loss functions that incorporate\nrelative pose transformations from pre-integrated inertial measurements, depth\nestimates, and measures of photometric rendering quality. We also release a\nmulti-modal dataset, UT-MM, collected from a mobile robot equipped with a\ncamera and an inertial measurement unit. Experimental evaluation on several\nscenes from the dataset shows that MM3DGS achieves 3x improvement in tracking\nand 5% improvement in photometric rendering quality compared to the current\n3DGS SLAM state-of-the-art, while allowing real-time rendering of a\nhigh-resolution dense 3D map. 
Project Webpage:\nhttps://vita-group.github.io/MM3DGS-SLAM\n","authors":["Lisong C. Sun","Neel P. Bhatt","Jonathan C. Liu","Zhiwen Fan","Zhangyang Wang","Todd E. Humphreys","Ufuk Topcu"],"pdf_url":"https://arxiv.org/pdf/2404.00923v1.pdf","comment":"Project Webpage: https://vita-group.github.io/MM3DGS-SLAM"},{"id":"http://arxiv.org/abs/2401.02400v2","updated":"2024-04-01T04:56:37Z","published":"2024-01-04T18:32:48Z","title":"Learning the 3D Fauna of the Web","summary":" Learning 3D models of all animals on the Earth requires massively scaling up\nexisting solutions. With this ultimate goal in mind, we develop 3D-Fauna, an\napproach that learns a pan-category deformable 3D animal model for more than\n100 animal species jointly. One crucial bottleneck of modeling animals is the\nlimited availability of training data, which we overcome by simply learning\nfrom 2D Internet images. We show that prior category-specific attempts fail to\ngeneralize to rare species with limited training images. We address this\nchallenge by introducing the Semantic Bank of Skinned Models (SBSM), which\nautomatically discovers a small set of base animal shapes by combining\ngeometric inductive priors with semantic knowledge implicitly captured by an\noff-the-shelf self-supervised feature extractor. To train such a model, we also\ncontribute a new large-scale dataset of diverse animal species. At inference\ntime, given a single image of any quadruped animal, our model reconstructs an\narticulated 3D mesh in a feed-forward fashion within seconds.\n","authors":["Zizhang Li","Dor Litvak","Ruining Li","Yunzhi Zhang","Tomas Jakab","Christian Rupprecht","Shangzhe Wu","Andrea Vedaldi","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2401.02400v2.pdf","comment":"The first two authors contributed equally to this work. The last\n three authors contributed equally. Project page:\n https://kyleleey.github.io/3DFauna/"},{"id":"http://arxiv.org/abs/2404.00922v1","updated":"2024-04-01T04:55:02Z","published":"2024-04-01T04:55:02Z","title":"Towards Memorization-Free Diffusion Models","summary":" Pretrained diffusion models and their outputs are widely accessible due to\ntheir exceptional capacity for synthesizing high-quality images and their\nopen-source nature. The users, however, may face litigation risks owing to the\nmodels' tendency to memorize and regurgitate training data during inference. To\naddress this, we introduce Anti-Memorization Guidance (AMG), a novel framework\nemploying three targeted guidance strategies for the main causes of\nmemorization: image and caption duplication, and highly specific user prompts.\nConsequently, AMG ensures memorization-free outputs while maintaining high\nimage quality and text alignment, leveraging the synergy of its guidance\nmethods, each indispensable in its own right. AMG also features an innovative\nautomatic detection system for potential memorization during each step of the\ninference process, allowing selective application of guidance strategies while\nminimally interfering with the original sampling process to preserve output\nutility. We applied AMG to pretrained Denoising Diffusion Probabilistic Models\n(DDPM) and Stable Diffusion across various generation tasks. 
The results\ndemonstrate that AMG is the first approach to successfully eradicate all\ninstances of memorization with no or marginal impacts on image quality and\ntext-alignment, as evidenced by FID and CLIP scores.\n","authors":["Chen Chen","Daochang Liu","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2404.00922v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.00921v1","updated":"2024-04-01T04:53:06Z","published":"2024-04-01T04:53:06Z","title":"Towards Label-Efficient Human Matting: A Simple Baseline for Weakly\n Semi-Supervised Trimap-Free Human Matting","summary":" This paper presents a new practical training method for human matting, which\ndemands delicate pixel-level human region identification and significantly\nlaborious annotations. To reduce the annotation cost, most existing matting\napproaches often rely on image synthesis to augment the dataset. However, the\nunnaturalness of synthesized training images brings in a new domain\ngeneralization challenge for natural images. To address this challenge, we\nintroduce a new learning paradigm, weakly semi-supervised human matting\n(WSSHM), which leverages a small amount of expensive matte labels and a large\namount of budget-friendly segmentation labels, to save the annotation cost and\nresolve the domain generalization problem. To achieve the goal of WSSHM, we\npropose a simple and effective training method, named Matte Label Blending\n(MLB), that selectively guides only the beneficial knowledge of the\nsegmentation and matte data to the matting model. Extensive experiments with\nour detailed analysis demonstrate our method can substantially improve the\nrobustness of the matting model using a few matte data and numerous\nsegmentation data. Our training method is also easily applicable to real-time\nmodels, achieving competitive accuracy with breakneck inference speed (328 FPS\non NVIDIA V100 GPU). The implementation code is available at\n\\url{https://github.com/clovaai/WSSHM}.\n","authors":["Beomyoung Kim","Myeong Yeon Yi","Joonsang Yu","Young Joon Yoo","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2404.00921v1.pdf","comment":"Preprint, 15 pages, 13 figures"},{"id":"http://arxiv.org/abs/2305.19556v3","updated":"2024-04-01T04:45:30Z","published":"2023-05-31T04:50:32Z","title":"Exploring Phonetic Context-Aware Lip-Sync For Talking Face Generation","summary":" Talking face generation is the challenging task of synthesizing a natural and\nrealistic face that requires accurate synchronization with a given audio. Due\nto co-articulation, where an isolated phone is influenced by the preceding or\nfollowing phones, the articulation of a phone varies upon the phonetic context.\nTherefore, modeling lip motion with the phonetic context can generate more\nspatio-temporally aligned lip movement. In this respect, we investigate the\nphonetic context in generating lip motion for talking face generation. We\npropose Context-Aware Lip-Sync framework (CALS), which explicitly leverages\nphonetic context to generate lip movement of the target face. CALS is comprised\nof an Audio-to-Lip module and a Lip-to-Face module. The former is pretrained\nbased on masked learning to map each phone to a contextualized lip motion unit.\nThe contextualized lip motion unit then guides the latter in synthesizing a\ntarget identity with context-aware lip motion. From extensive experiments, we\nverify that simply exploiting the phonetic context in the proposed CALS\nframework effectively enhances spatio-temporal alignment. 
We also demonstrate\nthe extent to which the phonetic context assists in lip synchronization and\nfind the effective window size for lip generation to be approximately 1.2\nseconds.\n","authors":["Se Jin Park","Minsu Kim","Jeongsoo Choi","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2305.19556v3.pdf","comment":"Accepted at ICASSP 2024"},{"id":"http://arxiv.org/abs/2404.00916v1","updated":"2024-04-01T04:43:45Z","published":"2024-04-01T04:43:45Z","title":"Gyro-based Neural Single Image Deblurring","summary":" In this paper, we present GyroDeblurNet, a novel single image deblurring\nmethod that utilizes a gyro sensor to effectively resolve the ill-posedness of\nimage deblurring. The gyro sensor provides valuable information about camera\nmotion during exposure time that can significantly improve deblurring quality.\nHowever, effectively exploiting real-world gyro data is challenging due to\nsignificant errors from various sources including sensor noise, the disparity\nbetween the positions of a camera module and a gyro sensor, the absence of\ntranslational motion information, and moving objects whose motions cannot be\ncaptured by a gyro sensor. To handle gyro error, GyroDeblurNet is equipped with\ntwo novel neural network blocks: a gyro refinement block and a gyro deblurring\nblock. The gyro refinement block refines the error-ridden gyro data using the\nblur information from the input image. On the other hand, the gyro deblurring\nblock removes blur from the input image using the refined gyro data and further\ncompensates for gyro error by leveraging the blur information from the input\nimage. For training a neural network with erroneous gyro data, we propose a\ntraining strategy based on the curriculum learning. We also introduce a novel\ngyro data embedding scheme to represent real-world intricate camera shakes.\nFinally, we present a synthetic dataset and a real dataset for the training and\nevaluation of gyro-based single image deblurring. Our experiments demonstrate\nthat our approach achieves state-of-the-art deblurring quality by effectively\nutilizing erroneous gyro data.\n","authors":["Heemin Yang","Jaesung Rim","Seung-Hwan Baek","Sunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2404.00916v1.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.00915v1","updated":"2024-04-01T04:43:39Z","published":"2024-04-01T04:43:39Z","title":"Scalable 3D Registration via Truncated Entry-wise Absolute Residuals","summary":" Given an input set of $3$D point pairs, the goal of outlier-robust $3$D\nregistration is to compute some rotation and translation that align as many\npoint pairs as possible. This is an important problem in computer vision, for\nwhich many highly accurate approaches have been recently proposed. Despite\ntheir impressive performance, these approaches lack scalability, often\noverflowing the $16$GB of memory of a standard laptop to handle roughly\n$30,000$ point pairs. In this paper, we propose a $3$D registration approach\nthat can process more than ten million ($10^7$) point pairs with over $99\\%$\nrandom outliers. Moreover, our method is efficient, entails low memory costs,\nand maintains high accuracy at the same time. We call our method TEAR, as it\ninvolves minimizing an outlier-robust loss that computes Truncated Entry-wise\nAbsolute Residuals. 
To minimize this loss, we decompose the original\n$6$-dimensional problem into two subproblems of dimensions $3$ and $2$,\nrespectively, solved in succession to global optimality via a customized\nbranch-and-bound method. While branch-and-bound is often slow and unscalable,\nthis does not apply to TEAR as we propose novel bounding functions that are\ntight and computationally efficient. Experiments on various datasets are\nconducted to validate the scalability and efficiency of our method.\n","authors":["Tianyu Huang","Liangzu Peng","René Vidal","Yun-Hui Liu"],"pdf_url":"https://arxiv.org/pdf/2404.00915v1.pdf","comment":"24 pages, 12 figures. Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00913v1","updated":"2024-04-01T04:39:21Z","published":"2024-04-01T04:39:21Z","title":"LLaMA-Excitor: General Instruction Tuning via Indirect Feature\n Interaction","summary":" Existing methods to fine-tune LLMs, like Adapter, Prefix-tuning, and LoRA,\nwhich introduce extra modules or additional input sequences to inject new\nskills or knowledge, may compromise the innate abilities of LLMs. In this\npaper, we propose LLaMA-Excitor, a lightweight method that stimulates the LLMs'\npotential to better follow instructions by gradually paying more attention to\nworthwhile information. Specifically, the LLaMA-Excitor does not directly\nchange the intermediate hidden state during the self-attention calculation of\nthe transformer structure. We designed the Excitor block as a bypass module for\nthe similarity score computation in LLMs' self-attention to reconstruct keys\nand change the importance of values by learnable prompts. LLaMA-Excitor ensures\na self-adaptive allocation of additional attention to input instructions, thus\neffectively preserving LLMs' pre-trained knowledge when fine-tuning LLMs on\nlow-quality instruction-following datasets. Furthermore, we unify the modeling\nof multi-modal tuning and language-only tuning, extending LLaMA-Excitor to a\npowerful visual instruction follower without the need for complex multi-modal\nalignment. Our proposed approach is evaluated in language-only and multi-modal\ntuning experimental scenarios. Notably, LLaMA-Excitor is the only method that\nmaintains basic capabilities while achieving a significant improvement (+6%) on\nthe MMLU benchmark. In the visual instruction tuning, we achieve a new\nstate-of-the-art image captioning performance of 157.5 CIDEr on MSCOCO, and a\ncomparable performance (88.39%) on ScienceQA to cutting-edge models with more\nparameters and extensive vision-language pretraining.\n","authors":["Bo Zou","Chao Yang","Yu Qiao","Chengbin Quan","Youjian Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.00913v1.pdf","comment":"This paper is accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00909v1","updated":"2024-04-01T04:28:01Z","published":"2024-04-01T04:28:01Z","title":"Learning by Correction: Efficient Tuning Task for Zero-Shot Generative\n Vision-Language Reasoning","summary":" Generative vision-language models (VLMs) have shown impressive performance in\nzero-shot vision-language tasks like image captioning and visual question\nanswering. However, improving their zero-shot reasoning typically requires\nsecond-stage instruction tuning, which relies heavily on human-labeled or large\nlanguage model-generated annotation, incurring high labeling costs. 
To tackle\nthis challenge, we introduce Image-Conditioned Caption Correction (ICCC), a\nnovel pre-training task designed to enhance VLMs' zero-shot performance without\nthe need for labeled task-aware data. The ICCC task compels VLMs to rectify\nmismatches between visual and language concepts, thereby enhancing instruction\nfollowing and text generation conditioned on visual inputs. Leveraging language\nstructure and a lightweight dependency parser, we construct data samples of\nICCC task from image-text datasets with low labeling and computation costs.\nExperimental results on BLIP-2 and InstructBLIP demonstrate significant\nimprovements in zero-shot image-text generation-based VL tasks through ICCC\ninstruction tuning.\n","authors":["Rongjie Li","Yu Wu","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2404.00909v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.01351v1","updated":"2024-04-01T04:21:49Z","published":"2024-04-01T04:21:49Z","title":"AETTA: Label-Free Accuracy Estimation for Test-Time Adaptation","summary":" Test-time adaptation (TTA) has emerged as a viable solution to adapt\npre-trained models to domain shifts using unlabeled test data. However, TTA\nfaces challenges of adaptation failures due to its reliance on blind adaptation\nto unknown test samples in dynamic scenarios. Traditional methods for\nout-of-distribution performance estimation are limited by unrealistic\nassumptions in the TTA context, such as requiring labeled data or re-training\nmodels. To address this issue, we propose AETTA, a label-free accuracy\nestimation algorithm for TTA. We propose the prediction disagreement as the\naccuracy estimate, calculated by comparing the target model prediction with\ndropout inferences. We then improve the prediction disagreement to extend the\napplicability of AETTA under adaptation failures. Our extensive evaluation with\nfour baselines and six TTA methods demonstrates that AETTA shows an average of\n19.8%p more accurate estimation compared with the baselines. We further\ndemonstrate the effectiveness of accuracy estimation with a model recovery case\nstudy, showcasing the practicality of our model recovery based on accuracy\nestimation. The source code is available at https://github.com/taeckyung/AETTA.\n","authors":["Taeckyung Lee","Sorn Chottananurak","Taesik Gong","Sung-Ju Lee"],"pdf_url":"https://arxiv.org/pdf/2404.01351v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00906v1","updated":"2024-04-01T04:21:01Z","published":"2024-04-01T04:21:01Z","title":"From Pixels to Graphs: Open-Vocabulary Scene Graph Generation with\n Vision-Language Models","summary":" Scene graph generation (SGG) aims to parse a visual scene into an\nintermediate graph representation for downstream reasoning tasks. Despite\nrecent advancements, existing methods struggle to generate scene graphs with\nnovel visual relation concepts. To address this challenge, we introduce a new\nopen-vocabulary SGG framework based on sequence generation. Our framework\nleverages vision-language pre-trained models (VLM) by incorporating an\nimage-to-graph generation paradigm. Specifically, we generate scene graph\nsequences via image-to-text generation with VLM and then construct scene graphs\nfrom these sequences. By doing so, we harness the strong capabilities of VLM\nfor open-vocabulary SGG and seamlessly integrate explicit relational modeling\nfor enhancing the VL tasks. 
Experimental results demonstrate that our design\nnot only achieves superior performance with an open vocabulary but also\nenhances downstream vision-language task performance through explicit relation\nmodeling knowledge.\n","authors":["Rongjie Li","Songyang Zhang","Dahua Lin","Kai Chen","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2404.00906v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.15918v2","updated":"2024-04-01T04:15:46Z","published":"2024-03-23T19:21:31Z","title":"An Embarrassingly Simple Defense Against Backdoor Attacks On SSL","summary":" Self Supervised Learning (SSL) has emerged as a powerful paradigm to tackle\ndata landscapes with absence of human supervision. The ability to learn\nmeaningful tasks without the use of labeled data makes SSL a popular method to\nmanage large chunks of data in the absence of labels. However, recent work\nindicates SSL to be vulnerable to backdoor attacks, wherein models can be\ncontrolled, possibly maliciously, to suit an adversary's motives. Li et. al\n(2022) introduce a novel frequency-based backdoor attack: CTRL. They show that\nCTRL can be used to efficiently and stealthily gain control over a victim's\nmodel trained using SSL. In this work, we devise two defense strategies against\nfrequency-based attacks in SSL: One applicable before model training and the\nsecond to be applied during model inference. Our first contribution utilizes\nthe invariance property of the downstream task to defend against backdoor\nattacks in a generalizable fashion. We observe the ASR (Attack Success Rate) to\nreduce by over 60% across experiments. Our Inference-time defense relies on\nevasiveness of the attack and uses the luminance channel to defend against\nattacks. Using object classification as the downstream task for SSL, we\ndemonstrate successful defense strategies that do not require re-training of\nthe model. Code is available at https://github.com/Aryan-Satpathy/Backdoor.\n","authors":["Aryan Satpathy","Nilaksh Nilaksh","Dhruva Rajwade"],"pdf_url":"https://arxiv.org/pdf/2403.15918v2.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2302.10174v2","updated":"2024-04-01T04:00:31Z","published":"2023-02-20T18:59:04Z","title":"Towards Universal Fake Image Detectors that Generalize Across Generative\n Models","summary":" With generative models proliferating at a rapid rate, there is a growing need\nfor general purpose fake image detectors. In this work, we first show that the\nexisting paradigm, which consists of training a deep network for real-vs-fake\nclassification, fails to detect fake images from newer breeds of generative\nmodels when trained to detect GAN fake images. Upon analysis, we find that the\nresulting classifier is asymmetrically tuned to detect patterns that make an\nimage fake. The real class becomes a sink class holding anything that is not\nfake, including generated images from models not accessible during training.\nBuilding upon this discovery, we propose to perform real-vs-fake classification\nwithout learning; i.e., using a feature space not explicitly trained to\ndistinguish real from fake images. We use nearest neighbor and linear probing\nas instantiations of this idea. 
When given access to the feature space of a\nlarge pretrained vision-language model, the very simple baseline of nearest\nneighbor classification has surprisingly good generalization ability in\ndetecting fake images from a wide variety of generative models; e.g., it\nimproves upon the SoTA by +15.07 mAP and +25.90% acc when tested on unseen\ndiffusion and autoregressive models.\n","authors":["Utkarsh Ojha","Yuheng Li","Yong Jae Lee"],"pdf_url":"https://arxiv.org/pdf/2302.10174v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00901v1","updated":"2024-04-01T03:58:51Z","published":"2024-04-01T03:58:51Z","title":"Slightly Shift New Classes to Remember Old Classes for Video\n Class-Incremental Learning","summary":" Recent video class-incremental learning usually excessively pursues the\naccuracy of the newly seen classes and relies on memory sets to mitigate\ncatastrophic forgetting of the old classes. However, limited storage only\nallows storing a few representative videos. So we propose SNRO, which slightly\nshifts the features of new classes to remember old classes. Specifically, SNRO\ncontains Examples Sparse(ES) and Early Break(EB). ES decimates at a lower\nsample rate to build memory sets and uses interpolation to align those sparse\nframes in the future. By this, SNRO stores more examples under the same memory\nconsumption and forces the model to focus on low-semantic features which are\nharder to be forgotten. EB terminates the training at a small epoch, preventing\nthe model from overstretching into the high-semantic space of the current task.\nExperiments on UCF101, HMDB51, and UESTC-MMEA-CL datasets show that SNRO\nperforms better than other approaches while consuming the same memory\nconsumption.\n","authors":["Jian Jiao","Yu Dai","Hefei Mei","Heqian Qiu","Chuanyang Gong","Shiyuan Tang","Xinpeng Hao","Hongliang Li"],"pdf_url":"https://arxiv.org/pdf/2404.00901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18363v3","updated":"2024-04-01T03:41:27Z","published":"2023-11-30T09:03:47Z","title":"Each Test Image Deserves A Specific Prompt: Continual Test-Time\n Adaptation for 2D Medical Image Segmentation","summary":" Distribution shift widely exists in medical images acquired from different\nmedical centres and poses a significant obstacle to deploying the pre-trained\nsemantic segmentation model in real-world applications. Test-time adaptation\nhas proven its effectiveness in tackling the cross-domain distribution shift\nduring inference. However, most existing methods achieve adaptation by updating\nthe pre-trained models, rendering them susceptible to error accumulation and\ncatastrophic forgetting when encountering a series of distribution shifts\n(i.e., under the continual test-time adaptation setup). To overcome these\nchallenges caused by updating the models, in this paper, we freeze the\npre-trained model and propose the Visual Prompt-based Test-Time Adaptation\n(VPTTA) method to train a specific prompt for each test image to align the\nstatistics in the batch normalization layers. Specifically, we present the\nlow-frequency prompt, which is lightweight with only a few parameters and can\nbe effectively trained in a single iteration. To enhance prompt initialization,\nwe equip VPTTA with a memory bank to benefit the current prompt from previous\nones. Additionally, we design a warm-up mechanism, which mixes source and\ntarget statistics to construct warm-up statistics, thereby facilitating the\ntraining process. 
Extensive experiments demonstrate the superiority of our\nVPTTA over other state-of-the-art methods on two medical image segmentation\nbenchmark tasks. The code and weights of pre-trained source models are\navailable at https://github.com/Chen-Ziyang/VPTTA.\n","authors":["Ziyang Chen","Yiwen Ye","Mengkang Lu","Yongsheng Pan","Yong Xia"],"pdf_url":"https://arxiv.org/pdf/2311.18363v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15011v2","updated":"2024-04-01T03:37:24Z","published":"2023-11-25T12:34:02Z","title":"VSCode: General Visual Salient and Camouflaged Object Detection with 2D\n Prompt Learning","summary":" Salient object detection (SOD) and camouflaged object detection (COD) are\nrelated yet distinct binary mapping tasks. These tasks involve multiple\nmodalities, sharing commonalities and unique cues. Existing research often\nemploys intricate task-specific specialist models, potentially leading to\nredundancy and suboptimal results. We introduce VSCode, a generalist model with\nnovel 2D prompt learning, to jointly address four SOD tasks and three COD\ntasks. We utilize VST as the foundation model and introduce 2D prompts within\nthe encoder-decoder architecture to learn domain and task-specific knowledge on\ntwo separate dimensions. A prompt discrimination loss helps disentangle\npeculiarities to benefit model optimization. VSCode outperforms\nstate-of-the-art methods across six tasks on 26 datasets and exhibits zero-shot\ngeneralization to unseen tasks by combining 2D prompts, such as RGB-D COD.\nSource code has been available at https://github.com/Sssssuperior/VSCode.\n","authors":["Ziyang Luo","Nian Liu","Wangbo Zhao","Xuguang Yang","Dingwen Zhang","Deng-Ping Fan","Fahad Khan","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2311.15011v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03477v3","updated":"2024-04-01T03:35:25Z","published":"2024-03-06T05:33:50Z","title":"Continual Segmentation with Disentangled Objectness Learning and Class\n Recognition","summary":" Most continual segmentation methods tackle the problem as a per-pixel\nclassification task. However, such a paradigm is very challenging, and we find\nquery-based segmenters with built-in objectness have inherent advantages\ncompared with per-pixel ones, as objectness has strong transfer ability and\nforgetting resistance. Based on these findings, we propose CoMasTRe by\ndisentangling continual segmentation into two stages: forgetting-resistant\ncontinual objectness learning and well-researched continual classification.\nCoMasTRe uses a two-stage segmenter learning class-agnostic mask proposals at\nthe first stage and leaving recognition to the second stage. During continual\nlearning, a simple but effective distillation is adopted to strengthen\nobjectness. To further mitigate the forgetting of old classes, we design a\nmulti-label class distillation strategy suited for segmentation. We assess the\neffectiveness of CoMasTRe on PASCAL VOC and ADE20K. 
Extensive experiments show\nthat our method outperforms per-pixel and query-based methods on both datasets.\nCode will be available at https://github.com/jordangong/CoMasTRe.\n","authors":["Yizheng Gong","Siyue Yu","Xiaoyang Wang","Jimin Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.03477v3.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00891v1","updated":"2024-04-01T03:35:09Z","published":"2024-04-01T03:35:09Z","title":"Marrying NeRF with Feature Matching for One-step Pose Estimation","summary":" Given the image collection of an object, we aim at building a real-time\nimage-based pose estimation method, which requires neither its CAD model nor\nhours of object-specific training. Recent NeRF-based methods provide a\npromising solution by directly optimizing the pose from pixel loss between\nrendered and target images. However, during inference, they require long\nconverging time, and suffer from local minima, making them impractical for\nreal-time robot applications. We aim at solving this problem by marrying image\nmatching with NeRF. With 2D matches and depth rendered by NeRF, we directly\nsolve the pose in one step by building 2D-3D correspondences between target and\ninitial view, thus allowing for real-time prediction. Moreover, to improve the\naccuracy of 2D-3D correspondences, we propose a 3D consistent point mining\nstrategy, which effectively discards unfaithful points reconstruted by NeRF.\nMoreover, current NeRF-based methods naively optimizing pixel loss fail at\noccluded images. Thus, we further propose a 2D matches based sampling strategy\nto preclude the occluded area. Experimental results on representative datasets\nprove that our method outperforms state-of-the-art methods, and improves\ninference efficiency by 90x, achieving real-time prediction at 6 FPS.\n","authors":["Ronghan Chen","Yang Cong","Yu Ren"],"pdf_url":"https://arxiv.org/pdf/2404.00891v1.pdf","comment":"ICRA, 2024. Video https://www.youtube.com/watch?v=70fgUobOFWo"},{"id":"http://arxiv.org/abs/2404.00879v1","updated":"2024-04-01T03:18:12Z","published":"2024-04-01T03:18:12Z","title":"Model-Agnostic Human Preference Inversion in Diffusion Models","summary":" Efficient text-to-image generation remains a challenging task due to the high\ncomputational costs associated with the multi-step sampling in diffusion\nmodels. Although distillation of pre-trained diffusion models has been\nsuccessful in reducing sampling steps, low-step image generation often falls\nshort in terms of quality. In this study, we propose a novel sampling design to\nachieve high-quality one-step image generation aligning with human preferences,\nparticularly focusing on exploring the impact of the prior noise distribution.\nOur approach, Prompt Adaptive Human Preference Inversion (PAHI), optimizes the\nnoise distributions for each prompt based on human preferences without the need\nfor fine-tuning diffusion models. Our experiments showcase that the tailored\nnoise distributions significantly improve image quality with only a marginal\nincrease in computational cost. 
Our findings underscore the importance of noise\noptimization and pave the way for efficient and high-quality text-to-image\nsynthesis.\n","authors":["Jeeyung Kim","Ze Wang","Qiang Qiu"],"pdf_url":"https://arxiv.org/pdf/2404.00879v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00878v1","updated":"2024-04-01T03:15:41Z","published":"2024-04-01T03:15:41Z","title":"TryOn-Adapter: Efficient Fine-Grained Clothing Identity Adaptation for\n High-Fidelity Virtual Try-On","summary":" Virtual try-on focuses on adjusting the given clothes to fit a specific\nperson seamlessly while avoiding any distortion of the patterns and textures of\nthe garment. However, the clothing identity uncontrollability and training\ninefficiency of existing diffusion-based methods, which struggle to maintain\nthe identity even with full parameter training, are significant limitations\nthat hinder the widespread applications. In this work, we propose an effective\nand efficient framework, termed TryOn-Adapter. Specifically, we first decouple\nclothing identity into fine-grained factors: style for color and category\ninformation, texture for high-frequency details, and structure for smooth\nspatial adaptive transformation. Our approach utilizes a pre-trained\nexemplar-based diffusion model as the fundamental network, whose parameters are\nfrozen except for the attention layers. We then customize three lightweight\nmodules (Style Preserving, Texture Highlighting, and Structure Adapting)\nincorporated with fine-tuning techniques to enable precise and efficient\nidentity control. Meanwhile, we introduce the training-free T-RePaint strategy\nto further enhance clothing identity preservation while maintaining the\nrealistic try-on effect during the inference. Our experiments demonstrate that\nour approach achieves state-of-the-art performance on two widely-used\nbenchmarks. Additionally, compared with recent full-tuning diffusion-based\nmethods, we only use about half of their tunable parameters during training.\nThe code will be made publicly available at\nhttps://github.com/jiazheng-xing/TryOn-Adapter.\n","authors":["Jiazheng Xing","Chao Xu","Yijie Qian","Yang Liu","Guang Dai","Baigui Sun","Yong Liu","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00876v1","updated":"2024-04-01T03:13:32Z","published":"2024-04-01T03:13:32Z","title":"MGMap: Mask-Guided Learning for Online Vectorized HD Map Construction","summary":" Currently, high-definition (HD) map construction leans towards a lightweight\nonline generation tendency, which aims to preserve timely and reliable road\nscene information. However, map elements contain strong shape priors. Subtle\nand sparse annotations make current detection-based frameworks ambiguous in\nlocating relevant feature scopes and cause the loss of detailed structures in\nprediction. To alleviate these problems, we propose MGMap, a mask-guided\napproach that effectively highlights the informative regions and achieves\nprecise map element localization by introducing the learned masks.\nSpecifically, MGMap employs learned masks based on the enhanced multi-scale BEV\nfeatures from two perspectives. At the instance level, we propose the\nMask-activated instance (MAI) decoder, which incorporates global instance and\nstructural information into instance queries by the activation of instance\nmasks. 
At the point level, a novel position-guided mask patch refinement\n(PG-MPR) module is designed to refine point locations from a finer-grained\nperspective, enabling the extraction of point-specific patch information.\nCompared to the baselines, our proposed MGMap achieves a notable improvement of\naround 10 mAP for different input modalities. Extensive experiments also\ndemonstrate that our approach showcases strong robustness and generalization\ncapabilities. Our code can be found at https://github.com/xiaolul2/MGMap.\n","authors":["Xiaolu Liu","Song Wang","Wentong Li","Ruizi Yang","Junbo Chen","Jianke Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.00876v1.pdf","comment":"18 pages, 11 figures, accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2310.16279v2","updated":"2024-04-01T03:13:21Z","published":"2023-10-25T01:24:12Z","title":"TransPose: 6D Object Pose Estimation with Geometry-Aware Transformer","summary":" Estimating the 6D object pose is an essential task in many applications. Due\nto the lack of depth information, existing RGB-based methods are sensitive to\nocclusion and illumination changes. How to extract and utilize the geometry\nfeatures in depth information is crucial to achieve accurate predictions. To\nthis end, we propose TransPose, a novel 6D pose framework that exploits\nTransformer Encoder with geometry-aware module to develop better learning of\npoint cloud feature representations. Specifically, we first uniformly sample\npoint cloud and extract local geometry features with the designed local feature\nextractor base on graph convolution network. To improve robustness to\nocclusion, we adopt Transformer to perform the exchange of global information,\nmaking each local feature contains global information. Finally, we introduce\ngeometry-aware module in Transformer Encoder, which to form an effective\nconstrain for point cloud feature learning and makes the global information\nexchange more tightly coupled with point cloud tasks. Extensive experiments\nindicate the effectiveness of TransPose, our pose estimation pipeline achieves\ncompetitive results on three benchmark datasets.\n","authors":["Xiao Lin","Deming Wang","Guangliang Zhou","Chengju Liu","Qijun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.16279v2.pdf","comment":"accept by NEUROCOMPUTING"},{"id":"http://arxiv.org/abs/2312.16457v2","updated":"2024-04-01T03:10:53Z","published":"2023-12-27T08:00:47Z","title":"City-on-Web: Real-time Neural Rendering of Large-scale Scenes on the Web","summary":" Existing neural radiance field-based methods can achieve real-time rendering\nof small scenes on the web platform. However, extending these methods to\nlarge-scale scenes still poses significant challenges due to limited resources\nin computation, memory, and bandwidth. In this paper, we propose City-on-Web,\nthe first method for real-time rendering of large-scale scenes on the web. We\npropose a block-based volume rendering method to guarantee 3D consistency and\ncorrect occlusion between blocks, and introduce a Level-of-Detail strategy\ncombined with dynamic loading/unloading of resources to significantly reduce\nmemory demands. 
Our system achieves real-time rendering of large-scale scenes\nat approximately 32FPS with RTX 3060 GPU on the web and maintains rendering\nquality comparable to the current state-of-the-art novel view synthesis\nmethods.\n","authors":["Kaiwen Song","Xiaoyi Zeng","Chenqu Ren","Juyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.16457v2.pdf","comment":"Project page: https://ustc3dv.github.io/City-on-Web/"},{"id":"http://arxiv.org/abs/2404.00874v1","updated":"2024-04-01T03:06:23Z","published":"2024-04-01T03:06:23Z","title":"DiSR-NeRF: Diffusion-Guided View-Consistent Super-Resolution NeRF","summary":" We present DiSR-NeRF, a diffusion-guided framework for view-consistent\nsuper-resolution (SR) NeRF. Unlike prior works, we circumvent the requirement\nfor high-resolution (HR) reference images by leveraging existing powerful 2D\nsuper-resolution models. Nonetheless, independent SR 2D images are often\ninconsistent across different views. We thus propose Iterative 3D\nSynchronization (I3DS) to mitigate the inconsistency problem via the inherent\nmulti-view consistency property of NeRF. Specifically, our I3DS alternates\nbetween upscaling low-resolution (LR) rendered images with diffusion models,\nand updating the underlying 3D representation with standard NeRF training. We\nfurther introduce Renoised Score Distillation (RSD), a novel score-distillation\nobjective for 2D image resolution. Our RSD combines features from ancestral\nsampling and Score Distillation Sampling (SDS) to generate sharp images that\nare also LR-consistent. Qualitative and quantitative results on both synthetic\nand real-world datasets demonstrate that our DiSR-NeRF can achieve better\nresults on NeRF super-resolution compared with existing works. Code and video\nresults available at the project website.\n","authors":["Jie Long Lee","Chen Li","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2404.00874v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01901v2","updated":"2024-04-01T03:00:21Z","published":"2024-03-04T09:59:48Z","title":"FaceChain-ImagineID: Freely Crafting High-Fidelity Diverse Talking Faces\n from Disentangled Audio","summary":" In this paper, we abstract the process of people hearing speech, extracting\nmeaningful cues, and creating various dynamically audio-consistent talking\nfaces, termed Listening and Imagining, into the task of high-fidelity diverse\ntalking faces generation from a single audio. Specifically, it involves two\ncritical challenges: one is to effectively decouple identity, content, and\nemotion from entangled audio, and the other is to maintain intra-video\ndiversity and inter-video consistency. To tackle the issues, we first dig out\nthe intricate relationships among facial factors and simplify the decoupling\nprocess, tailoring a Progressive Audio Disentanglement for accurate facial\ngeometry and semantics learning, where each stage incorporates a customized\ntraining module responsible for a specific factor. Secondly, to achieve\nvisually diverse and audio-synchronized animation solely from input audio\nwithin a single model, we introduce the Controllable Coherent Frame generation,\nwhich involves the flexible integration of three trainable adapters with frozen\nLatent Diffusion Models (LDMs) to focus on maintaining facial geometry and\nsemantics, as well as texture and temporal coherence between frames. In this\nway, we inherit high-quality diverse generation from LDMs while significantly\nimproving their controllability at a low training cost. 
Extensive experiments\ndemonstrate the flexibility and effectiveness of our method in handling this\nparadigm. The codes will be released at\nhttps://github.com/modelscope/facechain.\n","authors":["Chao Xu","Yang Liu","Jiazheng Xing","Weida Wang","Mingze Sun","Jun Dan","Tianxin Huang","Siyuan Li","Zhi-Qi Cheng","Ying Tai","Baigui Sun"],"pdf_url":"https://arxiv.org/pdf/2403.01901v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06742v2","updated":"2024-04-01T03:00:06Z","published":"2023-12-11T18:59:06Z","title":"Honeybee: Locality-enhanced Projector for Multimodal LLM","summary":" In Multimodal Large Language Models (MLLMs), a visual projector plays a\ncrucial role in bridging pre-trained vision encoders with LLMs, enabling\nprofound visual understanding while harnessing the LLMs' robust capabilities.\nDespite the importance of the visual projector, it has been relatively less\nexplored. In this study, we first identify two essential projector properties:\n(i) flexibility in managing the number of visual tokens, crucial for MLLMs'\noverall efficiency, and (ii) preservation of local context from visual\nfeatures, vital for spatial understanding. Based on these findings, we propose\na novel projector design that is both flexible and locality-enhanced,\neffectively satisfying the two desirable properties. Additionally, we present\ncomprehensive strategies to effectively utilize multiple and multifaceted\ninstruction datasets. Through extensive experiments, we examine the impact of\nindividual design choices. Finally, our proposed MLLM, Honeybee, remarkably\noutperforms previous state-of-the-art methods across various benchmarks,\nincluding MME, MMBench, SEED-Bench, and LLaVA-Bench, achieving significantly\nhigher efficiency. Code and models are available at\nhttps://github.com/kakaobrain/honeybee.\n","authors":["Junbum Cha","Wooyoung Kang","Jonghwan Mun","Byungseok Roh"],"pdf_url":"https://arxiv.org/pdf/2312.06742v2.pdf","comment":"CVPR 2024 camera-ready"},{"id":"http://arxiv.org/abs/2309.16211v2","updated":"2024-04-01T02:49:49Z","published":"2023-09-28T07:37:18Z","title":"VDC: Versatile Data Cleanser based on Visual-Linguistic Inconsistency by\n Multimodal Large Language Models","summary":" The role of data in building AI systems has recently been emphasized by the\nemerging concept of data-centric AI. Unfortunately, in the real-world, datasets\nmay contain dirty samples, such as poisoned samples from backdoor attack, noisy\nlabels in crowdsourcing, and even hybrids of them. The presence of such dirty\nsamples makes the DNNs vunerable and unreliable.Hence, it is critical to detect\ndirty samples to improve the quality and realiability of dataset. Existing\ndetectors only focus on detecting poisoned samples or noisy labels, that are\noften prone to weak generalization when dealing with dirty samples from other\ndomains.In this paper, we find a commonality of various dirty samples is\nvisual-linguistic inconsistency between images and associated labels. 
To\ncapture the semantic inconsistency between modalities, we propose versatile\ndata cleanser (VDC) leveraging the surpassing capabilities of multimodal large\nlanguage models (MLLM) in cross-modal alignment and reasoning.It consists of\nthree consecutive modules: the visual question generation module to generate\ninsightful questions about the image; the visual question answering module to\nacquire the semantics of the visual content by answering the questions with\nMLLM; followed by the visual answer evaluation module to evaluate the\ninconsistency.Extensive experiments demonstrate its superior performance and\ngeneralization to various categories and types of dirty samples. The code is\navailable at \\url{https://github.com/zihao-ai/vdc}.\n","authors":["Zihao Zhu","Mingda Zhang","Shaokui Wei","Bingzhe Wu","Baoyuan Wu"],"pdf_url":"https://arxiv.org/pdf/2309.16211v2.pdf","comment":"Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2312.14985v2","updated":"2024-04-01T02:29:20Z","published":"2023-12-22T05:00:30Z","title":"UniHuman: A Unified Model for Editing Human Images in the Wild","summary":" Human image editing includes tasks like changing a person's pose, their\nclothing, or editing the image according to a text prompt. However, prior work\noften tackles these tasks separately, overlooking the benefit of mutual\nreinforcement from learning them jointly. In this paper, we propose UniHuman, a\nunified model that addresses multiple facets of human image editing in\nreal-world settings. To enhance the model's generation quality and\ngeneralization capacity, we leverage guidance from human visual encoders and\nintroduce a lightweight pose-warping module that can exploit different pose\nrepresentations, accommodating unseen textures and patterns. Furthermore, to\nbridge the disparity between existing human editing benchmarks with real-world\ndata, we curated 400K high-quality human image-text pairs for training and\ncollected 2K human images for out-of-domain testing, both encompassing diverse\nclothing styles, backgrounds, and age groups. Experiments on both in-domain and\nout-of-domain test sets demonstrate that UniHuman outperforms task-specific\nmodels by a significant margin. In user studies, UniHuman is preferred by the\nusers in an average of 77% of cases. Our project is available at\nhttps://github.com/NannanLi999/UniHuman.\n","authors":["Nannan Li","Qing Liu","Krishna Kumar Singh","Yilin Wang","Jianming Zhang","Bryan A. Plummer","Zhe Lin"],"pdf_url":"https://arxiv.org/pdf/2312.14985v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00860v1","updated":"2024-04-01T02:01:33Z","published":"2024-04-01T02:01:33Z","title":"Lipsum-FT: Robust Fine-Tuning of Zero-Shot Models Using Random Text\n Guidance","summary":" Large-scale contrastive vision-language pre-trained models provide the\nzero-shot model achieving competitive performance across a range of image\nclassification tasks without requiring training on downstream data. Recent\nworks have confirmed that while additional fine-tuning of the zero-shot model\non the reference data results in enhanced downstream performance, it\ncompromises the model's robustness against distribution shifts. Our\ninvestigation begins by examining the conditions required to achieve the goals\nof robust fine-tuning, employing descriptions based on feature distortion\ntheory and joint energy-based models. 
Subsequently, we propose a novel robust\nfine-tuning algorithm, Lipsum-FT, that effectively utilizes the language\nmodeling aspect of the vision-language pre-trained models. Extensive\nexperiments conducted on distribution shift scenarios in DomainNet and ImageNet\nconfirm the superiority of our proposed Lipsum-FT approach over existing robust\nfine-tuning methods.\n","authors":["Giung Nam","Byeongho Heo","Juho Lee"],"pdf_url":"https://arxiv.org/pdf/2404.00860v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2404.00857v1","updated":"2024-04-01T01:56:27Z","published":"2024-04-01T01:56:27Z","title":"Meta Episodic learning with Dynamic Task Sampling for CLIP-based Point\n Cloud Classification","summary":" Point cloud classification refers to the process of assigning semantic labels\nor categories to individual points within a point cloud data structure. Recent\nworks have explored the extension of pre-trained CLIP to 3D recognition. In\nthis direction, CLIP-based point cloud models like PointCLIP, CLIP2Point have\nbecome state-of-the-art methods in the few-shot setup. Although these methods\nshow promising performance for some classes like airplanes, desks, guitars,\netc, the performance for some classes like the cup, flower pot, sink,\nnightstand, etc is still far from satisfactory. This is due to the fact that\nthe adapter of CLIP-based models is trained using randomly sampled N-way K-shot\ndata in the standard supervised learning setup. In this paper, we propose a\nnovel meta-episodic learning framework for CLIP-based point cloud\nclassification, addressing the challenges of limited training examples and\nsampling unknown classes. Additionally, we introduce dynamic task sampling\nwithin the episode based on performance memory. This sampling strategy\neffectively addresses the challenge of sampling unknown classes, ensuring that\nthe model learns from a diverse range of classes and promotes the exploration\nof underrepresented categories. By dynamically updating the performance memory,\nwe adaptively prioritize the sampling of classes based on their performance,\nenhancing the model's ability to handle challenging and real-world scenarios.\nExperiments show an average performance gain of 3-6\\% on ModelNet40 and\nScanobjectNN datasets in a few-shot setup.\n","authors":["Shuvozit Ghose","Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00855v1","updated":"2024-04-01T01:49:08Z","published":"2024-04-01T01:49:08Z","title":"TSOM: Small Object Motion Detection Neural Network Inspired by Avian\n Visual Circuit","summary":" Detecting small moving objects in complex backgrounds from an overhead\nperspective is a highly challenging task for machine vision systems. As an\ninspiration from nature, the avian visual system is capable of processing\nmotion information in various complex aerial scenes, and its Retina-OT-Rt\nvisual circuit is highly sensitive to capturing the motion information of small\nobjects from high altitudes. However, more needs to be done on small object\nmotion detection algorithms based on the avian visual system. In this paper, we\nconducted mathematical modeling based on extensive studies of the biological\nmechanisms of the Retina-OT-Rt visual circuit. Based on this, we proposed a\nnovel tectum small object motion detection neural network (TSOM). The neural\nnetwork includes the retina, SGC dendritic, SGC Soma, and Rt layers, each layer\ncorresponding to neurons in the visual pathway. 
The Retina layer is responsible\nfor accurately projecting input content, the SGC dendritic layer perceives and\nencodes spatial-temporal information, the SGC Soma layer computes complex\nmotion information and extracts small objects, and the Rt layer integrates and\ndecodes motion information from multiple directions to determine the position\nof small objects. Extensive experiments on pigeon neurophysiological\nexperiments and image sequence data showed that the TSOM is biologically\ninterpretable and effective in extracting reliable small object motion features\nfrom complex high-altitude backgrounds.\n","authors":["Pignge Hu","Xiaoteng Zhang","Mengmeng Li","Yingjie Zhu","Li Shi"],"pdf_url":"https://arxiv.org/pdf/2404.00855v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00852v1","updated":"2024-04-01T01:45:30Z","published":"2024-04-01T01:45:30Z","title":"Ensemble Learning for Vietnamese Scene Text Spotting in Urban\n Environments","summary":" This paper presents a simple yet efficient ensemble learning framework for\nVietnamese scene text spotting. Leveraging the power of ensemble learning,\nwhich combines multiple models to yield more accurate predictions, our approach\naims to significantly enhance the performance of scene text spotting in\nchallenging urban settings. Through experimental evaluations on the VinText\ndataset, our proposed method achieves a significant improvement in accuracy\ncompared to existing methods with an impressive accuracy of 5%. These results\nunequivocally demonstrate the efficacy of ensemble learning in the context of\nVietnamese scene text spotting in urban environments, highlighting its\npotential for real world applications, such as text detection and recognition\nin urban signage, advertisements, and various text-rich urban scenes.\n","authors":["Hieu Nguyen","Cong-Hoang Ta","Phuong-Thuy Le-Nguyen","Minh-Triet Tran","Trung-Nghia Le"],"pdf_url":"https://arxiv.org/pdf/2404.00852v1.pdf","comment":"RIVF 2023"},{"id":"http://arxiv.org/abs/2404.00851v1","updated":"2024-04-01T01:42:23Z","published":"2024-04-01T01:42:23Z","title":"Prompt Learning via Meta-Regularization","summary":" Pre-trained vision-language models have shown impressive success on various\ncomputer vision tasks with their zero-shot generalizability. Recently, prompt\nlearning approaches have been explored to efficiently and effectively adapt the\nvision-language models to a variety of downstream tasks. However, most existing\nprompt learning methods suffer from task overfitting since the general\nknowledge of the pre-trained vision language models is forgotten while the\nprompts are finetuned on a small data set from a specific target task. To\naddress this issue, we propose a Prompt Meta-Regularization (ProMetaR) to\nimprove the generalizability of prompt learning for vision-language models.\nSpecifically, ProMetaR meta-learns both the regularizer and the soft prompts to\nharness the task-specific knowledge from the downstream tasks and task-agnostic\ngeneral knowledge from the vision-language models. Further, ProMetaR augments\nthe task to generate multiple virtual tasks to alleviate the meta-overfitting.\nIn addition, we provide the analysis to comprehend how ProMetaR improves the\ngeneralizability of prompt tuning in the perspective of the gradient alignment.\nOur extensive experiments demonstrate that our ProMetaR improves the\ngeneralizability of conventional prompt learning methods under\nbase-to-base/base-to-new and domain generalization settings. 
The code of\nProMetaR is available at https://github.com/mlvlab/ProMetaR.\n","authors":["Jinyoung Park","Juyeon Ko","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2404.00851v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2307.16368v3","updated":"2024-04-01T01:33:53Z","published":"2023-07-31T02:14:19Z","title":"AntGPT: Can Large Language Models Help Long-term Action Anticipation\n from Videos?","summary":" Can we better anticipate an actor's future actions (e.g. mix eggs) by knowing\nwhat commonly happens after his/her current action (e.g. crack eggs)? What if\nwe also know the longer-term goal of the actor (e.g. making egg fried rice)?\nThe long-term action anticipation (LTA) task aims to predict an actor's future\nbehavior from video observations in the form of verb and noun sequences, and it\nis crucial for human-machine interaction. We propose to formulate the LTA task\nfrom two perspectives: a bottom-up approach that predicts the next actions\nautoregressively by modeling temporal dynamics; and a top-down approach that\ninfers the goal of the actor and plans the needed procedure to accomplish the\ngoal. We hypothesize that large language models (LLMs), which have been\npretrained on procedure text data (e.g. recipes, how-tos), have the potential\nto help LTA from both perspectives. It can help provide the prior knowledge on\nthe possible next actions, and infer the goal given the observed part of a\nprocedure, respectively. To leverage the LLMs, we propose a two-stage\nframework, AntGPT. It first recognizes the actions already performed in the\nobserved videos and then asks an LLM to predict the future actions via\nconditioned generation, or to infer the goal and plan the whole procedure by\nchain-of-thought prompting. Empirical results on the Ego4D LTA v1 and v2\nbenchmarks, EPIC-Kitchens-55, as well as EGTEA GAZE+ demonstrate the\neffectiveness of our proposed approach. AntGPT achieves state-of-the-art\nperformance on all above benchmarks, and can successfully infer the goal and\nthus perform goal-conditioned \"counterfactual\" prediction via qualitative\nanalysis. Code and model will be released at\nhttps://brown-palm.github.io/AntGPT\n","authors":["Qi Zhao","Shijie Wang","Ce Zhang","Changcheng Fu","Minh Quan Do","Nakul Agarwal","Kwonjoon Lee","Chen Sun"],"pdf_url":"https://arxiv.org/pdf/2307.16368v3.pdf","comment":"ICLR 2024 Camera Ready"},{"id":"http://arxiv.org/abs/2404.00849v1","updated":"2024-04-01T01:32:11Z","published":"2024-04-01T01:32:11Z","title":"Generating Content for HDR Deghosting from Frequency View","summary":" Recovering ghost-free High Dynamic Range (HDR) images from multiple Low\nDynamic Range (LDR) images becomes challenging when the LDR images exhibit\nsaturation and significant motion. Recent Diffusion Models (DMs) have been\nintroduced in HDR imaging field, demonstrating promising performance,\nparticularly in achieving visually perceptible results compared to previous\nDNN-based methods. However, DMs require extensive iterations with large models\nto estimate entire images, resulting in inefficiency that hinders their\npractical application. To address this challenge, we propose the Low-Frequency\naware Diffusion (LF-Diff) model for ghost-free HDR imaging. The key idea of\nLF-Diff is implementing the DMs in a highly compacted latent space and\nintegrating it into a regression-based model to enhance the details of\nreconstructed images. 
Specifically, as low-frequency information is closely\nrelated to human visual perception we propose to utilize DMs to create compact\nlow-frequency priors for the reconstruction process. In addition, to take full\nadvantage of the above low-frequency priors, the Dynamic HDR Reconstruction\nNetwork (DHRNet) is carried out in a regression-based manner to obtain final\nHDR images. Extensive experiments conducted on synthetic and real-world\nbenchmark datasets demonstrate that our LF-Diff performs favorably against\nseveral state-of-the-art methods and is 10$\\times$ faster than previous\nDM-based methods.\n","authors":["Tao Hu","Qingsen Yan","Yuankai Qi","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.00849v1.pdf","comment":"This paper is accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.19898v2","updated":"2024-04-01T01:27:14Z","published":"2024-03-29T00:40:12Z","title":"Structure Matters: Tackling the Semantic Discrepancy in Diffusion Models\n for Image Inpainting","summary":" Denoising diffusion probabilistic models for image inpainting aim to add the\nnoise to the texture of image during the forward process and recover masked\nregions with unmasked ones of the texture via the reverse denoising process.\nDespite the meaningful semantics generation, the existing arts suffer from the\nsemantic discrepancy between masked and unmasked regions, since the\nsemantically dense unmasked texture fails to be completely degraded while the\nmasked regions turn to the pure noise in diffusion process, leading to the\nlarge discrepancy between them. In this paper, we aim to answer how unmasked\nsemantics guide texture denoising process;together with how to tackle the\nsemantic discrepancy, to facilitate the consistent and meaningful semantics\ngeneration. To this end, we propose a novel structure-guided diffusion model\nnamed StrDiffusion, to reformulate the conventional texture denoising process\nunder structure guidance to derive a simplified denoising objective for image\ninpainting, while revealing: 1) the semantically sparse structure is beneficial\nto tackle semantic discrepancy in early stage, while dense texture generates\nreasonable semantics in late stage; 2) the semantics from unmasked regions\nessentially offer the time-dependent structure guidance for the texture\ndenoising process, benefiting from the time-dependent sparsity of the structure\nsemantics. For the denoising process, a structure-guided neural network is\ntrained to estimate the simplified denoising objective by exploiting the\nconsistency of the denoised structure between masked and unmasked regions.\nBesides, we devise an adaptive resampling strategy as a formal criterion as\nwhether structure is competent to guide the texture denoising process, while\nregulate their semantic correlations. Extensive experiments validate the merits\nof StrDiffusion over the state-of-the-arts. 
Our code is available at\nhttps://github.com/htyjers/StrDiffusion.\n","authors":["Haipeng Liu","Yang Wang","Biao Qian","Meng Wang","Yong Rui"],"pdf_url":"https://arxiv.org/pdf/2403.19898v2.pdf","comment":"15 pages, 10 figures, to appear CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00847v1","updated":"2024-04-01T01:25:06Z","published":"2024-04-01T01:25:06Z","title":"Collaborative Learning of Anomalies with Privacy (CLAP) for Unsupervised\n Video Anomaly Detection: A New Baseline","summary":" Unsupervised (US) video anomaly detection (VAD) in surveillance applications\nis gaining more popularity recently due to its practical real-world\napplications. As surveillance videos are privacy sensitive and the availability\nof large-scale video data may enable better US-VAD systems, collaborative\nlearning can be highly rewarding in this setting. However, due to the extremely\nchallenging nature of the US-VAD task, where learning is carried out without\nany annotations, privacy-preserving collaborative learning of US-VAD systems\nhas not been studied yet. In this paper, we propose a new baseline for anomaly\ndetection capable of localizing anomalous events in complex surveillance videos\nin a fully unsupervised fashion without any labels on a privacy-preserving\nparticipant-based distributed training configuration. Additionally, we propose\nthree new evaluation protocols to benchmark anomaly detection approaches on\nvarious scenarios of collaborations and data availability. Based on these\nprotocols, we modify existing VAD datasets to extensively evaluate our approach\nas well as existing US SOTA methods on two large-scale datasets including\nUCF-Crime and XD-Violence. All proposed evaluation protocols, dataset splits,\nand codes are available here: https://github.com/AnasEmad11/CLAP\n","authors":["Anas Al-lahham","Muhammad Zaigham Zaheer","Nurbek Tastan","Karthik Nandakumar"],"pdf_url":"https://arxiv.org/pdf/2404.00847v1.pdf","comment":"Accepted in IEEE/CVF Computer Vision and Pattern Recognition\n Conference (CVPR), 2024"},{"id":"http://arxiv.org/abs/2404.00846v1","updated":"2024-04-01T01:23:58Z","published":"2024-04-01T01:23:58Z","title":"Transfer Learning with Point Transformers","summary":" Point Transformers are near state-of-the-art models for classification,\nsegmentation, and detection tasks on Point Cloud data. They utilize a self\nattention based mechanism to model large range spatial dependencies between\nmultiple point sets. In this project we explore two things: classification\nperformance of these attention based networks on ModelNet10 dataset and then,\nwe use the trained model to classify 3D MNIST dataset after finetuning. We also\ntrain the model from scratch on 3D MNIST dataset to compare the performance of\nfinetuned and from-scratch model on the MNIST dataset. 
We observe that since\nthe two datasets have a large difference in the degree of the distributions,\ntransfer learned models do not outperform the from-scratch models in this case.\nAlthough we do expect transfer learned models to converge faster since they\nalready know the lower level edges, corners, etc features from the ModelNet10\ndataset.\n","authors":["Kartik Gupta","Rahul Vippala","Sahima Srivastava"],"pdf_url":"https://arxiv.org/pdf/2404.00846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13831v3","updated":"2024-04-01T01:18:26Z","published":"2023-11-23T07:25:31Z","title":"Posterior Distillation Sampling","summary":" We introduce Posterior Distillation Sampling (PDS), a novel optimization\nmethod for parametric image editing based on diffusion models. Existing\noptimization-based methods, which leverage the powerful 2D prior of diffusion\nmodels to handle various parametric images, have mainly focused on generation.\nUnlike generation, editing requires a balance between conforming to the target\nattribute and preserving the identity of the source content. Recent 2D image\nediting methods have achieved this balance by leveraging the stochastic latent\nencoded in the generative process of diffusion models. To extend the editing\ncapabilities of diffusion models shown in pixel space to parameter space, we\nreformulate the 2D image editing method into an optimization form named PDS.\nPDS matches the stochastic latents of the source and the target, enabling the\nsampling of targets in diverse parameter spaces that align with a desired\nattribute while maintaining the source's identity. We demonstrate that this\noptimization resembles running a generative process with the target attribute,\nbut aligning this process with the trajectory of the source's generative\nprocess. Extensive editing results in Neural Radiance Fields and Scalable\nVector Graphics representations demonstrate that PDS is capable of sampling\ntargets to fulfill the aforementioned balance across various parameter spaces.\n","authors":["Juil Koo","Chanho Park","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2311.13831v3.pdf","comment":"Project page: https://posterior-distillation-sampling.github.io/"},{"id":"http://arxiv.org/abs/2404.00842v1","updated":"2024-04-01T00:47:02Z","published":"2024-04-01T00:47:02Z","title":"An N-Point Linear Solver for Line and Motion Estimation with Event\n Cameras","summary":" Event cameras respond primarily to edges--formed by strong gradients--and are\nthus particularly well-suited for line-based motion estimation. Recent work has\nshown that events generated by a single line each satisfy a polynomial\nconstraint which describes a manifold in the space-time volume. Multiple such\nconstraints can be solved simultaneously to recover the partial linear velocity\nand line parameters. In this work, we show that, with a suitable line\nparametrization, this system of constraints is actually linear in the unknowns,\nwhich allows us to design a novel linear solver. Unlike existing solvers, our\nlinear solver (i) is fast and numerically stable since it does not rely on\nexpensive root finding, (ii) can solve both minimal and overdetermined systems\nwith more than 5 events, and (iii) admits the characterization of all\ndegenerate cases and multiple solutions. The found line parameters are\nsingularity-free and have a fixed scale, which eliminates the need for\nauxiliary constraints typically encountered in previous work. 
To recover the\nfull linear camera velocity we fuse observations from multiple lines with a\nnovel velocity averaging scheme that relies on a geometrically-motivated\nresidual, and thus solves the problem more efficiently than previous schemes\nwhich minimize an algebraic residual. Extensive experiments in synthetic and\nreal-world settings demonstrate that our method surpasses the previous work in\nnumerical stability, and operates over 600 times faster.\n","authors":["Ling Gao","Daniel Gehrig","Hang Su","Davide Scaramuzza","Laurent Kneip"],"pdf_url":"https://arxiv.org/pdf/2404.00842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00838v1","updated":"2024-04-01T00:31:11Z","published":"2024-04-01T00:31:11Z","title":"3MOS: Multi-sources, Multi-resolutions, and Multi-scenes dataset for\n Optical-SAR image matching","summary":" Optical-SAR image matching is a fundamental task for image fusion and visual\nnavigation. However, all large-scale open SAR dataset for methods development\nare collected from single platform, resulting in limited satellite types and\nspatial resolutions. Since images captured by different sensors vary\nsignificantly in both geometric and radiometric appearance, existing methods\nmay fail to match corresponding regions containing the same content. Besides,\nmost of existing datasets have not been categorized based on the\ncharacteristics of different scenes. To encourage the design of more general\nmulti-modal image matching methods, we introduce a large-scale\nMulti-sources,Multi-resolutions, and Multi-scenes dataset for Optical-SAR image\nmatching(3MOS). It consists of 155K optical-SAR image pairs, including SAR data\nfrom six commercial satellites, with resolutions ranging from 1.25m to 12.5m.\nThe data has been classified into eight scenes including urban, rural, plains,\nhills, mountains, water, desert, and frozen earth. Extensively experiments show\nthat none of state-of-the-art methods achieve consistently superior performance\nacross different sources, resolutions and scenes. In addition, the distribution\nof data has a substantial impact on the matching capability of deep learning\nmodels, this proposes the domain adaptation challenge in optical-SAR image\nmatching. Our data and code will be available at:https://github.com/3M-OS/3MOS.\n","authors":["Yibin Ye","Xichao Teng","Shuo Chen","Yijie Bian","Tao Tan","Zhang Li"],"pdf_url":"https://arxiv.org/pdf/2404.00838v1.pdf","comment":"20pages 17 figures"},{"id":"http://arxiv.org/abs/2404.00837v1","updated":"2024-04-01T00:23:22Z","published":"2024-04-01T00:23:22Z","title":"Automated HER2 Scoring in Breast Cancer Images Using Deep Learning and\n Pyramid Sampling","summary":" Human epidermal growth factor receptor 2 (HER2) is a critical protein in\ncancer cell growth that signifies the aggressiveness of breast cancer (BC) and\nhelps predict its prognosis. Accurate assessment of immunohistochemically (IHC)\nstained tissue slides for HER2 expression levels is essential for both\ntreatment guidance and understanding of cancer mechanisms. Nevertheless, the\ntraditional workflow of manual examination by board-certified pathologists\nencounters challenges, including inter- and intra-observer inconsistency and\nextended turnaround times. Here, we introduce a deep learning-based approach\nutilizing pyramid sampling for the automated classification of HER2 status in\nIHC-stained BC tissue images. 
Our approach analyzes morphological features at\nvarious spatial scales, efficiently managing the computational load and\nfacilitating a detailed examination of cellular and larger-scale tissue-level\ndetails. This method addresses the tissue heterogeneity of HER2 expression by\nproviding a comprehensive view, leading to a blind testing classification\naccuracy of 84.70%, on a dataset of 523 core images from tissue microarrays.\nOur automated system, proving reliable as an adjunct pathology tool, has the\npotential to enhance diagnostic precision and evaluation speed, and might\nsignificantly impact cancer treatment planning.\n","authors":["Sahan Yoruc Selcuk","Xilin Yang","Bijie Bai","Yijie Zhang","Yuzhu Li","Musa Aydin","Aras Firat Unal","Aditya Gomatam","Zhen Guo","Darrow Morgan Angus","Goren Kolodney","Karine Atlan","Tal Keidar Haran","Nir Pillar","Aydogan Ozcan"],"pdf_url":"https://arxiv.org/pdf/2404.00837v1.pdf","comment":"21 Pages, 7 Figures"},{"id":"http://arxiv.org/abs/2404.00834v1","updated":"2024-04-01T00:18:17Z","published":"2024-04-01T00:18:17Z","title":"Towards Robust Event-guided Low-Light Image Enhancement: A Large-Scale\n Real-World Event-Image Dataset and Novel Approach","summary":" Event camera has recently received much attention for low-light image\nenhancement (LIE) thanks to their distinct advantages, such as high dynamic\nrange. However, current research is prohibitively restricted by the lack of\nlarge-scale, real-world, and spatial-temporally aligned event-image datasets.\nTo this end, we propose a real-world (indoor and outdoor) dataset comprising\nover 30K pairs of images and events under both low and normal illumination\nconditions. To achieve this, we utilize a robotic arm that traces a consistent\nnon-linear trajectory to curate the dataset with spatial alignment precision\nunder 0.03mm. We then introduce a matching alignment strategy, rendering 90% of\nour dataset with errors less than 0.01s. Based on the dataset, we propose a\nnovel event-guided LIE approach, called EvLight, towards robust performance in\nreal-world low-light scenes. Specifically, we first design the multi-scale\nholistic fusion branch to extract holistic structural and textural information\nfrom both events and images. To ensure robustness against variations in the\nregional illumination and noise, we then introduce a Signal-to-Noise-Ratio\n(SNR)-guided regional feature selection to selectively fuse features of images\nfrom regions with high SNR and enhance those with low SNR by extracting\nregional structure information from events. Extensive experiments on our\ndataset and the synthetic SDSD dataset demonstrate our EvLight significantly\nsurpasses the frame-based methods. Code and datasets are available at\nhttps://vlislab22.github.io/eg-lowlight/.\n","authors":["Guoqiang Liang","Kanghao Chen","Hangyu Li","Yunfan Lu","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00834v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01438v1","updated":"2024-04-01T19:22:43Z","published":"2024-04-01T19:22:43Z","title":"Generation and Detection of Sign Language Deepfakes -- A Linguistic and\n Visual Analysis","summary":" A question in the realm of deepfakes is slowly emerging pertaining to whether\nwe can go beyond facial deepfakes and whether it would be beneficial to\nsociety. Therefore, this research presents a positive application of deepfake\ntechnology in upper body generation, while performing sign-language for the\nDeaf and Hard of Hearing (DHoH) community. 
The resulting videos are later\nvetted with a sign language expert. This is particularly helpful, given the\nintricate nature of sign language, a scarcity of sign language experts, and\npotential benefits for health and education. The objectives of this work\nencompass constructing a reliable deepfake dataset, evaluating its technical\nand visual credibility through computer vision and natural language processing\nmodels, and assessing the plausibility of the generated content. With over 1200\nvideos, featuring both previously seen and unseen individuals for the\ngeneration model, using the help of a sign language expert, we establish a\ndeepfake dataset in sign language that can further be utilized to detect fake\nvideos that may target certain people of determination.\n","authors":["Shahzeb Naeem","Muhammad Riyyan Khan","Usman Tariq","Abhinav Dhall","Carlos Ivan Colon","Hasan Al-Nashash"],"pdf_url":"https://arxiv.org/pdf/2404.01438v1.pdf","comment":"13 pages, 13 figures, Computer Vision and Image Understanding Journal"},{"id":"http://arxiv.org/abs/2404.03687v1","updated":"2024-04-01T20:44:28Z","published":"2024-04-01T20:44:28Z","title":"DRIVE: Dual Gradient-Based Rapid Iterative Pruning","summary":" Modern deep neural networks (DNNs) consist of millions of parameters,\nnecessitating high-performance computing during training and inference. Pruning\nis one solution that significantly reduces the space and time complexities of\nDNNs. Traditional pruning methods that are applied post-training focus on\nstreamlining inference, but there are recent efforts to leverage sparsity early\non by pruning before training. Pruning methods, such as iterative\nmagnitude-based pruning (IMP) achieve up to a 90% parameter reduction while\nretaining accuracy comparable to the original model. However, this leads to\nimpractical runtime as it relies on multiple train-prune-reset cycles to\nidentify and eliminate redundant parameters. In contrast, training agnostic\nearly pruning methods, such as SNIP and SynFlow offer fast pruning but fall\nshort of the accuracy achieved by IMP at high sparsities. To bridge this gap,\nwe present Dual Gradient-Based Rapid Iterative Pruning (DRIVE), which leverages\ndense training for initial epochs to counteract the randomness inherent at the\ninitialization. Subsequently, it employs a unique dual gradient-based metric\nfor parameter ranking. It has been experimentally demonstrated for VGG and\nResNet architectures on CIFAR-10/100 and Tiny ImageNet, and ResNet on ImageNet\nthat DRIVE consistently has superior performance over other training-agnostic\nearly pruning methods in accuracy. Notably, DRIVE is 43$\\times$ to 869$\\times$\nfaster than IMP for pruning.\n","authors":["Dhananjay Saikumar","Blesson Varghese"],"pdf_url":"https://arxiv.org/pdf/2404.03687v1.pdf","comment":null}]},"2024-03-31T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2312.01998v2","updated":"2024-03-31T22:58:09Z","published":"2023-12-04T16:22:06Z","title":"Language-only Efficient Training of Zero-shot Composed Image Retrieval","summary":" Composed image retrieval (CIR) task takes a composed query of image and text,\naiming to search relative images for both conditions. Conventional CIR\napproaches need a training dataset composed of triplets of query image, query\ntext, and target image, which is very expensive to collect. Several recent\nworks have worked on the zero-shot (ZS) CIR paradigm to tackle the issue\nwithout using pre-collected triplets. 
However, the existing ZS-CIR methods show\nlimited backbone scalability and generalizability due to the lack of diversity\nof the input texts during training. We propose a novel CIR framework, only\nusing language for its training. Our LinCIR (Language-only training for CIR)\ncan be trained only with text datasets by a novel self-supervision named\nself-masking projection (SMP). We project the text latent embedding to the\ntoken embedding space and construct a new text by replacing the keyword tokens\nof the original text. Then, we let the new and original texts have the same\nlatent embedding vector. With this simple strategy, LinCIR is surprisingly\nefficient and highly effective; LinCIR with CLIP ViT-G backbone is trained in\n48 minutes and shows the best ZS-CIR performances on four different CIR\nbenchmarks, CIRCO, GeneCIS, FashionIQ, and CIRR, even outperforming supervised\nmethod on FashionIQ. Code is available at https://github.com/navervision/lincir\n","authors":["Geonmo Gu","Sanghyuk Chun","Wonjae Kim","Yoohoon Kang","Sangdoo Yun"],"pdf_url":"https://arxiv.org/pdf/2312.01998v2.pdf","comment":"CVPR 2024 camera-ready; First two authors contributed equally; 17\n pages, 3.1MB"},{"id":"http://arxiv.org/abs/2311.13958v2","updated":"2024-03-31T22:39:12Z","published":"2023-11-23T12:16:33Z","title":"Handling The Non-Smooth Challenge in Tensor SVD: A Multi-Objective\n Tensor Recovery Framework","summary":" Recently, numerous tensor singular value decomposition (t-SVD)-based tensor\nrecovery methods have shown promise in processing visual data, such as color\nimages and videos. However, these methods often suffer from severe performance\ndegradation when confronted with tensor data exhibiting non-smooth changes. It\nhas been commonly observed in real-world scenarios but ignored by the\ntraditional t-SVD-based methods. In this work, we introduce a novel tensor\nrecovery model with a learnable tensor nuclear norm to address such a\nchallenge. We develop a new optimization algorithm named the Alternating\nProximal Multiplier Method (APMM) to iteratively solve the proposed tensor\ncompletion model. Theoretical analysis demonstrates the convergence of the\nproposed APMM to the Karush-Kuhn-Tucker (KKT) point of the optimization\nproblem. In addition, we propose a multi-objective tensor recovery framework\nbased on APMM to efficiently explore the correlations of tensor data across its\nvarious dimensions, providing a new perspective on extending the t-SVD-based\nmethod to higher-order tensor cases. Numerical experiments demonstrated the\neffectiveness of the proposed method in tensor completion.\n","authors":["Jingjing Zheng","Wanglong Lu","Wenzhe Wang","Yankai Cao","Xiaoqin Zhang","Xianta Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.13958v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18822v2","updated":"2024-03-31T21:11:59Z","published":"2023-11-30T18:58:17Z","title":"ElasticDiffusion: Training-free Arbitrary Size Image Generation through\n Global-Local Content Separation","summary":" Diffusion models have revolutionized image generation in recent years, yet\nthey are still limited to a few sizes and aspect ratios. We propose\nElasticDiffusion, a novel training-free decoding method that enables pretrained\ntext-to-image diffusion models to generate images with various sizes.\nElasticDiffusion attempts to decouple the generation trajectory of a pretrained\nmodel into local and global signals. 
The local signal controls low-level pixel\ninformation and can be estimated on local patches, while the global signal is\nused to maintain overall structural consistency and is estimated with a\nreference image. We test our method on CelebA-HQ (faces) and LAION-COCO\n(objects/indoor/outdoor scenes). Our experiments and qualitative results show\nsuperior image coherence quality across aspect ratios compared to\nMultiDiffusion and the standard decoding strategy of Stable Diffusion. Project\npage: https://elasticdiffusion.github.io/\n","authors":["Moayed Haji-Ali","Guha Balakrishnan","Vicente Ordonez"],"pdf_url":"https://arxiv.org/pdf/2311.18822v2.pdf","comment":"Accepted at CVPR 2024. Project Page:\n https://elasticdiffusion.github.io/"},{"id":"http://arxiv.org/abs/2309.15204v2","updated":"2024-03-31T20:59:03Z","published":"2023-09-26T19:05:18Z","title":"CLRmatchNet: Enhancing Curved Lane Detection with Deep Matching Process","summary":" Lane detection plays a crucial role in autonomous driving by providing vital\ndata to ensure safe navigation. Modern algorithms rely on anchor-based\ndetectors, which are then followed by a label-assignment process to categorize\ntraining detections as positive or negative instances based on learned\ngeometric attributes. Accurate label assignment has great impact on the model\nperformance, that is usually relying on a pre-defined classical cost function\nevaluating GT-prediction alignment. However, classical label assignment methods\nface limitations due to their reliance on predefined cost functions derived\nfrom low-dimensional models, potentially impacting their optimality. Our\nresearch introduces MatchNet, a deep learning submodule-based approach aimed at\nimproving the label assignment process. Integrated into a state-of-the-art lane\ndetection network such as the Cross Layer Refinement Network for Lane Detection\n(CLRNet), MatchNet replaces the conventional label assignment process with a\nsubmodule network. The integrated model, CLRmatchNet, surpasses CLRNet, showing\nsubstantial improvements in scenarios involving curved lanes, with remarkable\nimprovement across all backbones of +2.8% for ResNet34, +2.3% for ResNet101,\nand +2.96% for DLA34. In addition, it maintains or even improves comparable\nresults in other sections. Our method boosts the confidence level in lane\ndetection, allowing an increase in the confidence threshold. Our code is\navailable at: https://github.com/sapirkontente/CLRmatchNet.git\n","authors":["Sapir Kontente","Roy Orfaig","Ben-Zion Bobrovsky"],"pdf_url":"https://arxiv.org/pdf/2309.15204v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13716v2","updated":"2024-03-31T19:23:55Z","published":"2023-11-22T22:20:10Z","title":"DiverseNet: Decision Diversified Semi-supervised Semantic Segmentation\n Networks for Remote Sensing Imagery","summary":" Semi-supervised learning aims to help reduce the cost of the manual labelling\nprocess by leveraging valuable features extracted from a substantial pool of\nunlabeled data alongside a limited set of labelled data during the training\nphase. Since pixel-level manual labelling in large-scale remote sensing imagery\nis expensive, semi-supervised learning becomes an appropriate solution to this.\nHowever, most of the existing consistency learning frameworks based on network\nperturbation are very bulky. There is still a lack of lightweight and efficient\nperturbation methods to promote the diversity of features and the precision of\npseudo labels during training. 
In order to fill this gap, we propose DiverseNet\nwhich explores multi-head and multi-model semi-supervised learning algorithms\nby simultaneously enhancing precision and diversity during training. The two\nproposed methods in the DiverseNet family, namely DiverseHead and DiverseModel,\nboth achieve the better semantic segmentation performance in four widely\nutilised remote sensing imagery data sets compared to state-of-the-art\nsemi-supervised learning methods. Meanwhile, the proposed DiverseHead\narchitecture is simple and relatively lightweight in terms of parameter space\ncompared to the state-of-the-art methods whilst reaching high-performance\nresults for all the tested data sets.\n","authors":["Wanli Ma","Oktay Karakus","Paul L. Rosin"],"pdf_url":"https://arxiv.org/pdf/2311.13716v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08409v2","updated":"2024-03-31T19:01:07Z","published":"2024-01-16T14:49:26Z","title":"Faster ISNet for Background Bias Mitigation on Deep Neural Networks","summary":" Bias or spurious correlations in image backgrounds can impact neural\nnetworks, causing shortcut learning (Clever Hans Effect) and hampering\ngeneralization to real-world data. ISNet, a recently introduced architecture,\nproposed the optimization of Layer-Wise Relevance Propagation (LRP, an\nexplanation technique) heatmaps, to mitigate the influence of backgrounds on\ndeep classifiers. However, ISNet's training time scales linearly with the\nnumber of classes in an application. Here, we propose reformulated\narchitectures whose training time becomes independent from this number.\nAdditionally, we introduce a concise and model-agnostic LRP implementation. We\nchallenge the proposed architectures using synthetic background bias, and\nCOVID-19 detection in chest X-rays, an application that commonly presents\nbackground bias. The networks hindered background attention and shortcut\nlearning, surpassing multiple state-of-the-art models on out-of-distribution\ntest datasets. Representing a potentially massive training speed improvement\nover ISNet, the proposed architectures introduce LRP optimization into a gamut\nof applications that the original model cannot feasibly handle.\n","authors":["Pedro R. A. S. Bassi","Sergio Decherchi","Andrea Cavalli"],"pdf_url":"https://arxiv.org/pdf/2401.08409v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10671v2","updated":"2024-03-31T18:37:10Z","published":"2023-12-17T10:07:03Z","title":"Open3DIS: Open-Vocabulary 3D Instance Segmentation with 2D Mask Guidance","summary":" We introduce Open3DIS, a novel solution designed to tackle the problem of\nOpen-Vocabulary Instance Segmentation within 3D scenes. Objects within 3D\nenvironments exhibit diverse shapes, scales, and colors, making precise\ninstance-level identification a challenging task. Recent advancements in\nOpen-Vocabulary scene understanding have made significant strides in this area\nby employing class-agnostic 3D instance proposal networks for object\nlocalization and learning queryable features for each 3D mask. While these\nmethods produce high-quality instance proposals, they struggle with identifying\nsmall-scale and geometrically ambiguous objects. The key idea of our method is\na new module that aggregates 2D instance masks across frames and maps them to\ngeometrically coherent point cloud regions as high-quality object proposals\naddressing the above limitations. These are then combined with 3D\nclass-agnostic instance proposals to include a wide range of objects in the\nreal world. 
To validate our approach, we conducted experiments on three\nprominent datasets, including ScanNet200, S3DIS, and Replica, demonstrating\nsignificant performance gains in segmenting objects with diverse categories\nover the state-of-the-art approaches.\n","authors":["Phuc D. A. Nguyen","Tuan Duc Ngo","Chuang Gan","Evangelos Kalogerakis","Anh Tran","Cuong Pham","Khoi Nguyen"],"pdf_url":"https://arxiv.org/pdf/2312.10671v2.pdf","comment":"CVPR 2024. Project page: https://open3dis.github.io/"},{"id":"http://arxiv.org/abs/2312.02142v4","updated":"2024-03-31T18:11:18Z","published":"2023-12-04T18:58:40Z","title":"Object Recognition as Next Token Prediction","summary":" We present an approach to pose object recognition as next token prediction.\nThe idea is to apply a language decoder that auto-regressively predicts the\ntext tokens from image embeddings to form labels. To ground this prediction\nprocess in auto-regression, we customize a non-causal attention mask for the\ndecoder, incorporating two key features: modeling tokens from different labels\nto be independent, and treating image tokens as a prefix. This masking\nmechanism inspires an efficient method - one-shot sampling - to simultaneously\nsample tokens of multiple labels in parallel and rank generated labels by their\nprobabilities during inference. To further enhance the efficiency, we propose a\nsimple strategy to construct a compact decoder by simply discarding the\nintermediate blocks of a pretrained language model. This approach yields a\ndecoder that matches the full model's performance while being notably more\nefficient. The code is available at https://github.com/kaiyuyue/nxtp\n","authors":["Kaiyu Yue","Bor-Chun Chen","Jonas Geiping","Hengduo Li","Tom Goldstein","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2312.02142v4.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2402.12289v3","updated":"2024-03-31T17:08:00Z","published":"2024-02-19T17:04:04Z","title":"DriveVLM: The Convergence of Autonomous Driving and Large\n Vision-Language Models","summary":" A primary hurdle of autonomous driving in urban environments is understanding\ncomplex and long-tail scenarios, such as challenging road conditions and\ndelicate human behaviors. We introduce DriveVLM, an autonomous driving system\nleveraging Vision-Language Models (VLMs) for enhanced scene understanding and\nplanning capabilities. DriveVLM integrates a unique combination of\nchain-of-thought (CoT) modules for scene description, scene analysis, and\nhierarchical planning. Furthermore, recognizing the limitations of VLMs in\nspatial reasoning and heavy computational requirements, we propose\nDriveVLM-Dual, a hybrid system that synergizes the strengths of DriveVLM with\nthe traditional autonomous driving pipeline. DriveVLM-Dual achieves robust\nspatial understanding and real-time inference speed. 
Extensive experiments on\nboth the nuScenes dataset and our SUP-AD dataset demonstrate the effectiveness\nof DriveVLM and the enhanced performance of DriveVLM-Dual, surpassing existing\nmethods in complex and unpredictable driving conditions.\n","authors":["Xiaoyu Tian","Junru Gu","Bailin Li","Yicheng Liu","Chenxu Hu","Yang Wang","Kun Zhan","Peng Jia","Xianpeng Lang","Hang Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.12289v3.pdf","comment":"Project Page: https://tsinghua-mars-lab.github.io/DriveVLM/"},{"id":"http://arxiv.org/abs/2311.14218v2","updated":"2024-03-31T17:05:15Z","published":"2023-11-23T22:27:31Z","title":"A New Benchmark and Model for Challenging Image Manipulation Detection","summary":" The ability to detect manipulation in multimedia data is vital in digital\nforensics. Existing Image Manipulation Detection (IMD) methods are mainly based\non detecting anomalous features arising from image editing or double compression\nartifacts. All existing IMD techniques encounter challenges when it comes to\ndetecting small tampered regions from a large image. Moreover,\ncompression-based IMD approaches face difficulties in cases of double\ncompression of identical quality factors. To investigate the State-of-The-Art\n(SoTA) IMD methods in those challenging conditions, we introduce a new\nChallenging Image Manipulation Detection (CIMD) benchmark dataset, which\nconsists of two subsets, for evaluating editing-based and compression-based IMD\nmethods, respectively. The dataset images were manually taken and tampered with\nhigh-quality annotations. In addition, we propose a new two-branch network\nmodel based on HRNet that can better detect both the image-editing and\ncompression artifacts in those challenging conditions. Extensive experiments on\nthe CIMD benchmark show that our model significantly outperforms SoTA IMD\nmethods on CIMD.\n","authors":["Zhenfei Zhang","Mingyang Li","Ming-Ching Chang"],"pdf_url":"https://arxiv.org/pdf/2311.14218v2.pdf","comment":"9 pages, 6 figures, 3 tables. AAAI-24"},{"id":"http://arxiv.org/abs/2309.11281v3","updated":"2024-03-31T16:59:45Z","published":"2023-09-20T13:05:42Z","title":"Language-driven Object Fusion into Neural Radiance Fields with\n Pose-Conditioned Dataset Updates","summary":" Neural radiance field is an emerging rendering method that generates\nhigh-quality multi-view consistent images from a neural scene representation\nand volume rendering. Although neural radiance field-based techniques are\nrobust for scene reconstruction, their ability to add or remove objects remains\nlimited. This paper proposes a new language-driven approach for object\nmanipulation with neural radiance fields through dataset updates. Specifically,\nto insert a new foreground object represented by a set of multi-view images\ninto a background radiance field, we use a text-to-image diffusion model to\nlearn and generate combined images that fuse the object of interest into the\ngiven background across views. These combined images are then used for refining\nthe background radiance field so that we can render view-consistent images\ncontaining both the object and the background. To ensure view consistency, we\npropose a dataset updates strategy that prioritizes radiance field training\nwith camera views close to the already-trained views prior to propagating the\ntraining to remaining views. We show that under the same dataset updates\nstrategy, we can easily adapt our method for object insertion using data from\ntext-to-3D models as well as object removal. 
Experimental results show that our\nmethod generates photorealistic images of the edited scenes, and outperforms\nstate-of-the-art methods in 3D reconstruction and neural radiance field\nblending.\n","authors":["Ka Chun Shum","Jaeyeon Kim","Binh-Son Hua","Duc Thanh Nguyen","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2309.11281v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2402.19276v4","updated":"2024-03-31T15:19:30Z","published":"2024-02-29T15:44:00Z","title":"Modular Blind Video Quality Assessment","summary":" Blind video quality assessment (BVQA) plays a pivotal role in evaluating and\nimproving the viewing experience of end-users across a wide range of\nvideo-based platforms and services. Contemporary deep learning-based models\nprimarily analyze video content in its aggressively subsampled format, while\nbeing blind to the impact of the actual spatial resolution and frame rate on\nvideo quality. In this paper, we propose a modular BVQA model and a method of\ntraining it to improve its modularity. Our model comprises a base quality\npredictor, a spatial rectifier, and a temporal rectifier, responding to the\nvisual content and distortion, spatial resolution, and frame rate changes on\nvideo quality, respectively. During training, spatial and temporal rectifiers\nare dropped out with some probabilities to render the base quality predictor a\nstandalone BVQA model, which should work better with the rectifiers. Extensive\nexperiments on both professionally-generated content and user-generated content\nvideo databases show that our quality model achieves superior or comparable\nperformance to current methods. Additionally, the modularity of our model\noffers an opportunity to analyze existing video quality databases in terms of\ntheir spatial and temporal complexity.\n","authors":["Wen Wen","Mu Li","Yabin Zhang","Yiting Liao","Junlin Li","Li Zhang","Kede Ma"],"pdf_url":"https://arxiv.org/pdf/2402.19276v4.pdf","comment":"Accepted by CVPR 2024; Camera-ready version"},{"id":"http://arxiv.org/abs/2303.06797v2","updated":"2024-03-31T14:35:18Z","published":"2023-03-13T01:07:32Z","title":"Multi-Channel Orthogonal Transform-Based Perceptron Layers for Efficient\n ResNets","summary":" In this paper, we propose a set of transform-based neural network layers as\nan alternative to the $3\\times3$ Conv2D layers in Convolutional Neural Networks\n(CNNs). The proposed layers can be implemented based on orthogonal transforms\nsuch as the Discrete Cosine Transform (DCT), Hadamard transform (HT), and\nbiorthogonal Block Wavelet Transform (BWT). Furthermore, by taking advantage of\nthe convolution theorems, convolutional filtering operations are performed in\nthe transform domain using element-wise multiplications. Trainable\nsoft-thresholding layers, that remove noise in the transform domain, bring\nnonlinearity to the transform domain layers. Compared to the Conv2D layer,\nwhich is spatial-agnostic and channel-specific, the proposed layers are\nlocation-specific and channel-specific. 
Moreover, these proposed layers reduce\nthe number of parameters and multiplications significantly while improving the\naccuracy results of regular ResNets on the ImageNet-1K classification task.\nFurthermore, they can be inserted with a batch normalization layer before the\nglobal average pooling layer in the conventional ResNets as an additional layer\nto improve classification accuracy.\n","authors":["Hongyi Pan","Emadeldeen Hamdan","Xin Zhu","Salih Atici","Ahmet Enis Cetin"],"pdf_url":"https://arxiv.org/pdf/2303.06797v2.pdf","comment":"This work is accepted to IEEE Transactions on Neural Networks and\n Learning Systems. The initial title is \"Orthogonal Transform Domain\n Approaches for the Convolutional Layer\". We changed it to \"Multi-Channel\n Orthogonal Transform-Based Perceptron Layers for Efficient ResNets\" based on\n reviewer's comment. arXiv admin note: text overlap with arXiv:2211.08577"},{"id":"http://arxiv.org/abs/2403.14119v3","updated":"2024-03-31T13:36:54Z","published":"2024-03-21T04:08:29Z","title":"C-TPT: Calibrated Test-Time Prompt Tuning for Vision-Language Models via\n Text Feature Dispersion","summary":" In deep learning, test-time adaptation has gained attention as a method for\nmodel fine-tuning without the need for labeled data. A prime exemplification is\nthe recently proposed test-time prompt tuning for large-scale vision-language\nmodels such as CLIP. Unfortunately, these prompts have been mainly developed to\nimprove accuracy, overlooking the importance of calibration, which is a crucial\naspect for quantifying prediction uncertainty. However, traditional calibration\nmethods rely on substantial amounts of labeled data, making them impractical\nfor test-time scenarios. To this end, this paper explores calibration during\ntest-time prompt tuning by leveraging the inherent properties of CLIP. Through\na series of observations, we find that the prompt choice significantly affects\nthe calibration in CLIP, where the prompts leading to higher text feature\ndispersion result in better-calibrated predictions. Introducing the Average\nText Feature Dispersion (ATFD), we establish its relationship with calibration\nerror and present a novel method, Calibrated Test-time Prompt Tuning (C-TPT),\nfor optimizing prompts during test-time with enhanced calibration. Through\nextensive experiments on different CLIP architectures and datasets, we show\nthat C-TPT can effectively improve the calibration of test-time prompt tuning\nwithout needing labeled data. The code is publicly accessible at\nhttps://github.com/hee-suk-yoon/C-TPT.\n","authors":["Hee Suk Yoon","Eunseop Yoon","Joshua Tian Jin Tee","Mark Hasegawa-Johnson","Yingzhen Li","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2403.14119v3.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.19456v2","updated":"2024-03-31T13:26:11Z","published":"2024-03-28T14:27:36Z","title":"Break-for-Make: Modular Low-Rank Adaptations for Composable\n Content-Style Customization","summary":" Personalized generation paradigms empower designers to customize visual\nintellectual properties with the help of textual descriptions by tuning or\nadapting pre-trained text-to-image models on a few images. Recent works explore\napproaches for concurrently customizing both content and detailed visual style\nappearance. However, these existing approaches often generate images where the\ncontent and style are entangled. 
In this study, we reconsider the customization\nof content and style concepts from the perspective of parameter space\nconstruction. Unlike existing methods that utilize a shared parameter space for\ncontent and style, we propose a learning framework that separates the parameter\nspace to facilitate individual learning of content and style, thereby enabling\ndisentangled content and style. To achieve this goal, we introduce \"partly\nlearnable projection\" (PLP) matrices to separate the original adapters into\ndivided sub-parameter spaces. We propose \"break-for-make\" customization\nlearning pipeline based on PLP, which is simple yet effective. We break the\noriginal adapters into \"up projection\" and \"down projection\", train content and\nstyle PLPs individually with the guidance of corresponding textual prompts in\nthe separate adapters, and maintain generalization by employing a\nmulti-correspondence projection learning strategy. Based on the adapters broken\napart for separate training content and style, we then make the entity\nparameter space by reconstructing the content and style PLPs matrices, followed\nby fine-tuning the combined adapter to generate the target object with the\ndesired appearance. Experiments on various styles, including textures,\nmaterials, and artistic style, show that our method outperforms\nstate-of-the-art single/multiple concept learning pipelines in terms of\ncontent-style-prompt alignment.\n","authors":["Yu Xu","Fan Tang","Juan Cao","Yuxin Zhang","Oliver Deussen","Weiming Dong","Jintao Li","Tong-Yee Lee"],"pdf_url":"https://arxiv.org/pdf/2403.19456v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18254v2","updated":"2024-03-31T13:13:37Z","published":"2023-11-30T05:05:38Z","title":"Sketch Input Method Editor: A Comprehensive Dataset and Methodology for\n Systematic Input Recognition","summary":" With the recent surge in the use of touchscreen devices, free-hand sketching\nhas emerged as a promising modality for human-computer interaction. While\nprevious research has focused on tasks such as recognition, retrieval, and\ngeneration of familiar everyday objects, this study aims to create a Sketch\nInput Method Editor (SketchIME) specifically designed for a professional C4I\nsystem. Within this system, sketches are utilized as low-fidelity prototypes\nfor recommending standardized symbols in the creation of comprehensive\nsituation maps. This paper also presents a systematic dataset comprising 374\nspecialized sketch types, and proposes a simultaneous recognition and\nsegmentation architecture with multilevel supervision between recognition and\nsegmentation to improve performance and enhance interpretability. By\nincorporating few-shot domain adaptation and class-incremental learning, the\nnetwork's ability to adapt to new users and extend to new task-specific classes\nis significantly enhanced. Results from experiments conducted on both the\nproposed dataset and the SPG dataset illustrate the superior performance of the\nproposed architecture. 
Our dataset and code are publicly available at\nhttps://github.com/GuangmingZhu/SketchIME.\n","authors":["Guangming Zhu","Siyuan Wang","Qing Cheng","Kelong Wu","Hao Li","Liang Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.18254v2.pdf","comment":"The paper has been accepted by ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.08443v2","updated":"2024-03-31T12:39:48Z","published":"2023-08-16T15:51:05Z","title":"High-Fidelity Lake Extraction via Two-Stage Prompt Enhancement:\n Establishing a Novel Baseline and Benchmark","summary":" Lake extraction from remote sensing imagery is a complex challenge due to the\nvaried lake shapes and data noise. Current methods rely on multispectral image\ndatasets, making it challenging to learn lake features accurately from pixel\narrangements. This, in turn, affects model learning and the creation of\naccurate segmentation masks. This paper introduces a prompt-based dataset\nconstruction approach that provides approximate lake locations using point,\nbox, and mask prompts. We also propose a two-stage prompt enhancement\nframework, LEPrompter, with prompt-based and prompt-free stages during\ntraining. The prompt-based stage employs a prompt encoder to extract prior\ninformation, integrating prompt tokens and image embedding through self- and\ncross-attention in the prompt decoder. Prompts are deactivated to ensure\nindependence during inference, enabling automated lake extraction without\nintroducing additional parameters and GFlops. Extensive experiments showcase\nperformance improvements of our proposed approach compared to the previous\nstate-of-the-art method. The source code is available at\nhttps://github.com/BastianChen/LEPrompter.\n","authors":["Ben Chen","Xuechao Zou","Kai Li","Yu Zhang","Junliang Xing","Pin Tao"],"pdf_url":"https://arxiv.org/pdf/2308.08443v2.pdf","comment":"Accepted by ICME 2024"},{"id":"http://arxiv.org/abs/2308.01813v2","updated":"2024-03-31T12:27:16Z","published":"2023-08-03T15:21:08Z","title":"Deep Neural Networks Fused with Textures for Image Classification","summary":" Fine-grained image classification (FGIC) is a challenging task in computer\nvision due to small visual differences among inter-subcategories, but\nlarge intra-class variations. Deep learning methods have achieved remarkable\nsuccess in solving FGIC. In this paper, we propose a fusion approach to address\nFGIC by combining global texture with local patch-based information. The first\npipeline extracts deep features from various fixed-size non-overlapping patches\nand encodes features by sequential modelling using the long short-term memory\n(LSTM). Another path computes image-level textures at multiple scales using the\nlocal binary patterns (LBP). The advantages of both streams are integrated to\nrepresent an efficient feature vector for image classification. The method is\ntested on eight datasets representing the human faces, skin lesions, food\ndishes, marine lives, etc. using four standard backbone CNNs. 
Our method has\nattained better classification accuracy over existing methods with notable\nmargins.\n","authors":["Asish Bera","Debotosh Bhattacharjee","Mita Nasipuri"],"pdf_url":"https://arxiv.org/pdf/2308.01813v2.pdf","comment":"14 pages, 6 figures, 4 tables, conference"},{"id":"http://arxiv.org/abs/2402.12677v2","updated":"2024-03-31T12:18:51Z","published":"2024-02-20T02:54:03Z","title":"Object-level Geometric Structure Preserving for Natural Image Stitching","summary":" The topic of stitching images with globally natural structures holds\nparamount significance. Current methodologies exhibit the ability to preserve\nlocal geometric structures, yet fall short in maintaining relationships between\nthese geometric structures. In this paper, we endeavor to safeguard the\noverall, OBJect-level structures within images based on Global Similarity\nPrior, while concurrently mitigating distortion and ghosting artifacts with\nOBJ-GSP. Our approach leverages the Segment Anything Model to extract geometric\nstructures with semantic information, enhancing the algorithm's ability to\npreserve objects in a manner that aligns more intuitively with human\nperception. We seek to identify spatial constraints that govern the\nrelationships between various geometric boundaries. Recognizing that multiple\ngeometric boundaries collectively define complete objects, we employ triangular\nmeshes to safeguard not only individual geometric structures but also the\noverall shapes of objects within the images. Empirical evaluations across\nmultiple image stitching datasets demonstrate that our method establishes a new\nstate-of-the-art benchmark in image stitching. Our implementation and dataset\nare publicly available at https://github.com/RussRobin/OBJ-GSP .\n","authors":["Wenxiao Cai","Wankou Yang"],"pdf_url":"https://arxiv.org/pdf/2402.12677v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04192v4","updated":"2024-03-31T12:10:24Z","published":"2023-07-09T14:54:30Z","title":"Self-Adaptive Sampling for Efficient Video Question-Answering on\n Image--Text Models","summary":" Video question-answering is a fundamental task in the field of video\nunderstanding. Although current vision--language models (VLMs) equipped with\nVideo Transformers have enabled temporal modeling and yielded superior results,\nthey come at the cost of huge computational power and are thus too expensive to\ndeploy in real-time application scenarios. An economical workaround only\nsamples a small portion of frames to represent the main content of that video\nand tunes an image--text model on these sampled frames. Recent video\nunderstanding models usually randomly sample a set of frames or clips,\nregardless of internal correlations between their visual contents or their\nrelevance to the problem. We argue that such kinds of aimless sampling may omit\nthe key frames from which the correct answer can be deduced, and the situation\ngets worse when the sampling sparsity increases, which always happens as the\nvideo lengths increase. To mitigate this issue, we propose two frame sampling\nstrategies, namely the most domain frames (MDF) and most implied frames (MIF),\nto maximally preserve those frames that are most likely vital to the given\nquestions. MDF passively minimizes the risk of key frame omission in a\nbootstrap manner, while MIF actively searches key frames customized for each\nvideo--question pair with the assistance of auxiliary models. 
The experimental\nresults on three public datasets from three advanced VLMs (CLIP, GIT and\nAll-in-one) demonstrate that our proposed strategies can boost the performance\nfor image-text pretrained models. The source codes pertaining to the method\nproposed in this paper are publicly available at\nhttps://github.com/declare-lab/sas-vqa.\n","authors":["Wei Han","Hui Chen","Min-Yen Kan","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2307.04192v4.pdf","comment":"13 pages, 7 figures, accepted to Findings of NAACL 2024"},{"id":"http://arxiv.org/abs/2311.12588v2","updated":"2024-03-31T12:06:55Z","published":"2023-11-21T13:21:22Z","title":"HiPose: Hierarchical Binary Surface Encoding and Correspondence Pruning\n for RGB-D 6DoF Object Pose Estimation","summary":" In this work, we present a novel dense-correspondence method for 6DoF object\npose estimation from a single RGB-D image. While many existing data-driven\nmethods achieve impressive performance, they tend to be time-consuming due to\ntheir reliance on rendering-based refinement approaches. To circumvent this\nlimitation, we present HiPose, which establishes 3D-3D correspondences in a\ncoarse-to-fine manner with a hierarchical binary surface encoding. Unlike\nprevious dense-correspondence methods, we estimate the correspondence surface\nby employing point-to-surface matching and iteratively constricting the surface\nuntil it becomes a correspondence point while gradually removing outliers.\nExtensive experiments on public benchmarks LM-O, YCB-V, and T-Less demonstrate\nthat our method surpasses all refinement-free methods and is even on par with\nexpensive refinement-based approaches. Crucially, our approach is\ncomputationally efficient and enables real-time critical applications with high\naccuracy requirements.\n","authors":["Yongliang Lin","Yongzhi Su","Praveen Nathan","Sandeep Inuganti","Yan Di","Martin Sundermeyer","Fabian Manhardt","Didier Stricke","Jason Rambach","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12588v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2302.02314v4","updated":"2024-03-31T11:58:28Z","published":"2023-02-05T06:27:45Z","title":"CECT: Controllable Ensemble CNN and Transformer for COVID-19 Image\n Classification","summary":" The COVID-19 pandemic has resulted in hundreds of million cases and numerous\ndeaths worldwide. Here, we develop a novel classification network CECT by\ncontrollable ensemble convolutional neural network and transformer to provide a\ntimely and accurate COVID-19 diagnosis. The CECT is composed of a parallel\nconvolutional encoder block, an aggregate transposed-convolutional decoder\nblock, and a windowed attention classification block. Each block captures\nfeatures at different scales from 28 $\\times$ 28 to 224 $\\times$ 224 from the\ninput, composing enriched and comprehensive information. Different from\nexisting methods, our CECT can capture features at both multi-local and global\nscales without any sophisticated module design. Moreover, the contribution of\nlocal features at different scales can be controlled with the proposed ensemble\ncoefficients. We evaluate CECT on two public COVID-19 datasets and it reaches\nthe highest accuracy of 98.1% in the intra-dataset evaluation, outperforming\nexisting state-of-the-art methods. Moreover, the developed CECT achieves an\naccuracy of 90.9% on the unseen dataset in the inter-dataset evaluation,\nshowing extraordinary generalization ability. 
With remarkable feature capture\nability and generalization ability, we believe CECT can be extended to other\nmedical scenarios as a powerful diagnosis tool. Code is available at\nhttps://github.com/NUS-Tim/CECT.\n","authors":["Zhaoshan Liu","Lei Shen"],"pdf_url":"https://arxiv.org/pdf/2302.02314v4.pdf","comment":"Computers in Biology and Medicine Accepted"},{"id":"http://arxiv.org/abs/2303.11797v2","updated":"2024-03-31T11:53:55Z","published":"2023-03-21T12:28:21Z","title":"CAT-Seg: Cost Aggregation for Open-Vocabulary Semantic Segmentation","summary":" Open-vocabulary semantic segmentation presents the challenge of labeling each\npixel within an image based on a wide range of text descriptions. In this work,\nwe introduce a novel cost-based approach to adapt vision-language foundation\nmodels, notably CLIP, for the intricate task of semantic segmentation. Through\naggregating the cosine similarity score, i.e., the cost volume between image\nand text embeddings, our method potently adapts CLIP for segmenting seen and\nunseen classes by fine-tuning its encoders, addressing the challenges faced by\nexisting methods in handling unseen classes. Building upon this, we explore\nmethods to effectively aggregate the cost volume considering its multi-modal\nnature of being established between image and text embeddings. Furthermore, we\nexamine various methods for efficiently fine-tuning CLIP.\n","authors":["Seokju Cho","Heeseong Shin","Sunghwan Hong","Anurag Arnab","Paul Hongsuck Seo","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2303.11797v2.pdf","comment":"Accepted to CVPR 2024. Project page:\n https://ku-cvlab.github.io/CAT-Seg/"},{"id":"http://arxiv.org/abs/2310.14159v3","updated":"2024-03-31T10:51:06Z","published":"2023-10-22T03:01:38Z","title":"Can Language Models Laugh at YouTube Short-form Videos?","summary":" As short-form funny videos on social networks are gaining popularity, it\nbecomes demanding for AI models to understand them for better communication\nwith humans. Unfortunately, previous video humor datasets target specific\ndomains, such as speeches or sitcoms, and mostly focus on verbal cues. We\ncurate a user-generated dataset of 10K multimodal funny videos from YouTube,\ncalled ExFunTube. Using a video filtering pipeline with GPT-3.5, we verify both\nverbal and visual elements contributing to humor. After filtering, we annotate\neach video with timestamps and text explanations for funny moments. Our\nExFunTube is unique over existing datasets in that our videos cover a wide\nrange of domains with various types of humor that necessitate a multimodal\nunderstanding of the content. Also, we develop a zero-shot video-to-text\nprompting to maximize video humor understanding of large language models\n(LLMs). With three different evaluation methods using automatic scores,\nrationale quality experiments, and human evaluations, we show that our\nprompting significantly improves LLMs' ability for humor explanation.\n","authors":["Dayoon Ko","Sangho Lee","Gunhee Kim"],"pdf_url":"https://arxiv.org/pdf/2310.14159v3.pdf","comment":"EMNLP 2023; references added"},{"id":"http://arxiv.org/abs/2312.04553v2","updated":"2024-03-31T10:27:03Z","published":"2023-12-07T18:59:21Z","title":"SPIDeRS: Structured Polarization for Invisible Depth and Reflectance\n Sensing","summary":" Can we capture shape and reflectance in stealth? Such capability would be\nvaluable for many application domains in vision, xR, robotics, and HCI. 
We\nintroduce structured polarization for invisible depth and reflectance sensing\n(SPIDeRS), the first depth and reflectance sensing method using patterns of\npolarized light. The key idea is to modulate the angle of linear polarization\n(AoLP) of projected light at each pixel. The use of polarization makes it\ninvisible and lets us recover not only depth but also directly surface normals\nand even reflectance. We implement SPIDeRS with a liquid crystal spatial light\nmodulator (SLM) and a polarimetric camera. We derive a novel method for\nrobustly extracting the projected structured polarization pattern from the\npolarimetric object appearance. We evaluate the effectiveness of SPIDeRS by\napplying it to a number of real-world objects. The results show that our method\nsuccessfully reconstructs object shapes of various materials and is robust to\ndiffuse reflection and ambient light. We also demonstrate relighting using\nrecovered surface normals and reflectance. We believe SPIDeRS opens a new\navenue of polarization use in visual sensing.\n","authors":["Tomoki Ichikawa","Shohei Nobuhara","Ko Nishino"],"pdf_url":"https://arxiv.org/pdf/2312.04553v2.pdf","comment":"to be published in CVPR 2024"},{"id":"http://arxiv.org/abs/2312.01196v2","updated":"2024-03-31T10:20:37Z","published":"2023-12-02T18:06:24Z","title":"Neural Parametric Gaussians for Monocular Non-Rigid Object\n Reconstruction","summary":" Reconstructing dynamic objects from monocular videos is a severely\nunderconstrained and challenging problem, and recent work has approached it in\nvarious directions. However, owing to the ill-posed nature of this problem,\nthere has been no solution that can provide consistent, high-quality novel\nviews from camera positions that are significantly different from the training\nviews. In this work, we introduce Neural Parametric Gaussians (NPGs) to take on\nthis challenge by imposing a two-stage approach: first, we fit a low-rank\nneural deformation model, which then is used as regularization for non-rigid\nreconstruction in the second stage. The first stage learns the object's\ndeformations such that it preserves consistency in novel views. The second\nstage obtains high reconstruction quality by optimizing 3D Gaussians that are\ndriven by the coarse model. To this end, we introduce a local 3D Gaussian\nrepresentation, where temporally shared Gaussians are anchored in and deformed\nby local oriented volumes. The resulting combined model can be rendered as\nradiance fields, resulting in high-quality photo-realistic reconstructions of\nthe non-rigidly deforming objects. We demonstrate that NPGs achieve superior\nresults compared to previous works, especially in challenging scenarios with\nfew multi-view cues.\n","authors":["Devikalyan Das","Christopher Wewer","Raza Yunus","Eddy Ilg","Jan Eric Lenssen"],"pdf_url":"https://arxiv.org/pdf/2312.01196v2.pdf","comment":"Accepted at CVPR 2024 | Project Website:\n https://geometric-rl.mpi-inf.mpg.de/npg"},{"id":"http://arxiv.org/abs/2302.10306v2","updated":"2024-03-31T10:01:44Z","published":"2023-01-25T11:00:32Z","title":"Deep Convolutional Framelet Denoising for Panoramic by Mixed Wavelet\n Integration","summary":" Enhancing quality and removing noise during preprocessing is one of the most\ncritical steps in image processing. X-ray images are created by photons\ncolliding with atoms and the variation in scattered noise absorption. 
This\nnoise leads to a deterioration in the graph's medical quality and, at times,\nresults in repetition, thereby increasing the patient's effective dose. One of\nthe most critical challenges in this area has consistently been lowering the\nimage noise. Techniques like BM3d, low-pass filters, and Autoencoder have taken\nthis step. Owing to their structural design and high rate of repetition, neural\nnetworks employing diverse architectures have, over the past decade, achieved\nnoise reduction with satisfactory outcomes, surpassing the traditional BM3D and\nlow-pass filters. The combination of the Hankel matrix with neural networks\nrepresents one of these configurations. The Hankel matrix aims to identify a\nlocal circle by separating individual values into local and non-local\ncomponents, utilizing a non-local matrix. A non-local matrix can be created\nusing the wave or DCT. This paper suggests integrating the waveform with the\nDaubechies (D4) wavelet due to its higher energy concentration and employs the\nu-Net neural network architecture, which incorporates the waveform exclusively\nat each stage. The outcomes were evaluated using the PSNR and SSIM criteria,\nand the outcomes were verified by using various waves. The effectiveness of a\none-wave network has increased from 0.5% to 1.2%, according to studies done on\nother datasets.\n","authors":["Masoud Shahraki Mohammadi","Seyed Javad Seyed Mahdavi Chabok"],"pdf_url":"https://arxiv.org/pdf/2302.10306v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08255v3","updated":"2024-03-31T09:33:50Z","published":"2023-12-13T16:18:40Z","title":"OCTDL: Optical Coherence Tomography Dataset for Image-Based Deep\n Learning Methods","summary":" Optical coherence tomography (OCT) is a non-invasive imaging technique with\nextensive clinical applications in ophthalmology. OCT enables the visualization\nof the retinal layers, playing a vital role in the early detection and\nmonitoring of retinal diseases. OCT uses the principle of light wave\ninterference to create detailed images of the retinal microstructures, making\nit a valuable tool for diagnosing ocular conditions. This work presents an\nopen-access OCT dataset (OCTDL) comprising over 2000 OCT images labeled\naccording to disease group and retinal pathology. The dataset consists of OCT\nrecords of patients with Age-related Macular Degeneration (AMD), Diabetic\nMacular Edema (DME), Epiretinal Membrane (ERM), Retinal Artery Occlusion (RAO),\nRetinal Vein Occlusion (RVO), and Vitreomacular Interface Disease (VID). The\nimages were acquired with an Optovue Avanti RTVue XR using raster scanning\nprotocols with dynamic scan length and image resolution. Each retinal b-scan\nwas acquired by centering on the fovea and interpreted and cataloged by an\nexperienced retinal specialist. 
In this work, we applied Deep Learning\nclassification techniques to this new open-access dataset.\n","authors":["Mikhail Kulyabin","Aleksei Zhdanov","Anastasia Nikiforova","Andrey Stepichev","Anna Kuznetsova","Mikhail Ronkin","Vasilii Borisov","Alexander Bogachev","Sergey Korotkich","Paul A Constable","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2312.08255v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05869v2","updated":"2024-03-31T09:31:56Z","published":"2024-02-08T17:57:59Z","title":"Adaptive Surface Normal Constraint for Geometric Estimation from\n Monocular Images","summary":" We introduce a novel approach to learn geometries such as depth and surface\nnormal from images while incorporating geometric context. The difficulty of\nreliably capturing geometric context in existing methods impedes their ability\nto accurately enforce the consistency between the different geometric\nproperties, thereby leading to a bottleneck of geometric estimation quality. We\ntherefore propose the Adaptive Surface Normal (ASN) constraint, a simple yet\nefficient method. Our approach extracts geometric context that encodes the\ngeometric variations present in the input image and correlates depth estimation\nwith geometric constraints. By dynamically determining reliable local geometry\nfrom randomly sampled candidates, we establish a surface normal constraint,\nwhere the validity of these candidates is evaluated using the geometric\ncontext. Furthermore, our normal estimation leverages the geometric context to\nprioritize regions that exhibit significant geometric variations, which makes\nthe predicted normals accurately capture intricate and detailed geometric\ninformation. Through the integration of geometric context, our method unifies\ndepth and surface normal estimations within a cohesive framework, which enables\nthe generation of high-quality 3D geometry from images. We validate the\nsuperiority of our approach over state-of-the-art methods through extensive\nevaluations and comparisons on diverse indoor and outdoor datasets, showcasing\nits efficiency and robustness.\n","authors":["Xiaoxiao Long","Yuhang Zheng","Yupeng Zheng","Beiwen Tian","Cheng Lin","Lingjie Liu","Hao Zhao","Guyue Zhou","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2402.05869v2.pdf","comment":"Accepted by TPAMI. arXiv admin note: substantial text overlap with\n arXiv:2103.15483"},{"id":"http://arxiv.org/abs/2311.15672v2","updated":"2024-03-31T09:10:24Z","published":"2023-11-27T10:01:31Z","title":"HAVE-FUN: Human Avatar Reconstruction from Few-Shot Unconstrained Images","summary":" As for human avatar reconstruction, contemporary techniques commonly\nnecessitate the acquisition of costly data and struggle to achieve satisfactory\nresults from a small number of casual images. In this paper, we investigate\nthis task from a few-shot unconstrained photo album. The reconstruction of\nhuman avatars from such data sources is challenging because of limited data\namount and dynamic articulated poses. For handling dynamic data, we integrate a\nskinning mechanism with deep marching tetrahedra (DMTet) to form a drivable\ntetrahedral representation, which drives arbitrary mesh topologies generated by\nthe DMTet for the adaptation of unconstrained images. To effectively mine\ninstructive information from few-shot data, we devise a two-phase optimization\nmethod with few-shot reference and few-shot guidance. 
The former focuses on\naligning avatar identity with reference images, while the latter aims to\ngenerate plausible appearances for unseen regions. Overall, our framework,\ncalled HaveFun, can undertake avatar reconstruction, rendering, and animation.\nExtensive experiments on our developed benchmarks demonstrate that HaveFun\nexhibits substantially superior performance in reconstructing the human body\nand hand. Project website: https://seanchenxy.github.io/HaveFunWeb/.\n","authors":["Xihe Yang","Xingyu Chen","Daiheng Gao","Shaohui Wang","Xiaoguang Han","Baoyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15672v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15070v2","updated":"2024-03-31T08:36:57Z","published":"2023-08-29T07:11:52Z","title":"DiffBIR: Towards Blind Image Restoration with Generative Diffusion Prior","summary":" We present DiffBIR, a general restoration pipeline that could handle\ndifferent blind image restoration tasks in a unified framework. DiffBIR\ndecouples blind image restoration problem into two stages: 1) degradation\nremoval: removing image-independent content; 2) information regeneration:\ngenerating the lost image content. Each stage is developed independently but\nthey work seamlessly in a cascaded manner. In the first stage, we use\nrestoration modules to remove degradations and obtain high-fidelity restored\nresults. For the second stage, we propose IRControlNet that leverages the\ngenerative ability of latent diffusion models to generate realistic details.\nSpecifically, IRControlNet is trained based on specially produced condition\nimages without distracting noisy content for stable generation performance.\nMoreover, we design a region-adaptive restoration guidance that can modify the\ndenoising process during inference without model re-training, allowing users to\nbalance realness and fidelity through a tunable guidance scale. Extensive\nexperiments have demonstrated DiffBIR's superiority over state-of-the-art\napproaches for blind image super-resolution, blind face restoration and blind\nimage denoising tasks on both synthetic and real-world datasets. The code is\navailable at https://github.com/XPixelGroup/DiffBIR.\n","authors":["Xinqi Lin","Jingwen He","Ziyan Chen","Zhaoyang Lyu","Bo Dai","Fanghua Yu","Wanli Ouyang","Yu Qiao","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2308.15070v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.13474v2","updated":"2024-03-31T08:12:21Z","published":"2022-08-29T10:19:10Z","title":"Prompt Tuning with Soft Context Sharing for Vision-Language Models","summary":" Vision-language models have recently shown great potential on many tasks in\ncomputer vision. Meanwhile, prior work demonstrates prompt tuning designed for\nvision-language models could acquire superior performance on few-shot image\nrecognition compared to linear probe, a strong baseline. In practice, many\nfew-shot tasks are inherently correlated, particularly within specialized\ndomains. However, such information is overlooked previously. Inspired by the\nfact that modeling task relationship by multi-task learning can usually boost\nperformance, we propose a novel method SoftCPT (Soft Context Sharing for Prompt\nTuning) to tune pre-trained vision-language models on multiple target few-shot\ntasks jointly. Specifically, we design a task-shared meta network to generate\nprompt context for each task using task name together with a learnable task\ncontext as input. 
The parameters of this meta network as well as the task\ncontext are tuned on the joint training set of all tasks. As such, the prompt\ncontext of all tasks will be shared in a soft manner. Extensive experiments\nacross four multi-task few-shot datasets covering 44 tasks and 1593 categories\ndemonstrate that SoftCPT significantly outperforms single-task prompt tuning\nmethods, highlighting the effectiveness of multi-task learning for\nvision-language prompt tuning. Code is available at\nhttps://github.com/kding1225/softcpt.\n","authors":["Kun Ding","Ying Wang","Pengzhang Liu","Qiang Yu","Haojian Zhang","Shiming Xiang","Chunhong Pan"],"pdf_url":"https://arxiv.org/pdf/2208.13474v2.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2311.16096v3","updated":"2024-03-31T08:06:12Z","published":"2023-11-27T18:59:04Z","title":"Animatable Gaussians: Learning Pose-dependent Gaussian Maps for\n High-fidelity Human Avatar Modeling","summary":" Modeling animatable human avatars from RGB videos is a long-standing and\nchallenging problem. Recent works usually adopt MLP-based neural radiance\nfields (NeRF) to represent 3D humans, but it remains difficult for pure MLPs to\nregress pose-dependent garment details. To this end, we introduce Animatable\nGaussians, a new avatar representation that leverages powerful 2D CNNs and 3D\nGaussian splatting to create high-fidelity avatars. To associate 3D Gaussians\nwith the animatable avatar, we learn a parametric template from the input\nvideos, and then parameterize the template on two front \\& back canonical\nGaussian maps where each pixel represents a 3D Gaussian. The learned template\nis adaptive to the wearing garments for modeling looser clothes like dresses.\nSuch template-guided 2D parameterization enables us to employ a powerful\nStyleGAN-based CNN to learn the pose-dependent Gaussian maps for modeling\ndetailed dynamic appearances. Furthermore, we introduce a pose projection\nstrategy for better generalization given novel poses. Overall, our method can\ncreate lifelike avatars with dynamic, realistic and generalized appearances.\nExperiments show that our method outperforms other state-of-the-art approaches.\nCode: https://github.com/lizhe00/AnimatableGaussians\n","authors":["Zhe Li","Zerong Zheng","Lizhen Wang","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2311.16096v3.pdf","comment":"Accepted by CVPR 2024, Projectpage:\n https://animatable-gaussians.github.io/, Code:\n https://github.com/lizhe00/AnimatableGaussians"},{"id":"http://arxiv.org/abs/2210.09846v3","updated":"2024-03-31T07:50:22Z","published":"2022-10-15T11:00:54Z","title":"G-PECNet: Towards a Generalizable Pedestrian Trajectory Prediction\n System","summary":" Navigating dynamic physical environments without obstructing or damaging\nhuman assets is of quintessential importance for social robots. In this work,\nwe solve autonomous drone navigation's sub-problem of predicting out-of-domain\nhuman and agent trajectories using a deep generative model. Our method:\nGeneral-PECNet or G-PECNet observes an improvement of 9.5\\% on the Final\nDisplacement Error (FDE) on 2020's benchmark: PECNet through a combination of\narchitectural improvements inspired by periodic activation functions and\nsynthetic trajectory (data) augmentations using Hidden Markov Models (HMMs) and\nReinforcement Learning (RL). Additionally, we propose a simple\ngeometry-inspired metric for trajectory non-linearity and outlier detection,\nhelpful for the task. 
Code available at\nhttps://github.com/Aryan-Garg/PECNet-Pedestrian-Trajectory-Prediction.git\n","authors":["Aryan Garg","Renu M. Rameshan"],"pdf_url":"https://arxiv.org/pdf/2210.09846v3.pdf","comment":"Notable ICLR Tiny Paper 2024"},{"id":"http://arxiv.org/abs/2402.09989v3","updated":"2024-03-31T07:47:59Z","published":"2024-02-15T14:54:33Z","title":"LLMs as Bridges: Reformulating Grounded Multimodal Named Entity\n Recognition","summary":" Grounded Multimodal Named Entity Recognition (GMNER) is a nascent multimodal\ntask that aims to identify named entities, entity types and their corresponding\nvisual regions. GMNER task exhibits two challenging properties: 1) The weak\ncorrelation between image-text pairs in social media results in a significant\nportion of named entities being ungroundable. 2) There exists a distinction\nbetween coarse-grained referring expressions commonly used in similar tasks\n(e.g., phrase localization, referring expression comprehension) and\nfine-grained named entities. In this paper, we propose RiVEG, a unified\nframework that reformulates GMNER into a joint MNER-VE-VG task by leveraging\nlarge language models (LLMs) as a connecting bridge. This reformulation brings\ntwo benefits: 1) It maintains the optimal MNER performance and eliminates the\nneed for employing object detection methods to pre-extract regional features,\nthereby naturally addressing two major limitations of existing GMNER methods.\n2) The introduction of entity expansion expression and Visual Entailment (VE)\nModule unifies Visual Grounding (VG) and Entity Grounding (EG). It enables\nRiVEG to effortlessly inherit the Visual Entailment and Visual Grounding\ncapabilities of any current or prospective multimodal pretraining models.\nExtensive experiments demonstrate that RiVEG outperforms state-of-the-art\nmethods on the existing GMNER dataset and achieves absolute leads of 10.65%,\n6.21%, and 8.83% in all three subtasks.\n","authors":["Jinyuan Li","Han Li","Di Sun","Jiahao Wang","Wenkun Zhang","Zan Wang","Gang Pan"],"pdf_url":"https://arxiv.org/pdf/2402.09989v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07636v4","updated":"2024-03-31T07:42:17Z","published":"2024-03-12T13:18:22Z","title":"Decomposing Disease Descriptions for Enhanced Pathology Detection: A\n Multi-Aspect Vision-Language Pre-training Framework","summary":" Medical vision language pre-training (VLP) has emerged as a frontier of\nresearch, enabling zero-shot pathological recognition by comparing the query\nimage with the textual descriptions for each disease. Due to the complex\nsemantics of biomedical texts, current methods struggle to align medical images\nwith key pathological findings in unstructured reports. This leads to the\nmisalignment with the target disease's textual representation. In this paper,\nwe introduce a novel VLP framework designed to dissect disease descriptions\ninto their fundamental aspects, leveraging prior knowledge about the visual\nmanifestations of pathologies. This is achieved by consulting a large language\nmodel and medical experts. Integrating a Transformer module, our approach\naligns an input image with the diverse elements of a disease, generating\naspect-centric image representations. By consolidating the matches from each\naspect, we improve the compatibility between an image and its associated\ndisease. 
Additionally, capitalizing on the aspect-oriented representations, we\npresent a dual-head Transformer tailored to process known and unknown diseases,\noptimizing the comprehensive detection efficacy. Conducting experiments on\nseven downstream datasets, ours improves the accuracy of recent methods by up\nto 8.56% and 17.26% for seen and unseen categories, respectively. Our code is\nreleased at https://github.com/HieuPhan33/MAVL.\n","authors":["Vu Minh Hieu Phan","Yutong Xie","Yuankai Qi","Lingqiao Liu","Liyang Liu","Bowen Zhang","Zhibin Liao","Qi Wu","Minh-Son To","Johan W. Verjans"],"pdf_url":"https://arxiv.org/pdf/2403.07636v4.pdf","comment":"Accepted at CVPR2024. Pre-print before final camera-ready version"},{"id":"http://arxiv.org/abs/2312.01616v3","updated":"2024-03-31T05:57:57Z","published":"2023-12-04T04:14:09Z","title":"SchurVINS: Schur Complement-Based Lightweight Visual Inertial Navigation\n System","summary":" Accuracy and computational efficiency are the most important metrics to\nVisual Inertial Navigation System (VINS). The existing VINS algorithms with\neither high accuracy or low computational complexity, are difficult to provide\nthe high precision localization in resource-constrained devices. To this end,\nwe propose a novel filter-based VINS framework named SchurVINS, which could\nguarantee both high accuracy by building a complete residual model and low\ncomputational complexity with Schur complement. Technically, we first formulate\nthe full residual model where Gradient, Hessian and observation covariance are\nexplicitly modeled. Then Schur complement is employed to decompose the full\nmodel into ego-motion residual model and landmark residual model. Finally,\nExtended Kalman Filter (EKF) update is implemented in these two models with\nhigh efficiency. Experiments on EuRoC and TUM-VI datasets show that our method\nnotably outperforms state-of-the-art (SOTA) methods in both accuracy and\ncomputational complexity. The experimental code of SchurVINS is available at\nhttps://github.com/bytedance/SchurVINS.\n","authors":["Yunfei Fan","Tianyu Zhao","Guidong Wang"],"pdf_url":"https://arxiv.org/pdf/2312.01616v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16051v2","updated":"2024-03-31T05:51:58Z","published":"2024-03-24T07:36:38Z","title":"Segment Anything Model for Road Network Graph Extraction","summary":" We propose SAM-Road, an adaptation of the Segment Anything Model (SAM) for\nextracting large-scale, vectorized road network graphs from satellite imagery.\nTo predict graph geometry, we formulate it as a dense semantic segmentation\ntask, leveraging the inherent strengths of SAM. The image encoder of SAM is\nfine-tuned to produce probability masks for roads and intersections, from which\nthe graph vertices are extracted via simple non-maximum suppression. To predict\ngraph topology, we designed a lightweight transformer-based graph neural\nnetwork, which leverages the SAM image embeddings to estimate the edge\nexistence probabilities between vertices. Our approach directly predicts the\ngraph vertices and edges for large regions without expensive and complex\npost-processing heuristics, and is capable of building complete road network\ngraphs spanning multiple square kilometers in a matter of seconds. With its\nsimple, straightforward, and minimalist design, SAM-Road achieves comparable\naccuracy with the state-of-the-art method RNGDet++, while being 40 times faster\non the City-scale dataset. 
We thus demonstrate the power of a foundational\nvision model when applied to a graph learning task. The code is available at\nhttps://github.com/htcr/sam_road.\n","authors":["Congrui Hetang","Haoru Xue","Cindy Le","Tianwei Yue","Wenping Wang","Yihui He"],"pdf_url":"https://arxiv.org/pdf/2403.16051v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10145v2","updated":"2024-03-31T05:24:35Z","published":"2024-03-15T09:44:02Z","title":"RCooper: A Real-world Large-scale Dataset for Roadside Cooperative\n Perception","summary":" The value of roadside perception, which could extend the boundaries of\nautonomous driving and traffic management, has gradually become more prominent\nand acknowledged in recent years. However, existing roadside perception\napproaches only focus on the single-infrastructure sensor system, which cannot\nrealize a comprehensive understanding of a traffic area because of the limited\nsensing range and blind spots. Orienting high-quality roadside perception, we\nneed Roadside Cooperative Perception (RCooper) to achieve practical\narea-coverage roadside perception for restricted traffic areas. Rcooper has its\nown domain-specific challenges, but further exploration is hindered due to the\nlack of datasets. We hence release the first real-world, large-scale RCooper\ndataset to bloom the research on practical roadside cooperative perception,\nincluding detection and tracking. The manually annotated dataset comprises 50k\nimages and 30k point clouds, including two representative traffic scenes (i.e.,\nintersection and corridor). The constructed benchmarks prove the effectiveness\nof roadside cooperation perception and demonstrate the direction of further\nresearch. Codes and dataset can be accessed at:\nhttps://github.com/AIR-THU/DAIR-RCooper.\n","authors":["Ruiyang Hao","Siqi Fan","Yingru Dai","Zhenlin Zhang","Chenxi Li","Yuntian Wang","Haibao Yu","Wenxian Yang","Jirui Yuan","Zaiqing Nie"],"pdf_url":"https://arxiv.org/pdf/2403.10145v2.pdf","comment":"Accepted by CVPR2024. 10 pages with 6 figures"},{"id":"http://arxiv.org/abs/2401.06415v2","updated":"2024-03-31T05:22:00Z","published":"2024-01-12T07:23:02Z","title":"3D Reconstruction of Interacting Multi-Person in Clothing from a Single\n Image","summary":" This paper introduces a novel pipeline to reconstruct the geometry of\ninteracting multi-person in clothing on a globally coherent scene space from a\nsingle image. The main challenge arises from the occlusion: a part of a human\nbody is not visible from a single view due to the occlusion by others or the\nself, which introduces missing geometry and physical implausibility (e.g.,\npenetration). We overcome this challenge by utilizing two human priors for\ncomplete 3D geometry and surface contacts. For the geometry prior, an encoder\nlearns to regress the image of a person with missing body parts to the latent\nvectors; a decoder decodes these vectors to produce 3D features of the\nassociated geometry; and an implicit network combines these features with a\nsurface normal map to reconstruct a complete and detailed 3D humans. For the\ncontact prior, we develop an image-space contact detector that outputs a\nprobability distribution of surface contacts between people in 3D. We use these\npriors to globally refine the body poses, enabling the penetration-free and\naccurate reconstruction of interacting multi-person in clothing on the scene\nspace. 
The results demonstrate that our method is complete, globally coherent,\nand physically plausible compared to existing methods.\n","authors":["Junuk Cha","Hansol Lee","Jaewon Kim","Nhat Nguyen Bao Truong","Jae Shin Yoon","Seungryul Baek"],"pdf_url":"https://arxiv.org/pdf/2401.06415v2.pdf","comment":"Accepted to WACV 2024"},{"id":"http://arxiv.org/abs/2312.16084v2","updated":"2024-03-31T04:45:58Z","published":"2023-12-26T15:14:37Z","title":"LangSplat: 3D Language Gaussian Splatting","summary":" Humans live in a 3D world and commonly use natural language to interact with\na 3D scene. Modeling a 3D language field to support open-ended language queries\nin 3D has gained increasing attention recently. This paper introduces\nLangSplat, which constructs a 3D language field that enables precise and\nefficient open-vocabulary querying within 3D spaces. Unlike existing methods\nthat ground CLIP language embeddings in a NeRF model, LangSplat advances the\nfield by utilizing a collection of 3D Gaussians, each encoding language\nfeatures distilled from CLIP, to represent the language field. By employing a\ntile-based splatting technique for rendering language features, we circumvent\nthe costly rendering process inherent in NeRF. Instead of directly learning\nCLIP embeddings, LangSplat first trains a scene-wise language autoencoder and\nthen learns language features on the scene-specific latent space, thereby\nalleviating substantial memory demands imposed by explicit modeling. Existing\nmethods struggle with imprecise and vague 3D language fields, which fail to\ndiscern clear boundaries between objects. We delve into this issue and propose\nto learn hierarchical semantics using SAM, thereby eliminating the need for\nextensively querying the language field across various scales and the\nregularization of DINO features. Extensive experimental results show that\nLangSplat significantly outperforms the previous state-of-the-art method LERF\nby a large margin. Notably, LangSplat is extremely efficient, achieving a 199\n$\\times$ speedup compared to LERF at the resolution of 1440 $\\times$ 1080. We\nstrongly recommend readers to check out our video results at\nhttps://langsplat.github.io/\n","authors":["Minghan Qin","Wanhua Li","Jiawei Zhou","Haoqian Wang","Hanspeter Pfister"],"pdf_url":"https://arxiv.org/pdf/2312.16084v2.pdf","comment":"CVPR 2024. Project Page: https://langsplat.github.io"},{"id":"http://arxiv.org/abs/2303.08314v3","updated":"2024-03-31T04:11:30Z","published":"2023-03-15T02:08:20Z","title":"Guided Slot Attention for Unsupervised Video Object Segmentation","summary":" Unsupervised video object segmentation aims to segment the most prominent\nobject in a video sequence. However, the existence of complex backgrounds and\nmultiple foreground objects make this task challenging. To address this issue,\nwe propose a guided slot attention network to reinforce spatial structural\ninformation and obtain better foreground--background separation. The foreground\nand background slots, which are initialized with query guidance, are\niteratively refined based on interactions with template information.\nFurthermore, to improve slot--template interaction and effectively fuse global\nand local features in the target and reference frames, K-nearest neighbors\nfiltering and a feature aggregation transformer are introduced. 
The proposed\nmodel achieves state-of-the-art performance on two popular datasets.\nAdditionally, we demonstrate the robustness of the proposed model in\nchallenging scenes through various comparative experiments.\n","authors":["Minhyeok Lee","Suhwan Cho","Dogyoon Lee","Chaewon Park","Jungho Lee","Sangyoun Lee"],"pdf_url":"https://arxiv.org/pdf/2303.08314v3.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2312.08963v2","updated":"2024-03-31T02:18:23Z","published":"2023-12-14T14:10:57Z","title":"LEMON: Learning 3D Human-Object Interaction Relation from 2D Images","summary":" Learning 3D human-object interaction relation is pivotal to embodied AI and\ninteraction modeling. Most existing methods approach the goal by learning to\npredict isolated interaction elements, e.g., human contact, object affordance,\nand human-object spatial relation, primarily from the perspective of either the\nhuman or the object. These approaches underexploit certain correlations between the\ninteraction counterparts (human and object) and struggle to address the\nuncertainty in interactions. Actually, objects' functionalities potentially\naffect humans' interaction intentions, which reveals what the interaction is.\nMeanwhile, the interacting humans and objects exhibit matching geometric\nstructures, which presents how to interact. In light of this, we propose\nharnessing these inherent correlations between interaction counterparts to\nmitigate the uncertainty and jointly anticipate the above interaction elements\nin 3D space. To achieve this, we present LEMON (LEarning 3D huMan-Object\niNteraction relation), a unified model that mines interaction intentions of the\ncounterparts and employs curvatures to guide the extraction of geometric\ncorrelations, combining them to anticipate the interaction elements. Besides,\nthe 3D Interaction Relation dataset (3DIR) is collected to serve as the test\nbed for training and evaluation. Extensive experiments demonstrate the\nsuperiority of LEMON over methods estimating each element in isolation.\n","authors":["Yuhang Yang","Wei Zhai","Hongchen Luo","Yang Cao","Zheng-Jun Zha"],"pdf_url":"https://arxiv.org/pdf/2312.08963v2.pdf","comment":"accept by CVPR2024"},{"id":"http://arxiv.org/abs/2305.18171v4","updated":"2024-03-31T22:58:38Z","published":"2023-05-29T16:02:09Z","title":"Improved Probabilistic Image-Text Representations","summary":" The Image-Text Matching (ITM) task, a fundamental vision-language (VL) task,\nsuffers from the inherent ambiguity arising from multiplicity and imperfect\nannotations. Deterministic functions are not sufficiently powerful to capture\nambiguity, prompting the exploration of probabilistic embeddings to tackle the\nchallenge. However, the existing probabilistic ITM approach encounters two key\nshortcomings: the burden of heavy computations due to the Monte Carlo\napproximation, and the loss saturation issue in the face of abundant false\nnegatives. To overcome the issues, this paper presents an improved\nProbabilistic Cross-Modal Embeddings (named PCME++) by introducing a new\nprobabilistic distance with a closed-form solution. In addition, two\noptimization techniques are proposed to enhance PCME++ further: first, the\nincorporation of pseudo-positives to prevent the negative effect under massive\nfalse negatives; second, mixed sample data augmentation for probabilistic\nmatching. 
Experimental results on MS-COCO Caption and two extended benchmarks,\nCxC and ECCV Caption, demonstrate the effectiveness of PCME++ compared to\nstate-of-the-art ITM methods. The robustness of PCME++ is also evaluated under\nnoisy image-text correspondences. In addition, the potential applicability of\nPCME++ in automatic prompt-filtering for zero-shot classification is shown. The\ncode is available at https://github.com/naver-ai/pcmepp\n","authors":["Sanghyuk Chun"],"pdf_url":"https://arxiv.org/pdf/2305.18171v4.pdf","comment":"ICLR 2024 camera-ready; Code: https://github.com/naver-ai/pcmepp.\n Project page: https://naver-ai.github.io/pcmepp/. 30 pages, 2.2 MB"},{"id":"http://arxiv.org/abs/2404.00815v1","updated":"2024-03-31T22:18:56Z","published":"2024-03-31T22:18:56Z","title":"Towards Realistic Scene Generation with LiDAR Diffusion Models","summary":" Diffusion models (DMs) excel in photo-realistic image synthesis, but their\nadaptation to LiDAR scene generation poses a substantial hurdle. This is\nprimarily because DMs operating in the point space struggle to preserve the\ncurve-like patterns and 3D geometry of LiDAR scenes, which consumes much of\ntheir representation power. In this paper, we propose LiDAR Diffusion Models\n(LiDMs) to generate LiDAR-realistic scenes from a latent space tailored to\ncapture the realism of LiDAR scenes by incorporating geometric priors into the\nlearning pipeline. Our method targets three major desiderata: pattern realism,\ngeometry realism, and object realism. Specifically, we introduce curve-wise\ncompression to simulate real-world LiDAR patterns, point-wise coordinate\nsupervision to learn scene geometry, and patch-wise encoding for a full 3D\nobject context. With these three core designs, our method achieves competitive\nperformance on unconditional LiDAR generation in 64-beam scenario and state of\nthe art on conditional LiDAR generation, while maintaining high efficiency\ncompared to point-based DMs (up to 107$\\times$ faster). Furthermore, by\ncompressing LiDAR scenes into a latent space, we enable the controllability of\nDMs with various conditions such as semantic maps, camera views, and text\nprompts. Our code and pretrained weights are available at\nhttps://github.com/hancyran/LiDAR-Diffusion.\n","authors":["Haoxi Ran","Vitor Guizilini","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00815v1.pdf","comment":"CVPR 2024. Code available at\n https://github.com/hancyran/LiDAR-Diffusion"},{"id":"http://arxiv.org/abs/2404.00807v1","updated":"2024-03-31T21:43:08Z","published":"2024-03-31T21:43:08Z","title":"GAMA-IR: Global Additive Multidimensional Averaging for Fast Image\n Restoration","summary":" Deep learning-based methods have shown remarkable success for various image\nrestoration tasks such as denoising and deblurring. The current\nstate-of-the-art networks are relatively deep and utilize (variants of) self\nattention mechanisms. Those networks are significantly slower than shallow\nconvolutional networks, which however perform worse. In this paper, we\nintroduce an image restoration network that is both fast and yields excellent\nimage quality. The network is designed to minimize the latency and memory\nconsumption when executed on a standard GPU, while maintaining state-of-the-art\nperformance. The network is a simple shallow network with an efficient block\nthat implements global additive multidimensional averaging operations. 
This\nblock can capture global information and enable a large receptive field even\nwhen used in shallow networks with minimal computational overhead. Through\nextensive experiments and evaluations on diverse tasks, we demonstrate that our\nnetwork achieves comparable or even superior results to existing\nstate-of-the-art image restoration networks with less latency. For instance, we\nexceed the state-of-the-art result on real-world SIDD denoising by 0.11dB,\nwhile being 2 to 10 times faster.\n","authors":["Youssef Mansour","Reinhard Heckel"],"pdf_url":"https://arxiv.org/pdf/2404.00807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00801v1","updated":"2024-03-31T21:17:48Z","published":"2024-03-31T21:17:48Z","title":"$R^2$-Tuning: Efficient Image-to-Video Transfer Learning for Video\n Temporal Grounding","summary":" Video temporal grounding (VTG) is a fine-grained video understanding problem\nthat aims to ground relevant clips in untrimmed videos given natural language\nqueries. Most existing VTG models are built upon frame-wise final-layer CLIP\nfeatures, aided by additional temporal backbones (e.g., SlowFast) with\nsophisticated temporal reasoning mechanisms. In this work, we claim that CLIP\nitself already shows great potential for fine-grained spatial-temporal\nmodeling, as each layer offers distinct yet useful information under different\ngranularity levels. Motivated by this, we propose Reversed Recurrent Tuning\n($R^2$-Tuning), a parameter- and memory-efficient transfer learning framework\nfor video temporal grounding. Our method learns a lightweight $R^2$ Block\ncontaining only 1.5% of the total parameters to perform progressive\nspatial-temporal modeling. Starting from the last layer of CLIP, $R^2$ Block\nrecurrently aggregates spatial features from earlier layers, then refines\ntemporal correlation conditioning on the given query, resulting in a\ncoarse-to-fine scheme. $R^2$-Tuning achieves state-of-the-art performance\nacross three VTG tasks (i.e., moment retrieval, highlight detection, and video\nsummarization) on six public benchmarks (i.e., QVHighlights, Charades-STA,\nEgo4D-NLQ, TACoS, YouTube Highlights, and TVSum) even without the additional\nbackbone, demonstrating the significance and effectiveness of the proposed\nscheme. Our code is available at https://github.com/yeliudev/R2-Tuning.\n","authors":["Ye Liu","Jixuan He","Wanhua Li","Junsik Kim","Donglai Wei","Hanspeter Pfister","Chang Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.00801v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00785v1","updated":"2024-03-31T20:08:23Z","published":"2024-03-31T20:08:23Z","title":"Disentangling Hippocampal Shape Variations: A Study of Neurological\n Disorders Using Graph Variational Autoencoder with Contrastive Learning","summary":" This paper presents a comprehensive study focused on disentangling\nhippocampal shape variations from diffusion tensor imaging (DTI) datasets\nwithin the context of neurological disorders. Leveraging a Graph Variational\nAutoencoder (VAE) enhanced with Supervised Contrastive Learning, our approach\naims to improve interpretability by disentangling two distinct latent variables\ncorresponding to age and the presence of diseases. In our ablation study, we\ninvestigate a range of VAE architectures and contrastive loss functions,\nshowcasing the enhanced disentanglement capabilities of our approach. This\nevaluation uses synthetic 3D torus mesh data and real 3D hippocampal mesh\ndatasets derived from the DTI hippocampal dataset. 
Our supervised\ndisentanglement model outperforms several state-of-the-art (SOTA) methods like\nattribute and guided VAEs in terms of disentanglement scores. Our model\ndistinguishes between age groups and disease status in patients with Multiple\nSclerosis (MS) using the hippocampus data. Our Graph VAE with Supervised\nContrastive Learning shows the volume changes of the hippocampus of MS\npopulations at different ages, and the result is consistent with the current\nneuroimaging literature. This research provides valuable insights into the\nrelationship between neurological disorder and hippocampal shape changes in\ndifferent age groups of MS populations using a Graph VAE with Supervised\nContrastive loss.\n","authors":["Jakaria Rabbi","Johannes Kiechle","Christian Beaulieu","Nilanjan Ray","Dana Cobzas"],"pdf_url":"https://arxiv.org/pdf/2404.00785v1.pdf","comment":"Length: 23 pages and submitted to the journal: MELBA (Machine\n Learning for Biomedical Imaging)"},{"id":"http://arxiv.org/abs/2404.00777v1","updated":"2024-03-31T19:28:04Z","published":"2024-03-31T19:28:04Z","title":"Privacy-preserving Optics for Enhancing Protection in Face\n De-identification","summary":" The modern surge in camera usage alongside widespread computer vision\ntechnology applications poses significant privacy and security concerns.\nCurrent artificial intelligence (AI) technologies aid in recognizing relevant\nevents and assisting in daily tasks in homes, offices, hospitals, etc. The need\nto access or process personal information for these purposes raises privacy\nconcerns. While software-level solutions like face de-identification provide a\ngood privacy/utility trade-off, they present vulnerabilities to sniffing\nattacks. In this paper, we propose a hardware-level face de-identification\nmethod to solve this vulnerability. Specifically, our approach first learns an\noptical encoder along with a regression model to obtain a face heatmap while\nhiding the face identity from the source image. We also propose an\nanonymization framework that generates a new face using the privacy-preserving\nimage, face heatmap, and a reference face image from a public dataset as input.\nWe validate our approach with extensive simulations and hardware experiments.\n","authors":["Jhon Lopez","Carlos Hinojosa","Henry Arguello","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2404.00777v1.pdf","comment":"Accepted to CVPR 2024. Project Website and Code coming soon"},{"id":"http://arxiv.org/abs/2404.00767v1","updated":"2024-03-31T18:45:13Z","published":"2024-03-31T18:45:13Z","title":"Intensity-based 3D motion correction for cardiac MR images","summary":" Cardiac magnetic resonance (CMR) image acquisition requires subjects to hold\ntheir breath while 2D cine images are acquired. This process assumes that the\nheart remains in the same position across all slices. However, differences in\nbreathhold positions or patient motion introduce 3D slice misalignments. In\nthis work, we propose an algorithm that simultaneously aligns all SA and LA\nslices by maximizing the pair-wise intensity agreement between their\nintersections. Unlike previous works, our approach is formulated as a\nsubject-specific optimization problem and requires no prior knowledge of the\nunderlying anatomy. 
We quantitatively demonstrate that the proposed method is\nrobust against a large range of rotations and translations by synthetically\nmisaligning 10 motion-free datasets and aligning them back using the proposed\nmethod.\n","authors":["Nil Stolt-Ansó","Vasiliki Sideri-Lampretsa","Maik Dannecker","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2404.00767v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00742v1","updated":"2024-03-31T17:18:57Z","published":"2024-03-31T17:18:57Z","title":"Adapting to Length Shift: FlexiLength Network for Trajectory Prediction","summary":" Trajectory prediction plays an important role in various applications,\nincluding autonomous driving, robotics, and scene understanding. Existing\napproaches mainly focus on developing compact neural networks to increase\nprediction precision on public datasets, typically employing a standardized\ninput duration. However, a notable issue arises when these models are evaluated\nwith varying observation lengths, leading to a significant performance drop, a\nphenomenon we term the Observation Length Shift. To address this issue, we\nintroduce a general and effective framework, the FlexiLength Network (FLN), to\nenhance the robustness of existing trajectory prediction techniques against\nvarying observation periods. Specifically, FLN integrates trajectory data with\ndiverse observation lengths, incorporates FlexiLength Calibration (FLC) to\nacquire temporal invariant representations, and employs FlexiLength Adaptation\n(FLA) to further refine these representations for more accurate future\ntrajectory predictions. Comprehensive experiments on multiple datasets, ie,\nETH/UCY, nuScenes, and Argoverse 1, demonstrate the effectiveness and\nflexibility of our proposed FLN framework.\n","authors":["Yi Xu","Yun Fu"],"pdf_url":"https://arxiv.org/pdf/2404.00742v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00741v1","updated":"2024-03-31T17:02:24Z","published":"2024-03-31T17:02:24Z","title":"Rethinking Interactive Image Segmentation with Low Latency, High\n Quality, and Diverse Prompts","summary":" The goal of interactive image segmentation is to delineate specific regions\nwithin an image via visual or language prompts. Low-latency and high-quality\ninteractive segmentation with diverse prompts remain challenging for existing\nspecialist and generalist models. Specialist models, with their limited prompts\nand task-specific designs, experience high latency because the image must be\nrecomputed every time the prompt is updated, due to the joint encoding of image\nand visual prompts. Generalist models, exemplified by the Segment Anything\nModel (SAM), have recently excelled in prompt diversity and efficiency, lifting\nimage segmentation to the foundation model era. However, for high-quality\nsegmentations, SAM still lags behind state-of-the-art specialist models despite\nSAM being trained with x100 more segmentation masks. In this work, we delve\ndeep into the architectural differences between the two types of models. We\nobserve that dense representation and fusion of visual prompts are the key\ndesign choices contributing to the high segmentation quality of specialist\nmodels. In light of this, we reintroduce this dense design into the generalist\nmodels, to facilitate the development of generalist models with high\nsegmentation quality. To densely represent diverse visual prompts, we propose\nto use a dense map to capture five types: clicks, boxes, polygons, scribbles,\nand masks. 
Thus, we propose SegNext, a next-generation interactive segmentation\napproach offering low latency, high quality, and diverse prompt support. Our\nmethod outperforms current state-of-the-art methods on HQSeg-44K and DAVIS,\nboth quantitatively and qualitatively.\n","authors":["Qin Liu","Jaemin Cho","Mohit Bansal","Marc Niethammer"],"pdf_url":"https://arxiv.org/pdf/2404.00741v1.pdf","comment":"CVPR 2024 https://github.com/uncbiag/SegNext"},{"id":"http://arxiv.org/abs/2404.00726v1","updated":"2024-03-31T15:56:41Z","published":"2024-03-31T15:56:41Z","title":"MugenNet: A Novel Combined Convolution Neural Network and Transformer\n Network with its Application for Colonic Polyp Image Segmentation","summary":" Biomedical image segmentation is a very important part of disease diagnosis.\nThe term \"colonic polyps\" refers to polypoid lesions that occur on the surface\nof the colonic mucosa within the intestinal lumen. In clinical practice, early\ndetection of polyps is conducted through colonoscopy examinations and\nbiomedical image processing. Therefore, accurate polyp image segmentation\nis of great significance in colonoscopy examinations. Convolutional Neural\nNetwork (CNN) is a common automatic segmentation method, but its main\ndisadvantage is the long training time. Transformer utilizes a self-attention\nmechanism, which essentially assigns different importance weights to each piece\nof information, thus achieving high computational efficiency during\nsegmentation. However, a potential drawback is the risk of information loss. In\nthe study reported in this paper, based on the well-known hybridization\nprinciple, we proposed a method to combine CNN and Transformer to retain the\nstrengths of both, and we applied this method to build a system called MugenNet\nfor colonic polyp image segmentation. We conducted a comprehensive experiment\nto compare MugenNet with other CNN models on five publicly available datasets.\nThe ablation experiment on MugenNet was conducted as well. The experimental\nresults show that MugenNet achieves significantly higher processing speed and\naccuracy compared with CNN alone. The generalized implication of our work is\na method to optimally combine two complementary methods of machine learning.\n","authors":["Chen Peng","Zhiqin Qian","Kunyu Wang","Qi Luo","Zhuming Bi","Wenjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.00726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00724v1","updated":"2024-03-31T15:50:52Z","published":"2024-03-31T15:50:52Z","title":"Absolute-Unified Multi-Class Anomaly Detection via Class-Agnostic\n Distribution Alignment","summary":" Conventional unsupervised anomaly detection (UAD) methods build separate\nmodels for each object category. Recent studies have proposed to train a\nunified model for multiple classes, namely model-unified UAD. However, such\nmethods still implement the unified model separately on each class during\ninference with respective anomaly decision thresholds, which hinders their\napplication when the image categories are entirely unavailable. In this work,\nwe present a simple yet powerful method to address multi-class anomaly\ndetection without any class information, namely \\textit{absolute-unified} UAD.\nWe target the crux of prior works in this challenging setting: different\nobjects have mismatched anomaly score distributions. 
We propose Class-Agnostic\nDistribution Alignment (CADA) to align the mismatched score distribution of\neach implicit class without knowing class information, which enables unified\nanomaly detection for all classes and samples. The essence of CADA is to\npredict each class's score distribution of normal samples given any image,\nnormal or anomalous, of this class. As a general component, CADA can activate\nthe potential of nearly all UAD methods under absolute-unified setting. Our\napproach is extensively evaluated under the proposed setting on two popular UAD\nbenchmark datasets, MVTec AD and VisA, where we exceed previous\nstate-of-the-art by a large margin.\n","authors":["Jia Guo","Shuai Lu","Weihang Zhang","Huiqi Li"],"pdf_url":"https://arxiv.org/pdf/2404.00724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00717v1","updated":"2024-03-31T15:22:11Z","published":"2024-03-31T15:22:11Z","title":"End-to-End Autonomous Driving through V2X Cooperation","summary":" Cooperatively utilizing both ego-vehicle and infrastructure sensor data via\nV2X communication has emerged as a promising approach for advanced autonomous\ndriving. However, current research mainly focuses on improving individual\nmodules, rather than taking end-to-end learning to optimize final planning\nperformance, resulting in underutilized data potential. In this paper, we\nintroduce UniV2X, a pioneering cooperative autonomous driving framework that\nseamlessly integrates all key driving modules across diverse views into a\nunified network. We propose a sparse-dense hybrid data transmission and fusion\nmechanism for effective vehicle-infrastructure cooperation, offering three\nadvantages: 1) Effective for simultaneously enhancing agent perception, online\nmapping, and occupancy prediction, ultimately improving planning performance.\n2) Transmission-friendly for practical and limited communication conditions. 3)\nReliable data fusion with interpretability of this hybrid data. We implement\nUniV2X, as well as reproducing several benchmark methods, on the challenging\nDAIR-V2X, the real-world cooperative driving dataset. Experimental results\ndemonstrate the effectiveness of UniV2X in significantly enhancing planning\nperformance, as well as all intermediate output performance. Code is at\nhttps://github.com/AIR-THU/UniV2X.\n","authors":["Haibao Yu","Wenxian Yang","Jiaru Zhong","Zhenwei Yang","Siqi Fan","Ping Luo","Zaiqing Nie"],"pdf_url":"https://arxiv.org/pdf/2404.00717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00714v1","updated":"2024-03-31T15:18:38Z","published":"2024-03-31T15:18:38Z","title":"Neural Radiance Field-based Visual Rendering: A Comprehensive Review","summary":" In recent years, Neural Radiance Fields (NeRF) has made remarkable progress\nin the field of computer vision and graphics, providing strong technical\nsupport for solving key tasks including 3D scene understanding, new perspective\nsynthesis, human body reconstruction, robotics, and so on, the attention of\nacademics to this research result is growing. As a revolutionary neural\nimplicit field representation, NeRF has caused a continuous research boom in\nthe academic community. Therefore, the purpose of this review is to provide an\nin-depth analysis of the research literature on NeRF within the past two years,\nto provide a comprehensive academic perspective for budding researchers. 
In\nthis paper, the core architecture of NeRF is first elaborated in detail,\nfollowed by a discussion of various improvement strategies for NeRF, and case\nstudies of NeRF in diverse application scenarios, demonstrating its practical\nutility in different domains. In terms of datasets and evaluation metrics, This\npaper details the key resources needed for NeRF model training. Finally, this\npaper provides a prospective discussion on the future development trends and\npotential challenges of NeRF, aiming to provide research inspiration for\nresearchers in the field and to promote the further development of related\ntechnologies.\n","authors":["Mingyuan Yao","Yukang Huo","Yang Ran","Qingbin Tian","Ruifeng Wang","Haihua Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00714v1.pdf","comment":"35 pages, 22 figures, 14 tables, 18 formulas"},{"id":"http://arxiv.org/abs/2404.00710v1","updated":"2024-03-31T15:03:31Z","published":"2024-03-31T15:03:31Z","title":"Unknown Prompt, the only Lacuna: Unveiling CLIP's Potential for Open\n Domain Generalization","summary":" We delve into Open Domain Generalization (ODG), marked by domain and category\nshifts between training's labeled source and testing's unlabeled target\ndomains. Existing solutions to ODG face limitations due to constrained\ngeneralizations of traditional CNN backbones and errors in detecting target\nopen samples in the absence of prior knowledge. Addressing these pitfalls, we\nintroduce ODG-CLIP, harnessing the semantic prowess of the vision-language\nmodel, CLIP. Our framework brings forth three primary innovations: Firstly,\ndistinct from prevailing paradigms, we conceptualize ODG as a multi-class\nclassification challenge encompassing both known and novel categories. Central\nto our approach is modeling a unique prompt tailored for detecting unknown\nclass samples, and to train this, we employ a readily accessible stable\ndiffusion model, elegantly generating proxy images for the open class.\nSecondly, aiming for domain-tailored classification (prompt) weights while\nensuring a balance of precision and simplicity, we devise a novel visual\nstylecentric prompt learning mechanism. Finally, we infuse images with\nclass-discriminative knowledge derived from the prompt space to augment the\nfidelity of CLIP's visual embeddings. We introduce a novel objective to\nsafeguard the continuity of this infused semantic intel across domains,\nespecially for the shared classes. Through rigorous testing on diverse\ndatasets, covering closed and open-set DG contexts, ODG-CLIP demonstrates clear\nsupremacy, consistently outpacing peers with performance boosts between 8%-16%.\nCode will be available at https://github.com/mainaksingha01/ODG-CLIP.\n","authors":["Mainak Singha","Ankit Jha","Shirsha Bose","Ashwin Nair","Moloud Abdar","Biplab Banerjee"],"pdf_url":"https://arxiv.org/pdf/2404.00710v1.pdf","comment":"Accepted in CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00701v1","updated":"2024-03-31T14:37:25Z","published":"2024-03-31T14:37:25Z","title":"Training-Free Semantic Segmentation via LLM-Supervision","summary":" Recent advancements in open vocabulary models, like CLIP, have notably\nadvanced zero-shot classification and segmentation by utilizing natural\nlanguage for class-specific embeddings. However, most research has focused on\nimproving model accuracy through prompt engineering, prompt learning, or\nfine-tuning with limited labeled data, thereby overlooking the importance of\nrefining the class descriptors. 
This paper introduces a new approach to\ntext-supervised semantic segmentation using supervision by a large language\nmodel (LLM) that does not require extra training. Our method starts from an\nLLM, like GPT-3, to generate a detailed set of subclasses for more accurate\nclass representation. We then employ an advanced text-supervised semantic\nsegmentation model to apply the generated subclasses as target labels,\nresulting in diverse segmentation results tailored to each subclass's unique\ncharacteristics. Additionally, we propose an assembly that merges the\nsegmentation maps from the various subclass descriptors to ensure a more\ncomprehensive representation of the different aspects in the test images.\nThrough comprehensive experiments on three standard benchmarks, our method\noutperforms traditional text-supervised semantic segmentation methods by a\nmarked margin.\n","authors":["Wenfang Sun","Yingjun Du","Gaowen Liu","Ramana Kompella","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2404.00701v1.pdf","comment":"22 pages,10 figures, conference"},{"id":"http://arxiv.org/abs/2404.00694v1","updated":"2024-03-31T14:04:57Z","published":"2024-03-31T14:04:57Z","title":"DMSSN: Distilled Mixed Spectral-Spatial Network for Hyperspectral\n Salient Object Detection","summary":" Hyperspectral salient object detection (HSOD) has exhibited remarkable\npromise across various applications, particularly in intricate scenarios where\nconventional RGB-based approaches fall short. Despite the considerable progress\nin HSOD method advancements, two critical challenges require immediate\nattention. Firstly, existing hyperspectral data dimension reduction techniques\nincur a loss of spectral information, which adversely affects detection\naccuracy. Secondly, previous methods insufficiently harness the inherent\ndistinctive attributes of hyperspectral images (HSIs) during the feature\nextraction process. To address these challenges, we propose a novel approach\ntermed the Distilled Mixed Spectral-Spatial Network (DMSSN), comprising a\nDistilled Spectral Encoding process and a Mixed Spectral-Spatial Transformer\n(MSST) feature extraction network. The encoding process utilizes knowledge\ndistillation to construct a lightweight autoencoder for dimension reduction,\nstriking a balance between robust encoding capabilities and low computational\ncosts. The MSST extracts spectral-spatial features through multiple attention\nhead groups, collaboratively enhancing its resistance to intricate scenarios.\nMoreover, we have created a large-scale HSOD dataset, HSOD-BIT, to tackle the\nissue of data scarcity in this field and meet the fundamental data requirements\nof deep network training. Extensive experiments demonstrate that our proposed\nDMSSN achieves state-of-the-art performance on multiple datasets. We will soon\nmake the code and dataset publicly available on\nhttps://github.com/anonymous0519/HSOD-BIT.\n","authors":["Haolin Qin","Tingfa Xu","Peifu Liu","Jingxuan Xu","Jianan Li"],"pdf_url":"https://arxiv.org/pdf/2404.00694v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00680v1","updated":"2024-03-31T13:12:41Z","published":"2024-03-31T13:12:41Z","title":"Learning to Rank Patches for Unbiased Image Redundancy Reduction","summary":" Images suffer from heavy spatial redundancy because pixels in neighboring\nregions are spatially correlated. Existing approaches strive to overcome this\nlimitation by reducing less meaningful image regions. However, current leading\nmethods rely on supervisory signals. 
They may compel models to preserve content\nthat aligns with labeled categories and discard content belonging to unlabeled\ncategories. This categorical inductive bias makes these methods less effective\nin real-world scenarios. To address this issue, we propose a self-supervised\nframework for image redundancy reduction called Learning to Rank Patches\n(LTRP). We observe that image reconstruction of masked image modeling models is\nsensitive to the removal of visible patches when the masking ratio is high\n(e.g., 90\\%). Building upon it, we implement LTRP via two steps: inferring the\nsemantic density score of each patch by quantifying variation between\nreconstructions with and without this patch, and learning to rank the patches\nwith the pseudo score. The entire process is self-supervised, thus getting out\nof the dilemma of categorical inductive bias. We design extensive experiments\non different datasets and tasks. The results demonstrate that LTRP outperforms\nboth supervised and other self-supervised methods due to the fair assessment of\nimage content.\n","authors":["Yang Luo","Zhineng Chen","Peng Zhou","Zuxuan Wu","Xieping Gao","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.00680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00679v1","updated":"2024-03-31T13:09:06Z","published":"2024-03-31T13:09:06Z","title":"Weak-to-Strong 3D Object Detection with X-Ray Distillation","summary":" This paper addresses the critical challenges of sparsity and occlusion in\nLiDAR-based 3D object detection. Current methods often rely on supplementary\nmodules or specific architectural designs, potentially limiting their\napplicability to new and evolving architectures. To our knowledge, we are the\nfirst to propose a versatile technique that seamlessly integrates into any\nexisting framework for 3D Object Detection, marking the first instance of\nWeak-to-Strong generalization in 3D computer vision. We introduce a novel\nframework, X-Ray Distillation with Object-Complete Frames, suitable for both\nsupervised and semi-supervised settings, that leverages the temporal aspect of\npoint cloud sequences. This method extracts crucial information from both\nprevious and subsequent LiDAR frames, creating Object-Complete frames that\nrepresent objects from multiple viewpoints, thus addressing occlusion and\nsparsity. Given the limitation of not being able to generate Object-Complete\nframes during online inference, we utilize Knowledge Distillation within a\nTeacher-Student framework. This technique encourages the strong Student model\nto emulate the behavior of the weaker Teacher, which processes simple and\ninformative Object-Complete frames, effectively offering a comprehensive view\nof objects as if seen through X-ray vision. Our proposed methods surpass\nstate-of-the-art in semi-supervised learning by 1-1.5 mAP and enhance the\nperformance of five established supervised models by 1-2 mAP on standard\nautonomous driving datasets, even with default hyperparameters. 
Code for\nObject-Complete frames is available here:\nhttps://github.com/sakharok13/X-Ray-Teacher-Patching-Tools.\n","authors":["Alexander Gambashidze","Aleksandr Dadukin","Maksim Golyadkin","Maria Razzhivina","Ilya Makarov"],"pdf_url":"https://arxiv.org/pdf/2404.00679v1.pdf","comment":"Computer Vision and Pattern Recognition 2024"},{"id":"http://arxiv.org/abs/2404.00678v1","updated":"2024-03-31T13:07:00Z","published":"2024-03-31T13:07:00Z","title":"OmniSDF: Scene Reconstruction using Omnidirectional Signed Distance\n Functions and Adaptive Binoctrees","summary":" We present a method to reconstruct indoor and outdoor static scene geometry\nand appearance from an omnidirectional video moving in a small circular sweep.\nThis setting is challenging because of the small baseline and large depth\nranges, making it difficult to find ray crossings. To better constrain the\noptimization, we estimate geometry as a signed distance field within a\nspherical binoctree data structure and use a complementary efficient tree\ntraversal strategy based on a breadth-first search for sampling. Unlike regular\ngrids or trees, the shape of this structure well-matches the camera setting,\ncreating a better memory-quality trade-off. From an initial depth estimate, the\nbinoctree is adaptively subdivided throughout the optimization; previous\nmethods use a fixed depth that leaves the scene undersampled. In comparison\nwith three neural optimization methods and two non-neural methods, ours shows\ndecreased geometry error on average, especially in a detailed scene, while\nsignificantly reducing the required number of voxels to represent such details.\n","authors":["Hakyeong Kim","Andreas Meuleman","Hyeonjoong Jang","James Tompkin","Min H. Kim"],"pdf_url":"https://arxiv.org/pdf/2404.00678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00676v1","updated":"2024-03-31T12:55:05Z","published":"2024-03-31T12:55:05Z","title":"OmniLocalRF: Omnidirectional Local Radiance Fields from Dynamic Videos","summary":" Omnidirectional cameras are extensively used in various applications to\nprovide a wide field of vision. However, they face a challenge in synthesizing\nnovel views due to the inevitable presence of dynamic objects, including the\nphotographer, in their wide field of view. In this paper, we introduce a new\napproach called Omnidirectional Local Radiance Fields (OmniLocalRF) that can\nrender static-only scene views, removing and inpainting dynamic objects\nsimultaneously. Our approach combines the principles of local radiance fields\nwith the bidirectional optimization of omnidirectional rays. Our input is an\nomnidirectional video, and we evaluate the mutual observations of the entire\nangle between the previous and current frames. To reduce ghosting artifacts of\ndynamic objects and inpaint occlusions, we devise a multi-resolution motion\nmask prediction module. Unlike existing methods that primarily separate dynamic\ncomponents through the temporal domain, our method uses multi-resolution neural\nfeature planes for precise segmentation, which is more suitable for long\n360-degree videos. Our experiments validate that OmniLocalRF outperforms\nexisting methods in both qualitative and quantitative metrics, especially in\nscenarios with complex real-world scenes. In particular, our approach\neliminates the need for manual interaction, such as drawing motion masks by\nhand and additional pose estimation, making it a highly effective and efficient\nsolution.\n","authors":["Dongyoung Choi","Hyeonjoong Jang","Min H. 
Kim"],"pdf_url":"https://arxiv.org/pdf/2404.00676v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00674v1","updated":"2024-03-31T12:45:23Z","published":"2024-03-31T12:45:23Z","title":"Knowledge NeRF: Few-shot Novel View Synthesis for Dynamic Articulated\n Objects","summary":" We present Knowledge NeRF to synthesize novel views for dynamic\nscenes.Reconstructing dynamic 3D scenes from few sparse views and rendering\nthem from arbitrary perspectives is a challenging problem with applications in\nvarious domains. Previous dynamic NeRF methods learn the deformation of\narticulated objects from monocular videos. However, qualities of their\nreconstructed scenes are limited.To clearly reconstruct dynamic scenes, we\npropose a new framework by considering two frames at a time.We pretrain a NeRF\nmodel for an articulated object.When articulated objects moves, Knowledge NeRF\nlearns to generate novel views at the new state by incorporating past knowledge\nin the pretrained NeRF model with minimal observations in the present state. We\npropose a projection module to adapt NeRF for dynamic scenes, learning the\ncorrespondence between pretrained knowledge base and current states.\nExperimental results demonstrate the effectiveness of our method in\nreconstructing dynamic 3D scenes with 5 input images in one state. Knowledge\nNeRF is a new pipeline and promising solution for novel view synthesis in\ndynamic articulated objects. The data and implementation are publicly available\nat https://github.com/RussRobin/Knowledge_NeRF.\n","authors":["Wenxiao Cai","Xinyue Leiınst","Xinyu He","Junming Leo Chen","Yangang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00672v1","updated":"2024-03-31T12:44:24Z","published":"2024-03-31T12:44:24Z","title":"A General and Efficient Training for Transformer via Token Expansion","summary":" The remarkable performance of Vision Transformers (ViTs) typically requires\nan extremely large training cost. Existing methods have attempted to accelerate\nthe training of ViTs, yet typically disregard method universality with accuracy\ndropping. Meanwhile, they break the training consistency of the original\ntransformers, including the consistency of hyper-parameters, architecture, and\nstrategy, which prevents them from being widely applied to different\nTransformer networks. In this paper, we propose a novel token growth scheme\nToken Expansion (termed ToE) to achieve consistent training acceleration for\nViTs. We introduce an \"initialization-expansion-merging\" pipeline to maintain\nthe integrity of the intermediate feature distribution of original\ntransformers, preventing the loss of crucial learnable information in the\ntraining process. ToE can not only be seamlessly integrated into the training\nand fine-tuning process of transformers (e.g., DeiT and LV-ViT), but also\neffective for efficient training frameworks (e.g., EfficientTrain), without\ntwisting the original training hyper-parameters, architecture, and introducing\nadditional training strategies. Extensive experiments demonstrate that ToE\nachieves about 1.3x faster for the training of ViTs in a lossless manner, or\neven with performance gains over the full-token training baselines. Code is\navailable at https://github.com/Osilly/TokenExpansion .\n","authors":["Wenxuan Huang","Yunhang Shen","Jiao Xie","Baochang Zhang","Gaoqi He","Ke Li","Xing Sun","Shaohui Lin"],"pdf_url":"https://arxiv.org/pdf/2404.00672v1.pdf","comment":"Accepted to CVPR 2024. 
Code is available at\n https://github.com/Osilly/TokenExpansion"},{"id":"http://arxiv.org/abs/2404.00670v1","updated":"2024-03-31T12:35:23Z","published":"2024-03-31T12:35:23Z","title":"Statistical Analysis by Semiparametric Additive Regression and LSTM-FCN\n Based Hierarchical Classification for Computer Vision Quantification of\n Parkinsonian Bradykinesia","summary":" Bradykinesia, characterized by involuntary slowing or decrement of movement,\nis a fundamental symptom of Parkinson's Disease (PD) and is vital for its\nclinical diagnosis. Despite various methodologies explored to quantify\nbradykinesia, computer vision-based approaches have shown promising results.\nHowever, these methods often fall short in adequately addressing key\nbradykinesia characteristics in repetitive limb movements: \"occasional arrest\"\nand \"decrement in amplitude.\"\n This research advances vision-based quantification of bradykinesia by\nintroducing nuanced numerical analysis to capture decrement in amplitudes and\nemploying a simple deep learning technique, LSTM-FCN, for precise\nclassification of occasional arrests. Our approach structures the\nclassification process hierarchically, tailoring it to the unique dynamics of\nbradykinesia in PD.\n Statistical analysis of the extracted features, including those representing\narrest and fatigue, has demonstrated their statistical significance in most\ncases. This finding underscores the importance of considering \"occasional\narrest\" and \"decrement in amplitude\" in bradykinesia quantification of limb\nmovement. Our enhanced diagnostic tool has been rigorously tested on an\nextensive dataset comprising 1396 motion videos from 310 PD patients, achieving\nan accuracy of 80.3%. The results confirm the robustness and reliability of our\nmethod.\n","authors":["Youngseo Cho","In Hee Kwak","Dohyeon Kim","Jinhee Na","Hanjoo Sung","Jeongjae Lee","Young Eun Kim","Hyeo-il Ma"],"pdf_url":"https://arxiv.org/pdf/2404.00670v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00667v1","updated":"2024-03-31T12:22:23Z","published":"2024-03-31T12:22:23Z","title":"Weakly-Supervised Cross-Domain Segmentation of Electron Microscopy with\n Sparse Point Annotation","summary":" Accurate segmentation of organelle instances from electron microscopy (EM)\nimages plays an essential role in many neuroscience researches. However,\npractical scenarios usually suffer from high annotation costs, label scarcity,\nand large domain diversity. While unsupervised domain adaptation (UDA) that\nassumes no annotation effort on the target data is promising to alleviate these\nchallenges, its performance on complicated segmentation tasks is still far from\npractical usage. To address these issues, we investigate a highly\nannotation-efficient weak supervision, which assumes only sparse center-points\non a small subset of object instances in the target training images. To achieve\naccurate segmentation with partial point annotations, we introduce instance\ncounting and center detection as auxiliary tasks and design a multitask\nlearning framework to leverage correlations among the counting, detection, and\nsegmentation, which are all tasks with partial or no supervision. Building upon\nthe different domain-invariances of the three tasks, we enforce counting\nestimation with a novel soft consistency loss as a global prior for center\ndetection, which further guides the per-pixel segmentation. 
To further\ncompensate for annotation sparsity, we develop a cross-position cut-and-paste\nfor label augmentation and an entropy-based pseudo-label selection. The\nexperimental results highlight that, by simply using extremely weak annotation,\ne.g., 15\\% sparse points, for model training, the proposed model is capable of\nsignificantly outperforming UDA methods and produces comparable performance as\nthe supervised counterpart. The high robustness of our model shown in the\nvalidations and the low requirement of expert knowledge for sparse point\nannotation further improve the potential application value of our model.\n","authors":["Dafei Qiu","Shan Xiong","Jiajin Yi","Jialin Peng"],"pdf_url":"https://arxiv.org/pdf/2404.00667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00661v1","updated":"2024-03-31T12:07:04Z","published":"2024-03-31T12:07:04Z","title":"DeeDSR: Towards Real-World Image Super-Resolution via Degradation-Aware\n Stable Diffusion","summary":" Diffusion models, known for their powerful generative capabilities, play a\ncrucial role in addressing real-world super-resolution challenges. However,\nthese models often focus on improving local textures while neglecting the\nimpacts of global degradation, which can significantly reduce semantic fidelity\nand lead to inaccurate reconstructions and suboptimal super-resolution\nperformance. To address this issue, we introduce a novel two-stage,\ndegradation-aware framework that enhances the diffusion model's ability to\nrecognize content and degradation in low-resolution images. In the first stage,\nwe employ unsupervised contrastive learning to obtain representations of image\ndegradations. In the second stage, we integrate a degradation-aware module into\na simplified ControlNet, enabling flexible adaptation to various degradations\nbased on the learned representations. Furthermore, we decompose the\ndegradation-aware features into global semantics and local details branches,\nwhich are then injected into the diffusion denoising module to modulate the\ntarget generation. Our method effectively recovers semantically precise and\nphotorealistic details, particularly under significant degradation conditions,\ndemonstrating state-of-the-art performance across various benchmarks. Codes\nwill be released at https://github.com/bichunyang419/DeeDSR.\n","authors":["Chunyang Bi","Xin Luo","Sheng Shen","Mengxi Zhang","Huanjing Yue","Jingyu Yang"],"pdf_url":"https://arxiv.org/pdf/2404.00661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00653v1","updated":"2024-03-31T11:43:39Z","published":"2024-03-31T11:43:39Z","title":"Dual DETRs for Multi-Label Temporal Action Detection","summary":" Temporal Action Detection (TAD) aims to identify the action boundaries and\nthe corresponding category within untrimmed videos. Inspired by the success of\nDETR in object detection, several methods have adapted the query-based\nframework to the TAD task. However, these approaches primarily followed DETR to\npredict actions at the instance level (i.e., identify each action by its center\npoint), leading to sub-optimal boundary localization. To address this issue, we\npropose a new Dual-level query-based TAD framework, namely DualDETR, to detect\nactions from both instance-level and boundary-level. Decoding at different\nlevels requires semantics of different granularity, therefore we introduce a\ntwo-branch decoding structure. 
This structure builds distinctive decoding\nprocesses for different levels, facilitating explicit capture of temporal cues\nand semantics at each level. On top of the two-branch design, we present a\njoint query initialization strategy to align queries from both levels.\nSpecifically, we leverage encoder proposals to match queries from each level in\na one-to-one manner. Then, the matched queries are initialized using position\nand content prior from the matched action proposal. The aligned dual-level\nqueries can refine the matched proposal with complementary cues during\nsubsequent decoding. We evaluate DualDETR on three challenging multi-label TAD\nbenchmarks. The experimental results demonstrate the superior performance of\nDualDETR over the existing state-of-the-art methods, achieving a substantial\nimprovement under det-mAP and delivering impressive results under seg-mAP.\n","authors":["Yuhan Zhu","Guozhen Zhang","Jing Tan","Gangshan Wu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00653v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00650v1","updated":"2024-03-31T11:37:43Z","published":"2024-03-31T11:37:43Z","title":"Deep Instruction Tuning for Segment Anything Model","summary":" Segment Anything Model (SAM) has recently exhibited powerful yet versatile capabilities on\n(un)conditional image segmentation tasks. Although SAM can support\nvarious segmentation prompts, we note that, compared to point- and box-guided\nsegmentation, it performs much worse on text-instructed tasks. We argue that\ndeep text instruction tuning is key to mitigating this shortcoming caused by the\nshallow fusion scheme in its default light-weight mask decoder. In this paper,\ntwo \\emph{deep instruction tuning} (DIT) methods are proposed: one is\nend-to-end and the other is layer-wise. With these tuning methods, we can\nregard the image encoder of SAM as a stand-alone vision-language learner in\ncontrast to building another deep fusion branch. Extensive experiments on three\nhighly competitive benchmark datasets of referring image segmentation show that\na simple end-to-end DIT improves SAM by a large margin, while layer-wise DIT\nfurther boosts the performance to state-of-the-art. Our code is anonymously\nreleased at: https://github.com/wysnzzzz/DIT.\n","authors":["Xiaorui Huang","Gen Luo","Chaoyang Zhu","Bo Tong","Yiyi Zhou","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2404.00650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00648v1","updated":"2024-03-31T11:33:39Z","published":"2024-03-31T11:33:39Z","title":"SpiralMLP: A Lightweight Vision MLP Architecture","summary":" We present SpiralMLP, a novel architecture that introduces a Spiral FC layer\nas a replacement for the conventional Token Mixing approach. Differing from\nseveral existing MLP-based models that primarily emphasize axes, our Spiral FC\nlayer is designed as a deformable convolution layer with spiral-like offsets.\nWe further adapt Spiral FC into two variants: Self-Spiral FC and Cross-Spiral\nFC, which enable both local and global feature integration seamlessly,\neliminating the need for additional processing steps. To thoroughly investigate\nthe effectiveness of the spiral-like offsets and validate our design, we\nconduct ablation studies and explore optimal configurations. In empirical\ntests, SpiralMLP reaches state-of-the-art performance comparable to Transformers,\nCNNs, and other MLPs, benchmarked on ImageNet-1k, COCO and ADE20K. 
SpiralMLP\nstill maintains linear computational complexity O(HW) and is compatible with\nvarying input image resolutions. Our study reveals that targeting the full\nreceptive field is not essential for achieving high performance, instead,\nadopting a refined approach offers better results.\n","authors":["Haojie Mu","Burhan Ul Tayyab","Nicholas Chua"],"pdf_url":"https://arxiv.org/pdf/2404.00648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00645v1","updated":"2024-03-31T11:09:19Z","published":"2024-03-31T11:09:19Z","title":"Attire-Based Anomaly Detection in Restricted Areas Using YOLOv8 for\n Enhanced CCTV Security","summary":" This research introduces an innovative security enhancement approach,\nemploying advanced image analysis and soft computing. The focus is on an\nintelligent surveillance system that detects unauthorized individuals in\nrestricted areas by analyzing attire. Traditional security measures face\nchallenges in monitoring unauthorized access. Leveraging YOLOv8, an advanced\nobject detection algorithm, our system identifies authorized personnel based on\ntheir attire in CCTV footage. The methodology involves training the YOLOv8\nmodel on a comprehensive dataset of uniform patterns, ensuring precise\nrecognition in specific regions. Soft computing techniques enhance adaptability\nto dynamic environments and varying lighting conditions. This research\ncontributes to image analysis and soft computing, providing a sophisticated\nsecurity solution. Emphasizing uniform-based anomaly detection, it establishes\na foundation for robust security systems in restricted areas. The outcomes\nhighlight the potential of YOLOv8-based surveillance in ensuring safety in\nsensitive locations.\n","authors":["Abdul Aziz A. B","Aindri Bajpai"],"pdf_url":"https://arxiv.org/pdf/2404.00645v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.00633v1","updated":"2024-03-31T10:01:20Z","published":"2024-03-31T10:01:20Z","title":"IPT-V2: Efficient Image Processing Transformer using Hierarchical\n Attentions","summary":" Recent advances have demonstrated the powerful capability of transformer\narchitecture in image restoration. However, our analysis indicates that\nexisting transformerbased methods can not establish both exact global and local\ndependencies simultaneously, which are much critical to restore the details and\nmissing content of degraded images. To this end, we present an efficient image\nprocessing transformer architecture with hierarchical attentions, called IPTV2,\nadopting a focal context self-attention (FCSA) and a global grid self-attention\n(GGSA) to obtain adequate token interactions in local and global receptive\nfields. Specifically, FCSA applies the shifted window mechanism into the\nchannel self-attention, helps capture the local context and mutual interaction\nacross channels. And GGSA constructs long-range dependencies in the\ncross-window grid, aggregates global information in spatial dimension.\nMoreover, we introduce structural re-parameterization technique to feed-forward\nnetwork to further improve the model capability. Extensive experiments\ndemonstrate that our proposed IPT-V2 achieves state-of-the-art results on\nvarious image processing tasks, covering denoising, deblurring, deraining and\nobtains much better trade-off for performance and computational complexity than\nprevious methods. 
Besides, we extend our method to image generation as latent\ndiffusion backbone, and significantly outperforms DiTs.\n","authors":["Zhijun Tu","Kunpeng Du","Hanting Chen","Hailing Wang","Wei Li","Jie Hu","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00626v1","updated":"2024-03-31T09:32:31Z","published":"2024-03-31T09:32:31Z","title":"Domain Generalizable Person Search Using Unreal Dataset","summary":" Collecting and labeling real datasets to train the person search networks not\nonly requires a lot of time and effort, but also accompanies privacy issues.\nThe weakly-supervised and unsupervised domain adaptation methods have been\nproposed to alleviate the labeling burden for target datasets, however, their\ngeneralization capability is limited. We introduce a novel person search method\nbased on the domain generalization framework, that uses an automatically\nlabeled unreal dataset only for training but is applicable to arbitrary unseen\nreal datasets. To alleviate the domain gaps when transferring the knowledge\nfrom the unreal source dataset to the real target datasets, we estimate the\nfidelity of person instances which is then used to train the end-to-end network\nadaptively. Moreover, we devise a domain-invariant feature learning scheme to\nencourage the network to suppress the domain-related features. Experimental\nresults demonstrate that the proposed method provides the competitive\nperformance to existing person search methods even though it is applicable to\narbitrary unseen datasets without any prior knowledge and re-training burdens.\n","authors":["Minyoung Oh","Duhyun Kim","Jae-Young Sim"],"pdf_url":"https://arxiv.org/pdf/2404.00626v1.pdf","comment":"AAAI2024 accepted"},{"id":"http://arxiv.org/abs/2404.00618v1","updated":"2024-03-31T09:10:32Z","published":"2024-03-31T09:10:32Z","title":"A Multi-Branched Radial Basis Network Approach to Predicting Complex\n Chaotic Behaviours","summary":" In this study, we propose a multi branched network approach to predict the\ndynamics of a physics attractor characterized by intricate and chaotic\nbehavior. We introduce a unique neural network architecture comprised of Radial\nBasis Function (RBF) layers combined with an attention mechanism designed to\neffectively capture nonlinear inter-dependencies inherent in the attractor's\ntemporal evolution. Our results demonstrate successful prediction of the\nattractor's trajectory across 100 predictions made using a real-world dataset\nof 36,700 time-series observations encompassing approximately 28 minutes of\nactivity. To further illustrate the performance of our proposed technique, we\nprovide comprehensive visualizations depicting the attractor's original and\npredicted behaviors alongside quantitative measures comparing observed versus\nestimated outcomes. 
Overall, this work showcases the potential of advanced\nmachine learning algorithms in elucidating hidden structures in complex\nphysical systems while offering practical applications in various domains\nrequiring accurate short-term forecasting capabilities.\n","authors":["Aarush Sinha"],"pdf_url":"https://arxiv.org/pdf/2404.00618v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.00611v1","updated":"2024-03-31T09:01:17Z","published":"2024-03-31T09:01:17Z","title":"Object-level Copy-Move Forgery Image Detection based on Inconsistency\n Mining","summary":" In copy-move tampering operations, perpetrators often employ techniques, such\nas blurring, to conceal tampering traces, posing significant challenges to the\ndetection of object-level targets with intact structures. Focus on these\nchallenges, this paper proposes an Object-level Copy-Move Forgery Image\nDetection based on Inconsistency Mining (IMNet). To obtain complete\nobject-level targets, we customize prototypes for both the source and tampered\nregions and dynamically update them. Additionally, we extract inconsistent\nregions between coarse similar regions obtained through self-correlation\ncalculations and regions composed of prototypes. The detected inconsistent\nregions are used as supplements to coarse similar regions to refine pixel-level\ndetection. We operate experiments on three public datasets which validate the\neffectiveness and the robustness of the proposed IMNet.\n","authors":["Jingyu Wang","Niantai Jing","Ziyao Liu","Jie Nie","Yuxin Qi","Chi-Hung Chi","Kwok-Yan Lam"],"pdf_url":"https://arxiv.org/pdf/2404.00611v1.pdf","comment":"4 pages, 2 figures"},{"id":"http://arxiv.org/abs/2404.00603v1","updated":"2024-03-31T08:28:42Z","published":"2024-03-31T08:28:42Z","title":"Weak Distribution Detectors Lead to Stronger Generalizability of\n Vision-Language Prompt Tuning","summary":" We propose a generalized method for boosting the generalization ability of\npre-trained vision-language models (VLMs) while fine-tuning on downstream\nfew-shot tasks. The idea is realized by exploiting out-of-distribution (OOD)\ndetection to predict whether a sample belongs to a base distribution or a novel\ndistribution and then using the score generated by a dedicated competition\nbased scoring function to fuse the zero-shot and few-shot classifier. The fused\nclassifier is dynamic, which will bias towards the zero-shot classifier if a\nsample is more likely from the distribution pre-trained on, leading to improved\nbase-to-novel generalization ability. Our method is performed only in test\nstage, which is applicable to boost existing methods without time-consuming\nre-training. Extensive experiments show that even weak distribution detectors\ncan still improve VLMs' generalization ability. Specifically, with the help of\nOOD detectors, the harmonic mean of CoOp and ProGrad increase by 2.6 and 1.5\npercentage points over 11 recognition datasets in the base-to-novel setting.\n","authors":["Kun Ding","Haojian Zhang","Qiang Yu","Ying Wang","Shiming Xiang","Chunhong Pan"],"pdf_url":"https://arxiv.org/pdf/2404.00603v1.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2404.00597v1","updated":"2024-03-31T08:08:05Z","published":"2024-03-31T08:08:05Z","title":"Parameter and Data-Efficient Spectral StyleDCGAN","summary":" We present a simple, highly parameter, and data-efficient adversarial network\nfor unconditional face generation. 
Our method: Spectral Style-DCGAN or SSD\nutilizes only 6.574 million parameters and 4739 dog faces from the Animal Faces\nHQ (AFHQ) dataset as training samples while preserving fidelity at low\nresolutions up to 64x64. Code available at\nhttps://github.com/Aryan-Garg/StyleDCGAN.\n","authors":["Aryan Garg"],"pdf_url":"https://arxiv.org/pdf/2404.00597v1.pdf","comment":"Notable ICLR Tiny Paper 2024"},{"id":"http://arxiv.org/abs/2404.00593v1","updated":"2024-03-31T07:56:07Z","published":"2024-03-31T07:56:07Z","title":"LAESI: Leaf Area Estimation with Synthetic Imagery","summary":" We introduce LAESI, a Synthetic Leaf Dataset of 100,000 synthetic leaf images\non millimeter paper, each with semantic masks and surface area labels. This\ndataset provides a resource for leaf morphology analysis primarily aimed at\nbeech and oak leaves. We evaluate the applicability of the dataset by training\nmachine learning models for leaf surface area prediction and semantic\nsegmentation, using real images for validation. Our validation shows that these\nmodels can be trained to predict leaf surface area with a relative error not\ngreater than an average human annotator. LAESI also provides an efficient\nframework based on 3D procedural models and generative AI for the large-scale,\ncontrollable generation of data with potential further applications in\nagriculture and biology. We evaluate the inclusion of generative AI in our\nprocedural data generation pipeline and show how data filtering based on\nannotation consistency results in datasets which allow training the highest\nperforming vision models.\n","authors":["Jacek Kałużny","Yannik Schreckenberg","Karol Cyganik","Peter Annighöfer","Sören Pirk","Dominik L. Michels","Mikolaj Cieslak","Farhah Assaad-Gerbert","Bedrich Benes","Wojciech Pałubicki"],"pdf_url":"https://arxiv.org/pdf/2404.00593v1.pdf","comment":"10 pages, 12 figures, 1 table"},{"id":"http://arxiv.org/abs/2404.00588v1","updated":"2024-03-31T07:30:41Z","published":"2024-03-31T07:30:41Z","title":"Memory-based Cross-modal Semantic Alignment Network for Radiology Report\n Generation","summary":" Generating radiology reports automatically reduces the workload of\nradiologists and helps the diagnoses of specific diseases. Many existing\nmethods take this task as modality transfer process. However, since the key\ninformation related to disease accounts for a small proportion in both image\nand report, it is hard for the model to learn the latent relation between the\nradiology image and its report, thus failing to generate fluent and accurate\nradiology reports. To tackle this problem, we propose a memory-based\ncross-modal semantic alignment model (MCSAM) following an encoder-decoder\nparadigm. MCSAM includes a well initialized long-term clinical memory bank to\nlearn disease-related representations as well as prior knowledge for different\nmodalities to retrieve and use the retrieved memory to perform feature\nconsolidation. To ensure the semantic consistency of the retrieved cross modal\nprior knowledge, a cross-modal semantic alignment module (SAM) is proposed. SAM\nis also able to generate semantic visual feature embeddings which can be added\nto the decoder and benefits report generation. More importantly, to memorize\nthe state and additional information while generating reports with the decoder,\nwe use learnable memory tokens which can be seen as prompts. 
Extensive\nexperiments demonstrate the promising performance of our proposed method which\ngenerates state-of-the-art performance on the MIMIC-CXR dataset.\n","authors":["Yitian Tao","Liyan Ma","Jing Yu","Han Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.00588v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.00578v1","updated":"2024-03-31T06:55:12Z","published":"2024-03-31T06:55:12Z","title":"M3D: Advancing 3D Medical Image Analysis with Multi-Modal Large Language\n Models","summary":" Medical image analysis is essential to clinical diagnosis and treatment,\nwhich is increasingly supported by multi-modal large language models (MLLMs).\nHowever, previous research has primarily focused on 2D medical images, leaving\n3D images under-explored, despite their richer spatial information. This paper\naims to advance 3D medical image analysis with MLLMs. To this end, we present a\nlarge-scale 3D multi-modal medical dataset, M3D-Data, comprising 120K\nimage-text pairs and 662K instruction-response pairs specifically tailored for\nvarious 3D medical tasks, such as image-text retrieval, report generation,\nvisual question answering, positioning, and segmentation. Additionally, we\npropose M3D-LaMed, a versatile multi-modal large language model for 3D medical\nimage analysis. Furthermore, we introduce a new 3D multi-modal medical\nbenchmark, M3D-Bench, which facilitates automatic evaluation across eight\ntasks. Through comprehensive evaluation, our method proves to be a robust model\nfor 3D medical image analysis, outperforming existing solutions. All code,\ndata, and models are publicly available at: https://github.com/BAAI-DCAI/M3D.\n","authors":["Fan Bai","Yuxin Du","Tiejun Huang","Max Q. -H. Meng","Bo Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.00578v1.pdf","comment":"MLLM, 3D medical image analysis"},{"id":"http://arxiv.org/abs/2404.00576v1","updated":"2024-03-31T06:38:08Z","published":"2024-03-31T06:38:08Z","title":"Automated Bi-Fold Weighted Ensemble Algorithms and its Application to\n Brain Tumor Detection and Classification","summary":" The uncontrolled and unstructured growth of brain cells is known as brain\ntumor, which has one of the highest mortality rates among diseases from all\ntypes of cancers. Due to limited diagnostic and treatment capabilities, they\npose significant challenges, especially in third-world countries. Early\ndiagnosis plays a vital role in effectively managing brain tumors and reducing\nmortality rates. However, the availability of diagnostic methods is hindered by\nvarious limitations, including high costs and lengthy result acquisition times,\nimpeding early detection of the disease. In this study, we present two\ncutting-edge bi-fold weighted voting ensemble models that aim to boost the\neffectiveness of weighted ensemble methods. These two proposed methods combine\nthe classification outcomes from multiple classifiers and determine the optimal\nresult by selecting the one with the highest probability in the first approach,\nand the highest weighted prediction in the second technique. These approaches\nsignificantly improve the overall performance of weighted ensemble techniques.\nIn the first proposed method, we improve the soft voting technique (SVT) by\nintroducing a novel unsupervised weight calculating schema (UWCS) to enhance\nits weight assigning capability, known as the extended soft voting technique\n(ESVT). Secondly, we propose a novel weighted method (NWM) by using the\nproposed UWCS. 
Both of our approaches incorporate three distinct models: a\ncustom-built CNN, VGG-16, and InceptionResNetV2 which has been trained on\npublicly available datasets. The effectiveness of our proposed systems is\nevaluated through blind testing, where exceptional results are achieved. We\nthen establish a comparative analysis of the performance of our proposed\nmethods with that of SVT to show their superiority and effectiveness.\n","authors":["PoTsang B. Huang","Muhammad Rizwan","Mehboob Ali"],"pdf_url":"https://arxiv.org/pdf/2404.00576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00563v1","updated":"2024-03-31T05:07:06Z","published":"2024-03-31T05:07:06Z","title":"Exploiting Inter-sample and Inter-feature Relations in Dataset\n Distillation","summary":" Dataset distillation has emerged as a promising approach in deep learning,\nenabling efficient training with small synthetic datasets derived from larger\nreal ones. Particularly, distribution matching-based distillation methods\nattract attention thanks to its effectiveness and low computational cost.\nHowever, these methods face two primary limitations: the dispersed feature\ndistribution within the same class in synthetic datasets, reducing class\ndiscrimination, and an exclusive focus on mean feature consistency, lacking\nprecision and comprehensiveness. To address these challenges, we introduce two\nnovel constraints: a class centralization constraint and a covariance matching\nconstraint. The class centralization constraint aims to enhance class\ndiscrimination by more closely clustering samples within classes. The\ncovariance matching constraint seeks to achieve more accurate feature\ndistribution matching between real and synthetic datasets through local feature\ncovariance matrices, particularly beneficial when sample sizes are much smaller\nthan the number of features. Experiments demonstrate notable improvements with\nthese constraints, yielding performance boosts of up to 6.6% on CIFAR10, 2.9%\non SVHN, 2.5% on CIFAR100, and 2.5% on TinyImageNet, compared to the\nstate-of-the-art relevant methods. In addition, our method maintains robust\nperformance in cross-architecture settings, with a maximum performance drop of\n1.7% on four architectures. Code is available at\nhttps://github.com/VincenDen/IID.\n","authors":["Wenxiao Deng","Wenbin Li","Tianyu Ding","Lei Wang","Hongguang Zhang","Kuihua Huang","Jing Huo","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2404.00563v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00558v1","updated":"2024-03-31T04:39:40Z","published":"2024-03-31T04:39:40Z","title":"GAN with Skip Patch Discriminator for Biological Electron Microscopy\n Image Generation","summary":" Generating realistic electron microscopy (EM) images has been a challenging\nproblem due to their complex global and local structures. Isola et al. proposed\npix2pix, a conditional Generative Adversarial Network (GAN), for the general\npurpose of image-to-image translation; which fails to generate realistic EM\nimages. 
We propose a new architecture for the discriminator in the GAN\nproviding access to multiple patch sizes using skip patches and generating\nrealistic EM images.\n","authors":["Nishith Ranjon Roy","Nailah Rawnaq","Tulin Kaman"],"pdf_url":"https://arxiv.org/pdf/2404.00558v1.pdf","comment":"4 pages, International Conference on Computational and Mathematical\n Biomedical Engineering"},{"id":"http://arxiv.org/abs/2404.00552v1","updated":"2024-03-31T03:53:45Z","published":"2024-03-31T03:53:45Z","title":"Comparison of Methods in Human Skin Decomposition","summary":" Decomposition of skin pigment plays an important role in medical fields.\nHuman skin can be decomposed into two primitive components, hemoglobin and\nmelanin. It is our goal to apply these results for diagnosis of skin cancer. In\nthis paper, various methods for skin pigment decomposition are reviewed\ncomparatively and the performance of each method is evaluated both\ntheoretically and experimentally. In addition, isometric feature mapping\n(Isomap) is introduced in order to improve the dimensionality reduction\nperformance in context of skin decomposition.\n","authors":["Hao Gong","Michel Desvignes"],"pdf_url":"https://arxiv.org/pdf/2404.00552v1.pdf","comment":"4 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.00549v1","updated":"2024-03-31T03:35:43Z","published":"2024-03-31T03:35:43Z","title":"Pneumonia App: a mobile application for efficient pediatric pneumonia\n diagnosis using explainable convolutional neural networks (CNN)","summary":" Mycoplasma pneumoniae pneumonia (MPP) poses significant diagnostic challenges\nin pediatric healthcare, especially in regions like China where it's prevalent.\nWe introduce PneumoniaAPP, a mobile application leveraging deep learning\ntechniques for rapid MPP detection. Our approach capitalizes on convolutional\nneural networks (CNNs) trained on a comprehensive dataset comprising 3345 chest\nX-ray (CXR) images, which includes 833 CXR images revealing MPP and\nadditionally augmented with samples from a public dataset. The CNN model\nachieved an accuracy of 88.20% and an AUROC of 0.9218 across all classes, with\na specific accuracy of 97.64% for the mycoplasma class, as demonstrated on the\ntesting dataset. Furthermore, we integrated explainability techniques into\nPneumoniaAPP to aid respiratory physicians in lung opacity localization. Our\ncontribution extends beyond existing research by targeting pediatric MPP,\nemphasizing the age group of 0-12 years, and prioritizing deployment on mobile\ndevices. This work signifies a significant advancement in pediatric pneumonia\ndiagnosis, offering a reliable and accessible tool to alleviate diagnostic\nburdens in healthcare settings.\n","authors":["Jiaming Deng","Zhenglin Chen","Minjiang Chen","Lulu Xu","Jiaqi Yang","Zhendong Luo","Peiwu Qin"],"pdf_url":"https://arxiv.org/pdf/2404.00549v1.pdf","comment":"27 Pages,7 figures"},{"id":"http://arxiv.org/abs/2404.00548v1","updated":"2024-03-31T03:30:37Z","published":"2024-03-31T03:30:37Z","title":"Denoising Distillation Makes Event-Frame Transformers as Accurate Gaze\n Trackers","summary":" This paper tackles the problem of passive gaze estimation using both event\nand frame data. Considering inherently different physiological structures, it's\nintractable to accurately estimate purely based on a given state. Thus, we\nreformulate the gaze estimation as the quantification of state transitions from\nthe current state to several prior registered anchor states. 
Technically, we\npropose a two-stage learning-based gaze estimation framework to divide the\nwhole gaze estimation process into a coarse-to-fine process of anchor state\nselection and final gaze location. Moreover, to improve generalization ability,\nwe align a group of local experts with a student network, where a novel\ndenoising distillation algorithm is introduced to utilize denoising diffusion\ntechnique to iteratively remove inherent noise of event data. Extensive\nexperiments demonstrate the effectiveness of the proposed method, which greatly\nsurpasses state-of-the-art methods by a large extent of 15$\\%$. The code will\nbe publicly available at\nhttps://github.com/jdjdli/Denoise_distill_EF_gazetracker.\n","authors":["Jiading Li","Zhiyu Zhu","Jinhui Hou","Junhui Hou","Jinjian Wu"],"pdf_url":"https://arxiv.org/pdf/2404.00548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00546v1","updated":"2024-03-31T03:24:48Z","published":"2024-03-31T03:24:48Z","title":"On the Estimation of Image-matching Uncertainty in Visual Place\n Recognition","summary":" In Visual Place Recognition (VPR) the pose of a query image is estimated by\ncomparing the image to a map of reference images with known reference poses. As\nis typical for image retrieval problems, a feature extractor maps the query and\nreference images to a feature space, where a nearest neighbor search is then\nperformed. However, till recently little attention has been given to\nquantifying the confidence that a retrieved reference image is a correct match.\nHighly certain but incorrect retrieval can lead to catastrophic failure of\nVPR-based localization pipelines. This work compares for the first time the\nmain approaches for estimating the image-matching uncertainty, including the\ntraditional retrieval-based uncertainty estimation, more recent data-driven\naleatoric uncertainty estimation, and the compute-intensive geometric\nverification. We further formulate a simple baseline method, ``SUE'', which\nunlike the other methods considers the freely-available poses of the reference\nimages in the map. Our experiments reveal that a simple L2-distance between the\nquery and reference descriptors is already a better estimate of image-matching\nuncertainty than current data-driven approaches. SUE outperforms the other\nefficient uncertainty estimation methods, and its uncertainty estimates\ncomplement the computationally expensive geometric verification approach.\nFuture works for uncertainty estimation in VPR should consider the baselines\ndiscussed in this work.\n","authors":["Mubariz Zaffar","Liangliang Nan","Julian F. P. Kooij"],"pdf_url":"https://arxiv.org/pdf/2404.00546v1.pdf","comment":"To appear in the proceedings of the IEEE/CVF Conference on Computer\n Vision and Pattern Recognition (CVPR) 2024"},{"id":"http://arxiv.org/abs/2404.00544v1","updated":"2024-03-31T03:16:08Z","published":"2024-03-31T03:16:08Z","title":"Deep Extrinsic Manifold Representation for Vision Tasks","summary":" Non-Euclidean data is frequently encountered across different fields, yet\nthere is limited literature that addresses the fundamental challenge of\ntraining neural networks with manifold representations as outputs. We introduce\nthe trick named Deep Extrinsic Manifold Representation (DEMR) for visual tasks\nin this context. DEMR incorporates extrinsic manifold embedding into deep\nneural networks, which helps generate manifold representations. The DEMR\napproach does not directly optimize the complex geodesic loss. 
Instead, it\nfocuses on optimizing the computation graph within the embedded Euclidean\nspace, allowing for adaptability to various architectural requirements. We\nprovide empirical evidence supporting the proposed concept on two types of\nmanifolds, $SE(3)$ and its associated quotient manifolds. This evidence offers\ntheoretical assurances regarding feasibility, asymptotic properties, and\ngeneralization capability. The experimental results show that DEMR effectively\nadapts to point cloud alignment, producing outputs in $ SE(3) $, as well as in\nillumination subspace learning with outputs on the Grassmann manifold.\n","authors":["Tongtong Zhang","Xian Wei","Yuanxiang Li"],"pdf_url":"https://arxiv.org/pdf/2404.00544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00540v1","updated":"2024-03-31T03:02:35Z","published":"2024-03-31T03:02:35Z","title":"Embodied Active Defense: Leveraging Recurrent Feedback to Counter\n Adversarial Patches","summary":" The vulnerability of deep neural networks to adversarial patches has\nmotivated numerous defense strategies for boosting model robustness. However,\nthe prevailing defenses depend on single observation or pre-established\nadversary information to counter adversarial patches, often failing to be\nconfronted with unseen or adaptive adversarial attacks and easily exhibiting\nunsatisfying performance in dynamic 3D environments. Inspired by active human\nperception and recurrent feedback mechanisms, we develop Embodied Active\nDefense (EAD), a proactive defensive strategy that actively contextualizes\nenvironmental information to address misaligned adversarial patches in 3D\nreal-world settings. To achieve this, EAD develops two central recurrent\nsub-modules, i.e., a perception module and a policy module, to implement two\ncritical functions of active vision. These models recurrently process a series\nof beliefs and observations, facilitating progressive refinement of their\ncomprehension of the target object and enabling the development of strategic\nactions to counter adversarial patches in 3D environments. To optimize learning\nefficiency, we incorporate a differentiable approximation of environmental\ndynamics and deploy patches that are agnostic to the adversary strategies.\nExtensive experiments demonstrate that EAD substantially enhances robustness\nagainst a variety of patches within just a few steps through its action policy\nin safety-critical tasks (e.g., face recognition and object detection), without\ncompromising standard accuracy. Furthermore, due to the attack-agnostic\ncharacteristic, EAD facilitates excellent generalization to unseen attacks,\ndiminishing the averaged attack success rate by 95 percent across a range of\nunseen adversarial attacks.\n","authors":["Lingxuan Wu","Xiao Yang","Yinpeng Dong","Liuwei Xie","Hang Su","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.00540v1.pdf","comment":"27pages"},{"id":"http://arxiv.org/abs/1910.07655v4","updated":"2024-03-31T02:57:09Z","published":"2019-10-16T06:35:50Z","title":"Deep Semantic Segmentation of Natural and Medical Images: A Review","summary":" The semantic image segmentation task consists of classifying each pixel of an\nimage into an instance, where each instance corresponds to a class. This task\nis a part of the concept of scene understanding or better explaining the global\ncontext of an image. In the medical image analysis domain, image segmentation\ncan be used for image-guided interventions, radiotherapy, or improved\nradiological diagnostics. 
In this review, we categorize the leading deep\nlearning-based medical and non-medical image segmentation solutions into six\nmain groups of deep architectural, data synthesis-based, loss function-based,\nsequenced models, weakly supervised, and multi-task methods and provide a\ncomprehensive review of the contributions in each of these groups. Further, for\neach group, we analyze each variant of these groups and discuss the limitations\nof the current approaches and present potential future research directions for\nsemantic image segmentation.\n","authors":["Saeid Asgari Taghanaki","Kumar Abhishek","Joseph Paul Cohen","Julien Cohen-Adad","Ghassan Hamarneh"],"pdf_url":"https://arxiv.org/pdf/1910.07655v4.pdf","comment":"45 pages, 16 figures. Accepted for publication in Springer Artificial\n Intelligence Review"},{"id":"http://arxiv.org/abs/2404.00532v1","updated":"2024-03-31T02:16:16Z","published":"2024-03-31T02:16:16Z","title":"LLMs are Good Action Recognizers","summary":" Skeleton-based action recognition has attracted lots of research attention.\nRecently, to build an accurate skeleton-based action recognizer, a variety of\nworks have been proposed. Among them, some works use large model architectures\nas backbones of their recognizers to boost the skeleton data representation\ncapability, while some other works pre-train their recognizers on external data\nto enrich the knowledge. In this work, we observe that large language models\nwhich have been extensively used in various natural language processing tasks\ngenerally hold both large model architectures and rich implicit knowledge.\nMotivated by this, we propose a novel LLM-AR framework, in which we investigate\ntreating the Large Language Model as an Action Recognizer. In our framework, we\npropose a linguistic projection process to project each input action signal\n(i.e., each skeleton sequence) into its ``sentence format'' (i.e., an ``action\nsentence''). Moreover, we also incorporate our framework with several designs\nto further facilitate this linguistic projection process. Extensive experiments\ndemonstrate the efficacy of our proposed framework.\n","authors":["Haoxuan Qu","Yujun Cai","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2404.00532v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00524v1","updated":"2024-03-31T01:58:04Z","published":"2024-03-31T01:58:04Z","title":"TexVocab: Texture Vocabulary-conditioned Human Avatars","summary":" To adequately utilize the available image evidence in multi-view video-based\navatar modeling, we propose TexVocab, a novel avatar representation that\nconstructs a texture vocabulary and associates body poses with texture maps for\nanimation. Given multi-view RGB videos, our method initially back-projects all\nthe available images in the training videos to the posed SMPL surface,\nproducing texture maps in the SMPL UV domain. Then we construct pairs of human\nposes and texture maps to establish a texture vocabulary for encoding dynamic\nhuman appearances under various poses. Unlike the commonly used joint-wise\nmanner, we further design a body-part-wise encoding strategy to learn the\nstructural effects of the kinematic chain. Given a driving pose, we query the\npose feature hierarchically by decomposing the pose vector into several body\nparts and interpolating the texture features for synthesizing fine-grained\nhuman dynamics. 
Overall, our method is able to create animatable human avatars\nwith detailed and dynamic appearances from RGB videos, and the experiments show\nthat our method outperforms state-of-the-art approaches. The project page can\nbe found at https://texvocab.github.io/.\n","authors":["Yuxiao Liu","Zhe Li","Yebin Liu","Haoqian Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00524v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00513v1","updated":"2024-03-31T01:20:16Z","published":"2024-03-31T01:20:16Z","title":"Transformer based Pluralistic Image Completion with Reduced Information\n Loss","summary":" Transformer based methods have achieved great success in image inpainting\nrecently. However, we find that these solutions regard each pixel as a token,\nthus suffering from an information loss issue from two aspects: 1) They\ndownsample the input image into much lower resolutions for efficiency\nconsideration. 2) They quantize $256^3$ RGB values to a small number (such as\n512) of quantized color values. The indices of quantized pixels are used as\ntokens for the inputs and prediction targets of the transformer. To mitigate\nthese issues, we propose a new transformer based framework called \"PUT\".\nSpecifically, to avoid input downsampling while maintaining computation\nefficiency, we design a patch-based auto-encoder P-VQVAE. The encoder converts\nthe masked image into non-overlapped patch tokens and the decoder recovers the\nmasked regions from the inpainted tokens while keeping the unmasked regions\nunchanged. To eliminate the information loss caused by input quantization, an\nUn-quantized Transformer is applied. It directly takes features from the\nP-VQVAE encoder as input without any quantization and only regards the\nquantized tokens as prediction targets. Furthermore, to make the inpainting\nprocess more controllable, we introduce semantic and structural conditions as\nextra guidance. Extensive experiments show that our method greatly outperforms\nexisting transformer based methods on image fidelity and achieves much higher\ndiversity and better fidelity than state-of-the-art pluralistic inpainting\nmethods on complex large-scale datasets (e.g., ImageNet). Codes are available\nat https://github.com/liuqk3/PUT.\n","authors":["Qiankun Liu","Yuqi Jiang","Zhentao Tan","Dongdong Chen","Ying Fu","Qi Chu","Gang Hua","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2404.00513v1.pdf","comment":"Accepted by TPAMI (2024)"},{"id":"http://arxiv.org/abs/2403.17719v2","updated":"2024-03-31T01:14:36Z","published":"2024-03-25T05:21:26Z","title":"Resolution Limit of Single-Photon LiDAR","summary":" Single-photon Light Detection and Ranging (LiDAR) systems are often equipped\nwith an array of detectors for improved spatial resolution and sensing speed.\nHowever, given a fixed amount of flux produced by the laser transmitter across\nthe scene, the per-pixel Signal-to-Noise Ratio (SNR) will decrease when more\npixels are packed in a unit space. This presents a fundamental trade-off\nbetween the spatial resolution of the sensor array and the SNR received at each\npixel. Theoretical characterization of this fundamental limit is explored. By\nderiving the photon arrival statistics and introducing a series of new\napproximation techniques, the Mean Squared Error (MSE) of the\nmaximum-likelihood estimator of the time delay is derived. The theoretical\npredictions align well with simulations and real data.\n","authors":["Stanley H. Chan","Hashan K. 
Weerasooriya","Weijian Zhang","Pamela Abshire","Istvan Gyongy","Robert K. Henderson"],"pdf_url":"https://arxiv.org/pdf/2403.17719v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00510v1","updated":"2024-03-31T01:05:28Z","published":"2024-03-31T01:05:28Z","title":"Denoising Low-dose Images Using Deep Learning of Time Series Images","summary":" Digital image devices have been widely applied in many fields, including\nscientific imaging, recognition of individuals, and remote sensing. As the\napplication of these imaging technologies to autonomous driving and\nmeasurement, image noise generated when observation cannot be performed with a\nsufficient dose has become a major problem. Machine learning denoise technology\nis expected to be the solver of this problem, but there are the following\nproblems. Here we report, artifacts generated by machine learning denoise in\nultra-low dose observation using an in-situ observation video of an electron\nmicroscope as an example. And as a method to solve this problem, we propose a\nmethod to decompose a time series image into a 2D image of the spatial axis and\ntime to perform machine learning denoise. Our method opens new avenues accurate\nand stable reconstruction of continuous high-resolution images from low-dose\nimaging in science, industry, and life.\n","authors":["Yang Shao","Toshie Yaguchi","Toshiaki Tanigaki"],"pdf_url":"https://arxiv.org/pdf/2404.00510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00509v1","updated":"2024-03-31T00:59:10Z","published":"2024-03-31T00:59:10Z","title":"DailyMAE: Towards Pretraining Masked Autoencoders in One Day","summary":" Recently, masked image modeling (MIM), an important self-supervised learning\n(SSL) method, has drawn attention for its effectiveness in learning data\nrepresentation from unlabeled data. Numerous studies underscore the advantages\nof MIM, highlighting how models pretrained on extensive datasets can enhance\nthe performance of downstream tasks. However, the high computational demands of\npretraining pose significant challenges, particularly within academic\nenvironments, thereby impeding the SSL research progress. In this study, we\npropose efficient training recipes for MIM based SSL that focuses on mitigating\ndata loading bottlenecks and employing progressive training techniques and\nother tricks to closely maintain pretraining performance. Our library enables\nthe training of a MAE-Base/16 model on the ImageNet 1K dataset for 800 epochs\nwithin just 18 hours, using a single machine equipped with 8 A100 GPUs. By\nachieving speed gains of up to 5.8 times, this work not only demonstrates the\nfeasibility of conducting high-efficiency SSL training but also paves the way\nfor broader accessibility and promotes advancement in SSL research particularly\nfor prototyping and initial testing of SSL ideas. The code is available in\nhttps://github.com/erow/FastSSL.\n","authors":["Jiantao Wu","Shentong Mo","Sara Atito","Zhenhua Feng","Josef Kittler","Muhammad Awais"],"pdf_url":"https://arxiv.org/pdf/2404.00509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00504v1","updated":"2024-03-31T00:20:53Z","published":"2024-03-31T00:20:53Z","title":"NYC-Indoor-VPR: A Long-Term Indoor Visual Place Recognition Dataset with\n Semi-Automatic Annotation","summary":" Visual Place Recognition (VPR) in indoor environments is beneficial to humans\nand robots for better localization and navigation. 
It is challenging due to\nappearance changes at various frequencies, and difficulties of obtaining ground\ntruth metric trajectories for training and evaluation. This paper introduces\nthe NYC-Indoor-VPR dataset, a unique and rich collection of over 36,000 images\ncompiled from 13 distinct crowded scenes in New York City taken under varying\nlighting conditions with appearance changes. Each scene has multiple revisits\nacross a year. To establish the ground truth for VPR, we propose a\nsemiautomatic annotation approach that computes the positional information of\neach image. Our method specifically takes pairs of videos as input and yields\nmatched pairs of images along with their estimated relative locations. The\naccuracy of this matching is refined by human annotators, who utilize our\nannotation software to correlate the selected keyframes. Finally, we present a\nbenchmark evaluation of several state-of-the-art VPR algorithms using our\nannotated dataset, revealing its challenge and thus value for VPR research.\n","authors":["Diwei Sheng","Anbang Yang","John-Ross Rizzo","Chen Feng"],"pdf_url":"https://arxiv.org/pdf/2404.00504v1.pdf","comment":"7 pages, 7 figures, published in 2024 IEEE International Conference\n on Robotics and Automation (ICRA 2024)"}]},"2024-03-30T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2402.03769v2","updated":"2024-03-30T23:04:10Z","published":"2024-02-06T07:22:50Z","title":"AttackNet: Enhancing Biometric Security via Tailored Convolutional\n Neural Network Architectures for Liveness Detection","summary":" Biometric security is the cornerstone of modern identity verification and\nauthentication systems, where the integrity and reliability of biometric\nsamples is of paramount importance. This paper introduces AttackNet, a bespoke\nConvolutional Neural Network architecture, meticulously designed to combat\nspoofing threats in biometric systems. Rooted in deep learning methodologies,\nthis model offers a layered defense mechanism, seamlessly transitioning from\nlow-level feature extraction to high-level pattern discernment. Three\ndistinctive architectural phases form the crux of the model, each underpinned\nby judiciously chosen activation functions, normalization techniques, and\ndropout layers to ensure robustness and resilience against adversarial attacks.\nBenchmarking our model across diverse datasets affirms its prowess, showcasing\nsuperior performance metrics in comparison to contemporary models. Furthermore,\na detailed comparative analysis accentuates the model's efficacy, drawing\nparallels with prevailing state-of-the-art methodologies. Through iterative\nrefinement and an informed architectural strategy, AttackNet underscores the\npotential of deep learning in safeguarding the future of biometric security.\n","authors":["Oleksandr Kuznetsov","Dmytro Zakharov","Emanuele Frontoni","Andrea Maranesi"],"pdf_url":"https://arxiv.org/pdf/2402.03769v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09373v3","updated":"2024-03-30T22:10:36Z","published":"2023-03-16T15:01:50Z","title":"MAPSeg: Unified Unsupervised Domain Adaptation for Heterogeneous Medical\n Image Segmentation Based on 3D Masked Autoencoding and Pseudo-Labeling","summary":" Robust segmentation is critical for deriving quantitative measures from\nlarge-scale, multi-center, and longitudinal medical scans. Manually annotating\nmedical scans, however, is expensive and labor-intensive and may not always be\navailable in every domain. 
Unsupervised domain adaptation (UDA) is a\nwell-studied technique that alleviates this label-scarcity problem by\nleveraging available labels from another domain. In this study, we introduce\nMasked Autoencoding and Pseudo-Labeling Segmentation (MAPSeg), a\n$\\textbf{unified}$ UDA framework with great versatility and superior\nperformance for heterogeneous and volumetric medical image segmentation. To the\nbest of our knowledge, this is the first study that systematically reviews and\ndevelops a framework to tackle four different domain shifts in medical image\nsegmentation. More importantly, MAPSeg is the first framework that can be\napplied to $\\textbf{centralized}$, $\\textbf{federated}$, and\n$\\textbf{test-time}$ UDA while maintaining comparable performance. We compare\nMAPSeg with previous state-of-the-art methods on a private infant brain MRI\ndataset and a public cardiac CT-MRI dataset, and MAPSeg outperforms others by a\nlarge margin (10.5 Dice improvement on the private MRI dataset and 5.7 on the\npublic CT-MRI dataset). MAPSeg poses great practical value and can be applied\nto real-world problems. GitHub: https://github.com/XuzheZ/MAPSeg/.\n","authors":["Xuzhe Zhang","Yuhao Wu","Elsa Angelini","Ang Li","Jia Guo","Jerod M. Rasmussen","Thomas G. O'Connor","Pathik D. Wadhwa","Andrea Parolin Jackowski","Hai Li","Jonathan Posner","Andrew F. Laine","Yun Wang"],"pdf_url":"https://arxiv.org/pdf/2303.09373v3.pdf","comment":"CVPR 2024 camera-ready (8 pages, 3 figures) with the supplemental\n materials (5 pages, 4 figures). Xuzhe Zhang and Yuhao Wu are co-first\n authors. Andrew F. Laine and Yun Wang are co-senior supervising authors"},{"id":"http://arxiv.org/abs/2310.07889v2","updated":"2024-03-30T22:00:22Z","published":"2023-10-11T20:52:30Z","title":"LangNav: Language as a Perceptual Representation for Navigation","summary":" We explore the use of language as a perceptual representation for\nvision-and-language navigation (VLN), with a focus on low-data settings. Our\napproach uses off-the-shelf vision systems for image captioning and object\ndetection to convert an agent's egocentric panoramic view at each time step\ninto natural language descriptions. We then finetune a pretrained language\nmodel to select an action, based on the current view and the trajectory\nhistory, that would best fulfill the navigation instructions. In contrast to\nthe standard setup which adapts a pretrained language model to work directly\nwith continuous visual features from pretrained vision models, our approach\ninstead uses (discrete) language as the perceptual representation. We explore\nseveral use cases of our language-based navigation (LangNav) approach on the\nR2R VLN benchmark: generating synthetic trajectories from a prompted language\nmodel (GPT-4) with which to finetune a smaller language model; domain transfer\nwhere we transfer a policy learned on one simulated environment (ALFRED) to\nanother (more realistic) environment (R2R); and combining both vision- and\nlanguage-based representations for VLN. 
Our approach is found to improve upon\nbaselines that rely on visual features in settings where only a few expert\ntrajectories (10-100) are available, demonstrating the potential of language as\na perceptual representation for navigation.\n","authors":["Bowen Pan","Rameswar Panda","SouYoung Jin","Rogerio Feris","Aude Oliva","Phillip Isola","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2310.07889v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17791v2","updated":"2024-03-30T20:51:33Z","published":"2023-11-29T16:35:24Z","title":"U-Net v2: Rethinking the Skip Connections of U-Net for Medical Image\n Segmentation","summary":" In this paper, we introduce U-Net v2, a new robust and efficient U-Net\nvariant for medical image segmentation. It aims to augment the infusion of\nsemantic information into low-level features while simultaneously refining\nhigh-level features with finer details. For an input image, we begin by\nextracting multi-level features with a deep neural network encoder. Next, we\nenhance the feature map of each level by infusing semantic information from\nhigher-level features and integrating finer details from lower-level features\nthrough Hadamard product. Our novel skip connections empower features of all\nthe levels with enriched semantic characteristics and intricate details. The\nimproved features are subsequently transmitted to the decoder for further\nprocessing and segmentation. Our method can be seamlessly integrated into any\nEncoder-Decoder network. We evaluate our method on several public medical image\nsegmentation datasets for skin lesion segmentation and polyp segmentation, and\nthe experimental results demonstrate the segmentation accuracy of our new\nmethod over state-of-the-art methods, while preserving memory and computational\nefficiency. Code is available at: https://github.com/yaoppeng/U-Net_v2\n","authors":["Yaopeng Peng","Milan Sonka","Danny Z. Chen"],"pdf_url":"https://arxiv.org/pdf/2311.17791v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13659v2","updated":"2024-03-30T20:14:03Z","published":"2024-03-20T15:08:43Z","title":"Recursive Joint Cross-Modal Attention for Multimodal Fusion in\n Dimensional Emotion Recognition","summary":" Though multimodal emotion recognition has achieved significant progress over\nrecent years, the potential of rich synergic relationships across the\nmodalities is not fully exploited. In this paper, we introduce Recursive Joint\nCross-Modal Attention (RJCMA) to effectively capture both intra-and inter-modal\nrelationships across audio, visual and text modalities for dimensional emotion\nrecognition. In particular, we compute the attention weights based on\ncross-correlation between the joint audio-visual-text feature representations\nand the feature representations of individual modalities to simultaneously\ncapture intra- and inter-modal relationships across the modalities. The\nattended features of the individual modalities are again fed as input to the\nfusion model in a recursive mechanism to obtain more refined feature\nrepresentations. We have also explored Temporal Convolutional Networks (TCNs)\nto improve the temporal modeling of the feature representations of individual\nmodalities. Extensive experiments are conducted to evaluate the performance of\nthe proposed fusion model on the challenging Affwild2 dataset. 
By effectively\ncapturing the synergic intra- and inter-modal relationships across audio,\nvisual and text modalities, the proposed fusion model achieves a Concordance\nCorrelation Coefficient (CCC) of 0.585 (0.542) and 0.659 (0.619) for valence\nand arousal respectively on the validation set (test set). This shows a\nsignificant improvement over the baseline of 0.24 (0.211) and 0.20 (0.191) for\nvalence and arousal respectively on the validation set (test set) of the\nvalence-arousal challenge of 6th Affective Behavior Analysis in-the-Wild (ABAW)\ncompetition.\n","authors":["R. Gnana Praveen","Jahangir Alam"],"pdf_url":"https://arxiv.org/pdf/2403.13659v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09383v3","updated":"2024-03-30T18:22:34Z","published":"2023-03-16T15:13:09Z","title":"Unifying Top-down and Bottom-up Scanpath Prediction Using Transformers","summary":" Most models of visual attention aim at predicting either top-down or\nbottom-up control, as studied using different visual search and free-viewing\ntasks. In this paper we propose the Human Attention Transformer (HAT), a single\nmodel that predicts both forms of attention control. HAT uses a novel\ntransformer-based architecture and a simplified foveated retina that\ncollectively create a spatio-temporal awareness akin to the dynamic visual\nworking memory of humans. HAT not only establishes a new state-of-the-art in\npredicting the scanpath of fixations made during target-present and\ntarget-absent visual search and ``taskless'' free viewing, but also makes human\ngaze behavior interpretable. Unlike previous methods that rely on a coarse grid\nof fixation cells and experience information loss due to fixation\ndiscretization, HAT features a sequential dense prediction architecture and\noutputs a dense heatmap for each fixation, thus avoiding discretizing\nfixations. HAT sets a new standard in computational attention, which emphasizes\neffectiveness, generality, and interpretability. HAT's demonstrated scope and\napplicability will likely inspire the development of new attention models that\ncan better predict human behavior in various attention-demanding scenarios.\nCode is available at https://github.com/cvlab-stonybrook/HAT.\n","authors":["Zhibo Yang","Sounak Mondal","Seoyoung Ahn","Ruoyu Xue","Gregory Zelinsky","Minh Hoai","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2303.09383v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2402.05079v2","updated":"2024-03-30T17:51:35Z","published":"2024-02-07T18:33:04Z","title":"Mamba-UNet: UNet-Like Pure Visual Mamba for Medical Image Segmentation","summary":" In recent advancements in medical image analysis, Convolutional Neural\nNetworks (CNN) and Vision Transformers (ViT) have set significant benchmarks.\nWhile the former excels in capturing local features through its convolution\noperations, the latter achieves remarkable global context understanding by\nleveraging self-attention mechanisms. However, both architectures exhibit\nlimitations in efficiently modeling long-range dependencies within medical\nimages, which is a critical aspect for precise segmentation. Inspired by the\nMamba architecture, known for its proficiency in handling long sequences and\nglobal contextual information with enhanced computational efficiency as a State\nSpace Model (SSM), we propose Mamba-UNet, a novel architecture that synergizes\nthe U-Net in medical image segmentation with Mamba's capability. 
Mamba-UNet\nadopts a pure Visual Mamba (VMamba)-based encoder-decoder structure, infused\nwith skip connections to preserve spatial information across different scales\nof the network. This design facilitates a comprehensive feature learning\nprocess, capturing intricate details and broader semantic contexts within\nmedical images. We introduce a novel integration mechanism within the VMamba\nblocks to ensure seamless connectivity and information flow between the encoder\nand decoder paths, enhancing the segmentation performance. We conducted\nexperiments on publicly available ACDC MRI Cardiac segmentation dataset, and\nSynapse CT Abdomen segmentation dataset. The results show that Mamba-UNet\noutperforms several types of UNet in medical image segmentation under the same\nhyper-parameter setting. The source code and baseline implementations are\navailable.\n","authors":["Ziyang Wang","Jian-Qing Zheng","Yichi Zhang","Ge Cui","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2402.05079v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18216v2","updated":"2024-03-30T17:00:18Z","published":"2023-05-29T17:00:40Z","title":"Towards minimizing efforts for Morphing Attacks -- Deep embeddings for\n morphing pair selection and improved Morphing Attack Detection","summary":" Face Morphing Attacks pose a threat to the security of identity documents,\nespecially with respect to a subsequent access control process, because it\nenables both individuals involved to exploit the same document. In this study,\nface embeddings serve two purposes: pre-selecting images for large-scale\nMorphing Attack generation and detecting potential Morphing Attacks. We build\nupon previous embedding studies in both use cases using the MagFace model. For\nthe first objective, we employ an pre-selection algorithm that pairs\nindividuals based on face embedding similarity. We quantify the attack\npotential of differently morphed face images to compare the usability of\npre-selection in automatically generating numerous successful Morphing Attacks.\nRegarding the second objective, we compare embeddings from two state-of-the-art\nface recognition systems in terms of their ability to detect Morphing Attacks.\nOur findings demonstrate that ArcFace and MagFace provide valuable face\nembeddings for image pre-selection. Both open-source and COTS face recognition\nsystems are susceptible to generated attacks, particularly when pre-selection\nis based on embeddings rather than random pairing which was only constrained by\nsoft biometrics. More accurate face recognition systems exhibit greater\nvulnerability to attacks, with COTS systems being the most susceptible.\nAdditionally, MagFace embeddings serve as a robust alternative for detecting\nmorphed face images compared to the previously used ArcFace embeddings. The\nresults endorse the advantages of face embeddings in more effective image\npre-selection for face morphing and accurate detection of morphed face images.\nThis is supported by extensive analysis of various designed attacks. 
The\nMagFace model proves to be a powerful alternative to the commonly used ArcFace\nmodel for both objectives, pre-selection and attack detection.\n","authors":["Roman Kessler","Kiran Raja","Juan Tapia","Christoph Busch"],"pdf_url":"https://arxiv.org/pdf/2305.18216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19314v2","updated":"2024-03-30T16:36:17Z","published":"2024-03-28T11:12:33Z","title":"Total-Decom: Decomposed 3D Scene Reconstruction with Minimal Interaction","summary":" Scene reconstruction from multi-view images is a fundamental problem in\ncomputer vision and graphics. Recent neural implicit surface reconstruction\nmethods have achieved high-quality results; however, editing and manipulating\nthe 3D geometry of reconstructed scenes remains challenging due to the absence\nof naturally decomposed object entities and complex object/background\ncompositions. In this paper, we present Total-Decom, a novel method for\ndecomposed 3D reconstruction with minimal human interaction. Our approach\nseamlessly integrates the Segment Anything Model (SAM) with hybrid\nimplicit-explicit neural surface representations and a mesh-based\nregion-growing technique for accurate 3D object decomposition. Total-Decom\nrequires minimal human annotations while providing users with real-time control\nover the granularity and quality of decomposition. We extensively evaluate our\nmethod on benchmark datasets and demonstrate its potential for downstream\napplications, such as animation and scene editing. The code is available at\nhttps://github.com/CVMI-Lab/Total-Decom.git.\n","authors":["Xiaoyang Lyu","Chirui Chang","Peng Dai","Yang-Tian Sun","Xiaojuan Qi"],"pdf_url":"https://arxiv.org/pdf/2403.19314v2.pdf","comment":"8 pages, 7 figures, accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2402.10038v2","updated":"2024-03-30T16:10:47Z","published":"2024-02-15T16:00:58Z","title":"RS-DPO: A Hybrid Rejection Sampling and Direct Preference Optimization\n Method for Alignment of Large Language Models","summary":" Reinforcement learning from human feedback (RLHF) has been extensively\nemployed to align large language models with user intent. However, proximal\npolicy optimization (PPO) based RLHF is occasionally unstable requiring\nsignificant hyperparameter finetuning, and computationally expensive to\nmaximize the estimated reward during alignment. Recently, direct preference\noptimization (DPO) is proposed to address those challenges. However, DPO relies\non contrastive responses generated from human annotator and alternative LLM,\ninstead of the policy model, limiting the effectiveness of the RLHF. In this\npaper, we addresses both challenges by systematically combining rejection\nsampling (RS) and DPO. Our proposed method, RS-DPO, initiates with the\ndevelopment of a supervised fine-tuned policy model (SFT). A varied set of k\nresponses per prompt are sampled directly from the SFT model. RS-DPO identifies\npairs of contrastive samples based on their reward distribution. Finally, we\napply DPO with the contrastive samples to align the model to human preference.\nOur experiments indicate that our proposed method effectively fine-tunes LLMs\nwith limited resource environments, leading to improved alignment with user\nintent. 
Furthermore, it outperforms existing methods, including RS, PPO, and\nDPO.\n","authors":["Saeed Khaki","JinJin Li","Lan Ma","Liu Yang","Prathap Ramachandra"],"pdf_url":"https://arxiv.org/pdf/2402.10038v2.pdf","comment":"16 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.10509v2","updated":"2024-03-30T16:09:14Z","published":"2023-08-21T06:50:29Z","title":"An Examination of the Compositionality of Large Generative\n Vision-Language Models","summary":" With the success of Large Language Models (LLMs), many Generative\nVision-Language Models (GVLMs) have been constructed via multimodal instruction\ntuning. However, the performance of GVLMs in multimodal compositional reasoning\nremains under-explored. In this paper, we examine both the evaluation metrics\n(VisualGPTScore, etc.) and current benchmarks for evaluating the\ncompositionality of GVLMs. We identify the syntactical bias in current\nbenchmarks, which is exploited by the linguistic capability of GVLMs. The bias\nrenders VisualGPTScore an insufficient metric for assessing GVLMs. To combat\nthis, we first introduce a SyntaxBias Score, leveraging LLMs to quantify such\nbias for mitigation. A challenging new task is subsequently added to evaluate\nthe robustness of GVLMs against inherent inclination toward syntactical\ncorrectness. Using the bias-mitigated datasets and the new task, we propose a\nnovel benchmark, namely SyntActically DE-biased benchmark (SADE). Our study\nprovides an unbiased benchmark for the compositionality of GVLMs, facilitating\nfuture research in this direction (Code and dataset are available at\nhttps://github.com/TeleeMa/SADE).\n","authors":["Teli Ma","Rong Li","Junwei Liang"],"pdf_url":"https://arxiv.org/pdf/2308.10509v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05471v2","updated":"2024-03-30T16:06:36Z","published":"2023-07-11T17:56:22Z","title":"Scale Alone Does not Improve Mechanistic Interpretability in Vision\n Models","summary":" In light of the recent widespread adoption of AI systems, understanding the\ninternal information processing of neural networks has become increasingly\ncritical. Most recently, machine vision has seen remarkable progress by scaling\nneural networks to unprecedented levels in dataset and model size. We here ask\nwhether this extraordinary increase in scale also positively impacts the field\nof mechanistic interpretability. In other words, has our understanding of the\ninner workings of scaled neural networks improved as well? We use a\npsychophysical paradigm to quantify one form of mechanistic interpretability\nfor a diverse suite of nine models and find no scaling effect for\ninterpretability - neither for model nor dataset size. Specifically, none of\nthe investigated state-of-the-art models are easier to interpret than the\nGoogLeNet model from almost a decade ago. Latest-generation vision models\nappear even less interpretable than older architectures, hinting at a\nregression rather than improvement, with modern models sacrificing\ninterpretability for accuracy. These results highlight the need for models\nexplicitly designed to be mechanistically interpretable and the need for more\nhelpful interpretability methods to increase our understanding of networks at\nan atomic level. 
We release a dataset containing more than 130'000 human\nresponses from our psychophysical evaluation of 767 units across nine models.\nThis dataset facilitates research on automated instead of human-based\ninterpretability evaluations, which can ultimately be leveraged to directly\noptimize the mechanistic interpretability of models.\n","authors":["Roland S. Zimmermann","Thomas Klein","Wieland Brendel"],"pdf_url":"https://arxiv.org/pdf/2307.05471v2.pdf","comment":"Spotlight at NeurIPS 2023. The first two authors contributed equally.\n Code available at https://brendel-group.github.io/imi/"},{"id":"http://arxiv.org/abs/2403.13589v2","updated":"2024-03-30T15:53:31Z","published":"2024-03-20T13:37:29Z","title":"ReGround: Improving Textual and Spatial Grounding at No Cost","summary":" When an image generation process is guided by both a text prompt and spatial\ncues, such as a set of bounding boxes, do these elements work in harmony, or\ndoes one dominate the other? Our analysis of a pretrained image diffusion model\nthat integrates gated self-attention into the U-Net reveals that spatial\ngrounding often outweighs textual grounding due to the sequential flow from\ngated self-attention to cross-attention. We demonstrate that such bias can be\nsignificantly mitigated without sacrificing accuracy in either grounding by\nsimply rewiring the network architecture, changing from sequential to parallel\nfor gated self-attention and cross-attention. This surprisingly simple yet\neffective solution does not require any fine-tuning of the network but\nsignificantly reduces the trade-off between the two groundings. Our experiments\ndemonstrate significant improvements from the original GLIGEN to the rewired\nversion in the trade-off between textual grounding and spatial grounding.\n","authors":["Yuseung Lee","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2403.13589v2.pdf","comment":"Project page: https://re-ground.github.io/"},{"id":"http://arxiv.org/abs/2401.17759v3","updated":"2024-03-30T15:46:35Z","published":"2024-01-31T11:36:12Z","title":"Rapid post-disaster infrastructure damage characterisation enabled by\n remote sensing and deep learning technologies -- a tiered approach","summary":" Critical infrastructure is systematically targeted during wars and extensive\nnatural disasters because critical infrastructure is vital for enabling\nconnectivity and transportation of people and goods, and hence, underpins\nnational and international economic growth. Mass destruction of transport\nassets, in conjunction with minimal or no accessibility in the wake of natural\nand anthropogenic disasters, prevents us from delivering rapid recovery and\nadaptation. A solution to this challenge is to use technology that enables\nstand-off observations. Nevertheless, no methods exist for the integrated\ncharacterisation of damage at multiple scales, i.e. regional, asset, and\nstructural scales, while there is no systematic correlation between\ninfrastructure damage assessments across these scales. We propose a methodology\nbased on an integrated multi-scale tiered approach to fill this capability gap.\nIn doing so, we demonstrate how damage characterisation can be enabled by\nfit-for-purpose digital technologies. Next, the methodology is applied and\nvalidated on a case study in Ukraine that includes 17 bridges, all damaged by\nhuman-targeted interventions. 
From macro to micro, we deploy technology to\nintegrate assessments at scale, using Sentinel-1 SAR images, crowdsourced\ninformation, high-resolution images, and deep learning to characterise\ninfrastructure damage. For the first time, the interferometric coherence\ndifference and semantic segmentation of images were deployed to improve the\nreliability of damage characterisations at different scales, i.e. regional,\ninfrastructure asset and component, with the aim of enhancing the damage\ncharacterisation accuracy. This integrated approach accelerates\ndecision-making, and therefore, facilitates more efficient restoration and\nadaptation efforts, ultimately building resilience into our infrastructure.\n","authors":["Nadiia Kopiika","Andreas Karavias","Pavlos Krassakis","Zehao Ye","Jelena Ninic","Nataliya Shakhovska","Nikolaos Koukouzas","Sotirios Argyroudis","Stergios-Aristoteles Mitoulis"],"pdf_url":"https://arxiv.org/pdf/2401.17759v3.pdf","comment":"Main text (33 pages,15 figures); Supplementary materials (19 pages)"},{"id":"http://arxiv.org/abs/2312.09238v2","updated":"2024-03-30T15:35:16Z","published":"2023-12-14T18:58:12Z","title":"Auto MC-Reward: Automated Dense Reward Design with Large Language Models\n for Minecraft","summary":" Many reinforcement learning environments (e.g., Minecraft) provide only\nsparse rewards that indicate task completion or failure with binary values. The\nchallenge of exploration efficiency in such environments makes it difficult for\nreinforcement-learning-based agents to learn complex tasks. To address this,\nthis paper introduces an advanced learning system, named Auto MC-Reward, that\nleverages Large Language Models (LLMs) to automatically design dense reward\nfunctions, thereby enhancing the learning efficiency. Auto MC-Reward consists\nof three important components: Reward Designer, Reward Critic, and Trajectory\nAnalyzer. Given the environment information and task descriptions, the Reward\nDesigner first designs the reward function by coding an executable Python\nfunction with predefined observation inputs. Then, our Reward Critic will be\nresponsible for verifying the code, checking whether the code is\nself-consistent and free of syntax and semantic errors. Further, the Trajectory\nAnalyzer summarizes possible failure causes and provides refinement suggestions\naccording to collected trajectories. In the next round, the Reward Designer will\nfurther refine and iterate the dense reward function based on feedback.\nExperiments demonstrate a significant improvement in the success rate and\nlearning efficiency of our agents in complex tasks in Minecraft, such as\nobtaining diamonds while efficiently avoiding lava, and efficiently\nexploring trees and animals that are sparse in the plains biome.\n","authors":["Hao Li","Xue Yang","Zhaokai Wang","Xizhou Zhu","Jie Zhou","Yu Qiao","Xiaogang Wang","Hongsheng Li","Lewei Lu","Jifeng Dai"],"pdf_url":"https://arxiv.org/pdf/2312.09238v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2402.16774v2","updated":"2024-03-30T14:38:44Z","published":"2024-02-26T17:45:00Z","title":"Video-Based Autism Detection with Deep Learning","summary":" Individuals with Autism Spectrum Disorder (ASD) often experience challenges\nin health, communication, and sensory processing; therefore, early diagnosis is\nnecessary for proper treatment and care. In this work, we consider the problem\nof detecting or classifying ASD children to aid medical professionals in early\ndiagnosis. 
We develop a deep learning model that analyzes video clips of\nchildren reacting to sensory stimuli, with the intent of capturing key\ndifferences in reactions and behavior between ASD and non-ASD participants.\nUnlike many recent studies in ASD classification with MRI data, which require\nexpensive specialized equipment, our method utilizes a powerful but relatively\naffordable GPU, a standard computer setup, and a video camera for inference.\nResults show that our model effectively generalizes and understands key\ndifferences in the distinct movements of the children. It is noteworthy that\nour model exhibits successful classification performance despite the limited\namount of data for a deep learning problem and limited temporal information\navailable for learning, even with the motion artifacts.\n","authors":["M. Serna-Aguilera","X. B. Nguyen","A. Singh","L. Rockers","S. Park","L. Neely","H. Seo","K. Luu"],"pdf_url":"https://arxiv.org/pdf/2402.16774v2.pdf","comment":"Poster Abstract. Accepted into 2024 IEEE Green Technologies\n Conference"},{"id":"http://arxiv.org/abs/2311.15855v2","updated":"2024-03-30T14:21:40Z","published":"2023-11-27T14:22:07Z","title":"SiTH: Single-view Textured Human Reconstruction with Image-Conditioned\n Diffusion","summary":" A long-standing goal of 3D human reconstruction is to create lifelike and\nfully detailed 3D humans from single-view images. The main challenge lies in\ninferring unknown body shapes, appearances, and clothing details in areas not\nvisible in the images. To address this, we propose SiTH, a novel pipeline that\nuniquely integrates an image-conditioned diffusion model into a 3D mesh\nreconstruction workflow. At the core of our method lies the decomposition of\nthe challenging single-view reconstruction problem into generative\nhallucination and reconstruction subproblems. For the former, we employ a\npowerful generative diffusion model to hallucinate unseen back-view appearance\nbased on the input images. For the latter, we leverage skinned body meshes as\nguidance to recover full-body texture meshes from the input and back-view\nimages. SiTH requires as few as 500 3D human scans for training while\nmaintaining its generality and robustness to diverse images. Extensive\nevaluations on two 3D human benchmarks, including our newly created one,\nhighlighted our method's superior accuracy and perceptual quality in 3D\ntextured human reconstruction. Our code and evaluation benchmark are available\nat https://ait.ethz.ch/sith\n","authors":["Hsuan-I Ho","Jie Song","Otmar Hilliges"],"pdf_url":"https://arxiv.org/pdf/2311.15855v2.pdf","comment":"23 pages, 23 figures, CVPR 2024"},{"id":"http://arxiv.org/abs/2312.03029v2","updated":"2024-03-30T14:19:10Z","published":"2023-12-05T11:01:44Z","title":"Gaussian Head Avatar: Ultra High-fidelity Head Avatar via Dynamic\n Gaussians","summary":" Creating high-fidelity 3D head avatars has always been a research hotspot,\nbut there remains a great challenge under lightweight sparse view setups. In\nthis paper, we propose Gaussian Head Avatar represented by controllable 3D\nGaussians for high-fidelity head avatar modeling. We optimize the neutral 3D\nGaussians and a fully learned MLP-based deformation field to capture complex\nexpressions. The two parts benefit each other, thereby our method can model\nfine-grained dynamic details while ensuring expression accuracy. 
Furthermore,\nwe devise a well-designed geometry-guided initialization strategy based on\nimplicit SDF and Deep Marching Tetrahedra for the stability and convergence of\nthe training procedure. Experiments show our approach outperforms other\nstate-of-the-art sparse-view methods, achieving ultra high-fidelity rendering\nquality at 2K resolution even under exaggerated expressions.\n","authors":["Yuelang Xu","Benwang Chen","Zhe Li","Hongwen Zhang","Lizhen Wang","Zerong Zheng","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2312.03029v2.pdf","comment":"Projectpage: https://yuelangx.github.io/gaussianheadavatar, Code:\n https://github.com/YuelangX/Gaussian-Head-Avatar"},{"id":"http://arxiv.org/abs/2312.09313v3","updated":"2024-03-30T14:01:27Z","published":"2023-12-14T19:38:06Z","title":"LatentEditor: Text Driven Local Editing of 3D Scenes","summary":" While neural fields have made significant strides in view synthesis and scene\nreconstruction, editing them poses a formidable challenge due to their implicit\nencoding of geometry and texture information from multi-view inputs. In this\npaper, we introduce \\textsc{LatentEditor}, an innovative framework designed to\nempower users with the ability to perform precise and locally controlled\nediting of neural fields using text prompts. Leveraging denoising diffusion\nmodels, we successfully embed real-world scenes into the latent space,\nresulting in a faster and more adaptable NeRF backbone for editing compared to\ntraditional methods. To enhance editing precision, we introduce a delta score\nto calculate the 2D mask in the latent space that serves as a guide for local\nmodifications while preserving irrelevant regions. Our novel pixel-level\nscoring approach harnesses the power of InstructPix2Pix (IP2P) to discern the\ndisparity between IP2P conditional and unconditional noise predictions in the\nlatent space. The edited latents conditioned on the 2D masks are then\niteratively updated in the training set to achieve 3D local editing. Our\napproach achieves faster editing speeds and superior output quality compared to\nexisting 3D editing models, bridging the gap between textual instructions and\nhigh-quality 3D scene editing in latent space. We show the superiority of our\napproach on four benchmark 3D datasets, LLFF, IN2N, NeRFStudio and NeRF-Art.\n","authors":["Umar Khalid","Hasan Iqbal","Nazmul Karim","Jing Hua","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2312.09313v3.pdf","comment":"Project Page: https://latenteditor.github.io/"},{"id":"http://arxiv.org/abs/2403.19002v2","updated":"2024-03-30T14:00:27Z","published":"2024-03-27T20:52:30Z","title":"Robust Active Speaker Detection in Noisy Environments","summary":" This paper addresses the issue of active speaker detection (ASD) in noisy\nenvironments and formulates a robust active speaker detection (rASD) problem.\nExisting ASD approaches leverage both audio and visual modalities, but\nnon-speech sounds in the surrounding environment can negatively impact\nperformance. To overcome this, we propose a novel framework that utilizes\naudio-visual speech separation as guidance to learn noise-free audio features.\nThese features are then utilized in an ASD model, and both tasks are jointly\noptimized in an end-to-end framework. Our proposed framework mitigates residual\nnoise and audio quality reduction issues that can occur in a naive cascaded\ntwo-stage framework that directly uses separated speech for ASD, and enables\nthe two tasks to be optimized simultaneously. 
To further enhance the robustness\nof the audio features and handle inherent speech noises, we propose a dynamic\nweighted loss approach to train the speech separator. We also collected a\nreal-world noise audio dataset to facilitate investigations. Experiments\ndemonstrate that non-speech audio noises significantly impact ASD models, and\nour proposed approach improves ASD performance in noisy environments. The\nframework is general and can be applied to different ASD approaches to improve\ntheir robustness. Our code, models, and data will be released.\n","authors":["Siva Sai Nagender Vasireddy","Chenxu Zhang","Xiaohu Guo","Yapeng Tian"],"pdf_url":"https://arxiv.org/pdf/2403.19002v2.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2205.14375v5","updated":"2024-03-30T13:49:58Z","published":"2022-05-28T09:08:50Z","title":"WaveMix: A Resource-efficient Neural Network for Image Analysis","summary":" We propose a novel neural architecture for computer vision -- WaveMix -- that\nis resource-efficient and yet generalizable and scalable. While using fewer\ntrainable parameters, GPU RAM, and computations, WaveMix networks achieve\ncomparable or better accuracy than the state-of-the-art convolutional neural\nnetworks, vision transformers, and token mixers for several tasks. This\nefficiency can translate to savings in time, cost, and energy. To achieve these\ngains we used multi-level two-dimensional discrete wavelet transform (2D-DWT)\nin WaveMix blocks, which has the following advantages: (1) It reorganizes\nspatial information based on three strong image priors -- scale-invariance,\nshift-invariance, and sparseness of edges -- (2) in a lossless manner without\nadding parameters, (3) while also reducing the spatial sizes of feature maps,\nwhich reduces the memory and time required for forward and backward passes, and\n(4) expanding the receptive field faster than convolutions do. The whole\narchitecture is a stack of self-similar and resolution-preserving WaveMix\nblocks, which allows architectural flexibility for various tasks and levels of\nresource availability. WaveMix establishes new benchmarks for segmentation on\nCityscapes; and for classification on Galaxy 10 DECals, Places-365, five EMNIST\ndatasets, and iNAT-mini and performs competitively on other benchmarks. Our\ncode and trained models are publicly available.\n","authors":["Pranav Jeevan","Kavitha Viswanathan","Anandu A S","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2205.14375v5.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.18035v2","updated":"2024-03-30T13:28:54Z","published":"2024-03-26T18:40:36Z","title":"Bidirectional Consistency Models","summary":" Diffusion models (DMs) are capable of generating remarkably high-quality\nsamples by iteratively denoising a random vector, a process that corresponds to\nmoving along the probability flow ordinary differential equation (PF ODE).\nInterestingly, DMs can also invert an input image to noise by moving backward\nalong the PF ODE, a key operation for downstream tasks such as interpolation\nand image editing. However, the iterative nature of this process restricts its\nspeed, hindering its broader application. Recently, Consistency Models (CMs)\nhave emerged to address this challenge by approximating the integral of the PF\nODE, largely reducing the number of iterations. Yet, the absence of an explicit\nODE solver complicates the inversion process. 
To resolve this, we introduce the\nBidirectional Consistency Model (BCM), which learns a single neural network\nthat enables both forward and backward traversal along the PF ODE, efficiently\nunifying generation and inversion tasks within one framework. Notably, our\nproposed method enables one-step generation and inversion while also allowing\nthe use of additional steps to enhance generation quality or reduce\nreconstruction error. Furthermore, by leveraging our model's bidirectional\nconsistency, we introduce a sampling strategy that can enhance FID while\npreserving the generated image content. We further showcase our model's\ncapabilities in several downstream tasks, such as interpolation and inpainting,\nand present demonstrations of potential applications, including blind\nrestoration of compressed images and defending black-box adversarial attacks.\n","authors":["Liangchen Li","Jiajun He"],"pdf_url":"https://arxiv.org/pdf/2403.18035v2.pdf","comment":"40 pages, 25 figures"},{"id":"http://arxiv.org/abs/2403.15952v2","updated":"2024-03-30T13:21:42Z","published":"2024-03-23T23:06:32Z","title":"IllusionVQA: A Challenging Optical Illusion Dataset for Vision Language\n Models","summary":" The advent of Vision Language Models (VLM) has allowed researchers to\ninvestigate the visual understanding of a neural network using natural\nlanguage. Beyond object classification and detection, VLMs are capable of\nvisual comprehension and common-sense reasoning. This naturally led to the\nquestion: How do VLMs respond when the image itself is inherently unreasonable?\nTo this end, we present IllusionVQA: a diverse dataset of challenging optical\nillusions and hard-to-interpret scenes to test the capability of VLMs in two\ndistinct multiple-choice VQA tasks - comprehension and soft localization.\nGPT4V, the best-performing VLM, achieves 62.99% accuracy (4-shot) on the\ncomprehension task and 49.7% on the localization task (4-shot and\nChain-of-Thought). Human evaluation reveals that humans achieve 91.03% and 100%\naccuracy in comprehension and localization. We discover that In-Context\nLearning (ICL) and Chain-of-Thought reasoning substantially degrade the\nperformance of GeminiPro on the localization task. Tangentially, we discover a\npotential weakness in the ICL capabilities of VLMs: they fail to locate optical\nillusions even when the correct answer is in the context window as a few-shot\nexample.\n","authors":["Haz Sameen Shahgir","Khondker Salman Sayeed","Abhik Bhattacharjee","Wasi Uddin Ahmad","Yue Dong","Rifat Shahriyar"],"pdf_url":"https://arxiv.org/pdf/2403.15952v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00081v2","updated":"2024-03-30T12:45:08Z","published":"2023-11-30T03:20:37Z","title":"Synthesize, Diagnose, and Optimize: Towards Fine-Grained Vision-Language\n Understanding","summary":" Vision language models (VLM) have demonstrated remarkable performance across\nvarious downstream tasks. However, understanding fine-grained visual-linguistic\nconcepts, such as attributes and inter-object relationships, remains a\nsignificant challenge. While several benchmarks aim to evaluate VLMs in finer\ngranularity, their primary focus remains on the linguistic aspect, neglecting\nthe visual dimension. Here, we highlight the importance of evaluating VLMs from\nboth a textual and visual perspective. We introduce a progressive pipeline to\nsynthesize images that vary in a specific attribute while ensuring consistency\nin all other aspects. 
Utilizing this data engine, we carefully design a\nbenchmark, SPEC, to diagnose the comprehension of object size, position,\nexistence, and count. Subsequently, we conduct a thorough evaluation of four\nleading VLMs on SPEC. Surprisingly, their performance is close to random guess,\nrevealing significant limitations. With this in mind, we propose a simple yet\neffective approach to optimize VLMs in fine-grained understanding, achieving\nsignificant improvements on SPEC without compromising the zero-shot\nperformance. Results on two additional fine-grained benchmarks also show\nconsistent improvements, further validating the transferability of our\napproach. Code and data are available at https://github.com/wjpoom/SPEC.\n","authors":["Wujian Peng","Sicheng Xie","Zuyao You","Shiyi Lan","Zuxuan Wu"],"pdf_url":"https://arxiv.org/pdf/2312.00081v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2312.14024v2","updated":"2024-03-30T12:39:06Z","published":"2023-12-21T16:54:09Z","title":"NICP: Neural ICP for 3D Human Registration at Scale","summary":" Aligning a template to 3D human point clouds is a long-standing problem\ncrucial for tasks like animation, reconstruction, and enabling supervised\nlearning pipelines. Recent data-driven methods leverage predicted surface\ncorrespondences; however, they are not robust to varied poses, identities, or\nnoise. In contrast, industrial solutions often rely on expensive manual\nannotations or multi-view capturing systems. Recently, neural fields have shown\npromising results. Still, their purely data-driven and extrinsic nature does\nnot incorporate any guidance toward the target surface, often resulting in a\ntrivial misalignment of the template registration. Currently, no method can be\nconsidered the standard for 3D Human registration, limiting the scalability of\ndownstream applications. In this work, we propose NSR, a pipeline that, for the\nfirst time, generalizes and scales across thousands of shapes and more than ten\ndifferent data sources. Our essential contribution is NICP, an ICP-style\nself-supervised task tailored to neural fields. NICP takes a few seconds, is\nself-supervised, and works out of the box on pre-trained neural fields. We\ncombine it with a localized Neural Field trained on a large MoCap dataset. NSR\nachieves the state of the art over public benchmarks, and the release of its\ncode and checkpoints will provide the community with a powerful tool useful for\nmany downstream tasks like dataset alignments, cleaning, or asset animation.\n","authors":["Riccardo Marin","Enric Corona","Gerard Pons-Moll"],"pdf_url":"https://arxiv.org/pdf/2312.14024v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16739v2","updated":"2024-03-30T12:08:08Z","published":"2023-11-28T12:35:13Z","title":"As-Plausible-As-Possible: Plausibility-Aware Mesh Deformation Using 2D\n Diffusion Priors","summary":" We present As-Plausible-as-Possible (APAP) mesh deformation technique that\nleverages 2D diffusion priors to preserve the plausibility of a mesh under\nuser-controlled deformation. Our framework uses per-face Jacobians to represent\nmesh deformations, where mesh vertex coordinates are computed via a\ndifferentiable Poisson Solve. The deformed mesh is rendered, and the resulting\n2D image is used in the Score Distillation Sampling (SDS) process, which\nenables extracting meaningful plausibility priors from a pretrained 2D\ndiffusion model. 
To better preserve the identity of the edited mesh, we\nfine-tune our 2D diffusion model with LoRA. Gradients extracted by SDS and a\nuser-prescribed handle displacement are then backpropagated to the per-face\nJacobians, and we use iterative gradient descent to compute the final\ndeformation that balances between the user edit and the output plausibility. We\nevaluate our method with 2D and 3D meshes and demonstrate qualitative and\nquantitative improvements when using plausibility priors over\ngeometry-preservation or distortion-minimization priors used by previous\ntechniques. Our project page is at: https://as-plausible-aspossible.github.io/\n","authors":["Seungwoo Yoo","Kunho Kim","Vladimir G. Kim","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2311.16739v2.pdf","comment":"Project page: https://as-plausible-as-possible.github.io/"},{"id":"http://arxiv.org/abs/2401.04071v2","updated":"2024-03-30T12:05:52Z","published":"2024-01-08T18:18:02Z","title":"Fun with Flags: Robust Principal Directions via Flag Manifolds","summary":" Principal component analysis (PCA), along with its extensions to manifolds\nand outlier contaminated data, have been indispensable in computer vision and\nmachine learning. In this work, we present a unifying formalism for PCA and its\nvariants, and introduce a framework based on the flags of linear subspaces, ie\na hierarchy of nested linear subspaces of increasing dimension, which not only\nallows for a common implementation but also yields novel variants, not explored\npreviously. We begin by generalizing traditional PCA methods that either\nmaximize variance or minimize reconstruction error. We expand these\ninterpretations to develop a wide array of new dimensionality reduction\nalgorithms by accounting for outliers and the data manifold. To devise a common\ncomputational approach, we recast robust and dual forms of PCA as optimization\nproblems on flag manifolds. We then integrate tangent space approximations of\nprincipal geodesic analysis (tangent-PCA) into this flag-based framework,\ncreating novel robust and dual geodesic PCA variations. The remarkable\nflexibility offered by the 'flagification' introduced here enables even more\nalgorithmic variants identified by specific flag types. Last but not least, we\npropose an effective convergent solver for these flag-formulations employing\nthe Stiefel manifold. Our empirical results on both real-world and synthetic\nscenarios, demonstrate the superiority of our novel algorithms, especially in\nterms of robustness to outliers on manifolds.\n","authors":["Nathan Mankovich","Gustau Camps-Valls","Tolga Birdal"],"pdf_url":"https://arxiv.org/pdf/2401.04071v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12463v2","updated":"2024-03-30T11:35:52Z","published":"2023-12-18T19:02:07Z","title":"Open Vocabulary Semantic Scene Sketch Understanding","summary":" We study the underexplored but fundamental vision problem of machine\nunderstanding of abstract freehand scene sketches. We introduce a sketch\nencoder that results in semantically-aware feature space, which we evaluate by\ntesting its performance on a semantic sketch segmentation task. To train our\nmodel we rely only on the availability of bitmap sketches with their brief\ncaptions and do not require any pixel-level annotations. To obtain\ngeneralization to a large set of sketches and categories, we build on a vision\ntransformer encoder pretrained with the CLIP model. 
We freeze the text encoder\nand perform visual-prompt tuning of the visual encoder branch while introducing\na set of critical modifications. Firstly, we augment the classical key-query\n(k-q) self-attention blocks with value-value (v-v) self-attention blocks.\nCentral to our model is a two-level hierarchical network design that enables\nefficient semantic disentanglement: The first level ensures holistic scene\nsketch encoding, and the second level focuses on individual categories. We,\nthen, in the second level of the hierarchy, introduce a cross-attention between\ntextual and visual branches. Our method outperforms zero-shot CLIP pixel\naccuracy of segmentation results by 37 points, reaching an accuracy of $85.5\\%$\non the FS-COCO sketch dataset. Finally, we conduct a user study that allows us\nto identify further improvements needed over our method to reconcile machine\nand human understanding of scene sketches.\n","authors":["Ahmed Bourouis","Judith Ellen Fan","Yulia Gryaditskaya"],"pdf_url":"https://arxiv.org/pdf/2312.12463v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08835v3","updated":"2024-03-30T11:01:17Z","published":"2023-11-15T10:22:35Z","title":"Correlation-guided Query-Dependency Calibration in Video Representation\n Learning for Temporal Grounding","summary":" Video Temporal Grounding is to identify specific moments or highlights from a\nvideo corresponding to textual descriptions. Typical approaches in temporal\ngrounding treat all video clips equally during the encoding process regardless\nof their semantic relevance with the text query. Therefore, we propose\nCorrelation-Guided DEtection TRansformer(CG-DETR), exploring to provide clues\nfor query-associated video clips within the cross-modal attention. First, we\ndesign an adaptive cross-attention with dummy tokens. Dummy tokens conditioned\nby text query take portions of the attention weights, preventing irrelevant\nvideo clips from being represented by the text query. Yet, not all words\nequally inherit the text query's correlation to video clips. Thus, we further\nguide the cross-attention map by inferring the fine-grained correlation between\nvideo clips and words. We enable this by learning a joint embedding space for\nhigh-level concepts, i.e., moment and sentence level, and inferring the\nclip-word correlation. Lastly, we exploit the moment-specific characteristics\nand combine them with the context of each video to form a moment-adaptive\nsaliency detector. By exploiting the degrees of text engagement in each video\nclip, it precisely measures the highlightness of each clip. CG-DETR achieves\nstate-of-the-art results on various benchmarks for temporal grounding.\n","authors":["WonJun Moon","Sangeek Hyun","SuBeen Lee","Jae-Pil Heo"],"pdf_url":"https://arxiv.org/pdf/2311.08835v3.pdf","comment":"34 pages, 16 figures, 13 tables, Code is available at\n https://github.com/wjun0830/CGDETR"},{"id":"http://arxiv.org/abs/2312.02244v2","updated":"2024-03-30T10:49:41Z","published":"2023-12-04T12:30:07Z","title":"Geometrically-driven Aggregation for Zero-shot 3D Point Cloud\n Understanding","summary":" Zero-shot 3D point cloud understanding can be achieved via 2D Vision-Language\nModels (VLMs). Existing strategies directly map Vision-Language Models from 2D\npixels of rendered or captured views to 3D points, overlooking the inherent and\nexpressible point cloud geometric structure. 
Geometrically similar or close\nregions can be exploited for bolstering point cloud understanding as they are\nlikely to share semantic information. To this end, we introduce the first\ntraining-free aggregation technique that leverages the point cloud's 3D\ngeometric structure to improve the quality of the transferred Vision-Language\nModels. Our approach operates iteratively, performing local-to-global\naggregation based on geometric and semantic point-level reasoning. We benchmark\nour approach on three downstream tasks, including classification, part\nsegmentation, and semantic segmentation, with a variety of datasets\nrepresenting both synthetic/real-world, and indoor/outdoor scenarios. Our\napproach achieves new state-of-the-art results in all benchmarks. We will\nrelease the source code publicly.\n","authors":["Guofeng Mei","Luigi Riz","Yiming Wang","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2312.02244v2.pdf","comment":"Zero-shot, point cloud, 2D Vision-Language Models, geometric\n structure, training-free"},{"id":"http://arxiv.org/abs/2312.01307v2","updated":"2024-03-30T10:46:34Z","published":"2023-12-03T07:22:42Z","title":"SAGE: Bridging Semantic and Actionable Parts for GEneralizable\n Manipulation of Articulated Objects","summary":" To interact with daily-life articulated objects of diverse structures and\nfunctionalities, understanding the object parts plays a central role in both\nuser instruction comprehension and task execution. However, the possible\ndiscordance between the semantic meaning and physics functionalities of the\nparts poses a challenge for designing a general system. To address this\nproblem, we propose SAGE, a novel framework that bridges semantic and\nactionable parts of articulated objects to achieve generalizable manipulation\nunder natural language instructions. More concretely, given an articulated\nobject, we first observe all the semantic parts on it, conditioned on which an\ninstruction interpreter proposes possible action programs that concretize the\nnatural language instruction. Then, a part-grounding module maps the semantic\nparts into so-called Generalizable Actionable Parts (GAParts), which inherently\ncarry information about part motion. End-effector trajectories are predicted on\nthe GAParts, which, together with the action program, form an executable\npolicy. Additionally, an interactive feedback module is incorporated to respond\nto failures, which closes the loop and increases the robustness of the overall\nframework. Key to the success of our framework is the joint proposal and\nknowledge fusion between a large vision-language model (VLM) and a small\ndomain-specific model for both context comprehension and part perception, with\nthe former providing general intuitions and the latter serving as expert facts.\nBoth simulation and real-robot experiments show our effectiveness in handling a\nlarge variety of articulated objects with diverse language-instructed goals.\n","authors":["Haoran Geng","Songlin Wei","Congyue Deng","Bokui Shen","He Wang","Leonidas Guibas"],"pdf_url":"https://arxiv.org/pdf/2312.01307v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13347v2","updated":"2024-03-30T09:45:38Z","published":"2024-03-20T07:15:22Z","title":"vid-TLDR: Training Free Token merging for Light-weight Video Transformer","summary":" Video Transformers have become the prevalent solution for various video\ndownstream tasks with superior expressive power and flexibility. 
However, these\nvideo transformers suffer from heavy computational costs induced by the massive\nnumber of tokens across the entire video frames, which has been the major\nbarrier to training the model. Further, the patches irrelevant to the main\ncontents, e.g., backgrounds, degrade the generalization performance of models.\nTo tackle these issues, we propose training-free token merging for lightweight\nvideo Transformer (vid-TLDR) that aims to enhance the efficiency of video\nTransformers by merging the background tokens without additional training. For\nvid-TLDR, we introduce a novel approach to capture the salient regions in\nvideos only with the attention map. Further, we introduce the saliency-aware\ntoken merging strategy by dropping the background tokens and sharpening the\nobject scores. Our experiments show that vid-TLDR significantly mitigates the\ncomputational complexity of video Transformers while achieving competitive\nperformance compared to the base model without vid-TLDR. Code is available at\nhttps://github.com/mlvlab/vid-TLDR.\n","authors":["Joonmyung Choi","Sanghyeok Lee","Jaewon Chu","Minhyuk Choi","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2403.13347v2.pdf","comment":"Conference on Computer Vision and Pattern Recognition (CVPR), 2024"},{"id":"http://arxiv.org/abs/2402.17726v3","updated":"2024-03-30T09:35:47Z","published":"2024-02-27T17:58:09Z","title":"VRP-SAM: SAM with Visual Reference Prompt","summary":" In this paper, we propose a novel Visual Reference Prompt (VRP) encoder that\nempowers the Segment Anything Model (SAM) to utilize annotated reference images\nas prompts for segmentation, creating the VRP-SAM model. In essence, VRP-SAM\ncan utilize annotated reference images to comprehend specific objects and\nperform segmentation of specific objects in the target image. It is worth noting that the\nVRP encoder can support a variety of annotation formats for reference images,\nincluding \textbf{point}, \textbf{box}, \textbf{scribble}, and \textbf{mask}.\nVRP-SAM achieves a breakthrough within the SAM framework by extending its\nversatility and applicability while preserving SAM's inherent strengths, thus\nenhancing user-friendliness. To enhance the generalization ability of VRP-SAM,\nthe VRP encoder adopts a meta-learning strategy. To validate the effectiveness\nof VRP-SAM, we conducted extensive empirical studies on the Pascal and COCO\ndatasets. Remarkably, VRP-SAM achieved state-of-the-art performance in visual\nreference segmentation with minimal learnable parameters. Furthermore, VRP-SAM\ndemonstrates strong generalization capabilities, allowing it to perform\nsegmentation of unseen objects and enabling cross-domain segmentation. The\nsource code and models will be available at\n\url{https://github.com/syp2ysy/VRP-SAM}\n","authors":["Yanpeng Sun","Jiahui Chen","Shan Zhang","Xinyu Zhang","Qiang Chen","Gang Zhang","Errui Ding","Jingdong Wang","Zechao Li"],"pdf_url":"https://arxiv.org/pdf/2402.17726v3.pdf","comment":"Accepted by CVPR 2024; The camera-ready version"},{"id":"http://arxiv.org/abs/2011.14598v4","updated":"2024-03-30T09:24:13Z","published":"2020-11-30T07:44:52Z","title":"Video Self-Stitching Graph Network for Temporal Action Localization","summary":" Temporal action localization (TAL) in videos is a challenging task,\nespecially due to the large variation in action temporal scales. Short actions\nusually occupy a major proportion in the datasets, but tend to have the lowest\nperformance. 
In this paper, we confront the challenge of short actions and\npropose a multi-level cross-scale solution dubbed as video self-stitching graph\nnetwork (VSGN). We have two key components in VSGN: video self-stitching (VSS)\nand cross-scale graph pyramid network (xGPN). In VSS, we focus on a short\nperiod of a video and magnify it along the temporal dimension to obtain a\nlarger scale. We stitch the original clip and its magnified counterpart in one\ninput sequence to take advantage of the complementary properties of both\nscales. The xGPN component further exploits the cross-scale correlations by a\npyramid of cross-scale graph networks, each containing a hybrid module to\naggregate features from across scales as well as within the same scale. Our\nVSGN not only enhances the feature representations, but also generates more\npositive anchors for short actions and more short training samples. Experiments\ndemonstrate that VSGN obviously improves the localization performance of short\nactions as well as achieving the state-of-the-art overall performance on\nTHUMOS-14 and ActivityNet-v1.3.\n","authors":["Chen Zhao","Ali Thabet","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2011.14598v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07710v2","updated":"2024-03-30T09:20:36Z","published":"2024-02-12T15:23:19Z","title":"Optimizing Sparse Convolution on GPUs with CUDA for 3D Point Cloud\n Processing in Embedded Systems","summary":" In recent years, there has been a significant increase in the utilization of\ndeep learning methods, particularly convolutional neural networks (CNNs), which\nhave emerged as the dominant approach in various domains that involve\nstructured grid data, such as picture analysis and processing. Nevertheless,\nthe exponential growth in the utilization of LiDAR and 3D sensors across many\ndomains has resulted in an increased need for the analysis of 3D point clouds.\nThe utilization of 3D point clouds is crucial in various applications,\nincluding object recognition and segmentation, as they offer a spatial\ndepiction of things within a three-dimensional environment. In contrast to\nphotos, point clouds exhibit sparsity and lack a regular grid, hence posing\ndistinct processing and computational issues.\n","authors":["Chester Luo","Kevin Lai"],"pdf_url":"https://arxiv.org/pdf/2402.07710v2.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2205.10490v2","updated":"2024-03-30T08:52:40Z","published":"2022-05-21T02:38:16Z","title":"Aligning Logits Generatively for Principled Black-Box Knowledge\n Distillation","summary":" Black-Box Knowledge Distillation (B2KD) is a formulated problem for\ncloud-to-edge model compression with invisible data and models hosted on the\nserver. B2KD faces challenges such as limited Internet exchange and edge-cloud\ndisparity of data distributions. In this paper, we formalize a two-step\nworkflow consisting of deprivatization and distillation, and theoretically\nprovide a new optimization direction from logits to cell boundary different\nfrom direct logits alignment. With its guidance, we propose a new method\nMapping-Emulation KD (MEKD) that distills a black-box cumbersome model into a\nlightweight one. Our method does not differentiate between treating soft or\nhard responses, and consists of: 1) deprivatization: emulating the inverse\nmapping of the teacher function with a generator, and 2) distillation: aligning\nlow-dimensional logits of the teacher and student models by reducing the\ndistance of high-dimensional image points. 
For different teacher-student pairs,\nour method yields inspiring distillation performance on various benchmarks, and\noutperforms the previous state-of-the-art approaches.\n","authors":["Jing Ma","Xiang Xiang","Ke Wang","Yuchuan Wu","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2205.10490v2.pdf","comment":"To appear at CVPR 2024; significantly rewritten with extra\n experiments since the preliminary report"},{"id":"http://arxiv.org/abs/2311.17516v4","updated":"2024-03-30T08:35:17Z","published":"2023-11-29T10:39:53Z","title":"MMA-Diffusion: MultiModal Attack on Diffusion Models","summary":" In recent years, Text-to-Image (T2I) models have seen remarkable\nadvancements, gaining widespread adoption. However, this progress has\ninadvertently opened avenues for potential misuse, particularly in generating\ninappropriate or Not-Safe-For-Work (NSFW) content. Our work introduces\nMMA-Diffusion, a framework that presents a significant and realistic threat to\nthe security of T2I models by effectively circumventing current defensive\nmeasures in both open-source models and commercial online services. Unlike\nprevious approaches, MMA-Diffusion leverages both textual and visual modalities\nto bypass safeguards like prompt filters and post-hoc safety checkers, thus\nexposing and highlighting the vulnerabilities in existing defense mechanisms.\n","authors":["Yijun Yang","Ruiyuan Gao","Xiaosen Wang","Tsung-Yi Ho","Nan Xu","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2311.17516v4.pdf","comment":"CVPR 2024. Our codes and benchmarks are available at\n https://github.com/cure-lab/MMA-Diffusion"},{"id":"http://arxiv.org/abs/2401.04105v2","updated":"2024-03-30T08:06:01Z","published":"2024-01-08T18:59:31Z","title":"Dr$^2$Net: Dynamic Reversible Dual-Residual Networks for\n Memory-Efficient Finetuning","summary":" Large pretrained models are increasingly crucial in modern computer vision\ntasks. These models are typically used in downstream tasks by end-to-end\nfinetuning, which is highly memory-intensive for tasks with high-resolution\ndata, e.g., video understanding, small object detection, and point cloud\nanalysis. In this paper, we propose Dynamic Reversible Dual-Residual Networks,\nor Dr$^2$Net, a novel family of network architectures that acts as a surrogate\nnetwork to finetune a pretrained model with substantially reduced memory\nconsumption. Dr$^2$Net contains two types of residual connections, one\nmaintaining the residual structure in the pretrained models, and the other\nmaking the network reversible. Due to its reversibility, intermediate\nactivations, which can be reconstructed from output, are cleared from memory\nduring training. We use two coefficients on either type of residual connections\nrespectively, and introduce a dynamic training strategy that seamlessly\ntransitions the pretrained model to a reversible network with much higher\nnumerical precision. 
We evaluate Dr$^2$Net on various pretrained models and\nvarious tasks, and show that it can reach comparable performance to\nconventional finetuning but with significantly less memory usage.\n","authors":["Chen Zhao","Shuming Liu","Karttikeya Mangalam","Guocheng Qian","Fatimah Zohra","Abdulmohsen Alghannam","Jitendra Malik","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2401.04105v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08869v2","updated":"2024-03-30T07:23:20Z","published":"2023-12-10T08:25:41Z","title":"I'M HOI: Inertia-aware Monocular Capture of 3D Human-Object Interactions","summary":" We are living in a world surrounded by diverse and \"smart\" devices with rich\nmodalities of sensing ability. Conveniently capturing the interactions between\nus humans and these objects remains far-reaching. In this paper, we present\nI'm-HOI, a monocular scheme to faithfully capture the 3D motions of both the\nhuman and object in a novel setting: using a minimal amount of RGB camera and\nobject-mounted Inertial Measurement Unit (IMU). It combines general motion\ninference and category-aware refinement. For the former, we introduce a\nholistic human-object tracking method to fuse the IMU signals and the RGB\nstream and progressively recover the human motions and subsequently the\ncompanion object motions. For the latter, we tailor a category-aware motion\ndiffusion model, which is conditioned on both the raw IMU observations and the\nresults from the previous stage under over-parameterization representation. It\nsignificantly refines the initial results and generates vivid body, hand, and\nobject motions. Moreover, we contribute a large dataset with ground truth human\nand object motions, dense RGB inputs, and rich object-mounted IMU measurements.\nExtensive experiments demonstrate the effectiveness of I'm-HOI under a hybrid\ncapture setting. Our dataset and code will be released to the community.\n","authors":["Chengfeng Zhao","Juze Zhang","Jiashen Du","Ziwei Shan","Junye Wang","Jingyi Yu","Jingya Wang","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2312.08869v2.pdf","comment":"Accepted to CVPR 2024. Project page:\n https://afterjourney00.github.io/IM-HOI.github.io/"},{"id":"http://arxiv.org/abs/2312.05716v2","updated":"2024-03-30T07:04:15Z","published":"2023-12-10T00:51:05Z","title":"Initialization Matters for Adversarial Transfer Learning","summary":" With the prevalence of the Pretraining-Finetuning paradigm in transfer\nlearning, the robustness of downstream tasks has become a critical concern. In\nthis work, we delve into adversarial robustness in transfer learning and reveal\nthe critical role of initialization, including both the pretrained model and\nthe linear head. First, we discover the necessity of an adversarially robust\npretrained model. Specifically, we reveal that with a standard pretrained\nmodel, Parameter-Efficient Finetuning (PEFT) methods either fail to be\nadversarially robust or continue to exhibit significantly degraded adversarial\nrobustness on downstream tasks, even with adversarial training during\nfinetuning. Leveraging a robust pretrained model, surprisingly, we observe that\na simple linear probing can outperform full finetuning and other PEFT methods\nwith random initialization on certain datasets. We further identify that linear\nprobing excels in preserving robustness from the robust pretraining. 
Based on\nthis, we propose Robust Linear Initialization (RoLI) for adversarial\nfinetuning, which initializes the linear head with the weights obtained by\nadversarial linear probing to maximally inherit the robustness from\npretraining. Across five different image classification datasets, we\ndemonstrate the effectiveness of RoLI and achieve new state-of-the-art results.\nOur code is available at \\url{https://github.com/DongXzz/RoLI}.\n","authors":["Andong Hua","Jindong Gu","Zhiyu Xue","Nicholas Carlini","Eric Wong","Yao Qin"],"pdf_url":"https://arxiv.org/pdf/2312.05716v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2307.12872v2","updated":"2024-03-30T06:59:35Z","published":"2023-07-24T15:10:22Z","title":"Latent Code Augmentation Based on Stable Diffusion for Data-free\n Substitute Attacks","summary":" Since the training data of the target model is not available in the black-box\nsubstitute attack, most recent schemes utilize GANs to generate data for\ntraining the substitute model. However, these GANs-based schemes suffer from\nlow training efficiency as the generator needs to be retrained for each target\nmodel during the substitute training process, as well as low generation\nquality. To overcome these limitations, we consider utilizing the diffusion\nmodel to generate data, and propose a novel data-free substitute attack scheme\nbased on the Stable Diffusion (SD) to improve the efficiency and accuracy of\nsubstitute training. Despite the data generated by the SD exhibiting high\nquality, it presents a different distribution of domains and a large variation\nof positive and negative samples for the target model. For this problem, we\npropose Latent Code Augmentation (LCA) to facilitate SD in generating data that\naligns with the data distribution of the target model. Specifically, we augment\nthe latent codes of the inferred member data with LCA and use them as guidance\nfor SD. With the guidance of LCA, the data generated by the SD not only meets\nthe discriminative criteria of the target model but also exhibits high\ndiversity. By utilizing this data, it is possible to train the substitute model\nthat closely resembles the target model more efficiently. Extensive experiments\ndemonstrate that our LCA achieves higher attack success rates and requires\nfewer query budgets compared to GANs-based schemes for different target models.\nOur codes are available at \\url{https://github.com/LzhMeng/LCA}.\n","authors":["Mingwen Shao","Lingzhuang Meng","Yuanjian Qiao","Lixu Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2307.12872v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2309.01327v2","updated":"2024-03-30T06:50:28Z","published":"2023-09-04T03:06:04Z","title":"Can I Trust Your Answer? Visually Grounded Video Question Answering","summary":" We study visually grounded VideoQA in response to the emerging trends of\nutilizing pretraining techniques for video-language understanding.\nSpecifically, by forcing vision-language models (VLMs) to answer questions and\nsimultaneously provide visual evidence, we seek to ascertain the extent to\nwhich the predictions of such techniques are genuinely anchored in relevant\nvideo content, versus spurious correlations from language or irrelevant visual\ncontext. 
Towards this, we construct NExT-GQA -- an extension of NExT-QA with\n10.5$K$ temporal grounding (or location) labels tied to the original QA pairs.\nWith NExT-GQA, we scrutinize a series of state-of-the-art VLMs. Through\npost-hoc attention analysis, we find that these models are extremely weak in\nsubstantiating the answers despite their strong QA performance. This exposes\nthe limitation of current VLMs in making reliable predictions. As a remedy, we\nfurther explore and propose a grounded-QA method via Gaussian mask optimization\nand cross-modal learning. Experiments with different backbones demonstrate that\nthis grounding mechanism improves both grounding and QA. With these efforts, we\naim to push towards trustworthy VLMs in VQA systems. Our dataset and code are\navailable at https://github.com/doc-doc/NExT-GQA.\n","authors":["Junbin Xiao","Angela Yao","Yicong Li","Tat Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2309.01327v2.pdf","comment":"Accepted to CVPR'24. (Compared with preprint version, we mainly\n improve the presentation, discuss more related works, and extend experiments\n in Appendix.)"},{"id":"http://arxiv.org/abs/2312.02224v2","updated":"2024-03-30T06:50:25Z","published":"2023-12-03T22:05:05Z","title":"Tracing Hyperparameter Dependencies for Model Parsing via Learnable\n Graph Pooling Network","summary":" Model Parsing defines the research task of predicting hyperparameters of the\ngenerative model (GM), given a generated image as input. Since a diverse set of\nhyperparameters is jointly employed by the generative model, and dependencies\noften exist among them, it is crucial to learn these hyperparameter\ndependencies for the improved model parsing performance. To explore such\nimportant dependencies, we propose a novel model parsing method called\nLearnable Graph Pooling Network (LGPN). Specifically, we transform model\nparsing into a graph node classification task, using graph nodes and edges to\nrepresent hyperparameters and their dependencies, respectively. Furthermore,\nLGPN incorporates a learnable pooling-unpooling mechanism tailored to model\nparsing, which adaptively learns hyperparameter dependencies of GMs used to\ngenerate the input image. We also extend our proposed method to CNN-generated\nimage detection and coordinate attacks detection. Empirically, we achieve\nstate-of-the-art results in model parsing and its extended applications,\nshowing the effectiveness of our method. Our source code are available.\n","authors":["Xiao Guo","Vishal Asnani","Sijia Liu","Xiaoming Liu"],"pdf_url":"https://arxiv.org/pdf/2312.02224v2.pdf","comment":"24 pages, 15 figures, 17 tables"},{"id":"http://arxiv.org/abs/2403.15955v3","updated":"2024-03-30T06:42:02Z","published":"2024-03-23T23:22:54Z","title":"Finding needles in a haystack: A Black-Box Approach to Invisible\n Watermark Detection","summary":" In this paper, we propose WaterMark Detection (WMD), the first invisible\nwatermark detection method under a black-box and annotation-free setting. WMD\nis capable of detecting arbitrary watermarks within a given reference dataset\nusing a clean non-watermarked dataset as a reference, without relying on\nspecific decoding methods or prior knowledge of the watermarking techniques. We\ndevelop WMD using foundations of offset learning, where a clean non-watermarked\ndataset enables us to isolate the influence of only watermarked samples in the\nreference dataset. 
Our comprehensive evaluations demonstrate the effectiveness\nof WMD, significantly outperforming naive detection methods, which only yield\nAUC scores around 0.5. In contrast, WMD consistently achieves impressive\ndetection AUC scores, surpassing 0.9 in most single-watermark datasets and\nexceeding 0.7 in more challenging multi-watermark scenarios across diverse\ndatasets and watermarking methods. As invisible watermarks become increasingly\nprevalent, while specific decoding techniques remain undisclosed, our approach\nprovides a versatile solution and establishes a path toward increasing\naccountability, transparency, and trust in our digital visual content.\n","authors":["Minzhou Pan","Zhenting Wang","Xin Dong","Vikash Sehwag","Lingjuan Lyu","Xue Lin"],"pdf_url":"https://arxiv.org/pdf/2403.15955v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12790v2","updated":"2024-03-30T06:36:06Z","published":"2023-09-22T11:02:57Z","title":"NTO3D: Neural Target Object 3D Reconstruction with Segment Anything","summary":" Neural 3D reconstruction from multi-view images has recently attracted\nincreasing attention from the community. Existing methods normally learn a\nneural field for the whole scene, while it is still under-explored how to\nreconstruct a target object indicated by users. Considering the Segment\nAnything Model (SAM) has shown effectiveness in segmenting any 2D images, in\nthis paper, we propose NTO3D, a novel high-quality Neural Target Object 3D\n(NTO3D) reconstruction method, which leverages the benefits of both neural\nfield and SAM. We first propose a novel strategy to lift the multi-view 2D\nsegmentation masks of SAM into a unified 3D occupancy field. The 3D occupancy\nfield is then projected into 2D space and generates the new prompts for SAM.\nThis process is iterative until convergence to separate the target object from\nthe scene. After this, we then lift the 2D features of the SAM encoder into a\n3D feature field in order to improve the reconstruction quality of the target\nobject. NTO3D lifts the 2D masks and features of SAM into the 3D neural field\nfor high-quality neural target object 3D reconstruction. We conduct detailed\nexperiments on several benchmark datasets to demonstrate the advantages of our\nmethod. The code will be available at: https://github.com/ucwxb/NTO3D.\n","authors":["Xiaobao Wei","Renrui Zhang","Jiarui Wu","Jiaming Liu","Ming Lu","Yandong Guo","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.12790v2.pdf","comment":"accepted by CVPR24"},{"id":"http://arxiv.org/abs/2310.02279v3","updated":"2024-03-30T06:29:48Z","published":"2023-10-01T05:07:17Z","title":"Consistency Trajectory Models: Learning Probability Flow ODE Trajectory\n of Diffusion","summary":" Consistency Models (CM) (Song et al., 2023) accelerate score-based diffusion\nmodel sampling at the cost of sample quality but lack a natural way to\ntrade-off quality for speed. To address this limitation, we propose Consistency\nTrajectory Model (CTM), a generalization encompassing CM and score-based models\nas special cases. CTM trains a single neural network that can -- in a single\nforward pass -- output scores (i.e., gradients of log-density) and enables\nunrestricted traversal between any initial and final time along the Probability\nFlow Ordinary Differential Equation (ODE) in a diffusion process. 
CTM enables\nthe efficient combination of adversarial training and denoising score matching\nloss to enhance performance and achieves new state-of-the-art FIDs for\nsingle-step diffusion model sampling on CIFAR-10 (FID 1.73) and ImageNet at\n64x64 resolution (FID 1.92). CTM also enables a new family of sampling schemes,\nboth deterministic and stochastic, involving long jumps along the ODE solution\ntrajectories. It consistently improves sample quality as computational budgets\nincrease, avoiding the degradation seen in CM. Furthermore, unlike CM, CTM's\naccess to the score function can streamline the adoption of established\ncontrollable/conditional generation methods from the diffusion community. This\naccess also enables the computation of likelihood. The code is available at\nhttps://github.com/sony/ctm.\n","authors":["Dongjun Kim","Chieh-Hsin Lai","Wei-Hsiang Liao","Naoki Murata","Yuhta Takida","Toshimitsu Uesaka","Yutong He","Yuki Mitsufuji","Stefano Ermon"],"pdf_url":"https://arxiv.org/pdf/2310.02279v3.pdf","comment":"International Conference on Learning Representations"},{"id":"http://arxiv.org/abs/2403.19026v2","updated":"2024-03-30T06:15:36Z","published":"2024-03-27T21:43:12Z","title":"Egocentric Scene-aware Human Trajectory Prediction","summary":" Wearable collaborative robots stand to assist human wearers who need fall\nprevention assistance or wear exoskeletons. Such a robot needs to be able to\npredict the ego motion of the wearer based on egocentric vision and the\nsurrounding scene. In this work, we leveraged body-mounted cameras and sensors\nto anticipate the trajectory of human wearers through complex surroundings. To\nfacilitate research in ego-motion prediction, we have collected a comprehensive\nwalking scene navigation dataset centered on the user's perspective. We present\na method to predict human motion conditioning on the surrounding static scene.\nOur method leverages a diffusion model to produce a distribution of potential\nfuture trajectories, taking into account the user's observation of the\nenvironment. We introduce a compact representation to encode the user's visual\nmemory of the surroundings, as well as an efficient sample-generating technique\nto speed up real-time inference of a diffusion model. We ablate our model and\ncompare it to baselines, and results show that our model outperforms existing\nmethods on key metrics of collision avoidance and trajectory mode coverage.\n","authors":["Weizhuo Wang","C. Karen Liu","Monroe Kennedy III"],"pdf_url":"https://arxiv.org/pdf/2403.19026v2.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2402.18102v2","updated":"2024-03-30T06:06:38Z","published":"2024-02-28T06:45:47Z","title":"Passive Snapshot Coded Aperture Dual-Pixel RGB-D Imaging","summary":" Passive, compact, single-shot 3D sensing is useful in many application areas\nsuch as microscopy, medical imaging, surgical navigation, and autonomous\ndriving where form factor, time, and power constraints can exist. Obtaining\nRGB-D scene information over a short imaging distance, in an ultra-compact form\nfactor, and in a passive, snapshot manner is challenging. Dual-pixel (DP)\nsensors are a potential solution to achieve the same. DP sensors collect light\nrays from two different halves of the lens in two interleaved pixel arrays,\nthus capturing two slightly different views of the scene, like a stereo camera\nsystem. 
However, imaging with a DP sensor implies that the defocus blur size is\ndirectly proportional to the disparity seen between the views. This creates a\ntrade-off between disparity estimation vs. deblurring accuracy. To improve this\ntrade-off effect, we propose CADS (Coded Aperture Dual-Pixel Sensing), in which\nwe use a coded aperture in the imaging lens along with a DP sensor. In our\napproach, we jointly learn an optimal coded pattern and the reconstruction\nalgorithm in an end-to-end optimization setting. Our resulting CADS imaging\nsystem demonstrates improvement of >1.5dB PSNR in all-in-focus (AIF) estimates\nand 5-6% in depth estimation quality over naive DP sensing for a wide range of\naperture settings. Furthermore, we build the proposed CADS prototypes for DSLR\nphotography settings and in an endoscope and a dermoscope form factor. Our\nnovel coded dual-pixel sensing approach demonstrates accurate RGB-D\nreconstruction results in simulations and real-world experiments in a passive,\nsnapshot, and compact manner.\n","authors":["Bhargav Ghanekar","Salman Siddique Khan","Pranav Sharma","Shreyas Singh","Vivek Boominathan","Kaushik Mitra","Ashok Veeraraghavan"],"pdf_url":"https://arxiv.org/pdf/2402.18102v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08653v2","updated":"2024-03-30T06:05:40Z","published":"2023-12-14T04:47:20Z","title":"SKDF: A Simple Knowledge Distillation Framework for Distilling\n Open-Vocabulary Knowledge to Open-world Object Detector","summary":" In this paper, we attempt to specialize the VLM model for OWOD tasks by\ndistilling its open-world knowledge into a language-agnostic detector.\nSurprisingly, we observe that the combination of a simple \\textbf{knowledge\ndistillation} approach and the automatic pseudo-labeling mechanism in OWOD can\nachieve better performance for unknown object detection, even with a small\namount of data. Unfortunately, knowledge distillation for unknown objects\nseverely affects the learning of detectors with conventional structures for\nknown objects, leading to catastrophic forgetting. To alleviate these problems,\nwe propose the \\textbf{down-weight loss function} for knowledge distillation\nfrom vision-language to single vision modality. Meanwhile, we propose the\n\\textbf{cascade decouple decoding structure} that decouples the learning of\nlocalization and recognition to reduce the impact of category interactions of\nknown and unknown objects on the localization learning process. Ablation\nexperiments demonstrate that both of them are effective in mitigating the\nimpact of open-world knowledge distillation on the learning of known objects.\nAdditionally, to alleviate the current lack of comprehensive benchmarks for\nevaluating the ability of the open-world detector to detect unknown objects in\nthe open world, we propose two benchmarks, which we name\n\"\\textbf{StandardSet}$\\heartsuit$\" and \"\\textbf{IntensiveSet}$\\spadesuit$\"\nrespectively, based on the complexity of their testing scenarios. Comprehensive\nexperiments performed on OWOD, MS-COCO, and our proposed benchmarks demonstrate\nthe effectiveness of our methods. 
The code and proposed dataset are available\nat \\url{https://github.com/xiaomabufei/SKDF}.\n","authors":["Shuailei Ma","Yuefeng Wang","Ying Wei","Jiaqi Fan","Enming Zhang","Xinyu Sun","Peihao Chen"],"pdf_url":"https://arxiv.org/pdf/2312.08653v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.11623"},{"id":"http://arxiv.org/abs/2312.04551v2","updated":"2024-03-30T06:02:45Z","published":"2023-12-07T18:59:18Z","title":"Free3D: Consistent Novel View Synthesis without 3D Representation","summary":" We introduce Free3D, a simple accurate method for monocular open-set novel\nview synthesis (NVS). Similar to Zero-1-to-3, we start from a pre-trained 2D\nimage generator for generalization, and fine-tune it for NVS. Compared to other\nworks that took a similar approach, we obtain significant improvements without\nresorting to an explicit 3D representation, which is slow and memory-consuming,\nand without training an additional network for 3D reconstruction. Our key\ncontribution is to improve the way the target camera pose is encoded in the\nnetwork, which we do by introducing a new ray conditioning normalization (RCN)\nlayer. The latter injects pose information in the underlying 2D image generator\nby telling each pixel its viewing direction. We further improve multi-view\nconsistency by using light-weight multi-view attention layers and by sharing\ngeneration noise between the different views. We train Free3D on the Objaverse\ndataset and demonstrate excellent generalization to new categories in new\ndatasets, including OmniObject3D and GSO. The project page is available at\nhttps://chuanxiaz.com/free3d/.\n","authors":["Chuanxia Zheng","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2312.04551v2.pdf","comment":"webpage: https://chuanxiaz.com/free3d/, code:\n https://github.com/lyndonzheng/Free3D"},{"id":"http://arxiv.org/abs/2402.04476v2","updated":"2024-03-30T05:18:05Z","published":"2024-02-06T23:52:10Z","title":"Dual-View Visual Contextualization for Web Navigation","summary":" Automatic web navigation aims to build a web agent that can follow language\ninstructions to execute complex and diverse tasks on real-world websites.\nExisting work primarily takes HTML documents as input, which define the\ncontents and action spaces (i.e., actionable elements and operations) of\nwebpages. Nevertheless, HTML documents may not provide a clear task-related\ncontext for each element, making it hard to select the right (sequence of)\nactions. In this paper, we propose to contextualize HTML elements through their\n\"dual views\" in webpage screenshots: each HTML element has its corresponding\nbounding box and visual content in the screenshot. We build upon the insight --\nweb developers tend to arrange task-related elements nearby on webpages to\nenhance user experiences -- and propose to contextualize each element with its\nneighbor elements, using both textual and visual features. The resulting\nrepresentations of HTML elements are more informative for the agent to take\naction. We validate our method on the recently released Mind2Web dataset, which\nfeatures diverse navigation domains and tasks on real-world websites. 
Our\nmethod consistently outperforms the baseline in all the scenarios, including\ncross-task, cross-website, and cross-domain ones.\n","authors":["Jihyung Kil","Chan Hee Song","Boyuan Zheng","Xiang Deng","Yu Su","Wei-Lun Chao"],"pdf_url":"https://arxiv.org/pdf/2402.04476v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2309.04372v2","updated":"2024-03-30T05:05:52Z","published":"2023-09-08T15:06:05Z","title":"MoEController: Instruction-based Arbitrary Image Manipulation with\n Mixture-of-Expert Controllers","summary":" Diffusion-model-based text-guided image generation has recently made\nastounding progress, producing fascinating results in open-domain image\nmanipulation tasks. Few models, however, currently have complete zero-shot\ncapabilities for both global and local image editing due to the complexity and\ndiversity of image manipulation tasks. In this work, we propose a method with\nmixture-of-expert (MOE) controllers to align the text-guided capacity of\ndiffusion models with different kinds of human instructions, enabling our model\nto handle various open-domain image manipulation tasks with natural language\ninstructions. First, we use large language models (ChatGPT) and conditional\nimage synthesis models (ControlNet) to generate a large amount of global image\ntransfer data in addition to the instruction-based local image editing\ndataset. Then, using an MOE technique and task-specific adaptation training on\na large-scale dataset, our conditional diffusion model can edit images globally\nand locally. Extensive experiments demonstrate that our approach performs\nsurprisingly well on various image manipulation tasks when dealing with\nopen-domain images and arbitrary human instructions. Please refer to our\nproject page: [https://oppo-mente-lab.github.io/moe_controller/]\n","authors":["Sijia Li","Chen Chen","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2309.04372v2.pdf","comment":"6 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.10988v2","updated":"2024-03-30T04:56:05Z","published":"2024-03-16T18:04:12Z","title":"Boosting Flow-based Generative Super-Resolution Models via Learned Prior","summary":" Flow-based super-resolution (SR) models have demonstrated astonishing\ncapabilities in generating high-quality images. However, these methods\nencounter several challenges during image generation, such as grid artifacts,\nexploding inverses, and suboptimal results due to a fixed sampling temperature.\nTo overcome these issues, this work introduces a conditional learned prior to\nthe inference phase of a flow-based SR model. This prior is a latent code\npredicted by our proposed latent module conditioned on the low-resolution\nimage, which is then transformed by the flow model into an SR image. Our\nframework is designed to seamlessly integrate with any contemporary flow-based\nSR model without modifying its architecture or pre-trained weights. We evaluate\nthe effectiveness of our proposed framework through extensive experiments and\nablation analyses. The proposed framework successfully addresses all the\ninherent issues in flow-based SR models and enhances their performance in\nvarious SR scenarios. 
Our code is available at:\nhttps://github.com/liyuantsao/FlowSR-LP\n","authors":["Li-Yuan Tsao","Yi-Chen Lo","Chia-Che Chang","Hao-Wei Chen","Roy Tseng","Chien Feng","Chun-Yi Lee"],"pdf_url":"https://arxiv.org/pdf/2403.10988v2.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2312.02134v3","updated":"2024-03-30T04:22:34Z","published":"2023-12-04T18:55:45Z","title":"GaussianAvatar: Towards Realistic Human Avatar Modeling from a Single\n Video via Animatable 3D Gaussians","summary":" We present GaussianAvatar, an efficient approach to creating realistic human\navatars with dynamic 3D appearances from a single video. We start by\nintroducing animatable 3D Gaussians to explicitly represent humans in various\nposes and clothing styles. Such an explicit and animatable representation can\nfuse 3D appearances more efficiently and consistently from 2D observations. Our\nrepresentation is further augmented with dynamic properties to support\npose-dependent appearance modeling, where a dynamic appearance network along\nwith an optimizable feature tensor is designed to learn the\nmotion-to-appearance mapping. Moreover, by leveraging the differentiable motion\ncondition, our method enables a joint optimization of motions and appearances\nduring avatar modeling, which helps to tackle the long-standing issue of\ninaccurate motion estimation in monocular settings. The efficacy of\nGaussianAvatar is validated on both the public dataset and our collected\ndataset, demonstrating its superior performances in terms of appearance quality\nand rendering efficiency.\n","authors":["Liangxiao Hu","Hongwen Zhang","Yuxiang Zhang","Boyao Zhou","Boning Liu","Shengping Zhang","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2312.02134v3.pdf","comment":"Project Page: https://huliangxiao.github.io/GaussianAvatar"},{"id":"http://arxiv.org/abs/2401.00374v5","updated":"2024-03-30T04:15:34Z","published":"2023-12-31T02:25:41Z","title":"EMAGE: Towards Unified Holistic Co-Speech Gesture Generation via\n Expressive Masked Audio Gesture Modeling","summary":" We propose EMAGE, a framework to generate full-body human gestures from audio\nand masked gestures, encompassing facial, local body, hands, and global\nmovements. To achieve this, we first introduce BEAT2 (BEAT-SMPLX-FLAME), a new\nmesh-level holistic co-speech dataset. BEAT2 combines a MoShed SMPL-X body with\nFLAME head parameters and further refines the modeling of head, neck, and\nfinger movements, offering a community-standardized, high-quality 3D motion\ncaptured dataset. EMAGE leverages masked body gesture priors during training to\nboost inference performance. It involves a Masked Audio Gesture Transformer,\nfacilitating joint training on audio-to-gesture generation and masked gesture\nreconstruction to effectively encode audio and body gesture hints. Encoded body\nhints from masked gestures are then separately employed to generate facial and\nbody movements. Moreover, EMAGE adaptively merges speech features from the\naudio's rhythm and content and utilizes four compositional VQ-VAEs to enhance\nthe results' fidelity and diversity. Experiments demonstrate that EMAGE\ngenerates holistic gestures with state-of-the-art performance and is flexible\nin accepting predefined spatial-temporal gesture inputs, generating complete,\naudio-synchronized results. 
Our code and dataset are available at\nhttps://pantomatrix.github.io/EMAGE/\n","authors":["Haiyang Liu","Zihao Zhu","Giorgio Becherini","Yichen Peng","Mingyang Su","You Zhou","Xuefei Zhe","Naoya Iwamoto","Bo Zheng","Michael J. Black"],"pdf_url":"https://arxiv.org/pdf/2401.00374v5.pdf","comment":"Fix typos; Conflict of Interest Disclosure; CVPR Camera Ready;\n Project Page: https://pantomatrix.github.io/EMAGE/"},{"id":"http://arxiv.org/abs/2302.05043v2","updated":"2024-03-30T03:50:28Z","published":"2023-02-10T04:12:11Z","title":"A Review of Predictive and Contrastive Self-supervised Learning for\n Medical Images","summary":" Over the last decade, supervised deep learning on manually annotated big data\nhas been progressing significantly on computer vision tasks. But the\napplication of deep learning in medical image analysis was limited by the\nscarcity of high-quality annotated medical imaging data. An emerging solution\nis self-supervised learning (SSL), among which contrastive SSL is the most\nsuccessful approach to rivalling or outperforming supervised learning. This\nreview investigates several state-of-the-art contrastive SSL algorithms\noriginally on natural images as well as their adaptations for medical images,\nand concludes by discussing recent advances, current limitations, and future\ndirections in applying contrastive SSL in the medical domain.\n","authors":["Wei-Chien Wang","Euijoon Ahn","Dagan Feng","Jinman Kim"],"pdf_url":"https://arxiv.org/pdf/2302.05043v2.pdf","comment":"Article links:\n https://link.springer.com/article/10.1007/s11633-022-1406-4"},{"id":"http://arxiv.org/abs/2403.17610v2","updated":"2024-03-30T03:46:10Z","published":"2024-03-26T11:43:05Z","title":"MMVP: A Multimodal MoCap Dataset with Vision and Pressure Sensors","summary":" Foot contact is an important cue for human motion capture, understanding, and\ngeneration. Existing datasets tend to annotate dense foot contact using visual\nmatching with thresholding or incorporating pressure signals. However, these\napproaches either suffer from low accuracy or are only designed for small-range\nand slow motion. There is still a lack of a vision-pressure multimodal dataset\nwith large-range and fast human motion, as well as accurate and dense\nfoot-contact annotation. To fill this gap, we propose a Multimodal MoCap\nDataset with Vision and Pressure sensors, named MMVP. MMVP provides accurate\nand dense plantar pressure signals synchronized with RGBD observations, which\nis especially useful for plausible shape estimation, robust pose fitting\nwithout foot drifting, and accurate global translation tracking. To validate\nthe dataset, we propose an RGBD-P SMPL fitting method and also a\nmonocular-video-based baseline framework, VP-MoCap, for human motion capture.\nExperiments demonstrate that our RGBD-P SMPL Fitting results significantly\noutperform pure visual motion capture. Moreover, VP-MoCap outperforms SOTA\nmethods in foot-contact and global translation estimation accuracy. We believe\nthe configuration of the dataset and the baseline frameworks will stimulate the\nresearch in this direction and also provide a good reference for MoCap\napplications in various domains. 
Project page:\nhttps://metaverse-ai-lab-thu.github.io/MMVP-Dataset/.\n","authors":["He Zhang","Shenghao Ren","Haolei Yuan","Jianhui Zhao","Fan Li","Shuangpeng Sun","Zhenghao Liang","Tao Yu","Qiu Shen","Xun Cao"],"pdf_url":"https://arxiv.org/pdf/2403.17610v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2312.01280v2","updated":"2024-03-30T03:30:18Z","published":"2023-12-03T04:36:04Z","title":"Brain Decodes Deep Nets","summary":" We developed a tool for visualizing and analyzing large pre-trained vision\nmodels by mapping them onto the brain, thus exposing their hidden inside. Our\ninnovation arises from a surprising usage of brain encoding: predicting brain\nfMRI measurements in response to images. We report two findings. First,\nexplicit mapping between the brain and deep-network features across dimensions\nof space, layers, scales, and channels is crucial. This mapping method,\nFactorTopy, is plug-and-play for any deep-network; with it, one can paint a\npicture of the network onto the brain (literally!). Second, our visualization\nshows how different training methods matter: they lead to remarkable\ndifferences in hierarchical organization and scaling behavior, growing with\nmore data or network capacity. It also provides insight into fine-tuning: how\npre-trained models change when adapting to small datasets. We found that\nbrain-like, hierarchically organized networks suffer less from catastrophic\nforgetting after fine-tuning.\n","authors":["Huzheng Yang","James Gee","Jianbo Shi"],"pdf_url":"https://arxiv.org/pdf/2312.01280v2.pdf","comment":"Website: see https://huzeyann.github.io/brain-decodes-deep-nets .\n Code: see https://github.com/huzeyann/BrainDecodesDeepNets"},{"id":"http://arxiv.org/abs/2312.09243v2","updated":"2024-03-30T03:08:43Z","published":"2023-12-14T18:58:52Z","title":"OccNeRF: Advancing 3D Occupancy Prediction in LiDAR-Free Environments","summary":" As a fundamental task of vision-based perception, 3D occupancy prediction\nreconstructs 3D structures of surrounding environments. It provides detailed\ninformation for autonomous driving planning and navigation. However, most\nexisting methods heavily rely on the LiDAR point clouds to generate occupancy\nground truth, which is not available in the vision-based system. In this paper,\nwe propose an OccNeRF method for training occupancy networks without 3D\nsupervision. Different from previous works which consider a bounded scene, we\nparameterize the reconstructed occupancy fields and reorganize the sampling\nstrategy to align with the cameras' infinite perceptive range. The neural\nrendering is adopted to convert occupancy fields to multi-camera depth maps,\nsupervised by multi-frame photometric consistency. 
Moreover, for semantic\noccupancy prediction, we design several strategies to polish the prompts and\nfilter the outputs of a pretrained open-vocabulary 2D segmentation model.\nExtensive experiments for both self-supervised depth estimation and 3D\noccupancy prediction tasks on nuScenes and SemanticKITTI datasets demonstrate\nthe effectiveness of our method.\n","authors":["Chubin Zhang","Juncheng Yan","Yi Wei","Jiaxin Li","Li Liu","Yansong Tang","Yueqi Duan","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2312.09243v2.pdf","comment":"Code: https://github.com/LinShan-Bin/OccNeRF"},{"id":"http://arxiv.org/abs/2403.19001v2","updated":"2024-03-30T02:42:08Z","published":"2024-03-27T20:51:02Z","title":"Cross-domain Fiber Cluster Shape Analysis for Language Performance\n Cognitive Score Prediction","summary":" Shape plays an important role in computer graphics, offering informative\nfeatures to convey an object's morphology and functionality. Shape analysis in\nbrain imaging can help interpret structural and functionality correlations of\nthe human brain. In this work, we investigate the shape of the brain's 3D white\nmatter connections and its potential predictive relationship to human cognitive\nfunction. We reconstruct brain connections as sequences of 3D points using\ndiffusion magnetic resonance imaging (dMRI) tractography. To describe each\nconnection, we extract 12 shape descriptors in addition to traditional dMRI\nconnectivity and tissue microstructure features. We introduce a novel\nframework, Shape--fused Fiber Cluster Transformer (SFFormer), that leverages a\nmulti-head cross-attention feature fusion module to predict subject-specific\nlanguage performance based on dMRI tractography. We assess the performance of\nthe method on a large dataset including 1065 healthy young adults. The results\ndemonstrate that both the transformer-based SFFormer model and its inter/intra\nfeature fusion with shape, microstructure, and connectivity are informative,\nand together, they improve the prediction of subject-specific language\nperformance scores. Overall, our results indicate that the shape of the brain's\nconnections is predictive of human language function.\n","authors":["Yui Lo","Yuqian Chen","Dongnan Liu","Wan Liu","Leo Zekelman","Fan Zhang","Yogesh Rathi","Nikos Makris","Alexandra J. Golby","Weidong Cai","Lauren J. O'Donnell"],"pdf_url":"https://arxiv.org/pdf/2403.19001v2.pdf","comment":"2 figures, 11 pages"},{"id":"http://arxiv.org/abs/2401.00901v2","updated":"2024-03-30T02:30:14Z","published":"2023-12-31T13:53:37Z","title":"Video-GroundingDINO: Towards Open-Vocabulary Spatio-Temporal Video\n Grounding","summary":" Video grounding aims to localize a spatio-temporal section in a video\ncorresponding to an input text query. This paper addresses a critical\nlimitation in current video grounding methodologies by introducing an\nOpen-Vocabulary Spatio-Temporal Video Grounding task. Unlike prevalent\nclosed-set approaches that struggle with open-vocabulary scenarios due to\nlimited training data and predefined vocabularies, our model leverages\npre-trained representations from foundational spatial grounding models. This\nempowers it to effectively bridge the semantic gap between natural language and\ndiverse visual content, achieving strong performance in closed-set and\nopen-vocabulary settings. 
Our contributions include a novel spatio-temporal\nvideo grounding model, surpassing state-of-the-art results in closed-set\nevaluations on multiple datasets and demonstrating superior performance in\nopen-vocabulary scenarios. Notably, the proposed model outperforms\nstate-of-the-art methods in closed-set settings on VidSTG (Declarative and\nInterrogative) and HC-STVG (V1 and V2) datasets. Furthermore, in\nopen-vocabulary evaluations on HC-STVG V1 and YouCook-Interactions, our model\nsurpasses the recent best-performing models by $4.88$ m_vIoU and $1.83\\%$\naccuracy, demonstrating its efficacy in handling diverse linguistic and visual\nconcepts for improved video understanding. Our codes will be publicly released.\n","authors":["Syed Talal Wasim","Muzammal Naseer","Salman Khan","Ming-Hsuan Yang","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2401.00901v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09788v2","updated":"2024-03-30T01:21:42Z","published":"2023-12-15T13:43:24Z","title":"Collaborating Foundation Models for Domain Generalized Semantic\n Segmentation","summary":" Domain Generalized Semantic Segmentation (DGSS) deals with training a model\non a labeled source domain with the aim of generalizing to unseen domains\nduring inference. Existing DGSS methods typically effectuate robust features by\nmeans of Domain Randomization (DR). Such an approach is often limited as it can\nonly account for style diversification and not content. In this work, we take\nan orthogonal approach to DGSS and propose to use an assembly of CoLlaborative\nFOUndation models for Domain Generalized Semantic Segmentation (CLOUDS). In\ndetail, CLOUDS is a framework that integrates FMs of various kinds: (i) CLIP\nbackbone for its robust feature representation, (ii) generative models to\ndiversify the content, thereby covering various modes of the possible target\ndistribution, and (iii) Segment Anything Model (SAM) for iteratively refining\nthe predictions of the segmentation model. Extensive experiments show that our\nCLOUDS excels in adapting from synthetic to real DGSS benchmarks and under\nvarying weather conditions, notably outperforming prior methods by 5.6% and\n6.7% on averaged miou, respectively. The code is available at :\nhttps://github.com/yasserben/CLOUDS\n","authors":["Yasser Benigmim","Subhankar Roy","Slim Essid","Vicky Kalogeiton","Stéphane Lathuilière"],"pdf_url":"https://arxiv.org/pdf/2312.09788v2.pdf","comment":"https://github.com/yasserben/CLOUDS ; Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2303.02835v2","updated":"2024-03-30T01:21:22Z","published":"2023-03-06T02:05:14Z","title":"Traffic Scene Parsing through the TSP6K Dataset","summary":" Traffic scene perception in computer vision is a critically important task to\nachieve intelligent cities. To date, most existing datasets focus on autonomous\ndriving scenes. We observe that the models trained on those driving datasets\noften yield unsatisfactory results on traffic monitoring scenes. However,\nlittle effort has been put into improving the traffic monitoring scene\nunderstanding, mainly due to the lack of specific datasets. To fill this gap,\nwe introduce a specialized traffic monitoring dataset, termed TSP6K, containing\nimages from the traffic monitoring scenario, with high-quality pixel-level and\ninstance-level annotations. The TSP6K dataset captures more crowded traffic\nscenes with several times more traffic participants than the existing driving\nscenes. 
We perform a detailed analysis of the dataset and comprehensively\nevaluate previous popular scene parsing methods, instance segmentation methods\nand unsupervised domain adaption methods. Furthermore, considering the vast\ndifference in instance sizes, we propose a detail refining decoder for scene\nparsing, which recovers the details of different semantic regions in traffic\nscenes owing to the proposed TSP6K dataset. Experiments show its effectiveness\nin parsing the traffic monitoring scenes. Code and dataset are available at\nhttps://github.com/PengtaoJiang/TSP6K.\n","authors":["Peng-Tao Jiang","Yuqi Yang","Yang Cao","Qibin Hou","Ming-Ming Cheng","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2303.02835v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2311.12981v2","updated":"2024-03-30T01:18:21Z","published":"2023-11-21T20:33:17Z","title":"SD-NAE: Generating Natural Adversarial Examples with Stable Diffusion","summary":" Natural Adversarial Examples (NAEs), images arising naturally from the\nenvironment and capable of deceiving classifiers, are instrumental in robustly\nevaluating and identifying vulnerabilities in trained models. In this work,\nunlike prior works that passively collect NAEs from real images, we propose to\nactively synthesize NAEs using the state-of-the-art Stable Diffusion.\nSpecifically, our method formulates a controlled optimization process, where we\nperturb the token embedding that corresponds to a specified class to generate\nNAEs. This generation process is guided by the gradient of loss from the target\nclassifier, ensuring that the created image closely mimics the ground-truth\nclass yet fools the classifier. Named SD-NAE (Stable Diffusion for Natural\nAdversarial Examples), our innovative method is effective in producing valid\nand useful NAEs, which is demonstrated through a meticulously designed\nexperiment. Code is available at https://github.com/linyueqian/SD-NAE.\n","authors":["Yueqian Lin","Jingyang Zhang","Yiran Chen","Hai Li"],"pdf_url":"https://arxiv.org/pdf/2311.12981v2.pdf","comment":"Accepted by ICLR 2024 TinyPapers"},{"id":"http://arxiv.org/abs/2403.19549v2","updated":"2024-03-30T00:24:44Z","published":"2024-03-28T16:32:06Z","title":"GlORIE-SLAM: Globally Optimized RGB-only Implicit Encoding Point Cloud\n SLAM","summary":" Recent advancements in RGB-only dense Simultaneous Localization and Mapping\n(SLAM) have predominantly utilized grid-based neural implicit encodings and/or\nstruggle to efficiently realize global map and pose consistency. To this end,\nwe propose an efficient RGB-only dense SLAM system using a flexible neural\npoint cloud scene representation that adapts to keyframe poses and depth\nupdates, without needing costly backpropagation. Another critical challenge of\nRGB-only SLAM is the lack of geometric priors. To alleviate this issue, with\nthe aid of a monocular depth estimator, we introduce a novel DSPO layer for\nbundle adjustment which optimizes the pose and depth of keyframes along with\nthe scale of the monocular depth. Finally, our system benefits from loop\nclosure and online global bundle adjustment and performs either better or\ncompetitive to existing dense neural RGB SLAM methods in tracking, mapping and\nrendering accuracy on the Replica, TUM-RGBD and ScanNet datasets. The source\ncode will be made available.\n","authors":["Ganlin Zhang","Erik Sandström","Youmin Zhang","Manthan Patel","Luc Van Gool","Martin R. 
Oswald"],"pdf_url":"https://arxiv.org/pdf/2403.19549v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00498v1","updated":"2024-03-30T23:42:23Z","published":"2024-03-30T23:42:23Z","title":"94% on CIFAR-10 in 3.29 Seconds on a Single GPU","summary":" CIFAR-10 is among the most widely used datasets in machine learning,\nfacilitating thousands of research projects per year. To accelerate research\nand reduce the cost of experiments, we introduce training methods for CIFAR-10\nwhich reach 94% accuracy in 3.29 seconds, 95% in 10.4 seconds, and 96% in 46.3\nseconds, when run on a single NVIDIA A100 GPU. As one factor contributing to\nthese training speeds, we propose a derandomized variant of horizontal flipping\naugmentation, which we show improves over the standard method in every case\nwhere flipping is beneficial over no flipping at all. Our code is released at\nhttps://github.com/KellerJordan/cifar10-airbench.\n","authors":["Keller Jordan"],"pdf_url":"https://arxiv.org/pdf/2404.00498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00491v1","updated":"2024-03-30T23:19:40Z","published":"2024-03-30T23:19:40Z","title":"Denoising Monte Carlo Renders With Diffusion Models","summary":" Physically-based renderings contain Monte-Carlo noise, with variance that\nincreases as the number of rays per pixel decreases. This noise, while\nzero-mean for good modern renderers, can have heavy tails (most notably, for\nscenes containing specular or refractive objects). Learned methods for\nrestoring low fidelity renders are highly developed, because suppressing render\nnoise means one can save compute and use fast renders with few rays per pixel.\nWe demonstrate that a diffusion model can denoise low fidelity renders\nsuccessfully. Furthermore, our method can be conditioned on a variety of\nnatural render information, and this conditioning helps performance.\nQuantitative experiments show that our method is competitive with SOTA across a\nrange of sampling rates, but current metrics slightly favor competitor methods.\nQualitative examination of the reconstructions suggests that the metrics\nthemselves may not be reliable. The image prior applied by a diffusion method\nstrongly favors reconstructions that are \"like\" real images -- so have straight\nshadow boundaries, curved specularities, no \"fireflies\" and the like -- and\nmetrics do not account for this. We show numerous examples where methods\npreferred by current metrics produce qualitatively weaker reconstructions than\nours.\n","authors":["Vaibhav Vavilala","Rahul Vasanth","David Forsyth"],"pdf_url":"https://arxiv.org/pdf/2404.00491v1.pdf","comment":"14 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.00485v1","updated":"2024-03-30T22:28:29Z","published":"2024-03-30T22:28:29Z","title":"DiffHuman: Probabilistic Photorealistic 3D Reconstruction of Humans","summary":" We present DiffHuman, a probabilistic method for photorealistic 3D human\nreconstruction from a single RGB image. Despite the ill-posed nature of this\nproblem, most methods are deterministic and output a single solution, often\nresulting in a lack of geometric detail and blurriness in unseen or uncertain\nregions. In contrast, DiffHuman predicts a probability distribution over 3D\nreconstructions conditioned on an input 2D image, which allows us to sample\nmultiple detailed 3D avatars that are consistent with the image. DiffHuman is\nimplemented as a conditional diffusion model that denoises pixel-aligned 2D\nobservations of an underlying 3D shape representation. 
During inference, we may\nsample 3D avatars by iteratively denoising 2D renders of the predicted 3D\nrepresentation. Furthermore, we introduce a generator neural network that\napproximates rendering with considerably reduced runtime (55x speed up),\nresulting in a novel dual-branch diffusion framework. Our experiments show that\nDiffHuman can produce diverse and detailed reconstructions for the parts of the\nperson that are unseen or uncertain in the input image, while remaining\ncompetitive with the state-of-the-art when reconstructing visible surfaces.\n","authors":["Akash Sengupta","Thiemo Alldieck","Nikos Kolotouros","Enric Corona","Andrei Zanfir","Cristian Sminchisescu"],"pdf_url":"https://arxiv.org/pdf/2404.00485v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00471v1","updated":"2024-03-30T20:34:49Z","published":"2024-03-30T20:34:49Z","title":"Score-Based Diffusion Models for Photoacoustic Tomography Image\n Reconstruction","summary":" Photoacoustic tomography (PAT) is a rapidly-evolving medical imaging modality\nthat combines optical absorption contrast with ultrasound imaging depth. One\nchallenge in PAT is image reconstruction with inadequate acoustic signals due\nto limited sensor coverage or due to the density of the transducer array. Such\ncases call for solving an ill-posed inverse reconstruction problem. In this\nwork, we use score-based diffusion models to solve the inverse problem of\nreconstructing an image from limited PAT measurements. The proposed approach\nallows us to incorporate an expressive prior learned by a diffusion model on\nsimulated vessel structures while still being robust to varying transducer\nsparsity conditions.\n","authors":["Sreemanti Dey","Snigdha Saha","Berthy T. Feng","Manxiu Cui","Laure Delisle","Oscar Leong","Lihong V. Wang","Katherine L. Bouman"],"pdf_url":"https://arxiv.org/pdf/2404.00471v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2404.00469v1","updated":"2024-03-30T20:25:16Z","published":"2024-03-30T20:25:16Z","title":"SceneGraphLoc: Cross-Modal Coarse Visual Localization on 3D Scene Graphs","summary":" We introduce a novel problem, i.e., the localization of an input image within\na multi-modal reference map represented by a database of 3D scene graphs. These\ngraphs comprise multiple modalities, including object-level point clouds,\nimages, attributes, and relationships between objects, offering a lightweight\nand efficient alternative to conventional methods that rely on extensive image\ndatabases. Given the available modalities, the proposed method SceneGraphLoc\nlearns a fixed-sized embedding for each node (i.e., representing an object\ninstance) in the scene graph, enabling effective matching with the objects\nvisible in the input query image. This strategy significantly outperforms other\ncross-modal methods, even without incorporating images into the map embeddings.\nWhen images are leveraged, SceneGraphLoc achieves performance close to that of\nstate-of-the-art techniques depending on large image databases, while requiring\nthree orders-of-magnitude less storage and operating orders-of-magnitude\nfaster. 
The code will be made public.\n","authors":["Yang Miao","Francis Engelmann","Olga Vysotska","Federico Tombari","Marc Pollefeys","Dániel Béla Baráth"],"pdf_url":"https://arxiv.org/pdf/2404.00469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00429v1","updated":"2024-03-30T17:29:13Z","published":"2024-03-30T17:29:13Z","title":"Multiway Point Cloud Mosaicking with Diffusion and Global Optimization","summary":" We introduce a novel framework for multiway point cloud mosaicking (named\nWednesday), designed to co-align sets of partially overlapping point clouds --\ntypically obtained from 3D scanners or moving RGB-D cameras -- into a unified\ncoordinate system. At the core of our approach is ODIN, a learned pairwise\nregistration algorithm that iteratively identifies overlaps and refines\nattention scores, employing a diffusion-based process for denoising pairwise\ncorrelation matrices to enhance matching accuracy. Further steps include\nconstructing a pose graph from all point clouds, performing rotation averaging,\na novel robust algorithm for re-estimating translations optimally in terms of\nconsensus maximization and translation optimization. Finally, the point cloud\nrotations and positions are optimized jointly by a diffusion-based approach.\nTested on four diverse, large-scale datasets, our method achieves\nstate-of-the-art pairwise and multiway registration results by a large margin\non all benchmarks. Our code and models are available at\nhttps://github.com/jinsz/Multiway-Point-Cloud-Mosaicking-with-Diffusion-and-Global-Optimization.\n","authors":["Shengze Jin","Iro Armeni","Marc Pollefeys","Daniel Barath"],"pdf_url":"https://arxiv.org/pdf/2404.00429v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00427v1","updated":"2024-03-30T17:21:07Z","published":"2024-03-30T17:21:07Z","title":"Extracting Manifold Information from Point Clouds","summary":" A kernel based method is proposed for the construction of signature\n(defining) functions of subsets of $\\mathbb{R}^d$. The subsets can range from\nfull dimensional manifolds (open subsets) to point clouds (a finite number of\npoints) and include bounded smooth manifolds of any codimension. The\ninterpolation and analysis of point clouds are the main application. Two\nextreme cases in terms of regularity are considered, where the data set is\ninterpolated by an analytic surface, at the one extreme, and by a H\\\"older\ncontinuous surface, at the other. The signature function can be computed as a\nlinear combination of translated kernels, the coefficients of which are the\nsolution of a finite dimensional linear problem. Once it is obtained, it can be\nused to estimate the dimension as well as the normal and the curvatures of the\ninterpolated surface. The method is global and does not require explicit\nknowledge of local neighborhoods or any other structure present in the data\nset. It admits a variational formulation with a natural ``regularized''\ncounterpart, that proves to be useful in dealing with data sets corrupted by\nnumerical error or noise. 
The underlying analytical structure of the approach\nis presented in general before it is applied to the case of point clouds.\n","authors":["Patrick Guidotti"],"pdf_url":"https://arxiv.org/pdf/2404.00427v1.pdf","comment":"27 pages, 16 figures, 5 tables"},{"id":"http://arxiv.org/abs/2404.00419v1","updated":"2024-03-30T16:54:45Z","published":"2024-03-30T16:54:45Z","title":"Do Vision-Language Models Understand Compound Nouns?","summary":" Open-vocabulary vision-language models (VLMs) like CLIP, trained using\ncontrastive loss, have emerged as a promising new paradigm for text-to-image\nretrieval. However, do VLMs understand compound nouns (CNs) (e.g., lab coat) as\nwell as they understand nouns (e.g., lab)? We curate Compun, a novel benchmark\nwith 400 unique and commonly used CNs, to evaluate the effectiveness of VLMs in\ninterpreting CNs. The Compun benchmark challenges a VLM for text-to-image\nretrieval where, given a text prompt with a CN, the task is to select the\ncorrect image that shows the CN among a pair of distractor images that show the\nconstituent nouns that make up the CN. Next, we perform an in-depth analysis to\nhighlight CLIP's limited understanding of certain types of CNs. Finally, we\npresent an alternative framework that moves beyond hand-written templates for\ntext prompts widely used by CLIP-like models. We employ a Large Language Model\nto generate multiple diverse captions that include the CN as an object in the\nscene described by the caption. Our proposed method improves CN understanding\nof CLIP by 8.25% on Compun. Code and benchmark are available at:\nhttps://github.com/sonalkum/Compun\n","authors":["Sonal Kumar","Sreyan Ghosh","S Sakshi","Utkarsh Tyagi","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2404.00419v1.pdf","comment":"Accepted to NAACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2404.00418v1","updated":"2024-03-30T16:54:35Z","published":"2024-03-30T16:54:35Z","title":"Continual Learning for Autonomous Robots: A Prototype-based Approach","summary":" Humans and animals learn throughout their lives from limited amounts of\nsensed data, both with and without supervision. Autonomous, intelligent robots\nof the future are often expected to do the same. The existing continual\nlearning (CL) methods are usually not directly applicable to robotic settings:\nthey typically require buffering and a balanced replay of training data. A\nfew-shot online continual learning (FS-OCL) setting has been proposed to\naddress more realistic scenarios where robots must learn from a non-repeated\nsparse data stream. To enable truly autonomous life-long learning, an\nadditional challenge of detecting novelties and learning new items without\nsupervision needs to be addressed. We address this challenge with our new\nprototype-based approach called Continually Learning Prototypes (CLP). In\naddition to being capable of FS-OCL learning, CLP also detects novel objects\nand learns them without supervision. To mitigate forgetting, CLP utilizes a\nnovel metaplasticity mechanism that adapts the learning rate individually per\nprototype. CLP is rehearsal-free, hence does not require a memory buffer, and\nis compatible with neuromorphic hardware, characterized by ultra-low power\nconsumption, real-time processing abilities, and on-chip learning. Indeed, we\nhave open-sourced a simple version of CLP in the neuromorphic software\nframework Lava, targeting Intel's neuromorphic chip Loihi 2. We evaluate CLP\non a robotic vision dataset, OpenLORIS. 
In a low-instance FS-OCL scenario, CLP\nshows state-of-the-art results. In the open world, CLP detects novelties with\nsuperior precision and recall and learns features of the detected novel classes\nwithout supervision, achieving a strong baseline of 99% base class and 65%/76%\n(5-shot/10-shot) novel class accuracy.\n","authors":["Elvin Hajizada","Balachandran Swaminathan","Yulia Sandamirskaya"],"pdf_url":"https://arxiv.org/pdf/2404.00418v1.pdf","comment":"Submitted to IEEE/RSJ International Conference on Intelligent Robots\n and Systems (IROS)"},{"id":"http://arxiv.org/abs/2404.00417v1","updated":"2024-03-30T16:53:10Z","published":"2024-03-30T16:53:10Z","title":"Orchestrate Latent Expertise: Advancing Online Continual Learning with\n Multi-Level Supervision and Reverse Self-Distillation","summary":" To accommodate real-world dynamics, artificial intelligence systems need to\ncope with sequentially arriving content in an online manner. Beyond regular\nContinual Learning (CL) attempting to address catastrophic forgetting with\noffline training of each task, Online Continual Learning (OCL) is a more\nchallenging yet realistic setting that performs CL in a one-pass data stream.\nCurrent OCL methods primarily rely on memory replay of old training samples.\nHowever, a notable gap from CL to OCL stems from the additional\noverfitting-underfitting dilemma associated with the use of rehearsal buffers:\nthe inadequate learning of new training samples (underfitting) and the repeated\nlearning of a few old training samples (overfitting). To this end, we introduce\na novel approach, Multi-level Online Sequential Experts (MOSE), which\ncultivates the model as stacked sub-experts, integrating multi-level\nsupervision and reverse self-distillation. Supervision signals across multiple\nstages facilitate appropriate convergence of the new task while gathering\nvarious strengths from experts by knowledge distillation mitigates the\nperformance decline of old tasks. MOSE demonstrates remarkable efficacy in\nlearning new samples and preserving past knowledge through multi-level experts,\nthereby significantly advancing OCL performance over state-of-the-art baselines\n(e.g., up to 7.3% on Split CIFAR-100 and 6.1% on Split Tiny-ImageNet).\n","authors":["HongWei Yan","Liyuan Wang","Kaisheng Ma","Yi Zhong"],"pdf_url":"https://arxiv.org/pdf/2404.00417v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00412v1","updated":"2024-03-30T16:43:40Z","published":"2024-03-30T16:43:40Z","title":"SVGCraft: Beyond Single Object Text-to-SVG Synthesis with Comprehensive\n Canvas Layout","summary":" Generating VectorArt from text prompts is a challenging vision task,\nrequiring diverse yet realistic depictions of the seen as well as unseen\nentities. However, existing research has been mostly limited to the generation\nof single objects, rather than comprehensive scenes comprising multiple\nelements. In response, this work introduces SVGCraft, a novel end-to-end\nframework for the creation of vector graphics depicting entire scenes from\ntextual descriptions. Utilizing a pre-trained LLM for layout generation from\ntext prompts, this framework introduces a technique for producing masked\nlatents in specified bounding boxes for accurate object placement. It\nintroduces a fusion mechanism for integrating attention maps and employs a\ndiffusion U-Net for coherent composition, speeding up the drawing process. 
The\nresulting SVG is optimized using a pre-trained encoder and LPIPS loss with\nopacity modulation to maximize similarity. Additionally, this work explores the\npotential of primitive shapes in facilitating canvas completion in constrained\nenvironments. Through both qualitative and quantitative assessments, SVGCraft\nis demonstrated to surpass prior works in abstraction, recognizability, and\ndetail, as evidenced by its performance metrics (CLIP-T: 0.4563, Cosine\nSimilarity: 0.6342, Confusion: 0.66, Aesthetic: 6.7832). The code will be\navailable at https://github.com/ayanban011/SVGCraft.\n","authors":["Ayan Banerjee","Nityanand Mathur","Josep Lladós","Umapada Pal","Anjan Dutta"],"pdf_url":"https://arxiv.org/pdf/2404.00412v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00409v1","updated":"2024-03-30T16:35:38Z","published":"2024-03-30T16:35:38Z","title":"3DGSR: Implicit Surface Reconstruction with 3D Gaussian Splatting","summary":" In this paper, we present an implicit surface reconstruction method with 3D\nGaussian Splatting (3DGS), namely 3DGSR, that allows for accurate 3D\nreconstruction with intricate details while inheriting the high efficiency and\nrendering quality of 3DGS. The key insight is incorporating an implicit signed\ndistance field (SDF) within 3D Gaussians to enable them to be aligned and\njointly optimized. First, we introduce a differentiable SDF-to-opacity\ntransformation function that converts SDF values into corresponding Gaussians'\nopacities. This function connects the SDF and 3D Gaussians, allowing for\nunified optimization and enforcing surface constraints on the 3D Gaussians.\nDuring learning, optimizing the 3D Gaussians provides supervisory signals for\nSDF learning, enabling the reconstruction of intricate details. However, this\nonly provides sparse supervisory signals to the SDF at locations occupied by\nGaussians, which is insufficient for learning a continuous SDF. Then, to\naddress this limitation, we incorporate volumetric rendering and align the\nrendered geometric attributes (depth, normal) with those derived from 3D\nGaussians. This consistency regularization introduces supervisory signals to\nlocations not covered by discrete 3D Gaussians, effectively eliminating\nredundant surfaces outside the Gaussian sampling range. Our extensive\nexperimental results demonstrate that our 3DGSR method enables high-quality 3D\nsurface reconstruction while preserving the efficiency and rendering quality of\n3DGS. Besides, our method competes favorably with leading surface\nreconstruction techniques while offering a more efficient learning process and\nmuch better rendering qualities. The code will be available at\nhttps://github.com/CVMI-Lab/3DGSR.\n","authors":["Xiaoyang Lyu","Yang-Tian Sun","Yi-Hua Huang","Xiuzhe Wu","Ziyi Yang","Yilun Chen","Jiangmiao Pang","Xiaojuan Qi"],"pdf_url":"https://arxiv.org/pdf/2404.00409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00385v1","updated":"2024-03-30T14:58:40Z","published":"2024-03-30T14:58:40Z","title":"Constrained Layout Generation with Factor Graphs","summary":" This paper addresses the challenge of object-centric layout generation under\nspatial constraints, seen in multiple domains including floorplan design\nprocess. The design process typically involves specifying a set of spatial\nconstraints that include object attributes like size and inter-object relations\nsuch as relative positioning. 
Existing works, which typically represent objects\nas single nodes, lack the granularity to accurately model complex interactions\nbetween objects. For instance, often only certain parts of an object, like a\nroom's right wall, interact with adjacent objects. To address this gap, we\nintroduce a factor graph based approach with four latent variable nodes for\neach room, and a factor node for each constraint. The factor nodes represent\ndependencies among the variables to which they are connected, effectively\ncapturing constraints that are potentially of a higher order. We then develop\nmessage-passing on the bipartite graph, forming a factor graph neural network\nthat is trained to produce a floorplan that aligns with the desired\nrequirements. Our approach is simple and generates layouts faithful to the user\nrequirements, demonstrated by a large improvement in IOU scores over existing\nmethods. Additionally, our approach, being inferential and accurate, is\nwell-suited to the practical human-in-the-loop design process where\nspecifications evolve iteratively, offering a practical and powerful tool for\nAI-guided design.\n","authors":["Mohammed Haroon Dupty","Yanfei Dong","Sicong Leng","Guoji Fu","Yong Liang Goh","Wei Lu","Wee Sun Lee"],"pdf_url":"https://arxiv.org/pdf/2404.00385v1.pdf","comment":"To be published at IEEE/CVF CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00384v1","updated":"2024-03-30T14:51:07Z","published":"2024-03-30T14:51:07Z","title":"TTD: Text-Tag Self-Distillation Enhancing Image-Text Alignment in CLIP\n to Alleviate Single Tag Bias","summary":" We identify a critical bias in contemporary CLIP-based models, which we\ndenote as \\textit{single tag bias}. This bias manifests as a disproportionate\nfocus on a singular tag (word) while neglecting other pertinent tags, stemming\nfrom CLIP's text embeddings that prioritize one specific tag in image-text\nrelationships. When deconstructing text into individual tags, only one tag\ntends to have high relevancy with CLIP's image embedding, leading to an\nimbalanced tag relevancy. This results in an uneven alignment among multiple\ntags present in the text. To tackle this challenge, we introduce a novel\ntwo-step fine-tuning approach. First, our method leverages the similarity\nbetween tags and their nearest pixels for scoring, enabling the extraction of\nimage-relevant tags from the text. Second, we present a self-distillation\nstrategy aimed at aligning the combined masks from extracted tags with the\ntext-derived mask. This approach mitigates the single tag bias, thereby\nsignificantly improving the alignment of CLIP's model without necessitating\nadditional data or supervision. Our technique demonstrates model-agnostic\nimprovements in multi-tag classification and segmentation tasks, surpassing\ncompeting methods that rely on external resources. Code is available at\nhttps://github.com/shjo-april/TTD.\n","authors":["Sanghyun Jo","Soohyun Ryu","Sungyub Kim","Eunho Yang","Kyungsu Kim"],"pdf_url":"https://arxiv.org/pdf/2404.00384v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00380v1","updated":"2024-03-30T14:35:31Z","published":"2024-03-30T14:35:31Z","title":"DHR: Dual Features-Driven Hierarchical Rebalancing in Inter- and\n Intra-Class Regions for Weakly-Supervised Semantic Segmentation","summary":" Weakly-supervised semantic segmentation (WSS) ensures high-quality\nsegmentation with limited data and excels when employed as input seed masks for\nlarge-scale vision models such as Segment Anything. 
However, WSS faces\nchallenges related to minor classes since those are overlooked in images with\nadjacent multiple classes, a limitation originating from the overfitting of\ntraditional expansion methods like Random Walk. We first address this by\nemploying unsupervised and weakly-supervised feature maps instead of\nconventional methodologies, allowing for hierarchical mask enhancement. This\nmethod distinctly categorizes higher-level classes and subsequently separates\ntheir associated lower-level classes, ensuring all classes are correctly\nrestored in the mask without losing minor ones. Our approach, validated through\nextensive experimentation, significantly improves WSS across five benchmarks\n(VOC: 79.8\\%, COCO: 53.9\\%, Context: 49.0\\%, ADE: 32.9\\%, Stuff: 37.4\\%),\nreducing the gap with fully supervised methods by over 84\\% on the VOC\nvalidation set. Code is available at https://github.com/shjo-april/DHR.\n","authors":["Sanghyun Jo","Fei Pan","In-Jae Yu","Kyungsu Kim"],"pdf_url":"https://arxiv.org/pdf/2404.00380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00373v1","updated":"2024-03-30T13:58:19Z","published":"2024-03-30T13:58:19Z","title":"The Devil is in the Edges: Monocular Depth Estimation with Edge-aware\n Consistency Fusion","summary":" This paper presents a novel monocular depth estimation method, named ECFNet,\nfor estimating high-quality monocular depth with clear edges and valid overall\nstructure from a single RGB image. We make a thorough inquiry about the key\nfactor that affects the edge depth estimation of the MDE networks, and come to\na ratiocination that the edge information itself plays a critical role in\npredicting depth details. Driven by this analysis, we propose to explicitly\nemploy the image edges as input for ECFNet and fuse the initial depths from\ndifferent sources to produce the final depth. Specifically, ECFNet first uses a\nhybrid edge detection strategy to get the edge map and edge-highlighted image\nfrom the input image, and then leverages a pre-trained MDE network to infer the\ninitial depths of the aforementioned three images. After that, ECFNet utilizes\na layered fusion module (LFM) to fuse the initial depth, which will be further\nupdated by a depth consistency module (DCM) to form the final estimation.\nExtensive experimental results on public datasets and ablation studies indicate\nthat our method achieves state-of-the-art performance. Project page:\nhttps://zrealli.github.io/edgedepth.\n","authors":["Pengzhi Li","Yikang Ding","Haohan Wang","Chengshuai Tang","Zhiheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.00373v1.pdf","comment":"17 pages, 19 figures"},{"id":"http://arxiv.org/abs/2404.00368v1","updated":"2024-03-30T13:41:57Z","published":"2024-03-30T13:41:57Z","title":"Towards Variable and Coordinated Holistic Co-Speech Motion Generation","summary":" This paper addresses the problem of generating lifelike holistic co-speech\nmotions for 3D avatars, focusing on two key aspects: variability and\ncoordination. Variability allows the avatar to exhibit a wide range of motions\neven with similar speech content, while coordination ensures a harmonious\nalignment among facial expressions, hand gestures, and body poses. We aim to\nachieve both with ProbTalk, a unified probabilistic framework designed to\njointly model facial, hand, and body movements in speech. ProbTalk builds on\nthe variational autoencoder (VAE) architecture and incorporates three core\ndesigns. 
First, we introduce product quantization (PQ) to the VAE, which\nenriches the representation of complex holistic motion. Second, we devise a\nnovel non-autoregressive model that embeds 2D positional encoding into the\nproduct-quantized representation, thereby preserving essential structure\ninformation of the PQ codes. Last, we employ a secondary stage to refine the\npreliminary prediction, further sharpening the high-frequency details. Coupling\nthese three designs enables ProbTalk to generate natural and diverse holistic\nco-speech motions, outperforming several state-of-the-art methods in\nqualitative and quantitative evaluations, particularly in terms of realism. Our\ncode and model will be released for research purposes at\nhttps://feifeifeiliu.github.io/probtalk/.\n","authors":["Yifei Liu","Qiong Cao","Yandong Wen","Huaiguang Jiang","Changxing Ding"],"pdf_url":"https://arxiv.org/pdf/2404.00368v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00366v1","updated":"2024-03-30T13:38:07Z","published":"2024-03-30T13:38:07Z","title":"Efficient Multi-branch Segmentation Network for Situation Awareness in\n Autonomous Navigation","summary":" Real-time and high-precision situational awareness technology is critical for\nautonomous navigation of unmanned surface vehicles (USVs). In particular,\nrobust and fast obstacle semantic segmentation methods are essential. However,\ndistinguishing between the sea and the sky is challenging due to the\ndifferences between port and maritime environments. In this study, we built a\ndataset that captured perspectives from USVs and unmanned aerial vehicles in a\nmaritime port environment and analysed the data features. Statistical analysis\nrevealed a high correlation between the distribution of the sea and sky and row\npositional information. Based on this finding, a three-branch semantic\nsegmentation network with a row position encoding module (RPEM) was proposed to\nimprove the prediction accuracy between the sea and the sky. The proposed RPEM\nhighlights the effect of row coordinates on feature extraction. Compared to the\nbaseline, the three-branch network with RPEM significantly improved the ability\nto distinguish between the sea and the sky without significantly reducing the\ncomputational speed.\n","authors":["Guan-Cheng Zhou","Chen Chengb","Yan-zhou Chena"],"pdf_url":"https://arxiv.org/pdf/2404.00366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00362v1","updated":"2024-03-30T13:28:53Z","published":"2024-03-30T13:28:53Z","title":"STBA: Towards Evaluating the Robustness of DNNs for Query-Limited\n Black-box Scenario","summary":" Many attack techniques have been proposed to explore the vulnerability of\nDNNs and further help to improve their robustness. Despite the significant\nprogress made recently, existing black-box attack methods still suffer from\nunsatisfactory performance due to the vast number of queries needed to optimize\ndesired perturbations. Besides, the other critical challenge is that\nadversarial examples built in a noise-adding manner are abnormal and struggle\nto successfully attack robust models, whose robustness is enhanced by\nadversarial training against small perturbations. There is no doubt that these\ntwo issues mentioned above will significantly increase the risk of exposure and\nresult in a failure to dig deeply into the vulnerability of DNNs. Hence, it is\nnecessary to evaluate DNNs' fragility sufficiently under query-limited settings\nin a non-additional way. 
In this paper, we propose the Spatial Transform\nBlack-box Attack (STBA), a novel framework to craft formidable adversarial\nexamples in the query-limited scenario. Specifically, STBA introduces a flow\nfield to the high-frequency part of clean images to generate adversarial\nexamples and adopts the following two processes to enhance their naturalness\nand significantly improve the query efficiency: a) we apply an estimated flow\nfield to the high-frequency part of clean images to generate adversarial\nexamples instead of introducing external noise to the benign image, and b) we\nleverage an efficient gradient estimation method based on a batch of samples to\noptimize such an ideal flow field under query-limited settings. Compared to\nexisting score-based black-box baselines, extensive experiments indicated that\nSTBA could effectively improve the imperceptibility of the adversarial examples\nand remarkably boost the attack success rate under query-limited settings.\n","authors":["Renyang Liu","Kwok-Yan Lam","Wei Zhou","Sixing Wu","Jun Zhao","Dongting Hu","Mingming Gong"],"pdf_url":"https://arxiv.org/pdf/2404.00362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00360v1","updated":"2024-03-30T13:24:58Z","published":"2024-03-30T13:24:58Z","title":"Reusable Architecture Growth for Continual Stereo Matching","summary":" The remarkable performance of recent stereo depth estimation models benefits\nfrom the successful use of convolutional neural networks to regress dense\ndisparity. Akin to most tasks, this needs gathering training data that covers a\nnumber of heterogeneous scenes at deployment time. However, training samples\nare typically acquired continuously in practical applications, making the\ncapability to learn new scenes continually even more crucial. For this purpose,\nwe propose to perform continual stereo matching where a model is tasked to 1)\ncontinually learn new scenes, 2) overcome forgetting previously learned scenes,\nand 3) continuously predict disparities at inference. We achieve this goal by\nintroducing a Reusable Architecture Growth (RAG) framework. RAG leverages\ntask-specific neural unit search and architecture growth to learn new scenes\ncontinually in both supervised and self-supervised manners. It can maintain\nhigh reusability during growth by reusing previous units while obtaining good\nperformance. Additionally, we present a Scene Router module to adaptively\nselect the scene-specific architecture path at inference. Comprehensive\nexperiments on numerous datasets show that our framework performs impressively\nin various weather, road, and city circumstances and surpasses the\nstate-of-the-art methods in more challenging cross-dataset settings. 
Further\nexperiments also demonstrate the adaptability of our method to unseen scenes,\nwhich can facilitate end-to-end stereo architecture learning and practical\ndeployment.\n","authors":["Chenghao Zhang","Gaofeng Meng","Bin Fan","Kun Tian","Zhaoxiang Zhang","Shiming Xiang","Chunhong Pan"],"pdf_url":"https://arxiv.org/pdf/2404.00360v1.pdf","comment":"Extended version of CVPR 2022 paper \"Continual Stereo Matching of\n Continuous Driving Scenes with Growing Architecture\" - Accepted to TPAMI in\n 2024"},{"id":"http://arxiv.org/abs/2404.00358v1","updated":"2024-03-30T13:20:04Z","published":"2024-03-30T13:20:04Z","title":"Spread Your Wings: A Radial Strip Transformer for Image Deblurring","summary":" Exploring motion information is important for the motion deblurring task.\nRecent the window-based transformer approaches have achieved decent performance\nin image deblurring. Note that the motion causing blurry results is usually\ncomposed of translation and rotation movements and the window-shift operation\nin the Cartesian coordinate system by the window-based transformer approaches\nonly directly explores translation motion in orthogonal directions. Thus, these\nmethods have the limitation of modeling the rotation part. To alleviate this\nproblem, we introduce the polar coordinate-based transformer, which has the\nangles and distance to explore rotation motion and translation information\ntogether. In this paper, we propose a Radial Strip Transformer (RST), which is\na transformer-based architecture that restores the blur images in a polar\ncoordinate system instead of a Cartesian one. RST contains a dynamic radial\nembedding module (DRE) to extract the shallow feature by a radial deformable\nconvolution. We design a polar mask layer to generate the offsets for the\ndeformable convolution, which can reshape the convolution kernel along the\nradius to better capture the rotation motion information. Furthermore, we\nproposed a radial strip attention solver (RSAS) as deep feature extraction,\nwhere the relationship of windows is organized by azimuth and radius. This\nattention module contains radial strip windows to reweight image features in\nthe polar coordinate, which preserves more useful information in rotation and\ntranslation motion together for better recovering the sharp images.\nExperimental results on six synthesis and real-world datasets prove that our\nmethod performs favorably against other SOTA methods for the image deblurring\ntask.\n","authors":["Duosheng Chen","Shihao Zhou","Jinshan Pan","Jinglei Shi","Lishen Qu","Jufeng Yang"],"pdf_url":"https://arxiv.org/pdf/2404.00358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00351v1","updated":"2024-03-30T13:04:46Z","published":"2024-03-30T13:04:46Z","title":"Rethinking Attention-Based Multiple Instance Learning for Whole-Slide\n Pathological Image Classification: An Instance Attribute Viewpoint","summary":" Multiple instance learning (MIL) is a robust paradigm for whole-slide\npathological image (WSI) analysis, processing gigapixel-resolution images with\nslide-level labels. As pioneering efforts, attention-based MIL (ABMIL) and its\nvariants are increasingly becoming popular due to the characteristics of\nsimultaneously handling clinical diagnosis and tumor localization. However, the\nattention mechanism exhibits limitations in discriminating between instances,\nwhich often misclassifies tissues and potentially impairs MIL performance. 
This\npaper proposes an Attribute-Driven MIL (AttriMIL) framework to address these\nissues. Concretely, we dissect the calculation process of ABMIL and present an\nattribute scoring mechanism that measures the contribution of each instance to\nbag prediction effectively, quantifying instance attributes. Based on attribute\nquantification, we develop a spatial attribute constraint and an attribute\nranking constraint to model instance correlations within and across slides,\nrespectively. These constraints encourage the network to capture the spatial\ncorrelation and semantic similarity of instances, improving the ability of\nAttriMIL to distinguish tissue types and identify challenging instances.\nAdditionally, AttriMIL employs a histopathology adaptive backbone that\nmaximizes the pre-trained model's feature extraction capability for collecting\npathological features. Extensive experiments on three public benchmarks\ndemonstrate that our AttriMIL outperforms existing state-of-the-art frameworks\nacross multiple evaluation metrics. The implementation code is available at\nhttps://github.com/MedCAI/AttriMIL.\n","authors":["Linghan Cai","Shenjin Huang","Ye Zhang","Jinpeng Lu","Yongbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.00351v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.00349v1","updated":"2024-03-30T12:55:19Z","published":"2024-03-30T12:55:19Z","title":"SGDFormer: One-stage Transformer-based Architecture for Cross-Spectral\n Stereo Image Guided Denoising","summary":" Cross-spectral image guided denoising has shown its great potential in\nrecovering clean images with rich details, such as using the near-infrared\nimage to guide the denoising process of the visible one. To obtain such image\npairs, a feasible and economical way is to employ a stereo system, which is\nwidely used on mobile devices. Current works attempt to generate an aligned\nguidance image to handle the disparity between two images. However, due to\nocclusion, spectral differences and noise degradation, the aligned guidance\nimage generally exists ghosting and artifacts, leading to an unsatisfactory\ndenoised result. To address this issue, we propose a one-stage\ntransformer-based architecture, named SGDFormer, for cross-spectral Stereo\nimage Guided Denoising. The architecture integrates the correspondence modeling\nand feature fusion of stereo images into a unified network. Our transformer\nblock contains a noise-robust cross-attention (NRCA) module and a spatially\nvariant feature fusion (SVFF) module. The NRCA module captures the long-range\ncorrespondence of two images in a coarse-to-fine manner to alleviate the\ninterference of noise. 
The SVFF module further enhances salient structures and\nsuppresses harmful artifacts through dynamically selecting useful information.\nThanks to the above design, our SGDFormer can restore artifact-free images with\nfine structures, and achieves state-of-the-art performance on various datasets.\nAdditionally, our SGDFormer can be extended to handle other unaligned\ncross-model guided restoration tasks such as guided depth super-resolution.\n","authors":["Runmin Zhang","Zhu Yu","Zehua Sheng","Jiacheng Ying","Si-Yuan Cao","Shu-Jie Chen","Bailin Yang","Junwei Li","Hui-Liang Shen"],"pdf_url":"https://arxiv.org/pdf/2404.00349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00345v1","updated":"2024-03-30T12:50:25Z","published":"2024-03-30T12:50:25Z","title":"MaGRITTe: Manipulative and Generative 3D Realization from Image, Topview\n and Text","summary":" The generation of 3D scenes from user-specified conditions offers a promising\navenue for alleviating the production burden in 3D applications. Previous\nstudies required significant effort to realize the desired scene, owing to\nlimited control conditions. We propose a method for controlling and generating\n3D scenes under multimodal conditions using partial images, layout information\nrepresented in the top view, and text prompts. Combining these conditions to\ngenerate a 3D scene involves the following significant difficulties: (1) the\ncreation of large datasets, (2) reflection on the interaction of multimodal\nconditions, and (3) domain dependence of the layout conditions. We decompose\nthe process of 3D scene generation into 2D image generation from the given\nconditions and 3D scene generation from 2D images. 2D image generation is\nachieved by fine-tuning a pretrained text-to-image model with a small\nartificial dataset of partial images and layouts, and 3D scene generation is\nachieved by layout-conditioned depth estimation and neural radiance fields\n(NeRF), thereby avoiding the creation of large datasets. The use of a common\nrepresentation of spatial information using 360-degree images allows for the\nconsideration of multimodal condition interactions and reduces the domain\ndependence of the layout control. The experimental results qualitatively and\nquantitatively demonstrated that the proposed method can generate 3D scenes in\ndiverse domains, from indoor to outdoor, according to multimodal conditions.\n","authors":["Takayuki Hara","Tatsuya Harada"],"pdf_url":"https://arxiv.org/pdf/2404.00345v1.pdf","comment":"Project Page: https://hara012.github.io/MaGRITTe-project"},{"id":"http://arxiv.org/abs/2404.00335v1","updated":"2024-03-30T12:10:34Z","published":"2024-03-30T12:10:34Z","title":"Learing Trimaps via Clicks for Image Matting","summary":" Despite significant advancements in image matting, existing models heavily\ndepend on manually-drawn trimaps for accurate results in natural image\nscenarios. However, the process of obtaining trimaps is time-consuming, lacking\nuser-friendliness and device compatibility. This reliance greatly limits the\npractical application of all trimap-based matting methods. To address this\nissue, we introduce Click2Trimap, an interactive model capable of predicting\nhigh-quality trimaps and alpha mattes with minimal user click inputs. Through\nanalyzing real users' behavioral logic and characteristics of trimaps, we\nsuccessfully propose a powerful iterative three-class training strategy and a\ndedicated simulation function, making Click2Trimap exhibit versatility across\nvarious scenarios. 
Quantitative and qualitative assessments on synthetic and\nreal-world matting datasets demonstrate Click2Trimap's superior performance\ncompared to all existing trimap-free matting methods. Especially, in the user\nstudy, Click2Trimap achieves high-quality trimap and matting predictions in\njust an average of 5 seconds per image, demonstrating its substantial practical\nvalue in real-world applications.\n","authors":["Chenyi Zhang","Yihan Hu","Henghui Ding","Humphrey Shi","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2404.00335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00330v1","updated":"2024-03-30T12:01:04Z","published":"2024-03-30T12:01:04Z","title":"Memory-Scalable and Simplified Functional Map Learning","summary":" Deep functional maps have emerged in recent years as a prominent\nlearning-based framework for non-rigid shape matching problems. While early\nmethods in this domain only focused on learning in the functional domain, the\nlatest techniques have demonstrated that by promoting consistency between\nfunctional and pointwise maps leads to significant improvements in accuracy.\nUnfortunately, existing approaches rely heavily on the computation of large\ndense matrices arising from soft pointwise maps, which compromises their\nefficiency and scalability. To address this limitation, we introduce a novel\nmemory-scalable and efficient functional map learning pipeline. By leveraging\nthe specific structure of functional maps, we offer the possibility to achieve\nidentical results without ever storing the pointwise map in memory.\nFurthermore, based on the same approach, we present a differentiable map\nrefinement layer adapted from an existing axiomatic refinement algorithm.\nUnlike many functional map learning methods, which use this algorithm at a\npost-processing step, ours can be easily used at train time, enabling to\nenforce consistency between the refined and initial versions of the map. Our\nresulting approach is both simpler, more efficient and more numerically stable,\nby avoiding differentiation through a linear system, while achieving close to\nstate-of-the-art results in challenging scenarios.\n","authors":["Robin Magnet","Maks Ovsjanikov"],"pdf_url":"https://arxiv.org/pdf/2404.00330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00327v1","updated":"2024-03-30T11:41:19Z","published":"2024-03-30T11:41:19Z","title":"YNetr: Dual-Encoder architecture on Plain Scan Liver Tumors (PSLT)","summary":" Background: Liver tumors are abnormal growths in the liver that can be either\nbenign or malignant, with liver cancer being a significant health concern\nworldwide. However, there is no dataset for plain scan segmentation of liver\ntumors, nor any related algorithms. To fill this gap, we propose Plain Scan\nLiver Tumors(PSLT) and YNetr. Methods: A collection of 40 liver tumor plain\nscan segmentation datasets was assembled and annotated. Concurrently, we\nutilized Dice coefficient as the metric for assessing the segmentation outcomes\nproduced by YNetr, having advantage of capturing different frequency\ninformation. Results: The YNetr model achieved a Dice coefficient of 62.63% on\nthe PSLT dataset, surpassing the other publicly available model by an accuracy\nmargin of 1.22%. Comparative evaluations were conducted against a range of\nmodels including UNet 3+, XNet, UNetr, Swin UNetr, Trans-BTS, COTr, nnUNetv2\n(2D), nnUNetv2 (3D fullres), MedNext (2D) and MedNext(3D fullres). 
Conclusions:\nWe not only proposed a dataset named PSLT(Plain Scan Liver Tumors), but also\nexplored a structure called YNetr that utilizes wavelet transform to extract\ndifferent frequency information, which having the SOTA in PSLT by experiments.\n","authors":["Wen Sheng","Zhong Zheng","Jiajun Liu","Han Lu","Hanyuan Zhang","Zhengyong Jiang","Zhihong Zhang","Daoping Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.00327v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2404.00323v1","updated":"2024-03-30T11:28:05Z","published":"2024-03-30T11:28:05Z","title":"CLIP-driven Outliers Synthesis for few-shot OOD detection","summary":" Few-shot OOD detection focuses on recognizing out-of-distribution (OOD)\nimages that belong to classes unseen during training, with the use of only a\nsmall number of labeled in-distribution (ID) images. Up to now, a mainstream\nstrategy is based on large-scale vision-language models, such as CLIP. However,\nthese methods overlook a crucial issue: the lack of reliable OOD supervision\ninformation, which can lead to biased boundaries between in-distribution (ID)\nand OOD. To tackle this problem, we propose CLIP-driven Outliers\nSynthesis~(CLIP-OS). Firstly, CLIP-OS enhances patch-level features' perception\nby newly proposed patch uniform convolution, and adaptively obtains the\nproportion of ID-relevant information by employing CLIP-surgery-discrepancy,\nthus achieving separation between ID-relevant and ID-irrelevant. Next, CLIP-OS\nsynthesizes reliable OOD data by mixing up ID-relevant features from different\nclasses to provide OOD supervision information. Afterward, CLIP-OS leverages\nsynthetic OOD samples by unknown-aware prompt learning to enhance the\nseparability of ID and OOD. Extensive experiments across multiple benchmarks\ndemonstrate that CLIP-OS achieves superior few-shot OOD detection capability.\n","authors":["Hao Sun","Rundong He","Zhongyi Han","Zhicong Lin","Yongshun Gong","Yilong Yin"],"pdf_url":"https://arxiv.org/pdf/2404.00323v1.pdf","comment":"9 pages,5 figures"},{"id":"http://arxiv.org/abs/2404.00322v1","updated":"2024-03-30T11:21:11Z","published":"2024-03-30T11:21:11Z","title":"Instrument-tissue Interaction Detection Framework for Surgical Video\n Understanding","summary":" Instrument-tissue interaction detection task, which helps understand surgical\nactivities, is vital for constructing computer-assisted surgery systems but\nwith many challenges. Firstly, most models represent instrument-tissue\ninteraction in a coarse-grained way which only focuses on classification and\nlacks the ability to automatically detect instruments and tissues. Secondly,\nexisting works do not fully consider relations between intra- and inter-frame\nof instruments and tissues. In the paper, we propose to represent\ninstrument-tissue interaction as quintuple and present an\nInstrument-Tissue Interaction Detection Network (ITIDNet) to detect the\nquintuple for surgery videos understanding. Specifically, we propose a Snippet\nConsecutive Feature (SCF) Layer to enhance features by modeling relationships\nof proposals in the current frame using global context information in the video\nsnippet. We also propose a Spatial Corresponding Attention (SCA) Layer to\nincorporate features of proposals between adjacent frames through spatial\nencoding. 
To reason relationships between instruments and tissues, a Temporal\nGraph (TG) Layer is proposed with intra-frame connections to exploit\nrelationships between instruments and tissues in the same frame and inter-frame\nconnections to model the temporal information for the same instance. For\nevaluation, we build a cataract surgery video (PhacoQ) dataset and a\ncholecystectomy surgery video (CholecQ) dataset. Experimental results\ndemonstrate the promising performance of our model, which outperforms other\nstate-of-the-art models on both datasets.\n","authors":["Wenjun Lin","Yan Hu","Huazhu Fu","Mingming Yang","Chin-Boon Chng","Ryo Kawasaki","Cheekong Chui","Jiang Liu"],"pdf_url":"https://arxiv.org/pdf/2404.00322v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00318v1","updated":"2024-03-30T10:54:59Z","published":"2024-03-30T10:54:59Z","title":"Exploring Unseen Environments with Robots using Large Language and\n Vision Models through a Procedurally Generated 3D Scene Representation","summary":" Recent advancements in Generative Artificial Intelligence, particularly in\nthe realm of Large Language Models (LLMs) and Large Vision Language Models\n(LVLMs), have enabled the prospect of leveraging cognitive planners within\nrobotic systems. This work focuses on solving the object goal navigation\nproblem by mimicking human cognition to attend, perceive and store task\nspecific information and generate plans with the same. We introduce a\ncomprehensive framework capable of exploring an unfamiliar environment in\nsearch of an object by leveraging the capabilities of Large Language\nModels(LLMs) and Large Vision Language Models (LVLMs) in understanding the\nunderlying semantics of our world. A challenging task in using LLMs to generate\nhigh level sub-goals is to efficiently represent the environment around the\nrobot. We propose to use a 3D scene modular representation, with semantically\nrich descriptions of the object, to provide the LLM with task relevant\ninformation. But providing the LLM with a mass of contextual information (rich\n3D scene semantic representation), can lead to redundant and inefficient plans.\nWe propose to use an LLM based pruner that leverages the capabilities of\nin-context learning to prune out irrelevant goal specific information.\n","authors":["Arjun P S","Andrew Melnik","Gora Chand Nandi"],"pdf_url":"https://arxiv.org/pdf/2404.00318v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00313v1","updated":"2024-03-30T10:37:56Z","published":"2024-03-30T10:37:56Z","title":"Harmonizing Light and Darkness: A Symphony of Prior-guided Data\n Synthesis and Adaptive Focus for Nighttime Flare Removal","summary":" Intense light sources often produce flares in captured images at night, which\ndeteriorates the visual quality and negatively affects downstream applications.\nIn order to train an effective flare removal network, a reliable dataset is\nessential. The mainstream flare removal datasets are semi-synthetic to reduce\nhuman labour, but these datasets do not cover typical scenarios involving\nmultiple scattering flares. To tackle this issue, we synthesize a prior-guided\ndataset named Flare7K*, which contains multi-flare images where the brightness\nof flares adheres to the laws of illumination. Besides, flares tend to occupy\nlocalized regions of the image but existing networks perform flare removal on\nthe entire image and sometimes modify clean areas incorrectly. 
Therefore, we\npropose a plug-and-play Adaptive Focus Module (AFM) that can adaptively mask\nthe clean background areas and assist models in focusing on the regions\nseverely affected by flares. Extensive experiments demonstrate that our data\nsynthesis method can better simulate real-world scenes and several models\nequipped with AFM achieve state-of-the-art performance on the real-world test\ndataset.\n","authors":["Lishen Qu","Shihao Zhou","Jinshan Pan","Jinglei Shi","Duosheng Chen","Jufeng Yang"],"pdf_url":"https://arxiv.org/pdf/2404.00313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00312v1","updated":"2024-03-30T10:25:28Z","published":"2024-03-30T10:25:28Z","title":"Bayesian Exploration of Pre-trained Models for Low-shot Image\n Classification","summary":" Low-shot image classification is a fundamental task in computer vision, and\nthe emergence of large-scale vision-language models such as CLIP has greatly\nadvanced the forefront of research in this field. However, most existing\nCLIP-based methods lack the flexibility to effectively incorporate other\npre-trained models that encompass knowledge distinct from CLIP. To bridge the\ngap, this work proposes a simple and effective probabilistic model ensemble\nframework based on Gaussian processes, which have previously demonstrated\nremarkable efficacy in processing small data. We achieve the integration of\nprior knowledge by specifying the mean function with CLIP and the kernel\nfunction with an ensemble of deep kernels built upon various pre-trained\nmodels. By regressing the classification label directly, our framework enables\nanalytical inference, straightforward uncertainty quantification, and\nprincipled hyper-parameter tuning. Through extensive experiments on standard\nbenchmarks, we demonstrate that our method consistently outperforms competitive\nensemble baselines regarding predictive performance. Additionally, we assess\nthe robustness of our method and the quality of the yielded uncertainty\nestimates on out-of-distribution datasets. We also illustrate that our method,\ndespite relying on label regression, still enjoys superior model calibration\ncompared to most deterministic baselines.\n","authors":["Yibo Miao","Yu Lei","Feng Zhou","Zhijie Deng"],"pdf_url":"https://arxiv.org/pdf/2404.00312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00308v1","updated":"2024-03-30T10:11:26Z","published":"2024-03-30T10:11:26Z","title":"ST-LLM: Large Language Models Are Effective Temporal Learners","summary":" Large Language Models (LLMs) have showcased impressive capabilities in text\ncomprehension and generation, prompting research efforts towards video LLMs to\nfacilitate human-AI interaction at the video level. However, how to effectively\nencode and understand videos in video-based dialogue systems remains to be\nsolved. In this paper, we investigate a straightforward yet unexplored\nquestion: Can we feed all spatial-temporal tokens into the LLM, thus delegating\nthe task of video sequence modeling to the LLMs? Surprisingly, this simple\napproach yields significant improvements in video understanding. Based upon\nthis, we propose ST-LLM, an effective video-LLM baseline with Spatial-Temporal\nsequence modeling inside LLM. Furthermore, to address the overhead and\nstability issues introduced by uncompressed video tokens within LLMs, we\ndevelop a dynamic masking strategy with tailor-made training objectives. 
For\nparticularly long videos, we have also designed a global-local input module to\nbalance efficiency and effectiveness. Consequently, we harness LLM for\nproficient spatial-temporal modeling, while upholding efficiency and stability.\nExtensive experimental results attest to the effectiveness of our method.\nThrough a more concise model and training pipeline, ST-LLM establishes a new\nstate-of-the-art result on VideoChatGPT-Bench and MVBench. Codes have been\navailable at https://github.com/TencentARC/ST-LLM.\n","authors":["Ruyang Liu","Chen Li","Haoran Tang","Yixiao Ge","Ying Shan","Ge Li"],"pdf_url":"https://arxiv.org/pdf/2404.00308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00301v1","updated":"2024-03-30T09:43:40Z","published":"2024-03-30T09:43:40Z","title":"Monocular Identity-Conditioned Facial Reflectance Reconstruction","summary":" Recent 3D face reconstruction methods have made remarkable advancements, yet\nthere remain huge challenges in monocular high-quality facial reflectance\nreconstruction. Existing methods rely on a large amount of light-stage captured\ndata to learn facial reflectance models. However, the lack of subject diversity\nposes challenges in achieving good generalization and widespread applicability.\nIn this paper, we learn the reflectance prior in image space rather than UV\nspace and present a framework named ID2Reflectance. Our framework can directly\nestimate the reflectance maps of a single image while using limited reflectance\ndata for training. Our key insight is that reflectance data shares facial\nstructures with RGB faces, which enables obtaining expressive facial prior from\ninexpensive RGB data thus reducing the dependency on reflectance data. We first\nlearn a high-quality prior for facial reflectance. Specifically, we pretrain\nmulti-domain facial feature codebooks and design a codebook fusion method to\nalign the reflectance and RGB domains. Then, we propose an identity-conditioned\nswapping module that injects facial identity from the target image into the\npre-trained autoencoder to modify the identity of the source reflectance image.\nFinally, we stitch multi-view swapped reflectance images to obtain renderable\nassets. Extensive experiments demonstrate that our method exhibits excellent\ngeneralization capability and achieves state-of-the-art facial reflectance\nreconstruction results for in-the-wild faces. Our project page is\nhttps://xingyuren.github.io/id2reflectance/.\n","authors":["Xingyu Ren","Jiankang Deng","Yuhao Cheng","Jia Guo","Chao Ma","Yichao Yan","Wenhan Zhu","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2404.00301v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00292v1","updated":"2024-03-30T08:51:23Z","published":"2024-03-30T08:51:23Z","title":"LAKE-RED: Camouflaged Images Generation by Latent Background Knowledge\n Retrieval-Augmented Diffusion","summary":" Camouflaged vision perception is an important vision task with numerous\npractical applications. Due to the expensive collection and labeling costs,\nthis community struggles with a major bottleneck that the species category of\nits datasets is limited to a small number of object species. However, the\nexisting camouflaged generation methods require specifying the background\nmanually, thus failing to extend the camouflaged sample diversity in a low-cost\nmanner. In this paper, we propose a Latent Background Knowledge\nRetrieval-Augmented Diffusion (LAKE-RED) for camouflaged image generation. 
To\nour knowledge, our contributions mainly include: (1) For the first time, we\npropose a camouflaged generation paradigm that does not need to receive any\nbackground inputs. (2) Our LAKE-RED is the first knowledge retrieval-augmented\nmethod with interpretability for camouflaged generation, in which we propose an\nidea that knowledge retrieval and reasoning enhancement are separated\nexplicitly, to alleviate the task-specific challenges. Moreover, our method is\nnot restricted to specific foreground targets or backgrounds, offering a\npotential for extending camouflaged vision perception to more diverse domains.\n(3) Experimental results demonstrate that our method outperforms the existing\napproaches, generating more realistic camouflage images.\n","authors":["Pancheng Zhao","Peng Xu","Pengda Qin","Deng-Ping Fan","Zhicheng Zhang","Guoli Jia","Bowen Zhou","Jufeng Yang"],"pdf_url":"https://arxiv.org/pdf/2404.00292v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00288v1","updated":"2024-03-30T08:42:34Z","published":"2024-03-30T08:42:34Z","title":"Seeing the Unseen: A Frequency Prompt Guided Transformer for Image\n Restoration","summary":" How to explore useful features from images as prompts to guide the deep image\nrestoration models is an effective way to solve image restoration. In contrast\nto mining spatial relations within images as prompt, which leads to\ncharacteristics of different frequencies being neglected and further remaining\nsubtle or undetectable artifacts in the restored image, we develop a Frequency\nPrompting image restoration method, dubbed FPro, which can effectively provide\nprompt components from a frequency perspective to guild the restoration model\naddress these differences. Specifically, we first decompose input features into\nseparate frequency parts via dynamically learned filters, where we introduce a\ngating mechanism for suppressing the less informative elements within the\nkernels. To propagate useful frequency information as prompt, we then propose a\ndual prompt block, consisting of a low-frequency prompt modulator (LPM) and a\nhigh-frequency prompt modulator (HPM), to handle signals from different bands\nrespectively. Each modulator contains a generation process to incorporate\nprompting components into the extracted frequency maps, and a modulation part\nthat modifies the prompt feature with the guidance of the decoder features.\nExperimental results on commonly used benchmarks have demonstrated the\nfavorable performance of our pipeline against SOTA methods on 5 image\nrestoration tasks, including deraining, deraindrop, demoir\\'eing, deblurring,\nand dehazing. The source code and pre-trained models will be available at\nhttps://github.com/joshyZhou/FPro.\n","authors":["Shihao Zhou","Jinshan Pan","Jinglei Shi","Duosheng Chen","Lishen Qu","Jufeng Yang"],"pdf_url":"https://arxiv.org/pdf/2404.00288v1.pdf","comment":"18 pages, 10 figrues"},{"id":"http://arxiv.org/abs/2404.00285v1","updated":"2024-03-30T08:37:19Z","published":"2024-03-30T08:37:19Z","title":"Long-Tailed Recognition on Binary Networks by Calibrating A Pre-trained\n Model","summary":" Deploying deep models in real-world scenarios entails a number of challenges,\nincluding computational efficiency and real-world (e.g., long-tailed) data\ndistributions. We address the combined challenge of learning long-tailed\ndistributions using highly resource-efficient binary neural networks as\nbackbones. 
Specifically, we propose a calibrate-and-distill framework that uses\noff-the-shelf pretrained full-precision models trained on balanced datasets to\nuse as teachers for distillation when learning binary networks on long-tailed\ndatasets. To better generalize to various datasets, we further propose a novel\nadversarial balancing among the terms in the objective function and an\nefficient multiresolution learning scheme. We conducted the largest empirical\nstudy in the literature using 15 datasets, including newly derived long-tailed\ndatasets from existing balanced datasets, and show that our proposed method\noutperforms prior art by large margins (>14.33% on average).\n","authors":["Jihun Kim","Dahyun Kim","Hyungrok Jung","Taeil Oh","Jonghyun Choi"],"pdf_url":"https://arxiv.org/pdf/2404.00285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00279v1","updated":"2024-03-30T08:05:00Z","published":"2024-03-30T08:05:00Z","title":"Look-Around Before You Leap: High-Frequency Injected Transformer for\n Image Restoration","summary":" Transformer-based approaches have achieved superior performance in image\nrestoration, since they can model long-term dependencies well. However, the\nlimitation in capturing local information restricts their capacity to remove\ndegradations. While existing approaches attempt to mitigate this issue by\nincorporating convolutional operations, the core component in Transformer,\ni.e., self-attention, which serves as a low-pass filter, could unintentionally\ndilute or even eliminate the acquired local patterns. In this paper, we propose\nHIT, a simple yet effective High-frequency Injected Transformer for image\nrestoration. Specifically, we design a window-wise injection module (WIM),\nwhich incorporates abundant high-frequency details into the feature map, to\nprovide reliable references for restoring high-quality images. We also develop\na bidirectional interaction module (BIM) to aggregate features at different\nscales using a mutually reinforced paradigm, resulting in spatially and\ncontextually improved representations. In addition, we introduce a spatial\nenhancement unit (SEU) to preserve essential spatial relationships that may be\nlost due to the computations carried out across channel dimensions in the BIM.\nExtensive experiments on 9 tasks (real noise, real rain streak, raindrop,\nmotion blur, moir\\'e, shadow, snow, haze, and low-light condition) demonstrate\nthat HIT with linear computational complexity performs favorably against the\nstate-of-the-art methods. The source code and pre-trained models will be\navailable at https://github.com/joshyZhou/HIT.\n","authors":["Shihao Zhou","Duosheng Chen","Jinshan Pan","Jufeng Yang"],"pdf_url":"https://arxiv.org/pdf/2404.00279v1.pdf","comment":"19 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.00272v1","updated":"2024-03-30T07:27:36Z","published":"2024-03-30T07:27:36Z","title":"HSIMamba: Hyperpsectral Imaging Efficient Feature Learning with\n Bidirectional State Space for Classification","summary":" Classifying hyperspectral images is a difficult task in remote sensing, due\nto their complex high-dimensional data. To address this challenge, we propose\nHSIMamba, a novel framework that uses bidirectional reversed convolutional\nneural network pathways to extract spectral features more efficiently.\nAdditionally, it incorporates a specialized block for spatial analysis. 
Our\napproach combines the operational efficiency of CNNs with the dynamic feature\nextraction capability of attention mechanisms found in Transformers. However,\nit avoids the associated high computational demands. HSIMamba is designed to\nprocess data bidirectionally, significantly enhancing the extraction of\nspectral features and integrating them with spatial information for\ncomprehensive analysis. This approach improves classification accuracy beyond\ncurrent benchmarks and addresses computational inefficiencies encountered with\nadvanced models like Transformers. HSIMamba were tested against three widely\nrecognized datasets Houston 2013, Indian Pines, and Pavia University and\ndemonstrated exceptional performance, surpassing existing state-of-the-art\nmodels in HSI classification. This method highlights the methodological\ninnovation of HSIMamba and its practical implications, which are particularly\nvaluable in contexts where computational resources are limited. HSIMamba\nredefines the standards of efficiency and accuracy in HSI classification,\nthereby enhancing the capabilities of remote sensing applications.\nHyperspectral imaging has become a crucial tool for environmental surveillance,\nagriculture, and other critical areas that require detailed analysis of the\nEarth surface. Please see our code in HSIMamba for more details.\n","authors":["Judy X Yang","Jun Zhou","Jing Wang","Hui Tian","Alan Wee Chung Liew"],"pdf_url":"https://arxiv.org/pdf/2404.00272v1.pdf","comment":"11 pages, 2 figures, 8 tables"},{"id":"http://arxiv.org/abs/2404.00269v1","updated":"2024-03-30T07:17:37Z","published":"2024-03-30T07:17:37Z","title":"IPoD: Implicit Field Learning with Point Diffusion for Generalizable 3D\n Object Reconstruction from Single RGB-D Images","summary":" Generalizable 3D object reconstruction from single-view RGB-D images remains\na challenging task, particularly with real-world data. Current state-of-the-art\nmethods develop Transformer-based implicit field learning, necessitating an\nintensive learning paradigm that requires dense query-supervision uniformly\nsampled throughout the entire space. We propose a novel approach, IPoD, which\nharmonizes implicit field learning with point diffusion. This approach treats\nthe query points for implicit field learning as a noisy point cloud for\niterative denoising, allowing for their dynamic adaptation to the target object\nshape. Such adaptive query points harness diffusion learning's capability for\ncoarse shape recovery and also enhances the implicit representation's ability\nto delineate finer details. Besides, an additional self-conditioning mechanism\nis designed to use implicit predictions as the guidance of diffusion learning,\nleading to a cooperative system. Experiments conducted on the CO3D-v2 dataset\naffirm the superiority of IPoD, achieving 7.8% improvement in F-score and 28.6%\nin Chamfer distance over existing methods. The generalizability of IPoD is also\ndemonstrated on the MVImgNet dataset. 
Our project page is at\nhttps://yushuang-wu.github.io/IPoD.\n","authors":["Yushuang Wu","Luyue Shi","Junhao Cai","Weihao Yuan","Lingteng Qiu","Zilong Dong","Liefeng Bo","Shuguang Cui","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2404.00269v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00262v1","updated":"2024-03-30T06:29:59Z","published":"2024-03-30T06:29:59Z","title":"Image-to-Image Matching via Foundation Models: A New Perspective for\n Open-Vocabulary Semantic Segmentation","summary":" Open-vocabulary semantic segmentation (OVS) aims to segment images of\narbitrary categories specified by class labels or captions. However, most\nprevious best-performing methods, whether pixel grouping methods or region\nrecognition methods, suffer from false matches between image features and\ncategory labels. We attribute this to the natural gap between the textual\nfeatures and visual features. In this work, we rethink how to mitigate false\nmatches from the perspective of image-to-image matching and propose a novel\nrelation-aware intra-modal matching (RIM) framework for OVS based on visual\nfoundation models. RIM achieves robust region classification by firstly\nconstructing diverse image-modal reference features and then matching them with\nregion features based on relation-aware ranking distribution. The proposed RIM\nenjoys several merits. First, the intra-modal reference features are better\naligned, circumventing potential ambiguities that may arise in cross-modal\nmatching. Second, the ranking-based matching process harnesses the structure\ninformation implicit in the inter-class relationships, making it more robust\nthan comparing individually. Extensive experiments on three benchmarks\ndemonstrate that RIM outperforms previous state-of-the-art methods by large\nmargins, obtaining a lead of more than 10% in mIoU on PASCAL VOC benchmark.\n","authors":["Yuan Wang","Rui Sun","Naisong Luo","Yuwen Pan","Tianzhu Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.00262v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2404.00260v1","updated":"2024-03-30T06:18:50Z","published":"2024-03-30T06:18:50Z","title":"Exploiting Self-Supervised Constraints in Image Super-Resolution","summary":" Recent advances in self-supervised learning, predominantly studied in\nhigh-level visual tasks, have been explored in low-level image processing. This\npaper introduces a novel self-supervised constraint for single image\nsuper-resolution, termed SSC-SR. SSC-SR uniquely addresses the divergence in\nimage complexity by employing a dual asymmetric paradigm and a target model\nupdated via exponential moving average to enhance stability. The proposed\nSSC-SR framework works as a plug-and-play paradigm and can be easily applied to\nexisting SR models. Empirical evaluations reveal that our SSC-SR framework\ndelivers substantial enhancements on a variety of benchmark datasets, achieving\nan average increase of 0.1 dB over EDSR and 0.06 dB over SwinIR. In addition,\nextensive ablation studies corroborate the effectiveness of each constituent in\nour SSC-SR framework. 
Codes are available at https://github.com/Aitical/SSCSR.\n","authors":["Gang Wu","Junjun Jiang","Kui Jiang","Xianming Liu"],"pdf_url":"https://arxiv.org/pdf/2404.00260v1.pdf","comment":"ICME 2024"},{"id":"http://arxiv.org/abs/2404.00257v1","updated":"2024-03-30T06:17:39Z","published":"2024-03-30T06:17:39Z","title":"YOLOOC: YOLO-based Open-Class Incremental Object Detection with Novel\n Class Discovery","summary":" Because of its use in practice, open-world object detection (OWOD) has gotten\na lot of attention recently. The challenge is how can a model detect novel\nclasses and then incrementally learn them without forgetting previously known\nclasses. Previous approaches hinge on strongly-supervised or weakly-supervised\nnovel-class data for novel-class detection, which may not apply to real\napplications. We construct a new benchmark that novel classes are only\nencountered at the inference stage. And we propose a new OWOD detector YOLOOC,\nbased on the YOLO architecture yet for the Open-Class setup. We introduce label\nsmoothing to prevent the detector from over-confidently mapping novel classes\nto known classes and to discover novel classes. Extensive experiments conducted\non our more realistic setup demonstrate the effectiveness of our method for\ndiscovering novel classes in our new benchmark.\n","authors":["Qian Wan","Xiang Xiang","Qinhao Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.00257v1.pdf","comment":"Initially submitted to ACCV 2022"},{"id":"http://arxiv.org/abs/2404.00252v1","updated":"2024-03-30T05:42:17Z","published":"2024-03-30T05:42:17Z","title":"Learned Scanpaths Aid Blind Panoramic Video Quality Assessment","summary":" Panoramic videos have the advantage of providing an immersive and interactive\nviewing experience. Nevertheless, their spherical nature gives rise to various\nand uncertain user viewing behaviors, which poses significant challenges for\npanoramic video quality assessment (PVQA). In this work, we propose an\nend-to-end optimized, blind PVQA method with explicit modeling of user viewing\npatterns through visual scanpaths. Our method consists of two modules: a\nscanpath generator and a quality assessor. The scanpath generator is initially\ntrained to predict future scanpaths by minimizing their expected code length\nand then jointly optimized with the quality assessor for quality prediction.\nOur blind PVQA method enables direct quality assessment of panoramic images by\ntreating them as videos composed of identical frames. Experiments on three\npublic panoramic image and video quality datasets, encompassing both synthetic\nand authentic distortions, validate the superiority of our blind PVQA model\nover existing methods.\n","authors":["Kanglong Fan","Wen Wen","Mu Li","Yifan Peng","Kede Ma"],"pdf_url":"https://arxiv.org/pdf/2404.00252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00234v1","updated":"2024-03-30T03:50:43Z","published":"2024-03-30T03:50:43Z","title":"Grid Diffusion Models for Text-to-Video Generation","summary":" Recent advances in the diffusion models have significantly improved\ntext-to-image generation. However, generating videos from text is a more\nchallenging task than generating images from text, due to the much larger\ndataset and higher computational cost required. Most existing video generation\nmethods use either a 3D U-Net architecture that considers the temporal\ndimension or autoregressive generation. 
These methods require large datasets\nand are limited in terms of computational costs compared to text-to-image\ngeneration. To tackle these challenges, we propose a simple but effective novel\ngrid diffusion for text-to-video generation without temporal dimension in\narchitecture and a large text-video paired dataset. We can generate a\nhigh-quality video using a fixed amount of GPU memory regardless of the number\nof frames by representing the video as a grid image. Additionally, since our\nmethod reduces the dimensions of the video to the dimensions of the image,\nvarious image-based methods can be applied to videos, such as text-guided video\nmanipulation from image manipulation. Our proposed method outperforms the\nexisting methods in both quantitative and qualitative evaluations,\ndemonstrating the suitability of our model for real-world video generation.\n","authors":["Taegyeong Lee","Soyeong Kwon","Taehwan Kim"],"pdf_url":"https://arxiv.org/pdf/2404.00234v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00231v1","updated":"2024-03-30T03:23:52Z","published":"2024-03-30T03:23:52Z","title":"Attention-based Shape-Deformation Networks for Artifact-Free Geometry\n Reconstruction of Lumbar Spine from MR Images","summary":" Lumbar disc degeneration, a progressive structural wear and tear of lumbar\nintervertebral disc, is regarded as an essential role on low back pain, a\nsignificant global health concern. Automated lumbar spine geometry\nreconstruction from MR images will enable fast measurement of medical\nparameters to evaluate the lumbar status, in order to determine a suitable\ntreatment. Existing image segmentation-based techniques often generate\nerroneous segments or unstructured point clouds, unsuitable for medical\nparameter measurement. In this work, we present TransDeformer: a novel\nattention-based deep learning approach that reconstructs the contours of the\nlumbar spine with high spatial accuracy and mesh correspondence across\npatients, and we also present a variant of TransDeformer for error estimation.\nSpecially, we devise new attention modules with a new attention formula, which\nintegrates image features and tokenized contour features to predict the\ndisplacements of the points on a shape template without the need for image\nsegmentation. The deformed template reveals the lumbar spine geometry in the\ninput image. We develop a multi-stage training strategy to enhance model\nrobustness with respect to template initialization. Experiment results show\nthat our TransDeformer generates artifact-free geometry outputs, and its\nvariant predicts the error of a reconstructed geometry. Our code is available\nat https://github.com/linchenq/TransDeformer-Mesh.\n","authors":["Linchen Qian","Jiasong Chen","Linhai Ma","Timur Urakov","Weiyong Gu","Liang Liang"],"pdf_url":"https://arxiv.org/pdf/2404.00231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00230v1","updated":"2024-03-30T03:19:50Z","published":"2024-03-30T03:19:50Z","title":"Latent Watermark: Inject and Detect Watermarks in Latent Diffusion Space","summary":" Watermarking is a tool for actively identifying and attributing the images\ngenerated by latent diffusion models. Existing methods face the dilemma of\nwatermark robustness and image quality. The reason for this dilemma is that\nwatermark detection is performed in pixel space, implying an intrinsic link\nbetween image quality and watermark robustness. 
In this paper, we highlight\nthat an effective solution to the problem is to both inject and detect\nwatermarks in latent space, and propose Latent Watermark (LW) with a\nprogressive training strategy. Experiments show that compared to the recently\nproposed methods such as StegaStamp, StableSignature, RoSteALS and TreeRing, LW\nnot only surpasses them in terms of robustness but also offers superior image\nquality. When we inject 64-bit messages, LW can achieve an identification\nperformance close to 100% and an attribution performance above 97% under 9\nsingle-attack scenarios and one all-attack scenario. Our code will be available\non GitHub.\n","authors":["Zheling Meng","Bo Peng","Jing Dong"],"pdf_url":"https://arxiv.org/pdf/2404.00230v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00226v1","updated":"2024-03-30T02:56:54Z","published":"2024-03-30T02:56:54Z","title":"Design as Desired: Utilizing Visual Question Answering for Multimodal\n Pre-training","summary":" Multimodal pre-training demonstrates its potential in the medical domain,\nwhich learns medical visual representations from paired medical reports.\nHowever, many pre-training tasks require extra annotations from clinicians, and\nmost of them fail to explicitly guide the model to learn the desired features\nof different pathologies. To the best of our knowledge, we are the first to\nutilize Visual Question Answering (VQA) for multimodal pre-training to guide\nthe framework focusing on targeted pathological features. In this work, we\nleverage descriptions in medical reports to design multi-granular\nquestion-answer pairs associated with different diseases, which assist the\nframework in pre-training without requiring extra annotations from experts. We\nalso propose a novel pre-training framework with a quasi-textual feature\ntransformer, a module designed to transform visual features into a\nquasi-textual space closer to the textual domain via a contrastive learning\nstrategy. This narrows the vision-language gap and facilitates modality\nalignment. Our framework is applied to four downstream tasks: report\ngeneration, classification, segmentation, and detection across five datasets.\nExtensive experiments demonstrate the superiority of our framework compared to\nother state-of-the-art methods. Our code will be released upon acceptance.\n","authors":["Tongkun Su","Jun Li","Xi Zhang","Haibo Jin","Hao Chen","Qiong Wang","Faqin Lv","Baoliang Zhao","Yin Hu"],"pdf_url":"https://arxiv.org/pdf/2404.00226v1.pdf","comment":null}]},"2024-04-02T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.02157v1","updated":"2024-04-02T17:59:10Z","published":"2024-04-02T17:59:10Z","title":"Segment Any 3D Object with Language","summary":" In this paper, we investigate Open-Vocabulary 3D Instance Segmentation\n(OV-3DIS) with free-form language instructions. Earlier works that rely on only\nannotated base categories for training suffer from limited generalization to\nunseen novel categories. Recent works mitigate poor generalizability to novel\ncategories by generating class-agnostic masks or projecting generalized masks\nfrom 2D to 3D, but disregard semantic or geometry information, leading to\nsub-optimal performance. Instead, generating generalizable but semantic-related\nmasks directly from 3D point clouds would result in superior outcomes. 
In this\npaper, we introduce Segment any 3D Object with LanguagE (SOLE), which is a\nsemantic and geometric-aware visual-language learning framework with strong\ngeneralizability by generating semantic-related masks directly from 3D point\nclouds. Specifically, we propose a multimodal fusion network to incorporate\nmultimodal semantics in both backbone and decoder. In addition, to align the 3D\nsegmentation model with various language instructions and enhance the mask\nquality, we introduce three types of multimodal associations as supervision.\nOur SOLE outperforms previous methods by a large margin on ScanNetv2,\nScanNet200, and Replica benchmarks, and the results are even close to the\nfully-supervised counterpart despite the absence of class annotations in the\ntraining. Furthermore, extensive qualitative results demonstrate the\nversatility of our SOLE to language instructions.\n","authors":["Seungjun Lee","Yuyang Zhao","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2404.02157v1.pdf","comment":"Project Page: https://cvrp-sole.github.io"},{"id":"http://arxiv.org/abs/2404.02155v1","updated":"2024-04-02T17:58:57Z","published":"2024-04-02T17:58:57Z","title":"Alpha Invariance: On Inverse Scaling Between Distance and Volume Density\n in Neural Radiance Fields","summary":" Scale-ambiguity in 3D scene dimensions leads to magnitude-ambiguity of\nvolumetric densities in neural radiance fields, i.e., the densities double when\nscene size is halved, and vice versa. We call this property alpha invariance.\nFor NeRFs to better maintain alpha invariance, we recommend 1) parameterizing\nboth distance and volume densities in log space, and 2) a\ndiscretization-agnostic initialization strategy to guarantee high ray\ntransmittance. We revisit a few popular radiance field models and find that\nthese systems use various heuristics to deal with issues arising from scene\nscaling. We test their behaviors and show our recipe to be more robust.\n","authors":["Joshua Ahn","Haochen Wang","Raymond A. Yeh","Greg Shakhnarovich"],"pdf_url":"https://arxiv.org/pdf/2404.02155v1.pdf","comment":"CVPR 2024. project page https://pals.ttic.edu/p/alpha-invariance"},{"id":"http://arxiv.org/abs/2404.02154v1","updated":"2024-04-02T17:58:49Z","published":"2024-04-02T17:58:49Z","title":"Dynamic Pre-training: Towards Efficient and Scalable All-in-One Image\n Restoration","summary":" All-in-one image restoration tackles different types of degradations with a\nunified model instead of having task-specific, non-generic models for each\ndegradation. The requirement to tackle multiple degradations using the same\nmodel can lead to high-complexity designs with fixed configuration that lack\nthe adaptability to more efficient alternatives. We propose DyNet, a dynamic\nfamily of networks designed in an encoder-decoder style for all-in-one image\nrestoration tasks. Our DyNet can seamlessly switch between its bulkier and\nlightweight variants, thereby offering flexibility for efficient model\ndeployment with a single round of training. This seamless switching is enabled\nby our weights-sharing mechanism, forming the core of our architecture and\nfacilitating the reuse of initialized module weights. Further, to establish\nrobust weights initialization, we introduce a dynamic pre-training strategy\nthat trains variants of the proposed DyNet concurrently, thereby achieving a\n50% reduction in GPU hours. 
To tackle the unavailability of large-scale dataset\nrequired in pre-training, we curate a high-quality, high-resolution image\ndataset named Million-IRD having 2M image samples. We validate our DyNet for\nimage denoising, deraining, and dehazing in all-in-one setting, achieving\nstate-of-the-art results with 31.34% reduction in GFlops and a 56.75% reduction\nin parameters compared to baseline models. The source codes and trained models\nare available at https://github.com/akshaydudhane16/DyNet.\n","authors":["Akshay Dudhane","Omkar Thawakar","Syed Waqas Zamir","Salman Khan","Fahad Shahbaz Khan","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.02154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02152v1","updated":"2024-04-02T17:58:35Z","published":"2024-04-02T17:58:35Z","title":"GeneAvatar: Generic Expression-Aware Volumetric Head Avatar Editing from\n a Single Image","summary":" Recently, we have witnessed the explosive growth of various volumetric\nrepresentations in modeling animatable head avatars. However, due to the\ndiversity of frameworks, there is no practical method to support high-level\napplications like 3D head avatar editing across different representations. In\nthis paper, we propose a generic avatar editing approach that can be\nuniversally applied to various 3DMM driving volumetric head avatars. To achieve\nthis goal, we design a novel expression-aware modification generative model,\nwhich enables lift 2D editing from a single image to a consistent 3D\nmodification field. To ensure the effectiveness of the generative modification\nprocess, we develop several techniques, including an expression-dependent\nmodification distillation scheme to draw knowledge from the large-scale head\navatar model and 2D facial texture editing tools, implicit latent space\nguidance to enhance model convergence, and a segmentation-based loss reweight\nstrategy for fine-grained texture inversion. Extensive experiments demonstrate\nthat our method delivers high-quality and consistent results across multiple\nexpression and viewpoints. Project page: https://zju3dv.github.io/geneavatar/\n","authors":["Chong Bao","Yinda Zhang","Yuan Li","Xiyu Zhang","Bangbang Yang","Hujun Bao","Marc Pollefeys","Guofeng Zhang","Zhaopeng Cui"],"pdf_url":"https://arxiv.org/pdf/2404.02152v1.pdf","comment":"Accepted to CVPR 2024. Project page:\n https://zju3dv.github.io/geneavatar/"},{"id":"http://arxiv.org/abs/2404.02148v1","updated":"2024-04-02T17:58:03Z","published":"2024-04-02T17:58:03Z","title":"Diffusion$^2$: Dynamic 3D Content Generation via Score Composition of\n Orthogonal Diffusion Models","summary":" Recent advancements in 3D generation are predominantly propelled by\nimprovements in 3D-aware image diffusion models which are pretrained on\nInternet-scale image data and fine-tuned on massive 3D data, offering the\ncapability of producing highly consistent multi-view images. However, due to\nthe scarcity of synchronized multi-view video data, it is impractical to adapt\nthis paradigm to 4D generation directly. Despite that, the available video and\n3D data are adequate for training video and multi-view diffusion models that\ncan provide satisfactory dynamic and geometric priors respectively. 
In this\npaper, we present Diffusion$^2$, a novel framework for dynamic 3D content\ncreation that leverages the knowledge about geometric consistency and temporal\nsmoothness from these models to directly sample dense multi-view and\nmulti-frame images which can be employed to optimize continuous 4D\nrepresentation. Specifically, we design a simple yet effective denoising\nstrategy via score composition of video and multi-view diffusion models based\non the probability structure of the images to be generated. Owing to the high\nparallelism of the image generation and the efficiency of the modern 4D\nreconstruction pipeline, our framework can generate 4D content within a few\nminutes. Furthermore, our method circumvents the reliance on 4D data, thereby\nhaving the potential to benefit from the scalability of the foundation video\nand multi-view diffusion models. Extensive experiments demonstrate the efficacy\nof our proposed framework and its capability to flexibly adapt to various types\nof prompts.\n","authors":["Zeyu Yang","Zijie Pan","Chun Gu","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.02148v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2404.02145v1","updated":"2024-04-02T17:57:31Z","published":"2024-04-02T17:57:31Z","title":"Iterated Learning Improves Compositionality in Large Vision-Language\n Models","summary":" A fundamental characteristic common to both human vision and natural language\nis their compositional nature. Yet, despite the performance gains contributed\nby large vision and language pretraining, recent investigations find that\nmost-if not all-our state-of-the-art vision-language models struggle at\ncompositionality. They are unable to distinguish between images of \" a girl in\nwhite facing a man in black\" and \"a girl in black facing a man in white\".\nMoreover, prior work suggests that compositionality doesn't arise with scale:\nlarger model sizes or training data don't help. This paper develops a new\niterated training algorithm that incentivizes compositionality. We draw on\ndecades of cognitive science research that identifies cultural transmission-the\nneed to teach a new generation-as a necessary inductive prior that incentivizes\nhumans to develop compositional languages. Specifically, we reframe\nvision-language contrastive learning as the Lewis Signaling Game between a\nvision agent and a language agent, and operationalize cultural transmission by\niteratively resetting one of the agent's weights during training. After every\niteration, this training paradigm induces representations that become \"easier\nto learn\", a property of compositional languages: e.g. our model trained on\nCC3M and CC12M improves standard CLIP by 4.7%, 4.0% respectively in the\nSugarCrepe benchmark.\n","authors":["Chenhao Zheng","Jieyu Zhang","Aniruddha Kembhavi","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.02145v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02135v1","updated":"2024-04-02T17:48:46Z","published":"2024-04-02T17:48:46Z","title":"ResNet with Integrated Convolutional Block Attention Module for Ship\n Classification Using Transfer Learning on Optical Satellite Imagery","summary":" This study proposes a novel transfer learning framework for effective ship\nclassification using high-resolution optical remote sensing satellite imagery.\nThe framework is based on the deep convolutional neural network model ResNet50\nand incorporates the Convolutional Block Attention Module (CBAM) to enhance\nperformance. 
CBAM enables the model to attend to salient features in the\nimages, allowing it to better discriminate between subtle differences between\nships and backgrounds. Furthermore, this study adopts a transfer learning\napproach tailored for accurately classifying diverse types of ships by\nfine-tuning a pre-trained model for the specific task. Experimental results\ndemonstrate the efficacy of the proposed framework in ship classification using\noptical remote sensing imagery, achieving a high classification accuracy of 94%\nacross 5 classes, outperforming existing methods. This research holds potential\napplications in maritime surveillance and management, illegal fishing\ndetection, and maritime traffic monitoring.\n","authors":["Ryan Donghan Kwon","Gangjoo Robin Nam","Jisoo Tak","Yeom Hyeok","Junseob Shin","Hyerin Cha","Kim Soo Bin"],"pdf_url":"https://arxiv.org/pdf/2404.02135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02132v1","updated":"2024-04-02T17:40:29Z","published":"2024-04-02T17:40:29Z","title":"ViTamin: Designing Scalable Vision Models in the Vision-Language Era","summary":" Recent breakthroughs in vision-language models (VLMs) start a new page in the\nvision community. The VLMs provide stronger and more generalizable feature\nembeddings compared to those from ImageNet-pretrained models, thanks to the\ntraining on the large-scale Internet image-text pairs. However, despite the\namazing achievement from the VLMs, vanilla Vision Transformers (ViTs) remain\nthe default choice for the image encoder. Although pure transformer proves its\neffectiveness in the text encoding area, it remains questionable whether it is\nalso the case for image encoding, especially considering that various types of\nnetworks are proposed on the ImageNet benchmark, which, unfortunately, are\nrarely studied in VLMs. Due to small data/model scale, the original conclusions\nof model design on ImageNet can be limited and biased. In this paper, we aim at\nbuilding an evaluation protocol of vision models in the vision-language era\nunder the contrastive language-image pretraining (CLIP) framework. We provide a\ncomprehensive way to benchmark different vision models, covering their\nzero-shot performance and scalability in both model and training data sizes. To\nthis end, we introduce ViTamin, a new vision models tailored for VLMs.\nViTamin-L significantly outperforms ViT-L by 2.0% ImageNet zero-shot accuracy,\nwhen using the same publicly available DataComp-1B dataset and the same\nOpenCLIP training scheme. ViTamin-L presents promising results on 60 diverse\nbenchmarks, including classification, retrieval, open-vocabulary detection and\nsegmentation, and large multi-modal models. When further scaling up the model\nsize, our ViTamin-XL with only 436M parameters attains 82.9% ImageNet zero-shot\naccuracy, surpassing 82.0% achieved by EVA-E that has ten times more parameters\n(4.4B).\n","authors":["Jienneg Chen","Qihang Yu","Xiaohui Shen","Alan Yuille","Liang-Chieh Chen"],"pdf_url":"https://arxiv.org/pdf/2404.02132v1.pdf","comment":"CVPR 2024; https://github.com/Beckschen/ViTamin"},{"id":"http://arxiv.org/abs/2308.12469v3","updated":"2024-04-02T17:40:03Z","published":"2023-08-23T23:44:44Z","title":"Diffuse, Attend, and Segment: Unsupervised Zero-Shot Segmentation using\n Stable Diffusion","summary":" Producing quality segmentation masks for images is a fundamental problem in\ncomputer vision. 
Recent research has explored large-scale supervised training\nto enable zero-shot segmentation on virtually any image style and unsupervised\ntraining to enable segmentation without dense annotations. However,\nconstructing a model capable of segmenting anything in a zero-shot manner\nwithout any annotations is still challenging. In this paper, we propose to\nutilize the self-attention layers in stable diffusion models to achieve this\ngoal because the pre-trained stable diffusion model has learned inherent\nconcepts of objects within its attention layers. Specifically, we introduce a\nsimple yet effective iterative merging process based on measuring KL divergence\namong attention maps to merge them into valid segmentation masks. The proposed\nmethod does not require any training or language dependency to extract quality\nsegmentation for any images. On COCO-Stuff-27, our method surpasses the prior\nunsupervised zero-shot SOTA method by an absolute 26% in pixel accuracy and 17%\nin mean IoU. The project page is at\n\\url{https://sites.google.com/view/diffseg/home}.\n","authors":["Junjiao Tian","Lavisha Aggarwal","Andrea Colaco","Zsolt Kira","Mar Gonzalez-Franco"],"pdf_url":"https://arxiv.org/pdf/2308.12469v3.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2310.05861v2","updated":"2024-04-02T17:37:42Z","published":"2023-10-09T16:57:57Z","title":"Rephrase, Augment, Reason: Visual Grounding of Questions for\n Vision-Language Models","summary":" An increasing number of vision-language tasks can be handled with little to\nno training, i.e., in a zero and few-shot manner, by marrying large language\nmodels (LLMs) to vision encoders, resulting in large vision-language models\n(LVLMs). While this has huge upsides, such as not requiring training data or\ncustom architectures, how an input is presented to an LVLM can have a major\nimpact on zero-shot model performance. In particular, inputs phrased in an\nunderspecified way can result in incorrect answers due to factors like missing\nvisual information, complex implicit reasoning, or linguistic ambiguity.\nTherefore, adding visually-grounded information to the input as a preemptive\nclarification should improve model performance by reducing underspecification,\ne.g., by localizing objects and disambiguating references. Similarly, in the\nVQA setting, changing the way questions are framed can make them easier for\nmodels to answer. To this end, we present Rephrase, Augment and Reason\n(RepARe), a gradient-free framework that extracts salient details about the\nimage using the underlying LVLM as a captioner and reasoner, in order to\npropose modifications to the original question. We then use the LVLM's\nconfidence over a generated answer as an unsupervised scoring function to\nselect the rephrased question most likely to improve zero-shot performance.\nFocusing on three visual question answering tasks, we show that RepARe can\nresult in a 3.85% (absolute) increase in zero-shot accuracy on VQAv2, 6.41%,\nand 7.94% points increase on A-OKVQA, and VizWiz respectively. Additionally, we\nfind that using gold answers for oracle question candidate selection achieves a\nsubstantial gain in VQA accuracy by up to 14.41%. 
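The "Diffuse, Attend, and Segment" entry above merges stable-diffusion self-attention maps by measuring KL divergence between them. Below is a minimal greedy sketch of that idea on toy attention maps; the symmetric-KL choice, the threshold `tau`, and the pairwise merge order are assumptions, not the paper's exact procedure.

```python
import numpy as np

def sym_kl(p, q, eps=1e-8):
    """Symmetric KL divergence between two discrete distributions."""
    p, q = p + eps, q + eps
    return float(np.sum(p * np.log(p / q)) + np.sum(q * np.log(q / p)))

def merge_attention_maps(maps, tau=0.5):
    """Greedily merge attention maps (each normalized to sum to 1) whose
    divergence falls below tau; survivors act as candidate segmentation masks."""
    maps = [m / m.sum() for m in maps]
    while len(maps) > 1:
        pairs = [(sym_kl(maps[i], maps[j]), i, j)
                 for i in range(len(maps)) for j in range(i + 1, len(maps))]
        d, i, j = min(pairs)                      # closest pair of maps
        if d >= tau:                              # nothing similar enough left to merge
            break
        merged = (maps[i] + maps[j]) / 2.0
        maps = [m for k, m in enumerate(maps) if k not in (i, j)] + [merged]
    return maps

rng = np.random.default_rng(0)
proposals = merge_attention_maps([rng.random(16) for _ in range(6)])
print(len(proposals), "candidate masks remain")
```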
Through extensive analysis,\nwe demonstrate that outputs from RepARe increase syntactic complexity, and\neffectively utilize vision-language interaction and the frozen LLM.\n","authors":["Archiki Prasad","Elias Stengel-Eskin","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2310.05861v2.pdf","comment":"ICLR 2024 camera-ready (23 pages), Code:\n https://github.com/archiki/RepARe"},{"id":"http://arxiv.org/abs/2404.02125v1","updated":"2024-04-02T17:32:12Z","published":"2024-04-02T17:32:12Z","title":"3D Congealing: 3D-Aware Image Alignment in the Wild","summary":" We propose 3D Congealing, a novel problem of 3D-aware alignment for 2D images\ncapturing semantically similar objects. Given a collection of unlabeled\nInternet images, our goal is to associate the shared semantic parts from the\ninputs and aggregate the knowledge from 2D images to a shared 3D canonical\nspace. We introduce a general framework that tackles the task without assuming\nshape templates, poses, or any camera parameters. At its core is a canonical 3D\nrepresentation that encapsulates geometric and semantic information. The\nframework optimizes for the canonical representation together with the pose for\neach input image, and a per-image coordinate map that warps 2D pixel\ncoordinates to the 3D canonical frame to account for the shape matching. The\noptimization procedure fuses prior knowledge from a pre-trained image\ngenerative model and semantic information from input images. The former\nprovides strong knowledge guidance for this under-constraint task, while the\nlatter provides the necessary information to mitigate the training data bias\nfrom the pre-trained model. Our framework can be used for various tasks such as\ncorrespondence matching, pose estimation, and image editing, achieving strong\nresults on real-world image datasets under challenging illumination conditions\nand on in-the-wild online image collections.\n","authors":["Yunzhi Zhang","Zizhang Li","Amit Raj","Andreas Engelhardt","Yuanzhen Li","Tingbo Hou","Jiajun Wu","Varun Jampani"],"pdf_url":"https://arxiv.org/pdf/2404.02125v1.pdf","comment":"Project page:\n https://ai.stanford.edu/~yzzhang/projects/3d-congealing/"},{"id":"http://arxiv.org/abs/2404.02117v1","updated":"2024-04-02T17:23:22Z","published":"2024-04-02T17:23:22Z","title":"Pre-trained Vision and Language Transformers Are Few-Shot Incremental\n Learners","summary":" Few-Shot Class Incremental Learning (FSCIL) is a task that requires a model\nto learn new classes incrementally without forgetting when only a few samples\nfor each class are given. FSCIL encounters two significant challenges:\ncatastrophic forgetting and overfitting, and these challenges have driven prior\nstudies to primarily rely on shallow models, such as ResNet-18. Even though\ntheir limited capacity can mitigate both forgetting and overfitting issues, it\nleads to inadequate knowledge transfer during few-shot incremental sessions. In\nthis paper, we argue that large models such as vision and language transformers\npre-trained on large datasets can be excellent few-shot incremental learners.\nTo this end, we propose a novel FSCIL framework called PriViLege, Pre-trained\nVision and Language transformers with prompting functions and knowledge\ndistillation. Our framework effectively addresses the challenges of\ncatastrophic forgetting and overfitting in large models through new pre-trained\nknowledge tuning (PKT) and two losses: entropy-based divergence loss and\nsemantic knowledge distillation loss. 
Experimental results show that the\nproposed PriViLege significantly outperforms the existing state-of-the-art\nmethods with a large margin, e.g., +9.38% in CUB200, +20.58% in CIFAR-100, and\n+13.36% in miniImageNet. Our implementation code is available at\nhttps://github.com/KHU-AGI/PriViLege.\n","authors":["Keon-Hee Park","Kyungwoo Song","Gyeong-Moon Park"],"pdf_url":"https://arxiv.org/pdf/2404.02117v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2312.12337v3","updated":"2024-04-02T17:23:16Z","published":"2023-12-19T17:03:50Z","title":"pixelSplat: 3D Gaussian Splats from Image Pairs for Scalable\n Generalizable 3D Reconstruction","summary":" We introduce pixelSplat, a feed-forward model that learns to reconstruct 3D\nradiance fields parameterized by 3D Gaussian primitives from pairs of images.\nOur model features real-time and memory-efficient rendering for scalable\ntraining as well as fast 3D reconstruction at inference time. To overcome local\nminima inherent to sparse and locally supported representations, we predict a\ndense probability distribution over 3D and sample Gaussian means from that\nprobability distribution. We make this sampling operation differentiable via a\nreparameterization trick, allowing us to back-propagate gradients through the\nGaussian splatting representation. We benchmark our method on wide-baseline\nnovel view synthesis on the real-world RealEstate10k and ACID datasets, where\nwe outperform state-of-the-art light field transformers and accelerate\nrendering by 2.5 orders of magnitude while reconstructing an interpretable and\neditable 3D radiance field.\n","authors":["David Charatan","Sizhe Li","Andrea Tagliasacchi","Vincent Sitzmann"],"pdf_url":"https://arxiv.org/pdf/2312.12337v3.pdf","comment":"Project page: https://dcharatan.github.io/pixelsplat"},{"id":"http://arxiv.org/abs/2404.02112v1","updated":"2024-04-02T17:13:04Z","published":"2024-04-02T17:13:04Z","title":"ImageNot: A contrast with ImageNet preserves model rankings","summary":" We introduce ImageNot, a dataset designed to match the scale of ImageNet\nwhile differing drastically in other aspects. We show that key model\narchitectures developed for ImageNet over the years rank identically when\ntrained and evaluated on ImageNot to how they rank on ImageNet. This is true\nwhen training models from scratch or fine-tuning them. Moreover, the relative\nimprovements of each model over earlier models strongly correlate in both\ndatasets. We further give evidence that ImageNot has a similar utility as\nImageNet for transfer learning purposes. Our work demonstrates a surprising\ndegree of external validity in the relative performance of image classification\nmodels. This stands in contrast with absolute accuracy numbers that typically\ndrop sharply even under small changes to a dataset.\n","authors":["Olawale Salaudeen","Moritz Hardt"],"pdf_url":"https://arxiv.org/pdf/2404.02112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03849v3","updated":"2024-04-02T17:11:45Z","published":"2024-03-06T16:49:33Z","title":"MedMamba: Vision Mamba for Medical Image Classification","summary":" Medical image classification is a very fundamental and crucial task in the\nfield of computer vision. These years, CNN-based and Transformer-based models\nhave been widely used to classify various medical images. 
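The pixelSplat entry above makes sampling of Gaussian means from a predicted probability distribution differentiable via a reparameterization trick. The sketch below uses a Gumbel-softmax relaxation over depth bins as one standard way to achieve this; it is an assumed stand-in, and the paper's actual reparameterization may differ.

```python
import torch
import torch.nn.functional as F

def sample_depth_differentiably(logits, bin_centers, tau=0.5):
    """Draw a depth value from a per-pixel categorical distribution over depth
    bins while keeping gradients, via the Gumbel-softmax relaxation (an assumed
    stand-in for the reparameterization trick mentioned in the abstract)."""
    gumbel = -torch.log(-torch.log(torch.rand_like(logits).clamp_min(1e-9)))
    weights = F.softmax((logits + gumbel) / tau, dim=-1)    # soft one-hot sample
    return (weights * bin_centers).sum(dim=-1)              # differentiable depth

logits = torch.randn(4, 4, 32, requires_grad=True)          # toy 4x4 image, 32 depth bins
depth = sample_depth_differentiably(logits, torch.linspace(0.5, 10.0, 32))
depth.sum().backward()                                       # gradients reach the logits
print(depth.shape, logits.grad is not None)                  # torch.Size([4, 4]) True
```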
Unfortunately, The\nlimitation of CNNs in long-range modeling capabilities prevents them from\neffectively extracting features in medical images, while Transformers are\nhampered by their quadratic computational complexity. Recent research has shown\nthat the state space model (SSM) represented by Mamba can efficiently model\nlong-range interactions while maintaining linear computational complexity.\nInspired by this, we propose Vision Mamba for medical image classification\n(MedMamba). More specifically, we introduce a novel Conv-SSM module. Conv-SSM\ncombines the local feature extraction ability of convolutional layers with the\nability of SSM to capture long-range dependency, thereby modeling medical\nimages with different modalities. To demonstrate the potential of MedMamba, we\nconducted extensive experiments using 14 publicly available medical datasets\nwith different imaging techniques and two private datasets built by ourselves.\nExtensive experimental results demonstrate that the proposed MedMamba performs\nwell in detecting lesions in various medical images. To the best of our\nknowledge, this is the first Vision Mamba tailored for medical image\nclassification. The purpose of this work is to establish a new baseline for\nmedical image classification tasks and provide valuable insights for the future\ndevelopment of more efficient and effective SSM-based artificial intelligence\nalgorithms and application systems in the medical. Source code has been\navailable at https://github.com/YubiaoYue/MedMamba.\n","authors":["Yubiao Yue","Zhenzhang Li"],"pdf_url":"https://arxiv.org/pdf/2403.03849v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00095v2","updated":"2024-04-02T17:08:35Z","published":"2024-03-29T18:05:26Z","title":"GDA: Generalized Diffusion for Robust Test-time Adaptation","summary":" Machine learning models struggle with generalization when encountering\nout-of-distribution (OOD) samples with unexpected distribution shifts. For\nvision tasks, recent studies have shown that test-time adaptation employing\ndiffusion models can achieve state-of-the-art accuracy improvements on OOD\nsamples by generating new samples that align with the model's domain without\nthe need to modify the model's weights. Unfortunately, those studies have\nprimarily focused on pixel-level corruptions, thereby lacking the\ngeneralization to adapt to a broader range of OOD types. We introduce\nGeneralized Diffusion Adaptation (GDA), a novel diffusion-based test-time\nadaptation method robust against diverse OOD types. Specifically, GDA\niteratively guides the diffusion by applying a marginal entropy loss derived\nfrom the model, in conjunction with style and content preservation losses\nduring the reverse sampling process. In other words, GDA considers the model's\noutput behavior with the semantic information of the samples as a whole, which\ncan reduce ambiguity in downstream tasks during the generation process.\nEvaluation across various popular model architectures and OOD benchmarks shows\nthat GDA consistently outperforms prior work on diffusion-driven adaptation.\nNotably, it achieves the highest classification accuracy improvements, ranging\nfrom 4.4\\% to 5.02\\% on ImageNet-C and 2.5\\% to 7.4\\% on Rendition, Sketch, and\nStylized benchmarks. This performance highlights GDA's generalization to a\nbroader range of OOD benchmarks.\n","authors":["Yun-Yun Tsai","Fu-Chen Chen","Albert Y. C. 
Chen","Junfeng Yang","Che-Chun Su","Min Sun","Cheng-Hao Kuo"],"pdf_url":"https://arxiv.org/pdf/2404.00095v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02106v1","updated":"2024-04-02T17:04:45Z","published":"2024-04-02T17:04:45Z","title":"Neural Ordinary Differential Equation based Sequential Image\n Registration for Dynamic Characterization","summary":" Deformable image registration (DIR) is crucial in medical image analysis,\nenabling the exploration of biological dynamics such as organ motions and\nlongitudinal changes in imaging. Leveraging Neural Ordinary Differential\nEquations (ODE) for registration, this extension work discusses how this\nframework can aid in the characterization of sequential biological processes.\nUtilizing the Neural ODE's ability to model state derivatives with neural\nnetworks, our Neural Ordinary Differential Equation Optimization-based (NODEO)\nframework considers voxels as particles within a dynamic system, defining\ndeformation fields through the integration of neural differential equations.\nThis method learns dynamics directly from data, bypassing the need for physical\npriors, making it exceptionally suitable for medical scenarios where such\npriors are unavailable or inapplicable. Consequently, the framework can discern\nunderlying dynamics and use sequence data to regularize the transformation\ntrajectory. We evaluated our framework on two clinical datasets: one for\ncardiac motion tracking and another for longitudinal brain MRI analysis.\nDemonstrating its efficacy in both 2D and 3D imaging scenarios, our framework\noffers flexibility and model agnosticism, capable of managing image sequences\nand facilitating label propagation throughout these sequences. This study\nprovides a comprehensive understanding of how the Neural ODE-based framework\nuniquely benefits the image registration challenge.\n","authors":["Yifan Wu","Mengjin Dong","Rohit Jena","Chen Qin","James C. Gee"],"pdf_url":"https://arxiv.org/pdf/2404.02106v1.pdf","comment":"Journal extension of NODEO: A Neural Ordinary Differential Equation\n Based Optimization Framework for Deformable Image Registration, CVPR 2022"},{"id":"http://arxiv.org/abs/2403.18360v2","updated":"2024-04-02T17:02:32Z","published":"2024-03-27T08:52:44Z","title":"Learning CNN on ViT: A Hybrid Model to Explicitly Class-specific\n Boundaries for Domain Adaptation","summary":" Most domain adaptation (DA) methods are based on either a convolutional\nneural networks (CNNs) or a vision transformers (ViTs). They align the\ndistribution differences between domains as encoders without considering their\nunique characteristics. For instance, ViT excels in accuracy due to its\nsuperior ability to capture global representations, while CNN has an advantage\nin capturing local representations. This fact has led us to design a hybrid\nmethod to fully take advantage of both ViT and CNN, called Explicitly\nClass-specific Boundaries (ECB). ECB learns CNN on ViT to combine their\ndistinct strengths. In particular, we leverage ViT's properties to explicitly\nfind class-specific decision boundaries by maximizing the discrepancy between\nthe outputs of the two classifiers to detect target samples far from the source\nsupport. In contrast, the CNN encoder clusters target features based on the\npreviously defined class-specific boundaries by minimizing the discrepancy\nbetween the probabilities of the two classifiers. 
Finally, ViT and CNN mutually\nexchange knowledge to improve the quality of pseudo labels and reduce the\nknowledge discrepancies of these models. Compared to conventional DA methods,\nour ECB achieves superior performance, which verifies its effectiveness in this\nhybrid model. The project website can be found\nhttps://dotrannhattuong.github.io/ECB/website/.\n","authors":["Ba Hung Ngo","Nhat-Tuong Do-Tran","Tuan-Ngoc Nguyen","Hae-Gon Jeon","Tae Jong Choi"],"pdf_url":"https://arxiv.org/pdf/2403.18360v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02101v1","updated":"2024-04-02T16:52:41Z","published":"2024-04-02T16:52:41Z","title":"CameraCtrl: Enabling Camera Control for Text-to-Video Generation","summary":" Controllability plays a crucial role in video generation since it allows\nusers to create desired content. However, existing models largely overlooked\nthe precise control of camera pose that serves as a cinematic language to\nexpress deeper narrative nuances. To alleviate this issue, we introduce\nCameraCtrl, enabling accurate camera pose control for text-to-video(T2V)\nmodels. After precisely parameterizing the camera trajectory, a plug-and-play\ncamera module is then trained on a T2V model, leaving others untouched.\nAdditionally, a comprehensive study on the effect of various datasets is also\nconducted, suggesting that videos with diverse camera distribution and similar\nappearances indeed enhance controllability and generalization. Experimental\nresults demonstrate the effectiveness of CameraCtrl in achieving precise and\ndomain-adaptive camera control, marking a step forward in the pursuit of\ndynamic and customized video storytelling from textual and camera pose inputs.\nOur project website is at: https://hehao13.github.io/projects-CameraCtrl/.\n","authors":["Hao He","Yinghao Xu","Yuwei Guo","Gordon Wetzstein","Bo Dai","Hongsheng Li","Ceyuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.02101v1.pdf","comment":"Project page: https://hehao13.github.io/projects-CameraCtrl/ Code:\n https://github.com/hehao13/CameraCtrl"},{"id":"http://arxiv.org/abs/2404.02098v1","updated":"2024-04-02T16:48:20Z","published":"2024-04-02T16:48:20Z","title":"BRAVEn: Improving Self-Supervised Pre-training for Visual and Auditory\n Speech Recognition","summary":" Self-supervision has recently shown great promise for learning visual and\nauditory speech representations from unlabelled data. In this work, we propose\nBRAVEn, an extension to the recent RAVEn method, which learns speech\nrepresentations entirely from raw audio-visual data. Our modifications to RAVEn\nenable BRAVEn to achieve state-of-the-art results among self-supervised methods\nin various settings. Moreover, we observe favourable scaling behaviour by\nincreasing the amount of unlabelled data well beyond other self-supervised\nworks. In particular, we achieve 20.0% / 1.7% word error rate for VSR / ASR on\nthe LRS3 test set, with only 30 hours of labelled data and no external ASR\nmodels. Our results suggest that readily available unlabelled audio-visual data\ncan largely replace costly transcribed data.\n","authors":["Alexandros Haliassos","Andreas Zinonos","Rodrigo Mira","Stavros Petridis","Maja Pantic"],"pdf_url":"https://arxiv.org/pdf/2404.02098v1.pdf","comment":"ICASSP 2024. 
Code: https://github.com/ahaliassos/raven"},{"id":"http://arxiv.org/abs/2404.00511v2","updated":"2024-04-02T16:46:24Z","published":"2024-03-31T01:16:02Z","title":"MIPS at SemEval-2024 Task 3: Multimodal Emotion-Cause Pair Extraction in\n Conversations with Multimodal Language Models","summary":" This paper presents our winning submission to Subtask 2 of SemEval 2024 Task\n3 on multimodal emotion cause analysis in conversations. We propose a novel\nMultimodal Emotion Recognition and Multimodal Emotion Cause Extraction\n(MER-MCE) framework that integrates text, audio, and visual modalities using\nspecialized emotion encoders. Our approach sets itself apart from\ntop-performing teams by leveraging modality-specific features for enhanced\nemotion understanding and causality inference. Experimental evaluation\ndemonstrates the advantages of our multimodal approach, with our submission\nachieving a competitive weighted F1 score of 0.3435, ranking third with a\nmargin of only 0.0339 behind the 1st team and 0.0025 behind the 2nd team.\nProject: https://github.com/MIPS-COLT/MER-MCE.git\n","authors":["Zebang Cheng","Fuqiang Niu","Yuxiang Lin","Zhi-Qi Cheng","Bowen Zhang","Xiaojiang Peng"],"pdf_url":"https://arxiv.org/pdf/2404.00511v2.pdf","comment":"Ranked 3rd in SemEval '24 Task 3 with F1 of 0.3435, close to 1st &\n 2nd by 0.0339 & 0.0025"},{"id":"http://arxiv.org/abs/2401.08629v2","updated":"2024-04-02T16:35:46Z","published":"2023-12-08T12:10:03Z","title":"Immature Green Apple Detection and Sizing in Commercial Orchards using\n YOLOv8 and Shape Fitting Techniques","summary":" Detecting and estimating size of apples during the early stages of growth is\ncrucial for predicting yield, pest management, and making informed decisions\nrelated to crop-load management, harvest and post-harvest logistics, and\nmarketing. Traditional fruit size measurement methods are laborious and\ntimeconsuming. This study employs the state-of-the-art YOLOv8 object detection\nand instance segmentation algorithm in conjunction with geometric shape fitting\ntechniques on 3D point cloud data to accurately determine the size of immature\ngreen apples (or fruitlet) in a commercial orchard environment. The methodology\nutilized two RGB-D sensors: Intel RealSense D435i and Microsoft Azure Kinect\nDK. Notably, the YOLOv8 instance segmentation models exhibited proficiency in\nimmature green apple detection, with the YOLOv8m-seg model achieving the\nhighest AP@0.5 and AP@0.75 scores of 0.94 and 0.91, respectively. Using the\nellipsoid fitting technique on images from the Azure Kinect, we achieved an\nRMSE of 2.35 mm, MAE of 1.66 mm, MAPE of 6.15 mm, and an R-squared value of 0.9\nin estimating the size of apple fruitlets. Challenges such as partial occlusion\ncaused some error in accurately delineating and sizing green apples using the\nYOLOv8-based segmentation technique, particularly in fruit clusters. In a\ncomparison with 102 outdoor samples, the size estimation technique performed\nbetter on the images acquired with Microsoft Azure Kinect than the same with\nIntel Realsense D435i. This superiority is evident from the metrics: the RMSE\nvalues (2.35 mm for Azure Kinect vs. 9.65 mm for Realsense D435i), MAE values\n(1.66 mm for Azure Kinect vs. 7.8 mm for Realsense D435i), and the R-squared\nvalues (0.9 for Azure Kinect vs. 
0.77 for Realsense D435i).\n","authors":["Ranjan Sapkota","Dawood Ahmed","Martin Churuvija","Manoj Karkee"],"pdf_url":"https://arxiv.org/pdf/2401.08629v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02084v1","updated":"2024-04-02T16:30:12Z","published":"2024-04-02T16:30:12Z","title":"Adaptive Feature Fusion Neural Network for Glaucoma Segmentation on\n Unseen Fundus Images","summary":" Fundus image segmentation on unseen domains is challenging, especially for\nthe over-parameterized deep models trained on the small medical datasets. To\naddress this challenge, we propose a method named Adaptive Feature-fusion\nNeural Network (AFNN) for glaucoma segmentation on unseen domains, which mainly\nconsists of three modules: domain adaptor, feature-fusion network, and\nself-supervised multi-task learning. Specifically, the domain adaptor helps the\npretrained-model fast adapt from other image domains to the medical fundus\nimage domain. Feature-fusion network and self-supervised multi-task learning\nfor the encoder and decoder are introduced to improve the domain generalization\nability. In addition, we also design the weighted-dice-loss to improve model\nperformance on complex optic-cup segmentation tasks. Our proposed method\nachieves a competitive performance over existing fundus segmentation methods on\nfour public glaucoma datasets.\n","authors":["Jiyuan Zhong","Hu Ke","Ming Yan"],"pdf_url":"https://arxiv.org/pdf/2404.02084v1.pdf","comment":"17 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.02082v1","updated":"2024-04-02T16:28:41Z","published":"2024-04-02T16:28:41Z","title":"WcDT: World-centric Diffusion Transformer for Traffic Scene Generation","summary":" In this paper, we introduce a novel approach for autonomous driving\ntrajectory generation by harnessing the complementary strengths of diffusion\nprobabilistic models (a.k.a., diffusion models) and transformers. Our proposed\nframework, termed the \"World-Centric Diffusion Transformer\" (WcDT), optimizes\nthe entire trajectory generation process, from feature extraction to model\ninference. To enhance the scene diversity and stochasticity, the historical\ntrajectory data is first preprocessed and encoded into latent space using\nDenoising Diffusion Probabilistic Models (DDPM) enhanced with Diffusion with\nTransformer (DiT) blocks. Then, the latent features, historical trajectories,\nHD map features, and historical traffic signal information are fused with\nvarious transformer-based encoders. The encoded traffic scenes are then decoded\nby a trajectory decoder to generate multimodal future trajectories.\nComprehensive experimental results show that the proposed approach exhibits\nsuperior performance in generating both realistic and diverse trajectories,\nshowing its potential for integration into automatic driving simulation\nsystems.\n","authors":["Chen Yang","Aaron Xuxiang Tian","Dong Chen","Tianyu Shi","Arsalan Heydarian"],"pdf_url":"https://arxiv.org/pdf/2404.02082v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.02072v1","updated":"2024-04-02T16:20:02Z","published":"2024-04-02T16:20:02Z","title":"EGTR: Extracting Graph from Transformer for Scene Graph Generation","summary":" Scene Graph Generation (SGG) is a challenging task of detecting objects and\npredicting relationships between objects. 
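The AFNN entry above mentions a weighted-dice-loss designed for the hard optic-cup class. Since the abstract does not give the formula, the sketch below is a generic class-weighted soft Dice loss of the kind it describes; the weighting scheme and tensor layout are assumptions, not the authors' definition.

```python
import torch
import torch.nn.functional as F

def weighted_dice_loss(pred, target, class_weights, eps=1e-6):
    """Class-weighted soft Dice loss.

    pred:   (N, C, H, W) class probabilities, target: (N, C, H, W) one-hot masks,
    class_weights: (C,) larger weights emphasize harder classes (e.g. optic cup).
    """
    dims = (0, 2, 3)
    intersection = (pred * target).sum(dims)
    union = pred.sum(dims) + target.sum(dims)
    dice = (2 * intersection + eps) / (union + eps)      # per-class soft Dice
    w = class_weights / class_weights.sum()
    return 1.0 - (w * dice).sum()

pred = torch.softmax(torch.randn(2, 3, 64, 64), dim=1)
target = F.one_hot(torch.randint(0, 3, (2, 64, 64)), 3).permute(0, 3, 1, 2).float()
print(float(weighted_dice_loss(pred, target, torch.tensor([1.0, 2.0, 4.0]))))
```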
After DETR was developed, one-stage\nSGG models based on a one-stage object detector have been actively studied.\nHowever, complex modeling is used to predict the relationship between objects,\nand the inherent relationship between object queries learned in the multi-head\nself-attention of the object detector has been neglected. We propose a\nlightweight one-stage SGG model that extracts the relation graph from the\nvarious relationships learned in the multi-head self-attention layers of the\nDETR decoder. By fully utilizing the self-attention by-products, the relation\ngraph can be extracted effectively with a shallow relation extraction head.\nConsidering the dependency of the relation extraction task on the object\ndetection task, we propose a novel relation smoothing technique that adjusts\nthe relation label adaptively according to the quality of the detected objects.\nBy the relation smoothing, the model is trained according to the continuous\ncurriculum that focuses on object detection task at the beginning of training\nand performs multi-task learning as the object detection performance gradually\nimproves. Furthermore, we propose a connectivity prediction task that predicts\nwhether a relation exists between object pairs as an auxiliary task of the\nrelation extraction. We demonstrate the effectiveness and efficiency of our\nmethod for the Visual Genome and Open Image V6 datasets. Our code is publicly\navailable at https://github.com/naver-ai/egtr .\n","authors":["Jinbae Im","JeongYeon Nam","Nokyung Park","Hyungmin Lee","Seunghyun Park"],"pdf_url":"https://arxiv.org/pdf/2404.02072v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2306.06077v3","updated":"2024-04-02T16:19:22Z","published":"2023-06-05T17:22:54Z","title":"Semantically-Prompted Language Models Improve Visual Descriptions","summary":" Language-vision models like CLIP have made significant strides in vision\ntasks, such as zero-shot image classification (ZSIC). However, generating\nspecific and expressive visual descriptions remains challenging; descriptions\nproduced by current methods are often ambiguous and lacking in granularity. To\ntackle these issues, we propose V-GLOSS: Visual Glosses, a novel method built\nupon two key ideas. The first is Semantic Prompting, which conditions a\nlanguage model on structured semantic knowledge. The second is a new\ncontrastive algorithm that elicits fine-grained distinctions between similar\nconcepts. With both ideas, we demonstrate that V-GLOSS improves visual\ndescriptions and achieves strong results in the zero-shot setting on general\nand fine-grained image-classification datasets, including ImageNet, STL-10,\nFGVC Aircraft, and Flowers 102. Moreover, these descriptive capabilities\ncontribute to enhancing image-generation performance. Finally, we introduce a\nquality-tested silver dataset with descriptions generated with V-GLOSS for all\nImageNet classes.\n","authors":["Michael Ogezi","Bradley Hauer","Grzegorz Kondrak"],"pdf_url":"https://arxiv.org/pdf/2306.06077v3.pdf","comment":"To appear at NAACL 2024"},{"id":"http://arxiv.org/abs/2404.02067v1","updated":"2024-04-02T16:07:50Z","published":"2024-04-02T16:07:50Z","title":"Red-Teaming Segment Anything Model","summary":" Foundation models have emerged as pivotal tools, tackling many complex tasks\nthrough pre-training on vast datasets and subsequent fine-tuning for specific\napplications. The Segment Anything Model is one of the first and most\nwell-known foundation models for computer vision segmentation tasks. 
This work\npresents a multi-faceted red-teaming analysis that tests the Segment Anything\nModel against challenging tasks: (1) We analyze the impact of style transfer on\nsegmentation masks, demonstrating that applying adverse weather conditions and\nraindrops to dashboard images of city roads significantly distorts generated\nmasks. (2) We focus on assessing whether the model can be used for attacks on\nprivacy, such as recognizing celebrities' faces, and show that the model\npossesses some undesired knowledge in this task. (3) Finally, we check how\nrobust the model is to adversarial attacks on segmentation masks under text\nprompts. We not only show the effectiveness of popular white-box attacks and\nresistance to black-box attacks but also introduce a novel approach - Focused\nIterative Gradient Attack (FIGA) that combines white-box approaches to\nconstruct an efficient attack resulting in a smaller number of modified pixels.\nAll of our testing methods and analyses indicate a need for enhanced safety\nmeasures in foundation models for image segmentation.\n","authors":["Krzysztof Jankowski","Bartlomiej Sobieski","Mateusz Kwiatkowski","Jakub Szulc","Michal Janik","Hubert Baniecki","Przemyslaw Biecek"],"pdf_url":"https://arxiv.org/pdf/2404.02067v1.pdf","comment":"CVPR 2024 - The 4th Workshop of Adversarial Machine Learning on\n Computer Vision: Robustness of Foundation Models"},{"id":"http://arxiv.org/abs/2404.02065v1","updated":"2024-04-02T16:06:20Z","published":"2024-04-02T16:06:20Z","title":"Multi-Level Label Correction by Distilling Proximate Patterns for\n Semi-supervised Semantic Segmentation","summary":" Semi-supervised semantic segmentation relieves the reliance on large-scale\nlabeled data by leveraging unlabeled data. Recent semi-supervised semantic\nsegmentation approaches mainly resort to pseudo-labeling methods to exploit\nunlabeled data. However, unreliable pseudo-labeling can undermine the\nsemi-supervision processes. In this paper, we propose an algorithm called\nMulti-Level Label Correction (MLLC), which aims to use graph neural networks to\ncapture structural relationships in Semantic-Level Graphs (SLGs) and\nClass-Level Graphs (CLGs) to rectify erroneous pseudo-labels. Specifically,\nSLGs represent semantic affinities between pairs of pixel features, and CLGs\ndescribe classification consistencies between pairs of pixel labels. With the\nsupport of proximate pattern information from graphs, MLLC can rectify\nincorrectly predicted pseudo-labels and can facilitate discriminative feature\nrepresentations. We design an end-to-end network to train and perform this\neffective label corrections mechanism. Experiments demonstrate that MLLC can\nsignificantly improve supervised baselines and outperforms state-of-the-art\napproaches in different scenarios on Cityscapes and PASCAL VOC 2012 datasets.\nSpecifically, MLLC improves the supervised baseline by at least 5% and 2% with\nDeepLabV2 and DeepLabV3+ respectively under different partition protocols.\n","authors":["Hui Xiao","Yuting Hong","Li Dong","Diqun Yan","Jiayan Zhuang","Junjie Xiong","Dongtai Liang","Chengbin Peng"],"pdf_url":"https://arxiv.org/pdf/2404.02065v1.pdf","comment":"12 pages, 8 figures. 
IEEE Transactions on Multimedia, 2024"},{"id":"http://arxiv.org/abs/2404.02059v1","updated":"2024-04-02T15:58:36Z","published":"2024-04-02T15:58:36Z","title":"IISAN: Efficiently Adapting Multimodal Representation for Sequential\n Recommendation with Decoupled PEFT","summary":" Multimodal foundation models are transformative in sequential recommender\nsystems, leveraging powerful representation learning capabilities. While\nParameter-efficient Fine-tuning (PEFT) is commonly used to adapt foundation\nmodels for recommendation tasks, most research prioritizes parameter\nefficiency, often overlooking critical factors like GPU memory efficiency and\ntraining speed. Addressing this gap, our paper introduces IISAN (Intra- and\nInter-modal Side Adapted Network for Multimodal Representation), a simple\nplug-and-play architecture using a Decoupled PEFT structure and exploiting both\nintra- and inter-modal adaptation.\n IISAN matches the performance of full fine-tuning (FFT) and state-of-the-art\nPEFT. More importantly, it significantly reduces GPU memory usage - from 47GB\nto just 3GB for multimodal sequential recommendation tasks. Additionally, it\naccelerates training time per epoch from 443s to 22s compared to FFT. This is\nalso a notable improvement over the Adapter and LoRA, which require 37-39 GB\nGPU memory and 350-380 seconds per epoch for training.\n Furthermore, we propose a new composite efficiency metric, TPME\n(Training-time, Parameter, and GPU Memory Efficiency) to alleviate the\nprevalent misconception that \"parameter efficiency represents overall\nefficiency\". TPME provides more comprehensive insights into practical\nefficiency comparisons between different methods. Besides, we give an\naccessible efficiency analysis of all PEFT and FFT approaches, which\ndemonstrate the superiority of IISAN. We release our codes and other materials\nat https://github.com/jjGenAILab/IISAN.\n","authors":["Junchen Fu","Xuri Ge","Xin Xin","Alexandros Karatzoglou","Ioannis Arapakis","Jie Wang","Joemon M Jose"],"pdf_url":"https://arxiv.org/pdf/2404.02059v1.pdf","comment":"Accepted by SIGIR2024"},{"id":"http://arxiv.org/abs/2307.09020v3","updated":"2024-04-02T15:46:19Z","published":"2023-07-18T07:20:31Z","title":"FISTNet: FusIon of STyle-path generative Networks for Facial Style\n Transfer","summary":" With the surge in emerging technologies such as Metaverse, spatial computing,\nand generative AI, the application of facial style transfer has gained a lot of\ninterest from researchers as well as startups enthusiasts alike. StyleGAN\nmethods have paved the way for transfer-learning strategies that could reduce\nthe dependency on the huge volume of data that is available for the training\nprocess. However, StyleGAN methods have the tendency of overfitting that\nresults in the introduction of artifacts in the facial images. Studies, such as\nDualStyleGAN, proposed the use of multipath networks but they require the\nnetworks to be trained for a specific style rather than generating a fusion of\nfacial styles at once. In this paper, we propose a FusIon of STyles (FIST)\nnetwork for facial images that leverages pre-trained multipath style transfer\nnetworks to eliminate the problem associated with lack of huge data volume in\nthe training phase along with the fusion of multiple styles at the output. We\nleverage pre-trained styleGAN networks with an external style pass that use\nresidual modulation block instead of a transform coding block. 
The method also\npreserves facial structure, identity, and details via the gated mapping unit\nintroduced in this study. The aforementioned components enable us to train the\nnetwork with very limited amount of data while generating high-quality stylized\nimages. Our training process adapts curriculum learning strategy to perform\nefficient, flexible style and model fusion in the generative space. We perform\nextensive experiments to show the superiority of FISTNet in comparison to\nexisting state-of-the-art methods.\n","authors":["Sunder Ali Khowaja","Lewis Nkenyereye","Ghulam Mujtaba","Ik Hyun Lee","Giancarlo Fortino","Kapal Dev"],"pdf_url":"https://arxiv.org/pdf/2307.09020v3.pdf","comment":"21 pages, 6 figures, 2 tables"},{"id":"http://arxiv.org/abs/2404.01188v2","updated":"2024-04-02T15:45:38Z","published":"2024-04-01T15:45:58Z","title":"MonoBox: Tightness-free Box-supervised Polyp Segmentation using\n Monotonicity Constraint","summary":" We propose MonoBox, an innovative box-supervised segmentation method\nconstrained by monotonicity to liberate its training from the user-unfriendly\nbox-tightness assumption. In contrast to conventional box-supervised\nsegmentation, where the box edges must precisely touch the target boundaries,\nMonoBox leverages imprecisely-annotated boxes to achieve robust pixel-wise\nsegmentation. The 'linchpin' is that, within the noisy zones around box edges,\nMonoBox discards the traditional misguiding multiple-instance learning loss,\nand instead optimizes a carefully-designed objective, termed monotonicity\nconstraint. Along directions transitioning from the foreground to background,\nthis new constraint steers responses to adhere to a trend of monotonically\ndecreasing values. Consequently, the originally unreliable learning within the\nnoisy zones is transformed into a correct and effective monotonicity\noptimization. Moreover, an adaptive label correction is introduced, enabling\nMonoBox to enhance the tightness of box annotations using predicted masks from\nthe previous epoch and dynamically shrink the noisy zones as training\nprogresses. We verify MonoBox in the box-supervised segmentation task of\npolyps, where satisfying box-tightness is challenging due to the vague\nboundaries between the polyp and normal tissues. Experiments on both public\nsynthetic and in-house real noisy datasets demonstrate that MonoBox exceeds\nother anti-noise state-of-the-arts by improving Dice by at least 5.5% and 3.3%,\nrespectively. Codes are at https://github.com/Huster-Hq/MonoBox.\n","authors":["Qiang Hu","Zhenyu Yi","Ying Zhou","Ting Li","Fan Huang","Mei Liu","Qiang Li","Zhiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.01188v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02046v1","updated":"2024-04-02T15:38:18Z","published":"2024-04-02T15:38:18Z","title":"Causality-based Transfer of Driving Scenarios to Unseen Intersections","summary":" Scenario-based testing of automated driving functions has become a promising\nmethod to reduce time and cost compared to real-world testing. In\nscenario-based testing automated functions are evaluated in a set of\npre-defined scenarios. These scenarios provide information about vehicle\nbehaviors, environmental conditions, or road characteristics using parameters.\nTo create realistic scenarios, parameters and parameter dependencies have to be\nfitted utilizing real-world data. However, due to the large variety of\nintersections and movement constellations found in reality, data may not be\navailable for certain scenarios. 
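The MonoBox entry above replaces multiple-instance learning inside noisy box-edge zones with a monotonicity constraint: responses should decrease monotonically when moving from foreground to background. The sketch below penalizes violations of that trend along 1D response profiles; how the profiles are sampled and weighted is assumed, not taken from the paper.

```python
import torch

def monotonicity_loss(responses):
    """Penalize any increase along profiles ordered from foreground (index 0)
    outward to background; zero when every profile decreases monotonically."""
    diffs = responses[:, 1:] - responses[:, :-1]
    return torch.relu(diffs).mean()

# Toy profiles of network responses sampled across a noisy box-edge zone.
profiles = torch.sigmoid(torch.randn(8, 32, requires_grad=True))
loss = monotonicity_loss(profiles)
loss.backward()
print(float(loss))
```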
This paper proposes a methodology to\nsystematically analyze relations between parameters of scenarios. Bayesian\nnetworks are utilized to analyze causal dependencies in order to decrease the\namount of required data and to transfer causal patterns creating unseen\nscenarios. Thereby, infrastructural influences on movement patterns are\ninvestigated to generate realistic scenarios on unobserved intersections. For\nevaluation, scenarios and underlying parameters are extracted from the inD\ndataset. Movement patterns are estimated, transferred and checked against\nrecorded data from those initially unseen intersections.\n","authors":["Christoph Glasmacher","Michael Schuldes","Sleiman El Masri","Lutz Eckstein"],"pdf_url":"https://arxiv.org/pdf/2404.02046v1.pdf","comment":"6 pages, 8 figures, 1 table, Accepted to be published as part of the\n 35th IEEE Intelligent Vehicles Symposium, June 2 - 5, 2024, Korea"},{"id":"http://arxiv.org/abs/2404.02041v1","updated":"2024-04-02T15:34:52Z","published":"2024-04-02T15:34:52Z","title":"SelfPose3d: Self-Supervised Multi-Person Multi-View 3d Pose Estimation","summary":" We present a new self-supervised approach, SelfPose3d, for estimating 3d\nposes of multiple persons from multiple camera views. Unlike current\nstate-of-the-art fully-supervised methods, our approach does not require any 2d\nor 3d ground-truth poses and uses only the multi-view input images from a\ncalibrated camera setup and 2d pseudo poses generated from an off-the-shelf 2d\nhuman pose estimator. We propose two self-supervised learning objectives:\nself-supervised person localization in 3d space and self-supervised 3d pose\nestimation. We achieve self-supervised 3d person localization by training the\nmodel on synthetically generated 3d points, serving as 3d person root\npositions, and on the projected root-heatmaps in all the views. We then model\nthe 3d poses of all the localized persons with a bottleneck representation, map\nthem onto all views obtaining 2d joints, and render them using 2d Gaussian\nheatmaps in an end-to-end differentiable manner. Afterwards, we use the\ncorresponding 2d joints and heatmaps from the pseudo 2d poses for learning. To\nalleviate the intrinsic inaccuracy of the pseudo labels, we propose an adaptive\nsupervision attention mechanism to guide the self-supervision. Our experiments\nand analysis on three public benchmark datasets, including Panoptic, Shelf, and\nCampus, show the effectiveness of our approach, which is comparable to\nfully-supervised methods. Code is available at\n\\url{https://github.com/CAMMA-public/SelfPose3D}\n","authors":["Vinkle Srivastav","Keqi Chen","Nicolas Padoy"],"pdf_url":"https://arxiv.org/pdf/2404.02041v1.pdf","comment":"Accepted for CVPR 2024"},{"id":"http://arxiv.org/abs/2403.10488v2","updated":"2024-04-02T15:34:04Z","published":"2024-03-15T17:23:38Z","title":"Joint Multimodal Transformer for Emotion Recognition in the Wild","summary":" Systems for multimodal emotion recognition (MMER) can typically outperform\nunimodal systems by leveraging the inter- and intra-modal relationships\nbetween, e.g., visual, textual, physiological, and auditory modalities. In this\npaper, an MMER method is proposed that relies on a joint multimodal transformer\nfor fusion with key-based cross-attention. This framework aims to exploit the\ndiverse and complementary nature of different modalities to improve predictive\naccuracy. Separate backbones capture intra-modal spatiotemporal dependencies\nwithin each modality over video sequences. 
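The SelfPose3d entry above projects 3D poses to 2D joints and renders them as 2D Gaussian heatmaps in a differentiable manner. The NumPy sketch below shows only the heatmap-rendering step; the real pipeline would use autograd-friendly tensor ops and the paper's own resolution and sigma, which are assumed values here.

```python
import numpy as np

def render_gaussian_heatmaps(joints_2d, height, width, sigma=2.0):
    """Render one 2D Gaussian heatmap (unit peak) per projected joint."""
    ys, xs = np.mgrid[0:height, 0:width]
    heatmaps = [np.exp(-((xs - x) ** 2 + (ys - y) ** 2) / (2.0 * sigma ** 2))
                for x, y in joints_2d]
    return np.stack(heatmaps)

hm = render_gaussian_heatmaps(np.array([[10.0, 12.0], [40.0, 20.0]]), 64, 64)
print(hm.shape, hm.max())    # (2, 64, 64) 1.0
```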
Subsequently, a joint multimodal\ntransformer fusion architecture integrates the individual modality embeddings,\nallowing the model to capture inter-modal and intra-modal relationships\neffectively. Extensive experiments on two challenging expression recognition\ntasks: (1) dimensional emotion recognition on the Affwild2 dataset (with face\nand voice), and (2) pain estimation on the Biovid dataset (with face and\nbiosensors), indicate that the proposed method can work effectively with\ndifferent modalities. Empirical results show that MMER systems with our\nproposed fusion method allow us to outperform relevant baseline and\nstate-of-the-art methods.\n","authors":["Paul Waligora","Haseeb Aslam","Osama Zeeshan","Soufiane Belharbi","Alessandro Lameiras Koerich","Marco Pedersoli","Simon Bacon","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2403.10488v2.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.10172v2","updated":"2024-04-02T15:24:24Z","published":"2023-09-18T21:41:04Z","title":"Enhancing wind field resolution in complex terrain through a\n knowledge-driven machine learning approach","summary":" Atmospheric flows are governed by a broad variety of spatio-temporal scales,\nthus making real-time numerical modeling of such turbulent flows in complex\nterrain at high resolution computationally intractable. In this study, we\ndemonstrate a neural network approach motivated by Enhanced Super-Resolution\nGenerative Adversarial Networks to upscale low-resolution wind fields to\ngenerate high-resolution wind fields in an actual wind farm in Bessaker,\nNorway. The neural network-based model is shown to successfully reconstruct\nfully resolved 3D velocity fields from a coarser scale while respecting the\nlocal terrain and that it easily outperforms trilinear interpolation. We also\ndemonstrate that by using appropriate cost function based on domain knowledge,\nwe can alleviate the use of adversarial training.\n","authors":["Jacob Wulff Wold","Florian Stadtmann","Adil Rasheed","Mandar Tabib","Omer San","Jan-Tore Horn"],"pdf_url":"https://arxiv.org/pdf/2309.10172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17922v2","updated":"2024-04-02T15:20:58Z","published":"2023-11-29T18:59:59Z","title":"A Simple Recipe for Language-guided Domain Generalized Segmentation","summary":" Generalization to new domains not seen during training is one of the\nlong-standing challenges in deploying neural networks in real-world\napplications. Existing generalization techniques either necessitate external\nimages for augmentation, and/or aim at learning invariant representations by\nimposing various alignment constraints. Large-scale pretraining has recently\nshown promising generalization capabilities, along with the potential of\nbinding different modalities. For instance, the advent of vision-language\nmodels like CLIP has opened the doorway for vision models to exploit the\ntextual modality. In this paper, we introduce a simple framework for\ngeneralizing semantic segmentation networks by employing language as the source\nof randomization. Our recipe comprises three key ingredients: (i) the\npreservation of the intrinsic CLIP robustness through minimal fine-tuning, (ii)\nlanguage-driven local style augmentation, and (iii) randomization by locally\nmixing the source and augmented styles during training. Extensive experiments\nreport state-of-the-art results on various generalization benchmarks. 
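The wind-field entry above reports that its super-resolution network easily outperforms trilinear interpolation. For reference, the sketch below shows that baseline on a toy 3D velocity field using scipy; the array shapes and the 4x upscale factor are arbitrary choices for illustration, not the paper's configuration.

```python
import numpy as np
from scipy.ndimage import zoom

# Coarse 3D wind field: 3 velocity components on a 16 x 16 x 8 grid (toy values).
coarse = np.random.rand(3, 16, 16, 8)
# order=1 zoom is (tri)linear interpolation; 4x upscaling along each spatial axis.
fine = np.stack([zoom(component, 4, order=1) for component in coarse])
print(coarse.shape, "->", fine.shape)   # (3, 16, 16, 8) -> (3, 64, 64, 32)
```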
Code is\naccessible at https://github.com/astra-vision/FAMix .\n","authors":["Mohammad Fahes","Tuan-Hung Vu","Andrei Bursuc","Patrick Pérez","Raoul de Charette"],"pdf_url":"https://arxiv.org/pdf/2311.17922v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01232v2","updated":"2024-04-02T15:03:33Z","published":"2024-04-01T16:51:13Z","title":"Open-Vocabulary Federated Learning with Multimodal Prototyping","summary":" Existing federated learning (FL) studies usually assume the training label\nspace and test label space are identical. However, in real-world applications,\nthis assumption is too ideal to be true. A new user could come up with queries\nthat involve data from unseen classes, and such open-vocabulary queries would\ndirectly defect such FL systems. Therefore, in this work, we explicitly focus\non the under-explored open-vocabulary challenge in FL. That is, for a new user,\nthe global server shall understand her/his query that involves arbitrary\nunknown classes. To address this problem, we leverage the pre-trained\nvision-language models (VLMs). In particular, we present a novel adaptation\nframework tailored for VLMs in the context of FL, named as Federated Multimodal\nPrototyping (Fed-MP). Fed-MP adaptively aggregates the local model weights\nbased on light-weight client residuals, and makes predictions based on a novel\nmultimodal prototyping mechanism. Fed-MP exploits the knowledge learned from\nthe seen classes, and robustifies the adapted VLM to unseen categories. Our\nempirical evaluation on various datasets validates the effectiveness of Fed-MP.\n","authors":["Huimin Zeng","Zhenrui Yue","Dong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.01232v2.pdf","comment":"Accepted at NAACL 2024"},{"id":"http://arxiv.org/abs/2403.07888v2","updated":"2024-04-02T14:47:23Z","published":"2024-02-02T18:54:48Z","title":"Cross-modality debiasing: using language to mitigate sub-population\n shifts in imaging","summary":" Sub-population shift is a specific type of domain shift that highlights\nchanges in data distribution within specific sub-groups or populations between\ntraining and testing. Sub-population shift accounts for a significant source of\nalgorithmic bias and calls for distributional robustness. Recent studies found\ninherent distributional robustness in multi-modality foundation models, such as\nthe vision-language model CLIP, yet this robustness is vulnerable through\nparameter fine-tuning. In this paper, we propose leveraging the connection of\nrobustness among different modalities and reshaping the distributional\nrobustness of one modality with another. Specifically, in the context of the\ndistributional robustness of CLIP, we propose to leverage natural language\ninputs to debias the image feature representations, to improve worst-case\nperformance on sub-populations. 
Our extensive empirical studies show that image\nrepresentations debiased by natural language can achieve significant\nperformance improvement and reduction of performance instability under\nsub-population shifts.\n","authors":["Yijiang Pang","Bao Hoang","Jiayu Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.07888v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01998v1","updated":"2024-04-02T14:41:42Z","published":"2024-04-02T14:41:42Z","title":"Specularity Factorization for Low-Light Enhancement","summary":" We present a new additive image factorization technique that treats images to\nbe composed of multiple latent specular components which can be simply\nestimated recursively by modulating the sparsity during decomposition. Our\nmodel-driven {\\em RSFNet} estimates these factors by unrolling the optimization\ninto network layers requiring only a few scalars to be learned. The resultant\nfactors are interpretable by design and can be fused for different image\nenhancement tasks via a network or combined directly by the user in a\ncontrollable fashion. Based on RSFNet, we detail a zero-reference Low Light\nEnhancement (LLE) application trained without paired or unpaired supervision.\nOur system improves the state-of-the-art performance on standard benchmarks and\nachieves better generalization on multiple other datasets. We also integrate\nour factors with other task specific fusion networks for applications like\nderaining, deblurring and dehazing with negligible overhead thereby\nhighlighting the multi-domain and multi-task generalizability of our proposed\nRSFNet. The code and data is released for reproducibility on the project\nhomepage.\n","authors":["Saurabh Saini","P J Narayanan"],"pdf_url":"https://arxiv.org/pdf/2404.01998v1.pdf","comment":"CVPR 2024, Pages: 8(main)+4(references)+17(supp) = 29"},{"id":"http://arxiv.org/abs/2404.01995v1","updated":"2024-04-02T14:40:11Z","published":"2024-04-02T14:40:11Z","title":"A discussion about violin reduction: geometric analysis of contour lines\n and channel of minima","summary":" Some early violins have been reduced during their history to fit imposed\nmorphological standards, while more recent ones have been built directly to\nthese standards. We can observe differences between reduced and unreduced\ninstruments, particularly in their contour lines and channel of minima. In a\nrecent preliminary work, we computed and highlighted those two features for two\ninstruments using triangular 3D meshes acquired by photogrammetry, whose\nfidelity has been assessed and validated with sub-millimetre accuracy. We\npropose here an extension to a corpus of 38 violins, violas and cellos, and\nintroduce improved procedures, leading to a stronger discussion of the\ngeometric analysis. We first recall the material we are working with. We then\ndiscuss how to derive the best reference plane for the violin alignment, which\nis crucial for the computation of contour lines and channel of minima. 
Finally,\nwe show how to compute efficiently both characteristics and we illustrate our\nresults with a few examples.\n","authors":["Philémon Beghin","Anne-Emmanuelle Ceulemans","François Glineur"],"pdf_url":"https://arxiv.org/pdf/2404.01995v1.pdf","comment":"Paper accepted (before reviewing) for the Florence Heri-Tech 2024\n Conference"},{"id":"http://arxiv.org/abs/2404.01994v1","updated":"2024-04-02T14:40:04Z","published":"2024-04-02T14:40:04Z","title":"DELAN: Dual-Level Alignment for Vision-and-Language Navigation by\n Cross-Modal Contrastive Learning","summary":" Vision-and-Language navigation (VLN) requires an agent to navigate in unseen\nenvironment by following natural language instruction. For task completion, the\nagent needs to align and integrate various navigation modalities, including\ninstruction, observation and navigation history. Existing works primarily\nconcentrate on cross-modal attention at the fusion stage to achieve this\nobjective. Nevertheless, modality features generated by disparate uni-encoders\nreside in their own spaces, leading to a decline in the quality of cross-modal\nfusion and decision. To address this problem, we propose a Dual-levEL AligNment\n(DELAN) framework by cross-modal contrastive learning. This framework is\ndesigned to align various navigation-related modalities before fusion, thereby\nenhancing cross-modal interaction and action decision-making. Specifically, we\ndivide the pre-fusion alignment into dual levels: instruction-history level and\nlandmark-observation level according to their semantic correlations. We also\nreconstruct a dual-level instruction for adaptation to the dual-level\nalignment. As the training signals for pre-fusion alignment are extremely\nlimited, self-supervised contrastive learning strategies are employed to\nenforce the matching between different modalities. Our approach seamlessly\nintegrates with the majority of existing models, resulting in improved\nnavigation performance on various VLN benchmarks, including R2R, R4R, RxR and\nCVDN.\n","authors":["Mengfei Du","Binhao Wu","Jiwen Zhang","Zhihao Fan","Zejun Li","Ruipu Luo","Xuanjing Huang","Zhongyu Wei"],"pdf_url":"https://arxiv.org/pdf/2404.01994v1.pdf","comment":"Accepted by LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2312.00057v2","updated":"2024-04-02T14:28:26Z","published":"2023-11-29T12:10:00Z","title":"VA3: Virtually Assured Amplification Attack on Probabilistic Copyright\n Protection for Text-to-Image Generative Models","summary":" The booming use of text-to-image generative models has raised concerns about\ntheir high risk of producing copyright-infringing content. While probabilistic\ncopyright protection methods provide a probabilistic guarantee against such\ninfringement, in this paper, we introduce Virtually Assured Amplification\nAttack (VA3), a novel online attack framework that exposes the vulnerabilities\nof these protection mechanisms. The proposed framework significantly amplifies\nthe probability of generating infringing content on the sustained interactions\nwith generative models and a non-trivial lower-bound on the success probability\nof each engagement. Our theoretical and experimental results demonstrate the\neffectiveness of our approach under various scenarios. These findings highlight\nthe potential risk of implementing probabilistic copyright protection in\npractical applications of text-to-image generative models. 
Code is available at\nhttps://github.com/South7X/VA3.\n","authors":["Xiang Li","Qianli Shen","Kenji Kawaguchi"],"pdf_url":"https://arxiv.org/pdf/2312.00057v2.pdf","comment":"18 pages, 9 figures. Accept to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01988v1","updated":"2024-04-02T14:26:18Z","published":"2024-04-02T14:26:18Z","title":"Cooperative Students: Navigating Unsupervised Domain Adaptation in\n Nighttime Object Detection","summary":" Unsupervised Domain Adaptation (UDA) has shown significant advancements in\nobject detection under well-lit conditions; however, its performance degrades\nnotably in low-visibility scenarios, especially at night, posing challenges not\nonly for its adaptability in low signal-to-noise ratio (SNR) conditions but\nalso for the reliability and efficiency of automated vehicles. To address this\nproblem, we propose a \\textbf{Co}operative \\textbf{S}tudents (\\textbf{CoS})\nframework that innovatively employs global-local transformations (GLT) and a\nproxy-based target consistency (PTC) mechanism to capture the spatial\nconsistency in day- and night-time scenarios effectively, and thus bridge the\nsignificant domain shift across contexts. Building upon this, we further devise\nan adaptive IoU-informed thresholding (AIT) module to gradually avoid\noverlooking potential true positives and enrich the latent information in the\ntarget domain. Comprehensive experiments show that CoS essentially enhanced UDA\nperformance in low-visibility conditions and surpasses current state-of-the-art\ntechniques, achieving an increase in mAP of 3.0\\%, 1.9\\%, and 2.5\\% on BDD100K,\nSHIFT, and ACDC datasets, respectively. Code is available at\nhttps://github.com/jichengyuan/Cooperitive_Students.\n","authors":["Jicheng Yuan","Anh Le-Tuan","Manfred Hauswirth","Danh Le-Phuoc"],"pdf_url":"https://arxiv.org/pdf/2404.01988v1.pdf","comment":"Code is available at\n https://github.com/jichengyuan/Cooperitive_Students"},{"id":"http://arxiv.org/abs/2404.01984v1","updated":"2024-04-02T14:22:04Z","published":"2024-04-02T14:22:04Z","title":"Fashion Style Editing with Generative Human Prior","summary":" Image editing has been a long-standing challenge in the research community\nwith its far-reaching impact on numerous applications. Recently, text-driven\nmethods started to deliver promising results in domains like human faces, but\ntheir applications to more complex domains have been relatively limited. In\nthis work, we explore the task of fashion style editing, where we aim to\nmanipulate the fashion style of human imagery using text descriptions.\nSpecifically, we leverage a generative human prior and achieve fashion style\nediting by navigating its learned latent space. We first verify that the\nexisting text-driven editing methods fall short for our problem due to their\noverly simplified guidance signal, and propose two directions to reinforce the\nguidance: textual augmentation and visual referencing. 
Combined with our\nempirical findings on the latent space structure, our Fashion Style Editing\nframework (FaSE) successfully projects abstract fashion concepts onto human\nimages and introduces exciting new applications to the field.\n","authors":["Chaerin Kong","Seungyong Lee","Soohyeok Im","Wonsuk Yang"],"pdf_url":"https://arxiv.org/pdf/2404.01984v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2404.01976v1","updated":"2024-04-02T14:16:59Z","published":"2024-04-02T14:16:59Z","title":"Joint-Task Regularization for Partially Labeled Multi-Task Learning","summary":" Multi-task learning has become increasingly popular in the machine learning\nfield, but its practicality is hindered by the need for large, labeled\ndatasets. Most multi-task learning methods depend on fully labeled datasets\nwherein each input example is accompanied by ground-truth labels for all target\ntasks. Unfortunately, curating such datasets can be prohibitively expensive and\nimpractical, especially for dense prediction tasks which require per-pixel\nlabels for each image. With this in mind, we propose Joint-Task Regularization\n(JTR), an intuitive technique which leverages cross-task relations to\nsimultaneously regularize all tasks in a single joint-task latent space to\nimprove learning when data is not fully labeled for all tasks. JTR stands out\nfrom existing approaches in that it regularizes all tasks jointly rather than\nseparately in pairs -- therefore, it achieves linear complexity relative to the\nnumber of tasks while previous methods scale quadratically. To demonstrate the\nvalidity of our approach, we extensively benchmark our method across a wide\nvariety of partially labeled scenarios based on NYU-v2, Cityscapes, and\nTaskonomy.\n","authors":["Kento Nishi","Junsik Kim","Wanhua Li","Hanspeter Pfister"],"pdf_url":"https://arxiv.org/pdf/2404.01976v1.pdf","comment":"Accepted paper to CVPR 2024 (main conference)"},{"id":"http://arxiv.org/abs/2308.13150v8","updated":"2024-04-02T14:14:26Z","published":"2023-08-25T03:08:41Z","title":"Dual-Activated Lightweight Attention ResNet50 for Automatic\n Histopathology Breast Cancer Image Classification","summary":" Automatic breast cancer classification in histopathology images is crucial\nfor precise diagnosis and treatment planning. Recently, classification\napproaches based on the ResNet architecture have gained popularity for\nsignificantly improving accuracy by using skip connections to mitigate\nvanishing gradient problems, thereby integrating low-level and high-level\nfeature information. Nevertheless, the conventional ResNet architecture faces\nchallenges such as data imbalance and limited interpretability, necessitating\ncross-domain knowledge and collaboration among medical experts. This study\neffectively addresses these challenges by introducing a novel method for breast\ncancer classification, the Dual-Activated Lightweight Attention ResNet50\n(DALAResNet50) model. It integrates a pre-trained ResNet50 model with a\nlightweight attention mechanism, embedding an attention module in the fourth\nlayer of ResNet50 and incorporating two fully connected layers with LeakyReLU\nand ReLU activation functions to enhance feature learning capabilities. The\nDALAResNet50 method was tested on breast cancer histopathology images from the\nBreakHis Database across magnification factors of 40X, 100X, 200X, and 400X,\nachieving accuracies of 98.5%, 98.7%, 97.9%, and 94.3%, respectively. 
It was\nalso compared with established deep learning models such as SEResNet50,\nDenseNet121, VGG16, VGG16Inception, ViT, Swin-Transformer, Dinov2_Vitb14, and\nResNet50. The reported results of DALAResNet50 have been shown to outperform\nthe compared approaches regarding accuracy, F1 score, IBA, and GMean,\ndemonstrating significant robustness and broad applicability when dealing with\ndifferent magnifications and imbalanced breast cancer datasets\n","authors":["Suxing Liu"],"pdf_url":"https://arxiv.org/pdf/2308.13150v8.pdf","comment":"13 pages, 7 figures,7 tables"},{"id":"http://arxiv.org/abs/2310.13076v2","updated":"2024-04-02T14:14:16Z","published":"2023-10-19T18:14:33Z","title":"PatchCURE: Improving Certifiable Robustness, Model Utility, and\n Computation Efficiency of Adversarial Patch Defenses","summary":" State-of-the-art defenses against adversarial patch attacks can now achieve\nstrong certifiable robustness with a marginal drop in model utility. However,\nthis impressive performance typically comes at the cost of 10-100x more\ninference-time computation compared to undefended models -- the research\ncommunity has witnessed an intense three-way trade-off between certifiable\nrobustness, model utility, and computation efficiency. In this paper, we\npropose a defense framework named PatchCURE to approach this trade-off problem.\nPatchCURE provides sufficient \"knobs\" for tuning defense performance and allows\nus to build a family of defenses: the most robust PatchCURE instance can match\nthe performance of any existing state-of-the-art defense (without efficiency\nconsiderations); the most efficient PatchCURE instance has similar inference\nefficiency as undefended models. Notably, PatchCURE achieves state-of-the-art\nrobustness and utility performance across all different efficiency levels,\ne.g., 16-23% absolute clean accuracy and certified robust accuracy advantages\nover prior defenses when requiring computation efficiency to be close to\nundefended models. The family of PatchCURE defenses enables us to flexibly\nchoose appropriate defenses to satisfy given computation and/or utility\nconstraints in practice.\n","authors":["Chong Xiang","Tong Wu","Sihui Dai","Jonathan Petit","Suman Jana","Prateek Mittal"],"pdf_url":"https://arxiv.org/pdf/2310.13076v2.pdf","comment":"USENIX Security 2024. (extended) technical report"},{"id":"http://arxiv.org/abs/2404.01964v1","updated":"2024-04-02T13:57:30Z","published":"2024-04-02T13:57:30Z","title":"CAM-Based Methods Can See through Walls","summary":" CAM-based methods are widely-used post-hoc interpretability method that\nproduce a saliency map to explain the decision of an image classification\nmodel. The saliency map highlights the important areas of the image relevant to\nthe prediction. In this paper, we show that most of these methods can\nincorrectly attribute an important score to parts of the image that the model\ncannot see. We show that this phenomenon occurs both theoretically and\nexperimentally. On the theory side, we analyze the behavior of GradCAM on a\nsimple masked CNN model at initialization. Experimentally, we train a VGG-like\nmodel constrained to not use the lower part of the image and nevertheless\nobserve positive scores in the unseen part of the image. This behavior is\nevaluated quantitatively on two new datasets. 
We believe that this is\nproblematic, potentially leading to mis-interpretation of the model's behavior.\n","authors":["Magamed Taimeskhanov","Ronan Sicre","Damien Garreau"],"pdf_url":"https://arxiv.org/pdf/2404.01964v1.pdf","comment":"25 pages, 9 figures"},{"id":"http://arxiv.org/abs/2307.10974v2","updated":"2024-04-02T13:57:22Z","published":"2023-07-20T16:00:19Z","title":"Deep Multi-Threshold Spiking-UNet for Image Processing","summary":" U-Net, known for its simple yet efficient architecture, is widely utilized\nfor image processing tasks and is particularly suitable for deployment on\nneuromorphic chips. This paper introduces the novel concept of Spiking-UNet for\nimage processing, which combines the power of Spiking Neural Networks (SNNs)\nwith the U-Net architecture. To achieve an efficient Spiking-UNet, we face two\nprimary challenges: ensuring high-fidelity information propagation through the\nnetwork via spikes and formulating an effective training strategy. To address\nthe issue of information loss, we introduce multi-threshold spiking neurons,\nwhich improve the efficiency of information transmission within the\nSpiking-UNet. For the training strategy, we adopt a conversion and fine-tuning\npipeline that leverage pre-trained U-Net models. During the conversion process,\nsignificant variability in data distribution across different parts is observed\nwhen utilizing skip connections. Therefore, we propose a connection-wise\nnormalization method to prevent inaccurate firing rates. Furthermore, we adopt\na flow-based training method to fine-tune the converted models, reducing time\nsteps while preserving performance. Experimental results show that, on image\nsegmentation and denoising, our Spiking-UNet achieves comparable performance to\nits non-spiking counterpart, surpassing existing SNN methods. Compared with the\nconverted Spiking-UNet without fine-tuning, our Spiking-UNet reduces inference\ntime by approximately 90\\%. This research broadens the application scope of\nSNNs in image processing and is expected to inspire further exploration in the\nfield of neuromorphic engineering. The code for our Spiking-UNet implementation\nis available at https://github.com/SNNresearch/Spiking-UNet.\n","authors":["Hebei Li","Yueyi Zhang","Zhiwei Xiong","Zheng-jun Zha","Xiaoyan Sun"],"pdf_url":"https://arxiv.org/pdf/2307.10974v2.pdf","comment":"Accepted in NeuroComputing"},{"id":"http://arxiv.org/abs/2404.01959v1","updated":"2024-04-02T13:54:22Z","published":"2024-04-02T13:54:22Z","title":"Bi-LORA: A Vision-Language Approach for Synthetic Image Detection","summary":" Advancements in deep image synthesis techniques, such as generative\nadversarial networks (GANs) and diffusion models (DMs), have ushered in an era\nof generating highly realistic images. While this technological progress has\ncaptured significant interest, it has also raised concerns about the potential\ndifficulty in distinguishing real images from their synthetic counterparts.\nThis paper takes inspiration from the potent convergence capabilities between\nvision and language, coupled with the zero-shot nature of vision-language\nmodels (VLMs). 
We introduce an innovative method called Bi-LORA that leverages\nVLMs, combined with low-rank adaptation (LORA) tuning techniques, to enhance\nthe precision of synthetic image detection for unseen model-generated images.\nThe pivotal conceptual shift in our methodology revolves around reframing\nbinary classification as an image captioning task, leveraging the distinctive\ncapabilities of cutting-edge VLM, notably bootstrapping language image\npre-training (BLIP2). Rigorous and comprehensive experiments are conducted to\nvalidate the effectiveness of our proposed approach, particularly in detecting\nunseen diffusion-generated images from unknown diffusion-based generative\nmodels during training, showcasing robustness to noise, and demonstrating\ngeneralization capabilities to GANs. The obtained results showcase an\nimpressive average accuracy of 93.41% in synthetic image detection on unseen\ngeneration models. The code and models associated with this research can be\npublicly accessed at https://github.com/Mamadou-Keita/VLM-DETECT.\n","authors":["Mamadou Keita","Wassim Hamidouche","Hessen Bougueffa Eutamene","Abdenour Hadid","Abdelmalik Taleb-Ahmed"],"pdf_url":"https://arxiv.org/pdf/2404.01959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01952v1","updated":"2024-04-02T13:47:15Z","published":"2024-04-02T13:47:15Z","title":"Automatic Wood Pith Detector: Local Orientation Estimation and Robust\n Accumulation","summary":" A fully automated technique for wood pith detection (APD), relying on the\nconcentric shape of the structure of wood ring slices, is introduced. The\nmethod estimates the ring's local orientations using the 2D structure tensor\nand finds the pith position, optimizing a cost function designed for this\nproblem. We also present a variant (APD-PCL), using the parallel coordinates\nspace, that enhances the method's effectiveness when there are no clear tree\nring patterns. Furthermore, refining previous work by Kurdthongmee, a YoloV8\nnet is trained for pith detection, producing a deep learning-based approach to\nthe same problem (APD-DL). All methods were tested on seven datasets, including\nimages captured under diverse conditions (controlled laboratory settings,\nsawmill, and forest) and featuring various tree species (Pinus taeda, Douglas\nfir, Abies alba, and Gleditsia triacanthos). All proposed approaches outperform\nexisting state-of-the-art methods and can be used in CPU-based real-time\napplications. Additionally, we provide a novel dataset comprising images of\ngymnosperm and angiosperm species. Dataset and source code are available at\nhttp://github.com/hmarichal93/apd.\n","authors":["Henry Marichal","Diego Passarella","Gregory Randall"],"pdf_url":"https://arxiv.org/pdf/2404.01952v1.pdf","comment":"18 pages, presented to ICPR 2024 conference"},{"id":"http://arxiv.org/abs/2404.01948v1","updated":"2024-04-02T13:43:08Z","published":"2024-04-02T13:43:08Z","title":"Quantifying Noise of Dynamic Vision Sensor","summary":" Dynamic visual sensors (DVS) are characterized by a large amount of\nbackground activity (BA) noise, which is mixed with the original (cleaned)\nsensor signal. The dynamic nature of the signal and the absence of ground\ntruth in practical applications make it difficult to distinguish\nbetween noise and the cleaned sensor signals using standard image processing\ntechniques. In this letter, a new technique is presented to characterise BA\nnoise derived from the Detrended Fluctuation Analysis (DFA). 
The proposed\ntechnique can be used to address existing DVS issues, namely how to\nquantitatively characterise noise and signal without ground truth, and how to\nderive optimal denoising filter parameters. The solution of the latter\nproblem is demonstrated for the popular real moving-car dataset.\n","authors":["Evgeny V. Votyakov","Alessandro Artusi"],"pdf_url":"https://arxiv.org/pdf/2404.01948v1.pdf","comment":"5 pages, 4 figures, submitted to the IEEE Signal Processing Letters"},{"id":"http://arxiv.org/abs/2404.01946v1","updated":"2024-04-02T13:42:29Z","published":"2024-04-02T13:42:29Z","title":"Synthetic Data for Robust Stroke Segmentation","summary":" Deep learning-based semantic segmentation in neuroimaging currently requires\nhigh-resolution scans and extensive annotated datasets, posing significant\nbarriers to clinical applicability. We present a novel synthetic framework for\nthe task of lesion segmentation, extending the capabilities of the established\nSynthSeg approach to accommodate large heterogeneous pathologies with\nlesion-specific augmentation strategies. Our method trains deep learning\nmodels, demonstrated here with the UNet architecture, using label maps derived\nfrom healthy and stroke datasets, facilitating the segmentation of both healthy\ntissue and pathological lesions without sequence-specific training data.\nEvaluated against in-domain and out-of-domain (OOD) datasets, our framework\ndemonstrates robust performance, rivaling current methods within the training\ndomain and significantly outperforming them on OOD data. This contribution\nholds promise for advancing medical imaging analysis in clinical settings,\nespecially for stroke pathology, by enabling reliable segmentation across\nvaried imaging sequences with reduced dependency on large annotated corpora.\nCode and weights available at https://github.com/liamchalcroft/SynthStroke.\n","authors":["Liam Chalcroft","Ioannis Pappas","Cathy J. Price","John Ashburner"],"pdf_url":"https://arxiv.org/pdf/2404.01946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01945v1","updated":"2024-04-02T13:41:22Z","published":"2024-04-02T13:41:22Z","title":"Event-assisted Low-Light Video Object Segmentation","summary":" In the realm of video object segmentation (VOS), the challenge of operating\nunder low-light conditions persists, resulting in notably degraded image\nquality and compromised accuracy when comparing query and memory frames for\nsimilarity computation. Event cameras, characterized by their high dynamic\nrange and ability to capture motion information of objects, offer promise in\nenhancing object visibility and aiding VOS methods under such low-light\nconditions. This paper introduces a pioneering framework tailored for low-light\nVOS, leveraging event camera data to elevate segmentation accuracy. Our\napproach hinges on two pivotal components: the Adaptive Cross-Modal Fusion\n(ACMF) module, aimed at extracting pertinent features while fusing image and\nevent modalities to mitigate noise interference, and the Event-Guided Memory\nMatching (EGMM) module, designed to rectify the issue of inaccurate matching\nprevalent in low-light settings. Additionally, we present the creation of a\nsynthetic LLE-DAVIS dataset and the curation of a real-world LLE-VOS dataset,\nencompassing frames and events. 
Experimental evaluations corroborate the\nefficacy of our method across both datasets, affirming its effectiveness in\nlow-light scenarios.\n","authors":["Hebei Li","Jin Wang","Jiahui Yuan","Yue Li","Wenming Weng","Yansong Peng","Yueyi Zhang","Zhiwei Xiong","Xiaoyan Sun"],"pdf_url":"https://arxiv.org/pdf/2404.01945v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01943v1","updated":"2024-04-02T13:36:03Z","published":"2024-04-02T13:36:03Z","title":"Lookahead Exploration with Neural Radiance Representation for Continuous\n Vision-Language Navigation","summary":" Vision-and-language navigation (VLN) enables the agent to navigate to a\nremote location following the natural language instruction in 3D environments.\nAt each navigation step, the agent selects from possible candidate locations\nand then makes the move. For better navigation planning, the lookahead\nexploration strategy aims to effectively evaluate the agent's next action by\naccurately anticipating the future environment of candidate locations. To this\nend, some existing works predict RGB images for future environments, while this\nstrategy suffers from image distortion and high computational cost. To address\nthese issues, we propose the pre-trained hierarchical neural radiance\nrepresentation model (HNR) to produce multi-level semantic features for future\nenvironments, which are more robust and efficient than pixel-wise RGB\nreconstruction. Furthermore, with the predicted future environmental\nrepresentations, our lookahead VLN model is able to construct the navigable\nfuture path tree and select the optimal path via efficient parallel evaluation.\nExtensive experiments on the VLN-CE datasets confirm the effectiveness of our\nmethod.\n","authors":["Zihan Wang","Xiangyang Li","Jiahao Yang","Yeqi Liu","Junjie Hu","Ming Jiang","Shuqiang Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.01943v1.pdf","comment":"Accepted by CVPR 2024. The code is available at\n https://github.com/MrZihan/HNR-VLN"},{"id":"http://arxiv.org/abs/2404.01941v1","updated":"2024-04-02T13:33:31Z","published":"2024-04-02T13:33:31Z","title":"LPSNet: End-to-End Human Pose and Shape Estimation with Lensless Imaging","summary":" Human pose and shape (HPS) estimation with lensless imaging is not only\nbeneficial to privacy protection but also can be used in covert surveillance\nscenarios due to the small size and simple structure of this device. However,\nthis task presents significant challenges due to the inherent ambiguity of the\ncaptured measurements and lacks effective methods for directly estimating human\npose and shape from lensless data. In this paper, we propose the first\nend-to-end framework to recover 3D human poses and shapes from lensless\nmeasurements to our knowledge. We specifically design a multi-scale lensless\nfeature decoder to decode the lensless measurements through the optically\nencoded mask for efficient feature extraction. We also propose a double-head\nauxiliary supervision mechanism to improve the estimation accuracy of human\nlimb ends. 
Besides, we establish a lensless imaging system and verify the\neffectiveness of our method on various datasets acquired by our lensless\nimaging system.\n","authors":["Haoyang Ge","Qiao Feng","Hailong Jia","Xiongzheng Li","Xiangjun Yin","You Zhou","Jingyu Yang","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2404.01941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16003v2","updated":"2024-04-02T13:31:41Z","published":"2024-03-24T04:22:37Z","title":"Diverse Representation Embedding for Lifelong Person Re-Identification","summary":" Lifelong Person Re-Identification (LReID) aims to continuously learn from\nsuccessive data streams, matching individuals across multiple cameras. The key\nchallenge for LReID is how to effectively preserve old knowledge while\nincrementally learning new information, which is caused by task-level domain\ngaps and limited old task datasets. Existing methods based on CNN backbone are\ninsufficient to explore the representation of each instance from different\nperspectives, limiting model performance on limited old task datasets and new\ntask datasets. Unlike these methods, we propose a Diverse Representations\nEmbedding (DRE) framework that first explores a pure transformer for LReID. The\nproposed DRE preserves old knowledge while adapting to new information based on\ninstance-level and task-level layout. Concretely, an Adaptive Constraint Module\n(ACM) is proposed to implement integration and push away operations between\nmultiple overlapping representations generated by transformer-based backbone,\nobtaining rich and discriminative representations for each instance to improve\nadaptive ability of LReID. Based on the processed diverse representations, we\npropose Knowledge Update (KU) and Knowledge Preservation (KP) strategies at the\ntask-level layout by introducing the adjustment model and the learner model. KU\nstrategy enhances the adaptive learning ability of learner models for new\ninformation under the adjustment model prior, and KP strategy preserves old\nknowledge operated by representation-level alignment and logit-level\nsupervision in limited old task datasets while guaranteeing the adaptive\nlearning information capacity of the LReID model. Compared to state-of-the-art\nmethods, our method achieves significantly improved performance in holistic,\nlarge-scale, and occluded datasets.\n","authors":["Shiben Liu","Huijie Fan","Qiang Wang","Xiai Chen","Zhi Han","Yandong Tang"],"pdf_url":"https://arxiv.org/pdf/2403.16003v2.pdf","comment":"11 pages,7 Tables,3 Figures"},{"id":"http://arxiv.org/abs/2404.01933v1","updated":"2024-04-02T13:27:28Z","published":"2024-04-02T13:27:28Z","title":"PREGO: online mistake detection in PRocedural EGOcentric videos","summary":" Promptly identifying procedural errors from egocentric videos in an online\nsetting is highly challenging and valuable for detecting mistakes as soon as\nthey happen. This capability has a wide range of applications across various\nfields, such as manufacturing and healthcare. The nature of procedural mistakes\nis open-set since novel types of failures might occur, which calls for\none-class classifiers trained on correctly executed procedures. However, no\ntechnique can currently detect open-set procedural mistakes online. We propose\nPREGO, the first online one-class classification model for mistake detection in\nPRocedural EGOcentric videos. PREGO is based on an online action recognition\ncomponent to model the current action, and a symbolic reasoning module to\npredict the next actions. 
Mistake detection is performed by comparing the\nrecognized current action with the expected future one. We evaluate PREGO on\ntwo procedural egocentric video datasets, Assembly101 and Epic-tent, which we\nadapt for online benchmarking of procedural mistake detection to establish\nsuitable benchmarks, thus defining the Assembly101-O and Epic-tent-O datasets,\nrespectively.\n","authors":["Alessandro Flaborea","Guido Maria D'Amely di Melendugno","Leonardo Plini","Luca Scofano","Edoardo De Matteis","Antonino Furnari","Giovanni Maria Farinella","Fabio Galasso"],"pdf_url":"https://arxiv.org/pdf/2404.01933v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2312.16476v5","updated":"2024-04-02T13:25:04Z","published":"2023-12-27T08:50:01Z","title":"SVGDreamer: Text Guided SVG Generation with Diffusion Model","summary":" Recently, text-guided scalable vector graphics (SVGs) synthesis has shown\npromise in domains such as iconography and sketch. However, existing\ntext-to-SVG generation methods lack editability and struggle with visual\nquality and result diversity. To address these limitations, we propose a novel\ntext-guided vector graphics synthesis method called SVGDreamer. SVGDreamer\nincorporates a semantic-driven image vectorization (SIVE) process that enables\nthe decomposition of synthesis into foreground objects and background, thereby\nenhancing editability. Specifically, the SIVE process introduces\nattention-based primitive control and an attention-mask loss function for\neffective control and manipulation of individual elements. Additionally, we\npropose a Vectorized Particle-based Score Distillation (VPSD) approach to\naddress issues of shape over-smoothing, color over-saturation, limited\ndiversity, and slow convergence of the existing text-to-SVG generation methods\nby modeling SVGs as distributions of control points and colors. Furthermore,\nVPSD leverages a reward model to re-weight vector particles, which improves\naesthetic appeal and accelerates convergence. Extensive experiments are\nconducted to validate the effectiveness of SVGDreamer, demonstrating its\nsuperiority over baseline methods in terms of editability, visual quality, and\ndiversity. Project page:\n\\href{https://ximinng.github.io/SVGDreamer-project/}{https://ximinng.github.io/SVGDreamer-project/}\n","authors":["Ximing Xing","Haitao Zhou","Chuang Wang","Jing Zhang","Dong Xu","Qian Yu"],"pdf_url":"https://arxiv.org/pdf/2312.16476v5.pdf","comment":"Accepted by CVPR 2024. project link:\n https://ximinng.github.io/SVGDreamer-project/"},{"id":"http://arxiv.org/abs/2204.14030v5","updated":"2024-04-02T13:24:37Z","published":"2022-04-29T11:55:35Z","title":"Neural Implicit Representations for Physical Parameter Inference from a\n Single Video","summary":" Neural networks have recently been used to analyze diverse physical systems\nand to identify the underlying dynamics. While existing methods achieve\nimpressive results, they are limited by their strong demand for training data\nand their weak generalization abilities to out-of-distribution data. To\novercome these limitations, in this work we propose to combine neural implicit\nrepresentations for appearance modeling with neural ordinary differential\nequations (ODEs) for modelling physical phenomena to obtain a dynamic scene\nrepresentation that can be identified directly from visual observations. 
Our\nproposed model combines several unique advantages: (i) Contrary to existing\napproaches that require large training datasets, we are able to identify\nphysical parameters from only a single video. (ii) The use of neural implicit\nrepresentations enables the processing of high-resolution videos and the\nsynthesis of photo-realistic images. (iii) The embedded neural ODE has a known\nparametric form that allows for the identification of interpretable physical\nparameters, and (iv) long-term prediction in state space. (v) Furthermore, the\nphoto-realistic rendering of novel scenes with modified physical parameters\nbecomes possible.\n","authors":["Florian Hofherr","Lukas Koestler","Florian Bernard","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2204.14030v5.pdf","comment":"Published in IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV) 2023"},{"id":"http://arxiv.org/abs/2404.01929v1","updated":"2024-04-02T13:23:21Z","published":"2024-04-02T13:23:21Z","title":"Towards Enhanced Analysis of Lung Cancer Lesions in EBUS-TBNA -- A\n Semi-Supervised Video Object Detection Method","summary":" This study aims to establish a computer-aided diagnostic system for lung\nlesions using bronchoscope endobronchial ultrasound (EBUS) to assist physicians\nin identifying lesion areas. During EBUS-transbronchial needle aspiration\n(EBUS-TBNA) procedures, physicians rely on grayscale ultrasound images to\ndetermine the location of lesions. However, these images often contain\nsignificant noise and can be influenced by surrounding tissues or blood\nvessels, making interpretation challenging. Previous research has lacked the\napplication of object detection models to EBUS-TBNA, and there has been no\nwell-defined solution for annotating the EBUS-TBNA dataset. In related studies\non ultrasound images, although models have been successful in capturing target\nregions for their respective tasks, their training and predictions have been\nbased on two-dimensional images, limiting their ability to leverage temporal\nfeatures for improved predictions. This study introduces a three-dimensional\nimage-based object detection model. It utilizes an attention mechanism to\ncapture temporal correlations and implements a filtering mechanism to\nselect relevant information from previous frames. Subsequently, a\nteacher-student model training approach is employed to optimize the model\nfurther, leveraging unlabeled data. To mitigate the impact of poor-quality\npseudo-labels on the student model, we add a special Gaussian Mixture\nModel (GMM) to ensure the quality of pseudo-labels.\n","authors":["Jyun-An Lin","Yun-Chien Cheng","Ching-Kai Lin"],"pdf_url":"https://arxiv.org/pdf/2404.01929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01925v1","updated":"2024-04-02T13:19:45Z","published":"2024-04-02T13:19:45Z","title":"Improving Bird's Eye View Semantic Segmentation by Task Decomposition","summary":" Semantic segmentation in bird's eye view (BEV) plays a crucial role in\nautonomous driving. Previous methods usually follow an end-to-end pipeline,\ndirectly predicting the BEV segmentation map from monocular RGB inputs.\nHowever, a challenge arises because the RGB inputs and BEV targets come from distinct\nperspectives, making direct point-to-point prediction hard to optimize. In\nthis paper, we decompose the original BEV segmentation task into two stages,\nnamely BEV map reconstruction and RGB-BEV feature alignment. 
In the first\nstage, we train a BEV autoencoder to reconstruct the BEV segmentation maps\ngiven corrupted noisy latent representation, which urges the decoder to learn\nfundamental knowledge of typical BEV patterns. The second stage involves\nmapping RGB input images into the BEV latent space of the first stage, directly\noptimizing the correlations between the two views at the feature level. Our\napproach simplifies the complexity of combining perception and generation into\ndistinct steps, equipping the model to handle intricate and challenging scenes\neffectively. Besides, we propose to transform the BEV segmentation map from the\nCartesian to the polar coordinate system to establish the column-wise\ncorrespondence between RGB images and BEV maps. Moreover, our method requires\nneither multi-scale features nor camera intrinsic parameters for depth\nestimation and saves computational overhead. Extensive experiments on nuScenes\nand Argoverse show the effectiveness and efficiency of our method. Code is\navailable at https://github.com/happytianhao/TaDe.\n","authors":["Tianhao Zhao","Yongcan Chen","Yu Wu","Tianyang Liu","Bo Du","Peilun Xiao","Shi Qiu","Hongda Yang","Guozhen Li","Yi Yang","Yutian Lin"],"pdf_url":"https://arxiv.org/pdf/2404.01925v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01924v1","updated":"2024-04-02T13:19:06Z","published":"2024-04-02T13:19:06Z","title":"Toward Efficient Visual Gyroscopes: Spherical Moments, Harmonics\n Filtering, and Masking Techniques for Spherical Camera Applications","summary":" Unlike a traditional gyroscope, a visual gyroscope estimates camera rotation\nthrough images. The integration of omnidirectional cameras, offering a larger\nfield of view compared to traditional RGB cameras, has proven to yield more\naccurate and robust results. However, challenges arise in situations that lack\nfeatures, have substantial noise causing significant errors, and where certain\nfeatures in the images lack sufficient strength, leading to less precise\nprediction results.\n Here, we address these challenges by introducing a novel visual gyroscope,\nwhich combines an analytical method with a neural network approach to provide a\nmore efficient and accurate rotation estimation from spherical images. The\npresented method relies on three key contributions: an adapted analytical\napproach to compute the spherical moments coefficients, introduction of masks\nfor better global feature representation, and the use of a multilayer\nperceptron to adaptively choose the best combination of masks and filters.\nExperimental results demonstrate superior performance of the proposed approach\nin terms of accuracy. The paper emphasizes the advantages of integrating\nmachine learning to optimize analytical solutions, discusses limitations, and\nsuggests directions for future research.\n","authors":["Yao Du","Carlos M. Mateo","Mirjana Maras","Tsun-Hsuan Wang","Marc Blanchon","Alexander Amini","Daniela Rus","Omar Tahri"],"pdf_url":"https://arxiv.org/pdf/2404.01924v1.pdf","comment":"Submitted to 2024 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2403.15769v2","updated":"2024-04-02T13:16:46Z","published":"2024-03-23T08:54:03Z","title":"FusionINN: Invertible Image Fusion for Brain Tumor Monitoring","summary":" Image fusion typically employs non-invertible neural networks to merge\nmultiple source images into a single fused image. 
However, for clinical\nexperts, solely relying on fused images may be insufficient for making\ndiagnostic decisions, as the fusion mechanism blends features from source\nimages, thereby making it difficult to interpret the underlying tumor\npathology. We introduce FusionINN, a novel invertible image fusion framework,\ncapable of efficiently generating fused images and also decomposing them back\nto the source images by solving the inverse of the fusion process. FusionINN\nguarantees lossless one-to-one pixel mapping by integrating a normally\ndistributed latent image alongside the fused image to facilitate the generative\nmodeling of the decomposition process. To the best of our knowledge, we are the\nfirst to investigate the decomposability of fused images, which is particularly\ncrucial for life-sensitive applications such as medical image fusion compared\nto other tasks like multi-focus or multi-exposure image fusion. Our extensive\nexperimentation validates FusionINN over existing discriminative and generative\nfusion methods, both subjectively and objectively. Moreover, compared to a\nrecent denoising diffusion-based fusion model, our approach offers faster and\nqualitatively better fusion results. We also exhibit the clinical utility of\nour results in aiding disease prognosis.\n","authors":["Nishant Kumar","Ziyan Tao","Jaikirat Singh","Yang Li","Peiwen Sun","Binghui Zhao","Stefan Gumhold"],"pdf_url":"https://arxiv.org/pdf/2403.15769v2.pdf","comment":"Source code available at https://github.com/nish03/FusionINN"},{"id":"http://arxiv.org/abs/2404.00722v2","updated":"2024-04-02T13:15:36Z","published":"2024-03-31T15:34:45Z","title":"DRCT: Saving Image Super-resolution away from Information Bottleneck","summary":" In recent years, Vision Transformer-based applications to low-level vision\ntasks have achieved widespread success. Unlike CNN-based models, Transformers\nare more adept at capturing long-range dependencies, enabling the\nreconstruction of images utilizing information from non-local areas. In the\ndomain of super-resolution, Swin-transformer-based approaches have become\nmainstream due to their capacity to capture global spatial information and\ntheir shifting-window attention mechanism that facilitates the interchange of\ninformation between different windows. Many researchers have enhanced image\nquality and network efficiency by expanding the receptive field or designing\ncomplex networks, yielding commendable results. However, we observed that\nspatial information tends to diminish during the forward propagation process\ndue to increased depth, leading to a loss of spatial information and,\nconsequently, limiting the model's potential. To address this, we propose the\nDense-residual-connected Transformer (DRCT), aimed at mitigating the loss of\nspatial information through dense-residual connections between layers, thereby\nunleashing the model's potential and enhancing performance. 
Experimental results\nindicate that our approach is not only straightforward but also achieves\nremarkable efficiency, surpassing state-of-the-art methods and performing\ncommendably at NTIRE2024.\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Yi-Shiuan Chou"],"pdf_url":"https://arxiv.org/pdf/2404.00722v2.pdf","comment":"Submitted to NTIRE 2024"},{"id":"http://arxiv.org/abs/2404.01911v1","updated":"2024-04-02T12:57:22Z","published":"2024-04-02T12:57:22Z","title":"VLRM: Vision-Language Models act as Reward Models for Image Captioning","summary":" In this work, we present an unsupervised method for enhancing an image\ncaptioning model (in our case, BLIP2) using reinforcement learning and\nvision-language models like CLIP and BLIP2-ITM as reward models. The RL-tuned\nmodel is able to generate longer and more comprehensive descriptions. Our model\nreaches an impressive 0.90 R@1 CLIP Recall score on the MS-COCO Karpathy Test Split.\n Weights are available at\nhttps://huggingface.co/sashakunitsyn/vlrm-blip2-opt-2.7b.\n","authors":["Maksim Dzabraev","Alexander Kunitsyn","Andrei Ivaniuta"],"pdf_url":"https://arxiv.org/pdf/2404.01911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01258v2","updated":"2024-04-02T12:47:49Z","published":"2024-04-01T17:28:16Z","title":"Direct Preference Optimization of Video Large Multimodal Models from\n Language Model Reward","summary":" Preference modeling techniques, such as direct preference optimization (DPO),\nhave proven effective in enhancing the generalization abilities of large language\nmodels (LLMs). However, in tasks involving video instruction-following, providing\ninformative feedback, especially for detecting hallucinations in generated\nresponses, remains a significant challenge. Previous studies have explored\nusing large multimodal models (LMMs) as reward models to guide preference\nmodeling, but their ability to accurately assess the factuality of generated\nresponses compared to corresponding videos has not been conclusively\nestablished. This paper introduces a novel framework that utilizes detailed\nvideo captions as a proxy of video content, enabling language models to\nincorporate this information as supporting evidence for scoring video Question\nAnswering (QA) predictions. Our approach demonstrates robust alignment with\nOpenAI GPT-4V model's reward mechanism, which directly takes video frames as\ninput. Furthermore, we show that applying this tailored reward through DPO\nsignificantly improves the performance of video LMMs on video QA tasks.\n","authors":["Ruohong Zhang","Liangke Gui","Zhiqing Sun","Yihao Feng","Keyang Xu","Yuanhan Zhang","Di Fu","Chunyuan Li","Alexander Hauptmann","Yonatan Bisk","Yiming Yang"],"pdf_url":"https://arxiv.org/pdf/2404.01258v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00129v3","updated":"2024-04-02T12:42:42Z","published":"2024-01-31T19:14:12Z","title":"CMRNext: Camera to LiDAR Matching in the Wild for Localization and\n Extrinsic Calibration","summary":" LiDARs are widely used for mapping and localization in dynamic environments.\nHowever, their high cost limits their widespread adoption. On the other hand,\nmonocular localization in LiDAR maps using inexpensive cameras is a\ncost-effective alternative for large-scale deployment. Nevertheless, most\nexisting approaches struggle to generalize to new sensor setups and\nenvironments, requiring retraining or fine-tuning. 
In this paper, we present\nCMRNext, a novel approach for camera-LIDAR matching that is independent of\nsensor-specific parameters, generalizable, and can be used in the wild for\nmonocular localization in LiDAR maps and camera-LiDAR extrinsic calibration.\nCMRNext exploits recent advances in deep neural networks for matching\ncross-modal data and standard geometric techniques for robust pose estimation.\nWe reformulate the point-pixel matching problem as an optical flow estimation\nproblem and solve the Perspective-n-Point problem based on the resulting\ncorrespondences to find the relative pose between the camera and the LiDAR\npoint cloud. We extensively evaluate CMRNext on six different robotic\nplatforms, including three publicly available datasets and three in-house\nrobots. Our experimental evaluations demonstrate that CMRNext outperforms\nexisting approaches on both tasks and effectively generalizes to previously\nunseen environments and sensor setups in a zero-shot manner. We make the code\nand pre-trained models publicly available at http://cmrnext.cs.uni-freiburg.de .\n","authors":["Daniele Cattaneo","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2402.00129v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00299v2","updated":"2024-04-02T12:34:09Z","published":"2024-03-30T09:24:25Z","title":"HOI-M3:Capture Multiple Humans and Objects Interaction within Contextual\n Environment","summary":" Humans naturally interact with both others and the surrounding multiple\nobjects, engaging in various social activities. However, recent advances in\nmodeling human-object interactions mostly focus on perceiving isolated\nindividuals and objects, due to fundamental data scarcity. In this paper, we\nintroduce HOI-M3, a novel large-scale dataset for modeling the interactions of\nMultiple huMans and Multiple objects. Notably, it provides accurate 3D tracking\nfor both humans and objects from dense RGB and object-mounted IMU inputs,\ncovering 199 sequences and 181M frames of diverse humans and objects under rich\nactivities. With the unique HOI-M3 dataset, we introduce two novel data-driven\ntasks with companion strong baselines: monocular capture and unstructured\ngeneration of multiple human-object interactions. Extensive experiments\ndemonstrate that our dataset is challenging and worthy of further research\nabout multiple human-object interactions and behavior analysis. Our HOI-M3\ndataset, corresponding codes, and pre-trained models will be disseminated to\nthe community for future research.\n","authors":["Juze Zhang","Jingyan Zhang","Zining Song","Zhanhe Shi","Chengfeng Zhao","Ye Shi","Jingyi Yu","Lan Xu","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00299v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01892v1","updated":"2024-04-02T12:29:31Z","published":"2024-04-02T12:29:31Z","title":"Minimize Quantization Output Error with Bias Compensation","summary":" Quantization is a promising method that reduces memory usage and\ncomputational intensity of Deep Neural Networks (DNNs), but it often leads to\nsignificant output error that hinder model deployment. In this paper, we\npropose Bias Compensation (BC) to minimize the output error, thus realizing\nultra-low-precision quantization without model fine-tuning. Instead of\noptimizing the non-convex quantization process as in most previous methods, the\nproposed BC bypasses the step to directly minimize the quantizing output error\nby identifying a bias vector for compensation. 
We have established that the\nminimization of output error through BC is a convex problem and provides an\nefficient strategy to procure optimal solutions associated with minimal output\nerror,without the need for training or fine-tuning. We conduct extensive\nexperiments on Vision Transformer models and Large Language Models, and the\nresults show that our method notably reduces quantization output error, thereby\npermitting ultra-low-precision post-training quantization and enhancing the\ntask performance of models. Especially, BC improves the accuracy of ViT-B with\n4-bit PTQ4ViT by 36.89% on the ImageNet-1k task, and decreases the perplexity\nof OPT-350M with 3-bit GPTQ by 5.97 on WikiText2.The code is in\nhttps://github.com/GongCheng1919/bias-compensation.\n","authors":["Cheng Gong","Haoshuai Zheng","Mengting Hu","Zheng Lin","Deng-Ping Fan","Yuzhi Zhang","Tao Li"],"pdf_url":"https://arxiv.org/pdf/2404.01892v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.01891v1","updated":"2024-04-02T12:29:04Z","published":"2024-04-02T12:29:04Z","title":"ASTRA: An Action Spotting TRAnsformer for Soccer Videos","summary":" In this paper, we introduce ASTRA, a Transformer-based model designed for the\ntask of Action Spotting in soccer matches. ASTRA addresses several challenges\ninherent in the task and dataset, including the requirement for precise action\nlocalization, the presence of a long-tail data distribution, non-visibility in\ncertain actions, and inherent label noise. To do so, ASTRA incorporates (a) a\nTransformer encoder-decoder architecture to achieve the desired output temporal\nresolution and to produce precise predictions, (b) a balanced mixup strategy to\nhandle the long-tail distribution of the data, (c) an uncertainty-aware\ndisplacement head to capture the label variability, and (d) input audio signal\nto enhance detection of non-visible actions. Results demonstrate the\neffectiveness of ASTRA, achieving a tight Average-mAP of 66.82 on the test set.\nMoreover, in the SoccerNet 2023 Action Spotting challenge, we secure the 3rd\nposition with an Average-mAP of 70.21 on the challenge set.\n","authors":["Artur Xarles","Sergio Escalera","Thomas B. Moeslund","Albert Clapés"],"pdf_url":"https://arxiv.org/pdf/2404.01891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01889v1","updated":"2024-04-02T12:28:40Z","published":"2024-04-02T12:28:40Z","title":"RAVE: Residual Vector Embedding for CLIP-Guided Backlit Image\n Enhancement","summary":" In this paper we propose a novel modification of Contrastive Language-Image\nPre-Training (CLIP) guidance for the task of unsupervised backlit image\nenhancement. Our work builds on the state-of-the-art CLIP-LIT approach, which\nlearns a prompt pair by constraining the text-image similarity between a prompt\n(negative/positive sample) and a corresponding image (backlit image/well-lit\nimage) in the CLIP embedding space. Learned prompts then guide an image\nenhancement network. Based on the CLIP-LIT framework, we propose two novel\nmethods for CLIP guidance. First, we show that instead of tuning prompts in the\nspace of text embeddings, it is possible to directly tune their embeddings in\nthe latent space without any loss in quality. This accelerates training and\npotentially enables the use of additional encoders that do not have a text\nencoder. Second, we propose a novel approach that does not require any prompt\ntuning. 
Instead, based on CLIP embeddings of backlit and well-lit images from\ntraining data, we compute the residual vector in the embedding space as a\nsimple difference between the mean embeddings of the well-lit and backlit\nimages. This vector then guides the enhancement network during training,\npushing a backlit image towards the space of well-lit images. This approach\nfurther dramatically reduces training time, stabilizes training and produces\nhigh quality enhanced images without artifacts, both in supervised and\nunsupervised training regimes. Additionally, we show that residual vectors can\nbe interpreted, revealing biases in training data, and thereby enabling\npotential bias correction.\n","authors":["Tatiana Gaintseva","Marting Benning","Gregory Slabaugh"],"pdf_url":"https://arxiv.org/pdf/2404.01889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01887v1","updated":"2024-04-02T12:26:17Z","published":"2024-04-02T12:26:17Z","title":"3D Scene Generation from Scene Graphs and Self-Attention","summary":" Synthesizing realistic and diverse indoor 3D scene layouts in a controllable\nfashion opens up applications in simulated navigation and virtual reality. As\nconcise and robust representations of a scene, scene graphs have proven to be\nwell-suited as the semantic control on the generated layout. We present a\nvariant of the conditional variational autoencoder (cVAE) model to synthesize\n3D scenes from scene graphs and floor plans. We exploit the properties of\nself-attention layers to capture high-level relationships between objects in a\nscene, and use these as the building blocks of our model. Our model, leverages\ngraph transformers to estimate the size, dimension and orientation of the\nobjects in a room while satisfying relationships in the given scene graph. Our\nexperiments shows self-attention layers leads to sparser (HOW MUCH) and more\ndiverse scenes (HOW MUCH)\\. Included in this work, we publish the first\nlarge-scale dataset for conditioned scene generation from scene graphs,\ncontaining over XXX rooms (of floor plans and scene graphs).\n","authors":["Pietro Bonazzi","Mengqi Wang","Diego Martin Arroyo","Fabian Manhardt","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2404.01887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01112v2","updated":"2024-04-02T12:21:38Z","published":"2024-04-01T13:38:16Z","title":"Few-shot point cloud reconstruction and denoising via learned Guassian\n splats renderings and fine-tuned diffusion features","summary":" Existing deep learning methods for the reconstruction and denoising of point\nclouds rely on small datasets of 3D shapes. We circumvent the problem by\nleveraging deep learning methods trained on billions of images. We propose a\nmethod to reconstruct point clouds from few images and to denoise point clouds\nfrom their rendering by exploiting prior knowledge distilled from image-based\ndeep learning models. To improve reconstruction in constraint settings, we\nregularize the training of a differentiable renderer with hybrid surface and\nappearance by introducing semantic consistency supervision. In addition, we\npropose a pipeline to finetune Stable Diffusion to denoise renderings of noisy\npoint clouds and we demonstrate how these learned filters can be used to remove\npoint cloud noise coming without 3D supervision. 
We compare our method with DSS\nand PointRadiance and achieve higher quality 3D reconstruction on the\nSketchfab Testset and SCUT Dataset.\n","authors":["Pietro Bonazzi"],"pdf_url":"https://arxiv.org/pdf/2404.01112v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01882v1","updated":"2024-04-02T12:15:25Z","published":"2024-04-02T12:15:25Z","title":"Scene Adaptive Sparse Transformer for Event-based Object Detection","summary":" While recent Transformer-based approaches have shown impressive performances\non event-based object detection tasks, their high computational costs still\ndiminish the low power consumption advantage of event cameras. Image-based\nworks attempt to reduce these costs by introducing sparse Transformers.\nHowever, they display inadequate sparsity and adaptability when applied to\nevent-based object detection, since these approaches cannot balance the fine\ngranularity of token-level sparsification and the efficiency of window-based\nTransformers, leading to reduced performance and efficiency. Furthermore, they\nlack scene-specific sparsity optimization, resulting in information loss and a\nlower recall rate. To overcome these limitations, we propose the Scene Adaptive\nSparse Transformer (SAST). SAST enables window-token co-sparsification,\nsignificantly enhancing fault tolerance and reducing computational overhead.\nLeveraging the innovative scoring and selection modules, along with the Masked\nSparse Window Self-Attention, SAST showcases remarkable scene-aware\nadaptability: It focuses only on important objects and dynamically optimizes\nsparsity level according to scene complexity, maintaining a remarkable balance\nbetween performance and computational cost. The evaluation results show that\nSAST outperforms all other dense and sparse networks in both performance and\nefficiency on two large-scale event-based object detection datasets (1Mpx and\nGen1). Code: https://github.com/Peterande/SAST\n","authors":["Yansong Peng","Hebei Li","Yueyi Zhang","Xiaoyan Sun","Feng Wu"],"pdf_url":"https://arxiv.org/pdf/2404.01882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01878v1","updated":"2024-04-02T12:08:26Z","published":"2024-04-02T12:08:26Z","title":"Real, fake and synthetic faces - does the coin have three sides?","summary":" With the ever-growing power of generative artificial intelligence, deepfake\nand artificially generated (synthetic) media have continued to spread online,\nwhich creates various ethical and moral concerns regarding their usage. To\ntackle this, we present a novel exploration of the trends and patterns\nobserved in real, deepfake and synthetic facial images. The proposed analysis\nis done in two parts: firstly, we incorporate eight deep learning models and\nanalyze their performances in distinguishing between the three classes of\nimages. Next, we look to further delve into the similarities and differences\nbetween these three sets of images by investigating their image properties both\nin the context of the entire image as well as in the context of specific\nregions within the image. An ANOVA test was also performed and provided further\nclarity on the patterns associated with the images of the three\nclasses. From our findings, we observe that the investigated deep learning\nmodels found it easier to detect synthetic facial images, with the ViT Patch-16\nmodel performing best on this task with a class-averaged sensitivity,\nspecificity, precision, and accuracy of 97.37%, 98.69%, 97.48%, and 98.25%,\nrespectively. 
This observation was supported by further analysis of various\nimage properties. We saw noticeable differences across the three category of\nimages. This analysis can help us build better algorithms for facial image\ngeneration, and also shows that synthetic, deepfake and real face images are\nindeed three different classes.\n","authors":["Shahzeb Naeem","Ramzi Al-Sharawi","Muhammad Riyyan Khan","Usman Tariq","Abhinav Dhall","Hasan Al-Nashash"],"pdf_url":"https://arxiv.org/pdf/2404.01878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13659v3","updated":"2024-04-02T11:42:50Z","published":"2024-03-20T15:08:43Z","title":"Recursive Joint Cross-Modal Attention for Multimodal Fusion in\n Dimensional Emotion Recognition","summary":" Though multimodal emotion recognition has achieved significant progress over\nrecent years, the potential of rich synergic relationships across the\nmodalities is not fully exploited. In this paper, we introduce Recursive Joint\nCross-Modal Attention (RJCMA) to effectively capture both intra-and inter-modal\nrelationships across audio, visual and text modalities for dimensional emotion\nrecognition. In particular, we compute the attention weights based on\ncross-correlation between the joint audio-visual-text feature representations\nand the feature representations of individual modalities to simultaneously\ncapture intra- and inter-modal relationships across the modalities. The\nattended features of the individual modalities are again fed as input to the\nfusion model in a recursive mechanism to obtain more refined feature\nrepresentations. We have also explored Temporal Convolutional Networks (TCNs)\nto improve the temporal modeling of the feature representations of individual\nmodalities. Extensive experiments are conducted to evaluate the performance of\nthe proposed fusion model on the challenging Affwild2 dataset. By effectively\ncapturing the synergic intra- and inter-modal relationships across audio,\nvisual and text modalities, the proposed fusion model achieves a Concordance\nCorrelation Coefficient (CCC) of 0.585 (0.542) and 0.659 (0.619) for valence\nand arousal respectively on the validation set (test set). This shows a\nsignificant improvement over the baseline of 0.24 (0.211) and 0.20 (0.191) for\nvalence and arousal respectively on the validation set (test set) of the\nvalence-arousal challenge of 6th Affective Behavior Analysis in-the-Wild (ABAW)\ncompetition.\n","authors":["R. Gnana Praveen","Jahangir Alam"],"pdf_url":"https://arxiv.org/pdf/2403.13659v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01862v1","updated":"2024-04-02T11:40:34Z","published":"2024-04-02T11:40:34Z","title":"Co-Speech Gesture Video Generation via Motion-Decoupled Diffusion Model","summary":" Co-speech gestures, if presented in the lively form of videos, can achieve\nsuperior visual effects in human-machine interaction. While previous works\nmostly generate structural human skeletons, resulting in the omission of\nappearance information, we focus on the direct generation of audio-driven\nco-speech gesture videos in this work. There are two main challenges: 1) A\nsuitable motion feature is needed to describe complex human movements with\ncrucial appearance information. 2) Gestures and speech exhibit inherent\ndependencies and should be temporally aligned even of arbitrary length. To\nsolve these problems, we present a novel motion-decoupled framework to generate\nco-speech gesture videos. 
Specifically, we first introduce a well-designed\nnonlinear TPS transformation to obtain latent motion features preserving\nessential appearance information. Then a transformer-based diffusion model is\nproposed to learn the temporal correlation between gestures and speech, and\nperforms generation in the latent motion space, followed by an optimal motion\nselection module to produce long-term coherent and consistent gesture videos.\nFor better visual perception, we further design a refinement network focusing\non missing details of certain areas. Extensive experimental results show that\nour proposed framework significantly outperforms existing approaches in both\nmotion and video-related evaluations. Our code, demos, and more resources are\navailable at https://github.com/thuhcsi/S2G-MDDiffusion.\n","authors":["Xu He","Qiaochu Huang","Zhensong Zhang","Zhiwei Lin","Zhiyong Wu","Sicheng Yang","Minglei Li","Zhiyi Chen","Songcen Xu","Xiaofei Wu"],"pdf_url":"https://arxiv.org/pdf/2404.01862v1.pdf","comment":"22 pages, 8 figures, CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00636v2","updated":"2024-04-02T11:31:50Z","published":"2024-03-31T10:13:55Z","title":"Learning to Generate Conditional Tri-plane for 3D-aware Expression\n Controllable Portrait Animation","summary":" In this paper, we present Export3D, a one-shot 3D-aware portrait animation\nmethod that is able to control the facial expression and camera view of a given\nportrait image. To achieve this, we introduce a tri-plane generator that\ndirectly generates a tri-plane of 3D prior by transferring the expression\nparameter of 3DMM into the source image. The tri-plane is then decoded into the\nimage of different view through a differentiable volume rendering. Existing\nportrait animation methods heavily rely on image warping to transfer the\nexpression in the motion space, challenging on disentanglement of appearance\nand expression. In contrast, we propose a contrastive pre-training framework\nfor appearance-free expression parameter, eliminating undesirable appearance\nswap when transferring a cross-identity expression. Extensive experiments show\nthat our pre-training framework can learn the appearance-free expression\nrepresentation hidden in 3DMM, and our model can generate 3D-aware expression\ncontrollable portrait image without appearance swap in the cross-identity\nmanner.\n","authors":["Taekyung Ki","Dongchan Min","Gyeongsu Chae"],"pdf_url":"https://arxiv.org/pdf/2404.00636v2.pdf","comment":"Project page: https://export3d.github.io"},{"id":"http://arxiv.org/abs/2404.01853v1","updated":"2024-04-02T11:30:22Z","published":"2024-04-02T11:30:22Z","title":"Pairwise Similarity Distribution Clustering for Noisy Label Learning","summary":" Noisy label learning aims to train deep neural networks using a large amount\nof samples with noisy labels, whose main challenge comes from how to deal with\nthe inaccurate supervision caused by wrong labels. Existing works either take\nthe label correction or sample selection paradigm to involve more samples with\naccurate labels into the training process. In this paper, we propose a simple\nyet effective sample selection algorithm, termed as Pairwise Similarity\nDistribution Clustering~(PSDC), to divide the training samples into one clean\nset and another noisy set, which can power any of the off-the-shelf\nsemi-supervised learning regimes to further train networks for different\ndownstream tasks. 
Specifically, we take the pairwise similarity between sample\npairs to represent the sample structure, and the Gaussian Mixture Model~(GMM)\nto model the similarity distribution between sample pairs belonging to the same\nnoisy cluster, therefore each sample can be confidently divided into the clean\nset or noisy set. Even under severe label noise rate, the resulting data\npartition mechanism has been proved to be more robust in judging the label\nconfidence in both theory and practice. Experimental results on various\nbenchmark datasets, such as CIFAR-10, CIFAR-100 and Clothing1M, demonstrate\nsignificant improvements over state-of-the-art methods.\n","authors":["Sihan Bai"],"pdf_url":"https://arxiv.org/pdf/2404.01853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18403v2","updated":"2024-04-02T11:17:49Z","published":"2023-11-30T09:55:46Z","title":"Corrupting Convolution-based Unlearnable Datasets with Pixel-based Image\n Transformations","summary":" Unlearnable datasets lead to a drastic drop in the generalization performance\nof models trained on them by introducing elaborate and imperceptible\nperturbations into clean training sets. Many existing defenses, e.g., JPEG\ncompression and adversarial training, effectively counter UDs based on\nnorm-constrained additive noise. However, a fire-new type of convolution-based\nUDs have been proposed and render existing defenses all ineffective, presenting\na greater challenge to defenders. To address this, we express the\nconvolution-based unlearnable sample as the result of multiplying a matrix by a\nclean sample in a simplified scenario, and formalize the intra-class matrix\ninconsistency as $\\Theta_{imi}$, inter-class matrix consistency as\n$\\Theta_{imc}$ to investigate the working mechanism of the convolution-based\nUDs. We conjecture that increasing both of these metrics will mitigate the\nunlearnability effect. Through validation experiments that commendably support\nour hypothesis, we further design a random matrix to boost both $\\Theta_{imi}$\nand $\\Theta_{imc}$, achieving a notable degree of defense effect. Hence, by\nbuilding upon and extending these facts, we first propose a brand-new image\nCOrruption that employs randomly multiplicative transformation via\nINterpolation operation to successfully defend against convolution-based UDs.\nOur approach leverages global pixel random interpolations, effectively\nsuppressing the impact of multiplicative noise in convolution-based UDs.\nAdditionally, we have also designed two new forms of convolution-based UDs, and\nfind that our defense is the most effective against them.\n","authors":["Xianlong Wang","Shengshan Hu","Minghui Li","Zhifei Yu","Ziqi Zhou","Leo Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.18403v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08707v4","updated":"2024-04-02T11:08:12Z","published":"2023-06-14T19:15:49Z","title":"VidEdit: Zero-Shot and Spatially Aware Text-Driven Video Editing","summary":" Recently, diffusion-based generative models have achieved remarkable success\nfor image generation and edition. However, existing diffusion-based video\nediting approaches lack the ability to offer precise control over generated\ncontent that maintains temporal consistency in long-term videos. On the other\nhand, atlas-based methods provide strong temporal consistency but are costly to\nedit a video and lack spatial control. 
In this work, we introduce VidEdit, a\nnovel method for zero-shot text-based video editing that guarantees robust\ntemporal and spatial consistency. In particular, we combine an atlas-based\nvideo representation with a pre-trained text-to-image diffusion model to\nprovide a training-free and efficient video editing method, which by design\nfulfills temporal smoothness. To grant precise user control over generated\ncontent, we utilize conditional information extracted from off-the-shelf\npanoptic segmenters and edge detectors which guides the diffusion sampling\nprocess. This method ensures a fine spatial control on targeted regions while\nstrictly preserving the structure of the original video. Our quantitative and\nqualitative experiments show that VidEdit outperforms state-of-the-art methods\non DAVIS dataset, regarding semantic faithfulness, image preservation, and\ntemporal consistency metrics. With this framework, processing a single video\nonly takes approximately one minute, and it can generate multiple compatible\nedits based on a unique text prompt. Project web-page at\nhttps://videdit.github.io\n","authors":["Paul Couairon","Clément Rambour","Jean-Emmanuel Haugeard","Nicolas Thome"],"pdf_url":"https://arxiv.org/pdf/2306.08707v4.pdf","comment":"TMLR 2024. Project web-page at https://videdit.github.io"},{"id":"http://arxiv.org/abs/2404.01843v1","updated":"2024-04-02T11:03:24Z","published":"2024-04-02T11:03:24Z","title":"Sketch3D: Style-Consistent Guidance for Sketch-to-3D Generation","summary":" Recently, image-to-3D approaches have achieved significant results with a\nnatural image as input. However, it is not always possible to access these\nenriched color input samples in practical applications, where only sketches are\navailable. Existing sketch-to-3D researches suffer from limitations in broad\napplications due to the challenges of lacking color information and multi-view\ncontent. To overcome them, this paper proposes a novel generation paradigm\nSketch3D to generate realistic 3D assets with shape aligned with the input\nsketch and color matching the textual description. Concretely, Sketch3D first\ninstantiates the given sketch in the reference image through the\nshape-preserving generation process. Second, the reference image is leveraged\nto deduce a coarse 3D Gaussian prior, and multi-view style-consistent guidance\nimages are generated based on the renderings of the 3D Gaussians. Finally,\nthree strategies are designed to optimize 3D Gaussians, i.e., structural\noptimization via a distribution transfer mechanism, color optimization with a\nstraightforward MSE loss and sketch similarity optimization with a CLIP-based\ngeometric similarity loss. Extensive visual comparisons and quantitative\nanalysis illustrate the advantage of our Sketch3D in generating realistic 3D\nassets while preserving consistency with the input.\n","authors":["Wangguandong Zheng","Haifeng Xia","Rui Chen","Ming Shao","Siyu Xia","Zhengming Ding"],"pdf_url":"https://arxiv.org/pdf/2404.01843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01842v1","updated":"2024-04-02T11:03:13Z","published":"2024-04-02T11:03:13Z","title":"Semi-Supervised Domain Adaptation for Wildfire Detection","summary":" Recently, both the frequency and intensity of wildfires have increased\nworldwide, primarily due to climate change. 
In this paper, we propose a novel\nprotocol for wildfire detection, leveraging semi-supervised Domain Adaptation\nfor object detection, accompanied by a corresponding dataset designed for use\nby both academics and industries. Our dataset encompasses 30 times more diverse\nlabeled scenes for the current largest benchmark wildfire dataset, HPWREN, and\nintroduces a new labeling policy for wildfire detection. Inspired by CoordConv,\nwe propose a robust baseline, Location-Aware Object Detection for\nSemi-Supervised Domain Adaptation (LADA), utilizing a teacher-student based\nframework capable of extracting translational variance features characteristic\nof wildfires. With only using 1% target domain labeled data, our framework\nsignificantly outperforms our source-only baseline by a notable margin of 3.8%\nin mean Average Precision on the HPWREN wildfire dataset. Our dataset is\navailable at https://github.com/BloomBerry/LADA.\n","authors":["JooYoung Jang","Youngseo Cha","Jisu Kim","SooHyung Lee","Geonu Lee","Minkook Cho","Young Hwang","Nojun Kwak"],"pdf_url":"https://arxiv.org/pdf/2404.01842v1.pdf","comment":"16 pages, 5 figures, 22 tables"},{"id":"http://arxiv.org/abs/2301.13418v4","updated":"2024-04-02T11:03:02Z","published":"2023-01-31T05:14:49Z","title":"BRAIxDet: Learning to Detect Malignant Breast Lesion with Incomplete\n Annotations","summary":" Methods to detect malignant lesions from screening mammograms are usually\ntrained with fully annotated datasets, where images are labelled with the\nlocalisation and classification of cancerous lesions. However, real-world\nscreening mammogram datasets commonly have a subset that is fully annotated and\nanother subset that is weakly annotated with just the global classification\n(i.e., without lesion localisation). Given the large size of such datasets,\nresearchers usually face a dilemma with the weakly annotated subset: to not use\nit or to fully annotate it. The first option will reduce detection accuracy\nbecause it does not use the whole dataset, and the second option is too\nexpensive given that the annotation needs to be done by expert radiologists. In\nthis paper, we propose a middle-ground solution for the dilemma, which is to\nformulate the training as a weakly- and semi-supervised learning problem that\nwe refer to as malignant breast lesion detection with incomplete annotations.\nTo address this problem, our new method comprises two stages, namely: 1)\npre-training a multi-view mammogram classifier with weak supervision from the\nwhole dataset, and 2) extending the trained classifier to become a multi-view\ndetector that is trained with semi-supervised student-teacher learning, where\nthe training set contains fully and weakly-annotated mammograms. We provide\nextensive detection results on two real-world screening mammogram datasets\ncontaining incomplete annotations, and show that our proposed approach achieves\nstate-of-the-art results in the detection of malignant breast lesions with\nincomplete annotations.\n","authors":["Yuanhong Chen","Yuyuan Liu","Chong Wang","Michael Elliott","Chun Fung Kwok","Carlos Pena-Solorzano","Yu Tian","Fengbei Liu","Helen Frazer","Davis J. 
McCarthy","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2301.13418v4.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2404.00675v2","updated":"2024-04-02T10:59:05Z","published":"2024-03-31T12:48:07Z","title":"LLM meets Vision-Language Models for Zero-Shot One-Class Classification","summary":" We consider the problem of zero-shot one-class visual classification. In this\nsetting, only the label of the target class is available, and the goal is to\ndiscriminate between positive and negative query samples without requiring any\nvalidation example from the target task. We propose a two-step solution that\nfirst queries large language models for visually confusing objects and then\nrelies on vision-language pre-trained models (e.g., CLIP) to perform\nclassification. By adapting large-scale vision benchmarks, we demonstrate the\nability of the proposed method to outperform adapted off-the-shelf alternatives\nin this setting. Namely, we propose a realistic benchmark where negative query\nsamples are drawn from the same original dataset as positive ones, including a\ngranularity-controlled version of iNaturalist, where negative samples are at a\nfixed distance in the taxonomy tree from the positive ones. Our work shows that\nit is possible to discriminate between a single category and other semantically\nrelated ones using only its label\n","authors":["Yassir Bendou","Giulia Lioi","Bastien Pasdeloup","Lukas Mauch","Ghouthi Boukli Hacene","Fabien Cardinaux","Vincent Gripon"],"pdf_url":"https://arxiv.org/pdf/2404.00675v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13720v8","updated":"2024-04-02T10:57:06Z","published":"2023-06-23T18:08:00Z","title":"Decoupled Diffusion Models: Simultaneous Image to Zero and Zero to Noise","summary":" We propose decoupled diffusion models (DDMs) for high-quality (un)conditioned\nimage generation in less than 10 function evaluations. In a nutshell, DDMs\ndecouple the forward image-to-noise mapping into \\textit{image-to-zero} mapping\nand \\textit{zero-to-noise} mapping. Under this framework, we mathematically\nderive 1) the training objectives and 2) for the reverse time the sampling\nformula based on an analytic transition probability which models image to zero\ntransition. The former enables DDMs to learn noise and image components\nsimultaneously which simplifies learning. Importantly, because of the latter's\nanalyticity in the \\textit{zero-to-image} sampling function, DDMs can avoid the\nordinary differential equation-based accelerators and instead naturally perform\nsampling with an arbitrary step size. Under the few function evaluation setups,\nDDMs experimentally yield very competitive performance compared with the state\nof the art in 1) unconditioned image generation, \\textit{e.g.}, CIFAR-10 and\nCelebA-HQ-256 and 2) image-conditioned downstream tasks such as\nsuper-resolution, saliency detection, edge detection, and image inpainting.\n","authors":["Yuhang Huang","Zheng Qin","Xinwang Liu","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2306.13720v8.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07322v2","updated":"2024-04-02T10:35:32Z","published":"2023-12-12T14:37:36Z","title":"GenHowTo: Learning to Generate Actions and State Transformations from\n Instructional Videos","summary":" We address the task of generating temporally consistent and physically\nplausible images of actions and object state transformations. 
Given an input\nimage and a text prompt describing the targeted transformation, our generated\nimages preserve the environment and transform objects in the initial image. Our\ncontributions are threefold. First, we leverage a large body of instructional\nvideos and automatically mine a dataset of triplets of consecutive frames\ncorresponding to initial object states, actions, and resulting object\ntransformations. Second, equipped with this data, we develop and train a\nconditioned diffusion model dubbed GenHowTo. Third, we evaluate GenHowTo on a\nvariety of objects and actions and show superior performance compared to\nexisting methods. In particular, we introduce a quantitative evaluation where\nGenHowTo achieves 88% and 74% on seen and unseen interaction categories,\nrespectively, outperforming prior work by a large margin.\n","authors":["Tomáš Souček","Dima Damen","Michael Wray","Ivan Laptev","Josef Sivic"],"pdf_url":"https://arxiv.org/pdf/2312.07322v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2402.10636v2","updated":"2024-04-02T10:27:45Z","published":"2024-02-16T12:35:35Z","title":"PEGASUS: Personalized Generative 3D Avatars with Composable Attributes","summary":" We present PEGASUS, a method for constructing a personalized generative 3D\nface avatar from monocular video sources. Our generative 3D avatar enables\ndisentangled controls to selectively alter the facial attributes (e.g., hair or\nnose) while preserving the identity. Our approach consists of two stages:\nsynthetic database generation and constructing a personalized generative\navatar. We generate a synthetic video collection of the target identity with\nvarying facial attributes, where the videos are synthesized by borrowing the\nattributes from monocular videos of diverse identities. Then, we build a\nperson-specific generative 3D avatar that can modify its attributes\ncontinuously while preserving its identity. Through extensive experiments, we\ndemonstrate that our method of generating a synthetic database and creating a\n3D generative avatar is the most effective in preserving identity while\nachieving high realism. Subsequently, we introduce a zero-shot approach to\nachieve the same goal of generative modeling more efficiently by leveraging a\npreviously constructed personalized generative model.\n","authors":["Hyunsoo Cha","Byungjun Kim","Hanbyul Joo"],"pdf_url":"https://arxiv.org/pdf/2402.10636v2.pdf","comment":"Accepted at CVPR 2024, Project Page:\n https://snuvclab.github.io/pegasus/"},{"id":"http://arxiv.org/abs/2403.16368v2","updated":"2024-04-02T10:25:07Z","published":"2024-03-25T02:17:20Z","title":"Distilling Semantic Priors from SAM to Efficient Image Restoration\n Models","summary":" In image restoration (IR), leveraging semantic priors from segmentation\nmodels has been a common approach to improve performance. The recent segment\nanything model (SAM) has emerged as a powerful tool for extracting advanced\nsemantic priors to enhance IR tasks. However, the computational cost of SAM is\nprohibitive for IR, compared to existing smaller IR models. The incorporation\nof SAM for extracting semantic priors considerably hampers the model inference\nefficiency. To address this issue, we propose a general framework to distill\nSAM's semantic knowledge to boost existing IR models without interfering with\ntheir inference process. Specifically, our proposed framework consists of the\nsemantic priors fusion (SPF) scheme and the semantic priors distillation (SPD)\nscheme. 
SPF fuses two kinds of information between the restored image predicted\nby the original IR model and the semantic mask predicted by SAM for the refined\nrestored image. SPD leverages a self-distillation manner to distill the fused\nsemantic priors to boost the performance of original IR models. Additionally,\nwe design a semantic-guided relation (SGR) module for SPD, which ensures\nsemantic feature representation space consistency to fully distill the priors.\nWe demonstrate the effectiveness of our framework across multiple IR models and\ntasks, including deraining, deblurring, and denoising.\n","authors":["Quan Zhang","Xiaoyu Liu","Wei Li","Hanting Chen","Junchao Liu","Jie Hu","Zhiwei Xiong","Chun Yuan","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16368v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01819v1","updated":"2024-04-02T10:22:23Z","published":"2024-04-02T10:22:23Z","title":"Sparse Semi-DETR: Sparse Learnable Queries for Semi-Supervised Object\n Detection","summary":" In this paper, we address the limitations of the DETR-based semi-supervised\nobject detection (SSOD) framework, particularly focusing on the challenges\nposed by the quality of object queries. In DETR-based SSOD, the one-to-one\nassignment strategy provides inaccurate pseudo-labels, while the one-to-many\nassignments strategy leads to overlapping predictions. These issues compromise\ntraining efficiency and degrade model performance, especially in detecting\nsmall or occluded objects. We introduce Sparse Semi-DETR, a novel\ntransformer-based, end-to-end semi-supervised object detection solution to\novercome these challenges. Sparse Semi-DETR incorporates a Query Refinement\nModule to enhance the quality of object queries, significantly improving\ndetection capabilities for small and partially obscured objects. Additionally,\nwe integrate a Reliable Pseudo-Label Filtering Module that selectively filters\nhigh-quality pseudo-labels, thereby enhancing detection accuracy and\nconsistency. On the MS-COCO and Pascal VOC object detection benchmarks, Sparse\nSemi-DETR achieves a significant improvement over current state-of-the-art\nmethods that highlight Sparse Semi-DETR's effectiveness in semi-supervised\nobject detection, particularly in challenging scenarios involving small or\npartially obscured objects.\n","authors":["Tahira Shehzadi","Khurram Azeem Hashmi","Didier Stricker","Muhammad Zeshan Afzal"],"pdf_url":"https://arxiv.org/pdf/2404.01819v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.00918v2","updated":"2024-04-02T10:20:28Z","published":"2024-04-01T04:49:47Z","title":"Rethinking Saliency-Guided Weakly-Supervised Semantic Segmentation","summary":" This paper presents a fresh perspective on the role of saliency maps in\nweakly-supervised semantic segmentation (WSSS) and offers new insights and\nresearch directions based on our empirical findings. We conduct comprehensive\nexperiments and observe that the quality of the saliency map is a critical\nfactor in saliency-guided WSSS approaches. Nonetheless, we find that the\nsaliency maps used in previous works are often arbitrarily chosen, despite\ntheir significant impact on WSSS. Additionally, we observe that the choice of\nthe threshold, which has received less attention before, is non-trivial in\nWSSS. To facilitate more meaningful and rigorous research for saliency-guided\nWSSS, we introduce \\texttt{WSSS-BED}, a standardized framework for conducting\nresearch under unified conditions. 
\\texttt{WSSS-BED} provides various saliency\nmaps and activation maps for seven WSSS methods, as well as saliency maps from\nunsupervised salient object detection models.\n","authors":["Beomyoung Kim","Donghyun Kim","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2404.00918v2.pdf","comment":"Preprint, 17 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.01816v1","updated":"2024-04-02T10:19:17Z","published":"2024-04-02T10:19:17Z","title":"Rethinking Annotator Simulation: Realistic Evaluation of Whole-Body PET\n Lesion Interactive Segmentation Methods","summary":" Interactive segmentation plays a crucial role in accelerating the annotation,\nparticularly in domains requiring specialized expertise such as nuclear\nmedicine. For example, annotating lesions in whole-body Positron Emission\nTomography (PET) images can require over an hour per volume. While previous\nworks evaluate interactive segmentation models through either real user studies\nor simulated annotators, both approaches present challenges. Real user studies\nare expensive and often limited in scale, while simulated annotators, also\nknown as robot users, tend to overestimate model performance due to their\nidealized nature. To address these limitations, we introduce four evaluation\nmetrics that quantify the user shift between real and simulated annotators. In\nan initial user study involving four annotators, we assess existing robot users\nusing our proposed metrics and find that robot users significantly deviate in\nperformance and annotation behavior compared to real annotators. Based on these\nfindings, we propose a more realistic robot user that reduces the user shift by\nincorporating human factors such as click variation and inter-annotator\ndisagreement. We validate our robot user in a second user study, involving four\nother annotators, and show it consistently reduces the simulated-to-real user\nshift compared to traditional robot users. By employing our robot user, we can\nconduct more large-scale and cost-efficient evaluations of interactive\nsegmentation models, while preserving the fidelity of real user studies. Our\nimplementation is based on MONAI Label and will be made publicly available.\n","authors":["Zdravko Marinov","Moon Kim","Jens Kleesiek","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2404.01816v1.pdf","comment":"10 pages, 5 figures, 1 table"},{"id":"http://arxiv.org/abs/2403.16080v3","updated":"2024-04-02T10:16:05Z","published":"2024-03-24T10:06:40Z","title":"PKU-DyMVHumans: A Multi-View Video Benchmark for High-Fidelity Dynamic\n Human Modeling","summary":" High-quality human reconstruction and photo-realistic rendering of a dynamic\nscene is a long-standing problem in computer vision and graphics. Despite\nconsiderable efforts invested in developing various capture systems and\nreconstruction algorithms, recent advancements still struggle with loose or\noversized clothing and overly complex poses. In part, this is due to the\nchallenges of acquiring high-quality human datasets. To facilitate the\ndevelopment of these fields, in this paper, we present PKU-DyMVHumans, a\nversatile human-centric dataset for high-fidelity reconstruction and rendering\nof dynamic human scenarios from dense multi-view videos. It comprises 8.2\nmillion frames captured by more than 56 synchronized cameras across diverse\nscenarios. 
These sequences comprise 32 human subjects across 45 different\nscenarios, each with a high-detailed appearance and realistic human motion.\nInspired by recent advancements in neural radiance field (NeRF)-based scene\nrepresentations, we carefully set up an off-the-shelf framework that is easy to\nprovide those state-of-the-art NeRF-based implementations and benchmark on\nPKU-DyMVHumans dataset. It is paving the way for various applications like\nfine-grained foreground/background decomposition, high-quality human\nreconstruction and photo-realistic novel view synthesis of a dynamic scene.\nExtensive studies are performed on the benchmark, demonstrating new\nobservations and challenges that emerge from using such high-fidelity dynamic\ndata.\n","authors":["Xiaoyun Zheng","Liwei Liao","Xufeng Li","Jianbo Jiao","Rongjie Wang","Feng Gao","Shiqi Wang","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16080v3.pdf","comment":"CVPR2024(accepted). Project page: https://pku-dymvhumans.github.io"},{"id":"http://arxiv.org/abs/2404.01810v1","updated":"2024-04-02T10:13:18Z","published":"2024-04-02T10:13:18Z","title":"Surface Reconstruction from Gaussian Splatting via Novel Stereo Views","summary":" The Gaussian splatting for radiance field rendering method has recently\nemerged as an efficient approach for accurate scene representation. It\noptimizes the location, size, color, and shape of a cloud of 3D Gaussian\nelements to visually match, after projection, or splatting, a set of given\nimages taken from various viewing directions. And yet, despite the proximity of\nGaussian elements to the shape boundaries, direct surface reconstruction of\nobjects in the scene is a challenge.\n We propose a novel approach for surface reconstruction from Gaussian\nsplatting models. Rather than relying on the Gaussian elements' locations as a\nprior for surface reconstruction, we leverage the superior novel-view synthesis\ncapabilities of 3DGS. To that end, we use the Gaussian splatting model to\nrender pairs of stereo-calibrated novel views from which we extract depth\nprofiles using a stereo matching method. We then combine the extracted RGB-D\nimages into a geometrically consistent surface. The resulting reconstruction is\nmore accurate and shows finer details when compared to other methods for\nsurface reconstruction from Gaussian splatting models, while requiring\nsignificantly less compute time compared to other surface reconstruction\nmethods.\n We performed extensive testing of the proposed method on in-the-wild scenes,\ntaken by a smartphone, showcasing its superior reconstruction abilities.\nAdditionally, we tested the proposed method on the Tanks and Temples benchmark,\nand it has surpassed the current leading method for surface reconstruction from\nGaussian splatting models. Project page: https://gs2mesh.github.io/.\n","authors":["Yaniv Wolf","Amit Bracha","Ron Kimmel"],"pdf_url":"https://arxiv.org/pdf/2404.01810v1.pdf","comment":"Project Page: https://gs2mesh.github.io/"},{"id":"http://arxiv.org/abs/2402.14000v2","updated":"2024-04-02T10:06:33Z","published":"2024-02-21T18:36:26Z","title":"Real-time 3D-aware Portrait Editing from a Single Image","summary":" This work presents 3DPE, a practical method that can efficiently edit a face\nimage following given prompts, like reference images or text descriptions, in a\n3D-aware manner. 
To this end, a lightweight module is distilled from a 3D\nportrait generator and a text-to-image model, which provide prior knowledge of\nface geometry and superior editing capability, respectively. Such a design\nbrings two compelling advantages over existing approaches. First, our system\nachieves real-time editing with a feedforward network (i.e., ~0.04s per image),\nover 100x faster than the second competitor. Second, thanks to the powerful\npriors, our module could focus on the learning of editing-related variations,\nsuch that it manages to handle various types of editing simultaneously in the\ntraining phase and further supports fast adaptation to user-specified\ncustomized types of editing during inference (e.g., with ~5min fine-tuning per\nstyle). The code, the model, and the interface will be made publicly available\nto facilitate future research.\n","authors":["Qingyan Bai","Zifan Shi","Yinghao Xu","Hao Ouyang","Qiuyu Wang","Ceyuan Yang","Xuan Wang","Gordon Wetzstein","Yujun Shen","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2402.14000v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01801v1","updated":"2024-04-02T10:03:23Z","published":"2024-04-02T10:03:23Z","title":"EventSleep: Sleep Activity Recognition with Event Cameras","summary":" Event cameras are a promising technology for activity recognition in dark\nenvironments due to their unique properties. However, real event camera\ndatasets under low-lighting conditions are still scarce, which also limits the\nnumber of approaches to solve these kind of problems, hindering the potential\nof this technology in many applications. We present EventSleep, a new dataset\nand methodology to address this gap and study the suitability of event cameras\nfor a very relevant medical application: sleep monitoring for sleep disorders\nanalysis. The dataset contains synchronized event and infrared recordings\nemulating common movements that happen during the sleep, resulting in a new\nchallenging and unique dataset for activity recognition in dark environments.\nOur novel pipeline is able to achieve high accuracy under these challenging\nconditions and incorporates a Bayesian approach (Laplace ensembles) to increase\nthe robustness in the predictions, which is fundamental for medical\napplications. Our work is the first application of Bayesian neural networks for\nevent cameras, the first use of Laplace ensembles in a realistic problem, and\nalso demonstrates for the first time the potential of event cameras in a new\napplication domain: to enhance current sleep evaluation procedures. Our\nactivity recognition results highlight the potential of event cameras under\ndark conditions, and its capacity and robustness for sleep activity\nrecognition, and open problems as the adaptation of event data pre-processing\ntechniques to dark environments.\n","authors":["Carlos Plou","Nerea Gallego","Alberto Sabater","Eduardo Montijano","Pablo Urcola","Luis Montesano","Ruben Martinez-Cantin","Ana C. Murillo"],"pdf_url":"https://arxiv.org/pdf/2404.01801v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13756v2","updated":"2024-04-02T10:01:44Z","published":"2024-02-21T12:34:31Z","title":"High-throughput Visual Nano-drone to Nano-drone Relative Localization\n using Onboard Fully Convolutional Networks","summary":" Relative drone-to-drone localization is a fundamental building block for any\nswarm operations. 
We address this task in the context of miniaturized\nnano-drones, i.e., 10cm in diameter, which show an ever-growing interest due to\nnovel use cases enabled by their reduced form factor. The price for their\nversatility comes with limited onboard resources, i.e., sensors, processing\nunits, and memory, which limits the complexity of the onboard algorithms. A\ntraditional solution to overcome these limitations is represented by\nlightweight deep learning models directly deployed aboard nano-drones. This\nwork tackles the challenging relative pose estimation between nano-drones using\nonly a gray-scale low-resolution camera and an ultra-low-power System-on-Chip\n(SoC) hosted onboard. We present a vertically integrated system based on a\nnovel vision-based fully convolutional neural network (FCNN), which runs at\n39Hz within 101mW onboard a Crazyflie nano-drone extended with the GWT GAP8\nSoC. We compare our FCNN against three State-of-the-Art (SoA) systems.\nConsidering the best-performing SoA approach, our model results in an R-squared\nimprovement from 32 to 47% on the horizontal image coordinate and from 18 to\n55% on the vertical image coordinate, on a real-world dataset of 30k images.\nFinally, our in-field tests show a reduction of the average tracking error of\n37% compared to a previous SoA work and an endurance performance up to the\nentire battery lifetime of 4 minutes.\n","authors":["Luca Crupi","Alessandro Giusti","Daniele Palossi"],"pdf_url":"https://arxiv.org/pdf/2402.13756v2.pdf","comment":"ICRA 2024, IEEE Conference"},{"id":"http://arxiv.org/abs/2403.16578v2","updated":"2024-04-02T09:55:02Z","published":"2024-03-25T09:43:56Z","title":"SegICL: A Universal In-context Learning Framework for Enhanced\n Segmentation in Medical Imaging","summary":" Medical image segmentation models adapting to new tasks in a training-free\nmanner through in-context learning is an exciting advancement. Universal\nsegmentation models aim to generalize across the diverse modality of medical\nimages, yet their effectiveness often diminishes when applied to\nout-of-distribution (OOD) data modalities and tasks, requiring intricate\nfine-tuning of model for optimal performance. For addressing this challenge, we\nintroduce SegICL, a novel approach leveraging In-Context Learning (ICL) for\nimage segmentation. Unlike existing methods, SegICL has the capability to\nemploy text-guided segmentation and conduct in-context learning with a small\nset of image-mask pairs, eliminating the need for training the model from\nscratch or fine-tuning for OOD tasks (including OOD modality and dataset).\nExtensive experimental validation of SegICL demonstrates a positive correlation\nbetween the number of prompt samples and segmentation performance on OOD\nmodalities and tasks. This indicates that SegICL effectively address new\nsegmentation tasks based on contextual information. Additionally, SegICL also\nexhibits comparable segmentation performance to mainstream models on OOD and\nin-distribution tasks. Our code will be released soon.\n","authors":["Lingdong Shen","Fangxin Shang","Yehui Yang","Xiaoshuang Huang","Shiming Xiang"],"pdf_url":"https://arxiv.org/pdf/2403.16578v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17702v2","updated":"2024-04-02T09:54:39Z","published":"2024-03-26T13:40:52Z","title":"The Solution for the CVPR 2023 1st foundation model challenge-Track2","summary":" In this paper, we propose a solution for cross-modal transportation\nretrieval. 
Due to the cross-domain problem of traffic images, we divide the\nproblem into two sub-tasks of pedestrian retrieval and vehicle retrieval\nthrough a simple strategy. In pedestrian retrieval tasks, we use IRRA as the\nbase model and specifically design an Attribute Classification to mine the\nknowledge implied by attribute labels. More importantly, We use the strategy of\nInclusion Relation Matching to make the image-text pairs with inclusion\nrelation have similar representation in the feature space. For the vehicle\nretrieval task, we use BLIP as the base model. Since aligning the color\nattributes of vehicles is challenging, we introduce attribute-based object\ndetection techniques to add color patch blocks to vehicle images for color data\naugmentation. This serves as strong prior information, helping the model\nperform the image-text alignment. At the same time, we incorporate labeled\nattributes into the image-text alignment loss to learn fine-grained alignment\nand prevent similar images and texts from being incorrectly separated. Our\napproach ranked first in the final B-board test with a score of 70.9.\n","authors":["Haonan Xu","Yurui Huang","Sishun Pan","Zhihao Guan","Yi Xu","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2403.17702v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01790v1","updated":"2024-04-02T09:53:20Z","published":"2024-04-02T09:53:20Z","title":"Super-Resolution Analysis for Landfill Waste Classification","summary":" Illegal landfills are a critical issue due to their environmental, economic,\nand public health impacts. This study leverages aerial imagery for\nenvironmental crime monitoring. While advances in artificial intelligence and\ncomputer vision hold promise, the challenge lies in training models with\nhigh-resolution literature datasets and adapting them to open-access\nlow-resolution images. Considering the substantial quality differences and\nlimited annotation, this research explores the adaptability of models across\nthese domains. Motivated by the necessity for a comprehensive evaluation of\nwaste detection algorithms, it advocates cross-domain classification and\nsuper-resolution enhancement to analyze the impact of different image\nresolutions on waste classification as an evaluation to combat the\nproliferation of illegal landfills. We observed performance improvements by\nenhancing image quality but noted an influence on model sensitivity,\nnecessitating careful threshold fine-tuning.\n","authors":["Matias Molina","Rita P. Ribeiro","Bruno Veloso","João Gama"],"pdf_url":"https://arxiv.org/pdf/2404.01790v1.pdf","comment":"This article has been accepted by the Symposium on Intelligent Data\n Analysis (IDA 2024)"},{"id":"http://arxiv.org/abs/2305.05726v2","updated":"2024-04-02T09:52:41Z","published":"2023-05-09T19:17:07Z","title":"Vision-Language Models in Remote Sensing: Current Progress and Future\n Trends","summary":" The remarkable achievements of ChatGPT and GPT-4 have sparked a wave of\ninterest and research in the field of large language models for Artificial\nGeneral Intelligence (AGI). These models provide intelligent solutions close to\nhuman thinking, enabling us to use general artificial intelligence to solve\nproblems in various applications. 
However, in remote sensing (RS), the\nscientific literature on the implementation of AGI remains relatively scant.\nExisting AI-related research in remote sensing primarily focuses on visual\nunderstanding tasks while neglecting the semantic understanding of the objects\nand their relationships. This is where vision-language models excel, as they\nenable reasoning about images and their associated textual descriptions,\nallowing for a deeper understanding of the underlying semantics.\nVision-language models can go beyond visual recognition of RS images, model\nsemantic relationships, and generate natural language descriptions of the\nimage. This makes them better suited for tasks requiring visual and textual\nunderstanding, such as image captioning, and visual question answering. This\npaper provides a comprehensive review of the research on vision-language models\nin remote sensing, summarizing the latest progress, highlighting challenges,\nand identifying potential research opportunities.\n","authors":["Xiang Li","Congcong Wen","Yuan Hu","Zhenghang Yuan","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2305.05726v2.pdf","comment":"Accepted by IEEE Geoscience and Remote Sensing Magazine"},{"id":"http://arxiv.org/abs/2404.01780v1","updated":"2024-04-02T09:44:30Z","published":"2024-04-02T09:44:30Z","title":"CSST Strong Lensing Preparation: a Framework for Detecting Strong Lenses\n in the Multi-color Imaging Survey by the China Survey Space Telescope (CSST)","summary":" Strong gravitational lensing is a powerful tool for investigating dark matter\nand dark energy properties. With the advent of large-scale sky surveys, we can\ndiscover strong lensing systems on an unprecedented scale, which requires\nefficient tools to extract them from billions of astronomical objects. The\nexisting mainstream lens-finding tools are based on machine learning algorithms\nand applied to cut-out-centered galaxies. However, according to the design and\nsurvey strategy of optical surveys by CSST, preparing cutouts with multiple\nbands requires considerable efforts. To overcome these challenges, we have\ndeveloped a framework based on a hierarchical visual Transformer with a sliding\nwindow technique to search for strong lensing systems within entire images.\nMoreover, given that multi-color images of strong lensing systems can provide\ninsights into their physical characteristics, our framework is specifically\ncrafted to identify strong lensing systems in images with any number of\nchannels. As evaluated using CSST mock data based on an Semi-Analytic Model\nnamed CosmoDC2, our framework achieves precision and recall rates of 0.98 and\n0.90, respectively. To evaluate the effectiveness of our method in real\nobservations, we have applied it to a subset of images from the DESI Legacy\nImaging Surveys and media images from Euclid Early Release Observations. 61 new\nstrong lensing system candidates are discovered by our method. However, we also\nidentified false positives arising primarily from the simplified galaxy\nmorphology assumptions within the simulation. 
This underscores the practical\nlimitations of our approach while simultaneously highlighting potential avenues\nfor future improvements.\n","authors":["Xu Li","Ruiqi Sun","Jiameng Lv","Peng Jia","Nan Li","Chengliang Wei","Zou Hu","Xinzhong Er","Yun Chen","Zhang Ban","Yuedong Fang","Qi Guo","Dezi Liu","Guoliang Li","Lin Lin","Ming Li","Ran Li","Xiaobo Li","Yu Luo","Xianmin Meng","Jundan Nie","Zhaoxiang Qi","Yisheng Qiu","Li Shao","Hao Tian","Lei Wang","Wei Wang","Jingtian Xian","Youhua Xu","Tianmeng Zhang","Xin Zhang","Zhimin Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.01780v1.pdf","comment":"The paper is accepted by the AJ. The complete code could be\n downloaded with DOI of: 10.12149/101393. Comments are welcome"},{"id":"http://arxiv.org/abs/2312.02152v2","updated":"2024-04-02T09:40:33Z","published":"2023-12-04T18:59:44Z","title":"Steerers: A framework for rotation equivariant keypoint descriptors","summary":" Image keypoint descriptions that are discriminative and matchable over large\nchanges in viewpoint are vital for 3D reconstruction. However, descriptions\noutput by learned descriptors are typically not robust to camera rotation.\nWhile they can be made more robust by, e.g., data augmentation, this degrades\nperformance on upright images. Another approach is test-time augmentation,\nwhich incurs a significant increase in runtime. Instead, we learn a linear\ntransform in description space that encodes rotations of the input image. We\ncall this linear transform a steerer since it allows us to transform the\ndescriptions as if the image was rotated. From representation theory, we know\nall possible steerers for the rotation group. Steerers can be optimized (A)\ngiven a fixed descriptor, (B) jointly with a descriptor or (C) we can optimize\na descriptor given a fixed steerer. We perform experiments in these three\nsettings and obtain state-of-the-art results on the rotation invariant image\nmatching benchmarks AIMS and Roto-360. We publish code and model weights at\nhttps://github.com/georg-bn/rotation-steerers.\n","authors":["Georg Bökman","Johan Edstedt","Michael Felsberg","Fredrik Kahl"],"pdf_url":"https://arxiv.org/pdf/2312.02152v2.pdf","comment":"CVPR 2024 Camera ready"},{"id":"http://arxiv.org/abs/2404.01775v1","updated":"2024-04-02T09:40:22Z","published":"2024-04-02T09:40:22Z","title":"A noisy elephant in the room: Is your out-of-distribution detector\n robust to label noise?","summary":" The ability to detect unfamiliar or unexpected images is essential for safe\ndeployment of computer vision systems. In the context of classification, the\ntask of detecting images outside of a model's training domain is known as\nout-of-distribution (OOD) detection. While there has been a growing research\ninterest in developing post-hoc OOD detection methods, there has been\ncomparably little discussion around how these methods perform when the\nunderlying classifier is not trained on a clean, carefully curated dataset. In\nthis work, we take a closer look at 20 state-of-the-art OOD detection methods\nin the (more realistic) scenario where the labels used to train the underlying\nclassifier are unreliable (e.g. crowd-sourced or web-scraped labels). Extensive\nexperiments across different datasets, noise types & levels, architectures and\ncheckpointing strategies provide insights into the effect of class label noise\non OOD detection, and show that poor separation between incorrectly classified\nID samples vs. OOD samples is an overlooked yet important limitation of\nexisting methods. 
Code: https://github.com/glhr/ood-labelnoise\n","authors":["Galadrielle Humblot-Renaux","Sergio Escalera","Thomas B. Moeslund"],"pdf_url":"https://arxiv.org/pdf/2404.01775v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2402.17971v2","updated":"2024-04-02T09:32:51Z","published":"2024-02-28T01:32:59Z","title":"All in an Aggregated Image for In-Image Learning","summary":" This paper introduces a new in-context learning (ICL) mechanism called\nIn-Image Learning (I$^2$L) that combines demonstration examples, visual cues,\nand chain-of-thought reasoning into an aggregated image to enhance the\ncapabilities of Large Multimodal Models (e.g., GPT-4V) in multimodal reasoning\ntasks. Unlike previous approaches that rely on converting images to text or\nincorporating visual input into language models, I$^2$L consolidates all\ninformation into an aggregated image and leverages image processing,\nunderstanding, and reasoning abilities. This has several advantages: it reduces\ninaccurate textual descriptions of complex images, provides flexibility in\npositioning demonstration examples, and avoids multiple input images and\nlengthy prompts. We also introduce I$^2$L-Hybrid, a method that combines the\nstrengths of I$^2$L with other ICL methods. Specifically, it uses an automatic\nstrategy to select the most suitable method (I$^2$L or another certain ICL\nmethod) for a specific task instance. We conduct extensive experiments to\nassess the effectiveness of I$^2$L and I$^2$L-Hybrid on MathVista, which covers\na variety of complex multimodal reasoning tasks. Additionally, we investigate\nthe influence of image resolution, the number of demonstration examples in a\nsingle image, and the positions of these demonstrations in the aggregated image\non the effectiveness of I$^2$L. Our code is publicly available at\nhttps://github.com/AGI-Edgerunners/IIL.\n","authors":["Lei Wang","Wanyu Xu","Zhiqiang Hu","Yihuai Lan","Shan Dong","Hao Wang","Roy Ka-Wei Lee","Ee-Peng Lim"],"pdf_url":"https://arxiv.org/pdf/2402.17971v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2404.01765v1","updated":"2024-04-02T09:31:06Z","published":"2024-04-02T09:31:06Z","title":"Guidelines for Cerebrovascular Segmentation: Managing Imperfect\n Annotations in the context of Semi-Supervised Learning","summary":" Segmentation in medical imaging is an essential and often preliminary task in\nthe image processing chain, driving numerous efforts towards the design of\nrobust segmentation algorithms. Supervised learning methods achieve excellent\nperformances when fed with a sufficient amount of labeled data. However, such\nlabels are typically highly time-consuming, error-prone and expensive to\nproduce. Alternatively, semi-supervised learning approaches leverage both\nlabeled and unlabeled data, and are very useful when only a small fraction of\nthe dataset is labeled. They are particularly useful for cerebrovascular\nsegmentation, given that labeling a single volume requires several hours for an\nexpert. In addition to the challenge posed by insufficient annotations, there\nare concerns regarding annotation consistency. The task of annotating the\ncerebrovascular tree is inherently ambiguous. Due to the discrete nature of\nimages, the borders and extremities of vessels are often unclear. Consequently,\nannotations heavily rely on the expert subjectivity and on the underlying\nclinical objective. 
These discrepancies significantly increase the complexity\nof the segmentation task for the model and consequently impair the results.\nConsequently, it becomes imperative to provide clinicians with precise\nguidelines to improve the annotation process and construct more uniform\ndatasets. In this article, we investigate the data dependency of deep learning\nmethods within the context of imperfect data and semi-supervised learning, for\ncerebrovascular segmentation. Specifically, this study compares various\nstate-of-the-art semi-supervised methods based on unsupervised regularization\nand evaluates their performance in diverse quantity and quality data scenarios.\nBased on these experiments, we provide guidelines for the annotation and\ntraining of cerebrovascular segmentation models.\n","authors":["Pierre Rougé","Pierre-Henri Conze","Nicolas Passat","Odyssée Merveille"],"pdf_url":"https://arxiv.org/pdf/2404.01765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10208v2","updated":"2024-04-02T09:20:50Z","published":"2024-01-18T18:50:16Z","title":"MM-Interleaved: Interleaved Image-Text Generative Modeling via\n Multi-modal Feature Synchronizer","summary":" Developing generative models for interleaved image-text data has both\nresearch and practical value. It requires models to understand the interleaved\nsequences and subsequently generate images and text. However, existing attempts\nare limited by the issue that the fixed number of visual tokens cannot\nefficiently capture image details, which is particularly problematic in the\nmulti-image scenarios. To address this, this paper presents MM-Interleaved, an\nend-to-end generative model for interleaved image-text data. It introduces a\nmulti-scale and multi-image feature synchronizer module, allowing direct access\nto fine-grained image features in the previous context during the generation\nprocess. MM-Interleaved is end-to-end pre-trained on both paired and\ninterleaved image-text corpora. It is further enhanced through a supervised\nfine-tuning phase, wherein the model improves its ability to follow complex\nmulti-modal instructions. Experiments demonstrate the versatility of\nMM-Interleaved in recognizing visual details following multi-modal instructions\nand generating consistent images following both textual and visual conditions.\nCode and models are available at\n\\url{https://github.com/OpenGVLab/MM-Interleaved}.\n","authors":["Changyao Tian","Xizhou Zhu","Yuwen Xiong","Weiyun Wang","Zhe Chen","Wenhai Wang","Yuntao Chen","Lewei Lu","Tong Lu","Jie Zhou","Hongsheng Li","Yu Qiao","Jifeng Dai"],"pdf_url":"https://arxiv.org/pdf/2401.10208v2.pdf","comment":"20 pages, 9 figures, 17 tables"},{"id":"http://arxiv.org/abs/2404.01758v1","updated":"2024-04-02T09:18:52Z","published":"2024-04-02T09:18:52Z","title":"GEARS: Local Geometry-aware Hand-object Interaction Synthesis","summary":" Generating realistic hand motion sequences in interaction with objects has\ngained increasing attention with the growing interest in digital humans. Prior\nwork has illustrated the effectiveness of employing occupancy-based or\ndistance-based virtual sensors to extract hand-object interaction features.\nNonetheless, these methods show limited generalizability across object\ncategories, shapes and sizes. We hypothesize that this is due to two reasons:\n1) the limited expressiveness of employed virtual sensors, and 2) scarcity of\navailable training data. 
To tackle this challenge, we introduce a novel\njoint-centered sensor designed to reason about local object geometry near\npotential interaction regions. The sensor queries for object surface points in\nthe neighbourhood of each hand joint. As an important step towards mitigating\nthe learning complexity, we transform the points from global frame to hand\ntemplate frame and use a shared module to process sensor features of each\nindividual joint. This is followed by a spatio-temporal transformer network\naimed at capturing correlation among the joints in different dimensions.\nMoreover, we devise simple heuristic rules to augment the limited training\nsequences with vast static hand grasping samples. This leads to a broader\nspectrum of grasping types observed during training, in turn enhancing our\nmodel's generalization capability. We evaluate on two public datasets, GRAB and\nInterCap, where our method shows superiority over baselines both quantitatively\nand perceptually.\n","authors":["Keyang Zhou","Bharat Lal Bhatnagar","Jan Eric Lenssen","Gerard Pons-moll"],"pdf_url":"https://arxiv.org/pdf/2404.01758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06725v3","updated":"2024-04-02T09:18:36Z","published":"2023-12-11T05:20:52Z","title":"EpiDiff: Enhancing Multi-View Synthesis via Localized\n Epipolar-Constrained Diffusion","summary":" Generating multiview images from a single view facilitates the rapid\ngeneration of a 3D mesh conditioned on a single image. Recent methods that\nintroduce 3D global representation into diffusion models have shown the\npotential to generate consistent multiviews, but they have reduced generation\nspeed and face challenges in maintaining generalizability and quality. To\naddress this issue, we propose EpiDiff, a localized interactive multiview\ndiffusion model. At the core of the proposed approach is to insert a\nlightweight epipolar attention block into the frozen diffusion model,\nleveraging epipolar constraints to enable cross-view interaction among feature\nmaps of neighboring views. The newly initialized 3D modeling module preserves\nthe original feature distribution of the diffusion model, exhibiting\ncompatibility with a variety of base diffusion models. Experiments show that\nEpiDiff generates 16 multiview images in just 12 seconds, and it surpasses\nprevious methods in quality evaluation metrics, including PSNR, SSIM and LPIPS.\nAdditionally, EpiDiff can generate a more diverse distribution of views,\nimproving the reconstruction quality from generated multiviews. Please see our\nproject page at https://huanngzh.github.io/EpiDiff/.\n","authors":["Zehuan Huang","Hao Wen","Junting Dong","Yaohui Wang","Yangguang Li","Xinyuan Chen","Yan-Pei Cao","Ding Liang","Yu Qiao","Bo Dai","Lu Sheng"],"pdf_url":"https://arxiv.org/pdf/2312.06725v3.pdf","comment":"Project page: https://huanngzh.github.io/EpiDiff/"},{"id":"http://arxiv.org/abs/2306.05423v2","updated":"2024-04-02T09:12:29Z","published":"2023-06-08T17:59:32Z","title":"ADDP: Learning General Representations for Image Recognition and\n Generation with Alternating Denoising Diffusion Process","summary":" Image recognition and generation have long been developed independently of\neach other. With the recent trend towards general-purpose representation\nlearning, the development of general representations for both recognition and\ngeneration tasks is also promoted. However, preliminary attempts mainly focus\non generation performance, but are still inferior on recognition tasks. 
These\nmethods are modeled in the vector-quantized (VQ) space, whereas leading\nrecognition methods use pixels as inputs. Our key insights are twofold: (1)\npixels as inputs are crucial for recognition tasks; (2) VQ tokens as\nreconstruction targets are beneficial for generation tasks. These observations\nmotivate us to propose an Alternating Denoising Diffusion Process (ADDP) that\nintegrates these two spaces within a single representation learning framework.\nIn each denoising step, our method first decodes pixels from previous VQ\ntokens, then generates new VQ tokens from the decoded pixels. The diffusion\nprocess gradually masks out a portion of VQ tokens to construct the training\nsamples. The learned representations can be used to generate diverse\nhigh-fidelity images and also demonstrate excellent transfer performance on\nrecognition tasks. Extensive experiments show that our method achieves\ncompetitive performance on unconditional generation, ImageNet classification,\nCOCO detection, and ADE20k segmentation. Importantly, our method represents the\nfirst successful development of general representations applicable to both\ngeneration and dense recognition tasks. Code is released at\n\\url{https://github.com/ChangyaoTian/ADDP}.\n","authors":["Changyao Tian","Chenxin Tao","Jifeng Dai","Hao Li","Ziheng Li","Lewei Lu","Xiaogang Wang","Hongsheng Li","Gao Huang","Xizhou Zhu"],"pdf_url":"https://arxiv.org/pdf/2306.05423v2.pdf","comment":"Accepted by ICLR2024"},{"id":"http://arxiv.org/abs/2404.01751v1","updated":"2024-04-02T09:07:05Z","published":"2024-04-02T09:07:05Z","title":"T-VSL: Text-Guided Visual Sound Source Localization in Mixtures","summary":" Visual sound source localization poses a significant challenge in identifying\nthe semantic region of each sounding source within a video. Existing\nself-supervised and weakly supervised source localization methods struggle to\naccurately distinguish the semantic regions of each sounding object,\nparticularly in multi-source mixtures. These methods often rely on audio-visual\ncorrespondence as guidance, which can lead to substantial performance drops in\ncomplex multi-source localization scenarios. The lack of access to individual\nsource sounds in multi-source mixtures during training exacerbates the\ndifficulty of learning effective audio-visual correspondence for localization.\nTo address this limitation, in this paper, we propose incorporating the text\nmodality as an intermediate feature guide using tri-modal joint embedding\nmodels (e.g., AudioCLIP) to disentangle the semantic audio-visual source\ncorrespondence in multi-source mixtures. Our framework, dubbed T-VSL, begins by\npredicting the class of sounding entities in mixtures. Subsequently, the\ntextual representation of each sounding source is employed as guidance to\ndisentangle fine-grained audio-visual source correspondence from multi-source\nmixtures, leveraging the tri-modal AudioCLIP embedding. This approach enables\nour framework to handle a flexible number of sources and exhibits promising\nzero-shot transferability to unseen classes during test time. Extensive\nexperiments conducted on the MUSIC, VGGSound, and VGGSound-Instruments datasets\ndemonstrate significant performance improvements over state-of-the-art methods.\n","authors":["Tanvir Mahmud","Yapeng Tian","Diana Marculescu"],"pdf_url":"https://arxiv.org/pdf/2404.01751v1.pdf","comment":"Tech report. 
Accepted in CVPR-2024"},{"id":"http://arxiv.org/abs/2404.01750v1","updated":"2024-04-02T09:05:47Z","published":"2024-04-02T09:05:47Z","title":"Exploring Latent Pathways: Enhancing the Interpretability of Autonomous\n Driving with a Variational Autoencoder","summary":" Autonomous driving presents a complex challenge, which is usually addressed\nwith artificial intelligence models that are end-to-end or modular in nature.\nWithin the landscape of modular approaches, a bio-inspired neural circuit\npolicy model has emerged as an innovative control module, offering a compact\nand inherently interpretable system to infer a steering wheel command from\nabstract visual features. Here, we take a leap forward by integrating a\nvariational autoencoder with the neural circuit policy controller, forming a\nsolution that directly generates steering commands from input camera images. By\nsubstituting the traditional convolutional neural network approach to feature\nextraction with a variational autoencoder, we enhance the system's\ninterpretability, enabling a more transparent and understandable\ndecision-making process.\n In addition to the architectural shift toward a variational autoencoder, this\nstudy introduces the automatic latent perturbation tool, a novel contribution\ndesigned to probe and elucidate the latent features within the variational\nautoencoder. The automatic latent perturbation tool automates the\ninterpretability process, offering granular insights into how specific latent\nvariables influence the overall model's behavior. Through a series of numerical\nexperiments, we demonstrate the interpretative power of the variational\nautoencoder-neural circuit policy model and the utility of the automatic latent\nperturbation tool in making the inner workings of autonomous driving systems\nmore transparent.\n","authors":["Anass Bairouk","Mirjana Maras","Simon Herlin","Alexander Amini","Marc Blanchon","Ramin Hasani","Patrick Chareyre","Daniela Rus"],"pdf_url":"https://arxiv.org/pdf/2404.01750v1.pdf","comment":"Submitted to 2024 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2404.01748v1","updated":"2024-04-02T09:04:56Z","published":"2024-04-02T09:04:56Z","title":"Global Mapping of Exposure and Physical Vulnerability Dynamics in Least\n Developed Countries using Remote Sensing and Machine Learning","summary":" As the world marked the midterm of the Sendai Framework for Disaster Risk\nReduction 2015-2030, many countries are still struggling to monitor their\nclimate and disaster risk because of the expensive large-scale survey of the\ndistribution of exposure and physical vulnerability and, hence, are not on\ntrack in reducing risks amidst the intensifying effects of climate change. We\npresent an ongoing effort in mapping this vital information using machine\nlearning and time-series remote sensing from publicly available Sentinel-1 SAR\nGRD and Sentinel-2 Harmonized MSI. We introduce the development of\n\"OpenSendaiBench\" consisting of 47 countries wherein most are least developed\n(LDCs), trained ResNet-50 deep learning models, and demonstrated the region of\nDhaka, Bangladesh by mapping the distribution of its informal constructions. 
As\na pioneering effort in auditing global disaster risk over time, this paper aims\nto advance the area of large-scale risk quantification in informing our\ncollective long-term efforts in reducing climate and disaster risk.\n","authors":["Joshua Dimasaka","Christian Geiß","Emily So"],"pdf_url":"https://arxiv.org/pdf/2404.01748v1.pdf","comment":"This is the camera-ready paper for the accepted poster at the 2nd\n Machine Learning for Remote Sensing Workshop, 12th International Conference\n on Learning Representations (ICLR) in Vienna, Austria, on the 11th of May\n 2024. Access the poster here: https://zenodo.org/doi/10.5281/zenodo.10903886\n Watch the video version of our poster here: https://youtu.be/N6ithJeCF4M"},{"id":"http://arxiv.org/abs/2402.11791v4","updated":"2024-04-02T09:02:04Z","published":"2024-02-19T02:41:37Z","title":"SDGE: Stereo Guided Depth Estimation for 360$^\\circ$ Camera Sets","summary":" Depth estimation is a critical technology in autonomous driving, and\nmulti-camera systems are often used to achieve a 360$^\\circ$ perception. These\n360$^\\circ$ camera sets often have limited or low-quality overlap regions,\nmaking multi-view stereo methods infeasible for the entire image.\nAlternatively, monocular methods may not produce consistent cross-view\npredictions. To address these issues, we propose the Stereo Guided Depth\nEstimation (SGDE) method, which enhances depth estimation of the full image by\nexplicitly utilizing multi-view stereo results on the overlap. We suggest\nbuilding virtual pinhole cameras to resolve the distortion problem of fisheye\ncameras and unify the processing for the two types of 360$^\\circ$ cameras. For\nhandling the varying noise on camera poses caused by unstable movement, the\napproach employs a self-calibration method to obtain highly accurate relative\nposes of the adjacent cameras with minor overlap. These enable the use of\nrobust stereo methods to obtain high-quality depth prior in the overlap region.\nThis prior serves not only as an additional input but also as pseudo-labels\nthat enhance the accuracy of depth estimation methods and improve cross-view\nprediction consistency. The effectiveness of SGDE is evaluated on one fisheye\ncamera dataset, Synthetic Urban, and two pinhole camera datasets, DDAD and\nnuScenes. Our experiments demonstrate that SGDE is effective for both\nsupervised and self-supervised depth estimation, and highlight the potential of\nour method for advancing downstream autonomous driving technologies, such as 3D\nobject detection and occupancy prediction.\n","authors":["Jialei Xu","Wei Yin","Dong Gong","Junjun Jiang","Xianming Liu"],"pdf_url":"https://arxiv.org/pdf/2402.11791v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01745v1","updated":"2024-04-02T09:01:58Z","published":"2024-04-02T09:01:58Z","title":"Unleash the Potential of CLIP for Video Highlight Detection","summary":" Multimodal and large language models (LLMs) have revolutionized the\nutilization of open-world knowledge, unlocking novel potentials across various\ntasks and applications. Among these domains, the video domain has notably\nbenefited from their capabilities. In this paper, we present Highlight-CLIP\n(HL-CLIP), a method designed to excel in the video highlight detection task by\nleveraging the pre-trained knowledge embedded in multimodal models. 
By simply\nfine-tuning the multimodal encoder in combination with our innovative saliency\npooling technique, we have achieved the state-of-the-art performance in the\nhighlight detection task, the QVHighlight Benchmark, to the best of our\nknowledge.\n","authors":["Donghoon Han","Seunghyeon Seo","Eunhwan Park","Seong-Uk Nam","Nojun Kwak"],"pdf_url":"https://arxiv.org/pdf/2404.01745v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01743v1","updated":"2024-04-02T09:01:21Z","published":"2024-04-02T09:01:21Z","title":"Atom-Level Optical Chemical Structure Recognition with Limited\n Supervision","summary":" Identifying the chemical structure from a graphical representation, or image,\nof a molecule is a challenging pattern recognition task that would greatly\nbenefit drug development. Yet, existing methods for chemical structure\nrecognition do not typically generalize well, and show diminished effectiveness\nwhen confronted with domains where data is sparse, or costly to generate, such\nas hand-drawn molecule images. To address this limitation, we propose a new\nchemical structure recognition tool that delivers state-of-the-art performance\nand can adapt to new domains with a limited number of data samples and\nsupervision. Unlike previous approaches, our method provides atom-level\nlocalization, and can therefore segment the image into the different atoms and\nbonds. Our model is the first model to perform OCSR with atom-level entity\ndetection with only SMILES supervision. Through rigorous and extensive\nbenchmarking, we demonstrate the preeminence of our chemical structure\nrecognition approach in terms of data efficiency, accuracy, and atom-level\nentity prediction.\n","authors":["Martijn Oldenhof","Edward De Brouwer","Adam Arany","Yves Moreau"],"pdf_url":"https://arxiv.org/pdf/2404.01743v1.pdf","comment":"Accepted in IEEE/CVF Conference on Computer Vision and Pattern\n Recognition 2024"},{"id":"http://arxiv.org/abs/2311.02072v2","updated":"2024-04-02T09:00:38Z","published":"2023-11-03T17:54:59Z","title":"HIPTrack: Visual Tracking with Historical Prompts","summary":" Trackers that follow Siamese paradigm utilize similarity matching between\ntemplate and search region features for tracking. Many methods have been\nexplored to enhance tracking performance by incorporating tracking history to\nbetter handle scenarios involving target appearance variations such as\ndeformation and occlusion. However, the utilization of historical information\nin existing methods is insufficient and incomprehensive, which typically\nrequires repetitive training and introduces a large amount of computation. In\nthis paper, we show that by providing a tracker that follows Siamese paradigm\nwith precise and updated historical information, a significant performance\nimprovement can be achieved with completely unchanged parameters. Based on\nthis, we propose a historical prompt network that uses refined historical\nforeground masks and historical visual features of the target to provide\ncomprehensive and precise prompts for the tracker. We build a novel tracker\ncalled HIPTrack based on the historical prompt network, which achieves\nconsiderable performance improvements without the need to retrain the entire\nmodel. We conduct experiments on seven datasets and experimental results\ndemonstrate that our method surpasses the current state-of-the-art trackers on\nLaSOT, LaSOText, GOT-10k and NfS. 
Furthermore, the historical prompt network\ncan seamlessly integrate as a plug-and-play module into existing trackers,\nproviding performance enhancements. The source code is available at\nhttps://github.com/WenRuiCai/HIPTrack.\n","authors":["Wenrui Cai","Qingjie Liu","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2311.02072v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.03954v2","updated":"2024-04-02T08:59:57Z","published":"2024-03-06T18:58:49Z","title":"3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple\n 3D Representations","summary":" Imitation learning provides an efficient way to teach robots dexterous\nskills; however, learning complex skills robustly and generalizablely usually\nconsumes large amounts of human demonstrations. To tackle this challenging\nproblem, we present 3D Diffusion Policy (DP3), a novel visual imitation\nlearning approach that incorporates the power of 3D visual representations into\ndiffusion policies, a class of conditional action generative models. The core\ndesign of DP3 is the utilization of a compact 3D visual representation,\nextracted from sparse point clouds with an efficient point encoder. In our\nexperiments involving 72 simulation tasks, DP3 successfully handles most tasks\nwith just 10 demonstrations and surpasses baselines with a 24.2% relative\nimprovement. In 4 real robot tasks, DP3 demonstrates precise control with a\nhigh success rate of 85%, given only 40 demonstrations of each task, and shows\nexcellent generalization abilities in diverse aspects, including space,\nviewpoint, appearance, and instance. Interestingly, in real robot experiments,\nDP3 rarely violates safety requirements, in contrast to baseline methods which\nfrequently do, necessitating human intervention. Our extensive evaluation\nhighlights the critical importance of 3D representations in real-world robot\nlearning. Videos, code, and data are available on\nhttps://3d-diffusion-policy.github.io .\n","authors":["Yanjie Ze","Gu Zhang","Kangning Zhang","Chenyuan Hu","Muhan Wang","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2403.03954v2.pdf","comment":"Videos, code, and data: https://3d-diffusion-policy.github.io"},{"id":"http://arxiv.org/abs/2307.09591v3","updated":"2024-04-02T08:55:51Z","published":"2023-07-18T19:56:20Z","title":"Saliency strikes back: How filtering out high frequencies improves\n white-box explanations","summary":" Attribution methods correspond to a class of explainability methods (XAI)\nthat aim to assess how individual inputs contribute to a model's\ndecision-making process. We have identified a significant limitation in one\ntype of attribution methods, known as \"white-box\" methods. Although highly\nefficient, these methods rely on a gradient signal that is often contaminated\nby high-frequency noise. To overcome this limitation, we introduce a new\napproach called \"FORGrad\". This simple method effectively filters out noise\nartifacts by using optimal cut-off frequencies tailored to the unique\ncharacteristics of each model architecture. Our findings show that FORGrad\nconsistently enhances the performance of already existing white-box methods,\nenabling them to compete effectively with more accurate yet computationally\ndemanding \"black-box\" methods. 
We anticipate that our research will foster\nbroader adoption of simpler and more efficient white-box methods for\nexplainability, offering a better balance between faithfulness and\ncomputational efficiency.\n","authors":["Sabine Muzellec","Thomas Fel","Victor Boutin","Léo andéol","Rufin VanRullen","Thomas Serre"],"pdf_url":"https://arxiv.org/pdf/2307.09591v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03517v2","updated":"2024-04-02T08:40:54Z","published":"2023-12-06T14:24:26Z","title":"FRDiff : Feature Reuse for Universal Training-free Acceleration of\n Diffusion Models","summary":" The substantial computational costs of diffusion models, especially due to\nthe repeated denoising steps necessary for high-quality image generation,\npresent a major obstacle to their widespread adoption. While several studies\nhave attempted to address this issue by reducing the number of score function\nevaluations (NFE) using advanced ODE solvers without fine-tuning, the decreased\nnumber of denoising iterations misses the opportunity to update fine details,\nresulting in noticeable quality degradation. In our work, we introduce an\nadvanced acceleration technique that leverages the temporal redundancy inherent\nin diffusion models. Reusing feature maps with high temporal similarity opens\nup a new opportunity to save computation resources without compromising output\nquality. To realize the practical benefits of this intuition, we conduct an\nextensive analysis and propose a novel method, FRDiff. FRDiff is designed to\nharness the advantages of both reduced NFE and feature reuse, achieving a\nPareto frontier that balances fidelity and latency trade-offs in various\ngenerative tasks.\n","authors":["Junhyuk So","Jungwon Lee","Eunhyeok Park"],"pdf_url":"https://arxiv.org/pdf/2312.03517v2.pdf","comment":"Work in progress. Project page :\n https://jungwon-lee.github.io/Project_FRDiff/"},{"id":"http://arxiv.org/abs/2404.01727v1","updated":"2024-04-02T08:33:21Z","published":"2024-04-02T08:33:21Z","title":"Generalizing 6-DoF Grasp Detection via Domain Prior Knowledge","summary":" We focus on the generalization ability of the 6-DoF grasp detection method in\nthis paper. While learning-based grasp detection methods can predict grasp\nposes for unseen objects using the grasp distribution learned from the training\nset, they often exhibit a significant performance drop when encountering\nobjects with diverse shapes and structures. To enhance the grasp detection\nmethods' generalization ability, we incorporate domain prior knowledge of\nrobotic grasping, enabling better adaptation to objects with significant shape\nand structure differences. More specifically, we employ the physical constraint\nregularization during the training phase to guide the model towards predicting\ngrasps that comply with the physical rule on grasping. 
For the unstable grasp\nposes predicted on novel objects, we design a contact-score joint optimization\nusing the projection contact map to refine these poses in cluttered scenarios.\nExtensive experiments conducted on the GraspNet-1billion benchmark demonstrate\na substantial performance gain on the novel object set and the real-world\ngrasping experiments also demonstrate the effectiveness of our generalizing\n6-DoF grasp detection method.\n","authors":["Haoxiang Ma","Modi Shi","Boyang Gao","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2404.01727v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2401.04728v2","updated":"2024-04-02T08:29:09Z","published":"2024-01-09T18:59:04Z","title":"Morphable Diffusion: 3D-Consistent Diffusion for Single-image Avatar\n Creation","summary":" Recent advances in generative diffusion models have enabled the previously\nunfeasible capability of generating 3D assets from a single input image or a\ntext prompt. In this work, we aim to enhance the quality and functionality of\nthese models for the task of creating controllable, photorealistic human\navatars. We achieve this by integrating a 3D morphable model into the\nstate-of-the-art multi-view-consistent diffusion approach. We demonstrate that\naccurate conditioning of a generative pipeline on the articulated 3D model\nenhances the baseline model performance on the task of novel view synthesis\nfrom a single image. More importantly, this integration facilitates a seamless\nand accurate incorporation of facial expression and body pose control into the\ngeneration process. To the best of our knowledge, our proposed framework is the\nfirst diffusion model to enable the creation of fully 3D-consistent,\nanimatable, and photorealistic human avatars from a single image of an unseen\nsubject; extensive quantitative and qualitative evaluations demonstrate the\nadvantages of our approach over existing state-of-the-art avatar creation\nmodels on both novel view and novel expression synthesis tasks. The code for\nour project is publicly available.\n","authors":["Xiyi Chen","Marko Mihajlovic","Shaofei Wang","Sergey Prokudin","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2401.04728v2.pdf","comment":"[CVPR 2024] Project page:\n https://xiyichen.github.io/morphablediffusion/"},{"id":"http://arxiv.org/abs/2402.10739v3","updated":"2024-04-02T08:26:43Z","published":"2024-02-16T14:56:13Z","title":"PointMamba: A Simple State Space Model for Point Cloud Analysis","summary":" Transformers have become one of the foundational architectures in point cloud\nanalysis tasks due to their excellent global modeling ability. However, the\nattention mechanism has quadratic complexity and is difficult to extend to long\nsequence modeling due to limited computational resources and so on. Recently,\nstate space models (SSM), a new family of deep sequence models, have presented\ngreat potential for sequence modeling in NLP tasks. In this paper, taking\ninspiration from the success of SSM in NLP, we propose PointMamba, a framework\nwith global modeling and linear complexity. Specifically, by taking embedded\npoint patches as input, we proposed a reordering strategy to enhance SSM's\nglobal modeling ability by providing a more logical geometric scanning order.\nThe reordered point tokens are then sent to a series of Mamba blocks to\ncausally capture the point cloud structure. 
Experimental results show our\nproposed PointMamba outperforms the transformer-based counterparts on different\npoint cloud analysis datasets, while significantly saving about 44.3%\nparameters and 25% FLOPs, demonstrating the potential option for constructing\nfoundational 3D vision models. We hope our PointMamba can provide a new\nperspective for point cloud analysis. The code is available at\nhttps://github.com/LMD0311/PointMamba.\n","authors":["Dingkang Liang","Xin Zhou","Xinyu Wang","Xingkui Zhu","Wei Xu","Zhikang Zou","Xiaoqing Ye","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2402.10739v3.pdf","comment":"Work in progress. The code is available at\n https://github.com/LMD0311/PointMamba"},{"id":"http://arxiv.org/abs/2404.01725v1","updated":"2024-04-02T08:21:16Z","published":"2024-04-02T08:21:16Z","title":"Disentangled Pre-training for Human-Object Interaction Detection","summary":" Detecting human-object interaction (HOI) has long been limited by the amount\nof supervised data available. Recent approaches address this issue by\npre-training according to pseudo-labels, which align object regions with HOI\ntriplets parsed from image captions. However, pseudo-labeling is tricky and\nnoisy, making HOI pre-training a complex process. Therefore, we propose an\nefficient disentangled pre-training method for HOI detection (DP-HOI) to\naddress this problem. First, DP-HOI utilizes object detection and action\nrecognition datasets to pre-train the detection and interaction decoder layers,\nrespectively. Then, we arrange these decoder layers so that the pre-training\narchitecture is consistent with the downstream HOI detection task. This\nfacilitates efficient knowledge transfer. Specifically, the detection decoder\nidentifies reliable human instances in each action recognition dataset image,\ngenerates one corresponding query, and feeds it into the interaction decoder\nfor verb classification. Next, we combine the human instance verb predictions\nin the same image and impose image-level supervision. The DP-HOI structure can\nbe easily adapted to the HOI detection task, enabling effective model parameter\ninitialization. Therefore, it significantly enhances the performance of\nexisting HOI detection models on a broad range of rare categories. The code and\npre-trained weight are available at https://github.com/xingaoli/DP-HOI.\n","authors":["Zhuolong Li","Xingao Li","Changxing Ding","Xiangmin Xu"],"pdf_url":"https://arxiv.org/pdf/2404.01725v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.01723v1","updated":"2024-04-02T08:17:39Z","published":"2024-04-02T08:17:39Z","title":"Contextual Embedding Learning to Enhance 2D Networks for Volumetric\n Image Segmentation","summary":" The segmentation of organs in volumetric medical images plays an important\nrole in computer-aided diagnosis and treatment/surgery planning. Conventional\n2D convolutional neural networks (CNNs) can hardly exploit the spatial\ncorrelation of volumetric data. Current 3D CNNs have the advantage to extract\nmore powerful volumetric representations but they usually suffer from occupying\nexcessive memory and computation nevertheless. In this study we aim to enhance\nthe 2D networks with contextual information for better volumetric image\nsegmentation. Accordingly, we propose a contextual embedding learning approach\nto facilitate 2D CNNs capturing spatial information properly. Our approach\nleverages the learned embedding and the slice-wisely neighboring matching as a\nsoft cue to guide the network. 
In such a way, the contextual information can be\ntransferred slice-by-slice thus boosting the volumetric representation of the\nnetwork. Experiments on challenging prostate MRI dataset (PROMISE12) and\nabdominal CT dataset (CHAOS) show that our contextual embedding learning can\neffectively leverage the inter-slice context and improve segmentation\nperformance. The proposed approach is a plug-and-play, and memory-efficient\nsolution to enhance the 2D networks for volumetric segmentation. The code will\nbe publicly available.\n","authors":["Zhuoyuan Wang","Dong Sun","Xiangyun Zeng","Ruodai Wu","Yi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.01723v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.12113v4","updated":"2024-04-02T08:14:57Z","published":"2023-08-23T13:06:59Z","title":"Advancements in Point Cloud Data Augmentation for Deep Learning: A\n Survey","summary":" Deep learning (DL) has become one of the mainstream and effective methods for\npoint cloud analysis tasks such as detection, segmentation and classification.\nTo reduce overfitting during training DL models and improve model performance\nespecially when the amount and/or diversity of training data are limited,\naugmentation is often crucial. Although various point cloud data augmentation\nmethods have been widely used in different point cloud processing tasks, there\nare currently no published systematic surveys or reviews of these methods.\nTherefore, this article surveys these methods, categorizing them into a\ntaxonomy framework that comprises basic and specialized point cloud data\naugmentation methods. Through a comprehensive evaluation of these augmentation\nmethods, this article identifies their potentials and limitations, serving as a\nuseful reference for choosing appropriate augmentation methods. In addition,\npotential directions for future research are recommended. This survey\ncontributes to providing a holistic overview of the current state of point\ncloud data augmentation, promoting its wider application and development.\n","authors":["Qinfeng Zhu","Lei Fan","Ningxin Weng"],"pdf_url":"https://arxiv.org/pdf/2308.12113v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01717v1","updated":"2024-04-02T08:07:38Z","published":"2024-04-02T08:07:38Z","title":"AddSR: Accelerating Diffusion-based Blind Super-Resolution with\n Adversarial Diffusion Distillation","summary":" Blind super-resolution methods based on stable diffusion showcase formidable\ngenerative capabilities in reconstructing clear high-resolution images with\nintricate details from low-resolution inputs. However, their practical\napplicability is often hampered by poor efficiency, stemming from the\nrequirement of thousands or hundreds of sampling steps. Inspired by the\nefficient text-to-image approach adversarial diffusion distillation (ADD), we\ndesign AddSR to address this issue by incorporating the ideas of both\ndistillation and ControlNet. Specifically, we first propose a prediction-based\nself-refinement strategy to provide high-frequency information in the student\nmodel output with marginal additional time cost. Furthermore, we refine the\ntraining process by employing HR images, rather than LR images, to regulate the\nteacher model, providing a more robust constraint for distillation. Second, we\nintroduce a timestep-adapting loss to address the perception-distortion\nimbalance problem introduced by ADD. 
Extensive experiments demonstrate our\nAddSR generates better restoration results, while achieving faster speed than\nprevious SD-based state-of-the-art models (e.g., 7x faster than SeeSR).\n","authors":["Rui Xie","Ying Tai","Kai Zhang","Zhenyu Zhang","Jun Zhou","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2404.01717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01714v1","updated":"2024-04-02T07:57:17Z","published":"2024-04-02T07:57:17Z","title":"Conjugate-Gradient-like Based Adaptive Moment Estimation Optimization\n Algorithm for Deep Learning","summary":" Training deep neural networks is a challenging task. In order to speed up\ntraining and enhance the performance of deep neural networks, we rectify the\nvanilla conjugate gradient as conjugate-gradient-like and incorporate it into\nthe generic Adam, and thus propose a new optimization algorithm named\nCG-like-Adam for deep learning. Specifically, both the first-order and the\nsecond-order moment estimation of generic Adam are replaced by the\nconjugate-gradient-like. Convergence analysis handles the cases where the\nexponential moving average coefficient of the first-order moment estimation is\nconstant and the first-order moment estimation is unbiased. Numerical\nexperiments show the superiority of the proposed algorithm based on the\nCIFAR10/100 dataset.\n","authors":["Jiawu Tian","Liwei Xu","Xiaowei Zhang","Yongqi Li"],"pdf_url":"https://arxiv.org/pdf/2404.01714v1.pdf","comment":"32 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.01709v1","updated":"2024-04-02T07:49:08Z","published":"2024-04-02T07:49:08Z","title":"Upsample Guidance: Scale Up Diffusion Models without Training","summary":" Diffusion models have demonstrated superior performance across various\ngenerative tasks including images, videos, and audio. However, they encounter\ndifficulties in directly generating high-resolution samples. Previously\nproposed solutions to this issue involve modifying the architecture, further\ntraining, or partitioning the sampling process into multiple stages. These\nmethods have the limitation of not being able to directly utilize pre-trained\nmodels as-is, requiring additional work. In this paper, we introduce upsample\nguidance, a technique that adapts pretrained diffusion model (e.g., $512^2$) to\ngenerate higher-resolution images (e.g., $1536^2$) by adding only a single term\nin the sampling process. Remarkably, this technique does not necessitate any\nadditional training or relying on external models. We demonstrate that upsample\nguidance can be applied to various models, such as pixel-space, latent space,\nand video diffusion models. We also observed that the proper selection of\nguidance scale can improve image quality, fidelity, and prompt alignment.\n","authors":["Juno Hwang","Yong-Hyun Park","Junghyo Jo"],"pdf_url":"https://arxiv.org/pdf/2404.01709v1.pdf","comment":"15 pages, 15 Figures"},{"id":"http://arxiv.org/abs/2404.01705v1","updated":"2024-04-02T07:38:16Z","published":"2024-04-02T07:38:16Z","title":"Samba: Semantic Segmentation of Remotely Sensed Images with State Space\n Model","summary":" High-resolution remotely sensed images poses a challenge for commonly used\nsemantic segmentation methods such as Convolutional Neural Network (CNN) and\nVision Transformer (ViT). CNN-based methods struggle with handling such\nhigh-resolution images due to their limited receptive field, while ViT faces\nchallenges to handle long sequences. 
Inspired by Mamba, which adopts a State\nSpace Model (SSM) to efficiently capture global semantic information, we\npropose a semantic segmentation framework for high-resolution remotely sensed\nimages, named Samba. Samba utilizes an encoder-decoder architecture, with Samba\nblocks serving as the encoder for efficient multi-level semantic information\nextraction, and UperNet functioning as the decoder. We evaluate Samba on the\nLoveDA dataset, comparing its performance against top-performing CNN and ViT\nmethods. The results reveal that Samba achieved unparalleled performance on\nLoveDA. This represents that the proposed Samba is an effective application of\nthe SSM in semantic segmentation of remotely sensed images, setting a new\nbenchmark in performance for Mamba-based techniques in this specific\napplication. The source code and baseline implementations are available at\nhttps://github.com/zhuqinfeng1999/Samba.\n","authors":["Qinfeng Zhu","Yuanzhi Cai","Yuan Fang","Yihan Yang","Cheng Chen","Lei Fan","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.01705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01703v1","updated":"2024-04-02T07:16:56Z","published":"2024-04-02T07:16:56Z","title":"Boosting Visual Recognition for Autonomous Driving in Real-world\n Degradations with Deep Channel Prior","summary":" The environmental perception of autonomous vehicles in normal conditions have\nachieved considerable success in the past decade. However, various unfavourable\nconditions such as fog, low-light, and motion blur will degrade image quality\nand pose tremendous threats to the safety of autonomous driving. That is, when\napplied to degraded images, state-of-the-art visual models often suffer\nperformance decline due to the feature content loss and artifact interference\ncaused by statistical and structural properties disruption of captured images.\nTo address this problem, this work proposes a novel Deep Channel Prior (DCP)\nfor degraded visual recognition. Specifically, we observe that, in the deep\nrepresentation space of pre-trained models, the channel correlations of\ndegraded features with the same degradation type have uniform distribution even\nif they have different content and semantics, which can facilitate the mapping\nrelationship learning between degraded and clear representations in\nhigh-sparsity feature space. Based on this, a novel plug-and-play Unsupervised\nFeature Enhancement Module (UFEM) is proposed to achieve unsupervised feature\ncorrection, where the multi-adversarial mechanism is introduced in the first\nstage of UFEM to achieve the latent content restoration and artifact removal in\nhigh-sparsity feature space. Then, the generated features are transferred to\nthe second stage for global correlation modulation under the guidance of DCP to\nobtain high-quality and recognition-friendly features. Evaluations of three\ntasks and eight benchmark datasets demonstrate that our proposed method can\ncomprehensively improve the performance of pre-trained models in real\ndegradation conditions. 
The source code is available at\nhttps://github.com/liyuhang166/Deep_Channel_Prior\n","authors":["Zhanwen Liu","Yuhang Li","Yang Wang","Bolin Gao","Yisheng An","Xiangmo Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.01703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00521v2","updated":"2024-04-02T07:15:34Z","published":"2024-03-31T01:41:36Z","title":"CHAIN: Enhancing Generalization in Data-Efficient GANs via lipsCHitz\n continuity constrAIned Normalization","summary":" Generative Adversarial Networks (GANs) significantly advanced image\ngeneration but their performance heavily depends on abundant training data. In\nscenarios with limited data, GANs often struggle with discriminator overfitting\nand unstable training. Batch Normalization (BN), despite being known for\nenhancing generalization and training stability, has rarely been used in the\ndiscriminator of Data-Efficient GANs. Our work addresses this gap by\nidentifying a critical flaw in BN: the tendency for gradient explosion during\nthe centering and scaling steps. To tackle this issue, we present CHAIN\n(lipsCHitz continuity constrAIned Normalization), which replaces the\nconventional centering step with zero-mean regularization and integrates a\nLipschitz continuity constraint in the scaling step. CHAIN further enhances GAN\ntraining by adaptively interpolating the normalized and unnormalized features,\neffectively avoiding discriminator overfitting. Our theoretical analyses firmly\nestablishes CHAIN's effectiveness in reducing gradients in latent features and\nweights, improving stability and generalization in GAN training. Empirical\nevidence supports our theory. CHAIN achieves state-of-the-art results in\ndata-limited scenarios on CIFAR-10/100, ImageNet, five low-shot and seven\nhigh-resolution few-shot image datasets.\n","authors":["Yao Ni","Piotr Koniusz"],"pdf_url":"https://arxiv.org/pdf/2404.00521v2.pdf","comment":"Accepted by CVPR2024, 26 pages full version"},{"id":"http://arxiv.org/abs/2404.01700v1","updated":"2024-04-02T07:09:29Z","published":"2024-04-02T07:09:29Z","title":"MotionChain: Conversational Motion Controllers via Multimodal Prompts","summary":" Recent advancements in language models have demonstrated their adeptness in\nconducting multi-turn dialogues and retaining conversational context. However,\nthis proficiency remains largely unexplored in other multimodal generative\nmodels, particularly in human motion models. By integrating multi-turn\nconversations in controlling continuous virtual human movements, generative\nhuman motion models can achieve an intuitive and step-by-step process of human\ntask execution for humanoid robotics, game agents, or other embodied systems.\nIn this work, we present MotionChain, a conversational human motion controller\nto generate continuous and long-term human motion through multimodal prompts.\nSpecifically, MotionChain consists of multi-modal tokenizers that transform\nvarious data types such as text, image, and motion, into discrete tokens,\ncoupled with a Vision-Motion-aware Language model. By leveraging large-scale\nlanguage, vision-language, and vision-motion data to assist motion-related\ngeneration tasks, MotionChain thus comprehends each instruction in multi-turn\nconversation and generates human motions followed by these prompts. 
Extensive\nexperiments validate the efficacy of MotionChain, demonstrating\nstate-of-the-art performance in conversational motion generation, as well as\nmore intuitive manners of controlling and interacting with virtual humans.\n","authors":["Biao Jiang","Xin Chen","Chi Zhang","Fukun Yin","Zhuoyuan Li","Gang YU","Jiayuan Fan"],"pdf_url":"https://arxiv.org/pdf/2404.01700v1.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.01699v1","updated":"2024-04-02T07:08:15Z","published":"2024-04-02T07:08:15Z","title":"Task Integration Distillation for Object Detectors","summary":" Knowledge distillation is a widely adopted technique for model lightening.\nHowever, the performance of most knowledge distillation methods in the domain\nof object detection is not satisfactory. Typically, knowledge distillation\napproaches consider only the classification task among the two sub-tasks of an\nobject detector, largely overlooking the regression task. This oversight leads\nto a partial understanding of the object detector's comprehensive task,\nresulting in skewed estimations and potentially adverse effects. Therefore, we\npropose a knowledge distillation method that addresses both the classification\nand regression tasks, incorporating a task significance strategy. By evaluating\nthe importance of features based on the output of the detector's two sub-tasks,\nour approach ensures a balanced consideration of both classification and\nregression tasks in object detection. Drawing inspiration from real-world\nteaching processes and the definition of learning condition, we introduce a\nmethod that focuses on both key and weak areas. By assessing the value of\nfeatures for knowledge distillation based on their importance differences, we\naccurately capture the current model's learning situation. This method\neffectively prevents the issue of biased predictions about the model's learning\nreality caused by an incomplete utilization of the detector's outputs.\n","authors":["Hai Su","ZhenWen Jian","Songsen Yu"],"pdf_url":"https://arxiv.org/pdf/2404.01699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14828v2","updated":"2024-04-02T06:55:59Z","published":"2024-01-26T12:57:05Z","title":"TIP-Editor: An Accurate 3D Editor Following Both Text-Prompts And\n Image-Prompts","summary":" Text-driven 3D scene editing has gained significant attention owing to its\nconvenience and user-friendliness. However, existing methods still lack\naccurate control of the specified appearance and location of the editing result\ndue to the inherent limitations of the text description. To this end, we\npropose a 3D scene editing framework, TIPEditor, that accepts both text and\nimage prompts and a 3D bounding box to specify the editing region. With the\nimage prompt, users can conveniently specify the detailed appearance/style of\nthe target content in complement to the text description, enabling accurate\ncontrol of the appearance. Specifically, TIP-Editor employs a stepwise 2D\npersonalization strategy to better learn the representation of the existing\nscene and the reference image, in which a localization loss is proposed to\nencourage correct object placement as specified by the bounding box.\nAdditionally, TIPEditor utilizes explicit and flexible 3D Gaussian splatting as\nthe 3D representation to facilitate local editing while keeping the background\nunchanged. 
Extensive experiments have demonstrated that TIP-Editor conducts\naccurate editing following the text and image prompts in the specified bounding\nbox region, consistently outperforming the baselines in editing quality, and\nthe alignment to the prompts, qualitatively and quantitatively.\n","authors":["Jingyu Zhuang","Di Kang","Yan-Pei Cao","Guanbin Li","Liang Lin","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2401.14828v2.pdf","comment":"Accpeted by Siggraph 2024 & ACM Transactions on Graphics"},{"id":"http://arxiv.org/abs/2404.01692v1","updated":"2024-04-02T06:52:31Z","published":"2024-04-02T06:52:31Z","title":"Beyond Image Super-Resolution for Image Recognition with Task-Driven\n Perceptual Loss","summary":" In real-world scenarios, image recognition tasks, such as semantic\nsegmentation and object detection, often pose greater challenges due to the\nlack of information available within low-resolution (LR) content. Image\nsuper-resolution (SR) is one of the promising solutions for addressing the\nchallenges. However, due to the ill-posed property of SR, it is challenging for\ntypical SR methods to restore task-relevant high-frequency contents, which may\ndilute the advantage of utilizing the SR method. Therefore, in this paper, we\npropose Super-Resolution for Image Recognition (SR4IR) that effectively guides\nthe generation of SR images beneficial to achieving satisfactory image\nrecognition performance when processing LR images. The critical component of\nour SR4IR is the task-driven perceptual (TDP) loss that enables the SR network\nto acquire task-specific knowledge from a network tailored for a specific task.\nMoreover, we propose a cross-quality patch mix and an alternate training\nframework that significantly enhances the efficacy of the TDP loss by\naddressing potential problems when employing the TDP loss. Through extensive\nexperiments, we demonstrate that our SR4IR achieves outstanding task\nperformance by generating SR images useful for a specific image recognition\ntask, including semantic segmentation, object detection, and image\nclassification. The implementation code is available at\nhttps://github.com/JaehaKim97/SR4IR.\n","authors":["Jaeha Kim","Junghun Oh","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2404.01692v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01690v1","updated":"2024-04-02T06:49:38Z","published":"2024-04-02T06:49:38Z","title":"RefQSR: Reference-based Quantization for Image Super-Resolution Networks","summary":" Single image super-resolution (SISR) aims to reconstruct a high-resolution\nimage from its low-resolution observation. Recent deep learning-based SISR\nmodels show high performance at the expense of increased computational costs,\nlimiting their use in resource-constrained environments. As a promising\nsolution for computationally efficient network design, network quantization has\nbeen extensively studied. However, existing quantization methods developed for\nSISR have yet to effectively exploit image self-similarity, which is a new\ndirection for exploration in this study. We introduce a novel method called\nreference-based quantization for image super-resolution (RefQSR) that applies\nhigh-bit quantization to several representative patches and uses them as\nreferences for low-bit quantization of the rest of the patches in an image. To\nthis end, we design dedicated patch clustering and reference-based quantization\nmodules and integrate them into existing SISR network quantization methods. 
The\nexperimental results demonstrate the effectiveness of RefQSR on various SISR\nnetworks and quantization methods.\n","authors":["Hongjae Lee","Jun-Sang Yoo","Seung-Won Jung"],"pdf_url":"https://arxiv.org/pdf/2404.01690v1.pdf","comment":"Accepted by IEEE Transactions on Image Processing (TIP)"},{"id":"http://arxiv.org/abs/2404.01686v1","updated":"2024-04-02T06:43:22Z","published":"2024-04-02T06:43:22Z","title":"JRDB-PanoTrack: An Open-world Panoptic Segmentation and Tracking Robotic\n Dataset in Crowded Human Environments","summary":" Autonomous robot systems have attracted increasing research attention in\nrecent years, where environment understanding is a crucial step for robot\nnavigation, human-robot interaction, and decision. Real-world robot systems\nusually collect visual data from multiple sensors and are required to recognize\nnumerous objects and their movements in complex human-crowded settings.\nTraditional benchmarks, with their reliance on single sensors and limited\nobject classes and scenarios, fail to provide the comprehensive environmental\nunderstanding robots need for accurate navigation, interaction, and\ndecision-making. As an extension of JRDB dataset, we unveil JRDB-PanoTrack, a\nnovel open-world panoptic segmentation and tracking benchmark, towards more\ncomprehensive environmental perception. JRDB-PanoTrack includes (1) various\ndata involving indoor and outdoor crowded scenes, as well as comprehensive 2D\nand 3D synchronized data modalities; (2) high-quality 2D spatial panoptic\nsegmentation and temporal tracking annotations, with additional 3D label\nprojections for further spatial understanding; (3) diverse object classes for\nclosed- and open-world recognition benchmarks, with OSPA-based metrics for\nevaluation. Extensive evaluation of leading methods shows significant\nchallenges posed by our dataset.\n","authors":["Duy-Tho Le","Chenhui Gou","Stavya Datta","Hengcan Shi","Ian Reid","Jianfei Cai","Hamid Rezatofighi"],"pdf_url":"https://arxiv.org/pdf/2404.01686v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.07711v3","updated":"2024-04-02T06:38:18Z","published":"2024-03-12T14:53:56Z","title":"SSM Meets Video Diffusion Models: Efficient Video Generation with\n Structured State Spaces","summary":" Given the remarkable achievements in image generation through diffusion\nmodels, the research community has shown increasing interest in extending these\nmodels to video generation. Recent diffusion models for video generation have\npredominantly utilized attention layers to extract temporal features. However,\nattention layers are limited by their memory consumption, which increases\nquadratically with the length of the sequence. This limitation presents\nsignificant challenges when attempting to generate longer video sequences using\ndiffusion models. To overcome this challenge, we propose leveraging state-space\nmodels (SSMs). SSMs have recently gained attention as viable alternatives due\nto their linear memory consumption relative to sequence length. In the\nexperiments, we first evaluate our SSM-based model with UCF101, a standard\nbenchmark of video generation. In addition, to investigate the potential of\nSSMs for longer video generation, we perform an experiment using the MineRL\nNavigate dataset, varying the number of frames to 64, 200, and 400. In these\nsettings, our SSM-based model can considerably save memory consumption for\nlonger sequences, while maintaining competitive FVD scores to the\nattention-based models. 
Our codes are available at\nhttps://github.com/shim0114/SSM-Meets-Video-Diffusion-Models.\n","authors":["Yuta Oshima","Shohei Taniguchi","Masahiro Suzuki","Yutaka Matsuo"],"pdf_url":"https://arxiv.org/pdf/2403.07711v3.pdf","comment":"Accepted as a workshop paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2404.01674v1","updated":"2024-04-02T06:25:16Z","published":"2024-04-02T06:25:16Z","title":"PRISM-TopoMap: Online Topological Mapping with Place Recognition and\n Scan Matching","summary":" Mapping is one of the crucial tasks enabling autonomous navigation of a\nmobile robot. Conventional mapping methods output dense geometric map\nrepresentation, e.g. an occupancy grid, which is not trivial to keep consistent\nfor the prolonged runs covering large environments. Meanwhile, capturing the\ntopological structure of the workspace enables fast path planning, is less\nprone to odometry error accumulation and does not consume much memory.\nFollowing this idea, this paper introduces PRISM-TopoMap -- a topological\nmapping method that maintains a graph of locally aligned locations not relying\non global metric coordinates. The proposed method involves learnable multimodal\nplace recognition paired with the scan matching pipeline for localization and\nloop closure in the graph of locations. The latter is updated online and the\nrobot is localized in a proper node at each time step. We conduct a broad\nexperimental evaluation of the suggested approach in a range of photo-realistic\nenvironments and on a real robot (wheeled differential driven Husky robot), and\ncompare it to state of the art. The results of the empirical evaluation confirm\nthat PRISM-Topomap consistently outperforms competitors across several measures\nof mapping and navigation efficiency and performs well on a real robot. The\ncode of PRISM-Topomap is open-sourced and available at\nhttps://github.com/kirillMouraviev/prism-topomap.\n","authors":["Kirill Muravyev","Alexander Melekhin","Dmitriy Yudin","Konstantin Yakovlev"],"pdf_url":"https://arxiv.org/pdf/2404.01674v1.pdf","comment":"This is a pre-print of the paper submitted to an IROS 2024 conference"},{"id":"http://arxiv.org/abs/2404.01673v1","updated":"2024-04-02T06:24:21Z","published":"2024-04-02T06:24:21Z","title":"A Universal Knowledge Embedded Contrastive Learning Framework for\n Hyperspectral Image Classification","summary":" Hyperspectral image (HSI) classification techniques have been intensively\nstudied and a variety of models have been developed. However, these HSI\nclassification models are confined to pocket models and unrealistic ways of\ndatasets partitioning. The former limits the generalization performance of the\nmodel and the latter is partitioned leads to inflated model evaluation metrics,\nwhich results in plummeting model performance in the real world. Therefore, we\npropose a universal knowledge embedded contrastive learning framework (KnowCL)\nfor supervised, unsupervised, and semisupervised HSI classification, which\nlargely closes the gap of HSI classification models between pocket models and\nstandard vision backbones. We present a new HSI processing pipeline in\nconjunction with a range of data transformation and augmentation techniques\nthat provide diverse data representations and realistic data partitioning. The\nproposed framework based on this pipeline is compatible with all kinds of\nbackbones and can fully exploit labeled and unlabeled samples with expected\ntraining time. 
Furthermore, we design a new loss function, which can adaptively\nfuse the supervised loss and unsupervised loss, enhancing the learning\nperformance. This proposed new classification paradigm shows great potentials\nin exploring for HSI classification technology. The code can be accessed at\nhttps://github.com/quanweiliu/KnowCL.\n","authors":["Quanwei Liu","Yanni Dong","Tao Huang","Lefei Zhang","Bo Do"],"pdf_url":"https://arxiv.org/pdf/2404.01673v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16416v4","updated":"2024-04-02T06:17:34Z","published":"2024-01-29T18:55:29Z","title":"Endo-4DGS: Endoscopic Monocular Scene Reconstruction with 4D Gaussian\n Splatting","summary":" In the realm of robot-assisted minimally invasive surgery, dynamic scene\nreconstruction can significantly enhance downstream tasks and improve surgical\noutcomes. Neural Radiance Fields (NeRF)-based methods have recently risen to\nprominence for their exceptional ability to reconstruct scenes but are hampered\nby slow inference speed, prolonged training, and inconsistent depth estimation.\nSome previous work utilizes ground truth depth for optimization but is hard to\nacquire in the surgical domain. To overcome these obstacles, we present\nEndo-4DGS, a real-time endoscopic dynamic reconstruction approach that utilizes\n3D Gaussian Splatting (GS) for 3D representation. Specifically, we propose\nlightweight MLPs to capture temporal dynamics with Gaussian deformation fields.\nTo obtain a satisfactory Gaussian Initialization, we exploit a powerful depth\nestimation foundation model, Depth-Anything, to generate pseudo-depth maps as a\ngeometry prior. We additionally propose confidence-guided learning to tackle\nthe ill-pose problems in monocular depth estimation and enhance the\ndepth-guided reconstruction with surface normal constraints and depth\nregularization. Our approach has been validated on two surgical datasets, where\nit can effectively render in real-time, compute efficiently, and reconstruct\nwith remarkable accuracy.\n","authors":["Yiming Huang","Beilei Cui","Long Bai","Ziqi Guo","Mengya Xu","Mobarakol Islam","Hongliang Ren"],"pdf_url":"https://arxiv.org/pdf/2401.16416v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00658v2","updated":"2024-04-02T06:15:15Z","published":"2024-03-31T12:04:27Z","title":"KTPFormer: Kinematics and Trajectory Prior Knowledge-Enhanced\n Transformer for 3D Human Pose Estimation","summary":" This paper presents a novel Kinematics and Trajectory Prior\nKnowledge-Enhanced Transformer (KTPFormer), which overcomes the weakness in\nexisting transformer-based methods for 3D human pose estimation that the\nderivation of Q, K, V vectors in their self-attention mechanisms are all based\non simple linear mapping. We propose two prior attention modules, namely\nKinematics Prior Attention (KPA) and Trajectory Prior Attention (TPA) to take\nadvantage of the known anatomical structure of the human body and motion\ntrajectory information, to facilitate effective learning of global dependencies\nand features in the multi-head self-attention. KPA models kinematic\nrelationships in the human body by constructing a topology of kinematics, while\nTPA builds a trajectory topology to learn the information of joint motion\ntrajectory across frames. Yielding Q, K, V vectors with prior knowledge, the\ntwo modules enable KTPFormer to model both spatial and temporal correlations\nsimultaneously. 
Extensive experiments on three benchmarks (Human3.6M,\nMPI-INF-3DHP and HumanEva) show that KTPFormer achieves superior performance in\ncomparison to state-of-the-art methods. More importantly, our KPA and TPA\nmodules have lightweight plug-and-play designs and can be integrated into\nvarious transformer-based networks (i.e., diffusion-based) to improve the\nperformance with only a very small increase in the computational overhead. The\ncode is available at: https://github.com/JihuaPeng/KTPFormer.\n","authors":["Jihua Peng","Yanghong Zhou","P. Y. Mok"],"pdf_url":"https://arxiv.org/pdf/2404.00658v2.pdf","comment":"Accepted by CVPR 2024,GitHub\n code:https://github.com/JihuaPeng/KTPFormer"},{"id":"http://arxiv.org/abs/2403.19976v2","updated":"2024-04-02T06:03:32Z","published":"2024-03-29T04:58:56Z","title":"eTraM: Event-based Traffic Monitoring Dataset","summary":" Event cameras, with their high temporal and dynamic range and minimal memory\nusage, have found applications in various fields. However, their potential in\nstatic traffic monitoring remains largely unexplored. To facilitate this\nexploration, we present eTraM - a first-of-its-kind, fully event-based traffic\nmonitoring dataset. eTraM offers 10 hr of data from different traffic scenarios\nin various lighting and weather conditions, providing a comprehensive overview\nof real-world situations. Providing 2M bounding box annotations, it covers\neight distinct classes of traffic participants, ranging from vehicles to\npedestrians and micro-mobility. eTraM's utility has been assessed using\nstate-of-the-art methods for traffic participant detection, including RVT, RED,\nand YOLOv8. We quantitatively evaluate the ability of event-based models to\ngeneralize on nighttime and unseen scenes. Our findings substantiate the\ncompelling potential of leveraging event cameras for traffic monitoring,\nopening new avenues for research and application. eTraM is available at\nhttps://eventbasedvision.github.io/eTraM\n","authors":["Aayush Atul Verma","Bharatesh Chakravarthi","Arpitsinh Vaghela","Hua Wei","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19976v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12468v2","updated":"2024-04-02T06:00:11Z","published":"2023-12-19T07:05:39Z","title":"MaskINT: Video Editing via Interpolative Non-autoregressive Masked\n Transformers","summary":" Recent advances in generative AI have significantly enhanced image and video\nediting, particularly in the context of text prompt control. State-of-the-art\napproaches predominantly rely on diffusion models to accomplish these tasks.\nHowever, the computational demands of diffusion-based methods are substantial,\noften necessitating large-scale paired datasets for training, and therefore\nchallenging the deployment in real applications. To address these issues, this\npaper breaks down the text-based video editing task into two stages. First, we\nleverage an pre-trained text-to-image diffusion model to simultaneously edit\nfew keyframes in an zero-shot way. Second, we introduce an efficient model\ncalled MaskINT, which is built on non-autoregressive masked generative\ntransformers and specializes in frame interpolation between the edited\nkeyframes, using the structural guidance from intermediate frames. 
Experimental\nresults suggest that our MaskINT achieves comparable performance with\ndiffusion-based methodologies, while significantly improve the inference time.\nThis research offers a practical solution for text-based video editing and\nshowcases the potential of non-autoregressive masked generative transformers in\nthis domain.\n","authors":["Haoyu Ma","Shahin Mahdizadehaghdam","Bichen Wu","Zhipeng Fan","Yuchao Gu","Wenliang Zhao","Lior Shapira","Xiaohui Xie"],"pdf_url":"https://arxiv.org/pdf/2312.12468v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01657v1","updated":"2024-04-02T05:59:43Z","published":"2024-04-02T05:59:43Z","title":"Release of Pre-Trained Models for the Japanese Language","summary":" AI democratization aims to create a world in which the average person can\nutilize AI techniques. To achieve this goal, numerous research institutes have\nattempted to make their results accessible to the public. In particular, large\npre-trained models trained on large-scale data have shown unprecedented\npotential, and their release has had a significant impact. However, most of the\nreleased models specialize in the English language, and thus, AI\ndemocratization in non-English-speaking communities is lagging significantly.\nTo reduce this gap in AI access, we released Generative Pre-trained Transformer\n(GPT), Contrastive Language and Image Pre-training (CLIP), Stable Diffusion,\nand Hidden-unit Bidirectional Encoder Representations from Transformers\n(HuBERT) pre-trained in Japanese. By providing these models, users can freely\ninterface with AI that aligns with Japanese cultural values and ensures the\nidentity of Japanese culture, thus enhancing the democratization of AI.\nAdditionally, experiments showed that pre-trained models specialized for\nJapanese can efficiently achieve high performance in Japanese tasks.\n","authors":["Kei Sawada","Tianyu Zhao","Makoto Shing","Kentaro Mitsui","Akio Kaga","Yukiya Hono","Toshiaki Wakatsuki","Koh Mitsuda"],"pdf_url":"https://arxiv.org/pdf/2404.01657v1.pdf","comment":"9 pages, 1 figure, 5 tables, accepted for LREC-COLING 2024. Models\n are publicly available at https://huggingface.co/rinna"},{"id":"http://arxiv.org/abs/2404.01656v1","updated":"2024-04-02T05:57:35Z","published":"2024-04-02T05:57:35Z","title":"Supporting Mitosis Detection AI Training with Inter-Observer Eye-Gaze\n Consistencies","summary":" The expansion of artificial intelligence (AI) in pathology tasks has\nintensified the demand for doctors' annotations in AI development. However,\ncollecting high-quality annotations from doctors is costly and time-consuming,\ncreating a bottleneck in AI progress. This study investigates eye-tracking as a\ncost-effective technology to collect doctors' behavioral data for AI training\nwith a focus on the pathology task of mitosis detection. One major challenge in\nusing eye-gaze data is the low signal-to-noise ratio, which hinders the\nextraction of meaningful information. We tackled this by levering the\nproperties of inter-observer eye-gaze consistencies and creating eye-gaze\nlabels from consistent eye-fixations shared by a group of observers. Our study\ninvolved 14 non-medical participants, from whom we collected eye-gaze data and\ngenerated eye-gaze labels based on varying group sizes. We assessed the\nefficacy of such eye-gaze labels by training Convolutional Neural Networks\n(CNNs) and comparing their performance to those trained with ground truth\nannotations and a heuristic-based baseline. 
Results indicated that CNNs trained\nwith our eye-gaze labels closely followed the performance of ground-truth-based\nCNNs, and significantly outperformed the baseline. Although primarily focused\non mitosis, we envision that insights from this study can be generalized to\nother medical imaging tasks.\n","authors":["Hongyan Gu","Zihan Yan","Ayesha Alvi","Brandon Day","Chunxu Yang","Zida Wu","Shino Magaki","Mohammad Haeri","Xiang 'Anthony' Chen"],"pdf_url":"https://arxiv.org/pdf/2404.01656v1.pdf","comment":"Accepted by IEEE International Conference on Healthcare Informatics\n 2024"},{"id":"http://arxiv.org/abs/2404.01655v1","updated":"2024-04-02T05:56:17Z","published":"2024-04-02T05:56:17Z","title":"FashionEngine: Interactive Generation and Editing of 3D Clothed Humans","summary":" We present FashionEngine, an interactive 3D human generation and editing\nsystem that allows us to design 3D digital humans in a way that aligns with how\nhumans interact with the world, such as natural languages, visual perceptions,\nand hand-drawing. FashionEngine automates the 3D human production with three\nkey components: 1) A pre-trained 3D human diffusion model that learns to model\n3D humans in a semantic UV latent space from 2D image training data, which\nprovides strong priors for diverse generation and editing tasks. 2)\nMultimodality-UV Space encoding the texture appearance, shape topology, and\ntextual semantics of human clothing in a canonical UV-aligned space, which\nfaithfully aligns the user multimodal inputs with the implicit UV latent space\nfor controllable 3D human editing. The multimodality-UV space is shared across\ndifferent user inputs, such as texts, images, and sketches, which enables\nvarious joint multimodal editing tasks. 3) Multimodality-UV Aligned Sampler\nlearns to sample high-quality and diverse 3D humans from the diffusion prior\nfor multimodal user inputs. Extensive experiments validate FashionEngine's\nstate-of-the-art performance for conditional generation/editing tasks. In\naddition, we present an interactive user interface for our FashionEngine that\nenables both conditional and unconditional generation tasks, and editing tasks\nincluding pose/view/shape control, text-, image-, and sketch-driven 3D human\nediting and 3D virtual try-on, in a unified framework. Our project page is at:\nhttps://taohuumd.github.io/projects/FashionEngine.\n","authors":["Tao Hu","Fangzhou Hong","Zhaoxi Chen","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2404.01655v1.pdf","comment":"Project Page: https://taohuumd.github.io/projects/FashionEngine"},{"id":"http://arxiv.org/abs/2404.01654v1","updated":"2024-04-02T05:53:34Z","published":"2024-04-02T05:53:34Z","title":"AI WALKUP: A Computer-Vision Approach to Quantifying MDS-UPDRS in\n Parkinson's Disease","summary":" Parkinson's Disease (PD) is the second most common neurodegenerative\ndisorder. The existing assessment method for PD is usually the Movement\nDisorder Society - Unified Parkinson's Disease Rating Scale (MDS-UPDRS) to\nassess the severity of various types of motor symptoms and disease progression.\nHowever, manual assessment suffers from high subjectivity, lack of consistency,\nand high cost and low efficiency of manual communication. We want to use a\ncomputer vision based solution to capture human pose images based on a camera,\nreconstruct and perform motion analysis using algorithms, and extract the\nfeatures of the amount of motion through feature engineering. 
The proposed\napproach can be deployed on different smartphones, and the video recording and\nartificial intelligence analysis can be done quickly and easily through our\nAPP.\n","authors":["Xiang Xiang","Zihan Zhang","Jing Ma","Yao Deng"],"pdf_url":"https://arxiv.org/pdf/2404.01654v1.pdf","comment":"Technical report for AI WALKUP, an APP winning 3rd Prize of 2022 HUST\n GS AI Innovation and Design Competition"},{"id":"http://arxiv.org/abs/2310.16781v3","updated":"2024-04-02T05:50:21Z","published":"2023-10-25T17:15:55Z","title":"Kiki or Bouba? Sound Symbolism in Vision-and-Language Models","summary":" Although the mapping between sound and meaning in human language is assumed\nto be largely arbitrary, research in cognitive science has shown that there are\nnon-trivial correlations between particular sounds and meanings across\nlanguages and demographic groups, a phenomenon known as sound symbolism. Among\nthe many dimensions of meaning, sound symbolism is particularly salient and\nwell-demonstrated with regards to cross-modal associations between language and\nthe visual domain. In this work, we address the question of whether sound\nsymbolism is reflected in vision-and-language models such as CLIP and Stable\nDiffusion. Using zero-shot knowledge probing to investigate the inherent\nknowledge of these models, we find strong evidence that they do show this\npattern, paralleling the well-known kiki-bouba effect in psycholinguistics. Our\nwork provides a novel method for demonstrating sound symbolism and\nunderstanding its nature using computational tools. Our code will be made\npublicly available.\n","authors":["Morris Alper","Hadar Averbuch-Elor"],"pdf_url":"https://arxiv.org/pdf/2310.16781v3.pdf","comment":"Accepted to NeurIPS 2023 (spotlight). Project webpage:\n https://kiki-bouba.github.io/"},{"id":"http://arxiv.org/abs/2312.12470v3","updated":"2024-04-02T05:37:25Z","published":"2023-12-19T08:14:14Z","title":"Rotated Multi-Scale Interaction Network for Referring Remote Sensing\n Image Segmentation","summary":" Referring Remote Sensing Image Segmentation (RRSIS) is a new challenge that\ncombines computer vision and natural language processing, delineating specific\nregions in aerial images as described by textual queries. Traditional Referring\nImage Segmentation (RIS) approaches have been impeded by the complex spatial\nscales and orientations found in aerial imagery, leading to suboptimal\nsegmentation results. To address these challenges, we introduce the Rotated\nMulti-Scale Interaction Network (RMSIN), an innovative approach designed for\nthe unique demands of RRSIS. RMSIN incorporates an Intra-scale Interaction\nModule (IIM) to effectively address the fine-grained detail required at\nmultiple scales and a Cross-scale Interaction Module (CIM) for integrating\nthese details coherently across the network. Furthermore, RMSIN employs an\nAdaptive Rotated Convolution (ARC) to account for the diverse orientations of\nobjects, a novel contribution that significantly enhances segmentation\naccuracy. To assess the efficacy of RMSIN, we have curated an expansive dataset\ncomprising 17,402 image-caption-mask triplets, which is unparalleled in terms\nof scale and variety. 
This dataset not only presents the model with a wide\nrange of spatial and rotational scenarios but also establishes a stringent\nbenchmark for the RRSIS task, ensuring a rigorous evaluation of performance.\nOur experimental evaluations demonstrate the exceptional performance of RMSIN,\nsurpassing existing state-of-the-art models by a significant margin. All\ndatasets and code are made available at https://github.com/Lsan2401/RMSIN.\n","authors":["Sihan Liu","Yiwei Ma","Xiaoqing Zhang","Haowei Wang","Jiayi Ji","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2312.12470v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01647v1","updated":"2024-04-02T05:32:39Z","published":"2024-04-02T05:32:39Z","title":"EDTalk: Efficient Disentanglement for Emotional Talking Head Synthesis","summary":" Achieving disentangled control over multiple facial motions and accommodating\ndiverse input modalities greatly enhances the application and entertainment of\nthe talking head generation. This necessitates a deep exploration of the\ndecoupling space for facial features, ensuring that they a) operate\nindependently without mutual interference and b) can be preserved to share with\ndifferent modal input, both aspects often neglected in existing methods. To\naddress this gap, this paper proposes a novel Efficient Disentanglement\nframework for Talking head generation (EDTalk). Our framework enables\nindividual manipulation of mouth shape, head pose, and emotional expression,\nconditioned on video or audio inputs. Specifically, we employ three lightweight\nmodules to decompose the facial dynamics into three distinct latent spaces\nrepresenting mouth, pose, and expression, respectively. Each space is\ncharacterized by a set of learnable bases whose linear combinations define\nspecific motions. To ensure independence and accelerate training, we enforce\northogonality among bases and devise an efficient training strategy to allocate\nmotion responsibilities to each space without relying on external knowledge.\nThe learned bases are then stored in corresponding banks, enabling shared\nvisual priors with audio input. Furthermore, considering the properties of each\nspace, we propose an Audio-to-Motion module for audio-driven talking head\nsynthesis. Experiments are conducted to demonstrate the effectiveness of\nEDTalk. We recommend watching the project website:\nhttps://tanshuai0219.github.io/EDTalk/\n","authors":["Shuai Tan","Bin Ji","Mengxiao Bi","Ye Pan"],"pdf_url":"https://arxiv.org/pdf/2404.01647v1.pdf","comment":"22 pages, 15 figures"},{"id":"http://arxiv.org/abs/2404.01645v1","updated":"2024-04-02T05:30:39Z","published":"2024-04-02T05:30:39Z","title":"ContrastCAD: Contrastive Learning-based Representation Learning for\n Computer-Aided Design Models","summary":" The success of Transformer-based models has encouraged many researchers to\nlearn CAD models using sequence-based approaches. However, learning CAD models\nis still a challenge, because they can be represented as complex shapes with\nlong construction sequences. Furthermore, the same CAD model can be expressed\nusing different CAD construction sequences. We propose a novel contrastive\nlearning-based approach, named ContrastCAD, that effectively captures semantic\ninformation within the construction sequences of the CAD model. ContrastCAD\ngenerates augmented views using dropout techniques without altering the shape\nof the CAD model. 
We also propose a new CAD data augmentation method, called a\nRandom Replace and Extrude (RRE) method, to enhance the learning performance of\nthe model when training an imbalanced training CAD dataset. Experimental\nresults show that the proposed RRE augmentation method significantly enhances\nthe learning performance of Transformer-based autoencoders, even for complex\nCAD models having very long construction sequences. The proposed ContrastCAD\nmodel is shown to be robust to permutation changes of construction sequences\nand performs better representation learning by generating representation spaces\nwhere similar CAD models are more closely clustered. Our codes are available at\nhttps://github.com/cm8908/ContrastCAD.\n","authors":["Minseop Jung","Minseong Kim","Jibum Kim"],"pdf_url":"https://arxiv.org/pdf/2404.01645v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01390v2","updated":"2024-04-02T05:20:01Z","published":"2023-09-04T06:41:29Z","title":"Bridging the Projection Gap: Overcoming Projection Bias Through\n Parameterized Distance Learning","summary":" Generalized zero-shot learning (GZSL) aims to recognize samples from both\nseen and unseen classes using only seen class samples for training. However,\nGZSL methods are prone to bias towards seen classes during inference due to the\nprojection function being learned from seen classes. Most methods focus on\nlearning an accurate projection, but bias in the projection is inevitable. We\naddress this projection bias by proposing to learn a parameterized Mahalanobis\ndistance metric for robust inference. Our key insight is that the distance\ncomputation during inference is critical, even with a biased projection. We\nmake two main contributions - (1) We extend the VAEGAN (Variational Autoencoder\n\\& Generative Adversarial Networks) architecture with two branches to\nseparately output the projection of samples from seen and unseen classes,\nenabling more robust distance learning. (2) We introduce a novel loss function\nto optimize the Mahalanobis distance representation and reduce projection bias.\nExtensive experiments on four datasets show that our approach outperforms\nstate-of-the-art GZSL techniques with improvements of up to 3.5 \\% on the\nharmonic mean metric.\n","authors":["Chong Zhang","Mingyu Jin","Qinkai Yu","Haochen Xue","Shreyank N Gowda","Xiaobo Jin"],"pdf_url":"https://arxiv.org/pdf/2309.01390v2.pdf","comment":"18 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.01643v1","updated":"2024-04-02T05:19:27Z","published":"2024-04-02T05:19:27Z","title":"A Closer Look at Spatial-Slice Features Learning for COVID-19 Detection","summary":" Conventional Computed Tomography (CT) imaging recognition faces two\nsignificant challenges: (1) There is often considerable variability in the\nresolution and size of each CT scan, necessitating strict requirements for the\ninput size and adaptability of models. (2) CT-scan contains large number of\nout-of-distribution (OOD) slices. The crucial features may only be present in\nspecific spatial regions and slices of the entire CT scan. How can we\neffectively figure out where these are located? To deal with this, we introduce\nan enhanced Spatial-Slice Feature Learning (SSFL++) framework specifically\ndesigned for CT scan. It aim to filter out a OOD data within whole CT scan,\nenabling our to select crucial spatial-slice for analysis by reducing 70%\nredundancy totally. 
Meanwhile, we proposed Kernel-Density-based slice Sampling\n(KDS) method to improve the stability when training and inference stage,\ntherefore speeding up the rate of convergence and boosting performance. As a\nresult, the experiments demonstrate the promising performance of our model\nusing a simple EfficientNet-2D (E2D) model, even with only 1% of the training\ndata. The efficacy of our approach has been validated on the COVID-19-CT-DB\ndatasets provided by the DEF-AI-MIA workshop, in conjunction with CVPR 2024.\nOur source code will be made available.\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Yang Fan Chiang","Yi-Shiuan Chou","Chih-Yu Jiang","Shen-Chieh Tai","Chi-Han Tsai"],"pdf_url":"https://arxiv.org/pdf/2404.01643v1.pdf","comment":"Submitted to DEF-AI-MIA workshop. arXiv admin note: text overlap with\n arXiv:2403.11230"},{"id":"http://arxiv.org/abs/2404.00995v2","updated":"2024-04-02T05:16:55Z","published":"2024-04-01T08:46:35Z","title":"PosterLlama: Bridging Design Ability of Langauge Model to Contents-Aware\n Layout Generation","summary":" Visual layout plays a critical role in graphic design fields such as\nadvertising, posters, and web UI design. The recent trend towards content-aware\nlayout generation through generative models has shown promise, yet it often\noverlooks the semantic intricacies of layout design by treating it as a simple\nnumerical optimization. To bridge this gap, we introduce PosterLlama, a network\ndesigned for generating visually and textually coherent layouts by reformatting\nlayout elements into HTML code and leveraging the rich design knowledge\nembedded within language models. Furthermore, we enhance the robustness of our\nmodel with a unique depth-based poster augmentation strategy. This ensures our\ngenerated layouts remain semantically rich but also visually appealing, even\nwith limited data. Our extensive evaluations across several benchmarks\ndemonstrate that PosterLlama outperforms existing methods in producing\nauthentic and content-aware layouts. It supports an unparalleled range of\nconditions, including but not limited to unconditional layout generation,\nelement conditional layout generation, layout completion, among others, serving\nas a highly versatile user manipulation tool.\n","authors":["Jaejung Seol","Seojun Kim","Jaejun Yoo"],"pdf_url":"https://arxiv.org/pdf/2404.00995v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02957v3","updated":"2024-04-02T05:12:10Z","published":"2023-12-05T18:41:03Z","title":"Classification for everyone : Building geography agnostic models for\n fairer recognition","summary":" In this paper, we analyze different methods to mitigate inherent geographical\nbiases present in state of the art image classification models. We first\nquantitatively present this bias in two datasets - The Dollar Street Dataset\nand ImageNet, using images with location information. We then present different\nmethods which can be employed to reduce this bias. 
Finally, we analyze the\neffectiveness of the different techniques on making these models more robust to\ngeographical locations of the images.\n","authors":["Akshat Jindal","Shreya Singh","Soham Gadgil"],"pdf_url":"https://arxiv.org/pdf/2312.02957v3.pdf","comment":"typos corrected, references added"},{"id":"http://arxiv.org/abs/2309.16585v4","updated":"2024-04-02T05:10:02Z","published":"2023-09-28T16:44:31Z","title":"Text-to-3D using Gaussian Splatting","summary":" Automatic text-to-3D generation that combines Score Distillation Sampling\n(SDS) with the optimization of volume rendering has achieved remarkable\nprogress in synthesizing realistic 3D objects. Yet most existing text-to-3D\nmethods by SDS and volume rendering suffer from inaccurate geometry, e.g., the\nJanus issue, since it is hard to explicitly integrate 3D priors into implicit\n3D representations. Besides, it is usually time-consuming for them to generate\nelaborate 3D models with rich colors. In response, this paper proposes GSGEN, a\nnovel method that adopts Gaussian Splatting, a recent state-of-the-art\nrepresentation, to text-to-3D generation. GSGEN aims at generating high-quality\n3D objects and addressing existing shortcomings by exploiting the explicit\nnature of Gaussian Splatting that enables the incorporation of 3D prior.\nSpecifically, our method adopts a progressive optimization strategy, which\nincludes a geometry optimization stage and an appearance refinement stage. In\ngeometry optimization, a coarse representation is established under 3D point\ncloud diffusion prior along with the ordinary 2D SDS optimization, ensuring a\nsensible and 3D-consistent rough shape. Subsequently, the obtained Gaussians\nundergo an iterative appearance refinement to enrich texture details. In this\nstage, we increase the number of Gaussians by compactness-based densification\nto enhance continuity and improve fidelity. With these designs, our approach\ncan generate 3D assets with delicate details and accurate geometry. Extensive\nevaluations demonstrate the effectiveness of our method, especially for\ncapturing high-frequency components. Our code is available at\nhttps://github.com/gsgen3d/gsgen\n","authors":["Zilong Chen","Feng Wang","Yikai Wang","Huaping Liu"],"pdf_url":"https://arxiv.org/pdf/2309.16585v4.pdf","comment":"To appear at CVPR 2024. Project page: https://gsgen3d.github.io.\n Code: https://github.com/gsgen3d/gsgen"},{"id":"http://arxiv.org/abs/2404.00875v2","updated":"2024-04-02T05:09:25Z","published":"2024-04-01T03:10:36Z","title":"DPA-Net: Structured 3D Abstraction from Sparse Views via Differentiable\n Primitive Assembly","summary":" We present a differentiable rendering framework to learn structured 3D\nabstractions in the form of primitive assemblies from sparse RGB images\ncapturing a 3D object. By leveraging differentiable volume rendering, our\nmethod does not require 3D supervision. Architecturally, our network follows\nthe general pipeline of an image-conditioned neural radiance field (NeRF)\nexemplified by pixelNeRF for color prediction. As our core contribution, we\nintroduce differential primitive assembly (DPA) into NeRF to output a 3D\noccupancy field in place of density prediction, where the predicted occupancies\nserve as opacity values for volume rendering. 
Our network, coined DPA-Net,\nproduces a union of convexes, each as an intersection of convex quadric\nprimitives, to approximate the target 3D object, subject to an abstraction loss\nand a masking loss, both defined in the image space upon volume rendering. With\ntest-time adaptation and additional sampling and loss designs aimed at\nimproving the accuracy and compactness of the obtained assemblies, our method\ndemonstrates superior performance over state-of-the-art alternatives for 3D\nprimitive abstraction from sparse views.\n","authors":["Fenggen Yu","Yiming Qian","Xu Zhang","Francisca Gil-Ureta","Brian Jackson","Eric Bennett","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.00875v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2404.01225v2","updated":"2024-04-02T04:59:48Z","published":"2024-04-01T16:34:27Z","title":"SurMo: Surface-based 4D Motion Modeling for Dynamic Human Rendering","summary":" Dynamic human rendering from video sequences has achieved remarkable progress\nby formulating the rendering as a mapping from static poses to human images.\nHowever, existing methods focus on the human appearance reconstruction of every\nsingle frame while the temporal motion relations are not fully explored. In\nthis paper, we propose a new 4D motion modeling paradigm, SurMo, that jointly\nmodels the temporal dynamics and human appearances in a unified framework with\nthree key designs: 1) Surface-based motion encoding that models 4D human\nmotions with an efficient compact surface-based triplane. It encodes both\nspatial and temporal motion relations on the dense surface manifold of a\nstatistical body template, which inherits body topology priors for\ngeneralizable novel view synthesis with sparse training observations. 2)\nPhysical motion decoding that is designed to encourage physical motion learning\nby decoding the motion triplane features at timestep t to predict both spatial\nderivatives and temporal derivatives at the next timestep t+1 in the training\nstage. 3) 4D appearance decoding that renders the motion triplanes into images\nby an efficient volumetric surface-conditioned renderer that focuses on the\nrendering of body surfaces with motion learning conditioning. Extensive\nexperiments validate the state-of-the-art performance of our new paradigm and\nillustrate the expressiveness of surface-based motion triplanes for rendering\nhigh-fidelity view-consistent humans with fast motions and even\nmotion-dependent shadows. Our project page is at:\nhttps://taohuumd.github.io/projects/SurMo/\n","authors":["Tao Hu","Fangzhou Hong","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2404.01225v2.pdf","comment":"Accepted to CVPR 2024. Project Page:\n https://taohuumd.github.io/projects/SurMo/"},{"id":"http://arxiv.org/abs/2404.01241v2","updated":"2024-04-02T04:56:45Z","published":"2024-04-01T17:00:18Z","title":"StructLDM: Structured Latent Diffusion for 3D Human Generation","summary":" Recent 3D human generative models have achieved remarkable progress by\nlearning 3D-aware GANs from 2D images. However, existing 3D human generative\nmethods model humans in a compact 1D latent space, ignoring the articulated\nstructure and semantics of human body topology. In this paper, we explore more\nexpressive and higher-dimensional latent space for 3D human modeling and\npropose StructLDM, a diffusion-based unconditional 3D human generative model,\nwhich is learned from 2D images. 
StructLDM solves the challenges imposed due to\nthe high-dimensional growth of latent space with three key designs: 1) A\nsemantic structured latent space defined on the dense surface manifold of a\nstatistical human body template. 2) A structured 3D-aware auto-decoder that\nfactorizes the global latent space into several semantic body parts\nparameterized by a set of conditional structured local NeRFs anchored to the\nbody template, which embeds the properties learned from the 2D training data\nand can be decoded to render view-consistent humans under different poses and\nclothing styles. 3) A structured latent diffusion model for generative human\nappearance sampling. Extensive experiments validate StructLDM's\nstate-of-the-art generation performance and illustrate the expressiveness of\nthe structured latent space over the well-adopted 1D latent space. Notably,\nStructLDM enables different levels of controllable 3D human generation and\nediting, including pose/view/shape control, and high-level tasks including\ncompositional generations, part-aware clothing editing, 3D virtual try-on, etc.\nOur project page is at: https://taohuumd.github.io/projects/StructLDM/.\n","authors":["Tao Hu","Fangzhou Hong","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2404.01241v2.pdf","comment":"Project page: https://taohuumd.github.io/projects/StructLDM/"},{"id":"http://arxiv.org/abs/2404.00936v2","updated":"2024-04-02T04:55:27Z","published":"2024-04-01T05:46:15Z","title":"A Comprehensive Review of Knowledge Distillation in Computer Vision","summary":" Deep learning techniques have been demonstrated to surpass preceding\ncutting-edge machine learning techniques in recent years, with computer vision\nbeing one of the most prominent examples. However, deep learning models suffer\nfrom significant drawbacks when deployed in resource-constrained environments\ndue to their large model size and high complexity. Knowledge Distillation is\none of the prominent solutions to overcome this challenge. This review paper\nexamines the current state of research on knowledge distillation, a technique\nfor compressing complex models into smaller and simpler ones. The paper\nprovides an overview of the major principles and techniques associated with\nknowledge distillation and reviews the applications of knowledge distillation\nin the domain of computer vision. The review focuses on the benefits of\nknowledge distillation, as well as the problems that must be overcome to\nimprove its effectiveness.\n","authors":["Sheikh Musa Kaleem","Tufail Rouf","Gousia Habib","Tausifa jan Saleem","Brejesh Lall"],"pdf_url":"https://arxiv.org/pdf/2404.00936v2.pdf","comment":"37 pages ,10 figures"},{"id":"http://arxiv.org/abs/2404.01636v1","updated":"2024-04-02T04:53:39Z","published":"2024-04-02T04:53:39Z","title":"Learning to Control Camera Exposure via Reinforcement Learning","summary":" Adjusting camera exposure in arbitrary lighting conditions is the first step\nto ensure the functionality of computer vision applications. Poorly adjusted\ncamera exposure often leads to critical failure and performance degradation.\nTraditional camera exposure control methods require multiple convergence steps\nand time-consuming processes, making them unsuitable for dynamic lighting\nconditions. In this paper, we propose a new camera exposure control framework\nthat rapidly controls camera exposure while performing real-time processing by\nexploiting deep reinforcement learning. 
The proposed framework consists of four\ncontributions: 1) a simplified training ground to simulate real-world's diverse\nand dynamic lighting changes, 2) flickering and image attribute-aware reward\ndesign, along with lightweight state design for real-time processing, 3) a\nstatic-to-dynamic lighting curriculum to gradually improve the agent's\nexposure-adjusting capability, and 4) domain randomization techniques to\nalleviate the limitation of the training ground and achieve seamless\ngeneralization in the wild.As a result, our proposed method rapidly reaches a\ndesired exposure level within five steps with real-time processing (1 ms).\nAlso, the acquired images are well-exposed and show superiority in various\ncomputer vision tasks, such as feature extraction and object detection.\n","authors":["Kyunghyun Lee","Ukcheol Shin","Byeong-Uk Lee"],"pdf_url":"https://arxiv.org/pdf/2404.01636v1.pdf","comment":"Accepted at CVPR 2024, *First two authors contributed equally to this\n work. Project page link: https://sites.google.com/view/drl-ae"},{"id":"http://arxiv.org/abs/2404.01628v1","updated":"2024-04-02T04:29:01Z","published":"2024-04-02T04:29:01Z","title":"Learning Equi-angular Representations for Online Continual Learning","summary":" Online continual learning suffers from an underfitted solution due to\ninsufficient training for prompt model update (e.g., single-epoch training). To\naddress the challenge, we propose an efficient online continual learning method\nusing the neural collapse phenomenon. In particular, we induce neural collapse\nto form a simplex equiangular tight frame (ETF) structure in the representation\nspace so that the continuously learned model with a single epoch can better fit\nto the streamed data by proposing preparatory data training and residual\ncorrection in the representation space. With an extensive set of empirical\nvalidations using CIFAR-10/100, TinyImageNet, ImageNet-200, and ImageNet-1K, we\nshow that our proposed method outperforms state-of-the-art methods by a\nnoticeable margin in various online continual learning scenarios such as\ndisjoint and Gaussian scheduled continuous (i.e., boundary-free) data setups.\n","authors":["Minhyuk Seo","Hyunseo Koh","Wonje Jeung","Minjae Lee","San Kim","Hankook Lee","Sungjun Cho","Sungik Choi","Hyunwoo Kim","Jonghyun Choi"],"pdf_url":"https://arxiv.org/pdf/2404.01628v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2311.07362v4","updated":"2024-04-02T04:12:43Z","published":"2023-11-13T14:26:24Z","title":"Volcano: Mitigating Multimodal Hallucination through Self-Feedback\n Guided Revision","summary":" Large multimodal models suffer from multimodal hallucination, where they\nprovide incorrect responses misaligned with the given visual information.\nRecent works have conjectured that one of the reasons behind multimodal\nhallucination is due to the vision encoder failing to ground on the image\nproperly. To mitigate this issue, we propose a novel approach that leverages\nself-feedback as visual cues. Building on this approach, we introduce Volcano,\na multimodal self-feedback guided revision model. Volcano generates natural\nlanguage feedback to its initial response based on the provided visual\ninformation and utilizes this feedback to self-revise its initial response.\nVolcano effectively reduces multimodal hallucination and achieves\nstate-of-the-art on MMHal-Bench, POPE, and GAVIE. 
It also improves on general\nmultimodal abilities and outperforms previous models on MM-Vet and MMBench.\nThrough qualitative analysis, we show that Volcano's feedback is properly\ngrounded on the image than the initial response. This indicates that Volcano\ncan provide itself with richer visual information through feedback generation,\nleading to self-correct hallucinations. We publicly release our model, data,\nand code at https://github.com/kaistAI/Volcano}{github.com/kaistAI/Volcano\n","authors":["Seongyun Lee","Sue Hyun Park","Yongrae Jo","Minjoon Seo"],"pdf_url":"https://arxiv.org/pdf/2311.07362v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06415v3","updated":"2024-04-02T04:00:30Z","published":"2024-01-12T07:23:02Z","title":"3D Reconstruction of Interacting Multi-Person in Clothing from a Single\n Image","summary":" This paper introduces a novel pipeline to reconstruct the geometry of\ninteracting multi-person in clothing on a globally coherent scene space from a\nsingle image. The main challenge arises from the occlusion: a part of a human\nbody is not visible from a single view due to the occlusion by others or the\nself, which introduces missing geometry and physical implausibility (e.g.,\npenetration). We overcome this challenge by utilizing two human priors for\ncomplete 3D geometry and surface contacts. For the geometry prior, an encoder\nlearns to regress the image of a person with missing body parts to the latent\nvectors; a decoder decodes these vectors to produce 3D features of the\nassociated geometry; and an implicit network combines these features with a\nsurface normal map to reconstruct a complete and detailed 3D humans. For the\ncontact prior, we develop an image-space contact detector that outputs a\nprobability distribution of surface contacts between people in 3D. We use these\npriors to globally refine the body poses, enabling the penetration-free and\naccurate reconstruction of interacting multi-person in clothing on the scene\nspace. The results demonstrate that our method is complete, globally coherent,\nand physically plausible compared to existing methods.\n","authors":["Junuk Cha","Hansol Lee","Jaewon Kim","Nhat Nguyen Bao Truong","Jae Shin Yoon","Seungryul Baek"],"pdf_url":"https://arxiv.org/pdf/2401.06415v3.pdf","comment":"Accepted to WACV 2024"},{"id":"http://arxiv.org/abs/2402.18490v2","updated":"2024-04-02T03:50:34Z","published":"2024-02-28T17:18:38Z","title":"TAMM: TriAdapter Multi-Modal Learning for 3D Shape Understanding","summary":" The limited scale of current 3D shape datasets hinders the advancements in 3D\nshape understanding, and motivates multi-modal learning approaches which\ntransfer learned knowledge from data-abundant 2D image and language modalities\nto 3D shapes. However, even though the image and language representations have\nbeen aligned by cross-modal models like CLIP, we find that the image modality\nfails to contribute as much as the language in existing multi-modal 3D\nrepresentation learning methods. This is attributed to the domain shift in the\n2D images and the distinct focus of each modality. To more effectively leverage\nboth modalities in the pre-training, we introduce TriAdapter Multi-Modal\nLearning (TAMM) -- a novel two-stage learning approach based on three\nsynergistic adapters. First, our CLIP Image Adapter mitigates the domain gap\nbetween 3D-rendered images and natural images, by adapting the visual\nrepresentations of CLIP for synthetic image-text pairs. 
Subsequently, our Dual\nAdapters decouple the 3D shape representation space into two complementary\nsub-spaces: one focusing on visual attributes and the other for semantic\nunderstanding, which ensure a more comprehensive and effective multi-modal\npre-training. Extensive experiments demonstrate that TAMM consistently enhances\n3D representations for a wide range of 3D encoder architectures, pre-training\ndatasets, and downstream tasks. Notably, we boost the zero-shot classification\naccuracy on Objaverse-LVIS from 46.8\\% to 50.7\\%, and improve the 5-way 10-shot\nlinear probing classification accuracy on ModelNet40 from 96.1\\% to 99.0\\%.\nProject page: https://alanzhangcs.github.io/tamm-page.\n","authors":["Zhihao Zhang","Shengcao Cao","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2402.18490v2.pdf","comment":"This paper is accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01030v2","updated":"2024-04-02T03:36:28Z","published":"2024-04-01T10:19:05Z","title":"Survey of Bias In Text-to-Image Generation: Definition, Evaluation, and\n Mitigation","summary":" The recent advancement of large and powerful models with Text-to-Image (T2I)\ngeneration abilities -- such as OpenAI's DALLE-3 and Google's Gemini -- enables\nusers to generate high-quality images from textual prompts. However, it has\nbecome increasingly evident that even simple prompts could cause T2I models to\nexhibit conspicuous social bias in generated images. Such bias might lead to\nboth allocational and representational harms in society, further marginalizing\nminority groups. Noting this problem, a large body of recent works has been\ndedicated to investigating different dimensions of bias in T2I systems.\nHowever, an extensive review of these studies is lacking, hindering a\nsystematic understanding of current progress and research gaps. We present the\nfirst extensive survey on bias in T2I generative models. In this survey, we\nreview prior studies on dimensions of bias: Gender, Skintone, and Geo-Culture.\nSpecifically, we discuss how these works define, evaluate, and mitigate\ndifferent aspects of bias. We found that: (1) while gender and skintone biases\nare widely studied, geo-cultural bias remains under-explored; (2) most works on\ngender and skintone bias investigated occupational association, while other\naspects are less frequently studied; (3) almost all gender bias works overlook\nnon-binary identities in their studies; (4) evaluation datasets and metrics are\nscattered, with no unified framework for measuring biases; and (5) current\nmitigation methods fail to resolve biases comprehensively. Based on current\nlimitations, we point out future research directions that contribute to\nhuman-centric definitions, evaluations, and mitigation of biases. 
We hope to\nhighlight the importance of studying biases in T2I systems, as well as\nencourage future efforts to holistically understand and tackle biases, building\nfair and trustworthy T2I technologies for everyone.\n","authors":["Yixin Wan","Arjun Subramonian","Anaelia Ovalle","Zongyu Lin","Ashima Suvarna","Christina Chance","Hritik Bansal","Rebecca Pattichis","Kai-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2404.01030v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01614v1","updated":"2024-04-02T03:36:07Z","published":"2024-04-02T03:36:07Z","title":"LR-FPN: Enhancing Remote Sensing Object Detection with Location Refined\n Feature Pyramid Network","summary":" Remote sensing target detection aims to identify and locate critical targets\nwithin remote sensing images, finding extensive applications in agriculture and\nurban planning. Feature pyramid networks (FPNs) are commonly used to extract\nmulti-scale features. However, existing FPNs often overlook extracting\nlow-level positional information and fine-grained context interaction. To\naddress this, we propose a novel location refined feature pyramid network\n(LR-FPN) to enhance the extraction of shallow positional information and\nfacilitate fine-grained context interaction. The LR-FPN consists of two primary\nmodules: the shallow position information extraction module (SPIEM) and the\ncontextual interaction module (CIM). Specifically, SPIEM first maximizes the\nretention of solid location information of the target by simultaneously\nextracting positional and saliency information from the low-level feature map.\nSubsequently, CIM injects this robust location information into different\nlayers of the original FPN through spatial and channel interaction, explicitly\nenhancing the object area. Moreover, in spatial interaction, we introduce a\nsimple local and non-local interaction strategy to learn and retain the\nsaliency information of the object. Lastly, the LR-FPN can be readily\nintegrated into common object detection frameworks to improve performance\nsignificantly. Extensive experiments on two large-scale remote sensing datasets\n(i.e., DOTAV1.0 and HRSC2016) demonstrate that the proposed LR-FPN is superior\nto state-of-the-art object detection approaches. Our code and models will be\npublicly available.\n","authors":["Hanqian Li","Ruinan Zhang","Ye Pan","Junchi Ren","Fei Shen"],"pdf_url":"https://arxiv.org/pdf/2404.01614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01612v1","updated":"2024-04-02T03:29:23Z","published":"2024-04-02T03:29:23Z","title":"Spin-UP: Spin Light for Natural Light Uncalibrated Photometric Stereo","summary":" Natural Light Uncalibrated Photometric Stereo (NaUPS) relieves the strict\nenvironment and light assumptions in classical Uncalibrated Photometric Stereo\n(UPS) methods. However, due to the intrinsic ill-posedness and high-dimensional\nambiguities, addressing NaUPS is still an open question. Existing works impose\nstrong assumptions on the environment lights and objects' material, restricting\nthe effectiveness in more general scenarios. Alternatively, some methods\nleverage supervised learning with intricate models while lacking\ninterpretability, resulting in a biased estimation. In this work, we proposed\nSpin Light Uncalibrated Photometric Stereo (Spin-UP), an unsupervised method to\ntackle NaUPS in various environment lights and objects. 
The proposed method\nuses a novel setup that captures the object's images on a rotatable platform,\nwhich mitigates NaUPS's ill-posedness by reducing unknowns and provides\nreliable priors to alleviate NaUPS's ambiguities. Leveraging neural inverse\nrendering and the proposed training strategies, Spin-UP recovers surface\nnormals, environment light, and isotropic reflectance under complex natural\nlight with low computational cost. Experiments have shown that Spin-UP\noutperforms other supervised / unsupervised NaUPS methods and achieves\nstate-of-the-art performance on synthetic and real-world datasets. Codes and\ndata are available at https://github.com/LMozart/CVPR2024-SpinUP.\n","authors":["Zongrui Li","Zhan Lu","Haojie Yan","Boxin Shi","Gang Pan","Qian Zheng","Xudong Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.01612v1.pdf","comment":"Paper accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2402.17275v2","updated":"2024-04-02T03:18:07Z","published":"2024-02-27T07:42:55Z","title":"One-Shot Structure-Aware Stylized Image Synthesis","summary":" While GAN-based models have been successful in image stylization tasks, they\noften struggle with structure preservation while stylizing a wide range of\ninput images. Recently, diffusion models have been adopted for image\nstylization but still lack the capability to maintain the original quality of\ninput images. Building on this, we propose OSASIS: a novel one-shot stylization\nmethod that is robust in structure preservation. We show that OSASIS is able to\neffectively disentangle the semantics from the structure of an image, allowing\nit to control the level of content and style implemented to a given input. We\napply OSASIS to various experimental settings, including stylization with\nout-of-domain reference images and stylization with text-driven manipulation.\nResults show that OSASIS outperforms other stylization methods, especially for\ninput images that were rarely encountered during training, providing a\npromising solution to stylization via diffusion models.\n","authors":["Hansam Cho","Jonghyun Lee","Seunggyu Chang","Yonghyun Jeong"],"pdf_url":"https://arxiv.org/pdf/2402.17275v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2306.08736v3","updated":"2024-04-02T03:10:53Z","published":"2023-06-14T20:40:28Z","title":"LoSh: Long-Short Text Joint Prediction Network for Referring Video\n Object Segmentation","summary":" Referring video object segmentation (RVOS) aims to segment the target\ninstance referred by a given text expression in a video clip. The text\nexpression normally contains sophisticated description of the instance's\nappearance, action, and relation with others. It is therefore rather difficult\nfor a RVOS model to capture all these attributes correspondingly in the video;\nin fact, the model often favours more on the action- and relation-related\nvisual attributes of the instance. This can end up with partial or even\nincorrect mask prediction of the target instance. We tackle this problem by\ntaking a subject-centric short text expression from the original long text\nexpression. The short one retains only the appearance-related information of\nthe target instance so that we can use it to focus the model's attention on the\ninstance's appearance. We let the model make joint predictions using both long\nand short text expressions; and insert a long-short cross-attention module to\ninteract the joint features and a long-short predictions intersection loss to\nregulate the joint predictions. 
Besides the improvement on the linguistic part,\nwe also introduce a forward-backward visual consistency loss, which utilizes\noptical flows to warp visual features between the annotated frames and their\ntemporal neighbors for consistency. We build our method on top of two state of\nthe art pipelines. Extensive experiments on A2D-Sentences, Refer-YouTube-VOS,\nJHMDB-Sentences and Refer-DAVIS17 show impressive improvements of our\nmethod.Code is available at https://github.com/LinfengYuan1997/Losh.\n","authors":["Linfeng Yuan","Miaojing Shi","Zijie Yue","Qijun Chen"],"pdf_url":"https://arxiv.org/pdf/2306.08736v3.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2401.04332v2","updated":"2024-04-02T02:56:28Z","published":"2024-01-09T03:05:53Z","title":"Flexible filtrations for multiparameter persistent homology detect\n digital images","summary":" Two important problems in the field of Topological Data Analysis are defining\npractical multifiltrations on objects and showing ability of TDA to detect the\ngeometry. Motivated by the problems, we constuct three multifiltrations named\nmulti-GENEO, multi-DGENEO and mix-GENEO, and prove the stability of both the\ninterleaving distance and multiparameter persistence landscape of multi-GENEO\nwith respect to the pseudometric of the subspace of bounded functions. We also\ngive the estimations of upper bound for multi-DGENEO and mix-GENEO. Finally, we\nprovide experiment results on MNIST dataset to demonstrate our bifiltrations\nhave ability to detect geometric and topological differences of digital images.\n","authors":["Jiaxing He","Bingzhe Hou","Tieru Wu","Yue Xin"],"pdf_url":"https://arxiv.org/pdf/2401.04332v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11679v2","updated":"2024-04-02T02:55:28Z","published":"2024-03-18T11:31:03Z","title":"NEDS-SLAM: A Novel Neural Explicit Dense Semantic SLAM Framework using\n 3D Gaussian Splatting","summary":" We propose NEDS-SLAM, an Explicit Dense semantic SLAM system based on 3D\nGaussian representation, that enables robust 3D semantic mapping, accurate\ncamera tracking, and high-quality rendering in real-time. In the system, we\npropose a Spatially Consistent Feature Fusion model to reduce the effect of\nerroneous estimates from pre-trained segmentation head on semantic\nreconstruction, achieving robust 3D semantic Gaussian mapping. Additionally, we\nemploy a lightweight encoder-decoder to compress the high-dimensional semantic\nfeatures into a compact 3D Gaussian representation, mitigating the burden of\nexcessive memory consumption. Furthermore, we leverage the advantage of 3D\nGaussian splatting, which enables efficient and differentiable novel view\nrendering, and propose a Virtual Camera View Pruning method to eliminate\noutlier GS points, thereby effectively enhancing the quality of scene\nrepresentations. 
Our NEDS-SLAM method demonstrates competitive performance over\nexisting dense semantic SLAM methods in terms of mapping and tracking accuracy\non Replica and ScanNet datasets, while also showing excellent capabilities in\n3D dense semantic mapping.\n","authors":["Yiming Ji","Yang Liu","Guanghu Xie","Boyu Ma","Zongwu Xie"],"pdf_url":"https://arxiv.org/pdf/2403.11679v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01604v1","updated":"2024-04-02T02:52:05Z","published":"2024-04-02T02:52:05Z","title":"WaveDH: Wavelet Sub-bands Guided ConvNet for Efficient Image Dehazing","summary":" The surge in interest regarding image dehazing has led to notable\nadvancements in deep learning-based single image dehazing approaches,\nexhibiting impressive performance in recent studies. Despite these strides,\nmany existing methods fall short in meeting the efficiency demands of practical\napplications. In this paper, we introduce WaveDH, a novel and compact ConvNet\ndesigned to address this efficiency gap in image dehazing. Our WaveDH leverages\nwavelet sub-bands for guided up-and-downsampling and frequency-aware feature\nrefinement. The key idea lies in utilizing wavelet decomposition to extract\nlow-and-high frequency components from feature levels, allowing for faster\nprocessing while upholding high-quality reconstruction. The downsampling block\nemploys a novel squeeze-and-attention scheme to optimize the feature\ndownsampling process in a structurally compact manner through wavelet domain\nlearning, preserving discriminative features while discarding noise components.\nIn our upsampling block, we introduce a dual-upsample and fusion mechanism to\nenhance high-frequency component awareness, aiding in the reconstruction of\nhigh-frequency details. Departing from conventional dehazing methods that treat\nlow-and-high frequency components equally, our feature refinement block\nstrategically processes features with a frequency-aware approach. By employing\na coarse-to-fine methodology, it not only refines the details at frequency\nlevels but also significantly optimizes computational costs. The refinement is\nperformed in a maximum 8x downsampled feature space, striking a favorable\nefficiency-vs-accuracy trade-off. Extensive experiments demonstrate that our\nmethod, WaveDH, outperforms many state-of-the-art methods on several image\ndehazing benchmarks with significantly reduced computational costs. Our code is\navailable at https://github.com/AwesomeHwang/WaveDH.\n","authors":["Seongmin Hwang","Daeyoung Han","Cheolkon Jung","Moongu Jeon"],"pdf_url":"https://arxiv.org/pdf/2404.01604v1.pdf","comment":"Submitted to TMM"},{"id":"http://arxiv.org/abs/2403.19964v2","updated":"2024-04-02T02:34:22Z","published":"2024-03-29T03:56:19Z","title":"FairRAG: Fair Human Generation via Fair Retrieval Augmentation","summary":" Existing text-to-image generative models reflect or even amplify societal\nbiases ingrained in their training data. This is especially concerning for\nhuman image generation where models are biased against certain demographic\ngroups. Existing attempts to rectify this issue are hindered by the inherent\nlimitations of the pre-trained models and fail to substantially improve\ndemographic diversity. In this work, we introduce Fair Retrieval Augmented\nGeneration (FairRAG), a novel framework that conditions pre-trained generative\nmodels on reference images retrieved from an external image database to improve\nfairness in human generation. 
FairRAG enables conditioning through a\nlightweight linear module that projects reference images into the textual\nspace. To enhance fairness, FairRAG applies simple-yet-effective debiasing\nstrategies, providing images from diverse demographic groups during the\ngenerative process. Extensive experiments demonstrate that FairRAG outperforms\nexisting methods in terms of demographic diversity, image-text alignment, and\nimage fidelity while incurring minimal computational overhead during inference.\n","authors":["Robik Shrestha","Yang Zou","Qiuyu Chen","Zhiheng Li","Yusheng Xie","Siqi Deng"],"pdf_url":"https://arxiv.org/pdf/2403.19964v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01591v1","updated":"2024-04-02T02:31:13Z","published":"2024-04-02T02:31:13Z","title":"Language Model Guided Interpretable Video Action Reasoning","summary":" While neural networks have excelled in video action recognition tasks, their\nblack-box nature often obscures the understanding of their decision-making\nprocesses. Recent approaches used inherently interpretable models to analyze\nvideo actions in a manner akin to human reasoning. These models, however,\nusually fall short in performance compared to their black-box counterparts. In\nthis work, we present a new framework named Language-guided Interpretable\nAction Recognition framework (LaIAR). LaIAR leverages knowledge from language\nmodels to enhance both the recognition capabilities and the interpretability of\nvideo models. In essence, we redefine the problem of understanding video model\ndecisions as a task of aligning video and language models. Using the logical\nreasoning captured by the language model, we steer the training of the video\nmodel. This integrated approach not only improves the video model's\nadaptability to different domains but also boosts its overall performance.\nExtensive experiments on two complex video action datasets, Charades & CAD-120,\nvalidates the improved performance and interpretability of our LaIAR framework.\nThe code of LaIAR is available at https://github.com/NingWang2049/LaIAR.\n","authors":["Ning Wang","Guangming Zhu","HS Li","Liang Zhang","Syed Afaq Ali Shah","Mohammed Bennamoun"],"pdf_url":"https://arxiv.org/pdf/2404.01591v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01587v1","updated":"2024-04-02T02:29:41Z","published":"2024-04-02T02:29:41Z","title":"TSCM: A Teacher-Student Model for Vision Place Recognition Using\n Cross-Metric Knowledge Distillation","summary":" Visual place recognition (VPR) plays a pivotal role in autonomous exploration\nand navigation of mobile robots within complex outdoor environments. While\ncost-effective and easily deployed, camera sensors are sensitive to lighting\nand weather changes, and even slight image alterations can greatly affect VPR\nefficiency and precision. Existing methods overcome this by exploiting powerful\nyet large networks, leading to significant consumption of computational\nresources. In this paper, we propose a high-performance teacher and lightweight\nstudent distillation framework called TSCM. It exploits our devised\ncross-metric knowledge distillation to narrow the performance gap between the\nteacher and student models, maintaining superior performance while enabling\nminimal computational load during deployment. 
We conduct comprehensive\nevaluations on large-scale datasets, namely Pittsburgh30k and Pittsburgh250k.\nExperimental results demonstrate the superiority of our method over baseline\nmodels in terms of recognition accuracy and model parameter efficiency.\nMoreover, our ablation studies show that the proposed knowledge distillation\ntechnique surpasses other counterparts. The code of our method has been\nreleased at https://github.com/nubot-nudt/TSCM.\n","authors":["Yehui Shen","Mingmin Liu","Huimin Lu","Xieyuanli Chen"],"pdf_url":"https://arxiv.org/pdf/2404.01587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01580v1","updated":"2024-04-02T02:20:47Z","published":"2024-04-02T02:20:47Z","title":"Learning Temporal Cues by Predicting Objects Move for Multi-camera 3D\n Object Detection","summary":" In autonomous driving and robotics, there is a growing interest in utilizing\nshort-term historical data to enhance multi-camera 3D object detection,\nleveraging the continuous and correlated nature of input video streams. Recent\nwork has focused on spatially aligning BEV-based features over timesteps.\nHowever, this is often limited as its gain does not scale well with long-term\npast observations. To address this, we advocate for supervising a model to\npredict objects' poses given past observations, thus explicitly guiding to\nlearn objects' temporal cues. To this end, we propose a model called DAP\n(Detection After Prediction), consisting of a two-branch network: (i) a branch\nresponsible for forecasting the current objects' poses given past observations\nand (ii) another branch that detects objects based on the current and past\nobservations. The features predicting the current objects from branch (i) is\nfused into branch (ii) to transfer predictive knowledge. We conduct extensive\nexperiments with the large-scale nuScenes datasets, and we observe that\nutilizing such predictive information significantly improves the overall\ndetection performance. Our model can be used plug-and-play, showing consistent\nperformance gain.\n","authors":["Seokha Moon","Hongbeen Park","Jungphil Kwon","Jaekoo Lee","Jinkyu Kim"],"pdf_url":"https://arxiv.org/pdf/2404.01580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01579v1","updated":"2024-04-02T02:17:50Z","published":"2024-04-02T02:17:50Z","title":"Diffusion Deepfake","summary":" Recent progress in generative AI, primarily through diffusion models,\npresents significant challenges for real-world deepfake detection. The\nincreased realism in image details, diverse content, and widespread\naccessibility to the general public complicates the identification of these\nsophisticated deepfakes. Acknowledging the urgency to address the vulnerability\nof current deepfake detectors to this evolving threat, our paper introduces two\nextensive deepfake datasets generated by state-of-the-art diffusion models as\nother datasets are less diverse and low in quality. Our extensive experiments\nalso showed that our dataset is more challenging compared to the other face\ndeepfake datasets. Our strategic dataset creation not only challenge the\ndeepfake detectors but also sets a new benchmark for more evaluation. Our\ncomprehensive evaluation reveals the struggle of existing detection methods,\noften optimized for specific image domains and manipulations, to effectively\nadapt to the intricate nature of diffusion deepfakes, limiting their practical\nutility. 
To address this critical issue, we investigate the impact of enhancing\ntraining data diversity on representative detection methods. This involves\nexpanding the diversity of both manipulation techniques and image domains. Our\nfindings underscore that increasing training data diversity results in improved\ngeneralizability. Moreover, we propose a novel momentum difficulty boosting\nstrategy to tackle the additional challenge posed by training data\nheterogeneity. This strategy dynamically assigns appropriate sample weights\nbased on learning difficulty, enhancing the model's adaptability to both easy\nand challenging samples. Extensive experiments on both existing and newly\nproposed benchmarks demonstrate that our model optimization approach surpasses\nprior alternatives significantly.\n","authors":["Chaitali Bhattacharyya","Hanxiao Wang","Feng Zhang","Sungho Kim","Xiatian Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.01579v1.pdf","comment":"28 pages including Supplementary material"},{"id":"http://arxiv.org/abs/2404.01576v1","updated":"2024-04-02T02:12:00Z","published":"2024-04-02T02:12:00Z","title":"Leveraging Digital Perceptual Technologies for Remote Perception and\n Analysis of Human Biomechanical Processes: A Contactless Approach for\n Workload and Joint Force Assessment","summary":" This study presents an innovative computer vision framework designed to\nanalyze human movements in industrial settings, aiming to enhance biomechanical\nanalysis by integrating seamlessly with existing software. Through a\ncombination of advanced imaging and modeling techniques, the framework allows\nfor comprehensive scrutiny of human motion, providing valuable insights into\nkinematic patterns and kinetic data. Utilizing Convolutional Neural Networks\n(CNNs), Direct Linear Transform (DLT), and Long Short-Term Memory (LSTM)\nnetworks, the methodology accurately detects key body points, reconstructs 3D\nlandmarks, and generates detailed 3D body meshes. Extensive evaluations across\nvarious movements validate the framework's effectiveness, demonstrating\ncomparable results to traditional marker-based models with minor differences in\njoint angle estimations and precise estimations of weight and height.\nStatistical analyses consistently support the framework's reliability, with\njoint angle estimations showing less than a 5-degree difference for hip\nflexion, elbow flexion, and knee angle methods. Additionally, weight estimation\nexhibits an average error of less than 6 % for weight and less than 2 % for\nheight when compared to ground-truth values from 10 subjects. The integration\nof the Biomech-57 landmark skeleton template further enhances the robustness\nand reinforces the framework's credibility. 
This framework shows significant\npromise for meticulous biomechanical analysis in industrial contexts,\neliminating the need for cumbersome markers and extending its utility to\ndiverse research domains, including the study of specific exoskeleton devices'\nimpact on facilitating the prompt return of injured workers to their tasks.\n","authors":["Jesudara Omidokun","Darlington Egeonu","Bochen Jia","Liang Yang"],"pdf_url":"https://arxiv.org/pdf/2404.01576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2010.08755v3","updated":"2024-04-02T02:09:15Z","published":"2020-10-17T09:54:51Z","title":"Variational Dynamic for Self-Supervised Exploration in Deep\n Reinforcement Learning","summary":" Efficient exploration remains a challenging problem in reinforcement\nlearning, especially for tasks where extrinsic rewards from environments are\nsparse or even totally disregarded. Significant advances based on intrinsic\nmotivation show promising results in simple environments but often get stuck in\nenvironments with multimodal and stochastic dynamics. In this work, we propose\na variational dynamic model based on the conditional variational inference to\nmodel the multimodality and stochasticity. We consider the environmental\nstate-action transition as a conditional generative process by generating the\nnext-state prediction under the condition of the current state, action, and\nlatent variable, which provides a better understanding of the dynamics and\nleads a better performance in exploration. We derive an upper bound of the\nnegative log-likelihood of the environmental transition and use such an upper\nbound as the intrinsic reward for exploration, which allows the agent to learn\nskills by self-supervised exploration without observing extrinsic rewards. We\nevaluate the proposed method on several image-based simulation tasks and a real\nrobotic manipulating task. Our method outperforms several state-of-the-art\nenvironment model-based exploration approaches.\n","authors":["Chenjia Bai","Peng Liu","Kaiyu Liu","Lingxiao Wang","Yingnan Zhao","Lei Han"],"pdf_url":"https://arxiv.org/pdf/2010.08755v3.pdf","comment":"IEEE Transactions on Neural Networks and Learning Systems (TNNLS)\n 2021"},{"id":"http://arxiv.org/abs/2404.00562v2","updated":"2024-04-02T02:08:55Z","published":"2024-03-31T04:56:30Z","title":"Text2HOI: Text-guided 3D Motion Generation for Hand-Object Interaction","summary":" This paper introduces the first text-guided work for generating the sequence\nof hand-object interaction in 3D. The main challenge arises from the lack of\nlabeled data where existing ground-truth datasets are nowhere near\ngeneralizable in interaction type and object category, which inhibits the\nmodeling of diverse 3D hand-object interaction with the correct physical\nimplication (e.g., contacts and semantics) from text prompts. To address this\nchallenge, we propose to decompose the interaction generation task into two\nsubtasks: hand-object contact generation; and hand-object motion generation.\nFor contact generation, a VAE-based network takes as input a text and an object\nmesh, and generates the probability of contacts between the surfaces of hands\nand the object during the interaction. The network learns a variety of local\ngeometry structure of diverse objects that is independent of the objects'\ncategory, and thus, it is applicable to general objects. 
For motion generation,\na Transformer-based diffusion model utilizes this 3D contact map as a strong\nprior for generating physically plausible hand-object motion as a function of\ntext prompts by learning from the augmented labeled dataset; where we annotate\ntext labels from many existing 3D hand and object motion data. Finally, we\nfurther introduce a hand refiner module that minimizes the distance between the\nobject surface and hand joints to improve the temporal stability of the\nobject-hand contacts and to suppress the penetration artifacts. In the\nexperiments, we demonstrate that our method can generate more realistic and\ndiverse interactions compared to other baseline methods. We also show that our\nmethod is applicable to unseen objects. We will release our model and newly\nlabeled data as a strong foundation for future research. Codes and data are\navailable in: https://github.com/JunukCha/Text2HOI.\n","authors":["Junuk Cha","Jihyeon Kim","Jae Shin Yoon","Seungryul Baek"],"pdf_url":"https://arxiv.org/pdf/2404.00562v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01571v1","updated":"2024-04-02T02:07:00Z","published":"2024-04-02T02:07:00Z","title":"Leveraging YOLO-World and GPT-4V LMMs for Zero-Shot Person Detection and\n Action Recognition in Drone Imagery","summary":" In this article, we explore the potential of zero-shot Large Multimodal\nModels (LMMs) in the domain of drone perception. We focus on person detection\nand action recognition tasks and evaluate two prominent LMMs, namely YOLO-World\nand GPT-4V(ision) using a publicly available dataset captured from aerial\nviews. Traditional deep learning approaches rely heavily on large and\nhigh-quality training datasets. However, in certain robotic settings, acquiring\nsuch datasets can be resource-intensive or impractical within a reasonable\ntimeframe. The flexibility of prompt-based Large Multimodal Models (LMMs) and\ntheir exceptional generalization capabilities have the potential to\nrevolutionize robotics applications in these scenarios. Our findings suggest\nthat YOLO-World demonstrates good detection performance. GPT-4V struggles with\naccurately classifying action classes but delivers promising results in\nfiltering out unwanted region proposals and in providing a general description\nof the scenery. This research represents an initial step in leveraging LMMs for\ndrone perception and establishes a foundation for future investigations in this\narea.\n","authors":["Christian Limberg","Artur Gonçalves","Bastien Rigault","Helmut Prendinger"],"pdf_url":"https://arxiv.org/pdf/2404.01571v1.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2403.19080v3","updated":"2024-04-02T02:05:46Z","published":"2024-03-28T01:05:06Z","title":"MMCert: Provable Defense against Adversarial Attacks to Multi-modal\n Models","summary":" Different from a unimodal model whose input is from a single modality, the\ninput (called multi-modal input) of a multi-modal model is from multiple\nmodalities such as image, 3D points, audio, text, etc. Similar to unimodal\nmodels, many existing studies show that a multi-modal model is also vulnerable\nto adversarial perturbation, where an attacker could add small perturbation to\nall modalities of a multi-modal input such that the multi-modal model makes\nincorrect predictions for it. 
Existing certified defenses are mostly designed\nfor unimodal models, which achieve sub-optimal certified robustness guarantees\nwhen extended to multi-modal models as shown in our experimental results. In\nour work, we propose MMCert, the first certified defense against adversarial\nattacks to a multi-modal model. We derive a lower bound on the performance of\nour MMCert under arbitrary adversarial attacks with bounded perturbations to\nboth modalities (e.g., in the context of auto-driving, we bound the number of\nchanged pixels in both RGB image and depth image). We evaluate our MMCert using\ntwo benchmark datasets: one for the multi-modal road segmentation task and the\nother for the multi-modal emotion recognition task. Moreover, we compare our\nMMCert with a state-of-the-art certified defense extended from unimodal models.\nOur experimental results show that our MMCert outperforms the baseline.\n","authors":["Yanting Wang","Hongye Fu","Wei Zou","Jinyuan Jia"],"pdf_url":"https://arxiv.org/pdf/2403.19080v3.pdf","comment":"To appear in CVPR'24"},{"id":"http://arxiv.org/abs/2404.01568v1","updated":"2024-04-02T02:01:21Z","published":"2024-04-02T02:01:21Z","title":"A Linear Time and Space Local Point Cloud Geometry Encoder via\n Vectorized Kernel Mixture (VecKM)","summary":" We propose VecKM, a novel local point cloud geometry encoder that is\ndescriptive, efficient and robust to noise. VecKM leverages a unique approach\nby vectorizing a kernel mixture to represent the local point clouds. Such\nrepresentation is descriptive and robust to noise, which is supported by two\ntheorems that confirm its ability to reconstruct and preserve the similarity of\nthe local shape. Moreover, VecKM is the first successful attempt to reduce the\ncomputation and memory costs from $O(n^2+nKd)$ to $O(nd)$ by sacrificing a\nmarginal constant factor, where $n$ is the size of the point cloud and $K$ is\nneighborhood size. The efficiency is primarily due to VecKM's unique\nfactorizable property that eliminates the need of explicitly grouping points\ninto neighborhoods. In the normal estimation task, VecKM demonstrates not only\n100x faster inference speed but also strongest descriptiveness and robustness\ncompared with existing popular encoders. In classification and segmentation\ntasks, integrating VecKM as a preprocessing module achieves consistently better\nperformance than the PointNet, PointNet++, and point transformer baselines, and\nruns consistently faster by up to 10x.\n","authors":["Dehao Yuan","Cornelia Fermüller","Tahseen Rabbani","Furong Huang","Yiannis Aloimonos"],"pdf_url":"https://arxiv.org/pdf/2404.01568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01563v1","updated":"2024-04-02T01:57:08Z","published":"2024-04-02T01:57:08Z","title":"Two-Phase Multi-Dose-Level PET Image Reconstruction with Dose Level\n Awareness","summary":" To obtain high-quality positron emission tomography (PET) while minimizing\nradiation exposure, a range of methods have been designed to reconstruct\nstandard-dose PET (SPET) from corresponding low-dose PET (LPET) images.\nHowever, most current methods merely learn the mapping between\nsingle-dose-level LPET and SPET images, but omit the dose disparity of LPET\nimages in clinical scenarios. In this paper, to reconstruct high-quality SPET\nimages from multi-dose-level LPET images, we design a novel two-phase\nmulti-dose-level PET reconstruction algorithm with dose level awareness,\ncontaining a pre-training phase and a SPET prediction phase. 
Specifically, the\npre-training phase is devised to explore both fine-grained discriminative\nfeatures and effective semantic representation. The SPET prediction phase\nadopts a coarse prediction network utilizing pre-learned dose level prior to\ngenerate preliminary result, and a refinement network to precisely preserve the\ndetails. Experiments on MICCAI 2022 Ultra-low Dose PET Imaging Challenge\nDataset have demonstrated the superiority of our method.\n","authors":["Yuchen Fei","Yanmei Luo","Yan Wang","Jiaqi Cui","Yuanyuan Xu","Jiliu Zhou","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2404.01563v1.pdf","comment":"Accepted by ISBI2024"},{"id":"http://arxiv.org/abs/2403.16209v3","updated":"2024-04-02T01:57:00Z","published":"2024-03-24T16:08:10Z","title":"Image Captioning in news report scenario","summary":" Image captioning strives to generate pertinent captions for specified images,\nsituating itself at the crossroads of Computer Vision (CV) and Natural Language\nProcessing (NLP). This endeavor is of paramount importance with far-reaching\napplications in recommendation systems, news outlets, social media, and beyond.\nParticularly within the realm of news reporting, captions are expected to\nencompass detailed information, such as the identities of celebrities captured\nin the images. However, much of the existing body of work primarily centers\naround understanding scenes and actions. In this paper, we explore the realm of\nimage captioning specifically tailored for celebrity photographs, illustrating\nits broad potential for enhancing news industry practices. This exploration\naims to augment automated news content generation, thereby facilitating a more\nnuanced dissemination of information. Our endeavor shows a broader horizon,\nenriching the narrative in news reporting through a more intuitive image\ncaptioning framework.\n","authors":["Tianrui Liu","Qi Cai","Changxin Xu","Bo Hong","Jize Xiong","Yuxin Qiao","Tsungwei Yang"],"pdf_url":"https://arxiv.org/pdf/2403.16209v3.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.01548v1","updated":"2024-04-02T01:28:44Z","published":"2024-04-02T01:28:44Z","title":"mChartQA: A universal benchmark for multimodal Chart Question Answer\n based on Vision-Language Alignment and Reasoning","summary":" In the fields of computer vision and natural language processing, multimodal\nchart question-answering, especially involving color, structure, and textless\ncharts, poses significant challenges. Traditional methods, which typically\ninvolve either direct multimodal processing or a table-to-text conversion\nfollowed by language model analysis, have limitations in effectively handling\nthese complex scenarios. This paper introduces a novel multimodal chart\nquestion-answering model, specifically designed to address these intricate\ntasks. Our model integrates visual and linguistic processing, overcoming the\nconstraints of existing methods. We adopt a dual-phase training approach: the\ninitial phase focuses on aligning image and text representations, while the\nsubsequent phase concentrates on optimizing the model's interpretative and\nanalytical abilities in chart-related queries. 
This approach has demonstrated\nsuperior performance on multiple public datasets, particularly in handling\ncolor, structure, and textless chart questions, indicating its effectiveness in\ncomplex multimodal tasks.\n","authors":["Jingxuan Wei","Nan Xu","Guiyong Chang","Yin Luo","BiHui Yu","Ruifeng Guo"],"pdf_url":"https://arxiv.org/pdf/2404.01548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01547v1","updated":"2024-04-02T01:18:16Z","published":"2024-04-02T01:18:16Z","title":"Bidirectional Multi-Scale Implicit Neural Representations for Image\n Deraining","summary":" How to effectively explore multi-scale representations of rain streaks is\nimportant for image deraining. In contrast to existing Transformer-based\nmethods that depend mostly on single-scale rain appearance, we develop an\nend-to-end multi-scale Transformer that leverages the potentially useful\nfeatures in various scales to facilitate high-quality image reconstruction. To\nbetter explore the common degradation representations from spatially-varying\nrain streaks, we incorporate intra-scale implicit neural representations based\non pixel coordinates with the degraded inputs in a closed-loop design, enabling\nthe learned features to facilitate rain removal and improve the robustness of\nthe model in complex scenarios. To ensure richer collaborative representation\nfrom different scales, we embed a simple yet effective inter-scale\nbidirectional feedback operation into our multi-scale Transformer by performing\ncoarse-to-fine and fine-to-coarse information communication. Extensive\nexperiments demonstrate that our approach, named as NeRD-Rain, performs\nfavorably against the state-of-the-art ones on both synthetic and real-world\nbenchmark datasets. The source code and trained models are available at\nhttps://github.com/cschenxiang/NeRD-Rain.\n","authors":["Xiang Chen","Jinshan Pan","Jiangxin Dong"],"pdf_url":"https://arxiv.org/pdf/2404.01547v1.pdf","comment":"Project website: https://github.com/cschenxiang/NeRD-Rain"},{"id":"http://arxiv.org/abs/2404.00228v2","updated":"2024-04-02T01:16:20Z","published":"2024-03-30T03:16:37Z","title":"InfLoRA: Interference-Free Low-Rank Adaptation for Continual Learning","summary":" Continual learning requires the model to learn multiple tasks sequentially.\nIn continual learning, the model should possess the ability to maintain its\nperformance on old tasks (stability) and the ability to adapt to new tasks\ncontinuously (plasticity). Recently, parameter-efficient fine-tuning (PEFT),\nwhich involves freezing a pre-trained model and injecting a small number of\nlearnable parameters to adapt to downstream tasks, has gained increasing\npopularity in continual learning. Although existing continual learning methods\nbased on PEFT have demonstrated superior performance compared to those not\nbased on PEFT, most of them do not consider how to eliminate the interference\nof the new task on the old tasks, which inhibits the model from making a good\ntrade-off between stability and plasticity. In this work, we propose a new PEFT\nmethod, called interference-free low-rank adaptation (InfLoRA), for continual\nlearning. InfLoRA injects a small number of parameters to reparameterize the\npre-trained weights and shows that fine-tuning these injected parameters is\nequivalent to fine-tuning the pre-trained weights within a subspace.\nFurthermore, InfLoRA designs this subspace to eliminate the interference of the\nnew task on the old tasks, making a good trade-off between stability and\nplasticity. 
Experimental results show that InfLoRA outperforms existing\nstate-of-the-art continual learning methods on multiple datasets.\n","authors":["Yan-Shuo Liang","Wu-Jun Li"],"pdf_url":"https://arxiv.org/pdf/2404.00228v2.pdf","comment":"Accepted by the 2024 IEEE/CVF Conference on Computer Vision and\n Pattern Recognition (CVPR 2024)"},{"id":"http://arxiv.org/abs/2404.01543v1","updated":"2024-04-02T00:55:50Z","published":"2024-04-02T00:55:50Z","title":"Efficient 3D Implicit Head Avatar with Mesh-anchored Hash Table\n Blendshapes","summary":" 3D head avatars built with neural implicit volumetric representations have\nachieved unprecedented levels of photorealism. However, the computational cost\nof these methods remains a significant barrier to their widespread adoption,\nparticularly in real-time applications such as virtual reality and\nteleconferencing. While attempts have been made to develop fast neural\nrendering approaches for static scenes, these methods cannot be simply employed\nto support realistic facial expressions, such as in the case of a dynamic\nfacial performance. To address these challenges, we propose a novel fast 3D\nneural implicit head avatar model that achieves real-time rendering while\nmaintaining fine-grained controllability and high rendering quality. Our key\nidea lies in the introduction of local hash table blendshapes, which are\nlearned and attached to the vertices of an underlying face parametric model.\nThese per-vertex hash-tables are linearly merged with weights predicted via a\nCNN, resulting in expression dependent embeddings. Our novel representation\nenables efficient density and color predictions using a lightweight MLP, which\nis further accelerated by a hierarchical nearest neighbor search method.\nExtensive experiments show that our approach runs in real-time while achieving\ncomparable rendering quality to state-of-the-arts and decent results on\nchallenging expressions.\n","authors":["Ziqian Bai","Feitong Tan","Sean Fanello","Rohit Pandey","Mingsong Dou","Shichen Liu","Ping Tan","Yinda Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.01543v1.pdf","comment":"In CVPR2024. Project page:\n https://augmentedperception.github.io/monoavatar-plus"},{"id":"http://arxiv.org/abs/2211.09321v2","updated":"2024-04-02T00:33:42Z","published":"2022-11-17T03:43:04Z","title":"Interpretable Dimensionality Reduction by Feature Preserving Manifold\n Approximation and Projection","summary":" Nonlinear dimensionality reduction lacks interpretability due to the absence\nof source features in low-dimensional embedding space. We propose an\ninterpretable method featMAP to preserve source features by tangent space\nembedding. The core of our proposal is to utilize local singular value\ndecomposition (SVD) to approximate the tangent space which is embedded to\nlow-dimensional space by maintaining the alignment. Based on the embedding\ntangent space, featMAP enables the interpretability by locally demonstrating\nthe source features and feature importance. Furthermore, featMAP embeds the\ndata points by anisotropic projection to preserve the local similarity and\noriginal density. We apply featMAP to interpreting digit classification, object\ndetection and MNIST adversarial examples. FeatMAP uses source features to\nexplicitly distinguish the digits and objects and to explain the\nmisclassification of adversarial examples. 
We also compare featMAP with other\nstate-of-the-art methods on local and global metrics.\n","authors":["Yang Yang","Hongjian Sun","Jialei Gong","Di Yu"],"pdf_url":"https://arxiv.org/pdf/2211.09321v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19654v3","updated":"2024-04-02T00:12:21Z","published":"2023-10-30T15:38:43Z","title":"MCAD: Multi-teacher Cross-modal Alignment Distillation for efficient\n image-text retrieval","summary":" Due to the success of large-scale visual-language pretraining (VLP) models\nand the widespread use of image-text retrieval in industry areas, it is now\ncritically necessary to reduce the model size and streamline their\nmobile-device deployment. Single- and dual-stream model structures are commonly\nused in image-text retrieval with the goal of closing the semantic gap between\ntextual and visual modalities. While single-stream models use deep feature\nfusion to achieve more accurate cross-model alignment, dual-stream models are\nbetter at offline indexing and fast inference.We propose a Multi-teacher\nCross-modality Alignment Distillation (MCAD) technique to integrate the\nadvantages of single- and dual-stream models. By incorporating the fused\nsingle-stream features into the image and text features of the dual-stream\nmodel, we formulate new modified teacher similarity distributions and features.\nThen, we conduct both distribution and feature distillation to boost the\ncapability of the student dual-stream model, achieving high retrieval\nperformance without increasing inference complexity.Extensive experiments\ndemonstrate the remarkable performance and high efficiency of MCAD on\nimage-text retrieval tasks. Furthermore, we implement a lightweight CLIP model\non Snapdragon/Dimensity chips with only $\\sim$100M running memory and\n$\\sim$8.0ms search latency, achieving the mobile-device application of VLP\nmodels.\n","authors":["Youbo Lei","Feifei He","Chen Chen","Yingbin Mo","Si Jia Li","Defeng Xie","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2310.19654v3.pdf","comment":"Accepted by NAACL 2024 Findings"},{"id":"http://arxiv.org/abs/2311.09612v2","updated":"2024-04-02T00:11:50Z","published":"2023-11-16T06:50:26Z","title":"Efficient End-to-End Visual Document Understanding with Rationale\n Distillation","summary":" Understanding visually situated language requires interpreting complex\nlayouts of textual and visual elements. Pre-processing tools, such as optical\ncharacter recognition (OCR), can map document image inputs to textual tokens,\nthen large language models (LLMs) can reason over text. However, such methods\nhave high computational and engineering complexity. Can small pretrained\nimage-to-text models accurately understand visual documents through similar\nrecognition and reasoning steps instead? We propose Rationale Distillation\n(RD), which incorporates the outputs of OCR tools, LLMs, and larger multimodal\nmodels as intermediate \"rationales\", and trains a small student model to\npredict both rationales and answers. 
On three visual document understanding\nbenchmarks representing infographics, scanned documents, and figures, our\nPix2Struct (282M parameters) student model finetuned with RD outperforms the\nbase model by 4-5% absolute accuracy with only 1% higher computational cost.\n","authors":["Wang Zhu","Alekh Agarwal","Mandar Joshi","Robin Jia","Jesse Thomason","Kristina Toutanova"],"pdf_url":"https://arxiv.org/pdf/2311.09612v2.pdf","comment":"Accepted by NAACL 2024"},{"id":"http://arxiv.org/abs/2402.17128v4","updated":"2024-04-02T23:14:42Z","published":"2024-02-27T01:48:19Z","title":"OSCaR: Object State Captioning and State Change Representation","summary":" The capability of intelligent models to extrapolate and comprehend changes in\nobject states is a crucial yet demanding aspect of AI research, particularly\nthrough the lens of human interaction in real-world settings. This task\ninvolves describing complex visual environments, identifying active objects,\nand interpreting their changes as conveyed through language. Traditional\nmethods, which isolate object captioning and state change detection, offer a\nlimited view of dynamic environments. Moreover, relying on a small set of\nsymbolic words to represent changes has restricted the expressiveness of the\nlanguage. To address these challenges, in this paper, we introduce the Object\nState Captioning and State Change Representation (OSCaR) dataset and benchmark.\nOSCaR consists of 14,084 annotated video segments with nearly 1,000 unique\nobjects from various egocentric video collections. It sets a new testbed for\nevaluating multimodal large language models (MLLMs). Our experiments\ndemonstrate that while MLLMs show some skill, they lack a full understanding of\nobject state changes. The benchmark includes a fine-tuned model that, despite\ninitial capabilities, requires significant improvements in accuracy and\ngeneralization ability for effective understanding of these changes. Our code\nand dataset are available at https://github.com/nguyennm1024/OSCaR.\n","authors":["Nguyen Nguyen","Jing Bi","Ali Vosoughi","Yapeng Tian","Pooyan Fazli","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2402.17128v4.pdf","comment":"NAACL 2024"},{"id":"http://arxiv.org/abs/2312.10105v3","updated":"2024-04-02T23:10:13Z","published":"2023-12-15T04:11:34Z","title":"SeiT++: Masked Token Modeling Improves Storage-efficient Training","summary":" Recent advancements in Deep Neural Network (DNN) models have significantly\nimproved performance across computer vision tasks. However, achieving highly\ngeneralizable and high-performing vision models requires expansive datasets,\nresulting in significant storage requirements. This storage challenge is a\ncritical bottleneck for scaling up models. A recent breakthrough by SeiT\nproposed the use of Vector-Quantized (VQ) feature vectors (i.e., tokens) as\nnetwork inputs for vision classification. This approach achieved 90% of the\nperformance of a model trained on full-pixel images with only 1% of the\nstorage. While SeiT needs labeled data, its potential in scenarios beyond fully\nsupervised learning remains largely untapped. In this paper, we extend SeiT by\nintegrating Masked Token Modeling (MTM) for self-supervised pre-training.\nRecognizing that self-supervised approaches often demand more data due to the\nlack of labels, we introduce TokenAdapt and ColorAdapt. These methods\nfacilitate comprehensive token-friendly data augmentation, effectively\naddressing the increased data requirements of self-supervised learning. 
We\nevaluate our approach across various scenarios, including storage-efficient\nImageNet-1k classification, fine-grained classification, ADE-20k semantic\nsegmentation, and robustness benchmarks. Experimental results demonstrate\nconsistent performance improvement in diverse experiments, validating the\neffectiveness of our method. Code is available at\nhttps://github.com/naver-ai/tokenadapt.\n","authors":["Minhyun Lee","Song Park","Byeongho Heo","Dongyoon Han","Hyunjung Shim"],"pdf_url":"https://arxiv.org/pdf/2312.10105v3.pdf","comment":"First two authors contributed equally"},{"id":"http://arxiv.org/abs/2404.02353v1","updated":"2024-04-02T22:54:24Z","published":"2024-04-02T22:54:24Z","title":"Semantic Augmentation in Images using Language","summary":" Deep Learning models are incredibly data-hungry and require very large\nlabeled datasets for supervised learning. As a consequence, these models often\nsuffer from overfitting, limiting their ability to generalize to real-world\nexamples. Recent advancements in diffusion models have enabled the generation\nof photorealistic images based on textual inputs. Leveraging the substantial\ndatasets used to train these diffusion models, we propose a technique to\nutilize generated images to augment existing datasets. This paper explores\nvarious strategies for effective data augmentation to improve the out-of-domain\ngeneralization capabilities of deep learning models.\n","authors":["Sahiti Yerramilli","Jayant Sravan Tamarapalli","Tanmay Girish Kulkarni","Jonathan Francis","Eric Nyberg"],"pdf_url":"https://arxiv.org/pdf/2404.02353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02348v1","updated":"2024-04-02T22:49:25Z","published":"2024-04-02T22:49:25Z","title":"COVID-19 Detection Based on Blood Test Parameters using Various\n Artificial Intelligence Methods","summary":" In 2019, the world faced a new challenge: a COVID-19 disease caused by the\nnovel coronavirus, SARS-CoV-2. The virus rapidly spread across the globe,\nleading to a high rate of mortality, which prompted health organizations to\ntake measures to control its transmission. Early disease detection is crucial\nin the treatment process, and computer-based automatic detection systems have\nbeen developed to aid in this effort. These systems often rely on artificial\nintelligence (AI) approaches such as machine learning, neural networks, fuzzy\nsystems, and deep learning to classify diseases. This study aimed to\ndifferentiate COVID-19 patients from others using self-categorizing classifiers\nand employing various AI methods. This study used two datasets: the blood test\nsamples and radiography images. The best results for the blood test samples\nobtained from San Raphael Hospital, which include two classes of individuals,\nthose with COVID-19 and those with non-COVID diseases, were achieved through\nthe use of the Ensemble method (a combination of a neural network and two\nmachines learning methods). The results showed that this approach for COVID-19\ndiagnosis is cost-effective and provides results in a shorter amount of time\nthan other methods. The proposed model achieved an accuracy of 94.09% on the\ndataset used. Secondly, the radiographic images were divided into four classes:\nnormal, viral pneumonia, ground glass opacity, and COVID-19 infection. These\nwere used for segmentation and classification. The lung lobes were extracted\nfrom the images and then categorized into specific classes. We achieved an\naccuracy of 91.1% on the image dataset. 
Generally, this study highlights the\npotential of AI in detecting and managing COVID-19 and underscores the\nimportance of continued research and development in this field.\n","authors":["Kavian Khanjani","Seyed Rasoul Hosseini","Shahrzad Shashaani","Mohammad Teshnehlab"],"pdf_url":"https://arxiv.org/pdf/2404.02348v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19866v2","updated":"2024-04-02T22:41:53Z","published":"2024-03-28T22:25:05Z","title":"Is Synthetic Image Useful for Transfer Learning? An Investigation into\n Data Generation, Volume, and Utilization","summary":" Synthetic image data generation represents a promising avenue for training\ndeep learning models, particularly in the realm of transfer learning, where\nobtaining real images within a specific domain can be prohibitively expensive\ndue to privacy and intellectual property considerations. This work delves into\nthe generation and utilization of synthetic images derived from text-to-image\ngenerative models in facilitating transfer learning paradigms. Despite the high\nvisual fidelity of the generated images, we observe that their naive\nincorporation into existing real-image datasets does not consistently enhance\nmodel performance due to the inherent distribution gap between synthetic and\nreal images. To address this issue, we introduce a novel two-stage framework\ncalled bridged transfer, which initially employs synthetic images for\nfine-tuning a pre-trained model to improve its transferability and subsequently\nuses real data for rapid adaptation. Alongside, We propose dataset style\ninversion strategy to improve the stylistic alignment between synthetic and\nreal images. Our proposed methods are evaluated across 10 different datasets\nand 5 distinct models, demonstrating consistent improvements, with up to 30%\naccuracy increase on classification tasks. Intriguingly, we note that the\nenhancements were not yet saturated, indicating that the benefits may further\nincrease with an expanded volume of synthetic data.\n","authors":["Yuhang Li","Xin Dong","Chen Chen","Jingtao Li","Yuxin Wen","Michael Spranger","Lingjuan Lyu"],"pdf_url":"https://arxiv.org/pdf/2403.19866v2.pdf","comment":"ICLR24 Score 6865 https://openreview.net/forum?id=CjPt1AC6w0"},{"id":"http://arxiv.org/abs/2404.02345v1","updated":"2024-04-02T22:39:35Z","published":"2024-04-02T22:39:35Z","title":"GaitSTR: Gait Recognition with Sequential Two-stream Refinement","summary":" Gait recognition aims to identify a person based on their walking sequences,\nserving as a useful biometric modality as it can be observed from long\ndistances without requiring cooperation from the subject. In representing a\nperson's walking sequence, silhouettes and skeletons are the two primary\nmodalities used. Silhouette sequences lack detailed part information when\noverlapping occurs between different body segments and are affected by carried\nobjects and clothing. Skeletons, comprising joints and bones connecting the\njoints, provide more accurate part information for different segments; however,\nthey are sensitive to occlusions and low-quality images, causing\ninconsistencies in frame-wise results within a sequence. In this paper, we\nexplore the use of a two-stream representation of skeletons for gait\nrecognition, alongside silhouettes. 
By fusing the combined data of silhouettes\nand skeletons, we refine the two-stream skeletons, joints, and bones through\nself-correction in graph convolution, along with cross-modal correction with\ntemporal consistency from silhouettes. We demonstrate that with refined\nskeletons, the performance of the gait recognition model can achieve further\nimprovement on public gait recognition datasets compared with state-of-the-art\nmethods without extra annotations.\n","authors":["Wanrong Zheng","Haidong Zhu","Zhaoheng Zheng","Ram Nevatia"],"pdf_url":"https://arxiv.org/pdf/2404.02345v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02344v1","updated":"2024-04-02T22:37:34Z","published":"2024-04-02T22:37:34Z","title":"Effective Malware Detection for Embedded Computing Systems with Limited\n Exposure","summary":" One of the pivotal security threats for the embedded computing systems is\nmalicious software a.k.a malware. With efficiency and efficacy, Machine\nLearning (ML) has been widely adopted for malware detection in recent times.\nDespite being efficient, the existing techniques require a tremendous number of\nbenign and malware samples for training and modeling an efficient malware\ndetector. Furthermore, such constraints limit the detection of emerging malware\nsamples due to the lack of sufficient malware samples required for efficient\ntraining. To address such concerns, we introduce a code-aware data generation\ntechnique that generates multiple mutated samples of the limitedly seen malware\nby the devices. Loss minimization ensures that the generated samples closely\nmimic the limitedly seen malware and mitigate the impractical samples. Such\ndeveloped malware is further incorporated into the training set to formulate\nthe model that can efficiently detect the emerging malware despite having\nlimited exposure. The experimental results demonstrates that the proposed\ntechnique achieves an accuracy of 90% in detecting limitedly seen malware,\nwhich is approximately 3x more than the accuracy attained by state-of-the-art\ntechniques.\n","authors":["Sreenitha Kasarapu","Sanket Shukla","Rakibul Hassan","Avesta Sasan","Houman Homayoun","Sai Manoj Pudukotai Dinakarrao"],"pdf_url":"https://arxiv.org/pdf/2404.02344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11805v2","updated":"2024-04-02T22:35:21Z","published":"2023-12-19T02:39:27Z","title":"Gemini: A Family of Highly Capable Multimodal Models","summary":" This report introduces a new family of multimodal models, Gemini, that\nexhibit remarkable capabilities across image, audio, video, and text\nunderstanding. The Gemini family consists of Ultra, Pro, and Nano sizes,\nsuitable for applications ranging from complex reasoning tasks to on-device\nmemory-constrained use-cases. Evaluation on a broad range of benchmarks shows\nthat our most-capable Gemini Ultra model advances the state of the art in 30 of\n32 of these benchmarks - notably being the first model to achieve human-expert\nperformance on the well-studied exam benchmark MMLU, and improving the state of\nthe art in every one of the 20 multimodal benchmarks we examined. We believe\nthat the new capabilities of the Gemini family in cross-modal reasoning and\nlanguage understanding will enable a wide variety of use cases. 
We discuss our\napproach toward post-training and deploying Gemini models responsibly to users\nthrough services including Gemini, Gemini Advanced, Google AI Studio, and Cloud\nVertex AI.\n","authors":[" Gemini Team","Rohan Anil","Sebastian Borgeaud","Jean-Baptiste Alayrac","Jiahui Yu","Radu Soricut","Johan Schalkwyk","Andrew M. Dai","Anja Hauth","Katie Millican","David Silver","Melvin Johnson","Ioannis Antonoglou","Julian Schrittwieser","Amelia Glaese","Jilin Chen","Emily Pitler","Timothy Lillicrap","Angeliki Lazaridou","Orhan Firat","James Molloy","Michael Isard","Paul R. Barham","Tom Hennigan","Benjamin Lee","Fabio Viola","Malcolm Reynolds","Yuanzhong Xu","Ryan Doherty","Eli Collins","Clemens Meyer","Eliza Rutherford","Erica Moreira","Kareem Ayoub","Megha Goel","Jack Krawczyk","Cosmo Du","Ed Chi","Heng-Tze Cheng","Eric Ni","Purvi Shah","Patrick Kane","Betty Chan","Manaal Faruqui","Aliaksei Severyn","Hanzhao Lin","YaGuang Li","Yong Cheng","Abe Ittycheriah","Mahdis Mahdieh","Mia Chen","Pei Sun","Dustin Tran","Sumit Bagri","Balaji Lakshminarayanan","Jeremiah Liu","Andras Orban","Fabian Güra","Hao Zhou","Xinying Song","Aurelien Boffy","Harish Ganapathy","Steven Zheng","HyunJeong Choe","Ágoston Weisz","Tao Zhu","Yifeng Lu","Siddharth Gopal","Jarrod Kahn","Maciej Kula","Jeff Pitman","Rushin Shah","Emanuel Taropa","Majd Al Merey","Martin Baeuml","Zhifeng Chen","Laurent El Shafey","Yujing Zhang","Olcan Sercinoglu","George Tucker","Enrique Piqueras","Maxim Krikun","Iain Barr","Nikolay Savinov","Ivo Danihelka","Becca Roelofs","Anaïs White","Anders Andreassen","Tamara von Glehn","Lakshman Yagati","Mehran Kazemi","Lucas Gonzalez","Misha Khalman","Jakub Sygnowski","Alexandre Frechette","Charlotte Smith","Laura Culp","Lev Proleev","Yi Luan","Xi Chen","James Lottes","Nathan Schucher","Federico Lebron","Alban Rrustemi","Natalie Clay","Phil Crone","Tomas Kocisky","Jeffrey Zhao","Bartek Perz","Dian Yu","Heidi Howard","Adam Bloniarz","Jack W. Rae","Han Lu","Laurent Sifre","Marcello Maggioni","Fred Alcober","Dan Garrette","Megan Barnes","Shantanu Thakoor","Jacob Austin","Gabriel Barth-Maron","William Wong","Rishabh Joshi","Rahma Chaabouni","Deeni Fatiha","Arun Ahuja","Gaurav Singh Tomar","Evan Senter","Martin Chadwick","Ilya Kornakov","Nithya Attaluri","Iñaki Iturrate","Ruibo Liu","Yunxuan Li","Sarah Cogan","Jeremy Chen","Chao Jia","Chenjie Gu","Qiao Zhang","Jordan Grimstad","Ale Jakse Hartman","Xavier Garcia","Thanumalayan Sankaranarayana Pillai","Jacob Devlin","Michael Laskin","Diego de Las Casas","Dasha Valter","Connie Tao","Lorenzo Blanco","Adrià Puigdomènech Badia","David Reitter","Mianna Chen","Jenny Brennan","Clara Rivera","Sergey Brin","Shariq Iqbal","Gabriela Surita","Jane Labanowski","Abhi Rao","Stephanie Winkler","Emilio Parisotto","Yiming Gu","Kate Olszewska","Ravi Addanki","Antoine Miech","Annie Louis","Denis Teplyashin","Geoff Brown","Elliot Catt","Jan Balaguer","Jackie Xiang","Pidong Wang","Zoe Ashwood","Anton Briukhov","Albert Webson","Sanjay Ganapathy","Smit Sanghavi","Ajay Kannan","Ming-Wei Chang","Axel Stjerngren","Josip Djolonga","Yuting Sun","Ankur Bapna","Matthew Aitchison","Pedram Pejman","Henryk Michalewski","Tianhe Yu","Cindy Wang","Juliette Love","Junwhan Ahn","Dawn Bloxwich","Kehang Han","Peter Humphreys","Thibault Sellam","James Bradbury","Varun Godbole","Sina Samangooei","Bogdan Damoc","Alex Kaskasoli","Sébastien M. R. 
Arnold","Vijay Vasudevan","Shubham Agrawal","Jason Riesa","Dmitry Lepikhin","Richard Tanburn","Srivatsan Srinivasan","Hyeontaek Lim","Sarah Hodkinson","Pranav Shyam","Johan Ferret","Steven Hand","Ankush Garg","Tom Le Paine","Jian Li","Yujia Li","Minh Giang","Alexander Neitz","Zaheer Abbas","Sarah York","Machel Reid","Elizabeth Cole","Aakanksha Chowdhery","Dipanjan Das","Dominika Rogozińska","Vitaliy Nikolaev","Pablo Sprechmann","Zachary Nado","Lukas Zilka","Flavien Prost","Luheng He","Marianne Monteiro","Gaurav Mishra","Chris Welty","Josh Newlan","Dawei Jia","Miltiadis Allamanis","Clara Huiyi Hu","Raoul de Liedekerke","Justin Gilmer","Carl Saroufim","Shruti Rijhwani","Shaobo Hou","Disha Shrivastava","Anirudh Baddepudi","Alex Goldin","Adnan Ozturel","Albin Cassirer","Yunhan Xu","Daniel Sohn","Devendra Sachan","Reinald Kim Amplayo","Craig Swanson","Dessie Petrova","Shashi Narayan","Arthur Guez","Siddhartha Brahma","Jessica Landon","Miteyan Patel","Ruizhe Zhao","Kevin Villela","Luyu Wang","Wenhao Jia","Matthew Rahtz","Mai Giménez","Legg Yeung","James Keeling","Petko Georgiev","Diana Mincu","Boxi Wu","Salem Haykal","Rachel Saputro","Kiran Vodrahalli","James Qin","Zeynep Cankara","Abhanshu Sharma","Nick Fernando","Will Hawkins","Behnam Neyshabur","Solomon Kim","Adrian Hutter","Priyanka Agrawal","Alex Castro-Ros","George van den Driessche","Tao Wang","Fan Yang","Shuo-yiin Chang","Paul Komarek","Ross McIlroy","Mario Lučić","Guodong Zhang","Wael Farhan","Michael Sharman","Paul Natsev","Paul Michel","Yamini Bansal","Siyuan Qiao","Kris Cao","Siamak Shakeri","Christina Butterfield","Justin Chung","Paul Kishan Rubenstein","Shivani Agrawal","Arthur Mensch","Kedar Soparkar","Karel Lenc","Timothy Chung","Aedan Pope","Loren Maggiore","Jackie Kay","Priya Jhakra","Shibo Wang","Joshua Maynez","Mary Phuong","Taylor Tobin","Andrea Tacchetti","Maja Trebacz","Kevin Robinson","Yash Katariya","Sebastian Riedel","Paige Bailey","Kefan Xiao","Nimesh Ghelani","Lora Aroyo","Ambrose Slone","Neil Houlsby","Xuehan Xiong","Zhen Yang","Elena Gribovskaya","Jonas Adler","Mateo Wirth","Lisa Lee","Music Li","Thais Kagohara","Jay Pavagadhi","Sophie Bridgers","Anna Bortsova","Sanjay Ghemawat","Zafarali Ahmed","Tianqi Liu","Richard Powell","Vijay Bolina","Mariko Iinuma","Polina Zablotskaia","James Besley","Da-Woon Chung","Timothy Dozat","Ramona Comanescu","Xiance Si","Jeremy Greer","Guolong Su","Martin Polacek","Raphaël Lopez Kaufman","Simon Tokumine","Hexiang Hu","Elena Buchatskaya","Yingjie Miao","Mohamed Elhawaty","Aditya Siddhant","Nenad Tomasev","Jinwei Xing","Christina Greer","Helen Miller","Shereen Ashraf","Aurko Roy","Zizhao Zhang","Ada Ma","Angelos Filos","Milos Besta","Rory Blevins","Ted Klimenko","Chih-Kuan Yeh","Soravit Changpinyo","Jiaqi Mu","Oscar Chang","Mantas Pajarskas","Carrie Muir","Vered Cohen","Charline Le Lan","Krishna Haridasan","Amit Marathe","Steven Hansen","Sholto Douglas","Rajkumar Samuel","Mingqiu Wang","Sophia Austin","Chang Lan","Jiepu Jiang","Justin Chiu","Jaime Alonso Lorenzo","Lars Lowe Sjösund","Sébastien Cevey","Zach Gleicher","Thi Avrahami","Anudhyan Boral","Hansa Srinivasan","Vittorio Selo","Rhys May","Konstantinos Aisopos","Léonard Hussenot","Livio Baldini Soares","Kate Baumli","Michael B. 
Chang","Adrià Recasens","Ben Caine","Alexander Pritzel","Filip Pavetic","Fabio Pardo","Anita Gergely","Justin Frye","Vinay Ramasesh","Dan Horgan","Kartikeya Badola","Nora Kassner","Subhrajit Roy","Ethan Dyer","Víctor Campos Campos","Alex Tomala","Yunhao Tang","Dalia El Badawy","Elspeth White","Basil Mustafa","Oran Lang","Abhishek Jindal","Sharad Vikram","Zhitao Gong","Sergi Caelles","Ross Hemsley","Gregory Thornton","Fangxiaoyu Feng","Wojciech Stokowiec","Ce Zheng","Phoebe Thacker","Çağlar Ünlü","Zhishuai Zhang","Mohammad Saleh","James Svensson","Max Bileschi","Piyush Patil","Ankesh Anand","Roman Ring","Katerina Tsihlas","Arpi Vezer","Marco Selvi","Toby Shevlane","Mikel Rodriguez","Tom Kwiatkowski","Samira Daruki","Keran Rong","Allan Dafoe","Nicholas FitzGerald","Keren Gu-Lemberg","Mina Khan","Lisa Anne Hendricks","Marie Pellat","Vladimir Feinberg","James Cobon-Kerr","Tara Sainath","Maribeth Rauh","Sayed Hadi Hashemi","Richard Ives","Yana Hasson","Eric Noland","Yuan Cao","Nathan Byrd","Le Hou","Qingze Wang","Thibault Sottiaux","Michela Paganini","Jean-Baptiste Lespiau","Alexandre Moufarek","Samer Hassan","Kaushik Shivakumar","Joost van Amersfoort","Amol Mandhane","Pratik Joshi","Anirudh Goyal","Matthew Tung","Andrew Brock","Hannah Sheahan","Vedant Misra","Cheng Li","Nemanja Rakićević","Mostafa Dehghani","Fangyu Liu","Sid Mittal","Junhyuk Oh","Seb Noury","Eren Sezener","Fantine Huot","Matthew Lamm","Nicola De Cao","Charlie Chen","Sidharth Mudgal","Romina Stella","Kevin Brooks","Gautam Vasudevan","Chenxi Liu","Mainak Chain","Nivedita Melinkeri","Aaron Cohen","Venus Wang","Kristie Seymore","Sergey Zubkov","Rahul Goel","Summer Yue","Sai Krishnakumaran","Brian Albert","Nate Hurley","Motoki Sano","Anhad Mohananey","Jonah Joughin","Egor Filonov","Tomasz Kępa","Yomna Eldawy","Jiawern Lim","Rahul Rishi","Shirin Badiezadegan","Taylor Bos","Jerry Chang","Sanil Jain","Sri Gayatri Sundara Padmanabhan","Subha Puttagunta","Kalpesh Krishna","Leslie Baker","Norbert Kalb","Vamsi Bedapudi","Adam Kurzrok","Shuntong Lei","Anthony Yu","Oren Litvin","Xiang Zhou","Zhichun Wu","Sam Sobell","Andrea Siciliano","Alan Papir","Robby Neale","Jonas Bragagnolo","Tej Toor","Tina Chen","Valentin Anklin","Feiran Wang","Richie Feng","Milad Gholami","Kevin Ling","Lijuan Liu","Jules Walter","Hamid Moghaddam","Arun Kishore","Jakub Adamek","Tyler Mercado","Jonathan Mallinson","Siddhinita Wandekar","Stephen Cagle","Eran Ofek","Guillermo Garrido","Clemens Lombriser","Maksim Mukha","Botu Sun","Hafeezul Rahman Mohammad","Josip Matak","Yadi Qian","Vikas Peswani","Pawel Janus","Quan Yuan","Leif Schelin","Oana David","Ankur Garg","Yifan He","Oleksii Duzhyi","Anton Älgmyr","Timothée Lottaz","Qi Li","Vikas Yadav","Luyao Xu","Alex Chinien","Rakesh Shivanna","Aleksandr Chuklin","Josie Li","Carrie Spadine","Travis Wolfe","Kareem Mohamed","Subhabrata Das","Zihang Dai","Kyle He","Daniel von Dincklage","Shyam Upadhyay","Akanksha Maurya","Luyan Chi","Sebastian Krause","Khalid Salama","Pam G Rabinovitch","Pavan Kumar Reddy M","Aarush Selvan","Mikhail Dektiarev","Golnaz Ghiasi","Erdem Guven","Himanshu Gupta","Boyi Liu","Deepak Sharma","Idan Heimlich Shtacher","Shachi Paul","Oscar Akerlund","François-Xavier Aubet","Terry Huang","Chen Zhu","Eric Zhu","Elico Teixeira","Matthew Fritze","Francesco Bertolini","Liana-Eleonora Marinescu","Martin Bölle","Dominik Paulus","Khyatti Gupta","Tejasi Latkar","Max Chang","Jason Sanders","Roopa Wilson","Xuewei Wu","Yi-Xuan Tan","Lam Nguyen Thiet","Tulsee Doshi","Sid Lall","Swaroop Mishra","Wanming Chen","Thang 
Luong","Seth Benjamin","Jasmine Lee","Ewa Andrejczuk","Dominik Rabiej","Vipul Ranjan","Krzysztof Styrc","Pengcheng Yin","Jon Simon","Malcolm Rose Harriott","Mudit Bansal","Alexei Robsky","Geoff Bacon","David Greene","Daniil Mirylenka","Chen Zhou","Obaid Sarvana","Abhimanyu Goyal","Samuel Andermatt","Patrick Siegler","Ben Horn","Assaf Israel","Francesco Pongetti","Chih-Wei \"Louis\" Chen","Marco Selvatici","Pedro Silva","Kathie Wang","Jackson Tolins","Kelvin Guu","Roey Yogev","Xiaochen Cai","Alessandro Agostini","Maulik Shah","Hung Nguyen","Noah Ó Donnaile","Sébastien Pereira","Linda Friso","Adam Stambler","Adam Kurzrok","Chenkai Kuang","Yan Romanikhin","Mark Geller","ZJ Yan","Kane Jang","Cheng-Chun Lee","Wojciech Fica","Eric Malmi","Qijun Tan","Dan Banica","Daniel Balle","Ryan Pham","Yanping Huang","Diana Avram","Hongzhi Shi","Jasjot Singh","Chris Hidey","Niharika Ahuja","Pranab Saxena","Dan Dooley","Srividya Pranavi Potharaju","Eileen O'Neill","Anand Gokulchandran","Ryan Foley","Kai Zhao","Mike Dusenberry","Yuan Liu","Pulkit Mehta","Ragha Kotikalapudi","Chalence Safranek-Shrader","Andrew Goodman","Joshua Kessinger","Eran Globen","Prateek Kolhar","Chris Gorgolewski","Ali Ibrahim","Yang Song","Ali Eichenbaum","Thomas Brovelli","Sahitya Potluri","Preethi Lahoti","Cip Baetu","Ali Ghorbani","Charles Chen","Andy Crawford","Shalini Pal","Mukund Sridhar","Petru Gurita","Asier Mujika","Igor Petrovski","Pierre-Louis Cedoz","Chenmei Li","Shiyuan Chen","Niccolò Dal Santo","Siddharth Goyal","Jitesh Punjabi","Karthik Kappaganthu","Chester Kwak","Pallavi LV","Sarmishta Velury","Himadri Choudhury","Jamie Hall","Premal Shah","Ricardo Figueira","Matt Thomas","Minjie Lu","Ting Zhou","Chintu Kumar","Thomas Jurdi","Sharat Chikkerur","Yenai Ma","Adams Yu","Soo Kwak","Victor Ähdel","Sujeevan Rajayogam","Travis Choma","Fei Liu","Aditya Barua","Colin Ji","Ji Ho Park","Vincent Hellendoorn","Alex Bailey","Taylan Bilal","Huanjie Zhou","Mehrdad Khatir","Charles Sutton","Wojciech Rzadkowski","Fiona Macintosh","Konstantin Shagin","Paul Medina","Chen Liang","Jinjing Zhou","Pararth Shah","Yingying Bi","Attila Dankovics","Shipra Banga","Sabine Lehmann","Marissa Bredesen","Zifan Lin","John Eric Hoffmann","Jonathan Lai","Raynald Chung","Kai Yang","Nihal Balani","Arthur Bražinskas","Andrei Sozanschi","Matthew Hayes","Héctor Fernández Alcalde","Peter Makarov","Will Chen","Antonio Stella","Liselotte Snijders","Michael Mandl","Ante Kärrman","Paweł Nowak","Xinyi Wu","Alex Dyck","Krishnan Vaidyanathan","Raghavender R","Jessica Mallet","Mitch Rudominer","Eric Johnston","Sushil Mittal","Akhil Udathu","Janara Christensen","Vishal Verma","Zach Irving","Andreas Santucci","Gamaleldin Elsayed","Elnaz Davoodi","Marin Georgiev","Ian Tenney","Nan Hua","Geoffrey Cideron","Edouard Leurent","Mahmoud Alnahlawi","Ionut Georgescu","Nan Wei","Ivy Zheng","Dylan Scandinaro","Heinrich Jiang","Jasper Snoek","Mukund Sundararajan","Xuezhi Wang","Zack Ontiveros","Itay Karo","Jeremy Cole","Vinu Rajashekhar","Lara Tumeh","Eyal Ben-David","Rishub Jain","Jonathan Uesato","Romina Datta","Oskar Bunyan","Shimu Wu","John Zhang","Piotr Stanczyk","Ye Zhang","David Steiner","Subhajit Naskar","Michael Azzam","Matthew Johnson","Adam Paszke","Chung-Cheng Chiu","Jaume Sanchez Elias","Afroz Mohiuddin","Faizan Muhammad","Jin Miao","Andrew Lee","Nino Vieillard","Jane Park","Jiageng Zhang","Jeff Stanway","Drew Garmon","Abhijit Karmarkar","Zhe Dong","Jong Lee","Aviral Kumar","Luowei Zhou","Jonathan Evens","William Isaac","Geoffrey Irving","Edward Loper","Michael 
Fink","Isha Arkatkar","Nanxin Chen","Izhak Shafran","Ivan Petrychenko","Zhe Chen","Johnson Jia","Anselm Levskaya","Zhenkai Zhu","Peter Grabowski","Yu Mao","Alberto Magni","Kaisheng Yao","Javier Snaider","Norman Casagrande","Evan Palmer","Paul Suganthan","Alfonso Castaño","Irene Giannoumis","Wooyeol Kim","Mikołaj Rybiński","Ashwin Sreevatsa","Jennifer Prendki","David Soergel","Adrian Goedeckemeyer","Willi Gierke","Mohsen Jafari","Meenu Gaba","Jeremy Wiesner","Diana Gage Wright","Yawen Wei","Harsha Vashisht","Yana Kulizhskaya","Jay Hoover","Maigo Le","Lu Li","Chimezie Iwuanyanwu","Lu Liu","Kevin Ramirez","Andrey Khorlin","Albert Cui","Tian LIN","Marcus Wu","Ricardo Aguilar","Keith Pallo","Abhishek Chakladar","Ginger Perng","Elena Allica Abellan","Mingyang Zhang","Ishita Dasgupta","Nate Kushman","Ivo Penchev","Alena Repina","Xihui Wu","Tom van der Weide","Priya Ponnapalli","Caroline Kaplan","Jiri Simsa","Shuangfeng Li","Olivier Dousse","Fan Yang","Jeff Piper","Nathan Ie","Rama Pasumarthi","Nathan Lintz","Anitha Vijayakumar","Daniel Andor","Pedro Valenzuela","Minnie Lui","Cosmin Paduraru","Daiyi Peng","Katherine Lee","Shuyuan Zhang","Somer Greene","Duc Dung Nguyen","Paula Kurylowicz","Cassidy Hardin","Lucas Dixon","Lili Janzer","Kiam Choo","Ziqiang Feng","Biao Zhang","Achintya Singhal","Dayou Du","Dan McKinnon","Natasha Antropova","Tolga Bolukbasi","Orgad Keller","David Reid","Daniel Finchelstein","Maria Abi Raad","Remi Crocker","Peter Hawkins","Robert Dadashi","Colin Gaffney","Ken Franko","Anna Bulanova","Rémi Leblond","Shirley Chung","Harry Askham","Luis C. Cobo","Kelvin Xu","Felix Fischer","Jun Xu","Christina Sorokin","Chris Alberti","Chu-Cheng Lin","Colin Evans","Alek Dimitriev","Hannah Forbes","Dylan Banarse","Zora Tung","Mark Omernick","Colton Bishop","Rachel Sterneck","Rohan Jain","Jiawei Xia","Ehsan Amid","Francesco Piccinno","Xingyu Wang","Praseem Banzal","Daniel J. 
Mankowitz","Alex Polozov","Victoria Krakovna","Sasha Brown","MohammadHossein Bateni","Dennis Duan","Vlad Firoiu","Meghana Thotakuri","Tom Natan","Matthieu Geist","Ser tan Girgin","Hui Li","Jiayu Ye","Ofir Roval","Reiko Tojo","Michael Kwong","James Lee-Thorp","Christopher Yew","Danila Sinopalnikov","Sabela Ramos","John Mellor","Abhishek Sharma","Kathy Wu","David Miller","Nicolas Sonnerat","Denis Vnukov","Rory Greig","Jennifer Beattie","Emily Caveness","Libin Bai","Julian Eisenschlos","Alex Korchemniy","Tomy Tsai","Mimi Jasarevic","Weize Kong","Phuong Dao","Zeyu Zheng","Frederick Liu","Fan Yang","Rui Zhu","Tian Huey Teh","Jason Sanmiya","Evgeny Gladchenko","Nejc Trdin","Daniel Toyama","Evan Rosen","Sasan Tavakkol","Linting Xue","Chen Elkind","Oliver Woodman","John Carpenter","George Papamakarios","Rupert Kemp","Sushant Kafle","Tanya Grunina","Rishika Sinha","Alice Talbert","Diane Wu","Denese Owusu-Afriyie","Cosmo Du","Chloe Thornton","Jordi Pont-Tuset","Pradyumna Narayana","Jing Li","Saaber Fatehi","John Wieting","Omar Ajmeri","Benigno Uria","Yeongil Ko","Laura Knight","Amélie Héliou","Ning Niu","Shane Gu","Chenxi Pang","Yeqing Li","Nir Levine","Ariel Stolovich","Rebeca Santamaria-Fernandez","Sonam Goenka","Wenny Yustalim","Robin Strudel","Ali Elqursh","Charlie Deck","Hyo Lee","Zonglin Li","Kyle Levin","Raphael Hoffmann","Dan Holtmann-Rice","Olivier Bachem","Sho Arora","Christy Koh","Soheil Hassas Yeganeh","Siim Põder","Mukarram Tariq","Yanhua Sun","Lucian Ionita","Mojtaba Seyedhosseini","Pouya Tafti","Zhiyu Liu","Anmol Gulati","Jasmine Liu","Xinyu Ye","Bart Chrzaszcz","Lily Wang","Nikhil Sethi","Tianrun Li","Ben Brown","Shreya Singh","Wei Fan","Aaron Parisi","Joe Stanton","Vinod Koverkathu","Christopher A. Choquette-Choo","Yunjie Li","TJ Lu","Abe Ittycheriah","Prakash Shroff","Mani Varadarajan","Sanaz Bahargam","Rob Willoughby","David Gaddy","Guillaume Desjardins","Marco Cornero","Brona Robenek","Bhavishya Mittal","Ben Albrecht","Ashish Shenoy","Fedor Moiseev","Henrik Jacobsson","Alireza Ghaffarkhah","Morgane Rivière","Alanna Walton","Clément Crepy","Alicia Parrish","Zongwei Zhou","Clement Farabet","Carey Radebaugh","Praveen Srinivasan","Claudia van der Salm","Andreas Fidjeland","Salvatore Scellato","Eri Latorre-Chimoto","Hanna Klimczak-Plucińska","David Bridson","Dario de Cesare","Tom Hudson","Piermaria Mendolicchio","Lexi Walker","Alex Morris","Matthew Mauger","Alexey Guseynov","Alison Reid","Seth Odoom","Lucia Loher","Victor Cotruta","Madhavi Yenugula","Dominik Grewe","Anastasia Petrushkina","Tom Duerig","Antonio Sanchez","Steve Yadlowsky","Amy Shen","Amir Globerson","Lynette Webb","Sahil Dua","Dong Li","Surya Bhupatiraju","Dan Hurt","Haroon Qureshi","Ananth Agarwal","Tomer Shani","Matan Eyal","Anuj Khare","Shreyas Rammohan Belle","Lei Wang","Chetan Tekur","Mihir Sanjay Kale","Jinliang Wei","Ruoxin Sang","Brennan Saeta","Tyler Liechty","Yi Sun","Yao Zhao","Stephan Lee","Pandu Nayak","Doug Fritz","Manish Reddy Vuyyuru","John Aslanides","Nidhi Vyas","Martin Wicke","Xiao Ma","Evgenii Eltyshev","Nina Martin","Hardie Cate","James Manyika","Keyvan Amiri","Yelin Kim","Xi Xiong","Kai Kang","Florian Luisier","Nilesh Tripuraneni","David Madras","Mandy Guo","Austin Waters","Oliver Wang","Joshua Ainslie","Jason Baldridge","Han Zhang","Garima Pruthi","Jakob Bauer","Feng Yang","Riham Mansour","Jason Gelman","Yang Xu","George Polovets","Ji Liu","Honglong Cai","Warren Chen","XiangHai Sheng","Emily Xue","Sherjil Ozair","Christof Angermueller","Xiaowei Li","Anoop Sinha","Weiren Wang","Julia 
Wiesinger","Emmanouil Koukoumidis","Yuan Tian","Anand Iyer","Madhu Gurumurthy","Mark Goldenson","Parashar Shah","MK Blake","Hongkun Yu","Anthony Urbanowicz","Jennimaria Palomaki","Chrisantha Fernando","Ken Durden","Harsh Mehta","Nikola Momchev","Elahe Rahimtoroghi","Maria Georgaki","Amit Raul","Sebastian Ruder","Morgan Redshaw","Jinhyuk Lee","Denny Zhou","Komal Jalan","Dinghua Li","Blake Hechtman","Parker Schuh","Milad Nasr","Kieran Milan","Vladimir Mikulik","Juliana Franco","Tim Green","Nam Nguyen","Joe Kelley","Aroma Mahendru","Andrea Hu","Joshua Howland","Ben Vargas","Jeffrey Hui","Kshitij Bansal","Vikram Rao","Rakesh Ghiya","Emma Wang","Ke Ye","Jean Michel Sarr","Melanie Moranski Preston","Madeleine Elish","Steve Li","Aakash Kaku","Jigar Gupta","Ice Pasupat","Da-Cheng Juan","Milan Someswar","Tejvi M.","Xinyun Chen","Aida Amini","Alex Fabrikant","Eric Chu","Xuanyi Dong","Amruta Muthal","Senaka Buthpitiya","Sarthak Jauhari","Nan Hua","Urvashi Khandelwal","Ayal Hitron","Jie Ren","Larissa Rinaldi","Shahar Drath","Avigail Dabush","Nan-Jiang Jiang","Harshal Godhia","Uli Sachs","Anthony Chen","Yicheng Fan","Hagai Taitelbaum","Hila Noga","Zhuyun Dai","James Wang","Chen Liang","Jenny Hamer","Chun-Sung Ferng","Chenel Elkind","Aviel Atias","Paulina Lee","Vít Listík","Mathias Carlen","Jan van de Kerkhof","Marcin Pikus","Krunoslav Zaher","Paul Müller","Sasha Zykova","Richard Stefanec","Vitaly Gatsko","Christoph Hirnschall","Ashwin Sethi","Xingyu Federico Xu","Chetan Ahuja","Beth Tsai","Anca Stefanoiu","Bo Feng","Keshav Dhandhania","Manish Katyal","Akshay Gupta","Atharva Parulekar","Divya Pitta","Jing Zhao","Vivaan Bhatia","Yashodha Bhavnani","Omar Alhadlaq","Xiaolin Li","Peter Danenberg","Dennis Tu","Alex Pine","Vera Filippova","Abhipso Ghosh","Ben Limonchik","Bhargava Urala","Chaitanya Krishna Lanka","Derik Clive","Yi Sun","Edward Li","Hao Wu","Kevin Hongtongsak","Ianna Li","Kalind Thakkar","Kuanysh Omarov","Kushal Majmundar","Michael Alverson","Michael Kucharski","Mohak Patel","Mudit Jain","Maksim Zabelin","Paolo Pelagatti","Rohan Kohli","Saurabh Kumar","Joseph Kim","Swetha Sankar","Vineet Shah","Lakshmi Ramachandruni","Xiangkai Zeng","Ben Bariach","Laura Weidinger","Amar Subramanya","Sissie Hsiao","Demis Hassabis","Koray Kavukcuoglu","Adam Sadovsky","Quoc Le","Trevor Strohman","Yonghui Wu","Slav Petrov","Jeffrey Dean","Oriol Vinyals"],"pdf_url":"https://arxiv.org/pdf/2312.11805v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04076v2","updated":"2024-04-02T21:47:35Z","published":"2023-12-07T06:43:34Z","title":"Large Language Models are Good Prompt Learners for Low-Shot Image\n Classification","summary":" Low-shot image classification, where training images are limited or\ninaccessible, has benefited from recent progress on pre-trained vision-language\n(VL) models with strong generalizability, e.g. CLIP. Prompt learning methods\nbuilt with VL models generate text features from the class names that only have\nconfined class-specific information. Large Language Models (LLMs), with their\nvast encyclopedic knowledge, emerge as the complement. Thus, in this paper, we\ndiscuss the integration of LLMs to enhance pre-trained VL models, specifically\non low-shot classification. However, the domain gap between language and vision\nblocks the direct application of LLMs. Thus, we propose LLaMP, Large Language\nModels as Prompt learners, that produces adaptive prompts for the CLIP text\nencoder, establishing it as the connecting bridge. 
Experiments show that,\ncompared with other state-of-the-art prompt learning methods, LLaMP yields\nbetter performance on both zero-shot generalization and few-shot image\nclassification, over a spectrum of 11 datasets. Code will be made available at:\nhttps://github.com/zhaohengz/LLaMP.\n","authors":["Zhaoheng Zheng","Jingmin Wei","Xuefeng Hu","Haidong Zhu","Ram Nevatia"],"pdf_url":"https://arxiv.org/pdf/2312.04076v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2210.04936v3","updated":"2024-04-02T21:45:06Z","published":"2022-10-10T18:09:35Z","title":"EarthNets: Empowering AI in Earth Observation","summary":" Earth observation (EO), aiming at monitoring the state of planet Earth using\nremote sensing data, is critical for improving our daily lives and living\nenvironment. With a growing number of satellites in orbit, an increasing number\nof datasets with diverse sensors and research domains are being published to\nfacilitate the research of the remote sensing community. This paper presents a\ncomprehensive review of more than 500 publicly published datasets, including\nresearch domains like agriculture, land use and land cover, disaster\nmonitoring, scene understanding, vision-language models, foundation models,\nclimate change, and weather forecasting. We systematically analyze these EO\ndatasets from four aspects: volume, resolution distributions, research domains,\nand the correlation between datasets. Based on the dataset attributes, we\npropose to measure, rank, and select datasets to build a new benchmark for\nmodel evaluation. Furthermore, a new platform for EO, termed EarthNets, is\nreleased to achieve a fair and consistent evaluation of deep learning methods\non remote sensing data. EarthNets supports standard dataset libraries and\ncutting-edge deep learning models to bridge the gap between the remote sensing\nand machine learning communities. Based on this platform, extensive\ndeep-learning methods are evaluated on the new benchmark. The insightful\nresults are beneficial to future research. The platform and dataset collections\nare publicly available at https://earthnets.github.io.\n","authors":["Zhitong Xiong","Fahong Zhang","Yi Wang","Yilei Shi","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2210.04936v3.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2311.17919v2","updated":"2024-04-02T21:34:29Z","published":"2023-11-29T18:59:59Z","title":"Visual Anagrams: Generating Multi-View Optical Illusions with Diffusion\n Models","summary":" We address the problem of synthesizing multi-view optical illusions: images\nthat change appearance upon a transformation, such as a flip or rotation. We\npropose a simple, zero-shot method for obtaining these illusions from\noff-the-shelf text-to-image diffusion models. During the reverse diffusion\nprocess, we estimate the noise from different views of a noisy image, and then\ncombine these noise estimates together and denoise the image. A theoretical\nanalysis suggests that this method works precisely for views that can be\nwritten as orthogonal transformations, of which permutations are a subset. This\nleads to the idea of a visual anagram--an image that changes appearance under\nsome rearrangement of pixels. This includes rotations and flips, but also more\nexotic pixel permutations such as a jigsaw rearrangement. Our approach also\nnaturally extends to illusions with more than two views. We provide both\nqualitative and quantitative results demonstrating the effectiveness and\nflexibility of our method. 
Please see our project webpage for additional\nvisualizations and results: https://dangeng.github.io/visual_anagrams/\n","authors":["Daniel Geng","Inbum Park","Andrew Owens"],"pdf_url":"https://arxiv.org/pdf/2311.17919v2.pdf","comment":"CVPR 2024 camera ready"},{"id":"http://arxiv.org/abs/2310.20436v2","updated":"2024-04-02T21:01:56Z","published":"2023-10-31T13:15:49Z","title":"SignAvatars: A Large-scale 3D Sign Language Holistic Motion Dataset and\n Benchmark","summary":" We present SignAvatars, the first large-scale, multi-prompt 3D sign language\n(SL) motion dataset designed to bridge the communication gap for Deaf and\nhard-of-hearing individuals. While there has been an exponentially growing\nnumber of research regarding digital communication, the majority of existing\ncommunication technologies primarily cater to spoken or written languages,\ninstead of SL, the essential communication method for Deaf and hard-of-hearing\ncommunities. Existing SL datasets, dictionaries, and sign language production\n(SLP) methods are typically limited to 2D as annotating 3D models and avatars\nfor SL is usually an entirely manual and labor-intensive process conducted by\nSL experts, often resulting in unnatural avatars. In response to these\nchallenges, we compile and curate the SignAvatars dataset, which comprises\n70,000 videos from 153 signers, totaling 8.34 million frames, covering both\nisolated signs and continuous, co-articulated signs, with multiple prompts\nincluding HamNoSys, spoken language, and words. To yield 3D holistic\nannotations, including meshes and biomechanically-valid poses of body, hands,\nand face, as well as 2D and 3D keypoints, we introduce an automated annotation\npipeline operating on our large corpus of SL videos. SignAvatars facilitates\nvarious tasks such as 3D sign language recognition (SLR) and the novel 3D SL\nproduction (SLP) from diverse inputs like text scripts, individual words, and\nHamNoSys notation. Hence, to evaluate the potential of SignAvatars, we further\npropose a unified benchmark of 3D SL holistic motion production. We believe\nthat this work is a significant step forward towards bringing the digital world\nto the Deaf and hard-of-hearing communities as well as people interacting with\nthem.\n","authors":["Zhengdi Yu","Shaoli Huang","Yongkang Cheng","Tolga Birdal"],"pdf_url":"https://arxiv.org/pdf/2310.20436v2.pdf","comment":"14 pages; Project page available at https://signavatars.github.io/"},{"id":"http://arxiv.org/abs/2402.15276v3","updated":"2024-04-02T20:54:46Z","published":"2024-02-23T11:47:16Z","title":"CFIR: Fast and Effective Long-Text To Image Retrieval for Large Corpora","summary":" Text-to-image retrieval aims to find the relevant images based on a text\nquery, which is important in various use-cases, such as digital libraries,\ne-commerce, and multimedia databases. Although Multimodal Large Language Models\n(MLLMs) demonstrate state-of-the-art performance, they exhibit limitations in\nhandling large-scale, diverse, and ambiguous real-world needs of retrieval, due\nto the computation cost and the injective embeddings they produce. This paper\npresents a two-stage Coarse-to-Fine Index-shared Retrieval (CFIR) framework,\ndesigned for fast and effective large-scale long-text to image retrieval. The\nfirst stage, Entity-based Ranking (ER), adapts to long-text query ambiguity by\nemploying a multiple-queries-to-multiple-targets paradigm, facilitating\ncandidate filtering for the next stage. 
The second stage, Summary-based\nRe-ranking (SR), refines these rankings using summarized queries. We also\npropose a specialized Decoupling-BEiT-3 encoder, optimized for handling\nambiguous user needs and both stages, which also enhances computational\nefficiency through vector-based similarity inference. Evaluation on the AToMiC\ndataset reveals that CFIR surpasses existing MLLMs by up to 11.06% in\nRecall@1000, while reducing training and retrieval times by 68.75% and 99.79%,\nrespectively. We will release our code to facilitate future research at\nhttps://github.com/longkukuhi/CFIR.\n","authors":["Zijun Long","Xuri Ge","Richard Mccreadie","Joemon Jose"],"pdf_url":"https://arxiv.org/pdf/2402.15276v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13127v3","updated":"2024-04-02T20:42:51Z","published":"2023-11-22T03:31:31Z","title":"MetaCloak: Preventing Unauthorized Subject-driven Text-to-image\n Diffusion-based Synthesis via Meta-learning","summary":" Text-to-image diffusion models allow seamless generation of personalized\nimages from scant reference photos. Yet, these tools, in the wrong hands, can\nfabricate misleading or harmful content, endangering individuals. To address\nthis problem, existing poisoning-based approaches perturb user images in an\nimperceptible way to render them \"unlearnable\" from malicious uses. We identify\ntwo limitations of these defending approaches: i) sub-optimal due to the\nhand-crafted heuristics for solving the intractable bilevel optimization and\nii) lack of robustness against simple data transformations like Gaussian\nfiltering. To solve these challenges, we propose MetaCloak, which solves the\nbi-level poisoning problem with a meta-learning framework with an additional\ntransformation sampling process to craft transferable and robust perturbation.\nSpecifically, we employ a pool of surrogate diffusion models to craft\ntransferable and model-agnostic perturbation. Furthermore, by incorporating an\nadditional transformation process, we design a simple denoising-error\nmaximization loss that is sufficient for causing transformation-robust semantic\ndistortion and degradation in a personalized generation. Extensive experiments\non the VGGFace2 and CelebA-HQ datasets show that MetaCloak outperforms existing\napproaches. Notably, MetaCloak can successfully fool online training services\nlike Replicate, in a black-box manner, demonstrating the effectiveness of\nMetaCloak in real-world scenarios. Our code is available at\nhttps://github.com/liuyixin-louis/MetaCloak.\n","authors":["Yixin Liu","Chenrui Fan","Yutong Dai","Xun Chen","Pan Zhou","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2311.13127v3.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02287v1","updated":"2024-04-02T20:29:59Z","published":"2024-04-02T20:29:59Z","title":"One Noise to Rule Them All: Multi-View Adversarial Attacks with\n Universal Perturbation","summary":" This paper presents a novel universal perturbation method for generating\nrobust multi-view adversarial examples in 3D object recognition. Unlike\nconventional attacks limited to single views, our approach operates on multiple\n2D images, offering a practical and scalable solution for enhancing model\nscalability and robustness. 
This generalizable method bridges the gap between\n2D perturbations and 3D-like attack capabilities, making it suitable for\nreal-world applications.\n Existing adversarial attacks may become ineffective when images undergo\ntransformations like changes in lighting, camera position, or natural\ndeformations. We address this challenge by crafting a single universal noise\nperturbation applicable to various object views. Experiments on diverse\nrendered 3D objects demonstrate the effectiveness of our approach. The\nuniversal perturbation successfully identified a single adversarial noise for\neach given set of 3D object renders from multiple poses and viewpoints.\nCompared to single-view attacks, our universal attacks lower classification\nconfidence across multiple viewing angles, especially at low noise levels. A\nsample implementation is made available at\nhttps://github.com/memoatwit/UniversalPerturbation.\n","authors":["Mehmet Ergezer","Phat Duong","Christian Green","Tommy Nguyen","Abdurrahman Zeybey"],"pdf_url":"https://arxiv.org/pdf/2404.02287v1.pdf","comment":"6 pages, 4 figures, presented at ICAIA, Springer to publish under\n Algorithms for Intelligent Systems"},{"id":"http://arxiv.org/abs/2404.02285v1","updated":"2024-04-02T20:23:10Z","published":"2024-04-02T20:23:10Z","title":"LP++: A Surprisingly Strong Linear Probe for Few-Shot CLIP","summary":" In a recent, strongly emergent literature on few-shot CLIP adaptation, Linear\nProbe (LP) has been often reported as a weak baseline. This has motivated\nintensive research building convoluted prompt learning or feature adaptation\nstrategies. In this work, we propose and examine from convex-optimization\nperspectives a generalization of the standard LP baseline, in which the linear\nclassifier weights are learnable functions of the text embedding, with\nclass-wise multipliers blending image and text knowledge. As our objective\nfunction depends on two types of variables, i.e., the class visual prototypes\nand the learnable blending parameters, we propose a computationally efficient\nblock coordinate Majorize-Minimize (MM) descent algorithm. In our full-batch MM\noptimizer, which we coin LP++, step sizes are implicit, unlike standard\ngradient descent practices where learning rates are intensively searched over\nvalidation sets. By examining the mathematical properties of our loss (e.g.,\nLipschitz gradient continuity), we build majorizing functions yielding\ndata-driven learning rates and derive approximations of the loss's minima,\nwhich provide data-informed initialization of the variables. Our image-language\nobjective function, along with these non-trivial optimization insights and\ningredients, yields, surprisingly, highly competitive few-shot CLIP\nperformances. Furthermore, LP++ operates in black-box, relaxes intensive\nvalidation searches for the optimization hyper-parameters, and runs\norders-of-magnitudes faster than state-of-the-art few-shot CLIP adaptation\nmethods. 
Our code is available at:\n\\url{https://github.com/FereshteShakeri/FewShot-CLIP-Strong-Baseline.git}.\n","authors":["Yunshi Huang","Fereshteh Shakeri","Jose Dolz","Malik Boudiaf","Houda Bahig","Ismail Ben Ayed"],"pdf_url":"https://arxiv.org/pdf/2404.02285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02282v1","updated":"2024-04-02T20:15:43Z","published":"2024-04-02T20:15:43Z","title":"Smooth Deep Saliency","summary":" In this work, we investigate methods to reduce the noise in deep saliency\nmaps coming from convolutional downsampling, with the purpose of explaining how\na deep learning model detects tumors in scanned histological tissue samples.\nThose methods make the investigated models more interpretable for\ngradient-based saliency maps, computed in hidden layers. We test our approach\non different models trained for image classification on ImageNet1K, and models\ntrained for tumor detection on Camelyon16 and in-house real-world digital\npathology scans of stained tissue samples. Our results show that the\ncheckerboard noise in the gradient gets reduced, resulting in smoother and\ntherefore easier to interpret saliency maps.\n","authors":["Rudolf Herdt","Maximilian Schmidt","Daniel Otero Baguer","Peter Maaß"],"pdf_url":"https://arxiv.org/pdf/2404.02282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11311v2","updated":"2024-04-02T20:13:03Z","published":"2024-01-20T19:50:51Z","title":"A Novel Benchmark for Few-Shot Semantic Segmentation in the Era of\n Foundation Models","summary":" In recent years, the rapid evolution of computer vision has seen the\nemergence of various foundation models, each tailored to specific data types\nand tasks. In this study, we explore the adaptation of these models for\nfew-shot semantic segmentation. Specifically, we conduct a comprehensive\ncomparative analysis of four prominent foundation models: DINO V2, Segment\nAnything, CLIP, Masked AutoEncoders, and of a straightforward ResNet50\npre-trained on the COCO dataset. We also include 5 adaptation methods, ranging\nfrom linear probing to fine tuning. Our findings show that DINO V2 outperforms\nother models by a large margin, across various datasets and adaptation methods.\nOn the other hand, adaptation methods provide little discrepancy in the\nobtained results, suggesting that a simple linear probing can compete with\nadvanced, more computationally intensive, alternatives\n","authors":["Reda Bensaid","Vincent Gripon","François Leduc-Primeau","Lukas Mauch","Ghouthi Boukli Hacene","Fabien Cardinaux"],"pdf_url":"https://arxiv.org/pdf/2401.11311v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02150v2","updated":"2024-04-02T20:12:20Z","published":"2023-12-04T18:59:32Z","title":"Readout Guidance: Learning Control from Diffusion Features","summary":" We present Readout Guidance, a method for controlling text-to-image diffusion\nmodels with learned signals. Readout Guidance uses readout heads, lightweight\nnetworks trained to extract signals from the features of a pre-trained, frozen\ndiffusion model at every timestep. These readouts can encode single-image\nproperties, such as pose, depth, and edges; or higher-order properties that\nrelate multiple images, such as correspondence and appearance similarity.\nFurthermore, by comparing the readout estimates to a user-defined target, and\nback-propagating the gradient through the readout head, these estimates can be\nused to guide the sampling process. 
Compared to prior methods for conditional\ngeneration, Readout Guidance requires significantly fewer added parameters and\ntraining samples, and offers a convenient and simple recipe for reproducing\ndifferent forms of conditional control under a single framework, with a single\narchitecture and sampling procedure. We showcase these benefits in the\napplications of drag-based manipulation, identity-consistent generation, and\nspatially aligned control. Project page: https://readout-guidance.github.io.\n","authors":["Grace Luo","Trevor Darrell","Oliver Wang","Dan B Goldman","Aleksander Holynski"],"pdf_url":"https://arxiv.org/pdf/2312.02150v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.00968v2","updated":"2024-04-02T19:57:32Z","published":"2023-12-01T23:04:27Z","title":"Omni-SMoLA: Boosting Generalist Multimodal Models with Soft Mixture of\n Low-rank Experts","summary":" Large multi-modal models (LMMs) exhibit remarkable performance across\nnumerous tasks. However, generalist LMMs often suffer from performance\ndegradation when tuned over a large collection of tasks. Recent research\nsuggests that Mixture of Experts (MoE) architectures are useful for instruction\ntuning, but for LMMs of parameter size around O(50-100B), the prohibitive cost\nof replicating and storing the expert models severely limits the number of\nexperts we can use. We propose Omni-SMoLA, an architecture that uses the Soft\nMoE approach to (softly) mix many multimodal low rank experts, and avoids\nintroducing a significant number of new parameters compared to conventional MoE\nmodels. The core intuition here is that the large model provides a foundational\nbackbone, while different lightweight experts residually learn specialized\nknowledge, either per-modality or multimodally. Extensive experiments\ndemonstrate that the SMoLA approach helps improve the generalist performance\nacross a broad range of generative vision-and-language tasks, achieving new\nSoTA generalist performance that often matches or outperforms single\nspecialized LMM baselines, as well as new SoTA specialist performance.\n","authors":["Jialin Wu","Xia Hu","Yaqing Wang","Bo Pang","Radu Soricut"],"pdf_url":"https://arxiv.org/pdf/2312.00968v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02263v1","updated":"2024-04-02T19:37:58Z","published":"2024-04-02T19:37:58Z","title":"OFMPNet: Deep End-to-End Model for Occupancy and Flow Prediction in\n Urban Environment","summary":" The task of motion prediction is pivotal for autonomous driving systems,\nproviding crucial data to choose a vehicle behavior strategy within its\nsurroundings. Existing motion prediction techniques primarily focus on\npredicting the future trajectory of each agent in the scene individually,\nutilizing its past trajectory data. In this paper, we introduce an end-to-end\nneural network methodology designed to predict the future behaviors of all\ndynamic objects in the environment. This approach leverages the occupancy map\nand the scene's motion flow. We are investigatin various alternatives for\nconstructing a deep encoder-decoder model called OFMPNet. This model uses a\nsequence of bird's-eye-view road images, occupancy grid, and prior motion flow\nas input data. The encoder of the model can incorporate transformer,\nattention-based, or convolutional units. The decoder considers the use of both\nconvolutional modules and recurrent blocks. 
Additionally, we propose a novel\ntime-weighted motion flow loss, whose application has shown a substantial\ndecrease in end-point error. Our approach has achieved state-of-the-art results\non the Waymo Occupancy and Flow Prediction benchmark, with a Soft IoU of 52.1%\nand an AUC of 76.75% on Flow-Grounded Occupancy.\n","authors":["Youshaa Murhij","Dmitry Yudin"],"pdf_url":"https://arxiv.org/pdf/2404.02263v1.pdf","comment":"Accepted in Neurocomputing journal - 2024"},{"id":"http://arxiv.org/abs/2404.02257v1","updated":"2024-04-02T19:25:04Z","published":"2024-04-02T19:25:04Z","title":"SnAG: Scalable and Accurate Video Grounding","summary":" Temporal grounding of text descriptions in videos is a central problem in\nvision-language learning and video understanding. Existing methods often\nprioritize accuracy over scalability -- they have been optimized for grounding\nonly a few text queries within short videos, and fail to scale up to long\nvideos with hundreds of queries. In this paper, we study the effect of\ncross-modal fusion on the scalability of video grounding models. Our analysis\nestablishes late fusion as a more cost-effective fusion scheme for long-form\nvideos with many text queries. Moreover, it leads us to a novel, video-centric\nsampling scheme for efficient training. Based on these findings, we present\nSnAG, a simple baseline for scalable and accurate video grounding. Without\nbells and whistles, SnAG is 43% more accurate and 1.5x faster than CONE, a\nstate of the art for long-form video grounding on the challenging MAD dataset,\nwhile achieving highly competitive results on short videos.\n","authors":["Fangzhou Mu","Sicheng Mo","Yin Li"],"pdf_url":"https://arxiv.org/pdf/2404.02257v1.pdf","comment":"Accepted to CVPR 2024. Code available at\n https://github.com/fmu2/snag_release"},{"id":"http://arxiv.org/abs/2309.07096v4","updated":"2024-04-02T19:12:46Z","published":"2023-08-23T12:37:13Z","title":"Computational limits to the legibility of the imaged human brain","summary":" Our knowledge of the organisation of the human brain at the population-level\nis yet to translate into power to predict functional differences at the\nindividual-level, limiting clinical applications, and casting doubt on the\ngeneralisability of inferred mechanisms. It remains unknown whether the\ndifficulty arises from the absence of individuating biological patterns within\nthe brain, or from limited power to access them with the models and compute at\nour disposal. Here we comprehensively investigate the resolvability of such\npatterns with data and compute at unprecedented scale. Across 23 810 unique\nparticipants from UK Biobank, we systematically evaluate the predictability of\n25 individual biological characteristics, from all available combinations of\nstructural and functional neuroimaging data. Over 4526 GPU hours of\ncomputation, we train, optimize, and evaluate out-of-sample 700 individual\npredictive models, including fully-connected feed-forward neural networks of\ndemographic, psychological, serological, chronic disease, and functional\nconnectivity characteristics, and both uni- and multi-modal 3D convolutional\nneural network models of macro- and micro-structural brain imaging. 
We find a\nmarked discrepancy between the high predictability of sex (balanced accuracy\n99.7%), age (mean absolute error 2.048 years, R2 0.859), and weight (mean\nabsolute error 2.609Kg, R2 0.625), for which we set new state-of-the-art\nperformance, and the surprisingly low predictability of other characteristics.\nNeither structural nor functional imaging predicted psychology better than the\ncoincidence of chronic disease (p<0.05). Serology predicted chronic disease\n(p<0.05) and was best predicted by it (p<0.001), followed by structural\nneuroimaging (p<0.05). Our findings suggest either more informative imaging or\nmore powerful models are needed to decipher individual level characteristics\nfrom the human brain.\n","authors":["James K Ruffle","Robert J Gray","Samia Mohinta","Guilherme Pombo","Chaitanya Kaul","Harpreet Hyare","Geraint Rees","Parashkev Nachev"],"pdf_url":"https://arxiv.org/pdf/2309.07096v4.pdf","comment":"38 pages, 6 figures, 1 table, 2 supplementary figures, 1\n supplementary table"},{"id":"http://arxiv.org/abs/2311.17024v2","updated":"2024-04-02T19:11:35Z","published":"2023-11-28T18:27:15Z","title":"Diffusion 3D Features (Diff3F): Decorating Untextured Shapes with\n Distilled Semantic Features","summary":" We present Diff3F as a simple, robust, and class-agnostic feature descriptor\nthat can be computed for untextured input shapes (meshes or point clouds). Our\nmethod distills diffusion features from image foundational models onto input\nshapes. Specifically, we use the input shapes to produce depth and normal maps\nas guidance for conditional image synthesis. In the process, we produce\n(diffusion) features in 2D that we subsequently lift and aggregate on the\noriginal surface. Our key observation is that even if the conditional image\ngenerations obtained from multi-view rendering of the input shapes are\ninconsistent, the associated image features are robust and, hence, can be\ndirectly aggregated across views. This produces semantic features on the input\nshapes, without requiring additional data or training. We perform extensive\nexperiments on multiple benchmarks (SHREC'19, SHREC'20, FAUST, and TOSCA) and\ndemonstrate that our features, being semantic instead of geometric, produce\nreliable correspondence across both isometric and non-isometrically related\nshape families. Code is available via the project page at\nhttps://diff3f.github.io/\n","authors":["Niladri Shekhar Dutt","Sanjeev Muralikrishnan","Niloy J. Mitra"],"pdf_url":"https://arxiv.org/pdf/2311.17024v2.pdf","comment":"Accepted at CVPR'24"},{"id":"http://arxiv.org/abs/2404.02242v1","updated":"2024-04-02T19:03:39Z","published":"2024-04-02T19:03:39Z","title":"Towards Robust 3D Pose Transfer with Adversarial Learning","summary":" 3D pose transfer that aims to transfer the desired pose to a target mesh is\none of the most challenging 3D generation tasks. Previous attempts rely on\nwell-defined parametric human models or skeletal joints as driving pose\nsources. However, to obtain those clean pose sources, cumbersome but necessary\npre-processing pipelines are inevitable, hindering implementations of the\nreal-time applications. This work is driven by the intuition that the\nrobustness of the model can be enhanced by introducing adversarial samples into\nthe training, leading to a more invulnerable model to the noisy inputs, which\neven can be further extended to directly handling the real-world data like raw\npoint clouds/scans without intermediate processing. 
Furthermore, we propose a\nnovel 3D pose Masked Autoencoder (3D-PoseMAE), a customized MAE that\neffectively learns 3D extrinsic presentations (i.e., pose). 3D-PoseMAE\nfacilitates learning from the aspect of extrinsic attributes by simultaneously\ngenerating adversarial samples that perturb the model and learning the\narbitrary raw noisy poses via a multi-scale masking strategy. Both qualitative\nand quantitative studies show that the transferred meshes given by our network\nresult in much better quality. Besides, we demonstrate the strong\ngeneralizability of our method on various poses, different domains, and even\nraw scans. Experimental results also show meaningful insights that the\nintermediate adversarial samples generated in the training can successfully\nattack the existing pose transfer models.\n","authors":["Haoyu Chen","Hao Tang","Ehsan Adeli","Guoying Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.02242v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02241v1","updated":"2024-04-02T18:59:39Z","published":"2024-04-02T18:59:39Z","title":"Linear Combination of Saved Checkpoints Makes Consistency and Diffusion\n Models Better","summary":" Diffusion Models (DM) and Consistency Models (CM) are two types of popular\ngenerative models with good generation quality on various tasks. When training\nDM and CM, intermediate weight checkpoints are not fully utilized and only the\nlast converged checkpoint is used. In this work, we find that high-quality\nmodel weights often lie in a basin which cannot be reached by SGD but can be\nobtained by proper checkpoint averaging. Based on these observations, we\npropose LCSC, a simple but effective and efficient method to enhance the\nperformance of DM and CM, by combining checkpoints along the training\ntrajectory with coefficients deduced from evolutionary search. We demonstrate\nthe value of LCSC through two use cases: $\\textbf{(a) Reducing training cost.}$\nWith LCSC, we only need to train DM/CM with fewer number of iterations and/or\nlower batch sizes to obtain comparable sample quality with the fully trained\nmodel. For example, LCSC achieves considerable training speedups for CM\n(23$\\times$ on CIFAR-10 and 15$\\times$ on ImageNet-64). $\\textbf{(b) Enhancing\npre-trained models.}$ Assuming full training is already done, LCSC can further\nimprove the generation quality or speed of the final converged models. For\nexample, LCSC achieves better performance using 1 number of function evaluation\n(NFE) than the base model with 2 NFE on consistency distillation, and decreases\nthe NFE of DM from 15 to 9 while maintaining the generation quality on\nCIFAR-10. Our code is available at\nhttps://github.com/imagination-research/LCSC.\n","authors":["Enshu Liu","Junyi Zhu","Zinan Lin","Xuefei Ning","Matthew B. Blaschko","Sergey Yekhanin","Shengen Yan","Guohao Dai","Huazhong Yang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14516v2","updated":"2024-04-02T18:57:12Z","published":"2023-09-25T20:22:47Z","title":"UniBEV: Multi-modal 3D Object Detection with Uniform BEV Encoders for\n Robustness against Missing Sensor Modalities","summary":" Multi-sensor object detection is an active research topic in automated\ndriving, but the robustness of such detection models against missing sensor\ninput (modality missing), e.g., due to a sudden sensor failure, is a critical\nproblem which remains under-studied. 
In this work, we propose UniBEV, an\nend-to-end multi-modal 3D object detection framework designed for robustness\nagainst missing modalities: UniBEV can operate on LiDAR plus camera input, but\nalso on LiDAR-only or camera-only input without retraining. To facilitate its\ndetector head to handle different input combinations, UniBEV aims to create\nwell-aligned Bird's Eye View (BEV) feature maps from each available modality.\nUnlike prior BEV-based multi-modal detection methods, all sensor modalities\nfollow a uniform approach to resample features from the native sensor\ncoordinate systems to the BEV features. We furthermore investigate the\nrobustness of various fusion strategies w.r.t. missing modalities: the commonly\nused feature concatenation, but also channel-wise averaging, and a\ngeneralization to weighted averaging termed Channel Normalized Weights. To\nvalidate its effectiveness, we compare UniBEV to state-of-the-art BEVFusion and\nMetaBEV on nuScenes over all sensor input combinations. In this setting, UniBEV\nachieves $52.5 \\%$ mAP on average over all input combinations, significantly\nimproving over the baselines ($43.5 \\%$ mAP on average for BEVFusion, $48.7 \\%$\nmAP on average for MetaBEV). An ablation study shows the robustness benefits of\nfusing by weighted averaging over regular concatenation, and of sharing queries\nbetween the BEV encoders of each modality. Our code will be released upon paper\nacceptance.\n","authors":["Shiming Wang","Holger Caesar","Liangliang Nan","Julian F. P. Kooij"],"pdf_url":"https://arxiv.org/pdf/2309.14516v2.pdf","comment":"Accepted by IEEE Intelligent Vehicles Symposium (IV 2024)"},{"id":"http://arxiv.org/abs/2401.10831v2","updated":"2024-04-02T18:54:50Z","published":"2024-01-19T17:27:21Z","title":"Understanding Video Transformers via Universal Concept Discovery","summary":" This paper studies the problem of concept-based interpretability of\ntransformer representations for videos. Concretely, we seek to explain the\ndecision-making process of video transformers based on high-level,\nspatiotemporal concepts that are automatically discovered. Prior research on\nconcept-based interpretability has concentrated solely on image-level tasks.\nComparatively, video models deal with the added temporal dimension, increasing\ncomplexity and posing challenges in identifying dynamic concepts over time. In\nthis work, we systematically address these challenges by introducing the first\nVideo Transformer Concept Discovery (VTCD) algorithm. To this end, we propose\nan efficient approach for unsupervised identification of units of video\ntransformer representations - concepts, and ranking their importance to the\noutput of a model. The resulting concepts are highly interpretable, revealing\nspatio-temporal reasoning mechanisms and object-centric representations in\nunstructured video models. Performing this analysis jointly over a diverse set\nof supervised and self-supervised representations, we discover that some of\nthese mechanism are universal in video transformers. Finally, we show that VTCD\ncan be used for fine-grained action recognition and video object segmentation.\n","authors":["Matthew Kowal","Achal Dave","Rares Ambrus","Adrien Gaidon","Konstantinos G. 
Derpanis","Pavel Tokmakov"],"pdf_url":"https://arxiv.org/pdf/2401.10831v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02233v1","updated":"2024-04-02T18:40:55Z","published":"2024-04-02T18:40:55Z","title":"Visual Concept Connectome (VCC): Open World Concept Discovery and their\n Interlayer Connections in Deep Models","summary":" Understanding what deep network models capture in their learned\nrepresentations is a fundamental challenge in computer vision. We present a new\nmethodology to understanding such vision models, the Visual Concept Connectome\n(VCC), which discovers human interpretable concepts and their interlayer\nconnections in a fully unsupervised manner. Our approach simultaneously reveals\nfine-grained concepts at a layer, connection weightings across all layers and\nis amendable to global analysis of network structure (e.g., branching pattern\nof hierarchical concept assemblies). Previous work yielded ways to extract\ninterpretable concepts from single layers and examine their impact on\nclassification, but did not afford multilayer concept analysis across an entire\nnetwork architecture. Quantitative and qualitative empirical results show the\neffectiveness of VCCs in the domain of image classification. Also, we leverage\nVCCs for the application of failure mode debugging to reveal where mistakes\narise in deep networks.\n","authors":["Matthew Kowal","Richard P. Wildes","Konstantinos G. Derpanis"],"pdf_url":"https://arxiv.org/pdf/2404.02233v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02227v1","updated":"2024-04-02T18:30:29Z","published":"2024-04-02T18:30:29Z","title":"OOSTraj: Out-of-Sight Trajectory Prediction With Vision-Positioning\n Denoising","summary":" Trajectory prediction is fundamental in computer vision and autonomous\ndriving, particularly for understanding pedestrian behavior and enabling\nproactive decision-making. Existing approaches in this field often assume\nprecise and complete observational data, neglecting the challenges associated\nwith out-of-view objects and the noise inherent in sensor data due to limited\ncamera range, physical obstructions, and the absence of ground truth for\ndenoised sensor data. Such oversights are critical safety concerns, as they can\nresult in missing essential, non-visible objects. To bridge this gap, we\npresent a novel method for out-of-sight trajectory prediction that leverages a\nvision-positioning technique. Our approach denoises noisy sensor observations\nin an unsupervised manner and precisely maps sensor-based trajectories of\nout-of-sight objects into visual trajectories. This method has demonstrated\nstate-of-the-art performance in out-of-sight noisy sensor trajectory denoising\nand prediction on the Vi-Fi and JRDB datasets. By enhancing trajectory\nprediction accuracy and addressing the challenges of out-of-sight objects, our\nwork significantly contributes to improving the safety and reliability of\nautonomous driving in complex environments. Our work represents the first\ninitiative towards Out-Of-Sight Trajectory prediction (OOSTraj), setting a new\nbenchmark for future research. 
The code is available at\n\\url{https://github.com/Hai-chao-Zhang/OOSTraj}.\n","authors":["Haichao Zhang","Yi Xu","Hongsheng Lu","Takayuki Shimizu","Yun Fu"],"pdf_url":"https://arxiv.org/pdf/2404.02227v1.pdf","comment":"In Proceedings of IEEE/CVF Conference on Computer Vision and Pattern\n Recognition 2024 (CVPR)"},{"id":"http://arxiv.org/abs/2404.02225v1","updated":"2024-04-02T18:27:03Z","published":"2024-04-02T18:27:03Z","title":"CHOSEN: Contrastive Hypothesis Selection for Multi-View Depth Refinement","summary":" We propose CHOSEN, a simple yet flexible, robust and effective multi-view\ndepth refinement framework. It can be employed in any existing multi-view\nstereo pipeline, with straightforward generalization capability for different\nmulti-view capture systems such as camera relative positioning and lenses.\nGiven an initial depth estimation, CHOSEN iteratively re-samples and selects\nthe best hypotheses, and automatically adapts to different metric or intrinsic\nscales determined by the capture system. The key to our approach is the\napplication of contrastive learning in an appropriate solution space and a\ncarefully designed hypothesis feature, based on which positive and negative\nhypotheses can be effectively distinguished. Integrated in a simple baseline\nmulti-view stereo pipeline, CHOSEN delivers impressive quality in terms of\ndepth and normal accuracy compared to many current deep learning based\nmulti-view stereo pipelines.\n","authors":["Di Qiu","Yinda Zhang","Thabo Beeler","Vladimir Tankovich","Christian Häne","Sean Fanello","Christoph Rhemann","Sergio Orts Escolano"],"pdf_url":"https://arxiv.org/pdf/2404.02225v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03835v2","updated":"2024-04-02T18:26:12Z","published":"2024-01-08T11:46:45Z","title":"Limitations of Data-Driven Spectral Reconstruction -- Optics-Aware\n Analysis and Mitigation","summary":" Hyperspectral imaging empowers machine vision systems with the distinct\ncapability of identifying materials through recording their spectral\nsignatures. Recent efforts in data-driven spectral reconstruction aim at\nextracting spectral information from RGB images captured by cost-effective RGB\ncameras, instead of dedicated hardware.\n In this paper we systematically analyze the performance of such methods,\nevaluating both the practical limitations with respect to current datasets and\noverfitting, as well as fundamental limitations with respect to the nature of\nthe information encoded in the RGB images, and the dependency of this\ninformation on the optical system of the camera.\n We find that, the current models are not robust under slight variations,\ne.g., in noise level or compression of the RGB file. Without modeling\nunderrepresented spectral content, existing datasets and the models trained on\nthem are limited in their ability to cope with challenging metameric colors. 
To\nmitigate this issue, we propose to exploit the combination of metameric data\naugmentation and optical lens aberrations to improve the encoding of the\nmetameric information into the RGB image, which paves the road towards higher\nperforming spectral imaging and reconstruction approaches.\n","authors":["Qiang Fu","Matheus Souza","Eunsue Choi","Suhyun Shin","Seung-Hwan Baek","Wolfgang Heidrich"],"pdf_url":"https://arxiv.org/pdf/2401.03835v2.pdf","comment":"12 pages, 7 figures, 8 tables"},{"id":"http://arxiv.org/abs/2312.03587v2","updated":"2024-04-02T18:14:35Z","published":"2023-12-06T16:24:47Z","title":"Language-Informed Visual Concept Learning","summary":" Our understanding of the visual world is centered around various concept\naxes, characterizing different aspects of visual entities. While different\nconcept axes can be easily specified by language, e.g. color, the exact visual\nnuances along each axis often exceed the limitations of linguistic\narticulations, e.g. a particular style of painting. In this work, our goal is\nto learn a language-informed visual concept representation, by simply\ndistilling large pre-trained vision-language models. Specifically, we train a\nset of concept encoders to encode the information pertinent to a set of\nlanguage-informed concept axes, with an objective of reproducing the input\nimage through a pre-trained Text-to-Image (T2I) model. To encourage better\ndisentanglement of different concept encoders, we anchor the concept embeddings\nto a set of text embeddings obtained from a pre-trained Visual Question\nAnswering (VQA) model. At inference time, the model extracts concept embeddings\nalong various axes from new test images, which can be remixed to generate\nimages with novel compositions of visual concepts. With a lightweight test-time\nfinetuning procedure, it can also generalize to novel concepts unseen at\ntraining.\n","authors":["Sharon Lee","Yunzhi Zhang","Shangzhe Wu","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2312.03587v2.pdf","comment":"ICLR 2024. The first two authors contributed equally and are\n alphabetically ordered. Project page:\n https://ai.stanford.edu/~yzzhang/projects/concept-axes/"},{"id":"http://arxiv.org/abs/2312.12433v3","updated":"2024-04-02T18:09:22Z","published":"2023-12-19T18:58:40Z","title":"TAO-Amodal: A Benchmark for Tracking Any Object Amodally","summary":" Amodal perception, the ability to comprehend complete object structures from\npartial visibility, is a fundamental skill, even for infants. Its significance\nextends to applications like autonomous driving, where a clear understanding of\nheavily occluded objects is essential. However, modern detection and tracking\nalgorithms often overlook this critical capability, perhaps due to the\nprevalence of \\textit{modal} annotations in most benchmarks. To address the\nscarcity of amodal benchmarks, we introduce TAO-Amodal, featuring 833 diverse\ncategories in thousands of video sequences. Our dataset includes\n\\textit{amodal} and modal bounding boxes for visible and partially or fully\noccluded objects, including those that are partially out of the camera frame.\nWe investigate the current lay of the land in both amodal tracking and\ndetection by benchmarking state-of-the-art modal trackers and amodal\nsegmentation methods. We find that existing methods, even when adapted for\namodal tracking, struggle to detect and track objects under heavy occlusion. 
To\nmitigate this, we explore simple finetuning schemes that can increase the\namodal tracking and detection metrics of occluded objects by 2.1\\% and 3.3\\%.\n","authors":["Cheng-Yen Hsieh","Kaihua Chen","Achal Dave","Tarasha Khurana","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2312.12433v3.pdf","comment":"Project Page: https://tao-amodal.github.io"},{"id":"http://arxiv.org/abs/2311.18832v2","updated":"2024-04-02T17:59:33Z","published":"2023-11-30T18:59:44Z","title":"Exploiting Diffusion Prior for Generalizable Dense Prediction","summary":" Contents generated by recent advanced Text-to-Image (T2I) diffusion models\nare sometimes too imaginative for existing off-the-shelf dense predictors to\nestimate due to the immitigable domain gap. We introduce DMP, a pipeline\nutilizing pre-trained T2I models as a prior for dense prediction tasks. To\naddress the misalignment between deterministic prediction tasks and stochastic\nT2I models, we reformulate the diffusion process through a sequence of\ninterpolations, establishing a deterministic mapping between input RGB images\nand output prediction distributions. To preserve generalizability, we use\nlow-rank adaptation to fine-tune pre-trained models. Extensive experiments\nacross five tasks, including 3D property estimation, semantic segmentation, and\nintrinsic image decomposition, showcase the efficacy of the proposed method.\nDespite limited-domain training data, the approach yields faithful estimations\nfor arbitrary images, surpassing existing state-of-the-art algorithms.\n","authors":["Hsin-Ying Lee","Hung-Yu Tseng","Hsin-Ying Lee","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2311.18832v2.pdf","comment":"To appear in CVPR 2024. Project page: https://shinying.github.io/dmp"},{"id":"http://arxiv.org/abs/2404.02189v1","updated":"2024-04-02T16:48:34Z","published":"2024-04-02T16:48:34Z","title":"Insights from the Use of Previously Unseen Neural Architecture Search\n Datasets","summary":" The boundless possibility of neural networks which can be used to solve a\nproblem -- each with different performance -- leads to a situation where a Deep\nLearning expert is required to identify the best neural network. This goes\nagainst the hope of removing the need for experts. Neural Architecture Search\n(NAS) offers a solution to this by automatically identifying the best\narchitecture. However, to date, NAS work has focused on a small set of datasets\nwhich we argue are not representative of real-world problems. We introduce\neight new datasets created for a series of NAS Challenges: AddNIST, Language,\nMultNIST, CIFARTile, Gutenberg, Isabella, GeoClassing, and Chesseract. These\ndatasets and challenges are developed to direct attention to issues in NAS\ndevelopment and to encourage authors to consider how their models will perform\non datasets unknown to them at development time. We present experimentation\nusing standard Deep Learning methods as well as the best results from challenge\nparticipants.\n","authors":["Rob Geada","David Towers","Matthew Forshaw","Amir Atapour-Abarghouei","A. Stephen McGough"],"pdf_url":"https://arxiv.org/pdf/2404.02189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02185v1","updated":"2024-04-02T15:49:00Z","published":"2024-04-02T15:49:00Z","title":"NeRFCodec: Neural Feature Compression Meets Neural Radiance Fields for\n Memory-Efficient Scene Representation","summary":" The emergence of Neural Radiance Fields (NeRF) has greatly impacted 3D scene\nmodeling and novel-view synthesis. 
As a kind of visual media for 3D scene\nrepresentation, compression with high rate-distortion performance is an eternal\ntarget. Motivated by advances in neural compression and neural field\nrepresentation, we propose NeRFCodec, an end-to-end NeRF compression framework\nthat integrates non-linear transform, quantization, and entropy coding for\nmemory-efficient scene representation. Since training a non-linear transform\ndirectly on a large scale of NeRF feature planes is impractical, we discover\nthat pre-trained neural 2D image codec can be utilized for compressing the\nfeatures when adding content-specific parameters. Specifically, we reuse neural\n2D image codec but modify its encoder and decoder heads, while keeping the\nother parts of the pre-trained decoder frozen. This allows us to train the full\npipeline via supervision of rendering loss and entropy loss, yielding the\nrate-distortion balance by updating the content-specific parameters. At test\ntime, the bitstreams containing latent code, feature decoder head, and other\nside information are transmitted for communication. Experimental results\ndemonstrate our method outperforms existing NeRF compression methods, enabling\nhigh-quality novel view synthesis with a memory budget of 0.5 MB.\n","authors":["Sicheng Li","Hao Li","Yiyi Liao","Lu Yu"],"pdf_url":"https://arxiv.org/pdf/2404.02185v1.pdf","comment":"Accepted at CVPR2024. The source code will be released"},{"id":"http://arxiv.org/abs/2307.11957v5","updated":"2024-04-02T12:25:10Z","published":"2023-07-22T01:56:58Z","title":"High-performance real-world optical computing trained by in situ\n model-free optimization","summary":" Optical computing systems provide high-speed and low-energy data processing\nbut face deficiencies in computationally demanding training and\nsimulation-to-reality gaps. We propose a gradient-based model-free optimization\n(G-MFO) method based on a Monte Carlo gradient estimation algorithm for\ncomputationally efficient in situ training of optical computing systems. This\napproach treats an optical computing system as a black box and back-propagates\nthe loss directly to the optical computing weights' probability distributions,\ncircumventing the need for a computationally heavy and biased system\nsimulation. Our experiments on diffractive optical computing systems show that\nG-MFO outperforms hybrid training on the MNIST and FMNIST datasets.\nFurthermore, we demonstrate image-free and high-speed classification of cells\nfrom their marker-free phase maps. Our method's model-free and high-performance\nnature, combined with its low demand for computational resources, paves the way\nfor accelerating the transition of optical computing from laboratory\ndemonstrations to practical, real-world applications.\n","authors":["Guangyuan Zhao","Xin Shu","Renjie Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.11957v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01878v1","updated":"2024-04-02T12:08:26Z","published":"2024-04-02T12:08:26Z","title":"Real, fake and synthetic faces -- does the coin have three sides?","summary":" With the ever-growing power of generative artificial intelligence, deepfake\nand artificially generated (synthetic) media have continued to spread online,\nwhich creates various ethical and moral concerns regarding their usage. To\ntackle this, we thus present a novel exploration of the trends and patterns\nobserved in real, deepfake and synthetic facial images. 
The proposed analysis\nis done in two parts: firstly, we incorporate eight deep learning models and\nanalyze their performances in distinguishing between the three classes of\nimages. Next, we look to further delve into the similarities and differences\nbetween these three sets of images by investigating their image properties both\nin the context of the entire image as well as in the context of specific\nregions within the image. ANOVA test was also performed and provided further\nclarity amongst the patterns associated between the images of the three\nclasses. From our findings, we observe that the investigated deeplearning\nmodels found it easier to detect synthetic facial images, with the ViT Patch-16\nmodel performing best on this task with a class-averaged sensitivity,\nspecificity, precision, and accuracy of 97.37%, 98.69%, 97.48%, and 98.25%,\nrespectively. This observation was supported by further analysis of various\nimage properties. We saw noticeable differences across the three category of\nimages. This analysis can help us build better algorithms for facial image\ngeneration, and also shows that synthetic, deepfake and real face images are\nindeed three different classes.\n","authors":["Shahzeb Naeem","Ramzi Al-Sharawi","Muhammad Riyyan Khan","Usman Tariq","Abhinav Dhall","Hasan Al-Nashash"],"pdf_url":"https://arxiv.org/pdf/2404.01878v1.pdf","comment":null}]},"2024-04-03T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.02905v1","updated":"2024-04-03T17:59:53Z","published":"2024-04-03T17:59:53Z","title":"Visual Autoregressive Modeling: Scalable Image Generation via Next-Scale\n Prediction","summary":" We present Visual AutoRegressive modeling (VAR), a new generation paradigm\nthat redefines the autoregressive learning on images as coarse-to-fine\n\"next-scale prediction\" or \"next-resolution prediction\", diverging from the\nstandard raster-scan \"next-token prediction\". This simple, intuitive\nmethodology allows autoregressive (AR) transformers to learn visual\ndistributions fast and generalize well: VAR, for the first time, makes AR\nmodels surpass diffusion transformers in image generation. On ImageNet 256x256\nbenchmark, VAR significantly improve AR baseline by improving Frechet inception\ndistance (FID) from 18.65 to 1.80, inception score (IS) from 80.4 to 356.4,\nwith around 20x faster inference speed. It is also empirically verified that\nVAR outperforms the Diffusion Transformer (DiT) in multiple dimensions\nincluding image quality, inference speed, data efficiency, and scalability.\nScaling up VAR models exhibits clear power-law scaling laws similar to those\nobserved in LLMs, with linear correlation coefficients near -0.998 as solid\nevidence. VAR further showcases zero-shot generalization ability in downstream\ntasks including image in-painting, out-painting, and editing. These results\nsuggest VAR has initially emulated the two important properties of LLMs:\nScaling Laws and zero-shot task generalization. 
We have released all models and\ncodes to promote the exploration of AR/VAR models for visual generation and\nunified learning.\n","authors":["Keyu Tian","Yi Jiang","Zehuan Yuan","Bingyue Peng","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02904v1","updated":"2024-04-03T17:59:36Z","published":"2024-04-03T17:59:36Z","title":"ALOHa: A New Measure for Hallucination in Captioning Models","summary":" Despite recent advances in multimodal pre-training for visual description,\nstate-of-the-art models still produce captions containing errors, such as\nhallucinating objects not present in a scene. The existing prominent metric for\nobject hallucination, CHAIR, is limited to a fixed set of MS COCO objects and\nsynonyms. In this work, we propose a modernized open-vocabulary metric, ALOHa,\nwhich leverages large language models (LLMs) to measure object hallucinations.\nSpecifically, we use an LLM to extract groundable objects from a candidate\ncaption, measure their semantic similarity to reference objects from captions\nand object detections, and use Hungarian matching to produce a final\nhallucination score. We show that ALOHa correctly identifies 13.6% more\nhallucinated objects than CHAIR on HAT, a new gold-standard subset of MS COCO\nCaptions annotated for hallucinations, and 30.8% more on nocaps, where objects\nextend beyond MS COCO categories. Our code is available at\nhttps://davidmchan.github.io/aloha/.\n","authors":["Suzanne Petryk","David M. Chan","Anish Kachinthaya","Haodi Zou","John Canny","Joseph E. Gonzalez","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2404.02904v1.pdf","comment":"To appear at NAACL 2024"},{"id":"http://arxiv.org/abs/2404.02903v1","updated":"2024-04-03T17:59:28Z","published":"2024-04-03T17:59:28Z","title":"LidarDM: Generative LiDAR Simulation in a Generated World","summary":" We present LidarDM, a novel LiDAR generative model capable of producing\nrealistic, layout-aware, physically plausible, and temporally coherent LiDAR\nvideos. LidarDM stands out with two unprecedented capabilities in LiDAR\ngenerative modeling: (i) LiDAR generation guided by driving scenarios, offering\nsignificant potential for autonomous driving simulations, and (ii) 4D LiDAR\npoint cloud generation, enabling the creation of realistic and temporally\ncoherent sequences. At the heart of our model is a novel integrated 4D world\ngeneration framework. Specifically, we employ latent diffusion models to\ngenerate the 3D scene, combine it with dynamic actors to form the underlying 4D\nworld, and subsequently produce realistic sensory observations within this\nvirtual environment. Our experiments indicate that our approach outperforms\ncompeting algorithms in realism, temporal coherency, and layout consistency. We\nadditionally show that LidarDM can be used as a generative world model\nsimulator for training and testing perception models.\n","authors":["Vlas Zyrianov","Henry Che","Zhijian Liu","Shenlong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02900v1","updated":"2024-04-03T17:58:21Z","published":"2024-04-03T17:58:21Z","title":"DeiT-LT Distillation Strikes Back for Vision Transformer Training on\n Long-Tailed Datasets","summary":" Vision Transformer (ViT) has emerged as a prominent architecture for various\ncomputer vision tasks. In ViT, we divide the input image into patch tokens and\nprocess them through a stack of self attention blocks. 
However, unlike\nConvolutional Neural Networks (CNN), ViTs simple architecture has no\ninformative inductive bias (e.g., locality,etc. ). Due to this, ViT requires a\nlarge amount of data for pre-training. Various data efficient approaches (DeiT)\nhave been proposed to train ViT on balanced datasets effectively. However,\nlimited literature discusses the use of ViT for datasets with long-tailed\nimbalances. In this work, we introduce DeiT-LT to tackle the problem of\ntraining ViTs from scratch on long-tailed datasets. In DeiT-LT, we introduce an\nefficient and effective way of distillation from CNN via distillation DIST\ntoken by using out-of-distribution images and re-weighting the distillation\nloss to enhance focus on tail classes. This leads to the learning of local\nCNN-like features in early ViT blocks, improving generalization for tail\nclasses. Further, to mitigate overfitting, we propose distilling from a flat\nCNN teacher, which leads to learning low-rank generalizable features for DIST\ntokens across all ViT blocks. With the proposed DeiT-LT scheme, the\ndistillation DIST token becomes an expert on the tail classes, and the\nclassifier CLS token becomes an expert on the head classes. The experts help to\neffectively learn features corresponding to both the majority and minority\nclasses using a distinct set of tokens within the same ViT architecture. We\nshow the effectiveness of DeiT-LT for training ViT from scratch on datasets\nranging from small-scale CIFAR-10 LT to large-scale iNaturalist-2018.\n","authors":["Harsh Rangwani","Pradipto Mondal","Mayank Mishra","Ashish Ramayee Asokan","R. Venkatesh Babu"],"pdf_url":"https://arxiv.org/pdf/2404.02900v1.pdf","comment":"CVPR 2024. Project Page: https://rangwani-harsh.github.io/DeiT-LT"},{"id":"http://arxiv.org/abs/2312.00947v2","updated":"2024-04-03T17:58:13Z","published":"2023-12-01T22:00:14Z","title":"FreeZe: Training-free zero-shot 6D pose estimation with geometric and\n vision foundation models","summary":" Estimating the 6D pose of objects unseen during training is highly desirable\nyet challenging. Zero-shot object 6D pose estimation methods address this\nchallenge by leveraging additional task-specific supervision provided by\nlarge-scale, photo-realistic synthetic datasets. However, their performance\nheavily depends on the quality and diversity of rendered data and they require\nextensive training. In this work, we show how to tackle the same task but\nwithout training on specific data. We propose FreeZe, a novel solution that\nharnesses the capabilities of pre-trained geometric and vision foundation\nmodels. FreeZe leverages 3D geometric descriptors learned from unrelated 3D\npoint clouds and 2D visual features learned from web-scale 2D images to\ngenerate discriminative 3D point-level descriptors. We then estimate the 6D\npose of unseen objects by 3D registration based on RANSAC. We also introduce a\nnovel algorithm to solve ambiguous cases due to geometrically symmetric objects\nthat is based on visual features. We comprehensively evaluate FreeZe across the\nseven core datasets of the BOP Benchmark, which include over a hundred 3D\nobjects and 20,000 images captured in various scenarios. FreeZe consistently\noutperforms all state-of-the-art approaches, including competitors extensively\ntrained on synthetic 6D pose estimation data. 
Code will be publicly available\nat https://andreacaraffa.github.io/freeze.\n","authors":["Andrea Caraffa","Davide Boscaini","Amir Hamza","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2312.00947v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02899v1","updated":"2024-04-03T17:57:15Z","published":"2024-04-03T17:57:15Z","title":"MatAtlas: Text-driven Consistent Geometry Texturing and Material\n Assignment","summary":" We present MatAtlas, a method for consistent text-guided 3D model texturing.\nFollowing recent progress we leverage a large scale text-to-image generation\nmodel (e.g., Stable Diffusion) as a prior to texture a 3D model. We carefully\ndesign an RGB texturing pipeline that leverages a grid pattern diffusion,\ndriven by depth and edges. By proposing a multi-step texture refinement\nprocess, we significantly improve the quality and 3D consistency of the\ntexturing output. To further address the problem of baked-in lighting, we move\nbeyond RGB colors and pursue assigning parametric materials to the assets.\nGiven the high-quality initial RGB texture, we propose a novel material\nretrieval method capitalized on Large Language Models (LLM), enabling\neditabiliy and relightability. We evaluate our method on a wide variety of\ngeometries and show that our method significantly outperform prior arts. We\nalso analyze the role of each component through a detailed ablation study.\n","authors":["Duygu Ceylan","Valentin Deschaintre","Thibault Groueix","Rosalie Martin","Chun-Hao Huang","Romain Rouffet","Vladimir Kim","Gaëtan Lassagne"],"pdf_url":"https://arxiv.org/pdf/2404.02899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02897v1","updated":"2024-04-03T17:54:37Z","published":"2024-04-03T17:54:37Z","title":"Deep Image Composition Meets Image Forgery","summary":" Image forgery is a topic that has been studied for many years. Before the\nbreakthrough of deep learning, forged images were detected using handcrafted\nfeatures that did not require training. These traditional methods failed to\nperform satisfactorily even on datasets much worse in quality than real-life\nimage manipulations. Advances in deep learning have impacted image forgery\ndetection as much as they have impacted other areas of computer vision and have\nimproved the state of the art. Deep learning models require large amounts of\nlabeled data for training. In the case of image forgery, labeled data at the\npixel level is a very important factor for the models to learn. None of the\nexisting datasets have sufficient size, realism and pixel-level labeling at the\nsame time. This is due to the high cost of producing and labeling quality\nimages. It can take hours for an image editing expert to manipulate just one\nimage. To bridge this gap, we automate data generation using image composition\ntechniques that are very related to image forgery. Unlike other automated data\ngeneration frameworks, we use state of the art image composition deep learning\nmodels to generate spliced images close to the quality of real-life\nmanipulations. Finally, we test the generated dataset on the SOTA image\nmanipulation detection model and show that its prediction performance is lower\ncompared to existing datasets, i.e. we produce realistic images that are more\ndifficult to detect. 
Dataset will be available at\nhttps://github.com/99eren99/DIS25k .\n","authors":["Eren Tahir","Mert Bal"],"pdf_url":"https://arxiv.org/pdf/2404.02897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02889v1","updated":"2024-04-03T17:44:02Z","published":"2024-04-03T17:44:02Z","title":"Steganographic Passport: An Owner and User Verifiable Credential for\n Deep Model IP Protection Without Retraining","summary":" Ensuring the legal usage of deep models is crucial to promoting trustable,\naccountable, and responsible artificial intelligence innovation. Current\npassport-based methods that obfuscate model functionality for license-to-use\nand ownership verifications suffer from capacity and quality constraints, as\nthey require retraining the owner model for new users. They are also vulnerable\nto advanced Expanded Residual Block ambiguity attacks. We propose\nSteganographic Passport, which uses an invertible steganographic network to\ndecouple license-to-use from ownership verification by hiding the user's\nidentity images into the owner-side passport and recovering them from their\nrespective user-side passports. An irreversible and collision-resistant hash\nfunction is used to avoid exposing the owner-side passport from the derived\nuser-side passports and increase the uniqueness of the model signature. To\nsafeguard both the passport and model's weights against advanced ambiguity\nattacks, an activation-level obfuscation is proposed for the verification\nbranch of the owner's model. By jointly training the verification and\ndeployment branches, their weights become tightly coupled. The proposed method\nsupports agile licensing of deep models by providing a strong ownership proof\nand license accountability without requiring a separate model retraining for\nthe admission of every new user. Experiment results show that our\nSteganographic Passport outperforms other passport-based deep model protection\nmethods in robustness against various known attacks.\n","authors":["Qi Cui","Ruohan Meng","Chaohui Xu","Chip-Hong Chang"],"pdf_url":"https://arxiv.org/pdf/2404.02889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14740v2","updated":"2024-04-03T17:42:44Z","published":"2023-08-28T17:41:14Z","title":"Total Selfie: Generating Full-Body Selfies","summary":" We present a method to generate full-body selfies from photographs originally\ntaken at arms length. Because self-captured photos are typically taken close\nup, they have limited field of view and exaggerated perspective that distorts\nfacial shapes. We instead seek to generate the photo some one else would take\nof you from a few feet away. Our approach takes as input four selfies of your\nface and body, a background image, and generates a full-body selfie in a\ndesired target pose. We introduce a novel diffusion-based approach to combine\nall of this information into high-quality, well-composed photos of you with the\ndesired pose and background.\n","authors":["Bowei Chen","Brian Curless","Ira Kemelmacher-Shlizerman","Steven M. 
Seitz"],"pdf_url":"https://arxiv.org/pdf/2308.14740v2.pdf","comment":"Project page:\n https://homes.cs.washington.edu/~boweiche/project_page/totalselfie/"},{"id":"http://arxiv.org/abs/2403.00939v3","updated":"2024-04-03T17:42:11Z","published":"2024-03-01T19:36:11Z","title":"G3DR: Generative 3D Reconstruction in ImageNet","summary":" We introduce a novel 3D generative method, Generative 3D Reconstruction\n(G3DR) in ImageNet, capable of generating diverse and high-quality 3D objects\nfrom single images, addressing the limitations of existing methods. At the\nheart of our framework is a novel depth regularization technique that enables\nthe generation of scenes with high-geometric fidelity. G3DR also leverages a\npretrained language-vision model, such as CLIP, to enable reconstruction in\nnovel views and improve the visual realism of generations. Additionally, G3DR\ndesigns a simple but effective sampling procedure to further improve the\nquality of generations. G3DR offers diverse and efficient 3D asset generation\nbased on class or text conditioning. Despite its simplicity, G3DR is able to\nbeat state-of-theart methods, improving over them by up to 22% in perceptual\nmetrics and 90% in geometry scores, while needing only half of the training\ntime. Code is available at https://github.com/preddy5/G3DR\n","authors":["Pradyumna Reddy","Ismail Elezi","Jiankang Deng"],"pdf_url":"https://arxiv.org/pdf/2403.00939v3.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02885v1","updated":"2024-04-03T17:38:15Z","published":"2024-04-03T17:38:15Z","title":"PoCo: Point Context Cluster for RGBD Indoor Place Recognition","summary":" We present a novel end-to-end algorithm (PoCo) for the indoor RGB-D place\nrecognition task, aimed at identifying the most likely match for a given query\nframe within a reference database. The task presents inherent challenges\nattributed to the constrained field of view and limited range of perception\nsensors. We propose a new network architecture, which generalizes the recent\nContext of Clusters (CoCs) to extract global descriptors directly from the\nnoisy point clouds through end-to-end learning. Moreover, we develop the\narchitecture by integrating both color and geometric modalities into the point\nfeatures to enhance the global descriptor representation. We conducted\nevaluations on public datasets ScanNet-PR and ARKit with 807 and 5047\nscenarios, respectively. PoCo achieves SOTA performance: on ScanNet-PR, we\nachieve R@1 of 64.63%, a 5.7% improvement from the best-published result CGis\n(61.12%); on Arkit, we achieve R@1 of 45.12%, a 13.3% improvement from the\nbest-published result CGis (39.82%). In addition, PoCo shows higher efficiency\nthan CGis in inference time (1.75X-faster), and we demonstrate the\neffectiveness of PoCo in recognizing places within a real-world laboratory\nenvironment.\n","authors":["Jing Liang","Zhuo Deng","Zheming Zhou","Omid Ghasemalizadeh","Dinesh Manocha","Min Sun","Cheng-Hao Kuo","Arnie Sen"],"pdf_url":"https://arxiv.org/pdf/2404.02885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02883v1","updated":"2024-04-03T17:34:28Z","published":"2024-04-03T17:34:28Z","title":"On the Scalability of Diffusion-based Text-to-Image Generation","summary":" Scaling up model and data size has been quite successful for the evolution of\nLLMs. However, the scaling law for the diffusion based text-to-image (T2I)\nmodels is not fully explored. 
It is also unclear how to efficiently scale the\nmodel for better performance at reduced cost. The different training settings\nand expensive training cost make a fair model comparison extremely difficult.\nIn this work, we empirically study the scaling properties of diffusion based\nT2I models by performing extensive and rigours ablations on scaling both\ndenoising backbones and training set, including training scaled UNet and\nTransformer variants ranging from 0.4B to 4B parameters on datasets upto 600M\nimages. For model scaling, we find the location and amount of cross attention\ndistinguishes the performance of existing UNet designs. And increasing the\ntransformer blocks is more parameter-efficient for improving text-image\nalignment than increasing channel numbers. We then identify an efficient UNet\nvariant, which is 45% smaller and 28% faster than SDXL's UNet. On the data\nscaling side, we show the quality and diversity of the training set matters\nmore than simply dataset size. Increasing caption density and diversity\nimproves text-image alignment performance and the learning efficiency. Finally,\nwe provide scaling functions to predict the text-image alignment performance as\nfunctions of the scale of model size, compute and dataset size.\n","authors":["Hao Li","Yang Zou","Ying Wang","Orchid Majumder","Yusheng Xie","R. Manmatha","Ashwin Swaminathan","Zhuowen Tu","Stefano Ermon","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2404.02883v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.02877v1","updated":"2024-04-03T17:24:27Z","published":"2024-04-03T17:24:27Z","title":"FlightScope: A Deep Comprehensive Assessment of Aircraft Detection\n Algorithms in Satellite Imagery","summary":" Object detection in remotely sensed satellite pictures is fundamental in many\nfields such as biophysical, and environmental monitoring. While deep learning\nalgorithms are constantly evolving, they have been mostly implemented and\ntested on popular ground-based taken photos. This paper critically evaluates\nand compares a suite of advanced object detection algorithms customized for the\ntask of identifying aircraft within satellite imagery. Using the large\nHRPlanesV2 dataset, together with a rigorous validation with the GDIT dataset,\nthis research encompasses an array of methodologies including YOLO versions 5\nand 8, Faster RCNN, CenterNet, RetinaNet, RTMDet, and DETR, all trained from\nscratch. This exhaustive training and validation study reveal YOLOv5 as the\npreeminent model for the specific case of identifying airplanes from remote\nsensing data, showcasing high precision and adaptability across diverse imaging\nconditions. This research highlight the nuanced performance landscapes of these\nalgorithms, with YOLOv5 emerging as a robust solution for aerial object\ndetection, underlining its importance through superior mean average precision,\nRecall, and Intersection over Union scores. The findings described here\nunderscore the fundamental role of algorithm selection aligned with the\nspecific demands of satellite imagery analysis and extend a comprehensive\nframework to evaluate model efficacy. 
The benchmark toolkit and codes,\navailable via https://github.com/toelt-llc/FlightScope_Bench, aims to further\nexploration and innovation in the realm of remote sensing object detection,\npaving the way for improved analytical methodologies in satellite imagery\napplications.\n","authors":["Safouane El Ghazouali","Arnaud Gucciardi","Nicola Venturi","Michael Rueegsegger","Umberto Michelucci"],"pdf_url":"https://arxiv.org/pdf/2404.02877v1.pdf","comment":"15 figures, 4 tables, comprehensive survey, comparative study"},{"id":"http://arxiv.org/abs/2403.18346v3","updated":"2024-04-03T17:18:51Z","published":"2024-03-27T08:38:49Z","title":"Quantifying and Mitigating Unimodal Biases in Multimodal Large Language\n Models: A Causal Perspective","summary":" Recent advancements in Large Language Models (LLMs) have facilitated the\ndevelopment of Multimodal LLMs (MLLMs). Despite their impressive capabilities,\nMLLMs often suffer from an over-reliance on unimodal biases (e.g., language\nbias and vision bias), leading to incorrect answers in complex multimodal\ntasks. To investigate this issue, we propose a causal framework to interpret\nthe biases in Visual Question Answering (VQA) problems. Within our framework,\nwe devise a causal graph to elucidate the predictions of MLLMs on VQA problems,\nand assess the causal effect of biases through an in-depth causal analysis.\nMotivated by the causal graph, we introduce a novel MORE dataset, consisting of\n12,000 VQA instances. This dataset is designed to challenge MLLMs' abilities,\nnecessitating multi-hop reasoning and the surmounting of unimodal biases.\nFurthermore, we propose two strategies to mitigate unimodal biases and enhance\nMLLMs' reasoning capabilities, including a Decompose-Verify-Answer (DeVA)\nframework for limited-access MLLMs and the refinement of open-source MLLMs\nthrough fine-tuning. Extensive quantitative and qualitative experiments offer\nvaluable insights for future research. Our project page is at\nhttps://opencausalab.github.io/MORE.\n","authors":["Meiqi Chen","Yixin Cao","Yan Zhang","Chaochao Lu"],"pdf_url":"https://arxiv.org/pdf/2403.18346v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11782v2","updated":"2024-04-03T16:57:35Z","published":"2023-12-19T01:33:46Z","title":"Learning Object State Changes in Videos: An Open-World Perspective","summary":" Object State Changes (OSCs) are pivotal for video understanding. While humans\ncan effortlessly generalize OSC understanding from familiar to unknown objects,\ncurrent approaches are confined to a closed vocabulary. Addressing this gap, we\nintroduce a novel open-world formulation for the video OSC problem. The goal is\nto temporally localize the three stages of an OSC -- the object's initial\nstate, its transitioning state, and its end state -- whether or not the object\nhas been observed during training. Towards this end, we develop VidOSC, a\nholistic learning approach that: (1) leverages text and vision-language models\nfor supervisory signals to obviate manually labeling OSC training data, and (2)\nabstracts fine-grained shared state representations from objects to enhance\ngeneralization. Furthermore, we present HowToChange, the first open-world\nbenchmark for video OSC localization, which offers an order of magnitude\nincrease in the label space and annotation volume compared to the best existing\nbenchmark. 
Experimental results demonstrate the efficacy of our approach, in\nboth traditional closed-world and open-world scenarios.\n","authors":["Zihui Xue","Kumar Ashutosh","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2312.11782v2.pdf","comment":"Accepted by CVPR 2024, Project website:\n https://vision.cs.utexas.edu/projects/VidOSC/"},{"id":"http://arxiv.org/abs/2404.01717v2","updated":"2024-04-03T16:46:27Z","published":"2024-04-02T08:07:38Z","title":"AddSR: Accelerating Diffusion-based Blind Super-Resolution with\n Adversarial Diffusion Distillation","summary":" Blind super-resolution methods based on stable diffusion showcase formidable\ngenerative capabilities in reconstructing clear high-resolution images with\nintricate details from low-resolution inputs. However, their practical\napplicability is often hampered by poor efficiency, stemming from the\nrequirement of thousands or hundreds of sampling steps. Inspired by the\nefficient text-to-image approach adversarial diffusion distillation (ADD), we\ndesign AddSR to address this issue by incorporating the ideas of both\ndistillation and ControlNet. Specifically, we first propose a prediction-based\nself-refinement strategy to provide high-frequency information in the student\nmodel output with marginal additional time cost. Furthermore, we refine the\ntraining process by employing HR images, rather than LR images, to regulate the\nteacher model, providing a more robust constraint for distillation. Second, we\nintroduce a timestep-adapting loss to address the perception-distortion\nimbalance problem introduced by ADD. Extensive experiments demonstrate our\nAddSR generates better restoration results, while achieving faster speed than\nprevious SD-based state-of-the-art models (e.g., 7x faster than SeeSR).\n","authors":["Rui Xie","Ying Tai","Kai Zhang","Zhenyu Zhang","Jun Zhou","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2404.01717v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02845v1","updated":"2024-04-03T16:23:37Z","published":"2024-04-03T16:23:37Z","title":"Cross-Modal Conditioned Reconstruction for Language-guided Medical Image\n Segmentation","summary":" Recent developments underscore the potential of textual information in\nenhancing learning models for a deeper understanding of medical visual\nsemantics. However, language-guided medical image segmentation still faces a\nchallenging issue. Previous works employ implicit and ambiguous architectures\nto embed textual information. This leads to segmentation results that are\ninconsistent with the semantics represented by the language, sometimes even\ndiverging significantly. To this end, we propose a novel cross-modal\nconditioned Reconstruction for Language-guided Medical Image Segmentation\n(RecLMIS) to explicitly capture cross-modal interactions, which assumes that\nwell-aligned medical visual features and medical notes can effectively\nreconstruct each other. We introduce conditioned interaction to adaptively\npredict patches and words of interest. Subsequently, they are utilized as\nconditioning factors for mutual reconstruction to align with regions described\nin the medical notes. Extensive experiments demonstrate the superiority of our\nRecLMIS, surpassing LViT by 3.74% mIoU on the publicly available MosMedData+\ndataset and achieving an average increase of 1.89% mIoU for cross-domain tests\non our QATA-CoV19 dataset. Simultaneously, we achieve a relative reduction of\n20.2% in parameter count and a 55.5% decrease in computational load. 
The code\nwill be available at https://github.com/ShashankHuang/RecLMIS.\n","authors":["Xiaoshuang Huang","Hongxiang Li","Meng Cao","Long Chen","Chenyu You","Dong An"],"pdf_url":"https://arxiv.org/pdf/2404.02845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02830v1","updated":"2024-04-03T16:04:59Z","published":"2024-04-03T16:04:59Z","title":"Enhancing Interpretability of Vertebrae Fracture Grading using\n Human-interpretable Prototypes","summary":" Vertebral fracture grading classifies the severity of vertebral fractures,\nwhich is a challenging task in medical imaging and has recently attracted Deep\nLearning (DL) models. Only a few works attempted to make such models\nhuman-interpretable despite the need for transparency and trustworthiness in\ncritical use cases like DL-assisted medical diagnosis. Moreover, such models\neither rely on post-hoc methods or additional annotations. In this work, we\npropose a novel interpretable-by-design method, ProtoVerse, to find relevant\nsub-parts of vertebral fractures (prototypes) that reliably explain the model's\ndecision in a human-understandable way. Specifically, we introduce a novel\ndiversity-promoting loss to mitigate prototype repetitions in small datasets\nwith intricate semantics. We have experimented with the VerSe'19 dataset and\noutperformed the existing prototype-based method. Further, our model provides\nsuperior interpretability against the post-hoc method. Importantly, expert\nradiologists validated the visual interpretability of our results, showing\nclinical applicability.\n","authors":["Poulami Sinhamahapatra","Suprosanna Shit","Anjany Sekuboyina","Malek Husseini","David Schinz","Nicolas Lenhart","Joern Menze","Jan Kirschke","Karsten Roscher","Stephan Guennemann"],"pdf_url":"https://arxiv.org/pdf/2404.02830v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10835v3","updated":"2024-04-03T16:00:18Z","published":"2023-12-17T22:40:38Z","title":"Your Student is Better Than Expected: Adaptive Teacher-Student\n Collaboration for Text-Conditional Diffusion Models","summary":" Knowledge distillation methods have recently shown to be a promising\ndirection to speedup the synthesis of large-scale diffusion models by requiring\nonly a few inference steps. While several powerful distillation methods were\nrecently proposed, the overall quality of student samples is typically lower\ncompared to the teacher ones, which hinders their practical usage. In this\nwork, we investigate the relative quality of samples produced by the teacher\ntext-to-image diffusion model and its distilled student version. As our main\nempirical finding, we discover that a noticeable portion of student samples\nexhibit superior fidelity compared to the teacher ones, despite the\n\"approximate\" nature of the student. Based on this finding, we propose an\nadaptive collaboration between student and teacher diffusion models for\neffective text-to-image synthesis. Specifically, the distilled model produces\nthe initial sample, and then an oracle decides whether it needs further\nimprovements with a slow teacher model. Extensive experiments demonstrate that\nthe designed pipeline surpasses state-of-the-art text-to-image alternatives for\nvarious inference budgets in terms of human preference. 
Furthermore, the\nproposed approach can be naturally used in popular applications such as\ntext-guided image editing and controllable generation.\n","authors":["Nikita Starodubcev","Artem Fedorov","Artem Babenko","Dmitry Baranchuk"],"pdf_url":"https://arxiv.org/pdf/2312.10835v3.pdf","comment":"CVPR2024 camera ready"},{"id":"http://arxiv.org/abs/2312.10389v2","updated":"2024-04-03T15:54:15Z","published":"2023-12-16T09:04:44Z","title":"ElasticLaneNet: An Efficient Geometry-Flexible Approach for Lane\n Detection","summary":" The task of lane detection involves identifying the boundaries of driving\nareas in real-time. Recognizing lanes with variable and complex geometric\nstructures remains a challenge. In this paper, we explore a novel and flexible\nway of implicit lanes representation named \\textit{Elastic Lane map (ELM)}, and\nintroduce an efficient physics-informed end-to-end lane detection framework,\nnamely, ElasticLaneNet (Elastic interaction energy-informed Lane detection\nNetwork). The approach considers predicted lanes as moving zero-contours on the\nflexibly shaped \\textit{ELM} that are attracted to the ground truth guided by\nan elastic interaction energy-loss function (EIE loss). Our framework well\nintegrates the global information and low-level features. The method performs\nwell in complex lane scenarios, including those with large curvature, weak\ngeometry features at intersections, complicated cross lanes, Y-shapes lanes,\ndense lanes, etc. We apply our approach on three datasets: SDLane, CULane, and\nTuSimple. The results demonstrate exceptional performance of our method, with\nthe state-of-the-art results on the structurally diverse SDLane, achieving\nF1-score of 89.51, Recall rate of 87.50, and Precision of 91.61 with fast\ninference speed.\n","authors":["Yaxin Feng","Yuan Lan","Luchan Zhang","Yang Xiang"],"pdf_url":"https://arxiv.org/pdf/2312.10389v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05247v2","updated":"2024-04-03T15:40:00Z","published":"2023-12-08T18:55:24Z","title":"Dynamic LiDAR Re-simulation using Compositional Neural Fields","summary":" We introduce DyNFL, a novel neural field-based approach for high-fidelity\nre-simulation of LiDAR scans in dynamic driving scenes. DyNFL processes LiDAR\nmeasurements from dynamic environments, accompanied by bounding boxes of moving\nobjects, to construct an editable neural field. This field, comprising\nseparately reconstructed static background and dynamic objects, allows users to\nmodify viewpoints, adjust object positions, and seamlessly add or remove\nobjects in the re-simulated scene. A key innovation of our method is the neural\nfield composition technique, which effectively integrates reconstructed neural\nassets from various scenes through a ray drop test, accounting for occlusions\nand transparent surfaces. 
Our evaluation with both synthetic and real-world\nenvironments demonstrates that DyNFL substantially improves dynamic scene LiDAR\nsimulation, offering a combination of physical fidelity and flexible editing\ncapabilities.\n","authors":["Hanfeng Wu","Xingxing Zuo","Stefan Leutenegger","Or Litany","Konrad Schindler","Shengyu Huang"],"pdf_url":"https://arxiv.org/pdf/2312.05247v2.pdf","comment":"Project page: https://shengyuh.github.io/dynfl"},{"id":"http://arxiv.org/abs/2312.15702v2","updated":"2024-04-03T15:38:12Z","published":"2023-12-25T11:54:07Z","title":"Three Heads Are Better Than One: Complementary Experts for Long-Tailed\n Semi-supervised Learning","summary":" We address the challenging problem of Long-Tailed Semi-Supervised Learning\n(LTSSL) where labeled data exhibit imbalanced class distribution and unlabeled\ndata follow an unknown distribution. Unlike in balanced SSL, the generated\npseudo-labels are skewed towards head classes, intensifying the training bias.\nSuch a phenomenon is even amplified as more unlabeled data will be mislabeled\nas head classes when the class distribution of labeled and unlabeled datasets\nare mismatched. To solve this problem, we propose a novel method named\nComPlementary Experts (CPE). Specifically, we train multiple experts to model\nvarious class distributions, each of them yielding high-quality pseudo-labels\nwithin one form of class distribution. Besides, we introduce Classwise Batch\nNormalization for CPE to avoid performance degradation caused by feature\ndistribution mismatch between head and non-head classes. CPE achieves\nstate-of-the-art performances on CIFAR-10-LT, CIFAR-100-LT, and STL-10-LT\ndataset benchmarks. For instance, on CIFAR-10-LT, CPE improves test accuracy by\nover 2.22% compared to baselines. Code is available at\nhttps://github.com/machengcheng2016/CPE-LTSSL.\n","authors":["Chengcheng Ma","Ismail Elezi","Jiankang Deng","Weiming Dong","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2312.15702v2.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2404.02813v1","updated":"2024-04-03T15:37:02Z","published":"2024-04-03T15:37:02Z","title":"GPU-Accelerated RSF Level Set Evolution for Large-Scale Microvascular\n Segmentation","summary":" Microvascular networks are challenging to model because these structures are\ncurrently near the diffraction limit for most advanced three-dimensional\nimaging modalities, including confocal and light sheet microscopy. This makes\nsemantic segmentation difficult, because individual components of these\nnetworks fluctuate within the confines of individual pixels. Level set methods\nare ideally suited to solve this problem by providing surface and topological\nconstraints on the resulting model, however these active contour techniques are\nextremely time intensive and impractical for terabyte-scale images. We propose\na reformulation and implementation of the region-scalable fitting (RSF) level\nset model that makes it amenable to three-dimensional evaluation using both\nsingle-instruction multiple data (SIMD) and single-program multiple-data (SPMD)\nparallel processing. 
This enables evaluation of the level set equation on\nindependent regions of the data set using graphics processing units (GPUs),\nmaking large-scale segmentation of high-resolution networks practical and\ninexpensive.\n We tested this 3D parallel RSF approach on multiple data sets acquired using\nstate-of-the-art imaging techniques to acquire microvascular data, including\nmicro-CT, light sheet fluorescence microscopy (LSFM) and milling microscopy. To\nassess the performance and accuracy of the RSF model, we conducted a\nMonte-Carlo-based validation technique to compare results to other segmentation\nmethods. We also provide a rigorous profiling to show the gains in processing\nspeed leveraging parallel hardware. This study showcases the practical\napplication of the RSF model, emphasizing its utility in the challenging domain\nof segmenting large-scale high-topology network structures with a particular\nfocus on building microvascular models.\n","authors":["Meher Niger","Helya Goharbavang","Taeyong Ahn","Emily K. Alley","Joshua D. Wythe","Guoning Chen","David Mayerich"],"pdf_url":"https://arxiv.org/pdf/2404.02813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01449v2","updated":"2024-04-03T15:32:17Z","published":"2023-10-02T01:30:42Z","title":"Elastic Interaction Energy-Informed Real-Time Traffic Scene Perception","summary":" Urban segmentation and lane detection are two important tasks for traffic\nscene perception. Accuracy and fast inference speed of visual perception are\ncrucial for autonomous driving safety. Fine and complex geometric objects are\nthe most challenging but important recognition targets in traffic scene, such\nas pedestrians, traffic signs and lanes. In this paper, a simple and efficient\ntopology-aware energy loss function-based network training strategy named\nEIEGSeg is proposed. EIEGSeg is designed for multi-class segmentation on\nreal-time traffic scene perception. To be specific, the convolutional neural\nnetwork (CNN) extracts image features and produces multiple outputs, and the\nelastic interaction energy loss function (EIEL) drives the predictions moving\ntoward the ground truth until they are completely overlapped. Our strategy\nperforms well especially on fine-scale structure, \\textit{i.e.} small or\nirregularly shaped objects can be identified more accurately, and discontinuity\nissues on slender objects can be improved. We quantitatively and qualitatively\nanalyze our method on three traffic datasets, including urban scene\nsegmentation data Cityscapes and lane detection data TuSimple and CULane. Our\nresults demonstrate that EIEGSeg consistently improves the performance,\nespecially on real-time, lightweight networks that are better suited for\nautonomous driving.\n","authors":["Yaxin Feng","Yuan Lan","Luchan Zhang","Guoqing Liu","Yang Xiang"],"pdf_url":"https://arxiv.org/pdf/2310.01449v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08730v2","updated":"2024-04-03T15:22:23Z","published":"2024-03-13T17:29:45Z","title":"Strengthening Multimodal Large Language Model with Bootstrapped\n Preference Optimization","summary":" Multimodal Large Language Models (MLLMs) excel in generating responses based\non visual inputs. However, they often suffer from a bias towards generating\nresponses similar to their pretraining corpus, overshadowing the importance of\nvisual information. We treat this bias as a \"preference\" for pretraining\nstatistics, which hinders the model's grounding in visual input. 
To mitigate\nthis issue, we propose Bootstrapped Preference Optimization (BPO), which\nconducts preference learning with datasets containing negative responses\nbootstrapped from the model itself. Specifically, we propose the following two\nstrategies: 1) using distorted image inputs to the MLLM for eliciting responses\nthat contain signified pretraining bias; 2) leveraging text-based LLM to\nexplicitly inject erroneous but common elements into the original response.\nThose undesirable responses are paired with original annotated responses from\nthe datasets to construct the preference dataset, which is subsequently\nutilized to perform preference learning. Our approach effectively suppresses\npretrained LLM bias, enabling enhanced grounding in visual inputs. Extensive\nexperimentation demonstrates significant performance improvements across\nmultiple benchmarks, advancing the state-of-the-art in multimodal\nconversational systems.\n","authors":["Renjie Pi","Tianyang Han","Wei Xiong","Jipeng Zhang","Runtao Liu","Rui Pan","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.08730v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2012.02689v2","updated":"2024-04-03T15:18:00Z","published":"2020-12-04T15:58:34Z","title":"Isometric Multi-Shape Matching","summary":" Finding correspondences between shapes is a fundamental problem in computer\nvision and graphics, which is relevant for many applications, including 3D\nreconstruction, object tracking, and style transfer. The vast majority of\ncorrespondence methods aim to find a solution between pairs of shapes, even if\nmultiple instances of the same class are available. While isometries are often\nstudied in shape correspondence problems, they have not been considered\nexplicitly in the multi-matching setting. This paper closes this gap by\nproposing a novel optimisation formulation for isometric multi-shape matching.\nWe present a suitable optimisation algorithm for solving our formulation and\nprovide a convergence and complexity analysis. Our algorithm obtains\nmulti-matchings that are by construction provably cycle-consistent. We\ndemonstrate the superior performance of our method on various datasets and set\nthe new state-of-the-art in isometric multi-shape matching.\n","authors":["Maolin Gao","Zorah Lähner","Johan Thunberg","Daniel Cremers","Florian Bernard"],"pdf_url":"https://arxiv.org/pdf/2012.02689v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07169v3","updated":"2024-04-03T15:11:33Z","published":"2023-12-12T11:13:17Z","title":"Semi-supervised Active Learning for Video Action Detection","summary":" In this work, we focus on label efficient learning for video action\ndetection. We develop a novel semi-supervised active learning approach which\nutilizes both labeled as well as unlabeled data along with informative sample\nselection for action detection. Video action detection requires spatio-temporal\nlocalization along with classification, which poses several challenges for both\nactive learning informative sample selection as well as semi-supervised\nlearning pseudo label generation. First, we propose NoiseAug, a simple\naugmentation strategy which effectively selects informative samples for video\naction detection. Next, we propose fft-attention, a novel technique based on\nhigh-pass filtering which enables effective utilization of pseudo label for SSL\nin video action detection by emphasizing on relevant activity region within a\nvideo. 
We evaluate the proposed approach on three different benchmark datasets,\nUCF-101-24, JHMDB-21, and Youtube-VOS. First, we demonstrate its effectiveness\non video action detection where the proposed approach outperforms prior works\nin semi-supervised and weakly-supervised learning along with several baseline\napproaches in both UCF101-24 and JHMDB-21. Next, we also show its effectiveness\non Youtube-VOS for video object segmentation demonstrating its generalization\ncapability for other dense prediction tasks in videos. The code and models is\npublicly available at:\n\\url{https://github.com/AKASH2907/semi-sup-active-learning}.\n","authors":["Ayush Singh","Aayush J Rana","Akash Kumar","Shruti Vyas","Yogesh Singh Rawat"],"pdf_url":"https://arxiv.org/pdf/2312.07169v3.pdf","comment":"AAAI Conference on Artificial Intelligence, Main Technical Track\n (AAAI), 2024, Code: https://github.com/AKASH2907/semi-sup-active-learning"},{"id":"http://arxiv.org/abs/2311.16432v2","updated":"2024-04-03T15:05:28Z","published":"2023-11-28T02:27:31Z","title":"Text-Driven Image Editing via Learnable Regions","summary":" Language has emerged as a natural interface for image editing. In this paper,\nwe introduce a method for region-based image editing driven by textual prompts,\nwithout the need for user-provided masks or sketches. Specifically, our\napproach leverages an existing pre-trained text-to-image model and introduces a\nbounding box generator to identify the editing regions that are aligned with\nthe textual prompts. We show that this simple approach enables flexible editing\nthat is compatible with current image generation models, and is able to handle\ncomplex prompts featuring multiple objects, complex sentences, or lengthy\nparagraphs. We conduct an extensive user study to compare our method against\nstate-of-the-art methods. The experiments demonstrate the competitive\nperformance of our method in manipulating images with high fidelity and realism\nthat correspond to the provided language descriptions. Our project webpage can\nbe found at: https://yuanze-lin.me/LearnableRegions_page.\n","authors":["Yuanze Lin","Yi-Wen Chen","Yi-Hsuan Tsai","Lu Jiang","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2311.16432v2.pdf","comment":"Accepted to CVPR 2024 Project webpage:\n https://yuanze-lin.me/LearnableRegions_page"},{"id":"http://arxiv.org/abs/2308.08393v2","updated":"2024-04-03T15:04:03Z","published":"2023-08-16T14:25:30Z","title":"SIGMA: Scale-Invariant Global Sparse Shape Matching","summary":" We propose a novel mixed-integer programming (MIP) formulation for generating\nprecise sparse correspondences for highly non-rigid shapes. To this end, we\nintroduce a projected Laplace-Beltrami operator (PLBO) which combines intrinsic\nand extrinsic geometric information to measure the deformation quality induced\nby predicted correspondences. We integrate the PLBO, together with an\norientation-aware regulariser, into a novel MIP formulation that can be solved\nto global optimality for many practical problems. In contrast to previous\nmethods, our approach is provably invariant to rigid transformations and global\nscaling, initialisation-free, has optimality guarantees, and scales to high\nresolution meshes with (empirically observed) linear time. 
We show\nstate-of-the-art results for sparse non-rigid matching on several challenging\n3D datasets, including data with inconsistent meshing, as well as applications\nin mesh-to-point-cloud matching.\n","authors":["Maolin Gao","Paul Roetzer","Marvin Eisenberger","Zorah Lähner","Michael Moeller","Daniel Cremers","Florian Bernard"],"pdf_url":"https://arxiv.org/pdf/2308.08393v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2210.01708v3","updated":"2024-04-03T15:01:26Z","published":"2022-10-04T16:08:54Z","title":"Conquering the Communication Constraints to Enable Large Pre-Trained\n Models in Federated Learning","summary":" Federated learning (FL) has emerged as a promising paradigm for enabling the\ncollaborative training of models without centralized access to the raw data on\nlocal devices. In the typical FL paradigm (e.g., FedAvg), model weights are\nsent to and from the server each round to participating clients. Recently, the\nuse of small pre-trained models has been shown effective in federated learning\noptimization and improving convergence. However, recent state-of-the-art\npre-trained models are getting more capable but also have more parameters. In\nconventional FL, sharing the enormous model weights can quickly put a massive\ncommunication burden on the system, especially if more capable models are\nemployed. Can we find a solution to enable those strong and readily-available\npre-trained models in FL to achieve excellent performance while simultaneously\nreducing the communication burden? To this end, we investigate the use of\nparameter-efficient fine-tuning in federated learning and thus introduce a new\nframework: FedPEFT. Specifically, we systemically evaluate the performance of\nFedPEFT across a variety of client stability, data distribution, and\ndifferential privacy settings. By only locally tuning and globally sharing a\nsmall portion of the model weights, significant reductions in the total\ncommunication overhead can be achieved while maintaining competitive or even\nbetter performance in a wide range of federated learning scenarios, providing\ninsight into a new paradigm for practical and effective federated systems.\n","authors":["Guangyu Sun","Umar Khalid","Matias Mendieta","Taojiannan Yang","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2210.01708v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14505v3","updated":"2024-04-03T14:59:08Z","published":"2024-02-22T12:55:01Z","title":"Towards Seamless Adaptation of Pre-trained Models for Visual Place\n Recognition","summary":" Recent studies show that vision models pre-trained in generic visual learning\ntasks with large-scale data can provide useful feature representations for a\nwide range of visual perception problems. However, few attempts have been made\nto exploit pre-trained foundation models in visual place recognition (VPR). Due\nto the inherent difference in training objectives and data between the tasks of\nmodel pre-training and VPR, how to bridge the gap and fully unleash the\ncapability of pre-trained models for VPR is still a key issue to address. To\nthis end, we propose a novel method to realize seamless adaptation of\npre-trained models for VPR. Specifically, to obtain both global and local\nfeatures that focus on salient landmarks for discriminating places, we design a\nhybrid adaptation method to achieve both global and local adaptation\nefficiently, in which only lightweight adapters are tuned without adjusting the\npre-trained model. 
Besides, to guide effective adaptation, we propose a mutual\nnearest neighbor local feature loss, which ensures proper dense local features\nare produced for local matching and avoids time-consuming spatial verification\nin re-ranking. Experimental results show that our method outperforms the\nstate-of-the-art methods with less training data and training time, and uses\nonly about 3% of the retrieval runtime of the two-stage VPR methods with RANSAC-based\nspatial verification. It ranks 1st on the MSLS challenge leaderboard (at the\ntime of submission). The code is released at\nhttps://github.com/Lu-Feng/SelaVPR.\n","authors":["Feng Lu","Lijun Zhang","Xiangyuan Lan","Shuting Dong","Yaowei Wang","Chun Yuan"],"pdf_url":"https://arxiv.org/pdf/2402.14505v3.pdf","comment":"ICLR2024"},{"id":"http://arxiv.org/abs/2404.02790v1","updated":"2024-04-03T14:58:00Z","published":"2024-04-03T14:58:00Z","title":"MULAN: A Multi Layer Annotated Dataset for Controllable Text-to-Image\n Generation","summary":" Text-to-image generation has achieved astonishing results, yet precise\nspatial controllability and prompt fidelity remain highly challenging. This\nlimitation is typically addressed through cumbersome prompt engineering, scene\nlayout conditioning, or image editing techniques which often require hand-drawn\nmasks. Nonetheless, pre-existing works struggle to take advantage of the\nnatural instance-level compositionality of scenes due to the typically flat\nnature of rasterized RGB output images. Towards addressing this challenge, we\nintroduce MuLAn: a novel dataset comprising over 44K MUlti-Layer ANnotations of\nRGB images as multilayer, instance-wise RGBA decompositions, and over 100K\ninstance images. To build MuLAn, we developed a training-free pipeline which\ndecomposes a monocular RGB image into a stack of RGBA layers comprising\nbackground and isolated instances. We achieve this through the use of\npretrained general-purpose models, and by developing three modules: image\ndecomposition for instance discovery and extraction, instance completion to\nreconstruct occluded areas, and image re-assembly. We use our pipeline to\ncreate MuLAn-COCO and MuLAn-LAION datasets, which contain a variety of image\ndecompositions in terms of style, composition and complexity. With MuLAn, we\nprovide the first photorealistic resource providing instance decomposition and\nocclusion information for high quality images, opening up new avenues for\ntext-to-image generative AI research. With this, we aim to encourage the\ndevelopment of novel generation and editing technology, in particular\nlayer-wise solutions. MuLAn data resources are available at\nhttps://MuLAn-dataset.github.io/.\n","authors":["Petru-Daniel Tudosiu","Yongxin Yang","Shifeng Zhang","Fei Chen","Steven McDonagh","Gerasimos Lampouras","Ignacio Iacobacci","Sarah Parisot"],"pdf_url":"https://arxiv.org/pdf/2404.02790v1.pdf","comment":"CVPR 2024 - Project page: https://MuLAn-dataset.github.io/"},{"id":"http://arxiv.org/abs/2404.02788v1","updated":"2024-04-03T14:56:06Z","published":"2024-04-03T14:56:06Z","title":"GenN2N: Generative NeRF2NeRF Translation","summary":" We present GenN2N, a unified NeRF-to-NeRF translation framework for various\nNeRF translation tasks such as text-driven NeRF editing, colorization,\nsuper-resolution, inpainting, etc. 
Unlike previous methods designed for\nindividual translation tasks with task-specific schemes, GenN2N achieves all\nthese NeRF editing tasks by employing a plug-and-play image-to-image translator\nto perform editing in the 2D domain and lifting 2D edits into the 3D NeRF\nspace. Since the 3D consistency of 2D edits may not be assured, we propose to\nmodel the distribution of the underlying 3D edits through a generative model\nthat can cover all possible edited NeRFs. To model the distribution of 3D\nedited NeRFs from 2D edited images, we carefully design a VAE-GAN that encodes\nimages while decoding NeRFs. The latent space is trained to align with a\nGaussian distribution and the NeRFs are supervised through an adversarial loss\non its renderings. To ensure the latent code does not depend on 2D viewpoints\nbut truly reflects the 3D edits, we also regularize the latent code through a\ncontrastive learning scheme. Extensive experiments on various editing tasks\nshow GenN2N, as a universal framework, performs as well or better than\ntask-specific specialists while possessing flexible generative power. More\nresults on our project page: https://xiangyueliu.github.io/GenN2N/\n","authors":["Xiangyue Liu","Han Xue","Kunming Luo","Ping Tan","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2404.02788v1.pdf","comment":"Accepted to CVPR 2024. Project page:\n https://xiangyueliu.github.io/GenN2N/"},{"id":"http://arxiv.org/abs/2404.02785v1","updated":"2024-04-03T14:55:17Z","published":"2024-04-03T14:55:17Z","title":"Domain Generalization through Meta-Learning: A Survey","summary":" Deep neural networks (DNNs) have revolutionized artificial intelligence but\noften lack performance when faced with out-of-distribution (OOD) data, a common\nscenario due to the inevitable domain shifts in real-world applications. This\nlimitation stems from the common assumption that training and testing data\nshare the same distribution-an assumption frequently violated in practice.\nDespite their effectiveness with large amounts of data and computational power,\nDNNs struggle with distributional shifts and limited labeled data, leading to\noverfitting and poor generalization across various tasks and domains.\nMeta-learning presents a promising approach by employing algorithms that\nacquire transferable knowledge across various tasks for fast adaptation,\neliminating the need to learn each task from scratch. This survey paper delves\ninto the realm of meta-learning with a focus on its contribution to domain\ngeneralization. We first clarify the concept of meta-learning for domain\ngeneralization and introduce a novel taxonomy based on the feature extraction\nstrategy and the classifier learning methodology, offering a granular view of\nmethodologies. Through an exhaustive review of existing methods and underlying\ntheories, we map out the fundamentals of the field. 
Our survey provides\npractical insights and an informed discussion on promising research directions,\npaving the way for future innovation in meta-learning for domain\ngeneralization.\n","authors":["Arsham Gholamzadeh Khoee","Yinan Yu","Robert Feldt"],"pdf_url":"https://arxiv.org/pdf/2404.02785v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10206v2","updated":"2024-04-03T14:45:52Z","published":"2023-07-14T07:25:47Z","title":"NEAT: Distilling 3D Wireframes from Neural Attraction Fields","summary":" This paper studies the problem of structured 3D reconstruction using\nwireframes that consist of line segments and junctions, focusing on the\ncomputation of structured boundary geometries of scenes. Instead of leveraging\nmatching-based solutions from 2D wireframes (or line segments) for 3D wireframe\nreconstruction as done in prior arts, we present NEAT, a rendering-distilling\nformulation using neural fields to represent 3D line segments with 2D\nobservations, and bipartite matching for perceiving and distilling of a sparse\nset of 3D global junctions. The proposed {NEAT} enjoys the joint optimization\nof the neural fields and the global junctions from scratch, using\nview-dependent 2D observations without precomputed cross-view feature matching.\nComprehensive experiments on the DTU and BlendedMVS datasets demonstrate our\nNEAT's superiority over state-of-the-art alternatives for 3D wireframe\nreconstruction. Moreover, the distilled 3D global junctions by NEAT, are a\nbetter initialization than SfM points, for the recently-emerged 3D Gaussian\nSplatting for high-fidelity novel view synthesis using about 20 times fewer\ninitial 3D points. Project page: \\url{https://xuenan.net/neat}.\n","authors":["Nan Xue","Bin Tan","Yuxi Xiao","Liang Dong","Gui-Song Xia","Tianfu Wu","Yujun Shen"],"pdf_url":"https://arxiv.org/pdf/2307.10206v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.12865v3","updated":"2024-04-03T14:39:32Z","published":"2023-12-20T09:27:41Z","title":"RadEdit: stress-testing biomedical vision models via diffusion image\n editing","summary":" Biomedical imaging datasets are often small and biased, meaning that\nreal-world performance of predictive models can be substantially lower than\nexpected from internal testing. This work proposes using generative image\nediting to simulate dataset shifts and diagnose failure modes of biomedical\nvision models; this can be used in advance of deployment to assess readiness,\npotentially reducing cost and patient harm. Existing editing methods can\nproduce undesirable changes, with spurious correlations learned due to the\nco-occurrence of disease and treatment interventions, limiting practical\napplicability. To address this, we train a text-to-image diffusion model on\nmultiple chest X-ray datasets and introduce a new editing method RadEdit that\nuses multiple masks, if present, to constrain changes and ensure consistency in\nthe edited images. We consider three types of dataset shifts: acquisition\nshift, manifestation shift, and population shift, and demonstrate that our\napproach can diagnose failures and quantify model robustness without additional\ndata collection, complementing more qualitative tools for explainable AI.\n","authors":["Fernando Pérez-García","Sam Bond-Taylor","Pedro P. Sanchez","Boris van Breugel","Daniel C. Castro","Harshita Sharma","Valentina Salvatelli","Maria T. A. Wetscherek","Hannah Richardson","Matthew P. 
Lungren","Aditya Nori","Javier Alvarez-Valle","Ozan Oktay","Maximilian Ilse"],"pdf_url":"https://arxiv.org/pdf/2312.12865v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.06757v3","updated":"2024-04-03T14:31:43Z","published":"2022-11-12T22:29:42Z","title":"DriftRec: Adapting diffusion models to blind JPEG restoration","summary":" In this work, we utilize the high-fidelity generation abilities of diffusion\nmodels to solve blind JPEG restoration at high compression levels. We propose\nan elegant modification of the forward stochastic differential equation of\ndiffusion models to adapt them to this restoration task and name our method\nDriftRec. Comparing DriftRec against an $L_2$ regression baseline with the same\nnetwork architecture and state-of-the-art techniques for JPEG restoration, we\nshow that our approach can escape the tendency of other methods to generate\nblurry images, and recovers the distribution of clean images significantly more\nfaithfully. For this, only a dataset of clean/corrupted image pairs and no\nknowledge about the corruption operation is required, enabling wider\napplicability to other restoration tasks. In contrast to other conditional and\nunconditional diffusion models, we utilize the idea that the distributions of\nclean and corrupted images are much closer to each other than each is to the\nusual Gaussian prior of the reverse process in diffusion models. Our approach\ntherefore requires only low levels of added noise and needs comparatively few\nsampling steps even without further optimizations. We show that DriftRec\nnaturally generalizes to realistic and difficult scenarios such as unaligned\ndouble JPEG compression and blind restoration of JPEGs found online, without\nhaving encountered such examples during training.\n","authors":["Simon Welker","Henry N. Chapman","Timo Gerkmann"],"pdf_url":"https://arxiv.org/pdf/2211.06757v3.pdf","comment":"(C) 2024 IEEE. Personal use of this material is permitted. Permission\n from IEEE must be obtained for all other uses, in any current or future\n media, including reprinting/republishing this material for advertising or\n promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2312.02145v2","updated":"2024-04-03T14:14:18Z","published":"2023-12-04T18:59:13Z","title":"Repurposing Diffusion-Based Image Generators for Monocular Depth\n Estimation","summary":" Monocular depth estimation is a fundamental computer vision task. Recovering\n3D depth from a single image is geometrically ill-posed and requires scene\nunderstanding, so it is not surprising that the rise of deep learning has led\nto a breakthrough. The impressive progress of monocular depth estimators has\nmirrored the growth in model capacity, from relatively modest CNNs to large\nTransformer architectures. Still, monocular depth estimators tend to struggle\nwhen presented with images with unfamiliar content and layout, since their\nknowledge of the visual world is restricted by the data seen during training,\nand challenged by zero-shot generalization to new domains. This motivates us to\nexplore whether the extensive priors captured in recent generative diffusion\nmodels can enable better, more generalizable depth estimation. We introduce\nMarigold, a method for affine-invariant monocular depth estimation that is\nderived from Stable Diffusion and retains its rich prior knowledge. 
The\nestimator can be fine-tuned in a couple of days on a single GPU using only\nsynthetic training data. It delivers state-of-the-art performance across a wide\nrange of datasets, including over 20% performance gains in specific cases.\nProject page: https://marigoldmonodepth.github.io.\n","authors":["Bingxin Ke","Anton Obukhov","Shengyu Huang","Nando Metzger","Rodrigo Caye Daudt","Konrad Schindler"],"pdf_url":"https://arxiv.org/pdf/2312.02145v2.pdf","comment":"CVPR 2024 camera ready"},{"id":"http://arxiv.org/abs/2306.09320v4","updated":"2024-04-03T14:09:58Z","published":"2023-06-15T17:55:05Z","title":"Learnable Weight Initialization for Volumetric Medical Image\n Segmentation","summary":" Hybrid volumetric medical image segmentation models, combining the advantages\nof local convolution and global attention, have recently received considerable\nattention. While mainly focusing on architectural modifications, most existing\nhybrid approaches still use conventional data-independent weight initialization\nschemes which restrict their performance due to ignoring the inherent\nvolumetric nature of the medical data. To address this issue, we propose a\nlearnable weight initialization approach that utilizes the available medical\ntraining data to effectively learn the contextual and structural cues via the\nproposed self-supervised objectives. Our approach is easy to integrate into any\nhybrid model and requires no external training data. Experiments on multi-organ\nand lung cancer segmentation tasks demonstrate the effectiveness of our\napproach, leading to state-of-the-art segmentation performance. Our proposed\ndata-dependent initialization approach performs favorably as compared to the\nSwin-UNETR model pretrained using large-scale datasets on multi-organ\nsegmentation task. Our source code and models are available at:\nhttps://github.com/ShahinaKK/LWI-VMS.\n","authors":["Shahina Kunhimon","Abdelrahman Shaker","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2306.09320v4.pdf","comment":"Accepted at Elsevier AI in Medicine Journal"},{"id":"http://arxiv.org/abs/2404.02759v1","updated":"2024-04-03T14:05:39Z","published":"2024-04-03T14:05:39Z","title":"Unsupervised Occupancy Learning from Sparse Point Cloud","summary":" Implicit Neural Representations have gained prominence as a powerful\nframework for capturing complex data modalities, encompassing a wide range from\n3D shapes to images and audio. Within the realm of 3D shape representation,\nNeural Signed Distance Functions (SDF) have demonstrated remarkable potential\nin faithfully encoding intricate shape geometry. However, learning SDFs from 3D\npoint clouds in the absence of ground truth supervision remains a very\nchallenging task. In this paper, we propose a method to infer occupancy fields\ninstead of SDFs as they are easier to learn from sparse inputs. We leverage a\nmargin-based uncertainty measure to differentially sample from the decision\nboundary of the occupancy function and supervise the sampled boundary points\nusing the input point cloud. 
We further stabilize the optimization process at\nthe early stages of the training by biasing the occupancy function towards\nminimal entropy fields while maximizing its entropy at the input point cloud.\nThrough extensive experiments and evaluations, we illustrate the efficacy of\nour proposed method, highlighting its capacity to improve implicit shape\ninference with respect to baselines and the state-of-the-art using synthetic\nand real data.\n","authors":["Amine Ouasfi","Adnane Boukhayma"],"pdf_url":"https://arxiv.org/pdf/2404.02759v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02755v1","updated":"2024-04-03T13:57:08Z","published":"2024-04-03T13:57:08Z","title":"DIBS: Enhancing Dense Video Captioning with Unlabeled Videos via Pseudo\n Boundary Enrichment and Online Refinement","summary":" We present Dive Into the BoundarieS (DIBS), a novel pretraining framework for\ndense video captioning (DVC), that elaborates on improving the quality of the\ngenerated event captions and their associated pseudo event boundaries from\nunlabeled videos. By leveraging the capabilities of diverse large language\nmodels (LLMs), we generate rich DVC-oriented caption candidates and optimize\nthe corresponding pseudo boundaries under several meticulously designed\nobjectives, considering diversity, event-centricity, temporal ordering, and\ncoherence. Moreover, we further introduce a novel online boundary refinement\nstrategy that iteratively improves the quality of pseudo boundaries during\ntraining. Comprehensive experiments have been conducted to examine the\neffectiveness of the proposed technique components. By leveraging a substantial\namount of unlabeled video data, such as HowTo100M, we achieve a remarkable\nadvancement on standard DVC datasets like YouCook2 and ActivityNet. We\noutperform the previous state-of-the-art Vid2Seq across a majority of metrics,\nachieving this with just 0.4% of the unlabeled video data used for pre-training\nby Vid2Seq.\n","authors":["Hao Wu","Huabin Liu","Yu Qiao","Xiao Sun"],"pdf_url":"https://arxiv.org/pdf/2404.02755v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02747v1","updated":"2024-04-03T13:44:41Z","published":"2024-04-03T13:44:41Z","title":"Cross-Attention Makes Inference Cumbersome in Text-to-Image Diffusion\n Models","summary":" This study explores the role of cross-attention during inference in\ntext-conditional diffusion models. We find that cross-attention outputs\nconverge to a fixed point after few inference steps. Accordingly, the time\npoint of convergence naturally divides the entire inference process into two\nstages: an initial semantics-planning stage, during which, the model relies on\ncross-attention to plan text-oriented visual semantics, and a subsequent\nfidelity-improving stage, during which the model tries to generate images from\npreviously planned semantics. Surprisingly, ignoring text conditions in the\nfidelity-improving stage not only reduces computation complexity, but also\nmaintains model performance. This yields a simple and training-free method\ncalled TGATE for efficient generation, which caches the cross-attention output\nonce it converges and keeps it fixed during the remaining inference steps. Our\nempirical study on the MS-COCO validation set confirms its effectiveness. 
The\nsource code of TGATE is available at https://github.com/HaozheLiu-ST/T-GATE.\n","authors":["Wentian Zhang","Haozhe Liu","Jinheng Xie","Francesco Faccio","Mike Zheng Shou","Jürgen Schmidhuber"],"pdf_url":"https://arxiv.org/pdf/2404.02747v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15528v2","updated":"2024-04-03T13:40:14Z","published":"2024-03-22T17:27:18Z","title":"Evaluating GPT-4 with Vision on Detection of Radiological Findings on\n Chest Radiographs","summary":" The study examines the application of GPT-4V, a multi-modal large language\nmodel equipped with visual recognition, in detecting radiological findings from\na set of 100 chest radiographs and suggests that GPT-4V is currently not ready\nfor real-world diagnostic usage in interpreting chest radiographs.\n","authors":["Yiliang Zhou","Hanley Ong","Patrick Kennedy","Carol Wu","Jacob Kazam","Keith Hentel","Adam Flanders","George Shih","Yifan Peng"],"pdf_url":"https://arxiv.org/pdf/2403.15528v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02742v1","updated":"2024-04-03T13:39:29Z","published":"2024-04-03T13:39:29Z","title":"LiDAR4D: Dynamic Neural Fields for Novel Space-time View LiDAR Synthesis","summary":" Although neural radiance fields (NeRFs) have achieved triumphs in image novel\nview synthesis (NVS), LiDAR NVS remains largely unexplored. Previous LiDAR NVS\nmethods employ a simple shift from image NVS methods while ignoring the dynamic\nnature and the large-scale reconstruction problem of LiDAR point clouds. In\nlight of this, we propose LiDAR4D, a differentiable LiDAR-only framework for\nnovel space-time LiDAR view synthesis. In consideration of the sparsity and\nlarge-scale characteristics, we design a 4D hybrid representation combined with\nmulti-planar and grid features to achieve effective reconstruction in a\ncoarse-to-fine manner. Furthermore, we introduce geometric constraints derived\nfrom point clouds to improve temporal consistency. For the realistic synthesis\nof LiDAR point clouds, we incorporate the global optimization of ray-drop\nprobability to preserve cross-region patterns. Extensive experiments on\nKITTI-360 and NuScenes datasets demonstrate the superiority of our method in\naccomplishing geometry-aware and time-consistent dynamic reconstruction. Codes\nare available at https://github.com/ispc-lab/LiDAR4D.\n","authors":["Zehan Zheng","Fan Lu","Weiyi Xue","Guang Chen","Changjun Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.02742v1.pdf","comment":"Accepted by CVPR 2024. Project Page:\n https://dyfcalid.github.io/LiDAR4D"},{"id":"http://arxiv.org/abs/2404.02135v2","updated":"2024-04-03T13:36:38Z","published":"2024-04-02T17:48:46Z","title":"ResNet with Integrated Convolutional Block Attention Module for Ship\n Classification Using Transfer Learning on Optical Satellite Imagery","summary":" This study proposes a novel transfer learning framework for effective ship\nclassification using high-resolution optical remote sensing satellite imagery.\nThe framework is based on the deep convolutional neural network model ResNet50\nand incorporates the Convolutional Block Attention Module (CBAM) to enhance\nperformance. CBAM enables the model to attend to salient features in the\nimages, allowing it to better discriminate between subtle differences between\nships and backgrounds. Furthermore, this study adopts a transfer learning\napproach tailored for accurately classifying diverse types of ships by\nfine-tuning a pre-trained model for the specific task. 
Experimental results\ndemonstrate the efficacy of the proposed framework in ship classification using\noptical remote sensing imagery, achieving a high classification accuracy of 94%\nacross 5 classes, outperforming existing methods. This research holds potential\napplications in maritime surveillance and management, illegal fishing\ndetection, and maritime traffic monitoring.\n","authors":["Ryan Donghan Kwon","Gangjoo Robin Nam","Jisoo Tak","Yeom Hyeok","Junseob Shin","Hyerin Cha","Kim Soo Bin"],"pdf_url":"https://arxiv.org/pdf/2404.02135v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02738v1","updated":"2024-04-03T13:35:51Z","published":"2024-04-03T13:35:51Z","title":"Adaptive Affinity-Based Generalization For MRI Imaging Segmentation\n Across Resource-Limited Settings","summary":" The joint utilization of diverse data sources for medical imaging\nsegmentation has emerged as a crucial area of research, aiming to address\nchallenges such as data heterogeneity, domain shift, and data quality\ndiscrepancies. Integrating information from multiple data domains has shown\npromise in improving model generalizability and adaptability. However, this\napproach often demands substantial computational resources, hindering its\npracticality. In response, knowledge distillation (KD) has garnered attention\nas a solution. KD involves training light-weight models to emulate the behavior\nof more resource-intensive models, thereby mitigating the computational burden\nwhile maintaining performance. This paper addresses the pressing need to\ndevelop a lightweight and generalizable model for medical imaging segmentation\nthat can effectively handle data integration challenges. Our proposed approach\nintroduces a novel relation-based knowledge framework by seamlessly combining\nadaptive affinity-based and kernel-based distillation through a gram matrix\nthat can capture the style representation across features. This methodology\nempowers the student model to accurately replicate the feature representations\nof the teacher model, facilitating robust performance even in the face of\ndomain shift and data heterogeneity. To validate our innovative approach, we\nconducted experiments on publicly available multi-source prostate MRI data. The\nresults demonstrate a significant enhancement in segmentation performance using\nlightweight networks. Notably, our method achieves this improvement while\nreducing both inference time and storage usage, rendering it a practical and\nefficient solution for real-time medical imaging segmentation.\n","authors":["Eddardaa B. Loussaief","Mohammed Ayad","Domenc Puig","Hatem A. Rashwan"],"pdf_url":"https://arxiv.org/pdf/2404.02738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02733v1","updated":"2024-04-03T13:34:09Z","published":"2024-04-03T13:34:09Z","title":"InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image\n Generation","summary":" Tuning-free diffusion-based models have demonstrated significant potential in\nthe realm of image personalization and customization. However, despite this\nnotable progress, current models continue to grapple with several complex\nchallenges in producing style-consistent image generation. Firstly, the concept\nof style is inherently underdetermined, encompassing a multitude of elements\nsuch as color, material, atmosphere, design, and structure, among others.\nSecondly, inversion-based methods are prone to style degradation, often\nresulting in the loss of fine-grained details. 
Lastly, adapter-based approaches\nfrequently require meticulous weight tuning for each reference image to achieve\na balance between style intensity and text controllability. In this paper, we\ncommence by examining several compelling yet frequently overlooked\nobservations. We then proceed to introduce InstantStyle, a framework designed\nto address these issues through the implementation of two key strategies: 1) A\nstraightforward mechanism that decouples style and content from reference\nimages within the feature space, predicated on the assumption that features\nwithin the same space can be either added to or subtracted from one another. 2)\nThe injection of reference image features exclusively into style-specific\nblocks, thereby preventing style leaks and eschewing the need for cumbersome\nweight tuning, which often characterizes more parameter-heavy designs. Our work\ndemonstrates superior visual stylization outcomes, striking an optimal balance\nbetween the intensity of style and the controllability of textual elements. Our\ncodes will be available at https://github.com/InstantStyle/InstantStyle.\n","authors":["Haofan Wang","Qixun Wang","Xu Bai","Zekui Qin","Anthony Chen"],"pdf_url":"https://arxiv.org/pdf/2404.02733v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2404.02731v1","updated":"2024-04-03T13:30:56Z","published":"2024-04-03T13:30:56Z","title":"Event Camera Demosaicing via Swin Transformer and Pixel-focus Loss","summary":" Recent research has highlighted improvements in high-quality imaging guided\nby event cameras, with most of these efforts concentrating on the RGB domain.\nHowever, these advancements frequently neglect the unique challenges introduced\nby the inherent flaws in the sensor design of event cameras in the RAW domain.\nSpecifically, this sensor design results in the partial loss of pixel values,\nposing new challenges for RAW domain processes like demosaicing. The challenge\nintensifies as most research in the RAW domain is based on the premise that\neach pixel contains a value, making the straightforward adaptation of these\nmethods to event camera demosaicing problematic. To this end, we present a\nSwin-Transformer-based backbone and a pixel-focus loss function for demosaicing\nwith missing pixel values in RAW domain processing. Our core motivation is to\nrefine a general and widely applicable foundational model from the RGB domain\nfor RAW domain processing, thereby broadening the model's applicability within\nthe entire imaging process. Our method harnesses multi-scale processing and\nspace-to-depth techniques to ensure efficiency and reduce computing complexity.\nWe also propose the Pixel-focus Loss function for network fine-tuning to\nimprove network convergence based on our discovery of a long-tailed\ndistribution in training loss. Our method has undergone validation on the MIPI\nDemosaic Challenge dataset, with subsequent analytical experimentation\nconfirming its efficacy. 
All code and trained models are released here:\nhttps://github.com/yunfanLu/ev-demosaic\n","authors":["Yunfan Lu","Yijie Xu","Wenzong Ma","Weiyu Guo","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2404.02731v1.pdf","comment":"Accepted for the CVPR 2024 Workshop on Mobile Intelligent Photography\n & Imaging"},{"id":"http://arxiv.org/abs/2404.02726v1","updated":"2024-04-03T13:27:54Z","published":"2024-04-03T13:27:54Z","title":"Harnessing the Power of Large Vision Language Models for Synthetic Image\n Detection","summary":" In recent years, the emergence of models capable of generating images from\ntext has attracted considerable interest, offering the possibility of creating\nrealistic images from text descriptions. Yet these advances have also raised\nconcerns about the potential misuse of these images, including the creation of\nmisleading content such as fake news and propaganda. This study investigates\nthe effectiveness of using advanced vision-language models (VLMs) for synthetic\nimage identification. Specifically, the focus is on tuning state-of-the-art\nimage captioning models for synthetic image detection. By harnessing the robust\nunderstanding capabilities of large VLMs, the aim is to distinguish authentic\nimages from synthetic images produced by diffusion-based models. This study\ncontributes to the advancement of synthetic image detection by exploiting the\ncapabilities of visual language models such as BLIP-2 and ViTGPT2. By tailoring\nimage captioning models, we address the challenges associated with the\npotential misuse of synthetic images in real-world applications. Results\ndescribed in this paper highlight the promising role of VLMs in the field of\nsynthetic image detection, outperforming conventional image-based detection\ntechniques. Code and models can be found at\nhttps://github.com/Mamadou-Keita/VLM-DETECT.\n","authors":["Mamadou Keita","Wassim Hamidouche","Hassen Bougueffa","Abdenour Hadid","Abdelmalik Taleb-Ahmed"],"pdf_url":"https://arxiv.org/pdf/2404.02726v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2404.01959"},{"id":"http://arxiv.org/abs/2312.09056v2","updated":"2024-04-03T13:09:27Z","published":"2023-12-14T15:53:07Z","title":"ReCoRe: Regularized Contrastive Representation Learning of World Model","summary":" While recent model-free Reinforcement Learning (RL) methods have demonstrated\nhuman-level effectiveness in gaming environments, their success in everyday\ntasks like visual navigation has been limited, particularly under significant\nappearance variations. This limitation arises from (i) poor sample efficiency\nand (ii) over-fitting to training scenarios. To address these challenges, we\npresent a world model that learns invariant features using (i) contrastive\nunsupervised learning and (ii) an intervention-invariant regularizer. Learning\nan explicit representation of the world dynamics i.e. a world model, improves\nsample efficiency while contrastive learning implicitly enforces learning of\ninvariant features, which improves generalization. However, the na\\\"ive\nintegration of contrastive loss to world models is not good enough, as\nworld-model-based RL methods independently optimize representation learning and\nagent policy. To overcome this issue, we propose an intervention-invariant\nregularizer in the form of an auxiliary task such as depth prediction, image\ndenoising, image segmentation, etc., that explicitly enforces invariance to\nstyle interventions. 
Our method outperforms current state-of-the-art\nmodel-based and model-free RL methods and significantly improves on\nout-of-distribution point navigation tasks evaluated on the iGibson benchmark.\nWith only visual observations, we further demonstrate that our approach\noutperforms recent language-guided foundation models for point navigation,\nwhich is essential for deployment on robots with limited computation\ncapabilities. Finally, we demonstrate that our proposed model excels at the\nsim-to-real transfer of its perception module on the Gibson benchmark.\n","authors":["Rudra P. K. Poudel","Harit Pandya","Stephan Liwicki","Roberto Cipolla"],"pdf_url":"https://arxiv.org/pdf/2312.09056v2.pdf","comment":"Accepted at CVPR 2024. arXiv admin note: text overlap with\n arXiv:2209.14932"},{"id":"http://arxiv.org/abs/2403.13352v3","updated":"2024-04-03T13:08:55Z","published":"2024-03-20T07:31:07Z","title":"AGFSync: Leveraging AI-Generated Feedback for Preference Optimization in\n Text-to-Image Generation","summary":" Text-to-Image (T2I) diffusion models have achieved remarkable success in\nimage generation. Despite their progress, challenges remain in both\nprompt-following ability, image quality and lack of high-quality datasets,\nwhich are essential for refining these models. As acquiring labeled data is\ncostly, we introduce AGFSync, a framework that enhances T2I diffusion models\nthrough Direct Preference Optimization (DPO) in a fully AI-driven approach.\nAGFSync utilizes Vision-Language Models (VLM) to assess image quality across\nstyle, coherence, and aesthetics, generating feedback data within an AI-driven\nloop. By applying AGFSync to leading T2I models such as SD v1.4, v1.5, and\nSDXL, our extensive experiments on the TIFA dataset demonstrate notable\nimprovements in VQA scores, aesthetic evaluations, and performance on the HPSv2\nbenchmark, consistently outperforming the base models. AGFSync's method of\nrefining T2I diffusion models paves the way for scalable alignment techniques.\n","authors":["Jingkun An","Yinghao Zhu","Zongjian Li","Haoran Feng","Bohua Chen","Yemin Shi","Chengwei Pan"],"pdf_url":"https://arxiv.org/pdf/2403.13352v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05073v4","updated":"2024-04-03T13:07:55Z","published":"2023-09-10T16:42:11Z","title":"FreeMan: Towards Benchmarking 3D Human Pose Estimation under Real-World\n Conditions","summary":" Estimating the 3D structure of the human body from natural scenes is a\nfundamental aspect of visual perception. 3D human pose estimation is a vital\nstep in advancing fields like AIGC and human-robot interaction, serving as a\ncrucial technique for understanding and interacting with human actions in\nreal-world settings. However, the current datasets, often collected under\nsingle laboratory conditions using complex motion capture equipment and\nunvarying backgrounds, are insufficient. The absence of datasets on variable\nconditions is stalling the progress of this crucial task. To facilitate the\ndevelopment of 3D pose estimation, we present FreeMan, the first large-scale,\nmulti-view dataset collected under the real-world conditions. FreeMan was\ncaptured by synchronizing 8 smartphones across diverse scenarios. It comprises\n11M frames from 8000 sequences, viewed from different perspectives. These\nsequences cover 40 subjects across 10 different scenarios, each with varying\nlighting conditions. 
We have also established a semi-automated pipeline\ncontaining error detection to reduce the workload of manual checking and ensure\nprecise annotation. We provide comprehensive evaluation baselines for a range\nof tasks, underlining the significant challenges posed by FreeMan. Further\nevaluations of standard indoor/outdoor human sensing datasets reveal that\nFreeMan offers robust representation transferability in real and complex\nscenes. Code and data are available at https://wangjiongw.github.io/freeman.\n","authors":["Jiong Wang","Fengyu Yang","Wenbo Gou","Bingliang Li","Danqi Yan","Ailing Zeng","Yijun Gao","Junle Wang","Yanqing Jing","Ruimao Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.05073v4.pdf","comment":"CVPR2024 camera ready version. 19 pages, 16 figures. Project page:\n https://wangjiongw.github.io/freeman/ ; API:\n https://github.com/wangjiongw/FreeMan_API"},{"id":"http://arxiv.org/abs/2404.02697v1","updated":"2024-04-03T12:54:16Z","published":"2024-04-03T12:54:16Z","title":"Model-agnostic Origin Attribution of Generated Images with Few-shot\n Examples","summary":" Recent progress in visual generative models enables the generation of\nhigh-quality images. To prevent the misuse of generated images, it is important\nto identify the origin model that generates them. In this work, we study the\norigin attribution of generated images in a practical setting where only a few\nimages generated by a source model are available and the source model cannot be\naccessed. The goal is to check if a given image is generated by the source\nmodel. We first formulate this problem as a few-shot one-class classification\ntask. To solve the task, we propose OCC-CLIP, a CLIP-based framework for\nfew-shot one-class classification, enabling the identification of an image's\nsource model, even among multiple candidates. Extensive experiments\ncorresponding to various generative models verify the effectiveness of our\nOCC-CLIP framework. Furthermore, an experiment based on the recently released\nDALL-E 3 API verifies the real-world applicability of our solution.\n","authors":["Fengyuan Liu","Haochen Luo","Yiming Li","Philip Torr","Jindong Gu"],"pdf_url":"https://arxiv.org/pdf/2404.02697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12685v2","updated":"2024-04-03T12:47:15Z","published":"2023-09-22T07:51:17Z","title":"eWand: A calibration framework for wide baseline frame-based and\n event-based camera systems","summary":" Accurate calibration is crucial for using multiple cameras to triangulate the\nposition of objects precisely. However, it is also a time-consuming process\nthat needs to be repeated for every displacement of the cameras. The standard\napproach is to use a printed pattern with known geometry to estimate the\nintrinsic and extrinsic parameters of the cameras. The same idea can be applied\nto event-based cameras, though it requires extra work. By using frame\nreconstruction from events, a printed pattern can be detected. A blinking\npattern can also be displayed on a screen. Then, the pattern can be directly\ndetected from the events. Such calibration methods can provide accurate\nintrinsic calibration for both frame- and event-based cameras. However, using\n2D patterns has several limitations for multi-camera extrinsic calibration,\nwith cameras possessing highly different points of view and a wide baseline.\nThe 2D pattern can only be detected from one direction and needs to be of\nsignificant size to compensate for its distance to the camera. 
This makes the\nextrinsic calibration time-consuming and cumbersome. To overcome these\nlimitations, we propose eWand, a new method that uses blinking LEDs inside\nopaque spheres instead of a printed or displayed pattern. Our method provides a\nfaster, easier-to-use extrinsic calibration approach that maintains high\naccuracy for both event- and frame-based cameras.\n","authors":["Thomas Gossard","Andreas Ziegler","Levin Kolmar","Jonas Tebbe","Andreas Zell"],"pdf_url":"https://arxiv.org/pdf/2309.12685v2.pdf","comment":"Accepted for 2024 IEEE International Conference on Robotics and\n Automation (ICRA 2024). Project web page:\n https://cogsys-tuebingen.github.io/ewand/"},{"id":"http://arxiv.org/abs/2401.05827v2","updated":"2024-04-03T12:42:32Z","published":"2024-01-11T10:52:17Z","title":"Hallucination Benchmark in Medical Visual Question Answering","summary":" The recent success of large language and vision models (LLVMs) on vision\nquestion answering (VQA), particularly their applications in medicine\n(Med-VQA), has shown a great potential of realizing effective visual assistants\nfor healthcare. However, these models are not extensively tested on the\nhallucination phenomenon in clinical settings. Here, we created a hallucination\nbenchmark of medical images paired with question-answer sets and conducted a\ncomprehensive evaluation of the state-of-the-art models. The study provides an\nin-depth analysis of current models' limitations and reveals the effectiveness\nof various prompting strategies.\n","authors":["Jinge Wu","Yunsoo Kim","Honghan Wu"],"pdf_url":"https://arxiv.org/pdf/2401.05827v2.pdf","comment":"Accepted to ICLR 2024 Tiny Papers(Notable)"},{"id":"http://arxiv.org/abs/2404.02686v1","updated":"2024-04-03T12:32:13Z","published":"2024-04-03T12:32:13Z","title":"Design2Cloth: 3D Cloth Generation from 2D Masks","summary":" In recent years, there has been a significant shift in the field of digital\navatar research, towards modeling, animating and reconstructing clothed human\nrepresentations, as a key step towards creating realistic avatars. However,\ncurrent 3D cloth generation methods are garment specific or trained completely\non synthetic data, hence lacking fine details and realism. In this work, we\nmake a step towards automatic realistic garment design and propose\nDesign2Cloth, a high fidelity 3D generative model trained on a real world\ndataset from more than 2000 subject scans. To provide vital contribution to the\nfashion industry, we developed a user-friendly adversarial model capable of\ngenerating diverse and detailed clothes simply by drawing a 2D cloth mask.\nUnder a series of both qualitative and quantitative experiments, we showcase\nthat Design2Cloth outperforms current state-of-the-art cloth generative models\nby a large margin. In addition to the generative properties of our network, we\nshowcase that the proposed method can be used to achieve high quality\nreconstructions from single in-the-wild images and 3D scans. 
Dataset, code and\npre-trained model will become publicly available.\n","authors":["Jiali Zheng","Rolandos Alexandros Potamias","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2404.02686v1.pdf","comment":"Accepted to CVPR 2024, Project page:\n https://jiali-zheng.github.io/Design2Cloth/"},{"id":"http://arxiv.org/abs/2404.02678v1","updated":"2024-04-03T12:21:41Z","published":"2024-04-03T12:21:41Z","title":"Independently Keypoint Learning for Small Object Semantic Correspondence","summary":" Semantic correspondence remains a challenging task for establishing\ncorrespondences between a pair of images with the same category or similar\nscenes due to the large intra-class appearance. In this paper, we introduce a\nnovel problem called 'Small Object Semantic Correspondence (SOSC).' This\nproblem is challenging due to the close proximity of keypoints associated with\nsmall objects, which results in the fusion of these respective features. It is\ndifficult to identify the corresponding key points of the fused features, and\nit is also difficult to be recognized. To address this challenge, we propose\nthe Keypoint Bounding box-centered Cropping (KBC) method, which aims to\nincrease the spatial separation between keypoints of small objects, thereby\nfacilitating independent learning of these keypoints. The KBC method is\nseamlessly integrated into our proposed inference pipeline and can be easily\nincorporated into other methodologies, resulting in significant performance\nenhancements. Additionally, we introduce a novel framework, named KBCNet, which\nserves as our baseline model. KBCNet comprises a Cross-Scale Feature Alignment\n(CSFA) module and an efficient 4D convolutional decoder. The CSFA module is\ndesigned to align multi-scale features, enriching keypoint representations by\nintegrating fine-grained features and deep semantic features. Meanwhile, the 4D\nconvolutional decoder, based on efficient 4D convolution, ensures efficiency\nand rapid convergence. To empirically validate the effectiveness of our\nproposed methodology, extensive experiments are conducted on three widely used\nbenchmarks: PF-PASCAL, PF-WILLOW, and SPair-71k. Our KBC method demonstrates a\nsubstantial performance improvement of 7.5\\% on the SPair-71K dataset,\nproviding compelling evidence of its efficacy.\n","authors":["Hailong Jin","Huiying Li"],"pdf_url":"https://arxiv.org/pdf/2404.02678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16926v5","updated":"2024-04-03T12:08:08Z","published":"2023-11-28T16:31:27Z","title":"LLaFS: When Large Language Models Meet Few-Shot Segmentation","summary":" This paper proposes LLaFS, the first attempt to leverage large language\nmodels (LLMs) in few-shot segmentation. In contrast to the conventional\nfew-shot segmentation methods that only rely on the limited and biased\ninformation from the annotated support images, LLaFS leverages the vast prior\nknowledge gained by LLM as an effective supplement and directly uses the LLM to\nsegment images in a few-shot manner. To enable the text-based LLM to handle\nimage-related tasks, we carefully design an input instruction that allows the\nLLM to produce segmentation results represented as polygons, and propose a\nregion-attribute table to simulate the human visual mechanism and provide\nmulti-modal guidance. We also synthesize pseudo samples and use curriculum\nlearning for pretraining to augment data and achieve better optimization. 
LLaFS\nachieves state-of-the-art results on multiple datasets, showing the potential\nof using LLMs for few-shot computer vision tasks.\n","authors":["Lanyun Zhu","Tianrun Chen","Deyi Ji","Jieping Ye","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2311.16926v5.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2404.02668v1","updated":"2024-04-03T12:06:01Z","published":"2024-04-03T12:06:01Z","title":"RS-Mamba for Large Remote Sensing Image Dense Prediction","summary":" The spatial resolution of remote sensing images is becoming increasingly\nhigher, posing challenges in handling large very-high-resolution (VHR) remote\nsensing images for dense prediction tasks. Models based on convolutional neural\nnetworks are limited in their ability to model global features of remote\nsensing images due to local convolution operations. Transformer based models,\ndespite their global modeling capabilities, face computational challenges with\nlarge VHR images due to their quadratic complexity. The common practice of\ncropping large images into smaller patches leads to a significant loss of\ncontextual information. To address these issues, we propose the Remote Sensing\nMamba (RSM) for dense prediction tasks in VHR remote sensing. RSM is designed\nto model global features of remote sensing images with linear complexity,\nenabling it to process large VHR images effectively. It employs an\nomnidirectional selective scan module to globally model the images in multiple\ndirections, capturing large spatial features from various directions.\nExperiments on semantic segmentation and change detection tasks across various\nobjects demonstrate the effectiveness of RSM. With simple model architecture\nand training approach, RSM achieves state-of-the-art performance on the dense\nprediction tasks of VHR remote sensing. The code for this work will be\navailable at https://github.com/walking-shadow/Official_Remote_Sensing_Mamba.\n","authors":["Sijie Zhao","Hao Chen","Xueliang Zhang","Pengfeng Xiao","Lei Bai","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2404.02668v1.pdf","comment":"13 pages,6 figures"},{"id":"http://arxiv.org/abs/2404.02659v1","updated":"2024-04-03T11:47:20Z","published":"2024-04-03T11:47:20Z","title":"A Satellite Band Selection Framework for Amazon Forest Deforestation\n Detection Task","summary":" The conservation of tropical forests is a topic of significant social and\necological relevance due to their crucial role in the global ecosystem.\nUnfortunately, deforestation and degradation impact millions of hectares\nannually, necessitating government or private initiatives for effective forest\nmonitoring. This study introduces a novel framework that employs the Univariate\nMarginal Distribution Algorithm (UMDA) to select spectral bands from Landsat-8\nsatellite, optimizing the representation of deforested areas. This selection\nguides a semantic segmentation architecture, DeepLabv3+, enhancing its\nperformance. Experimental results revealed several band compositions that\nachieved superior balanced accuracy compared to commonly adopted combinations\nfor deforestation detection, utilizing segment classification via a Support\nVector Machine (SVM). Moreover, the optimal band compositions identified by the\nUMDA-based approach improved the performance of the DeepLabv3+ architecture,\nsurpassing state-of-the-art approaches compared in this study. The observation\nthat a few selected bands outperform the total contradicts the data-driven\nparadigm prevalent in the deep learning field. 
Therefore, this suggests an\nexception to the conventional wisdom that 'more is always better'.\n","authors":["Eduardo Neto","Fabio A. Faria","Amanda A. S. de Oliveira","Álvaro L. Fazenda"],"pdf_url":"https://arxiv.org/pdf/2404.02659v1.pdf","comment":"9 pages, 4 figures, paper accepted for presentation at GECCO 2024"},{"id":"http://arxiv.org/abs/2304.08069v3","updated":"2024-04-03T11:46:48Z","published":"2023-04-17T08:30:02Z","title":"DETRs Beat YOLOs on Real-time Object Detection","summary":" The YOLO series has become the most popular framework for real-time object\ndetection due to its reasonable trade-off between speed and accuracy. However,\nwe observe that the speed and accuracy of YOLOs are negatively affected by the\nNMS. Recently, end-to-end Transformer-based detectors (DETRs) have provided an\nalternative to eliminating NMS. Nevertheless, the high computational cost\nlimits their practicality and hinders them from fully exploiting the advantage\nof excluding NMS. In this paper, we propose the Real-Time DEtection TRansformer\n(RT-DETR), the first real-time end-to-end object detector to our best knowledge\nthat addresses the above dilemma. We build RT-DETR in two steps, drawing on the\nadvanced DETR: first we focus on maintaining accuracy while improving speed,\nfollowed by maintaining speed while improving accuracy. Specifically, we design\nan efficient hybrid encoder to expeditiously process multi-scale features by\ndecoupling intra-scale interaction and cross-scale fusion to improve speed.\nThen, we propose the uncertainty-minimal query selection to provide\nhigh-quality initial queries to the decoder, thereby improving accuracy. In\naddition, RT-DETR supports flexible speed tuning by adjusting the number of\ndecoder layers to adapt to various scenarios without retraining. Our\nRT-DETR-R50 / R101 achieves 53.1% / 54.3% AP on COCO and 108 / 74 FPS on T4\nGPU, outperforming previously advanced YOLOs in both speed and accuracy. We\nalso develop scaled RT-DETRs that outperform the lighter YOLO detectors (S and\nM models). Furthermore, RT-DETR-R50 outperforms DINO-R50 by 2.2% AP in accuracy\nand about 21 times in FPS. After pre-training with Objects365, RT-DETR-R50 /\nR101 achieves 55.3% / 56.2% AP. The project page:\nhttps://zhao-yian.github.io/RTDETR.\n","authors":["Yian Zhao","Wenyu Lv","Shangliang Xu","Jinman Wei","Guanzhong Wang","Qingqing Dang","Yi Liu","Jie Chen"],"pdf_url":"https://arxiv.org/pdf/2304.08069v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02656v1","updated":"2024-04-03T11:37:03Z","published":"2024-04-03T11:37:03Z","title":"Non-negative Subspace Feature Representation for Few-shot Learning in\n Medical Imaging","summary":" Unlike typical visual scene recognition domains, in which massive datasets\nare accessible to deep neural networks, medical image interpretations are often\nobstructed by the paucity of data. In this paper, we investigate the\neffectiveness of data-based few-shot learning in medical imaging by exploring\ndifferent data attribute representations in a low-dimensional space. We\nintroduce different types of non-negative matrix factorization (NMF) in\nfew-shot learning, addressing the data scarcity issue in medical image\nclassification. 
Extensive empirical studies are conducted to\nvalidate the effectiveness of NMF, especially its supervised variants (e.g.,\ndiscriminative NMF, and supervised and constrained NMF with sparseness), and\nthe comparison with principal component analysis (PCA), i.e., the collaborative\nrepresentation-based dimensionality reduction technique derived from\neigenvectors. With 14 different datasets covering 11 distinct illness\ncategories, thorough experimental results and comparison with related\ntechniques demonstrate that NMF is a competitive alternative to PCA for\nfew-shot learning in medical imaging, and the supervised NMF algorithms are\nmore discriminative in the subspace with greater effectiveness. Furthermore, we\nshow that the part-based representation of NMF, especially its supervised\nvariants, is dramatically impactful in detecting lesion areas in medical\nimaging with limited samples.\n","authors":["Keqiang Fan","Xiaohao Cai","Mahesan Niranjan"],"pdf_url":"https://arxiv.org/pdf/2404.02656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11389v2","updated":"2024-04-03T11:33:14Z","published":"2023-03-20T18:49:39Z","title":"Creating Ensembles of Classifiers through UMDA for Aerial Scene\n Classification","summary":" Aerial scene classification, which aims to semantically label remote sensing\nimages in a set of predefined classes (e.g., agricultural, beach, and harbor),\nis a very challenging task in remote sensing due to high intra-class\nvariability and the different scales and orientations of the objects present in\nthe dataset images. In the remote sensing area, the use of CNN architectures as an\nalternative solution is also a reality for scene classification tasks.\nGenerally, these CNNs are used to perform the traditional image classification\ntask. However, another less used way to classify remote sensing images might be\nthe one that uses deep metric learning (DML) approaches. In this sense, this\nwork proposes to employ six DML approaches for aerial scene classification\ntasks, analysing their behaviour with four different pre-trained CNNs as well as\ncombining them through the use of an evolutionary computation algorithm (UMDA). In\nthe performed experiments, it is possible to observe that DML approaches can\nachieve the best classification results when compared to traditional\npre-trained CNNs for three well-known remote sensing aerial scene datasets. In\naddition, the UMDA algorithm proved to be a promising strategy to combine DML\napproaches when there is diversity among them, managing to improve\nclassification accuracy by at least 5.6% using almost 50% of the\navailable classifiers for the construction of the final ensemble of\nclassifiers.\n","authors":["Fabio A. Faria","Luiz H. Buris","Luis A. M. Pereira","Fábio A. M. Cappabianco"],"pdf_url":"https://arxiv.org/pdf/2303.11389v2.pdf","comment":"9 pages, 4 figures, accepted for presentation at the GECCO2024"},{"id":"http://arxiv.org/abs/2401.15204v4","updated":"2024-04-03T11:12:16Z","published":"2024-01-26T21:02:44Z","title":"LYT-Net: Lightweight YUV Transformer-based Network for Low-Light Image\n Enhancement","summary":" In recent years, deep learning-based solutions have proven successful in the\ndomain of image enhancement. 
This paper introduces LYT-Net, or Lightweight YUV\nTransformer-based Network, as a novel approach for low-light image enhancement.\nThe proposed architecture, distinct from conventional Retinex-based models,\nleverages the YUV color space's natural separation of luminance (Y) and\nchrominance (U and V) to simplify the intricate task of disentangling light and\ncolor information in images. By utilizing the strengths of transformers, known\nfor their capability to capture long-range dependencies, LYT-Net ensures a\ncomprehensive contextual understanding of the image while maintaining reduced\nmodel complexity. By employing a novel hybrid loss function, our proposed\nmethod achieves state-of-the-art results on low-light image enhancement\ndatasets, all while being considerably more compact than its counterparts. The\nsource code and pre-trained models are available at\nhttps://github.com/albrateanu/LYT-Net\n","authors":["A. Brateanu","R. Balmez","A. Avram","C. Orhei"],"pdf_url":"https://arxiv.org/pdf/2401.15204v4.pdf","comment":"10 pages, 6 figures, submitted to ICIP"},{"id":"http://arxiv.org/abs/2212.05315v3","updated":"2024-04-03T11:03:52Z","published":"2022-12-10T14:49:24Z","title":"Mind The Edge: Refining Depth Edges in Sparsely-Supervised Monocular\n Depth Estimation","summary":" Monocular Depth Estimation (MDE) is a fundamental problem in computer vision\nwith numerous applications. Recently, LIDAR-supervised methods have achieved\nremarkable per-pixel depth accuracy in outdoor scenes. However, significant\nerrors are typically found in the proximity of depth discontinuities, i.e.,\ndepth edges, which often hinder the performance of depth-dependent applications\nthat are sensitive to such inaccuracies, e.g., novel view synthesis and\naugmented reality. Since direct supervision for the location of depth edges is\ntypically unavailable in sparse LIDAR-based scenes, encouraging the MDE model\nto produce correct depth edges is not straightforward. To the best of our\nknowledge this paper is the first attempt to address the depth edges issue for\nLIDAR-supervised scenes. In this work we propose to learn to detect the\nlocation of depth edges from densely-supervised synthetic data, and use it to\ngenerate supervision for the depth edges in the MDE training. To quantitatively\nevaluate our approach, and due to the lack of depth edges GT in LIDAR-based\nscenes, we manually annotated subsets of the KITTI and the DDAD datasets with\ndepth edges ground truth. We demonstrate significant gains in the accuracy of\nthe depth edges with comparable per-pixel depth accuracy on several challenging\ndatasets. Code and datasets are available at\n\\url{https://github.com/liortalker/MindTheEdge}.\n","authors":["Lior Talker","Aviad Cohen","Erez Yosef","Alexandra Dana","Michael Dinerstein"],"pdf_url":"https://arxiv.org/pdf/2212.05315v3.pdf","comment":"Appears in CVPR24'"},{"id":"http://arxiv.org/abs/2404.02638v1","updated":"2024-04-03T10:57:47Z","published":"2024-04-03T10:57:47Z","title":"SG-BEV: Satellite-Guided BEV Fusion for Cross-View Semantic Segmentation","summary":" This paper aims at achieving fine-grained building attribute segmentation in\na cross-view scenario, i.e., using satellite and street-view image pairs. The\nmain challenge lies in overcoming the significant perspective differences\nbetween street views and satellite views. In this work, we introduce SG-BEV, a\nnovel approach for satellite-guided BEV fusion for cross-view semantic\nsegmentation. 
To overcome the limitations of existing cross-view projection\nmethods in capturing the complete building facade features, we innovatively\nincorporate the Bird's Eye View (BEV) method to establish a spatially explicit\nmapping of street-view features. Moreover, we fully leverage the advantages of\nmultiple perspectives by introducing a novel satellite-guided reprojection\nmodule, optimizing the uneven feature distribution issues associated with\ntraditional BEV methods. Our method demonstrates significant improvements on\nfour cross-view datasets collected from multiple cities, including New York,\nSan Francisco, and Boston. On average across these datasets, our method\nachieves an increase in mIOU of 10.13% and 5.21% compared with the\nstate-of-the-art satellite-based and cross-view methods. The code and datasets\nof this work will be released at https://github.com/yejy53/SG-BEV.\n","authors":["Junyan Ye","Qiyan Luo","Jinhua Yu","Huaping Zhong","Zhimeng Zheng","Conghui He","Weijia Li"],"pdf_url":"https://arxiv.org/pdf/2404.02638v1.pdf","comment":"accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2311.10224v2","updated":"2024-04-03T10:57:10Z","published":"2023-11-16T22:31:05Z","title":"CV-Attention UNet: Attention-based UNet for 3D Cerebrovascular\n Segmentation of Enhanced TOF-MRA Images","summary":" Due to the lack of automated methods to diagnose cerebrovascular disease,\ntime-of-flight magnetic resonance angiography (TOF-MRA) is assessed visually,\nmaking it time-consuming. The commonly used encoder-decoder architectures for\ncerebrovascular segmentation utilize redundant features, eventually leading to\nthe extraction of low-level features multiple times. Additionally,\nconvolutional neural networks (CNNs) suffer from performance degradation when\nthe batch size is small, and deeper networks experience the vanishing gradient\nproblem. Methods: In this paper, we attempt to solve these limitations and\npropose the 3D cerebrovascular attention UNet method, named CV-AttentionUNet,\nfor precise extraction of brain vessel images. We proposed a sequence of\npreprocessing techniques followed by a deeply supervised UNet to improve the\naccuracy of segmentation of the brain vessels leading to a stroke. To combine\nthe low and high semantics, we applied the attention mechanism. This mechanism\nfocuses on relevant associations and neglects irrelevant anatomical\ninformation. Furthermore, the inclusion of deep supervision incorporates\ndifferent levels of features that prove to be beneficial for network\nconvergence. Results: We demonstrate the efficiency of the proposed method by\ncross-validating with an unlabeled dataset, which was further labeled by us. We\nbelieve that the novelty of this algorithm lies in its ability to perform well\non both labeled and unlabeled data with image processing-based enhancement. The\nresults indicate that our method performed better than the existing\nstate-of-the-art methods on the TubeTK dataset. 
Conclusion: The proposed method\nwill help in accurate segmentation of cerebrovascular structure leading to\nstroke\n","authors":["Syed Farhan Abbas","Nguyen Thanh Duc","Yoonguu Song","Kyungwon Kim","Ekta Srivastava","Boreom Lee"],"pdf_url":"https://arxiv.org/pdf/2311.10224v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02634v1","updated":"2024-04-03T10:44:06Z","published":"2024-04-03T10:44:06Z","title":"3DStyleGLIP: Part-Tailored Text-Guided 3D Neural Stylization","summary":" 3D stylization, which entails the application of specific styles to\nthree-dimensional objects, holds significant commercial potential as it enables\nthe creation of diverse 3D objects with distinct moods and styles, tailored to\nspecific demands of different scenes. With recent advancements in text-driven\nmethods and artificial intelligence, the stylization process is increasingly\nintuitive and automated, thereby diminishing the reliance on manual labor and\nexpertise. However, existing methods have predominantly focused on holistic\nstylization, thereby leaving the application of styles to individual components\nof a 3D object unexplored. In response, we introduce 3DStyleGLIP, a novel\nframework specifically designed for text-driven, part-tailored 3D stylization.\nGiven a 3D mesh and a text prompt, 3DStyleGLIP leverages the vision-language\nembedding space of the Grounded Language-Image Pre-training (GLIP) model to\nlocalize the individual parts of the 3D mesh and modify their colors and local\ngeometries to align them with the desired styles specified in the text prompt.\n3DStyleGLIP is effectively trained for 3D stylization tasks through a\npart-level style loss working in GLIP's embedding space, supplemented by two\ncomplementary learning techniques. Extensive experimental validation confirms\nthat our method achieves significant part-wise stylization capabilities,\ndemonstrating promising potential in advancing the field of 3D stylization.\n","authors":["SeungJeh Chung","JooHyun Park","Hyewon Kan","HyeongYeop Kang"],"pdf_url":"https://arxiv.org/pdf/2404.02634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.00553v4","updated":"2024-04-03T10:36:31Z","published":"2023-04-02T15:04:43Z","title":"From Isolated Islands to Pangea: Unifying Semantic Space for Human\n Action Understanding","summary":" Action understanding has attracted long-term attention. It can be formed as\nthe mapping from the physical space to the semantic space. Typically,\nresearchers built datasets according to idiosyncratic choices to define classes\nand push the envelope of benchmarks respectively. Datasets are incompatible\nwith each other like \"Isolated Islands\" due to semantic gaps and various class\ngranularities, e.g., do housework in dataset A and wash plate in dataset B. We\nargue that we need a more principled semantic space to concentrate the\ncommunity efforts and use all datasets together to pursue generalizable action\nlearning. To this end, we design a structured action semantic space given verb\ntaxonomy hierarchy and covering massive actions. By aligning the classes of\nprevious datasets to our semantic space, we gather (image/video/skeleton/MoCap)\ndatasets into a unified database in a unified label system, i.e., bridging\n\"isolated islands\" into a \"Pangea\". Accordingly, we propose a novel model\nmapping from the physical space to semantic space to fully use Pangea. In\nextensive experiments, our new system shows significant superiority, especially\nin transfer learning. 
Our code and data will be made public at\nhttps://mvig-rhos.com/pangea.\n","authors":["Yong-Lu Li","Xiaoqian Wu","Xinpeng Liu","Zehao Wang","Yiming Dou","Yikun Ji","Junyi Zhang","Yixing Li","Jingru Tan","Xudong Lu","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2304.00553v4.pdf","comment":"CVPR 2024, Project Webpage: https://mvig-rhos.com/pangea"},{"id":"http://arxiv.org/abs/2404.02624v1","updated":"2024-04-03T10:25:45Z","published":"2024-04-03T10:25:45Z","title":"Multi-Scale Spatial-Temporal Self-Attention Graph Convolutional Networks\n for Skeleton-based Action Recognition","summary":" Skeleton-based gesture recognition methods have achieved high success using\nGraph Convolutional Network (GCN). In addition, context-dependent adaptive\ntopology as a neighborhood vertex information and attention mechanism leverages\na model to better represent actions. In this paper, we propose self-attention\nGCN hybrid model, Multi-Scale Spatial-Temporal self-attention (MSST)-GCN to\neffectively improve modeling ability to achieve state-of-the-art results on\nseveral datasets. We utilize spatial self-attention module with adaptive\ntopology to understand intra-frame interactions within a frame among different\nbody parts, and temporal self-attention module to examine correlations between\nframes of a node. These two are followed by multi-scale convolution network\nwith dilations, which not only captures the long-range temporal dependencies of\njoints but also the long-range spatial dependencies (i.e., long-distance\ndependencies) of node temporal behaviors. They are combined into high-level\nspatial-temporal representations and output the predicted action with the\nsoftmax classifier.\n","authors":["Ikuo Nakamura"],"pdf_url":"https://arxiv.org/pdf/2404.02624v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2306.05401v3","updated":"2024-04-03T10:16:22Z","published":"2023-06-08T17:52:34Z","title":"RDumb: A simple approach that questions our progress in continual\n test-time adaptation","summary":" Test-Time Adaptation (TTA) allows to update pre-trained models to changing\ndata distributions at deployment time. While early work tested these algorithms\nfor individual fixed distribution shifts, recent work proposed and applied\nmethods for continual adaptation over long timescales. To examine the reported\nprogress in the field, we propose the Continually Changing Corruptions (CCC)\nbenchmark to measure asymptotic performance of TTA techniques. We find that\neventually all but one state-of-the-art methods collapse and perform worse than\na non-adapting model, including models specifically proposed to be robust to\nperformance collapse. In addition, we introduce a simple baseline, \"RDumb\",\nthat periodically resets the model to its pretrained state. RDumb performs\nbetter or on par with the previously proposed state-of-the-art in all\nconsidered benchmarks. 
Our results show that previous TTA approaches are\nneither effective at regularizing adaptation to avoid collapse nor able to\noutperform a simplistic resetting strategy.\n","authors":["Ori Press","Steffen Schneider","Matthias Kümmerer","Matthias Bethge"],"pdf_url":"https://arxiv.org/pdf/2306.05401v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02618v1","updated":"2024-04-03T10:11:22Z","published":"2024-04-03T10:11:22Z","title":"Diffexplainer: Towards Cross-modal Global Explanations with Diffusion\n Models","summary":" We present DiffExplainer, a novel framework that, leveraging language-vision\nmodels, enables multimodal global explainability. DiffExplainer employs\ndiffusion models conditioned on optimized text prompts, synthesizing images\nthat maximize class outputs and hidden features of a classifier, thus providing\na visual tool for explaining decisions. Moreover, the analysis of generated\nvisual descriptions allows for automatic identification of biases and spurious\nfeatures, as opposed to traditional methods that often rely on manual\nintervention. The cross-modal transferability of language-vision models also\nenables the possibility to describe decisions in a more human-interpretable\nway, i.e., through text. We conduct comprehensive experiments, which include an\nextensive user study, demonstrating the effectiveness of DiffExplainer on 1)\nthe generation of high-quality images explaining model decisions, surpassing\nexisting activation maximization methods, and 2) the automated identification\nof biases and spurious features.\n","authors":["Matteo Pennisi","Giovanni Bellitto","Simone Palazzo","Mubarak Shah","Concetto Spampinato"],"pdf_url":"https://arxiv.org/pdf/2404.02618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02617v1","updated":"2024-04-03T10:08:55Z","published":"2024-04-03T10:08:55Z","title":"Neural Radiance Fields with Torch Units","summary":" Neural Radiance Fields (NeRF) give rise to learning-based 3D reconstruction\nmethods widely used in industrial applications. Although prevalent methods\nachieve considerable improvements in small-scale scenes, accomplishing\nreconstruction in complex and large-scale scenes is still challenging. First,\nthe background in complex scenes shows a large variance among different views.\nSecond, the current inference pattern, $i.e.$, a pixel only relies on an\nindividual camera ray, fails to capture contextual information. To solve these\nproblems, we propose to enlarge the ray perception field and build up the\nsample point interactions. In this paper, we design a novel inference pattern\nthat encourages a single camera ray to possess more contextual information, and\nmodels the relationship among sample points on each camera ray. To hold\ncontextual information, a camera ray in our proposed method can render a patch\nof pixels simultaneously. Moreover, we replace the MLP in neural radiance field\nmodels with distance-aware convolutions to enhance the feature propagation\namong sample points from the same camera ray. To summarize, as a torchlight, a\nray in our proposed method renders a patch of the image. Thus, we call\nthe proposed method Torch-NeRF. 
Extensive experiments on KITTI-360 and LLFF\nshow that the Torch-NeRF exhibits excellent performance.\n","authors":["Bingnan Ni","Huanyu Wang","Dongfeng Bai","Minghe Weng","Dexin Qi","Weichao Qiu","Bingbing Liu"],"pdf_url":"https://arxiv.org/pdf/2404.02617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02614v1","updated":"2024-04-03T10:01:23Z","published":"2024-04-03T10:01:23Z","title":"Vestibular schwannoma growth_prediction from longitudinal MRI by time\n conditioned neural fields","summary":" Vestibular schwannomas (VS) are benign tumors that are generally managed by\nactive surveillance with MRI examination. To further assist clinical\ndecision-making and avoid overtreatment, an accurate prediction of tumor growth\nbased on longitudinal imaging is highly desirable. In this paper, we introduce\nDeepGrowth, a deep learning method that incorporates neural fields and\nrecurrent neural networks for prospective tumor growth prediction. In the\nproposed method, each tumor is represented as a signed distance function (SDF)\nconditioned on a low-dimensional latent code. Unlike previous studies that\nperform tumor shape prediction directly in the image space, we predict the\nlatent codes instead and then reconstruct future shapes from it. To deal with\nirregular time intervals, we introduce a time-conditioned recurrent module\nbased on a ConvLSTM and a novel temporal encoding strategy, which enables the\nproposed model to output varying tumor shapes over time. The experiments on an\nin-house longitudinal VS dataset showed that the proposed model significantly\nimproved the performance ($\\ge 1.6\\%$ Dice score and $\\ge0.20$ mm 95\\%\nHausdorff distance), in particular for top 20\\% tumors that grow or shrink the\nmost ($\\ge 4.6\\%$ Dice score and $\\ge 0.73$ mm 95\\% Hausdorff distance). Our\ncode is available at ~\\burl{https://github.com/cyjdswx/DeepGrowth}\n","authors":["Yunjie Chen","Jelmer M. Wolterink","Olaf M. Neve","Stephan R. Romeijn","Berit M. Verbist","Erik F. Hensen","Qian Tao","Marius Staring"],"pdf_url":"https://arxiv.org/pdf/2404.02614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07739v2","updated":"2024-04-03T09:50:54Z","published":"2024-02-12T15:57:31Z","title":"Task-conditioned adaptation of visual features in multi-task policy\n learning","summary":" Successfully addressing a wide variety of tasks is a core ability of\nautonomous agents, requiring flexibly adapting the underlying decision-making\nstrategies and, as we argue in this work, also adapting the perception modules.\nAn analogical argument would be the human visual system, which uses top-down\nsignals to focus attention determined by the current task. Similarly, we adapt\npre-trained large vision models conditioned on specific downstream tasks in the\ncontext of multi-task policy learning. We introduce task-conditioned adapters\nthat do not require finetuning any pre-trained weights, combined with a single\npolicy trained with behavior cloning and capable of addressing multiple tasks.\nWe condition the visual adapters on task embeddings, which can be selected at\ninference if the task is known, or alternatively inferred from a set of example\ndemonstrations. To this end, we propose a new optimization-based estimator. We\nevaluate the method on a wide variety of tasks from the CortexBench benchmark\nand show that, compared to existing work, it can be addressed with a single\npolicy. 
In particular, we demonstrate that adapting visual features is a key\ndesign choice and that the method generalizes to unseen tasks given a few\ndemonstrations.\n","authors":["Pierre Marza","Laetitia Matignon","Olivier Simonin","Christian Wolf"],"pdf_url":"https://arxiv.org/pdf/2402.07739v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00035v2","updated":"2024-04-03T09:28:43Z","published":"2024-01-08T12:19:46Z","title":"Robustness Assessment of a Runway Object Classifier for Safe Aircraft\n Taxiing","summary":" As deep neural networks (DNNs) are becoming the prominent solution for many\ncomputational problems, the aviation industry seeks to explore their potential\nin alleviating pilot workload and in improving operational safety. However, the\nuse of DNNs in this type of safety-critical applications requires a thorough\ncertification process. This need can be addressed through formal verification,\nwhich provides rigorous assurances -- e.g.,~by proving the absence of certain\nmispredictions. In this case-study paper, we demonstrate this process using an\nimage-classifier DNN currently under development at Airbus and intended for use\nduring the aircraft taxiing phase. We use formal methods to assess this DNN's\nrobustness to three common image perturbation types: noise, brightness and\ncontrast, and some of their combinations. This process entails multiple\ninvocations of the underlying verifier, which might be computationally\nexpensive; and we therefore propose a method that leverages the monotonicity of\nthese robustness properties, as well as the results of past verification\nqueries, in order to reduce the overall number of verification queries required\nby nearly 60%. Our results provide an indication of the level of robustness\nachieved by the DNN classifier under study, and indicate that it is\nconsiderably more vulnerable to noise than to brightness or contrast\nperturbations.\n","authors":["Yizhak Elboher","Raya Elsaleh","Omri Isac","Mélanie Ducoffe","Audrey Galametz","Guillaume Povéda","Ryma Boumazouza","Noémie Cohen","Guy Katz"],"pdf_url":"https://arxiv.org/pdf/2402.00035v2.pdf","comment":"This is a preprint version of the paper in the proceedings of 43rd\n Digital Avionics Systems Conference (DASC)"},{"id":"http://arxiv.org/abs/2403.05839v2","updated":"2024-04-03T09:25:34Z","published":"2024-03-09T08:49:50Z","title":"Long-term Frame-Event Visual Tracking: Benchmark Dataset and Baseline","summary":" Current event-/frame-event based trackers undergo evaluation on short-term\ntracking datasets, however, the tracking of real-world scenarios involves\nlong-term tracking, and the performance of existing tracking algorithms in\nthese scenarios remains unclear. In this paper, we first propose a new\nlong-term and large-scale frame-event single object tracking dataset, termed\nFELT. It contains 742 videos and 1,594,474 RGB frames and event stream pairs\nand has become the largest frame-event tracking dataset to date. We re-train\nand evaluate 15 baseline trackers on our dataset for future works to compare.\nMore importantly, we find that the RGB frames and event streams are naturally\nincomplete due to the influence of challenging factors and spatially sparse\nevent flow. 
In response to this, we propose a novel associative memory\nTransformer network as a unified backbone by introducing modern Hopfield layers\ninto multi-head self-attention blocks to fuse both RGB and event data.\nExtensive experiments on RGB-Event (FELT), RGB-Thermal (RGBT234, LasHeR), and\nRGB-Depth (DepthTrack) datasets fully validated the effectiveness of our model.\nThe dataset and source code can be found at\n\\url{https://github.com/Event-AHU/FELT_SOT_Benchmark}.\n","authors":["Xiao Wang","Ju Huang","Shiao Wang","Chuanming Tang","Bo Jiang","Yonghong Tian","Jin Tang","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2403.05839v2.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2401.04647v2","updated":"2024-04-03T09:25:08Z","published":"2024-01-09T16:16:16Z","title":"Advancing Ante-Hoc Explainable Models through Generative Adversarial\n Networks","summary":" This paper presents a novel concept learning framework for enhancing model\ninterpretability and performance in visual classification tasks. Our approach\nappends an unsupervised explanation generator to the primary classifier network\nand makes use of adversarial training. During training, the explanation module\nis optimized to extract visual concepts from the classifier's latent\nrepresentations, while the GAN-based module aims to discriminate images\ngenerated from concepts, from true images. This joint training scheme enables\nthe model to implicitly align its internally learned concepts with\nhuman-interpretable visual properties. Comprehensive experiments demonstrate\nthe robustness of our approach, while producing coherent concept activations.\nWe analyse the learned concepts, showing their semantic concordance with object\nparts and visual attributes. We also study how perturbations in the adversarial\ntraining protocol impact both classification and concept acquisition. In\nsummary, this work presents a significant step towards building inherently\ninterpretable deep vision models with task-aligned concept representations - a\nkey enabler for developing trustworthy AI for real-world perception tasks.\n","authors":["Tanmay Garg","Deepika Vemuri","Vineeth N Balasubramanian"],"pdf_url":"https://arxiv.org/pdf/2401.04647v2.pdf","comment":"Paper accepted in Human-Centric Representation Learning workshop at\n AAAI 2024 (https://hcrl-workshop.github.io/2024/). Paper accepted and\n presented at Deployable AI Workshop at AAAI-2024\n (https://sites.google.com/view/dai-2024/home)"},{"id":"http://arxiv.org/abs/2404.01889v2","updated":"2024-04-03T09:18:09Z","published":"2024-04-02T12:28:40Z","title":"RAVE: Residual Vector Embedding for CLIP-Guided Backlit Image\n Enhancement","summary":" In this paper we propose a novel modification of Contrastive Language-Image\nPre-Training (CLIP) guidance for the task of unsupervised backlit image\nenhancement. Our work builds on the state-of-the-art CLIP-LIT approach, which\nlearns a prompt pair by constraining the text-image similarity between a prompt\n(negative/positive sample) and a corresponding image (backlit image/well-lit\nimage) in the CLIP embedding space. Learned prompts then guide an image\nenhancement network. Based on the CLIP-LIT framework, we propose two novel\nmethods for CLIP guidance. First, we show that instead of tuning prompts in the\nspace of text embeddings, it is possible to directly tune their embeddings in\nthe latent space without any loss in quality. This accelerates training and\npotentially enables the use of additional encoders that do not have a text\nencoder. 
Second, we propose a novel approach that does not require any prompt\ntuning. Instead, based on CLIP embeddings of backlit and well-lit images from\ntraining data, we compute the residual vector in the embedding space as a\nsimple difference between the mean embeddings of the well-lit and backlit\nimages. This vector then guides the enhancement network during training,\npushing a backlit image towards the space of well-lit images. This approach\nfurther dramatically reduces training time, stabilizes training and produces\nhigh quality enhanced images without artifacts, both in supervised and\nunsupervised training regimes. Additionally, we show that residual vectors can\nbe interpreted, revealing biases in training data, and thereby enabling\npotential bias correction.\n","authors":["Tatiana Gaintseva","Martin Benning","Gregory Slabaugh"],"pdf_url":"https://arxiv.org/pdf/2404.01889v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02585v1","updated":"2024-04-03T09:09:42Z","published":"2024-04-03T09:09:42Z","title":"Unsegment Anything by Simulating Deformation","summary":" Foundation segmentation models, while powerful, pose a significant risk: they\nenable users to effortlessly extract any objects from any digital content with\na single click, potentially leading to copyright infringement or malicious\nmisuse. To mitigate this risk, we introduce a new task \"Anything Unsegmentable\"\nto grant any image \"the right to be unsegmented\". The ambitious pursuit of the\ntask is to achieve highly transferable adversarial attacks against all\nprompt-based segmentation models, regardless of model parameterizations and\nprompts. We highlight the non-transferable and heterogeneous nature of\nprompt-specific adversarial noises. Our approach focuses on disrupting image\nencoder features to achieve prompt-agnostic attacks. Intriguingly, targeted\nfeature attacks exhibit better transferability compared to untargeted ones,\nsuggesting the optimal update direction aligns with the image manifold. Based\non the observations, we design a novel attack named Unsegment Anything by\nSimulating Deformation (UAD). Our attack optimizes a differentiable deformation\nfunction to create a target deformed image, which alters structural information\nwhile preserving achievable feature distance by adversarial example. Extensive\nexperiments verify the effectiveness of our approach, compromising a variety of\npromptable segmentation models with different architectures and prompt\ninterfaces. We release the code at\nhttps://github.com/jiahaolu97/anything-unsegmentable.\n","authors":["Jiahao Lu","Xingyi Yang","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02585v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2306.13674v3","updated":"2024-04-03T08:56:01Z","published":"2023-06-19T09:47:33Z","title":"MeciFace: Mechanomyography and Inertial Fusion-based Glasses for Edge\n Real-Time Recognition of Facial and Eating Activities","summary":" The increasing prevalence of stress-related eating behaviors and their impact\non overall health highlights the importance of effective and ubiquitous\nmonitoring systems. In this paper, we present MeciFace, an innovative wearable\ntechnology designed to monitor facial expressions and eating activities in\nreal-time on-the-edge (RTE). MeciFace aims to provide a low-power,\nprivacy-conscious, and highly accurate tool for promoting healthy eating\nbehaviors and stress management. 
We employ lightweight convolutional neural\nnetworks as backbone models for facial expression and eating monitoring\nscenarios. The MeciFace system ensures efficient data processing with a tiny\nmemory footprint, ranging from 11KB to 19 KB. During RTE evaluation, the system\nachieves an F1-score of < 86% for facial expression recognition and 94% for\neating/drinking monitoring, for the RTE of unseen users (user-independent\ncase).\n","authors":["Hymalai Bello","Sungho Suh","Bo Zhou","Paul Lukowicz"],"pdf_url":"https://arxiv.org/pdf/2306.13674v3.pdf","comment":"Submitted to IEEE Transactions on Consumer Electronics"},{"id":"http://arxiv.org/abs/2404.02580v1","updated":"2024-04-03T08:55:44Z","published":"2024-04-03T08:55:44Z","title":"Active learning for efficient annotation in precision agriculture: a\n use-case on crop-weed semantic segmentation","summary":" Optimizing deep learning models requires large amounts of annotated images, a\nprocess that is both time-intensive and costly. Especially for semantic\nsegmentation models in which every pixel must be annotated. A potential\nstrategy to mitigate annotation effort is active learning. Active learning\nfacilitates the identification and selection of the most informative images\nfrom a large unlabelled pool. The underlying premise is that these selected\nimages can improve the model's performance faster than random selection to\nreduce annotation effort. While active learning has demonstrated promising\nresults on benchmark datasets like Cityscapes, its performance in the\nagricultural domain remains largely unexplored. This study addresses this\nresearch gap by conducting a comparative study of three active learning-based\nacquisition functions: Bayesian Active Learning by Disagreement (BALD),\nstochastic-based BALD (PowerBALD), and Random. The acquisition functions were\ntested on two agricultural datasets: Sugarbeet and Corn-Weed, both containing\nthree semantic classes: background, crop and weed. Our results indicated that\nactive learning, especially PowerBALD, yields a higher performance than Random\nsampling on both datasets. But due to the relatively large standard deviations,\nthe differences observed were minimal; this was partly caused by high image\nredundancy and imbalanced classes. Specifically, more than 89\\% of the pixels\nbelonged to the background class on both datasets. The absence of significant\nresults on both datasets indicates that further research is required for\napplying active learning on agricultural datasets, especially if they contain a\nhigh-class imbalance and redundant images. Recommendations and insights are\nprovided in this paper to potentially resolve such issues.\n","authors":["Bart M. van Marrewijk","Charbel Dandjinou","Dan Jeric Arcega Rustia","Nicolas Franco Gonzalez","Boubacar Diallo","Jérôme Dias","Paul Melki","Pieter M. Blok"],"pdf_url":"https://arxiv.org/pdf/2404.02580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02573v1","updated":"2024-04-03T08:47:40Z","published":"2024-04-03T08:47:40Z","title":"Knowledge Distillation with Multi-granularity Mixture of Priors for\n Image Super-Resolution","summary":" Knowledge distillation (KD) is a promising yet challenging model compression\ntechnique that transfers rich learning representations from a well-performing\nbut cumbersome teacher model to a compact student model. Previous methods for\nimage super-resolution (SR) mostly compare the feature maps directly or after\nstandardizing the dimensions with basic algebraic operations (e.g. 
average,\ndot-product). However, the intrinsic semantic differences among feature maps\nare overlooked, which are caused by the disparate expressive capacity between\nthe networks. This work presents MiPKD, a multi-granularity mixture of prior KD\nframework, to facilitate efficient SR model through the feature mixture in a\nunified latent space and stochastic network block mixture. Extensive\nexperiments demonstrate the effectiveness of the proposed MiPKD method.\n","authors":["Simiao Li","Yun Zhang","Wei Li","Hanting Chen","Wenjia Wang","Bingyi Jing","Shaohui Lin","Jie Hu"],"pdf_url":"https://arxiv.org/pdf/2404.02573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02562v1","updated":"2024-04-03T08:33:08Z","published":"2024-04-03T08:33:08Z","title":"Representation Alignment Contrastive Regularization for Multi-Object\n Tracking","summary":" Achieving high-performance in multi-object tracking algorithms heavily relies\non modeling spatio-temporal relationships during the data association stage.\nMainstream approaches encompass rule-based and deep learning-based methods for\nspatio-temporal relationship modeling. While the former relies on physical\nmotion laws, offering wider applicability but yielding suboptimal results for\ncomplex object movements, the latter, though achieving high-performance, lacks\ninterpretability and involves complex module designs. This work aims to\nsimplify deep learning-based spatio-temporal relationship models and introduce\ninterpretability into features for data association. Specifically, a\nlightweight single-layer transformer encoder is utilized to model\nspatio-temporal relationships. To make features more interpretative, two\ncontrastive regularization losses based on representation alignment are\nproposed, derived from spatio-temporal consistency rules. By applying weighted\nsummation to affinity matrices, the aligned features can seamlessly integrate\ninto the data association stage of the original tracking workflow. Experimental\nresults showcase that our model enhances the majority of existing tracking\nnetworks' performance without excessive complexity, with minimal increase in\ntraining overhead and nearly negligible computational and storage costs.\n","authors":["Shujie Chen","Zhonglin Liu","Jianfeng Dong","Di Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.02562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02558v1","updated":"2024-04-03T08:27:24Z","published":"2024-04-03T08:27:24Z","title":"Regional biases in image geolocation estimation: a case study with the\n SenseCity Africa dataset","summary":" Advances in Artificial Intelligence are challenged by the biases rooted in\nthe datasets used to train the models. In image geolocation estimation, models\nare mostly trained using data from specific geographic regions, notably the\nWestern world, and as a result, they may struggle to comprehend the\ncomplexities of underrepresented regions. To assess this issue, we apply a\nstate-of-the-art image geolocation estimation model (ISNs) to a crowd-sourced\ndataset of geolocated images from the African continent (SCA100), and then\nexplore the regional and socioeconomic biases underlying the model's\npredictions. Our findings show that the ISNs model tends to over-predict image\nlocations in high-income countries of the Western world, which is consistent\nwith the geographic distribution of its training data, i.e., the IM2GPS3k\ndataset. 
Accordingly, when compared to the IM2GPS3k benchmark, the accuracy of\nthe ISNs model notably decreases at all scales. Additionally, we cluster images\nof the SCA100 dataset based on how accurately they are predicted by the ISNs\nmodel and show the model's difficulties in correctly predicting the locations\nof images in low income regions, especially in Sub-Saharan Africa. Therefore,\nour results suggest that using IM2GPS3k as a training set and benchmark for\nimage geolocation estimation and other computer vision models overlooks its\npotential application in the African context.\n","authors":["Ximena Salgado Uribe","Martí Bosch","Jérôme Chenal"],"pdf_url":"https://arxiv.org/pdf/2404.02558v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.01272v2","updated":"2024-04-03T08:27:01Z","published":"2024-04-01T17:48:15Z","title":"Language Guided Domain Generalized Medical Image Segmentation","summary":" Single source domain generalization (SDG) holds promise for more reliable and\nconsistent image segmentation across real-world clinical settings particularly\nin the medical domain, where data privacy and acquisition cost constraints\noften limit the availability of diverse datasets. Depending solely on visual\nfeatures hampers the model's capacity to adapt effectively to various domains,\nprimarily because of the presence of spurious correlations and domain-specific\ncharacteristics embedded within the image features. Incorporating text features\nalongside visual features is a potential solution to enhance the model's\nunderstanding of the data, as it goes beyond pixel-level information to provide\nvaluable context. Textual cues describing the anatomical structures, their\nappearances, and variations across various imaging modalities can guide the\nmodel in domain adaptation, ultimately contributing to more robust and\nconsistent segmentation. In this paper, we propose an approach that explicitly\nleverages textual information by incorporating a contrastive learning mechanism\nguided by the text encoder features to learn a more robust feature\nrepresentation. We assess the effectiveness of our text-guided contrastive\nfeature alignment technique in various scenarios, including cross-modality,\ncross-sequence, and cross-site settings for different segmentation tasks. Our\napproach achieves favorable performance against existing methods in literature.\nOur code and model weights are available at\nhttps://github.com/ShahinaKK/LG_SDG.git.\n","authors":["Shahina Kunhimon","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2404.01272v2.pdf","comment":"Accepted at ISBI2024"},{"id":"http://arxiv.org/abs/2401.13627v2","updated":"2024-04-03T08:12:08Z","published":"2024-01-24T17:58:07Z","title":"Scaling Up to Excellence: Practicing Model Scaling for Photo-Realistic\n Image Restoration In the Wild","summary":" We introduce SUPIR (Scaling-UP Image Restoration), a groundbreaking image\nrestoration method that harnesses generative prior and the power of model\nscaling up. Leveraging multi-modal techniques and advanced generative prior,\nSUPIR marks a significant advance in intelligent and realistic image\nrestoration. As a pivotal catalyst within SUPIR, model scaling dramatically\nenhances its capabilities and demonstrates new potential for image restoration.\nWe collect a dataset comprising 20 million high-resolution, high-quality images\nfor model training, each enriched with descriptive text annotations. 
SUPIR\nprovides the capability to restore images guided by textual prompts, broadening\nits application scope and potential. Moreover, we introduce negative-quality\nprompts to further improve perceptual quality. We also develop a\nrestoration-guided sampling method to suppress the fidelity issue encountered\nin generative-based restoration. Experiments demonstrate SUPIR's exceptional\nrestoration effects and its novel capacity to manipulate restoration through\ntextual prompts.\n","authors":["Fanghua Yu","Jinjin Gu","Zheyuan Li","Jinfan Hu","Xiangtao Kong","Xintao Wang","Jingwen He","Yu Qiao","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2401.13627v2.pdf","comment":"This paper has been accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2307.13981v2","updated":"2024-04-03T08:04:55Z","published":"2023-07-26T06:38:33Z","title":"Analysis of Video Quality Datasets via Design of Minimalistic Video\n Quality Models","summary":" Blind video quality assessment (BVQA) plays an indispensable role in\nmonitoring and improving the end-users' viewing experience in various\nreal-world video-enabled media applications. As an experimental field, the\nimprovements of BVQA models have been measured primarily on a few human-rated\nVQA datasets. Thus, it is crucial to gain a better understanding of existing\nVQA datasets in order to properly evaluate the current progress in BVQA.\nTowards this goal, we conduct a first-of-its-kind computational analysis of VQA\ndatasets via designing minimalistic BVQA models. By minimalistic, we restrict\nour family of BVQA models to build only upon basic blocks: a video preprocessor\n(for aggressive spatiotemporal downsampling), a spatial quality analyzer, an\noptional temporal quality analyzer, and a quality regressor, all with the\nsimplest possible instantiations. By comparing the quality prediction\nperformance of different model variants on eight VQA datasets with realistic\ndistortions, we find that nearly all datasets suffer from the easy dataset\nproblem of varying severity, some of which even admit blind image quality\nassessment (BIQA) solutions. We additionally justify our claims by contrasting\nour model generalizability on these VQA datasets, and by ablating a dizzying\nset of BVQA design choices related to the basic building blocks. Our results\ncast doubt on the current progress in BVQA, and meanwhile shed light on good\npractices of constructing next-generation VQA datasets and models.\n","authors":["Wei Sun","Wen Wen","Xiongkuo Min","Long Lan","Guangtao Zhai","Kede Ma"],"pdf_url":"https://arxiv.org/pdf/2307.13981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02544v1","updated":"2024-04-03T08:01:00Z","published":"2024-04-03T08:01:00Z","title":"Semi-Supervised Unconstrained Head Pose Estimation in the Wild","summary":" Existing head pose estimation datasets are either composed of numerous\nsamples by non-realistic synthesis or lab collection, or limited images by\nlabor-intensive annotating. This makes deep supervised learning based solutions\ncompromised due to the reliance on generous labeled data. To alleviate it, we\npropose the first semi-supervised unconstrained head pose estimation (SemiUHPE)\nmethod, which can leverage a large amount of unlabeled wild head images.\nSpecifically, we follow the recent semi-supervised rotation regression, and\nfocus on the diverse and complex head pose domain. 
Firstly, we claim that the\naspect-ratio invariant cropping of heads is superior to the previous\nlandmark-based affine alignment, which does not fit unlabeled natural heads or\npractical applications where landmarks are often unavailable. Then, instead of\nusing an empirically fixed threshold to filter out pseudo labels, we propose\nthe dynamic entropy-based filtering by updating thresholds for adaptively\nremoving unlabeled outliers. Moreover, we revisit the design of weak-strong\naugmentations, and further exploit its superiority by devising two novel\nhead-oriented strong augmentations named pose-irrelevant cut-occlusion and\npose-altering rotation consistency. Extensive experiments show that SemiUHPE\ncan surpass SOTAs with remarkable improvements on public benchmarks under both\nfront-range and full-range. Our code is released in\n\\url{https://github.com/hnuzhy/SemiUHPE}.\n","authors":["Huayi Zhou","Fei Jiang","Hongtao Lu"],"pdf_url":"https://arxiv.org/pdf/2404.02544v1.pdf","comment":"14 pages. Semi-Supervised Unconstrained Head Pose Estimation"},{"id":"http://arxiv.org/abs/2403.19425v2","updated":"2024-04-03T07:37:32Z","published":"2024-03-28T13:56:26Z","title":"A Robust Ensemble Algorithm for Ischemic Stroke Lesion Segmentation:\n Generalizability and Clinical Utility Beyond the ISLES Challenge","summary":" Diffusion-weighted MRI (DWI) is essential for stroke diagnosis, treatment\ndecisions, and prognosis. However, image and disease variability hinder the\ndevelopment of generalizable AI algorithms with clinical value. We address this\ngap by presenting a novel ensemble algorithm derived from the 2022 Ischemic\nStroke Lesion Segmentation (ISLES) challenge. ISLES'22 provided 400 patient\nscans with ischemic stroke from various medical centers, facilitating the\ndevelopment of a wide range of cutting-edge segmentation algorithms by the\nresearch community. Through collaboration with leading teams, we combined\ntop-performing algorithms into an ensemble model that overcomes the limitations\nof individual solutions. Our ensemble model achieved superior ischemic lesion\ndetection and segmentation accuracy on our internal test set compared to\nindividual algorithms. This accuracy generalized well across diverse image and\ndisease variables. Furthermore, the model excelled in extracting clinical\nbiomarkers. Notably, in a Turing-like test, neuroradiologists consistently\npreferred the algorithm's segmentations over manual expert efforts,\nhighlighting increased comprehensiveness and precision. Validation using a\nreal-world external dataset (N=1686) confirmed the model's generalizability.\nThe algorithm's outputs also demonstrated strong correlations with clinical\nscores (admission NIHSS and 90-day mRS) on par with or exceeding expert-derived\nresults, underlining its clinical relevance. This study offers two key\nfindings. First, we present an ensemble algorithm\n(https://github.com/Tabrisrei/ISLES22_Ensemble) that detects and segments\nischemic stroke lesions on DWI across diverse scenarios on par with expert\n(neuro)radiologists. Second, we show the potential for biomedical challenge\noutputs to extend beyond the challenge's initial objectives, demonstrating\ntheir real-world clinical applicability.\n","authors":["Ezequiel de la Rosa","Mauricio Reyes","Sook-Lei Liew","Alexandre Hutton","Roland Wiest","Johannes Kaesmacher","Uta Hanning","Arsany Hakim","Richard Zubal","Waldo Valenzuela","David Robben","Diana M. Sima","Vincenzo Anania","Arne Brys","James A. 
Meakin","Anne Mickan","Gabriel Broocks","Christian Heitkamp","Shengbo Gao","Kongming Liang","Ziji Zhang","Md Mahfuzur Rahman Siddiquee","Andriy Myronenko","Pooya Ashtari","Sabine Van Huffel","Hyun-su Jeong","Chi-ho Yoon","Chulhong Kim","Jiayu Huo","Sebastien Ourselin","Rachel Sparks","Albert Clèrigues","Arnau Oliver","Xavier Lladó","Liam Chalcroft","Ioannis Pappas","Jeroen Bertels","Ewout Heylen","Juliette Moreau","Nima Hatami","Carole Frindel","Abdul Qayyum","Moona Mazher","Domenec Puig","Shao-Chieh Lin","Chun-Jung Juan","Tianxi Hu","Lyndon Boone","Maged Goubran","Yi-Jui Liu","Susanne Wegener","Florian Kofler","Ivan Ezhov","Suprosanna Shit","Moritz R. Hernandez Petzsche","Bjoern Menze","Jan S. Kirschke","Benedikt Wiestler"],"pdf_url":"https://arxiv.org/pdf/2403.19425v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02530v1","updated":"2024-04-03T07:33:30Z","published":"2024-04-03T07:33:30Z","title":"Severity Controlled Text-to-Image Generative Model Bias Manipulation","summary":" Text-to-image (T2I) generative models are gaining wide popularity, especially\nin public domains. However, their intrinsic bias and potential malicious\nmanipulations remain under-explored. Charting the susceptibility of T2I models\nto such manipulation, we first expose the new possibility of a dynamic and\ncomputationally efficient exploitation of model bias by targeting the embedded\nlanguage models. By leveraging mathematical foundations of vector algebra, our\ntechnique enables a scalable and convenient control over the severity of output\nmanipulation through model bias. As a by-product, this control also allows a\nform of precise prompt engineering to generate images which are generally\nimplausible with regular text prompts. We also demonstrate a constructive\napplication of our manipulation for balancing the frequency of generated\nclasses - as in model debiasing. Our technique does not require training and is\nalso framed as a backdoor attack with severity control using semantically-null\ntext triggers in the prompts. With extensive analysis, we present interesting\nqualitative and quantitative results to expose potential manipulation\npossibilities for T2I models.\n Key-words: Text-to-Image Models, Generative Models, Backdoor Attacks, Prompt\nEngineering, Bias\n","authors":["Jordan Vice","Naveed Akhtar","Richard Hartley","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2404.02530v1.pdf","comment":"This research was supported by National Intelligence and Security\n Discovery Research Grants (project# NS220100007), funded by the Department of\n Defence Australia"},{"id":"http://arxiv.org/abs/2404.02527v1","updated":"2024-04-03T07:30:09Z","published":"2024-04-03T07:30:09Z","title":"Weakly-Supervised 3D Scene Graph Generation via Visual-Linguistic\n Assisted Pseudo-labeling","summary":" Learning to build 3D scene graphs is essential for real-world perception in a\nstructured and rich fashion. However, previous 3D scene graph generation\nmethods utilize a fully supervised learning manner and require a large amount\nof entity-level annotation data of objects and relations, which is extremely\nresource-consuming and tedious to obtain. To tackle this problem, we propose\n3D-VLAP, a weakly-supervised 3D scene graph generation method via\nVisual-Linguistic Assisted Pseudo-labeling. 
Specifically, our 3D-VLAP exploits\nthe superior ability of current large-scale visual-linguistic models to align\nthe semantics between texts and 2D images, as well as the naturally existing\ncorrespondences between 2D images and 3D point clouds, and thus implicitly\nconstructs correspondences between texts and 3D point clouds. First, we\nestablish the positional correspondence from 3D point clouds to 2D images via\ncamera intrinsic and extrinsic parameters, thereby achieving alignment of 3D\npoint clouds and 2D images. Subsequently, a large-scale cross-modal\nvisual-linguistic model is employed to indirectly align 3D instances with the\ntextual category labels of objects by matching 2D images with object category\nlabels. The pseudo labels for objects and relations are then produced for\n3D-VLAP model training by calculating the similarity between visual embeddings\nand textual category embeddings of objects and relations encoded by the\nvisual-linguistic model, respectively. Ultimately, we design an edge\nself-attention based graph neural network to generate scene graphs of 3D point\ncloud scenes. Extensive experiments demonstrate that our 3D-VLAP achieves\ncomparable results with current advanced fully supervised methods, meanwhile\nsignificantly alleviating the pressure of data annotation.\n","authors":["Xu Wang","Yifan Li","Qiudan Zhang","Wenhui Wu","Mark Junjie Li","Jianmin Jinag"],"pdf_url":"https://arxiv.org/pdf/2404.02527v1.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.02523v1","updated":"2024-04-03T07:23:03Z","published":"2024-04-03T07:23:03Z","title":"Text-driven Affordance Learning from Egocentric Vision","summary":" Visual affordance learning is a key component for robots to understand how to\ninteract with objects. Conventional approaches in this field rely on\npre-defined objects and actions, falling short of capturing diverse\ninteractions in realworld scenarios. The key idea of our approach is employing\ntextual instruction, targeting various affordances for a wide range of objects.\nThis approach covers both hand-object and tool-object interactions. We\nintroduce text-driven affordance learning, aiming to learn contact points and\nmanipulation trajectories from an egocentric view following textual\ninstruction. In our task, contact points are represented as heatmaps, and the\nmanipulation trajectory as sequences of coordinates that incorporate both\nlinear and rotational movements for various manipulations. However, when we\ngather data for this task, manual annotations of these diverse interactions are\ncostly. To this end, we propose a pseudo dataset creation pipeline and build a\nlarge pseudo-training dataset: TextAFF80K, consisting of over 80K instances of\nthe contact points, trajectories, images, and text tuples. 
We extend existing\nreferring expression comprehension models for our task, and experimental\nresults show that our approach robustly handles multiple affordances, serving\nas a new standard for affordance learning in real-world scenarios.\n","authors":["Tomoya Yoshida","Shuhei Kurita","Taichi Nishimura","Shinsuke Mori"],"pdf_url":"https://arxiv.org/pdf/2404.02523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00611v2","updated":"2024-04-03T07:18:11Z","published":"2024-03-31T09:01:17Z","title":"Object-level Copy-Move Forgery Image Detection based on Inconsistency\n Mining","summary":" In copy-move tampering operations, perpetrators often employ techniques, such\nas blurring, to conceal tampering traces, posing significant challenges to the\ndetection of object-level targets with intact structures. Focus on these\nchallenges, this paper proposes an Object-level Copy-Move Forgery Image\nDetection based on Inconsistency Mining (IMNet). To obtain complete\nobject-level targets, we customize prototypes for both the source and tampered\nregions and dynamically update them. Additionally, we extract inconsistent\nregions between coarse similar regions obtained through self-correlation\ncalculations and regions composed of prototypes. The detected inconsistent\nregions are used as supplements to coarse similar regions to refine pixel-level\ndetection. We operate experiments on three public datasets which validate the\neffectiveness and the robustness of the proposed IMNet.\n","authors":["Jingyu Wang","Niantai Jing","Ziyao Liu","Jie Nie","Yuxin Qi","Chi-Hung Chi","Kwok-Yan Lam"],"pdf_url":"https://arxiv.org/pdf/2404.00611v2.pdf","comment":"4 pages, 2 figures, Accepted to WWW 2024"},{"id":"http://arxiv.org/abs/2404.00228v3","updated":"2024-04-03T07:15:05Z","published":"2024-03-30T03:16:37Z","title":"InfLoRA: Interference-Free Low-Rank Adaptation for Continual Learning","summary":" Continual learning requires the model to learn multiple tasks sequentially.\nIn continual learning, the model should possess the ability to maintain its\nperformance on old tasks (stability) and the ability to adapt to new tasks\ncontinuously (plasticity). Recently, parameter-efficient fine-tuning (PEFT),\nwhich involves freezing a pre-trained model and injecting a small number of\nlearnable parameters to adapt to downstream tasks, has gained increasing\npopularity in continual learning. Although existing continual learning methods\nbased on PEFT have demonstrated superior performance compared to those not\nbased on PEFT, most of them do not consider how to eliminate the interference\nof the new task on the old tasks, which inhibits the model from making a good\ntrade-off between stability and plasticity. In this work, we propose a new PEFT\nmethod, called interference-free low-rank adaptation (InfLoRA), for continual\nlearning. InfLoRA injects a small number of parameters to reparameterize the\npre-trained weights and shows that fine-tuning these injected parameters is\nequivalent to fine-tuning the pre-trained weights within a subspace.\nFurthermore, InfLoRA designs this subspace to eliminate the interference of the\nnew task on the old tasks, making a good trade-off between stability and\nplasticity. 
Experimental results show that InfLoRA outperforms existing\nstate-of-the-art continual learning methods on multiple datasets.\n","authors":["Yan-Shuo Liang","Wu-Jun Li"],"pdf_url":"https://arxiv.org/pdf/2404.00228v3.pdf","comment":"Accepted by the 2024 IEEE/CVF Conference on Computer Vision and\n Pattern Recognition (CVPR 2024)"},{"id":"http://arxiv.org/abs/2404.02518v1","updated":"2024-04-03T07:11:19Z","published":"2024-04-03T07:11:19Z","title":"CPAISD: Core-penumbra acute ischemic stroke dataset","summary":" We introduce the CPAISD: Core-Penumbra Acute Ischemic Stroke Dataset, aimed\nat enhancing the early detection and segmentation of ischemic stroke using\nNon-Contrast Computed Tomography (NCCT) scans. Addressing the challenges in\ndiagnosing acute ischemic stroke during its early stages due to often\nnon-revealing native CT findings, the dataset provides a collection of\nsegmented NCCT images. These include annotations of ischemic core and penumbra\nregions, critical for developing machine learning models for rapid stroke\nidentification and assessment. By offering a carefully collected and annotated\ndataset, we aim to facilitate the development of advanced diagnostic tools,\ncontributing to improved patient care and outcomes in stroke management. Our\ndataset's uniqueness lies in its focus on the acute phase of ischemic stroke,\nwith non-informative native CT scans, and includes a baseline model to\ndemonstrate the dataset's application, encouraging further research and\ninnovation in the field of medical imaging and stroke diagnosis.\n","authors":["D. Umerenkov","S. Kudin","M. Peksheva","D. Pavlov"],"pdf_url":"https://arxiv.org/pdf/2404.02518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03166v3","updated":"2024-04-03T07:10:22Z","published":"2024-02-05T16:35:29Z","title":"RRWNet: Recursive Refinement Network for Effective Retinal Artery/Vein\n Segmentation and Classification","summary":" The caliber and configuration of retinal blood vessels serve as important\nbiomarkers for various diseases and medical conditions. A thorough analysis of\nthe retinal vasculature requires the segmentation of the blood vessels and\ntheir classification into arteries and veins, typically performed on color\nfundus images obtained by retinography. However, manually performing these\ntasks is labor-intensive and prone to human error. While several automated\nmethods have been proposed to address this task, the current state of art faces\nchallenges due to manifest classification errors affecting the topological\nconsistency of segmentation maps. In this work, we introduce RRWNet, a novel\nend-to-end deep learning framework that addresses this limitation. The\nframework consists of a fully convolutional neural network that recursively\nrefines semantic segmentation maps, correcting manifest classification errors\nand thus improving topological consistency. In particular, RRWNet is composed\nof two specialized subnetworks: a Base subnetwork that generates base\nsegmentation maps from the input images, and a Recursive Refinement subnetwork\nthat iteratively and recursively improves these maps. Evaluation on three\ndifferent public datasets demonstrates the state-of-the-art performance of the\nproposed method, yielding more topologically consistent segmentation maps with\nfewer manifest classification errors than existing approaches. 
In addition, the\nRecursive Refinement module within RRWNet proves effective in post-processing\nsegmentation maps from other methods, further demonstrating its potential. The\nmodel code, weights, and predictions will be publicly available at\nhttps://github.com/j-morano/rrwnet.\n","authors":["José Morano","Guilherme Aresta","Hrvoje Bogunović"],"pdf_url":"https://arxiv.org/pdf/2402.03166v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02517v1","updated":"2024-04-03T07:10:18Z","published":"2024-04-03T07:10:18Z","title":"HENet: Hybrid Encoding for End-to-end Multi-task 3D Perception from\n Multi-view Cameras","summary":" Three-dimensional perception from multi-view cameras is a crucial component\nin autonomous driving systems, which involves multiple tasks like 3D object\ndetection and bird's-eye-view (BEV) semantic segmentation. To improve\nperception precision, large image encoders, high-resolution images, and\nlong-term temporal inputs have been adopted in recent 3D perception models,\nbringing remarkable performance gains. However, these techniques are often\nincompatible in training and inference scenarios due to computational resource\nconstraints. Besides, modern autonomous driving systems prefer to adopt an\nend-to-end framework for multi-task 3D perception, which can simplify the\noverall system architecture and reduce the implementation complexity. However,\nconflict between tasks often arises when optimizing multiple tasks jointly\nwithin an end-to-end 3D perception model. To alleviate these issues, we present\nan end-to-end framework named HENet for multi-task 3D perception in this paper.\nSpecifically, we propose a hybrid image encoding network, using a large image\nencoder for short-term frames and a small image encoder for long-term temporal\nframes. Then, we introduce a temporal feature integration module based on the\nattention mechanism to fuse the features of different frames extracted by the\ntwo aforementioned hybrid image encoders. Finally, according to the\ncharacteristics of each perception task, we utilize BEV features of different\ngrid sizes, independent BEV encoders, and task decoders for different tasks.\nExperimental results show that HENet achieves state-of-the-art end-to-end\nmulti-task 3D perception results on the nuScenes benchmark, including 3D object\ndetection and BEV semantic segmentation. The source code and models will be\nreleased at https://github.com/VDIGPKU/HENet.\n","authors":["Zhongyu Xia","ZhiWei Lin","Xinhao Wang","Yongtao Wang","Yun Xing","Shengxiang Qi","Nan Dong","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.02517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02514v1","updated":"2024-04-03T07:07:02Z","published":"2024-04-03T07:07:02Z","title":"Freditor: High-Fidelity and Transferable NeRF Editing by Frequency\n Decomposition","summary":" This paper enables high-fidelity, transferable NeRF editing by frequency\ndecomposition. Recent NeRF editing pipelines lift 2D stylization results to 3D\nscenes while suffering from blurry results, and fail to capture detailed\nstructures caused by the inconsistency between 2D editings. 
Our critical\ninsight is that low-frequency components of images are more\nmultiview-consistent after editing compared with their high-frequency parts.\nMoreover, the appearance style is mainly exhibited on the low-frequency\ncomponents, and the content details especially reside in high-frequency parts.\nThis motivates us to perform editing on low-frequency components, which results\nin high-fidelity edited scenes. In addition, the editing is performed in the\nlow-frequency feature space, enabling stable intensity control and novel scene\ntransfer. Comprehensive experiments conducted on photorealistic datasets\ndemonstrate the superior performance of high-fidelity and transferable NeRF\nediting. The project page is at \\url{https://aigc3d.github.io/freditor}.\n","authors":["Yisheng He","Weihao Yuan","Siyu Zhu","Zilong Dong","Liefeng Bo","Qixing Huang"],"pdf_url":"https://arxiv.org/pdf/2404.02514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02508v1","updated":"2024-04-03T06:53:27Z","published":"2024-04-03T06:53:27Z","title":"VIAssist: Adapting Multi-modal Large Language Models for Users with\n Visual Impairments","summary":" Individuals with visual impairments, encompassing both partial and total\ndifficulties in visual perception, are referred to as visually impaired (VI)\npeople. An estimated 2.2 billion individuals worldwide are affected by visual\nimpairments. Recent advancements in multi-modal large language models (MLLMs)\nhave showcased their extraordinary capabilities across various domains. It is\ndesirable to help VI individuals with MLLMs' great capabilities of visual\nunderstanding and reasoning. However, it is challenging for VI people to use\nMLLMs due to the difficulties in capturing the desirable images to fulfill\ntheir daily requests. For example, the target object is not fully or partially\nplaced in the image. This paper explores how to leverage MLLMs for VI\nindividuals to provide visual-question answers. VIAssist can identify undesired\nimages and provide detailed actions. Finally, VIAssist can provide reliable\nanswers to users' queries based on the images. Our results show that VIAssist\nprovides +0.21 and +0.31 higher BERTScore and ROUGE scores than the baseline,\nrespectively.\n","authors":["Bufang Yang","Lixing He","Kaiwei Liu","Zhenyu Yan"],"pdf_url":"https://arxiv.org/pdf/2404.02508v1.pdf","comment":"Accepted to IEEE International Workshop on Foundation Models for\n Cyber-Physical Systems & Internet of Things (FMSys 2024)"},{"id":"http://arxiv.org/abs/2303.04989v2","updated":"2024-04-03T06:51:21Z","published":"2023-03-09T02:20:56Z","title":"ARS-DETR: Aspect Ratio Sensitive Oriented Object Detection with\n Transformer","summary":" Existing oriented object detection methods commonly use metric AP$_{50}$ to\nmeasure the performance of the model. We argue that AP$_{50}$ is inherently\nunsuitable for oriented object detection due to its large tolerance in angle\ndeviation. Therefore, we advocate using high-precision metric, e.g. AP$_{75}$,\nto measure the performance of models. In this paper, we propose an Aspect Ratio\nSensitive Oriented Object Detector with Transformer, termed ARS-DETR, which\nexhibits a competitive performance in high-precision oriented object detection.\nSpecifically, a new angle classification method, calling Aspect Ratio aware\nCircle Smooth Label (AR-CSL), is proposed to smooth the angle label in a more\nreasonable way and discard the hyperparameter that introduced by previous work\n(e.g. CSL). 
Then, a rotated deformable attention module is designed to rotate\nthe sampling points with the corresponding angles and eliminate the\nmisalignment between region features and sampling points. Moreover, a dynamic\nweight coefficient according to the aspect ratio is adopted to calculate the\nangle loss. Comprehensive experiments on several challenging datasets show that\nour method achieves competitive performance on the high-precision oriented\nobject detection task.\n","authors":["Ying Zeng","Xue Yang","Qingyun Li","Yushi Chen","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2303.04989v2.pdf","comment":"10 pages, 8 figures, 8 tables, the source code is available at\n https://github.com/httle/ARS-DETR"},{"id":"http://arxiv.org/abs/2403.04492v3","updated":"2024-04-03T06:48:15Z","published":"2024-03-07T13:49:29Z","title":"Discriminative Sample-Guided and Parameter-Efficient Feature Space\n Adaptation for Cross-Domain Few-Shot Learning","summary":" In this paper, we look at cross-domain few-shot classification which presents\nthe challenging task of learning new classes in previously unseen domains with\nfew labelled examples. Existing methods, though somewhat effective, encounter\nseveral limitations, which we alleviate through two significant improvements.\nFirst, we introduce a lightweight parameter-efficient adaptation strategy to\naddress overfitting associated with fine-tuning a large number of parameters on\nsmall datasets. This strategy employs a linear transformation of pre-trained\nfeatures, significantly reducing the trainable parameter count. Second, we\nreplace the traditional nearest centroid classifier with a discriminative\nsample-aware loss function, enhancing the model's sensitivity to the inter- and\nintra-class variances within the training set for improved clustering in\nfeature space. Empirical evaluations on the Meta-Dataset benchmark showcase\nthat our approach not only improves accuracy up to 7.7\\% and 5.3\\% on\npreviously seen and unseen datasets, respectively, but also achieves the above\nperformance while being at least $\\sim3\\times$ more parameter-efficient than\nexisting methods, establishing a new state-of-the-art in cross-domain few-shot\nlearning. Our code is available at https://github.com/rashindrie/DIPA.\n","authors":["Rashindrie Perera","Saman Halgamuge"],"pdf_url":"https://arxiv.org/pdf/2403.04492v3.pdf","comment":"Code is available at this link: https://github.com/rashindrie/DIPA"},{"id":"http://arxiv.org/abs/2404.01700v2","updated":"2024-04-03T06:40:46Z","published":"2024-04-02T07:09:29Z","title":"MotionChain: Conversational Motion Controllers via Multimodal Prompts","summary":" Recent advancements in language models have demonstrated their adeptness in\nconducting multi-turn dialogues and retaining conversational context. However,\nthis proficiency remains largely unexplored in other multimodal generative\nmodels, particularly in human motion models. 
By integrating multi-turn\nconversations in controlling continuous virtual human movements, generative\nhuman motion models can achieve an intuitive and step-by-step process of human\ntask execution for humanoid robotics, game agents, or other embodied systems.\nIn this work, we present MotionChain, a conversational human motion controller\nto generate continuous and long-term human motion through multimodal prompts.\nSpecifically, MotionChain consists of multi-modal tokenizers that transform\nvarious data types such as text, image, and motion, into discrete tokens,\ncoupled with a Vision-Motion-aware Language model. By leveraging large-scale\nlanguage, vision-language, and vision-motion data to assist motion-related\ngeneration tasks, MotionChain thus comprehends each instruction in multi-turn\nconversation and generates human motions followed by these prompts. Extensive\nexperiments validate the efficacy of MotionChain, demonstrating\nstate-of-the-art performance in conversational motion generation, as well as\nmore intuitive manners of controlling and interacting with virtual humans.\n","authors":["Biao Jiang","Xin Chen","Chi Zhang","Fukun Yin","Zhuoyuan Li","Gang YU","Jiayuan Fan"],"pdf_url":"https://arxiv.org/pdf/2404.01700v2.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.12870v2","updated":"2024-04-03T06:11:17Z","published":"2023-12-20T09:34:22Z","title":"The Audio-Visual Conversational Graph: From an Egocentric-Exocentric\n Perspective","summary":" In recent years, the thriving development of research related to egocentric\nvideos has provided a unique perspective for the study of conversational\ninteractions, where both visual and audio signals play a crucial role. While\nmost prior work focus on learning about behaviors that directly involve the\ncamera wearer, we introduce the Ego-Exocentric Conversational Graph Prediction\nproblem, marking the first attempt to infer exocentric conversational\ninteractions from egocentric videos. We propose a unified multi-modal framework\n-- Audio-Visual Conversational Attention (AV-CONV), for the joint prediction of\nconversation behaviors -- speaking and listening -- for both the camera wearer\nas well as all other social partners present in the egocentric video.\nSpecifically, we adopt the self-attention mechanism to model the\nrepresentations across-time, across-subjects, and across-modalities. To\nvalidate our method, we conduct experiments on a challenging egocentric video\ndataset that includes multi-speaker and multi-conversation scenarios. Our\nresults demonstrate the superior performance of our method compared to a series\nof baselines. We also present detailed ablation studies to assess the\ncontribution of each component in our model. Check our project page at\nhttps://vjwq.github.io/AV-CONV/.\n","authors":["Wenqi Jia","Miao Liu","Hao Jiang","Ishwarya Ananthabhotla","James M. Rehg","Vamsi Krishna Ithapu","Ruohan Gao"],"pdf_url":"https://arxiv.org/pdf/2312.12870v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01941v2","updated":"2024-04-03T05:43:15Z","published":"2024-04-02T13:33:31Z","title":"LPSNet: End-to-End Human Pose and Shape Estimation with Lensless Imaging","summary":" Human pose and shape (HPS) estimation with lensless imaging is not only\nbeneficial to privacy protection but also can be used in covert surveillance\nscenarios due to the small size and simple structure of this device. 
However,\nthis task presents significant challenges due to the inherent ambiguity of the\ncaptured measurements and lacks effective methods for directly estimating human\npose and shape from lensless data. In this paper, we propose the first\nend-to-end framework to recover 3D human poses and shapes from lensless\nmeasurements to our knowledge. We specifically design a multi-scale lensless\nfeature decoder to decode the lensless measurements through the optically\nencoded mask for efficient feature extraction. We also propose a double-head\nauxiliary supervision mechanism to improve the estimation accuracy of human\nlimb ends. Besides, we establish a lensless imaging system and verify the\neffectiveness of our method on various datasets acquired by our lensless\nimaging system.\n","authors":["Haoyang Ge","Qiao Feng","Hailong Jia","Xiongzheng Li","Xiangjun Yin","You Zhou","Jingyu Yang","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2404.01941v2.pdf","comment":"Accepted to CVPR 2024. More results available at\n https://cic.tju.edu.cn/faculty/likun/projects/LPSNet"},{"id":"http://arxiv.org/abs/2403.02561v2","updated":"2024-04-03T05:43:10Z","published":"2024-03-05T00:34:05Z","title":"Semantic Human Mesh Reconstruction with Textures","summary":" The field of 3D detailed human mesh reconstruction has made significant\nprogress in recent years. However, current methods still face challenges when\nused in industrial applications due to unstable results, low-quality meshes,\nand a lack of UV unwrapping and skinning weights. In this paper, we present\nSHERT, a novel pipeline that can reconstruct semantic human meshes with\ntextures and high-precision details. SHERT applies semantic- and normal-based\nsampling between the detailed surface (e.g. mesh and SDF) and the corresponding\nSMPL-X model to obtain a partially sampled semantic mesh and then generates the\ncomplete semantic mesh by our specifically designed self-supervised completion\nand refinement networks. Using the complete semantic mesh as a basis, we employ\na texture diffusion model to create human textures that are driven by both\nimages and texts. Our reconstructed meshes have stable UV unwrapping,\nhigh-quality triangle meshes, and consistent semantic information. The given\nSMPL-X model provides semantic information and shape priors, allowing SHERT to\nperform well even with incorrect and incomplete inputs. The semantic\ninformation also makes it easy to substitute and animate different body parts\nsuch as the face, body, and hands. Quantitative and qualitative experiments\ndemonstrate that SHERT is capable of producing high-fidelity and robust\nsemantic meshes that outperform state-of-the-art methods.\n","authors":["Xiaoyu Zhan","Jianxin Yang","Yuanqi Li","Jie Guo","Yanwen Guo","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2403.02561v2.pdf","comment":"Accepted to CVPR 2024. 
Project page:\n https://zhanxy.xyz/projects/shert/"},{"id":"http://arxiv.org/abs/2404.02462v1","updated":"2024-04-03T05:04:55Z","published":"2024-04-03T05:04:55Z","title":"A Unified Membership Inference Method for Visual Self-supervised Encoder\n via Part-aware Capability","summary":" Self-supervised learning shows promise in harnessing extensive unlabeled\ndata, but it also confronts significant privacy concerns, especially in vision.\nIn this paper, we aim to perform membership inference on visual self-supervised\nmodels in a more realistic setting: self-supervised training method and details\nare unknown for an adversary when attacking as he usually faces a black-box\nsystem in practice. In this setting, considering that self-supervised model\ncould be trained by completely different self-supervised paradigms, e.g.,\nmasked image modeling and contrastive learning, with complex training details,\nwe propose a unified membership inference method called PartCrop. It is\nmotivated by the shared part-aware capability among models and stronger part\nresponse on the training data. Specifically, PartCrop crops parts of objects in\nan image to query responses with the image in representation space. We conduct\nextensive attacks on self-supervised models with different training protocols\nand structures using three widely used image datasets. The results verify the\neffectiveness and generalization of PartCrop. Moreover, to defend against\nPartCrop, we evaluate two common approaches, i.e., early stop and differential\nprivacy, and propose a tailored method called shrinking crop scale range. The\ndefense experiments indicate that all of them are effective. Our code is\navailable at https://github.com/JiePKU/PartCrop\n","authors":["Jie Zhu","Jirong Zha","Ding Li","Leye Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02462v1.pdf","comment":"Membership Inference, Self-supervised learning"},{"id":"http://arxiv.org/abs/2404.02460v1","updated":"2024-04-03T05:02:46Z","published":"2024-04-03T05:02:46Z","title":"TSNet:A Two-stage Network for Image Dehazing with Multi-scale Fusion and\n Adaptive Learning","summary":" Image dehazing has been a popular topic of research for a long time. Previous\ndeep learning-based image dehazing methods have failed to achieve satisfactory\ndehazing effects on both synthetic datasets and real-world datasets, exhibiting\npoor generalization. Moreover, single-stage networks often result in many\nregions with artifacts and color distortion in output images. To address these\nissues, this paper proposes a two-stage image dehazing network called TSNet,\nmainly consisting of the multi-scale fusion module (MSFM) and the adaptive\nlearning module (ALM). Specifically, MSFM and ALM enhance the generalization of\nTSNet. The MSFM can obtain large receptive fields at multiple scales and\nintegrate features at different frequencies to reduce the differences between\ninputs and learning objectives. The ALM can actively learn of regions of\ninterest in images and restore texture details more effectively. Additionally,\nTSNet is designed as a two-stage network, where the first-stage network\nperforms image dehazing, and the second-stage network is employed to improve\nissues such as artifacts and color distortion present in the results of the\nfirst-stage network. 
We also change the learning objective from ground truth\nimages to opposite fog maps, which improves the learning efficiency of TSNet.\nExtensive experiments demonstrate that TSNet exhibits superior dehazing\nperformance on both synthetic and real-world datasets compared to previous\nstate-of-the-art methods.\n","authors":["Xiaolin Gong","Zehan Zheng","Heyuan Du"],"pdf_url":"https://arxiv.org/pdf/2404.02460v1.pdf","comment":"12 pages, 10 figures, 7 tables"},{"id":"http://arxiv.org/abs/2404.02457v1","updated":"2024-04-03T04:59:28Z","published":"2024-04-03T04:59:28Z","title":"RS3Mamba: Visual State Space Model for Remote Sensing Images Semantic\n Segmentation","summary":" Semantic segmentation of remote sensing images is a fundamental task in\ngeoscience research. However, there are some significant shortcomings for the\nwidely used convolutional neural networks (CNNs) and Transformers. The former\nis limited by its insufficient long-range modeling capabilities, while the\nlatter is hampered by its computational complexity. Recently, a novel visual\nstate space (VSS) model represented by Mamba has emerged, capable of modeling\nlong-range relationships with linear computability. In this work, we propose a\nnovel dual-branch network named remote sensing images semantic segmentation\nMamba (RS3Mamba) to incorporate this innovative technology into remote sensing\ntasks. Specifically, RS3Mamba utilizes VSS blocks to construct an auxiliary\nbranch, providing additional global information to convolution-based main\nbranch. Moreover, considering the distinct characteristics of the two branches,\nwe introduce a collaborative completion module (CCM) to enhance and fuse\nfeatures from the dual-encoder. Experimental results on two widely used\ndatasets, ISPRS Vaihingen and LoveDA Urban, demonstrate the effectiveness and\npotential of the proposed RS3Mamba. To the best of our knowledge, this is the\nfirst vision Mamba specifically designed for remote sensing images semantic\nsegmentation. The source code will be made available at\nhttps://github.com/sstary/SSRS.\n","authors":["Xianping Ma","Xiaokang Zhang","Man-On Pun"],"pdf_url":"https://arxiv.org/pdf/2404.02457v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.02447v1","updated":"2024-04-03T04:26:50Z","published":"2024-04-03T04:26:50Z","title":"A Novel Approach to Breast Cancer Histopathological Image Classification\n Using Cross-Colour Space Feature Fusion and Quantum-Classical Stack Ensemble\n Method","summary":" Breast cancer classification stands as a pivotal pillar in ensuring timely\ndiagnosis and effective treatment. This study with histopathological images\nunderscores the profound significance of harnessing the synergistic\ncapabilities of colour space ensembling and quantum-classical stacking to\nelevate the precision of breast cancer classification. By delving into the\ndistinct colour spaces of RGB, HSV and CIE L*u*v, the authors initiated a\ncomprehensive investigation guided by advanced methodologies. Employing the\nDenseNet121 architecture for feature extraction the authors have capitalized on\nthe robustness of Random Forest, SVM, QSVC, and VQC classifiers. 
This research\nencompasses a unique feature fusion technique within the colour space ensemble.\nThis approach not only deepens our comprehension of breast cancer\nclassification but also marks a milestone in personalized medical assessment.\nThe amalgamation of quantum and classical classifiers through stacking emerges\nas a potent catalyst, effectively mitigating the inherent constraints of\nindividual classifiers, paving a robust path towards more dependable and\nrefined breast cancer identification. Through rigorous experimentation and\nmeticulous analysis, fusion of colour spaces like RGB with HSV and RGB with CIE\nL*u*v, presents an classification accuracy, nearing the value of unity. This\nunderscores the transformative potential of our approach, where the fusion of\ndiverse colour spaces and the synergy of quantum and classical realms converge\nto establish a new horizon in medical diagnostics. Thus the implications of\nthis research extend across medical disciplines, offering promising avenues for\nadvancing diagnostic accuracy and treatment efficacy.\n","authors":["Sambit Mallick","Snigdha Paul","Anindya Sen"],"pdf_url":"https://arxiv.org/pdf/2404.02447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11994v2","updated":"2024-04-03T04:07:50Z","published":"2023-12-19T09:37:25Z","title":"Optimizing Diffusion Noise Can Serve As Universal Motion Priors","summary":" We propose Diffusion Noise Optimization (DNO), a new method that effectively\nleverages existing motion diffusion models as motion priors for a wide range of\nmotion-related tasks. Instead of training a task-specific diffusion model for\neach new task, DNO operates by optimizing the diffusion latent noise of an\nexisting pre-trained text-to-motion model. Given the corresponding latent noise\nof a human motion, it propagates the gradient from the target criteria defined\non the motion space through the whole denoising process to update the diffusion\nlatent noise. As a result, DNO supports any use cases where criteria can be\ndefined as a function of motion. In particular, we show that, for motion\nediting and control, DNO outperforms existing methods in both achieving the\nobjective and preserving the motion content. DNO accommodates a diverse range\nof editing modes, including changing trajectory, pose, joint locations, or\navoiding newly added obstacles. In addition, DNO is effective in motion\ndenoising and completion, producing smooth and realistic motion from noisy and\npartial inputs. DNO achieves these results at inference time without the need\nfor model retraining, offering great versatility for any defined reward or loss\nfunction on the motion representation.\n","authors":["Korrawe Karunratanakul","Konpat Preechakul","Emre Aksan","Thabo Beeler","Supasorn Suwajanakorn","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2312.11994v2.pdf","comment":"CVPR 2024. Project page: https://korrawe.github.io/dno-project/"},{"id":"http://arxiv.org/abs/2403.11056v2","updated":"2024-04-03T04:00:53Z","published":"2024-03-17T02:06:03Z","title":"Analytic-Splatting: Anti-Aliased 3D Gaussian Splatting via Analytic\n Integration","summary":" The 3D Gaussian Splatting (3DGS) gained its popularity recently by combining\nthe advantages of both primitive-based and volumetric 3D representations,\nresulting in improved quality and efficiency for 3D scene rendering. However,\n3DGS is not alias-free, and its rendering at varying resolutions could produce\nsevere blurring or jaggies. 
This is because 3DGS treats each pixel as an\nisolated, single point rather than as an area, causing insensitivity to changes\nin the footprints of pixels. Consequently, this discrete sampling scheme\ninevitably results in aliasing, owing to the restricted sampling bandwidth. In\nthis paper, we derive an analytical solution to address this issue. More\nspecifically, we use a conditioned logistic function as the analytic\napproximation of the cumulative distribution function (CDF) in a\none-dimensional Gaussian signal and calculate the Gaussian integral by\nsubtracting the CDFs. We then introduce this approximation in the\ntwo-dimensional pixel shading, and present Analytic-Splatting, which\nanalytically approximates the Gaussian integral within the 2D-pixel window area\nto better capture the intensity response of each pixel. Moreover, we use the\napproximated response of the pixel window integral area to participate in the\ntransmittance calculation of volume rendering, making Analytic-Splatting\nsensitive to the changes in pixel footprint at different resolutions.\nExperiments on various datasets validate that our approach has better\nanti-aliasing capability that gives more details and better fidelity.\n","authors":["Zhihao Liang","Qi Zhang","Wenbo Hu","Ying Feng","Lei Zhu","Kui Jia"],"pdf_url":"https://arxiv.org/pdf/2403.11056v2.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2401.13201v2","updated":"2024-04-03T03:52:44Z","published":"2024-01-24T03:07:26Z","title":"MLLMReID: Multimodal Large Language Model-based Person Re-identification","summary":" Multimodal large language models (MLLM) have achieved satisfactory results in\nmany tasks. However, their performance in the task of person re-identification\n(ReID) has not been explored to date. This paper will investigate how to adapt\nthem for the task of ReID. An intuitive idea is to fine-tune MLLM with ReID\nimage-text datasets, and then use their visual encoder as a backbone for ReID.\nHowever, there still exist two apparent issues: (1) Designing instructions for\nReID, MLLMs may overfit specific instructions, and designing a variety of\ninstructions will lead to higher costs. (2) Latent image feature vectors from\nLLMs are not involved in loss computation. Instructional learning, aligning\nimage-text features, results in indirect optimization and a learning objective\nthat inadequately utilizes features, limiting effectiveness in person feature\nlearning. To address these problems, this paper proposes MLLMReID: Multimodal\nLarge Language Model-based ReID. Firstly, we proposed Common Instruction, a\nsimple approach that leverages the essence ability of LLMs to continue writing,\navoiding complex and diverse instruction design. Secondly, we proposed\nDirectReID, which effectively employs the latent image feature vectors of\nimages outputted by LLMs in ReID tasks. The experimental results demonstrate\nthe superiority of our method. We will open-source the code on GitHub.\n","authors":["Shan Yang","Yongfei Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.13201v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02402v3","updated":"2024-04-03T03:45:38Z","published":"2024-01-04T18:39:32Z","title":"3D Open-Vocabulary Panoptic Segmentation with 2D-3D Vision-Language\n Distillation","summary":" 3D panoptic segmentation is a challenging perception task, especially in\nautonomous driving. It aims to predict both semantic and instance annotations\nfor 3D points in a scene. 
Although prior 3D panoptic segmentation approaches\nhave achieved great performance on closed-set benchmarks, generalizing these\napproaches to unseen things and unseen stuff categories remains an open\nproblem. For unseen object categories, 2D open-vocabulary segmentation has\nachieved promising results that solely rely on frozen CLIP backbones and\nensembling multiple classification outputs. However, we find that simply\nextending these 2D models to 3D does not guarantee good performance due to poor\nper-mask classification quality, especially for novel stuff categories. In this\npaper, we propose the first method to tackle 3D open-vocabulary panoptic\nsegmentation. Our model takes advantage of the fusion between learnable LiDAR\nfeatures and dense frozen vision CLIP features, using a single classification\nhead to make predictions for both base and novel classes. To further improve\nthe classification performance on novel classes and leverage the CLIP model, we\npropose two novel loss functions: object-level distillation loss and\nvoxel-level distillation loss. Our experiments on the nuScenes and\nSemanticKITTI datasets show that our method outperforms the strong baseline by\na large margin.\n","authors":["Zihao Xiao","Longlong Jing","Shangxuan Wu","Alex Zihao Zhu","Jingwei Ji","Chiyu Max Jiang","Wei-Chih Hung","Thomas Funkhouser","Weicheng Kuo","Anelia Angelova","Yin Zhou","Shiwei Sheng"],"pdf_url":"https://arxiv.org/pdf/2401.02402v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02424v1","updated":"2024-04-03T03:27:01Z","published":"2024-04-03T03:27:01Z","title":"RESSA: Repair Sparse Vision-Language Models via Sparse Cross-Modality\n Adaptation","summary":" Vision-Language Models (VLMs), integrating diverse information from multiple\nmodalities, have shown remarkable success across various tasks. However,\ndeploying VLMs, comprising large-scale vision and language models poses\nchallenges in resource-constrained scenarios. While pruning followed by\nfinetuning offers a potential solution to maintain performance with smaller\nmodel sizes, its application to VLMs remains relatively unexplored, presenting\ntwo main questions: how to distribute sparsity across different\nmodality-specific models, and how to repair the performance of pruned sparse\nVLMs. To answer the first question, we conducted preliminary studies on VLM\npruning and found that pruning vision models and language models with the same\nsparsity ratios contribute to nearly optimal performance. For the second\nquestion, unlike finetuning unimodal sparse models, sparse VLMs involve\ncross-modality interactions, requiring specialized techniques for post-pruning\nperformance repair. Moreover, while parameter-efficient LoRA finetuning has\nbeen proposed to repair the performance of sparse models, a significant\nchallenge of weights merging arises due to the incompatibility of dense LoRA\nmodules with sparse models that destroy the sparsity of pruned models. To\ntackle these challenges, we propose to Repair Sparse Vision-Language Models via\nSparse Cross-modality Adaptation (RESSA). RESSA utilizes cross-modality\nfinetuning to enhance task-specific performance and facilitate knowledge\ndistillation from original dense models. Additionally, we introduce SparseLoRA,\nwhich applies sparsity directly to LoRA weights, enabling seamless integration\nwith sparse models. 
Our experimental results validate the effectiveness of\nRESSA, showcasing significant enhancements, such as an 11.3\\% improvement under\n2:4 sparsity and a remarkable 47.6\\% enhancement under unstructured 70\\%\nsparsity.\n","authors":["Shwai He","Tianlong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.02424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.15472v4","updated":"2024-04-03T03:15:55Z","published":"2022-06-30T17:59:08Z","title":"On-Device Training Under 256KB Memory","summary":" On-device training enables the model to adapt to new data collected from the\nsensors by fine-tuning a pre-trained model. Users can benefit from customized\nAI models without having to transfer the data to the cloud, protecting the\nprivacy. However, the training memory consumption is prohibitive for IoT\ndevices that have tiny memory resources. We propose an algorithm-system\nco-design framework to make on-device training possible with only 256KB of\nmemory. On-device training faces two unique challenges: (1) the quantized\ngraphs of neural networks are hard to optimize due to low bit-precision and the\nlack of normalization; (2) the limited hardware resource does not allow full\nback-propagation. To cope with the optimization difficulty, we propose\nQuantization-Aware Scaling to calibrate the gradient scales and stabilize 8-bit\nquantized training. To reduce the memory footprint, we propose Sparse Update to\nskip the gradient computation of less important layers and sub-tensors. The\nalgorithm innovation is implemented by a lightweight training system, Tiny\nTraining Engine, which prunes the backward computation graph to support sparse\nupdates and offload the runtime auto-differentiation to compile time. Our\nframework is the first solution to enable tiny on-device training of\nconvolutional neural networks under 256KB SRAM and 1MB Flash without auxiliary\nmemory, using less than 1/1000 of the memory of PyTorch and TensorFlow while\nmatching the accuracy on tinyML application VWW. Our study enables IoT devices\nnot only to perform inference but also to continuously adapt to new data for\non-device lifelong learning. A video demo can be found here:\nhttps://youtu.be/0pUFZYdoMY8.\n","authors":["Ji Lin","Ligeng Zhu","Wei-Ming Chen","Wei-Chen Wang","Chuang Gan","Song Han"],"pdf_url":"https://arxiv.org/pdf/2206.15472v4.pdf","comment":"NeurIPS 2022"},{"id":"http://arxiv.org/abs/2403.19428v2","updated":"2024-04-03T02:59:24Z","published":"2024-03-28T13:58:05Z","title":"Burst Super-Resolution with Diffusion Models for Improving Perceptual\n Quality","summary":" While burst LR images are useful for improving the SR image quality compared\nwith a single LR image, prior SR networks accepting the burst LR images are\ntrained in a deterministic manner, which is known to produce a blurry SR image.\nIn addition, it is difficult to perfectly align the burst LR images, making the\nSR image more blurry. Since such blurry images are perceptually degraded, we\naim to reconstruct the sharp high-fidelity boundaries. Such high-fidelity\nimages can be reconstructed by diffusion models. However, prior SR methods\nusing the diffusion model are not properly optimized for the burst SR task.\nSpecifically, the reverse process starting from a random sample is not\noptimized for image enhancement and restoration methods, including burst SR. In\nour proposed method, on the other hand, burst LR features are used to\nreconstruct the initial burst SR image that is fed into an intermediate step in\nthe diffusion model. 
This reverse process from the intermediate step 1) skips\ndiffusion steps for reconstructing the global structure of the image and 2)\nfocuses on steps for refining detailed textures. Our experimental results\ndemonstrate that our method can improve the scores of the perceptual quality\nmetrics. Code: https://github.com/placerkyo/BSRD\n","authors":["Kyotaro Tokoro","Kazutoshi Akita","Norimichi Ukita"],"pdf_url":"https://arxiv.org/pdf/2403.19428v2.pdf","comment":"Accepted to IJCNN 2024 (International Joint Conference on Neural\n Networks)"},{"id":"http://arxiv.org/abs/2403.19920v2","updated":"2024-04-03T02:48:47Z","published":"2024-03-29T02:17:09Z","title":"MI-NeRF: Learning a Single Face NeRF from Multiple Identities","summary":" In this work, we introduce a method that learns a single dynamic neural\nradiance field (NeRF) from monocular talking face videos of multiple\nidentities. NeRFs have shown remarkable results in modeling the 4D dynamics and\nappearance of human faces. However, they require per-identity optimization.\nAlthough recent approaches have proposed techniques to reduce the training and\nrendering time, increasing the number of identities can be expensive. We\nintroduce MI-NeRF (multi-identity NeRF), a single unified network that models\ncomplex non-rigid facial motion for multiple identities, using only monocular\nvideos of arbitrary length. The core premise in our method is to learn the\nnon-linear interactions between identity and non-identity specific information\nwith a multiplicative module. By training on multiple videos simultaneously,\nMI-NeRF not only reduces the total training time compared to standard\nsingle-identity NeRFs, but also demonstrates robustness in synthesizing novel\nexpressions for any input identity. We present results for both facial\nexpression transfer and talking face video synthesis. Our method can be further\npersonalized for a target identity given only a short video.\n","authors":["Aggelina Chatziagapi","Grigorios G. Chrysos","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2403.19920v2.pdf","comment":"Project page: https://aggelinacha.github.io/MI-NeRF/"},{"id":"http://arxiv.org/abs/2403.14530v2","updated":"2024-04-03T02:46:54Z","published":"2024-03-21T16:28:58Z","title":"HAC: Hash-grid Assisted Context for 3D Gaussian Splatting Compression","summary":" 3D Gaussian Splatting (3DGS) has emerged as a promising framework for novel\nview synthesis, boasting rapid rendering speed with high fidelity. However, the\nsubstantial Gaussians and their associated attributes necessitate effective\ncompression techniques. Nevertheless, the sparse and unorganized nature of the\npoint cloud of Gaussians (or anchors in our paper) presents challenges for\ncompression. To address this, we make use of the relations between the\nunorganized anchors and the structured hash grid, leveraging their mutual\ninformation for context modeling, and propose a Hash-grid Assisted Context\n(HAC) framework for highly compact 3DGS representation. Our approach introduces\na binary hash grid to establish continuous spatial consistencies, allowing us\nto unveil the inherent spatial relations of anchors through a carefully\ndesigned context model. To facilitate entropy coding, we utilize Gaussian\ndistributions to accurately estimate the probability of each quantized\nattribute, where an adaptive quantization module is proposed to enable\nhigh-precision quantization of these attributes for improved fidelity\nrestoration. 
Additionally, we incorporate an adaptive masking strategy to\neliminate invalid Gaussians and anchors. Importantly, our work is the pioneer\nto explore context-based compression for 3DGS representation, resulting in a\nremarkable size reduction of over $75\\times$ compared to vanilla 3DGS, while\nsimultaneously improving fidelity, and achieving over $11\\times$ size reduction\nover SOTA 3DGS compression approach Scaffold-GS. Our code is available here:\nhttps://github.com/YihangChen-ee/HAC\n","authors":["Yihang Chen","Qianyi Wu","Jianfei Cai","Mehrtash Harandi","Weiyao Lin"],"pdf_url":"https://arxiv.org/pdf/2403.14530v2.pdf","comment":"Project Page: https://yihangchen-ee.github.io/project_hac/ Code:\n https://github.com/YihangChen-ee/HAC"},{"id":"http://arxiv.org/abs/2404.02415v1","updated":"2024-04-03T02:40:35Z","published":"2024-04-03T02:40:35Z","title":"What Are We Measuring When We Evaluate Large Vision-Language Models? An\n Analysis of Latent Factors and Biases","summary":" Vision-language (VL) models, pretrained on colossal image-text datasets, have\nattained broad VL competence that is difficult to evaluate. A common belief is\nthat a small number of VL skills underlie the variety of VL tests. In this\npaper, we perform a large-scale transfer learning experiment aimed at\ndiscovering latent VL skills from data. We reveal interesting characteristics\nthat have important implications for test suite design. First, generation tasks\nsuffer from a length bias, suggesting benchmarks should balance tasks with\nvarying output lengths. Second, we demonstrate that factor analysis\nsuccessfully identifies reasonable yet surprising VL skill factors, suggesting\nbenchmarks could leverage similar analyses for task selection. Finally, we\npresent a new dataset, OLIVE (https://github.com/jq-zh/olive-dataset), which\nsimulates user instructions in the wild and presents challenges dissimilar to\nall datasets we tested. Our findings contribute to the design of balanced and\nbroad-coverage vision-language evaluation methods.\n","authors":["Anthony Meng Huat Tiong","Junqi Zhao","Boyang Li","Junnan Li","Steven C. H. Hoi","Caiming Xiong"],"pdf_url":"https://arxiv.org/pdf/2404.02415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.07935v2","updated":"2024-04-03T02:27:59Z","published":"2022-04-17T05:42:41Z","title":"Causal Intervention for Subject-Deconfounded Facial Action Unit\n Recognition","summary":" Subject-invariant facial action unit (AU) recognition remains challenging for\nthe reason that the data distribution varies among subjects. In this paper, we\npropose a causal inference framework for subject-invariant facial action unit\nrecognition. To illustrate the causal effect existing in AU recognition task,\nwe formulate the causalities among facial images, subjects, latent AU semantic\nrelations, and estimated AU occurrence probabilities via a structural causal\nmodel. By constructing such a causal diagram, we clarify the causal effect\namong variables and propose a plug-in causal intervention module, CIS, to\ndeconfound the confounder \\emph{Subject} in the causal diagram. 
Extensive\nexperiments conducted on two commonly used AU benchmark datasets, BP4D and\nDISFA, show the effectiveness of our CIS, and the model with CIS inserted,\nCISNet, has achieved state-of-the-art performance.\n","authors":["Yingjie Chen","Diqi Chen","Tao Wang","Yizhou Wang","Yun Liang"],"pdf_url":"https://arxiv.org/pdf/2204.07935v2.pdf","comment":"Accepted by AAAI2022"},{"id":"http://arxiv.org/abs/2404.02410v1","updated":"2024-04-03T02:26:15Z","published":"2024-04-03T02:26:15Z","title":"TCLC-GS: Tightly Coupled LiDAR-Camera Gaussian Splatting for Surrounding\n Autonomous Driving Scenes","summary":" Most 3D Gaussian Splatting (3D-GS) based methods for urban scenes initialize\n3D Gaussians directly with 3D LiDAR points, which not only underutilizes LiDAR\ndata capabilities but also overlooks the potential advantages of fusing LiDAR\nwith camera data. In this paper, we design a novel tightly coupled LiDAR-Camera\nGaussian Splatting (TCLC-GS) to fully leverage the combined strengths of both\nLiDAR and camera sensors, enabling rapid, high-quality 3D reconstruction and\nnovel view RGB/depth synthesis. TCLC-GS designs a hybrid explicit (colorized 3D\nmesh) and implicit (hierarchical octree feature) 3D representation derived from\nLiDAR-camera data, to enrich the properties of 3D Gaussians for splatting. 3D\nGaussian's properties are not only initialized in alignment with the 3D mesh\nwhich provides more completed 3D shape and color information, but are also\nendowed with broader contextual information through retrieved octree implicit\nfeatures. During the Gaussian Splatting optimization process, the 3D mesh\noffers dense depth information as supervision, which enhances the training\nprocess by learning of a robust geometry. Comprehensive evaluations conducted\non the Waymo Open Dataset and nuScenes Dataset validate our method's\nstate-of-the-art (SOTA) performance. Utilizing a single NVIDIA RTX 3090 Ti, our\nmethod demonstrates fast training and achieves real-time RGB and depth\nrendering at 90 FPS in resolution of 1920x1280 (Waymo), and 120 FPS in\nresolution of 1600x900 (nuScenes) in urban scenarios.\n","authors":["Cheng Zhao","Su Sun","Ruoyu Wang","Yuliang Guo","Jun-Jun Wan","Zhou Huang","Xinyu Huang","Yingjie Victor Chen","Liu Ren"],"pdf_url":"https://arxiv.org/pdf/2404.02410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02405v1","updated":"2024-04-03T02:16:30Z","published":"2024-04-03T02:16:30Z","title":"TE-TAD: Towards Full End-to-End Temporal Action Detection via\n Time-Aligned Coordinate Expression","summary":" In this paper, we investigate that the normalized coordinate expression is a\nkey factor as reliance on hand-crafted components in query-based detectors for\ntemporal action detection (TAD). Despite significant advancements towards an\nend-to-end framework in object detection, query-based detectors have been\nlimited in achieving full end-to-end modeling in TAD. To address this issue, we\npropose \\modelname{}, a full end-to-end temporal action detection transformer\nthat integrates time-aligned coordinate expression. We reformulate coordinate\nexpression utilizing actual timeline values, ensuring length-invariant\nrepresentations from the extremely diverse video duration environment.\nFurthermore, our proposed adaptive query selection dynamically adjusts the\nnumber of queries based on video length, providing a suitable solution for\nvarying video durations compared to a fixed query set. 
Our approach not only\nsimplifies the TAD process by eliminating the need for hand-crafted components\nbut also significantly improves the performance of query-based detectors. Our\nTE-TAD outperforms the previous query-based detectors and achieves competitive\nperformance compared to state-of-the-art methods on popular benchmark datasets.\nCode is available at: https://github.com/Dotori-HJ/TE-TAD\n","authors":["Ho-Joong Kim","Jung-Ho Hong","Heejon Kong","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2404.02405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02396v1","updated":"2024-04-03T01:55:15Z","published":"2024-04-03T01:55:15Z","title":"Enhancing Diffusion-based Point Cloud Generation with Smoothness\n Constraint","summary":" Diffusion models have been popular for point cloud generation tasks. Existing\nworks utilize the forward diffusion process to convert the original point\ndistribution into a noise distribution and then learn the reverse diffusion\nprocess to recover the point distribution from the noise distribution. However,\nthe reverse diffusion process can produce samples with non-smooth points on the\nsurface because of the ignorance of the point cloud geometric properties. We\npropose alleviating the problem by incorporating the local smoothness\nconstraint into the diffusion framework for point cloud generation. Experiments\ndemonstrate the proposed model can generate realistic shapes and smoother point\nclouds, outperforming multiple state-of-the-art methods.\n","authors":["Yukun Li","Liping Liu"],"pdf_url":"https://arxiv.org/pdf/2404.02396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02394v1","updated":"2024-04-03T01:36:27Z","published":"2024-04-03T01:36:27Z","title":"Cohort-Individual Cooperative Learning for Multimodal Cancer Survival\n Analysis","summary":" Recently, we have witnessed impressive achievements in cancer survival\nanalysis by integrating multimodal data, e.g., pathology images and genomic\nprofiles. However, the heterogeneity and high dimensionality of these\nmodalities pose significant challenges for extracting discriminative\nrepresentations while maintaining good generalization. In this paper, we\npropose a Cohort-individual Cooperative Learning (CCL) framework to advance\ncancer survival analysis by collaborating knowledge decomposition and cohort\nguidance. Specifically, first, we propose a Multimodal Knowledge Decomposition\n(MKD) module to explicitly decompose multimodal knowledge into four distinct\ncomponents: redundancy, synergy and uniqueness of the two modalities. Such a\ncomprehensive decomposition can enlighten the models to perceive easily\noverlooked yet important information, facilitating an effective multimodal\nfusion. Second, we propose a Cohort Guidance Modeling (CGM) to mitigate the\nrisk of overfitting task-irrelevant information. It can promote a more\ncomprehensive and robust understanding of the underlying multimodal data, while\navoiding the pitfalls of overfitting and enhancing the generalization ability\nof the model. By cooperating the knowledge decomposition and cohort guidance\nmethods, we develop a robust multimodal survival analysis model with enhanced\ndiscrimination and generalization abilities. 
Extensive experimental results on\nfive cancer datasets demonstrate the effectiveness of our model in integrating\nmultimodal data for survival analysis.\n","authors":["Huajun Zhou","Fengtao Zhou","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.02394v1.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.02391v1","updated":"2024-04-03T01:29:30Z","published":"2024-04-03T01:29:30Z","title":"APC2Mesh: Bridging the gap from occluded building façades to full 3D\n models","summary":" The benefits of having digital twins of urban buildings are numerous.\nHowever, a major difficulty encountered in their creation from airborne LiDAR\npoint clouds is the effective means of accurately reconstructing significant\nocclusions amidst point density variations and noise. To bridge the\nnoise/sparsity/occlusion gap and generate high fidelity 3D building models, we\npropose APC2Mesh which integrates point completion into a 3D reconstruction\npipeline, enabling the learning of dense geometrically accurate representation\nof buildings. Specifically, we leveraged complete points generated from\noccluded ones as input to a linearized skip attention-based deformation network\nfor 3D mesh reconstruction. In our experiments, conducted on 3 different\nscenes, we demonstrate that: (1) APC2Mesh delivers comparatively superior\nresults, indicating its efficacy in handling the challenges of occluded\nairborne building points of diverse styles and complexities. (2) The\ncombination of point completion with typical deep learning-based 3D point cloud\nreconstruction methods offers a direct and effective solution for\nreconstructing significantly occluded airborne building points. As such, this\nneural integration holds promise for advancing the creation of digital twins\nfor urban buildings with greater accuracy and fidelity.\n","authors":["Perpetual Hope Akwensi","Akshay Bharadwaj","Ruisheng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02391v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2404.02388v1","updated":"2024-04-03T01:13:05Z","published":"2024-04-03T01:13:05Z","title":"CAPE: CAM as a Probabilistic Ensemble for Enhanced DNN Interpretation","summary":" Deep Neural Networks (DNNs) are widely used for visual classification tasks,\nbut their complex computation process and black-box nature hinder decision\ntransparency and interpretability. Class activation maps (CAMs) and recent\nvariants provide ways to visually explain the DNN decision-making process by\ndisplaying 'attention' heatmaps of the DNNs. Nevertheless, the CAM explanation\nonly offers relative attention information, that is, on an attention heatmap,\nwe can interpret which image region is more or less important than the others.\nHowever, these regions cannot be meaningfully compared across classes, and the\ncontribution of each region to the model's class prediction is not revealed. To\naddress these challenges that ultimately lead to better DNN Interpretation, in\nthis paper, we propose CAPE, a novel reformulation of CAM that provides a\nunified and probabilistically meaningful assessment of the contributions of\nimage regions. We quantitatively and qualitatively compare CAPE with\nstate-of-the-art CAM methods on CUB and ImageNet benchmark datasets to\ndemonstrate enhanced interpretability. We also test on a cytology imaging\ndataset depicting a challenging Chronic Myelomonocytic Leukemia (CMML)\ndiagnosis problem. 
Code is available at: https://github.com/AIML-MED/CAPE.\n","authors":["Townim Faisal Chowdhury","Kewen Liao","Vu Minh Hieu Phan","Minh-Son To","Yutong Xie","Kevin Hung","David Ross","Anton van den Hengel","Johan W. Verjans","Zhibin Liao"],"pdf_url":"https://arxiv.org/pdf/2404.02388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02370v1","updated":"2024-04-03T00:09:05Z","published":"2024-04-03T00:09:05Z","title":"Enhancing Human-Computer Interaction in Chest X-ray Analysis using\n Vision and Language Model with Eye Gaze Patterns","summary":" Recent advancements in Computer Assisted Diagnosis have shown promising\nperformance in medical imaging tasks, particularly in chest X-ray analysis.\nHowever, the interaction between these models and radiologists has been\nprimarily limited to input images. This work proposes a novel approach to\nenhance human-computer interaction in chest X-ray analysis using\nVision-Language Models (VLMs) enhanced with radiologists' attention by\nincorporating eye gaze data alongside textual prompts. Our approach leverages\nheatmaps generated from eye gaze data, overlaying them onto medical images to\nhighlight areas of intense radiologist's focus during chest X-ray evaluation.\nWe evaluate this methodology in tasks such as visual question answering, chest\nX-ray report automation, error detection, and differential diagnosis. Our\nresults demonstrate the inclusion of eye gaze information significantly\nenhances the accuracy of chest X-ray analysis. Also, the impact of eye gaze on\nfine-tuning was confirmed as it outperformed other medical VLMs in all tasks\nexcept visual question answering. This work marks the potential of leveraging\nboth the VLM's capabilities and the radiologist's domain knowledge to improve\nthe capabilities of AI models in medical imaging, paving a novel way for\nComputer Assisted Diagnosis with a human-centred AI.\n","authors":["Yunsoo Kim","Jinge Wu","Yusuf Abdulle","Yue Gao","Honghan Wu"],"pdf_url":"https://arxiv.org/pdf/2404.02370v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.03121v1","updated":"2024-04-03T23:59:59Z","published":"2024-04-03T23:59:59Z","title":"Utilizing Computer Vision for Continuous Monitoring of Vaccine Side\n Effects in Experimental Mice","summary":" The demand for improved efficiency and accuracy in vaccine safety assessments\nis increasing. Here, we explore the application of computer vision technologies\nto automate the monitoring of experimental mice for potential side effects\nafter vaccine administration. Traditional observation methods are\nlabor-intensive and lack the capability for continuous monitoring. By deploying\na computer vision system, our research aims to improve the efficiency and\naccuracy of vaccine safety assessments. The methodology involves training\nmachine learning models on annotated video data of mice behaviors pre- and\npost-vaccination. Preliminary results indicate that computer vision effectively\nidentify subtle changes, signaling possible side effects. 
Therefore, our\napproach has the potential to significantly enhance the monitoring process in\nvaccine trials in animals, providing a practical solution to the limitations of\nhuman observation.\n","authors":["Chuang Li","Shuai Shao","Willian Mikason","Rubing Lin","Yantong Liu"],"pdf_url":"https://arxiv.org/pdf/2404.03121v1.pdf","comment":"1 figure"},{"id":"http://arxiv.org/abs/2404.03118v1","updated":"2024-04-03T23:57:34Z","published":"2024-04-03T23:57:34Z","title":"LVLM-Intrepret: An Interpretability Tool for Large Vision-Language\n Models","summary":" In the rapidly evolving landscape of artificial intelligence, multi-modal\nlarge language models are emerging as a significant area of interest. These\nmodels, which combine various forms of data input, are becoming increasingly\npopular. However, understanding their internal mechanisms remains a complex\ntask. Numerous advancements have been made in the field of explainability tools\nand mechanisms, yet there is still much to explore. In this work, we present a\nnovel interactive application aimed towards understanding the internal\nmechanisms of large vision-language models. Our interface is designed to\nenhance the interpretability of the image patches, which are instrumental in\ngenerating an answer, and assess the efficacy of the language model in\ngrounding its output in the image. With our application, a user can\nsystematically investigate the model and uncover system limitations, paving the\nway for enhancements in system capabilities. Finally, we present a case study\nof how our application can aid in understanding failure mechanisms in a popular\nlarge multi-modal model: LLaVA.\n","authors":["Gabriela Ben Melech Stan","Raanan Yehezkel Rohekar","Yaniv Gurwicz","Matthew Lyle Olson","Anahita Bhiwandiwalla","Estelle Aflalo","Chenfei Wu","Nan Duan","Shao-Yen Tseng","Vasudev Lal"],"pdf_url":"https://arxiv.org/pdf/2404.03118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03110v1","updated":"2024-04-03T23:24:25Z","published":"2024-04-03T23:24:25Z","title":"Ego-Motion Aware Target Prediction Module for Robust Multi-Object\n Tracking","summary":" Multi-object tracking (MOT) is a prominent task in computer vision with\napplication in autonomous driving, responsible for the simultaneous tracking of\nmultiple object trajectories. Detection-based multi-object tracking (DBT)\nalgorithms detect objects using an independent object detector and predict the\nimminent location of each target. Conventional prediction methods in DBT\nutilize Kalman Filter(KF) to extrapolate the target location in the upcoming\nframes by supposing a constant velocity motion model. These methods are\nespecially hindered in autonomous driving applications due to dramatic camera\nmotion or unavailable detections. Such limitations lead to tracking failures\nmanifested by numerous identity switches and disrupted trajectories. In this\npaper, we introduce a novel KF-based prediction module called the Ego-motion\nAware Target Prediction (EMAP) module by focusing on the integration of camera\nmotion and depth information with object motion models. Our proposed method\ndecouples the impact of camera rotational and translational velocity from the\nobject trajectories by reformulating the Kalman Filter. This reformulation\nenables us to reject the disturbances caused by camera motion and maximizes the\nreliability of the object motion model. We integrate our module with four\nstate-of-the-art base MOT algorithms, namely OC-SORT, Deep OC-SORT, ByteTrack,\nand BoT-SORT. 
In particular, our evaluation on the KITTI MOT dataset\ndemonstrates that EMAP remarkably drops the number of identity switches (IDSW)\nof OC-SORT and Deep OC-SORT by 73% and 21%, respectively. At the same time, it\nelevates other performance metrics such as HOTA by more than 5%. Our source\ncode is available at https://github.com/noyzzz/EMAP.\n","authors":["Navid Mahdian","Mohammad Jani","Amir M. Soufi Enayati","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2404.03110v1.pdf","comment":"7 pages, 4 figures, submitted to IROS2024"},{"id":"http://arxiv.org/abs/2404.03109v1","updated":"2024-04-03T23:20:40Z","published":"2024-04-03T23:20:40Z","title":"Many-to-many Image Generation with Auto-regressive Diffusion Models","summary":" Recent advancements in image generation have made significant progress, yet\nexisting models present limitations in perceiving and generating an arbitrary\nnumber of interrelated images within a broad context. This limitation becomes\nincreasingly critical as the demand for multi-image scenarios, such as\nmulti-view images and visual narratives, grows with the expansion of multimedia\nplatforms. This paper introduces a domain-general framework for many-to-many\nimage generation, capable of producing interrelated image series from a given\nset of images, offering a scalable solution that obviates the need for\ntask-specific solutions across different multi-image scenarios. To facilitate\nthis, we present MIS, a novel large-scale multi-image dataset, containing 12M\nsynthetic multi-image samples, each with 25 interconnected images. Utilizing\nStable Diffusion with varied latent noises, our method produces a set of\ninterconnected images from a single caption. Leveraging MIS, we learn M2M, an\nautoregressive model for many-to-many generation, where each image is modeled\nwithin a diffusion framework. Throughout training on the synthetic MIS, the\nmodel excels in capturing style and content from preceding images - synthetic\nor real - and generates novel images following the captured patterns.\nFurthermore, through task-specific fine-tuning, our model demonstrates its\nadaptability to various multi-image generation tasks, including Novel View\nSynthesis and Visual Procedure Generation.\n","authors":["Ying Shen","Yizhe Zhang","Shuangfei Zhai","Lifu Huang","Joshua M. Susskind","Jiatao Gu"],"pdf_url":"https://arxiv.org/pdf/2404.03109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03097v1","updated":"2024-04-03T22:38:54Z","published":"2024-04-03T22:38:54Z","title":"SalFoM: Dynamic Saliency Prediction with Video Foundation Models","summary":" Recent advancements in video saliency prediction (VSP) have shown promising\nperformance compared to the human visual system, whose emulation is the primary\ngoal of VSP. However, current state-of-the-art models employ spatio-temporal\ntransformers trained on limited amounts of data, hindering generalizability\nadaptation to downstream tasks. The benefits of vision foundation models\npresent a potential solution to improve the VSP process. However, adapting\nimage foundation models to the video domain presents significant challenges in\nmodeling scene dynamics and capturing temporal information. To address these\nchallenges, and as the first initiative to design a VSP model based on video\nfoundation models, we introduce SalFoM, a novel encoder-decoder video\ntransformer architecture. 
Our model employs UnMasked Teacher (UMT) as feature\nextractor and presents a heterogeneous decoder which features a locality-aware\nspatio-temporal transformer and integrates local and global spatio-temporal\ninformation from various perspectives to produce the final saliency map. Our\nqualitative and quantitative experiments on the challenging VSP benchmark\ndatasets of DHF1K, Hollywood-2 and UCF-Sports demonstrate the superiority of\nour proposed model in comparison with the state-of-the-art methods.\n","authors":["Morteza Moradi","Mohammad Moradi","Francesco Rundo","Concetto Spampinato","Ali Borji","Simone Palazzo"],"pdf_url":"https://arxiv.org/pdf/2404.03097v1.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.02460v2","updated":"2024-04-03T22:23:25Z","published":"2024-01-04T08:39:13Z","title":"Improved Zero-Shot Classification by Adapting VLMs with Text\n Descriptions","summary":" The zero-shot performance of existing vision-language models (VLMs) such as\nCLIP is limited by the availability of large-scale, aligned image and text\ndatasets in specific domains. In this work, we leverage two complementary\nsources of information -- descriptions of categories generated by large\nlanguage models (LLMs) and abundant, fine-grained image classification datasets\n-- to improve the zero-shot classification performance of VLMs across\nfine-grained domains. On the technical side, we develop methods to train VLMs\nwith this \"bag-level\" image-text supervision. We find that simply using these\nattributes at test-time does not improve performance, but our training\nstrategy, for example, on the iNaturalist dataset, leads to an average\nimprovement of 4-5% in zero-shot classification accuracy for novel categories\nof birds and flowers. Similar improvements are observed in domains where a\nsubset of the categories was used to fine-tune the model. By prompting LLMs in\nvarious ways, we generate descriptions that capture visual appearance, habitat,\nand geographic regions and pair them with existing attributes such as the\ntaxonomic structure of the categories. We systematically evaluate their ability\nto improve zero-shot categorization in natural domains. Our findings suggest\nthat geographic priors can be just as effective and are complementary to visual\nappearance. Our method also outperforms prior work on prompt-based tuning of\nVLMs. We release the benchmark, consisting of 14 datasets at\nhttps://github.com/cvl-umass/AdaptCLIPZS , which will contribute to future\nresearch in zero-shot recognition.\n","authors":["Oindrila Saha","Grant Van Horn","Subhransu Maji"],"pdf_url":"https://arxiv.org/pdf/2401.02460v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01988v2","updated":"2024-04-03T21:47:52Z","published":"2024-04-02T14:26:18Z","title":"Cooperative Students: Navigating Unsupervised Domain Adaptation in\n Nighttime Object Detection","summary":" Unsupervised Domain Adaptation (UDA) has shown significant advancements in\nobject detection under well-lit conditions; however, its performance degrades\nnotably in low-visibility scenarios, especially at night, posing challenges not\nonly for its adaptability in low signal-to-noise ratio (SNR) conditions but\nalso for the reliability and efficiency of automated vehicles. 
To address this\nproblem, we propose a \\textbf{Co}operative \\textbf{S}tudents (\\textbf{CoS})\nframework that innovatively employs global-local transformations (GLT) and a\nproxy-based target consistency (PTC) mechanism to capture the spatial\nconsistency in day- and night-time scenarios effectively, and thus bridge the\nsignificant domain shift across contexts. Building upon this, we further devise\nan adaptive IoU-informed thresholding (AIT) module to gradually avoid\noverlooking potential true positives and enrich the latent information in the\ntarget domain. Comprehensive experiments show that CoS essentially enhanced UDA\nperformance in low-visibility conditions and surpasses current state-of-the-art\ntechniques, achieving an increase in mAP of 3.0\\%, 1.9\\%, and 2.5\\% on BDD100K,\nSHIFT, and ACDC datasets, respectively. Code is available at\nhttps://github.com/jichengyuan/Cooperitive_Students.\n","authors":["Jicheng Yuan","Anh Le-Tuan","Manfred Hauswirth","Danh Le-Phuoc"],"pdf_url":"https://arxiv.org/pdf/2404.01988v2.pdf","comment":"Code is available at\n https://github.com/jichengyuan/Cooperitive_Students"},{"id":"http://arxiv.org/abs/2404.03070v1","updated":"2024-04-03T21:18:27Z","published":"2024-04-03T21:18:27Z","title":"Behind the Veil: Enhanced Indoor 3D Scene Reconstruction with Occluded\n Surfaces Completion","summary":" In this paper, we present a novel indoor 3D reconstruction method with\noccluded surface completion, given a sequence of depth readings. Prior\nstate-of-the-art (SOTA) methods only focus on the reconstruction of the visible\nareas in a scene, neglecting the invisible areas due to the occlusions, e.g.,\nthe contact surface between furniture, occluded wall and floor. Our method\ntackles the task of completing the occluded scene surfaces, resulting in a\ncomplete 3D scene mesh. The core idea of our method is learning 3D geometry\nprior from various complete scenes to infer the occluded geometry of an unseen\nscene from solely depth measurements. We design a coarse-fine hierarchical\noctree representation coupled with a dual-decoder architecture, i.e.,\nGeo-decoder and 3D Inpainter, which jointly reconstructs the complete 3D scene\ngeometry. The Geo-decoder with detailed representation at fine levels is\noptimized online for each scene to reconstruct visible surfaces. The 3D\nInpainter with abstract representation at coarse levels is trained offline\nusing various scenes to complete occluded surfaces. As a result, while the\nGeo-decoder is specialized for an individual scene, the 3D Inpainter can be\ngenerally applied across different scenes. We evaluate the proposed method on\nthe 3D Completed Room Scene (3D-CRS) and iTHOR datasets, significantly\noutperforming the SOTA methods by a gain of 16.8% and 24.2% in terms of the\ncompleteness of 3D reconstruction. 3D-CRS dataset including a complete 3D mesh\nof each scene is provided at project webpage.\n","authors":["Su Sun","Cheng Zhao","Yuliang Guo","Ruoyu Wang","Xinyu Huang","Yingjie Victor Chen","Liu Ren"],"pdf_url":"https://arxiv.org/pdf/2404.03070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03067v1","updated":"2024-04-03T21:16:19Z","published":"2024-04-03T21:16:19Z","title":"Self-supervised 6-DoF Robot Grasping by Demonstration via Augmented\n Reality Teleoperation System","summary":" Most existing 6-DoF robot grasping solutions depend on strong supervision on\ngrasp pose to ensure satisfactory performance, which could be laborious and\nimpractical when the robot works in some restricted area. 
To this end, we\npropose a self-supervised 6-DoF grasp pose detection framework via an Augmented\nReality (AR) teleoperation system that can efficiently learn human\ndemonstrations and provide 6-DoF grasp poses without grasp pose annotations.\nSpecifically, the system collects the human demonstration from the AR\nenvironment and contrastively learns the grasping strategy from the\ndemonstration. For the real-world experiment, the proposed system leads to\nsatisfactory grasping abilities and learning to grasp unknown objects within\nthree demonstrations.\n","authors":["Xiwen Dengxiong","Xueting Wang","Shi Bai","Yunbo Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.03067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03043v1","updated":"2024-04-03T20:05:00Z","published":"2024-04-03T20:05:00Z","title":"Linear Anchored Gaussian Mixture Model for Location and Width\n Computation of Objects in Thick Line Shape","summary":" An accurate detection of the centerlines of linear objects is a challenging\ntopic in many sensitive real-world applications such as X-ray imaging, remote\nsensing and lane marking detection in road traffic. Model-based approaches\nusing Hough and Radon transforms are often used but are not recommended for\nthick line detection, whereas approaches based on image derivatives need\nfurther step-by-step processing, making their efficiency dependent on each step\noutcomes. In this paper, we aim to detect linear structures found in images by\nconsidering the 3D representation of the image gray levels as a finite mixture\nmodel of statistical distribution. The latter, which we named linear anchored\nGaussian distribution could be parametrized by a scale value {\\sigma}\ndescribing the linear structure thickness and a line equation, parametrized, in\nturn, by a radius \\r{ho} and an orientation angle {\\theta}, describing the\nlinear structure centerline location. Expectation-Maximization (EM) algorithm\nis used for the mixture model parameter estimation, where a new paradigm, using\nthe background subtraction for the likelihood function computation, is\nproposed. For the EM algorithm, two {\\theta} parameter initialization schemes\nare used: the first one is based on a random choice of the first component of\n{\\theta} vector, whereas the second is based on the image Hessian with a\nsimultaneous computation of the mixture model components number. Experiments on\nreal world images and synthetic images corrupted by blur and additive noise\nshow the good performance of the proposed methods, where the algorithm using\nbackground subtraction and Hessian-based {\\theta} initialization provides an\noutstanding accuracy of the linear structure detection despite irregular image\nbackground and presence of blur and noise.\n","authors":["Nafaa Nacereddine","Djemel Ziou","Aicha Baya Goumeidane"],"pdf_url":"https://arxiv.org/pdf/2404.03043v1.pdf","comment":"13 pages, 13 figures"},{"id":"http://arxiv.org/abs/2311.05698v3","updated":"2024-04-03T20:04:49Z","published":"2023-11-09T19:15:12Z","title":"Mirasol3B: A Multimodal Autoregressive model for time-aligned and\n contextual modalities","summary":" One of the main challenges of multimodal learning is the need to combine\nheterogeneous modalities (e.g., video, audio, text). For example, video and\naudio are obtained at much higher rates than text and are roughly aligned in\ntime. They are often not synchronized with text, which comes as a global\ncontext, e.g., a title, or a description. 
Furthermore, video and audio inputs\nare of much larger volumes, and grow as the video length increases, which\nnaturally requires more compute dedicated to these modalities and makes\nmodeling of long-range dependencies harder.\n We here decouple the multimodal modeling, dividing it into separate, focused\nautoregressive models, processing the inputs according to the characteristics\nof the modalities. We propose a multimodal model, called Mirasol3B, consisting\nof an autoregressive component for the time-synchronized modalities (audio and\nvideo), and an autoregressive component for the context modalities which are\nnot necessarily aligned in time but are still sequential. To address the\nlong-sequences of the video-audio inputs, we propose to further partition the\nvideo and audio sequences in consecutive snippets and autoregressively process\ntheir representations. To that end, we propose a Combiner mechanism, which\nmodels the audio-video information jointly within a timeframe. The Combiner\nlearns to extract audio and video features from raw spatio-temporal signals,\nand then learns to fuse these features producing compact but expressive\nrepresentations per snippet.\n Our approach achieves the state-of-the-art on well established multimodal\nbenchmarks, outperforming much larger models. It effectively addresses the high\ncomputational demand of media inputs by both learning compact representations,\ncontrolling the sequence length of the audio-video feature representations, and\nmodeling their dependencies in time.\n","authors":["AJ Piergiovanni","Isaac Noble","Dahun Kim","Michael S. Ryoo","Victor Gomes","Anelia Angelova"],"pdf_url":"https://arxiv.org/pdf/2311.05698v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03042v1","updated":"2024-04-03T20:04:44Z","published":"2024-04-03T20:04:44Z","title":"AWOL: Analysis WithOut synthesis using Language","summary":" Many classical parametric 3D shape models exist, but creating novel shapes\nwith such models requires expert knowledge of their parameters. For example,\nimagine creating a specific type of tree using procedural graphics or a new\nkind of animal from a statistical shape model. Our key idea is to leverage\nlanguage to control such existing models to produce novel shapes. This involves\nlearning a mapping between the latent space of a vision-language model and the\nparameter space of the 3D model, which we do using a small set of shape and\ntext pairs. Our hypothesis is that mapping from language to parameters allows\nus to generate parameters for objects that were never seen during training. If\nthe mapping between language and parameters is sufficiently smooth, then\ninterpolation or generalization in language should translate appropriately into\nnovel 3D shapes. We test our approach with two very different types of\nparametric shape models (quadrupeds and arboreal trees). We use a learned\nstatistical shape model of quadrupeds and show that we can use text to generate\nnew animals not present during training. In particular, we demonstrate\nstate-of-the-art shape estimation of 3D dogs. This work also constitutes the\nfirst language-driven method for generating 3D trees. Finally, embedding images\nin the CLIP latent space enables us to generate animals and trees directly from\nimages.\n","authors":["Silvia Zuffi","Michael J. 
Black"],"pdf_url":"https://arxiv.org/pdf/2404.03042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02132v2","updated":"2024-04-03T19:45:02Z","published":"2024-04-02T17:40:29Z","title":"ViTamin: Designing Scalable Vision Models in the Vision-Language Era","summary":" Recent breakthroughs in vision-language models (VLMs) start a new page in the\nvision community. The VLMs provide stronger and more generalizable feature\nembeddings compared to those from ImageNet-pretrained models, thanks to the\ntraining on the large-scale Internet image-text pairs. However, despite the\namazing achievement from the VLMs, vanilla Vision Transformers (ViTs) remain\nthe default choice for the image encoder. Although pure transformer proves its\neffectiveness in the text encoding area, it remains questionable whether it is\nalso the case for image encoding, especially considering that various types of\nnetworks are proposed on the ImageNet benchmark, which, unfortunately, are\nrarely studied in VLMs. Due to small data/model scale, the original conclusions\nof model design on ImageNet can be limited and biased. In this paper, we aim at\nbuilding an evaluation protocol of vision models in the vision-language era\nunder the contrastive language-image pretraining (CLIP) framework. We provide a\ncomprehensive way to benchmark different vision models, covering their\nzero-shot performance and scalability in both model and training data sizes. To\nthis end, we introduce ViTamin, a new vision models tailored for VLMs.\nViTamin-L significantly outperforms ViT-L by 2.0% ImageNet zero-shot accuracy,\nwhen using the same publicly available DataComp-1B dataset and the same\nOpenCLIP training scheme. ViTamin-L presents promising results on 60 diverse\nbenchmarks, including classification, retrieval, open-vocabulary detection and\nsegmentation, and large multi-modal models. When further scaling up the model\nsize, our ViTamin-XL with only 436M parameters attains 82.9% ImageNet zero-shot\naccuracy, surpassing 82.0% achieved by EVA-E that has ten times more parameters\n(4.4B).\n","authors":["Jieneng Chen","Qihang Yu","Xiaohui Shen","Alan Yuille","Liang-Chieh Chen"],"pdf_url":"https://arxiv.org/pdf/2404.02132v2.pdf","comment":"CVPR 2024; https://github.com/Beckschen/ViTamin"},{"id":"http://arxiv.org/abs/2404.03022v1","updated":"2024-04-03T19:17:43Z","published":"2024-04-03T19:17:43Z","title":"BCAmirs at SemEval-2024 Task 4: Beyond Words: A Multimodal and\n Multilingual Exploration of Persuasion in Memes","summary":" Memes, combining text and images, frequently use metaphors to convey\npersuasive messages, shaping public opinion. Motivated by this, our team\nengaged in SemEval-2024 Task 4, a hierarchical multi-label classification task\ndesigned to identify rhetorical and psychological persuasion techniques\nembedded within memes. To tackle this problem, we introduced a caption\ngeneration step to assess the modality gap and the impact of additional\nsemantic information from images, which improved our result. Our best model\nutilizes GPT-4 generated captions alongside meme text to fine-tune RoBERTa as\nthe text encoder and CLIP as the image encoder. It outperforms the baseline by\na large margin in all 12 subtasks. In particular, it ranked in top-3 across all\nlanguages in Subtask 2a, and top-4 in Subtask 2b, demonstrating quantitatively\nstrong performance. The improvement achieved by the introduced intermediate\nstep is likely attributable to the metaphorical essence of images that\nchallenges visual encoders. 
This highlights the potential for improving\nabstract visual semantics encoding.\n","authors":["Amirhossein Abaskohi","Amirhossein Dabiriaghdam","Lele Wang","Giuseppe Carenini"],"pdf_url":"https://arxiv.org/pdf/2404.03022v1.pdf","comment":"11 pages, 5 tables, 2 figures, Proceedings of the 18th International\n Workshop on Semantic Evaluation (SemEval-2024) @ NAACL 2024"},{"id":"http://arxiv.org/abs/2306.08103v4","updated":"2024-04-03T19:16:02Z","published":"2023-06-13T19:48:56Z","title":"Generating Images with 3D Annotations Using Diffusion Models","summary":" Diffusion models have emerged as a powerful generative method, capable of\nproducing stunning photo-realistic images from natural language descriptions.\nHowever, these models lack explicit control over the 3D structure in the\ngenerated images. Consequently, this hinders our ability to obtain detailed 3D\nannotations for the generated images or to craft instances with specific poses\nand distances. In this paper, we propose 3D Diffusion Style Transfer (3D-DST),\nwhich incorporates 3D geometry control into diffusion models. Our method\nexploits ControlNet, which extends diffusion models by using visual prompts in\naddition to text prompts. We generate images of the 3D objects taken from 3D\nshape repositories (e.g., ShapeNet and Objaverse), render them from a variety\nof poses and viewing directions, compute the edge maps of the rendered images,\nand use these edge maps as visual prompts to generate realistic images. With\nexplicit 3D geometry control, we can easily change the 3D structures of the\nobjects in the generated images and obtain ground-truth 3D annotations\nautomatically. This allows us to improve a wide range of vision tasks, e.g.,\nclassification and 3D pose estimation, in both in-distribution (ID) and\nout-of-distribution (OOD) settings. We demonstrate the effectiveness of our\nmethod through extensive experiments on ImageNet-100/200, ImageNet-R,\nPASCAL3D+, ObjectNet3D, and OOD-CV. The results show that our method\nsignificantly outperforms existing methods, e.g., 3.8 percentage points on\nImageNet-100 using DeiT-B.\n","authors":["Wufei Ma","Qihao Liu","Jiahao Wang","Angtian Wang","Xiaoding Yuan","Yi Zhang","Zihao Xiao","Guofeng Zhang","Beijia Lu","Ruxiao Duan","Yongrui Qi","Adam Kortylewski","Yaoyao Liu","Alan Yuille"],"pdf_url":"https://arxiv.org/pdf/2306.08103v4.pdf","comment":"ICLR 2024 Spotlight. Code: https://ccvl.jhu.edu/3D-DST/"},{"id":"http://arxiv.org/abs/2404.03015v1","updated":"2024-04-03T18:54:27Z","published":"2024-04-03T18:54:27Z","title":"DPFT: Dual Perspective Fusion Transformer for Camera-Radar-based Object\n Detection","summary":" The perception of autonomous vehicles has to be efficient, robust, and\ncost-effective. However, cameras are not robust against severe weather\nconditions, lidar sensors are expensive, and the performance of radar-based\nperception is still inferior to the others. Camera-radar fusion methods have\nbeen proposed to address this issue, but these are constrained by the typical\nsparsity of radar point clouds and often designed for radars without elevation\ninformation. 
We propose a novel camera-radar fusion approach called Dual\nPerspective Fusion Transformer (DPFT), designed to overcome these limitations.\nOur method leverages lower-level radar data (the radar cube) instead of the\nprocessed point clouds to preserve as much information as possible and employs\nprojections in both the camera and ground planes to effectively use radars with\nelevation information and simplify the fusion with camera data. As a result,\nDPFT has demonstrated state-of-the-art performance on the K-Radar dataset while\nshowing remarkable robustness against adverse weather conditions and\nmaintaining a low inference time. The code is made available as open-source\nsoftware under https://github.com/TUMFTM/DPFT.\n","authors":["Felix Fent","Andras Palffy","Holger Caesar"],"pdf_url":"https://arxiv.org/pdf/2404.03015v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03010v1","updated":"2024-04-03T18:42:19Z","published":"2024-04-03T18:42:19Z","title":"Skeleton Recall Loss for Connectivity Conserving and Resource Efficient\n Segmentation of Thin Tubular Structures","summary":" Accurately segmenting thin tubular structures, such as vessels, nerves, roads\nor concrete cracks, is a crucial task in computer vision. Standard deep\nlearning-based segmentation loss functions, such as Dice or Cross-Entropy,\nfocus on volumetric overlap, often at the expense of preserving structural\nconnectivity or topology. This can lead to segmentation errors that adversely\naffect downstream tasks, including flow calculation, navigation, and structural\ninspection. Although current topology-focused losses mark an improvement, they\nintroduce significant computational and memory overheads. This is particularly\nrelevant for 3D data, rendering these losses infeasible for larger volumes as\nwell as increasingly important multi-class segmentation problems. To mitigate\nthis, we propose a novel Skeleton Recall Loss, which effectively addresses\nthese challenges by circumventing intensive GPU-based calculations with\ninexpensive CPU operations. It demonstrates overall superior performance to\ncurrent state-of-the-art approaches on five public datasets for\ntopology-preserving segmentation, while substantially reducing computational\noverheads by more than 90%. In doing so, we introduce the first multi-class\ncapable loss function for thin structure segmentation, excelling in both\nefficiency and efficacy for topology-preservation.\n","authors":["Yannick Kirchhoff","Maximilian R. Rokuss","Saikat Roy","Balint Kovacs","Constantin Ulrich","Tassilo Wald","Maximilian Zenk","Philipp Vollmuth","Jens Kleesiek","Fabian Isensee","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2404.03010v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02999v1","updated":"2024-04-03T18:40:48Z","published":"2024-04-03T18:40:48Z","title":"MeshBrush: Painting the Anatomical Mesh with Neural Stylization for\n Endoscopy","summary":" Style transfer is a promising approach to close the sim-to-real gap in\nmedical endoscopy. Rendering realistic endoscopic videos by traversing\npre-operative scans (such as MRI or CT) can generate realistic simulations as\nwell as ground truth camera poses and depth maps. Although image-to-image (I2I)\ntranslation models such as CycleGAN perform well, they are unsuitable for\nvideo-to-video synthesis due to the lack of temporal consistency, resulting in\nartifacts between frames. 
We propose MeshBrush, a neural mesh stylization\nmethod to synthesize temporally consistent videos with differentiable\nrendering. MeshBrush uses the underlying geometry of patient imaging data while\nleveraging existing I2I methods. With learned per-vertex textures, the stylized\nmesh guarantees consistency while producing high-fidelity outputs. We\ndemonstrate that mesh stylization is a promising approach for creating\nrealistic simulations for downstream tasks such as training and preoperative\nplanning. Although our method is tested and designed for ureteroscopy, its\ncomponents are transferable to general endoscopic and laparoscopic procedures.\n","authors":["John J. Han","Ayberk Acar","Nicholas Kavoussi","Jie Ying Wu"],"pdf_url":"https://arxiv.org/pdf/2404.02999v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.02990v1","updated":"2024-04-03T18:20:41Z","published":"2024-04-03T18:20:41Z","title":"ASAP: Interpretable Analysis and Summarization of AI-generated Image\n Patterns at Scale","summary":" Generative image models have emerged as a promising technology to produce\nrealistic images. Despite potential benefits, concerns grow about its misuse,\nparticularly in generating deceptive images that could raise significant\nethical, legal, and societal issues. Consequently, there is growing demand to\nempower users to effectively discern and comprehend patterns of AI-generated\nimages. To this end, we developed ASAP, an interactive visualization system\nthat automatically extracts distinct patterns of AI-generated images and allows\nusers to interactively explore them via various views. To uncover fake\npatterns, ASAP introduces a novel image encoder, adapted from CLIP, which\ntransforms images into compact \"distilled\" representations, enriched with\ninformation for differentiating authentic and fake images. These\nrepresentations generate gradients that propagate back to the attention maps of\nCLIP's transformer block. This process quantifies the relative importance of\neach pixel to image authenticity or fakeness, exposing key deceptive patterns.\nASAP enables the at scale interactive analysis of these patterns through\nmultiple, coordinated visualizations. This includes a representation overview\nwith innovative cell glyphs to aid in the exploration and qualitative\nevaluation of fake patterns across a vast array of images, as well as a pattern\nview that displays authenticity-indicating patterns in images and quantifies\ntheir impact. ASAP supports the analysis of cutting-edge generative models with\nthe latest architectures, including GAN-based models like proGAN and diffusion\nmodels like the latent diffusion model. We demonstrate ASAP's usefulness\nthrough two usage scenarios using multiple fake image detection benchmark\ndatasets, revealing its ability to identify and understand hidden patterns in\nAI-generated images, especially in detecting fake human faces produced by\ndiffusion-based techniques.\n","authors":["Jinbin Huang","Chen Chen","Aditi Mishra","Bum Chul Kwon","Zhicheng Liu","Chris Bryan"],"pdf_url":"https://arxiv.org/pdf/2404.02990v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.01734v2","updated":"2024-04-03T18:11:54Z","published":"2023-12-04T08:55:46Z","title":"Effective Adapter for Face Recognition in the Wild","summary":" In this paper, we tackle the challenge of face recognition in the wild, where\nimages often suffer from low quality and real-world distortions. 
Traditional\nheuristic approaches-either training models directly on these degraded images\nor their enhanced counterparts using face restoration techniques-have proven\nineffective, primarily due to the degradation of facial features and the\ndiscrepancy in image domains. To overcome these issues, we propose an effective\nadapter for augmenting existing face recognition models trained on high-quality\nfacial datasets. The key of our adapter is to process both the unrefined and\nenhanced images using two similar structures, one fixed and the other\ntrainable. Such design can confer two benefits. First, the dual-input system\nminimizes the domain gap while providing varied perspectives for the face\nrecognition model, where the enhanced image can be regarded as a complex\nnon-linear transformation of the original one by the restoration model. Second,\nboth two similar structures can be initialized by the pre-trained models\nwithout dropping the past knowledge. The extensive experiments in zero-shot\nsettings show the effectiveness of our method by surpassing baselines of about\n3%, 4%, and 7% in three datasets. Our code will be publicly available.\n","authors":["Yunhao Liu","Yu-Ju Tsai","Kelvin C. K. Chan","Xiangtai Li","Lu Qi","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2312.01734v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02973v1","updated":"2024-04-03T18:00:36Z","published":"2024-04-03T18:00:36Z","title":"Scaling Laws for Galaxy Images","summary":" We present the first systematic investigation of supervised scaling laws\noutside of an ImageNet-like context - on images of galaxies. We use 840k galaxy\nimages and over 100M annotations by Galaxy Zoo volunteers, comparable in scale\nto Imagenet-1K. We find that adding annotated galaxy images provides a power\nlaw improvement in performance across all architectures and all tasks, while\nadding trainable parameters is effective only for some (typically more\nsubjectively challenging) tasks. We then compare the downstream performance of\nfinetuned models pretrained on either ImageNet-12k alone vs. additionally\npretrained on our galaxy images. We achieve an average relative error rate\nreduction of 31% across 5 downstream tasks of scientific interest. Our\nfinetuned models are more label-efficient and, unlike their\nImageNet-12k-pretrained equivalents, often achieve linear transfer performance\nequal to that of end-to-end finetuning. We find relatively modest additional\ndownstream benefits from scaling model size, implying that scaling alone is not\nsufficient to address our domain gap, and suggest that practitioners with\nqualitatively different images might benefit more from in-domain adaption\nfollowed by targeted downstream labelling.\n","authors":["Mike Walmsley","Micah Bowles","Anna M. M. Scaife","Jason Shingirai Makechemu","Alexander J. Gordon","Annette M. N. Ferguson","Robert G. Mann","James Pearson","Jürgen J. Popp","Jo Bovy","Josh Speagle","Hugh Dickinson","Lucy Fortson","Tobias Géron","Sandor Kruk","Chris J. Lintott","Kameswara Mantha","Devina Mohan","David O'Ryan","Inigo V. Slijepevic"],"pdf_url":"https://arxiv.org/pdf/2404.02973v1.pdf","comment":"10+6 pages, 12 figures. Appendix C2 based on arxiv:2206.11927. 
Code,\n demos, documentation at https://github.com/mwalmsley/zoobot"},{"id":"http://arxiv.org/abs/2402.13729v4","updated":"2024-04-03T11:03:35Z","published":"2024-02-21T11:46:16Z","title":"Hybrid Video Diffusion Models with 2D Triplane and 3D Wavelet\n Representation","summary":" Generating high-quality videos that synthesize desired realistic content is a\nchallenging task due to their intricate high-dimensionality and complexity of\nvideos. Several recent diffusion-based methods have shown comparable\nperformance by compressing videos to a lower-dimensional latent space, using\ntraditional video autoencoder architecture. However, such methods that employ\nstandard frame-wise 2D and 3D convolutions fail to fully exploit the\nspatio-temporal nature of videos. To address this issue, we propose a novel\nhybrid video diffusion model, called HVDM, which can capture spatio-temporal\ndependencies more effectively. The HVDM is trained by a hybrid video\nautoencoder which extracts a disentangled representation of the video\nincluding: (i) a global context information captured by a 2D projected latent\n(ii) a local volume information captured by 3D convolutions with wavelet\ndecomposition (iii) a frequency information for improving the video\nreconstruction. Based on this disentangled representation, our hybrid\nautoencoder provides a more comprehensive video latent enriching the generated\nvideos with fine structures and details. Experiments on video generation\nbenchmarks (UCF101, SkyTimelapse, and TaiChi) demonstrate that the proposed\napproach achieves state-of-the-art video generation quality, showing a wide\nrange of video applications (e.g., long video generation, image-to-video, and\nvideo dynamics control).\n","authors":["Kihong Kim","Haneol Lee","Jihye Park","Seyeon Kim","Kwanghee Lee","Seungryong Kim","Jaejun Yoo"],"pdf_url":"https://arxiv.org/pdf/2402.13729v4.pdf","comment":"Project page is available at https://hxngiee.github.io/HVDM/"},{"id":"http://arxiv.org/abs/2110.15352v2","updated":"2024-04-03T03:12:53Z","published":"2021-10-28T17:58:45Z","title":"MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning","summary":" Tiny deep learning on microcontroller units (MCUs) is challenging due to the\nlimited memory size. We find that the memory bottleneck is due to the\nimbalanced memory distribution in convolutional neural network (CNN) designs:\nthe first several blocks have an order of magnitude larger memory usage than\nthe rest of the network. To alleviate this issue, we propose a generic\npatch-by-patch inference scheduling, which operates only on a small spatial\nregion of the feature map and significantly cuts down the peak memory. However,\nnaive implementation brings overlapping patches and computation overhead. We\nfurther propose network redistribution to shift the receptive field and FLOPs\nto the later stage and reduce the computation overhead. Manually redistributing\nthe receptive field is difficult. We automate the process with neural\narchitecture search to jointly optimize the neural architecture and inference\nscheduling, leading to MCUNetV2. Patch-based inference effectively reduces the\npeak memory usage of existing networks by 4-8x. Co-designed with neural\nnetworks, MCUNetV2 sets a record ImageNet accuracy on MCU (71.8%), and achieves\n>90% accuracy on the visual wake words dataset under only 32kB SRAM. MCUNetV2\nalso unblocks object detection on tiny devices, achieving 16.9% higher mAP on\nPascal VOC compared to the state-of-the-art result. 
Our study largely addressed\nthe memory bottleneck in tinyML and paved the way for various vision\napplications beyond image classification.\n","authors":["Ji Lin","Wei-Ming Chen","Han Cai","Chuang Gan","Song Han"],"pdf_url":"https://arxiv.org/pdf/2110.15352v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04283v1","updated":"2024-04-03T17:48:31Z","published":"2024-04-03T17:48:31Z","title":"Translation-based Video-to-Video Synthesis","summary":" Translation-based Video Synthesis (TVS) has emerged as a vital research area\nin computer vision, aiming to facilitate the transformation of videos between\ndistinct domains while preserving both temporal continuity and underlying\ncontent features. This technique has found wide-ranging applications,\nencompassing video super-resolution, colorization, segmentation, and more, by\nextending the capabilities of traditional image-to-image translation to the\ntemporal domain. One of the principal challenges faced in TVS is the inherent\nrisk of introducing flickering artifacts and inconsistencies between frames\nduring the synthesis process. This is particularly challenging due to the\nnecessity of ensuring smooth and coherent transitions between video frames.\nEfforts to tackle this challenge have induced the creation of diverse\nstrategies and algorithms aimed at mitigating these unwanted consequences. This\ncomprehensive review extensively examines the latest progress in the realm of\nTVS. It thoroughly investigates emerging methodologies, shedding light on the\nfundamental concepts and mechanisms utilized for proficient video synthesis.\nThis survey also illuminates their inherent strengths, limitations, appropriate\napplications, and potential avenues for future development.\n","authors":["Pratim Saha","Chengcui Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04283v1.pdf","comment":"25 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.05558v1","updated":"2024-04-03T03:28:04Z","published":"2024-04-03T03:28:04Z","title":"JDEC: JPEG Decoding via Enhanced Continuous Cosine Coefficients","summary":" We propose a practical approach to JPEG image decoding, utilizing a local\nimplicit neural representation with continuous cosine formulation. The JPEG\nalgorithm significantly quantizes discrete cosine transform (DCT) spectra to\nachieve a high compression rate, inevitably resulting in quality degradation\nwhile encoding an image. We have designed a continuous cosine spectrum\nestimator to address the quality degradation issue that restores the distorted\nspectrum. By leveraging local DCT formulations, our network has the privilege\nto exploit dequantization and upsampling simultaneously. Our proposed model\nenables decoding compressed images directly across different quality factors\nusing a single pre-trained model without relying on a conventional JPEG\ndecoder. As a result, our proposed network achieves state-of-the-art\nperformance in flexible color image JPEG artifact removal tasks. 
Our source\ncode is available at https://github.com/WooKyoungHan/JDEC.\n","authors":["Woo Kyoung Han","Sunghoon Im","Jaedeok Kim","Kyong Hwan Jin"],"pdf_url":"https://arxiv.org/pdf/2404.05558v1.pdf","comment":null}]},"2024-04-04T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.03658v1","updated":"2024-04-04T17:59:59Z","published":"2024-04-04T17:59:59Z","title":"Know Your Neighbors: Improving Single-View Reconstruction via Spatial\n Vision-Language Reasoning","summary":" Recovering the 3D scene geometry from a single view is a fundamental yet\nill-posed problem in computer vision. While classical depth estimation methods\ninfer only a 2.5D scene representation limited to the image plane, recent\napproaches based on radiance fields reconstruct a full 3D representation.\nHowever, these methods still struggle with occluded regions since inferring\ngeometry without visual observation requires (i) semantic knowledge of the\nsurroundings, and (ii) reasoning about spatial context. We propose KYN, a novel\nmethod for single-view scene reconstruction that reasons about semantic and\nspatial context to predict each point's density. We introduce a vision-language\nmodulation module to enrich point features with fine-grained semantic\ninformation. We aggregate point representations across the scene through a\nlanguage-guided spatial attention mechanism to yield per-point density\npredictions aware of the 3D semantic context. We show that KYN improves 3D\nshape recovery compared to predicting density for each 3D point in isolation.\nWe achieve state-of-the-art results in scene and object reconstruction on\nKITTI-360, and show improved zero-shot generalization compared to prior work.\nProject page: https://ruili3.github.io/kyn.\n","authors":["Rui Li","Tobias Fischer","Mattia Segu","Marc Pollefeys","Luc Van Gool","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2404.03658v1.pdf","comment":"CVPR 2024. Project page: https://ruili3.github.io/kyn"},{"id":"http://arxiv.org/abs/2404.03657v1","updated":"2024-04-04T17:59:58Z","published":"2024-04-04T17:59:58Z","title":"OW-VISCap: Open-World Video Instance Segmentation and Captioning","summary":" Open-world video instance segmentation is an important video understanding\ntask. Yet most methods either operate in a closed-world setting, require an\nadditional user-input, or use classic region-based proposals to identify never\nbefore seen objects. Further, these methods only assign a one-word label to\ndetected objects, and don't generate rich object-centric descriptions. They\nalso often suffer from highly overlapping predictions. To address these issues,\nwe propose Open-World Video Instance Segmentation and Captioning (OW-VISCap),\nan approach to jointly segment, track, and caption previously seen or unseen\nobjects in a video. For this, we introduce open-world object queries to\ndiscover never before seen objects without additional user-input. We generate\nrich and descriptive object-centric captions for each detected object via a\nmasked attention augmented LLM input. We introduce an inter-query contrastive\nloss to ensure that the object queries differ from one another. Our generalized\napproach matches or surpasses state-of-the-art on three tasks: open-world video\ninstance segmentation on the BURST dataset, dense video object captioning on\nthe VidSTG dataset, and closed-world video instance segmentation on the OVIS\ndataset.\n","authors":["Anwesa Choudhuri","Girish Chowdhary","Alexander G. 
Schwing"],"pdf_url":"https://arxiv.org/pdf/2404.03657v1.pdf","comment":"Project page: https://anwesachoudhuri.github.io/OpenWorldVISCap/"},{"id":"http://arxiv.org/abs/2404.03656v1","updated":"2024-04-04T17:59:57Z","published":"2024-04-04T17:59:57Z","title":"MVD-Fusion: Single-view 3D via Depth-consistent Multi-view Generation","summary":" We present MVD-Fusion: a method for single-view 3D inference via generative\nmodeling of multi-view-consistent RGB-D images. While recent methods pursuing\n3D inference advocate learning novel-view generative models, these generations\nare not 3D-consistent and require a distillation process to generate a 3D\noutput. We instead cast the task of 3D inference as directly generating\nmutually-consistent multiple views and build on the insight that additionally\ninferring depth can provide a mechanism for enforcing this consistency.\nSpecifically, we train a denoising diffusion model to generate multi-view RGB-D\nimages given a single RGB input image and leverage the (intermediate noisy)\ndepth estimates to obtain reprojection-based conditioning to maintain\nmulti-view consistency. We train our model using large-scale synthetic dataset\nObajverse as well as the real-world CO3D dataset comprising of generic camera\nviewpoints. We demonstrate that our approach can yield more accurate synthesis\ncompared to recent state-of-the-art, including distillation-based 3D inference\nand prior multi-view generation methods. We also evaluate the geometry induced\nby our multi-view depth prediction and find that it yields a more accurate\nrepresentation than other direct 3D inference approaches.\n","authors":["Hanzhe Hu","Zhizhuo Zhou","Varun Jampani","Shubham Tulsiani"],"pdf_url":"https://arxiv.org/pdf/2404.03656v1.pdf","comment":"Project page: https://mvd-fusion.github.io/"},{"id":"http://arxiv.org/abs/2404.03654v1","updated":"2024-04-04T17:59:50Z","published":"2024-04-04T17:59:50Z","title":"RaFE: Generative Radiance Fields Restoration","summary":" NeRF (Neural Radiance Fields) has demonstrated tremendous potential in novel\nview synthesis and 3D reconstruction, but its performance is sensitive to input\nimage quality, which struggles to achieve high-fidelity rendering when provided\nwith low-quality sparse input viewpoints. Previous methods for NeRF restoration\nare tailored for specific degradation type, ignoring the generality of\nrestoration. To overcome this limitation, we propose a generic radiance fields\nrestoration pipeline, named RaFE, which applies to various types of\ndegradations, such as low resolution, blurriness, noise, compression artifacts,\nor their combinations. Our approach leverages the success of off-the-shelf 2D\nrestoration methods to recover the multi-view images individually. Instead of\nreconstructing a blurred NeRF by averaging inconsistencies, we introduce a\nnovel approach using Generative Adversarial Networks (GANs) for NeRF generation\nto better accommodate the geometric and appearance inconsistencies present in\nthe multi-view images. Specifically, we adopt a two-level tri-plane\narchitecture, where the coarse level remains fixed to represent the low-quality\nNeRF, and a fine-level residual tri-plane to be added to the coarse level is\nmodeled as a distribution with GAN to capture potential variations in\nrestoration. 
We validate RaFE on both synthetic and real cases for various\nrestoration tasks, demonstrating superior performance in both quantitative and\nqualitative evaluations, surpassing other 3D restoration methods specific to a\nsingle task. Please see our project website\nhttps://zkaiwu.github.io/RaFE-Project/.\n","authors":["Zhongkai Wu","Ziyu Wan","Jing Zhang","Jing Liao","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2404.03654v1.pdf","comment":"Project Page: https://zkaiwu.github.io/RaFE-Project/"},{"id":"http://arxiv.org/abs/2404.03653v1","updated":"2024-04-04T17:59:46Z","published":"2024-04-04T17:59:46Z","title":"CoMat: Aligning Text-to-Image Diffusion Model with Image-to-Text Concept\n Matching","summary":" Diffusion models have demonstrated great success in the field of\ntext-to-image generation. However, alleviating the misalignment between the\ntext prompts and images is still challenging. The root reason behind the\nmisalignment has not been extensively investigated. We observe that the\nmisalignment is caused by inadequate token attention activation. We further\nattribute this phenomenon to the diffusion model's insufficient condition\nutilization, which is caused by its training paradigm. To address the issue, we\npropose CoMat, an end-to-end diffusion model fine-tuning strategy with an\nimage-to-text concept matching mechanism. We leverage an image captioning model\nto measure image-to-text alignment and guide the diffusion model to revisit\nignored tokens. A novel attribute concentration module is also proposed to\naddress the attribute binding problem. Without any image or human preference\ndata, we use only 20K text prompts to fine-tune SDXL to obtain CoMat-SDXL.\nExtensive experiments show that CoMat-SDXL significantly outperforms the\nbaseline model SDXL in two text-to-image alignment benchmarks and achieves\nstate-of-the-art performance.\n","authors":["Dongzhi Jiang","Guanglu Song","Xiaoshi Wu","Renrui Zhang","Dazhong Shen","Zhuofan Zong","Yu Liu","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.03653v1.pdf","comment":"Project Page: https://caraj7.github.io/comat"},{"id":"http://arxiv.org/abs/2404.03652v1","updated":"2024-04-04T17:59:40Z","published":"2024-04-04T17:59:40Z","title":"The More You See in 2D, the More You Perceive in 3D","summary":" Humans can infer 3D structure from 2D images of an object based on past\nexperience and improve their 3D understanding as they see more images. Inspired\nby this behavior, we introduce SAP3D, a system for 3D reconstruction and novel\nview synthesis from an arbitrary number of unposed images. Given a few unposed\nimages of an object, we adapt a pre-trained view-conditioned diffusion model\ntogether with the camera poses of the images via test-time fine-tuning. The\nadapted diffusion model and the obtained camera poses are then utilized as\ninstance-specific priors for 3D reconstruction and novel view synthesis. We\nshow that as the number of input images increases, the performance of our\napproach improves, bridging the gap between optimization-based prior-less 3D\nreconstruction methods and single-image-to-3D diffusion-based methods. 
We\ndemonstrate our system on real images as well as standard synthetic benchmarks.\nOur ablation studies confirm that this adaption behavior is key for more\naccurate 3D understanding.\n","authors":["Xinyang Han","Zelin Gao","Angjoo Kanazawa","Shubham Goel","Yossi Gandelsman"],"pdf_url":"https://arxiv.org/pdf/2404.03652v1.pdf","comment":"Project page: https://sap3d.github.io/"},{"id":"http://arxiv.org/abs/2404.03650v1","updated":"2024-04-04T17:59:08Z","published":"2024-04-04T17:59:08Z","title":"OpenNeRF: Open Set 3D Neural Scene Segmentation with Pixel-Wise Features\n and Rendered Novel Views","summary":" Large visual-language models (VLMs), like CLIP, enable open-set image\nsegmentation to segment arbitrary concepts from an image in a zero-shot manner.\nThis goes beyond the traditional closed-set assumption, i.e., where models can\nonly segment classes from a pre-defined training set. More recently, first\nworks on open-set segmentation in 3D scenes have appeared in the literature.\nThese methods are heavily influenced by closed-set 3D convolutional approaches\nthat process point clouds or polygon meshes. However, these 3D scene\nrepresentations do not align well with the image-based nature of the\nvisual-language models. Indeed, point cloud and 3D meshes typically have a\nlower resolution than images and the reconstructed 3D scene geometry might not\nproject well to the underlying 2D image sequences used to compute pixel-aligned\nCLIP features. To address these challenges, we propose OpenNeRF which naturally\noperates on posed images and directly encodes the VLM features within the NeRF.\nThis is similar in spirit to LERF, however our work shows that using pixel-wise\nVLM features (instead of global CLIP features) results in an overall less\ncomplex architecture without the need for additional DINO regularization. Our\nOpenNeRF further leverages NeRF's ability to render novel views and extract\nopen-set VLM features from areas that are not well observed in the initial\nposed images. For 3D point cloud segmentation on the Replica dataset, OpenNeRF\noutperforms recent open-vocabulary methods such as LERF and OpenScene by at\nleast +4.9 mIoU.\n","authors":["Francis Engelmann","Fabian Manhardt","Michael Niemeyer","Keisuke Tateno","Marc Pollefeys","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2404.03650v1.pdf","comment":"ICLR 2024, Project page: https://opennerf.github.io"},{"id":"http://arxiv.org/abs/2404.03645v1","updated":"2024-04-04T17:58:21Z","published":"2024-04-04T17:58:21Z","title":"Decoupling Static and Hierarchical Motion Perception for Referring Video\n Segmentation","summary":" Referring video segmentation relies on natural language expressions to\nidentify and segment objects, often emphasizing motion clues. Previous works\ntreat a sentence as a whole and directly perform identification at the\nvideo-level, mixing up static image-level cues with temporal motion cues.\nHowever, image-level features cannot well comprehend motion cues in sentences,\nand static cues are not crucial for temporal perception. In fact, static cues\ncan sometimes interfere with temporal perception by overshadowing motion cues.\nIn this work, we propose to decouple video-level referring expression\nunderstanding into static and motion perception, with a specific emphasis on\nenhancing temporal comprehension. Firstly, we introduce an\nexpression-decoupling module to make static cues and motion cues perform their\ndistinct role, alleviating the issue of sentence embeddings overlooking motion\ncues. 
Secondly, we propose a hierarchical motion perception module to capture\ntemporal information effectively across varying timescales. Furthermore, we\nemploy contrastive learning to distinguish the motions of visually similar\nobjects. These contributions yield state-of-the-art performance across five\ndatasets, including a remarkable $\\textbf{9.2%}$ $\\mathcal{J\\&F}$ improvement\non the challenging $\\textbf{MeViS}$ dataset. Code is available at\nhttps://github.com/heshuting555/DsHmp.\n","authors":["Shuting He","Henghui Ding"],"pdf_url":"https://arxiv.org/pdf/2404.03645v1.pdf","comment":"CVPR 2024, code: https://github.com/heshuting555/DsHmp"},{"id":"http://arxiv.org/abs/2404.03642v1","updated":"2024-04-04T17:57:25Z","published":"2024-04-04T17:57:25Z","title":"DiffBody: Human Body Restoration by Imagining with Generative Diffusion\n Prior","summary":" Human body restoration plays a vital role in various applications related to\nthe human body. Despite recent advances in general image restoration using\ngenerative models, their performance in human body restoration remains\nmediocre, often resulting in foreground and background blending, over-smoothing\nsurface textures, missing accessories, and distorted limbs. Addressing these\nchallenges, we propose a novel approach by constructing a human body-aware\ndiffusion model that leverages domain-specific knowledge to enhance\nperformance. Specifically, we employ a pretrained body attention module to\nguide the diffusion model's focus on the foreground, addressing issues caused\nby blending between the subject and background. We also demonstrate the value\nof revisiting the language modality of the diffusion model in restoration tasks\nby seamlessly incorporating text prompt to improve the quality of surface\ntexture and additional clothing and accessories details. Additionally, we\nintroduce a diffusion sampler tailored for fine-grained human body parts,\nutilizing local semantic information to rectify limb distortions. Lastly, we\ncollect a comprehensive dataset for benchmarking and advancing the field of\nhuman body restoration. Extensive experimental validation showcases the\nsuperiority of our approach, both quantitatively and qualitatively, over\nexisting methods.\n","authors":["Yiming Zhang","Zhe Wang","Xinjie Li","Yunchen Yuan","Chengsong Zhang","Xiao Sun","Zhihang Zhong","Jian Wang"],"pdf_url":"https://arxiv.org/pdf/2404.03642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12790v3","updated":"2024-04-04T17:55:04Z","published":"2023-03-22T17:58:01Z","title":"$CrowdDiff$: Multi-hypothesis Crowd Density Estimation using Diffusion\n Models","summary":" Crowd counting is a fundamental problem in crowd analysis which is typically\naccomplished by estimating a crowd density map and summing over the density\nvalues. However, this approach suffers from background noise accumulation and\nloss of density due to the use of broad Gaussian kernels to create the ground\ntruth density maps. This issue can be overcome by narrowing the Gaussian\nkernel. However, existing approaches perform poorly when trained with ground\ntruth density maps with broad kernels. To deal with this limitation, we propose\nusing conditional diffusion models to predict density maps, as diffusion models\nshow high fidelity to training data during generation. With that, we present\n$CrowdDiff$ that generates the crowd density map as a reverse diffusion\nprocess. 
Furthermore, as the intermediate time steps of the diffusion process\nare noisy, we incorporate a regression branch for direct crowd estimation only\nduring training to improve the feature learning. In addition, owing to the\nstochastic nature of the diffusion model, we introduce producing multiple\ndensity maps to improve the counting performance contrary to the existing crowd\ncounting pipelines. We conduct extensive experiments on publicly available\ndatasets to validate the effectiveness of our method. $CrowdDiff$ outperforms\nexisting state-of-the-art crowd counting methods on several public crowd\nanalysis benchmarks with significant improvements.\n","authors":["Yasiru Ranasinghe","Nithin Gopalakrishnan Nair","Wele Gedara Chaminda Bandara","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2303.12790v3.pdf","comment":"Accepted at CVPR'24. The project is available at\n https://dylran.github.io/crowddiff.github.io"},{"id":"http://arxiv.org/abs/2404.03635v1","updated":"2024-04-04T17:54:33Z","published":"2024-04-04T17:54:33Z","title":"WorDepth: Variational Language Prior for Monocular Depth Estimation","summary":" Three-dimensional (3D) reconstruction from a single image is an ill-posed\nproblem with inherent ambiguities, i.e. scale. Predicting a 3D scene from text\ndescription(s) is similarly ill-posed, i.e. spatial arrangements of objects\ndescribed. We investigate the question of whether two inherently ambiguous\nmodalities can be used in conjunction to produce metric-scaled reconstructions.\nTo test this, we focus on monocular depth estimation, the problem of predicting\na dense depth map from a single image, but with an additional text caption\ndescribing the scene. To this end, we begin by encoding the text caption as a\nmean and standard deviation; using a variational framework, we learn the\ndistribution of the plausible metric reconstructions of 3D scenes corresponding\nto the text captions as a prior. To \"select\" a specific reconstruction or depth\nmap, we encode the given image through a conditional sampler that samples from\nthe latent space of the variational text encoder, which is then decoded to the\noutput depth map. Our approach is trained alternatingly between the text and\nimage branches: in one optimization step, we predict the mean and standard\ndeviation from the text description and sample from a standard Gaussian, and in\nthe other, we sample using a (image) conditional sampler. Once trained, we\ndirectly predict depth from the encoded text using the conditional sampler. We\ndemonstrate our approach on indoor (NYUv2) and outdoor (KITTI) scenarios, where\nwe show that language can consistently improve performance in both.\n","authors":["Ziyao Zeng","Daniel Wang","Fengyu Yang","Hyoungseob Park","Yangchao Wu","Stefano Soatto","Byung-Woo Hong","Dong Lao","Alex Wong"],"pdf_url":"https://arxiv.org/pdf/2404.03635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03634v1","updated":"2024-04-04T17:54:12Z","published":"2024-04-04T17:54:12Z","title":"PreAfford: Universal Affordance-Based Pre-Grasping for Diverse Objects\n and Environments","summary":" Robotic manipulation of ungraspable objects with two-finger grippers presents\nsignificant challenges due to the paucity of graspable features, while\ntraditional pre-grasping techniques, which rely on repositioning objects and\nleveraging external aids like table edges, lack the adaptability across object\ncategories and scenes. 
Addressing this, we introduce PreAfford, a novel\npre-grasping planning framework that utilizes a point-level affordance\nrepresentation and a relay training approach to enhance adaptability across a\nbroad range of environments and object types, including those previously\nunseen. Demonstrated on the ShapeNet-v2 dataset, PreAfford significantly\nimproves grasping success rates by 69% and validates its practicality through\nreal-world experiments. This work offers a robust and adaptable solution for\nmanipulating ungraspable objects.\n","authors":["Kairui Ding","Boyuan Chen","Ruihai Wu","Yuyang Li","Zongzheng Zhang","Huan-ang Gao","Siqi Li","Yixin Zhu","Guyue Zhou","Hao Dong","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.03634v1.pdf","comment":"Project Page: https://air-discover.github.io/PreAfford/"},{"id":"http://arxiv.org/abs/2404.03632v1","updated":"2024-04-04T17:53:33Z","published":"2024-04-04T17:53:33Z","title":"Reference-Based 3D-Aware Image Editing with Triplane","summary":" Generative Adversarial Networks (GANs) have emerged as powerful tools not\nonly for high-quality image generation but also for real image editing through\nmanipulation of their interpretable latent spaces. Recent advancements in GANs\ninclude the development of 3D-aware models such as EG3D, characterized by\nefficient triplane-based architectures enabling the reconstruction of 3D\ngeometry from single images. However, scant attention has been devoted to\nproviding an integrated framework for high-quality reference-based 3D-aware\nimage editing within this domain. This study addresses this gap by exploring\nand demonstrating the effectiveness of EG3D's triplane space for achieving\nadvanced reference-based edits, presenting a unique perspective on 3D-aware\nimage editing through our novel pipeline. Our approach integrates the encoding\nof triplane features, spatial disentanglement and automatic localization of\nfeatures in the triplane domain, and fusion learning for desired image editing.\nMoreover, our framework demonstrates versatility across domains, extending its\neffectiveness to animal face edits and partial stylization of cartoon\nportraits. The method shows significant improvements over relevant 3D-aware\nlatent editing and 2D reference-based editing methods, both qualitatively and\nquantitatively. Project page: https://three-bee.github.io/triplane_edit\n","authors":["Bahri Batuhan Bilecen","Yigit Yalin","Ning Yu","Aysegul Dundar"],"pdf_url":"https://arxiv.org/pdf/2404.03632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03631v1","updated":"2024-04-04T17:52:13Z","published":"2024-04-04T17:52:13Z","title":"Robust Concept Erasure Using Task Vectors","summary":" With the rapid growth of text-to-image models, a variety of techniques have\nbeen suggested to prevent undesirable image generations. Yet, these methods\noften only protect against specific user prompts and have been shown to allow\nunsafe generations with other inputs. Here we focus on unconditionally erasing\na concept from a text-to-image model rather than conditioning the erasure on\nthe user's prompt. We first show that compared to input-dependent erasure\nmethods, concept erasure that uses Task Vectors (TV) is more robust to\nunexpected user inputs, not seen during training. However, TV-based erasure can\nalso affect the core performance of the edited model, particularly when the\nrequired edit strength is unknown. 
To this end, we propose a method called\nDiverse Inversion, which we use to estimate the required strength of the TV\nedit. Diverse Inversion finds within the model input space a large set of word\nembeddings, each of which induces the generation of the target concept. We find\nthat encouraging diversity in the set makes our estimation more robust to\nunexpected prompts. Finally, we show that Diverse Inversion enables us to apply\na TV edit only to a subset of the model weights, enhancing the erasure\ncapabilities while better maintaining the core functionality of the model.\n","authors":["Minh Pham","Kelly O. Marshall","Chinmay Hegde","Niv Cohen"],"pdf_url":"https://arxiv.org/pdf/2404.03631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03620v1","updated":"2024-04-04T17:43:06Z","published":"2024-04-04T17:43:06Z","title":"LCM-Lookahead for Encoder-based Text-to-Image Personalization","summary":" Recent advancements in diffusion models have introduced fast sampling methods\nthat can effectively produce high-quality images in just one or a few denoising\nsteps. Interestingly, when these are distilled from existing diffusion models,\nthey often maintain alignment with the original model, retaining similar\noutputs for similar prompts and seeds. These properties present opportunities\nto leverage fast sampling methods as a shortcut-mechanism, using them to create\na preview of denoised outputs through which we can backpropagate image-space\nlosses. In this work, we explore the potential of using such\nshortcut-mechanisms to guide the personalization of text-to-image models to\nspecific facial identities. We focus on encoder-based personalization\napproaches, and demonstrate that by tuning them with a lookahead identity loss,\nwe can achieve higher identity fidelity, without sacrificing layout diversity\nor prompt alignment. We further explore the use of attention sharing mechanisms\nand consistent data generation for the task of personalization, and find that\nencoder training can benefit from both.\n","authors":["Rinon Gal","Or Lichter","Elad Richardson","Or Patashnik","Amit H. Bermano","Gal Chechik","Daniel Cohen-Or"],"pdf_url":"https://arxiv.org/pdf/2404.03620v1.pdf","comment":"Project page at https://lcm-lookahead.github.io/"},{"id":"http://arxiv.org/abs/2404.03618v1","updated":"2024-04-04T17:40:06Z","published":"2024-04-04T17:40:06Z","title":"DeViDe: Faceted medical knowledge for improved medical vision-language\n pre-training","summary":" Vision-language pre-training for chest X-rays has made significant strides,\nprimarily by utilizing paired radiographs and radiology reports. However,\nexisting approaches often face challenges in encoding medical knowledge\neffectively. While radiology reports provide insights into the current disease\nmanifestation, medical definitions (as used by contemporary methods) tend to be\noverly abstract, creating a gap in knowledge. To address this, we propose\nDeViDe, a novel transformer-based method that leverages radiographic\ndescriptions from the open web. These descriptions outline general visual\ncharacteristics of diseases in radiographs, and when combined with abstract\ndefinitions and radiology reports, provide a holistic snapshot of knowledge.\nDeViDe incorporates three key features for knowledge-augmented vision language\nalignment: First, a large-language model-based augmentation is employed to\nhomogenise medical knowledge from diverse sources. Second, this knowledge is\naligned with image information at various levels of granularity. 
Third, a novel\nprojection layer is proposed to handle the complexity of aligning each image\nwith multiple descriptions arising in a multi-label setting. In zero-shot\nsettings, DeViDe performs comparably to fully supervised models on external\ndatasets and achieves state-of-the-art results on three large-scale datasets.\nAdditionally, fine-tuning DeViDe on four downstream tasks and six segmentation\ntasks showcases its superior performance across data from diverse\ndistributions.\n","authors":["Haozhe Luo","Ziyu Zhou","Corentin Royer","Anjany Sekuboyina","Bjoern Menze"],"pdf_url":"https://arxiv.org/pdf/2404.03618v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2208.04060 by other authors"},{"id":"http://arxiv.org/abs/2404.03617v1","updated":"2024-04-04T17:39:41Z","published":"2024-04-04T17:39:41Z","title":"On the Efficiency of Convolutional Neural Networks","summary":" Since the breakthrough performance of AlexNet in 2012, convolutional neural\nnetworks (convnets) have grown into extremely powerful vision models. Deep\nlearning researchers have used convnets to produce accurate results that were\nunachievable a decade ago. Yet computer scientists make computational\nefficiency their primary objective. Accuracy with exorbitant cost is not\nacceptable; an algorithm must also minimize its computational requirements.\nConfronted with the daunting computation that convnets use, deep learning\nresearchers also became interested in efficiency. Researchers applied\ntremendous effort to find the convnet architectures that have the greatest\nefficiency. However, skepticism grew among researchers and engineers alike\nabout the relevance of arithmetic complexity. Contrary to the prevailing view\nthat latency and arithmetic complexity are irreconcilable, a simple formula\nrelates both through computational efficiency. This insight enabled us to\nco-optimize the separate factors that determine latency. We observed that the\ndegenerate conv2d layers that produce the best accuracy-complexity trade-off\nalso have low operational intensity. Therefore, kernels that implement these\nlayers use significant memory resources. We solved this optimization problem\nwith block-fusion kernels that implement all layers of a residual block,\nthereby creating temporal locality, avoiding communication, and reducing\nworkspace size. Our ConvFirst model with block-fusion kernels ran approximately\nfour times as fast as the ConvNeXt baseline with PyTorch Inductor, at equal\naccuracy on the ImageNet-1K classification task. Our unified approach to\nconvnet efficiency envisions a new era of models and kernels that achieve\ngreater accuracy at lower cost.\n","authors":["Andrew Lavin"],"pdf_url":"https://arxiv.org/pdf/2404.03617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03613v1","updated":"2024-04-04T17:34:41Z","published":"2024-04-04T17:34:41Z","title":"Per-Gaussian Embedding-Based Deformation for Deformable 3D Gaussian\n Splatting","summary":" As 3D Gaussian Splatting (3DGS) provides fast and high-quality novel view\nsynthesis, it is a natural extension to deform a canonical 3DGS to multiple\nframes. However, previous works fail to accurately reconstruct dynamic scenes,\nespecially 1) static parts moving along nearby dynamic parts, and 2) some\ndynamic areas are blurry. We attribute the failure to the wrong design of the\ndeformation field, which is built as a coordinate-based function. 
This approach\nis problematic because 3DGS is a mixture of multiple fields centered at the\nGaussians, not just a single coordinate-based framework. To resolve this\nproblem, we define the deformation as a function of per-Gaussian embeddings and\ntemporal embeddings. Moreover, we decompose deformations as coarse and fine\ndeformations to model slow and fast movements, respectively. Also, we introduce\nan efficient training strategy for faster convergence and higher quality.\nProject page: https://jeongminb.github.io/e-d3dgs/\n","authors":["Jeongmin Bae","Seoha Kim","Youngsik Yun","Hahyun Lee","Gun Bang","Youngjung Uh"],"pdf_url":"https://arxiv.org/pdf/2404.03613v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2404.03611v1","updated":"2024-04-04T17:34:21Z","published":"2024-04-04T17:34:21Z","title":"InsectMamba: Insect Pest Classification with State Space Model","summary":" The classification of insect pests is a critical task in agricultural\ntechnology, vital for ensuring food security and environmental sustainability.\nHowever, the complexity of pest identification, due to factors like high\ncamouflage and species diversity, poses significant obstacles. Existing methods\nstruggle with the fine-grained feature extraction needed to distinguish between\nclosely related pest species. Although recent advancements have utilized\nmodified network structures and combined deep learning approaches to improve\naccuracy, challenges persist due to the similarity between pests and their\nsurroundings. To address this problem, we introduce InsectMamba, a novel\napproach that integrates State Space Models (SSMs), Convolutional Neural\nNetworks (CNNs), Multi-Head Self-Attention mechanism (MSA), and Multilayer\nPerceptrons (MLPs) within Mix-SSM blocks. This integration facilitates the\nextraction of comprehensive visual features by leveraging the strengths of each\nencoding strategy. A selective module is also proposed to adaptively aggregate\nthese features, enhancing the model's ability to discern pest characteristics.\nInsectMamba was evaluated against strong competitors across five insect pest\nclassification datasets. The results demonstrate its superior performance and\nverify the significance of each model component by an ablation study.\n","authors":["Qianning Wang","Chenglin Wang","Zhixin Lai","Yucheng Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.03611v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.03590v1","updated":"2024-04-04T16:58:26Z","published":"2024-04-04T16:58:26Z","title":"SemGrasp: Semantic Grasp Generation via Language Aligned Discretization","summary":" Generating natural human grasps necessitates consideration of not just object\ngeometry but also semantic information. Solely depending on object shape for\ngrasp generation confines the applications of prior methods in downstream\ntasks. This paper presents a novel semantic-based grasp generation method,\ntermed SemGrasp, which generates a static human grasp pose by incorporating\nsemantic information into the grasp representation. We introduce a discrete\nrepresentation that aligns the grasp space with semantic space, enabling the\ngeneration of grasp postures in accordance with language instructions. A\nMultimodal Large Language Model (MLLM) is subsequently fine-tuned, integrating\nobject, grasp, and language within a unified semantic space. 
To facilitate the\ntraining of SemGrasp, we have compiled a large-scale, grasp-text-aligned\ndataset named CapGrasp, featuring about 260k detailed captions and 50k diverse\ngrasps. Experimental findings demonstrate that SemGrasp efficiently generates\nnatural human grasps in alignment with linguistic intentions. Our code, models,\nand dataset are available publicly at: https://kailinli.github.io/SemGrasp.\n","authors":["Kailin Li","Jingbo Wang","Lixin Yang","Cewu Lu","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2404.03590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03584v1","updated":"2024-04-04T16:48:40Z","published":"2024-04-04T16:48:40Z","title":"Towards more realistic human motion prediction with attention to motion\n coordination","summary":" Joint relation modeling is a crucial component in human motion prediction.\nMost existing methods rely on skeletal-based graphs to build the joint\nrelations, where local interactive relations between joint pairs are well\nlearned. However, the motion coordination, a global joint relation reflecting\nthe simultaneous cooperation of all joints, is usually weakened because it is\nlearned from part to whole progressively and asynchronously. Thus, the final\npredicted motions usually appear unrealistic. To tackle this issue, we learn a\nmedium, called coordination attractor (CA), from the spatiotemporal features of\nmotion to characterize the global motion features, which is subsequently used\nto build new relative joint relations. Through the CA, all joints are related\nsimultaneously, and thus the motion coordination of all joints can be better\nlearned. Based on this, we further propose a novel joint relation modeling\nmodule, Comprehensive Joint Relation Extractor (CJRE), to combine this motion\ncoordination with the local interactions between joint pairs in a unified\nmanner. Additionally, we also present a Multi-timescale Dynamics Extractor\n(MTDE) to extract enriched dynamics from the raw position information for\neffective prediction. Extensive experiments show that the proposed framework\noutperforms state-of-the-art methods in both short- and long-term predictions\non H3.6M, CMU-Mocap, and 3DPW.\n","authors":["Pengxiang Ding","Jianqin Yin"],"pdf_url":"https://arxiv.org/pdf/2404.03584v1.pdf","comment":"Accepted by TCSVT"},{"id":"http://arxiv.org/abs/2312.11972v2","updated":"2024-04-04T16:41:22Z","published":"2023-12-19T09:09:46Z","title":"Expressive Forecasting of 3D Whole-body Human Motions","summary":" Human motion forecasting, with the goal of estimating future human behavior\nover a period of time, is a fundamental task in many real-world applications.\nHowever, existing works typically concentrate on predicting the major joints of\nthe human body without considering the delicate movements of the human hands.\nIn practical applications, hand gesture plays an important role in human\ncommunication with the real world, and expresses the primary intention of human\nbeings. In this work, we are the first to formulate a whole-body human pose\nforecasting task, which jointly predicts the future body and hand activities.\nCorrespondingly, we propose a novel Encoding-Alignment-Interaction (EAI)\nframework that aims to predict both coarse (body joints) and fine-grained\n(gestures) activities collaboratively, enabling expressive and\ncross-facilitated forecasting of 3D whole-body human motions. Specifically, our\nmodel involves two key constituents: cross-context alignment (XCA) and\ncross-context interaction (XCI). 
Considering the heterogeneous information\nwithin the whole-body, XCA aims to align the latent features of various human\ncomponents, while XCI focuses on effectively capturing the context interaction\namong the human components. We conduct extensive experiments on a\nnewly-introduced large-scale benchmark and achieve state-of-the-art\nperformance. The code is public for research purposes at\nhttps://github.com/Dingpx/EAI.\n","authors":["Pengxiang Ding","Qiongjie Cui","Min Zhang","Mengyuan Liu","Haofan Wang","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.11972v2.pdf","comment":"Accepted by AAAI24"},{"id":"http://arxiv.org/abs/2404.03575v1","updated":"2024-04-04T16:38:57Z","published":"2024-04-04T16:38:57Z","title":"DreamScene: 3D Gaussian-based Text-to-3D Scene Generation via Formation\n Pattern Sampling","summary":" Text-to-3D scene generation holds immense potential for the gaming, film, and\narchitecture sectors. Despite significant progress, existing methods struggle\nwith maintaining high quality, consistency, and editing flexibility. In this\npaper, we propose DreamScene, a 3D Gaussian-based novel text-to-3D scene\ngeneration framework, to tackle the aforementioned three challenges mainly via\ntwo strategies. First, DreamScene employs Formation Pattern Sampling (FPS), a\nmulti-timestep sampling strategy guided by the formation patterns of 3D\nobjects, to form fast, semantically rich, and high-quality representations. FPS\nuses 3D Gaussian filtering for optimization stability, and leverages\nreconstruction techniques to generate plausible textures. Second, DreamScene\nemploys a progressive three-stage camera sampling strategy, specifically\ndesigned for both indoor and outdoor settings, to effectively ensure\nobject-environment integration and scene-wide 3D consistency. Last, DreamScene\nenhances scene editing flexibility by integrating objects and environments,\nenabling targeted adjustments. Extensive experiments validate DreamScene's\nsuperiority over current state-of-the-art techniques, heralding its\nwide-ranging potential for diverse applications. Code and demos will be\nreleased at https://dreamscene-project.github.io .\n","authors":["Haoran Li","Haolin Shi","Wenli Zhang","Wenjun Wu","Yong Liao","Lin Wang","Lik-hang Lee","Pengyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.03575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03574v1","updated":"2024-04-04T16:38:49Z","published":"2024-04-04T16:38:49Z","title":"TinyVQA: Compact Multimodal Deep Neural Network for Visual Question\n Answering on Resource-Constrained Devices","summary":" Traditional machine learning models often require powerful hardware, making\nthem unsuitable for deployment on resource-limited devices. Tiny Machine\nLearning (tinyML) has emerged as a promising approach for running machine\nlearning models on these devices, but integrating multiple data modalities into\ntinyML models still remains a challenge due to increased complexity, latency,\nand power consumption. This paper proposes TinyVQA, a novel multimodal deep\nneural network for visual question answering tasks that can be deployed on\nresource-constrained tinyML hardware. TinyVQA leverages a supervised\nattention-based model to learn how to answer questions about images using both\nvision and language modalities. 
Distilled knowledge from the supervised\nattention-based VQA model trains the memory aware compact TinyVQA model and low\nbit-width quantization technique is employed to further compress the model for\ndeployment on tinyML devices. The TinyVQA model was evaluated on the FloodNet\ndataset, which is used for post-disaster damage assessment. The compact model\nachieved an accuracy of 79.5%, demonstrating the effectiveness of TinyVQA for\nreal-world applications. Additionally, the model was deployed on a Crazyflie\n2.0 drone, equipped with an AI deck and GAP8 microprocessor. The TinyVQA model\nachieved low latencies of 56 ms and consumes 693 mW power while deployed on the\ntiny drone, showcasing its suitability for resource-constrained embedded\nsystems.\n","authors":["Hasib-Al Rashid","Argho Sarkar","Aryya Gangopadhyay","Maryam Rahnemoonfar","Tinoosh Mohsenin"],"pdf_url":"https://arxiv.org/pdf/2404.03574v1.pdf","comment":"Accepted as a full paper by the tinyML Research Symposium 2024"},{"id":"http://arxiv.org/abs/2404.03572v1","updated":"2024-04-04T16:37:42Z","published":"2024-04-04T16:37:42Z","title":"Terrain Point Cloud Inpainting via Signal Decomposition","summary":" The rapid development of 3D acquisition technology has made it possible to\nobtain point clouds of real-world terrains. However, due to limitations in\nsensor acquisition technology or specific requirements, point clouds often\ncontain defects such as holes with missing data. Inpainting algorithms are\nwidely used to patch these holes. However, existing traditional inpainting\nalgorithms rely on precise hole boundaries, which limits their ability to\nhandle cases where the boundaries are not well-defined. On the other hand,\nlearning-based completion methods often prioritize reconstructing the entire\npoint cloud instead of solely focusing on hole filling. Based on the fact that\nreal-world terrain exhibits both global smoothness and rich local detail, we\npropose a novel representation for terrain point clouds. This representation\ncan help to repair the holes without clear boundaries. Specifically, it\ndecomposes terrains into low-frequency and high-frequency components, which are\nrepresented by B-spline surfaces and relative height maps respectively. In this\nway, the terrain point cloud inpainting problem is transformed into a B-spline\nsurface fitting and 2D image inpainting problem. By solving the two problems,\nthe highly complex and irregular holes on the terrain point clouds can be\nwell-filled, which not only satisfies the global terrain undulation but also\nexhibits rich geometric details. The experimental results also demonstrate the\neffectiveness of our method.\n","authors":["Yizhou Xie","Xiangning Xie","Yuran Wang","Yanci Zhang","Zejun Lv"],"pdf_url":"https://arxiv.org/pdf/2404.03572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14817v3","updated":"2024-04-04T16:27:06Z","published":"2024-02-22T18:59:56Z","title":"Cameras as Rays: Pose Estimation via Ray Diffusion","summary":" Estimating camera poses is a fundamental task for 3D reconstruction and\nremains challenging given sparsely sampled views (<10). In contrast to existing\napproaches that pursue top-down prediction of global parametrizations of camera\nextrinsics, we propose a distributed representation of camera pose that treats\na camera as a bundle of rays. This representation allows for a tight coupling\nwith spatial image features improving pose precision. 
We observe that this\nrepresentation is naturally suited for set-level transformers and develop a\nregression-based approach that maps image patches to corresponding rays. To\ncapture the inherent uncertainties in sparse-view pose inference, we adapt this\napproach to learn a denoising diffusion model which allows us to sample\nplausible modes while improving performance. Our proposed methods, both\nregression- and diffusion-based, demonstrate state-of-the-art performance on\ncamera pose estimation on CO3D while generalizing to unseen object categories\nand in-the-wild captures.\n","authors":["Jason Y. Zhang","Amy Lin","Moneish Kumar","Tzu-Hsuan Yang","Deva Ramanan","Shubham Tulsiani"],"pdf_url":"https://arxiv.org/pdf/2402.14817v3.pdf","comment":"In ICLR 2024 (oral). v2-3: updated references. Project webpage:\n https://jasonyzhang.com/RayDiffusion"},{"id":"http://arxiv.org/abs/2404.03566v1","updated":"2024-04-04T16:24:32Z","published":"2024-04-04T16:24:32Z","title":"PointInfinity: Resolution-Invariant Point Diffusion Models","summary":" We present PointInfinity, an efficient family of point cloud diffusion\nmodels. Our core idea is to use a transformer-based architecture with a\nfixed-size, resolution-invariant latent representation. This enables efficient\ntraining with low-resolution point clouds, while allowing high-resolution point\nclouds to be generated during inference. More importantly, we show that scaling\nthe test-time resolution beyond the training resolution improves the fidelity\nof generated point clouds and surfaces. We analyze this phenomenon and draw a\nlink to classifier-free guidance commonly used in diffusion models,\ndemonstrating that both allow trading off fidelity and variability during\ninference. Experiments on CO3D show that PointInfinity can efficiently generate\nhigh-resolution point clouds (up to 131k points, 31 times more than Point-E)\nwith state-of-the-art quality.\n","authors":["Zixuan Huang","Justin Johnson","Shoubhik Debnath","James M. Rehg","Chao-Yuan Wu"],"pdf_url":"https://arxiv.org/pdf/2404.03566v1.pdf","comment":"Accepted to CVPR 2024, project website at\n https://zixuanh.com/projects/pointinfinity"},{"id":"http://arxiv.org/abs/2403.01598v2","updated":"2024-04-04T16:12:51Z","published":"2024-03-03T19:52:43Z","title":"APISR: Anime Production Inspired Real-World Anime Super-Resolution","summary":" While real-world anime super-resolution (SR) has gained increasing attention\nin the SR community, existing methods still adopt techniques from the\nphotorealistic domain. In this paper, we analyze the anime production workflow\nand rethink how to use characteristics of it for the sake of the real-world\nanime SR. First, we argue that video networks and datasets are not necessary\nfor anime SR due to the repetition use of hand-drawing frames. Instead, we\npropose an anime image collection pipeline by choosing the least compressed and\nthe most informative frames from the video sources. Based on this pipeline, we\nintroduce the Anime Production-oriented Image (API) dataset. In addition, we\nidentify two anime-specific challenges of distorted and faint hand-drawn lines\nand unwanted color artifacts. We address the first issue by introducing a\nprediction-oriented compression module in the image degradation model and a\npseudo-ground truth preparation with enhanced hand-drawn lines. 
In addition, we\nintroduce the balanced twin perceptual loss combining both anime and\nphotorealistic high-level features to mitigate unwanted color artifacts and\nincrease visual clarity. We evaluate our method through extensive experiments\non the public benchmark, showing our method outperforms state-of-the-art anime\ndataset-trained approaches.\n","authors":["Boyang Wang","Fengyu Yang","Xihang Yu","Chao Zhang","Hanbin Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.01598v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03541v1","updated":"2024-04-04T15:49:01Z","published":"2024-04-04T15:49:01Z","title":"Segmentation-Guided Knee Radiograph Generation using Conditional\n Diffusion Models","summary":" Deep learning-based medical image processing algorithms require\nrepresentative data during development. In particular, surgical data might be\ndifficult to obtain, and high-quality public datasets are limited. To overcome\nthis limitation and augment datasets, a widely adopted solution is the\ngeneration of synthetic images. In this work, we employ conditional diffusion\nmodels to generate knee radiographs from contour and bone segmentations.\nRemarkably, two distinct strategies are presented by incorporating the\nsegmentation as a condition into the sampling and training process, namely,\nconditional sampling and conditional training. The results demonstrate that\nboth methods can generate realistic images while adhering to the conditioning\nsegmentation. The conditional training method outperforms the conditional\nsampling method and the conventional U-Net.\n","authors":["Siyuan Mei","Fuxin Fan","Fabian Wagner","Mareike Thies","Mingxuan Gu","Yipeng Sun","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2404.03541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03539v1","updated":"2024-04-04T15:47:30Z","published":"2024-04-04T15:47:30Z","title":"Is CLIP the main roadblock for fine-grained open-world perception?","summary":" Modern applications increasingly demand flexible computer vision models that\nadapt to novel concepts not encountered during training. This necessity is\npivotal in emerging domains like extended reality, robotics, and autonomous\ndriving, which require the ability to respond to open-world stimuli. A key\ningredient is the ability to identify objects based on free-form textual\nqueries defined at inference time - a task known as open-vocabulary object\ndetection. Multimodal backbones like CLIP are the main enabling technology for\ncurrent open-world perception solutions. Despite performing well on generic\nqueries, recent studies highlighted limitations on the fine-grained recognition\ncapabilities in open-vocabulary settings - i.e., for distinguishing subtle\nobject features like color, shape, and material. In this paper, we perform a\ndetailed examination of these open-vocabulary object recognition limitations to\nfind the root cause. We evaluate the performance of CLIP, the most commonly\nused vision-language backbone, against a fine-grained object-matching\nbenchmark, revealing interesting analogies between the limitations of\nopen-vocabulary object detectors and their backbones. Experiments suggest that\nthe lack of fine-grained understanding is caused by the poor separability of\nobject characteristics in the CLIP latent space. 
Therefore, we try to\nunderstand whether fine-grained knowledge is present in CLIP embeddings but not\nexploited at inference time due, for example, to the unsuitability of the\ncosine similarity matching function, which may discard important object\ncharacteristics. Our preliminary experiments show that simple CLIP latent-space\nre-projections help separate fine-grained concepts, paving the way towards the\ndevelopment of backbones inherently able to process fine-grained details. The\ncode for reproducing these experiments is available at\nhttps://github.com/lorebianchi98/FG-CLIP.\n","authors":["Lorenzo Bianchi","Fabio Carrara","Nicola Messina","Fabrizio Falchi"],"pdf_url":"https://arxiv.org/pdf/2404.03539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03537v1","updated":"2024-04-04T15:45:25Z","published":"2024-04-04T15:45:25Z","title":"If It's Not Enough, Make It So: Reducing Authentic Data Demand in Face\n Recognition through Synthetic Faces","summary":" Recent advances in deep face recognition have spurred a growing demand for\nlarge, diverse, and manually annotated face datasets. Acquiring authentic,\nhigh-quality data for face recognition has proven to be a challenge, primarily\ndue to privacy concerns. Large face datasets are primarily sourced from\nweb-based images, lacking explicit user consent. In this paper, we examine\nwhether and how synthetic face data can be used to train effective face\nrecognition models with reduced reliance on authentic images, thereby\nmitigating data collection concerns. First, we explored the performance gap\namong recent state-of-the-art face recognition models, trained with synthetic\ndata only and authentic (scarce) data only. Then, we deepened our analysis by\ntraining a state-of-the-art backbone with various combinations of synthetic and\nauthentic data, gaining insights into optimizing the limited use of the latter\nfor verification accuracy. Finally, we assessed the effectiveness of data\naugmentation approaches on synthetic and authentic data, with the same goal in\nmind. Our results highlighted the effectiveness of FR trained on combined\ndatasets, particularly when combined with appropriate augmentation techniques.\n","authors":["Andrea Atzori","Fadi Boutros","Naser Damer","Gianni Fenu","Mirko Marras"],"pdf_url":"https://arxiv.org/pdf/2404.03537v1.pdf","comment":"Accepted as a full paper at FG 2024 main track"},{"id":"http://arxiv.org/abs/2404.03531v1","updated":"2024-04-04T15:35:43Z","published":"2024-04-04T15:35:43Z","title":"COMO: Compact Mapping and Odometry","summary":" We present COMO, a real-time monocular mapping and odometry system that\nencodes dense geometry via a compact set of 3D anchor points. Decoding anchor\npoint projections into dense geometry via per-keyframe depth covariance\nfunctions guarantees that depth maps are joined together at visible anchor\npoints. The representation enables joint optimization of camera poses and dense\ngeometry, intrinsic 3D consistency, and efficient second-order inference. To\nmaintain a compact yet expressive map, we introduce a frontend that leverages\nthe covariance function for tracking and initializing potentially visually\nindistinct 3D points across frames. Altogether, we introduce a real-time system\ncapable of estimating accurate poses and consistent geometry.\n","authors":["Eric Dexheimer","Andrew J. 
Davison"],"pdf_url":"https://arxiv.org/pdf/2404.03531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03527v1","updated":"2024-04-04T15:31:11Z","published":"2024-04-04T15:31:11Z","title":"HAPNet: Toward Superior RGB-Thermal Scene Parsing via Hybrid,\n Asymmetric, and Progressive Heterogeneous Feature Fusion","summary":" Data-fusion networks have shown significant promise for RGB-thermal scene\nparsing. However, the majority of existing studies have relied on symmetric\nduplex encoders for heterogeneous feature extraction and fusion, paying\ninadequate attention to the inherent differences between RGB and thermal\nmodalities. Recent progress in vision foundation models (VFMs) trained through\nself-supervision on vast amounts of unlabeled data has proven their ability to\nextract informative, general-purpose features. However, this potential has yet\nto be fully leveraged in the domain. In this study, we take one step toward\nthis new research area by exploring a feasible strategy to fully exploit VFM\nfeatures for RGB-thermal scene parsing. Specifically, we delve deeper into the\nunique characteristics of RGB and thermal modalities, thereby designing a\nhybrid, asymmetric encoder that incorporates both a VFM and a convolutional\nneural network. This design allows for more effective extraction of\ncomplementary heterogeneous features, which are subsequently fused in a\ndual-path, progressive manner. Moreover, we introduce an auxiliary task to\nfurther enrich the local semantics of the fused features, thereby improving the\noverall performance of RGB-thermal scene parsing. Our proposed HAPNet, equipped\nwith all these components, demonstrates superior performance compared to all\nother state-of-the-art RGB-thermal scene parsing networks, achieving top ranks\nacross three widely used public RGB-thermal scene parsing datasets. We believe\nthis new paradigm has opened up new opportunities for future developments in\ndata-fusion scene parsing approaches.\n","authors":["Jiahang Li","Peng Yun","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2404.03527v1.pdf","comment":"12 pages, 4figures"},{"id":"http://arxiv.org/abs/2404.03518v1","updated":"2024-04-04T15:23:14Z","published":"2024-04-04T15:23:14Z","title":"SDPose: Tokenized Pose Estimation via Circulation-Guide\n Self-Distillation","summary":" Recently, transformer-based methods have achieved state-of-the-art prediction\nquality on human pose estimation(HPE). Nonetheless, most of these\ntop-performing transformer-based models are too computation-consuming and\nstorage-demanding to deploy on edge computing platforms. Those\ntransformer-based models that require fewer resources are prone to\nunder-fitting due to their smaller scale and thus perform notably worse than\ntheir larger counterparts. Given this conundrum, we introduce SDPose, a new\nself-distillation method for improving the performance of small\ntransformer-based models. To mitigate the problem of under-fitting, we design a\ntransformer module named Multi-Cycled Transformer(MCT) based on multiple-cycled\nforwards to more fully exploit the potential of small model parameters.\nFurther, in order to prevent the additional inference compute-consuming brought\nby MCT, we introduce a self-distillation scheme, extracting the knowledge from\nthe MCT module to a naive forward model. 
Specifically, on the MSCOCO validation\ndataset, SDPose-T obtains 69.7% mAP with 4.4M parameters and 1.8 GFLOPs.\nFurthermore, SDPose-S-V2 obtains 73.5% mAP on the MSCOCO validation dataset\nwith 6.2M parameters and 4.7 GFLOPs, achieving a new state-of-the-art among\npredominant tiny neural network methods. Our code is available at\nhttps://github.com/MartyrPenink/SDPose.\n","authors":["Sichen Chen","Yingyi Zhang","Siming Huang","Ran Yi","Ke Fan","Ruixin Zhang","Peixian Chen","Jun Wang","Shouhong Ding","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2404.03518v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03507v1","updated":"2024-04-04T15:10:24Z","published":"2024-04-04T15:10:24Z","title":"DQ-DETR: DETR with Dynamic Query for Tiny Object Detection","summary":" Despite previous DETR-like methods having performed successfully in generic\nobject detection, tiny object detection is still a challenging task for them\nsince the positional information of object queries is not customized for\ndetecting tiny objects, whose scale is extraordinarily smaller than general\nobjects. Also, DETR-like methods using a fixed number of queries make them\nunsuitable for aerial datasets, which only contain tiny objects, and the\nnumbers of instances are imbalanced between different images. Thus, we present\na simple yet effective model, named DQ-DETR, which consists of three different\ncomponents: categorical counting module, counting-guided feature enhancement,\nand dynamic query selection to solve the above-mentioned problems. DQ-DETR uses\nthe prediction and density maps from the categorical counting module to\ndynamically adjust the number of object queries and improve the positional\ninformation of queries. Our model DQ-DETR outperforms previous CNN-based and\nDETR-like methods, achieving state-of-the-art mAP 30.2% on the AI-TOD-V2\ndataset, which mostly consists of tiny objects.\n","authors":["Yi-Xin Huang","Hou-I Liu","Hong-Han Shuai","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.03507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11963v2","updated":"2024-04-04T15:10:23Z","published":"2023-03-21T15:50:08Z","title":"NEMTO: Neural Environment Matting for Novel View and Relighting\n Synthesis of Transparent Objects","summary":" We propose NEMTO, the first end-to-end neural rendering pipeline to model 3D\ntransparent objects with complex geometry and unknown indices of refraction.\nCommonly used appearance modeling such as the Disney BSDF model cannot\naccurately address this challenging problem due to the complex light paths\nbending through refractions and the strong dependency of surface appearance on\nillumination. With 2D images of the transparent object as input, our method is\ncapable of high-quality novel view and relighting synthesis. We leverage\nimplicit Signed Distance Functions (SDF) to model the object geometry and\npropose a refraction-aware ray bending network to model the effects of light\nrefraction within the object. Our ray bending network is more tolerant to\ngeometric inaccuracies than traditional physically-based methods for rendering\ntransparent objects. 
We provide extensive evaluations on both synthetic and\nreal-world datasets to demonstrate our high-quality synthesis and the\napplicability of our method.\n","authors":["Dongqing Wang","Tong Zhang","Sabine Süsstrunk"],"pdf_url":"https://arxiv.org/pdf/2303.11963v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2312.09228v3","updated":"2024-04-04T15:06:02Z","published":"2023-12-14T18:54:32Z","title":"3DGS-Avatar: Animatable Avatars via Deformable 3D Gaussian Splatting","summary":" We introduce an approach that creates animatable human avatars from monocular\nvideos using 3D Gaussian Splatting (3DGS). Existing methods based on neural\nradiance fields (NeRFs) achieve high-quality novel-view/novel-pose image\nsynthesis but often require days of training, and are extremely slow at\ninference time. Recently, the community has explored fast grid structures for\nefficient training of clothed avatars. Albeit being extremely fast at training,\nthese methods can barely achieve an interactive rendering frame rate with\naround 15 FPS. In this paper, we use 3D Gaussian Splatting and learn a\nnon-rigid deformation network to reconstruct animatable clothed human avatars\nthat can be trained within 30 minutes and rendered at real-time frame rates\n(50+ FPS). Given the explicit nature of our representation, we further\nintroduce as-isometric-as-possible regularizations on both the Gaussian mean\nvectors and the covariance matrices, enhancing the generalization of our model\non highly articulated unseen poses. Experimental results show that our method\nachieves comparable and even better performance compared to state-of-the-art\napproaches on animatable avatar creation from a monocular input, while being\n400x and 250x faster in training and inference, respectively.\n","authors":["Zhiyin Qian","Shaofei Wang","Marko Mihajlovic","Andreas Geiger","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2312.09228v3.pdf","comment":"Project page: https://neuralbodies.github.io/3DGS-Avatar"},{"id":"http://arxiv.org/abs/2403.19612v2","updated":"2024-04-04T14:44:23Z","published":"2024-03-28T17:32:01Z","title":"ILPO-NET: Network for the invariant recognition of arbitrary volumetric\n patterns in 3D","summary":" Effective recognition of spatial patterns and learning their hierarchy is\ncrucial in modern spatial data analysis. Volumetric data applications seek\ntechniques ensuring invariance not only to shifts but also to pattern\nrotations. While traditional methods can readily achieve translational\ninvariance, rotational invariance possesses multiple challenges and remains an\nactive area of research. Here, we present ILPO-Net (Invariant to Local Patterns\nOrientation Network), a novel approach that handles arbitrarily shaped patterns\nwith the convolutional operation inherently invariant to local spatial pattern\norientations using the Wigner matrix expansions. Our architecture seamlessly\nintegrates the new convolution operator and, when benchmarked on diverse\nvolumetric datasets such as MedMNIST and CATH, demonstrates superior\nperformance over the baselines with significantly reduced parameter counts - up\nto 1000 times fewer in the case of MedMNIST. Beyond these demonstrations,\nILPO-Net's rotational invariance paves the way for other applications across\nmultiple disciplines. 
Our code is publicly available at\nhttps://gricad-gitlab.univ-grenoble-alpes.fr/GruLab/ILPONet.\n","authors":["Dmitrii Zhemchuzhnikov","Sergei Grudinin"],"pdf_url":"https://arxiv.org/pdf/2403.19612v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01987v2","updated":"2024-04-04T14:40:21Z","published":"2023-12-04T16:04:41Z","title":"Bootstrapping SparseFormers from Vision Foundation Models","summary":" The recently proposed SparseFormer architecture provides an alternative\napproach to visual understanding by utilizing a significantly lower number of\nvisual tokens via adjusting RoIs, greatly reducing computational costs while\nstill achieving promising performance. However, training SparseFormers from\nscratch is still expensive, and scaling up the number of parameters can be\nchallenging. In this paper, we propose to bootstrap SparseFormers from\nViT-based vision foundation models in a simple and efficient way. Since the\nmajority of SparseFormer blocks are the standard transformer ones, we can\ninherit weights from large-scale pre-trained vision transformers and freeze\nthem as much as possible. Therefore, we only need to train the\nSparseFormer-specific lightweight focusing transformer to adjust token RoIs and\nfine-tune a few early pre-trained blocks to align the final token\nrepresentation. In such a way, we can bootstrap SparseFormer architectures from\nvarious large-scale pre-trained models (e.g., IN-21K pre-trained AugRegs or\nCLIPs) using a rather smaller amount of training samples (e.g., IN-1K) and\nwithout labels or captions within just a few hours. As a result, the\nbootstrapped unimodal SparseFormer (from AugReg-ViT-L/16-384) can reach 84.9%\naccuracy on IN-1K with only 49 tokens, and the multimodal SparseFormer from\nCLIPs also demonstrates notable zero-shot performance with highly reduced\ncomputational cost without seeing any caption during the bootstrapping\nprocedure. In addition, CLIP-bootstrapped SparseFormers, which align the output\nspace with language without seeing a word, can serve as efficient vision\nencoders in multimodal large language models. Code and models are available at\nhttps://github.com/showlab/sparseformer\n","authors":["Ziteng Gao","Zhan Tong","Kevin Qinghong Lin","Joya Chen","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2312.01987v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03482v1","updated":"2024-04-04T14:35:49Z","published":"2024-04-04T14:35:49Z","title":"AdaGlimpse: Active Visual Exploration with Arbitrary Glimpse Position\n and Scale","summary":" Active Visual Exploration (AVE) is a task that involves dynamically selecting\nobservations (glimpses), which is critical to facilitate comprehension and\nnavigation within an environment. While modern AVE methods have demonstrated\nimpressive performance, they are constrained to fixed-scale glimpses from rigid\ngrids. In contrast, existing mobile platforms equipped with optical zoom\ncapabilities can capture glimpses of arbitrary positions and scales. To address\nthis gap between software and hardware capabilities, we introduce AdaGlimpse.\nIt uses Soft Actor-Critic, a reinforcement learning algorithm tailored for\nexploration tasks, to select glimpses of arbitrary position and scale. This\napproach enables our model to rapidly establish a general awareness of the\nenvironment before zooming in for detailed analysis. 
Experimental results\ndemonstrate that AdaGlimpse surpasses previous methods across various visual\ntasks while maintaining greater applicability in realistic AVE scenarios.\n","authors":["Adam Pardyl","Michał Wronka","Maciej Wołczyk","Kamil Adamczewski","Tomasz Trzciński","Bartosz Zieliński"],"pdf_url":"https://arxiv.org/pdf/2404.03482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03477v1","updated":"2024-04-04T14:28:34Z","published":"2024-04-04T14:28:34Z","title":"Towards Automated Movie Trailer Generation","summary":" Movie trailers are an essential tool for promoting films and attracting\naudiences. However, the process of creating trailers can be time-consuming and\nexpensive. To streamline this process, we propose an automatic trailer\ngeneration framework that generates plausible trailers from a full movie by\nautomating shot selection and composition. Our approach draws inspiration from\nmachine translation techniques and models the movies and trailers as sequences\nof shots, thus formulating the trailer generation problem as a\nsequence-to-sequence task. We introduce Trailer Generation Transformer (TGT), a\ndeep-learning framework utilizing an encoder-decoder architecture. TGT movie\nencoder is tasked with contextualizing each movie shot representation via\nself-attention, while the autoregressive trailer decoder predicts the feature\nrepresentation of the next trailer shot, accounting for the relevance of shots'\ntemporal order in trailers. Our TGT significantly outperforms previous methods\non a comprehensive suite of metrics.\n","authors":["Dawit Mureja Argaw","Mattia Soldan","Alejandro Pardo","Chen Zhao","Fabian Caba Heilbron","Joon Son Chung","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2404.03477v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03474v1","updated":"2024-04-04T14:26:58Z","published":"2024-04-04T14:26:58Z","title":"Performance of computer vision algorithms for fine-grained\n classification using crowdsourced insect images","summary":" With fine-grained classification, we identify unique characteristics to\ndistinguish among classes of the same super-class. We are focusing on species\nrecognition in Insecta, as they are critical for biodiversity monitoring and at\nthe base of many ecosystems. With citizen science campaigns, billions of images\nare collected in the wild. Once these are labelled, experts can use them to\ncreate distribution maps. However, the labelling process is time-consuming,\nwhich is where computer vision comes in. The field of computer vision offers a\nwide range of algorithms, each with its strengths and weaknesses; how do we\nidentify the algorithm that is in line with our application? To answer this\nquestion, we provide a full and detailed evaluation of nine algorithms among\ndeep convolutional networks (CNN), vision transformers (ViT), and\nlocality-based vision transformers (LBVT) on 4 different aspects:\nclassification performance, embedding quality, computational cost, and gradient\nactivity. We offer insights that we haven't yet had in this domain proving to\nwhich extent these algorithms solve the fine-grained tasks in Insecta. We found\nthat the ViT performs the best on inference speed and computational cost while\nthe LBVT outperforms the others on performance and embedding quality; the CNN\nprovide a trade-off among the metrics.\n","authors":["Rita Pucci","Vincent J. 
Kalkman","Dan Stowell"],"pdf_url":"https://arxiv.org/pdf/2404.03474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03462v1","updated":"2024-04-04T14:13:56Z","published":"2024-04-04T14:13:56Z","title":"You Only Scan Once: A Dynamic Scene Reconstruction Pipeline for 6-DoF\n Robotic Grasping of Novel Objects","summary":" In the realm of robotic grasping, achieving accurate and reliable\ninteractions with the environment is a pivotal challenge. Traditional methods\nof grasp planning methods utilizing partial point clouds derived from depth\nimage often suffer from reduced scene understanding due to occlusion,\nultimately impeding their grasping accuracy. Furthermore, scene reconstruction\nmethods have primarily relied upon static techniques, which are susceptible to\nenvironment change during manipulation process limits their efficacy in\nreal-time grasping tasks. To address these limitations, this paper introduces a\nnovel two-stage pipeline for dynamic scene reconstruction. In the first stage,\nour approach takes scene scanning as input to register each target object with\nmesh reconstruction and novel object pose tracking. In the second stage, pose\ntracking is still performed to provide object poses in real-time, enabling our\napproach to transform the reconstructed object point clouds back into the\nscene. Unlike conventional methodologies, which rely on static scene snapshots,\nour method continuously captures the evolving scene geometry, resulting in a\ncomprehensive and up-to-date point cloud representation. By circumventing the\nconstraints posed by occlusion, our method enhances the overall grasp planning\nprocess and empowers state-of-the-art 6-DoF robotic grasping algorithms to\nexhibit markedly improved accuracy.\n","authors":["Lei Zhou","Haozhe Wang","Zhengshen Zhang","Zhiyang Liu","Francis EH Tay","adn Marcelo H. Ang. Jr"],"pdf_url":"https://arxiv.org/pdf/2404.03462v1.pdf","comment":"ICRA 2024"},{"id":"http://arxiv.org/abs/2404.03451v1","updated":"2024-04-04T13:55:06Z","published":"2024-04-04T13:55:06Z","title":"How Much Data are Enough? Investigating Dataset Requirements for\n Patch-Based Brain MRI Segmentation Tasks","summary":" Training deep neural networks reliably requires access to large-scale\ndatasets. However, obtaining such datasets can be challenging, especially in\nthe context of neuroimaging analysis tasks, where the cost associated with\nimage acquisition and annotation can be prohibitive. To mitigate both the time\nand financial costs associated with model development, a clear understanding of\nthe amount of data required to train a satisfactory model is crucial. This\npaper focuses on an early stage phase of deep learning research, prior to model\ndevelopment, and proposes a strategic framework for estimating the amount of\nannotated data required to train patch-based segmentation networks. This\nframework includes the establishment of performance expectations using a novel\nMinor Boundary Adjustment for Threshold (MinBAT) method, and standardizing\npatch selection through the ROI-based Expanded Patch Selection (REPS) method.\nOur experiments demonstrate that tasks involving regions of interest (ROIs)\nwith different sizes or shapes may yield variably acceptable Dice Similarity\nCoefficient (DSC) scores. By setting an acceptable DSC as the target, the\nrequired amount of training data can be estimated and even predicted as data\naccumulates. 
This approach could assist researchers and engineers in estimating\nthe cost associated with data collection and annotation when defining a new\nsegmentation task based on deep neural networks, ultimately contributing to\ntheir efficient translation to real-world applications.\n","authors":["Dongang Wang","Peilin Liu","Hengrui Wang","Heidi Beadnall","Kain Kyle","Linda Ly","Mariano Cabezas","Geng Zhan","Ryan Sullivan","Weidong Cai","Wanli Ouyang","Fernando Calamante","Michael Barnett","Chenyu Wang"],"pdf_url":"https://arxiv.org/pdf/2404.03451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13785v2","updated":"2024-04-04T13:52:17Z","published":"2024-01-24T20:06:59Z","title":"Unified Spatio-Temporal Tri-Perspective View Representation for 3D\n Semantic Occupancy Prediction","summary":" Holistic understanding and reasoning in 3D scenes play a vital role in the\nsuccess of autonomous driving systems. The evolution of 3D semantic occupancy\nprediction as a pretraining task for autonomous driving and robotic downstream\ntasks capture finer 3D details compared to methods like 3D detection. Existing\napproaches predominantly focus on spatial cues such as tri-perspective view\nembeddings (TPV), often overlooking temporal cues. This study introduces a\nspatiotemporal transformer architecture S2TPVFormer for temporally coherent 3D\nsemantic occupancy prediction. We enrich the prior process by including\ntemporal cues using a novel temporal cross-view hybrid attention mechanism\n(TCVHA) and generate spatiotemporal TPV embeddings (i.e. S2TPV embeddings).\nExperimental evaluations on the nuScenes dataset demonstrate a substantial 4.1%\nimprovement in mean Intersection over Union (mIoU) for 3D Semantic Occupancy\ncompared to TPVFormer, confirming the effectiveness of the proposed S2TPVFormer\nin enhancing 3D scene perception.\n","authors":["Sathira Silva","Savindu Bhashitha Wannigama","Gihan Jayatilaka","Muhammad Haris Khan","Roshan Ragel"],"pdf_url":"https://arxiv.org/pdf/2401.13785v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03446v1","updated":"2024-04-04T13:46:52Z","published":"2024-04-04T13:46:52Z","title":"SP$^2$OT: Semantic-Regularized Progressive Partial Optimal Transport for\n Imbalanced Clustering","summary":" Deep clustering, which learns representation and semantic clustering without\nlabels information, poses a great challenge for deep learning-based approaches.\nDespite significant progress in recent years, most existing methods focus on\nuniformly distributed datasets, significantly limiting the practical\napplicability of their methods. In this paper, we propose a more practical\nproblem setting named deep imbalanced clustering, where the underlying classes\nexhibit an imbalance distribution. To address this challenge, we introduce a\nnovel optimal transport-based pseudo-label learning framework. Our framework\nformulates pseudo-label generation as a Semantic-regularized Progressive\nPartial Optimal Transport (SP$^2$OT) problem, which progressively transports\neach sample to imbalanced clusters under several prior distribution and\nsemantic relation constraints, thus generating high-quality and imbalance-aware\npseudo-labels. To solve SP$^2$OT, we develop a Majorization-Minimization-based\noptimization algorithm. 
To be more precise, we employ the strategy of\nmajorization to reformulate the SP$^2$OT problem into a Progressive Partial\nOptimal Transport problem, which can be transformed into an unbalanced optimal\ntransport problem with augmented constraints and can be solved efficiently by a\nfast matrix scaling algorithm. Experiments on various datasets, including a\nhuman-curated long-tailed CIFAR100, challenging ImageNet-R, and large-scale\nsubsets of fine-grained iNaturalist2018 datasets, demonstrate the superiority\nof our method.\n","authors":["Chuyu Zhang","Hui Ren","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2404.03446v1.pdf","comment":"under review. arXiv admin note: substantial text overlap with\n arXiv:2401.09266"},{"id":"http://arxiv.org/abs/2404.03443v1","updated":"2024-04-04T13:43:11Z","published":"2024-04-04T13:43:11Z","title":"Part-Attention Based Model Make Occluded Person Re-Identification\n Stronger","summary":" The goal of occluded person re-identification (ReID) is to retrieve specific\npedestrians in occluded situations. However, occluded person ReID still suffers\nfrom background clutter and low-quality local feature representations, which\nlimits model performance. In our research, we introduce a new framework called\nPAB-ReID, which is a novel ReID model incorporating part-attention mechanisms\nto tackle the aforementioned issues effectively. Firstly, we introduce the\nhuman parsing label to guide the generation of more accurate human part\nattention maps. In addition, we propose a fine-grained feature focuser for\ngenerating fine-grained human local feature representations while suppressing\nbackground interference. Moreover, We also design a part triplet loss to\nsupervise the learning of human local features, which optimizes\nintra/inter-class distance. We conducted extensive experiments on specialized\nocclusion and regular ReID datasets, showcasing that our approach outperforms\nthe existing state-of-the-art methods.\n","authors":["Zhihao Chen","Yiyuan Ge"],"pdf_url":"https://arxiv.org/pdf/2404.03443v1.pdf","comment":"Accepted By International Joint Conference on Neural Networks"},{"id":"http://arxiv.org/abs/2312.12080v2","updated":"2024-04-04T13:36:21Z","published":"2023-12-19T11:57:54Z","title":"Learning Subject-Aware Cropping by Outpainting Professional Photos","summary":" How to frame (or crop) a photo often depends on the image subject and its\ncontext; e.g., a human portrait. Recent works have defined the subject-aware\nimage cropping task as a nuanced and practical version of image cropping. We\npropose a weakly-supervised approach (GenCrop) to learn what makes a\nhigh-quality, subject-aware crop from professional stock images. Unlike\nsupervised prior work, GenCrop requires no new manual annotations beyond the\nexisting stock image collection. The key challenge in learning from this data,\nhowever, is that the images are already cropped and we do not know what regions\nwere removed. Our insight is to combine a library of stock images with a\nmodern, pre-trained text-to-image diffusion model. The stock image collection\nprovides diversity and its images serve as pseudo-labels for a good crop, while\nthe text-image diffusion model is used to out-paint (i.e., outward inpainting)\nrealistic uncropped images. Using this procedure, we are able to automatically\ngenerate a large dataset of cropped-uncropped training pairs to train a\ncropping model. 
Despite being weakly-supervised, GenCrop is competitive with\nstate-of-the-art supervised methods and significantly better than comparable\nweakly-supervised baselines on quantitative and qualitative evaluation metrics.\n","authors":["James Hong","Lu Yuan","Michaël Gharbi","Matthew Fisher","Kayvon Fatahalian"],"pdf_url":"https://arxiv.org/pdf/2312.12080v2.pdf","comment":"AAAI 24. Extended version with supplemental materials"},{"id":"http://arxiv.org/abs/2404.02656v2","updated":"2024-04-04T13:30:59Z","published":"2024-04-03T11:37:03Z","title":"Non-negative Subspace Feature Representation for Few-shot Learning in\n Medical Imaging","summary":" Unlike typical visual scene recognition domains, in which massive datasets\nare accessible to deep neural networks, medical image interpretations are often\nobstructed by the paucity of data. In this paper, we investigate the\neffectiveness of data-based few-shot learning in medical imaging by exploring\ndifferent data attribute representations in a low-dimensional space. We\nintroduce different types of non-negative matrix factorization (NMF) in\nfew-shot learning, addressing the data scarcity issue in medical image\nclassification. Extensive empirical studies are conducted in terms of\nvalidating the effectiveness of NMF, especially its supervised variants (e.g.,\ndiscriminative NMF, and supervised and constrained NMF with sparseness), and\nthe comparison with principal component analysis (PCA), i.e., the collaborative\nrepresentation-based dimensionality reduction technique derived from\neigenvectors. With 14 different datasets covering 11 distinct illness\ncategories, thorough experimental results and comparison with related\ntechniques demonstrate that NMF is a competitive alternative to PCA for\nfew-shot learning in medical imaging, and the supervised NMF algorithms are\nmore discriminative in the subspace with greater effectiveness. Furthermore, we\nshow that the part-based representation of NMF, especially its supervised\nvariants, is dramatically impactful in detecting lesion areas in medical\nimaging with limited samples.\n","authors":["Keqiang Fan","Xiaohao Cai","Mahesan Niranjan"],"pdf_url":"https://arxiv.org/pdf/2404.02656v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14162v3","updated":"2024-04-04T13:29:25Z","published":"2023-09-25T14:13:26Z","title":"Data Upcycling Knowledge Distillation for Image Super-Resolution","summary":" Knowledge distillation (KD) compresses deep neural networks by transferring\ntask-related knowledge from cumbersome pre-trained teacher models to compact\nstudent models. However, current KD methods for super-resolution (SR) networks\noverlook the nature of SR task that the outputs of the teacher model are noisy\napproximations to the ground-truth distribution of high-quality images (GT),\nwhich shades the teacher model's knowledge to result in limited KD effects. To\nutilize the teacher model beyond the GT upper-bound, we present the Data\nUpcycling Knowledge Distillation (DUKD), to transfer the teacher model's\nknowledge to the student model through the upcycled in-domain data derived from\ntraining data. Besides, we impose label consistency regularization to KD for SR\nby the paired invertible augmentations to improve the student model's\nperformance and robustness. 
Comprehensive experiments demonstrate that the DUKD\nmethod significantly outperforms previous arts on several SR tasks.\n","authors":["Yun Zhang","Wei Li","Simiao Li","Hanting Chen","Zhijun Tu","Wenjia Wang","Bingyi Jing","Shaohui Lin","Jie Hu"],"pdf_url":"https://arxiv.org/pdf/2309.14162v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03425v1","updated":"2024-04-04T13:06:25Z","published":"2024-04-04T13:06:25Z","title":"ChangeMamba: Remote Sensing Change Detection with Spatio-Temporal State\n Space Model","summary":" Convolutional neural networks (CNN) and Transformers have made impressive\nprogress in the field of remote sensing change detection (CD). However, both\narchitectures have their inherent shortcomings. Recently, the Mamba\narchitecture, based on spatial state models, has shown remarkable performance\nin a series of natural language processing tasks, which can effectively\ncompensate for the shortcomings of the above two architectures. In this paper,\nwe explore for the first time the potential of the Mamba architecture for\nremote sensing change detection tasks. We tailor the corresponding frameworks,\ncalled MambaBCD, MambaSCD, and MambaBDA, for binary change detection (BCD),\nsemantic change detection (SCD), and building damage assessment (BDA),\nrespectively. All three frameworks adopt the cutting-edge visual Mamba\narchitecture as the encoder, which allows full learning of global spatial\ncontextual information from the input images. For the change decoder, which is\navailable in all three architectures, we propose three spatio-temporal\nrelationship modeling mechanisms, which can be naturally combined with the\nMamba architecture and fully utilize its attribute to achieve spatio-temporal\ninteraction of multi-temporal features and obtain accurate change information.\nOn five benchmark datasets, our proposed frameworks outperform current CNN- and\nTransformer-based approaches without using any complex strategies or tricks,\nfully demonstrating the potential of the Mamba architecture. Specifically, we\nobtained 83.11%, 88.39% and 94.19% F1 scores on the three BCD datasets SYSU,\nLEVIR-CD+, and WHU-CD; on the SCD dataset SECOND, we obtained 24.04% SeK; and\non the xBD dataset, we obtained 81.41% overall F1 score. The source code will\nbe available in https://github.com/ChenHongruixuan/MambaCD\n","authors":["Hongruixuan Chen","Jian Song","Chengxi Han","Junshi Xia","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2404.03425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00778v2","updated":"2024-04-04T13:00:20Z","published":"2023-12-01T18:55:53Z","title":"MorpheuS: Neural Dynamic 360° Surface Reconstruction from Monocular\n RGB-D Video","summary":" Neural rendering has demonstrated remarkable success in dynamic scene\nreconstruction. Thanks to the expressiveness of neural representations, prior\nworks can accurately capture the motion and achieve high-fidelity\nreconstruction of the target object. Despite this, real-world video scenarios\noften feature large unobserved regions where neural representations struggle to\nachieve realistic completion. To tackle this challenge, we introduce MorpheuS,\na framework for dynamic 360{\\deg} surface reconstruction from a casually\ncaptured RGB-D video. Our approach models the target scene as a canonical field\nthat encodes its geometry and appearance, in conjunction with a deformation\nfield that warps points from the current frame to the canonical space. 
We\nleverage a view-dependent diffusion prior and distill knowledge from it to\nachieve realistic completion of unobserved regions. Experimental results on\nvarious real-world and synthetic datasets show that our method can achieve\nhigh-fidelity 360{\\deg} surface reconstruction of a deformable object from a\nmonocular RGB-D video.\n","authors":["Hengyi Wang","Jingwen Wang","Lourdes Agapito"],"pdf_url":"https://arxiv.org/pdf/2312.00778v2.pdf","comment":"CVPR2024. Project page:\n https://hengyiwang.github.io/projects/morpheus"},{"id":"http://arxiv.org/abs/2404.03421v1","updated":"2024-04-04T12:58:46Z","published":"2024-04-04T12:58:46Z","title":"Generalizable 3D Scene Reconstruction via Divide and Conquer from a\n Single View","summary":" Single-view 3D reconstruction is currently approached from two dominant\nperspectives: reconstruction of scenes with limited diversity using 3D data\nsupervision or reconstruction of diverse singular objects using large image\npriors. However, real-world scenarios are far more complex and exceed the\ncapabilities of these methods. We therefore propose a hybrid method following a\ndivide-and-conquer strategy. We first process the scene holistically,\nextracting depth and semantic information, and then leverage a single-shot\nobject-level method for the detailed reconstruction of individual components.\nBy following a compositional processing approach, the overall framework\nachieves full reconstruction of complex 3D scenes from a single image. We\npurposely design our pipeline to be highly modular by carefully integrating\nspecific procedures for each processing step, without requiring an end-to-end\ntraining of the whole system. This enables the pipeline to naturally improve as\nfuture methods can replace the individual modules. We demonstrate the\nreconstruction performance of our approach on both synthetic and real-world\nscenes, comparing favorably against prior works. Project page:\nhttps://andreeadogaru.github.io/Gen3DSR.\n","authors":["Andreea Dogaru","Mert Özer","Bernhard Egger"],"pdf_url":"https://arxiv.org/pdf/2404.03421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03417v1","updated":"2024-04-04T12:50:51Z","published":"2024-04-04T12:50:51Z","title":"NMF-Based Analysis of Mobile Eye-Tracking Data","summary":" The depiction of scanpaths from mobile eye-tracking recordings by thumbnails\nfrom the stimulus allows the application of visual computing to detect areas of\ninterest in an unsupervised way. We suggest using nonnegative matrix\nfactorization (NMF) to identify such areas in stimuli. For a user-defined\ninteger k, NMF produces an explainable decomposition into k components, each\nconsisting of a spatial representation associated with a temporal indicator. In\nthe context of multiple eye-tracking recordings, this leads to k spatial\nrepresentations, where the temporal indicator highlights the appearance within\nrecordings. The choice of k provides an opportunity to control the refinement\nof the decomposition, i.e., the number of areas to detect. We combine our\nNMF-based approach with visualization techniques to enable an exploratory\nanalysis of multiple recordings. 
Finally, we demonstrate the usefulness of our\napproach with mobile eye-tracking data of an art gallery.\n","authors":["Daniel Klötzl","Tim Krake","Frank Heyen","Michael Becher","Maurice Koch","Daniel Weiskopf","Kuno Kurzhals"],"pdf_url":"https://arxiv.org/pdf/2404.03417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03415v1","updated":"2024-04-04T12:49:42Z","published":"2024-04-04T12:49:42Z","title":"Future Predictive Success-or-Failure Classification for Long-Horizon\n Robotic Tasks","summary":" Automating long-horizon tasks with a robotic arm has been a central research\ntopic in robotics. Optimization-based action planning is an efficient approach\nfor creating an action plan to complete a given task. Construction of a\nreliable planning method requires a design process of conditions, e.g., to\navoid collision between objects. The design process, however, has two critical\nissues: 1) iterative trials--the design process is time-consuming due to the\ntrial-and-error process of modifying conditions, and 2) manual redesign--it is\ndifficult to cover all the necessary conditions manually. To tackle these\nissues, this paper proposes a future-predictive\nsuccess-or-failure-classification method to obtain conditions automatically.\nThe key idea behind the proposed method is an end-to-end approach for\ndetermining whether the action plan can complete a given task instead of\nmanually redesigning the conditions. The proposed method uses a long-horizon\nfuture-prediction method to enable success-or-failure classification without\nthe execution of an action plan. This paper also proposes a regularization term\ncalled transition consistency regularization to provide easy-to-predict feature\ndistribution. The regularization term improves future prediction and\nclassification performance. The effectiveness of our method is demonstrated\nthrough classification and robotic-manipulation experiments.\n","authors":["Naoya Sogi","Hiroyuki Oyama","Takashi Shibata","Makoto Terao"],"pdf_url":"https://arxiv.org/pdf/2404.03415v1.pdf","comment":"IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.03413v1","updated":"2024-04-04T12:46:01Z","published":"2024-04-04T12:46:01Z","title":"MiniGPT4-Video: Advancing Multimodal LLMs for Video Understanding with\n Interleaved Visual-Textual Tokens","summary":" This paper introduces MiniGPT4-Video, a multimodal Large Language Model (LLM)\ndesigned specifically for video understanding. The model is capable of\nprocessing both temporal visual and textual data, making it adept at\nunderstanding the complexities of videos. Building upon the success of\nMiniGPT-v2, which excelled in translating visual features into the LLM space\nfor single images and achieved impressive results on various image-text\nbenchmarks, this paper extends the model's capabilities to process a sequence\nof frames, enabling it to comprehend videos. MiniGPT4-video does not only\nconsider visual content but also incorporates textual conversations, allowing\nthe model to effectively answer queries involving both visual and text\ncomponents. The proposed model outperforms existing state-of-the-art methods,\nregistering gains of 4.22%, 1.13%, 20.82%, and 13.1% on the MSVD, MSRVTT, TGIF,\nand TVQA benchmarks respectively. 
Our models and code have been made publicly\navailable here https://vision-cair.github.io/MiniGPT4-video/\n","authors":["Kirolos Ataallah","Xiaoqian Shen","Eslam Abdelrahman","Essam Sleiman","Deyao Zhu","Jian Ding","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2404.03413v1.pdf","comment":"6 pages,8 figures"},{"id":"http://arxiv.org/abs/2403.16612v2","updated":"2024-04-04T12:35:33Z","published":"2024-03-25T10:42:48Z","title":"Calibrating Bayesian UNet++ for Sub-Seasonal Forecasting","summary":" Seasonal forecasting is a crucial task when it comes to detecting the extreme\nheat and colds that occur due to climate change. Confidence in the predictions\nshould be reliable since a small increase in the temperatures in a year has a\nbig impact on the world. Calibration of the neural networks provides a way to\nensure our confidence in the predictions. However, calibrating regression\nmodels is an under-researched topic, especially in forecasters. We calibrate a\nUNet++ based architecture, which was shown to outperform physics-based models\nin temperature anomalies. We show that with a slight trade-off between\nprediction error and calibration error, it is possible to get more reliable and\nsharper forecasts. We believe that calibration should be an important part of\nsafety-critical machine learning applications such as weather forecasters.\n","authors":["Busra Asan","Abdullah Akgül","Alper Unal","Melih Kandemir","Gozde Unal"],"pdf_url":"https://arxiv.org/pdf/2403.16612v2.pdf","comment":"Accepted as a workshop paper at \"ICLR 2024 Tackling Climate Change\n with Machine Learning\""},{"id":"http://arxiv.org/abs/2404.03407v1","updated":"2024-04-04T12:12:24Z","published":"2024-04-04T12:12:24Z","title":"AIGIQA-20K: A Large Database for AI-Generated Image Quality Assessment","summary":" With the rapid advancements in AI-Generated Content (AIGC), AI-Generated\nImages (AIGIs) have been widely applied in entertainment, education, and social\nmedia. However, due to the significant variance in quality among different\nAIGIs, there is an urgent need for models that consistently match human\nsubjective ratings. To address this issue, we organized a challenge towards\nAIGC quality assessment on NTIRE 2024 that extensively considers 15 popular\ngenerative models, utilizing dynamic hyper-parameters (including\nclassifier-free guidance, iteration epochs, and output image resolution), and\ngather subjective scores that consider perceptual quality and text-to-image\nalignment altogether comprehensively involving 21 subjects. This approach\nculminates in the creation of the largest fine-grained AIGI subjective quality\ndatabase to date with 20,000 AIGIs and 420,000 subjective ratings, known as\nAIGIQA-20K. Furthermore, we conduct benchmark experiments on this database to\nassess the correspondence between 16 mainstream AIGI quality models and human\nperception. We anticipate that this large-scale quality database will inspire\nrobust quality indicators for AIGIs and propel the evolution of AIGC for\nvision. 
The database is released on\nhttps://www.modelscope.cn/datasets/lcysyzxdxc/AIGCQA-30K-Image.\n","authors":["Chunyi Li","Tengchuan Kou","Yixuan Gao","Yuqin Cao","Wei Sun","Zicheng Zhang","Yingjie Zhou","Zhichao Zhang","Weixia Zhang","Haoning Wu","Xiaohong Liu","Xiongkuo Min","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2404.03407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03398v1","updated":"2024-04-04T11:59:06Z","published":"2024-04-04T11:59:06Z","title":"Scaling Up Video Summarization Pretraining with Large Language Models","summary":" Long-form video content constitutes a significant portion of internet\ntraffic, making automated video summarization an essential research problem.\nHowever, existing video summarization datasets are notably limited in their\nsize, constraining the effectiveness of state-of-the-art methods for\ngeneralization. Our work aims to overcome this limitation by capitalizing on\nthe abundance of long-form videos with dense speech-to-video alignment and the\nremarkable capabilities of recent large language models (LLMs) in summarizing\nlong text. We introduce an automated and scalable pipeline for generating a\nlarge-scale video summarization dataset using LLMs as Oracle summarizers. By\nleveraging the generated dataset, we analyze the limitations of existing\napproaches and propose a new video summarization model that effectively\naddresses them. To facilitate further research in the field, our work also\npresents a new benchmark dataset that contains 1200 long videos each with\nhigh-quality summaries annotated by professionals. Extensive experiments\nclearly indicate that our proposed approach sets a new state-of-the-art in\nvideo summarization across several benchmarks.\n","authors":["Dawit Mureja Argaw","Seunghyun Yoon","Fabian Caba Heilbron","Hanieh Deilamsalehy","Trung Bui","Zhaowen Wang","Franck Dernoncourt","Joon Son Chung"],"pdf_url":"https://arxiv.org/pdf/2404.03398v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03394v1","updated":"2024-04-04T11:53:37Z","published":"2024-04-04T11:53:37Z","title":"Background Noise Reduction of Attention Map for Weakly Supervised\n Semantic Segmentation","summary":" In weakly-supervised semantic segmentation (WSSS) using only image-level\nclass labels, a problem with CNN-based Class Activation Maps (CAM) is that they\ntend to activate the most discriminative local regions of objects. On the other\nhand, methods based on Transformers learn global features but suffer from the\nissue of background noise contamination. This paper focuses on addressing the\nissue of background noise in attention weights within the existing WSSS method\nbased on Conformer, known as TransCAM. The proposed method successfully reduces\nbackground noise, leading to improved accuracy of pseudo labels. Experimental\nresults demonstrate that our model achieves segmentation performance of 70.5%\non the PASCAL VOC 2012 validation data, 71.1% on the test data, and 45.9% on MS\nCOCO 2014 data, outperforming TransCAM in terms of segmentation performance.\n","authors":["Izumi Fujimori","Masaki Oono","Masami Shishibori"],"pdf_url":"https://arxiv.org/pdf/2404.03394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03392v1","updated":"2024-04-04T11:49:56Z","published":"2024-04-04T11:49:56Z","title":"Two Tricks to Improve Unsupervised Segmentation Learning","summary":" We present two practical improvement techniques for unsupervised segmentation\nlearning. 
These techniques address limitations in the resolution and accuracy\nof predicted segmentation maps of recent state-of-the-art methods. Firstly, we\nleverage image post-processing techniques such as guided filtering to refine\nthe output masks, improving accuracy while avoiding substantial computational\ncosts. Secondly, we introduce a multi-scale consistency criterion, based on a\nteacher-student training scheme. This criterion matches segmentation masks\npredicted from regions of the input image extracted at different resolutions to\neach other. Experimental results on several benchmarks used in unsupervised\nsegmentation learning demonstrate the effectiveness of our proposed techniques.\n","authors":["Alp Eren Sari","Francesco Locatello","Paolo Favar"],"pdf_url":"https://arxiv.org/pdf/2404.03392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03384v1","updated":"2024-04-04T11:33:29Z","published":"2024-04-04T11:33:29Z","title":"LongVLM: Efficient Long Video Understanding via Large Language Models","summary":" Empowered by Large Language Models (LLMs), recent advancements in VideoLLMs\nhave driven progress in various video understanding tasks. These models encode\nvideo representations through pooling or query aggregation over a vast number\nof visual tokens, making computational and memory costs affordable. Despite\nsuccessfully providing an overall comprehension of video content, existing\nVideoLLMs still face challenges in achieving detailed understanding in videos\ndue to overlooking local information in long-term videos. To tackle this\nchallenge, we introduce LongVLM, a straightforward yet powerful VideoLLM for\nlong video understanding, building upon the observation that long videos often\nconsist of sequential key events, complex actions, and camera movements. Our\napproach proposes to decompose long videos into multiple short-term segments\nand encode local features for each local segment via a hierarchical token\nmerging module. These features are concatenated in temporal order to maintain\nthe storyline across sequential short-term segments. Additionally, we propose\nto integrate global semantics into each local feature to enhance context\nunderstanding. In this way, we encode video representations that incorporate\nboth local and global information, enabling the LLM to generate comprehensive\nresponses for long-term videos. Experimental results on the VideoChatGPT\nbenchmark and zero-shot video question-answering datasets demonstrate the\nsuperior capabilities of our model over the previous state-of-the-art methods.\nQualitative examples demonstrate that our model produces more precise responses\nfor long videos understanding. Code is available at\n\\url{https://github.com/ziplab/LongVLM}.\n","authors":["Yuetian Weng","Mingfei Han","Haoyu He","Xiaojun Chang","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2404.03384v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03349v1","updated":"2024-04-04T10:30:28Z","published":"2024-04-04T10:30:28Z","title":"VF-NeRF: Viewshed Fields for Rigid NeRF Registration","summary":" 3D scene registration is a fundamental problem in computer vision that seeks\nthe best 6-DoF alignment between two scenes. This problem was extensively\ninvestigated in the case of point clouds and meshes, but there has been\nrelatively limited work regarding Neural Radiance Fields (NeRF). In this paper,\nwe consider the problem of rigid registration between two NeRFs when the\nposition of the original cameras is not given. 
Our key novelty is the\nintroduction of Viewshed Fields (VF), an implicit function that determines, for\neach 3D point, how likely it is to be viewed by the original cameras. We\ndemonstrate how VF can help in the various stages of NeRF registration, with an\nextensive evaluation showing that VF-NeRF achieves SOTA results on various\ndatasets with different capturing approaches such as LLFF and Objaverese.\n","authors":["Leo Segre","Shai Avidan"],"pdf_url":"https://arxiv.org/pdf/2404.03349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03340v1","updated":"2024-04-04T10:10:38Z","published":"2024-04-04T10:10:38Z","title":"Meta Invariance Defense Towards Generalizable Robustness to Unknown\n Adversarial Attacks","summary":" Despite providing high-performance solutions for computer vision tasks, the\ndeep neural network (DNN) model has been proved to be extremely vulnerable to\nadversarial attacks. Current defense mainly focuses on the known attacks, but\nthe adversarial robustness to the unknown attacks is seriously overlooked.\nBesides, commonly used adaptive learning and fine-tuning technique is\nunsuitable for adversarial defense since it is essentially a zero-shot problem\nwhen deployed. Thus, to tackle this challenge, we propose an attack-agnostic\ndefense method named Meta Invariance Defense (MID). Specifically, various\ncombinations of adversarial attacks are randomly sampled from a manually\nconstructed Attacker Pool to constitute different defense tasks against unknown\nattacks, in which a student encoder is supervised by multi-consistency\ndistillation to learn the attack-invariant features via a meta principle. The\nproposed MID has two merits: 1) Full distillation from pixel-, feature- and\nprediction-level between benign and adversarial samples facilitates the\ndiscovery of attack-invariance. 2) The model simultaneously achieves robustness\nto the imperceptible adversarial perturbations in high-level image\nclassification and attack-suppression in low-level robust image regeneration.\nTheoretical and empirical studies on numerous benchmarks such as ImageNet\nverify the generalizable robustness and superiority of MID under various\nattacks.\n","authors":["Lei Zhang","Yuhang Zhou","Yi Yang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2404.03340v1.pdf","comment":"Accepted by IEEE TPAMI in 2024"},{"id":"http://arxiv.org/abs/2404.03327v1","updated":"2024-04-04T09:53:00Z","published":"2024-04-04T09:53:00Z","title":"DI-Retinex: Digital-Imaging Retinex Theory for Low-Light Image\n Enhancement","summary":" Many existing methods for low-light image enhancement (LLIE) based on Retinex\ntheory ignore important factors that affect the validity of this theory in\ndigital imaging, such as noise, quantization error, non-linearity, and dynamic\nrange overflow. In this paper, we propose a new expression called\nDigital-Imaging Retinex theory (DI-Retinex) through theoretical and\nexperimental analysis of Retinex theory in digital imaging. Our new expression\nincludes an offset term in the enhancement model, which allows for pixel-wise\nbrightness contrast adjustment with a non-linear mapping function. In addition,\nto solve the lowlight enhancement problem in an unsupervised manner, we propose\nan image-adaptive masked reverse degradation loss in Gamma space. 
We also\ndesign a variance suppression loss for regulating the additional offset term.\nExtensive experiments show that our proposed method outperforms all existing\nunsupervised methods in terms of visual quality, model size, and speed. Our\nalgorithm can also assist downstream face detectors in low-light conditions, as it shows\nthe largest performance gain after low-light enhancement compared to other\nmethods.\n","authors":["Shangquan Sun","Wenqi Ren","Jingyang Peng","Fenglong Song","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2404.03327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01064v2","updated":"2024-04-04T09:48:30Z","published":"2024-04-01T11:57:34Z","title":"Roadside Monocular 3D Detection via 2D Detection Prompting","summary":" The problem of roadside monocular 3D detection requires detecting objects of\nclasses of interest in a 2D RGB frame and predicting their 3D information such\nas locations in bird's-eye-view (BEV). It has broad applications in traffic\ncontrol, vehicle-vehicle communication, and vehicle-infrastructure cooperative\nperception. To approach this problem, we present a novel and simple method by\nprompting the 3D detector using 2D detections. Our method builds on a key\ninsight that, compared with 3D detectors, a 2D detector is much easier to train\nand performs significantly better w.r.t. detections on the 2D image plane. That\nsaid, one can exploit the 2D detections of a well-trained 2D detector as prompts to\na 3D detector, which is trained to inflate such 2D detections to 3D\nfor 3D detection. To construct better prompts using the 2D detector, we\nexplore three techniques: (a) concatenating both 2D and 3D detectors' features,\n(b) attentively fusing 2D and 3D detectors' features, and (c) encoding\nthe predicted 2D boxes (x, y, width, height, label) and attentively fusing them with\nthe 3D detector's features. Surprisingly, the third performs the best.\nMoreover, we present a yaw tuning tactic and a class-grouping strategy that\nmerges classes based on their functionality; these techniques improve 3D\ndetection performance further. Comprehensive ablation studies and extensive\nexperiments demonstrate that our method resoundingly outperforms prior works,\nachieving the state-of-the-art on two large-scale roadside 3D detection\nbenchmarks.\n","authors":["Yechi Ma","Shuoquan Wei","Churun Zhang","Wei Hua","Yanan Li","Shu Kong"],"pdf_url":"https://arxiv.org/pdf/2404.01064v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03323v1","updated":"2024-04-04T09:43:43Z","published":"2024-04-04T09:43:43Z","title":"Sparse Concept Bottleneck Models: Gumbel Tricks in Contrastive Learning","summary":" We propose a novel architecture and method of explainable classification with\nConcept Bottleneck Models (CBMs). While SOTA approaches to the Image Classification\ntask work as black boxes, there is a growing demand for models that would\nprovide interpretable results. Such models often learn to predict the\ndistribution over class labels using additional descriptions of the target\ninstances, called concepts. However, existing Bottleneck methods have a number\nof limitations: their accuracy is lower than that of a standard model, and CBMs\nrequire an additional set of concepts to leverage. We provide a framework for\ncreating a Concept Bottleneck Model from a pre-trained multi-modal encoder and new\nCLIP-like architectures. 
By introducing a new type of layer known as Concept\nBottleneck Layers, we outline three methods for training them: with\n$\\ell_1$-loss, with contrastive loss, and with a loss function based on the Gumbel-Softmax\ndistribution (Sparse-CBM), while the final FC layer is still trained with\nCross-Entropy. We show a significant increase in accuracy using sparse hidden\nlayers in CLIP-based bottleneck models. This means that a sparse representation\nof the concept activation vector is meaningful in Concept Bottleneck Models.\nMoreover, with our Concept Matrix Search algorithm we can improve CLIP\npredictions on complex datasets without any additional training or fine-tuning.\nThe code is available at: https://github.com/Andron00e/SparseCBM.\n","authors":["Andrei Semenov","Vladimir Ivanov","Aleksandr Beznosikov","Alexander Gasnikov"],"pdf_url":"https://arxiv.org/pdf/2404.03323v1.pdf","comment":"23 pages, 1 algorithm, 36 figures"},{"id":"http://arxiv.org/abs/2310.00615v3","updated":"2024-04-04T09:18:50Z","published":"2023-10-01T08:32:46Z","title":"Scene-aware Human Motion Forecasting via Mutual Distance Prediction","summary":" In this paper, we tackle the problem of scene-aware 3D human motion\nforecasting. A key challenge of this task is to predict future human motions\nthat are consistent with the scene by modeling the human-scene interactions.\nWhile recent works have demonstrated that explicit constraints on human-scene\ninteractions can prevent the occurrence of ghost motion, they only provide\nconstraints on partial human motion, e.g., the global motion of the human or a\nfew joints contacting the scene, leaving the rest of the motion unconstrained.\nTo address this limitation, we propose to model the human-scene interaction\nwith the mutual distance between the human body and the scene. Such mutual\ndistances constrain both the local and global human motion, resulting in a\nwhole-body motion constrained prediction. In particular, mutual distance\nconstraints consist of two components: the signed distance of each vertex on\nthe human mesh to the scene surface and the distance of basis scene points to\nthe human mesh. We further introduce a global scene representation learned from\na signed distance function (SDF) volume to ensure coherence between the global\nscene representation and the explicit constraint from the mutual distance. We\ndevelop a pipeline with two sequential steps: predicting the future mutual\ndistances first, followed by forecasting future human motion. During training,\nwe explicitly encourage consistency between predicted poses and mutual\ndistances. Extensive evaluations on the existing synthetic and real datasets\ndemonstrate that our approach consistently outperforms the state-of-the-art\nmethods.\n","authors":["Chaoyue Xing","Wei Mao","Miaomiao Liu"],"pdf_url":"https://arxiv.org/pdf/2310.00615v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19773v2","updated":"2024-04-04T09:05:49Z","published":"2024-03-28T18:50:19Z","title":"ShapeFusion: A 3D diffusion model for localized shape editing","summary":" In the realm of 3D computer vision, parametric models have emerged as a\nground-breaking methodology for the creation of realistic and expressive 3D\navatars. Traditionally, they rely on Principal Component Analysis (PCA), given\nits ability to decompose data into an orthonormal space that maximally captures\nshape variations. 
However, due to the orthogonality constraints and the global\nnature of PCA's decomposition, these models struggle to perform localized and\ndisentangled editing of 3D shapes, which severely affects their use in\napplications requiring fine control such as face sculpting. In this paper, we\nleverage diffusion models to enable diverse and fully localized edits on 3D\nmeshes, while completely preserving the un-edited regions. We propose an\neffective diffusion masking training strategy that, by design, facilitates\nlocalized manipulation of any shape region, without being limited to predefined\nregions or to sparse sets of predefined control vertices. Following our\nframework, a user can explicitly set their manipulation region of choice and\ndefine an arbitrary set of vertices as handles to edit a 3D mesh. Compared to\nthe current state-of-the-art our method leads to more interpretable shape\nmanipulations than methods relying on latent code state, greater localization\nand generation diversity while offering faster inference than optimization\nbased approaches. Project page: https://rolpotamias.github.io/Shapefusion/\n","authors":["Rolandos Alexandros Potamias","Michail Tarasiou","Stylianos Ploumpis","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2403.19773v2.pdf","comment":"Project Page: https://rolpotamias.github.io/Shapefusion/"},{"id":"http://arxiv.org/abs/2404.02614v2","updated":"2024-04-04T08:57:00Z","published":"2024-04-03T10:01:23Z","title":"Vestibular schwannoma growth prediction from longitudinal MRI by time\n conditioned neural fields","summary":" Vestibular schwannomas (VS) are benign tumors that are generally managed by\nactive surveillance with MRI examination. To further assist clinical\ndecision-making and avoid overtreatment, an accurate prediction of tumor growth\nbased on longitudinal imaging is highly desirable. In this paper, we introduce\nDeepGrowth, a deep learning method that incorporates neural fields and\nrecurrent neural networks for prospective tumor growth prediction. In the\nproposed method, each tumor is represented as a signed distance function (SDF)\nconditioned on a low-dimensional latent code. Unlike previous studies that\nperform tumor shape prediction directly in the image space, we predict the\nlatent codes instead and then reconstruct future shapes from it. To deal with\nirregular time intervals, we introduce a time-conditioned recurrent module\nbased on a ConvLSTM and a novel temporal encoding strategy, which enables the\nproposed model to output varying tumor shapes over time. The experiments on an\nin-house longitudinal VS dataset showed that the proposed model significantly\nimproved the performance ($\\ge 1.6\\%$ Dice score and $\\ge0.20$ mm 95\\%\nHausdorff distance), in particular for top 20\\% tumors that grow or shrink the\nmost ($\\ge 4.6\\%$ Dice score and $\\ge 0.73$ mm 95\\% Hausdorff distance). Our\ncode is available at ~\\burl{https://github.com/cyjdswx/DeepGrowth}\n","authors":["Yunjie Chen","Jelmer M. Wolterink","Olaf M. Neve","Stephan R. Romeijn","Berit M. Verbist","Erik F. 
Hensen","Qian Tao","Marius Staring"],"pdf_url":"https://arxiv.org/pdf/2404.02614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02282v2","updated":"2024-04-04T08:38:17Z","published":"2024-04-02T20:15:43Z","title":"Smooth Deep Saliency","summary":" In this work, we investigate methods to reduce the noise in deep saliency\nmaps coming from convolutional downsampling, with the purpose of explaining how\na deep learning model detects tumors in scanned histological tissue samples.\nThose methods make the investigated models more interpretable for\ngradient-based saliency maps, computed in hidden layers. We test our approach\non different models trained for image classification on ImageNet1K, and models\ntrained for tumor detection on Camelyon16 and in-house real-world digital\npathology scans of stained tissue samples. Our results show that the\ncheckerboard noise in the gradient gets reduced, resulting in smoother and\ntherefore easier to interpret saliency maps.\n","authors":["Rudolf Herdt","Maximilian Schmidt","Daniel Otero Baguer","Peter Maaß"],"pdf_url":"https://arxiv.org/pdf/2404.02282v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03296v1","updated":"2024-04-04T08:37:27Z","published":"2024-04-04T08:37:27Z","title":"AdaBM: On-the-Fly Adaptive Bit Mapping for Image Super-Resolution","summary":" Although image super-resolution (SR) problem has experienced unprecedented\nrestoration accuracy with deep neural networks, it has yet limited versatile\napplications due to the substantial computational costs. Since different input\nimages for SR face different restoration difficulties, adapting computational\ncosts based on the input image, referred to as adaptive inference, has emerged\nas a promising solution to compress SR networks. Specifically, adapting the\nquantization bit-widths has successfully reduced the inference and memory cost\nwithout sacrificing the accuracy. However, despite the benefits of the\nresultant adaptive network, existing works rely on time-intensive\nquantization-aware training with full access to the original training pairs to\nlearn the appropriate bit allocation policies, which limits its ubiquitous\nusage. To this end, we introduce the first on-the-fly adaptive quantization\nframework that accelerates the processing time from hours to seconds. We\nformulate the bit allocation problem with only two bit mapping modules: one to\nmap the input image to the image-wise bit adaptation factor and one to obtain\nthe layer-wise adaptation factors. These bit mappings are calibrated and\nfine-tuned using only a small number of calibration images. We achieve\ncompetitive performance with the previous adaptive quantization methods, while\nthe processing time is accelerated by x2000. Codes are available at\nhttps://github.com/Cheeun/AdaBM.\n","authors":["Cheeun Hong","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2404.03296v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2211.07459v2","updated":"2024-04-04T08:24:54Z","published":"2022-11-14T15:37:27Z","title":"Self-Aligning Depth-regularized Radiance Fields for Asynchronous RGB-D\n Sequences","summary":" It has been shown that learning radiance fields with depth rendering and\ndepth supervision can effectively promote the quality and convergence of view\nsynthesis. However, this paradigm requires input RGB-D sequences to be\nsynchronized, hindering its usage in the UAV city modeling scenario. 
As there\nexists asynchrony between RGB images and depth images due to high-speed flight,\nwe propose a novel time-pose function, which is an implicit network that maps\ntimestamps to $\\rm SE(3)$ elements. To simplify the training process, we also\ndesign a joint optimization scheme to jointly learn the large-scale\ndepth-regularized radiance fields and the time-pose function. Our algorithm\nconsists of three steps: (1) time-pose function fitting, (2) radiance field\nbootstrapping, (3) joint pose error compensation and radiance field refinement.\nIn addition, we propose a large synthetic dataset with diverse controlled\nmismatches and ground truth to evaluate this new problem setting\nsystematically. Through extensive experiments, we demonstrate that our method\noutperforms baselines without regularization. We also show qualitatively\nimproved results on a real-world asynchronous RGB-D sequence captured by drone.\nCodes, data, and models will be made publicly available.\n","authors":["Yuxin Huang","Andong Yang","Zirui Wu","Yuantao Chen","Runyi Yang","Zhenxin Zhu","Chao Hou","Hao Zhao","Guyue Zhou"],"pdf_url":"https://arxiv.org/pdf/2211.07459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01692v2","updated":"2024-04-04T08:07:22Z","published":"2024-04-02T06:52:31Z","title":"Beyond Image Super-Resolution for Image Recognition with Task-Driven\n Perceptual Loss","summary":" In real-world scenarios, image recognition tasks, such as semantic\nsegmentation and object detection, often pose greater challenges due to the\nlack of information available within low-resolution (LR) content. Image\nsuper-resolution (SR) is one of the promising solutions for addressing the\nchallenges. However, due to the ill-posed property of SR, it is challenging for\ntypical SR methods to restore task-relevant high-frequency contents, which may\ndilute the advantage of utilizing the SR method. Therefore, in this paper, we\npropose Super-Resolution for Image Recognition (SR4IR) that effectively guides\nthe generation of SR images beneficial to achieving satisfactory image\nrecognition performance when processing LR images. The critical component of\nour SR4IR is the task-driven perceptual (TDP) loss that enables the SR network\nto acquire task-specific knowledge from a network tailored for a specific task.\nMoreover, we propose a cross-quality patch mix and an alternate training\nframework that significantly enhances the efficacy of the TDP loss by\naddressing potential problems when employing the TDP loss. Through extensive\nexperiments, we demonstrate that our SR4IR achieves outstanding task\nperformance by generating SR images useful for a specific image recognition\ntask, including semantic segmentation, object detection, and image\nclassification. The implementation code is available at\nhttps://github.com/JaehaKim97/SR4IR.\n","authors":["Jaeha Kim","Junghun Oh","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2404.01692v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17369v2","updated":"2024-04-04T08:05:06Z","published":"2024-03-26T04:09:08Z","title":"CoDA: Instructive Chain-of-Domain Adaptation with Severity-Aware Visual\n Prompt Tuning","summary":" Unsupervised Domain Adaptation (UDA) aims to adapt models from labeled source\ndomains to unlabeled target domains. When adapting to adverse scenes, existing\nUDA methods fail to perform well due to the lack of instructions, leading their\nmodels to overlook discrepancies within all adverse scenes. 
To tackle this, we\npropose CoDA which instructs models to distinguish, focus, and learn from these\ndiscrepancies at scene and image levels. Specifically, CoDA consists of a\nChain-of-Domain (CoD) strategy and a Severity-Aware Visual Prompt Tuning\n(SAVPT) mechanism. CoD focuses on scene-level instructions to divide all\nadverse scenes into easy and hard scenes, guiding models to adapt from source\nto easy domains with easy scene images, and then to hard domains with hard\nscene images, thereby laying a solid foundation for whole adaptations. Building\nupon this foundation, we employ SAVPT to dive into more detailed image-level\ninstructions to boost performance. SAVPT features a novel metric Severity that\ndivides all adverse scene images into low-severity and high-severity images.\nThen Severity directs visual prompts and adapters, instructing models to\nconcentrate on unified severity features instead of scene-specific features,\nwithout adding complexity to the model architecture. CoDA achieves SOTA\nperformances on widely-used benchmarks under all adverse scenes. Notably, CoDA\noutperforms the existing ones by 4.6%, and 10.3% mIoU on the Foggy Driving, and\nFoggy Zurich benchmarks, respectively. Our code is available at\nhttps://github.com/Cuzyoung/CoDA\n","authors":["Ziyang Gong","Fuhao Li","Yupeng Deng","Deblina Bhattacharjee","Xiangwei Zhu","Zhenming Ji"],"pdf_url":"https://arxiv.org/pdf/2403.17369v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03277v1","updated":"2024-04-04T08:04:00Z","published":"2024-04-04T08:04:00Z","title":"Design and Development of a Framework For Stroke-Based Handwritten\n Gujarati Font Generation","summary":" Handwritten font generation is important for preserving cultural heritage and\ncreating personalized designs. It adds an authentic and expressive touch to\nprinted materials, making them visually appealing and establishing a stronger\nconnection with the audience. This paper aims to design a framework for\ngenerating handwritten fonts in the Gujarati script, mimicking the variation of\nhuman handwriting. The proposed font generation model consists of a learning\nphase and a generation phase. In the learning phase, Gujarati scripts are\nanalyzed, and rules for designing each character are formulated. This ruleset\ninvolves the concatenation of strokes in a stroke-based manner, ensuring visual\nconsistency in the resulting glyphs. The generation phase involves the user\nproviding a small subset of characters, and the system automatically generates\nthe remaining character glyphs based on extracted strokes and learned rules,\nresulting in handwritten Gujarati fonts. The resulting character glyphs are\nconverted into an open-type font using the FontForge tool, making them\ncompatible with any Gujarati editor. Both subjective and objective evaluations\nare conducted to assess the synthesized images and fonts. Subjective evaluation\nthrough user studies provides feedback on quality and visual appeal, achieving\nan overall accuracy of 84.84%. Notably, eleven characters demonstrated a\nsuccess ratio above 90%. Objective evaluation using an existing recognition\nsystem achieves an overall accuracy of 84.28% in OCR evaluation. Notably,\nfifteen characters had a success ratio of 80% or higher.\n","authors":["Preeti P. Bhatt","Jitendra V. Nasriwala","Rakesh R. 
Savant"],"pdf_url":"https://arxiv.org/pdf/2404.03277v1.pdf","comment":"13 pages, 2 column, 12 figures"},{"id":"http://arxiv.org/abs/2404.01758v2","updated":"2024-04-04T08:03:04Z","published":"2024-04-02T09:18:52Z","title":"GEARS: Local Geometry-aware Hand-object Interaction Synthesis","summary":" Generating realistic hand motion sequences in interaction with objects has\ngained increasing attention with the growing interest in digital humans. Prior\nwork has illustrated the effectiveness of employing occupancy-based or\ndistance-based virtual sensors to extract hand-object interaction features.\nNonetheless, these methods show limited generalizability across object\ncategories, shapes and sizes. We hypothesize that this is due to two reasons:\n1) the limited expressiveness of employed virtual sensors, and 2) scarcity of\navailable training data. To tackle this challenge, we introduce a novel\njoint-centered sensor designed to reason about local object geometry near\npotential interaction regions. The sensor queries for object surface points in\nthe neighbourhood of each hand joint. As an important step towards mitigating\nthe learning complexity, we transform the points from global frame to hand\ntemplate frame and use a shared module to process sensor features of each\nindividual joint. This is followed by a spatio-temporal transformer network\naimed at capturing correlation among the joints in different dimensions.\nMoreover, we devise simple heuristic rules to augment the limited training\nsequences with vast static hand grasping samples. This leads to a broader\nspectrum of grasping types observed during training, in turn enhancing our\nmodel's generalization capability. We evaluate on two public datasets, GRAB and\nInterCap, where our method shows superiority over baselines both quantitatively\nand perceptually.\n","authors":["Keyang Zhou","Bharat Lal Bhatnagar","Jan Eric Lenssen","Gerard Pons-moll"],"pdf_url":"https://arxiv.org/pdf/2404.01758v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02935v3","updated":"2024-04-04T07:56:59Z","published":"2023-08-05T18:32:49Z","title":"Bias Behind the Wheel: Fairness Analysis of Autonomous Driving Systems","summary":" This paper analyzes fairness in automated pedestrian detection, a crucial but\nunder-explored issue in autonomous driving systems. We evaluate eight\nstate-of-the-art deep learning-based pedestrian detectors across demographic\ngroups on large-scale real-world datasets. To enable thorough fairness testing,\nwe provide extensive annotations for the datasets, resulting in 8,311 images\nwith 16,070 gender labels, 20,115 age labels, and 3,513 skin tone labels. Our\nfindings reveal significant fairness issues, particularly related to age. The\nundetected proportions for children are 20.14% higher compared to adults.\nFurthermore, we explore how various driving scenarios affect the fairness of\npedestrian detectors. We find that pedestrian detectors demonstrate significant\ngender biases during night time, potentially exacerbating the prevalent\nsocietal issue of female safety concerns during nighttime out. Moreover, we\nobserve that pedestrian detectors can demonstrate both enhanced fairness and\nsuperior performance under specific driving conditions, which challenges the\nfairness-performance trade-off theory widely acknowledged in the fairness\nliterature. We publicly release the code, data, and results to support future\nresearch on fairness in autonomous driving.\n","authors":["Xinyue Li","Zhenpeng Chen","Jie M. 
Zhang","Federica Sarro","Ying Zhang","Xuanzhe Liu"],"pdf_url":"https://arxiv.org/pdf/2308.02935v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03256v1","updated":"2024-04-04T07:26:26Z","published":"2024-04-04T07:26:26Z","title":"Multi Positive Contrastive Learning with Pose-Consistent Generated\n Images","summary":" Model pre-training has become essential in various recognition tasks.\nMeanwhile, with the remarkable advancements in image generation models,\npre-training methods utilizing generated images have also emerged given their\nability to produce unlimited training data. However, while existing methods\nutilizing generated images excel in classification, they fall short in more\npractical tasks, such as human pose estimation. In this paper, we have\nexperimentally demonstrated it and propose the generation of visually distinct\nimages with identical human poses. We then propose a novel multi-positive\ncontrastive learning, which optimally utilize the previously generated images\nto learn structural features of the human body. We term the entire learning\npipeline as GenPoCCL. Despite using only less than 1% amount of data compared\nto current state-of-the-art method, GenPoCCL captures structural features of\nthe human body more effectively, surpassing existing methods in a variety of\nhuman-centric perception tasks.\n","authors":["Sho Inayoshi","Aji Resindra Widya","Satoshi Ozaki","Junji Otsuka","Takeshi Ohashi"],"pdf_url":"https://arxiv.org/pdf/2404.03256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03253v1","updated":"2024-04-04T07:19:31Z","published":"2024-04-04T07:19:31Z","title":"A dataset of primary nasopharyngeal carcinoma MRI with multi-modalities\n segmentation","summary":" Multi-modality magnetic resonance imaging data with various sequences\nfacilitate the early diagnosis, tumor segmentation, and disease staging in the\nmanagement of nasopharyngeal carcinoma (NPC). The lack of publicly available,\ncomprehensive datasets limits advancements in diagnosis, treatment planning,\nand the development of machine learning algorithms for NPC. Addressing this\ncritical need, we introduce the first comprehensive NPC MRI dataset,\nencompassing MR axial imaging of 277 primary NPC patients. This dataset\nincludes T1-weighted, T2-weighted, and contrast-enhanced T1-weighted sequences,\ntotaling 831 scans. In addition to the corresponding clinical data, manually\nannotated and labeled segmentations by experienced radiologists offer\nhigh-quality data resources from untreated primary NPC.\n","authors":["Yin Li","Qi Chen","Kai Wang","Meige Li","Liping Si","Yingwei Guo","Yu Xiong","Qixing Wang","Yang Qin","Ling Xu","Patrick van der Smagt","Jun Tang","Nutan Chen"],"pdf_url":"https://arxiv.org/pdf/2404.03253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03251v1","updated":"2024-04-04T07:14:12Z","published":"2024-04-04T07:14:12Z","title":"Real-time Noise Source Estimation of a Camera System from an Image and\n Metadata","summary":" Autonomous machines must self-maintain proper functionality to ensure the\nsafety of humans and themselves. This pertains particularly to its cameras as\npredominant sensors to perceive the environment and support actions. A\nfundamental camera problem addressed in this study is noise. Solutions often\nfocus on denoising images a posteriori, that is, fighting symptoms rather than\nroot causes. However, tackling root causes requires identifying the noise\nsources, considering the limitations of mobile platforms. 
This work\ninvestigates a real-time, memory-efficient and reliable noise source estimator\nthat combines data- and physically-based models. To this end, a DNN that\nexamines an image with camera metadata for major camera noise sources is built\nand trained. In addition, it quantifies unexpected factors that impact image\nnoise or metadata. This study investigates seven different estimators on six\ndatasets that include synthetic noise, real-world noise from two camera\nsystems, and real field campaigns. For these, only the model with most metadata\nis capable to accurately and robustly quantify all individual noise\ncontributions. This method outperforms total image noise estimators and can be\nplug-and-play deployed. It also serves as a basis to include more advanced\nnoise sources, or as part of an automatic countermeasure feedback-loop to\napproach fully reliable machines.\n","authors":["Maik Wischow","Patrick Irmisch","Anko Boerner","Guillermo Gallego"],"pdf_url":"https://arxiv.org/pdf/2404.03251v1.pdf","comment":"16 pages, 16 figures, 12 tables, Project page:\n https://github.com/MaikWischow/Noise-Source-Estimation"},{"id":"http://arxiv.org/abs/2404.03248v1","updated":"2024-04-04T07:07:34Z","published":"2024-04-04T07:07:34Z","title":"Learning Transferable Negative Prompts for Out-of-Distribution Detection","summary":" Existing prompt learning methods have shown certain capabilities in\nOut-of-Distribution (OOD) detection, but the lack of OOD images in the target\ndataset in their training can lead to mismatches between OOD images and\nIn-Distribution (ID) categories, resulting in a high false positive rate. To\naddress this issue, we introduce a novel OOD detection method, named\n'NegPrompt', to learn a set of negative prompts, each representing a negative\nconnotation of a given class label, for delineating the boundaries between ID\nand OOD images. It learns such negative prompts with ID data only, without any\nreliance on external outlier data. Further, current methods assume the\navailability of samples of all ID classes, rendering them ineffective in\nopen-vocabulary learning scenarios where the inference stage can contain novel\nID classes not present during training. In contrast, our learned negative\nprompts are transferable to novel class labels. Experiments on various ImageNet\nbenchmarks show that NegPrompt surpasses state-of-the-art prompt-learning-based\nOOD detection methods and maintains a consistent lead in hard OOD detection in\nclosed- and open-vocabulary classification scenarios. Code is available at\nhttps://github.com/mala-lab/negprompt.\n","authors":["Tianqi Li","Guansong Pang","Xiao Bai","Wenjun Miao","Jin Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.03248v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2306.10482v2","updated":"2024-04-04T07:00:56Z","published":"2023-06-18T05:37:38Z","title":"Weighted structure tensor total variation for image denoising","summary":" For image denoising problems, the structure tensor total variation\n(STV)-based models show good performances when compared with other competing\nregularization approaches. However, the STV regularizer does not couple the\nlocal information of the image and may not maintain the image details.\nTherefore, we employ the anisotropic weighted matrix introduced in the\nanisotropic total variation (ATV) model to improve the STV model. 
By applying\nthe weighted matrix to the discrete gradient of the patch-based Jacobian\noperator in STV, our proposed weighted STV (WSTV) model can effectively capture\nlocal information from images and maintain their details during the denoising\nprocess. The optimization problem in the model is solved by a fast first-order\ngradient projection algorithm with a complexity result of $O(1 / i^2)$. For\nimages with different Gaussian noise levels, the experimental results\ndemonstrate that the WSTV model can effectively improve the quality of restored\nimages compared to other TV and STV-based models.\n","authors":["Xiuhan Sheng","Lijuan Yang","Jingya Chang"],"pdf_url":"https://arxiv.org/pdf/2306.10482v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03242v1","updated":"2024-04-04T06:58:39Z","published":"2024-04-04T06:58:39Z","title":"Would Deep Generative Models Amplify Bias in Future Models?","summary":" We investigate the impact of deep generative models on potential social\nbiases in upcoming computer vision models. As the internet witnesses an\nincreasing influx of AI-generated images, concerns arise regarding inherent\nbiases that may accompany them, potentially leading to the dissemination of\nharmful content. This paper explores whether a detrimental feedback loop,\nresulting in bias amplification, would occur if generated images were used as\nthe training data for future models. We conduct simulations by progressively\nsubstituting original images in COCO and CC3M datasets with images generated\nthrough Stable Diffusion. The modified datasets are used to train OpenCLIP and\nimage captioning models, which we evaluate in terms of quality and bias.\nContrary to expectations, our findings indicate that introducing generated\nimages during training does not uniformly amplify bias. Instead, instances of\nbias mitigation across specific tasks are observed. We further explore the\nfactors that may influence these phenomena, such as artifacts in image\ngeneration (e.g., blurry faces) or pre-existing biases in the original\ndatasets.\n","authors":["Tianwei Chen","Yusuke Hirota","Mayu Otani","Noa Garcia","Yuta Nakashima"],"pdf_url":"https://arxiv.org/pdf/2404.03242v1.pdf","comment":"This paper has been accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2401.12433v2","updated":"2024-04-04T06:57:39Z","published":"2024-01-23T01:52:49Z","title":"A Novel Garment Transfer Method Supervised by Distilled Knowledge of\n Virtual Try-on Model","summary":" This paper proposes a novel garment transfer method supervised with knowledge\ndistillation from virtual try-on. Our method first reasons the transfer parsing\nto provide shape prior to downstream tasks. We employ a multi-phase teaching\nstrategy to supervise the training of the transfer parsing reasoning model,\nlearning the response and feature knowledge from the try-on parsing reasoning\nmodel. To correct the teaching error, it transfers the garment back to its\nowner to absorb the hard knowledge in the self-study phase. Guided by the\ntransfer parsing, we adjust the position of the transferred garment via STN to\nprevent distortion. Afterward, we estimate a progressive flow to precisely warp\nthe garment with shape and content correspondences. To ensure warping\nrationality, we supervise the training of the garment warping model using\ntarget shape and warping knowledge from virtual try-on. 
To better preserve body\nfeatures in the transfer result, we propose a well-designed training strategy\nfor the arm regrowth task to infer new exposure skin. Experiments demonstrate\nthat our method has state-of-the-art performance compared with other virtual\ntry-on and garment transfer methods in garment transfer, especially for\npreserving garment texture and body features.\n","authors":["Naiyu Fang","Lemiao Qiu","Shuyou Zhang","Zili Wang","Kerui Hu","Jianrong Tan"],"pdf_url":"https://arxiv.org/pdf/2401.12433v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06395v2","updated":"2024-04-04T06:46:42Z","published":"2024-01-12T06:28:54Z","title":"ModaVerse: Efficiently Transforming Modalities with LLMs","summary":" Humans possess the capability to comprehend diverse modalities and seamlessly\ntransfer information between them. In this work, we introduce ModaVerse, a\nMulti-modal Large Language Model (MLLM) capable of comprehending and\ntransforming content across various modalities including images, videos, and\naudio. Predominant MLLM frameworks have largely relied on the alignment of\nlatent spaces of textual and non-textual features. This alignment process,\nwhich synchronizes a language model trained on textual data with encoders and\ndecoders trained on multi-modal data, often necessitates extensive training of\nseveral projection layers in multiple stages. Inspired by LLM-as-agent\nmethodologies, we propose a novel Input/Output (I/O) alignment mechanism that\noperates directly at the level of natural language. It aligns the LLM's output\nwith the input of generative models, avoiding the complexities associated with\nlatent feature alignments, and simplifying the multiple training stages of\nexisting MLLMs into a single, efficient process. This conceptual advancement\nleads to significant reductions in both data and computational costs. By\nconducting experiments on several benchmarks, we demonstrate that our approach\nattains comparable performance with the state of the art while achieving\nconsiderable efficiencies in data usage and training duration.\n","authors":["Xinyu Wang","Bohan Zhuang","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2401.06395v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.03225v1","updated":"2024-04-04T06:20:22Z","published":"2024-04-04T06:20:22Z","title":"FACTUAL: A Novel Framework for Contrastive Learning Based Robust SAR\n Image Classification","summary":" Deep Learning (DL) Models for Synthetic Aperture Radar (SAR) Automatic Target\nRecognition (ATR), while delivering improved performance, have been shown to be\nquite vulnerable to adversarial attacks. Existing works improve robustness by\ntraining models on adversarial samples. However, by focusing mostly on attacks\nthat manipulate images randomly, they neglect the real-world feasibility of\nsuch attacks. In this paper, we propose FACTUAL, a novel Contrastive Learning\nframework for Adversarial Training and robust SAR classification. FACTUAL\nconsists of two components: (1) Differing from existing works, a novel\nperturbation scheme that incorporates realistic physical adversarial attacks\n(such as OTSA) to build a supervised adversarial pre-training network. This\nnetwork utilizes class labels for clustering clean and perturbed images\ntogether into a more informative feature space. (2) A linear classifier\ncascaded after the encoder to use the computed representations to predict the\ntarget labels. 
By pre-training and fine-tuning our model on both clean and\nadversarial samples, we show that our model achieves high prediction accuracy\non both cases. Our model achieves 99.7% accuracy on clean samples, and 89.6% on\nperturbed samples, both outperforming previous state-of-the-art methods.\n","authors":["Xu Wang","Tian Ye","Rajgopal Kannan","Viktor Prasanna"],"pdf_url":"https://arxiv.org/pdf/2404.03225v1.pdf","comment":"2024 IEEE Radar Conference"},{"id":"http://arxiv.org/abs/2404.03219v1","updated":"2024-04-04T05:54:19Z","published":"2024-04-04T05:54:19Z","title":"iSeg: Interactive 3D Segmentation via Interactive Attention","summary":" We present iSeg, a new interactive technique for segmenting 3D shapes.\nPrevious works have focused mainly on leveraging pre-trained 2D foundation\nmodels for 3D segmentation based on text. However, text may be insufficient for\naccurately describing fine-grained spatial segmentations. Moreover, achieving a\nconsistent 3D segmentation using a 2D model is challenging since occluded areas\nof the same semantic region may not be visible together from any 2D view. Thus,\nwe design a segmentation method conditioned on fine user clicks, which operates\nentirely in 3D. Our system accepts user clicks directly on the shape's surface,\nindicating the inclusion or exclusion of regions from the desired shape\npartition. To accommodate various click settings, we propose a novel\ninteractive attention module capable of processing different numbers and types\nof clicks, enabling the training of a single unified interactive segmentation\nmodel. We apply iSeg to a myriad of shapes from different domains,\ndemonstrating its versatility and faithfulness to the user's specifications.\nOur project page is at https://threedle.github.io/iSeg/.\n","authors":["Itai Lang","Fei Xu","Dale Decatur","Sudarshan Babu","Rana Hanocka"],"pdf_url":"https://arxiv.org/pdf/2404.03219v1.pdf","comment":"Project page: https://threedle.github.io/iSeg/"},{"id":"http://arxiv.org/abs/2404.03214v1","updated":"2024-04-04T05:39:09Z","published":"2024-04-04T05:39:09Z","title":"LeGrad: An Explainability Method for Vision Transformers via Feature\n Formation Sensitivity","summary":" Vision Transformers (ViTs), with their ability to model long-range\ndependencies through self-attention mechanisms, have become a standard\narchitecture in computer vision. However, the interpretability of these models\nremains a challenge. To address this, we propose LeGrad, an explainability\nmethod specifically designed for ViTs. LeGrad computes the gradient with\nrespect to the attention maps of ViT layers, considering the gradient itself as\nthe explainability signal. We aggregate the signal over all layers, combining\nthe activations of the last as well as intermediate tokens to produce the\nmerged explainability map. This makes LeGrad a conceptually simple and an\neasy-to-implement tool for enhancing the transparency of ViTs. We evaluate\nLeGrad in challenging segmentation, perturbation, and open-vocabulary settings,\nshowcasing its versatility compared to other SotA explainability methods\ndemonstrating its superior spatial fidelity and robustness to perturbations. 
A\ndemo and the code is available at https://github.com/WalBouss/LeGrad.\n","authors":["Walid Bousselham","Angie Boggust","Sofian Chaybouti","Hendrik Strobelt","Hilde Kuehne"],"pdf_url":"https://arxiv.org/pdf/2404.03214v1.pdf","comment":"Code available at https://github.com/WalBouss/LeGrad"},{"id":"http://arxiv.org/abs/2404.03210v1","updated":"2024-04-04T05:33:06Z","published":"2024-04-04T05:33:06Z","title":"HDR Imaging for Dynamic Scenes with Events","summary":" High dynamic range imaging (HDRI) for real-world dynamic scenes is\nchallenging because moving objects may lead to hybrid degradation of low\ndynamic range and motion blur. Existing event-based approaches only focus on a\nseparate task, while cascading HDRI and motion deblurring would lead to\nsub-optimal solutions, and unavailable ground-truth sharp HDR images aggravate\nthe predicament. To address these challenges, we propose an Event-based HDRI\nframework within a Self-supervised learning paradigm, i.e., Self-EHDRI, which\ngeneralizes HDRI performance in real-world dynamic scenarios. Specifically, a\nself-supervised learning strategy is carried out by learning cross-domain\nconversions from blurry LDR images to sharp LDR images, which enables sharp HDR\nimages to be accessible in the intermediate process even though ground-truth\nsharp HDR images are missing. Then, we formulate the event-based HDRI and\nmotion deblurring model and conduct a unified network to recover the\nintermediate sharp HDR results, where both the high dynamic range and high\ntemporal resolution of events are leveraged simultaneously for compensation. We\nconstruct large-scale synthetic and real-world datasets to evaluate the\neffectiveness of our method. Comprehensive experiments demonstrate that the\nproposed Self-EHDRI outperforms state-of-the-art approaches by a large margin.\nThe codes, datasets, and results are available at\nhttps://lxp-whu.github.io/Self-EHDRI.\n","authors":["Li Xiaopeng","Zeng Zhaoyuan","Fan Cien","Zhao Chen","Deng Lei","Yu Lei"],"pdf_url":"https://arxiv.org/pdf/2404.03210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03202v1","updated":"2024-04-04T05:10:26Z","published":"2024-04-04T05:10:26Z","title":"OmniGS: Omnidirectional Gaussian Splatting for Fast Radiance Field\n Reconstruction using Omnidirectional Images","summary":" Photorealistic reconstruction relying on 3D Gaussian Splatting has shown\npromising potential in robotics. However, the current 3D Gaussian Splatting\nsystem only supports radiance field reconstruction using undistorted\nperspective images. In this paper, we present OmniGS, a novel omnidirectional\nGaussian splatting system, to take advantage of omnidirectional images for fast\nradiance field reconstruction. Specifically, we conduct a theoretical analysis\nof spherical camera model derivatives in 3D Gaussian Splatting. According to\nthe derivatives, we then implement a new GPU-accelerated omnidirectional\nrasterizer that directly splats 3D Gaussians onto the equirectangular screen\nspace for omnidirectional image rendering. As a result, we realize\ndifferentiable optimization of the radiance field without the requirement of\ncube-map rectification or tangent-plane approximation. Extensive experiments\nconducted in egocentric and roaming scenarios demonstrate that our method\nachieves state-of-the-art reconstruction quality and high rendering speed using\nomnidirectional images. 
To benefit the research community, the code will be\nmade publicly available once the paper is published.\n","authors":["Longwei Li","Huajian Huang","Sai-Kit Yeung","Hui Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.03202v1.pdf","comment":"IROS 2024 submission, 7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.03200v1","updated":"2024-04-04T05:08:51Z","published":"2024-04-04T05:08:51Z","title":"Future-Proofing Class Incremental Learning","summary":" Exemplar-Free Class Incremental Learning is a highly challenging setting\nwhere replay memory is unavailable. Methods relying on frozen feature\nextractors have drawn attention recently in this setting due to their\nimpressive performances and lower computational costs. However, those methods\nare highly dependent on the data used to train the feature extractor and may\nstruggle when an insufficient amount of classes are available during the first\nincremental step. To overcome this limitation, we propose to use a pre-trained\ntext-to-image diffusion model in order to generate synthetic images of future\nclasses and use them to train the feature extractor. Experiments on the\nstandard benchmarks CIFAR100 and ImageNet-Subset demonstrate that our proposed\nmethod can be used to improve state-of-the-art methods for exemplar-free class\nincremental learning, especially in the most difficult settings where the first\nincremental step only contains few classes. Moreover, we show that using\nsynthetic samples of future classes achieves higher performance than using real\ndata from different classes, paving the way for better and less costly\npre-training methods for incremental learning.\n","authors":["Quentin Jodelet","Xin Liu","Yin Jun Phua","Tsuyoshi Murata"],"pdf_url":"https://arxiv.org/pdf/2404.03200v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.09934v7","updated":"2024-04-04T04:52:43Z","published":"2022-07-20T14:20:35Z","title":"DeepIPC: Deeply Integrated Perception and Control for an Autonomous\n Vehicle in Real Environments","summary":" In this work, we introduce DeepIPC, a novel end-to-end model tailored for\nautonomous driving, which seamlessly integrates perception and control tasks.\nUnlike traditional models that handle these tasks separately, DeepIPC\ninnovatively combines a perception module, which processes RGBD images for\nsemantic segmentation and generates bird's eye view (BEV) mappings, with a\ncontroller module that utilizes these insights along with GNSS and angular\nspeed measurements to accurately predict navigational waypoints. This\nintegration allows DeepIPC to efficiently translate complex environmental data\ninto actionable driving commands. Our comprehensive evaluation demonstrates\nDeepIPC's superior performance in terms of drivability and multi-task\nefficiency across diverse real-world scenarios, setting a new benchmark for\nend-to-end autonomous driving systems with a leaner model architecture. The\nexperimental results underscore DeepIPC's potential to significantly enhance\nautonomous vehicular navigation, promising a step forward in the development of\nautonomous driving technologies. 
For further insights and replication, we will\nmake our code and datasets available at https://github.com/oskarnatan/DeepIPC.\n","authors":["Oskar Natan","Jun Miura"],"pdf_url":"https://arxiv.org/pdf/2207.09934v7.pdf","comment":"Accepted for Publication in IEEE Access"},{"id":"http://arxiv.org/abs/2404.02388v2","updated":"2024-04-04T04:23:10Z","published":"2024-04-03T01:13:05Z","title":"CAPE: CAM as a Probabilistic Ensemble for Enhanced DNN Interpretation","summary":" Deep Neural Networks (DNNs) are widely used for visual classification tasks,\nbut their complex computation process and black-box nature hinder decision\ntransparency and interpretability. Class activation maps (CAMs) and recent\nvariants provide ways to visually explain the DNN decision-making process by\ndisplaying 'attention' heatmaps of the DNNs. Nevertheless, the CAM explanation\nonly offers relative attention information, that is, on an attention heatmap,\nwe can interpret which image region is more or less important than the others.\nHowever, these regions cannot be meaningfully compared across classes, and the\ncontribution of each region to the model's class prediction is not revealed. To\naddress these challenges that ultimately lead to better DNN Interpretation, in\nthis paper, we propose CAPE, a novel reformulation of CAM that provides a\nunified and probabilistically meaningful assessment of the contributions of\nimage regions. We quantitatively and qualitatively compare CAPE with\nstate-of-the-art CAM methods on CUB and ImageNet benchmark datasets to\ndemonstrate enhanced interpretability. We also test on a cytology imaging\ndataset depicting a challenging Chronic Myelomonocytic Leukemia (CMML)\ndiagnosis problem. Code is available at: https://github.com/AIML-MED/CAPE.\n","authors":["Townim Faisal Chowdhury","Kewen Liao","Vu Minh Hieu Phan","Minh-Son To","Yutong Xie","Kevin Hung","David Ross","Anton van den Hengel","Johan W. Verjans","Zhibin Liao"],"pdf_url":"https://arxiv.org/pdf/2404.02388v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03191v1","updated":"2024-04-04T04:22:50Z","published":"2024-04-04T04:22:50Z","title":"CORP: A Multi-Modal Dataset for Campus-Oriented Roadside Perception\n Tasks","summary":" Numerous roadside perception datasets have been introduced to propel\nadvancements in autonomous driving and intelligent transportation systems\nresearch and development. However, it has been observed that the majority of\ntheir focus is on urban arterial roads, inadvertently overlooking\nresidential areas such as parks and campuses that exhibit entirely distinct\ncharacteristics. In light of this gap, we propose CORP, which stands as the\nfirst public benchmark dataset tailored for multi-modal roadside perception\ntasks under campus scenarios. Collected on a university campus, CORP consists\nof over 205k images plus 102k point clouds captured from 18 cameras and 9 LiDAR\nsensors. These sensors with different configurations are mounted on roadside\nutility poles to provide diverse viewpoints within the campus region. The\nannotations of CORP encompass multi-dimensional information beyond 2D and 3D\nbounding boxes, providing extra support for 3D seamless tracking and instance\nsegmentation with unique IDs and pixel masks for identifying targets, to\nenhance the understanding of objects and their behaviors distributed across the\ncampus premises. 
Unlike other roadside datasets about urban traffic, CORP\nextends the spectrum to highlight the challenges for multi-modal perception in\ncampuses and other residential areas.\n","authors":["Beibei Wang","Lu Zhang","Shuang Meng","Chenjie Wang","Jingjing Huang","Yao Li","Haojie Ren","Yuxuan Xiao","Yuru Peng","Jianmin Ji","Yu Zhang","Yanyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.03191v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03190v1","updated":"2024-04-04T04:22:25Z","published":"2024-04-04T04:22:25Z","title":"Adaptive Discrete Disparity Volume for Self-supervised Monocular Depth\n Estimation","summary":" In self-supervised monocular depth estimation tasks, discrete disparity\nprediction has been proven to attain higher quality depth maps than common\ncontinuous methods. However, current discretization strategies often divide\ndepth ranges of scenes into bins in a handcrafted and rigid manner, limiting\nmodel performance. In this paper, we propose a learnable module, Adaptive\nDiscrete Disparity Volume (ADDV), which is capable of dynamically sensing depth\ndistributions in different RGB images and generating adaptive bins for them.\nWithout any extra supervision, this module can be integrated into existing CNN\narchitectures, allowing networks to produce representative values for bins and\na probability volume over them. Furthermore, we introduce novel training\nstrategies - uniformizing and sharpening - through a loss term and temperature\nparameter, respectively, to provide regularizations under self-supervised\nconditions, preventing model degradation or collapse. Empirical results\ndemonstrate that ADDV effectively processes global information, generating\nappropriate bins for various scenes and producing higher quality depth maps\ncompared to handcrafted methods.\n","authors":["Jianwei Ren"],"pdf_url":"https://arxiv.org/pdf/2404.03190v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03188v1","updated":"2024-04-04T04:16:31Z","published":"2024-04-04T04:16:31Z","title":"Classification of Nasopharyngeal Cases using DenseNet Deep Learning\n Architecture","summary":" Nasopharyngeal carcinoma (NPC) is one of the understudied yet deadliest\ncancers in South East Asia. In Malaysia, the prevalence is identified mainly in\nSarawak, among the ethnic of Bidayuh. NPC is often late-diagnosed because it is\nasymptomatic at the early stage. There are several tissue representations from\nthe nasopharynx biopsy, such as nasopharyngeal inflammation (NPI), lymphoid\nhyperplasia (LHP), nasopharyngeal carcinoma (NPC) and normal tissue. This paper\nis our first initiative to identify the difference between NPC, NPI and normal\ncases. Seven whole slide images (WSIs) with gigapixel resolutions from seven\ndifferent patients and two hospitals were experimented with using two test\nsetups, consisting of a different set of images. The tissue regions are patched\ninto smaller blocks and classified using DenseNet architecture with 21 dense\nlayers. Two tests are carried out, each for proof of concept (Test 1) and\nreal-test scenario (Test 2). The accuracy achieved for NPC class is 94.8% for\nTest 1 and 67.0% for Test 2.\n","authors":["W. S. H. M. W. Ahmad","M. F. A. Fauzi","M. K. Abdullahi","Jenny T. H. Lee","N. S. A. Basry","A Yahaya","A. M. Ismail","A. Adam","Elaine W. L. Chan","F. S. 
Abas"],"pdf_url":"https://arxiv.org/pdf/2404.03188v1.pdf","comment":"This article has been accepted in the Journal of Engineering Science\n and Technology (JESTEC) and awaiting publication"},{"id":"http://arxiv.org/abs/2404.03187v1","updated":"2024-04-04T04:12:30Z","published":"2024-04-04T04:12:30Z","title":"AGL-NET: Aerial-Ground Cross-Modal Global Localization with Varying\n Scales","summary":" We present AGL-NET, a novel learning-based method for global localization\nusing LiDAR point clouds and satellite maps. AGL-NET tackles two critical\nchallenges: bridging the representation gap between image and points modalities\nfor robust feature matching, and handling inherent scale discrepancies between\nglobal view and local view. To address these challenges, AGL-NET leverages a\nunified network architecture with a novel two-stage matching design. The first\nstage extracts informative neural features directly from raw sensor data and\nperforms initial feature matching. The second stage refines this matching\nprocess by extracting informative skeleton features and incorporating a novel\nscale alignment step to rectify scale variations between LiDAR and map data.\nFurthermore, a novel scale and skeleton loss function guides the network toward\nlearning scale-invariant feature representations, eliminating the need for\npre-processing satellite maps. This significantly improves real-world\napplicability in scenarios with unknown map scales. To facilitate rigorous\nperformance evaluation, we introduce a meticulously designed dataset within the\nCARLA simulator specifically tailored for metric localization training and\nassessment. The code and dataset will be made publicly available.\n","authors":["Tianrui Guan","Ruiqi Xian","Xijun Wang","Xiyang Wu","Mohamed Elnoor","Daeun Song","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2404.03187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12301v3","updated":"2024-04-04T04:11:05Z","published":"2023-07-23T11:50:27Z","title":"Image Outlier Detection Without Training using RANSAC","summary":" Image outlier detection (OD) is an essential tool to ensure the quality of\nimages used in computer vision tasks. Existing algorithms often involve\ntraining a model to represent the inlier distribution, and outliers are\ndetermined by some deviation measure. Although existing methods proved\neffective when trained on strictly inlier samples, their performance remains\nquestionable when undesired outliers are included during training. As a result\nof this limitation, it is necessary to carefully examine the data when\ndeveloping OD models for new domains. In this work, we present a novel image OD\nalgorithm called RANSAC-NN that eliminates the need of data examination and\nmodel training altogether. Unlike existing approaches, RANSAC-NN can be\ndirectly applied on datasets containing outliers by sampling and comparing\nsubsets of the data. Our algorithm maintains favorable performance compared to\nexisting methods on a range of benchmarks. 
Furthermore, we show that RANSAC-NN\ncan enhance the robustness of existing methods by incorporating our algorithm\nas part of the data preparation process.\n","authors":["Chen-Han Tsai","Yu-Shao Peng"],"pdf_url":"https://arxiv.org/pdf/2307.12301v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06647v3","updated":"2024-04-04T04:07:48Z","published":"2023-07-13T09:23:21Z","title":"DeepIPCv2: LiDAR-powered Robust Environmental Perception and\n Navigational Control for Autonomous Vehicle","summary":" We present DeepIPCv2, an autonomous driving model that perceives the\nenvironment using a LiDAR sensor for more robust drivability, especially when\ndriving under poor illumination conditions where everything is not clearly\nvisible. DeepIPCv2 takes a set of LiDAR point clouds as the main perception\ninput. Since point clouds are not affected by illumination changes, they can\nprovide a clear observation of the surroundings no matter what the condition\nis. This results in a better scene understanding and stable features provided\nby the perception module to support the controller module in estimating\nnavigational control properly. To evaluate its performance, we conduct several\ntests by deploying the model to predict a set of driving records and perform\nreal automated driving under three different conditions. We also conduct\nablation and comparative studies with some recent models to justify its\nperformance. Based on the experimental results, DeepIPCv2 shows a robust\nperformance by achieving the best drivability in all driving scenarios.\nFurthermore, to support future research, we will upload the codes and data to\nhttps://github.com/oskarnatan/DeepIPCv2.\n","authors":["Oskar Natan","Jun Miura"],"pdf_url":"https://arxiv.org/pdf/2307.06647v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03183v1","updated":"2024-04-04T03:45:17Z","published":"2024-04-04T03:45:17Z","title":"BodyMAP -- Jointly Predicting Body Mesh and 3D Applied Pressure Map for\n People in Bed","summary":" Accurately predicting the 3D human posture and the pressure exerted on the\nbody for people resting in bed, visualized as a body mesh (3D pose & shape)\nwith a 3D pressure map, holds significant promise for healthcare applications,\nparticularly, in the prevention of pressure ulcers. Current methods focus on\nsingular facets of the problem -- predicting only 2D/3D poses, generating 2D\npressure images, predicting pressure only for certain body regions instead of\nthe full body, or forming indirect approximations to the 3D pressure map. In\ncontrast, we introduce BodyMAP, which jointly predicts the human body mesh and\n3D applied pressure map across the entire human body. Our network leverages\nmultiple visual modalities, incorporating both a depth image of a person in bed\nand its corresponding 2D pressure image acquired from a pressure-sensing\nmattress. The 3D pressure map is represented as a pressure value at each mesh\nvertex and thus allows for precise localization of high-pressure regions on the\nbody. Additionally, we present BodyMAP-WS, a new formulation of pressure\nprediction in which we implicitly learn pressure in 3D by aligning sensed 2D\npressure images with a differentiable 2D projection of the predicted 3D\npressure maps. In evaluations with real-world human data, our method\noutperforms the current state-of-the-art technique by 25% on both body mesh and\n3D applied pressure map prediction tasks for people in bed.\n","authors":["Abhishek Tandon","Anujraaj Goyal","Henry M. 
Clever","Zackory Erickson"],"pdf_url":"https://arxiv.org/pdf/2404.03183v1.pdf","comment":"Accepted at CVPR 2024 Project Website: https://bodymap3d.github.io/\n Code: https://github.com/RCHI-Lab/BodyMAP"},{"id":"http://arxiv.org/abs/2404.03181v1","updated":"2024-04-04T03:30:49Z","published":"2024-04-04T03:30:49Z","title":"MonoCD: Monocular 3D Object Detection with Complementary Depths","summary":" Monocular 3D object detection has attracted widespread attention due to its\npotential to accurately obtain object 3D localization from a single image at a\nlow cost. Depth estimation is an essential but challenging subtask of monocular\n3D object detection due to the ill-posedness of 2D to 3D mapping. Many methods\nexplore multiple local depth clues such as object heights and keypoints and\nthen formulate the object depth estimation as an ensemble of multiple depth\npredictions to mitigate the insufficiency of single-depth information. However,\nthe errors of existing multiple depths tend to have the same sign, which\nhinders them from neutralizing each other and limits the overall accuracy of\ncombined depth. To alleviate this problem, we propose to increase the\ncomplementarity of depths with two novel designs. First, we add a new depth\nprediction branch named complementary depth that utilizes global and efficient\ndepth clues from the entire image rather than the local clues to reduce the\ncorrelation of depth predictions. Second, we propose to fully exploit the\ngeometric relations between multiple depth clues to achieve complementarity in\nform. Benefiting from these designs, our method achieves higher\ncomplementarity. Experiments on the KITTI benchmark demonstrate that our method\nachieves state-of-the-art performance without introducing extra data. In\naddition, complementary depth can also be a lightweight and plug-and-play\nmodule to boost multiple existing monocular 3d object detectors. Code is\navailable at https://github.com/elvintanhust/MonoCD.\n","authors":["Longfei Yan","Pei Yan","Shengzhou Xiong","Xuanyu Xiang","Yihua Tan"],"pdf_url":"https://arxiv.org/pdf/2404.03181v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03179v1","updated":"2024-04-04T03:28:57Z","published":"2024-04-04T03:28:57Z","title":"UniAV: Unified Audio-Visual Perception for Multi-Task Video Localization","summary":" Video localization tasks aim to temporally locate specific instances in\nvideos, including temporal action localization (TAL), sound event detection\n(SED) and audio-visual event localization (AVEL). Existing methods\nover-specialize on each task, overlooking the fact that these instances often\noccur in the same video to form the complete video content. In this work, we\npresent UniAV, a Unified Audio-Visual perception network, to achieve joint\nlearning of TAL, SED and AVEL tasks for the first time. UniAV can leverage\ndiverse data available in task-specific datasets, allowing the model to learn\nand share mutually beneficial knowledge across tasks and modalities. To tackle\nthe challenges posed by substantial variations in datasets\n(size/domain/duration) and distinct task characteristics, we propose to\nuniformly encode visual and audio modalities of all videos to derive generic\nrepresentations, while also designing task-specific experts to capture unique\nknowledge for each task. 
Besides, we develop a unified language-aware\nclassifier by utilizing a pre-trained text encoder, enabling the model to\nflexibly detect various types of instances and previously unseen ones by simply\nchanging prompts during inference. UniAV outperforms its single-task\ncounterparts by a large margin with fewer parameters, achieving on-par or\nsuperior performances compared to state-of-the-art task-specific methods across\nActivityNet 1.3, DESED and UnAV-100 benchmarks.\n","authors":["Tiantian Geng","Teng Wang","Yanfu Zhang","Jinming Duan","Weili Guan","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.03179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02232v2","updated":"2024-04-04T03:10:58Z","published":"2023-12-04T06:37:11Z","title":"HumanNeRF-SE: A Simple yet Effective Approach to Animate HumanNeRF with\n Diverse Poses","summary":" We present HumanNeRF-SE, a simple yet effective method that synthesizes\ndiverse novel pose images with simple input. Previous HumanNeRF works require a\nlarge number of optimizable parameters to fit the human images. Instead, we\nreload these approaches by combining explicit and implicit human\nrepresentations to design both generalized rigid deformation and specific\nnon-rigid deformation. Our key insight is that explicit shape can reduce the\nsampling points used to fit implicit representation, and frozen blending\nweights from SMPL constructing a generalized rigid deformation can effectively\navoid overfitting and improve pose generalization performance. Our architecture\ninvolving both explicit and implicit representation is simple yet effective.\nExperiments demonstrate our model can synthesize images under arbitrary poses\nwith few-shot input and increase the speed of synthesizing images by 15 times\nthrough a reduction in computational complexity without using any existing\nacceleration modules. Compared to the state-of-the-art HumanNeRF studies,\nHumanNeRF-SE achieves better performance with fewer learnable parameters and\nless training time.\n","authors":["Caoyuan Ma","Yu-Lun Liu","Zhixiang Wang","Wu Liu","Xinchen Liu","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02232v2.pdf","comment":"16pages, 17 figures, 10 tables"},{"id":"http://arxiv.org/abs/2404.02405v2","updated":"2024-04-04T02:56:00Z","published":"2024-04-03T02:16:30Z","title":"TE-TAD: Towards Full End-to-End Temporal Action Detection via\n Time-Aligned Coordinate Expression","summary":" In this paper, we investigate that the normalized coordinate expression is a\nkey factor as reliance on hand-crafted components in query-based detectors for\ntemporal action detection (TAD). Despite significant advancements towards an\nend-to-end framework in object detection, query-based detectors have been\nlimited in achieving full end-to-end modeling in TAD. To address this issue, we\npropose \\modelname{}, a full end-to-end temporal action detection transformer\nthat integrates time-aligned coordinate expression. We reformulate coordinate\nexpression utilizing actual timeline values, ensuring length-invariant\nrepresentations from the extremely diverse video duration environment.\nFurthermore, our proposed adaptive query selection dynamically adjusts the\nnumber of queries based on video length, providing a suitable solution for\nvarying video durations compared to a fixed query set. Our approach not only\nsimplifies the TAD process by eliminating the need for hand-crafted components\nbut also significantly improves the performance of query-based detectors. 
Our\nTE-TAD outperforms the previous query-based detectors and achieves competitive\nperformance compared to state-of-the-art methods on popular benchmark datasets.\nCode is available at: https://github.com/Dotori-HJ/TE-TAD\n","authors":["Ho-Joong Kim","Jung-Ho Hong","Heejo Kong","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2404.02405v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2010.13187v2","updated":"2024-04-04T02:47:09Z","published":"2020-10-25T18:51:15Z","title":"Improving the Reconstruction of Disentangled Representation Learners via\n Multi-Stage Modeling","summary":" Current autoencoder-based disentangled representation learning methods\nachieve disentanglement by penalizing the (aggregate) posterior to encourage\nstatistical independence of the latent factors. This approach introduces a\ntrade-off between disentangled representation learning and reconstruction\nquality since the model does not have enough capacity to learn correlated\nlatent variables that capture detail information present in most image data. To\novercome this trade-off, we present a novel multi-stage modeling approach where\nthe disentangled factors are first learned using a penalty-based disentangled\nrepresentation learning method; then, the low-quality reconstruction is\nimproved with another deep generative model that is trained to model the\nmissing correlated latent variables, adding detail information while\nmaintaining conditioning on the previously learned disentangled factors. Taken\ntogether, our multi-stage modelling approach results in a single, coherent\nprobabilistic model that is theoretically justified by the principal of\nD-separation and can be realized with a variety of model classes including\nlikelihood-based models such as variational autoencoders, implicit models such\nas generative adversarial networks, and tractable models like normalizing flows\nor mixtures of Gaussians. We demonstrate that our multi-stage model has higher\nreconstruction quality than current state-of-the-art methods with equivalent\ndisentanglement performance across multiple standard benchmarks. In addition,\nwe apply the multi-stage model to generate synthetic tabular datasets,\nshowcasing an enhanced performance over benchmark models across a variety of\nmetrics. The interpretability analysis further indicates that the multi-stage\nmodel can effectively uncover distinct and meaningful features of variations\nfrom which the original distribution can be recovered.\n","authors":["Akash Srivastava","Yamini Bansal","Yukun Ding","Cole Lincoln Hurwitz","Kai Xu","Bernhard Egger","Prasanna Sattigeri","Joshua B. Tenenbaum","Phuong Le","Arun Prakash R","Nengfeng Zhou","Joel Vaughan","Yaquan Wang","Anwesha Bhattacharyya","Kristjan Greenewald","David D. Cox","Dan Gutfreund"],"pdf_url":"https://arxiv.org/pdf/2010.13187v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13739v2","updated":"2024-04-04T02:36:44Z","published":"2023-03-24T01:46:25Z","title":"WM-MoE: Weather-aware Multi-scale Mixture-of-Experts for Blind Adverse\n Weather Removal","summary":" Adverse weather removal tasks like deraining, desnowing, and dehazing are\nusually treated as separate tasks. 
However, in practical autonomous driving\nscenarios, the type, intensity, and mixing degree of weather are unknown, so\nhandling each task separately cannot deal with the complex practical scenarios.\nIn this paper, we study the blind adverse weather removal problem.\nMixture-of-Experts (MoE) is a popular model that adopts a learnable gate to\nroute the input to different expert networks. The principle of MoE involves\nusing adaptive networks to process different types of unknown inputs.\nTherefore, MoE has great potential for blind adverse weather removal. However,\nthe original MoE module is inadequate for coupled multiple weather types and\nfails to utilize multi-scale features for better performance. To this end, we\npropose a method called Weather-aware Multi-scale MoE (WM-MoE) based on\nTransformer for blind weather removal. WM-MoE includes two key designs:\nWEather-Aware Router (WEAR) and Multi-Scale Experts (MSE). WEAR assigns experts\nfor each image token based on decoupled content and weather features, which\nenhances the model's capability to process multiple adverse weathers. To obtain\ndiscriminative weather features from images, we propose Weather Guidance\nFine-grained Contrastive Learning (WGF-CL), which utilizes weather cluster\ninformation to guide the assignment of positive and negative samples for each\nimage token. Since processing different weather types requires different\nreceptive fields, MSE leverages multi-scale features to enhance the spatial\nrelationship modeling capability, facilitating the high-quality restoration of\ndiverse weather types and intensities. Our method achieves state-of-the-art\nperformance in blind adverse weather removal on two public datasets and our\ndataset. We also demonstrate the advantage of our method on downstream\nsegmentation tasks.\n","authors":["Yulin Luo","Rui Zhao","Xiaobao Wei","Jinwei Chen","Yijie Lu","Shenghao Xie","Tianyu Wang","Ruiqin Xiong","Ming Lu","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.13739v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03161v1","updated":"2024-04-04T02:22:37Z","published":"2024-04-04T02:22:37Z","title":"BioVL-QR: Egocentric Biochemical Video-and-Language Dataset Using Micro\n QR Codes","summary":" This paper introduces a biochemical vision-and-language dataset, which\nconsists of 24 egocentric experiment videos, corresponding protocols, and\nvideo-and-language alignments. The key challenge in the wet-lab domain is that\ndetecting equipment, reagents, and containers is difficult because the lab\nenvironment is cluttered with objects filling the table and some objects are\nindistinguishable. Therefore, previous studies assume that objects are manually\nannotated and given for downstream tasks, but this is costly and\ntime-consuming. To address this issue, this study focuses on Micro QR Codes to\ndetect objects automatically. From our preliminary study, we found that\ndetecting objects only using Micro QR Codes is still difficult because the\nresearchers manipulate objects, causing blur and occlusion frequently. To\naddress this, we also propose a novel object labeling method by combining a\nMicro QR Code detector and an off-the-shelf hand object detector. 
As one of the\napplications of our dataset, we conduct the task of generating protocols from\nexperiment videos and find that our approach can generate accurate protocols.\n","authors":["Taichi Nishimura","Koki Yamamoto","Yuto Haneji","Keiya Kajimura","Chihiro Nishiwaki","Eriko Daikoku","Natsuko Okuda","Fumihito Ono","Hirotaka Kameko","Shinsuke Mori"],"pdf_url":"https://arxiv.org/pdf/2404.03161v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2404.03159v1","updated":"2024-04-04T02:15:16Z","published":"2024-04-04T02:15:16Z","title":"HandDiff: 3D Hand Pose Estimation with Diffusion on Image-Point Cloud","summary":" Extracting keypoint locations from input hand frames, known as 3D hand pose\nestimation, is a critical task in various human-computer interaction\napplications. Essentially, the 3D hand pose estimation can be regarded as a 3D\npoint subset generative problem conditioned on input frames. Thanks to the\nrecent significant progress on diffusion-based generative models, hand pose\nestimation can also benefit from the diffusion model to estimate keypoint\nlocations with high quality. However, directly deploying the existing diffusion\nmodels to solve hand pose estimation is non-trivial, since they cannot achieve\nthe complex permutation mapping and precise localization. Based on this\nmotivation, this paper proposes HandDiff, a diffusion-based hand pose\nestimation model that iteratively denoises accurate hand pose conditioned on\nhand-shaped image-point clouds. In order to recover keypoint permutation and\naccurate location, we further introduce joint-wise condition and local detail\ncondition. Experimental results demonstrate that the proposed HandDiff\nsignificantly outperforms the existing approaches on four challenging hand pose\nbenchmark datasets. Codes and pre-trained models are publicly available at\nhttps://github.com/cwc1260/HandDiff.\n","authors":["Wencan Cheng","Hao Tang","Luc Van Gool","Jong Hwan Ko"],"pdf_url":"https://arxiv.org/pdf/2404.03159v1.pdf","comment":"Accepted as a conference paper to the Conference on Computer Vision\n and Pattern Recognition (2024)"},{"id":"http://arxiv.org/abs/2404.01518v2","updated":"2024-04-04T02:06:15Z","published":"2024-04-01T22:53:47Z","title":"Temporally Consistent Unbalanced Optimal Transport for Unsupervised\n Action Segmentation","summary":" We propose a novel approach to the action segmentation task for long,\nuntrimmed videos, based on solving an optimal transport problem. By encoding a\ntemporal consistency prior into a Gromov-Wasserstein problem, we are able to\ndecode a temporally consistent segmentation from a noisy affinity/matching cost\nmatrix between video frames and action classes. Unlike previous approaches, our\nmethod does not require knowing the action order for a video to attain temporal\nconsistency. Furthermore, our resulting (fused) Gromov-Wasserstein problem can\nbe efficiently solved on GPUs using a few iterations of projected mirror\ndescent. We demonstrate the effectiveness of our method in an unsupervised\nlearning setting, where our method is used to generate pseudo-labels for\nself-training. 
We evaluate our segmentation approach and unsupervised learning\npipeline on the Breakfast, 50-Salads, YouTube Instructions and Desktop Assembly\ndatasets, yielding state-of-the-art results for the unsupervised video action\nsegmentation task.\n","authors":["Ming Xu","Stephen Gould"],"pdf_url":"https://arxiv.org/pdf/2404.01518v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03145v1","updated":"2024-04-04T01:39:01Z","published":"2024-04-04T01:39:01Z","title":"DreamWalk: Style Space Exploration using Diffusion Guidance","summary":" Text-conditioned diffusion models can generate impressive images, but fall\nshort when it comes to fine-grained control. Unlike direct-editing tools like\nPhotoshop, text conditioned models require the artist to perform \"prompt\nengineering,\" constructing special text sentences to control the style or\namount of a particular subject present in the output image. Our goal is to\nprovide fine-grained control over the style and substance specified by the\nprompt, for example to adjust the intensity of styles in different regions of\nthe image (Figure 1). Our approach is to decompose the text prompt into\nconceptual elements, and apply a separate guidance term for each element in a\nsingle diffusion process. We introduce guidance scale functions to control when\nin the diffusion process and \\emph{where} in the image to intervene. Since the\nmethod is based solely on adjusting diffusion guidance, it does not require\nfine-tuning or manipulating the internal layers of the diffusion model's neural\nnetwork, and can be used in conjunction with LoRA- or DreamBooth-trained models\n(Figure2). Project page: https://mshu1.github.io/dreamwalk.github.io/\n","authors":["Michelle Shu","Charles Herrmann","Richard Strong Bowen","Forrester Cole","Ramin Zabih"],"pdf_url":"https://arxiv.org/pdf/2404.03145v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03144v1","updated":"2024-04-04T01:34:36Z","published":"2024-04-04T01:34:36Z","title":"Diverse and Tailored Image Generation for Zero-shot Multi-label\n Classification","summary":" Recently, zero-shot multi-label classification has garnered considerable\nattention for its capacity to operate predictions on unseen labels without\nhuman annotations. Nevertheless, prevailing approaches often use seen classes\nas imperfect proxies for unseen ones, resulting in suboptimal performance.\nDrawing inspiration from the success of text-to-image generation models in\nproducing realistic images, we propose an innovative solution: generating\nsynthetic data to construct a training set explicitly tailored for proxyless\ntraining on unseen labels. Our approach introduces a novel image generation\nframework that produces multi-label synthetic images of unseen classes for\nclassifier training. To enhance diversity in the generated images, we leverage\na pre-trained large language model to generate diverse prompts. Employing a\npre-trained multi-modal CLIP model as a discriminator, we assess whether the\ngenerated images accurately represent the target classes. This enables\nautomatic filtering of inaccurately generated images, preserving classifier\naccuracy. To refine text prompts for more precise and effective multi-label\nobject generation, we introduce a CLIP score-based discriminative loss to\nfine-tune the text encoder in the diffusion model. 
Additionally, to enhance\nvisual features on the target task while maintaining the generalization of\noriginal features and mitigating catastrophic forgetting resulting from\nfine-tuning the entire visual encoder, we propose a feature fusion module\ninspired by transformer attention mechanisms. This module aids in capturing\nglobal dependencies between multiple objects more effectively. Extensive\nexperimental results validate the effectiveness of our approach, demonstrating\nsignificant improvements over state-of-the-art methods.\n","authors":["Kaixin Zhang","Zhixiang Yuan","Tao Huang"],"pdf_url":"https://arxiv.org/pdf/2404.03144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03138v1","updated":"2024-04-04T01:22:23Z","published":"2024-04-04T01:22:23Z","title":"Discontinuity-preserving Normal Integration with Auxiliary Edges","summary":" Many surface reconstruction methods incorporate normal integration, which is\na process to obtain a depth map from surface gradients. In this process, the\ninput may represent a surface with discontinuities, e.g., due to\nself-occlusion. To reconstruct an accurate depth map from the input normal map,\nhidden surface gradients occurring from the jumps must be handled. To model\nthese jumps correctly, we design a novel discretization scheme for the domain\nof normal integration. Our key idea is to introduce auxiliary edges, which\nbridge between piecewise-smooth patches in the domain so that the magnitude of\nhidden jumps can be explicitly expressed. Using the auxiliary edges, we design\na novel algorithm to optimize the discontinuity and the depth map from the\ninput normal map. Our method optimizes discontinuities by using a combination\nof iterative re-weighted least squares and iterative filtering of the jump\nmagnitudes on auxiliary edges to provide strong sparsity regularization.\nCompared to previous discontinuity-preserving normal integration methods, which\nmodel the magnitudes of jumps only implicitly, our method reconstructs subtle\ndiscontinuities accurately thanks to our explicit representation of jumps\nallowing for strong sparsity regularization.\n","authors":["Hyomin Kim","Yucheol Jung","Seungyong Lee"],"pdf_url":"https://arxiv.org/pdf/2404.03138v1.pdf","comment":"To appear at CVPR 2024. For supplementary video, see\n https://youtu.be/MTTcW5kAOFE"},{"id":"http://arxiv.org/abs/2404.02072v2","updated":"2024-04-04T00:59:51Z","published":"2024-04-02T16:20:02Z","title":"EGTR: Extracting Graph from Transformer for Scene Graph Generation","summary":" Scene Graph Generation (SGG) is a challenging task of detecting objects and\npredicting relationships between objects. After DETR was developed, one-stage\nSGG models based on a one-stage object detector have been actively studied.\nHowever, complex modeling is used to predict the relationship between objects,\nand the inherent relationship between object queries learned in the multi-head\nself-attention of the object detector has been neglected. We propose a\nlightweight one-stage SGG model that extracts the relation graph from the\nvarious relationships learned in the multi-head self-attention layers of the\nDETR decoder. 
By fully utilizing the self-attention by-products, the relation\ngraph can be extracted effectively with a shallow relation extraction head.\nConsidering the dependency of the relation extraction task on the object\ndetection task, we propose a novel relation smoothing technique that adjusts\nthe relation label adaptively according to the quality of the detected objects.\nBy the relation smoothing, the model is trained according to the continuous\ncurriculum that focuses on object detection task at the beginning of training\nand performs multi-task learning as the object detection performance gradually\nimproves. Furthermore, we propose a connectivity prediction task that predicts\nwhether a relation exists between object pairs as an auxiliary task of the\nrelation extraction. We demonstrate the effectiveness and efficiency of our\nmethod for the Visual Genome and Open Image V6 datasets. Our code is publicly\navailable at https://github.com/naver-ai/egtr.\n","authors":["Jinbae Im","JeongYeon Nam","Nokyung Park","Hyungmin Lee","Seunghyun Park"],"pdf_url":"https://arxiv.org/pdf/2404.02072v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03126v1","updated":"2024-04-04T00:28:50Z","published":"2024-04-04T00:28:50Z","title":"GaSpCT: Gaussian Splatting for Novel CT Projection View Synthesis","summary":" We present GaSpCT, a novel view synthesis and 3D scene representation method\nused to generate novel projection views for Computer Tomography (CT) scans. We\nadapt the Gaussian Splatting framework to enable novel view synthesis in CT\nbased on limited sets of 2D image projections and without the need for\nStructure from Motion (SfM) methodologies. Therefore, we reduce the total\nscanning duration and the amount of radiation dose the patient receives during\nthe scan. We adapted the loss function to our use-case by encouraging a\nstronger background and foreground distinction using two sparsity promoting\nregularizers: a beta loss and a total variation (TV) loss. Finally, we\ninitialize the Gaussian locations across the 3D space using a uniform prior\ndistribution of where the brain's positioning would be expected to be within\nthe field of view. We evaluate the performance of our model using brain CT\nscans from the Parkinson's Progression Markers Initiative (PPMI) dataset and\ndemonstrate that the rendered novel views closely match the original projection\nviews of the simulated scan, and have better performance than other implicit 3D\nscene representations methodologies. Furthermore, we empirically observe\nreduced training time compared to neural network based image synthesis for\nsparse-view CT image reconstruction. Finally, the memory requirements of the\nGaussian Splatting representations are reduced by 17% compared to the\nequivalent voxel grid image representations.\n","authors":["Emmanouil Nikolakakis","Utkarsh Gupta","Jonathan Vengosh","Justin Bui","Razvan Marinescu"],"pdf_url":"https://arxiv.org/pdf/2404.03126v1.pdf","comment":"Under Review Process for MICCAI 2024"},{"id":"http://arxiv.org/abs/2203.13856v2","updated":"2024-04-04T00:13:42Z","published":"2022-03-25T18:42:20Z","title":"Robust deep learning for eye fundus images: Bridging real and synthetic\n data for enhancing generalization","summary":" Deep learning applications for assessing medical images are limited because\nthe datasets are often small and imbalanced. 
The use of synthetic data has been\nproposed in the literature, but neither a robust comparison of the different\nmethods nor generalizability has been reported. Our approach integrates a\nretinal image quality assessment model and StyleGAN2 architecture to enhance\nAge-related Macular Degeneration (AMD) detection capabilities and improve\ngeneralizability. This work compares ten different Generative Adversarial\nNetwork (GAN) architectures to generate synthetic eye-fundus images with and\nwithout AMD. We combined subsets of three public databases (iChallenge-AMD,\nODIR-2019, and RIADD) to form a single training and test set. We employed the\nSTARE dataset for external validation, ensuring a comprehensive assessment of\nthe proposed approach. The results show that StyleGAN2 reached the lowest\nFrechet Inception Distance (166.17), and clinicians could not accurately\ndifferentiate between real and synthetic images. ResNet-18 architecture\nobtained the best performance with 85% accuracy and outperformed the two human\nexperts (80% and 75%) in detecting AMD fundus images. The accuracy rates were\n82.8% for the test set and 81.3% for the STARE dataset, demonstrating the\nmodel's generalizability. The proposed methodology for synthetic medical image\ngeneration has been validated for robustness and accuracy, with free access to\nits code for further research and development in this field.\n","authors":["Guilherme C. Oliveira","Gustavo H. Rosa","Daniel C. G. Pedronette","João P. Papa","Himeesh Kumar","Leandro A. Passos","Dinesh Kumar"],"pdf_url":"https://arxiv.org/pdf/2203.13856v2.pdf","comment":"Accepted by the Biomedical Signal Processing and Control"},{"id":"http://arxiv.org/abs/2009.04650v2","updated":"2024-04-04T15:25:22Z","published":"2020-09-10T02:55:27Z","title":"Towards Fine-grained Large Object Segmentation 1st Place Solution to 3D\n AI Challenge 2020 -- Instance Segmentation Track","summary":" This technical report introduces our solutions of Team 'FineGrainedSeg' for\nInstance Segmentation track in 3D AI Challenge 2020. In order to handle\nextremely large objects in 3D-FUTURE, we adopt PointRend as our basic\nframework, which outputs more fine-grained masks compared to HTC and SOLOv2.\nOur final submission is an ensemble of 5 PointRend models, which achieves the\n1st place on both validation and test leaderboards. The code is available at\nhttps://github.com/zehuichen123/3DFuture_ins_seg.\n","authors":["Zehui Chen","Qiaofei Li","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2009.04650v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/1902.11122v5","updated":"2024-04-04T11:34:52Z","published":"2019-02-22T10:09:11Z","title":"Deep Learning in Cardiology","summary":" The medical field is creating large amount of data that physicians are unable\nto decipher and use efficiently. Moreover, rule-based expert systems are\ninefficient in solving complicated medical tasks or for creating insights using\nbig data. Deep learning has emerged as a more accurate and effective technology\nin a wide range of medical problems such as diagnosis, prediction and\nintervention. Deep learning is a representation learning method that consists\nof layers that transform the data non-linearly, thus, revealing hierarchical\nrelationships and structures. In this review we survey deep learning\napplication papers that use structured data, signal and imaging modalities from\ncardiology. 
We discuss the advantages and limitations of applying deep learning\nin cardiology that also apply in medicine in general, while proposing certain\ndirections as the most viable for clinical use.\n","authors":["Paschalis Bizopoulos","Dimitrios Koutsouris"],"pdf_url":"https://arxiv.org/pdf/1902.11122v5.pdf","comment":"27 pages, 2 figures, 10 tables"},{"id":"http://arxiv.org/abs/2404.03836v1","updated":"2024-04-04T23:38:45Z","published":"2024-04-04T23:38:45Z","title":"PARIS3D: Reasoning-based 3D Part Segmentation Using Large Multimodal\n Model","summary":" Recent advancements in 3D perception systems have significantly improved\ntheir ability to perform visual recognition tasks such as segmentation.\nHowever, these systems still heavily rely on explicit human instruction to\nidentify target objects or categories, lacking the capability to actively\nreason and comprehend implicit user intentions. We introduce a novel\nsegmentation task known as reasoning part segmentation for 3D objects, aiming\nto output a segmentation mask based on complex and implicit textual queries\nabout specific parts of a 3D object. To facilitate evaluation and benchmarking,\nwe present a large 3D dataset comprising over 60k instructions paired with\ncorresponding ground-truth part segmentation annotations specifically curated\nfor reasoning-based 3D part segmentation. We propose a model that is capable of\nsegmenting parts of 3D objects based on implicit textual queries and generating\nnatural language explanations corresponding to 3D object segmentation requests.\nExperiments show that our method achieves competitive performance to models\nthat use explicit queries, with the additional abilities to identify part\nconcepts, reason about them, and complement them with world knowledge. Our\nsource code, dataset, and trained models are available at\nhttps://github.com/AmrinKareem/PARIS3D.\n","authors":["Amrin Kareem","Jean Lahoud","Hisham Cholakkal"],"pdf_url":"https://arxiv.org/pdf/2404.03836v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2404.03831v1","updated":"2024-04-04T23:24:14Z","published":"2024-04-04T23:24:14Z","title":"SleepVST: Sleep Staging from Near-Infrared Video Signals using\n Pre-Trained Transformers","summary":" Advances in camera-based physiological monitoring have enabled the robust,\nnon-contact measurement of respiration and the cardiac pulse, which are known\nto be indicative of the sleep stage. This has led to research into camera-based\nsleep monitoring as a promising alternative to \"gold-standard\" polysomnography,\nwhich is cumbersome, expensive to administer, and hence unsuitable for\nlonger-term clinical studies. In this paper, we introduce SleepVST, a\ntransformer model which enables state-of-the-art performance in camera-based\nsleep stage classification (sleep staging). After pre-training on contact\nsensor data, SleepVST outperforms existing methods for cardio-respiratory sleep\nstaging on the SHHS and MESA datasets, achieving total Cohen's kappa scores of\n0.75 and 0.77 respectively. We then show that SleepVST can be successfully\ntransferred to cardio-respiratory waveforms extracted from video, enabling\nfully contact-free sleep staging. Using a video dataset of 50 nights, we\nachieve a total accuracy of 78.8\\% and a Cohen's $\\kappa$ of 0.71 in four-class\nvideo-based sleep staging, setting a new state-of-the-art in the domain.\n","authors":["Jonathan F. 
Carter","João Jorge","Oliver Gibson","Lionel Tarassenko"],"pdf_url":"https://arxiv.org/pdf/2404.03831v1.pdf","comment":"CVPR 2024 Highlight Paper"},{"id":"http://arxiv.org/abs/2305.05006v2","updated":"2024-04-04T22:51:42Z","published":"2023-05-08T19:25:50Z","title":"Synthesis of Annotated Colorectal Cancer Tissue Images from Gland Layout","summary":" Generating realistic tissue images with annotations is a challenging task\nthat is important in many computational histopathology applications.\nSynthetically generated images and annotations are valuable for training and\nevaluating algorithms in this domain. To address this, we propose an\ninteractive framework generating pairs of realistic colorectal cancer histology\nimages with corresponding glandular masks from glandular structure layouts. The\nframework accurately captures vital features like stroma, goblet cells, and\nglandular lumen. Users can control gland appearance by adjusting parameters\nsuch as the number of glands, their locations, and sizes. The generated images\nexhibit good Frechet Inception Distance (FID) scores compared to the\nstate-of-the-art image-to-image translation model. Additionally, we demonstrate\nthe utility of our synthetic annotations for evaluating gland segmentation\nalgorithms. Furthermore, we present a methodology for constructing glandular\nmasks using advanced deep generative models, such as latent diffusion models.\nThese masks enable tissue image generation through a residual encoder-decoder\nnetwork.\n","authors":["Srijay Deshpande","Fayyaz Minhas","Nasir Rajpoot"],"pdf_url":"https://arxiv.org/pdf/2305.05006v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16812v2","updated":"2024-04-04T22:31:18Z","published":"2023-12-28T04:14:55Z","title":"Spacetime Gaussian Feature Splatting for Real-Time Dynamic View\n Synthesis","summary":" Novel view synthesis of dynamic scenes has been an intriguing yet challenging\nproblem. Despite recent advancements, simultaneously achieving high-resolution\nphotorealistic results, real-time rendering, and compact storage remains a\nformidable task. To address these challenges, we propose Spacetime Gaussian\nFeature Splatting as a novel dynamic scene representation, composed of three\npivotal components. First, we formulate expressive Spacetime Gaussians by\nenhancing 3D Gaussians with temporal opacity and parametric motion/rotation.\nThis enables Spacetime Gaussians to capture static, dynamic, as well as\ntransient content within a scene. Second, we introduce splatted feature\nrendering, which replaces spherical harmonics with neural features. These\nfeatures facilitate the modeling of view- and time-dependent appearance while\nmaintaining small size. Third, we leverage the guidance of training error and\ncoarse depth to sample new Gaussians in areas that are challenging to converge\nwith existing pipelines. Experiments on several established real-world datasets\ndemonstrate that our method achieves state-of-the-art rendering quality and\nspeed, while retaining compact storage. At 8K resolution, our lite-version\nmodel can render at 60 FPS on an Nvidia RTX 4090 GPU. Our code is available at\nhttps://github.com/oppo-us-research/SpacetimeGaussians.\n","authors":["Zhan Li","Zhang Chen","Zhong Li","Yi Xu"],"pdf_url":"https://arxiv.org/pdf/2312.16812v2.pdf","comment":"Accepted to CVPR 2024. 
Project page:\n https://oppo-us-research.github.io/SpacetimeGaussians-website/"},{"id":"http://arxiv.org/abs/2404.03819v1","updated":"2024-04-04T22:31:15Z","published":"2024-04-04T22:31:15Z","title":"Effective Lymph Nodes Detection in CT Scans Using Location Debiased\n Query Selection and Contrastive Query Representation in Transformer","summary":" Lymph node (LN) assessment is a critical, indispensable yet very challenging\ntask in the routine clinical workflow of radiology and oncology. Accurate LN\nanalysis is essential for cancer diagnosis, staging, and treatment planning.\nFinding scatteredly distributed, low-contrast clinically relevant LNs in 3D CT\nis difficult even for experienced physicians under high inter-observer\nvariations. Previous automatic LN detection works typically yield limited\nrecall and high false positives (FPs) due to adjacent anatomies with similar\nimage intensities, shapes, or textures (vessels, muscles, esophagus, etc). In\nthis work, we propose a new LN DEtection TRansformer, named LN-DETR, to achieve\nmore accurate performance. By enhancing the 2D backbone with a multi-scale 2.5D\nfeature fusion to incorporate 3D context explicitly, more importantly, we make\ntwo main contributions to improve the representation quality of LN queries. 1)\nConsidering that LN boundaries are often unclear, an IoU prediction head and a\nlocation debiased query selection are proposed to select LN queries of higher\nlocalization accuracy as the decoder query's initialization. 2) To reduce FPs,\nquery contrastive learning is employed to explicitly reinforce LN queries\ntowards their best-matched ground-truth queries over unmatched query\npredictions. Trained and tested on 3D CT scans of 1067 patients (with 10,000+\nlabeled LNs) via combining seven LN datasets from different body parts (neck,\nchest, and abdomen) and pathologies/cancers, our method significantly improves\nthe performance of previous leading methods by > 4-5% average recall at the\nsame FP rates in both internal and external testing. We further evaluate on the\nuniversal lesion detection task using NIH DeepLesion benchmark, and our method\nachieves the top performance of 88.46% averaged recall across 0.5 to 4 FPs per\nimage, compared with other leading reported results.\n","authors":["Qinji Yu","Yirui Wang","Ke Yan","Haoshen Li","Dazhou Guo","Li Zhang","Le Lu","Na Shen","Qifeng Wang","Xiaowei Ding","Xianghua Ye","Dakai Jin"],"pdf_url":"https://arxiv.org/pdf/2404.03819v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2404.03799v1","updated":"2024-04-04T20:42:49Z","published":"2024-04-04T20:42:49Z","title":"Language-Guided Instance-Aware Domain-Adaptive Panoptic Segmentation","summary":" The increasing relevance of panoptic segmentation is tied to the advancements\nin autonomous driving and AR/VR applications. However, the deployment of such\nmodels has been limited due to the expensive nature of dense data annotation,\ngiving rise to unsupervised domain adaptation (UDA). A key challenge in\npanoptic UDA is reducing the domain gap between a labeled source and an\nunlabeled target domain while harmonizing the subtasks of semantic and instance\nsegmentation to limit catastrophic interference. While considerable progress\nhas been achieved, existing approaches mainly focus on the adaptation of\nsemantic segmentation. In this work, we focus on incorporating instance-level\nadaptation via a novel instance-aware cross-domain mixing strategy IMix. 
IMix\nsignificantly enhances the panoptic quality by improving instance segmentation\nperformance. Specifically, we propose inserting high-confidence predicted\ninstances from the target domain onto source images, retaining the\nexhaustiveness of the resulting pseudo-labels while reducing the injected\nconfirmation bias. Nevertheless, such an enhancement comes at the cost of\ndegraded semantic performance, attributed to catastrophic forgetting. To\nmitigate this issue, we regularize our semantic branch by employing CLIP-based\ndomain alignment (CDA), exploiting the domain-robustness of natural language\nprompts. Finally, we present an end-to-end model incorporating these two\nmechanisms called LIDAPS, achieving state-of-the-art results on all popular\npanoptic UDA benchmarks.\n","authors":["Elham Amin Mansour","Ozan Unal","Suman Saha","Benjamin Bejar","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2404.03799v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03789v1","updated":"2024-04-04T20:04:12Z","published":"2024-04-04T20:04:12Z","title":"Quantifying Uncertainty in Motion Prediction with Variational Bayesian\n Mixture","summary":" Safety and robustness are crucial factors in developing trustworthy\nautonomous vehicles. One essential aspect of addressing these factors is to\nequip vehicles with the capability to predict future trajectories for all\nmoving objects in the surroundings and quantify prediction uncertainties. In\nthis paper, we propose the Sequential Neural Variational Agent (SeNeVA), a\ngenerative model that describes the distribution of future trajectories for a\nsingle moving object. Our approach can distinguish Out-of-Distribution data\nwhile quantifying uncertainty and achieving competitive performance compared to\nstate-of-the-art methods on the Argoverse 2 and INTERACTION datasets.\nSpecifically, a 0.446 meters minimum Final Displacement Error, a 0.203 meters\nminimum Average Displacement Error, and a 5.35% Miss Rate are achieved on the\nINTERACTION test set. Extensive qualitative and quantitative analysis is also\nprovided to evaluate the proposed model. Our open-source code is available at\nhttps://github.com/PurdueDigitalTwin/seneva.\n","authors":["Juanwu Lu","Can Cui","Yunsheng Ma","Aniket Bera","Ziran Wang"],"pdf_url":"https://arxiv.org/pdf/2404.03789v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03784v1","updated":"2024-04-04T19:55:11Z","published":"2024-04-04T19:55:11Z","title":"Layerwise Early Stopping for Test Time Adaptation","summary":" Test Time Adaptation (TTA) addresses the problem of distribution shift by\nenabling pretrained models to learn new features on an unseen domain at test\ntime. However, it poses a significant challenge to maintain a balance between\nlearning new features and retaining useful pretrained features. In this paper,\nwe propose Layerwise EArly STopping (LEAST) for TTA to address this problem.\nThe key idea is to stop adapting individual layers during TTA if the features\nbeing learned do not appear beneficial for the new domain. For that purpose, we\npropose using a novel gradient-based metric to measure the relevance of the\ncurrent learnt features to the new domain without the need for supervised\nlabels. More specifically, we propose to use this metric to determine\ndynamically when to stop updating each layer during TTA. This enables a more\nbalanced adaptation, restricted to layers benefiting from it, and only for a\ncertain number of steps. 
Such an approach also has the added effect of limiting\nthe forgetting of pretrained features useful for dealing with new domains.\nThrough extensive experiments, we demonstrate that Layerwise Early Stopping\nimproves the performance of existing TTA approaches across multiple datasets,\ndomain shifts, model architectures, and TTA losses.\n","authors":["Sabyasachi Sahoo","Mostafa ElAraby","Jonas Ngnawe","Yann Pequignot","Frederic Precioso","Christian Gagne"],"pdf_url":"https://arxiv.org/pdf/2404.03784v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.03778v1","updated":"2024-04-04T19:50:57Z","published":"2024-04-04T19:50:57Z","title":"Flattening the Parent Bias: Hierarchical Semantic Segmentation in the\n Poincaré Ball","summary":" Hierarchy is a natural representation of semantic taxonomies, including the\nones routinely used in image segmentation. Indeed, recent work on semantic\nsegmentation reports improved accuracy from supervised training leveraging\nhierarchical label structures. Encouraged by these results, we revisit the\nfundamental assumptions behind that work. We postulate and then empirically\nverify that the reasons for the observed improvement in segmentation accuracy\nmay be entirely unrelated to the use of the semantic hierarchy. To demonstrate\nthis, we design a range of cross-domain experiments with a representative\nhierarchical approach. We find that on the new testing domains, a flat\n(non-hierarchical) segmentation network, in which the parents are inferred from\nthe children, has superior segmentation accuracy to the hierarchical approach\nacross the board. Complementing these findings and inspired by the intrinsic\nproperties of hyperbolic spaces, we study a more principled approach to\nhierarchical segmentation using the Poincar\\'e ball model. The hyperbolic\nrepresentation largely outperforms the previous (Euclidean) hierarchical\napproach as well and is on par with our flat Euclidean baseline in terms of\nsegmentation accuracy. However, it additionally exhibits surprisingly strong\ncalibration quality of the parent nodes in the semantic hierarchy, especially\non the more challenging domains. Our combined analysis suggests that the\nestablished practice of hierarchical segmentation may be limited to in-domain\nsettings, whereas flat classifiers generalize substantially better, especially\nif they are modeled in the hyperbolic space.\n","authors":["Simon Weber","Barış Zöngür","Nikita Araslanov","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2404.03778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02733v2","updated":"2024-04-04T19:42:32Z","published":"2024-04-03T13:34:09Z","title":"InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image\n Generation","summary":" Tuning-free diffusion-based models have demonstrated significant potential in\nthe realm of image personalization and customization. However, despite this\nnotable progress, current models continue to grapple with several complex\nchallenges in producing style-consistent image generation. Firstly, the concept\nof style is inherently underdetermined, encompassing a multitude of elements\nsuch as color, material, atmosphere, design, and structure, among others.\nSecondly, inversion-based methods are prone to style degradation, often\nresulting in the loss of fine-grained details. Lastly, adapter-based approaches\nfrequently require meticulous weight tuning for each reference image to achieve\na balance between style intensity and text controllability. 
In this paper, we\ncommence by examining several compelling yet frequently overlooked\nobservations. We then proceed to introduce InstantStyle, a framework designed\nto address these issues through the implementation of two key strategies: 1) A\nstraightforward mechanism that decouples style and content from reference\nimages within the feature space, predicated on the assumption that features\nwithin the same space can be either added to or subtracted from one another. 2)\nThe injection of reference image features exclusively into style-specific\nblocks, thereby preventing style leaks and eschewing the need for cumbersome\nweight tuning, which often characterizes more parameter-heavy designs.Our work\ndemonstrates superior visual stylization outcomes, striking an optimal balance\nbetween the intensity of style and the controllability of textual elements. Our\ncodes will be available at https://github.com/InstantStyle/InstantStyle.\n","authors":["Haofan Wang","Matteo Spinelli","Qixun Wang","Xu Bai","Zekui Qin","Anthony Chen"],"pdf_url":"https://arxiv.org/pdf/2404.02733v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2307.00040v3","updated":"2024-04-04T19:41:09Z","published":"2023-06-30T17:37:48Z","title":"DisCo: Disentangled Control for Realistic Human Dance Generation","summary":" Generative AI has made significant strides in computer vision, particularly\nin text-driven image/video synthesis (T2I/T2V). Despite the notable\nadvancements, it remains challenging in human-centric content synthesis such as\nrealistic dance generation. Current methodologies, primarily tailored for human\nmotion transfer, encounter difficulties when confronted with real-world dance\nscenarios (e.g., social media dance), which require to generalize across a wide\nspectrum of poses and intricate human details. In this paper, we depart from\nthe traditional paradigm of human motion transfer and emphasize two additional\ncritical attributes for the synthesis of human dance content in social media\ncontexts: (i) Generalizability: the model should be able to generalize beyond\ngeneric human viewpoints as well as unseen human subjects, backgrounds, and\nposes; (ii) Compositionality: it should allow for the seamless composition of\nseen/unseen subjects, backgrounds, and poses from different sources. To address\nthese challenges, we introduce DISCO, which includes a novel model architecture\nwith disentangled control to improve the compositionality of dance synthesis,\nand an effective human attribute pre-training for better generalizability to\nunseen humans. Extensive qualitative and quantitative results demonstrate that\nDisCc can generate high-quality human dance images and videos with diverse\nappearances and flexible motions. 
Code is available at\nhttps://disco-dance.github.io/.\n","authors":["Tan Wang","Linjie Li","Kevin Lin","Yuanhao Zhai","Chung-Ching Lin","Zhengyuan Yang","Hanwang Zhang","Zicheng Liu","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2307.00040v3.pdf","comment":"Accepted by CVPR24"},{"id":"http://arxiv.org/abs/2312.12337v4","updated":"2024-04-04T19:04:55Z","published":"2023-12-19T17:03:50Z","title":"pixelSplat: 3D Gaussian Splats from Image Pairs for Scalable\n Generalizable 3D Reconstruction","summary":" We introduce pixelSplat, a feed-forward model that learns to reconstruct 3D\nradiance fields parameterized by 3D Gaussian primitives from pairs of images.\nOur model features real-time and memory-efficient rendering for scalable\ntraining as well as fast 3D reconstruction at inference time. To overcome local\nminima inherent to sparse and locally supported representations, we predict a\ndense probability distribution over 3D and sample Gaussian means from that\nprobability distribution. We make this sampling operation differentiable via a\nreparameterization trick, allowing us to back-propagate gradients through the\nGaussian splatting representation. We benchmark our method on wide-baseline\nnovel view synthesis on the real-world RealEstate10k and ACID datasets, where\nwe outperform state-of-the-art light field transformers and accelerate\nrendering by 2.5 orders of magnitude while reconstructing an interpretable and\neditable 3D radiance field.\n","authors":["David Charatan","Sizhe Li","Andrea Tagliasacchi","Vincent Sitzmann"],"pdf_url":"https://arxiv.org/pdf/2312.12337v4.pdf","comment":"Project page: https://dcharatan.github.io/pixelsplat"},{"id":"http://arxiv.org/abs/2305.07490v6","updated":"2024-04-04T18:55:18Z","published":"2023-05-12T14:04:30Z","title":"ArtGPT-4: Towards Artistic-understanding Large Vision-Language Models\n with Enhanced Adapter","summary":" The success of large language models (LLMs) has inspired an emerging research\nfield of multimodal learning. However, a grand challenge of exploiting LLMs for\nmultimodal learning is the size of pre-trained LLMs which are always with\nbillions of parameters. To tackle this challenge, models such as MiniGPT-4 and\nLLaVA have been developed to fine-tune the pre-trained models using fewer\nparameters. Despite their promising performance, these models remain limited in\ntheir understanding of artistic imagery. To facilitate better\nartistic-understanding, in this paper, we propose ArtGPT-4, a pioneering large\nvision-language model tailored to address the limitations of existing models in\nartistic comprehension. The key innovation of ArtGPT-4 lies in its craft for\nthe sophisticated challenge of artistic image comprehension, setting it apart\nfrom other models that overlook fine details for broader themes. Specifically,\nit works by integrating some specialized adapter layers into the LLM, enabling\nthe model to more efficiently and effectively parse and interpret complex\nvisual tokens, instead of fine-tuning the whole LLM as in the existing method.\nArtGPT-4 has demonstrated its outstanding performance on the efficiency:\nutilizing a Tesla A100 device, its training can be completed in mere 2 hours\nwith an image-text pair dataset comprising approximately 0.52M entries.\nAdditionally, ArtGPT-4 has also achieved state-of-the-art performance on the\nArtEmis and ArtEmis-v2.0 datasets as well as the benchmarks established in this\nwork, lagging behind professional artists' descriptions by a negligible 0.15\npoints on a 6-point scale. 
The outstanding performance of ArtGPT-4 shows that\nit can render images with an artistic-understanding and convey the emotions\nthey inspire, mirroring human interpretation. The code and the pre-trained\nmodel are accessible in \\url{https://github.com/DLYuanGod/ArtGPT-4}.\n","authors":["Zhengqing Yuan","Yunhong He","Kun Wang","Yanfang Ye","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2305.07490v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16862v2","updated":"2024-04-04T18:53:58Z","published":"2023-12-28T07:11:41Z","title":"TinyGPT-V: Efficient Multimodal Large Language Model via Small Backbones","summary":" In recent years, multimodal large language models (MLLMs) such as GPT-4V have\ndemonstrated remarkable advancements, excelling in a variety of vision-language\ntasks. Despite their prowess, the closed-source nature and computational\ndemands of such models limit their accessibility and applicability. This study\nintroduces TinyGPT-V, a novel open-source MLLM, designed for efficient training\nand inference across various vision-language tasks, including image captioning\n(IC) and visual question answering (VQA). Leveraging a compact yet powerful\narchitecture, TinyGPT-V integrates the Phi-2 language model with pre-trained\nvision encoders, utilizing a unique mapping module for visual and linguistic\ninformation fusion. With a training regimen optimized for small backbones and\nemploying a diverse dataset amalgam, TinyGPT-V requires significantly lower\ncomputational resources 24GB for training and as little as 8GB for inference\nwithout compromising on performance. Our experiments demonstrate that\nTinyGPT-V, with its language model 2.8 billion parameters, achieves comparable\nresults in VQA and image inference tasks to its larger counterparts while being\nuniquely suited for deployment on resource-constrained devices through\ninnovative quantization techniques. This work not only paves the way for more\naccessible and efficient MLLMs but also underscores the potential of smaller,\noptimized models in bridging the gap between high performance and computational\nefficiency in real-world applications. Additionally, this paper introduces a\nnew approach to multimodal large language models using smaller backbones. Our\ncode and training weights are available in\n\\url{https://github.com/DLYuanGod/TinyGPT-V}.\n","authors":["Zhengqing Yuan","Zhaoxu Li","Weiran Huang","Yanfang Ye","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2312.16862v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03754v1","updated":"2024-04-04T18:50:58Z","published":"2024-04-04T18:50:58Z","title":"Data Science for Geographic Information Systems","summary":" The integration of data science into Geographic Information Systems (GIS) has\nfacilitated the evolution of these tools into complete spatial analysis\nplatforms. The adoption of machine learning and big data techniques has\nequipped these platforms with the capacity to handle larger amounts of\nincreasingly complex data, transcending the limitations of more traditional\napproaches. This work traces the historical and technical evolution of data\nscience and GIS as fields of study, highlighting the critical points of\nconvergence between domains, and underlining the many sectors that rely on this\nintegration. A GIS application is presented as a case study in the disaster\nmanagement sector where we utilize aerial data from Tr\\'oia, Portugal, to\nemphasize the process of insight extraction from raw data. 
We conclude by\noutlining prospects for future research in integration of these fields in\ngeneral, and the developed application in particular.\n","authors":["Afonso Oliveira","Nuno Fachada","João P. Matos-Carvalho"],"pdf_url":"https://arxiv.org/pdf/2404.03754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03743v1","updated":"2024-04-04T18:31:24Z","published":"2024-04-04T18:31:24Z","title":"Test Time Training for Industrial Anomaly Segmentation","summary":" Anomaly Detection and Segmentation (AD&S) is crucial for industrial quality\ncontrol. While existing methods excel in generating anomaly scores for each\npixel, practical applications require producing a binary segmentation to\nidentify anomalies. Due to the absence of labeled anomalies in many real\nscenarios, standard practices binarize these maps based on some statistics\nderived from a validation set containing only nominal samples, resulting in\npoor segmentation performance. This paper addresses this problem by proposing a\ntest time training strategy to improve the segmentation performance. Indeed, at\ntest time, we can extract rich features directly from anomalous samples to\ntrain a classifier that can discriminate defects effectively. Our general\napproach can work downstream to any AD&S method that provides an anomaly score\nmap as output, even in multimodal settings. We demonstrate the effectiveness of\nour approach over baselines through extensive experimentation and evaluation on\nMVTec AD and MVTec 3D-AD.\n","authors":["Alex Costanzino","Pierluigi Zama Ramirez","Mirko Del Moro","Agostino Aiezzo","Giuseppe Lisanti","Samuele Salti","Luigi Di Stefano"],"pdf_url":"https://arxiv.org/pdf/2404.03743v1.pdf","comment":"Accepted at VAND 2.0, CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.01112v3","updated":"2024-04-04T18:31:05Z","published":"2024-04-01T13:38:16Z","title":"Few-shot point cloud reconstruction and denoising via learned Guassian\n splats renderings and fine-tuned diffusion features","summary":" Existing deep learning methods for the reconstruction and denoising of point\nclouds rely on small datasets of 3D shapes. We circumvent the problem by\nleveraging deep learning methods trained on billions of images. We propose a\nmethod to reconstruct point clouds from few images and to denoise point clouds\nfrom their rendering by exploiting prior knowledge distilled from image-based\ndeep learning models. To improve reconstruction in constraint settings, we\nregularize the training of a differentiable renderer with hybrid surface and\nappearance by introducing semantic consistency supervision. In addition, we\npropose a pipeline to finetune Stable Diffusion to denoise renderings of noisy\npoint clouds and we demonstrate how these learned filters can be used to remove\npoint cloud noise coming without 3D supervision. We compare our method with DSS\nand PointRadiance and achieved higher quality 3D reconstruction on the\nSketchfab Testset and SCUT Dataset.\n","authors":["Pietro Bonazzi"],"pdf_url":"https://arxiv.org/pdf/2404.01112v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01887v2","updated":"2024-04-04T18:29:00Z","published":"2024-04-02T12:26:17Z","title":"3D scene generation from scene graphs and self-attention","summary":" Synthesizing realistic and diverse indoor 3D scene layouts in a controllable\nfashion opens up applications in simulated navigation and virtual reality. 
As\nconcise and robust representations of a scene, scene graphs have proven to be\nwell-suited as the semantic control on the generated layout. We present a\nvariant of the conditional variational autoencoder (cVAE) model to synthesize\n3D scenes from scene graphs and floor plans. We exploit the properties of\nself-attention layers to capture high-level relationships between objects in a\nscene, and use these as the building blocks of our model. Our model, leverages\ngraph transformers to estimate the size, dimension and orientation of the\nobjects in a room while satisfying relationships in the given scene graph. Our\nexperiments shows self-attention layers leads to sparser (7.9x compared to\nGraphto3D) and more diverse scenes (16%).\n","authors":["Pietro Bonazzi"],"pdf_url":"https://arxiv.org/pdf/2404.01887v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03736v1","updated":"2024-04-04T18:05:18Z","published":"2024-04-04T18:05:18Z","title":"SC4D: Sparse-Controlled Video-to-4D Generation and Motion Transfer","summary":" Recent advances in 2D/3D generative models enable the generation of dynamic\n3D objects from a single-view video. Existing approaches utilize score\ndistillation sampling to form the dynamic scene as dynamic NeRF or dense 3D\nGaussians. However, these methods struggle to strike a balance among reference\nview alignment, spatio-temporal consistency, and motion fidelity under\nsingle-view conditions due to the implicit nature of NeRF or the intricate\ndense Gaussian motion prediction. To address these issues, this paper proposes\nan efficient, sparse-controlled video-to-4D framework named SC4D, that\ndecouples motion and appearance to achieve superior video-to-4D generation.\nMoreover, we introduce Adaptive Gaussian (AG) initialization and Gaussian\nAlignment (GA) loss to mitigate shape degeneration issue, ensuring the fidelity\nof the learned motion and shape. Comprehensive experimental results demonstrate\nthat our method surpasses existing methods in both quality and efficiency. In\naddition, facilitated by the disentangled modeling of motion and appearance of\nSC4D, we devise a novel application that seamlessly transfers the learned\nmotion onto a diverse array of 4D entities according to textual descriptions.\n","authors":["Zijie Wu","Chaohui Yu","Yanqin Jiang","Chenjie Cao","Fan Wang","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2404.03736v1.pdf","comment":"Project Page: https://sc4d.github.io/"},{"id":"http://arxiv.org/abs/2304.01834v4","updated":"2024-04-04T18:01:47Z","published":"2023-04-04T14:39:44Z","title":"Neural Field Convolutions by Repeated Differentiation","summary":" Neural fields are evolving towards a general-purpose continuous\nrepresentation for visual computing. Yet, despite their numerous appealing\nproperties, they are hardly amenable to signal processing. As a remedy, we\npresent a method to perform general continuous convolutions with general\ncontinuous signals such as neural fields. Observing that piecewise polynomial\nkernels reduce to a sparse set of Dirac deltas after repeated differentiation,\nwe leverage convolution identities and train a repeated integral field to\nefficiently execute large-scale convolutions. 
We demonstrate our approach on a\nvariety of data modalities and spatially-varying kernels.\n","authors":["Ntumba Elie Nsampi","Adarsh Djeacoumar","Hans-Peter Seidel","Tobias Ritschel","Thomas Leimkühler"],"pdf_url":"https://arxiv.org/pdf/2304.01834v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04125v1","updated":"2024-04-04T17:58:02Z","published":"2024-04-04T17:58:02Z","title":"No \"Zero-Shot\" Without Exponential Data: Pretraining Concept Frequency\n Determines Multimodal Model Performance","summary":" Web-crawled pretraining datasets underlie the impressive \"zero-shot\"\nevaluation performance of multimodal models, such as CLIP for\nclassification/retrieval and Stable-Diffusion for image generation. However, it\nis unclear how meaningful the notion of \"zero-shot\" generalization is for such\nmultimodal models, as it is not known to what extent their pretraining datasets\nencompass the downstream concepts targeted for during \"zero-shot\" evaluation.\nIn this work, we ask: How is the performance of multimodal models on downstream\nconcepts influenced by the frequency of these concepts in their pretraining\ndatasets? We comprehensively investigate this question across 34 models and\nfive standard pretraining datasets (CC-3M, CC-12M, YFCC-15M, LAION-400M,\nLAION-Aesthetics), generating over 300GB of data artifacts. We consistently\nfind that, far from exhibiting \"zero-shot\" generalization, multimodal models\nrequire exponentially more data to achieve linear improvements in downstream\n\"zero-shot\" performance, following a sample inefficient log-linear scaling\ntrend. This trend persists even when controlling for sample-level similarity\nbetween pretraining and downstream datasets, and testing on purely synthetic\ndata distributions. Furthermore, upon benchmarking models on long-tailed data\nsampled based on our analysis, we demonstrate that multimodal models across the\nboard perform poorly. We contribute this long-tail test set as the \"Let it\nWag!\" benchmark to further research in this direction. Taken together, our\nstudy reveals an exponential need for training data which implies that the key\nto \"zero-shot\" generalization capabilities under large-scale training paradigms\nremains to be found.\n","authors":["Vishaal Udandarao","Ameya Prabhu","Adhiraj Ghosh","Yash Sharma","Philip H. S. Torr","Adel Bibi","Samuel Albanie","Matthias Bethge"],"pdf_url":"https://arxiv.org/pdf/2404.04125v1.pdf","comment":"Extended version of the short paper accepted at DPFM, ICLR'24"},{"id":"http://arxiv.org/abs/2404.03713v1","updated":"2024-04-04T17:46:20Z","published":"2024-04-04T17:46:20Z","title":"Explaining Explainability: Understanding Concept Activation Vectors","summary":" Recent interpretability methods propose using concept-based explanations to\ntranslate the internal representations of deep learning models into a language\nthat humans are familiar with: concepts. This requires understanding which\nconcepts are present in the representation space of a neural network. One\npopular method for finding concepts is Concept Activation Vectors (CAVs), which\nare learnt using a probe dataset of concept exemplars. In this work, we\ninvestigate three properties of CAVs. CAVs may be: (1) inconsistent between\nlayers, (2) entangled with different concepts, and (3) spatially dependent.\nEach property provides both challenges and opportunities in interpreting\nmodels. 
We introduce tools designed to detect the presence of these properties,\nprovide insight into how they affect the derived explanations, and provide\nrecommendations to minimise their impact. Understanding these properties can be\nused to our advantage. For example, we introduce spatially dependent CAVs to\ntest if a model is translation invariant with respect to a specific concept and\nclass. Our experiments are performed on ImageNet and a new synthetic dataset,\nElements. Elements is designed to capture a known ground truth relationship\nbetween concepts and classes. We release this dataset to facilitate further\nresearch in understanding and evaluating interpretability methods.\n","authors":["Angus Nicolson","Lisa Schut","J. Alison Noble","Yarin Gal"],"pdf_url":"https://arxiv.org/pdf/2404.03713v1.pdf","comment":"(54 pages, 39 figures)"},{"id":"http://arxiv.org/abs/2404.04120v1","updated":"2024-04-04T10:12:55Z","published":"2024-04-04T10:12:55Z","title":"Cross-Modality Gait Recognition: Bridging LiDAR and Camera Modalities\n for Human Identification","summary":" Current gait recognition research mainly focuses on identifying pedestrians\ncaptured by the same type of sensor, neglecting the fact that individuals may\nbe captured by different sensors in order to adapt to various environments. A\nmore practical approach should involve cross-modality matching across different\nsensors. Hence, this paper focuses on investigating the problem of\ncross-modality gait recognition, with the objective of accurately identifying\npedestrians across diverse vision sensors. We present CrossGait inspired by the\nfeature alignment strategy, capable of cross retrieving diverse data\nmodalities. Specifically, we investigate the cross-modality recognition task by\ninitially extracting features within each modality and subsequently aligning\nthese features across modalities. To further enhance the cross-modality\nperformance, we propose a Prototypical Modality-shared Attention Module that\nlearns modality-shared features from two modality-specific features.\nAdditionally, we design a Cross-modality Feature Adapter that transforms the\nlearned modality-specific features into a unified feature space. Extensive\nexperiments conducted on the SUSTech1K dataset demonstrate the effectiveness of\nCrossGait: (1) it exhibits promising cross-modality ability in retrieving\npedestrians across various modalities from different sensors in diverse scenes,\nand (2) CrossGait not only learns modality-shared features for cross-modality\ngait recognition but also maintains modality-specific features for\nsingle-modality recognition.\n","authors":["Rui Wang","Chuanfu Shen","Manuel J. Marin-Jimenez","George Q. Huang","Shiqi Yu"],"pdf_url":"https://arxiv.org/pdf/2404.04120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03703v1","updated":"2024-04-04T07:49:39Z","published":"2024-04-04T07:49:39Z","title":"Mitigating analytical variability in fMRI results with style transfer","summary":" We propose a novel approach to improve the reproducibility of neuroimaging\nresults by converting statistic maps across different functional MRI pipelines.\nWe make the assumption that pipelines can be considered as a style component of\ndata and propose to use different generative models, among which, Diffusion\nModels (DM) to convert data between pipelines. 
We design a new DM-based\nunsupervised multi-domain image-to-image transition framework and constrain the\ngeneration of 3D fMRI statistic maps using the latent space of an auxiliary\nclassifier that distinguishes statistic maps from different pipelines. We\nextend traditional sampling techniques used in DM to improve the transition\nperformance. Our experiments demonstrate that our proposed methods are\nsuccessful: pipelines can indeed be transferred, providing an important source\nof data augmentation for future medical studies.\n","authors":["Elodie Germani","Elisa Fromont","Camille Maumet"],"pdf_url":"https://arxiv.org/pdf/2404.03703v1.pdf","comment":null}]},"2024-04-05T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.00086v2","updated":"2024-04-05T17:59:50Z","published":"2024-03-29T17:58:50Z","title":"DVIS-DAQ: Improving Video Segmentation via Dynamic Anchor Queries","summary":" Modern video segmentation methods adopt object queries to perform inter-frame\nassociation and demonstrate satisfactory performance in tracking continuously\nappearing objects despite large-scale motion and transient occlusion. However,\nthey all underperform on newly emerging and disappearing objects that are\ncommon in the real world because they attempt to model object emergence and\ndisappearance through feature transitions between background and foreground\nqueries that have significant feature gaps. We introduce Dynamic Anchor Queries\n(DAQ) to shorten the transition gap between the anchor and target queries by\ndynamically generating anchor queries based on the features of potential\ncandidates. Furthermore, we introduce a query-level object Emergence and\nDisappearance Simulation (EDS) strategy, which unleashes DAQ's potential\nwithout any additional cost. Finally, we combine our proposed DAQ and EDS with\nDVIS to obtain DVIS-DAQ. Extensive experiments demonstrate that DVIS-DAQ\nachieves a new state-of-the-art (SOTA) performance on five mainstream video\nsegmentation benchmarks. Code and models are available at\n\\url{https://github.com/SkyworkAI/DAQ-VS}.\n","authors":["Yikang Zhou","Tao Zhang","Shunping Ji","Shuicheng Yan","Xiangtai Li"],"pdf_url":"https://arxiv.org/pdf/2404.00086v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04256v1","updated":"2024-04-05T17:59:44Z","published":"2024-04-05T17:59:44Z","title":"Sigma: Siamese Mamba Network for Multi-Modal Semantic Segmentation","summary":" Multi-modal semantic segmentation significantly enhances AI agents'\nperception and scene understanding, especially under adverse conditions like\nlow-light or overexposed environments. Leveraging additional modalities\n(X-modality) like thermal and depth alongside traditional RGB provides\ncomplementary information, enabling more robust and reliable segmentation. In\nthis work, we introduce Sigma, a Siamese Mamba network for multi-modal semantic\nsegmentation, utilizing the Selective Structured State Space Model, Mamba.\nUnlike conventional methods that rely on CNNs, with their limited local\nreceptive fields, or Vision Transformers (ViTs), which offer global receptive\nfields at the cost of quadratic complexity, our model achieves global receptive\nfields coverage with linear complexity. By employing a Siamese encoder and\ninnovating a Mamba fusion mechanism, we effectively select essential\ninformation from different modalities. A decoder is then developed to enhance\nthe channel-wise modeling ability of the model. 
Our method, Sigma, is\nrigorously evaluated on both RGB-Thermal and RGB-Depth segmentation tasks,\ndemonstrating its superiority and marking the first successful application of\nState Space Models (SSMs) in multi-modal perception tasks. Code is available at\nhttps://github.com/zifuwan/Sigma.\n","authors":["Zifu Wan","Yuhao Wang","Silong Yong","Pingping Zhang","Simon Stepputtis","Katia Sycara","Yaqi Xie"],"pdf_url":"https://arxiv.org/pdf/2404.04256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04254v1","updated":"2024-04-05T17:58:52Z","published":"2024-04-05T17:58:52Z","title":"Watermark-based Detection and Attribution of AI-Generated Content","summary":" Several companies--such as Google, Microsoft, and OpenAI--have deployed\ntechniques to watermark AI-generated content to enable proactive detection.\nHowever, existing literature mainly focuses on user-agnostic detection.\nAttribution aims to further trace back the user of a generative-AI service who\ngenerated a given content detected as AI-generated. Despite its growing\nimportance, attribution is largely unexplored. In this work, we aim to bridge\nthis gap by providing the first systematic study on watermark-based, user-aware\ndetection and attribution of AI-generated content. Specifically, we\ntheoretically study the detection and attribution performance via rigorous\nprobabilistic analysis. Moreover, we develop an efficient algorithm to select\nwatermarks for the users to enhance attribution performance. Both our\ntheoretical and empirical results show that watermark-based detection and\nattribution inherit the accuracy and (non-)robustness properties of the\nwatermarking method.\n","authors":["Zhengyuan Jiang","Moyang Guo","Yuepeng Hu","Neil Zhenqiang Gong"],"pdf_url":"https://arxiv.org/pdf/2404.04254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04251v1","updated":"2024-04-05T17:57:16Z","published":"2024-04-05T17:57:16Z","title":"Who Evaluates the Evaluations? Objectively Scoring Text-to-Image Prompt\n Coherence Metrics with T2IScoreScore (TS2)","summary":" With advances in the quality of text-to-image (T2I) models has come interest\nin benchmarking their prompt faithfulness-the semantic coherence of generated\nimages to the prompts they were conditioned on. A variety of T2I faithfulness\nmetrics have been proposed, leveraging advances in cross-modal embeddings and\nvision-language models (VLMs). However, these metrics are not rigorously\ncompared and benchmarked, instead presented against few weak baselines by\ncorrelation to human Likert scores over a set of easy-to-discriminate images.\n We introduce T2IScoreScore (TS2), a curated set of semantic error graphs\ncontaining a prompt and a set increasingly erroneous images. These allow us to\nrigorously judge whether a given prompt faithfulness metric can correctly order\nimages with respect to their objective error count and significantly\ndiscriminate between different error nodes, using meta-metric scores derived\nfrom established statistical tests. Surprisingly, we find that the\nstate-of-the-art VLM-based metrics (e.g., TIFA, DSG, LLMScore, VIEScore) we\ntested fail to significantly outperform simple feature-based metrics like\nCLIPScore, particularly on a hard subset of naturally-occurring T2I model\nerrors. 
TS2 will enable the development of better T2I prompt faithfulness\nmetrics through more rigorous comparison of their conformity to expected\norderings and separations under objective criteria.\n","authors":["Michael Saxon","Fatima Jahara","Mahsa Khoshnoodi","Yujie Lu","Aditya Sharma","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04251v1.pdf","comment":"15 pages main, 9 pages appendices, 16 figures, 3 tables"},{"id":"http://arxiv.org/abs/2312.08240v2","updated":"2024-04-05T17:56:12Z","published":"2023-12-13T16:01:50Z","title":"CenterGrasp: Object-Aware Implicit Representation Learning for\n Simultaneous Shape Reconstruction and 6-DoF Grasp Estimation","summary":" Reliable object grasping is a crucial capability for autonomous robots.\nHowever, many existing grasping approaches focus on general clutter removal\nwithout explicitly modeling objects and thus only relying on the visible local\ngeometry. We introduce CenterGrasp, a novel framework that combines object\nawareness and holistic grasping. CenterGrasp learns a general object prior by\nencoding shapes and valid grasps in a continuous latent space. It consists of\nan RGB-D image encoder that leverages recent advances to detect objects and\ninfer their pose and latent code, and a decoder to predict shape and grasps for\neach object in the scene. We perform extensive experiments on simulated as well\nas real-world cluttered scenes and demonstrate strong scene reconstruction and\n6-DoF grasp-pose estimation performance. Compared to the state of the art,\nCenterGrasp achieves an improvement of 38.5 mm in shape reconstruction and 33\npercentage points on average in grasp success. We make the code and trained\nmodels publicly available at http://centergrasp.cs.uni-freiburg.de.\n","authors":["Eugenio Chisari","Nick Heppert","Tim Welschehold","Wolfram Burgard","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2312.08240v2.pdf","comment":"Accepted at RA-L. Video, code and models available at\n http://centergrasp.cs.uni-freiburg.de"},{"id":"http://arxiv.org/abs/2102.05984v2","updated":"2024-04-05T17:55:28Z","published":"2021-02-11T13:04:49Z","title":"Modeling 3D Surface Manifolds with a Locally Conditioned Atlas","summary":" Recently proposed 3D object reconstruction methods represent a mesh with an\natlas - a set of planar patches approximating the surface. However, their\napplication in a real-world scenario is limited since the surfaces of\nreconstructed objects contain discontinuities, which degrades the quality of\nthe final mesh. This is mainly caused by independent processing of individual\npatches, and in this work, we postulate to mitigate this limitation by\npreserving local consistency around patch vertices. To that end, we introduce a\nLocally Conditioned Atlas (LoCondA), a framework for representing a 3D object\nhierarchically in a generative model. Firstly, the model maps a point cloud of\nan object into a sphere. Secondly, by leveraging a spherical prior, we enforce\nthe mapping to be locally consistent on the sphere and on the target object.\nThis way, we can sample a mesh quad on that sphere and project it back onto the\nobject's manifold. With LoCondA, we can produce topologically diverse objects\nwhile maintaining quads to be stitched together. 
We show that the proposed\napproach provides structurally coherent reconstructions while producing meshes\nof quality comparable to the competitors.\n","authors":["Przemysław Spurek","Sebastian Winczowski","Maciej Zięba","Tomasz Trzciński","Kacper Kania","Marcin Mazur"],"pdf_url":"https://arxiv.org/pdf/2102.05984v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04245v1","updated":"2024-04-05T17:51:58Z","published":"2024-04-05T17:51:58Z","title":"Evaluating Adversarial Robustness: A Comparison Of FGSM, Carlini-Wagner\n Attacks, And The Role of Distillation as Defense Mechanism","summary":" This technical report delves into an in-depth exploration of adversarial\nattacks specifically targeted at Deep Neural Networks (DNNs) utilized for image\nclassification. The study also investigates defense mechanisms aimed at\nbolstering the robustness of machine learning models. The research focuses on\ncomprehending the ramifications of two prominent attack methodologies: the Fast\nGradient Sign Method (FGSM) and the Carlini-Wagner (CW) approach. These attacks\nare examined concerning three pre-trained image classifiers: Resnext50_32x4d,\nDenseNet-201, and VGG-19, utilizing the Tiny-ImageNet dataset. Furthermore, the\nstudy proposes the robustness of defensive distillation as a defense mechanism\nto counter FGSM and CW attacks. This defense mechanism is evaluated using the\nCIFAR-10 dataset, where CNN models, specifically resnet101 and Resnext50_32x4d,\nserve as the teacher and student models, respectively. The proposed defensive\ndistillation model exhibits effectiveness in thwarting attacks such as FGSM.\nHowever, it is noted to remain susceptible to more sophisticated techniques\nlike the CW attack. The document presents a meticulous validation of the\nproposed scheme. It provides detailed and comprehensive results, elucidating\nthe efficacy and limitations of the defense mechanisms employed. Through\nrigorous experimentation and analysis, the study offers insights into the\ndynamics of adversarial attacks on DNNs, as well as the effectiveness of\ndefensive strategies in mitigating their impact.\n","authors":["Trilokesh Ranjan Sarkar","Nilanjan Das","Pralay Sankar Maitra","Bijoy Some","Ritwik Saha","Orijita Adhikary","Bishal Bose","Jaydip Sen"],"pdf_url":"https://arxiv.org/pdf/2404.04245v1.pdf","comment":"This report pertains to the Capstone Project done by Group 1 of the\n Fall batch of 2023 students at Praxis Tech School, Kolkata, India. The\n reports consists of 35 pages and it includes 15 figures and 10 tables. This\n is the preprint which will be submitted to to an IEEE international\n conference for review"},{"id":"http://arxiv.org/abs/2404.04244v1","updated":"2024-04-05T17:46:38Z","published":"2024-04-05T17:46:38Z","title":"DiffOp-net: A Differential Operator-based Fully Convolutional Network\n for Unsupervised Deformable Image Registration","summary":" Existing unsupervised deformable image registration methods usually rely on\nmetrics applied to the gradients of predicted displacement or velocity fields\nas a regularization term to ensure transformation smoothness, which potentially\nlimits registration accuracy. In this study, we propose a novel approach to\nenhance unsupervised deformable image registration by introducing a new\ndifferential operator into the registration framework. This operator, acting on\nthe velocity field and mapping it to a dual space, ensures the smoothness of\nthe velocity field during optimization, facilitating accurate deformable\nregistration. 
In addition, to tackle the challenge of capturing large\ndeformations inside image pairs, we introduce a Cross-Coordinate Attention\nmodule (CCA) and embed it into a proposed Fully Convolutional Networks\n(FCNs)-based multi-resolution registration architecture. Evaluation experiments\nare conducted on two magnetic resonance imaging (MRI) datasets. Compared to\nvarious state-of-the-art registration approaches, including a traditional\nalgorithm and three representative unsupervised learning-based methods, our\nmethod achieves superior accuracies, maintaining desirable diffeomorphic\nproperties, and exhibiting promising registration speed.\n","authors":["Jiong Wu"],"pdf_url":"https://arxiv.org/pdf/2404.04244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04243v1","updated":"2024-04-05T17:45:22Z","published":"2024-04-05T17:45:22Z","title":"Identity Decoupling for Multi-Subject Personalization of Text-to-Image\n Models","summary":" Text-to-image diffusion models have shown remarkable success in generating a\npersonalized subject based on a few reference images. However, current methods\nstruggle with handling multiple subjects simultaneously, often resulting in\nmixed identities with combined attributes from different subjects. In this\nwork, we present MuDI, a novel framework that enables multi-subject\npersonalization by effectively decoupling identities from multiple subjects.\nOur main idea is to utilize segmented subjects generated by the Segment\nAnything Model for both training and inference, as a form of data augmentation\nfor training and initialization for the generation process. Our experiments\ndemonstrate that MuDI can produce high-quality personalized images without\nidentity mixing, even for highly similar subjects as shown in Figure 1. In\nhuman evaluation, MuDI shows twice as many successes for personalizing multiple\nsubjects without identity mixing over existing baselines and is preferred over\n70% compared to the strongest baseline. More results are available at\nhttps://mudi-t2i.github.io/.\n","authors":["Sangwon Jang","Jaehyeong Jo","Kimin Lee","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2404.04243v1.pdf","comment":"Preprint. Project page: https://mudi-t2i.github.io/"},{"id":"http://arxiv.org/abs/2404.04242v1","updated":"2024-04-05T17:45:07Z","published":"2024-04-05T17:45:07Z","title":"Physical Property Understanding from Language-Embedded Feature Fields","summary":" Can computers perceive the physical properties of objects solely through\nvision? Research in cognitive science and vision science has shown that humans\nexcel at identifying materials and estimating their physical properties based\npurely on visual appearance. In this paper, we present a novel approach for\ndense prediction of the physical properties of objects using a collection of\nimages. Inspired by how humans reason about physics through vision, we leverage\nlarge language models to propose candidate materials for each object. We then\nconstruct a language-embedded point cloud and estimate the physical properties\nof each 3D point using a zero-shot kernel regression approach. Our method is\naccurate, annotation-free, and applicable to any object in the open world.\nExperiments demonstrate the effectiveness of the proposed approach in various\nphysical property reasoning tasks, such as estimating the mass of common\nobjects, as well as other properties like friction and hardness.\n","authors":["Albert J. Zhai","Yuan Shen","Emily Y. Chen","Gloria X. 
Wang","Xinlei Wang","Sheng Wang","Kaiyu Guan","Shenlong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04242v1.pdf","comment":"CVPR 2024. Project page (with code):\n https://ajzhai.github.io/NeRF2Physics/"},{"id":"http://arxiv.org/abs/2311.08577v3","updated":"2024-04-05T17:37:36Z","published":"2023-11-14T22:46:01Z","title":"Finding AI-Generated Faces in the Wild","summary":" AI-based image generation has continued to rapidly improve, producing\nincreasingly more realistic images with fewer obvious visual flaws.\nAI-generated images are being used to create fake online profiles which in turn\nare being used for spam, fraud, and disinformation campaigns. As the general\nproblem of detecting any type of manipulated or synthesized content is\nreceiving increasing attention, here we focus on a more narrow task of\ndistinguishing a real face from an AI-generated face. This is particularly\napplicable when tackling inauthentic online accounts with a fake user profile\nphoto. We show that by focusing on only faces, a more resilient and\ngeneral-purpose artifact can be detected that allows for the detection of\nAI-generated faces from a variety of GAN- and diffusion-based synthesis\nengines, and across image resolutions (as low as 128 x 128 pixels) and\nqualities.\n","authors":["Gonzalo J. Aniano Porcile","Jack Gindi","Shivansh Mundra","James R. Verbus","Hany Farid"],"pdf_url":"https://arxiv.org/pdf/2311.08577v3.pdf","comment":"to be published as: G.J.A. Porcile, J. Gindi, S. Mundra, J.R. Verbus,\n and H. Farid, Finding AI-Generated Faces in the Wild, Workshop on Media\n Forensics at CVPR, 2024"},{"id":"http://arxiv.org/abs/2404.03635v2","updated":"2024-04-05T17:27:34Z","published":"2024-04-04T17:54:33Z","title":"WorDepth: Variational Language Prior for Monocular Depth Estimation","summary":" Three-dimensional (3D) reconstruction from a single image is an ill-posed\nproblem with inherent ambiguities, i.e. scale. Predicting a 3D scene from text\ndescription(s) is similarly ill-posed, i.e. spatial arrangements of objects\ndescribed. We investigate the question of whether two inherently ambiguous\nmodalities can be used in conjunction to produce metric-scaled reconstructions.\nTo test this, we focus on monocular depth estimation, the problem of predicting\na dense depth map from a single image, but with an additional text caption\ndescribing the scene. To this end, we begin by encoding the text caption as a\nmean and standard deviation; using a variational framework, we learn the\ndistribution of the plausible metric reconstructions of 3D scenes corresponding\nto the text captions as a prior. To \"select\" a specific reconstruction or depth\nmap, we encode the given image through a conditional sampler that samples from\nthe latent space of the variational text encoder, which is then decoded to the\noutput depth map. Our approach is trained alternatingly between the text and\nimage branches: in one optimization step, we predict the mean and standard\ndeviation from the text description and sample from a standard Gaussian, and in\nthe other, we sample using a (image) conditional sampler. Once trained, we\ndirectly predict depth from the encoded text using the conditional sampler. 
We\ndemonstrate our approach on indoor (NYUv2) and outdoor (KITTI) scenarios, where\nwe show that language can consistently improve performance in both.\n","authors":["Ziyao Zeng","Daniel Wang","Fengyu Yang","Hyoungseob Park","Yangchao Wu","Stefano Soatto","Byung-Woo Hong","Dong Lao","Alex Wong"],"pdf_url":"https://arxiv.org/pdf/2404.03635v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04231v1","updated":"2024-04-05T17:25:17Z","published":"2024-04-05T17:25:17Z","title":"Image-Text Co-Decomposition for Text-Supervised Semantic Segmentation","summary":" This paper addresses text-supervised semantic segmentation, aiming to learn a\nmodel capable of segmenting arbitrary visual concepts within images by using\nonly image-text pairs without dense annotations. Existing methods have\ndemonstrated that contrastive learning on image-text pairs effectively aligns\nvisual segments with the meanings of texts. We notice that there is a\ndiscrepancy between text alignment and semantic segmentation: A text often\nconsists of multiple semantic concepts, whereas semantic segmentation strives\nto create semantically homogeneous segments. To address this issue, we propose\na novel framework, Image-Text Co-Decomposition (CoDe), where the paired image\nand text are jointly decomposed into a set of image regions and a set of word\nsegments, respectively, and contrastive learning is developed to enforce\nregion-word alignment. To work with a vision-language model, we present a\nprompt learning mechanism that derives an extra representation to highlight an\nimage segment or a word segment of interest, with which more effective features\ncan be extracted from that segment. Comprehensive experimental results\ndemonstrate that our method performs favorably against existing text-supervised\nsemantic segmentation methods on six benchmark datasets.\n","authors":["Ji-Jia Wu","Andy Chia-Hao Chang","Chieh-Yu Chuang","Chun-Pei Chen","Yu-Lun Liu","Min-Hung Chen","Hou-Ning Hu","Yung-Yu Chuang","Yen-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2404.04231v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02257v2","updated":"2024-04-05T17:02:31Z","published":"2024-04-02T19:25:04Z","title":"SnAG: Scalable and Accurate Video Grounding","summary":" Temporal grounding of text descriptions in videos is a central problem in\nvision-language learning and video understanding. Existing methods often\nprioritize accuracy over scalability -- they have been optimized for grounding\nonly a few text queries within short videos, and fail to scale up to long\nvideos with hundreds of queries. In this paper, we study the effect of\ncross-modal fusion on the scalability of video grounding models. Our analysis\nestablishes late fusion as a more cost-effective fusion scheme for long-form\nvideos with many text queries. Moreover, it leads us to a novel, video-centric\nsampling scheme for efficient training. Based on these findings, we present\nSnAG, a simple baseline for scalable and accurate video grounding. Without\nbells and whistles, SnAG is 43% more accurate and 1.5x faster than CONE, a\nstate of the art for long-form video grounding on the challenging MAD dataset,\nwhile achieving highly competitive results on short videos.\n","authors":["Fangzhou Mu","Sicheng Mo","Yin Li"],"pdf_url":"https://arxiv.org/pdf/2404.02257v2.pdf","comment":"Accepted to CVPR 2024. 
Code available at\n https://github.com/fmu2/snag_release"},{"id":"http://arxiv.org/abs/2402.15584v2","updated":"2024-04-05T17:01:34Z","published":"2024-02-23T19:51:55Z","title":"State Space Models for Event Cameras","summary":" Today, state-of-the-art deep neural networks that process event-camera data\nfirst convert a temporal window of events into dense, grid-like input\nrepresentations. As such, they exhibit poor generalizability when deployed at\nhigher inference frequencies (i.e., smaller temporal windows) than the ones\nthey were trained on. We address this challenge by introducing state-space\nmodels (SSMs) with learnable timescale parameters to event-based vision. This\ndesign adapts to varying frequencies without the need to retrain the network at\ndifferent frequencies. Additionally, we investigate two strategies to\ncounteract aliasing effects when deploying the model at higher frequencies. We\ncomprehensively evaluate our approach against existing methods based on RNN and\nTransformer architectures across various benchmarks, including Gen1 and 1 Mpx\nevent camera datasets. Our results demonstrate that SSM-based models train 33%\nfaster and also exhibit minimal performance degradation when tested at higher\nfrequencies than the training input. Traditional RNN and Transformer models\nexhibit performance drops of more than 20 mAP, with SSMs having a drop of 3.31\nmAP, highlighting the effectiveness of SSMs in event-based vision tasks.\n","authors":["Nikola Zubić","Mathias Gehrig","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2402.15584v2.pdf","comment":"18 pages, 5 figures, 6 tables, CVPR 2024 Camera Ready paper"},{"id":"http://arxiv.org/abs/2301.07002v3","updated":"2024-04-05T16:50:13Z","published":"2023-01-17T16:44:48Z","title":"Opti-CAM: Optimizing saliency maps for interpretability","summary":" Methods based on class activation maps (CAM) provide a simple mechanism to\ninterpret predictions of convolutional neural networks by using linear\ncombinations of feature maps as saliency maps. By contrast, masking-based\nmethods optimize a saliency map directly in the image space or learn it by\ntraining another network on additional data.\n In this work we introduce Opti-CAM, combining ideas from CAM-based and\nmasking-based approaches. Our saliency map is a linear combination of feature\nmaps, where weights are optimized per image such that the logit of the masked\nimage for a given class is maximized. We also fix a fundamental flaw in two of\nthe most common evaluation metrics of attribution methods. On several datasets,\nOpti-CAM largely outperforms other CAM-based approaches according to the most\nrelevant classification metrics. We provide empirical evidence supporting that\nlocalization and classifier interpretability are not necessarily aligned.\n","authors":["Hanwei Zhang","Felipe Torres","Ronan Sicre","Yannis Avrithis","Stephane Ayache"],"pdf_url":"https://arxiv.org/pdf/2301.07002v3.pdf","comment":"This work is under consideration at \"Computer Vision and Image\n Understanding\""},{"id":"http://arxiv.org/abs/2404.04211v1","updated":"2024-04-05T16:42:16Z","published":"2024-04-05T16:42:16Z","title":"Robust Gaussian Splatting","summary":" In this paper, we address common error sources for 3D Gaussian Splatting\n(3DGS) including blur, imperfect camera poses, and color inconsistencies, with\nthe goal of improving its robustness for practical applications like\nreconstructions from handheld phone captures. 
Our main contribution involves\nmodeling motion blur as a Gaussian distribution over camera poses, allowing us\nto address both camera pose refinement and motion blur correction in a unified\nway. Additionally, we propose mechanisms for defocus blur compensation and for\naddressing color in-consistencies caused by ambient light, shadows, or due to\ncamera-related factors like varying white balancing settings. Our proposed\nsolutions integrate in a seamless way with the 3DGS formulation while\nmaintaining its benefits in terms of training efficiency and rendering speed.\nWe experimentally validate our contributions on relevant benchmark datasets\nincluding Scannet++ and Deblur-NeRF, obtaining state-of-the-art results and\nthus consistent improvements over relevant baselines.\n","authors":["François Darmon","Lorenzo Porzi","Samuel Rota-Bulò","Peter Kontschieder"],"pdf_url":"https://arxiv.org/pdf/2404.04211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04202v1","updated":"2024-04-05T16:25:39Z","published":"2024-04-05T16:25:39Z","title":"Deep-learning Segmentation of Small Volumes in CT images for\n Radiotherapy Treatment Planning","summary":" Our understanding of organs at risk is progressing to include physical small\ntissues such as coronary arteries and the radiosensitivities of many small\norgans and tissues are high. Therefore, the accurate segmentation of small\nvolumes in external radiotherapy is crucial to protect them from\nover-irradiation. Moreover, with the development of the particle therapy and\non-board imaging, the treatment becomes more accurate and precise. The purpose\nof this work is to optimize organ segmentation algorithms for small organs. We\nused 50 three-dimensional (3-D) computed tomography (CT) head and neck images\nfrom StructSeg2019 challenge to develop a general-purpose V-Net model to\nsegment 20 organs in the head and neck region. We applied specific strategies\nto improve the segmentation accuracy of the small volumes in this anatomical\nregion, i.e., the lens of the eye. Then, we used 17 additional head images from\nOSF healthcare to validate the robustness of the V Net model optimized for\nsmall-volume segmentation. With the study of the StructSeg2019 images, we found\nthat the optimization of the image normalization range and classification\nthreshold yielded a segmentation improvement of the lens of the eye of\napproximately 50%, compared to the use of the V-Net not optimized for small\nvolumes. We used the optimized model to segment 17 images acquired using\nheterogeneous protocols. We obtained comparable Dice coefficient values for the\nclinical and StructSeg2019 images (0.61 plus/minus 0.07 and 0.58 plus/minus\n0.10 for the left and right lens of the eye, respectively)\n","authors":["Jianxin Zhou","Kadishe Fejza","Massimiliano Salvatori","Daniele Della Latta","Gregory M. Hermann","Angela Di Fulvio"],"pdf_url":"https://arxiv.org/pdf/2404.04202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01482v4","updated":"2024-04-05T16:11:19Z","published":"2024-03-03T11:24:16Z","title":"EAGLE: Eigen Aggregation Learning for Object-Centric Unsupervised\n Semantic Segmentation","summary":" Semantic segmentation has innately relied on extensive pixel-level annotated\ndata, leading to the emergence of unsupervised methodologies. Among them,\nleveraging self-supervised Vision Transformers for unsupervised semantic\nsegmentation (USS) has been making steady progress with expressive deep\nfeatures. 
Yet, for semantically segmenting images with complex objects, a\npredominant challenge remains: the lack of explicit object-level semantic\nencoding in patch-level features. This technical limitation often leads to\ninadequate segmentation of complex objects with diverse structures. To address\nthis gap, we present a novel approach, EAGLE, which emphasizes object-centric\nrepresentation learning for unsupervised semantic segmentation. Specifically,\nwe introduce EiCue, a spectral technique providing semantic and structural cues\nthrough an eigenbasis derived from the semantic similarity matrix of deep image\nfeatures and color affinity from an image. Further, by incorporating our\nobject-centric contrastive loss with EiCue, we guide our model to learn\nobject-level representations with intra- and inter-image object-feature\nconsistency, thereby enhancing semantic accuracy. Extensive experiments on\nCOCO-Stuff, Cityscapes, and Potsdam-3 datasets demonstrate the state-of-the-art\nUSS results of EAGLE with accurate and consistent semantic segmentation across\ncomplex scenes.\n","authors":["Chanyoung Kim","Woojung Han","Dayun Ju","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.01482v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00185v2","updated":"2024-04-05T16:10:44Z","published":"2024-03-29T22:51:45Z","title":"On Inherent Adversarial Robustness of Active Vision Systems","summary":" Current Deep Neural Networks are vulnerable to adversarial examples, which\nalter their predictions by adding carefully crafted noise. Since human eyes are\nrobust to such inputs, it is possible that the vulnerability stems from the\nstandard way of processing inputs in one shot by processing every pixel with\nthe same importance. In contrast, neuroscience suggests that the human vision\nsystem can differentiate salient features by (1) switching between multiple\nfixation points (saccades) and (2) processing the surrounding with a\nnon-uniform external resolution (foveation). In this work, we advocate that the\nintegration of such active vision mechanisms into current deep learning systems\ncan offer robustness benefits. Specifically, we empirically demonstrate the\ninherent robustness of two active vision methods - GFNet and FALcon - under a\nblack box threat model. By learning and inferencing based on downsampled\nglimpses obtained from multiple distinct fixation points within an input, we\nshow that these active methods achieve (2-3) times greater robustness compared\nto a standard passive convolutional network under state-of-the-art adversarial\nattacks. More importantly, we provide illustrative and interpretable\nvisualization analysis that demonstrates how performing inference from distinct\nfixation points makes active vision methods less vulnerable to malicious\ninputs.\n","authors":["Amitangshu Mukherjee","Timur Ibrayev","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2404.00185v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10427v2","updated":"2024-04-05T16:04:40Z","published":"2024-03-15T16:00:04Z","title":"SWAG: Splatting in the Wild images with Appearance-conditioned Gaussians","summary":" Implicit neural representation methods have shown impressive advancements in\nlearning 3D scenes from unstructured in-the-wild photo collections but are\nstill limited by the large computational cost of volumetric rendering. 
More\nrecently, 3D Gaussian Splatting emerged as a much faster alternative with\nsuperior rendering quality and training efficiency, especially for small-scale\nand object-centric scenarios. Nevertheless, this technique suffers from poor\nperformance on unstructured in-the-wild data. To tackle this, we extend over 3D\nGaussian Splatting to handle unstructured image collections. We achieve this by\nmodeling appearance to seize photometric variations in the rendered images.\nAdditionally, we introduce a new mechanism to train transient Gaussians to\nhandle the presence of scene occluders in an unsupervised manner. Experiments\non diverse photo collection scenes and multi-pass acquisition of outdoor\nlandmarks show the effectiveness of our method over prior works achieving\nstate-of-the-art results with improved efficiency.\n","authors":["Hiba Dahmani","Moussab Bennehar","Nathan Piasco","Luis Roldao","Dzmitry Tsishkou"],"pdf_url":"https://arxiv.org/pdf/2403.10427v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04179v1","updated":"2024-04-05T15:48:36Z","published":"2024-04-05T15:48:36Z","title":"SCAResNet: A ResNet Variant Optimized for Tiny Object Detection in\n Transmission and Distribution Towers","summary":" Traditional deep learning-based object detection networks often resize images\nduring the data preprocessing stage to achieve a uniform size and scale in the\nfeature map. Resizing is done to facilitate model propagation and fully\nconnected classification. However, resizing inevitably leads to object\ndeformation and loss of valuable information in the images. This drawback\nbecomes particularly pronounced for tiny objects like distribution towers with\nlinear shapes and few pixels. To address this issue, we propose abandoning the\nresizing operation. Instead, we introduce Positional-Encoding Multi-head\nCriss-Cross Attention. This allows the model to capture contextual information\nand learn from multiple representation subspaces, effectively enriching the\nsemantics of distribution towers. Additionally, we enhance Spatial Pyramid\nPooling by reshaping three pooled feature maps into a new unified one while\nalso reducing the computational burden. This approach allows images of\ndifferent sizes and scales to generate feature maps with uniform dimensions and\ncan be employed in feature map propagation. Our SCAResNet incorporates these\naforementioned improvements into the backbone network ResNet. We evaluated our\nSCAResNet using the Electric Transmission and Distribution Infrastructure\nImagery dataset from Duke University. Without any additional tricks, we\nemployed various object detection models with Gaussian Receptive Field based\nLabel Assignment as the baseline. When incorporating the SCAResNet into the\nbaseline model, we achieved a 2.1% improvement in mAPs. This demonstrates the\nadvantages of our SCAResNet in detecting transmission and distribution towers\nand its value in tiny object detection. The source code is available at\nhttps://github.com/LisavilaLee/SCAResNet_mmdet.\n","authors":["Weile Li","Muqing Shi","Zhonghua Hong"],"pdf_url":"https://arxiv.org/pdf/2404.04179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09915v2","updated":"2024-04-05T15:45:48Z","published":"2023-07-19T11:35:21Z","title":"Embedded Heterogeneous Attention Transformer for Cross-lingual Image\n Captioning","summary":" Cross-lingual image captioning is a challenging task that requires addressing\nboth cross-lingual and cross-modal obstacles in multimedia analysis. 
The\ncrucial issue in this task is to model the global and the local matching\nbetween the image and different languages. Existing cross-modal embedding\nmethods based on the transformer architecture oversee the local matching\nbetween the image region and monolingual words, especially when dealing with\ndiverse languages. To overcome these limitations, we propose an Embedded\nHeterogeneous Attention Transformer (EHAT) to establish cross-domain\nrelationships and local correspondences between images and different languages\nby using a heterogeneous network. EHAT comprises Masked Heterogeneous\nCross-attention (MHCA), Heterogeneous Attention Reasoning Network (HARN), and\nHeterogeneous Co-attention (HCA). The HARN serves as the core network and it\ncaptures cross-domain relationships by leveraging visual bounding box\nrepresentation features to connect word features from two languages and to\nlearn heterogeneous maps. MHCA and HCA facilitate cross-domain integration in\nthe encoder through specialized heterogeneous attention mechanisms, enabling a\nsingle model to generate captions in two languages. We evaluate our approach on\nthe MSCOCO dataset to generate captions in English and Chinese, two languages\nthat exhibit significant differences in their language families. The\nexperimental results demonstrate the superior performance of our method\ncompared to existing advanced monolingual methods. Our proposed EHAT framework\neffectively addresses the challenges of cross-lingual image captioning, paving\nthe way for improved multilingual image analysis and understanding.\n","authors":["Zijie Song","Zhenzhen Hu","Yuanen Zhou","Ye Zhao","Richang Hong","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2307.09915v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07087v2","updated":"2024-04-05T15:42:13Z","published":"2024-02-11T02:34:42Z","title":"Self-Correcting Self-Consuming Loops for Generative Model Training","summary":" As synthetic data becomes higher quality and proliferates on the internet,\nmachine learning models are increasingly trained on a mix of human- and\nmachine-generated data. Despite the successful stories of using synthetic data\nfor representation learning, using synthetic data for generative model training\ncreates \"self-consuming loops\" which may lead to training instability or even\ncollapse, unless certain conditions are met. Our paper aims to stabilize\nself-consuming generative model training. Our theoretical results demonstrate\nthat by introducing an idealized correction function, which maps a data point\nto be more likely under the true data distribution, self-consuming loops can be\nmade exponentially more stable. We then propose self-correction functions,\nwhich rely on expert knowledge (e.g. the laws of physics programmed in a\nsimulator), and aim to approximate the idealized corrector automatically and at\nscale. We empirically validate the effectiveness of self-correcting\nself-consuming loops on the challenging human motion synthesis task, and\nobserve that it successfully avoids model collapse, even when the ratio of\nsynthetic data to real data is as high as 100%.\n","authors":["Nate Gillman","Michael Freeman","Daksh Aggarwal","Chia-Hong Hsu","Calvin Luo","Yonglong Tian","Chen Sun"],"pdf_url":"https://arxiv.org/pdf/2402.07087v2.pdf","comment":"This new version contains updated mathematical results (c.f. 
Remark\n 4.4), as well as experiments for an additional generative modeling task.\n Paper under submission; code is available at\n https://nategillman.com/sc-sc.html"},{"id":"http://arxiv.org/abs/2311.08046v3","updated":"2024-04-05T15:21:09Z","published":"2023-11-14T10:11:36Z","title":"Chat-UniVi: Unified Visual Representation Empowers Large Language Models\n with Image and Video Understanding","summary":" Large language models have demonstrated impressive universal capabilities\nacross a wide range of open-ended tasks and have extended their utility to\nencompass multimodal conversations. However, existing methods encounter\nchallenges in effectively handling both image and video understanding,\nparticularly with limited visual tokens. In this work, we introduce Chat-UniVi,\na Unified Vision-language model capable of comprehending and engaging in\nconversations involving images and videos through a unified visual\nrepresentation. Specifically, we employ a set of dynamic visual tokens to\nuniformly represent images and videos. This representation framework empowers\nthe model to efficiently utilize a limited number of visual tokens to\nsimultaneously capture the spatial details necessary for images and the\ncomprehensive temporal relationship required for videos. Moreover, we leverage\na multi-scale representation, enabling the model to perceive both high-level\nsemantic concepts and low-level visual details. Notably, Chat-UniVi is trained\non a mixed dataset containing both images and videos, allowing direct\napplication to tasks involving both mediums without requiring any\nmodifications. Extensive experimental results demonstrate that Chat-UniVi\nconsistently outperforms even existing methods exclusively designed for either\nimages or videos. Code is available at\nhttps://github.com/PKU-YuanGroup/Chat-UniVi.\n","authors":["Peng Jin","Ryuichi Takanobu","Wancai Zhang","Xiaochun Cao","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2311.08046v3.pdf","comment":"Accepted by CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2404.04159v1","updated":"2024-04-05T15:11:09Z","published":"2024-04-05T15:11:09Z","title":"Noisy Label Processing for Classification: A Survey","summary":" In recent years, deep neural networks (DNNs) have gained remarkable\nachievement in computer vision tasks, and the success of DNNs often depends\ngreatly on the richness of data. However, the acquisition process of data and\nhigh-quality ground truth requires a lot of manpower and money. In the long,\ntedious process of data annotation, annotators are prone to make mistakes,\nresulting in incorrect labels of images, i.e., noisy labels. The emergence of\nnoisy labels is inevitable. Moreover, since research shows that DNNs can easily\nfit noisy labels, the existence of noisy labels will cause significant damage\nto the model training process. Therefore, it is crucial to combat noisy labels\nfor computer vision tasks, especially for classification tasks. In this survey,\nwe first comprehensively review the evolution of different deep learning\napproaches for noisy label combating in the image classification task. In\naddition, we also review different noise patterns that have been proposed to\ndesign robust algorithms. Furthermore, we explore the inner pattern of\nreal-world label noise and propose an algorithm to generate a synthetic label\nnoise pattern guided by real-world data. 
We test the algorithm on the\nwell-known real-world dataset CIFAR-10N to form a new real-world data-guided\nsynthetic benchmark and evaluate some typical noise-robust methods on the\nbenchmark.\n","authors":["Mengting Li","Chuang Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.04159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04155v1","updated":"2024-04-05T15:04:57Z","published":"2024-04-05T15:04:57Z","title":"MarsSeg: Mars Surface Semantic Segmentation with Multi-level Extractor\n and Connector","summary":" The segmentation and interpretation of the Martian surface play a pivotal\nrole in Mars exploration, providing essential data for the trajectory planning\nand obstacle avoidance of rovers. However, the complex topography, similar\nsurface features, and the lack of extensive annotated data pose significant\nchallenges to the high-precision semantic segmentation of the Martian surface.\nTo address these challenges, we propose a novel encoder-decoder based Mars\nsegmentation network, termed MarsSeg. Specifically, we employ an\nencoder-decoder structure with a minimized number of down-sampling layers to\npreserve local details. To facilitate a high-level semantic understanding\nacross the shadow multi-level feature maps, we introduce a feature enhancement\nconnection layer situated between the encoder and decoder. This layer\nincorporates Mini Atrous Spatial Pyramid Pooling (Mini-ASPP), Polarized\nSelf-Attention (PSA), and Strip Pyramid Pooling Module (SPPM). The Mini-ASPP\nand PSA are specifically designed for shadow feature enhancement, thereby\nenabling the expression of local details and small objects. Conversely, the\nSPPM is employed for deep feature enhancement, facilitating the extraction of\nhigh-level semantic category-related information. Experimental results derived\nfrom the Mars-Seg and AI4Mars datasets substantiate that the proposed MarsSeg\noutperforms other state-of-the-art methods in segmentation performance,\nvalidating the efficacy of each proposed component.\n","authors":["Junbo Li","Keyan Chen","Gengju Tian","Lu Li","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2404.04155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14239v2","updated":"2024-04-05T15:00:58Z","published":"2023-12-21T18:59:53Z","title":"PlatoNeRF: 3D Reconstruction in Plato's Cave via Single-View Two-Bounce\n Lidar","summary":" 3D reconstruction from a single-view is challenging because of the ambiguity\nfrom monocular cues and lack of information about occluded regions. Neural\nradiance fields (NeRF), while popular for view synthesis and 3D reconstruction,\nare typically reliant on multi-view images. Existing methods for single-view 3D\nreconstruction with NeRF rely on either data priors to hallucinate views of\noccluded regions, which may not be physically accurate, or shadows observed by\nRGB cameras, which are difficult to detect in ambient light and low albedo\nbackgrounds. We propose using time-of-flight data captured by a single-photon\navalanche diode to overcome these limitations. Our method models two-bounce\noptical paths with NeRF, using lidar transient data for supervision. By\nleveraging the advantages of both NeRF and two-bounce light measured by lidar,\nwe demonstrate that we can reconstruct visible and occluded geometry without\ndata priors or reliance on controlled ambient lighting or scene albedo. In\naddition, we demonstrate improved generalization under practical constraints on\nsensor spatial- and temporal-resolution. 
We believe our method is a promising\ndirection as single-photon lidars become ubiquitous on consumer devices, such\nas phones, tablets, and headsets.\n","authors":["Tzofi Klinghoffer","Xiaoyu Xiang","Siddharth Somasundaram","Yuchen Fan","Christian Richardt","Ramesh Raskar","Rakesh Ranjan"],"pdf_url":"https://arxiv.org/pdf/2312.14239v2.pdf","comment":"CVPR 2024. Project Page: https://platonerf.github.io/"},{"id":"http://arxiv.org/abs/2402.01779v2","updated":"2024-04-05T14:57:56Z","published":"2024-02-01T18:05:47Z","title":"Plug-and-Play image restoration with Stochastic deNOising REgularization","summary":" Plug-and-Play (PnP) algorithms are a class of iterative algorithms that\naddress image inverse problems by combining a physical model and a deep neural\nnetwork for regularization. Even if they produce impressive image restoration\nresults, these algorithms rely on a non-standard use of a denoiser on images\nthat are less and less noisy along the iterations, which contrasts with recent\nalgorithms based on Diffusion Models (DM), where the denoiser is applied only\non re-noised images. We propose a new PnP framework, called Stochastic\ndeNOising REgularization (SNORE), which applies the denoiser only on images\nwith noise of the adequate level. It is based on an explicit stochastic\nregularization, which leads to a stochastic gradient descent algorithm to solve\nill-posed inverse problems. A convergence analysis of this algorithm and its\nannealing extension is provided. Experimentally, we prove that SNORE is\ncompetitive with respect to state-of-the-art methods on deblurring and\ninpainting tasks, both quantitatively and qualitatively.\n","authors":["Marien Renaud","Jean Prost","Arthur Leclaire","Nicolas Papadakis"],"pdf_url":"https://arxiv.org/pdf/2402.01779v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02072v3","updated":"2024-04-05T14:48:43Z","published":"2024-04-02T16:20:02Z","title":"EGTR: Extracting Graph from Transformer for Scene Graph Generation","summary":" Scene Graph Generation (SGG) is a challenging task of detecting objects and\npredicting relationships between objects. After DETR was developed, one-stage\nSGG models based on a one-stage object detector have been actively studied.\nHowever, complex modeling is used to predict the relationship between objects,\nand the inherent relationship between object queries learned in the multi-head\nself-attention of the object detector has been neglected. We propose a\nlightweight one-stage SGG model that extracts the relation graph from the\nvarious relationships learned in the multi-head self-attention layers of the\nDETR decoder. By fully utilizing the self-attention by-products, the relation\ngraph can be extracted effectively with a shallow relation extraction head.\nConsidering the dependency of the relation extraction task on the object\ndetection task, we propose a novel relation smoothing technique that adjusts\nthe relation label adaptively according to the quality of the detected objects.\nBy the relation smoothing, the model is trained according to the continuous\ncurriculum that focuses on object detection task at the beginning of training\nand performs multi-task learning as the object detection performance gradually\nimproves. Furthermore, we propose a connectivity prediction task that predicts\nwhether a relation exists between object pairs as an auxiliary task of the\nrelation extraction. We demonstrate the effectiveness and efficiency of our\nmethod for the Visual Genome and Open Image V6 datasets. 
Our code is publicly\navailable at https://github.com/naver-ai/egtr.\n","authors":["Jinbae Im","JeongYeon Nam","Nokyung Park","Hyungmin Lee","Seunghyun Park"],"pdf_url":"https://arxiv.org/pdf/2404.02072v3.pdf","comment":"CVPR 2024 (Oral)"},{"id":"http://arxiv.org/abs/2312.00690v3","updated":"2024-04-05T14:44:27Z","published":"2023-12-01T16:17:16Z","title":"Open-vocabulary object 6D pose estimation","summary":" We introduce the new setting of open-vocabulary object 6D pose estimation, in\nwhich a textual prompt is used to specify the object of interest. In contrast\nto existing approaches, in our setting (i) the object of interest is specified\nsolely through the textual prompt, (ii) no object model (e.g., CAD or video\nsequence) is required at inference, and (iii) the object is imaged from two\nRGBD viewpoints of different scenes. To operate in this setting, we introduce a\nnovel approach that leverages a Vision-Language Model to segment the object of\ninterest from the scenes and to estimate its relative 6D pose. The key of our\napproach is a carefully devised strategy to fuse object-level information\nprovided by the prompt with local image features, resulting in a feature space\nthat can generalize to novel concepts. We validate our approach on a new\nbenchmark based on two popular datasets, REAL275 and Toyota-Light, which\ncollectively encompass 34 object instances appearing in four thousand image\npairs. The results demonstrate that our approach outperforms both a\nwell-established hand-crafted method and a recent deep learning-based baseline\nin estimating the relative 6D pose of objects in different scenes. Code and\ndataset are available at https://jcorsetti.github.io/oryon.\n","authors":["Jaime Corsetti","Davide Boscaini","Changjae Oh","Andrea Cavallaro","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2312.00690v3.pdf","comment":"Camera ready version (CVPR 2024, poster highlight). 21 pages, 15\n figures, 6 tables"},{"id":"http://arxiv.org/abs/2404.04140v1","updated":"2024-04-05T14:39:13Z","published":"2024-04-05T14:39:13Z","title":"Improving Detection in Aerial Images by Capturing Inter-Object\n Relationships","summary":" In many image domains, the spatial distribution of objects in a scene\nexhibits meaningful patterns governed by their semantic relationships. In most\nmodern detection pipelines, however, the detection proposals are processed\nindependently, overlooking the underlying relationships between objects. In\nthis work, we introduce a transformer-based approach to capture these\ninter-object relationships to refine classification and regression outcomes for\ndetected objects. Building on two-stage detectors, we tokenize the region of\ninterest (RoI) proposals to be processed by a transformer encoder. Specific\nspatial and geometric relations are incorporated into the attention weights and\nadaptively modulated and regularized. Experimental results demonstrate that the\nproposed method achieves consistent performance improvement on three benchmarks\nincluding DOTA-v1.0, DOTA-v1.5, and HRSC 2016, especially ranking first on both\nDOTA-v1.5 and HRSC 2016. 
Specifically, our new method has an increase of 1.59\nmAP on DOTA-v1.0, 4.88 mAP on DOTA-v1.5, and 2.1 mAP on HRSC 2016,\nrespectively, compared to the baselines.\n","authors":["Botao Ren","Botian Xu","Yifan Pu","Jingyi Wang","Zhidong Deng"],"pdf_url":"https://arxiv.org/pdf/2404.04140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20092v4","updated":"2024-04-05T14:35:26Z","published":"2023-10-31T00:12:14Z","title":"The Missing U for Efficient Diffusion Models","summary":" Diffusion Probabilistic Models stand as a critical tool in generative\nmodelling, enabling the generation of complex data distributions. This family\nof generative models yields record-breaking performance in tasks such as image\nsynthesis, video generation, and molecule design. Despite their capabilities,\ntheir efficiency, especially in the reverse process, remains a challenge due to\nslow convergence rates and high computational costs. In this paper, we\nintroduce an approach that leverages continuous dynamical systems to design a\nnovel denoising network for diffusion models that is more parameter-efficient,\nexhibits faster convergence, and demonstrates increased noise robustness.\nExperimenting with Denoising Diffusion Probabilistic Models (DDPMs), our\nframework operates with approximately a quarter of the parameters, and $\\sim$\n30\\% of the Floating Point Operations (FLOPs) compared to standard U-Nets in\nDDPMs. Furthermore, our model is notably faster in inference than the baseline\nwhen measured in fair and equal conditions. We also provide a mathematical\nintuition as to why our proposed reverse process is faster as well as a\nmathematical discussion of the empirical tradeoffs in the denoising downstream\ntask. Finally, we argue that our method is compatible with existing performance\nenhancement techniques, enabling further improvements in efficiency, quality,\nand speed.\n","authors":["Sergio Calvo-Ordonez","Chun-Wun Cheng","Jiahao Huang","Lipei Zhang","Guang Yang","Carola-Bibiane Schonlieb","Angelica I Aviles-Rivero"],"pdf_url":"https://arxiv.org/pdf/2310.20092v4.pdf","comment":"23 pages, 14 figures, Accepted at Transactions of Machine Learning\n Research (04/2024)"},{"id":"http://arxiv.org/abs/2304.03560v2","updated":"2024-04-05T14:07:25Z","published":"2023-04-07T09:46:29Z","title":"DualRefine: Self-Supervised Depth and Pose Estimation Through Iterative\n Epipolar Sampling and Refinement Toward Equilibrium","summary":" Self-supervised multi-frame depth estimation achieves high accuracy by\ncomputing matching costs of pixel correspondences between adjacent frames,\ninjecting geometric information into the network. These pixel-correspondence\ncandidates are computed based on the relative pose estimates between the\nframes. Accurate pose predictions are essential for precise matching cost\ncomputation as they influence the epipolar geometry. Furthermore, improved\ndepth estimates can, in turn, be used to align pose estimates.\n Inspired by traditional structure-from-motion (SfM) principles, we propose\nthe DualRefine model, which tightly couples depth and pose estimation through a\nfeedback loop. Our novel update pipeline uses a deep equilibrium model\nframework to iteratively refine depth estimates and a hidden state of feature\nmaps by computing local matching costs based on epipolar geometry. Importantly,\nwe used the refined depth estimates and feature maps to compute pose updates at\neach step. This update in the pose estimates slowly alters the epipolar\ngeometry during the refinement process. 
Experimental results on the KITTI\ndataset demonstrate competitive depth prediction and odometry prediction\nperformance surpassing published self-supervised baselines.\n","authors":["Antyanta Bangunharcana","Ahmed Magd","Kyung-Soo Kim"],"pdf_url":"https://arxiv.org/pdf/2304.03560v2.pdf","comment":"CVPR 2023. Project page:\n https://antabangun.github.io/projects/DualRefine/ Code:\n https://github.com/antabangun/DualRefine"},{"id":"http://arxiv.org/abs/2404.04104v1","updated":"2024-04-05T14:00:07Z","published":"2024-04-05T14:00:07Z","title":"3D Facial Expressions through Analysis-by-Neural-Synthesis","summary":" While existing methods for 3D face reconstruction from in-the-wild images\nexcel at recovering the overall face shape, they commonly miss subtle, extreme,\nasymmetric, or rarely observed expressions. We improve upon these methods with\nSMIRK (Spatial Modeling for Image-based Reconstruction of Kinesics), which\nfaithfully reconstructs expressive 3D faces from images. We identify two key\nlimitations in existing methods: shortcomings in their self-supervised training\nformulation, and a lack of expression diversity in the training images. For\ntraining, most methods employ differentiable rendering to compare a predicted\nface mesh with the input image, along with a plethora of additional loss\nfunctions. This differentiable rendering loss not only has to provide\nsupervision to optimize for 3D face geometry, camera, albedo, and lighting,\nwhich is an ill-posed optimization problem, but the domain gap between\nrendering and input image further hinders the learning process. Instead, SMIRK\nreplaces the differentiable rendering with a neural rendering module that,\ngiven the rendered predicted mesh geometry, and sparsely sampled pixels of the\ninput image, generates a face image. As the neural rendering gets color\ninformation from sampled image pixels, supervising with neural rendering-based\nreconstruction loss can focus solely on the geometry. Further, it enables us to\ngenerate images of the input identity with varying expressions while training.\nThese are then utilized as input to the reconstruction model and used as\nsupervision with ground truth geometry. This effectively augments the training\ndata and enhances the generalization for diverse expressions. Our qualitative,\nquantitative and particularly our perceptual evaluations demonstrate that SMIRK\nachieves the new state-of-the art performance on accurate expression\nreconstruction. Project webpage: https://georgeretsi.github.io/smirk/.\n","authors":["George Retsinas","Panagiotis P. Filntisis","Radek Danecek","Victoria F. Abrevaya","Anastasios Roussos","Timo Bolkart","Petros Maragos"],"pdf_url":"https://arxiv.org/pdf/2404.04104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02702v2","updated":"2024-04-05T13:49:17Z","published":"2023-12-05T12:04:34Z","title":"Neural Sign Actors: A diffusion model for 3D sign language production\n from text","summary":" Sign Languages (SL) serve as the primary mode of communication for the Deaf\nand Hard of Hearing communities. Deep learning methods for SL recognition and\ntranslation have achieved promising results. However, Sign Language Production\n(SLP) poses a challenge as the generated motions must be realistic and have\nprecise semantic meaning. Most SLP methods rely on 2D data, which hinders their\nrealism. In this work, a diffusion-based SLP model is trained on a curated\nlarge-scale dataset of 4D signing avatars and their corresponding text\ntranscripts. 
The proposed method can generate dynamic sequences of 3D avatars\nfrom an unconstrained domain of discourse using a diffusion process formed on a\nnovel and anatomically informed graph neural network defined on the SMPL-X body\nskeleton. Through quantitative and qualitative experiments, we show that the\nproposed method considerably outperforms previous methods of SLP. This work\nmakes an important step towards realistic neural sign avatars, bridging the\ncommunication gap between Deaf and hearing communities.\n","authors":["Vasileios Baltatzis","Rolandos Alexandros Potamias","Evangelos Ververas","Guanxiong Sun","Jiankang Deng","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2312.02702v2.pdf","comment":"Accepted at CVPR 2024, Project page:\n https://baltatzisv.github.io/neural-sign-actors/"},{"id":"http://arxiv.org/abs/2312.06420v2","updated":"2024-04-05T13:45:11Z","published":"2023-12-11T14:43:23Z","title":"Localization Is All You Evaluate: Data Leakage in Online Mapping\n Datasets and How to Fix It","summary":" The task of online mapping is to predict a local map using current sensor\nobservations, e.g. from lidar and camera, without relying on a pre-built map.\nState-of-the-art methods are based on supervised learning and are trained\npredominantly using two datasets: nuScenes and Argoverse 2. However, these\ndatasets revisit the same geographic locations across training, validation, and\ntest sets. Specifically, over $80$% of nuScenes and $40$% of Argoverse 2\nvalidation and test samples are less than $5$ m from a training sample. At test\ntime, the methods are thus evaluated more on how well they localize within a\nmemorized implicit map built from the training data than on extrapolating to\nunseen locations. Naturally, this data leakage causes inflated performance\nnumbers and we propose geographically disjoint data splits to reveal the true\nperformance in unseen environments. Experimental results show that methods\nperform considerably worse, some dropping more than $45$ mAP, when trained and\nevaluated on proper data splits. Additionally, a reassessment of prior design\nchoices reveals diverging conclusions from those based on the original split.\nNotably, the impact of lifting methods and the support from auxiliary tasks\n(e.g., depth supervision) on performance appears less substantial or follows a\ndifferent trajectory than previously perceived. Splits can be found at\nhttps://github.com/LiljaAdam/geographical-splits\n","authors":["Adam Lilja","Junsheng Fu","Erik Stenborg","Lars Hammarstrand"],"pdf_url":"https://arxiv.org/pdf/2312.06420v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04095v1","updated":"2024-04-05T13:44:39Z","published":"2024-04-05T13:44:39Z","title":"Dynamic Prompt Optimizing for Text-to-Image Generation","summary":" Text-to-image generative models, specifically those based on diffusion models\nlike Imagen and Stable Diffusion, have made substantial advancements. Recently,\nthere has been a surge of interest in the delicate refinement of text prompts.\nUsers assign weights or alter the injection time steps of certain words in the\ntext prompts to improve the quality of generated images. However, the success\nof fine-control prompts depends on the accuracy of the text prompts and the\ncareful selection of weights and time steps, which requires significant manual\nintervention. To address this, we introduce the \\textbf{P}rompt\n\\textbf{A}uto-\\textbf{E}diting (PAE) method. 
Besides refining the original\nprompts for image generation, we further employ an online reinforcement\nlearning strategy to explore the weights and injection time steps of each word,\nleading to the dynamic fine-control prompts. The reward function during\ntraining encourages the model to consider aesthetic score, semantic\nconsistency, and user preferences. Experimental results demonstrate that our\nproposed method effectively improves the original prompts, generating visually\nmore appealing images while maintaining semantic alignment. Code is available\nat https://github.com/Mowenyii/PAE.\n","authors":["Wenyi Mo","Tianyu Zhang","Yalong Bai","Bing Su","Ji-Rong Wen","Qing Yang"],"pdf_url":"https://arxiv.org/pdf/2404.04095v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/1902.06634v4","updated":"2024-04-05T13:03:08Z","published":"2019-02-18T16:15:25Z","title":"Contextual Encoder-Decoder Network for Visual Saliency Prediction","summary":" Predicting salient regions in natural images requires the detection of\nobjects that are present in a scene. To develop robust representations for this\nchallenging task, high-level visual features at multiple spatial scales must be\nextracted and augmented with contextual information. However, existing models\naimed at explaining human fixation maps do not incorporate such a mechanism\nexplicitly. Here we propose an approach based on a convolutional neural network\npre-trained on a large-scale image classification task. The architecture forms\nan encoder-decoder structure and includes a module with multiple convolutional\nlayers at different dilation rates to capture multi-scale features in parallel.\nMoreover, we combine the resulting representations with global scene\ninformation for accurately predicting visual saliency. Our model achieves\ncompetitive and consistent results across multiple evaluation metrics on two\npublic saliency benchmarks and we demonstrate the effectiveness of the\nsuggested approach on five datasets and selected examples. Compared to state of\nthe art approaches, the network is based on a lightweight image classification\nbackbone and hence presents a suitable choice for applications with limited\ncomputational resources, such as (virtual) robotic systems, to estimate human\nfixations across complex natural scenes.\n","authors":["Alexander Kroner","Mario Senden","Kurt Driessens","Rainer Goebel"],"pdf_url":"https://arxiv.org/pdf/1902.06634v4.pdf","comment":"Updated contact information"},{"id":"http://arxiv.org/abs/2404.04072v1","updated":"2024-04-05T12:58:07Z","published":"2024-04-05T12:58:07Z","title":"Label Propagation for Zero-shot Classification with Vision-Language\n Models","summary":" Vision-Language Models (VLMs) have demonstrated impressive performance on\nzero-shot classification, i.e. classification when provided merely with a list\nof class names. In this paper, we tackle the case of zero-shot classification\nin the presence of unlabeled data. We leverage the graph structure of the\nunlabeled data and introduce ZLaP, a method based on label propagation (LP)\nthat utilizes geodesic distances for classification. We tailor LP to graphs\ncontaining both text and image features and further propose an efficient method\nfor performing inductive inference based on a dual solution and a\nsparsification step. We perform extensive experiments to evaluate the\neffectiveness of our method on 14 common datasets and show that ZLaP\noutperforms the latest related works. 
Code:\nhttps://github.com/vladan-stojnic/ZLaP\n","authors":["Vladan Stojnić","Yannis Kalantidis","Giorgos Tolias"],"pdf_url":"https://arxiv.org/pdf/2404.04072v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.09124v2","updated":"2024-04-05T12:56:31Z","published":"2024-03-14T06:16:21Z","title":"Single Domain Generalization for Crowd Counting","summary":" Due to its promising results, density map regression has been widely employed\nfor image-based crowd counting. The approach, however, often suffers from\nsevere performance degradation when tested on data from unseen scenarios, the\nso-called \"domain shift\" problem. To address the problem, we investigate in\nthis work single domain generalization (SDG) for crowd counting. The existing\nSDG approaches are mainly for image classification and segmentation, and can\nhardly be extended to our case due to its regression nature and label ambiguity\n(i.e., ambiguous pixel-level ground truths). We propose MPCount, a novel\neffective SDG approach even for narrow source distribution. MPCount stores\ndiverse density values for density map regression and reconstructs\ndomain-invariant features by means of only one memory bank, a content error\nmask and attention consistency loss. By partitioning the image into grids, it\nemploys patch-wise classification as an auxiliary task to mitigate label\nambiguity. Through extensive experiments on different datasets, MPCount is\nshown to significantly improve counting accuracy compared to the state of the\nart under diverse scenarios unobserved in the training data characterized by\nnarrow source distribution. Code is available at\nhttps://github.com/Shimmer93/MPCount.\n","authors":["Zhuoxuan Peng","S. -H. Gary Chan"],"pdf_url":"https://arxiv.org/pdf/2403.09124v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2402.19340v2","updated":"2024-04-05T12:49:38Z","published":"2024-02-29T16:46:49Z","title":"One model to use them all: Training a segmentation model with\n complementary datasets","summary":" Understanding a surgical scene is crucial for computer-assisted surgery\nsystems to provide any intelligent assistance functionality. One way of\nachieving this scene understanding is via scene segmentation, where every pixel\nof a frame is classified and therefore identifies the visible structures and\ntissues. Progress on fully segmenting surgical scenes has been made using\nmachine learning. However, such models require large amounts of annotated\ntraining data, containing examples of all relevant object classes. Such fully\nannotated datasets are hard to create, as every pixel in a frame needs to be\nannotated by medical experts and, therefore, are rarely available. In this\nwork, we propose a method to combine multiple partially annotated datasets,\nwhich provide complementary annotations, into one model, enabling better scene\nsegmentation and the use of multiple readily available datasets. Our method\naims to combine available data with complementary labels by leveraging mutual\nexclusive properties to maximize information. Specifically, we propose to use\npositive annotations of other classes as negative samples and to exclude\nbackground pixels of binary annotations, as we cannot tell if they contain a\nclass not annotated but predicted by the model. We evaluate our method by\ntraining a DeepLabV3 on the publicly available Dresden Surgical Anatomy\nDataset, which provides multiple subsets of binary segmented anatomical\nstructures. 
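For the complementary-datasets entry above, the described trick is to treat positive annotations of other classes as negatives and to exclude unannotated background pixels from the loss. A minimal PyTorch sketch of that masking idea follows; the 255 ignore sentinel, class ids, and tensor shapes are assumptions for illustration, and the actual DeepLabV3 training setup is not reproduced.

import torch
import torch.nn.functional as F

IGNORE = 255  # sentinel for "unknown" pixels that must not contribute to the loss

def merge_partial_labels(binary_masks):
    # binary_masks: dict {class_id: BoolTensor[H, W]} from different partially annotated sets.
    # Positive pixels of any class become that class; everything else stays IGNORE, because an
    # unannotated pixel might contain a class the source dataset simply never labels.
    h, w = next(iter(binary_masks.values())).shape
    target = torch.full((h, w), IGNORE, dtype=torch.long)
    for cls, mask in binary_masks.items():
        target[mask] = cls
    return target

def partial_ce_loss(logits, target):
    # Cross-entropy over annotated pixels only; IGNORE pixels give no gradient.
    return F.cross_entropy(logits, target, ignore_index=IGNORE)

# Toy usage: 4 classes, two complementary binary annotations for one 64x64 frame.
logits = torch.randn(1, 4, 64, 64, requires_grad=True)
masks = {1: torch.zeros(64, 64, dtype=torch.bool), 3: torch.zeros(64, 64, dtype=torch.bool)}
masks[1][:16, :16] = True   # e.g. one structure annotated in dataset A
masks[3][40:, 40:] = True   # e.g. another structure annotated in dataset B
target = merge_partial_labels(masks).unsqueeze(0)
loss = partial_ce_loss(logits, target)
loss.backward()
print(float(loss))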
Our approach successfully combines 6 classes into one model,\nincreasing the overall Dice Score by 4.4% compared to an ensemble of models\ntrained on the classes individually. By including information on multiple\nclasses, we were able to reduce confusion between stomach and colon by 24%. Our\nresults demonstrate the feasibility of training a model on multiple datasets.\nThis paves the way for future work further alleviating the need for one large,\nfully segmented datasets.\n","authors":["Alexander C. Jenke","Sebastian Bodenstedt","Fiona R. Kolbinger","Marius Distler","Jürgen Weitz","Stefanie Speidel"],"pdf_url":"https://arxiv.org/pdf/2402.19340v2.pdf","comment":"Accepted at IPCAI 2024; submitted to IJCARS (under revision)"},{"id":"http://arxiv.org/abs/2404.03443v2","updated":"2024-04-05T12:44:39Z","published":"2024-04-04T13:43:11Z","title":"Part-Attention Based Model Make Occluded Person Re-Identification\n Stronger","summary":" The goal of occluded person re-identification (ReID) is to retrieve specific\npedestrians in occluded situations. However, occluded person ReID still suffers\nfrom background clutter and low-quality local feature representations, which\nlimits model performance. In our research, we introduce a new framework called\nPAB-ReID, which is a novel ReID model incorporating part-attention mechanisms\nto tackle the aforementioned issues effectively. Firstly, we introduce the\nhuman parsing label to guide the generation of more accurate human part\nattention maps. In addition, we propose a fine-grained feature focuser for\ngenerating fine-grained human local feature representations while suppressing\nbackground interference. Moreover, We also design a part triplet loss to\nsupervise the learning of human local features, which optimizes\nintra/inter-class distance. We conducted extensive experiments on specialized\nocclusion and regular ReID datasets, showcasing that our approach outperforms\nthe existing state-of-the-art methods.\n","authors":["Zhihao Chen","Yiyuan Ge"],"pdf_url":"https://arxiv.org/pdf/2404.03443v2.pdf","comment":"Accepted By International Joint Conference on Neural Networks 2024"},{"id":"http://arxiv.org/abs/2403.06546v2","updated":"2024-04-05T12:35:06Z","published":"2024-03-11T09:46:41Z","title":"OMH: Structured Sparsity via Optimally Matched Hierarchy for\n Unsupervised Semantic Segmentation","summary":" Unsupervised Semantic Segmentation (USS) involves segmenting images without\nrelying on predefined labels, aiming to alleviate the burden of extensive human\nlabeling. Existing methods utilize features generated by self-supervised models\nand specific priors for clustering. However, their clustering objectives are\nnot involved in the optimization of the features during training. Additionally,\ndue to the lack of clear class definitions in USS, the resulting segments may\nnot align well with the clustering objective. In this paper, we introduce a\nnovel approach called Optimally Matched Hierarchy (OMH) to simultaneously\naddress the above issues. The core of our method lies in imposing structured\nsparsity on the feature space, which allows the features to encode information\nwith different levels of granularity. The structure of this sparsity stems from\nour hierarchy (OMH). To achieve this, we learn a soft but sparse hierarchy\namong parallel clusters through Optimal Transport. Our OMH yields better\nunsupervised segmentation performance compared to existing USS methods. 
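OMH above learns a soft but sparse assignment among parallel clusterings via Optimal Transport. As a generic illustration of the OT ingredient only (not the paper's formulation), here is a small entropic-OT (Sinkhorn) sketch that softly matches fine cluster centroids to coarse ones; the squared-Euclidean cost, uniform marginals, and epsilon are assumptions.

import numpy as np

def sinkhorn(cost, eps=0.05, iters=200):
    # Entropic OT between uniform marginals; returns a soft transport plan.
    n, m = cost.shape
    a, b = np.full(n, 1.0 / n), np.full(m, 1.0 / m)
    K = np.exp(-cost / eps)
    u = np.ones(n)
    for _ in range(iters):
        v = b / (K.T @ u)
        u = a / (K @ v)
    return u[:, None] * K * v[None, :]

# Toy usage: softly assign 12 fine cluster centroids to 4 coarse ones in feature space.
rng = np.random.default_rng(0)
fine = rng.normal(size=(12, 32))
coarse = rng.normal(size=(4, 32))
cost = ((fine[:, None, :] - coarse[None, :, :]) ** 2).sum(-1)
plan = sinkhorn(cost / cost.max())  # normalize the cost scale so exp(-cost/eps) stays well-behaved
print(plan.sum(), plan.argmax(axis=1))  # total mass ~1; dominant coarse parent of each fine cluster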
Our\nextensive experiments demonstrate the benefits of OMH when utilizing our\ndifferentiable paradigm. We will make our code publicly available.\n","authors":["Baran Ozaydin","Tong Zhang","Deblina Bhattacharjee","Sabine Süsstrunk","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2403.06546v2.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2404.04057v1","updated":"2024-04-05T12:30:19Z","published":"2024-04-05T12:30:19Z","title":"Score identity Distillation: Exponentially Fast Distillation of\n Pretrained Diffusion Models for One-Step Generation","summary":" We introduce Score identity Distillation (SiD), an innovative data-free\nmethod that distills the generative capabilities of pretrained diffusion models\ninto a single-step generator. SiD not only facilitates an exponentially fast\nreduction in Fr\\'echet inception distance (FID) during distillation but also\napproaches or even exceeds the FID performance of the original teacher\ndiffusion models. By reformulating forward diffusion processes as semi-implicit\ndistributions, we leverage three score-related identities to create an\ninnovative loss mechanism. This mechanism achieves rapid FID reduction by\ntraining the generator using its own synthesized images, eliminating the need\nfor real data or reverse-diffusion-based generation, all accomplished within\nsignificantly shortened generation time. Upon evaluation across four benchmark\ndatasets, the SiD algorithm demonstrates high iteration efficiency during\ndistillation and surpasses competing distillation approaches, whether they are\none-step or few-step, data-free, or dependent on training data, in terms of\ngeneration quality. This achievement not only redefines the benchmarks for\nefficiency and effectiveness in diffusion distillation but also in the broader\nfield of diffusion-based generation. Our PyTorch implementation will be\npublicly accessible on GitHub.\n","authors":["Mingyuan Zhou","Huangjie Zheng","Zhendong Wang","Mingzhang Yin","Hai Huang"],"pdf_url":"https://arxiv.org/pdf/2404.04057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13377v2","updated":"2024-04-05T12:14:50Z","published":"2023-12-20T19:08:49Z","title":"SADA: Semantic adversarial unsupervised domain adaptation for Temporal\n Action Localization","summary":" Temporal Action Localization (TAL) is a complex task that poses relevant\nchallenges, particularly when attempting to generalize on new -- unseen --\ndomains in real-world applications. These scenarios, despite realistic, are\noften neglected in the literature, exposing these solutions to important\nperformance degradation. In this work, we tackle this issue by introducing, for\nthe first time, an approach for Unsupervised Domain Adaptation (UDA) in sparse\nTAL, which we refer to as Semantic Adversarial unsupervised Domain Adaptation\n(SADA). Our contributions are threefold: (1) we pioneer the development of a\ndomain adaptation model that operates on realistic sparse action detection\nbenchmarks; (2) we tackle the limitations of global-distribution alignment\ntechniques by introducing a novel adversarial loss that is sensitive to local\nclass distributions, ensuring finer-grained adaptation; and (3) we present a\nnovel set of benchmarks based on EpicKitchens100 and CharadesEgo, that evaluate\nmultiple domain shifts in a comprehensive manner. 
Our experiments indicate that\nSADA improves the adaptation across domains when compared to fully supervised\nstate-of-the-art and alternative UDA methods, attaining a performance boost of\nup to 6.14% mAP.\n","authors":["David Pujol-Perich","Albert Clapés","Sergio Escalera"],"pdf_url":"https://arxiv.org/pdf/2312.13377v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10344v2","updated":"2024-04-05T12:14:15Z","published":"2024-03-15T14:31:17Z","title":"SCILLA: SurfaCe Implicit Learning for Large Urban Area, a volumetric\n hybrid solution","summary":" Neural implicit surface representation methods have recently shown impressive\n3D reconstruction results. However, existing solutions struggle to reconstruct\nurban outdoor scenes due to their large, unbounded, and highly detailed nature.\nHence, to achieve accurate reconstructions, additional supervision data such as\nLiDAR, strong geometric priors, and long training times are required. To tackle\nsuch issues, we present SCILLA, a new hybrid implicit surface learning method\nto reconstruct large driving scenes from 2D images. SCILLA's hybrid\narchitecture models two separate implicit fields: one for the volumetric\ndensity and another for the signed distance to the surface. To accurately\nrepresent urban outdoor scenarios, we introduce a novel volume-rendering\nstrategy that relies on self-supervised probabilistic density estimation to\nsample points near the surface and transition progressively from volumetric to\nsurface representation. Our solution permits a proper and fast initialization\nof the signed distance field without relying on any geometric prior on the\nscene, compared to concurrent methods. By conducting extensive experiments on\nfour outdoor driving datasets, we show that SCILLA can learn an accurate and\ndetailed 3D surface scene representation in various urban scenarios while being\ntwo times faster to train compared to previous state-of-the-art solutions.\n","authors":["Hala Djeghim","Nathan Piasco","Moussab Bennehar","Luis Roldão","Dzmitry Tsishkou","Désiré Sidibé"],"pdf_url":"https://arxiv.org/pdf/2403.10344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06587v2","updated":"2024-04-05T12:10:30Z","published":"2023-12-11T18:19:36Z","title":"QuickQuakeBuildings: Post-earthquake SAR-Optical Dataset for Quick\n Damaged-building Detection","summary":" Quick and automated earthquake-damaged building detection from post-event\nsatellite imagery is crucial, yet it is challenging due to the scarcity of\ntraining data required to develop robust algorithms. This letter presents the\nfirst dataset dedicated to detecting earthquake-damaged buildings from\npost-event very high resolution (VHR) Synthetic Aperture Radar (SAR) and\noptical imagery. Utilizing open satellite imagery and annotations acquired\nafter the 2023 Turkey-Syria earthquakes, we deliver a dataset of coregistered\nbuilding footprints and satellite image patches of both SAR and optical data,\nencompassing more than four thousand buildings. The task of damaged building\ndetection is formulated as a binary image classification problem, that can also\nbe treated as an anomaly detection problem due to extreme class imbalance. We\nprovide baseline methods and results to serve as references for comparison.\nResearchers can utilize this dataset to expedite algorithm development,\nfacilitating the rapid detection of damaged buildings in response to future\nevents. 
The dataset and codes together with detailed explanations and\nvisualization are made publicly available at\n\\url{https://github.com/ya0-sun/PostEQ-SARopt-BuildingDamage}.\n","authors":["Yao Sun","Yi Wang","Michael Eineder"],"pdf_url":"https://arxiv.org/pdf/2312.06587v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04050v1","updated":"2024-04-05T12:09:36Z","published":"2024-04-05T12:09:36Z","title":"No Time to Train: Empowering Non-Parametric Networks for Few-shot 3D\n Scene Segmentation","summary":" To reduce the reliance on large-scale datasets, recent works in 3D\nsegmentation resort to few-shot learning. Current 3D few-shot segmentation\nmethods first pre-train models on 'seen' classes, and then evaluate their\ngeneralization performance on 'unseen' classes. However, the prior pre-training\nstage not only introduces excessive time overhead but also incurs a significant\ndomain gap on 'unseen' classes. To tackle these issues, we propose a\nNon-parametric Network for few-shot 3D Segmentation, Seg-NN, and its Parametric\nvariant, Seg-PN. Without training, Seg-NN extracts dense representations by\nhand-crafted filters and achieves comparable performance to existing parametric\nmodels. Due to the elimination of pre-training, Seg-NN can alleviate the domain\ngap issue and save a substantial amount of time. Based on Seg-NN, Seg-PN only\nrequires training a lightweight QUEry-Support Transferring (QUEST) module,\nwhich enhances the interaction between the support set and query set.\nExperiments suggest that Seg-PN outperforms previous state-of-the-art method by\n+4.19% and +7.71% mIoU on S3DIS and ScanNet datasets respectively, while\nreducing training time by -90%, indicating its effectiveness and efficiency.\n","authors":["Xiangyang Zhu","Renrui Zhang","Bowei He","Ziyu Guo","Jiaming Liu","Han Xiao","Chaoyou Fu","Hao Dong","Peng Gao"],"pdf_url":"https://arxiv.org/pdf/2404.04050v1.pdf","comment":"CVPR Highlight. Code is available at\n https://github.com/yangyangyang127/Seg-NN. arXiv admin note: text overlap\n with arXiv:2308.12961"},{"id":"http://arxiv.org/abs/2404.04040v1","updated":"2024-04-05T11:49:29Z","published":"2024-04-05T11:49:29Z","title":"Dynamic Risk Assessment Methodology with an LDM-based System for Parking\n Scenarios","summary":" This paper describes the methodology for building a dynamic risk assessment\nfor ADAS (Advanced Driving Assistance Systems) algorithms in parking scenarios,\nfusing exterior and interior perception for a better understanding of the scene\nand a more comprehensive risk estimation. This includes the definition of a\ndynamic risk methodology that depends on the situation from inside and outside\nthe vehicle, the creation of a multi-sensor dataset of risk assessment for ADAS\nbenchmarking purposes, and a Local Dynamic Map (LDM) that fuses data from the\nexterior and interior of the car to build an LDM-based Dynamic Risk Assessment\nSystem (DRAS).\n","authors":["Paola Natalia Cañas","Mikel García","Nerea Aranjuelo","Marcos Nieto","Aitor Iglesias","Igor Rodríguez"],"pdf_url":"https://arxiv.org/pdf/2404.04040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04037v1","updated":"2024-04-05T11:45:03Z","published":"2024-04-05T11:45:03Z","title":"InstructHumans: Editing Animated 3D Human Textures with Instructions","summary":" We present InstructHumans, a novel framework for instruction-driven 3D human\ntexture editing. Existing text-based editing methods use Score Distillation\nSampling (SDS) to distill guidance from generative models. 
This work shows that\nnaively using such scores is harmful to editing as they destroy consistency\nwith the source avatar. Instead, we propose an alternate SDS for Editing\n(SDS-E) that selectively incorporates subterms of SDS across diffusion\ntimesteps. We further enhance SDS-E with spatial smoothness regularization and\ngradient-based viewpoint sampling to achieve high-quality edits with sharp and\nhigh-fidelity detailing. InstructHumans significantly outperforms existing 3D\nediting methods, consistent with the initial avatar while faithful to the\ntextual instructions. Project page: https://jyzhu.top/instruct-humans .\n","authors":["Jiayin Zhu","Linlin Yang","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2404.04037v1.pdf","comment":"Project Page: https://jyzhu.top/instruct-humans"},{"id":"http://arxiv.org/abs/2312.00648v3","updated":"2024-04-05T11:31:12Z","published":"2023-12-01T15:20:58Z","title":"SPOT: Self-Training with Patch-Order Permutation for Object-Centric\n Learning with Autoregressive Transformers","summary":" Unsupervised object-centric learning aims to decompose scenes into\ninterpretable object entities, termed slots. Slot-based auto-encoders stand out\nas a prominent method for this task. Within them, crucial aspects include\nguiding the encoder to generate object-specific slots and ensuring the decoder\nutilizes them during reconstruction. This work introduces two novel techniques,\n(i) an attention-based self-training approach, which distills superior\nslot-based attention masks from the decoder to the encoder, enhancing object\nsegmentation, and (ii) an innovative patch-order permutation strategy for\nautoregressive transformers that strengthens the role of slot vectors in\nreconstruction. The effectiveness of these strategies is showcased\nexperimentally. The combined approach significantly surpasses prior slot-based\nautoencoder methods in unsupervised object segmentation, especially with\ncomplex real-world images. We provide the implementation code at\nhttps://github.com/gkakogeorgiou/spot .\n","authors":["Ioannis Kakogeorgiou","Spyros Gidaris","Konstantinos Karantzalos","Nikos Komodakis"],"pdf_url":"https://arxiv.org/pdf/2312.00648v3.pdf","comment":"CVPR 2024 (Highlight). Code: https://github.com/gkakogeorgiou/spot"},{"id":"http://arxiv.org/abs/2404.04026v1","updated":"2024-04-05T11:14:19Z","published":"2024-04-05T11:14:19Z","title":"MM-Gaussian: 3D Gaussian-based Multi-modal Fusion for Localization and\n Reconstruction in Unbounded Scenes","summary":" Localization and mapping are critical tasks for various applications such as\nautonomous vehicles and robotics. The challenges posed by outdoor environments\npresent particular complexities due to their unbounded characteristics. In this\nwork, we present MM-Gaussian, a LiDAR-camera multi-modal fusion system for\nlocalization and mapping in unbounded scenes. Our approach is inspired by the\nrecently developed 3D Gaussians, which demonstrate remarkable capabilities in\nachieving high rendering quality and fast rendering speed. Specifically, our\nsystem fully utilizes the geometric structure information provided by\nsolid-state LiDAR to address the problem of inaccurate depth encountered when\nrelying solely on visual solutions in unbounded, outdoor scenarios.\nAdditionally, we utilize 3D Gaussian point clouds, with the assistance of\npixel-level gradient descent, to fully exploit the color information in photos,\nthereby achieving realistic rendering effects. 
To further bolster the\nrobustness of our system, we designed a relocalization module, which assists in\nreturning to the correct trajectory in the event of a localization failure.\nExperiments conducted in multiple scenarios demonstrate the effectiveness of\nour method.\n","authors":["Chenyang Wu","Yifan Duan","Xinran Zhang","Yu Sheng","Jianmin Ji","Yanyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04026v1.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.04025v1","updated":"2024-04-05T11:13:59Z","published":"2024-04-05T11:13:59Z","title":"Framework to generate perfusion map from CT and CTA images in patients\n with acute ischemic stroke: A longitudinal and cross-sectional study","summary":" Stroke is a leading cause of disability and death. Effective treatment\ndecisions require early and informative vascular imaging. 4D perfusion imaging\nis ideal but rarely available within the first hour after stroke, whereas plain\nCT and CTA usually are. Hence, we propose a framework to extract a predicted\nperfusion map (PPM) derived from CT and CTA images. In all eighteen patients,\nwe found significantly high spatial similarity (with average Spearman's\ncorrelation = 0.7893) between our predicted perfusion map (PPM) and the T-max\nmap derived from 4D-CTP. Voxelwise correlations between the PPM and National\nInstitutes of Health Stroke Scale (NIHSS) subscores for L/R hand motor, gaze,\nand language on a large cohort of 2,110 subjects reliably mapped symptoms to\nexpected infarct locations. Therefore our PPM could serve as an alternative for\n4D perfusion imaging, if the latter is unavailable, to investigate blood\nperfusion in the first hours after hospital admission.\n","authors":["Chayanin Tangwiriyasakul","Pedro Borges","Stefano Moriconi","Paul Wright","Yee-Haur Mah","James Teo","Parashkev Nachev","Sebastien Ourselin","M. Jorge Cardoso"],"pdf_url":"https://arxiv.org/pdf/2404.04025v1.pdf","comment":"Accepted and presented in SWITCH2023: Stroke Workshop on Imaging and\n Treatment CHallenges (MICCAI 2023, Vancouver Canada)"},{"id":"http://arxiv.org/abs/2404.04007v1","updated":"2024-04-05T10:30:38Z","published":"2024-04-05T10:30:38Z","title":"Neural-Symbolic VideoQA: Learning Compositional Spatio-Temporal\n Reasoning for Real-world Video Question Answering","summary":" Compositional spatio-temporal reasoning poses a significant challenge in the\nfield of video question answering (VideoQA). Existing approaches struggle to\nestablish effective symbolic reasoning structures, which are crucial for\nanswering compositional spatio-temporal questions. To address this challenge,\nwe propose a neural-symbolic framework called Neural-Symbolic VideoQA\n(NS-VideoQA), specifically designed for real-world VideoQA tasks. The\nuniqueness and superiority of NS-VideoQA are two-fold: 1) It proposes a Scene\nParser Network (SPN) to transform static-dynamic video scenes into Symbolic\nRepresentation (SR), structuralizing persons, objects, relations, and action\nchronologies. 2) A Symbolic Reasoning Machine (SRM) is designed for top-down\nquestion decompositions and bottom-up compositional reasonings. Specifically, a\npolymorphic program executor is constructed for internally consistent reasoning\nfrom SR to the final answer. 
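The perfusion-map entry above reports spatial similarity as a Spearman correlation between the predicted perfusion map and the 4D-CTP T-max map. A small sketch of such a voxelwise comparison; the synthetic volumes and the optional mask argument are illustrative assumptions.

import numpy as np
from scipy.stats import spearmanr

def map_similarity(ppm, tmax, mask=None):
    # Rank correlation between two co-registered 3D maps, optionally restricted to a mask.
    if mask is None:
        mask = np.ones(ppm.shape, dtype=bool)
    rho, pval = spearmanr(ppm[mask], tmax[mask])
    return rho, pval

# Toy usage: two noisy, monotonically related volumes.
rng = np.random.default_rng(0)
tmax = rng.gamma(2.0, 2.0, size=(32, 32, 16))
ppm = np.log1p(tmax) + 0.1 * rng.normal(size=tmax.shape)
rho, p = map_similarity(ppm, tmax)
print(f"Spearman rho = {rho:.3f} (p = {p:.1e})")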
As a result, our NS-VideoQA not only improves the\ncompositional spatio-temporal reasoning in real-world VideoQA tasks, but also\nenables step-by-step error analysis by tracing the intermediate results.\nExperimental evaluations on the AGQA Decomp benchmark demonstrate the\neffectiveness of the proposed NS-VideoQA framework. Empirical studies further\nconfirm that NS-VideoQA exhibits internal consistency in answering\ncompositional questions and significantly improves the capability of\nspatio-temporal and logical inference for VideoQA tasks.\n","authors":["Lili Liang","Guanglu Sun","Jin Qiu","Lizhong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04007v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20106v2","updated":"2024-04-05T10:29:00Z","published":"2024-03-29T10:40:41Z","title":"Learning Enriched Features via Selective State Spaces Model for\n Efficient Image Deblurring","summary":" Image deblurring aims to restore a high-quality image from its blurred\ncounterpart. The emergence of CNNs and Transformers has enabled significant\nprogress. However, these methods often face the dilemma between eliminating\nlong-range degradation perturbations and maintaining computational efficiency.\nWhile the selective state space model (SSM) shows promise in modeling\nlong-range dependencies with linear complexity, it also encounters challenges\nsuch as local pixel forgetting and channel redundancy. To address this issue,\nwe propose an efficient image deblurring network that leverages a selective state\nspace model to aggregate enriched and accurate features. Specifically, we\nintroduce an aggregate local and global information block (ALGBlock) designed\nto effectively capture and integrate both local invariant properties and\nnon-local information. The ALGBlock comprises two primary modules: a module for\ncapturing local and global features (CLGF), and a feature aggregation module\n(FA). The CLGF module is composed of two branches: the global branch captures\nlong-range dependency features via a selective state space model, while the\nlocal branch employs simplified channel attention to model local connectivity,\nthereby reducing local pixel forgetting and channel redundancy. In addition, we\ndesign an FA module to accentuate the local part by recalibrating the weight\nduring the aggregation of the two branches for restoration. Experimental\nresults demonstrate that the proposed method outperforms state-of-the-art\napproaches on widely used benchmarks.\n","authors":["Hu Gao","Depeng Dang"],"pdf_url":"https://arxiv.org/pdf/2403.20106v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03999v1","updated":"2024-04-05T10:23:20Z","published":"2024-04-05T10:23:20Z","title":"Finsler-Laplace-Beltrami Operators with Application to Shape Analysis","summary":" The Laplace-Beltrami operator (LBO) emerges from studying manifolds equipped\nwith a Riemannian metric. It is often called the Swiss army knife of geometry\nprocessing as it allows one to capture intrinsic shape information and gives rise\nto heat diffusion, geodesic distances, and a multitude of shape descriptors. It\nalso plays a central role in geometric deep learning. In this work, we explore\nFinsler manifolds as a generalization of Riemannian manifolds. We revisit the\nFinsler heat equation and derive a Finsler heat kernel and a\nFinsler-Laplace-Beltrami Operator (FLBO): a novel theoretically justified\nanisotropic Laplace-Beltrami operator (ALBO). 
In experimental evaluations we\ndemonstrate that the proposed FLBO is a valuable alternative to the traditional\nRiemannian-based LBO and ALBOs for spatial filtering and shape correspondence\nestimation. We hope that the proposed Finsler heat kernel and the FLBO will\ninspire further exploration of Finsler geometry in the computer vision\ncommunity.\n","authors":["Simon Weber","Thomas Dagès","Maolin Gao","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2404.03999v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03998v1","updated":"2024-04-05T10:23:10Z","published":"2024-04-05T10:23:10Z","title":"Physics-Inspired Synthesized Underwater Image Dataset","summary":" This paper introduces the physics-inspired synthesized underwater image\ndataset (PHISWID), a dataset tailored for enhancing underwater image processing\nthrough physics-inspired image synthesis. Deep learning approaches to\nunderwater image enhancement typically demand extensive datasets, yet acquiring\npaired clean and degraded underwater ones poses significant challenges. While\nseveral underwater image datasets have been proposed using physics-based\nsynthesis, a publicly accessible collection has been lacking. Additionally,\nmost underwater image synthesis approaches do not intend to reproduce\natmospheric scenes, resulting in incomplete enhancement. PHISWID addresses this\ngap by offering a set of paired ground-truth (atmospheric) and synthetically\ndegraded underwater images, showcasing not only color degradation but also the\noften-neglected effects of marine snow, a composite of organic matter and sand\nparticles that considerably impairs underwater image clarity. The dataset\napplies these degradations to atmospheric RGB-D images, enhancing the dataset's\nrealism and applicability. PHISWID is particularly valuable for training deep\nneural networks in a supervised learning setting and for objectively assessing\nimage quality in benchmark analyses. Our results reveal that even a basic U-Net\narchitecture, when trained with PHISWID, substantially outperforms existing\nmethods in underwater image enhancement. We intend to release PHISWID publicly,\ncontributing a significant resource to the advancement of underwater imaging\ntechnology.\n","authors":["Reina Kaneko","Hiroshi Higashi","Yuichi Tanaka"],"pdf_url":"https://arxiv.org/pdf/2404.03998v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01734v2","updated":"2024-04-05T10:11:27Z","published":"2023-11-03T06:05:36Z","title":"Sculpting Holistic 3D Representation in Contrastive Language-Image-3D\n Pre-training","summary":" Contrastive learning has emerged as a promising paradigm for 3D open-world\nunderstanding, i.e., aligning point cloud representation to image and text\nembedding space individually. In this paper, we introduce MixCon3D, a simple\nyet effective method aiming to sculpt holistic 3D representation in contrastive\nlanguage-image-3D pre-training. In contrast to point cloud only, we develop the\n3D object-level representation from complementary perspectives, e.g.,\nmulti-view rendered images with the point cloud. Then, MixCon3D performs\nlanguage-3D contrastive learning, comprehensively depicting real-world 3D\nobjects and bolstering text alignment. 
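PHISWID above synthesizes degraded underwater images from atmospheric RGB-D pairs. The sketch below applies the common attenuation-plus-backscatter image formation model to an RGB-D image as an illustration of physics-based synthesis; the per-channel coefficients and water color are made-up values, and the marine-snow component described in the abstract is not modeled here.

import numpy as np

def synthesize_underwater(rgb, depth_m, beta=(0.45, 0.12, 0.08), water_rgb=(0.05, 0.35, 0.45)):
    # rgb: float image in [0, 1], HxWx3; depth_m: HxW metric depth.
    # I_c = J_c * exp(-beta_c * d) + B_c * (1 - exp(-beta_c * d))  (direct signal + backscatter).
    # Red (large beta) is absorbed fastest, producing the typical blue-green cast.
    beta = np.asarray(beta, dtype=np.float32)
    B = np.asarray(water_rgb, dtype=np.float32)
    t = np.exp(-beta[None, None, :] * depth_m[:, :, None])  # per-channel transmission
    return np.clip(rgb * t + B * (1.0 - t), 0.0, 1.0)

# Toy usage: a gray scene whose depth increases from 1 m to 10 m left to right.
h, w = 120, 160
rgb = np.full((h, w, 3), 0.7, dtype=np.float32)
depth = np.tile(np.linspace(1.0, 10.0, w, dtype=np.float32), (h, 1))
under = synthesize_underwater(rgb, depth)
print(under[:, 0].mean(axis=0), under[:, -1].mean(axis=0))  # near vs far pixel colors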
Additionally, we pioneer the first\nthorough investigation of various training recipes for the 3D contrastive\nlearning paradigm, building a solid baseline with improved performance.\nExtensive experiments conducted on three representative benchmarks reveal that\nour method significantly improves over the baseline, surpassing the previous\nstate-of-the-art performance on the challenging 1,156-category Objaverse-LVIS\ndataset by 5.7%. The versatility of MixCon3D is showcased in applications such\nas text-to-3D retrieval and point cloud captioning, further evidencing its\nefficacy in diverse scenarios. The code is available at\nhttps://github.com/UCSC-VLAA/MixCon3D.\n","authors":["Yipeng Gao","Zeyu Wang","Wei-Shi Zheng","Cihang Xie","Yuyin Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.01734v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2310.17170v3","updated":"2024-04-05T10:07:24Z","published":"2023-10-26T05:49:44Z","title":"MO-YOLO: End-to-End Multiple-Object Tracking Method with YOLO and\n Decoder","summary":" In the field of multi-object tracking (MOT), recent Transformer based\nend-to-end models like MOTR have demonstrated exceptional performance on\ndatasets such as DanceTracker. However, the computational demands of these\nmodels present challenges in training and deployment. Drawing inspiration from\nsuccessful models like GPT, we present MO-YOLO, an efficient and\ncomputationally frugal end-to-end MOT model. MO-YOLO integrates principles from\nYou Only Look Once (YOLO) and RT-DETR, adopting a decoder-only approach. By\nleveraging the decoder from RT-DETR and architectural components from YOLOv8,\nMO-YOLO achieves high speed, shorter training times, and proficient MOT\nperformance. On the Dancetrack, MO-YOLO not only matches MOTR's performance but\nalso surpasses it, achieving over twice the frames per second (MOTR 9.5 FPS,\nMO-YOLO 19.6 FPS). Furthermore, MO-YOLO demonstrates significantly reduced\ntraining times and lower hardware requirements compared to MOTR. This research\nintroduces a promising paradigm for efficient end-to-end MOT, emphasizing\nenhanced performance and resource efficiency.\n","authors":["Liao Pan","Yang Feng","Wu Di","Liu Bo","Zhang Xingle"],"pdf_url":"https://arxiv.org/pdf/2310.17170v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03992v1","updated":"2024-04-05T10:02:32Z","published":"2024-04-05T10:02:32Z","title":"Rolling the dice for better deep learning performance: A study of\n randomness techniques in deep neural networks","summary":" This paper investigates how various randomization techniques impact Deep\nNeural Networks (DNNs). Randomization, like weight noise and dropout, aids in\nreducing overfitting and enhancing generalization, but their interactions are\npoorly understood. The study categorizes randomness techniques into four types\nand proposes new methods: adding noise to the loss function and random masking\nof gradient updates. Using Particle Swarm Optimizer (PSO) for hyperparameter\noptimization, it explores optimal configurations across MNIST, FASHION-MNIST,\nCIFAR10, and CIFAR100 datasets. Over 30,000 configurations are evaluated,\nrevealing data augmentation and weight initialization randomness as main\nperformance contributors. Correlation analysis shows different optimizers\nprefer distinct randomization types. 
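One of the new randomness techniques proposed in the entry above is random masking of gradient updates. A hedged PyTorch sketch of that idea applied after the backward pass; the keep probability, toy model, and placement of the mask are illustrative choices rather than the paper's exact protocol.

import torch
import torch.nn as nn

def mask_gradients(model, keep_prob=0.9):
    # Zero out each gradient entry independently with probability 1 - keep_prob.
    with torch.no_grad():
        for p in model.parameters():
            if p.grad is not None:
                mask = torch.bernoulli(torch.full_like(p.grad, keep_prob))
                p.grad.mul_(mask)

# Toy usage: one SGD step on random data with masked gradients.
torch.manual_seed(0)
model = nn.Sequential(nn.Linear(20, 64), nn.ReLU(), nn.Linear(64, 10))
opt = torch.optim.SGD(model.parameters(), lr=0.1)
x, y = torch.randn(32, 20), torch.randint(0, 10, (32,))
loss = nn.functional.cross_entropy(model(x), y)
opt.zero_grad()
loss.backward()
mask_gradients(model, keep_prob=0.9)  # randomness injected into the update itself
opt.step()
print(float(loss))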
The complete implementation and dataset\nare available on GitHub.\n","authors":["Mohammed Ghaith Altarabichi","Sławomir Nowaczyk","Sepideh Pashami","Peyman Sheikholharam Mashhadi","Julia Handl"],"pdf_url":"https://arxiv.org/pdf/2404.03992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03991v1","updated":"2024-04-05T10:01:31Z","published":"2024-04-05T10:01:31Z","title":"Towards Efficient and Accurate CT Segmentation via Edge-Preserving\n Probabilistic Downsampling","summary":" Downsampling images and labels, often necessitated by limited resources or to\nexpedite network training, leads to the loss of small objects and thin\nboundaries. This undermines the segmentation network's capacity to interpret\nimages accurately and predict detailed labels, resulting in diminished\nperformance compared to processing at original resolutions. This situation\nexemplifies the trade-off between efficiency and accuracy, with higher\ndownsampling factors further impairing segmentation outcomes. Preserving\ninformation during downsampling is especially critical for medical image\nsegmentation tasks. To tackle this challenge, we introduce a novel method named\nEdge-preserving Probabilistic Downsampling (EPD). It utilizes class uncertainty\nwithin a local window to produce soft labels, with the window size dictating\nthe downsampling factor. This enables a network to produce quality predictions\nat low resolutions. Beyond preserving edge details more effectively than\nconventional nearest-neighbor downsampling, employing a similar algorithm for\nimages, it surpasses bilinear interpolation in image downsampling, enhancing\noverall performance. Our method significantly improved Intersection over Union\n(IoU) to 2.85%, 8.65%, and 11.89% when downsampling data to 1/2, 1/4, and 1/8,\nrespectively, compared to conventional interpolation methods.\n","authors":["Shahzad Ali","Yu Rim Lee","Soo Young Park","Won Young Tak","Soon Ki Jung"],"pdf_url":"https://arxiv.org/pdf/2404.03991v1.pdf","comment":"5 pages (4 figures, 1 table); This work has been submitted to the\n IEEE Signal Processing Letters. Copyright may be transferred without notice,\n after which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2403.19655v2","updated":"2024-04-05T09:35:37Z","published":"2024-03-28T17:59:50Z","title":"GaussianCube: Structuring Gaussian Splatting using Optimal Transport for\n 3D Generative Modeling","summary":" 3D Gaussian Splatting (GS) have achieved considerable improvement over Neural\nRadiance Fields in terms of 3D fitting fidelity and rendering speed. However,\nthis unstructured representation with scattered Gaussians poses a significant\nchallenge for generative modeling. To address the problem, we introduce\nGaussianCube, a structured GS representation that is both powerful and\nefficient for generative modeling. We achieve this by first proposing a\nmodified densification-constrained GS fitting algorithm which can yield\nhigh-quality fitting results using a fixed number of free Gaussians, and then\nre-arranging the Gaussians into a predefined voxel grid via Optimal Transport.\nThe structured grid representation allows us to use standard 3D U-Net as our\nbackbone in diffusion generative modeling without elaborate designs. 
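EPD above turns class uncertainty within a local window into soft labels, with the window size equal to the downsampling factor. A numpy sketch of one plausible reading of that idea, using per-window class frequencies as the soft labels; this frequency-based definition is an assumption, not necessarily the exact EPD formulation.

import numpy as np

def soft_downsample_labels(labels, num_classes, factor):
    # labels: HxW integer map; returns (H//factor, W//factor, num_classes) soft labels
    # where each vector holds the class proportions inside the corresponding window.
    h, w = labels.shape
    h2, w2 = h // factor, w // factor
    windows = labels[: h2 * factor, : w2 * factor].reshape(h2, factor, w2, factor)
    onehot = np.eye(num_classes, dtype=np.float32)[windows]  # h2 x f x w2 x f x C
    return onehot.mean(axis=(1, 3))                          # class frequency per window

# Toy usage: a 16x16 map with a thin 1-pixel-wide structure that nearest-neighbor
# downsampling by 4 would mostly erase, but that survives here as fractional mass.
labels = np.zeros((16, 16), dtype=np.int64)
labels[:, 7] = 1
soft = soft_downsample_labels(labels, num_classes=2, factor=4)
print(soft[:, 1])  # windows containing the thin structure keep ~25% of class-1 mass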
Extensive\nexperiments conducted on ShapeNet and OmniObject3D show that our model achieves\nstate-of-the-art generation results both qualitatively and quantitatively,\nunderscoring the potential of GaussianCube as a powerful and versatile 3D\nrepresentation.\n","authors":["Bowen Zhang","Yiji Cheng","Jiaolong Yang","Chunyu Wang","Feng Zhao","Yansong Tang","Dong Chen","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2403.19655v2.pdf","comment":"Fix typo in Eq.2; Project Page: https://gaussiancube.github.io/"},{"id":"http://arxiv.org/abs/2402.12891v2","updated":"2024-04-05T09:26:07Z","published":"2024-02-20T10:35:51Z","title":"Mind the Exit Pupil Gap: Revisiting the Intrinsics of a Standard\n Plenoptic Camera","summary":" Among the common applications of plenoptic cameras are depth reconstruction\nand post-shot refocusing. These require a calibration relating the camera-side\nlight field to that of the scene. Numerous methods with this goal have been\ndeveloped based on thin lens models for the plenoptic camera's main lens and\nmicrolenses. Our work addresses the often-overlooked role of the main lens exit\npupil in these models and specifically in the decoding process of standard\nplenoptic camera (SPC) images. We formally deduce the connection between the\nrefocusing distance and the resampling parameter for the decoded light field\nand provide an analysis of the errors that arise when the exit pupil is not\nconsidered. In addition, previous work is revisited with respect to the exit\npupil's role and all theoretical results are validated through a\nray-tracing-based simulation. With the public release of the evaluated SPC\ndesigns alongside our simulation and experimental data we aim to contribute to\na more accurate and nuanced understanding of plenoptic camera optics.\n","authors":["Tim Michels","Daniel Mäckelmann","Reinhard Koch"],"pdf_url":"https://arxiv.org/pdf/2402.12891v2.pdf","comment":"29 pages, 16 figures, Accepted for publication in MDPI Sensors,\n Special Issue 'Short-Range Optical 3D Scanning and 3D Data Processing '"},{"id":"http://arxiv.org/abs/2111.05778v2","updated":"2024-04-05T09:19:41Z","published":"2021-11-10T16:31:27Z","title":"Theoretical and Empirical Analysis of a Fast Algorithm for Extracting\n Polygons from Signed Distance Bounds","summary":" Recently there has been renewed interest in signed distance bound\nrepresentations due to their unique properties for 3D shape modelling. This is\nespecially the case for deep learning-based bounds. However, it is beneficial\nto work with polygons in most computer-graphics applications. Thus, in this\npaper we introduce and investigate an asymptotically fast method for\ntransforming signed distance bounds into polygon meshes. This is achieved by\ncombining the principles of sphere tracing (or ray marching) with traditional\npolygonization techniques, such as Marching Cubes. We provide theoretical and\nexperimental evidence that this approach is of the $O(N^2\\log N)$ computational\ncomplexity for a polygonization grid with $N^3$ cells. 
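The polygonization method above combines sphere tracing of a signed distance bound with Marching Cubes. A minimal sketch of the sphere-tracing inner loop, which advances along a ray by the bound's value until the surface is hit; the sphere SDF, tolerances, and step limit are illustrative, and the grid traversal behind the O(N^2 log N) bound is not reproduced.

import numpy as np

def sphere_sdf(p, center=np.array([0.0, 0.0, 3.0]), radius=1.0):
    return np.linalg.norm(p - center) - radius

def sphere_trace(origin, direction, sdf, t_max=20.0, eps=1e-4, max_steps=128):
    # March along origin + t * direction, stepping by the signed distance bound each time.
    direction = direction / np.linalg.norm(direction)
    t = 0.0
    for _ in range(max_steps):
        d = sdf(origin + t * direction)
        if d < eps:
            return t      # hit: distance along the ray to the surface
        t += d            # the bound guarantees this step cannot overshoot the surface
        if t > t_max:
            break
    return None           # miss

# Toy usage: a ray from the origin along +z hits the unit sphere centered at z = 3 at t ~ 2.
hit = sphere_trace(np.zeros(3), np.array([0.0, 0.0, 1.0]), sphere_sdf)
print(hit)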
The algorithm is tested\non both a set of primitive shapes as well as signed distance bounds generated\nfrom point clouds by machine learning (and represented as neural networks).\nGiven its speed, implementation simplicity and portability, we argue that it\ncould prove useful during the modelling stage as well as in shape compression\nfor storage.\n The code is available here: https://github.com/nenadmarkus/gridhopping\n","authors":["Nenad Markuš","Mirko Sužnjević"],"pdf_url":"https://arxiv.org/pdf/2111.05778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04868v3","updated":"2024-04-05T09:03:48Z","published":"2023-08-09T11:02:00Z","title":"InstantAvatar: Efficient 3D Head Reconstruction via Surface Rendering","summary":" Recent advances in full-head reconstruction have been obtained by optimizing\na neural field through differentiable surface or volume rendering to represent\na single scene. While these techniques achieve an unprecedented accuracy, they\ntake several minutes, or even hours, due to the expensive optimization process\nrequired. In this work, we introduce InstantAvatar, a method that recovers\nfull-head avatars from few images (down to just one) in a few seconds on\ncommodity hardware. In order to speed up the reconstruction process, we propose\na system that combines, for the first time, a voxel-grid neural field\nrepresentation with a surface renderer. Notably, a naive combination of these\ntwo techniques leads to unstable optimizations that do not converge to valid\nsolutions. In order to overcome this limitation, we present a novel statistical\nmodel that learns a prior distribution over 3D head signed distance functions\nusing a voxel-grid based architecture. The use of this prior model, in\ncombination with other design choices, results into a system that achieves 3D\nhead reconstructions with comparable accuracy as the state-of-the-art with a\n100x speed-up.\n","authors":["Antonio Canela","Pol Caselles","Ibrar Malik","Eduard Ramon","Jaime García","Jordi Sánchez-Riera","Gil Triginer","Francesc Moreno-Noguer"],"pdf_url":"https://arxiv.org/pdf/2308.04868v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.00434v2","updated":"2024-04-05T08:53:28Z","published":"2023-04-30T09:28:38Z","title":"EVREAL: Towards a Comprehensive Benchmark and Analysis Suite for\n Event-based Video Reconstruction","summary":" Event cameras are a new type of vision sensor that incorporates asynchronous\nand independent pixels, offering advantages over traditional frame-based\ncameras such as high dynamic range and minimal motion blur. However, their\noutput is not easily understandable by humans, making the reconstruction of\nintensity images from event streams a fundamental task in event-based vision.\nWhile recent deep learning-based methods have shown promise in video\nreconstruction from events, this problem is not completely solved yet. To\nfacilitate comparison between different approaches, standardized evaluation\nprotocols and diverse test datasets are essential. This paper proposes a\nunified evaluation methodology and introduces an open-source framework called\nEVREAL to comprehensively benchmark and analyze various event-based video\nreconstruction methods from the literature. 
Using EVREAL, we give a detailed\nanalysis of the state-of-the-art methods for event-based video reconstruction,\nand provide valuable insights into the performance of these methods under\nvarying settings, challenging scenarios, and downstream tasks.\n","authors":["Burak Ercan","Onur Eker","Aykut Erdem","Erkut Erdem"],"pdf_url":"https://arxiv.org/pdf/2305.00434v2.pdf","comment":"19 pages, 9 figures. Has been accepted for publication at the IEEE\n Conference on Computer Vision and Pattern Recognition Workshops (CVPRW),\n Vancouver, 2023. The project page can be found at\n https://ercanburak.github.io/evreal.html"},{"id":"http://arxiv.org/abs/2404.03962v1","updated":"2024-04-05T08:52:32Z","published":"2024-04-05T08:52:32Z","title":"RaSim: A Range-aware High-fidelity RGB-D Data Simulation Pipeline for\n Real-world Applications","summary":" In robotic vision, a de-facto paradigm is to learn in simulated environments\nand then transfer to real-world applications, which poses an essential\nchallenge in bridging the sim-to-real domain gap. While mainstream works tackle\nthis problem in the RGB domain, we focus on depth data synthesis and develop a\nrange-aware RGB-D data simulation pipeline (RaSim). In particular,\nhigh-fidelity depth data is generated by imitating the imaging principle of\nreal-world sensors. A range-aware rendering strategy is further introduced to\nenrich data diversity. Extensive experiments show that models trained with\nRaSim can be directly applied to real-world scenarios without any finetuning\nand excel at downstream RGB-D perception tasks.\n","authors":["Xingyu Liu","Chenyangguang Zhang","Gu Wang","Ruida Zhang","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2404.03962v1.pdf","comment":"accepted by ICRA'24"},{"id":"http://arxiv.org/abs/2403.01300v2","updated":"2024-04-05T08:42:02Z","published":"2024-03-02T19:54:53Z","title":"Causal Mode Multiplexer: A Novel Framework for Unbiased Multispectral\n Pedestrian Detection","summary":" RGBT multispectral pedestrian detection has emerged as a promising solution\nfor safety-critical applications that require day/night operations. However,\nthe modality bias problem remains unsolved as multispectral pedestrian\ndetectors learn the statistical bias in datasets. Specifically, datasets in\nmultispectral pedestrian detection mainly distribute between ROTO (day) and\nRXTO (night) data; the majority of the pedestrian labels statistically co-occur\nwith their thermal features. As a result, multispectral pedestrian detectors\nshow poor generalization ability on examples beyond this statistical\ncorrelation, such as ROTX data. To address this problem, we propose a novel\nCausal Mode Multiplexer (CMM) framework that effectively learns the causalities\nbetween multispectral inputs and predictions. Moreover, we construct a new\ndataset (ROTX-MP) to evaluate modality bias in multispectral pedestrian\ndetection. ROTX-MP mainly includes ROTX examples not presented in previous\ndatasets. Extensive experiments demonstrate that our proposed CMM framework\ngeneralizes well on existing datasets (KAIST, CVC-14, FLIR) and the new\nROTX-MP. 
We will release our new dataset to the public for future research.\n","authors":["Taeheon Kim","Sebin Shin","Youngjoon Yu","Hak Gu Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2403.01300v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2401.01598v2","updated":"2024-04-05T08:23:29Z","published":"2024-01-03T07:59:17Z","title":"Learning Prompt with Distribution-Based Feature Replay for Few-Shot\n Class-Incremental Learning","summary":" Few-shot Class-Incremental Learning (FSCIL) aims to continuously learn new\nclasses based on very limited training data without forgetting the old ones\nencountered. Existing studies solely relied on pure visual networks, while in\nthis paper we solved FSCIL by leveraging the Vision-Language model (e.g., CLIP)\nand propose a simple yet effective framework, named Learning Prompt with\nDistribution-based Feature Replay (LP-DiF). We observe that simply using CLIP\nfor zero-shot evaluation can substantially outperform the most influential\nmethods. Then, prompt tuning technique is involved to further improve its\nadaptation ability, allowing the model to continually capture specific\nknowledge from each session. To prevent the learnable prompt from forgetting\nold knowledge in the new session, we propose a pseudo-feature replay approach.\nSpecifically, we preserve the old knowledge of each class by maintaining a\nfeature-level Gaussian distribution with a diagonal covariance matrix, which is\nestimated by the image features of training images and synthesized features\ngenerated from a VAE. When progressing to a new session, pseudo-features are\nsampled from old-class distributions combined with training images of the\ncurrent session to optimize the prompt, thus enabling the model to learn new\nknowledge while retaining old knowledge. Experiments on three prevalent\nbenchmarks, i.e., CIFAR100, mini-ImageNet, CUB-200, and two more challenging\nbenchmarks, i.e., SUN-397 and CUB-200$^*$ proposed in this paper showcase the\nsuperiority of LP-DiF, achieving new state-of-the-art (SOTA) in FSCIL. Code is\npublicly available at https://github.com/1170300714/LP-DiF.\n","authors":["Zitong Huang","Ze Chen","Zhixing Chen","Erjin Zhou","Xinxing Xu","Rick Siow Mong Goh","Yong Liu","Wangmeng Zuo","Chunmei Feng"],"pdf_url":"https://arxiv.org/pdf/2401.01598v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10835v4","updated":"2024-04-05T08:20:32Z","published":"2023-12-17T22:40:38Z","title":"Your Student is Better Than Expected: Adaptive Teacher-Student\n Collaboration for Text-Conditional Diffusion Models","summary":" Knowledge distillation methods have recently shown to be a promising\ndirection to speedup the synthesis of large-scale diffusion models by requiring\nonly a few inference steps. While several powerful distillation methods were\nrecently proposed, the overall quality of student samples is typically lower\ncompared to the teacher ones, which hinders their practical usage. In this\nwork, we investigate the relative quality of samples produced by the teacher\ntext-to-image diffusion model and its distilled student version. As our main\nempirical finding, we discover that a noticeable portion of student samples\nexhibit superior fidelity compared to the teacher ones, despite the\n\"approximate\" nature of the student. Based on this finding, we propose an\nadaptive collaboration between student and teacher diffusion models for\neffective text-to-image synthesis. 
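LP-DiF above preserves each old class as a feature-level Gaussian with diagonal covariance and replays sampled pseudo-features in later sessions. A numpy sketch of the estimate-then-sample part; the feature dimension and toy statistics are assumptions, and the VAE-synthesized features and the prompt-tuning loop are omitted.

import numpy as np

class ClassFeatureMemory:
    # Per-class diagonal Gaussian over image features, used to replay old knowledge.
    def __init__(self):
        self.stats = {}  # class_id -> (mean, std)

    def update(self, class_id, feats):
        # feats: N x D image features of one class from the current session.
        self.stats[class_id] = (feats.mean(axis=0), feats.std(axis=0) + 1e-6)

    def sample(self, class_id, n, rng):
        mean, std = self.stats[class_id]
        return rng.normal(loc=mean, scale=std, size=(n, mean.shape[0]))

# Toy usage: memorize two old classes, then draw pseudo-features for replay.
rng = np.random.default_rng(0)
mem = ClassFeatureMemory()
mem.update(0, rng.normal(0.0, 1.0, size=(50, 512)))
mem.update(1, rng.normal(2.0, 0.5, size=(50, 512)))
replay = np.concatenate([mem.sample(c, 16, rng) for c in (0, 1)], axis=0)
print(replay.shape)  # (32, 512) pseudo-features to mix with the new session's real data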
Specifically, the distilled model produces\nthe initial sample, and then an oracle decides whether it needs further\nimprovements with a slow teacher model. Extensive experiments demonstrate that\nthe designed pipeline surpasses state-of-the-art text-to-image alternatives for\nvarious inference budgets in terms of human preference. Furthermore, the\nproposed approach can be naturally used in popular applications such as\ntext-guided image editing and controllable generation.\n","authors":["Nikita Starodubcev","Artem Fedorov","Artem Babenko","Dmitry Baranchuk"],"pdf_url":"https://arxiv.org/pdf/2312.10835v4.pdf","comment":"CVPR2024 camera ready v2"},{"id":"http://arxiv.org/abs/2404.03936v1","updated":"2024-04-05T07:44:17Z","published":"2024-04-05T07:44:17Z","title":"Deep Learning for Satellite Image Time Series Analysis: A Review","summary":" Earth observation (EO) satellite missions have been providing detailed images\nabout the state of the Earth and its land cover for over 50 years. Long-term\nmissions, such as NASA's Landsat, Terra, and Aqua satellites, and more\nrecently, the ESA's Sentinel missions, record images of the entire world every\nfew days. Although single images provide point-in-time data, repeated images of\nthe same area, or satellite image time series (SITS), provide information about\nthe changing state of vegetation and land use. These SITS are useful for\nmodeling dynamic processes and seasonal changes such as plant phenology. They\nhave potential benefits for many aspects of land and natural resource\nmanagement, including applications in agricultural, forest, water, and disaster\nmanagement, urban planning, and mining. However, the resulting satellite image\ntime series (SITS) are complex, incorporating information from the temporal,\nspatial, and spectral dimensions. Therefore, deep learning methods are often\ndeployed as they can analyze these complex relationships. This review presents\na summary of the state-of-the-art methods of modelling environmental,\nagricultural, and other Earth observation variables from SITS data using deep\nlearning methods. We aim to provide a resource for remote sensing experts\ninterested in using deep learning techniques to enhance Earth observation\nmodels with temporal information.\n","authors":["Lynn Miller","Charlotte Pelletier","Geoffrey I. Webb"],"pdf_url":"https://arxiv.org/pdf/2404.03936v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2404.03930v1","updated":"2024-04-05T07:24:10Z","published":"2024-04-05T07:24:10Z","title":"Real-GDSR: Real-World Guided DSM Super-Resolution via Edge-Enhancing\n Residual Network","summary":" A low-resolution digital surface model (DSM) features distinctive attributes\nimpacted by noise, sensor limitations and data acquisition conditions, which\ncannot be replicated using simple interpolation methods like bicubic. As a result,\nsuper-resolution models trained on synthetic data do not perform\neffectively on real ones. Training a model on real low- and high-resolution DSM\npairs is also a challenge because of the lack of such data. On the other\nhand, other imaging modalities of the same scene can be used\nto enrich the information needed for large-scale super-resolution. 
In this\nwork, we introduce a novel methodology to address the intricacies of real-world\nDSM super-resolution, named REAL-GDSR, breaking down this ill-posed problem\ninto two steps. The first step involves the utilization of a residual local\nrefinement network. This strategic approach departs from conventional methods\nthat trained to directly predict height values instead of the differences\n(residuals) and utilize large receptive fields in their networks. The second\nstep introduces a diffusion-based technique that enhances the results on a\nglobal scale, with a primary focus on smoothing and edge preservation. Our\nexperiments underscore the effectiveness of the proposed method. We conduct a\ncomprehensive evaluation, comparing it to recent state-of-the-art techniques in\nthe domain of real-world DSM super-resolution (SR). Our approach consistently\noutperforms these existing methods, as evidenced through qualitative and\nquantitative assessments.\n","authors":["Daniel Panangian","Ksenia Bittner"],"pdf_url":"https://arxiv.org/pdf/2404.03930v1.pdf","comment":"Accepted for publication in the ISPRS Annals of Photogrammetry,\n Remote Sensing, and Spatial Information Sciences"},{"id":"http://arxiv.org/abs/2404.03925v1","updated":"2024-04-05T07:15:06Z","published":"2024-04-05T07:15:06Z","title":"LightOctree: Lightweight 3D Spatially-Coherent Indoor Lighting\n Estimation","summary":" We present a lightweight solution for estimating spatially-coherent indoor\nlighting from a single RGB image. Previous methods for estimating illumination\nusing volumetric representations have overlooked the sparse distribution of\nlight sources in space, necessitating substantial memory and computational\nresources for achieving high-quality results. We introduce a unified, voxel\noctree-based illumination estimation framework to produce 3D spatially-coherent\nlighting. Additionally, a differentiable voxel octree cone tracing rendering\nlayer is proposed to eliminate regular volumetric representation throughout the\nentire process and ensure the retention of features across different frequency\ndomains. This reduction significantly decreases spatial usage and required\nfloating-point operations without substantially compromising precision.\nExperimental results demonstrate that our approach achieves high-quality\ncoherent estimation with minimal cost compared to previous methods.\n","authors":["Xuecan Wang","Shibang Xiao","Xiaohui Liang"],"pdf_url":"https://arxiv.org/pdf/2404.03925v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03924v1","updated":"2024-04-05T07:13:28Z","published":"2024-04-05T07:13:28Z","title":"Learning Correlation Structures for Vision Transformers","summary":" We introduce a new attention mechanism, dubbed structural self-attention\n(StructSA), that leverages rich correlation patterns naturally emerging in\nkey-query interactions of attention. StructSA generates attention maps by\nrecognizing space-time structures of key-query correlations via convolution and\nuses them to dynamically aggregate local contexts of value features. This\neffectively leverages rich structural patterns in images and videos such as\nscene layouts, object motion, and inter-object relations. 
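The first REAL-GDSR step above trains a residual local refinement network that predicts differences on top of an interpolated DSM rather than absolute heights. A hedged PyTorch sketch of that residual formulation; the tiny convolutional net, scale factor, and L1 objective are placeholders, not the paper's architecture, and the diffusion-based second step is not shown.

import torch
import torch.nn as nn
import torch.nn.functional as F

class ResidualDSMRefiner(nn.Module):
    # Predicts a height correction for a bicubically upsampled low-resolution DSM.
    def __init__(self, scale=4):
        super().__init__()
        self.scale = scale
        self.net = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(),
            nn.Conv2d(32, 32, 3, padding=1), nn.ReLU(),
            nn.Conv2d(32, 1, 3, padding=1),
        )

    def forward(self, lr_dsm):
        up = F.interpolate(lr_dsm, scale_factor=self.scale, mode="bicubic", align_corners=False)
        return up + self.net(up)  # the network only has to learn the residual detail

# Toy usage: one training step against a synthetic high-resolution DSM.
model = ResidualDSMRefiner(scale=4)
lr = torch.randn(2, 1, 32, 32)
hr = F.interpolate(lr, scale_factor=4, mode="bicubic", align_corners=False) + 0.1 * torch.randn(2, 1, 128, 128)
loss = F.l1_loss(model(lr), hr)
loss.backward()
print(float(loss))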
Using StructSA as a\nmain building block, we develop the structural vision transformer (StructViT)\nand evaluate its effectiveness on both image and video classification tasks,\nachieving state-of-the-art results on ImageNet-1K, Kinetics-400,\nSomething-Something V1 & V2, Diving-48, and FineGym.\n","authors":["Manjin Kim","Paul Hongsuck Seo","Cordelia Schmid","Minsu Cho"],"pdf_url":"https://arxiv.org/pdf/2404.03924v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2401.02723v2","updated":"2024-04-05T07:12:16Z","published":"2024-01-05T09:36:42Z","title":"Predicting Traffic Flow with Federated Learning and Graph Neural with\n Asynchronous Computations Network","summary":" Real-time traffic flow prediction holds significant importance within the\ndomain of Intelligent Transportation Systems (ITS). The task of achieving a\nbalance between prediction precision and computational efficiency presents a\nsignificant challenge. In this article, we present a novel deep-learning method\ncalled Federated Learning and Asynchronous Graph Convolutional Network\n(FLAGCN). Our framework incorporates the principles of asynchronous graph\nconvolutional networks with federated learning to enhance the accuracy and\nefficiency of real-time traffic flow prediction. The FLAGCN model employs a\nspatial-temporal graph convolution technique to asynchronously address\nspatio-temporal dependencies within traffic data effectively. To efficiently\nhandle the computational requirements associated with this deep learning model,\nthis study used a graph federated learning technique known as GraphFL. This\napproach is designed to facilitate the training process. The experimental\nresults obtained from conducting tests on two distinct traffic datasets\ndemonstrate that the utilization of FLAGCN leads to the optimization of both\ntraining and inference durations while maintaining a high level of prediction\naccuracy. FLAGCN outperforms existing models with significant improvements by\nachieving up to approximately 6.85% reduction in RMSE, 20.45% reduction in\nMAPE, compared to the best-performing existing models.\n","authors":["Muhammad Yaqub","Shahzad Ahmad","Malik Abdul Manan","Imran Shabir Chuhan"],"pdf_url":"https://arxiv.org/pdf/2401.02723v2.pdf","comment":"I request to withdraw my paper from arXiv due to significant updates\n and improvements identified post-submission. These enhancements will\n substantially elevate the work's quality and impact. I plan to resubmit the\n revised paper upon completion of these updates. Thank you for accommodating\n this request"},{"id":"http://arxiv.org/abs/2402.19326v2","updated":"2024-04-05T06:56:08Z","published":"2024-02-29T16:29:53Z","title":"Generalizable Whole Slide Image Classification with Fine-Grained\n Visual-Semantic Interaction","summary":" Whole Slide Image (WSI) classification is often formulated as a Multiple\nInstance Learning (MIL) problem. Recently, Vision-Language Models (VLMs) have\ndemonstrated remarkable performance in WSI classification. However, existing\nmethods leverage coarse-grained pathogenetic descriptions for visual\nrepresentation supervision, which are insufficient to capture the complex\nvisual appearance of pathogenetic images, hindering the generalizability of\nmodels on diverse downstream tasks. Additionally, processing high-resolution\nWSIs can be computationally expensive. In this paper, we propose a novel\n\"Fine-grained Visual-Semantic Interaction\" (FiVE) framework for WSI\nclassification. 
It is designed to enhance the model's generalizability by\nleveraging the interaction between localized visual patterns and fine-grained\npathological semantics. Specifically, with meticulously designed queries, we\nstart by utilizing a large language model to extract fine-grained pathological\ndescriptions from various non-standardized raw reports. The output descriptions\nare then reconstructed into fine-grained labels used for training. By\nintroducing a Task-specific Fine-grained Semantics (TFS) module, we enable\nprompts to capture crucial visual information in WSIs, which enhances\nrepresentation learning and augments generalization capabilities significantly.\nFurthermore, given that pathological visual patterns are redundantly\ndistributed across tissue slices, we sample a subset of visual instances during\ntraining. Our method demonstrates robust generalizability and strong\ntransferability, dominantly outperforming the counterparts on the TCGA Lung\nCancer dataset with at least 9.19% higher accuracy in few-shot experiments. The\ncode is available at: https://github.com/ls1rius/WSI_FiVE.\n","authors":["Hao Li","Ying Chen","Yifei Chen","Wenxian Yang","Bowen Ding","Yuchen Han","Liansheng Wang","Rongshan Yu"],"pdf_url":"https://arxiv.org/pdf/2402.19326v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03913v1","updated":"2024-04-05T06:41:27Z","published":"2024-04-05T06:41:27Z","title":"Concept Weaver: Enabling Multi-Concept Fusion in Text-to-Image Models","summary":" While there has been significant progress in customizing text-to-image\ngeneration models, generating images that combine multiple personalized\nconcepts remains challenging. In this work, we introduce Concept Weaver, a\nmethod for composing customized text-to-image diffusion models at inference\ntime. Specifically, the method breaks the process into two steps: creating a\ntemplate image aligned with the semantics of input prompts, and then\npersonalizing the template using a concept fusion strategy. The fusion strategy\nincorporates the appearance of the target concepts into the template image\nwhile retaining its structural details. The results indicate that our method\ncan generate multiple custom concepts with higher identity fidelity compared to\nalternative approaches. Furthermore, the method is shown to seamlessly handle\nmore than two concepts and closely follow the semantic meaning of the input\nprompt without blending appearances across different subjects.\n","authors":["Gihyun Kwon","Simon Jenni","Dingzeyu Li","Joon-Young Lee","Jong Chul Ye","Fabian Caba Heilbron"],"pdf_url":"https://arxiv.org/pdf/2404.03913v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03906v1","updated":"2024-04-05T05:58:40Z","published":"2024-04-05T05:58:40Z","title":"Deep Phase Coded Image Prior","summary":" Phase-coded imaging is a computational imaging method designed to tackle\ntasks such as passive depth estimation and extended depth of field (EDOF) using\ndepth cues inserted during image capture. Most of the current deep\nlearning-based methods for depth estimation or all-in-focus imaging require a\ntraining dataset with high-quality depth maps and an optimal focus point at\ninfinity for all-in-focus images. Such datasets are difficult to create,\nusually synthetic, and require external graphic programs. 
We propose a new\nmethod named \"Deep Phase Coded Image Prior\" (DPCIP) for jointly recovering the\ndepth map and all-in-focus image from a coded-phase image using solely the\ncaptured image and the optical information of the imaging system. Our approach\ndoes not depend on any specific dataset and surpasses prior supervised\ntechniques utilizing the same imaging system. This improvement is achieved\nthrough the utilization of a problem formulation based on implicit neural\nrepresentation (INR) and deep image prior (DIP). Due to our zero-shot method,\nwe overcome the barrier of acquiring accurate ground-truth data of depth maps\nand all-in-focus images for each new phase-coded system introduced. This allows\nfocusing mainly on developing the imaging system, and not on ground-truth data\ncollection.\n","authors":["Nimrod Shabtay","Eli Schwartz","Raja Giryes"],"pdf_url":"https://arxiv.org/pdf/2404.03906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01439v2","updated":"2024-04-05T05:46:59Z","published":"2024-03-03T08:25:04Z","title":"Dynamic Adapter Meets Prompt Tuning: Parameter-Efficient Transfer\n Learning for Point Cloud Analysis","summary":" Point cloud analysis has achieved outstanding performance by transferring\npoint cloud pre-trained models. However, existing methods for model adaptation\nusually update all model parameters, i.e., full fine-tuning paradigm, which is\ninefficient as it relies on high computational costs (e.g., training GPU\nmemory) and massive storage space. In this paper, we aim to study\nparameter-efficient transfer learning for point cloud analysis with an ideal\ntrade-off between task performance and parameter efficiency. To achieve this\ngoal, we freeze the parameters of the default pre-trained models and then\npropose the Dynamic Adapter, which generates a dynamic scale for each token,\nconsidering the token significance to the downstream task. We further\nseamlessly integrate Dynamic Adapter with Prompt Tuning (DAPT) by constructing\nInternal Prompts, capturing the instance-specific features for interaction.\nExtensive experiments conducted on five challenging datasets demonstrate that\nthe proposed DAPT achieves superior performance compared to the full\nfine-tuning counterparts while significantly reducing the trainable parameters\nand training GPU memory by 95% and 35%, respectively. Code is available at\nhttps://github.com/LMD0311/DAPT.\n","authors":["Xin Zhou","Dingkang Liang","Wei Xu","Xingkui Zhu","Yihan Xu","Zhikang Zou","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2403.01439v2.pdf","comment":"Accepted to CVPR 2024. Code is available at\n https://github.com/LMD0311/DAPT"},{"id":"http://arxiv.org/abs/2404.03898v1","updated":"2024-04-05T05:42:23Z","published":"2024-04-05T05:42:23Z","title":"VoltaVision: A Transfer Learning model for electronic component\n classification","summary":" In this paper, we analyze the effectiveness of transfer learning on\nclassifying electronic components. Transfer learning reuses pre-trained models\nto save time and resources in building a robust classifier rather than learning\nfrom scratch. Our work introduces a lightweight CNN, coined as VoltaVision, and\ncompares its performance against more complex models. We test the hypothesis\nthat transferring knowledge from a similar task to our target domain yields\nbetter results than state-of-the-art models trained on general datasets. 
Our\ndataset and code for this work are available at\nhttps://github.com/AnasIshfaque/VoltaVision.\n","authors":["Anas Mohammad Ishfaqul Muktadir Osmani","Taimur Rahman","Salekul Islam"],"pdf_url":"https://arxiv.org/pdf/2404.03898v1.pdf","comment":"Tiny Paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2310.20550v3","updated":"2024-04-05T05:29:29Z","published":"2023-10-31T15:31:39Z","title":"CapsFusion: Rethinking Image-Text Data at Scale","summary":" Large multimodal models demonstrate remarkable generalist ability to perform\ndiverse multimodal tasks in a zero-shot manner. Large-scale web-based\nimage-text pairs contribute fundamentally to this success, but suffer from\nexcessive noise. Recent studies use alternative captions synthesized by\ncaptioning models and have achieved notable benchmark performance. However, our\nexperiments reveal significant Scalability Deficiency and World Knowledge Loss\nissues in models trained with synthetic captions, which have been largely\nobscured by their initial benchmark success. Upon closer examination, we\nidentify the root cause as the overly-simplified language structure and lack of\nknowledge details in existing synthetic captions. To provide higher-quality and\nmore scalable multimodal pretraining data, we propose CapsFusion, an advanced\nframework that leverages large language models to consolidate and refine\ninformation from both web-based image-text pairs and synthetic captions.\nExtensive experiments show that CapsFusion captions exhibit remarkable\nall-round superiority over existing captions in terms of model performance\n(e.g., 18.8 and 18.3 improvements in CIDEr score on COCO and NoCaps), sample\nefficiency (requiring 11-16 times less computation than baselines), world\nknowledge depth, and scalability. These effectiveness, efficiency and\nscalability advantages position CapsFusion as a promising candidate for future\nscaling of LMM training.\n","authors":["Qiying Yu","Quan Sun","Xiaosong Zhang","Yufeng Cui","Fan Zhang","Yue Cao","Xinlong Wang","Jingjing Liu"],"pdf_url":"https://arxiv.org/pdf/2310.20550v3.pdf","comment":"CVPR 2024. Code & Dataset: https://github.com/baaivision/CapsFusion"},{"id":"http://arxiv.org/abs/2404.03892v1","updated":"2024-04-05T05:00:21Z","published":"2024-04-05T05:00:21Z","title":"Enhancing Breast Cancer Diagnosis in Mammography: Evaluation and\n Integration of Convolutional Neural Networks and Explainable AI","summary":" The study introduces an integrated framework combining Convolutional Neural\nNetworks (CNNs) and Explainable Artificial Intelligence (XAI) for the enhanced\ndiagnosis of breast cancer using the CBIS-DDSM dataset. Utilizing a fine-tuned\nResNet50 architecture, our investigation not only provides effective\ndifferentiation of mammographic images into benign and malignant categories but\nalso addresses the opaque \"black-box\" nature of deep learning models by\nemploying XAI methodologies, namely Grad-CAM, LIME, and SHAP, to interpret CNN\ndecision-making processes for healthcare professionals. Our methodology\nencompasses an elaborate data preprocessing pipeline and advanced data\naugmentation techniques to counteract dataset limitations, and transfer\nlearning using pre-trained networks, such as VGG-16, DenseNet and ResNet was\nemployed. A focal point of our study is the evaluation of XAI's effectiveness\nin interpreting model predictions, highlighted by utilising the Hausdorff\nmeasure to assess the alignment between AI-generated explanations and expert\nannotations quantitatively. 
This approach plays a critical role for XAI in\npromoting trustworthiness and ethical fairness in AI-assisted diagnostics. The\nfindings from our research illustrate the effective collaboration between CNNs\nand XAI in advancing diagnostic methods for breast cancer, thereby facilitating\na more seamless integration of advanced AI technologies within clinical\nsettings. By enhancing the interpretability of AI-driven decisions, this work\nlays the groundwork for improved collaboration between AI systems and medical\npractitioners, ultimately enriching patient care. Furthermore, the implications\nof our research extend well beyond the current methodologies, advocating for\nsubsequent inquiries into the integration of multimodal data and the refinement\nof AI explanations to satisfy the needs of clinical practice.\n","authors":["Maryam Ahmed","Tooba Bibi","Rizwan Ahmed Khan","Sidra Nasir"],"pdf_url":"https://arxiv.org/pdf/2404.03892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03052v2","updated":"2024-04-05T04:33:23Z","published":"2023-12-05T18:58:37Z","title":"Visual Program Distillation: Distilling Tools and Programmatic Reasoning\n into Vision-Language Models","summary":" Solving complex visual tasks such as \"Who invented the musical instrument on\nthe right?\" involves a composition of skills: understanding space, recognizing\ninstruments, and also retrieving prior knowledge. Recent work shows promise by\ndecomposing such tasks using a large language model (LLM) into an executable\nprogram that invokes specialized vision models. However, generated programs are\nerror-prone: they omit necessary steps, include spurious ones, and are unable\nto recover when the specialized models give incorrect outputs. Moreover, they\nrequire loading multiple models, incurring high latency and computation costs.\nWe propose Visual Program Distillation (VPD), an instruction tuning framework\nthat produces a vision-language model (VLM) capable of solving complex visual\ntasks with a single forward pass. VPD distills the reasoning ability of LLMs by\nusing them to sample multiple candidate programs, which are then executed and\nverified to identify a correct one. It translates each correct program into a\nlanguage description of the reasoning steps, which are then distilled into a\nVLM. Extensive experiments show that VPD improves the VLM's ability to count,\nunderstand spatial relations, and reason compositionally. Our VPD-trained\nPaLI-X outperforms all prior VLMs, achieving state-of-the-art performance\nacross complex vision tasks, including MMBench, OK-VQA, A-OKVQA, TallyQA, POPE,\nand Hateful Memes. An evaluation with human annotators also confirms that VPD\nimproves model response factuality and consistency. 
Finally, experiments on\ncontent moderation demonstrate that VPD is also helpful for adaptation to\nreal-world applications with limited data.\n","authors":["Yushi Hu","Otilia Stretcu","Chun-Ta Lu","Krishnamurthy Viswanathan","Kenji Hata","Enming Luo","Ranjay Krishna","Ariel Fuxman"],"pdf_url":"https://arxiv.org/pdf/2312.03052v2.pdf","comment":"CVPR 2024 Oral"},{"id":"http://arxiv.org/abs/2404.03883v1","updated":"2024-04-05T04:11:31Z","published":"2024-04-05T04:11:31Z","title":"LiDAR-Guided Cross-Attention Fusion for Hyperspectral Band Selection and\n Image Classification","summary":" The fusion of hyperspectral and LiDAR data has been an active research topic.\nExisting fusion methods have ignored the high-dimensionality and redundancy\nchallenges in hyperspectral images, despite that band selection methods have\nbeen intensively studied for hyperspectral image (HSI) processing. This paper\naddresses this significant gap by introducing a cross-attention mechanism from\nthe transformer architecture for the selection of HSI bands guided by LiDAR\ndata. LiDAR provides high-resolution vertical structural information, which can\nbe useful in distinguishing different types of land cover that may have similar\nspectral signatures but different structural profiles. In our approach, the\nLiDAR data are used as the \"query\" to search and identify the \"key\" from the\nHSI to choose the most pertinent bands for LiDAR. This method ensures that the\nselected HSI bands drastically reduce redundancy and computational requirements\nwhile working optimally with the LiDAR data. Extensive experiments have been\nundertaken on three paired HSI and LiDAR data sets: Houston 2013, Trento and\nMUUFL. The results highlight the superiority of the cross-attention mechanism,\nunderlining the enhanced classification accuracy of the identified HSI bands\nwhen fused with the LiDAR features. The results also show that the use of fewer\nbands combined with LiDAR surpasses the performance of state-of-the-art fusion\nmodels.\n","authors":["Judy X Yang","Jun Zhou","Jing Wang","Hui Tian","Wee Chung Liew"],"pdf_url":"https://arxiv.org/pdf/2404.03883v1.pdf","comment":"15 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.03876v1","updated":"2024-04-05T03:51:19Z","published":"2024-04-05T03:51:19Z","title":"Increasing Fairness in Classification of Out of Distribution Data for\n Facial Recognition","summary":" Standard classification theory assumes that the distribution of images in the\ntest and training sets are identical. Unfortunately, real-life scenarios\ntypically feature unseen data (\"out-of-distribution data\") which is different\nfrom data in the training distribution(\"in-distribution\"). This issue is most\nprevalent in social justice problems where data from under-represented groups\nmay appear in the test data without representing an equal proportion of the\ntraining data. This may result in a model returning confidently wrong decisions\nand predictions. We are interested in the following question: Can the\nperformance of a neural network improve on facial images of out-of-distribution\ndata when it is trained simultaneously on multiple datasets of in-distribution\ndata? We approach this problem by incorporating the Outlier Exposure model and\ninvestigate how the model's performance changes when other datasets of facial\nimages were implemented. 
We observe that the accuracy and other metrics of the\nmodel can be increased by applying Outlier Exposure, incorporating a trainable\nweight parameter to increase the machine's emphasis on outlier images, and by\nre-weighting the importance of different class labels. We also experimented\nwith whether sorting the images and determining outliers via image features\nwould have more of an effect on the metrics than sorting by average pixel\nvalue. Our goal was to make models not only more accurate but also more fair by\nscanning a more expanded range of images. We also tested the datasets in\nreverse order to see whether a more fair dataset with balanced features has an\neffect on the model's accuracy.\n","authors":["Gianluca Barone","Aashrit Cunchala","Rudy Nunez"],"pdf_url":"https://arxiv.org/pdf/2404.03876v1.pdf","comment":"18 pages, 6 tables, 6 figures"},{"id":"http://arxiv.org/abs/2306.00003v3","updated":"2024-04-05T03:25:04Z","published":"2023-05-25T18:22:12Z","title":"Detecting Heart Disease from Multi-View Ultrasound Images via Supervised\n Attention Multiple Instance Learning","summary":" Aortic stenosis (AS) is a degenerative valve condition that causes\nsubstantial morbidity and mortality. This condition is under-diagnosed and\nunder-treated. In clinical practice, AS is diagnosed with expert review of\ntransthoracic echocardiography, which produces dozens of ultrasound images of\nthe heart. Only some of these views show the aortic valve. To automate\nscreening for AS, deep networks must learn to mimic a human expert's ability to\nidentify views of the aortic valve then aggregate across these relevant images\nto produce a study-level diagnosis. We find previous approaches to AS detection\nyield insufficient accuracy due to relying on inflexible averages across\nimages. We further find that off-the-shelf attention-based multiple instance\nlearning (MIL) performs poorly. We contribute a new end-to-end MIL approach\nwith two key methodological innovations. First, a supervised attention\ntechnique guides the learned attention mechanism to favor relevant views.\nSecond, a novel self-supervised pretraining strategy applies contrastive\nlearning on the representation of the whole study instead of individual images\nas commonly done in prior literature. Experiments on an open-access dataset and\nan external validation set show that our approach yields higher accuracy while\nreducing model size.\n","authors":["Zhe Huang","Benjamin S. Wessler","Michael C. Hughes"],"pdf_url":"https://arxiv.org/pdf/2306.00003v3.pdf","comment":"Echocardiogram; multiple-instance learning; self-supervised learning;\n semi-supervised learning; medical imaging"},{"id":"http://arxiv.org/abs/2404.01655v2","updated":"2024-04-05T03:15:11Z","published":"2024-04-02T05:56:17Z","title":"FashionEngine: Interactive Generation and Editing of 3D Clothed Humans","summary":" We present FashionEngine, an interactive 3D human generation and editing\nsystem that allows us to design 3D digital humans in a way that aligns with how\nhumans interact with the world, such as natural languages, visual perceptions,\nand hand-drawing. FashionEngine automates the 3D human production with three\nkey components: 1) A pre-trained 3D human diffusion model that learns to model\n3D humans in a semantic UV latent space from 2D image training data, which\nprovides strong priors for diverse generation and editing tasks. 
2)\nMultimodality-UV Space encoding the texture appearance, shape topology, and\ntextual semantics of human clothing in a canonical UV-aligned space, which\nfaithfully aligns the user multimodal inputs with the implicit UV latent space\nfor controllable 3D human editing. The multimodality-UV space is shared across\ndifferent user inputs, such as texts, images, and sketches, which enables\nvarious joint multimodal editing tasks. 3) Multimodality-UV Aligned Sampler\nlearns to sample high-quality and diverse 3D humans from the diffusion prior\nfor multimodal user inputs. Extensive experiments validate FashionEngine's\nstate-of-the-art performance for conditional generation/editing tasks. In\naddition, we present an interactive user interface for our FashionEngine that\nenables both conditional and unconditional generation tasks, and editing tasks\nincluding pose/view/shape control, text-, image-, and sketch-driven 3D human\nediting and 3D virtual try-on, in a unified framework. Our project page is at:\nhttps://taohuumd.github.io/projects/FashionEngine.\n","authors":["Tao Hu","Fangzhou Hong","Zhaoxi Chen","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2404.01655v2.pdf","comment":"Project Page: https://taohuumd.github.io/projects/FashionEngine"},{"id":"http://arxiv.org/abs/2403.12686v3","updated":"2024-04-05T02:34:01Z","published":"2024-03-19T12:45:18Z","title":"WaterVG: Waterway Visual Grounding based on Text-Guided Vision and\n mmWave Radar","summary":" The perception of waterways based on human intent is significant for\nautonomous navigation and operations of Unmanned Surface Vehicles (USVs) in\nwater environments. Inspired by visual grounding, we introduce WaterVG, the\nfirst visual grounding dataset designed for USV-based waterway perception based\non human prompts. WaterVG encompasses prompts describing multiple targets, with\nannotations at the instance level including bounding boxes and masks. Notably,\nWaterVG includes 11,568 samples with 34,987 referred targets, whose prompts\nintegrates both visual and radar characteristics. The pattern of text-guided\ntwo sensors equips a finer granularity of text prompts with visual and radar\nfeatures of referred targets. Moreover, we propose a low-power visual grounding\nmodel, Potamoi, which is a multi-task model with a well-designed Phased\nHeterogeneous Modality Fusion (PHMF) mode, including Adaptive Radar Weighting\n(ARW) and Multi-Head Slim Cross Attention (MHSCA). Exactly, ARW extracts\nrequired radar features to fuse with vision for prompt alignment. MHSCA is an\nefficient fusion module with a remarkably small parameter count and FLOPs,\nelegantly fusing scenario context captured by two sensors with linguistic\nfeatures, which performs expressively on visual grounding tasks. 
Comprehensive\nexperiments and evaluations have been conducted on WaterVG, where our Potamoi\narchives state-of-the-art performances compared with counterparts.\n","authors":["Runwei Guan","Liye Jia","Fengyufan Yang","Shanliang Yao","Erick Purwanto","Xiaohui Zhu","Eng Gee Lim","Jeremy Smith","Ka Lok Man","Xuming Hu","Yutao Yue"],"pdf_url":"https://arxiv.org/pdf/2403.12686v3.pdf","comment":"10 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.03854v1","updated":"2024-04-05T01:17:25Z","published":"2024-04-05T01:17:25Z","title":"Mitigating Heterogeneity in Federated Multimodal Learning with\n Biomedical Vision-Language Pre-training","summary":" Vision-language pre-training (VLP) has arised as an efficient scheme for\nmultimodal representation learning, but it requires large-scale multimodal data\nfor pre-training, making it an obstacle especially for biomedical applications.\nTo overcome the data limitation, federated learning (FL) can be a promising\nstrategy to scale up the dataset for biomedical VLP while protecting data\nprivacy. However, client data are often heterogeneous in real-world scenarios,\nand we observe that local training on heterogeneous client data would distort\nthe multimodal representation learning and lead to biased cross-modal\nalignment. To address this challenge, we propose Federated distributional\nRobust Guidance-Based (FedRGB) learning framework for federated VLP with\nrobustness to data heterogeneity. Specifically, we utilize a guidance-based\nlocal training scheme to reduce feature distortions, and employ a\ndistribution-based min-max optimization to learn unbiased cross-modal\nalignment. The experiments on real-world datasets show our method successfully\npromotes efficient federated multimodal learning for biomedical VLP with data\nheterogeneity.\n","authors":["Zitao Shuai","Liyue Shen"],"pdf_url":"https://arxiv.org/pdf/2404.03854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05061v2","updated":"2024-04-05T00:43:16Z","published":"2024-03-08T05:15:48Z","title":"RadarDistill: Boosting Radar-based Object Detection Performance via\n Knowledge Distillation from LiDAR Features","summary":" The inherent noisy and sparse characteristics of radar data pose challenges\nin finding effective representations for 3D object detection. In this paper, we\npropose RadarDistill, a novel knowledge distillation (KD) method, which can\nimprove the representation of radar data by leveraging LiDAR data. RadarDistill\nsuccessfully transfers desirable characteristics of LiDAR features into radar\nfeatures using three key components: Cross-Modality Alignment (CMA),\nActivation-based Feature Distillation (AFD), and Proposal-based Feature\nDistillation (PFD). CMA enhances the density of radar features by employing\nmultiple layers of dilation operations, effectively addressing the challenge of\ninefficient knowledge transfer from LiDAR to radar. AFD selectively transfers\nknowledge based on regions of the LiDAR features, with a specific focus on\nareas where activation intensity exceeds a predefined threshold. PFD similarly\nguides the radar network to selectively mimic features from the LiDAR network\nwithin the object proposals. Our comparative analyses conducted on the nuScenes\ndatasets demonstrate that RadarDistill achieves state-of-the-art (SOTA)\nperformance for radar-only object detection task, recording 20.5% in mAP and\n43.7% in NDS. 
Also, RadarDistill significantly improves the performance of the\ncamera-radar fusion model.\n","authors":["Geonho Bang","Kwangjin Choi","Jisong Kim","Dongsuk Kum","Jun Won Choi"],"pdf_url":"https://arxiv.org/pdf/2403.05061v2.pdf","comment":"Accepted to IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR) 2024, 10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.00498v2","updated":"2024-04-05T00:09:00Z","published":"2024-03-30T23:42:23Z","title":"94% on CIFAR-10 in 3.29 Seconds on a Single GPU","summary":" CIFAR-10 is among the most widely used datasets in machine learning,\nfacilitating thousands of research projects per year. To accelerate research\nand reduce the cost of experiments, we introduce training methods for CIFAR-10\nwhich reach 94% accuracy in 3.29 seconds, 95% in 10.4 seconds, and 96% in 46.3\nseconds, when run on a single NVIDIA A100 GPU. As one factor contributing to\nthese training speeds, we propose a derandomized variant of horizontal flipping\naugmentation, which we show improves over the standard method in every case\nwhere flipping is beneficial over no flipping at all. Our code is released at\nhttps://github.com/KellerJordan/cifar10-airbench.\n","authors":["Keller Jordan"],"pdf_url":"https://arxiv.org/pdf/2404.00498v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09555v2","updated":"2024-04-05T23:58:40Z","published":"2023-07-14T15:17:04Z","title":"Transient Neural Radiance Fields for Lidar View Synthesis and 3D\n Reconstruction","summary":" Neural radiance fields (NeRFs) have become a ubiquitous tool for modeling\nscene appearance and geometry from multiview imagery. Recent work has also\nbegun to explore how to use additional supervision from lidar or depth sensor\nmeasurements in the NeRF framework. However, previous lidar-supervised NeRFs\nfocus on rendering conventional camera imagery and use lidar-derived point\ncloud data as auxiliary supervision; thus, they fail to incorporate the\nunderlying image formation model of the lidar. Here, we propose a novel method\nfor rendering transient NeRFs that take as input the raw, time-resolved photon\ncount histograms measured by a single-photon lidar system, and we seek to\nrender such histograms from novel views. Different from conventional NeRFs, the\napproach relies on a time-resolved version of the volume rendering equation to\nrender the lidar measurements and capture transient light transport phenomena\nat picosecond timescales. We evaluate our method on a first-of-its-kind dataset\nof simulated and captured transient multiview scans from a prototype\nsingle-photon lidar. Overall, our work brings NeRFs to a new dimension of\nimaging at transient timescales, newly enabling rendering of transient imagery\nfrom novel views. Additionally, we show that our approach recovers improved\ngeometry and conventional appearance compared to point cloud-based supervision\nwhen training on few input viewpoints. Transient NeRFs may be especially useful\nfor applications which seek to simulate raw lidar measurements for downstream\ntasks in autonomous driving, robotics, and remote sensing.\n","authors":["Anagh Malik","Parsa Mirdehghan","Sotiris Nousias","Kiriakos N. Kutulakos","David B. 
Lindell"],"pdf_url":"https://arxiv.org/pdf/2307.09555v2.pdf","comment":"NeurIPS 2023, Project Page: https://anaghmalik.com/TransientNeRF/"},{"id":"http://arxiv.org/abs/2404.04452v1","updated":"2024-04-05T23:38:57Z","published":"2024-04-05T23:38:57Z","title":"Vision Transformers in Domain Adaptation and Generalization: A Study of\n Robustness","summary":" Deep learning models are often evaluated in scenarios where the data\ndistribution is different from those used in the training and validation\nphases. The discrepancy presents a challenge for accurately predicting the\nperformance of models once deployed on the target distribution. Domain\nadaptation and generalization are widely recognized as effective strategies for\naddressing such shifts, thereby ensuring reliable performance. The recent\npromising results in applying vision transformers in computer vision tasks,\ncoupled with advancements in self-attention mechanisms, have demonstrated their\nsignificant potential for robustness and generalization in handling\ndistribution shifts. Motivated by the increased interest from the research\ncommunity, our paper investigates the deployment of vision transformers in\ndomain adaptation and domain generalization scenarios. For domain adaptation\nmethods, we categorize research into feature-level, instance-level, model-level\nadaptations, and hybrid approaches, along with other categorizations with\nrespect to diverse strategies for enhancing domain adaptation. Similarly, for\ndomain generalization, we categorize research into multi-domain learning,\nmeta-learning, regularization techniques, and data augmentation strategies. We\nfurther classify diverse strategies in research, underscoring the various\napproaches researchers have taken to address distribution shifts by integrating\nvision transformers. The inclusion of comprehensive tables summarizing these\ncategories is a distinct feature of our work, offering valuable insights for\nresearchers. These findings highlight the versatility of vision transformers in\nmanaging distribution shifts, crucial for real-world applications, especially\nin critical safety and decision-making scenarios.\n","authors":["Shadi Alijani","Jamil Fayyad","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2404.04452v1.pdf","comment":"28 pages, 5 figures, Preprint submitted to Elsevier"},{"id":"http://arxiv.org/abs/2404.04434v1","updated":"2024-04-05T22:21:49Z","published":"2024-04-05T22:21:49Z","title":"Robust Few-Shot Ensemble Learning with Focal Diversity-Based Pruning","summary":" This paper presents FusionShot, a focal diversity optimized few-shot ensemble\nlearning approach for boosting the robustness and generalization performance of\npre-trained few-shot models. The paper makes three original contributions.\nFirst, we explore the unique characteristics of few-shot learning to ensemble\nmultiple few-shot (FS) models by creating three alternative fusion channels.\nSecond, we introduce the concept of focal error diversity to learn the most\nefficient ensemble teaming strategy, rather than assuming that an ensemble of a\nlarger number of base models will outperform those sub-ensembles of smaller\nsize. We develop a focal-diversity ensemble pruning method to effectively prune\nout the candidate ensembles with low ensemble error diversity and recommend\ntop-$K$ FS ensembles with the highest focal error diversity. 
Finally, we\ncapture the complex non-linear patterns of ensemble few-shot predictions by\ndesigning the learn-to-combine algorithm, which can learn the diverse weight\nassignments for robust ensemble fusion over different member models. Extensive\nexperiments on representative few-shot benchmarks show that the top-K ensembles\nrecommended by FusionShot can outperform the representative SOTA few-shot\nmodels on novel tasks (different distributions and unknown at training), and\ncan prevail over existing few-shot learners in both cross-domain settings and\nadversarial settings. For reproducibility purposes, FusionShot trained models,\nresults, and code are made available at https://github.com/sftekin/fusionshot\n","authors":["Selim Furkan Tekin","Fatih Ilhan","Tiansheng Huang","Sihao Hu","Ka-Ho Chow","Margaret L. Loper","Ling Liu"],"pdf_url":"https://arxiv.org/pdf/2404.04434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01545v4","updated":"2024-04-05T22:17:46Z","published":"2023-10-02T18:41:23Z","title":"RF-ULM: Ultrasound Localization Microscopy Learned from Radio-Frequency\n Wavefronts","summary":" In Ultrasound Localization Microscopy (ULM), achieving high-resolution images\nrelies on the precise localization of contrast agent particles across a series\nof beamformed frames. However, our study uncovers an enormous potential: The\nprocess of delay-and-sum beamforming leads to an irreversible reduction of\nRadio-Frequency (RF) channel data, while its implications for localization\nremain largely unexplored. The rich contextual information embedded within RF\nwavefronts, including their hyperbolic shape and phase, offers great promise\nfor guiding Deep Neural Networks (DNNs) in challenging localization scenarios.\nTo fully exploit this data, we propose to directly localize scatterers in RF\nchannel data. Our approach involves a custom super-resolution DNN using learned\nfeature channel shuffling, non-maximum suppression, and a semi-global\nconvolutional block for reliable and accurate wavefront localization.\nAdditionally, we introduce a geometric point transformation that facilitates\nseamless mapping to the B-mode coordinate space. To understand the impact of\nbeamforming on ULM, we validate the effectiveness of our method by conducting\nan extensive comparison with State-Of-The-Art (SOTA) techniques. We present the\ninaugural in vivo results from a wavefront-localizing DNN, highlighting its\nreal-world practicality. Our findings show that RF-ULM bridges the domain shift\nbetween synthetic and real datasets, offering a considerable advantage in terms\nof precision and complexity. To enable the broader research community to\nbenefit from our findings, our code and the associated SOTA methods are made\navailable at https://github.com/hahnec/rf-ulm.\n","authors":["Christopher Hahne","Georges Chabouh","Arthur Chavignon","Olivier Couture","Raphael Sznitman"],"pdf_url":"https://arxiv.org/pdf/2310.01545v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04430v1","updated":"2024-04-05T22:07:25Z","published":"2024-04-05T22:07:25Z","title":"PhysPT: Physics-aware Pretrained Transformer for Estimating Human\n Dynamics from Monocular Videos","summary":" While current methods have shown promising progress on estimating 3D human\nmotion from monocular videos, their motion estimates are often physically\nunrealistic because they mainly consider kinematics. 
In this paper, we\nintroduce Physics-aware Pretrained Transformer (PhysPT), which improves\nkinematics-based motion estimates and infers motion forces. PhysPT exploits a\nTransformer encoder-decoder backbone to effectively learn human dynamics in a\nself-supervised manner. Moreover, it incorporates physics principles governing\nhuman motion. Specifically, we build a physics-based body representation and\ncontact force model. We leverage them to impose novel physics-inspired training\nlosses (i.e., force loss, contact loss, and Euler-Lagrange loss), enabling\nPhysPT to capture physical properties of the human body and the forces it\nexperiences. Experiments demonstrate that, once trained, PhysPT can be directly\napplied to kinematics-based estimates to significantly enhance their physical\nplausibility and generate favourable motion forces. Furthermore, we show that\nthese physically meaningful quantities translate into improved accuracy of an\nimportant downstream task: human action recognition.\n","authors":["Yufei Zhang","Jeffrey O. Kephart","Zijun Cui","Qiang Ji"],"pdf_url":"https://arxiv.org/pdf/2404.04430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00938v2","updated":"2024-04-05T21:55:47Z","published":"2024-04-01T05:50:56Z","title":"How Can Large Language Models Enable Better Socially Assistive\n Human-Robot Interaction: A Brief Survey","summary":" Socially assistive robots (SARs) have shown great success in providing\npersonalized cognitive-affective support for user populations with special\nneeds such as older adults, children with autism spectrum disorder (ASD), and\nindividuals with mental health challenges. The large body of work on SAR\ndemonstrates its potential to provide at-home support that complements\nclinic-based interventions delivered by mental health professionals, making\nthese interventions more effective and accessible. However, there are still\nseveral major technical challenges that hinder SAR-mediated interactions and\ninterventions from reaching human-level social intelligence and efficacy. With\nthe recent advances in large language models (LLMs), there is an increased\npotential for novel applications within the field of SAR that can significantly\nexpand the current capabilities of SARs. However, incorporating LLMs introduces\nnew risks and ethical concerns that have not yet been encountered, and must be\ncarefully be addressed to safely deploy these more advanced systems. In this\nwork, we aim to conduct a brief survey on the use of LLMs in SAR technologies,\nand discuss the potentials and risks of applying LLMs to the following three\nmajor technical challenges of SAR: 1) natural language dialog; 2) multimodal\nunderstanding; 3) LLMs as robot policies.\n","authors":["Zhonghao Shi","Ellen Landrum","Amy O' Connell","Mina Kian","Leticia Pinto-Alva","Kaleen Shrestha","Xiaoyuan Zhu","Maja J Matarić"],"pdf_url":"https://arxiv.org/pdf/2404.00938v2.pdf","comment":"2 pages, accepted to the Proceedings of the AAAI Symposium Series,\n 2024"},{"id":"http://arxiv.org/abs/2404.04421v1","updated":"2024-04-05T21:44:57Z","published":"2024-04-05T21:44:57Z","title":"PhysAvatar: Learning the Physics of Dressed 3D Avatars from Visual\n Observations","summary":" Modeling and rendering photorealistic avatars is of crucial importance in\nmany applications. Existing methods that build a 3D avatar from visual\nobservations, however, struggle to reconstruct clothed humans. 
We introduce\nPhysAvatar, a novel framework that combines inverse rendering with inverse\nphysics to automatically estimate the shape and appearance of a human from\nmulti-view video data along with the physical parameters of the fabric of their\nclothes. For this purpose, we adopt a mesh-aligned 4D Gaussian technique for\nspatio-temporal mesh tracking as well as a physically based inverse renderer to\nestimate the intrinsic material properties. PhysAvatar integrates a physics\nsimulator to estimate the physical parameters of the garments using\ngradient-based optimization in a principled manner. These novel capabilities\nenable PhysAvatar to create high-quality novel-view renderings of avatars\ndressed in loose-fitting clothes under motions and lighting conditions not seen\nin the training data. This marks a significant advancement towards modeling\nphotorealistic digital humans using physically based inverse rendering with\nphysics in the loop. Our project website is at:\nhttps://qingqing-zhao.github.io/PhysAvatar\n","authors":["Yang Zheng","Qingqing Zhao","Guandao Yang","Wang Yifan","Donglai Xiang","Florian Dubost","Dmitry Lagun","Thabo Beeler","Federico Tombari","Leonidas Guibas","Gordon Wetzstein"],"pdf_url":"https://arxiv.org/pdf/2404.04421v1.pdf","comment":"Yang Zheng and Qingqing Zhao are project co-leads"},{"id":"http://arxiv.org/abs/2308.02958v2","updated":"2024-04-05T21:39:23Z","published":"2023-08-05T22:07:37Z","title":"K-band: Self-supervised MRI Reconstruction via Stochastic Gradient\n Descent over K-space Subsets","summary":" Although deep learning (DL) methods are powerful for solving inverse\nproblems, their reliance on high-quality training data is a major hurdle. This\nis significant in high-dimensional (dynamic/volumetric) magnetic resonance\nimaging (MRI), where acquisition of high-resolution fully sampled k-space data\nis impractical. We introduce a novel mathematical framework, dubbed k-band,\nthat enables training DL models using only partial, limited-resolution k-space\ndata. Specifically, we introduce training with stochastic gradient descent\n(SGD) over k-space subsets. In each training iteration, rather than using the\nfully sampled k-space for computing gradients, we use only a small k-space\nportion. This concept is compatible with different sampling strategies; here we\ndemonstrate the method for k-space \"bands\", which have limited resolution in\none dimension and can hence be acquired rapidly. We prove analytically that our\nmethod stochastically approximates the gradients computed in a fully-supervised\nsetup, when two simple conditions are met: (i) the limited-resolution axis is\nchosen randomly-uniformly for every new scan, hence k-space is fully covered\nacross the entire training set, and (ii) the loss function is weighed with a\nmask, derived here analytically, which facilitates accurate reconstruction of\nhigh-resolution details. Numerical experiments with raw MRI data indicate that\nk-band outperforms two other methods trained on limited-resolution data and\nperforms comparably to state-of-the-art (SoTA) methods trained on\nhigh-resolution data. k-band hence obtains SoTA performance, with the advantage\nof training using only limited-resolution data. 
This work hence introduces a\npractical, easy-to-implement, self-supervised training framework, which\ninvolves fast acquisition and self-supervised reconstruction and offers\ntheoretical guarantees.\n","authors":["Frederic Wang","Han Qi","Alfredo De Goyeneche","Reinhard Heckel","Michael Lustig","Efrat Shimron"],"pdf_url":"https://arxiv.org/pdf/2308.02958v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04818v3","updated":"2024-04-05T20:58:34Z","published":"2023-11-08T16:42:14Z","title":"Cross-Silo Federated Learning Across Divergent Domains with Iterative\n Parameter Alignment","summary":" Learning from the collective knowledge of data dispersed across private\nsources can provide neural networks with enhanced generalization capabilities.\nFederated learning, a method for collaboratively training a machine learning\nmodel across remote clients, achieves this by combining client models via the\norchestration of a central server. However, current approaches face two\ncritical limitations: i) they struggle to converge when client domains are\nsufficiently different, and ii) current aggregation techniques produce an\nidentical global model for each client. In this work, we address these issues\nby reformulating the typical federated learning setup: rather than learning a\nsingle global model, we learn N models each optimized for a common objective.\nTo achieve this, we apply a weighted distance minimization to model parameters\nshared in a peer-to-peer topology. The resulting framework, Iterative Parameter\nAlignment, applies naturally to the cross-silo setting, and has the following\nproperties: (i) a unique solution for each participant, with the option to\nglobally converge each model in the federation, and (ii) an optional\nearly-stopping mechanism to elicit fairness among peers in collaborative\nlearning settings. These characteristics jointly provide a flexible new\nframework for iteratively learning from peer models trained on disparate\ndatasets. We find that the technique achieves competitive results on a variety\nof data partitions compared to state-of-the-art approaches. Further, we show\nthat the method is robust to divergent domains (i.e. disjoint classes across\npeers) where existing approaches struggle.\n","authors":["Matt Gorbett","Hossein Shirazi","Indrakshi Ray"],"pdf_url":"https://arxiv.org/pdf/2311.04818v3.pdf","comment":"Published at IEEE Big Data 2023"},{"id":"http://arxiv.org/abs/2404.04394v1","updated":"2024-04-05T20:39:16Z","published":"2024-04-05T20:39:16Z","title":"Analyzing Participants' Engagement during Online Meetings Using\n Unsupervised Remote Photoplethysmography with Behavioral Features","summary":" Engagement measurement finds application in healthcare, education,\nadvertisement, and services. The use of physiological and behavioral features\nis viable, but the impracticality of traditional physiological measurement\narises due to the need for contact sensors. We demonstrate the feasibility of\nunsupervised remote photoplethysmography (rPPG) as an alternative for contact\nsensors in deriving heart rate variability (HRV) features, then fusing these\nwith behavioral features to measure engagement in online group meetings.\nFirstly, a unique Engagement Dataset of online interactions among social\nworkers is collected with granular engagement labels, offering insight into\nvirtual meeting dynamics. 
Secondly, a pre-trained rPPG model is customized to\nreconstruct accurate rPPG signals from video meetings in an unsupervised\nmanner, enabling the calculation of HRV features. Thirdly, the feasibility of\nestimating engagement from HRV features using short observation windows, with a\nnotable enhancement when using longer observation windows of two to four\nminutes, is demonstrated. Fourthly, the effectiveness of behavioral cues is\nevaluated and fused with physiological data, which further enhances engagement\nestimation performance. An accuracy of 94% is achieved when only HRV features\nare used, eliminating the need for contact sensors or ground truth signals. The\nincorporation of behavioral cues raises the accuracy to 96%. Facial video\nanalysis offers precise engagement measurement, beneficial for future\napplications.\n","authors":["Alexander Vedernikov","Zhaodong Sun","Virpi-Liisa Kykyri","Mikko Pohjola","Miriam Nokia","Xiaobai Li"],"pdf_url":"https://arxiv.org/pdf/2404.04394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19964v3","updated":"2024-04-05T20:33:14Z","published":"2024-03-29T03:56:19Z","title":"FairRAG: Fair Human Generation via Fair Retrieval Augmentation","summary":" Existing text-to-image generative models reflect or even amplify societal\nbiases ingrained in their training data. This is especially concerning for\nhuman image generation where models are biased against certain demographic\ngroups. Existing attempts to rectify this issue are hindered by the inherent\nlimitations of the pre-trained models and fail to substantially improve\ndemographic diversity. In this work, we introduce Fair Retrieval Augmented\nGeneration (FairRAG), a novel framework that conditions pre-trained generative\nmodels on reference images retrieved from an external image database to improve\nfairness in human generation. FairRAG enables conditioning through a\nlightweight linear module that projects reference images into the textual\nspace. To enhance fairness, FairRAG applies simple-yet-effective debiasing\nstrategies, providing images from diverse demographic groups during the\ngenerative process. Extensive experiments demonstrate that FairRAG outperforms\nexisting methods in terms of demographic diversity, image-text alignment, and\nimage fidelity while incurring minimal computational overhead during inference.\n","authors":["Robik Shrestha","Yang Zou","Qiuyu Chen","Zhiheng Li","Yusheng Xie","Siqi Deng"],"pdf_url":"https://arxiv.org/pdf/2403.19964v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19949v2","updated":"2024-04-05T20:08:16Z","published":"2024-03-29T03:15:31Z","title":"FairCLIP: Harnessing Fairness in Vision-Language Learning","summary":" Fairness is a critical concern in deep learning, especially in healthcare,\nwhere these models influence diagnoses and treatment decisions. Although\nfairness has been investigated in the vision-only domain, the fairness of\nmedical vision-language (VL) models remains unexplored due to the scarcity of\nmedical VL datasets for studying fairness. To bridge this research gap, we\nintroduce the first fair vision-language medical dataset Harvard-FairVLMed that\nprovides detailed demographic attributes, ground-truth labels, and clinical\nnotes to facilitate an in-depth examination of fairness within VL foundation\nmodels. 
Using Harvard-FairVLMed, we conduct a comprehensive fairness analysis\nof two widely-used VL models (CLIP and BLIP2), pre-trained on both natural and\nmedical domains, across four different protected attributes. Our results\nhighlight significant biases in all VL models, with Asian, Male, Non-Hispanic,\nand Spanish being the preferred subgroups across the protected attributes of\nrace, gender, ethnicity, and language, respectively. In order to alleviate\nthese biases, we propose FairCLIP, an optimal-transport-based approach that\nachieves a favorable trade-off between performance and fairness by reducing the\nSinkhorn distance between the overall sample distribution and the distributions\ncorresponding to each demographic group. As the first VL dataset of its kind,\nHarvard-FairVLMed holds the potential to catalyze advancements in the\ndevelopment of machine learning models that are both ethically aware and\nclinically effective. Our dataset and code are available at\nhttps://ophai.hms.harvard.edu/datasets/harvard-fairvlmed10k.\n","authors":["Yan Luo","Min Shi","Muhammad Osama Khan","Muhammad Muneeb Afzal","Hao Huang","Shuaihang Yuan","Yu Tian","Luo Song","Ava Kouhana","Tobias Elze","Yi Fang","Mengyu Wang"],"pdf_url":"https://arxiv.org/pdf/2403.19949v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04377v1","updated":"2024-04-05T19:42:55Z","published":"2024-04-05T19:42:55Z","title":"LOSS-SLAM: Lightweight Open-Set Semantic Simultaneous Localization and\n Mapping","summary":" Enabling robots to understand the world in terms of objects is a critical\nbuilding block towards higher level autonomy. The success of foundation models\nin vision has created the ability to segment and identify nearly all objects in\nthe world. However, utilizing such objects to localize the robot and build an\nopen-set semantic map of the world remains an open research question. In this\nwork, a system of identifying, localizing, and encoding objects is tightly\ncoupled with probabilistic graphical models for performing open-set semantic\nsimultaneous localization and mapping (SLAM). Results are presented\ndemonstrating that the proposed lightweight object encoding can be used to\nperform more accurate object-based SLAM than existing open-set methods,\nclosed-set methods, and geometric methods while incurring a lower computational\noverhead than existing open-set mapping methods.\n","authors":["Kurran Singh","Tim Magoun","John J. Leonard"],"pdf_url":"https://arxiv.org/pdf/2404.04377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04376v1","updated":"2024-04-05T19:38:18Z","published":"2024-04-05T19:38:18Z","title":"ClickDiffusion: Harnessing LLMs for Interactive Precise Image Editing","summary":" Recently, researchers have proposed powerful systems for generating and\nmanipulating images using natural language instructions. However, it is\ndifficult to precisely specify many common classes of image transformations\nwith text alone. For example, a user may wish to change the location and breed\nof a particular dog in an image with several similar dogs. This task is quite\ndifficult with natural language alone, and would require a user to write a\nlaboriously complex prompt that both disambiguates the target dog and describes\nthe destination. 
We propose ClickDiffusion, a system for precise image\nmanipulation and generation that combines natural language instructions with\nvisual feedback provided by the user through a direct manipulation interface.\nWe demonstrate that by serializing both an image and a multi-modal instruction\ninto a textual representation it is possible to leverage LLMs to perform\nprecise transformations of the layout and appearance of an image. Code\navailable at https://github.com/poloclub/ClickDiffusion.\n","authors":["Alec Helbling","Seongmin Lee","Polo Chau"],"pdf_url":"https://arxiv.org/pdf/2404.04376v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2402.07925"},{"id":"http://arxiv.org/abs/2311.17518v2","updated":"2024-04-05T19:18:30Z","published":"2023-11-29T10:40:52Z","title":"The devil is in the fine-grained details: Evaluating open-vocabulary\n object detectors for fine-grained understanding","summary":" Recent advancements in large vision-language models enabled visual object\ndetection in open-vocabulary scenarios, where object classes are defined in\nfree-text formats during inference. In this paper, we aim to probe the\nstate-of-the-art methods for open-vocabulary object detection to determine to\nwhat extent they understand fine-grained properties of objects and their parts.\nTo this end, we introduce an evaluation protocol based on dynamic vocabulary\ngeneration to test whether models detect, discern, and assign the correct\nfine-grained description to objects in the presence of hard-negative classes.\nWe contribute with a benchmark suite of increasing difficulty and probing\ndifferent properties like color, pattern, and material. We further enhance our\ninvestigation by evaluating several state-of-the-art open-vocabulary object\ndetectors using the proposed protocol and find that most existing solutions,\nwhich shine in standard open-vocabulary benchmarks, struggle to accurately\ncapture and distinguish finer object details. We conclude the paper by\nhighlighting the limitations of current methodologies and exploring promising\nresearch directions to overcome the discovered drawbacks. Data and code are\navailable at https://lorebianchi98.github.io/FG-OVD/.\n","authors":["Lorenzo Bianchi","Fabio Carrara","Nicola Messina","Claudio Gennaro","Fabrizio Falchi"],"pdf_url":"https://arxiv.org/pdf/2311.17518v2.pdf","comment":"Accepted as Highlight at CVPR2024"},{"id":"http://arxiv.org/abs/2404.04363v1","updated":"2024-04-05T19:16:30Z","published":"2024-04-05T19:16:30Z","title":"Idea-2-3D: Collaborative LMM Agents Enable 3D Model Generation from\n Interleaved Multimodal Inputs","summary":" In this paper, we pursue a novel 3D AIGC setting: generating 3D content from\nIDEAs. The definition of an IDEA is the composition of multimodal inputs\nincluding text, image, and 3D models. To our knowledge, this challenging and\nappealing 3D AIGC setting has not been studied before. We propose the novel\nframework called Idea-2-3D to achieve this goal, which consists of three agents\nbased upon large multimodel models (LMMs) and several existing algorithmic\ntools for them to invoke. Specifically, these three LMM-based agents are\nprompted to do the jobs of prompt generation, model selection and feedback\nreflection. They work in a cycle that involves both mutual collaboration and\ncriticism. Note that this cycle is done in a fully automatic manner, without\nany human intervention. The framework then outputs a text prompt to generate 3D\nmodels that well align with input IDEAs. 
We show impressive 3D AIGC results\nthat are beyond any previous methods can achieve. For quantitative comparisons,\nwe construct caption-based baselines using a whole bunch of state-of-the-art 3D\nAIGC models and demonstrate Idea-2-3D out-performs significantly. In 94.2% of\ncases, Idea-2-3D meets users' requirements, marking a degree of match between\nIDEA and 3D models that is 2.3 times higher than baselines. Moreover, in 93.5%\nof the cases, users agreed that Idea-2-3D was better than baselines. Codes,\ndata and models will made publicly available.\n","authors":["Junhao Chen","Xiang Li","Xiaojun Ye","Chao Li","Zhaoxin Fan","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.04363v1.pdf","comment":"Project Page: https://air-discover.github.io/Idea-2-3D/ Code:\n https://github.com/yisuanwang/Idea23D"},{"id":"http://arxiv.org/abs/2404.04356v1","updated":"2024-04-05T18:56:00Z","published":"2024-04-05T18:56:00Z","title":"Pixel-wise RL on Diffusion Models: Reinforcement Learning from Rich\n Feedback","summary":" Latent diffusion models are the state-of-the-art for synthetic image\ngeneration. To align these models with human preferences, training the models\nusing reinforcement learning on human feedback is crucial. Black et. al 2024\nintroduced denoising diffusion policy optimisation (DDPO), which accounts for\nthe iterative denoising nature of the generation by modelling it as a Markov\nchain with a final reward. As the reward is a single value that determines the\nmodel's performance on the entire image, the model has to navigate a very\nsparse reward landscape and so requires a large sample count. In this work, we\nextend the DDPO by presenting the Pixel-wise Policy Optimisation (PXPO)\nalgorithm, which can take feedback for each pixel, providing a more nuanced\nreward to the model.\n","authors":["Mo Kordzanganeh","Danial Keshvary","Nariman Arian"],"pdf_url":"https://arxiv.org/pdf/2404.04356v1.pdf","comment":"6 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.04346v1","updated":"2024-04-05T18:33:04Z","published":"2024-04-05T18:33:04Z","title":"Koala: Key frame-conditioned long video-LLM","summary":" Long video question answering is a challenging task that involves recognizing\nshort-term activities and reasoning about their fine-grained relationships.\nState-of-the-art video Large Language Models (vLLMs) hold promise as a viable\nsolution due to their demonstrated emergent capabilities on new tasks. However,\ndespite being trained on millions of short seconds-long videos, vLLMs are\nunable to understand minutes-long videos and accurately answer questions about\nthem. To address this limitation, we propose a lightweight and self-supervised\napproach, Key frame-conditioned long video-LLM (Koala), that introduces\nlearnable spatiotemporal queries to adapt pretrained vLLMs for generalizing to\nlonger videos. Our approach introduces two new tokenizers that condition on\nvisual tokens computed from sparse video key frames for understanding short and\nlong video moments. We train our proposed approach on HowTo100M and demonstrate\nits effectiveness on zero-shot long video understanding benchmarks, where it\noutperforms state-of-the-art large models by 3 - 6% in absolute accuracy across\nall tasks. Surprisingly, we also empirically show that our approach not only\nhelps a pretrained vLLM to understand long videos but also improves its\naccuracy on short-term action recognition.\n","authors":["Reuben Tan","Ximeng Sun","Ping Hu","Jui-hsien Wang","Hanieh Deilamsalehy","Bryan A. 
Plummer","Bryan Russell","Kate Saenko"],"pdf_url":"https://arxiv.org/pdf/2404.04346v1.pdf","comment":"Accepted at CVPR 2024 as a poster highlight"},{"id":"http://arxiv.org/abs/2403.13808v2","updated":"2024-04-05T18:22:02Z","published":"2024-03-20T17:59:58Z","title":"On Pretraining Data Diversity for Self-Supervised Learning","summary":" We explore the impact of training with more diverse datasets, characterized\nby the number of unique samples, on the performance of self-supervised learning\n(SSL) under a fixed computational budget. Our findings consistently demonstrate\nthat increasing pretraining data diversity enhances SSL performance, albeit\nonly when the distribution distance to the downstream data is minimal. Notably,\neven with an exceptionally large pretraining data diversity achieved through\nmethods like web crawling or diffusion-generated data, among other ways, the\ndistribution shift remains a challenge. Our experiments are comprehensive with\nseven SSL methods using large-scale datasets such as ImageNet and YFCC100M\namounting to over 200 GPU days. Code and trained models will be available at\nhttps://github.com/hammoudhasan/DiversitySSL .\n","authors":["Hasan Abed Al Kader Hammoud","Tuhin Das","Fabio Pizzati","Philip Torr","Adel Bibi","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2403.13808v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2403.16271v3","updated":"2024-04-05T18:11:20Z","published":"2024-03-24T19:32:39Z","title":"Object Detectors in the Open Environment: Challenges, Solutions, and\n Outlook","summary":" With the emergence of foundation models, deep learning-based object detectors\nhave shown practical usability in closed set scenarios. However, for real-world\ntasks, object detectors often operate in open environments, where crucial\nfactors (e.g., data distribution, objective) that influence model learning are\noften changing. The dynamic and intricate nature of the open environment poses\nnovel and formidable challenges to object detectors. Unfortunately, current\nresearch on object detectors in open environments lacks a comprehensive\nanalysis of their distinctive characteristics, challenges, and corresponding\nsolutions, which hinders their secure deployment in critical real-world\nscenarios. This paper aims to bridge this gap by conducting a comprehensive\nreview and analysis of object detectors in open environments. We initially\nidentified limitations of key structural components within the existing\ndetection pipeline and propose the open environment object detector challenge\nframework that includes four quadrants (i.e., out-of-domain, out-of-category,\nrobust learning, and incremental learning) based on the dimensions of the data\n/ target changes. For each quadrant of challenges in the proposed framework, we\npresent a detailed description and systematic analysis of the overarching goals\nand core difficulties, systematically review the corresponding solutions, and\nbenchmark their performance over multiple widely adopted datasets. In addition,\nwe engage in a discussion of open problems and potential avenues for future\nresearch. This paper aims to provide a fresh, comprehensive, and systematic\nunderstanding of the challenges and solutions associated with open-environment\nobject detectors, thus catalyzing the development of more solid applications in\nreal-world scenarios. 
A project related to this survey can be found at\nhttps://github.com/LiangSiyuan21/OEOD_Survey.\n","authors":["Siyuan Liang","Wei Wang","Ruoyu Chen","Aishan Liu","Boxi Wu","Ee-Chien Chang","Xiaochun Cao","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.16271v3.pdf","comment":"37 pages, 17 figures"},{"id":"http://arxiv.org/abs/2404.04319v1","updated":"2024-04-05T17:59:25Z","published":"2024-04-05T17:59:25Z","title":"SpatialTracker: Tracking Any 2D Pixels in 3D Space","summary":" Recovering dense and long-range pixel motion in videos is a challenging\nproblem. Part of the difficulty arises from the 3D-to-2D projection process,\nleading to occlusions and discontinuities in the 2D motion domain. While 2D\nmotion can be intricate, we posit that the underlying 3D motion can often be\nsimple and low-dimensional. In this work, we propose to estimate point\ntrajectories in 3D space to mitigate the issues caused by image projection. Our\nmethod, named SpatialTracker, lifts 2D pixels to 3D using monocular depth\nestimators, represents the 3D content of each frame efficiently using a\ntriplane representation, and performs iterative updates using a transformer to\nestimate 3D trajectories. Tracking in 3D allows us to leverage\nas-rigid-as-possible (ARAP) constraints while simultaneously learning a\nrigidity embedding that clusters pixels into different rigid parts. Extensive\nevaluation shows that our approach achieves state-of-the-art tracking\nperformance both qualitatively and quantitatively, particularly in challenging\nscenarios such as out-of-plane rotation.\n","authors":["Yuxi Xiao","Qianqian Wang","Shangzhan Zhang","Nan Xue","Sida Peng","Yujun Shen","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.04319v1.pdf","comment":"Accepted to CVPR 2024 (selected as highlight paper). Project page:\n https://henry123-boy.github.io/SpaTracker/"},{"id":"http://arxiv.org/abs/2404.04318v1","updated":"2024-04-05T17:55:33Z","published":"2024-04-05T17:55:33Z","title":"Robust Depth Enhancement via Polarization Prompt Fusion Tuning","summary":" Existing depth sensors are imperfect and may provide inaccurate depth values\nin challenging scenarios, such as in the presence of transparent or reflective\nobjects. In this work, we present a general framework that leverages\npolarization imaging to improve inaccurate depth measurements from various\ndepth sensors. Previous polarization-based depth enhancement methods focus on\nutilizing pure physics-based formulas for a single sensor. In contrast, our\nmethod first adopts a learning-based strategy where a neural network is trained\nto estimate a dense and complete depth map from polarization data and a sensor\ndepth map from different sensors. To further improve the performance, we\npropose a Polarization Prompt Fusion Tuning (PPFT) strategy to effectively\nutilize RGB-based models pre-trained on large-scale datasets, as the size of\nthe polarization dataset is limited to train a strong model from scratch. We\nconducted extensive experiments on a public dataset, and the results\ndemonstrate that the proposed method performs favorably compared to existing\ndepth enhancement baselines. Code and demos are available at\nhttps://lastbasket.github.io/PPFT/.\n","authors":["Kei Ikemura","Yiming Huang","Felix Heide","Zhaoxiang Zhang","Qifeng Chen","Chenyang Lei"],"pdf_url":"https://arxiv.org/pdf/2404.04318v1.pdf","comment":"CVPR 2024. Project page: https://lastbasket.github.io/PPFT/. 
The\n first two authors contribute equally"},{"id":"http://arxiv.org/abs/2404.04308v1","updated":"2024-04-05T07:31:24Z","published":"2024-04-05T07:31:24Z","title":"Visual Knowledge in the Big Model Era: Retrospect and Prospect","summary":" Visual knowledge is a new form of knowledge representation that can\nencapsulate visual concepts and their relations in a succinct, comprehensive,\nand interpretable manner, with a deep root in cognitive psychology. As the\nknowledge about the visual world has been identified as an indispensable\ncomponent of human cognition and intelligence, visual knowledge is poised to\nhave a pivotal role in establishing machine intelligence. With the recent\nadvance of Artificial Intelligence (AI) techniques, large AI models (or\nfoundation models) have emerged as a potent tool capable of extracting\nversatile patterns from broad data as implicit knowledge, and abstracting them\ninto an outrageous amount of numeric parameters. To pave the way for creating\nvisual knowledge empowered AI machines in this coming wave, we present a timely\nreview that investigates the origins and development of visual knowledge in the\npre-big model era, and accentuates the opportunities and unique role of visual\nknowledge in the big model era.\n","authors":["Wenguan Wang","Yi Yang","Yunhe Pan"],"pdf_url":"https://arxiv.org/pdf/2404.04308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05758v1","updated":"2024-04-05T21:28:56Z","published":"2024-04-05T21:28:56Z","title":"Implicit Assimilation of Sparse In Situ Data for Dense & Global Storm\n Surge Forecasting","summary":" Hurricanes and coastal floods are among the most disastrous natural hazards.\nBoth are intimately related to storm surges, as their causes and effects,\nrespectively. However, the short-term forecasting of storm surges has proven\nchallenging, especially when targeting previously unseen locations or sites\nwithout tidal gauges. Furthermore, recent work improved short and medium-term\nweather forecasting but the handling of raw unassimilated data remains\nnon-trivial. In this paper, we tackle both challenges and demonstrate that\nneural networks can implicitly assimilate sparse in situ tide gauge data with\ncoarse ocean state reanalysis in order to forecast storm surges. We curate a\nglobal dataset to learn and validate the dense prediction of storm surges,\nbuilding on preceding efforts. Other than prior work limited to known gauges,\nour approach extends to ungauged sites, paving the way for global storm surge\nforecasting.\n","authors":["Patrick Ebel","Brandon Victor","Peter Naylor","Gabriele Meoni","Federico Serva","Rochelle Schneider"],"pdf_url":"https://arxiv.org/pdf/2404.05758v1.pdf","comment":"Accepted at CVPR EarthVision 2024"}]},"2024-04-08T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.05729v1","updated":"2024-04-08T17:59:46Z","published":"2024-04-08T17:59:46Z","title":"Finding Visual Task Vectors","summary":" Visual Prompting is a technique for teaching models to perform a visual task\nvia in-context examples, without any additional training. In this work, we\nanalyze the activations of MAE-VQGAN, a recent Visual Prompting model, and find\ntask vectors, activations that encode task-specific information. Equipped with\nthis insight, we demonstrate that it is possible to identify the task vectors\nand use them to guide the network towards performing different tasks without\nproviding any input-output examples. 
To find task vectors, we compute the\naverage intermediate activations per task and use the REINFORCE algorithm to\nsearch for the subset of task vectors. The resulting task vectors guide the\nmodel towards performing a task better than the original model without the need\nfor input-output examples.\n","authors":["Alberto Hojel","Yutong Bai","Trevor Darrell","Amir Globerson","Amir Bar"],"pdf_url":"https://arxiv.org/pdf/2404.05729v1.pdf","comment":"https://github.com/alhojel/visual_task_vectors"},{"id":"http://arxiv.org/abs/2404.05726v1","updated":"2024-04-08T17:59:24Z","published":"2024-04-08T17:59:24Z","title":"MA-LMM: Memory-Augmented Large Multimodal Model for Long-Term Video\n Understanding","summary":" With the success of large language models (LLMs), integrating the vision\nmodel into LLMs to build vision-language foundation models has gained much more\ninterest recently. However, existing LLM-based large multimodal models (e.g.,\nVideo-LLaMA, VideoChat) can only take in a limited number of frames for short\nvideo understanding. In this study, we mainly focus on designing an efficient\nand effective model for long-term video understanding. Instead of trying to\nprocess more frames simultaneously like most existing work, we propose to\nprocess videos in an online manner and store past video information in a memory\nbank. This allows our model to reference historical video content for long-term\nanalysis without exceeding LLMs' context length constraints or GPU memory\nlimits. Our memory bank can be seamlessly integrated into current multimodal\nLLMs in an off-the-shelf manner. We conduct extensive experiments on various\nvideo understanding tasks, such as long-video understanding, video question\nanswering, and video captioning, and our model can achieve state-of-the-art\nperformances across multiple datasets. Code available at\nhttps://boheumd.github.io/MA-LMM/.\n","authors":["Bo He","Hengduo Li","Young Kyun Jang","Menglin Jia","Xuefei Cao","Ashish Shah","Abhinav Shrivastava","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2404.05726v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05719v1","updated":"2024-04-08T17:55:44Z","published":"2024-04-08T17:55:44Z","title":"Ferret-UI: Grounded Mobile UI Understanding with Multimodal LLMs","summary":" Recent advancements in multimodal large language models (MLLMs) have been\nnoteworthy, yet, these general-domain MLLMs often fall short in their ability\nto comprehend and interact effectively with user interface (UI) screens. In\nthis paper, we present Ferret-UI, a new MLLM tailored for enhanced\nunderstanding of mobile UI screens, equipped with referring, grounding, and\nreasoning capabilities. Given that UI screens typically exhibit a more\nelongated aspect ratio and contain smaller objects of interest (e.g., icons,\ntexts) than natural images, we incorporate \"any resolution\" on top of Ferret to\nmagnify details and leverage enhanced visual features. Specifically, each\nscreen is divided into 2 sub-images based on the original aspect ratio (i.e.,\nhorizontal division for portrait screens and vertical division for landscape\nscreens). Both sub-images are encoded separately before being sent to LLMs. We\nmeticulously gather training samples from an extensive range of elementary UI\ntasks, such as icon recognition, find text, and widget listing. These samples\nare formatted for instruction-following with region annotations to facilitate\nprecise referring and grounding. 
To augment the model's reasoning ability, we\nfurther compile a dataset for advanced tasks, including detailed description,\nperception/interaction conversations, and function inference. After training on\nthe curated datasets, Ferret-UI exhibits outstanding comprehension of UI\nscreens and the capability to execute open-ended instructions. For model\nevaluation, we establish a comprehensive benchmark encompassing all the\naforementioned tasks. Ferret-UI excels not only beyond most open-source UI\nMLLMs, but also surpasses GPT-4V on all the elementary UI tasks.\n","authors":["Keen You","Haotian Zhang","Eldon Schoop","Floris Weers","Amanda Swearngin","Jeffrey Nichols","Yinfei Yang","Zhe Gan"],"pdf_url":"https://arxiv.org/pdf/2404.05719v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05717v1","updated":"2024-04-08T17:52:29Z","published":"2024-04-08T17:52:29Z","title":"SwapAnything: Enabling Arbitrary Object Swapping in Personalized Visual\n Editing","summary":" Effective editing of personal content holds a pivotal role in enabling\nindividuals to express their creativity, weaving captivating narratives within\ntheir visual stories, and elevate the overall quality and impact of their\nvisual content. Therefore, in this work, we introduce SwapAnything, a novel\nframework that can swap any objects in an image with personalized concepts\ngiven by the reference, while keeping the context unchanged. Compared with\nexisting methods for personalized subject swapping, SwapAnything has three\nunique advantages: (1) precise control of arbitrary objects and parts rather\nthan the main subject, (2) more faithful preservation of context pixels, (3)\nbetter adaptation of the personalized concept to the image. First, we propose\ntargeted variable swapping to apply region control over latent feature maps and\nswap masked variables for faithful context preservation and initial semantic\nconcept swapping. Then, we introduce appearance adaptation, to seamlessly adapt\nthe semantic concept into the original image in terms of target location,\nshape, style, and content during the image generation process. Extensive\nresults on both human and automatic evaluation demonstrate significant\nimprovements of our approach over baseline methods on personalized swapping.\nFurthermore, SwapAnything shows its precise and faithful swapping abilities\nacross single object, multiple objects, partial object, and cross-domain\nswapping tasks. SwapAnything also achieves great performance on text-based\nswapping and tasks beyond swapping such as object insertion.\n","authors":["Jing Gu","Yilin Wang","Nanxuan Zhao","Wei Xiong","Qing Liu","Zhifei Zhang","He Zhang","Jianming Zhang","HyunJoon Jung","Xin Eric Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05717v1.pdf","comment":"18 pages, 16 figures, 3 tables"},{"id":"http://arxiv.org/abs/2311.04071v4","updated":"2024-04-08T17:49:58Z","published":"2023-11-07T15:35:56Z","title":"Energy-Calibrated VAE with Test Time Free Lunch","summary":" In this paper, we propose a novel generative model that utilizes a\nconditional Energy-Based Model (EBM) for enhancing Variational Autoencoder\n(VAE), termed Energy-Calibrated VAE (EC-VAE). Specifically, VAEs often suffer\nfrom blurry generated samples due to the lack of a tailored training on the\nsamples generated in the generative direction. On the other hand, EBMs can\ngenerate high-quality samples but require expensive Markov Chain Monte Carlo\n(MCMC) sampling. 
To address these issues, we introduce a conditional EBM for\ncalibrating the generative direction of VAE during training, without requiring\nit for the generation at test time. In particular, we train EC-VAE upon both\nthe input data and the calibrated samples with adaptive weight to enhance\nefficacy while avoiding MCMC sampling at test time. Furthermore, we extend the\ncalibration idea of EC-VAE to variational learning and normalizing flows, and\napply EC-VAE to an additional application of zero-shot image restoration via\nneural transport prior and range-null theory. We evaluate the proposed method\nwith two applications, including image generation and zero-shot image\nrestoration, and the experimental results show that our method achieves\ncompetitive performance over single-step non-adversarial generation. Our code\nis available at https://github.com/DJ-LYH/EC-VAE.\n","authors":["Yihong Luo","Siya Qiu","Xingjian Tao","Yujun Cai","Jing Tang"],"pdf_url":"https://arxiv.org/pdf/2311.04071v4.pdf","comment":"Revision. Code is available at https://github.com/DJ-LYH/EC-VAE"},{"id":"http://arxiv.org/abs/2404.05705v1","updated":"2024-04-08T17:42:08Z","published":"2024-04-08T17:42:08Z","title":"Learning 3D-Aware GANs from Unposed Images with Template Feature Field","summary":" Collecting accurate camera poses of training images has been shown to well\nserve the learning of 3D-aware generative adversarial networks (GANs) yet can\nbe quite expensive in practice. This work targets learning 3D-aware GANs from\nunposed images, for which we propose to perform on-the-fly pose estimation of\ntraining images with a learned template feature field (TeFF). Concretely, in\naddition to a generative radiance field as in previous approaches, we ask the\ngenerator to also learn a field from 2D semantic features while sharing the\ndensity from the radiance field. Such a framework allows us to acquire a\ncanonical 3D feature template leveraging the dataset mean discovered by the\ngenerative model, and further efficiently estimate the pose parameters on real\ndata. Experimental results on various challenging datasets demonstrate the\nsuperiority of our approach over state-of-the-art alternatives from both the\nqualitative and the quantitative perspectives.\n","authors":["Xinya Chen","Hanlei Guo","Yanrui Bin","Shangzhan Zhang","Yuanbo Yang","Yue Wang","Yujun Shen","Yiyi Liao"],"pdf_url":"https://arxiv.org/pdf/2404.05705v1.pdf","comment":"https://XDimlab.github.io/TeFF"},{"id":"http://arxiv.org/abs/2404.05693v1","updated":"2024-04-08T17:18:30Z","published":"2024-04-08T17:18:30Z","title":"Evaluating the Efficacy of Cut-and-Paste Data Augmentation in Semantic\n Segmentation for Satellite Imagery","summary":" Satellite imagery is crucial for tasks like environmental monitoring and\nurban planning. Typically, it relies on semantic segmentation or Land Use Land\nCover (LULC) classification to categorize each pixel. Despite the advancements\nbrought about by Deep Neural Networks (DNNs), their performance in segmentation\ntasks is hindered by challenges such as limited availability of labeled data,\nclass imbalance and the inherent variability and complexity of satellite\nimages. In order to mitigate those issues, our study explores the effectiveness\nof a Cut-and-Paste augmentation technique for semantic segmentation in\nsatellite images. We adapt this augmentation, which usually requires labeled\ninstances, to the case of semantic segmentation. 
By leveraging the connected\ncomponents in the semantic segmentation labels, we extract instances that are\nthen randomly pasted during training. Using the DynamicEarthNet dataset and a\nU-Net model for evaluation, we found that this augmentation significantly\nenhances the mIoU score on the test set from 37.9 to 44.1. This finding\nhighlights the potential of the Cut-and-Paste augmentation to improve the\ngeneralization capabilities of semantic segmentation models in satellite\nimagery.\n","authors":["Ionut M. Motoi","Leonardo Saraceni","Daniele Nardi","Thomas A. Ciarfuglia"],"pdf_url":"https://arxiv.org/pdf/2404.05693v1.pdf","comment":"Accepted for publication in IEEE 2024 International Geoscience &\n Remote Sensing Symposium (IGARSS 2024)"},{"id":"http://arxiv.org/abs/2404.05687v1","updated":"2024-04-08T17:10:45Z","published":"2024-04-08T17:10:45Z","title":"Retrieval-Augmented Open-Vocabulary Object Detection","summary":" Open-vocabulary object detection (OVD) has been studied with Vision-Language\nModels (VLMs) to detect novel objects beyond the pre-trained categories.\nPrevious approaches improve the generalization ability to expand the knowledge\nof the detector, using 'positive' pseudo-labels with additional 'class' names,\ne.g., sock, iPod, and alligator. To extend the previous methods in two aspects,\nwe propose Retrieval-Augmented Losses and visual Features (RALF). Our method\nretrieves related 'negative' classes and augments loss functions. Also, visual\nfeatures are augmented with 'verbalized concepts' of classes, e.g., worn on the\nfeet, handheld music player, and sharp teeth. Specifically, RALF consists of\ntwo modules: Retrieval Augmented Losses (RAL) and Retrieval-Augmented visual\nFeatures (RAF). RAL constitutes two losses reflecting the semantic similarity\nwith negative vocabularies. In addition, RAF augments visual features with the\nverbalized concepts from a large language model (LLM). Our experiments\ndemonstrate the effectiveness of RALF on COCO and LVIS benchmark datasets. We\nachieve improvement up to 3.4 box AP$_{50}^{\\text{N}}$ on novel categories of\nthe COCO dataset and 3.6 mask AP$_{\\text{r}}$ gains on the LVIS dataset. Code\nis available at https://github.com/mlvlab/RALF .\n","authors":["Jooyeon Kim","Eulrang Cho","Sehyung Kim","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2404.05687v1.pdf","comment":"Accepted paper at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05680v1","updated":"2024-04-08T16:58:31Z","published":"2024-04-08T16:58:31Z","title":"SphereHead: Stable 3D Full-head Synthesis with Spherical Tri-plane\n Representation","summary":" While recent advances in 3D-aware Generative Adversarial Networks (GANs) have\naided the development of near-frontal view human face synthesis, the challenge\nof comprehensively synthesizing a full 3D head viewable from all angles still\npersists. Although PanoHead proves the possibilities of using a large-scale\ndataset with images of both frontal and back views for full-head synthesis, it\noften causes artifacts for back views. Based on our in-depth analysis, we found\nthe reasons are mainly twofold. First, from network architecture perspective,\nwe found each plane in the utilized tri-plane/tri-grid representation space\ntends to confuse the features from both sides, causing \"mirroring\" artifacts\n(e.g., the glasses appear in the back). 
Second, from data supervision aspect,\nwe found that existing discriminator training in 3D GANs mainly focuses on the\nquality of the rendered image itself, and does not care much about its\nplausibility with the perspective from which it was rendered. This makes it\npossible to generate \"face\" in non-frontal views, due to its easiness to fool\nthe discriminator. In response, we propose SphereHead, a novel tri-plane\nrepresentation in the spherical coordinate system that fits the human head's\ngeometric characteristics and efficiently mitigates many of the generated\nartifacts. We further introduce a view-image consistency loss for the\ndiscriminator to emphasize the correspondence of the camera parameters and the\nimages. The combination of these efforts results in visually superior outcomes\nwith significantly fewer artifacts. Our code and dataset are publicly available\nat https://lhyfst.github.io/spherehead.\n","authors":["Heyuan Li","Ce Chen","Tianhao Shi","Yuda Qiu","Sizhe An","Guanying Chen","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2404.05680v1.pdf","comment":"project page: https://lhyfst.github.io/spherehead"},{"id":"http://arxiv.org/abs/2312.07425v2","updated":"2024-04-08T16:56:17Z","published":"2023-12-12T16:48:53Z","title":"Deep Internal Learning: Deep Learning from a Single Input","summary":" Deep learning, in general, focuses on training a neural network from large\nlabeled datasets. Yet, in many cases there is value in training a network just\nfrom the input at hand. This is particularly relevant in many signal and image\nprocessing problems where training data is scarce and diversity is large on the\none hand, and on the other, there is a lot of structure in the data that can be\nexploited. Using this information is the key to deep internal-learning\nstrategies, which may involve training a network from scratch using a single\ninput or adapting an already trained network to a provided input example at\ninference time. This survey paper aims at covering deep internal-learning\ntechniques that have been proposed in the past few years for these two\nimportant directions. While our main focus will be on image processing\nproblems, most of the approaches that we survey are derived for general signals\n(vectors with recurring patterns that can be distinguished from noise) and are\ntherefore applicable to other modalities.\n","authors":["Tom Tirer","Raja Giryes","Se Young Chun","Yonina C. Eldar"],"pdf_url":"https://arxiv.org/pdf/2312.07425v2.pdf","comment":"Accepted to IEEE Signal Processing Magazine"},{"id":"http://arxiv.org/abs/2404.05675v1","updated":"2024-04-08T16:56:05Z","published":"2024-04-08T16:56:05Z","title":"Normalizing Flows on the Product Space of SO(3) Manifolds for\n Probabilistic Human Pose Modeling","summary":" Normalizing flows have proven their efficacy for density estimation in\nEuclidean space, but their application to rotational representations, crucial\nin various domains such as robotics or human pose modeling, remains\nunderexplored. Probabilistic models of the human pose can benefit from\napproaches that rigorously consider the rotational nature of human joints. For\nthis purpose, we introduce HuProSO3, a normalizing flow model that operates on\na high-dimensional product space of SO(3) manifolds, modeling the joint\ndistribution for human joints with three degrees of freedom. 
HuProSO3's\nadvantage over state-of-the-art approaches is demonstrated through its superior\nmodeling accuracy in three different applications and its capability to\nevaluate the exact likelihood. This work not only addresses the technical\nchallenge of learning densities on SO(3) manifolds, but it also has broader\nimplications for domains where the probabilistic regression of correlated 3D\nrotations is of importance.\n","authors":["Olaf Dünkel","Tim Salzmann","Florian Pfaff"],"pdf_url":"https://arxiv.org/pdf/2404.05675v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05674v1","updated":"2024-04-08T16:55:49Z","published":"2024-04-08T16:55:49Z","title":"MoMA: Multimodal LLM Adapter for Fast Personalized Image Generation","summary":" In this paper, we present MoMA: an open-vocabulary, training-free\npersonalized image model that boasts flexible zero-shot capabilities. As\nfoundational text-to-image models rapidly evolve, the demand for robust\nimage-to-image translation grows. Addressing this need, MoMA specializes in\nsubject-driven personalized image generation. Utilizing an open-source,\nMultimodal Large Language Model (MLLM), we train MoMA to serve a dual role as\nboth a feature extractor and a generator. This approach effectively synergizes\nreference image and text prompt information to produce valuable image features,\nfacilitating an image diffusion model. To better leverage the generated\nfeatures, we further introduce a novel self-attention shortcut method that\nefficiently transfers image features to an image diffusion model, improving the\nresemblance of the target object in generated images. Remarkably, as a\ntuning-free plug-and-play module, our model requires only a single reference\nimage and outperforms existing methods in generating images with high detail\nfidelity, enhanced identity-preservation and prompt faithfulness. Our work is\nopen-source, thereby providing universal access to these advancements.\n","authors":["Kunpeng Song","Yizhe Zhu","Bingchen Liu","Qing Yan","Ahmed Elgammal","Xiao Yang"],"pdf_url":"https://arxiv.org/pdf/2404.05674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05673v1","updated":"2024-04-08T16:55:39Z","published":"2024-04-08T16:55:39Z","title":"CoReS: Orchestrating the Dance of Reasoning and Segmentation","summary":" The reasoning segmentation task, which demands a nuanced comprehension of\nintricate queries to accurately pinpoint object regions, is attracting\nincreasing attention. However, Multi-modal Large Language Models (MLLM) often\nfind it difficult to accurately localize the objects described in complex\nreasoning contexts. We believe that the act of reasoning segmentation should\nmirror the cognitive stages of human visual search, where each step is a\nprogressive refinement of thought toward the final object. Thus we introduce\nthe Chains of Reasoning and Segmenting (CoReS) and find this top-down visual\nhierarchy indeed enhances the visual search process. Specifically, we propose a\ndual-chain structure that generates multi-modal, chain-like outputs to aid the\nsegmentation process. Furthermore, to steer the MLLM's outputs into this\nintended hierarchy, we incorporate in-context inputs as guidance. Extensive\nexperiments demonstrate the superior performance of our CoReS, which surpasses\nthe state-of-the-art method by 7.1\\% on the ReasonSeg dataset. 
The code will be\nreleased at https://github.com/baoxiaoyi/CoReS.\n","authors":["Xiaoyi Bao","Siyang Sun","Shuailei Ma","Kecheng Zheng","Yuxin Guo","Guosheng Zhao","Yun Zheng","Xingang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05673v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05669v1","updated":"2024-04-08T16:52:21Z","published":"2024-04-08T16:52:21Z","title":"NAF-DPM: A Nonlinear Activation-Free Diffusion Probabilistic Model for\n Document Enhancement","summary":" Real-world documents may suffer various forms of degradation, often resulting\nin lower accuracy in optical character recognition (OCR) systems. Therefore, a\ncrucial preprocessing step is essential to eliminate noise while preserving\ntext and key features of documents. In this paper, we propose NAF-DPM, a novel\ngenerative framework based on a diffusion probabilistic model (DPM) designed to\nrestore the original quality of degraded documents. While DPMs are recognized\nfor their high-quality generated images, they are also known for their large\ninference time. To mitigate this problem we provide the DPM with an efficient\nnonlinear activation-free (NAF) network and we employ as a sampler a fast\nsolver of ordinary differential equations, which can converge in a few\niterations. To better preserve text characters, we introduce an additional\ndifferentiable module based on convolutional recurrent neural networks,\nsimulating the behavior of an OCR system during training. Experiments conducted\non various datasets showcase the superiority of our approach, achieving\nstate-of-the-art performance in terms of pixel-level and perceptual similarity\nmetrics. Furthermore, the results demonstrate a notable character error\nreduction made by OCR systems when transcribing real-world document images\nenhanced by our framework. Code and pre-trained models are available at\nhttps://github.com/ispamm/NAF-DPM.\n","authors":["Giordano Cicchetti","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2404.05669v1.pdf","comment":"Under review at IEEE Transactions on Pattern Analysis and Machine\n Intelligence"},{"id":"http://arxiv.org/abs/2404.05667v1","updated":"2024-04-08T16:51:33Z","published":"2024-04-08T16:51:33Z","title":"AlignZeg: Mitigating Objective Misalignment for Zero-shot Semantic\n Segmentation","summary":" A serious issue that harms the performance of zero-shot visual recognition is\nnamed objective misalignment, i.e., the learning objective prioritizes\nimproving the recognition accuracy of seen classes rather than unseen classes,\nwhile the latter is the true target to pursue. This issue becomes more\nsignificant in zero-shot image segmentation because the stronger (i.e.,\npixel-level) supervision brings a larger gap between seen and unseen classes.\nTo mitigate it, we propose a novel architecture named AlignZeg, which embodies\na comprehensive improvement of the segmentation pipeline, including proposal\nextraction, classification, and correction, to better fit the goal of zero-shot\nsegmentation. (1) Mutually-Refined Proposal Extraction. AlignZeg harnesses a\nmutual interaction between mask queries and visual features, facilitating\ndetailed class-agnostic mask proposal extraction. (2) Generalization-Enhanced\nProposal Classification. AlignZeg introduces synthetic data and incorporates\nmultiple background prototypes to allocate a more generalizable feature space.\n(3) Predictive Bias Correction. 
During the inference stage, AlignZeg uses a\nclass indicator to find potential unseen class proposals followed by a\nprediction postprocess to correct the prediction bias. Experiments demonstrate\nthat AlignZeg markedly enhances zero-shot semantic segmentation, as shown by an\naverage 3.8% increase in hIoU, primarily attributed to a 7.1% improvement in\nidentifying unseen classes, and we further validate that the improvement comes\nfrom alleviating the objective misalignment issue.\n","authors":["Jiannan Ge","Lingxi Xie","Hongtao Xie","Pandeng Li","Xiaopeng Zhang","Yongdong Zhang","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2404.05667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05666v1","updated":"2024-04-08T16:51:19Z","published":"2024-04-08T16:51:19Z","title":"YaART: Yet Another ART Rendering Technology","summary":" In the rapidly progressing field of generative models, the development of\nefficient and high-fidelity text-to-image diffusion systems represents a\nsignificant frontier. This study introduces YaART, a novel production-grade\ntext-to-image cascaded diffusion model aligned to human preferences using\nReinforcement Learning from Human Feedback (RLHF). During the development of\nYaART, we especially focus on the choices of the model and training dataset\nsizes, the aspects that were not systematically investigated for text-to-image\ncascaded diffusion models before. In particular, we comprehensively analyze how\nthese choices affect both the efficiency of the training process and the\nquality of the generated images, which are highly important in practice.\nFurthermore, we demonstrate that models trained on smaller datasets of\nhigher-quality images can successfully compete with those trained on larger\ndatasets, establishing a more efficient scenario of diffusion models training.\nFrom the quality perspective, YaART is consistently preferred by users over\nmany existing state-of-the-art models.\n","authors":["Sergey Kastryulin","Artem Konev","Alexander Shishenya","Eugene Lyapustin","Artem Khurshudov","Alexander Tselousov","Nikita Vinokurov","Denis Kuznedelev","Alexander Markovich","Grigoriy Livshits","Alexey Kirillov","Anastasiia Tabisheva","Liubov Chubarova","Marina Kaminskaia","Alexander Ustyuzhanin","Artemii Shvetsov","Daniil Shlenskii","Valerii Startsev","Dmitrii Kornilov","Mikhail Romanov","Artem Babenko","Sergei Ovcharenko","Valentin Khrulkov"],"pdf_url":"https://arxiv.org/pdf/2404.05666v1.pdf","comment":"Prompts and additional information are available on the project page,\n see https://ya.ru/ai/art/paper-yaart-v1"},{"id":"http://arxiv.org/abs/2404.05662v1","updated":"2024-04-08T16:46:25Z","published":"2024-04-08T16:46:25Z","title":"BinaryDM: Towards Accurate Binarization of Diffusion Model","summary":" With the advancement of diffusion models (DMs) and the substantially\nincreased computational requirements, quantization emerges as a practical\nsolution to obtain compact and efficient low-bit DMs. However, the highly\ndiscrete representation leads to severe accuracy degradation, hindering the\nquantization of diffusion models to ultra-low bit-widths. In this paper, we\npropose BinaryDM, a novel accurate quantization-aware training approach to push\nthe weights of diffusion models towards the limit of 1-bit. Firstly, we present\na Learnable Multi-basis Binarizer (LMB) to recover the representations\ngenerated by the binarized DM, which improves the information in details of\nrepresentations crucial to the DM. 
Secondly, a Low-rank Representation\nMimicking (LRM) is applied to enhance the binarization-aware optimization of\nthe DM, alleviating the optimization direction ambiguity caused by fine-grained\nalignment. Moreover, a progressive initialization strategy is applied to\ntraining DMs to avoid convergence difficulties. Comprehensive experiments\ndemonstrate that BinaryDM achieves significant accuracy and efficiency gains\ncompared to SOTA quantization methods of DMs under ultra-low bit-widths. As the\nfirst binarization method for diffusion models, BinaryDM achieves impressive\n16.0 times FLOPs and 27.1 times storage savings with 1-bit weight and 4-bit\nactivation, showcasing its substantial advantages and potential for deploying\nDMs on resource-limited scenarios.\n","authors":["Xingyu Zheng","Haotong Qin","Xudong Ma","Mingyuan Zhang","Haojie Hao","Jiakai Wang","Zixiang Zhao","Jinyang Guo","Xianglong Liu"],"pdf_url":"https://arxiv.org/pdf/2404.05662v1.pdf","comment":"The code will soon be available at\n https://github.com/Xingyu-Zheng/BinaryDM"},{"id":"http://arxiv.org/abs/2404.05661v1","updated":"2024-04-08T16:46:07Z","published":"2024-04-08T16:46:07Z","title":"Automatic Controllable Colorization via Imagination","summary":" We propose a framework for automatic colorization that allows for iterative\nediting and modifications. The core of our framework lies in an imagination\nmodule: by understanding the content within a grayscale image, we utilize a\npre-trained image generation model to generate multiple images that contain the\nsame content. These images serve as references for coloring, mimicking the\nprocess of human experts. As the synthesized images can be imperfect or\ndifferent from the original grayscale image, we propose a Reference Refinement\nModule to select the optimal reference composition. Unlike most previous\nend-to-end automatic colorization algorithms, our framework allows for\niterative and localized modifications of the colorization results because we\nexplicitly model the coloring samples. Extensive experiments demonstrate the\nsuperiority of our framework over existing automatic colorization algorithms in\neditability and flexibility. Project page:\nhttps://xy-cong.github.io/imagine-colorization.\n","authors":["Xiaoyan Cong","Yue Wu","Qifeng Chen","Chenyang Lei"],"pdf_url":"https://arxiv.org/pdf/2404.05661v1.pdf","comment":"CVPR 2024. Project page:\n https://xy-cong.github.io/imagine-colorization"},{"id":"http://arxiv.org/abs/2404.05657v1","updated":"2024-04-08T16:40:15Z","published":"2024-04-08T16:40:15Z","title":"MLP Can Be A Good Transformer Learner","summary":" Self-attention mechanism is the key of the Transformer but often criticized\nfor its computation demands. Previous token pruning works motivate their\nmethods from the view of computation redundancy but still need to load the full\nnetwork and require same memory costs. This paper introduces a novel strategy\nthat simplifies vision transformers and reduces computational load through the\nselective removal of non-essential attention layers, guided by entropy\nconsiderations. We identify that regarding the attention layer in bottom\nblocks, their subsequent MLP layers, i.e. two feed-forward layers, can elicit\nthe same entropy quantity. Meanwhile, the accompanied MLPs are under-exploited\nsince they exhibit smaller feature entropy compared to those MLPs in the top\nblocks. 
Therefore, we propose to integrate the uninformative attention layers\ninto their subsequent counterparts by degenerating them into identical mapping,\nyielding only MLP in certain transformer blocks. Experimental results on\nImageNet-1k show that the proposed method can remove 40% attention layer of\nDeiT-B, improving throughput and memory bound without performance compromise.\nCode is available at https://github.com/sihaoevery/lambda_vit.\n","authors":["Sihao Lin","Pumeng Lyu","Dongrui Liu","Tao Tang","Xiaodan Liang","Andy Song","Xiaojun Chang"],"pdf_url":"https://arxiv.org/pdf/2404.05657v1.pdf","comment":"efficient transformer"},{"id":"http://arxiv.org/abs/2404.05641v1","updated":"2024-04-08T16:21:22Z","published":"2024-04-08T16:21:22Z","title":"3D-COCO: extension of MS-COCO dataset for image detection and 3D\n reconstruction modules","summary":" We introduce 3D-COCO, an extension of the original MS-COCO dataset providing\n3D models and 2D-3D alignment annotations. 3D-COCO was designed to achieve\ncomputer vision tasks such as 3D reconstruction or image detection configurable\nwith textual, 2D image, and 3D CAD model queries. We complete the existing\nMS-COCO dataset with 28K 3D models collected on ShapeNet and Objaverse. By\nusing an IoU-based method, we match each MS-COCO annotation with the best 3D\nmodels to provide a 2D-3D alignment. The open-source nature of 3D-COCO is a\npremiere that should pave the way for new research on 3D-related topics. The\ndataset and its source codes is available at\nhttps://kalisteo.cea.fr/index.php/coco3d-object-detection-and-reconstruction/\n","authors":["Maxence Bideaux","Alice Phe","Mohamed Chaouch","Bertrand Luvison","Quoc-Cuong Pham"],"pdf_url":"https://arxiv.org/pdf/2404.05641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06908v2","updated":"2024-04-08T16:16:56Z","published":"2024-03-11T17:00:27Z","title":"FreGS: 3D Gaussian Splatting with Progressive Frequency Regularization","summary":" 3D Gaussian splatting has achieved very impressive performance in real-time\nnovel view synthesis. However, it often suffers from over-reconstruction during\nGaussian densification where high-variance image regions are covered by a few\nlarge Gaussians only, leading to blur and artifacts in the rendered images. We\ndesign a progressive frequency regularization (FreGS) technique to tackle the\nover-reconstruction issue within the frequency space. Specifically, FreGS\nperforms coarse-to-fine Gaussian densification by exploiting low-to-high\nfrequency components that can be easily extracted with low-pass and high-pass\nfilters in the Fourier space. By minimizing the discrepancy between the\nfrequency spectrum of the rendered image and the corresponding ground truth, it\nachieves high-quality Gaussian densification and alleviates the\nover-reconstruction of Gaussian splatting effectively. Experiments over\nmultiple widely adopted benchmarks (e.g., Mip-NeRF360, Tanks-and-Temples and\nDeep Blending) show that FreGS achieves superior novel view synthesis and\noutperforms the state-of-the-art consistently.\n","authors":["Jiahui Zhang","Fangneng Zhan","Muyu Xu","Shijian Lu","Eric Xing"],"pdf_url":"https://arxiv.org/pdf/2403.06908v2.pdf","comment":"Accepted by CVPR 2024. 
Project website:\n https://rogeraigc.github.io/FreGS-Page/"},{"id":"http://arxiv.org/abs/2403.15238v2","updated":"2024-04-08T16:14:45Z","published":"2024-03-22T14:32:02Z","title":"WEEP: A method for spatial interpretation of weakly supervised CNN\n models in computational pathology","summary":" Deep learning enables the modelling of high-resolution histopathology\nwhole-slide images (WSI). Weakly supervised learning of tile-level data is\ntypically applied for tasks where labels only exist on the patient or WSI level\n(e.g. patient outcomes or histological grading). In this context, there is a\nneed for improved spatial interpretability of predictions from such models. We\npropose a novel method, Wsi rEgion sElection aPproach (WEEP), for model\ninterpretation. It provides a principled yet straightforward way to establish\nthe spatial area of WSI required for assigning a particular prediction label.\nWe demonstrate WEEP on a binary classification task in the area of breast\ncancer computational pathology. WEEP is easy to implement, is directly\nconnected to the model-based decision process, and offers information relevant\nto both research and diagnostic applications.\n","authors":["Abhinav Sharma","Bojing Liu","Mattias Rantalainen"],"pdf_url":"https://arxiv.org/pdf/2403.15238v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05626v1","updated":"2024-04-08T15:59:29Z","published":"2024-04-08T15:59:29Z","title":"Learning a Category-level Object Pose Estimator without Pose Annotations","summary":" 3D object pose estimation is a challenging task. Previous works always\nrequire thousands of object images with annotated poses for learning the 3D\npose correspondence, which is laborious and time-consuming for labeling. In\nthis paper, we propose to learn a category-level 3D object pose estimator\nwithout pose annotations. Instead of using manually annotated images, we\nleverage diffusion models (e.g., Zero-1-to-3) to generate a set of images under\ncontrolled pose differences and propose to learn our object pose estimator with\nthose images. Directly using the original diffusion model leads to images with\nnoisy poses and artifacts. To tackle this issue, firstly, we exploit an image\nencoder, which is learned from a specially designed contrastive pose learning,\nto filter the unreasonable details and extract image feature maps.\nAdditionally, we propose a novel learning strategy that allows the model to\nlearn object poses from those generated image sets without knowing the\nalignment of their canonical poses. Experimental results show that our method\nhas the capability of category-level object pose estimation from a single shot\nsetting (as pose definition), while significantly outperforming other\nstate-of-the-art methods on the few-shot category-level object pose estimation\nbenchmarks.\n","authors":["Fengrui Tian","Yaoyao Liu","Adam Kortylewski","Yueqi Duan","Shaoyi Du","Alan Yuille","Angtian Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05321v3","updated":"2024-04-08T15:59:11Z","published":"2022-09-12T15:26:13Z","title":"Deep Feature Statistics Mapping for Generalized Screen Content Image\n Quality Assessment","summary":" The statistical regularities of natural images, referred to as natural scene\nstatistics, play an important role in no-reference image quality assessment.\nHowever, it has been widely acknowledged that screen content images (SCIs),\nwhich are typically computer generated, do not hold such statistics. 
Here we\nmake the first attempt to learn the statistics of SCIs, based upon which the\nquality of SCIs can be effectively determined. The underlying mechanism of the\nproposed approach is based upon the mild assumption that the SCIs, which are\nnot physically acquired, still obey certain statistics that could be understood\nin a learning fashion. We empirically show that the statistics deviation could\nbe effectively leveraged in quality assessment, and the proposed method is\nsuperior when evaluated in different settings. Extensive experimental results\ndemonstrate the Deep Feature Statistics based SCI Quality Assessment (DFSS-IQA)\nmodel delivers promising performance compared with existing NR-IQA models and\nshows a high generalization capability in the cross-dataset settings. The\nimplementation of our method is publicly available at\nhttps://github.com/Baoliang93/DFSS-IQA.\n","authors":["Baoliang Chen","Hanwei Zhu","Lingyu Zhu","Shiqi Wang","Sam Kwong"],"pdf_url":"https://arxiv.org/pdf/2209.05321v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.14466v2","updated":"2024-04-08T15:51:37Z","published":"2022-07-29T04:10:22Z","title":"Towards Domain-agnostic Depth Completion","summary":" Existing depth completion methods are often targeted at a specific sparse\ndepth type and generalize poorly across task domains. We present a method to\ncomplete sparse/semi-dense, noisy, and potentially low-resolution depth maps\nobtained by various range sensors, including those in modern mobile phones, or\nby multi-view reconstruction algorithms. Our method leverages a data-driven\nprior in the form of a single image depth prediction network trained on\nlarge-scale datasets, the output of which is used as an input to our model. We\npropose an effective training scheme where we simulate various sparsity\npatterns in typical task domains. In addition, we design two new benchmarks to\nevaluate the generalizability and the robustness of depth completion methods.\nOur simple method shows superior cross-domain generalization ability against\nstate-of-the-art depth completion methods, introducing a practical solution to\nhigh-quality depth capture on a mobile device. The code is available at:\nhttps://github.com/YvanYin/FillDepth.\n","authors":["Guangkai Xu","Wei Yin","Jianming Zhang","Oliver Wang","Simon Niklaus","Simon Chen","Jia-Wang Bian"],"pdf_url":"https://arxiv.org/pdf/2207.14466v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05621v1","updated":"2024-04-08T15:51:21Z","published":"2024-04-08T15:51:21Z","title":"MULTIFLOW: Shifting Towards Task-Agnostic Vision-Language Pruning","summary":" While excellent in transfer learning, Vision-Language models (VLMs) come with\nhigh computational costs due to their large number of parameters. To address\nthis issue, removing parameters via model pruning is a viable solution.\nHowever, existing techniques for VLMs are task-specific, and thus require\npruning the network from scratch for each new task of interest. In this work,\nwe explore a new direction: Task-Agnostic Vision-Language Pruning (TA-VLP).\nGiven a pretrained VLM, the goal is to find a unique pruned counterpart\ntransferable to multiple unknown downstream tasks. In this challenging setting,\nthe transferable representations already encoded in the pretrained model are a\nkey aspect to preserve. 
Thus, we propose Multimodal Flow Pruning (MULTIFLOW), a\nfirst, gradient-free, pruning framework for TA-VLP where: (i) the importance of\na parameter is expressed in terms of its magnitude and its information flow, by\nincorporating the saliency of the neurons it connects; and (ii) pruning is\ndriven by the emergent (multimodal) distribution of the VLM parameters after\npretraining. We benchmark eight state-of-the-art pruning algorithms in the\ncontext of TA-VLP, experimenting with two VLMs, three vision-language tasks,\nand three pruning ratios. Our experimental results show that MULTIFLOW\noutperforms recent sophisticated, combinatorial competitors in the vast\nmajority of the cases, paving the way towards addressing TA-VLP. The code is\npublicly available at https://github.com/FarinaMatteo/multiflow.\n","authors":["Matteo Farina","Massimiliano Mancini","Elia Cunegatti","Gaowen Liu","Giovanni Iacca","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2404.05621v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2207.12080v4","updated":"2024-04-08T15:50:13Z","published":"2022-07-25T11:57:01Z","title":"Intention-Conditioned Long-Term Human Egocentric Action Forecasting","summary":" To anticipate how a human would act in the future, it is essential to\nunderstand the human intention since it guides the human towards a certain\ngoal. In this paper, we propose a hierarchical architecture which assumes a\nsequence of human action (low-level) can be driven from the human intention\n(high-level). Based on this, we deal with the Long-Term Action Anticipation task in\negocentric videos. Our framework first extracts two levels of human information\n(human actions and intentions) from the N observed videos through a Hierarchical Multi-task MLP\nMixer (H3M). Then, we condition the uncertainty of the future through an\nIntention-Conditioned Variational Auto-Encoder (I-CVAE) that generates K stable\npredictions of the next Z=20 actions that the observed human might perform. By\nleveraging human intention as high-level information, we claim that our model\nis able to anticipate more time-consistent actions in the long-term, thus\nimproving the results over baseline methods in the EGO4D Challenge. This work\nranked first in both CVPR@2022 and ECCV@2022 EGO4D LTA Challenge by providing\nmore plausible anticipated sequences, improving the anticipation of nouns and\noverall actions. Webpage: https://evm7.github.io/icvae-page/\n","authors":["Esteve Valls Mascaro","Hyemin Ahn","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2207.12080v4.pdf","comment":"Winner of CVPR@2022 and ECCV@2022 EGO4D LTA Challenge. Accepted in\n WACV2023. Webpage: https://evm7.github.io/icvae-page/"},{"id":"http://arxiv.org/abs/2302.08274v3","updated":"2024-04-08T15:48:50Z","published":"2023-02-16T13:06:39Z","title":"Robust Human Motion Forecasting using Transformer-based Model","summary":" Comprehending human motion is a fundamental challenge for developing\nHuman-Robot Collaborative applications. Computer vision researchers have\naddressed this field by focusing only on reducing prediction error, but not\ntaking into account the requirements to facilitate its implementation in\nrobots. In this paper, we propose a new Transformer-based model that\nsimultaneously deals with real-time 3D human motion forecasting in the\nshort and long term. 
Our 2-Channel Transformer (2CH-TR) efficiently\nexploits the spatio-temporal information of a briefly observed sequence (400ms)\nand achieves accuracy competitive with the current state-of-the-art.\n2CH-TR stands out for the efficient performance of the Transformer, being\nlighter and faster than its competitors. In addition, our model is tested in\nconditions where the human motion is severely occluded, demonstrating its\nrobustness in reconstructing and predicting 3D human motion in a highly noisy\nenvironment. Our experimental results show that the proposed 2CH-TR outperforms\nthe ST-Transformer, which is another state-of-the-art model based on the\nTransformer, in terms of reconstruction and prediction under the same\ninput-prefix conditions. Our model reduces the mean squared error\nof the ST-Transformer by 8.89% in short-term prediction and by 2.57% in long-term prediction\non the Human3.6M dataset with a 400ms input prefix. Webpage:\nhttps://evm7.github.io/2CHTR-page/\n","authors":["Esteve Valls Mascaro","Shuo Ma","Hyemin Ahn","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2302.08274v3.pdf","comment":"Accepted to IROS2022. Webpage: https://evm7.github.io/2CHTR-page/"},{"id":"http://arxiv.org/abs/2308.07301v2","updated":"2024-04-08T15:47:20Z","published":"2023-08-14T17:39:44Z","title":"A Unified Masked Autoencoder with Patchified Skeletons for Motion\n Synthesis","summary":" The synthesis of human motion has traditionally been addressed through\ntask-dependent models that focus on specific challenges, such as predicting\nfuture motions or filling in intermediate poses conditioned on known key-poses.\nIn this paper, we present a novel task-independent model called UNIMASK-M,\nwhich can effectively address these challenges using a unified architecture.\nOur model obtains performance comparable to or better than the state-of-the-art in\neach field. Inspired by Vision Transformers (ViTs), our UNIMASK-M model\ndecomposes a human pose into body parts to leverage the spatio-temporal\nrelationships existing in human motion. Moreover, we reformulate various\npose-conditioned motion synthesis tasks as a reconstruction problem with\ndifferent masking patterns given as input. By explicitly informing our model\nabout the masked joints, our UNIMASK-M becomes more robust to occlusions.\nExperimental results show that our model successfully forecasts human motion on\nthe Human3.6M dataset. Moreover, it achieves state-of-the-art results in motion\ninbetweening on the LaFAN1 dataset, particularly in long transition periods.\nMore information can be found on the project website\nhttps://evm7.github.io/UNIMASKM-page/\n","authors":["Esteve Valls Mascaro","Hyemin Ahn","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2308.07301v2.pdf","comment":"Accepted to AAAI2024. Webpage: https://evm7.github.io/UNIMASKM-page/"},{"id":"http://arxiv.org/abs/2309.16524v2","updated":"2024-04-08T15:46:09Z","published":"2023-09-28T15:34:49Z","title":"HOI4ABOT: Human-Object Interaction Anticipation for Human Intention\n Reading Collaborative roBOTs","summary":" Robots are becoming increasingly integrated into our lives, assisting us in\nvarious tasks. To ensure effective collaboration between humans and robots, it\nis essential that they understand our intentions and anticipate our actions. In\nthis paper, we propose a Human-Object Interaction (HOI) anticipation framework\nfor collaborative robots. We propose an efficient and robust transformer-based\nmodel to detect and anticipate HOIs from videos. 
This enhanced anticipation\nempowers robots to proactively assist humans, resulting in more efficient and\nintuitive collaborations. Our model outperforms state-of-the-art results in HOI\ndetection and anticipation in VidHOI dataset with an increase of 1.76% and\n1.04% in mAP respectively while being 15.4 times faster. We showcase the\neffectiveness of our approach through experimental results in a real robot,\ndemonstrating that the robot's ability to anticipate HOIs is key for better\nHuman-Robot Interaction. More information can be found on our project webpage:\nhttps://evm7.github.io/HOI4ABOT_page/\n","authors":["Esteve Valls Mascaro","Daniel Sliwowski","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2309.16524v2.pdf","comment":"Proceedings in Conference on Robot Learning 2023. Webpage:\n https://evm7.github.io/HOI4ABOT_page/"},{"id":"http://arxiv.org/abs/2402.04768v2","updated":"2024-04-08T15:43:14Z","published":"2024-02-07T11:37:14Z","title":"Robot Interaction Behavior Generation based on Social Motion Forecasting\n for Human-Robot Interaction","summary":" Integrating robots into populated environments is a complex challenge that\nrequires an understanding of human social dynamics. In this work, we propose to\nmodel social motion forecasting in a shared human-robot representation space,\nwhich facilitates us to synthesize robot motions that interact with humans in\nsocial scenarios despite not observing any robot in the motion training. We\ndevelop a transformer-based architecture called ECHO, which operates in the\naforementioned shared space to predict the future motions of the agents\nencountered in social scenarios. Contrary to prior works, we reformulate the\nsocial motion problem as the refinement of the predicted individual motions\nbased on the surrounding agents, which facilitates the training while allowing\nfor single-motion forecasting when only one human is in the scene. We evaluate\nour model in multi-person and human-robot motion forecasting tasks and obtain\nstate-of-the-art performance by a large margin while being efficient and\nperforming in real-time. Additionally, our qualitative results showcase the\neffectiveness of our approach in generating human-robot interaction behaviors\nthat can be controlled via text commands. Webpage: https://evm7.github.io/ECHO/\n","authors":["Esteve Valls Mascaro","Yashuai Yan","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2402.04768v2.pdf","comment":"Accepted at ICRA 2024. Webpage: https://evm7.github.io/ECHO/"},{"id":"http://arxiv.org/abs/2404.05607v1","updated":"2024-04-08T15:29:46Z","published":"2024-04-08T15:29:46Z","title":"A Training-Free Plug-and-Play Watermark Framework for Stable Diffusion","summary":" Nowadays, the family of Stable Diffusion (SD) models has gained prominence\nfor its high quality outputs and scalability. This has also raised security\nconcerns on social media, as malicious users can create and disseminate harmful\ncontent. Existing approaches involve training components or entire SDs to embed\na watermark in generated images for traceability and responsibility\nattribution. However, in the era of AI-generated content (AIGC), the rapid\niteration of SDs renders retraining with watermark models costly. To address\nthis, we propose a training-free plug-and-play watermark framework for SDs.\nWithout modifying any components of SDs, we embed diverse watermarks in the\nlatent space, adapting to the denoising process. 
Our experimental findings\nreveal that our method effectively harmonizes image quality and watermark\ninvisibility. Furthermore, it performs robustly under various attacks. We have\nalso validated that our method generalizes to multiple versions of SDs, even\nwithout retraining the watermark model.\n","authors":["Guokai Zhang","Lanjun Wang","Yuting Su","An-An Liu"],"pdf_url":"https://arxiv.org/pdf/2404.05607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05606v1","updated":"2024-04-08T15:25:50Z","published":"2024-04-08T15:25:50Z","title":"Learning Topology Uniformed Face Mesh by Volume Rendering for Multi-view\n Reconstruction","summary":" Face meshes in consistent topology serve as the foundation for many\nface-related applications, such as 3DMM constrained face reconstruction and\nexpression retargeting. Traditional methods commonly acquire topology uniformed\nface meshes by two separate steps: multi-view stereo (MVS) to reconstruct\nshapes followed by non-rigid registration to align topology, but struggle with\nhandling noise and non-Lambertian surfaces. Recently, neural volume rendering\ntechniques have rapidly evolved and shown great advantages in 3D\nreconstruction or novel view synthesis. Our goal is to leverage the superiority\nof neural volume rendering for multi-view reconstruction of face meshes with\nconsistent topology. We propose a mesh volume rendering method that enables\ndirectly optimizing mesh geometry while preserving topology, and learning\nimplicit features to model complex facial appearance from multi-view images.\nThe key innovation lies in spreading sparse mesh features into the surrounding\nspace to simulate the radiance field required for volume rendering, which\nfacilitates backpropagation of gradients from images to mesh geometry and\nimplicit appearance features. Our proposed feature spreading module exhibits\ndeformation invariance, enabling photorealistic rendering seamlessly after mesh\nediting. We conduct experiments on a multi-view face image dataset to evaluate\nthe reconstruction and implement an application for photorealistic rendering of\nanimated face meshes.\n","authors":["Yating Wang","Ran Yi","Ke Fan","Jinkun Hao","Jiangbo Lu","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2404.05606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05603v1","updated":"2024-04-08T15:22:38Z","published":"2024-04-08T15:22:38Z","title":"Self-Explainable Affordance Learning with Embodied Caption","summary":" In the field of visual affordance learning, previous methods mainly used\nabundant images or videos that delineate human behavior patterns to identify\naction possibility regions for object manipulation, with a variety of\napplications in robotic tasks. However, they encounter a main challenge of\naction ambiguity, illustrated by vagueness such as whether to beat or carry a\ndrum, and the complexities involved in processing intricate scenes. Moreover,\nit is important for human intervention to rectify robot errors in time. To\naddress these issues, we introduce Self-Explainable Affordance learning (SEA)\nwith embodied caption. This innovation enables robots to articulate their\nintentions and bridge the gap between explainable vision-language captioning and\nvisual affordance learning. Due to the lack of an appropriate dataset, we unveil a\npioneering dataset and metrics tailored for this task, which integrates images,\nheatmaps, and embodied captions. 
Furthermore, we propose a novel model to\neffectively combine affordance grounding with self-explanation in a simple but\nefficient manner. Extensive quantitative and qualitative experiments\ndemonstrate our method's effectiveness.\n","authors":["Zhipeng Zhang","Zhimin Wei","Guolei Sun","Peng Wang","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2404.05603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00722v3","updated":"2024-04-08T15:15:56Z","published":"2024-03-31T15:34:45Z","title":"DRCT: Saving Image Super-resolution away from Information Bottleneck","summary":" In recent years, Vision Transformer-based applications to low-level vision\ntasks have achieved widespread success. Unlike CNN-based models, Transformers\nare more adept at capturing long-range dependencies, enabling the\nreconstruction of images utilizing information from non-local areas. In the\ndomain of super-resolution, Swin-transformer-based approaches have become\nmainstream due to their capacity to capture global spatial information and\ntheir shifting-window attention mechanism that facilitates the interchange of\ninformation between different windows. Many researchers have enhanced image\nquality and network efficiency by expanding the receptive field or designing\ncomplex networks, yielding commendable results. However, we observed that\nspatial information tends to diminish during the forward propagation process\ndue to increased depth, leading to a loss of spatial information and,\nconsequently, limiting the model's potential. To address this, we propose the\nDense-residual-connected Transformer (DRCT), aimed at mitigating the loss of\nspatial information through dense-residual connections between layers, thereby\nunleashing the model's potential and enhancing performance. Experiment results\nindicate that our approach is not only straightforward but also achieves\nremarkable efficiency, surpassing state-of-the-art methods and performing\ncommendably at NTIRE2024.\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Yi-Shiuan Chou"],"pdf_url":"https://arxiv.org/pdf/2404.00722v3.pdf","comment":"NTIRE 2024 Image Super-resolution (x4)"},{"id":"http://arxiv.org/abs/2404.05595v1","updated":"2024-04-08T15:14:20Z","published":"2024-04-08T15:14:20Z","title":"UniFL: Improve Stable Diffusion via Unified Feedback Learning","summary":" Diffusion models have revolutionized the field of image generation, leading\nto the proliferation of high-quality models and diverse downstream\napplications. However, despite these significant advancements, the current\ncompetitive solutions still suffer from several limitations, including inferior\nvisual quality, a lack of aesthetic appeal, and inefficient inference, without\na comprehensive solution in sight. To address these challenges, we present\nUniFL, a unified framework that leverages feedback learning to enhance\ndiffusion models comprehensively. UniFL stands out as a universal, effective,\nand generalizable solution applicable to various diffusion models, such as\nSD1.5 and SDXL. Notably, UniFL incorporates three key components: perceptual\nfeedback learning, which enhances visual quality; decoupled feedback learning,\nwhich improves aesthetic appeal; and adversarial feedback learning, which\noptimizes inference speed. In-depth experiments and extensive user studies\nvalidate the superior performance of our proposed method in enhancing both the\nquality of generated models and their acceleration. 
For instance, UniFL\nsurpasses ImageReward by 17% user preference in terms of generation quality and\noutperforms LCM and SDXL Turbo by 57% and 20% in 4-step inference. Moreover, we\nhave verified the efficacy of our approach in downstream tasks, including Lora,\nControlNet, and AnimateDiff.\n","authors":["Jiacheng Zhang","Jie Wu","Yuxi Ren","Xin Xia","Huafeng Kuang","Pan Xie","Jiashi Li","Xuefeng Xiao","Weilin Huang","Min Zheng","Lean Fu","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2404.05595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05584v1","updated":"2024-04-08T14:59:53Z","published":"2024-04-08T14:59:53Z","title":"Neural Cellular Automata for Lightweight, Robust and Explainable\n Classification of White Blood Cell Images","summary":" Diagnosis of hematological malignancies depends on accurate identification of\nwhite blood cells in peripheral blood smears. Deep learning techniques are\nemerging as a viable solution to scale and optimize this process by automatic\nidentification of cells in laboratories. However, these techniques face several\nchallenges such as limited generalizability, sensitivity to domain shifts and\nlack of explainability. Here, we introduce a novel approach based on\nneural cellular automata (NCA) for white blood cell classification. We test our\napproach on three datasets of white blood cell images and show that we achieve\ncompetitive performance compared to conventional methods. Our NCA-based method\nis significantly smaller in terms of parameters and exhibits robustness to\ndomain shifts. Furthermore, the architecture is inherently explainable,\nproviding insights into the decision process for each classification, helping\nexperts understand and validate model predictions. Results demonstrate that NCA\ncan not only be used for image classification, but also addresses key challenges\nof conventional methods, indicating a high potential for applicability in\nclinical practice.\n","authors":["Michael Deutges","Ario Sadafi","Nassir Navab","Carsten Marr"],"pdf_url":"https://arxiv.org/pdf/2404.05584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05583v1","updated":"2024-04-08T14:58:52Z","published":"2024-04-08T14:58:52Z","title":"Towards More General Video-based Deepfake Detection through Facial\n Feature Guided Adaptation for Foundation Model","summary":" With the rise of deep learning, generative models have enabled the creation\nof highly realistic synthetic images, presenting challenges due to their\npotential misuse. While research in Deepfake detection has grown rapidly in\nresponse, many detection methods struggle with unseen Deepfakes generated by\nnew synthesis techniques. To address this generalisation challenge, we propose\na novel Deepfake detection approach that adapts the rich information encoded inside\nFoundation Models, specifically using\nthe image encoder from CLIP, which has demonstrated strong zero-shot capability\nfor downstream tasks. 
Inspired by recent advances in parameter-efficient\nfine-tuning, we propose a novel side-network-based decoder to extract spatial\nand temporal cues from the given video clip, with Facial\nComponent Guidance (FCG) to encourage the spatial features to include\nfeatures of key facial parts for more robust and general Deepfake detection.\nThrough extensive cross-dataset evaluations, our approach exhibits superior\neffectiveness in identifying unseen Deepfake samples, achieving notable\nperformance improvements even with limited training samples and\nmanipulation types. Our model secures an average performance enhancement of\n0.9% AUROC in cross-dataset assessments compared with state-of-the-art\nmethods, establishing a significant lead with a 4.4% improvement\non the challenging DFDC dataset.\n","authors":["Yue-Hua Han","Tai-Ming Huang","Shu-Tzu Lo","Po-Han Huang","Kai-Lung Hua","Jun-Cheng Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05580v1","updated":"2024-04-08T14:56:26Z","published":"2024-04-08T14:56:26Z","title":"Responsible Visual Editing","summary":" With recent advancements in visual synthesis, there is a growing risk of\nencountering images with detrimental effects, such as hate, discrimination, or\nprivacy violations. The research on transforming harmful images into\nresponsible ones remains unexplored. In this paper, we formulate a new task,\nresponsible visual editing, which entails modifying specific concepts within an\nimage to render it more responsible while minimizing changes. However, the\nconcept that needs to be edited is often abstract, making it challenging to\nlocate what needs to be modified and plan how to modify it. To tackle these\nchallenges, we propose a Cognitive Editor (CoEditor) that harnesses the large\nmultimodal model through a two-stage cognitive process: (1) a perceptual\ncognitive process to focus on what needs to be modified and (2) a behavioral\ncognitive process to strategize how to modify. To mitigate the negative\nimplications of harmful images on research, we create a transparent and public\ndataset, AltBear, which expresses harmful information using teddy bears instead\nof humans. Experiments demonstrate that CoEditor can effectively comprehend\nabstract concepts within complex scenes and significantly surpass the\nperformance of baseline models for responsible visual editing. We find that the\nAltBear dataset corresponds well to the harmful content found in real images,\noffering a consistent experimental evaluation, thereby providing a safer\nbenchmark for future research. Moreover, CoEditor also shows great results in\ngeneral editing. We release our code and dataset at\nhttps://github.com/kodenii/Responsible-Visual-Editing.\n","authors":["Minheng Ni","Yeli Shen","Lei Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.05580v1.pdf","comment":"24 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.05579v1","updated":"2024-04-08T14:55:35Z","published":"2024-04-08T14:55:35Z","title":"Robust Data Pruning: Uncovering and Overcoming Implicit Bias","summary":" In the era of exceptionally data-hungry models, careful selection of the\ntraining data is essential to mitigate the extensive costs of deep learning.\nData pruning offers a solution by removing redundant or uninformative samples\nfrom the dataset, which yields faster convergence and improved neural scaling\nlaws. 
However, little is known about its impact on classification bias of the\ntrained models. We conduct the first systematic study of this effect and reveal\nthat existing data pruning algorithms can produce highly biased classifiers. At\nthe same time, we argue that random data pruning with appropriate class ratios\nhas potential to improve the worst-class performance. We propose a\n\"fairness-aware\" approach to pruning and empirically demonstrate its\nperformance on standard computer vision benchmarks. In sharp contrast to\nexisting algorithms, our proposed method continues improving robustness at a\ntolerable drop of average performance as we prune more from the datasets. We\npresent theoretical analysis of the classification risk in a mixture of\nGaussians to further motivate our algorithm and support our findings.\n","authors":["Artem Vysogorets","Kartik Ahuja","Julia Kempe"],"pdf_url":"https://arxiv.org/pdf/2404.05579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05578v1","updated":"2024-04-08T14:54:54Z","published":"2024-04-08T14:54:54Z","title":"Social-MAE: Social Masked Autoencoder for Multi-person Motion\n Representation Learning","summary":" For a complete comprehension of multi-person scenes, it is essential to go\nbeyond basic tasks like detection and tracking. Higher-level tasks, such as\nunderstanding the interactions and social activities among individuals, are\nalso crucial. Progress towards models that can fully understand scenes\ninvolving multiple people is hindered by a lack of sufficient annotated data\nfor such high-level tasks. To address this challenge, we introduce Social-MAE,\na simple yet effective transformer-based masked autoencoder framework for\nmulti-person human motion data. The framework uses masked modeling to pre-train\nthe encoder to reconstruct masked human joint trajectories, enabling it to\nlearn generalizable and data efficient representations of motion in human\ncrowded scenes. Social-MAE comprises a transformer as the MAE encoder and a\nlighter-weight transformer as the MAE decoder which operates on multi-person\njoints' trajectory in the frequency domain. After the reconstruction task, the\nMAE decoder is replaced with a task-specific decoder and the model is\nfine-tuned end-to-end for a variety of high-level social tasks. Our proposed\nmodel combined with our pre-training approach achieves the state-of-the-art\nresults on various high-level social tasks, including multi-person pose\nforecasting, social grouping, and social action understanding. These\nimprovements are demonstrated across four popular multi-person datasets\nencompassing both human 2D and 3D body pose.\n","authors":["Mahsa Ehsanpour","Ian Reid","Hamid Rezatofighi"],"pdf_url":"https://arxiv.org/pdf/2404.05578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16741v2","updated":"2024-04-08T14:42:15Z","published":"2024-01-30T04:39:32Z","title":"MESA: Matching Everything by Segmenting Anything","summary":" Feature matching is a crucial task in the field of computer vision, which\ninvolves finding correspondences between images. Previous studies achieve\nremarkable performance using learning-based feature comparison. However, the\npervasive presence of matching redundancy between images gives rise to\nunnecessary and error-prone computations in these methods, imposing limitations\non their accuracy. To address this issue, we propose MESA, a novel approach to\nestablish precise area (or region) matches for efficient matching redundancy\nreduction. 
MESA first leverages the advanced image understanding capability of\nSAM, a state-of-the-art foundation model for image segmentation, to obtain\nimage areas with implicit semantic. Then, a multi-relational graph is proposed\nto model the spatial structure of these areas and construct their scale\nhierarchy. Based on graphical models derived from the graph, the area matching\nis reformulated as an energy minimization task and effectively resolved.\nExtensive experiments demonstrate that MESA yields substantial precision\nimprovement for multiple point matchers in indoor and outdoor downstream tasks,\ne.g. +13.61% for DKM in indoor pose estimation.\n","authors":["Yesheng Zhang","Xu Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.16741v2.pdf","comment":"CVPR24"},{"id":"http://arxiv.org/abs/2312.01068v2","updated":"2024-04-08T14:33:12Z","published":"2023-12-02T08:34:22Z","title":"DPHMs: Diffusion Parametric Head Models for Depth-based Tracking","summary":" We introduce Diffusion Parametric Head Models (DPHMs), a generative model\nthat enables robust volumetric head reconstruction and tracking from monocular\ndepth sequences. While recent volumetric head models, such as NPHMs, can now\nexcel in representing high-fidelity head geometries, tracking and\nreconstructing heads from real-world single-view depth sequences remains very\nchallenging, as the fitting to partial and noisy observations is\nunderconstrained. To tackle these challenges, we propose a latent\ndiffusion-based prior to regularize volumetric head reconstruction and\ntracking. This prior-based regularizer effectively constrains the identity and\nexpression codes to lie on the underlying latent manifold which represents\nplausible head shapes. To evaluate the effectiveness of the diffusion-based\nprior, we collect a dataset of monocular Kinect sequences consisting of various\ncomplex facial expression motions and rapid transitions. We compare our method\nto state-of-the-art tracking methods and demonstrate improved head identity\nreconstruction as well as robust expression tracking.\n","authors":["Jiapeng Tang","Angela Dai","Yinyu Nie","Lev Markhasin","Justus Thies","Matthias Niessner"],"pdf_url":"https://arxiv.org/pdf/2312.01068v2.pdf","comment":"CVPR 2024; homepage: https://tangjiapeng.github.io/projects/DPHMs/"},{"id":"http://arxiv.org/abs/2404.05559v1","updated":"2024-04-08T14:30:42Z","published":"2024-04-08T14:30:42Z","title":"TIM: A Time Interval Machine for Audio-Visual Action Recognition","summary":" Diverse actions give rise to rich audio-visual signals in long videos. Recent\nworks showcase that the two modalities of audio and video exhibit different\ntemporal extents of events and distinct labels. We address the interplay\nbetween the two modalities in long videos by explicitly modelling the temporal\nextents of audio and visual events. We propose the Time Interval Machine (TIM)\nwhere a modality-specific time interval poses as a query to a transformer\nencoder that ingests a long video input. The encoder then attends to the\nspecified interval, as well as the surrounding context in both modalities, in\norder to recognise the ongoing action.\n We test TIM on three long audio-visual video datasets: EPIC-KITCHENS,\nPerception Test, and AVE, reporting state-of-the-art (SOTA) for recognition. On\nEPIC-KITCHENS, we beat previous SOTA that utilises LLMs and significantly\nlarger pre-training by 2.9% top-1 action recognition accuracy. 
Additionally, we\nshow that TIM can be adapted for action detection, using dense multi-scale\ninterval queries, outperforming SOTA on EPIC-KITCHENS-100 for most metrics, and\nshowing strong performance on the Perception Test. Our ablations show the\ncritical role of integrating the two modalities and modelling their time\nintervals in achieving this performance. Code and models at:\nhttps://github.com/JacobChalk/TIM\n","authors":["Jacob Chalk","Jaesung Huh","Evangelos Kazakos","Andrew Zisserman","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2404.05559v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2307.06206v2","updated":"2024-04-08T14:26:52Z","published":"2023-07-12T14:52:21Z","title":"SepVAE: a contrastive VAE to separate pathological patterns from healthy\n ones","summary":" Contrastive Analysis VAE (CA-VAEs) is a family of Variational auto-encoders\n(VAEs) that aims at separating the common factors of variation between a\nbackground dataset (BG) (i.e., healthy subjects) and a target dataset (TG)\n(i.e., patients) from the ones that only exist in the target dataset. To do so,\nthese methods separate the latent space into a set of salient features (i.e.,\nproper to the target dataset) and a set of common features (i.e., exist in both\ndatasets). Currently, all models fail to prevent the sharing of information\nbetween latent spaces effectively and to capture all salient factors of\nvariation. To this end, we introduce two crucial regularization losses: a\ndisentangling term between common and salient representations and a\nclassification term between background and target samples in the salient space.\nWe show a better performance than previous CA-VAEs methods on three medical\napplications and a natural images dataset (CelebA). Code and datasets are\navailable on GitHub https://github.com/neurospin-projects/2023_rlouiset_sepvae.\n","authors":["Robin Louiset","Edouard Duchesnay","Antoine Grigis","Benoit Dufumier","Pietro Gori"],"pdf_url":"https://arxiv.org/pdf/2307.06206v2.pdf","comment":"Workshop on Interpretable ML in Healthcare at International\n Conference on Machine Learning (ICML), Honolulu, Hawaii, USA. 2023"},{"id":"http://arxiv.org/abs/2308.16018v4","updated":"2024-04-08T14:09:27Z","published":"2023-08-30T13:20:54Z","title":"SiT-MLP: A Simple MLP with Point-wise Topology Feature Learning for\n Skeleton-based Action Recognition","summary":" Graph convolution networks (GCNs) have achieved remarkable performance in\nskeleton-based action recognition. However, previous GCN-based methods rely on\nelaborate human priors excessively and construct complex feature aggregation\nmechanisms, which limits the generalizability and effectiveness of networks. To\nsolve these problems, we propose a novel Spatial Topology Gating Unit (STGU),\nan MLP-based variant without extra priors, to capture the co-occurrence\ntopology features that encode the spatial dependency across all joints. In\nSTGU, to learn the point-wise topology features, a new gate-based feature\ninteraction mechanism is introduced to activate the features point-to-point by\nthe attention map generated from the input sample. Based on the STGU, we\npropose the first MLP-based model, SiT-MLP, for skeleton-based action\nrecognition in this work. Compared with previous methods on three large-scale\ndatasets, SiT-MLP achieves competitive performance. In addition, SiT-MLP\nreduces the parameters significantly with favorable results. 
The code will be\navailable at https://github.com/BUPTSJZhang/SiT?MLP.\n","authors":["Shaojie Zhang","Jianqin Yin","Yonghao Dang","Jiajun Fu"],"pdf_url":"https://arxiv.org/pdf/2308.16018v4.pdf","comment":"Accepted by IEEE TCSVT 2024"},{"id":"http://arxiv.org/abs/2312.07526v2","updated":"2024-04-08T13:40:43Z","published":"2023-12-12T18:55:29Z","title":"RTMO: Towards High-Performance One-Stage Real-Time Multi-Person Pose\n Estimation","summary":" Real-time multi-person pose estimation presents significant challenges in\nbalancing speed and precision. While two-stage top-down methods slow down as\nthe number of people in the image increases, existing one-stage methods often\nfail to simultaneously deliver high accuracy and real-time performance. This\npaper introduces RTMO, a one-stage pose estimation framework that seamlessly\nintegrates coordinate classification by representing keypoints using dual 1-D\nheatmaps within the YOLO architecture, achieving accuracy comparable to\ntop-down methods while maintaining high speed. We propose a dynamic coordinate\nclassifier and a tailored loss function for heatmap learning, specifically\ndesigned to address the incompatibilities between coordinate classification and\ndense prediction models. RTMO outperforms state-of-the-art one-stage pose\nestimators, achieving 1.1% higher AP on COCO while operating about 9 times\nfaster with the same backbone. Our largest model, RTMO-l, attains 74.8% AP on\nCOCO val2017 and 141 FPS on a single V100 GPU, demonstrating its efficiency and\naccuracy. The code and models are available at\nhttps://github.com/open-mmlab/mmpose/tree/main/projects/rtmo.\n","authors":["Peng Lu","Tao Jiang","Yining Li","Xiangtai Li","Kai Chen","Wenming Yang"],"pdf_url":"https://arxiv.org/pdf/2312.07526v2.pdf","comment":"Accepted at CVPR 2024. Project page:\n https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo"},{"id":"http://arxiv.org/abs/2404.05519v1","updated":"2024-04-08T13:40:01Z","published":"2024-04-08T13:40:01Z","title":"Investigating the Effectiveness of Cross-Attention to Unlock Zero-Shot\n Editing of Text-to-Video Diffusion Models","summary":" With recent advances in image and video diffusion models for content\ncreation, a plethora of techniques have been proposed for customizing their\ngenerated content. In particular, manipulating the cross-attention layers of\nText-to-Image (T2I) diffusion models has shown great promise in controlling the\nshape and location of objects in the scene. Transferring image-editing\ntechniques to the video domain, however, is extremely challenging as object\nmotion and temporal consistency are difficult to capture accurately. In this\nwork, we take a first look at the role of cross-attention in Text-to-Video\n(T2V) diffusion models for zero-shot video editing. While one-shot models have\nshown potential in controlling motion and camera movement, we demonstrate\nzero-shot control over object shape, position and movement in T2V models. 
We\nshow that despite the limitations of current T2V models, cross-attention\nguidance can be a promising approach for editing videos.\n","authors":["Saman Motamed","Wouter Van Gansbeke","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2404.05519v1.pdf","comment":"Generative Models for Computer Vision CVPR 2024 Workshop"},{"id":"http://arxiv.org/abs/2404.05518v1","updated":"2024-04-08T13:39:12Z","published":"2024-04-08T13:39:12Z","title":"DepthMOT: Depth Cues Lead to a Strong Multi-Object Tracker","summary":" Accurately distinguishing each object is a fundamental goal of Multi-object\ntracking (MOT) algorithms. However, achieving this goal remains\nchallenging, primarily due to: (i) For crowded scenes with occluded objects,\nthe high overlap of object bounding boxes leads to confusion among closely\nlocated objects. Nevertheless, humans naturally perceive the depth of elements\nin a scene when observing 2D videos. Inspired by this, even though the bounding\nboxes of objects are close on the camera plane, we can differentiate them in\nthe depth dimension, thereby establishing a 3D perception of the objects. (ii)\nFor videos with rapidly irregular camera motion, abrupt changes in object\npositions can result in ID switches. However, if the camera pose is known, we\ncan compensate for the errors in linear motion models. In this paper, we\npropose \\textit{DepthMOT}, which achieves: (i) detecting and estimating the scene\ndepth map \\textit{end-to-end}, (ii) compensating for the irregular camera motion by\ncamera pose estimation. Extensive experiments demonstrate the superior\nperformance of DepthMOT on the VisDrone-MOT and UAVDT datasets. The code will be\navailable at \\url{https://github.com/JackWoo0831/DepthMOT}.\n","authors":["Jiapeng Wu","Yichen Liu"],"pdf_url":"https://arxiv.org/pdf/2404.05518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05512v1","updated":"2024-04-08T13:35:14Z","published":"2024-04-08T13:35:14Z","title":"Impact of LiDAR visualisations on semantic segmentation of\n archaeological objects","summary":" Deep learning methods in LiDAR-based archaeological research often leverage\nvisualisation techniques derived from Digital Elevation Models to enhance\ncharacteristics of archaeological objects present in the images. This paper\ninvestigates the impact of visualisations on deep learning performance through\na comprehensive testing framework. The study involves the use of eight semantic\nsegmentation models to evaluate seven diverse visualisations across two study\nareas, encompassing five archaeological classes. Experimental results reveal\nthat the choice of appropriate visualisations can influence performance by up\nto 8%. Yet, pinpointing one visualisation that outperforms the others in\nsegmenting all archaeological classes proves challenging. 
The observed\nperformance variation, reaching up to 25% across different model\nconfigurations, underscores the importance of thoughtfully selecting model\nconfigurations and LiDAR visualisations for successfully segmenting\narchaeological objects.\n","authors":["Raveerat Jaturapitpornchai","Giulio Poggi","Gregory Sech","Ziga Kokalj","Marco Fiorucci","Arianna Traviglia"],"pdf_url":"https://arxiv.org/pdf/2404.05512v1.pdf","comment":"Accepted to IEEE International Geoscience and Remote Sensing\n Symposium 2024 (IGARSS 2024) @IEEE copyright"},{"id":"http://arxiv.org/abs/2404.05505v1","updated":"2024-04-08T13:27:07Z","published":"2024-04-08T13:27:07Z","title":"Taming Transformers for Realistic Lidar Point Cloud Generation","summary":" Diffusion Models (DMs) have achieved State-Of-The-Art (SOTA) results in the\nLidar point cloud generation task, benefiting from their stable training and\niterative refinement during sampling. However, DMs often fail to realistically\nmodel Lidar raydrop noise due to their inherent denoising process. To retain\nthe strength of iterative sampling while enhancing the generation of raydrop\nnoise, we introduce LidarGRIT, a generative model that uses auto-regressive\ntransformers to iteratively sample the range images in the latent space rather\nthan image space. Furthermore, LidarGRIT utilises VQ-VAE to separately decode\nrange images and raydrop masks. Our results show that LidarGRIT achieves\nsuperior performance compared to SOTA models on KITTI-360 and KITTI odometry\ndatasets. Code available at:https://github.com/hamedhaghighi/LidarGRIT.\n","authors":["Hamed Haghighi","Amir Samadi","Mehrdad Dianati","Valentina Donzella","Kurt Debattista"],"pdf_url":"https://arxiv.org/pdf/2404.05505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08077v2","updated":"2024-04-08T13:23:47Z","published":"2023-11-14T11:05:08Z","title":"Zero-Shot Segmentation of Eye Features Using the Segment Anything Model\n (SAM)","summary":" The advent of foundation models signals a new era in artificial intelligence.\nThe Segment Anything Model (SAM) is the first foundation model for image\nsegmentation. In this study, we evaluate SAM's ability to segment features from\neye images recorded in virtual reality setups. The increasing requirement for\nannotated eye-image datasets presents a significant opportunity for SAM to\nredefine the landscape of data annotation in gaze estimation. Our investigation\ncenters on SAM's zero-shot learning abilities and the effectiveness of prompts\nlike bounding boxes or point clicks. Our results are consistent with studies in\nother domains, demonstrating that SAM's segmentation effectiveness can be\non-par with specialized models depending on the feature, with prompts improving\nits performance, evidenced by an IoU of 93.34% for pupil segmentation in one\ndataset. Foundation models like SAM could revolutionize gaze estimation by\nenabling quick and easy image segmentation, reducing reliance on specialized\nmodels and extensive manual annotation.\n","authors":["Virmarie Maquiling","Sean Anthony Byrne","Diederick C. 
Niehorster","Marcus Nyström","Enkelejda Kasneci"],"pdf_url":"https://arxiv.org/pdf/2311.08077v2.pdf","comment":"14 pages, 8 figures, 1 table, Accepted to ETRA 2024: ACM Symposium on\n Eye Tracking Research & Applications"},{"id":"http://arxiv.org/abs/2311.16728v2","updated":"2024-04-08T13:17:05Z","published":"2023-11-28T12:19:00Z","title":"Photo-SLAM: Real-time Simultaneous Localization and Photorealistic\n Mapping for Monocular, Stereo, and RGB-D Cameras","summary":" The integration of neural rendering and the SLAM system recently showed\npromising results in joint localization and photorealistic view reconstruction.\nHowever, existing methods, fully relying on implicit representations, are so\nresource-hungry that they cannot run on portable devices, which deviates from\nthe original intention of SLAM. In this paper, we present Photo-SLAM, a novel\nSLAM framework with a hyper primitives map. Specifically, we simultaneously\nexploit explicit geometric features for localization and learn implicit\nphotometric features to represent the texture information of the observed\nenvironment. In addition to actively densifying hyper primitives based on\ngeometric features, we further introduce a Gaussian-Pyramid-based training\nmethod to progressively learn multi-level features, enhancing photorealistic\nmapping performance. The extensive experiments with monocular, stereo, and\nRGB-D datasets prove that our proposed system Photo-SLAM significantly\noutperforms current state-of-the-art SLAM systems for online photorealistic\nmapping, e.g., PSNR is 30% higher and rendering speed is hundreds of times\nfaster in the Replica dataset. Moreover, the Photo-SLAM can run at real-time\nspeed using an embedded platform such as Jetson AGX Orin, showing the potential\nof robotics applications.\n","authors":["Huajian Huang","Longwei Li","Hui Cheng","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2311.16728v2.pdf","comment":"CVPR 2024. Code: https://github.com/HuajianUP/Photo-SLAM - Project\n Page: https://huajianup.github.io/research/Photo-SLAM/"},{"id":"http://arxiv.org/abs/2311.17389v2","updated":"2024-04-08T13:15:03Z","published":"2023-11-29T06:42:12Z","title":"360Loc: A Dataset and Benchmark for Omnidirectional Visual Localization\n with Cross-device Queries","summary":" Portable 360$^\\circ$ cameras are becoming a cheap and efficient tool to\nestablish large visual databases. By capturing omnidirectional views of a\nscene, these cameras could expedite building environment models that are\nessential for visual localization. However, such an advantage is often\noverlooked due to the lack of valuable datasets. This paper introduces a new\nbenchmark dataset, 360Loc, composed of 360$^\\circ$ images with ground truth\nposes for visual localization. We present a practical implementation of\n360$^\\circ$ mapping combining 360$^\\circ$ images with lidar data to generate\nthe ground truth 6DoF poses. 360Loc is the first dataset and benchmark that\nexplores the challenge of cross-device visual positioning, involving\n360$^\\circ$ reference frames, and query frames from pinhole, ultra-wide FoV\nfisheye, and 360$^\\circ$ cameras. We propose a virtual camera approach to\ngenerate lower-FoV query frames from 360$^\\circ$ images, which ensures a fair\ncomparison of performance among different query types in visual localization\ntasks. 
We also extend this virtual camera approach to feature matching-based\nand pose regression-based methods to alleviate the performance loss caused by\nthe cross-device domain gap, and evaluate its effectiveness against\nstate-of-the-art baselines. We demonstrate that omnidirectional visual\nlocalization is more robust in challenging large-scale scenes with symmetries\nand repetitive structures. These results provide new insights into 360-camera\nmapping and omnidirectional visual localization with cross-device queries.\n","authors":["Huajian Huang","Changkun Liu","Yipeng Zhu","Hui Cheng","Tristan Braud","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2311.17389v2.pdf","comment":"CVPR 2024. Project Page: https://huajianup.github.io/research/360Loc/"},{"id":"http://arxiv.org/abs/2404.05490v1","updated":"2024-04-08T13:11:57Z","published":"2024-04-08T13:11:57Z","title":"Two-Person Interaction Augmentation with Skeleton Priors","summary":" Close and continuous interaction with rich contacts is a crucial aspect of\nhuman activities (e.g. hugging, dancing) and of interest in many domains like\nactivity recognition, motion prediction, character animation, etc. However,\nacquiring such skeletal motion is challenging. While direct motion capture is\nexpensive and slow, motion editing/generation is also non-trivial, as complex\ncontact patterns with topological and geometric constraints have to be\nretained. To this end, we propose a new deep learning method for two-body\nskeletal interaction motion augmentation, which can generate variations of\ncontact-rich interactions with varying body sizes and proportions while\nretaining the key geometric/topological relations between two bodies. Our\nsystem can learn effectively from a relatively small amount of data and\ngeneralize to drastically different skeleton sizes. Through exhaustive\nevaluation and comparison, we show it can generate high-quality motions, has\nstrong generalizability and outperforms traditional optimization-based methods\nand alternative deep learning solutions.\n","authors":["Baiyi Li","Edmond S. L. Ho","Hubert P. H. Shum","He Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00226v2","updated":"2024-04-08T13:05:11Z","published":"2024-03-30T02:56:54Z","title":"Design as Desired: Utilizing Visual Question Answering for Multimodal\n Pre-training","summary":" Multimodal pre-training demonstrates its potential in the medical domain,\nwhich learns medical visual representations from paired medical reports.\nHowever, many pre-training tasks require extra annotations from clinicians, and\nmost of them fail to explicitly guide the model to learn the desired features\nof different pathologies. To the best of our knowledge, we are the first to\nutilize Visual Question Answering (VQA) for multimodal pre-training to guide\nthe framework focusing on targeted pathological features. In this work, we\nleverage descriptions in medical reports to design multi-granular\nquestion-answer pairs associated with different diseases, which assist the\nframework in pre-training without requiring extra annotations from experts. We\nalso propose a novel pre-training framework with a quasi-textual feature\ntransformer, a module designed to transform visual features into a\nquasi-textual space closer to the textual domain via a contrastive learning\nstrategy. This narrows the vision-language gap and facilitates modality\nalignment. 
Our framework is applied to four downstream tasks: report\ngeneration, classification, segmentation, and detection across five datasets.\nExtensive experiments demonstrate the superiority of our framework compared to\nother state-of-the-art methods. Our code will be released upon acceptance.\n","authors":["Tongkun Su","Jun Li","Xi Zhang","Haibo Jin","Hao Chen","Qiong Wang","Faqin Lv","Baoliang Zhao","Yin Hu"],"pdf_url":"https://arxiv.org/pdf/2404.00226v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01941v3","updated":"2024-04-08T12:51:35Z","published":"2024-04-02T13:33:31Z","title":"LPSNet: End-to-End Human Pose and Shape Estimation with Lensless Imaging","summary":" Human pose and shape (HPS) estimation with lensless imaging is not only\nbeneficial to privacy protection but also can be used in covert surveillance\nscenarios due to the small size and simple structure of this device. However,\nthis task presents significant challenges due to the inherent ambiguity of the\ncaptured measurements and the lack of effective methods for directly estimating human\npose and shape from lensless data. In this paper, we propose, to our knowledge, the first\nend-to-end framework to recover 3D human poses and shapes from lensless\nmeasurements. We specifically design a multi-scale lensless\nfeature decoder to decode the lensless measurements through the optically\nencoded mask for efficient feature extraction. We also propose a double-head\nauxiliary supervision mechanism to improve the estimation accuracy of human\nlimb ends. Besides, we establish a lensless imaging system and verify the\neffectiveness of our method on various datasets acquired by our lensless\nimaging system.\n","authors":["Haoyang Ge","Qiao Feng","Hailong Jia","Xiongzheng Li","Xiangjun Yin","You Zhou","Jingyu Yang","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2404.01941v3.pdf","comment":"Accepted to CVPR 2024. More results available at\n https://cic.tju.edu.cn/faculty/likun/projects/LPSNet"},{"id":"http://arxiv.org/abs/2306.14227v2","updated":"2024-04-08T12:50:51Z","published":"2023-06-25T12:15:44Z","title":"A ground-based dataset and a diffusion model for on-orbit low-light\n image enhancement","summary":" On-orbit service is important for maintaining the sustainability of the space\nenvironment. A space-based visible camera is an economical and lightweight sensor\nfor situation awareness during on-orbit service. However, it can be easily\naffected by the low illumination environment. Recently, deep learning has\nachieved remarkable success in image enhancement of natural images, but is seldom\napplied in space due to the data bottleneck. In this article, we first propose\na dataset of the Beidou Navigation Satellite for on-orbit low-light image\nenhancement (LLIE). In the automatic data collection scheme, we focus on\nreducing the domain gap and improving the diversity of the dataset. We collect\nhardware-in-the-loop images based on a robotic simulation testbed imitating\nspace lighting conditions. To evenly sample poses of different orientations and\ndistances without collision, a collision-free working space and pose-stratified\nsampling are proposed. Afterwards, a novel diffusion model is proposed. To\nenhance the image contrast without over-exposure and blurring details, we\ndesign a fused attention module to highlight the structure and dark regions. 
Finally,\nwe compare our method with previous methods using our dataset, which indicates\nthat our method has a better capacity in on-orbit LLIE.\n","authors":["Yiman Zhu","Lu Wang","Jingyi Yuan","Yu Guo"],"pdf_url":"https://arxiv.org/pdf/2306.14227v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05468v1","updated":"2024-04-08T12:46:39Z","published":"2024-04-08T12:46:39Z","title":"Mind-to-Image: Projecting Visual Mental Imagination of the Brain from\n fMRI","summary":" The reconstruction of images observed by subjects from fMRI data collected\nduring visual stimuli has made significant strides in the past decade, thanks\nto the availability of extensive fMRI datasets and advancements in generative\nmodels for image generation. However, the application of visual reconstruction\nhas remained limited. Reconstructing visual imagination presents a greater\nchallenge, with potentially revolutionary applications ranging from aiding\nindividuals with disabilities to verifying witness accounts in court. The\nprimary hurdles in this field are the absence of data collection protocols for\nvisual imagery and the lack of datasets on the subject. Traditionally,\nfMRI-to-image relies on data collected from subjects exposed to visual stimuli,\nwhich poses issues for generating visual imagery based on the difference of\nbrain activity between visual stimulation and visual imagery. For the first\ntime, we have compiled a substantial dataset (around 6h of scans) on visual\nimagery along with a proposed data collection protocol. We then train a\nmodified version of an fMRI-to-image model and demonstrate the feasibility of\nreconstructing images from two modes of imagination: from memory and from pure\nimagination. This marks an important step towards creating a technology that\nallow direct reconstruction of visual imagery.\n","authors":["Hugo Caselles-Dupré","Charles Mellerio","Paul Hérent","Alizée Lopez-Persem","Benoit Béranger","Mathieu Soularue","Pierre Fautrel","Gauthier Vernier","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2404.05468v1.pdf","comment":"Pre-print to be updated"},{"id":"http://arxiv.org/abs/2404.05466v1","updated":"2024-04-08T12:44:24Z","published":"2024-04-08T12:44:24Z","title":"Enhancing Lip Reading with Multi-Scale Video and Multi-Encoder","summary":" Automatic lip-reading (ALR) aims to automatically transcribe spoken content\nfrom a speaker's silent lip motion captured in video. Current mainstream\nlip-reading approaches only use a single visual encoder to model input videos\nof a single scale. In this paper, we propose to enhance lipreading by\nincorporating multi-scale video data and multi-encoder. Specifically, we first\npropose a novel multi-scale lip extraction algorithm based on the size of the\nspeaker's face and an enhanced ResNet3D visual front-end (VFE) to extract lip\nfeatures at different scales. For the multi-encoder, in addition to the\nmainstream Transformer and Conformer, we also incorporate the recently proposed\nBranchformer and EBranchformer as visual encoders. In the experiments, we\nexplore the influence of different video data scales and encoders on ALR system\nperformance and fuse the texts transcribed by all ALR systems using recognizer\noutput voting error reduction (ROVER). 
Finally, our proposed approach placed\nsecond in the ICME 2024 ChatCLR Challenge Task 2, with a 21.52% reduction in\ncharacter error rate (CER) compared to the official baseline on the evaluation\nset.\n","authors":["He Wang","Pengcheng Guo","Xucheng Wan","Huan Zhou","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2404.05466v1.pdf","comment":"6 pages, 3 figures, submitted to ICME2024 GC-ChatCLR"},{"id":"http://arxiv.org/abs/2404.05465v1","updated":"2024-04-08T12:43:32Z","published":"2024-04-08T12:43:32Z","title":"HAMMR: HierArchical MultiModal React agents for generic VQA","summary":" Combining Large Language Models (LLMs) with external specialized tools\n(LLMs+tools) is a recent paradigm to solve multimodal tasks such as Visual\nQuestion Answering (VQA). While this approach was demonstrated to work well\nwhen optimized and evaluated for each individual benchmark, in practice it is\ncrucial for the next generation of real-world AI systems to handle a broad\nrange of multimodal problems. Therefore we pose the VQA problem from a unified\nperspective and evaluate a single system on a varied suite of VQA tasks\nincluding counting, spatial reasoning, OCR-based reasoning, visual pointing,\nexternal knowledge, and more. In this setting, we demonstrate that naively\napplying the LLM+tools approach using the combined set of all tools leads to\npoor results. This motivates us to introduce HAMMR: HierArchical MultiModal\nReact. We start from a multimodal ReAct-based system and make it hierarchical\nby enabling our HAMMR agents to call upon other specialized agents. This\nenhances the compositionality of the LLM+tools approach, which we show to be\ncritical for obtaining high accuracy on generic VQA. Concretely, on our generic\nVQA suite, HAMMR outperforms the naive LLM+tools approach by 19.5%.\nAdditionally, HAMMR achieves state-of-the-art results on this task,\noutperforming the generic standalone PaLI-X VQA model by 5.0%.\n","authors":["Lluis Castrejon","Thomas Mensink","Howard Zhou","Vittorio Ferrari","Andre Araujo","Jasper Uijlings"],"pdf_url":"https://arxiv.org/pdf/2404.05465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05447v1","updated":"2024-04-08T12:29:46Z","published":"2024-04-08T12:29:46Z","title":"Pansharpening of PRISMA products for archaeological prospection","summary":" Hyperspectral data recorded from satellite platforms are often ill-suited for\ngeo-archaeological prospection due to low spatial resolution. The established\npotential of hyperspectral data from airborne sensors in identifying\narchaeological features has, on the other side, generated increased interest in\nenhancing hyperspectral data to achieve higher spatial resolution. This\nimprovement is crucial for detecting traces linked to sub-surface\ngeo-archaeological features and can make satellite hyperspectral acquisitions\nmore suitable for archaeological research. This research assesses the usability\nof pansharpened PRISMA satellite products in geo-archaeological prospections.\nThree pan-sharpening methods (GSA, MTF-GLP and HySure) are compared\nquantitatively and qualitatively and tested over the archaeological landscape\nof Aquileia (Italy). 
The results suggest that the application of pansharpening\ntechniques makes hyperspectral satellite imagery highly suitable, under certain\nconditions, to the identification of sub-surface archaeological features of\nsmall and large size.\n","authors":["Gregory Sech","Giulio Poggi","Marina Ljubenovic","Marco Fiorucci","Arianna Traviglia"],"pdf_url":"https://arxiv.org/pdf/2404.05447v1.pdf","comment":"Accepted to IEEE International Geoscience and Remote Sensing\n Symposium 2024 (IGARSS 2024) @IEEE copyright"},{"id":"http://arxiv.org/abs/2301.07409v2","updated":"2024-04-08T12:25:10Z","published":"2023-01-18T10:13:29Z","title":"Representing Noisy Image Without Denoising","summary":" A long-standing topic in artificial intelligence is the effective recognition\nof patterns from noisy images. In this regard, the recent data-driven paradigm\nconsiders 1) improving the representation robustness by adding noisy samples in\ntraining phase (i.e., data augmentation) or 2) pre-processing the noisy image\nby learning to solve the inverse problem (i.e., image denoising). However, such\nmethods generally exhibit inefficient process and unstable result, limiting\ntheir practical applications. In this paper, we explore a non-learning paradigm\nthat aims to derive robust representation directly from noisy images, without\nthe denoising as pre-processing. Here, the noise-robust representation is\ndesigned as Fractional-order Moments in Radon space (FMR), with also beneficial\nproperties of orthogonality and rotation invariance. Unlike earlier\ninteger-order methods, our work is a more generic design taking such classical\nmethods as special cases, and the introduced fractional-order parameter offers\ntime-frequency analysis capability that is not available in classical methods.\nFormally, both implicit and explicit paths for constructing the FMR are\ndiscussed in detail. Extensive simulation experiments and an image security\napplication are provided to demonstrate the uniqueness and usefulness of our\nFMR, especially for noise robustness, rotation invariance, and time-frequency\ndiscriminability.\n","authors":["Shuren Qi","Yushu Zhang","Chao Wang","Tao Xiang","Xiaochun Cao","Yong Xiang"],"pdf_url":"https://arxiv.org/pdf/2301.07409v2.pdf","comment":"Accepted by IEEE Transactions on Pattern Analysis and Machine\n Intelligence, 2024"},{"id":"http://arxiv.org/abs/2404.05439v1","updated":"2024-04-08T12:18:01Z","published":"2024-04-08T12:18:01Z","title":"Action-conditioned video data improves predictability","summary":" Long-term video generation and prediction remain challenging tasks in\ncomputer vision, particularly in partially observable scenarios where cameras\nare mounted on moving platforms. The interaction between observed image frames\nand the motion of the recording agent introduces additional complexities. To\naddress these issues, we introduce the Action-Conditioned Video Generation\n(ACVG) framework, a novel approach that investigates the relationship between\nactions and generated image frames through a deep dual Generator-Actor\narchitecture. ACVG generates video sequences conditioned on the actions of\nrobots, enabling exploration and analysis of how vision and action mutually\ninfluence one another in dynamic environments. 
We evaluate the framework's\neffectiveness on an indoor robot motion dataset which consists of sequences of\nimage frames along with the sequences of actions taken by the robotic agent,\nconducting a comprehensive empirical study comparing ACVG to other\nstate-of-the-art frameworks along with a detailed ablation study.\n","authors":["Meenakshi Sarkar","Debasish Ghose"],"pdf_url":"https://arxiv.org/pdf/2404.05439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05297v2","updated":"2024-04-08T12:17:24Z","published":"2024-03-08T13:24:46Z","title":"PEEB: Part-based Image Classifiers with an Explainable and Editable\n Language Bottleneck","summary":" CLIP-based classifiers rely on the prompt containing a {class name} that is\nknown to the text encoder. Therefore, they perform poorly on new classes or the\nclasses whose names rarely appear on the Internet (e.g., scientific names of\nbirds). For fine-grained classification, we propose PEEB - an explainable and\neditable classifier to (1) express the class name into a set of text\ndescriptors that describe the visual parts of that class; and (2) match the\nembeddings of the detected parts to their textual descriptors in each class to\ncompute a logit score for classification. In a zero-shot setting where the\nclass names are unknown, PEEB outperforms CLIP by a huge margin (~10x in top-1\naccuracy). Compared to part-based classifiers, PEEB is not only the\nstate-of-the-art (SOTA) on the supervised-learning setting (88.80% and 92.20%\naccuracy on CUB-200 and Dogs-120, respectively) but also the first to enable\nusers to edit the text descriptors to form a new classifier without any\nre-training. Compared to concept bottleneck models, PEEB is also the SOTA in\nboth zero-shot and supervised-learning settings.\n","authors":["Thang M. Pham","Peijie Chen","Tin Nguyen","Seunghyun Yoon","Trung Bui","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2403.05297v2.pdf","comment":"Findings of NAACL 2024 (long paper)"},{"id":"http://arxiv.org/abs/2305.10874v3","updated":"2024-04-08T12:17:01Z","published":"2023-05-18T11:06:15Z","title":"Swap Attention in Spatiotemporal Diffusions for Text-to-Video Generation","summary":" With the explosive popularity of AI-generated content (AIGC), video\ngeneration has recently received a lot of attention. Generating videos guided\nby text instructions poses significant challenges, such as modeling the complex\nrelationship between space and time, and the lack of large-scale text-video\npaired data. Existing text-video datasets suffer from limitations in both\ncontent quality and scale, or they are not open-source, rendering them\ninaccessible for study and use. For model design, previous approaches extend\npretrained text-to-image generation models by adding temporal 1D\nconvolution/attention modules for video generation. However, these approaches\noverlook the importance of jointly modeling space and time, inevitably leading\nto temporal distortions and misalignment between texts and videos. In this\npaper, we propose a novel approach that strengthens the interaction between\nspatial and temporal perceptions. In particular, we utilize a swapped\ncross-attention mechanism in 3D windows that alternates the ``query'' role\nbetween spatial and temporal blocks, enabling mutual reinforcement for each\nother. Moreover, to fully unlock model capabilities for high-quality video\ngeneration and promote the development of the field, we curate a large-scale\nand open-source video dataset called HD-VG-130M. 
This dataset comprises 130\nmillion text-video pairs from the open-domain, ensuring high-definition,\nwidescreen and watermark-free characters. A smaller-scale yet more meticulously\ncleaned subset further enhances the data quality, aiding models in achieving\nsuperior performance. Experimental quantitative and qualitative results\ndemonstrate the superiority of our approach in terms of per-frame quality,\ntemporal correlation, and text-video alignment, with clear margins.\n","authors":["Wenjing Wang","Huan Yang","Zixi Tuo","Huiguo He","Junchen Zhu","Jianlong Fu","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2305.10874v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05426v1","updated":"2024-04-08T11:54:49Z","published":"2024-04-08T11:54:49Z","title":"Test-Time Zero-Shot Temporal Action Localization","summary":" Zero-Shot Temporal Action Localization (ZS-TAL) seeks to identify and locate\nactions in untrimmed videos unseen during training. Existing ZS-TAL methods\ninvolve fine-tuning a model on a large amount of annotated training data. While\neffective, training-based ZS-TAL approaches assume the availability of labeled\ndata for supervised learning, which can be impractical in some applications.\nFurthermore, the training process naturally induces a domain bias into the\nlearned model, which may adversely affect the model's generalization ability to\narbitrary videos. These considerations prompt us to approach the ZS-TAL problem\nfrom a radically novel perspective, relaxing the requirement for training data.\nTo this aim, we introduce a novel method that performs Test-Time adaptation for\nTemporal Action Localization (T3AL). In a nutshell, T3AL adapts a pre-trained\nVision and Language Model (VLM). T3AL operates in three steps. First, a\nvideo-level pseudo-label of the action category is computed by aggregating\ninformation from the entire video. Then, action localization is performed\nadopting a novel procedure inspired by self-supervised learning. Finally,\nframe-level textual descriptions extracted with a state-of-the-art captioning\nmodel are employed for refining the action region proposals. We validate the\neffectiveness of T3AL by conducting experiments on the THUMOS14 and the\nActivityNet-v1.3 datasets. Our results demonstrate that T3AL significantly\noutperforms zero-shot baselines based on state-of-the-art VLMs, confirming the\nbenefit of a test-time adaptation approach.\n","authors":["Benedetta Liberatori","Alessandro Conti","Paolo Rota","Yiming Wang","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2404.05426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05937v3","updated":"2024-04-08T11:46:07Z","published":"2024-02-08T18:59:53Z","title":"InstaGen: Enhancing Object Detection by Training on Synthetic Dataset","summary":" In this paper, we present a novel paradigm to enhance the ability of object\ndetector, e.g., expanding categories or improving detection performance, by\ntraining on synthetic dataset generated from diffusion models. Specifically, we\nintegrate an instance-level grounding head into a pre-trained, generative\ndiffusion model, to augment it with the ability of localising instances in the\ngenerated images. The grounding head is trained to align the text embedding of\ncategory names with the regional visual feature of the diffusion model, using\nsupervision from an off-the-shelf object detector, and a novel self-training\nscheme on (novel) categories not covered by the detector. 
We conduct thorough\nexperiments to show that, this enhanced version of diffusion model, termed as\nInstaGen, can serve as a data synthesizer, to enhance object detectors by\ntraining on its generated samples, demonstrating superior performance over\nexisting state-of-the-art methods in open-vocabulary (+4.5 AP) and data-sparse\n(+1.2 to 5.2 AP) scenarios. Project page with code:\nhttps://fcjian.github.io/InstaGen.\n","authors":["Chengjian Feng","Yujie Zhong","Zequn Jie","Weidi Xie","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2402.05937v3.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.05414v1","updated":"2024-04-08T11:32:26Z","published":"2024-04-08T11:32:26Z","title":"Two Hands Are Better Than One: Resolving Hand to Hand Intersections via\n Occupancy Networks","summary":" 3D hand pose estimation from images has seen considerable interest from the\nliterature, with new methods improving overall 3D accuracy. One current\nchallenge is to address hand-to-hand interaction where self-occlusions and\nfinger articulation pose a significant problem to estimation. Little work has\napplied physical constraints that minimize the hand intersections that occur as\na result of noisy estimation. This work addresses the intersection of hands by\nexploiting an occupancy network that represents the hand's volume as a\ncontinuous manifold. This allows us to model the probability distribution of\npoints being inside a hand. We designed an intersection loss function to\nminimize the likelihood of hand-to-point intersections. Moreover, we propose a\nnew hand mesh parameterization that is superior to the commonly used MANO model\nin many respects including lower mesh complexity, underlying 3D skeleton\nextraction, watertightness, etc. On the benchmark InterHand2.6M dataset, the\nmodels trained using our intersection loss achieve better results than the\nstate-of-the-art by significantly decreasing the number of hand intersections\nwhile lowering the mean per-joint positional error. Additionally, we\ndemonstrate superior performance for 3D hand uplift on Re:InterHand and SMILE\ndatasets and show reduced hand-to-hand intersections for complex domains such\nas sign-language pose estimation.\n","authors":["Maksym Ivashechkin","Oscar Mendez","Richard Bowden"],"pdf_url":"https://arxiv.org/pdf/2404.05414v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06704v3","updated":"2024-04-08T11:24:30Z","published":"2023-12-10T11:45:45Z","title":"SIFU: Side-view Conditioned Implicit Function for Real-world Usable\n Clothed Human Reconstruction","summary":" Creating high-quality 3D models of clothed humans from single images for\nreal-world applications is crucial. Despite recent advancements, accurately\nreconstructing humans in complex poses or with loose clothing from in-the-wild\nimages, along with predicting textures for unseen areas, remains a significant\nchallenge. A key limitation of previous methods is their insufficient prior\nguidance in transitioning from 2D to 3D and in texture prediction. In response,\nwe introduce SIFU (Side-view Conditioned Implicit Function for Real-world\nUsable Clothed Human Reconstruction), a novel approach combining a Side-view\nDecoupling Transformer with a 3D Consistent Texture Refinement pipeline.SIFU\nemploys a cross-attention mechanism within the transformer, using SMPL-X\nnormals as queries to effectively decouple side-view features in the process of\nmapping 2D features to 3D. 
This method not only improves the precision of the\n3D models but also their robustness, especially when SMPL-X estimates are not\nperfect. Our texture refinement process leverages text-to-image diffusion-based\nprior to generate realistic and consistent textures for invisible views.\nThrough extensive experiments, SIFU surpasses SOTA methods in both geometry and\ntexture reconstruction, showcasing enhanced robustness in complex scenarios and\nachieving an unprecedented Chamfer and P2S measurement. Our approach extends to\npractical applications such as 3D printing and scene building, demonstrating\nits broad utility in real-world scenarios. Project page\nhttps://river-zhang.github.io/SIFU-projectpage/ .\n","authors":["Zechuan Zhang","Zongxin Yang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2312.06704v3.pdf","comment":"Accepted by CVPR 2024; Project page\n https://river-zhang.github.io/SIFU-projectpage/"},{"id":"http://arxiv.org/abs/2303.13514v3","updated":"2024-04-08T11:22:05Z","published":"2023-03-23T17:59:35Z","title":"SAOR: Single-View Articulated Object Reconstruction","summary":" We introduce SAOR, a novel approach for estimating the 3D shape, texture, and\nviewpoint of an articulated object from a single image captured in the wild.\nUnlike prior approaches that rely on pre-defined category-specific 3D templates\nor tailored 3D skeletons, SAOR learns to articulate shapes from single-view\nimage collections with a skeleton-free part-based model without requiring any\n3D object shape priors. To prevent ill-posed solutions, we propose a\ncross-instance consistency loss that exploits disentangled object shape\ndeformation and articulation. This is helped by a new silhouette-based sampling\nmechanism to enhance viewpoint diversity during training. Our method only\nrequires estimated object silhouettes and relative depth maps from\noff-the-shelf pre-trained networks during training. At inference time, given a\nsingle-view image, it efficiently outputs an explicit mesh representation. We\nobtain improved qualitative and quantitative results on challenging quadruped\nanimals compared to relevant existing work.\n","authors":["Mehmet Aygün","Oisin Mac Aodha"],"pdf_url":"https://arxiv.org/pdf/2303.13514v3.pdf","comment":"Accepted to CVPR 2024, website: https://mehmetaygun.github.io/saor"},{"id":"http://arxiv.org/abs/2404.05409v1","updated":"2024-04-08T11:20:28Z","published":"2024-04-08T11:20:28Z","title":"Anatomical Conditioning for Contrastive Unpaired Image-to-Image\n Translation of Optical Coherence Tomography Images","summary":" For a unified analysis of medical images from different modalities, data\nharmonization using image-to-image (I2I) translation is desired. We study this\nproblem employing an optical coherence tomography (OCT) data set of\nSpectralis-OCT and Home-OCT images. I2I translation is challenging because the\nimages are unpaired, and a bijective mapping does not exist due to the\ninformation discrepancy between both domains. This problem has been addressed\nby the Contrastive Learning for Unpaired I2I Translation (CUT) approach, but it\nreduces semantic consistency. To restore the semantic consistency, we support\nthe style decoder using an additional segmentation decoder. Our approach\nincreases the similarity between the style-translated images and the target\ndistribution. Importantly, we improve the segmentation of biomarkers in\nHome-OCT images in an unsupervised domain adaptation scenario. 
Our data\nharmonization approach provides potential for the monitoring of diseases, e.g.,\nage related macular disease, using different OCT devices.\n","authors":["Marc S. Seibel","Hristina Uzunova","Timo Kepp","Heinz Handels"],"pdf_url":"https://arxiv.org/pdf/2404.05409v1.pdf","comment":"Accepted at ISBI 2024"},{"id":"http://arxiv.org/abs/2311.10605v2","updated":"2024-04-08T10:59:06Z","published":"2023-11-17T16:01:06Z","title":"CA-Jaccard: Camera-aware Jaccard Distance for Person Re-identification","summary":" Person re-identification (re-ID) is a challenging task that aims to learn\ndiscriminative features for person retrieval. In person re-ID, Jaccard distance\nis a widely used distance metric, especially in re-ranking and clustering\nscenarios. However, we discover that camera variation has a significant\nnegative impact on the reliability of Jaccard distance. In particular, Jaccard\ndistance calculates the distance based on the overlap of relevant neighbors.\nDue to camera variation, intra-camera samples dominate the relevant neighbors,\nwhich reduces the reliability of the neighbors by introducing intra-camera\nnegative samples and excluding inter-camera positive samples. To overcome this\nproblem, we propose a novel camera-aware Jaccard (CA-Jaccard) distance that\nleverages camera information to enhance the reliability of Jaccard distance.\nSpecifically, we design camera-aware k-reciprocal nearest neighbors (CKRNNs) to\nfind k-reciprocal nearest neighbors on the intra-camera and inter-camera\nranking lists, which improves the reliability of relevant neighbors and\nguarantees the contribution of inter-camera samples in the overlap. Moreover,\nwe propose a camera-aware local query expansion (CLQE) to mine reliable samples\nin relevant neighbors by exploiting camera variation as a strong constraint and\nassign these samples higher weights in overlap, further improving the\nreliability. Our CA-Jaccard distance is simple yet effective and can serve as a\ngeneral distance metric for person re-ID methods with high reliability and low\ncomputational cost. Extensive experiments demonstrate the effectiveness of our\nmethod.\n","authors":["Yiyu Chen","Zheyi Fan","Zhaoru Chen","Yixuan Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.10605v2.pdf","comment":"This paper is accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2309.04190v4","updated":"2024-04-08T10:57:42Z","published":"2023-09-08T08:03:42Z","title":"SegmentAnything helps microscopy images based automatic and quantitative\n organoid detection and analysis","summary":" Organoids are self-organized 3D cell clusters that closely mimic the\narchitecture and function of in vivo tissues and organs. Quantification of\norganoid morphology helps in studying organ development, drug discovery, and\ntoxicity assessment. Recent microscopy techniques provide a potent tool to\nacquire organoid morphology features, but manual image analysis remains a labor\nand time-intensive process. Thus, this paper proposes a comprehensive pipeline\nfor microscopy analysis that leverages the SegmentAnything to precisely\ndemarcate individual organoids. Additionally, we introduce a set of\nmorphological properties, including perimeter, area, radius, non-smoothness,\nand non-circularity, allowing researchers to analyze the organoid structures\nquantitatively and automatically. To validate the effectiveness of our\napproach, we conducted tests on bright-field images of human induced\npluripotent stem cells (iPSCs) derived neural-epithelial (NE) organoids. 
The\nresults obtained from our automatic pipeline closely align with manual organoid\ndetection and measurement, showcasing the capability of our proposed method in\naccelerating organoids morphology analysis.\n","authors":["Xiaodan Xing","Chunling Tang","Yunzhe Guo","Nicholas Kurniawan","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2309.04190v4.pdf","comment":"Replace Figure 4 with the correct version. The original version is\n wrong due to a column name mismatch"},{"id":"http://arxiv.org/abs/2404.05393v1","updated":"2024-04-08T10:52:29Z","published":"2024-04-08T10:52:29Z","title":"PAT: Pixel-wise Adaptive Training for Long-tailed Segmentation","summary":" Beyond class frequency, we recognize the impact of class-wise relationships\namong various class-specific predictions and the imbalance in label masks on\nlong-tailed segmentation learning. To address these challenges, we propose an\ninnovative Pixel-wise Adaptive Training (PAT) technique tailored for\nlong-tailed segmentation. PAT has two key features: 1) class-wise gradient\nmagnitude homogenization, and 2) pixel-wise class-specific loss adaptation\n(PCLA). First, the class-wise gradient magnitude homogenization helps alleviate\nthe imbalance among label masks by ensuring equal consideration of the\nclass-wise impact on model updates. Second, PCLA tackles the detrimental impact\nof both rare classes within the long-tailed distribution and inaccurate\npredictions from previous training stages by encouraging learning classes with\nlow prediction confidence and guarding against forgetting classes with high\nconfidence. This combined approach fosters robust learning while preventing the\nmodel from forgetting previously learned knowledge. PAT exhibits significant\nperformance improvements, surpassing the current state-of-the-art by 2.2% in\nthe NyU dataset. Moreover, it enhances overall pixel-wise accuracy by 2.85% and\nintersection over union value by 2.07%, with a particularly notable declination\nof 0.39% in detecting rare classes compared to Balance Logits Variation, as\ndemonstrated on the three popular datasets, i.e., OxfordPetIII, CityScape, and\nNYU.\n","authors":["Khoi Do","Duong Nguyen","Nguyen H. Tran","Viet Dung Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05392v1","updated":"2024-04-08T10:51:29Z","published":"2024-04-08T10:51:29Z","title":"T-DEED: Temporal-Discriminability Enhancer Encoder-Decoder for Precise\n Event Spotting in Sports Videos","summary":" In this paper, we introduce T-DEED, a Temporal-Discriminability Enhancer\nEncoder-Decoder for Precise Event Spotting in sports videos. T-DEED addresses\nmultiple challenges in the task, including the need for discriminability among\nframe representations, high output temporal resolution to maintain prediction\nprecision, and the necessity to capture information at different temporal\nscales to handle events with varying dynamics. It tackles these challenges\nthrough its specifically designed architecture, featuring an encoder-decoder\nfor leveraging multiple temporal scales and achieving high output temporal\nresolution, along with temporal modules designed to increase token\ndiscriminability. Leveraging these characteristics, T-DEED achieves SOTA\nperformance on the FigureSkating and FineDiving datasets.\n","authors":["Artur Xarles","Sergio Escalera","Thomas B. 
Moeslund","Albert Clapés"],"pdf_url":"https://arxiv.org/pdf/2404.05392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15288v2","updated":"2024-04-08T10:48:22Z","published":"2023-12-23T16:05:47Z","title":"Understanding normalization in contrastive representation learning and\n out-of-distribution detection","summary":" Contrastive representation learning has emerged as an outstanding approach\nfor anomaly detection. In this work, we explore the $\\ell_2$-norm of\ncontrastive features and its applications in out-of-distribution detection. We\npropose a simple method based on contrastive learning, which incorporates\nout-of-distribution data by discriminating against normal samples in the\ncontrastive layer space. Our approach can be applied flexibly as an outlier\nexposure (OE) approach, where the out-of-distribution data is a huge collective\nof random images, or as a fully self-supervised learning approach, where the\nout-of-distribution data is self-generated by applying distribution-shifting\ntransformations. The ability to incorporate additional out-of-distribution\nsamples enables a feasible solution for datasets where AD methods based on\ncontrastive learning generally underperform, such as aerial images or\nmicroscopy images. Furthermore, the high-quality features learned through\ncontrastive learning consistently enhance performance in OE scenarios, even\nwhen the available out-of-distribution dataset is not diverse enough. Our\nextensive experiments demonstrate the superiority of our proposed method under\nvarious scenarios, including unimodal and multimodal settings, with various\nimage datasets.\n","authors":["Tai Le-Gia","Jaehyun Ahn"],"pdf_url":"https://arxiv.org/pdf/2312.15288v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05384v1","updated":"2024-04-08T10:45:29Z","published":"2024-04-08T10:45:29Z","title":"Rethinking the Spatial Inconsistency in Classifier-Free Diffusion\n Guidance","summary":" Classifier-Free Guidance (CFG) has been widely used in text-to-image\ndiffusion models, where the CFG scale is introduced to control the strength of\ntext guidance on the whole image space. However, we argue that a global CFG\nscale results in spatial inconsistency on varying semantic strengths and\nsuboptimal image quality. To address this problem, we present a novel approach,\nSemantic-aware Classifier-Free Guidance (S-CFG), to customize the guidance\ndegrees for different semantic units in text-to-image diffusion models.\nSpecifically, we first design a training-free semantic segmentation method to\npartition the latent image into relatively independent semantic regions at each\ndenoising step. In particular, the cross-attention map in the denoising U-net\nbackbone is renormalized for assigning each patch to the corresponding token,\nwhile the self-attention map is used to complete the semantic regions. Then, to\nbalance the amplification of diverse semantic units, we adaptively adjust the\nCFG scales across different semantic regions to rescale the text guidance\ndegrees into a uniform level. Finally, extensive experiments demonstrate the\nsuperiority of S-CFG over the original CFG strategy on various text-to-image\ndiffusion models, without requiring any extra training cost. 
Our code is\navailable at https://github.com/SmilesDZgk/S-CFG.\n","authors":["Dazhong Shen","Guanglu Song","Zeyue Xue","Fu-Yun Wang","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.05384v1.pdf","comment":"accepted by CVPR-2024"},{"id":"http://arxiv.org/abs/2305.15873v2","updated":"2024-04-08T10:28:38Z","published":"2023-05-25T09:09:32Z","title":"Confronting Ambiguity in 6D Object Pose Estimation via Score-Based\n Diffusion on SE(3)","summary":" Addressing pose ambiguity in 6D object pose estimation from single RGB images\npresents a significant challenge, particularly due to object symmetries or\nocclusions. In response, we introduce a novel score-based diffusion method\napplied to the $SE(3)$ group, marking the first application of diffusion models\nto $SE(3)$ within the image domain, specifically tailored for pose estimation\ntasks. Extensive evaluations demonstrate the method's efficacy in handling pose\nambiguity, mitigating perspective-induced ambiguity, and showcasing the\nrobustness of our surrogate Stein score formulation on $SE(3)$. This\nformulation not only improves the convergence of the denoising process but also\nenhances computational efficiency. Thus, we pioneer a promising strategy for 6D\nobject pose estimation.\n","authors":["Tsu-Ching Hsiao","Hao-Wei Chen","Hsuan-Kung Yang","Chun-Yi Lee"],"pdf_url":"https://arxiv.org/pdf/2305.15873v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.05366v1","updated":"2024-04-08T10:05:24Z","published":"2024-04-08T10:05:24Z","title":"CDAD-Net: Bridging Domain Gaps in Generalized Category Discovery","summary":" In Generalized Category Discovery (GCD), we cluster unlabeled samples of\nknown and novel classes, leveraging a training dataset of known classes. A\nsalient challenge arises due to domain shifts between these datasets. To\naddress this, we present a novel setting: Across Domain Generalized Category\nDiscovery (AD-GCD) and bring forth CDAD-NET (Class Discoverer Across Domains)\nas a remedy. CDAD-NET is architected to synchronize potential known class\nsamples across both the labeled (source) and unlabeled (target) datasets, while\nemphasizing the distinct categorization of the target data. To facilitate this,\nwe propose an entropy-driven adversarial learning strategy that accounts for\nthe distance distributions of target samples relative to source-domain class\nprototypes. In parallel, the discriminative nature of the shared space is upheld\nthrough a fusion of three metric learning objectives. In the source domain, our\nfocus is on refining the proximity between samples and their affiliated class\nprototypes, while in the target domain, we integrate a neighborhood-centric\ncontrastive learning mechanism, enriched with an adept neighbors-mining\napproach. 
To further accentuate the nuanced feature interrelation among\nsemantically aligned images, we champion the concept of conditional image\ninpainting, underscoring the premise that semantically analogous images prove\nmore efficacious to the task than their disjointed counterparts.\nExperimentally, CDAD-NET eclipses existing literature with a performance\nincrement of 8-15% on three AD-GCD benchmarks we present.\n","authors":["Sai Bhargav Rongali","Sarthak Mehrotra","Ankit Jha","Mohamad Hassan N C","Shirsha Bose","Tanisha Gupta","Mainak Singha","Biplab Banerjee"],"pdf_url":"https://arxiv.org/pdf/2404.05366v1.pdf","comment":"Accepted in L3D-IVU, CVPR Workshop, 2024"},{"id":"http://arxiv.org/abs/2308.13888v3","updated":"2024-04-08T10:04:29Z","published":"2023-08-26T14:12:19Z","title":"Neural Implicit Morphing of Face Images","summary":" Face morphing is a problem in computer graphics with numerous artistic and\nforensic applications. It is challenging due to variations in pose, lighting,\ngender, and ethnicity. This task consists of a warping for feature alignment\nand a blending for a seamless transition between the warped images. We propose\nto leverage coord-based neural networks to represent such warpings and\nblendings of face images. During training, we exploit the smoothness and\nflexibility of such networks by combining energy functionals employed in\nclassical approaches without discretizations. Additionally, our method is\ntime-dependent, allowing a continuous warping/blending of the images. During\nmorphing inference, we need both direct and inverse transformations of the\ntime-dependent warping. The first (second) is responsible for warping the\ntarget (source) image into the source (target) image. Our neural warping stores\nthose maps in a single network dismissing the need for inverting them. The\nresults of our experiments indicate that our method is competitive with both\nclassical and generative models under the lens of image quality and\nface-morphing detectors. Aesthetically, the resulting images present a seamless\nblending of diverse faces not yet usual in the literature.\n","authors":["Guilherme Schardong","Tiago Novello","Hallison Paz","Iurii Medvedev","Vinícius da Silva","Luiz Velho","Nuno Gonçalves"],"pdf_url":"https://arxiv.org/pdf/2308.13888v3.pdf","comment":"14 pages, 20 figures, accepted for CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05362v1","updated":"2024-04-08T09:54:28Z","published":"2024-04-08T09:54:28Z","title":"Multi-head Attention-based Deep Multiple Instance Learning","summary":" This paper introduces MAD-MIL, a Multi-head Attention-based Deep Multiple\nInstance Learning model, designed for weakly supervised Whole Slide Images\n(WSIs) classification in digital pathology. Inspired by the multi-head\nattention mechanism of the Transformer, MAD-MIL simplifies model complexity\nwhile achieving competitive results against advanced models like CLAM and\nDS-MIL. Evaluated on the MNIST-BAGS and public datasets, including TUPAC16,\nTCGA BRCA, TCGA LUNG, and TCGA KIDNEY, MAD-MIL consistently outperforms ABMIL.\nThis demonstrates enhanced information diversity, interpretability, and\nefficiency in slide representation. The model's effectiveness, coupled with\nfewer trainable parameters and lower computational complexity makes it a\npromising solution for automated pathology workflows. 
Our code is available at\nhttps://github.com/tueimage/MAD-MIL.\n","authors":["Hassan Keshvarikhojasteh","Josien Pluim","Mitko Veta"],"pdf_url":"https://arxiv.org/pdf/2404.05362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.01585v3","updated":"2024-04-08T09:53:27Z","published":"2023-02-03T07:35:53Z","title":"SegForestNet: Spatial-Partitioning-Based Aerial Image Segmentation","summary":" Aerial image segmentation is the basis for applications such as automatically\ncreating maps or tracking deforestation. In true orthophotos, which are often\nused in these applications, many objects and regions can be approximated well\nby polygons. However, this fact is rarely exploited by state-of-the-art\nsemantic segmentation models. Instead, most models allow unnecessary degrees of\nfreedom in their predictions by allowing arbitrary region shapes. We therefore\npresent a refinement of our deep learning model which predicts binary space\npartitioning trees, an efficient polygon representation. The refinements\ninclude a new feature decoder architecture and a new differentiable BSP tree\nrenderer which both avoid vanishing gradients. Additionally, we designed a\nnovel loss function specifically designed to improve the spatial partitioning\ndefined by the predicted trees. Furthermore, our expanded model can predict\nmultiple trees at once and thus can predict class-specific segmentations. As an\nadditional contribution, we investigate the impact of a non-optimal training\nprocess in comparison to an optimized training process. While model\narchitectures optimized for aerial images, such as PFNet or our own model, show\nan advantage under non-optimal conditions, this advantage disappears under\noptimal training conditions. Despite this observation, our model still makes\nbetter predictions for small rectangular objects, e.g., cars.\n","authors":["Daniel Gritzner","Jörn Ostermann"],"pdf_url":"https://arxiv.org/pdf/2302.01585v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05357v1","updated":"2024-04-08T09:48:02Z","published":"2024-04-08T09:48:02Z","title":"CNN-based Game State Detection for a Foosball Table","summary":" The automation of games using Deep Reinforcement Learning Strategies (DRL) is\na well-known challenge in AI research. While for feature extraction in a video\ngame typically the whole image is used, this is hardly practical for many real\nworld games. Instead, using a smaller game state reducing the dimension of the\nparameter space to include essential parameters only seems to be a promising\napproach. In the game of Foosball, a compact and comprehensive game state\ndescription consists of the positional shifts and rotations of the figures and\nthe position of the ball over time. In particular, velocities and accelerations\ncan be derived from consecutive time samples of the game state. In this paper,\na figure detection system to determine the game state in Foosball is presented.\nWe capture a dataset containing the rotations of the rods which were measured\nusing accelerometers and the positional shifts were derived using traditional\nComputer Vision techniques (in a laboratory setting). This dataset is utilized\nto train Convolutional Neural Network (CNN) based end-to-end regression models\nto predict the rotations and shifts of each rod. We present an evaluation of\nour system using different state-of-the-art CNNs as base architectures for the\nregression model. We show that our system is able to predict the game state\nwith high accuracy. 
By providing data for both black and white teams, the\npresented system is intended to provide the required data for future\ndevelopments of Imitation Learning techniques w.r.t. to observing human\nplayers.\n","authors":["David Hagens","Jan Knaup","Elke Hergenröther","Andreas Weinmann"],"pdf_url":"https://arxiv.org/pdf/2404.05357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05348v1","updated":"2024-04-08T09:33:40Z","published":"2024-04-08T09:33:40Z","title":"Iterative Refinement Strategy for Automated Data Labeling: Facial\n Landmark Diagnosis in Medical Imaging","summary":" Automated data labeling techniques are crucial for accelerating the\ndevelopment of deep learning models, particularly in complex medical imaging\napplications. However, ensuring accuracy and efficiency remains challenging.\nThis paper presents iterative refinement strategies for automated data labeling\nin facial landmark diagnosis to enhance accuracy and efficiency for deep\nlearning models in medical applications, including dermatology, plastic\nsurgery, and ophthalmology. Leveraging feedback mechanisms and advanced\nalgorithms, our approach iteratively refines initial labels, reducing reliance\non manual intervention while improving label quality. Through empirical\nevaluation and case studies, we demonstrate the effectiveness of our proposed\nstrategies in deep learning tasks across medical imaging domains. Our results\nhighlight the importance of iterative refinement in automated data labeling to\nenhance the capabilities of deep learning systems in medical imaging\napplications.\n","authors":["Yu-Hsi Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05348v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13263v2","updated":"2024-04-08T09:31:33Z","published":"2023-06-23T02:19:52Z","title":"Synthetic data shuffling accelerates the convergence of federated\n learning under data heterogeneity","summary":" In federated learning, data heterogeneity is a critical challenge. A\nstraightforward solution is to shuffle the clients' data to homogenize the\ndistribution. However, this may violate data access rights, and how and when\nshuffling can accelerate the convergence of a federated optimization algorithm\nis not theoretically well understood. In this paper, we establish a precise and\nquantifiable correspondence between data heterogeneity and parameters in the\nconvergence rate when a fraction of data is shuffled across clients. We prove\nthat shuffling can quadratically reduce the gradient dissimilarity with respect\nto the shuffling percentage, accelerating convergence. Inspired by the theory,\nwe propose a practical approach that addresses the data access rights issue by\nshuffling locally generated synthetic data. The experimental results show that\nshuffling synthetic data improves the performance of multiple existing\nfederated learning algorithms by a large margin.\n","authors":["Bo Li","Yasin Esfandiari","Mikkel N. Schmidt","Tommy S. Alstrøm","Sebastian U. 
Stich"],"pdf_url":"https://arxiv.org/pdf/2306.13263v2.pdf","comment":"Accepted at TMLR"},{"id":"http://arxiv.org/abs/2404.05341v1","updated":"2024-04-08T09:27:42Z","published":"2024-04-08T09:27:42Z","title":"Comparative Analysis of Image Enhancement Techniques for Brain Tumor\n Segmentation: Contrast, Histogram, and Hybrid Approaches","summary":" This study systematically investigates the impact of image enhancement\ntechniques on Convolutional Neural Network (CNN)-based Brain Tumor\nSegmentation, focusing on Histogram Equalization (HE), Contrast Limited\nAdaptive Histogram Equalization (CLAHE), and their hybrid variations. Employing\nthe U-Net architecture on a dataset of 3064 Brain MRI images, the research\ndelves into preprocessing steps, including resizing and enhancement, to\noptimize segmentation accuracy. A detailed analysis of the CNN-based U-Net\narchitecture, training, and validation processes is provided. The comparative\nanalysis, utilizing metrics such as Accuracy, Loss, MSE, IoU, and DSC, reveals\nthat the hybrid approach CLAHE-HE consistently outperforms others. Results\nhighlight its superior accuracy (0.9982, 0.9939, 0.9936 for training, testing,\nand validation, respectively) and robust segmentation overlap, with Jaccard\nvalues of 0.9862, 0.9847, and 0.9864, and Dice values of 0.993, 0.9923, and\n0.9932 for the same phases, emphasizing its potential in neuro-oncological\napplications. The study concludes with a call for refinement in segmentation\nmethodologies to further enhance diagnostic precision and treatment planning in\nneuro-oncology.\n","authors":["Shoffan Saifullah","Andri Pranolo","Rafał Dreżewski"],"pdf_url":"https://arxiv.org/pdf/2404.05341v1.pdf","comment":"9 Pages, & Figures, 2 Tables, International Conference on Computer\n Science Electronics and Information (ICCSEI 2023)"},{"id":"http://arxiv.org/abs/2404.05331v1","updated":"2024-04-08T09:18:32Z","published":"2024-04-08T09:18:32Z","title":"Mask-ControlNet: Higher-Quality Image Generation with An Additional Mask\n Prompt","summary":" Text-to-image generation has witnessed great progress, especially with the\nrecent advancements in diffusion models. Since texts cannot provide detailed\nconditions like object appearance, reference images are usually leveraged for\nthe control of objects in the generated images. However, existing methods still\nsuffer limited accuracy when the relationship between the foreground and\nbackground is complicated. To address this issue, we develop a framework termed\nMask-ControlNet by introducing an additional mask prompt. Specifically, we\nfirst employ large vision models to obtain masks to segment the objects of\ninterest in the reference image. Then, the object images are employed as\nadditional prompts to facilitate the diffusion model to better understand the\nrelationship between foreground and background regions during image generation.\nExperiments show that the mask prompts enhance the controllability of the\ndiffusion model to maintain higher fidelity to the reference image while\nachieving better image quality. 
Comparison with previous text-to-image\ngeneration methods demonstrates our method's superior quantitative and\nqualitative performance on the benchmark datasets.\n","authors":["Zhiqi Huang","Huixin Xiong","Haoyu Wang","Longguang Wang","Zhiheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.05331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05317v1","updated":"2024-04-08T09:08:43Z","published":"2024-04-08T09:08:43Z","title":"WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A\n Conceptual Architecture","summary":" This work proposes a WebXR-based cross-platform conceptual architecture,\nleveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate\nthe development of an open, accessible, and interoperable metaverse. By\nintroducing the concept of spatial web app, this research contributes to the\ndiscourse on the metaverse, offering an architecture that democratizes access\nto virtual environments and extended reality through the web, and aligns with\nTim Berners-Lee's original vision of the World Wide Web as an open platform in\nthe digital realm.\n","authors":["Giuseppe Macario"],"pdf_url":"https://arxiv.org/pdf/2404.05317v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2303.12017v2","updated":"2024-04-08T09:02:40Z","published":"2023-03-21T16:54:01Z","title":"Learning Optical Flow and Scene Flow with Bidirectional Camera-LiDAR\n Fusion","summary":" In this paper, we study the problem of jointly estimating the optical flow\nand scene flow from synchronized 2D and 3D data. Previous methods either employ\na complex pipeline that splits the joint task into independent stages, or fuse\n2D and 3D information in an ``early-fusion'' or ``late-fusion'' manner. Such\none-size-fits-all approaches suffer from a dilemma of failing to fully utilize\nthe characteristic of each modality or to maximize the inter-modality\ncomplementarity. To address the problem, we propose a novel end-to-end\nframework, which consists of 2D and 3D branches with multiple bidirectional\nfusion connections between them in specific layers. Different from previous\nwork, we apply a point-based 3D branch to extract the LiDAR features, as it\npreserves the geometric structure of point clouds. To fuse dense image features\nand sparse point features, we propose a learnable operator named bidirectional\ncamera-LiDAR fusion module (Bi-CLFM). We instantiate two types of the\nbidirectional fusion pipeline, one based on the pyramidal coarse-to-fine\narchitecture (dubbed CamLiPWC), and the other one based on the recurrent\nall-pairs field transforms (dubbed CamLiRAFT). On FlyingThings3D, both CamLiPWC\nand CamLiRAFT surpass all existing methods and achieve up to a 47.9\\% reduction\nin 3D end-point-error from the best published result. Our best-performing\nmodel, CamLiRAFT, achieves an error of 4.26\\% on the KITTI Scene Flow\nbenchmark, ranking 1st among all submissions with much fewer parameters.\nBesides, our methods have strong generalization performance and the ability to\nhandle non-rigid motion. 
Code is available at\nhttps://github.com/MCG-NJU/CamLiFlow.\n","authors":["Haisong Liu","Tao Lu","Yihui Xu","Jia Liu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2303.12017v2.pdf","comment":"Accepted to TPAMI 2023"},{"id":"http://arxiv.org/abs/2404.05309v1","updated":"2024-04-08T08:57:32Z","published":"2024-04-08T08:57:32Z","title":"CLIPping the Limits: Finding the Sweet Spot for Relevant Images in\n Automated Driving Systems Perception Testing","summary":" Perception systems, especially cameras, are the eyes of automated driving\nsystems. Ensuring that they function reliably and robustly is therefore an\nimportant building block in the automation of vehicles. There are various\napproaches to test the perception of automated driving systems. Ultimately,\nhowever, it always comes down to the investigation of the behavior of\nperception systems under specific input data. Camera images are a crucial part\nof the input data. Image data sets are therefore collected for the testing of\nautomated driving systems, but it is non-trivial to find specific images in\nthese data sets. Thanks to recent developments in neural networks, there are\nnow methods for sorting the images in a data set according to their similarity\nto a prompt in natural language. In order to further automate the provision of\nsearch results, we make a contribution by automating the threshold definition\nin these sorted results and returning only the images relevant to the prompt as\na result. Our focus is on preventing false positives and false negatives\nequally. It is also important that our method is robust and in the case that\nour assumptions are not fulfilled, we provide a fallback solution.\n","authors":["Philipp Rigoll","Laurenz Adolph","Lennart Ries","Eric Sax"],"pdf_url":"https://arxiv.org/pdf/2404.05309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05307v1","updated":"2024-04-08T08:53:54Z","published":"2024-04-08T08:53:54Z","title":"Human Detection from 4D Radar Data in Low-Visibility Field Conditions","summary":" Autonomous driving technology is increasingly being used on public roads and\nin industrial settings such as mines. While it is essential to detect\npedestrians, vehicles, or other obstacles, adverse field conditions negatively\naffect the performance of classical sensors such as cameras or lidars. Radar,\non the other hand, is a promising modality that is less affected by, e.g.,\ndust, smoke, water mist or fog. In particular, modern 4D imaging radars provide\ntarget responses across the range, vertical angle, horizontal angle and Doppler\nvelocity dimensions. We propose TMVA4D, a CNN architecture that leverages this\n4D radar modality for semantic segmentation. The CNN is trained to distinguish\nbetween the background and person classes based on a series of 2D projections\nof the 4D radar data that include the elevation, azimuth, range, and Doppler\nvelocity dimensions. We also outline the process of compiling a novel dataset\nconsisting of data collected in industrial settings with a car-mounted 4D radar\nand describe how the ground-truth labels were generated from reference thermal\nimages. 
Using TMVA4D on this dataset, we achieve an mIoU score of 78.2% and an\nmDice score of 86.1%, evaluated on the two classes background and person.\n","authors":["Mikael Skog","Oleksandr Kotlyar","Vladimír Kubelka","Martin Magnusson"],"pdf_url":"https://arxiv.org/pdf/2404.05307v1.pdf","comment":"Submitted to Radar in Robotics workshop at ICRA 2024"},{"id":"http://arxiv.org/abs/2404.05300v1","updated":"2024-04-08T08:42:47Z","published":"2024-04-08T08:42:47Z","title":"Texture Classification Network Integrating Adaptive Wavelet Transform","summary":" Graves' disease is a common condition that is diagnosed clinically by\ndetermining the smoothness of the thyroid texture and its morphology in\nultrasound images. Currently, the most widely used approach for the automated\ndiagnosis of Graves' disease utilizes Convolutional Neural Networks (CNNs) for\nboth feature extraction and classification. However, these methods demonstrate\nlimited efficacy in capturing texture features. Given the high capacity of\nwavelets in describing texture features, this research integrates learnable\nwavelet modules utilizing the Lifting Scheme into CNNs and incorporates a\nparallel wavelet branch into the ResNet18 model to enhance texture feature\nextraction. Our model can analyze texture features in spatial and frequency\ndomains simultaneously, leading to optimized classification accuracy. We\nconducted experiments on collected ultrasound datasets and publicly available\nnatural image texture datasets; our proposed network achieved 97.27% accuracy\nand 95.60% recall on ultrasound datasets, and 60.765% accuracy on natural image\ntexture datasets, surpassing the accuracy of ResNet and confirming the\neffectiveness of our approach.\n","authors":["Su-Xi Yu","Jing-Yuan He","Yi Wang","Yu-Jiao Cai","Jun Yang","Bo Lin","Wei-Bin Yang","Jian Ruan"],"pdf_url":"https://arxiv.org/pdf/2404.05300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05290v1","updated":"2024-04-08T08:28:19Z","published":"2024-04-08T08:28:19Z","title":"MindSet: Vision. A toolbox for testing DNNs on key psychological\n experiments","summary":" Multiple benchmarks have been developed to assess the alignment between deep\nneural networks (DNNs) and human vision. In almost all cases these benchmarks\nare observational in the sense they are composed of behavioural and brain\nresponses to naturalistic images that have not been manipulated to test\nhypotheses regarding how DNNs or humans perceive and identify objects. Here we\nintroduce the toolbox MindSet: Vision, consisting of a collection of image\ndatasets and related scripts designed to test DNNs on 30 psychological\nfindings. In all experimental conditions, the stimuli are systematically\nmanipulated to test specific hypotheses regarding human visual perception and\nobject recognition. In addition to providing pre-generated datasets of images,\nwe provide code to regenerate these datasets, offering many configurable\nparameters which greatly extend the dataset versatility for different research\ncontexts, and code to facilitate the testing of DNNs on these image datasets\nusing three different methods (similarity judgments, out-of-distribution\nclassification, and decoder method), accessible at\nhttps://github.com/MindSetVision/mindset-vision. We test ResNet-152 on each of\nthese methods as an example of how the toolbox can be used.\n","authors":["Valerio Biscione","Dong Yin","Gaurav Malhotra","Marin Dujmovic","Milton L. Montero","Guillermo Puebla","Federico Adolfi","Rachel F. Heaton","John E. 
Hummel","Benjamin D. Evans","Karim Habashy","Jeffrey S. Bowers"],"pdf_url":"https://arxiv.org/pdf/2404.05290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05285v1","updated":"2024-04-08T08:20:53Z","published":"2024-04-08T08:20:53Z","title":"Detecting Every Object from Events","summary":" Object detection is critical in autonomous driving, and it is more practical\nyet challenging to localize objects of unknown categories: an endeavour known\nas Class-Agnostic Object Detection (CAOD). Existing studies on CAOD\npredominantly rely on ordinary cameras, but these frame-based sensors usually\nhave high latency and limited dynamic range, leading to safety risks in\nreal-world scenarios. In this study, we turn to a new modality enabled by the\nso-called event camera, featured by its sub-millisecond latency and high\ndynamic range, for robust CAOD. We propose Detecting Every Object in Events\n(DEOE), an approach tailored for achieving high-speed, class-agnostic\nopen-world object detection in event-based vision. Built upon the fast\nevent-based backbone: recurrent vision transformer, we jointly consider the\nspatial and temporal consistencies to identify potential objects. The\ndiscovered potential objects are assimilated as soft positive samples to avoid\nbeing suppressed as background. Moreover, we introduce a disentangled\nobjectness head to separate the foreground-background classification and novel\nobject discovery tasks, enhancing the model's generalization in localizing\nnovel objects while maintaining a strong ability to filter out the background.\nExtensive experiments confirm the superiority of our proposed DEOE in\ncomparison with three strong baseline methods that integrate the\nstate-of-the-art event-based object detector with advancements in RGB-based\nCAOD. Our code is available at https://github.com/Hatins/DEOE.\n","authors":["Haitian Zhang","Chang Xu","Xinya Wang","Bingde Liu","Guang Hua","Lei Yu","Wen Yang"],"pdf_url":"https://arxiv.org/pdf/2404.05285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19428v3","updated":"2024-04-08T08:18:33Z","published":"2024-03-28T13:58:05Z","title":"Burst Super-Resolution with Diffusion Models for Improving Perceptual\n Quality","summary":" While burst LR images are useful for improving the SR image quality compared\nwith a single LR image, prior SR networks accepting the burst LR images are\ntrained in a deterministic manner, which is known to produce a blurry SR image.\nIn addition, it is difficult to perfectly align the burst LR images, making the\nSR image more blurry. Since such blurry images are perceptually degraded, we\naim to reconstruct the sharp high-fidelity boundaries. Such high-fidelity\nimages can be reconstructed by diffusion models. However, prior SR methods\nusing the diffusion model are not properly optimized for the burst SR task.\nSpecifically, the reverse process starting from a random sample is not\noptimized for image enhancement and restoration methods, including burst SR. In\nour proposed method, on the other hand, burst LR features are used to\nreconstruct the initial burst SR image that is fed into an intermediate step in\nthe diffusion model. This reverse process from the intermediate step 1) skips\ndiffusion steps for reconstructing the global structure of the image and 2)\nfocuses on steps for refining detailed textures. Our experimental results\ndemonstrate that our method can improve the scores of the perceptual quality\nmetrics. 
Code: https://github.com/placerkyo/BSRD\n","authors":["Kyotaro Tokoro","Kazutoshi Akita","Norimichi Ukita"],"pdf_url":"https://arxiv.org/pdf/2403.19428v3.pdf","comment":"Accepted to IJCNN 2024 (International Joint Conference on Neural\n Networks)"},{"id":"http://arxiv.org/abs/2404.05280v1","updated":"2024-04-08T08:11:56Z","published":"2024-04-08T08:11:56Z","title":"MOSE: Boosting Vision-based Roadside 3D Object Detection with Scene Cues","summary":" 3D object detection based on roadside cameras is an additional way for\nautonomous driving to alleviate the challenges of occlusion and short\nperception range from vehicle cameras. Previous methods for roadside 3D object\ndetection mainly focus on modeling the depth or height of objects, neglecting\nthe stationary nature of cameras and the characteristic of inter-frame consistency. In\nthis work, we propose a novel framework, namely MOSE, for MOnocular 3D object\ndetection with Scene cuEs. The scene cues are the frame-invariant\nscene-specific features, which are crucial for object localization and can be\nintuitively regarded as the height between the surface of the real road and the\nvirtual ground plane. In the proposed framework, a scene cue bank is designed\nto aggregate scene cues from multiple frames of the same scene with a carefully\ndesigned extrinsic augmentation strategy. Then, a transformer-based decoder\nlifts the aggregated scene cues as well as the 3D position embeddings for 3D\nobject location, which boosts generalization ability in heterologous scenes.\nThe extensive experiment results on two public benchmarks demonstrate the\nstate-of-the-art performance of the proposed method, which surpasses the\nexisting methods by a large margin.\n","authors":["Xiahan Chen","Mingjian Chen","Sanli Tang","Yi Niu","Jiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.05280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00916v2","updated":"2024-04-08T08:08:43Z","published":"2024-04-01T04:43:45Z","title":"Gyro-based Neural Single Image Deblurring","summary":" In this paper, we present GyroDeblurNet, a novel single image deblurring\nmethod that utilizes a gyro sensor to effectively resolve the ill-posedness of\nimage deblurring. The gyro sensor provides valuable information about camera\nmotion during exposure time that can significantly improve deblurring quality.\nHowever, effectively exploiting real-world gyro data is challenging due to\nsignificant errors from various sources including sensor noise, the disparity\nbetween the positions of a camera module and a gyro sensor, the absence of\ntranslational motion information, and moving objects whose motions cannot be\ncaptured by a gyro sensor. To handle gyro error, GyroDeblurNet is equipped with\ntwo novel neural network blocks: a gyro refinement block and a gyro deblurring\nblock. The gyro refinement block refines the error-ridden gyro data using the\nblur information from the input image. On the other hand, the gyro deblurring\nblock removes blur from the input image using the refined gyro data and further\ncompensates for gyro error by leveraging the blur information from the input\nimage. For training a neural network with erroneous gyro data, we propose a\ntraining strategy based on curriculum learning. We also introduce a novel\ngyro data embedding scheme to represent real-world intricate camera shakes.\nFinally, we present a synthetic dataset and a real dataset for the training and\nevaluation of gyro-based single image deblurring. 
Our experiments demonstrate\nthat our approach achieves state-of-the-art deblurring quality by effectively\nutilizing erroneous gyro data.\n","authors":["Heemin Yang","Jaesung Rim","Seungyong Lee","Seung-Hwan Baek","Sunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2404.00916v2.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.05274v1","updated":"2024-04-08T08:04:44Z","published":"2024-04-08T08:04:44Z","title":"Deep Optics for Video Snapshot Compressive Imaging","summary":" Video snapshot compressive imaging (SCI) aims to capture a sequence of video\nframes with only a single shot of a 2D detector, whose backbones rest in\noptical modulation patterns (also known as masks) and a computational\nreconstruction algorithm. Advanced deep learning algorithms and mature hardware\nare putting video SCI into practical applications. Yet, there are two clouds in\nthe sunshine of SCI: i) low dynamic range as a victim of high temporal\nmultiplexing, and ii) existing deep learning algorithms' degradation on real\nsystem. To address these challenges, this paper presents a deep optics\nframework to jointly optimize masks and a reconstruction network. Specifically,\nwe first propose a new type of structural mask to realize motion-aware and\nfull-dynamic-range measurement. Considering the motion awareness property in\nmeasurement domain, we develop an efficient network for video SCI\nreconstruction using Transformer to capture long-term temporal dependencies,\ndubbed Res2former. Moreover, sensor response is introduced into the forward\nmodel of video SCI to guarantee end-to-end model training close to real system.\nFinally, we implement the learned structural masks on a digital micro-mirror\ndevice. Experimental results on synthetic and real data validate the\neffectiveness of the proposed framework. We believe this is a milestone for\nreal-world video SCI. The source code and data are available at\nhttps://github.com/pwangcs/DeepOpticsSCI.\n","authors":["Ping Wang","Lishun Wang","Xin Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.05274v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2404.05268v1","updated":"2024-04-08T07:59:04Z","published":"2024-04-08T07:59:04Z","title":"MC$^2$: Multi-concept Guidance for Customized Multi-concept Generation","summary":" Customized text-to-image generation aims to synthesize instantiations of\nuser-specified concepts and has achieved unprecedented progress in handling\nindividual concept. However, when extending to multiple customized concepts,\nexisting methods exhibit limitations in terms of flexibility and fidelity, only\naccommodating the combination of limited types of models and potentially\nresulting in a mix of characteristics from different concepts. In this paper,\nwe introduce the Multi-concept guidance for Multi-concept customization, termed\nMC$^2$, for improved flexibility and fidelity. MC$^2$ decouples the\nrequirements for model architecture via inference time optimization, allowing\nthe integration of various heterogeneous single-concept customized models. It\nadaptively refines the attention weights between visual and textual tokens,\ndirecting image regions to focus on their associated words while diminishing\nthe impact of irrelevant ones. Extensive experiments demonstrate that MC$^2$\neven surpasses previous methods that require additional training in terms of\nconsistency with input prompt and reference images. 
Moreover, MC$^2$ can be\nextended to elevate the compositional capabilities of text-to-image generation,\nyielding appealing results. Code will be publicly available at\nhttps://github.com/JIANGJiaXiu/MC-2.\n","authors":["Jiaxiu Jiang","Yabo Zhang","Kailai Feng","Xiaohe Wu","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.05268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05264v1","updated":"2024-04-08T07:54:18Z","published":"2024-04-08T07:54:18Z","title":"Unbridled Icarus: A Survey of the Potential Perils of Image Inputs in\n Multimodal Large Language Model Security","summary":" Multimodal Large Language Models (MLLMs) demonstrate remarkable capabilities\nthat increasingly influence various aspects of our daily lives, constantly\ndefining the new boundary of Artificial General Intelligence (AGI). Image\nmodalities, enriched with profound semantic information and a more continuous\nmathematical nature compared to other modalities, greatly enhance the\nfunctionalities of MLLMs when integrated. However, this integration serves as a\ndouble-edged sword, providing attackers with expansive vulnerabilities to\nexploit for highly covert and harmful attacks. The pursuit of reliable AI\nsystems like powerful MLLMs has emerged as a pivotal area of contemporary\nresearch. In this paper, we endeavor to demonstrate the multifaceted risks\nassociated with the incorporation of image modalities into MLLMs. Initially, we\ndelineate the foundational components and training processes of MLLMs.\nSubsequently, we construct a threat model, outlining the security\nvulnerabilities intrinsic to MLLMs. Moreover, we analyze and summarize existing\nscholarly discourses on MLLMs' attack and defense mechanisms, culminating in\nsuggestions for future research on MLLM security. Through this\ncomprehensive analysis, we aim to deepen the academic understanding of MLLM\nsecurity challenges and propel forward the development of trustworthy MLLM\nsystems.\n","authors":["Yihe Fan","Yuxin Cao","Ziyu Zhao","Ziyao Liu","Shaofeng Li"],"pdf_url":"https://arxiv.org/pdf/2404.05264v1.pdf","comment":"8 pages, 1 figure"},{"id":"http://arxiv.org/abs/2404.00936v3","updated":"2024-04-08T07:52:38Z","published":"2024-04-01T05:46:15Z","title":"A Comprehensive Review of Knowledge Distillation in Computer Vision","summary":" Deep learning techniques have been demonstrated to surpass preceding\ncutting-edge machine learning techniques in recent years, with computer vision\nbeing one of the most prominent examples. However, deep learning models suffer\nfrom significant drawbacks when deployed in resource-constrained environments\ndue to their large model size and high complexity. Knowledge Distillation is\none of the prominent solutions to overcome this challenge. This review paper\nexamines the current state of research on knowledge distillation, a technique\nfor compressing complex models into smaller and simpler ones. The paper\nprovides an overview of the major principles and techniques associated with\nknowledge distillation and reviews the applications of knowledge distillation\nin the domain of computer vision. 
The review focuses on the benefits of\nknowledge distillation, as well as the problems that must be overcome to\nimprove its effectiveness.\n","authors":["Sheikh Musa Kaleem","Tufail Rouf","Gousia Habib","Tausifa jan Saleem","Brejesh Lall"],"pdf_url":"https://arxiv.org/pdf/2404.00936v3.pdf","comment":"36 pages, 10 figures"},{"id":"http://arxiv.org/abs/2309.03467v2","updated":"2024-04-08T07:49:47Z","published":"2023-09-07T03:22:59Z","title":"Autoregressive Omni-Aware Outpainting for Open-Vocabulary 360-Degree\n Image Generation","summary":" A 360-degree (omni-directional) image provides an all-encompassing spherical\nview of a scene. Recently, there has been an increasing interest in\nsynthesising 360-degree images from conventional narrow field of view (NFoV)\nimages captured by digital cameras and smartphones, for providing immersive\nexperiences in various scenarios such as virtual reality. Yet, existing methods\ntypically fall short in synthesizing intricate visual details or ensuring that the\ngenerated images align consistently with user-provided prompts. In this study,\nautoregressive omni-aware generative network (AOG-Net) is proposed for\n360-degree image generation by out-painting an incomplete 360-degree image\nprogressively with NFoV and text guidances jointly or individually. This\nautoregressive scheme not only allows for deriving finer-grained and\ntext-consistent patterns by dynamically generating and adjusting the process\nbut also offers users greater flexibility to edit their conditions throughout\nthe generation process. A global-local conditioning mechanism is devised to\ncomprehensively formulate the outpainting guidance in each autoregressive step.\nText guidances, omni-visual cues, NFoV inputs and omni-geometry are encoded and\nfurther formulated with cross-attention based transformers into a global stream\nand a local stream into a conditioned generative backbone model. As AOG-Net is\ncompatible with leveraging large-scale models for the conditional encoder and the\ngenerative prior, it enables the generation to use extensive open-vocabulary\ntext guidances. Comprehensive experiments on two commonly used 360-degree image\ndatasets for both indoor and outdoor settings demonstrate the state-of-the-art\nperformance of our proposed method. Our code will be made publicly available.\n","authors":["Zhuqiang Lu","Kun Hu","Chaoyue Wang","Lei Bai","Zhiyong Wang"],"pdf_url":"https://arxiv.org/pdf/2309.03467v2.pdf","comment":"Accepted by AAAI 24"},{"id":"http://arxiv.org/abs/2404.05258v1","updated":"2024-04-08T07:47:28Z","published":"2024-04-08T07:47:28Z","title":"Unsupervised Band Selection Using Fused HSI and LiDAR Attention\n Integrating With Autoencoder","summary":" Band selection in hyperspectral imaging (HSI) is critical for optimising data\nprocessing and enhancing analytical accuracy. Traditional approaches have\npredominantly concentrated on analysing spectral and pixel characteristics\nwithin individual bands independently. These approaches overlook the potential\nbenefits of integrating multiple data sources, such as Light Detection and\nRanging (LiDAR), and are further challenged by the limited availability of\nlabeled data in HSI processing, which represents a significant obstacle. To\naddress these challenges, this paper introduces a novel unsupervised band\nselection framework that incorporates attention mechanisms and an Autoencoder\nfor reconstruction-based band selection. 
Our methodology distinctively\nintegrates HSI with LiDAR data through an attention score, using a\nconvolutional Autoencoder to process the combined feature mask. This fusion\neffectively captures essential spatial and spectral features and reduces\nredundancy in hyperspectral datasets. A comprehensive comparative analysis of\nour innovative fused band selection approach is performed against existing\nunsupervised band selection and fusion models. We used data sets such as\nHouston 2013, Trento, and MUUFLE for our experiments. The results demonstrate\nthat our method achieves superior classification accuracy and significantly\noutperforms existing models. This enhancement in HSI band selection,\nfacilitated by the incorporation of LiDAR features, underscores the\nconsiderable advantages of integrating features from different sources.\n","authors":["Judy X Yang","Jun Zhou","Jing Wang","Hui Tian","Alan Wee Chung Liew"],"pdf_url":"https://arxiv.org/pdf/2404.05258v1.pdf","comment":"13 pages, 13 figures, 6 tables"},{"id":"http://arxiv.org/abs/2404.05256v1","updated":"2024-04-08T07:43:23Z","published":"2024-04-08T07:43:23Z","title":"Text-to-Image Synthesis for Any Artistic Styles: Advancements in\n Personalized Artistic Image Generation via Subdivision and Dual Binding","summary":" Recent advancements in text-to-image models, such as Stable Diffusion, have\ndemonstrated their ability to synthesize visual images through natural language\nprompts. One approach of personalizing text-to-image models, exemplified by\nDreamBooth, fine-tunes the pre-trained model by binding unique text identifiers\nwith a few images of a specific subject. Although existing fine-tuning methods\nhave demonstrated competence in rendering images according to the styles of\nfamous painters, it is still challenging to learn to produce images\nencapsulating distinct art styles due to abstract and broad visual perceptions\nof stylistic attributes such as lines, shapes, textures, and colors. In this\npaper, we introduce a new method, Single-StyleForge, for personalization. It\nfine-tunes pre-trained text-to-image diffusion models to generate diverse\nimages in specified styles from text prompts. By using around 15-20 images of\nthe target style, the approach establishes a foundational binding of a unique\ntoken identifier with a broad range of the target style. It also utilizes\nauxiliary images to strengthen this binding, resulting in offering specific\nguidance on representing elements such as persons in a target style-consistent\nmanner. In addition, we present ways to improve the quality of style and\ntext-image alignment through a method called Multi-StyleForge, which inherits\nthe strategy used in StyleForge and learns tokens in multiple. 
Experimental\nevaluation conducted on six distinct artistic styles demonstrates substantial\nimprovements in both the quality of generated images and the perceptual\nfidelity metrics, such as FID, KID, and CLIP scores.\n","authors":["Junseo Park","Beomseok Ko","Hyeryung Jang"],"pdf_url":"https://arxiv.org/pdf/2404.05256v1.pdf","comment":"20 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.05253v1","updated":"2024-04-08T07:34:39Z","published":"2024-04-08T07:34:39Z","title":"CodeEnhance: A Codebook-Driven Approach for Low-Light Image Enhancement","summary":" Low-light image enhancement (LLIE) aims to improve low-illumination images.\nHowever, existing methods face two challenges: (1) uncertainty in restoration\nfrom diverse brightness degradations; (2) loss of texture and color information\ncaused by noise suppression and light enhancement. In this paper, we propose a\nnovel enhancement approach, CodeEnhance, by leveraging quantized priors and\nimage refinement to address these challenges. In particular, we reframe LLIE as\nlearning an image-to-code mapping from low-light images to a discrete codebook,\nwhich has been learned from high-quality images. To enhance this process, a\nSemantic Embedding Module (SEM) is introduced to integrate semantic information\nwith low-level features, and a Codebook Shift (CS) mechanism is designed to adapt\nthe pre-learned codebook to better suit the distinct characteristics of our\nlow-light dataset. Additionally, we present an Interactive Feature\nTransformation (IFT) module to refine texture and color information during\nimage reconstruction, allowing for interactive enhancement based on user\npreferences. Extensive experiments on both real-world and synthetic benchmarks\ndemonstrate that the incorporation of prior knowledge and controllable\ninformation transfer significantly enhances LLIE performance in terms of\nquality and fidelity. The proposed CodeEnhance exhibits superior robustness to\nvarious degradations, including uneven illumination, noise, and color\ndistortion.\n","authors":["Xu Wu","XianXu Hou","Zhihui Lai","Jie Zhou","Ya-nan Zhang","Witold Pedrycz","Linlin Shen"],"pdf_url":"https://arxiv.org/pdf/2404.05253v1.pdf","comment":"10 pages, 13 figures"},{"id":"http://arxiv.org/abs/2312.03203v3","updated":"2024-04-08T07:19:52Z","published":"2023-12-06T00:46:30Z","title":"Feature 3DGS: Supercharging 3D Gaussian Splatting to Enable Distilled\n Feature Fields","summary":" 3D scene representations have gained immense popularity in recent years.\nMethods that use Neural Radiance fields are versatile for traditional tasks\nsuch as novel view synthesis. In recent times, some work has emerged that aims\nto extend the functionality of NeRF beyond view synthesis, for semantically\naware tasks such as editing and segmentation using 3D feature field\ndistillation from 2D foundation models. However, these methods have two major\nlimitations: (a) they are limited by the rendering speed of NeRF pipelines, and\n(b) implicitly represented feature fields suffer from continuity artifacts\nreducing feature quality. Recently, 3D Gaussian Splatting has shown\nstate-of-the-art performance on real-time radiance field rendering. In this\nwork, we go one step further: in addition to radiance field rendering, we\nenable 3D Gaussian splatting on arbitrary-dimension semantic features via 2D\nfoundation model distillation. 
This translation is not straightforward: naively\nincorporating feature fields in the 3DGS framework encounters significant\nchallenges, notably the disparities in spatial resolution and channel\nconsistency between RGB images and feature maps. We propose architectural and\ntraining changes to efficiently avert this problem. Our proposed method is\ngeneral, and our experiments showcase novel view semantic segmentation,\nlanguage-guided editing and segment anything through learning feature fields\nfrom state-of-the-art 2D foundation models such as SAM and CLIP-LSeg. Across\nexperiments, our distillation method is able to provide comparable or better\nresults, while being significantly faster to both train and render.\nAdditionally, to the best of our knowledge, we are the first method to enable\npoint and bounding-box prompting for radiance field manipulation, by leveraging\nthe SAM model. Project website at: https://feature-3dgs.github.io/\n","authors":["Shijie Zhou","Haoran Chang","Sicheng Jiang","Zhiwen Fan","Zehao Zhu","Dejia Xu","Pradyumna Chari","Suya You","Zhangyang Wang","Achuta Kadambi"],"pdf_url":"https://arxiv.org/pdf/2312.03203v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05238v1","updated":"2024-04-08T07:09:15Z","published":"2024-04-08T07:09:15Z","title":"Allowing humans to interactively guide machines where to look does not\n always improve a human-AI team's classification accuracy","summary":" Via thousands of papers in Explainable AI (XAI), attention maps\n\\cite{vaswani2017attention} and feature attribution maps \\cite{bansal2020sam}\nhave been established as a common means for explaining the input features that\nare important to AI's decisions. It is an interesting but unexplored question\nwhether allowing users to edit the importance scores of input features at test\ntime would improve the human-AI team's accuracy on downstream tasks. In this\npaper, we address this question by taking CHM-Corr, a state-of-the-art,\nante-hoc explanation method \\cite{taesiri2022visual} that first predicts\npatch-wise correspondences between the input and the training-set images, and\nthen uses them to make classification decisions. We build an interactive\ninterface on top of CHM-Corr, enabling users to directly edit the initial\nfeature attribution map provided by CHM-Corr. Via our CHM-Corr++ interface,\nusers gain insights into if, when, and how the model changes its outputs,\nenhancing understanding beyond static explanations. Our user study with 18\nmachine learning researchers who performed $\\sim$1,400 decisions shows that our\ninteractive approach does not improve user accuracy on CUB-200 bird image\nclassification over static explanations. This challenges the belief that\ninteractivity inherently boosts XAI\neffectiveness~\\cite{sokol2020one,sun2022exploring,shen2024towards,singh2024rethinking,mindlin2024beyond,lakkaraju2022rethinking,cheng2019explaining,liu2021understanding}\nand raises needs for future research. Our work contributes to the field by\nopen-sourcing an interactive tool for manipulating model attention, and it lays\nthe groundwork for future research to enable effective human-AI interaction in\ncomputer vision. We release code and data on\n\\href{https://anonymous.4open.science/r/CHMCorrPlusPlus/}{github}. Our\ninterface are available \\href{http://137.184.82.109:7080/}{here}.\n","authors":["Giang Nguyen","Mohammad Reza Taesiri","Sunnie S. Y. 
Kim","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05238v1.pdf","comment":"Accepted for presentation at the XAI4CV Workshop, part of the CVPR\n 2024 proceedings"},{"id":"http://arxiv.org/abs/2312.07246v2","updated":"2024-04-08T07:07:02Z","published":"2023-12-12T13:22:44Z","title":"Unifying Correspondence, Pose and NeRF for Pose-Free Novel View\n Synthesis from Stereo Pairs","summary":" This work delves into the task of pose-free novel view synthesis from stereo\npairs, a challenging and pioneering task in 3D vision. Our innovative\nframework, unlike any before, seamlessly integrates 2D correspondence matching,\ncamera pose estimation, and NeRF rendering, fostering a synergistic enhancement\nof these tasks. We achieve this through designing an architecture that utilizes\na shared representation, which serves as a foundation for enhanced 3D geometry\nunderstanding. Capitalizing on the inherent interplay between the tasks, our\nunified framework is trained end-to-end with the proposed training strategy to\nimprove overall model accuracy. Through extensive evaluations across diverse\nindoor and outdoor scenes from two real-world datasets, we demonstrate that our\napproach achieves substantial improvement over previous methodologies,\nespecially in scenarios characterized by extreme viewpoint changes and the\nabsence of accurate camera poses.\n","authors":["Sunghwan Hong","Jaewoo Jung","Heeseong Shin","Jiaolong Yang","Seungryong Kim","Chong Luo"],"pdf_url":"https://arxiv.org/pdf/2312.07246v2.pdf","comment":"Project page: https://ku-cvlab.github.io/CoPoNeRF/ CVPR2024 camera\n ready version (Highlight)"},{"id":"http://arxiv.org/abs/2404.05236v1","updated":"2024-04-08T07:01:42Z","published":"2024-04-08T07:01:42Z","title":"Stylizing Sparse-View 3D Scenes with Hierarchical Neural Representation","summary":" Recently, a surge of 3D style transfer methods has been proposed that\nleverage the scene reconstruction power of a pre-trained neural radiance field\n(NeRF). To successfully stylize a scene this way, one must first reconstruct a\nphoto-realistic radiance field from collected images of the scene. However,\nwhen only sparse input views are available, pre-trained few-shot NeRFs often\nsuffer from high-frequency artifacts, which are generated as a by-product of\nhigh-frequency details for improving reconstruction quality. Is it possible to\ngenerate more faithful stylized scenes from sparse inputs by directly\noptimizing encoding-based scene representation with target style? In this\npaper, we consider the stylization of sparse-view scenes in terms of\ndisentangling content semantics and style textures. We propose a coarse-to-fine\nsparse-view scene stylization framework, where a novel hierarchical\nencoding-based neural representation is designed to generate high-quality\nstylized scenes directly from implicit scene representations. We also propose a\nnew optimization strategy with content strength annealing to achieve realistic\nstylization and better content preservation. Extensive experiments demonstrate\nthat our method can achieve high-quality stylization of sparse-view scenes and\noutperforms fine-tuning-based baselines in terms of stylization quality and\nefficiency.\n","authors":["Y. Wang","A. Gao","Y. Gong","Y. 
Zeng"],"pdf_url":"https://arxiv.org/pdf/2404.05236v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05231v1","updated":"2024-04-08T06:53:30Z","published":"2024-04-08T06:53:30Z","title":"PromptAD: Learning Prompts with only Normal Samples for Few-Shot Anomaly\n Detection","summary":" The vision-language model has brought great improvement to few-shot\nindustrial anomaly detection, which usually needs to design of hundreds of\nprompts through prompt engineering. For automated scenarios, we first use\nconventional prompt learning with many-class paradigm as the baseline to\nautomatically learn prompts but found that it can not work well in one-class\nanomaly detection. To address the above problem, this paper proposes a\none-class prompt learning method for few-shot anomaly detection, termed\nPromptAD. First, we propose semantic concatenation which can transpose normal\nprompts into anomaly prompts by concatenating normal prompts with anomaly\nsuffixes, thus constructing a large number of negative samples used to guide\nprompt learning in one-class setting. Furthermore, to mitigate the training\nchallenge caused by the absence of anomaly images, we introduce the concept of\nexplicit anomaly margin, which is used to explicitly control the margin between\nnormal prompt features and anomaly prompt features through a hyper-parameter.\nFor image-level/pixel-level anomaly detection, PromptAD achieves first place in\n11/12 few-shot settings on MVTec and VisA.\n","authors":["Xiaofan Li","Zhizhong Zhang","Xin Tan","Chengwei Chen","Yanyun Qu","Yuan Xie","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2404.05231v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.05225v1","updated":"2024-04-08T06:40:28Z","published":"2024-04-08T06:40:28Z","title":"LayoutLLM: Layout Instruction Tuning with Large Language Models for\n Document Understanding","summary":" Recently, leveraging large language models (LLMs) or multimodal large\nlanguage models (MLLMs) for document understanding has been proven very\npromising. However, previous works that employ LLMs/MLLMs for document\nunderstanding have not fully explored and utilized the document layout\ninformation, which is vital for precise document understanding. In this paper,\nwe propose LayoutLLM, an LLM/MLLM based method for document understanding. The\ncore of LayoutLLM is a layout instruction tuning strategy, which is specially\ndesigned to enhance the comprehension and utilization of document layouts. The\nproposed layout instruction tuning strategy consists of two components:\nLayout-aware Pre-training and Layout-aware Supervised Fine-tuning. To capture\nthe characteristics of document layout in Layout-aware Pre-training, three\ngroups of pre-training tasks, corresponding to document-level, region-level and\nsegment-level information, are introduced. Furthermore, a novel module called\nlayout chain-of-thought (LayoutCoT) is devised to enable LayoutLLM to focus on\nregions relevant to the question and generate accurate answers. LayoutCoT is\neffective for boosting the performance of document understanding. Meanwhile, it\nbrings a certain degree of interpretability, which could facilitate manual\ninspection and correction. Experiments on standard benchmarks show that the\nproposed LayoutLLM significantly outperforms existing methods that adopt\nopen-source 7B LLMs/MLLMs for document understanding. 
The training data of the\nLayoutLLM is publicly available at\nhttps://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/DocumentUnderstanding/LayoutLLM\n","authors":["Chuwei Luo","Yufan Shen","Zhaoqing Zhu","Qi Zheng","Zhi Yu","Cong Yao"],"pdf_url":"https://arxiv.org/pdf/2404.05225v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05220v1","updated":"2024-04-08T06:32:11Z","published":"2024-04-08T06:32:11Z","title":"StylizedGS: Controllable Stylization for 3D Gaussian Splatting","summary":" With the rapid development of XR, 3D generation and editing are becoming more\nand more important, among which, stylization is an important tool of 3D\nappearance editing. It can achieve consistent 3D artistic stylization given a\nsingle reference style image and thus is a user-friendly editing way. However,\nrecent NeRF-based 3D stylization methods face efficiency issues that affect the\nactual user experience and the implicit nature limits its ability to transfer\nthe geometric pattern styles. Additionally, the ability for artists to exert\nflexible control over stylized scenes is considered highly desirable, fostering\nan environment conducive to creative exploration. In this paper, we introduce\nStylizedGS, a 3D neural style transfer framework with adaptable control over\nperceptual factors based on 3D Gaussian Splatting (3DGS) representation. The\n3DGS brings the benefits of high efficiency. We propose a GS filter to\neliminate floaters in the reconstruction which affects the stylization effects\nbefore stylization. Then the nearest neighbor-based style loss is introduced to\nachieve stylization by fine-tuning the geometry and color parameters of 3DGS,\nwhile a depth preservation loss with other regularizations is proposed to\nprevent the tampering of geometry content. Moreover, facilitated by specially\ndesigned losses, StylizedGS enables users to control color, stylized scale and\nregions during the stylization to possess customized capabilities. Our method\ncan attain high-quality stylization results characterized by faithful\nbrushstrokes and geometric consistency with flexible controls. Extensive\nexperiments across various scenes and styles demonstrate the effectiveness and\nefficiency of our method concerning both stylization quality and inference FPS.\n","authors":["Dingxi Zhang","Zhuoxun Chen","Yu-Jie Yuan","Fang-Lue Zhang","Zhenliang He","Shiguang Shan","Lin Gao"],"pdf_url":"https://arxiv.org/pdf/2404.05220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05773v2","updated":"2024-04-08T06:28:13Z","published":"2024-02-08T16:00:25Z","title":"UAV-Rain1k: A Benchmark for Raindrop Removal from UAV Aerial Imagery","summary":" Raindrops adhering to the lens of UAVs can obstruct visibility of the\nbackground scene and degrade image quality. Despite recent progress in image\nderaining methods and datasets, there is a lack of focus on raindrop removal\nfrom UAV aerial imagery due to the unique challenges posed by varying angles\nand rapid movement during drone flight. To fill the gap in this research, we\nfirst construct a new benchmark dataset for removing raindrops from UAV images,\ncalled UAV-Rain1k. In this letter, we provide a dataset generation pipeline,\nwhich includes modeling raindrop shapes using Blender, collecting background\nimages from various UAV angles, random sampling of rain masks and etc. 
Based on\nthe proposed benchmark, we further present a comprehensive evaluation of\nexisting representative image deraining algorithms, and reveal future research\nopportunities worth exploring. The proposed dataset is publicly available at\nhttps://github.com/cschenxiang/UAV-Rain1k.\n","authors":["Wenhui Chang","Hongming Chen","Xin He","Xiang Chen","Liangduo Shen"],"pdf_url":"https://arxiv.org/pdf/2402.05773v2.pdf","comment":"Accepted by IEEE/CVF Conference on Computer Vision and Pattern\n Recognition Workshops (CVPRW) 2024"},{"id":"http://arxiv.org/abs/2312.17118v3","updated":"2024-04-08T06:23:12Z","published":"2023-12-28T16:54:53Z","title":"Fully Sparse 3D Occupancy Prediction","summary":" Occupancy prediction plays a pivotal role in autonomous driving. Previous\nmethods typically construct dense 3D volumes, neglecting the inherent sparsity\nof the scene and suffering high computational costs. To bridge the gap, we\nintroduce a novel fully sparse occupancy network, termed SparseOcc. SparseOcc\ninitially reconstructs a sparse 3D representation from visual inputs and\nsubsequently predicts semantic/instance occupancy from the 3D sparse\nrepresentation by sparse queries. A mask-guided sparse sampling is designed to\nenable sparse queries to interact with 2D features in a fully sparse manner,\nthereby circumventing costly dense features or global attention. Additionally,\nwe design a thoughtful ray-based evaluation metric, namely RayIoU, to solve the\ninconsistency penalty along depths raised in traditional voxel-level mIoU\ncriteria. SparseOcc demonstrates its effectiveness by achieving a RayIoU of\n34.0, while maintaining a real-time inference speed of 17.3 FPS, with 7 history\nframes inputs. By incorporating more preceding frames to 15, SparseOcc\ncontinuously improves its performance to 35.1 RayIoU without whistles and\nbells. Code is available at https://github.com/MCG-NJU/SparseOcc.\n","authors":["Haisong Liu","Yang Chen","Haiguang Wang","Zetong Yang","Tianyu Li","Jia Zeng","Li Chen","Hongyang Li","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.17118v3.pdf","comment":"Add new metric: RayIoU"},{"id":"http://arxiv.org/abs/2404.05218v1","updated":"2024-04-08T06:15:13Z","published":"2024-04-08T06:15:13Z","title":"Multi-agent Long-term 3D Human Pose Forecasting via Interaction-aware\n Trajectory Conditioning","summary":" Human pose forecasting garners attention for its diverse applications.\nHowever, challenges in modeling the multi-modal nature of human motion and\nintricate interactions among agents persist, particularly with longer\ntimescales and more agents. In this paper, we propose an interaction-aware\ntrajectory-conditioned long-term multi-agent human pose forecasting model,\nutilizing a coarse-to-fine prediction approach: multi-modal global trajectories\nare initially forecasted, followed by respective local pose forecasts\nconditioned on each mode. In doing so, our Trajectory2Pose model introduces a\ngraph-based agent-wise interaction module for a reciprocal forecast of local\nmotion-conditioned global trajectory and trajectory-conditioned local pose. Our\nmodel effectively handles the multi-modality of human motion and the complexity\nof long-term multi-agent interactions, improving performance in complex\nenvironments. 
Furthermore, we address the lack of long-term (6s+) multi-agent\n(5+) datasets by constructing a new dataset from real-world images and 2D\nannotations, enabling a comprehensive evaluation of our proposed model.\nState-of-the-art prediction performance on both complex and simpler datasets\nconfirms the generalized effectiveness of our method. The code is available at\nhttps://github.com/Jaewoo97/T2P.\n","authors":["Jaewoo Jeong","Daehee Park","Kuk-Jin Yoon"],"pdf_url":"https://arxiv.org/pdf/2404.05218v1.pdf","comment":"2024 CVPR Highlight"},{"id":"http://arxiv.org/abs/2404.02135v3","updated":"2024-04-08T06:11:48Z","published":"2024-04-02T17:48:46Z","title":"Enhancing Ship Classification in Optical Satellite Imagery: Integrating\n Convolutional Block Attention Module with ResNet for Improved Performance","summary":" This study presents an advanced Convolutional Neural Network (CNN)\narchitecture for ship classification from optical satellite imagery,\nsignificantly enhancing performance through the integration of the\nConvolutional Block Attention Module (CBAM) and additional architectural\ninnovations. Building upon the foundational ResNet50 model, we first\nincorporated a standard CBAM to direct the model's focus towards more\ninformative features, achieving an accuracy of 87% compared to the baseline\nResNet50's 85%. Further augmentations involved multi-scale feature integration,\ndepthwise separable convolutions, and dilated convolutions, culminating in the\nEnhanced ResNet Model with Improved CBAM. This model demonstrated a remarkable\naccuracy of 95%, with precision, recall, and f1-scores all witnessing\nsubstantial improvements across various ship classes. The bulk carrier and oil\ntanker classes, in particular, showcased nearly perfect precision and recall\nrates, underscoring the model's enhanced capability in accurately identifying\nand classifying ships. Attention heatmap analyses further validated the\nimproved model's efficacy, revealing a more focused attention on relevant ship\nfeatures, regardless of background complexities. These findings underscore the\npotential of integrating attention mechanisms and architectural innovations in\nCNNs for high-resolution satellite imagery classification. The study navigates\nthrough the challenges of class imbalance and computational costs, proposing\nfuture directions towards scalability and adaptability in new or rare ship type\nrecognition. This research lays a groundwork for the application of advanced\ndeep learning techniques in the domain of remote sensing, offering insights\ninto scalable and efficient satellite image classification.\n","authors":["Ryan Donghan Kwon","Gangjoo Robin Nam","Jisoo Tak","Junseob Shin","Hyerin Cha","Yeom Hyeok","Seung Won Lee"],"pdf_url":"https://arxiv.org/pdf/2404.02135v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05215v1","updated":"2024-04-08T06:07:32Z","published":"2024-04-08T06:07:32Z","title":"Spatio-Temporal Attention and Gaussian Processes for Personalized Video\n Gaze Estimation","summary":" Gaze is an essential prompt for analyzing human behavior and attention.\nRecently, there has been an increasing interest in determining gaze direction\nfrom facial videos. However, video gaze estimation faces significant\nchallenges, such as understanding the dynamic evolution of gaze in video\nsequences, dealing with static backgrounds, and adapting to variations in\nillumination. 
To address these challenges, we propose a simple and novel deep\nlearning model designed to estimate gaze from videos, incorporating a\nspecialized attention module. Our method employs a spatial attention mechanism\nthat tracks spatial dynamics within videos. This technique enables accurate\ngaze direction prediction through a temporal sequence model, adeptly\ntransforming spatial observations into temporal insights, thereby significantly\nimproving gaze estimation accuracy. Additionally, our approach integrates\nGaussian processes to include individual-specific traits, facilitating the\npersonalization of our model with just a few labeled samples. Experimental\nresults confirm the efficacy of the proposed approach, demonstrating its\nsuccess in both within-dataset and cross-dataset settings. Specifically, our\nproposed approach achieves state-of-the-art performance on the Gaze360 dataset,\nimproving by $2.5^\\circ$ without personalization. Further, by personalizing the\nmodel with just three samples, we achieved an additional improvement of\n$0.8^\\circ$. The code and pre-trained models are available at\n\\url{https://github.com/jswati31/stage}.\n","authors":["Swati Jindal","Mohit Yadav","Roberto Manduchi"],"pdf_url":"https://arxiv.org/pdf/2404.05215v1.pdf","comment":"Accepted at CVPR 2024 Gaze workshop"},{"id":"http://arxiv.org/abs/2404.05212v1","updated":"2024-04-08T05:58:07Z","published":"2024-04-08T05:58:07Z","title":"DiffCJK: Conditional Diffusion Model for High-Quality and Wide-coverage\n CJK Character Generation","summary":" Chinese, Japanese, and Korean (CJK), with a vast number of native speakers,\nhas profound influence on society and culture. The typesetting of CJK languages\ncarries a wide range of requirements due to the complexity of their scripts and\nunique literary traditions. A critical aspect of this typesetting process is\nthat CJK fonts need to provide a set of consistent-looking glyphs for\napproximately one hundred thousand characters. However, creating such a font is\ninherently labor-intensive and expensive, which significantly hampers the\ndevelopment of new CJK fonts for typesetting, historical, aesthetic, or\nartistic purposes.\n To bridge this gap, we are motivated by recent advancements in\ndiffusion-based generative models and propose a novel diffusion method for\ngenerating glyphs in a targeted style from a \\emph{single} conditioned,\nstandard glyph form. Our experiments show that our method is capable of\ngenerating fonts of both printed and hand-written styles, the latter of which\npresents a greater challenge. Moreover, our approach shows remarkable zero-shot\ngeneralization capabilities for non-CJK but Chinese-inspired scripts. We also\nshow our method facilitates smooth style interpolation and generates bitmap\nimages suitable for vectorization, which is crucial in the font creation\nprocess. In summary, our proposed method opens the door to high-quality,\ngenerative model-assisted font creation for CJK characters, for both\ntypesetting and artistic endeavors.\n","authors":["Yingtao Tian"],"pdf_url":"https://arxiv.org/pdf/2404.05212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05211v1","updated":"2024-04-08T05:50:46Z","published":"2024-04-08T05:50:46Z","title":"Multi-level Graph Subspace Contrastive Learning for Hyperspectral Image\n Clustering","summary":" Hyperspectral image (HSI) clustering is a challenging task due to its high\ncomplexity. 
Although subspace clustering shows impressive performance for HSI,\ntraditional methods tend to ignore the global-local interaction in HSI data. In\nthis study, we propose a multi-level graph subspace contrastive learning\n(MLGSC) for HSI clustering. The model is divided into the following main parts.\nGraph convolution subspace construction: utilizing spectral and texture\nfeatures to construct two graph convolution views. Local-global graph\nrepresentation: local graph representations were obtained by step-by-step\nconvolutions and a more representative global graph representation was obtained\nusing an attention-based pooling strategy. Multi-level graph subspace\ncontrastive learning: multi-level contrastive learning was conducted to obtain\nlocal-global joint graph representations, to improve the consistency of the\npositive samples between views, and to obtain more robust graph embeddings.\nSpecifically, graph-level contrastive learning is used to better learn global\nrepresentations of HSI data. Node-level intra-view and inter-view contrastive\nlearning is designed to learn joint representations of local regions of HSI.\nThe proposed model is evaluated on four popular HSI datasets: Indian Pines,\nPavia University, Houston, and Xu Zhou. The overall accuracies are 97.75%,\n99.96%, 92.28%, and 95.73%, which significantly outperform the current\nstate-of-the-art clustering methods.\n","authors":["Jingxin Wang","Renxiang Guan","Kainan Gao","Zihao Li","Hao Li","Xianju Li","Chang Tang"],"pdf_url":"https://arxiv.org/pdf/2404.05211v1.pdf","comment":"IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.05210v1","updated":"2024-04-08T05:45:03Z","published":"2024-04-08T05:45:03Z","title":"Bidirectional Long-Range Parser for Sequential Data Understanding","summary":" The transformer is a powerful data modelling framework responsible for\nremarkable performance on a wide range of tasks. However, they are limited in\nterms of scalability as it is suboptimal and inefficient to process\nlong-sequence data. To this purpose we introduce BLRP (Bidirectional Long-Range\nParser), a novel and versatile attention mechanism designed to increase\nperformance and efficiency on long-sequence tasks. It leverages short and long\nrange heuristics in the form of a local sliding window approach combined with a\nglobal bidirectional latent space synthesis technique. We show the benefits and\nversatility of our approach on vision and language domains by demonstrating\ncompetitive results against state-of-the-art methods on the Long-Range-Arena\nand CIFAR benchmarks together with ablations demonstrating the computational\nefficiency.\n","authors":["George Leotescu","Daniel Voinea","Alin-Ionut Popa"],"pdf_url":"https://arxiv.org/pdf/2404.05210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05207v1","updated":"2024-04-08T05:23:12Z","published":"2024-04-08T05:23:12Z","title":"iVPT: Improving Task-relevant Information Sharing in Visual Prompt\n Tuning by Cross-layer Dynamic Connection","summary":" Recent progress has shown great potential of visual prompt tuning (VPT) when\nadapting pre-trained vision transformers to various downstream tasks. However,\nmost existing solutions independently optimize prompts at each layer, thereby\nneglecting the usage of task-relevant information encoded in prompt tokens\nacross layers. Additionally, existing prompt structures are prone to\ninterference from task-irrelevant noise in input images, which can do harm to\nthe sharing of task-relevant information. 
In this paper, we propose a novel VPT\napproach, \\textbf{iVPT}. It innovatively incorporates a cross-layer dynamic\nconnection (CDC) for input prompt tokens from adjacent layers, enabling\neffective sharing of task-relevant information. Furthermore, we design a\ndynamic aggregation (DA) module that facilitates selective sharing of\ninformation between layers. The combination of CDC and DA enhances the\nflexibility of the attention process within the VPT framework. Building upon\nthese foundations, iVPT introduces an attentive reinforcement (AR) mechanism,\nby automatically identifying salient image tokens, which are further enhanced\nby prompt tokens in an additive manner. Extensive experiments on 24 image\nclassification and semantic segmentation benchmarks clearly demonstrate the\nadvantage of the proposed iVPT, compared to the state-of-the-art counterparts.\n","authors":["Nan Zhou","Jiaxin Chen","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2404.05207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05206v1","updated":"2024-04-08T05:19:28Z","published":"2024-04-08T05:19:28Z","title":"SoundingActions: Learning How Actions Sound from Narrated Egocentric\n Videos","summary":" We propose a novel self-supervised embedding to learn how actions sound from\nnarrated in-the-wild egocentric videos. Whereas existing methods rely on\ncurated data with known audio-visual correspondence, our multimodal\ncontrastive-consensus coding (MC3) embedding reinforces the associations\nbetween audio, language, and vision when all modality pairs agree, while\ndiminishing those associations when any one pair does not. We show our approach\ncan successfully discover how the long tail of human actions sound from\negocentric video, outperforming an array of recent multimodal embedding\ntechniques on two datasets (Ego4D and EPIC-Sounds) and multiple cross-modal\ntasks.\n","authors":["Changan Chen","Kumar Ashutosh","Rohit Girdhar","David Harwath","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2404.05206v1.pdf","comment":"Accepted at CVPR 2024. Project page:\n https://vision.cs.utexas.edu/projects/soundingactions"},{"id":"http://arxiv.org/abs/2404.05205v1","updated":"2024-04-08T05:18:39Z","published":"2024-04-08T05:18:39Z","title":"A secure and private ensemble matcher using multi-vault obfuscated\n templates","summary":" Given the irrevocability of biometric samples and mounting privacy concerns,\nbiometric template security and secure matching are among the essential\nfeatures of any well-designed modern biometric system. In this paper, we\npropose an obfuscation method that hides the biometric template information\nwith just enough chaff. The main idea is to reduce the number of chaff points\nto a practical level by creating n sub-templates from the original template and\nhiding each sub-template with m chaff points. During verification, s closest\nvectors to the biometric query are retrieved from each vault and then combined\nto generate hash values that are compared with the stored hash value. We\ndemonstrate the effectiveness of synthetic facial images, generated by a\nGenerative Adversarial Network (GAN), as ``random chaff points'' within a\nsecure-vault authorization system. This approach safeguards user identities\nduring training and deployment. We tested our protocol using the AT&T, GT, and\nLFW face datasets, with the ROC areas under the curve being 0.99, 0.99, and\n0.90, respectively. 
These numbers were close to those of the unprotected\ntemplates, showing that our method does not adversely affect accuracy.\n","authors":["Babak Poorebrahim Gilkalaye","Shubhabrata Mukherjee","Reza Derakhshani"],"pdf_url":"https://arxiv.org/pdf/2404.05205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11825v2","updated":"2024-04-08T05:11:47Z","published":"2023-11-20T15:03:56Z","title":"Holistic Inverse Rendering of Complex Facade via Aerial 3D Scanning","summary":" In this work, we use multi-view aerial images to reconstruct the geometry,\nlighting, and material of facades using neural signed distance fields (SDFs).\nWithout the requirement of complex equipment, our method only takes simple RGB\nimages captured by a drone as inputs to enable physically based and\nphotorealistic novel-view rendering, relighting, and editing. However, a\nreal-world facade usually has complex appearances ranging from diffuse rocks\nwith subtle details to large-area glass windows with specular reflections,\nmaking it hard to attend to everything. As a result, previous methods can\npreserve the geometry details but fail to reconstruct smooth glass windows or\nverse vise. In order to address this challenge, we introduce three spatial- and\nsemantic-adaptive optimization strategies, including a semantic regularization\napproach based on zero-shot segmentation techniques to improve material\nconsistency, a frequency-aware geometry regularization to balance surface\nsmoothness and details in different surfaces, and a visibility probe-based\nscheme to enable efficient modeling of the local lighting in large-scale\noutdoor environments. In addition, we capture a real-world facade aerial 3D\nscanning image set and corresponding point clouds for training and\nbenchmarking. The experiment demonstrates the superior quality of our method on\nfacade holistic inverse rendering, novel view synthesis, and scene editing\ncompared to state-of-the-art baselines.\n","authors":["Zixuan Xie","Rengan Xie","Rong Li","Kai Huang","Pengju Qiao","Jingsen Zhu","Xu Yin","Qi Ye","Wei Hua","Yuchi Huo","Hujun Bao"],"pdf_url":"https://arxiv.org/pdf/2311.11825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01518v3","updated":"2024-04-08T05:09:19Z","published":"2024-04-01T22:53:47Z","title":"Temporally Consistent Unbalanced Optimal Transport for Unsupervised\n Action Segmentation","summary":" We propose a novel approach to the action segmentation task for long,\nuntrimmed videos, based on solving an optimal transport problem. By encoding a\ntemporal consistency prior into a Gromov-Wasserstein problem, we are able to\ndecode a temporally consistent segmentation from a noisy affinity/matching cost\nmatrix between video frames and action classes. Unlike previous approaches, our\nmethod does not require knowing the action order for a video to attain temporal\nconsistency. Furthermore, our resulting (fused) Gromov-Wasserstein problem can\nbe efficiently solved on GPUs using a few iterations of projected mirror\ndescent. We demonstrate the effectiveness of our method in an unsupervised\nlearning setting, where our method is used to generate pseudo-labels for\nself-training. 
We evaluate our segmentation approach and unsupervised learning\npipeline on the Breakfast, 50-Salads, YouTube Instructions and Desktop Assembly\ndatasets, yielding state-of-the-art results for the unsupervised video action\nsegmentation task.\n","authors":["Ming Xu","Stephen Gould"],"pdf_url":"https://arxiv.org/pdf/2404.01518v3.pdf","comment":"Accepted to CVPR 2024 (Oral)"},{"id":"http://arxiv.org/abs/2404.05196v1","updated":"2024-04-08T04:53:29Z","published":"2024-04-08T04:53:29Z","title":"HSViT: Horizontally Scalable Vision Transformer","summary":" While the Vision Transformer (ViT) architecture gains prominence in computer\nvision and attracts significant attention from multimedia communities, its\ndeficiency in prior knowledge (inductive bias) regarding shift, scale, and\nrotational invariance necessitates pre-training on large-scale datasets.\nFurthermore, the growing layers and parameters in both ViT and convolutional\nneural networks (CNNs) impede their applicability to mobile multimedia\nservices, primarily owing to the constrained computational resources on edge\ndevices. To mitigate the aforementioned challenges, this paper introduces a\nnovel horizontally scalable vision transformer (HSViT). Specifically, a novel\nimage-level feature embedding allows ViT to better leverage the inductive bias\ninherent in the convolutional layers. Based on this, an innovative horizontally\nscalable architecture is designed, which reduces the number of layers and\nparameters of the models while facilitating collaborative training and\ninference of ViT models across multiple nodes. The experimental results depict\nthat, without pre-training on large-scale datasets, HSViT achieves up to 10%\nhigher top-1 accuracy than state-of-the-art schemes, ascertaining its superior\npreservation of inductive bias. The code is available at\nhttps://github.com/xuchenhao001/HSViT.\n","authors":["Chenhao Xu","Chang-Tsun Li","Chee Peng Lim","Douglas Creighton"],"pdf_url":"https://arxiv.org/pdf/2404.05196v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05187v1","updated":"2024-04-08T04:27:36Z","published":"2024-04-08T04:27:36Z","title":"LGSDF: Continual Global Learning of Signed Distance Fields Aided by\n Local Updating","summary":" Implicit reconstruction of ESDF (Euclidean Signed Distance Field) involves\ntraining a neural network to regress the signed distance from any point to the\nnearest obstacle, which has the advantages of lightweight storage and\ncontinuous querying. However, existing algorithms usually rely on conflicting\nraw observations as training data, resulting in poor map performance. In this\npaper, we propose LGSDF, an ESDF continual Global learning algorithm aided by\nLocal updating. At the front end, axis-aligned grids are dynamically updated by\npre-processed sensor observations, where incremental fusion alleviates\nestimation error caused by limited viewing directions. At the back end, a\nrandomly initialized implicit ESDF neural network performs continual\nself-supervised learning guided by these grids to generate smooth and\ncontinuous maps. The results on multiple scenes show that LGSDF can construct\nmore accurate ESDF maps and meshes compared with SOTA (State Of The Art)\nexplicit and implicit mapping algorithms. 
The source code of LGSDF is publicly\navailable at https://github.com/BIT-DYN/LGSDF.\n","authors":["Yufeng Yue","Yinan Deng","Jiahui Wang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2404.05187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05183v1","updated":"2024-04-08T04:17:27Z","published":"2024-04-08T04:17:27Z","title":"Progressive Alignment with VLM-LLM Feature to Augment Defect\n Classification for the ASE Dataset","summary":" Traditional defect classification approaches are facing with two barriers.\n(1) Insufficient training data and unstable data quality. Collecting sufficient\ndefective sample is expensive and time-costing, consequently leading to dataset\nvariance. It introduces the difficulty on recognition and learning. (2)\nOver-dependence on visual modality. When the image pattern and texture is\nmonotonic for all defect classes in a given dataset, the performance of\nconventional AOI system cannot be guaranteed. In scenarios where image quality\nis compromised due to mechanical failures or when defect information is\ninherently difficult to discern, the performance of deep models cannot be\nguaranteed. A main question is, \"how to solve those two problems when they\noccur at the same time?\" The feasible strategy is to explore another feature\nwithin dataset and combine an eminent vision-language model (VLM) and\nLarge-Language model (LLM) with their astonishing zero-shot capability. In this\nwork, we propose the special ASE dataset, including rich data description\nrecorded on image, for defect classification, but the defect feature is uneasy\nto learn directly. Secondly, We present the prompting for VLM-LLM against\ndefect classification with the proposed ASE dataset to activate extra-modality\nfeature from images to enhance performance. Then, We design the novel\nprogressive feature alignment (PFA) block to refine image-text feature to\nalleviate the difficulty of alignment under few-shot scenario. Finally, the\nproposed Cross-modality attention fusion (CMAF) module can effectively fuse\ndifferent modality feature. Experiment results have demonstrated our method's\neffectiveness over several defect classification methods for the ASE dataset.\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Chun-Hung Sun","Kuang-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2404.05183v1.pdf","comment":"MULA 2024"},{"id":"http://arxiv.org/abs/2404.05181v1","updated":"2024-04-08T04:13:35Z","published":"2024-04-08T04:13:35Z","title":"Adaptive Learning for Multi-view Stereo Reconstruction","summary":" Deep learning has recently demonstrated its excellent performance on the task\nof multi-view stereo (MVS). However, loss functions applied for deep MVS are\nrarely studied. In this paper, we first analyze existing loss functions'\nproperties for deep depth based MVS approaches. Regression based loss leads to\ninaccurate continuous results by computing mathematical expectation, while\nclassification based loss outputs discretized depth values. To this end, we\nthen propose a novel loss function, named adaptive Wasserstein loss, which is\nable to narrow down the difference between the true and predicted probability\ndistributions of depth. Besides, a simple but effective offset module is\nintroduced to better achieve sub-pixel prediction accuracy. 
Extensive\nexperiments on different benchmarks, including DTU, Tanks and Temples and\nBlendedMVS, show that the proposed method with the adaptive Wasserstein loss\nand the offset module achieves state-of-the-art performance.\n","authors":["Qinglu Min","Jie Zhao","Zhihao Zhang","Chen Min"],"pdf_url":"https://arxiv.org/pdf/2404.05181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05180v1","updated":"2024-04-08T04:10:50Z","published":"2024-04-08T04:10:50Z","title":"GloSoFarID: Global multispectral dataset for Solar Farm IDentification\n in satellite imagery","summary":" Solar Photovoltaic (PV) technology is increasingly recognized as a pivotal\nsolution in the global pursuit of clean and renewable energy. This technology\naddresses the urgent need for sustainable energy alternatives by converting\nsolar power into electricity without greenhouse gas emissions. It not only\ncurtails global carbon emissions but also reduces reliance on finite,\nnon-renewable energy sources. In this context, monitoring solar panel farms\nbecomes essential for understanding and facilitating the worldwide shift toward\nclean energy. This study contributes to this effort by developing the first\ncomprehensive global dataset of multispectral satellite imagery of solar panel\nfarms. This dataset is intended to form the basis for training robust machine\nlearning models, which can accurately map and analyze the expansion and\ndistribution of solar panel farms globally. The insights gained from this\nendeavor will be instrumental in guiding informed decision-making for a\nsustainable energy future. https://github.com/yzyly1992/GloSoFarID\n","authors":["Zhiyuan Yang","Ryan Rad"],"pdf_url":"https://arxiv.org/pdf/2404.05180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05169v1","updated":"2024-04-08T03:33:01Z","published":"2024-04-08T03:33:01Z","title":"QMix: Quality-aware Learning with Mixed Noise for Robust Retinal Disease\n Diagnosis","summary":" Due to the complexity of medical image acquisition and the difficulty of\nannotation, medical image datasets inevitably contain noise. Noisy data with\nwrong labels affects the robustness and generalization ability of deep neural\nnetworks. Previous noise learning methods mainly considered noise arising from\nimages being mislabeled, i.e. label noise, assuming that all mislabeled images\nare of high image quality. However, medical images are prone to suffering\nextreme quality issues, i.e. data noise, where discriminative visual features\nare missing for disease diagnosis. In this paper, we propose a noise learning\nframework, termed as QMix, that learns a robust disease diagnosis model under\nmixed noise. QMix alternates between sample separation and quality-aware\nsemisupervised training in each training epoch. In the sample separation phase,\nwe design a joint uncertainty-loss criterion to effectively separate (1)\ncorrectly labeled images; (2) mislabeled images with high quality and (3)\nmislabeled images with low quality. In the semi-supervised training phase, we\ntrain a disease diagnosis model to learn robust feature representation from the\nseparated samples. Specifically, we devise a sample-reweighing loss to mitigate\nthe effect of mislabeled images with low quality during training. Meanwhile, a\ncontrastive enhancement loss is proposed to further distinguish mislabeled\nimages with low quality from correctly labeled images. 
QMix achieved\nstate-of-the-art disease diagnosis performance on five public retinal image\ndatasets and exhibited substantial improvement on robustness against mixed\nnoise.\n","authors":["Junlin Hou","Jilan Xu","Rui Feng","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05163v1","updated":"2024-04-08T03:06:19Z","published":"2024-04-08T03:06:19Z","title":"Semantic Flow: Learning Semantic Field of Dynamic Scenes from Monocular\n Videos","summary":" In this work, we pioneer Semantic Flow, a neural semantic representation of\ndynamic scenes from monocular videos. In contrast to previous NeRF methods that\nreconstruct dynamic scenes from the colors and volume densities of individual\npoints, Semantic Flow learns semantics from continuous flows that contain rich\n3D motion information. As there is 2D-to-3D ambiguity problem in the viewing\ndirection when extracting 3D flow features from 2D video frames, we consider\nthe volume densities as opacity priors that describe the contributions of flow\nfeatures to the semantics on the frames. More specifically, we first learn a\nflow network to predict flows in the dynamic scene, and propose a flow feature\naggregation module to extract flow features from video frames. Then, we propose\na flow attention module to extract motion information from flow features, which\nis followed by a semantic network to output semantic logits of flows. We\nintegrate the logits with volume densities in the viewing direction to\nsupervise the flow features with semantic labels on video frames. Experimental\nresults show that our model is able to learn from multiple dynamic scenes and\nsupports a series of new tasks such as instance-level scene editing, semantic\ncompletions, dynamic scene tracking and semantic adaption on novel scenes.\nCodes are available at https://github.com/tianfr/Semantic-Flow/.\n","authors":["Fengrui Tian","Yueqi Duan","Angtian Wang","Jianfei Guo","Shaoyi Du"],"pdf_url":"https://arxiv.org/pdf/2404.05163v1.pdf","comment":"Accepted by ICLR 2024, Codes are available at\n https://github.com/tianfr/Semantic-Flow/"},{"id":"http://arxiv.org/abs/2311.08393v3","updated":"2024-04-08T02:57:55Z","published":"2023-11-14T18:53:28Z","title":"MVSA-Net: Multi-View State-Action Recognition for Robust and Deployable\n Trajectory Generation","summary":" The learn-from-observation (LfO) paradigm is a human-inspired mode for a\nrobot to learn to perform a task simply by watching it being performed. LfO can\nfacilitate robot integration on factory floors by minimizing disruption and\nreducing tedious programming. A key component of the LfO pipeline is a\ntransformation of the depth camera frames to the corresponding task state and\naction pairs, which are then relayed to learning techniques such as imitation\nor inverse reinforcement learning for understanding the task parameters. While\nseveral existing computer vision models analyze videos for activity\nrecognition, SA-Net specifically targets robotic LfO from RGB-D data. However,\nSA-Net and many other models analyze frame data captured from a single\nviewpoint. Their analysis is therefore highly sensitive to occlusions of the\nobserved task, which are frequent in deployments. An obvious way of reducing\nocclusions is to simultaneously observe the task from multiple viewpoints and\nsynchronously fuse the multiple streams in the model. 
Toward this, we present\nmulti-view SA-Net, which generalizes the SA-Net model to allow the perception\nof multiple viewpoints of the task activity, integrate them, and better\nrecognize the state and action in each frame. Performance evaluations on two\ndistinct domains establish that MVSA-Net recognizes the state-action pairs\nunder occlusion more accurately compared to single-view MVSA-Net and other\nbaselines. Our ablation studies further evaluate its performance under\ndifferent ambient conditions and establish the contribution of the architecture\ncomponents. As such, MVSA-Net offers a significantly more robust and deployable\nstate-action trajectory generation compared to previous methods.\n","authors":["Ehsan Asali","Prashant Doshi","Jin Sun"],"pdf_url":"https://arxiv.org/pdf/2311.08393v3.pdf","comment":"Presented at Deployable AI Workshop at AAAI-2024 and 'Towards\n Reliable and Deployable Learning-Based Robotic Systems' Workshop at CoRL2023"},{"id":"http://arxiv.org/abs/2403.05805v2","updated":"2024-04-08T02:47:54Z","published":"2024-03-09T05:50:32Z","title":"And Then the Hammer Broke: Reflections on Machine Ethics from Feminist\n Philosophy of Science","summary":" Vision is an important metaphor in ethical and political questions of\nknowledge. The feminist philosopher Donna Haraway points out the ``perverse''\nnature of an intrusive, alienating, all-seeing vision (to which we might cry\nout ``stop looking at me!''), but also encourages us to embrace the embodied\nnature of sight and its promises for genuinely situated knowledge. Current\ntechnologies of machine vision -- surveillance cameras, drones (for war or\nrecreation), iPhone cameras -- are usually construed as instances of the former\nrather than the latter, and for good reasons. However, although in no way\nattempting to diminish the real suffering these technologies have brought about\nin the world, I make the case for understanding technologies of computer vision\nas material instances of embodied seeing and situated knowing. Furthermore,\nborrowing from Iris Murdoch's concept of moral vision, I suggest that these\ntechnologies direct our labor towards self-reflection in ethically significant\nways. My approach draws upon paradigms in computer vision research,\nphenomenology, and feminist epistemology. Ultimately, this essay is an argument\nfor directing more philosophical attention from merely criticizing technologies\nof vision as ethically deficient towards embracing them as complex,\nmethodologically and epistemologically important objects.\n","authors":["Andre Ye"],"pdf_url":"https://arxiv.org/pdf/2403.05805v2.pdf","comment":"Pacific University Philosophy Conference"},{"id":"http://arxiv.org/abs/2403.03954v3","updated":"2024-04-08T02:46:38Z","published":"2024-03-06T18:58:49Z","title":"3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple\n 3D Representations","summary":" Imitation learning provides an efficient way to teach robots dexterous\nskills; however, learning complex skills robustly and generalizably usually\nconsumes large amounts of human demonstrations. To tackle this challenging\nproblem, we present 3D Diffusion Policy (DP3), a novel visual imitation\nlearning approach that incorporates the power of 3D visual representations into\ndiffusion policies, a class of conditional action generative models. The core\ndesign of DP3 is the utilization of a compact 3D visual representation,\nextracted from sparse point clouds with an efficient point encoder.
In our\nexperiments involving 72 simulation tasks, DP3 successfully handles most tasks\nwith just 10 demonstrations and surpasses baselines with a 24.2% relative\nimprovement. In 4 real robot tasks, DP3 demonstrates precise control with a\nhigh success rate of 85%, given only 40 demonstrations of each task, and shows\nexcellent generalization abilities in diverse aspects, including space,\nviewpoint, appearance, and instance. Interestingly, in real robot experiments,\nDP3 rarely violates safety requirements, in contrast to baseline methods which\nfrequently do, necessitating human intervention. Our extensive evaluation\nhighlights the critical importance of 3D representations in real-world robot\nlearning. Videos, code, and data are available on\nhttps://3d-diffusion-policy.github.io .\n","authors":["Yanjie Ze","Gu Zhang","Kangning Zhang","Chenyuan Hu","Muhan Wang","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2403.03954v3.pdf","comment":"Videos, code, and data: https://3d-diffusion-policy.github.io"},{"id":"http://arxiv.org/abs/2404.00989v2","updated":"2024-04-08T02:37:25Z","published":"2024-04-01T08:34:42Z","title":"360+x: A Panoptic Multi-modal Scene Understanding Dataset","summary":" Human perception of the world is shaped by a multitude of viewpoints and\nmodalities. While many existing datasets focus on scene understanding from a\ncertain perspective (e.g. egocentric or third-person views), our dataset offers\na panoptic perspective (i.e. multiple viewpoints with multiple data\nmodalities). Specifically, we encapsulate third-person panoramic and front\nviews, as well as egocentric monocular/binocular views with rich modalities\nincluding video, multi-channel audio, directional binaural delay, location data\nand textual scene descriptions within each scene captured, presenting\ncomprehensive observation of the world. Figure 1 offers a glimpse of all 28\nscene categories of our 360+x dataset. To the best of our knowledge, this is\nthe first database that covers multiple viewpoints with multiple data\nmodalities to mimic how daily information is accessed in the real world.\nThrough our benchmark analysis, we presented 5 different scene understanding\ntasks on the proposed 360+x dataset to evaluate the impact and benefit of each\ndata modality and perspective in panoptic scene understanding. We hope this\nunique dataset could broaden the scope of comprehensive scene understanding and\nencourage the community to approach these problems from more diverse\nperspectives.\n","authors":["Hao Chen","Yuqi Hou","Chenyuan Qu","Irene Testini","Xiaohan Hong","Jianbo Jiao"],"pdf_url":"https://arxiv.org/pdf/2404.00989v2.pdf","comment":"CVPR 2024 (Oral Presentation), Project page:\n https://x360dataset.github.io/"},{"id":"http://arxiv.org/abs/2402.07819v2","updated":"2024-04-08T02:36:23Z","published":"2024-02-12T17:24:35Z","title":"A Benchmark Grocery Dataset of Realworld Point Clouds From Single View","summary":" Fine-grained grocery object recognition is an important computer vision\nproblem with broad applications in automatic checkout, in-store robotic\nnavigation, and assistive technologies for the visually impaired. Existing\ndatasets on groceries are mainly 2D images. Models trained on these datasets\nare limited to learning features from the regular 2D grids. 
While portable 3D\nsensors such as Kinect were commonly available for mobile phones, sensors such\nas LiDAR and TrueDepth, have recently been integrated into mobile phones.\nDespite the availability of mobile 3D sensors, there are currently no dedicated\nreal-world large-scale benchmark 3D datasets for grocery. In addition, existing\n3D datasets lack fine-grained grocery categories and have limited training\nsamples. Furthermore, collecting data by going around the object versus the\ntraditional photo capture makes data collection cumbersome. Thus, we introduce\na large-scale grocery dataset called 3DGrocery100. It constitutes 100 classes,\nwith a total of 87,898 3D point clouds created from 10,755 RGB-D single-view\nimages. We benchmark our dataset on six recent state-of-the-art 3D point cloud\nclassification models. Additionally, we also benchmark the dataset on few-shot\nand continual learning point cloud classification tasks. Project Page:\nhttps://bigdatavision.org/3DGrocery100/.\n","authors":["Shivanand Venkanna Sheshappanavar","Tejas Anvekar","Shivanand Kundargi","Yufan Wang","Chandra Kambhamettu"],"pdf_url":"https://arxiv.org/pdf/2402.07819v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02241v2","updated":"2024-04-08T02:06:37Z","published":"2024-04-02T18:59:39Z","title":"Linear Combination of Saved Checkpoints Makes Consistency and Diffusion\n Models Better","summary":" Diffusion Models (DM) and Consistency Models (CM) are two types of popular\ngenerative models with good generation quality on various tasks. When training\nDM and CM, intermediate weight checkpoints are not fully utilized and only the\nlast converged checkpoint is used. In this work, we find that high-quality\nmodel weights often lie in a basin which cannot be reached by SGD but can be\nobtained by proper checkpoint averaging. Based on these observations, we\npropose LCSC, a simple but effective and efficient method to enhance the\nperformance of DM and CM, by combining checkpoints along the training\ntrajectory with coefficients deduced from evolutionary search. We demonstrate\nthe value of LCSC through two use cases: $\\textbf{(a) Reducing training cost.}$\nWith LCSC, we only need to train DM/CM with fewer number of iterations and/or\nlower batch sizes to obtain comparable sample quality with the fully trained\nmodel. For example, LCSC achieves considerable training speedups for CM\n(23$\\times$ on CIFAR-10 and 15$\\times$ on ImageNet-64). $\\textbf{(b) Enhancing\npre-trained models.}$ Assuming full training is already done, LCSC can further\nimprove the generation quality or speed of the final converged models. For\nexample, LCSC achieves better performance using 1 number of function evaluation\n(NFE) than the base model with 2 NFE on consistency distillation, and decreases\nthe NFE of DM from 15 to 9 while maintaining the generation quality on\nCIFAR-10. Our code is available at\nhttps://github.com/imagination-research/LCSC.\n","authors":["Enshu Liu","Junyi Zhu","Zinan Lin","Xuefei Ning","Matthew B. Blaschko","Sergey Yekhanin","Shengen Yan","Guohao Dai","Huazhong Yang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02241v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05145v1","updated":"2024-04-08T02:02:15Z","published":"2024-04-08T02:02:15Z","title":"UniMix: Towards Domain Adaptive and Generalizable LiDAR Semantic\n Segmentation in Adverse Weather","summary":" LiDAR semantic segmentation (LSS) is a critical task in autonomous driving\nand has achieved promising progress. 
However, prior LSS methods are\nconventionally investigated and evaluated on datasets within the same domain in\nclear weather. The robustness of LSS models in unseen scenes and all weather\nconditions is crucial for ensuring safety and reliability in real applications.\nTo this end, we propose UniMix, a universal method that enhances the\nadaptability and generalizability of LSS models. UniMix first leverages\nphysically valid adverse weather simulation to construct a Bridge Domain, which\nserves to bridge the domain gap between the clear weather scenes and the\nadverse weather scenes. Then, a Universal Mixing operator is defined regarding\nspatial, intensity, and semantic distributions to create the intermediate\ndomain with mixed samples from given domains. Integrating the proposed two\ntechniques into a teacher-student framework, UniMix efficiently mitigates the\ndomain gap and enables LSS models to learn weather-robust and domain-invariant\nrepresentations. We devote UniMix to two main setups: 1) unsupervised domain\nadaption, adapting the model from the clear weather source domain to the\nadverse weather target domain; 2) domain generalization, learning a model that\ngeneralizes well to unseen scenes in adverse weather. Extensive experiments\nvalidate the effectiveness of UniMix across different tasks and datasets, all\nachieving superior performance over state-of-the-art methods. The code will be\nreleased.\n","authors":["Haimei Zhao","Jing Zhang","Zhuo Chen","Shanshan Zhao","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2404.05145v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05144v1","updated":"2024-04-08T01:55:28Z","published":"2024-04-08T01:55:28Z","title":"Enhancing Clinical Efficiency through LLM: Discharge Note Generation for\n Cardiac Patients","summary":" Medical documentation, including discharge notes, is crucial for ensuring\npatient care quality, continuity, and effective medical communication. However,\nthe manual creation of these documents is not only time-consuming but also\nprone to inconsistencies and potential errors. The automation of this\ndocumentation process using artificial intelligence (AI) represents a promising\narea of innovation in healthcare. This study directly addresses the\ninefficiencies and inaccuracies in creating discharge notes manually,\nparticularly for cardiac patients, by employing AI techniques, specifically\nlarge language model (LLM). Utilizing a substantial dataset from a cardiology\ncenter, encompassing wide-ranging medical records and physician assessments,\nour research evaluates the capability of LLM to enhance the documentation\nprocess. Among the various models assessed, Mistral-7B distinguished itself by\naccurately generating discharge notes that significantly improve both\ndocumentation efficiency and the continuity of care for patients. These notes\nunderwent rigorous qualitative evaluation by medical expert, receiving high\nmarks for their clinical relevance, completeness, readability, and contribution\nto informed decision-making and care planning. Coupled with quantitative\nanalyses, these results confirm Mistral-7B's efficacy in distilling complex\nmedical information into concise, coherent summaries. Overall, our findings\nilluminate the considerable promise of specialized LLM, such as Mistral-7B, in\nrefining healthcare documentation workflows and advancing patient care. 
This\nstudy lays the groundwork for further integrating advanced AI technologies in\nhealthcare, demonstrating their potential to revolutionize patient\ndocumentation and support better care outcomes.\n","authors":["HyoJe Jung","Yunha Kim","Heejung Choi","Hyeram Seo","Minkyoung Kim","JiYe Han","Gaeun Kee","Seohyun Park","Soyoung Ko","Byeolhee Kim","Suyeon Kim","Tae Joon Jun","Young-Hak Kim"],"pdf_url":"https://arxiv.org/pdf/2404.05144v1.pdf","comment":"10 pages, 1 figure, 3 tables, conference"},{"id":"http://arxiv.org/abs/2404.05139v1","updated":"2024-04-08T01:38:43Z","published":"2024-04-08T01:38:43Z","title":"Better Monocular 3D Detectors with LiDAR from the Past","summary":" Accurate 3D object detection is crucial to autonomous driving. Though\nLiDAR-based detectors have achieved impressive performance, the high cost of\nLiDAR sensors precludes their widespread adoption in affordable vehicles.\nCamera-based detectors are cheaper alternatives but often suffer inferior\nperformance compared to their LiDAR-based counterparts due to inherent depth\nambiguities in images. In this work, we seek to improve monocular 3D detectors\nby leveraging unlabeled historical LiDAR data. Specifically, at inference time,\nwe assume that the camera-based detectors have access to multiple unlabeled\nLiDAR scans from past traversals at locations of interest (potentially from\nother high-end vehicles equipped with LiDAR sensors). Under this setup, we\npropose a novel, simple, and end-to-end trainable framework, termed\nAsyncDepth, to effectively extract relevant features from asynchronous LiDAR\ntraversals of the same location for monocular 3D detectors. We show consistent\nand significant performance gain (up to 9 AP) across multiple state-of-the-art\nmodels and datasets with a negligible additional latency of 9.66 ms and a small\nstorage cost.\n","authors":["Yurong You","Cheng Perng Phoo","Carlos Andres Diaz-Ruiz","Katie Z Luo","Wei-Lun Chao","Mark Campbell","Bharath Hariharan","Kilian Q Weinberger"],"pdf_url":"https://arxiv.org/pdf/2404.05139v1.pdf","comment":"Accepted by ICRA 2022. The code can be found at\n https://github.com/YurongYou/AsyncDepth"},{"id":"http://arxiv.org/abs/2404.05136v1","updated":"2024-04-08T01:29:10Z","published":"2024-04-08T01:29:10Z","title":"Self-Supervised Multi-Object Tracking with Path Consistency","summary":" In this paper, we propose a novel concept of path consistency to learn robust\nobject matching without using manual object identity supervision. Our key idea\nis that, to track an object through frames, we can obtain multiple different\nassociation results from a model by varying the frames it can observe, i.e.,\nskipping frames in observation. As the differences in observations do not alter\nthe identities of objects, the obtained association results should be\nconsistent. Based on this rationale, we generate multiple observation paths,\neach specifying a different set of frames to be skipped, and formulate the Path\nConsistency Loss that enforces that the association results are consistent across\ndifferent observation paths. We use the proposed loss to train our object\nmatching model with only self-supervision.
By extensive experiments on three\ntracking datasets (MOT17, PersonPath22, KITTI), we demonstrate that our method\noutperforms existing unsupervised methods with consistent margins on various\nevaluation metrics, and even achieves performance close to supervised methods.\n","authors":["Zijia Lu","Bing Shuai","Yanbei Chen","Zhenlin Xu","Davide Modolo"],"pdf_url":"https://arxiv.org/pdf/2404.05136v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05129v1","updated":"2024-04-08T01:14:09Z","published":"2024-04-08T01:14:09Z","title":"Image-based Agarwood Resinous Area Segmentation using Deep Learning","summary":" The manual extraction method of Agarwood resinous compound is laborious work,\nrequires skilled workers, and is subject to human errors. Commercial Agarwood\nindustries have been actively exploring using Computer Numerical Control (CNC)\nmachines to replace human effort for this particular task. The CNC machine\naccepts a G-code script produced from a binary image in which the wood region\nthat needs to be chiselled off is marked with (0, 0, 0) as its RGB value.\nRather than requiring a human expert to perform the region marking, we propose\nusing a Deep learning image segmentation method instead. Our setup involves a\ncamera that captures the cross-section image and then passes the image file to\na computer. The computer performs the automated image segmentation and feeds\nthe CNC machine with a G-code script. In this article, we report the initial\nsegmentation results achieved using a state-of-the-art Deep learning\nsegmentation method and discuss potential improvements to refine the\nsegmentation accuracy.\n","authors":["Irwandi Hipiny","Johari Abdullah","Noor Alamshah Bolhassan"],"pdf_url":"https://arxiv.org/pdf/2404.05129v1.pdf","comment":"15 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2207.01200v4","updated":"2024-04-08T01:11:22Z","published":"2022-07-04T05:03:10Z","title":"S$^{5}$Mars: Semi-Supervised Learning for Mars Semantic Segmentation","summary":" Deep learning has become a powerful tool for Mars exploration. Mars terrain\nsemantic segmentation is an important Martian vision task, which is the base of\nrover autonomous planning and safe driving. However, there is a lack of\nsufficient detailed and high-confidence data annotations, which are exactly\nrequired by most deep learning methods to obtain a good model. To address this\nproblem, we propose our solution from the perspective of joint data and method\ndesign. We first present a new dataset S5Mars for Semi-SuperviSed learning on\nMars Semantic Segmentation, which contains 6K high-resolution images and is\nsparsely annotated based on confidence, ensuring the high quality of labels.\nThen to learn from this sparse data, we propose a semi-supervised learning\n(SSL) framework for Mars image semantic segmentation, to learn representations\nfrom limited labeled data. Different from the existing SSL methods which are\nmostly targeted at the Earth image data, our method takes into account Mars\ndata characteristics. Specifically, we first investigate the impact of current\nwidely used natural image augmentations on Mars images. Based on the analysis,\nwe then propose two novel and effective augmentations for SSL of Mars\nsegmentation, AugIN and SAM-Mix, which serve as strong augmentations to boost\nthe model performance.
Meanwhile, to fully leverage the unlabeled data, we\nintroduce a soft-to-hard consistency learning strategy, learning from different\ntargets based on prediction confidence. Experimental results show that our\nmethod can outperform state-of-the-art SSL approaches remarkably. Our proposed\ndataset is available at https://jhang2020.github.io/S5Mars.github.io/.\n","authors":["Jiahang Zhang","Lilang Lin","Zejia Fan","Wenjing Wang","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2207.01200v4.pdf","comment":"IEEE TGRS 2024"},{"id":"http://arxiv.org/abs/2404.05128v1","updated":"2024-04-08T01:08:41Z","published":"2024-04-08T01:08:41Z","title":"Improving Deep Learning Predictions with Simulated Images, and Vice\n Versa","summary":" Artificial neural networks are often used to identify features of crop\nplants. However, training their models requires many annotated images, which\ncan be expensive and time-consuming to acquire. Procedural models of plants,\nsuch as those developed with Lindenmayer-systems (L-systems) can be created to\nproduce visually realistic simulations, and hence images of plant simulations,\nwhere annotations are implicitly known. These synthetic images can either\naugment or completely replace real images in training neural networks for\nphenotyping tasks. In this paper, we systematically vary amounts of real and\nsynthetic images used for training in both maize and canola to better\nunderstand situations where synthetic images generated from L-systems can help\nprediction on real images. This work also explores the degree to which realism\nin the synthetic images improves prediction. Furthermore, we see how neural\nnetwork predictions can be used to help calibrate L-systems themselves,\ncreating a feedback loop.\n","authors":["Nazifa Azam Khan","Mikolaj Cieslak","Ian McQuillan"],"pdf_url":"https://arxiv.org/pdf/2404.05128v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03202v2","updated":"2024-04-08T01:05:57Z","published":"2024-04-04T05:10:26Z","title":"OmniGS: Omnidirectional Gaussian Splatting for Fast Radiance Field\n Reconstruction using Omnidirectional Images","summary":" Photorealistic reconstruction relying on 3D Gaussian Splatting has shown\npromising potential in robotics. However, the current 3D Gaussian Splatting\nsystem only supports radiance field reconstruction using undistorted\nperspective images. In this paper, we present OmniGS, a novel omnidirectional\nGaussian splatting system, to take advantage of omnidirectional images for fast\nradiance field reconstruction. Specifically, we conduct a theoretical analysis\nof spherical camera model derivatives in 3D Gaussian Splatting. According to\nthe derivatives, we then implement a new GPU-accelerated omnidirectional\nrasterizer that directly splats 3D Gaussians onto the equirectangular screen\nspace for omnidirectional image rendering. As a result, we realize\ndifferentiable optimization of the radiance field without the requirement of\ncube-map rectification or tangent-plane approximation. Extensive experiments\nconducted in egocentric and roaming scenarios demonstrate that our method\nachieves state-of-the-art reconstruction quality and high rendering speed using\nomnidirectional images. 
To benefit the research community, the code will be\nmade publicly available once the paper is published.\n","authors":["Longwei Li","Huajian Huang","Sai-Kit Yeung","Hui Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.03202v2.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.05111v1","updated":"2024-04-08T00:13:05Z","published":"2024-04-08T00:13:05Z","title":"Class Similarity Transition: Decoupling Class Similarities and Imbalance\n from Generalized Few-shot Segmentation","summary":" In Generalized Few-shot Segmentation (GFSS), a model is trained with a large\ncorpus of base class samples and then adapted on limited samples of novel\nclasses. This paper focuses on the relevance between base and novel classes,\nand improves GFSS in two aspects: 1) mining the similarity between base and\nnovel classes to promote the learning of novel classes, and 2) mitigating the\nclass imbalance issue caused by the volume difference between the support set\nand the training set. Specifically, we first propose a similarity transition\nmatrix to guide the learning of novel classes with base class knowledge. Then,\nwe leverage the Label-Distribution-Aware Margin (LDAM) loss and Transductive\nInference to the GFSS task to address the problem of class imbalance as well as\noverfitting the support set. In addition, by extending the probability\ntransition matrix, the proposed method can mitigate the catastrophic forgetting\nof base classes when learning novel classes. With a simple training phase, our\nproposed method can be applied to any segmentation network trained on base\nclasses. We validated our methods on the adapted version of OpenEarthMap.\nCompared to existing GFSS baselines, our method outperforms them all by 3% to 7%\nand ranks second in the OpenEarthMap Land Cover Mapping Few-Shot Challenge at\nthe completion of this paper. Code:\nhttps://github.com/earth-insights/ClassTrans\n","authors":["Shihong Wang","Ruixun Liu","Kaiyu Li","Jiawei Jiang","Xiangyong Cao"],"pdf_url":"https://arxiv.org/pdf/2404.05111v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.09250v2","updated":"2024-04-08T22:40:01Z","published":"2023-12-14T18:59:36Z","title":"Single Mesh Diffusion Models with Field Latents for Texture Generation","summary":" We introduce a framework for intrinsic latent diffusion models operating\ndirectly on the surfaces of 3D shapes, with the goal of synthesizing\nhigh-quality textures. Our approach is underpinned by two contributions: field\nlatents, a latent representation encoding textures as discrete vector fields on\nthe mesh vertices, and field latent diffusion models, which learn to denoise a\ndiffusion process in the learned latent space on the surface. We consider a\nsingle-textured-mesh paradigm, where our models are trained to generate\nvariations of a given texture on a mesh. We show the synthesized textures are\nof superior fidelity compared to those from existing single-textured-mesh\ngenerative models. Our models can also be adapted for user-controlled editing\ntasks such as inpainting and label-guided generation. The efficacy of our\napproach is due in part to the equivariance of our proposed framework under\nisometries, allowing our models to seamlessly reproduce details across locally\nsimilar regions and opening the door to a notion of generative texture\ntransfer.\n","authors":["Thomas W. Mitchel","Carlos Esteves","Ameesh Makadia"],"pdf_url":"https://arxiv.org/pdf/2312.09250v2.pdf","comment":"CVPR 2024.
Code and additional visualizations available:\n https://single-mesh-diffusion.github.io/"},{"id":"http://arxiv.org/abs/2311.12539v2","updated":"2024-04-08T22:19:23Z","published":"2023-11-21T11:33:15Z","title":"GMISeg: General Medical Image Segmentation without Re-Training","summary":" Although deep learning models have become the main method for medical image\nsegmentation, they often cannot be extended to unknown segmentation tasks\ninvolving new anatomical structures, image shapes, or labels. For new\nsegmentation tasks, researchers often have to retrain or fine-tune the model,\nwhich is time-consuming and poses a significant obstacle to clinical\nresearchers, who often lack the resources and professional knowledge to train\nneural networks. Therefore, we proposed a general method that can solve unknown\nmedical image segmentation tasks without requiring additional training. Given\nan example set of images and prompts for defining new segmentation tasks,\nGMISeg applies a novel low-rank fine-tuning strategy based on the proposed\napproach to the SAM (Segment Anything Model) image encoder, and works with the\nprompt encoder and mask decoder to fine-tune the labeled dataset without the\nneed for additional training. To achieve generalization of new tasks, we used\nmedical image datasets with different imaging modes for different parts. We\ntrained and generalized GMISeg on a different set of anatomical and imaging\nmodes using cardiac images on other site datasets. We have demonstrated that\nGMISeg outperforms the latest methods on unknown tasks and have conducted a\ncomprehensive analysis and summary of the important performance of the proposed\nmethod.\n","authors":["Jing Xu"],"pdf_url":"https://arxiv.org/pdf/2311.12539v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.11470v2","updated":"2024-04-08T22:01:32Z","published":"2022-10-20T17:59:54Z","title":"i-MAE: Are Latent Representations in Masked Autoencoders Linearly\n Separable?","summary":" Masked image modeling (MIM) has been recognized as a strong self-supervised\npre-training approach in the vision domain. However, the mechanism and\nproperties of the learned representations by such a scheme, as well as how to\nfurther enhance the representations are so far not well-explored. In this\npaper, we aim to explore an interactive Masked Autoencoders (i-MAE) framework\nto enhance the representation capability from two aspects: (1) employing a\ntwo-way image reconstruction and a latent feature reconstruction with\ndistillation loss to learn better features; (2) proposing a semantics-enhanced\nsampling strategy to boost the learned semantics in MAE. Upon the proposed\ni-MAE architecture, we can address two critical questions to explore the\nbehaviors of the learned representations in MAE: (1) Whether the separability\nof latent representations in Masked Autoencoders is helpful for model\nperformance? We study it by forcing the input as a mixture of two images\ninstead of one. (2) Whether we can enhance the representations in the latent\nfeature space by controlling the degree of semantics during sampling on Masked\nAutoencoders? To this end, we propose a sampling strategy within a mini-batch\nbased on the semantics of training samples to examine this aspect. Extensive\nexperiments are conducted on CIFAR-10/100, Tiny-ImageNet and ImageNet-1K to\nverify the observations we discovered. 
Furthermore, in addition to\nqualitatively analyzing the characteristics of the latent representations, we\nexamine the existence of linear separability and the degree of semantics in the\nlatent space by proposing two evaluation schemes. The surprising and consistent\nresults demonstrate that i-MAE is a superior framework design for understanding\nMAE frameworks, as well as achieving better representational ability. Code is\navailable at https://github.com/vision-learning-acceleration-lab/i-mae.\n","authors":["Kevin Zhang","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2210.11470v2.pdf","comment":"Project page: https://zhiqiangshen.com/projects/i-mae/"},{"id":"http://arxiv.org/abs/2404.03392v2","updated":"2024-04-08T21:26:47Z","published":"2024-04-04T11:49:56Z","title":"Two Tricks to Improve Unsupervised Segmentation Learning","summary":" We present two practical improvement techniques for unsupervised segmentation\nlearning. These techniques address limitations in the resolution and accuracy\nof predicted segmentation maps of recent state-of-the-art methods. Firstly, we\nleverage image post-processing techniques such as guided filtering to refine\nthe output masks, improving accuracy while avoiding substantial computational\ncosts. Secondly, we introduce a multi-scale consistency criterion, based on a\nteacher-student training scheme. This criterion matches segmentation masks\npredicted from regions of the input image extracted at different resolutions to\neach other. Experimental results on several benchmarks used in unsupervised\nsegmentation learning demonstrate the effectiveness of our proposed techniques.\n","authors":["Alp Eren Sari","Francesco Locatello","Paolo Favaro"],"pdf_url":"https://arxiv.org/pdf/2404.03392v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05872v1","updated":"2024-04-08T21:09:59Z","published":"2024-04-08T21:09:59Z","title":"TabConv: Low-Computation CNN Inference via Table Lookups","summary":" Convolutional Neural Networks (CNNs) have demonstrated remarkable ability\nthroughout the field of computer vision. However, CNN inference requires a\nlarge number of arithmetic operations, making them expensive to deploy in\nhardware. Current approaches alleviate this issue by developing\nhardware-supported, algorithmic processes to simplify spatial convolution\nfunctions. However, these methods still heavily rely on matrix multiplication,\nleading to significant computational overhead. To bridge the gap between\nhardware, algorithmic acceleration, and approximate matrix multiplication, we\npropose TabConv, a novel, table-based approximation for convolution to\nsignificantly reduce arithmetic operations during inference. Additionally, we\nintroduce a priority masking technique based on cosine similarity to select\nlayers for table-based approximation, thereby maintaining the model\nperformance. We evaluate our approach on popular CNNs: ResNet-18, ResNet-34,\nand NetworkInNetwork (NIN). 
TabConv preserves over 93% of the original model's\nperformance while reducing arithmetic operations by 36.5%, 25.8%, and 99.4% for\nResNet-18 on CIFAR-10, CIFAR-100, and MNIST, respectively, 35.6% and 99.3% for\nResNet-34 on CIFAR-10 and MNIST, and 98.9% for NIN on MNIST, achieving\nlow-computation inference.\n","authors":["Neelesh Gupta","Narayanan Kannan","Pengmiao Zhang","Viktor Prasanna"],"pdf_url":"https://arxiv.org/pdf/2404.05872v1.pdf","comment":"8 pages, Accepted at CF '24"},{"id":"http://arxiv.org/abs/2404.05862v1","updated":"2024-04-08T20:51:30Z","published":"2024-04-08T20:51:30Z","title":"Towards Improved Semiconductor Defect Inspection for high-NA EUVL based\n on SEMI-SuperYOLO-NAS","summary":" Due to potential pitch reduction, the semiconductor industry is adopting\nHigh-NA EUVL technology. However, its low depth of focus presents challenges\nfor High Volume Manufacturing. To address this, suppliers are exploring thinner\nphotoresists and new underlayers/hardmasks. These may suffer from poor SNR,\ncomplicating defect detection. Vision-based ML algorithms offer a promising\nsolution for semiconductor defect inspection. However, developing a robust ML\nmodel across various image resolutions without explicit training remains a\nchallenge for nano-scale defect inspection. This research's goal is to propose\na scale-invariant ADCD framework capable to upscale images, addressing this\nissue. We propose an improvised ADCD framework as SEMI-SuperYOLO-NAS, which\nbuilds upon the baseline YOLO-NAS architecture. This framework integrates a SR\nassisted branch to aid in learning HR features by the defect detection\nbackbone, particularly for detecting nano-scale defect instances from LR\nimages. Additionally, the SR-assisted branch can recursively generate upscaled\nimages from their corresponding downscaled counterparts, enabling defect\ndetection inference across various image resolutions without requiring explicit\ntraining. Moreover, we investigate improved data augmentation strategy aimed at\ngenerating diverse and realistic training datasets to enhance model\nperformance. We have evaluated our proposed approach using two original FAB\ndatasets obtained from two distinct processes and captured using two different\nimaging tools. Finally, we demonstrate zero-shot inference for our model on a\nnew, originating from a process condition distinct from the training dataset\nand possessing different Pitch characteristics. Experimental validation\ndemonstrates that our proposed ADCD framework aids in increasing the throughput\nof imaging tools for defect inspection by reducing the required image pixel\nresolutions.\n","authors":["Ying-Lin Chen","Jacob Deforce","Vic De Ridder","Bappaditya Dey","Victor Blanco","Sandip Halder","Philippe Leray"],"pdf_url":"https://arxiv.org/pdf/2404.05862v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05849v1","updated":"2024-04-08T20:31:27Z","published":"2024-04-08T20:31:27Z","title":"Localizing Moments of Actions in Untrimmed Videos of Infants with Autism\n Spectrum Disorder","summary":" Autism Spectrum Disorder (ASD) presents significant challenges in early\ndiagnosis and intervention, impacting children and their families. With\nprevalence rates rising, there is a critical need for accessible and efficient\nscreening tools. 
Leveraging machine learning (ML) techniques, in particular\nTemporal Action Localization (TAL), holds promise for automating ASD screening.\nThis paper introduces a self-attention based TAL model designed to identify\nASD-related behaviors in infant videos. Unlike existing methods, our approach\nsimplifies complex modeling and emphasizes efficiency, which is essential for\npractical deployment in real-world scenarios. Importantly, this work\nunderscores the importance of developing computer vision methods capable of\noperating in naturalistic environments with little equipment control,\naddressing key challenges in ASD screening. This study is the first to conduct\nend-to-end temporal action localization in untrimmed videos of infants with\nASD, offering promising avenues for early intervention and support. We report\nbaseline results of behavior detection using our TAL model. We achieve 70%\naccuracy for look face, 79% accuracy for look object, 72% for smile and 65% for\nvocalization.\n","authors":["Halil Ismail Helvaci","Sen-ching Samson Cheung","Chen-Nee Chuah","Sally Ozonoff"],"pdf_url":"https://arxiv.org/pdf/2404.05849v1.pdf","comment":"7 pages, 2 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.05828v1","updated":"2024-04-08T19:46:20Z","published":"2024-04-08T19:46:20Z","title":"Privacy-Preserving Deep Learning Using Deformable Operators for Secure\n Task Learning","summary":" In the era of cloud computing and data-driven applications, it is crucial to\nprotect sensitive information to maintain data privacy, ensuring truly reliable\nsystems. As a result, preserving privacy in deep learning systems has become a\ncritical concern. Existing methods for privacy preservation rely on image\nencryption or perceptual transformation approaches. However, they often suffer\nfrom reduced task performance and high computational costs. To address these\nchallenges, we propose a novel Privacy-Preserving framework that uses a set of\ndeformable operators for secure task learning. Our method involves shuffling\npixels during the analog-to-digital conversion process to generate visually\nprotected data. Those are then fed into a well-known network enhanced with\ndeformable operators. Using our approach, users can achieve equivalent\nperformance to original images without additional training using a secret key.\nMoreover, our method enables access control against unauthorized users.\nExperimental results demonstrate the efficacy of our approach, showcasing its\npotential in cloud-based scenarios and privacy-sensitive applications.\n","authors":["Fabian Perez","Jhon Lopez","Henry Arguello"],"pdf_url":"https://arxiv.org/pdf/2404.05828v1.pdf","comment":"copyright 2024 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2403.04932v2","updated":"2024-04-08T19:45:32Z","published":"2024-03-07T22:39:02Z","title":"Divide and Conquer: High-Resolution Industrial Anomaly Detection via\n Memory Efficient Tiled Ensemble","summary":" Industrial anomaly detection is an important task within computer vision with\na wide range of practical use cases.
The small size of anomalous regions in\nmany real-world datasets necessitates processing the images at a high\nresolution. This frequently poses significant challenges concerning memory\nconsumption during the model training and inference stages, leaving some\nexisting methods impractical for widespread adoption. To overcome this\nchallenge, we present the tiled ensemble approach, which reduces memory\nconsumption by dividing the input images into a grid of tiles and training a\ndedicated model for each tile location. The tiled ensemble is compatible with\nany existing anomaly detection model without the need for any modification of\nthe underlying architecture. By introducing overlapping tiles, we utilize the\nbenefits of traditional stacking ensembles, leading to further improvements in\nanomaly detection capabilities beyond high resolution alone. We perform a\ncomprehensive analysis using diverse underlying architectures, including Padim,\nPatchCore, FastFlow, and Reverse Distillation, on two standard anomaly\ndetection datasets: MVTec and VisA. Our method demonstrates a notable\nimprovement across setups while remaining within GPU memory constraints,\nconsuming only as much GPU memory as a single model needs to process a single\ntile.\n","authors":["Blaž Rolih","Dick Ameln","Ashwin Vaidya","Samet Akcay"],"pdf_url":"https://arxiv.org/pdf/2403.04932v2.pdf","comment":"To appear at CVPR 24 Visual Anomaly Detection Workshop. Research\n conducted during Google Summer of Code 2023 at OpenVINO (Intel). GSoC 2023\n page: https://summerofcode.withgoogle.com/archive/2023/projects/WUSjdxGl"},{"id":"http://arxiv.org/abs/2401.00896v2","updated":"2024-04-08T18:40:31Z","published":"2023-12-31T10:51:52Z","title":"TrailBlazer: Trajectory Control for Diffusion-Based Video Generation","summary":" Within recent approaches to text-to-video (T2V) generation, achieving\ncontrollability in the synthesized video is often a challenge. Typically, this\nissue is addressed by providing low-level per-frame guidance in the form of\nedge maps, depth maps, or an existing video to be altered. However, the process\nof obtaining such guidance can be labor-intensive. This paper focuses on\nenhancing controllability in video synthesis by employing straightforward\nbounding boxes to guide the subject in various ways, all without the need for\nneural network training, finetuning, optimization at inference time, or the use\nof pre-existing videos. Our algorithm, TrailBlazer, is constructed upon a\npre-trained (T2V) model, and easy to implement. The subject is directed by a\nbounding box through the proposed spatial and temporal attention map editing.\nMoreover, we introduce the concept of keyframing, allowing the subject\ntrajectory and overall appearance to be guided by both a moving bounding box\nand corresponding prompts, without the need to provide a detailed mask. The\nmethod is efficient, with negligible additional computation relative to the\nunderlying pre-trained model. Despite the simplicity of the bounding box\nguidance, the resulting motion is surprisingly natural, with emergent effects\nincluding perspective and movement toward the virtual camera as the box size\nincreases.\n","authors":["Wan-Duo Kurt Ma","J. P. Lewis","W. 
Bastiaan Kleijn"],"pdf_url":"https://arxiv.org/pdf/2401.00896v2.pdf","comment":"14 pages, 18 figures, Project Page:\n https://hohonu-vicml.github.io/Trailblazer.Page/"},{"id":"http://arxiv.org/abs/2404.05814v1","updated":"2024-04-08T18:36:18Z","published":"2024-04-08T18:36:18Z","title":"Towards Explainable Automated Neuroanatomy","summary":" We present a novel method for quantifying the microscopic structure of brain\ntissue. It is based on the automated recognition of interpretable features\nobtained by analyzing the shapes of cells. This contrasts with prevailing\nmethods of brain anatomical analysis in two ways. First, contemporary methods\nuse gray-scale values derived from smoothed version of the anatomical images,\nwhich dissipated valuable information from the texture of the images. Second,\ncontemporary analysis uses the output of black-box Convolutional Neural\nNetworks, while our system makes decisions based on interpretable features\nobtained by analyzing the shapes of individual cells. An important benefit of\nthis open-box approach is that the anatomist can understand and correct the\ndecisions made by the computer. Our proposed system can accurately localize and\nidentify existing brain structures. This can be used to align and coregistar\nbrains and will facilitate connectomic studies for reverse engineering of brain\ncircuitry.\n","authors":["Kui Qian","Litao Qiao","Beth Friedman","Edward O'Donnell","David Kleinfeld","Yoav Freund"],"pdf_url":"https://arxiv.org/pdf/2404.05814v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05802v1","updated":"2024-04-08T18:05:24Z","published":"2024-04-08T18:05:24Z","title":"BatSort: Enhanced Battery Classification with Transfer Learning for\n Battery Sorting and Recycling","summary":" Battery recycling is a critical process for minimizing environmental harm and\nresource waste for used batteries. However, it is challenging, largely because\nsorting batteries is costly and hardly automated to group batteries based on\nbattery types. In this paper, we introduce a machine learning-based approach\nfor battery-type classification and address the daunting problem of data\nscarcity for the application. We propose BatSort which applies transfer\nlearning to utilize the existing knowledge optimized with large-scale datasets\nand customizes ResNet to be specialized for classifying battery types. We\ncollected our in-house battery-type dataset of small-scale to guide the\nknowledge transfer as a case study and evaluate the system performance. We\nconducted an experimental study and the results show that BatSort can achieve\noutstanding accuracy of 92.1% on average and up to 96.2% and the performance is\nstable for battery-type classification. Our solution helps realize fast and\nautomated battery sorting with minimized cost and can be transferred to related\nindustry applications with insufficient data.\n","authors":["Yunyi Zhao","Wei Zhang","Erhai Hu","Qingyu Yan","Cheng Xiang","King Jet Tseng","Dusit Niyato"],"pdf_url":"https://arxiv.org/pdf/2404.05802v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05783v1","updated":"2024-04-08T17:53:21Z","published":"2024-04-08T17:53:21Z","title":"Responsible Generative AI: What to Generate and What Not","summary":" In recent years, generative AI (GenAI), like large language models and\ntext-to-image models, has received significant attention across various\ndomains. However, ensuring the responsible generation of content by these\nmodels is crucial for their real-world applicability. 
This raises an\ninteresting question: \\textit{What should responsible GenAI generate, and what\nshould it not?} To answer the question, this paper investigates the practical\nresponsible requirements of both textual and visual generative models,\noutlining five key considerations: generating truthful content, avoiding toxic\ncontent, refusing harmful instruction, leaking no training data-related\ncontent, and ensuring generated content identifiable. Specifically, we review\nrecent advancements and challenges in addressing these requirements. Besides,\nwe discuss and emphasize the importance of responsible GenAI across healthcare,\neducation, finance, and artificial general intelligence domains. Through a\nunified perspective on both textual and visual generative models, this paper\naims to provide insights into practical safety-related issues and further\nbenefit the community in building responsible GenAI.\n","authors":["Jindong Gu"],"pdf_url":"https://arxiv.org/pdf/2404.05783v1.pdf","comment":"74 pages, 10 figures"},{"id":"http://arxiv.org/abs/2205.10793v2","updated":"2024-04-08T16:59:24Z","published":"2022-05-22T10:26:54Z","title":"Knowledge Distillation via the Target-aware Transformer","summary":" Knowledge distillation becomes a de facto standard to improve the performance\nof small neural networks. Most of the previous works propose to regress the\nrepresentational features from the teacher to the student in a one-to-one\nspatial matching fashion. However, people tend to overlook the fact that, due\nto the architecture differences, the semantic information on the same spatial\nlocation usually vary. This greatly undermines the underlying assumption of the\none-to-one distillation approach. To this end, we propose a novel one-to-all\nspatial matching knowledge distillation approach. Specifically, we allow each\npixel of the teacher feature to be distilled to all spatial locations of the\nstudent features given its similarity, which is generated from a target-aware\ntransformer. Our approach surpasses the state-of-the-art methods by a\nsignificant margin on various computer vision benchmarks, such as ImageNet,\nPascal VOC and COCOStuff10k. Code is available at\nhttps://github.com/sihaoevery/TaT.\n","authors":["Sihao Lin","Hongwei Xie","Bing Wang","Kaicheng Yu","Xiaojun Chang","Xiaodan Liang","Gang Wang"],"pdf_url":"https://arxiv.org/pdf/2205.10793v2.pdf","comment":"CVPR2022(Oral)"},{"id":"http://arxiv.org/abs/2303.17546v3","updated":"2024-04-08T16:49:16Z","published":"2023-03-30T17:13:56Z","title":"PAIR-Diffusion: A Comprehensive Multimodal Object-Level Image Editor","summary":" Generative image editing has recently witnessed extremely fast-paced growth.\nSome works use high-level conditioning such as text, while others use low-level\nconditioning. Nevertheless, most of them lack fine-grained control over the\nproperties of the different objects present in the image, i.e. object-level\nimage editing. In this work, we tackle the task by perceiving the images as an\namalgamation of various objects and aim to control the properties of each\nobject in a fine-grained manner. Out of these properties, we identify structure\nand appearance as the most intuitive to understand and useful for editing\npurposes. We propose PAIR Diffusion, a generic framework that can enable a\ndiffusion model to control the structure and appearance properties of each\nobject in the image. We show that having control over the properties of each\nobject in an image leads to comprehensive editing capabilities. 
Our framework\nallows for various object-level editing operations on real images such as\nreference image-based appearance editing, free-form shape editing, adding\nobjects, and variations. Thanks to our design, we do not require any inversion\nstep. Additionally, we propose multimodal classifier-free guidance which\nenables editing images using both reference images and text when using our\napproach with foundational diffusion models. We validate the above claims by\nextensively evaluating our framework on both unconditional and foundational\ndiffusion models. Please refer to\nhttps://vidit98.github.io/publication/conference-paper/pair_diff.html for code\nand model release.\n","authors":["Vidit Goel","Elia Peruzzo","Yifan Jiang","Dejia Xu","Xingqian Xu","Nicu Sebe","Trevor Darrell","Zhangyang Wang","Humphrey Shi"],"pdf_url":"https://arxiv.org/pdf/2303.17546v3.pdf","comment":"Accepted in CVPR 2024, Project page\n https://vidit98.github.io/publication/conference-paper/pair_diff.html"},{"id":"http://arxiv.org/abs/2312.03048v2","updated":"2024-04-08T08:59:24Z","published":"2023-12-05T18:34:12Z","title":"DGInStyle: Domain-Generalizable Semantic Segmentation with Image\n Diffusion Models and Stylized Semantic Control","summary":" Large, pretrained latent diffusion models (LDMs) have demonstrated an\nextraordinary ability to generate creative content, specialize to user data\nthrough few-shot fine-tuning, and condition their output on other modalities,\nsuch as semantic maps. However, are they usable as large-scale data generators,\ne.g., to improve tasks in the perception stack, like semantic segmentation? We\ninvestigate this question in the context of autonomous driving, and answer it\nwith a resounding \"yes\". We propose an efficient data generation pipeline\ntermed DGInStyle. First, we examine the problem of specializing a pretrained\nLDM to semantically-controlled generation within a narrow domain. Second, we\npropose a Style Swap technique to endow the rich generative prior with the\nlearned semantic control. Third, we design a Multi-resolution Latent Fusion\ntechnique to overcome the bias of LDMs towards dominant objects. Using\nDGInStyle, we generate a diverse dataset of street scenes, train a\ndomain-agnostic semantic segmentation model on it, and evaluate the model on\nmultiple popular autonomous driving datasets. Our approach consistently\nincreases the performance of several domain generalization methods compared to\nthe previous state-of-the-art methods. Source code and dataset are available at\nhttps://dginstyle.github.io.\n","authors":["Yuru Jia","Lukas Hoyer","Shengyu Huang","Tianfu Wang","Luc Van Gool","Konrad Schindler","Anton Obukhov"],"pdf_url":"https://arxiv.org/pdf/2312.03048v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05776v1","updated":"2024-04-08T06:47:03Z","published":"2024-04-08T06:47:03Z","title":"Forecasting Electric Vehicle Battery Output Voltage: A Predictive\n Modeling Approach","summary":" The battery management system plays a vital role in ensuring the safety and\ndependability of electric and hybrid vehicles. It is responsible for various\nfunctions, including state evaluation, monitoring, charge control, and cell\nbalancing, all integrated within the BMS. Nonetheless, due to the uncertainties\nsurrounding battery performance, implementing these functionalities poses\nsignificant challenges. 
In this study, we explore the latest approaches for\nassessing battery states, highlight notable advancements in battery management\nsystems (BMS), address existing issues with current BMS technology, and put\nforth possible solutions for predicting battery charging voltage.\n","authors":["Narayana Darapaneni","Ashish K","Ullas M S","Anwesh Reddy Paduri"],"pdf_url":"https://arxiv.org/pdf/2404.05776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04125v2","updated":"2024-04-08T21:14:43Z","published":"2024-04-04T17:58:02Z","title":"No \"Zero-Shot\" Without Exponential Data: Pretraining Concept Frequency\n Determines Multimodal Model Performance","summary":" Web-crawled pretraining datasets underlie the impressive \"zero-shot\"\nevaluation performance of multimodal models, such as CLIP for\nclassification/retrieval and Stable-Diffusion for image generation. However, it\nis unclear how meaningful the notion of \"zero-shot\" generalization is for such\nmultimodal models, as it is not known to what extent their pretraining datasets\nencompass the downstream concepts targeted for during \"zero-shot\" evaluation.\nIn this work, we ask: How is the performance of multimodal models on downstream\nconcepts influenced by the frequency of these concepts in their pretraining\ndatasets? We comprehensively investigate this question across 34 models and\nfive standard pretraining datasets (CC-3M, CC-12M, YFCC-15M, LAION-400M,\nLAION-Aesthetics), generating over 300GB of data artifacts. We consistently\nfind that, far from exhibiting \"zero-shot\" generalization, multimodal models\nrequire exponentially more data to achieve linear improvements in downstream\n\"zero-shot\" performance, following a sample inefficient log-linear scaling\ntrend. This trend persists even when controlling for sample-level similarity\nbetween pretraining and downstream datasets, and testing on purely synthetic\ndata distributions. Furthermore, upon benchmarking models on long-tailed data\nsampled based on our analysis, we demonstrate that multimodal models across the\nboard perform poorly. We contribute this long-tail test set as the \"Let it\nWag!\" benchmark to further research in this direction. Taken together, our\nstudy reveals an exponential need for training data which implies that the key\nto \"zero-shot\" generalization capabilities under large-scale training paradigms\nremains to be found.\n","authors":["Vishaal Udandarao","Ameya Prabhu","Adhiraj Ghosh","Yash Sharma","Philip H. S. Torr","Adel Bibi","Samuel Albanie","Matthias Bethge"],"pdf_url":"https://arxiv.org/pdf/2404.04125v2.pdf","comment":"Extended version of the short paper accepted at DPFM, ICLR'24"},{"id":"http://arxiv.org/abs/2404.07236v1","updated":"2024-04-08T08:50:09Z","published":"2024-04-08T08:50:09Z","title":"Lightweight Deep Learning for Resource-Constrained Environments: A\n Survey","summary":" Over the past decade, the dominance of deep learning has prevailed across\nvarious domains of artificial intelligence, including natural language\nprocessing, computer vision, and biomedical signal processing. While there have\nbeen remarkable improvements in model accuracy, deploying these models on\nlightweight devices, such as mobile phones and microcontrollers, is constrained\nby limited resources. In this survey, we provide comprehensive design guidance\ntailored for these devices, detailing the meticulous design of lightweight\nmodels, compression methods, and hardware acceleration strategies. 
The\nprincipal goal of this work is to explore methods and concepts for getting\naround hardware constraints without compromising the model's accuracy.\nAdditionally, we explore two notable paths for lightweight deep learning in the\nfuture: deployment techniques for TinyML and Large Language Models. Although\nthese paths undoubtedly have potential, they also present significant\nchallenges, encouraging research into unexplored areas.\n","authors":["Hou-I Liu","Marco Galindo","Hongxia Xie","Lai-Kuan Wong","Hong-Han Shuai","Yung-Yui Li","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.07236v1.pdf","comment":"40 pages"}]},"2024-04-07T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.05107v1","updated":"2024-04-07T23:31:37Z","published":"2024-04-07T23:31:37Z","title":"Reconstructing Retinal Visual Images from 3T fMRI Data Enhanced by\n Unsupervised Learning","summary":" The reconstruction of human visual inputs from brain activity, particularly\nthrough functional Magnetic Resonance Imaging (fMRI), holds promising avenues\nfor unraveling the mechanisms of the human visual system. Despite the\nsignificant strides made by deep learning methods in improving the quality and\ninterpretability of visual reconstruction, there remains a substantial demand\nfor high-quality, long-duration, subject-specific 7-Tesla fMRI experiments. The\nchallenge arises in integrating diverse smaller 3-Tesla datasets or\naccommodating new subjects with brief and low-quality fMRI scans. In response\nto these constraints, we propose a novel framework that generates enhanced 3T\nfMRI data through an unsupervised Generative Adversarial Network (GAN),\nleveraging unpaired training across two distinct fMRI datasets in 7T and 3T,\nrespectively. This approach aims to overcome the limitations of the scarcity of\nhigh-quality 7-Tesla data and the challenges associated with brief and\nlow-quality scans in 3-Tesla experiments. In this paper, we demonstrate the\nreconstruction capabilities of the enhanced 3T fMRI data, highlighting its\nproficiency in generating superior input visual images compared to\ndata-intensive methods trained and tested on a single subject.\n","authors":["Yujian Xiong","Wenhui Zhu","Zhong-Lin Lu","Yalin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05107v1.pdf","comment":"Accepted by ISBI 2024"},{"id":"http://arxiv.org/abs/2307.05845v5","updated":"2024-04-07T23:27:06Z","published":"2023-07-11T23:36:49Z","title":"PIGEON: Predicting Image Geolocations","summary":" Planet-scale image geolocalization remains a challenging problem due to the\ndiversity of images originating from anywhere in the world. Although approaches\nbased on vision transformers have made significant progress in geolocalization\naccuracy, success in prior literature is constrained to narrow distributions of\nimages of landmarks, and performance has not generalized to unseen places. We\npresent a new geolocalization system that combines semantic geocell creation,\nmulti-task contrastive pretraining, and a novel loss function. Additionally,\nour work is the first to perform retrieval over location clusters for guess\nrefinements. We train two models for evaluations on street-level data and\ngeneral-purpose image geolocalization; the first model, PIGEON, is trained on\ndata from the game of Geoguessr and is capable of placing over 40% of its\nguesses within 25 kilometers of the target location globally. 
We also develop a\nbot and deploy PIGEON in a blind experiment against humans, ranking in the top\n0.01% of players. We further challenge one of the world's foremost professional\nGeoguessr players to a series of six matches with millions of viewers, winning\nall six games. Our second model, PIGEOTTO, differs in that it is trained on a\ndataset of images from Flickr and Wikipedia, achieving state-of-the-art results\non a wide range of image geolocalization benchmarks, outperforming the previous\nSOTA by up to 7.7 percentage points on the city accuracy level and up to 38.8\npercentage points on the country level. Our findings suggest that PIGEOTTO is\nthe first image geolocalization model that effectively generalizes to unseen\nplaces and that our approach can pave the way for highly accurate, planet-scale\nimage geolocalization systems. Our code is available on GitHub.\n","authors":["Lukas Haas","Michal Skreta","Silas Alberti","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2307.05845v5.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2404.05105v1","updated":"2024-04-07T23:10:26Z","published":"2024-04-07T23:10:26Z","title":"VMambaMorph: a Visual Mamba-based Framework with Cross-Scan Module for\n Deformable 3D Image Registration","summary":" Image registration, a critical process in medical imaging, involves aligning\ndifferent sets of medical imaging data into a single unified coordinate system.\nDeep learning networks, such as the Convolutional Neural Network (CNN)-based\nVoxelMorph, Vision Transformer (ViT)-based TransMorph, and State Space Model\n(SSM)-based MambaMorph, have demonstrated effective performance in this domain.\nThe recent Visual State Space Model (VMamba), which incorporates a cross-scan\nmodule with SSM, has exhibited promising improvements in modeling global-range\ndependencies with efficient computational cost in computer vision tasks. This\npaper hereby introduces an exploration of VMamba with image registration, named\nVMambaMorph. This novel hybrid VMamba-CNN network is designed specifically for\n3D image registration. Utilizing a U-shaped network architecture, VMambaMorph\ncomputes the deformation field based on target and source volumes. The\nVMamba-based block with 2D cross-scan module is redesigned for 3D volumetric\nfeature processing, and a fine-grained feature extraction module is proposed\nfor high-dimensional feature learning. We validate VMambaMorph using a public\nbenchmark brain MR-CT registration dataset, comparing its performance against\ncurrent state-of-the-art methods. The results indicate that VMambaMorph\nachieves competitive registration quality. The code for VMambaMorph is\navailable on GitHub.\n","authors":["Ziyang Wang","Jian-Qing Zheng","Chao Ma","Tao Guo"],"pdf_url":"https://arxiv.org/pdf/2404.05105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05102v1","updated":"2024-04-07T22:58:18Z","published":"2024-04-07T22:58:18Z","title":"LHU-Net: A Light Hybrid U-Net for Cost-Efficient, High-Performance\n Volumetric Medical Image Segmentation","summary":" As a result of the rise of Transformer architectures in medical image\nanalysis, specifically in the domain of medical image segmentation, a multitude\nof hybrid models have been created that merge the advantages of Convolutional\nNeural Networks (CNNs) and Transformers. These hybrid models have achieved\nnotable success by significantly improving segmentation accuracy. 
Yet, this\nprogress often comes at the cost of increased model complexity, both in terms\nof parameters and computational demand. Moreover, many of these models fail to\nconsider the crucial interplay between spatial and channel features, which\ncould further refine and improve segmentation outcomes. To address this, we\nintroduce LHU-Net, a Light Hybrid U-Net architecture optimized for volumetric\nmedical image segmentation. LHU-Net is meticulously designed to prioritize\nspatial feature analysis in its initial layers before shifting focus to\nchannel-based features in its deeper layers, ensuring a comprehensive feature\nextraction process. Rigorous evaluation across five benchmark datasets -\nSynapse, LA, Pancreas, ACDC, and BRaTS 2018 - underscores LHU-Net's superior\nperformance, showcasing its dual capacity for efficiency and accuracy. Notably,\nLHU-Net sets new performance benchmarks, such as attaining a Dice score of\n92.66 on the ACDC dataset, while simultaneously reducing parameters by 85% and\nquartering the computational load compared to existing state-of-the-art models.\nAchieved without any reliance on pre-training, additional data, or model\nensemble, LHU-Net's effectiveness is further evidenced by its state-of-the-art\nperformance across all evaluated datasets, utilizing fewer than 11 million\nparameters. This achievement highlights that balancing computational efficiency\nwith high accuracy in medical image segmentation is feasible. Our\nimplementation of LHU-Net is freely accessible to the research community on\nGitHub.\n","authors":["Yousef Sadegheih","Afshin Bozorgpour","Pratibha Kumari","Reza Azad","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2404.05102v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04001v4","updated":"2024-04-07T22:46:13Z","published":"2023-09-07T20:07:57Z","title":"MMSFormer: Multimodal Transformer for Material and Semantic Segmentation","summary":" Leveraging information across diverse modalities is known to enhance\nperformance on multimodal segmentation tasks. However, effectively fusing\ninformation from different modalities remains challenging due to the unique\ncharacteristics of each modality. In this paper, we propose a novel fusion\nstrategy that can effectively fuse information from different modality\ncombinations. We also propose a new model named Multi-Modal Segmentation\nTransFormer (MMSFormer) that incorporates the proposed fusion strategy to\nperform multimodal material and semantic segmentation tasks. MMSFormer\noutperforms current state-of-the-art models on three different datasets. As we\nbegin with only one input modality, performance improves progressively as\nadditional modalities are incorporated, showcasing the effectiveness of the\nfusion block in combining useful information from diverse input modalities.\nAblation studies show that different modules in the fusion block are crucial\nfor overall model performance. Furthermore, our ablation studies also highlight\nthe capacity of different input modalities to improve performance in the\nidentification of different types of materials. The code and pretrained models\nwill be made available at https://github.com/csiplab/MMSFormer.\n","authors":["Md Kaykobad Reza","Ashley Prater-Bennette","M. Salman Asif"],"pdf_url":"https://arxiv.org/pdf/2309.04001v4.pdf","comment":"Accepted by IEEE Open Journal of Signal Processing. 
15 pages, 3\n figures, 9 tables"},{"id":"http://arxiv.org/abs/2401.02634v2","updated":"2024-04-07T22:18:52Z","published":"2024-01-05T04:53:33Z","title":"AG-ReID.v2: Bridging Aerial and Ground Views for Person\n Re-identification","summary":" Aerial-ground person re-identification (Re-ID) presents unique challenges in\ncomputer vision, stemming from the distinct differences in viewpoints, poses,\nand resolutions between high-altitude aerial and ground-based cameras. Existing\nresearch predominantly focuses on ground-to-ground matching, with aerial\nmatching less explored due to a dearth of comprehensive datasets. To address\nthis, we introduce AG-ReID.v2, a dataset specifically designed for person Re-ID\nin mixed aerial and ground scenarios. This dataset comprises 100,502 images of\n1,615 unique individuals, each annotated with matching IDs and 15 soft\nattribute labels. Data were collected from diverse perspectives using a UAV,\nstationary CCTV, and smart glasses-integrated camera, providing a rich variety\nof intra-identity variations. Additionally, we have developed an explainable\nattention network tailored for this dataset. This network features a\nthree-stream architecture that efficiently processes pairwise image distances,\nemphasizes key top-down features, and adapts to variations in appearance due to\naltitude differences. Comparative evaluations demonstrate the superiority of\nour approach over existing baselines. We plan to release the dataset and\nalgorithm source code publicly, aiming to advance research in this specialized\nfield of computer vision. For access, please visit\nhttps://github.com/huynguyen792/AG-ReID.v2.\n","authors":["Huy Nguyen","Kien Nguyen","Sridha Sridharan","Clinton Fookes"],"pdf_url":"https://arxiv.org/pdf/2401.02634v2.pdf","comment":"13 pages, Accepted by TIFS 2023"},{"id":"http://arxiv.org/abs/2404.05083v1","updated":"2024-04-07T21:46:47Z","published":"2024-04-07T21:46:47Z","title":"HaVTR: Improving Video-Text Retrieval Through Augmentation Using Large\n Foundation Models","summary":" While recent progress in video-text retrieval has been driven by the\nexploration of powerful model architectures and training strategies, the\nrepresentation learning ability of video-text retrieval models is still limited\ndue to low-quality and scarce training data annotations. To address this issue,\nwe present a novel video-text learning paradigm, HaVTR, which augments video\nand text data to learn more generalized features. Specifically, we first adopt\na simple augmentation method, which generates self-similar data by randomly\nduplicating or dropping subwords and frames. In addition, inspired by the\nrecent advancement in visual and language generative models, we propose a more\npowerful augmentation method through textual paraphrasing and video stylization\nusing large language models (LLMs) and visual generative models (VGMs).\nFurther, to bring richer information into video and text, we propose a\nhallucination-based augmentation method, where we use LLMs and VGMs to generate\nand add new relevant information to the original data. 
Benefiting from the\nenriched data, extensive experiments on several video-text retrieval benchmarks\ndemonstrate the superiority of HaVTR over existing methods.\n","authors":["Yimu Wang","Shuai Yuan","Xiangru Jian","Wei Pang","Mushi Wang","Ning Yu"],"pdf_url":"https://arxiv.org/pdf/2404.05083v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06886v2","updated":"2024-04-07T21:41:05Z","published":"2023-12-11T23:20:31Z","title":"Relightful Harmonization: Lighting-aware Portrait Background Replacement","summary":" Portrait harmonization aims to composite a subject into a new background,\nadjusting its lighting and color to ensure harmony with the background scene.\nExisting harmonization techniques often only focus on adjusting the global\ncolor and brightness of the foreground and ignore crucial illumination cues\nfrom the background such as apparent lighting direction, leading to unrealistic\ncompositions. We introduce Relightful Harmonization, a lighting-aware diffusion\nmodel designed to seamlessly harmonize sophisticated lighting effect for the\nforeground portrait using any background image. Our approach unfolds in three\nstages. First, we introduce a lighting representation module that allows our\ndiffusion model to encode lighting information from target image background.\nSecond, we introduce an alignment network that aligns lighting features learned\nfrom image background with lighting features learned from panorama environment\nmaps, which is a complete representation for scene illumination. Last, to\nfurther boost the photorealism of the proposed method, we introduce a novel\ndata simulation pipeline that generates synthetic training pairs from a diverse\nrange of natural images, which are used to refine the model. Our method\noutperforms existing benchmarks in visual fidelity and lighting coherence,\nshowing superior generalization in real-world testing scenarios, highlighting\nits versatility and practicality.\n","authors":["Mengwei Ren","Wei Xiong","Jae Shin Yoon","Zhixin Shu","Jianming Zhang","HyunJoon Jung","Guido Gerig","He Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.06886v2.pdf","comment":"CVPR 2024 camera ready"},{"id":"http://arxiv.org/abs/2404.05072v1","updated":"2024-04-07T21:00:14Z","published":"2024-04-07T21:00:14Z","title":"Spatial Cognition from Egocentric Video: Out of Sight, Not Out of Mind","summary":" As humans move around, performing their daily tasks, they are able to recall\nwhere they have positioned objects in their environment, even if these objects\nare currently out of sight. In this paper, we aim to mimic this spatial\ncognition ability. We thus formulate the task of Out of Sight, Not Out of Mind\n- 3D tracking active objects using observations captured through an egocentric\ncamera. We introduce Lift, Match and Keep (LMK), a method which lifts partial\n2D observations to 3D world coordinates, matches them over time using visual\nappearance, 3D location and interactions to form object tracks, and keeps these\nobject tracks even when they go out-of-view of the camera - hence keeping in\nmind what is out of sight. We test LMK on 100 long videos from EPIC-KITCHENS.\nOur results demonstrate that spatial cognition is critical for correctly\nlocating objects over short and long time scales. E.g., for one long egocentric\nvideo, we estimate the 3D location of 50 active objects. 
Of these, 60% can be\ncorrectly positioned in 3D after 2 minutes of leaving the camera view.\n","authors":["Chiara Plizzari","Shubham Goel","Toby Perrett","Jacob Chalk","Angjoo Kanazawa","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2404.05072v1.pdf","comment":"21 pages including references and appendix. Project Webpage:\n http://dimadamen.github.io/OSNOM/"},{"id":"http://arxiv.org/abs/2404.05069v1","updated":"2024-04-07T20:39:31Z","published":"2024-04-07T20:39:31Z","title":"AirShot: Efficient Few-Shot Detection for Autonomous Exploration","summary":" Few-shot object detection has drawn increasing attention in the field of\nrobotic exploration, where robots are required to find unseen objects with a\nfew online provided examples. Although recent efforts have been made to yield\nonline processing capabilities, slow inference speeds of low-powered robots\nfail to meet the demands of real-time detection, making them impractical for\nautonomous exploration. Existing methods still face performance and efficiency\nchallenges, mainly due to unreliable features and exhaustive class loops. In\nthis work, we propose a new paradigm AirShot, and discover that, by fully\nexploiting the valuable correlation map, AirShot can result in a more robust\nand faster few-shot object detection system, which is more applicable to the\nrobotics community. The core module Top Prediction Filter (TPF) can operate on\nmulti-scale correlation maps in both the training and inference stages. During\ntraining, TPF supervises the generation of a more representative correlation\nmap, while during inference, it reduces looping iterations by selecting\ntop-ranked classes, thus cutting down on computational costs with better\nperformance. Surprisingly, this dual functionality exhibits general\neffectiveness and efficiency on various off-the-shelf models. Exhaustive\nexperiments on COCO2017, VOC2014, and SubT datasets demonstrate that TPF can\nsignificantly boost the efficacy and efficiency of most off-the-shelf models,\nachieving up to 36.4% precision improvements along with 56.3% faster inference\nspeed. Code and Data are at: https://github.com/ImNotPrepared/AirShot.\n","authors":["Zihan Wang","Bowen Li","Chen Wang","Sebastian Scherer"],"pdf_url":"https://arxiv.org/pdf/2404.05069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17328v3","updated":"2024-04-07T20:20:09Z","published":"2023-05-27T02:08:51Z","title":"Zero-TPrune: Zero-Shot Token Pruning through Leveraging of the Attention\n Graph in Pre-Trained Transformers","summary":" Deployment of Transformer models on edge devices is becoming increasingly\nchallenging due to the exponentially growing inference cost that scales\nquadratically with the number of tokens in the input sequence. Token pruning is\nan emerging solution to address this challenge due to its ease of deployment on\nvarious Transformer backbones. However, most token pruning methods require\ncomputationally expensive fine-tuning, which is undesirable in many edge\ndeployment cases. In this work, we propose Zero-TPrune, the first zero-shot\nmethod that considers both the importance and similarity of tokens in\nperforming token pruning. It leverages the attention graph of pre-trained\nTransformer models to produce an importance distribution for tokens via our\nproposed Weighted Page Rank (WPR) algorithm. This distribution further guides\ntoken partitioning for efficient similarity-based pruning. 
Due to the\nelimination of the fine-tuning overhead, Zero-TPrune can prune large models at\nnegligible computational cost, switch between different pruning configurations\nat no computational cost, and perform hyperparameter tuning efficiently. We\nevaluate the performance of Zero-TPrune on vision tasks by applying it to\nvarious vision Transformer backbones and testing them on ImageNet. Without any\nfine-tuning, Zero-TPrune reduces the FLOPs cost of DeiT-S by 34.7% and improves\nits throughput by 45.3% with only 0.4% accuracy loss. Compared with\nstate-of-the-art pruning methods that require fine-tuning, Zero-TPrune not only\neliminates the need for fine-tuning after pruning but also does so with only\n0.1% accuracy loss. Compared with state-of-the-art fine-tuning-free pruning\nmethods, Zero-TPrune reduces accuracy loss by up to 49% with similar FLOPs\nbudgets. Project webpage: https://jha-lab.github.io/zerotprune.\n","authors":["Hongjie Wang","Bhishma Dedhia","Niraj K. Jha"],"pdf_url":"https://arxiv.org/pdf/2305.17328v3.pdf","comment":"IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)\n 2024"},{"id":"http://arxiv.org/abs/2404.05063v1","updated":"2024-04-07T20:19:04Z","published":"2024-04-07T20:19:04Z","title":"AUEditNet: Dual-Branch Facial Action Unit Intensity Manipulation with\n Implicit Disentanglement","summary":" Facial action unit (AU) intensity plays a pivotal role in quantifying\nfine-grained expression behaviors, which is an effective condition for facial\nexpression manipulation. However, publicly available datasets containing\nintensity annotations for multiple AUs remain severely limited, often featuring\na restricted number of subjects. This limitation places challenges to the AU\nintensity manipulation in images due to disentanglement issues, leading\nresearchers to resort to other large datasets with pretrained AU intensity\nestimators for pseudo labels. In addressing this constraint and fully\nleveraging manual annotations of AU intensities for precise manipulation, we\nintroduce AUEditNet. Our proposed model achieves impressive intensity\nmanipulation across 12 AUs, trained effectively with only 18 subjects.\nUtilizing a dual-branch architecture, our approach achieves comprehensive\ndisentanglement of facial attributes and identity without necessitating\nadditional loss functions or implementing with large batch sizes. This approach\noffers a potential solution to achieve desired facial attribute editing despite\nthe dataset's limited subject count. Our experiments demonstrate AUEditNet's\nsuperior accuracy in editing AU intensities, affirming its capability in\ndisentangling facial attributes and identity within a limited subject pool.\nAUEditNet allows conditioning by either intensity values or target images,\neliminating the need for constructing AU combinations for specific facial\nexpression synthesis. 
Moreover, AU intensity estimation, as a downstream task,\nvalidates the consistency between real and edited images, confirming the\neffectiveness of our proposed AU intensity manipulation method.\n","authors":["Shiwei Jin","Peng Liu","Zhen Wang","Lei Wang","Ning Bi","Truong Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05063v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05061v1","updated":"2024-04-07T20:15:40Z","published":"2024-04-07T20:15:40Z","title":"Automated Prediction of Breast Cancer Response to Neoadjuvant\n Chemotherapy from DWI Data","summary":" Effective surgical planning for breast cancer hinges on accurately predicting\npathological complete response (pCR) to neoadjuvant chemotherapy (NAC).\nDiffusion-weighted MRI (DWI) and machine learning offer a non-invasive approach\nfor early pCR assessment. However, most machine-learning models require manual\ntumor segmentation, a cumbersome and error-prone task. We propose a deep\nlearning model employing \"Size-Adaptive Lesion Weighting\" for automatic DWI\ntumor segmentation to enhance pCR prediction accuracy. Despite\nhistopathological changes during NAC complicating DWI image segmentation, our\nmodel demonstrates robust performance. Utilizing the BMMR2 challenge dataset,\nit matches human experts in pCR prediction pre-NAC with an area under the curve\n(AUC) of 0.76 vs. 0.796, and surpasses standard automated methods mid-NAC, with\nan AUC of 0.729 vs. 0.654 and 0.576. Our approach represents a significant\nadvancement in automating breast cancer treatment planning, enabling more\nreliable pCR predictions without manual segmentation.\n","authors":["Shir Nitzan","Maya Gilad","Moti Freiman"],"pdf_url":"https://arxiv.org/pdf/2404.05061v1.pdf","comment":"Accepted for presentation at the IEEE International Symposium on\n Biomedical Imaging (ISBI)"},{"id":"http://arxiv.org/abs/2401.04244v2","updated":"2024-04-07T20:13:45Z","published":"2024-01-08T21:35:05Z","title":"Spatio-Temporal Turbulence Mitigation: A Translational Perspective","summary":" Recovering images distorted by atmospheric turbulence is a challenging\ninverse problem due to the stochastic nature of turbulence. Although numerous\nturbulence mitigation (TM) algorithms have been proposed, their efficiency and\ngeneralization to real-world dynamic scenarios remain severely limited.\nBuilding upon the intuitions of classical TM algorithms, we present the Deep\nAtmospheric TUrbulence Mitigation network (DATUM). DATUM aims to overcome major\nchallenges when transitioning from classical to deep learning approaches. By\ncarefully integrating the merits of classical multi-frame TM methods into a\ndeep network structure, we demonstrate that DATUM can efficiently perform\nlong-range temporal aggregation using a recurrent fashion, while deformable\nattention and temporal-channel attention seamlessly facilitate pixel\nregistration and lucky imaging. With additional supervision, tilt and blur\ndegradation can be jointly mitigated. These inductive biases empower DATUM to\nsignificantly outperform existing methods while delivering a tenfold increase\nin processing speed. A large-scale training dataset, ATSyn, is presented as a\nco-invention to enable generalization in real turbulence. Our code and datasets\nare available at https://xg416.github.io/DATUM.\n","authors":["Xingguang Zhang","Nicholas Chimitt","Yiheng Chi","Zhiyuan Mao","Stanley H. 
Chan"],"pdf_url":"https://arxiv.org/pdf/2401.04244v2.pdf","comment":"Accepted by CVPR 2024, project page https://xg416.github.io/DATUM/"},{"id":"http://arxiv.org/abs/2312.15719v2","updated":"2024-04-07T19:59:00Z","published":"2023-12-25T13:12:36Z","title":"Get a Grip: Reconstructing Hand-Object Stable Grasps in Egocentric\n Videos","summary":" We propose the task of Hand-Object Stable Grasp Reconstruction (HO-SGR), the\nreconstruction of frames during which the hand is stably holding the object. We\nfirst develop the stable grasp definition based on the intuition that the\nin-contact area between the hand and object should remain stable. By analysing\nthe 3D ARCTIC dataset, we identify stable grasp durations and showcase that\nobjects in stable grasps move within a single degree of freedom (1-DoF). We\nthereby propose a method to jointly optimise all frames within a stable grasp,\nminimising object motions to a latent 1-DoF. Finally, we extend the knowledge\nto in-the-wild videos by labelling 2.4K clips of stable grasps. Our proposed\nEPIC-Grasps dataset includes 390 object instances of 9 categories, featuring\nstable grasps from videos of daily interactions in 141 environments. Without 3D\nground truth, we use stable contact areas and 2D projection masks to assess the\nHO-SGR task in the wild. We evaluate relevant methods and our approach\npreserves significantly higher stable contact area, on both EPIC-Grasps and\nstable grasp sub-sequences from the ARCTIC dataset.\n","authors":["Zhifan Zhu","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2312.15719v2.pdf","comment":"webpage: https://zhifanzhu.github.io/getagrip"},{"id":"http://arxiv.org/abs/2404.05052v1","updated":"2024-04-07T19:23:28Z","published":"2024-04-07T19:23:28Z","title":"Facial Affective Behavior Analysis with Instruction Tuning","summary":" Facial affective behavior analysis (FABA) is crucial for understanding human\nmental states from images. However, traditional approaches primarily deploy\nmodels to discriminate among discrete emotion categories, and lack the fine\ngranularity and reasoning capability for complex facial behaviors. The advent\nof Multi-modal Large Language Models (MLLMs) has been proven successful in\ngeneral visual understanding tasks. However, directly harnessing MLLMs for FABA\nis challenging due to the scarcity of datasets and benchmarks, neglecting\nfacial prior knowledge, and low training efficiency. To address these\nchallenges, we introduce (i) an instruction-following dataset for two FABA\ntasks, e.g., emotion and action unit recognition, (ii) a benchmark FABA-Bench\nwith a new metric considering both recognition and generation ability, and\n(iii) a new MLLM \"EmoLA\" as a strong baseline to the community. Our initiative\non the dataset and benchmarks reveal the nature and rationale of facial\naffective behaviors, i.e., fine-grained facial movement, interpretability, and\nreasoning. Moreover, to build an effective and efficient FABA MLLM, we\nintroduce a facial prior expert module with face structure knowledge and a\nlow-rank adaptation module into pre-trained MLLM. We conduct extensive\nexperiments on FABA-Bench and four commonly-used FABA datasets. The results\ndemonstrate that the proposed facial prior expert can boost the performance and\nEmoLA achieves the best results on our FABA-Bench. 
On commonly-used FABA\ndatasets, EmoLA is competitive with task-specific state-of-the-art models.\n","authors":["Yifan Li","Anh Dao","Wentao Bao","Zhen Tan","Tianlong Chen","Huan Liu","Yu Kong"],"pdf_url":"https://arxiv.org/pdf/2404.05052v1.pdf","comment":"V1.0"},{"id":"http://arxiv.org/abs/2404.05049v1","updated":"2024-04-07T19:10:02Z","published":"2024-04-07T19:10:02Z","title":"PlateSegFL: A Privacy-Preserving License Plate Detection Using Federated\n Segmentation Learning","summary":" Automatic License Plate Recognition (ALPR) is an integral component of an\nintelligent transport system with extensive applications in secure\ntransportation, vehicle-to-vehicle communication, stolen vehicles detection,\ntraffic violations, and traffic flow management. The existing license plate\ndetection system focuses on one-shot learners or pre-trained models that\noperate with a geometric bounding box, limiting the model's performance.\nFurthermore, continuous video data streams uploaded to the central server\nresult in network and complexity issues. To combat this, PlateSegFL was\nintroduced, which implements U-Net-based segmentation along with Federated\nLearning (FL). U-Net is well-suited for multi-class image segmentation tasks\nbecause it can analyze a large number of classes and generate a pixel-level\nsegmentation map for each class. Federated Learning is used to reduce the\nquantity of data required while safeguarding the user's privacy. Different\ncomputing platforms, such as mobile phones, are able to collaborate on the\ndevelopment of a standard prediction model where it makes efficient use of\none's time; incorporates more diverse data; delivers projections in real-time;\nand requires no physical effort from the user; resulting in around a 95% F1 score.\n","authors":["Md. Shahriar Rahman Anuvab","Mishkat Sultana","Md. Atif Hossain","Shashwata Das","Suvarthi Chowdhury","Rafeed Rahman","Dibyo Fabian Dofadar","Shahriar Rahman Rana"],"pdf_url":"https://arxiv.org/pdf/2404.05049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05046v1","updated":"2024-04-07T19:00:45Z","published":"2024-04-07T19:00:45Z","title":"FGAIF: Aligning Large Vision-Language Models with Fine-grained AI\n Feedback","summary":" Large Vision-Language Models (LVLMs) have demonstrated proficiency in\ntackling a variety of visual-language tasks. However, current LVLMs suffer from\nmisalignment between text and image modalities which causes three kinds of\nhallucination problems, i.e., object existence, object attribute, and object\nrelationship. To tackle this issue, existing methods mainly utilize\nReinforcement Learning (RL) to align modalities in LVLMs. However, they still\nsuffer from three main limitations: (1) General feedback cannot indicate the\nhallucination type contained in the response; (2) Sparse rewards only give the\nsequence-level reward for the whole response; and (3) Annotation cost is\ntime-consuming and labor-intensive. To handle these limitations, we propose an\ninnovative method to align modalities in LVLMs through Fine-Grained Artificial\nIntelligence Feedback (FGAIF), which mainly consists of three steps: AI-based\nFeedback Collection, Fine-grained Reward Model Training, and Reinforcement\nLearning with Fine-grained Reward. Specifically, we first utilize AI tools to\npredict the types of hallucination for each segment in the response and obtain\na collection of fine-grained feedback. 
Then, based on the collected reward\ndata, three specialized reward models are trained to produce dense rewards.\nFinally, a novel fine-grained feedback module is integrated into the Proximal\nPolicy Optimization (PPO) algorithm. Extensive experiments are conducted on\nhallucination and general benchmarks, demonstrating the superior performance of\nour proposed method. Notably, compared with previous models trained with the\nRL-based aligning method, our proposed method is effective even with fewer\nparameters.\n","authors":["Liqiang Jing","Xinya Du"],"pdf_url":"https://arxiv.org/pdf/2404.05046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14435v6","updated":"2024-04-07T18:04:04Z","published":"2023-06-26T06:04:09Z","title":"DragDiffusion: Harnessing Diffusion Models for Interactive Point-based\n Image Editing","summary":" Accurate and controllable image editing is a challenging task that has\nattracted significant attention recently. Notably, DragGAN is an interactive\npoint-based image editing framework that achieves impressive editing results\nwith pixel-level precision. However, due to its reliance on generative\nadversarial networks (GANs), its generality is limited by the capacity of\npretrained GAN models. In this work, we extend this editing framework to\ndiffusion models and propose a novel approach DragDiffusion. By harnessing\nlarge-scale pretrained diffusion models, we greatly enhance the applicability\nof interactive point-based editing on both real and diffusion-generated images.\nOur approach involves optimizing the diffusion latents to achieve precise\nspatial control. The supervision signal of this optimization process is from\nthe diffusion model's UNet features, which are known to contain rich semantic\nand geometric information. Moreover, we introduce two additional techniques,\nnamely LoRA fine-tuning and latent-MasaCtrl, to further preserve the identity\nof the original image. Lastly, we present a challenging benchmark dataset\ncalled DragBench -- the first benchmark to evaluate the performance of\ninteractive point-based image editing methods. Experiments across a wide range\nof challenging cases (e.g., images with multiple objects, diverse object\ncategories, various styles, etc.) demonstrate the versatility and generality of\nDragDiffusion. Code: https://github.com/Yujun-Shi/DragDiffusion.\n","authors":["Yujun Shi","Chuhui Xue","Jun Hao Liew","Jiachun Pan","Hanshu Yan","Wenqing Zhang","Vincent Y. F. Tan","Song Bai"],"pdf_url":"https://arxiv.org/pdf/2306.14435v6.pdf","comment":"Code is released at https://github.com/Yujun-Shi/DragDiffusion"},{"id":"http://arxiv.org/abs/2404.05029v1","updated":"2024-04-07T17:51:53Z","published":"2024-04-07T17:51:53Z","title":"LOGO: A Long-Form Video Dataset for Group Action Quality Assessment","summary":" Action quality assessment (AQA) has become an emerging topic since it can be\nextensively applied in numerous scenarios. However, most existing methods and\ndatasets focus on single-person short-sequence scenes, hindering the\napplication of AQA in more complex situations. To address this issue, we\nconstruct a new multi-person long-form video dataset for action quality\nassessment named LOGO. Distinguished in scenario complexity, our dataset\ncontains 200 videos from 26 artistic swimming events with 8 athletes in each\nsample along with an average duration of 204.2 seconds. 
As for richness in\nannotations, LOGO includes formation labels to depict group information of\nmultiple athletes and detailed annotations on action procedures. Furthermore,\nwe propose a simple yet effective method to model relations among athletes and\nreason about the potential temporal logic in long-form videos. Specifically, we\ndesign a group-aware attention module, which can be easily plugged into\nexisting AQA methods, to enrich the clip-wise representations based on\ncontextual group information. To benchmark LOGO, we systematically conduct\ninvestigations on the performance of several popular methods in AQA and action\nsegmentation. The results reveal the challenges our dataset brings. Extensive\nexperiments also show that our approach achieves state-of-the-art on the LOGO\ndataset. The dataset and code will be released at\n\\url{https://github.com/shiyi-zh0408/LOGO }.\n","authors":["Shiyi Zhang","Wenxun Dai","Sujia Wang","Xiangwei Shen","Jiwen Lu","Jie Zhou","Yansong Tang"],"pdf_url":"https://arxiv.org/pdf/2404.05029v1.pdf","comment":"Accepted by CVPR 2023"},{"id":"http://arxiv.org/abs/2404.05024v1","updated":"2024-04-07T17:31:53Z","published":"2024-04-07T17:31:53Z","title":"PathFinder: Attention-Driven Dynamic Non-Line-of-Sight Tracking with a\n Mobile Robot","summary":" The study of non-line-of-sight (NLOS) imaging is growing due to its many\npotential applications, including rescue operations and pedestrian detection by\nself-driving cars. However, implementing NLOS imaging on a moving camera\nremains an open area of research. Existing NLOS imaging methods rely on\ntime-resolved detectors and laser configurations that require precise optical\nalignment, making it difficult to deploy them in dynamic environments. This\nwork proposes a data-driven approach to NLOS imaging, PathFinder, that can be\nused with a standard RGB camera mounted on a small, power-constrained mobile\nrobot, such as an aerial drone. Our experimental pipeline is designed to\naccurately estimate the 2D trajectory of a person who moves in a\nManhattan-world environment while remaining hidden from the camera's\nfield-of-view. We introduce a novel approach to process a sequence of dynamic\nsuccessive frames in a line-of-sight (LOS) video using an attention-based\nneural network that performs inference in real-time. The method also includes a\npreprocessing selection metric that analyzes images from a moving camera which\ncontain multiple vertical planar surfaces, such as walls and building facades,\nand extracts planes that return maximum NLOS information. We validate the\napproach on in-the-wild scenes using a drone for video capture, thus\ndemonstrating low-cost NLOS imaging in dynamic capture environments.\n","authors":["Shenbagaraj Kannapiran","Sreenithy Chandran","Suren Jayasuriya","Spring Berman"],"pdf_url":"https://arxiv.org/pdf/2404.05024v1.pdf","comment":"First two authors have equal contribution"},{"id":"http://arxiv.org/abs/2404.05023v1","updated":"2024-04-07T17:30:57Z","published":"2024-04-07T17:30:57Z","title":"Scalable and Efficient Hierarchical Visual Topological Mapping","summary":" Hierarchical topological representations can significantly reduce search\ntimes within mapping and localization algorithms. Although recent research has\nshown the potential for such approaches, limited consideration has been given\nto the suitability and comparative performance of different global feature\nrepresentations within this context. 
In this work, we evaluate state-of-the-art\nhand-crafted and learned global descriptors using a hierarchical topological\nmapping technique on benchmark datasets and present results of a comprehensive\nevaluation of the impact of the global descriptor used. Although learned\ndescriptors have been incorporated into place recognition methods to improve\nretrieval accuracy and enhance overall recall, the problem of scalability and\nefficiency when applied to longer trajectories has not been adequately\naddressed in a majority of research studies. Based on our empirical analysis of\nmultiple runs, we identify that continuity and distinctiveness are crucial\ncharacteristics for an optimal global descriptor that enable efficient and\nscalable hierarchical mapping, and present a methodology for quantifying and\ncontrasting these characteristics across different global descriptors. Our\nstudy demonstrates that the use of global descriptors based on an unsupervised\nlearned Variational Autoencoder (VAE) excels in these characteristics and\nachieves significantly lower runtime. It runs on a consumer grade desktop, up\nto 2.3x faster than the second best global descriptor, NetVLAD, and up to 9.5x\nfaster than the hand-crafted descriptor, PHOG, on the longest track evaluated\n(St Lucia, 17.6 km), without sacrificing overall recall performance.\n","authors":["Saravanabalagi Ramachandran","Jonathan Horgan","Ganesh Sistu","John McDonald"],"pdf_url":"https://arxiv.org/pdf/2404.05023v1.pdf","comment":"Published in the 21st International Conference on Advanced Robotics\n (ICAR 2023)"},{"id":"http://arxiv.org/abs/2404.05022v1","updated":"2024-04-07T17:25:52Z","published":"2024-04-07T17:25:52Z","title":"DinoBloom: A Foundation Model for Generalizable Cell Embeddings in\n Hematology","summary":" In hematology, computational models offer significant potential to improve\ndiagnostic accuracy, streamline workflows, and reduce the tedious work of\nanalyzing single cells in peripheral blood or bone marrow smears. However,\nclinical adoption of computational models has been hampered by the lack of\ngeneralization due to large batch effects, small dataset sizes, and poor\nperformance in transfer learning from natural images. To address these\nchallenges, we introduce DinoBloom, the first foundation model for single cell\nimages in hematology, utilizing a tailored DINOv2 pipeline. Our model is built\nupon an extensive collection of 13 diverse, publicly available datasets of\nperipheral blood and bone marrow smears, the most substantial open-source\ncohort in hematology so far, comprising over 380,000 white blood cell images.\nTo assess its generalization capability, we evaluate it on an external dataset\nwith a challenging domain shift. We show that our model outperforms existing\nmedical and non-medical vision models in (i) linear probing and k-nearest\nneighbor evaluations for cell-type classification on blood and bone marrow\nsmears and (ii) weakly supervised multiple instance learning for acute myeloid\nleukemia subtyping by a large margin. A family of four DinoBloom models (small,\nbase, large, and giant) can be adapted for a wide range of downstream\napplications, be a strong baseline for classification problems, and facilitate\nthe assessment of batch effects in new datasets. All models are available at\ngithub.com/marrlab/DinoBloom.\n","authors":["Valentin Koch","Sophia J. 
Wagner","Salome Kazeminia","Ece Sancar","Matthias Hehr","Julia Schnabel","Tingying Peng","Carsten Marr"],"pdf_url":"https://arxiv.org/pdf/2404.05022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16671v4","updated":"2024-04-07T17:22:46Z","published":"2023-09-28T17:59:56Z","title":"Demystifying CLIP Data","summary":" Contrastive Language-Image Pre-training (CLIP) is an approach that has\nadvanced research and applications in computer vision, fueling modern\nrecognition systems and generative models. We believe that the main ingredient\nto the success of CLIP is its data and not the model architecture or\npre-training objective. However, CLIP only provides very limited information\nabout its data and how it has been collected, leading to works that aim to\nreproduce CLIP's data by filtering with its model parameters. In this work, we\nintend to reveal CLIP's data curation approach and in our pursuit of making it\nopen to the community introduce Metadata-Curated Language-Image Pre-training\n(MetaCLIP). MetaCLIP takes a raw data pool and metadata (derived from CLIP's\nconcepts) and yields a balanced subset over the metadata distribution. Our\nexperimental study rigorously isolates the model and training settings,\nconcentrating solely on data. MetaCLIP applied to CommonCrawl with 400M\nimage-text data pairs outperforms CLIP's data on multiple standard benchmarks.\nIn zero-shot ImageNet classification, MetaCLIP achieves 70.8% accuracy,\nsurpassing CLIP's 68.3% on ViT-B models. Scaling to 1B data, while maintaining\nthe same training budget, attains 72.4%. Our observations hold across various\nmodel sizes, exemplified by ViT-H achieving 80.5%, without any\nbells-and-whistles. Curation code and training data distribution on metadata is\nmade available at https://github.com/facebookresearch/MetaCLIP.\n","authors":["Hu Xu","Saining Xie","Xiaoqing Ellen Tan","Po-Yao Huang","Russell Howes","Vasu Sharma","Shang-Wen Li","Gargi Ghosh","Luke Zettlemoyer","Christoph Feichtenhofer"],"pdf_url":"https://arxiv.org/pdf/2309.16671v4.pdf","comment":"17 pages. arXiv admin note: text overlap with arXiv:2103.00020 by\n other authors"},{"id":"http://arxiv.org/abs/2112.04731v5","updated":"2024-04-07T17:09:58Z","published":"2021-12-09T07:20:32Z","title":"Mimicking the Oracle: An Initial Phase Decorrelation Approach for Class\n Incremental Learning","summary":" Class Incremental Learning (CIL) aims at learning a multi-class classifier in\na phase-by-phase manner, in which only data of a subset of the classes are\nprovided at each phase. Previous works mainly focus on mitigating forgetting in\nphases after the initial one. However, we find that improving CIL at its\ninitial phase is also a promising direction. Specifically, we experimentally\nshow that directly encouraging CIL Learner at the initial phase to output\nsimilar representations as the model jointly trained on all classes can greatly\nboost the CIL performance. Motivated by this, we study the difference between a\nna\\\"ively-trained initial-phase model and the oracle model. Specifically, since\none major difference between these two models is the number of training\nclasses, we investigate how such difference affects the model representations.\nWe find that, with fewer training classes, the data representations of each\nclass lie in a long and narrow region; with more training classes, the\nrepresentations of each class scatter more uniformly. 
Inspired by this\nobservation, we propose Class-wise Decorrelation (CwD) that effectively\nregularizes representations of each class to scatter more uniformly, thus\nmimicking the model jointly trained with all classes (i.e., the oracle model).\nOur CwD is simple to implement and easy to plug into existing methods.\nExtensive experiments on various benchmark datasets show that CwD consistently\nand significantly improves the performance of existing state-of-the-art methods\nby around 1\\% to 3\\%. Code will be released.\n","authors":["Yujun Shi","Kuangqi Zhou","Jian Liang","Zihang Jiang","Jiashi Feng","Philip Torr","Song Bai","Vincent Y. F. Tan"],"pdf_url":"https://arxiv.org/pdf/2112.04731v5.pdf","comment":"CVPR 2022 Camera-Ready Version"},{"id":"http://arxiv.org/abs/2404.05016v1","updated":"2024-04-07T17:06:22Z","published":"2024-04-07T17:06:22Z","title":"Hyperbolic Learning with Synthetic Captions for Open-World Detection","summary":" Open-world detection poses significant challenges, as it requires the\ndetection of any object using either object class labels or free-form texts.\nExisting related works often use large-scale manual annotated caption datasets\nfor training, which are extremely expensive to collect. Instead, we propose to\ntransfer knowledge from vision-language models (VLMs) to enrich the\nopen-vocabulary descriptions automatically. Specifically, we bootstrap dense\nsynthetic captions using pre-trained VLMs to provide rich descriptions on\ndifferent regions in images, and incorporate these captions to train a novel\ndetector that generalizes to novel concepts. To mitigate the noise caused by\nhallucination in synthetic captions, we also propose a novel hyperbolic\nvision-language learning approach to impose a hierarchy between visual and\ncaption embeddings. We call our detector ``HyperLearner''. We conduct extensive\nexperiments on a wide variety of open-world detection benchmarks (COCO, LVIS,\nObject Detection in the Wild, RefCOCO) and our results show that our model\nconsistently outperforms existing state-of-the-art methods, such as GLIP,\nGLIPv2 and Grounding DINO, when using the same backbone.\n","authors":["Fanjie Kong","Yanbei Chen","Jiarui Cai","Davide Modolo"],"pdf_url":"https://arxiv.org/pdf/2404.05016v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2402.05713v3","updated":"2024-04-07T16:59:41Z","published":"2024-02-08T14:40:32Z","title":"Hidden in Plain Sight: Undetectable Adversarial Bias Attacks on\n Vulnerable Patient Populations","summary":" The proliferation of artificial intelligence (AI) in radiology has shed light\non the risk of deep learning (DL) models exacerbating clinical biases towards\nvulnerable patient populations. While prior literature has focused on\nquantifying biases exhibited by trained DL models, demographically targeted\nadversarial bias attacks on DL models and its implication in the clinical\nenvironment remains an underexplored field of research in medical imaging. In\nthis work, we demonstrate that demographically targeted label poisoning attacks\ncan introduce undetectable underdiagnosis bias in DL models. Our results across\nmultiple performance metrics and demographic groups like sex, age, and their\nintersectional subgroups show that adversarial bias attacks demonstrate\nhigh-selectivity for bias in the targeted group by degrading group model\nperformance without impacting overall model performance. 
Furthermore, our\nresults indicate that adversarial bias attacks result in biased DL models that\npropagate prediction bias even when evaluated with external datasets.\n","authors":["Pranav Kulkarni","Andrew Chan","Nithya Navarathna","Skylar Chan","Paul H. Yi","Vishwa S. Parekh"],"pdf_url":"https://arxiv.org/pdf/2402.05713v3.pdf","comment":"29 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.05014v1","updated":"2024-04-07T16:49:07Z","published":"2024-04-07T16:49:07Z","title":"MagicTime: Time-lapse Video Generation Models as Metamorphic Simulators","summary":" Recent advances in Text-to-Video generation (T2V) have achieved remarkable\nsuccess in synthesizing high-quality general videos from textual descriptions.\nA largely overlooked problem in T2V is that existing models have not adequately\nencoded physical knowledge of the real world, thus generated videos tend to\nhave limited motion and poor variations. In this paper, we propose\n\\textbf{MagicTime}, a metamorphic time-lapse video generation model, which\nlearns real-world physics knowledge from time-lapse videos and implements\nmetamorphic generation. First, we design a MagicAdapter scheme to decouple\nspatial and temporal training, encode more physical knowledge from metamorphic\nvideos, and transform pre-trained T2V models to generate metamorphic videos.\nSecond, we introduce a Dynamic Frames Extraction strategy to adapt to\nmetamorphic time-lapse videos, which have a wider variation range and cover\ndramatic object metamorphic processes, thus embodying more physical knowledge\nthan general videos. Finally, we introduce a Magic Text-Encoder to improve the\nunderstanding of metamorphic video prompts. Furthermore, we create a time-lapse\nvideo-text dataset called \\textbf{ChronoMagic}, specifically curated to unlock\nthe metamorphic video generation ability. Extensive experiments demonstrate the\nsuperiority and effectiveness of MagicTime for generating high-quality and\ndynamic metamorphic videos, suggesting time-lapse video generation is a\npromising path toward building metamorphic simulators of the physical world.\n","authors":["Shenghai Yuan","Jinfa Huang","Yujun Shi","Yongqi Xu","Ruijie Zhu","Bin Lin","Xinhua Cheng","Li Yuan","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2404.05014v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18172v2","updated":"2024-04-07T16:43:51Z","published":"2024-02-28T09:02:33Z","title":"NiteDR: Nighttime Image De-Raining with Cross-View Sensor Cooperative\n Learning for Dynamic Driving Scenes","summary":" In real-world environments, outdoor imaging systems are often affected by\ndisturbances such as rain degradation. Especially, in nighttime driving scenes,\ninsufficient and uneven lighting shrouds the scenes in darkness, resulting\ndegradation of both the image quality and visibility. Particularly, in the\nfield of autonomous driving, the visual perception ability of RGB sensors\nexperiences a sharp decline in such harsh scenarios. Additionally, driving\nassistance systems suffer from reduced capabilities in capturing and discerning\nthe surrounding environment, posing a threat to driving safety. Single-view\ninformation captured by single-modal sensors cannot comprehensively depict the\nentire scene. To address these challenges, we developed an image de-raining\nframework tailored for rainy nighttime driving scenes. 
It aims to remove rain\nartifacts, enrich scene representation, and restore useful information.\nSpecifically, we introduce cooperative learning between visible and infrared\nimages captured by different sensors. By cross-view fusion of these\nmulti-source data, the scene within the images gains richer texture details and\nenhanced contrast. We constructed an information cleaning module called\nCleanNet as the first stage of our framework. Moreover, we designed an\ninformation fusion module called FusionNet as the second stage to fuse the\nclean visible images with infrared images. Using this stage-by-stage learning\nstrategy, we obtain de-rained fusion images with higher quality and better\nvisual perception. Extensive experiments demonstrate the effectiveness of our\nproposed Cross-View Cooperative Learning (CVCL) in adverse driving scenarios in\nlow-light rainy environments. The proposed approach addresses the gap in the\nutilization of existing rain removal algorithms in specific low-light\nconditions.\n","authors":["Cidan Shi","Lihuang Fang","Han Wu","Xiaoyu Xian","Yukai Shi","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2402.18172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12588v3","updated":"2024-04-07T16:05:03Z","published":"2023-11-21T13:21:22Z","title":"HiPose: Hierarchical Binary Surface Encoding and Correspondence Pruning\n for RGB-D 6DoF Object Pose Estimation","summary":" In this work, we present a novel dense-correspondence method for 6DoF object\npose estimation from a single RGB-D image. While many existing data-driven\nmethods achieve impressive performance, they tend to be time-consuming due to\ntheir reliance on rendering-based refinement approaches. To circumvent this\nlimitation, we present HiPose, which establishes 3D-3D correspondences in a\ncoarse-to-fine manner with a hierarchical binary surface encoding. Unlike\nprevious dense-correspondence methods, we estimate the correspondence surface\nby employing point-to-surface matching and iteratively constricting the surface\nuntil it becomes a correspondence point while gradually removing outliers.\nExtensive experiments on public benchmarks LM-O, YCB-V, and T-Less demonstrate\nthat our method surpasses all refinement-free methods and is even on par with\nexpensive refinement-based approaches. Crucially, our approach is\ncomputationally efficient and enables real-time critical applications with high\naccuracy requirements.\n","authors":["Yongliang Lin","Yongzhi Su","Praveen Nathan","Sandeep Inuganti","Yan Di","Martin Sundermeyer","Fabian Manhardt","Didier Stricker","Jason Rambach","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12588v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05003v1","updated":"2024-04-07T15:58:25Z","published":"2024-04-07T15:58:25Z","title":"Camera-Based Remote Physiology Sensing for Hundreds of Subjects Across\n Skin Tones","summary":" Remote photoplethysmography (rPPG) emerges as a promising method for\nnon-invasive, convenient measurement of vital signs, utilizing the widespread\npresence of cameras. Despite advancements, existing datasets fall short in\nterms of size and diversity, limiting comprehensive evaluation under diverse\nconditions. This paper presents an in-depth analysis of the VitalVideo dataset,\nthe largest real-world rPPG dataset to date, encompassing 893 subjects and 6\nFitzpatrick skin tones. 
Our experimentation with six unsupervised methods and\nthree supervised models demonstrates that datasets comprising a few hundred\nsubjects(i.e., 300 for UBFC-rPPG, 500 for PURE, and 700 for MMPD-Simple) are\nsufficient for effective rPPG model training. Our findings highlight the\nimportance of diversity and consistency in skin tones for precise performance\nevaluation across different datasets.\n","authors":["Jiankai Tang","Xinyi Li","Jiacheng Liu","Xiyuxing Zhang","Zeyu Wang","Yuntao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05003v1.pdf","comment":"11 pages, 5 figures, CHI24 Workshop PhysioCHI"},{"id":"http://arxiv.org/abs/2404.05001v1","updated":"2024-04-07T15:53:21Z","published":"2024-04-07T15:53:21Z","title":"Dual-Scale Transformer for Large-Scale Single-Pixel Imaging","summary":" Single-pixel imaging (SPI) is a potential computational imaging technique\nwhich produces image by solving an illposed reconstruction problem from few\nmeasurements captured by a single-pixel detector. Deep learning has achieved\nimpressive success on SPI reconstruction. However, previous poor reconstruction\nperformance and impractical imaging model limit its real-world applications. In\nthis paper, we propose a deep unfolding network with hybrid-attention\nTransformer on Kronecker SPI model, dubbed HATNet, to improve the imaging\nquality of real SPI cameras. Specifically, we unfold the computation graph of\nthe iterative shrinkagethresholding algorithm (ISTA) into two alternative\nmodules: efficient tensor gradient descent and hybrid-attention multiscale\ndenoising. By virtue of Kronecker SPI, the gradient descent module can avoid\nhigh computational overheads rooted in previous gradient descent modules based\non vectorized SPI. The denoising module is an encoder-decoder architecture\npowered by dual-scale spatial attention for high- and low-frequency aggregation\nand channel attention for global information recalibration. Moreover, we build\na SPI prototype to verify the effectiveness of the proposed method. Extensive\nexperiments on synthetic and real data demonstrate that our method achieves the\nstate-of-the-art performance. The source code and pre-trained models are\navailable at https://github.com/Gang-Qu/HATNet-SPI.\n","authors":["Gang Qu","Ping Wang","Xin Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.05001v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04998v1","updated":"2024-04-07T15:48:33Z","published":"2024-04-07T15:48:33Z","title":"Weakly Supervised Deep Hyperspherical Quantization for Image Retrieval","summary":" Deep quantization methods have shown high efficiency on large-scale image\nretrieval. However, current models heavily rely on ground-truth information,\nhindering the application of quantization in label-hungry scenarios. A more\nrealistic demand is to learn from inexhaustible uploaded images that are\nassociated with informal tags provided by amateur users. Though such sketchy\ntags do not obviously reveal the labels, they actually contain useful semantic\ninformation for supervising deep quantization. To this end, we propose\nWeakly-Supervised Deep Hyperspherical Quantization (WSDHQ), which is the first\nwork to learn deep quantization from weakly tagged images. Specifically, 1) we\nuse word embeddings to represent the tags and enhance their semantic\ninformation based on a tag correlation graph. 
2) To better preserve semantic\ninformation in quantization codes and reduce quantization error, we jointly\nlearn semantics-preserving embeddings and supervised quantizer on hypersphere\nby employing a well-designed fusion layer and tailor-made loss functions.\nExtensive experiments show that WSDHQ can achieve state-of-art performance on\nweakly-supervised compact coding. Code is available at\nhttps://github.com/gimpong/AAAI21-WSDHQ.\n","authors":["Jinpeng Wang","Bin Chen","Qiang Zhang","Zaiqiao Meng","Shangsong Liang","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2404.04998v1.pdf","comment":"In proceedings of AAAI 2021. Code and data are available"},{"id":"http://arxiv.org/abs/2404.04996v1","updated":"2024-04-07T15:34:40Z","published":"2024-04-07T15:34:40Z","title":"Fantastic Animals and Where to Find Them: Segment Any Marine Animal with\n Dual SAM","summary":" As an important pillar of underwater intelligence, Marine Animal Segmentation\n(MAS) involves segmenting animals within marine environments. Previous methods\ndon't excel in extracting long-range contextual features and overlook the\nconnectivity between discrete pixels. Recently, Segment Anything Model (SAM)\noffers a universal framework for general segmentation tasks. Unfortunately,\ntrained with natural images, SAM does not obtain the prior knowledge from\nmarine images. In addition, the single-position prompt of SAM is very\ninsufficient for prior guidance. To address these issues, we propose a novel\nfeature learning framework, named Dual-SAM for high-performance MAS. To this\nend, we first introduce a dual structure with SAM's paradigm to enhance feature\nlearning of marine images. Then, we propose a Multi-level Coupled Prompt (MCP)\nstrategy to instruct comprehensive underwater prior information, and enhance\nthe multi-level features of SAM's encoder with adapters. Subsequently, we\ndesign a Dilated Fusion Attention Module (DFAM) to progressively integrate\nmulti-level features from SAM's encoder. Finally, instead of directly\npredicting the masks of marine animals, we propose a Criss-Cross Connectivity\nPrediction (C$^3$P) paradigm to capture the inter-connectivity between discrete\npixels. With dual decoders, it generates pseudo-labels and achieves mutual\nsupervision for complementary feature representations, resulting in\nconsiderable improvements over previous techniques. Extensive experiments\nverify that our proposed method achieves state-of-the-art performances on five\nwidely-used MAS datasets. The code is available at\nhttps://github.com/Drchip61/Dual_SAM.\n","authors":["Pingping Zhang","Tianyu Yan","Yang Liu","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2404.04996v1.pdf","comment":"Accepted by CVPR2024 as Poster(Highlight)"},{"id":"http://arxiv.org/abs/2404.04992v1","updated":"2024-04-07T15:27:35Z","published":"2024-04-07T15:27:35Z","title":"Efficient Surgical Tool Recognition via HMM-Stabilized Deep Learning","summary":" Recognizing various surgical tools, actions and phases from surgery videos is\nan important problem in computer vision with exciting clinical applications.\nExisting deep-learning-based methods for this problem either process each\nsurgical video as a series of independent images without considering their\ndependence, or rely on complicated deep learning models to count for dependence\nof video frames. 
In this study, we revealed from exploratory data analysis that\nsurgical videos enjoy relatively simple semantic structure, where the presence\nof surgical phases and tools can be well modeled by a compact hidden Markov\nmodel (HMM). Based on this observation, we propose an HMM-stabilized deep\nlearning method for tool presence detection. A wide range of experiments\nconfirm that the proposed approaches achieve better performance with lower\ntraining and running costs, and support more flexible ways to construct and\nutilize training data in scenarios where not all surgery videos of interest are\nextensively labelled. These results suggest that popular deep learning\napproaches with over-complicated model structures may suffer from inefficient\nutilization of data, and integrating ingredients of deep learning and\nstatistical learning wisely may lead to more powerful algorithms that enjoy\ncompetitive performance, transparent interpretation and convenient model\ntraining simultaneously.\n","authors":["Haifeng Wang","Hao Xu","Jun Wang","Jian Zhou","Ke Deng"],"pdf_url":"https://arxiv.org/pdf/2404.04992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04986v1","updated":"2024-04-07T15:06:48Z","published":"2024-04-07T15:06:48Z","title":"Dynamic Distinction Learning: Adaptive Pseudo Anomalies for Video\n Anomaly Detection","summary":" We introduce Dynamic Distinction Learning (DDL) for Video Anomaly Detection,\na novel video anomaly detection methodology that combines pseudo-anomalies,\ndynamic anomaly weighting, and a distinction loss function to improve detection\naccuracy. By training on pseudo-anomalies, our approach adapts to the\nvariability of normal and anomalous behaviors without fixed anomaly thresholds.\nOur model showcases superior performance on the Ped2, Avenue and ShanghaiTech\ndatasets, where individual models are tailored for each scene. These\nachievements highlight DDL's effectiveness in advancing anomaly detection,\noffering a scalable and adaptable solution for video surveillance challenges.\n","authors":["Demetris Lappas","Vasileios Argyriou","Dimitrios Makris"],"pdf_url":"https://arxiv.org/pdf/2404.04986v1.pdf","comment":"To be published in the CVPR2024 Workshop"},{"id":"http://arxiv.org/abs/2404.00521v3","updated":"2024-04-07T15:04:47Z","published":"2024-03-31T01:41:36Z","title":"CHAIN: Enhancing Generalization in Data-Efficient GANs via lipsCHitz\n continuity constrAIned Normalization","summary":" Generative Adversarial Networks (GANs) significantly advanced image\ngeneration but their performance heavily depends on abundant training data. In\nscenarios with limited data, GANs often struggle with discriminator overfitting\nand unstable training. Batch Normalization (BN), despite being known for\nenhancing generalization and training stability, has rarely been used in the\ndiscriminator of Data-Efficient GANs. Our work addresses this gap by\nidentifying a critical flaw in BN: the tendency for gradient explosion during\nthe centering and scaling steps. To tackle this issue, we present CHAIN\n(lipsCHitz continuity constrAIned Normalization), which replaces the\nconventional centering step with zero-mean regularization and integrates a\nLipschitz continuity constraint in the scaling step. CHAIN further enhances GAN\ntraining by adaptively interpolating the normalized and unnormalized features,\neffectively avoiding discriminator overfitting. 
Our theoretical analyses firmly\nestablishes CHAIN's effectiveness in reducing gradients in latent features and\nweights, improving stability and generalization in GAN training. Empirical\nevidence supports our theory. CHAIN achieves state-of-the-art results in\ndata-limited scenarios on CIFAR-10/100, ImageNet, five low-shot and seven\nhigh-resolution few-shot image datasets. Code:\nhttps://github.com/MaxwellYaoNi/CHAIN\n","authors":["Yao Ni","Piotr Koniusz"],"pdf_url":"https://arxiv.org/pdf/2404.00521v3.pdf","comment":"Accepted by CVPR2024. 26 pages full version. Code:\n https://github.com/MaxwellYaoNi/CHAIN"},{"id":"http://arxiv.org/abs/2404.04983v1","updated":"2024-04-07T15:03:46Z","published":"2024-04-07T15:03:46Z","title":"Primary liver cancer classification from routine tumour biopsy using\n weakly supervised deep learning","summary":" The diagnosis of primary liver cancers (PLCs) can be challenging, especially\non biopsies and for combined hepatocellular-cholangiocarcinoma (cHCC-CCA). We\nautomatically classified PLCs on routine-stained biopsies using a weakly\nsupervised learning method. Weak tumour/non-tumour annotations served as labels\nfor training a Resnet18 neural network, and the network's last convolutional\nlayer was used to extract new tumour tile features. Without knowledge of the\nprecise labels of the malignancies, we then applied an unsupervised clustering\nalgorithm. Our model identified specific features of hepatocellular carcinoma\n(HCC) and intrahepatic cholangiocarcinoma (iCCA). Despite no specific features\nof cHCC-CCA being recognized, the identification of HCC and iCCA tiles within a\nslide could facilitate the diagnosis of primary liver cancers, particularly\ncHCC-CCA.\n Method and results: 166 PLC biopsies were divided into training, internal and\nexternal validation sets: 90, 29 and 47 samples. Two liver pathologists\nreviewed each whole-slide hematein eosin saffron (HES)-stained image (WSI).\nAfter annotating the tumour/non-tumour areas, 256x256 pixel tiles were\nextracted from the WSIs and used to train a ResNet18. The network was used to\nextract new tile features. An unsupervised clustering algorithm was then\napplied to the new tile features. In a two-cluster model, Clusters 0 and 1\ncontained mainly HCC and iCCA histological features. The diagnostic agreement\nbetween the pathological diagnosis and the model predictions in the internal\nand external validation sets was 100% (11/11) and 96% (25/26) for HCC and 78%\n(7/9) and 87% (13/15) for iCCA, respectively. For cHCC-CCA, we observed a\nhighly variable proportion of tiles from each cluster (Cluster 0: 5-97%;\nCluster 1: 2-94%).\n","authors":["Aurélie Beaufrère","Nora Ouzir","Paul Emile Zafar","Astrid Laurent-Bellue","Miguel Albuquerque","Gwladys Lubuela","Jules Grégory","Catherine Guettier","Kévin Mondet","Jean-Christophe Pesquet","Valérie Paradis"],"pdf_url":"https://arxiv.org/pdf/2404.04983v1.pdf","comment":"https://www.sciencedirect.com/science/article/pii/S2589555924000090"},{"id":"http://arxiv.org/abs/2311.15879v2","updated":"2024-04-07T14:43:38Z","published":"2023-11-27T14:51:37Z","title":"EVCap: Retrieval-Augmented Image Captioning with External Visual-Name\n Memory for Open-World Comprehension","summary":" Large language models (LLMs)-based image captioning has the capability of\ndescribing objects not explicitly observed in training data; yet novel objects\noccur frequently, necessitating the requirement of sustaining up-to-date object\nknowledge for open-world comprehension. 
Instead of relying on large amounts of\ndata and/or scaling up network parameters, we introduce a highly effective\nretrieval-augmented image captioning method that prompts LLMs with object names\nretrieved from External Visual--name memory (EVCap). We build ever-changing\nobject knowledge memory using objects' visuals and names, enabling us to (i)\nupdate the memory at a minimal cost and (ii) effortlessly augment LLMs with\nretrieved object names by utilizing a lightweight and fast-to-train model. Our\nmodel, which was trained only on the COCO dataset, can adapt to out-of-domain\nwithout requiring additional fine-tuning or re-training. Our experiments\nconducted on benchmarks and synthetic commonsense-violating data show that\nEVCap, with only 3.97M trainable parameters, exhibits superior performance\ncompared to other methods based on frozen pre-trained LLMs. Its performance is\nalso competitive to specialist SOTAs that require extensive training.\n","authors":["Jiaxuan Li","Duc Minh Vo","Akihiro Sugimoto","Hideki Nakayama"],"pdf_url":"https://arxiv.org/pdf/2311.15879v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04971v1","updated":"2024-04-07T14:21:37Z","published":"2024-04-07T14:21:37Z","title":"FPL+: Filtered Pseudo Label-based Unsupervised Cross-Modality Adaptation\n for 3D Medical Image Segmentation","summary":" Adapting a medical image segmentation model to a new domain is important for\nimproving its cross-domain transferability, and due to the expensive annotation\nprocess, Unsupervised Domain Adaptation (UDA) is appealing where only unlabeled\nimages are needed for the adaptation. Existing UDA methods are mainly based on\nimage or feature alignment with adversarial training for regularization, and\nthey are limited by insufficient supervision in the target domain. In this\npaper, we propose an enhanced Filtered Pseudo Label (FPL+)-based UDA method for\n3D medical image segmentation. It first uses cross-domain data augmentation to\ntranslate labeled images in the source domain to a dual-domain training set\nconsisting of a pseudo source-domain set and a pseudo target-domain set. To\nleverage the dual-domain augmented images to train a pseudo label generator,\ndomain-specific batch normalization layers are used to deal with the domain\nshift while learning the domain-invariant structure features, generating\nhigh-quality pseudo labels for target-domain images. We then combine labeled\nsource-domain images and target-domain images with pseudo labels to train a\nfinal segmentor, where image-level weighting based on uncertainty estimation\nand pixel-level weighting based on dual-domain consensus are proposed to\nmitigate the adverse effect of noisy pseudo labels. 
Experiments on three public\nmulti-modal datasets for Vestibular Schwannoma, brain tumor and whole heart\nsegmentation show that our method surpassed ten state-of-the-art UDA methods,\nand it even achieved better results than fully supervised learning in the\ntarget domain in some cases.\n","authors":["Jianghao Wu","Dong Guo","Guotai Wang","Qiang Yue","Huijun Yu","Kang Li","Shaoting Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04971v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.06462v2","updated":"2024-04-07T14:05:53Z","published":"2023-12-11T15:51:38Z","title":"Cooperation Does Matter: Exploring Multi-Order Bilateral Relations for\n Audio-Visual Segmentation","summary":" Recently, an audio-visual segmentation (AVS) task has been introduced, aiming\nto group pixels with sounding objects within a given video. This task\nnecessitates a first-ever audio-driven pixel-level understanding of the scene,\nposing significant challenges. In this paper, we propose an innovative\naudio-visual transformer framework, termed COMBO, an acronym for COoperation of\nMulti-order Bilateral relatiOns. For the first time, our framework explores\nthree types of bilateral entanglements within AVS: pixel entanglement, modality\nentanglement, and temporal entanglement. Regarding pixel entanglement, we\nemploy a Siam-Encoder Module (SEM) that leverages prior knowledge to generate\nmore precise visual features from the foundational model. For modality\nentanglement, we design a Bilateral-Fusion Module (BFM), enabling COMBO to\nalign corresponding visual and auditory signals bi-directionally. As for\ntemporal entanglement, we introduce an innovative adaptive inter-frame\nconsistency loss according to the inherent rules of temporal. Comprehensive\nexperiments and ablation studies on AVSBench-object (84.7 mIoU on S4, 59.2 mIou\non MS3) and AVSBench-semantic (42.1 mIoU on AVSS) datasets demonstrate that\nCOMBO surpasses previous state-of-the-art methods. Code and more results will\nbe publicly available at https://yannqi.github.io/AVS-COMBO/.\n","authors":["Qi Yang","Xing Nie","Tong Li","Pengfei Gao","Ying Guo","Cheng Zhen","Pengfei Yan","Shiming Xiang"],"pdf_url":"https://arxiv.org/pdf/2312.06462v2.pdf","comment":"CVPR 2024 Highlight. 13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.04960v1","updated":"2024-04-07T13:40:29Z","published":"2024-04-07T13:40:29Z","title":"PairAug: What Can Augmented Image-Text Pairs Do for Radiology?","summary":" Current vision-language pre-training (VLP) methodologies predominantly depend\non paired image-text datasets, a resource that is challenging to acquire in\nradiology due to privacy considerations and labelling complexities. Data\naugmentation provides a practical solution to overcome the issue of data\nscarcity, however, most augmentation methods exhibit a limited focus,\nprioritising either image or text augmentation exclusively. Acknowledging this\nlimitation, our objective is to devise a framework capable of concurrently\naugmenting medical image and text data. We design a Pairwise Augmentation\n(PairAug) approach that contains an Inter-patient Augmentation (InterAug)\nbranch and an Intra-patient Augmentation (IntraAug) branch. Specifically, the\nInterAug branch of our approach generates radiology images using synthesised\nyet plausible reports derived from a Large Language Model (LLM). The generated\npairs can be considered a collection of new patient cases since they are\nartificially created and may not exist in the original dataset. 
In contrast,\nthe IntraAug branch uses newly generated reports to manipulate images. This\nprocess allows us to create new paired data for each individual with diverse\nmedical conditions. Our extensive experiments on various downstream tasks\ncovering medical image classification zero-shot and fine-tuning analysis\ndemonstrate that our PairAug, concurrently expanding both image and text data,\nsubstantially outperforms image-/text-only expansion baselines and advanced\nmedical VLP baselines. Our code is released at\n\\url{https://github.com/YtongXie/PairAug}.\n","authors":["Yutong Xie","Qi Chen","Sinuo Wang","Minh-Son To","Iris Lee","Ee Win Khoo","Kerolos Hendy","Daniel Koh","Yong Xia","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2404.04960v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2311.16514v2","updated":"2024-04-07T13:33:56Z","published":"2023-11-27T13:14:06Z","title":"Video Anomaly Detection via Spatio-Temporal Pseudo-Anomaly Generation :\n A Unified Approach","summary":" Video Anomaly Detection (VAD) is an open-set recognition task, which is\nusually formulated as a one-class classification (OCC) problem, where training\ndata is comprised of videos with normal instances while test data contains both\nnormal and anomalous instances. Recent works have investigated the creation of\npseudo-anomalies (PAs) using only the normal data and making strong assumptions\nabout real-world anomalies with regards to abnormality of objects and speed of\nmotion to inject prior information about anomalies in an autoencoder (AE) based\nreconstruction model during training. This work proposes a novel method for\ngenerating generic spatio-temporal PAs by inpainting a masked out region of an\nimage using a pre-trained Latent Diffusion Model and further perturbing the\noptical flow using mixup to emulate spatio-temporal distortions in the data. In\naddition, we present a simple unified framework to detect real-world anomalies\nunder the OCC setting by learning three types of anomaly indicators, namely\nreconstruction quality, temporal irregularity and semantic inconsistency.\nExtensive experiments on four VAD benchmark datasets namely Ped2, Avenue,\nShanghaiTech and UBnormal demonstrate that our method performs on par with\nother existing state-of-the-art PAs generation and reconstruction based methods\nunder the OCC setting. Our analysis also examines the transferability and\ngeneralisation of PAs across these datasets, offering valuable insights by\nidentifying real-world anomalies through PAs.\n","authors":["Ayush K. Rai","Tarun Krishna","Feiyan Hu","Alexandru Drimbarean","Kevin McGuinness","Alan F. Smeaton","Noel E. O'Connor"],"pdf_url":"https://arxiv.org/pdf/2311.16514v2.pdf","comment":"Accepted in CVPRW 2024 - VAND Workshop"},{"id":"http://arxiv.org/abs/2404.04956v1","updated":"2024-04-07T13:30:10Z","published":"2024-04-07T13:30:10Z","title":"Gaussian Shading: Provable Performance-Lossless Image Watermarking for\n Diffusion Models","summary":" Ethical concerns surrounding copyright protection and inappropriate content\ngeneration pose challenges for the practical implementation of diffusion\nmodels. One effective solution involves watermarking the generated images.\nHowever, existing methods often compromise the model performance or require\nadditional training, which is undesirable for operators and users. 
To address\nthis issue, we propose Gaussian Shading, a diffusion model watermarking\ntechnique that is both performance-lossless and training-free, while serving\nthe dual purpose of copyright protection and tracing of offending content. Our\nwatermark embedding is free of model parameter modifications and thus is\nplug-and-play. We map the watermark to latent representations following a\nstandard Gaussian distribution, which is indistinguishable from latent\nrepresentations obtained from the non-watermarked diffusion model. Therefore we\ncan achieve watermark embedding with lossless performance, for which we also\nprovide theoretical proof. Furthermore, since the watermark is intricately\nlinked with image semantics, it exhibits resilience to lossy processing and\nerasure attempts. The watermark can be extracted by Denoising Diffusion\nImplicit Models (DDIM) inversion and inverse sampling. We evaluate Gaussian\nShading on multiple versions of Stable Diffusion, and the results demonstrate\nthat Gaussian Shading not only is performance-lossless but also outperforms\nexisting methods in terms of robustness.\n","authors":["Zijin Yang","Kai Zeng","Kejiang Chen","Han Fang","Weiming Zhang","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2404.04956v1.pdf","comment":"17 pages, 11 figures, accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04953v1","updated":"2024-04-07T13:17:47Z","published":"2024-04-07T13:17:47Z","title":"High-Discriminative Attribute Feature Learning for Generalized Zero-Shot\n Learning","summary":" Zero-shot learning(ZSL) aims to recognize new classes without prior exposure\nto their samples, relying on semantic knowledge from observed classes. However,\ncurrent attention-based models may overlook the transferability of visual\nfeatures and the distinctiveness of attribute localization when learning\nregional features in images. Additionally, they often overlook shared\nattributes among different objects. Highly discriminative attribute features\nare crucial for identifying and distinguishing unseen classes. To address these\nissues, we propose an innovative approach called High-Discriminative Attribute\nFeature Learning for Generalized Zero-Shot Learning (HDAFL). HDAFL optimizes\nvisual features by learning attribute features to obtain discriminative visual\nembeddings. Specifically, HDAFL utilizes multiple convolutional kernels to\nautomatically learn discriminative regions highly correlated with attributes in\nimages, eliminating irrelevant interference in image features. Furthermore, we\nintroduce a Transformer-based attribute discrimination encoder to enhance the\ndiscriminative capability among attributes. Simultaneously, the method employs\ncontrastive loss to alleviate dataset biases and enhance the transferability of\nvisual features, facilitating better semantic transfer between seen and unseen\nclasses. Experimental results demonstrate the effectiveness of HDAFL across\nthree widely used datasets.\n","authors":["Yu Lei","Guoshuai Sheng","Fangfang Li","Quanxue Gao","Cheng Deng","Qin Li"],"pdf_url":"https://arxiv.org/pdf/2404.04953v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09434v2","updated":"2024-04-07T13:05:24Z","published":"2024-03-14T14:25:10Z","title":"Reconstruction and Simulation of Elastic Objects with Spring-Mass 3D\n Gaussians","summary":" Reconstructing and simulating elastic objects from visual observations is\ncrucial for applications in computer vision and robotics. 
Existing methods,\nsuch as 3D Gaussians, model 3D appearance and geometry, but lack the ability to\nestimate physical properties for objects and simulate them. The core challenge\nlies in integrating an expressive yet efficient physical dynamics model. We\npropose Spring-Gaus, a 3D physical object representation for reconstructing and\nsimulating elastic objects from videos of the object from multiple viewpoints.\nIn particular, we develop and integrate a 3D Spring-Mass model into 3D Gaussian\nkernels, enabling the reconstruction of the visual appearance, shape, and\nphysical dynamics of the object. Our approach enables future prediction and\nsimulation under various initial states and environmental properties. We\nevaluate Spring-Gaus on both synthetic and real-world datasets, demonstrating\naccurate reconstruction and simulation of elastic objects. Project page:\nhttps://zlicheng.com/spring_gaus.\n","authors":["Licheng Zhong","Hong-Xing Yu","Jiajun Wu","Yunzhu Li"],"pdf_url":"https://arxiv.org/pdf/2403.09434v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05964v2","updated":"2024-04-07T13:03:58Z","published":"2024-02-05T12:16:28Z","title":"A Survey on Transformer Compression","summary":" Transformer plays a vital role in the realms of natural language processing\n(NLP) and computer vision (CV), specially for constructing large language\nmodels (LLM) and large vision models (LVM). Model compression methods reduce\nthe memory and computational cost of Transformer, which is a necessary step to\nimplement large language/vision models on practical devices. Given the unique\narchitecture of Transformer, featuring alternative attention and feedforward\nneural network (FFN) modules, specific compression techniques are usually\nrequired. The efficiency of these compression methods is also paramount, as\nretraining large models on the entire training dataset is usually impractical.\nThis survey provides a comprehensive review of recent compression methods, with\na specific focus on their application to Transformer-based models. The\ncompression methods are primarily categorized into pruning, quantization,\nknowledge distillation, and efficient architecture design (Mamba, RetNet, RWKV,\netc.). In each category, we discuss compression methods for both language and\nvision tasks, highlighting common underlying principles. Finally, we delve into\nthe relation between various compression methods, and discuss further\ndirections in this domain.\n","authors":["Yehui Tang","Yunhe Wang","Jianyuan Guo","Zhijun Tu","Kai Han","Hailin Hu","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2402.05964v2.pdf","comment":"Model Compression, Transformer, Large Language Model, Large Vision\n Model, LLM"},{"id":"http://arxiv.org/abs/2404.04946v1","updated":"2024-04-07T12:57:41Z","published":"2024-04-07T12:57:41Z","title":"AnimateZoo: Zero-shot Video Generation of Cross-Species Animation via\n Subject Alignment","summary":" Recent video editing advancements rely on accurate pose sequences to animate\nsubjects. However, these efforts are not suitable for cross-species animation\ndue to pose misalignment between species (for example, the poses of a cat\ndiffers greatly from that of a pig due to differences in body structure). In\nthis paper, we present AnimateZoo, a zero-shot diffusion-based video generator\nto address this challenging cross-species animation issue, aiming to accurately\nproduce animal animations while preserving the background. 
The key technique\nused in our AnimateZoo is subject alignment, which includes two steps. First,\nwe improve appearance feature extraction by integrating a Laplacian detail\nbooster and a prompt-tuning identity extractor. These components are\nspecifically designed to capture essential appearance information, including\nidentity and fine details. Second, we align shape features and address\nconflicts from differing subjects by introducing a scale-information remover.\nThis ensures accurate cross-species animation. Moreover, we introduce two\nhigh-quality animal video datasets featuring a wide variety of species. Trained\non these extensive datasets, our model is capable of generating videos\ncharacterized by accurate movements, consistent appearance, and high-fidelity\nframes, without the need for the pre-inference fine-tuning that prior arts\nrequired. Extensive experiments showcase the outstanding performance of our\nmethod in cross-species action following tasks, demonstrating exceptional shape\nadaptation capability. The project page is available at\nhttps://justinxu0.github.io/AnimateZoo/.\n","authors":["Yuanfeng Xu","Yuhao Chen","Zhongzhan Huang","Zijian He","Guangrun Wang","Philip Torr","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2404.04946v1.pdf","comment":"Technical report,15 pages"},{"id":"http://arxiv.org/abs/2404.03043v2","updated":"2024-04-07T12:37:04Z","published":"2024-04-03T20:05:00Z","title":"Linear Anchored Gaussian Mixture Model for Location and Width\n Computation of Objects in Thick Line Shape","summary":" An accurate detection of the centerlines of linear objects is a challenging\ntopic in many sensitive real-world applications such X-ray imaging, remote\nsensing and lane marking detection in road traffic. Model-based approaches\nusing Hough and Radon transforms are often used but, are not recommended for\nthick line detection, whereas approaches based on image derivatives need\nfurther step-by-step processing, making their efficiency dependent on each step\noutcomes. In this paper, we aim to detect linear structures found in images by\nconsidering the 3D representation of the image gray levels as a finite mixture\nmodel of statistical distribution. The latter, which we named linear anchored\nGaussian distribution could be parametrized by a scale value ${\\sigma}$\ndescribing the linear structure thickness and a line equation, parametrized, in\nturn, by a radius ${\\rho}$ and an orientation angle ${\\theta}$, describing the\nlinear structure centerline location. Expectation-Maximization (EM) algorithm\nis used for the mixture model parameter estimation, where a new paradigm, using\nthe background subtraction for the likelihood function computation, is\nproposed. For the EM algorithm, two ${\\theta}$ parameter initialization schemes\nare used: the first one is based on a random choice of the first component of\n${\\theta}$ vector, whereas the second is based on the image Hessian with a\nsimultaneous computation of the mixture model components number. 
Experiments on\nreal world images and synthetic images corrupted by blur and additive noise\nshow the good performance of the proposed methods, where the algorithm using\nbackground subtraction and Hessian-based ${\\theta}$ initialization provides an\noutstanding accuracy of the linear structure detection despite irregular image\nbackground and presence of blur and noise.\n","authors":["Nafaa Nacereddine","Aicha Baya Goumeidane","Djemel Ziou"],"pdf_url":"https://arxiv.org/pdf/2404.03043v2.pdf","comment":"13 pages, 13 figures"},{"id":"http://arxiv.org/abs/2305.13799v2","updated":"2024-04-07T12:33:08Z","published":"2023-05-23T08:13:09Z","title":"UPNet: Uncertainty-based Picking Deep Learning Network for Robust First\n Break Picking","summary":" In seismic exploration, first break (FB) picking is a crucial aspect in the\ndetermination of subsurface velocity models, significantly influencing the\nplacement of wells. Many deep neural networks (DNNs)-based automatic picking\nmethods have been proposed to accelerate this processing. Significantly, the\nsegmentation-based DNN methods provide a segmentation map and then estimate FB\nfrom the map using a picking threshold. However, the uncertainty of the results\npicked by DNNs still needs to be analyzed. Thus, the automatic picking methods\napplied in field datasets can not ensure robustness, especially in the case of\na low signal-to-noise ratio (SNR). In this paper, we introduce uncertainty\nquantification into the FB picking task and propose a novel uncertainty-based\npicking deep learning network called UPNet. UPNet not only estimates the\nuncertainty of network output but also can filter the pickings with low\nconfidence. Many experiments evaluate that UPNet exhibits higher accuracy and\nrobustness than the deterministic DNN-based model, achieving State-of-the-Art\n(SOTA) performance in field surveys. In addition, we verify that the\nmeasurement uncertainty is meaningful, which can provide a reference for human\ndecision-making.\n","authors":["Hongtao Wang","Jiangshe Zhang","Xiaoli Wei","Li Long","Chunxia Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13799v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04936v1","updated":"2024-04-07T12:17:40Z","published":"2024-04-07T12:17:40Z","title":"Bootstrapping Chest CT Image Understanding by Distilling Knowledge from\n X-ray Expert Models","summary":" Radiologists highly desire fully automated versatile AI for medical imaging\ninterpretation. However, the lack of extensively annotated large-scale\nmulti-disease datasets has hindered the achievement of this goal. In this\npaper, we explore the feasibility of leveraging language as a naturally\nhigh-quality supervision for chest CT imaging. In light of the limited\navailability of image-report pairs, we bootstrap the understanding of 3D chest\nCT images by distilling chest-related diagnostic knowledge from an extensively\npre-trained 2D X-ray expert model. Specifically, we propose a language-guided\nretrieval method to match each 3D CT image with its semantically closest 2D\nX-ray image, and perform pair-wise and semantic relation knowledge\ndistillation. Subsequently, we use contrastive learning to align images and\nreports within the same patient while distinguishing them from the other\npatients. However, the challenge arises when patients have similar semantic\ndiagnoses, such as healthy patients, potentially confusing if treated as\nnegatives. We introduce a robust contrastive learning that identifies and\ncorrects these false negatives. 
We train our model with over 12,000 pairs of\nchest CT images and radiology reports. Extensive experiments across multiple\nscenarios, including zero-shot learning, report generation, and fine-tuning\nprocesses, demonstrate the model's feasibility in interpreting chest CT images.\n","authors":["Weiwei Cao","Jianpeng Zhang","Yingda Xia","Tony C. W. Mok","Zi Li","Xianghua Ye","Le Lu","Jian Zheng","Yuxing Tang","Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04936v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04935v1","updated":"2024-04-07T12:15:53Z","published":"2024-04-07T12:15:53Z","title":"Anomaly Detection in Electrocardiograms: Advancing Clinical Diagnosis\n Through Self-Supervised Learning","summary":" The electrocardiogram (ECG) is an essential tool for diagnosing heart\ndisease, with computer-aided systems improving diagnostic accuracy and reducing\nhealthcare costs. Despite advancements, existing systems often miss rare\ncardiac anomalies that could be precursors to serious, life-threatening issues\nor alterations in the cardiac macro/microstructure. We address this gap by\nfocusing on self-supervised anomaly detection (AD), training exclusively on\nnormal ECGs to recognize deviations indicating anomalies. We introduce a novel\nself-supervised learning framework for ECG AD, utilizing a vast dataset of\nnormal ECGs to autonomously detect and localize cardiac anomalies. It proposes\na novel masking and restoration technique alongside a multi-scale\ncross-attention module, enhancing the model's ability to integrate global and\nlocal signal features. The framework emphasizes accurate localization of\nanomalies within ECG signals, ensuring the method's clinical relevance and\nreliability. To reduce the impact of individual variability, the approach\nfurther incorporates crucial patient-specific information from ECG reports,\nsuch as age and gender, thus enabling accurate identification of a broad\nspectrum of cardiac anomalies, including rare ones. Utilizing an extensive\ndataset of 478,803 ECG graphic reports from real-world clinical practice, our\nmethod has demonstrated exceptional effectiveness in AD across all tested\nconditions, regardless of their frequency of occurrence, significantly\noutperforming existing models. It achieved superior performance metrics,\nincluding an AUROC of 91.2%, an F1 score of 83.7%, a sensitivity rate of 84.2%,\na specificity of 83.0%, and a precision of 75.6% with a fixed recall rate of\n90%. It has also demonstrated robust localization capabilities, with an AUROC\nof 76.5% and a Dice coefficient of 65.3% for anomaly localization.\n","authors":["Aofan Jiang","Chaoqin Huang","Qing Cao","Yuchen Xu","Zi Zeng","Kang Chen","Ya Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04935v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04933v1","updated":"2024-04-07T12:14:42Z","published":"2024-04-07T12:14:42Z","title":"UniMD: Towards Unifying Moment Retrieval and Temporal Action Detection","summary":" Temporal Action Detection (TAD) focuses on detecting pre-defined actions,\nwhile Moment Retrieval (MR) aims to identify the events described by open-ended\nnatural language within untrimmed videos. Despite that they focus on different\nevents, we observe they have a significant connection. For instance, most\ndescriptions in MR involve multiple actions from TAD. In this paper, we aim to\ninvestigate the potential synergy between TAD and MR. 
Firstly, we propose a\nunified architecture, termed Unified Moment Detection (UniMD), for both TAD and\nMR. It transforms the inputs of the two tasks, namely actions for TAD or events\nfor MR, into a common embedding space, and utilizes two novel query-dependent\ndecoders to generate a uniform output of classification score and temporal\nsegments. Secondly, we explore the efficacy of two task fusion learning\napproaches, pre-training and co-training, in order to enhance the mutual\nbenefits between TAD and MR. Extensive experiments demonstrate that the\nproposed task fusion learning scheme enables the two tasks to help each other\nand outperform the separately trained counterparts. Impressively, UniMD\nachieves state-of-the-art results on three paired datasets Ego4D, Charades-STA,\nand ActivityNet. Our code will be released at\nhttps://github.com/yingsen1/UniMD.\n","authors":["Yingsen Zeng","Yujie Zhong","Chengjian Feng","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2404.04933v1.pdf","comment":"Tech report"},{"id":"http://arxiv.org/abs/2402.13185v4","updated":"2024-04-07T12:11:28Z","published":"2024-02-20T17:52:12Z","title":"UniEdit: A Unified Tuning-Free Framework for Video Motion and Appearance\n Editing","summary":" Recent advances in text-guided video editing have showcased promising results\nin appearance editing (e.g., stylization). However, video motion editing in the\ntemporal dimension (e.g., from eating to waving), which distinguishes video\nediting from image editing, is underexplored. In this work, we present UniEdit,\na tuning-free framework that supports both video motion and appearance editing\nby harnessing the power of a pre-trained text-to-video generator within an\ninversion-then-generation framework. To realize motion editing while preserving\nsource video content, based on the insights that temporal and spatial\nself-attention layers encode inter-frame and intra-frame dependency\nrespectively, we introduce auxiliary motion-reference and reconstruction\nbranches to produce text-guided motion and source features respectively. The\nobtained features are then injected into the main editing path via temporal and\nspatial self-attention layers. Extensive experiments demonstrate that UniEdit\ncovers video motion editing and various appearance editing scenarios, and\nsurpasses the state-of-the-art methods. Our code will be publicly available.\n","authors":["Jianhong Bai","Tianyu He","Yuchi Wang","Junliang Guo","Haoji Hu","Zuozhu Liu","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2402.13185v4.pdf","comment":"Project page: https://jianhongbai.github.io/UniEdit/"},{"id":"http://arxiv.org/abs/2305.13600v2","updated":"2024-04-07T11:50:34Z","published":"2023-05-23T02:02:36Z","title":"SiCL: Silhouette-Driven Contrastive Learning for Unsupervised Person\n Re-Identification with Clothes Change","summary":" In this paper, we address a highly challenging yet critical task:\nunsupervised long-term person re-identification with clothes change. Existing\nunsupervised person re-id methods are mainly designed for short-term scenarios\nand usually rely on RGB cues so that fail to perceive feature patterns that are\nindependent of the clothes. To crack this bottleneck, we propose a\nsilhouette-driven contrastive learning (SiCL) method, which is designed to\nlearn cross-clothes invariance by integrating both the RGB cues and the\nsilhouette information within a contrastive learning framework. 
To our\nknowledge, this is the first tailor-made framework for unsupervised long-term\nclothes change \\reid{}, with superior performance on six benchmark datasets. We\nconduct extensive experiments to evaluate our proposed SiCL compared to the\nstate-of-the-art unsupervised person reid methods across all the representative\ndatasets. Experimental results demonstrate that our proposed SiCL significantly\noutperforms other unsupervised re-id methods.\n","authors":["Mingkun Li","Peng Xu","Chun-Guang Li","Jun Guo"],"pdf_url":"https://arxiv.org/pdf/2305.13600v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04924v1","updated":"2024-04-07T11:48:07Z","published":"2024-04-07T11:48:07Z","title":"GvT: A Graph-based Vision Transformer with Talking-Heads Utilizing\n Sparsity, Trained from Scratch on Small Datasets","summary":" Vision Transformers (ViTs) have achieved impressive results in large-scale\nimage classification. However, when training from scratch on small datasets,\nthere is still a significant performance gap between ViTs and Convolutional\nNeural Networks (CNNs), which is attributed to the lack of inductive bias. To\naddress this issue, we propose a Graph-based Vision Transformer (GvT) that\nutilizes graph convolutional projection and graph-pooling. In each block,\nqueries and keys are calculated through graph convolutional projection based on\nthe spatial adjacency matrix, while dot-product attention is used in another\ngraph convolution to generate values. When using more attention heads, the\nqueries and keys become lower-dimensional, making their dot product an\nuninformative matching function. To overcome this low-rank bottleneck in\nattention heads, we employ talking-heads technology based on bilinear pooled\nfeatures and sparse selection of attention tensors. This allows interaction\namong filtered attention scores and enables each attention mechanism to depend\non all queries and keys. Additionally, we apply graph-pooling between two\nintermediate blocks to reduce the number of tokens and aggregate semantic\ninformation more effectively. Our experimental results show that GvT produces\ncomparable or superior outcomes to deep convolutional networks and surpasses\nvision transformers without pre-training on large datasets. The code for our\nproposed model is publicly available on the website.\n","authors":["Dongjing Shan","guiqiang chen"],"pdf_url":"https://arxiv.org/pdf/2404.04924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12488v2","updated":"2024-04-07T11:38:48Z","published":"2024-03-19T06:54:33Z","title":"DetToolChain: A New Prompting Paradigm to Unleash Detection Ability of\n MLLM","summary":" We present DetToolChain, a novel prompting paradigm, to unleash the zero-shot\nobject detection ability of multimodal large language models (MLLMs), such as\nGPT-4V and Gemini. Our approach consists of a detection prompting toolkit\ninspired by high-precision detection priors and a new Chain-of-Thought to\nimplement these prompts. Specifically, the prompts in the toolkit are designed\nto guide the MLLM to focus on regional information (e.g., zooming in), read\ncoordinates according to measure standards (e.g., overlaying rulers and\ncompasses), and infer from the contextual information (e.g., overlaying scene\ngraphs). Building upon these tools, the new detection chain-of-thought can\nautomatically decompose the task into simple subtasks, diagnose the\npredictions, and plan for progressive box refinements. 
The effectiveness of our\nframework is demonstrated across a spectrum of detection tasks, especially hard\ncases. Compared to existing state-of-the-art methods, GPT-4V with our\nDetToolChain improves state-of-the-art object detectors by +21.5% AP50 on MS\nCOCO Novel class set for open-vocabulary detection, +24.23% Acc on RefCOCO val\nset for zero-shot referring expression comprehension, +14.5% AP on D-cube\ndescribe object detection FULL setting.\n","authors":["Yixuan Wu","Yizhou Wang","Shixiang Tang","Wenhao Wu","Tong He","Wanli Ouyang","Jian Wu","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2403.12488v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04922v1","updated":"2024-04-07T11:25:04Z","published":"2024-04-07T11:25:04Z","title":"Efficient Learnable Collaborative Attention for Single Image\n Super-Resolution","summary":" Non-Local Attention (NLA) is a powerful technique for capturing long-range\nfeature correlations in deep single image super-resolution (SR). However, NLA\nsuffers from high computational complexity and memory consumption, as it\nrequires aggregating all non-local feature information for each query response\nand recalculating the similarity weight distribution for different abstraction\nlevels of features. To address these challenges, we propose a novel Learnable\nCollaborative Attention (LCoA) that introduces inductive bias into non-local\nmodeling. Our LCoA consists of two components: Learnable Sparse Pattern (LSP)\nand Collaborative Attention (CoA). LSP uses the k-means clustering algorithm to\ndynamically adjust the sparse attention pattern of deep features, which reduces\nthe number of non-local modeling rounds compared with existing sparse\nsolutions. CoA leverages the sparse attention pattern and weights learned by\nLSP, and co-optimizes the similarity matrix across different abstraction\nlevels, which avoids redundant similarity matrix calculations. The experimental\nresults show that our LCoA can reduce the non-local modeling time by about 83%\nin the inference stage. In addition, we integrate our LCoA into a deep\nLearnable Collaborative Attention Network (LCoAN), which achieves competitive\nperformance in terms of inference time, memory consumption, and reconstruction\nquality compared with other state-of-the-art SR methods.\n","authors":["Yigang Zhao Chaowei Zheng","Jiannan Su"," GuangyongChen"," MinGan"],"pdf_url":"https://arxiv.org/pdf/2404.04922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16499v2","updated":"2024-04-07T11:16:15Z","published":"2024-03-25T07:34:06Z","title":"Self-Supervised Learning for Medical Image Data with Anatomy-Oriented\n Imaging Planes","summary":" Self-supervised learning has emerged as a powerful tool for pretraining deep\nnetworks on unlabeled data, prior to transfer learning of target tasks with\nlimited annotation. The relevance between the pretraining pretext and target\ntasks is crucial to the success of transfer learning. Various pretext tasks\nhave been proposed to utilize properties of medical image data (e.g., three\ndimensionality), which are more relevant to medical image analysis than generic\nones for natural images. However, previous work rarely paid attention to data\nwith anatomy-oriented imaging planes, e.g., standard cardiac magnetic resonance\nimaging views. As these imaging planes are defined according to the anatomy of\nthe imaged organ, pretext tasks effectively exploiting this information can\npretrain the networks to gain knowledge on the organ of interest. 
In this work,\nwe propose two complementary pretext tasks for this group of medical image data\nbased on the spatial relationship of the imaging planes. The first is to learn\nthe relative orientation between the imaging planes and implemented as\nregressing their intersecting lines. The second exploits parallel imaging\nplanes to regress their relative slice locations within a stack. Both pretext\ntasks are conceptually straightforward and easy to implement, and can be\ncombined in multitask learning for better representation learning. Thorough\nexperiments on two anatomical structures (heart and knee) and representative\ntarget tasks (semantic segmentation and classification) demonstrate that the\nproposed pretext tasks are effective in pretraining deep networks for\nremarkably boosted performance on the target tasks, and superior to other\nrecent approaches.\n","authors":["Tianwei Zhang","Dong Wei","Mengmeng Zhu","Shi Gu","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.16499v2.pdf","comment":"Medical Image Analysis"},{"id":"http://arxiv.org/abs/2404.04916v1","updated":"2024-04-07T10:57:54Z","published":"2024-04-07T10:57:54Z","title":"Correcting Diffusion-Based Perceptual Image Compression with Privileged\n End-to-End Decoder","summary":" The images produced by diffusion models can attain excellent perceptual\nquality. However, it is challenging for diffusion models to guarantee\ndistortion, hence the integration of diffusion models and image compression\nmodels still needs more comprehensive explorations. This paper presents a\ndiffusion-based image compression method that employs a privileged end-to-end\ndecoder model as correction, which achieves better perceptual quality while\nguaranteeing the distortion to an extent. We build a diffusion model and design\na novel paradigm that combines the diffusion model and an end-to-end decoder,\nand the latter is responsible for transmitting the privileged information\nextracted at the encoder side. Specifically, we theoretically analyze the\nreconstruction process of the diffusion models at the encoder side with the\noriginal images being visible. Based on the analysis, we introduce an\nend-to-end convolutional decoder to provide a better approximation of the score\nfunction $\\nabla_{\\mathbf{x}_t}\\log p(\\mathbf{x}_t)$ at the encoder side and\neffectively transmit the combination. Experiments demonstrate the superiority\nof our method in both distortion and perception compared with previous\nperceptual compression methods.\n","authors":["Yiyang Ma","Wenhan Yang","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2404.04916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04913v1","updated":"2024-04-07T10:49:59Z","published":"2024-04-07T10:49:59Z","title":"CodecNeRF: Toward Fast Encoding and Decoding, Compact, and High-quality\n Novel-view Synthesis","summary":" Neural Radiance Fields (NeRF) have achieved huge success in effectively\ncapturing and representing 3D objects and scenes. However, several factors have\nimpeded its further proliferation as next-generation 3D media. To establish a\nubiquitous presence in everyday media formats, such as images and videos, it is\nimperative to devise a solution that effectively fulfills three key objectives:\nfast encoding and decoding time, compact model sizes, and high-quality\nrenderings. Despite significant advancements, a comprehensive algorithm that\nadequately addresses all objectives has yet to be fully realized. 
In this work,\nwe present CodecNeRF, a neural codec for NeRF representations, consisting of a\nnovel encoder and decoder architecture that can generate a NeRF representation\nin a single forward pass. Furthermore, inspired by the recent\nparameter-efficient finetuning approaches, we develop a novel finetuning method\nto efficiently adapt the generated NeRF representations to a new test instance,\nleading to high-quality image renderings and compact code sizes. The proposed\nCodecNeRF, a newly suggested encoding-decoding-finetuning pipeline for NeRF,\nachieved unprecedented compression performance of more than 150x and 20x\nreduction in encoding time while maintaining (or improving) the image quality\non widely used 3D object datasets, such as ShapeNet and Objaverse.\n","authors":["Gyeongjin Kang","Younggeun Lee","Eunbyung Park"],"pdf_url":"https://arxiv.org/pdf/2404.04913v1.pdf","comment":"34 pages, 22 figures, Project page:\n https://gynjn.github.io/Codec-NeRF/"},{"id":"http://arxiv.org/abs/2404.04910v1","updated":"2024-04-07T10:39:04Z","published":"2024-04-07T10:39:04Z","title":"MonoTAKD: Teaching Assistant Knowledge Distillation for Monocular 3D\n Object Detection","summary":" Monocular 3D object detection (Mono3D) is an indispensable research topic in\nautonomous driving, thanks to the cost-effective monocular camera sensors and\nits wide range of applications. Since the image perspective has depth\nambiguity, the challenges of Mono3D lie in understanding 3D scene geometry and\nreconstructing 3D object information from a single image. Previous methods\nattempted to transfer 3D information directly from the LiDAR-based teacher to\nthe camera-based student. However, a considerable gap in feature representation\nmakes direct cross-modal distillation inefficient, resulting in a significant\nperformance deterioration between the LiDAR-based teacher and the camera-based\nstudent. To address this issue, we propose the Teaching Assistant Knowledge\nDistillation (MonoTAKD) to break down the learning objective by integrating\nintra-modal distillation with cross-modal residual distillation. In particular,\nwe employ a strong camera-based teaching assistant model to distill powerful\nvisual knowledge effectively through intra-modal distillation. Subsequently, we\nintroduce the cross-modal residual distillation to transfer the 3D spatial\ncues. By acquiring both visual knowledge and 3D spatial cues, the predictions\nof our approach are rigorously evaluated on the KITTI 3D object detection\nbenchmark and achieve state-of-the-art performance in Mono3D.\n","authors":["Hou-I Liu","Christine Wu","Jen-Hao Cheng","Wenhao Chai","Shian-Yun Wang","Gaowen Liu","Jenq-Neng Hwang","Hong-Han Shuai","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.04910v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2404.04908v1","updated":"2024-04-07T10:28:01Z","published":"2024-04-07T10:28:01Z","title":"Dual-Camera Smooth Zoom on Mobile Phones","summary":" When zooming between dual cameras on a mobile, noticeable jumps in geometric\ncontent and image color occur in the preview, inevitably affecting the user's\nzoom experience. In this work, we introduce a new task, ie, dual-camera smooth\nzoom (DCSZ) to achieve a smooth zoom preview. 
The frame interpolation (FI)\ntechnique is a potential solution but struggles with ground-truth collection.\nTo address the issue, we suggest a data factory solution where continuous\nvirtual cameras are assembled to generate DCSZ data by rendering reconstructed\n3D models of the scene. In particular, we propose a novel dual-camera smooth\nzoom Gaussian Splatting (ZoomGS), where a camera-specific encoding is\nintroduced to construct a specific 3D model for each virtual camera. With the\nproposed data factory, we construct a synthetic dataset for DCSZ, and we\nutilize it to fine-tune FI models. In addition, we collect real-world dual-zoom\nimages without ground-truth for evaluation. Extensive experiments are conducted\nwith multiple FI methods. The results show that the fine-tuned FI models\nachieve a significant performance improvement over the original ones on DCSZ\ntask. The datasets, codes, and pre-trained models will be publicly available.\n","authors":["Renlong Wu","Zhilu Zhang","Yu Yang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.04908v1.pdf","comment":"24"},{"id":"http://arxiv.org/abs/2403.16834v2","updated":"2024-04-07T09:56:54Z","published":"2024-03-25T14:57:29Z","title":"From Two-Stream to One-Stream: Efficient RGB-T Tracking via Mutual\n Prompt Learning and Knowledge Distillation","summary":" Due to the complementary nature of visible light and thermal infrared\nmodalities, object tracking based on the fusion of visible light images and\nthermal images (referred to as RGB-T tracking) has received increasing\nattention from researchers in recent years. How to achieve more comprehensive\nfusion of information from the two modalities at a lower cost has been an issue\nthat researchers have been exploring. Inspired by visual prompt learning, we\ndesigned a novel two-stream RGB-T tracking architecture based on cross-modal\nmutual prompt learning, and used this model as a teacher to guide a one-stream\nstudent model for rapid learning through knowledge distillation techniques.\nExtensive experiments have shown that, compared to similar RGB-T trackers, our\ndesigned teacher model achieved the highest precision rate, while the student\nmodel, with comparable precision rate to the teacher model, realized an\ninference speed more than three times faster than the teacher model.(Codes will\nbe available if accepted.)\n","authors":["Yang Luo","Xiqing Guo","Hao Li"],"pdf_url":"https://arxiv.org/pdf/2403.16834v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11700v4","updated":"2024-04-07T09:17:34Z","published":"2023-11-20T12:08:23Z","title":"GS-SLAM: Dense Visual SLAM with 3D Gaussian Splatting","summary":" In this paper, we introduce \\textbf{GS-SLAM} that first utilizes 3D Gaussian\nrepresentation in the Simultaneous Localization and Mapping (SLAM) system. It\nfacilitates a better balance between efficiency and accuracy. Compared to\nrecent SLAM methods employing neural implicit representations, our method\nutilizes a real-time differentiable splatting rendering pipeline that offers\nsignificant speedup to map optimization and RGB-D rendering. Specifically, we\npropose an adaptive expansion strategy that adds new or deletes noisy 3D\nGaussians in order to efficiently reconstruct new observed scene geometry and\nimprove the mapping of previously observed areas. This strategy is essential to\nextend 3D Gaussian representation to reconstruct the whole scene rather than\nsynthesize a static object in existing methods. 
Moreover, in the pose tracking\nprocess, an effective coarse-to-fine technique is designed to select reliable\n3D Gaussian representations to optimize camera pose, resulting in runtime\nreduction and robust estimation. Our method achieves competitive performance\ncompared with existing state-of-the-art real-time methods on the Replica and\nTUM-RGBD datasets. Project page: https://gs-slam.github.io/.\n","authors":["Chi Yan","Delin Qu","Dan Xu","Bin Zhao","Zhigang Wang","Dong Wang","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2311.11700v4.pdf","comment":"Accepted to CVPR 2024 (highlight). Project Page:\n https://gs-slam.github.io/"},{"id":"http://arxiv.org/abs/2404.04891v1","updated":"2024-04-07T09:17:00Z","published":"2024-04-07T09:17:00Z","title":"DL-EWF: Deep Learning Empowering Women's Fashion with\n Grounded-Segment-Anything Segmentation for Body Shape Classification","summary":" The global fashion industry plays a pivotal role in the global economy, and\naddressing fundamental issues within the industry is crucial for developing\ninnovative solutions. One of the most pressing challenges in the fashion\nindustry is the mismatch between individuals' body shapes and the garments\nthey purchase. This issue is particularly prevalent among individuals with\nnon-ideal body shapes, exacerbating the challenges faced. Considering\ninter-individual variability in body shapes is essential for designing and\nproducing garments that are widely accepted by consumers. Traditional methods\nfor determining human body shape are limited due to their low accuracy, high\ncosts, and time-consuming nature. New approaches, utilizing digital imaging and\ndeep neural networks (DNN), have been introduced to identify human body shape.\nIn this study, the Style4BodyShape dataset is used for classifying body shapes\ninto five categories: Rectangle, Triangle, Inverted Triangle, Hourglass, and\nApple. In this paper, the body shape segmentation of a person is extracted from\nthe image, disregarding the surroundings and background. Then, various\npre-trained models, such as ResNet18, ResNet34, ResNet50, VGG16, VGG19, and\nInception v3, are used to classify the segmentation results. Among these\npre-trained models, the Inception V3 model demonstrates superior performance\nin terms of F1-score and accuracy compared to the other models.\n","authors":["Fatemeh Asghari","Mohammad Reza Soheili","Faezeh Gholamrezaie"],"pdf_url":"https://arxiv.org/pdf/2404.04891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04890v1","updated":"2024-04-07T09:15:45Z","published":"2024-04-07T09:15:45Z","title":"A Unified Diffusion Framework for Scene-aware Human Motion Estimation\n from Sparse Signals","summary":" Estimating full-body human motion via sparse tracking signals from\nhead-mounted displays and hand controllers in 3D scenes is crucial to\napplications in AR/VR. One of the biggest challenges to this task is the\none-to-many mapping from sparse observations to dense full-body motions, which\nentails inherent ambiguities. To help resolve this ambiguity, we\nintroduce a new framework to combine rich contextual information provided by\nscenes to benefit full-body motion tracking from sparse observations. 
To\nestimate plausible human motions given sparse tracking signals and 3D scenes,\nwe develop $\\text{S}^2$Fusion, a unified framework fusing \\underline{S}cene and\nsparse \\underline{S}ignals with a conditional dif\\underline{Fusion} model.\n$\\text{S}^2$Fusion first extracts the spatial-temporal relations residing in\nthe sparse signals via a periodic autoencoder, and then produces time-alignment\nfeature embedding as additional inputs. Subsequently, by drawing initial noisy\nmotion from a pre-trained prior, $\\text{S}^2$Fusion utilizes conditional\ndiffusion to fuse scene geometry and sparse tracking signals to generate\nfull-body scene-aware motions. The sampling procedure of $\\text{S}^2$Fusion is\nfurther guided by a specially designed scene-penetration loss and\nphase-matching loss, which effectively regularizes the motion of the lower body\neven in the absence of any tracking signals, making the generated motion much\nmore plausible and coherent. Extensive experimental results have demonstrated\nthat our $\\text{S}^2$Fusion outperforms the state-of-the-art in terms of\nestimation quality and smoothness.\n","authors":["Jiangnan Tang","Jingya Wang","Kaiyang Ji","Lan Xu","Jingyi Yu","Ye Shi"],"pdf_url":"https://arxiv.org/pdf/2404.04890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04887v1","updated":"2024-04-07T09:08:14Z","published":"2024-04-07T09:08:14Z","title":"A Clinical-oriented Multi-level Contrastive Learning Method for Disease\n Diagnosis in Low-quality Medical Images","summary":" Representation learning offers a conduit to elucidate distinctive features\nwithin the latent space and interpret the deep models. However, the randomness\nof lesion distribution and the complexity of low-quality factors in medical\nimages pose great challenges for models to extract key lesion features. Disease\ndiagnosis methods guided by contrastive learning (CL) have shown significant\nadvantages in lesion feature representation. Nevertheless, the effectiveness of\nCL is highly dependent on the quality of the positive and negative sample\npairs. In this work, we propose a clinical-oriented multi-level CL framework\nthat aims to enhance the model's capacity to extract lesion features and\ndiscriminate between lesion and low-quality factors, thereby enabling more\naccurate disease diagnosis from low-quality medical images. Specifically, we\nfirst construct multi-level positive and negative pairs to enhance the model's\ncomprehensive recognition capability of lesion features by integrating\ninformation from different levels and qualities of medical images. Moreover, to\nimprove the quality of the learned lesion embeddings, we introduce a dynamic\nhard sample mining method based on self-paced learning. The proposed CL\nframework is validated on two public medical image datasets, EyeQ and Chest\nX-ray, demonstrating superior performance compared to other state-of-the-art\ndisease diagnostic methods.\n","authors":["Qingshan Hou","Shuai Cheng","Peng Cao","Jinzhu Yang","Xiaoli Liu","Osmar R. Zaiane","Yih Chung Tham"],"pdf_url":"https://arxiv.org/pdf/2404.04887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04884v1","updated":"2024-04-07T09:05:04Z","published":"2024-04-07T09:05:04Z","title":"LRNet: Change detection of high-resolution remote sensing imagery via\n strategy of localization-then-refinement","summary":" Change detection, as a research hotspot in the field of remote sensing, has\nwitnessed continuous development and progress. 
However, the discrimination of\nboundary details remains a significant bottleneck due to the complexity of\nsurrounding elements between change areas and backgrounds. Discriminating the\nboundaries of large change areas results in misalignment, while connecting\nboundaries occurs for small change targets. To address the above issues, a\nnovel network based on the localization-then-refinement strategy is proposed in\nthis paper, namely LRNet. LRNet consists of two stages: localization and\nrefinement. In the localization stage, a three-branch encoder simultaneously\nextracts original image features and their differential features for\ninteractive localization of the position of each change area. To minimize\ninformation loss during feature extraction, learnable optimal pooling (LOP) is\nproposed to replace the widely used max-pooling. Additionally, this process is\ntrainable and contributes to the overall optimization of the network. To\neffectively interact features from different branches and accurately locate\nchange areas of various sizes, change alignment attention (C2A) and\nhierarchical change alignment module (HCA) are proposed. In the refinement\nstage, the localization results from the localization stage are corrected by\nconstraining the change areas and change edges through the edge-area alignment\nmodule (E2A). Subsequently, the decoder, combined with the difference features\nstrengthened by C2A in the localization phase, refines change areas of\ndifferent sizes, ultimately achieving accurate boundary discrimination of\nchange areas. The proposed LRNet outperforms 13 other state-of-the-art methods\nin terms of comprehensive evaluation metrics and provides the most precise\nboundary discrimination results on the LEVIR-CD and WHU-CD datasets.\n","authors":["Huan Zhong","Chen Wu","Ziqi Xiao"],"pdf_url":"https://arxiv.org/pdf/2404.04884v1.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.04883v1","updated":"2024-04-07T09:01:50Z","published":"2024-04-07T09:01:50Z","title":"Mixture of Low-rank Experts for Transferable AI-Generated Image\n Detection","summary":" Generative models have shown a giant leap in synthesizing photo-realistic\nimages with minimal expertise, sparking concerns about the authenticity of\nonline information. This study aims to develop a universal AI-generated image\ndetector capable of identifying images from diverse sources. Existing methods\nstruggle to generalize across unseen generative models when provided with\nlimited sample sources. Inspired by the zero-shot transferability of\npre-trained vision-language models, we seek to harness the nontrivial\nvisual-world knowledge and descriptive proficiency of CLIP-ViT to generalize\nover unknown domains. This paper presents a novel parameter-efficient\nfine-tuning approach, mixture of low-rank experts, to fully exploit CLIP-ViT's\npotential while preserving knowledge and expanding capacity for transferable\ndetection. We adapt only the MLP layers of deeper ViT blocks via an integration\nof shared and separate LoRAs within an MoE-based structure. Extensive\nexperiments on public benchmarks show that our method achieves superiority over\nstate-of-the-art approaches in cross-generator generalization and robustness to\nperturbations. Remarkably, our best-performing ViT-L/14 variant requires\ntraining only 0.08% of its parameters to surpass the leading baseline by +3.64%\nmAP and +12.72% avg.Acc across unseen diffusion and autoregressive models. 
This\neven outperforms the baseline with just 0.28% of the training data. Our code\nand pre-trained models will be available at\nhttps://github.com/zhliuworks/CLIPMoLE.\n","authors":["Zihan Liu","Hanyi Wang","Yaoyu Kang","Shilin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04880v1","updated":"2024-04-07T08:51:31Z","published":"2024-04-07T08:51:31Z","title":"GauU-Scene V2: Expanse Lidar Image Dataset Shows Unreliable Geometric\n Reconstruction Using Gaussian Splatting and NeRF","summary":" We introduce a novel large-scale scene reconstruction benchmark that utilizes\nnewly developed 3D representation approaches: Gaussian Splatting and Neural\nRadiance Fields, on our expansive GauU-Scene V2 dataset. GauU-Scene V2\nencompasses over 6.5 square kilometers and features a comprehensive RGB dataset\ncoupled with LiDAR ground truth. This dataset offers a unique blend of urban\nand academic environments for advanced spatial analysis, covering more than 6.5\nkm2. We also provide detailed supplementary information on data collection\nprotocols. Furthermore, we present an easy-to-follow pipeline to align the\nCOLMAP sparse point cloud with the detailed LiDAR dataset. Our evaluation of\nU-Scene, which includes a detailed analysis across various novel viewpoints\nusing image-based metrics such as SSIM, LPIPS, and PSNR, shows contradictory\nresults when applying geometric-based metrics, such as Chamfer distance. This\nleads to doubts about the reliability of current image-based measurement\nmatrices and geometric extraction methods on Gaussian Splatting. We also make\nthe dataset available on the following anonymous project page\n","authors":["Butian Xiong","Nanjun Zheng","Zhen Li"],"pdf_url":"https://arxiv.org/pdf/2404.04880v1.pdf","comment":"8 pages(No reference) 6 figures 4 tabs"},{"id":"http://arxiv.org/abs/2404.04878v1","updated":"2024-04-07T08:48:01Z","published":"2024-04-07T08:48:01Z","title":"CycleINR: Cycle Implicit Neural Representation for Arbitrary-Scale\n Volumetric Super-Resolution of Medical Data","summary":" In the realm of medical 3D data, such as CT and MRI images, prevalent\nanisotropic resolution is characterized by high intra-slice but diminished\ninter-slice resolution. The lowered resolution between adjacent slices poses\nchallenges, hindering optimal viewing experiences and impeding the development\nof robust downstream analysis algorithms. Various volumetric super-resolution\nalgorithms aim to surmount these challenges, enhancing inter-slice resolution\nand overall 3D medical imaging quality. However, existing approaches confront\ninherent challenges: 1) often tailored to specific upsampling factors, lacking\nflexibility for diverse clinical scenarios; 2) newly generated slices\nfrequently suffer from over-smoothing, degrading fine details, and leading to\ninter-slice inconsistency. In response, this study presents CycleINR, a novel\nenhanced Implicit Neural Representation model for 3D medical data volumetric\nsuper-resolution. Leveraging the continuity of the learned implicit function,\nthe CycleINR model can achieve results with arbitrary up-sampling rates,\neliminating the need for separate training. Additionally, we enhance the grid\nsampling in CycleINR with a local attention mechanism and mitigate\nover-smoothing by integrating cycle-consistent loss. We introduce a new metric,\nSlice-wise Noise Level Inconsistency (SNLI), to quantitatively assess\ninter-slice noise level inconsistency. 
The effectiveness of our approach is\ndemonstrated through image quality evaluations on an in-house dataset and a\ndownstream task analysis on the Medical Segmentation Decathlon liver tumor\ndataset.\n","authors":["Wei Fang","Yuxing Tang","Heng Guo","Mingze Yuan","Tony C. W. Mok","Ke Yan","Jiawen Yao","Xin Chen","Zaiyi Liu","Le Lu","Ling Zhang","Minfeng Xu"],"pdf_url":"https://arxiv.org/pdf/2404.04878v1.pdf","comment":"CVPR accepted paper"},{"id":"http://arxiv.org/abs/2404.04876v1","updated":"2024-04-07T08:46:06Z","published":"2024-04-07T08:46:06Z","title":"HiLo: Detailed and Robust 3D Clothed Human Reconstruction with High-and\n Low-Frequency Information of Parametric Models","summary":" Reconstructing 3D clothed human involves creating a detailed geometry of\nindividuals in clothing, with applications ranging from virtual try-on, movies,\nto games. To enable practical and widespread applications, recent advances\npropose to generate a clothed human from an RGB image. However, they struggle\nto reconstruct detailed and robust avatars simultaneously. We empirically find\nthat the high-frequency (HF) and low-frequency (LF) information from a\nparametric model has the potential to enhance geometry details and improve\nrobustness to noise, respectively. Based on this, we propose HiLo, namely\nclothed human reconstruction with high- and low-frequency information, which\ncontains two components. 1) To recover detailed geometry using HF information,\nwe propose a progressive HF Signed Distance Function to enhance the detailed 3D\ngeometry of a clothed human. We analyze that our progressive learning manner\nalleviates large gradients that hinder model convergence. 2) To achieve robust\nreconstruction against inaccurate estimation of the parametric model by using\nLF information, we propose a spatial interaction implicit function. This\nfunction effectively exploits the complementary spatial information from a\nlow-resolution voxel grid of the parametric model. Experimental results\ndemonstrate that HiLo outperforms the state-of-the-art methods by 10.43% and\n9.54% in terms of Chamfer distance on the Thuman2.0 and CAPE datasets,\nrespectively. Additionally, HiLo demonstrates robustness to noise from the\nparametric model, challenging poses, and various clothing styles.\n","authors":["Yifan Yang","Dong Liu","Shuhai Zhang","Zeshuai Deng","Zixiong Huang","Mingkui Tan"],"pdf_url":"https://arxiv.org/pdf/2404.04876v1.pdf","comment":"CVPR 2024 Accepted Paper"},{"id":"http://arxiv.org/abs/2404.04875v1","updated":"2024-04-07T08:42:38Z","published":"2024-04-07T08:42:38Z","title":"NeRF2Points: Large-Scale Point Cloud Generation From Street Views'\n Radiance Field Optimization","summary":" Neural Radiance Fields (NeRF) have emerged as a paradigm-shifting methodology\nfor the photorealistic rendering of objects and environments, enabling the\nsynthesis of novel viewpoints with remarkable fidelity. This is accomplished\nthrough the strategic utilization of object-centric camera poses characterized\nby significant inter-frame overlap. This paper explores a compelling,\nalternative utility of NeRF: the derivation of point clouds from aggregated\nurban landscape imagery. The transmutation of street-view data into point\nclouds is fraught with complexities, attributable to a nexus of interdependent\nvariables. First, high-quality point cloud generation hinges on precise camera\nposes, yet many datasets suffer from inaccuracies in pose metadata. 
Also, the\nstandard approach of NeRF is ill-suited for the distinct characteristics of\nstreet-view data from autonomous vehicles in vast, open settings. Autonomous\nvehicle cameras often record with limited overlap, leading to blurring,\nartifacts, and compromised pavement representation in NeRF-based point clouds.\nIn this paper, we present NeRF2Points, a tailored NeRF variant for urban point\ncloud synthesis, notable for its high-quality output from RGB inputs alone. Our\npaper is supported by a bespoke, high-resolution 20-kilometer urban street\ndataset, designed for point cloud generation and evaluation. NeRF2Points\nadeptly navigates the inherent challenges of NeRF-based point cloud synthesis\nthrough the implementation of the following strategic innovations: (1)\nIntegration of Weighted Iterative Geometric Optimization (WIGO) and Structure\nfrom Motion (SfM) for enhanced camera pose accuracy, elevating street-view data\nprecision. (2) Layered Perception and Integrated Modeling (LPiM) is designed\nfor distinct radiance field modeling in urban environments, resulting in\ncoherent point cloud representations.\n","authors":["Peng Tu","Xun Zhou","Mingming Wang","Xiaojun Yang","Bo Peng","Ping Chen","Xiu Su","Yawen Huang","Yefeng Zheng","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2404.04875v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2404.04871v1","updated":"2024-04-07T08:32:16Z","published":"2024-04-07T08:32:16Z","title":"Data Stream Sampling with Fuzzy Task Boundaries and Noisy Labels","summary":" In the realm of continual learning, the presence of noisy labels within data\nstreams represents a notable obstacle to model reliability and fairness. We\nfocus on the data stream scenario outlined in pertinent literature,\ncharacterized by fuzzy task boundaries and noisy labels. To address this\nchallenge, we introduce a novel and intuitive sampling method called Noisy Test\nDebiasing (NTD) to mitigate noisy labels in evolving data streams and establish\na fair and robust continual learning algorithm. NTD is straightforward to\nimplement, making it feasible across various scenarios. Our experiments\nbenchmark four datasets, including two synthetic noise datasets (CIFAR10 and\nCIFAR100) and real-world noise datasets (mini-WebVision and Food-101N). The\nresults validate the efficacy of NTD for online continual learning in scenarios\nwith noisy labels in data streams. Compared to the previous leading approach,\nNTD achieves a training speedup enhancement over two times while maintaining or\nsurpassing accuracy levels. Moreover, NTD utilizes less than one-fifth of the\nGPU memory resources compared to previous leading methods.\n","authors":["Yu-Hsi Chen"],"pdf_url":"https://arxiv.org/pdf/2404.04871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04865v1","updated":"2024-04-07T08:17:48Z","published":"2024-04-07T08:17:48Z","title":"On the Learnability of Out-of-distribution Detection","summary":" Supervised learning aims to train a classifier under the assumption that\ntraining and test data are from the same distribution. To ease the above\nassumption, researchers have studied a more realistic setting:\nout-of-distribution (OOD) detection, where test data may come from classes that\nare unknown during training (i.e., OOD data). Due to the unavailability and\ndiversity of OOD data, good generalization ability is crucial for effective OOD\ndetection algorithms, and corresponding learning theory is still an open\nproblem. 
To study the generalization of OOD detection, this paper investigates\nthe probably approximately correct (PAC) learning theory of OOD detection that\nfits the commonly used evaluation metrics in the literature. First, we find a\nnecessary condition for the learnability of OOD detection. Then, using this\ncondition, we prove several impossibility theorems for the learnability of OOD\ndetection under some scenarios. Although the impossibility theorems are\nfrustrating, we find that some conditions of these impossibility theorems may\nnot hold in some practical scenarios. Based on this observation, we next give\nseveral necessary and sufficient conditions to characterize the learnability of\nOOD detection in some practical scenarios. Lastly, we offer theoretical support\nfor representative OOD detection works based on our OOD theory.\n","authors":["Zhen Fang","Yixuan Li","Feng Liu","Bo Han","Jie Lu"],"pdf_url":"https://arxiv.org/pdf/2404.04865v1.pdf","comment":"Accepted by JMLR in 7th of April, 2024. This is a journal extension\n of the previous NeurIPS 2022 Outstanding Paper \"Is Out-of-distribution\n Detection Learnable?\" [arXiv:2210.14707]"},{"id":"http://arxiv.org/abs/2308.06791v5","updated":"2024-04-07T08:13:38Z","published":"2023-08-13T15:30:02Z","title":"PV-SSD: A Multi-Modal Point Cloud Feature Fusion Method for Projection\n Features and Variable Receptive Field Voxel Features","summary":" LiDAR-based 3D object detection and classification is crucial for autonomous\ndriving. However, real-time inference from extremely sparse 3D data is a\nformidable challenge. To address this problem, a typical class of approaches\ntransforms the point cloud cast into a regular data representation (voxels or\nprojection maps). Then, it performs feature extraction with convolutional\nneural networks. However, such methods often result in a certain degree of\ninformation loss due to down-sampling or over-compression of feature\ninformation. This paper proposes a multi-modal point cloud feature fusion\nmethod for projection features and variable receptive field voxel features\n(PV-SSD) based on projection and variable voxelization to solve the information\nloss problem. We design a two-branch feature extraction structure with a 2D\nconvolutional neural network to extract the point cloud's projection features\nin bird's-eye view to focus on the correlation between local features. A voxel\nfeature extraction branch is used to extract local fine-grained features.\nMeanwhile, we propose a voxel feature extraction method with variable sensory\nfields to reduce the information loss of voxel branches due to downsampling. It\navoids missing critical point information by selecting more useful feature\npoints based on feature point weights for the detection task. In addition, we\npropose a multi-modal feature fusion module for point clouds. To validate the\neffectiveness of our method, we tested it on the KITTI dataset and ONCE\ndataset.\n","authors":["Yongxin Shao","Aihong Tan","Zhetao Sun","Enhui Zheng","Tianhong Yan","Peng Liao"],"pdf_url":"https://arxiv.org/pdf/2308.06791v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04860v1","updated":"2024-04-07T08:07:14Z","published":"2024-04-07T08:07:14Z","title":"ByteEdit: Boost, Comply and Accelerate Generative Image Editing","summary":" Recent advancements in diffusion-based generative image editing have sparked\na profound revolution, reshaping the landscape of image outpainting and\ninpainting tasks. 
Despite these strides, the field grapples with inherent\nchallenges, including: i) inferior quality; ii) poor consistency; iii)\ninsufficient instruction adherence; iv) suboptimal generation efficiency. To\naddress these obstacles, we present ByteEdit, an innovative feedback learning\nframework meticulously designed to Boost, Comply, and Accelerate Generative\nImage Editing tasks. ByteEdit seamlessly integrates image reward models\ndedicated to enhancing aesthetics and image-text alignment, while also\nintroducing a dense, pixel-level reward model tailored to foster coherence in\nthe output. Furthermore, we propose a pioneering adversarial and progressive\nfeedback learning strategy to expedite the model's inference speed. Through\nextensive large-scale user evaluations, we demonstrate that ByteEdit surpasses\nleading generative image editing products, including Adobe, Canva, and MeiTu,\nin both generation quality and consistency. ByteEdit-Outpainting exhibits a\nremarkable enhancement of 388% and 135% in quality and consistency,\nrespectively, when compared to the baseline model. Experiments also verified\nthat our acceleration model maintains excellent performance in terms\nof quality and consistency.\n","authors":["Yuxi Ren","Jie Wu","Yanzuo Lu","Huafeng Kuang","Xin Xia","Xionghui Wang","Qianqian Wang","Yixing Zhu","Pan Xie","Shiyin Wang","Xuefeng Xiao","Yitong Wang","Min Zheng","Lean Fu"],"pdf_url":"https://arxiv.org/pdf/2404.04860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04856v1","updated":"2024-04-07T08:03:42Z","published":"2024-04-07T08:03:42Z","title":"Msmsfnet: a multi-stream and multi-scale fusion net for edge detection","summary":" Edge detection is a long-standing problem in computer vision. Recent deep\nlearning based algorithms achieve state-of-the-art performance on publicly\navailable datasets. Despite the efficiency of these algorithms, their\nperformance, however, relies heavily on the pretrained weights of the backbone\nnetwork on the ImageNet dataset. This heavily limits the design space of deep\nlearning based edge detectors. Whenever we want to devise a new model, we have\nto train this new model on the ImageNet dataset first, and then fine-tune the\nmodel using the edge detection datasets. The comparison would be unfair\notherwise. However, it is usually not feasible for many researchers to train a\nmodel on the ImageNet dataset due to the limited computation resources. In this\nwork, we study the performance that can be achieved by state-of-the-art deep\nlearning based edge detectors on publicly available datasets when they are\ntrained from scratch, and devise a new network architecture, the multi-stream\nand multi-scale fusion net (msmsfnet), for edge detection. We show in our\nexperiments that by training all models from scratch to ensure the fairness of\ncomparison, our model outperforms state-of-the-art deep learning based edge\ndetectors on three publicly available datasets.\n","authors":["Chenguang Liu","Chisheng Wang","Feifei Dong","Xin Su","Chuanhua Zhu","Dejin Zhang","Qingquan Li"],"pdf_url":"https://arxiv.org/pdf/2404.04856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00292v2","updated":"2024-04-07T07:55:51Z","published":"2024-03-30T08:51:23Z","title":"LAKE-RED: Camouflaged Images Generation by Latent Background Knowledge\n Retrieval-Augmented Diffusion","summary":" Camouflaged vision perception is an important vision task with numerous\npractical applications. 
Due to the expensive collection and labeling costs,\nthis community struggles with a major bottleneck that the species category of\nits datasets is limited to a small number of object species. However, the\nexisting camouflaged generation methods require specifying the background\nmanually, thus failing to extend the camouflaged sample diversity in a low-cost\nmanner. In this paper, we propose a Latent Background Knowledge\nRetrieval-Augmented Diffusion (LAKE-RED) for camouflaged image generation. To\nour knowledge, our contributions mainly include: (1) For the first time, we\npropose a camouflaged generation paradigm that does not need to receive any\nbackground inputs. (2) Our LAKE-RED is the first knowledge retrieval-augmented\nmethod with interpretability for camouflaged generation, in which we propose an\nidea that knowledge retrieval and reasoning enhancement are separated\nexplicitly, to alleviate the task-specific challenges. Moreover, our method is\nnot restricted to specific foreground targets or backgrounds, offering a\npotential for extending camouflaged vision perception to more diverse domains.\n(3) Experimental results demonstrate that our method outperforms the existing\napproaches, generating more realistic camouflage images.\n","authors":["Pancheng Zhao","Peng Xu","Pengda Qin","Deng-Ping Fan","Zhicheng Zhang","Guoli Jia","Bowen Zhou","Jufeng Yang"],"pdf_url":"https://arxiv.org/pdf/2404.00292v2.pdf","comment":"Accepted by CVPR 2024, Fig.3 revised"},{"id":"http://arxiv.org/abs/2306.08498v2","updated":"2024-04-07T07:50:37Z","published":"2023-06-14T13:27:28Z","title":"Extending CLIP's Image-Text Alignment to Referring Image Segmentation","summary":" Referring Image Segmentation (RIS) is a cross-modal task that aims to segment\nan instance described by a natural language expression. Recent methods leverage\nlarge-scale pretrained unimodal models as backbones along with fusion\ntechniques for joint reasoning across modalities. However, the inherent\ncross-modal nature of RIS raises questions about the effectiveness of unimodal\nbackbones. We propose RISCLIP, a novel framework that effectively leverages the\ncross-modal nature of CLIP for RIS. Observing CLIP's inherent alignment between\nimage and text features, we capitalize on this starting point and introduce\nsimple but strong modules that enhance unimodal feature extraction and leverage\nrich alignment knowledge in CLIP's image-text shared-embedding space. RISCLIP\nexhibits outstanding results on all three major RIS benchmarks and also\noutperforms previous CLIP-based methods, demonstrating the efficacy of our\nstrategy in extending CLIP's image-text alignment to RIS.\n","authors":["Seoyeon Kim","Minguk Kang","Dongwon Kim","Jaesik Park","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2306.08498v2.pdf","comment":"NAACL 2024"},{"id":"http://arxiv.org/abs/2404.04848v1","updated":"2024-04-07T07:42:04Z","published":"2024-04-07T07:42:04Z","title":"Task-Aware Encoder Control for Deep Video Compression","summary":" Prior research on deep video compression (DVC) for machine tasks typically\nnecessitates training a unique codec for each specific task, mandating a\ndedicated decoder per task. In contrast, traditional video codecs employ a\nflexible encoder controller, enabling the adaptation of a single codec to\ndifferent tasks through mechanisms like mode prediction. Drawing inspiration\nfrom this, we introduce an innovative encoder controller for deep video\ncompression for machines. 
This controller features a mode prediction and a\nGroup of Pictures (GoP) selection module. Our approach centralizes control at\nthe encoding stage, allowing for adaptable encoder adjustments across different\ntasks, such as detection and tracking, while maintaining compatibility with a\nstandard pre-trained DVC decoder. Empirical evidence demonstrates that our\nmethod is applicable across multiple tasks with various existing pre-trained\nDVCs. Moreover, extensive experiments demonstrate that our method outperforms\nprevious DVC by about 25% bitrate for different tasks, with only one\npre-trained decoder.\n","authors":["Xingtong Ge","Jixiang Luo","Xinjie Zhang","Tongda Xu","Guo Lu","Dailan He","Jing Geng","Yan Wang","Jun Zhang","Hongwei Qin"],"pdf_url":"https://arxiv.org/pdf/2404.04848v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.12434v3","updated":"2024-04-07T07:37:59Z","published":"2024-03-19T04:47:56Z","title":"Human Mesh Recovery from Arbitrary Multi-view Images","summary":" Human mesh recovery from arbitrary multi-view images involves two\ncharacteristics: the arbitrary camera poses and arbitrary number of camera\nviews. Because of the variability, designing a unified framework to tackle this\ntask is challenging. The challenges can be summarized as the dilemma of being\nable to simultaneously estimate arbitrary camera poses and recover human mesh\nfrom arbitrary multi-view images while maintaining flexibility. To solve this\ndilemma, we propose a divide and conquer framework for Unified Human Mesh\nRecovery (U-HMR) from arbitrary multi-view images. In particular, U-HMR\nconsists of a decoupled structure and two main components: camera and body\ndecoupling (CBD), camera pose estimation (CPE), and arbitrary view fusion\n(AVF). As camera poses and human body mesh are independent of each other, CBD\nsplits the estimation of them into two sub-tasks for two individual\nsub-networks (ie, CPE and AVF) to handle respectively, thus the two sub-tasks\nare disentangled. In CPE, since each camera pose is unrelated to the others, we\nadopt a shared MLP to process all views in a parallel way. In AVF, in order to\nfuse multi-view information and make the fusion operation independent of the\nnumber of views, we introduce a transformer decoder with a SMPL parameters\nquery token to extract cross-view features for mesh recovery. To demonstrate\nthe efficacy and flexibility of the proposed framework and effect of each\ncomponent, we conduct extensive experiments on three public datasets:\nHuman3.6M, MPI-INF-3DHP, and TotalCapture.\n","authors":["Xiaoben Li","Mancheng Meng","Ziyan Wu","Terrence Chen","Fan Yang","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2403.12434v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11863v2","updated":"2024-04-07T07:37:15Z","published":"2023-11-20T15:59:41Z","title":"GP-NeRF: Generalized Perception NeRF for Context-Aware 3D Scene\n Understanding","summary":" Applying NeRF to downstream perception tasks for scene understanding and\nrepresentation is becoming increasingly popular. Most existing methods treat\nsemantic prediction as an additional rendering task, \\textit{i.e.}, the \"label\nrendering\" task, to build semantic NeRFs. However, by rendering\nsemantic/instance labels per pixel without considering the contextual\ninformation of the rendered image, these methods usually suffer from unclear\nboundary segmentation and abnormal segmentation of pixels within an object. 
To\nsolve this problem, we propose Generalized Perception NeRF (GP-NeRF), a novel\npipeline that makes the widely used segmentation model and NeRF work compatibly\nunder a unified framework, for facilitating context-aware 3D scene perception.\nTo accomplish this goal, we introduce transformers to aggregate radiance as\nwell as semantic embedding fields jointly for novel views and facilitate the\njoint volumetric rendering of both fields. In addition, we propose two\nself-distillation mechanisms, i.e., the Semantic Distill Loss and the\nDepth-Guided Semantic Distill Loss, to enhance the discrimination and quality\nof the semantic field and the maintenance of geometric consistency. In\nevaluation, we conduct experimental comparisons under two perception tasks\n(\\textit{i.e.} semantic and instance segmentation) using both synthetic and\nreal-world datasets. Notably, our method outperforms SOTA approaches by 6.94\\%,\n11.76\\%, and 8.47\\% on generalized semantic segmentation, finetuning semantic\nsegmentation, and instance segmentation, respectively.\n","authors":["Hao Li","Dingwen Zhang","Yalun Dai","Nian Liu","Lechao Cheng","Jingfeng Li","Jingdong Wang","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2311.11863v2.pdf","comment":"CVPR 2024 (Highlight). Project Page:\n https://lifuguan.github.io/gpnerf-pages/"},{"id":"http://arxiv.org/abs/2404.03654v2","updated":"2024-04-07T07:20:31Z","published":"2024-04-04T17:59:50Z","title":"RaFE: Generative Radiance Fields Restoration","summary":" NeRF (Neural Radiance Fields) has demonstrated tremendous potential in novel\nview synthesis and 3D reconstruction, but its performance is sensitive to input\nimage quality, which struggles to achieve high-fidelity rendering when provided\nwith low-quality sparse input viewpoints. Previous methods for NeRF restoration\nare tailored for specific degradation type, ignoring the generality of\nrestoration. To overcome this limitation, we propose a generic radiance fields\nrestoration pipeline, named RaFE, which applies to various types of\ndegradations, such as low resolution, blurriness, noise, compression artifacts,\nor their combinations. Our approach leverages the success of off-the-shelf 2D\nrestoration methods to recover the multi-view images individually. Instead of\nreconstructing a blurred NeRF by averaging inconsistencies, we introduce a\nnovel approach using Generative Adversarial Networks (GANs) for NeRF generation\nto better accommodate the geometric and appearance inconsistencies present in\nthe multi-view images. Specifically, we adopt a two-level tri-plane\narchitecture, where the coarse level remains fixed to represent the low-quality\nNeRF, and a fine-level residual tri-plane to be added to the coarse level is\nmodeled as a distribution with GAN to capture potential variations in\nrestoration. We validate RaFE on both synthetic and real cases for various\nrestoration tasks, demonstrating superior performance in both quantitative and\nqualitative evaluations, surpassing other 3D restoration methods specific to\nsingle task. 
Please see our project website\nhttps://zkaiwu.github.io/RaFE-Project/.\n","authors":["Zhongkai Wu","Ziyu Wan","Jing Zhang","Jing Liao","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2404.03654v2.pdf","comment":"Project Page: https://zkaiwu.github.io/RaFE"},{"id":"http://arxiv.org/abs/2305.03238v4","updated":"2024-04-07T07:07:49Z","published":"2023-05-05T01:40:00Z","title":"Reduction of Class Activation Uncertainty with Background Information","summary":" Multitask learning is a popular approach to training high-performing neural\nnetworks with improved generalization. In this paper, we propose a background\nclass to achieve improved generalization at a lower computational cost compared to\nmultitask learning, to help researchers and organizations with limited\ncomputation power. We also present a methodology for selecting background\nimages and discuss potential future improvements. We apply our approach to\nseveral datasets and achieve improved generalization with much lower\ncomputation. Through the class activation mappings (CAMs) of the trained\nmodels, we observed the tendency towards looking at a bigger picture with the\nproposed model training methodology. Applying the vision transformer with the\nproposed background class, we achieve state-of-the-art (SOTA) performance on\nthe STL-10, Caltech-101, and CINIC-10 datasets. Example scripts are available in\nthe 'CAM' folder of the following GitHub Repository: github.com/dipuk0506/UQ\n","authors":["H M Dipu Kabir"],"pdf_url":"https://arxiv.org/pdf/2305.03238v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04833v1","updated":"2024-04-07T06:56:51Z","published":"2024-04-07T06:56:51Z","title":"ShoeModel: Learning to Wear on the User-specified Shoes via Diffusion\n Model","summary":" With the development of large-scale diffusion models, Artificial\nIntelligence Generated Content (AIGC) techniques have recently become popular. However,\nhow to truly make them serve our daily lives remains an open question. To this\nend, in this paper, we focus on employing AIGC techniques in one field of\nE-commerce marketing, i.e., generating hyper-realistic advertising images for\ndisplaying user-specified shoes worn by humans. Specifically, we propose a\nshoe-wearing system, called ShoeModel, to generate plausible images of human\nlegs interacting with the given shoes. It consists of three modules: (1) shoe\nwearable-area detection module (WD), (2) leg-pose synthesis module (LpS) and\nthe final (3) shoe-wearing image generation module (SW). The three modules are\nexecuted in sequential stages. Compared to baselines, our ShoeModel is shown to\ngeneralize better to different types of shoes and is able to keep the\nID-consistency of the given shoes, as well as to automatically produce\nreasonable interactions with humans. Extensive experiments show the\neffectiveness of our proposed shoe-wearing system. Figure 1 shows the input and\noutput examples of our ShoeModel.\n","authors":["Binghui Chen","Wenyu Li","Yifeng Geng","Xuansong Xie","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.04833v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2401.10891v2","updated":"2024-04-07T06:52:21Z","published":"2024-01-19T18:59:52Z","title":"Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data","summary":" This work presents Depth Anything, a highly practical solution for robust\nmonocular depth estimation. Without pursuing novel technical modules, we aim to\nbuild a simple yet powerful foundation model dealing with any images under any\ncircumstances. 
To this end, we scale up the dataset by designing a data engine\nto collect and automatically annotate large-scale unlabeled data (~62M), which\nsignificantly enlarges the data coverage and thus is able to reduce the\ngeneralization error. We investigate two simple yet effective strategies that\nmake data scaling-up promising. First, a more challenging optimization target\nis created by leveraging data augmentation tools. It compels the model to\nactively seek extra visual knowledge and acquire robust representations.\nSecond, an auxiliary supervision is developed to enforce the model to inherit\nrich semantic priors from pre-trained encoders. We evaluate its zero-shot\ncapabilities extensively, including six public datasets and randomly captured\nphotos. It demonstrates impressive generalization ability. Further, through\nfine-tuning it with metric depth information from NYUv2 and KITTI, new SOTAs\nare set. Our better depth model also results in a better depth-conditioned\nControlNet. Our models are released at\nhttps://github.com/LiheYoung/Depth-Anything.\n","authors":["Lihe Yang","Bingyi Kang","Zilong Huang","Xiaogang Xu","Jiashi Feng","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.10891v2.pdf","comment":"Accepted by CVPR 2024. Project page: https://depth-anything.github.io"},{"id":"http://arxiv.org/abs/2212.12857v2","updated":"2024-04-07T06:34:37Z","published":"2022-12-25T05:24:08Z","title":"StepNet: Spatial-temporal Part-aware Network for Isolated Sign Language\n Recognition","summary":" The goal of sign language recognition (SLR) is to help those who are hard of\nhearing or deaf overcome the communication barrier. Most existing approaches\ncan be typically divided into two lines, i.e., Skeleton-based and RGB-based\nmethods, but both the two lines of methods have their limitations.\nSkeleton-based methods do not consider facial expressions, while RGB-based\napproaches usually ignore the fine-grained hand structure. To overcome both\nlimitations, we propose a new framework called Spatial-temporal Part-aware\nnetwork~(StepNet), based on RGB parts. As its name suggests, it is made up of\ntwo modules: Part-level Spatial Modeling and Part-level Temporal Modeling.\nPart-level Spatial Modeling, in particular, automatically captures the\nappearance-based properties, such as hands and faces, in the feature space\nwithout the use of any keypoint-level annotations. On the other hand,\nPart-level Temporal Modeling implicitly mines the long-short term context to\ncapture the relevant attributes over time. Extensive experiments demonstrate\nthat our StepNet, thanks to spatial-temporal modules, achieves competitive\nTop-1 Per-instance accuracy on three commonly-used SLR benchmarks, i.e., 56.89%\non WLASL, 77.2% on NMFs-CSL, and 77.1% on BOBSL. Additionally, the proposed\nmethod is compatible with the optical flow input and can produce superior\nperformance if fused. For those who are hard of hearing, we hope that our work\ncan act as a preliminary step.\n","authors":["Xiaolong Shen","Zhedong Zheng","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2212.12857v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01134v2","updated":"2024-04-07T06:30:39Z","published":"2024-02-02T04:17:02Z","title":"DeepAAT: Deep Automated Aerial Triangulation for Fast UAV-based Mapping","summary":" Automated Aerial Triangulation (AAT), aiming to restore image pose and\nreconstruct sparse points simultaneously, plays a pivotal role in earth\nobservation. 
With its rich research heritage spanning several decades in\nphotogrammetry, AAT has evolved into a fundamental process widely applied in\nlarge-scale Unmanned Aerial Vehicle (UAV) based mapping. Despite its\nadvancements, classic AAT methods still face challenges like low efficiency and\nlimited robustness. This paper introduces DeepAAT, a deep learning network\ndesigned specifically for AAT of UAV imagery. DeepAAT considers both spatial\nand spectral characteristics of imagery, enhancing its capability to resolve\nerroneous matching pairs and accurately predict image poses. DeepAAT marks a\nsignificant leap in AAT's efficiency, ensuring thorough scene coverage and\nprecision. Its processing speed outpaces incremental AAT methods by hundreds of\ntimes and global AAT methods by tens of times while maintaining a comparable\nlevel of reconstruction accuracy. Additionally, DeepAAT's scene clustering and\nmerging strategy facilitate rapid localization and pose determination for\nlarge-scale UAV images, even under constrained computing resources. The\nexperimental results demonstrate DeepAAT's substantial improvements over\nconventional AAT methods, highlighting its potential in the efficiency and\naccuracy of UAV-based 3D reconstruction tasks. To benefit the photogrammetry\nsociety, the code of DeepAAT will be released at:\nhttps://github.com/WHU-USI3DV/DeepAAT.\n","authors":["Zequan Chen","Jianping Li","Qusheng Li","Bisheng Yang","Zhen Dong"],"pdf_url":"https://arxiv.org/pdf/2402.01134v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04828v1","updated":"2024-04-07T06:28:53Z","published":"2024-04-07T06:28:53Z","title":"Strictly-ID-Preserved and Controllable Accessory Advertising Image\n Generation","summary":" Customized generative text-to-image models have the ability to produce images\nthat closely resemble a given subject. However, in the context of generating\nadvertising images for e-commerce scenarios, it is crucial that the generated\nsubject's identity aligns perfectly with the product being advertised. In order\nto address the need for strictly-ID preserved advertising image generation, we\nhave developed a Control-Net based customized image generation pipeline and\nhave taken earring model advertising as an example. Our approach facilitates a\nseamless interaction between the earrings and the model's face, while ensuring\nthat the identity of the earrings remains intact. Furthermore, to achieve a\ndiverse and controllable display, we have proposed a multi-branch\ncross-attention architecture, which allows for control over the scale, pose,\nand appearance of the model, going beyond the limitations of text prompts. Our\nmethod manages to achieve fine-grained control of the generated model's face,\nresulting in controllable and captivating advertising effects.\n","authors":["Youze Xue","Binghui Chen","Yifeng Geng","Xuansong Xie","Jiansheng Chen","Hongbing Ma"],"pdf_url":"https://arxiv.org/pdf/2404.04828v1.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2310.08370v2","updated":"2024-04-07T06:21:21Z","published":"2023-10-12T14:39:58Z","title":"UniPAD: A Universal Pre-training Paradigm for Autonomous Driving","summary":" In the context of autonomous driving, the significance of effective feature\nlearning is widely acknowledged. While conventional 3D self-supervised\npre-training methods have shown widespread success, most methods follow the\nideas originally designed for 2D images. 
In this paper, we present UniPAD, a\nnovel self-supervised learning paradigm applying 3D volumetric differentiable\nrendering. UniPAD implicitly encodes 3D space, facilitating the reconstruction\nof continuous 3D shape structures and the intricate appearance characteristics\nof their 2D projections. The flexibility of our method enables seamless\nintegration into both 2D and 3D frameworks, enabling a more holistic\ncomprehension of the scenes. We manifest the feasibility and effectiveness of\nUniPAD by conducting extensive experiments on various downstream 3D tasks. Our\nmethod significantly improves lidar-, camera-, and lidar-camera-based baseline\nby 9.1, 7.7, and 6.9 NDS, respectively. Notably, our pre-training pipeline\nachieves 73.2 NDS for 3D object detection and 79.4 mIoU for 3D semantic\nsegmentation on the nuScenes validation set, achieving state-of-the-art results\nin comparison with previous methods. The code will be available at\nhttps://github.com/Nightmare-n/UniPAD.\n","authors":["Honghui Yang","Sha Zhang","Di Huang","Xiaoyang Wu","Haoyi Zhu","Tong He","Shixiang Tang","Hengshuang Zhao","Qibo Qiu","Binbin Lin","Xiaofei He","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2310.08370v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.04823v1","updated":"2024-04-07T06:17:10Z","published":"2024-04-07T06:17:10Z","title":"3D Building Reconstruction from Monocular Remote Sensing Images with\n Multi-level Supervisions","summary":" 3D building reconstruction from monocular remote sensing images is an\nimportant and challenging research problem that has received increasing\nattention in recent years, owing to its low cost of data acquisition and\navailability for large-scale applications. However, existing methods rely on\nexpensive 3D-annotated samples for fully-supervised training, restricting their\napplication to large-scale cross-city scenarios. In this work, we propose\nMLS-BRN, a multi-level supervised building reconstruction network that can\nflexibly utilize training samples with different annotation levels to achieve\nbetter reconstruction results in an end-to-end manner. To alleviate the demand\non full 3D supervision, we design two new modules, Pseudo Building Bbox\nCalculator and Roof-Offset guided Footprint Extractor, as well as new tasks and\ntraining strategies for different types of samples. Experimental results on\nseveral public and new datasets demonstrate that our proposed MLS-BRN achieves\ncompetitive performance using much fewer 3D-annotated samples, and\nsignificantly improves the footprint extraction and 3D reconstruction\nperformance compared with current state-of-the-art. The code and datasets of\nthis work will be released at https://github.com/opendatalab/MLS-BRN.git.\n","authors":["Weijia Li","Haote Yang","Zhenghao Hu","Juepeng Zheng","Gui-Song Xia","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2404.04823v1.pdf","comment":"accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01133v2","updated":"2024-04-07T06:17:07Z","published":"2024-04-01T14:24:40Z","title":"CityGaussian: Real-time High-quality Large-Scale Scene Rendering with\n Gaussians","summary":" The advancement of real-time 3D scene reconstruction and novel view synthesis\nhas been significantly propelled by 3D Gaussian Splatting (3DGS). However,\neffectively training large-scale 3DGS and rendering it in real-time across\nvarious scales remains challenging. 
This paper introduces CityGaussian\n(CityGS), which employs a novel divide-and-conquer training approach and\nLevel-of-Detail (LoD) strategy for efficient large-scale 3DGS training and\nrendering. Specifically, the global scene prior and adaptive training data\nselection enable efficient training and seamless fusion. Based on fused\nGaussian primitives, we generate different detail levels through compression,\nand realize fast rendering across various scales through the proposed\nblock-wise detail level selection and aggregation strategy. Extensive\nexperimental results on large-scale scenes demonstrate that our approach\nattains state-of-the-art rendering quality, enabling consistent real-time\nrendering of large-scale scenes across vastly different scales. Our project page\nis available at https://dekuliutesla.github.io/citygs/.\n","authors":["Yang Liu","He Guan","Chuanchen Luo","Lue Fan","Junran Peng","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.01133v2.pdf","comment":"Project Page: https://dekuliutesla.github.io/citygs/"},{"id":"http://arxiv.org/abs/2404.04819v1","updated":"2024-04-07T06:01:49Z","published":"2024-04-07T06:01:49Z","title":"Joint Reconstruction of 3D Human and Object via Contact-Based Refinement\n Transformer","summary":" Human-object contact serves as a strong cue to understand how humans\nphysically interact with objects. Nevertheless, it is not widely explored to\nutilize human-object contact information for the joint reconstruction of 3D\nhuman and object from a single image. In this work, we present a novel joint 3D\nhuman-object reconstruction method (CONTHO) that effectively exploits contact\ninformation between humans and objects. There are two core designs in our\nsystem: 1) 3D-guided contact estimation and 2) contact-based 3D human and\nobject refinement. First, for accurate human-object contact estimation, CONTHO\ninitially reconstructs 3D humans and objects and utilizes them as explicit 3D\nguidance for contact estimation. Second, to refine the initial reconstructions\nof 3D human and object, we propose a novel contact-based refinement Transformer\nthat effectively aggregates human features and object features based on the\nestimated human-object contact. The proposed contact-based refinement prevents\nthe learning of erroneous correlation between human and object, which enables\naccurate 3D reconstruction. As a result, our CONTHO achieves state-of-the-art\nperformance in both human-object contact estimation and joint reconstruction of\n3D human and object. The code is publicly available at\nhttps://github.com/dqj5182/CONTHO_RELEASE.\n","authors":["Hyeongjin Nam","Daniel Sungho Jung","Gyeongsik Moon","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2404.04819v1.pdf","comment":"Published at CVPR 2024, 19 pages including the supplementary material"},{"id":"http://arxiv.org/abs/2305.00510v3","updated":"2024-04-07T05:59:05Z","published":"2023-04-30T15:38:36Z","title":"Towards AI-Architecture Liberty: A Comprehensive Survey on Designing and\n Collaborating Virtual Architecture by Deep Learning in the Metaverse","summary":" 3D shape generation techniques leveraging deep learning have garnered\nsignificant interest from both the computer vision and architectural design\ncommunities, promising to enrich the content of the future metaverse. However,\nresearch on virtual architectural design remains limited, particularly\nregarding human-AI collaboration and deep learning-assisted design. 
We first\nilluminate the principles, generation techniques, and current literature of\nvirtual architecture, focusing on challenges such as datasets, multimodality,\ndesign intuition, and generative frameworks. In our survey, we reviewed 187\nrelated articles (80.7\\% of articles published between 2018 and 2022) covering\narchitectural research, virtual environments, and technical approaches. This\nsurvey investigates the latest approaches to 3D object generation with deep\ngenerative models (DGMs) and summarizes four characteristics of deep-learning\ngeneration approaches for virtual architecture. According to our analysis of\nthe survey, we expound on four research agendas, including agency,\ncommunication, user consideration, and integrating tools, and highlight three\nimportant enablers of ubiquitous interaction with immersive systems in deep\nlearning-assisted architectural generation. Our work contributes to fostering\nunderstanding between designers and deep learning techniques, broadening access\nto human-AI collaboration. We advocate for interdisciplinary efforts to address\nthis timely research topic, facilitating content designing and generation in\nthe metaverse.\n","authors":["Anqi Wang","Jiahua Dong","Lik-Hang Lee","Jiachuan Shen","Pan Hui"],"pdf_url":"https://arxiv.org/pdf/2305.00510v3.pdf","comment":"37 pages, 9 figures, and 5 tables"},{"id":"http://arxiv.org/abs/2404.04818v1","updated":"2024-04-07T05:56:42Z","published":"2024-04-07T05:56:42Z","title":"DWE+: Dual-Way Matching Enhanced Framework for Multimodal Entity Linking","summary":" Multimodal entity linking (MEL) aims to utilize multimodal information\n(usually textual and visual information) to link ambiguous mentions to\nunambiguous entities in knowledge base. Current methods facing main issues:\n(1)treating the entire image as input may contain redundant information. (2)the\ninsufficient utilization of entity-related information, such as attributes in\nimages. (3)semantic inconsistency between the entity in knowledge base and its\nrepresentation. To this end, we propose DWE+ for multimodal entity linking.\nDWE+ could capture finer semantics and dynamically maintain semantic\nconsistency with entities. This is achieved by three aspects: (a)we introduce a\nmethod for extracting fine-grained image features by partitioning the image\ninto multiple local objects. Then, hierarchical contrastive learning is used to\nfurther align semantics between coarse-grained information(text and image) and\nfine-grained (mention and visual objects). (b)we explore ways to extract visual\nattributes from images to enhance fusion feature such as facial features and\nidentity. (c)we leverage Wikipedia and ChatGPT to capture the entity\nrepresentation, achieving semantic enrichment from both static and dynamic\nperspectives, which better reflects the real-world entity semantics.\nExperiments on Wikimel, Richpedia, and Wikidiverse datasets demonstrate the\neffectiveness of DWE+ in improving MEL performance. Specifically, we optimize\nthese datasets and achieve state-of-the-art performance on the enhanced\ndatasets. 
The code and enhanced datasets are released on\nhttps://github.com/season1blue/DWET\n","authors":["Shezheng Song","Shasha Li","Shan Zhao","Xiaopeng Li","Chengyu Wang","Jie Yu","Jun Ma","Tianwei Yan","Bin Ji","Xiaoguang Mao"],"pdf_url":"https://arxiv.org/pdf/2404.04818v1.pdf","comment":"under review on TOIS"},{"id":"http://arxiv.org/abs/2303.04989v3","updated":"2024-04-07T05:50:18Z","published":"2023-03-09T02:20:56Z","title":"ARS-DETR: Aspect Ratio-Sensitive Detection Transformer for Aerial\n Oriented Object Detection","summary":" Existing oriented object detection methods commonly use metric AP$_{50}$ to\nmeasure the performance of the model. We argue that AP$_{50}$ is inherently\nunsuitable for oriented object detection due to its large tolerance in angle\ndeviation. Therefore, we advocate using high-precision metric, e.g. AP$_{75}$,\nto measure the performance of models. In this paper, we propose an Aspect Ratio\nSensitive Oriented Object Detector with Transformer, termed ARS-DETR, which\nexhibits a competitive performance in high-precision oriented object detection.\nSpecifically, a new angle classification method, calling Aspect Ratio aware\nCircle Smooth Label (AR-CSL), is proposed to smooth the angle label in a more\nreasonable way and discard the hyperparameter that introduced by previous work\n(e.g. CSL). Then, a rotated deformable attention module is designed to rotate\nthe sampling points with the corresponding angles and eliminate the\nmisalignment between region features and sampling points. Moreover, a dynamic\nweight coefficient according to the aspect ratio is adopted to calculate the\nangle loss. Comprehensive experiments on several challenging datasets show that\nour method achieves competitive performance on the high-precision oriented\nobject detection task.\n","authors":["Ying Zeng","Yushi Chen","Xue Yang","Qingyun Li","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2303.04989v3.pdf","comment":"15 pages, 13 figures, 13 tables, the source code is available at\n https://github.com/httle/ARS-DETR"},{"id":"http://arxiv.org/abs/2404.01959v2","updated":"2024-04-07T05:26:08Z","published":"2024-04-02T13:54:22Z","title":"Bi-LORA: A Vision-Language Approach for Synthetic Image Detection","summary":" Advancements in deep image synthesis techniques, such as generative\nadversarial networks (GANs) and diffusion models (DMs), have ushered in an era\nof generating highly realistic images. While this technological progress has\ncaptured significant interest, it has also raised concerns about the potential\ndifficulty in distinguishing real images from their synthetic counterparts.\nThis paper takes inspiration from the potent convergence capabilities between\nvision and language, coupled with the zero-shot nature of vision-language\nmodels (VLMs). We introduce an innovative method called Bi-LORA that leverages\nVLMs, combined with low-rank adaptation (LORA) tuning techniques, to enhance\nthe precision of synthetic image detection for unseen model-generated images.\nThe pivotal conceptual shift in our methodology revolves around reframing\nbinary classification as an image captioning task, leveraging the distinctive\ncapabilities of cutting-edge VLM, notably bootstrapping language image\npre-training (BLIP2). 
Rigorous and comprehensive experiments are conducted to\nvalidate the effectiveness of our proposed approach, particularly in detecting\nunseen diffusion-generated images from unknown diffusion-based generative\nmodels during training, showcasing robustness to noise, and demonstrating\ngeneralization capabilities to GANs. The obtained results showcase an\nimpressive average accuracy of 93.41% in synthetic image detection on unseen\ngeneration models. The code and models associated with this research can be\npublicly accessed at https://github.com/Mamadou-Keita/VLM-DETECT.\n","authors":["Mamadou Keita","Wassim Hamidouche","Hessen Bougueffa Eutamene","Abdenour Hadid","Abdelmalik Taleb-Ahmed"],"pdf_url":"https://arxiv.org/pdf/2404.01959v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04808v1","updated":"2024-04-07T04:56:58Z","published":"2024-04-07T04:56:58Z","title":"MemFlow: Optical Flow Estimation and Prediction with Memory","summary":" Optical flow is a classical task that is important to the vision community.\nClassical optical flow estimation uses two frames as input, whilst some recent\nmethods consider multiple frames to explicitly model long-range information.\nThe former ones limit their ability to fully leverage temporal coherence along\nthe video sequence; and the latter ones incur heavy computational overhead,\ntypically not possible for real-time flow estimation. Some multi-frame-based\napproaches even necessitate unseen future frames for current estimation,\ncompromising real-time applicability in safety-critical scenarios. To this end,\nwe present MemFlow, a real-time method for optical flow estimation and\nprediction with memory. Our method enables memory read-out and update modules\nfor aggregating historical motion information in real-time. Furthermore, we\nintegrate resolution-adaptive re-scaling to accommodate diverse video\nresolutions. Besides, our approach seamlessly extends to the future prediction\nof optical flow based on past observations. Leveraging effective historical\nmotion aggregation, our method outperforms VideoFlow with fewer parameters and\nfaster inference speed on Sintel and KITTI-15 datasets in terms of\ngeneralization performance. At the time of submission, MemFlow also leads in\nperformance on the 1080p Spring dataset. Codes and models will be available at:\nhttps://dqiaole.github.io/MemFlow/.\n","authors":["Qiaole Dong","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2404.04808v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04807v1","updated":"2024-04-07T04:55:58Z","published":"2024-04-07T04:55:58Z","title":"D2SL: Decouple Defogging and Semantic Learning for Foggy Domain-Adaptive\n Segmentation","summary":" We investigated domain adaptive semantic segmentation in foggy weather\nscenarios, which aims to enhance the utilization of unlabeled foggy data and\nimprove the model's adaptability to foggy conditions. Current methods rely on\nclear images as references, jointly learning defogging and segmentation for\nfoggy images. Despite making some progress, there are still two main drawbacks:\n(1) the coupling of segmentation and defogging feature representations,\nresulting in a decrease in semantic representation capability, and (2) the\nfailure to leverage real fog priors in unlabeled foggy data, leading to\ninsufficient model generalization ability. 
To address these issues, we propose\na novel training framework, Decouple Defogging and Semantic learning, called\nD2SL, aiming to alleviate the adverse impact of defogging tasks on the final\nsegmentation task. In this framework, we introduce a domain-consistent transfer\nstrategy to establish a connection between defogging and segmentation tasks.\nFurthermore, we design a real fog transfer strategy to improve defogging\neffects by fully leveraging the fog priors from real foggy images. Our approach\nenhances the semantic representations required for segmentation during the\ndefogging learning process and maximizes the representation capability of fog\ninvariance by effectively utilizing real fog data. Comprehensive experiments\nvalidate the effectiveness of the proposed method.\n","authors":["Xuan Sun","Zhanfu An","Yuyu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.04807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01843v2","updated":"2024-04-07T04:17:32Z","published":"2024-04-02T11:03:24Z","title":"Sketch3D: Style-Consistent Guidance for Sketch-to-3D Generation","summary":" Recently, image-to-3D approaches have achieved significant results with a\nnatural image as input. However, it is not always possible to access these\nenriched color input samples in practical applications, where only sketches are\navailable. Existing sketch-to-3D researches suffer from limitations in broad\napplications due to the challenges of lacking color information and multi-view\ncontent. To overcome them, this paper proposes a novel generation paradigm\nSketch3D to generate realistic 3D assets with shape aligned with the input\nsketch and color matching the textual description. Concretely, Sketch3D first\ninstantiates the given sketch in the reference image through the\nshape-preserving generation process. Second, the reference image is leveraged\nto deduce a coarse 3D Gaussian prior, and multi-view style-consistent guidance\nimages are generated based on the renderings of the 3D Gaussians. Finally,\nthree strategies are designed to optimize 3D Gaussians, i.e., structural\noptimization via a distribution transfer mechanism, color optimization with a\nstraightforward MSE loss and sketch similarity optimization with a CLIP-based\ngeometric similarity loss. Extensive visual comparisons and quantitative\nanalysis illustrate the advantage of our Sketch3D in generating realistic 3D\nassets while preserving consistency with the input.\n","authors":["Wangguandong Zheng","Haifeng Xia","Rui Chen","Ming Shao","Siyu Xia","Zhengming Ding"],"pdf_url":"https://arxiv.org/pdf/2404.01843v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04804v1","updated":"2024-04-07T04:10:06Z","published":"2024-04-07T04:10:06Z","title":"Light the Night: A Multi-Condition Diffusion Framework for Unpaired\n Low-Light Enhancement in Autonomous Driving","summary":" Vision-centric perception systems for autonomous driving have gained\nconsiderable attention recently due to their cost-effectiveness and\nscalability, especially compared to LiDAR-based systems. However, these systems\noften struggle in low-light conditions, potentially compromising their\nperformance and safety. To address this, our paper introduces LightDiff, a\ndomain-tailored framework designed to enhance the low-light image quality for\nautonomous driving applications. Specifically, we employ a multi-condition\ncontrolled diffusion model. LightDiff works without any human-collected paired\ndata, leveraging a dynamic data degradation process instead. 
It incorporates a\nnovel multi-condition adapter that adaptively controls the input weights from\ndifferent modalities, including depth maps, RGB images, and text captions, to\neffectively illuminate dark scenes while maintaining context consistency.\nFurthermore, to align the enhanced images with the detection model's knowledge,\nLightDiff employs perception-specific scores as rewards to guide the diffusion\ntraining process through reinforcement learning. Extensive experiments on the\nnuScenes datasets demonstrate that LightDiff can significantly improve the\nperformance of several state-of-the-art 3D detectors in night-time conditions\nwhile achieving high visual quality scores, highlighting its potential to\nsafeguard autonomous driving.\n","authors":["Jinlong Li","Baolu Li","Zhengzhong Tu","Xinyu Liu","Qing Guo","Felix Juefei-Xu","Runsheng Xu","Hongkai Yu"],"pdf_url":"https://arxiv.org/pdf/2404.04804v1.pdf","comment":"This paper is accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2306.02416v3","updated":"2024-04-07T03:53:33Z","published":"2023-06-04T17:39:08Z","title":"Training Like a Medical Resident: Context-Prior Learning Toward\n Universal Medical Image Segmentation","summary":" A major focus of clinical imaging workflow is disease diagnosis and\nmanagement, leading to medical imaging datasets strongly tied to specific\nclinical objectives. This scenario has led to the prevailing practice of\ndeveloping task-specific segmentation models, without gaining insights from\nwidespread imaging cohorts. Inspired by the training program of medical\nradiology residents, we propose a shift towards universal medical image\nsegmentation, a paradigm aiming to build medical image understanding foundation\nmodels by leveraging the diversity and commonality across clinical targets,\nbody regions, and imaging modalities. Towards this goal, we develop Hermes, a\nnovel context-prior learning approach to address the challenges of data\nheterogeneity and annotation differences in medical image segmentation. In a\nlarge collection of eleven diverse datasets (2,438 3D images) across five\nmodalities (CT, PET, T1, T2 and cine MRI) and multiple body regions, we\ndemonstrate the merit of the universal paradigm over the traditional paradigm\non addressing multiple tasks within a single model. By exploiting the synergy\nacross tasks, Hermes achieves state-of-the-art performance on all testing\ndatasets and shows superior model scalability. Results on two additional\ndatasets reveals Hermes' strong performance for transfer learning, incremental\nlearning, and generalization to downstream tasks. Hermes's learned priors\ndemonstrate an appealing trait to reflect the intricate relations among tasks\nand modalities, which aligns with the established anatomical and imaging\nprinciples in radiology. The code is available:\nhttps://github.com/yhygao/universal-medical-image-segmentation.\n","authors":["Yunhe Gao","Zhuowei Li","Di Liu","Mu Zhou","Shaoting Zhang","Dimitris N. Metaxas"],"pdf_url":"https://arxiv.org/pdf/2306.02416v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2310.08129v3","updated":"2024-04-07T03:53:29Z","published":"2023-10-12T08:36:25Z","title":"Tailored Visions: Enhancing Text-to-Image Generation with Personalized\n Prompt Rewriting","summary":" Despite significant progress in the field, it is still challenging to create\npersonalized visual representations that align closely with the desires and\npreferences of individual users. 
This process requires users to articulate\ntheir ideas in words that are both comprehensible to the models and accurately\ncapture their vision, posing difficulties for many users. In this paper, we\ntackle this challenge by leveraging historical user interactions with the\nsystem to enhance user prompts. We propose a novel approach that involves\nrewriting user prompts based on a newly collected large-scale text-to-image\ndataset with over 300k prompts from 3115 users. Our rewriting model enhances\nthe expressiveness and alignment of user prompts with their intended visual\noutputs. Experimental results demonstrate the superiority of our methods over\nbaseline approaches, as evidenced in our new offline evaluation method and\nonline tests. Our code and dataset are available at\nhttps://github.com/zzjchen/Tailored-Visions.\n","authors":["Zijie Chen","Lichao Zhang","Fangsheng Weng","Lili Pan","Zhenzhong Lan"],"pdf_url":"https://arxiv.org/pdf/2310.08129v3.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19098v2","updated":"2024-04-07T03:49:39Z","published":"2024-03-28T02:22:28Z","title":"GraphAD: Interaction Scene Graph for End-to-end Autonomous Driving","summary":" Modeling complicated interactions among the ego-vehicle, road agents, and map\nelements has been a crucial part for safety-critical autonomous driving.\nPrevious works on end-to-end autonomous driving rely on the attention mechanism\nfor handling heterogeneous interactions, which fails to capture the geometric\npriors and is also computationally intensive. In this paper, we propose the\nInteraction Scene Graph (ISG) as a unified method to model the interactions\namong the ego-vehicle, road agents, and map elements. With the representation\nof the ISG, the driving agents aggregate essential information from the most\ninfluential elements, including the road agents with potential collisions and\nthe map elements to follow. Since a mass of unnecessary interactions are\nomitted, the more efficient scene-graph-based framework is able to focus on\nindispensable connections and leads to better performance. We evaluate the\nproposed method for end-to-end autonomous driving on the nuScenes dataset.\nCompared with strong baselines, our method significantly outperforms in the\nfull-stack driving tasks, including perception, prediction, and planning. Code\nwill be released at https://github.com/zhangyp15/GraphAD.\n","authors":["Yunpeng Zhang","Deheng Qian","Ding Li","Yifeng Pan","Yong Chen","Zhenbao Liang","Zhiyao Zhang","Shurui Zhang","Hongxu Li","Maolei Fu","Yun Ye","Zhujin Liang","Yi Shan","Dalong Du"],"pdf_url":"https://arxiv.org/pdf/2403.19098v2.pdf","comment":"project page: https://github.com/zhangyp15/GraphAD"},{"id":"http://arxiv.org/abs/2401.01207v2","updated":"2024-04-07T03:44:59Z","published":"2024-01-02T13:28:39Z","title":"Towards a Simultaneous and Granular Identity-Expression Control in\n Personalized Face Generation","summary":" In human-centric content generation, the pre-trained text-to-image models\nstruggle to produce user-wanted portrait images, which retain the identity of\nindividuals while exhibiting diverse expressions. This paper introduces our\nefforts towards personalized face generation. To this end, we propose a novel\nmulti-modal face generation framework, capable of simultaneous\nidentity-expression control and more fine-grained expression synthesis. Our\nexpression control is so sophisticated that it can be specialized by the\nfine-grained emotional vocabulary. 
We devise a novel diffusion model that can\nundertake the task of simultaneously face swapping and reenactment. Due to the\nentanglement of identity and expression, it's nontrivial to separately and\nprecisely control them in one framework, thus has not been explored yet. To\novercome this, we propose several innovative designs in the conditional\ndiffusion model, including balancing identity and expression encoder, improved\nmidpoint sampling, and explicitly background conditioning. Extensive\nexperiments have demonstrated the controllability and scalability of the\nproposed framework, in comparison with state-of-the-art text-to-image, face\nswapping, and face reenactment methods.\n","authors":["Renshuai Liu","Bowen Ma","Wei Zhang","Zhipeng Hu","Changjie Fan","Tangjie Lv","Yu Ding","Xuan Cheng"],"pdf_url":"https://arxiv.org/pdf/2401.01207v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04800v1","updated":"2024-04-07T03:41:45Z","published":"2024-04-07T03:41:45Z","title":"Coordinated Sparse Recovery of Label Noise","summary":" Label noise is a common issue in real-world datasets that inevitably impacts\nthe generalization of models. This study focuses on robust classification tasks\nwhere the label noise is instance-dependent. Estimating the transition matrix\naccurately in this task is challenging, and methods based on sample selection\noften exhibit confirmation bias to varying degrees. Sparse over-parameterized\ntraining (SOP) has been theoretically effective in estimating and recovering\nlabel noise, offering a novel solution for noise-label learning. However, this\nstudy empirically observes and verifies a technical flaw of SOP: the lack of\ncoordination between model predictions and noise recovery leads to increased\ngeneralization error. To address this, we propose a method called Coordinated\nSparse Recovery (CSR). CSR introduces a collaboration matrix and confidence\nweights to coordinate model predictions and noise recovery, reducing error\nleakage. Based on CSR, this study designs a joint sample selection strategy and\nconstructs a comprehensive and powerful learning framework called CSR+. CSR+\nsignificantly reduces confirmation bias, especially for datasets with more\nclasses and a high proportion of instance-specific noise. Experimental results\non simulated and real-world noisy datasets demonstrate that both CSR and CSR+\nachieve outstanding performance compared to methods at the same level.\n","authors":["Yukun Yang","Naihao Wang","Haixin Yang","Ruirui Li"],"pdf_url":"https://arxiv.org/pdf/2404.04800v1.pdf","comment":"Pre-print prior to submission to journal"},{"id":"http://arxiv.org/abs/2404.04799v1","updated":"2024-04-07T03:37:29Z","published":"2024-04-07T03:37:29Z","title":"Few-Shot Object Detection: Research Advances and Challenges","summary":" Object detection as a subfield within computer vision has achieved remarkable\nprogress, which aims to accurately identify and locate a specific object from\nimages or videos. Such methods rely on large-scale labeled training samples for\neach object category to ensure accurate detection, but obtaining extensive\nannotated data is a labor-intensive and expensive process in many real-world\nscenarios. 
To tackle this challenge, researchers have explored few-shot object\ndetection (FSOD) that combines few-shot learning and object detection\ntechniques to rapidly adapt to novel objects with limited annotated samples.\nThis paper presents a comprehensive survey to review the significant\nadvancements in the field of FSOD in recent years and summarize the existing\nchallenges and solutions. Specifically, we first introduce the background and\ndefinition of FSOD to emphasize potential value in advancing the field of\ncomputer vision. We then propose a novel FSOD taxonomy method and survey the\nplentifully remarkable FSOD algorithms based on this fact to report a\ncomprehensive overview that facilitates a deeper understanding of the FSOD\nproblem and the development of innovative solutions. Finally, we discuss the\nadvantages and limitations of these algorithms to summarize the challenges,\npotential research direction, and development trend of object detection in the\ndata scarcity scenario.\n","authors":["Zhimeng Xin","Shiming Chen","Tianxu Wu","Yuanjie Shao","Weiping Ding","Xinge You"],"pdf_url":"https://arxiv.org/pdf/2404.04799v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18331v2","updated":"2024-04-07T02:51:02Z","published":"2024-02-28T13:50:46Z","title":"FineDiffusion: Scaling up Diffusion Models for Fine-grained Image\n Generation with 10,000 Classes","summary":" The class-conditional image generation based on diffusion models is renowned\nfor generating high-quality and diverse images. However, most prior efforts\nfocus on generating images for general categories, e.g., 1000 classes in\nImageNet-1k. A more challenging task, large-scale fine-grained image\ngeneration, remains the boundary to explore. In this work, we present a\nparameter-efficient strategy, called FineDiffusion, to fine-tune large\npre-trained diffusion models scaling to large-scale fine-grained image\ngeneration with 10,000 categories. FineDiffusion significantly accelerates\ntraining and reduces storage overhead by only fine-tuning tiered class\nembedder, bias terms, and normalization layers' parameters. To further improve\nthe image generation quality of fine-grained categories, we propose a novel\nsampling method for fine-grained image generation, which utilizes\nsuperclass-conditioned guidance, specifically tailored for fine-grained\ncategories, to replace the conventional classifier-free guidance sampling.\nCompared to full fine-tuning, FineDiffusion achieves a remarkable 1.56x\ntraining speed-up and requires storing merely 1.77% of the total model\nparameters, while achieving state-of-the-art FID of 9.776 on image generation\nof 10,000 classes. Extensive qualitative and quantitative experiments\ndemonstrate the superiority of our method compared to other parameter-efficient\nfine-tuning methods. The code and more generated results are available at our\nproject website: https://finediffusion.github.io/.\n","authors":["Ziying Pan","Kun Wang","Gang Li","Feihong He","Xiwang Li","Yongxuan Lai"],"pdf_url":"https://arxiv.org/pdf/2402.18331v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17228v2","updated":"2024-04-07T02:43:54Z","published":"2024-02-27T05:42:38Z","title":"Feature Re-Embedding: Towards Foundation Model-Level Performance in\n Computational Pathology","summary":" Multiple instance learning (MIL) is the most widely used framework in\ncomputational pathology, encompassing sub-typing, diagnosis, prognosis, and\nmore. 
However, the existing MIL paradigm typically requires an offline instance\nfeature extractor, such as a pre-trained ResNet or a foundation model. This\napproach lacks the capability for feature fine-tuning within the specific\ndownstream tasks, limiting its adaptability and performance. To address this\nissue, we propose a Re-embedded Regional Transformer (R$^2$T) for re-embedding\nthe instance features online, which captures fine-grained local features and\nestablishes connections across different regions. Unlike existing works that\nfocus on pre-training powerful feature extractor or designing sophisticated\ninstance aggregator, R$^2$T is tailored to re-embed instance features online.\nIt serves as a portable module that can seamlessly integrate into mainstream\nMIL models. Extensive experimental results on common computational pathology\ntasks validate that: 1) feature re-embedding improves the performance of MIL\nmodels based on ResNet-50 features to the level of foundation model features,\nand further enhances the performance of foundation model features; 2) the\nR$^2$T can introduce more significant performance improvements to various MIL\nmodels; 3) R$^2$T-MIL, as an R$^2$T-enhanced AB-MIL, outperforms other latest\nmethods by a large margin.The code is available at:\nhttps://github.com/DearCaat/RRT-MIL.\n","authors":["Wenhao Tang","Fengtao Zhou","Sheng Huang","Xiang Zhu","Yi Zhang","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2402.17228v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2402.11502v3","updated":"2024-04-07T02:42:27Z","published":"2024-02-18T08:21:05Z","title":"GenAD: Generative End-to-End Autonomous Driving","summary":" Directly producing planning results from raw sensors has been a long-desired\nsolution for autonomous driving and has attracted increasing attention\nrecently. Most existing end-to-end autonomous driving methods factorize this\nproblem into perception, motion prediction, and planning. However, we argue\nthat the conventional progressive pipeline still cannot comprehensively model\nthe entire traffic evolution process, e.g., the future interaction between the\nego car and other traffic participants and the structural trajectory prior. In\nthis paper, we explore a new paradigm for end-to-end autonomous driving, where\nthe key is to predict how the ego car and the surroundings evolve given past\nscenes. We propose GenAD, a generative framework that casts autonomous driving\ninto a generative modeling problem. We propose an instance-centric scene\ntokenizer that first transforms the surrounding scenes into map-aware instance\ntokens. We then employ a variational autoencoder to learn the future trajectory\ndistribution in a structural latent space for trajectory prior modeling. We\nfurther adopt a temporal model to capture the agent and ego movements in the\nlatent space to generate more effective future trajectories. GenAD finally\nsimultaneously performs motion prediction and planning by sampling\ndistributions in the learned structural latent space conditioned on the\ninstance tokens and using the learned temporal model to generate futures.\nExtensive experiments on the widely used nuScenes benchmark show that the\nproposed GenAD achieves state-of-the-art performance on vision-centric\nend-to-end autonomous driving with high efficiency. 
Code:\nhttps://github.com/wzzheng/GenAD.\n","authors":["Wenzhao Zheng","Ruiqi Song","Xianda Guo","Chenming Zhang","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2402.11502v3.pdf","comment":"Code is available at: https://github.com/wzzheng/GenAD"},{"id":"http://arxiv.org/abs/2309.16496v3","updated":"2024-04-07T02:39:31Z","published":"2023-09-28T15:03:44Z","title":"CCEdit: Creative and Controllable Video Editing via Diffusion Models","summary":" In this paper, we present CCEdit, a versatile generative video editing\nframework based on diffusion models. Our approach employs a novel trident\nnetwork structure that separates structure and appearance control, ensuring\nprecise and creative editing capabilities. Utilizing the foundational\nControlNet architecture, we maintain the structural integrity of the video\nduring editing. The incorporation of an additional appearance branch enables\nusers to exert fine-grained control over the edited key frame. These two side\nbranches seamlessly integrate into the main branch, which is constructed upon\nexisting text-to-image (T2I) generation models, through learnable temporal\nlayers. The versatility of our framework is demonstrated through a diverse\nrange of choices in both structure representations and personalized T2I models,\nas well as the option to provide the edited key frame. To facilitate\ncomprehensive evaluation, we introduce the BalanceCC benchmark dataset,\ncomprising 100 videos and 4 target prompts for each video. Our extensive user\nstudies compare CCEdit with eight state-of-the-art video editing methods. The\noutcomes demonstrate CCEdit's substantial superiority over all other methods.\n","authors":["Ruoyu Feng","Wenming Weng","Yanhui Wang","Yuhui Yuan","Jianmin Bao","Chong Luo","Zhibo Chen","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2309.16496v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12850v2","updated":"2024-04-07T02:18:23Z","published":"2023-10-19T14:04:53Z","title":"PrivImage: Differentially Private Synthetic Image Generation using\n Diffusion Models with Semantic-Aware Pretraining","summary":" Differential Privacy (DP) image data synthesis, which leverages the DP\ntechnique to generate synthetic data to replace the sensitive data, allowing\norganizations to share and utilize synthetic images without privacy concerns.\nPrevious methods incorporate the advanced techniques of generative models and\npre-training on a public dataset to produce exceptional DP image data, but\nsuffer from problems of unstable training and massive computational resource\ndemands. This paper proposes a novel DP image synthesis method, termed\nPRIVIMAGE, which meticulously selects pre-training data, promoting the\nefficient creation of DP datasets with high fidelity and utility. PRIVIMAGE\nfirst establishes a semantic query function using a public dataset. Then, this\nfunction assists in querying the semantic distribution of the sensitive\ndataset, facilitating the selection of data from the public dataset with\nanalogous semantics for pre-training. Finally, we pre-train an image generative\nmodel using the selected data and then fine-tune this model on the sensitive\ndataset using Differentially Private Stochastic Gradient Descent (DP-SGD).\nPRIVIMAGE allows us to train a lightly parameterized generative model, reducing\nthe noise in the gradient during DP-SGD training and enhancing training\nstability. 
Extensive experiments demonstrate that PRIVIMAGE uses only 1% of the\npublic dataset for pre-training and 7.6% of the parameters in the generative\nmodel compared to the state-of-the-art method, whereas achieves superior\nsynthetic performance and conserves more computational resources. On average,\nPRIVIMAGE achieves 30.1% lower FID and 12.6% higher Classification Accuracy\nthan the state-of-the-art method. The replication package and datasets can be\naccessed online.\n","authors":["Kecen Li","Chen Gong","Zhixiang Li","Yuzhong Zhao","Xinwen Hou","Tianhao Wang"],"pdf_url":"https://arxiv.org/pdf/2311.12850v2.pdf","comment":"Accepted at USENIX Security 2024"},{"id":"http://arxiv.org/abs/2404.04785v1","updated":"2024-04-07T02:15:43Z","published":"2024-04-07T02:15:43Z","title":"Rethinking Diffusion Model for Multi-Contrast MRI Super-Resolution","summary":" Recently, diffusion models (DM) have been applied in magnetic resonance\nimaging (MRI) super-resolution (SR) reconstruction, exhibiting impressive\nperformance, especially with regard to detailed reconstruction. However, the\ncurrent DM-based SR reconstruction methods still face the following issues: (1)\nThey require a large number of iterations to reconstruct the final image, which\nis inefficient and consumes a significant amount of computational resources.\n(2) The results reconstructed by these methods are often misaligned with the\nreal high-resolution images, leading to remarkable distortion in the\nreconstructed MR images. To address the aforementioned issues, we propose an\nefficient diffusion model for multi-contrast MRI SR, named as DiffMSR.\nSpecifically, we apply DM in a highly compact low-dimensional latent space to\ngenerate prior knowledge with high-frequency detail information. The highly\ncompact latent space ensures that DM requires only a few simple iterations to\nproduce accurate prior knowledge. In addition, we design the Prior-Guide Large\nWindow Transformer (PLWformer) as the decoder for DM, which can extend the\nreceptive field while fully utilizing the prior knowledge generated by DM to\nensure that the reconstructed MR image remains undistorted. Extensive\nexperiments on public and clinical datasets demonstrate that our DiffMSR\noutperforms state-of-the-art methods.\n","authors":["Guangyuan Li","Chen Rao","Juncheng Mo","Zhanjie Zhang","Wei Xing","Lei Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.04785v1.pdf","comment":"14 pages, 12 figures, Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.00674v2","updated":"2024-04-07T01:56:15Z","published":"2024-03-31T12:45:23Z","title":"Knowledge NeRF: Few-shot Novel View Synthesis for Dynamic Articulated\n Objects","summary":" We present Knowledge NeRF to synthesize novel views for dynamic scenes.\nReconstructing dynamic 3D scenes from few sparse views and rendering them from\narbitrary perspectives is a challenging problem with applications in various\ndomains. Previous dynamic NeRF methods learn the deformation of articulated\nobjects from monocular videos. However, qualities of their reconstructed scenes\nare limited. To clearly reconstruct dynamic scenes, we propose a new framework\nby considering two frames at a time.We pretrain a NeRF model for an articulated\nobject.When articulated objects moves, Knowledge NeRF learns to generate novel\nviews at the new state by incorporating past knowledge in the pretrained NeRF\nmodel with minimal observations in the present state. 
We propose a projection\nmodule to adapt NeRF for dynamic scenes, learning the correspondence between\npretrained knowledge base and current states. Experimental results demonstrate\nthe effectiveness of our method in reconstructing dynamic 3D scenes with 5\ninput images in one state. Knowledge NeRF is a new pipeline and promising\nsolution for novel view synthesis in dynamic articulated objects. The data and\nimplementation are publicly available at\nhttps://github.com/RussRobin/Knowledge_NeRF.\n","authors":["Wenxiao Cai","Xinyue Lei","Xinyu He","Junming Leo Chen","Yangang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00674v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02449v2","updated":"2024-04-07T01:55:40Z","published":"2024-03-04T20:05:28Z","title":"Optimizing Illuminant Estimation in Dual-Exposure HDR Imaging","summary":" High dynamic range (HDR) imaging involves capturing a series of frames of the\nsame scene, each with different exposure settings, to broaden the dynamic range\nof light. This can be achieved through burst capturing or using staggered HDR\nsensors that capture long and short exposures simultaneously in the camera\nimage signal processor (ISP). Within camera ISP pipeline, illuminant estimation\nis a crucial step aiming to estimate the color of the global illuminant in the\nscene. This estimation is used in camera ISP white-balance module to remove\nundesirable color cast in the final image. Despite the multiple frames captured\nin the HDR pipeline, conventional illuminant estimation methods often rely only\non a single frame of the scene. In this paper, we explore leveraging\ninformation from frames captured with different exposure times. Specifically,\nwe introduce a simple feature extracted from dual-exposure images to guide\nilluminant estimators, referred to as the dual-exposure feature (DEF). To\nvalidate the efficiency of DEF, we employed two illuminant estimators using the\nproposed DEF: 1) a multilayer perceptron network (MLP), referred to as\nexposure-based MLP (EMLP), and 2) a modified version of the convolutional color\nconstancy (CCC) to integrate our DEF, that we call ECCC. Both EMLP and ECCC\nachieve promising results, in some cases surpassing prior methods that require\nhundreds of thousands or millions of parameters, with only a few hundred\nparameters for EMLP and a few thousand parameters for ECCC.\n","authors":["Mahmoud Afifi","Zhenhua Hu","Liang Liang"],"pdf_url":"https://arxiv.org/pdf/2403.02449v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20210v3","updated":"2024-04-07T01:25:09Z","published":"2023-10-31T06:19:09Z","title":"UWFormer: Underwater Image Enhancement via a Semi-Supervised Multi-Scale\n Transformer","summary":" Underwater images often exhibit poor quality, distorted color balance and low\ncontrast due to the complex and intricate interplay of light, water, and\nobjects. Despite the significant contributions of previous underwater\nenhancement techniques, there exist several problems that demand further\nimprovement: (i) The current deep learning methods rely on Convolutional Neural\nNetworks (CNNs) that lack the multi-scale enhancement, and global perception\nfield is also limited. (ii) The scarcity of paired real-world underwater\ndatasets poses a significant challenge, and the utilization of synthetic image\npairs could lead to overfitting. 
To address the aforementioned problems, this\npaper introduces a Multi-scale Transformer-based Network called UWFormer for\nenhancing images at multiple frequencies via semi-supervised learning, in which\nwe propose a Nonlinear Frequency-aware Attention mechanism and a Multi-Scale\nFusion Feed-forward Network for low-frequency enhancement. Besides, we\nintroduce a special underwater semi-supervised training strategy, where we\npropose a Subaqueous Perceptual Loss function to generate reliable pseudo\nlabels. Experiments using full-reference and non-reference underwater\nbenchmarks demonstrate that our method outperforms state-of-the-art methods in\nterms of both quantity and visual quality.\n","authors":["Weiwen Chen","Yingtie Lei","Shenghong Luo","Ziyang Zhou","Mingxian Li","Chi-Man Pun"],"pdf_url":"https://arxiv.org/pdf/2310.20210v3.pdf","comment":"Accepted by IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.04763v1","updated":"2024-04-07T00:28:13Z","published":"2024-04-07T00:28:13Z","title":"GenEARL: A Training-Free Generative Framework for Multimodal Event\n Argument Role Labeling","summary":" Multimodal event argument role labeling (EARL), a task that assigns a role\nfor each event participant (object) in an image is a complex challenge. It\nrequires reasoning over the entire image, the depicted event, and the\ninteractions between various objects participating in the event. Existing\nmodels heavily rely on high-quality event-annotated training data to understand\nthe event semantics and structures, and they fail to generalize to new event\ntypes and domains. In this paper, we propose GenEARL, a training-free\ngenerative framework that harness the power of the modern generative models to\nunderstand event task descriptions given image contexts to perform the EARL\ntask. Specifically, GenEARL comprises two stages of generative prompting with a\nfrozen vision-language model (VLM) and a frozen large language model (LLM).\nFirst, a generative VLM learns the semantics of the event argument roles and\ngenerates event-centric object descriptions based on the image. Subsequently, a\nLLM is prompted with the generated object descriptions with a predefined\ntemplate for EARL (i.e., assign an object with an event argument role). We show\nthat GenEARL outperforms the contrastive pretraining (CLIP) baseline by 9.4%\nand 14.2% accuracy for zero-shot EARL on the M2E2 and SwiG datasets,\nrespectively. In addition, we outperform CLIP-Event by 22% precision on M2E2\ndataset. The framework also allows flexible adaptation and generalization to\nunseen domains.\n","authors":["Hritik Bansal","Po-Nien Kung","P. Jeffrey Brantingham","Kai-Wei Chang","Nanyun Peng"],"pdf_url":"https://arxiv.org/pdf/2404.04763v1.pdf","comment":"20 pages, 15 Figures, 13 figures"},{"id":"http://arxiv.org/abs/2404.06332v1","updated":"2024-04-07T12:42:02Z","published":"2024-04-07T12:42:02Z","title":"X-VARS: Introducing Explainability in Football Refereeing with\n Multi-Modal Large Language Model","summary":" The rapid advancement of artificial intelligence has led to significant\nimprovements in automated decision-making. However, the increased performance\nof models often comes at the cost of explainability and transparency of their\ndecision-making processes. In this paper, we investigate the capabilities of\nlarge language models to explain decisions, using football refereeing as a\ntesting ground, given its decision complexity and subjectivity. 
We introduce\nthe Explainable Video Assistant Referee System, X-VARS, a multi-modal large\nlanguage model designed for understanding football videos from the point of\nview of a referee. X-VARS can perform a multitude of tasks, including video\ndescription, question answering, action recognition, and conducting meaningful\nconversations based on video content and in accordance with the Laws of the\nGame for football referees. We validate X-VARS on our novel dataset,\nSoccerNet-XFoul, which consists of more than 22k video-question-answer triplets\nannotated by over 70 experienced football referees. Our experiments and human\nstudy illustrate the impressive capabilities of X-VARS in interpreting complex\nfootball clips. Furthermore, we highlight the potential of X-VARS to reach\nhuman performance and support football referees in the future.\n","authors":["Jan Held","Hani Itani","Anthony Cioppa","Silvio Giancola","Bernard Ghanem","Marc Van Droogenbroeck"],"pdf_url":"https://arxiv.org/pdf/2404.06332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04818v1","updated":"2024-04-07T05:56:42Z","published":"2024-04-07T05:56:42Z","title":"DWE+: Dual-Way Matching Enhanced Framework for Multimodal Entity Linking","summary":" Multimodal entity linking (MEL) aims to utilize multimodal information\n(usually textual and visual information) to link ambiguous mentions to\nunambiguous entities in knowledge base. Current methods facing main issues:\n(1)treating the entire image as input may contain redundant information. (2)the\ninsufficient utilization of entity-related information, such as attributes in\nimages. (3)semantic inconsistency between the entity in knowledge base and its\nrepresentation. To this end, we propose DWE+ for multimodal entity linking.\nDWE+ could capture finer semantics and dynamically maintain semantic\nconsistency with entities. This is achieved by three aspects: (a)we introduce a\nmethod for extracting fine-grained image features by partitioning the image\ninto multiple local objects. Then, hierarchical contrastive learning is used to\nfurther align semantics between coarse-grained information(text and image) and\nfine-grained (mention and visual objects). (b)we explore ways to extract visual\nattributes from images to enhance fusion feature such as facial features and\nidentity. (c)we leverage Wikipedia and ChatGPT to capture the entity\nrepresentation, achieving semantic enrichment from both static and dynamic\nperspectives, which better reflects the real-world entity semantics.\nExperiments on Wikimel, Richpedia, and Wikidiverse datasets demonstrate the\neffectiveness of DWE+ in improving MEL performance. Specifically, we optimize\nthese datasets and achieve state-of-the-art performance on the enhanced\ndatasets. The code and enhanced datasets are released on\nhttps://github.com/season1blue/DWET\n","authors":["Shezheng Song","Shasha Li","Shan Zhao","Xiaopeng Li","Chengyu Wang","Jie Yu","Jun Ma","Tianwei Yan","Bin Ji","Xiaoguang Mao"],"pdf_url":"https://arxiv.org/pdf/2404.04818v1.pdf","comment":"under review on TOIS. arXiv admin note: substantial text overlap with\n arXiv:2312.11816"}]},"2024-04-06T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2311.06694v3","updated":"2024-04-06T22:14:25Z","published":"2023-11-12T00:21:58Z","title":"Which One? 
Leveraging Context Between Objects and Multiple Views for\n Language Grounding","summary":" When connecting objects and their language referents in an embodied 3D\nenvironment, it is important to note that: (1) an object can be better\ncharacterized by leveraging comparative information between itself and other\nobjects, and (2) an object's appearance can vary with camera position. As such,\nwe present the Multi-view Approach to Grounding in Context (MAGiC), which\nselects an object referent based on language that distinguishes between two\nsimilar objects. By pragmatically reasoning over both objects and across\nmultiple views of those objects, MAGiC improves over the state-of-the-art model\non the SNARE object reference task with a relative error reduction of 12.9\\%\n(representing an absolute improvement of 2.7\\%). Ablation studies show that\nreasoning jointly over object referent candidates and multiple views of each\nobject both contribute to improved accuracy. Code:\nhttps://github.com/rcorona/magic_snare/\n","authors":["Chancharik Mitra","Abrar Anwar","Rodolfo Corona","Dan Klein","Trevor Darrell","Jesse Thomason"],"pdf_url":"https://arxiv.org/pdf/2311.06694v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04745v1","updated":"2024-04-06T22:08:20Z","published":"2024-04-06T22:08:20Z","title":"Collaborative Feedback Discriminative Propagation for Video\n Super-Resolution","summary":" The key success of existing video super-resolution (VSR) methods stems mainly\nfrom exploring spatial and temporal information, which is usually achieved by a\nrecurrent propagation module with an alignment module. However, inaccurate\nalignment usually leads to aligned features with significant artifacts, which\nwill be accumulated during propagation and thus affect video restoration.\nMoreover, propagation modules only propagate the same timestep features forward\nor backward that may fail in case of complex motion or occlusion, limiting\ntheir performance for high-quality frame restoration. To address these issues,\nwe propose a collaborative feedback discriminative (CFD) method to correct\ninaccurate aligned features and model long -range spatial and temporal\ninformation for better video reconstruction. In detail, we develop a\ndiscriminative alignment correction (DAC) method to adaptively explore\ninformation and reduce the influences of the artifacts caused by inaccurate\nalignment. Then, we propose a collaborative feedback propagation (CFP) module\nthat employs feedback and gating mechanisms to better explore spatial and\ntemporal information of different timestep features from forward and backward\npropagation simultaneously. Finally, we embed the proposed DAC and CFP into\ncommonly used VSR networks to verify the effectiveness of our method.\nQuantitative and qualitative experiments on several benchmarks demonstrate that\nour method can improve the performance of existing VSR models while maintaining\na lower model complexity. 
The source code and pre-trained models will be\navailable at \\url{https://github.com/House-Leo/CFDVSR}.\n","authors":["Hao Li","Xiang Chen","Jiangxin Dong","Jinhui Tang","Jinshan Pan"],"pdf_url":"https://arxiv.org/pdf/2404.04745v1.pdf","comment":"Project website: https://github.com/House-Leo/CFDVSR"},{"id":"http://arxiv.org/abs/2404.04736v1","updated":"2024-04-06T21:39:49Z","published":"2024-04-06T21:39:49Z","title":"ProtoAL: Interpretable Deep Active Learning with prototypes for medical\n imaging","summary":" The adoption of Deep Learning algorithms in the medical imaging field is a\nprominent area of research, with high potential for advancing AI-based\nComputer-aided diagnosis (AI-CAD) solutions. However, current solutions face\nchallenges due to a lack of interpretability features and high data demands,\nprompting recent efforts to address these issues. In this study, we propose the\nProtoAL method, where we integrate an interpretable DL model into the Deep\nActive Learning (DAL) framework. This approach aims to address both challenges\nby focusing on the medical imaging context and utilizing an inherently\ninterpretable model based on prototypes. We evaluated ProtoAL on the Messidor\ndataset, achieving an area under the precision-recall curve of 0.79 while\nutilizing only 76.54\\% of the available labeled data. These capabilities can\nenhances the practical usability of a DL model in the medical field, providing\na means of trust calibration in domain experts and a suitable solution for\nlearning in the data scarcity context often found.\n","authors":["Iury B. de A. Santos","André C. P. L. F. de Carvalho"],"pdf_url":"https://arxiv.org/pdf/2404.04736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04734v1","updated":"2024-04-06T21:33:39Z","published":"2024-04-06T21:33:39Z","title":"Towards Generalized Entropic Sparsification for Convolutional Neural\n Networks","summary":" Convolutional neural networks (CNNs) are reported to be overparametrized. The\nsearch for optimal (minimal) and sufficient architecture is an NP-hard problem\nas the hyperparameter space for possible network configurations is vast. Here,\nwe introduce a layer-by-layer data-driven pruning method based on the\nmathematical idea aiming at a computationally-scalable entropic relaxation of\nthe pruning problem. The sparse subnetwork is found from the pre-trained (full)\nCNN using the network entropy minimization as a sparsity constraint. This\nallows deploying a numerically scalable algorithm with a sublinear scaling\ncost. The method is validated on several benchmarks (architectures): (i) MNIST\n(LeNet) with sparsity 55%-84% and loss in accuracy 0.1%-0.5%, and (ii) CIFAR-10\n(VGG-16, ResNet18) with sparsity 73-89% and loss in accuracy 0.1%-0.5%.\n","authors":["Tin Barisin","Illia Horenko"],"pdf_url":"https://arxiv.org/pdf/2404.04734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.07540v2","updated":"2024-04-06T19:56:42Z","published":"2023-03-14T00:05:08Z","title":"Tensor-based Multimodal Learning for Prediction of Pulmonary Arterial\n Wedge Pressure from Cardiac MRI","summary":" Heart failure is a serious and life-threatening condition that can lead to\nelevated pressure in the left ventricle. Pulmonary Arterial Wedge Pressure\n(PAWP) is an important surrogate marker indicating high pressure in the left\nventricle. PAWP is determined by Right Heart Catheterization (RHC) but it is an\ninvasive procedure. A non-invasive method is useful in quickly identifying\nhigh-risk patients from a large population. 
In this work, we develop a tensor\nlearning-based pipeline for identifying PAWP from multimodal cardiac Magnetic\nResonance Imaging (MRI). This pipeline extracts spatial and temporal features\nfrom high-dimensional scans. For quality control, we incorporate an epistemic\nuncertainty-based binning strategy to identify poor-quality training samples.\nTo improve the performance, we learn complementary information by integrating\nfeatures from multimodal data: cardiac MRI with short-axis and four-chamber\nviews, and Electronic Health Records. The experimental analysis on a large\ncohort of $1346$ subjects who underwent the RHC procedure for PAWP estimation\nindicates that the proposed pipeline has a diagnostic value and can produce\npromising performance with significant improvement over the baseline in\nclinical practice (i.e., $\\Delta$AUC $=0.10$, $\\Delta$Accuracy $=0.06$, and\n$\\Delta$MCC $=0.39$). The decision curve analysis further confirms the clinical\nutility of our method.\n","authors":["Prasun C. Tripathi","Mohammod N. I. Suvon","Lawrence Schobs","Shuo Zhou","Samer Alabed","Andrew J. Swift","Haiping Lu"],"pdf_url":"https://arxiv.org/pdf/2303.07540v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04720v1","updated":"2024-04-06T19:50:48Z","published":"2024-04-06T19:50:48Z","title":"On Exploring PDE Modeling for Point Cloud Video Representation Learning","summary":" Point cloud video representation learning is challenging due to complex\nstructures and unordered spatial arrangement. Traditional methods struggle with\nframe-to-frame correlations and point-wise correspondence tracking. Recently,\npartial differential equations (PDE) have provided a new perspective in\nuniformly solving spatial-temporal data information within certain constraints.\nWhile tracking tangible point correspondence remains challenging, we propose to\nformalize point cloud video representation learning as a PDE-solving problem.\nInspired by fluid analysis, where PDEs are used to solve the deformation of\nspatial shape over time, we employ PDE to solve the variations of spatial\npoints affected by temporal information. By modeling spatial-temporal\ncorrelations, we aim to regularize spatial variations with temporal features,\nthereby enhancing representation learning in point cloud videos. We introduce\nMotion PointNet composed of a PointNet-like encoder and a PDE-solving module.\nInitially, we construct a lightweight yet effective encoder to model an initial\nstate of the spatial variations. Subsequently, we develop our PDE-solving\nmodule in a parameterized latent space, tailored to address the spatio-temporal\ncorrelations inherent in point cloud video. The process of solving PDE is\nguided and refined by a contrastive learning structure, which is pivotal in\nreshaping the feature distribution, thereby optimizing the feature\nrepresentation within point cloud video data. 
Remarkably, our Motion PointNet\nachieves an impressive accuracy of 97.52% on the MSRAction-3D dataset,\nsurpassing the current state-of-the-art in all aspects while consuming minimal\nresources (only 0.72M parameters and 0.82G FLOPs).\n","authors":["Zhuoxu Huang","Zhenkun Fan","Tao Xu","Jungong Han"],"pdf_url":"https://arxiv.org/pdf/2404.04720v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04718v1","updated":"2024-04-06T19:42:25Z","published":"2024-04-06T19:42:25Z","title":"Interpretable Multimodal Learning for Cardiovascular Hemodynamics\n Assessment","summary":" Pulmonary Arterial Wedge Pressure (PAWP) is an essential cardiovascular\nhemodynamics marker to detect heart failure. In clinical practice, Right Heart\nCatheterization is considered a gold standard for assessing cardiac\nhemodynamics while non-invasive methods are often needed to screen high-risk\npatients from a large population. In this paper, we propose a multimodal\nlearning pipeline to predict PAWP marker. We utilize complementary information\nfrom Cardiac Magnetic Resonance Imaging (CMR) scans (short-axis and\nfour-chamber) and Electronic Health Records (EHRs). We extract spatio-temporal\nfeatures from CMR scans using tensor-based learning. We propose a graph\nattention network to select important EHR features for prediction, where we\nmodel subjects as graph nodes and feature relationships as graph edges using\nthe attention mechanism. We design four feature fusion strategies: early,\nintermediate, late, and hybrid fusion. With a linear classifier and linear\nfusion strategies, our pipeline is interpretable. We validate our pipeline on a\nlarge dataset of $2,641$ subjects from our ASPIRE registry. The comparative\nstudy against state-of-the-art methods confirms the superiority of our\npipeline. The decision curve analysis further validates that our pipeline can\nbe applied to screen a large population. The code is available at\nhttps://github.com/prasunc/hemodynamics.\n","authors":["Prasun C Tripathi","Sina Tabakhi","Mohammod N I Suvon","Lawrence Schöb","Samer Alabed","Andrew J Swift","Shuo Zhou","Haiping Lu"],"pdf_url":"https://arxiv.org/pdf/2404.04718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02494v3","updated":"2024-04-06T18:10:26Z","published":"2023-07-16T19:36:19Z","title":"Adaptively Placed Multi-Grid Scene Representation Networks for\n Large-Scale Data Visualization","summary":" Scene representation networks (SRNs) have been recently proposed for\ncompression and visualization of scientific data. However, state-of-the-art\nSRNs do not adapt the allocation of available network parameters to the complex\nfeatures found in scientific data, leading to a loss in reconstruction quality.\nWe address this shortcoming with an adaptively placed multi-grid SRN (APMGSRN)\nand propose a domain decomposition training and inference technique for\naccelerated parallel training on multi-GPU systems. We also release an\nopen-source neural volume rendering application that allows plug-and-play\nrendering with any PyTorch-based SRN. Our proposed APMGSRN architecture uses\nmultiple spatially adaptive feature grids that learn where to be placed within\nthe domain to dynamically allocate more neural network resources where error is\nhigh in the volume, improving state-of-the-art reconstruction accuracy of SRNs\nfor scientific data without requiring expensive octree refining, pruning, and\ntraversal like previous adaptive models. 
In our domain decomposition approach\nfor representing large-scale data, we train a set of APMGSRNs in parallel on\nseparate bricks of the volume to reduce training time while avoiding the overhead\nnecessary for an out-of-core solution for volumes too large to fit in GPU\nmemory. After training, the lightweight SRNs are used for real-time neural\nvolume rendering in our open-source renderer, where arbitrary view angles and\ntransfer functions can be explored. A copy of this paper, all code, all models\nused in our experiments, and all supplemental materials and videos are\navailable at https://github.com/skywolf829/APMGSRN.\n","authors":["Skylar Wolfgang Wurster","Tianyu Xiong","Han-Wei Shen","Hanqi Guo","Tom Peterka"],"pdf_url":"https://arxiv.org/pdf/2308.02494v3.pdf","comment":"Accepted to IEEE VIS 2023.\n https://www.computer.org/csdl/journal/tg/2024/01/10297599/1RyYguiNBLO"},{"id":"http://arxiv.org/abs/2404.04693v1","updated":"2024-04-06T17:41:36Z","published":"2024-04-06T17:41:36Z","title":"OmniColor: A Global Camera Pose Optimization Approach of LiDAR-360Camera\n Fusion for Colorizing Point Clouds","summary":" A colored point cloud, as a simple and efficient 3D representation, has many\nadvantages in various fields, including robotic navigation and scene\nreconstruction. This representation is now commonly used in 3D reconstruction\ntasks relying on cameras and LiDARs. However, data from these two types\nof sensors is often fused poorly in existing frameworks, leading to\nunsatisfactory mapping results, mainly due to inaccurate camera poses. This\npaper presents OmniColor, a novel and efficient algorithm to colorize point\nclouds using an independent 360-degree camera. Given a LiDAR-based point cloud\nand a sequence of panorama images with initial coarse camera poses, our\nobjective is to jointly optimize the poses of all frames for mapping images\nonto geometric reconstructions. Our pipeline works in an off-the-shelf manner\nthat does not require any feature extraction or matching process. Instead, we\nfind optimal poses by directly maximizing the photometric consistency of LiDAR\nmaps. In experiments, we show that our method can overcome the severe visual\ndistortion of omnidirectional images and greatly benefit from the wide field of\nview (FOV) of 360-degree cameras to reconstruct various scenarios with accuracy\nand stability. The code will be released at\nhttps://github.com/liubonan123/OmniColor/.\n","authors":["Bonan Liu","Guoyang Zhao","Jianhao Jiao","Guang Cai","Chengyang Li","Handi Yin","Yuyang Wang","Ming Liu","Pan Hui"],"pdf_url":"https://arxiv.org/pdf/2404.04693v1.pdf","comment":"2024 IEEE International Conference on Robotics and Automation"},{"id":"http://arxiv.org/abs/2404.04687v1","updated":"2024-04-06T17:23:43Z","published":"2024-04-06T17:23:43Z","title":"Z-Splat: Z-Axis Gaussian Splatting for Camera-Sonar Fusion","summary":" Differentiable 3D-Gaussian splatting (GS) is emerging as a prominent\ntechnique in computer vision and graphics for reconstructing 3D scenes. GS\nrepresents a scene as a set of 3D Gaussians with varying opacities and employs\na computationally efficient splatting operation along with analytical\nderivatives to compute the 3D Gaussian parameters given scene images captured\nfrom various viewpoints. Unfortunately, capturing surround view ($360^{\\circ}$\nviewpoint) images is impossible or impractical in many real-world imaging\nscenarios, including underwater imaging, rooms inside a building, and\nautonomous navigation. 
In these restricted baseline imaging scenarios, the GS\nalgorithm suffers from a well-known 'missing cone' problem, which results in\npoor reconstruction along the depth axis. In this manuscript, we demonstrate\nthat using transient data (from sonars) allows us to address the missing cone\nproblem by sampling high-frequency data along the depth axis. We extend the\nGaussian splatting algorithms for two commonly used sonars and propose fusion\nalgorithms that simultaneously utilize RGB camera data and sonar data. Through\nsimulations, emulations, and hardware experiments across various imaging\nscenarios, we show that the proposed fusion algorithms lead to significantly\nbetter novel view synthesis (5 dB improvement in PSNR) and 3D geometry\nreconstruction (60% lower Chamfer distance).\n","authors":["Ziyuan Qu","Omkar Vengurlekar","Mohamad Qadri","Kevin Zhang","Michael Kaess","Christopher Metzler","Suren Jayasuriya","Adithya Pediredla"],"pdf_url":"https://arxiv.org/pdf/2404.04687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04686v1","updated":"2024-04-06T17:23:21Z","published":"2024-04-06T17:23:21Z","title":"Predictive Modeling for Breast Cancer Classification in the Context of\n Bangladeshi Patients: A Supervised Machine Learning Approach with Explainable\n AI","summary":" Breast cancer has rapidly increased in prevalence in recent years, making it\none of the leading causes of mortality worldwide. Among all cancers, it is by\nfar the most common. Diagnosing this illness manually requires significant time\nand expertise. Since detecting breast cancer is a time-consuming process,\npreventing its further spread can be aided by creating machine-based forecasts.\nMachine learning and Explainable AI are crucial in classification as they not\nonly provide accurate predictions but also offer insights into how the model\narrives at its decisions, aiding in the understanding and trustworthiness of\nthe classification results. In this study, we evaluate and compare the\nclassification accuracy, precision, recall, and F-1 scores of five different\nmachine learning methods using a primary dataset (500 patients from Dhaka\nMedical College Hospital). Five different supervised machine learning\ntechniques, including decision tree, random forest, logistic regression, naive\nbayes, and XGBoost, have been used to achieve optimal results on our dataset.\nAdditionally, this study applied SHAP analysis to the XGBoost model to\ninterpret the model's predictions and understand the impact of each feature on\nthe model's output. We compared the accuracy with which several algorithms\nclassified the data, as well as contrasted with other literature in this field.\nAfter final evaluation, this study found that XGBoost achieved the best model\naccuracy, which is 97%.\n","authors":["Taminul Islam","Md. Alif Sheakh","Mst. Sazia Tahosin","Most. Hasna Hena","Shopnil Akash","Yousef A. Bin Jardan","Gezahign Fentahun Wondmie","Hiba-Allah Nafidi","Mohammed Bourhia"],"pdf_url":"https://arxiv.org/pdf/2404.04686v1.pdf","comment":"Accepted for the Scientific Reports (Nature) journal. 
32 pages, 12\n figures"},{"id":"http://arxiv.org/abs/2404.04677v1","updated":"2024-04-06T16:48:08Z","published":"2024-04-06T16:48:08Z","title":"Salient Sparse Visual Odometry With Pose-Only Supervision","summary":" Visual Odometry (VO) is vital for the navigation of autonomous systems,\nproviding accurate position and orientation estimates at reasonable costs.\nWhile traditional VO methods excel in some conditions, they struggle with\nchallenges like variable lighting and motion blur. Deep learning-based VO,\nthough more adaptable, can face generalization problems in new environments.\nAddressing these drawbacks, this paper presents a novel hybrid visual odometry\n(VO) framework that leverages pose-only supervision, offering a balanced\nsolution between robustness and the need for extensive labeling. We propose two\ncost-effective and innovative designs: a self-supervised homographic\npre-training for enhancing optical flow learning from pose-only labels and a\nrandom patch-based salient point detection strategy for more accurate optical\nflow patch extraction. These designs eliminate the need for dense optical flow\nlabels for training and significantly improve the generalization capability of\nthe system in diverse and challenging environments. Our pose-only supervised\nmethod achieves competitive performance on standard datasets and greater\nrobustness and generalization ability in extreme and unseen scenarios, even\ncompared to dense optical flow-supervised state-of-the-art methods.\n","authors":["Siyu Chen","Kangcheng Liu","Chen Wang","Shenghai Yuan","Jianfei Yang","Lihua Xie"],"pdf_url":"https://arxiv.org/pdf/2404.04677v1.pdf","comment":"Accepted by IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2312.08591v2","updated":"2024-04-06T16:38:38Z","published":"2023-12-14T01:24:22Z","title":"Joint2Human: High-quality 3D Human Generation via Compact Spherical\n Embedding of 3D Joints","summary":" 3D human generation is increasingly significant in various applications.\nHowever, the direct use of 2D generative methods in 3D generation often results\nin losing local details, while methods that reconstruct geometry from generated\nimages struggle with global view consistency. In this work, we introduce\nJoint2Human, a novel method that leverages 2D diffusion models to generate\ndetailed 3D human geometry directly, ensuring both global structure and local\ndetails. To achieve this, we employ the Fourier occupancy field (FOF)\nrepresentation, enabling the direct generation of 3D shapes as preliminary\nresults with 2D generative models. With the proposed high-frequency enhancer\nand the multi-view recarving strategy, our method can seamlessly integrate the\ndetails from different views into a uniform global shape. To better utilize the\n3D human prior and enhance control over the generated geometry, we introduce a\ncompact spherical embedding of 3D joints. This allows for an effective guidance\nof pose during the generation process. Additionally, our method can generate 3D\nhumans guided by textual inputs. Our experimental results demonstrate the\ncapability of our method to ensure global structure, local details, high\nresolution, and low computational cost simultaneously. 
More results and the\ncode can be found on our project page at\nhttp://cic.tju.edu.cn/faculty/likun/projects/Joint2Human.\n","authors":["Muxin Zhang","Qiao Feng","Zhuo Su","Chao Wen","Zhou Xue","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2312.08591v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04673v1","updated":"2024-04-06T16:29:10Z","published":"2024-04-06T16:29:10Z","title":"Neural-ABC: Neural Parametric Models for Articulated Body with Clothes","summary":" In this paper, we introduce Neural-ABC, a novel parametric model based on\nneural implicit functions that can represent clothed human bodies with\ndisentangled latent spaces for identity, clothing, shape, and pose. Traditional\nmesh-based representations struggle to represent articulated bodies with\nclothes due to the diversity of human body shapes and clothing styles, as well\nas the complexity of poses. Our proposed model provides a unified framework for\nparametric modeling, which can represent the identity, clothing, shape and pose\nof the clothed human body. Our proposed approach utilizes the power of neural\nimplicit functions as the underlying representation and integrates\nwell-designed structures to meet the necessary requirements. Specifically, we\nrepresent the underlying body as a signed distance function and clothing as an\nunsigned distance function, and they can be uniformly represented as unsigned\ndistance fields. Different types of clothing do not require predefined\ntopological structures or classifications, and can follow changes in the\nunderlying body to fit the body. Additionally, we construct poses using a\ncontrollable articulated structure. The model is trained on both open and newly\nconstructed datasets, and our decoupling strategy is carefully designed to\nensure optimal performance. Our model excels at disentangling clothing and\nidentity in different shape and poses while preserving the style of the\nclothing. We demonstrate that Neural-ABC fits new observations of different\ntypes of clothing. Compared to other state-of-the-art parametric models,\nNeural-ABC demonstrates powerful advantages in the reconstruction of clothed\nhuman bodies, as evidenced by fitting raw scans, depth maps and images. We show\nthat the attributes of the fitted results can be further edited by adjusting\ntheir identities, clothing, shape and pose codes.\n","authors":["Honghu Chen","Yuxin Yao","Juyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04673v1.pdf","comment":"Accepted by IEEE Transactions on Visualization and Computer Graphics.\n Project page: https://ustc3dv.github.io/NeuralABC/"},{"id":"http://arxiv.org/abs/2402.15756v3","updated":"2024-04-06T16:04:51Z","published":"2024-02-24T08:07:48Z","title":"Detection Is Tracking: Point Cloud Multi-Sweep Deep Learning Models\n Revisited","summary":" Conventional tracking paradigm takes in instantaneous measurements such as\nrange and bearing, and produces object tracks across time. In applications such\nas autonomous driving, lidar measurements in the form of point clouds are\nusually passed through a \"virtual sensor\" realized by a deep learning model, to\nproduce \"measurements\" such as bounding boxes, which are in turn ingested by a\ntracking module to produce object tracks. Very often multiple lidar sweeps are\naccumulated in a buffer to merge and become the input to the virtual sensor. 
We\nargue in this paper that such an input already contains temporal information,\nand therefore the virtual sensor output should also contain temporal\ninformation, not just instantaneous values for the time corresponding to the\nend of the buffer. In particular, we present the deep learning model called\nMULti-Sweep PAired Detector (MULSPAD) that produces, for each detected object,\na pair of bounding boxes at both the end time and the beginning time of the\ninput buffer. This is achieved with fairly straightforward changes in commonly\nused lidar detection models, and with only marginal extra processing, but the\nresulting symmetry is satisfying. Such paired detections make it possible not\nonly to construct rudimentary trackers fairly easily, but also to construct\nmore sophisticated trackers that can exploit the extra information conveyed by\nthe pair and be robust to choices of motion models and object birth/death\nmodels. We have conducted preliminary training and experimentation using Waymo\nOpen Dataset, which shows the efficacy of our proposed method.\n","authors":["Lingji Chen"],"pdf_url":"https://arxiv.org/pdf/2402.15756v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04665v1","updated":"2024-04-06T15:48:14Z","published":"2024-04-06T15:48:14Z","title":"Adaptive Intra-Class Variation Contrastive Learning for Unsupervised\n Person Re-Identification","summary":" The memory dictionary-based contrastive learning method has achieved\nremarkable results in the field of unsupervised person Re-ID. However, The\nmethod of updating memory based on all samples does not fully utilize the\nhardest sample to improve the generalization ability of the model, and the\nmethod based on hardest sample mining will inevitably introduce false-positive\nsamples that are incorrectly clustered in the early stages of the model.\nClustering-based methods usually discard a significant number of outliers,\nleading to the loss of valuable information. In order to address the issues\nmentioned before, we propose an adaptive intra-class variation contrastive\nlearning algorithm for unsupervised Re-ID, called AdaInCV. And the algorithm\nquantitatively evaluates the learning ability of the model for each class by\nconsidering the intra-class variations after clustering, which helps in\nselecting appropriate samples during the training process of the model. To be\nmore specific, two new strategies are proposed: Adaptive Sample Mining (AdaSaM)\nand Adaptive Outlier Filter (AdaOF). The first one gradually creates more\nreliable clusters to dynamically refine the memory, while the second can\nidentify and filter out valuable outliers as negative samples.\n","authors":["Lingzhi Liu","Haiyang Zhang","Chengwei Tang","Tiantian Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04663v1","updated":"2024-04-06T15:31:57Z","published":"2024-04-06T15:31:57Z","title":"Focused Active Learning for Histopathological Image Classification","summary":" Active Learning (AL) has the potential to solve a major problem of digital\npathology: the efficient acquisition of labeled data for machine learning\nalgorithms. However, existing AL methods often struggle in realistic settings\nwith artifacts, ambiguities, and class imbalances, as commonly seen in the\nmedical field. The lack of precise uncertainty estimations leads to the\nacquisition of images with a low informative value. 
To address these\nchallenges, we propose Focused Active Learning (FocAL), which combines a\nBayesian Neural Network with Out-of-Distribution detection to estimate\ndifferent uncertainties for the acquisition function. Specifically, the\nweighted epistemic uncertainty accounts for the class imbalance, aleatoric\nuncertainty for ambiguous images, and an OoD score for artifacts. We perform\nextensive experiments to validate our method on MNIST and the real-world Panda\ndataset for the classification of prostate cancer. The results confirm that\nother AL methods are 'distracted' by ambiguities and artifacts which harm the\nperformance. FocAL effectively focuses on the most informative images, avoiding\nambiguities and artifacts during acquisition. For both experiments, FocAL\noutperforms existing AL approaches, reaching a Cohen's kappa of 0.764 with only\n0.69% of the labeled Panda data.\n","authors":["Arne Schmidt","Pablo Morales-Álvarez","Lee A. D. Cooper","Lee A. Newberg","Andinet Enquobahrie","Aggelos K. Katsaggelos","Rafael Molina"],"pdf_url":"https://arxiv.org/pdf/2404.04663v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00335v2","updated":"2024-04-06T15:25:51Z","published":"2024-03-30T12:10:34Z","title":"Learning Trimaps via Clicks for Image Matting","summary":" Despite significant advancements in image matting, existing models heavily\ndepend on manually-drawn trimaps for accurate results in natural image\nscenarios. However, the process of obtaining trimaps is time-consuming, lacking\nuser-friendliness and device compatibility. This reliance greatly limits the\npractical application of all trimap-based matting methods. To address this\nissue, we introduce Click2Trimap, an interactive model capable of predicting\nhigh-quality trimaps and alpha mattes with minimal user click inputs. Through\nanalyzing real users' behavioral logic and characteristics of trimaps, we\nsuccessfully propose a powerful iterative three-class training strategy and a\ndedicated simulation function, making Click2Trimap exhibit versatility across\nvarious scenarios. Quantitative and qualitative assessments on synthetic and\nreal-world matting datasets demonstrate Click2Trimap's superior performance\ncompared to all existing trimap-free matting methods. Especially, in the user\nstudy, Click2Trimap achieves high-quality trimap and matting predictions in\njust an average of 5 seconds per image, demonstrating its substantial practical\nvalue in real-world applications.\n","authors":["Chenyi Zhang","Yihan Hu","Henghui Ding","Humphrey Shi","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2404.00335v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04654v1","updated":"2024-04-06T15:14:25Z","published":"2024-04-06T15:14:25Z","title":"Music Recommendation Based on Facial Emotion Recognition","summary":" Introduction: Music provides an incredible avenue for individuals to express\ntheir thoughts and emotions, while also serving as a delightful mode of\nentertainment for enthusiasts and music lovers. Objectives: This paper presents\na comprehensive approach to enhancing the user experience through the\nintegration of emotion recognition, music recommendation, and explainable AI\nusing GRAD-CAM. Methods: The proposed methodology utilizes a ResNet50 model\ntrained on the Facial Expression Recognition (FER) dataset, consisting of real\nimages of individuals expressing various emotions. Results: The system achieves\nan accuracy of 82% in emotion classification. 
By leveraging GRAD-CAM, the model\nprovides explanations for its predictions, allowing users to understand the\nreasoning behind the system's recommendations. The model is trained on both the FER\ndataset and a real user dataset, which include labelled facial expressions and real\nimages of individuals expressing various emotions. The training process\ninvolves pre-processing the input images, extracting features through\nconvolutional layers, reasoning with dense layers, and generating emotion\npredictions through the output layer. Conclusion: The proposed methodology,\nleveraging the ResNet50 model with ROI-based analysis and explainable AI\ntechniques, offers a robust and interpretable solution for facial emotion\ndetection.\n","authors":["Rajesh B","Keerthana V","Narayana Darapaneni","Anwesh Reddy P"],"pdf_url":"https://arxiv.org/pdf/2404.04654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.09760v2","updated":"2024-04-06T15:11:41Z","published":"2021-08-22T15:44:37Z","title":"Image Inpainting via Conditional Texture and Structure Dual Generation","summary":" Deep generative approaches have recently made considerable progress in image\ninpainting by introducing structure priors. Due to the lack of proper\ninteraction with image texture during structure reconstruction, however,\ncurrent solutions are incompetent in handling the cases with large corruptions,\nand they generally suffer from distorted results. In this paper, we propose a\nnovel two-stream network for image inpainting, which models the\nstructure-constrained texture synthesis and texture-guided structure\nreconstruction in a coupled manner so that they better leverage each other for\nmore plausible generation. Furthermore, to enhance the global consistency, a\nBi-directional Gated Feature Fusion (Bi-GFF) module is designed to exchange and\ncombine the structure and texture information and a Contextual Feature\nAggregation (CFA) module is developed to refine the generated contents by\nregion affinity learning and multi-scale feature aggregation. Qualitative and\nquantitative experiments on the CelebA, Paris StreetView and Places2 datasets\ndemonstrate the superiority of the proposed method. Our code is available at\nhttps://github.com/Xiefan-Guo/CTSDG.\n","authors":["Xiefan Guo","Hongyu Yang","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2108.09760v2.pdf","comment":"Accepted by ICCV 2021"},{"id":"http://arxiv.org/abs/2404.04653v1","updated":"2024-04-06T15:10:29Z","published":"2024-04-06T15:10:29Z","title":"HawkDrive: A Transformer-driven Visual Perception System for Autonomous\n Driving in Night Scene","summary":" Many established vision perception systems for autonomous driving scenarios\nignore the influence of light conditions, one of the key elements for driving\nsafety. To address this problem, we present HawkDrive, a novel perception\nsystem with hardware and software solutions. Hardware that utilizes stereo\nvision perception, which has been demonstrated to be a more reliable way of\nestimating depth information than monocular vision, is partnered with the edge\ncomputing device Nvidia Jetson Xavier AGX. Our software for low light\nenhancement, depth estimation, and semantic segmentation tasks is a\ntransformer-based neural network. Our software stack, which enables fast\ninference and noise reduction, is packaged into system modules in Robot\nOperating System 2 (ROS2). 
Our experimental results have shown that the\nproposed end-to-end system is effective in improving the depth estimation and\nsemantic segmentation performance. Our dataset and codes will be released at\nhttps://github.com/ZionGo6/HawkDrive.\n","authors":["Ziang Guo","Stepan Perminov","Mikhail Konenkov","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2404.04653v1.pdf","comment":"Accepted by IEEE IV 2024"},{"id":"http://arxiv.org/abs/2404.04650v1","updated":"2024-04-06T14:56:59Z","published":"2024-04-06T14:56:59Z","title":"InitNO: Boosting Text-to-Image Diffusion Models via Initial Noise\n Optimization","summary":" Recent strides in the development of diffusion models, exemplified by\nadvancements such as Stable Diffusion, have underscored their remarkable\nprowess in generating visually compelling images. However, the imperative of\nachieving a seamless alignment between the generated image and the provided\nprompt persists as a formidable challenge. This paper traces the root of these\ndifficulties to invalid initial noise, and proposes a solution in the form of\nInitial Noise Optimization (InitNO), a paradigm that refines this noise.\nConsidering text prompts, not all random noises are effective in synthesizing\nsemantically-faithful images. We design the cross-attention response score and\nthe self-attention conflict score to evaluate the initial noise, bifurcating\nthe initial latent space into valid and invalid sectors. A strategically\ncrafted noise optimization pipeline is developed to guide the initial noise\ntowards valid regions. Our method, validated through rigorous experimentation,\nshows a commendable proficiency in generating images in strict accordance with\ntext prompts. Our code is available at https://github.com/xiefan-guo/initno.\n","authors":["Xiefan Guo","Jinlin Liu","Miaomiao Cui","Jiankai Li","Hongyu Yang","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2404.04650v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2402.00863v3","updated":"2024-04-06T14:55:42Z","published":"2024-02-01T18:58:44Z","title":"Geometry Transfer for Stylizing Radiance Fields","summary":" Shape and geometric patterns are essential in defining stylistic identity.\nHowever, current 3D style transfer methods predominantly focus on transferring\ncolors and textures, often overlooking geometric aspects. In this paper, we\nintroduce Geometry Transfer, a novel method that leverages geometric\ndeformation for 3D style transfer. This technique employs depth maps to extract\na style guide, subsequently applied to stylize the geometry of radiance fields.\nMoreover, we propose new techniques that utilize geometric cues from the 3D\nscene, thereby enhancing aesthetic expressiveness and more accurately\nreflecting intended styles. Our extensive experiments show that Geometry\nTransfer enables a broader and more expressive range of stylizations, thereby\nsignificantly expanding the scope of 3D style transfer.\n","authors":["Hyunyoung Jung","Seonghyeon Nam","Nikolaos Sarafianos","Sungjoo Yoo","Alexander Sorkine-Hornung","Rakesh Ranjan"],"pdf_url":"https://arxiv.org/pdf/2402.00863v3.pdf","comment":"CVPR 2024. 
Project page: https://hyblue.github.io/geo-srf/"},{"id":"http://arxiv.org/abs/2401.04747v2","updated":"2024-04-06T14:53:51Z","published":"2024-01-09T11:38:18Z","title":"DiffSHEG: A Diffusion-Based Approach for Real-Time Speech-driven\n Holistic 3D Expression and Gesture Generation","summary":" We propose DiffSHEG, a Diffusion-based approach for Speech-driven Holistic 3D\nExpression and Gesture generation with arbitrary length. While previous works\nfocused on co-speech gesture or expression generation individually, the joint\ngeneration of synchronized expressions and gestures remains barely explored. To\naddress this, our diffusion-based co-speech motion generation transformer\nenables uni-directional information flow from expression to gesture,\nfacilitating improved matching of joint expression-gesture distributions.\nFurthermore, we introduce an outpainting-based sampling strategy for arbitrary\nlong sequence generation in diffusion models, offering flexibility and\ncomputational efficiency. Our method provides a practical solution that\nproduces high-quality synchronized expression and gesture generation driven by\nspeech. Evaluated on two public datasets, our approach achieves\nstate-of-the-art performance both quantitatively and qualitatively.\nAdditionally, a user study confirms the superiority of DiffSHEG over prior\napproaches. By enabling the real-time generation of expressive and synchronized\nmotions, DiffSHEG showcases its potential for various applications in the\ndevelopment of digital humans and embodied agents.\n","authors":["Junming Chen","Yunfei Liu","Jianan Wang","Ailing Zeng","Yu Li","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2401.04747v2.pdf","comment":"Accepted by CVPR 2024. Project page:\n https://jeremycjm.github.io/proj/DiffSHEG"},{"id":"http://arxiv.org/abs/2404.04647v1","updated":"2024-04-06T14:49:36Z","published":"2024-04-06T14:49:36Z","title":"Structured Gradient-based Interpretations via Norm-Regularized\n Adversarial Training","summary":" Gradient-based saliency maps have been widely used to explain the decisions\nof deep neural network classifiers. However, standard gradient-based\ninterpretation maps, including the simple gradient and integrated gradient\nalgorithms, often lack desired structures such as sparsity and connectedness in\ntheir application to real-world computer vision models. A frequently used\napproach to inducing sparsity structures into gradient-based saliency maps is\nto alter the simple gradient scheme using sparsification or norm-based\nregularization. A drawback with such post-processing methods is their\nfrequently-observed significant loss in fidelity to the original simple\ngradient map. In this work, we propose to apply adversarial training as an\nin-processing scheme to train neural networks with structured simple gradient\nmaps. We show a duality relation between the regularized norms of the\nadversarial perturbations and gradient-based maps, based on which we design\nadversarial training loss functions promoting sparsity and group-sparsity\nproperties in simple gradient maps. 
We present several numerical results to\nshow the influence of our proposed norm-based adversarial training methods on\nthe standard gradient-based maps of standard neural network architectures on\nbenchmark image datasets.\n","authors":["Shizhan Gong","Qi Dou","Farzan Farnia"],"pdf_url":"https://arxiv.org/pdf/2404.04647v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04643v1","updated":"2024-04-06T14:28:01Z","published":"2024-04-06T14:28:01Z","title":"Constrained 6-DoF Grasp Generation on Complex Shapes for Improved\n Dual-Arm Manipulation","summary":" Efficiently generating grasp poses tailored to specific regions of an object\nis vital for various robotic manipulation tasks, especially in a dual-arm\nsetup. This scenario presents a significant challenge due to the complex\ngeometries involved, requiring a deep understanding of the local geometry to\ngenerate grasps efficiently on the specified constrained regions. Existing\nmethods only explore settings involving table-top/small objects and require\naugmented datasets to train, limiting their performance on complex objects. We\npropose CGDF: Constrained Grasp Diffusion Fields, a diffusion-based grasp\ngenerative model that generalizes to objects with arbitrary geometries, as well\nas generates dense grasps on the target regions. CGDF uses a part-guided\ndiffusion approach that enables it to get high sample efficiency in constrained\ngrasping without explicitly training on massive constraint-augmented datasets.\nWe provide qualitative and quantitative comparisons using analytical metrics\nand in simulation, in both unconstrained and constrained settings to show that\nour method can generalize to generate stable grasps on complex objects,\nespecially useful for dual-arm manipulation settings, while existing methods\nstruggle to do so.\n","authors":["Gaurav Singh","Sanket Kalwar","Md Faizal Karim","Bipasha Sen","Nagamanikandan Govindan","Srinath Sridhar","K Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.04643v1.pdf","comment":"Project Page: https://constrained-grasp-diffusion.github.io/"},{"id":"http://arxiv.org/abs/2404.04635v1","updated":"2024-04-06T13:59:41Z","published":"2024-04-06T13:59:41Z","title":"A Deep Look Into -- Automated Lung X-Ray Abnormality Detection System","summary":" Introduction: The Automated Lung X-Ray Abnormality Detection System is an\napplication that distinguishes normal x-ray images from infected x-ray\nimages and highlights the area considered for prediction; with the recent pandemic,\nthere is a need for non-conventional methods that detect diseases faster, a purpose that\nX-ray imaging serves. Objectives: In the current situation, any infectious viral disease\nis a potential pandemic, so there is a need for a cheap and early\ndetection system. Methods: This research helps to ease the work of experts\nin further analysis. The accuracy of three different pre-existing models, namely\nDenseNet, MobileNet and VGG16, was high, but the models over-fitted, primarily due to the\nblack-and-white images. Results: This led to building a new method,\nV-BreathNet, which gave more than 96% accuracy. Conclusion: Thus, it can\nbe stated that not all state-of-the-art CNN models can be used on B/W images.\n","authors":["Nagullas KS","Vivekanand. 
V","Narayana Darapaneni","Anwesh R P"],"pdf_url":"https://arxiv.org/pdf/2404.04635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04629v1","updated":"2024-04-06T13:25:29Z","published":"2024-04-06T13:25:29Z","title":"DifFUSER: Diffusion Model for Robust Multi-Sensor Fusion in 3D Object\n Detection and BEV Segmentation","summary":" Diffusion models have recently gained prominence as powerful deep generative\nmodels, demonstrating unmatched performance across various domains. However,\ntheir potential in multi-sensor fusion remains largely unexplored. In this\nwork, we introduce DifFUSER, a novel approach that leverages diffusion models\nfor multi-modal fusion in 3D object detection and BEV map segmentation.\nBenefiting from the inherent denoising property of diffusion, DifFUSER is able\nto refine or even synthesize sensor features in case of sensor malfunction,\nthereby improving the quality of the fused output. In terms of architecture,\nour DifFUSER blocks are chained together in a hierarchical BiFPN fashion,\ntermed cMini-BiFPN, offering an alternative architecture for latent diffusion.\nWe further introduce a Gated Self-conditioned Modulated (GSM) latent diffusion\nmodule together with a Progressive Sensor Dropout Training (PSDT) paradigm,\ndesigned to add stronger conditioning to the diffusion process and robustness\nto sensor failures. Our extensive evaluations on the Nuscenes dataset reveal\nthat DifFUSER not only achieves state-of-the-art performance with a 69.1% mIOU\nin BEV map segmentation tasks but also competes effectively with leading\ntransformer-based fusion techniques in 3D object detection.\n","authors":["Duy-Tho Le","Hengcan Shi","Jianfei Cai","Hamid Rezatofighi"],"pdf_url":"https://arxiv.org/pdf/2404.04629v1.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2404.04627v1","updated":"2024-04-06T13:25:00Z","published":"2024-04-06T13:25:00Z","title":"Self-Training Large Language Models for Improved Visual Program\n Synthesis With Visual Reinforcement","summary":" Visual program synthesis is a promising approach to exploit the reasoning\nabilities of large language models for compositional computer vision tasks.\nPrevious work has used few-shot prompting with frozen LLMs to synthesize visual\nprograms. Training an LLM to write better visual programs is an attractive\nprospect, but it is unclear how to accomplish this. No dataset of visual\nprograms for training exists, and acquisition of a visual program dataset\ncannot be easily crowdsourced due to the need for expert annotators. To get\naround the lack of direct supervision, we explore improving the program\nsynthesis abilities of an LLM using feedback from interactive experience. We\npropose a method where we exploit existing annotations for a vision-language\ntask to improvise a coarse reward signal for that task, treat the LLM as a\npolicy, and apply reinforced self-training to improve the visual program\nsynthesis ability of the LLM for that task. We describe a series of experiments\non object detection, compositional visual question answering, and image-text\nretrieval, and show that in each case, the self-trained LLM outperforms or\nperforms on par with few-shot frozen LLMs that are an order of magnitude\nlarger. 
Website: https://zaidkhan.me/ViReP\n","authors":["Zaid Khan","Vijay Kumar BG","Samuel Schulter","Yun Fu","Manmohan Chandraker"],"pdf_url":"https://arxiv.org/pdf/2404.04627v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04624v1","updated":"2024-04-06T13:14:04Z","published":"2024-04-06T13:14:04Z","title":"Bridging the Gap Between End-to-End and Two-Step Text Spotting","summary":" Modularity plays a crucial role in the development and maintenance of complex\nsystems. While end-to-end text spotting efficiently mitigates the issues of\nerror accumulation and sub-optimal performance seen in traditional two-step\nmethodologies, the two-step methods continue to be favored in many competitions\nand practical settings due to their superior modularity. In this paper, we\nintroduce Bridging Text Spotting, a novel approach that resolves the error\naccumulation and suboptimal performance issues in two-step methods while\nretaining modularity. To achieve this, we adopt a well-trained detector and\nrecognizer that are developed and trained independently and then lock their\nparameters to preserve their already acquired capabilities. Subsequently, we\nintroduce a Bridge that connects the locked detector and recognizer through a\nzero-initialized neural network. This zero-initialized neural network,\ninitialized with weights set to zeros, ensures seamless integration of the\nlarge receptive field features in detection into the locked recognizer.\nFurthermore, since the fixed detector and recognizer cannot naturally acquire\nend-to-end optimization features, we adopt the Adapter to facilitate their\nefficient learning of these features. We demonstrate the effectiveness of the\nproposed method through extensive experiments: Connecting the latest detector\nand recognizer through Bridging Text Spotting, we achieved an accuracy of 83.3%\non Total-Text, 69.8% on CTW1500, and 89.5% on ICDAR 2015. The code is available\nat https://github.com/mxin262/Bridging-Text-Spotting.\n","authors":["Mingxin Huang","Hongliang Li","Yuliang Liu","Xiang Bai","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2404.04624v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.04619v1","updated":"2024-04-06T12:51:00Z","published":"2024-04-06T12:51:00Z","title":"Do We Really Need a Complex Agent System? Distill Embodied Agent into a\n Single Model","summary":" With the power of large language models (LLMs), open-ended embodied agents\ncan flexibly understand human instructions, generate interpretable guidance\nstrategies, and output executable actions. Nowadays, Multi-modal Language\nModels~(MLMs) integrate multi-modal signals into LLMs, further bringing richer\nperception to entity agents and allowing embodied agents to perceive\nworld-understanding tasks more delicately. However, existing works: 1) operate\nindependently by agents, each containing multiple LLMs, from perception to\naction, resulting in gaps between complex tasks and execution; 2) train MLMs on\nstatic data, struggling with dynamics in open-ended scenarios; 3) input prior\nknowledge directly as prompts, suppressing application flexibility. We propose\nSTEVE-2, a hierarchical knowledge distillation framework for open-ended\nembodied tasks, characterized by 1) a hierarchical system for multi-granular\ntask division, 2) a mirrored distillation method for parallel simulation data,\nand 3) an extra expert model for bringing additional knowledge into parallel\nsimulation. 
After distillation, embodied agents can complete complex,\nopen-ended tasks without additional expert guidance, utilizing the performance\nand knowledge of a versatile MLM. Extensive evaluations on navigation and\ncreation tasks highlight the superior performance of STEVE-2 in open-ended\ntasks, with $1.4 \\times$ - $7.3 \\times$ in performance.\n","authors":["Zhonghan Zhao","Ke Ma","Wenhao Chai","Xuan Wang","Kewei Chen","Dongxu Guo","Yanting Zhang","Hongwei Wang","Gaoang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04619v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2403.08282"},{"id":"http://arxiv.org/abs/2404.04617v1","updated":"2024-04-06T12:50:08Z","published":"2024-04-06T12:50:08Z","title":"Empowering Image Recovery_ A Multi-Attention Approach","summary":" We propose Diverse Restormer (DART), a novel image restoration method that\neffectively integrates information from various sources (long sequences, local\nand global regions, feature dimensions, and positional dimensions) to address\nrestoration challenges. While Transformer models have demonstrated excellent\nperformance in image restoration due to their self-attention mechanism, they\nface limitations in complex scenarios. Leveraging recent advancements in\nTransformers and various attention mechanisms, our method utilizes customized\nattention mechanisms to enhance overall performance. DART, our novel network\narchitecture, employs windowed attention to mimic the selective focusing\nmechanism of human eyes. By dynamically adjusting receptive fields, it\noptimally captures the fundamental features crucial for image resolution\nreconstruction. Efficiency and performance balance are achieved through the\nLongIR attention mechanism for long sequence image restoration. Integration of\nattention mechanisms across feature and positional dimensions further enhances\nthe recovery of fine details. Evaluation across five restoration tasks\nconsistently positions DART at the forefront. Upon acceptance, we commit to\nproviding publicly accessible code and models to ensure reproducibility and\nfacilitate further research.\n","authors":["Juan Wen","Yawei Li","Chao Zhang","Weiyan Hou","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2404.04617v1.pdf","comment":"12 pages, 10 figures, 12 tables"},{"id":"http://arxiv.org/abs/2402.07710v3","updated":"2024-04-06T12:49:43Z","published":"2024-02-12T15:23:19Z","title":"Optimizing Sparse Convolution on GPUs with CUDA for 3D Point Cloud\n Processing in Embedded Systems","summary":" In recent years, there has been a significant increase in the utilization of\ndeep learning methods, particularly convolutional neural networks (CNNs), which\nhave emerged as the dominant approach in various domains that involve\nstructured grid data, such as picture analysis and processing. Nevertheless,\nthe exponential growth in the utilization of LiDAR and 3D sensors across many\ndomains has resulted in an increased need for the analysis of 3D point clouds.\nThe utilization of 3D point clouds is crucial in various applications,\nincluding object recognition and segmentation, as they offer a spatial\ndepiction of things within a three-dimensional environment. 
In contrast to\nphotos, point clouds exhibit sparsity and lack a regular grid, hence posing\ndistinct processing and computational issues.\n","authors":["Chester Luo","Kevin Lai"],"pdf_url":"https://arxiv.org/pdf/2402.07710v3.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2402.09329v3","updated":"2024-04-06T12:46:27Z","published":"2024-02-14T17:18:15Z","title":"YOLOv8-AM: YOLOv8 with Attention Mechanisms for Pediatric Wrist Fracture\n Detection","summary":" Wrist trauma and even fractures occur frequently in daily life, particularly\namong children who account for a significant proportion of fracture cases.\nBefore performing surgery, surgeons often request patients to undergo X-ray\nimaging first and prepare for it based on the analysis of the radiologist. With\nthe development of neural networks, You Only Look Once (YOLO) series models\nhave been widely used in fracture detection as computer-assisted diagnosis\n(CAD). In 2023, Ultralytics presented the latest version of the YOLO models,\nwhich has been employed for detecting fractures across various parts of the\nbody. Attention mechanism is one of the hottest methods to improve the model\nperformance. This research work proposes YOLOv8-AM, which incorporates the\nattention mechanism into the original YOLOv8 architecture. Specifically, we\nrespectively employ four attention modules, Convolutional Block Attention\nModule (CBAM), Global Attention Mechanism (GAM), Efficient Channel Attention\n(ECA), and Shuffle Attention (SA), to design the improved models and train them\non GRAZPEDWRI-DX dataset. Experimental results demonstrate that the mean\nAverage Precision at IoU 50 (mAP 50) of the YOLOv8-AM model based on ResBlock +\nCBAM (ResCBAM) increased from 63.6% to 65.8%, which achieves the\nstate-of-the-art (SOTA) performance. Conversely, YOLOv8-AM model incorporating\nGAM obtains the mAP 50 value of 64.2%, which is not a satisfactory enhancement.\nTherefore, we combine ResBlock and GAM, introducing ResGAM to design another\nnew YOLOv8-AM model, whose mAP 50 value is increased to 65.0%. The\nimplementation code for this study is available on GitHub at\nhttps://github.com/RuiyangJu/Fracture_Detection_Improved_YOLOv8.\n","authors":["Chun-Tse Chien","Rui-Yang Ju","Kuang-Yi Chou","Enkaer Xieerke","Jen-Shiun Chiang"],"pdf_url":"https://arxiv.org/pdf/2402.09329v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07063v3","updated":"2024-04-06T12:36:36Z","published":"2023-12-12T08:32:55Z","title":"Template Free Reconstruction of Human-object Interaction with Procedural\n Interaction Generation","summary":" Reconstructing human-object interaction in 3D from a single RGB image is a\nchallenging task and existing data driven methods do not generalize beyond the\nobjects present in the carefully curated 3D interaction datasets. Capturing\nlarge-scale real data to learn strong interaction and 3D shape priors is very\nexpensive due to the combinatorial nature of human-object interactions. In this\npaper, we propose ProciGen (Procedural interaction Generation), a method to\nprocedurally generate datasets with both, plausible interaction and diverse\nobject variation. We generate 1M+ human-object interaction pairs in 3D and\nleverage this large-scale data to train our HDM (Hierarchical Diffusion Model),\na novel method to reconstruct interacting human and unseen objects, without any\ntemplates. Our HDM is an image-conditioned diffusion model that learns both\nrealistic interaction and highly accurate human and object shapes. 
Experiments\nshow that our HDM trained with ProciGen significantly outperforms prior methods\nthat require template meshes and that our dataset allows training methods with\nstrong generalization ability to unseen object instances. Our code and data are\nreleased.\n","authors":["Xianghui Xie","Bharat Lal Bhatnagar","Jan Eric Lenssen","Gerard Pons-Moll"],"pdf_url":"https://arxiv.org/pdf/2312.07063v3.pdf","comment":"CVPR'24 camera ready version. 25 pages, 20 figures. Project page:\n https://virtualhumans.mpi-inf.mpg.de/procigen-hdm"},{"id":"http://arxiv.org/abs/2404.04608v1","updated":"2024-04-06T12:27:21Z","published":"2024-04-06T12:27:21Z","title":"Panoptic Perception: A Novel Task and Fine-grained Dataset for Universal\n Remote Sensing Image Interpretation","summary":" Current remote-sensing interpretation models often focus on a single task\nsuch as detection, segmentation, or captioning. However, such task-specific\nmodels cannot achieve a comprehensive multi-level\ninterpretation of images. The field also lacks support for multi-task joint\ninterpretation datasets. In this paper, we propose Panoptic Perception, a novel\ntask and a new fine-grained dataset (FineGrip) to achieve a more thorough and\nuniversal interpretation for RSIs. The new task, 1) integrates pixel-level,\ninstance-level, and image-level information for universal image perception, 2)\ncaptures image information from coarse to fine granularity, achieving deeper\nscene understanding and description, and 3) enables various independent tasks\nto complement and enhance each other through multi-task learning. By\nemphasizing multi-task interactions and the consistency of perception results,\nthis task enables the simultaneous processing of fine-grained foreground\ninstance segmentation, background semantic segmentation, and global\nfine-grained image captioning. Concretely, the FineGrip dataset includes 2,649\nremote sensing images, 12,054 fine-grained instance segmentation masks\nbelonging to 20 foreground things categories, 7,599 background semantic masks\nfor 5 stuff classes and 13,245 captioning sentences. Furthermore, we propose a\njoint optimization-based panoptic perception model. Experimental results on\nFineGrip demonstrate the feasibility of the panoptic perception task and the\nbeneficial effect of multi-task joint optimization on individual tasks. The\ndataset will be publicly available.\n","authors":["Danpei Zhao","Bo Yuan","Ziqiang Chen","Tian Li","Zhuoran Liu","Wentao Li","Yue Gao"],"pdf_url":"https://arxiv.org/pdf/2404.04608v1.pdf","comment":"Undergoing Review"},{"id":"http://arxiv.org/abs/2403.20002v2","updated":"2024-04-06T11:44:36Z","published":"2024-03-29T06:33:13Z","title":"Grounding and Enhancing Grid-based Models for Neural Fields","summary":" Many contemporary studies utilize grid-based models for neural field\nrepresentation, but a systematic analysis of grid-based models is still\nmissing, hindering the improvement of those models. Therefore, this paper\nintroduces a theoretical framework for grid-based models. This framework points\nout that these models' approximation and generalization behaviors are\ndetermined by grid tangent kernels (GTK), which are intrinsic properties of\ngrid-based models. The proposed framework facilitates a consistent and\nsystematic analysis of diverse grid-based models. Furthermore, the introduced\nframework motivates the development of a novel grid-based model named the\nMultiplicative Fourier Adaptive Grid (MulFAGrid). 
The numerical analysis\ndemonstrates that MulFAGrid exhibits a lower generalization bound than its\npredecessors, indicating its robust generalization performance. Empirical\nstudies reveal that MulFAGrid achieves state-of-the-art performance in various\ntasks, including 2D image fitting, 3D signed distance field (SDF)\nreconstruction, and novel view synthesis, demonstrating superior representation\nability. The project website is available at\nhttps://sites.google.com/view/cvpr24-2034-submission/home.\n","authors":["Zelin Zhao","Fenglei Fan","Wenlong Liao","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2403.20002v2.pdf","comment":"Accepted in CVPR24 as an oral presentation. Pre-rebuttal scores: 555.\n Post-rebuttal scores: 555"},{"id":"http://arxiv.org/abs/2404.04586v1","updated":"2024-04-06T10:50:02Z","published":"2024-04-06T10:50:02Z","title":"PIE: Physics-inspired Low-light Enhancement","summary":" In this paper, we propose a physics-inspired contrastive learning paradigm\nfor low-light enhancement, called PIE. PIE primarily addresses three issues:\n(i) To resolve the problem of existing learning-based methods often training a\nLLE model with strict pixel-correspondence image pairs, we eliminate the need\nfor pixel-correspondence paired training data and instead train with unpaired\nimages. (ii) To address the disregard for negative samples and the inadequacy\nof their generation in existing methods, we incorporate physics-inspired\ncontrastive learning for LLE and design the Bag of Curves (BoC) method to\ngenerate more reasonable negative samples that closely adhere to the underlying\nphysical imaging principle. (iii) To overcome the reliance on semantic ground\ntruths in existing methods, we propose an unsupervised regional segmentation\nmodule, ensuring regional brightness consistency while eliminating the\ndependency on semantic ground truths. Overall, the proposed PIE can effectively\nlearn from unpaired positive/negative samples and smoothly realize non-semantic\nregional enhancement, which is clearly different from existing LLE efforts.\nBesides the novel architecture of PIE, we explore the gain of PIE on downstream\ntasks such as semantic segmentation and face detection. Training on readily\navailable open data and extensive experiments demonstrate that our method\nsurpasses the state-of-the-art LLE models over six independent cross-scenes\ndatasets. PIE runs fast with reasonable GFLOPs in test time, making it easy to\nuse on mobile devices.\n","authors":["Dong Liang","Zhengyan Xu","Ling Li","Mingqiang Wei","Songcan Chen"],"pdf_url":"https://arxiv.org/pdf/2404.04586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04584v1","updated":"2024-04-06T10:45:02Z","published":"2024-04-06T10:45:02Z","title":"D$^3$: Scaling Up Deepfake Detection by Learning from Discrepancy","summary":" The boom of Generative AI brings opportunities entangled with risks and\nconcerns. In this work, we seek a step toward a universal deepfake detection\nsystem with better generalization and robustness, to accommodate the\nresponsible deployment of diverse image generative models. We do so by first\nscaling up the existing detection task setup from the one-generator to\nmultiple-generators in training, during which we disclose two challenges\npresented in prior methodological designs. 
Specifically, we reveal that the\ncurrent methods tailored for training on one specific generator either struggle\nto learn comprehensive artifacts from multiple generators or tend to sacrifice\ntheir ability to identify fake images from seen generators (i.e., In-Domain\nperformance) to exchange the generalization for unseen generators (i.e.,\nOut-Of-Domain performance). To tackle the above challenges, we propose our\nDiscrepancy Deepfake Detector (D$^3$) framework, whose core idea is to learn\nthe universal artifacts from multiple generators by introducing a parallel\nnetwork branch that takes a distorted image as extra discrepancy signal to\nsupplement its original counterpart. Extensive scaled-up experiments on the\nmerged UFD and GenImage datasets with six detection models demonstrate the\neffectiveness of our framework, achieving a 5.3% accuracy improvement in the\nOOD testing compared to the current SOTA methods while maintaining the ID\nperformance.\n","authors":["Yongqi Yang","Zhihao Qian","Ye Zhu","Yu Wu"],"pdf_url":"https://arxiv.org/pdf/2404.04584v1.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.04580v1","updated":"2024-04-06T10:30:31Z","published":"2024-04-06T10:30:31Z","title":"SDFR: Synthetic Data for Face Recognition Competition","summary":" Large-scale face recognition datasets are collected by crawling the Internet\nand without individuals' consent, raising legal, ethical, and privacy concerns.\nWith the recent advances in generative models, recently several works proposed\ngenerating synthetic face recognition datasets to mitigate concerns in\nweb-crawled face recognition datasets. This paper presents the summary of the\nSynthetic Data for Face Recognition (SDFR) Competition held in conjunction with\nthe 18th IEEE International Conference on Automatic Face and Gesture\nRecognition (FG 2024) and established to investigate the use of synthetic data\nfor training face recognition models. The SDFR competition was split into two\ntasks, allowing participants to train face recognition systems using new\nsynthetic datasets and/or existing ones. In the first task, the face\nrecognition backbone was fixed and the dataset size was limited, while the\nsecond task provided almost complete freedom on the model backbone, the\ndataset, and the training pipeline. The submitted models were trained on\nexisting and also new synthetic datasets and used clever methods to improve\ntraining with synthetic data. The submissions were evaluated and ranked on a\ndiverse set of seven benchmarking datasets. The paper gives an overview of the\nsubmitted face recognition models and reports achieved performance compared to\nbaseline models trained on real and synthetic datasets. Furthermore, the\nevaluation of submissions is extended to bias assessment across different\ndemography groups. 
Lastly, an outlook on the current state of the research in\ntraining face recognition models using synthetic data is presented, and\nexisting problems as well as potential future directions are also discussed.\n","authors":["Hatef Otroshi Shahreza","Christophe Ecabert","Anjith George","Alexander Unnervik","Sébastien Marcel","Nicolò Di Domenico","Guido Borghi","Davide Maltoni","Fadi Boutros","Julia Vogel","Naser Damer","Ángela Sánchez-Pérez"," EnriqueMas-Candela","Jorge Calvo-Zaragoza","Bernardo Biesseck","Pedro Vidal","Roger Granada","David Menotti","Ivan DeAndres-Tame","Simone Maurizio La Cava","Sara Concas","Pietro Melzi","Ruben Tolosana","Ruben Vera-Rodriguez","Gianpaolo Perelli","Giulia Orrù","Gian Luca Marcialis","Julian Fierrez"],"pdf_url":"https://arxiv.org/pdf/2404.04580v1.pdf","comment":"The 18th IEEE International Conference on Automatic Face and Gesture\n Recognition (FG 2024)"},{"id":"http://arxiv.org/abs/2404.04578v1","updated":"2024-04-06T10:16:33Z","published":"2024-04-06T10:16:33Z","title":"GLCM-Based Feature Combination for Extraction Model Optimization in\n Object Detection Using Machine Learning","summary":" In the era of modern technology, object detection using the Gray Level\nCo-occurrence Matrix (GLCM) extraction method plays a crucial role in object\nrecognition processes. It finds applications in real-time scenarios such as\nsecurity surveillance and autonomous vehicle navigation, among others.\nComputational efficiency becomes a critical factor in achieving real-time\nobject detection. Hence, there is a need for a detection model with low\ncomplexity and satisfactory accuracy. This research aims to enhance\ncomputational efficiency by selecting appropriate features within the GLCM\nframework. Two classification models, namely K-Nearest Neighbours (K-NN) and\nSupport Vector Machine (SVM), were employed, with the results indicating that\nK-Nearest Neighbours (K-NN) outperforms SVM in terms of computational\ncomplexity. Specifically, K-NN, when utilizing a combination of Correlation,\nEnergy, and Homogeneity features, achieves a 100% accuracy rate with low\ncomplexity. Moreover, when using a combination of Energy and Homogeneity\nfeatures, K-NN attains an almost perfect accuracy level of 99.9889%, while\nmaintaining low complexity. On the other hand, despite SVM achieving 100%\naccuracy in certain feature combinations, its high or very high complexity can\npose challenges, particularly in real-time applications. Therefore, based on\nthe trade-off between accuracy and complexity, the K-NN model with a\ncombination of Correlation, Energy, and Homogeneity features emerges as a more\nsuitable choice for real-time applications that demand high accuracy and low\ncomplexity. This research provides valuable insights for optimizing object\ndetection in various applications requiring both high accuracy and rapid\nresponsiveness.\n","authors":["Florentina Tatrin Kurniati","Daniel HF Manongga","Eko Sediyono","Sri Yulianto Joko Prasetyo","Roy Rudolf Huizen"],"pdf_url":"https://arxiv.org/pdf/2404.04578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00644v2","updated":"2024-04-06T10:10:33Z","published":"2024-03-01T16:25:17Z","title":"Diff-Plugin: Revitalizing Details for Diffusion-based Low-level Tasks","summary":" Diffusion models trained on large-scale datasets have achieved remarkable\nprogress in image synthesis. However, due to the randomness in the diffusion\nprocess, they often struggle with handling diverse low-level tasks that require\ndetails preservation. 
To overcome this limitation, we present a new Diff-Plugin\nframework to enable a single pre-trained diffusion model to generate\nhigh-fidelity results across a variety of low-level tasks. Specifically, we\nfirst propose a lightweight Task-Plugin module with a dual branch design to\nprovide task-specific priors, guiding the diffusion process in preserving image\ncontent. We then propose a Plugin-Selector that can automatically select\ndifferent Task-Plugins based on the text instruction, allowing users to edit\nimages by indicating multiple low-level tasks with natural language. We conduct\nextensive experiments on 8 low-level vision tasks. The results demonstrate the\nsuperiority of Diff-Plugin over existing methods, particularly in real-world\nscenarios. Our ablations further validate that Diff-Plugin is stable,\nschedulable, and supports robust training across different dataset sizes.\n","authors":["Yuhao Liu","Zhanghan Ke","Fang Liu","Nanxuan Zhao","Rynson W. H. Lau"],"pdf_url":"https://arxiv.org/pdf/2403.00644v2.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2212.06872v4","updated":"2024-04-06T09:27:45Z","published":"2022-12-13T19:38:13Z","title":"Comparing the Decision-Making Mechanisms by Transformers and CNNs via\n Explanation Methods","summary":" In order to gain insights about the decision-making of different visual\nrecognition backbones, we propose two methodologies, sub-explanation counting\nand cross-testing, that systematically applies deep explanation algorithms on a\ndataset-wide basis, and compares the statistics generated from the amount and\nnature of the explanations. These methodologies reveal the difference among\nnetworks in terms of two properties called compositionality and disjunctivism.\nTransformers and ConvNeXt are found to be more compositional, in the sense that\nthey jointly consider multiple parts of the image in building their decisions,\nwhereas traditional CNNs and distilled transformers are less compositional and\nmore disjunctive, which means that they use multiple diverse but smaller set of\nparts to achieve a confident prediction. Through further experiments, we\npinpointed the choice of normalization to be especially important in the\ncompositionality of a model, in that batch normalization leads to less\ncompositionality while group and layer normalization lead to more. Finally, we\nalso analyze the features shared by different backbones and plot a landscape of\ndifferent models based on their feature-use similarity.\n","authors":["Mingqi Jiang","Saeed Khorram","Li Fuxin"],"pdf_url":"https://arxiv.org/pdf/2212.06872v4.pdf","comment":"25 pages with 37 figures, to be published in CVPR24"},{"id":"http://arxiv.org/abs/2404.04565v1","updated":"2024-04-06T09:13:03Z","published":"2024-04-06T09:13:03Z","title":"SportsHHI: A Dataset for Human-Human Interaction Detection in Sports\n Videos","summary":" Video-based visual relation detection tasks, such as video scene graph\ngeneration, play important roles in fine-grained video understanding. However,\ncurrent video visual relation detection datasets have two main limitations that\nhinder the progress of research in this area. First, they do not explore\ncomplex human-human interactions in multi-person scenarios. Second, the\nrelation types of existing datasets have relatively low-level semantics and can\nbe often recognized by appearance or simple prior information, without the need\nfor detailed spatio-temporal context reasoning. 
Nevertheless, comprehending\nhigh-level interactions between humans is crucial for understanding complex\nmulti-person videos, such as sports and surveillance videos. To address this\nissue, we propose a new video visual relation detection task: video human-human\ninteraction detection, and build a dataset named SportsHHI for it. SportsHHI\ncontains 34 high-level interaction classes from basketball and volleyball\nsports. 118,075 human bounding boxes and 50,649 interaction instances are\nannotated on 11,398 keyframes. To benchmark this, we propose a two-stage\nbaseline method and conduct extensive experiments to reveal the key factors for\na successful human-human interaction detector. We hope that SportsHHI can\nstimulate research on human interaction understanding in videos and promote the\ndevelopment of spatio-temporal context modeling techniques in video visual\nrelation detection.\n","authors":["Tao Wu","Runyu He","Gangshan Wu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04565v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04564v1","updated":"2024-04-06T09:08:34Z","published":"2024-04-06T09:08:34Z","title":"Enhancing Video Summarization with Context Awareness","summary":" Video summarization is a crucial research area that aims to efficiently\nbrowse and retrieve relevant information from the vast amount of video content\navailable today. With the exponential growth of multimedia data, the ability to\nextract meaningful representations from videos has become essential. Video\nsummarization techniques automatically generate concise summaries by selecting\nkeyframes, shots, or segments that capture the video's essence. This process\nimproves the efficiency and accuracy of various applications, including video\nsurveillance, education, entertainment, and social media. Despite the\nimportance of video summarization, there is a lack of diverse and\nrepresentative datasets, hindering comprehensive evaluation and benchmarking of\nalgorithms. Existing evaluation metrics also fail to fully capture the\ncomplexities of video summarization, limiting accurate algorithm assessment and\nhindering the field's progress. To overcome data scarcity challenges and\nimprove evaluation, we propose an unsupervised approach that leverages video\ndata structure and information for generating informative summaries. By moving\naway from fixed annotations, our framework can produce representative summaries\neffectively. Moreover, we introduce an innovative evaluation pipeline tailored\nspecifically for video summarization. Human participants are involved in the\nevaluation, comparing our generated summaries to ground truth summaries and\nassessing their informativeness. This human-centric approach provides valuable\ninsights into the effectiveness of our proposed techniques. 
Experimental\nresults demonstrate that our training-free framework outperforms existing\nunsupervised approaches and achieves competitive results compared to\nstate-of-the-art supervised methods.\n","authors":["Hai-Dang Huynh-Lam","Ngoc-Phuong Ho-Thi","Minh-Triet Tran","Trung-Nghia Le"],"pdf_url":"https://arxiv.org/pdf/2404.04564v1.pdf","comment":"115 pages, 1 supplementary paper, undergraduate thesis report at\n US-VNUHCM"},{"id":"http://arxiv.org/abs/2404.04562v1","updated":"2024-04-06T09:03:18Z","published":"2024-04-06T09:03:18Z","title":"Diffusion Time-step Curriculum for One Image to 3D Generation","summary":" Score distillation sampling~(SDS) has been widely adopted to overcome the\nabsence of unseen views in reconstructing 3D objects from a \\textbf{single}\nimage. It leverages pre-trained 2D diffusion models as teacher to guide the\nreconstruction of student 3D models. Despite their remarkable success,\nSDS-based methods often encounter geometric artifacts and texture saturation.\nWe find out the crux is the overlooked indiscriminate treatment of diffusion\ntime-steps during optimization: it unreasonably treats the student-teacher\nknowledge distillation to be equal at all time-steps and thus entangles\ncoarse-grained and fine-grained modeling. Therefore, we propose the Diffusion\nTime-step Curriculum one-image-to-3D pipeline (DTC123), which involves both the\nteacher and student models collaborating with the time-step curriculum in a\ncoarse-to-fine manner. Extensive experiments on NeRF4, RealFusion15, GSO and\nLevel50 benchmark demonstrate that DTC123 can produce multi-view consistent,\nhigh-quality, and diverse 3D assets. Codes and more generation demos will be\nreleased in https://github.com/yxymessi/DTC123.\n","authors":["Xuanyu Yi","Zike Wu","Qingshan Xu","Pan Zhou","Joo-Hwee Lim","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04561v1","updated":"2024-04-06T09:01:19Z","published":"2024-04-06T09:01:19Z","title":"Co-Occ: Coupling Explicit Feature Fusion with Volume Rendering\n Regularization for Multi-Modal 3D Semantic Occupancy Prediction","summary":" 3D semantic occupancy prediction is a pivotal task in the field of autonomous\ndriving. Recent approaches have made great advances in 3D semantic occupancy\npredictions on a single modality. However, multi-modal semantic occupancy\nprediction approaches have encountered difficulties in dealing with the\nmodality heterogeneity, modality misalignment, and insufficient modality\ninteractions that arise during the fusion of different modalities data, which\nmay result in the loss of important geometric and semantic information. This\nletter presents a novel multi-modal, i.e., LiDAR-camera 3D semantic occupancy\nprediction framework, dubbed Co-Occ, which couples explicit LiDAR-camera\nfeature fusion with implicit volume rendering regularization. The key insight\nis that volume rendering in the feature space can proficiently bridge the gap\nbetween 3D LiDAR sweeps and 2D images while serving as a physical\nregularization to enhance LiDAR-camera fused volumetric representation.\nSpecifically, we first propose a Geometric- and Semantic-aware Fusion\n(GSFusion) module to explicitly enhance LiDAR features by incorporating\nneighboring camera features through a K-nearest neighbors (KNN) search. Then,\nwe employ volume rendering to project the fused feature back to the image\nplanes for reconstructing color and depth maps. 
These maps are then supervised\nby input images from the camera and depth estimations derived from LiDAR,\nrespectively. Extensive experiments on the popular nuScenes and SemanticKITTI\nbenchmarks verify the effectiveness of our Co-Occ for 3D semantic occupancy\nprediction. The project page is available at\nhttps://rorisis.github.io/Co-Occ_project-page/.\n","authors":["Jingyi Pan","Zipeng Wang","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00906v2","updated":"2024-04-06T08:52:55Z","published":"2024-04-01T04:21:01Z","title":"From Pixels to Graphs: Open-Vocabulary Scene Graph Generation with\n Vision-Language Models","summary":" Scene graph generation (SGG) aims to parse a visual scene into an\nintermediate graph representation for downstream reasoning tasks. Despite\nrecent advancements, existing methods struggle to generate scene graphs with\nnovel visual relation concepts. To address this challenge, we introduce a new\nopen-vocabulary SGG framework based on sequence generation. Our framework\nleverages vision-language pre-trained models (VLM) by incorporating an\nimage-to-graph generation paradigm. Specifically, we generate scene graph\nsequences via image-to-text generation with VLM and then construct scene graphs\nfrom these sequences. By doing so, we harness the strong capabilities of VLM\nfor open-vocabulary SGG and seamlessly integrate explicit relational modeling\nfor enhancing the VL tasks. Experimental results demonstrate that our design\nnot only achieves superior performance with an open vocabulary but also\nenhances downstream vision-language task performance through explicit relation\nmodeling knowledge.\n","authors":["Rongjie Li","Songyang Zhang","Dahua Lin","Kai Chen","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2404.00906v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04557v1","updated":"2024-04-06T08:51:07Z","published":"2024-04-06T08:51:07Z","title":"Learning Instance-Aware Correspondences for Robust Multi-Instance Point\n Cloud Registration in Cluttered Scenes","summary":" Multi-instance point cloud registration estimates the poses of multiple\ninstances of a model point cloud in a scene point cloud. Extracting accurate\npoint correspondence is to the center of the problem. Existing approaches\nusually treat the scene point cloud as a whole, overlooking the separation of\ninstances. Therefore, point features could be easily polluted by other points\nfrom the background or different instances, leading to inaccurate\ncorrespondences oblivious to separate instances, especially in cluttered\nscenes. In this work, we propose MIRETR, Multi-Instance REgistration\nTRansformer, a coarse-to-fine approach to the extraction of instance-aware\ncorrespondences. At the coarse level, it jointly learns instance-aware\nsuperpoint features and predicts per-instance masks. With instance masks, the\ninfluence from outside of the instance being concerned is minimized, such that\nhighly reliable superpoint correspondences can be extracted. The superpoint\ncorrespondences are then extended to instance candidates at the fine level\naccording to the instance masks. At last, an efficient candidate selection and\nrefinement algorithm is devised to obtain the final registrations. Extensive\nexperiments on three public benchmarks demonstrate the efficacy of our\napproach. In particular, MIRETR outperforms the state of the arts by 16.6\npoints on F1 score on the challenging ROBI benchmark. 
Code and models are\navailable at https://github.com/zhiyuanYU134/MIRETR.\n","authors":["Zhiyuan Yu","Zheng Qin","Lintao Zheng","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2404.04557v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04556v1","updated":"2024-04-06T08:45:07Z","published":"2024-04-06T08:45:07Z","title":"Rethinking Self-training for Semi-supervised Landmark Detection: A\n Selection-free Approach","summary":" Self-training is a simple yet effective method for semi-supervised learning,\nduring which pseudo-label selection plays an important role for handling\nconfirmation bias. Despite its popularity, applying self-training to landmark\ndetection faces three problems: 1) The selected confident pseudo-labels often\ncontain data bias, which may hurt model performance; 2) It is not easy to\ndecide a proper threshold for sample selection as the localization task can be\nsensitive to noisy pseudo-labels; 3) coordinate regression does not output\nconfidence, making selection-based self-training infeasible. To address the\nabove issues, we propose Self-Training for Landmark Detection (STLD), a method\nthat does not require explicit pseudo-label selection. Instead, STLD constructs\na task curriculum to deal with confirmation bias, which progressively\ntransitions from more confident to less confident tasks over the rounds of\nself-training. Pseudo pretraining and shrink regression are two essential\ncomponents for such a curriculum, where the former is the first task of the\ncurriculum for providing a better model initialization and the latter is\nfurther added in the later rounds to directly leverage the pseudo-labels in a\ncoarse-to-fine manner. Experiments on three facial and one medical landmark\ndetection benchmark show that STLD outperforms the existing methods\nconsistently in both semi- and omni-supervised settings.\n","authors":["Haibo Jin","Haoxuan Che","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.04556v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.01673v2","updated":"2024-04-06T08:41:46Z","published":"2024-04-02T06:24:21Z","title":"A Universal Knowledge Embedded Contrastive Learning Framework for\n Hyperspectral Image Classification","summary":" Hyperspectral image (HSI) classification techniques have been intensively\nstudied and a variety of models have been developed. However, these HSI\nclassification models are confined to pocket models and unrealistic ways of\ndatasets partitioning. The former limits the generalization performance of the\nmodel and the latter is partitioned leads to inflated model evaluation metrics,\nwhich results in plummeting model performance in the real world. Therefore, we\npropose a universal knowledge embedded contrastive learning framework (KnowCL)\nfor supervised, unsupervised, and semisupervised HSI classification, which\nlargely closes the gap of HSI classification models between pocket models and\nstandard vision backbones. We present a new HSI processing pipeline in\nconjunction with a range of data transformation and augmentation techniques\nthat provide diverse data representations and realistic data partitioning. The\nproposed framework based on this pipeline is compatible with all kinds of\nbackbones and can fully exploit labeled and unlabeled samples with expected\ntraining time. Furthermore, we design a new loss function, which can adaptively\nfuse the supervised loss and unsupervised loss, enhancing the learning\nperformance. 
This proposed new classification paradigm shows great potentials\nin exploring for HSI classification technology. The code can be accessed at\nhttps://github.com/quanweiliu/KnowCL.\n","authors":["Quanwei Liu","Yanni Dong","Tao Huang","Lefei Zhang","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2404.01673v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04550v1","updated":"2024-04-06T08:25:33Z","published":"2024-04-06T08:25:33Z","title":"NPB-REC: A Non-parametric Bayesian Deep-learning Approach for\n Undersampled MRI Reconstruction with Uncertainty Estimation","summary":" The ability to reconstruct high-quality images from undersampled MRI data is\nvital in improving MRI temporal resolution and reducing acquisition times. Deep\nlearning methods have been proposed for this task, but the lack of verified\nmethods to quantify the uncertainty in the reconstructed images hampered\nclinical applicability. We introduce \"NPB-REC\", a non-parametric fully Bayesian\nframework, for MRI reconstruction from undersampled data with uncertainty\nestimation. We use Stochastic Gradient Langevin Dynamics during training to\ncharacterize the posterior distribution of the network parameters. This enables\nus to both improve the quality of the reconstructed images and quantify the\nuncertainty in the reconstructed images. We demonstrate the efficacy of our\napproach on a multi-coil MRI dataset from the fastMRI challenge and compare it\nto the baseline End-to-End Variational Network (E2E-VarNet). Our approach\noutperforms the baseline in terms of reconstruction accuracy by means of PSNR\nand SSIM ($34.55$, $0.908$ vs. $33.08$, $0.897$, $p<0.01$, acceleration rate\n$R=8$) and provides uncertainty measures that correlate better with the\nreconstruction error (Pearson correlation, $R=0.94$ vs. $R=0.91$).\nAdditionally, our approach exhibits better generalization capabilities against\nanatomical distribution shifts (PSNR and SSIM of $32.38$, $0.849$ vs. $31.63$,\n$0.836$, $p<0.01$, training on brain data, inference on knee data, acceleration\nrate $R=8$). NPB-REC has the potential to facilitate the safe utilization of\ndeep learning-based methods for MRI reconstruction from undersampled data. Code\nand trained models are available at \\url{https://github.com/samahkh/NPB-REC}.\n","authors":["Samah Khawaled","Moti Freiman"],"pdf_url":"https://arxiv.org/pdf/2404.04550v1.pdf","comment":"Published in Artificial Intelligence in Medicine, DOI:\n https://doi.org/10.1016/j.artmed.2024.102798 This is an extension\n representing a more comprehensive work extending preliminary work presented\n at arXiv:2208.03966"},{"id":"http://arxiv.org/abs/2404.04546v1","updated":"2024-04-06T08:02:18Z","published":"2024-04-06T08:02:18Z","title":"A self-attention model for robust rigid slice-to-volume registration of\n functional MRI","summary":" Functional Magnetic Resonance Imaging (fMRI) is vital in neuroscience,\nenabling investigations into brain disorders, treatment monitoring, and brain\nfunction mapping. However, head motion during fMRI scans, occurring between\nshots of slice acquisition, can result in distortion, biased analyses, and\nincreased costs due to the need for scan repetitions. Therefore, retrospective\nslice-level motion correction through slice-to-volume registration (SVR) is\ncrucial. Previous studies have utilized deep learning (DL) based models to\naddress the SVR task; however, they overlooked the uncertainty stemming from\nthe input stack of slices and did not assign weighting or scoring to each\nslice. 
In this work, we introduce an end-to-end SVR model for aligning 2D fMRI\nslices with a 3D reference volume, incorporating a self-attention mechanism to\nenhance robustness against input data variations and uncertainties. It utilizes\nindependent slice and volume encoders and a self-attention module to assign\npixel-wise scores for each slice. We conducted evaluation experiments on 200\nimages involving synthetic rigid motion generated from 27 subjects belonging to\nthe test set, from the publicly available Healthy Brain Network (HBN) dataset.\nOur experimental results demonstrate that our model achieves competitive\nperformance in terms of alignment accuracy compared to state-of-the-art deep\nlearning-based methods (Euclidean distance of $0.93$ [mm] vs. $1.86$ [mm]).\nFurthermore, our approach exhibits significantly faster registration speed\ncompared to conventional iterative methods ($0.096$ sec. vs. $1.17$ sec.). Our\nend-to-end SVR model facilitates real-time head motion tracking during fMRI\nacquisition, ensuring reliability and robustness against uncertainties in\ninputs. source code, which includes the training and evaluations, will be\navailable soon.\n","authors":["Samah Khawaled","Simon K. Warfield","Moti Freiman"],"pdf_url":"https://arxiv.org/pdf/2404.04546v1.pdf","comment":"Currently under review"},{"id":"http://arxiv.org/abs/2404.04544v1","updated":"2024-04-06T07:53:49Z","published":"2024-04-06T07:53:49Z","title":"BeyondScene: Higher-Resolution Human-Centric Scene Generation With\n Pretrained Diffusion","summary":" Generating higher-resolution human-centric scenes with details and controls\nremains a challenge for existing text-to-image diffusion models. This challenge\nstems from limited training image size, text encoder capacity (limited tokens),\nand the inherent difficulty of generating complex scenes involving multiple\nhumans. While current methods attempted to address training size limit only,\nthey often yielded human-centric scenes with severe artifacts. We propose\nBeyondScene, a novel framework that overcomes prior limitations, generating\nexquisite higher-resolution (over 8K) human-centric scenes with exceptional\ntext-image correspondence and naturalness using existing pretrained diffusion\nmodels. BeyondScene employs a staged and hierarchical approach to initially\ngenerate a detailed base image focusing on crucial elements in instance\ncreation for multiple humans and detailed descriptions beyond token limit of\ndiffusion model, and then to seamlessly convert the base image to a\nhigher-resolution output, exceeding training image size and incorporating\ndetails aware of text and instances via our novel instance-aware hierarchical\nenlargement process that consists of our proposed high-frequency injected\nforward diffusion and adaptive joint diffusion. BeyondScene surpasses existing\nmethods in terms of correspondence with detailed text descriptions and\nnaturalness, paving the way for advanced applications in higher-resolution\nhuman-centric scene creation beyond the capacity of pretrained diffusion models\nwithout costly retraining. 
Project page:\nhttps://janeyeon.github.io/beyond-scene.\n","authors":["Gwanghyun Kim","Hayeon Kim","Hoigi Seo","Dong Un Kang","Se Young Chun"],"pdf_url":"https://arxiv.org/pdf/2404.04544v1.pdf","comment":"Project page: https://janeyeon.github.io/beyond-scene"},{"id":"http://arxiv.org/abs/2404.03527v2","updated":"2024-04-06T07:49:14Z","published":"2024-04-04T15:31:11Z","title":"HAPNet: Toward Superior RGB-Thermal Scene Parsing via Hybrid,\n Asymmetric, and Progressive Heterogeneous Feature Fusion","summary":" Data-fusion networks have shown significant promise for RGB-thermal scene\nparsing. However, the majority of existing studies have relied on symmetric\nduplex encoders for heterogeneous feature extraction and fusion, paying\ninadequate attention to the inherent differences between RGB and thermal\nmodalities. Recent progress in vision foundation models (VFMs) trained through\nself-supervision on vast amounts of unlabeled data has proven their ability to\nextract informative, general-purpose features. However, this potential has yet\nto be fully leveraged in the domain. In this study, we take one step toward\nthis new research area by exploring a feasible strategy to fully exploit VFM\nfeatures for RGB-thermal scene parsing. Specifically, we delve deeper into the\nunique characteristics of RGB and thermal modalities, thereby designing a\nhybrid, asymmetric encoder that incorporates both a VFM and a convolutional\nneural network. This design allows for more effective extraction of\ncomplementary heterogeneous features, which are subsequently fused in a\ndual-path, progressive manner. Moreover, we introduce an auxiliary task to\nfurther enrich the local semantics of the fused features, thereby improving the\noverall performance of RGB-thermal scene parsing. Our proposed HAPNet, equipped\nwith all these components, demonstrates superior performance compared to all\nother state-of-the-art RGB-thermal scene parsing networks, achieving top ranks\nacross three widely used public RGB-thermal scene parsing datasets. We believe\nthis new paradigm has opened up new opportunities for future developments in\ndata-fusion scene parsing approaches.\n","authors":["Jiahang Li","Peng Yun","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2404.03527v2.pdf","comment":"12 pages, 4figures"},{"id":"http://arxiv.org/abs/2404.04531v1","updated":"2024-04-06T07:13:49Z","published":"2024-04-06T07:13:49Z","title":"Frequency Decomposition-Driven Unsupervised Domain Adaptation for Remote\n Sensing Image Semantic Segmentation","summary":" Cross-domain semantic segmentation of remote sensing (RS) imagery based on\nunsupervised domain adaptation (UDA) techniques has significantly advanced\ndeep-learning applications in the geosciences. Recently, with its ingenious and\nversatile architecture, the Transformer model has been successfully applied in\nRS-UDA tasks. However, existing UDA methods mainly focus on domain alignment in\nthe high-level feature space. It is still challenging to retain cross-domain\nlocal spatial details and global contextual semantics simultaneously, which is\ncrucial for the RS image semantic segmentation task. To address these problems,\nwe propose novel high/low-frequency decomposition (HLFD) techniques to guide\nrepresentation alignment in cross-domain semantic segmentation. Specifically,\nHLFD attempts to decompose the feature maps into high- and low-frequency\ncomponents before performing the domain alignment in the corresponding\nsubspaces. 
Secondly, to further facilitate the alignment of decomposed\nfeatures, we propose a fully global-local generative adversarial network,\nnamely GLGAN, to learn domain-invariant detailed and semantic features across\ndomains by leveraging global-local transformer blocks (GLTBs). By integrating\nHLFD techniques and the GLGAN, a novel UDA framework called FD-GLGAN is\ndeveloped to improve the cross-domain transferability and generalization\ncapability of semantic segmentation models. Extensive experiments on two\nfine-resolution benchmark datasets, namely ISPRS Potsdam and ISPRS Vaihingen,\nhighlight the effectiveness and superiority of the proposed approach as\ncompared to the state-of-the-art UDA methods. The source code for this work\nwill be accessible at https://github.com/sstary/SSRS.\n","authors":["Xianping Ma","Xiaokang Zhang","Xingchen Ding","Man-On Pun","Siwei Ma"],"pdf_url":"https://arxiv.org/pdf/2404.04531v1.pdf","comment":"28 pages, 13 figures"},{"id":"http://arxiv.org/abs/2312.01531v2","updated":"2024-04-06T07:04:29Z","published":"2023-12-03T23:09:38Z","title":"SANeRF-HQ: Segment Anything for NeRF in High Quality","summary":" Recently, the Segment Anything Model (SAM) has showcased remarkable\ncapabilities of zero-shot segmentation, while NeRF (Neural Radiance Fields) has\ngained popularity as a method for various 3D problems beyond novel view\nsynthesis. Though there exist initial attempts to incorporate these two methods\ninto 3D segmentation, they face the challenge of accurately and consistently\nsegmenting objects in complex scenarios. In this paper, we introduce the\nSegment Anything for NeRF in High Quality (SANeRF-HQ) to achieve high-quality\n3D segmentation of any target object in a given scene. SANeRF-HQ utilizes SAM\nfor open-world object segmentation guided by user-supplied prompts, while\nleveraging NeRF to aggregate information from different viewpoints. To overcome\nthe aforementioned challenges, we employ density field and RGB similarity to\nenhance the accuracy of segmentation boundary during the aggregation.\nEmphasizing on segmentation accuracy, we evaluate our method on multiple NeRF\ndatasets where high-quality ground-truths are available or manually annotated.\nSANeRF-HQ shows a significant quality improvement over state-of-the-art methods\nin NeRF object segmentation, provides higher flexibility for object\nlocalization, and enables more consistent object segmentation across multiple\nviews. Results and code are available at the project site:\nhttps://lyclyc52.github.io/SANeRF-HQ/.\n","authors":["Yichen Liu","Benran Hu","Chi-Keung Tang","Yu-Wing Tai"],"pdf_url":"https://arxiv.org/pdf/2312.01531v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04527v1","updated":"2024-04-06T06:49:55Z","published":"2024-04-06T06:49:55Z","title":"VTR: An Optimized Vision Transformer for SAR ATR Acceleration on FPGA","summary":" Synthetic Aperture Radar (SAR) Automatic Target Recognition (ATR) is a key\ntechnique used in military applications like remote-sensing image recognition.\nVision Transformers (ViTs) are the current state-of-the-art in various computer\nvision applications, outperforming their CNN counterparts. 
However, using ViTs\nfor SAR ATR applications is challenging due to (1) standard ViTs require\nextensive training data to generalize well due to their low locality; the\nstandard SAR datasets, however, have a limited number of labeled training data\nwhich reduces the learning capability of ViTs; (2) ViTs have a high parameter\ncount and are computation intensive which makes their deployment on\nresource-constrained SAR platforms difficult. In this work, we develop a\nlightweight ViT model that can be trained directly on small datasets without\nany pre-training by utilizing the Shifted Patch Tokenization (SPT) and Locality\nSelf-Attention (LSA) modules. We directly train this model on SAR datasets\nwhich have limited training samples to evaluate its effectiveness for SAR ATR\napplications. We evaluate our proposed model, that we call VTR (ViT for SAR\nATR), on three widely used SAR datasets: MSTAR, SynthWakeSAR, and GBSAR.\nFurther, we propose a novel FPGA accelerator for VTR, in order to enable\ndeployment for real-time SAR ATR applications.\n","authors":["Sachini Wickramasinghe","Dhruv Parikh","Bingyi Zhang","Rajgopal Kannan","Viktor Prasanna","Carl Busart"],"pdf_url":"https://arxiv.org/pdf/2404.04527v1.pdf","comment":"SPIE DCS 2024"},{"id":"http://arxiv.org/abs/2404.04526v1","updated":"2024-04-06T06:48:16Z","published":"2024-04-06T06:48:16Z","title":"DATENeRF: Depth-Aware Text-based Editing of NeRFs","summary":" Recent advancements in diffusion models have shown remarkable proficiency in\nediting 2D images based on text prompts. However, extending these techniques to\nedit scenes in Neural Radiance Fields (NeRF) is complex, as editing individual\n2D frames can result in inconsistencies across multiple views. Our crucial\ninsight is that a NeRF scene's geometry can serve as a bridge to integrate\nthese 2D edits. Utilizing this geometry, we employ a depth-conditioned\nControlNet to enhance the coherence of each 2D image modification. Moreover, we\nintroduce an inpainting approach that leverages the depth information of NeRF\nscenes to distribute 2D edits across different images, ensuring robustness\nagainst errors and resampling challenges. Our results reveal that this\nmethodology achieves more consistent, lifelike, and detailed edits than\nexisting leading methods for text-driven NeRF scene editing.\n","authors":["Sara Rojas","Julien Philip","Kai Zhang","Sai Bi","Fujun Luan","Bernard Ghanem","Kalyan Sunkavall"],"pdf_url":"https://arxiv.org/pdf/2404.04526v1.pdf","comment":"14 pages, Conference paper, 3D Scene Editing, Neural Rendering,\n Diffusion Models"},{"id":"http://arxiv.org/abs/2404.04518v1","updated":"2024-04-06T06:18:11Z","published":"2024-04-06T06:18:11Z","title":"MedIAnomaly: A comparative study of anomaly detection in medical images","summary":" Anomaly detection (AD) aims at detecting abnormal samples that deviate from\nthe expected normal patterns. Generally, it can be trained on merely normal\ndata without the requirement for abnormal samples, and thereby plays an\nimportant role in the recognition of rare diseases and health screening in the\nmedical domain. Despite numerous related studies, we observe a lack of a fair\nand comprehensive evaluation, which causes some ambiguous conclusions and\nhinders the development of this field. 
This paper focuses on building a\nbenchmark with unified implementation and comparison to address this problem.\nIn particular, seven medical datasets with five image modalities, including\nchest X-rays, brain MRIs, retinal fundus images, dermatoscopic images, and\nhistopathology whole slide images are organized for extensive evaluation.\nTwenty-seven typical AD methods, including reconstruction and self-supervised\nlearning-based methods, are involved in comparison of image-level anomaly\nclassification and pixel-level anomaly segmentation. Furthermore, we for the\nfirst time formally explore the effect of key components in existing methods,\nclearly revealing unresolved challenges and potential future directions. The\ndatasets and code are available at\n\\url{https://github.com/caiyu6666/MedIAnomaly}.\n","authors":["Yu Cai","Weiwen Zhang","Hao Chen","Kwang-Ting Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.04518v1.pdf","comment":"Under submission"},{"id":"http://arxiv.org/abs/2404.04517v1","updated":"2024-04-06T06:15:07Z","published":"2024-04-06T06:15:07Z","title":"Latent-based Diffusion Model for Long-tailed Recognition","summary":" Long-tailed imbalance distribution is a common issue in practical computer\nvision applications. Previous works proposed methods to address this problem,\nwhich can be categorized into several classes: re-sampling, re-weighting,\ntransfer learning, and feature augmentation. In recent years, diffusion models\nhave shown an impressive generation ability in many sub-problems of deep\ncomputer vision. However, its powerful generation has not been explored in\nlong-tailed problems. We propose a new approach, the Latent-based Diffusion\nModel for Long-tailed Recognition (LDMLR), as a feature augmentation method to\ntackle the issue. First, we encode the imbalanced dataset into features using\nthe baseline model. Then, we train a Denoising Diffusion Implicit Model (DDIM)\nusing these encoded features to generate pseudo-features. Finally, we train the\nclassifier using the encoded and pseudo-features from the previous two steps.\nThe model's accuracy shows an improvement on the CIFAR-LT and ImageNet-LT\ndatasets by using the proposed method.\n","authors":["Pengxiao Han","Changkun Ye","Jieming Zhou","Jing Zhang","Jie Hong","Xuesong Li"],"pdf_url":"https://arxiv.org/pdf/2404.04517v1.pdf","comment":"8 pages, 3 figures, accepted by L3DIVU-CVPR2024"},{"id":"http://arxiv.org/abs/2404.04511v1","updated":"2024-04-06T05:55:14Z","published":"2024-04-06T05:55:14Z","title":"Cluster-based Video Summarization with Temporal Context Awareness","summary":" In this paper, we present TAC-SUM, a novel and efficient training-free\napproach for video summarization that addresses the limitations of existing\ncluster-based models by incorporating temporal context. Our method partitions\nthe input video into temporally consecutive segments with clustering\ninformation, enabling the injection of temporal awareness into the clustering\nprocess, setting it apart from prior cluster-based summarization methods. The\nresulting temporal-aware clusters are then utilized to compute the final\nsummary, using simple rules for keyframe selection and frame importance\nscoring. Experimental results on the SumMe dataset demonstrate the\neffectiveness of our proposed approach, outperforming existing unsupervised\nmethods and achieving comparable performance to state-of-the-art supervised\nsummarization techniques. 
Our source code is available for reference at\n\\url{https://github.com/hcmus-thesis-gulu/TAC-SUM}.\n","authors":["Hai-Dang Huynh-Lam","Ngoc-Phuong Ho-Thi","Minh-Triet Tran","Trung-Nghia Le"],"pdf_url":"https://arxiv.org/pdf/2404.04511v1.pdf","comment":"14 pages, 6 figures, accepted in PSIVT 2023"},{"id":"http://arxiv.org/abs/2212.02190v3","updated":"2024-04-06T04:59:12Z","published":"2022-12-05T11:54:12Z","title":"L2SR: Learning to Sample and Reconstruct for Accelerated MRI via\n Reinforcement Learning","summary":" Magnetic Resonance Imaging (MRI) is a widely used medical imaging technique,\nbut its long acquisition time can be a limiting factor in clinical settings. To\naddress this issue, researchers have been exploring ways to reduce the\nacquisition time while maintaining the reconstruction quality. Previous works\nhave focused on finding either sparse samplers with a fixed reconstructor or\nfinding reconstructors with a fixed sampler. However, these approaches do not\nfully utilize the potential of joint learning of samplers and reconstructors.\nIn this paper, we propose an alternating training framework for jointly\nlearning a good pair of samplers and reconstructors via deep reinforcement\nlearning (RL). In particular, we consider the process of MRI sampling as a\nsampling trajectory controlled by a sampler, and introduce a novel\nsparse-reward Partially Observed Markov Decision Process (POMDP) to formulate\nthe MRI sampling trajectory. Compared to the dense-reward POMDP used in\nexisting works, the proposed sparse-reward POMDP is more computationally\nefficient and has a provable advantage. Moreover, the proposed framework,\ncalled L2SR (Learning to Sample and Reconstruct), overcomes the training\nmismatch problem that arises in previous methods that use dense-reward POMDP.\nBy alternately updating samplers and reconstructors, L2SR learns a pair of\nsamplers and reconstructors that achieve state-of-the-art reconstruction\nperformances on the fastMRI dataset. Codes are available at\n\\url{https://github.com/yangpuPKU/L2SR-Learning-to-Sample-and-Reconstruct}.\n","authors":["Pu Yang","Bin Dong"],"pdf_url":"https://arxiv.org/pdf/2212.02190v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04492v1","updated":"2024-04-06T03:48:29Z","published":"2024-04-06T03:48:29Z","title":"Automated Lane Change Behavior Prediction and Environmental Perception\n Based on SLAM Technology","summary":" In addition to environmental perception sensors such as cameras, radars, etc.\nin the automatic driving system, the external environment of the vehicle is\nperceived, in fact, there is also a perception sensor that has been silently\ndedicated in the system, that is, the positioning module. This paper explores\nthe application of SLAM (Simultaneous Localization and Mapping) technology in\nthe context of automatic lane change behavior prediction and environment\nperception for autonomous vehicles. It discusses the limitations of traditional\npositioning methods, introduces SLAM technology, and compares LIDAR SLAM with\nvisual SLAM. Real-world examples from companies like Tesla, Waymo, and Mobileye\nshowcase the integration of AI-driven technologies, sensor fusion, and SLAM in\nautonomous driving systems. The paper then delves into the specifics of SLAM\nalgorithms, sensor technologies, and the importance of automatic lane changes\nin driving safety and efficiency. It highlights Tesla's recent update to its\nAutopilot system, which incorporates automatic lane change functionality using\nSLAM technology. 
The paper concludes by emphasizing the crucial role of SLAM in\nenabling accurate environment perception, positioning, and decision-making for\nautonomous vehicles, ultimately enhancing safety and driving experience.\n","authors":["Han Lei","Baoming Wang","Zuwei Shui","Peiyuan Yang","Penghao Liang"],"pdf_url":"https://arxiv.org/pdf/2404.04492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04483v1","updated":"2024-04-06T03:25:24Z","published":"2024-04-06T03:25:24Z","title":"FastHDRNet: A new efficient method for SDR-to-HDR Translation","summary":" Modern displays nowadays possess the capability to render video content with\na high dynamic range (HDR) and an extensive color gamut (WCG).However, the\nmajority of available resources are still in standard dynamic range(SDR).\nTherefore, we need to identify an effective methodology for this objective.The\nexisting deep neural network (DNN) based SDR(Standard dynamic range) to HDR\n(High dynamic range) conversion methods outperform conventional methods, but\nthey are either too large to implement or generate some terrible artifacts. We\npropose a neural network for SDRTV to HDRTV conversion, termed \"FastHDRNet\".\nThis network includes two parts, Adaptive Universal Color Transformation and\nLocal Enhancement.The architecture is designed as a lightweight network that\nutilizes global statistics and local information with super high efficiency.\nAfter the experiment, we find that our proposed method achieve state-of-the-art\nperformance in both quantitative comparisons and visual quality with a\nlightweight structure and a enhanced infer speed.\n","authors":["Siyuan Tian","Hao Wang","Yiren Rong","Junhao Wang","Renjie Dai","Zhengxiao He"],"pdf_url":"https://arxiv.org/pdf/2404.04483v1.pdf","comment":"16 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.00241v4","updated":"2024-04-06T03:24:26Z","published":"2023-12-30T14:11:08Z","title":"Image Super-resolution Reconstruction Network based on Enhanced Swin\n Transformer via Alternating Aggregation of Local-Global Features","summary":" The Swin Transformer image super-resolution reconstruction network only\nrelies on the long-range relationship of window attention and shifted window\nattention to explore features. This mechanism has two limitations. On the one\nhand, it only focuses on global features while ignoring local features. On the\nother hand, it is only concerned with spatial feature interactions while\nignoring channel features and channel interactions, thus limiting its\nnon-linear mapping ability. To address the above limitations, this paper\nproposes enhanced Swin Transformer modules via alternating aggregation of\nlocal-global features. In the local feature aggregation stage, we introduce a\nshift convolution to realize the interaction between local spatial information\nand channel information. Then, a block sparse global perception module is\nintroduced in the global feature aggregation stage. In this module, we\nreorganize the spatial information first, then send the recombination\ninformation into a dense layer to implement the global perception. After that,\na multi-scale self-attention module and a low-parameter residual channel\nattention module are introduced to realize information aggregation at different\nscales. Finally, the proposed network is validated on five publicly available\ndatasets. 
The experimental results show that the proposed network outperforms\nthe other state-of-the-art super-resolution networks.\n","authors":["Yuming Huang","Yingpin Chen","Changhui Wu","Hanrong Xie","Binhui Song","Hui Wang"],"pdf_url":"https://arxiv.org/pdf/2401.00241v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09560v2","updated":"2024-04-06T03:17:33Z","published":"2023-10-14T11:03:04Z","title":"You Only Train Once: A Unified Framework for Both Full-Reference and\n No-Reference Image Quality Assessment","summary":" Although recent efforts in image quality assessment (IQA) have achieved\npromising performance, there still exists a considerable gap compared to the\nhuman visual system (HVS). One significant disparity lies in humans' seamless\ntransition between full reference (FR) and no reference (NR) tasks, whereas\nexisting models are constrained to either FR or NR tasks. This disparity\nimplies the necessity of designing two distinct systems, thereby greatly\ndiminishing the model's versatility. Therefore, our focus lies in unifying FR\nand NR IQA under a single framework. Specifically, we first employ an encoder\nto extract multi-level features from input images. Then a Hierarchical\nAttention (HA) module is proposed as a universal adapter for both FR and NR\ninputs to model the spatial distortion at each encoder stage. Furthermore,\nconsidering that different distortions contaminate encoder stages and damage\nimage semantic meaning differently, a Semantic Distortion Aware (SDA) module is\nproposed to examine feature correlations between shallow and deep layers of the\nencoder. By adopting HA and SDA, the proposed network can effectively perform\nboth FR and NR IQA. When our proposed model is independently trained on NR or\nFR IQA tasks, it outperforms existing models and achieves state-of-the-art\nperformance. Moreover, when trained jointly on NR and FR IQA tasks, it further\nenhances the performance of NR IQA while achieving on-par performance in the\nstate-of-the-art FR IQA. You only train once to perform both IQA tasks. Code\nwill be released at: https://github.com/BarCodeReader/YOTO.\n","authors":["Yi Ke Yun","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2310.09560v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04478v1","updated":"2024-04-06T02:54:35Z","published":"2024-04-06T02:54:35Z","title":"Diffusion-RWKV: Scaling RWKV-Like Architectures for Diffusion Models","summary":" Transformers have catalyzed advancements in computer vision and natural\nlanguage processing (NLP) fields. However, substantial computational complexity\nposes limitations for their application in long-context tasks, such as\nhigh-resolution image generation. This paper introduces a series of\narchitectures adapted from the RWKV model used in the NLP, with requisite\nmodifications tailored for diffusion model applied to image generation tasks,\nreferred to as Diffusion-RWKV. Similar to the diffusion with Transformers, our\nmodel is designed to efficiently handle patchnified inputs in a sequence with\nextra conditions, while also scaling up effectively, accommodating both\nlarge-scale parameters and extensive datasets. Its distinctive advantage\nmanifests in its reduced spatial aggregation complexity, rendering it\nexceptionally adept at processing high-resolution images, thereby eliminating\nthe necessity for windowing or group cached operations. 
Experimental results on\nboth condition and unconditional image generation tasks demonstrate that\nDiffison-RWKV achieves performance on par with or surpasses existing CNN or\nTransformer-based diffusion models in FID and IS metrics while significantly\nreducing total computation FLOP usage.\n","authors":["Zhengcong Fei","Mingyuan Fan","Changqian Yu","Debang Li","Junshi Huang"],"pdf_url":"https://arxiv.org/pdf/2404.04478v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05357v2","updated":"2024-04-06T02:42:44Z","published":"2023-12-08T20:34:37Z","title":"Filtering Pixel Latent Variables for Unmixing Noisy and Undersampled\n Volumetric Images","summary":" The development of robust signal unmixing algorithms is essential for\nleveraging multimodal datasets acquired through a wide array of scientific\nimaging technologies, including hyperspectral or time-resolved acquisitions. In\nexperimental physics, enhancing the spatio-temporal resolution or expanding the\nnumber of detection channels often leads to diminished sampling rate and\nsignal-to-noise ratio, significantly affecting the efficacy of signal unmixing\nalgorithms. We propose applying band-pass filters to the latent space of a\nmulti-dimensional convolutional neural network to disentangle overlapping\nsignal components, enabling the isolation and quantification of their\nindividual contributions. Using multi-dimensional convolution kernels to\nprocess all dimensions simultaneously enhances the network's ability to extract\ninformation from adjacent pixels, time- or spectral-bins. This approach enables\nmore effective separation of components in cases where individual pixels do not\nprovide clear, well-resolved information. We showcase the method's practical\nuse in experimental physics through two test cases that highlight the\nversatility of our approach: fluorescence lifetime microscopy and mode\ndecomposition in optical fibers. The latent unmixing method extracts valuable\ninformation from complex signals that cannot be resolved by standard methods.\nApplication of latent unmixing to real FLIM experiments will increase the\nnumber of distinguishable fluorescent markers. It will also open new\npossibilities in optics and photonics for multichannel separations at increased\nsampling rate.\n","authors":["Catherine Bouchard","Andréanne Deschênes","Vincent Boulanger","Jean-Michel Bellavance","Flavie Lavoie-Cardinal","Christian Gagné"],"pdf_url":"https://arxiv.org/pdf/2312.05357v2.pdf","comment":"16 pages, 8 figures (main paper) + 18 pages, 9 figures (supplementary\n material)"},{"id":"http://arxiv.org/abs/2404.04476v1","updated":"2024-04-06T02:33:04Z","published":"2024-04-06T02:33:04Z","title":"DELTA: Decoupling Long-Tailed Online Continual Learning","summary":" A significant challenge in achieving ubiquitous Artificial Intelligence is\nthe limited ability of models to rapidly learn new information in real-world\nscenarios where data follows long-tailed distributions, all while avoiding\nforgetting previously acquired knowledge. In this work, we study the\nunder-explored problem of Long-Tailed Online Continual Learning (LTOCL), which\naims to learn new tasks from sequentially arriving class-imbalanced data\nstreams. Each data is observed only once for training without knowing the task\ndata distribution. We present DELTA, a decoupled learning approach designed to\nenhance learning representations and address the substantial imbalance in\nLTOCL. 
We enhance the learning process by adapting supervised contrastive\nlearning to attract similar samples and repel dissimilar (out-of-class)\nsamples. Further, by balancing gradients during training using an equalization\nloss, DELTA significantly enhances learning outcomes and successfully mitigates\ncatastrophic forgetting. Through extensive evaluation, we demonstrate that\nDELTA improves the capacity for incremental learning, surpassing existing OCL\nmethods. Our results suggest considerable promise for applying OCL in\nreal-world applications.\n","authors":["Siddeshwar Raghavan","Jiangpeng He","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.04476v1.pdf","comment":"CVPR Workshop acceptance archival track"},{"id":"http://arxiv.org/abs/2404.04474v1","updated":"2024-04-06T02:08:40Z","published":"2024-04-06T02:08:40Z","title":"RoNet: Rotation-oriented Continuous Image Translation","summary":" The generation of smooth and continuous images between domains has recently\ndrawn much attention in image-to-image (I2I) translation. A linear relationship\nacts as the basic assumption in most existing approaches, applied to\ndifferent aspects including features, models or labels. However, the linear\nassumption becomes hard to satisfy as the element dimension increases, and it\nsuffers from the limitation of having to obtain both ends of the line. In this paper, we\npropose a novel rotation-oriented solution and model the continuous generation\nwith an in-plane rotation over the style representation of an image, achieving\na network named RoNet. A rotation module is implanted in the generation network\nto automatically learn the proper plane while disentangling the content and the\nstyle of an image. To encourage realistic texture, we also design a patch-based\nsemantic style loss that learns the different styles of similar objects in\ndifferent domains. We conduct experiments on forest scenes (where the complex\ntexture makes the generation very challenging), faces, streetscapes and the\niphone2dslr task. The results validate the superiority of our method in terms\nof visual quality and continuity.\n","authors":["Yi Li","Xin Xie","Lina Lei","Haiyan Fu","Yanqing Guo"],"pdf_url":"https://arxiv.org/pdf/2404.04474v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2404.04469v1","updated":"2024-04-06T01:54:17Z","published":"2024-04-06T01:54:17Z","title":"Mixed-Query Transformer: A Unified Image Segmentation Architecture","summary":" Existing unified image segmentation models either employ a unified\narchitecture across multiple tasks but use separate weights tailored to each\ndataset, or apply a single set of weights to multiple datasets but are limited\nto a single task. In this paper, we introduce the Mixed-Query Transformer\n(MQ-Former), a unified architecture for multi-task and multi-dataset image\nsegmentation using a single set of weights. To enable this, we propose a mixed\nquery strategy, which can effectively and dynamically accommodate different\ntypes of objects without heuristic designs. In addition, the unified\narchitecture allows us to use data augmentation with synthetic masks and\ncaptions to further improve model generalization. 
Experiments demonstrate that\nMQ-Former can not only effectively handle multiple segmentation datasets and\ntasks compared to specialized state-of-the-art models with competitive\nperformance, but also generalize better to open-set segmentation tasks,\nevidenced by over 7 points higher performance than the prior art on the\nopen-vocabulary SeginW benchmark.\n","authors":["Pei Wang","Zhaowei Cai","Hao Yang","Ashwin Swaminathan","R. Manmatha","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2404.04469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10671v3","updated":"2024-04-06T01:45:45Z","published":"2023-12-17T10:07:03Z","title":"Open3DIS: Open-Vocabulary 3D Instance Segmentation with 2D Mask Guidance","summary":" We introduce Open3DIS, a novel solution designed to tackle the problem of\nOpen-Vocabulary Instance Segmentation within 3D scenes. Objects within 3D\nenvironments exhibit diverse shapes, scales, and colors, making precise\ninstance-level identification a challenging task. Recent advancements in\nOpen-Vocabulary scene understanding have made significant strides in this area\nby employing class-agnostic 3D instance proposal networks for object\nlocalization and learning queryable features for each 3D mask. While these\nmethods produce high-quality instance proposals, they struggle with identifying\nsmall-scale and geometrically ambiguous objects. The key idea of our method is\na new module that aggregates 2D instance masks across frames and maps them to\ngeometrically coherent point cloud regions as high-quality object proposals\naddressing the above limitations. These are then combined with 3D\nclass-agnostic instance proposals to include a wide range of objects in the\nreal world. To validate our approach, we conducted experiments on three\nprominent datasets, including ScanNet200, S3DIS, and Replica, demonstrating\nsignificant performance gains in segmenting objects with diverse categories\nover the state-of-the-art approaches.\n","authors":["Phuc D. A. Nguyen","Tuan Duc Ngo","Evangelos Kalogerakis","Chuang Gan","Anh Tran","Cuong Pham","Khoi Nguyen"],"pdf_url":"https://arxiv.org/pdf/2312.10671v3.pdf","comment":"CVPR 2024. Project page: https://open3dis.github.io/"},{"id":"http://arxiv.org/abs/2404.04465v1","updated":"2024-04-06T01:23:23Z","published":"2024-04-06T01:23:23Z","title":"Aligning Diffusion Models by Optimizing Human Utility","summary":" We present Diffusion-KTO, a novel approach for aligning text-to-image\ndiffusion models by formulating the alignment objective as the maximization of\nexpected human utility. Since this objective applies to each generation\nindependently, Diffusion-KTO does not require collecting costly pairwise\npreference data nor training a complex reward model. Instead, our objective\nrequires simple per-image binary feedback signals, e.g. likes or dislikes,\nwhich are abundantly available. After fine-tuning using Diffusion-KTO,\ntext-to-image diffusion models exhibit superior performance compared to\nexisting techniques, including supervised fine-tuning and Diffusion-DPO, both\nin terms of human judgment and automatic evaluation metrics such as PickScore\nand ImageReward. 
Overall, Diffusion-KTO unlocks the potential of leveraging\nreadily available per-image binary signals and broadens the applicability of\naligning text-to-image diffusion models with human preferences.\n","authors":["Shufan Li","Konstantinos Kallidromitis","Akash Gokul","Yusuke Kato","Kazuki Kozuka"],"pdf_url":"https://arxiv.org/pdf/2404.04465v1.pdf","comment":"27 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.04461v1","updated":"2024-04-06T01:07:38Z","published":"2024-04-06T01:07:38Z","title":"Automated Polyp Segmentation in Colonoscopy Images","summary":" Finding polyps in the human body during medical diagnosis is important for\npreventing cancer. This research discusses using a dilated\nconvolution module along with a criss-cross attention-based network to segment\npolyps from the endoscopic images of the colon. The criss-cross attention\nmodule plays a vital role in gathering the context information of all pixels\nin an image more efficiently. To extract maximum information from the\ndataset, data augmentation techniques are employed. Rotations,\nflips, scaling, and contrast adjustments, along with varying learning rates, were used\nto improve the model. Global average pooling was applied over ResNet50 to\nretain the important details of the encoder. In our experiment, the\nproposed architecture's performance was compared with existing models like\nU-Net, DeepLabV3, and PraNet. This architecture outperformed other models on the\nsubset of the dataset containing irregular polyp shapes. The combination of the dilated\nconvolution module, RCCA, and global average pooling was found to be effective\nfor irregular shapes. Our architecture demonstrates an average improvement\nof 3.75% across all metrics when compared to existing models.\n","authors":["Swagat Ranjit","Jian Zhang","Bijaya B. Karki"],"pdf_url":"https://arxiv.org/pdf/2404.04461v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2404.04458v1","updated":"2024-04-06T00:33:39Z","published":"2024-04-06T00:33:39Z","title":"JRDB-Social: A Multifaceted Robotic Dataset for Understanding of Context\n and Dynamics of Human Interactions Within Social Groups","summary":" Understanding human social behaviour is crucial in computer vision and\nrobotics. Micro-level observations like individual actions fall short,\nnecessitating a comprehensive approach that considers individual behaviour,\nintra-group dynamics, and social group levels for a thorough understanding. To\naddress dataset limitations, this paper introduces JRDB-Social, an extension of\nJRDB. Designed to fill gaps in human understanding across diverse indoor and\noutdoor social contexts, JRDB-Social provides annotations at three levels:\nindividual attributes, intra-group interactions, and social group context. This\ndataset aims to enhance our grasp of human social dynamics for robotic\napplications. Utilizing recent cutting-edge multi-modal large language\nmodels, we evaluated our benchmark to explore their capacity to decipher social\nhuman behaviour.\n","authors":["Simindokht Jahangard","Zhixi Cai","Shiki Wen","Hamid Rezatofighi"],"pdf_url":"https://arxiv.org/pdf/2404.04458v1.pdf","comment":"Accepted by CVPR 2024. 
Project page:\n https://jrdb.erc.monash.edu/dataset/social"},{"id":"http://arxiv.org/abs/2404.04456v1","updated":"2024-04-06T00:04:19Z","published":"2024-04-06T00:04:19Z","title":"Beyond the Known: Adversarial Autoencoders in Novelty Detection","summary":" In novelty detection, the goal is to decide if a new data point should be\ncategorized as an inlier or an outlier, given a training dataset that primarily\ncaptures the inlier distribution. Recent approaches typically use deep encoder\nand decoder network frameworks to derive a reconstruction error, and employ\nthis error either to determine a novelty score, or as the basis for a one-class\nclassifier. In this research, we use a similar framework but with a lightweight\ndeep network, and we adopt a probabilistic score with reconstruction error. Our\nmethodology calculates the probability of whether the sample comes from the\ninlier distribution or not. This work makes two key contributions. The first is\nthat we compute the novelty probability by linearizing the manifold that holds\nthe structure of the inlier distribution. This allows us to interpret how the\nprobability is distributed and can be determined in relation to the local\ncoordinates of the manifold tangent space. The second contribution is that we\nimprove the training protocol for the network. Our results indicate that our\napproach is effective at learning the target class, and it outperforms recent\nstate-of-the-art methods on several benchmark datasets.\n","authors":["Muhammad Asad","Ihsan Ullah","Ganesh Sistu","Michael G. Madden"],"pdf_url":"https://arxiv.org/pdf/2404.04456v1.pdf","comment":"Accepted at VISAPP 2024"},{"id":"http://arxiv.org/abs/2404.05764v1","updated":"2024-04-06T16:10:48Z","published":"2024-04-06T16:10:48Z","title":"Study of the effect of Sharpness on Blind Video Quality Assessment","summary":" Introduction: Video Quality Assessment (VQA) is one of the important areas of\nstudy in this modern era, where video is a crucial component of communication\nwith applications in every field. Rapid developments in mobile\ntechnology have enabled anyone to create videos, resulting in a varied range of video\nquality scenarios. Objectives: Though VQA has existed for some time with\nclassical metrics like SSIM and PSNR, the advent of machine learning has\nbrought in new VQA techniques built upon Convolutional Neural\nNetworks (CNNs) or Deep Neural Networks (DNNs). Methods: Over the past years,\nresearch studies such as BVQA, which performed video quality\nassessment of nature-based videos using DNNs, have exposed the powerful capabilities\nof machine learning algorithms. BVQA using DNNs explored human visual system\neffects such as content dependency and time-related factors normally known as\ntemporal effects. Results: This study explores the sharpness effect on models\nlike BVQA. Sharpness is a measure of the clarity and detail of the video\nimage; it is typically assessed by analyzing the edges and contrast of the\nimage to determine the overall level of detail. Conclusion: This\nstudy uses existing video quality databases such as CVD2014. 
A comparative\nstudy of evaluation metrics such as SRCC and PLCC during\ntraining and testing is presented, along with the conclusion.\n","authors":["Anantha Prabhu","David Pratap","Narayana Darapeni","Anwesh P R"],"pdf_url":"https://arxiv.org/pdf/2404.05764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04654v1","updated":"2024-04-06T15:14:25Z","published":"2024-04-06T15:14:25Z","title":"Music Recommendation Based on Facial Emotion Recognition","summary":" Introduction: Music provides an incredible avenue for individuals to express\ntheir thoughts and emotions, while also serving as a delightful mode of\nentertainment for enthusiasts and music lovers. Objectives: This paper presents\na comprehensive approach to enhancing the user experience through the\nintegration of emotion recognition, music recommendation, and explainable AI\nusing GRAD-CAM. Methods: The proposed methodology utilizes a ResNet50 model\ntrained on the Facial Expression Recognition (FER) dataset, consisting of real\nimages of individuals expressing various emotions. Results: The system achieves\nan accuracy of 82% in emotion classification. By leveraging GRAD-CAM, the model\nprovides explanations for its predictions, allowing users to understand the\nreasoning behind the system's recommendations. The model is trained on both FER\nand real user datasets, which include labelled facial expressions, and real\nimages of individuals expressing various emotions. The training process\ninvolves pre-processing the input images, extracting features through\nconvolutional layers, reasoning with dense layers, and generating emotion\npredictions through the output layer. Conclusion: The proposed methodology,\nleveraging the ResNet50 model with ROI-based analysis and explainable AI\ntechniques, offers a robust and interpretable solution for facial emotion\ndetection.\n","authors":["Rajesh B","Keerthana V","Narayana Darapaneni","Anwesh Reddy P"],"pdf_url":"https://arxiv.org/pdf/2404.04654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05763v1","updated":"2024-04-06T15:09:49Z","published":"2024-04-06T15:09:49Z","title":"Deep Learning-Based Brain Image Segmentation for Automated Tumour\n Detection","summary":" Introduction: The present study focuses on the development and evaluation of an\nautomated brain tumor segmentation technique based on deep learning using the\n3D U-Net model. Objectives: The objective is to leverage state-of-the-art\nconvolutional neural networks (CNNs) on a large dataset of brain MRI scans for\nsegmentation. Methods: The proposed methodology applies pre-processing\ntechniques for enhanced performance and generalizability. Results: Extensive\nvalidation on an independent dataset confirms the model's robustness and\npotential for integration into clinical workflows. The study emphasizes the\nimportance of data pre-processing and explores various hyperparameters to\noptimize the model's performance. The 3D U-Net model achieved IoUs of 0.8181 and\n0.66 on the training and validation datasets, respectively. 
Conclusion:\nUltimately, this comprehensive framework showcases the efficacy of deep\nlearning in automating brain tumour detection, offering valuable support in\nclinical practice.\n","authors":["Suman Sourabh","Murugappan Valliappan","Narayana Darapaneni","Anwesh R P"],"pdf_url":"https://arxiv.org/pdf/2404.05763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04586v1","updated":"2024-04-06T10:50:02Z","published":"2024-04-06T10:50:02Z","title":"PIE: Physics-inspired Low-light Enhancement","summary":" In this paper, we propose a physics-inspired contrastive learning paradigm\nfor low-light enhancement, called PIE. PIE primarily addresses three issues:\n(i) To resolve the problem of existing learning-based methods often training a\nLLE model with strict pixel-correspondence image pairs, we eliminate the need\nfor pixel-correspondence paired training data and instead train with unpaired\nimages. (ii) To address the disregard for negative samples and the inadequacy\nof their generation in existing methods, we incorporate physics-inspired\ncontrastive learning for LLE and design the Bag of Curves (BoC) method to\ngenerate more reasonable negative samples that closely adhere to the underlying\nphysical imaging principle. (iii) To overcome the reliance on semantic ground\ntruths in existing methods, we propose an unsupervised regional segmentation\nmodule, ensuring regional brightness consistency while eliminating the\ndependency on semantic ground truths. Overall, the proposed PIE can effectively\nlearn from unpaired positive/negative samples and smoothly realize non-semantic\nregional enhancement, which is clearly different from existing LLE efforts.\nBesides the novel architecture of PIE, we explore the gain of PIE on downstream\ntasks such as semantic segmentation and face detection. Training on readily\navailable open data and extensive experiments demonstrate that our method\nsurpasses the state-of-the-art LLE models over six independent cross-scenes\ndatasets. PIE runs fast with reasonable GFLOPs in test time, making it easy to\nuse on mobile devices.\n","authors":["Dong Liang","Zhengyan Xu","Ling Li","Mingqiang Wei","Songcan Chen"],"pdf_url":"https://arxiv.org/pdf/2404.04586v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2112.06451"}]},"2024-04-09T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.06512v1","updated":"2024-04-09T17:59:32Z","published":"2024-04-09T17:59:32Z","title":"InternLM-XComposer2-4KHD: A Pioneering Large Vision-Language Model\n Handling Resolutions from 336 Pixels to 4K HD","summary":" The Large Vision-Language Model (LVLM) field has seen significant\nadvancements, yet its progression has been hindered by challenges in\ncomprehending fine-grained visual content due to limited resolution. Recent\nefforts have aimed to enhance the high-resolution understanding capabilities of\nLVLMs, yet they remain capped at approximately 1500 x 1500 pixels and\nconstrained to a relatively narrow resolution range. This paper represents\nInternLM-XComposer2-4KHD, a groundbreaking exploration into elevating LVLM\nresolution capabilities up to 4K HD (3840 x 1600) and beyond. Concurrently,\nconsidering the ultra-high resolution may not be necessary in all scenarios, it\nsupports a wide range of diverse resolutions from 336 pixels to 4K standard,\nsignificantly broadening its scope of applicability. 
Specifically, this\nresearch advances the patch division paradigm by introducing a novel extension:\ndynamic resolution with automatic patch configuration. It maintains the\ntraining image aspect ratios while automatically varying patch counts and\nconfiguring layouts based on a pre-trained Vision Transformer (ViT) (336 x\n336), leading to dynamic training resolution from 336 pixels to 4K standard.\nOur research demonstrates that scaling training resolution up to 4K HD leads to\nconsistent performance enhancements without hitting the ceiling of potential\nimprovements. InternLM-XComposer2-4KHD shows superb capability that matches or\neven surpasses GPT-4V and Gemini Pro in 10 of the 16 benchmarks. The\nInternLM-XComposer2-4KHD model series with 7B parameters are publicly available\nat https://github.com/InternLM/InternLM-XComposer.\n","authors":["Xiaoyi Dong","Pan Zhang","Yuhang Zang","Yuhang Cao","Bin Wang","Linke Ouyang","Songyang Zhang","Haodong Duan","Wenwei Zhang","Yining Li","Hang Yan","Yang Gao","Zhe Chen","Xinyue Zhang","Wei Li","Jingwen Li","Wenhai Wang","Kai Chen","Conghui He","Xingcheng Zhang","Jifeng Dai","Yu Qiao","Dahua Lin","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06512v1.pdf","comment":"Code and models are publicly available at\n https://github.com/InternLM/InternLM-XComposer"},{"id":"http://arxiv.org/abs/2404.06511v1","updated":"2024-04-09T17:59:31Z","published":"2024-04-09T17:59:31Z","title":"MoReVQA: Exploring Modular Reasoning Models for Video Question Answering","summary":" This paper addresses the task of video question answering (videoQA) via a\ndecomposed multi-stage, modular reasoning framework. Previous modular methods\nhave shown promise with a single planning stage ungrounded in visual content.\nHowever, through a simple and effective baseline, we find that such systems can\nlead to brittle behavior in practice for challenging videoQA settings. Thus,\nunlike traditional single-stage planning methods, we propose a multi-stage\nsystem consisting of an event parser, a grounding stage, and a final reasoning\nstage in conjunction with an external memory. All stages are training-free, and\nperformed using few-shot prompting of large models, creating interpretable\nintermediate outputs at each stage. By decomposing the underlying planning and\ntask complexity, our method, MoReVQA, improves over prior work on standard\nvideoQA benchmarks (NExT-QA, iVQA, EgoSchema, ActivityNet-QA) with\nstate-of-the-art results, and extensions to related tasks (grounded videoQA,\nparagraph captioning).\n","authors":["Juhong Min","Shyamal Buch","Arsha Nagrani","Minsu Cho","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2404.06511v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.06510v1","updated":"2024-04-09T17:59:04Z","published":"2024-04-09T17:59:04Z","title":"Can Feedback Enhance Semantic Grounding in Large Vision-Language Models?","summary":" Enhancing semantic grounding abilities in Vision-Language Models (VLMs) often\ninvolves collecting domain-specific training data, refining the network\narchitectures, or modifying the training recipes. In this work, we venture into\nan orthogonal direction and explore whether VLMs can improve their semantic\ngrounding by \"receiving\" feedback, without requiring in-domain data,\nfine-tuning, or modifications to the network architectures. 
We systematically\nanalyze this hypothesis using a feedback mechanism composed of a binary signal.\nWe find that if prompted appropriately, VLMs can utilize feedback both in a\nsingle step and iteratively, showcasing the potential of feedback as an\nalternative technique to improve grounding in internet-scale VLMs. Furthermore,\nVLMs, like LLMs, struggle to self-correct errors out-of-the-box. However, we\nfind that this issue can be mitigated via a binary verification mechanism.\nFinally, we explore the potential and limitations of amalgamating these\nfindings and applying them iteratively to automatically enhance VLMs' grounding\nperformance, showing grounding accuracy consistently improves using automated\nfeedback across all models in all settings investigated. Overall, our iterative\nframework improves semantic grounding in VLMs by more than 15 accuracy points\nunder noise-free feedback and up to 5 accuracy points under a simple automated\nbinary verification mechanism. The project website is hosted at\nhttps://andrewliao11.github.io/vlms_feedback\n","authors":["Yuan-Hong Liao","Rafid Mahmood","Sanja Fidler","David Acuna"],"pdf_url":"https://arxiv.org/pdf/2404.06510v1.pdf","comment":"31 pages, 15 figures"},{"id":"http://arxiv.org/abs/2404.06507v1","updated":"2024-04-09T17:55:41Z","published":"2024-04-09T17:55:41Z","title":"Reconstructing Hand-Held Objects in 3D","summary":" Objects manipulated by the hand (i.e., manipulanda) are particularly\nchallenging to reconstruct from in-the-wild RGB images or videos. Not only does\nthe hand occlude much of the object, but also the object is often only visible\nin a small number of image pixels. At the same time, two strong anchors emerge\nin this setting: (1) estimated 3D hands help disambiguate the location and\nscale of the object, and (2) the set of manipulanda is small relative to all\npossible objects. With these insights in mind, we present a scalable paradigm\nfor handheld object reconstruction that builds on recent breakthroughs in large\nlanguage/vision models and 3D object datasets. Our model, MCC-Hand-Object\n(MCC-HO), jointly reconstructs hand and object geometry given a single RGB\nimage and inferred 3D hand as inputs. Subsequently, we use GPT-4(V) to retrieve\na 3D object model that matches the object in the image and rigidly align the\nmodel to the network-inferred geometry; we call this alignment\nRetrieval-Augmented Reconstruction (RAR). Experiments demonstrate that MCC-HO\nachieves state-of-the-art performance on lab and Internet datasets, and we show\nhow RAR can be used to automatically obtain 3D labels for in-the-wild images of\nhand-object interactions.\n","authors":["Jane Wu","Georgios Pavlakos","Georgia Gkioxari","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2404.06507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17048v3","updated":"2024-04-09T17:54:12Z","published":"2023-11-28T18:55:37Z","title":"Zero-shot Referring Expression Comprehension via Structural Similarity\n Between Images and Captions","summary":" Zero-shot referring expression comprehension aims at localizing bounding\nboxes in an image corresponding to provided textual prompts, which requires:\n(i) a fine-grained disentanglement of complex visual scene and textual context,\nand (ii) a capacity to understand relationships among disentangled entities.\nUnfortunately, existing large vision-language alignment (VLA) models, e.g.,\nCLIP, struggle with both aspects so cannot be directly used for this task. 
To\nmitigate this gap, we leverage large foundation models to disentangle both\nimages and texts into triplets in the format of (subject, predicate, object).\nAfter that, grounding is accomplished by calculating the structural similarity\nmatrix between visual and textual triplets with a VLA model, and subsequently\npropagate it to an instance-level similarity matrix. Furthermore, to equip VLA\nmodels with the ability of relationship understanding, we design a\ntriplet-matching objective to fine-tune the VLA models on a collection of\ncurated dataset containing abundant entity relationships. Experiments\ndemonstrate that our visual grounding performance increase of up to 19.5% over\nthe SOTA zero-shot model on RefCOCO/+/g. On the more challenging Who's Waldo\ndataset, our zero-shot approach achieves comparable accuracy to the fully\nsupervised model. Code is available at\nhttps://github.com/Show-han/Zeroshot_REC.\n","authors":["Zeyu Han","Fangrui Zhu","Qianru Lao","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.17048v3.pdf","comment":"CVPR 2024, Code available at https://github.com/Show-han/Zeroshot_REC"},{"id":"http://arxiv.org/abs/2212.08731v3","updated":"2024-04-09T17:52:49Z","published":"2022-12-16T22:03:37Z","title":"Multi-person 3D pose estimation from unlabelled data","summary":" Its numerous applications make multi-human 3D pose estimation a remarkably\nimpactful area of research. Nevertheless, assuming a multiple-view system\ncomposed of several regular RGB cameras, 3D multi-pose estimation presents\nseveral challenges. First of all, each person must be uniquely identified in\nthe different views to separate the 2D information provided by the cameras.\nSecondly, the 3D pose estimation process from the multi-view 2D information of\neach person must be robust against noise and potential occlusions in the\nscenario. In this work, we address these two challenges with the help of deep\nlearning. Specifically, we present a model based on Graph Neural Networks\ncapable of predicting the cross-view correspondence of the people in the\nscenario along with a Multilayer Perceptron that takes the 2D points to yield\nthe 3D poses of each person. These two models are trained in a self-supervised\nmanner, thus avoiding the need for large datasets with 3D annotations.\n","authors":["Daniel Rodriguez-Criado","Pilar Bachiller","George Vogiatzis","Luis J. Manso"],"pdf_url":"https://arxiv.org/pdf/2212.08731v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06493v1","updated":"2024-04-09T17:48:52Z","published":"2024-04-09T17:48:52Z","title":"Flying With Photons: Rendering Novel Views of Propagating Light","summary":" We present an imaging and neural rendering technique that seeks to synthesize\nvideos of light propagating through a scene from novel, moving camera\nviewpoints. Our approach relies on a new ultrafast imaging setup to capture a\nfirst-of-its kind, multi-viewpoint video dataset with picosecond-level temporal\nresolution. Combined with this dataset, we introduce an efficient neural volume\nrendering framework based on the transient field. This field is defined as a\nmapping from a 3D point and 2D direction to a high-dimensional, discrete-time\nsignal that represents time-varying radiance at ultrafast timescales. Rendering\nwith transient fields naturally accounts for effects due to the finite speed of\nlight, including viewpoint-dependent appearance changes caused by light\npropagation delays to the camera. 
We render a range of complex effects,\nincluding scattering, specular reflection, refraction, and diffraction.\nAdditionally, we demonstrate removing viewpoint-dependent propagation delays\nusing a time warping procedure, rendering of relativistic effects, and video\nsynthesis of direct and global components of light transport.\n","authors":["Anagh Malik","Noah Juravsky","Ryan Po","Gordon Wetzstein","Kiriakos N. Kutulakos","David B. Lindell"],"pdf_url":"https://arxiv.org/pdf/2404.06493v1.pdf","comment":"Project page: https://anaghmalik.com/FlyingWithPhotons/"},{"id":"http://arxiv.org/abs/2303.12054v4","updated":"2024-04-09T17:44:24Z","published":"2023-03-21T17:45:38Z","title":"Influencer Backdoor Attack on Semantic Segmentation","summary":" When a small number of poisoned samples are injected into the training\ndataset of a deep neural network, the network can be induced to exhibit\nmalicious behavior during inferences, which poses potential threats to\nreal-world applications. While they have been intensively studied in\nclassification, backdoor attacks on semantic segmentation have been largely\noverlooked. Unlike classification, semantic segmentation aims to classify every\npixel within a given image. In this work, we explore backdoor attacks on\nsegmentation models to misclassify all pixels of a victim class by injecting a\nspecific trigger on non-victim pixels during inferences, which is dubbed\nInfluencer Backdoor Attack (IBA). IBA is expected to maintain the\nclassification accuracy of non-victim pixels and mislead classifications of all\nvictim pixels in every single inference and could be easily applied to\nreal-world scenes. Based on the context aggregation ability of segmentation\nmodels, we proposed a simple, yet effective, Nearest-Neighbor trigger injection\nstrategy. We also introduce an innovative Pixel Random Labeling strategy which\nmaintains optimal performance even when the trigger is placed far from the\nvictim pixels. Our extensive experiments reveal that current segmentation\nmodels do suffer from backdoor attacks, demonstrate IBA real-world\napplicability, and show that our proposed techniques can further increase\nattack performance.\n","authors":["Haoheng Lan","Jindong Gu","Philip Torr","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2303.12054v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06483v1","updated":"2024-04-09T17:34:19Z","published":"2024-04-09T17:34:19Z","title":"RhythmMamba: Fast Remote Physiological Measurement with Arbitrary Length\n Videos","summary":" Remote photoplethysmography (rPPG) is a non-contact method for detecting\nphysiological signals from facial videos, holding great potential in various\napplications such as healthcare, affective computing, and anti-spoofing.\nExisting deep learning methods struggle to address two core issues of rPPG\nsimultaneously: extracting weak rPPG signals from video segments with large\nspatiotemporal redundancy and understanding the periodic patterns of rPPG among\nlong contexts. This represents a trade-off between computational complexity and\nthe ability to capture long-range dependencies, posing a challenge for rPPG\nthat is suitable for deployment on mobile devices. 
Based on the in-depth\nexploration of Mamba's comprehension of spatial and temporal information, this\npaper introduces RhythmMamba, an end-to-end Mamba-based method that employs\nmulti-temporal Mamba to constrain both periodic patterns and short-term trends,\ncoupled with frequency domain feed-forward to enable Mamba to robustly\nunderstand the quasi-periodic patterns of rPPG. Extensive experiments show that\nRhythmMamba achieves state-of-the-art performance with reduced parameters and\nlower computational complexity. The proposed RhythmMamba can be applied to\nvideo segments of any length without performance degradation. The codes are\navailable at https://github.com/zizheng-guo/RhythmMamba.\n","authors":["Bochao Zou","Zizheng Guo","Xiaocheng Hu","Huimin Ma"],"pdf_url":"https://arxiv.org/pdf/2404.06483v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2402.12788"},{"id":"http://arxiv.org/abs/2404.06479v1","updated":"2024-04-09T17:30:18Z","published":"2024-04-09T17:30:18Z","title":"Text-Based Reasoning About Vector Graphics","summary":" While large multimodal models excel in broad vision-language benchmarks, they\noften struggle with tasks requiring precise perception of low-level visual\ndetails, such as comparing line lengths or solving simple mazes. In particular,\nthis failure mode persists in question-answering tasks about vector graphics --\nimages composed purely of 2D objects and shapes. To address this challenge, we\npropose the Visually Descriptive Language Model (VDLM), which performs\ntext-based reasoning about vector graphics. VDLM leverages Scalable Vector\nGraphics (SVG) for a more precise visual description and first uses an\noff-the-shelf raster-to-SVG algorithm for encoding. Since existing language\nmodels cannot understand raw SVGs in a zero-shot setting, VDLM then bridges SVG\nwith pretrained language models through a newly introduced intermediate\nsymbolic representation, Primal Visual Description (PVD), comprising primitive\nattributes (e.g., shape, position, measurement) with their corresponding\npredicted values. PVD is task-agnostic and represents visual primitives that\nare universal across all vector graphics. It can be learned with procedurally\ngenerated (SVG, PVD) pairs and also enables the direct use of LLMs for\ngeneralization to complex reasoning tasks. By casting an image to a text-based\nrepresentation, we can leverage the power of language models to learn alignment\nfrom SVG to visual primitives and generalize to unseen question-answering\ntasks. Empirical results show that VDLM achieves stronger zero-shot performance\ncompared to state-of-the-art LMMs, such as GPT-4V, in various low-level\nmultimodal perception and reasoning tasks on vector graphics. We additionally\npresent extensive analyses on VDLM's performance, demonstrating that our\nframework offers better interpretability due to its disentangled perception and\nreasoning processes. 
Project page: https://mikewangwzhl.github.io/VDLM/\n","authors":["Zhenhailong Wang","Joy Hsu","Xingyao Wang","Kuan-Hao Huang","Manling Li","Jiajun Wu","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2404.06479v1.pdf","comment":"Project page: https://mikewangwzhl.github.io/VDLM/"},{"id":"http://arxiv.org/abs/2404.06470v1","updated":"2024-04-09T17:17:48Z","published":"2024-04-09T17:17:48Z","title":"Learning State-Invariant Representations of Objects from Image\n Collections with State, Pose, and Viewpoint Changes","summary":" We add one more invariance - state invariance - to the more commonly used\nother invariances for learning object representations for recognition and\nretrieval. By state invariance, we mean robust with respect to changes in the\nstructural form of the object, such as when an umbrella is folded, or when an\nitem of clothing is tossed on the floor. Since humans generally have no\ndifficulty in recognizing objects despite such state changes, we are naturally\nfaced with the question of whether it is possible to devise a neural\narchitecture with similar abilities. To that end, we present a novel dataset,\nObjectsWithStateChange, that captures state and pose variations in the object\nimages recorded from arbitrary viewpoints. We believe that this dataset will\nfacilitate research in fine-grained object recognition and retrieval of objects\nthat are capable of state changes. The goal of such research would be to train\nmodels capable of generating object embeddings that remain invariant to state\nchanges while also staying invariant to transformations induced by changes in\nviewpoint, pose, illumination, etc. To demonstrate the usefulness of the\nObjectsWithStateChange dataset, we also propose a curriculum learning strategy\nthat uses the similarity relationships in the learned embedding space after\neach epoch to guide the training process. The model learns discriminative\nfeatures by comparing visually similar objects within and across different\ncategories, encouraging it to differentiate between objects that may be\nchallenging to distinguish due to changes in their state. We believe that this\nstrategy enhances the model's ability to capture discriminative features for\nfine-grained tasks that may involve objects with state changes, leading to\nperformance improvements on object-level tasks not only on our new dataset, but\nalso on two other challenging multi-view datasets such as ModelNet40 and\nObjectPI.\n","authors":["Rohan Sarkar","Avinash Kak"],"pdf_url":"https://arxiv.org/pdf/2404.06470v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2304.06140v3","updated":"2024-04-09T17:09:03Z","published":"2023-04-12T19:47:13Z","title":"An Edit Friendly DDPM Noise Space: Inversion and Manipulations","summary":" Denoising diffusion probabilistic models (DDPMs) employ a sequence of white\nGaussian noise samples to generate an image. In analogy with GANs, those noise\nmaps could be considered as the latent code associated with the generated\nimage. However, this native noise space does not possess a convenient\nstructure, and is thus challenging to work with in editing tasks. 
Here, we\npropose an alternative latent noise space for DDPM that enables a wide range of\nediting operations via simple means, and present an inversion method for\nextracting these edit-friendly noise maps for any given image (real or\nsynthetically generated). As opposed to the native DDPM noise space, the\nedit-friendly noise maps do not have a standard normal distribution and are not\nstatistically independent across timesteps. However, they allow perfect\nreconstruction of any desired image, and simple transformations on them\ntranslate into meaningful manipulations of the output image (e.g. shifting,\ncolor edits). Moreover, in text-conditional models, fixing those noise maps\nwhile changing the text prompt, modifies semantics while retaining structure.\nWe illustrate how this property enables text-based editing of real images via\nthe diverse DDPM sampling scheme (in contrast to the popular non-diverse DDIM\ninversion). We also show how it can be used within existing diffusion-based\nediting methods to improve their quality and diversity. Webpage:\nhttps://inbarhub.github.io/DDPM_inversion\n","authors":["Inbar Huberman-Spiegelglas","Vladimir Kulikov","Tomer Michaeli"],"pdf_url":"https://arxiv.org/pdf/2304.06140v3.pdf","comment":"CVPR 2024. Code and examples are available at\n https://github.com/inbarhub/DDPM_inversion"},{"id":"http://arxiv.org/abs/2404.06455v1","updated":"2024-04-09T16:55:23Z","published":"2024-04-09T16:55:23Z","title":"A comparative analysis of deep learning models for lung segmentation on\n X-ray images","summary":" Robust and highly accurate lung segmentation in X-rays is crucial in medical\nimaging. This study evaluates deep learning solutions for this task, ranking\nexisting methods and analyzing their performance under diverse image\nmodifications. Out of 61 analyzed papers, only nine offered implementation or\npre-trained models, enabling assessment of three prominent methods: Lung VAE,\nTransResUNet, and CE-Net. The analysis revealed that CE-Net performs best,\ndemonstrating the highest values in dice similarity coefficient and\nintersection over union metric.\n","authors":["Weronika Hryniewska-Guzik","Jakub Bilski","Bartosz Chrostowski","Jakub Drak Sbahi","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2404.06455v1.pdf","comment":"published at the Polish Conference on Artificial Intelligence\n (PP-RAI), 2024"},{"id":"http://arxiv.org/abs/2404.06453v1","updated":"2024-04-09T16:54:19Z","published":"2024-04-09T16:54:19Z","title":"PURE: Turning Polysemantic Neurons Into Pure Features by Identifying\n Relevant Circuits","summary":" The field of mechanistic interpretability aims to study the role of\nindividual neurons in Deep Neural Networks. Single neurons, however, have the\ncapability to act polysemantically and encode for multiple (unrelated)\nfeatures, which renders their interpretation difficult. We present a method for\ndisentangling polysemanticity of any Deep Neural Network by decomposing a\npolysemantic neuron into multiple monosemantic \"virtual\" neurons. This is\nachieved by identifying the relevant sub-graph (\"circuit\") for each \"pure\"\nfeature. We demonstrate how our approach allows us to find and disentangle\nvarious polysemantic units of ResNet models trained on ImageNet. While\nevaluating feature visualizations using CLIP, our method effectively\ndisentangles representations, improving upon methods based on neuron\nactivations. 
Our code is available at https://github.com/maxdreyer/PURE.\n","authors":["Maximilian Dreyer","Erblina Purelku","Johanna Vielhaben","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2404.06453v1.pdf","comment":"14 pages (4 pages manuscript, 2 pages references, 8 pages appendix)"},{"id":"http://arxiv.org/abs/2404.06451v1","updated":"2024-04-09T16:53:43Z","published":"2024-04-09T16:53:43Z","title":"SmartControl: Enhancing ControlNet for Handling Rough Visual Conditions","summary":" Human visual imagination usually begins with analogies or rough sketches. For\nexample, given an image with a girl playing guitar before a building, one may\nanalogously imagine how it seems like if Iron Man playing guitar before Pyramid\nin Egypt. Nonetheless, visual condition may not be precisely aligned with the\nimaginary result indicated by text prompt, and existing layout-controllable\ntext-to-image (T2I) generation models is prone to producing degraded generated\nresults with obvious artifacts. To address this issue, we present a novel T2I\ngeneration method dubbed SmartControl, which is designed to modify the rough\nvisual conditions for adapting to text prompt. The key idea of our SmartControl\nis to relax the visual condition on the areas that are conflicted with text\nprompts. In specific, a Control Scale Predictor (CSP) is designed to identify\nthe conflict regions and predict the local control scales, while a dataset with\ntext prompts and rough visual conditions is constructed for training CSP. It is\nworth noting that, even with a limited number (e.g., 1,000~2,000) of training\nsamples, our SmartControl can generalize well to unseen objects. Extensive\nexperiments on four typical visual condition types clearly show the efficacy of\nour SmartControl against state-of-the-arts. Source code, pre-trained models,\nand datasets are available at https://github.com/liuxiaoyu1104/SmartControl.\n","authors":["Xiaoyu Liu","Yuxiang Wei","Ming Liu","Xianhui Lin","Peiran Ren","Xuansong Xie","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.06451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06447v1","updated":"2024-04-09T16:49:42Z","published":"2024-04-09T16:49:42Z","title":"The Central Spanning Tree Problem","summary":" Spanning trees are an important primitive in many data analysis tasks, when a\ndata set needs to be summarized in terms of its \"skeleton\", or when a\ntree-shaped graph over all observations is required for downstream processing.\nPopular definitions of spanning trees include the minimum spanning tree and the\noptimum distance spanning tree, a.k.a. the minimum routing cost tree. When\nsearching for the shortest spanning tree but admitting additional branching\npoints, even shorter spanning trees can be realized: Steiner trees.\nUnfortunately, both minimum spanning and Steiner trees are not robust with\nrespect to noise in the observations; that is, small perturbations of the\noriginal data set often lead to drastic changes in the associated spanning\ntrees. In response, we make two contributions when the data lies in a Euclidean\nspace: on the theoretical side, we introduce a new optimization problem, the\n\"(branched) central spanning tree\", which subsumes all previously mentioned\ndefinitions as special cases. On the practical side, we show empirically that\nthe (branched) central spanning tree is more robust to noise in the data, and\nas such is better suited to summarize a data set in terms of its skeleton. 
We\nalso propose a heuristic to address the NP-hard optimization problem, and\nillustrate its use on single cell RNA expression data from biology and 3D point\nclouds of plants.\n","authors":["Enrique Fita Sanmartín","Christoph Schnörr","Fred A. Hamprecht"],"pdf_url":"https://arxiv.org/pdf/2404.06447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06443v1","updated":"2024-04-09T16:45:34Z","published":"2024-04-09T16:45:34Z","title":"Multi-scale Dynamic and Hierarchical Relationship Modeling for Facial\n Action Units Recognition","summary":" Human facial action units (AUs) are mutually related in a hierarchical\nmanner, as not only are they associated with each other in both spatial and\ntemporal domains but also AUs located in the same/close facial regions show\nstronger relationships than those of different facial regions. While none of the\nexisting approaches thoroughly models such hierarchical inter-dependencies among\nAUs, this paper proposes to comprehensively model multi-scale AU-related\ndynamic and hierarchical spatio-temporal relationships among AUs for AU\noccurrence recognition. Specifically, we first propose a novel multi-scale\ntemporal differencing network with an adaptive weighting block to explicitly\ncapture facial dynamics across frames at different spatial scales, which\nspecifically considers the heterogeneity of range and magnitude in different\nAUs' activation. Then, a two-stage strategy is introduced to hierarchically\nmodel the relationship among AUs based on their spatial distribution (i.e.,\nlocal and cross-region AU relationship modelling). Experimental results\nachieved on BP4D and DISFA show that our approach is the new state-of-the-art\nin the field of AU occurrence recognition. Our code is publicly available at\nhttps://github.com/CVI-SZU/MDHR.\n","authors":["Zihan Wang","Siyang Song","Cheng Luo","Songhe Deng","Weicheng Xie","Linlin Shen"],"pdf_url":"https://arxiv.org/pdf/2404.06443v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2404.06442v1","updated":"2024-04-09T16:42:54Z","published":"2024-04-09T16:42:54Z","title":"QueSTMaps: Queryable Semantic Topological Maps for 3D Scene\n Understanding","summary":" Understanding the structural organisation of 3D indoor scenes in terms of\nrooms is often accomplished via floorplan extraction. Robotic tasks such as\nplanning and navigation require a semantic understanding of the scene as well.\nThis is typically achieved via object-level semantic segmentation. However,\nsuch methods struggle to segment out topological regions like \"kitchen\" in the\nscene. In this work, we introduce a two-step pipeline. First, we extract a\ntopological map, i.e., floorplan of the indoor scene using a novel\nmulti-channel occupancy representation. Then, we generate CLIP-aligned features\nand semantic labels for every room instance based on the objects it contains\nusing a self-attention transformer. Our language-topology alignment supports\nnatural language querying, e.g., a \"place to cook\" locates the \"kitchen\". We\noutperform the current state-of-the-art on room segmentation by ~20% and room\nclassification by ~12%. 
Our detailed qualitative analysis and ablation studies\nprovide insights into the problem of joint structural and semantic 3D scene\nunderstanding.\n","authors":["Yash Mehan","Kumaraditya Gupta","Rohit Jayanti","Anirudh Govil","Sourav Garg","Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.06442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.12962v2","updated":"2024-04-09T16:39:00Z","published":"2021-10-25T13:56:00Z","title":"Event Data Association via Robust Model Fitting for Event-based Object\n Tracking","summary":" Event-based approaches, which are based on bio-inspired asynchronous event\ncameras, have achieved promising performance on various computer vision tasks.\nHowever, the study of the fundamental event data association problem is still\nin its infancy. In this paper, we propose a novel Event Data Association\n(called EDA) approach to explicitly address the event association and fusion\nproblem. The proposed EDA seeks for event trajectories that best fit the event\ndata, in order to perform unifying data association and information fusion. In\nEDA, we first asynchronously fuse the event data based on its information\nentropy. Then, we introduce a deterministic model hypothesis generation\nstrategy, which effectively generates model hypotheses from the fused events,\nto represent the corresponding event trajectories. After that, we present a\ntwo-stage weighting algorithm, which robustly weighs and selects true models\nfrom the generated model hypotheses, through multi-structural geometric model\nfitting. Meanwhile, we also propose an adaptive model selection strategy to\nautomatically determine the number of the true models. Finally, we use the\nselected true models to associate and fuse the event data, without being\naffected by sensor noise and irrelevant structures. We evaluate the performance\nof the proposed EDA on the object tracking task. The experimental results show\nthe effectiveness of EDA under challenging scenarios, such as high speed,\nmotion blur, and high dynamic range conditions.\n","authors":["Haosheng Chen","Shuyuan Lin","Yan Yan","Hanzi Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2110.12962v2.pdf","comment":"32 pages, 7 figures"},{"id":"http://arxiv.org/abs/2403.02408v2","updated":"2024-04-09T16:35:41Z","published":"2024-03-04T19:06:13Z","title":"A Spatio-temporal Aligned SUNet Model for Low-light Video Enhancement","summary":" Distortions caused by low-light conditions are not only visually unpleasant\nbut also degrade the performance of computer vision tasks. The restoration and\nenhancement have proven to be highly beneficial. However, there are only a\nlimited number of enhancement methods explicitly designed for videos acquired\nin low-light conditions. We propose a Spatio-Temporal Aligned SUNet (STA-SUNet)\nmodel using a Swin Transformer as a backbone to capture low light video\nfeatures and exploit their spatio-temporal correlations. The STA-SUNet model is\ntrained on a novel, fully registered dataset (BVI), which comprises dynamic\nscenes captured under varying light conditions. It is further analysed\ncomparatively against various other models over three test datasets. The model\ndemonstrates superior adaptivity across all datasets, obtaining the highest\nPSNR and SSIM values. 
It is particularly effective in extreme low-light\nconditions, yielding fairly good visualisation results.\n","authors":["Ruirui Lin","Nantheera Anantrasirichai","Alexandra Malyugina","David Bull"],"pdf_url":"https://arxiv.org/pdf/2403.02408v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03108v3","updated":"2024-04-09T16:31:33Z","published":"2023-07-06T16:27:39Z","title":"DIAGNOSIS: Detecting Unauthorized Data Usages in Text-to-image Diffusion\n Models","summary":" Recent text-to-image diffusion models have shown surprising performance in\ngenerating high-quality images. However, concerns have arisen regarding the\nunauthorized data usage during the training or fine-tuning process. One example\nis when a model trainer collects a set of images created by a particular artist\nand attempts to train a model capable of generating similar images without\nobtaining permission and giving credit to the artist. To address this issue, we\npropose a method for detecting such unauthorized data usage by planting the\ninjected memorization into the text-to-image diffusion models trained on the\nprotected dataset. Specifically, we modify the protected images by adding\nunique contents on these images using stealthy image warping functions that are\nnearly imperceptible to humans but can be captured and memorized by diffusion\nmodels. By analyzing whether the model has memorized the injected content\n(i.e., whether the generated images are processed by the injected\npost-processing function), we can detect models that had illegally utilized the\nunauthorized data. Experiments on Stable Diffusion and VQ Diffusion with\ndifferent model training or fine-tuning methods (i.e, LoRA, DreamBooth, and\nstandard training) demonstrate the effectiveness of our proposed method in\ndetecting unauthorized data usages. Code:\nhttps://github.com/ZhentingWang/DIAGNOSIS.\n","authors":["Zhenting Wang","Chen Chen","Lingjuan Lyu","Dimitris N. Metaxas","Shiqing Ma"],"pdf_url":"https://arxiv.org/pdf/2307.03108v3.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2404.06437v1","updated":"2024-04-09T16:28:54Z","published":"2024-04-09T16:28:54Z","title":"Seasonal Fire Prediction using Spatio-Temporal Deep Neural Networks","summary":" With climate change expected to exacerbate fire weather conditions, the\naccurate anticipation of wildfires on a global scale becomes increasingly\ncrucial for disaster mitigation. In this study, we utilize SeasFire, a\ncomprehensive global wildfire dataset with climate, vegetation, oceanic\nindices, and human-related variables, to enable seasonal wildfire forecasting\nwith machine learning. For the predictive analysis, we train deep learning\nmodels with different architectures that capture the spatio-temporal context\nleading to wildfires. Our investigation focuses on assessing the effectiveness\nof these models in predicting the presence of burned areas at varying\nforecasting time horizons globally, extending up to six months into the future,\nand on how different spatial or/and temporal context affects the performance of\nthe models. Our findings demonstrate the great potential of deep learning\nmodels in seasonal fire forecasting; longer input time-series leads to more\nrobust predictions across varying forecasting horizons, while integrating\nspatial information to capture wildfire spatio-temporal dynamics boosts\nperformance. 
Finally, our results hint that in order to enhance performance at\nlonger forecasting horizons, a larger receptive field spatially needs to be\nconsidered.\n","authors":["Dimitrios Michail","Lefki-Ioanna Panagiotou","Charalampos Davalas","Ioannis Prapas","Spyros Kondylatos","Nikolaos Ioannis Bountos","Ioannis Papoutsis"],"pdf_url":"https://arxiv.org/pdf/2404.06437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06430v1","updated":"2024-04-09T16:23:01Z","published":"2024-04-09T16:23:01Z","title":"pfl-research: simulation framework for accelerating research in Private\n Federated Learning","summary":" Federated learning (FL) is an emerging machine learning (ML) training\nparadigm where clients own their data and collaborate to train a global model,\nwithout revealing any data to the server and other participants. Researchers\ncommonly perform experiments in a simulation environment to quickly iterate on\nideas. However, existing open-source tools do not offer the efficiency required\nto simulate FL on larger and more realistic FL datasets. We introduce\npfl-research, a fast, modular, and easy-to-use Python framework for simulating\nFL. It supports TensorFlow, PyTorch, and non-neural network models, and is\ntightly integrated with state-of-the-art privacy algorithms. We study the speed\nof open-source FL frameworks and show that pfl-research is 7-72$\\times$ faster\nthan alternative open-source frameworks on common cross-device setups. Such\nspeedup will significantly boost the productivity of the FL research community\nand enable testing hypotheses on realistic FL datasets that were previously too\nresource intensive. We release a suite of benchmarks that evaluates an\nalgorithm's overall performance on a diverse set of realistic scenarios. The\ncode is available on GitHub at https://github.com/apple/pfl-research.\n","authors":["Filip Granqvist","Congzheng Song","Áine Cahill","Rogier van Dalen","Martin Pelikan","Yi Sheng Chan","Xiaojun Feng","Natarajan Krishnaswami","Vojta Jina","Mona Chitnis"],"pdf_url":"https://arxiv.org/pdf/2404.06430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06429v1","updated":"2024-04-09T16:20:03Z","published":"2024-04-09T16:20:03Z","title":"Magic-Boost: Boost 3D Generation with Mutli-View Conditioned Diffusion","summary":" Benefiting from the rapid development of 2D diffusion models, 3D content\ncreation has made significant progress recently. One promising solution\ninvolves the fine-tuning of pre-trained 2D diffusion models to harness their\ncapacity for producing multi-view images, which are then lifted into accurate\n3D models via methods like fast-NeRFs or large reconstruction models. However,\nas inconsistency still exists and limited generated resolution, the generation\nresults of such methods still lack intricate textures and complex geometries.\nTo solve this problem, we propose Magic-Boost, a multi-view conditioned\ndiffusion model that significantly refines coarse generative results through a\nbrief period of SDS optimization ($\\sim15$min). Compared to the previous text\nor single image based diffusion models, Magic-Boost exhibits a robust\ncapability to generate images with high consistency from pseudo synthesized\nmulti-view images. It provides precise SDS guidance that well aligns with the\nidentity of the input images, enriching the local detail in both geometry and\ntexture of the initial generative results. 
Extensive experiments show\nMagic-Boost greatly enhances the coarse inputs and generates high-quality 3D\nassets with rich geometric and textural details. (Project Page:\nhttps://magic-research.github.io/magic-boost/)\n","authors":["Fan Yang","Jianfeng Zhang","Yichun Shi","Bowen Chen","Chenxu Zhang","Huichao Zhang","Xiaofeng Yang","Jiashi Feng","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2404.06429v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06425v1","updated":"2024-04-09T16:15:03Z","published":"2024-04-09T16:15:03Z","title":"ZeST: Zero-Shot Material Transfer from a Single Image","summary":" We propose ZeST, a method for zero-shot material transfer to an object in the\ninput image given a material exemplar image. ZeST leverages existing diffusion\nadapters to extract implicit material representation from the exemplar image.\nThis representation is used to transfer the material using pre-trained\ninpainting diffusion model on the object in the input image using depth\nestimates as geometry cue and grayscale object shading as illumination cues.\nThe method works on real images without any training resulting a zero-shot\napproach. Both qualitative and quantitative results on real and synthetic\ndatasets demonstrate that ZeST outputs photorealistic images with transferred\nmaterials. We also show the application of ZeST to perform multiple edits and\nrobust material assignment under different illuminations. Project Page:\nhttps://ttchengab.github.io/zest\n","authors":["Ta-Ying Cheng","Prafull Sharma","Andrew Markham","Niki Trigoni","Varun Jampani"],"pdf_url":"https://arxiv.org/pdf/2404.06425v1.pdf","comment":"Project Page: https://ttchengab.github.io/zest"},{"id":"http://arxiv.org/abs/2404.06406v1","updated":"2024-04-09T15:54:03Z","published":"2024-04-09T15:54:03Z","title":"Emergent Dynamics in Neural Cellular Automata","summary":" Neural Cellular Automata (NCA) models are trainable variations of traditional\nCellular Automata (CA). Emergent motion in the patterns created by NCA has been\nsuccessfully applied to synthesize dynamic textures. However, the conditions\nrequired for an NCA to display dynamic patterns remain unexplored. Here, we\ninvestigate the relationship between the NCA architecture and the emergent\ndynamics of the trained models. Specifically, we vary the number of channels in\nthe cell state and the number of hidden neurons in the MultiLayer Perceptron\n(MLP), and draw a relationship between the combination of these two variables\nand the motion strength between successive frames. Our analysis reveals that\nthe disparity and proportionality between these two variables have a strong\ncorrelation with the emergent dynamics in the NCA output. We thus propose a\ndesign principle for creating dynamic NCA.\n","authors":["Yitao Xu","Ehsan Pajouheshgar","Sabine Süsstrunk"],"pdf_url":"https://arxiv.org/pdf/2404.06406v1.pdf","comment":"2 pages"},{"id":"http://arxiv.org/abs/2312.09168v3","updated":"2024-04-09T15:47:56Z","published":"2023-12-14T17:34:53Z","title":"DiffusionLight: Light Probes for Free by Painting a Chrome Ball","summary":" We present a simple yet effective technique to estimate lighting in a single\ninput image. Current techniques rely heavily on HDR panorama datasets to train\nneural networks to regress an input with limited field-of-view to a full\nenvironment map. 
However, these approaches often struggle with real-world,\nuncontrolled settings due to the limited diversity and size of their datasets.\nTo address this problem, we leverage diffusion models trained on billions of\nstandard images to render a chrome ball into the input image. Despite its\nsimplicity, this task remains challenging: the diffusion models often insert\nincorrect or inconsistent objects and cannot readily generate images in HDR\nformat. Our research uncovers a surprising relationship between the appearance\nof chrome balls and the initial diffusion noise map, which we utilize to\nconsistently generate high-quality chrome balls. We further fine-tune an LDR\ndiffusion model (Stable Diffusion XL) with LoRA, enabling it to perform\nexposure bracketing for HDR light estimation. Our method produces convincing\nlight estimates across diverse settings and demonstrates superior\ngeneralization to in-the-wild scenarios.\n","authors":["Pakkapon Phongthawee","Worameth Chinchuthakun","Nontaphat Sinsunthithet","Amit Raj","Varun Jampani","Pramook Khungurn","Supasorn Suwajanakorn"],"pdf_url":"https://arxiv.org/pdf/2312.09168v3.pdf","comment":"CVPR 2024 Oral. For more information and code, please visit our\n website https://diffusionlight.github.io/"},{"id":"http://arxiv.org/abs/2204.03330v2","updated":"2024-04-09T15:44:05Z","published":"2022-04-07T09:56:36Z","title":"Learning Local and Global Temporal Contexts for Video Semantic\n Segmentation","summary":" Contextual information plays a core role for video semantic segmentation\n(VSS). This paper summarizes contexts for VSS in two-fold: local temporal\ncontexts (LTC) which define the contexts from neighboring frames, and global\ntemporal contexts (GTC) which represent the contexts from the whole video. As\nfor LTC, it includes static and motional contexts, corresponding to static and\nmoving content in neighboring frames, respectively. Previously, both static and\nmotional contexts have been studied. However, there is no research about\nsimultaneously learning static and motional contexts (highly complementary).\nHence, we propose a Coarse-to-Fine Feature Mining (CFFM) technique to learn a\nunified presentation of LTC. CFFM contains two parts: Coarse-to-Fine Feature\nAssembling (CFFA) and Cross-frame Feature Mining (CFM). CFFA abstracts static\nand motional contexts, and CFM mines useful information from nearby frames to\nenhance target features. To further exploit more temporal contexts, we propose\nCFFM++ by additionally learning GTC from the whole video. Specifically, we\nuniformly sample certain frames from the video and extract global contextual\nprototypes by k-means. The information within those prototypes is mined by CFM\nto refine target features. Experimental results on popular benchmarks\ndemonstrate that CFFM and CFFM++ perform favorably against state-of-the-art\nmethods. Our code is available at https://github.com/GuoleiSun/VSS-CFFM\n","authors":["Guolei Sun","Yun Liu","Henghui Ding","Min Wu","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2204.03330v2.pdf","comment":"Accepted to TPAMI, an extended version of a paper published in CVPR\n 2022"},{"id":"http://arxiv.org/abs/2401.16110v2","updated":"2024-04-09T15:33:10Z","published":"2024-01-29T12:31:13Z","title":"SGV3D:Towards Scenario Generalization for Vision-based Roadside 3D\n Object Detection","summary":" Roadside perception can greatly increase the safety of autonomous vehicles by\nextending their perception ability beyond the visual range and addressing blind\nspots. 
However, current state-of-the-art vision-based roadside detection\nmethods possess high accuracy on labeled scenes but have inferior performance\non new scenes. This is because roadside cameras remain stationary after\ninstallation and can only collect data from a single scene, resulting in the\nalgorithm overfitting these roadside backgrounds and camera poses. To address\nthis issue, in this paper, we propose an innovative Scenario Generalization\nFramework for Vision-based Roadside 3D Object Detection, dubbed SGV3D.\nSpecifically, we employ a Background-suppressed Module (BSM) to mitigate\nbackground overfitting in vision-centric pipelines by attenuating background\nfeatures during the 2D to bird's-eye-view projection. Furthermore, by\nintroducing the Semi-supervised Data Generation Pipeline (SSDG) using unlabeled\nimages from new scenes, diverse instance foregrounds with varying camera poses\nare generated, addressing the risk of overfitting specific camera poses. We\nevaluate our method on two large-scale roadside benchmarks. Our method\nsurpasses all previous methods by a significant margin in new scenes, including\n+42.57% for vehicle, +5.87% for pedestrian, and +14.89% for cyclist compared to\nBEVHeight on the DAIR-V2X-I heterologous benchmark. On the larger-scale Rope3D\nheterologous benchmark, we achieve notable gains of 14.48% for car and 12.41%\nfor large vehicle. We aspire to contribute insights on the exploration of\nroadside perception techniques, emphasizing their capability for scenario\ngeneralization. The code will be available at\nhttps://github.com/yanglei18/SGV3D\n","authors":["Lei Yang","Xinyu Zhang","Jun Li","Li Wang","Chuang Zhang","Li Ju","Zhiwei Li","Yang Shen"],"pdf_url":"https://arxiv.org/pdf/2401.16110v2.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.06389v1","updated":"2024-04-09T15:31:48Z","published":"2024-04-09T15:31:48Z","title":"Raster Forge: Interactive Raster Manipulation Library and GUI for Python","summary":" Raster Forge is a Python library and graphical user interface for raster data\nmanipulation and analysis. The tool is focused on remote sensing applications,\nparticularly in wildfire management. It allows users to import, visualize, and\nprocess raster layers for tasks such as image compositing or topographical\nanalysis. For wildfire management, it generates fuel maps using predefined\nmodels. Its impact extends from disaster management to hydrological modeling,\nagriculture, and environmental monitoring. Raster Forge can be a valuable asset\nfor geoscientists and researchers who rely on raster data analysis, enhancing\ngeospatial data processing and visualization across various disciplines.\n","authors":["Afonso Oliveira","Nuno Fachada","João P. Matos-Carvalho"],"pdf_url":"https://arxiv.org/pdf/2404.06389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20330v2","updated":"2024-04-09T15:17:50Z","published":"2024-03-29T17:59:34Z","title":"Are We on the Right Way for Evaluating Large Vision-Language Models?","summary":" Large vision-language models (LVLMs) have recently achieved rapid progress,\nsparking numerous studies to evaluate their multi-modal capabilities. However,\nwe dig into current evaluation works and identify two primary issues: 1) Visual\ncontent is unnecessary for many samples. The answers can be directly inferred\nfrom the questions and options, or the world knowledge embedded in LLMs. This\nphenomenon is prevalent across current benchmarks. 
For instance, GeminiPro\nachieves 42.9% on the MMMU benchmark without any visual input, and outperforms\nthe random choice baseline across six benchmarks over 24% on average. 2)\nUnintentional data leakage exists in LLM and LVLM training. LLM and LVLM could\nstill answer some visual-necessary questions without visual content, indicating\nthe memorizing of these samples within large-scale training data. For example,\nSphinx-X-MoE gets 43.6% on MMMU without accessing images, surpassing its LLM\nbackbone with 17.9%. Both problems lead to misjudgments of actual multi-modal\ngains and potentially misguide the study of LVLM. To this end, we present\nMMStar, an elite vision-indispensable multi-modal benchmark comprising 1,500\nsamples meticulously selected by humans. MMStar benchmarks 6 core capabilities\nand 18 detailed axes, aiming to evaluate LVLMs' multi-modal capacities with\ncarefully balanced and purified samples. These samples are first roughly\nselected from current benchmarks with an automated pipeline, human review is\nthen involved to ensure each curated sample exhibits visual dependency, minimal\ndata leakage, and requires advanced multi-modal capabilities. Moreover, two\nmetrics are developed to measure data leakage and actual performance gain in\nmulti-modal training. We evaluate 16 leading LVLMs on MMStar to assess their\nmulti-modal capabilities, and on 7 benchmarks with the proposed metrics to\ninvestigate their data leakage and actual multi-modal gain.\n","authors":["Lin Chen","Jinsong Li","Xiaoyi Dong","Pan Zhang","Yuhang Zang","Zehui Chen","Haodong Duan","Jiaqi Wang","Yu Qiao","Dahua Lin","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.20330v2.pdf","comment":"Project page: https://mmstar-benchmark.github.io/"},{"id":"http://arxiv.org/abs/2403.04198v2","updated":"2024-04-09T15:07:08Z","published":"2024-03-07T03:59:47Z","title":"CN-RMA: Combined Network with Ray Marching Aggregation for 3D Indoors\n Object Detection from Multi-view Images","summary":" This paper introduces CN-RMA, a novel approach for 3D indoor object detection\nfrom multi-view images. We observe the key challenge as the ambiguity of image\nand 3D correspondence without explicit geometry to provide occlusion\ninformation. To address this issue, CN-RMA leverages the synergy of 3D\nreconstruction networks and 3D object detection networks, where the\nreconstruction network provides a rough Truncated Signed Distance Function\n(TSDF) and guides image features to vote to 3D space correctly in an end-to-end\nmanner. Specifically, we associate weights to sampled points of each ray\nthrough ray marching, representing the contribution of a pixel in an image to\ncorresponding 3D locations. Such weights are determined by the predicted signed\ndistances so that image features vote only to regions near the reconstructed\nsurface. Our method achieves state-of-the-art performance in 3D object\ndetection from multi-view images, as measured by mAP@0.25 and mAP@0.5 on the\nScanNet and ARKitScenes datasets. 
The code and models are released at\nhttps://github.com/SerCharles/CN-RMA.\n","authors":["Guanlin Shen","Jingwei Huang","Zhihua Hu","Bin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.04198v2.pdf","comment":"CVPR2024 poster paper, 8 pages of main part, and 4 pages of\n supplementary material"},{"id":"http://arxiv.org/abs/2311.06798v2","updated":"2024-04-09T15:07:02Z","published":"2023-11-12T10:21:04Z","title":"MetaMix: Meta-state Precision Searcher for Mixed-precision Activation\n Quantization","summary":" Mixed-precision quantization of efficient networks often suffer from\nactivation instability encountered in the exploration of bit selections. To\naddress this problem, we propose a novel method called MetaMix which consists\nof bit selection and weight training phases. The bit selection phase iterates\ntwo steps, (1) the mixed-precision-aware weight update, and (2) the bit-search\ntraining with the fixed mixed-precision-aware weights, both of which combined\nreduce activation instability in mixed-precision quantization and contribute to\nfast and high-quality bit selection. The weight training phase exploits the\nweights and step sizes trained in the bit selection phase and fine-tunes them\nthereby offering fast training. Our experiments with efficient and\nhard-to-quantize networks, i.e., MobileNet v2 and v3, and ResNet-18 on ImageNet\nshow that our proposed method pushes the boundary of mixed-precision\nquantization, in terms of accuracy vs. operations, by outperforming both mixed-\nand single-precision SOTA methods.\n","authors":["Han-Byul Kim","Joo Hyung Lee","Sungjoo Yoo","Hong-Seok Kim"],"pdf_url":"https://arxiv.org/pdf/2311.06798v2.pdf","comment":"Proc. The 38th Annual AAAI Conference on Artificial Intelligence\n (AAAI)"},{"id":"http://arxiv.org/abs/2404.06369v1","updated":"2024-04-09T15:05:48Z","published":"2024-04-09T15:05:48Z","title":"VISION2UI: A Real-World Dataset with Layout for Code Generation from UI\n Designs","summary":" Automatically generating UI code from webpage design visions can\nsignificantly alleviate the burden of developers, enabling beginner developers\nor designers to directly generate Web pages from design diagrams. Currently,\nprior research has accomplished the objective of generating UI code from\nrudimentary design visions or sketches through designing deep neural networks.\nInspired by the groundbreaking advancements achieved by Multimodal Large\nLanguage Models (MLLMs), the automatic generation of UI code from high-fidelity\ndesign images is now emerging as a viable possibility. Nevertheless, our\ninvestigation reveals that existing MLLMs are hampered by the scarcity of\nauthentic, high-quality, and large-scale datasets, leading to unsatisfactory\nperformance in automated UI code generation. To mitigate this gap, we present a\nnovel dataset, termed VISION2UI, extracted from real-world scenarios, augmented\nwith comprehensive layout information, tailored specifically for finetuning\nMLLMs in UI code generation. Specifically, this dataset is derived through a\nseries of operations, encompassing collecting, cleaning, and filtering of the\nopen-source Common Crawl dataset. In order to uphold its quality, a neural\nscorer trained on labeled samples is utilized to refine the data, retaining\nhigher-quality instances. Ultimately, this process yields a dataset comprising\n2,000 (Much more is coming soon) parallel samples encompassing design visions\nand UI code. 
The dataset is available at\nhttps://huggingface.co/datasets/xcodemind/vision2ui.\n","authors":["Yi Gui","Zhen Li","Yao Wan","Yemin Shi","Hongyu Zhang","Yi Su","Shaoling Dong","Xing Zhou","Wenbin Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.06369v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06365v1","updated":"2024-04-09T15:02:01Z","published":"2024-04-09T15:02:01Z","title":"Dynamic Resolution Guidance for Facial Expression Recognition","summary":" Facial expression recognition (FER) is vital for human-computer interaction\nand emotion analysis, yet recognizing expressions in low-resolution images\nremains challenging. This paper introduces a practical method called Dynamic\nResolution Guidance for Facial Expression Recognition (DRGFER) to effectively\nrecognize facial expressions in images with varying resolutions without\ncompromising FER model accuracy. Our framework comprises two main components:\nthe Resolution Recognition Network (RRN) and the Multi-Resolution Adaptation\nFacial Expression Recognition Network (MRAFER). The RRN determines image\nresolution, outputs a binary vector, and the MRAFER assigns images to suitable\nfacial expression recognition networks based on resolution. We evaluated DRGFER\non widely-used datasets RAFDB and FERPlus, demonstrating that our method\nretains optimal model performance at each resolution and outperforms\nalternative resolution approaches. The proposed framework exhibits robustness\nagainst resolution variations and facial expressions, offering a promising\nsolution for real-world applications.\n","authors":["Jie Ou","Xu Li","Tianxiang Jiang","Yuanlun Xie"],"pdf_url":"https://arxiv.org/pdf/2404.06365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06362v1","updated":"2024-04-09T14:56:34Z","published":"2024-04-09T14:56:34Z","title":"Test-Time Adaptation with SaLIP: A Cascade of SAM and CLIP for Zero shot\n Medical Image Segmentation","summary":" The Segment Anything Model (SAM) and CLIP are remarkable vision foundation\nmodels (VFMs). SAM, a prompt driven segmentation model, excels in segmentation\ntasks across diverse domains, while CLIP is renowned for its zero shot\nrecognition capabilities. However, their unified potential has not yet been\nexplored in medical image segmentation. To adapt SAM to medical imaging,\nexisting methods primarily rely on tuning strategies that require extensive\ndata or prior prompts tailored to the specific task, making it particularly\nchallenging when only a limited number of data samples are available. This work\npresents an in depth exploration of integrating SAM and CLIP into a unified\nframework for medical image segmentation. Specifically, we propose a simple\nunified framework, SaLIP, for organ segmentation. Initially, SAM is used for\npart based segmentation within the image, followed by CLIP to retrieve the mask\ncorresponding to the region of interest (ROI) from the pool of SAM generated\nmasks. Finally, SAM is prompted by the retrieved ROI to segment a specific\norgan. Thus, SaLIP is training and fine tuning free and does not rely on domain\nexpertise or labeled data for prompt engineering. Our method shows substantial\nenhancements in zero shot segmentation, showcasing notable improvements in DICE\nscores across diverse segmentation tasks like brain (63.46%), lung (50.11%),\nand fetal head (30.82%), when compared to un prompted SAM. 
Code and text\nprompts will be available online.\n","authors":["Sidra Aleem","Fangyijie Wang","Mayug Maniparambil","Eric Arazo","Julia Dietlmeier","Kathleen Curran","Noel E. O'Connor","Suzanne Little"],"pdf_url":"https://arxiv.org/pdf/2404.06362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06353v1","updated":"2024-04-09T14:44:12Z","published":"2024-04-09T14:44:12Z","title":"High Noise Scheduling is a Must","summary":" Consistency models possess high capabilities for image generation, advancing\nsampling steps to a single step through their advanced techniques. Current\nadvancements move one step forward consistency training techniques and\neliminates the limitation of distillation training. Even though the proposed\ncurriculum and noise scheduling in improved training techniques yield better\nresults than basic consistency models, it lacks well balanced noise\ndistribution and its consistency between curriculum. In this study, it is\ninvestigated the balance between high and low noise levels in noise\ndistribution and offered polynomial noise distribution to maintain the\nstability. This proposed polynomial noise distribution is also supported with a\npredefined Karras noises to prevent unique noise levels arises with Karras\nnoise generation algorithm. Furthermore, by elimination of learned noisy steps\nwith a curriculum based on sinusoidal function increase the performance of the\nmodel in denoising. To make a fair comparison with the latest released\nconsistency model training techniques, experiments are conducted with same\nhyper-parameters except curriculum and noise distribution. The models utilized\nduring experiments are determined with low depth to prove the robustness of our\nproposed technique. The results show that the polynomial noise distribution\noutperforms the model trained with log-normal noise distribution, yielding a\n33.54 FID score after 100,000 training steps with constant discretization\nsteps. Additionally, the implementation of a sinusoidal-based curriculum\nenhances denoising performance, resulting in a FID score of 30.48.\n","authors":["Mahmut S. Gokmen","Cody Bumgardner","Jie Zhang","Ge Wang","Jin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06352v1","updated":"2024-04-09T14:43:19Z","published":"2024-04-09T14:43:19Z","title":"DaF-BEVSeg: Distortion-aware Fisheye Camera based Bird's Eye View\n Segmentation with Occlusion Reasoning","summary":" Semantic segmentation is an effective way to perform scene understanding.\nRecently, segmentation in 3D Bird's Eye View (BEV) space has become popular as\nits directly used by drive policy. However, there is limited work on BEV\nsegmentation for surround-view fisheye cameras, commonly used in commercial\nvehicles. As this task has no real-world public dataset and existing synthetic\ndatasets do not handle amodal regions due to occlusion, we create a synthetic\ndataset using the Cognata simulator comprising diverse road types, weather, and\nlighting conditions. We generalize the BEV segmentation to work with any camera\nmodel; this is useful for mixing diverse cameras. We implement a baseline by\napplying cylindrical rectification on the fisheye images and using a standard\nLSS-based BEV segmentation model. 
We demonstrate that we can achieve better\nperformance without undistortion, which has the adverse effects of increased\nruntime due to pre-processing, reduced field-of-view, and resampling artifacts.\nFurther, we introduce a distortion-aware learnable BEV pooling strategy that is\nmore effective for the fisheye cameras. We extend the model with an occlusion\nreasoning module, which is critical for estimating in BEV space. Qualitative\nperformance of DaF-BEVSeg is showcased in the video at\nhttps://streamable.com/ge4v51.\n","authors":["Senthil Yogamani","David Unger","Venkatraman Narayanan","Varun Ravi Kumar"],"pdf_url":"https://arxiv.org/pdf/2404.06352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06351v1","updated":"2024-04-09T14:42:31Z","published":"2024-04-09T14:42:31Z","title":"HPNet: Dynamic Trajectory Forecasting with Historical Prediction\n Attention","summary":" Predicting the trajectories of road agents is essential for autonomous\ndriving systems. The recent mainstream methods follow a static paradigm, which\npredicts the future trajectory by using a fixed duration of historical frames.\nThese methods make the predictions independently even at adjacent time steps,\nwhich leads to potential instability and temporal inconsistency. As successive\ntime steps have largely overlapping historical frames, their forecasting should\nhave intrinsic correlation, such as overlapping predicted trajectories should\nbe consistent, or be different but share the same motion goal depending on the\nroad situation. Motivated by this, in this work, we introduce HPNet, a novel\ndynamic trajectory forecasting method. Aiming for stable and accurate\ntrajectory forecasting, our method leverages not only historical frames\nincluding maps and agent states, but also historical predictions. Specifically,\nwe newly design a Historical Prediction Attention module to automatically\nencode the dynamic relationship between successive predictions. Besides, it\nalso extends the attention range beyond the currently visible window\nbenefitting from the use of historical predictions. The proposed Historical\nPrediction Attention together with the Agent Attention and Mode Attention is\nfurther formulated as the Triple Factorized Attention module, serving as the\ncore design of HPNet.Experiments on the Argoverse and INTERACTION datasets show\nthat HPNet achieves state-of-the-art performance, and generates accurate and\nstable future trajectories. Our code are available at\nhttps://github.com/XiaolongTang23/HPNet.\n","authors":["Xiaolong Tang","Meina Kan","Shiguang Shan","Zhilong Ji","Jinfeng Bai","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06351v1.pdf","comment":"accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.06350v1","updated":"2024-04-09T14:40:54Z","published":"2024-04-09T14:40:54Z","title":"Rolling Shutter Correction with Intermediate Distortion Flow Estimation","summary":" This paper proposes to correct the rolling shutter (RS) distorted images by\nestimating the distortion flow from the global shutter (GS) to RS directly.\nExisting methods usually perform correction using the undistortion flow from\nthe RS to GS. They initially predict the flow from consecutive RS frames,\nsubsequently rescaling it as the displacement fields from the RS frame to the\nunderlying GS image using time-dependent scaling factors. Following this,\nRS-aware forward warping is employed to convert the RS image into its GS\ncounterpart. Nevertheless, this strategy is prone to two shortcomings. 
First,\nthe undistortion flow estimation is rendered inaccurate by merely linear\nscaling the flow, due to the complex non-linear motion nature. Second, RS-aware\nforward warping often results in unavoidable artifacts. To address these\nlimitations, we introduce a new framework that directly estimates the\ndistortion flow and rectifies the RS image with the backward warping operation.\nMore specifically, we first propose a global correlation-based flow attention\nmechanism to estimate the initial distortion flow and GS feature jointly, which\nare then refined by the following coarse-to-fine decoder layers. Additionally,\na multi-distortion flow prediction strategy is integrated to mitigate the issue\nof inaccurate flow estimation further. Experimental results validate the\neffectiveness of the proposed method, which outperforms state-of-the-art\napproaches on various benchmarks while maintaining high efficiency. The project\nis available at \\url{https://github.com/ljzycmd/DFRSC}.\n","authors":["Mingdeng Cao","Sidi Yang","Yujiu Yang","Yinqiang Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.06350v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2403.20035v2","updated":"2024-04-09T14:29:10Z","published":"2024-03-29T08:03:42Z","title":"UltraLight VM-UNet: Parallel Vision Mamba Significantly Reduces\n Parameters for Skin Lesion Segmentation","summary":" Traditionally for improving the segmentation performance of models, most\napproaches prefer to use adding more complex modules. And this is not suitable\nfor the medical field, especially for mobile medical devices, where\ncomputationally loaded models are not suitable for real clinical environments\ndue to computational resource constraints. Recently, state-space models (SSMs),\nrepresented by Mamba, have become a strong competitor to traditional CNNs and\nTransformers. In this paper, we deeply explore the key elements of parameter\ninfluence in Mamba and propose an UltraLight Vision Mamba UNet (UltraLight\nVM-UNet) based on this. Specifically, we propose a method for processing\nfeatures in parallel Vision Mamba, named PVM Layer, which achieves excellent\nperformance with the lowest computational load while keeping the overall number\nof processing channels constant. We conducted comparisons and ablation\nexperiments with several state-of-the-art lightweight models on three skin\nlesion public datasets and demonstrated that the UltraLight VM-UNet exhibits\nthe same strong performance competitiveness with parameters of only 0.049M and\nGFLOPs of 0.060. In addition, this study deeply explores the key elements of\nparameter influence in Mamba, which will lay a theoretical foundation for Mamba\nto possibly become a new mainstream module for lightweighting in the future.\nThe code is available from https://github.com/wurenkai/UltraLight-VM-UNet .\n","authors":["Renkai Wu","Yinghao Liu","Pengchen Liang","Qing Chang"],"pdf_url":"https://arxiv.org/pdf/2403.20035v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06337v1","updated":"2024-04-09T14:22:50Z","published":"2024-04-09T14:22:50Z","title":"Matching 2D Images in 3D: Metric Relative Pose from Metric\n Correspondences","summary":" Given two images, we can estimate the relative camera pose between them by\nestablishing image-to-image correspondences. Usually, correspondences are\n2D-to-2D and the pose we estimate is defined only up to scale. 
Some\napplications, aiming at instant augmented reality anywhere, require\nscale-metric pose estimates, and hence, they rely on external depth estimators\nto recover the scale. We present MicKey, a keypoint matching pipeline that is\nable to predict metric correspondences in 3D camera space. By learning to match\n3D coordinates across images, we are able to infer the metric relative pose\nwithout depth measurements. Depth measurements are also not required for\ntraining, nor are scene reconstructions or image overlap information. MicKey is\nsupervised only by pairs of images and their relative poses. MicKey achieves\nstate-of-the-art performance on the Map-Free Relocalisation benchmark while\nrequiring less supervision than competing approaches.\n","authors":["Axel Barroso-Laguna","Sowmya Munukutla","Victor Adrian Prisacariu","Eric Brachmann"],"pdf_url":"https://arxiv.org/pdf/2404.06337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04818v4","updated":"2024-04-09T14:15:32Z","published":"2023-11-08T16:42:14Z","title":"Cross-Silo Federated Learning Across Divergent Domains with Iterative\n Parameter Alignment","summary":" Learning from the collective knowledge of data dispersed across private\nsources can provide neural networks with enhanced generalization capabilities.\nFederated learning, a method for collaboratively training a machine learning\nmodel across remote clients, achieves this by combining client models via the\norchestration of a central server. However, current approaches face two\ncritical limitations: i) they struggle to converge when client domains are\nsufficiently different, and ii) current aggregation techniques produce an\nidentical global model for each client. In this work, we address these issues\nby reformulating the typical federated learning setup: rather than learning a\nsingle global model, we learn N models each optimized for a common objective.\nTo achieve this, we apply a weighted distance minimization to model parameters\nshared in a peer-to-peer topology. The resulting framework, Iterative Parameter\nAlignment, applies naturally to the cross-silo setting, and has the following\nproperties: (i) a unique solution for each participant, with the option to\nglobally converge each model in the federation, and (ii) an optional\nearly-stopping mechanism to elicit fairness among peers in collaborative\nlearning settings. These characteristics jointly provide a flexible new\nframework for iteratively learning from peer models trained on disparate\ndatasets. We find that the technique achieves competitive results on a variety\nof data partitions compared to state-of-the-art approaches. Further, we show\nthat the method is robust to divergent domains (i.e. disjoint classes across\npeers) where existing approaches struggle.\n","authors":["Matt Gorbett","Hossein Shirazi","Indrakshi Ray"],"pdf_url":"https://arxiv.org/pdf/2311.04818v4.pdf","comment":"Published at IEEE Big Data 2023"},{"id":"http://arxiv.org/abs/2402.18078v2","updated":"2024-04-09T14:12:02Z","published":"2024-02-28T06:07:07Z","title":"Coarse-to-Fine Latent Diffusion for Pose-Guided Person Image Synthesis","summary":" Diffusion model is a promising approach to image generation and has been\nemployed for Pose-Guided Person Image Synthesis (PGPIS) with competitive\nperformance. While existing methods simply align the person appearance to the\ntarget pose, they are prone to overfitting due to the lack of a high-level\nsemantic understanding on the source person image. 
In this paper, we propose a\nnovel Coarse-to-Fine Latent Diffusion (CFLD) method for PGPIS. In the absence\nof image-caption pairs and textual prompts, we develop a novel training\nparadigm purely based on images to control the generation process of a\npre-trained text-to-image diffusion model. A perception-refined decoder is\ndesigned to progressively refine a set of learnable queries and extract\nsemantic understanding of person images as a coarse-grained prompt. This allows\nfor the decoupling of fine-grained appearance and pose information controls at\ndifferent stages, and thus circumventing the potential overfitting problem. To\ngenerate more realistic texture details, a hybrid-granularity attention module\nis proposed to encode multi-scale fine-grained appearance features as bias\nterms to augment the coarse-grained prompt. Both quantitative and qualitative\nexperimental results on the DeepFashion benchmark demonstrate the superiority\nof our method over the state of the arts for PGPIS. Code is available at\nhttps://github.com/YanzuoLu/CFLD.\n","authors":["Yanzuo Lu","Manlin Zhang","Andy J Ma","Xiaohua Xie","Jian-Huang Lai"],"pdf_url":"https://arxiv.org/pdf/2402.18078v2.pdf","comment":"Accepted by CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2401.01558v2","updated":"2024-04-09T13:59:18Z","published":"2024-01-03T06:18:30Z","title":"One-Step Late Fusion Multi-view Clustering with Compressed Subspace","summary":" Late fusion multi-view clustering (LFMVC) has become a rapidly growing class\nof methods in the multi-view clustering (MVC) field, owing to its excellent\ncomputational speed and clustering performance. One bottleneck faced by\nexisting late fusion methods is that they are usually aligned to the average\nkernel function, which makes the clustering performance highly dependent on the\nquality of datasets. Another problem is that they require subsequent k-means\nclustering after obtaining the consensus partition matrix to get the final\ndiscrete labels, and the resulting separation of the label learning and cluster\nstructure optimization processes limits the integrity of these models. To\naddress the above issues, we propose an integrated framework named One-Step\nLate Fusion Multi-view Clustering with Compressed Subspace (OS-LFMVC-CS).\nSpecifically, we use the consensus subspace to align the partition matrix while\noptimizing the partition fusion, and utilize the fused partition matrix to\nguide the learning of discrete labels. A six-step iterative optimization\napproach with verified convergence is proposed. Sufficient experiments on\nmultiple datasets validate the effectiveness and efficiency of our proposed\nmethod.\n","authors":["Qiyuan Ou","Pei Zhang","Sihang Zhou","En Zhu"],"pdf_url":"https://arxiv.org/pdf/2401.01558v2.pdf","comment":"Accepted by ICASSP2024"},{"id":"http://arxiv.org/abs/2403.17881v2","updated":"2024-04-09T13:56:06Z","published":"2024-03-26T17:12:34Z","title":"Deepfake Generation and Detection: A Benchmark and Survey","summary":" In addition to the advancements in deepfake generation, corresponding\ndetection technologies need to continuously evolve to regulate the potential\nmisuse of deepfakes, such as for privacy invasion and phishing attacks. This\nsurvey comprehensively reviews the latest developments in deepfake generation\nand detection, summarizing and analyzing the current state of the art in this\nrapidly evolving field. 
We first unify task definitions, comprehensively\nintroduce datasets and metrics, and discuss the development of generation and\ndetection technology frameworks. Then, we discuss the development of several\nrelated sub-fields and focus on researching four mainstream deepfake fields:\npopular face swap, face reenactment, talking face generation, and facial\nattribute editing, as well as foreign detection. Subsequently, we\ncomprehensively benchmark representative methods on popular datasets for each\nfield, fully evaluating the latest and influential works published in top\nconferences/journals. Finally, we analyze the challenges and future research\ndirections of the discussed fields. We closely follow the latest developments\nin https://github.com/flyingby/Awesome-Deepfake-Generation-and-Detection.\n","authors":["Gan Pei","Jiangning Zhang","Menghan Hu","Zhenyu Zhang","Chengjie Wang","Yunsheng Wu","Guangtao Zhai","Jian Yang","Chunhua Shen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.17881v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05334v3","updated":"2024-04-09T13:54:48Z","published":"2023-09-11T09:32:45Z","title":"MultIOD: Rehearsal-free Multihead Incremental Object Detector","summary":" Class-Incremental learning (CIL) refers to the ability of artificial agents\nto integrate new classes as they appear in a stream. It is particularly\ninteresting in evolving environments where agents have limited access to memory\nand computational resources. The main challenge of incremental learning is\ncatastrophic forgetting, the inability of neural networks to retain past\nknowledge when learning a new one. Unfortunately, most existing\nclass-incremental methods for object detection are applied to two-stage\nalgorithms such as Faster-RCNN, and rely on rehearsal memory to retain past\nknowledge. We argue that those are not suitable in resource-limited\nenvironments, and more effort should be dedicated to anchor-free and\nrehearsal-free object detection. In this paper, we propose MultIOD, a\nclass-incremental object detector based on CenterNet. Our contributions are:\n(1) we propose a multihead feature pyramid and multihead detection architecture\nto efficiently separate class representations, (2) we employ transfer learning\nbetween classes learned initially and those learned incrementally to tackle\ncatastrophic forgetting, and (3) we use a class-wise non-max-suppression as a\npost-processing technique to remove redundant boxes. Results show that our\nmethod outperforms state-of-the-art methods on two Pascal VOC datasets, while\nonly saving the model in its current state, contrary to other\ndistillation-based counterparts.\n","authors":["Eden Belouadah","Arnaud Dapogny","Kevin Bailly"],"pdf_url":"https://arxiv.org/pdf/2309.05334v3.pdf","comment":"Accepted at the archival track of the Workshop on Continual Learning\n in Computer Vision (CVPR 2024)"},{"id":"http://arxiv.org/abs/2401.17053v3","updated":"2024-04-09T13:47:18Z","published":"2024-01-30T14:34:19Z","title":"BlockFusion: Expandable 3D Scene Generation using Latent Tri-plane\n Extrapolation","summary":" We present BlockFusion, a diffusion-based model that generates 3D scenes as\nunit blocks and seamlessly incorporates new blocks to extend the scene.\nBlockFusion is trained using datasets of 3D blocks that are randomly cropped\nfrom complete 3D scene meshes. 
Through per-block fitting, all training blocks\nare converted into the hybrid neural fields: with a tri-plane containing the\ngeometry features, followed by a Multi-layer Perceptron (MLP) for decoding the\nsigned distance values. A variational auto-encoder is employed to compress the\ntri-planes into the latent tri-plane space, on which the denoising diffusion\nprocess is performed. Diffusion applied to the latent representations allows\nfor high-quality and diverse 3D scene generation. To expand a scene during\ngeneration, one needs only to append empty blocks to overlap with the current\nscene and extrapolate existing latent tri-planes to populate new blocks. The\nextrapolation is done by conditioning the generation process with the feature\nsamples from the overlapping tri-planes during the denoising iterations. Latent\ntri-plane extrapolation produces semantically and geometrically meaningful\ntransitions that harmoniously blend with the existing scene. A 2D layout\nconditioning mechanism is used to control the placement and arrangement of\nscene elements. Experimental results indicate that BlockFusion is capable of\ngenerating diverse, geometrically consistent and unbounded large 3D scenes with\nunprecedented high-quality shapes in both indoor and outdoor scenarios.\n","authors":["Zhennan Wu","Yang Li","Han Yan","Taizhang Shang","Weixuan Sun","Senbo Wang","Ruikai Cui","Weizhe Liu","Hiroyuki Sato","Hongdong Li","Pan Ji"],"pdf_url":"https://arxiv.org/pdf/2401.17053v3.pdf","comment":"Video: https://www.youtube.com/watch?v=PxIBtd6G0mA"},{"id":"http://arxiv.org/abs/2403.03309v4","updated":"2024-04-09T13:44:54Z","published":"2024-03-05T20:21:49Z","title":"Learning Zero-Shot Material States Segmentation, by Implanting Natural\n Image Patterns in Synthetic Data","summary":" Visual understanding and segmentation of materials and their states is\nfundamental to understanding the physical world. The myriad textures, shapes,\nand often blurry boundaries formed by materials make this task particularly\nhard to generalize. Whether it's identifying wet regions of a surface, minerals\nin rocks, infected regions in plants, or pollution in water, each material\nstate has its own unique form. For neural nets to learn general class-agnostic\nmaterial segmentation, it is necessary to first collect and annotate data that\ncaptures this complexity. Collecting and manually annotating real-world images\nis limited by the cost and precision of manual labor. In contrast, synthetic\nCGI data is highly accurate and almost cost-free, but fails to replicate the\nvast diversity of the material world. This work offers a method to bridge this\ncrucial gap by implanting patterns extracted from real-world images in\nsynthetic data. Hence, patterns automatically collected from natural images are\nused to map materials into synthetic scenes. This unsupervised approach allows\nthe generated data to capture the vast complexity of the real world while\nmaintaining the precision and scale of synthetic data. We also present the\nfirst general benchmark for zero-shot material state segmentation. The\nbenchmark contains a wide range of real-world images of material states, like\nfood, rocks, construction, plants, liquids, and many others, each in various\nstates (wet/dry/stained/cooked/burned/worn/rusted/sediment/foam, etc.). The\nannotation includes both partial similarity between regions with similar but\nnot identical materials, and hard segmentation of only points in the exact same\nmaterial state. 
We show that net trains on MatSeg significantly outperform\nexisting state-of-the-art methods on this task. The dataset, code, and trained\nmodel are available\n","authors":["Sagi Eppel","Jolina Li","Manuel Drehwald","Alan Aspuru-Guzik"],"pdf_url":"https://arxiv.org/pdf/2403.03309v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18171v5","updated":"2024-04-09T13:42:07Z","published":"2023-05-29T16:02:09Z","title":"Improved Probabilistic Image-Text Representations","summary":" Image-Text Matching (ITM) task, a fundamental vision-language (VL) task,\nsuffers from the inherent ambiguity arising from multiplicity and imperfect\nannotations. Deterministic functions are not sufficiently powerful to capture\nambiguity, prompting the exploration of probabilistic embeddings to tackle the\nchallenge. However, the existing probabilistic ITM approach encounters two key\nshortcomings; the burden of heavy computations due to the Monte Carlo\napproximation, and the loss saturation issue in the face of abundant false\nnegatives. To overcome the issues, this paper presents an improved\nProbabilistic Cross-Modal Embeddings (named PCME++) by introducing a new\nprobabilistic distance with a closed-form solution. In addition, two\noptimization techniques are proposed to enhance PCME++ further: first, the\nincorporation of pseudo-positives to prevent the negative effect under massive\nfalse negatives; second, mixed sample data augmentation for probabilistic\nmatching. Experimental results on MS-COCO Caption and two extended benchmarks,\nCxC and ECCV Caption, demonstrate the effectiveness of PCME++ compared to\nstate-of-the-art ITM methods. The robustness of PCME++ is also evaluated under\nnoisy image-text correspondences. In addition, the potential applicability of\nPCME++ in automatic prompt-filtering for zero-shot classification is shown. The\ncode is available at https://github.com/naver-ai/pcmepp\n","authors":["Sanghyuk Chun"],"pdf_url":"https://arxiv.org/pdf/2305.18171v5.pdf","comment":"ICLR 2024 camera-ready; Code: https://github.com/naver-ai/pcmepp.\n Project page: https://naver-ai.github.io/pcmepp/. 30 pages, 2.2 MB"},{"id":"http://arxiv.org/abs/2404.06309v1","updated":"2024-04-09T13:39:37Z","published":"2024-04-09T13:39:37Z","title":"Audio-Visual Generalized Zero-Shot Learning using Pre-Trained Large\n Multi-Modal Models","summary":" Audio-visual zero-shot learning methods commonly build on features extracted\nfrom pre-trained models, e.g. video or audio classification models. However,\nexisting benchmarks predate the popularization of large multi-modal models,\nsuch as CLIP and CLAP. In this work, we explore such large pre-trained models\nto obtain features, i.e. CLIP for visual features, and CLAP for audio features.\nFurthermore, the CLIP and CLAP text encoders provide class label embeddings\nwhich are combined to boost the performance of the system. We propose a simple\nyet effective model that only relies on feed-forward neural networks,\nexploiting the strong generalization capabilities of the new audio, visual and\ntextual features. Our framework achieves state-of-the-art performance on\nVGGSound-GZSL, UCF-GZSL, and ActivityNet-GZSL with our new features. Code and\ndata available at: https://github.com/dkurzend/ClipClap-GZSL.\n","authors":["David Kurzendörfer","Otniel-Bogdan Mercea","A. 
Sophia Koepke","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2404.06309v1.pdf","comment":"CVPRw 2024 (L3D-IVU)"},{"id":"http://arxiv.org/abs/2309.14265v2","updated":"2024-04-09T13:33:30Z","published":"2023-09-25T16:23:49Z","title":"Industrial Application of 6D Pose Estimation for Robotic Manipulation in\n Automotive Internal Logistics","summary":" Despite the advances in robotics a large proportion of the of parts handling\ntasks in the automotive industry's internal logistics are not automated but\nstill performed by humans. A key component to competitively automate these\nprocesses is a 6D pose estimation that can handle a large number of different\nparts, is adaptable to new parts with little manual effort, and is sufficiently\naccurate and robust with respect to industry requirements. In this context, the\nquestion arises as to the current status quo with respect to these measures. To\naddress this we built a representative 6D pose estimation pipeline with\nstate-of-the-art components from economically scalable real to synthetic data\ngeneration to pose estimators and evaluated it on automotive parts with regards\nto a realistic sequencing process. We found that using the data generation\napproaches, the performance of the trained 6D pose estimators are promising,\nbut do not meet industry requirements. We reveal that the reason for this is\nthe inability of the estimators to provide reliable uncertainties for their\nposes, rather than the ability of to provide sufficiently accurate poses. In\nthis context we further analyzed how RGB- and RGB-D-based approaches compare\nagainst this background and show that they are differently vulnerable to the\ndomain gap induced by synthetic data.\n","authors":["Philipp Quentin","Dino Knoll","Daniel Goehring"],"pdf_url":"https://arxiv.org/pdf/2309.14265v2.pdf","comment":"Accepted for publication at IEEE International Conference on\n Automation Science and Engineering (CASE 2023)"},{"id":"http://arxiv.org/abs/2212.04227v2","updated":"2024-04-09T13:30:15Z","published":"2022-12-08T12:20:35Z","title":"Self-training via Metric Learning for Source-Free Domain Adaptation of\n Semantic Segmentation","summary":" Unsupervised source-free domain adaptation methods aim to train a model for\nthe target domain utilizing a pretrained source-domain model and unlabeled\ntarget-domain data, particularly when accessibility to source data is\nrestricted due to intellectual property or privacy concerns. Traditional\nmethods usually use self-training with pseudo-labeling, which is often\nsubjected to thresholding based on prediction confidence. However, such\nthresholding limits the effectiveness of self-training due to insufficient\nsupervision. This issue becomes more severe in a source-free setting, where\nsupervision comes solely from the predictions of the pre-trained source model.\nIn this study, we propose a novel approach by incorporating a mean-teacher\nmodel, wherein the student network is trained using all predictions from the\nteacher network. Instead of employing thresholding on predictions, we introduce\na method to weight the gradients calculated from pseudo-labels based on the\nreliability of the teacher's predictions. To assess reliability, we introduce a\nnovel approach using proxy-based metric learning. 
Our method is evaluated in\nsynthetic-to-real and cross-city scenarios, demonstrating superior performance\ncompared to existing state-of-the-art methods.\n","authors":["Ibrahim Batuhan Akkaya","Ugur Halici"],"pdf_url":"https://arxiv.org/pdf/2212.04227v2.pdf","comment":"This paper is under consideration at Computer Vision and Image\n Understanding"},{"id":"http://arxiv.org/abs/2404.06294v1","updated":"2024-04-09T13:19:43Z","published":"2024-04-09T13:19:43Z","title":"Fortifying Fully Convolutional Generative Adversarial Networks for Image\n Super-Resolution Using Divergence Measures","summary":" Super-Resolution (SR) is a time-hallowed image processing problem that aims\nto improve the quality of a Low-Resolution (LR) sample up to the standard of\nits High-Resolution (HR) counterpart. We aim to address this by introducing\nSuper-Resolution Generator (SuRGe), a fully-convolutional Generative\nAdversarial Network (GAN)-based architecture for SR. We show that distinct\nconvolutional features obtained at increasing depths of a GAN generator can be\noptimally combined by a set of learnable convex weights to improve the quality\nof generated SR samples. In the process, we employ the Jensen-Shannon and the\nGromov-Wasserstein losses respectively between the SR-HR and LR-SR pairs of\ndistributions to further aid the generator of SuRGe to better exploit the\navailable information in an attempt to improve SR. Moreover, we train the\ndiscriminator of SuRGe with the Wasserstein loss with gradient penalty, to\nprimarily prevent mode collapse. The proposed SuRGe, as an end-to-end GAN\nworkflow tailor-made for super-resolution, offers improved performance while\nmaintaining low inference time. The efficacy of SuRGe is substantiated by its\nsuperior performance compared to 18 state-of-the-art contenders on 10 benchmark\ndatasets.\n","authors":["Arkaprabha Basu","Kushal Bose","Sankha Subhra Mullick","Anish Chakrabarty","Swagatam Das"],"pdf_url":"https://arxiv.org/pdf/2404.06294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02730v3","updated":"2024-04-09T13:18:22Z","published":"2023-07-06T02:30:56Z","title":"Fine-grained Action Analysis: A Multi-modality and Multi-task Dataset of\n Figure Skating","summary":" The fine-grained action analysis of the existing action datasets is\nchallenged by insufficient action categories, low fine granularities, limited\nmodalities, and tasks. In this paper, we propose a Multi-modality and\nMulti-task dataset of Figure Skating (MMFS) which was collected from the World\nFigure Skating Championships. MMFS, which possesses action recognition and\naction quality assessment, captures RGB, skeleton, and is collected the score\nof actions from 11671 clips with 256 categories including spatial and temporal\nlabels. The key contributions of our dataset fall into three aspects as\nfollows. (1) Independently spatial and temporal categories are first proposed\nto further explore fine-grained action recognition and quality assessment. (2)\nMMFS first introduces the skeleton modality for complex fine-grained action\nquality assessment. (3) Our multi-modality and multi-task dataset encourage\nmore action analysis models. 
To benchmark our dataset, we adopt RGB-based and\nskeleton-based baseline methods for action recognition and action quality\nassessment.\n","authors":["Sheng-Lan Liu","Yu-Ning Ding","Gang Yan","Si-Fan Zhang","Jin-Rong Zhang","Wen-Yue Chen","Xue-Hai Xu"],"pdf_url":"https://arxiv.org/pdf/2307.02730v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06287v1","updated":"2024-04-09T13:13:24Z","published":"2024-04-09T13:13:24Z","title":"Counterfactual Reasoning for Multi-Label Image Classification via\n Patching-Based Training","summary":" The key to multi-label image classification (MLC) is to improve model\nperformance by leveraging label correlations. Unfortunately, it has been shown\nthat overemphasizing co-occurrence relationships can cause the overfitting\nissue of the model, ultimately leading to performance degradation. In this\npaper, we provide a causal inference framework to show that the correlative\nfeatures caused by the target object and its co-occurring objects can be\nregarded as a mediator, which has both positive and negative impacts on model\npredictions. On the positive side, the mediator enhances the recognition\nperformance of the model by capturing co-occurrence relationships; on the\nnegative side, it has the harmful causal effect that causes the model to make\nan incorrect prediction for the target object, even when only co-occurring\nobjects are present in an image. To address this problem, we propose a\ncounterfactual reasoning method to measure the total direct effect, achieved by\nenhancing the direct effect caused only by the target object. Due to the\nunknown location of the target object, we propose patching-based training and\ninference to accomplish this goal, which divides an image into multiple patches\nand identifies the pivot patch that contains the target object. Experimental\nresults on multiple benchmark datasets with diverse configurations validate\nthat the proposed method can achieve state-of-the-art performance.\n","authors":["Ming-Kun Xie","Jia-Hao Xiao","Pei Peng","Gang Niu","Masashi Sugiyama","Sheng-Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2404.06287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06279v1","updated":"2024-04-09T13:02:33Z","published":"2024-04-09T13:02:33Z","title":"NoiseNCA: Noisy Seed Improves Spatio-Temporal Continuity of Neural\n Cellular Automata","summary":" Neural Cellular Automata (NCA) is a class of Cellular Automata where the\nupdate rule is parameterized by a neural network that can be trained using\ngradient descent. In this paper, we focus on NCA models used for texture\nsynthesis, where the update rule is inspired by partial differential equations\n(PDEs) describing reaction-diffusion systems. To train the NCA model, the\nspatio-termporal domain is discretized, and Euler integration is used to\nnumerically simulate the PDE. However, whether a trained NCA truly learns the\ncontinuous dynamic described by the corresponding PDE or merely overfits the\ndiscretization used in training remains an open question. We study NCA models\nat the limit where space-time discretization approaches continuity. We find\nthat existing NCA models tend to overfit the training discretization,\nespecially in the proximity of the initial condition, also called \"seed\". To\naddress this, we propose a solution that utilizes uniform noise as the initial\ncondition. We demonstrate the effectiveness of our approach in preserving the\nconsistency of NCA dynamics across a wide range of spatio-temporal\ngranularities. 
Our improved NCA model enables two new test-time interactions by\nallowing continuous control over the speed of pattern formation and the scale\nof the synthesized patterns. We demonstrate this new NCA feature in our\ninteractive online demo. Our work reveals that NCA models can learn continuous\ndynamics and opens new venues for NCA research from a dynamical systems'\nperspective.\n","authors":["Ehsan Pajouheshgar","Yitao Xu","Sabine Süsstrunk"],"pdf_url":"https://arxiv.org/pdf/2404.06279v1.pdf","comment":"9 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.06277v1","updated":"2024-04-09T13:01:26Z","published":"2024-04-09T13:01:26Z","title":"Learning Embeddings with Centroid Triplet Loss for Object Identification\n in Robotic Grasping","summary":" Foundation models are a strong trend in deep learning and computer vision.\nThese models serve as a base for applications as they require minor or no\nfurther fine-tuning by developers to integrate into their applications.\nFoundation models for zero-shot object segmentation such as Segment Anything\n(SAM) output segmentation masks from images without any further object\ninformation. When they are followed in a pipeline by an object identification\nmodel, they can perform object detection without training. Here, we focus on\ntraining such an object identification model. A crucial practical aspect for an\nobject identification model is to be flexible in input size. As object\nidentification is an image retrieval problem, a suitable method should handle\nmulti-query multi-gallery situations without constraining the number of input\nimages (e.g. by having fixed-size aggregation layers). The key solution to\ntrain such a model is the centroid triplet loss (CTL), which aggregates image\nfeatures to their centroids. CTL yields high accuracy, avoids misleading\ntraining signals and keeps the model input size flexible. In our experiments,\nwe establish a new state of the art on the ArmBench object identification task,\nwhich shows general applicability of our model. We furthermore demonstrate an\nintegrated unseen object detection pipeline on the challenging HOPE dataset,\nwhich requires fine-grained detection. There, our pipeline matches and\nsurpasses related methods which have been trained on dataset-specific data.\n","authors":["Anas Gouda","Max Schwarz","Christopher Reining","Sven Behnke","Alice Kirchheim"],"pdf_url":"https://arxiv.org/pdf/2404.06277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04561v2","updated":"2024-04-09T12:50:16Z","published":"2024-04-06T09:01:19Z","title":"Co-Occ: Coupling Explicit Feature Fusion with Volume Rendering\n Regularization for Multi-Modal 3D Semantic Occupancy Prediction","summary":" 3D semantic occupancy prediction is a pivotal task in the field of autonomous\ndriving. Recent approaches have made great advances in 3D semantic occupancy\npredictions on a single modality. However, multi-modal semantic occupancy\nprediction approaches have encountered difficulties in dealing with the\nmodality heterogeneity, modality misalignment, and insufficient modality\ninteractions that arise during the fusion of different modalities data, which\nmay result in the loss of important geometric and semantic information. This\nletter presents a novel multi-modal, i.e., LiDAR-camera 3D semantic occupancy\nprediction framework, dubbed Co-Occ, which couples explicit LiDAR-camera\nfeature fusion with implicit volume rendering regularization. 
The key insight\nis that volume rendering in the feature space can proficiently bridge the gap\nbetween 3D LiDAR sweeps and 2D images while serving as a physical\nregularization to enhance LiDAR-camera fused volumetric representation.\nSpecifically, we first propose a Geometric- and Semantic-aware Fusion\n(GSFusion) module to explicitly enhance LiDAR features by incorporating\nneighboring camera features through a K-nearest neighbors (KNN) search. Then,\nwe employ volume rendering to project the fused feature back to the image\nplanes for reconstructing color and depth maps. These maps are then supervised\nby input images from the camera and depth estimations derived from LiDAR,\nrespectively. Extensive experiments on the popular nuScenes and SemanticKITTI\nbenchmarks verify the effectiveness of our Co-Occ for 3D semantic occupancy\nprediction. The project page is available at\nhttps://rorisis.github.io/Co-Occ_project-page/.\n","authors":["Jingyi Pan","Zipeng Wang","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04561v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06273v1","updated":"2024-04-09T12:48:24Z","published":"2024-04-09T12:48:24Z","title":"Robust Confidence Intervals in Stereo Matching using Possibility Theory","summary":" We propose a method for estimating disparity confidence intervals in stereo\nmatching problems. Confidence intervals provide complementary information to\nusual confidence measures. To the best of our knowledge, this is the first\nmethod creating disparity confidence intervals based on the cost volume. This\nmethod relies on possibility distributions to interpret the epistemic\nuncertainty of the cost volume. Our method has the benefit of having a\nwhite-box nature, differing in this respect from current state-of-the-art deep\nneural networks approaches. The accuracy and size of confidence intervals are\nvalidated using the Middlebury stereo datasets as well as a dataset of\nsatellite images. This contribution is freely available on GitHub.\n","authors":["Roman Malinowski","Emmanuelle Sarrazin","Loïc Dumas","Emmanuel Dubois","Sébastien Destercke"],"pdf_url":"https://arxiv.org/pdf/2404.06273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06270v1","updated":"2024-04-09T12:47:30Z","published":"2024-04-09T12:47:30Z","title":"3D Geometry-aware Deformable Gaussian Splatting for Dynamic View\n Synthesis","summary":" In this paper, we propose a 3D geometry-aware deformable Gaussian Splatting\nmethod for dynamic view synthesis. Existing neural radiance fields (NeRF) based\nsolutions learn the deformation in an implicit manner, which cannot incorporate\n3D scene geometry. Therefore, the learned deformation is not necessarily\ngeometrically coherent, which results in unsatisfactory dynamic view synthesis\nand 3D dynamic reconstruction. Recently, 3D Gaussian Splatting provides a new\nrepresentation of the 3D scene, building upon which the 3D geometry could be\nexploited in learning the complex 3D deformation. Specifically, the scenes are\nrepresented as a collection of 3D Gaussian, where each 3D Gaussian is optimized\nto move and rotate over time to model the deformation. To enforce the 3D scene\ngeometry constraint during deformation, we explicitly extract 3D geometry\nfeatures and integrate them in learning the 3D deformation. In this way, our\nsolution achieves 3D geometry-aware deformation modeling, which enables\nimproved dynamic view synthesis and 3D dynamic reconstruction. 
Extensive\nexperimental results on both synthetic and real datasets prove the superiority\nof our solution, which achieves new state-of-the-art performance.\n The project is available at https://npucvr.github.io/GaGS/\n","authors":["Zhicheng Lu","Xiang Guo","Le Hui","Tianrui Chen","Min Yang","Xiao Tang","Feng Zhu","Yuchao Dai"],"pdf_url":"https://arxiv.org/pdf/2404.06270v1.pdf","comment":"Accepted by CVPR 2024. Project page: https://npucvr.github.io/GaGS/"},{"id":"http://arxiv.org/abs/2404.06265v1","updated":"2024-04-09T12:44:34Z","published":"2024-04-09T12:44:34Z","title":"Spatial-Temporal Multi-level Association for Video Object Segmentation","summary":" Existing semi-supervised video object segmentation methods either focus on\ntemporal feature matching or spatial-temporal feature modeling. However, they\ndo not address the issues of sufficient target interaction and efficient\nparallel processing simultaneously, thereby constraining the learning of\ndynamic, target-aware features. To tackle these limitations, this paper\nproposes a spatial-temporal multi-level association framework, which jointly\nassociates reference frame, test frame, and object features to achieve\nsufficient interaction and parallel target ID association with a\nspatial-temporal memory bank for efficient video object segmentation.\nSpecifically, we construct a spatial-temporal multi-level feature association\nmodule to learn better target-aware features, which formulates feature\nextraction and interaction as the efficient operations of object\nself-attention, reference object enhancement, and test reference correlation.\nIn addition, we propose a spatial-temporal memory to assist feature association\nand temporal ID assignment and correlation. We evaluate the proposed method by\nconducting extensive experiments on numerous video object segmentation\ndatasets, including DAVIS 2016/2017 val, DAVIS 2017 test-dev, and YouTube-VOS\n2018/2019 val. The favorable performance against the state-of-the-art methods\ndemonstrates the effectiveness of our approach. All source code and trained\nmodels will be made publicly available.\n","authors":["Deshui Miao","Xin Li","Zhenyu He","Huchuan Lu","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.06265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07166v2","updated":"2024-04-09T12:40:18Z","published":"2023-10-11T03:29:13Z","title":"Anchor-based Multi-view Subspace Clustering with Hierarchical Feature\n Descent","summary":" Multi-view clustering has attracted growing attention owing to its\ncapabilities of aggregating information from various sources and its promising\nhorizons in public affairs. Up till now, many advanced approaches have been\nproposed in recent literature. However, there are several ongoing difficulties\nto be tackled. One common dilemma occurs while attempting to align the features\nof different views. Moreover, because many existing multi-view\nclustering algorithms stem from spectral clustering, this results in cubic time\ncomplexity w.r.t. the size of the dataset. Hence, we propose Anchor-based\nMulti-view Subspace Clustering with Hierarchical Feature Descent (MVSC-HFD) to\ntackle the discrepancy among views through hierarchical feature descent and\nprojection to a common subspace (STAGE 1), which reveals the dependency of different\nviews. 
We further reduce the computational complexity to linear time cost\nthrough a unified sampling strategy in the common subspace (STAGE 2), followed\nby anchor-based subspace clustering to learn the bipartite graph collectively\n(STAGE 3). Extensive experimental results on public benchmark datasets\ndemonstrate that our proposed model consistently outperforms the\nstate-of-the-art techniques.\n","authors":["Qiyuan Ou","Siwei Wang","Pei Zhang","Sihang Zhou","En Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.07166v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06261v1","updated":"2024-04-09T12:34:28Z","published":"2024-04-09T12:34:28Z","title":"Playing to Vision Foundation Model's Strengths in Stereo Matching","summary":" Stereo matching has become a key technique for 3D environment perception in\nintelligent vehicles. For a considerable time, convolutional neural networks\n(CNNs) have remained the mainstream choice for feature extraction in this\ndomain. Nonetheless, there is a growing consensus that the existing paradigm\nshould evolve towards vision foundation models (VFM), particularly those\ndeveloped based on vision Transformers (ViTs) and pre-trained through\nself-supervision on extensive, unlabeled datasets. While VFMs are adept at\nextracting informative, general-purpose visual features, specifically for dense\nprediction tasks, their performance often falls short in geometric vision tasks. This\nstudy serves as the first exploration of a viable approach for adapting VFMs to\nstereo matching. Our ViT adapter, referred to as ViTAS, is constructed upon\nthree types of modules: spatial differentiation, patch attention fusion, and\ncross-attention. The first module initializes feature pyramids, while the\nlatter two aggregate stereo and multi-scale contextual information into\nfine-grained features, respectively. ViTAStereo, which combines ViTAS with cost\nvolume-based stereo matching back-end processes, achieves the top rank on the\nKITTI Stereo 2012 dataset and outperforms the second-best network StereoBase by\napproximately 7.9% in terms of the percentage of error pixels, with a tolerance\nof 3 pixels. Additional experiments across diverse scenarios further\ndemonstrate its superior generalizability compared to all other\nstate-of-the-art approaches. We believe this new paradigm will pave the way for\nthe next generation of stereo matching networks.\n","authors":["Chuang-Wei Liu","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2404.06261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06258v1","updated":"2024-04-09T12:32:10Z","published":"2024-04-09T12:32:10Z","title":"Robust feature knowledge distillation for enhanced performance of\n lightweight crack segmentation models","summary":" Vision-based crack detection faces deployment challenges due to the size of\nrobust models and edge device limitations. These can be addressed with\nlightweight models trained with knowledge distillation (KD). However,\nstate-of-the-art (SOTA) KD methods compromise anti-noise robustness. This paper\ndevelops Robust Feature Knowledge Distillation (RFKD), a framework to improve\nrobustness while retaining the precision of light models for crack\nsegmentation. RFKD distils knowledge from a teacher model's logit layers and\nintermediate feature maps while leveraging mixed clean and noisy images to\ntransfer robust patterns to the student model, improving its precision,\ngeneralisation, and anti-noise performance. 
To validate the proposed RFKD, a\nlightweight crack segmentation model, PoolingCrack Tiny (PCT), with only 0.5 M\nparameters, is also designed and used as the student to run the framework. The\nresults show a significant enhancement in noisy images, with RFKD reaching a\n62% enhanced mean Dice score (mDS) compared to SOTA KD methods.\n","authors":["Zhaohui Chen","Elyas Asadi Shamsabadi","Sheng Jiang","Luming Shen","Daniel Dias-da-Costa"],"pdf_url":"https://arxiv.org/pdf/2404.06258v1.pdf","comment":"24 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.06256v1","updated":"2024-04-09T12:29:16Z","published":"2024-04-09T12:29:16Z","title":"Label-Efficient 3D Object Detection For Road-Side Units","summary":" Occlusion presents a significant challenge for safety-critical applications\nsuch as autonomous driving. Collaborative perception has recently attracted a\nlarge research interest thanks to the ability to enhance the perception of\nautonomous vehicles via deep information fusion with intelligent roadside units\n(RSU), thus minimizing the impact of occlusion. While significant advancement\nhas been made, the data-hungry nature of these methods creates a major hurdle\nfor their real-world deployment, particularly due to the need for annotated RSU\ndata. Manually annotating the vast amount of RSU data required for training is\nprohibitively expensive, given the sheer number of intersections and the effort\ninvolved in annotating point clouds. We address this challenge by devising a\nlabel-efficient object detection method for RSU based on unsupervised object\ndiscovery. Our paper introduces two new modules: one for object discovery based\non a spatial-temporal aggregation of point clouds, and another for refinement.\nFurthermore, we demonstrate that fine-tuning on a small portion of annotated\ndata allows our object discovery models to narrow the performance gap with, or\neven surpass, fully supervised models. Extensive experiments are carried out in\nsimulated and real-world datasets to evaluate our method.\n","authors":["Minh-Quan Dao","Holger Caesar","Julie Stephany Berrio","Mao Shan","Stewart Worrall","Vincent Frémont","Ezio Malis"],"pdf_url":"https://arxiv.org/pdf/2404.06256v1.pdf","comment":"IV 2024"},{"id":"http://arxiv.org/abs/2404.06253v1","updated":"2024-04-09T12:25:06Z","published":"2024-04-09T12:25:06Z","title":"From Barlow Twins to Triplet Training: Differentiating Dementia with\n Limited Data","summary":" Differential diagnosis of dementia is challenging due to overlapping\nsymptoms, with structural magnetic resonance imaging (MRI) being the primary\nmethod for diagnosis. Despite the clinical value of computer-aided differential\ndiagnosis, research has been limited, mainly due to the absence of public\ndatasets that contain diverse types of dementia. This leaves researchers with\nsmall in-house datasets that are insufficient for training deep neural networks\n(DNNs). Self-supervised learning shows promise for utilizing unlabeled MRI\nscans in training, but small batch sizes for volumetric brain scans make its\napplication challenging. To address these issues, we propose Triplet Training\nfor differential diagnosis with limited target data. It consists of three key\nstages: (i) self-supervised pre-training on unlabeled data with Barlow Twins,\n(ii) self-distillation on task-related data, and (iii) fine-tuning on the\ntarget dataset. Our approach significantly outperforms traditional training\nstrategies, achieving a balanced accuracy of 75.6%. 
We further provide insights\ninto the training process by visualizing changes in the latent space after each\nstep. Finally, we validate the robustness of Triplet Training in terms of its\nindividual components in a comprehensive ablation study. Our code is available\nat https://github.com/ai-med/TripletTraining.\n","authors":["Yitong Li","Tom Nuno Wolf","Sebastian Pölsterl","Igor Yakushev","Dennis M. Hedderich","Christian Wachinger"],"pdf_url":"https://arxiv.org/pdf/2404.06253v1.pdf","comment":"Accepted for presentation at MIDL 2024"},{"id":"http://arxiv.org/abs/2404.06251v1","updated":"2024-04-09T12:23:30Z","published":"2024-04-09T12:23:30Z","title":"ColorMNet: A Memory-based Deep Spatial-Temporal Feature Propagation\n Network for Video Colorization","summary":" How to effectively explore spatial-temporal features is important for video\ncolorization. Instead of stacking multiple frames along the temporal dimension\nor recurrently propagating estimated features that will accumulate errors or\ncannot explore information from far-apart frames, we develop a memory-based\nfeature propagation module that can establish reliable connections with\nfeatures from far-apart frames and alleviate the influence of inaccurately\nestimated features. To extract better features from each frame for the\nabove-mentioned feature propagation, we explore the features from\nlarge-pretrained visual models to guide the feature estimation of each frame so\nthat the estimated features can model complex scenarios. In addition, we note\nthat adjacent frames usually contain similar contents. To explore this property\nfor better spatial and temporal feature utilization, we develop a local\nattention module to aggregate the features from adjacent frames in a\nspatial-temporal neighborhood. We formulate our memory-based feature\npropagation module, large-pretrained visual model guided feature estimation\nmodule, and local attention module into an end-to-end trainable network (named\nColorMNet) and show that it performs favorably against state-of-the-art methods\non both the benchmark datasets and real-world scenarios. The source code and\npre-trained models will be available at\n\\url{https://github.com/yyang181/colormnet}.\n","authors":["Yixin Yang","Jiangxin Dong","Jinhui Tang","Jinshan Pan"],"pdf_url":"https://arxiv.org/pdf/2404.06251v1.pdf","comment":"Project website: \\url{https://github.com/yyang181/colormnet}"},{"id":"http://arxiv.org/abs/2404.06247v1","updated":"2024-04-09T12:13:40Z","published":"2024-04-09T12:13:40Z","title":"LRR: Language-Driven Resamplable Continuous Representation against\n Adversarial Tracking Attacks","summary":" Visual object tracking plays a critical role in visual-based autonomous\nsystems, as it aims to estimate the position and size of the object of interest\nwithin a live video. Despite significant progress made in this field,\nstate-of-the-art (SOTA) trackers often fail when faced with adversarial\nperturbations in the incoming frames. This can lead to significant robustness\nand security issues when these trackers are deployed in the real world. To\nachieve high accuracy on both clean and adversarial data, we propose building a\nspatial-temporal continuous representation using the semantic text guidance of\nthe object of interest. This novel continuous representation enables us to\nreconstruct incoming frames to maintain semantic and appearance consistency\nwith the object of interest and its clean counterparts. 
As a result, our\nproposed method successfully defends against different SOTA adversarial\ntracking attacks while maintaining high accuracy on clean data. In particular,\nour method significantly increases tracking accuracy under adversarial attacks\nwith around 90% relative improvement on UAV123, which is even higher than the\naccuracy on clean data.\n","authors":["Jianlang Chen","Xuhong Ren","Qing Guo","Felix Juefei-Xu","Di Lin","Wei Feng","Lei Ma","Jianjun Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.06247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06246v1","updated":"2024-04-09T12:11:25Z","published":"2024-04-09T12:11:25Z","title":"GHNeRF: Learning Generalizable Human Features with Efficient Neural\n Radiance Fields","summary":" Recent advances in Neural Radiance Fields (NeRF) have demonstrated promising\nresults in 3D scene representations, including 3D human representations.\nHowever, these representations often lack crucial information on the underlying\nhuman pose and structure, which is crucial for AR/VR applications and games. In\nthis paper, we introduce a novel approach, termed GHNeRF, designed to address\nthese limitations by learning 2D/3D joint locations of human subjects with NeRF\nrepresentation. GHNeRF uses a pre-trained 2D encoder streamlined to extract\nessential human features from 2D images, which are then incorporated into the\nNeRF framework in order to encode human biomechanic features. This allows our\nnetwork to simultaneously learn biomechanic features, such as joint locations,\nalong with human geometry and texture. To assess the effectiveness of our\nmethod, we conduct a comprehensive comparison with state-of-the-art human NeRF\ntechniques and joint estimation algorithms. Our results show that GHNeRF can\nachieve state-of-the-art results in near real-time.\n","authors":["Arnab Dey","Di Yang","Rohith Agaram","Antitza Dantcheva","Andrew I. Comport","Srinath Sridhar","Jean Martinet"],"pdf_url":"https://arxiv.org/pdf/2404.06246v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06244v1","updated":"2024-04-09T12:10:54Z","published":"2024-04-09T12:10:54Z","title":"Anchor-based Robust Finetuning of Vision-Language Models","summary":" We aim at finetuning a vision-language model without hurting its\nout-of-distribution (OOD) generalization. We address two types of OOD\ngeneralization, i.e., i) domain shift such as natural to sketch images, and ii)\nzero-shot capability to recognize the category that was not contained in the\nfinetune data. Arguably, the diminished OOD generalization after finetuning\nstems from the excessively simplified finetuning target, which only provides\nthe class information, such as ``a photo of a [CLASS]''. This is distinct from\nthe process in that CLIP was pretrained, where there is abundant text\nsupervision with rich semantic information. Therefore, we propose to compensate\nfor the finetune process using auxiliary supervision with rich semantic\ninformation, which acts as anchors to preserve the OOD generalization.\nSpecifically, two types of anchors are elaborated in our method, including i)\ntext-compensated anchor which uses the images from the finetune set but\nenriches the text supervision from a pretrained captioner, ii) image-text-pair\nanchor which is retrieved from the dataset similar to pretraining data of CLIP\naccording to the downstream task, associating with the original CLIP text with\nrich semantics. 
Those anchors are utilized as auxiliary semantic information to\nmaintain the original feature space of CLIP, thereby preserving the OOD\ngeneralization capabilities. Comprehensive experiments demonstrate that our\nmethod achieves in-distribution performance akin to conventional finetuning\nwhile attaining new state-of-the-art results on domain shift and zero-shot\nlearning benchmarks.\n","authors":["Jinwei Han","Zhiwen Lin","Zhongyisun Sun","Yingguo Gao","Ke Yan","Shouhong Ding","Yuan Gao","Gui-Song Xia"],"pdf_url":"https://arxiv.org/pdf/2404.06244v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.06243v1","updated":"2024-04-09T12:09:56Z","published":"2024-04-09T12:09:56Z","title":"ActNetFormer: Transformer-ResNet Hybrid Method for Semi-Supervised\n Action Recognition in Videos","summary":" Human action or activity recognition in videos is a fundamental task in\ncomputer vision with applications in surveillance and monitoring, self-driving\ncars, sports analytics, human-robot interaction and many more. Traditional\nsupervised methods require large annotated datasets for training, which are\nexpensive and time-consuming to acquire. This work proposes a novel approach\nusing Cross-Architecture Pseudo-Labeling with contrastive learning for\nsemi-supervised action recognition. Our framework leverages both labeled and\nunlabelled data to robustly learn action representations in videos, combining\npseudo-labeling with contrastive learning for effective learning from both\ntypes of samples. We introduce a novel cross-architecture approach where 3D\nConvolutional Neural Networks (3D CNNs) and video transformers (VIT) are\nutilised to capture different aspects of action representations; hence we call\nit ActNetFormer. The 3D CNNs excel at capturing spatial features and local\ndependencies in the temporal domain, while VIT excels at capturing long-range\ndependencies across frames. By integrating these complementary architectures\nwithin the ActNetFormer framework, our approach can effectively capture both\nlocal and global contextual information of an action. This comprehensive\nrepresentation learning enables the model to achieve better performance in\nsemi-supervised action recognition tasks by leveraging the strengths of each of\nthese architectures. Experimental results on standard action recognition\ndatasets demonstrate that our approach performs better than the existing\nmethods, achieving state-of-the-art performance with only a fraction of labeled\ndata. The official website of this work is available at:\nhttps://github.com/rana2149/ActNetFormer.\n","authors":["Sharana Dharshikgan Suresh Dass","Hrishav Bakul Barua","Ganesh Krishnasamy","Raveendran Paramesran","Raphael C. -W. Phan"],"pdf_url":"https://arxiv.org/pdf/2404.06243v1.pdf","comment":"Submitted for peer review"},{"id":"http://arxiv.org/abs/2404.06240v1","updated":"2024-04-09T12:06:21Z","published":"2024-04-09T12:06:21Z","title":"Hyperparameter-Free Medical Image Synthesis for Sharing Data and\n Improving Site-Specific Segmentation","summary":" Sharing synthetic medical images is a promising alternative to sharing real\nimages that can improve patient privacy and data security. To get good results,\nexisting methods for medical image synthesis must be manually adjusted when\nthey are applied to unseen data. To remove this manual burden, we introduce a\nHyperparameter-Free distributed learning method for automatic medical image\nSynthesis, Sharing, and Segmentation called HyFree-S3. 
For three diverse\nsegmentation settings (pelvic MRIs, lung X-rays, polyp photos), the use of\nHyFree-S3 results in improved performance over training only with site-specific\ndata (in the majority of cases). The hyperparameter-free nature of the method\nshould make data synthesis and sharing easier, potentially leading to an\nincrease in the quantity of available data and consequently the quality of the\nmodels trained that may ultimately be applied in the clinic. Our code is\navailable at https://github.com/AwesomeLemon/HyFree-S3\n","authors":["Alexander Chebykin","Peter A. N. Bosman","Tanja Alderliesten"],"pdf_url":"https://arxiv.org/pdf/2404.06240v1.pdf","comment":"Accepted at MIDL 2024"},{"id":"http://arxiv.org/abs/2311.18649v3","updated":"2024-04-09T11:55:20Z","published":"2023-11-30T15:57:34Z","title":"Simple Semantic-Aided Few-Shot Learning","summary":" Learning from a limited amount of data, namely Few-Shot Learning, stands out\nas a challenging computer vision task. Several works exploit semantics and\ndesign complicated semantic fusion mechanisms to compensate for rare\nrepresentative features within restricted data. However, relying on naive\nsemantics such as class names introduces biases due to their brevity, while\nacquiring extensive semantics from external knowledge takes a huge time and\neffort. This limitation severely constrains the potential of semantics in\nFew-Shot Learning. In this paper, we design an automatic way called Semantic\nEvolution to generate high-quality semantics. The incorporation of high-quality\nsemantics alleviates the need for complex network structures and learning\nalgorithms used in previous works. Hence, we employ a simple two-layer network\ntermed Semantic Alignment Network to transform semantics and visual features\ninto robust class prototypes with rich discriminative features for few-shot\nclassification. The experimental results show our framework outperforms all\nprevious methods on six benchmarks, demonstrating a simple network with\nhigh-quality semantics can beat intricate multi-modal modules on few-shot\nclassification tasks. Code is available at\nhttps://github.com/zhangdoudou123/SemFew.\n","authors":["Hai Zhang","Junzhe Xu","Shanlin Jiang","Zhenan He"],"pdf_url":"https://arxiv.org/pdf/2311.18649v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2307.10974v3","updated":"2024-04-09T11:23:10Z","published":"2023-07-20T16:00:19Z","title":"Deep Multi-Threshold Spiking-UNet for Image Processing","summary":" U-Net, known for its simple yet efficient architecture, is widely utilized\nfor image processing tasks and is particularly suitable for deployment on\nneuromorphic chips. This paper introduces the novel concept of Spiking-UNet for\nimage processing, which combines the power of Spiking Neural Networks (SNNs)\nwith the U-Net architecture. To achieve an efficient Spiking-UNet, we face two\nprimary challenges: ensuring high-fidelity information propagation through the\nnetwork via spikes and formulating an effective training strategy. To address\nthe issue of information loss, we introduce multi-threshold spiking neurons,\nwhich improve the efficiency of information transmission within the\nSpiking-UNet. For the training strategy, we adopt a conversion and fine-tuning\npipeline that leverage pre-trained U-Net models. During the conversion process,\nsignificant variability in data distribution across different parts is observed\nwhen utilizing skip connections. 
Therefore, we propose a connection-wise\nnormalization method to prevent inaccurate firing rates. Furthermore, we adopt\na flow-based training method to fine-tune the converted models, reducing time\nsteps while preserving performance. Experimental results show that, on image\nsegmentation and denoising, our Spiking-UNet achieves comparable performance to\nits non-spiking counterpart, surpassing existing SNN methods. Compared with the\nconverted Spiking-UNet without fine-tuning, our Spiking-UNet reduces inference\ntime by approximately 90\\%. This research broadens the application scope of\nSNNs in image processing and is expected to inspire further exploration in the\nfield of neuromorphic engineering. The code for our Spiking-UNet implementation\nis available at https://github.com/SNNresearch/Spiking-UNet.\n","authors":["Hebei Li","Yueyi Zhang","Zhiwei Xiong","Zheng-jun Zha","Xiaoyan Sun"],"pdf_url":"https://arxiv.org/pdf/2307.10974v3.pdf","comment":"Accepted in NeuroComputing"},{"id":"http://arxiv.org/abs/2404.06219v1","updated":"2024-04-09T11:13:36Z","published":"2024-04-09T11:13:36Z","title":"Automatic Defect Detection in Sewer Network Using Deep Learning Based\n Object Detector","summary":" Maintaining sewer systems in large cities is important, but also time- and\neffort-consuming, because visual inspections are currently done manually. To\nreduce the amount of aforementioned manual work, defects within sewer pipes\nshould be located and classified automatically. In the past, multiple works\nhave attempted solving this problem using classical image processing, machine\nlearning, or a combination of those. However, each provided solution only focuses\non detecting a limited set of defect/structure types, such as fissure, root,\nand/or connection. Furthermore, due to the use of hand-crafted features and\nsmall training datasets, generalization is also problematic. In order to\novercome these deficits, a sizable dataset with 14.7 km of various sewer pipes\nwas annotated by sewer maintenance experts in the scope of this work. On top\nof that, an object detector (EfficientDet-D0) was trained for automatic defect\ndetection. From the results of several experiments, peculiar properties of defects\nin the context of object detection, which greatly affect the annotation and\ntraining process, are found and discussed. In the end, the final detector was\nable to detect 83% of defects in the test set; out of the missing 17%, only\n0.77% are very severe defects. This work provides an example of applying deep\nlearning-based object detection to an important but quiet engineering field.\nIt also gives some practical pointers on how to annotate peculiar \"objects\",\nsuch as defects.\n","authors":["Bach Ha","Birgit Schalter","Laura White","Joachim Koehler"],"pdf_url":"https://arxiv.org/pdf/2404.06219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06212v1","updated":"2024-04-09T11:00:19Z","published":"2024-04-09T11:00:19Z","title":"OmniFusion Technical Report","summary":" Last year, multimodal architectures served up a revolution in AI-based\napproaches and solutions, extending the capabilities of large language models\n(LLMs). We propose an \\textit{OmniFusion} model based on a pretrained LLM and\nadapters for the visual modality. 
We evaluated and compared several architecture\ndesign principles for better text and visual data coupling: MLP and transformer\nadapters, various CLIP ViT-based encoders (SigLIP, InternVIT, etc.), and their\nfusing approach, image encoding method (whole image or tiles encoding) and two\n7B LLMs (the proprietary one and open-source Mistral). Experiments on 8\nvisual-language benchmarks show the top score for the best OmniFusion setup in\nterms of different VQA tasks in comparison with open-source LLaVA-like\nsolutions: VizWiz, Pope, MM-Vet, ScienceQA, MMBench, TextVQA, VQAv2, MMMU. We\nalso propose a variety of situations, where OmniFusion provides highly-detailed\nanswers in different domains: housekeeping, sightseeing, culture, medicine,\nhandwritten and scanned equations recognition, etc. Mistral-based OmniFusion\nmodel is an open-source solution with weights, training and inference scripts\navailable at https://github.com/AIRI-Institute/OmniFusion.\n","authors":["Elizaveta Goncharova","Anton Razzhigaev","Matvey Mikhalchuk","Maxim Kurkin","Irina Abdullaeva","Matvey Skripkin","Ivan Oseledets","Denis Dimitrov","Andrey Kuznetsov"],"pdf_url":"https://arxiv.org/pdf/2404.06212v1.pdf","comment":"17 pages, 4 figures, 9 tables, 2 appendices"},{"id":"http://arxiv.org/abs/2404.06211v1","updated":"2024-04-09T11:00:11Z","published":"2024-04-09T11:00:11Z","title":"Unified Physical-Digital Attack Detection Challenge","summary":" Face Anti-Spoofing (FAS) is crucial to safeguard Face Recognition (FR)\nSystems. In real-world scenarios, FRs are confronted with both physical and\ndigital attacks. However, existing algorithms often address only one type of\nattack at a time, which poses significant limitations in real-world scenarios\nwhere FR systems face hybrid physical-digital threats. To facilitate the\nresearch of Unified Attack Detection (UAD) algorithms, a large-scale\nUniAttackData dataset has been collected. UniAttackData is the largest public\ndataset for Unified Attack Detection, with a total of 28,706 videos, where each\nunique identity encompasses all advanced attack types. Based on this dataset,\nwe organized a Unified Physical-Digital Face Attack Detection Challenge to\nboost the research in Unified Attack Detections. It attracted 136 teams for the\ndevelopment phase, with 13 qualifying for the final round. The results\nre-verified by the organizing team were used for the final ranking. This paper\ncomprehensively reviews the challenge, detailing the dataset introduction,\nprotocol definition, evaluation criteria, and a summary of published results.\nFinally, we focus on the detailed analysis of the highest-performing algorithms\nand offer potential directions for unified physical-digital attack detection\ninspired by this competition. Challenge Website:\nhttps://sites.google.com/view/face-anti-spoofing-challenge/welcome/challengecvpr2024.\n","authors":["Haocheng Yuan","Ajian Liu","Junze Zheng","Jun Wan","Jiankang Deng","Sergio Escalera","Hugo Jair Escalante","Isabelle Guyon","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2404.06211v1.pdf","comment":"11 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.06207v1","updated":"2024-04-09T10:56:46Z","published":"2024-04-09T10:56:46Z","title":"Leveraging edge detection and neural networks for better UAV\n localization","summary":" We propose a novel method for geolocalizing Unmanned Aerial Vehicles (UAVs)\nin environments lacking Global Navigation Satellite Systems (GNSS). 
Current\nstate-of-the-art techniques employ an offline-trained encoder to generate a\nvector representation (embedding) of the UAV's current view, which is then\ncompared with pre-computed embeddings of geo-referenced images to determine the\nUAV's position. Here, we demonstrate that the performance of these methods can\nbe significantly enhanced by preprocessing the images to extract their edges,\nwhich exhibit robustness to seasonal and illumination variations. Furthermore,\nwe establish that utilizing edges enhances resilience to orientation and\naltitude inaccuracies. Additionally, we introduce a confidence criterion for\nlocalization. Our findings are substantiated through synthetic experiments.\n","authors":["Theo Di Piazza","Enric Meinhardt-Llopis","Gabriele Facciolo","Benedicte Bascle","Corentin Abgrall","Jean-Clement Devaux"],"pdf_url":"https://arxiv.org/pdf/2404.06207v1.pdf","comment":"Accepted for publication in IGARSS2024. 4 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.06202v1","updated":"2024-04-09T10:47:43Z","published":"2024-04-09T10:47:43Z","title":"Automated National Urban Map Extraction","summary":" Developing countries usually lack the proper governance means to generate and\nregularly update a national rooftop map. Using traditional photogrammetry and\nsurveying methods to produce a building map at the federal level is costly and\ntime consuming. Using earth observation and deep learning methods, we can\nbridge this gap and propose an automated pipeline to fetch such national urban\nmaps. This paper aims to exploit the power of fully convolutional neural\nnetworks for multi-class buildings' instance segmentation to leverage high\nobject-wise accuracy results. Buildings' instance segmentation from sub-meter\nhigh-resolution satellite images can be achieved with relatively high\npixel-wise metric scores. We detail all engineering steps to replicate this\nwork and ensure highly accurate results in dense and slum areas witnessed in\nregions that lack proper urban planning in the Global South. We applied a case\nstudy of the proposed pipeline to Lebanon and successfully produced the first\ncomprehensive national building footprint map with approximately 1 Million\nunits with an 84% accuracy. The proposed architecture relies on advanced\naugmentation techniques to overcome dataset scarcity, which is often the case\nin developing countries.\n","authors":["Hasan Nasrallah","Abed Ellatif Samhat","Cristiano Nattero","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2404.06202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06194v1","updated":"2024-04-09T10:27:22Z","published":"2024-04-09T10:27:22Z","title":"Exploring the Potential of Large Foundation Models for Open-Vocabulary\n HOI Detection","summary":" Open-vocabulary human-object interaction (HOI) detection, which is concerned\nwith the problem of detecting novel HOIs guided by natural language, is crucial\nfor understanding human-centric scenes. However, prior zero-shot HOI detectors\noften employ the same levels of feature maps to model HOIs with varying\ndistances, leading to suboptimal performance in scenes containing human-object\npairs with a wide range of distances. In addition, these detectors primarily\nrely on category names and overlook the rich contextual information that\nlanguage can provide, which is essential for capturing open vocabulary concepts\nthat are typically rare and not well-represented by category names alone. 
In\nthis paper, we introduce a novel end-to-end open vocabulary HOI detection\nframework with conditional multi-level decoding and fine-grained semantic\nenhancement (CMD-SE), harnessing the potential of Visual-Language Models\n(VLMs). Specifically, we propose to model human-object pairs with different\ndistances with different levels of feature maps by incorporating a soft\nconstraint during the bipartite matching process. Furthermore, by leveraging\nlarge language models (LLMs) such as GPT models, we exploit their extensive\nworld knowledge to generate descriptions of human body part states for various\ninteractions. Then we integrate the generalizable and fine-grained semantics of\nhuman body parts to improve interaction recognition. Experimental results on\ntwo datasets, SWIG-HOI and HICO-DET, demonstrate that our proposed method\nachieves state-of-the-art results in open vocabulary HOI detection. The code\nand models are available at https://github.com/ltttpku/CMD-SE-release.\n","authors":["Ting Lei","Shaofeng Yin","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06181v1","updated":"2024-04-09T10:04:06Z","published":"2024-04-09T10:04:06Z","title":"EPL: Evidential Prototype Learning for Semi-supervised Medical Image\n Segmentation","summary":" Although current semi-supervised medical segmentation methods can achieve\ndecent performance, they are still affected by the uncertainty in unlabeled\ndata and model predictions, and there is currently a lack of effective\nstrategies that can explore the uncertain aspects of both simultaneously. To\naddress the aforementioned issues, we propose Evidential Prototype Learning\n(EPL), which utilizes an extended probabilistic framework to effectively fuse\nvoxel probability predictions from different sources and achieves prototype\nfusion utilization of labeled and unlabeled data under a generalized evidential\nframework, leveraging voxel-level dual uncertainty masking. The uncertainty not\nonly enables the model to self-correct predictions but also improves the guided\nlearning process with pseudo-labels and is able to feed back into the\nconstruction of hidden features. The method proposed in this paper has been\nexperimented on LA, Pancreas-CT and TBAD datasets, achieving the\nstate-of-the-art performance in three different labeled ratios, which strongly\ndemonstrates the effectiveness of our strategy.\n","authors":["Yuanpeng He"],"pdf_url":"https://arxiv.org/pdf/2404.06181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06180v1","updated":"2024-04-09T10:03:44Z","published":"2024-04-09T10:03:44Z","title":"YOLC: You Only Look Clusters for Tiny Object Detection in Aerial Images","summary":" Detecting objects from aerial images poses significant challenges due to the\nfollowing factors: 1) Aerial images typically have very large sizes, generally\nwith millions or even hundreds of millions of pixels, while computational\nresources are limited. 2) Small object size leads to insufficient information\nfor effective detection. 3) Non-uniform object distribution leads to\ncomputational resource wastage. To address these issues, we propose YOLC (You\nOnly Look Clusters), an efficient and effective framework that builds on an\nanchor-free object detector, CenterNet. To overcome the challenges posed by\nlarge-scale images and non-uniform object distribution, we introduce a Local\nScale Module (LSM) that adaptively searches cluster regions for zooming in for\naccurate detection. 
Additionally, we modify the regression loss using Gaussian\nWasserstein distance (GWD) to obtain high-quality bounding boxes. Deformable\nconvolution and refinement methods are employed in the detection head to\nenhance the detection of small objects. We perform extensive experiments on two\naerial image datasets, including Visdrone2019 and UAVDT, to demonstrate the\neffectiveness and superiority of our proposed approach.\n","authors":["Chenguang Liu","Guangshuai Gao","Ziyue Huang","Zhenghui Hu","Qingjie Liu","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06180v1.pdf","comment":"accepted to TITS"},{"id":"http://arxiv.org/abs/2404.06177v1","updated":"2024-04-09T09:58:10Z","published":"2024-04-09T09:58:10Z","title":"Uncertainty-aware Evidential Fusion-based Learning for Semi-supervised\n Medical Image Segmentation","summary":" Although the existing uncertainty-based semi-supervised medical segmentation\nmethods have achieved excellent performance, they usually only consider a\nsingle uncertainty evaluation, which often fails to solve the problem related\nto credibility completely. Therefore, based on the framework of evidential deep\nlearning, this paper integrates the evidential predictive results in the\ncross-region of mixed and original samples to reallocate the confidence degree\nand uncertainty measure of each voxel, which is realized by emphasizing\nuncertain information of probability assignments fusion rule of traditional\nevidence theory. Furthermore, we design a voxel-level asymptotic learning\nstrategy by introducing information entropy to combine with the fused\nuncertainty measure to estimate voxel prediction more precisely. The model will\ngradually pay attention to the prediction results with high uncertainty in the\nlearning process, to learn the features that are difficult to master. The\nexperimental results on LA, Pancreas-CT, ACDC and TBAD datasets demonstrate the\nsuperior performance of our proposed method in comparison with the existing\nstate of the arts.\n","authors":["Yuanpeng He","Lijian Li"],"pdf_url":"https://arxiv.org/pdf/2404.06177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06173v1","updated":"2024-04-09T09:54:21Z","published":"2024-04-09T09:54:21Z","title":"Improving Interpretable Embeddings for Ad-hoc Video Search with\n Generative Captions and Multi-word Concept Bank","summary":" Aligning a user query and video clips in cross-modal latent space and that\nwith semantic concepts are two mainstream approaches for ad-hoc video search\n(AVS). However, the effectiveness of existing approaches is bottlenecked by the\nsmall sizes of available video-text datasets and the low quality of concept\nbanks, which results in the failures of unseen queries and the\nout-of-vocabulary problem. This paper addresses these two problems by\nconstructing a new dataset and developing a multi-word concept bank.\nSpecifically, capitalizing on a generative model, we construct a new dataset\nconsisting of 7 million generated text and video pairs for pre-training. To\ntackle the out-of-vocabulary problem, we develop a multi-word concept bank\nbased on syntax analysis to enhance the capability of a state-of-the-art\ninterpretable AVS method in modeling relationships between query words. We also\nstudy the impact of current advanced features on the method. 
Experimental\nresults show that the integration of the above-proposed elements doubles the\nR@1 performance of the AVS method on the MSRVTT dataset and improves the xinfAP\non the TRECVid AVS query sets for 2016-2023 (eight years) by a margin from 2%\nto 77%, with an average of about 20%.\n","authors":["Jiaxin Wu","Chong-Wah Ngo","Wing-Kwong Chan"],"pdf_url":"https://arxiv.org/pdf/2404.06173v1.pdf","comment":"Accepted in ICMR2024"},{"id":"http://arxiv.org/abs/2403.10376v2","updated":"2024-04-09T09:52:54Z","published":"2024-03-15T15:05:29Z","title":"PASTA: Towards Flexible and Efficient HDR Imaging Via Progressively\n Aggregated Spatio-Temporal Alignment","summary":" Leveraging Transformer attention has led to great advancements in HDR\ndeghosting. However, the intricate nature of self-attention introduces\npractical challenges, as existing state-of-the-art methods often demand\nhigh-end GPUs or exhibit slow inference speeds, especially for high-resolution\nimages like 2K. Striking an optimal balance between performance and latency\nremains a critical concern. In response, this work presents PASTA, a novel\nProgressively Aggregated Spatio-Temporal Alignment framework for HDR\ndeghosting. Our approach achieves effectiveness and efficiency by harnessing\nhierarchical representation during feature disentanglement. Through the\nutilization of diverse granularities within the hierarchical structure, our\nmethod substantially boosts computational speed and optimizes the HDR imaging\nworkflow. In addition, we explore within-scale feature modeling with local and\nglobal attention, gradually merging and refining them in a coarse-to-fine\nfashion. Experimental results showcase PASTA's superiority over current SOTA\nmethods in both visual quality and performance metrics, accompanied by a\nsubstantial 3-fold (x3) increase in inference speed.\n","authors":["Xiaoning Liu","Ao Li","Zongwei Wu","Yapeng Du","Le Zhang","Yulun Zhang","Radu Timofte","Ce Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.10376v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05393v2","updated":"2024-04-09T09:52:32Z","published":"2024-04-08T10:52:29Z","title":"PAT: Pixel-wise Adaptive Training for Long-tailed Segmentation","summary":" Beyond class frequency, we recognize the impact of class-wise relationships\namong various class-specific predictions and the imbalance in label masks on\nlong-tailed segmentation learning. To address these challenges, we propose an\ninnovative Pixel-wise Adaptive Training (PAT) technique tailored for\nlong-tailed segmentation. PAT has two key features: 1) class-wise gradient\nmagnitude homogenization, and 2) pixel-wise class-specific loss adaptation\n(PCLA). First, the class-wise gradient magnitude homogenization helps alleviate\nthe imbalance among label masks by ensuring equal consideration of the\nclass-wise impact on model updates. Second, PCLA tackles the detrimental impact\nof both rare classes within the long-tailed distribution and inaccurate\npredictions from previous training stages by encouraging learning classes with\nlow prediction confidence and guarding against forgetting classes with high\nconfidence. This combined approach fosters robust learning while preventing the\nmodel from forgetting previously learned knowledge. PAT exhibits significant\nperformance improvements, surpassing the current state-of-the-art by 2.2% on\nthe NYU dataset. 
Moreover, it enhances overall pixel-wise accuracy by 2.85% and\nintersection over union value by 2.07%, with a particularly notable decline\nof 0.39% in detecting rare classes compared to Balance Logits Variation, as\ndemonstrated on the three popular datasets, i.e., OxfordPetIII, CityScape, and\nNYU.\n","authors":["Khoi Do","Duong Nguyen","Nguyen H. Tran","Viet Dung Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05393v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06165v1","updated":"2024-04-09T09:42:18Z","published":"2024-04-09T09:42:18Z","title":"Enhanced Radar Perception via Multi-Task Learning: Towards Refined Data\n for Sensor Fusion Applications","summary":" Radar and camera fusion yields robustness in perception tasks by leveraging\nthe strengths of both sensors. The typical extracted radar point cloud is 2D\nwithout height information due to insufficient antennas along the elevation\naxis, which challenges the network performance. This work introduces a\nlearning-based approach to infer the height of radar points associated with 3D\nobjects. A novel robust regression loss is introduced to address the sparse\ntarget challenge. In addition, a multi-task training strategy is employed,\nemphasizing important features. The average radar absolute height error\ndecreases from 1.69 to 0.25 meters compared to the state-of-the-art height\nextension method. The estimated target height values are used to preprocess and\nenrich radar data for downstream perception tasks. Integrating this refined\nradar information further enhances the performance of existing radar camera\nfusion models for object detection and depth estimation tasks.\n","authors":["Huawei Sun","Hao Feng","Gianfranco Mauro","Julius Ott","Georg Stettinger","Lorenzo Servadei","Robert Wille"],"pdf_url":"https://arxiv.org/pdf/2404.06165v1.pdf","comment":"Accepted by IEEE Intelligent Vehicles Symposium (IV 2024)"},{"id":"http://arxiv.org/abs/2404.06155v1","updated":"2024-04-09T09:28:05Z","published":"2024-04-09T09:28:05Z","title":"Efficient and Robust Point Cloud Registration via Heuristics-guided\n Parameter Search","summary":" Estimating the rigid transformation with 6 degrees of freedom based on a\nputative 3D correspondence set is a crucial procedure in point cloud\nregistration. Existing correspondence identification methods usually lead to\nlarge outlier ratios ($>$ 95 $\\%$ is common), underscoring the significance of\nrobust registration methods. Many researchers turn to parameter search-based\nstrategies (e.g., Branch-and-Bound) for robust registration. Although related\nmethods show high robustness, their efficiency is limited by the\nhigh-dimensional search space. This paper proposes a heuristics-guided\nparameter search strategy to accelerate the search while maintaining high\nrobustness. We first sample some correspondences (i.e., heuristics) and then\njust need to sequentially search the feasible regions that make each sample an\ninlier. Our strategy largely reduces the search space and can guarantee\naccuracy with only a few inlier samples, therefore enjoying an excellent\ntrade-off between efficiency and robustness. Since directly parameterizing the\n6-dimensional nonlinear feasible region for efficient search is intractable, we\nconstruct a three-stage decomposition pipeline to reparameterize the feasible\nregion, resulting in three lower-dimensional sub-problems that are easily\nsolvable via our strategy. 
Besides reducing the searching dimension, our\ndecomposition enables the leverage of 1-dimensional interval stabbing at all\nthree stages for searching acceleration. Moreover, we propose a valid sampling\nstrategy to guarantee our sampling effectiveness, and a compatibility\nverification setup to further accelerate our search. Extensive experiments on\nboth simulated and real-world datasets demonstrate that our approach exhibits\ncomparable robustness with state-of-the-art methods while achieving a\nsignificant efficiency boost.\n","authors":["Tianyu Huang","Haoang Li","Liangzu Peng","Yinlong Liu","Yun-Hui Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06155v1.pdf","comment":"21 pages, 16 figures. Accepted to IEEE Transactions on Pattern\n Analysis and Machine Intelligence, 2024"},{"id":"http://arxiv.org/abs/2404.06154v1","updated":"2024-04-09T09:27:54Z","published":"2024-04-09T09:27:54Z","title":"Concise Plane Arrangements for Low-Poly Surface and Volume Modelling","summary":" Plane arrangements are a useful tool for surface and volume modelling.\nHowever, their main drawback is poor scalability. We introduce two key\nnovelties that enable the construction of plane arrangements for complex\nobjects and entire scenes: an ordering scheme for the plane insertion and the\ndirect use of input points during arrangement construction. Both ingredients\nreduce the number of unwanted splits, resulting in improved scalability of the\nconstruction mechanism by up to two orders of magnitude compared to existing\nalgorithms. We further introduce a remeshing and simplification technique that\nallows us to extract low-polygon surface meshes and lightweight convex\ndecompositions of volumes from the arrangement. We show that our approach leads\nto state-of-the-art results for the aforementioned tasks by comparing it to\nlearning-based and traditional approaches on various different datasets. Our\nimplementation is available at https://github.com/raphaelsulzer/compod .\n","authors":["Raphael Sulzer","Florent Lafarge"],"pdf_url":"https://arxiv.org/pdf/2404.06154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06152v1","updated":"2024-04-09T09:23:04Z","published":"2024-04-09T09:23:04Z","title":"HFNeRF: Learning Human Biomechanic Features with Neural Radiance Fields","summary":" In recent advancements in novel view synthesis, generalizable Neural Radiance\nFields (NeRF) based methods applied to human subjects have shown remarkable\nresults in generating novel views from few images. However, this generalization\nability cannot capture the underlying structural features of the skeleton\nshared across all instances. Building upon this, we introduce HFNeRF: a novel\ngeneralizable human feature NeRF aimed at generating human biomechanic features\nusing a pre-trained image encoder. While previous human NeRF methods have shown\npromising results in the generation of photorealistic virtual avatars, such\nmethods lack underlying human structure or biomechanic features such as\nskeleton or joint information that are crucial for downstream applications\nincluding Augmented Reality (AR)/Virtual Reality (VR). HFNeRF leverages 2D\npre-trained foundation models toward learning human features in 3D using neural\nrendering, and then volume rendering towards generating 2D feature maps. We\nevaluate HFNeRF in the skeleton estimation task by predicting heatmaps as\nfeatures. The proposed method is fully differentiable, allowing to successfully\nlearn color, geometry, and human skeleton in a simultaneous manner. 
This paper\npresents preliminary results of HFNeRF, illustrating its potential in\ngenerating realistic virtual avatars with biomechanic features using NeRF.\n","authors":["Arnab Dey","Di Yang","Antitza Dantcheva","Jean Martinet"],"pdf_url":"https://arxiv.org/pdf/2404.06152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10634v2","updated":"2024-04-09T09:18:26Z","published":"2023-12-17T07:33:06Z","title":"Anomaly Score: Evaluating Generative Models and Individual Generated\n Images based on Complexity and Vulnerability","summary":" With the advancement of generative models, the assessment of generated images\nbecomes more and more important. Previous methods measure distances between\nfeatures of reference and generated images from trained vision models. In this\npaper, we conduct an extensive investigation into the relationship between the\nrepresentation space and input space around generated images. We first propose\ntwo measures related to the presence of unnatural elements within images:\ncomplexity, which indicates how non-linear the representation space is, and\nvulnerability, which is related to how easily the extracted feature changes by\nadversarial input changes. Based on these, we introduce a new metric to\nevaluating image-generative models called anomaly score (AS). Moreover, we\npropose AS-i (anomaly score for individual images) that can effectively\nevaluate generated images individually. Experimental results demonstrate the\nvalidity of the proposed approach.\n","authors":["Jaehui Hwang","Junghyuk Lee","Jong-Seok Lee"],"pdf_url":"https://arxiv.org/pdf/2312.10634v2.pdf","comment":"Accepted in CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00915v2","updated":"2024-04-09T09:16:29Z","published":"2024-04-01T04:43:39Z","title":"Scalable 3D Registration via Truncated Entry-wise Absolute Residuals","summary":" Given an input set of $3$D point pairs, the goal of outlier-robust $3$D\nregistration is to compute some rotation and translation that align as many\npoint pairs as possible. This is an important problem in computer vision, for\nwhich many highly accurate approaches have been recently proposed. Despite\ntheir impressive performance, these approaches lack scalability, often\noverflowing the $16$GB of memory of a standard laptop to handle roughly\n$30,000$ point pairs. In this paper, we propose a $3$D registration approach\nthat can process more than ten million ($10^7$) point pairs with over $99\\%$\nrandom outliers. Moreover, our method is efficient, entails low memory costs,\nand maintains high accuracy at the same time. We call our method TEAR, as it\ninvolves minimizing an outlier-robust loss that computes Truncated Entry-wise\nAbsolute Residuals. To minimize this loss, we decompose the original\n$6$-dimensional problem into two subproblems of dimensions $3$ and $2$,\nrespectively, solved in succession to global optimality via a customized\nbranch-and-bound method. While branch-and-bound is often slow and unscalable,\nthis does not apply to TEAR as we propose novel bounding functions that are\ntight and computationally efficient. Experiments on various datasets are\nconducted to validate the scalability and efficiency of our method.\n","authors":["Tianyu Huang","Liangzu Peng","René Vidal","Yun-Hui Liu"],"pdf_url":"https://arxiv.org/pdf/2404.00915v2.pdf","comment":"24 pages, 12 figures. 
Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.08801v2","updated":"2024-04-09T09:13:01Z","published":"2024-02-05T12:33:37Z","title":"CoBra: Complementary Branch Fusing Class and Semantic Knowledge for\n Robust Weakly Supervised Semantic Segmentation","summary":" Leveraging semantically precise pseudo masks derived from image-level class\nknowledge for segmentation, namely image-level Weakly Supervised Semantic\nSegmentation (WSSS), still remains challenging. While Class Activation Maps\n(CAMs) using CNNs have steadily been contributing to the success of WSSS, the\nresulting activation maps often narrowly focus on class-specific parts (e.g.,\nonly face of human). On the other hand, recent works based on vision\ntransformers (ViT) have shown promising results based on their self-attention\nmechanism to capture the semantic parts but fail in capturing complete\nclass-specific details (e.g., entire body parts of human but also with a dog\nnearby). In this work, we propose Complementary Branch (CoBra), a novel dual\nbranch framework consisting of two distinct architectures which provide\nvaluable complementary knowledge of class (from CNN) and semantic (from ViT) to\neach branch. In particular, we learn Class-Aware Projection (CAP) for the CNN\nbranch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly\nfuse their complementary knowledge and facilitate a new type of extra\npatch-level supervision. Our model, through CoBra, fuses CNN and ViT's\ncomplementary outputs to create robust pseudo masks that integrate both class\nand semantic information effectively. Extensive experiments qualitatively and\nquantitatively investigate how CNN and ViT complement each other on the PASCAL\nVOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not\nonly the masks generated by our model, but also the segmentation results\nderived from utilizing these masks as pseudo labels.\n","authors":["Woojung Han","Seil Kang","Kyobin Choo","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.08801v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02813v2","updated":"2024-04-09T09:12:58Z","published":"2023-12-05T14:56:55Z","title":"BIVDiff: A Training-Free Framework for General-Purpose Video Synthesis\n via Bridging Image and Video Diffusion Models","summary":" Diffusion models have made tremendous progress in text-driven image and video\ngeneration. Now text-to-image foundation models are widely applied to various\ndownstream image synthesis tasks, such as controllable image generation and\nimage editing, while downstream video synthesis tasks are less explored for\nseveral reasons. First, it requires huge memory and computation overhead to\ntrain a video generation foundation model. Even with video foundation models,\nadditional costly training is still required for downstream video synthesis\ntasks. Second, although some works extend image diffusion models into videos in\na training-free manner, temporal consistency cannot be well preserved. Finally,\nthese adaption methods are specifically designed for one task and fail to\ngeneralize to different tasks. To mitigate these issues, we propose a\ntraining-free general-purpose video synthesis framework, coined as {\\bf\nBIVDiff}, via bridging specific image diffusion models and general\ntext-to-video foundation diffusion models. 
Specifically, we first use a\nspecific image diffusion model (e.g., ControlNet and Instruct Pix2Pix) for\nframe-wise video generation, then perform Mixed Inversion on the generated\nvideo, and finally input the inverted latents into the video diffusion models\n(e.g., VidRD and ZeroScope) for temporal smoothing. This decoupled framework\nenables flexible image model selection for different purposes with strong task\ngeneralization and high efficiency. To validate the effectiveness and general\nuse of BIVDiff, we perform a wide range of video synthesis tasks, including\ncontrollable video generation, video editing, video inpainting, and\noutpainting.\n","authors":["Fengyuan Shi","Jiaxi Gu","Hang Xu","Songcen Xu","Wei Zhang","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02813v2.pdf","comment":"Accepted by CVPR 2024. Project page: https://bivdiff.github.io;\n GitHub repository: https://github.com/MCG-NJU/BIVDiff"},{"id":"http://arxiv.org/abs/2404.06139v1","updated":"2024-04-09T09:05:23Z","published":"2024-04-09T09:05:23Z","title":"DiffHarmony: Latent Diffusion Model Meets Image Harmonization","summary":" Image harmonization, which involves adjusting the foreground of a composite\nimage to attain a unified visual consistency with the background, can be\nconceptualized as an image-to-image translation task. Diffusion models have\nrecently promoted the rapid development of image-to-image translation tasks .\nHowever, training diffusion models from scratch is computationally intensive.\nFine-tuning pre-trained latent diffusion models entails dealing with the\nreconstruction error induced by the image compression autoencoder, making it\nunsuitable for image generation tasks that involve pixel-level evaluation\nmetrics. To deal with these issues, in this paper, we first adapt a pre-trained\nlatent diffusion model to the image harmonization task to generate the\nharmonious but potentially blurry initial images. Then we implement two\nstrategies: utilizing higher-resolution images during inference and\nincorporating an additional refinement stage, to further enhance the clarity of\nthe initially harmonized images. Extensive experiments on iHarmony4 datasets\ndemonstrate the superiority of our proposed method. The code and model will be\nmade publicly available at https://github.com/nicecv/DiffHarmony .\n","authors":["Pengfei Zhou","Fangxiang Feng","Xiaojie Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06139v1.pdf","comment":"Accepted by ICMR 2024"},{"id":"http://arxiv.org/abs/2404.06135v1","updated":"2024-04-09T09:02:21Z","published":"2024-04-09T09:02:21Z","title":"Mansformer: Efficient Transformer of Mixed Attention for Image\n Deblurring and Beyond","summary":" Transformer has made an enormous success in natural language processing and\nhigh-level vision over the past few years. However, the complexity of\nself-attention is quadratic to the image size, which makes it infeasible for\nhigh-resolution vision tasks. In this paper, we propose the Mansformer, a\nTransformer of mixed attention that combines multiple self-attentions, gate,\nand multi-layer perceptions (MLPs), to explore and employ more possibilities of\nself-attention. Taking efficiency into account, we design four kinds of\nself-attention, whose complexities are all linear. By elaborate adjustment of\nthe tensor shapes and dimensions for the dot product, we split the typical\nself-attention of quadratic complexity into four operations of linear\ncomplexity. 
To adaptively merge these different kinds of self-attention, we\ntake advantage of an architecture similar to Squeeze-and-Excitation Networks.\nFurthermore, we make it to merge the two-staged Transformer design into one\nstage by the proposed gated-dconv MLP. Image deblurring is our main target,\nwhile extensive quantitative and qualitative evaluations show that this method\nperforms favorably against the state-of-the-art methods far more than simply\ndeblurring. The source codes and trained models will be made available to the\npublic.\n","authors":["Pin-Hung Kuo","Jinshan Pan","Shao-Yi Chien","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.06135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06128v1","updated":"2024-04-09T08:51:44Z","published":"2024-04-09T08:51:44Z","title":"Gaussian Pancakes: Geometrically-Regularized 3D Gaussian Splatting for\n Realistic Endoscopic Reconstruction","summary":" Within colorectal cancer diagnostics, conventional colonoscopy techniques\nface critical limitations, including a limited field of view and a lack of\ndepth information, which can impede the detection of precancerous lesions.\nCurrent methods struggle to provide comprehensive and accurate 3D\nreconstructions of the colonic surface which can help minimize the missing\nregions and reinspection for pre-cancerous polyps. Addressing this, we\nintroduce 'Gaussian Pancakes', a method that leverages 3D Gaussian Splatting\n(3D GS) combined with a Recurrent Neural Network-based Simultaneous\nLocalization and Mapping (RNNSLAM) system. By introducing geometric and depth\nregularization into the 3D GS framework, our approach ensures more accurate\nalignment of Gaussians with the colon surface, resulting in smoother 3D\nreconstructions with novel viewing of detailed textures and structures.\nEvaluations across three diverse datasets show that Gaussian Pancakes enhances\nnovel view synthesis quality, surpassing current leading methods with a 18%\nboost in PSNR and a 16% improvement in SSIM. It also delivers over 100X faster\nrendering and more than 10X shorter training times, making it a practical tool\nfor real-time applications. Hence, this holds promise for achieving clinical\ntranslation for better detection and diagnosis of colorectal cancer.\n","authors":["Sierra Bonilla","Shuai Zhang","Dimitrios Psychogyios","Danail Stoyanov","Francisco Vasconcelos","Sophia Bano"],"pdf_url":"https://arxiv.org/pdf/2404.06128v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.06124v1","updated":"2024-04-09T08:49:01Z","published":"2024-04-09T08:49:01Z","title":"Hierarchical Insights: Exploiting Structural Similarities for Reliable\n 3D Semantic Segmentation","summary":" Safety-critical applications like autonomous driving call for robust 3D\nenvironment perception algorithms which can withstand highly diverse and\nambiguous surroundings. The predictive performance of any classification model\nstrongly depends on the underlying dataset and the prior knowledge conveyed by\nthe annotated labels. While the labels provide a basis for the learning\nprocess, they usually fail to represent inherent relations between the classes\n- representations, which are a natural element of the human perception system.\nWe propose a training strategy which enables a 3D LiDAR semantic segmentation\nmodel to learn structural relationships between the different classes through\nabstraction. 
We achieve this by implicitly modeling those relationships through\na learning rule for hierarchical multi-label classification (HMC). With a\ndetailed analysis we show, how this training strategy not only improves the\nmodel's confidence calibration, but also preserves additional information for\ndownstream tasks like fusion, prediction and planning.\n","authors":["Mariella Dreissig","Florian Piewak","Joschka Boedecker"],"pdf_url":"https://arxiv.org/pdf/2404.06124v1.pdf","comment":"submitted to IROS 2024"},{"id":"http://arxiv.org/abs/2404.06119v1","updated":"2024-04-09T08:41:13Z","published":"2024-04-09T08:41:13Z","title":"DreamView: Injecting View-specific Text Guidance into Text-to-3D\n Generation","summary":" Text-to-3D generation, which synthesizes 3D assets according to an overall\ntext description, has significantly progressed. However, a challenge arises\nwhen the specific appearances need customizing at designated viewpoints but\nreferring solely to the overall description for generating 3D objects. For\ninstance, ambiguity easily occurs when producing a T-shirt with distinct\npatterns on its front and back using a single overall text guidance. In this\nwork, we propose DreamView, a text-to-image approach enabling multi-view\ncustomization while maintaining overall consistency by adaptively injecting the\nview-specific and overall text guidance through a collaborative text guidance\ninjection module, which can also be lifted to 3D generation via score\ndistillation sampling. DreamView is trained with large-scale rendered\nmulti-view images and their corresponding view-specific texts to learn to\nbalance the separate content manipulation in each view and the global\nconsistency of the overall object, resulting in a dual achievement of\ncustomization and consistency. Consequently, DreamView empowers artists to\ndesign 3D objects creatively, fostering the creation of more innovative and\ndiverse 3D assets. Code and model will be released at\nhttps://github.com/iSEE-Laboratory/DreamView.\n","authors":["Junkai Yan","Yipeng Gao","Qize Yang","Xihan Wei","Xuansong Xie","Ancong Wu","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.06119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06109v1","updated":"2024-04-09T08:20:37Z","published":"2024-04-09T08:20:37Z","title":"Revising Densification in Gaussian Splatting","summary":" In this paper, we address the limitations of Adaptive Density Control (ADC)\nin 3D Gaussian Splatting (3DGS), a scene representation method achieving\nhigh-quality, photorealistic results for novel view synthesis. ADC has been\nintroduced for automatic 3D point primitive management, controlling\ndensification and pruning, however, with certain limitations in the\ndensification logic. Our main contribution is a more principled, pixel-error\ndriven formulation for density control in 3DGS, leveraging an auxiliary,\nper-pixel error function as the criterion for densification. We further\nintroduce a mechanism to control the total number of primitives generated per\nscene and correct a bias in the current opacity handling strategy of ADC during\ncloning operations. 
Our approach leads to consistent quality improvements\nacross a variety of benchmark scenes, without sacrificing the method's\nefficiency.\n","authors":["Samuel Rota Bulò","Lorenzo Porzi","Peter Kontschieder"],"pdf_url":"https://arxiv.org/pdf/2404.06109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04617v2","updated":"2024-04-09T08:20:08Z","published":"2024-04-06T12:50:08Z","title":"Empowering Image Recovery_ A Multi-Attention Approach","summary":" We propose Diverse Restormer (DART), a novel image restoration method that\neffectively integrates information from various sources (long sequences, local\nand global regions, feature dimensions, and positional dimensions) to address\nrestoration challenges. While Transformer models have demonstrated excellent\nperformance in image restoration due to their self-attention mechanism, they\nface limitations in complex scenarios. Leveraging recent advancements in\nTransformers and various attention mechanisms, our method utilizes customized\nattention mechanisms to enhance overall performance. DART, our novel network\narchitecture, employs windowed attention to mimic the selective focusing\nmechanism of human eyes. By dynamically adjusting receptive fields, it\noptimally captures the fundamental features crucial for image resolution\nreconstruction. Efficiency and performance balance are achieved through the\nLongIR attention mechanism for long sequence image restoration. Integration of\nattention mechanisms across feature and positional dimensions further enhances\nthe recovery of fine details. Evaluation across five restoration tasks\nconsistently positions DART at the forefront. Upon acceptance, we commit to\nproviding publicly accessible code and models to ensure reproducibility and\nfacilitate further research.\n","authors":["Juan Wen","Yawei Li","Chao Zhang","Weiyan Hou","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2404.04617v2.pdf","comment":"12 pages, 10 figures, 12 tables"},{"id":"http://arxiv.org/abs/2401.13961v2","updated":"2024-04-09T08:07:48Z","published":"2024-01-25T05:50:48Z","title":"TriSAM: Tri-Plane SAM for zero-shot cortical blood vessel segmentation\n in VEM images","summary":" While imaging techniques at macro and mesoscales have garnered substantial\nattention and resources, microscale VEM imaging, capable of revealing intricate\nvascular details, has lacked the necessary benchmarking infrastructure. In this\npaper, we address a significant gap in the field of neuroimaging by introducing\nthe largest-to-date public benchmark, \\textbf{BvEM}, designed specifically for\ncortical blood vessel segmentation in volume electron microscopy (VEM) images.\nOur BvEM benchmark is based on VEM image volumes from three mammal species:\nadult mouse, macaque, and human. We standardized the resolution, addressed\nimaging variations, and meticulously annotated blood vessels through\nsemi-automatic, manual, and quality control processes, ensuring high-quality 3D\nsegmentation. Furthermore, we developed a zero-shot cortical blood vessel\nsegmentation method named TriSAM, which leverages the powerful segmentation\nmodel SAM for 3D segmentation. To extend SAM from 2D to 3D volume segmentation,\nTriSAM employs a multi-seed tracking framework, leveraging the reliability of\ncertain image planes for tracking while using others to identify potential\nturning points. This approach effectively achieves long-term 3D blood vessel\nsegmentation without model training or fine-tuning. 
Experimental results show\nthat TriSAM achieved superior performances on the BvEM benchmark across three\nspecies.\n","authors":["Jia Wan","Wanhua Li","Jason Ken Adhinarta","Atmadeep Banerjee","Evelina Sjostedt","Jingpeng Wu","Jeff Lichtman","Hanspeter Pfister","Donglai Wei"],"pdf_url":"https://arxiv.org/pdf/2401.13961v2.pdf","comment":"BvEM-Mouse can be visualized at: https://tinyurl.com/yc2s38x9"},{"id":"http://arxiv.org/abs/2403.13358v2","updated":"2024-04-09T07:55:41Z","published":"2024-03-20T07:36:43Z","title":"GeRM: A Generalist Robotic Model with Mixture-of-experts for Quadruped\n Robot","summary":" Multi-task robot learning holds significant importance in tackling diverse\nand complex scenarios. However, current approaches are hindered by performance\nissues and difficulties in collecting training datasets. In this paper, we\npropose GeRM (Generalist Robotic Model). We utilize offline reinforcement\nlearning to optimize data utilization strategies to learn from both\ndemonstrations and sub-optimal data, thus surpassing the limitations of human\ndemonstrations. Thereafter, we employ a transformer-based VLA network to\nprocess multi-modal inputs and output actions. By introducing the\nMixture-of-Experts structure, GeRM allows faster inference speed with higher\nwhole model capacity, and thus resolves the issue of limited RL parameters,\nenhancing model performance in multi-task learning while controlling\ncomputational costs. Through a series of experiments, we demonstrate that GeRM\noutperforms other methods across all tasks, while also validating its\nefficiency in both training and inference processes. Additionally, we uncover\nits potential to acquire emergent skills. Additionally, we contribute the\nQUARD-Auto dataset, collected automatically to support our training approach\nand foster advancements in multi-task quadruped robot learning. This work\npresents a new paradigm for reducing the cost of collecting robot data and\ndriving progress in the multi-task learning community. You can reach our\nproject and video through the link: https://songwxuan.github.io/GeRM/ .\n","authors":["Wenxuan Song","Han Zhao","Pengxiang Ding","Can Cui","Shangke Lyu","Yaning Fan","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.13358v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05970v3","updated":"2024-04-09T07:49:55Z","published":"2023-03-10T15:01:51Z","title":"Exploring Recurrent Long-term Temporal Fusion for Multi-view 3D\n Perception","summary":" Long-term temporal fusion is a crucial but often overlooked technique in\ncamera-based Bird's-Eye-View (BEV) 3D perception. Existing methods are mostly\nin a parallel manner. While parallel fusion can benefit from long-term\ninformation, it suffers from increasing computational and memory overheads as\nthe fusion window size grows. Alternatively, BEVFormer adopts a recurrent\nfusion pipeline so that history information can be efficiently integrated, yet\nit fails to benefit from longer temporal frames. In this paper, we explore an\nembarrassingly simple long-term recurrent fusion strategy built upon the\nLSS-based methods and find it already able to enjoy the merits from both sides,\ni.e., rich long-term information and efficient fusion pipeline. A temporal\nembedding module is further proposed to improve the model's robustness against\noccasionally missed frames in practical scenarios. We name this simple but\neffective fusing pipeline VideoBEV. 
Experimental results on the nuScenes\nbenchmark show that VideoBEV obtains strong performance on various camera-based\n3D perception tasks, including object detection (55.4\\% mAP and 62.9\\% NDS),\nsegmentation (48.6\\% vehicle mIoU), tracking (54.8\\% AMOTA), and motion\nprediction (0.80m minADE and 0.463 EPA).\n","authors":["Chunrui Han","Jinrong Yang","Jianjian Sun","Zheng Ge","Runpei Dong","Hongyu Zhou","Weixin Mao","Yuang Peng","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.05970v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06091v1","updated":"2024-04-09T07:49:30Z","published":"2024-04-09T07:49:30Z","title":"Hash3D: Training-free Acceleration for 3D Generation","summary":" The evolution of 3D generative modeling has been notably propelled by the\nadoption of 2D diffusion models. Despite this progress, the cumbersome\noptimization process per se presents a critical hurdle to efficiency. In this\npaper, we introduce Hash3D, a universal acceleration for 3D generation without\nmodel training. Central to Hash3D is the insight that feature-map redundancy is\nprevalent in images rendered from camera positions and diffusion time-steps in\nclose proximity. By effectively hashing and reusing these feature maps across\nneighboring timesteps and camera angles, Hash3D substantially prevents\nredundant calculations, thus accelerating the diffusion model's inference in 3D\ngeneration tasks. We achieve this through an adaptive grid-based hashing.\nSurprisingly, this feature-sharing mechanism not only speed up the generation\nbut also enhances the smoothness and view consistency of the synthesized 3D\nobjects. Our experiments covering 5 text-to-3D and 3 image-to-3D models,\ndemonstrate Hash3D's versatility to speed up optimization, enhancing efficiency\nby 1.3 to 4 times. Additionally, Hash3D's integration with 3D Gaussian\nsplatting largely speeds up 3D model creation, reducing text-to-3D processing\nto about 10 minutes and image-to-3D conversion to roughly 30 seconds. The\nproject page is at https://adamdad.github.io/hash3D/.\n","authors":["Xingyi Yang","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06091v1.pdf","comment":"https://adamdad.github.io/hash3D/"},{"id":"http://arxiv.org/abs/2311.17002v3","updated":"2024-04-09T07:46:43Z","published":"2023-11-28T17:57:44Z","title":"Ranni: Taming Text-to-Image Diffusion for Accurate Instruction Following","summary":" Existing text-to-image (T2I) diffusion models usually struggle in\ninterpreting complex prompts, especially those with quantity, object-attribute\nbinding, and multi-subject descriptions. In this work, we introduce a semantic\npanel as the middleware in decoding texts to images, supporting the generator\nto better follow instructions. The panel is obtained through arranging the\nvisual concepts parsed from the input text by the aid of large language models,\nand then injected into the denoising network as a detailed control signal to\ncomplement the text condition. To facilitate text-to-panel learning, we come up\nwith a carefully designed semantic formatting protocol, accompanied by a\nfully-automatic data preparation pipeline. Thanks to such a design, our\napproach, which we call Ranni, manages to enhance a pre-trained T2I generator\nregarding its textual controllability. 
More importantly, the introduction of\nthe generative middleware brings a more convenient form of interaction (i.e.,\ndirectly adjusting the elements in the panel or using language instructions)\nand further allows users to finely customize their generation, based on which\nwe develop a practical system and showcase its potential in continuous\ngeneration and chatting-based editing. Our project page is at\nhttps://ranni-t2i.github.io/Ranni.\n","authors":["Yutong Feng","Biao Gong","Di Chen","Yujun Shen","Yu Liu","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.17002v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05559v2","updated":"2024-04-09T07:43:29Z","published":"2024-04-08T14:30:42Z","title":"TIM: A Time Interval Machine for Audio-Visual Action Recognition","summary":" Diverse actions give rise to rich audio-visual signals in long videos. Recent\nworks showcase that the two modalities of audio and video exhibit different\ntemporal extents of events and distinct labels. We address the interplay\nbetween the two modalities in long videos by explicitly modelling the temporal\nextents of audio and visual events. We propose the Time Interval Machine (TIM)\nwhere a modality-specific time interval poses as a query to a transformer\nencoder that ingests a long video input. The encoder then attends to the\nspecified interval, as well as the surrounding context in both modalities, in\norder to recognise the ongoing action.\n We test TIM on three long audio-visual video datasets: EPIC-KITCHENS,\nPerception Test, and AVE, reporting state-of-the-art (SOTA) for recognition. On\nEPIC-KITCHENS, we beat previous SOTA that utilises LLMs and significantly\nlarger pre-training by 2.9% top-1 action recognition accuracy. Additionally, we\nshow that TIM can be adapted for action detection, using dense multi-scale\ninterval queries, outperforming SOTA on EPIC-KITCHENS-100 for most metrics, and\nshowing strong performance on the Perception Test. Our ablations show the\ncritical role of integrating the two modalities and modelling their time\nintervals in achieving this performance. Code and models at:\nhttps://github.com/JacobChalk/TIM\n","authors":["Jacob Chalk","Jaesung Huh","Evangelos Kazakos","Andrew Zisserman","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2404.05559v2.pdf","comment":"Accepted to CVPR 2024. Project Webpage:\n https://jacobchalk.github.io/TIM-Project"},{"id":"http://arxiv.org/abs/2404.06080v1","updated":"2024-04-09T07:39:21Z","published":"2024-04-09T07:39:21Z","title":"Using Few-Shot Learning to Classify Primary Lung Cancer and Other\n Malignancy with Lung Metastasis in Cytological Imaging via Endobronchial\n Ultrasound Procedures","summary":" This study aims to establish a computer-aided diagnosis system for\nendobronchial ultrasound (EBUS) surgery to assist physicians in the preliminary\ndiagnosis of metastatic cancer. This involves arranging immediate examinations\nfor other sites of metastatic cancer after EBUS surgery, eliminating the need\nto wait for reports, thereby shortening the waiting time by more than half and\nenabling patients to detect other cancers earlier, allowing for early planning\nand implementation of treatment plans. Unlike previous studies on cell image\nclassification, which have abundant datasets for training, this study must also\nbe able to make effective classifications despite the limited amount of case\ndata for lung metastatic cancer. 
In the realm of small data set classification\nmethods, Few-shot learning (FSL) has become mainstream in recent years. Through\nits ability to train on small datasets and its strong generalization\ncapabilities, FSL shows potential in this task of lung metastatic cell image\nclassification. This study will adopt the approach of Few-shot learning,\nreferencing existing proposed models, and designing a model architecture for\nclassifying lung metastases cell images. Batch Spectral Regularization (BSR)\nwill be incorporated as a loss update parameter, and the Finetune method of PMF\nwill be modified. In terms of test results, the addition of BSR and the\nmodified Finetune method further increases the accuracy by 8.89% to 65.60%,\noutperforming other FSL methods. This study confirms that FSL is superior to\nsupervised and transfer learning in classifying metastatic cancer and\ndemonstrates that using BSR as a loss function and modifying Finetune can\nenhance the model's capabilities.\n","authors":["Ching-Kai Lin","Di-Chun Wei","Yun-Chien Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.06080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07937v4","updated":"2024-04-09T07:31:25Z","published":"2023-12-13T07:30:19Z","title":"BOTH2Hands: Inferring 3D Hands from Both Text Prompts and Body Dynamics","summary":" The recently emerging text-to-motion advances have spired numerous attempts\nfor convenient and interactive human motion generation. Yet, existing methods\nare largely limited to generating body motions only without considering the\nrich two-hand motions, let alone handling various conditions like body dynamics\nor texts. To break the data bottleneck, we propose BOTH57M, a novel multi-modal\ndataset for two-hand motion generation. Our dataset includes accurate motion\ntracking for the human body and hands and provides pair-wised finger-level hand\nannotations and body descriptions. We further provide a strong baseline method,\nBOTH2Hands, for the novel task: generating vivid two-hand motions from both\nimplicit body dynamics and explicit text prompts. We first warm up two parallel\nbody-to-hand and text-to-hand diffusion models and then utilize the\ncross-attention transformer for motion blending. Extensive experiments and\ncross-validations demonstrate the effectiveness of our approach and dataset for\ngenerating convincing two-hand motions from the hybrid body-and-textual\nconditions. Our dataset and code will be disseminated to the community for\nfuture research.\n","authors":["Wenqian Zhang","Molin Huang","Yuxuan Zhou","Juze Zhang","Jingyi Yu","Jingya Wang","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2312.07937v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06075v1","updated":"2024-04-09T07:25:30Z","published":"2024-04-09T07:25:30Z","title":"LIPT: Latency-aware Image Processing Transformer","summary":" Transformer is leading a trend in the field of image processing. Despite the\ngreat success that existing lightweight image processing transformers have\nachieved, they are tailored to FLOPs or parameters reduction, rather than\npractical inference acceleration. In this paper, we present a latency-aware\nimage processing transformer, termed LIPT. We devise the low-latency proportion\nLIPT block that substitutes memory-intensive operators with the combination of\nself-attention and convolutions to achieve practical speedup. 
Specifically, we\npropose a novel non-volatile sparse masking self-attention (NVSM-SA) that\nutilizes a pre-computing sparse mask to capture contextual information from a\nlarger window with no extra computation overload. Besides, a high-frequency\nreparameterization module (HRM) is proposed to make LIPT block\nreparameterization friendly, which improves the model's detail reconstruction\ncapability. Extensive experiments on multiple image processing tasks (e.g.,\nimage super-resolution (SR), JPEG artifact reduction, and image denoising)\ndemonstrate the superiority of LIPT on both latency and PSNR. LIPT achieves\nreal-time GPU inference with state-of-the-art performance on multiple image SR\nbenchmarks.\n","authors":["Junbo Qiao","Wei Li","Haizhen Xie","Hanting Chen","Yunshuai Zhou","Zhijun Tu","Jie Hu","Shaohui Lin"],"pdf_url":"https://arxiv.org/pdf/2404.06075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03892v2","updated":"2024-04-09T07:21:32Z","published":"2024-04-05T05:00:21Z","title":"Enhancing Breast Cancer Diagnosis in Mammography: Evaluation and\n Integration of Convolutional Neural Networks and Explainable AI","summary":" The study introduces an integrated framework combining Convolutional Neural\nNetworks (CNNs) and Explainable Artificial Intelligence (XAI) for the enhanced\ndiagnosis of breast cancer using the CBIS-DDSM dataset. Utilizing a fine-tuned\nResNet50 architecture, our investigation not only provides effective\ndifferentiation of mammographic images into benign and malignant categories but\nalso addresses the opaque \"black-box\" nature of deep learning models by\nemploying XAI methodologies, namely Grad-CAM, LIME, and SHAP, to interpret CNN\ndecision-making processes for healthcare professionals. Our methodology\nencompasses an elaborate data preprocessing pipeline and advanced data\naugmentation techniques to counteract dataset limitations, and transfer\nlearning using pre-trained networks, such as VGG-16, DenseNet and ResNet was\nemployed. A focal point of our study is the evaluation of XAI's effectiveness\nin interpreting model predictions, highlighted by utilising the Hausdorff\nmeasure to assess the alignment between AI-generated explanations and expert\nannotations quantitatively. This approach plays a critical role for XAI in\npromoting trustworthiness and ethical fairness in AI-assisted diagnostics. The\nfindings from our research illustrate the effective collaboration between CNNs\nand XAI in advancing diagnostic methods for breast cancer, thereby facilitating\na more seamless integration of advanced AI technologies within clinical\nsettings. By enhancing the interpretability of AI-driven decisions, this work\nlays the groundwork for improved collaboration between AI systems and medical\npractitioners, ultimately enriching patient care. Furthermore, the implications\nof our research extend well beyond the current methodologies, advocating for\nsubsequent inquiries into the integration of multimodal data and the refinement\nof AI explanations to satisfy the needs of clinical practice.\n","authors":["Maryam Ahmed","Tooba Bibi","Rizwan Ahmed Khan","Sidra Nasir"],"pdf_url":"https://arxiv.org/pdf/2404.03892v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18201v2","updated":"2024-04-09T07:18:41Z","published":"2024-02-28T09:46:56Z","title":"Learning Invariant Inter-pixel Correlations for Superpixel Generation","summary":" Deep superpixel algorithms have made remarkable strides by substituting\nhand-crafted features with learnable ones. 
Nevertheless, we observe that\nexisting deep superpixel methods, serving as mid-level representation\noperations, remain sensitive to the statistical properties (e.g., color\ndistribution, high-level semantics) embedded within the training dataset.\nConsequently, learnable features exhibit constrained discriminative capability,\nresulting in unsatisfactory pixel grouping performance, particularly in\nuntrainable application scenarios. To address this issue, we propose the\nContent Disentangle Superpixel (CDS) algorithm to selectively separate the\ninvariant inter-pixel correlations and statistical properties, i.e., style\nnoise. Specifically, We first construct auxiliary modalities that are\nhomologous to the original RGB image but have substantial stylistic variations.\nThen, driven by mutual information, we propose the local-grid correlation\nalignment across modalities to reduce the distribution discrepancy of\nadaptively selected features and learn invariant inter-pixel correlations.\nAfterwards, we perform global-style mutual information minimization to enforce\nthe separation of invariant content and train data styles. The experimental\nresults on four benchmark datasets demonstrate the superiority of our approach\nto existing state-of-the-art methods, regarding boundary adherence,\ngeneralization, and efficiency. Code and pre-trained model are available at\nhttps://github.com/rookiie/CDSpixel.\n","authors":["Sen Xu","Shikui Wei","Tao Ruan","Lixin Liao"],"pdf_url":"https://arxiv.org/pdf/2402.18201v2.pdf","comment":"Accepted by AAAI24"},{"id":"http://arxiv.org/abs/2404.06065v1","updated":"2024-04-09T07:08:00Z","published":"2024-04-09T07:08:00Z","title":"Unified Entropy Optimization for Open-Set Test-Time Adaptation","summary":" Test-time adaptation (TTA) aims at adapting a model pre-trained on the\nlabeled source domain to the unlabeled target domain. Existing methods usually\nfocus on improving TTA performance under covariate shifts, while neglecting\nsemantic shifts. In this paper, we delve into a realistic open-set TTA setting\nwhere the target domain may contain samples from unknown classes. Many\nstate-of-the-art closed-set TTA methods perform poorly when applied to open-set\nscenarios, which can be attributed to the inaccurate estimation of data\ndistribution and model confidence. To address these issues, we propose a simple\nbut effective framework called unified entropy optimization (UniEnt), which is\ncapable of simultaneously adapting to covariate-shifted in-distribution (csID)\ndata and detecting covariate-shifted out-of-distribution (csOOD) data.\nSpecifically, UniEnt first mines pseudo-csID and pseudo-csOOD samples from test\ndata, followed by entropy minimization on the pseudo-csID data and entropy\nmaximization on the pseudo-csOOD data. Furthermore, we introduce UniEnt+ to\nalleviate the noise caused by hard data partition leveraging sample-level\nconfidence. Extensive experiments on CIFAR benchmarks and Tiny-ImageNet-C show\nthe superiority of our framework. 
The code is available at\nhttps://github.com/gaozhengqing/UniEnt\n","authors":["Zhengqing Gao","Xu-Yao Zhang","Cheng-Lin Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06065v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04580v2","updated":"2024-04-09T06:56:02Z","published":"2024-04-06T10:30:31Z","title":"SDFR: Synthetic Data for Face Recognition Competition","summary":" Large-scale face recognition datasets are collected by crawling the Internet\nand without individuals' consent, raising legal, ethical, and privacy concerns.\nWith the recent advances in generative models, recently several works proposed\ngenerating synthetic face recognition datasets to mitigate concerns in\nweb-crawled face recognition datasets. This paper presents the summary of the\nSynthetic Data for Face Recognition (SDFR) Competition held in conjunction with\nthe 18th IEEE International Conference on Automatic Face and Gesture\nRecognition (FG 2024) and established to investigate the use of synthetic data\nfor training face recognition models. The SDFR competition was split into two\ntasks, allowing participants to train face recognition systems using new\nsynthetic datasets and/or existing ones. In the first task, the face\nrecognition backbone was fixed and the dataset size was limited, while the\nsecond task provided almost complete freedom on the model backbone, the\ndataset, and the training pipeline. The submitted models were trained on\nexisting and also new synthetic datasets and used clever methods to improve\ntraining with synthetic data. The submissions were evaluated and ranked on a\ndiverse set of seven benchmarking datasets. The paper gives an overview of the\nsubmitted face recognition models and reports achieved performance compared to\nbaseline models trained on real and synthetic datasets. Furthermore, the\nevaluation of submissions is extended to bias assessment across different\ndemography groups. Lastly, an outlook on the current state of the research in\ntraining face recognition models using synthetic data is presented, and\nexisting problems as well as potential future directions are also discussed.\n","authors":["Hatef Otroshi Shahreza","Christophe Ecabert","Anjith George","Alexander Unnervik","Sébastien Marcel","Nicolò Di Domenico","Guido Borghi","Davide Maltoni","Fadi Boutros","Julia Vogel","Naser Damer","Ángela Sánchez-Pérez"," EnriqueMas-Candela","Jorge Calvo-Zaragoza","Bernardo Biesseck","Pedro Vidal","Roger Granada","David Menotti","Ivan DeAndres-Tame","Simone Maurizio La Cava","Sara Concas","Pietro Melzi","Ruben Tolosana","Ruben Vera-Rodriguez","Gianpaolo Perelli","Giulia Orrù","Gian Luca Marcialis","Julian Fierrez"],"pdf_url":"https://arxiv.org/pdf/2404.04580v2.pdf","comment":"The 18th IEEE International Conference on Automatic Face and Gesture\n Recognition (FG 2024)"},{"id":"http://arxiv.org/abs/2404.06057v1","updated":"2024-04-09T06:47:44Z","published":"2024-04-09T06:47:44Z","title":"Unified Multi-modal Diagnostic Framework with Reconstruction\n Pre-training and Heterogeneity-combat Tuning","summary":" Medical multi-modal pre-training has revealed promise in computer-aided\ndiagnosis by leveraging large-scale unlabeled datasets. However, existing\nmethods based on masked autoencoders mainly rely on data-level reconstruction\ntasks, but lack high-level semantic information. 
Furthermore, two significant\nheterogeneity challenges hinder the transfer of pre-trained knowledge to\ndownstream tasks, \\textit{i.e.}, the distribution heterogeneity between\npre-training data and downstream data, and the modality heterogeneity within\ndownstream data. To address these challenges, we propose a Unified Medical\nMulti-modal Diagnostic (UMD) framework with tailored pre-training and\ndownstream tuning strategies. Specifically, to enhance the representation\nabilities of vision and language encoders, we propose the Multi-level\nReconstruction Pre-training (MR-Pretrain) strategy, including a feature-level\nand data-level reconstruction, which guides models to capture the semantic\ninformation from masked inputs of different modalities. Moreover, to tackle two\nkinds of heterogeneities during the downstream tuning, we present the\nheterogeneity-combat downstream tuning strategy, which consists of a\nTask-oriented Distribution Calibration (TD-Calib) and a Gradient-guided\nModality Coordination (GM-Coord). In particular, TD-Calib fine-tunes the\npre-trained model regarding the distribution of downstream datasets, and\nGM-Coord adjusts the gradient weights according to the dynamic optimization\nstatus of different modalities. Extensive experiments on five public medical\ndatasets demonstrate the effectiveness of our UMD framework, which remarkably\noutperforms existing approaches on three kinds of downstream tasks.\n","authors":["Yupei Zhang","Li Pan","Qiushi Yang","Tan Li","Zhen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06057v1.pdf","comment":"to be published in IEEE JBHI; Code available at\n https://github.com/helenypzhang/UMD"},{"id":"http://arxiv.org/abs/2404.06050v1","updated":"2024-04-09T06:27:35Z","published":"2024-04-09T06:27:35Z","title":"Incremental Joint Learning of Depth, Pose and Implicit Scene\n Representation on Monocular Camera in Large-scale Scenes","summary":" Dense scene reconstruction for photo-realistic view synthesis has various\napplications, such as VR/AR, autonomous vehicles. However, most existing\nmethods have difficulties in large-scale scenes due to three core challenges:\n\\textit{(a) inaccurate depth input.} Accurate depth input is impossible to get\nin real-world large-scale scenes. \\textit{(b) inaccurate pose estimation.} Most\nexisting approaches rely on accurate pre-estimated camera poses. \\textit{(c)\ninsufficient scene representation capability.} A single global radiance field\nlacks the capacity to effectively scale to large-scale scenes. To this end, we\npropose an incremental joint learning framework, which can achieve accurate\ndepth, pose estimation, and large-scale scene reconstruction. A vision\ntransformer-based network is adopted as the backbone to enhance performance in\nscale information estimation. For pose estimation, a feature-metric bundle\nadjustment (FBA) method is designed for accurate and robust camera tracking in\nlarge-scale scenes. In terms of implicit scene representation, we propose an\nincremental scene representation method to construct the entire large-scale\nscene as multiple local radiance fields to enhance the scalability of 3D scene\nrepresentation. 
Extended experiments have been conducted to demonstrate the\neffectiveness and accuracy of our method in depth estimation, pose estimation,\nand large-scale scene reconstruction.\n","authors":["Tianchen Deng","Nailin Wang","Chongdi Wang","Shenghai Yuan","Jingchuan Wang","Danwei Wang","Weidong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04421v2","updated":"2024-04-09T06:23:35Z","published":"2024-04-05T21:44:57Z","title":"PhysAvatar: Learning the Physics of Dressed 3D Avatars from Visual\n Observations","summary":" Modeling and rendering photorealistic avatars is of crucial importance in\nmany applications. Existing methods that build a 3D avatar from visual\nobservations, however, struggle to reconstruct clothed humans. We introduce\nPhysAvatar, a novel framework that combines inverse rendering with inverse\nphysics to automatically estimate the shape and appearance of a human from\nmulti-view video data along with the physical parameters of the fabric of their\nclothes. For this purpose, we adopt a mesh-aligned 4D Gaussian technique for\nspatio-temporal mesh tracking as well as a physically based inverse renderer to\nestimate the intrinsic material properties. PhysAvatar integrates a physics\nsimulator to estimate the physical parameters of the garments using\ngradient-based optimization in a principled manner. These novel capabilities\nenable PhysAvatar to create high-quality novel-view renderings of avatars\ndressed in loose-fitting clothes under motions and lighting conditions not seen\nin the training data. This marks a significant advancement towards modeling\nphotorealistic digital humans using physically based inverse rendering with\nphysics in the loop. Our project website is at:\nhttps://qingqing-zhao.github.io/PhysAvatar\n","authors":["Yang Zheng","Qingqing Zhao","Guandao Yang","Wang Yifan","Donglai Xiang","Florian Dubost","Dmitry Lagun","Thabo Beeler","Federico Tombari","Leonidas Guibas","Gordon Wetzstein"],"pdf_url":"https://arxiv.org/pdf/2404.04421v2.pdf","comment":"Project Page: https://qingqing-zhao.github.io/PhysAvatar"},{"id":"http://arxiv.org/abs/2404.06044v1","updated":"2024-04-09T06:10:15Z","published":"2024-04-09T06:10:15Z","title":"Object Dynamics Modeling with Hierarchical Point Cloud-based\n Representations","summary":" Modeling object dynamics with a neural network is an important problem with\nnumerous applications. Most recent work has been based on graph neural\nnetworks. However, physics happens in 3D space, where geometric information\npotentially plays an important role in modeling physical phenomena. In this\nwork, we propose a novel U-net architecture based on continuous point\nconvolution which naturally embeds information from 3D coordinates and allows\nfor multi-scale feature representations with established downsampling and\nupsampling procedures. Bottleneck layers in the downsampled point clouds lead\nto better long-range interaction modeling. Besides, the flexibility of point\nconvolutions allows our approach to generalize to sparsely sampled points from\nmesh vertices and dynamically generate features on important interaction points\non mesh faces. 
Experimental results demonstrate that our approach significantly\nimproves the state-of-the-art, especially in scenarios that require accurate\ngravity or collision reasoning.\n","authors":["Chanho Kim","Li Fuxin"],"pdf_url":"https://arxiv.org/pdf/2404.06044v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2306.11729v2","updated":"2024-04-09T05:57:18Z","published":"2023-06-20T17:57:23Z","title":"Dense Video Object Captioning from Disjoint Supervision","summary":" We propose a new task and model for dense video object captioning --\ndetecting, tracking and captioning trajectories of objects in a video. This\ntask unifies spatial and temporal localization in video, whilst also requiring\nfine-grained visual understanding that is best described by natural language.\nWe propose a unified model, and demonstrate how our end-to-end approach is more\naccurate and temporally coherent than a multi-stage pipeline combining\nstate-of-the-art detection, tracking, and captioning models. Moreover, we\npropose a training strategy based on a mixture of disjoint tasks, which allows\nus to leverage diverse, large-scale datasets which supervise different parts of\nour model. Although each pretraining task only provides weak supervision, they\nare complementary and, when combined, result in noteworthy zero-shot ability\nand serve as strong initialization for additional finetuning to further improve\naccuracy. We carefully design new metrics capturing all components of our task,\nand show how we can repurpose existing video grounding datasets (e.g. VidSTG\nand VLN) for our new task. We show that our model improves upon a number of\nstrong baselines for this new task. Furthermore, we can apply our model to the\ntask of spatial grounding, outperforming prior state-of-the-art on VidSTG and\nVLN, without explicitly training for it. Code is available at\nhttps://github.com/google-research/scenic/tree/main/scenic/projects/densevoc.\n","authors":["Xingyi Zhou","Anurag Arnab","Chen Sun","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2306.11729v2.pdf","comment":"Code is available at\n https://github.com/google-research/scenic/tree/main/scenic/projects/densevoc"},{"id":"http://arxiv.org/abs/2404.06036v1","updated":"2024-04-09T05:49:04Z","published":"2024-04-09T05:49:04Z","title":"Space-Time Video Super-resolution with Neural Operator","summary":" This paper addresses the task of space-time video super-resolution (ST-VSR).\nExisting methods generally suffer from inaccurate motion estimation and motion\ncompensation (MEMC) problems for large motions. Inspired by recent progress in\nphysics-informed neural networks, we model the challenges of MEMC in ST-VSR as\na mapping between two continuous function spaces. Specifically, our approach\ntransforms independent low-resolution representations in the coarse-grained\ncontinuous function space into refined representations with enriched\nspatiotemporal details in the fine-grained continuous function space. To\nachieve efficient and accurate MEMC, we design a Galerkin-type attention\nfunction to perform frame alignment and temporal interpolation. Due to the\nlinear complexity of the Galerkin-type attention mechanism, our model avoids\npatch partitioning and offers global receptive fields, enabling precise\nestimation of large motions. 
The experimental results show that the proposed\nmethod surpasses state-of-the-art techniques in both fixed-size and continuous\nspace-time video super-resolution tasks.\n","authors":["Yuantong Zhang","Hanyou Zheng","Daiqin Yang","Zhenzhong Chen","Haichuan Ma","Wenpeng Ding"],"pdf_url":"https://arxiv.org/pdf/2404.06036v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10473v4","updated":"2024-04-09T05:47:57Z","published":"2023-02-21T06:31:53Z","title":"Oriented Object Detection in Optical Remote Sensing Images using Deep\n Learning: A Survey","summary":" Oriented object detection is one of the most fundamental and challenging\ntasks in remote sensing, aiming to locate and classify objects with arbitrary\norientations. Recent years have witnessed remarkable progress in oriented\nobject detection using deep learning techniques. Given the rapid development of\nthis field, this paper aims to provide a comprehensive survey of recent\nadvances in oriented object detection. To be specific, we first review the\ntechnical evolution from horizontal object detection to oriented object\ndetection and summarize the specific challenges, including feature\nmisalignment, spatial misalignment, and periodicity of angle. Subsequently, we\nfurther categorize existing methods into detection framework, oriented bounding\nbox (OBB) regression, and feature representations, and discuss how these\nmethods address the above challenges in detail. In addition, we cover several\npublicly available datasets and performance evaluation protocols. Furthermore,\nwe provide a comprehensive comparison and analysis of state-of-the-art oriented\nobject detection methods. Toward the end of this paper, we discuss several\nfuture directions for oriented object detection.\n","authors":["Kun Wang","Zi Wang","Zhang Li","Ang Su","Xichao Teng","Minhao Liu","Qifeng Yu"],"pdf_url":"https://arxiv.org/pdf/2302.10473v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06668v2","updated":"2024-04-09T05:47:39Z","published":"2024-03-11T12:36:14Z","title":"PeerAiD: Improving Adversarial Distillation from a Specialized Peer\n Tutor","summary":" Adversarial robustness of the neural network is a significant concern when it\nis applied to security-critical domains. In this situation, adversarial\ndistillation is a promising option which aims to distill the robustness of the\nteacher network to improve the robustness of a small student network. Previous\nworks pretrain the teacher network to make it robust to the adversarial\nexamples aimed at itself. However, the adversarial examples are dependent on\nthe parameters of the target network. The fixed teacher network inevitably\ndegrades its robustness against the unseen transferred adversarial examples\nwhich targets the parameters of the student network in the adversarial\ndistillation process. We propose PeerAiD to make a peer network learn the\nadversarial examples of the student network instead of adversarial examples\naimed at itself. PeerAiD is an adversarial distillation that trains the peer\nnetwork and the student network simultaneously in order to make the peer\nnetwork specialized for defending the student network. We observe that such\npeer networks surpass the robustness of pretrained robust teacher network\nagainst student-attacked adversarial samples. 
With this peer network and\nadversarial distillation, PeerAiD achieves significantly higher robustness of\nthe student network, improving AutoAttack (AA) accuracy by up to 1.66%p and the\nnatural accuracy of the student network by up to 4.72%p with ResNet-18 on the\nTinyImageNet dataset.\n","authors":["Jaewon Jung","Hongsun Jang","Jaeyong Song","Jinho Lee"],"pdf_url":"https://arxiv.org/pdf/2403.06668v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.06033v1","updated":"2024-04-09T05:44:00Z","published":"2024-04-09T05:44:00Z","title":"Little Strokes Fell Great Oaks: Boosting the Hierarchical Features for\n Multi-exposure Image Fusion","summary":" In recent years, deep learning networks have made remarkable strides in the\ndomain of multi-exposure image fusion. Nonetheless, prevailing approaches often\ninvolve directly feeding over-exposed and under-exposed images into the\nnetwork, which leads to the under-utilization of inherent information present\nin the source images. Additionally, unsupervised techniques predominantly\nemploy rudimentary weighted summation for color channel processing, culminating\nin an overall desaturated final image tone. To partially mitigate these issues,\nthis study proposes a gamma correction module specifically designed to fully\nleverage latent information embedded within source images. Furthermore, a\nmodified transformer block embracing self-attention mechanisms is\nintroduced to optimize the fusion process. Ultimately, a novel color\nenhancement algorithm is presented to augment image saturation while preserving\nintricate details. The source code is available at https://github.com/ZhiyingDu/BHFMEF.\n","authors":["Pan Mu","Zhiying Du","Jinyuan Liu","Cong Bai"],"pdf_url":"https://arxiv.org/pdf/2404.06033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06029v1","updated":"2024-04-09T05:30:58Z","published":"2024-04-09T05:30:58Z","title":"Improving Facial Landmark Detection Accuracy and Efficiency with\n Knowledge Distillation","summary":" The domain of computer vision has experienced significant advancements in\nfacial-landmark detection, becoming increasingly essential across various\napplications such as augmented reality, facial recognition, and emotion\nanalysis. Unlike object detection or semantic segmentation, which focus on\nidentifying objects and outlining boundaries, facial-landmark detection aims to\nprecisely locate and track critical facial features. However, deploying deep\nlearning-based facial-landmark detection models on embedded systems with\nlimited computational resources poses challenges due to the complexity of\nfacial features, especially in dynamic settings. Additionally, ensuring\nrobustness across diverse ethnicities and expressions presents further\nobstacles. Existing datasets often lack comprehensive representation of facial\nnuances, particularly within populations like those in Taiwan. This paper\nintroduces a novel approach to address these challenges through the development\nof a knowledge distillation method. By transferring knowledge from larger\nmodels to smaller ones, we aim to create lightweight yet powerful deep learning\nmodels tailored specifically for facial-landmark detection tasks. Our goal is\nto design models capable of accurately locating facial landmarks under varying\nconditions, including diverse expressions, orientations, and lighting\nenvironments. 
The ultimate objective is to achieve high accuracy and real-time\nperformance suitable for deployment on embedded systems. This method was\nsuccessfully implemented and achieved a top 6th place finish out of 165\nparticipants in the IEEE ICME 2024 PAIR competition.\n","authors":["Zong-Wei Hong","Yu-Chen Lin"],"pdf_url":"https://arxiv.org/pdf/2404.06029v1.pdf","comment":"technical report. 6th/165 in IEEE ICME 2024 PAIR competition"},{"id":"http://arxiv.org/abs/2404.06025v1","updated":"2024-04-09T05:21:32Z","published":"2024-04-09T05:21:32Z","title":"Greedy-DiM: Greedy Algorithms for Unreasonably Effective Face Morphs","summary":" Morphing attacks, which aim to create a single image that contains the\nbiometric information of multiple identities, are an emerging threat to\nstate-of-the-art Face Recognition (FR) systems. Diffusion Morphs (DiM) are a recently\nproposed morphing attack that has achieved state-of-the-art performance for\nrepresentation-based morphing attacks. However, none of the existing research\non DiMs has leveraged the iterative nature of DiMs, leaving the DiM model as a\nblack box and treating it no differently than one would a Generative Adversarial\nNetwork (GAN) or Variational AutoEncoder (VAE). We propose a greedy strategy on\nthe iterative sampling process of DiM models which searches for an optimal step\nguided by an identity-based heuristic function. We compare our proposed\nalgorithm against ten other state-of-the-art morphing algorithms using the\nopen-source SYN-MAD 2022 competition dataset. We find that our proposed\nalgorithm is unreasonably effective, fooling all of the tested FR systems with\nan MMPMR of 100%, outperforming all other compared morphing algorithms.\n","authors":["Zander W. Blasingame","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06025v1.pdf","comment":"Initial preprint. Under review"},{"id":"http://arxiv.org/abs/2404.06022v1","updated":"2024-04-09T05:11:28Z","published":"2024-04-09T05:11:28Z","title":"Band-Attention Modulated RetNet for Face Forgery Detection","summary":" Transformer networks are extensively utilized in face forgery detection\ndue to their scalability across large datasets. Despite their success,\ntransformers face challenges in balancing the capture of global context, which\nis crucial for unveiling forgery clues, with computational complexity. To\nmitigate this issue, we introduce Band-Attention modulated RetNet (BAR-Net), a\nlightweight network designed to efficiently process extensive visual contexts\nwhile avoiding catastrophic forgetting. Our approach empowers the target token\nto perceive global information by assigning differential attention levels to\ntokens at varying distances. 
We implement self-attention along both spatial\naxes, thereby maintaining spatial priors and easing the computational\nburden. Moreover, we present the adaptive frequency Band-Attention Modulation\nmechanism, which treats the entire Discrete Cosine Transform spectrogram as a\nseries of frequency bands with learnable weights. Together, BAR-Net achieves\nfavorable performance on several face forgery datasets, outperforming current\nstate-of-the-art methods.\n","authors":["Zhida Zhang","Jie Cao","Wenkui Yang","Qihang Fan","Kai Zhou","Ran He"],"pdf_url":"https://arxiv.org/pdf/2404.06022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16271v4","updated":"2024-04-09T05:09:56Z","published":"2024-03-24T19:32:39Z","title":"Object Detectors in the Open Environment: Challenges, Solutions, and\n Outlook","summary":" With the emergence of foundation models, deep learning-based object detectors\nhave shown practical usability in closed set scenarios. However, for real-world\ntasks, object detectors often operate in open environments, where crucial\nfactors (e.g., data distribution, objective) that influence model learning are\noften changing. The dynamic and intricate nature of the open environment poses\nnovel and formidable challenges to object detectors. Unfortunately, current\nresearch on object detectors in open environments lacks a comprehensive\nanalysis of their distinctive characteristics, challenges, and corresponding\nsolutions, which hinders their secure deployment in critical real-world\nscenarios. This paper aims to bridge this gap by conducting a comprehensive\nreview and analysis of object detectors in open environments. We initially\nidentified limitations of key structural components within the existing\ndetection pipeline and propose the open environment object detector challenge\nframework that includes four quadrants (i.e., out-of-domain, out-of-category,\nrobust learning, and incremental learning) based on the dimensions of the data\n/ target changes. For each quadrant of challenges in the proposed framework, we\npresent a detailed description and systematic analysis of the overarching goals\nand core difficulties, systematically review the corresponding solutions, and\nbenchmark their performance over multiple widely adopted datasets. In addition,\nwe engage in a discussion of open problems and potential avenues for future\nresearch. This paper aims to provide a fresh, comprehensive, and systematic\nunderstanding of the challenges and solutions associated with open-environment\nobject detectors, thus catalyzing the development of more solid applications in\nreal-world scenarios. A project related to this survey can be found at\nhttps://github.com/LiangSiyuan21/OEOD_Survey.\n","authors":["Siyuan Liang","Wei Wang","Ruoyu Chen","Aishan Liu","Boxi Wu","Ee-Chien Chang","Xiaochun Cao","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.16271v4.pdf","comment":"37 pages, 17 figures"},{"id":"http://arxiv.org/abs/2312.13980v2","updated":"2024-04-09T04:41:53Z","published":"2023-12-21T16:10:33Z","title":"Carve3D: Improving Multi-view Reconstruction Consistency for Diffusion\n Models with RL Finetuning","summary":" Multi-view diffusion models, obtained by applying Supervised Finetuning (SFT)\nto text-to-image diffusion models, have driven recent breakthroughs in\ntext-to-3D research. However, due to the limited size and quality of existing\n3D datasets, they still suffer from multi-view inconsistencies and Neural\nRadiance Field (NeRF) reconstruction artifacts. 
We argue that multi-view\ndiffusion models can benefit from further Reinforcement Learning Finetuning\n(RLFT), which allows models to learn from the data generated by themselves and\nimprove beyond their dataset limitations during SFT. To this end, we introduce\nCarve3D, an improved RLFT algorithm coupled with a novel Multi-view\nReconstruction Consistency (MRC) metric, to enhance the consistency of\nmulti-view diffusion models. To measure the MRC metric on a set of multi-view\nimages, we compare them with their corresponding NeRF renderings at the same\ncamera viewpoints. The resulting model, which we denote as Carve3DM,\ndemonstrates superior multi-view consistency and NeRF reconstruction quality\nthan existing models. Our results suggest that pairing SFT with Carve3D's RLFT\nis essential for developing multi-view-consistent diffusion models, mirroring\nthe standard Large Language Model (LLM) alignment pipeline. Our code, training\nand testing data, and video results are available at:\nhttps://desaixie.github.io/carve-3d.\n","authors":["Desai Xie","Jiahao Li","Hao Tan","Xin Sun","Zhixin Shu","Yi Zhou","Sai Bi","Sören Pirk","Arie E. Kaufman"],"pdf_url":"https://arxiv.org/pdf/2312.13980v2.pdf","comment":"22 pages, 16 figures. Our code, training and testing data, and video\n results are available at: https://desaixie.github.io/carve-3d. This paper has\n been accepted to CVPR 2024. v2: incorporated changes from the CVPR 2024\n camera-ready version"},{"id":"http://arxiv.org/abs/2404.06012v1","updated":"2024-04-09T04:41:05Z","published":"2024-04-09T04:41:05Z","title":"Diffusion-Based Point Cloud Super-Resolution for mmWave Radar Data","summary":" The millimeter-wave radar sensor maintains stable performance under adverse\nenvironmental conditions, making it a promising solution for all-weather\nperception tasks, such as outdoor mobile robotics. However, the radar point\nclouds are relatively sparse and contain massive ghost points, which greatly\nlimits the development of mmWave radar technology. In this paper, we propose a\nnovel point cloud super-resolution approach for 3D mmWave radar data, named\nRadar-diffusion. Our approach employs the diffusion model defined by\nmean-reverting stochastic differential equations(SDE). Using our proposed new\nobjective function with supervision from corresponding LiDAR point clouds, our\napproach efficiently handles radar ghost points and enhances the sparse mmWave\nradar point clouds to dense LiDAR-like point clouds. We evaluate our approach\non two different datasets, and the experimental results show that our method\noutperforms the state-of-the-art baseline methods in 3D radar super-resolution\ntasks. 
Furthermore, we demonstrate that our enhanced radar point cloud is\ncapable of downstream radar point-based registration tasks.\n","authors":["Kai Luan","Chenghao Shi","Neng Wang","Yuwei Cheng","Huimin Lu","Xieyuanli Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06012v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05997v1","updated":"2024-04-09T04:04:50Z","published":"2024-04-09T04:04:50Z","title":"Concept-Attention Whitening for Interpretable Skin Lesion Diagnosis","summary":" The black-box nature of deep learning models has raised concerns about their\ninterpretability for successful deployment in real-world clinical applications.\nTo address the concerns, eXplainable Artificial Intelligence (XAI) aims to\nprovide clear and understandable explanations of the decision-making process.\nIn the medical domain, concepts such as attributes of lesions or abnormalities\nserve as key evidence for deriving diagnostic results. However, existing\nconcept-based models mainly depend on concepts that appear independently and\nrequire fine-grained concept annotations such as bounding boxes. A medical\nimage usually contains multiple concepts and the fine-grained concept\nannotations are difficult to acquire. In this paper, we propose a novel\nConcept-Attention Whitening (CAW) framework for interpretable skin lesion\ndiagnosis. CAW is comprised of a disease diagnosis branch and a concept\nalignment branch. In the former branch, we train the CNN with a CAW layer\ninserted to perform skin lesion diagnosis. The CAW layer decorrelates features\nand aligns image features to conceptual meanings via an orthogonal matrix. In\nthe latter branch, we calculate the orthogonal matrix under the guidance of the\nconcept attention mask. We particularly introduce a weakly-supervised concept\nmask generator that only leverages coarse concept labels for filtering local\nregions that are relevant to certain concepts, improving the optimization of\nthe orthogonal matrix. Extensive experiments on two public skin lesion\ndiagnosis datasets demonstrated that CAW not only enhanced interpretability but\nalso maintained a state-of-the-art diagnostic performance.\n","authors":["Junlin Hou","Jilan Xu","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05981v1","updated":"2024-04-09T03:27:09Z","published":"2024-04-09T03:27:09Z","title":"A Lightweight Measure of Classification Difficulty from Application\n Dataset Characteristics","summary":" Despite accuracy and computation benchmarks being widely available to help\nchoose among neural network models, these are usually trained on datasets with\nmany classes, and do not give a precise idea of performance for applications of\nfew (< 10) classes. The conventional procedure to predict performance is to\ntrain and test repeatedly on the different models and dataset variations of\ninterest. However, this is computationally expensive. We propose an efficient\nclassification difficulty measure that is calculated from the number of classes\nand intra- and inter-class similarity metrics of the dataset. After a single\nstage of training and testing per model family, relative performance for\ndifferent datasets and models of the same family can be predicted by comparing\ndifficulty measures - without further training and testing. We show how this\nmeasure can help a practitioner select a computationally efficient model for a\nsmall dataset 6 to 29x faster than through repeated training and testing. 
We\ngive an example of use of the measure for an industrial application in which\noptions are identified to select a model 42% smaller than the baseline\nYOLOv5-nano model, and if class merging from 3 to 2 classes meets requirements,\n85% smaller.\n","authors":["Bryan Bo Cao","Abhinav Sharma","Lawrence O'Gorman","Michael Coss","Shubham Jain"],"pdf_url":"https://arxiv.org/pdf/2404.05981v1.pdf","comment":"13 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.05980v1","updated":"2024-04-09T03:24:10Z","published":"2024-04-09T03:24:10Z","title":"Tackling Structural Hallucination in Image Translation with Local\n Diffusion","summary":" Recent developments in diffusion models have advanced conditioned image\ngeneration, yet they struggle with reconstructing out-of-distribution (OOD)\nimages, such as unseen tumors in medical images, causing ``image\nhallucination'' and risking misdiagnosis. We hypothesize such hallucinations\nresult from local OOD regions in the conditional images. We verify that\npartitioning the OOD region and conducting separate image generations\nalleviates hallucinations in several applications. From this, we propose a\ntraining-free diffusion framework that reduces hallucination with multiple\nLocal Diffusion processes. Our approach involves OOD estimation followed by two\nmodules: a ``branching'' module generates locally both within and outside OOD\nregions, and a ``fusion'' module integrates these predictions into one. Our\nevaluation shows our method mitigates hallucination over baseline models\nquantitatively and qualitatively, reducing misdiagnosis by 40% and 25% in the\nreal-world medical and natural image datasets, respectively. It also\ndemonstrates compatibility with various pre-trained diffusion models.\n","authors":["Seunghoi Kim","Chen Jin","Tom Diethe","Matteo Figini","Henry F. J. Tregidgo","Asher Mullokandov","Philip Teare","Daniel C. Alexander"],"pdf_url":"https://arxiv.org/pdf/2404.05980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05979v1","updated":"2024-04-09T03:22:36Z","published":"2024-04-09T03:22:36Z","title":"StoryImager: A Unified and Efficient Framework for Coherent Story\n Visualization and Completion","summary":" Story visualization aims to generate a series of realistic and coherent\nimages based on a storyline. Current models adopt a frame-by-frame architecture\nby transforming the pre-trained text-to-image model into an auto-regressive\nmanner. Although these models have shown notable progress, there are still\nthree flaws. 1) The unidirectional generation of auto-regressive manner\nrestricts the usability in many scenarios. 2) The additional introduced story\nhistory encoders bring an extremely high computational cost. 3) The story\nvisualization and continuation models are trained and inferred independently,\nwhich is not user-friendly. To these ends, we propose a bidirectional, unified,\nand efficient framework, namely StoryImager. The StoryImager enhances the\nstoryboard generative ability inherited from the pre-trained text-to-image\nmodel for a bidirectional generation. Specifically, we introduce a Target Frame\nMasking Strategy to extend and unify different story image generation tasks.\nFurthermore, we propose a Frame-Story Cross Attention Module that decomposes\nthe cross attention for local fidelity and global coherence. Moreover, we\ndesign a Contextual Feature Extractor to extract contextual information from\nthe whole storyline. The extensive experimental results demonstrate the\nexcellent performance of our StoryImager. 
The code is available at\nhttps://github.com/tobran/StoryImager.\n","authors":["Ming Tao","Bing-Kun Bao","Hao Tang","Yaowei Wang","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2404.05979v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2403.14085v2","updated":"2024-04-09T02:59:41Z","published":"2024-03-21T02:31:17Z","title":"Surface Reconstruction from Point Clouds via Grid-based Intersection\n Prediction","summary":" Surface reconstruction from point clouds is a crucial task in the fields of\ncomputer vision and computer graphics. SDF-based methods excel at\nreconstructing smooth meshes with minimal error and artefacts but struggle with\nrepresenting open surfaces. On the other hand, UDF-based methods can\neffectively represent open surfaces but often introduce noise, leading to\nartefacts in the mesh. In this work, we propose a novel approach that directly\npredicts the intersection points between line segment of point pairs and\nimplicit surfaces. To achieve it, we propose two modules named Relative\nIntersection Module and Sign Module respectively with the feature of point pair\nas input. To preserve the continuity of the surface, we also integrate symmetry\ninto the two modules, which means the position of predicted intersection will\nnot change even if the input order of the point pair changes. This method not\nonly preserves the ability to represent open surfaces but also eliminates most\nartefacts on the mesh. Our approach demonstrates state-of-the-art performance\non three datasets: ShapeNet, MGN, and ScanNet. The code will be made available\nupon acceptance.\n","authors":["Hui Tian","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2403.14085v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03394v2","updated":"2024-04-09T02:56:27Z","published":"2024-04-04T11:53:37Z","title":"Background Noise Reduction of Attention Map for Weakly Supervised\n Semantic Segmentation","summary":" In weakly-supervised semantic segmentation (WSSS) using only image-level\nclass labels, a problem with CNN-based Class Activation Maps (CAM) is that they\ntend to activate the most discriminative local regions of objects. On the other\nhand, methods based on Transformers learn global features but suffer from the\nissue of background noise contamination. This paper focuses on addressing the\nissue of background noise in attention weights within the existing WSSS method\nbased on Conformer, known as TransCAM. The proposed method successfully reduces\nbackground noise, leading to improved accuracy of pseudo labels. Experimental\nresults demonstrate that our model achieves segmentation performance of 70.5%\non the PASCAL VOC 2012 validation data, 71.1% on the test data, and 45.9% on MS\nCOCO 2014 data, outperforming TransCAM in terms of segmentation performance.\n","authors":["Izumi Fujimori","Masaki Oono","Masami Shishibori"],"pdf_url":"https://arxiv.org/pdf/2404.03394v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05967v1","updated":"2024-04-09T02:55:12Z","published":"2024-04-09T02:55:12Z","title":"JSTR: Judgment Improves Scene Text Recognition","summary":" In this paper, we present a method for enhancing the accuracy of scene text\nrecognition tasks by judging whether the image and text match each other. 
While\nprevious studies focused on generating the recognition results from input\nimages, our approach also considers the model's misrecognition results to\nunderstand its error tendencies, thus improving the text recognition pipeline.\nThis method boosts text recognition accuracy by providing explicit feedback on\nthe data that the model is likely to misrecognize, predicting whether the image\nand text are a correct or incorrect match. The experimental results on publicly\navailable datasets demonstrate that our proposed method outperforms the\nbaseline and state-of-the-art methods in scene text recognition.\n","authors":["Masato Fujitake"],"pdf_url":"https://arxiv.org/pdf/2404.05967v1.pdf","comment":"IntelliSys 2024"},{"id":"http://arxiv.org/abs/2404.05960v1","updated":"2024-04-09T02:47:52Z","published":"2024-04-09T02:47:52Z","title":"EasyTrack: Efficient and Compact One-stream 3D Point Clouds Tracker","summary":" Most 3D single object trackers (SOT) in point clouds follow the two-stream\nmulti-stage 3D Siamese or motion tracking paradigms, which process the template\nand search area point clouds with two parallel branches, built on supervised\npoint cloud backbones. In this work, beyond typical 3D Siamese or motion\ntracking, we propose a neat and compact one-stream transformer 3D SOT paradigm\nfrom a novel perspective, termed \\textbf{EasyTrack}, which consists of\nthree special designs: 1) A 3D point cloud tracking feature pre-training\nmodule is developed to exploit masked autoencoding for learning 3D point\ncloud tracking representations. 2) A unified 3D tracking feature learning and\nfusion network is proposed to simultaneously learn target-aware 3D features\nand extensively capture mutual correlation through the flexible self-attention\nmechanism. 3) A target location network in the dense bird's eye view (BEV)\nfeature space is constructed for target classification and regression.\nMoreover, we develop an enhanced version named EasyTrack++, which designs the\ncenter points interaction (CPI) strategy to reduce the ambiguous targets caused\nby the noisy point cloud background information. The proposed EasyTrack and\nEasyTrack++ set a new state-of-the-art performance ($\\textbf{18\\%}$,\n$\\textbf{40\\%}$ and $\\textbf{3\\%}$ success gains) on KITTI, NuScenes, and Waymo\nwhile running at \\textbf{52.6fps} with few parameters (\\textbf{1.3M}). The code\nwill be available at https://github.com/KnightApple427/Easytrack.\n","authors":["Baojie Fan","Wuyang Zhou","Kai Wang","Shijun Zhou","Fengyu Xu","Jiandong Tian"],"pdf_url":"https://arxiv.org/pdf/2404.05960v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.12554v4","updated":"2024-04-09T02:42:28Z","published":"2023-01-29T22:05:28Z","title":"Improving the Accuracy-Robustness Trade-Off of Classifiers via Adaptive\n Smoothing","summary":" While prior research has proposed a plethora of methods that build neural\nclassifiers robust against adversarial attacks, practitioners are still\nreluctant to adopt them due to their unacceptably severe clean accuracy\npenalties. This paper significantly alleviates this accuracy-robustness\ntrade-off by mixing the output probabilities of a standard classifier and a\nrobust classifier, where the standard network is optimized for clean accuracy\nand is not robust in general. We show that the robust base classifier's\nconfidence difference for correct and incorrect examples is the key to this\nimprovement. 
In addition to providing intuitions and empirical evidence, we\ntheoretically certify the robustness of the mixed classifier under realistic\nassumptions. Furthermore, we adapt an adversarial input detector into a mixing\nnetwork that adaptively adjusts the mixture of the two base models, further\nreducing the accuracy penalty of achieving robustness. The proposed flexible\nmethod, termed \"adaptive smoothing\", can work in conjunction with existing or\neven future methods that improve clean accuracy, robustness, or adversary\ndetection. Our empirical evaluation considers strong attack methods, including\nAutoAttack and adaptive attack. On the CIFAR-100 dataset, our method achieves\nan 85.21% clean accuracy while maintaining a 38.72% $\\ell_\\infty$-AutoAttacked\n($\\epsilon = 8/255$) accuracy, becoming the second most robust method on the\nRobustBench CIFAR-100 benchmark as of submission, while improving the clean\naccuracy by ten percentage points compared with all listed models. The code\nthat implements our method is available at\nhttps://github.com/Bai-YT/AdaptiveSmoothing.\n","authors":["Yatong Bai","Brendon G. Anderson","Aerin Kim","Somayeh Sojoudi"],"pdf_url":"https://arxiv.org/pdf/2301.12554v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06136v3","updated":"2024-04-09T02:38:16Z","published":"2024-02-09T01:48:44Z","title":"SIR: Multi-view Inverse Rendering with Decomposable Shadow for Indoor\n Scenes","summary":" We propose SIR, an efficient method to decompose differentiable shadows for\ninverse rendering on indoor scenes using multi-view data, addressing the\nchallenges in accurately decomposing the materials and lighting conditions.\nUnlike previous methods that struggle with shadow fidelity in complex lighting\nenvironments, our approach explicitly learns shadows for enhanced realism in\nmaterial estimation under unknown light positions. Utilizing posed HDR images\nas input, SIR employs an SDF-based neural radiance field for comprehensive\nscene representation. Then, SIR integrates a shadow term with a three-stage\nmaterial estimation approach to improve SVBRDF quality. Specifically, SIR is\ndesigned to learn a differentiable shadow, complemented by BRDF regularization,\nto optimize inverse rendering accuracy. Extensive experiments on both synthetic\nand real-world indoor scenes demonstrate the superior performance of SIR over\nexisting methods in both quantitative metrics and qualitative analysis. The\nsignificant decomposing ability of SIR enables sophisticated editing\ncapabilities like free-view relighting, object insertion, and material\nreplacement. The code and data are available at\nhttps://xiaokangwei.github.io/SIR/.\n","authors":["Xiaokang Wei","Zhuoman Liu","Yan Luximon"],"pdf_url":"https://arxiv.org/pdf/2402.06136v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15033v2","updated":"2024-04-09T02:29:32Z","published":"2024-03-22T08:32:30Z","title":"Toward Tiny and High-quality Facial Makeup with Data Amplify Learning","summary":" Contemporary makeup approaches primarily hinge on unpaired learning\nparadigms, yet they grapple with the challenges of inaccurate supervision\n(e.g., face misalignment) and sophisticated facial prompts (including face\nparsing, and landmark detection). These challenges prohibit low-cost deployment\nof facial makeup models, especially on mobile devices. 
To solve above problems,\nwe propose a brand-new learning paradigm, termed \"Data Amplify Learning (DAL),\"\nalongside a compact makeup model named \"TinyBeauty.\" The core idea of DAL lies\nin employing a Diffusion-based Data Amplifier (DDA) to \"amplify\" limited images\nfor the model training, thereby enabling accurate pixel-to-pixel supervision\nwith merely a handful of annotations. Two pivotal innovations in DDA facilitate\nthe above training approach: (1) A Residual Diffusion Model (RDM) is designed\nto generate high-fidelity detail and circumvent the detail vanishing problem in\nthe vanilla diffusion models; (2) A Fine-Grained Makeup Module (FGMM) is\nproposed to achieve precise makeup control and combination while retaining face\nidentity. Coupled with DAL, TinyBeauty necessitates merely 80K parameters to\nachieve a state-of-the-art performance without intricate face prompts.\nMeanwhile, TinyBeauty achieves a remarkable inference speed of up to 460 fps on\nthe iPhone 13. Extensive experiments show that DAL can produce highly\ncompetitive makeup models using only 5 image pairs.\n","authors":["Qiaoqiao Jin","Xuanhong Chen","Meiguang Jin","Ying Chen","Rui Shi","Yucheng Zheng","Yupeng Zhu","Bingbing Ni"],"pdf_url":"https://arxiv.org/pdf/2403.15033v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03662v2","updated":"2024-04-09T01:43:11Z","published":"2024-03-06T12:31:02Z","title":"Harnessing Meta-Learning for Improving Full-Frame Video Stabilization","summary":" Video stabilization is a longstanding computer vision problem, particularly\npixel-level synthesis solutions for video stabilization which synthesize full\nframes add to the complexity of this task. These techniques aim to stabilize\nvideos by synthesizing full frames while enhancing the stability of the\nconsidered video. This intensifies the complexity of the task due to the\ndistinct mix of unique motion profiles and visual content present in each video\nsequence, making robust generalization with fixed parameters difficult. In our\nstudy, we introduce a novel approach to enhance the performance of pixel-level\nsynthesis solutions for video stabilization by adapting these models to\nindividual input video sequences. The proposed adaptation exploits low-level\nvisual cues accessible during test-time to improve both the stability and\nquality of resulting videos. We highlight the efficacy of our methodology of\n\"test-time adaptation\" through simple fine-tuning of one of these models,\nfollowed by significant stability gain via the integration of meta-learning\ntechniques. Notably, significant improvement is achieved with only a single\nadaptation step. The versatility of the proposed algorithm is demonstrated by\nconsistently improving the performance of various pixel-level synthesis models\nfor video stabilization in real-world scenarios.\n","authors":["Muhammad Kashif Ali","Eun Woo Im","Dongjin Kim","Tae Hyun Kim"],"pdf_url":"https://arxiv.org/pdf/2403.03662v2.pdf","comment":"CVPR 2024, Code will be made availble on:\n http://github.com/MKashifAli/MetaVideoStab"},{"id":"http://arxiv.org/abs/2309.13475v3","updated":"2024-04-09T01:26:58Z","published":"2023-09-23T20:33:38Z","title":"Detecting and Mitigating System-Level Anomalies of Vision-Based\n Controllers","summary":" Autonomous systems, such as self-driving cars and drones, have made\nsignificant strides in recent years by leveraging visual inputs and machine\nlearning for decision-making and control. 
Despite their impressive performance,\nthese vision-based controllers can make erroneous predictions when faced with\nnovel or out-of-distribution inputs. Such errors can cascade to catastrophic\nsystem failures and compromise system safety. In this work, we introduce a\nrun-time anomaly monitor to detect and mitigate such closed-loop, system-level\nfailures. Specifically, we leverage a reachability-based framework to\nstress-test the vision-based controller offline and mine its system-level\nfailures. This data is then used to train a classifier that is leveraged online\nto flag inputs that might cause system breakdowns. The anomaly detector\nhighlights issues that transcend individual modules and pertain to the safety\nof the overall system. We also design a fallback controller that robustly\nhandles these detected anomalies to preserve system safety. We validate the\nproposed approach on an autonomous aircraft taxiing system that uses a\nvision-based controller for taxiing. Our results show the efficacy of the\nproposed approach in identifying and handling system-level anomalies,\noutperforming methods such as prediction error-based detection, and ensembling,\nthereby enhancing the overall safety and robustness of autonomous systems.\n","authors":["Aryaman Gupta","Kaustav Chakraborty","Somil Bansal"],"pdf_url":"https://arxiv.org/pdf/2309.13475v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10240v2","updated":"2024-04-09T01:16:07Z","published":"2023-12-15T22:18:38Z","title":"Rich Human Feedback for Text-to-Image Generation","summary":" Recent Text-to-Image (T2I) generation models such as Stable Diffusion and\nImagen have made significant progress in generating high-resolution images\nbased on text descriptions. However, many generated images still suffer from\nissues such as artifacts/implausibility, misalignment with text descriptions,\nand low aesthetic quality. Inspired by the success of Reinforcement Learning\nwith Human Feedback (RLHF) for large language models, prior works collected\nhuman-provided scores as feedback on generated images and trained a reward\nmodel to improve the T2I generation. In this paper, we enrich the feedback\nsignal by (i) marking image regions that are implausible or misaligned with the\ntext, and (ii) annotating which words in the text prompt are misrepresented or\nmissing on the image. We collect such rich human feedback on 18K generated\nimages (RichHF-18K) and train a multimodal transformer to predict the rich\nfeedback automatically. We show that the predicted rich human feedback can be\nleveraged to improve image generation, for example, by selecting high-quality\ntraining data to finetune and improve the generative models, or by creating\nmasks with predicted heatmaps to inpaint the problematic regions. 
Notably, the\nimprovements generalize to models (Muse) beyond those used to generate the\nimages on which human feedback data were collected (Stable Diffusion variants).\nThe RichHF-18K data set will be released in our GitHub repository:\nhttps://github.com/google-research/google-research/tree/master/richhf_18k.\n","authors":["Youwei Liang","Junfeng He","Gang Li","Peizhao Li","Arseniy Klimovskiy","Nicholas Carolan","Jiao Sun","Jordi Pont-Tuset","Sarah Young","Feng Yang","Junjie Ke","Krishnamurthy Dj Dvijotham","Katie Collins","Yiwen Luo","Yang Li","Kai J Kohlhoff","Deepak Ramachandran","Vidhya Navalpakkam"],"pdf_url":"https://arxiv.org/pdf/2312.10240v2.pdf","comment":"CVPR'24"},{"id":"http://arxiv.org/abs/2402.17228v3","updated":"2024-04-09T01:10:15Z","published":"2024-02-27T05:42:38Z","title":"Feature Re-Embedding: Towards Foundation Model-Level Performance in\n Computational Pathology","summary":" Multiple instance learning (MIL) is the most widely used framework in\ncomputational pathology, encompassing sub-typing, diagnosis, prognosis, and\nmore. However, the existing MIL paradigm typically requires an offline instance\nfeature extractor, such as a pre-trained ResNet or a foundation model. This\napproach lacks the capability for feature fine-tuning within the specific\ndownstream tasks, limiting its adaptability and performance. To address this\nissue, we propose a Re-embedded Regional Transformer (R$^2$T) for re-embedding\nthe instance features online, which captures fine-grained local features and\nestablishes connections across different regions. Unlike existing works that\nfocus on pre-training powerful feature extractor or designing sophisticated\ninstance aggregator, R$^2$T is tailored to re-embed instance features online.\nIt serves as a portable module that can seamlessly integrate into mainstream\nMIL models. Extensive experimental results on common computational pathology\ntasks validate that: 1) feature re-embedding improves the performance of MIL\nmodels based on ResNet-50 features to the level of foundation model features,\nand further enhances the performance of foundation model features; 2) the\nR$^2$T can introduce more significant performance improvements to various MIL\nmodels; 3) R$^2$T-MIL, as an R$^2$T-enhanced AB-MIL, outperforms other latest\nmethods by a large margin.The code is available at:\nhttps://github.com/DearCaat/RRT-MIL.\n","authors":["Wenhao Tang","Fengtao Zhou","Sheng Huang","Xiang Zhu","Yi Zhang","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2402.17228v3.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2308.13072v2","updated":"2024-04-09T01:09:41Z","published":"2023-08-24T20:29:09Z","title":"Full-dose Whole-body PET Synthesis from Low-dose PET Using\n High-efficiency Denoising Diffusion Probabilistic Model: PET Consistency\n Model","summary":" Objective: Positron Emission Tomography (PET) has been a commonly used\nimaging modality in broad clinical applications. One of the most important\ntradeoffs in PET imaging is between image quality and radiation dose: high\nimage quality comes with high radiation exposure. Improving image quality is\ndesirable for all clinical applications while minimizing radiation exposure is\nneeded to reduce risk to patients. Approach: We introduce PET Consistency Model\n(PET-CM), an efficient diffusion-based method for generating high-quality\nfull-dose PET images from low-dose PET images. 
It employs a two-step process,\nadding Gaussian noise to full-dose PET images in the forward diffusion, and\nthen denoising them using a PET Shifted-window Vision Transformer (PET-VIT)\nnetwork in the reverse diffusion. The PET-VIT network learns a consistency\nfunction that enables direct denoising of Gaussian noise into clean full-dose\nPET images. PET-CM achieves state-of-the-art image quality while requiring\nsignificantly less computation time than other methods. Results: In experiments\ncomparing eighth-dose to full-dose images, PET-CM demonstrated impressive\nperformance with NMAE of 1.278+/-0.122%, PSNR of 33.783+/-0.824dB, SSIM of\n0.964+/-0.009, NCC of 0.968+/-0.011, HRS of 4.543, and SUV Error of\n0.255+/-0.318%, with an average generation time of 62 seconds per patient. This\nis a significant improvement compared to the state-of-the-art diffusion-based\nmodel with PET-CM reaching this result 12x faster. Similarly, in the\nquarter-dose to full-dose image experiments, PET-CM delivered competitive\noutcomes, achieving an NMAE of 0.973+/-0.066%, PSNR of 36.172+/-0.801dB, SSIM\nof 0.984+/-0.004, NCC of 0.990+/-0.005, HRS of 4.428, and SUV Error of\n0.151+/-0.192% using the same generation process, which underlining its high\nquantitative and clinical precision in both denoising scenario.\n","authors":["Shaoyan Pan","Elham Abouei","Junbo Peng","Joshua Qian","Jacob F Wynne","Tonghe Wang","Chih-Wei Chang","Justin Roper","Jonathon A Nye","Hui Mao","Xiaofeng Yang"],"pdf_url":"https://arxiv.org/pdf/2308.13072v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05916v1","updated":"2024-04-09T00:30:16Z","published":"2024-04-09T00:30:16Z","title":"Prompt-driven Universal Model for View-Agnostic Echocardiography\n Analysis","summary":" Echocardiography segmentation for cardiac analysis is time-consuming and\nresource-intensive due to the variability in image quality and the necessity to\nprocess scans from various standard views. While current automated segmentation\nmethods in echocardiography show promising performance, they are trained on\nspecific scan views to analyze corresponding data. However, this solution has a\nlimitation as the number of required models increases with the number of\nstandard views. To address this, in this paper, we present a prompt-driven\nuniversal method for view-agnostic echocardiography analysis. Considering the\ndomain shift between standard views, we first introduce a method called prompt\nmatching, aimed at learning prompts specific to different views by matching\nprompts and querying input embeddings using a pre-trained vision model. Then,\nwe utilized a pre-trained medical language model to align textual information\nwith pixel data for accurate segmentation. 
Extensive experiments on three\nstandard views showed that our approach significantly outperforms the\nstate-of-the-art universal methods and achieves comparable or even better\nperformances over the segmentation model trained and tested on same views.\n","authors":["Sekeun Kim","Hui Ren","Peng Guo","Abder-Rahman Ali","Patrick Zhang","Kyungsang Kim","Xiang Li","Quanzheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.05916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05911v1","updated":"2024-04-09T00:05:45Z","published":"2024-04-09T00:05:45Z","title":"LATUP-Net: A Lightweight 3D Attention U-Net with Parallel Convolutions\n for Brain Tumor Segmentation","summary":" Early-stage 3D brain tumor segmentation from magnetic resonance imaging (MRI)\nscans is crucial for prompt and effective treatment. However, this process\nfaces the challenge of precise delineation due to the tumors' complex\nheterogeneity. Moreover, energy sustainability targets and resource\nlimitations, especially in developing countries, require efficient and\naccessible medical imaging solutions. The proposed architecture, a Lightweight\n3D ATtention U-Net with Parallel convolutions, LATUP-Net, addresses these\nissues. It is specifically designed to reduce computational requirements\nsignificantly while maintaining high segmentation performance. By incorporating\nparallel convolutions, it enhances feature representation by capturing\nmulti-scale information. It further integrates an attention mechanism to refine\nsegmentation through selective feature recalibration. LATUP-Net achieves\npromising segmentation performance: the average Dice scores for the whole\ntumor, tumor core, and enhancing tumor on the BraTS2020 dataset are 88.41%,\n83.82%, and 73.67%, and on the BraTS2021 dataset, they are 90.29%, 89.54%, and\n83.92%, respectively. Hausdorff distance metrics further indicate its improved\nability to delineate tumor boundaries. With its significantly reduced\ncomputational demand using only 3.07 M parameters, about 59 times fewer than\nother state-of-the-art models, and running on a single V100 GPU, LATUP-Net\nstands out as a promising solution for real-world clinical applications,\nparticularly in settings with limited resources. Investigations into the\nmodel's interpretability, utilizing gradient-weighted class activation mapping\nand confusion matrices, reveal that while attention mechanisms enhance the\nsegmentation of small regions, their impact is nuanced. Achieving the most\naccurate tumor delineation requires carefully balancing local and global\nfeatures.\n","authors":["Ebtihal J. Alwadee","Xianfang Sun","Yipeng Qin","Frank C. Langbein"],"pdf_url":"https://arxiv.org/pdf/2404.05911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06657v1","updated":"2024-04-09T23:47:53Z","published":"2024-04-09T23:47:53Z","title":"Res-U2Net: Untrained Deep Learning for Phase Retrieval and Image\n Reconstruction","summary":" Conventional deep learning-based image reconstruction methods require a large\namount of training data which can be hard to obtain in practice. Untrained deep\nlearning methods overcome this limitation by training a network to invert a\nphysical model of the image formation process. Here we present a novel\nuntrained Res-U2Net model for phase retrieval. We use the extracted phase\ninformation to determine changes in an object's surface and generate a mesh\nrepresentation of its 3D structure. 
We compare the performance of Res-U2Net\nphase retrieval against UNet and U2Net using images from the GDXRAY dataset.\n","authors":["Carlos Osorio Quero","Daniel Leykam","Irving Rondon Ojeda"],"pdf_url":"https://arxiv.org/pdf/2404.06657v1.pdf","comment":"16 pages, 8 figures, 4 Tables"},{"id":"http://arxiv.org/abs/2312.00825v2","updated":"2024-04-09T23:28:49Z","published":"2023-11-30T18:32:14Z","title":"SocialCounterfactuals: Probing and Mitigating Intersectional Social\n Biases in Vision-Language Models with Counterfactual Examples","summary":" While vision-language models (VLMs) have achieved remarkable performance\nimprovements recently, there is growing evidence that these models also possess\nharmful biases with respect to social attributes such as gender and race. Prior\nstudies have primarily focused on probing such bias attributes individually\nwhile ignoring biases associated with intersections between social attributes.\nThis could be due to the difficulty of collecting an exhaustive set of\nimage-text pairs for various combinations of social attributes. To address this\nchallenge, we employ text-to-image diffusion models to produce counterfactual\nexamples for probing intersectional social biases at scale. Our approach\nutilizes Stable Diffusion with cross attention control to produce sets of\ncounterfactual image-text pairs that are highly similar in their depiction of a\nsubject (e.g., a given occupation) while differing only in their depiction of\nintersectional social attributes (e.g., race & gender). Through our\nover-generate-then-filter methodology, we produce SocialCounterfactuals, a\nhigh-quality dataset containing 171k image-text pairs for probing\nintersectional biases related to gender, race, and physical characteristics. We\nconduct extensive experiments to demonstrate the usefulness of our generated\ndataset for probing and mitigating intersectional social biases in\nstate-of-the-art VLMs.\n","authors":["Phillip Howard","Avinash Madasu","Tiep Le","Gustavo Lujan Moreno","Anahita Bhiwandiwalla","Vasudev Lal"],"pdf_url":"https://arxiv.org/pdf/2312.00825v2.pdf","comment":"Accepted to CVPR 2024. arXiv admin note: text overlap with\n arXiv:2310.02988"},{"id":"http://arxiv.org/abs/2404.06653v1","updated":"2024-04-09T23:24:19Z","published":"2024-04-09T23:24:19Z","title":"FlameFinder: Illuminating Obscured Fire through Smoke with Attentive\n Deep Metric Learning","summary":" FlameFinder is a deep metric learning (DML) framework designed to accurately\ndetect flames, even when obscured by smoke, using thermal images from\nfirefighter drones during wildfire monitoring. Traditional RGB cameras struggle\nin such conditions, but thermal cameras can capture smoke-obscured flame\nfeatures. However, they lack absolute thermal reference points, leading to\nfalse positives. To address this issue, FlameFinder utilizes paired thermal-RGB\nimages for training. By learning latent flame features from smoke-free samples,\nthe model becomes less biased towards relative thermal gradients. In testing,\nit identifies flames in smoky patches by analyzing their equivalent\nthermal-domain distribution. This method improves performance using both\nsupervised and distance-based clustering metrics. The framework incorporates a\nflame segmentation method and a DML-aided detection framework. This includes\nutilizing center loss (CL), triplet center loss (TCL), and triplet cosine\ncenter loss (TCCL) to identify optimal cluster representatives for\nclassification. 
However, the dominance of center loss over the other losses\nleads to the model missing features sensitive to them. To address this\nlimitation, an attention mechanism is proposed. This mechanism allows for\nnon-uniform feature contribution, amplifying the critical role of cosine and\ntriplet loss in the DML framework. Additionally, it improves interpretability,\nclass discrimination, and decreases intra-class variance. As a result, the\nproposed model surpasses the baseline by 4.4% in the FLAME2 dataset and 7% in\nthe FLAME3 dataset for unobscured flame detection accuracy. Moreover, it\ndemonstrates enhanced class separation in obscured scenarios compared to VGG19,\nResNet18, and three backbone models tailored for flame detection.\n","authors":["Hossein Rajoli","Sahand Khoshdel","Fatemeh Afghah","Xiaolong Ma"],"pdf_url":"https://arxiv.org/pdf/2404.06653v1.pdf","comment":"Submitted as a Journal Paper to IEEE Transactions on Geoscience and\n Remote Sensing"},{"id":"http://arxiv.org/abs/2404.05139v2","updated":"2024-04-09T23:17:07Z","published":"2024-04-08T01:38:43Z","title":"Better Monocular 3D Detectors with LiDAR from the Past","summary":" Accurate 3D object detection is crucial to autonomous driving. Though\nLiDAR-based detectors have achieved impressive performance, the high cost of\nLiDAR sensors precludes their widespread adoption in affordable vehicles.\nCamera-based detectors are cheaper alternatives but often suffer inferior\nperformance compared to their LiDAR-based counterparts due to inherent depth\nambiguities in images. In this work, we seek to improve monocular 3D detectors\nby leveraging unlabeled historical LiDAR data. Specifically, at inference time,\nwe assume that the camera-based detectors have access to multiple unlabeled\nLiDAR scans from past traversals at locations of interest (potentially from\nother high-end vehicles equipped with LiDAR sensors). Under this setup, we\nproposed a novel, simple, and end-to-end trainable framework, termed\nAsyncDepth, to effectively extract relevant features from asynchronous LiDAR\ntraversals of the same location for monocular 3D detectors. We show consistent\nand significant performance gain (up to 9 AP) across multiple state-of-the-art\nmodels and datasets with a negligible additional latency of 9.66 ms and a small\nstorage cost.\n","authors":["Yurong You","Cheng Perng Phoo","Carlos Andres Diaz-Ruiz","Katie Z Luo","Wei-Lun Chao","Mark Campbell","Bharath Hariharan","Kilian Q Weinberger"],"pdf_url":"https://arxiv.org/pdf/2404.05139v2.pdf","comment":"Accepted by ICRA 2024. The code can be found at\n https://github.com/YurongYou/AsyncDepth"},{"id":"http://arxiv.org/abs/2404.06638v1","updated":"2024-04-09T22:17:20Z","published":"2024-04-09T22:17:20Z","title":"SAM-I-Am: Semantic Boosting for Zero-shot Atomic-Scale Electron\n Micrograph Segmentation","summary":" Image segmentation is a critical enabler for tasks ranging from medical\ndiagnostics to autonomous driving. However, the correct segmentation semantics\n- where are boundaries located? what segments are logically similar? - change\ndepending on the domain, such that state-of-the-art foundation models can\ngenerate meaningless and incorrect results. Moreover, in certain domains,\nfine-tuning and retraining techniques are infeasible: obtaining labels is\ncostly and time-consuming; domain images (micrographs) can be exponentially\ndiverse; and data sharing (for third-party retraining) is restricted. 
To enable\nrapid adaptation of the best segmentation technology, we propose the concept of\nsemantic boosting: given a zero-shot foundation model, guide its segmentation\nand adjust results to match domain expectations. We apply semantic boosting to\nthe Segment Anything Model (SAM) to obtain microstructure segmentation for\ntransmission electron microscopy. Our booster, SAM-I-Am, extracts geometric and\ntextural features of various intermediate masks to perform mask removal and\nmask merging operations. We demonstrate a zero-shot performance increase of\n(absolute) +21.35%, +12.6%, +5.27% in mean IoU, and a -9.91%, -18.42%, -4.06%\ndrop in mean false positive masks across images of three difficulty classes\nover vanilla SAM (ViT-L).\n","authors":["Waqwoya Abebe","Jan Strube","Luanzheng Guo","Nathan R. Tallent","Oceane Bel","Steven Spurgeon","Christina Doty","Ali Jannesari"],"pdf_url":"https://arxiv.org/pdf/2404.06638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06637v1","updated":"2024-04-09T22:16:34Z","published":"2024-04-09T22:16:34Z","title":"GeoSynth: Contextually-Aware High-Resolution Satellite Image Synthesis","summary":" We present GeoSynth, a model for synthesizing satellite images with global\nstyle and image-driven layout control. The global style control is via textual\nprompts or geographic location. These enable the specification of scene\nsemantics or regional appearance respectively, and can be used together. We\ntrain our model on a large dataset of paired satellite imagery, with\nautomatically generated captions, and OpenStreetMap data. We evaluate various\ncombinations of control inputs, including different types of layout controls.\nResults demonstrate that our model can generate diverse, high-quality images\nand exhibits excellent zero-shot generalization. The code and model checkpoints\nare available at https://github.com/mvrl/GeoSynth.\n","authors":["Srikumar Sastry","Subash Khanal","Aayush Dhakal","Nathan Jacobs"],"pdf_url":"https://arxiv.org/pdf/2404.06637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05195v2","updated":"2024-04-09T22:14:37Z","published":"2024-02-07T19:07:10Z","title":"$λ$-ECLIPSE: Multi-Concept Personalized Text-to-Image Diffusion\n Models by Leveraging CLIP Latent Space","summary":" Despite the recent advances in personalized text-to-image (P-T2I) generative\nmodels, it remains challenging to perform finetuning-free multi-subject-driven\nT2I in a resource-efficient manner. Predominantly, contemporary approaches,\ninvolving the training of Hypernetworks and Multimodal Large Language Models\n(MLLMs), require heavy computing resources that range from 600 to 12300 GPU\nhours of training. These subject-driven T2I methods hinge on Latent Diffusion\nModels (LDMs), which facilitate T2I mapping through cross-attention layers.\nWhile LDMs offer distinct advantages, P-T2I methods' reliance on the latent\nspace of these diffusion models significantly escalates resource demands,\nleading to inconsistent results and necessitating numerous iterations for a\nsingle desired image. In this paper, we present $\\lambda$-ECLIPSE, an\nalternative prior-training strategy that works in the latent space of a\npre-trained CLIP model without relying on the diffusion UNet models.\n$\\lambda$-ECLIPSE leverages the image-text interleaved pre-training for fast\nand effective multi-subject-driven P-T2I. 
Through extensive experiments, we\nestablish that $\\lambda$-ECLIPSE surpasses existing baselines in composition\nalignment while preserving concept alignment performance, even with\nsignificantly lower resource utilization. $\\lambda$-ECLIPSE performs\nmulti-subject driven P-T2I with just 34M parameters and is trained on a mere 74\nGPU hours. Additionally, $\\lambda$-ECLIPSE demonstrates the unique ability to\nperform multi-concept interpolations.\n","authors":["Maitreya Patel","Sangmin Jung","Chitta Baral","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2402.05195v2.pdf","comment":"Project page: https://eclipse-t2i.github.io/Lambda-ECLIPSE/"},{"id":"http://arxiv.org/abs/2312.04746v2","updated":"2024-04-09T21:48:42Z","published":"2023-12-07T23:16:37Z","title":"Quilt-LLaVA: Visual Instruction Tuning by Extracting Localized\n Narratives from Open-Source Histopathology Videos","summary":" Diagnosis in histopathology requires a global whole slide images (WSIs)\nanalysis, requiring pathologists to compound evidence from different WSI\npatches. The gigapixel scale of WSIs poses a challenge for histopathology\nmulti-modal models. Training multi-model models for histopathology requires\ninstruction tuning datasets, which currently contain information for individual\nimage patches, without a spatial grounding of the concepts within each patch\nand without a wider view of the WSI. Therefore, they lack sufficient diagnostic\ncapacity for histopathology. To bridge this gap, we introduce Quilt-Instruct, a\nlarge-scale dataset of 107,131 histopathology-specific instruction\nquestion/answer pairs, grounded within diagnostically relevant image patches\nthat make up the WSI. Our dataset is collected by leveraging educational\nhistopathology videos from YouTube, which provides spatial localization of\nnarrations by automatically extracting the narrators' cursor positions.\nQuilt-Instruct supports contextual reasoning by extracting diagnosis and\nsupporting facts from the entire WSI. Using Quilt-Instruct, we train\nQuilt-LLaVA, which can reason beyond the given single image patch, enabling\ndiagnostic reasoning across patches. To evaluate Quilt-LLaVA, we propose a\ncomprehensive evaluation dataset created from 985 images and 1283\nhuman-generated question-answers. We also thoroughly evaluate Quilt-LLaVA using\npublic histopathology datasets, where Quilt-LLaVA significantly outperforms\nSOTA by over 10% on relative GPT-4 score and 4% and 9% on open and closed set\nVQA. Our code, data, and model are publicly accessible at\nquilt-llava.github.io.\n","authors":["Mehmet Saygin Seyfioglu","Wisdom O. Ikezogwo","Fatemeh Ghezloo","Ranjay Krishna","Linda Shapiro"],"pdf_url":"https://arxiv.org/pdf/2312.04746v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06622v1","updated":"2024-04-09T21:12:31Z","published":"2024-04-09T21:12:31Z","title":"Calibrating Higher-Order Statistics for Few-Shot Class-Incremental\n Learning with Pre-trained Vision Transformers","summary":" Few-shot class-incremental learning (FSCIL) aims to adapt the model to new\nclasses from very few data (5 samples) without forgetting the previously\nlearned classes. Recent works in many-shot CIL (MSCIL) (using all available\ntraining data) exploited pre-trained models to reduce forgetting and achieve\nbetter plasticity. In a similar fashion, we use ViT models pre-trained on\nlarge-scale datasets for few-shot settings, which face the critical issue of\nlow plasticity. 
FSCIL methods start with a many-shot first task to learn a very\ngood feature extractor and then move to the few-shot setting from the second\ntask onwards. While the focus of most recent studies is on how to learn the\nmany-shot first task so that the model generalizes to all future few-shot\ntasks, we explore in this work how to better model the few-shot data using\npre-trained models, irrespective of how the first task is trained. Inspired by\nrecent works in MSCIL, we explore how using higher-order feature statistics can\ninfluence the classification of few-shot classes. We identify the main\nchallenge of obtaining a good covariance matrix from few-shot data and propose\nto calibrate the covariance matrix for new classes based on semantic similarity\nto the many-shot base classes. Using the calibrated feature statistics in\ncombination with existing methods significantly improves few-shot continual\nclassification on several FSCIL benchmarks. Code is available at\nhttps://github.com/dipamgoswami/FSCIL-Calibration.\n","authors":["Dipam Goswami","Bartłomiej Twardowski","Joost van de Weijer"],"pdf_url":"https://arxiv.org/pdf/2404.06622v1.pdf","comment":"Accepted at CLVision workshop (CVPR 2024)"},{"id":"http://arxiv.org/abs/2403.08092v2","updated":"2024-04-09T20:55:01Z","published":"2024-03-12T22:03:19Z","title":"Mitigating the Impact of Attribute Editing on Face Recognition","summary":" Through a large-scale study over diverse face images, we show that facial\nattribute editing using modern generative AI models can severely degrade\nautomated face recognition systems. This degradation persists even with\nidentity-preserving generative models. To mitigate this issue, we propose two\nnovel techniques for local and global attribute editing. We empirically ablate\ntwenty-six facial semantic, demographic and expression-based attributes that\nhave been edited using state-of-the-art generative models, and evaluate them\nusing ArcFace and AdaFace matchers on CelebA, CelebAMaskHQ and LFW datasets.\nFinally, we use LLaVA, an emerging visual question-answering framework for\nattribute prediction to validate our editing techniques. Our methods outperform\nthe current state-of-the-art at facial editing (BLIP, InstantID) while\nimproving identity retention by a significant extent.\n","authors":["Sudipta Banerjee","Sai Pranaswi Mullangi","Shruti Wagle","Chinmay Hegde","Nasir Memon"],"pdf_url":"https://arxiv.org/pdf/2403.08092v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.06605v1","updated":"2024-04-09T20:24:29Z","published":"2024-04-09T20:24:29Z","title":"RoadBEV: Road Surface Reconstruction in Bird's Eye View","summary":" Road surface conditions, especially geometry profiles, enormously affect\ndriving performance of autonomous vehicles. Vision-based online road\nreconstruction promisingly captures road information in advance. Existing\nsolutions like monocular depth estimation and stereo matching suffer from\nmodest performance. The recent technique of Bird's-Eye-View (BEV) perception\nprovides immense potential to more reliable and accurate reconstruction. This\npaper uniformly proposes two simple yet effective models for road elevation\nreconstruction in BEV named RoadBEV-mono and RoadBEV-stereo, which estimate\nroad elevation with monocular and stereo images, respectively. 
The former\ndirectly fits elevation values based on voxel features queried from image view,\nwhile the latter efficiently recognizes road elevation patterns based on BEV\nvolume representing discrepancy between left and right voxel features.\nInsightful analyses reveal their consistence and difference with perspective\nview. Experiments on real-world dataset verify the models' effectiveness and\nsuperiority. Elevation errors of RoadBEV-mono and RoadBEV-stereo achieve 1.83cm\nand 0.56cm, respectively. The estimation performance improves by 50\\% in BEV\nbased on monocular image. Our models are promising for practical applications,\nproviding valuable references for vision-based BEV perception in autonomous\ndriving. The code is released at https://github.com/ztsrxh/RoadBEV.\n","authors":["Tong Zhao","Lei Yang","Yichen Xie","Mingyu Ding","Masayoshi Tomizuka","Yintao Wei"],"pdf_url":"https://arxiv.org/pdf/2404.06605v1.pdf","comment":"Dataset page: https://thu-rsxd.com/rsrd Code:\n https://github.com/ztsrxh/RoadBEV"},{"id":"http://arxiv.org/abs/2404.06593v1","updated":"2024-04-09T19:49:01Z","published":"2024-04-09T19:49:01Z","title":"Spatially Optimized Compact Deep Metric Learning Model for Similarity\n Search","summary":" Spatial optimization is often overlooked in many computer vision tasks.\nFilters should be able to recognize the features of an object regardless of\nwhere it is in the image. Similarity search is a crucial task where spatial\nfeatures decide an important output. The capacity of convolution to capture\nvisual patterns across various locations is limited. In contrast to\nconvolution, the involution kernel is dynamically created at each pixel based\non the pixel value and parameters that have been learned. This study\ndemonstrates that utilizing a single layer of involution feature extractor\nalongside a compact convolution model significantly enhances the performance of\nsimilarity search. Additionally, we improve predictions by using the GELU\nactivation function rather than the ReLU. The negligible amount of weight\nparameters in involution with a compact model with better performance makes the\nmodel very useful in real-world implementations. Our proposed model is below 1\nmegabyte in size. We have experimented with our proposed methodology and other\nmodels on CIFAR-10, FashionMNIST, and MNIST datasets. Our proposed method\noutperforms across all three datasets.\n","authors":["Md. Farhadul Islam","Md. Tanzim Reza","Meem Arafat Manab","Mohammad Rakibul Hasan Mahin","Sarah Zabeen","Jannatun Noor"],"pdf_url":"https://arxiv.org/pdf/2404.06593v1.pdf","comment":"5 pages, 3 figures,"},{"id":"http://arxiv.org/abs/2404.06589v1","updated":"2024-04-09T19:33:05Z","published":"2024-04-09T19:33:05Z","title":"Leveraging Latents for Efficient Thermography Classification and\n Segmentation","summary":" Breast cancer is a prominent health concern worldwide, currently being the\nsecondmost common and second-deadliest type of cancer in women. While current\nbreast cancer diagnosis mainly relies on mammography imaging, in recent years\nthe use of thermography for breast cancer imaging has been garnering growing\npopularity. Thermographic imaging relies on infrared cameras to capture\nbody-emitted heat distributions. 
While these heat signatures have proven useful\nfor computer-vision systems for accurate breast cancer segmentation and\nclassification, prior work often relies on handcrafted feature engineering or\ncomplex architectures, potentially limiting the comparability and applicability\nof these methods. In this work, we present a novel algorithm for both breast\ncancer classification and segmentation. Rather than focusing efforts on manual\nfeature and architecture engineering, our algorithm focuses on leveraging an\ninformative, learned feature space, thus making our solution simpler to use and\nextend to other frameworks and downstream tasks, as well as more applicable to\ndata-scarce settings. Our classification produces SOTA results, while we are\nthe first work to produce segmentation regions studied in this paper.\n","authors":["Tamir Shor","Chaim Baskin","Alex Bronstein"],"pdf_url":"https://arxiv.org/pdf/2404.06589v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01102v2","updated":"2024-04-09T19:26:36Z","published":"2024-04-01T13:23:04Z","title":"Diffusion based Zero-shot Medical Image-to-Image Translation for Cross\n Modality Segmentation","summary":" Cross-modality image segmentation aims to segment the target modalities using\na method designed in the source modality. Deep generative models can translate\nthe target modality images into the source modality, thus enabling\ncross-modality segmentation. However, a vast body of existing cross-modality\nimage translation methods relies on supervised learning. In this work, we aim\nto address the challenge of zero-shot learning-based image translation tasks\n(extreme scenarios in the target modality is unseen in the training phase). To\nleverage generative learning for zero-shot cross-modality image segmentation,\nwe propose a novel unsupervised image translation method. The framework learns\nto translate the unseen source image to the target modality for image\nsegmentation by leveraging the inherent statistical consistency between\ndifferent modalities for diffusion guidance. Our framework captures identical\ncross-modality features in the statistical domain, offering diffusion guidance\nwithout relying on direct mappings between the source and target domains. This\nadvantage allows our method to adapt to changing source domains without the\nneed for retraining, making it highly practical when sufficient labeled source\ndomain data is not available. The proposed framework is validated in zero-shot\ncross-modality image segmentation tasks through empirical comparisons with\ninfluential generative models, including adversarial-based and diffusion-based\nmodels.\n","authors":["Zihao Wang","Yingyu Yang","Yuzhou Chen","Tingting Yuan","Maxime Sermesant","Herve Delingette","Ona Wu"],"pdf_url":"https://arxiv.org/pdf/2404.01102v2.pdf","comment":"Neurips 2023 Diffusion Workshop"},{"id":"http://arxiv.org/abs/2212.05140v2","updated":"2024-04-09T19:17:07Z","published":"2022-12-09T22:53:40Z","title":"Local Neighborhood Features for 3D Classification","summary":" With advances in deep learning model training strategies, the training of\nPoint cloud classification methods is significantly improving. For example,\nPointNeXt, which adopts prominent training techniques and InvResNet layers into\nPointNet++, achieves over 7% improvement on the real-world ScanObjectNN\ndataset. 
However, most of these models use point coordinates features of\nneighborhood points mapped to higher dimensional space while ignoring the\nneighborhood point features computed before feeding to the network layers. In\nthis paper, we revisit the PointNeXt model to study the usage and benefit of\nsuch neighborhood point features. We train and evaluate PointNeXt on ModelNet40\n(synthetic), ScanObjectNN (real-world), and a recent large-scale, real-world\ngrocery dataset, i.e., 3DGrocery100. In addition, we provide an additional\ninference strategy of weight averaging the top two checkpoints of PointNeXt to\nimprove classification accuracy. Together with the abovementioned ideas, we\ngain 0.5%, 1%, 4.8%, 3.4%, and 1.6% overall accuracy on the PointNeXt model\nwith real-world datasets, ScanObjectNN (hardest variant), 3DGrocery100's\nApple10, Fruits, Vegetables, and Packages subsets, respectively. We also\nachieve a comparable 0.2% accuracy gain on ModelNet40.\n","authors":["Shivanand Venkanna Sheshappanavar","Chandra Kambhamettu"],"pdf_url":"https://arxiv.org/pdf/2212.05140v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05490v2","updated":"2024-04-09T18:55:43Z","published":"2024-04-08T13:11:57Z","title":"Two-Person Interaction Augmentation with Skeleton Priors","summary":" Close and continuous interaction with rich contacts is a crucial aspect of\nhuman activities (e.g. hugging, dancing) and of interest in many domains like\nactivity recognition, motion prediction, character animation, etc. However,\nacquiring such skeletal motion is challenging. While direct motion capture is\nexpensive and slow, motion editing/generation is also non-trivial, as complex\ncontact patterns with topological and geometric constraints have to be\nretained. To this end, we propose a new deep learning method for two-body\nskeletal interaction motion augmentation, which can generate variations of\ncontact-rich interactions with varying body sizes and proportions while\nretaining the key geometric/topological relations between two bodies. Our\nsystem can learn effectively from a relatively small amount of data and\ngeneralize to drastically different skeleton sizes. Through exhaustive\nevaluation and comparison, we show it can generate high-quality motions, has\nstrong generalizability and outperforms traditional optimization-based methods\nand alternative deep learning solutions.\n","authors":["Baiyi Li","Edmond S. L. Ho","Hubert P. H. Shum","He Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05490v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02527v3","updated":"2024-04-09T18:26:27Z","published":"2024-03-04T22:42:17Z","title":"A dataset of over one thousand computed tomography scans of battery\n cells","summary":" Battery technology is increasingly important for global electrification\nefforts. However, batteries are highly sensitive to small manufacturing\nvariations that can induce reliability or safety issues. An important\ntechnology for battery quality control is computed tomography (CT) scanning,\nwhich is widely used for non-destructive 3D inspection across a variety of\nclinical and industrial applications. Historically, however, the utility of CT\nscanning for high-volume manufacturing has been limited by its low throughput\nas well as the difficulty of handling its large file sizes. In this work, we\npresent a dataset of over one thousand CT scans of as-produced commercially\navailable batteries. 
The dataset spans various chemistries (lithium-ion and\nsodium-ion) as well as various battery form factors (cylindrical, pouch, and\nprismatic). We evaluate seven different battery types in total. The\nmanufacturing variability and the presence of battery defects can be observed\nvia this dataset. This dataset may be of interest to scientists and engineers\nworking on battery technology, computer vision, or both.\n","authors":["Amariah Condon","Bailey Buscarino","Eric Moch","William J. Sehnert","Owen Miles","Patrick K. Herring","Peter M. Attia"],"pdf_url":"https://arxiv.org/pdf/2403.02527v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08514v2","updated":"2024-04-09T18:23:39Z","published":"2023-12-13T21:02:03Z","title":"TAM-VT: Transformation-Aware Multi-scale Video Transformer for\n Segmentation and Tracking","summary":" Video Object Segmentation (VOS) has emerged as an increasingly important\nproblem with availability of larger datasets and more complex and realistic\nsettings, which involve long videos with global motion (e.g, in egocentric\nsettings), depicting small objects undergoing both rigid and non-rigid\n(including state) deformations. While a number of recent approaches have been\nexplored for this task, these data characteristics still present challenges. In\nthis work we propose a novel, clip-based DETR-style encoder-decoder\narchitecture, which focuses on systematically analyzing and addressing\naforementioned challenges. Specifically, we propose a novel\ntransformation-aware loss that focuses learning on portions of the video where\nan object undergoes significant deformations -- a form of \"soft\" hard examples\nmining. Further, we propose a multiplicative time-coded memory, beyond vanilla\nadditive positional encoding, which helps propagate context across long videos.\nFinally, we incorporate these in our proposed holistic multi-scale video\ntransformer for tracking via multi-scale memory matching and decoding to ensure\nsensitivity and accuracy for long videos and small objects. Our model enables\non-line inference with long videos in a windowed fashion, by breaking the video\ninto clips and propagating context among them. We illustrate that short clip\nlength and longer memory with learned time-coding are important design choices\nfor improved performance. Collectively, these technical contributions enable\nour model to achieve new state-of-the-art (SoTA) performance on two complex\negocentric datasets -- VISOR and VOST, while achieving comparable to SoTA\nresults on the conventional VOS benchmark, DAVIS'17. A series of detailed\nablations validate our design choices as well as provide insights into the\nimportance of parameter choices and their impact on performance.\n","authors":["Raghav Goyal","Wan-Cyuan Fan","Mennatullah Siam","Leonid Sigal"],"pdf_url":"https://arxiv.org/pdf/2312.08514v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06559v1","updated":"2024-04-09T18:23:34Z","published":"2024-04-09T18:23:34Z","title":"The Impact of Print-and-Scan in Heterogeneous Morph Evaluation Scenarios","summary":" Face morphing attacks present an emerging threat to the face recognition\nsystem. On top of that, printing and scanning the morphed images could obscure\nthe artifacts generated during the morphing process, which makes morphed image\ndetection even harder. 
In this work, we investigate the impact that printing\nand scanning has on morphing attacks through a series of heterogeneous tests.\nOur experiments show that we can increase the possibility of a false match by\nup to 5.64% for DiM and 16.00% for StyleGAN2 when providing an image that has\nbeen printed and scanned, regardless it is morphed or bona fide, to a Face\nRecognition (FR) system. Likewise, using Frechet Inception Distance (FID)\nmetric, strictly print-scanned morph attacks performed on average 9.185%\nstronger than non-print-scanned digital morphs.\n","authors":["Richard E. Neddo","Zander W. Blasingame","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06559v1.pdf","comment":"Initial preprint. Under review"},{"id":"http://arxiv.org/abs/2404.06542v1","updated":"2024-04-09T18:00:25Z","published":"2024-04-09T18:00:25Z","title":"Training-Free Open-Vocabulary Segmentation with Offline\n Diffusion-Augmented Prototype Generation","summary":" Open-vocabulary semantic segmentation aims at segmenting arbitrary categories\nexpressed in textual form. Previous works have trained over large amounts of\nimage-caption pairs to enforce pixel-level multimodal alignments. However,\ncaptions provide global information about the semantics of a given image but\nlack direct localization of individual concepts. Further, training on\nlarge-scale datasets inevitably brings significant computational costs. In this\npaper, we propose FreeDA, a training-free diffusion-augmented method for\nopen-vocabulary semantic segmentation, which leverages the ability of diffusion\nmodels to visually localize generated concepts and local-global similarities to\nmatch class-agnostic regions with semantic classes. Our approach involves an\noffline stage in which textual-visual reference embeddings are collected,\nstarting from a large set of captions and leveraging visual and semantic\ncontexts. At test time, these are queried to support the visual matching\nprocess, which is carried out by jointly considering class-agnostic regions and\nglobal semantic similarities. Extensive analyses demonstrate that FreeDA\nachieves state-of-the-art performance on five datasets, surpassing previous\nmethods by more than 7.0 average points in terms of mIoU and without requiring\nany training.\n","authors":["Luca Barsellotti","Roberto Amoroso","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2404.06542v1.pdf","comment":"CVPR 2024. Project page: https://aimagelab.github.io/freeda/"},{"id":"http://arxiv.org/abs/2208.11650v3","updated":"2024-04-09T17:59:34Z","published":"2022-08-24T16:40:27Z","title":"Lane Change Classification and Prediction with Action Recognition\n Networks","summary":" Anticipating lane change intentions of surrounding vehicles is crucial for\nefficient and safe driving decision making in an autonomous driving system.\nPrevious works often adopt physical variables such as driving speed,\nacceleration and so forth for lane change classification. However, physical\nvariables do not contain semantic information. Although 3D CNNs have been\ndeveloping rapidly, the number of methods utilising action recognition models\nand appearance feature for lane change recognition is low, and they all require\nadditional information to pre-process data. In this work, we propose an\nend-to-end framework including two action recognition methods for lane change\nrecognition, using video data collected by cameras. 
Our method achieves the\nbest lane change classification results using only the RGB video data of the\nPREVENTION dataset. Class activation maps demonstrate that action recognition\nmodels can efficiently extract lane change motions. A method to better extract\nmotion clues is also proposed in this paper.\n","authors":["Kai Liang","Jun Wang","Abhir Bhalerao"],"pdf_url":"https://arxiv.org/pdf/2208.11650v3.pdf","comment":"Accepted to ECCV2022 AVVISION"},{"id":"http://arxiv.org/abs/2404.06486v1","updated":"2024-04-09T17:37:08Z","published":"2024-04-09T17:37:08Z","title":"GO4Align: Group Optimization for Multi-Task Alignment","summary":" This paper proposes \\textit{GO4Align}, a multi-task optimization approach\nthat tackles task imbalance by explicitly aligning the optimization across\ntasks. To achieve this, we design an adaptive group risk minimization strategy,\ncompromising two crucial techniques in implementation: (i) dynamical group\nassignment, which clusters similar tasks based on task interactions; (ii)\nrisk-guided group indicators, which exploit consistent task correlations with\nrisk information from previous iterations. Comprehensive experimental results\non diverse typical benchmarks demonstrate our method's performance superiority\nwith even lower computational costs.\n","authors":["Jiayi Shen","Cheems Wang","Zehao Xiao","Nanne Van Noord","Marcel Worring"],"pdf_url":"https://arxiv.org/pdf/2404.06486v1.pdf","comment":null}]},"2024-04-10T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.07206v1","updated":"2024-04-10T17:59:59Z","published":"2024-04-10T17:59:59Z","title":"GoodDrag: Towards Good Practices for Drag Editing with Diffusion Models","summary":" In this paper, we introduce GoodDrag, a novel approach to improve the\nstability and image quality of drag editing. Unlike existing methods that\nstruggle with accumulated perturbations and often result in distortions,\nGoodDrag introduces an AlDD framework that alternates between drag and\ndenoising operations within the diffusion process, effectively improving the\nfidelity of the result. We also propose an information-preserving motion\nsupervision operation that maintains the original features of the starting\npoint for precise manipulation and artifact reduction. In addition, we\ncontribute to the benchmarking of drag editing by introducing a new dataset,\nDrag100, and developing dedicated quality assessment metrics, Dragging Accuracy\nIndex and Gemini Score, utilizing Large Multimodal Models. Extensive\nexperiments demonstrate that the proposed GoodDrag compares favorably against\nthe state-of-the-art approaches both qualitatively and quantitatively. The\nproject page is https://gooddrag.github.io.\n","authors":["Zewei Zhang","Huan Liu","Jun Chen","Xiangyu Xu"],"pdf_url":"https://arxiv.org/pdf/2404.07206v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07204v1","updated":"2024-04-10T17:59:45Z","published":"2024-04-10T17:59:45Z","title":"BRAVE: Broadening the visual encoding of vision-language models","summary":" Vision-language models (VLMs) are typically composed of a vision encoder,\ne.g. CLIP, and a language model (LM) that interprets the encoded features to\nsolve downstream tasks. Despite remarkable progress, VLMs are subject to\nseveral shortcomings due to the limited capabilities of vision encoders, e.g.\n\"blindness\" to certain image features, visual hallucination, etc. To address\nthese issues, we study broadening the visual encoding capabilities of VLMs. 
We\nfirst comprehensively benchmark several vision encoders with different\ninductive biases for solving VLM tasks. We observe that there is no single\nencoding configuration that consistently achieves top performance across\ndifferent tasks, and encoders with different biases can perform surprisingly\nsimilarly. Motivated by this, we introduce a method, named BRAVE, that\nconsolidates features from multiple frozen encoders into a more versatile\nrepresentation that can be directly fed as the input to a frozen LM. BRAVE\nachieves state-of-the-art performance on a broad range of captioning and VQA\nbenchmarks and significantly reduces the aforementioned issues of VLMs, while\nrequiring a smaller number of trainable parameters than existing methods and\nhaving a more compressed representation. Our results highlight the potential of\nincorporating different visual biases for a more broad and contextualized\nvisual understanding of VLMs.\n","authors":["Oğuzhan Fatih Kar","Alessio Tonioni","Petra Poklukar","Achin Kulshrestha","Amir Zamir","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2404.07204v1.pdf","comment":"Project page at https://brave-vlms.epfl.ch/"},{"id":"http://arxiv.org/abs/2404.07202v1","updated":"2024-04-10T17:59:20Z","published":"2024-04-10T17:59:20Z","title":"UMBRAE: Unified Multimodal Decoding of Brain Signals","summary":" We address prevailing challenges of the brain-powered research, departing\nfrom the observation that the literature hardly recover accurate spatial\ninformation and require subject-specific models. To address these challenges,\nwe propose UMBRAE, a unified multimodal decoding of brain signals. First, to\nextract instance-level conceptual and spatial details from neural signals, we\nintroduce an efficient universal brain encoder for multimodal-brain alignment\nand recover object descriptions at multiple levels of granularity from\nsubsequent multimodal large language model (MLLM). Second, we introduce a\ncross-subject training strategy mapping subject-specific features to a common\nfeature space. This allows a model to be trained on multiple subjects without\nextra resources, even yielding superior results compared to subject-specific\nmodels. Further, we demonstrate this supports weakly-supervised adaptation to\nnew subjects, with only a fraction of the total training data. Experiments\ndemonstrate that UMBRAE not only achieves superior results in the newly\nintroduced tasks but also outperforms methods in well established tasks. To\nassess our method, we construct and share with the community a comprehensive\nbrain understanding benchmark BrainHub. Our code and benchmark are available at\nhttps://weihaox.github.io/UMBRAE.\n","authors":["Weihao Xia","Raoul de Charette","Cengiz Öztireli","Jing-Hao Xue"],"pdf_url":"https://arxiv.org/pdf/2404.07202v1.pdf","comment":"Project Page: https://weihaox.github.io/UMBRAE"},{"id":"http://arxiv.org/abs/2404.07199v1","updated":"2024-04-10T17:57:41Z","published":"2024-04-10T17:57:41Z","title":"RealmDreamer: Text-Driven 3D Scene Generation with Inpainting and Depth\n Diffusion","summary":" We introduce RealmDreamer, a technique for generation of general\nforward-facing 3D scenes from text descriptions. Our technique optimizes a 3D\nGaussian Splatting representation to match complex text prompts. We initialize\nthese splats by utilizing the state-of-the-art text-to-image generators,\nlifting their samples into 3D, and computing the occlusion volume. 
We then\noptimize this representation across multiple views as a 3D inpainting task with\nimage-conditional diffusion models. To learn correct geometric structure, we\nincorporate a depth diffusion model by conditioning on the samples from the\ninpainting model, giving rich geometric structure. Finally, we finetune the\nmodel using sharpened samples from image generators. Notably, our technique\ndoes not require video or multi-view data and can synthesize a variety of\nhigh-quality 3D scenes in different styles, consisting of multiple objects. Its\ngenerality additionally allows 3D synthesis from a single image.\n","authors":["Jaidev Shriram","Alex Trevithick","Lingjie Liu","Ravi Ramamoorthi"],"pdf_url":"https://arxiv.org/pdf/2404.07199v1.pdf","comment":"Project Page: https://realmdreamer.github.io/"},{"id":"http://arxiv.org/abs/2404.07191v1","updated":"2024-04-10T17:48:37Z","published":"2024-04-10T17:48:37Z","title":"InstantMesh: Efficient 3D Mesh Generation from a Single Image with\n Sparse-view Large Reconstruction Models","summary":" We present InstantMesh, a feed-forward framework for instant 3D mesh\ngeneration from a single image, featuring state-of-the-art generation quality\nand significant training scalability. By synergizing the strengths of an\noff-the-shelf multiview diffusion model and a sparse-view reconstruction model\nbased on the LRM architecture, InstantMesh is able to create diverse 3D assets\nwithin 10 seconds. To enhance the training efficiency and exploit more\ngeometric supervisions, e.g, depths and normals, we integrate a differentiable\niso-surface extraction module into our framework and directly optimize on the\nmesh representation. Experimental results on public datasets demonstrate that\nInstantMesh significantly outperforms other latest image-to-3D baselines, both\nqualitatively and quantitatively. We release all the code, weights, and demo of\nInstantMesh, with the intention that it can make substantial contributions to\nthe community of 3D generative AI and empower both researchers and content\ncreators.\n","authors":["Jiale Xu","Weihao Cheng","Yiming Gao","Xintao Wang","Shenghua Gao","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2404.07191v1.pdf","comment":"Technical report. Project: https://github.com/TencentARC/InstantMesh"},{"id":"http://arxiv.org/abs/2404.07188v1","updated":"2024-04-10T17:41:41Z","published":"2024-04-10T17:41:41Z","title":"GCV-Turbo: End-to-end Acceleration of GNN-based Computer Vision Tasks on\n FPGA","summary":" Graph neural networks (GNNs) have recently empowered various novel computer\nvision (CV) tasks. In GNN-based CV tasks, a combination of CNN layers and GNN\nlayers or only GNN layers are employed. This paper introduces GCV-Turbo, a\ndomain-specific accelerator on FPGA for end-to-end acceleration of GNN-based CV\ntasks. GCV-Turbo consists of two key components: (1) a \\emph{novel} hardware\narchitecture optimized for the computation kernels in both CNNs and GNNs using\nthe same set of computation resources. (2) a PyTorch-compatible compiler that\ntakes a user-defined model as input, performs end-to-end optimization for the\ncomputation graph of a given GNN-based CV task, and produces optimized code for\nhardware execution. The hardware architecture and the compiler work\nsynergistically to support a variety of GNN-based CV tasks. 
We implement\nGCV-Turbo on a state-of-the-art FPGA and evaluate its performance across six\nrepresentative GNN-based CV tasks with diverse input data modalities (e.g.,\nimage, human skeleton, point cloud). Compared with state-of-the-art CPU (GPU)\nimplementations, GCV-Turbo achieves an average latency reduction of\n$68.4\\times$ ($4.1\\times$) on these six GNN-based CV tasks. Moreover, GCV-Turbo\nsupports the execution of the standalone CNNs or GNNs, achieving performance\ncomparable to that of state-of-the-art CNN (GNN) accelerators for widely used\nCNN-only (GNN-only) models.\n","authors":["Bingyi Zhang","Rajgopal Kannan","Carl Busart","Viktor Prasanna"],"pdf_url":"https://arxiv.org/pdf/2404.07188v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.14855v2","updated":"2024-04-10T17:35:16Z","published":"2022-12-30T18:04:25Z","title":"Disentangled Explanations of Neural Network Predictions by Finding\n Relevant Subspaces","summary":" Explainable AI aims to overcome the black-box nature of complex ML models\nlike neural networks by generating explanations for their predictions.\nExplanations often take the form of a heatmap identifying input features (e.g.\npixels) that are relevant to the model's decision. These explanations, however,\nentangle the potentially multiple factors that enter into the overall complex\ndecision strategy. We propose to disentangle explanations by extracting at some\nintermediate layer of a neural network, subspaces that capture the multiple and\ndistinct activation patterns (e.g. visual concepts) that are relevant to the\nprediction. To automatically extract these subspaces, we propose two new\nanalyses, extending principles found in PCA or ICA to explanations. These novel\nanalyses, which we call principal relevant component analysis (PRCA) and\ndisentangled relevant subspace analysis (DRSA), maximize relevance instead of\ne.g. variance or kurtosis. This allows for a much stronger focus of the\nanalysis on what the ML model actually uses for predicting, ignoring\nactivations or concepts to which the model is invariant. Our approach is\ngeneral enough to work alongside common attribution techniques such as Shapley\nValue, Integrated Gradients, or LRP. Our proposed methods show to be\npractically useful and compare favorably to the state of the art as\ndemonstrated on benchmarks and three use cases.\n","authors":["Pattarawat Chormai","Jan Herrmann","Klaus-Robert Müller","Grégoire Montavon"],"pdf_url":"https://arxiv.org/pdf/2212.14855v2.pdf","comment":"17 pages + supplement"},{"id":"http://arxiv.org/abs/2404.07178v1","updated":"2024-04-10T17:28:16Z","published":"2024-04-10T17:28:16Z","title":"Move Anything with Layered Scene Diffusion","summary":" Diffusion models generate images with an unprecedented level of quality, but\nhow can we freely rearrange image layouts? Recent works generate controllable\nscenes via learning spatially disentangled latent codes, but these methods do\nnot apply to diffusion models due to their fixed forward process. In this work,\nwe propose SceneDiffusion to optimize a layered scene representation during the\ndiffusion sampling process. Our key insight is that spatial disentanglement can\nbe obtained by jointly denoising scene renderings at different spatial layouts.\nOur generated scenes support a wide range of spatial editing operations,\nincluding moving, resizing, cloning, and layer-wise appearance editing\noperations, including object restyling and replacing. 
Moreover, a scene can be\ngenerated conditioned on a reference image, thus enabling object moving for\nin-the-wild images. Notably, this approach is training-free, compatible with\ngeneral text-to-image diffusion models, and responsive in less than a second.\n","authors":["Jiawei Ren","Mengmeng Xu","Jui-Chieh Wu","Ziwei Liu","Tao Xiang","Antoine Toisoul"],"pdf_url":"https://arxiv.org/pdf/2404.07178v1.pdf","comment":"CVPR 2024 camera-ready"},{"id":"http://arxiv.org/abs/2404.07176v1","updated":"2024-04-10T17:25:42Z","published":"2024-04-10T17:25:42Z","title":"Self-supervised Monocular Depth Estimation on Water Scenes via Specular\n Reflection Prior","summary":" Monocular depth estimation from a single image is an ill-posed problem for\ncomputer vision due to insufficient reliable cues as the prior knowledge.\nBesides the inter-frame supervision, namely stereo and adjacent frames,\nextensive prior information is available in the same frame. Reflections from\nspecular surfaces, informative intra-frame priors, enable us to reformulate the\nill-posed depth estimation task as a multi-view synthesis. This paper proposes\nthe first self-supervision for deep-learning depth estimation on water scenes\nvia intra-frame priors, known as reflection supervision and geometrical\nconstraints. In the first stage, a water segmentation network is performed to\nseparate the reflection components from the entire image. Next, we construct a\nself-supervised framework to predict the target appearance from reflections,\nperceived as other perspectives. The photometric re-projection error,\nincorporating SmoothL1 and a novel photometric adaptive SSIM, is formulated to\noptimize pose and depth estimation by aligning the transformed virtual depths\nand source ones. As a supplement, the water surface is determined from real and\nvirtual camera positions, which complement the depth of the water area.\nFurthermore, to alleviate these laborious ground truth annotations, we\nintroduce a large-scale water reflection scene (WRS) dataset rendered from\nUnreal Engine 4. Extensive experiments on the WRS dataset prove the feasibility\nof the proposed method compared to state-of-the-art depth estimation\ntechniques.\n","authors":["Zhengyang Lu","Ying Chen"],"pdf_url":"https://arxiv.org/pdf/2404.07176v1.pdf","comment":"16 pages, 8 figures"},{"id":"http://arxiv.org/abs/2212.11120v2","updated":"2024-04-10T17:15:23Z","published":"2022-12-10T07:50:29Z","title":"Deep Learning for Inertial Sensor Alignment","summary":" Accurate alignment of a fixed mobile device equipped with inertial sensors\ninside a moving vehicle is important for navigation, activity recognition, and\nother applications. Accurate estimation of the device mounting angle is\nrequired to rotate the inertial measurement from the sensor frame to the moving\nplatform frame to standardize measurements and improve the performance of the\ntarget task. In this work, a data-driven approach using deep neural networks\n(DNNs) is proposed to learn the yaw mounting angle of a smartphone equipped\nwith an inertial measurement unit (IMU) and strapped to a car. The proposed\nmodel uses only the accelerometer and gyroscope readings from an IMU as input\nand, in contrast to existing solutions, does not require global position inputs\nfrom global navigation satellite systems (GNSS). 
To train the model in a\nsupervised manner, IMU data is collected for training and validation with the\nsensor mounted at a known yaw mounting angle, and a range of ground truth\nlabels is generated by applying a random rotation in a bounded range to the\nmeasurements. The trained model is tested on data with real rotations showing\nsimilar performance as with synthetic rotations. The trained model is deployed\non an Android device and evaluated in real-time to test the accuracy of the\nestimated yaw mounting angle. The model is shown to find the mounting angle at\nan accuracy of 8 degrees within 5 seconds, and 4 degrees within 27 seconds. An\nexperiment is conducted to compare the proposed model with an existing\noff-the-shelf solution.\n","authors":["Maxim Freydin","Niv Sfaradi","Nimrod Segol","Areej Eweida","Barak Or"],"pdf_url":"https://arxiv.org/pdf/2212.11120v2.pdf","comment":"9 Pages, Preprint. Accepted IEEE"},{"id":"http://arxiv.org/abs/2404.07155v1","updated":"2024-04-10T16:44:11Z","published":"2024-04-10T16:44:11Z","title":"Unified Language-driven Zero-shot Domain Adaptation","summary":" This paper introduces Unified Language-driven Zero-shot Domain Adaptation\n(ULDA), a novel task setting that enables a single model to adapt to diverse\ntarget domains without explicit domain-ID knowledge. We identify the\nconstraints in the existing language-driven zero-shot domain adaptation task,\nparticularly the requirement for domain IDs and domain-specific models, which\nmay restrict flexibility and scalability. To overcome these issues, we propose\na new framework for ULDA, consisting of Hierarchical Context Alignment (HCA),\nDomain Consistent Representation Learning (DCRL), and Text-Driven Rectifier\n(TDR). These components work synergistically to align simulated features with\ntarget text across multiple visual levels, retain semantic correlations between\ndifferent regional representations, and rectify biases between simulated and\nreal target visual features, respectively. Our extensive empirical evaluations\ndemonstrate that this framework achieves competitive performance in both\nsettings, surpassing even the model that requires domain-ID, showcasing its\nsuperiority and generalization ability. The proposed method is not only\neffective but also maintains practicality and efficiency, as it does not\nintroduce additional computational costs during inference. Our project page is\nhttps://senqiaoyang.com/project/ULDA .\n","authors":["Senqiao Yang","Zhuotao Tian","Li Jiang","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2404.07155v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.07153v1","updated":"2024-04-10T16:39:50Z","published":"2024-04-10T16:39:50Z","title":"Lost in Translation: Modern Neural Networks Still Struggle With Small\n Realistic Image Transformations","summary":" Deep neural networks that achieve remarkable performance in image\nclassification have previously been shown to be easily fooled by tiny\ntransformations such as a one pixel translation of the input image. In order to\naddress this problem, two approaches have been proposed in recent years. The\nfirst approach suggests using huge datasets together with data augmentation in\nthe hope that a highly varied training set will teach the network to learn to\nbe invariant. The second approach suggests using architectural modifications\nbased on sampling theory to deal explicitly with image translations. 
In this\npaper, we show that these approaches still fall short in robustly handling\n'natural' image translations that simulate a subtle change in camera\norientation. Our findings reveal that a mere one-pixel translation can result\nin a significant change in the predicted image representation for approximately\n40% of the test images in state-of-the-art models (e.g. open-CLIP trained on\nLAION-2B or DINO-v2) , while models that are explicitly constructed to be\nrobust to cyclic translations can still be fooled with 1 pixel realistic\n(non-cyclic) translations 11% of the time. We present Robust Inference by Crop\nSelection: a simple method that can be proven to achieve any desired level of\nconsistency, although with a modest tradeoff with the model's accuracy.\nImportantly, we demonstrate how employing this method reduces the ability to\nfool state-of-the-art models with a 1 pixel translation to less than 5% while\nsuffering from only a 1% drop in classification accuracy. Additionally, we show\nthat our method can be easy adjusted to deal with circular shifts as well. In\nsuch case we achieve 100% robustness to integer shifts with state-of-the-art\naccuracy, and with no need for any further training.\n","authors":["Ofir Shifman","Yair Weiss"],"pdf_url":"https://arxiv.org/pdf/2404.07153v1.pdf","comment":"14 pages, 6 appendices, 17 figures"},{"id":"http://arxiv.org/abs/2312.00068v2","updated":"2024-04-10T16:04:48Z","published":"2023-11-29T20:59:00Z","title":"GLiDR: Topologically Regularized Graph Generative Network for Sparse\n LiDAR Point Clouds","summary":" Sparse LiDAR point clouds cause severe loss of detail of static structures\nand reduce the density of static points available for navigation. Reduced\ndensity can be detrimental to navigation under several scenarios. We observe\nthat despite high sparsity, in most cases, the global topology of LiDAR\noutlining the static structures can be inferred. We utilize this property to\nobtain a backbone skeleton of a LiDAR scan in the form of a single connected\ncomponent that is a proxy to its global topology. We utilize the backbone to\naugment new points along static structures to overcome sparsity. Newly\nintroduced points could correspond to existing static structures or to static\npoints that were earlier obstructed by dynamic objects. To the best of our\nknowledge, we are the first to use such a strategy for sparse LiDAR point\nclouds. Existing solutions close to our approach fail to identify and preserve\nthe global static LiDAR topology and generate sub-optimal points. We propose\nGLiDR, a Graph Generative network that is topologically regularized using\n0-dimensional Persistent Homology ($\\mathcal{PH}$) constraints. This enables\nGLiDR to introduce newer static points along a topologically consistent global\nstatic LiDAR backbone. GLiDR generates precise static points using $32\\times$\nsparser dynamic scans and performs better than the baselines across three\ndatasets. GLiDR generates a valuable byproduct - an accurate binary\nsegmentation mask of static and dynamic objects that are helpful for navigation\nplanning and safety in constrained environments. The newly introduced static\npoints allow GLiDR to outperform LiDAR-based navigation using SLAM in several\nsettings. 
Source code is available at\n$\\texttt{https://github.com/GLiDR-CVPR2024/GLiDR}$.\n","authors":["Prashant Kumar","Kshitij Madhav Bhat","Vedang Bhupesh Shenvi Nadkarni","Prem Kalra"],"pdf_url":"https://arxiv.org/pdf/2312.00068v2.pdf","comment":"IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR)"},{"id":"http://arxiv.org/abs/2404.07124v1","updated":"2024-04-10T16:04:21Z","published":"2024-04-10T16:04:21Z","title":"Measuring proximity to standard planes during fetal brain ultrasound\n scanning","summary":" This paper introduces a novel pipeline designed to bring ultrasound (US)\nplane pose estimation closer to clinical use for more effective navigation to\nthe standard planes (SPs) in the fetal brain. We propose a semi-supervised\nsegmentation model utilizing both labeled SPs and unlabeled 3D US volume\nslices. Our model enables reliable segmentation across a diverse set of fetal\nbrain images. Furthermore, the model incorporates a classification mechanism to\nidentify the fetal brain precisely. Our model not only filters out frames\nlacking the brain but also generates masks for those containing it, enhancing\nthe relevance of plane pose regression in clinical settings. We focus on fetal\nbrain navigation from 2D ultrasound (US) video analysis and combine this model\nwith a US plane pose regression network to provide sensorless proximity\ndetection to SPs and non-SPs planes; we emphasize the importance of proximity\ndetection to SPs for guiding sonographers, offering a substantial advantage\nover traditional methods by allowing earlier and more precise adjustments\nduring scanning. We demonstrate the practical applicability of our approach\nthrough validation on real fetal scan videos obtained from sonographers of\nvarying expertise levels. Our findings demonstrate the potential of our\napproach to complement existing fetal US technologies and advance prenatal\ndiagnostic practices.\n","authors":["Chiara Di Vece","Antonio Cirigliano","Meala Le Lous","Raffaele Napolitano","Anna L. David","Donald Peebles","Pierre Jannin","Francisco Vasconcelos","Danail Stoyanov"],"pdf_url":"https://arxiv.org/pdf/2404.07124v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.07122v1","updated":"2024-04-10T16:01:37Z","published":"2024-04-10T16:01:37Z","title":"Driver Attention Tracking and Analysis","summary":" We propose a novel method to estimate a driver's points-of-gaze using a pair\nof ordinary cameras mounted on the windshield and dashboard of a car. This is a\nchallenging problem due to the dynamics of traffic environments with 3D scenes\nof unknown depths. This problem is further complicated by the volatile distance\nbetween the driver and the camera system. To tackle these challenges, we\ndevelop a novel convolutional network that simultaneously analyzes the image of\nthe scene and the image of the driver's face. This network has a camera\ncalibration module that can compute an embedding vector that represents the\nspatial configuration between the driver and the camera system. This\ncalibration module improves the overall network's performance, which can be\njointly trained end to end.\n We also address the lack of annotated data for training and evaluation by\nintroducing a large-scale driving dataset with point-of-gaze annotations. This\nis an in situ dataset of real driving sessions in an urban city, containing\nsynchronized images of the driving scene as well as the face and gaze of the\ndriver. 
Experiments on this dataset show that the proposed method outperforms\nvarious baseline methods, having the mean prediction error of 29.69 pixels,\nwhich is relatively small compared to the $1280{\\times}720$ resolution of the\nscene camera.\n","authors":["Dat Viet Thanh Nguyen","Anh Tran","Nam Vu","Cuong Pham","Minh Hoai"],"pdf_url":"https://arxiv.org/pdf/2404.07122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10908v3","updated":"2024-04-10T15:59:31Z","published":"2023-12-18T03:34:07Z","title":"CLOVA: A Closed-Loop Visual Assistant with Tool Usage and Update","summary":" Utilizing large language models (LLMs) to compose off-the-shelf visual tools\nrepresents a promising avenue of research for developing robust visual\nassistants capable of addressing diverse visual tasks. However, these methods\noften overlook the potential for continual learning, typically by freezing the\nutilized tools, thus limiting their adaptation to environments requiring new\nknowledge. To tackle this challenge, we propose CLOVA, a Closed-Loop Visual\nAssistant, which operates within a framework encompassing inference,\nreflection, and learning phases. During the inference phase, LLMs generate\nprograms and execute corresponding tools to complete assigned tasks. In the\nreflection phase, a multimodal global-local reflection scheme analyzes human\nfeedback to determine which tools require updating. Lastly, the learning phase\nemploys three flexible approaches to automatically gather training data and\nintroduces a novel prompt tuning scheme to update the tools, allowing CLOVA to\nefficiently acquire new knowledge. Experimental findings demonstrate that CLOVA\nsurpasses existing tool-usage methods by 5% in visual question answering and\nmultiple-image reasoning, by 10% in knowledge tagging, and by 20% in image\nediting. These results underscore the significance of the continual learning\ncapability in general visual assistants.\n","authors":["Zhi Gao","Yuntao Du","Xintong Zhang","Xiaojian Ma","Wenjuan Han","Song-Chun Zhu","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2312.10908v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.11468v3","updated":"2024-04-10T15:58:09Z","published":"2023-11-13T20:41:48Z","title":"Bias-Reduced Neural Networks for Parameter Estimation in Quantitative\n MRI","summary":" Purpose: To develop neural network (NN)-based quantitative MRI parameter\nestimators with minimal bias and a variance close to the Cram\\'er-Rao bound.\n Theory and Methods: We generalize the mean squared error loss to control the\nbias and variance of the NN's estimates, which involves averaging over multiple\nnoise realizations of the same measurements during training. Bias and variance\nproperties of the resulting NNs are studied for two neuroimaging applications.\n Results: In simulations, the proposed strategy reduces the estimates' bias\nthroughout parameter space and achieves a variance close to the Cram\\'er-Rao\nbound. 
In vivo, we observe good concordance between parameter maps estimated\nwith the proposed NNs and traditional estimators, such as non-linear\nleast-squares fitting, while state-of-the-art NNs show larger deviations.\n Conclusion: The proposed NNs have greatly reduced bias compared to those\ntrained using the mean squared error and offer significantly improved\ncomputational efficiency over traditional estimators with comparable or better\naccuracy.\n","authors":["Andrew Mao","Sebastian Flassbeck","Jakob Assländer"],"pdf_url":"https://arxiv.org/pdf/2312.11468v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07112v1","updated":"2024-04-10T15:51:46Z","published":"2024-04-10T15:51:46Z","title":"Unfolding ADMM for Enhanced Subspace Clustering of Hyperspectral Images","summary":" Deep subspace clustering methods are now prominent in clustering, typically\nusing fully connected networks and a self-representation loss function.\nHowever, these methods often struggle with overfitting and lack\ninterpretability. In this paper, we explore an alternative clustering approach\nbased on deep unfolding. By unfolding iterative optimization methods into\nneural networks, this approach offers enhanced interpretability and reliability\ncompared to data-driven deep learning methods, and greater adaptability and\ngeneralization than model-based approaches. Hence, unfolding has become widely\nused in inverse imaging problems, such as image restoration, reconstruction,\nand super-resolution, but has not been sufficiently explored yet in the context\nof clustering. In this work, we introduce an innovative clustering architecture\nfor hyperspectral images (HSI) by unfolding an iterative solver based on the\nAlternating Direction Method of Multipliers (ADMM) for sparse subspace\nclustering. To our knowledge, this is the first attempt to apply unfolding ADMM\nfor computing the self-representation matrix in subspace clustering. Moreover,\nour approach captures well the structural characteristics of HSI data by\nemploying the K nearest neighbors algorithm as part of a structure preservation\nmodule. Experimental evaluation of three established HSI datasets shows clearly\nthe potential of the unfolding approach in HSI clustering and even demonstrates\nsuperior performance compared to state-of-the-art techniques.\n","authors":["Xianlu Li","Nicolas Nadisic","Shaoguang Huang","Aleksandra Pižurica"],"pdf_url":"https://arxiv.org/pdf/2404.07112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07110v1","updated":"2024-04-10T15:47:35Z","published":"2024-04-10T15:47:35Z","title":"Wild Visual Navigation: Fast Traversability Learning via Pre-Trained\n Models and Online Self-Supervision","summary":" Natural environments such as forests and grasslands are challenging for\nrobotic navigation because of the false perception of rigid obstacles from high\ngrass, twigs, or bushes. In this work, we present Wild Visual Navigation (WVN),\nan online self-supervised learning system for visual traversability estimation.\nThe system is able to continuously adapt from a short human demonstration in\nthe field, only using onboard sensing and computing. One of the key ideas to\nachieve this is the use of high-dimensional features from pre-trained\nself-supervised models, which implicitly encode semantic information that\nmassively simplifies the learning task. Further, the development of an online\nscheme for supervision generator enables concurrent training and inference of\nthe learned model in the wild. 
We demonstrate our approach through diverse\nreal-world deployments in forests, parks, and grasslands. Our system is able to\nbootstrap the traversable terrain segmentation in less than 5 min of in-field\ntraining time, enabling the robot to navigate in complex, previously unseen\noutdoor terrains. Code: https://bit.ly/498b0CV - Project\npage:https://bit.ly/3M6nMHH\n","authors":["Matías Mattamala","Jonas Frey","Piotr Libera","Nived Chebrolu","Georg Martius","Cesar Cadena","Marco Hutter","Maurice Fallon"],"pdf_url":"https://arxiv.org/pdf/2404.07110v1.pdf","comment":"Extended version of arXiv:2305.08510"},{"id":"http://arxiv.org/abs/2404.07106v1","updated":"2024-04-10T15:45:03Z","published":"2024-04-10T15:45:03Z","title":"3DMambaComplete: Exploring Structured State Space Model for Point Cloud\n Completion","summary":" Point cloud completion aims to generate a complete and high-fidelity point\ncloud from an initially incomplete and low-quality input. A prevalent strategy\ninvolves leveraging Transformer-based models to encode global features and\nfacilitate the reconstruction process. However, the adoption of pooling\noperations to obtain global feature representations often results in the loss\nof local details within the point cloud. Moreover, the attention mechanism\ninherent in Transformers introduces additional computational complexity,\nrendering it challenging to handle long sequences effectively. To address these\nissues, we propose 3DMambaComplete, a point cloud completion network built on\nthe novel Mamba framework. It comprises three modules: HyperPoint Generation\nencodes point cloud features using Mamba's selection mechanism and predicts a\nset of Hyperpoints. A specific offset is estimated, and the down-sampled points\nbecome HyperPoints. The HyperPoint Spread module disperses these HyperPoints\nacross different spatial locations to avoid concentration. Finally, a\ndeformation method transforms the 2D mesh representation of HyperPoints into a\nfine-grained 3D structure for point cloud reconstruction. Extensive experiments\nconducted on various established benchmarks demonstrate that 3DMambaComplete\nsurpasses state-of-the-art point cloud completion methods, as confirmed by\nqualitative and quantitative analyses.\n","authors":["Yixuan Li","Weidong Yang","Ben Fei"],"pdf_url":"https://arxiv.org/pdf/2404.07106v1.pdf","comment":"10 pages, 8 figures, 7 tables"},{"id":"http://arxiv.org/abs/2404.07097v1","updated":"2024-04-10T15:37:00Z","published":"2024-04-10T15:37:00Z","title":"Learning Priors for Non Rigid SfM from Casual Videos","summary":" We tackle the long-standing challenge of reconstructing 3D structures and\ncamera positions from videos. The problem is particularly hard when objects are\ntransformed in a non-rigid way. Current approaches to this problem make\nunrealistic assumptions or require a long optimization time.\n We present TracksTo4D, a novel deep learning-based approach that enables\ninferring 3D structure and camera positions from dynamic content originating\nfrom in-the-wild videos using a single feed-forward pass on a sparse point\ntrack matrix. To achieve this, we leverage recent advances in 2D point tracking\nand design an equivariant neural architecture tailored for directly processing\n2D point tracks by leveraging their symmetries. TracksTo4D is trained on a\ndataset of in-the-wild videos utilizing only the 2D point tracks extracted from\nthe videos, without any 3D supervision. 
Our experiments demonstrate that\nTracksTo4D generalizes well to unseen videos of unseen semantic categories at\ninference time, producing equivalent results to state-of-the-art methods while\nsignificantly reducing the runtime compared to other baselines.\n","authors":["Yoni Kasten","Wuyue Lu","Haggai Maron"],"pdf_url":"https://arxiv.org/pdf/2404.07097v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07094v1","updated":"2024-04-10T15:34:10Z","published":"2024-04-10T15:34:10Z","title":"MoCap-to-Visual Domain Adaptation for Efficient Human Mesh Estimation\n from 2D Keypoints","summary":" This paper presents Key2Mesh, a model that takes a set of 2D human pose\nkeypoints as input and estimates the corresponding body mesh. Since this\nprocess does not involve any visual (i.e. RGB image) data, the model can be\ntrained on large-scale motion capture (MoCap) datasets, thereby overcoming the\nscarcity of image datasets with 3D labels. To enable the model's application on\nRGB images, we first run an off-the-shelf 2D pose estimator to obtain the 2D\nkeypoints, and then feed these 2D keypoints to Key2Mesh. To improve the\nperformance of our model on RGB images, we apply an adversarial domain\nadaptation (DA) method to bridge the gap between the MoCap and visual domains.\nCrucially, our DA method does not require 3D labels for visual data, which\nenables adaptation to target sets without the need for costly labels. We\nevaluate Key2Mesh for the task of estimating 3D human meshes from 2D keypoints,\nin the absence of RGB and mesh label pairs. Our results on widely used H3.6M\nand 3DPW datasets show that Key2Mesh sets the new state-of-the-art by\noutperforming other models in PA-MPJPE for both datasets, and in MPJPE and PVE\nfor the 3DPW dataset. Thanks to our model's simple architecture, it operates at\nleast 12x faster than the prior state-of-the-art model, LGD. Additional\nqualitative samples and code are available on the project website:\nhttps://key2mesh.github.io/.\n","authors":["Bedirhan Uguz","Ozhan Suat","Batuhan Karagoz","Emre Akbas"],"pdf_url":"https://arxiv.org/pdf/2404.07094v1.pdf","comment":"accepted to CVPRW 2024"},{"id":"http://arxiv.org/abs/2401.07745v2","updated":"2024-04-10T15:30:23Z","published":"2024-01-15T14:56:15Z","title":"MaskClustering: View Consensus based Mask Graph Clustering for\n Open-Vocabulary 3D Instance Segmentation","summary":" Open-vocabulary 3D instance segmentation is cutting-edge for its ability to\nsegment 3D instances without predefined categories. However, progress in 3D\nlags behind its 2D counterpart due to limited annotated 3D data. To address\nthis, recent works first generate 2D open-vocabulary masks through 2D models\nand then merge them into 3D instances based on metrics calculated between two\nneighboring frames. In contrast to these local metrics, we propose a novel\nmetric, view consensus rate, to enhance the utilization of multi-view\nobservations. The key insight is that two 2D masks should be deemed part of the\nsame 3D instance if a significant number of other 2D masks from different views\ncontain both these two masks. Using this metric as edge weight, we construct a\nglobal mask graph where each mask is a node. Through iterative clustering of\nmasks showing high view consensus, we generate a series of clusters, each\nrepresenting a distinct 3D instance. 
Notably, our model is training-free.\nThrough extensive experiments on publicly available datasets, including\nScanNet++, ScanNet200 and MatterPort3D, we demonstrate that our method achieves\nstate-of-the-art performance in open-vocabulary 3D instance segmentation. Our\nproject page is at https://pku-epic.github.io/MaskClustering.\n","authors":["Mi Yan","Jiazhao Zhang","Yan Zhu","He Wang"],"pdf_url":"https://arxiv.org/pdf/2401.07745v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02233v2","updated":"2024-04-10T15:22:05Z","published":"2024-04-02T18:40:55Z","title":"Visual Concept Connectome (VCC): Open World Concept Discovery and their\n Interlayer Connections in Deep Models","summary":" Understanding what deep network models capture in their learned\nrepresentations is a fundamental challenge in computer vision. We present a new\nmethodology for understanding such vision models, the Visual Concept Connectome\n(VCC), which discovers human interpretable concepts and their interlayer\nconnections in a fully unsupervised manner. Our approach simultaneously reveals\nfine-grained concepts at a layer, connection weightings across all layers and\nis amenable to global analysis of network structure (e.g., branching pattern\nof hierarchical concept assemblies). Previous work yielded ways to extract\ninterpretable concepts from single layers and examine their impact on\nclassification, but did not afford multilayer concept analysis across an entire\nnetwork architecture. Quantitative and qualitative empirical results show the\neffectiveness of VCCs in the domain of image classification. Also, we leverage\nVCCs for the application of failure mode debugging to reveal where mistakes\narise in deep networks.\n","authors":["Matthew Kowal","Richard P. Wildes","Konstantinos G. Derpanis"],"pdf_url":"https://arxiv.org/pdf/2404.02233v2.pdf","comment":"CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2401.10831v3","updated":"2024-04-10T15:19:07Z","published":"2024-01-19T17:27:21Z","title":"Understanding Video Transformers via Universal Concept Discovery","summary":" This paper studies the problem of concept-based interpretability of\ntransformer representations for videos. Concretely, we seek to explain the\ndecision-making process of video transformers based on high-level,\nspatiotemporal concepts that are automatically discovered. Prior research on\nconcept-based interpretability has concentrated solely on image-level tasks.\nComparatively, video models deal with the added temporal dimension, increasing\ncomplexity and posing challenges in identifying dynamic concepts over time. In\nthis work, we systematically address these challenges by introducing the first\nVideo Transformer Concept Discovery (VTCD) algorithm. To this end, we propose\nan efficient approach for unsupervised identification of units of video\ntransformer representations - concepts, and ranking their importance to the\noutput of a model. The resulting concepts are highly interpretable, revealing\nspatio-temporal reasoning mechanisms and object-centric representations in\nunstructured video models. Performing this analysis jointly over a diverse set\nof supervised and self-supervised representations, we discover that some of\nthese mechanisms are universal in video transformers. Finally, we show that VTCD\ncan be used for fine-grained action recognition and video object segmentation.\n","authors":["Matthew Kowal","Achal Dave","Rares Ambrus","Adrien Gaidon","Konstantinos G. 
Derpanis","Pavel Tokmakov"],"pdf_url":"https://arxiv.org/pdf/2401.10831v3.pdf","comment":"CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2402.18320v2","updated":"2024-04-10T15:09:22Z","published":"2024-02-28T13:33:43Z","title":"Location-guided Head Pose Estimation for Fisheye Image","summary":" Camera with a fisheye or ultra-wide lens covers a wide field of view that\ncannot be modeled by the perspective projection. Serious fisheye lens\ndistortion in the peripheral region of the image leads to degraded performance\nof the existing head pose estimation models trained on undistorted images. This\npaper presents a new approach for head pose estimation that uses the knowledge\nof head location in the image to reduce the negative effect of fisheye\ndistortion. We develop an end-to-end convolutional neural network to estimate\nthe head pose with the multi-task learning of head pose and head location. Our\nproposed network estimates the head pose directly from the fisheye image\nwithout the operation of rectification or calibration. We also created a\nfisheye-distorted version of the three popular head pose estimation datasets,\nBIWI, 300W-LP, and AFLW2000 for our experiments. Experiments results show that\nour network remarkably improves the accuracy of head pose estimation compared\nwith other state-of-the-art one-stage and two-stage methods.\n","authors":["Bing Li","Dong Zhang","Cheng Huang","Yun Xian","Ming Li","Dah-Jye Lee"],"pdf_url":"https://arxiv.org/pdf/2402.18320v2.pdf","comment":"Revised Introduction and Related Work; Submitted to lEEE Transactions\n on Cognitive and Developmental Systems for review"},{"id":"http://arxiv.org/abs/2404.07078v1","updated":"2024-04-10T15:09:15Z","published":"2024-04-10T15:09:15Z","title":"VLLMs Provide Better Context for Emotion Understanding Through Common\n Sense Reasoning","summary":" Recognising emotions in context involves identifying the apparent emotions of\nan individual, taking into account contextual cues from the surrounding scene.\nPrevious approaches to this task have involved the design of explicit\nscene-encoding architectures or the incorporation of external scene-related\ninformation, such as captions. However, these methods often utilise limited\ncontextual information or rely on intricate training pipelines. In this work,\nwe leverage the groundbreaking capabilities of Vision-and-Large-Language Models\n(VLLMs) to enhance in-context emotion classification without introducing\ncomplexity to the training process in a two-stage approach. In the first stage,\nwe propose prompting VLLMs to generate descriptions in natural language of the\nsubject's apparent emotion relative to the visual context. In the second stage,\nthe descriptions are used as contextual information and, along with the image\ninput, are used to train a transformer-based architecture that fuses text and\nvisual features before the final classification task. Our experimental results\nshow that the text and image features have complementary information, and our\nfused architecture significantly outperforms the individual modalities without\nany complex training methods. We evaluate our approach on three different\ndatasets, namely, EMOTIC, CAER-S, and BoLD, and achieve state-of-the-art or\ncomparable accuracy across all datasets and metrics compared to much more\ncomplex approaches. 
The code will be made publicly available on github:\nhttps://github.com/NickyFot/EmoCommonSense.git\n","authors":["Alexandros Xenos","Niki Maria Foteinopoulou","Ioanna Ntinou","Ioannis Patras","Georgios Tzimiropoulos"],"pdf_url":"https://arxiv.org/pdf/2404.07078v1.pdf","comment":"A. Xenos, N. Foteinopoulou and I. Ntinou contributed equally to this\n work; 14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.07072v1","updated":"2024-04-10T15:02:26Z","published":"2024-04-10T15:02:26Z","title":"Implicit Multi-Spectral Transformer: An Lightweight and Effective\n Visible to Infrared Image Translation Model","summary":" In the field of computer vision, visible light images often exhibit low\ncontrast in low-light conditions, presenting a significant challenge. While\ninfrared imagery provides a potential solution, its utilization entails high\ncosts and practical limitations. Recent advancements in deep learning,\nparticularly the deployment of Generative Adversarial Networks (GANs), have\nfacilitated the transformation of visible light images to infrared images.\nHowever, these methods often experience unstable training phases and may\nproduce suboptimal outputs. To address these issues, we propose a novel\nend-to-end Transformer-based model that efficiently converts visible light\nimages into high-fidelity infrared images. Initially, the Texture Mapping\nModule and Color Perception Adapter collaborate to extract texture and color\nfeatures from the visible light image. The Dynamic Fusion Aggregation Module\nsubsequently integrates these features. Finally, the transformation into an\ninfrared image is refined through the synergistic action of the Color\nPerception Adapter and the Enhanced Perception Attention mechanism.\nComprehensive benchmarking experiments confirm that our model outperforms\nexisting methods, producing infrared images of markedly superior quality, both\nqualitatively and quantitatively. Furthermore, the proposed model enables more\neffective downstream applications for infrared images than other methods.\n","authors":["Yijia Chen","Pinghua Chen","Xiangxin Zhou","Yingtie Lei","Ziyang Zhou","Mingxian Li"],"pdf_url":"https://arxiv.org/pdf/2404.07072v1.pdf","comment":"Accepted by IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.07045v1","updated":"2024-04-10T14:35:22Z","published":"2024-04-10T14:35:22Z","title":"Identification of Fine-grained Systematic Errors via Controlled Scene\n Generation","summary":" Many safety-critical applications, especially in autonomous driving, require\nreliable object detectors. They can be very effectively assisted by a method to\nsearch for and identify potential failures and systematic errors before these\ndetectors are deployed. Systematic errors are characterized by combinations of\nattributes such as object location, scale, orientation, and color, as well as\nthe composition of their respective backgrounds. To identify them, one must\nrely on something other than real images from a test set because they do not\naccount for very rare but possible combinations of attributes. To overcome this\nlimitation, we propose a pipeline for generating realistic synthetic scenes\nwith fine-grained control, allowing the creation of complex scenes with\nmultiple objects. Our approach, BEV2EGO, allows for a realistic generation of\nthe complete scene with road-contingent control that maps 2D bird's-eye view\n(BEV) scene configurations to a first-person view (EGO). 
In addition, we\npropose a benchmark for controlled scene generation to select the most\nappropriate generative outpainting model for BEV2EGO. We further use it to\nperform a systematic analysis of multiple state-of-the-art object detection\nmodels and discover differences between them.\n","authors":["Valentyn Boreiko","Matthias Hein","Jan Hendrik Metzen"],"pdf_url":"https://arxiv.org/pdf/2404.07045v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07032v1","updated":"2024-04-10T14:25:23Z","published":"2024-04-10T14:25:23Z","title":"An Evidential-enhanced Tri-Branch Consistency Learning Method for\n Semi-supervised Medical Image Segmentation","summary":" Semi-supervised segmentation presents a promising approach for large-scale\nmedical image analysis, effectively reducing annotation burdens while achieving\ncomparable performance. This methodology holds substantial potential for\nstreamlining the segmentation process and enhancing its feasibility within\nclinical settings for translational investigations. While cross-supervised\ntraining, based on distinct co-training sub-networks, has become a prevalent\nparadigm for this task, addressing critical issues such as prediction\ndisagreement and label-noise suppression requires further attention and\nprogress in cross-supervised training. In this paper, we introduce an\nEvidential Tri-Branch Consistency learning framework (ETC-Net) for\nsemi-supervised medical image segmentation. ETC-Net employs three branches: an\nevidential conservative branch, an evidential progressive branch, and an\nevidential fusion branch. The first two branches exhibit complementary\ncharacteristics, allowing them to address prediction diversity and enhance\ntraining stability. We also integrate uncertainty estimation from the\nevidential learning into cross-supervised training, mitigating the negative\nimpact of erroneous supervision signals. Additionally, the evidential fusion\nbranch capitalizes on the complementary attributes of the first two branches\nand leverages an evidence-based Dempster-Shafer fusion strategy, supervised by\nmore reliable and accurate pseudo-labels of unlabeled data. Extensive\nexperiments conducted on LA, Pancreas-CT, and ACDC datasets demonstrate that\nETC-Net surpasses other state-of-the-art methods for semi-supervised\nsegmentation. The code will be made available in the near future at\nhttps://github.com/Medsemiseg.\n","authors":["Zhenxi Zhang","Heng Zhou","Xiaoran Shi","Ran Ran","Chunna Tian","Feng Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.07032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10166v2","updated":"2024-04-10T14:25:12Z","published":"2024-01-18T17:55:39Z","title":"VMamba: Visual State Space Model","summary":" Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs) have long\nbeen the predominant backbone networks for visual representation learning.\nWhile ViTs have recently gained prominence over CNNs due to their superior\nfitting capabilities, their scalability is largely constrained by the quadratic\ncomplexity of attention computation. Inspired by the capability of Mamba in\nefficiently modeling long sequences, we propose VMamba, a generic vision\nbackbone model aiming to reduce the computational complexity to linear while\nretaining ViTs' advantageous features. 
To enhance VMamba's adaptability in\nprocessing vision data, we introduce the Cross-Scan Module (CSM) to enable 1D\nselective scanning in 2D image space with global receptive fields.\nAdditionally, we make further improvements in implementation details and\narchitectural designs to enhance VMamba's performance and boost its inference\nspeed. Extensive experimental results demonstrate VMamba's promising\nperformance across various visual perception tasks, highlighting its pronounced\nadvantages in input scaling efficiency compared to existing benchmark models.\nSource code is available at https://github.com/MzeroMiko/VMamba.\n","authors":["Yue Liu","Yunjie Tian","Yuzhong Zhao","Hongtian Yu","Lingxi Xie","Yaowei Wang","Qixiang Ye","Yunfan Liu"],"pdf_url":"https://arxiv.org/pdf/2401.10166v2.pdf","comment":"21 pages, 12 figures, 5 tables"},{"id":"http://arxiv.org/abs/2404.07031v1","updated":"2024-04-10T14:24:10Z","published":"2024-04-10T14:24:10Z","title":"ORacle: Large Vision-Language Models for Knowledge-Guided Holistic OR\n Domain Modeling","summary":" Every day, countless surgeries are performed worldwide, each within the\ndistinct settings of operating rooms (ORs) that vary not only in their setups\nbut also in the personnel, tools, and equipment used. This inherent diversity\nposes a substantial challenge for achieving a holistic understanding of the OR,\nas it requires models to generalize beyond their initial training datasets. To\nreduce this gap, we introduce ORacle, an advanced vision-language model\ndesigned for holistic OR domain modeling, which incorporates multi-view and\ntemporal capabilities and can leverage external knowledge during inference,\nenabling it to adapt to previously unseen surgical scenarios. This capability\nis further enhanced by our novel data augmentation framework, which\nsignificantly diversifies the training dataset, ensuring ORacle's proficiency\nin applying the provided knowledge effectively. In rigorous testing, in scene\ngraph generation, and downstream tasks on the 4D-OR dataset, ORacle not only\ndemonstrates state-of-the-art performance but does so requiring less data than\nexisting models. Furthermore, its adaptability is displayed through its ability\nto interpret unseen views, actions, and appearances of tools and equipment.\nThis demonstrates ORacle's potential to significantly enhance the scalability\nand affordability of OR domain modeling and opens a pathway for future\nadvancements in surgical data science. We will release our code and data upon\nacceptance.\n","authors":["Ege Özsoy","Chantal Pellegrini","Matthias Keicher","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2404.07031v1.pdf","comment":"11 pages, 3 figures, 7 tables"},{"id":"http://arxiv.org/abs/2404.07029v1","updated":"2024-04-10T14:22:16Z","published":"2024-04-10T14:22:16Z","title":"Diffusion-based inpainting of incomplete Euclidean distance matrices of\n trajectories generated by a fractional Brownian motion","summary":" Fractional Brownian trajectories (fBm) feature both randomness and strong\nscale-free correlations, challenging generative models to reproduce the\nintrinsic memory characterizing the underlying process. Here we test a\ndiffusion probabilistic model on a specific dataset of corrupted images\ncorresponding to incomplete Euclidean distance matrices of fBm at various\nmemory exponents $H$. 
Our dataset implies uniqueness of the data imputation in\nthe regime of low missing ratio, where the remaining partial graph is rigid,\nproviding the ground truth for the inpainting. We find that the conditional\ndiffusion generation stably reproduces the statistics of missing\nfBm-distributed distances for different values of the $H$ exponent. Furthermore,\nwhile diffusion models have been recently shown to remember samples from the\ntraining database, we show that diffusion-based inpainting behaves\nqualitatively differently from the database search with the increasing database\nsize. Finally, we apply our fBm-trained diffusion model with $H=1/3$ for\ncompletion of chromosome distance matrices obtained in single-cell microscopy\nexperiments, showing its superiority over the standard bioinformatics\nalgorithms. Our source code is available on GitHub at\nhttps://github.com/alobashev/diffusion_fbm.\n","authors":["Alexander Lobashev","Kirill Polovnikov"],"pdf_url":"https://arxiv.org/pdf/2404.07029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10144v4","updated":"2024-04-10T13:58:08Z","published":"2023-12-15T19:00:07Z","title":"Data-Efficient Multimodal Fusion on a Single GPU","summary":" The goal of multimodal alignment is to learn a single latent space that is\nshared between multimodal inputs. The most powerful models in this space have\nbeen trained using massive datasets of paired inputs and large-scale\ncomputational resources, making them prohibitively expensive to train in many\npractical scenarios. We surmise that existing unimodal encoders pre-trained on\nlarge amounts of unimodal data should provide an effective bootstrap to create\nmultimodal models from unimodal ones at much lower costs. We therefore propose\nFuseMix, a multimodal augmentation scheme that operates on the latent spaces of\narbitrary pre-trained unimodal encoders. Using FuseMix for multimodal\nalignment, we achieve competitive performance -- and in certain cases\noutperform state-of-the-art methods -- in both image-text and audio-text\nretrieval, with orders of magnitude less compute and data: for example, we\noutperform CLIP on the Flickr30K text-to-image retrieval task with $\\sim \\!\n600\\times$ fewer GPU days and $\\sim \\! 80\\times$ fewer image-text pairs.\nAdditionally, we show how our method can be applied to convert pre-trained\ntext-to-image generative models into audio-to-image ones. Code is available at:\nhttps://github.com/layer6ai-labs/fusemix.\n","authors":["Noël Vouitsis","Zhaoyan Liu","Satya Krishna Gorti","Valentin Villecroze","Jesse C. Cresswell","Guangwei Yu","Gabriel Loaiza-Ganem","Maksims Volkovs"],"pdf_url":"https://arxiv.org/pdf/2312.10144v4.pdf","comment":"CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2307.12256v2","updated":"2024-04-10T13:43:54Z","published":"2023-07-23T08:02:37Z","title":"Building-road Collaborative Extraction from Remotely Sensed Images via\n Cross-Interaction","summary":" Buildings are the basic carriers of social production and human life; roads\nare the links that interconnect social networks. Building and road information\nhas important application value in the frontier fields of regional coordinated\ndevelopment, disaster prevention, auto-driving, etc. Mapping buildings and\nroads from very high-resolution (VHR) remote sensing images has become a hot\nresearch topic. However, the existing methods often ignore the strong spatial\ncorrelation between roads and buildings and extract them in isolation. 
To fully\nutilize the complementary advantages between buildings and roads, we propose a\nbuilding-road collaborative extraction method based on multi-task and\ncross-scale feature interaction to improve the accuracy of both tasks in a\ncomplementary way. A multi-task interaction module is proposed to exchange\ninformation across tasks and preserve the unique information of each task,\nwhich tackles the seesaw phenomenon in multitask learning. By considering the\nvariation in appearance and structure between buildings and roads, a\ncross-scale interaction module is designed to automatically learn the optimal\nreceptive field for different tasks. Compared with many existing methods that\ntrain each task individually, the proposed collaborative extraction method can\nutilize the complementary advantages between buildings and roads by the\nproposed inter-task and inter-scale feature interactions, and automatically\nselect the optimal receptive field for different tasks. Experiments on a wide\nrange of urban and rural scenarios show that the proposed algorithm can achieve\nbuilding-road extraction with outstanding performance and efficiency.\n","authors":["Haonan Guo","Xin Su","Chen Wu","Bo Du","Liangpei Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12256v2.pdf","comment":"IEEE Transactions on Geoscience and Remote Sensing"},{"id":"http://arxiv.org/abs/2312.07937v5","updated":"2024-04-10T13:35:51Z","published":"2023-12-13T07:30:19Z","title":"BOTH2Hands: Inferring 3D Hands from Both Text Prompts and Body Dynamics","summary":" The recently emerging text-to-motion advances have inspired numerous attempts\nfor convenient and interactive human motion generation. Yet, existing methods\nare largely limited to generating body motions only without considering the\nrich two-hand motions, let alone handling various conditions like body dynamics\nor texts. To break the data bottleneck, we propose BOTH57M, a novel multi-modal\ndataset for two-hand motion generation. Our dataset includes accurate motion\ntracking for the human body and hands and provides pair-wise finger-level hand\nannotations and body descriptions. We further provide a strong baseline method,\nBOTH2Hands, for the novel task: generating vivid two-hand motions from both\nimplicit body dynamics and explicit text prompts. We first warm up two parallel\nbody-to-hand and text-to-hand diffusion models and then utilize the\ncross-attention transformer for motion blending. Extensive experiments and\ncross-validations demonstrate the effectiveness of our approach and dataset for\ngenerating convincing two-hand motions from the hybrid body-and-textual\nconditions. Our dataset and code will be disseminated to the community for\nfuture research.\n","authors":["Wenqian Zhang","Molin Huang","Yuxuan Zhou","Juze Zhang","Jingyi Yu","Jingya Wang","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2312.07937v5.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05317v2","updated":"2024-04-10T13:30:09Z","published":"2024-04-08T09:08:43Z","title":"WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A\n Conceptual Architecture","summary":" This work proposes a WebXR-based cross-platform conceptual architecture,\nleveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate\nthe development of an open, accessible, and interoperable metaverse. 
By\nintroducing the concept of spatial web app, this research contributes to the\ndiscourse on the metaverse, offering an architecture that democratizes access\nto virtual environments and extended reality through the web, and aligns with\nTim Berners-Lee's original vision of the World Wide Web as an open platform in\nthe digital realm.\n","authors":["Giuseppe Macario"],"pdf_url":"https://arxiv.org/pdf/2404.05317v2.pdf","comment":"minor fixes (typos, URLs etc.)"},{"id":"http://arxiv.org/abs/2309.06067v6","updated":"2024-04-10T13:17:52Z","published":"2023-09-12T09:07:03Z","title":"Implicit Neural Representation for MRI Parallel Imaging Reconstruction","summary":" Magnetic resonance imaging (MRI) usually faces lengthy acquisition times,\nprompting the exploration of strategies such as parallel imaging (PI) to\nalleviate this problem by periodically skipping specific K-space lines and\nsubsequently reconstructing high-quality images from the undersampled K-space.\nImplicit neural representation (INR) has recently emerged as a promising deep\nlearning technique, characterizing objects as continuous functions of spatial\ncoordinates typically parameterized by a multilayer perceptron (MLP). In this\nstudy, we propose a novel MRI PI reconstruction method that uses INR. Our\napproach represents reconstructed fully-sampled images as functions of voxel\ncoordinates and prior feature vectors from undersampled images, addressing the\ngeneralization challenges of INR. Specifically, we introduce a scale-embedded\nencoder to generate scale-independent, voxel-specific features from MR images\nacross various undersampling scales. These features are then concatenated with\ncoordinate vectors to reconstruct fully-sampled MR images, facilitating\nmultiple-scale reconstructions. To evaluate our method's performance, we\nconducted experiments using publicly available MRI datasets, comparing it with\nalternative reconstruction techniques. Our quantitative assessment demonstrates\nthe superiority of our proposed method.\n","authors":["Hao Li","Yusheng Zhou","Jianan Liu","Xiling Liu","Tao Huang","Zhihan Lv","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2309.06067v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12220v2","updated":"2024-04-10T13:15:41Z","published":"2023-07-23T03:55:13Z","title":"Expediting Building Footprint Extraction from High-resolution Remote\n Sensing Images via progressive lenient supervision","summary":" The efficacy of building footprint segmentation from remotely sensed images\nhas been hindered by model transfer effectiveness. Many existing building\nsegmentation methods were developed upon the encoder-decoder architecture of\nU-Net, in which the encoder is finetuned from the newly developed backbone\nnetworks that are pre-trained on ImageNet. However, the heavy computational\nburden of the existing decoder designs hampers the successful transfer of these\nmodern encoder networks to remote sensing tasks. Even the widely-adopted deep\nsupervision strategy fails to mitigate these challenges due to its invalid loss\nin hybrid regions where foreground and background pixels are intermixed. 
In\nthis paper, we conduct a comprehensive evaluation of existing decoder network\ndesigns for building footprint segmentation and propose an efficient framework\ndenoted as BFSeg to enhance learning efficiency and effectiveness.\nSpecifically, a densely-connected coarse-to-fine feature fusion decoder network\nthat facilitates easy and fast feature fusion across scales is proposed.\nMoreover, considering the invalidity of hybrid regions in the down-sampled\nground truth during the deep supervision process, we present a lenient deep\nsupervision and distillation strategy that enables the network to learn proper\nknowledge from deep supervision. Building upon these advancements, we have\ndeveloped a new family of building segmentation networks, which consistently\nsurpass prior works with outstanding performance and efficiency across a wide\nrange of newly developed encoder networks.\n","authors":["Haonan Guo","Bo Du","Chen Wu","Xin Su","Liangpei Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12220v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06991v1","updated":"2024-04-10T13:10:52Z","published":"2024-04-10T13:10:52Z","title":"Ray-driven Spectral CT Reconstruction Based on Neural Base-Material\n Fields","summary":" In spectral CT reconstruction, the basis materials decomposition involves\nsolving a large-scale nonlinear system of integral equations, which is highly\nill-posed mathematically. This paper proposes a model that parameterizes the\nattenuation coefficients of the object using a neural field representation,\nthereby avoiding the complex calculations of pixel-driven projection\ncoefficient matrices during the discretization process of line integrals. It\nintroduces a lightweight discretization method for line integrals based on a\nray-driven neural field, enhancing the accuracy of the integral approximation\nduring the discretization process. The basis materials are represented as\ncontinuous vector-valued implicit functions to establish a neural field\nparameterization model for the basis materials. The auto-differentiation\nframework of deep learning is then used to solve the implicit continuous\nfunction of the neural base-material fields. This method is not limited by the\nspatial resolution of reconstructed images, and the network has compact and\nregular properties. Experimental validation shows that our method performs\nexceptionally well in addressing the spectral CT reconstruction. Additionally,\nit fulfils the requirements for the generation of high-resolution\nreconstruction images.\n","authors":["Ligen Shi","Chang Liu","Ping Yang","Jun Qiu","Xing Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.06991v1.pdf","comment":"14 pages,16 figures"},{"id":"http://arxiv.org/abs/2404.01563v2","updated":"2024-04-10T13:02:59Z","published":"2024-04-02T01:57:08Z","title":"Two-Phase Multi-Dose-Level PET Image Reconstruction with Dose Level\n Awareness","summary":" To obtain high-quality positron emission tomography (PET) while minimizing\nradiation exposure, a range of methods have been designed to reconstruct\nstandard-dose PET (SPET) from corresponding low-dose PET (LPET) images.\nHowever, most current methods merely learn the mapping between\nsingle-dose-level LPET and SPET images, but omit the dose disparity of LPET\nimages in clinical scenarios. 
In this paper, to reconstruct high-quality SPET\nimages from multi-dose-level LPET images, we design a novel two-phase\nmulti-dose-level PET reconstruction algorithm with dose level awareness,\ncontaining a pre-training phase and a SPET prediction phase. Specifically, the\npre-training phase is devised to explore both fine-grained discriminative\nfeatures and effective semantic representation. The SPET prediction phase\nadopts a coarse prediction network utilizing pre-learned dose level prior to\ngenerate preliminary result, and a refinement network to precisely preserve the\ndetails. Experiments on MICCAI 2022 Ultra-low Dose PET Imaging Challenge\nDataset have demonstrated the superiority of our method.\n","authors":["Yuchen Fei","Yanmei Luo","Yan Wang","Jiaqi Cui","Yuanyuan Xu","Jiliu Zhou","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2404.01563v2.pdf","comment":"Accepted by ISBI2024"},{"id":"http://arxiv.org/abs/2404.06033v2","updated":"2024-04-10T12:55:49Z","published":"2024-04-09T05:44:00Z","title":"Little Strokes Fell Great Oaks: Boosting the Hierarchical Features for\n Multi-exposure Image Fusion","summary":" In recent years, deep learning networks have made remarkable strides in the\ndomain of multi-exposure image fusion. Nonetheless, prevailing approaches often\ninvolve directly feeding over-exposed and under-exposed images into the\nnetwork, which leads to the under-utilization of inherent information present\nin the source images. Additionally, unsupervised techniques predominantly\nemploy rudimentary weighted summation for color channel processing, culminating\nin an overall desaturated final image tone. To partially mitigate these issues,\nthis study proposes a gamma correction module specifically designed to fully\nleverage latent information embedded within source images. Furthermore, a\nmodified transformer block, embracing with self-attention mechanisms, is\nintroduced to optimize the fusion process. Ultimately, a novel color\nenhancement algorithm is presented to augment image saturation while preserving\nintricate details. The source code is available at\nhttps://github.com/ZhiyingDu/BHFMEF.\n","authors":["Pan Mu","Zhiying Du","Jinyuan Liu","Cong Bai"],"pdf_url":"https://arxiv.org/pdf/2404.06033v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02265v2","updated":"2024-04-10T12:54:12Z","published":"2023-10-03T17:59:58Z","title":"DREAM: Visual Decoding from Reversing Human Visual System","summary":" In this work we present DREAM, an fMRI-to-image method for reconstructing\nviewed images from brain activities, grounded on fundamental knowledge of the\nhuman visual system. We craft reverse pathways that emulate the hierarchical\nand parallel nature of how humans perceive the visual world. These tailored\npathways are specialized to decipher semantics, color, and depth cues from fMRI\ndata, mirroring the forward pathways from visual stimuli to fMRI recordings. To\ndo so, two components mimic the inverse processes within the human visual\nsystem: the Reverse Visual Association Cortex (R-VAC) which reverses pathways\nof this brain region, extracting semantics from fMRI data; the Reverse Parallel\nPKM (R-PKM) component simultaneously predicting color and depth from fMRI\nsignals. The experiments indicate that our method outperforms the current\nstate-of-the-art models in terms of the consistency of appearance, structure,\nand semantics. 
Code will be made publicly available to facilitate further\nresearch in this field.\n","authors":["Weihao Xia","Raoul de Charette","Cengiz Öztireli","Jing-Hao Xue"],"pdf_url":"https://arxiv.org/pdf/2310.02265v2.pdf","comment":"Project Page: https://weihaox.github.io/DREAM"},{"id":"http://arxiv.org/abs/2404.06977v1","updated":"2024-04-10T12:45:27Z","published":"2024-04-10T12:45:27Z","title":"Accurate Tennis Court Line Detection on Amateur Recorded Matches","summary":" Typically, tennis court line detection is done by running\nHough-Line-Detection to find straight lines in the image, and then computing a\ntransformation matrix from the detected lines to create the final court\nstructure. We propose numerous improvements and enhancements to this algorithm,\nincluding using pretrained State-of-the-Art shadow-removal and object-detection\nML models to make our line-detection more robust. Compared to the original\nalgorithm, our method can accurately detect lines on amateur, dirty courts.\nWhen combined with a robust ball-tracking system, our method will enable\naccurate, automatic refereeing for amateur and professional tennis matches\nalike.\n","authors":["Sameer Agrawal","Ragoth Sundararajan","Vishak Sagar"],"pdf_url":"https://arxiv.org/pdf/2404.06977v1.pdf","comment":"Accepted to 5th International conference on Image, Video Processing\n and Artificial Intelligence"},{"id":"http://arxiv.org/abs/2404.06971v1","updated":"2024-04-10T12:31:43Z","published":"2024-04-10T12:31:43Z","title":"TrajPRed: Trajectory Prediction with Region-based Relation Learning","summary":" Forecasting human trajectories in traffic scenes is critical for safety\nwithin mixed or fully autonomous systems. Human future trajectories are driven\nby two major stimuli, social interactions, and stochastic goals. Thus, reliable\nforecasting needs to capture these two stimuli. Edge-based relation modeling\nrepresents social interactions using pairwise correlations from precise\nindividual states. Nevertheless, edge-based relations can be vulnerable under\nperturbations. To alleviate these issues, we propose a region-based relation\nlearning paradigm that models social interactions via region-wise dynamics of\njoint states, i.e., the changes in the density of crowds. In particular,\nregion-wise agent joint information is encoded within convolutional feature\ngrids. Social relations are modeled by relating the temporal changes of local\njoint information from a global perspective. We show that region-based\nrelations are less susceptible to perturbations. In order to account for the\nstochastic individual goals, we exploit a conditional variational autoencoder\nto realize multi-goal estimation and diverse future prediction. Specifically,\nwe perform variational inference via the latent distribution, which is\nconditioned on the correlation between input states and associated target\ngoals. Sampling from the latent distribution enables the framework to reliably\ncapture the stochastic behavior in test data. We integrate multi-goal\nestimation and region-based relation learning to model the two stimuli, social\ninteractions, and stochastic goals, in a prediction framework. We evaluate our\nframework on the ETH-UCY dataset and Stanford Drone Dataset (SDD). We show that\nthe diverse prediction better fits the ground truth when incorporating the\nrelation module. 
Our framework outperforms the state-of-the-art models on SDD\nby $27.61\%$/$18.20\%$ in ADE/FDE metrics.\n","authors":["Chen Zhou","Ghassan AlRegib","Armin Parchami","Kunjan Singh"],"pdf_url":"https://arxiv.org/pdf/2404.06971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06963v1","updated":"2024-04-10T12:22:19Z","published":"2024-04-10T12:22:19Z","title":"V-MAD: Video-based Morphing Attack Detection in Operational Scenarios","summary":" In response to the rising threat of the face morphing attack, this paper\nintroduces and explores the potential of Video-based Morphing Attack Detection\n(V-MAD) systems in real-world operational scenarios. While current morphing\nattack detection methods primarily focus on a single or a pair of images, V-MAD\nis based on video sequences, exploiting the video streams often acquired by\nface verification tools available, for instance, at airport gates. Through this\nstudy, we show for the first time the advantages that the availability of\nmultiple probe frames can bring to the morphing attack detection task,\nespecially in scenarios where the quality of probe images is varied and might\nbe affected, for instance, by pose or illumination variations. Experimental\nresults on a real operational database demonstrate that video sequences\nrepresent valuable information for increasing the robustness and performance of\nmorphing attack detection systems.\n","authors":["Guido Borghi","Annalisa Franco","Nicolò Di Domenico","Matteo Ferrara","Davide Maltoni"],"pdf_url":"https://arxiv.org/pdf/2404.06963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06957v1","updated":"2024-04-10T12:17:25Z","published":"2024-04-10T12:17:25Z","title":"Adversarial purification for no-reference image-quality metrics:\n applicability study and new methods","summary":" Recently, the area of adversarial attacks on image quality metrics has begun\nto be explored, whereas the area of defences remains under-researched. In this\nstudy, we aim to cover that case and check the transferability of adversarial\npurification defences from image classifiers to IQA methods. In this paper, we\napply several widespread attacks on IQA models and examine the success of the\ndefences against them. The purification methodologies covered different\npreprocessing techniques, including geometrical transformations, compression,\ndenoising, and modern neural network-based methods. Also, we address the\nchallenge of assessing the efficacy of a defensive methodology by proposing\nways to estimate output visual quality and the success of neutralizing attacks.\nDefences were tested against attacks on three IQA metrics -- Linearity, MetaIQA\nand SPAQ. The code for attacks and defences is available at: (link is hidden\nfor a blind review).\n","authors":["Aleksandr Gushchin","Anna Chistyakova","Vladislav Minashkin","Anastasia Antsiferova","Dmitriy Vatolin"],"pdf_url":"https://arxiv.org/pdf/2404.06957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04350v3","updated":"2024-04-10T11:58:24Z","published":"2024-01-09T04:33:03Z","title":"Pre-trained Model Guided Fine-Tuning for Zero-Shot Adversarial\n Robustness","summary":" Large-scale pre-trained vision-language models like CLIP have demonstrated\nimpressive performance across various tasks, and exhibit remarkable zero-shot\ngeneralization capability, while they are also vulnerable to imperceptible\nadversarial examples. Existing works typically employ adversarial training\n(fine-tuning) as a defense method against adversarial examples. 
However, direct\napplication to the CLIP model may result in overfitting, compromising the\nmodel's capacity for generalization. In this paper, we propose Pre-trained\nModel Guided Adversarial Fine-Tuning (PMG-AFT) method, which leverages\nsupervision from the original pre-trained model by carefully designing an\nauxiliary branch, to enhance the model's zero-shot adversarial robustness.\nSpecifically, PMG-AFT minimizes the distance between the features of\nadversarial examples in the target model and those in the pre-trained model,\naiming to preserve the generalization features already captured by the\npre-trained model. Extensive Experiments on 15 zero-shot datasets demonstrate\nthat PMG-AFT significantly outperforms the state-of-the-art method, improving\nthe top-1 robust accuracy by an average of 4.99%. Furthermore, our approach\nconsistently improves clean accuracy by an average of 8.72%. Our code is\navailable at\nhttps://github.com/serendipity1122/Pre-trained-Model-Guided-Fine-Tuning-for-Zero-Shot-Adversarial-Robustness.\n","authors":["Sibo Wang","Jie Zhang","Zheng Yuan","Shiguang Shan"],"pdf_url":"https://arxiv.org/pdf/2401.04350v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2312.06275v3","updated":"2024-04-10T11:49:05Z","published":"2023-12-11T10:26:21Z","title":"DG-TTA: Out-of-domain medical image segmentation through Domain\n Generalization and Test-Time Adaptation","summary":" Applying pre-trained medical segmentation models on out-of-domain images\noften yields predictions of insufficient quality. Several strategies have been\nproposed to maintain model performance, such as finetuning or unsupervised- and\nsource-free domain adaptation. These strategies set restrictive requirements\nfor data availability. In this study, we propose to combine domain\ngeneralization and test-time adaptation to create a highly effective approach\nfor reusing pre-trained models in unseen target domains. Domain-generalized\npre-training on source data is used to obtain the best initial performance in\nthe target domain. We introduce the MIND descriptor previously used in image\nregistration tasks as a further technique to achieve generalization and present\nsuperior performance for small-scale datasets compared to existing approaches.\nAt test-time, high-quality segmentation for every single unseen scan is ensured\nby optimizing the model weights for consistency given different image\naugmentations. That way, our method enables separate use of source and target\ndata and thus removes current data availability barriers. Moreover, the\npresented method is highly modular as it does not require specific model\narchitectures or prior knowledge of involved domains and labels. We demonstrate\nthis by integrating it into the nnUNet, which is currently the most popular and\naccurate framework for medical image segmentation. We employ multiple datasets\ncovering abdominal, cardiac, and lumbar spine scans and compose several\nout-of-domain scenarios in this study. We demonstrate that our method, combined\nwith pre-trained whole-body CT models, can effectively segment MR images with\nhigh accuracy in all of the aforementioned scenarios. Open-source code can be\nfound here: https://github.com/multimodallearning/DG-TTA\n","authors":["Christian Weihsbach","Christian N. Kruse","Alexander Bigalke","Mattias P. 
Heinrich"],"pdf_url":"https://arxiv.org/pdf/2312.06275v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06941v1","updated":"2024-04-10T11:47:51Z","published":"2024-04-10T11:47:51Z","title":"Accelerating Cardiac MRI Reconstruction with CMRatt: An Attention-Driven\n Approach","summary":" Cine cardiac magnetic resonance (CMR) imaging is recognised as the benchmark\nmodality for the comprehensive assessment of cardiac function. Nevertheless,\nthe acquisition process of cine CMR is considered as an impediment due to its\nprolonged scanning time. One commonly used strategy to expedite the acquisition\nprocess is through k-space undersampling, though it comes with a drawback of\nintroducing aliasing effects in the reconstructed image. Lately, deep\nlearning-based methods have shown remarkable results over traditional\napproaches in rapidly achieving precise CMR reconstructed images. This study\naims to explore the untapped potential of attention mechanisms incorporated\nwith a deep learning model within the context of the CMR reconstruction\nproblem. We are motivated by the fact that attention has proven beneficial in\ndownstream tasks such as image classification and segmentation, but has not\nbeen systematically analysed in the context of CMR reconstruction. Our primary\ngoal is to identify the strengths and potential limitations of attention\nalgorithms when integrated with a convolutional backbone model such as a U-Net.\nTo achieve this, we benchmark different state-of-the-art spatial and channel\nattention mechanisms on the CMRxRecon dataset and quantitatively evaluate the\nquality of reconstruction using objective metrics. Furthermore, inspired by the\nbest performing attention mechanism, we propose a new, simple yet effective,\nattention pipeline specifically optimised for the task of cardiac image\nreconstruction that outperforms other state-of-the-art attention methods. The\nlayer and model code will be made publicly available.\n","authors":["Anam Hashmi","Julia Dietlmeier","Kathleen M. Curran","Noel E. O'Connor"],"pdf_url":"https://arxiv.org/pdf/2404.06941v1.pdf","comment":"This paper has been submitted for the 32nd European Signal Processing\n Conference EUSIPCO 2024 in Lyon"},{"id":"http://arxiv.org/abs/2306.10798v3","updated":"2024-04-10T11:42:22Z","published":"2023-06-19T09:38:21Z","title":"ExpPoint-MAE: Better interpretability and performance for\n self-supervised point cloud transformers","summary":" In this paper we delve into the properties of transformers, attained through\nself-supervision, in the point cloud domain. Specifically, we evaluate the\neffectiveness of Masked Autoencoding as a pretraining scheme, and explore\nMomentum Contrast as an alternative. In our study we investigate the impact of\ndata quantity on the learned features, and uncover similarities in the\ntransformer's behavior across domains. Through comprehensive visualiations, we\nobserve that the transformer learns to attend to semantically meaningful\nregions, indicating that pretraining leads to a better understanding of the\nunderlying geometry. Moreover, we examine the finetuning process and its effect\non the learned representations. 
Based on that, we devise an unfreezing strategy\nwhich consistently outperforms our baseline without introducing any other\nmodifications to the model or the training pipeline, and achieve\nstate-of-the-art results in the classification task among transformer models.\n","authors":["Ioannis Romanelis","Vlassis Fotis","Konstantinos Moustakas","Adrian Munteanu"],"pdf_url":"https://arxiv.org/pdf/2306.10798v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06936v1","updated":"2024-04-10T11:40:02Z","published":"2024-04-10T11:40:02Z","title":"Efficient and Generic Point Model for Lossless Point Cloud Attribute\n Compression","summary":" The past several years have witnessed the emergence of learned point cloud\ncompression (PCC) techniques. However, current learning-based lossless point\ncloud attribute compression (PCAC) methods either suffer from high\ncomputational complexity or deteriorated compression performance. Moreover, the\nsignificant variations in point cloud scale and sparsity encountered in\nreal-world applications make developing an all-in-one neural model a\nchallenging task. In this paper, we propose PoLoPCAC, an efficient and generic\nlossless PCAC method that achieves high compression efficiency and strong\ngeneralizability simultaneously. We formulate lossless PCAC as the task of\ninferring explicit distributions of attributes from group-wise autoregressive\npriors. A progressive random grouping strategy is first devised to efficiently\nresolve the point cloud into groups, and then the attributes of each group are\nmodeled sequentially from accumulated antecedents. A locality-aware attention\nmechanism is utilized to exploit prior knowledge from context windows in\nparallel. Since our method directly operates on points, it naturally avoids\ndistortion caused by voxelization, and can be executed on point clouds with\narbitrary scale and density. Experiments show that our method can be instantly\ndeployed once trained on a Synthetic 2k-ShapeNet dataset while enjoying\ncontinuous bit-rate reduction over the latest G-PCCv23 on various datasets\n(ShapeNet, ScanNet, MVUB, 8iVFB). Meanwhile, our method reports shorter coding\ntime than G-PCCv23 on the majority of sequences with a lightweight model size\n(2.6MB), which is highly attractive for practical applications. Dataset, code\nand trained model are available at\nhttps://github.com/I2-Multimedia-Lab/PoLoPCAC.\n","authors":["Kang You","Pan Gao","Zhan Ma"],"pdf_url":"https://arxiv.org/pdf/2404.06936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06918v1","updated":"2024-04-10T11:10:50Z","published":"2024-04-10T11:10:50Z","title":"HRVDA: High-Resolution Visual Document Assistant","summary":" Leveraging vast training data, multimodal large language models (MLLMs) have\ndemonstrated formidable general visual comprehension capabilities and achieved\nremarkable performance across various tasks. However, their performance in\nvisual document understanding still leaves much room for improvement. This\ndiscrepancy is primarily attributed to the fact that visual document\nunderstanding is a fine-grained prediction task. In natural scenes, MLLMs\ntypically use low-resolution images, leading to a substantial loss of visual\ninformation. Furthermore, general-purpose MLLMs do not excel in handling\ndocument-oriented instructions. In this paper, we propose a High-Resolution\nVisual Document Assistant (HRVDA), which bridges the gap between MLLMs and\nvisual document understanding. 
This model employs a content filtering mechanism\nand an instruction filtering module to separately filter out the\ncontent-agnostic visual tokens and instruction-agnostic visual tokens, thereby\nachieving efficient model training and inference for high-resolution images. In\naddition, we construct a document-oriented visual instruction tuning dataset\nand apply a multi-stage training strategy to enhance the model's document\nmodeling capabilities. Extensive experiments demonstrate that our model\nachieves state-of-the-art performance across multiple document understanding\ndatasets, while maintaining training efficiency and inference speed comparable\nto low-resolution models.\n","authors":["Chaohu Liu","Kun Yin","Haoyu Cao","Xinghua Jiang","Xin Li","Yinsong Liu","Deqiang Jiang","Xing Sun","Linli Xu"],"pdf_url":"https://arxiv.org/pdf/2404.06918v1.pdf","comment":"Accepted to CVPR 2024 main conference"},{"id":"http://arxiv.org/abs/2404.06913v1","updated":"2024-04-10T11:06:29Z","published":"2024-04-10T11:06:29Z","title":"Sparse Global Matching for Video Frame Interpolation with Large Motion","summary":" Large motion poses a critical challenge in Video Frame Interpolation (VFI)\ntask. Existing methods are often constrained by limited receptive fields,\nresulting in sub-optimal performance when handling scenarios with large motion.\nIn this paper, we introduce a new pipeline for VFI, which can effectively\nintegrate global-level information to alleviate issues associated with large\nmotion. Specifically, we first estimate a pair of initial intermediate flows\nusing a high-resolution feature map for extracting local details. Then, we\nincorporate a sparse global matching branch to compensate for flow estimation,\nwhich consists of identifying flaws in initial flows and generating sparse flow\ncompensation with a global receptive field. Finally, we adaptively merge the\ninitial flow estimation with global flow compensation, yielding a more accurate\nintermediate flow. To evaluate the effectiveness of our method in handling\nlarge motion, we carefully curate a more challenging subset from commonly used\nbenchmarks. Our method demonstrates the state-of-the-art performance on these\nVFI subsets with large motion.\n","authors":["Chunxu Liu","Guozhen Zhang","Rui Zhao","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06913v1.pdf","comment":"Accepted by CVPR 2024. Project page: https://sgm-vfi.github.io/"},{"id":"http://arxiv.org/abs/2306.00977v4","updated":"2024-04-10T10:56:00Z","published":"2023-06-01T17:59:10Z","title":"AGILE3D: Attention Guided Interactive Multi-object 3D Segmentation","summary":" During interactive segmentation, a model and a user work together to\ndelineate objects of interest in a 3D point cloud. In an iterative process, the\nmodel assigns each data point to an object (or the background), while the user\ncorrects errors in the resulting segmentation and feeds them back into the\nmodel. The current best practice formulates the problem as binary\nclassification and segments objects one at a time. The model expects the user\nto provide positive clicks to indicate regions wrongly assigned to the\nbackground and negative clicks on regions wrongly assigned to the object.\nSequentially visiting objects is wasteful since it disregards synergies between\nobjects: a positive click for a given object can, by definition, serve as a\nnegative click for nearby objects. Moreover, a direct competition between\nadjacent objects can speed up the identification of their common boundary. 
We\nintroduce AGILE3D, an efficient, attention-based model that (1) supports\nsimultaneous segmentation of multiple 3D objects, (2) yields more accurate\nsegmentation masks with fewer user clicks, and (3) offers faster inference. Our\ncore idea is to encode user clicks as spatial-temporal queries and enable\nexplicit interactions between click queries as well as between them and the 3D\nscene through a click attention module. Every time new clicks are added, we\nonly need to run a lightweight decoder that produces updated segmentation\nmasks. In experiments with four different 3D point cloud datasets, AGILE3D sets\na new state-of-the-art. Moreover, we also verify its practicality in real-world\nsetups with real user studies.\n","authors":["Yuanwen Yue","Sabarinath Mahadevan","Jonas Schult","Francis Engelmann","Bastian Leibe","Konrad Schindler","Theodora Kontogianni"],"pdf_url":"https://arxiv.org/pdf/2306.00977v4.pdf","comment":"ICLR 2024 camera-ready. Project page: https://ywyue.github.io/AGILE3D"},{"id":"http://arxiv.org/abs/2404.06903v1","updated":"2024-04-10T10:46:59Z","published":"2024-04-10T10:46:59Z","title":"DreamScene360: Unconstrained Text-to-3D Scene Generation with Panoramic\n Gaussian Splatting","summary":" The increasing demand for virtual reality applications has highlighted the\nsignificance of crafting immersive 3D assets. We present a text-to-3D\n360$^{\\circ}$ scene generation pipeline that facilitates the creation of\ncomprehensive 360$^{\\circ}$ scenes for in-the-wild environments in a matter of\nminutes. Our approach utilizes the generative power of a 2D diffusion model and\nprompt self-refinement to create a high-quality and globally coherent panoramic\nimage. This image acts as a preliminary \"flat\" (2D) scene representation.\nSubsequently, it is lifted into 3D Gaussians, employing splatting techniques to\nenable real-time exploration. To produce consistent 3D geometry, our pipeline\nconstructs a spatially coherent structure by aligning the 2D monocular depth\ninto a globally optimized point cloud. This point cloud serves as the initial\nstate for the centroids of 3D Gaussians. In order to address invisible issues\ninherent in single-view inputs, we impose semantic and geometric constraints on\nboth synthesized and input camera views as regularizations. These guide the\noptimization of Gaussians, aiding in the reconstruction of unseen regions. In\nsummary, our method offers a globally consistent 3D scene within a\n360$^{\\circ}$ perspective, providing an enhanced immersive experience over\nexisting techniques. 
Project website at: http://dreamscene360.github.io/\n","authors":["Shijie Zhou","Zhiwen Fan","Dejia Xu","Haoran Chang","Pradyumna Chari","Tejas Bharadwaj","Suya You","Zhangyang Wang","Achuta Kadambi"],"pdf_url":"https://arxiv.org/pdf/2404.06903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12796v2","updated":"2024-04-10T10:37:22Z","published":"2023-11-21T18:59:58Z","title":"Physics-guided Shape-from-Template: Monocular Video Perception through\n Neural Surrogate Models","summary":" 3D reconstruction of dynamic scenes is a long-standing problem in computer\ngraphics and increasingly difficult the less information is available.\nShape-from-Template (SfT) methods aim to reconstruct a template-based geometry\nfrom RGB images or video sequences, often leveraging just a single monocular\ncamera without depth information, such as regular smartphone recordings.\nUnfortunately, existing reconstruction methods are either unphysical and noisy\nor slow in optimization. To solve this problem, we propose a novel SfT\nreconstruction algorithm for cloth using a pre-trained neural surrogate model\nthat is fast to evaluate, stable, and produces smooth reconstructions due to a\nregularizing physics simulation. Differentiable rendering of the simulated mesh\nenables pixel-wise comparisons between the reconstruction and a target video\nsequence that can be used for a gradient-based optimization procedure to\nextract not only shape information but also physical parameters such as\nstretching, shearing, or bending stiffness of the cloth. This allows to retain\na precise, stable, and smooth reconstructed geometry while reducing the runtime\nby a factor of 400-500 compared to $\\phi$-SfT, a state-of-the-art physics-based\nSfT approach.\n","authors":["David Stotko","Nils Wandel","Reinhard Klein"],"pdf_url":"https://arxiv.org/pdf/2311.12796v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06894v1","updated":"2024-04-10T10:36:15Z","published":"2024-04-10T10:36:15Z","title":"O-TALC: Steps Towards Combating Oversegmentation within Online Action\n Segmentation","summary":" Online temporal action segmentation shows a strong potential to facilitate\nmany HRI tasks where extended human action sequences must be tracked and\nunderstood in real time. Traditional action segmentation approaches, however,\noperate in an offline two stage approach, relying on computationally expensive\nvideo wide features for segmentation, rendering them unsuitable for online HRI\napplications. In order to facilitate online action segmentation on a stream of\nincoming video data, we introduce two methods for improved training and\ninference of backbone action recognition models, allowing them to be deployed\ndirectly for online frame level classification. Firstly, we introduce surround\ndense sampling whilst training to facilitate training vs. inference clip\nmatching and improve segment boundary predictions. Secondly, we introduce an\nOnline Temporally Aware Label Cleaning (O-TALC) strategy to explicitly reduce\noversegmentation during online inference. As our methods are backbone\ninvariant, they can be deployed with computationally efficient spatio-temporal\naction recognition models capable of operating in real time with a small\nsegmentation latency. We show our method outperforms similar online action\nsegmentation work as well as matches the performance of many offline models\nwith access to full temporal resolution when operating on challenging\nfine-grained datasets.\n","authors":["Matthew Kent Myers","Nick Wright","A. 
Stephen McGough","Nicholas Martin"],"pdf_url":"https://arxiv.org/pdf/2404.06894v1.pdf","comment":"5 pages, 3 figures. Accepted as a short (unindexed) paper at the\n TAHRI conference"},{"id":"http://arxiv.org/abs/2404.06892v1","updated":"2024-04-10T10:34:34Z","published":"2024-04-10T10:34:34Z","title":"SparseAD: Sparse Query-Centric Paradigm for Efficient End-to-End\n Autonomous Driving","summary":" End-to-End paradigms use a unified framework to implement multi-tasks in an\nautonomous driving system. Despite simplicity and clarity, the performance of\nend-to-end autonomous driving methods on sub-tasks is still far behind the\nsingle-task methods. Meanwhile, the widely used dense BEV features in previous\nend-to-end methods make it costly to extend to more modalities or tasks. In\nthis paper, we propose a Sparse query-centric paradigm for end-to-end\nAutonomous Driving (SparseAD), where the sparse queries completely represent\nthe whole driving scenario across space, time and tasks without any dense BEV\nrepresentation. Concretely, we design a unified sparse architecture for\nperception tasks including detection, tracking, and online mapping. Moreover,\nwe revisit motion prediction and planning, and devise a more justifiable motion\nplanner framework. On the challenging nuScenes dataset, SparseAD achieves SOTA\nfull-task performance among end-to-end methods and significantly narrows the\nperformance gap between end-to-end paradigms and single-task methods. Codes\nwill be released soon.\n","authors":["Diankun Zhang","Guoan Wang","Runwen Zhu","Jianbo Zhao","Xiwu Chen","Siyu Zhang","Jiahao Gong","Qibin Zhou","Wenyuan Zhang","Ningzi Wang","Feiyang Tan","Hangning Zhou","Ziyao Xu","Haotian Yao","Chi Zhang","Xiaojun Liu","Xiaoguang Di","Bin Li"],"pdf_url":"https://arxiv.org/pdf/2404.06892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06883v1","updated":"2024-04-10T10:13:37Z","published":"2024-04-10T10:13:37Z","title":"Research on Detection of Floating Objects in River and Lake Based on AI\n Intelligent Image Recognition","summary":" With the rapid advancement of artificial intelligence technology, AI-enabled\nimage recognition has emerged as a potent tool for addressing challenges in\ntraditional environmental monitoring. This study focuses on the detection of\nfloating objects in river and lake environments, exploring an innovative\napproach based on deep learning. By intricately analyzing the technical\npathways for detecting static and dynamic features and considering the\ncharacteristics of river and lake debris, a comprehensive image acquisition and\nprocessing workflow has been developed. The study highlights the application\nand performance comparison of three mainstream deep learning models -SSD,\nFaster-RCNN, and YOLOv5- in debris identification. Additionally, a detection\nsystem for floating objects has been designed and implemented, encompassing\nboth hardware platform construction and software framework development. 
Through\nrigorous experimental validation, the proposed system has demonstrated its\nability to significantly enhance the accuracy and efficiency of debris\ndetection, thus offering a new technological avenue for water quality\nmonitoring in rivers and lakes\n","authors":["Jingyu Zhang","Ao Xiang","Yu Cheng","Qin Yang","Liyang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07887v2","updated":"2024-04-10T10:06:46Z","published":"2023-10-11T20:48:20Z","title":"Unsupervised Denoising for Signal-Dependent and Row-Correlated Imaging\n Noise","summary":" Accurate analysis of microscopy images is hindered by the presence of noise.\nThis noise is usually signal-dependent and often additionally correlated along\nrows or columns of pixels. Current self- and unsupervised denoisers can address\nsignal-dependent noise, but none can reliably remove noise that is also row- or\ncolumn-correlated. Here, we present the first fully unsupervised deep\nlearning-based denoiser capable of handling imaging noise that is\nrow-correlated as well as signal-dependent. Our approach uses a Variational\nAutoencoder (VAE) with a specially designed autoregressive decoder. This\ndecoder is capable of modeling row-correlated and signal-dependent noise but is\nincapable of independently modeling underlying clean signal. The VAE therefore\nproduces latent variables containing only clean signal information, and these\nare mapped back into image space using a proposed second decoder network. Our\nmethod does not require a pre-trained noise model and can be trained from\nscratch using unpaired noisy data. We show that our approach achieves\ncompetitive results when applied to a range of different sensor types and\nimaging modalities.\n","authors":["Benjamin Salmon","Alexander Krull"],"pdf_url":"https://arxiv.org/pdf/2310.07887v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03190v5","updated":"2024-04-10T09:51:11Z","published":"2024-03-05T18:29:17Z","title":"Triple-CFN: Restructuring Conceptual Spaces for Enhancing Abstract\n Reasoning process","summary":" Abstract reasoning problems pose significant challenges to artificial\nintelligence algorithms, demanding cognitive capabilities beyond those required\nfor perception tasks. This study introduces the Triple-CFN approach to tackle\nthe Bongard-Logo problem, achieving notable reasoning accuracy by implicitly\nreorganizing the concept space of conflicting instances. Additionally, the\nTriple-CFN paradigm proves effective for the RPM problem with necessary\nmodifications, yielding competitive results. To further enhance performance on\nthe RPM issue, we develop the Meta Triple-CFN network, which explicitly\nstructures the problem space while maintaining interpretability on progressive\npatterns. The success of Meta Triple-CFN is attributed to its paradigm of\nmodeling the conceptual space, equivalent to normalizing reasoning information.\nBased on this ideology, we introduce the Re-space layer, enhancing the\nperformance of both Meta Triple-CFN and Triple-CFN. 
This paper aims to\ncontribute to advancements in machine intelligence by exploring innovative\nnetwork designs for addressing abstract reasoning problems, paving the way for\nfurther breakthroughs in this domain.\n","authors":["Ruizhuo Song","Beiming Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.03190v5.pdf","comment":"14 pages, 14 figures, 5 tables"},{"id":"http://arxiv.org/abs/2404.06865v1","updated":"2024-04-10T09:45:02Z","published":"2024-04-10T09:45:02Z","title":"Fine color guidance in diffusion models and its application to image\n compression at extremely low bitrates","summary":" This study addresses the challenge of, without training or fine-tuning,\ncontrolling the global color aspect of images generated with a diffusion model.\nWe rewrite the guidance equations to ensure that the outputs are closer to a\nknown color map, and this without hindering the quality of the generation. Our\nmethod leads to new guidance equations. We show in the color guidance context\nthat, the scaling of the guidance should not decrease but remains high\nthroughout the diffusion process. In a second contribution, our guidance is\napplied in a compression framework, we combine both semantic and general color\ninformation on the image to decode the images at low cost. We show that our\nmethod is effective at improving fidelity and realism of compressed images at\nextremely low bit rates, when compared to other classical or more semantic\noriented approaches.\n","authors":["Tom Bordin","Thomas Maugey"],"pdf_url":"https://arxiv.org/pdf/2404.06865v1.pdf","comment":"Submitted to IEEE Transactions on Image Processing (TIP)"},{"id":"http://arxiv.org/abs/2404.06863v1","updated":"2024-04-10T09:40:56Z","published":"2024-04-10T09:40:56Z","title":"RESSCAL3D: Resolution Scalable 3D Semantic Segmentation of Point Clouds","summary":" While deep learning-based methods have demonstrated outstanding results in\nnumerous domains, some important functionalities are missing. Resolution\nscalability is one of them. In this work, we introduce a novel architecture,\ndubbed RESSCAL3D, providing resolution-scalable 3D semantic segmentation of\npoint clouds. In contrast to existing works, the proposed method does not\nrequire the whole point cloud to be available to start inference. Once a\nlow-resolution version of the input point cloud is available, first semantic\npredictions can be generated in an extremely fast manner. This enables early\ndecision-making in subsequent processing steps. As additional points become\navailable, these are processed in parallel. To improve performance, features\nfrom previously computed scales are employed as prior knowledge at the current\nscale. Our experiments show that RESSCAL3D is 31-62% faster than the\nnon-scalable baseline while keeping a limited impact on performance. 
To the\nbest of our knowledge, the proposed method is the first to propose a\nresolution-scalable approach for 3D semantic segmentation of point clouds based\non deep learning.\n","authors":["Remco Royen","Adrian Munteanu"],"pdf_url":"https://arxiv.org/pdf/2404.06863v1.pdf","comment":"Published at 2023 IEEE International Conference on Image Processing\n (ICIP)"},{"id":"http://arxiv.org/abs/2404.06860v1","updated":"2024-04-10T09:35:50Z","published":"2024-04-10T09:35:50Z","title":"Monocular 3D lane detection for Autonomous Driving: Recent Achievements,\n Challenges, and Outlooks","summary":" 3D lane detection plays a crucial role in autonomous driving by extracting\nstructural and traffic information from the road in 3D space to assist the\nself-driving car in rational, safe, and comfortable path planning and motion\ncontrol. Due to the consideration of sensor costs and the advantages of visual\ndata in color information, in practical applications, 3D lane detection based\non monocular vision is one of the important research directions in the field of\nautonomous driving, which has attracted more and more attention in both\nindustry and academia. Unfortunately, recent progress in visual perception\nseems insufficient to develop completely reliable 3D lane detection algorithms,\nwhich also hinders the development of vision-based fully autonomous\nself-driving cars, i.e., achieving level 5 autonomous driving, driving like\nhuman-controlled cars. This is one of the conclusions drawn from this review\npaper: there is still a lot of room for improvement and significant\nimprovements are still needed in the 3D lane detection algorithm for autonomous\ndriving cars using visual sensors. Motivated by this, this review defines,\nanalyzes, and reviews the current achievements in the field of 3D lane\ndetection research, and the vast majority of the current progress relies\nheavily on computationally complex deep learning models. In addition, this\nreview covers the 3D lane detection pipeline, investigates the performance of\nstate-of-the-art algorithms, analyzes the time complexity of cutting-edge\nmodeling choices, and highlights the main achievements and limitations of\ncurrent research efforts. The survey also includes a comprehensive discussion\nof available 3D lane detection datasets and the challenges that researchers\nhave faced but have not yet resolved. Finally, our work outlines future\nresearch directions and welcomes researchers and practitioners to enter this\nexciting field.\n","authors":["Fulong Ma","Weiqing Qi","Guoyang Zhao","Linwei Zheng","Sheng Wang","Ming Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06859v1","updated":"2024-04-10T09:35:36Z","published":"2024-04-10T09:35:36Z","title":"Multi-Label Continual Learning for the Medical Domain: A Novel Benchmark","summary":" Multi-label image classification in dynamic environments is a problem that\nposes significant challenges. Previous studies have primarily focused on\nscenarios such as Domain Incremental Learning and Class Incremental Learning,\nwhich do not fully capture the complexity of real-world applications. In this\npaper, we study the problem of classification of medical imaging in the\nscenario termed New Instances \\& New Classes, which combines the challenges of\nboth new class arrivals and domain shifts in a single framework. 
Unlike\ntraditional scenarios, it reflects the realistic nature of CL in domains such\nas medical imaging, where updates may introduce both new classes and changes in\ndomain characteristics. To address the unique challenges posed by this complex\nscenario, we introduce a novel approach called Pseudo-Label Replay. This method\naims to mitigate forgetting while adapting to new classes and domain shifts by\ncombining the advantages of the Replay and Pseudo-Label methods and solving\ntheir limitations in the proposed scenario. We evaluate our proposed\napproach on a challenging benchmark consisting of two datasets, seven tasks,\nand nineteen classes, modeling a realistic Continual Learning scenario. Our\nexperimental findings demonstrate the effectiveness of Pseudo-Label Replay in\naddressing the challenges posed by the complex scenario proposed. Our method\nsurpasses existing approaches, exhibiting superior performance while showing\nminimal forgetting.\n","authors":["Marina Ceccon","Davide Dalle Pezze","Alessandro Fabris","Gian Antonio Susto"],"pdf_url":"https://arxiv.org/pdf/2404.06859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10035v3","updated":"2024-04-10T09:34:03Z","published":"2023-02-20T15:34:03Z","title":"Large-scale Multi-Modal Pre-trained Models: A Comprehensive Survey","summary":" With the urgent demand for generalized deep models, many pre-trained big\nmodels are proposed, such as BERT, ViT, GPT, etc. Inspired by the success of\nthese models in single domains (like computer vision and natural language\nprocessing), the multi-modal pre-trained big models have also drawn more and\nmore attention in recent years. In this work, we give a comprehensive survey of\nthese models and hope this paper could provide new insights and help fresh\nresearchers to track the most cutting-edge works. Specifically, we firstly\nintroduce the background of multi-modal pre-training by reviewing the\nconventional deep learning, pre-training works in natural language processing,\ncomputer vision, and speech. Then, we introduce the task definition, key\nchallenges, and advantages of multi-modal pre-training models (MM-PTMs), and\ndiscuss the MM-PTMs with a focus on data, objectives, network architectures,\nand knowledge enhanced pre-training. After that, we introduce the downstream\ntasks used for the validation of large-scale MM-PTMs, including generative,\nclassification, and regression tasks. We also give visualization and analysis\nof the model parameters and results on representative downstream tasks.\nFinally, we point out possible research directions for this topic that may\nbenefit future works. In addition, we maintain a continuously updated paper\nlist for large-scale pre-trained multi-modal big models:\nhttps://github.com/wangxiao5791509/MultiModal_BigModels_Survey. This paper has\nbeen published by the journal Machine Intelligence Research (MIR),\nhttps://link.springer.com/article/10.1007/s11633-022-1410-8, DOI:\n10.1007/s11633-022-1410-8, vol. 20, no. 4, pp. 
447-482, 2023.\n","authors":["Xiao Wang","Guangyao Chen","Guangwu Qian","Pengcheng Gao","Xiao-Yong Wei","Yaowei Wang","Yonghong Tian","Wen Gao"],"pdf_url":"https://arxiv.org/pdf/2302.10035v3.pdf","comment":"Accepted by Machine Intelligence Research (MIR)"},{"id":"http://arxiv.org/abs/2404.06851v1","updated":"2024-04-10T09:24:54Z","published":"2024-04-10T09:24:54Z","title":"UDiFF: Generating Conditional Unsigned Distance Fields with Optimal\n Wavelet Diffusion","summary":" Diffusion models have shown remarkable results for image generation, editing\nand inpainting. Recent works explore diffusion models for 3D shape generation\nwith neural implicit functions, i.e., signed distance function and occupancy\nfunction. However, they are limited to shapes with closed surfaces, which\nprevents them from generating diverse 3D real-world contents containing open\nsurfaces. In this work, we present UDiFF, a 3D diffusion model for unsigned\ndistance fields (UDFs) which is capable of generating textured 3D shapes with\nopen surfaces from text conditions or unconditionally. Our key idea is to\ngenerate UDFs in spatial-frequency domain with an optimal wavelet\ntransformation, which produces a compact representation space for UDF\ngeneration. Specifically, instead of selecting an appropriate wavelet\ntransformation which requires expensive manual efforts and still leads to large\ninformation loss, we propose a data-driven approach to learn the optimal\nwavelet transformation for UDFs. We evaluate UDiFF to show our advantages by\nnumerical and visual comparisons with the latest methods on widely used\nbenchmarks. Page: https://weiqi-zhang.github.io/UDiFF.\n","authors":["Junsheng Zhou","Weiqi Zhang","Baorui Ma","Kanle Shi","Yu-Shen Liu","Zhizhong Han"],"pdf_url":"https://arxiv.org/pdf/2404.06851v1.pdf","comment":"To appear at CVPR2024. Project page:\n https://weiqi-zhang.github.io/UDiFF"},{"id":"http://arxiv.org/abs/2404.06842v1","updated":"2024-04-10T09:14:28Z","published":"2024-04-10T09:14:28Z","title":"MoCha-Stereo: Motif Channel Attention Network for Stereo Matching","summary":" Learning-based stereo matching techniques have made significant progress.\nHowever, existing methods inevitably lose geometrical structure information\nduring the feature channel generation process, resulting in edge detail\nmismatches. In this paper, the Motif Channel Attention Stereo Matching Network\n(MoCha-Stereo) is designed to address this problem. We provide the Motif\nChannel Correlation Volume (MCCV) to determine more accurate edge matching\ncosts. MCCV is achieved by projecting motif channels, which capture common\ngeometric structures in feature channels, onto feature maps and cost volumes.\nIn addition, edge variations in feature channels of the\nreconstruction error map also affect detail matching, so we propose the\nReconstruction Error Motif Penalty (REMP) module to further refine the\nfull-resolution disparity estimation. REMP integrates the frequency information\nof typical channel features from the reconstruction error. MoCha-Stereo ranks\n1st on the KITTI-2015 and KITTI-2012 Reflective leaderboards. Our structure\nalso shows excellent performance in Multi-View Stereo. 
Code is available at\nhttps://github.com/ZYangChen/MoCha-Stereo.\n","authors":["Ziyang Chen","Wei Long","He Yao","Yongjun Zhang","Bingshu Wang","Yongbin Qin","Jia Wu"],"pdf_url":"https://arxiv.org/pdf/2404.06842v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2402.02263v2","updated":"2024-04-10T09:00:44Z","published":"2024-02-03T21:12:36Z","title":"MixedNUTS: Training-Free Accuracy-Robustness Balance via Nonlinearly\n Mixed Classifiers","summary":" Adversarial robustness often comes at the cost of degraded accuracy, impeding\nthe real-life application of robust classification models. Training-based\nsolutions for better trade-offs are limited by incompatibilities with\nalready-trained high-performance large models, necessitating the exploration of\ntraining-free ensemble approaches. Observing that robust models are more\nconfident in correct predictions than in incorrect ones on clean and\nadversarial data alike, we speculate amplifying this \"benign confidence\nproperty\" can reconcile accuracy and robustness in an ensemble setting. To\nachieve so, we propose \"MixedNUTS\", a training-free method where the output\nlogits of a robust classifier and a standard non-robust classifier are\nprocessed by nonlinear transformations with only three parameters, which are\noptimized through an efficient algorithm. MixedNUTS then converts the\ntransformed logits into probabilities and mixes them as the overall output. On\nCIFAR-10, CIFAR-100, and ImageNet datasets, experimental results with custom\nstrong adaptive attacks demonstrate MixedNUTS's vastly improved accuracy and\nnear-SOTA robustness -- it boosts CIFAR-100 clean accuracy by 7.86 points,\nsacrificing merely 0.87 points in robust accuracy.\n","authors":["Yatong Bai","Mo Zhou","Vishal M. Patel","Somayeh Sojoudi"],"pdf_url":"https://arxiv.org/pdf/2402.02263v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06836v1","updated":"2024-04-10T08:54:43Z","published":"2024-04-10T08:54:43Z","title":"O2V-Mapping: Online Open-Vocabulary Mapping with Neural Implicit\n Representation","summary":" Online construction of open-ended language scenes is crucial for robotic\napplications, where open-vocabulary interactive scene understanding is\nrequired. Recently, neural implicit representation has provided a promising\ndirection for online interactive mapping. However, implementing open-vocabulary\nscene understanding capability into online neural implicit mapping still faces\nthree challenges: lack of local scene updating ability, blurry spatial\nhierarchical semantic segmentation and difficulty in maintaining multi-view\nconsistency. To this end, we proposed O2V-mapping, which utilizes voxel-based\nlanguage and geometric features to create an open-vocabulary field, thus\nallowing for local updates during online training process. Additionally, we\nleverage a foundational model for image segmentation to extract language\nfeatures on object-level entities, achieving clear segmentation boundaries and\nhierarchical semantic features. For the purpose of preserving consistency in 3D\nobject properties across different viewpoints, we propose a spatial adaptive\nvoxel adjustment mechanism and a multi-view weight selection method. 
Extensive\nexperiments on open-vocabulary object localization and semantic segmentation\ndemonstrate that O2V-mapping achieves online construction of language scenes\nwhile enhancing accuracy, outperforming the previous SOTA method.\n","authors":["Muer Tie","Julong Wei","Zhengjun Wang","Ke Wu","Shansuai Yuan","Kaizhao Zhang","Jie Jia","Jieru Zhao","Zhongxue Gan","Wenchao Ding"],"pdf_url":"https://arxiv.org/pdf/2404.06836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06835v1","updated":"2024-04-10T08:54:00Z","published":"2024-04-10T08:54:00Z","title":"Tuning-Free Adaptive Style Incorporation for Structure-Consistent\n Text-Driven Style Transfer","summary":" In this work, we target the task of text-driven style transfer in the context\nof text-to-image (T2I) diffusion models. The main challenge is consistent\nstructure preservation while enabling effective style transfer effects. The\npast approaches in this field directly concatenate the content and style\nprompts for a prompt-level style injection, leading to unavoidable structure\ndistortions. In this work, we propose a novel solution to the text-driven style\ntransfer task, namely, Adaptive Style Incorporation (ASI), to achieve\nfine-grained feature-level style incorporation. It consists of the Siamese\nCross-Attention (SiCA) to decouple the single-track cross-attention to a\ndual-track structure to obtain separate content and style features, and the\nAdaptive Content-Style Blending (AdaBlending) module to couple the content and\nstyle information in a structure-consistent manner. Experimentally, our\nmethod exhibits much better performance in both structure preservation and\nstylized effects.\n","authors":["Yanqi Ge","Jiaqi Liu","Qingnan Fan","Xi Jiang","Ye Huang","Shuai Qin","Hong Gu","Wen Li","Lixin Duan"],"pdf_url":"https://arxiv.org/pdf/2404.06835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06832v1","updated":"2024-04-10T08:48:09Z","published":"2024-04-10T08:48:09Z","title":"SplatPose & Detect: Pose-Agnostic 3D Anomaly Detection","summary":" Detecting anomalies in images has become a well-explored problem in both\nacademia and industry. State-of-the-art algorithms are able to detect defects\nin increasingly difficult settings and data modalities. However, most current\nmethods are not suited to address 3D objects captured from differing poses.\nWhile solutions using Neural Radiance Fields (NeRFs) have been proposed, they\nsuffer from excessive computation requirements, which hinder real-world\nusability. For this reason, we propose the novel 3D Gaussian splatting-based\nframework SplatPose which, given multi-view images of a 3D object, accurately\nestimates the pose of unseen views in a differentiable manner, and detects\nanomalies in them. We achieve state-of-the-art results in both training and\ninference speed, and detection performance, even when using less training data\nthan competing methods. 
We thoroughly evaluate our framework using the recently\nproposed Pose-agnostic Anomaly Detection benchmark and its multi-pose anomaly\ndetection (MAD) data set.\n","authors":["Mathis Kruse","Marco Rudolph","Dominik Woiwode","Bodo Rosenhahn"],"pdf_url":"https://arxiv.org/pdf/2404.06832v1.pdf","comment":"Visual Anomaly and Novelty Detection 2.0 Workshop at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02668v2","updated":"2024-04-10T08:47:32Z","published":"2024-04-03T12:06:01Z","title":"RS-Mamba for Large Remote Sensing Image Dense Prediction","summary":" Context modeling is critical for remote sensing image dense prediction tasks.\nNowadays, the growing size of very-high-resolution (VHR) remote sensing images\nposes challenges in effectively modeling context. While transformer-based\nmodels possess global modeling capabilities, they encounter computational\nchallenges when applied to large VHR images due to their quadratic complexity.\nThe conventional practice of cropping large images into smaller patches results\nin a notable loss of contextual information. To address these issues, we\npropose the Remote Sensing Mamba (RSM) for dense prediction tasks in large VHR\nremote sensing images. RSM is specifically designed to capture the global\ncontext of remote sensing images with linear complexity, facilitating the\neffective processing of large VHR images. Considering that the land covers in\nremote sensing images are distributed in arbitrary spatial directions due to\ncharacteristics of remote sensing over-head imaging, the RSM incorporates an\nomnidirectional selective scan module to globally model the context of images\nin multiple directions, capturing large spatial features from various\ndirections. Extensive experiments on semantic segmentation and change detection\ntasks across various land covers demonstrate the effectiveness of the proposed\nRSM. We designed simple yet effective models based on RSM, achieving\nstate-of-the-art performance on dense prediction tasks in VHR remote sensing\nimages without fancy training strategies. Leveraging the linear complexity and\nglobal modeling capabilities, RSM achieves better efficiency and accuracy than\ntransformer-based models on large remote sensing images. Interestingly, we also\ndemonstrated that our model generally performs better with a larger image size\non dense prediction tasks. Our code is available at\nhttps://github.com/walking-shadow/Official_Remote_Sensing_Mamba.\n","authors":["Sijie Zhao","Hao Chen","Xueliang Zhang","Pengfeng Xiao","Lei Bai","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2404.02668v2.pdf","comment":"15 pages,8 figures"},{"id":"http://arxiv.org/abs/2312.03502v2","updated":"2024-04-10T08:29:23Z","published":"2023-12-06T13:59:22Z","title":"Improving the Generalization of Segmentation Foundation Model under\n Distribution Shift via Weakly Supervised Adaptation","summary":" The success of large language models has inspired the computer vision\ncommunity to explore image segmentation foundation model that is able to\nzero/few-shot generalize through prompt engineering. Segment-Anything(SAM),\namong others, is the state-of-the-art image segmentation foundation model\ndemonstrating strong zero/few-shot generalization. Despite the success, recent\nstudies reveal the weakness of SAM under strong distribution shift. In\nparticular, SAM performs awkwardly on corrupted natural images, camouflaged\nimages, medical images, etc. 
Motivated by the observations, we aim to develop a\nself-training based strategy to adapt SAM to target distribution. Given the\nunique challenges of large source dataset, high computation cost and incorrect\npseudo label, we propose a weakly supervised self-training architecture with\nanchor regularization and low-rank finetuning to improve the robustness and\ncomputation efficiency of adaptation. We validate the effectiveness on 5 types\nof downstream segmentation tasks including natural clean/corrupted images,\nmedical images, camouflaged images and robotic images. Our proposed method is\ntask-agnostic in nature and outperforms pre-trained SAM and state-of-the-art\ndomain adaptation methods on almost all downstream tasks with the same testing\nprompt inputs.\n","authors":["Haojie Zhang","Yongyi Su","Xun Xu","Kui Jia"],"pdf_url":"https://arxiv.org/pdf/2312.03502v2.pdf","comment":"20 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.10610v4","updated":"2024-04-10T08:16:18Z","published":"2023-08-21T10:20:46Z","title":"Ear-Keeper: Real-time Diagnosis of Ear Lesions Utilizing\n Ultralight-Ultrafast ConvNet and Large-scale Ear Endoscopic Dataset","summary":" Deep learning-based ear disease diagnosis technology has proven effective and\naffordable. However, due to the lack of ear endoscope datasets with diversity,\nthe practical potential of the deep learning model has not been thoroughly\nstudied. Moreover, existing research failed to achieve a good trade-off between\nmodel inference speed and parameter size, rendering models inapplicable in\nreal-world settings. To address these challenges, we constructed the first\nlarge-scale ear endoscopic dataset comprising eight types of ear diseases and\ndisease-free samples from two institutions. Inspired by ShuffleNetV2, we\nproposed Best-EarNet, an ultrafast and ultralight network enabling real-time\near disease diagnosis. Best-EarNet incorporates a novel Local-Global Spatial\nFeature Fusion Module and multi-scale supervision strategy, which facilitates\nthe model focusing on global-local information within feature maps at various\nlevels. Utilizing transfer learning, the accuracy of Best-EarNet with only\n0.77M parameters achieves 95.23% (internal 22,581 images) and 92.14% (external\n1,652 images), respectively. In particular, it achieves an average frame per\nsecond of 80 on the CPU. From the perspective of model practicality, the\nproposed Best-EarNet is superior to state-of-the-art backbone models in ear\nlesion detection tasks. Most importantly, Ear-keeper, an intelligent diagnosis\nsystem based on Best-EarNet, was developed successfully and deployed on common\nelectronic devices (smartphone, tablet computer and personal computer). In the\nfuture, Ear-Keeper has the potential to assist the public and healthcare\nproviders in performing comprehensive scanning and diagnosis of the ear canal\nin real-time video, thereby promptly detecting ear lesions.\n","authors":["Yubiao Yue","Xinyu Zeng","Xiaoqiang Shi","Meiping Zhang","Fan Zhang","Yunxin Liang","Yan Liu","Zhenzhang Li","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2308.10610v4.pdf","comment":"18 pages,8 figures"},{"id":"http://arxiv.org/abs/2404.06814v1","updated":"2024-04-10T08:02:17Z","published":"2024-04-10T08:02:17Z","title":"Zero-shot Point Cloud Completion Via 2D Priors","summary":" 3D point cloud completion is designed to recover complete shapes from\npartially observed point clouds. 
Conventional completion methods typically\ndepend on extensive point cloud data for training, with their effectiveness\noften constrained to object categories similar to those seen during training.\nIn contrast, we propose a zero-shot framework aimed at completing partially\nobserved point clouds across any unseen categories. Leveraging point rendering\nvia Gaussian Splatting, we develop techniques of Point Cloud Colorization and\nZero-shot Fractal Completion that utilize 2D priors from pre-trained diffusion\nmodels to infer missing regions. Experimental results on both synthetic and\nreal-world scanned point clouds demonstrate that our approach outperforms\nexisting methods in completing a variety of objects without any requirement for\nspecific training data.\n","authors":["Tianxin Huang","Zhiwen Yan","Yuyang Zhao","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2404.06814v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05916v2","updated":"2024-04-10T07:58:44Z","published":"2024-03-09T13:56:25Z","title":"GPT as Psychologist? Preliminary Evaluations for GPT-4V on Visual\n Affective Computing","summary":" Multimodal large language models (MLLMs) are designed to process and\nintegrate information from multiple sources, such as text, speech, images, and\nvideos. Despite their success in language understanding, it is critical to\nevaluate the performance of downstream tasks for better human-centric\napplications. This paper assesses the application of MLLMs with 5 crucial\nabilities for affective computing, spanning visual affective tasks and\nreasoning tasks. The results show that GPT-4V has high accuracy in facial action\nunit recognition and micro-expression detection while its general facial\nexpression recognition performance is not accurate. We also highlight the\nchallenges of achieving fine-grained micro-expression recognition and the\npotential for further study and demonstrate the versatility and potential of\nGPT-4V for handling advanced tasks in emotion recognition and related fields by\nintegrating with task-related agents for more complex tasks, such as heart rate\nestimation through signal processing. In conclusion, this paper provides\nvaluable insights into the potential applications and challenges of MLLMs in\nhuman-centric computing. Our interesting examples are at\nhttps://github.com/EnVision-Research/GPT4Affectivity.\n","authors":["Hao Lu","Xuesong Niu","Jiyao Wang","Yin Wang","Qingyong Hu","Jiaqi Tang","Yuting Zhang","Kaishen Yuan","Bin Huang","Zitong Yu","Dengbo He","Shuiguang Deng","Hao Chen","Yingcong Chen","Shiguang Shan"],"pdf_url":"https://arxiv.org/pdf/2403.05916v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08551v3","updated":"2024-04-10T07:58:04Z","published":"2024-03-13T14:02:54Z","title":"GaussianImage: 1000 FPS Image Representation and Compression by 2D\n Gaussian Splatting","summary":" Implicit neural representations (INRs) recently achieved great success in\nimage representation and compression, offering high visual quality and fast\nrendering speeds with 10-1000 FPS, assuming sufficient GPU resources are\navailable. However, this requirement often hinders their use on low-end devices\nwith limited memory. In response, we propose a groundbreaking paradigm of image\nrepresentation and compression by 2D Gaussian Splatting, named GaussianImage.\nWe first introduce 2D Gaussian to represent the image, where each Gaussian has\n8 parameters including position, covariance and color. 
Subsequently, we unveil\na novel rendering algorithm based on accumulated summation. Remarkably, our\nmethod with a minimum of 3$\\times$ lower GPU memory usage and 5$\\times$ faster\nfitting time not only rivals INRs (e.g., WIRE, I-NGP) in representation\nperformance, but also delivers a faster rendering speed of 1500-2000 FPS\nregardless of parameter size. Furthermore, we integrate existing vector\nquantization technique to build an image codec. Experimental results\ndemonstrate that our codec attains rate-distortion performance comparable to\ncompression-based INRs such as COIN and COIN++, while facilitating decoding\nspeeds of approximately 1000 FPS. Additionally, preliminary proof of concept\nshows that our codec surpasses COIN and COIN++ in performance when using\npartial bits-back coding. Code will be available at\nhttps://github.com/Xinjie-Q/GaussianImage.\n","authors":["Xinjie Zhang","Xingtong Ge","Tongda Xu","Dailan He","Yan Wang","Hongwei Qin","Guo Lu","Jing Geng","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.08551v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07354v4","updated":"2024-04-10T07:54:14Z","published":"2024-02-12T01:03:39Z","title":"Re-DiffiNet: Modeling discrepancies in tumor segmentation using\n diffusion models","summary":" Identification of tumor margins is essential for surgical decision-making for\nglioblastoma patients and provides reliable assistance for neurosurgeons.\nDespite improvements in deep learning architectures for tumor segmentation over\nthe years, creating a fully autonomous system suitable for clinical floors\nremains a formidable challenge because the model predictions have not yet\nreached the desired level of accuracy and generalizability for clinical\napplications. Generative modeling techniques have seen significant improvements\nin recent times. Specifically, Generative Adversarial Networks (GANs) and\nDenoising-diffusion-based models (DDPMs) have been used to generate\nhigher-quality images with fewer artifacts and finer attributes. In this work,\nwe introduce a framework called Re-Diffinet for modeling the discrepancy\nbetween the outputs of a segmentation model like U-Net and the ground truth,\nusing DDPMs. By explicitly modeling the discrepancy, the results show an\naverage improvement of 0.55\\% in the Dice score and 16.28\\% in HD95 from\ncross-validation over 5-folds, compared to the state-of-the-art U-Net\nsegmentation model.\n","authors":["Tianyi Ren","Abhishek Sharma","Juampablo Heras Rivera","Harshitha Rebala","Ethan Honey","Agamdeep Chopra","Jacob Ruzevick","Mehmet Kurt"],"pdf_url":"https://arxiv.org/pdf/2402.07354v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05063v2","updated":"2024-04-10T07:44:40Z","published":"2024-04-07T20:19:04Z","title":"AUEditNet: Dual-Branch Facial Action Unit Intensity Manipulation with\n Implicit Disentanglement","summary":" Facial action unit (AU) intensity plays a pivotal role in quantifying\nfine-grained expression behaviors, which is an effective condition for facial\nexpression manipulation. However, publicly available datasets containing\nintensity annotations for multiple AUs remain severely limited, often featuring\na restricted number of subjects. This limitation places challenges to the AU\nintensity manipulation in images due to disentanglement issues, leading\nresearchers to resort to other large datasets with pretrained AU intensity\nestimators for pseudo labels. 
In addressing this constraint and fully\nleveraging manual annotations of AU intensities for precise manipulation, we\nintroduce AUEditNet. Our proposed model achieves impressive intensity\nmanipulation across 12 AUs, trained effectively with only 18 subjects.\nUtilizing a dual-branch architecture, our approach achieves comprehensive\ndisentanglement of facial attributes and identity without necessitating\nadditional loss functions or implementing with large batch sizes. This approach\noffers a potential solution to achieve desired facial attribute editing despite\nthe dataset's limited subject count. Our experiments demonstrate AUEditNet's\nsuperior accuracy in editing AU intensities, affirming its capability in\ndisentangling facial attributes and identity within a limited subject pool.\nAUEditNet allows conditioning by either intensity values or target images,\neliminating the need for constructing AU combinations for specific facial\nexpression synthesis. Moreover, AU intensity estimation, as a downstream task,\nvalidates the consistency between real and edited images, confirming the\neffectiveness of our proposed AU intensity manipulation method.\n","authors":["Shiwei Jin","Zhen Wang","Lei Wang","Peng Liu","Ning Bi","Truong Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05063v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06798v1","updated":"2024-04-10T07:41:35Z","published":"2024-04-10T07:41:35Z","title":"MedRG: Medical Report Grounding with Multi-modal Large Language Model","summary":" Medical Report Grounding is pivotal in identifying the most relevant regions\nin medical images based on a given phrase query, a critical aspect in medical\nimage analysis and radiological diagnosis. However, prevailing visual grounding\napproaches necessitate the manual extraction of key phrases from medical\nreports, imposing substantial burdens on both system efficiency and physicians.\nIn this paper, we introduce a novel framework, Medical Report Grounding\n(MedRG), an end-to-end solution for utilizing a multi-modal Large Language\nModel to predict key phrase by incorporating a unique token, BOX, into the\nvocabulary to serve as an embedding for unlocking detection capabilities.\nSubsequently, the vision encoder-decoder jointly decodes the hidden embedding\nand the input medical image, generating the corresponding grounding box. The\nexperimental results validate the effectiveness of MedRG, surpassing the\nperformance of the existing state-of-the-art medical phrase grounding methods.\nThis study represents a pioneering exploration of the medical report grounding\ntask, marking the first-ever endeavor in this domain.\n","authors":["Ke Zou","Yang Bai","Zhihao Chen","Yang Zhou","Yidi Chen","Kai Ren","Meng Wang","Xuedong Yuan","Xiaojing Shen","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2404.06798v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2311.15361v2","updated":"2024-04-10T06:46:08Z","published":"2023-11-26T17:27:26Z","title":"Ultra-Range Gesture Recognition using a Web-Camera in Human-Robot\n Interaction","summary":" Hand gestures play a significant role in human interactions where non-verbal\nintentions, thoughts and commands are conveyed. In Human-Robot Interaction\n(HRI), hand gestures offer a similar and efficient medium for conveying clear\nand rapid directives to a robotic agent. However, state-of-the-art vision-based\nmethods for gesture recognition have been shown to be effective only up to a\nuser-camera distance of seven meters. 
Such a short distance range limits\npractical HRI with, for example, service robots, search and rescue robots and\ndrones. In this work, we address the Ultra-Range Gesture Recognition (URGR)\nproblem by aiming for a recognition distance of up to 25 meters and in the\ncontext of HRI. We propose the URGR framework, a novel deep-learning approach using\nsolely a simple RGB camera. Gesture inference is based on a single image.\nFirst, a novel super-resolution model termed High-Quality Network (HQ-Net) uses\na set of self-attention and convolutional layers to enhance the low-resolution\nimage of the user. Then, we propose a novel URGR classifier termed Graph Vision\nTransformer (GViT) which takes the enhanced image as input. GViT combines the\nbenefits of a Graph Convolutional Network (GCN) and a modified Vision\nTransformer (ViT). Evaluation of the proposed framework over diverse test data\nyields a high recognition rate of 98.1%. The framework has also exhibited\nsuperior performance compared to human recognition in ultra-range distances.\nWith the framework, we analyze and demonstrate the performance of an autonomous\nquadruped robot directed by human gestures in complex ultra-range indoor and\noutdoor environments, acquiring 96% recognition rate on average.\n","authors":["Eran Bamani","Eden Nissinman","Inbar Meir","Lisa Koenigsberg","Avishai Sintov"],"pdf_url":"https://arxiv.org/pdf/2311.15361v2.pdf","comment":"Engineering Applications of Artificial Intelligence, In press"},{"id":"http://arxiv.org/abs/2404.06780v1","updated":"2024-04-10T06:41:30Z","published":"2024-04-10T06:41:30Z","title":"Urban Architect: Steerable 3D Urban Scene Generation with Layout Prior","summary":" Text-to-3D generation has achieved remarkable success via large-scale\ntext-to-image diffusion models. Nevertheless, there is no paradigm for scaling\nup the methodology to urban scale. Urban scenes, characterized by numerous\nelements, intricate arrangement relationships, and vast scale, present a\nformidable barrier to the interpretability of ambiguous textual descriptions\nfor effective model optimization. In this work, we surmount the limitations by\nintroducing a compositional 3D layout representation into text-to-3D paradigm,\nserving as an additional prior. It comprises a set of semantic primitives with\nsimple geometric structures and explicit arrangement relationships,\ncomplementing textual descriptions and enabling steerable generation. Upon\nthis, we propose two modifications -- (1) We introduce Layout-Guided\nVariational Score Distillation to address model optimization inadequacies. It\nconditions the score distillation sampling process with geometric and semantic\nconstraints of 3D layouts. (2) To handle the unbounded nature of urban scenes,\nwe represent 3D scene with a Scalable Hash Grid structure, incrementally\nadapting to the growing scale of urban scenes. Extensive experiments\nsubstantiate the capability of our framework to scale text-to-3D generation to\nlarge-scale urban scenes that cover over 1000m driving distance for the first\ntime. We also present various scene editing demonstrations, showing the powers\nof steerable urban scene generation. 
Website: https://urbanarchitect.github.io.\n","authors":["Fan Lu","Kwan-Yee Lin","Yan Xu","Hongsheng Li","Guang Chen","Changjun Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.06780v1.pdf","comment":"Project page: https://urbanarchitect.github.io/"},{"id":"http://arxiv.org/abs/2404.06779v1","updated":"2024-04-10T06:39:18Z","published":"2024-04-10T06:39:18Z","title":"Efficient and Scalable Chinese Vector Font Generation via Component\n Composition","summary":" Chinese vector font generation is challenging due to the complex structure\nand huge number of Chinese characters. Recent advances remain limited to\ngenerating a small set of characters with simple structure. In this work, we\nfirst observe that most Chinese characters can be disassembled into\nfrequently-reused components. Therefore, we introduce the first efficient and\nscalable Chinese vector font generation approach via component composition,\nallowing the generation of numerous vector characters from a small set of components.\nTo achieve this, we collect a large-scale dataset that contains over\n90K Chinese characters with their components and layout information.\nUpon the dataset, we propose a simple yet effective framework based on spatial\ntransformer networks (STN) and multiple losses tailored to font characteristics\nto learn the affine transformation of the components, which can be directly\napplied to the Bézier curves, resulting in Chinese characters in vector\nformat. Our qualitative and quantitative experiments have demonstrated that our\nmethod significantly surpasses the state-of-the-art vector font generation\nmethods in generating large-scale complex Chinese characters in both font\ngeneration and zero-shot font extension.\n","authors":["Jinyu Song","Weitao You","Shuhui Shi","Shuxuan Guo","Lingyun Sun","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06779v1.pdf","comment":"15 pages, 23 figures"},{"id":"http://arxiv.org/abs/2403.19837v3","updated":"2024-04-10T23:47:34Z","published":"2024-03-28T21:15:38Z","title":"Concept-based Analysis of Neural Networks via Vision-Language Models","summary":" The analysis of vision-based deep neural networks (DNNs) is highly desirable\nbut it is very challenging due to the difficulty of expressing formal\nspecifications for vision tasks and the lack of efficient verification\nprocedures. In this paper, we propose to leverage emerging multimodal,\nvision-language, foundation models (VLMs) as a lens through which we can reason\nabout vision models. VLMs have been trained on a large body of images\naccompanied by their textual description, and are thus implicitly aware of\nhigh-level, human-understandable concepts describing the images. We describe a\nlogical specification language $\texttt{Con}_{\texttt{spec}}$ designed to\nfacilitate writing specifications in terms of these concepts. To define and\nformally check $\texttt{Con}_{\texttt{spec}}$ specifications, we build a map\nbetween the internal representations of a given vision model and a VLM, leading\nto an efficient verification procedure of natural-language properties for\nvision models. 
We demonstrate our techniques on a ResNet-based classifier\ntrained on the RIVAL-10 dataset using CLIP as the multimodal model.\n","authors":["Ravi Mangal","Nina Narodytska","Divya Gopinath","Boyue Caroline Hu","Anirban Roy","Susmit Jha","Corina Pasareanu"],"pdf_url":"https://arxiv.org/pdf/2403.19837v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14666v2","updated":"2024-04-10T23:39:38Z","published":"2023-08-24T17:47:32Z","title":"Learning to Predict 3D Rotational Dynamics from Images of a Rigid Body\n with Unknown Mass Distribution","summary":" In many real-world settings, image observations of freely rotating 3D rigid\nbodies may be available when low-dimensional measurements are not. However, the\nhigh-dimensionality of image data precludes the use of classical estimation\ntechniques to learn the dynamics. The usefulness of standard deep learning\nmethods is also limited, because an image of a rigid body reveals nothing about\nthe distribution of mass inside the body, which, together with initial angular\nvelocity, is what determines how the body will rotate. We present a\nphysics-based neural network model to estimate and predict 3D rotational\ndynamics from image sequences. We achieve this using a multi-stage prediction\npipeline that maps individual images to a latent representation homeomorphic to\n$\\mathbf{SO}(3)$, computes angular velocities from latent pairs, and predicts\nfuture latent states using the Hamiltonian equations of motion. We demonstrate\nthe efficacy of our approach on new rotating rigid-body datasets of sequences\nof synthetic images of rotating objects, including cubes, prisms and\nsatellites, with unknown uniform and non-uniform mass distributions. Our model\noutperforms competing baselines on our datasets, producing better qualitative\npredictions and reducing the error observed for the state-of-the-art\nHamiltonian Generative Network by a factor of 2.\n","authors":["Justice Mason","Christine Allen-Blanchette","Nicholas Zolman","Elizabeth Davison","Naomi Ehrich Leonard"],"pdf_url":"https://arxiv.org/pdf/2308.14666v2.pdf","comment":"Previously appeared as arXiv:2209.11355v2, which was submitted as a\n replacement by accident. arXiv admin note: text overlap with arXiv:2209.11355"},{"id":"http://arxiv.org/abs/2404.07389v1","updated":"2024-04-10T23:30:54Z","published":"2024-04-10T23:30:54Z","title":"Object-Conditioned Energy-Based Attention Map Alignment in Text-to-Image\n Diffusion Models","summary":" Text-to-image diffusion models have shown great success in generating\nhigh-quality text-guided images. Yet, these models may still fail to\nsemantically align generated images with the provided text prompts, leading to\nproblems like incorrect attribute binding and/or catastrophic object neglect.\nGiven the pervasive object-oriented structure underlying text prompts, we\nintroduce a novel object-conditioned Energy-Based Attention Map Alignment\n(EBAMA) method to address the aforementioned problems. We show that an\nobject-centric attribute binding loss naturally emerges by approximately\nmaximizing the log-likelihood of a $z$-parameterized energy-based model with\nthe help of the negative sampling technique. We further propose an\nobject-centric intensity regularizer to prevent excessive shifts of objects\nattention towards their attributes. Extensive qualitative and quantitative\nexperiments, including human evaluation, on several challenging benchmarks\ndemonstrate the superior performance of our method over previous strong\ncounterparts. 
With better aligned attention maps, our approach shows great\npromise in further enhancing the text-controlled image editing ability of\ndiffusion models.\n","authors":["Yasi Zhang","Peiyu Yu","Ying Nian Wu"],"pdf_url":"https://arxiv.org/pdf/2404.07389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07377v1","updated":"2024-04-10T22:35:06Z","published":"2024-04-10T22:35:06Z","title":"Deep Generative Sampling in the Dual Divergence Space: A Data-efficient\n & Interpretative Approach for Generative AI","summary":" Building on the remarkable achievements in generative sampling of natural\nimages, we propose an innovative challenge, potentially overly ambitious, which\ninvolves generating samples of entire multivariate time series that resemble\nimages. However, the statistical challenge lies in the small sample size,\nsometimes consisting of a few hundred subjects. This issue is especially\nproblematic for deep generative models that follow the conventional approach of\ngenerating samples from a canonical distribution and then decoding or denoising\nthem to match the true data distribution. In contrast, our method is grounded\nin information theory and aims to implicitly characterize the distribution of\nimages, particularly the (global and local) dependency structure between\npixels. We achieve this by empirically estimating its KL-divergence in the dual\nform with respect to the respective marginal distribution. This enables us to\nperform generative sampling directly in the optimized 1-D dual divergence\nspace. Specifically, in the dual space, training samples representing the data\ndistribution are embedded in the form of various clusters between two end\npoints. In theory, any sample embedded between those two end points is\nin-distribution w.r.t. the data distribution. Our key idea for generating novel\nsamples of images is to interpolate between the clusters via a walk as per\ngradients of the dual function w.r.t. the data dimensions. In addition to the\ndata efficiency gained from direct sampling, we propose an algorithm that\noffers a significant reduction in sample complexity for estimating the\ndivergence of the data distribution with respect to the marginal distribution.\nWe provide strong theoretical guarantees along with an extensive empirical\nevaluation using many real-world datasets from diverse domains, establishing\nthe superiority of our approach w.r.t. state-of-the-art deep learning methods.\n","authors":["Sahil Garg","Anderson Schneider","Anant Raj","Kashif Rasul","Yuriy Nevmyvaka","Sneihil Gopal","Amit Dhurandhar","Guillermo Cecchi","Irina Rish"],"pdf_url":"https://arxiv.org/pdf/2404.07377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07374v1","updated":"2024-04-10T22:16:20Z","published":"2024-04-10T22:16:20Z","title":"Improving Multi-Center Generalizability of GAN-Based Fat Suppression\n using Federated Learning","summary":" Generative Adversarial Network (GAN)-based synthesis of fat suppressed (FS)\nMRIs from non-FS proton density sequences has the potential to accelerate\nacquisition of knee MRIs. However, GANs trained on single-site data have poor\ngeneralizability to external data. We show that federated learning can improve\nmulti-center generalizability of GANs for synthesizing FS MRIs, while\nfacilitating privacy-preserving multi-institutional collaborations.\n","authors":["Pranav Kulkarni","Adway Kanhere","Harshita Kukreja","Vivian Zhang","Paul H. Yi","Vishwa S. 
Parekh"],"pdf_url":"https://arxiv.org/pdf/2404.07374v1.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2404.07356v1","updated":"2024-04-10T21:23:13Z","published":"2024-04-10T21:23:13Z","title":"GANsemble for Small and Imbalanced Data Sets: A Baseline for Synthetic\n Microplastics Data","summary":" Microplastic particle ingestion or inhalation by humans is a problem of\ngrowing concern. Unfortunately, current research methods that use machine\nlearning to understand their potential harms are obstructed by a lack of\navailable data. Deep learning techniques in particular are challenged by such\ndomains where only small or imbalanced data sets are available. Overcoming this\nchallenge often involves oversampling underrepresented classes or augmenting\nthe existing data to improve model performance. This paper proposes GANsemble:\na two-module framework connecting data augmentation with conditional generative\nadversarial networks (cGANs) to generate class-conditioned synthetic data.\nFirst, the data chooser module automates augmentation strategy selection by\nsearching for the best data augmentation strategy. Next, the cGAN module uses\nthis strategy to train a cGAN for generating enhanced synthetic data. We\nexperiment with the GANsemble framework on a small and imbalanced microplastics\ndata set. A Microplastic-cGAN (MPcGAN) algorithm is introduced, and baselines\nfor synthetic microplastics (SYMP) data are established in terms of Frechet\nInception Distance (FID) and Inception Scores (IS). We also provide a synthetic\nmicroplastics filter (SYMP-Filter) algorithm to increase the quality of\ngenerated SYMP. Additionally, we show the best amount of oversampling with\naugmentation to fix class imbalance in small microplastics data sets. To our\nknowledge, this study is the first application of generative AI to\nsynthetically create microplastics data.\n","authors":["Daniel Platnick","Sourena Khanzadeh","Alireza Sadeghian","Richard Anthony Valenzano"],"pdf_url":"https://arxiv.org/pdf/2404.07356v1.pdf","comment":"Accepted to the 37th Canadian Artificial Intelligence Conference\n (2024), 12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.16133v2","updated":"2024-04-10T21:19:33Z","published":"2023-09-28T03:30:50Z","title":"Mask4Former: Mask Transformer for 4D Panoptic Segmentation","summary":" Accurately perceiving and tracking instances over time is essential for the\ndecision-making processes of autonomous agents interacting safely in dynamic\nenvironments. With this intention, we propose Mask4Former for the challenging\ntask of 4D panoptic segmentation of LiDAR point clouds. Mask4Former is the\nfirst transformer-based approach unifying semantic instance segmentation and\ntracking of sparse and irregular sequences of 3D point clouds into a single\njoint model. Our model directly predicts semantic instances and their temporal\nassociations without relying on hand-crafted non-learned association strategies\nsuch as probabilistic clustering or voting-based center prediction. Instead,\nMask4Former introduces spatio-temporal instance queries that encode the\nsemantic and geometric properties of each semantic tracklet in the sequence. In\nan in-depth study, we find that promoting spatially compact instance\npredictions is critical as spatio-temporal instance queries tend to merge\nmultiple semantically similar instances, even if they are spatially distant. 
To\nthis end, we regress 6-DOF bounding box parameters from spatio-temporal\ninstance queries, which are used as an auxiliary task to foster spatially\ncompact predictions. Mask4Former achieves a new state-of-the-art on the\nSemanticKITTI test set with a score of 68.4 LSTQ.\n","authors":["Kadir Yilmaz","Jonas Schult","Alexey Nekrasov","Bastian Leibe"],"pdf_url":"https://arxiv.org/pdf/2309.16133v2.pdf","comment":"Renamed from MASK4D to Mask4Former. ICRA 2024. Project page:\n https://vision.rwth-aachen.de/Mask4Former"},{"id":"http://arxiv.org/abs/2404.07351v1","updated":"2024-04-10T21:14:33Z","published":"2024-04-10T21:14:33Z","title":"A Transformer-Based Model for the Prediction of Human Gaze Behavior on\n Videos","summary":" Eye-tracking applications that utilize the human gaze in video understanding\ntasks have become increasingly important. To effectively automate the process\nof video analysis based on eye-tracking data, it is important to accurately\nreplicate human gaze behavior. However, this task presents significant\nchallenges due to the inherent complexity and ambiguity of human gaze patterns.\nIn this work, we introduce a novel method for simulating human gaze behavior.\nOur approach uses a transformer-based reinforcement learning algorithm to train\nan agent that acts as a human observer, with the primary role of watching\nvideos and simulating human gaze behavior. We employed an eye-tracking dataset\ngathered from videos generated by the VirtualHome simulator, with a primary\nfocus on activity recognition. Our experimental results demonstrate the\neffectiveness of our gaze prediction method by highlighting its capability to\nreplicate human gaze behavior and its applicability for downstream tasks where\nreal human gaze is used as input.\n","authors":["Suleyman Ozdel","Yao Rong","Berat Mert Albaba","Yen-Ling Kuo","Xi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07351v1.pdf","comment":"2024 Symposium on Eye Tracking Research and Applications (ETRA24),\n Glasgow, United Kingdom"},{"id":"http://arxiv.org/abs/2309.04071v2","updated":"2024-04-10T21:09:15Z","published":"2023-09-08T02:05:03Z","title":"Enhancing Hierarchical Transformers for Whole Brain Segmentation with\n Intracranial Measurements Integration","summary":" Whole brain segmentation with magnetic resonance imaging (MRI) enables the\nnon-invasive measurement of brain regions, including total intracranial volume\n(TICV) and posterior fossa volume (PFV). Enhancing the existing whole brain\nsegmentation methodology to incorporate intracranial measurements offers a\nheightened level of comprehensiveness in the analysis of brain structures.\nDespite its potential, the task of generalizing deep learning techniques for\nintracranial measurements faces data availability constraints due to limited\nmanually annotated atlases encompassing whole brain and TICV/PFV labels. In\nthis paper, we enhance the hierarchical transformer UNesT for whole brain\nsegmentation to segment the whole brain into 133 classes and TICV/PFV\nsimultaneously. To address the problem of data scarcity, the model is first\npretrained on 4859 T1-weighted (T1w) 3D volumes sourced from 8 different sites.\nThese volumes are processed through a multi-atlas segmentation pipeline for\nlabel generation, while TICV/PFV labels are unavailable. Subsequently, the\nmodel is finetuned with 45 T1w 3D volumes from Open Access Series of Imaging\nStudies (OASIS) where both 133 whole brain classes and TICV/PFV labels are\navailable. 
We evaluate our method with Dice similarity coefficients(DSC). We\nshow that our model is able to conduct precise TICV/PFV estimation while\nmaintaining the 132 brain regions performance at a comparable level. Code and\ntrained model are available at:\nhttps://github.com/MASILab/UNesT/tree/main/wholebrainSeg.\n","authors":["Xin Yu","Yucheng Tang","Qi Yang","Ho Hin Lee","Shunxing Bao","Yuankai Huo","Bennett A. Landman"],"pdf_url":"https://arxiv.org/pdf/2309.04071v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07347v1","updated":"2024-04-10T21:03:23Z","published":"2024-04-10T21:03:23Z","title":"Gaze-Guided Graph Neural Network for Action Anticipation Conditioned on\n Intention","summary":" Humans utilize their gaze to concentrate on essential information while\nperceiving and interpreting intentions in videos. Incorporating human gaze into\ncomputational algorithms can significantly enhance model performance in video\nunderstanding tasks. In this work, we address a challenging and innovative task\nin video understanding: predicting the actions of an agent in a video based on\na partial video. We introduce the Gaze-guided Action Anticipation algorithm,\nwhich establishes a visual-semantic graph from the video input. Our method\nutilizes a Graph Neural Network to recognize the agent's intention and predict\nthe action sequence to fulfill this intention. To assess the efficiency of our\napproach, we collect a dataset containing household activities generated in the\nVirtualHome environment, accompanied by human gaze data of viewing videos. Our\nmethod outperforms state-of-the-art techniques, achieving a 7\\% improvement in\naccuracy for 18-class intention recognition. This highlights the efficiency of\nour method in learning important features from human gaze data.\n","authors":["Suleyman Ozdel","Yao Rong","Berat Mert Albaba","Yen-Ling Kuo","Xi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07347v1.pdf","comment":"2024 Symposium on Eye Tracking Research and Applications (ETRA24),\n Glasgow, United Kingdom"},{"id":"http://arxiv.org/abs/2404.07336v1","updated":"2024-04-10T20:32:24Z","published":"2024-04-10T20:32:24Z","title":"PEAVS: Perceptual Evaluation of Audio-Visual Synchrony Grounded in\n Viewers' Opinion Scores","summary":" Recent advancements in audio-visual generative modeling have been propelled\nby progress in deep learning and the availability of data-rich benchmarks.\nHowever, the growth is not attributed solely to models and benchmarks.\nUniversally accepted evaluation metrics also play an important role in\nadvancing the field. While there are many metrics available to evaluate audio\nand visual content separately, there is a lack of metrics that offer a\nquantitative and interpretable measure of audio-visual synchronization for\nvideos \"in the wild\". To address this gap, we first created a large scale human\nannotated dataset (100+ hrs) representing nine types of synchronization errors\nin audio-visual content and how human perceive them. We then developed a PEAVS\n(Perceptual Evaluation of Audio-Visual Synchrony) score, a novel automatic\nmetric with a 5-point scale that evaluates the quality of audio-visual\nsynchronization. We validate PEAVS using a newly generated dataset, achieving a\nPearson correlation of 0.79 at the set level and 0.54 at the clip level when\ncompared to human labels. 
In our experiments, we observe a relative gain 50%\nover a natural extension of Fr\\'echet based metrics for Audio-Visual synchrony,\nconfirming PEAVS efficacy in objectively modeling subjective perceptions of\naudio-visual synchronization for videos \"in the wild\".\n","authors":["Lucas Goncalves","Prashant Mathur","Chandrashekhar Lavania","Metehan Cekic","Marcello Federico","Kyu J. Han"],"pdf_url":"https://arxiv.org/pdf/2404.07336v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2403.19653v2","updated":"2024-04-10T20:03:05Z","published":"2024-03-28T17:59:42Z","title":"Detecting Image Attribution for Text-to-Image Diffusion Models in RGB\n and Beyond","summary":" Modern text-to-image (T2I) diffusion models can generate images with\nremarkable realism and creativity. These advancements have sparked research in\nfake image detection and attribution, yet prior studies have not fully explored\nthe practical and scientific dimensions of this task. In addition to\nattributing images to 12 state-of-the-art T2I generators, we provide extensive\nanalyses on what inference stage hyperparameters and image modifications are\ndiscernible. Our experiments reveal that initialization seeds are highly\ndetectable, along with other subtle variations in the image generation process\nto some extent. We further investigate what visual traces are leveraged in\nimage attribution by perturbing high-frequency details and employing mid-level\nrepresentations of image style and structure. Notably, altering high-frequency\ninformation causes only slight reductions in accuracy, and training an\nattributor on style representations outperforms training on RGB images. Our\nanalyses underscore that fake images are detectable and attributable at various\nlevels of visual granularity than previously explored.\n","authors":["Katherine Xu","Lingzhi Zhang","Jianbo Shi"],"pdf_url":"https://arxiv.org/pdf/2403.19653v2.pdf","comment":"Code available at https://github.com/k8xu/ImageAttribution"},{"id":"http://arxiv.org/abs/2404.07318v1","updated":"2024-04-10T19:39:43Z","published":"2024-04-10T19:39:43Z","title":"Rethinking Perceptual Metrics for Medical Image Translation","summary":" Modern medical image translation methods use generative models for tasks such\nas the conversion of CT images to MRI. Evaluating these methods typically\nrelies on some chosen downstream task in the target domain, such as\nsegmentation. On the other hand, task-agnostic metrics are attractive, such as\nthe network feature-based perceptual metrics (e.g., FID) that are common to\nimage translation in general computer vision. In this paper, we investigate\nevaluation metrics for medical image translation on two medical image\ntranslation tasks (GE breast MRI to Siemens breast MRI and lumbar spine MRI to\nCT), tested on various state-of-the-art translation methods. We show that\nperceptual metrics do not generally correlate with segmentation metrics due to\nthem extending poorly to the anatomical constraints of this sub-field, with FID\nbeing especially inconsistent. However, we find that the lesser-used\npixel-level SWD metric may be useful for subtle intra-modality translation. Our\nresults demonstrate the need for further research into helpful metrics for\nmedical image translation.\n","authors":["Nicholas Konz","Yuwen Chen","Hanxue Gu","Haoyu Dong","Maciej A. 
Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2404.07318v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07306v1","updated":"2024-04-10T18:58:05Z","published":"2024-04-10T18:58:05Z","title":"AI-Guided Defect Detection Techniques to Model Single Crystal Diamond\n Growth","summary":" From a process development perspective, diamond growth via chemical vapor\ndeposition has made significant strides. However, challenges persist in\nachieving high quality and large-area material production. These difficulties\ninclude controlling conditions to maintain uniform growth rates for the entire\ngrowth surface. As growth progresses, various factors or defect states emerge,\naltering the uniform conditions. These changes affect the growth rate and\nresult in the formation of crystalline defects at the microscale. However,\nthere is a distinct lack of methods to identify these defect states and their\ngeometry using images taken during the growth process. This paper details\nseminal work on defect segmentation pipeline using in-situ optical images to\nidentify features that indicate defective states that are visible at the\nmacroscale. Using a semantic segmentation approach as applied in our previous\nwork, these defect states and corresponding derivative features are isolated\nand classified by their pixel masks. Using an annotation focused\nhuman-in-the-loop software architecture to produce training datasets, with\nmodules for selective data labeling using active learning, data augmentations,\nand model-assisted labeling, our approach achieves effective annotation\naccuracy and drastically reduces the time and cost of labeling by orders of\nmagnitude. On the model development front, we found that deep learning-based\nalgorithms are the most efficient. They can accurately learn complex\nrepresentations from feature-rich datasets. Our best-performing model, based on\nthe YOLOV3 and DeeplabV3plus architectures, achieved excellent accuracy for\nspecific features of interest. Specifically, it reached 93.35% accuracy for\ncenter defects, 92.83% for polycrystalline defects, and 91.98% for edge\ndefects.\n","authors":["Rohan Reddy Mekala","Elias Garratt","Matthias Muehle","Arjun Srinivasan","Adam Porter","Mikael Lindvall"],"pdf_url":"https://arxiv.org/pdf/2404.07306v1.pdf","comment":"12 pages,4 figures,ACMME 2024"},{"id":"http://arxiv.org/abs/2404.07292v1","updated":"2024-04-10T18:40:23Z","published":"2024-04-10T18:40:23Z","title":"Solving Masked Jigsaw Puzzles with Diffusion Vision Transformers","summary":" Solving image and video jigsaw puzzles poses the challenging task of\nrearranging image fragments or video frames from unordered sequences to restore\nmeaningful images and video sequences. Existing approaches often hinge on\ndiscriminative models tasked with predicting either the absolute positions of\npuzzle elements or the permutation actions applied to the original data.\nUnfortunately, these methods face limitations in effectively solving puzzles\nwith a large number of elements. In this paper, we propose JPDVT, an innovative\napproach that harnesses diffusion transformers to address this challenge.\nSpecifically, we generate positional information for image patches or video\nframes, conditioned on their underlying visual content. This information is\nthen employed to accurately assemble the puzzle pieces in their correct\npositions, even in scenarios involving missing pieces. 
Our method achieves\nstate-of-the-art performance on several datasets.\n","authors":["Jinyang Liu","Wondmgezahu Teshome","Sandesh Ghimire","Mario Sznaier","Octavia Camps"],"pdf_url":"https://arxiv.org/pdf/2404.07292v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2401.06287v2","updated":"2024-04-10T18:16:32Z","published":"2024-01-11T23:00:24Z","title":"Hierarchical Augmentation and Distillation for Class Incremental\n Audio-Visual Video Recognition","summary":" Audio-visual video recognition (AVVR) aims to integrate audio and visual\nclues to categorize videos accurately. While existing methods train AVVR models\nusing provided datasets and achieve satisfactory results, they struggle to\nretain historical class knowledge when confronted with new classes in\nreal-world situations. Currently, there are no dedicated methods for addressing\nthis problem, so this paper concentrates on exploring Class Incremental\nAudio-Visual Video Recognition (CIAVVR). For CIAVVR, since both stored data and\nlearned model of past classes contain historical knowledge, the core challenge\nis how to capture past data knowledge and past model knowledge to prevent\ncatastrophic forgetting. We introduce Hierarchical Augmentation and\nDistillation (HAD), which comprises the Hierarchical Augmentation Module (HAM)\nand Hierarchical Distillation Module (HDM) to efficiently utilize the\nhierarchical structure of data and models, respectively. Specifically, HAM\nimplements a novel augmentation strategy, segmental feature augmentation, to\npreserve hierarchical model knowledge. Meanwhile, HDM introduces newly designed\nhierarchical (video-distribution) logical distillation and hierarchical\n(snippet-video) correlative distillation to capture and maintain the\nhierarchical intra-sample knowledge of each data and the hierarchical\ninter-sample knowledge between data, respectively. Evaluations on four\nbenchmarks (AVE, AVK-100, AVK-200, and AVK-400) demonstrate that the proposed\nHAD effectively captures hierarchical information in both data and models,\nresulting in better preservation of historical class knowledge and improved\nperformance. Furthermore, we provide a theoretical analysis to support the\nnecessity of the segmental feature augmentation strategy.\n","authors":["Yukun Zuo","Hantao Yao","Liansheng Zhuang","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2401.06287v2.pdf","comment":"Submitted to TPAMI"},{"id":"http://arxiv.org/abs/2308.15321v6","updated":"2024-04-10T18:13:00Z","published":"2023-08-29T14:16:09Z","title":"Elucidating the Exposure Bias in Diffusion Models","summary":" Diffusion models have demonstrated impressive generative capabilities, but\ntheir \\textit{exposure bias} problem, described as the input mismatch between\ntraining and sampling, lacks in-depth exploration. In this paper, we\nsystematically investigate the exposure bias problem in diffusion models by\nfirst analytically modelling the sampling distribution, based on which we then\nattribute the prediction error at each sampling step as the root cause of the\nexposure bias issue. Furthermore, we discuss potential solutions to this issue\nand propose an intuitive metric for it. Along with the elucidation of exposure\nbias, we propose a simple, yet effective, training-free method called Epsilon\nScaling to alleviate the exposure bias. 
We show that Epsilon Scaling explicitly\nmoves the sampling trajectory closer to the vector field learned in the\ntraining phase by scaling down the network output, mitigating the input\nmismatch between training and sampling. Experiments on various diffusion\nframeworks (ADM, DDIM, EDM, LDM, DiT, PFGM++) verify the effectiveness of our\nmethod. Remarkably, our ADM-ES, as a state-of-the-art stochastic sampler,\nobtains 2.17 FID on CIFAR-10 under 100-step unconditional generation. The code\nis available at \\url{https://github.com/forever208/ADM-ES} and\n\\url{https://github.com/forever208/EDM-ES}.\n","authors":["Mang Ning","Mingxiao Li","Jianlin Su","Albert Ali Salah","Itir Onal Ertugrul"],"pdf_url":"https://arxiv.org/pdf/2308.15321v6.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2404.06776v1","updated":"2024-04-10T06:35:25Z","published":"2024-04-10T06:35:25Z","title":"Logit Calibration and Feature Contrast for Robust Federated Learning on\n Non-IID Data","summary":" Federated learning (FL) is a privacy-preserving distributed framework for\ncollaborative model training on devices in edge networks. However, challenges\narise due to vulnerability to adversarial examples (AEs) and the\nnon-independent and identically distributed (non-IID) nature of data\ndistribution among devices, hindering the deployment of adversarially robust\nand accurate learning models at the edge. While adversarial training (AT) is\ncommonly acknowledged as an effective defense strategy against adversarial\nattacks in centralized training, we shed light on the adverse effects of\ndirectly applying AT in FL that can severely compromise accuracy, especially in\nnon-IID challenges. Given this limitation, this paper proposes FatCC, which\nincorporates local logit \\underline{C}alibration and global feature\n\\underline{C}ontrast into the vanilla federated adversarial training\n(\\underline{FAT}) process from both logit and feature perspectives. This\napproach can effectively enhance the federated system's robust accuracy (RA)\nand clean accuracy (CA). First, we propose logit calibration, where the logits\nare calibrated during local adversarial updates, thereby improving adversarial\nrobustness. Second, FatCC introduces feature contrast, which involves a global\nalignment term that aligns each local representation with unbiased global\nfeatures, thus further enhancing robustness and accuracy in federated\nadversarial environments. Extensive experiments across multiple datasets\ndemonstrate that FatCC achieves comparable or superior performance gains in\nboth CA and RA compared to other baselines.\n","authors":["Yu Qiao","Chaoning Zhang","Apurba Adhikary","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2404.06776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06773v1","updated":"2024-04-10T06:30:08Z","published":"2024-04-10T06:30:08Z","title":"Adapting LLaMA Decoder to Vision Transformer","summary":" This work examines whether decoder-only Transformers such as LLaMA, which\nwere originally designed for large language models (LLMs), can be adapted to\nthe computer vision field. We first \"LLaMAfy\" a standard ViT step-by-step to\nalign with LLaMA's architecture, and find that directly applying a causal mask\nto the self-attention brings an attention collapse issue, resulting in the\nfailure of network training. 
We suggest repositioning the class token\nbehind the image tokens with a post-sequence class token technique to overcome\nthis challenge, enabling causal self-attention to efficiently capture the\nentire image's information. Additionally, we develop a soft mask strategy that\ngradually introduces a causal mask to the self-attention at the onset of\ntraining to facilitate the optimization behavior. The tailored model, dubbed\nimage LLaMA (iLLaMA), is akin to LLaMA in architecture and enables direct\nsupervised learning. Its causal self-attention boosts computational efficiency\nand learns complex representations by elevating attention map ranks. iLLaMA\nrivals the performance of its encoder-only counterparts, achieving 75.1%\nImageNet top-1 accuracy with only 5.7M parameters. Scaling the model to ~310M\nand pre-training on ImageNet-21K further enhances the accuracy to 86.0%.\nExtensive experiments demonstrate iLLaMA's reliable properties: calibration,\nshape-texture bias, quantization compatibility, ADE20K segmentation and CIFAR\ntransfer learning. We hope our study can kindle fresh views on visual model\ndesign in the wave of LLMs. Pre-trained models and codes are available here.\n","authors":["Jiahao Wang","Wenqi Shao","Mengzhao Chen","Chengyue Wu","Yong Liu","Kaipeng Zhang","Songyang Zhang","Kai Chen","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2404.06773v1.pdf","comment":"22 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.06753v1","updated":"2024-04-10T05:41:05Z","published":"2024-04-10T05:41:05Z","title":"MonoSelfRecon: Purely Self-Supervised Explicit Generalizable 3D\n Reconstruction of Indoor Scenes from Monocular RGB Views","summary":" Current monocular 3D scene reconstruction (3DR) works are either\nfully-supervised, or not generalizable, or implicit in 3D representation. We\npropose a novel framework - MonoSelfRecon that for the first time achieves\nexplicit 3D mesh reconstruction for generalizable indoor scenes with monocular\nRGB views by pure self-supervision on voxel-SDF (signed distance function).\nMonoSelfRecon follows an Autoencoder-based architecture, decodes voxel-SDF and\na generalizable Neural Radiance Field (NeRF), which is used to guide voxel-SDF\nin self-supervision. We propose novel self-supervised losses, which not only\nsupport pure self-supervision, but can be used together with supervised signals\nto further boost supervised training. Our experiments show that \"MonoSelfRecon\"\ntrained in pure self-supervision outperforms current best self-supervised\nindoor depth estimation models and is comparable to 3DR models trained in full\nsupervision with depth annotations. MonoSelfRecon is not restricted to a\nspecific model design and can be applied to any model with voxel-SDF in a\npurely self-supervised manner.\n","authors":["Runfa Li","Upal Mahbub","Vasudev Bhaskaran","Truong Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.06753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06744v1","updated":"2024-04-10T05:10:05Z","published":"2024-04-10T05:10:05Z","title":"YOLO based Ocean Eddy Localization with AWS SageMaker","summary":" Ocean eddies play a significant role both on the sea surface and beneath it,\ncontributing to the sustainability of marine life dependent on oceanic\nbehaviors. Therefore, it is crucial to investigate ocean eddies to monitor\nchanges in the Earth, particularly in the oceans, and their impact on climate.\nThis study aims to pinpoint ocean eddies using AWS cloud services, specifically\nSageMaker. 
The primary objective is to detect small-scale (<20km) ocean eddies\nfrom satellite remote images and assess the feasibility of utilizing SageMaker,\nwhich offers tools for deploying AI applications. Moreover, this research not\nonly explores the deployment of cloud-based services for remote sensing of\nEarth data but also evaluates several YOLO (You Only Look Once) models using\nsingle and multi-GPU-based services in the cloud. Furthermore, this study\nunderscores the potential of these services, their limitations, challenges\nrelated to deployment and resource management, and their user-friendliness for\nEarth science projects.\n","authors":["Seraj Al Mahmud Mostafa","Jinbo Wang","Benjamin Holt","Jianwu Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06744v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.06741v1","updated":"2024-04-10T04:59:51Z","published":"2024-04-10T04:59:51Z","title":"An Animation-based Augmentation Approach for Action Recognition from\n Discontinuous Video","summary":" The study of action recognition has attracted considerable attention recently\ndue to its broad applications in multiple areas. However, the issue of\ndiscontinuous training video, which not only decreases the performance of the\naction recognition model but also complicates the data augmentation process,\nstill remains under-explored. In this study, we introduce the 4A\n(Action Animation-based Augmentation Approach), an innovative pipeline for data\naugmentation to address the problem. The main contributions of our work\ninclude: (1) we investigate the severe performance decrease of action\nrecognition models trained on discontinuous video, and the limitation of\nexisting augmentation methods in solving this problem. (2) we propose a novel\naugmentation pipeline, 4A, to address the problem of discontinuous video for\ntraining, while achieving a smoother and more natural-looking action\nrepresentation than the latest data augmentation methodology. (3) We achieve\nthe same performance with only 10% of the original data for training as with\nall of the original data from the real-world dataset, and a better performance\non in-the-wild videos, by employing our data augmentation techniques.\n","authors":["Xingyu Song","Zhan Li","Shi Chen","Xin-Qiang Cai","Kazuyuki Demachi"],"pdf_url":"https://arxiv.org/pdf/2404.06741v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.02736v4","updated":"2024-04-10T04:51:33Z","published":"2022-11-04T20:22:58Z","title":"Discovering Closed-Loop Failures of Vision-Based Controllers via\n Reachability Analysis","summary":" Machine learning driven image-based controllers allow robotic systems to take\nintelligent actions based on the visual feedback from their environment.\nUnderstanding when these controllers might lead to system safety violations is\nimportant for their integration in safety-critical applications and engineering\ncorrective safety measures for the system. Existing methods leverage\nsimulation-based testing (or falsification) to find the failures of\nvision-based controllers, i.e., the visual inputs that lead to closed-loop\nsafety violations. However, these techniques do not scale well to the scenarios\ninvolving high-dimensional and complex visual inputs, such as RGB images. In\nthis work, we cast the problem of finding closed-loop vision failures as a\nHamilton-Jacobi (HJ) reachability problem. 
Our approach blends simulation-based\nanalysis with HJ reachability methods to compute an approximation of the\nbackward reachable tube (BRT) of the system, i.e., the set of unsafe states for\nthe system under vision-based controllers. Utilizing the BRT, we can tractably\nand systematically find the system states and corresponding visual inputs that\nlead to closed-loop failures. These visual inputs can be subsequently analyzed\nto find the input characteristics that might have caused the failure. Besides\nits scalability to high-dimensional visual inputs, an explicit computation of\nBRT allows the proposed approach to capture non-trivial system failures that\nare difficult to expose via random simulations. We demonstrate our framework on\ntwo case studies involving an RGB image-based neural network controller for (a)\nautonomous indoor navigation, and (b) autonomous aircraft taxiing.\n","authors":["Kaustav Chakraborty","Somil Bansal"],"pdf_url":"https://arxiv.org/pdf/2211.02736v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01289v3","updated":"2024-04-10T04:42:10Z","published":"2023-06-02T06:15:36Z","title":"nnMobileNe: Rethinking CNN for Retinopathy Research","summary":" Over the past few decades, convolutional neural networks (CNNs) have been at\nthe forefront of the detection and tracking of various retinal diseases (RD).\nDespite their success, the emergence of vision transformers (ViT) in the 2020s\nhas shifted the trajectory of RD model development. The leading-edge\nperformance of ViT-based models in RD can be largely credited to their\nscalability-their ability to improve as more parameters are added. As a result,\nViT-based models tend to outshine traditional CNNs in RD applications, albeit\nat the cost of increased data and computational demands. ViTs also differ from\nCNNs in their approach to processing images, working with patches rather than\nlocal regions, which can complicate the precise localization of small, variably\npresented lesions in RD. In our study, we revisited and updated the\narchitecture of a CNN model, specifically MobileNet, to enhance its utility in\nRD diagnostics. We found that an optimized MobileNet, through selective\nmodifications, can surpass ViT-based models in various RD benchmarks, including\ndiabetic retinopathy grading, detection of multiple fundus diseases, and\nclassification of diabetic macular edema. The code is available at\nhttps://github.com/Retinal-Research/NN-MOBILENET\n","authors":["Wenhui Zhu","Peijie Qiu","Xiwen Chen","Xin Li","Natasha Lepore","Oana M. Dumitrascu","Yalin Wang"],"pdf_url":"https://arxiv.org/pdf/2306.01289v3.pdf","comment":"Accepted as a conference paper to 2024 CVPRW"},{"id":"http://arxiv.org/abs/2404.06727v1","updated":"2024-04-10T04:24:42Z","published":"2024-04-10T04:24:42Z","title":"Bayesian NeRF: Quantifying Uncertainty with Volume Density in Neural\n Radiance Fields","summary":" We present the Bayesian Neural Radiance Field (NeRF), which explicitly\nquantifies uncertainty in geometric volume structures without the need for\nadditional networks, making it adept for challenging observations and\nuncontrolled images. NeRF diverges from traditional geometric methods by\noffering an enriched scene representation, rendering color and density in 3D\nspace from various viewpoints. However, NeRF encounters limitations in relaxing\nuncertainties by using geometric structure information, leading to inaccuracies\nin interpretation under insufficient real-world observations. 
Recent research\nefforts aimed at addressing this issue have primarily relied on empirical\nmethods or auxiliary networks. To fundamentally address this issue, we propose\na series of formulational extensions to NeRF. By introducing generalized\napproximations and defining density-related uncertainty, our method seamlessly\nextends to manage uncertainty not only for RGB but also for depth, without the\nneed for additional networks or empirical assumptions. In experiments we show\nthat our method significantly enhances performance on RGB and depth images in\nthe comprehensive dataset, demonstrating the reliability of the Bayesian NeRF\napproach to quantifying uncertainty based on the geometric structure.\n","authors":["Sibeak Lee","Kyeongsu Kang","Hyeonwoo Yu"],"pdf_url":"https://arxiv.org/pdf/2404.06727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03384v2","updated":"2024-04-10T04:24:36Z","published":"2024-04-04T11:33:29Z","title":"LongVLM: Efficient Long Video Understanding via Large Language Models","summary":" Empowered by Large Language Models (LLMs), recent advancements in VideoLLMs\nhave driven progress in various video understanding tasks. These models encode\nvideo representations through pooling or query aggregation over a vast number\nof visual tokens, making computational and memory costs affordable. Despite\nsuccessfully providing an overall comprehension of video content, existing\nVideoLLMs still face challenges in achieving detailed understanding in videos\ndue to overlooking local information in long-term videos. To tackle this\nchallenge, we introduce LongVLM, a straightforward yet powerful VideoLLM for\nlong video understanding, building upon the observation that long videos often\nconsist of sequential key events, complex actions, and camera movements. Our\napproach proposes to decompose long videos into multiple short-term segments\nand encode local features for each local segment via a hierarchical token\nmerging module. These features are concatenated in temporal order to maintain\nthe storyline across sequential short-term segments. Additionally, we propose\nto integrate global semantics into each local feature to enhance context\nunderstanding. In this way, we encode video representations that incorporate\nboth local and global information, enabling the LLM to generate comprehensive\nresponses for long-term videos. Experimental results on the VideoChatGPT\nbenchmark and zero-shot video question-answering datasets demonstrate the\nsuperior capabilities of our model over the previous state-of-the-art methods.\nQualitative examples demonstrate that our model produces more precise responses\nfor long videos understanding. Code will be available at\nhttps://github.com/ziplab/LongVLM.\n","authors":["Yuetian Weng","Mingfei Han","Haoyu He","Xiaojun Chang","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2404.03384v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11848v2","updated":"2024-04-10T04:05:24Z","published":"2024-03-18T15:00:38Z","title":"GraphBEV: Towards Robust BEV Feature Alignment for Multi-Modal 3D Object\n Detection","summary":" Integrating LiDAR and camera information into Bird's-Eye-View (BEV)\nrepresentation has emerged as a crucial aspect of 3D object detection in\nautonomous driving. However, existing methods are susceptible to the inaccurate\ncalibration relationship between LiDAR and the camera sensor. 
Such inaccuracies\nresult in errors in depth estimation for the camera branch, ultimately causing\nmisalignment between LiDAR and camera BEV features. In this work, we propose a\nrobust fusion framework called Graph BEV. Addressing errors caused by\ninaccurate point cloud projection, we introduce a Local Align module that\nemploys neighbor-aware depth features via Graph matching. Additionally, we\npropose a Global Align module to rectify the misalignment between LiDAR and\ncamera BEV features. Our Graph BEV framework achieves state-of-the-art\nperformance, with an mAP of 70.1\\%, surpassing BEV Fusion by 1.6\\% on the\nnuscenes validation set. Importantly, our Graph BEV outperforms BEV Fusion by\n8.3\\% under conditions with misalignment noise.\n","authors":["Ziying Song","Lei Yang","Shaoqing Xu","Lin Liu","Dongyang Xu","Caiyan Jia","Feiyang Jia","Li Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11848v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06194v2","updated":"2024-04-10T04:01:43Z","published":"2024-04-09T10:27:22Z","title":"Exploring the Potential of Large Foundation Models for Open-Vocabulary\n HOI Detection","summary":" Open-vocabulary human-object interaction (HOI) detection, which is concerned\nwith the problem of detecting novel HOIs guided by natural language, is crucial\nfor understanding human-centric scenes. However, prior zero-shot HOI detectors\noften employ the same levels of feature maps to model HOIs with varying\ndistances, leading to suboptimal performance in scenes containing human-object\npairs with a wide range of distances. In addition, these detectors primarily\nrely on category names and overlook the rich contextual information that\nlanguage can provide, which is essential for capturing open vocabulary concepts\nthat are typically rare and not well-represented by category names alone. In\nthis paper, we introduce a novel end-to-end open vocabulary HOI detection\nframework with conditional multi-level decoding and fine-grained semantic\nenhancement (CMD-SE), harnessing the potential of Visual-Language Models\n(VLMs). Specifically, we propose to model human-object pairs with different\ndistances with different levels of feature maps by incorporating a soft\nconstraint during the bipartite matching process. Furthermore, by leveraging\nlarge language models (LLMs) such as GPT models, we exploit their extensive\nworld knowledge to generate descriptions of human body part states for various\ninteractions. Then we integrate the generalizable and fine-grained semantics of\nhuman body parts to improve interaction recognition. Experimental results on\ntwo datasets, SWIG-HOI and HICO-DET, demonstrate that our proposed method\nachieves state-of-the-art results in open vocabulary HOI detection. The code\nand models are available at https://github.com/ltttpku/CMD-SE-release.\n","authors":["Ting Lei","Shaofeng Yin","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06194v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06715v1","updated":"2024-04-10T03:54:53Z","published":"2024-04-10T03:54:53Z","title":"Sparse Points to Dense Clouds: Enhancing 3D Detection with Limited LiDAR\n Data","summary":" 3D detection is a critical task that enables machines to identify and locate\nobjects in three-dimensional space. 
It has a broad range of applications in\nseveral fields, including autonomous driving, robotics and augmented reality.\nMonocular 3D detection is attractive as it requires only a single camera;\nhowever, it lacks the accuracy and robustness required for real-world\napplications. High-resolution LiDAR, on the other hand, can be expensive and\nlead to interference problems in heavy traffic given its active\ntransmissions. We propose a balanced approach that combines the advantages of\nmonocular and point cloud-based 3D detection. Our method requires only a small\nnumber of 3D points that can be obtained from a low-cost, low-resolution\nsensor. Specifically, we use only 512 points, which is just 1% of a full LiDAR\nframe in the KITTI dataset. Our method reconstructs a complete 3D point cloud\nfrom this limited 3D information combined with a single image. The\nreconstructed 3D point cloud and corresponding image can be used by any\nmulti-modal off-the-shelf detector for 3D object detection. By using the\nproposed network architecture with an off-the-shelf multi-modal 3D detector,\nthe accuracy of 3D detection improves by 20% compared to the state-of-the-art\nmonocular detection methods and 6% to 9% compared to the baseline multi-modal\nmethods on KITTI and JackRabbot datasets.\n","authors":["Aakash Kumar","Chen Chen","Ajmal Mian","Neils Lobo","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2404.06715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01929v2","updated":"2024-04-10T03:36:33Z","published":"2024-04-02T13:23:21Z","title":"Towards Enhanced Analysis of Lung Cancer Lesions in EBUS-TBNA -- A\n Semi-Supervised Video Object Detection Method","summary":" This study aims to establish a computer-aided diagnostic system for lung\nlesions using bronchoscope endobronchial ultrasound (EBUS) to assist physicians\nin identifying lesion areas. During EBUS-transbronchial needle aspiration\n(EBUS-TBNA) procedures, physicians rely on grayscale ultrasound images to\ndetermine the location of lesions. However, these images often contain\nsignificant noise and can be influenced by surrounding tissues or blood\nvessels, making interpretation challenging. Previous research has lacked the\napplication of object detection models to EBUS-TBNA, and there has been no\nwell-defined solution for annotating the EBUS-TBNA dataset. In related studies\non ultrasound images, although models have been successful in capturing target\nregions for their respective tasks, their training and predictions have been\nbased on two-dimensional images, limiting their ability to leverage temporal\nfeatures for improved predictions. This study introduces a three-dimensional\nimage-based object detection model. It utilizes an attention mechanism to\ncapture temporal correlations and implements a filtering mechanism to\nselect relevant information from previous frames. Subsequently, a\nteacher-student model training approach is employed to optimize the model\nfurther, leveraging unlabeled data. 
To mitigate the impact of poor-quality\npseudo-labels on the student model, we will add a special Gaussian Mixture\nModel (GMM) to ensure the quality of pseudo-labels.\n","authors":["Jyun-An Lin","Yun-Chien Cheng","Ching-Kai Lin"],"pdf_url":"https://arxiv.org/pdf/2404.01929v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06080v2","updated":"2024-04-10T03:35:35Z","published":"2024-04-09T07:39:21Z","title":"Using Few-Shot Learning to Classify Primary Lung Cancer and Other\n Malignancy with Lung Metastasis in Cytological Imaging via Endobronchial\n Ultrasound Procedures","summary":" This study aims to establish a computer-aided diagnosis system for\nendobronchial ultrasound (EBUS) surgery to assist physicians in the preliminary\ndiagnosis of metastatic cancer. This involves arranging immediate examinations\nfor other sites of metastatic cancer after EBUS surgery, eliminating the need\nto wait for reports, thereby shortening the waiting time by more than half and\nenabling patients to detect other cancers earlier, allowing for early planning\nand implementation of treatment plans. Unlike previous studies on cell image\nclassification, which have abundant datasets for training, this study must also\nbe able to make effective classifications despite the limited amount of case\ndata for lung metastatic cancer. In the realm of small data set classification\nmethods, Few-shot learning (FSL) has become mainstream in recent years. Through\nits ability to train on small datasets and its strong generalization\ncapabilities, FSL shows potential in this task of lung metastatic cell image\nclassification. This study will adopt the approach of Few-shot learning,\nreferencing existing proposed models, and designing a model architecture for\nclassifying lung metastases cell images. Batch Spectral Regularization (BSR)\nwill be incorporated as a loss update parameter, and the Finetune method of PMF\nwill be modified. In terms of test results, the addition of BSR and the\nmodified Finetune method further increases the accuracy by 8.89% to 65.60%,\noutperforming other FSL methods. This study confirms that FSL is superior to\nsupervised and transfer learning in classifying metastatic cancer and\ndemonstrates that using BSR as a loss function and modifying Finetune can\nenhance the model's capabilities.\n","authors":["Ching-Kai Lin","Di-Chun Wei","Yun-Chien Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.06080v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06038v2","updated":"2024-04-10T03:27:04Z","published":"2023-07-12T09:33:21Z","title":"Pyramid Deep Fusion Network for Two-Hand Reconstruction from RGB-D\n Images","summary":" Accurately recovering the dense 3D mesh of both hands from monocular images\nposes considerable challenges due to occlusions and projection ambiguity. Most\nof the existing methods extract features from color images to estimate the\nroot-aligned hand meshes, which neglect the crucial depth and scale information\nin the real world. Given the noisy sensor measurements with limited resolution,\ndepth-based methods predict 3D keypoints rather than a dense mesh. These\nlimitations motivate us to take advantage of these two complementary inputs to\nacquire dense hand meshes on a real-world scale. In this work, we propose an\nend-to-end framework for recovering dense meshes for both hands, which employ\nsingle-view RGB-D image pairs as input. 
The primary challenge lies in\neffectively utilizing two different input modalities to mitigate the blurring\neffects in RGB images and noises in depth images. Instead of directly treating\ndepth maps as additional channels for RGB images, we encode the depth\ninformation into the unordered point cloud to preserve more geometric details.\nSpecifically, our framework employs ResNet50 and PointNet++ to derive features\nfrom RGB and point cloud, respectively. Additionally, we introduce a novel\npyramid deep fusion network (PDFNet) to aggregate features at different scales,\nwhich demonstrates superior efficacy compared to previous fusion strategies.\nFurthermore, we employ a GCN-based decoder to process the fused features and\nrecover the corresponding 3D pose and dense mesh. Through comprehensive\nablation experiments, we have not only demonstrated the effectiveness of our\nproposed fusion algorithm but also outperformed the state-of-the-art approaches\non publicly available datasets. To reproduce the results, we will make our\nsource code and models publicly available at\n{https://github.com/zijinxuxu/PDFNet}.\n","authors":["Jinwei Ren","Jianke Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.06038v2.pdf","comment":"Accepted by TCSVT"},{"id":"http://arxiv.org/abs/2404.06704v1","updated":"2024-04-10T03:20:33Z","published":"2024-04-10T03:20:33Z","title":"Convolution-based Probability Gradient Loss for Semantic Segmentation","summary":" In this paper, we introduce a novel Convolution-based Probability Gradient\n(CPG) loss for semantic segmentation. It employs convolution kernels similar to\nthe Sobel operator, capable of computing the gradient of pixel intensity in an\nimage. This enables the computation of gradients for both ground-truth and\npredicted category-wise probabilities. It enhances network performance by\nmaximizing the similarity between these two probability gradients. Moreover, to\nspecifically enhance accuracy near the object's boundary, we extract the object\nboundary based on the ground-truth probability gradient and exclusively apply\nthe CPG loss to pixels belonging to boundaries. CPG loss proves to be highly\nconvenient and effective. It establishes pixel relationships through\nconvolution, calculating errors from a distinct dimension compared to\npixel-wise loss functions such as cross-entropy loss. We conduct qualitative\nand quantitative analyses to evaluate the impact of the CPG loss on three\nwell-established networks (DeepLabv3-Resnet50, HRNetV2-OCR, and\nLRASPP_MobileNet_V3_Large) across three standard segmentation datasets\n(Cityscapes, COCO-Stuff, ADE20K). Our extensive experimental results\nconsistently and significantly demonstrate that the CPG loss enhances the mean\nIntersection over Union.\n","authors":["Guohang Shan","Shuangcheng Jia"],"pdf_url":"https://arxiv.org/pdf/2404.06704v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.06700v1","updated":"2024-04-10T03:11:10Z","published":"2024-04-10T03:11:10Z","title":"Scaling Multi-Camera 3D Object Detection through Weak-to-Strong\n Eliciting","summary":" The emergence of Multi-Camera 3D Object Detection (MC3D-Det), facilitated by\nbird's-eye view (BEV) representation, signifies a notable progression in 3D\nobject detection. Scaling MC3D-Det training effectively accommodates varied\ncamera parameters and urban landscapes, paving the way for the MC3D-Det\nfoundation model. 
However, the multi-view fusion stage of the MC3D-Det method\nrelies on the ill-posed monocular perception during training rather than\nsurround refinement ability, leading to what we term \"surround refinement\ndegradation\". To this end, our study presents a weak-to-strong eliciting\nframework aimed at enhancing surround refinement while maintaining robust\nmonocular perception. Specifically, our framework employs weakly tuned experts\ntrained on distinct subsets, and each is inherently biased toward specific\ncamera configurations and scenarios. These biased experts can learn the\nperception of monocular degeneration, which can help the multi-view fusion\nstage to enhance surround refinement abilities. Moreover, a composite\ndistillation strategy is proposed to integrate the universal knowledge of 2D\nfoundation models and task-specific information. Finally, for MC3D-Det joint\ntraining, the elaborate dataset merge strategy is designed to solve the problem\nof inconsistent camera numbers and camera parameters. We set up a multiple\ndataset joint training benchmark for MC3D-Det and adequately evaluated existing\nmethods. Further, we demonstrate the proposed framework brings a generalized\nand significant boost over multiple baselines. Our code is at\n\\url{https://github.com/EnVision-Research/Scale-BEV}.\n","authors":["Hao Lu","Jiaqi Tang","Xinli Xu","Xu Cao","Yunpeng Zhang","Guoqing Wang","Dalong Du","Hao Chen","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05645v2","updated":"2024-04-10T03:05:04Z","published":"2023-09-11T17:37:08Z","title":"CitDet: A Benchmark Dataset for Citrus Fruit Detection","summary":" In this letter, we present a new dataset to advance the state of the art in\ndetecting citrus fruit and accurately estimate yield on trees affected by the\nHuanglongbing (HLB) disease in orchard environments via imaging. Despite the\nfact that significant progress has been made in solving the fruit detection\nproblem, the lack of publicly available datasets has complicated direct\ncomparison of results. For instance, citrus detection has long been of interest\nto the agricultural research community, yet there is an absence of work,\nparticularly involving public datasets of citrus affected by HLB. To address\nthis issue, we enhance state-of-the-art object detection methods for use in\ntypical orchard settings. Concretely, we provide high-resolution images of\ncitrus trees located in an area known to be highly affected by HLB, along with\nhigh-quality bounding box annotations of citrus fruit. Fruit on both the trees\nand the ground are labeled to allow for identification of fruit location, which\ncontributes to advancements in yield estimation and potential measure of HLB\nimpact via fruit drop. The dataset consists of over 32,000 bounding box\nannotations for fruit instances contained in 579 high-resolution images. In\nsummary, our contributions are the following: (i) we introduce a novel dataset\nalong with baseline performance benchmarks on multiple contemporary object\ndetection algorithms, (ii) we show the ability to accurately capture fruit\nlocation on tree or on ground, and finally (ii) we present a correlation of our\nresults with yield estimations.\n","authors":["Jordan A. James","Heather K. Manching","Matthew R. Mattia","Kim D. Bowman","Amanda M. Hulse-Kemp","William J. 
Beksi"],"pdf_url":"https://arxiv.org/pdf/2309.05645v2.pdf","comment":"Submitted to IEEE Robotics and Automation Letters (RA-L)"},{"id":"http://arxiv.org/abs/2404.06693v1","updated":"2024-04-10T02:47:05Z","published":"2024-04-10T02:47:05Z","title":"Binomial Self-compensation for Motion Error in Dynamic 3D Scanning","summary":" Phase shifting profilometry (PSP) is favored in high-precision 3D scanning\ndue to its high accuracy, robustness, and pixel-wise property. However, a\nfundamental assumption of PSP that the object should remain static is violated\nin dynamic measurement, making PSP susceptible to object moving, resulting in\nripple-like errors in the point clouds. We propose a pixel-wise and frame-wise\nloopable binomial self-compensation (BSC) algorithm to effectively and flexibly\neliminate motion error in the four-step PSP. Our mathematical model\ndemonstrates that by summing successive motion-affected phase frames weighted\nby binomial coefficients, motion error exponentially diminishes as the binomial\norder increases, accomplishing automatic error compensation through the\nmotion-affected phase sequence, without the assistance of any intermediate\nvariable. Extensive experiments show that our BSC outperforms the existing\nmethods in reducing motion error, while achieving a depth map frame rate equal\nto the camera's acquisition rate (90 fps), enabling high-accuracy 3D\nreconstruction with a quasi-single-shot frame rate.\n","authors":["Geyou Zhang","Ce Zhu","Kai Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06692v1","updated":"2024-04-10T02:40:17Z","published":"2024-04-10T02:40:17Z","title":"Perception-Oriented Video Frame Interpolation via Asymmetric Blending","summary":" Previous methods for Video Frame Interpolation (VFI) have encountered\nchallenges, notably the manifestation of blur and ghosting effects. These\nissues can be traced back to two pivotal factors: unavoidable motion errors and\nmisalignment in supervision. In practice, motion estimates often prove to be\nerror-prone, resulting in misaligned features. Furthermore, the reconstruction\nloss tends to bring blurry results, particularly in misaligned regions. To\nmitigate these challenges, we propose a new paradigm called PerVFI\n(Perception-oriented Video Frame Interpolation). Our approach incorporates an\nAsymmetric Synergistic Blending module (ASB) that utilizes features from both\nsides to synergistically blend intermediate features. One reference frame\nemphasizes primary content, while the other contributes complementary\ninformation. To impose a stringent constraint on the blending process, we\nintroduce a self-learned sparse quasi-binary mask which effectively mitigates\nghosting and blur artifacts in the output. Additionally, we employ a\nnormalizing flow-based generator and utilize the negative log-likelihood loss\nto learn the conditional distribution of the output, which further facilitates\nthe generation of clear and fine details. Experimental results validate the\nsuperiority of PerVFI, demonstrating significant improvements in perceptual\nquality compared to existing methods. 
Codes are available at\n\\url{https://github.com/mulns/PerVFI}\n","authors":["Guangyang Wu","Xin Tao","Changlin Li","Wenyi Wang","Xiaohong Liu","Qingqing Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.06692v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2210.16101v2","updated":"2024-04-10T02:33:57Z","published":"2022-10-27T13:24:08Z","title":"A Generic Shared Attention Mechanism for Various Backbone Neural\n Networks","summary":" The self-attention mechanism has emerged as a critical component for\nimproving the performance of various backbone neural networks. However, current\nmainstream approaches individually incorporate newly designed self-attention\nmodules (SAMs) into each layer of the network for granted without fully\nexploiting their parameters' potential. This leads to suboptimal performance\nand increased parameter consumption as the network depth increases. To improve\nthis paradigm, in this paper, we first present a counterintuitive but inherent\nphenomenon: SAMs tend to produce strongly correlated attention maps across\ndifferent layers, with an average Pearson correlation coefficient of up to\n0.85. Inspired by this inherent observation, we propose Dense-and-Implicit\nAttention (DIA), which directly shares SAMs across layers and employs a long\nshort-term memory module to calibrate and bridge the highly correlated\nattention maps of different layers, thus improving the parameter utilization\nefficiency of SAMs. This design of DIA is also consistent with the neural\nnetwork's dynamical system perspective. Through extensive experiments, we\ndemonstrate that our simple yet effective DIA can consistently enhance various\nnetwork backbones, including ResNet, Transformer, and UNet, across tasks such\nas image classification, object detection, and image generation using diffusion\nmodels.\n","authors":["Zhongzhan Huang","Senwei Liang","Mingfu Liang","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2210.16101v2.pdf","comment":"Work in progress. arXiv admin note: text overlap with\n arXiv:1905.10671"},{"id":"http://arxiv.org/abs/2404.06493v2","updated":"2024-04-10T02:24:58Z","published":"2024-04-09T17:48:52Z","title":"Flying with Photons: Rendering Novel Views of Propagating Light","summary":" We present an imaging and neural rendering technique that seeks to synthesize\nvideos of light propagating through a scene from novel, moving camera\nviewpoints. Our approach relies on a new ultrafast imaging setup to capture a\nfirst-of-its kind, multi-viewpoint video dataset with picosecond-level temporal\nresolution. Combined with this dataset, we introduce an efficient neural volume\nrendering framework based on the transient field. This field is defined as a\nmapping from a 3D point and 2D direction to a high-dimensional, discrete-time\nsignal that represents time-varying radiance at ultrafast timescales. Rendering\nwith transient fields naturally accounts for effects due to the finite speed of\nlight, including viewpoint-dependent appearance changes caused by light\npropagation delays to the camera. We render a range of complex effects,\nincluding scattering, specular reflection, refraction, and diffraction.\nAdditionally, we demonstrate removing viewpoint-dependent propagation delays\nusing a time warping procedure, rendering of relativistic effects, and video\nsynthesis of direct and global components of light transport.\n","authors":["Anagh Malik","Noah Juravsky","Ryan Po","Gordon Wetzstein","Kiriakos N. Kutulakos","David B. 
Lindell"],"pdf_url":"https://arxiv.org/pdf/2404.06493v2.pdf","comment":"Project page: https://anaghmalik.com/FlyingWithPhotons/"},{"id":"http://arxiv.org/abs/2404.06507v2","updated":"2024-04-10T02:23:09Z","published":"2024-04-09T17:55:41Z","title":"Reconstructing Hand-Held Objects in 3D","summary":" Objects manipulated by the hand (i.e., manipulanda) are particularly\nchallenging to reconstruct from in-the-wild RGB images or videos. Not only does\nthe hand occlude much of the object, but also the object is often only visible\nin a small number of image pixels. At the same time, two strong anchors emerge\nin this setting: (1) estimated 3D hands help disambiguate the location and\nscale of the object, and (2) the set of manipulanda is small relative to all\npossible objects. With these insights in mind, we present a scalable paradigm\nfor handheld object reconstruction that builds on recent breakthroughs in large\nlanguage/vision models and 3D object datasets. Our model, MCC-Hand-Object\n(MCC-HO), jointly reconstructs hand and object geometry given a single RGB\nimage and inferred 3D hand as inputs. Subsequently, we use GPT-4(V) to retrieve\na 3D object model that matches the object in the image and rigidly align the\nmodel to the network-inferred geometry; we call this alignment\nRetrieval-Augmented Reconstruction (RAR). Experiments demonstrate that MCC-HO\nachieves state-of-the-art performance on lab and Internet datasets, and we show\nhow RAR can be used to automatically obtain 3D labels for in-the-wild images of\nhand-object interactions.\n","authors":["Jane Wu","Georgios Pavlakos","Georgia Gkioxari","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2404.06507v2.pdf","comment":"Project page: https://janehwu.github.io/mcc-ho"},{"id":"http://arxiv.org/abs/2311.10568v2","updated":"2024-04-10T02:19:19Z","published":"2023-11-17T15:08:15Z","title":"Phase Guided Light Field for Spatial-Depth High Resolution 3D Imaging","summary":" On 3D imaging, light field cameras typically are of single shot, and however,\nthey heavily suffer from low spatial resolution and depth accuracy. In this\npaper, by employing an optical projector to project a group of single\nhigh-frequency phase-shifted sinusoid patterns, we propose a phase guided light\nfield algorithm to significantly improve both the spatial and depth resolutions\nfor off-the-shelf light field cameras. First, for correcting the axial\naberrations caused by the main lens of our light field camera, we propose a\ndeformed cone model to calibrate our structured light field system. Second,\nover wrapped phases computed from patterned images, we propose a stereo\nmatching algorithm, i.e. phase guided sum of absolute difference, to robustly\nobtain the correspondence for each pair of neighbored two lenslets. Finally, by\nintroducing a virtual camera according to the basic geometrical optics of light\nfield imaging, we propose a reorganization strategy to reconstruct 3D point\nclouds with spatial-depth high resolution. 
Experimental results show that,\ncompared with the state-of-the-art active light field methods, the proposed\nreconstructs 3D point clouds with a spatial resolution of 1280$\\times$720 with\nfactors 10$\\times$ increased, while maintaining the same high depth resolution\nand needing merely a single group of high-frequency patterns.\n","authors":["Geyou Zhang","Ce Zhu","Kai Liu","Yipeng Liu"],"pdf_url":"https://arxiv.org/pdf/2311.10568v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06479v2","updated":"2024-04-10T02:12:27Z","published":"2024-04-09T17:30:18Z","title":"Text-Based Reasoning About Vector Graphics","summary":" While large multimodal models excel in broad vision-language benchmarks, they\noften struggle with tasks requiring precise perception of low-level visual\ndetails, such as comparing line lengths or solving simple mazes. In particular,\nthis failure mode persists in question-answering tasks about vector graphics --\nimages composed purely of 2D objects and shapes. To address this challenge, we\npropose the Visually Descriptive Language Model (VDLM), which performs\ntext-based reasoning about vector graphics. VDLM leverages Scalable Vector\nGraphics (SVG) for a more precise visual description and first uses an\noff-the-shelf raster-to-SVG algorithm for encoding. Since existing language\nmodels cannot understand raw SVGs in a zero-shot setting, VDLM then bridges SVG\nwith pretrained language models through a newly introduced intermediate\nsymbolic representation, Primal Visual Description (PVD), comprising primitive\nattributes (e.g., shape, position, measurement) with their corresponding\npredicted values. PVD is task-agnostic and represents visual primitives that\nare universal across all vector graphics. It can be learned with procedurally\ngenerated (SVG, PVD) pairs and also enables the direct use of LLMs for\ngeneralization to complex reasoning tasks. By casting an image to a text-based\nrepresentation, we can leverage the power of language models to learn alignment\nfrom SVG to visual primitives and generalize to unseen question-answering\ntasks. Empirical results show that VDLM achieves stronger zero-shot performance\ncompared to state-of-the-art LMMs, such as GPT-4V, in various low-level\nmultimodal perception and reasoning tasks on vector graphics. We additionally\npresent extensive analyses on VDLM's performance, demonstrating that our\nframework offers better interpretability due to its disentangled perception and\nreasoning processes. Project page: https://mikewangwzhl.github.io/VDLM/\n","authors":["Zhenhailong Wang","Joy Hsu","Xingyao Wang","Kuan-Hao Huang","Manling Li","Jiajun Wu","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2404.06479v2.pdf","comment":"Project page: https://mikewangwzhl.github.io/VDLM/"},{"id":"http://arxiv.org/abs/2404.06683v1","updated":"2024-04-10T02:03:14Z","published":"2024-04-10T02:03:14Z","title":"Unsupervised Visible-Infrared ReID via Pseudo-label Correction and\n Modality-level Alignment","summary":" Unsupervised visible-infrared person re-identification (UVI-ReID) has\nrecently gained great attention due to its potential for enhancing human\ndetection in diverse environments without labeling. Previous methods utilize\nintra-modality clustering and cross-modality feature matching to achieve\nUVI-ReID. 
However, there exist two challenges: 1) noisy pseudo labels might be\ngenerated in the clustering process, and 2) the cross-modality feature\nalignment via matching the marginal distribution of visible and infrared\nmodalities may misalign the different identities from two modalities. In this\npaper, we first conduct a theoretic analysis where an interpretable\ngeneralization upper bound is introduced. Based on the analysis, we then\npropose a novel unsupervised cross-modality person re-identification framework\n(PRAISE). Specifically, to address the first challenge, we propose a\npseudo-label correction strategy that utilizes a Beta Mixture Model to predict\nthe probability of mis-clustering based network's memory effect and rectifies\nthe correspondence by adding a perceptual term to contrastive learning. Next,\nwe introduce a modality-level alignment strategy that generates paired\nvisible-infrared latent features and reduces the modality gap by aligning the\nlabeling function of visible and infrared features to learn identity\ndiscriminative and modality-invariant features. Experimental results on two\nbenchmark datasets demonstrate that our method achieves state-of-the-art\nperformance than the unsupervised visible-ReID methods.\n","authors":["Yexin Liu","Weiming Zhang","Athanasios V. Vasilakos","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06683v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.02065v2","updated":"2024-04-10T01:53:17Z","published":"2024-04-02T16:06:20Z","title":"Multi-Level Label Correction by Distilling Proximate Patterns for\n Semi-supervised Semantic Segmentation","summary":" Semi-supervised semantic segmentation relieves the reliance on large-scale\nlabeled data by leveraging unlabeled data. Recent semi-supervised semantic\nsegmentation approaches mainly resort to pseudo-labeling methods to exploit\nunlabeled data. However, unreliable pseudo-labeling can undermine the\nsemi-supervision processes. In this paper, we propose an algorithm called\nMulti-Level Label Correction (MLLC), which aims to use graph neural networks to\ncapture structural relationships in Semantic-Level Graphs (SLGs) and\nClass-Level Graphs (CLGs) to rectify erroneous pseudo-labels. Specifically,\nSLGs represent semantic affinities between pairs of pixel features, and CLGs\ndescribe classification consistencies between pairs of pixel labels. With the\nsupport of proximate pattern information from graphs, MLLC can rectify\nincorrectly predicted pseudo-labels and can facilitate discriminative feature\nrepresentations. We design an end-to-end network to train and perform this\neffective label corrections mechanism. Experiments demonstrate that MLLC can\nsignificantly improve supervised baselines and outperforms state-of-the-art\napproaches in different scenarios on Cityscapes and PASCAL VOC 2012 datasets.\nSpecifically, MLLC improves the supervised baseline by at least 5% and 2% with\nDeepLabV2 and DeepLabV3+ respectively under different partition protocols.\n","authors":["Hui Xiao","Yuting Hong","Li Dong","Diqun Yan","Jiayan Zhuang","Junjie Xiong","Dongtai Liang","Chengbin Peng"],"pdf_url":"https://arxiv.org/pdf/2404.02065v2.pdf","comment":"12 pages, 8 figures. 
IEEE Transactions on Multimedia, 2024"},{"id":"http://arxiv.org/abs/2301.04218v4","updated":"2024-04-10T01:11:15Z","published":"2023-01-10T21:50:26Z","title":"Leveraging Diffusion For Strong and High Quality Face Morphing Attacks","summary":" Face morphing attacks seek to deceive a Face Recognition (FR) system by\npresenting a morphed image consisting of the biometric qualities from two\ndifferent identities with the aim of triggering a false acceptance with one of\nthe two identities, thereby presenting a significant threat to biometric\nsystems. The success of a morphing attack is dependent on the ability of the\nmorphed image to represent the biometric characteristics of both identities\nthat were used to create the image. We present a novel morphing attack that\nuses a Diffusion-based architecture to improve the visual fidelity of the image\nand the ability of the morphing attack to represent characteristics from both\nidentities. We demonstrate the effectiveness of the proposed attack by\nevaluating its visual fidelity via the Frechet Inception Distance (FID). Also,\nextensive experiments are conducted to measure the vulnerability of FR systems\nto the proposed attack. The ability of a morphing attack detector to detect the\nproposed attack is measured and compared against two state-of-the-art GAN-based\nmorphing attacks along with two Landmark-based attacks. Additionally, a novel\nmetric to measure the relative strength between different morphing attacks is\nintroduced and evaluated.\n","authors":["Zander W. Blasingame","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2301.04218v4.pdf","comment":"Diffusion Morphs (DiM) paper. Accepted in IEEE TBIOM"},{"id":"http://arxiv.org/abs/2404.05215v2","updated":"2024-04-10T00:49:11Z","published":"2024-04-08T06:07:32Z","title":"Spatio-Temporal Attention and Gaussian Processes for Personalized Video\n Gaze Estimation","summary":" Gaze is an essential prompt for analyzing human behavior and attention.\nRecently, there has been an increasing interest in determining gaze direction\nfrom facial videos. However, video gaze estimation faces significant\nchallenges, such as understanding the dynamic evolution of gaze in video\nsequences, dealing with static backgrounds, and adapting to variations in\nillumination. To address these challenges, we propose a simple and novel deep\nlearning model designed to estimate gaze from videos, incorporating a\nspecialized attention module. Our method employs a spatial attention mechanism\nthat tracks spatial dynamics within videos. This technique enables accurate\ngaze direction prediction through a temporal sequence model, adeptly\ntransforming spatial observations into temporal insights, thereby significantly\nimproving gaze estimation accuracy. Additionally, our approach integrates\nGaussian processes to include individual-specific traits, facilitating the\npersonalization of our model with just a few labeled samples. Experimental\nresults confirm the efficacy of the proposed approach, demonstrating its\nsuccess in both within-dataset and cross-dataset settings. Specifically, our\nproposed approach achieves state-of-the-art performance on the Gaze360 dataset,\nimproving by $2.5^\\circ$ without personalization. Further, by personalizing the\nmodel with just three samples, we achieved an additional improvement of\n$0.8^\\circ$. 
The code and pre-trained models are available at\n\\url{https://github.com/jswati31/stage}.\n","authors":["Swati Jindal","Mohit Yadav","Roberto Manduchi"],"pdf_url":"https://arxiv.org/pdf/2404.05215v2.pdf","comment":"Accepted at CVPR 2024 Gaze workshop"},{"id":"http://arxiv.org/abs/2404.06666v1","updated":"2024-04-10T00:26:08Z","published":"2024-04-10T00:26:08Z","title":"SafeGen: Mitigating Unsafe Content Generation in Text-to-Image Models","summary":" Text-to-image (T2I) models, such as Stable Diffusion, have exhibited\nremarkable performance in generating high-quality images from text descriptions\nin recent years. However, text-to-image models may be tricked into generating\nnot-safe-for-work (NSFW) content, particularly in sexual scenarios. Existing\ncountermeasures mostly focus on filtering inappropriate inputs and outputs, or\nsuppressing improper text embeddings, which can block explicit NSFW-related\ncontent (e.g., naked or sexy) but may still be vulnerable to adversarial\nprompts inputs that appear innocent but are ill-intended. In this paper, we\npresent SafeGen, a framework to mitigate unsafe content generation by\ntext-to-image models in a text-agnostic manner. The key idea is to eliminate\nunsafe visual representations from the model regardless of the text input. In\nthis way, the text-to-image model is resistant to adversarial prompts since\nunsafe visual representations are obstructed from within. Extensive experiments\nconducted on four datasets demonstrate SafeGen's effectiveness in mitigating\nunsafe content generation while preserving the high-fidelity of benign images.\nSafeGen outperforms eight state-of-the-art baseline methods and achieves 99.1%\nsexual content removal performance. Furthermore, our constructed benchmark of\nadversarial prompts provides a basis for future development and evaluation of\nanti-NSFW-generation methods.\n","authors":["Xinfeng Li","Yuchen Yang","Jiangyi Deng","Chen Yan","Yanjiao Chen","Xiaoyu Ji","Wenyuan Xu"],"pdf_url":"https://arxiv.org/pdf/2404.06666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06665v1","updated":"2024-04-10T00:25:09Z","published":"2024-04-10T00:25:09Z","title":"Deep Generative Data Assimilation in Multimodal Setting","summary":" Robust integration of physical knowledge and data is key to improve\ncomputational simulations, such as Earth system models. Data assimilation is\ncrucial for achieving this goal because it provides a systematic framework to\ncalibrate model outputs with observations, which can include remote sensing\nimagery and ground station measurements, with uncertainty quantification.\nConventional methods, including Kalman filters and variational approaches,\ninherently rely on simplifying linear and Gaussian assumptions, and can be\ncomputationally expensive. Nevertheless, with the rapid adoption of data-driven\nmethods in many areas of computational sciences, we see the potential of\nemulating traditional data assimilation with deep learning, especially\ngenerative models. In particular, the diffusion-based probabilistic framework\nhas large overlaps with data assimilation principles: both allows for\nconditional generation of samples with a Bayesian inverse framework. These\nmodels have shown remarkable success in text-conditioned image generation or\nimage-controlled video synthesis. Likewise, one can frame data assimilation as\nobservation-conditioned state calibration. In this work, we propose SLAMS:\nScore-based Latent Assimilation in Multimodal Setting. 
Specifically, we\nassimilate in-situ weather station data and ex-situ satellite imagery to\ncalibrate the vertical temperature profiles, globally. Through extensive\nablation, we demonstrate that SLAMS is robust even in low-resolution, noisy,\nand sparse data settings. To our knowledge, our work is the first to apply deep\ngenerative framework for multimodal data assimilation using real-world\ndatasets; an important step for building robust computational simulators,\nincluding the next-generation Earth system models. Our code is available at:\nhttps://github.com/yongquan-qu/SLAMS\n","authors":["Yongquan Qu","Juan Nathaniel","Shuolin Li","Pierre Gentine"],"pdf_url":"https://arxiv.org/pdf/2404.06665v1.pdf","comment":"Accepted to CVPR2024 EarthVision"},{"id":"http://arxiv.org/abs/2404.06663v1","updated":"2024-04-10T00:11:03Z","published":"2024-04-10T00:11:03Z","title":"Multi-modal Document Presentation Attack Detection With Forensics Trace\n Disentanglement","summary":" Document Presentation Attack Detection (DPAD) is an important measure in\nprotecting the authenticity of a document image. However, recent DPAD methods\ndemand additional resources, such as manual effort in collecting additional\ndata or knowing the parameters of acquisition devices. This work proposes a\nDPAD method based on multi-modal disentangled traces (MMDT) without the above\ndrawbacks. We first disentangle the recaptured traces by a self-supervised\ndisentanglement and synthesis network to enhance the generalization capacity in\ndocument images with different contents and layouts. Then, unlike the existing\nDPAD approaches that rely only on data in the RGB domain, we propose to\nexplicitly employ the disentangled recaptured traces as new modalities in the\ntransformer backbone through adaptive multi-modal adapters to fuse RGB/trace\nfeatures efficiently. Visualization of the disentangled traces confirms the\neffectiveness of the proposed method in different document contents. Extensive\nexperiments on three benchmark datasets demonstrate the superiority of our MMDT\nmethod on representing forensic traces of recapturing distortion.\n","authors":["Changsheng Chen","Yongyi Deng","Liangwei Lin","Zitong Yu","Zhimao Lai"],"pdf_url":"https://arxiv.org/pdf/2404.06663v1.pdf","comment":"Accepted to ICME 2024"},{"id":"http://arxiv.org/abs/2404.06661v1","updated":"2024-04-10T00:05:55Z","published":"2024-04-10T00:05:55Z","title":"Efficient Denoising using Score Embedding in Score-based Diffusion\n Models","summary":" It is well known that training a denoising score-based diffusion models\nrequires tens of thousands of epochs and a substantial number of image data to\ntrain the model. In this paper, we propose to increase the efficiency in\ntraining score-based diffusion models. Our method allows us to decrease the\nnumber of epochs needed to train the diffusion model. We accomplish this by\nsolving the log-density Fokker-Planck (FP) Equation numerically to compute the\nscore \\textit{before} training. The pre-computed score is embedded into the\nimage to encourage faster training under slice Wasserstein distance.\nConsequently, it also allows us to decrease the number of images we need to\ntrain the neural network to learn an accurate score. We demonstrate through our\nnumerical experiments the improved performance of our proposed method compared\nto standard score-based diffusion models. Our proposed method achieves a\nsimilar quality to the standard method meaningfully faster.\n","authors":["Andrew S. Na","William Gao","Justin W. 
L. Wan"],"pdf_url":"https://arxiv.org/pdf/2404.06661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08017v1","updated":"2024-04-10T19:16:08Z","published":"2024-04-10T19:16:08Z","title":"AI-Guided Feature Segmentation Techniques to Model Features from Single\n Crystal Diamond Growth","summary":" Process refinement to consistently produce high-quality material over a large\narea of the grown crystal, enabling various applications from optics crystals\nto quantum detectors, has long been a goal for diamond growth. Machine learning\noffers a promising path toward this goal, but faces challenges such as the\ncomplexity of features within datasets, their time-dependency, and the volume\nof data produced per growth run. Accurate spatial feature extraction from image\nto image for real-time monitoring of diamond growth is crucial yet complicated\ndue to the low-volume and high feature complexity nature of the datasets. This\npaper compares various traditional and machine learning-driven approaches for\nfeature extraction in the diamond growth domain, proposing a novel deep\nlearning-driven semantic segmentation approach to isolate and classify accurate\npixel masks of geometric features like diamond, pocket holder, and background,\nalong with their derivative features based on shape and size. Using an\nannotation-focused human-in-the-loop software architecture for training\ndatasets, with modules for selective data labeling using active learning, data\naugmentations, and model-assisted labeling, our approach achieves effective\nannotation accuracy and drastically reduces labeling time and cost. Deep\nlearning algorithms prove highly efficient in accurately learning complex\nrepresentations from datasets with many features. Our top-performing model,\nbased on the DeeplabV3plus architecture, achieves outstanding accuracy in\nclassifying features of interest, with accuracies of 96.31% for pocket holder,\n98.60% for diamond top, and 91.64% for diamond side features.\n","authors":["Rohan Reddy Mekala","Elias Garratt","Matthias Muehle","Arjun Srinivasan","Adam Porter","Mikael Lindvall"],"pdf_url":"https://arxiv.org/pdf/2404.08017v1.pdf","comment":"12 pages,4 figures,ACMME 2024. arXiv admin note: substantial text\n overlap with arXiv:2404.07306"},{"id":"http://arxiv.org/abs/2404.08013v1","updated":"2024-04-10T15:37:15Z","published":"2024-04-10T15:37:15Z","title":"Enhanced Cooperative Perception for Autonomous Vehicles Using Imperfect\n Communication","summary":" Sharing and joint processing of camera feeds and sensor measurements, known\nas Cooperative Perception (CP), has emerged as a new technique to achieve\nhigher perception qualities. CP can enhance the safety of Autonomous Vehicles\n(AVs) where their individual visual perception quality is compromised by\nadverse weather conditions (haze as foggy weather), low illumination, winding\nroads, and crowded traffic. To cover the limitations of former methods, in this\npaper, we propose a novel approach to realize an optimized CP under constrained\ncommunications. At the core of our approach is recruiting the best helper from\nthe available list of front vehicles to augment the visual range and enhance\nthe Object Detection (OD) accuracy of the ego vehicle. In this two-step\nprocess, we first select the helper vehicles that contribute the most to CP\nbased on their visual range and lowest motion blur. Next, we implement a radio\nblock optimization among the candidate vehicles to further improve\ncommunication efficiency. 
We specifically focus on pedestrian detection as an\nexemplary scenario. To validate our approach, we used the CARLA simulator to\ncreate a dataset of annotated videos for different driving scenarios where\npedestrian detection is challenging for an AV with compromised vision. Our\nresults demonstrate the efficacy of our two-step optimization process in\nimproving the overall performance of cooperative perception in challenging\nscenarios, substantially improving driving safety under adverse conditions.\nFinally, we note that the networking assumptions are adopted from LTE Release\n14 Mode 4 side-link communication, commonly used for Vehicle-to-Vehicle (V2V)\ncommunication. Nonetheless, our method is flexible and applicable to arbitrary\nV2V communications.\n","authors":["Ahmad Sarlak","Hazim Alzorgan","Sayed Pedram Haeri Boroujeni","Abolfazl Razi","Rahul Amin"],"pdf_url":"https://arxiv.org/pdf/2404.08013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08011v1","updated":"2024-04-10T06:30:33Z","published":"2024-04-10T06:30:33Z","title":"An inclusive review on deep learning techniques and their scope in\n handwriting recognition","summary":" Deep learning expresses a category of machine learning algorithms that have\nthe capability to combine raw inputs into intermediate features layers. These\ndeep learning algorithms have demonstrated great results in different fields.\nDeep learning has particularly witnessed for a great achievement of human level\nperformance across a number of domains in computer vision and pattern\nrecognition. For the achievement of state-of-the-art performances in diverse\ndomains, the deep learning used different architectures and these architectures\nused activation functions to perform various computations between hidden and\noutput layers of any architecture. This paper presents a survey on the existing\nstudies of deep learning in handwriting recognition field. Even though the\nrecent progress indicates that the deep learning methods has provided valuable\nmeans for speeding up or proving accurate results in handwriting recognition,\nbut following from the extensive literature survey, the present study finds\nthat the deep learning has yet to revolutionize more and has to resolve many of\nthe most pressing challenges in this field, but promising advances have been\nmade on the prior state of the art. Additionally, an inadequate availability of\nlabelled data to train presents problems in this domain. Nevertheless, the\npresent handwriting recognition survey foresees deep learning enabling changes\nat both bench and bedside with the potential to transform several domains as\nimage processing, speech recognition, computer vision, machine translation,\nrobotics and control, medical imaging, medical information processing,\nbio-informatics, natural language processing, cyber security, and many others.\n","authors":["Sukhdeep Singh","Sudhir Rohilla","Anuj Sharma"],"pdf_url":"https://arxiv.org/pdf/2404.08011v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07351v1","updated":"2024-04-10T21:14:33Z","published":"2024-04-10T21:14:33Z","title":"A Transformer-Based Model for the Prediction of Human Gaze Behavior on\n Videos","summary":" Eye-tracking applications that utilize the human gaze in video understanding\ntasks have become increasingly important. To effectively automate the process\nof video analysis based on eye-tracking data, it is important to accurately\nreplicate human gaze behavior. 
However, this task presents significant\nchallenges due to the inherent complexity and ambiguity of human gaze patterns.\nIn this work, we introduce a novel method for simulating human gaze behavior.\nOur approach uses a transformer-based reinforcement learning algorithm to train\nan agent that acts as a human observer, with the primary role of watching\nvideos and simulating human gaze behavior. We employed an eye-tracking dataset\ngathered from videos generated by the VirtualHome simulator, with a primary\nfocus on activity recognition. Our experimental results demonstrate the\neffectiveness of our gaze prediction method by highlighting its capability to\nreplicate human gaze behavior and its applicability for downstream tasks where\nreal human-gaze is used as input.\n","authors":["Suleyman Ozdel","Yao Rong","Berat Mert Albaba","Yen-Ling Kuo","Xi Wang","Enkelejda Kasneci"],"pdf_url":"https://arxiv.org/pdf/2404.07351v1.pdf","comment":"2024 Symposium on Eye Tracking Research and Applications (ETRA24),\n Glasgow, United Kingdom"},{"id":"http://arxiv.org/abs/2404.07347v1","updated":"2024-04-10T21:03:23Z","published":"2024-04-10T21:03:23Z","title":"Gaze-Guided Graph Neural Network for Action Anticipation Conditioned on\n Intention","summary":" Humans utilize their gaze to concentrate on essential information while\nperceiving and interpreting intentions in videos. Incorporating human gaze into\ncomputational algorithms can significantly enhance model performance in video\nunderstanding tasks. In this work, we address a challenging and innovative task\nin video understanding: predicting the actions of an agent in a video based on\na partial video. We introduce the Gaze-guided Action Anticipation algorithm,\nwhich establishes a visual-semantic graph from the video input. Our method\nutilizes a Graph Neural Network to recognize the agent's intention and predict\nthe action sequence to fulfill this intention. To assess the efficiency of our\napproach, we collect a dataset containing household activities generated in the\nVirtualHome environment, accompanied by human gaze data of viewing videos. Our\nmethod outperforms state-of-the-art techniques, achieving a 7\\% improvement in\naccuracy for 18-class intention recognition. This highlights the efficiency of\nour method in learning important features from human gaze data.\n","authors":["Suleyman Ozdel","Yao Rong","Berat Mert Albaba","Yen-Ling Kuo","Xi Wang","Enkelejda Kasneci"],"pdf_url":"https://arxiv.org/pdf/2404.07347v1.pdf","comment":"2024 Symposium on Eye Tracking Research and Applications (ETRA24),\n Glasgow, United Kingdom"}]},"2024-04-11T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.07992v1","updated":"2024-04-11T17:59:59Z","published":"2024-04-11T17:59:59Z","title":"GoMVS: Geometrically Consistent Cost Aggregation for Multi-View Stereo","summary":" Matching cost aggregation plays a fundamental role in learning-based\nmulti-view stereo networks. However, directly aggregating adjacent costs can\nlead to suboptimal results due to local geometric inconsistency. Related\nmethods either seek selective aggregation or improve aggregated depth in the 2D\nspace, both are unable to handle geometric inconsistency in the cost volume\neffectively. In this paper, we propose GoMVS to aggregate geometrically\nconsistent costs, yielding better utilization of adjacent geometries. 
More\nspecifically, we correspond and propagate adjacent costs to the reference pixel\nby leveraging the local geometric smoothness in conjunction with surface\nnormals. We achieve this by the geometric consistent propagation (GCP) module.\nIt computes the correspondence from the adjacent depth hypothesis space to the\nreference depth space using surface normals, then uses the correspondence to\npropagate adjacent costs to the reference geometry, followed by a convolution\nfor aggregation. Our method achieves new state-of-the-art performance on DTU,\nTanks & Temple, and ETH3D datasets. Notably, our method ranks 1st on the Tanks\n& Temple Advanced benchmark.\n","authors":["Jiang Wu","Rui Li","Haofei Xu","Wenxun Zhao","Yu Zhu","Jinqiu Sun","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.07992v1.pdf","comment":"CVPR 2024. Project page: https://wuuu3511.github.io/gomvs/ Code:\n https://github.com/Wuuu3511/GoMVS"},{"id":"http://arxiv.org/abs/2404.07993v1","updated":"2024-04-11T17:59:59Z","published":"2024-04-11T17:59:59Z","title":"Connecting NeRFs, Images, and Text","summary":" Neural Radiance Fields (NeRFs) have emerged as a standard framework for\nrepresenting 3D scenes and objects, introducing a novel data type for\ninformation exchange and storage. Concurrently, significant progress has been\nmade in multimodal representation learning for text and image data. This paper\nexplores a novel research direction that aims to connect the NeRF modality with\nother modalities, similar to established methodologies for images and text. To\nthis end, we propose a simple framework that exploits pre-trained models for\nNeRF representations alongside multimodal models for text and image processing.\nOur framework learns a bidirectional mapping between NeRF embeddings and those\nobtained from corresponding images and text. This mapping unlocks several novel\nand useful applications, including NeRF zero-shot classification and NeRF\nretrieval from images or text.\n","authors":["Francesco Ballerini","Pierluigi Zama Ramirez","Roberto Mirabella","Samuele Salti","Luigi Di Stefano"],"pdf_url":"https://arxiv.org/pdf/2404.07993v1.pdf","comment":"Accepted at CVPRW-INRV 2024"},{"id":"http://arxiv.org/abs/2404.07991v1","updated":"2024-04-11T17:59:57Z","published":"2024-04-11T17:59:57Z","title":"GoMAvatar: Efficient Animatable Human Modeling from Monocular Video\n Using Gaussians-on-Mesh","summary":" We introduce GoMAvatar, a novel approach for real-time, memory-efficient,\nhigh-quality animatable human modeling. GoMAvatar takes as input a single\nmonocular video to create a digital avatar capable of re-articulation in new\nposes and real-time rendering from novel viewpoints, while seamlessly\nintegrating with rasterization-based graphics pipelines. Central to our method\nis the Gaussians-on-Mesh representation, a hybrid 3D model combining rendering\nquality and speed of Gaussian splatting with geometry modeling and\ncompatibility of deformable meshes. We assess GoMAvatar on ZJU-MoCap data and\nvarious YouTube videos. GoMAvatar matches or surpasses current monocular human\nmodeling algorithms in rendering quality and significantly outperforms them in\ncomputational efficiency (43 FPS) while being memory-efficient (3.63 MB per\nsubject).\n","authors":["Jing Wen","Xiaoming Zhao","Zhongzheng Ren","Alexander G. 
Schwing","Shenlong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07991v1.pdf","comment":"CVPR 2024; project page: https://wenj.github.io/GoMAvatar/"},{"id":"http://arxiv.org/abs/2404.07990v1","updated":"2024-04-11T17:59:56Z","published":"2024-04-11T17:59:56Z","title":"OpenBias: Open-set Bias Detection in Text-to-Image Generative Models","summary":" Text-to-image generative models are becoming increasingly popular and\naccessible to the general public. As these models see large-scale deployments,\nit is necessary to deeply investigate their safety and fairness to not\ndisseminate and perpetuate any kind of biases. However, existing works focus on\ndetecting closed sets of biases defined a priori, limiting the studies to\nwell-known concepts. In this paper, we tackle the challenge of open-set bias\ndetection in text-to-image generative models presenting OpenBias, a new\npipeline that identifies and quantifies the severity of biases agnostically,\nwithout access to any precompiled set. OpenBias has three stages. In the first\nphase, we leverage a Large Language Model (LLM) to propose biases given a set\nof captions. Secondly, the target generative model produces images using the\nsame set of captions. Lastly, a Vision Question Answering model recognizes the\npresence and extent of the previously proposed biases. We study the behavior of\nStable Diffusion 1.5, 2, and XL emphasizing new biases, never investigated\nbefore. Via quantitative experiments, we demonstrate that OpenBias agrees with\ncurrent closed-set bias detection methods and human judgement.\n","authors":["Moreno D'Incà","Elia Peruzzo","Massimiliano Mancini","Dejia Xu","Vidit Goel","Xingqian Xu","Zhangyang Wang","Humphrey Shi","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2404.07990v1.pdf","comment":"CVPR 2024 Highlight - Code:\n https://github.com/Picsart-AI-Research/OpenBias"},{"id":"http://arxiv.org/abs/2404.07989v1","updated":"2024-04-11T17:59:45Z","published":"2024-04-11T17:59:45Z","title":"Any2Point: Empowering Any-modality Large Models for Efficient 3D\n Understanding","summary":" Large foundation models have recently emerged as a prominent focus of\ninterest, attaining superior performance in widespread scenarios. Due to the\nscarcity of 3D data, many efforts have been made to adapt pre-trained\ntransformers from vision to 3D domains. However, such 2D-to-3D approaches are\nstill limited, due to the potential loss of spatial geometries and high\ncomputation cost. More importantly, their frameworks are mainly designed for 2D\nmodels, lacking a general any-to-3D paradigm. In this paper, we introduce\nAny2Point, a parameter-efficient method to empower any-modality large models\n(vision, language, audio) for 3D understanding. Given a frozen transformer from\nany source modality, we propose a 3D-to-any (1D or 2D) virtual projection\nstrategy that correlates the input 3D points to the original 1D or 2D positions\nwithin the source modality. This mechanism enables us to assign each 3D token\nwith a positional encoding paired with the pre-trained model, which avoids 3D\ngeometry loss caused by the true projection and better motivates the\ntransformer for 3D learning with 1D/2D positional priors. Then, within each\ntransformer block, we insert an any-to-3D guided adapter module for\nparameter-efficient fine-tuning. The adapter incorporates prior spatial\nknowledge from the source modality to guide the local feature aggregation of 3D\ntokens, compelling the semantic adaption of any-modality transformers. 
We\nconduct extensive experiments to showcase the effectiveness and efficiency of\nour method. Code and models are released at\nhttps://github.com/Ivan-Tang-3D/Any2Point.\n","authors":["Yiwen Tang","Jiaming Liu","Dong Wang","Zhigang Wang","Shanghang Zhang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.07989v1.pdf","comment":"Code and models are released at\n https://github.com/Ivan-Tang-3D/Any2Point"},{"id":"http://arxiv.org/abs/2401.10222v2","updated":"2024-04-11T17:59:42Z","published":"2024-01-18T18:58:54Z","title":"Supervised Fine-tuning in turn Improves Visual Foundation Models","summary":" Image-text training like CLIP has dominated the pretraining of vision\nfoundation models in recent years. Subsequent efforts have been made to\nintroduce region-level visual learning into CLIP's pretraining but face\nscalability challenges due to the lack of large-scale region-level datasets.\nDrawing inspiration from supervised fine-tuning (SFT) in natural language\nprocessing such as instruction tuning, we explore the potential of fine-grained\nSFT in enhancing the generation of vision foundation models after their\npretraining. Thus a two-stage method ViSFT (Vision SFT) is proposed to unleash\nthe fine-grained knowledge of vision foundation models. In ViSFT, the vision\nfoundation model is enhanced by performing visual joint learning on some\nin-domain tasks and then tested on out-of-domain benchmarks. With updating\nusing ViSFT on 8 V100 GPUs in less than 2 days, a vision transformer with over\n4.4B parameters shows improvements across various out-of-domain benchmarks\nincluding vision and vision-linguistic scenarios.\n","authors":["Xiaohu Jiang","Yixiao Ge","Yuying Ge","Dachuan Shi","Chun Yuan","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2401.10222v2.pdf","comment":"23 pages, 3 figures, Project page:\n https://github.com/TencentARC/ViSFT/tree/main"},{"id":"http://arxiv.org/abs/2404.07988v1","updated":"2024-04-11T17:59:40Z","published":"2024-04-11T17:59:40Z","title":"QuasiSim: Parameterized Quasi-Physical Simulators for Dexterous\n Manipulations Transfer","summary":" We explore the dexterous manipulation transfer problem by designing\nsimulators. The task wishes to transfer human manipulations to dexterous robot\nhand simulations and is inherently difficult due to its intricate,\nhighly-constrained, and discontinuous dynamics and the need to control a\ndexterous hand with a DoF to accurately replicate human manipulations. Previous\napproaches that optimize in high-fidelity black-box simulators or a modified\none with relaxed constraints only demonstrate limited capabilities or are\nrestricted by insufficient simulation fidelity. We introduce parameterized\nquasi-physical simulators and a physics curriculum to overcome these\nlimitations. The key ideas are 1) balancing between fidelity and optimizability\nof the simulation via a curriculum of parameterized simulators, and 2) solving\nthe problem in each of the simulators from the curriculum, with properties\nranging from high task optimizability to high fidelity. We successfully enable\na dexterous hand to track complex and diverse manipulations in high-fidelity\nsimulated environments, boosting the success rate by 11\\%+ from the\nbest-performed baseline. 
The project website is available at\nhttps://meowuu7.github.io/QuasiSim/.\n","authors":["Xueyi Liu","Kangbo Lyu","Jieqiong Zhang","Tao Du","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2404.07988v1.pdf","comment":"Project website: https://meowuu7.github.io/QuasiSim/ Code:\n https://github.com/Meowuu7/QuasiSim Hugging Face Demo:\n https://huggingface.co/spaces/xymeow7/quasi-physical-sims"},{"id":"http://arxiv.org/abs/2404.07987v1","updated":"2024-04-11T17:59:09Z","published":"2024-04-11T17:59:09Z","title":"ControlNet++: Improving Conditional Controls with Efficient Consistency\n Feedback","summary":" To enhance the controllability of text-to-image diffusion models, existing\nefforts like ControlNet incorporated image-based conditional controls. In this\npaper, we reveal that existing methods still face significant challenges in\ngenerating images that align with the image conditional controls. To this end,\nwe propose ControlNet++, a novel approach that improves controllable generation\nby explicitly optimizing pixel-level cycle consistency between generated images\nand conditional controls. Specifically, for an input conditional control, we\nuse a pre-trained discriminative reward model to extract the corresponding\ncondition of the generated images, and then optimize the consistency loss\nbetween the input conditional control and extracted condition. A\nstraightforward implementation would be generating images from random noises\nand then calculating the consistency loss, but such an approach requires\nstoring gradients for multiple sampling timesteps, leading to considerable time\nand memory costs. To address this, we introduce an efficient reward strategy\nthat deliberately disturbs the input images by adding noise, and then uses the\nsingle-step denoised images for reward fine-tuning. This avoids the extensive\ncosts associated with image sampling, allowing for more efficient reward\nfine-tuning. Extensive experiments show that ControlNet++ significantly\nimproves controllability under various conditional controls. For example, it\nachieves improvements over ControlNet by 7.9% mIoU, 13.4% SSIM, and 7.6% RMSE,\nrespectively, for segmentation mask, line-art edge, and depth conditions.\n","authors":["Ming Li","Taojiannan Yang","Huafeng Kuang","Jie Wu","Zhaoning Wang","Xuefeng Xiao","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.07987v1.pdf","comment":"Project Page: https://liming-ai.github.io/ControlNet_Plus_Plus"},{"id":"http://arxiv.org/abs/2404.07985v1","updated":"2024-04-11T17:58:44Z","published":"2024-04-11T17:58:44Z","title":"WaveMo: Learning Wavefront Modulations to See Through Scattering","summary":" Imaging through scattering media is a fundamental and pervasive challenge in\nfields ranging from medical diagnostics to astronomy. A promising strategy to\novercome this challenge is wavefront modulation, which induces measurement\ndiversity during image acquisition. Despite its importance, designing optimal\nwavefront modulations to image through scattering remains under-explored. This\npaper introduces a novel learning-based framework to address the gap. Our\napproach jointly optimizes wavefront modulations and a computationally\nlightweight feedforward \"proxy\" reconstruction network. This network is trained\nto recover scenes obscured by scattering, using measurements that are modified\nby these modulations. The learned modulations produced by our framework\ngeneralize effectively to unseen scattering scenarios and exhibit remarkable\nversatility. 
During deployment, the learned modulations can be decoupled from\nthe proxy network to augment other more computationally expensive restoration\nalgorithms. Through extensive experiments, we demonstrate our approach\nsignificantly advances the state of the art in imaging through scattering\nmedia. Our project webpage is at https://wavemo-2024.github.io/.\n","authors":["Mingyang Xie","Haiyun Guo","Brandon Y. Feng","Lingbo Jin","Ashok Veeraraghavan","Christopher A. Metzler"],"pdf_url":"https://arxiv.org/pdf/2404.07985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07984v1","updated":"2024-04-11T17:58:11Z","published":"2024-04-11T17:58:11Z","title":"View Selection for 3D Captioning via Diffusion Ranking","summary":" Scalable annotation approaches are crucial for constructing extensive 3D-text\ndatasets, facilitating a broader range of applications. However, existing\nmethods sometimes lead to the generation of hallucinated captions, compromising\ncaption quality. This paper explores the issue of hallucination in 3D object\ncaptioning, with a focus on Cap3D method, which renders 3D objects into 2D\nviews for captioning using pre-trained models. We pinpoint a major challenge:\ncertain rendered views of 3D objects are atypical, deviating from the training\ndata of standard image captioning models and causing hallucinations. To tackle\nthis, we present DiffuRank, a method that leverages a pre-trained text-to-3D\nmodel to assess the alignment between 3D objects and their 2D rendered views,\nwhere the view with high alignment closely represent the object's\ncharacteristics. By ranking all rendered views and feeding the top-ranked ones\ninto GPT4-Vision, we enhance the accuracy and detail of captions, enabling the\ncorrection of 200k captions in the Cap3D dataset and extending it to 1 million\ncaptions across Objaverse and Objaverse-XL datasets. Additionally, we showcase\nthe adaptability of DiffuRank by applying it to pre-trained text-to-image\nmodels for a Visual Question Answering task, where it outperforms the CLIP\nmodel.\n","authors":["Tiange Luo","Justin Johnson","Honglak Lee"],"pdf_url":"https://arxiv.org/pdf/2404.07984v1.pdf","comment":"Dataset link: https://huggingface.co/datasets/tiange/Cap3D"},{"id":"http://arxiv.org/abs/2404.07983v1","updated":"2024-04-11T17:58:06Z","published":"2024-04-11T17:58:06Z","title":"Two Effects, One Trigger: On the Modality Gap, Object Bias, and\n Information Imbalance in Contrastive Vision-Language Representation Learning","summary":" Contrastive vision-language models like CLIP have gained popularity for their\nversatile applicable learned representations in various downstream tasks.\nDespite their successes in some tasks, like zero-shot image recognition, they\nalso perform surprisingly poor on other tasks, like attribute detection.\nPrevious work has attributed these challenges to the modality gap, a separation\nof image and text in the shared representation space, and a bias towards\nobjects over other factors, such as attributes. In this work we investigate\nboth phenomena. We find that only a few embedding dimensions drive the modality\ngap. Further, we propose a measure for object bias and find that object bias\ndoes not lead to worse performance on other concepts, such as attributes. But\nwhat leads to the emergence of the modality gap and object bias? To answer this\nquestion we carefully designed an experimental setting which allows us to\ncontrol the amount of shared information between the modalities. 
This revealed\nthat the driving factor behind both, the modality gap and the object bias, is\nthe information imbalance between images and captions.\n","authors":["Simon Schrodi","David T. Hoffmann","Max Argus","Volker Fischer","Thomas Brox"],"pdf_url":"https://arxiv.org/pdf/2404.07983v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07977v1","updated":"2024-04-11T17:57:19Z","published":"2024-04-11T17:57:19Z","title":"Gaga: Group Any Gaussians via 3D-aware Memory Bank","summary":" We introduce Gaga, a framework that reconstructs and segments open-world 3D\nscenes by leveraging inconsistent 2D masks predicted by zero-shot segmentation\nmodels. Contrasted to prior 3D scene segmentation approaches that heavily rely\non video object tracking, Gaga utilizes spatial information and effectively\nassociates object masks across diverse camera poses. By eliminating the\nassumption of continuous view changes in training images, Gaga demonstrates\nrobustness to variations in camera poses, particularly beneficial for sparsely\nsampled images, ensuring precise mask label consistency. Furthermore, Gaga\naccommodates 2D segmentation masks from diverse sources and demonstrates robust\nperformance with different open-world zero-shot segmentation models, enhancing\nits versatility. Extensive qualitative and quantitative evaluations demonstrate\nthat Gaga performs favorably against state-of-the-art methods, emphasizing its\npotential for real-world applications such as scene understanding and\nmanipulation.\n","authors":["Weijie Lyu","Xueting Li","Abhijit Kundu","Yi-Hsuan Tsai","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.07977v1.pdf","comment":"Project Page: https://www.gaga.gallery"},{"id":"http://arxiv.org/abs/2404.07976v1","updated":"2024-04-11T17:56:40Z","published":"2024-04-11T17:56:40Z","title":"Self-supervised Dataset Distillation: A Good Compression Is All You Need","summary":" Dataset distillation aims to compress information from a large-scale original\ndataset to a new compact dataset while striving to preserve the utmost degree\nof the original data informational essence. Previous studies have predominantly\nconcentrated on aligning the intermediate statistics between the original and\ndistilled data, such as weight trajectory, features, gradient, BatchNorm, etc.\nIn this work, we consider addressing this task through the new lens of model\ninformativeness in the compression stage on the original dataset pretraining.\nWe observe that with the prior state-of-the-art SRe$^2$L, as model sizes\nincrease, it becomes increasingly challenging for supervised pretrained models\nto recover learned information during data synthesis, as the channel-wise mean\nand variance inside the model are flatting and less informative. We further\nnotice that larger variances in BN statistics from self-supervised models\nenable larger loss signals to update the recovered data by gradients, enjoying\nmore informativeness during synthesis. Building on this observation, we\nintroduce SC-DD, a simple yet effective Self-supervised Compression framework\nfor Dataset Distillation that facilitates diverse information compression and\nrecovery compared to traditional supervised learning schemes, further reaps the\npotential of large pretrained models with enhanced capabilities. Extensive\nexperiments are conducted on CIFAR-100, Tiny-ImageNet and ImageNet-1K datasets\nto demonstrate the superiority of our proposed approach. 
The proposed SC-DD\noutperforms all previous state-of-the-art supervised dataset distillation\nmethods when employing larger models, such as SRe$^2$L, MTT, TESLA, DC, CAFE,\netc., by large margins under the same recovery and post-training budgets. Code\nis available at https://github.com/VILA-Lab/SRe2L/tree/main/SCDD/.\n","authors":["Muxin Zhou","Zeyuan Yin","Shitong Shao","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2404.07976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07973v1","updated":"2024-04-11T17:56:05Z","published":"2024-04-11T17:56:05Z","title":"Ferret-v2: An Improved Baseline for Referring and Grounding with Large\n Language Models","summary":" While Ferret seamlessly integrates regional understanding into the Large\nLanguage Model (LLM) to facilitate its referring and grounding capability, it\nposes certain limitations: constrained by the pre-trained fixed visual encoder\nand failed to perform well on broader tasks. In this work, we unveil Ferret-v2,\na significant upgrade to Ferret, with three key designs. (1) Any resolution\ngrounding and referring: A flexible approach that effortlessly handles higher\nimage resolution, improving the model's ability to process and understand\nimages in greater detail. (2) Multi-granularity visual encoding: By integrating\nthe additional DINOv2 encoder, the model learns better and diverse underlying\ncontexts for global and fine-grained visual information. (3) A three-stage\ntraining paradigm: Besides image-caption alignment, an additional stage is\nproposed for high-resolution dense alignment before the final instruction\ntuning. Experiments show that Ferret-v2 provides substantial improvements over\nFerret and other state-of-the-art methods, thanks to its high-resolution\nscaling and fine-grained visual processing.\n","authors":["Haotian Zhang","Haoxuan You","Philipp Dufter","Bowen Zhang","Chen Chen","Hong-You Chen","Tsu-Jui Fu","William Yang Wang","Shih-Fu Chang","Zhe Gan","Yinfei Yang"],"pdf_url":"https://arxiv.org/pdf/2404.07973v1.pdf","comment":"Preprint. 14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.07949v1","updated":"2024-04-11T17:46:14Z","published":"2024-04-11T17:46:14Z","title":"Taming Stable Diffusion for Text to 360° Panorama Image Generation","summary":" Generative models, e.g., Stable Diffusion, have enabled the creation of\nphotorealistic images from text prompts. Yet, the generation of 360-degree\npanorama images from text remains a challenge, particularly due to the dearth\nof paired text-panorama data and the domain gap between panorama and\nperspective images. In this paper, we introduce a novel dual-branch diffusion\nmodel named PanFusion to generate a 360-degree image from a text prompt. We\nleverage the stable diffusion model as one branch to provide prior knowledge in\nnatural image generation and register it to another panorama branch for\nholistic image generation. We propose a unique cross-attention mechanism with\nprojection awareness to minimize distortion during the collaborative denoising\nprocess. Our experiments validate that PanFusion surpasses existing methods\nand, thanks to its dual-branch structure, can integrate additional constraints\nlike room layout for customized panorama outputs. Code is available at\nhttps://chengzhag.github.io/publication/panfusion.\n","authors":["Cheng Zhang","Qianyi Wu","Camilo Cruz Gambardella","Xiaoshui Huang","Dinh Phung","Wanli Ouyang","Jianfei Cai"],"pdf_url":"https://arxiv.org/pdf/2404.07949v1.pdf","comment":"CVPR 2024. 
Project Page:\n https://chengzhag.github.io/publication/panfusion Code:\n https://github.com/chengzhag/PanFusion"},{"id":"http://arxiv.org/abs/2404.07933v1","updated":"2024-04-11T17:30:24Z","published":"2024-04-11T17:30:24Z","title":"Boosting Self-Supervision for Single-View Scene Completion via Knowledge\n Distillation","summary":" Inferring scene geometry from images via Structure from Motion is a\nlong-standing and fundamental problem in computer vision. While classical\napproaches and, more recently, depth map predictions only focus on the visible\nparts of a scene, the task of scene completion aims to reason about geometry\neven in occluded regions. With the popularity of neural radiance fields\n(NeRFs), implicit representations also became popular for scene completion by\npredicting so-called density fields. Unlike explicit approaches. e.g.\nvoxel-based methods, density fields also allow for accurate depth prediction\nand novel-view synthesis via image-based rendering. In this work, we propose to\nfuse the scene reconstruction from multiple images and distill this knowledge\ninto a more accurate single-view scene reconstruction. To this end, we propose\nMulti-View Behind the Scenes (MVBTS) to fuse density fields from multiple posed\nimages, trained fully self-supervised only from image data. Using knowledge\ndistillation, we use MVBTS to train a single-view scene completion network via\ndirect supervision called KDBTS. It achieves state-of-the-art performance on\noccupancy prediction, especially in occluded regions.\n","authors":["Keonhee Han","Dominik Muhle","Felix Wimbauer","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2404.07933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07932v1","updated":"2024-04-11T17:29:56Z","published":"2024-04-11T17:29:56Z","title":"FusionMamba: Efficient Image Fusion with State Space Model","summary":" Image fusion aims to generate a high-resolution multi/hyper-spectral image by\ncombining a high-resolution image with limited spectral information and a\nlow-resolution image with abundant spectral data. Current deep learning\n(DL)-based methods for image fusion primarily rely on CNNs or Transformers to\nextract features and merge different types of data. While CNNs are efficient,\ntheir receptive fields are limited, restricting their capacity to capture\nglobal context. Conversely, Transformers excel at learning global information\nbut are hindered by their quadratic complexity. Fortunately, recent\nadvancements in the State Space Model (SSM), particularly Mamba, offer a\npromising solution to this issue by enabling global awareness with linear\ncomplexity. However, there have been few attempts to explore the potential of\nSSM in information fusion, which is a crucial ability in domains like image\nfusion. Therefore, we propose FusionMamba, an innovative method for efficient\nimage fusion. Our contributions mainly focus on two aspects. Firstly,\nrecognizing that images from different sources possess distinct properties, we\nincorporate Mamba blocks into two U-shaped networks, presenting a novel\narchitecture that extracts spatial and spectral features in an efficient,\nindependent, and hierarchical manner. Secondly, to effectively combine spatial\nand spectral information, we extend the Mamba block to accommodate dual inputs.\nThis expansion leads to the creation of a new module called the FusionMamba\nblock, which outperforms existing fusion techniques such as concatenation and\ncross-attention. 
To validate FusionMamba's effectiveness, we conduct a series\nof experiments on five datasets related to three image fusion tasks. The\nquantitative and qualitative evaluation results demonstrate that our method\nachieves state-of-the-art (SOTA) performance, underscoring the superiority of\nFusionMamba.\n","authors":["Siran Peng","Xiangyu Zhu","Haoyu Deng","Zhen Lei","Liang-Jian Deng"],"pdf_url":"https://arxiv.org/pdf/2404.07932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07930v1","updated":"2024-04-11T17:27:39Z","published":"2024-04-11T17:27:39Z","title":"Parameter Hierarchical Optimization for Visible-Infrared Person\n Re-Identification","summary":" Visible-infrared person re-identification (VI-reID) aims at matching\ncross-modality pedestrian images captured by disjoint visible or infrared\ncameras. Existing methods alleviate the cross-modality discrepancies via\ndesigning different kinds of network architectures. Different from available\nmethods, in this paper, we propose a novel parameter optimizing paradigm,\nparameter hierarchical optimization (PHO) method, for the task of VI-ReID. It\nallows part of the parameters to be directly optimized without any training, which\nnarrows the search space of parameters and makes the whole network easier\nto train. Specifically, we first divide the parameters into different\ntypes, and then introduce a self-adaptive alignment strategy (SAS) to\nautomatically align the visible and infrared images through transformation.\nConsidering that features in different dimensions have varying importance, we\ndevelop an auto-weighted alignment learning (AAL) module that can automatically\nweight features according to their importance. Importantly, in the alignment\nprocess of SAS and AAL, all the parameters are immediately optimized with\noptimization principles rather than training the whole network, which yields a\nbetter parameter training manner. Furthermore, we establish the cross-modality\nconsistent learning (CCL) loss to extract discriminative person representations\nwith translation consistency. We provide both theoretical justification and\nempirical evidence that our proposed PHO method outperforms existing VI-reID\napproaches.\n","authors":["Zeng YU","Yunxiao Shi"],"pdf_url":"https://arxiv.org/pdf/2404.07930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07922v1","updated":"2024-04-11T17:09:28Z","published":"2024-04-11T17:09:28Z","title":"LaVy: Vietnamese Multimodal Large Language Model","summary":" Large Language Models (LLMs) and Multimodal Large language models (MLLMs)\nhave taken the world by storm with impressive abilities in complex reasoning\nand linguistic comprehension. While there is a plethora of works related to\nVietnamese Large Language Models, the lack of high-quality resources in\nmultimodality limits the progress of Vietnamese MLLMs. In this paper, we\npioneer in addressing this by introducing LaVy, a state-of-the-art Vietnamese\nMLLM, and we also introduce the LaVy-Bench benchmark, designed for evaluating\nMLLMs' understanding of Vietnamese visual language tasks. 
All code and model\nweights are public at https://github.com/baochi0212/LaVy\n","authors":["Chi Tran","Huong Le Thanh"],"pdf_url":"https://arxiv.org/pdf/2404.07922v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2401.04716v3","updated":"2024-04-11T16:46:52Z","published":"2024-01-09T18:40:52Z","title":"Low-Resource Vision Challenges for Foundation Models","summary":" Low-resource settings are well-established in natural language processing,\nwhere many languages lack sufficient data for deep learning at scale. However,\nlow-resource problems are under-explored in computer vision. In this paper, we\naddress this gap and explore the challenges of low-resource image tasks with\nvision foundation models. We first collect a benchmark of genuinely\nlow-resource image data, covering historic maps, circuit diagrams, and\nmechanical drawings. These low-resource settings all share three challenges:\ndata scarcity, fine-grained differences, and the distribution shift from\nnatural images to the specialized domain of interest. While existing foundation\nmodels have shown impressive generalizability, we find they cannot transfer\nwell to our low-resource tasks. To begin to tackle the challenges of\nlow-resource vision, we introduce one simple baseline per challenge.\nSpecifically, we i) enlarge the data space by generative models, ii) adopt the\nbest sub-kernels to encode local regions for fine-grained difference discovery\nand iii) learn attention for specialized domains. Experiments on our three\nlow-resource tasks demonstrate our proposals already provide a better baseline\nthan transfer learning, data augmentation, and fine-grained methods. This\nhighlights the unique characteristics and challenges of low-resource vision for\nfoundation models that warrant further investigation. Project page:\nhttps://xiaobai1217.github.io/Low-Resource-Vision/.\n","authors":["Yunhua Zhang","Hazel Doughty","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2401.04716v3.pdf","comment":"Accepted at CVPR2024"},{"id":"http://arxiv.org/abs/2401.08739v2","updated":"2024-04-11T16:35:22Z","published":"2024-01-16T18:55:22Z","title":"EgoGen: An Egocentric Synthetic Data Generator","summary":" Understanding the world in first-person view is fundamental in Augmented\nReality (AR). This immersive perspective brings dramatic visual changes and\nunique challenges compared to third-person views. Synthetic data has empowered\nthird-person-view vision models, but its application to embodied egocentric\nperception tasks remains largely unexplored. A critical challenge lies in\nsimulating natural human movements and behaviors that effectively steer the\nembodied cameras to capture a faithful egocentric representation of the 3D\nworld. To address this challenge, we introduce EgoGen, a new synthetic data\ngenerator that can produce accurate and rich ground-truth training data for\negocentric perception tasks. At the heart of EgoGen is a novel human motion\nsynthesis model that directly leverages egocentric visual inputs of a virtual\nhuman to sense the 3D environment. Combined with collision-avoiding motion\nprimitives and a two-stage reinforcement learning approach, our motion\nsynthesis model offers a closed-loop solution where the embodied perception and\nmovement of the virtual human are seamlessly coupled. Compared to previous\nworks, our model eliminates the need for a pre-defined global path, and is\ndirectly applicable to dynamic environments. 
Combined with our easy-to-use and\nscalable data generation pipeline, we demonstrate EgoGen's efficacy in three\ntasks: mapping and localization for head-mounted cameras, egocentric camera\ntracking, and human mesh recovery from egocentric views. EgoGen will be fully\nopen-sourced, offering a practical solution for creating realistic egocentric\ntraining data and aiming to serve as a useful tool for egocentric computer\nvision research. Refer to our project page: https://ego-gen.github.io/.\n","authors":["Gen Li","Kaifeng Zhao","Siwei Zhang","Xiaozhong Lyu","Mihai Dusmanu","Yan Zhang","Marc Pollefeys","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2401.08739v2.pdf","comment":"Accepted by CVPR 2024 (Oral). 23 pages, 17 figures. Project page:\n https://ego-gen.github.io/"},{"id":"http://arxiv.org/abs/2404.07887v1","updated":"2024-04-11T16:17:36Z","published":"2024-04-11T16:17:36Z","title":"Context-aware Video Anomaly Detection in Long-Term Datasets","summary":" Video anomaly detection research is generally evaluated on short, isolated\nbenchmark videos only a few minutes long. However, in real-world environments,\nsecurity cameras observe the same scene for months or years at a time, and the\nnotion of anomalous behavior critically depends on context, such as the time of\nday, day of week, or schedule of events. Here, we propose a context-aware video\nanomaly detection algorithm, Trinity, specifically targeted to these scenarios.\nTrinity is especially well-suited to crowded scenes in which individuals cannot\nbe easily tracked, and anomalies are due to speed, direction, or absence of\ngroup motion. Trinity is a contrastive learning framework that aims to learn\nalignments between context, appearance, and motion, and uses alignment quality\nto classify videos as normal or anomalous. We evaluate our algorithm on both\nconventional benchmarks and a public webcam-based dataset we collected that\nspans more than three months of activity.\n","authors":["Zhengye Yang","Richard Radke"],"pdf_url":"https://arxiv.org/pdf/2404.07887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06564v2","updated":"2024-04-11T16:06:39Z","published":"2024-04-09T18:28:55Z","title":"MambaAD: Exploring State Space Models for Multi-class Unsupervised\n Anomaly Detection","summary":" Recent advancements in anomaly detection have seen the efficacy of CNN- and\ntransformer-based approaches. However, CNNs struggle with long-range\ndependencies, while transformers are burdened by quadratic computational\ncomplexity. Mamba-based models, with their superior long-range modeling and\nlinear efficiency, have garnered substantial attention. This study pioneers the\napplication of Mamba to multi-class unsupervised anomaly detection, presenting\nMambaAD, which consists of a pre-trained encoder and a Mamba decoder featuring\n(Locality-Enhanced State Space) LSS modules at multi-scales. The proposed LSS\nmodule, integrating parallel cascaded (Hybrid State Space) HSS blocks and\nmulti-kernel convolutions operations, effectively captures both long-range and\nlocal information. The HSS block, utilizing (Hybrid Scanning) HS encoders,\nencodes feature maps into five scanning methods and eight directions, thereby\nstrengthening global connections through the (State Space Model) SSM. The use\nof Hilbert scanning and eight directions significantly improves feature\nsequence modeling. 
Comprehensive experiments on six diverse anomaly detection\ndatasets and seven metrics demonstrate state-of-the-art performance,\nsubstantiating the method's effectiveness.\n","authors":["Haoyang He","Yuhu Bai","Jiangning Zhang","Qingdong He","Hongxu Chen","Zhenye Gan","Chengjie Wang","Xiangtai Li","Guanzhong Tian","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2404.06564v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07867v1","updated":"2024-04-11T16:01:00Z","published":"2024-04-11T16:01:00Z","title":"The Power of Properties: Uncovering the Influential Factors in Emotion\n Classification","summary":" Facial expression-based human emotion recognition is a critical research area\nin psychology and medicine. State-of-the-art classification performance is only\nreached by end-to-end trained neural networks. Nevertheless, such black-box\nmodels lack transparency in their decision-making processes, prompting efforts\nto ascertain the rules that underlie classifiers' decisions. Analyzing single\ninputs alone fails to expose systematic learned biases. These biases can be\ncharacterized as facial properties summarizing abstract information like age or\nmedical conditions. Therefore, understanding a model's prediction behavior\nrequires an analysis rooted in causality along such selected properties. We\ndemonstrate that up to 91.25% of classifier output behavior changes are\nstatistically significant concerning basic properties. Among those are age,\ngender, and facial symmetry. Furthermore, the medical usage of surface\nelectromyography significantly influences emotion prediction. We introduce a\nworkflow to evaluate explicit properties and their impact. These insights might\nhelp medical professionals select and apply classifiers regarding their\nspecialized data and properties.\n","authors":["Tim Büchner","Niklas Penzel","Orlando Guntinas-Lichius","Joachim Denzler"],"pdf_url":"https://arxiv.org/pdf/2404.07867v1.pdf","comment":"8 pages, 3 tables, 1 figure, accepted at ICPRAI 2024"},{"id":"http://arxiv.org/abs/2404.06177v2","updated":"2024-04-11T15:57:52Z","published":"2024-04-09T09:58:10Z","title":"Uncertainty-aware Evidential Fusion-based Learning for Semi-supervised\n Medical Image Segmentation","summary":" Although the existing uncertainty-based semi-supervised medical segmentation\nmethods have achieved excellent performance, they usually only consider a\nsingle uncertainty evaluation, which often fails to solve the problem related\nto credibility completely. Therefore, based on the framework of evidential deep\nlearning, this paper integrates the evidential predictive results in the\ncross-region of mixed and original samples to reallocate the confidence degree\nand uncertainty measure of each voxel, which is realized by emphasizing\nuncertain information of probability assignments fusion rule of traditional\nevidence theory. Furthermore, we design a voxel-level asymptotic learning\nstrategy by introducing information entropy to combine with the fused\nuncertainty measure to estimate voxel prediction more precisely. The model will\ngradually pay attention to the prediction results with high uncertainty in the\nlearning process, to learn the features that are difficult to master. 
The\nexperimental results on LA, Pancreas-CT, ACDC and TBAD datasets demonstrate the\nsuperior performance of our proposed method in comparison with the existing\nstate of the arts.\n","authors":["Yuanpeng He","Lijian Li"],"pdf_url":"https://arxiv.org/pdf/2404.06177v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07855v1","updated":"2024-04-11T15:51:52Z","published":"2024-04-11T15:51:52Z","title":"Resolve Domain Conflicts for Generalizable Remote Physiological\n Measurement","summary":" Remote photoplethysmography (rPPG) technology has become increasingly popular\ndue to its non-invasive monitoring of various physiological indicators, making\nit widely applicable in multimedia interaction, healthcare, and emotion\nanalysis. Existing rPPG methods utilize multiple datasets for training to\nenhance the generalizability of models. However, they often overlook the\nunderlying conflict issues across different datasets, such as (1) label\nconflict resulting from different phase delays between physiological signal\nlabels and face videos at the instance level, and (2) attribute conflict\nstemming from distribution shifts caused by head movements, illumination\nchanges, skin types, etc. To address this, we introduce the DOmain-HArmonious\nframework (DOHA). Specifically, we first propose a harmonious phase strategy to\neliminate uncertain phase delays and preserve the temporal variation of\nphysiological signals. Next, we design a harmonious hyperplane optimization\nthat reduces irrelevant attribute shifts and encourages the model's\noptimization towards a global solution that fits more valid scenarios. Our\nexperiments demonstrate that DOHA significantly improves the performance of\nexisting methods under multiple protocols. Our code is available at\nhttps://github.com/SWY666/rPPG-DOHA.\n","authors":["Weiyu Sun","Xinyu Zhang","Hao Lu","Ying Chen","Yun Ge","Xiaolin Huang","Jie Yuan","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.07855v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2404.07850v1","updated":"2024-04-11T15:46:42Z","published":"2024-04-11T15:46:42Z","title":"MindBridge: A Cross-Subject Brain Decoding Framework","summary":" Brain decoding, a pivotal field in neuroscience, aims to reconstruct stimuli\nfrom acquired brain signals, primarily utilizing functional magnetic resonance\nimaging (fMRI). Currently, brain decoding is confined to a\nper-subject-per-model paradigm, limiting its applicability to the same\nindividual for whom the decoding model is trained. This constraint stems from\nthree key challenges: 1) the inherent variability in input dimensions across\nsubjects due to differences in brain size; 2) the unique intrinsic neural\npatterns, influencing how different individuals perceive and process sensory\ninformation; 3) limited data availability for new subjects in real-world\nscenarios hampers the performance of decoding models. In this paper, we present\na novel approach, MindBridge, that achieves cross-subject brain decoding by\nemploying only one model. Our proposed framework establishes a generic paradigm\ncapable of addressing these challenges by introducing biological-inspired\naggregation function and novel cyclic fMRI reconstruction mechanism for\nsubject-invariant representation learning. Notably, by cycle reconstruction of\nfMRI, MindBridge can enable novel fMRI synthesis, which also can serve as\npseudo data augmentation. 
Within the framework, we also devise a novel\nreset-tuning method for adapting a pretrained model to a new subject.\nExperimental results demonstrate MindBridge's ability to reconstruct images for\nmultiple subjects, which is competitive with dedicated subject-specific models.\nFurthermore, with limited data for a new subject, we achieve a high level of\ndecoding accuracy, surpassing that of subject-specific models. This advancement\nin cross-subject brain decoding suggests promising directions for wider\napplications in neuroscience and indicates potential for more efficient\nutilization of limited fMRI data in real-world scenarios. Project page:\nhttps://littlepure2333.github.io/MindBridge\n","authors":["Shizun Wang","Songhua Liu","Zhenxiong Tan","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07850v1.pdf","comment":"CVPR 2024 highlight. Code is available at\n https://github.com/littlepure2333/MindBridge"},{"id":"http://arxiv.org/abs/2404.07847v1","updated":"2024-04-11T15:42:53Z","published":"2024-04-11T15:42:53Z","title":"Fuss-Free Network: A Simplified and Efficient Neural Network for Crowd\n Counting","summary":" In the field of crowd-counting research, many recent deep learning-based\nmethods have demonstrated robust capabilities for accurately estimating crowd\nsizes. However, the enhancement in their performance often arises from an\nincrease in the complexity of the model structure. This paper introduces the\nFuss-Free Network (FFNet), a crowd counting deep learning model that is\ncharacterized by its simplicity and efficiency in terms of its structure. The\nmodel comprises only a backbone of a neural network and a multi-scale feature\nfusion structure. The multi-scale feature fusion structure is a simple\narchitecture consisting of three branches, each only equipped with a focus\ntransition module, and combines the features from these branches through the\nconcatenation operation. Our proposed crowd counting model is trained and\nevaluated on four widely used public datasets, and it achieves accuracy that is\ncomparable to that of existing complex models. The experimental results further\nindicate that excellent performance in crowd counting tasks can also be\nachieved by utilizing a simple, low-parameter, and computationally efficient\nneural network structure.\n","authors":["Lei Chen","Xingen Gao"],"pdf_url":"https://arxiv.org/pdf/2404.07847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07846v1","updated":"2024-04-11T15:39:10Z","published":"2024-04-11T15:39:10Z","title":"TBSN: Transformer-Based Blind-Spot Network for Self-Supervised Image\n Denoising","summary":" Blind-spot networks (BSN) have been prevalent network architectures in\nself-supervised image denoising (SSID). Existing BSNs are mostly conducted with\nconvolution layers. Although transformers offer potential solutions to the\nlimitations of convolutions and have demonstrated success in various image\nrestoration tasks, their attention mechanisms may violate the blind-spot\nrequirement, thus restricting their applicability in SSID. In this paper, we\npresent a transformer-based blind-spot network (TBSN) by analyzing and\nredesigning the transformer operators that meet the blind-spot requirement.\nSpecifically, TBSN follows the architectural principles of dilated BSNs, and\nincorporates spatial as well as channel self-attention layers to enhance the\nnetwork capability. 
For spatial self-attention, an elaborate mask is applied to\nthe attention matrix to restrict its receptive field, thus mimicking the\ndilated convolution. For channel self-attention, we observe that it may leak\nthe blind-spot information when the channel number is greater than spatial size\nin the deep layers of multi-scale architectures. To eliminate this effect, we\ndivide the channel into several groups and perform channel attention\nseparately. Furthermore, we introduce a knowledge distillation strategy that\ndistills TBSN into smaller denoisers to improve computational efficiency while\nmaintaining performance. Extensive experiments on real-world image denoising\ndatasets show that TBSN largely extends the receptive field and exhibits\nfavorable performance against state-of-the-art SSID methods. The code and\npre-trained models will be publicly available at\nhttps://github.com/nagejacob/TBSN.\n","authors":["Junyi Li","Zhilu Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.07846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08890v3","updated":"2024-04-11T15:34:46Z","published":"2023-02-17T14:19:28Z","title":"Deep Learning for Event-based Vision: A Comprehensive Survey and\n Benchmarks","summary":" Event cameras are bio-inspired sensors that capture the per-pixel intensity\nchanges asynchronously and produce event streams encoding the time, pixel\nposition, and polarity (sign) of the intensity changes. Event cameras possess a\nmyriad of advantages over canonical frame-based cameras, such as high temporal\nresolution, high dynamic range, low latency, etc. Being capable of capturing\ninformation in challenging visual conditions, event cameras have the potential\nto overcome the limitations of frame-based cameras in the computer vision and\nrobotics community. In very recent years, deep learning (DL) has been brought\nto this emerging field and inspired active research endeavors in mining its\npotential. However, there is still a lack of taxonomies in DL techniques for\nevent-based vision. We first scrutinize the typical event representations with\nquality enhancement methods as they play a pivotal role as inputs to the DL\nmodels. We then provide a comprehensive survey of existing DL-based methods by\nstructurally grouping them into two major categories: 1) image/video\nreconstruction and restoration; 2) event-based scene understanding and 3D\nvision. We conduct benchmark experiments for the existing methods in some\nrepresentative research directions, i.e., image reconstruction, deblurring, and\nobject recognition, to identify some critical insights and problems. Finally,\nwe have discussions regarding the challenges and provide new perspectives for\ninspiring more research studies.\n","authors":["Xu Zheng","Yexin Liu","Yunfan Lu","Tongyan Hua","Tianbo Pan","Weiming Zhang","Dacheng Tao","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2302.08890v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06842v2","updated":"2024-04-11T15:28:36Z","published":"2024-04-10T09:14:28Z","title":"MoCha-Stereo: Motif Channel Attention Network for Stereo Matching","summary":" Learning-based stereo matching techniques have made significant progress.\nHowever, existing methods inevitably lose geometrical structure information\nduring the feature channel generation process, resulting in edge detail\nmismatches. In this paper, the Motif Cha}nnel Attention Stereo Matching Network\n(MoCha-Stereo) is designed to address this problem. 
We provide the Motif\nChannel Correlation Volume (MCCV) to determine more accurate edge matching\ncosts. MCCV is achieved by projecting motif channels, which capture common\ngeometric structures in feature channels, onto feature maps and cost volumes.\nIn addition, since edge variations in potential feature channels of the\nreconstruction error map also affect detail matching, we propose the\nReconstruction Error Motif Penalty (REMP) module to further refine the\nfull-resolution disparity estimation. REMP integrates the frequency information\nof typical channel features from the reconstruction error. MoCha-Stereo ranks\n1st on the KITTI-2015 and KITTI-2012 Reflective leaderboards. Our structure\nalso shows excellent performance in Multi-View Stereo. Code is available at\nhttps://github.com/ZYangChen/MoCha-Stereo.\n","authors":["Ziyang Chen","Wei Long","He Yao","Yongjun Zhang","Bingshu Wang","Yongbin Qin","Jia Wu"],"pdf_url":"https://arxiv.org/pdf/2404.06842v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.07833v1","updated":"2024-04-11T15:18:34Z","published":"2024-04-11T15:18:34Z","title":"Streamlined Photoacoustic Image Processing with Foundation Models: A\n Training-Free Solution","summary":" Foundation models have rapidly evolved and have achieved significant\naccomplishments in computer vision tasks. Specifically, the prompt mechanism\nconveniently allows users to integrate image prior information into the model,\nmaking it possible to apply models without any training. Therefore, we propose\na method based on foundation models and zero training to solve the tasks of\nphotoacoustic (PA) image segmentation. We employed the segment anything model\n(SAM) by setting simple prompts and integrating the model's outputs with prior\nknowledge of the imaged objects to accomplish various tasks, including: (1)\nremoving the skin signal in three-dimensional PA image rendering; (2) dual\nspeed-of-sound reconstruction, and (3) segmentation of finger blood vessels.\nThrough these demonstrations, we have concluded that deep learning can be\ndirectly applied in PA imaging without the requirement for network design and\ntraining. This potentially allows for a hands-on, convenient approach to\nachieving efficient and accurate segmentation of PA images. This letter serves\nas a comprehensive tutorial, facilitating the mastery of the technique through\nthe provision of code and sample datasets.\n","authors":["Handi Deng","Yucheng Zhou","Jiaxuan Xiang","Liujie Gu","Yan Luo","Hai Feng","Mingyuan Liu","Cheng Ma"],"pdf_url":"https://arxiv.org/pdf/2404.07833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07824v1","updated":"2024-04-11T15:09:22Z","published":"2024-04-11T15:09:22Z","title":"Heron-Bench: A Benchmark for Evaluating Vision Language Models in\n Japanese","summary":" Vision Language Models (VLMs) have undergone a rapid evolution, giving rise\nto significant advancements in the realm of multimodal understanding tasks.\nHowever, the majority of these models are trained and evaluated on\nEnglish-centric datasets, leaving a gap in the development and evaluation of\nVLMs for other languages, such as Japanese. This gap can be attributed to the\nlack of methodologies for constructing VLMs and the absence of benchmarks to\naccurately measure their performance. To address this issue, we introduce a\nnovel benchmark, Japanese Heron-Bench, for evaluating Japanese capabilities of\nVLMs. 
The Japanese Heron-Bench consists of a variety of image-question answer\npairs tailored to the Japanese context. Additionally, we present a baseline\nJapanese VLM that has been trained with Japanese visual instruction tuning\ndatasets. Our Heron-Bench reveals the strengths and limitations of the proposed\nVLM across various ability dimensions. Furthermore, we clarify the capability\ngap between strong closed models like GPT-4V and the baseline model, providing\nvaluable insights for future research in this domain. We release the benchmark\ndataset and training code to facilitate further developments in Japanese VLM\nresearch.\n","authors":["Yuichi Inoue","Kento Sasaki","Yuma Ochi","Kazuki Fujii","Kotaro Tanahashi","Yu Yamaguchi"],"pdf_url":"https://arxiv.org/pdf/2404.07824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07821v1","updated":"2024-04-11T15:00:55Z","published":"2024-04-11T15:00:55Z","title":"Sparse Laneformer","summary":" Lane detection is a fundamental task in autonomous driving, and has achieved\ngreat progress as deep learning emerges. Previous anchor-based methods often\ndesign dense anchors, which highly depend on the training dataset and remain\nfixed during inference. We analyze that dense anchors are not necessary for\nlane detection, and propose a transformer-based lane detection framework based\non a sparse anchor mechanism. To this end, we generate sparse anchors with\nposition-aware lane queries and angle queries instead of traditional explicit\nanchors. We adopt Horizontal Perceptual Attention (HPA) to aggregate the lane\nfeatures along the horizontal direction, and adopt Lane-Angle Cross Attention\n(LACA) to perform interactions between lane queries and angle queries. We also\npropose Lane Perceptual Attention (LPA) based on deformable cross attention to\nfurther refine the lane predictions. Our method, named Sparse Laneformer, is\neasy-to-implement and end-to-end trainable. Extensive experiments demonstrate\nthat Sparse Laneformer performs favorably against the state-of-the-art methods,\ne.g., surpassing Laneformer by 3.0% F1 score and O2SFormer by 0.7% F1 score\nwith fewer MACs on CULane with the same ResNet-34 backbone.\n","authors":["Ji Liu","Zifeng Zhang","Mingjie Lu","Hongyang Wei","Dong Li","Yile Xie","Jinzhang Peng","Lu Tian","Ashish Sirasao","Emad Barsoum"],"pdf_url":"https://arxiv.org/pdf/2404.07821v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07807v1","updated":"2024-04-11T14:51:12Z","published":"2024-04-11T14:51:12Z","title":"Voice-Assisted Real-Time Traffic Sign Recognition System Using\n Convolutional Neural Network","summary":" Traffic signs are important in communicating information to drivers. Thus,\ncomprehension of traffic signs is essential for road safety and ignorance may\nresult in road accidents. Traffic sign detection has been a research spotlight\nover the past few decades. Real-time and accurate detections are the\npreliminaries of a robust traffic sign detection system, which is yet to be\nachieved. This study presents a voice-assisted real-time traffic sign\nrecognition system which is capable of assisting drivers. This system functions\nunder two subsystems. Initially, the detection and recognition of the traffic\nsigns are carried out using a trained Convolutional Neural Network (CNN). After\nrecognizing the specific traffic sign, it is narrated to the driver as a voice\nmessage using a text-to-speech engine. 
An efficient CNN model for a benchmark\ndataset is developed for real-time detection and recognition using Deep\nLearning techniques. The advantage of this system is that even if the driver\nmisses a traffic sign, or does not look at the traffic sign, or is unable to\ncomprehend the sign, the system detects it and narrates it to the driver. A\nsystem of this type is also important in the development of autonomous\nvehicles.\n","authors":["Mayura Manawadu","Udaya Wijenayake"],"pdf_url":"https://arxiv.org/pdf/2404.07807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07794v1","updated":"2024-04-11T14:35:59Z","published":"2024-04-11T14:35:59Z","title":"DGMamba: Domain Generalization via Generalized State Space Model","summary":" Domain generalization (DG) aims at solving distribution shift problems in\nvarious scenes. Existing approaches are based on Convolutional Neural Networks\n(CNNs) or Vision Transformers (ViTs), which suffer from limited receptive\nfields or quadratic complexity issues. Mamba, as an emerging state space\nmodel (SSM), possesses superior linear complexity and global receptive fields.\nDespite this, it can hardly be applied to DG to address distribution shifts,\ndue to the hidden state issues and inappropriate scan mechanisms. In this\npaper, we propose a novel framework for DG, named DGMamba, that excels in\nstrong generalizability toward unseen domains and meanwhile has the advantages\nof global receptive fields and efficient linear complexity. Our DGMamba\ncomprises two core components: Hidden State Suppressing (HSS) and\nSemantic-aware Patch refining (SPR). In particular, HSS is introduced to\nmitigate the influence of hidden states associated with domain-specific\nfeatures during output prediction. SPR strives to encourage the model to\nconcentrate more on objects rather than context, consisting of two designs:\nPrior-Free Scanning (PFS) and Domain Context Interchange (DCI). Concretely,\nPFS aims to shuffle the non-semantic patches within images, creating more\nflexible and effective sequences from images, and DCI is designed to regularize\nMamba with the combination of mismatched non-semantic and semantic information\nby fusing patches among domains. Extensive experiments on four commonly used DG\nbenchmarks demonstrate that the proposed DGMamba achieves remarkably superior\nresults to state-of-the-art models. The code will be made publicly available.\n","authors":["Shaocong Long","Qianyu Zhou","Xiangtai Li","Xuequan Lu","Chenhao Ying","Yuan Luo","Lizhuang Ma","Shuicheng Yan"],"pdf_url":"https://arxiv.org/pdf/2404.07794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07790v1","updated":"2024-04-11T14:31:11Z","published":"2024-04-11T14:31:11Z","title":"VIFNet: An End-to-end Visible-Infrared Fusion Network for Image Dehazing","summary":" Image dehazing poses significant challenges in environmental perception.\nRecent research mainly focuses on deep learning-based methods with a single\nmodality, while they may result in severe information loss, especially in\ndense-haze scenarios. The infrared image exhibits robustness to haze;\nhowever, existing methods have primarily treated the infrared modality as\nauxiliary information, failing to fully explore its rich information in\ndehazing. To address this challenge, the key insight of this study is to design\na visible-infrared fusion network for image dehazing. 
In particular, we propose\na multi-scale Deep Structure Feature Extraction (DSFE) module, which\nincorporates the Channel-Pixel Attention Block (CPAB) to restore more spatial\nand marginal information within the deep structural features. Additionally, we\nintroduce an inconsistency weighted fusion strategy to merge the two modalities\nby leveraging the more reliable information. To validate this, we construct a\nvisible-infrared multimodal dataset called AirSim-VID based on the AirSim\nsimulation platform. Extensive experiments performed on challenging real and\nsimulated image datasets demonstrate that VIFNet can outperform many\nstate-of-the-art competing methods. The code and dataset are available at\nhttps://github.com/mengyu212/VIFNet_dehazing.\n","authors":["Meng Yu","Te Cui","Haoyang Lu","Yufeng Yue"],"pdf_url":"https://arxiv.org/pdf/2404.07790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07788v1","updated":"2024-04-11T14:29:30Z","published":"2024-04-11T14:29:30Z","title":"AUG: A New Dataset and An Efficient Model for Aerial Image Urban Scene\n Graph Generation","summary":" Scene graph generation (SGG) aims to understand the visual objects and their\nsemantic relationships from one given image. Until now, lots of SGG datasets\nwith the eyelevel view are released but the SGG dataset with the overhead view\nis scarcely studied. By contrast to the object occlusion problem in the\neyelevel view, which impedes the SGG, the overhead view provides a new\nperspective that helps to promote the SGG by providing a clear perception of\nthe spatial relationships of objects in the ground scene. To fill in the gap of\nthe overhead view dataset, this paper constructs and releases an aerial image\nurban scene graph generation (AUG) dataset. Images from the AUG dataset are\ncaptured with the low-attitude overhead view. In the AUG dataset, 25,594\nobjects, 16,970 relationships, and 27,175 attributes are manually annotated. To\navoid the local context being overwhelmed in the complex aerial urban scene,\nthis paper proposes one new locality-preserving graph convolutional network\n(LPG). Different from the traditional graph convolutional network, which has\nthe natural advantage of capturing the global context for SGG, the\nconvolutional layer in the LPG integrates the non-destructive initial features\nof the objects with dynamically updated neighborhood information to preserve\nthe local context under the premise of mining the global context. 
To address\nthe problem that there exists an extra-large number of potential object\nrelationship pairs but only a small part of them is meaningful in AUG, we\npropose the adaptive bounding box scaling factor for potential relationship\ndetection (ABS-PRD) to intelligently prune the meaningless relationship pairs.\nExtensive experiments on the AUG dataset show that our LPG can significantly\noutperform the state-of-the-art methods and the effectiveness of the proposed\nlocality-preserving strategy.\n","authors":["Yansheng Li","Kun Li","Yongjun Zhang","Linlin Wang","Dingwen Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.07788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07785v1","updated":"2024-04-11T14:28:04Z","published":"2024-04-11T14:28:04Z","title":"PRAM: Place Recognition Anywhere Model for Efficient Visual Localization","summary":" Humans localize themselves efficiently in known environments by first\nrecognizing landmarks defined on certain objects and their spatial\nrelationships, and then verifying the location by aligning detailed structures\nof recognized objects with those in the memory. Inspired by this, we propose\nthe place recognition anywhere model (PRAM) to perform visual localization as\nefficiently as humans do. PRAM consists of two main components - recognition\nand registration. In detail, first of all, a self-supervised map-centric\nlandmark definition strategy is adopted, making places in either indoor or\noutdoor scenes act as unique landmarks. Then, sparse keypoints extracted from\nimages, are utilized as the input to a transformer-based deep neural network\nfor landmark recognition; these keypoints enable PRAM to recognize hundreds of\nlandmarks with high time and memory efficiency. Keypoints along with recognized\nlandmark labels are further used for registration between query images and the\n3D landmark map. Different from previous hierarchical methods, PRAM discards\nglobal and local descriptors, and reduces over 90% storage. Since PRAM utilizes\nrecognition and landmark-wise verification to replace global reference search\nand exhaustive matching respectively, it runs 2.4 times faster than prior\nstate-of-the-art approaches. Moreover, PRAM opens new directions for visual\nlocalization including multi-modality localization, map-centric feature\nlearning, and hierarchical scene coordinate regression.\n","authors":["Fei Xue","Ignas Budvytis","Roberto Cipolla"],"pdf_url":"https://arxiv.org/pdf/2404.07785v1.pdf","comment":"project page: https://feixue94.github.io/pram-project/"},{"id":"http://arxiv.org/abs/2404.04562v2","updated":"2024-04-11T14:28:00Z","published":"2024-04-06T09:03:18Z","title":"Diffusion Time-step Curriculum for One Image to 3D Generation","summary":" Score distillation sampling~(SDS) has been widely adopted to overcome the\nabsence of unseen views in reconstructing 3D objects from a \\textbf{single}\nimage. It leverages pre-trained 2D diffusion models as teacher to guide the\nreconstruction of student 3D models. Despite their remarkable success,\nSDS-based methods often encounter geometric artifacts and texture saturation.\nWe find out the crux is the overlooked indiscriminate treatment of diffusion\ntime-steps during optimization: it unreasonably treats the student-teacher\nknowledge distillation to be equal at all time-steps and thus entangles\ncoarse-grained and fine-grained modeling. 
Therefore, we propose the Diffusion\nTime-step Curriculum one-image-to-3D pipeline (DTC123), which involves both the\nteacher and student models collaborating with the time-step curriculum in a\ncoarse-to-fine manner. Extensive experiments on NeRF4, RealFusion15, GSO and\nLevel50 benchmark demonstrate that DTC123 can produce multi-view consistent,\nhigh-quality, and diverse 3D assets. Codes and more generation demos will be\nreleased in https://github.com/yxymessi/DTC123.\n","authors":["Xuanyu Yi","Zike Wu","Qingshan Xu","Pan Zhou","Joo-Hwee Lim","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04562v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2401.07782v2","updated":"2024-04-11T14:27:27Z","published":"2024-01-15T15:43:56Z","title":"Exploring Masked Autoencoders for Sensor-Agnostic Image Retrieval in\n Remote Sensing","summary":" Self-supervised learning through masked autoencoders (MAEs) has recently\nattracted great attention for remote sensing (RS) image representation\nlearning, and thus embodies a significant potential for content-based image\nretrieval (CBIR) from ever-growing RS image archives. However, the existing\nstudies on MAEs in RS assume that the considered RS images are acquired by a\nsingle image sensor, and thus are only suitable for uni-modal CBIR problems.\nThe effectiveness of MAEs for cross-sensor CBIR, which aims to search\nsemantically similar images across different image modalities, has not been\nexplored yet. In this paper, we take the first step to explore the\neffectiveness of MAEs for sensor-agnostic CBIR in RS. To this end, we present a\nsystematic overview on the possible adaptations of the vanilla MAE to exploit\nmasked image modeling on multi-sensor RS image archives (denoted as\ncross-sensor masked autoencoders [CSMAEs]). Based on different adjustments\napplied to the vanilla MAE, we introduce different CSMAE models. We also\nprovide an extensive experimental analysis of these CSMAE models. We finally\nderive a guideline to exploit masked image modeling for uni-modal and\ncross-modal CBIR problems in RS. The code of this work is publicly available at\nhttps://github.com/jakhac/CSMAE.\n","authors":["Jakob Hackstein","Gencer Sumbul","Kai Norman Clasen","Begüm Demir"],"pdf_url":"https://arxiv.org/pdf/2401.07782v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Our code is available at https://github.com/jakhac/CSMAE"},{"id":"http://arxiv.org/abs/2309.09590v2","updated":"2024-04-11T14:24:09Z","published":"2023-09-18T08:54:29Z","title":"An Autonomous Vision-Based Algorithm for Interplanetary Navigation","summary":" The surge of deep-space probes makes it unsustainable to navigate them with\nstandard radiometric tracking. Self-driving interplanetary satellites represent\na solution to this problem. In this work, a full vision-based navigation\nalgorithm is built by combining an orbit determination method with an image\nprocessing pipeline suitable for interplanetary transfers of autonomous\nplatforms. To increase the computational efficiency of the algorithm, a\nnon-dimensional extended Kalman filter is selected as state estimator, fed by\nthe positions of the planets extracted from deep-space images. An enhancement\nof the estimation accuracy is performed by applying an optimal strategy to\nselect the best pair of planets to track. 
Moreover, a novel analytical\nmeasurement model for deep-space navigation is developed providing a\nfirst-order approximation of the light-aberration and light-time effects.\nAlgorithm performance is tested on a high-fidelity, Earth--Mars interplanetary\ntransfer, showing the algorithm applicability for deep-space navigation.\n","authors":["Eleonora Andreis","Paolo Panicucci","Francesco Topputo"],"pdf_url":"https://arxiv.org/pdf/2309.09590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18551v2","updated":"2024-04-11T14:10:43Z","published":"2024-03-27T13:31:39Z","title":"Attention Calibration for Disentangled Text-to-Image Personalization","summary":" Recent thrilling progress in large-scale text-to-image (T2I) models has\nunlocked unprecedented synthesis quality of AI-generated content (AIGC)\nincluding image generation, 3D and video composition. Further, personalized\ntechniques enable appealing customized production of a novel concept given only\nseveral images as reference. However, an intriguing problem persists: Is it\npossible to capture multiple, novel concepts from one single reference image?\nIn this paper, we identify that existing approaches fail to preserve visual\nconsistency with the reference image and eliminate cross-influence from\nconcepts. To alleviate this, we propose an attention calibration mechanism to\nimprove the concept-level understanding of the T2I model. Specifically, we\nfirst introduce new learnable modifiers bound with classes to capture\nattributes of multiple concepts. Then, the classes are separated and\nstrengthened following the activation of the cross-attention operation,\nensuring comprehensive and self-contained concepts. Additionally, we suppress\nthe attention activation of different classes to mitigate mutual influence\namong concepts. Together, our proposed method, dubbed DisenDiff, can learn\ndisentangled multiple concepts from one single image and produce novel\ncustomized images with learned concepts. We demonstrate that our method\noutperforms the current state of the art in both qualitative and quantitative\nevaluations. More importantly, our proposed techniques are compatible with LoRA\nand inpainting pipelines, enabling more interactive experiences.\n","authors":["Yanbing Zhang","Mengping Yang","Qin Zhou","Zhe Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18551v2.pdf","comment":"CVPR 2024 (Oral)"},{"id":"http://arxiv.org/abs/2404.07773v1","updated":"2024-04-11T14:08:45Z","published":"2024-04-11T14:08:45Z","title":"ConsistencyDet: Robust Object Detector with Denoising Paradigm of\n Consistency Model","summary":" Object detection, a quintessential task in the realm of perceptual computing,\ncan be tackled using a generative methodology. In the present study, we\nintroduce a novel framework designed to articulate object detection as a\ndenoising diffusion process, which operates on perturbed bounding boxes of\nannotated entities. This framework, termed ConsistencyDet, leverages an\ninnovative denoising concept known as the Consistency Model. The hallmark of\nthis model is its self-consistency feature, which empowers the model to map\ndistorted information from any temporal stage back to its pristine state,\nthereby realizing a ``one-step denoising'' mechanism. Such an attribute\nmarkedly elevates the operational efficiency of the model, setting it apart\nfrom the conventional Diffusion Model. 
Throughout the training phase,\nConsistencyDet initiates the diffusion sequence with noise-infused boxes\nderived from the ground-truth annotations and conditions the model to perform\nthe denoising task. Subsequently, in the inference stage, the model employs a\ndenoising sampling strategy that commences with bounding boxes randomly sampled\nfrom a normal distribution. Through iterative refinement, the model transforms\nan assortment of arbitrarily generated boxes into the definitive detections.\nComprehensive evaluations employing standard benchmarks, such as MS-COCO and\nLVIS, corroborate that ConsistencyDet surpasses other leading-edge detectors in\nperformance metrics.\n","authors":["Lifan Jiang","Zhihui Wang","Changmiao Wang","Ming Li","Jiaxu Leng","Xindong Wu"],"pdf_url":"https://arxiv.org/pdf/2404.07773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07770v1","updated":"2024-04-11T14:07:16Z","published":"2024-04-11T14:07:16Z","title":"Joint Conditional Diffusion Model for Image Restoration with Mixed\n Degradations","summary":" Image restoration is rather challenging in adverse weather conditions,\nespecially when multiple degradations occur simultaneously. Blind image\ndecomposition was proposed to tackle this issue, however, its effectiveness\nheavily relies on the accurate estimation of each component. Although\ndiffusion-based models exhibit strong generative abilities in image restoration\ntasks, they may generate irrelevant contents when the degraded images are\nseverely corrupted. To address these issues, we leverage physical constraints\nto guide the whole restoration process, where a mixed degradation model based\non atmosphere scattering model is constructed. Then we formulate our Joint\nConditional Diffusion Model (JCDM) by incorporating the degraded image and\ndegradation mask to provide precise guidance. To achieve better color and\ndetail recovery results, we further integrate a refinement network to\nreconstruct the restored image, where Uncertainty Estimation Block (UEB) is\nemployed to enhance the features. Extensive experiments performed on both\nmulti-weather and weather-specific datasets demonstrate the superiority of our\nmethod over state-of-the-art competing methods.\n","authors":["Yufeng Yue","Meng Yu","Luojie Yang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2404.07770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07766v1","updated":"2024-04-11T14:05:37Z","published":"2024-04-11T14:05:37Z","title":"RMAFF-PSN: A Residual Multi-Scale Attention Feature Fusion Photometric\n Stereo Network","summary":" Predicting accurate normal maps of objects from two-dimensional images in\nregions of complex structure and spatial material variations is challenging\nusing photometric stereo methods due to the influence of surface reflection\nproperties caused by variations in object geometry and surface materials. To\naddress this issue, we propose a photometric stereo network called a RMAFF-PSN\nthat uses residual multiscale attentional feature fusion to handle the\n``difficult'' regions of the object. Unlike previous approaches that only use\nstacked convolutional layers to extract deep features from the input image, our\nmethod integrates feature information from different resolution stages and\nscales of the image. This approach preserves more physical information, such as\ntexture and geometry of the object in complex regions, through shallow-deep\nstage feature extraction, double branching enhancement, and attention\noptimization. 
To test the network structure under real-world conditions, we\npropose a new real dataset called Simple PS data, which contains multiple\nobjects with varying structures and materials. Experimental results on a\npublicly available benchmark dataset demonstrate that our method outperforms\nmost existing calibrated photometric stereo methods for the same number of\ninput images, especially in the case of highly non-convex object structures.\nOur method also obtains good results under sparse lighting conditions.\n","authors":["Kai Luo","Yakun Ju","Lin Qi","Kaixuan Wang","Junyu Dong"],"pdf_url":"https://arxiv.org/pdf/2404.07766v1.pdf","comment":"17 pages,12 figures"},{"id":"http://arxiv.org/abs/2404.07762v1","updated":"2024-04-11T14:03:16Z","published":"2024-04-11T14:03:16Z","title":"NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous\n Driving","summary":" We present a versatile NeRF-based simulator for testing autonomous driving\n(AD) software systems, designed with a focus on sensor-realistic closed-loop\nevaluation and the creation of safety-critical scenarios. The simulator learns\nfrom sequences of real-world driving sensor data and enables reconfigurations\nand renderings of new, unseen scenarios. In this work, we use our simulator to\ntest the responses of AD models to safety-critical scenarios inspired by the\nEuropean New Car Assessment Programme (Euro NCAP). Our evaluation reveals that,\nwhile state-of-the-art end-to-end planners excel in nominal driving scenarios\nin an open-loop setting, they exhibit critical flaws when navigating our\nsafety-critical scenarios in a closed-loop setting. This highlights the need\nfor advancements in the safety and real-world usability of end-to-end planners.\nBy publicly releasing our simulator and scenarios as an easy-to-run evaluation\nsuite, we invite the research community to explore, refine, and validate their\nAD models in controlled, yet highly configurable and challenging\nsensor-realistic environments. Code and instructions can be found at\nhttps://github.com/wljungbergh/NeuroNCAP\n","authors":["William Ljungbergh","Adam Tonderski","Joakim Johnander","Holger Caesar","Kalle Åström","Michael Felsberg","Christoffer Petersson"],"pdf_url":"https://arxiv.org/pdf/2404.07762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07754v1","updated":"2024-04-11T14:00:20Z","published":"2024-04-11T14:00:20Z","title":"Generating Synthetic Satellite Imagery With Deep-Learning Text-to-Image\n Models -- Technical Challenges and Implications for Monitoring and\n Verification","summary":" Novel deep-learning (DL) architectures have reached a level where they can\ngenerate digital media, including photorealistic images, that are difficult to\ndistinguish from real data. These technologies have already been used to\ngenerate training data for Machine Learning (ML) models, and large\ntext-to-image models like DALL-E 2, Imagen, and Stable Diffusion are achieving\nremarkable results in realistic high-resolution image generation. Given these\ndevelopments, issues of data authentication in monitoring and verification\ndeserve a careful and systematic analysis: How realistic are synthetic images?\nHow easily can they be generated? How useful are they for ML researchers, and\nwhat is their potential for Open Science? In this work, we use novel DL models\nto explore how synthetic satellite images can be created using conditioning\nmechanisms. 
We investigate the challenges of synthetic satellite image\ngeneration and evaluate the results based on authenticity and state-of-the-art\nmetrics. Furthermore, we investigate how synthetic data can alleviate the lack\nof data in the context of ML methods for remote-sensing. Finally we discuss\nimplications of synthetic satellite imagery in the context of monitoring and\nverification.\n","authors":["Tuong Vy Nguyen","Alexander Glaser","Felix Biessmann"],"pdf_url":"https://arxiv.org/pdf/2404.07754v1.pdf","comment":"https://resources.inmm.org/annual-meeting-proceedings/generating-synthetic-satellite-imagery-deep-learning-text-image-models"},{"id":"http://arxiv.org/abs/2404.07748v1","updated":"2024-04-11T13:46:05Z","published":"2024-04-11T13:46:05Z","title":"3D-CSAD: Untrained 3D Anomaly Detection for Complex Manufacturing\n Surfaces","summary":" The surface quality inspection of manufacturing parts based on 3D point cloud\ndata has attracted increasing attention in recent years. The reason is that the\n3D point cloud can capture the entire surface of manufacturing parts, unlike\nthe previous practices that focus on some key product characteristics. However,\nachieving accurate 3D anomaly detection is challenging, due to the complex\nsurfaces of manufacturing parts and the difficulty of collecting sufficient\nanomaly samples. To address these challenges, we propose a novel untrained\nanomaly detection method based on 3D point cloud data for complex manufacturing\nparts, which can achieve accurate anomaly detection in a single sample without\ntraining data. In the proposed framework, we transform an input sample into two\nsets of profiles along different directions. Based on one set of the profiles,\na novel segmentation module is devised to segment the complex surface into\nmultiple basic and simple components. In each component, another set of\nprofiles, which have the nature of similar shapes, can be modeled as a low-rank\nmatrix. Thus, accurate 3D anomaly detection can be achieved by using Robust\nPrincipal Component Analysis (RPCA) on these low-rank matrices. Extensive\nnumerical experiments on different types of parts show that our method achieves\npromising results compared with the benchmark methods.\n","authors":["Xuanming Cao","Chengyu Tao","Juan Du"],"pdf_url":"https://arxiv.org/pdf/2404.07748v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05539v2","updated":"2024-04-11T13:39:18Z","published":"2023-11-09T17:34:57Z","title":"A Deep Learning Method for Simultaneous Denoising and Missing Wedge\n Reconstruction in Cryogenic Electron Tomography","summary":" Cryogenic electron tomography is a technique for imaging biological samples\nin 3D. A microscope collects a series of 2D projections of the sample, and the\ngoal is to reconstruct the 3D density of the sample called the tomogram.\nReconstruction is difficult as the 2D projections are noisy and can not be\nrecorded from all directions, resulting in a missing wedge of information.\nTomograms conventionally reconstructed with filtered back-projection suffer\nfrom noise and strong artifacts due to the missing wedge. Here, we propose a\ndeep-learning approach for simultaneous denoising and missing wedge\nreconstruction called DeepDeWedge. The algorithm requires no ground truth data\nand is based on fitting a neural network to the 2D projections using a\nself-supervised loss. 
DeepDeWedge performs better than CryoCARE and IsoNet,\nwhich are state-of-the-art methods for denoising and missing wedge\nreconstruction, and similarly and, in some cases, better than the combination\nof the two methods. At the same time, DeepDeWedge is simpler than this two-step\napproach, as it does denoising and missing wedge reconstruction simultaneously\nrather than sequentially.\n","authors":["Simon Wiedemann","Reinhard Heckel"],"pdf_url":"https://arxiv.org/pdf/2311.05539v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07739v1","updated":"2024-04-11T13:37:51Z","published":"2024-04-11T13:37:51Z","title":"Exploiting Object-based and Segmentation-based Semantic Features for\n Deep Learning-based Indoor Scene Classification","summary":" Indoor scenes are usually characterized by scattered objects and their\nrelationships, which turns the indoor scene classification task into a\nchallenging computer vision task. Despite the significant performance boost in\nclassification tasks achieved in recent years, provided by the use of\ndeep-learning-based methods, limitations such as inter-category ambiguity and\nintra-category variation have been holding back their performance. To overcome\nsuch issues, gathering semantic information has been shown to be a promising\nsource of information towards a more complete and discriminative feature\nrepresentation of indoor scenes. Therefore, the work described in this paper\nuses both semantic information, obtained from object detection, and semantic\nsegmentation techniques. While object detection techniques provide the 2D\nlocation of objects allowing to obtain spatial distributions between objects,\nsemantic segmentation techniques provide pixel-level information that allows to\nobtain, at a pixel-level, a spatial distribution and shape-related features of\nthe segmentation categories. Hence, a novel approach that uses a semantic\nsegmentation mask to provide Hu-moments-based segmentation categories' shape\ncharacterization, designated by Segmentation-based Hu-Moments Features (SHMFs),\nis proposed. Moreover, a three-main-branch network, designated by\nGOS$^2$F$^2$App, that exploits deep-learning-based global features,\nobject-based features, and semantic segmentation-based features is also\nproposed. GOS$^2$F$^2$App was evaluated in two indoor scene benchmark datasets:\nSUN RGB-D and NYU Depth V2, where, to the best of our knowledge,\nstate-of-the-art results were achieved on both datasets, which present\nevidences of the effectiveness of the proposed approach.\n","authors":["Ricardo Pereira","Luís Garrote","Tiago Barros","Ana Lopes","Urbano J. Nunes"],"pdf_url":"https://arxiv.org/pdf/2404.07739v1.pdf","comment":"This preprint was submitted at IEEE Transactions on Image Processing"},{"id":"http://arxiv.org/abs/2404.05392v2","updated":"2024-04-11T13:36:58Z","published":"2024-04-08T10:51:29Z","title":"T-DEED: Temporal-Discriminability Enhancer Encoder-Decoder for Precise\n Event Spotting in Sports Videos","summary":" In this paper, we introduce T-DEED, a Temporal-Discriminability Enhancer\nEncoder-Decoder for Precise Event Spotting in sports videos. T-DEED addresses\nmultiple challenges in the task, including the need for discriminability among\nframe representations, high output temporal resolution to maintain prediction\nprecision, and the necessity to capture information at different temporal\nscales to handle events with varying dynamics. 
It tackles these challenges\nthrough its specifically designed architecture, featuring an encoder-decoder\nfor leveraging multiple temporal scales and achieving high output temporal\nresolution, along with temporal modules designed to increase token\ndiscriminability. Leveraging these characteristics, T-DEED achieves SOTA\nperformance on the FigureSkating and FineDiving datasets. Code is available at\nhttps://github.com/arturxe2/T-DEED.\n","authors":["Artur Xarles","Sergio Escalera","Thomas B. Moeslund","Albert Clapés"],"pdf_url":"https://arxiv.org/pdf/2404.05392v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07729v1","updated":"2024-04-11T13:19:46Z","published":"2024-04-11T13:19:46Z","title":"Realistic Continual Learning Approach using Pre-trained Models","summary":" Continual learning (CL) is crucial for evaluating adaptability in learning\nsolutions to retain knowledge. Our research addresses the challenge of\ncatastrophic forgetting, where models lose proficiency in previously learned\ntasks as they acquire new ones. While numerous solutions have been proposed,\nexisting experimental setups often rely on idealized class-incremental learning\nscenarios. We introduce Realistic Continual Learning (RealCL), a novel CL\nparadigm where class distributions across tasks are random, departing from\nstructured setups.\n We also present CLARE (Continual Learning Approach with pRE-trained models\nfor RealCL scenarios), a pre-trained model-based solution designed to integrate\nnew knowledge while preserving past learning. Our contributions include\npioneering RealCL as a generalization of traditional CL setups, proposing CLARE\nas an adaptable approach for RealCL tasks, and conducting extensive experiments\ndemonstrating its effectiveness across various RealCL scenarios. Notably, CLARE\noutperforms existing models on RealCL benchmarks, highlighting its versatility\nand robustness in unpredictable learning environments.\n","authors":["Nadia Nasri","Carlos Gutiérrez-Álvarez","Sergio Lafuente-Arroyo","Saturnino Maldonado-Bascón","Roberto J. López-Sastre"],"pdf_url":"https://arxiv.org/pdf/2404.07729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07724v1","updated":"2024-04-11T13:16:47Z","published":"2024-04-11T13:16:47Z","title":"Applying Guidance in a Limited Interval Improves Sample and Distribution\n Quality in Diffusion Models","summary":" Guidance is a crucial technique for extracting the best performance out of\nimage-generating diffusion models. Traditionally, a constant guidance weight\nhas been applied throughout the sampling chain of an image. We show that\nguidance is clearly harmful toward the beginning of the chain (high noise\nlevels), largely unnecessary toward the end (low noise levels), and only\nbeneficial in the middle. We thus restrict it to a specific range of noise\nlevels, improving both the inference speed and result quality. This limited\nguidance interval improves the record FID in ImageNet-512 significantly, from\n1.81 to 1.40. We show that it is quantitatively and qualitatively beneficial\nacross different sampler parameters, network architectures, and datasets,\nincluding the large-scale setting of Stable Diffusion XL. 
We thus suggest\nexposing the guidance interval as a hyperparameter in all diffusion models that\nuse guidance.\n","authors":["Tuomas Kynkäänniemi","Miika Aittala","Tero Karras","Samuli Laine","Timo Aila","Jaakko Lehtinen"],"pdf_url":"https://arxiv.org/pdf/2404.07724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03778v2","updated":"2024-04-11T13:12:48Z","published":"2024-04-04T19:50:57Z","title":"Flattening the Parent Bias: Hierarchical Semantic Segmentation in the\n Poincar{é} Ball","summary":" Hierarchy is a natural representation of semantic taxonomies, including the\nones routinely used in image segmentation. Indeed, recent work on semantic\nsegmentation reports improved accuracy from supervised training leveraging\nhierarchical label structures. Encouraged by these results, we revisit the\nfundamental assumptions behind that work. We postulate and then empirically\nverify that the reasons for the observed improvement in segmentation accuracy\nmay be entirely unrelated to the use of the semantic hierarchy. To demonstrate\nthis, we design a range of cross-domain experiments with a representative\nhierarchical approach. We find that on the new testing domains, a flat\n(non-hierarchical) segmentation network, in which the parents are inferred from\nthe children, has superior segmentation accuracy to the hierarchical approach\nacross the board. Complementing these findings and inspired by the intrinsic\nproperties of hyperbolic spaces, we study a more principled approach to\nhierarchical segmentation using the Poincar\\'e ball model. The hyperbolic\nrepresentation largely outperforms the previous (Euclidean) hierarchical\napproach as well and is on par with our flat Euclidean baseline in terms of\nsegmentation accuracy. However, it additionally exhibits surprisingly strong\ncalibration quality of the parent nodes in the semantic hierarchy, especially\non the more challenging domains. Our combined analysis suggests that the\nestablished practice of hierarchical segmentation may be limited to in-domain\nsettings, whereas flat classifiers generalize substantially better, especially\nif they are modeled in the hyperbolic space.\n","authors":["Simon Weber","Barış Zöngür","Nikita Araslanov","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2404.03778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16783v2","updated":"2024-04-11T13:07:43Z","published":"2023-03-29T15:19:01Z","title":"Exploring Efficient Asymmetric Blind-Spots for Self-Supervised Denoising\n in Real-World Scenarios","summary":" Self-supervised denoising has attracted widespread attention due to its\nability to train without clean images. However, noise in real-world scenarios\nis often spatially correlated, which causes many self-supervised algorithms\nthat assume pixel-wise independent noise to perform poorly. Recent works have\nattempted to break noise correlation with downsampling or neighborhood masking.\nHowever, denoising on downsampled subgraphs can lead to aliasing effects and\nloss of details due to a lower sampling rate. Furthermore, the neighborhood\nmasking methods either come with high computational complexity or do not\nconsider local spatial preservation during inference. Through the analysis of\nexisting methods, we point out that the key to obtaining high-quality and\ntexture-rich results in real-world self-supervised denoising tasks is to train\nat the original input resolution structure and use asymmetric operations during\ntraining and inference. 
Based on this, we propose Asymmetric Tunable Blind-Spot\nNetwork (AT-BSN), where the blind-spot size can be freely adjusted, thus better\nbalancing noise correlation suppression and image local spatial destruction\nduring training and inference. In addition, we regard the pre-trained AT-BSN as\na meta-teacher network capable of generating various teacher networks by\nsampling different blind-spots. We propose a blind-spot based multi-teacher\ndistillation strategy to distill a lightweight network, significantly improving\nperformance. Experimental results on multiple datasets prove that our method\nachieves state-of-the-art, and is superior to other self-supervised algorithms\nin terms of computational overhead and visual effects.\n","authors":["Shiyan Chen","Jiyuan Zhang","Zhaofei Yu","Tiejun Huang"],"pdf_url":"https://arxiv.org/pdf/2303.16783v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03936v2","updated":"2024-04-11T13:02:58Z","published":"2024-04-05T07:44:17Z","title":"Deep Learning for Satellite Image Time Series Analysis: A Review","summary":" Earth observation (EO) satellite missions have been providing detailed images\nabout the state of the Earth and its land cover for over 50 years. Long term\nmissions, such as NASA's Landsat, Terra, and Aqua satellites, and more\nrecently, the ESA's Sentinel missions, record images of the entire world every\nfew days. Although single images provide point-in-time data, repeated images of\nthe same area, or satellite image time series (SITS) provide information about\nthe changing state of vegetation and land use. These SITS are useful for\nmodeling dynamic processes and seasonal changes such as plant phenology. They\nhave potential benefits for many aspects of land and natural resource\nmanagement, including applications in agricultural, forest, water, and disaster\nmanagement, urban planning, and mining. However, the resulting satellite image\ntime series (SITS) are complex, incorporating information from the temporal,\nspatial, and spectral dimensions. Therefore, deep learning methods are often\ndeployed as they can analyze these complex relationships. This review presents\na summary of the state-of-the-art methods of modelling environmental,\nagricultural, and other Earth observation variables from SITS data using deep\nlearning methods. We aim to provide a resource for remote sensing experts\ninterested in using deep learning techniques to enhance Earth observation\nmodels with temporal information.\n","authors":["Lynn Miller","Charlotte Pelletier","Geoffrey I. Webb"],"pdf_url":"https://arxiv.org/pdf/2404.03936v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2404.07713v1","updated":"2024-04-11T12:59:38Z","published":"2024-04-11T12:59:38Z","title":"Progressive Semantic-Guided Vision Transformer for Zero-Shot Learning","summary":" Zero-shot learning (ZSL) recognizes the unseen classes by conducting\nvisual-semantic interactions to transfer semantic knowledge from seen classes\nto unseen ones, supported by semantic information (e.g., attributes). 
However,\nexisting ZSL methods simply extract visual features using a pre-trained network\nbackbone (i.e., CNN or ViT), which fail to learn matched visual-semantic\ncorrespondences for representing semantic-related visual features as lacking of\nthe guidance of semantic information, resulting in undesirable visual-semantic\ninteractions. To tackle this issue, we propose a progressive semantic-guided\nvision transformer for zero-shot learning (dubbed ZSLViT). ZSLViT mainly\nconsiders two properties in the whole network: i) discover the semantic-related\nvisual representations explicitly, and ii) discard the semantic-unrelated\nvisual information. Specifically, we first introduce semantic-embedded token\nlearning to improve the visual-semantic correspondences via semantic\nenhancement and discover the semantic-related visual tokens explicitly with\nsemantic-guided token attention. Then, we fuse low semantic-visual\ncorrespondence visual tokens to discard the semantic-unrelated visual\ninformation for visual enhancement. These two operations are integrated into\nvarious encoders to progressively learn semantic-related visual representations\nfor accurate visual-semantic interactions in ZSL. The extensive experiments\nshow that our ZSLViT achieves significant performance gains on three popular\nbenchmark datasets, i.e., CUB, SUN, and AWA2.\n","authors":["Shiming Chen","Wenjin Hou","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2404.07713v1.pdf","comment":"Accepted to CVPR'24"},{"id":"http://arxiv.org/abs/2404.07711v1","updated":"2024-04-11T12:58:12Z","published":"2024-04-11T12:58:12Z","title":"OpenTrench3D: A Photogrammetric 3D Point Cloud Dataset for Semantic\n Segmentation of Underground Utilities","summary":" Identifying and classifying underground utilities is an important task for\nefficient and effective urban planning and infrastructure maintenance. We\npresent OpenTrench3D, a novel and comprehensive 3D Semantic Segmentation point\ncloud dataset, designed to advance research and development in underground\nutility surveying and mapping. OpenTrench3D covers a completely novel domain\nfor public 3D point cloud datasets and is unique in its focus, scope, and\ncost-effective capturing method. The dataset consists of 310 point clouds\ncollected across 7 distinct areas. These include 5 water utility areas and 2\ndistrict heating utility areas. The inclusion of different geographical areas\nand main utilities (water and district heating utilities) makes OpenTrench3D\nparticularly valuable for inter-domain transfer learning experiments. We\nprovide benchmark results for the dataset using three state-of-the-art semantic\nsegmentation models, PointNeXt, PointVector and PointMetaBase. Benchmarks are\nconducted by training on data from water areas, fine-tuning on district heating\narea 1 and evaluating on district heating area 2. The dataset is publicly\navailable. With OpenTrench3D, we seek to foster innovation and progress in the\nfield of 3D semantic segmentation in applications related to detection and\ndocumentation of underground utilities as well as in transfer learning methods\nin general.\n","authors":["Lasse H. Hansen","Simon B. Jensen","Mark P. Philipsen","Andreas Møgelmose","Lars Bodum","Thomas B. 
Moeslund"],"pdf_url":"https://arxiv.org/pdf/2404.07711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07705v1","updated":"2024-04-11T12:49:56Z","published":"2024-04-11T12:49:56Z","title":"ViM-UNet: Vision Mamba for Biomedical Segmentation","summary":" CNNs, most notably the UNet, are the default architecture for biomedical\nsegmentation. Transformer-based approaches, such as UNETR, have been proposed\nto replace them, benefiting from a global field of view, but suffering from\nlarger runtimes and higher parameter counts. The recent Vision Mamba\narchitecture offers a compelling alternative to transformers, also providing a\nglobal field of view, but at higher efficiency. Here, we introduce ViM-UNet, a\nnovel segmentation architecture based on it and compare it to UNet and UNETR\nfor two challenging microscopy instance segmentation tasks. We find that it\nperforms similarly or better than UNet, depending on the task, and outperforms\nUNETR while being more efficient. Our code is open source and documented at\nhttps://github.com/constantinpape/torch-em/blob/main/vimunet.md.\n","authors":["Anwai Archit","Constantin Pape"],"pdf_url":"https://arxiv.org/pdf/2404.07705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07698v1","updated":"2024-04-11T12:44:15Z","published":"2024-04-11T12:44:15Z","title":"Point Cloud Geometry Scalable Coding with a Quality-Conditioned Latents\n Probability Estimator","summary":" The widespread usage of point clouds (PC) for immersive visual applications\nhas resulted in the use of very heterogeneous receiving conditions and devices,\nnotably in terms of network, hardware, and display capabilities. In this\nscenario, quality scalability, i.e., the ability to reconstruct a signal at\ndifferent qualities by progressively decoding a single bitstream, is a major\nrequirement that has yet to be conveniently addressed, notably in most\nlearning-based PC coding solutions. This paper proposes a quality scalability\nscheme, named Scalable Quality Hyperprior (SQH), adaptable to learning-based\nstatic point cloud geometry codecs, which uses a Quality-conditioned Latents\nProbability Estimator (QuLPE) to decode a high-quality version of a PC\nlearning-based representation, based on an available lower quality base layer.\nSQH is integrated in the future JPEG PC coding standard, allowing to create a\nlayered bitstream that can be used to progressively decode the PC geometry with\nincreasing quality and fidelity. Experimental results show that SQH offers the\nquality scalability feature with very limited or no compression performance\npenalty at all when compared with the corresponding non-scalable solution, thus\npreserving the significant compression gains over other state-of-the-art PC\ncodecs.\n","authors":["Daniele Mari","André F. R. Guarda","Nuno M. M. Rodrigues","Simone Milani","Fernando Pereira"],"pdf_url":"https://arxiv.org/pdf/2404.07698v1.pdf","comment":"Submitted at ICIP 2024"},{"id":"http://arxiv.org/abs/2404.07696v1","updated":"2024-04-11T12:42:18Z","published":"2024-04-11T12:42:18Z","title":"Flatness Improves Backbone Generalisation in Few-shot Classification","summary":" Deployment of deep neural networks in real-world settings typically requires\nadaptation to new tasks with few examples. Few-shot classification (FSC)\nprovides a solution to this problem by leveraging pre-trained backbones for\nfast adaptation to new classes. 
Surprisingly, most efforts have only focused on\ndeveloping architectures for easing the adaptation to the target domain without\nconsidering the importance of backbone training for good generalisation. We\nshow that flatness-aware backbone training with vanilla fine-tuning results in\na simpler yet competitive baseline compared to the state-of-the-art. Our\nresults indicate that for in- and cross-domain FSC, backbone training is\ncrucial to achieving good generalisation across different adaptation methods.\nWe advocate more care should be taken when training these models.\n","authors":["Rui Li","Martin Trapp","Marcus Klasson","Arno Solin"],"pdf_url":"https://arxiv.org/pdf/2404.07696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07687v1","updated":"2024-04-11T12:26:10Z","published":"2024-04-11T12:26:10Z","title":"Chaos in Motion: Unveiling Robustness in Remote Heart Rate Measurement\n through Brain-Inspired Skin Tracking","summary":" Heart rate is an important physiological indicator of human health status.\nExisting remote heart rate measurement methods typically involve facial\ndetection followed by signal extraction from the region of interest (ROI).\nThese SOTA methods have three serious problems: (a) inaccuracies even failures\nin detection caused by environmental influences or subject movement; (b)\nfailures for special patients such as infants and burn victims; (c) privacy\nleakage issues resulting from collecting face video. To address these issues,\nwe regard the remote heart rate measurement as the process of analyzing the\nspatiotemporal characteristics of the optical flow signal in the video. We\napply chaos theory to computer vision tasks for the first time, thus designing\na brain-inspired framework. Firstly, using an artificial primary visual cortex\nmodel to extract the skin in the videos, and then calculate heart rate by\ntime-frequency analysis on all pixels. Our method achieves Robust Skin Tracking\nfor Heart Rate measurement, called HR-RST. The experimental results show that\nHR-RST overcomes the difficulty of environmental influences and effectively\ntracks the subject movement. Moreover, the method could extend to other body\nparts. Consequently, the method can be applied to special patients and\neffectively protect individual privacy, offering an innovative solution.\n","authors":["Jie Wang","Jing Lian","Minjie Ma","Junqiang Lei","Chunbiao Li","Bin Li","Jizhao Liu"],"pdf_url":"https://arxiv.org/pdf/2404.07687v1.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.07686v1","updated":"2024-04-11T12:25:54Z","published":"2024-04-11T12:25:54Z","title":"Depth Estimation using Weighted-loss and Transfer Learning","summary":" Depth estimation from 2D images is a common computer vision task that has\napplications in many fields including autonomous vehicles, scene understanding\nand robotics. The accuracy of a supervised depth estimation method mainly\nrelies on the chosen loss function, the model architecture, quality of data and\nperformance metrics. In this study, we propose a simplified and adaptable\napproach to improve depth estimation accuracy using transfer learning and an\noptimized loss function. The optimized loss function is a combination of\nweighted losses to which enhance robustness and generalization: Mean Absolute\nError (MAE), Edge Loss and Structural Similarity Index (SSIM). We use a grid\nsearch and a random search method to find optimized weights for the losses,\nwhich leads to an improved model. 
We explore multiple encoder-decoder-based\nmodels including DenseNet121, DenseNet169, DenseNet201, and EfficientNet for\nthe supervised depth estimation model on NYU Depth Dataset v2. We observe that\nthe EfficientNet model, pre-trained on ImageNet for classification when used as\nan encoder, with a simple upsampling decoder, gives the best results in terms\nof RSME, REL and log10: 0.386, 0.113 and 0.049, respectively. We also perform a\nqualitative analysis which illustrates that our model produces depth maps that\nclosely resemble ground truth, even in cases where the ground truth is flawed.\nThe results indicate significant improvements in accuracy and robustness, with\nEfficientNet being the most successful architecture.\n","authors":["Muhammad Adeel Hafeez","Michael G. Madden","Ganesh Sistu","Ihsan Ullah"],"pdf_url":"https://arxiv.org/pdf/2404.07686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.09657v2","updated":"2024-04-11T12:25:45Z","published":"2022-08-20T10:59:33Z","title":"Is Medieval Distant Viewing Possible? : Extending and Enriching\n Annotation of Legacy Image Collections using Visual Analytics","summary":" Distant viewing approaches have typically used image datasets close to the\ncontemporary image data used to train machine learning models. To work with\nimages from other historical periods requires expert annotated data, and the\nquality of labels is crucial for the quality of results. Especially when\nworking with cultural heritage collections that contain myriad uncertainties,\nannotating data, or re-annotating, legacy data is an arduous task. In this\npaper, we describe working with two pre-annotated sets of medieval manuscript\nimages that exhibit conflicting and overlapping metadata. Since a manual\nreconciliation of the two legacy ontologies would be very expensive, we aim (1)\nto create a more uniform set of descriptive labels to serve as a \"bridge\" in\nthe combined dataset, and (2) to establish a high quality hierarchical\nclassification that can be used as a valuable input for subsequent supervised\nmachine learning. To achieve these goals, we developed visualization and\ninteraction mechanisms, enabling medievalists to combine, regularize and extend\nthe vocabulary used to describe these, and other cognate, image datasets. The\nvisual interfaces provide experts an overview of relationships in the data\ngoing beyond the sum total of the metadata. Word and image embeddings as well\nas co-occurrences of labels across the datasets, enable batch re-annotation of\nimages, recommendation of label candidates and support composing a hierarchical\nclassification of labels.\n","authors":["Christofer Meinecke","Estelle Guéville","David Joseph Wrisley","Stefan Jänicke"],"pdf_url":"https://arxiv.org/pdf/2208.09657v2.pdf","comment":"Revision after DSH Peer Review. 
Paper is now accepted at DSH"},{"id":"http://arxiv.org/abs/2404.07685v1","updated":"2024-04-11T12:24:47Z","published":"2024-04-11T12:24:47Z","title":"Run-time Monitoring of 3D Object Detection in Automated Driving Systems\n Using Early Layer Neural Activation Patterns","summary":" Monitoring the integrity of object detection for errors within the perception\nmodule of automated driving systems (ADS) is paramount for ensuring safety.\nDespite recent advancements in deep neural network (DNN)-based object\ndetectors, their susceptibility to detection errors, particularly in the\nless-explored realm of 3D object detection, remains a significant concern.\nState-of-the-art integrity monitoring (also known as introspection) mechanisms\nin 2D object detection mainly utilise the activation patterns in the final\nlayer of the DNN-based detector's backbone. However, that may not sufficiently\naddress the complexities and sparsity of data in 3D object detection. To this\nend, we conduct, in this article, an extensive investigation into the effects\nof activation patterns extracted from various layers of the backbone network\nfor introspecting the operation of 3D object detectors. Through a comparative\nanalysis using Kitti and NuScenes datasets with PointPillars and CenterPoint\ndetectors, we demonstrate that using earlier layers' activation patterns\nenhances the error detection performance of the integrity monitoring system,\nyet increases computational complexity. To address the real-time operation\nrequirements in ADS, we also introduce a novel introspection method that\ncombines activation patterns from multiple layers of the detector's backbone\nand report its performance.\n","authors":["Hakan Yekta Yatbaz","Mehrdad Dianati","Konstantinos Koufos","Roger Woodman"],"pdf_url":"https://arxiv.org/pdf/2404.07685v1.pdf","comment":"Accepted by CVPR 2024 Workshop on Safe Autonomy for All Domains\n (SAIAD)"},{"id":"http://arxiv.org/abs/2404.07676v1","updated":"2024-04-11T12:14:48Z","published":"2024-04-11T12:14:48Z","title":"Model-based Cleaning of the QUILT-1M Pathology Dataset for\n Text-Conditional Image Synthesis","summary":" The QUILT-1M dataset is the first openly available dataset containing images\nharvested from various online sources. While it provides a huge data variety,\nthe image quality and composition is highly heterogeneous, impacting its\nutility for text-conditional image synthesis. We propose an automatic pipeline\nthat provides predictions of the most common impurities within the images,\ne.g., visibility of narrators, desktop environment and pathology software, or\ntext within the image. Additionally, we propose to use semantic alignment\nfiltering of the image-text pairs. Our findings demonstrate that by rigorously\nfiltering the dataset, there is a substantial enhancement of image fidelity in\ntext-to-image tasks.\n","authors":["Marc Aubreville","Jonathan Ganz","Jonas Ammeling","Christopher C. Kaltenecker","Christof A. Bertram"],"pdf_url":"https://arxiv.org/pdf/2404.07676v1.pdf","comment":"4 pages (short paper)"},{"id":"http://arxiv.org/abs/2402.13255v2","updated":"2024-04-11T12:13:27Z","published":"2024-02-20T18:59:57Z","title":"How NeRFs and 3D Gaussian Splatting are Reshaping SLAM: a Survey","summary":" Over the past two decades, research in the field of Simultaneous Localization\nand Mapping (SLAM) has undergone a significant evolution, highlighting its\ncritical role in enabling autonomous exploration of unknown environments. 
This\nevolution ranges from hand-crafted methods, through the era of deep learning,\nto more recent developments focused on Neural Radiance Fields (NeRFs) and 3D\nGaussian Splatting (3DGS) representations. Recognizing the growing body of\nresearch and the absence of a comprehensive survey on the topic, this paper\naims to provide the first comprehensive overview of SLAM progress through the\nlens of the latest advancements in radiance fields. It sheds light on the\nbackground, evolutionary path, inherent strengths and limitations, and serves\nas a fundamental reference to highlight the dynamic progress and specific\nchallenges.\n","authors":["Fabio Tosi","Youmin Zhang","Ziren Gong","Erik Sandström","Stefano Mattoccia","Martin R. Oswald","Matteo Poggi"],"pdf_url":"https://arxiv.org/pdf/2402.13255v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07671v1","updated":"2024-04-11T12:06:50Z","published":"2024-04-11T12:06:50Z","title":"Deep learning-driven pulmonary arteries and veins segmentation reveals\n demography-associated pulmonary vasculature anatomy","summary":" Pulmonary artery-vein segmentation is crucial for diagnosing pulmonary\ndiseases and surgical planning, and is traditionally achieved by Computed\nTomography Pulmonary Angiography (CTPA). However, concerns regarding adverse\nhealth effects from contrast agents used in CTPA have constrained its clinical\nutility. In contrast, identifying arteries and veins using non-contrast CT, a\nconventional and low-cost clinical examination routine, has long been\nconsidered impossible. Here we propose a High-abundant Pulmonary Artery-vein\nSegmentation (HiPaS) framework achieving accurate artery-vein segmentation on\nboth non-contrast CT and CTPA across various spatial resolutions. HiPaS first\nperforms spatial normalization on raw CT scans via a super-resolution module,\nand then iteratively achieves segmentation results at different branch levels\nby utilizing the low-level vessel segmentation as a prior for high-level vessel\nsegmentation. We trained and validated HiPaS on our established multi-centric\ndataset comprising 1,073 CT volumes with meticulous manual annotation. Both\nquantitative experiments and clinical evaluation demonstrated the superior\nperformance of HiPaS, achieving a dice score of 91.8% and a sensitivity of\n98.0%. Further experiments demonstrated the non-inferiority of HiPaS\nsegmentation on non-contrast CT compared to segmentation on CTPA. Employing\nHiPaS, we have conducted an anatomical study of pulmonary vasculature on 10,613\nparticipants in China (five sites), discovering a new association between\npulmonary vessel abundance and sex and age: vessel abundance is significantly\nhigher in females than in males, and slightly decreases with age, under the\ncontrolling of lung volumes (p < 0.0001). 
HiPaS realizing accurate artery-vein\nsegmentation delineates a promising avenue for clinical diagnosis and\nunderstanding pulmonary physiology in a non-invasive manner.\n","authors":["Yuetan Chu","Gongning Luo","Longxi Zhou","Shaodong Cao","Guolin Ma","Xianglin Meng","Juexiao Zhou","Changchun Yang","Dexuan Xie","Ricardo Henao","Xigang Xiao","Lianming Wu","Zhaowen Qiu","Xin Gao"],"pdf_url":"https://arxiv.org/pdf/2404.07671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11111v2","updated":"2024-04-11T12:01:34Z","published":"2024-03-17T06:31:16Z","title":"3D Human Reconstruction in the Wild with Synthetic Data Using Generative\n Models","summary":" In this work, we show that synthetic data created by generative models is\ncomplementary to computer graphics (CG) rendered data for achieving remarkable\ngeneralization performance on diverse real-world scenes for 3D human pose and\nshape estimation (HPS). Specifically, we propose an effective approach based on\nrecent diffusion models, termed HumanWild, which can effortlessly generate\nhuman images and corresponding 3D mesh annotations. We first collect a\nlarge-scale human-centric dataset with comprehensive annotations, e.g., text\ncaptions and surface normal images. Then, we train a customized ControlNet\nmodel upon this dataset to generate diverse human images and initial\nground-truth labels. At the core of this step is that we can easily obtain\nnumerous surface normal images from a 3D human parametric model, e.g., SMPL-X,\nby rendering the 3D mesh onto the image plane. As there exists inevitable noise\nin the initial labels, we then apply an off-the-shelf foundation segmentation\nmodel, i.e., SAM, to filter negative data samples. Our data generation pipeline\nis flexible and customizable to facilitate different real-world tasks, e.g.,\nego-centric scenes and perspective-distortion scenes. The generated dataset\ncomprises 0.79M images with corresponding 3D annotations, covering versatile\nviewpoints, scenes, and human identities. We train various HPS regressors on\ntop of the generated data and evaluate them on a wide range of benchmarks\n(3DPW, RICH, EgoBody, AGORA, SSP-3D) to verify the effectiveness of the\ngenerated data. By exclusively employing generative models, we generate\nlarge-scale in-the-wild human images and high-quality annotations, eliminating\nthe need for real-world data collection.\n","authors":["Yongtao Ge","Wenjia Wang","Yongfan Chen","Hao Chen","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2403.11111v2.pdf","comment":"project page: https://yongtaoge.github.io/projects/humanwild"},{"id":"http://arxiv.org/abs/2404.07668v1","updated":"2024-04-11T12:00:13Z","published":"2024-04-11T12:00:13Z","title":"Shape Completion in the Dark: Completing Vertebrae Morphology from 3D\n Ultrasound","summary":" Purpose: Ultrasound (US) imaging, while advantageous for its radiation-free\nnature, is challenging to interpret due to only partially visible organs and a\nlack of complete 3D information. While performing US-based diagnosis or\ninvestigation, medical professionals therefore create a mental map of the 3D\nanatomy. In this work, we aim to replicate this process and enhance the visual\nrepresentation of anatomical structures.\n Methods: We introduce a point-cloud-based probabilistic DL method to complete\noccluded anatomical structures through 3D shape completion and choose US-based\nspine examinations as our application. 
To enable training, we generate\nsynthetic 3D representations of partially occluded spinal views by mimicking US\nphysics and accounting for inherent artifacts.\n Results: The proposed model performs consistently on synthetic and patient\ndata, with mean and median differences of 2.02 and 0.03 in CD, respectively.\nOur ablation study demonstrates the importance of US physics-based data\ngeneration, reflected in the large mean and median difference of 11.8 CD and\n9.55 CD, respectively. Additionally, we demonstrate that anatomic landmarks,\nsuch as the spinous process (with reconstruction CD of 4.73) and the facet\njoints (mean distance to GT of 4.96mm) are preserved in the 3D completion.\n Conclusion: Our work establishes the feasibility of 3D shape completion for\nlumbar vertebrae, ensuring the preservation of level-wise characteristics and\nsuccessful generalization from synthetic to real data. The incorporation of US\nphysics contributes to more accurate patient data completions. Notably, our\nmethod preserves essential anatomic landmarks and reconstructs crucial\ninjections sites at their correct locations. The generated data and source code\nwill be made publicly available\n(https://github.com/miruna20/Shape-Completion-in-the-Dark).\n","authors":["Miruna-Alexandra Gafencu","Yordanka Velikova","Mahdi Saleh","Tamas Ungi","Nassir Navab","Thomas Wendler","Mohammad Farid Azampour"],"pdf_url":"https://arxiv.org/pdf/2404.07668v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07667v1","updated":"2024-04-11T12:00:06Z","published":"2024-04-11T12:00:06Z","title":"Dealing with Subject Similarity in Differential Morphing Attack\n Detection","summary":" The advent of morphing attacks has posed significant security concerns for\nautomated Face Recognition systems, raising the pressing need for robust and\neffective Morphing Attack Detection (MAD) methods able to effectively address\nthis issue. In this paper, we focus on Differential MAD (D-MAD), where a\ntrusted live capture, usually representing the criminal, is compared with the\ndocument image to classify it as morphed or bona fide. We show these approaches\nbased on identity features are effective when the morphed image and the live\none are sufficiently diverse; unfortunately, the effectiveness is significantly\nreduced when the same approaches are applied to look-alike subjects or in all\nthose cases when the similarity between the two compared images is high (e.g.\ncomparison between the morphed image and the accomplice). Therefore, in this\npaper, we propose ACIdA, a modular D-MAD system, consisting of a module for the\nattempt type classification, and two modules for the identity and artifacts\nanalysis on input images. Successfully addressing this task would allow\nbroadening the D-MAD applications including, for instance, the document\nenrollment stage, which currently relies entirely on human evaluation, thus\nlimiting the possibility of releasing ID documents with manipulated images, as\nwell as the automated gates to detect both accomplices and criminals. 
An\nextensive cross-dataset experimental evaluation conducted on the introduced\nscenario shows that ACIdA achieves state-of-the-art results, outperforming\nliterature competitors, while maintaining good performance in traditional D-MAD\nbenchmarks.\n","authors":["Nicolò Di Domenico","Guido Borghi","Annalisa Franco","Davide Maltoni"],"pdf_url":"https://arxiv.org/pdf/2404.07667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07664v1","updated":"2024-04-11T11:55:42Z","published":"2024-04-11T11:55:42Z","title":"Finding Dino: A plug-and-play framework for unsupervised detection of\n out-of-distribution objects using prototypes","summary":" Detecting and localising unknown or Out-of-distribution (OOD) objects in any\nscene can be a challenging task in vision. Particularly, in safety-critical\ncases involving autonomous systems like automated vehicles or trains.\nSupervised anomaly segmentation or open-world object detection models depend on\ntraining on exhaustively annotated datasets for every domain and still struggle\nin distinguishing between background and OOD objects. In this work, we present\na plug-and-play generalised framework - PRototype-based zero-shot OOD detection\nWithout Labels (PROWL). It is an inference-based method that does not require\ntraining on the domain dataset and relies on extracting relevant features from\nself-supervised pre-trained models. PROWL can be easily adapted to detect OOD\nobjects in any operational design domain by specifying a list of known classes\nfrom this domain. PROWL, as an unsupervised method, outperforms other\nsupervised methods trained without auxiliary OOD data on the RoadAnomaly and\nRoadObstacle datasets provided in SegmentMeIfYouCan (SMIYC) benchmark. We also\ndemonstrate its suitability for other domains such as rail and maritime scenes.\n","authors":["Poulami Sinhamahapatra","Franziska Schwaiger","Shirsha Bose","Huiyu Wang","Karsten Roscher","Stephan Guennemann"],"pdf_url":"https://arxiv.org/pdf/2404.07664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03122v2","updated":"2024-04-11T11:42:13Z","published":"2024-03-05T17:07:29Z","title":"NRDF: Neural Riemannian Distance Fields for Learning Articulated Pose\n Priors","summary":" Faithfully modeling the space of articulations is a crucial task that allows\nrecovery and generation of realistic poses, and remains a notorious challenge.\nTo this end, we introduce Neural Riemannian Distance Fields (NRDFs),\ndata-driven priors modeling the space of plausible articulations, represented\nas the zero-level-set of a neural field in a high-dimensional\nproduct-quaternion space. To train NRDFs only on positive examples, we\nintroduce a new sampling algorithm, ensuring that the geodesic distances follow\na desired distribution, yielding a principled distance field learning paradigm.\nWe then devise a projection algorithm to map any random pose onto the level-set\nby an adaptive-step Riemannian optimizer, adhering to the product manifold of\njoint rotations at all times. NRDFs can compute the Riemannian gradient via\nbackpropagation and by mathematical analogy, are related to Riemannian flow\nmatching, a recent generative model. We conduct a comprehensive evaluation of\nNRDF against other pose priors in various downstream tasks, i.e., pose\ngeneration, image-based pose estimation, and solving inverse kinematics,\nhighlighting NRDF's superior performance. 
Besides humans, NRDF's versatility\nextends to hand and animal poses, as it can effectively represent any\narticulation.\n","authors":["Yannan He","Garvita Tiwari","Tolga Birdal","Jan Eric Lenssen","Gerard Pons-Moll"],"pdf_url":"https://arxiv.org/pdf/2403.03122v2.pdf","comment":"Accepted by CVPR 2024. Project page:\n https://virtualhumans.mpi-inf.mpg.de/nrdf"},{"id":"http://arxiv.org/abs/2404.07649v1","updated":"2024-04-11T11:12:06Z","published":"2024-04-11T11:12:06Z","title":"Separated Attention: An Improved Cycle GAN Based Under Water Image\n Enhancement Method","summary":" In this paper we have present an improved Cycle GAN based model for under\nwater image enhancement. We have utilized the cycle consistent learning\ntechnique of the state-of-the-art Cycle GAN model with modification in the loss\nfunction in terms of depth-oriented attention which enhance the contrast of the\noverall image, keeping global content, color, local texture, and style\ninformation intact. We trained the Cycle GAN model with the modified loss\nfunctions on the benchmarked Enhancing Underwater Visual Perception (EUPV)\ndataset a large dataset including paired and unpaired sets of underwater images\n(poor and good quality) taken with seven distinct cameras in a range of\nvisibility situation during research on ocean exploration and human-robot\ncooperation. In addition, we perform qualitative and quantitative evaluation\nwhich supports the given technique applied and provided a better contrast\nenhancement model of underwater imagery. More significantly, the upgraded\nimages provide better results from conventional models and further for under\nwater navigation, pose estimation, saliency prediction, object detection and\ntracking. The results validate the appropriateness of the model for autonomous\nunderwater vehicles (AUV) in visual navigation.\n","authors":["Tashmoy Ghosh"],"pdf_url":"https://arxiv.org/pdf/2404.07649v1.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.07645v1","updated":"2024-04-11T11:07:57Z","published":"2024-04-11T11:07:57Z","title":"Simba: Mamba augmented U-ShiftGCN for Skeletal Action Recognition in\n Videos","summary":" Skeleton Action Recognition (SAR) involves identifying human actions using\nskeletal joint coordinates and their interconnections. While plain Transformers\nhave been attempted for this task, they still fall short compared to the\ncurrent leading methods, which are rooted in Graph Convolutional Networks\n(GCNs) due to the absence of structural priors. Recently, a novel selective\nstate space model, Mamba, has surfaced as a compelling alternative to the\nattention mechanism in Transformers, offering efficient modeling of long\nsequences. In this work, to the utmost extent of our awareness, we present the\nfirst SAR framework incorporating Mamba. Each fundamental block of our model\nadopts a novel U-ShiftGCN architecture with Mamba as its core component. The\nencoder segment of the U-ShiftGCN is devised to extract spatial features from\nthe skeletal data using downsampling vanilla Shift S-GCN blocks. These spatial\nfeatures then undergo intermediate temporal modeling facilitated by the Mamba\nblock before progressing to the encoder section, which comprises vanilla\nupsampling Shift S-GCN blocks. Additionally, a Shift T-GCN (ShiftTCN) temporal\nmodeling unit is employed before the exit of each fundamental block to refine\ntemporal representations. 
This particular integration of downsampling spatial,\nintermediate temporal, upsampling spatial, and ultimate temporal subunits\nyields promising results for skeleton action recognition. We dub the resulting\nmodel \\textbf{Simba}, which attains state-of-the-art performance across three\nwell-known benchmark skeleton action recognition datasets: NTU RGB+D, NTU RGB+D\n120, and Northwestern-UCLA. Interestingly, U-ShiftGCN (Simba without\nIntermediate Mamba Block) by itself is capable of performing reasonably well\nand surpasses our baseline.\n","authors":["Soumyabrata Chaudhuri","Saumik Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2404.07645v1.pdf","comment":"20 pages, 6 tables, 1 figure"},{"id":"http://arxiv.org/abs/2404.03425v2","updated":"2024-04-11T10:51:34Z","published":"2024-04-04T13:06:25Z","title":"ChangeMamba: Remote Sensing Change Detection with Spatio-Temporal State\n Space Model","summary":" Convolutional neural networks (CNN) and Transformers have made impressive\nprogress in the field of remote sensing change detection (CD). However, both\narchitectures have inherent shortcomings. Recently, the Mamba architecture,\nbased on state space models, has shown remarkable performance in a series of\nnatural language processing tasks, which can effectively compensate for the\nshortcomings of the above two architectures. In this paper, we explore for the\nfirst time the potential of the Mamba architecture for remote sensing CD tasks.\nWe tailor the corresponding frameworks, called MambaBCD, MambaSCD, and\nMambaBDA, for binary change detection (BCD), semantic change detection (SCD),\nand building damage assessment (BDA), respectively. All three frameworks adopt\nthe cutting-edge Visual Mamba architecture as the encoder, which allows full\nlearning of global spatial contextual information from the input images. For\nthe change decoder, which is available in all three architectures, we propose\nthree spatio-temporal relationship modeling mechanisms, which can be naturally\ncombined with the Mamba architecture and fully utilize its attribute to achieve\nspatio-temporal interaction of multi-temporal features, thereby obtaining\naccurate change information. On five benchmark datasets, our proposed\nframeworks outperform current CNN- and Transformer-based approaches without\nusing any complex training strategies or tricks, fully demonstrating the\npotential of the Mamba architecture in CD tasks. Specifically, we obtained\n83.11%, 88.39% and 94.19% F1 scores on the three BCD datasets SYSU, LEVIR-CD+,\nand WHU-CD; on the SCD dataset SECOND, we obtained 24.11% SeK; and on the BDA\ndataset xBD, we obtained 81.41% overall F1 score. Further experiments show that\nour architecture is quite robust to degraded data. The source code will be\navailable in https://github.com/ChenHongruixuan/MambaCD\n","authors":["Hongruixuan Chen","Jian Song","Chengxi Han","Junshi Xia","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2404.03425v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16074v2","updated":"2024-04-11T10:45:05Z","published":"2023-10-24T15:16:19Z","title":"RePoseDM: Recurrent Pose Alignment and Gradient Guidance for Pose Guided\n Image Synthesis","summary":" Pose-guided person image synthesis task requires re-rendering a reference\nimage, which should have a photorealistic appearance and flawless pose\ntransfer. 
Since person images are highly structured, existing approaches\nrequire dense connections for complex deformations and occlusions because these\nare generally handled through multi-level warping and masking in latent space.\nThe feature maps generated by convolutional neural networks do not have\nequivariance, and hence multi-level warping is required to perform pose\nalignment. Inspired by the ability of the diffusion model to generate\nphotorealistic images from the given conditional guidance, we propose recurrent\npose alignment to provide pose-aligned texture features as conditional\nguidance. Due to the leakage of the source pose in conditional guidance, we\npropose gradient guidance from pose interaction fields, which output the\ndistance from the valid pose manifold given a predicted pose as input. This\nhelps in learning plausible pose transfer trajectories that result in\nphotorealism and undistorted texture details. Extensive results on two\nlarge-scale benchmarks and a user study demonstrate the ability of our proposed\napproach to generate photorealistic pose transfer under challenging scenarios.\nAdditionally, we demonstrate the efficiency of gradient guidance in pose-guided\nimage generation on the HumanArt dataset with fine-tuned stable diffusion.\n","authors":["Anant Khandelwal"],"pdf_url":"https://arxiv.org/pdf/2310.16074v2.pdf","comment":"Accepted at CVPR 2024 SyntaGen Workshop, 13 pages, 4 tables, 7\n figures"},{"id":"http://arxiv.org/abs/2312.01919v2","updated":"2024-04-11T10:38:33Z","published":"2023-12-04T14:23:18Z","title":"COTR: Compact Occupancy TRansformer for Vision-based 3D Occupancy\n Prediction","summary":" The autonomous driving community has shown significant interest in 3D\noccupancy prediction, driven by its exceptional geometric perception and\ngeneral object recognition capabilities. To achieve this, current works try to\nconstruct a Tri-Perspective View (TPV) or Occupancy (OCC) representation\nextending from the Bird-Eye-View perception. However, compressed views like TPV\nrepresentation lose 3D geometry information while raw and sparse OCC\nrepresentation requires heavy but redundant computational costs. To address the\nabove limitations, we propose Compact Occupancy TRansformer (COTR), with a\ngeometry-aware occupancy encoder and a semantic-aware group decoder to\nreconstruct a compact 3D OCC representation. The occupancy encoder first\ngenerates a compact geometrical OCC feature through efficient explicit-implicit\nview transformation. Then, the occupancy decoder further enhances the semantic\ndiscriminability of the compact OCC representation by a coarse-to-fine semantic\ngrouping strategy. Empirical experiments show that there are evident\nperformance gains across multiple baselines, e.g., COTR outperforms baselines\nwith a relative improvement of 8%-15%, demonstrating the superiority of our\nmethod.\n","authors":["Qihang Ma","Xin Tan","Yanyun Qu","Lizhuang Ma","Zhizhong Zhang","Yuan Xie"],"pdf_url":"https://arxiv.org/pdf/2312.01919v2.pdf","comment":"CVPR2024. Code is available at https://github.com/NotACracker/COTR"},{"id":"http://arxiv.org/abs/2404.07626v1","updated":"2024-04-11T10:26:40Z","published":"2024-04-11T10:26:40Z","title":"Homography Guided Temporal Fusion for Road Line and Marking Segmentation","summary":" Reliable segmentation of road lines and markings is critical to autonomous\ndriving. 
Our work is motivated by the observations that road lines and markings\nare (1) frequently occluded in the presence of moving vehicles, shadow, and\nglare and (2) highly structured with low intra-class shape variance and overall\nhigh appearance consistency. To solve these issues, we propose a Homography\nGuided Fusion (HomoFusion) module to exploit temporally-adjacent video frames\nfor complementary cues facilitating the correct classification of the partially\noccluded road lines or markings. To reduce computational complexity, a novel\nsurface normal estimator is proposed to establish spatial correspondences\nbetween the sampled frames, allowing the HomoFusion module to perform a\npixel-to-pixel attention mechanism in updating the representation of the\noccluded road lines or markings. Experiments on ApolloScape, a large-scale lane\nmark segmentation dataset, and ApolloScape Night with artificial simulated\nnight-time road conditions, demonstrate that our method outperforms other\nexisting SOTA lane mark segmentation models with less than 9\\% of their\nparameters and computational complexity. We show that exploiting available\ncamera intrinsic data and ground plane assumption for cross-frame\ncorrespondence can lead to a light-weight network with significantly improved\nperformances in speed and accuracy. We also prove the versatility of our\nHomoFusion approach by applying it to the problem of water puddle segmentation\nand achieving SOTA performance.\n","authors":["Shan Wang","Chuong Nguyen","Jiawei Liu","Kaihao Zhang","Wenhan Luo","Yanhao Zhang","Sundaram Muthu","Fahira Afzal Maken","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2404.07626v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15855v2","updated":"2024-04-11T10:19:41Z","published":"2023-08-30T08:44:21Z","title":"IIDM: Inter and Intra-domain Mixing for Semi-supervised Domain\n Adaptation in Semantic Segmentation","summary":" Despite recent advances in semantic segmentation, an inevitable challenge is\nthe performance degradation caused by the domain shift in real applications.\nCurrent dominant approach to solve this problem is unsupervised domain\nadaptation (UDA). However, the absence of labeled target data in UDA is overly\nrestrictive and limits performance. To overcome this limitation, a more\npractical scenario called semi-supervised domain adaptation (SSDA) has been\nproposed. Existing SSDA methods are derived from the UDA paradigm and primarily\nfocus on leveraging the unlabeled target data and source data. In this paper,\nwe highlight the significance of exploiting the intra-domain information\nbetween the labeled target data and unlabeled target data. Instead of solely\nusing the scarce labeled target data for supervision, we propose a novel SSDA\nframework that incorporates both Inter and Intra Domain Mixing (IIDM), where\ninter-domain mixing mitigates the source-target domain gap and intra-domain\nmixing enriches the available target domain information, and the network can\ncapture more domain-invariant features. We also explore different domain mixing\nstrategies to better exploit the target domain information. 
Comprehensive\nexperiments conducted on the GTA5 to Cityscapes and SYNTHIA to Cityscapes\nbenchmarks demonstrate the effectiveness of IIDM, surpassing previous methods\nby a large margin.\n","authors":["Weifu Fu","Qiang Nie","Jialin Li","Yuhuan Lin","Kai Wu","Jian Li","Yabiao Wang","Yong Liu","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15855v2.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.07622v1","updated":"2024-04-11T10:16:44Z","published":"2024-04-11T10:16:44Z","title":"Multi-Image Visual Question Answering for Unsupervised Anomaly Detection","summary":" Unsupervised anomaly detection enables the identification of potential\npathological areas by juxtaposing original images with their pseudo-healthy\nreconstructions generated by models trained exclusively on normal images.\nHowever, the clinical interpretation of resultant anomaly maps presents a\nchallenge due to a lack of detailed, understandable explanations. Recent\nadvancements in language models have shown the capability of mimicking\nhuman-like understanding and providing detailed descriptions. This raises an\ninteresting question: \\textit{How can language models be employed to make the\nanomaly maps more explainable?} To the best of our knowledge, we are the first\nto leverage a language model for unsupervised anomaly detection, for which we\nconstruct a dataset with different questions and answers. Additionally, we\npresent a novel multi-image visual question answering framework tailored for\nanomaly detection, incorporating diverse feature fusion strategies to enhance\nvisual knowledge extraction. Our experiments reveal that the framework,\naugmented by our new Knowledge Q-Former module, adeptly answers questions on\nthe anomaly detection dataset. Besides, integrating anomaly maps as inputs\ndistinctly aids in improving the detection of unseen pathologies.\n","authors":["Jun Li","Cosmin I. Bercea","Philip Müller","Lina Felsner","Suhwan Kim","Daniel Rueckert","Benedikt Wiestler","Julia A. Schnabel"],"pdf_url":"https://arxiv.org/pdf/2404.07622v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.07620v1","updated":"2024-04-11T10:14:56Z","published":"2024-04-11T10:14:56Z","title":"Diffusion Probabilistic Multi-cue Level Set for Reducing Edge\n Uncertainty in Pancreas Segmentation","summary":" Accurately segmenting the pancreas remains a huge challenge. Traditional\nmethods encounter difficulties in semantic localization due to the small volume\nand distorted structure of the pancreas, while deep learning methods encounter\nchallenges in obtaining accurate edges because of low contrast and organ\noverlapping. To overcome these issues, we propose a multi-cue level set method\nbased on the diffusion probabilistic model, namely Diff-mcs. Our method adopts\na coarse-to-fine segmentation strategy. We use the diffusion probabilistic\nmodel in the coarse segmentation stage, with the obtained probability\ndistribution serving as both the initial localization and prior cues for the\nlevel set method. In the fine segmentation stage, we combine the prior cues\nwith grayscale cues and texture cues to refine the edge by maximizing the\ndifference between probability distributions of the cues inside and outside the\nlevel set curve. The method is validated on three public datasets and achieves\nstate-of-the-art performance, which can obtain more accurate segmentation\nresults with lower uncertainty segmentation edges. 
In addition, we conduct\nablation studies and uncertainty analysis to verify that the diffusion\nprobability model provides a more appropriate initialization for the level set\nmethod. Furthermore, when combined with multiple cues, the level set method can\nbetter obtain edges and improve the overall accuracy. Our code is available at\nhttps://github.com/GOUYUEE/Diff-mcs.\n","authors":["Yue Gou","Yuming Xing","Shengzhu Shi","Zhichang Guo"],"pdf_url":"https://arxiv.org/pdf/2404.07620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18956v2","updated":"2024-04-11T10:06:10Z","published":"2024-02-29T08:51:51Z","title":"WWW: A Unified Framework for Explaining What, Where and Why of Neural\n Networks by Interpretation of Neuron Concepts","summary":" Recent advancements in neural networks have showcased their remarkable\ncapabilities across various domains. Despite these successes, the \"black box\"\nproblem still remains. Addressing this, we propose a novel framework, WWW, that\noffers the 'what', 'where', and 'why' of the neural network decisions in\nhuman-understandable terms. Specifically, WWW utilizes adaptive selection for\nconcept discovery, employing adaptive cosine similarity and thresholding\ntechniques to effectively explain 'what'. To address the 'where' and 'why', we\nproposed a novel combination of neuron activation maps (NAMs) with Shapley\nvalues, generating localized concept maps and heatmaps for individual inputs.\nFurthermore, WWW introduces a method for predicting uncertainty, leveraging\nheatmap similarities to estimate 'how' reliable the prediction is. Experimental\nevaluations of WWW demonstrate superior performance in both quantitative and\nqualitative metrics, outperforming existing methods in interpretability. WWW\nprovides a unified solution for explaining 'what', 'where', and 'why',\nintroducing a method for localized explanations from global interpretations and\noffering a plug-and-play solution adaptable to various architectures.\n","authors":["Yong Hyun Ahn","Hyeon Bae Kim","Seong Tae Kim"],"pdf_url":"https://arxiv.org/pdf/2402.18956v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01705v2","updated":"2024-04-11T10:05:12Z","published":"2024-04-02T07:38:16Z","title":"Samba: Semantic Segmentation of Remotely Sensed Images with State Space\n Model","summary":" High-resolution remotely sensed images pose a challenge for commonly used\nsemantic segmentation methods such as Convolutional Neural Network (CNN) and\nVision Transformer (ViT). CNN-based methods struggle with handling such\nhigh-resolution images due to their limited receptive field, while ViT faces\nchallenges in handling long sequences. Inspired by Mamba, which adopts a State\nSpace Model (SSM) to efficiently capture global semantic information, we\npropose a semantic segmentation framework for high-resolution remotely sensed\nimages, named Samba. Samba utilizes an encoder-decoder architecture, with Samba\nblocks serving as the encoder for efficient multi-level semantic information\nextraction, and UperNet functioning as the decoder. We evaluate Samba on the\nLoveDA, ISPRS Vaihingen, and ISPRS Potsdam datasets, comparing its performance\nagainst top-performing CNN and ViT methods. The results reveal that Samba\nachieved unparalleled performance on commonly used remote sensing datasets for\nsemantic segmentation. 
Our proposed Samba demonstrates for the first time the\neffectiveness of SSM in semantic segmentation of remotely sensed images,\nsetting a new benchmark in performance for Mamba-based techniques in this\nspecific application. The source code and baseline implementations are\navailable at https://github.com/zhuqinfeng1999/Samba.\n","authors":["Qinfeng Zhu","Yuanzhi Cai","Yuan Fang","Yihan Yang","Cheng Chen","Lei Fan","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.01705v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07610v1","updated":"2024-04-11T09:58:23Z","published":"2024-04-11T09:58:23Z","title":"Do You Remember? Dense Video Captioning with Cross-Modal Memory\n Retrieval","summary":" There has been significant attention to the research on dense video\ncaptioning, which aims to automatically localize and caption all events within\nuntrimmed video. Several studies introduce methods by designing dense video\ncaptioning as a multitasking problem of event localization and event captioning\nto consider inter-task relations. However, addressing both tasks using only\nvisual input is challenging due to the lack of semantic content. In this study,\nwe address this by proposing a novel framework inspired by the cognitive\ninformation processing of humans. Our model utilizes external memory to\nincorporate prior knowledge. The memory retrieval method is proposed with\ncross-modal video-to-text matching. To effectively incorporate retrieved text\nfeatures, the versatile encoder and the decoder with visual and textual\ncross-attention modules are designed. Comparative experiments have been\nconducted to show the effectiveness of the proposed method on ActivityNet\nCaptions and YouCook2 datasets. Experimental results show promising performance\nof our model without extensive pretraining from a large video dataset.\n","authors":["Minkuk Kim","Hyeon Bae Kim","Jinyoung Moon","Jinwoo Choi","Seong Tae Kim"],"pdf_url":"https://arxiv.org/pdf/2404.07610v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.07607v1","updated":"2024-04-11T09:50:05Z","published":"2024-04-11T09:50:05Z","title":"Automatic Detection of Dark Ship-to-Ship Transfers using Deep Learning\n and Satellite Imagery","summary":" Despite extensive research into ship detection via remote sensing, no studies\nidentify ship-to-ship transfers in satellite imagery. Given the importance of\ntransshipment in illicit shipping practices, this is a significant gap. In what\nfollows, I train a convolutional neural network to accurately detect 4\ndifferent types of cargo vessel and two different types of Ship-to-Ship\ntransfer in PlanetScope satellite imagery. I then elaborate a pipeline for the\nautomatic detection of suspected illicit ship-to-ship transfers by\ncross-referencing satellite detections with vessel borne GPS data. Finally, I\napply this method to the Kerch Strait between Ukraine and Russia to identify\nover 400 dark transshipment events since 2022.\n","authors":["Ollie Ballinger"],"pdf_url":"https://arxiv.org/pdf/2404.07607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07605v1","updated":"2024-04-11T09:47:52Z","published":"2024-04-11T09:47:52Z","title":"Contrastive-Based Deep Embeddings for Label Noise-Resilient\n Histopathology Image Classification","summary":" Recent advancements in deep learning have proven highly effective in medical\nimage classification, notably within histopathology. 
However, noisy labels\nrepresent a critical challenge in histopathology image classification, where\naccurate annotations are vital for training robust deep learning models.\nIndeed, deep neural networks can easily overfit label noise, leading to severe\ndegradations in model performance. While numerous public pathology foundation\nmodels have emerged recently, none have evaluated their resilience to label\nnoise. Through thorough empirical analyses across multiple datasets, we exhibit\nthe label noise resilience property of embeddings extracted from foundation\nmodels trained in a self-supervised contrastive manner. We demonstrate that\ntraining with such embeddings substantially enhances label noise robustness\nwhen compared to non-contrastive-based ones as well as commonly used\nnoise-resilient methods. Our results unequivocally underline the superiority of\ncontrastive learning in effectively mitigating the label noise challenge. Code\nis publicly available at\nhttps://github.com/LucasDedieu/NoiseResilientHistopathology.\n","authors":["Lucas Dedieu","Nicolas Nerrienet","Adrien Nivaggioli","Clara Simmat","Marceau Clavel","Arnaud Gauthier","Stéphane Sockeel","Rémy Peyret"],"pdf_url":"https://arxiv.org/pdf/2404.07605v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2404.07603v1","updated":"2024-04-11T09:43:07Z","published":"2024-04-11T09:43:07Z","title":"GLID: Pre-training a Generalist Encoder-Decoder Vision Model","summary":" This paper proposes a GeneraLIst encoder-Decoder (GLID) pre-training method\nfor better handling various downstream computer vision tasks. While\nself-supervised pre-training approaches, e.g., Masked Autoencoder, have shown\nsuccess in transfer learning, task-specific sub-architectures are still\nrequired to be appended for different downstream tasks, which cannot enjoy the\nbenefits of large-scale pre-training. GLID overcomes this challenge by allowing\nthe pre-trained generalist encoder-decoder to be fine-tuned on various vision\ntasks with minimal task-specific architecture modifications. In the GLID\ntraining scheme, pre-training pretext task and other downstream tasks are\nmodeled as \"query-to-answer\" problems, including the pre-training pretext task\nand other downstream tasks. We pre-train a task-agnostic encoder-decoder with\nquery-mask pairs. During fine-tuning, GLID maintains the pre-trained\nencoder-decoder and queries, only replacing the topmost linear transformation\nlayer with task-specific linear heads. This minimizes the pretrain-finetune\narchitecture inconsistency and enables the pre-trained model to better adapt to\ndownstream tasks. GLID achieves competitive performance on various vision\ntasks, including object detection, image segmentation, pose estimation, and\ndepth estimation, outperforming or matching specialist models such as\nMask2Former, DETR, ViTPose, and BinsFormer.\n","authors":["Jihao Liu","Jinliang Zheng","Yu Liu","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.07603v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.07602v1","updated":"2024-04-11T09:41:14Z","published":"2024-04-11T09:41:14Z","title":"Attention based End to end network for Offline Writer Identification on\n Word level data","summary":" Writer identification due to its widespread application in various fields has\ngained popularity over the years. 
In scenarios where optimum handwriting\nsamples are available, whether they be in the form of a single line, a\nsentence, or an entire page, writer identification algorithms have demonstrated\nnoteworthy levels of accuracy. However, in scenarios where only a limited\nnumber of handwritten samples are available, particularly in the form of word\nimages, there is a significant scope for improvement.\n In this paper, we propose a writer identification system based on an\nattention-driven Convolutional Neural Network (CNN). The system is trained\nutilizing image segments, known as fragments, extracted from word images,\nemploying a pyramid-based strategy. This methodology enables the system to\ncapture a comprehensive representation of the data, encompassing both\nfine-grained details and coarse features across various levels of abstraction.\nThese extracted fragments serve as the training data for the convolutional\nnetwork, enabling it to learn a more robust representation compared to\ntraditional convolution-based networks trained on word images. Additionally,\nthe paper explores the integration of an attention mechanism to enhance the\nrepresentational power of the learned features. The efficacy of the proposed\nalgorithm is evaluated on three benchmark databases, demonstrating its\nproficiency in writer identification tasks, particularly in scenarios with\nlimited access to handwriting data.\n","authors":["Vineet Kumar","Suresh Sundaram"],"pdf_url":"https://arxiv.org/pdf/2404.07602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07600v1","updated":"2024-04-11T09:39:58Z","published":"2024-04-11T09:39:58Z","title":"Implicit and Explicit Language Guidance for Diffusion-based Visual\n Perception","summary":" Text-to-image diffusion models have shown powerful ability on conditional\nimage synthesis. With large-scale vision-language pre-training, diffusion\nmodels are able to generate high-quality images with rich texture and\nreasonable structure under different text prompts. However, it is an open\nproblem to adapt the pre-trained diffusion model for visual perception. In this\npaper, we propose an implicit and explicit language guidance framework for\ndiffusion-based perception, named IEDP. Our IEDP comprises of an implicit\nlanguage guidance branch and an explicit language guidance branch. The implicit\nbranch employs frozen CLIP image encoder to directly generate implicit text\nembeddings that are fed to diffusion model, without using explicit text\nprompts. The explicit branch utilizes the ground-truth labels of corresponding\nimages as text prompts to condition feature extraction of diffusion model.\nDuring training, we jointly train diffusion model by sharing the model weights\nof these two branches. As a result, implicit and explicit branches can jointly\nguide feature learning. During inference, we only employ implicit branch for\nfinal prediction, which does not require any ground-truth labels. Experiments\nare performed on two typical perception tasks, including semantic segmentation\nand depth estimation. Our IEDP achieves promising performance on both tasks.\nFor semantic segmentation, our IEDP has the mIoU score of 55.9% on AD20K\nvalidation set, which outperforms the baseline method VPD by 2.2%. 
For depth\nestimation, our IEDP outperforms the baseline method VPD with a relative gain\nof 10.2%.\n","authors":["Hefeng Wang","Jiale Cao","Jin Xie","Aiping Yang","Yanwei Pang"],"pdf_url":"https://arxiv.org/pdf/2404.07600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07594v1","updated":"2024-04-11T09:23:44Z","published":"2024-04-11T09:23:44Z","title":"Weakly-Supervised Learning via Multi-Lateral Decoder Branching for\n Guidewire Segmentation in Robot-Assisted Cardiovascular Catheterization","summary":" Although robot-assisted cardiovascular catheterization is commonly performed\nfor intervention of cardiovascular diseases, more studies are needed to support\nthe procedure with automated tool segmentation. This can aid surgeons on tool\ntracking and visualization during intervention. Learning-based segmentation has\nrecently offered state-of-the-art segmentation performances however, generating\nground-truth signals for fully-supervised methods is labor-intensive and time\nconsuming for the interventionists. In this study, a weakly-supervised learning\nmethod with multi-lateral pseudo labeling is proposed for tool segmentation in\ncardiac angiograms. The method includes a modified U-Net model with one encoder\nand multiple lateral-branched decoders that produce pseudo labels as\nsupervision signals under different perturbation. The pseudo labels are\nself-generated through a mixed loss function and shared consistency in the\ndecoders. We trained the model end-to-end with weakly-annotated data obtained\nduring robotic cardiac catheterization. Experiments with the proposed model\nshows weakly annotated data has closer performance to when fully annotated data\nis used. Compared to three existing weakly-supervised methods, our approach\nyielded higher segmentation performance across three different cardiac\nangiogram data. With ablation study, we showed consistent performance under\ndifferent parameters. Thus, we offer a less expensive method for real-time tool\nsegmentation and tracking during robot-assisted cardiac catheterization.\n","authors":["Olatunji Mumini Omisore","Toluwanimi Akinyemi","Anh Nguyen","Lei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07580v1","updated":"2024-04-11T09:13:50Z","published":"2024-04-11T09:13:50Z","title":"Multi-rater Prompting for Ambiguous Medical Image Segmentation","summary":" Multi-rater annotations commonly occur when medical images are independently\nannotated by multiple experts (raters). In this paper, we tackle two challenges\narisen in multi-rater annotations for medical image segmentation (called\nambiguous medical image segmentation): (1) How to train a deep learning model\nwhen a group of raters produces a set of diverse but plausible annotations, and\n(2) how to fine-tune the model efficiently when computation resources are not\navailable for re-training the entire model on a different dataset domain. We\npropose a multi-rater prompt-based approach to address these two challenges\naltogether. Specifically, we introduce a series of rater-aware prompts that can\nbe plugged into the U-Net model for uncertainty estimation to handle\nmulti-annotation cases. During the prompt-based fine-tuning process, only 0.3%\nof learnable parameters are required to be updated comparing to training the\nentire model. 
Further, in order to integrate expert consensus and disagreement,\nwe explore different multi-rater incorporation strategies and design a\nmix-training strategy for comprehensive insight learning. Extensive experiments\nverify the effectiveness of our new approach for ambiguous medical image\nsegmentation on two public datasets while alleviating the heavy burden of model\nre-training.\n","authors":["Jinhong Wang","Yi Cheng","Jintai Chen","Hongxia Xu","Danny Chen","Jian Wu"],"pdf_url":"https://arxiv.org/pdf/2404.07580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07122v2","updated":"2024-04-11T09:10:21Z","published":"2024-04-10T16:01:37Z","title":"Driver Attention Tracking and Analysis","summary":" We propose a novel method to estimate a driver's points-of-gaze using a pair\nof ordinary cameras mounted on the windshield and dashboard of a car. This is a\nchallenging problem due to the dynamics of traffic environments with 3D scenes\nof unknown depths. This problem is further complicated by the volatile distance\nbetween the driver and the camera system. To tackle these challenges, we\ndevelop a novel convolutional network that simultaneously analyzes the image of\nthe scene and the image of the driver's face. This network has a camera\ncalibration module that can compute an embedding vector that represents the\nspatial configuration between the driver and the camera system. This\ncalibration module improves the overall network's performance, which can be\njointly trained end to end.\n We also address the lack of annotated data for training and evaluation by\nintroducing a large-scale driving dataset with point-of-gaze annotations. This\nis an in situ dataset of real driving sessions in an urban city, containing\nsynchronized images of the driving scene as well as the face and gaze of the\ndriver. Experiments on this dataset show that the proposed method outperforms\nvarious baseline methods, having the mean prediction error of 29.69 pixels,\nwhich is relatively small compared to the $1280{\\times}720$ resolution of the\nscene camera.\n","authors":["Dat Viet Thanh Nguyen","Anh Tran","Hoai Nam Vu","Cuong Pham","Minh Hoai"],"pdf_url":"https://arxiv.org/pdf/2404.07122v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07564v1","updated":"2024-04-11T08:50:12Z","published":"2024-04-11T08:50:12Z","title":"ObjBlur: A Curriculum Learning Approach With Progressive Object-Level\n Blurring for Improved Layout-to-Image Generation","summary":" We present ObjBlur, a novel curriculum learning approach to improve\nlayout-to-image generation models, where the task is to produce realistic\nimages from layouts composed of boxes and labels. Our method is based on\nprogressive object-level blurring, which effectively stabilizes training and\nenhances the quality of generated images. This curriculum learning strategy\nsystematically applies varying degrees of blurring to individual objects or the\nbackground during training, starting from strong blurring to progressively\ncleaner images. Our findings reveal that this approach yields significant\nperformance improvements, stabilized training, smoother convergence, and\nreduced variance between multiple runs. Moreover, our technique demonstrates\nits versatility by being compatible with generative adversarial networks and\ndiffusion models, underlining its applicability across various generative\nmodeling paradigms. 
With ObjBlur, we reach new state-of-the-art results on the\ncomplex COCO and Visual Genome datasets.\n","authors":["Stanislav Frolov","Brian B. Moser","Sebastian Palacio","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2404.07564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06710v2","updated":"2024-04-11T08:40:42Z","published":"2024-04-10T03:31:32Z","title":"SpikeNVS: Enhancing Novel View Synthesis from Blurry Images via Spike\n Camera","summary":" One of the most critical factors in achieving sharp Novel View Synthesis\n(NVS) using neural field methods like Neural Radiance Fields (NeRF) and 3D\nGaussian Splatting (3DGS) is the quality of the training images. However,\nConventional RGB cameras are susceptible to motion blur. In contrast,\nneuromorphic cameras like event and spike cameras inherently capture more\ncomprehensive temporal information, which can provide a sharp representation of\nthe scene as additional training data. Recent methods have explored the\nintegration of event cameras to improve the quality of NVS. The event-RGB\napproaches have some limitations, such as high training costs and the inability\nto work effectively in the background. Instead, our study introduces a new\nmethod that uses the spike camera to overcome these limitations. By considering\ntexture reconstruction from spike streams as ground truth, we design the\nTexture from Spike (TfS) loss. Since the spike camera relies on temporal\nintegration instead of temporal differentiation used by event cameras, our\nproposed TfS loss maintains manageable training costs. It handles foreground\nobjects with backgrounds simultaneously. We also provide a real-world dataset\ncaptured with our spike-RGB camera system to facilitate future research\nendeavors. We conduct extensive experiments using synthetic and real-world\ndatasets to demonstrate that our design can enhance novel view synthesis across\nNeRF and 3DGS. The code and dataset will be made available for public access.\n","authors":["Gaole Dai","Zhenyu Wang","Qinwen Xu","Ming Lu","Wen Cheng","Baixin Shi","Shanghang Zhang","Tiejun Huang"],"pdf_url":"https://arxiv.org/pdf/2404.06710v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07556v1","updated":"2024-04-11T08:36:36Z","published":"2024-04-11T08:36:36Z","title":"Attention-Aware Laparoscopic Image Desmoking Network with Lightness\n Embedding and Hybrid Guided Embedding","summary":" This paper presents a novel method of smoke removal from the laparoscopic\nimages. Due to the heterogeneous nature of surgical smoke, a two-stage network\nis proposed to estimate the smoke distribution and reconstruct a clear,\nsmoke-free surgical scene. The utilization of the lightness channel plays a\npivotal role in providing vital information pertaining to smoke density. The\nreconstruction of smoke-free image is guided by a hybrid embedding, which\ncombines the estimated smoke mask with the initial image. Experimental results\ndemonstrate that the proposed method boasts a Peak Signal to Noise Ratio that\nis $2.79\\%$ higher than the state-of-the-art methods, while also exhibits a\nremarkable $38.2\\%$ reduction in run-time. Overall, the proposed method offers\ncomparable or even superior performance in terms of both smoke removal quality\nand computational efficiency when compared to existing state-of-the-art\nmethods. 
This work will be publicly available on\nhttp://homepage.hit.edu.cn/wpgao\n","authors":["Ziteng Liu","Jiahua Zhu","Bainan Liu","Hao Liu","Wenpeng Gao","Yili Fu"],"pdf_url":"https://arxiv.org/pdf/2404.07556v1.pdf","comment":"ISBI2024"},{"id":"http://arxiv.org/abs/2404.07554v1","updated":"2024-04-11T08:36:13Z","published":"2024-04-11T08:36:13Z","title":"CAT: Contrastive Adapter Training for Personalized Image Generation","summary":" The emergence of various adapters, including Low-Rank Adaptation (LoRA)\napplied from the field of natural language processing, has allowed diffusion\nmodels to personalize image generation at a low cost. However, due to the\nvarious challenges including limited datasets and shortage of regularization\nand computation resources, adapter training often results in unsatisfactory\noutcomes, leading to the corruption of the backbone model's prior knowledge.\nOne of the well known phenomena is the loss of diversity in object generation,\nespecially within the same class which leads to generating almost identical\nobjects with minor variations. This poses challenges in generation\ncapabilities. To solve this issue, we present Contrastive Adapter Training\n(CAT), a simple yet effective strategy to enhance adapter training through the\napplication of CAT loss. Our approach facilitates the preservation of the base\nmodel's original knowledge when the model initiates adapters. Furthermore, we\nintroduce the Knowledge Preservation Score (KPS) to evaluate CAT's ability to\nkeep the former information. We qualitatively and quantitatively compare CAT's\nimprovement. Finally, we mention the possibility of CAT in the aspects of\nmulti-concept adapter and optimization.\n","authors":["Jae Wan Park","Sang Hyun Park","Jun Young Koh","Junha Lee","Min Song"],"pdf_url":"https://arxiv.org/pdf/2404.07554v1.pdf","comment":"CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.07553v1","updated":"2024-04-11T08:35:24Z","published":"2024-04-11T08:35:24Z","title":"SFSORT: Scene Features-based Simple Online Real-Time Tracker","summary":" This paper introduces SFSORT, the world's fastest multi-object tracking\nsystem based on experiments conducted on MOT Challenge datasets. To achieve an\naccurate and computationally efficient tracker, this paper employs a\ntracking-by-detection method, following the online real-time tracking approach\nestablished in prior literature. By introducing a novel cost function called\nthe Bounding Box Similarity Index, this work eliminates the Kalman Filter,\nleading to reduced computational requirements. Additionally, this paper\ndemonstrates the impact of scene features on enhancing object-track association\nand improving track post-processing. Using a 2.2 GHz Intel Xeon CPU, the\nproposed method achieves an HOTA of 61.7\\% with a processing speed of 2242 Hz\non the MOT17 dataset and an HOTA of 60.9\\% with a processing speed of 304 Hz on\nthe MOT20 dataset. The tracker's source code, fine-tuned object detection\nmodel, and tutorials are available at\n\\url{https://github.com/gitmehrdad/SFSORT}.\n","authors":["M. M. Morsali","Z. Sharifi","F. Fallah","S. Hashembeiki","H. Mohammadzade","S. 
Bagheri Shouraki"],"pdf_url":"https://arxiv.org/pdf/2404.07553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07551v1","updated":"2024-04-11T08:34:10Z","published":"2024-04-11T08:34:10Z","title":"Event-Enhanced Snapshot Compressive Videography at 10K FPS","summary":" Video snapshot compressive imaging (SCI) encodes the target dynamic scene\ncompactly into a snapshot and reconstructs its high-speed frame sequence\nafterward, greatly reducing the required data footprint and transmission\nbandwidth as well as enabling high-speed imaging with a low frame rate\nintensity camera. In implementation, high-speed dynamics are encoded via\ntemporally varying patterns, and only frames at corresponding temporal\nintervals can be reconstructed, while the dynamics occurring between\nconsecutive frames are lost. To unlock the potential of conventional snapshot\ncompressive videography, we propose a novel hybrid \"intensity+event\" imaging\nscheme by incorporating an event camera into a video SCI setup. Our proposed\nsystem consists of a dual-path optical setup to record the coded intensity\nmeasurement and intermediate event signals simultaneously, which is compact and\nphoton-efficient by collecting the half photons discarded in conventional video\nSCI. Correspondingly, we developed a dual-branch Transformer utilizing the\nreciprocal relationship between two data modes to decode dense video frames.\nExtensive experiments on both simulated and real-captured data demonstrate our\nsuperiority to state-of-the-art video SCI and video frame interpolation (VFI)\nmethods. Benefiting from the new hybrid design leveraging both intrinsic\nredundancy in videos and the unique feature of event cameras, we achieve\nhigh-quality videography at 0.1ms time intervals with a low-cost CMOS image\nsensor working at 24 FPS.\n","authors":["Bo Zhang","Jinli Suo","Qionghai Dai"],"pdf_url":"https://arxiv.org/pdf/2404.07551v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10372v2","updated":"2024-04-11T08:21:09Z","published":"2023-10-16T13:11:35Z","title":"Learning Object Permanence from Videos via Latent Imaginations","summary":" While human infants exhibit knowledge about object permanence from two months\nof age onwards, deep-learning approaches still largely fail to recognize\nobjects' continued existence. We introduce a slot-based autoregressive deep\nlearning system, the looped location and identity tracking model Loci-Looped,\nwhich learns to adaptively fuse latent imaginations with pixel-space\nobservations into consistent latent object-specific what and where encodings\nover time. The novel loop empowers Loci-Looped to learn the physical concepts\nof object permanence, directional inertia, and object solidity through\nobservation alone. As a result, Loci-Looped tracks objects through occlusions,\nanticipates their reappearance, and shows signs of surprise and internal\nrevisions when observing implausible object behavior. Notably, Loci-Looped\noutperforms state-of-the-art baseline models in handling object occlusions and\ntemporary sensory interruptions while exhibiting more compositional,\ninterpretable internal activity patterns. Our work thus introduces the first\nself-supervised interpretable learning model that learns about object\npermanence directly from video data without supervision.\n","authors":["Manuel Traub","Frederic Becker","Sebastian Otte","Martin V. 
Butz"],"pdf_url":"https://arxiv.org/pdf/2310.10372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15011v3","updated":"2024-04-11T08:16:53Z","published":"2023-11-25T12:34:02Z","title":"VSCode: General Visual Salient and Camouflaged Object Detection with 2D\n Prompt Learning","summary":" Salient object detection (SOD) and camouflaged object detection (COD) are\nrelated yet distinct binary mapping tasks. These tasks involve multiple\nmodalities, sharing commonalities and unique cues. Existing research often\nemploys intricate task-specific specialist models, potentially leading to\nredundancy and suboptimal results. We introduce VSCode, a generalist model with\nnovel 2D prompt learning, to jointly address four SOD tasks and three COD\ntasks. We utilize VST as the foundation model and introduce 2D prompts within\nthe encoder-decoder architecture to learn domain and task-specific knowledge on\ntwo separate dimensions. A prompt discrimination loss helps disentangle\npeculiarities to benefit model optimization. VSCode outperforms\nstate-of-the-art methods across six tasks on 26 datasets and exhibits zero-shot\ngeneralization to unseen tasks by combining 2D prompts, such as RGB-D COD.\nSource code has been available at https://github.com/Sssssuperior/VSCode.\n","authors":["Ziyang Luo","Nian Liu","Wangbo Zhao","Xuguang Yang","Dingwen Zhang","Deng-Ping Fan","Fahad Khan","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2311.15011v3.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2204.01348v2","updated":"2024-04-11T08:12:50Z","published":"2022-04-04T09:46:30Z","title":"Extended Reality for Mental Health Evaluation -A Scoping Review","summary":" Mental health disorders are the leading cause of health-related problems\nglobally. It is projected that mental health disorders will be the leading\ncause of morbidity among adults as the incidence rates of anxiety and\ndepression grows globally. Recently, extended reality (XR), a general term\ncovering virtual reality (VR), augmented reality (AR) and mixed reality (MR),\nis paving a new way to deliver mental health care. In this paper, we conduct a\nscoping review on the development and application of XR in the area of mental\ndisorders. We performed a scoping database search to identify the relevant\nstudies indexed in Google Scholar, PubMed, and the ACM Digital Library. A\nsearch period between August 2016 and December 2023 was defined to select\narticles related to the usage of VR, AR, and MR in a mental health context. We\nidentified a total of 85 studies from 27 countries across the globe. By\nperforming data analysis, we found that most of the studies focused on\ndeveloped countries such as the US (16.47%) and Germany (12.94%). None of the\nstudies were for African countries. The majority of the articles reported that\nXR techniques led to a significant reduction in symptoms of anxiety or\ndepression. More studies were published in the year 2021, i.e., 31.76% (n =\n31). This could indicate that mental disorder intervention received a higher\nattention when COVID-19 emerged. Most studies (n = 65) focused on a population\nbetween 18 and 65 years old, only a few studies focused on teenagers (n = 2).\nAlso, more studies were done experimentally (n = 67, 78.82%) rather than by\nanalytical and modeling approaches (n = 8, 9.41%). This shows that there is a\nrapid development of XR technology for mental health care. 
Furthermore, these\nstudies showed that XR technology can effectively be used for evaluating mental\ndisorders in similar or better way as the conventional approaches.\n","authors":["Omisore Olatunji","Ifeanyi Odenigbo","Joseph Orji","Amelia Beltran","Nilufar Baghaei","Meier Sandra","Rita Orji"],"pdf_url":"https://arxiv.org/pdf/2204.01348v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07545v1","updated":"2024-04-11T08:12:48Z","published":"2024-04-11T08:12:48Z","title":"Stereo-LiDAR Depth Estimation with Deformable Propagation and Learned\n Disparity-Depth Conversion","summary":" Accurate and dense depth estimation with stereo cameras and LiDAR is an\nimportant task for automatic driving and robotic perception. While sparse hints\nfrom LiDAR points have improved cost aggregation in stereo matching, their\neffectiveness is limited by the low density and non-uniform distribution. To\naddress this issue, we propose a novel stereo-LiDAR depth estimation network\nwith Semi-Dense hint Guidance, named SDG-Depth. Our network includes a\ndeformable propagation module for generating a semi-dense hint map and a\nconfidence map by propagating sparse hints using a learned deformable window.\nThese maps then guide cost aggregation in stereo matching. To reduce the\ntriangulation error in depth recovery from disparity, especially in distant\nregions, we introduce a disparity-depth conversion module. Our method is both\naccurate and efficient. The experimental results on benchmark tests show its\nsuperior performance. Our code is available at\nhttps://github.com/SJTU-ViSYS/SDG-Depth.\n","authors":["Ang Li","Anning Hu","Wei Xi","Wenxian Yu","Danping Zou"],"pdf_url":"https://arxiv.org/pdf/2404.07545v1.pdf","comment":"Accepted in ICRA 2024. 8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.07543v1","updated":"2024-04-11T08:11:36Z","published":"2024-04-11T08:11:36Z","title":"Content-Adaptive Non-Local Convolution for Remote Sensing Pansharpening","summary":" Currently, machine learning-based methods for remote sensing pansharpening\nhave progressed rapidly. However, existing pansharpening methods often do not\nfully exploit differentiating regional information in non-local spaces, thereby\nlimiting the effectiveness of the methods and resulting in redundant learning\nparameters. In this paper, we introduce a so-called content-adaptive non-local\nconvolution (CANConv), a novel method tailored for remote sensing image\npansharpening. Specifically, CANConv employs adaptive convolution, ensuring\nspatial adaptability, and incorporates non-local self-similarity through the\nsimilarity relationship partition (SRP) and the partition-wise adaptive\nconvolution (PWAC) sub-modules. Furthermore, we also propose a corresponding\nnetwork architecture, called CANNet, which mainly utilizes the multi-scale\nself-similarity. Extensive experiments demonstrate the superior performance of\nCANConv, compared with recent promising fusion methods. Besides, we\nsubstantiate the method's effectiveness through visualization, ablation\nexperiments, and comparison with existing methods on multiple test sets. 
The\nsource code is publicly available at https://github.com/duanyll/CANConv.\n","authors":["Yule Duan","Xiao Wu","Haoyu Deng","Liang-Jian Deng"],"pdf_url":"https://arxiv.org/pdf/2404.07543v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2310.11725v2","updated":"2024-04-11T08:11:20Z","published":"2023-10-18T05:44:49Z","title":"VST++: Efficient and Stronger Visual Saliency Transformer","summary":" While previous CNN-based models have exhibited promising results for salient\nobject detection (SOD), their ability to explore global long-range dependencies\nis restricted. Our previous work, the Visual Saliency Transformer (VST),\naddressed this constraint from a transformer-based sequence-to-sequence\nperspective, to unify RGB and RGB-D SOD. In VST, we developed a multi-task\ntransformer decoder that concurrently predicts saliency and boundary outcomes\nin a pure transformer architecture. Moreover, we introduced a novel token\nupsampling method called reverse T2T for predicting a high-resolution saliency\nmap effortlessly within transformer-based structures. Building upon the VST\nmodel, we further propose an efficient and stronger VST version in this work,\ni.e. VST++. To mitigate the computational costs of the VST model, we propose a\nSelect-Integrate Attention (SIA) module, partitioning foreground into\nfine-grained segments and aggregating background information into a single\ncoarse-grained token. To incorporate 3D depth information with low cost, we\ndesign a novel depth position encoding method tailored for depth maps.\nFurthermore, we introduce a token-supervised prediction loss to provide\nstraightforward guidance for the task-related tokens. We evaluate our VST++\nmodel across various transformer-based backbones on RGB, RGB-D, and RGB-T SOD\nbenchmark datasets. Experimental results show that our model outperforms\nexisting methods while achieving a 25% reduction in computational costs without\nsignificant performance compromise. The demonstrated strong ability for\ngeneralization, enhanced performance, and heightened efficiency of our VST++\nmodel highlight its potential.\n","authors":["Nian Liu","Ziyang Luo","Ni Zhang","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2310.11725v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.00349v2","updated":"2024-04-11T08:08:10Z","published":"2023-01-01T05:02:46Z","title":"Towards Reliable Medical Image Segmentation by utilizing Evidential\n Calibrated Uncertainty","summary":" Medical image segmentation is critical for disease diagnosis and treatment\nassessment. However, concerns regarding the reliability of segmentation regions\npersist among clinicians, mainly attributed to the absence of confidence\nassessment, robustness, and calibration to accuracy. To address this, we\nintroduce DEviS, an easily implementable foundational model that seamlessly\nintegrates into various medical image segmentation networks. DEviS not only\nenhances the calibration and robustness of baseline segmentation accuracy but\nalso provides high-efficiency uncertainty estimation for reliable predictions.\nBy leveraging subjective logic theory, we explicitly model probability and\nuncertainty for the problem of medical image segmentation. Here, the Dirichlet\ndistribution parameterizes the distribution of probabilities for different\nclasses of the segmentation results. 
To generate calibrated predictions and\nuncertainty, we develop a trainable calibrated uncertainty penalty.\nFurthermore, DEviS incorporates an uncertainty-aware filtering module, which\nutilizes the metric of uncertainty-calibrated error to filter reliable data\nwithin the dataset. We conducted validation studies to assess both the accuracy\nand robustness of DEviS segmentation, along with evaluating the efficiency and\nreliability of uncertainty estimation. These evaluations were performed using\npublicly available datasets including ISIC2018, LiTS2017, and BraTS2019.\nAdditionally, two potential clinical trials are being conducted at Johns\nHopkins OCT, Duke-OCT-DME, and FIVES datasets to demonstrate their efficacy in\nfiltering high-quality or out-of-distribution data. Our code has been released\nin https://github.com/Cocofeat/DEviS.\n","authors":["Ke Zou","Yidi Chen","Ling Huang","Xuedong Yuan","Xiaojing Shen","Meng Wang","Rick Siow Mong Goh","Yong Liu","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2301.00349v2.pdf","comment":"34 pages, 11 figures"},{"id":"http://arxiv.org/abs/2306.00696v2","updated":"2024-04-11T08:03:25Z","published":"2023-06-01T14:06:48Z","title":"Analyzing the Internals of Neural Radiance Fields","summary":" Modern Neural Radiance Fields (NeRFs) learn a mapping from position to\nvolumetric density leveraging proposal network samplers. In contrast to the\ncoarse-to-fine sampling approach with two NeRFs, this offers significant\npotential for acceleration using lower network capacity. Given that NeRFs\nutilize most of their network capacity to estimate radiance, they could store\nvaluable density information in their parameters or their deep features. To\ninvestigate this proposition, we take one step back and analyze large, trained\nReLU-MLPs used in coarse-to-fine sampling. Building on our novel activation\nvisualization method, we find that trained NeRFs, Mip-NeRFs and proposal\nnetwork samplers map samples with high density to local minima along a ray in\nactivation feature space. We show how these large MLPs can be accelerated by\ntransforming intermediate activations to a weight estimate, without any\nmodifications to the training protocol or the network architecture. With our\napproach, we can reduce the computational requirements of trained NeRFs by up\nto 50% with only a slight hit in rendering quality. Extensive experimental\nevaluation on a variety of datasets and architectures demonstrates the\neffectiveness of our approach. Consequently, our methodology provides valuable\ninsight into the inner workings of NeRFs.\n","authors":["Lukas Radl","Andreas Kurz","Michael Steiner","Markus Steinberger"],"pdf_url":"https://arxiv.org/pdf/2306.00696v2.pdf","comment":"Accepted to CVPRW'24! Project Page:\n https://r4dl.github.io/nerfinternals/"},{"id":"http://arxiv.org/abs/2404.07537v1","updated":"2024-04-11T08:03:23Z","published":"2024-04-11T08:03:23Z","title":"How is Visual Attention Influenced by Text Guidance? Database and Model","summary":" The analysis and prediction of visual attention have long been crucial tasks\nin the fields of computer vision and image processing. In practical\napplications, images are generally accompanied by various text descriptions,\nhowever, few studies have explored the influence of text descriptions on visual\nattention, let alone developed visual saliency prediction models considering\ntext guidance. 
In this paper, we conduct a comprehensive study on text-guided\nimage saliency (TIS) from both subjective and objective perspectives.\nSpecifically, we construct a TIS database named SJTU-TIS, which includes 1200\ntext-image pairs and the corresponding collected eye-tracking data. Based on\nthe established SJTU-TIS database, we analyze the influence of various text\ndescriptions on visual attention. Then, to facilitate the development of\nsaliency prediction models considering text influence, we construct a benchmark\nfor the established SJTU-TIS database using state-of-the-art saliency models.\nFinally, considering the effect of text descriptions on visual attention, while\nmost existing saliency models ignore this impact, we further propose a\ntext-guided saliency (TGSal) prediction model, which extracts and integrates\nboth image features and text features to predict the image saliency under\nvarious text-description conditions. Our proposed model significantly\noutperforms the state-of-the-art saliency models on both the SJTU-TIS database\nand the pure image saliency databases in terms of various evaluation metrics.\nThe SJTU-TIS database and the code of the proposed TGSal model will be released\nat: https://github.com/IntMeGroup/TGSal.\n","authors":["Yinan Sun","Xiongkuo Min","Huiyu Duan","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2404.07537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09107v2","updated":"2024-04-11T07:42:43Z","published":"2024-03-14T05:00:29Z","title":"S^2MVTC: a Simple yet Efficient Scalable Multi-View Tensor Clustering","summary":" Anchor-based large-scale multi-view clustering has attracted considerable\nattention for its effectiveness in handling massive datasets. However, current\nmethods mainly seek the consensus embedding feature for clustering by exploring\nglobal correlations between anchor graphs or projection matrices.In this paper,\nwe propose a simple yet efficient scalable multi-view tensor clustering\n(S^2MVTC) approach, where our focus is on learning correlations of embedding\nfeatures within and across views. Specifically, we first construct the\nembedding feature tensor by stacking the embedding features of different views\ninto a tensor and rotating it. Additionally, we build a novel tensor\nlow-frequency approximation (TLFA) operator, which incorporates graph\nsimilarity into embedding feature learning, efficiently achieving smooth\nrepresentation of embedding features within different views. Furthermore,\nconsensus constraints are applied to embedding features to ensure inter-view\nsemantic consistency. Experimental results on six large-scale multi-view\ndatasets demonstrate that S^2MVTC significantly outperforms state-of-the-art\nalgorithms in terms of clustering performance and CPU execution time,\nespecially when handling massive data. The code of S^2MVTC is publicly\navailable at https://github.com/longzhen520/S2MVTC.\n","authors":["Zhen Long","Qiyuan Wang","Yazhou Ren","Yipeng Liu","Ce Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.09107v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.07520v1","updated":"2024-04-11T07:26:00Z","published":"2024-04-11T07:26:00Z","title":"PromptSync: Bridging Domain Gaps in Vision-Language Models through\n Class-Aware Prototype Alignment and Discrimination","summary":" The potential for zero-shot generalization in vision-language (V-L) models\nsuch as CLIP has spurred their widespread adoption in addressing numerous\ndownstream tasks. 
Previous methods have employed test-time prompt tuning to\nadapt the model to unseen domains, but they overlooked the issue of imbalanced\nclass distributions. In this study, we explicitly address this problem by\nemploying class-aware prototype alignment weighted by mean class probabilities\nobtained for the test sample and filtered augmented views. Additionally, we\nensure that the class probabilities are as accurate as possible by performing\nprototype discrimination using contrastive learning. The combination of\nalignment and discriminative loss serves as a geometric regularizer, preventing\nthe prompt representation from collapsing onto a single class and effectively\nbridging the distribution gap between the source and test domains. Our method,\nnamed PromptSync, synchronizes the prompts for each test sample on both the\ntext and vision branches of the V-L model. In empirical evaluations on the\ndomain generalization benchmark, our method outperforms previous best methods\nby 2.33\\% in overall performance, by 1\\% in base-to-novel generalization, and\nby 2.84\\% in cross-dataset transfer tasks.\n","authors":["Anant Khandelwal"],"pdf_url":"https://arxiv.org/pdf/2404.07520v1.pdf","comment":"Accepted at CVPR 2024 LIMIT, 12 pages, 8 Tables, 2 Figures"},{"id":"http://arxiv.org/abs/2404.06859v2","updated":"2024-04-11T07:24:59Z","published":"2024-04-10T09:35:36Z","title":"Multi-Label Continual Learning for the Medical Domain: A Novel Benchmark","summary":" Multi-label image classification in dynamic environments is a problem that\nposes significant challenges. Previous studies have primarily focused on\nscenarios such as Domain Incremental Learning and Class Incremental Learning,\nwhich do not fully capture the complexity of real-world applications. In this\npaper, we study the problem of classification of medical imaging in the\nscenario termed New Instances and New Classes, which combines the challenges of\nboth new class arrivals and domain shifts in a single framework. Unlike\ntraditional scenarios, it reflects the realistic nature of CL in domains such\nas medical imaging, where updates may introduce both new classes and changes in\ndomain characteristics. To address the unique challenges posed by this complex\nscenario, we introduce a novel approach called Pseudo-Label Replay. This method\naims to mitigate forgetting while adapting to new classes and domain shifts by\ncombining the advantages of the Replay and Pseudo-Label methods and solving\ntheir limitations in the proposed scenario. We evaluate our proposed approach\non a challenging benchmark consisting of two datasets, seven tasks, and\nnineteen classes, modeling a realistic Continual Learning scenario. Our\nexperimental findings demonstrate the effectiveness of Pseudo-Label Replay in\naddressing the challenges posed by the complex scenario proposed. Our method\nsurpasses existing approaches, exhibiting superior performance while showing\nminimal forgetting.\n","authors":["Marina Ceccon","Davide Dalle Pezze","Alessandro Fabris","Gian Antonio Susto"],"pdf_url":"https://arxiv.org/pdf/2404.06859v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07518v1","updated":"2024-04-11T07:22:14Z","published":"2024-04-11T07:22:14Z","title":"Remembering Transformer for Continual Learning","summary":" Neural networks encounter the challenge of Catastrophic Forgetting (CF) in\ncontinual learning, where new task knowledge interferes with previously learned\nknowledge. 
We propose Remembering Transformer, inspired by the brain's\nComplementary Learning Systems (CLS), to tackle this issue. Remembering\nTransformer employs a mixture-of-adapters and a generative model-based routing\nmechanism to alleviate CF by dynamically routing task data to relevant\nadapters. Our approach demonstrated a new SOTA performance in various vision\ncontinual learning tasks and great parameter efficiency.\n","authors":["Yuwei Sun","Jun Sakuma","Ryota Kanai"],"pdf_url":"https://arxiv.org/pdf/2404.07518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16073v2","updated":"2024-04-11T07:20:52Z","published":"2023-10-24T14:59:51Z","title":"FloCoDe: Unbiased Dynamic Scene Graph Generation with Temporal\n Consistency and Correlation Debiasing","summary":" Dynamic scene graph generation (SGG) from videos requires not only a\ncomprehensive understanding of objects across scenes but also a method to\ncapture the temporal motions and interactions with different objects. Moreover,\nthe long-tailed distribution of visual relationships is a crucial bottleneck\nfor most dynamic SGG methods. This is because many of them focus on capturing\nspatio-temporal context using complex architectures, leading to the generation\nof biased scene graphs. To address these challenges, we propose\n\\textsc{FloCoDe}: \\textbf{Flo}w-aware Temporal Consistency and\n\\textbf{Co}rrelation \\textbf{De}biasing with uncertainty attenuation for\nunbiased dynamic scene graphs. \\textsc{FloCoDe} employs feature warping using\nflow to detect temporally consistent objects across frames. To address the\nlong-tail issue of visual relationships, we propose correlation debiasing and a\nlabel correlation-based loss to learn unbiased relation representations for\nlong-tailed classes. Specifically, we propose to incorporate label correlations\nusing contrastive loss to capture commonly co-occurring relations, which aids\nin learning robust representations for long-tailed classes. Further, we adopt\nthe uncertainty attenuation-based classifier framework to handle noisy\nannotations in the SGG data. Extensive experimental evaluation shows a\nperformance gain as high as 4.1\\%, demonstrating the superiority of generating\nmore unbiased scene graphs.\n","authors":["Anant Khandelwal"],"pdf_url":"https://arxiv.org/pdf/2310.16073v2.pdf","comment":"Accepted at CVPR 2024 SG2RL, 11 pages, 5 tables, 4 figures"},{"id":"http://arxiv.org/abs/2404.05426v2","updated":"2024-04-11T07:12:35Z","published":"2024-04-08T11:54:49Z","title":"Test-Time Zero-Shot Temporal Action Localization","summary":" Zero-Shot Temporal Action Localization (ZS-TAL) seeks to identify and locate\nactions in untrimmed videos unseen during training. Existing ZS-TAL methods\ninvolve fine-tuning a model on a large amount of annotated training data. While\neffective, training-based ZS-TAL approaches assume the availability of labeled\ndata for supervised learning, which can be impractical in some applications.\nFurthermore, the training process naturally induces a domain bias into the\nlearned model, which may adversely affect the model's generalization ability to\narbitrary videos. These considerations prompt us to approach the ZS-TAL problem\nfrom a radically novel perspective, relaxing the requirement for training data.\nTo this aim, we introduce a novel method that performs Test-Time adaptation for\nTemporal Action Localization (T3AL). In a nutshell, T3AL adapts a pre-trained\nVision and Language Model (VLM). T3AL operates in three steps. 
First, a\nvideo-level pseudo-label of the action category is computed by aggregating\ninformation from the entire video. Then, action localization is performed\nadopting a novel procedure inspired by self-supervised learning. Finally,\nframe-level textual descriptions extracted with a state-of-the-art captioning\nmodel are employed for refining the action region proposals. We validate the\neffectiveness of T3AL by conducting experiments on the THUMOS14 and the\nActivityNet-v1.3 datasets. Our results demonstrate that T3AL significantly\noutperforms zero-shot baselines based on state-of-the-art VLMs, confirming the\nbenefit of a test-time adaptation approach.\n","authors":["Benedetta Liberatori","Alessandro Conti","Paolo Rota","Yiming Wang","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2404.05426v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.07514v1","updated":"2024-04-11T07:11:43Z","published":"2024-04-11T07:11:43Z","title":"Generalization Gap in Data Augmentation: Insights from Illumination","summary":" In the field of computer vision, data augmentation is widely used to enrich\nthe feature complexity of training datasets with deep learning techniques.\nHowever, regarding the generalization capabilities of models, the difference in\nartificial features generated by data augmentation and natural visual features\nhas not been fully revealed. This study focuses on the visual representation\nvariable 'illumination', by simulating its distribution degradation and\nexamining how data augmentation techniques enhance model performance on a\nclassification task. Our goal is to investigate the differences in\ngeneralization between models trained with augmented data and those trained\nunder real-world illumination conditions. Results indicate that after\nundergoing various data augmentation methods, model performance has been\nsignificantly improved. Yet, a noticeable generalization gap still exists after\nutilizing various data augmentation methods, emphasizing the critical role of\nfeature diversity in the training set for enhancing model generalization.\n","authors":["Jianqiang Xiao","Weiwen Guo","Junfeng Liu","Mengze Li"],"pdf_url":"https://arxiv.org/pdf/2404.07514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01446v2","updated":"2024-04-11T06:58:18Z","published":"2024-04-01T19:33:41Z","title":"Finding Regions of Interest in Whole Slide Images Using Multiple\n Instance Learning","summary":" Whole Slide Images (WSI), obtained by high-resolution digital scanning of\nmicroscope slides at multiple scales, are the cornerstone of modern Digital\nPathology. However, they represent a particular challenge to\nAI-based/AI-mediated analysis because pathology labeling is typically done at\nslide-level, instead of tile-level. It is not just that medical diagnostics is\nrecorded at the specimen level, the detection of oncogene mutation is also\nexperimentally obtained, and recorded by initiatives like The Cancer Genome\nAtlas (TCGA), at the slide level. This configures a dual challenge: a)\naccurately predicting the overall cancer phenotype and b) finding out what\ncellular morphologies are associated with it at the tile level. To address\nthese challenges, a weakly supervised Multiple Instance Learning (MIL) approach\nwas explored for two prevalent cancer types, Invasive Breast Carcinoma\n(TCGA-BRCA) and Lung Squamous Cell Carcinoma (TCGA-LUSC). This approach was\nexplored for tumor detection at low magnification levels and TP53 mutations at\nvarious levels. 
Our results show that a novel additive implementation of MIL\nmatched the performance of reference implementation (AUC 0.96), and was only\nslightly outperformed by Attention MIL (AUC 0.97). More interestingly from the\nperspective of the molecular pathologist, these different AI architectures\nidentify distinct sensitivities to morphological features (through the\ndetection of Regions of Interest, RoI) at different amplification levels.\nTellingly, TP53 mutation was most sensitive to features at the higher\napplications where cellular morphology is resolved.\n","authors":["Martim Afonso","Praphulla M. S. Bhawsar","Monjoy Saha","Jonas S. Almeida","Arlindo L. Oliveira"],"pdf_url":"https://arxiv.org/pdf/2404.01446v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07507v1","updated":"2024-04-11T06:55:44Z","published":"2024-04-11T06:55:44Z","title":"Learning to Classify New Foods Incrementally Via Compressed Exemplars","summary":" Food image classification systems play a crucial role in health monitoring\nand diet tracking through image-based dietary assessment techniques. However,\nexisting food recognition systems rely on static datasets characterized by a\npre-defined fixed number of food classes. This contrasts drastically with the\nreality of food consumption, which features constantly changing data.\nTherefore, food image classification systems should adapt to and manage data\nthat continuously evolves. This is where continual learning plays an important\nrole. A challenge in continual learning is catastrophic forgetting, where ML\nmodels tend to discard old knowledge upon learning new information. While\nmemory-replay algorithms have shown promise in mitigating this problem by\nstoring old data as exemplars, they are hampered by the limited capacity of\nmemory buffers, leading to an imbalance between new and previously learned\ndata. To address this, our work explores the use of neural image compression to\nextend buffer size and enhance data diversity. We introduced the concept of\ncontinuously learning a neural compression model to adaptively improve the\nquality of compressed data and optimize the bitrates per pixel (bpp) to store\nmore exemplars. Our extensive experiments, including evaluations on\nfood-specific datasets including Food-101 and VFN-74, as well as the general\ndataset ImageNet-100, demonstrate improvements in classification accuracy. This\nprogress is pivotal in advancing more realistic food recognition systems that\nare capable of adapting to continually evolving data. Moreover, the principles\nand methodologies we've developed hold promise for broader applications,\nextending their benefits to other domains of continual machine learning\nsystems.\n","authors":["Justin Yang","Zhihao Duan","Jiangpeng He","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.07507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15430v2","updated":"2024-04-11T06:40:12Z","published":"2024-02-23T16:50:07Z","title":"Hierarchical Invariance for Robust and Interpretable Vision Tasks at\n Larger Scales","summary":" Developing robust and interpretable vision systems is a crucial step towards\ntrustworthy artificial intelligence. In this regard, a promising paradigm\nconsiders embedding task-required invariant structures, e.g., geometric\ninvariance, in the fundamental image representation. However, such invariant\nrepresentations typically exhibit limited discriminability, limiting their\napplications in larger-scale trustworthy vision tasks. 
For this open problem,\nwe conduct a systematic investigation of hierarchical invariance, exploring\nthis topic from theoretical, practical, and application perspectives. At the\ntheoretical level, we show how to construct over-complete invariants with a\nConvolutional Neural Networks (CNN)-like hierarchical architecture yet in a\nfully interpretable manner. The general blueprint, specific definitions,\ninvariant properties, and numerical implementations are provided. At the\npractical level, we discuss how to customize this theoretical framework into a\ngiven task. With the over-completeness, discriminative features w.r.t. the task\ncan be adaptively formed in a Neural Architecture Search (NAS)-like manner. We\ndemonstrate the above arguments with accuracy, invariance, and efficiency\nresults on texture, digit, and parasite classification experiments.\nFurthermore, at the application level, our representations are explored in\nreal-world forensics tasks on adversarial perturbations and Artificial\nIntelligence Generated Content (AIGC). Such applications reveal that the\nproposed strategy not only realizes the theoretically promised invariance, but\nalso exhibits competitive discriminability even in the era of deep learning.\nFor robust and interpretable vision tasks at larger scales, hierarchical\ninvariant representation can be considered as an effective alternative to\ntraditional CNN and invariants.\n","authors":["Shuren Qi","Yushu Zhang","Chao Wang","Zhihua Xia","Xiaochun Cao","Jian Weng"],"pdf_url":"https://arxiv.org/pdf/2402.15430v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07504v1","updated":"2024-04-11T06:39:53Z","published":"2024-04-11T06:39:53Z","title":"Mitigating Object Dependencies: Improving Point Cloud Self-Supervised\n Learning through Object Exchange","summary":" In the realm of point cloud scene understanding, particularly in indoor\nscenes, objects are arranged following human habits, resulting in objects of\ncertain semantics being closely positioned and displaying notable inter-object\ncorrelations. This can create a tendency for neural networks to exploit these\nstrong dependencies, bypassing the individual object patterns. To address this\nchallenge, we introduce a novel self-supervised learning (SSL) strategy. Our\napproach leverages both object patterns and contextual cues to produce robust\nfeatures. It begins with the formulation of an object-exchanging strategy,\nwhere pairs of objects with comparable sizes are exchanged across different\nscenes, effectively disentangling the strong contextual dependencies.\nSubsequently, we introduce a context-aware feature learning strategy, which\nencodes object patterns without relying on their specific context by\naggregating object features across various scenes. Our extensive experiments\ndemonstrate the superiority of our method over existing SSL techniques, further\nshowing its better robustness to environmental changes. 
Moreover, we showcase\nthe applicability of our approach by transferring pre-trained models to diverse\npoint cloud datasets.\n","authors":["Yanhao Wu","Tong Zhang","Wei Ke","Congpei Qiu","Sabine Susstrunk","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2404.07504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08801v3","updated":"2024-04-11T06:25:41Z","published":"2024-02-05T12:33:37Z","title":"CoBra: Complementary Branch Fusing Class and Semantic Knowledge for\n Robust Weakly Supervised Semantic Segmentation","summary":" Leveraging semantically precise pseudo masks derived from image-level class\nknowledge for segmentation, namely image-level Weakly Supervised Semantic\nSegmentation (WSSS), still remains challenging. While Class Activation Maps\n(CAMs) using CNNs have steadily been contributing to the success of WSSS, the\nresulting activation maps often narrowly focus on class-specific parts (e.g.,\nonly face of human). On the other hand, recent works based on vision\ntransformers (ViT) have shown promising results based on their self-attention\nmechanism to capture the semantic parts but fail in capturing complete\nclass-specific details (e.g., entire body parts of human but also with a dog\nnearby). In this work, we propose Complementary Branch (CoBra), a novel dual\nbranch framework consisting of two distinct architectures which provide\nvaluable complementary knowledge of class (from CNN) and semantic (from ViT) to\neach branch. In particular, we learn Class-Aware Projection (CAP) for the CNN\nbranch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly\nfuse their complementary knowledge and facilitate a new type of extra\npatch-level supervision. Our model, through CoBra, fuses CNN and ViT's\ncomplementary outputs to create robust pseudo masks that integrate both class\nand semantic information effectively. Extensive experiments qualitatively and\nquantitatively investigate how CNN and ViT complement each other on the PASCAL\nVOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not\nonly the masks generated by our model, but also the segmentation results\nderived from utilizing these masks as pseudo labels.\n","authors":["Woojung Han","Seil Kang","Kyobin Choo","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.08801v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.07868v2","updated":"2024-04-11T06:21:29Z","published":"2023-01-19T03:42:56Z","title":"MV-Adapter: Multimodal Video Transfer Learning for Video Text Retrieval","summary":" State-of-the-art video-text retrieval (VTR) methods typically involve fully\nfine-tuning a pre-trained model (e.g. CLIP) on specific datasets. However, this\ncan result in significant storage costs in practical applications as a separate\nmodel per task must be stored. To address this issue, we present our pioneering\nwork that enables parameter-efficient VTR using a pre-trained model, with only\na small number of tunable parameters during training. Towards this goal, we\npropose a new method dubbed Multimodal Video Adapter (MV-Adapter) for\nefficiently transferring the knowledge in the pre-trained CLIP from image-text\nto video-text. Specifically, MV-Adapter utilizes bottleneck structures in both\nvideo and text branches, along with two novel components. The first is a\nTemporal Adaptation Module that is incorporated in the video branch to\nintroduce global and local temporal contexts. We also train weights\ncalibrations to adjust to dynamic variations across frames. 
The second is Cross\nModality Tying that generates weights for video/text branches through sharing\ncross modality factors, for better aligning between modalities. Thanks to the above\ninnovations, MV-Adapter can achieve comparable or better performance than\nstandard full fine-tuning with negligible parameter overhead. Notably,\nMV-Adapter consistently outperforms various competing methods in V2T/T2V tasks\nwith large margins on five widely used VTR benchmarks (MSR-VTT, MSVD, LSMDC,\nDiDemo, and ActivityNet).\n","authors":["Xiaojie Jin","Bowen Zhang","Weibo Gong","Kai Xu","XueQing Deng","Peng Wang","Zhao Zhang","Xiaohui Shen","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2301.07868v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07495v1","updated":"2024-04-11T06:06:56Z","published":"2024-04-11T06:06:56Z","title":"PillarTrack: Redesigning Pillar-based Transformer Network for Single\n Object Tracking on Point Clouds","summary":" LiDAR-based 3D single object tracking (3D SOT) is a critical issue in\nrobotics and autonomous driving. It aims to obtain accurate 3D BBox from the\nsearch area based on similarity or motion. However, existing 3D SOT methods\nusually follow the point-based pipeline, where the sampling operation\ninevitably leads to redundant or lost information, resulting in unexpected\nperformance. To address these issues, we propose PillarTrack, a pillar-based 3D\nsingle object tracking framework. Firstly, we transform sparse point clouds\ninto dense pillars to preserve the local and global geometrics. Secondly, we\nintroduce a Pyramid-type Encoding Pillar Feature Encoder (PE-PFE) design to\nhelp the feature representation of each pillar. Thirdly, we present an\nefficient Transformer-based backbone from the perspective of modality\ndifferences. Finally, we construct our PillarTrack tracker based on the above designs.\nExtensive experiments on the KITTI and nuScenes dataset demonstrate the\nsuperiority of our proposed method. Notably, our method achieves\nstate-of-the-art performance on the KITTI and nuScenes dataset and enables\nreal-time tracking speed. We hope our work could encourage the community to\nrethink existing 3D SOT tracker designs. We will open-source our code to the\nresearch community at https://github.com/StiphyJay/PillarTrack.\n","authors":["Weisheng Xu","Sifan Zhou","Zhihang Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.07495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07487v1","updated":"2024-04-11T05:51:06Z","published":"2024-04-11T05:51:06Z","title":"Fine-Grained Side Information Guided Dual-Prompts for Zero-Shot Skeleton\n Action Recognition","summary":" Skeleton-based zero-shot action recognition aims to recognize unknown human\nactions based on the learned priors of the known skeleton-based actions and a\nsemantic descriptor space shared by both known and unknown categories. However,\nprevious works focus on establishing the bridges between the known skeleton\nrepresentation space and semantic descriptions space at the coarse-grained\nlevel for recognizing unknown action categories, ignoring the fine-grained\nalignment of these two spaces, resulting in suboptimal performance in\ndistinguishing high-similarity action categories. 
To address these challenges,\nwe propose a novel method via Side information and dual-prompts learning for\nskeleton-based zero-shot action recognition (STAR) at the fine-grained level.\nSpecifically, 1) we decompose the skeleton into several parts based on its\ntopology structure and introduce the side information concerning multi-part\ndescriptions of human body movements for alignment between the skeleton and the\nsemantic space at the fine-grained level; 2) we design the visual-attribute and\nsemantic-part prompts to improve the intra-class compactness within the\nskeleton space and inter-class separability within the semantic space,\nrespectively, to distinguish the high-similarity actions. Extensive experiments\nshow that our method achieves state-of-the-art performance in ZSL and GZSL\nsettings on NTU RGB+D, NTU RGB+D 120, and PKU-MMD datasets.\n","authors":["Yang Chen","Jingcai Guo","Tian He","Ling Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07487v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.00644v3","updated":"2024-04-11T05:48:36Z","published":"2024-03-01T16:25:17Z","title":"Diff-Plugin: Revitalizing Details for Diffusion-based Low-level Tasks","summary":" Diffusion models trained on large-scale datasets have achieved remarkable\nprogress in image synthesis. However, due to the randomness in the diffusion\nprocess, they often struggle with handling diverse low-level tasks that require\ndetails preservation. To overcome this limitation, we present a new Diff-Plugin\nframework to enable a single pre-trained diffusion model to generate\nhigh-fidelity results across a variety of low-level tasks. Specifically, we\nfirst propose a lightweight Task-Plugin module with a dual branch design to\nprovide task-specific priors, guiding the diffusion process in preserving image\ncontent. We then propose a Plugin-Selector that can automatically select\ndifferent Task-Plugins based on the text instruction, allowing users to edit\nimages by indicating multiple low-level tasks with natural language. We conduct\nextensive experiments on 8 low-level vision tasks. The results demonstrate the\nsuperiority of Diff-Plugin over existing methods, particularly in real-world\nscenarios. Our ablations further validate that Diff-Plugin is stable,\nschedulable, and supports robust training across different dataset sizes.\n","authors":["Yuhao Liu","Zhanghan Ke","Fang Liu","Nanxuan Zhao","Rynson W. H. Lau"],"pdf_url":"https://arxiv.org/pdf/2403.00644v3.pdf","comment":"Accepted to CVPR2024. Replaced some celebrity images to avoid\n copyright disputes"},{"id":"http://arxiv.org/abs/2404.06351v2","updated":"2024-04-11T05:17:44Z","published":"2024-04-09T14:42:31Z","title":"HPNet: Dynamic Trajectory Forecasting with Historical Prediction\n Attention","summary":" Predicting the trajectories of road agents is essential for autonomous\ndriving systems. The recent mainstream methods follow a static paradigm, which\npredicts the future trajectory by using a fixed duration of historical frames.\nThese methods make the predictions independently even at adjacent time steps,\nwhich leads to potential instability and temporal inconsistency. As successive\ntime steps have largely overlapping historical frames, their forecasting should\nhave intrinsic correlation, such as overlapping predicted trajectories should\nbe consistent, or be different but share the same motion goal depending on the\nroad situation. 
Motivated by this, in this work, we introduce HPNet, a novel\ndynamic trajectory forecasting method. Aiming for stable and accurate\ntrajectory forecasting, our method leverages not only historical frames\nincluding maps and agent states, but also historical predictions. Specifically,\nwe newly design a Historical Prediction Attention module to automatically\nencode the dynamic relationship between successive predictions. Besides, it\nalso extends the attention range beyond the currently visible window,\nbenefitting from the use of historical predictions. The proposed Historical\nPrediction Attention together with the Agent Attention and Mode Attention is\nfurther formulated as the Triple Factorized Attention module, serving as the\ncore design of HPNet. Experiments on the Argoverse and INTERACTION datasets show\nthat HPNet achieves state-of-the-art performance, and generates accurate and\nstable future trajectories. Our code is available at\nhttps://github.com/XiaolongTang23/HPNet.\n","authors":["Xiaolong Tang","Meina Kan","Shiguang Shan","Zhilong Ji","Jinfeng Bai","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06351v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.00511v3","updated":"2024-04-11T05:14:35Z","published":"2024-03-31T01:16:02Z","title":"MIPS at SemEval-2024 Task 3: Multimodal Emotion-Cause Pair Extraction in\n Conversations with Multimodal Language Models","summary":" This paper presents our winning submission to Subtask 2 of SemEval 2024 Task\n3 on multimodal emotion cause analysis in conversations. We propose a novel\nMultimodal Emotion Recognition and Multimodal Emotion Cause Extraction\n(MER-MCE) framework that integrates text, audio, and visual modalities using\nspecialized emotion encoders. Our approach sets itself apart from\ntop-performing teams by leveraging modality-specific features for enhanced\nemotion understanding and causality inference. Experimental evaluation\ndemonstrates the advantages of our multimodal approach, with our submission\nachieving a competitive weighted F1 score of 0.3435, ranking third with a\nmargin of only 0.0339 behind the 1st team and 0.0025 behind the 2nd team.\nProject: https://github.com/MIPS-COLT/MER-MCE.git\n","authors":["Zebang Cheng","Fuqiang Niu","Yuxiang Lin","Zhi-Qi Cheng","Bowen Zhang","Xiaojiang Peng"],"pdf_url":"https://arxiv.org/pdf/2404.00511v3.pdf","comment":"Ranked 3rd in SemEval '24 Task 3 with F1 of 0.3435, close to 1st &\n 2nd by 0.0339 & 0.0025"},{"id":"http://arxiv.org/abs/2404.07474v1","updated":"2024-04-11T04:58:18Z","published":"2024-04-11T04:58:18Z","title":"G-NeRF: Geometry-enhanced Novel View Synthesis from Single-View Images","summary":" Novel view synthesis aims to generate new view images of a given view image\ncollection. Recent attempts address this problem relying on 3D geometry priors\n(e.g., shapes, sizes, and positions) learned from multi-view images. However,\nsuch methods encounter the following limitations: 1) they require a set of\nmulti-view images as training data for a specific scene (e.g., face, car or\nchair), which is often unavailable in many real-world scenarios; 2) they fail\nto extract the geometry priors from single-view images due to the lack of\nmulti-view supervision. In this paper, we propose a Geometry-enhanced NeRF\n(G-NeRF), which seeks to enhance the geometry priors by a geometry-guided\nmulti-view synthesis approach, followed by a depth-aware training. 
In the\nsynthesis process, inspired that existing 3D GAN models can unconditionally\nsynthesize high-fidelity multi-view images, we seek to adopt off-the-shelf 3D\nGAN models, such as EG3D, as a free source to provide geometry priors through\nsynthesizing multi-view data. Simultaneously, to further improve the geometry\nquality of the synthetic data, we introduce a truncation method to effectively\nsample latent codes within 3D GAN models. To tackle the absence of multi-view\nsupervision for single-view images, we design the depth-aware training\napproach, incorporating a depth-aware discriminator to guide geometry priors\nthrough depth maps. Experiments demonstrate the effectiveness of our method in\nterms of both qualitative and quantitative results.\n","authors":["Zixiong Huang","Qi Chen","Libo Sun","Yifan Yang","Naizhou Wang","Mingkui Tan","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2404.07474v1.pdf","comment":"CVPR 2024 Accepted Paper"},{"id":"http://arxiv.org/abs/2404.07473v1","updated":"2024-04-11T04:54:42Z","published":"2024-04-11T04:54:42Z","title":"LUCF-Net: Lightweight U-shaped Cascade Fusion Network for Medical Image\n Segmentation","summary":" In this study, the performance of existing U-shaped neural network\narchitectures was enhanced for medical image segmentation by adding\nTransformer. Although Transformer architectures are powerful at extracting\nglobal information, its ability to capture local information is limited due to\nits high complexity. To address this challenge, we proposed a new lightweight\nU-shaped cascade fusion network (LUCF-Net) for medical image segmentation. It\nutilized an asymmetrical structural design and incorporated both local and\nglobal modules to enhance its capacity for local and global modeling.\nAdditionally, a multi-layer cascade fusion decoding network was designed to\nfurther bolster the network's information fusion capabilities. Validation\nresults achieved on multi-organ datasets in CT format, cardiac segmentation\ndatasets in MRI format, and dermatology datasets in image format demonstrated\nthat the proposed model outperformed other state-of-the-art methods in handling\nlocal-global information, achieving an improvement of 1.54% in Dice coefficient\nand 2.6 mm in Hausdorff distance on multi-organ segmentation. Furthermore, as a\nnetwork that combines Convolutional Neural Network and Transformer\narchitectures, it achieves competitive segmentation performance with only 6.93\nmillion parameters and 6.6 gigabytes of floating point operations, without the\nneed of pre-training. In summary, the proposed method demonstrated enhanced\nperformance while retaining a simpler model design compared to other\nTransformer-based segmentation networks.\n","authors":["Songkai Sun","Qingshan She","Yuliang Ma","Rihui Li","Yingchun Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.07473v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06845v2","updated":"2024-04-11T04:17:13Z","published":"2024-03-11T16:03:35Z","title":"DriveDreamer-2: LLM-Enhanced World Models for Diverse Driving Video\n Generation","summary":" World models have demonstrated superiority in autonomous driving,\nparticularly in the generation of multi-view driving videos. However,\nsignificant challenges still exist in generating customized driving videos. In\nthis paper, we propose DriveDreamer-2, which builds upon the framework of\nDriveDreamer and incorporates a Large Language Model (LLM) to generate\nuser-defined driving videos. 
Specifically, an LLM interface is initially\nincorporated to convert a user's query into agent trajectories. Subsequently, a\nHDMap, adhering to traffic regulations, is generated based on the trajectories.\nUltimately, we propose the Unified Multi-View Model to enhance temporal and\nspatial coherence in the generated driving videos. DriveDreamer-2 is the first\nworld model to generate customized driving videos, it can generate uncommon\ndriving videos (e.g., vehicles abruptly cut in) in a user-friendly manner.\nBesides, experimental results demonstrate that the generated videos enhance the\ntraining of driving perception methods (e.g., 3D detection and tracking).\nFurthermore, video generation quality of DriveDreamer-2 surpasses other\nstate-of-the-art methods, showcasing FID and FVD scores of 11.2 and 55.7,\nrepresenting relative improvements of 30% and 50%.\n","authors":["Guosheng Zhao","Xiaofeng Wang","Zheng Zhu","Xinze Chen","Guan Huang","Xiaoyi Bao","Xingang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.06845v2.pdf","comment":"Project Page: https://drivedreamer2.github.io"},{"id":"http://arxiv.org/abs/2404.07467v1","updated":"2024-04-11T04:14:48Z","published":"2024-04-11T04:14:48Z","title":"Trashbusters: Deep Learning Approach for Litter Detection and Tracking","summary":" The illegal disposal of trash is a major public health and environmental\nconcern. Disposing of trash in unplanned places poses serious health and\nenvironmental risks. We should try to restrict public trash cans as much as\npossible. This research focuses on automating the penalization of litterbugs,\naddressing the persistent problem of littering in public places. Traditional\napproaches relying on manual intervention and witness reporting suffer from\ndelays, inaccuracies, and anonymity issues. To overcome these challenges, this\npaper proposes a fully automated system that utilizes surveillance cameras and\nadvanced computer vision algorithms for litter detection, object tracking, and\nface recognition. The system accurately identifies and tracks individuals\nengaged in littering activities, attaches their identities through face\nrecognition, and enables efficient enforcement of anti-littering policies. By\nreducing reliance on manual intervention, minimizing human error, and providing\nprompt identification, the proposed system offers significant advantages in\naddressing littering incidents. The primary contribution of this research lies\nin the implementation of the proposed system, leveraging advanced technologies\nto enhance surveillance operations and automate the penalization of litterbugs.\n","authors":["Kashish Jain","Manthan Juthani","Jash Jain","Anant V. Nimkar"],"pdf_url":"https://arxiv.org/pdf/2404.07467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10974v4","updated":"2024-04-11T04:14:33Z","published":"2023-07-20T16:00:19Z","title":"Deep Multi-Threshold Spiking-UNet for Image Processing","summary":" U-Net, known for its simple yet efficient architecture, is widely utilized\nfor image processing tasks and is particularly suitable for deployment on\nneuromorphic chips. This paper introduces the novel concept of Spiking-UNet for\nimage processing, which combines the power of Spiking Neural Networks (SNNs)\nwith the U-Net architecture. To achieve an efficient Spiking-UNet, we face two\nprimary challenges: ensuring high-fidelity information propagation through the\nnetwork via spikes and formulating an effective training strategy. 
To address\nthe issue of information loss, we introduce multi-threshold spiking neurons,\nwhich improve the efficiency of information transmission within the\nSpiking-UNet. For the training strategy, we adopt a conversion and fine-tuning\npipeline that leverage pre-trained U-Net models. During the conversion process,\nsignificant variability in data distribution across different parts is observed\nwhen utilizing skip connections. Therefore, we propose a connection-wise\nnormalization method to prevent inaccurate firing rates. Furthermore, we adopt\na flow-based training method to fine-tune the converted models, reducing time\nsteps while preserving performance. Experimental results show that, on image\nsegmentation and denoising, our Spiking-UNet achieves comparable performance to\nits non-spiking counterpart, surpassing existing SNN methods. Compared with the\nconverted Spiking-UNet without fine-tuning, our Spiking-UNet reduces inference\ntime by approximately 90\\%. This research broadens the application scope of\nSNNs in image processing and is expected to inspire further exploration in the\nfield of neuromorphic engineering. The code for our Spiking-UNet implementation\nis available at https://github.com/SNNresearch/Spiking-UNet.\n","authors":["Hebei Li","Yueyi Zhang","Zhiwei Xiong","Xiaoyan Sun"],"pdf_url":"https://arxiv.org/pdf/2307.10974v4.pdf","comment":"Accepted in NeuroComputing"},{"id":"http://arxiv.org/abs/2402.16994v2","updated":"2024-04-11T03:44:49Z","published":"2024-02-26T20:00:57Z","title":"GEM3D: GEnerative Medial Abstractions for 3D Shape Synthesis","summary":" We introduce GEM3D -- a new deep, topology-aware generative model of 3D\nshapes. The key ingredient of our method is a neural skeleton-based\nrepresentation encoding information on both shape topology and geometry.\nThrough a denoising diffusion probabilistic model, our method first generates\nskeleton-based representations following the Medial Axis Transform (MAT), then\ngenerates surfaces through a skeleton-driven neural implicit formulation. The\nneural implicit takes into account the topological and geometric information\nstored in the generated skeleton representations to yield surfaces that are\nmore topologically and geometrically accurate compared to previous neural field\nformulations. We discuss applications of our method in shape synthesis and\npoint cloud reconstruction tasks, and evaluate our method both qualitatively\nand quantitatively. We demonstrate significantly more faithful surface\nreconstruction and diverse shape generation results compared to the\nstate-of-the-art, also involving challenging scenarios of reconstructing and\nsynthesizing structurally complex, high-genus shape surfaces from Thingi10K and\nShapeNet.\n","authors":["Dmitry Petrov","Pradyumn Goyal","Vikas Thamizharasan","Vladimir G. Kim","Matheus Gadelha","Melinos Averkiou","Siddhartha Chaudhuri","Evangelos Kalogerakis"],"pdf_url":"https://arxiv.org/pdf/2402.16994v2.pdf","comment":"Webpage: https://lodurality.github.io/GEM3D/ -- Cond. accept. to\n SIGGRAPH 2024 (conf. track) -- Changes (based on reviews): changed style to\n sigconf; rearranged figures for readability; added missing citations; fixed\n misaligned centers in Fig. 3; added failure cases (Fig. 10); rewrote\n discussion; added categories averages to Tab. 8; added Tab. 
10 with model\n capacities"},{"id":"http://arxiv.org/abs/2404.07449v1","updated":"2024-04-11T03:09:34Z","published":"2024-04-11T03:09:34Z","title":"Learning to Localize Objects Improves Spatial Reasoning in Visual-LLMs","summary":" Integration of Large Language Models (LLMs) into visual domain tasks,\nresulting in visual-LLMs (V-LLMs), has enabled exceptional performance in\nvision-language tasks, particularly for visual question answering (VQA).\nHowever, existing V-LLMs (e.g. BLIP-2, LLaVA) demonstrate weak spatial\nreasoning and localization awareness. Despite generating highly descriptive and\nelaborate textual answers, these models fail at simple tasks like\ndistinguishing a left vs right location. In this work, we explore how\nimage-space coordinate based instruction fine-tuning objectives could inject\nspatial awareness into V-LLMs. We discover optimal coordinate representations,\ndata-efficient instruction fine-tuning objectives, and pseudo-data generation\nstrategies that lead to improved spatial awareness in V-LLMs. Additionally, our\nresulting model improves VQA across image and video domains, reduces undesired\nhallucination, and generates better contextual object descriptions. Experiments\nacross 5 vision-language tasks involving 14 different datasets establish the\nclear performance improvements achieved by our proposed framework.\n","authors":["Kanchana Ranasinghe","Satya Narayan Shukla","Omid Poursaeed","Michael S. Ryoo","Tsung-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2404.07449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07448v1","updated":"2024-04-11T03:08:53Z","published":"2024-04-11T03:08:53Z","title":"Transferable and Principled Efficiency for Open-Vocabulary Segmentation","summary":" Recent success of pre-trained foundation vision-language models makes\nOpen-Vocabulary Segmentation (OVS) possible. Despite the promising performance,\nthis approach introduces heavy computational overheads for two challenges: 1)\nlarge model sizes of the backbone; 2) expensive costs during the fine-tuning.\nThese challenges hinder this OVS strategy from being widely applicable and\naffordable in real-world scenarios. Although traditional methods such as model\ncompression and efficient fine-tuning can address these challenges, they often\nrely on heuristics. This means that their solutions cannot be easily\ntransferred and necessitate re-training on different models, which comes at a\ncost. In the context of efficient OVS, we target achieving performance that is\ncomparable to or even better than prior OVS works based on large\nvision-language foundation models, by utilizing smaller models that incur lower\ntraining costs. The core strategy is to make our efficiency principled and thus\nseamlessly transferable from one OVS framework to others without further\ncustomization. Comprehensive experiments on diverse OVS benchmarks demonstrate\nour superior trade-off between segmentation accuracy and computation costs over\nprevious works. 
Our code is available on https://github.com/Xujxyang/OpenTrans\n","authors":["Jingxuan Xu","Wuyang Chen","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2404.07448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16923v2","updated":"2024-04-11T03:01:41Z","published":"2024-01-30T11:46:27Z","title":"Fourier Prompt Tuning for Modality-Incomplete Scene Segmentation","summary":" Integrating information from multiple modalities enhances the robustness of\nscene perception systems in autonomous vehicles, providing a more comprehensive\nand reliable sensory framework. However, the modality incompleteness in\nmulti-modal segmentation remains under-explored. In this work, we establish a\ntask called Modality-Incomplete Scene Segmentation (MISS), which encompasses\nboth system-level modality absence and sensor-level modality errors. To avoid\nthe predominant modality reliance in multi-modal fusion, we introduce a\nMissing-aware Modal Switch (MMS) strategy to proactively manage missing\nmodalities during training. Utilizing bit-level batch-wise sampling enhances\nthe model's performance in both complete and incomplete testing scenarios.\nFurthermore, we introduce the Fourier Prompt Tuning (FPT) method to incorporate\nrepresentative spectral information into a limited number of learnable prompts\nthat maintain robustness against all MISS scenarios. Akin to fine-tuning\neffects but with fewer tunable parameters (1.1%). Extensive experiments prove\nthe efficacy of our proposed approach, showcasing an improvement of 5.84% mIoU\nover the prior state-of-the-art parameter-efficient methods in modality\nmissing. The source code is publicly available at\nhttps://github.com/RuipingL/MISS.\n","authors":["Ruiping Liu","Jiaming Zhang","Kunyu Peng","Yufan Chen","Ke Cao","Junwei Zheng","M. Saquib Sarfraz","Kailun Yang","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2401.16923v2.pdf","comment":"Accepted to IEEE IV 2024. The source code is publicly available at\n https://github.com/RuipingL/MISS"},{"id":"http://arxiv.org/abs/2404.07445v1","updated":"2024-04-11T03:00:00Z","published":"2024-04-11T03:00:00Z","title":"Multi-view Aggregation Network for Dichotomous Image Segmentation","summary":" Dichotomous Image Segmentation (DIS) has recently emerged towards\nhigh-precision object segmentation from high-resolution natural images.\n When designing an effective DIS model, the main challenge is how to balance\nthe semantic dispersion of high-resolution targets in the small receptive field\nand the loss of high-precision details in the large receptive field. Existing\nmethods rely on tedious multiple encoder-decoder streams and stages to\ngradually complete the global localization and local refinement.\n Human visual system captures regions of interest by observing them from\nmultiple views. Inspired by it, we model DIS as a multi-view object perception\nproblem and provide a parsimonious multi-view aggregation network (MVANet),\nwhich unifies the feature fusion of the distant view and close-up view into a\nsingle stream with one encoder-decoder structure. 
With the help of the proposed\nmulti-view complementary localization and refinement modules, our approach\nestablished long-range, profound visual interactions across multiple views,\nallowing the features of the detailed close-up view to focus on highly slender\nstructures.Experiments on the popular DIS-5K dataset show that our MVANet\nsignificantly outperforms state-of-the-art methods in both accuracy and speed.\nThe source code and datasets will be publicly available at\n\\href{https://github.com/qianyu-dlut/MVANet}{MVANet}.\n","authors":["Qian Yu","Xiaoqi Zhao","Youwei Pang","Lihe Zhang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2404.07445v1.pdf","comment":"Accepted by CVPR2024 as Highlight"},{"id":"http://arxiv.org/abs/2310.14576v2","updated":"2024-04-11T02:57:21Z","published":"2023-10-23T05:25:49Z","title":"Tensor Decomposition Based Attention Module for Spiking Neural Networks","summary":" The attention mechanism has been proven to be an effective way to improve\nspiking neural network (SNN). However, based on the fact that the current SNN\ninput data flow is split into tensors to process on GPUs, none of the previous\nworks consider the properties of tensors to implement an attention module. This\ninspires us to rethink current SNN from the perspective of tensor-relevant\ntheories. Using tensor decomposition, we design the \\textit{projected full\nattention} (PFA) module, which demonstrates excellent results with linearly\ngrowing parameters. Specifically, PFA is composed by the \\textit{linear\nprojection of spike tensor} (LPST) module and \\textit{attention map composing}\n(AMC) module. In LPST, we start by compressing the original spike tensor into\nthree projected tensors using a single property-preserving strategy with\nlearnable parameters for each dimension. Then, in AMC, we exploit the inverse\nprocedure of the tensor decomposition process to combine the three tensors into\nthe attention map using a so-called connecting factor. To validate the\neffectiveness of the proposed PFA module, we integrate it into the widely used\nVGG and ResNet architectures for classification tasks. Our method achieves\nstate-of-the-art performance on both static and dynamic benchmark datasets,\nsurpassing the existing SNN models with Transformer-based and CNN-based\nbackbones.\n","authors":["Haoyu Deng","Ruijie Zhu","Xuerui Qiu","Yule Duan","Malu Zhang","Liangjian Deng"],"pdf_url":"https://arxiv.org/pdf/2310.14576v2.pdf","comment":"Accepted by Knowledge-Based Systems"},{"id":"http://arxiv.org/abs/2403.17920v2","updated":"2024-04-11T02:42:59Z","published":"2024-03-26T17:55:11Z","title":"TC4D: Trajectory-Conditioned Text-to-4D Generation","summary":" Recent techniques for text-to-4D generation synthesize dynamic 3D scenes\nusing supervision from pre-trained text-to-video models. However, existing\nrepresentations for motion, such as deformation models or time-dependent neural\nrepresentations, are limited in the amount of motion they can generate-they\ncannot synthesize motion extending far beyond the bounding box used for volume\nrendering. The lack of a more flexible motion model contributes to the gap in\nrealism between 4D generation methods and recent, near-photorealistic video\ngeneration models. Here, we propose TC4D: trajectory-conditioned text-to-4D\ngeneration, which factors motion into global and local components. We represent\nthe global motion of a scene's bounding box using rigid transformation along a\ntrajectory parameterized by a spline. 
We learn local deformations that conform\nto the global trajectory using supervision from a text-to-video model. Our\napproach enables the synthesis of scenes animated along arbitrary trajectories,\ncompositional scene generation, and significant improvements to the realism and\namount of generated motion, which we evaluate qualitatively and through a user\nstudy. Video results can be viewed on our website:\nhttps://sherwinbahmani.github.io/tc4d.\n","authors":["Sherwin Bahmani","Xian Liu","Yifan Wang","Ivan Skorokhodov","Victor Rong","Ziwei Liu","Xihui Liu","Jeong Joon Park","Sergey Tulyakov","Gordon Wetzstein","Andrea Tagliasacchi","David B. Lindell"],"pdf_url":"https://arxiv.org/pdf/2403.17920v2.pdf","comment":"Project Page: https://sherwinbahmani.github.io/tc4d"},{"id":"http://arxiv.org/abs/2404.07435v1","updated":"2024-04-11T02:29:08Z","published":"2024-04-11T02:29:08Z","title":"Encoding Urban Ecologies: Automated Building Archetype Generation\n through Self-Supervised Learning for Energy Modeling","summary":" As the global population and urbanization expand, the building sector has\nemerged as the predominant energy consumer and carbon emission contributor. The\nneed for innovative Urban Building Energy Modeling grows, yet existing building\narchetypes often fail to capture the unique attributes of local buildings and\nthe nuanced distinctions between different cities, jeopardizing the precision\nof energy modeling. This paper presents an alternative tool employing\nself-supervised learning to distill complex geometric data into representative,\nlocale-specific archetypes. This study attempts to foster a new paradigm of\ninteraction with built environments, incorporating local parameters to conduct\nbespoke energy simulations at the community level. The catered archetypes can\naugment the precision and applicability of energy consumption modeling at\ndifferent scales across diverse building inventories. This tool provides a\npotential solution that encourages the exploration of emerging local ecologies.\nBy integrating building envelope characteristics and cultural granularity into\nthe building archetype generation process, we seek a future where architecture\nand urban design are intricately interwoven with the energy sector in shaping\nour built environments.\n","authors":["Xinwei Zhuang","Zixun Huang","Wentao Zeng","Luisa Caldas"],"pdf_url":"https://arxiv.org/pdf/2404.07435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10853v4","updated":"2024-04-11T01:56:38Z","published":"2023-07-20T13:16:10Z","title":"Exploring Effective Priors and Efficient Models for Weakly-Supervised\n Change Detection","summary":" Weakly-supervised change detection (WSCD) aims to detect pixel-level changes\nwith only image-level annotations. Owing to its label efficiency, WSCD is\ndrawing increasing attention recently. However, current WSCD methods often\nencounter the challenge of change missing and fabricating, i.e., the\ninconsistency between image-level annotations and pixel-level predictions.\nSpecifically, change missing refer to the situation that the WSCD model fails\nto predict any changed pixels, even though the image-level label indicates\nchanged, and vice versa for change fabricating. 
To address this challenge, in\nthis work, we leverage global-scale and local-scale priors in WSCD and propose\ntwo components: a Dilated Prior (DP) decoder and a Label Gated (LG) constraint.\nThe DP decoder decodes samples with the changed image-level label, skips\nsamples with the unchanged label, and replaces them with an all-unchanged\npixel-level label. The LG constraint is derived from the correspondence between\nchanged representations and image-level labels, penalizing the model when it\nmispredicts the change status. Additionally, we develop TransWCD, a simple yet\npowerful transformer-based model, showcasing the potential of weakly-supervised\nlearning in change detection. By integrating the DP decoder and LG constraint\ninto TransWCD, we form TransWCD-DL. Our proposed TransWCD and TransWCD-DL\nachieve significant +6.33% and +9.55% F1 score improvements over the\nstate-of-the-art methods on the WHU-CD dataset, respectively. Some performance\nmetrics even exceed several fully-supervised change detection (FSCD)\ncompetitors. Code will be available at\nhttps://github.com/zhenghuizhao/TransWCD.\n","authors":["Zhenghui Zhao","Lixiang Ru","Chen Wu"],"pdf_url":"https://arxiv.org/pdf/2307.10853v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07424v1","updated":"2024-04-11T01:33:45Z","published":"2024-04-11T01:33:45Z","title":"CopilotCAD: Empowering Radiologists with Report Completion Models and\n Quantitative Evidence from Medical Image Foundation Models","summary":" Computer-aided diagnosis systems hold great promise to aid radiologists and\nclinicians in radiological clinical practice and enhance diagnostic accuracy\nand efficiency. However, the conventional systems primarily focus on delivering\ndiagnostic results through text report generation or medical image\nclassification, positioning them as standalone decision-makers rather than\nhelpers and ignoring radiologists' expertise. This study introduces an\ninnovative paradigm to create an assistive co-pilot system for empowering\nradiologists by leveraging Large Language Models (LLMs) and medical image\nanalysis tools. Specifically, we develop a collaborative framework to integrate\nLLMs and quantitative medical image analysis results generated by foundation\nmodels with radiologists in the loop, achieving efficient and safe generation\nof radiology reports and effective utilization of computational power of AI and\nthe expertise of medical professionals. This approach empowers radiologists to\ngenerate more precise and detailed diagnostic reports, enhancing patient\noutcomes while reducing the burnout of clinicians. Our methodology underscores\nthe potential of AI as a supportive tool in medical diagnostics, promoting a\nharmonious integration of technology and human expertise to advance the field\nof radiology.\n","authors":["Sheng Wang","Tianming Du","Katherine Fischer","Gregory E Tasian","Justin Ziemba","Joanie M Garratt","Hersh Sagreiya","Yong Fan"],"pdf_url":"https://arxiv.org/pdf/2404.07424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07410v1","updated":"2024-04-11T00:49:38Z","published":"2024-04-11T00:49:38Z","title":"Improving Shift Invariance in Convolutional Neural Networks with\n Translation Invariant Polyphase Sampling","summary":" Downsampling operators break the shift invariance of convolutional neural\nnetworks (CNNs) and this affects the robustness of features learned by CNNs\nwhen dealing with even small pixel-level shift. 
Through a large-scale\ncorrelation analysis framework, we study shift invariance of CNNs by inspecting\nexisting downsampling operators in terms of their maximum-sampling bias (MSB),\nand find that MSB is negatively correlated with shift invariance. Based on this\ncrucial insight, we propose a learnable pooling operator called Translation\nInvariant Polyphase Sampling (TIPS) and two regularizations on the intermediate\nfeature maps of TIPS to reduce MSB and learn translation-invariant\nrepresentations. TIPS can be integrated into any CNN and can be trained\nend-to-end with marginal computational overhead. Our experiments demonstrate\nthat TIPS results in consistent performance gains in terms of accuracy, shift\nconsistency, and shift fidelity on multiple benchmarks for image classification\nand semantic segmentation compared to previous methods and also leads to\nimprovements in adversarial and distributional robustness. TIPS results in the\nlowest MSB compared to all previous methods, thus explaining our strong\nempirical results.\n","authors":["Sourajit Saha","Tejas Gokhale"],"pdf_url":"https://arxiv.org/pdf/2404.07410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07405v1","updated":"2024-04-11T00:45:10Z","published":"2024-04-11T00:45:10Z","title":"Simplifying Two-Stage Detectors for On-Device Inference in Remote\n Sensing","summary":" Deep learning has been successfully applied to object detection from remotely\nsensed images. Images are typically processed on the ground rather than\non-board due to the computation power of the ground system. Such offloaded\nprocessing causes delays in acquiring target mission information, which hinders\nits application to real-time use cases. For on-device object detection,\nresearch has been conducted on designing efficient detectors or model\ncompression to reduce inference latency. However, highly accurate two-stage\ndetectors still need further exploitation for acceleration. In this paper, we\npropose a model simplification method for two-stage object detectors. Instead\nof constructing a general feature pyramid, we utilize only one feature\nextraction in the two-stage detector. To compensate for the accuracy drop, we\napply a high-pass filter to the RPN's score map. Our approach is applicable to\nany two-stage detector using a feature pyramid network. In the experiments with\nstate-of-the-art two-stage detectors such as ReDet, Oriented-RCNN, and LSKNet,\nour method reduced computation costs by up to 61.2% with an accuracy loss within\n2.1% on the DOTAv1.5 dataset. Source code will be released.\n","authors":["Jaemin Kang","Hoeseok Yang","Hyungshin Kim"],"pdf_url":"https://arxiv.org/pdf/2404.07405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10300v4","updated":"2024-04-11T00:35:04Z","published":"2023-05-17T15:37:47Z","title":"One-Prompt to Segment All Medical Images","summary":" Large foundation models, known for their strong zero-shot generalization,\nhave excelled in visual and language applications. However, applying them to\nmedical image segmentation, a domain with diverse imaging types and target\nlabels, remains an open challenge. Current approaches, such as adapting\ninteractive segmentation models like Segment Anything Model (SAM), require user\nprompts for each sample during inference. Alternatively, transfer learning\nmethods like few/one-shot models demand labeled samples, leading to high costs.\nThis paper introduces a new paradigm toward universal medical image\nsegmentation, termed 'One-Prompt Segmentation.' 
One-Prompt Segmentation\ncombines the strengths of one-shot and interactive methods. In the inference\nstage, with just \\textbf{one prompted sample}, it can adeptly handle the unseen\ntask in a single forward pass. We train One-Prompt Model on 64 open-source\nmedical datasets, accompanied by the collection of over 3,000 clinician-labeled\nprompts. Tested on 14 previously unseen datasets, the One-Prompt Model\nshowcases superior zero-shot segmentation capabilities, outperforming a wide\nrange of related methods. The code and data is released as\n\\url{https://github.com/KidsWithTokens/one-prompt}.\n","authors":["Junde Wu","Jiayuan Zhu","Yuanpei Liu","Yueming Jin","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2305.10300v4.pdf","comment":"arXiv admin note: text overlap with arXiv:2304.12620"},{"id":"http://arxiv.org/abs/2404.07399v1","updated":"2024-04-11T00:23:28Z","published":"2024-04-11T00:23:28Z","title":"Post-hurricane building damage assessment using street-view imagery and\n structured data: A multi-modal deep learning approach","summary":" Accurately assessing building damage is critical for disaster response and\nrecovery. However, many existing models for detecting building damage have poor\nprediction accuracy due to their limited capabilities of identifying detailed,\ncomprehensive structural and/or non-structural damage from the street-view\nimage. Additionally, these models mainly rely on the imagery data for damage\nclassification, failing to account for other critical information, such as wind\nspeed, building characteristics, evacuation zones, and distance of the building\nto the hurricane track. To address these limitations, in this study, we propose\na novel multi-modal (i.e., imagery and structured data) approach for\npost-hurricane building damage classification, named the Multi-Modal Swin\nTransformer (MMST). We empirically train and evaluate the proposed MMST using\ndata collected from the 2022 Hurricane Ian in Florida, USA. Results show that\nMMST outperforms all selected state-of-the-art benchmark models and can achieve\nan accuracy of 92.67%, which are 7.71% improvement in accuracy compared to\nVisual Geometry Group 16 (VGG-16). In addition to the street-view imagery data,\nbuilding value, building age, and wind speed are the most important predictors\nfor damage level classification. The proposed MMST can be deployed to assist in\nrapid damage assessment and guide reconnaissance efforts in future hurricanes.\n","authors":["Zhuoqun Xue","Xiaojian Zhang","David O. Prevatt","Jennifer Bridge","Susu Xu","Xilei Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.07399v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07395v1","updated":"2024-04-11T00:02:57Z","published":"2024-04-11T00:02:57Z","title":"Global versus Local: Evaluating AlexNet Architectures for Tropical\n Cyclone Intensity Estimation","summary":" Given the destructive impacts of tropical cyclones, it is critical to have a\nreliable system for cyclone intensity detection. Various techniques are\navailable for this purpose, each with differing levels of accuracy. In this\npaper, we introduce two ensemble-based models based on AlexNet architecture to\nestimate tropical cyclone intensity using visible satellite images. The first\nmodel, trained on the entire dataset, is called the global AlexNet model. 
The\nsecond model is a distributed version of AlexNet in which multiple AlexNets are\ntrained separately on subsets of the training data categorized according to the\nSaffir-Simpson wind speed scale prescribed by the meteorologists. We evaluated\nthe performance of both models against a deep learning benchmark model called\n\textit{Deepti} using a publicly available cyclone image dataset. Results\nindicate that both the global model (with a root mean square error (RMSE) of\n9.03 knots) and the distributed model (with an RMSE of 9.3 knots) outperform the\nbenchmark model (with an RMSE of 13.62 knots). We provide a thorough discussion\nof our solution approach, including an explanation of the AlexNet's\nperformance using gradient class activation maps (grad-CAM). Our proposed\nsolution strategy allows future experimentation with various deep learning\nmodels in both single and multi-channel settings.\n","authors":["Vikas Dwivedi"],"pdf_url":"https://arxiv.org/pdf/2404.07395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04526v2","updated":"2024-04-11T23:50:32Z","published":"2023-08-08T18:41:38Z","title":"Large-Scale Multi-Hypotheses Cell Tracking Using Ultrametric Contours\n Maps","summary":" In this work, we describe a method for large-scale 3D cell-tracking through a\nsegmentation selection approach. The proposed method is effective at tracking\ncells across large microscopy datasets on two fronts: (i) It can solve problems\ncontaining millions of segmentation instances in terabyte-scale 3D+t datasets;\n(ii) It achieves competitive results with or without deep learning, which\nrequires 3D annotated data that is scarce in the fluorescence microscopy\nfield. The proposed method computes cell tracks and segments using a hierarchy\nof segmentation hypotheses and selects disjoint segments by maximizing the\noverlap between adjacent frames. We show that this method achieves\nstate-of-the-art results in 3D images from the cell tracking challenge and has\na faster integer linear programming formulation. Moreover, our framework is\nflexible and supports segmentations from off-the-shelf cell segmentation models\nand can combine them into an ensemble that improves tracking. The code is\navailable at https://github.com/royerlab/ultrack.\n","authors":["Jordão Bragantini","Merlin Lange","Loïc Royer"],"pdf_url":"https://arxiv.org/pdf/2308.04526v2.pdf","comment":"13 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2403.16400v2","updated":"2024-04-11T23:38:06Z","published":"2024-03-25T03:30:37Z","title":"ASDF: Assembly State Detection Utilizing Late Fusion by Integrating 6D\n Pose Estimation","summary":" In medical and industrial domains, providing guidance for assembly processes\nis critical to ensure efficiency and safety. Errors in assembly can lead to\nsignificant consequences such as extended surgery times, and prolonged\nmanufacturing or maintenance times in industry. Assembly scenarios can benefit\nfrom in-situ AR visualization to provide guidance, reduce assembly times and\nminimize errors. To enable in-situ visualization, 6D pose estimation can be\nleveraged. Existing 6D pose estimation techniques primarily focus on individual\nobjects and static captures. However, assembly scenarios have various dynamics\nincluding occlusion during assembly and dynamics in the assembly objects\nappearance. 
Existing work, combining object detection/6D pose estimation and\nassembly state detection focuses either on pure deep learning-based approaches,\nor limit the assembly state detection to building blocks. To address the\nchallenges of 6D pose estimation in combination with assembly state detection,\nour approach ASDF builds upon the strengths of YOLOv8, a real-time capable\nobject detection framework. We extend this framework, refine the object pose\nand fuse pose knowledge with network-detected pose information. Utilizing our\nlate fusion in our Pose2State module results in refined 6D pose estimation and\nassembly state detection. By combining both pose and state information, our\nPose2State module predicts the final assembly state with precision. Our\nevaluation on our ASDF dataset shows that our Pose2State module leads to an\nimproved assembly state detection and that the improvement of the assembly\nstate further leads to a more robust 6D pose estimation. Moreover, on the GBOT\ndataset, we outperform the pure deep learning-based network, and even\noutperform the hybrid and pure tracking-based approaches.\n","authors":["Hannah Schieber","Shiyu Li","Niklas Corell","Philipp Beckerle","Julian Kreimeier","Daniel Roth"],"pdf_url":"https://arxiv.org/pdf/2403.16400v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01492v2","updated":"2024-04-11T23:09:25Z","published":"2024-04-01T21:28:50Z","title":"Modality Translation for Object Detection Adaptation Without Forgetting\n Prior Knowledge","summary":" A common practice in deep learning consists of training large neural networks\non massive datasets to perform accurately for different domains and tasks.\nWhile this methodology may work well in numerous application areas, it only\napplies across modalities due to a larger distribution shift in data captured\nusing different sensors. This paper focuses on the problem of adapting a large\nobject detection model to one or multiple modalities while being efficient. To\ndo so, we propose ModTr as an alternative to the common approach of fine-tuning\nlarge models. ModTr consists of adapting the input with a small transformation\nnetwork trained to minimize the detection loss directly. The original model can\ntherefore work on the translated inputs without any further change or\nfine-tuning to its parameters. Experimental results on translating from IR to\nRGB images on two well-known datasets show that this simple ModTr approach\nprovides detectors that can perform comparably or better than the standard\nfine-tuning without forgetting the original knowledge. This opens the doors to\na more flexible and efficient service-based detection pipeline in which,\ninstead of using a different detector for each modality, a unique and unaltered\nserver is constantly running, where multiple modalities with the corresponding\ntranslations can query it. Code: https://github.com/heitorrapela/ModTr.\n","authors":["Heitor Rapela Medeiros","Masih Aminbeidokhti","Fidel Guerrero Pena","David Latortue","Eric Granger","Marco Pedersoli"],"pdf_url":"https://arxiv.org/pdf/2404.01492v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12982v2","updated":"2024-04-11T22:47:39Z","published":"2023-10-19T17:59:56Z","title":"Putting the Object Back into Video Object Segmentation","summary":" We present Cutie, a video object segmentation (VOS) network with object-level\nmemory reading, which puts the object representation from memory back into the\nvideo object segmentation result. 
Recent works on VOS employ bottom-up\npixel-level memory reading which struggles due to matching noise, especially in\nthe presence of distractors, resulting in lower performance in more challenging\ndata. In contrast, Cutie performs top-down object-level memory reading by\nadapting a small set of object queries. Via those, it interacts with the\nbottom-up pixel features iteratively with a query-based object transformer (qt,\nhence Cutie). The object queries act as a high-level summary of the target\nobject, while high-resolution feature maps are retained for accurate\nsegmentation. Together with foreground-background masked attention, Cutie\ncleanly separates the semantics of the foreground object from the background.\nOn the challenging MOSE dataset, Cutie improves by 8.7 J&F over XMem with a\nsimilar running time and improves by 4.2 J&F over DeAOT while being three times\nfaster. Code is available at: https://hkchengrex.github.io/Cutie\n","authors":["Ho Kei Cheng","Seoung Wug Oh","Brian Price","Joon-Young Lee","Alexander Schwing"],"pdf_url":"https://arxiv.org/pdf/2310.12982v2.pdf","comment":"CVPR 2024 Highlight. Project page: https://hkchengrex.github.io/Cutie"},{"id":"http://arxiv.org/abs/2307.15904v2","updated":"2024-04-11T22:39:15Z","published":"2023-07-29T06:23:51Z","title":"Sat2Cap: Mapping Fine-Grained Textual Descriptions from Satellite Images","summary":" We propose a weakly supervised approach for creating maps using free-form\ntextual descriptions. We refer to this work of creating textual maps as\nzero-shot mapping. Prior works have approached mapping tasks by developing\nmodels that predict a fixed set of attributes using overhead imagery. However,\nthese models are very restrictive as they can only solve highly specific tasks\nfor which they were trained. Mapping text, on the other hand, allows us to\nsolve a large variety of mapping problems with minimal restrictions. To achieve\nthis, we train a contrastive learning framework called Sat2Cap on a new\nlarge-scale dataset with 6.1M pairs of overhead and ground-level images. For a\ngiven location and overhead image, our model predicts the expected CLIP\nembeddings of the ground-level scenery. The predicted CLIP embeddings are then\nused to learn about the textual space associated with that location. Sat2Cap is\nalso conditioned on date-time information, allowing it to model temporally\nvarying concepts over a location. Our experimental results demonstrate that our\nmodels successfully capture ground-level concepts and allow large-scale mapping\nof fine-grained textual queries. Our approach does not require any text-labeled\ndata, making the training easily scalable. The code, dataset, and models will\nbe made publicly available.\n","authors":["Aayush Dhakal","Adeel Ahmad","Subash Khanal","Srikumar Sastry","Hannah Kerner","Nathan Jacobs"],"pdf_url":"https://arxiv.org/pdf/2307.15904v2.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2404.08135v1","updated":"2024-04-11T21:41:55Z","published":"2024-04-11T21:41:55Z","title":"SciFlow: Empowering Lightweight Optical Flow Models with Self-Cleaning\n Iterations","summary":" Optical flow estimation is crucial to a variety of vision tasks. Despite\nsubstantial recent advancements, achieving real-time on-device optical flow\nestimation remains a complex challenge. First, an optical flow model must be\nsufficiently lightweight to meet computation and memory constraints to ensure\nreal-time performance on devices. 
Second, the necessity for real-time on-device\noperation imposes constraints that weaken the model's capacity to adequately\nhandle ambiguities in flow estimation, thereby intensifying the difficulty of\npreserving flow accuracy. This paper introduces two synergistic techniques,\nSelf-Cleaning Iteration (SCI) and Regression Focal Loss (RFL), designed to\nenhance the capabilities of optical flow models, with a focus on addressing\noptical flow regression ambiguities. These techniques prove particularly\neffective in mitigating error propagation, a prevalent issue in optical flow\nmodels that employ iterative refinement. Notably, these techniques add\nnegligible to zero overhead in model parameters and inference latency, thereby\npreserving real-time on-device efficiency. The effectiveness of our proposed\nSCI and RFL techniques, collectively referred to as SciFlow for brevity, is\ndemonstrated across two distinct lightweight optical flow model architectures\nin our experiments. Remarkably, SciFlow enables substantial reduction in error\nmetrics (EPE and Fl-all) over the baseline models by up to 6.3% and 10.5% for\nin-domain scenarios and by up to 6.2% and 13.5% for cross-domain scenarios on\nthe Sintel and KITTI 2015 datasets, respectively.\n","authors":["Jamie Menjay Lin","Jisoo Jeong","Hong Cai","Risheek Garrepalli","Kai Wang","Fatih Porikli"],"pdf_url":"https://arxiv.org/pdf/2404.08135v1.pdf","comment":"CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.08127v1","updated":"2024-04-11T21:07:38Z","published":"2024-04-11T21:07:38Z","title":"Self-Supervised Learning of Color Constancy","summary":" Color constancy (CC) describes the ability of the visual system to perceive\nan object as having a relatively constant color despite changes in lighting\nconditions. While CC and its limitations have been carefully characterized in\nhumans, it is still unclear how the visual system acquires this ability during\ndevelopment. Here, we present a first study showing that CC develops in a\nneural network trained in a self-supervised manner through an invariance\nlearning objective. During learning, objects are presented under changing\nilluminations, while the network aims to map subsequent views of the same\nobject onto close-by latent representations. This gives rise to representations\nthat are largely invariant to the illumination conditions, offering a plausible\nexample of how CC could emerge during human cognitive development via a form of\nself-supervised learning.\n","authors":["Markus R. Ernst","Francisco M. López","Arthur Aubret","Roland W. Fleming","Jochen Triesch"],"pdf_url":"https://arxiv.org/pdf/2404.08127v1.pdf","comment":"7 pages, 5 figures, submitted to the IEEE International Conference on\n Development and Learning (ICDL 2024)"},{"id":"http://arxiv.org/abs/2404.08111v1","updated":"2024-04-11T20:25:26Z","published":"2024-04-11T20:25:26Z","title":"S3Editor: A Sparse Semantic-Disentangled Self-Training Framework for\n Face Video Editing","summary":" Face attribute editing plays a pivotal role in various applications. However,\nexisting methods encounter challenges in achieving high-quality results while\npreserving identity, editing faithfulness, and temporal consistency. These\nchallenges are rooted in issues related to the training pipeline, including\nlimited supervision, architecture design, and optimization strategy. In this\nwork, we introduce S3Editor, a Sparse Semantic-disentangled Self-training\nframework for face video editing. 
S3Editor is a generic solution that\ncomprehensively addresses these challenges with three key contributions.\nFirstly, S3Editor adopts a self-training paradigm to enhance the training\nprocess through semi-supervision. Secondly, we propose a semantic disentangled\narchitecture with a dynamic routing mechanism that accommodates diverse editing\nrequirements. Thirdly, we present a structured sparse optimization schema that\nidentifies and deactivates malicious neurons to further disentangle impacts\nfrom untarget attributes. S3Editor is model-agnostic and compatible with\nvarious editing approaches. Our extensive qualitative and quantitative results\naffirm that our approach significantly enhances identity preservation, editing\nfidelity, as well as temporal consistency.\n","authors":["Guangzhi Wang","Tianyi Chen","Kamran Ghasedi","HsiangTao Wu","Tianyu Ding","Chris Nuesmeyer","Ilya Zharkov","Mohan Kankanhalli","Luming Liang"],"pdf_url":"https://arxiv.org/pdf/2404.08111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01001v2","updated":"2024-04-11T20:07:20Z","published":"2023-12-02T02:09:31Z","title":"Learning county from pixels: Corn yield prediction with\n attention-weighted multiple instance learning","summary":" Remote sensing technology has become a promising tool in yield prediction.\nMost prior work employs satellite imagery for county-level corn yield\nprediction by spatially aggregating all pixels within a county into a single\nvalue, potentially overlooking the detailed information and valuable insights\noffered by more granular data. To this end, this research examines each county\nat the pixel level and applies multiple instance learning to leverage detailed\ninformation within a county. In addition, our method addresses the \"mixed\npixel\" issue caused by the inconsistent resolution between feature datasets and\ncrop mask, which may introduce noise into the model and therefore hinder\naccurate yield prediction. Specifically, the attention mechanism is employed to\nautomatically assign weights to different pixels, which can mitigate the\ninfluence of mixed pixels. The experimental results show that the developed\nmodel outperforms four other machine learning models over the past five years\nin the U.S. corn belt and demonstrates its best performance in 2022, achieving\na coefficient of determination (R2) value of 0.84 and a root mean square error\n(RMSE) of 0.83. This paper demonstrates the advantages of our approach from\nboth spatial and temporal perspectives. Furthermore, through an in-depth study\nof the relationship between mixed pixels and attention, it is verified that our\napproach can capture critical feature information while filtering out noise\nfrom mixed pixels.\n","authors":["Xiaoyu Wang","Yuchi Ma","Qunying Huang","Zhengwei Yang","Zhou Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.01001v2.pdf","comment":"I am writing to request the withdrawal of my paper submitted to\n arXiv. Upon further review, I have identified an error in the paper that\n significantly affects the results and conclusions. 
To maintain the integrity\n of the scientific record and prevent the dissemination of incorrect\n information, I believe it is necessary to withdraw the paper from the archive"},{"id":"http://arxiv.org/abs/2307.03798v2","updated":"2024-04-11T19:24:50Z","published":"2023-07-07T18:54:11Z","title":"Fooling Contrastive Language-Image Pre-trained Models with\n CLIPMasterPrints","summary":" Models leveraging both visual and textual data such as Contrastive\nLanguage-Image Pre-training (CLIP), are the backbone of many recent advances in\nartificial intelligence. In this work, we show that despite their versatility,\nsuch models are vulnerable to what we refer to as fooling master images.\nFooling master images are capable of maximizing the confidence score of a CLIP\nmodel for a significant number of widely varying prompts, while being either\nunrecognizable or unrelated to the attacked prompts for humans. The existence\nof such images is problematic as it could be used by bad actors to maliciously\ninterfere with CLIP-trained image retrieval models in production with\ncomparably small effort as a single image can attack many different prompts. We\ndemonstrate how fooling master images for CLIP (CLIPMasterPrints) can be mined\nusing stochastic gradient descent, projected gradient descent, or blackbox\noptimization. Contrary to many common adversarial attacks, the blackbox\noptimization approach allows us to mine CLIPMasterPrints even when the weights\nof the model are not accessible. We investigate the properties of the mined\nimages, and find that images trained on a small number of image captions\ngeneralize to a much larger number of semantically related captions. We\nevaluate possible mitigation strategies, where we increase the robustness of\nthe model and introduce an approach to automatically detect CLIPMasterPrints to\nsanitize the input of vulnerable models. Finally, we find that vulnerability to\nCLIPMasterPrints is related to a modality gap in contrastive pre-trained\nmulti-modal networks. Code available at\nhttps://github.com/matfrei/CLIPMasterPrints.\n","authors":["Matthias Freiberger","Peter Kun","Christian Igel","Anders Sundnes Løvlie","Sebastian Risi"],"pdf_url":"https://arxiv.org/pdf/2307.03798v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.13004v3","updated":"2024-04-11T19:22:41Z","published":"2022-10-24T07:50:02Z","title":"Efficient Representation of Natural Image Patches","summary":" Utilizing an abstract information processing model based on minimal yet\nrealistic assumptions inspired by biological systems, we study how to achieve\nthe early visual system's two ultimate objectives: efficient information\ntransmission and accurate sensor probability distribution modeling. We prove\nthat optimizing for information transmission does not guarantee optimal\nprobability distribution modeling in general. We illustrate, using a two-pixel\n(2D) system and image patches, that an efficient representation can be realized\nthrough a nonlinear population code driven by two types of biologically\nplausible loss functions that depend solely on output. After unsupervised\nlearning, our abstract information processing model bears remarkable\nresemblances to biological systems, despite not mimicking many features of real\nneurons, such as spiking activity. A preliminary comparison with a contemporary\ndeep learning model suggests that our model offers a significant efficiency\nadvantage. 
Our model provides novel insights into the computational theory of\nearly visual systems as well as a potential new approach to enhance the\nefficiency of deep learning models.\n","authors":["Cheng Guo"],"pdf_url":"https://arxiv.org/pdf/2210.13004v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08088v1","updated":"2024-04-11T19:06:36Z","published":"2024-04-11T19:06:36Z","title":"Visual Context-Aware Person Fall Detection","summary":" As the global population ages, the number of fall-related incidents is on the\nrise. Effective fall detection systems, specifically in healthcare sector, are\ncrucial to mitigate the risks associated with such events. This study evaluates\nthe role of visual context, including background objects, on the accuracy of\nfall detection classifiers. We present a segmentation pipeline to\nsemi-automatically separate individuals and objects in images. Well-established\nmodels like ResNet-18, EfficientNetV2-S, and Swin-Small are trained and\nevaluated. During training, pixel-based transformations are applied to\nsegmented objects, and the models are then evaluated on raw images without\nsegmentation. Our findings highlight the significant influence of visual\ncontext on fall detection. The application of Gaussian blur to the image\nbackground notably improves the performance and generalization capabilities of\nall models. Background objects such as beds, chairs, or wheelchairs can\nchallenge fall detection systems, leading to false positive alarms. However, we\ndemonstrate that object-specific contextual transformations during training\neffectively mitigate this challenge. Further analysis using saliency maps\nsupports our observation that visual context is crucial in classification\ntasks. We create both dataset processing API and segmentation pipeline,\navailable at https://github.com/A-NGJ/image-segmentation-cli.\n","authors":["Aleksander Nagaj","Zenjie Li","Dim P. Papadopoulos","Kamal Nasrollahi"],"pdf_url":"https://arxiv.org/pdf/2404.08088v1.pdf","comment":"10 pages, 6 figures, KES IDT-24 conference"},{"id":"http://arxiv.org/abs/2404.03507v2","updated":"2024-04-11T18:54:24Z","published":"2024-04-04T15:10:24Z","title":"DQ-DETR: DETR with Dynamic Query for Tiny Object Detection","summary":" Despite previous DETR-like methods having performed successfully in generic\nobject detection, tiny object detection is still a challenging task for them\nsince the positional information of object queries is not customized for\ndetecting tiny objects, whose scale is extraordinarily smaller than general\nobjects. Also, DETR-like methods using a fixed number of queries make them\nunsuitable for aerial datasets, which only contain tiny objects, and the\nnumbers of instances are imbalanced between different images. Thus, we present\na simple yet effective model, named DQ-DETR, which consists of three different\ncomponents: categorical counting module, counting-guided feature enhancement,\nand dynamic query selection to solve the above-mentioned problems. DQ-DETR uses\nthe prediction and density maps from the categorical counting module to\ndynamically adjust the number of object queries and improve the positional\ninformation of queries. 
Our model DQ-DETR outperforms previous CNN-based and\nDETR-like methods, achieving state-of-the-art mAP 30.2% on the AI-TOD-V2\ndataset, which mostly consists of tiny objects.\n","authors":["Yi-Xin Huang","Hou-I Liu","Hong-Han Shuai","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.03507v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17205v4","updated":"2024-04-11T18:48:04Z","published":"2023-12-28T18:40:31Z","title":"EFHQ: Multi-purpose ExtremePose-Face-HQ dataset","summary":" The existing facial datasets, while having plentiful images at near frontal\nviews, lack images with extreme head poses, leading to the downgraded\nperformance of deep learning models when dealing with profile or pitched faces.\nThis work aims to address this gap by introducing a novel dataset named Extreme\nPose Face High-Quality Dataset (EFHQ), which includes a maximum of 450k\nhigh-quality images of faces at extreme poses. To produce such a massive\ndataset, we utilize a novel and meticulous dataset processing pipeline to\ncurate two publicly available datasets, VFHQ and CelebV-HQ, which contain many\nhigh-resolution face videos captured in various settings. Our dataset can\ncomplement existing datasets on various facial-related tasks, such as facial\nsynthesis with 2D/3D-aware GAN, diffusion-based text-to-image face generation,\nand face reenactment. Specifically, training with EFHQ helps models generalize\nwell across diverse poses, significantly improving performance in scenarios\ninvolving extreme views, confirmed by extensive experiments. Additionally, we\nutilize EFHQ to define a challenging cross-view face verification benchmark, in\nwhich the performance of SOTA face recognition models drops 5-37% compared to\nfrontal-to-frontal scenarios, aiming to stimulate studies on face recognition\nunder severe pose conditions in the wild.\n","authors":["Trung Tuan Dao","Duc Hong Vu","Cuong Pham","Anh Tran"],"pdf_url":"https://arxiv.org/pdf/2312.17205v4.pdf","comment":"Project Page: https://bomcon123456.github.io/efhq/"},{"id":"http://arxiv.org/abs/2404.08081v1","updated":"2024-04-11T18:42:14Z","published":"2024-04-11T18:42:14Z","title":"Real-Time Detection and Analysis of Vehicles and Pedestrians using Deep\n Learning","summary":" Computer vision, particularly vehicle and pedestrian identification is\ncritical to the evolution of autonomous driving, artificial intelligence, and\nvideo surveillance. Current traffic monitoring systems confront major\ndifficulty in recognizing small objects and pedestrians effectively in\nreal-time, posing a serious risk to public safety and contributing to traffic\ninefficiency. Recognizing these difficulties, our project focuses on the\ncreation and validation of an advanced deep-learning framework capable of\nprocessing complex visual input for precise, real-time recognition of cars and\npeople in a variety of environmental situations. On a dataset representing\ncomplicated urban settings, we trained and evaluated different versions of the\nYOLOv8 and RT-DETR models. The YOLOv8 Large version proved to be the most\neffective, especially in pedestrian recognition, with great precision and\nrobustness. The results, which include Mean Average Precision and recall rates,\ndemonstrate the model's ability to dramatically improve traffic monitoring and\nsafety. 
This study makes an important addition to real-time, reliable detection\nin computer vision, establishing new benchmarks for traffic management systems.\n","authors":["Md Nahid Sadik","Tahmim Hossain","Faisal Sayeed"],"pdf_url":"https://arxiv.org/pdf/2404.08081v1.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2404.08079v1","updated":"2024-04-11T18:34:29Z","published":"2024-04-11T18:34:29Z","title":"DIMAT: Decentralized Iterative Merging-And-Training for Deep Learning\n Models","summary":" Recent advances in decentralized deep learning algorithms have demonstrated\ncutting-edge performance on various tasks with large pre-trained models.\nHowever, a pivotal prerequisite for achieving this level of competitiveness is\nthe significant communication and computation overheads when updating these\nmodels, which prohibits the applications of them to real-world scenarios. To\naddress this issue, drawing inspiration from advanced model merging techniques\nwithout requiring additional training, we introduce the Decentralized Iterative\nMerging-And-Training (DIMAT) paradigm--a novel decentralized deep learning\nframework. Within DIMAT, each agent is trained on their local data and\nperiodically merged with their neighboring agents using advanced model merging\ntechniques like activation matching until convergence is achieved. DIMAT\nprovably converges with the best available rate for nonconvex functions with\nvarious first-order methods, while yielding tighter error bounds compared to\nthe popular existing approaches. We conduct a comprehensive empirical analysis\nto validate DIMAT's superiority over baselines across diverse computer vision\ntasks sourced from multiple datasets. Empirical results validate our\ntheoretical claims by showing that DIMAT attains faster and higher initial gain\nin accuracy with independent and identically distributed (IID) and non-IID\ndata, incurring lower communication overhead. This DIMAT paradigm presents a\nnew opportunity for the future decentralized learning, enhancing its\nadaptability to real-world with sparse and light-weight communication and\ncomputation.\n","authors":["Nastaran Saadati","Minh Pham","Nasla Saleem","Joshua R. Waite","Aditya Balu","Zhanhong Jiang","Chinmay Hegde","Soumik Sarkar"],"pdf_url":"https://arxiv.org/pdf/2404.08079v1.pdf","comment":"CVPR 2024 accepted paper, 22 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.02059v2","updated":"2024-04-11T18:29:01Z","published":"2024-04-02T15:58:36Z","title":"IISAN: Efficiently Adapting Multimodal Representation for Sequential\n Recommendation with Decoupled PEFT","summary":" Multimodal foundation models are transformative in sequential recommender\nsystems, leveraging powerful representation learning capabilities. While\nParameter-efficient Fine-tuning (PEFT) is commonly used to adapt foundation\nmodels for recommendation tasks, most research prioritizes parameter\nefficiency, often overlooking critical factors like GPU memory efficiency and\ntraining speed. Addressing this gap, our paper introduces IISAN (Intra- and\nInter-modal Side Adapted Network for Multimodal Representation), a simple\nplug-and-play architecture using a Decoupled PEFT structure and exploiting both\nintra- and inter-modal adaptation.\n IISAN matches the performance of full fine-tuning (FFT) and state-of-the-art\nPEFT. More importantly, it significantly reduces GPU memory usage - from 47GB\nto just 3GB for multimodal sequential recommendation tasks. 
Additionally, it\naccelerates training time per epoch from 443s to 22s compared to FFT. This is\nalso a notable improvement over the Adapter and LoRA, which require 37-39 GB\nGPU memory and 350-380 seconds per epoch for training.\n Furthermore, we propose a new composite efficiency metric, TPME\n(Training-time, Parameter, and GPU Memory Efficiency) to alleviate the\nprevalent misconception that \"parameter efficiency represents overall\nefficiency\". TPME provides more comprehensive insights into practical\nefficiency comparisons between different methods. Besides, we give an\naccessible efficiency analysis of all PEFT and FFT approaches, which\ndemonstrate the superiority of IISAN. We release our codes and other materials\nat https://github.com/GAIR-Lab/IISAN.\n","authors":["Junchen Fu","Xuri Ge","Xin Xin","Alexandros Karatzoglou","Ioannis Arapakis","Jie Wang","Joemon M. Jose"],"pdf_url":"https://arxiv.org/pdf/2404.02059v2.pdf","comment":"Accepted by SIGIR2024"},{"id":"http://arxiv.org/abs/2404.08031v1","updated":"2024-04-11T17:59:52Z","published":"2024-04-11T17:59:52Z","title":"Latent Guard: a Safety Framework for Text-to-image Generation","summary":" With the ability to generate high-quality images, text-to-image (T2I) models\ncan be exploited for creating inappropriate content. To prevent misuse,\nexisting safety measures are either based on text blacklists, which can be\neasily circumvented, or harmful content classification, requiring large\ndatasets for training and offering low flexibility. Hence, we propose Latent\nGuard, a framework designed to improve safety measures in text-to-image\ngeneration. Inspired by blacklist-based approaches, Latent Guard learns a\nlatent space on top of the T2I model's text encoder, where it is possible to\ncheck the presence of harmful concepts in the input text embeddings. Our\nproposed framework is composed of a data generation pipeline specific to the\ntask using large language models, ad-hoc architectural components, and a\ncontrastive learning strategy to benefit from the generated data. The\neffectiveness of our method is verified on three datasets and against four\nbaselines. Code and data will be shared at\nhttps://github.com/rt219/LatentGuard.\n","authors":["Runtao Liu","Ashkan Khakzar","Jindong Gu","Qifeng Chen","Philip Torr","Fabio Pizzati"],"pdf_url":"https://arxiv.org/pdf/2404.08031v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2404.08030v1","updated":"2024-04-11T17:59:43Z","published":"2024-04-11T17:59:43Z","title":"Rethinking Artistic Copyright Infringements in the Era of Text-to-Image\n Generative Models","summary":" Recent text-to-image generative models such as Stable Diffusion are extremely\nadept at mimicking and generating copyrighted content, raising concerns amongst\nartists that their unique styles may be improperly copied. Understanding how\ngenerative models copy \"artistic style\" is more complex than duplicating a\nsingle image, as style is comprised by a set of elements (or signature) that\nfrequently co-occurs across a body of work, where each individual work may vary\nsignificantly. In our paper, we first reformulate the problem of \"artistic\ncopyright infringement\" to a classification problem over image sets, instead of\nprobing image-wise similarities. 
We then introduce ArtSavant, a practical\n(i.e., efficient and easy to understand) tool to (i) determine the unique style\nof an artist by comparing it to a reference dataset of works from 372 artists\ncurated from WikiArt, and (ii) recognize if the identified style reappears in\ngenerated images. We leverage two complementary methods to perform artistic\nstyle classification over image sets, including TagMatch, which is a novel\ninherently interpretable and attributable method, making it more suitable for\nbroader use by non-technical stakeholders (artists, lawyers, judges, etc.).\nLeveraging ArtSavant, we then perform a large-scale empirical study to provide\nquantitative insight on the prevalence of artistic style copying across 3\npopular text-to-image generative models. Namely, amongst a dataset of prolific\nartists (including many famous ones), only 20% of them appear to have their\nstyles be at risk of copying via simple prompting of today's popular\ntext-to-image generative models.\n","authors":["Mazda Moayeri","Samyadeep Basu","Sriram Balasubramanian","Priyatham Kattakinda","Atoosa Chengini","Robert Brauneis","Soheil Feizi"],"pdf_url":"https://arxiv.org/pdf/2404.08030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08027v1","updated":"2024-04-11T15:58:12Z","published":"2024-04-11T15:58:12Z","title":"SurvMamba: State Space Model with Multi-grained Multi-modal Interaction\n for Survival Prediction","summary":" Multi-modal learning that combines pathological images with genomic data has\nsignificantly enhanced the accuracy of survival prediction. Nevertheless,\nexisting methods have not fully utilized the inherent hierarchical structure\nwithin both whole slide images (WSIs) and transcriptomic data, from which\nbetter intra-modal representations and inter-modal integration could be\nderived. Moreover, many existing studies attempt to improve multi-modal\nrepresentations through attention mechanisms, which inevitably lead to high\ncomplexity when processing high-dimensional WSIs and transcriptomic data.\nRecently, a structured state space model named Mamba emerged as a promising\napproach for its superior performance in modeling long sequences with low\ncomplexity. In this study, we propose Mamba with multi-grained multi-modal\ninteraction (SurvMamba) for survival prediction. SurvMamba is implemented with\na Hierarchical Interaction Mamba (HIM) module that facilitates efficient\nintra-modal interactions at different granularities, thereby capturing more\ndetailed local features as well as rich global representations. In addition, an\nInteraction Fusion Mamba (IFM) module is used for cascaded inter-modal\ninteractive fusion, yielding more comprehensive features for survival\nprediction. Comprehensive evaluations on five TCGA datasets demonstrate that\nSurvMamba outperforms other existing methods in terms of performance and\ncomputational cost.\n","authors":["Ying Chen","Jiajing Xie","Yuxiang Lin","Yuhang Song","Wenxian Yang","Rongshan Yu"],"pdf_url":"https://arxiv.org/pdf/2404.08027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08703v1","updated":"2024-04-11T05:06:51Z","published":"2024-04-11T05:06:51Z","title":"Synthetic Brain Images: Bridging the Gap in Brain Mapping With\n Generative Adversarial Model","summary":" Magnetic Resonance Imaging (MRI) is a vital modality for gaining precise\nanatomical information, and it plays a significant role in medical imaging for\ndiagnosis and therapy planning. 
Image synthesis problems have seen a revolution\nin recent years due to the introduction of deep learning techniques,\nspecifically Generative Adversarial Networks (GANs). This work investigates the\nuse of Deep Convolutional Generative Adversarial Networks (DCGAN) for producing\nhigh-fidelity and realistic MRI image slices. The suggested approach uses a\ndataset with a variety of brain MRI scans to train a DCGAN architecture. While\nthe discriminator network discerns between created and real slices, the\ngenerator network learns to synthesise realistic MRI image slices. The\ngenerator refines its capacity to generate slices that closely mimic real MRI\ndata through an adversarial training approach. The outcomes demonstrate that\nthe DCGAN holds promise for a range of uses in medical imaging research, since they\nshow that it can effectively produce MRI image slices when trained for a\nsufficient number of epochs. This work adds to the expanding corpus of research\non the application of deep learning techniques for medical image synthesis. The\nslices that could be produced have the capability to enhance datasets and\nprovide data augmentation in the training of deep learning models; in addition, a\nnumber of functions are made available to make MRI data cleaning easier, along with\nthree ready-to-use and clean datasets on the major anatomical planes.\n","authors":["Drici Mourad","Kazeem Oluwakemi Oseni"],"pdf_url":"https://arxiv.org/pdf/2404.08703v1.pdf","comment":null}]},"2024-04-12T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.08640v1","updated":"2024-04-12T17:59:47Z","published":"2024-04-12T17:59:47Z","title":"EventEgo3D: 3D Human Motion Capture from Egocentric Event Streams","summary":" Monocular egocentric 3D human motion capture is a challenging and actively\nresearched problem. Existing methods use synchronously operating visual sensors\n(e.g. RGB cameras) and often fail under low lighting and fast motions, which\ncan be restricting in many applications involving head-mounted devices. In\nresponse to the existing limitations, this paper 1) introduces a new problem,\ni.e., 3D human motion capture from an egocentric monocular event camera with a\nfisheye lens, and 2) proposes the first approach to it called EventEgo3D\n(EE3D). Event streams have high temporal resolution and provide reliable cues\nfor 3D human motion capture under high-speed human motions and rapidly changing\nillumination. The proposed EE3D framework is specifically tailored for learning\nwith event streams in the LNES representation, enabling high 3D reconstruction\naccuracy. We also design a prototype of a mobile head-mounted device with an\nevent camera and record a real dataset with event observations and the\nground-truth 3D human poses (in addition to the synthetic dataset). 
Our EE3D\ndemonstrates robustness and superior 3D accuracy compared to existing solutions\nacross various challenging experiments while supporting real-time 3D pose\nupdate rates of 140Hz.\n","authors":["Christen Millerdurai","Hiroyasu Akada","Jian Wang","Diogo Luvizon","Christian Theobalt","Vladislav Golyanik"],"pdf_url":"https://arxiv.org/pdf/2404.08640v1.pdf","comment":"14 pages, 11 figures and 6 tables; project page:\n https://4dqv.mpi-inf.mpg.de/EventEgo3D/; Computer Vision and Pattern\n Recognition (CVPR) 2024"},{"id":"http://arxiv.org/abs/2404.08639v1","updated":"2024-04-12T17:59:40Z","published":"2024-04-12T17:59:40Z","title":"COCONut: Modernizing COCO Segmentation","summary":" In recent decades, the vision community has witnessed remarkable progress in\nvisual recognition, partially owing to advancements in dataset benchmarks.\nNotably, the established COCO benchmark has propelled the development of modern\ndetection and segmentation systems. However, the COCO segmentation benchmark\nhas seen comparatively slow improvement over the last decade. Originally\nequipped with coarse polygon annotations for thing instances, it gradually\nincorporated coarse superpixel annotations for stuff regions, which were\nsubsequently heuristically amalgamated to yield panoptic segmentation\nannotations. These annotations, executed by different groups of raters, have\nresulted not only in coarse segmentation masks but also in inconsistencies\nbetween segmentation types. In this study, we undertake a comprehensive\nreevaluation of the COCO segmentation annotations. By enhancing the annotation\nquality and expanding the dataset to encompass 383K images with more than 5.18M\npanoptic masks, we introduce COCONut, the COCO Next Universal segmenTation\ndataset. COCONut harmonizes segmentation annotations across semantic, instance,\nand panoptic segmentation with meticulously crafted high-quality masks, and\nestablishes a robust benchmark for all segmentation tasks. To our knowledge,\nCOCONut stands as the inaugural large-scale universal segmentation dataset,\nverified by human raters. We anticipate that the release of COCONut will\nsignificantly contribute to the community's ability to assess the progress of\nnovel neural networks.\n","authors":["Xueqing Deng","Qihang Yu","Peng Wang","Xiaohui Shen","Liang-Chieh Chen"],"pdf_url":"https://arxiv.org/pdf/2404.08639v1.pdf","comment":"Accepted at CVPR2024, data available at\n https://xdeng7.github.io/coconut.github.io/"},{"id":"http://arxiv.org/abs/2404.08636v1","updated":"2024-04-12T17:58:04Z","published":"2024-04-12T17:58:04Z","title":"Probing the 3D Awareness of Visual Foundation Models","summary":" Recent advances in large-scale pretraining have yielded visual foundation\nmodels with strong capabilities. Not only can recent models generalize to\narbitrary images for their training task, their intermediate representations\nare useful for other visual tasks such as detection and segmentation. Given\nthat such models can classify, delineate, and localize objects in 2D, we ask\nwhether they also represent their 3D structure? In this work, we analyze the 3D\nawareness of visual foundation models. We posit that 3D awareness implies that\nrepresentations (1) encode the 3D structure of the scene and (2) consistently\nrepresent the surface across views. We conduct a series of experiments using\ntask-specific probes and zero-shot inference procedures on frozen features. Our\nexperiments reveal several limitations of the current models. 
Our code and\nanalysis can be found at https://github.com/mbanani/probe3d.\n","authors":["Mohamed El Banani","Amit Raj","Kevis-Kokitsi Maninis","Abhishek Kar","Yuanzhen Li","Michael Rubinstein","Deqing Sun","Leonidas Guibas","Justin Johnson","Varun Jampani"],"pdf_url":"https://arxiv.org/pdf/2404.08636v1.pdf","comment":"Accepted to CVPR 2024. Project page:\n https://github.com/mbanani/probe3d"},{"id":"http://arxiv.org/abs/2403.15388v4","updated":"2024-04-12T17:34:29Z","published":"2024-03-22T17:59:52Z","title":"LLaVA-PruMerge: Adaptive Token Reduction for Efficient Large Multimodal\n Models","summary":" Large Multimodal Models (LMMs) have shown significant reasoning capabilities\nby connecting a visual encoder and a large language model. LMMs typically use a\nfixed amount of visual tokens, such as the penultimate layer features in the\nCLIP visual encoder, as the prefix content. Recent LMMs incorporate more\ncomplex visual inputs, such as high-resolution images and videos, which\nincrease the number of visual tokens significantly. However, due to the design\nof the Transformer architecture, computational costs associated with these\nmodels tend to increase quadratically with the number of input tokens. To\ntackle this problem, we explore a token reduction mechanism and find, similar\nto prior work, that many visual tokens are spatially redundant. Based on this,\nwe propose PruMerge, a novel adaptive visual token reduction approach, which\nlargely reduces the number of visual tokens while maintaining comparable model\nperformance. We first select the unpruned visual tokens based on their\nsimilarity to class tokens and spatial tokens. We then cluster the pruned\ntokens based on key similarity and merge the clustered tokens with the unpruned\ntokens to supplement their information. Empirically, when applied to LLaVA-1.5,\nour approach can compress the visual tokens by 18 times on average, and achieve\ncomparable performance across diverse visual question-answering and reasoning\ntasks. Code and checkpoints are at https://llava-prumerge.github.io/.\n","authors":["Yuzhang Shang","Mu Cai","Bingxin Xu","Yong Jae Lee","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2403.15388v4.pdf","comment":"Project page: https://llava-prumerge.github.io/"},{"id":"http://arxiv.org/abs/2404.08611v1","updated":"2024-04-12T17:20:57Z","published":"2024-04-12T17:20:57Z","title":"Automatic Quantification of Serial PET/CT Images for Pediatric Hodgkin\n Lymphoma Patients Using a Longitudinally-Aware Segmentation Network","summary":" $\\textbf{Purpose}$: Automatic quantification of longitudinal changes in PET\nscans for lymphoma patients has proven challenging, as residual disease in\ninterim-therapy scans is often subtle and difficult to detect. Our goal was to\ndevelop a longitudinally-aware segmentation network (LAS-Net) that can quantify\nserial PET/CT images for pediatric Hodgkin lymphoma patients.\n$\\textbf{Materials and Methods}$: This retrospective study included baseline\n(PET1) and interim (PET2) PET/CT images from 297 patients enrolled in two\nChildren's Oncology Group clinical trials (AHOD1331 and AHOD0831). LAS-Net\nincorporates longitudinal cross-attention, allowing relevant features from PET1\nto inform the analysis of PET2. Model performance was evaluated using Dice\ncoefficients for PET1 and detection F1 scores for PET2. 
Additionally, we\nextracted and compared quantitative PET metrics, including metabolic tumor\nvolume (MTV) and total lesion glycolysis (TLG) in PET1, as well as qPET and\n$\\Delta$SUVmax in PET2, against physician measurements. We quantified their\nagreement using Spearman's $\\rho$ correlations and employed bootstrap\nresampling for statistical analysis. $\\textbf{Results}$: LAS-Net detected\nresidual lymphoma in PET2 with an F1 score of 0.606 (precision/recall:\n0.615/0.600), outperforming all comparator methods (P<0.01). For baseline\nsegmentation, LAS-Net achieved a mean Dice score of 0.772. In PET\nquantification, LAS-Net's measurements of qPET, $\\Delta$SUVmax, MTV and TLG\nwere strongly correlated with physician measurements, with Spearman's $\\rho$ of\n0.78, 0.80, 0.93 and 0.96, respectively. The performance remained high, with a\nslight decrease, in an external testing cohort. $\\textbf{Conclusion}$: LAS-Net\nachieved high performance in quantifying PET metrics across serial scans,\nhighlighting the value of longitudinal awareness in evaluating multi-time-point\nimaging datasets.\n","authors":["Xin Tie","Muheon Shin","Changhee Lee","Scott B. Perlman","Zachary Huemann","Amy J. Weisman","Sharon M. Castellino","Kara M. Kelly","Kathleen M. McCarten","Adina L. Alazraki","Junjie Hu","Steve Y. Cho","Tyler J. Bradshaw"],"pdf_url":"https://arxiv.org/pdf/2404.08611v1.pdf","comment":"6 figures, 4 tables in the main text"},{"id":"http://arxiv.org/abs/2310.16073v3","updated":"2024-04-12T17:04:15Z","published":"2023-10-24T14:59:51Z","title":"FloCoDe: Unbiased Dynamic Scene Graph Generation with Temporal\n Consistency and Correlation Debiasing","summary":" Dynamic scene graph generation (SGG) from videos requires not only a\ncomprehensive understanding of objects across scenes but also a method to\ncapture the temporal motions and interactions with different objects. Moreover,\nthe long-tailed distribution of visual relationships is a crucial bottleneck\nfor most dynamic SGG methods. This is because many of them focus on capturing\nspatio-temporal context using complex architectures, leading to the generation\nof biased scene graphs. To address these challenges, we propose FloCoDe:\nFlow-aware Temporal Consistency and Correlation Debiasing with uncertainty\nattenuation for unbiased dynamic scene graphs. FloCoDe employs feature warping\nusing flow to detect temporally consistent objects across frames. To address\nthe long-tail issue of visual relationships, we propose correlation debiasing\nand a label correlation-based loss to learn unbiased relation representations\nfor long-tailed classes. Specifically, we propose to incorporate label\ncorrelations using contrastive loss to capture commonly co-occurring relations,\nwhich aids in learning robust representations for long-tailed classes. Further,\nwe adopt the uncertainty attenuation-based classifier framework to handle noisy\nannotations in the SGG data. 
Extensive experimental evaluation shows a\nperformance gain as high as 4.1%, demonstrating the superiority of generating\nmore unbiased scene graphs.\n","authors":["Anant Khandelwal"],"pdf_url":"https://arxiv.org/pdf/2310.16073v3.pdf","comment":"Accepted at CVPR 2024 SG2RL, 11 pages, 5 tables, 4 figures"},{"id":"http://arxiv.org/abs/2404.08603v1","updated":"2024-04-12T17:02:56Z","published":"2024-04-12T17:02:56Z","title":"Training-free Boost for Open-Vocabulary Object Detection with Confidence\n Aggregation","summary":" Open-vocabulary object detection (OVOD) aims at localizing and recognizing\nvisual objects from novel classes unseen at the training time. Whereas,\nempirical studies reveal that advanced detectors generally assign lower scores\nto those novel instances, which are inadvertently suppressed during inference\nby commonly adopted greedy strategies like Non-Maximum Suppression (NMS),\nleading to sub-optimal detection performance for novel classes. This paper\nsystematically investigates this problem with the commonly-adopted two-stage\nOVOD paradigm. Specifically, in the region-proposal stage, proposals that\ncontain novel instances showcase lower objectness scores, since they are\ntreated as background proposals during the training phase. Meanwhile, in the\nobject-classification stage, novel objects share lower region-text similarities\n(i.e., classification scores) due to the biased visual-language alignment by\nseen training samples. To alleviate this problem, this paper introduces two\nadvanced measures to adjust confidence scores and conserve erroneously\ndismissed objects: (1) a class-agnostic localization quality estimate via\noverlap degree of region/object proposals, and (2) a text-guided visual\nsimilarity estimate with proxy prototypes for novel classes. Integrated with\nadjusting techniques specifically designed for the region-proposal and\nobject-classification stages, this paper derives the aggregated confidence\nestimate for the open-vocabulary object detection paradigm (AggDet). Our AggDet\nis a generic and training-free post-processing scheme, which consistently\nbolsters open-vocabulary detectors across model scales and architecture\ndesigns. For instance, AggDet receives 3.3% and 1.5% gains on OV-COCO and\nOV-LVIS benchmarks respectively, without any training cost.\n","authors":["Yanhao Zheng","Kai Liu"],"pdf_url":"https://arxiv.org/pdf/2404.08603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07520v2","updated":"2024-04-12T17:01:04Z","published":"2024-04-11T07:26:00Z","title":"PromptSync: Bridging Domain Gaps in Vision-Language Models through\n Class-Aware Prototype Alignment and Discrimination","summary":" The potential for zero-shot generalization in vision-language (V-L) models\nsuch as CLIP has spurred their widespread adoption in addressing numerous\ndownstream tasks. Previous methods have employed test-time prompt tuning to\nadapt the model to unseen domains, but they overlooked the issue of imbalanced\nclass distributions. In this study, we explicitly address this problem by\nemploying class-aware prototype alignment weighted by mean class probabilities\nobtained for the test sample and filtered augmented views. Additionally, we\nensure that the class probabilities are as accurate as possible by performing\nprototype discrimination using contrastive learning. 
The combination of\nalignment and discriminative loss serves as a geometric regularizer, preventing\nthe prompt representation from collapsing onto a single class and effectively\nbridging the distribution gap between the source and test domains. Our method,\nnamed PromptSync, synchronizes the prompts for each test sample on both the\ntext and vision branches of the V-L model. In empirical evaluations on the\ndomain generalization benchmark, our method outperforms previous best methods\nby 2.33% in overall performance, by 1% in base-to-novel generalization, and by\n2.84% in cross-dataset transfer tasks.\n","authors":["Anant Khandelwal"],"pdf_url":"https://arxiv.org/pdf/2404.07520v2.pdf","comment":"Accepted at CVPR 2024 LIMIT, 12 pages, 8 Tables, 2 Figures"},{"id":"http://arxiv.org/abs/2312.03884v2","updated":"2024-04-12T16:47:05Z","published":"2023-12-06T20:22:32Z","title":"WonderJourney: Going from Anywhere to Everywhere","summary":" We introduce WonderJourney, a modularized framework for perpetual 3D scene\ngeneration. Unlike prior work on view generation that focuses on a single type\nof scenes, we start at any user-provided location (by a text description or an\nimage) and generate a journey through a long sequence of diverse yet coherently\nconnected 3D scenes. We leverage an LLM to generate textual descriptions of the\nscenes in this journey, a text-driven point cloud generation pipeline to make a\ncompelling and coherent sequence of 3D scenes, and a large VLM to verify the\ngenerated scenes. We show compelling, diverse visual results across various\nscene types and styles, forming imaginary \"wonderjourneys\". Project website:\nhttps://kovenyu.com/WonderJourney/\n","authors":["Hong-Xing Yu","Haoyi Duan","Junhwa Hur","Kyle Sargent","Michael Rubinstein","William T. Freeman","Forrester Cole","Deqing Sun","Noah Snavely","Jiajun Wu","Charles Herrmann"],"pdf_url":"https://arxiv.org/pdf/2312.03884v2.pdf","comment":"Project website with video results:\n https://kovenyu.com/WonderJourney/"},{"id":"http://arxiv.org/abs/2404.08590v1","updated":"2024-04-12T16:38:48Z","published":"2024-04-12T16:38:48Z","title":"Improving Referring Image Segmentation using Vision-Aware Text Features","summary":" Referring image segmentation is a challenging task that involves generating\npixel-wise segmentation masks based on natural language descriptions. Existing\nmethods have relied mostly on visual features to generate the segmentation\nmasks while treating text features as supporting components. This over-reliance\non visual features can lead to suboptimal results, especially in complex\nscenarios where text prompts are ambiguous or context-dependent. To overcome\nthese challenges, we present a novel framework VATEX to improve referring image\nsegmentation by enhancing object and context understanding with Vision-Aware\nText Feature. 
Our method involves using CLIP to derive a CLIP Prior that\nintegrates an object-centric visual heatmap with text description, which can be\nused as the initial query in DETR-based architecture for the segmentation task.\nFurthermore, by observing that there are multiple ways to describe an instance\nin an image, we enforce feature similarity between text variations referring to\nthe same visual input by two components: a novel Contextual Multimodal Decoder\nthat turns text embeddings into vision-aware text features, and a Meaning\nConsistency Constraint to ensure further the coherent and consistent\ninterpretation of language expressions with the context understanding obtained\nfrom the image. Our method achieves a significant performance improvement on\nthree benchmark datasets RefCOCO, RefCOCO+ and G-Ref. Code is available at:\nhttps://nero1342.github.io/VATEX\\_RIS.\n","authors":["Hai Nguyen-Truong","E-Ro Nguyen","Tuan-Anh Vu","Minh-Triet Tran","Binh-Son Hua","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2404.08590v1.pdf","comment":"30 pages including supplementary"},{"id":"http://arxiv.org/abs/2401.01448v2","updated":"2024-04-12T16:37:46Z","published":"2024-01-02T22:15:20Z","title":"ProbMCL: Simple Probabilistic Contrastive Learning for Multi-label\n Visual Classification","summary":" Multi-label image classification presents a challenging task in many domains,\nincluding computer vision and medical imaging. Recent advancements have\nintroduced graph-based and transformer-based methods to improve performance and\ncapture label dependencies. However, these methods often include complex\nmodules that entail heavy computation and lack interpretability. In this paper,\nwe propose Probabilistic Multi-label Contrastive Learning (ProbMCL), a novel\nframework to address these challenges in multi-label image classification\ntasks. Our simple yet effective approach employs supervised contrastive\nlearning, in which samples that share enough labels with an anchor image based\non a decision threshold are introduced as a positive set. This structure\ncaptures label dependencies by pulling positive pair embeddings together and\npushing away negative samples that fall below the threshold. We enhance\nrepresentation learning by incorporating a mixture density network into\ncontrastive learning and generating Gaussian mixture distributions to explore\nthe epistemic uncertainty of the feature encoder. We validate the effectiveness\nof our framework through experimentation with datasets from the computer vision\nand medical imaging domains. Our method outperforms the existing\nstate-of-the-art methods while achieving a low computational footprint on both\ndatasets. Visualization analyses also demonstrate that ProbMCL-learned\nclassifiers maintain a meaningful semantic topology.\n","authors":["Ahmad Sajedi","Samir Khaki","Yuri A. Lawryshyn","Konstantinos N. Plataniotis"],"pdf_url":"https://arxiv.org/pdf/2401.01448v2.pdf","comment":"This paper has been accepted for the ICASSP 2024 - 2024 IEEE\n International Conference on Acoustics, Speech and Signal Processing (ICASSP)"},{"id":"http://arxiv.org/abs/2404.08589v1","updated":"2024-04-12T16:35:23Z","published":"2024-04-12T16:35:23Z","title":"Enhancing Visual Question Answering through Question-Driven Image\n Captions as Prompts","summary":" Visual question answering (VQA) is known as an AI-complete task as it\nrequires understanding, reasoning, and inferring about the vision and the\nlanguage content. 
Over the past few years, numerous neural architectures have\nbeen suggested for the VQA problem. However, achieving success in zero-shot VQA\nremains a challenge due to its requirement for advanced generalization and\nreasoning skills. This study explores the impact of incorporating image\ncaptioning as an intermediary process within the VQA pipeline. Specifically, we\nexplore the efficacy of utilizing image captions instead of images and\nleveraging large language models (LLMs) to establish a zero-shot setting. Since\nimage captioning is the most crucial step in this process, we compare the\nimpact of state-of-the-art image captioning models on VQA performance across\nvarious question types in terms of structure and semantics. We propose a\nstraightforward and efficient question-driven image captioning approach within\nthis pipeline to transfer contextual information into the question-answering\n(QA) model. This method involves extracting keywords from the question,\ngenerating a caption for each image-question pair using the keywords, and\nincorporating the question-driven caption into the LLM prompt. We evaluate the\nefficacy of using general-purpose and question-driven image captions in the VQA\npipeline. Our study highlights the potential of employing image captions and\nharnessing the capabilities of LLMs to achieve competitive performance on GQA\nunder the zero-shot setting. Our code is available at\n\\url{https://github.com/ovguyo/captions-in-VQA}.\n","authors":["Övgü Özdemir","Erdem Akagündüz"],"pdf_url":"https://arxiv.org/pdf/2404.08589v1.pdf","comment":"The paper has been accepted for presentation at CVPR 2024 Workshop on\n Prompting in Vision"},{"id":"http://arxiv.org/abs/2404.08585v1","updated":"2024-04-12T16:30:15Z","published":"2024-04-12T16:30:15Z","title":"Advanced wood species identification based on multiple anatomical\n sections and using deep feature transfer and fusion","summary":" In recent years, we have seen many advancements in wood species\nidentification. Methods like DNA analysis, Near Infrared (NIR) spectroscopy,\nand Direct Analysis in Real Time (DART) mass spectrometry complement the\nlong-established wood anatomical assessment of cell and tissue morphology.\nHowever, most of these methods have some limitations such as high costs, the\nneed for skilled experts for data interpretation, and the lack of good datasets\nfor professional reference. Therefore, most of these methods, and certainly the\nwood anatomical assessment, may benefit from tools based on Artificial\nIntelligence. In this paper, we apply two transfer learning techniques with\nConvolutional Neural Networks (CNNs) to a multi-view Congolese wood species\ndataset including sections from different orientations and viewed at different\nmicroscopic magnifications. We explore two feature extraction methods in\ndetail, namely Global Average Pooling (GAP) and Random Encoding of Aggregated\nDeep Activation Maps (RADAM), for efficient and accurate wood species\nidentification. Our results indicate superior accuracy on diverse datasets and\nanatomical sections, surpassing the results of other methods. Our proposal\nrepresents a significant advancement in wood species identification, offering a\nrobust tool to support the conservation of forest ecosystems and promote\nsustainable forestry practices.\n","authors":["Kallil M. Zielinski","Leonardo Scabini","Lucas C. Ribas","Núbia R. da Silva","Hans Beeckman","Jan Verwaeren","Odemir M. 
Bruno","Bernard De Baets"],"pdf_url":"https://arxiv.org/pdf/2404.08585v1.pdf","comment":"33 pages, 7 tables, 9 figures"},{"id":"http://arxiv.org/abs/2404.08584v1","updated":"2024-04-12T16:29:49Z","published":"2024-04-12T16:29:49Z","title":"Pathological Primitive Segmentation Based on Visual Foundation Model\n with Zero-Shot Mask Generation","summary":" Medical image processing usually requires a model trained with carefully\ncrafted datasets due to unique image characteristics and domain-specific\nchallenges, especially in pathology. Primitive detection and segmentation in\ndigitized tissue samples are essential for objective and automated diagnosis\nand prognosis of cancer. SAM (Segment Anything Model) has recently been\ndeveloped to segment general objects from natural images with high accuracy,\nbut it requires human prompts to generate masks. In this work, we present a\nnovel approach that adapts pre-trained natural image encoders of SAM for\ndetection-based region proposals. Regions proposed by a pre-trained encoder are\nsent to cascaded feature propagation layers for projection. Then, local\nsemantic and global context is aggregated from multi-scale for bounding box\nlocalization and classification. Finally, the SAM decoder uses the identified\nbounding boxes as essential prompts to generate a comprehensive primitive\nsegmentation map. The entire base framework, SAM, requires no additional\ntraining or fine-tuning but could produce an end-to-end result for two\nfundamental segmentation tasks in pathology. Our method compares with\nstate-of-the-art models in F1 score for nuclei detection and binary/multiclass\npanoptic(bPQ/mPQ) and mask quality(dice) for segmentation quality on the\nPanNuke dataset while offering end-to-end efficiency. Our model also achieves\nremarkable Average Precision (+4.5%) on the secondary dataset (HuBMAP Kidney)\ncompared to Faster RCNN. The code is publicly available at\nhttps://github.com/learner-codec/autoprom_sam.\n","authors":["Abu Bakor Hayat Arnob","Xiangxue Wang","Yiping Jiao","Xiao Gan","Wenlong Ming","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2404.08584v1.pdf","comment":"2024 IEEE International Symposium on Biomedical Imaging"},{"id":"http://arxiv.org/abs/2404.08582v1","updated":"2024-04-12T16:28:30Z","published":"2024-04-12T16:28:30Z","title":"FashionFail: Addressing Failure Cases in Fashion Object Detection and\n Segmentation","summary":" In the realm of fashion object detection and segmentation for online shopping\nimages, existing state-of-the-art fashion parsing models encounter limitations,\nparticularly when exposed to non-model-worn apparel and close-up shots. To\naddress these failures, we introduce FashionFail; a new fashion dataset with\ne-commerce images for object detection and segmentation. The dataset is\nefficiently curated using our novel annotation tool that leverages recent\nfoundation models. The primary objective of FashionFail is to serve as a test\nbed for evaluating the robustness of models. Our analysis reveals the\nshortcomings of leading models, such as Attribute-Mask R-CNN and Fashionformer.\nAdditionally, we propose a baseline approach using naive data augmentation to\nmitigate common failure cases and improve model robustness. Through this work,\nwe aim to inspire and support further research in fashion item detection and\nsegmentation for industrial applications. 
The dataset, annotation tool, code,\nand models are available at \\url{https://rizavelioglu.github.io/fashionfail/}.\n","authors":["Riza Velioglu","Robin Chan","Barbara Hammer"],"pdf_url":"https://arxiv.org/pdf/2404.08582v1.pdf","comment":"to be published in 2024 International Joint Conference on Neural\n Networks (IJCNN)"},{"id":"http://arxiv.org/abs/2404.08580v1","updated":"2024-04-12T16:23:42Z","published":"2024-04-12T16:23:42Z","title":"Lossy Image Compression with Foundation Diffusion Models","summary":" Incorporating diffusion models in the image compression domain has the\npotential to produce realistic and detailed reconstructions, especially at\nextremely low bitrates. Previous methods focus on using diffusion models as\nexpressive decoders robust to quantization errors in the conditioning signals,\nyet achieving competitive results in this manner requires costly training of\nthe diffusion model and long inference times due to the iterative generative\nprocess. In this work we formulate the removal of quantization error as a\ndenoising task, using diffusion to recover lost information in the transmitted\nimage latent. Our approach allows us to perform less than 10\\% of the full\ndiffusion generative process and requires no architectural changes to the\ndiffusion model, enabling the use of foundation models as a strong prior\nwithout additional fine tuning of the backbone. Our proposed codec outperforms\nprevious methods in quantitative realism metrics, and we verify that our\nreconstructions are qualitatively preferred by end users, even when other\nmethods use twice the bitrate.\n","authors":["Lucas Relic","Roberto Azevedo","Markus Gross","Christopher Schroers"],"pdf_url":"https://arxiv.org/pdf/2404.08580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06994v2","updated":"2024-04-12T16:07:55Z","published":"2024-02-10T17:02:53Z","title":"A Change Detection Reality Check","summary":" In recent years, there has been an explosion of proposed change detection\ndeep learning architectures in the remote sensing literature. These approaches\nclaim to offer state-of-the-art performance on different standard benchmark\ndatasets. However, has the field truly made significant progress? In this paper\nwe perform experiments which conclude a simple U-Net segmentation baseline\nwithout training tricks or complicated architectural changes is still a top\nperformer for the task of change detection.\n","authors":["Isaac Corley","Caleb Robinson","Anthony Ortiz"],"pdf_url":"https://arxiv.org/pdf/2402.06994v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08561v1","updated":"2024-04-12T16:00:03Z","published":"2024-04-12T16:00:03Z","title":"IDD-X: A Multi-View Dataset for Ego-relative Important Object\n Localization and Explanation in Dense and Unstructured Traffic","summary":" Intelligent vehicle systems require a deep understanding of the interplay\nbetween road conditions, surrounding entities, and the ego vehicle's driving\nbehavior for safe and efficient navigation. This is particularly critical in\ndeveloping countries where traffic situations are often dense and unstructured\nwith heterogeneous road occupants. Existing datasets, predominantly geared\ntowards structured and sparse traffic scenarios, fall short of capturing the\ncomplexity of driving in such environments. To fill this gap, we present IDD-X,\na large-scale dual-view driving video dataset. 
With 697K bounding boxes, 9K\nimportant object tracks, and 1-12 objects per video, IDD-X offers comprehensive\nego-relative annotations for multiple important road objects covering 10\ncategories and 19 explanation label categories. The dataset also incorporates\nrearview information to provide a more complete representation of the driving\nenvironment. We also introduce custom-designed deep networks aimed at multiple\nimportant object localization and per-object explanation prediction. Overall,\nour dataset and introduced prediction models form the foundation for studying\nhow road conditions and surrounding entities affect driving behavior in complex\ntraffic situations.\n","authors":["Chirag Parikh","Rohit Saluja","C. V. Jawahar","Ravi Kiran Sarvadevabhatla"],"pdf_url":"https://arxiv.org/pdf/2404.08561v1.pdf","comment":"Accepted at ICRA 2024"},{"id":"http://arxiv.org/abs/2404.08557v1","updated":"2024-04-12T15:54:48Z","published":"2024-04-12T15:54:48Z","title":"Scalability in Building Component Data Annotation: Enhancing Facade\n Material Classification with Synthetic Data","summary":" Computer vision models trained on Google Street View images can create\nmaterial cadastres. However, current approaches need manually annotated\ndatasets that are difficult to obtain and often have class imbalance. To\naddress these challenges, this paper fine-tuned a Swin Transformer model on a\nsynthetic dataset generated with DALL-E and compared the performance to a\nsimilar manually annotated dataset. Although manual annotation remains the gold\nstandard, the synthetic dataset performance demonstrates a reasonable\nalternative. The findings will ease annotation needed to develop material\ncadastres, offering architects insights into opportunities for material reuse,\nthus contributing to the reduction of demolition waste.\n","authors":["Josie Harrison","Alexander Hollberg","Yinan Yu"],"pdf_url":"https://arxiv.org/pdf/2404.08557v1.pdf","comment":"10 pages, 6 figures, submitted to 2024 European Conference of\n Computing in Construction"},{"id":"http://arxiv.org/abs/2310.02557v3","updated":"2024-04-12T15:48:47Z","published":"2023-10-04T03:30:32Z","title":"Generalization in diffusion models arises from geometry-adaptive\n harmonic representations","summary":" Deep neural networks (DNNs) trained for image denoising are able to generate\nhigh-quality samples with score-based reverse diffusion algorithms. These\nimpressive capabilities seem to imply an escape from the curse of\ndimensionality, but recent reports of memorization of the training set raise\nthe question of whether these networks are learning the \"true\" continuous\ndensity of the data. Here, we show that two DNNs trained on non-overlapping\nsubsets of a dataset learn nearly the same score function, and thus the same\ndensity, when the number of training images is large enough. In this regime of\nstrong generalization, diffusion-generated images are distinct from the\ntraining set, and are of high visual quality, suggesting that the inductive\nbiases of the DNNs are well-aligned with the data density. We analyze the\nlearned denoising functions and show that the inductive biases give rise to a\nshrinkage operation in a basis adapted to the underlying image. Examination of\nthese bases reveals oscillating harmonic structures along contours and in\nhomogeneous regions. 
We demonstrate that trained denoisers are inductively\nbiased towards these geometry-adaptive harmonic bases since they arise not only\nwhen the network is trained on photographic images, but also when it is trained\non image classes supported on low-dimensional manifolds for which the harmonic\nbasis is suboptimal. Finally, we show that when trained on regular image\nclasses for which the optimal basis is known to be geometry-adaptive and\nharmonic, the denoising performance of the networks is near-optimal.\n","authors":["Zahra Kadkhodaie","Florentin Guth","Eero P. Simoncelli","Stéphane Mallat"],"pdf_url":"https://arxiv.org/pdf/2310.02557v3.pdf","comment":"Accepted for oral presentation at ICLR, Vienna, May 2024"},{"id":"http://arxiv.org/abs/2404.08549v1","updated":"2024-04-12T15:45:26Z","published":"2024-04-12T15:45:26Z","title":"Benchmarking the Cell Image Segmentation Models Robustness under the\n Microscope Optical Aberrations","summary":" Cell segmentation is essential in biomedical research for analyzing cellular\nmorphology and behavior. Deep learning methods, particularly convolutional\nneural networks (CNNs), have revolutionized cell segmentation by extracting\nintricate features from images. However, the robustness of these methods under\nmicroscope optical aberrations remains a critical challenge. This study\ncomprehensively evaluates the performance of cell instance segmentation models\nunder simulated aberration conditions using the DynamicNuclearNet (DNN) and\nLIVECell datasets. Aberrations, including Astigmatism, Coma, Spherical, and\nTrefoil, were simulated using Zernike polynomial equations. Various\nsegmentation models, such as Mask R-CNN with different network heads (FPN, C3)\nand backbones (ResNet, VGG19, SwinS), were trained and tested under aberrated\nconditions. Results indicate that FPN combined with SwinS demonstrates superior\nrobustness in handling simple cell images affected by minor aberrations.\nConversely, Cellpose2.0 proves effective for complex cell images under similar\nconditions. Our findings provide insights into selecting appropriate\nsegmentation models based on cell morphology and aberration severity, enhancing\nthe reliability of cell segmentation in biomedical applications. Further\nresearch is warranted to validate these methods with diverse aberration types\nand emerging segmentation models. Overall, this research aims to guide\nresearchers in effectively utilizing cell segmentation models in the presence\nof minor optical aberrations.\n","authors":["Boyuan Peng","Jiaju Chen","Qihui Ye","Minjiang Chen","Peiwu Qin","Chenggang Yan","Dongmei Yu","Zhenglin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.08549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08544v1","updated":"2024-04-12T15:37:53Z","published":"2024-04-12T15:37:53Z","title":"Analyzing Decades-Long Environmental Changes in Namibia Using Archival\n Aerial Photography and Deep Learning","summary":" This study explores object detection in historical aerial photographs of\nNamibia to identify long-term environmental changes. Specifically, we aim to\nidentify key objects -- \\textit{Waterholes}, \\textit{Omuti homesteads}, and\n\\textit{Big trees} -- around Oshikango in Namibia using sub-meter gray-scale\naerial imagery from 1943 and 1972. In this work, we propose a workflow for\nanalyzing historical aerial imagery using a deep semantic segmentation model on\nsparse hand-labels. 
To this end, we employ a number of strategies including\nclass-weighting, pseudo-labeling and empirical p-value-based filtering to\nbalance skewed and sparse representations of objects in the ground truth data.\nResults demonstrate the benefits of these different training strategies\nresulting in an average $F_1=0.661$ and $F_1=0.755$ over the three objects of\ninterest for the 1943 and 1972 imagery, respectively. We also identified that\nthe average size of Waterhole and Big trees increased while the average size of\nOmutis decreased between 1943 and 1972 reflecting some of the local effects of\nthe massive post-Second World War economic, agricultural, demographic, and\nenvironmental changes. This work also highlights the untapped potential of\nhistorical aerial photographs in understanding long-term environmental changes\nbeyond Namibia (and Africa). With the lack of adequate satellite technology in\nthe past, archival aerial photography offers a great alternative to uncover\ndecades-long environmental changes.\n","authors":["Girmaw Abebe Tadesse","Caleb Robinson","Gilles Quentin Hacheme","Akram Zaytar","Rahul Dodhia","Tsering Wangyal Shawa","Juan M. Lavista Ferres","Emmanuel H. Kreike"],"pdf_url":"https://arxiv.org/pdf/2404.08544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08540v1","updated":"2024-04-12T15:35:20Z","published":"2024-04-12T15:35:20Z","title":"On the Robustness of Language Guidance for Low-Level Vision Tasks:\n Findings from Depth Estimation","summary":" Recent advances in monocular depth estimation have been made by incorporating\nnatural language as additional guidance. Although yielding impressive results,\nthe impact of the language prior, particularly in terms of generalization and\nrobustness, remains unexplored. In this paper, we address this gap by\nquantifying the impact of this prior and introduce methods to benchmark its\neffectiveness across various settings. We generate \"low-level\" sentences that\nconvey object-centric, three-dimensional spatial relationships, incorporate\nthem as additional language priors and evaluate their downstream impact on\ndepth estimation. Our key finding is that current language-guided depth\nestimators perform optimally only with scene-level descriptions and\ncounter-intuitively fare worse with low level descriptions. Despite leveraging\nadditional data, these methods are not robust to directed adversarial attacks\nand decline in performance with an increase in distribution shift. Finally, to\nprovide a foundation for future research, we identify points of failures and\noffer insights to better understand these shortcomings. With an increasing\nnumber of methods using language for depth estimation, our findings highlight\nthe opportunities and pitfalls that require careful consideration for effective\ndeployment in real-world settings\n","authors":["Agneet Chatterjee","Tejas Gokhale","Chitta Baral","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2404.08540v1.pdf","comment":"Accepted to CVPR 2024. Project webpage:\n https://agneetchatterjee.com/robustness_depth_lang/"},{"id":"http://arxiv.org/abs/2404.08535v1","updated":"2024-04-12T15:30:03Z","published":"2024-04-12T15:30:03Z","title":"Generalized Contrastive Learning for Multi-Modal Retrieval and Ranking","summary":" Contrastive learning has gained widespread adoption for retrieval tasks due\nto its minimal requirement for manual annotations. 
However, popular contrastive\nframeworks typically learn from binary relevance, making them ineffective at\nincorporating direct fine-grained rankings. In this paper, we curate a\nlarge-scale dataset featuring detailed relevance scores for each query-document\npair to facilitate future research and evaluation. Subsequently, we propose\nGeneralized Contrastive Learning for Multi-Modal Retrieval and Ranking (GCL),\nwhich is designed to learn from fine-grained rankings beyond binary relevance\nscores. Our results show that GCL achieves a 94.5% increase in NDCG@10 for\nin-domain and 26.3 to 48.8% increases for cold-start evaluations, all relative\nto the CLIP baseline and involving ground truth rankings.\n","authors":["Tianyu Zhu","Myong Chol Jung","Jesse Clark"],"pdf_url":"https://arxiv.org/pdf/2404.08535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08531v1","updated":"2024-04-12T15:18:25Z","published":"2024-04-12T15:18:25Z","title":"Text Prompt with Normality Guidance for Weakly Supervised Video Anomaly\n Detection","summary":" Weakly supervised video anomaly detection (WSVAD) is a challenging task.\nGenerating fine-grained pseudo-labels based on weak labels and then\nself-training a classifier is currently a promising solution. However, existing\nmethods use only the RGB visual modality and neglect the utilization of\ncategory text information, thus limiting the generation of more accurate\npseudo-labels and affecting the performance of self-training. Inspired by the\nmanual labeling process based on the event description, in this paper, we\npropose a novel pseudo-label generation and self-training framework based on\nText Prompt with Normality Guidance (TPWNG) for WSVAD. Our idea is to transfer\nthe rich language-visual knowledge of the contrastive language-image\npre-training (CLIP) model for aligning the video event description text and\ncorresponding video frames to generate pseudo-labels. Specifically, we first\nfine-tune the CLIP model for domain adaptation by designing two ranking losses\nand a distributional inconsistency loss. Further, we propose a learnable text\nprompt mechanism with the assistance of a normality visual prompt to further\nimprove the matching accuracy of video event description text and video frames.\nThen, we design a pseudo-label generation module based on the normality\nguidance to infer reliable frame-level pseudo-labels. Finally, we introduce a\ntemporal context self-adaptive learning module to learn the temporal\ndependencies of different video events more flexibly and accurately. Extensive\nexperiments show that our method achieves state-of-the-art performance on two\nbenchmark datasets, UCF-Crime and XD-Viole\n","authors":["Zhiwei Yang","Jing Liu","Peng Wu"],"pdf_url":"https://arxiv.org/pdf/2404.08531v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2402.11568v2","updated":"2024-04-12T15:17:17Z","published":"2024-02-18T12:31:29Z","title":"A novel Fourier neural operator framework for classification of\n multi-sized images: Application to three dimensional digital porous media","summary":" Fourier neural operators (FNOs) are invariant with respect to the size of\ninput images, and thus images with any size can be fed into FNO-based\nframeworks without any modification of network architectures, in contrast to\ntraditional convolutional neural networks (CNNs). Leveraging the advantage of\nFNOs, we propose a novel deep-learning framework for classifying images with\nvarying sizes. 
Particularly, we simultaneously train the proposed network on\nmulti-sized images. As a practical application, we consider the problem of\npredicting the label (e.g., permeability) of three-dimensional digital porous\nmedia. To construct the framework, an intuitive approach is to connect FNO\nlayers to a classifier using adaptive max pooling. First, we show that this\napproach is only effective for porous media with fixed sizes, whereas it fails\nfor porous media of varying sizes. To overcome this limitation, we introduce\nour approach: instead of using adaptive max pooling, we use static max pooling\nwith the size of channel width of FNO layers. Since the channel width of the\nFNO layers is independent of input image size, the introduced framework can\nhandle multi-sized images during training. We show the effectiveness of the\nintroduced framework and compare its performance with the intuitive approach\nthrough the example of the classification of three-dimensional digital porous\nmedia of varying sizes.\n","authors":["Ali Kashefi","Tapan Mukerji"],"pdf_url":"https://arxiv.org/pdf/2402.11568v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08526v1","updated":"2024-04-12T15:15:39Z","published":"2024-04-12T15:15:39Z","title":"Masked Image Modeling as a Framework for Self-Supervised Learning across\n Eye Movements","summary":" To make sense of their surroundings, intelligent systems must transform\ncomplex sensory inputs to structured codes that are reduced to task-relevant\ninformation such as object category. Biological agents achieve this in a\nlargely autonomous manner, presumably via self-\\allowbreak super-\\allowbreak\nvised learning. Whereas previous attempts to model the underlying mechanisms\nwere largely discriminative in nature, there is ample evidence that the brain\nemploys a generative model of the world. Here, we propose that eye movements,\nin combination with the focused nature of primate vision, constitute a\ngenerative, self-supervised task of predicting and revealing visual\ninformation. We construct a proof-of-principle model starting from the\nframework of masked image modeling (MIM), a common approach in deep\nrepresentation learning. To do so, we analyze how core components of MIM such\nas masking technique and data augmentation influence the formation of\ncategory-specific representations. This allows us not only to better understand\nthe principles behind MIM, but to then reassemble a MIM more in line with the\nfocused nature of biological perception. From a theoretical angle, we find that\nMIM disentangles neurons in latent space, a property that has been suggested to\nstructure visual representations in primates, without explicit regulation.\nTogether with previous findings of invariance learning, this highlights an\ninteresting connection of MIM to latent regularization approaches for\nself-supervised learning. The source code is available under\nhttps://github.com/RobinWeiler/FocusMIM\n","authors":["Robin Weiler","Matthias Brucklacher","Cyriel M. A. Pennartz","Sander M. Bohté"],"pdf_url":"https://arxiv.org/pdf/2404.08526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11868v3","updated":"2024-04-12T15:01:10Z","published":"2024-03-18T15:22:09Z","title":"View-Consistent 3D Editing with Gaussian Splatting","summary":" The advent of 3D Gaussian Splatting (3DGS) has revolutionized 3D editing,\noffering efficient, high-fidelity rendering and enabling precise local\nmanipulations. 
Currently, diffusion-based 2D editing models are harnessed to\nmodify multi-view rendered images, which then guide the editing of 3DGS models.\nHowever, this approach faces a critical issue of multi-view inconsistency,\nwhere the guidance images exhibit significant discrepancies across views,\nleading to mode collapse and visual artifacts of 3DGS. To this end, we\nintroduce View-consistent Editing (VcEdit), a novel framework that seamlessly\nincorporates 3DGS into image editing processes, ensuring multi-view consistency\nin edited guidance images and effectively mitigating mode collapse issues.\nVcEdit employs two innovative consistency modules: the Cross-attention\nConsistency Module and the Editing Consistency Module, both designed to reduce\ninconsistencies in edited images. By incorporating these consistency modules\ninto an iterative pattern, VcEdit proficiently resolves the issue of multi-view\ninconsistency, facilitating high-quality 3DGS editing across a diverse range of\nscenes.\n","authors":["Yuxuan Wang","Xuanyu Yi","Zike Wu","Na Zhao","Long Chen","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11868v3.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2404.06710v3","updated":"2024-04-12T14:58:21Z","published":"2024-04-10T03:31:32Z","title":"SpikeNVS: Enhancing Novel View Synthesis from Blurry Images via Spike\n Camera","summary":" One of the most critical factors in achieving sharp Novel View Synthesis\n(NVS) using neural field methods like Neural Radiance Fields (NeRF) and 3D\nGaussian Splatting (3DGS) is the quality of the training images. However,\nConventional RGB cameras are susceptible to motion blur. In contrast,\nneuromorphic cameras like event and spike cameras inherently capture more\ncomprehensive temporal information, which can provide a sharp representation of\nthe scene as additional training data. Recent methods have explored the\nintegration of event cameras to improve the quality of NVS. The event-RGB\napproaches have some limitations, such as high training costs and the inability\nto work effectively in the background. Instead, our study introduces a new\nmethod that uses the spike camera to overcome these limitations. By considering\ntexture reconstruction from spike streams as ground truth, we design the\nTexture from Spike (TfS) loss. Since the spike camera relies on temporal\nintegration instead of temporal differentiation used by event cameras, our\nproposed TfS loss maintains manageable training costs. It handles foreground\nobjects with backgrounds simultaneously. We also provide a real-world dataset\ncaptured with our spike-RGB camera system to facilitate future research\nendeavors. We conduct extensive experiments using synthetic and real-world\ndatasets to demonstrate that our design can enhance novel view synthesis across\nNeRF and 3DGS. The code and dataset will be made available for public access.\n","authors":["Gaole Dai","Zhenyu Wang","Qinwen Xu","Ming Lu","Wen Chen","Boxin Shi","Shanghang Zhang","Tiejun Huang"],"pdf_url":"https://arxiv.org/pdf/2404.06710v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08515v1","updated":"2024-04-12T14:54:34Z","published":"2024-04-12T14:54:34Z","title":"ChatGPT and general-purpose AI count fruits in pictures surprisingly\n well","summary":" Object counting is a popular task in deep learning applications in various\ndomains, including agriculture. A conventional deep learning approach requires\na large amount of training data, often a logistic problem in a real-world\napplication. 
To address this issue, we examined how well ChatGPT (GPT4V) and a\ngeneral-purpose AI (foundation model for object counting, T-Rex) can count the\nnumber of fruit bodies (coffee cherries) in 100 images. The foundation model\nwith few-shot learning outperformed the trained YOLOv8 model (R2 = 0.923 and\n0.900, respectively). ChatGPT also showed some interesting potential,\nespecially when few-shot learning with human feedback was applied (R2 = 0.360\nand 0.460, respectively). Moreover, we examined the time required for\nimplementation as a practical question. Obtaining the results with the\nfoundation model and ChatGPT took much less time than with the YOLOv8 model\n(0.83 hrs, 1.75 hrs, and 161 hrs). We interpret these results as two surprises\nfor deep learning users in applied domains: a foundation model with few-shot\ndomain-specific learning can drastically save time and effort compared to the\nconventional approach, and ChatGPT can deliver relatively good performance.\nNeither approach requires coding skills, which can foster AI education and\ndissemination.\n","authors":["Konlavach Mengsuwan","Juan Camilo Rivera Palacio","Masahiro Ryo"],"pdf_url":"https://arxiv.org/pdf/2404.08515v1.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.08514v1","updated":"2024-04-12T14:54:26Z","published":"2024-04-12T14:54:26Z","title":"NIR-Assisted Image Denoising: A Selective Fusion Approach and A\n Real-World Benchmark Dataset","summary":" Despite the significant progress in image denoising, it is still challenging\nto restore fine-scale details while removing noise, especially in extremely\nlow-light environments. Leveraging near-infrared (NIR) images to assist visible\nRGB image denoising shows the potential to address this issue, becoming a\npromising technology. Nonetheless, existing works still struggle with taking\nadvantage of NIR information effectively for real-world image denoising, due to\nthe content inconsistency between NIR-RGB images and the scarcity of real-world\npaired datasets. To alleviate the problem, we propose an efficient Selective\nFusion Module (SFM), which can be plug-and-played into the advanced denoising\nnetworks to merge the deep NIR-RGB features. Specifically, we sequentially\nperform the global and local modulation for NIR and RGB features, and then\nintegrate the two modulated features. Furthermore, we present a Real-world\nNIR-Assisted Image Denoising (Real-NAID) dataset, which covers diverse\nscenarios as well as various noise levels. Extensive experiments on both\nsynthetic and our real-world datasets demonstrate that the proposed method\nachieves better results than state-of-the-art ones. The dataset, codes, and\npre-trained models will be publicly available at\nhttps://github.com/ronjonxu/NAID.\n","authors":["Rongjian Xu","Zhilu Zhang","Renlong Wu","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.08514v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2401.03785v2","updated":"2024-04-12T14:44:04Z","published":"2024-01-08T10:06:52Z","title":"Identifying Important Group of Pixels using Interactions","summary":" To better understand the behavior of image classifiers, it is useful to\nvisualize the contribution of individual pixels to the model prediction. In\nthis study, we propose a method, MoXI ($\\textbf{Mo}$del e$\\textbf{X}$planation\nby $\\textbf{I}$nteractions), that efficiently and accurately identifies a group\nof pixels with high prediction confidence. 
The proposed method employs\ngame-theoretic concepts, Shapley values and interactions, taking into account\nthe effects of individual pixels and the cooperative influence of pixels on\nmodel confidence. Theoretical analysis and experiments demonstrate that our\nmethod better identifies the pixels that are highly contributing to the model\noutputs than widely-used visualization by Grad-CAM, Attention rollout, and\nShapley value. While prior studies have suffered from the exponential\ncomputational cost in the computation of Shapley value and interactions, we\nshow that this can be reduced to quadratic cost for our task. The code is\navailable at https://github.com/KosukeSumiyasu/MoXI.\n","authors":["Kosuke Sumiyasu","Kazuhiko Kawamoto","Hiroshi Kera"],"pdf_url":"https://arxiv.org/pdf/2401.03785v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.08506v1","updated":"2024-04-12T14:40:45Z","published":"2024-04-12T14:40:45Z","title":"LaSagnA: Language-based Segmentation Assistant for Complex Queries","summary":" Recent advancements have empowered Large Language Models for Vision (vLLMs)\nto generate detailed perceptual outcomes, including bounding boxes and masks.\nNonetheless, there are two constraints that restrict the further application of\nthese vLLMs: the incapability of handling multiple targets per query and the\nfailure to identify the absence of query objects in the image. In this study,\nwe acknowledge that the main cause of these problems is the insufficient\ncomplexity of training queries. Consequently, we define the general sequence\nformat for complex queries. Then we incorporate a semantic segmentation task in\nthe current pipeline to fulfill the requirements of training data. Furthermore,\nwe present three novel strategies to effectively handle the challenges arising\nfrom the direct integration of the proposed format. The effectiveness of our\nmodel in processing complex queries is validated by the comparable results with\nconventional methods on both close-set and open-set semantic segmentation\ndatasets. Additionally, we outperform a series of vLLMs in reasoning and\nreferring segmentation, showcasing our model's remarkable capabilities. We\nrelease the code at https://github.com/congvvc/LaSagnA.\n","authors":["Cong Wei","Haoxian Tan","Yujie Zhong","Yujiu Yang","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2404.08506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08504v1","updated":"2024-04-12T14:34:24Z","published":"2024-04-12T14:34:24Z","title":"3D Human Scan With A Moving Event Camera","summary":" Capturing the 3D human body is one of the important tasks in computer vision\nwith a wide range of applications such as virtual reality and sports analysis.\nHowever, conventional frame cameras are limited by their temporal resolution\nand dynamic range, which imposes constraints in real-world application setups.\nEvent cameras have the advantages of high temporal resolution and high dynamic\nrange (HDR), but the development of event-based methods is necessary to handle\ndata with different characteristics. This paper proposes a novel event-based\nmethod for 3D pose estimation and human mesh recovery. Prior work on\nevent-based human mesh recovery require frames (images) as well as event data.\nThe proposed method solely relies on events; it carves 3D voxels by moving the\nevent camera around a stationary body, reconstructs the human pose and mesh by\nattenuated rays, and fit statistical body models, preserving high-frequency\ndetails. 
The experimental results show that the proposed method outperforms\nconventional frame-based methods in the estimation accuracy of both pose and\nbody mesh. We also demonstrate results in challenging situations where a\nconventional camera has motion blur. This is the first to demonstrate\nevent-only human mesh recovery, and we hope that it is the first step toward\nachieving robust and accurate 3D human body scanning from vision sensors.\n","authors":["Kai Kohyama","Shintaro Shiba","Yoshimitsu Aoki"],"pdf_url":"https://arxiv.org/pdf/2404.08504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14991v2","updated":"2024-04-12T14:21:20Z","published":"2023-12-22T11:56:22Z","title":"FoodLMM: A Versatile Food Assistant using Large Multi-modal Model","summary":" Large Multi-modal Models (LMMs) have made impressive progress in many\nvision-language tasks. Nevertheless, the performance of general LMMs in\nspecific domains is still far from satisfactory. This paper proposes FoodLMM, a\nversatile food assistant based on LMMs with various capabilities, including\nfood recognition, ingredient recognition, recipe generation, nutrition\nestimation, food segmentation and multi-round conversation. To facilitate\nFoodLMM to deal with tasks beyond pure text output, we introduce a series of\nnovel task-specific tokens and heads, enabling the model to predict food\nnutritional values and multiple segmentation masks. We adopt a two-stage\ntraining strategy. In the first stage, we utilize multiple public food\nbenchmarks for multi-task learning by leveraging the instruct-following\nparadigm. In the second stage, we construct a multi-round conversation dataset\nand a reasoning segmentation dataset to fine-tune the model, enabling it to\nconduct professional dialogues and generate segmentation masks based on complex\nreasoning in the food domain. Our fine-tuned FoodLMM achieves state-of-the-art\nresults across several food benchmarks. We will make our code, models and\ndatasets publicly available.\n","authors":["Yuehao Yin","Huiyan Qi","Bin Zhu","Jingjing Chen","Yu-Gang Jiang","Chong-Wah Ngo"],"pdf_url":"https://arxiv.org/pdf/2312.14991v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08489v1","updated":"2024-04-12T14:12:03Z","published":"2024-04-12T14:12:03Z","title":"SpectralMamba: Efficient Mamba for Hyperspectral Image Classification","summary":" Recurrent neural networks and Transformers have recently dominated most\napplications in hyperspectral (HS) imaging, owing to their capability to\ncapture long-range dependencies from spectrum sequences. However, despite the\nsuccess of these sequential architectures, the non-ignorable inefficiency\ncaused by either difficulty in parallelization or computationally prohibitive\nattention still hinders their practicality, especially for large-scale\nobservation in remote sensing scenarios. To address this issue, we herein\npropose SpectralMamba -- a novel state space model incorporated efficient deep\nlearning framework for HS image classification. SpectralMamba features the\nsimplified but adequate modeling of HS data dynamics at two levels. First, in\nspatial-spectral space, a dynamical mask is learned by efficient convolutions\nto simultaneously encode spatial regularity and spectral peculiarity, thus\nattenuating the spectral variability and confusion in discriminative\nrepresentation learning. 
Second, the merged spectrum can then be efficiently\noperated in the hidden state space with all parameters learned input-dependent,\nyielding selectively focused responses without reliance on redundant attention\nor imparallelizable recurrence. To explore the room for further computational\ndownsizing, a piece-wise scanning mechanism is employed in-between,\ntransferring approximately continuous spectrum into sequences with squeezed\nlength while maintaining short- and long-term contextual profiles among\nhundreds of bands. Through extensive experiments on four benchmark HS datasets\nacquired by satellite-, aircraft-, and UAV-borne imagers, SpectralMamba\nsurprisingly creates promising win-wins from both performance and efficiency\nperspectives.\n","authors":["Jing Yao","Danfeng Hong","Chenyu Li","Jocelyn Chanussot"],"pdf_url":"https://arxiv.org/pdf/2404.08489v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00513v2","updated":"2024-04-12T13:58:33Z","published":"2024-03-31T01:20:16Z","title":"Transformer based Pluralistic Image Completion with Reduced Information\n Loss","summary":" Transformer based methods have achieved great success in image inpainting\nrecently. However, we find that these solutions regard each pixel as a token,\nthus suffering from an information loss issue from two aspects: 1) They\ndownsample the input image into much lower resolutions for efficiency\nconsideration. 2) They quantize $256^3$ RGB values to a small number (such as\n512) of quantized color values. The indices of quantized pixels are used as\ntokens for the inputs and prediction targets of the transformer. To mitigate\nthese issues, we propose a new transformer based framework called \"PUT\".\nSpecifically, to avoid input downsampling while maintaining computation\nefficiency, we design a patch-based auto-encoder P-VQVAE. The encoder converts\nthe masked image into non-overlapped patch tokens and the decoder recovers the\nmasked regions from the inpainted tokens while keeping the unmasked regions\nunchanged. To eliminate the information loss caused by input quantization, an\nUn-quantized Transformer is applied. It directly takes features from the\nP-VQVAE encoder as input without any quantization and only regards the\nquantized tokens as prediction targets. Furthermore, to make the inpainting\nprocess more controllable, we introduce semantic and structural conditions as\nextra guidance. Extensive experiments show that our method greatly outperforms\nexisting transformer based methods on image fidelity and achieves much higher\ndiversity and better fidelity than state-of-the-art pluralistic inpainting\nmethods on complex large-scale datasets (e.g., ImageNet). Codes are available\nat https://github.com/liuqk3/PUT.\n","authors":["Qiankun Liu","Yuqi Jiang","Zhentao Tan","Dongdong Chen","Ying Fu","Qi Chu","Gang Hua","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2404.00513v2.pdf","comment":"Accepted by TPAMI (2024). arXiv admin note: text overlap with\n arXiv:2205.05076"},{"id":"http://arxiv.org/abs/2404.08477v1","updated":"2024-04-12T13:55:05Z","published":"2024-04-12T13:55:05Z","title":"New Efficient Visual OILU Markers","summary":" Basic patterns are the source of a wide range of more or less complex\ngeometric structures. We will exploit such patterns to develop new efficient\nvisual markers. Besides being projective invariants, the proposed markers allow\nproducing rich panel of unique identifiers, highly required for\nresource-intensive navigation and augmented reality applications. 
The spiral\ntopology of our markers permits the validation of an accurate identification\nscheme, which is based on level set methods. The robustness of the markers\nagainst acquisition and geometric distortions is validated by extensive\nexperimental tests.\n","authors":["Youssef Chahir","Messaoud Mostefai","Hamza Saida"],"pdf_url":"https://arxiv.org/pdf/2404.08477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13570v2","updated":"2024-04-12T13:44:44Z","published":"2023-11-22T18:25:51Z","title":"WildFusion: Learning 3D-Aware Latent Diffusion Models in View Space","summary":" Modern learning-based approaches to 3D-aware image synthesis achieve high\nphotorealism and 3D-consistent viewpoint changes for the generated images.\nExisting approaches represent instances in a shared canonical space. However,\nfor in-the-wild datasets a shared canonical system can be difficult to define\nor might not even exist. In this work, we instead model instances in view\nspace, alleviating the need for posed images and learned camera distributions.\nWe find that in this setting, existing GAN-based methods are prone to\ngenerating flat geometry and struggle with distribution coverage. We hence\npropose WildFusion, a new approach to 3D-aware image synthesis based on latent\ndiffusion models (LDMs). We first train an autoencoder that infers a compressed\nlatent representation, which additionally captures the images' underlying 3D\nstructure and enables not only reconstruction but also novel view synthesis. To\nlearn a faithful 3D representation, we leverage cues from monocular depth\nprediction. Then, we train a diffusion model in the 3D-aware latent space,\nthereby enabling synthesis of high-quality 3D-consistent image samples,\noutperforming recent state-of-the-art GAN-based methods. Importantly, our\n3D-aware LDM is trained without any direct supervision from multiview images or\n3D geometry and does not require posed images or learned pose or camera\ndistributions. It directly learns a 3D representation without relying on\ncanonical camera coordinates. This opens up promising research avenues for\nscalable 3D-aware image synthesis and 3D content creation from in-the-wild\nimage data. See https://katjaschwarz.github.io/wildfusion for videos of our 3D\nresults.\n","authors":["Katja Schwarz","Seung Wook Kim","Jun Gao","Sanja Fidler","Andreas Geiger","Karsten Kreis"],"pdf_url":"https://arxiv.org/pdf/2311.13570v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08452v1","updated":"2024-04-12T13:02:08Z","published":"2024-04-12T13:02:08Z","title":"MoE-FFD: Mixture of Experts for Generalized and Parameter-Efficient Face\n Forgery Detection","summary":" Deepfakes have recently raised significant trust issues and security concerns\namong the public. Compared to CNN face forgery detectors, ViT-based methods\ntake advantage of the expressivity of transformers, achieving superior\ndetection performance. However, these approaches still exhibit the following\nlimitations: (1). Fully fine-tuning ViT-based models from ImageNet weights\ndemands substantial computational and storage resources; (2). ViT-based methods\nstruggle to capture local forgery clues, leading to model bias and limited\ngeneralizability. To tackle these challenges, this work introduces\nMixture-of-Experts modules for Face Forgery Detection (MoE-FFD), a generalized\nyet parameter-efficient ViT-based approach. 
MoE-FFD only updates lightweight\nLow-Rank Adaptation (LoRA) and Adapter layers while keeping the ViT backbone\nfrozen, thereby achieving parameter-efficient training. Moreover, MoE-FFD\nleverages the expressivity of transformers and local priors of CNNs to\nsimultaneously extract global and local forgery clues. Additionally, novel MoE\nmodules are designed to scale the model's capacity and select optimal forgery\nexperts, further enhancing forgery detection performance. The proposed MoE\nlearning scheme can be seamlessly adapted to various transformer backbones in a\nplug-and-play manner. Extensive experimental results demonstrate that the\nproposed method achieves state-of-the-art face forgery detection performance\nwith reduced parameter overhead. The code will be released upon acceptance.\n","authors":["Chenqi Kong","Anwei Luo","Song Xia","Yi Yu","Haoliang Li","Alex C. Kot"],"pdf_url":"https://arxiv.org/pdf/2404.08452v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08450v1","updated":"2024-04-12T13:01:22Z","published":"2024-04-12T13:01:22Z","title":"Joint Physical-Digital Facial Attack Detection Via Simulating Spoofing\n Clues","summary":" Face recognition systems are frequently subjected to a variety of physical\nand digital attacks of different types. Previous methods have achieved\nsatisfactory performance in scenarios that address physical attacks and digital\nattacks, respectively. However, few methods are considered to integrate a model\nthat simultaneously addresses both physical and digital attacks, implying the\nnecessity to develop and maintain multiple models. To jointly detect physical\nand digital attacks within a single model, we propose an innovative approach\nthat can adapt to any network architecture. Our approach mainly contains two\ntypes of data augmentation, which we call Simulated Physical Spoofing Clues\naugmentation (SPSC) and Simulated Digital Spoofing Clues augmentation (SDSC).\nSPSC and SDSC augment live samples into simulated attack samples by simulating\nspoofing clues of physical and digital attacks, respectively, which\nsignificantly improve the capability of the model to detect \"unseen\" attack\ntypes. Extensive experiments show that SPSC and SDSC can achieve\nstate-of-the-art generalization in Protocols 2.1 and 2.2 of the UniAttackData\ndataset, respectively. Our method won first place in \"Unified Physical-Digital\nFace Attack Detection\" of the 5th Face Anti-spoofing Challenge@CVPR2024. Our\nfinal submission obtains 3.75% APCER, 0.93% BPCER, and 2.34% ACER,\nrespectively. Our code is available at\nhttps://github.com/Xianhua-He/cvpr2024-face-anti-spoofing-challenge.\n","authors":["Xianhua He","Dashuang Liang","Song Yang","Zhanlong Hao","Hui Ma","Binjie Mao","Xi Li","Yao Wang","Pengfei Yan","Ajian Liu"],"pdf_url":"https://arxiv.org/pdf/2404.08450v1.pdf","comment":"10 pages with 6 figures, Accepted by CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.08449v1","updated":"2024-04-12T13:00:06Z","published":"2024-04-12T13:00:06Z","title":"OccGaussian: 3D Gaussian Splatting for Occluded Human Rendering","summary":" Rendering dynamic 3D human from monocular videos is crucial for various\napplications such as virtual reality and digital entertainment. Most methods\nassume the people is in an unobstructed scene, while various objects may cause\nthe occlusion of body parts in real-life scenarios. 
Previous methods utilize\nNeRF for surface rendering to recover the occluded areas, but they require more\nthan one day to train and several seconds to render, failing to meet the\nrequirements of real-time interactive applications. To address these issues, we\npropose OccGaussian based on 3D Gaussian Splatting, which can be trained within\n6 minutes and produces high-quality human renderings up to 160 FPS with\noccluded input. OccGaussian initializes 3D Gaussian distributions in the\ncanonical space, and we perform an occlusion feature query at occluded regions;\nthe aggregated pixel-aligned feature is extracted to compensate for the missing\ninformation. Then we use a Gaussian Feature MLP to further process the feature\nalong with the occlusion-aware loss functions to better perceive the occluded\narea. Extensive experiments in both simulated and real-world occlusions\ndemonstrate that our method achieves comparable or even superior performance\ncompared to the state-of-the-art method, while improving training and\ninference speeds by 250x and 800x, respectively. Our code will be available for\nresearch purposes.\n","authors":["Jingrui Ye","Zongkai Zhang","Yujiao Jiang","Qingmin Liao","Wenming Yang","Zongqing Lu"],"pdf_url":"https://arxiv.org/pdf/2404.08449v1.pdf","comment":"12 April, 2024; originally announced April 2024"},{"id":"http://arxiv.org/abs/2404.08433v1","updated":"2024-04-12T12:30:48Z","published":"2024-04-12T12:30:48Z","title":"MSSTNet: A Multi-Scale Spatio-Temporal CNN-Transformer Network for\n Dynamic Facial Expression Recognition","summary":" Unlike typical video action recognition, Dynamic Facial Expression\nRecognition (DFER) does not involve distinct moving targets but relies on\nlocalized changes in facial muscles. Addressing this distinctive attribute, we\npropose a Multi-Scale Spatio-temporal CNN-Transformer network (MSSTNet). Our\napproach takes spatial features of different scales extracted by CNN and feeds\nthem into a Multi-scale Embedding Layer (MELayer). The MELayer extracts\nmulti-scale spatial information and encodes these features before sending them\ninto a Temporal Transformer (T-Former). The T-Former simultaneously extracts\ntemporal information while continually integrating multi-scale spatial\ninformation. This process culminates in the generation of multi-scale\nspatio-temporal features that are utilized for the final classification. Our\nmethod achieves state-of-the-art results on two in-the-wild datasets.\nFurthermore, a series of ablation experiments and visualizations provide\nfurther validation of our approach's proficiency in leveraging spatio-temporal\ninformation within DFER.\n","authors":["Linhuang Wang","Xin Kang","Fei Ding","Satoshi Nakagawa","Fuji Ren"],"pdf_url":"https://arxiv.org/pdf/2404.08433v1.pdf","comment":"Accepted to 2024 IEEE International Conference on Acoustics, Speech,\n and Signal Processing (ICASSP 2024)"},{"id":"http://arxiv.org/abs/2404.08421v1","updated":"2024-04-12T12:10:53Z","published":"2024-04-12T12:10:53Z","title":"Adapting the Segment Anything Model During Usage in Novel Situations","summary":" The interactive segmentation task consists in the creation of object\nsegmentation masks based on user interactions. The most common way to guide a\nmodel towards producing a correct segmentation consists in clicks on the object\nand background. 
The recently published Segment Anything Model (SAM) supports a\ngeneralized version of the interactive segmentation problem and has been\ntrained on an object segmentation dataset which contains 1.1B masks. Though\nSAM has been trained extensively and with the explicit purpose of serving as a\nfoundation model, we show that it has significant limitations when applied to\ninteractive segmentation on novel domains or object types. On the used\ndatasets, SAM displays a failure rate $\\text{FR}_{30}@90$ of up to $72.6 \\%$.\nSince we still want such foundation models to be immediately applicable, we\npresent a framework that can adapt SAM during immediate usage. For this, we\nleverage the user interactions and masks, which are constructed during the\ninteractive segmentation process. We use this information to generate\npseudo-labels, which we use to compute a loss function and optimize a part of\nthe SAM model. The presented method achieves a relative reduction of up to $48.1\n\\%$ in the $\\text{FR}_{20}@85$ and $46.6 \\%$ in the $\\text{FR}_{30}@90$\nmetrics.\n","authors":["Robin Schön","Julian Lorenz","Katja Ludwig","Rainer Lienhart"],"pdf_url":"https://arxiv.org/pdf/2404.08421v1.pdf","comment":"11 pages, 2 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.08419v1","updated":"2024-04-12T12:08:06Z","published":"2024-04-12T12:08:06Z","title":"Direct May Not Be the Best: An Incremental Evolution View of Pose\n Generation","summary":" Pose diversity is an inherent representative characteristic of 2D images. Due\nto the 3D to 2D projection mechanism, there is evident content discrepancy\namong distinct pose images. This is the main obstacle hindering pose\ntransformation related research. To deal with this challenge, we propose a\nfine-grained incremental evolution centered pose generation framework, rather\nthan the traditional direct one-to-one generation. Since the proposed approach\nbypasses the theoretical difficulty of directly modeling dramatic non-linear\nvariation, the incurred content distortion and blurring can be effectively\nconstrained, while at the same time the various individual pose details, especially\nclothes texture, can be precisely maintained. In order to systematically\nguide the evolution course, both global and incremental evolution constraints\nare elaborately designed and merged into the overall framework. Moreover, a novel\ntriple-path knowledge fusion structure is designed to take full advantage of\nall available valuable knowledge to conduct high-quality pose synthesis. In\naddition, our framework can generate a series of valuable byproducts, namely\nthe various intermediate poses. Extensive experiments have been conducted to\nverify the effectiveness of the proposed approach. Code is available at\nhttps://github.com/Xiaofei-CN/Incremental-Evolution-Pose-Generation.\n","authors":["Yuelong Li","Tengfei Xiao","Lei Geng","Jianming Wang"],"pdf_url":"https://arxiv.org/pdf/2404.08419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06707v2","updated":"2024-04-12T11:56:18Z","published":"2023-04-13T17:56:08Z","title":"Toward Reliable Human Pose Forecasting with Uncertainty","summary":" Recently, there has been an arms race of pose forecasting methods aimed at\nsolving the spatio-temporal task of predicting a sequence of future 3D poses of\na person given a sequence of past observed ones. However, the lack of unified\nbenchmarks and limited uncertainty analysis have hindered progress in the\nfield. 
To address this, we first develop an open-source library for human pose\nforecasting, including multiple models, supporting several datasets, and\nemploying standardized evaluation metrics, with the aim of promoting research\nand moving toward a unified and consistent evaluation. Second, we devise two\ntypes of uncertainty in the problem to increase performance and convey better\ntrust: 1) we propose a method for modeling aleatoric uncertainty by using\nuncertainty priors to inject knowledge about the pattern of uncertainty. This\nfocuses the capacity of the model in the direction of more meaningful\nsupervision while reducing the number of learned parameters and improving\nstability; 2) we introduce a novel approach for quantifying the epistemic\nuncertainty of any model through clustering and measuring the entropy of its\nassignments. Our experiments demonstrate up to $25\\%$ improvements in\nforecasting at short horizons, with no loss on longer horizons on Human3.6M,\nAMSS, and 3DPW datasets, and better performance in uncertainty estimation. The\ncode is available online at https://github.com/vita-epfl/UnPOSed.\n","authors":["Saeed Saadatnejad","Mehrshad Mirmohammadi","Matin Daghyani","Parham Saremi","Yashar Zoroofchi Benisi","Amirhossein Alimohammadi","Zahra Tehraninasab","Taylor Mordan","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2304.06707v2.pdf","comment":"Published in RA-L 2024"},{"id":"http://arxiv.org/abs/2404.08406v1","updated":"2024-04-12T11:33:26Z","published":"2024-04-12T11:33:26Z","title":"MambaDFuse: A Mamba-based Dual-phase Model for Multi-modality Image\n Fusion","summary":" Multi-modality image fusion (MMIF) aims to integrate complementary\ninformation from different modalities into a single fused image to represent\nthe imaging scene and facilitate downstream visual tasks comprehensively. In\nrecent years, significant progress has been made in MMIF tasks due to advances\nin deep neural networks. However, existing methods cannot effectively and\nefficiently extract modality-specific and modality-fused features constrained\nby the inherent local reductive bias (CNN) or quadratic computational\ncomplexity (Transformers). To overcome this issue, we propose a Mamba-based\nDual-phase Fusion (MambaDFuse) model. Firstly, a dual-level feature extractor\nis designed to capture long-range features from single-modality images by\nextracting low and high-level features from CNN and Mamba blocks. Then, a\ndual-phase feature fusion module is proposed to obtain fusion features that\ncombine complementary information from different modalities. It uses the\nchannel exchange method for shallow fusion and the enhanced Multi-modal Mamba\n(M3) blocks for deep fusion. Finally, the fused image reconstruction module\nutilizes the inverse transformation of the feature extraction to generate the\nfused result. Through extensive experiments, our approach achieves promising\nfusion results in infrared-visible image fusion and medical image fusion.\nAdditionally, in a unified benchmark, MambaDFuse has also demonstrated improved\nperformance in downstream tasks such as object detection. 
Code with checkpoints\nwill be available after the peer-review process.\n","authors":["Zhe Li","Haiwei Pan","Kejia Zhang","Yuhua Wang","Fengming Yu"],"pdf_url":"https://arxiv.org/pdf/2404.08406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08401v1","updated":"2024-04-12T11:15:15Z","published":"2024-04-12T11:15:15Z","title":"No Bells, Just Whistles: Sports Field Registration by Leveraging\n Geometric Properties","summary":" Broadcast sports field registration is traditionally addressed as a\nhomography estimation task, mapping the visible image area to a planar field\nmodel, predominantly focusing on the main camera shot. Addressing the\nshortcomings of previous approaches, we propose a novel calibration pipeline\nenabling camera calibration using a 3D soccer field model and extending the\nprocess to assess the multiple-view nature of broadcast videos. Our approach\nbegins with a keypoint generation pipeline derived from SoccerNet dataset\nannotations, leveraging the geometric properties of the court. Subsequently, we\nexecute classical camera calibration through DLT algorithm in a minimalist\nfashion, without further refinement. Through extensive experimentation on\nreal-world soccer broadcast datasets such as SoccerNet-Calibration, WorldCup\n2014 and TS- WorldCup, our method demonstrates superior performance in both\nmultiple- and single-view 3D camera calibration while maintaining competitive\nresults in homography estimation compared to state-of-the-art techniques.\n","authors":["Marc Gutiérrez-Pérez","Antonio Agudo"],"pdf_url":"https://arxiv.org/pdf/2404.08401v1.pdf","comment":"Accepted in CVPRW 2024"},{"id":"http://arxiv.org/abs/2105.03026v2","updated":"2024-04-12T11:14:04Z","published":"2021-05-07T01:32:37Z","title":"Efficient Masked Face Recognition Method during the COVID-19 Pandemic","summary":" The coronavirus disease (COVID-19) is an unparalleled crisis leading to a\nhuge number of casualties and security problems. In order to reduce the spread\nof coronavirus, people often wear masks to protect themselves. This makes face\nrecognition a very difficult task since certain parts of the face are hidden. A\nprimary focus of researchers during the ongoing coronavirus pandemic is to come\nup with suggestions to handle this problem through rapid and efficient\nsolutions. In this paper, we propose a reliable method based on occlusion\nremoval and deep learning-based features in order to address the problem of the\nmasked face recognition process. The first step is to remove the masked face\nregion. Next, we apply three pre-trained deep Convolutional Neural Networks\n(CNN) namely, VGG-16, AlexNet, and ResNet-50, and use them to extract deep\nfeatures from the obtained regions (mostly eyes and forehead regions). 
The\nBag-of-features paradigm is then applied to the feature maps of the last\nconvolutional layer in order to quantize them and to get a slight\nrepresentation comparing to the fully connected layer of classical CNN.\nFinally, Multilayer Perceptron (MLP) is applied for the classification process.\nExperimental results on Real-World-Masked-Face-Dataset show high recognition\nperformance compared to other state-of-the-art methods.\n","authors":["Walid Hariri"],"pdf_url":"https://arxiv.org/pdf/2105.03026v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08399v1","updated":"2024-04-12T11:08:26Z","published":"2024-04-12T11:08:26Z","title":"Mitigating Challenges of the Space Environment for Onboard Artificial\n Intelligence: Design Overview of the Imaging Payload on SpIRIT","summary":" Artificial intelligence (AI) and autonomous edge computing in space are\nemerging areas of interest to augment capabilities of nanosatellites, where\nmodern sensors generate orders of magnitude more data than can typically be\ntransmitted to mission control. Here, we present the hardware and software\ndesign of an onboard AI subsystem hosted on SpIRIT. The system is optimised for\non-board computer vision experiments based on visible light and long wave\ninfrared cameras. This paper highlights the key design choices made to maximise\nthe robustness of the system in harsh space conditions, and their motivation\nrelative to key mission requirements, such as limited compute resources,\nresilience to cosmic radiation, extreme temperature variations, distribution\nshifts, and very low transmission bandwidths. The payload, called Loris,\nconsists of six visible light cameras, three infrared cameras, a camera control\nboard and a Graphics Processing Unit (GPU) system-on-module. Loris enables the\nexecution of AI models with on-orbit fine-tuning as well as a next-generation\nimage compression algorithm, including progressive coding. This innovative\napproach not only enhances the data processing capabilities of nanosatellites\nbut also lays the groundwork for broader applications to remote sensing from\nspace.\n","authors":["Miguel Ortiz del Castillo","Jonathan Morgan","Jack McRobbie","Clint Therakam","Zaher Joukhadar","Robert Mearns","Simon Barraclough","Richard Sinnott","Andrew Woods","Chris Bayliss","Kris Ehinger","Ben Rubinstein","James Bailey","Airlie Chapman","Michele Trenti"],"pdf_url":"https://arxiv.org/pdf/2404.08399v1.pdf","comment":"AI4Space 2024, 3rd Workshop on AI for Space, CVPR 2024"},{"id":"http://arxiv.org/abs/2404.08392v1","updated":"2024-04-12T10:54:11Z","published":"2024-04-12T10:54:11Z","title":"NC-TTT: A Noise Contrastive Approach for Test-Time Training","summary":" Despite their exceptional performance in vision tasks, deep learning models\noften struggle when faced with domain shifts during testing. Test-Time Training\n(TTT) methods have recently gained popularity by their ability to enhance the\nrobustness of models through the addition of an auxiliary objective that is\njointly optimized with the main task. Being strictly unsupervised, this\nauxiliary objective is used at test time to adapt the model without any access\nto labels. In this work, we propose Noise-Contrastive Test-Time Training\n(NC-TTT), a novel unsupervised TTT technique based on the discrimination of\nnoisy feature maps. By learning to classify noisy views of projected feature\nmaps, and then adapting the model accordingly on new domains, classification\nperformance can be recovered by an important margin. 
Experiments on several\npopular test-time adaptation baselines demonstrate the advantages of our method\ncompared to recent approaches for this task. The code can be found\nat:https://github.com/GustavoVargasHakim/NCTTT.git\n","authors":["David Osowiechi","Gustavo A. Vargas Hakim","Mehrdad Noori","Milad Cheraghalikhani","Ali Bahri","Moslem Yazdanpanah","Ismail Ben Ayed","Christian Desrosiers"],"pdf_url":"https://arxiv.org/pdf/2404.08392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04385v2","updated":"2024-04-12T10:15:45Z","published":"2024-03-07T10:25:23Z","title":"Impacts of Color and Texture Distortions on Earth Observation Data in\n Deep Learning","summary":" Land cover classification and change detection are two important applications\nof remote sensing and Earth observation (EO) that have benefited greatly from\nthe advances of deep learning. Convolutional and transformer-based U-net models\nare the state-of-the-art architectures for these tasks, and their performances\nhave been boosted by an increased availability of large-scale annotated EO\ndatasets. However, the influence of different visual characteristics of the\ninput EO data on a model's predictions is not well understood. In this work we\nsystematically examine model sensitivities with respect to several color- and\ntexture-based distortions on the input EO data during inference, given models\nthat have been trained without such distortions. We conduct experiments with\nmultiple state-of-the-art segmentation networks for land cover classification\nand show that they are in general more sensitive to texture than to color\ndistortions. Beyond revealing intriguing characteristics of widely used land\ncover classification models, our results can also be used to guide the\ndevelopment of more robust models within the EO domain.\n","authors":["Martin Willbo","Aleksis Pirinen","John Martinsson","Edvin Listo Zec","Olof Mogren","Mikael Nilsson"],"pdf_url":"https://arxiv.org/pdf/2403.04385v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08363v1","updated":"2024-04-12T10:04:03Z","published":"2024-04-12T10:04:03Z","title":"Let It Flow: Simultaneous Optimization of 3D Flow and Object Clustering","summary":" We study the problem of self-supervised 3D scene flow estimation from real\nlarge-scale raw point cloud sequences, which is crucial to various tasks like\ntrajectory prediction or instance segmentation. In the absence of ground truth\nscene flow labels, contemporary approaches concentrate on deducing optimizing\nflow across sequential pairs of point clouds by incorporating structure based\nregularization on flow and object rigidity. The rigid objects are estimated by\na variety of 3D spatial clustering methods. While state-of-the-art methods\nsuccessfully capture overall scene motion using the Neural Prior structure,\nthey encounter challenges in discerning multi-object motions. We identified the\nstructural constraints and the use of large and strict rigid clusters as the\nmain pitfall of the current approaches and we propose a novel clustering\napproach that allows for combination of overlapping soft clusters as well as\nnon-overlapping rigid clusters representation. Flow is then jointly estimated\nwith progressively growing non-overlapping rigid clusters together with fixed\nsize overlapping soft clusters. We evaluate our method on multiple datasets\nwith LiDAR point clouds, demonstrating the superior performance over the\nself-supervised baselines reaching new state of the art results. 
Our method\nespecially excels in resolving flow in complicated dynamic scenes with multiple\nindependently moving objects close to each other which includes pedestrians,\ncyclists and other vulnerable road users. Our codes will be publicly available.\n","authors":["Patrik Vacek","David Hurych","Tomáš Svoboda","Karel Zimmermann"],"pdf_url":"https://arxiv.org/pdf/2404.08363v1.pdf","comment":"ECCV submission"},{"id":"http://arxiv.org/abs/2404.08353v1","updated":"2024-04-12T09:44:18Z","published":"2024-04-12T09:44:18Z","title":"TDANet: Target-Directed Attention Network For Object-Goal Visual\n Navigation With Zero-Shot Ability","summary":" The generalization of the end-to-end deep reinforcement learning (DRL) for\nobject-goal visual navigation is a long-standing challenge since object classes\nand placements vary in new test environments. Learning domain-independent\nvisual representation is critical for enabling the trained DRL agent with the\nability to generalize to unseen scenes and objects. In this letter, a\ntarget-directed attention network (TDANet) is proposed to learn the end-to-end\nobject-goal visual navigation policy with zero-shot ability. TDANet features a\nnovel target attention (TA) module that learns both the spatial and semantic\nrelationships among objects to help TDANet focus on the most relevant observed\nobjects to the target. With the Siamese architecture (SA) design, TDANet\ndistinguishes the difference between the current and target states and\ngenerates the domain-independent visual representation. To evaluate the\nnavigation performance of TDANet, extensive experiments are conducted in the\nAI2-THOR embodied AI environment. The simulation results demonstrate a strong\ngeneralization ability of TDANet to unseen scenes and target objects, with\nhigher navigation success rate (SR) and success weighted by length (SPL) than\nother state-of-the-art models.\n","authors":["Shiwei Lian","Feitian Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.08353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16588v2","updated":"2024-04-12T09:38:33Z","published":"2023-09-28T16:45:46Z","title":"Vision Transformers Need Registers","summary":" Transformers have recently emerged as a powerful tool for learning visual\nrepresentations. In this paper, we identify and characterize artifacts in\nfeature maps of both supervised and self-supervised ViT networks. The artifacts\ncorrespond to high-norm tokens appearing during inference primarily in\nlow-informative background areas of images, that are repurposed for internal\ncomputations. We propose a simple yet effective solution based on providing\nadditional tokens to the input sequence of the Vision Transformer to fill that\nrole. 
We show that this solution fixes that problem entirely for both\nsupervised and self-supervised models, sets a new state of the art for\nself-supervised visual models on dense visual prediction tasks, enables object\ndiscovery methods with larger models, and most importantly leads to smoother\nfeature maps and attention maps for downstream visual processing.\n","authors":["Timothée Darcet","Maxime Oquab","Julien Mairal","Piotr Bojanowski"],"pdf_url":"https://arxiv.org/pdf/2309.16588v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16254v2","updated":"2024-04-12T09:37:37Z","published":"2023-11-27T19:02:17Z","title":"Safe-CLIP: Removing NSFW Concepts from Vision-and-Language Models","summary":" Large-scale vision-and-language models, such as CLIP, are typically trained\non web-scale data, which can introduce inappropriate content and lead to the\ndevelopment of unsafe and biased behavior. This, in turn, hampers their\napplicability in sensitive and trustworthy contexts and could raise significant\nconcerns in their adoption. Our research introduces a novel approach to\nenhancing the safety of vision-and-language models by diminishing their\nsensitivity to NSFW (not safe for work) inputs. In particular, our methodology\nseeks to sever \"toxic\" linguistic and visual concepts, unlearning the linkage\nbetween unsafe linguistic or visual items and unsafe regions of the embedding\nspace. We show how this can be done by fine-tuning a CLIP model on synthetic\ndata obtained from a large language model trained to convert between safe and\nunsafe sentences, and a text-to-image generator. We conduct extensive\nexperiments on the resulting embedding space for cross-modal retrieval,\ntext-to-image, and image-to-text generation, where we show that our model can\nbe remarkably employed with pre-trained generative models. Our source code and\ntrained models are available at: https://github.com/aimagelab/safe-clip.\n","authors":["Samuele Poppi","Tobia Poppi","Federico Cocchi","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2311.16254v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07236v2","updated":"2024-04-12T09:34:38Z","published":"2024-04-08T08:50:09Z","title":"Lightweight Deep Learning for Resource-Constrained Environments: A\n Survey","summary":" Over the past decade, the dominance of deep learning has prevailed across\nvarious domains of artificial intelligence, including natural language\nprocessing, computer vision, and biomedical signal processing. While there have\nbeen remarkable improvements in model accuracy, deploying these models on\nlightweight devices, such as mobile phones and microcontrollers, is constrained\nby limited resources. In this survey, we provide comprehensive design guidance\ntailored for these devices, detailing the meticulous design of lightweight\nmodels, compression methods, and hardware acceleration strategies. The\nprincipal goal of this work is to explore methods and concepts for getting\naround hardware constraints without compromising the model's accuracy.\nAdditionally, we explore two notable paths for lightweight deep learning in the\nfuture: deployment techniques for TinyML and Large Language Models. 
Although\nthese paths undoubtedly have potential, they also present significant\nchallenges, encouraging research into unexplored areas.\n","authors":["Hou-I Liu","Marco Galindo","Hongxia Xie","Lai-Kuan Wong","Hong-Han Shuai","Yung-Hui Li","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.07236v2.pdf","comment":"40 pages"},{"id":"http://arxiv.org/abs/2404.08351v1","updated":"2024-04-12T09:31:55Z","published":"2024-04-12T09:31:55Z","title":"OmniSat: Self-Supervised Modality Fusion for Earth Observation","summary":" The field of Earth Observations (EO) offers a wealth of data from diverse\nsensors, presenting a great opportunity for advancing self-supervised\nmultimodal learning. However, current multimodal EO datasets and models focus\non a single data type, either mono-date images or time series, which limits\ntheir expressivity. We introduce OmniSat, a novel architecture that exploits\nthe spatial alignment between multiple EO modalities to learn expressive\nmultimodal representations without labels. To demonstrate the advantages of\ncombining modalities of different natures, we augment two existing datasets\nwith new modalities. As demonstrated on three downstream tasks: forestry, land\ncover classification, and crop mapping. OmniSat can learn rich representations\nin an unsupervised manner, leading to improved performance in the semi- and\nfully-supervised settings, even when only one modality is available for\ninference. The code and dataset are available at github.com/gastruc/OmniSat.\n","authors":["Guillaume Astruc","Nicolas Gonthier","Clement Mallet","Loic Landrieu"],"pdf_url":"https://arxiv.org/pdf/2404.08351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08350v1","updated":"2024-04-12T09:31:11Z","published":"2024-04-12T09:31:11Z","title":"Self-Supervised k-Space Regularization for Motion-Resolved Abdominal MRI\n Using Neural Implicit k-Space Representation","summary":" Neural implicit k-space representations have shown promising results for\ndynamic MRI at high temporal resolutions. Yet, their exclusive training in\nk-space limits the application of common image regularization methods to\nimprove the final reconstruction. In this work, we introduce the concept of\nparallel imaging-inspired self-consistency (PISCO), which we incorporate as\nnovel self-supervised k-space regularization enforcing a consistent\nneighborhood relationship. At no additional data cost, the proposed\nregularization significantly improves neural implicit k-space reconstructions\non simulated data. Abdominal in-vivo reconstructions using PISCO result in\nenhanced spatio-temporal image quality compared to state-of-the-art methods.\nCode is available at https://github.com/vjspi/PISCO-NIK.\n","authors":["Veronika Spieker","Hannah Eichhorn","Jonathan K. Stelter","Wenqi Huang","Rickmer F. Braren","Daniel Rückert","Francisco Sahli Costabal","Kerstin Hammernik","Claudia Prieto","Dimitrios C. Karampinos","Julia A. Schnabel"],"pdf_url":"https://arxiv.org/pdf/2404.08350v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2404.08347v1","updated":"2024-04-12T09:22:24Z","published":"2024-04-12T09:22:24Z","title":"Learning to Rebalance Multi-Modal Optimization by Adaptively Masking\n Subnetworks","summary":" Multi-modal learning aims to enhance performance by unifying models from\nvarious modalities but often faces the \"modality imbalance\" problem in real\ndata, leading to a bias towards dominant modalities and neglecting others,\nthereby limiting its overall effectiveness. 
To address this challenge, the core\nidea is to balance the optimization of each modality to achieve a joint\noptimum. Existing approaches often employ a modal-level control mechanism for\nadjusting the update of each modal parameter. However, such a global-wise\nupdating mechanism ignores the different importance of each parameter. Inspired\nby subnetwork optimization, we explore a uniform sampling-based optimization\nstrategy and find it more effective than global-wise updating. According to the\nfindings, we further propose a novel importance sampling-based, element-wise\njoint optimization method, called Adaptively Mask Subnetworks Considering Modal\nSignificance(AMSS). Specifically, we incorporate mutual information rates to\ndetermine the modal significance and employ non-uniform adaptive sampling to\nselect foreground subnetworks from each modality for parameter updates, thereby\nrebalancing multi-modal learning. Additionally, we demonstrate the reliability\nof the AMSS strategy through convergence analysis. Building upon theoretical\ninsights, we further enhance the multi-modal mask subnetwork strategy using\nunbiased estimation, referred to as AMSS+. Extensive experiments reveal the\nsuperiority of our approach over comparison methods.\n","authors":["Yang Yang","Hongpeng Pan","Qing-Yuan Jiang","Yi Xu","Jinghui Tang"],"pdf_url":"https://arxiv.org/pdf/2404.08347v1.pdf","comment":"17 pages;6 figures"},{"id":"http://arxiv.org/abs/2308.09372v2","updated":"2024-04-12T09:21:33Z","published":"2023-08-18T08:06:49Z","title":"Which Transformer to Favor: A Comparative Analysis of Efficiency in\n Vision Transformers","summary":" Transformers come with a high computational cost, yet their effectiveness in\naddressing problems in language and vision has sparked extensive research aimed\nat enhancing their efficiency. However, diverse experimental conditions,\nspanning multiple input domains, prevent a fair comparison based solely on\nreported results, posing challenges for model selection. To address this gap in\ncomparability, we design a comprehensive benchmark of more than 30 models for\nimage classification, evaluating key efficiency aspects, including accuracy,\nspeed, and memory usage. This benchmark provides a standardized baseline across\nthe landscape of efficiency-oriented transformers and our framework of\nanalysis, based on Pareto optimality, reveals surprising insights. Despite\nclaims of other models being more efficient, ViT remains Pareto optimal across\nmultiple metrics. We observe that hybrid attention-CNN models exhibit\nremarkable inference memory- and parameter-efficiency. Moreover, our benchmark\nshows that using a larger model in general is more efficient than using higher\nresolution images. Thanks to our holistic evaluation, we provide a centralized\nresource for practitioners and researchers, facilitating informed decisions\nwhen selecting transformers or measuring progress of the development of\nefficient transformers.\n","authors":["Tobias Christian Nauen","Sebastian Palacio","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2308.09372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08341v1","updated":"2024-04-12T09:13:37Z","published":"2024-04-12T09:13:37Z","title":"Counterfactual Explanations for Face Forgery Detection via Adversarial\n Removal of Artifacts","summary":" Highly realistic AI generated face forgeries known as deepfakes have raised\nserious social concerns. 
Although DNN-based face forgery detection models have\nachieved good performance, they are vulnerable to latest generative methods\nthat have less forgery traces and adversarial attacks. This limitation of\ngeneralization and robustness hinders the credibility of detection results and\nrequires more explanations. In this work, we provide counterfactual\nexplanations for face forgery detection from an artifact removal perspective.\nSpecifically, we first invert the forgery images into the StyleGAN latent\nspace, and then adversarially optimize their latent representations with the\ndiscrimination supervision from the target detection model. We verify the\neffectiveness of the proposed explanations from two aspects: (1) Counterfactual\nTrace Visualization: the enhanced forgery images are useful to reveal artifacts\nby visually contrasting the original images and two different visualization\nmethods; (2) Transferable Adversarial Attacks: the adversarial forgery images\ngenerated by attacking the detection model are able to mislead other detection\nmodels, implying the removed artifacts are general. Extensive experiments\ndemonstrate that our method achieves over 90% attack success rate and superior\nattack transferability. Compared with naive adversarial noise methods, our\nmethod adopts both generative and discriminative model priors, and optimize the\nlatent representations in a synthesis-by-analysis way, which forces the search\nof counterfactual explanations on the natural face manifold. Thus, more general\ncounterfactual traces can be found and better adversarial attack\ntransferability can be achieved.\n","authors":["Yang Li","Songlin Yang","Wei Wang","Ziwen He","Bo Peng","Jing Dong"],"pdf_url":"https://arxiv.org/pdf/2404.08341v1.pdf","comment":"Accepted to ICME2024"},{"id":"http://arxiv.org/abs/2404.07762v2","updated":"2024-04-12T09:13:29Z","published":"2024-04-11T14:03:16Z","title":"NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous\n Driving","summary":" We present a versatile NeRF-based simulator for testing autonomous driving\n(AD) software systems, designed with a focus on sensor-realistic closed-loop\nevaluation and the creation of safety-critical scenarios. The simulator learns\nfrom sequences of real-world driving sensor data and enables reconfigurations\nand renderings of new, unseen scenarios. In this work, we use our simulator to\ntest the responses of AD models to safety-critical scenarios inspired by the\nEuropean New Car Assessment Programme (Euro NCAP). Our evaluation reveals that,\nwhile state-of-the-art end-to-end planners excel in nominal driving scenarios\nin an open-loop setting, they exhibit critical flaws when navigating our\nsafety-critical scenarios in a closed-loop setting. This highlights the need\nfor advancements in the safety and real-world usability of end-to-end planners.\nBy publicly releasing our simulator and scenarios as an easy-to-run evaluation\nsuite, we invite the research community to explore, refine, and validate their\nAD models in controlled, yet highly configurable and challenging\nsensor-realistic environments. 
Code and instructions can be found at\nhttps://github.com/wljungbergh/NeuroNCAP\n","authors":["William Ljungbergh","Adam Tonderski","Joakim Johnander","Holger Caesar","Kalle Åström","Michael Felsberg","Christoffer Petersson"],"pdf_url":"https://arxiv.org/pdf/2404.07762v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16794v2","updated":"2024-04-12T09:04:05Z","published":"2023-12-28T02:54:34Z","title":"ZONE: Zero-Shot Instruction-Guided Local Editing","summary":" Recent advances in vision-language models like Stable Diffusion have shown\nremarkable power in creative image synthesis and editing.However, most existing\ntext-to-image editing methods encounter two obstacles: First, the text prompt\nneeds to be carefully crafted to achieve good results, which is not intuitive\nor user-friendly. Second, they are insensitive to local edits and can\nirreversibly affect non-edited regions, leaving obvious editing traces. To\ntackle these problems, we propose a Zero-shot instructiON-guided local image\nEditing approach, termed ZONE. We first convert the editing intent from the\nuser-provided instruction (e.g., \"make his tie blue\") into specific image\nediting regions through InstructPix2Pix. We then propose a Region-IoU scheme\nfor precise image layer extraction from an off-the-shelf segment model. We\nfurther develop an edge smoother based on FFT for seamless blending between the\nlayer and the image.Our method allows for arbitrary manipulation of a specific\nregion with a single instruction while preserving the rest. Extensive\nexperiments demonstrate that our ZONE achieves remarkable local editing results\nand user-friendliness, outperforming state-of-the-art methods. Code is\navailable at https://github.com/lsl001006/ZONE.\n","authors":["Shanglin Li","Bohan Zeng","Yutang Feng","Sicheng Gao","Xuhui Liu","Jiaming Liu","Li Lin","Xu Tang","Yao Hu","Jianzhuang Liu","Baochang Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.16794v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.06567v2","updated":"2024-04-12T08:52:24Z","published":"2024-03-11T10:06:45Z","title":"Leveraging Foundation Models for Content-Based Medical Image Retrieval\n in Radiology","summary":" Content-based image retrieval (CBIR) has the potential to significantly\nimprove diagnostic aid and medical research in radiology. Current CBIR systems\nface limitations due to their specialization to certain pathologies, limiting\ntheir utility. In response, we propose using vision foundation models as\npowerful and versatile off-the-shelf feature extractors for content-based\nmedical image retrieval. By benchmarking these models on a comprehensive\ndataset of 1.6 million 2D radiological images spanning four modalities and 161\npathologies, we identify weakly-supervised models as superior, achieving a P@1\nof up to 0.594. This performance not only competes with a specialized model but\ndoes so without the need for fine-tuning. Our analysis further explores the\nchallenges in retrieving pathological versus anatomical structures, indicating\nthat accurate retrieval of pathological features presents greater difficulty.\nDespite these challenges, our research underscores the vast potential of\nfoundation models for CBIR in radiology, proposing a shift towards versatile,\ngeneral-purpose medical image retrieval systems that do not require specific\ntuning.\n","authors":["Stefan Denner","David Zimmerer","Dimitrios Bounias","Markus Bujotzek","Shuhan Xiao","Lisa Kausch","Philipp Schader","Tobias Penzkofer","Paul F. 
Jäger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2403.06567v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08330v1","updated":"2024-04-12T08:46:53Z","published":"2024-04-12T08:46:53Z","title":"Emerging Property of Masked Token for Effective Pre-training","summary":" Driven by the success of Masked Language Modeling (MLM), the realm of\nself-supervised learning for computer vision has been invigorated by the\ncentral role of Masked Image Modeling (MIM) in driving recent breakthroughs.\nNotwithstanding the achievements of MIM across various downstream tasks, its\noverall efficiency is occasionally hampered by the lengthy duration of the\npre-training phase. This paper presents a perspective that the optimization of\nmasked tokens as a means of addressing the prevailing issue. Initially, we\ndelve into an exploration of the inherent properties that a masked token ought\nto possess. Within the properties, we principally dedicated to articulating and\nemphasizing the `data singularity' attribute inherent in masked tokens. Through\na comprehensive analysis of the heterogeneity between masked tokens and visible\ntokens within pre-trained models, we propose a novel approach termed masked\ntoken optimization (MTO), specifically designed to improve model efficiency\nthrough weight recalibration and the enhancement of the key property of masked\ntokens. The proposed method serves as an adaptable solution that seamlessly\nintegrates into any MIM approach that leverages masked tokens. As a result, MTO\nachieves a considerable improvement in pre-training efficiency, resulting in an\napproximately 50% reduction in pre-training epochs required to attain converged\nperformance of the recent approaches.\n","authors":["Hyesong Choi","Hunsang Lee","Seyoung Joung","Hyejin Park","Jiyeong Kim","Dongbo Min"],"pdf_url":"https://arxiv.org/pdf/2404.08330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01449v2","updated":"2024-04-12T08:40:55Z","published":"2024-03-03T09:07:16Z","title":"DUFOMap: Efficient Dynamic Awareness Mapping","summary":" The dynamic nature of the real world is one of the main challenges in\nrobotics. The first step in dealing with it is to detect which parts of the\nworld are dynamic. A typical benchmark task is to create a map that contains\nonly the static part of the world to support, for example, localization and\nplanning. Current solutions are often applied in post-processing, where\nparameter tuning allows the user to adjust the setting for a specific dataset.\nIn this paper, we propose DUFOMap, a novel dynamic awareness mapping framework\ndesigned for efficient online processing. Despite having the same parameter\nsettings for all scenarios, it performs better or is on par with\nstate-of-the-art methods. Ray casting is utilized to identify and classify\nfully observed empty regions. Since these regions have been observed empty, it\nfollows that anything inside them at another time must be dynamic. Evaluation\nis carried out in various scenarios, including outdoor environments in KITTI\nand Argoverse 2, open areas on the KTH campus, and with different sensor types.\nDUFOMap outperforms the state of the art in terms of accuracy and computational\nefficiency. The source code, benchmarks, and links to the datasets utilized are\nprovided. 
See https://kth-rpl.github.io/dufomap for more details.\n","authors":["Daniel Duberg","Qingwen Zhang","MingKai Jia","Patric Jensfelt"],"pdf_url":"https://arxiv.org/pdf/2403.01449v2.pdf","comment":"The first two authors hold equal contribution. 8 pages, 7 figures,\n project page https://kth-rpl.github.io/dufomap"},{"id":"http://arxiv.org/abs/2404.08327v1","updated":"2024-04-12T08:38:51Z","published":"2024-04-12T08:38:51Z","title":"Salience-Based Adaptive Masking: Revisiting Token Dynamics for Enhanced\n Pre-training","summary":" In this paper, we introduce Saliency-Based Adaptive Masking (SBAM), a novel\nand cost-effective approach that significantly enhances the pre-training\nperformance of Masked Image Modeling (MIM) approaches by prioritizing token\nsalience. Our method provides robustness against variations in masking ratios,\neffectively mitigating the performance instability issues common in existing\nmethods. This relaxes the sensitivity of MIM-based pre-training to masking\nratios, which in turn allows us to propose an adaptive strategy for `tailored'\nmasking ratios for each data sample, which no existing method can provide.\nToward this goal, we propose an Adaptive Masking Ratio (AMR) strategy that\ndynamically adjusts the proportion of masking for the unique content of each\nimage based on token salience. We show that our method significantly improves\nover the state-of-the-art in mask-based pre-training on the ImageNet-1K\ndataset.\n","authors":["Hyesong Choi","Hyejin Park","Kwang Moo Yi","Sungmin Cha","Dongbo Min"],"pdf_url":"https://arxiv.org/pdf/2404.08327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.14335v2","updated":"2024-04-12T08:37:47Z","published":"2021-09-29T10:41:41Z","title":"A Systematic Survey of Deep Learning-based Single-Image Super-Resolution","summary":" Single-image super-resolution (SISR) is an important task in image\nprocessing, which aims to enhance the resolution of imaging systems. Recently,\nSISR has made a huge leap and has achieved promising results with the help of\ndeep learning (DL). In this survey, we give an overview of DL-based SISR\nmethods and group them according to their design targets. Specifically, we\nfirst introduce the problem definition, research background, and the\nsignificance of SISR. Secondly, we introduce some related works, including\nbenchmark datasets, upsampling methods, optimization objectives, and image\nquality assessment methods. Thirdly, we provide a detailed investigation of\nSISR and give some domain-specific applications of it. Fourthly, we present the\nreconstruction results of some classic SISR methods to intuitively know their\nperformance. Finally, we discuss some issues that still exist in SISR and\nsummarize some new trends and future directions. This is an exhaustive survey\nof SISR, which can help researchers better understand SISR and inspire more\nexciting research in this field. An investigation project for SISR is provided\nat https://github.com/CV-JunchengLi/SISR-Survey.\n","authors":["Juncheng Li","Zehua Pei","Wenjie Li","Guangwei Gao","Longguang Wang","Yingqian Wang","Tieyong Zeng"],"pdf_url":"https://arxiv.org/pdf/2109.14335v2.pdf","comment":"40 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.07537v2","updated":"2024-04-12T08:18:44Z","published":"2024-04-11T08:03:23Z","title":"How is Visual Attention Influenced by Text Guidance? 
Database and Model","summary":" The analysis and prediction of visual attention have long been crucial tasks\nin the fields of computer vision and image processing. In practical\napplications, images are generally accompanied by various text descriptions,\nhowever, few studies have explored the influence of text descriptions on visual\nattention, let alone developed visual saliency prediction models considering\ntext guidance. In this paper, we conduct a comprehensive study on text-guided\nimage saliency (TIS) from both subjective and objective perspectives.\nSpecifically, we construct a TIS database named SJTU-TIS, which includes 1200\ntext-image pairs and the corresponding collected eye-tracking data. Based on\nthe established SJTU-TIS database, we analyze the influence of various text\ndescriptions on visual attention. Then, to facilitate the development of\nsaliency prediction models considering text influence, we construct a benchmark\nfor the established SJTU-TIS database using state-of-the-art saliency models.\nFinally, considering the effect of text descriptions on visual attention, while\nmost existing saliency models ignore this impact, we further propose a\ntext-guided saliency (TGSal) prediction model, which extracts and integrates\nboth image features and text features to predict the image saliency under\nvarious text-description conditions. Our proposed model significantly\noutperforms the state-of-the-art saliency models on both the SJTU-TIS database\nand the pure image saliency databases in terms of various evaluation metrics.\nThe SJTU-TIS database and the code of the proposed TGSal model will be released\nat: https://github.com/IntMeGroup/TGSal.\n","authors":["Yinan Sun","Xiongkuo Min","Huiyu Duan","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2404.07537v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08312v1","updated":"2024-04-12T08:14:17Z","published":"2024-04-12T08:14:17Z","title":"GPN: Generative Point-based NeRF","summary":" Scanning real-life scenes with modern registration devices typically gives\nincomplete point cloud representations, primarily due to the limitations of\npartial scanning, 3D occlusions, and dynamic light conditions. Recent works on\nprocessing incomplete point clouds have always focused on point cloud\ncompletion. However, these approaches do not ensure consistency between the\ncompleted point cloud and the captured images regarding color and geometry. We\npropose using Generative Point-based NeRF (GPN) to reconstruct and repair a\npartial cloud by fully utilizing the scanning images and the corresponding\nreconstructed cloud. The repaired point cloud can achieve multi-view\nconsistency with the captured images at high spatial resolution. For the\nfinetunes of a single scene, we optimize the global latent condition by\nincorporating an Auto-Decoder architecture while retaining multi-view\nconsistency. As a result, the generated point clouds are smooth, plausible, and\ngeometrically consistent with the partial scanning images. 
Extensive\nexperiments on ShapeNet demonstrate that our works achieve competitive\nperformances to the other state-of-the-art point cloud-based neural scene\nrendering and editing performances.\n","authors":["Haipeng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.08312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08801v4","updated":"2024-04-12T07:48:45Z","published":"2024-02-05T12:33:37Z","title":"CoBra: Complementary Branch Fusing Class and Semantic Knowledge for\n Robust Weakly Supervised Semantic Segmentation","summary":" Leveraging semantically precise pseudo masks derived from image-level class\nknowledge for segmentation, namely image-level Weakly Supervised Semantic\nSegmentation (WSSS), still remains challenging. While Class Activation Maps\n(CAMs) using CNNs have steadily been contributing to the success of WSSS, the\nresulting activation maps often narrowly focus on class-specific parts (e.g.,\nonly face of human). On the other hand, recent works based on vision\ntransformers (ViT) have shown promising results based on their self-attention\nmechanism to capture the semantic parts but fail in capturing complete\nclass-specific details (e.g., entire body parts of human but also with a dog\nnearby). In this work, we propose Complementary Branch (CoBra), a novel dual\nbranch framework consisting of two distinct architectures which provide\nvaluable complementary knowledge of class (from CNN) and semantic (from ViT) to\neach branch. In particular, we learn Class-Aware Projection (CAP) for the CNN\nbranch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly\nfuse their complementary knowledge and facilitate a new type of extra\npatch-level supervision. Our model, through CoBra, fuses CNN and ViT's\ncomplementary outputs to create robust pseudo masks that integrate both class\nand semantic information effectively. Extensive experiments qualitatively and\nquantitatively investigate how CNN and ViT complement each other on the PASCAL\nVOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not\nonly the masks generated by our model, but also the segmentation results\nderived from utilizing these masks as pseudo labels.\n","authors":["Woojung Han","Seil Kang","Kyobin Choo","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.08801v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17759v4","updated":"2024-04-12T07:44:25Z","published":"2024-01-31T11:36:12Z","title":"Rapid post-disaster infrastructure damage characterisation enabled by\n remote sensing and deep learning technologies -- a tiered approach","summary":" Critical infrastructure, such as transport networks and bridges, are\nsystematically targeted during wars and suffer damage during extensive natural\ndisasters because it is vital for enabling connectivity and transportation of\npeople and goods, and hence, underpins national and international economic\ngrowth. Mass destruction of transport assets, in conjunction with minimal or no\naccessibility in the wake of natural and anthropogenic disasters, prevents us\nfrom delivering rapid recovery and adaptation. As a result, systemic\noperability is drastically reduced, leading to low levels of resilience. Thus,\nthere is a need for rapid assessment of its condition to allow for informed\ndecision-making for restoration prioritisation. A solution to this challenge is\nto use technology that enables stand-off observations. 
Nevertheless, no methods\nexist for automated characterisation of damage at multiple scales, i.e.\nregional (e.g., network), asset (e.g., bridges), and structural (e.g., road\npavement) scales. We propose a methodology based on an integrated, multi-scale\ntiered approach to fill this capability gap. In doing so, we demonstrate how\nautomated damage characterisation can be enabled by fit-for-purpose digital\ntechnologies. Next, the methodology is applied and validated to a case study in\nUkraine that includes 17 bridges, damaged by human targeted interventions. From\nregional to component scale, we deploy technology to integrate assessments\nusing Sentinel-1 SAR images, crowdsourced information, and high-resolution\nimages for deep learning to facilitate automatic damage detection and\ncharacterisation. For the first time, the interferometric coherence difference\nand semantic segmentation of images were deployed in a tiered multi-scale\napproach to improve the reliability of damage characterisations at different\nscales.\n","authors":["Nadiia Kopiika","Andreas Karavias","Pavlos Krassakis","Zehao Ye","Jelena Ninic","Nataliya Shakhovska","Nikolaos Koukouzas","Sotirios Argyroudis","Stergios-Aristoteles Mitoulis"],"pdf_url":"https://arxiv.org/pdf/2401.17759v4.pdf","comment":"43 pages; 20 figures"},{"id":"http://arxiv.org/abs/2310.12877v4","updated":"2024-04-12T07:43:35Z","published":"2023-10-19T16:32:18Z","title":"Perceptual Assessment and Optimization of High Dynamic Range Image\n Rendering","summary":" High dynamic range (HDR) rendering has the ability to faithfully reproduce\nthe wide luminance ranges in natural scenes, but how to accurately assess the\nrendering quality is relatively underexplored. Existing quality models are\nmostly designed for low dynamic range (LDR) images, and do not align well with\nhuman perception of HDR image quality. To fill this gap, we propose a family of\nHDR quality metrics, in which the key step is employing a simple inverse\ndisplay model to decompose an HDR image into a stack of LDR images with varying\nexposures. Subsequently, these decomposed images are assessed through\nwell-established LDR quality metrics. Our HDR quality models present three\ndistinct benefits. First, they directly inherit the recent advancements of LDR\nquality metrics. Second, they do not rely on human perceptual data of HDR image\nquality for re-calibration. Third, they facilitate the alignment and\nprioritization of specific luminance ranges for more accurate and detailed\nquality assessment. Experimental results show that our HDR quality metrics\nconsistently outperform existing models in terms of quality assessment on four\nHDR image quality datasets and perceptual optimization of HDR novel view\nsynthesis.\n","authors":["Peibei Cao","Rafal K. Mantiuk","Kede Ma"],"pdf_url":"https://arxiv.org/pdf/2310.12877v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08298v1","updated":"2024-04-12T07:41:17Z","published":"2024-04-12T07:41:17Z","title":"Interference Motion Removal for Doppler Radar Vital Sign Detection Using\n Variational Encoder-Decoder Neural Network","summary":" The treatment of interfering motion contributions remains one of the key\nchallenges in the domain of radar-based vital sign monitoring. Removal of the\ninterference to extract the vital sign contributions is demanding due to\noverlapping Doppler bands, the complex structure of the interference motions\nand significant variations in the power levels of their contributions. 
A novel\napproach to the removal of interference through the use of a probabilistic deep\nlearning model is presented. Results show that a convolutional encoder-decoder\nneural network with a variational objective is capable of learning a meaningful\nrepresentation space of vital sign Doppler-time distribution facilitating their\nextraction from a mixture signal. The approach is tested on semi-experimental\ndata containing real vital sign signatures and simulated returns from\ninterfering body motions. The application of the proposed network enhances the\nextraction of the micro-Doppler frequency corresponding to the respiration rate\nis demonstrated.\n","authors":["Mikolaj Czerkawski","Christos Ilioudis","Carmine Clemente","Craig Michie","Ivan Andonovic","Christos Tachtatzis"],"pdf_url":"https://arxiv.org/pdf/2404.08298v1.pdf","comment":"Presented at 2021 IEEE Radar Conference (RadarConf21)"},{"id":"http://arxiv.org/abs/2404.08293v1","updated":"2024-04-12T07:30:52Z","published":"2024-04-12T07:30:52Z","title":"Overcoming Scene Context Constraints for Object Detection in wild using\n Defilters","summary":" This paper focuses on improving object detection performance by addressing\nthe issue of image distortions, commonly encountered in uncontrolled\nacquisition environments. High-level computer vision tasks such as object\ndetection, recognition, and segmentation are particularly sensitive to image\ndistortion. To address this issue, we propose a novel approach employing an\nimage defilter to rectify image distortion prior to object detection. This\nmethod enhances object detection accuracy, as models perform optimally when\ntrained on non-distorted images. Our experiments demonstrate that utilizing\ndefiltered images significantly improves mean average precision compared to\ntraining object detection models on distorted images. Consequently, our\nproposed method offers considerable benefits for real-world applications\nplagued by image distortion. To our knowledge, the contribution lies in\nemploying distortion-removal paradigm for object detection on images captured\nin natural settings. We achieved an improvement of 0.562 and 0.564 of mean\nAverage precision on validation and test data.\n","authors":["Vamshi Krishna Kancharla","Neelam sinha"],"pdf_url":"https://arxiv.org/pdf/2404.08293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08292v1","updated":"2024-04-12T07:30:24Z","published":"2024-04-12T07:30:24Z","title":"AdaContour: Adaptive Contour Descriptor with Hierarchical Representation","summary":" Existing angle-based contour descriptors suffer from lossy representation for\nnon-starconvex shapes. By and large, this is the result of the shape being\nregistered with a single global inner center and a set of radii corresponding\nto a polar coordinate parameterization. In this paper, we propose AdaContour,\nan adaptive contour descriptor that uses multiple local representations to\ndesirably characterize complex shapes. After hierarchically encoding object\nshapes in a training set and constructing a contour matrix of all subdivided\nregions, we compute a robust low-rank robust subspace and approximate each\nlocal contour by linearly combining the shared basis vectors to represent an\nobject. Experiments show that AdaContour is able to represent shapes more\naccurately and robustly than other descriptors while retaining effectiveness.\nWe validate AdaContour by integrating it into off-the-shelf detectors to enable\ninstance segmentation which demonstrates faithful performance. 
The code is\navailable at https://github.com/tding1/AdaContour.\n","authors":["Tianyu Ding","Jinxin Zhou","Tianyi Chen","Zhihui Zhu","Ilya Zharkov","Luming Liang"],"pdf_url":"https://arxiv.org/pdf/2404.08292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08291v1","updated":"2024-04-12T07:30:08Z","published":"2024-04-12T07:30:08Z","title":"On Input Formats for Radar Micro-Doppler Signature Processing by\n Convolutional Neural Networks","summary":" Convolutional neural networks have often been proposed for processing radar\nMicro-Doppler signatures, most commonly with the goal of classifying the\nsignals. The majority of works tend to disregard phase information from the\ncomplex time-frequency representation. Here, the utility of the phase\ninformation, as well as the optimal format of the Doppler-time input for a\nconvolutional neural network, is analysed. It is found that the performance\nachieved by convolutional neural network classifiers is heavily influenced by\nthe type of input representation, even across formats with equivalent\ninformation. Furthermore, it is demonstrated that the phase component of the\nDoppler-time representation contains rich information useful for classification\nand that unwrapping the phase in the temporal dimension can improve the results\ncompared to a magnitude-only solution, improving accuracy from 0.920 to 0.938\non the tested human activity dataset. Further improvement of 0.947 is achieved\nby training a linear classifier on embeddings from multiple-formats.\n","authors":["Mikolaj Czerkawski","Carmine Clemente","Craig Michie","Christos Tachtatzis"],"pdf_url":"https://arxiv.org/pdf/2404.08291v1.pdf","comment":"Presented at International Conference on Radar Systems (RADAR 2022)"},{"id":"http://arxiv.org/abs/2404.08285v1","updated":"2024-04-12T07:19:16Z","published":"2024-04-12T07:19:16Z","title":"A Survey of Neural Network Robustness Assessment in Image Recognition","summary":" In recent years, there has been significant attention given to the robustness\nassessment of neural networks. Robustness plays a critical role in ensuring\nreliable operation of artificial intelligence (AI) systems in complex and\nuncertain environments. Deep learning's robustness problem is particularly\nsignificant, highlighted by the discovery of adversarial attacks on image\nclassification models. Researchers have dedicated efforts to evaluate\nrobustness in diverse perturbation conditions for image recognition tasks.\nRobustness assessment encompasses two main techniques: robustness verification/\ncertification for deliberate adversarial attacks and robustness testing for\nrandom data corruptions. In this survey, we present a detailed examination of\nboth adversarial robustness (AR) and corruption robustness (CR) in neural\nnetwork assessment. Analyzing current research papers and standards, we provide\nan extensive overview of robustness assessment in image recognition. Three\nessential aspects are analyzed: concepts, metrics, and assessment methods. We\ninvestigate the perturbation metrics and range representations used to measure\nthe degree of perturbations on images, as well as the robustness metrics\nspecifically for the robustness conditions of classification models. 
The\nstrengths and limitations of the existing methods are also discussed, and some\npotential directions for future research are provided.\n","authors":["Jie Wang","Jun Ai","Minyan Lu","Haoran Su","Dan Yu","Yutao Zhang","Junda Zhu","Jingyu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.08285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08281v1","updated":"2024-04-12T07:13:32Z","published":"2024-04-12T07:13:32Z","title":"Calibration & Reconstruction: Deep Integrated Language for Referring\n Image Segmentation","summary":" Referring image segmentation aims to segment an object referred to by a natural\nlanguage expression from an image. The primary challenge lies in the efficient\npropagation of fine-grained semantic information from textual features to\nvisual features. Many recent works utilize a Transformer to address this\nchallenge. However, conventional transformer decoders can distort linguistic\ninformation with deeper layers, leading to suboptimal results. In this paper,\nwe introduce CRFormer, a model that iteratively calibrates multi-modal features\nin the transformer decoder. We start by generating language queries using\nvision features, emphasizing different aspects of the input language. Then, we\npropose a novel Calibration Decoder (CDec) wherein the multi-modal features can be\niteratively calibrated by the input language features. In the Calibration\nDecoder, we use the output of each decoder layer and the original language\nfeatures to generate new queries for continuous calibration, which gradually\nupdates the language features. Based on CDec, we introduce a Language\nReconstruction Module and a reconstruction loss. This module leverages queries\nfrom the final layer of the decoder to reconstruct the input language and\ncompute the reconstruction loss. This can further prevent the language\ninformation from being lost or distorted. Our experiments consistently show the\nsuperior performance of our approach across the RefCOCO, RefCOCO+, and G-Ref\ndatasets compared to state-of-the-art methods.\n","authors":["Yichen Yan","Xingjian He","Sihan Chen","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2404.08281v1.pdf","comment":"9 pages, 8 figures ICMR2024. arXiv admin note: text overlap with\n arXiv:2305.14969"},{"id":"http://arxiv.org/abs/2404.08279v1","updated":"2024-04-12T07:08:05Z","published":"2024-04-12T07:08:05Z","title":"Convolutional neural network classification of cancer cytopathology\n images: taking breast cancer as an example","summary":" Breast cancer is a relatively common cancer among gynecological cancers. Its\ndiagnosis often relies on the pathology of cells in the lesion. The\npathological diagnosis of breast cancer not only requires professional expertise and\ntime, but also sometimes involves subjective judgment. To address the\nchallenges of dependence on pathologists' expertise and the time-consuming\nnature of achieving accurate breast pathological image classification, this\npaper introduces an approach utilizing convolutional neural networks (CNNs) for\nthe rapid categorization of pathological images, aiming to enhance the\nefficiency of breast pathological image detection. The approach enables the\nrapid and automatic classification of pathological images into benign and\nmalignant groups. The methodology involves utilizing a convolutional neural\nnetwork (CNN) model leveraging the Inceptionv3 architecture and a transfer\nlearning algorithm for extracting features from pathological images. 
A neural\nnetwork with fully connected layers and a SoftMax function is then employed\nfor image classification. Additionally, the concept of image partitioning is\nintroduced to handle high-resolution images. To achieve the ultimate\nclassification outcome, the classification probabilities of each image block\nare aggregated using three algorithms: summation, product, and maximum.\nExperimental validation was conducted on the BreaKHis public dataset, resulting\nin accuracy rates surpassing 0.92 across all four magnification coefficients\n(40X, 100X, 200X, and 400X). This demonstrates that the proposed method\neffectively enhances the accuracy in classifying pathological images of breast\ncancer.\n","authors":["MingXuan Xiao","Yufeng Li","Xu Yan","Min Gao","Weimin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.08279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02492v3","updated":"2024-04-12T07:06:52Z","published":"2023-10-03T23:44:35Z","title":"FairVision: Equitable Deep Learning for Eye Disease Screening via Fair\n Identity Scaling","summary":" Equity in AI for healthcare is crucial due to its direct impact on human\nwell-being. Despite advancements in 2D medical imaging fairness, the fairness\nof 3D models remains underexplored, hindered by the small sizes of 3D fairness\ndatasets. Since 3D imaging surpasses 2D imaging in SOTA clinical care, it is\ncritical to understand the fairness of these 3D models. To address this\nresearch gap, we conduct the first comprehensive study on the fairness of 3D\nmedical imaging models across multiple protected attributes. Our investigation\nspans both 2D and 3D models and evaluates fairness across five architectures on\nthree common eye diseases, revealing significant biases across race, gender,\nand ethnicity. To alleviate these biases, we propose a novel fair identity\nscaling (FIS) method that improves both overall performance and fairness,\noutperforming various SOTA fairness methods. Moreover, we release\nHarvard-FairVision, the first large-scale medical fairness dataset with 30,000\nsubjects featuring both 2D and 3D imaging data and six demographic identity\nattributes. Harvard-FairVision provides labels for three major eye disorders\naffecting about 380 million people worldwide, serving as a valuable resource\nfor both 2D and 3D fairness learning. Our code and dataset are publicly\naccessible at\n\\url{https://ophai.hms.harvard.edu/datasets/harvard-fairvision30k}.\n","authors":["Yan Luo","Muhammad Osama Khan","Yu Tian","Min Shi","Zehao Dou","Tobias Elze","Yi Fang","Mengyu Wang"],"pdf_url":"https://arxiv.org/pdf/2310.02492v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08277v1","updated":"2024-04-12T07:04:56Z","published":"2024-04-12T07:04:56Z","title":"FaceFilterSense: A Filter-Resistant Face Recognition and Facial\n Attribute Analysis Framework","summary":" With the advent of social media, fun selfie filters have come into tremendous\nmainstream use, affecting the functioning of facial biometric systems as well as\nimage recognition systems. These filters vary from beautification filters and\nAugmented Reality (AR)-based filters to filters that modify facial landmarks.\nHence, there is a need to assess the impact of such filters on the performance\nof existing face recognition systems. 
The limitation associated with existing\nsolutions is that they focus mainly on beautification filters.\nHowever, current AR-based filters and filters that distort facial key\npoints are now in vogue and make faces highly unrecognizable even to\nthe naked eye. Also, the filters considered are mostly obsolete with limited\nvariations. To mitigate these limitations, we aim to perform a holistic impact\nanalysis of the latest filters and propose a user recognition model with the\nfiltered images. We have utilized a benchmark dataset for baseline images, and\napplied the latest filters over them to generate a beautified/filtered dataset.\nNext, we have introduced a model, FaceFilterNet, for beautified user recognition.\nIn this framework, we also utilize our model to comment on various attributes\nof the person, including age, gender, and ethnicity. In addition, we have also\npresented a filter-wise impact analysis on face recognition, age estimation,\ngender, and ethnicity prediction. The proposed method affirms the efficacy of\nour dataset with an accuracy of 87.25% and an optimal accuracy for facial\nattribute analysis.\n","authors":["Shubham Tiwari","Yash Sethia","Ritesh Kumar","Ashwani Tanwar","Rudresh Dwivedi"],"pdf_url":"https://arxiv.org/pdf/2404.08277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08273v1","updated":"2024-04-12T06:52:40Z","published":"2024-04-12T06:52:40Z","title":"Struggle with Adversarial Defense? Try Diffusion","summary":" Adversarial attacks induce misclassification by introducing subtle\nperturbations. Recently, diffusion models have been applied to image classifiers\nto improve adversarial robustness through adversarial training or by purifying\nadversarial noise. However, diffusion-based adversarial training often\nencounters convergence challenges and high computational expenses.\nAdditionally, diffusion-based purification inevitably causes data shift and is\ndeemed susceptible to stronger adaptive attacks. To tackle these issues, we\npropose the Truth Maximization Diffusion Classifier (TMDC), a generative\nBayesian classifier that builds upon pre-trained diffusion models and the\nBayesian theorem. Unlike data-driven classifiers, TMDC, guided by Bayesian\nprinciples, utilizes the conditional likelihood from diffusion models to\ndetermine the class probabilities of input images, thereby insulating against\nthe influences of data shift and the limitations of adversarial training.\nMoreover, to enhance TMDC's resilience against more potent adversarial attacks,\nwe propose an optimization strategy for diffusion classifiers. 
This strategy\ninvolves post-training the diffusion model on perturbed datasets with\nground-truth labels as conditions, guiding the diffusion model to learn the\ndata distribution and maximizing the likelihood under the ground-truth labels.\nThe proposed method achieves state-of-the-art performance on the CIFAR10\ndataset against heavy white-box attacks and strong adaptive attacks.\nSpecifically, TMDC achieves robust accuracies of 82.81% against $l_{\\infty}$\nnorm-bounded perturbations and 86.05% against $l_{2}$ norm-bounded\nperturbations, respectively, with $\\epsilon=0.05$.\n","authors":["Yujie Li","Yanbin Wang","Haitao xu","Bin Liu","Jianguo Sun","Zhenhao Guo","Wenrui Ma"],"pdf_url":"https://arxiv.org/pdf/2404.08273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.05516v2","updated":"2024-04-12T06:51:06Z","published":"2022-06-11T12:39:37Z","title":"Deep Learning-Based MR Image Re-parameterization","summary":" Magnetic resonance (MR) image re-parameterization refers to the process of\ngenerating via simulations of an MR image with a new set of MRI scanning\nparameters. Different parameter values generate distinct contrast between\ndifferent tissues, helping identify pathologic tissue. Typically, more than one\nscan is required for diagnosis; however, acquiring repeated scans can be\ncostly, time-consuming, and difficult for patients. Thus, using MR image\nre-parameterization to predict and estimate the contrast in these imaging scans\ncan be an effective alternative. In this work, we propose a novel deep learning\n(DL) based convolutional model for MRI re-parameterization. Based on our\npreliminary results, DL-based techniques hold the potential to learn the\nnon-linearities that govern the re-parameterization.\n","authors":["Abhijeet Narang","Abhigyan Raj","Mihaela Pop","Mehran Ebrahimi"],"pdf_url":"https://arxiv.org/pdf/2206.05516v2.pdf","comment":"A. Narang, A. Raj, M. Pop and M. Ebrahimi, \"Deep Learning-Based MR\n Image Re-parameterization,\" 2023 Congress in Computer Science, Computer\n Engineering, & Applied Computing (CSCE), Las Vegas, NV, USA, 2023, pp.\n 536-541, doi: 10.1109/CSCE60160.2023.00094"},{"id":"http://arxiv.org/abs/2303.03761v2","updated":"2024-04-12T06:42:47Z","published":"2023-03-07T09:56:23Z","title":"Graph Neural Networks in Vision-Language Image Understanding: A Survey","summary":" 2D image understanding is a complex problem within computer vision, but it\nholds the key to providing human-level scene comprehension. It goes further\nthan identifying the objects in an image, and instead, it attempts to\nunderstand the scene. Solutions to this problem form the underpinning of a\nrange of tasks, including image captioning, visual question answering (VQA),\nand image retrieval. Graphs provide a natural way to represent the relational\narrangement between objects in an image, and thus, in recent years graph neural\nnetworks (GNNs) have become a standard component of many 2D image understanding\npipelines, becoming a core architectural component, especially in the VQA group\nof tasks. In this survey, we review this rapidly evolving field and we provide\na taxonomy of graph types used in 2D image understanding approaches, a\ncomprehensive list of the GNN models used in this domain, and a roadmap of\nfuture potential developments. 
To the best of our knowledge, this is the first\ncomprehensive survey that covers image captioning, visual question answering,\nand image retrieval techniques that focus on using GNNs as the main part of\ntheir architecture.\n","authors":["Henry Senior","Gregory Slabaugh","Shanxin Yuan","Luca Rossi"],"pdf_url":"https://arxiv.org/pdf/2303.03761v2.pdf","comment":"20 pages, 5 figures, 5 tables"},{"id":"http://arxiv.org/abs/2404.08264v1","updated":"2024-04-12T06:23:48Z","published":"2024-04-12T06:23:48Z","title":"Guided Masked Self-Distillation Modeling for Distributed Multimedia\n Sensor Event Analysis","summary":" Observations with distributed sensors are essential in analyzing a series of\nhuman and machine activities (referred to as 'events' in this paper) in complex\nand extensive real-world environments. This is because the information obtained\nfrom a single sensor is often missing or fragmented in such an environment;\nobservations from multiple locations and modalities should be integrated to\nanalyze events comprehensively. However, a learning method has yet to be\nestablished to extract joint representations that effectively combine such\ndistributed observations. Therefore, we propose Guided Masked sELf-Distillation\nmodeling (Guided-MELD) for inter-sensor relationship modeling. The basic idea\nof Guided-MELD is to learn to supplement the information from the masked sensor\nwith information from other sensors needed to detect the event. Guided-MELD is\nexpected to enable the system to effectively distill the fragmented or\nredundant target event information obtained by the sensors without being overly\ndependent on any specific sensors. To validate the effectiveness of the\nproposed method in novel tasks of distributed multimedia sensor event analysis,\nwe recorded two new datasets that fit the problem setting: MM-Store and\nMM-Office. These datasets consist of human activities in a convenience store\nand an office, recorded using distributed cameras and microphones. Experimental\nresults on these datasets show that the proposed Guided-MELD improves event\ntagging and detection performance and outperforms conventional inter-sensor\nrelationship modeling methods. Furthermore, the proposed method performed\nrobustly even when sensors were reduced.\n","authors":["Masahiro Yasuda","Noboru Harada","Yasunori Ohishi","Shoichiro Saito","Akira Nakayama","Nobutaka Ono"],"pdf_url":"https://arxiv.org/pdf/2404.08264v1.pdf","comment":"13page, 7figure, under review"},{"id":"http://arxiv.org/abs/2312.16837v3","updated":"2024-04-12T06:23:45Z","published":"2023-12-28T05:46:26Z","title":"DiffusionGAN3D: Boosting Text-guided 3D Generation and Domain Adaptation\n by Combining 3D GANs and Diffusion Priors","summary":" Text-guided domain adaptation and generation of 3D-aware portraits find many\napplications in various fields. However, due to the lack of training data and\nthe challenges in handling the high variety of geometry and appearance, the\nexisting methods for these tasks suffer from issues like inflexibility,\ninstability, and low fidelity. In this paper, we propose a novel framework\nDiffusionGAN3D, which boosts text-guided 3D domain adaptation and generation by\ncombining 3D GANs and diffusion priors. Specifically, we integrate the\npre-trained 3D generative models (e.g., EG3D) and text-to-image diffusion\nmodels. The former provides a strong foundation for stable and high-quality\navatar generation from text. 
And the diffusion models in turn offer powerful\npriors and guide the 3D generator finetuning with informative direction to\nachieve flexible and efficient text-guided domain adaptation. To enhance the\ndiversity in domain adaptation and the generation capability in text-to-avatar,\nwe introduce the relative distance loss and case-specific learnable triplane\nrespectively. Besides, we design a progressive texture refinement module to\nimprove the texture quality for both tasks above. Extensive experiments\ndemonstrate that the proposed framework achieves excellent results in both\ndomain adaptation and text-to-avatar tasks, outperforming existing methods in\nterms of generation quality and efficiency. The project homepage is at\nhttps://younglbw.github.io/DiffusionGAN3D-homepage/.\n","authors":["Biwen Lei","Kai Yu","Mengyang Feng","Miaomiao Cui","Xuansong Xie"],"pdf_url":"https://arxiv.org/pdf/2312.16837v3.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.05268v2","updated":"2024-04-12T06:20:49Z","published":"2024-04-08T07:59:04Z","title":"MC$^2$: Multi-concept Guidance for Customized Multi-concept Generation","summary":" Customized text-to-image generation aims to synthesize instantiations of\nuser-specified concepts and has achieved unprecedented progress in handling\nindividual concept. However, when extending to multiple customized concepts,\nexisting methods exhibit limitations in terms of flexibility and fidelity, only\naccommodating the combination of limited types of models and potentially\nresulting in a mix of characteristics from different concepts. In this paper,\nwe introduce the Multi-concept guidance for Multi-concept customization, termed\nMC$^2$, for improved flexibility and fidelity. MC$^2$ decouples the\nrequirements for model architecture via inference time optimization, allowing\nthe integration of various heterogeneous single-concept customized models. It\nadaptively refines the attention weights between visual and textual tokens,\ndirecting image regions to focus on their associated words while diminishing\nthe impact of irrelevant ones. Extensive experiments demonstrate that MC$^2$\neven surpasses previous methods that require additional training in terms of\nconsistency with input prompt and reference images. Moreover, MC$^2$ can be\nextended to elevate the compositional capabilities of text-to-image generation,\nyielding appealing results. Code will be publicly available at\nhttps://github.com/JIANGJiaXiu/MC-2.\n","authors":["Jiaxiu Jiang","Yabo Zhang","Kailai Feng","Xiaohe Wu","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.05268v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08255v1","updated":"2024-04-12T06:09:24Z","published":"2024-04-12T06:09:24Z","title":"Practical Region-level Attack against Segment Anything Models","summary":" Segment Anything Models (SAM) have made significant advancements in image\nsegmentation, allowing users to segment target portions of an image with a\nsingle click (i.e., user prompt). Given its broad applications, the robustness\nof SAM against adversarial attacks is a critical concern. While recent works\nhave explored adversarial attacks against a pre-defined prompt/click, their\nthreat model is not yet realistic: (1) they often assume the user-click\nposition is known to the attacker (point-based attack), and (2) they often\noperate under a white-box setting with limited transferability. 
In this paper,\nwe propose a more practical region-level attack where attackers do not need to\nknow the precise user prompt. The attack remains effective as the user clicks\non any point on the target object in the image, hiding the object from SAM.\nAlso, by adapting a spectrum transformation method, we make the attack more\ntransferable under a black-box setting. Both control experiments and testing\nagainst real-world SAM services confirm its effectiveness.\n","authors":["Yifan Shen","Zhengyuan Li","Gang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.08255v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08252v1","updated":"2024-04-12T05:43:10Z","published":"2024-04-12T05:43:10Z","title":"MonoPatchNeRF: Improving Neural Radiance Fields with Patch-based\n Monocular Guidance","summary":" The latest regularized Neural Radiance Field (NeRF) approaches produce poor\ngeometry and view extrapolation for multiview stereo (MVS) benchmarks such as\nETH3D. In this paper, we aim to create 3D models that provide accurate geometry\nand view synthesis, partially closing the large geometric performance gap\nbetween NeRF and traditional MVS methods. We propose a patch-based approach\nthat effectively leverages monocular surface normal and relative depth\npredictions. The patch-based ray sampling also enables the appearance\nregularization of normalized cross-correlation (NCC) and structural similarity\n(SSIM) between randomly sampled virtual and training views. We further show\nthat \"density restrictions\" based on sparse structure-from-motion points can\nhelp greatly improve geometric accuracy with a slight drop in novel view\nsynthesis metrics. Our experiments show 4x the performance of RegNeRF and 8x\nthat of FreeNeRF on average F1@2cm for ETH3D MVS benchmark, suggesting a\nfruitful research direction to improve the geometric accuracy of NeRF-based\nmodels, and sheds light on a potential future approach to enable NeRF-based\noptimization to eventually outperform traditional MVS.\n","authors":["Yuqun Wu","Jae Yong Lee","Chuhang Zou","Shenlong Wang","Derek Hoiem"],"pdf_url":"https://arxiv.org/pdf/2404.08252v1.pdf","comment":"26 pages, 15 figures"},{"id":"http://arxiv.org/abs/2309.08966v2","updated":"2024-04-12T05:34:02Z","published":"2023-09-16T11:42:41Z","title":"FF-LOGO: Cross-Modality Point Cloud Registration with Feature Filtering\n and Local to Global Optimization","summary":" Cross-modality point cloud registration is confronted with significant\nchallenges due to inherent differences in modalities between different sensors.\nWe propose a cross-modality point cloud registration framework FF-LOGO: a\ncross-modality point cloud registration method with feature filtering and\nlocal-global optimization. The cross-modality feature correlation filtering\nmodule extracts geometric transformation-invariant features from cross-modality\npoint clouds and achieves point selection by feature matching. We also\nintroduce a cross-modality optimization process, including a local adaptive key\nregion aggregation module and a global modality consistency fusion optimization\nmodule. Experimental results demonstrate that our two-stage optimization\nsignificantly improves the registration accuracy of the feature association and\nselection module. Our method achieves a substantial increase in recall rate\ncompared to the current state-of-the-art methods on the 3DCSR dataset,\nimproving from 40.59% to 75.74%. 
Our code will be available at\nhttps://github.com/wangmohan17/FFLOGO.\n","authors":["Nan Ma","Mohan Wang","Yiheng Han","Yong-Jin Liu"],"pdf_url":"https://arxiv.org/pdf/2309.08966v2.pdf","comment":"Accepted by 2024 IEEE International Conference on Robotics and\n Automation (ICRA),7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2308.15070v3","updated":"2024-04-12T05:26:59Z","published":"2023-08-29T07:11:52Z","title":"DiffBIR: Towards Blind Image Restoration with Generative Diffusion Prior","summary":" We present DiffBIR, a general restoration pipeline that could handle\ndifferent blind image restoration tasks in a unified framework. DiffBIR\ndecouples blind image restoration problem into two stages: 1) degradation\nremoval: removing image-independent content; 2) information regeneration:\ngenerating the lost image content. Each stage is developed independently but\nthey work seamlessly in a cascaded manner. In the first stage, we use\nrestoration modules to remove degradations and obtain high-fidelity restored\nresults. For the second stage, we propose IRControlNet that leverages the\ngenerative ability of latent diffusion models to generate realistic details.\nSpecifically, IRControlNet is trained based on specially produced condition\nimages without distracting noisy content for stable generation performance.\nMoreover, we design a region-adaptive restoration guidance that can modify the\ndenoising process during inference without model re-training, allowing users to\nbalance realness and fidelity through a tunable guidance scale. Extensive\nexperiments have demonstrated DiffBIR's superiority over state-of-the-art\napproaches for blind image super-resolution, blind face restoration and blind\nimage denoising tasks on both synthetic and real-world datasets. The code is\navailable at https://github.com/XPixelGroup/DiffBIR.\n","authors":["Xinqi Lin","Jingwen He","Ziyan Chen","Zhaoyang Lyu","Bo Dai","Fanghua Yu","Wanli Ouyang","Yu Qiao","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2308.15070v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10356v3","updated":"2024-04-12T05:07:28Z","published":"2023-09-19T06:32:19Z","title":"RoadFormer: Duplex Transformer for RGB-Normal Semantic Road Scene\n Parsing","summary":" The recent advancements in deep convolutional neural networks have shown\nsignificant promise in the domain of road scene parsing. Nevertheless, the\nexisting works focus primarily on freespace detection, with little attention\ngiven to hazardous road defects that could compromise both driving safety and\ncomfort. In this paper, we introduce RoadFormer, a novel Transformer-based\ndata-fusion network developed for road scene parsing. RoadFormer utilizes a\nduplex encoder architecture to extract heterogeneous features from both RGB\nimages and surface normal information. The encoded features are subsequently\nfed into a novel heterogeneous feature synergy block for effective feature\nfusion and recalibration. The pixel decoder then learns multi-scale long-range\ndependencies from the fused and recalibrated heterogeneous features, which are\nsubsequently processed by a Transformer decoder to produce the final semantic\nprediction. Additionally, we release SYN-UDTIRI, the first large-scale road\nscene parsing dataset that contains over 10,407 RGB images, dense depth images,\nand the corresponding pixel-level annotations for both freespace and road\ndefects of different shapes and sizes. 
Extensive experimental evaluations\nconducted on our SYN-UDTIRI dataset, as well as on three public datasets,\nincluding KITTI road, CityScapes, and ORFD, demonstrate that RoadFormer\noutperforms all other state-of-the-art networks for road scene parsing.\nSpecifically, RoadFormer ranks first on the KITTI road benchmark. Our source\ncode, created dataset, and demo video are publicly available at\nmias.group/RoadFormer.\n","authors":["Jiahang Li","Yikang Zhang","Peng Yun","Guangliang Zhou","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2309.10356v3.pdf","comment":"9 pages 7 figures. Accepted by Transactions on Intelligent Vehicles"},{"id":"http://arxiv.org/abs/2403.14047v2","updated":"2024-04-12T05:07:27Z","published":"2024-03-21T00:09:04Z","title":"Accelerating ViT Inference on FPGA through Static and Dynamic Pruning","summary":" Vision Transformers (ViTs) have achieved state-of-the-art accuracy on various\ncomputer vision tasks. However, their high computational complexity prevents\nthem from being applied to many real-world applications. Weight and token\npruning are two well-known methods for reducing complexity: weight pruning\nreduces the model size and associated computational demands, while token\npruning further dynamically reduces the computation based on the input.\nCombining these two techniques should significantly reduce computation\ncomplexity and model size; however, naively integrating them results in\nirregular computation patterns, leading to significant accuracy drops and\ndifficulties in hardware acceleration.\n Addressing the above challenges, we propose a comprehensive\nalgorithm-hardware codesign for accelerating ViT on FPGA through simultaneous\npruning -combining static weight pruning and dynamic token pruning. For\nalgorithm design, we systematically combine a hardware-aware structured\nblock-pruning method for pruning model parameters and a dynamic token pruning\nmethod for removing unimportant token vectors. Moreover, we design a novel\ntraining algorithm to recover the model's accuracy. For hardware design, we\ndevelop a novel hardware accelerator for executing the pruned model. The\nproposed hardware design employs multi-level parallelism with load balancing\nstrategy to efficiently deal with the irregular computation pattern led by the\ntwo pruning approaches. Moreover, we develop an efficient hardware mechanism\nfor efficiently executing the on-the-fly token pruning.\n","authors":["Dhruv Parikh","Shouyi Li","Bingyi Zhang","Rajgopal Kannan","Carl Busart","Viktor Prasanna"],"pdf_url":"https://arxiv.org/pdf/2403.14047v2.pdf","comment":"FCCM 2024"},{"id":"http://arxiv.org/abs/2208.07463v4","updated":"2024-04-12T04:48:48Z","published":"2022-08-15T22:51:23Z","title":"Conv-Adapter: Exploring Parameter Efficient Transfer Learning for\n ConvNets","summary":" While parameter efficient tuning (PET) methods have shown great potential\nwith transformer architecture on Natural Language Processing (NLP) tasks, their\neffectiveness with large-scale ConvNets is still under-studied on Computer\nVision (CV) tasks. This paper proposes Conv-Adapter, a PET module designed for\nConvNets. Conv-Adapter is light-weight, domain-transferable, and\narchitecture-agnostic with generalized performance on different tasks. When\ntransferring on downstream tasks, Conv-Adapter learns tasks-specific feature\nmodulation to the intermediate representations of backbones while keeping the\npre-trained parameters frozen. 
It introduces only a tiny number of learnable\nparameters, e.g., only 3.5% of the full fine-tuning parameters of ResNet50, and\ncan also be applied to transformer-based backbones. Conv-Adapter outperforms\nprevious PET baseline methods and achieves performance comparable to or\nsurpassing full fine-tuning on 23 classification tasks across various domains.\nIt also presents superior performance on few-shot classification with an\naverage margin of 3.39%. Beyond classification, Conv-Adapter can generalize to\ndetection and segmentation tasks with more than 50% reduction of parameters but\ncomparable performance to traditional full fine-tuning.\n","authors":["Hao Chen","Ran Tao","Han Zhang","Yidong Wang","Xiang Li","Wei Ye","Jindong Wang","Guosheng Hu","Marios Savvides"],"pdf_url":"https://arxiv.org/pdf/2208.07463v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08238v1","updated":"2024-04-12T04:45:51Z","published":"2024-04-12T04:45:51Z","title":"Simulation of a Vision Correction Display System","summary":" Eyes serve as our primary sensory organs, responsible for processing up to\n80\\% of our sensory input. However, common visual aberrations like myopia and\nhyperopia affect a significant portion of the global population. This paper\nfocuses on simulating a Vision Correction Display (VCD) to enhance the visual\nexperience of individuals with various visual impairments. Utilising Blender,\nwe digitally model the functionality of a VCD in correcting refractive errors\nsuch as myopia and hyperopia. With these simulations, we can see potential\nimprovements in visual acuity and comfort. These simulations provide valuable\ninsights for the design and development of future VCD technologies, ultimately\nadvancing accessibility and usability for individuals with visual challenges.\n","authors":["Vidya Sunil","Renu M Rameshan"],"pdf_url":"https://arxiv.org/pdf/2404.08238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08237v1","updated":"2024-04-12T04:44:11Z","published":"2024-04-12T04:44:11Z","title":"IFViT: Interpretable Fixed-Length Representation for Fingerprint\n Matching via Vision Transformer","summary":" Determining dense feature points on fingerprints used in constructing deep\nfixed-length representations for accurate matching, particularly at the pixel\nlevel, is of significant interest. To explore the interpretability of\nfingerprint matching, we propose a multi-stage interpretable fingerprint\nmatching network, namely Interpretable Fixed-length Representation for\nFingerprint Matching via Vision Transformer (IFViT), which consists of two\nprimary modules. The first module, an interpretable dense registration module,\nestablishes a Vision Transformer (ViT)-based Siamese Network to capture\nlong-range dependencies and the global context in fingerprint pairs. It\nprovides interpretable dense pixel-wise correspondences of feature points for\nfingerprint alignment and enhances the interpretability in the subsequent\nmatching stage. The second module takes into account both local and global\nrepresentations of the aligned fingerprint pair to achieve an interpretable\nfixed-length representation extraction and matching. It employs the ViTs\ntrained in the first module with an additional fully connected layer and\nretrains them to simultaneously produce the discriminative fixed-length\nrepresentation and interpretable dense pixel-wise correspondences of feature\npoints. 
Extensive experimental results on diverse publicly available\nfingerprint databases demonstrate that the proposed framework not only exhibits\nsuperior performance on dense registration and matching but also significantly\npromotes the interpretability in deep fixed-length representations-based\nfingerprint matching.\n","authors":["Yuhang Qiu","Honghui Chen","Xingbo Dong","Zheng Lin","Iman Yi Liao","Massimo Tistarelli","Zhe Jin"],"pdf_url":"https://arxiv.org/pdf/2404.08237v1.pdf","comment":"ready to submit to IEEE Transactions on Information Forensics and\n Security (TIFS)"},{"id":"http://arxiv.org/abs/2302.06874v2","updated":"2024-04-12T04:42:29Z","published":"2023-02-14T07:39:37Z","title":"Robust Representation Learning with Self-Distillation for Domain\n Generalization","summary":" Despite the recent success of deep neural networks, there remains a need for\neffective methods to enhance domain generalization using vision transformers.\nIn this paper, we propose a novel domain generalization technique called Robust\nRepresentation Learning with Self-Distillation (RRLD) comprising i)\nintermediate-block self-distillation and ii) augmentation-guided\nself-distillation to improve the generalization capabilities of\ntransformer-based models on unseen domains. This approach enables the network\nto learn robust and general features that are invariant to different\naugmentations and domain shifts while effectively mitigating overfitting to\nsource domains. To evaluate the effectiveness of our proposed method, we\nperform extensive experiments on PACS and OfficeHome benchmark datasets, as\nwell as an industrial wafer semiconductor defect dataset. The results\ndemonstrate that RRLD achieves robust and accurate generalization performance.\nWe observe an average accuracy improvement in the range of 1.2% to 2.3% over\nthe state-of-the-art on the three datasets.\n","authors":["Ankur Singh","Senthilnath Jayavelu"],"pdf_url":"https://arxiv.org/pdf/2302.06874v2.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2404.05960v2","updated":"2024-04-12T04:23:12Z","published":"2024-04-09T02:47:52Z","title":"EasyTrack: Efficient and Compact One-stream 3D Point Clouds Tracker","summary":" Most of 3D single object trackers (SOT) in point clouds follow the two-stream\nmulti-stage 3D Siamese or motion tracking paradigms, which process the template\nand search area point clouds with two parallel branches, built on supervised\npoint cloud backbones. In this work, beyond typical 3D Siamese or motion\ntracking, we propose a neat and compact one-stream transformer 3D SOT paradigm\nfrom the novel perspective, termed as \\textbf{EasyTrack}, which consists of\nthree special designs: 1) A 3D point clouds tracking feature pre-training\nmodule is developed to exploit the masked autoencoding for learning 3D point\nclouds tracking representations. 2) A unified 3D tracking feature learning and\nfusion network is proposed to simultaneously learns target-aware 3D features,\nand extensively captures mutual correlation through the flexible self-attention\nmechanism. 3) A target location network in the dense bird's eye view (BEV)\nfeature space is constructed for target classification and regression.\nMoreover, we develop an enhanced version named EasyTrack++, which designs the\ncenter points interaction (CPI) strategy to reduce the ambiguous targets caused\nby the noise point cloud background information. 
The proposed EasyTrack and\nEasyTrack++ set new state-of-the-art performance ($\\textbf{18\\%}$,\n$\\textbf{40\\%}$ and $\\textbf{3\\%}$ success gains) on KITTI, NuScenes, and Waymo\nwhile running at \\textbf{52.6fps} with few parameters (\\textbf{1.3M}). The code\nwill be available at https://github.com/KnightApple427/Easytrack.\n","authors":["Baojie Fan","Wuyang Zhou","Kai Wang","Shijun Zhou","Fengyu Xu","Jiandong Tian"],"pdf_url":"https://arxiv.org/pdf/2404.05960v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08229v1","updated":"2024-04-12T04:08:21Z","published":"2024-04-12T04:08:21Z","title":"Enhancing Traffic Safety with Parallel Dense Video Captioning for\n End-to-End Event Analysis","summary":" This paper introduces our solution for Track 2 in AI City Challenge 2024. The\ntask aims to solve traffic safety description and analysis with the dataset of\nWoven Traffic Safety (WTS), a real-world Pedestrian-Centric Traffic Video\nDataset for Fine-grained Spatial-Temporal Understanding. Our solution mainly\nfocuses on the following points: 1) To solve dense video captioning, we\nleverage the framework of dense video captioning with parallel decoding (PDVC)\nto model visual-language sequences and generate dense captions by chapters for\nthe video. 2) Our work leverages CLIP to extract visual features to more\nefficiently perform cross-modality training between visual and textual\nrepresentations. 3) We conduct domain-specific model adaptation to mitigate the\ndomain shift problem that poses a recognition challenge in video understanding.\n4) Moreover, we leverage BDD-5K captioned videos to conduct knowledge transfer\nfor better understanding of WTS videos and more accurate captioning. Our solution,\nevaluated on the test set, achieved 6th place in the competition. The open\nsource code will be available at https://github.com/UCF-SST-Lab/AICity2024CVPRW\n","authors":["Maged Shoman","Dongdong Wang","Armstrong Aboah","Mohamed Abdel-Aty"],"pdf_url":"https://arxiv.org/pdf/2404.08229v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08226v1","updated":"2024-04-12T03:43:37Z","published":"2024-04-12T03:43:37Z","title":"Improving Continuous Sign Language Recognition with Adapted Image Models","summary":" The increase of web-scale weakly labelled image-text pairs has greatly\nfacilitated the development of large-scale vision-language models (e.g., CLIP),\nwhich have shown impressive generalization performance over a series of\ndownstream tasks. However, the massive model size and scarcity of available\ndata limit their application to fine-tuning the whole model on downstream tasks.\nBesides, fully fine-tuning the model easily forgets the generic essential\nknowledge acquired in the pretraining stage and overfits the downstream data.\nTo enable high efficiency when adapting these large vision-language models\n(e.g., CLIP) to performing continuous sign language recognition (CSLR) while\npreserving their generalizability, we propose a novel strategy (AdaptSign).\nSpecifically, CLIP is adopted as the visual backbone to extract frame-wise\nfeatures whose parameters are fixed, and a set of learnable modules is\nintroduced to model spatial sign variations or capture temporal sign movements.\nThe introduced additional modules are quite lightweight, incurring only 3.2% extra\ncomputation with high efficiency. The generic knowledge acquired in the\npretraining stage is well-preserved in the frozen CLIP backbone in this\nprocess. 
Extensive experiments show that despite being efficient, AdaptSign is\nable to demonstrate superior performance across a series of CSLR benchmarks\nincluding PHOENIX14, PHOENIX14-T, CSL-Daily and CSL compared to existing\nmethods. Visualizations show that AdaptSign could learn to dynamically pay\nmajor attention to the informative spatial regions and cross-frame trajectories\nin sign videos.\n","authors":["Lianyu Hu","Tongkai Shi","Liqing Gao","Zekang Liu","Wei Feng"],"pdf_url":"https://arxiv.org/pdf/2404.08226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04582v2","updated":"2024-04-12T03:33:31Z","published":"2023-10-06T20:48:43Z","title":"Universal Humanoid Motion Representations for Physics-Based Control","summary":" We present a universal motion representation that encompasses a comprehensive\nrange of motor skills for physics-based humanoid control. Due to the high\ndimensionality of humanoids and the inherent difficulties in reinforcement\nlearning, prior methods have focused on learning skill embeddings for a narrow\nrange of movement styles (e.g. locomotion, game characters) from specialized\nmotion datasets. This limited scope hampers their applicability in complex\ntasks. We close this gap by significantly increasing the coverage of our motion\nrepresentation space. To achieve this, we first learn a motion imitator that\ncan imitate all of human motion from a large, unstructured motion dataset. We\nthen create our motion representation by distilling skills directly from the\nimitator. This is achieved by using an encoder-decoder structure with a\nvariational information bottleneck. Additionally, we jointly learn a prior\nconditioned on proprioception (humanoid's own pose and velocities) to improve\nmodel expressiveness and sampling efficiency for downstream tasks. By sampling\nfrom the prior, we can generate long, stable, and diverse human motions. Using\nthis latent space for hierarchical RL, we show that our policies solve tasks\nusing human-like behavior. We demonstrate the effectiveness of our motion\nrepresentation by solving generative tasks (e.g. strike, terrain traversal) and\nmotion tracking using VR controllers.\n","authors":["Zhengyi Luo","Jinkun Cao","Josh Merel","Alexander Winkler","Jing Huang","Kris Kitani","Weipeng Xu"],"pdf_url":"https://arxiv.org/pdf/2310.04582v2.pdf","comment":"ICLR 2024 Spotlight. Project page:\n https://zhengyiluo.github.io/PULSE/"},{"id":"http://arxiv.org/abs/2403.12416v2","updated":"2024-04-12T03:15:26Z","published":"2024-03-19T03:59:14Z","title":"Eye-gaze Guided Multi-modal Alignment Framework for Radiology","summary":" In multi-modal frameworks, the alignment of cross-modal features presents a\nsignificant challenge. The predominant approach in multi-modal pre-training\nemphasizes either global or local alignment between modalities, utilizing\nextensive datasets. This bottom-up driven method often suffers from a lack of\ninterpretability, a critical concern in radiology. Previous studies have\nintegrated high-level labels in medical images or text, but these still rely on\nmanual annotation, a costly and labor-intensive process. Our work introduces a\nnovel approach by using eye-gaze data, collected synchronously by radiologists\nduring diagnostic evaluations. This data, indicating radiologists' focus areas,\nnaturally links chest X-rays to diagnostic texts. 
We propose the Eye-gaze\nGuided Multi-modal Alignment (EGMA) framework to harness eye-gaze data for\nbetter alignment of image and text features, aiming to reduce reliance on\nmanual annotations and thus cut training costs. Our model demonstrates robust\nperformance, outperforming other state-of-the-art methods in zero-shot\nclassification and retrieval tasks. The incorporation of easily-obtained\neye-gaze data during routine radiological diagnoses signifies a step towards\nminimizing manual annotation dependency. Additionally, we explore the impact of\nvarying amounts of eye-gaze data on model performance, highlighting the\nfeasibility and utility of integrating this auxiliary data into multi-modal\npre-training.\n","authors":["Chong Ma","Hanqi Jiang","Wenting Chen","Zihao Wu","Xiaowei Yu","Fang Zeng","Lei Guo","Dajiang Zhu","Tuo Zhang","Dinggang Shen","Tianming Liu","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2403.12416v2.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2310.15036v3","updated":"2024-04-12T03:14:34Z","published":"2023-10-23T15:34:03Z","title":"A Technique for Classifying Static Gestures Using UWB Radar","summary":" Our paper presents a robust framework for UWB-based static gesture\nrecognition, leveraging proprietary UWB radar sensor technology. Extensive data\ncollection efforts were undertaken to compile datasets containing five commonly\nused gestures. Our approach involves a comprehensive data pre-processing\npipeline that encompasses outlier handling, aspect ratio-preserving resizing,\nand false-color image transformation. Both CNN and MobileNet models were\ntrained on the processed images. Remarkably, our best-performing model achieved\nan accuracy of 96.78%. Additionally, we developed a user-friendly GUI framework\nto assess the model's system resource usage and processing times, which\nrevealed low memory utilization and real-time task completion in under one\nsecond. This research marks a significant step towards enhancing static gesture\nrecognition using UWB technology, promising practical applications in various\ndomains.\n","authors":["Abhishek Sebastian","Pragna R"],"pdf_url":"https://arxiv.org/pdf/2310.15036v3.pdf","comment":"This is not a technical research paper, but an excerpt of what was\n applied during a funded project for the promotion of Open Science"},{"id":"http://arxiv.org/abs/2312.17428v2","updated":"2024-04-12T03:06:07Z","published":"2023-12-29T01:42:20Z","title":"ChangeNet: Multi-Temporal Asymmetric Change Detection Dataset","summary":" Change Detection (CD) has been attracting extensive interests with the\navailability of bi-temporal datasets. However, due to the huge cost of\nmulti-temporal images acquisition and labeling, existing change detection\ndatasets are small in quantity, short in temporal, and low in practicability.\nTherefore, a large-scale practical-oriented dataset covering wide temporal\nphases is urgently needed to facilitate the community. To this end, the\nChangeNet dataset is presented especially for multi-temporal change detection,\nalong with the new task of \"Asymmetric Change Detection\". Specifically,\nChangeNet consists of 31,000 multi-temporal images pairs, a wide range of\ncomplex scenes from 100 cities, and 6 pixel-level annotated categories, which\nis far superior to all the existing change detection datasets including\nLEVIR-CD, WHU Building CD, etc.. 
In addition, ChangeNet contains large amounts of\nreal-world perspective distortions in different temporal phases of the same\nareas, which can promote the practical application of change detection\nalgorithms. The ChangeNet dataset is suitable for both binary change detection\n(BCD) and semantic change detection (SCD) tasks. Accordingly, we benchmark the\nChangeNet dataset on six BCD methods and two SCD methods, and extensive\nexperiments demonstrate its challenges and great significance. The dataset is\navailable at https://github.com/jankyee/ChangeNet.\n","authors":["Deyi Ji","Siqi Gao","Mingyuan Tao","Hongtao Lu","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.17428v2.pdf","comment":"Accepted to ICASSP 2024 Oral/Lecture"},{"id":"http://arxiv.org/abs/2402.09055v2","updated":"2024-04-12T02:51:45Z","published":"2024-02-14T10:05:19Z","title":"Comment-aided Video-Language Alignment via Contrastive Pre-training for\n Short-form Video Humor Detection","summary":" The growing importance of multi-modal humor detection within affective\ncomputing correlates with the expanding influence of short-form video sharing\non social media platforms. In this paper, we propose a novel two-branch\nhierarchical model for short-form video humor detection (SVHD), named\nComment-aided Video-Language Alignment (CVLA) via data-augmented multi-modal\ncontrastive pre-training. Notably, our CVLA not only operates on raw signals\nacross various modal channels but also yields an appropriate multi-modal\nrepresentation by aligning the video and language components within a\nconsistent semantic space. The experimental results on two humor detection\ndatasets, including DY11k and UR-FUNNY, demonstrate that CVLA dramatically\noutperforms state-of-the-art and several competitive baseline approaches. Our\ndataset, code and model are released at https://github.com/yliu-cs/CVLA.\n","authors":["Yang Liu","Tongfei Shen","Dong Zhang","Qingying Sun","Shoushan Li","Guodong Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.09055v2.pdf","comment":"Accepted by ICMR 2024"},{"id":"http://arxiv.org/abs/2403.18554v2","updated":"2024-04-12T02:27:09Z","published":"2024-03-27T13:33:14Z","title":"CosalPure: Learning Concept from Group Images for Robust Co-Saliency\n Detection","summary":" Co-salient object detection (CoSOD) aims to identify the common and salient\n(usually in the foreground) regions across a given group of images. Although\nachieving significant progress, state-of-the-art CoSODs could be easily\naffected by some adversarial perturbations, leading to substantial accuracy\nreduction. The adversarial perturbations can mislead CoSODs but do not change\nthe high-level semantic information (e.g., concept) of the co-salient objects.\nIn this paper, we propose a novel robustness enhancement framework by first\nlearning the concept of the co-salient objects based on the input group images\nand then leveraging this concept to purify adversarial perturbations, which are\nsubsequently fed to CoSODs for robustness enhancement. Specifically, we propose\nCosalPure containing two modules, i.e., group-image concept learning and\nconcept-guided diffusion purification. For the first module, we adopt a\npre-trained text-to-image diffusion model to learn the concept of co-salient\nobjects within group images where the learned concept is robust to adversarial\nexamples. 
For the second module, we map the adversarial image to the latent\nspace and then perform diffusion generation by embedding the learned concept\ninto the noise prediction function as an extra condition. Our method can\neffectively alleviate the influence of the SOTA adversarial attack containing\ndifferent adversarial patterns, including exposure and noise. The extensive\nresults demonstrate that our method could enhance the robustness of CoSODs\nsignificantly.\n","authors":["Jiayi Zhu","Qing Guo","Felix Juefei-Xu","Yihao Huang","Yang Liu","Geguang Pu"],"pdf_url":"https://arxiv.org/pdf/2403.18554v2.pdf","comment":"This paper is accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.08201v1","updated":"2024-04-12T02:14:35Z","published":"2024-04-12T02:14:35Z","title":"A Mutual Inclusion Mechanism for Precise Boundary Segmentation in\n Medical Images","summary":" In medical imaging, accurate image segmentation is crucial for quantifying\ndiseases, assessing prognosis, and evaluating treatment outcomes. However,\nexisting methods lack an in-depth integration of global and local features,\nfailing to pay special attention to abnormal regions and boundary details in\nmedical images. To this end, we present a novel deep learning-based approach,\nMIPC-Net, for precise boundary segmentation in medical images. Our approach,\ninspired by radiologists' working patterns, features two distinct modules: (i)\n\\textbf{Mutual Inclusion of Position and Channel Attention (MIPC) module}: To\nenhance the precision of boundary segmentation in medical images, we introduce\nthe MIPC module, which enhances the focus on channel information when\nextracting position features and vice versa; (ii) \\textbf{GL-MIPC-Residue}: To\nimprove the restoration of medical images, we propose the GL-MIPC-Residue, a\nglobal residual connection that enhances the integration of the encoder and\ndecoder by filtering out invalid information and restoring the most effective\ninformation lost during the feature extraction process. We evaluate the\nperformance of the proposed model using metrics such as Dice coefficient (DSC)\nand Hausdorff Distance (HD) on three publicly accessible datasets: Synapse,\nISIC2018-Task, and Segpc. Our ablation study shows that each module contributes\nto improving the quality of segmentation results. Furthermore, with the\nassistance of both modules, our approach outperforms state-of-the-art methods\nacross all metrics on the benchmark datasets, notably achieving a 2.23mm\nreduction in HD on the Synapse dataset, strongly evidencing our model's\nenhanced capability for precise image boundary segmentation. Codes will be\navailable at https://github.com/SUN-1024/MIPC-Net.\n","authors":["Yizhi Pan","Junyi Xin","Tianhua Yang","Teeradaj Racharak","Le-Minh Nguyen","Guanqun Sun"],"pdf_url":"https://arxiv.org/pdf/2404.08201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08197v1","updated":"2024-04-12T02:04:34Z","published":"2024-04-12T02:04:34Z","title":"Scaling (Down) CLIP: A Comprehensive Analysis of Data, Architecture, and\n Training Strategies","summary":" This paper investigates the performance of the Contrastive Language-Image\nPre-training (CLIP) when scaled down to limited computation budgets. We explore\nCLIP along three dimensions: data, architecture, and training strategies. With\nregards to data, we demonstrate the significance of high-quality training data\nand show that a smaller dataset of high-quality data can outperform a larger\ndataset with lower quality. 
We also examine how model performance varies with\ndifferent dataset sizes, suggesting that smaller ViT models are better suited\nfor smaller datasets, while larger models perform better on larger datasets\nwith fixed compute. Additionally, we provide guidance on when to choose a\nCNN-based architecture or a ViT-based architecture for CLIP training. We\ncompare four CLIP training strategies - SLIP, FLIP, CLIP, and CLIP+Data\nAugmentation - and show that the choice of training strategy depends on the\navailable compute resource. Our analysis reveals that CLIP+Data Augmentation\ncan achieve comparable performance to CLIP using only half of the training\ndata. This work provides practical insights into how to effectively train and\ndeploy CLIP models, making them more accessible and affordable for practical\nuse in various applications.\n","authors":["Zichao Li","Cihang Xie","Ekin Dogus Cubuk"],"pdf_url":"https://arxiv.org/pdf/2404.08197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08195v1","updated":"2024-04-12T01:54:59Z","published":"2024-04-12T01:54:59Z","title":"Tackling Ambiguity from Perspective of Uncertainty Inference and\n Affinity Diversification for Weakly Supervised Semantic Segmentation","summary":" Weakly supervised semantic segmentation (WSSS) with image-level labels\nintends to achieve dense tasks without laborious annotations. However, due to\nthe ambiguous contexts and fuzzy regions, the performance of WSSS, especially\nthe stages of generating Class Activation Maps (CAMs) and refining pseudo\nmasks, widely suffers from ambiguity while being barely noticed by previous\nliterature. In this work, we propose UniA, a unified single-staged WSSS\nframework, to efficiently tackle this issue from the perspective of uncertainty\ninference and affinity diversification, respectively. When activating class\nobjects, we argue that the false activation stems from the bias to the\nambiguous regions during the feature extraction. Therefore, we design a more\nrobust feature representation with a probabilistic Gaussian distribution and\nintroduce the uncertainty estimation to avoid the bias. A distribution loss is\nparticularly proposed to supervise the process, which effectively captures the\nambiguity and models the complex dependencies among features. When refining\npseudo labels, we observe that the affinity from the prevailing refinement\nmethods intends to be similar among ambiguities. To this end, an affinity\ndiversification module is proposed to promote diversity among semantics. A\nmutual complementing refinement is proposed to initially rectify the ambiguous\naffinity with multiple inferred pseudo labels. More importantly, a contrastive\naffinity loss is further designed to diversify the relations among unrelated\nsemantics, which reliably propagates the diversity into the whole feature\nrepresentations and helps generate better pseudo masks. 
Extensive experiments\nare conducted on PASCAL VOC, MS COCO, and medical ACDC datasets, which validate\nthe efficiency of UniA tackling ambiguity and the superiority over recent\nsingle-staged or even most multi-staged competitors.\n","authors":["Zhiwei Yang","Yucong Meng","Kexue Fu","Shuo Wang","Zhijian Song"],"pdf_url":"https://arxiv.org/pdf/2404.08195v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08187v1","updated":"2024-04-12T01:36:00Z","published":"2024-04-12T01:36:00Z","title":"Adapting CNNs for Fisheye Cameras without Retraining","summary":" The majority of image processing approaches assume images are in or can be\nrectified to a perspective projection. However, in many applications it is\nbeneficial to use non conventional cameras, such as fisheye cameras, that have\na larger field of view (FOV). The issue arises that these large-FOV images\ncan't be rectified to a perspective projection without significant cropping of\nthe original image. To address this issue we propose Rectified Convolutions\n(RectConv); a new approach for adapting pre-trained convolutional networks to\noperate with new non-perspective images, without any retraining. Replacing the\nconvolutional layers of the network with RectConv layers allows the network to\nsee both rectified patches and the entire FOV. We demonstrate RectConv adapting\nmultiple pre-trained networks to perform segmentation and detection on fisheye\nimagery from two publicly available datasets. Our approach requires no\nadditional data or training, and operates directly on the native image as\ncaptured from the camera. We believe this work is a step toward adapting the\nvast resources available for perspective images to operate across a broad range\nof camera geometries.\n","authors":["Ryan Griffiths","Donald G. Dansereau"],"pdf_url":"https://arxiv.org/pdf/2404.08187v1.pdf","comment":"Project page: https://roboticimaging.org/Projects/RectConv/"},{"id":"http://arxiv.org/abs/2404.08184v1","updated":"2024-04-12T01:13:23Z","published":"2024-04-12T01:13:23Z","title":"Measuring Domain Shifts using Deep Learning Remote Photoplethysmography\n Model Similarity","summary":" Domain shift differences between training data for deep learning models and\nthe deployment context can result in severe performance issues for models which\nfail to generalize. We study the domain shift problem under the context of\nremote photoplethysmography (rPPG), a technique for video-based heart rate\ninference. We propose metrics based on model similarity which may be used as a\nmeasure of domain shift, and we demonstrate high correlation between these\nmetrics and empirical performance. One of the proposed metrics with viable\ncorrelations, DS-diff, does not assume access to the ground truth of the target\ndomain, i.e. it may be applied to in-the-wild data. To that end, we investigate\na model selection problem in which ground truth results for the evaluation\ndomain is not known, demonstrating a 13.9% performance improvement over the\naverage case baseline.\n","authors":["Nathan Vance","Patrick Flynn"],"pdf_url":"https://arxiv.org/pdf/2404.08184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08181v1","updated":"2024-04-12T01:08:04Z","published":"2024-04-12T01:08:04Z","title":"Pay Attention to Your Neighbours: Training-Free Open-Vocabulary Semantic\n Segmentation","summary":" Despite the significant progress in deep learning for dense visual\nrecognition problems, such as semantic segmentation, traditional methods are\nconstrained by fixed class sets. 
Meanwhile, vision-language foundation models,\nsuch as CLIP, have showcased remarkable effectiveness in numerous zero-shot\nimage-level tasks, owing to their robust generalizability. Recently, a body of\nwork has investigated utilizing these models in open-vocabulary semantic\nsegmentation (OVSS). However, existing approaches often rely on impractical\nsupervised pre-training or access to additional pre-trained networks. In this\nwork, we propose a strong baseline for training-free OVSS, termed\nNeighbour-Aware CLIP (NACLIP), representing a straightforward adaptation of\nCLIP tailored for this scenario. Our method enforces localization of patches in\nthe self-attention of CLIP's vision transformer which, despite being crucial\nfor dense prediction tasks, has been overlooked in the OVSS literature. By\nincorporating design choices favouring segmentation, our approach significantly\nimproves performance without requiring additional data, auxiliary pre-trained\nnetworks, or extensive hyperparameter tuning, making it highly practical for\nreal-world applications. Experiments are performed on 8 popular semantic\nsegmentation benchmarks, yielding state-of-the-art performance on most\nscenarios. Our code is publicly available at https://github.com/sinahmr/NACLIP .\n","authors":["Sina Hajimiri","Ismail Ben Ayed","Jose Dolz"],"pdf_url":"https://arxiv.org/pdf/2404.08181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03584v2","updated":"2024-04-12T00:52:35Z","published":"2023-06-06T11:03:05Z","title":"RDFC-GAN: RGB-Depth Fusion CycleGAN for Indoor Depth Completion","summary":" Raw depth images captured in indoor scenarios frequently exhibit extensive\nmissing values due to the inherent limitations of the sensors and environments.\nFor example, transparent materials frequently elude detection by depth sensors;\nsurfaces may introduce measurement inaccuracies due to their polished textures,\nextended distances, and oblique incidence angles from the sensor. The presence\nof incomplete depth maps imposes significant challenges for subsequent vision\napplications, prompting the development of numerous depth completion techniques\nto mitigate this problem. Numerous methods excel at reconstructing dense depth\nmaps from sparse samples, but they often falter when faced with extensive\ncontiguous regions of missing depth values, a prevalent and critical challenge\nin indoor environments. To overcome these challenges, we design a novel\ntwo-branch end-to-end fusion network named RDFC-GAN, which takes a pair of RGB\nand incomplete depth images as input to predict a dense and completed depth\nmap. The first branch employs an encoder-decoder structure, by adhering to the\nManhattan world assumption and utilizing normal maps from RGB-D information as\nguidance, to regress the local dense depth values from the raw depth map. The\nother branch applies an RGB-depth fusion CycleGAN, adept at translating RGB\nimagery into detailed, textured depth maps while ensuring high fidelity through\ncycle consistency. We fuse the two branches via adaptive fusion modules named\nW-AdaIN and train the model with the help of pseudo depth maps. 
Comprehensive\nevaluations on NYU-Depth V2 and SUN RGB-D datasets show that our method\nsignificantly enhances depth completion performance particularly in realistic\nindoor settings.\n","authors":["Haowen Wang","Zhengping Che","Yufan Yang","Mingyuan Wang","Zhiyuan Xu","Xiuquan Qiao","Mengshi Qi","Feifei Feng","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2306.03584v2.pdf","comment":"Haowen Wang and Zhengping Che are with equal contributions. Paper\n accepted by IEEE Transactions on Pattern Analysis and Machine Intelligence\n (TPAMI). An earlier version has been accepted by CVPR 2022\n (arXiv:2203.10856). arXiv admin note: text overlap with arXiv:2203.10856"},{"id":"http://arxiv.org/abs/2305.09948v5","updated":"2024-04-12T00:46:26Z","published":"2023-05-17T05:03:46Z","title":"HICO-DET-SG and V-COCO-SG: New Data Splits for Evaluating the Systematic\n Generalization Performance of Human-Object Interaction Detection Models","summary":" Human-Object Interaction (HOI) detection is a task to localize humans and\nobjects in an image and predict the interactions in human-object pairs. In\nreal-world scenarios, HOI detection models need systematic generalization,\ni.e., generalization to novel combinations of objects and interactions, because\nthe train data are expected to cover a limited portion of all possible\ncombinations. To evaluate the systematic generalization performance of HOI\ndetection models, we created two new sets of HOI detection data splits named\nHICO-DET-SG and V-COCO-SG based on the HICO-DET and V-COCO datasets,\nrespectively. When evaluated on the new data splits, HOI detection models with\nvarious characteristics performed much more poorly than when evaluated on the\noriginal splits. This shows that systematic generalization is a challenging\ngoal in HOI detection. By analyzing the evaluation results, we also gain\ninsights for improving the systematic generalization performance and identify\nfour possible future research directions. We hope that our new data splits and\npresented analysis will encourage further research on systematic generalization\nin HOI detection.\n","authors":["Kentaro Takemoto","Moyuru Yamada","Tomotake Sasaki","Hisanao Akima"],"pdf_url":"https://arxiv.org/pdf/2305.09948v5.pdf","comment":"19 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.08853v1","updated":"2024-04-12T23:49:37Z","published":"2024-04-12T23:49:37Z","title":"Uncertainty Quantification in Detecting Choroidal Metastases on MRI via\n Evolutionary Strategies","summary":" Uncertainty quantification plays a vital role in facilitating the practical\nimplementation of AI in radiology by addressing growing concerns around\ntrustworthiness. Given the challenges associated with acquiring large,\nannotated datasets in this field, there is a need for methods that enable\nuncertainty quantification in small data AI approaches tailored to radiology\nimages. In this study, we focused on uncertainty quantification within the\ncontext of the small data evolutionary strategies-based technique of deep\nneuroevolution (DNE). Specifically, we employed DNE to train a simple\nConvolutional Neural Network (CNN) with MRI images of the eyes for binary\nclassification. The goal was to distinguish between normal eyes and those with\nmetastatic tumors called choroidal metastases. 
The training set comprised 18\nimages with choroidal metastases and 18 without tumors, while the testing set\ncontained a tumor-to-normal ratio of 15:15.\n We trained CNN model weights via DNE for approximately 40,000 episodes,\nultimately reaching a convergence of 100% accuracy on the training set. We\nsaved all models that achieved maximal training set accuracy. Then, by applying\nthese models to the testing set, we established an ensemble method for\nuncertainty quantification.The saved set of models produced distributions for\neach testing set image between the two classes of normal and tumor-containing.\nThe relative frequencies permitted uncertainty quantification of model\npredictions. Intriguingly, we found that subjective features appreciated by\nhuman radiologists explained images for which uncertainty was high,\nhighlighting the significance of uncertainty quantification in AI-driven\nradiological analyses.\n","authors":["Bala McRae-Posani","Andrei Holodny","Hrithwik Shalu","Joseph N Stember"],"pdf_url":"https://arxiv.org/pdf/2404.08853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05294v2","updated":"2024-04-12T23:33:27Z","published":"2024-01-10T17:53:59Z","title":"Enhanced Muscle and Fat Segmentation for CT-Based Body Composition\n Analysis: A Comparative Study","summary":" Purpose: Body composition measurements from routine abdominal CT can yield\npersonalized risk assessments for asymptomatic and diseased patients. In\nparticular, attenuation and volume measures of muscle and fat are associated\nwith important clinical outcomes, such as cardiovascular events, fractures, and\ndeath. This study evaluates the reliability of an Internal tool for the\nsegmentation of muscle and fat (subcutaneous and visceral) as compared to the\nwell-established public TotalSegmentator tool.\n Methods: We assessed the tools across 900 CT series from the publicly\navailable SAROS dataset, focusing on muscle, subcutaneous fat, and visceral\nfat. The Dice score was employed to assess accuracy in subcutaneous fat and\nmuscle segmentation. Due to the lack of ground truth segmentations for visceral\nfat, Cohen's Kappa was utilized to assess segmentation agreement between the\ntools.\n Results: Our Internal tool achieved a 3% higher Dice (83.8 vs. 80.8) for\nsubcutaneous fat and a 5% improvement (87.6 vs. 83.2) for muscle segmentation\nrespectively. A Wilcoxon signed-rank test revealed that our results were\nstatistically different with p<0.01. For visceral fat, the Cohen's kappa score\nof 0.856 indicated near-perfect agreement between the two tools. Our internal\ntool also showed very strong correlations for muscle volume (R^2=0.99), muscle\nattenuation (R^2=0.93), and subcutaneous fat volume (R^2=0.99) with a moderate\ncorrelation for subcutaneous fat attenuation (R^2=0.45).\n Conclusion: Our findings indicated that our Internal tool outperformed\nTotalSegmentator in measuring subcutaneous fat and muscle. The high Cohen's\nKappa score for visceral fat suggests a reliable level of agreement between the\ntwo tools. These results demonstrate the potential of our tool in advancing the\naccuracy of body composition analysis.\n","authors":["Benjamin Hou","Tejas Sudharshan Mathai","Jianfei Liu","Christopher Parnell","Ronald M. 
Summers"],"pdf_url":"https://arxiv.org/pdf/2401.05294v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05949v3","updated":"2024-04-12T22:30:54Z","published":"2024-03-09T16:02:46Z","title":"General surgery vision transformer: A video pre-trained foundation model\n for general surgery","summary":" The absence of openly accessible data and specialized foundation models is a\nmajor barrier for computational research in surgery. Toward this, (i) we\nopen-source the largest dataset of general surgery videos to-date, consisting\nof 680 hours of surgical videos, including data from robotic and laparoscopic\ntechniques across 28 procedures; (ii) we propose a technique for video\npre-training a general surgery vision transformer (GSViT) on surgical videos\nbased on forward video prediction that can run in real-time for surgical\napplications, toward which we open-source the code and weights of GSViT; (iii)\nwe also release code and weights for procedure-specific fine-tuned versions of\nGSViT across 10 procedures; (iv) we demonstrate the performance of GSViT on the\nCholec80 phase annotation task, displaying improved performance over\nstate-of-the-art single frame predictors.\n","authors":["Samuel Schmidgall","Ji Woong Kim","Jeffrey Jopling","Axel Krieger"],"pdf_url":"https://arxiv.org/pdf/2403.05949v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17179v3","updated":"2024-04-12T22:23:32Z","published":"2023-11-28T19:14:40Z","title":"SatCLIP: Global, General-Purpose Location Embeddings with Satellite\n Imagery","summary":" Geographic information is essential for modeling tasks in fields ranging from\necology to epidemiology. However, extracting relevant location characteristics\nfor a given task can be challenging, often requiring expensive data fusion or\ndistillation from massive global imagery datasets. To address this challenge,\nwe introduce Satellite Contrastive Location-Image Pretraining (SatCLIP). This\nglobal, general-purpose geographic location encoder learns an implicit\nrepresentation of locations by matching CNN and ViT inferred visual patterns of\nopenly available satellite imagery with their geographic coordinates. The\nresulting SatCLIP location encoder efficiently summarizes the characteristics\nof any given location for convenient use in downstream tasks. In our\nexperiments, we use SatCLIP embeddings to improve prediction performance on\nnine diverse location-dependent tasks including temperature prediction, animal\nrecognition, and population density estimation. Across tasks, SatCLIP\nconsistently outperforms alternative location encoders and improves geographic\ngeneralization by encoding visual similarities of spatially distant\nenvironments. 
These results demonstrate the potential of vision-location models\nto learn meaningful representations of our planet from the vast, varied, and\nlargely untapped modalities of geospatial data.\n","authors":["Konstantin Klemmer","Esther Rolf","Caleb Robinson","Lester Mackey","Marc Rußwurm"],"pdf_url":"https://arxiv.org/pdf/2311.17179v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12850v3","updated":"2024-04-12T22:08:40Z","published":"2023-10-19T14:04:53Z","title":"PrivImage: Differentially Private Synthetic Image Generation using\n Diffusion Models with Semantic-Aware Pretraining","summary":" Differential Privacy (DP) image data synthesis, which leverages the DP\ntechnique to generate synthetic data to replace the sensitive data, allowing\norganizations to share and utilize synthetic images without privacy concerns.\nPrevious methods incorporate the advanced techniques of generative models and\npre-training on a public dataset to produce exceptional DP image data, but\nsuffer from problems of unstable training and massive computational resource\ndemands. This paper proposes a novel DP image synthesis method, termed\nPRIVIMAGE, which meticulously selects pre-training data, promoting the\nefficient creation of DP datasets with high fidelity and utility. PRIVIMAGE\nfirst establishes a semantic query function using a public dataset. Then, this\nfunction assists in querying the semantic distribution of the sensitive\ndataset, facilitating the selection of data from the public dataset with\nanalogous semantics for pre-training. Finally, we pre-train an image generative\nmodel using the selected data and then fine-tune this model on the sensitive\ndataset using Differentially Private Stochastic Gradient Descent (DP-SGD).\nPRIVIMAGE allows us to train a lightly parameterized generative model, reducing\nthe noise in the gradient during DP-SGD training and enhancing training\nstability. Extensive experiments demonstrate that PRIVIMAGE uses only 1% of the\npublic dataset for pre-training and 7.6% of the parameters in the generative\nmodel compared to the state-of-the-art method, whereas achieves superior\nsynthetic performance and conserves more computational resources. On average,\nPRIVIMAGE achieves 30.1% lower FID and 12.6% higher Classification Accuracy\nthan the state-of-the-art method. The replication package and datasets can be\naccessed online.\n","authors":["Kecen Li","Chen Gong","Zhixiang Li","Yuzhong Zhao","Xinwen Hou","Tianhao Wang"],"pdf_url":"https://arxiv.org/pdf/2311.12850v3.pdf","comment":"Accepted at USENIX Security 2024. The first two authors contributed\n equally"},{"id":"http://arxiv.org/abs/2404.08831v1","updated":"2024-04-12T22:05:01Z","published":"2024-04-12T22:05:01Z","title":"Structured Model Pruning for Efficient Inference in Computational\n Pathology","summary":" Recent years have seen significant efforts to adopt Artificial Intelligence\n(AI) in healthcare for various use cases, from computer-aided diagnosis to ICU\ntriage. However, the size of AI models has been rapidly growing due to scaling\nlaws and the success of foundational models, which poses an increasing\nchallenge to leverage advanced models in practical applications. It is thus\nimperative to develop efficient models, especially for deploying AI solutions\nunder resource-constrains or with time sensitivity. One potential solution is\nto perform model compression, a set of techniques that remove less important\nmodel components or reduce parameter precision, to reduce model computation\ndemand. 
In this work, we demonstrate that model pruning, as a model compression\ntechnique, can effectively reduce inference cost for computational and digital\npathology based analysis with a negligible loss of analysis performance. To\nthis end, we develop a methodology for pruning the widely used U-Net-style\narchitectures in biomedical imaging, with which we evaluate multiple pruning\nheuristics on nuclei instance segmentation and classification, and empirically\ndemonstrate that pruning can compress models by at least 70% with a negligible\ndrop in performance.\n","authors":["Mohammed Adnan","Qinle Ba","Nazim Shaikh","Shivam Kalra","Satarupa Mukherjee","Auranuch Lorsakul"],"pdf_url":"https://arxiv.org/pdf/2404.08831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02263v3","updated":"2024-04-12T22:03:06Z","published":"2024-02-03T21:12:36Z","title":"MixedNUTS: Training-Free Accuracy-Robustness Balance via Nonlinearly\n Mixed Classifiers","summary":" Adversarial robustness often comes at the cost of degraded accuracy, impeding\nthe real-life application of robust classification models. Training-based\nsolutions for better trade-offs are limited by incompatibilities with\nalready-trained high-performance large models, necessitating the exploration of\ntraining-free ensemble approaches. Observing that robust models are more\nconfident in correct predictions than in incorrect ones on clean and\nadversarial data alike, we speculate amplifying this \"benign confidence\nproperty\" can reconcile accuracy and robustness in an ensemble setting. To\nachieve so, we propose \"MixedNUTS\", a training-free method where the output\nlogits of a robust classifier and a standard non-robust classifier are\nprocessed by nonlinear transformations with only three parameters, which are\noptimized through an efficient algorithm. MixedNUTS then converts the\ntransformed logits into probabilities and mixes them as the overall output. On\nCIFAR-10, CIFAR-100, and ImageNet datasets, experimental results with custom\nstrong adaptive attacks demonstrate MixedNUTS's vastly improved accuracy and\nnear-SOTA robustness -- it boosts CIFAR-100 clean accuracy by 7.86 points,\nsacrificing merely 0.87 points in robust accuracy.\n","authors":["Yatong Bai","Mo Zhou","Vishal M. Patel","Somayeh Sojoudi"],"pdf_url":"https://arxiv.org/pdf/2402.02263v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08827v1","updated":"2024-04-12T21:56:21Z","published":"2024-04-12T21:56:21Z","title":"\"Don't forget to put the milk back!\" Dataset for Enabling Embodied\n Agents to Detect Anomalous Situations","summary":" Home robots intend to make their users lives easier. Our work assists in this\ngoal by enabling robots to inform their users of dangerous or unsanitary\nanomalies in their home. Some examples of these anomalies include the user\nleaving their milk out, forgetting to turn off the stove, or leaving poison\naccessible to children. To move towards enabling home robots with these\nabilities, we have created a new dataset, which we call SafetyDetect. The\nSafetyDetect dataset consists of 1000 anomalous home scenes, each of which\ncontains unsafe or unsanitary situations for an agent to detect. Our approach\nutilizes large language models (LLMs) alongside both a graph representation of\nthe scene and the relationships between the objects in the scene. 
Our key\ninsight is that this connected scene graph and the object relationships it\nencodes enables the LLM to better reason about the scene -- especially as it\nrelates to detecting dangerous or unsanitary situations. Our most promising\napproach utilizes GPT-4 and pursues a categorization technique where object\nrelations from the scene graph are classified as normal, dangerous, unsanitary,\nor dangerous for children. This method is able to correctly identify over 90%\nof anomalous scenarios in the SafetyDetect Dataset. Additionally, we conduct\nreal world experiments on a ClearPath TurtleBot where we generate a scene graph\nfrom visuals of the real world scene, and run our approach with no\nmodification. This setup resulted in little performance loss. The SafetyDetect\nDataset and code will be released to the public upon this papers publication.\n","authors":["James F. Mullen Jr","Prasoon Goyal","Robinson Piramuthu","Michael Johnston","Dinesh Manocha","Reza Ghanadan"],"pdf_url":"https://arxiv.org/pdf/2404.08827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08820v1","updated":"2024-04-12T21:30:09Z","published":"2024-04-12T21:30:09Z","title":"Single-image driven 3d viewpoint training data augmentation for\n effective wine label recognition","summary":" Confronting the critical challenge of insufficient training data in the field\nof complex image recognition, this paper introduces a novel 3D viewpoint\naugmentation technique specifically tailored for wine label recognition. This\nmethod enhances deep learning model performance by generating visually\nrealistic training samples from a single real-world wine label image,\novercoming the challenges posed by the intricate combinations of text and\nlogos. Classical Generative Adversarial Network (GAN) methods fall short in\nsynthesizing such intricate content combination. Our proposed solution\nleverages time-tested computer vision and image processing strategies to expand\nour training dataset, thereby broadening the range of training samples for deep\nlearning applications. This innovative approach to data augmentation\ncircumvents the constraints of limited training resources. Using the augmented\ntraining images through batch-all triplet metric learning on a Vision\nTransformer (ViT) architecture, we can get the most discriminative embedding\nfeatures for every wine label, enabling us to perform one-shot recognition of\nexisting wine labels in the training classes or future newly collected wine\nlabels unavailable in the training. Experimental results show a significant\nincrease in recognition accuracy over conventional 2D data augmentation\ntechniques.\n","authors":["Yueh-Cheng Huang","Hsin-Yi Chen","Cheng-Jui Hung","Jen-Hui Chuang","Jenq-Neng Hwang"],"pdf_url":"https://arxiv.org/pdf/2404.08820v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07214v2","updated":"2024-04-12T21:20:37Z","published":"2024-02-20T18:57:34Z","title":"Exploring the Frontier of Vision-Language Models: A Survey of Current\n Methodologies and Future Directions","summary":" The advent of Large Language Models (LLMs) has significantly reshaped the\ntrajectory of the AI revolution. Nevertheless, these LLMs exhibit a notable\nlimitation, as they are primarily adept at processing textual information. To\naddress this constraint, researchers have endeavored to integrate visual\ncapabilities with LLMs, resulting in the emergence of Vision-Language Models\n(VLMs). 
These advanced models are instrumental in tackling more intricate tasks\nsuch as image captioning and visual question answering. In our comprehensive\nsurvey paper, we delve into the key advancements within the realm of VLMs. Our\nclassification organizes VLMs into three distinct categories: models dedicated\nto vision-language understanding, models that process multimodal inputs to\ngenerate unimodal (textual) outputs and models that both accept and produce\nmultimodal inputs and outputs.This classification is based on their respective\ncapabilities and functionalities in processing and generating various\nmodalities of data.We meticulously dissect each model, offering an extensive\nanalysis of its foundational architecture, training data sources, as well as\nits strengths and limitations wherever possible, providing readers with a\ncomprehensive understanding of its essential components. We also analyzed the\nperformance of VLMs in various benchmark datasets. By doing so, we aim to offer\na nuanced understanding of the diverse landscape of VLMs. Additionally, we\nunderscore potential avenues for future research in this dynamic domain,\nanticipating further breakthroughs and advancements.\n","authors":["Akash Ghosh","Arkadeep Acharya","Sriparna Saha","Vinija Jain","Aman Chadha"],"pdf_url":"https://arxiv.org/pdf/2404.07214v2.pdf","comment":"The most extensive and up to date Survey on Visual Language Models\n covering 76 Visual Language Models"},{"id":"http://arxiv.org/abs/2312.01117v2","updated":"2024-04-12T21:19:36Z","published":"2023-12-02T12:23:07Z","title":"Paved2Paradise: Cost-Effective and Scalable LiDAR Simulation by\n Factoring the Real World","summary":" To achieve strong real world performance, neural networks must be trained on\nlarge, diverse datasets; however, obtaining and annotating such datasets is\ncostly and time-consuming, particularly for 3D point clouds. In this paper, we\ndescribe Paved2Paradise, a simple, cost-effective approach for generating fully\nlabeled, diverse, and realistic lidar datasets from scratch, all while\nrequiring minimal human annotation. Our key insight is that, by deliberately\ncollecting separate \"background\" and \"object\" datasets (i.e., \"factoring the\nreal world\"), we can intelligently combine them to produce a combinatorially\nlarge and diverse training set. The Paved2Paradise pipeline thus consists of\nfour steps: (1) collecting copious background data, (2) recording individuals\nfrom the desired object class(es) performing different behaviors in an isolated\nenvironment (like a parking lot), (3) bootstrapping labels for the object\ndataset, and (4) generating samples by placing objects at arbitrary locations\nin backgrounds. To demonstrate the utility of Paved2Paradise, we generated\nsynthetic datasets for two tasks: (1) human detection in orchards (a task for\nwhich no public data exists) and (2) pedestrian detection in urban\nenvironments. Qualitatively, we find that a model trained exclusively on\nPaved2Paradise synthetic data is highly effective at detecting humans in\norchards, including when individuals are heavily occluded by tree branches.\nQuantitatively, a model trained on Paved2Paradise data that sources backgrounds\nfrom KITTI performs comparably to a model trained on the actual dataset. These\nresults suggest the Paved2Paradise synthetic data pipeline can help accelerate\npoint cloud model development in sectors where acquiring lidar datasets has\npreviously been cost-prohibitive.\n","authors":["Michael A. 
Alcorn","Noah Schwartz"],"pdf_url":"https://arxiv.org/pdf/2312.01117v2.pdf","comment":"Accepted to the Synthetic Data for Computer Vision workshop at CVPR\n 2024"},{"id":"http://arxiv.org/abs/2404.08814v1","updated":"2024-04-12T21:14:20Z","published":"2024-04-12T21:14:20Z","title":"E3: Ensemble of Expert Embedders for Adapting Synthetic Image Detectors\n to New Generators Using Limited Data","summary":" As generative AI progresses rapidly, new synthetic image generators continue\nto emerge at a swift pace. Traditional detection methods face two main\nchallenges in adapting to these generators: the forensic traces of synthetic\nimages from new techniques can vastly differ from those learned during\ntraining, and access to data for these new generators is often limited. To\naddress these issues, we introduce the Ensemble of Expert Embedders (E3), a\nnovel continual learning framework for updating synthetic image detectors. E3\nenables the accurate detection of images from newly emerged generators using\nminimal training data. Our approach does this by first employing transfer\nlearning to develop a suite of expert embedders, each specializing in the\nforensic traces of a specific generator. Then, all embeddings are jointly\nanalyzed by an Expert Knowledge Fusion Network to produce accurate and reliable\ndetection decisions. Our experiments demonstrate that E3 outperforms existing\ncontinual learning methods, including those developed specifically for\nsynthetic image detection.\n","authors":["Aref Azizpour","Tai D. Nguyen","Manil Shrestha","Kaidi Xu","Edward Kim","Matthew C. Stamm"],"pdf_url":"https://arxiv.org/pdf/2404.08814v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05695v4","updated":"2024-04-12T21:11:16Z","published":"2023-08-10T16:57:14Z","title":"Masked Diffusion as Self-supervised Representation Learner","summary":" Denoising diffusion probabilistic models have recently demonstrated\nstate-of-the-art generative performance and have been used as strong\npixel-level representation learners. This paper decomposes the interrelation\nbetween the generative capability and representation learning ability inherent\nin diffusion models. We present the masked diffusion model (MDM), a scalable\nself-supervised representation learner for semantic segmentation, substituting\nthe conventional additive Gaussian noise of traditional diffusion with a\nmasking mechanism. Our proposed approach convincingly surpasses prior\nbenchmarks, demonstrating remarkable advancements in both medical and natural\nimage semantic segmentation tasks, particularly in few-shot scenarios.\n","authors":["Zixuan Pan","Jianxu Chen","Yiyu Shi"],"pdf_url":"https://arxiv.org/pdf/2308.05695v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07381v2","updated":"2024-04-12T20:41:14Z","published":"2023-12-12T15:57:03Z","title":"ScribblePrompt: Fast and Flexible Interactive Segmentation for Any\n Biomedical Image","summary":" Biomedical image segmentation is a crucial part of both scientific research\nand clinical care. With enough labelled data, deep learning models can be\ntrained to accurately automate specific biomedical image segmentation tasks.\nHowever, manually segmenting images to create training data is highly labor\nintensive and requires domain expertise. We present ScribblePrompt, a flexible\nneural network based interactive segmentation tool for biomedical imaging that\nenables human annotators to segment previously unseen structures using\nscribbles, clicks, and bounding boxes. 
Through rigorous quantitative\nexperiments, we demonstrate that given comparable amounts of interaction,\nScribblePrompt produces more accurate segmentations than previous methods on\ndatasets unseen during training. In a user study with domain experts,\nScribblePrompt reduced annotation time by 28% while improving Dice by 15%\ncompared to the next best method. ScribblePrompt's success rests on a set of\ncareful design decisions. These include a training strategy that incorporates\nboth a highly diverse set of images and tasks, novel algorithms for simulated\nuser interactions and labels, and a network that enables fast inference. We\nshowcase ScribblePrompt in an online demo and provide code at\nhttps://scribbleprompt.csail.mit.edu\n","authors":["Hallee E. Wong","Marianne Rakic","John Guttag","Adrian V. Dalca"],"pdf_url":"https://arxiv.org/pdf/2312.07381v2.pdf","comment":"Project Website: https://scribbleprompt.csail.mit.edu Keywords:\n Interactive Segmentation, Medical Imaging, Segment Anything Model, SAM,\n Scribble Annotations, Prompt"},{"id":"http://arxiv.org/abs/2404.08805v1","updated":"2024-04-12T20:39:19Z","published":"2024-04-12T20:39:19Z","title":"Real-time guidewire tracking and segmentation in intraoperative x-ray","summary":" During endovascular interventions, physicians have to perform accurate and\nimmediate operations based on the available real-time information, such as the\nshape and position of guidewires observed on the fluoroscopic images, haptic\ninformation and the patients' physiological signals. For this purpose,\nreal-time and accurate guidewire segmentation and tracking can enhance the\nvisualization of guidewires and provide visual feedback for physicians during\nthe intervention as well as for robot-assisted interventions. Nevertheless,\nthis task often comes with the challenge of elongated deformable structures\nthat present themselves with low contrast in the noisy fluoroscopic image\nsequences. To address these issues, a two-stage deep learning framework for\nreal-time guidewire segmentation and tracking is proposed. In the first stage,\na Yolov5s detector is trained, using the original X-ray images as well as\nsynthetic ones, which is employed to output the bounding boxes of possible\ntarget guidewires. More importantly, a refinement module based on\nspatiotemporal constraints is incorporated to robustly localize the guidewire\nand remove false detections. In the second stage, a novel and efficient network\nis proposed to segment the guidewire in each detected bounding box. The network\ncontains two major modules, namely a hessian-based enhancement embedding module\nand a dual self-attention module. Quantitative and qualitative evaluations on\nclinical intra-operative images demonstrate that the proposed approach\nsignificantly outperforms our baselines as well as the current state of the art\nand, in comparison, shows higher robustness to low quality images.\n","authors":["Baochang Zhang","Mai Bui","Cheng Wang","Felix Bourier","Heribert Schunkert","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2404.08805v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02344v2","updated":"2024-04-12T20:18:00Z","published":"2024-04-02T22:37:34Z","title":"Generative AI-Based Effective Malware Detection for Embedded Computing\n Systems","summary":" One of the pivotal security threats for the embedded computing systems is\nmalicious software a.k.a malware. 
With efficiency and efficacy, Machine\nLearning (ML) has been widely adopted for malware detection in recent times.\nDespite being efficient, the existing techniques require a tremendous number of\nbenign and malware samples for training and modeling an efficient malware\ndetector. Furthermore, such constraints limit the detection of emerging malware\nsamples due to the lack of sufficient malware samples required for efficient\ntraining. To address such concerns, we introduce a code-aware data generation\ntechnique that generates multiple mutated samples of the limitedly seen malware\nby the devices. Loss minimization ensures that the generated samples closely\nmimic the limitedly seen malware and mitigate the impractical samples. Such\ndeveloped malware is further incorporated into the training set to formulate\nthe model that can efficiently detect the emerging malware despite having\nlimited exposure. The experimental results demonstrates that the proposed\ntechnique achieves an accuracy of 90% in detecting limitedly seen malware,\nwhich is approximately 3x more than the accuracy attained by state-of-the-art\ntechniques.\n","authors":["Sreenitha Kasarapu","Sanket Shukla","Rakibul Hassan","Avesta Sasan","Houman Homayoun","Sai Manoj Pudukotai Dinakarrao"],"pdf_url":"https://arxiv.org/pdf/2404.02344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08799v1","updated":"2024-04-12T20:16:03Z","published":"2024-04-12T20:16:03Z","title":"Semantic Approach to Quantifying the Consistency of Diffusion Model\n Image Generation","summary":" In this study, we identify the need for an interpretable, quantitative score\nof the repeatability, or consistency, of image generation in diffusion models.\nWe propose a semantic approach, using a pairwise mean CLIP (Contrastive\nLanguage-Image Pretraining) score as our semantic consistency score. We applied\nthis metric to compare two state-of-the-art open-source image generation\ndiffusion models, Stable Diffusion XL and PixArt-{\\alpha}, and we found\nstatistically significant differences between the semantic consistency scores\nfor the models. Agreement between the Semantic Consistency Score selected model\nand aggregated human annotations was 94%. We also explored the consistency of\nSDXL and a LoRA-fine-tuned version of SDXL and found that the fine-tuned model\nhad significantly higher semantic consistency in generated images. The Semantic\nConsistency Score proposed here offers a measure of image generation alignment,\nfacilitating the evaluation of model architectures for specific tasks and\naiding in informed decision-making regarding model selection.\n","authors":["Brinnae Bent"],"pdf_url":"https://arxiv.org/pdf/2404.08799v1.pdf","comment":"Accepted to 2024 CVPR 3rd Explainable AI for Computer Vision (XAI4CV)\n Workshop"},{"id":"http://arxiv.org/abs/2403.05297v3","updated":"2024-04-12T20:10:29Z","published":"2024-03-08T13:24:46Z","title":"PEEB: Part-based Image Classifiers with an Explainable and Editable\n Language Bottleneck","summary":" CLIP-based classifiers rely on the prompt containing a {class name} that is\nknown to the text encoder. Therefore, they perform poorly on new classes or the\nclasses whose names rarely appear on the Internet (e.g., scientific names of\nbirds). 
For fine-grained classification, we propose PEEB - an explainable and\neditable classifier to (1) express the class name into a set of text\ndescriptors that describe the visual parts of that class; and (2) match the\nembeddings of the detected parts to their textual descriptors in each class to\ncompute a logit score for classification. In a zero-shot setting where the\nclass names are unknown, PEEB outperforms CLIP by a huge margin (~10x in top-1\naccuracy). Compared to part-based classifiers, PEEB is not only the\nstate-of-the-art (SOTA) on the supervised-learning setting (88.80% and 92.20%\naccuracy on CUB-200 and Dogs-120, respectively) but also the first to enable\nusers to edit the text descriptors to form a new classifier without any\nre-training. Compared to concept bottleneck models, PEEB is also the SOTA in\nboth zero-shot and supervised-learning settings.\n","authors":["Thang M. Pham","Peijie Chen","Tin Nguyen","Seunghyun Yoon","Trung Bui","Anh Totti Nguyen"],"pdf_url":"https://arxiv.org/pdf/2403.05297v3.pdf","comment":"Findings of NAACL 2024 (long paper)"},{"id":"http://arxiv.org/abs/2401.12946v6","updated":"2024-04-12T19:48:27Z","published":"2024-01-23T18:07:07Z","title":"Coverage Axis++: Efficient Inner Point Selection for 3D Shape\n Skeletonization","summary":" We introduce Coverage Axis++, a novel and efficient approach to 3D shape\nskeletonization. The current state-of-the-art approaches for this task often\nrely on the watertightness of the input or suffer from substantial\ncomputational costs, thereby limiting their practicality. To address this\nchallenge, Coverage Axis++ proposes a heuristic algorithm to select skeletal\npoints, offering a high-accuracy approximation of the Medial Axis Transform\n(MAT) while significantly mitigating computational intensity for various shape\nrepresentations. We introduce a simple yet effective strategy that considers\nshape coverage, uniformity, and centrality to derive skeletal points. The\nselection procedure enforces consistency with the shape structure while\nfavoring the dominant medial balls, which thus introduces a compact underlying\nshape representation in terms of MAT. As a result, Coverage Axis++ allows for\nskeletonization for various shape representations (e.g., water-tight meshes,\ntriangle soups, point clouds), specification of the number of skeletal points,\nfew hyperparameters, and highly efficient computation with improved\nreconstruction accuracy. Extensive experiments across a wide range of 3D shapes\nvalidate the efficiency and effectiveness of Coverage Axis++. The code will be\npublicly available once the paper is published.\n","authors":["Zimeng Wang","Zhiyang Dou","Rui Xu","Cheng Lin","Yuan Liu","Xiaoxiao Long","Shiqing Xin","Taku Komura","Xiaoming Yuan","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2401.12946v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08788v1","updated":"2024-04-12T19:29:10Z","published":"2024-04-12T19:29:10Z","title":"Detecting AI-Generated Images via CLIP","summary":" As AI-generated image (AIGI) methods become more powerful and accessible, it\nhas become a critical task to determine if an image is real or AI-generated.\nBecause AIGI lack the signatures of photographs and have their own unique\npatterns, new models are needed to determine if an image is AI-generated. In\nthis paper, we investigate the ability of the Contrastive Language-Image\nPre-training (CLIP) architecture, pre-trained on massive internet-scale data\nsets, to perform this differentiation. 
We fine-tune CLIP on real images and\nAIGI from several generative models, enabling CLIP to determine if an image is\nAI-generated and, if so, determine what generation method was used to create\nit. We show that the fine-tuned CLIP architecture is able to differentiate AIGI\nas well or better than models whose architecture is specifically designed to\ndetect AIGI. Our method will significantly increase access to AIGI-detecting\ntools and reduce the negative effects of AIGI on society, as our CLIP\nfine-tuning procedures require no architecture changes from publicly available\nmodel repositories and consume significantly less GPU resources than other AIGI\ndetection models.\n","authors":["A. G. Moskowitz","T. Gaona","J. Peterson"],"pdf_url":"https://arxiv.org/pdf/2404.08788v1.pdf","comment":"submitted for publication in Machine Vision and Applications"},{"id":"http://arxiv.org/abs/2404.08785v1","updated":"2024-04-12T19:13:42Z","published":"2024-04-12T19:13:42Z","title":"Under pressure: learning-based analog gauge reading in the wild","summary":" We propose an interpretable framework for reading analog gauges that is\ndeployable on real world robotic systems. Our framework splits the reading task\ninto distinct steps, such that we can detect potential failures at each step.\nOur system needs no prior knowledge of the type of gauge or the range of the\nscale and is able to extract the units used. We show that our gauge reading\nalgorithm is able to extract readings with a relative reading error of less\nthan 2%.\n","authors":["Maurits Reitsma","Julian Keller","Kenneth Blomqvist","Roland Siegwart"],"pdf_url":"https://arxiv.org/pdf/2404.08785v1.pdf","comment":"7 pages, 8 figures, accepted for presentation at the 2024 IEEE\n International Conference on Robotics and Automation (ICRA) and for inclusion\n in the conference proceedings, finalist for the IEEE ICRA 2024 Best Paper\n Award in Automation, source code\n https://github.com/ethz-asl/analog_gauge_reader, Autonomous Systems Lab, ETH\n Zurich"},{"id":"http://arxiv.org/abs/2404.08778v1","updated":"2024-04-12T19:04:59Z","published":"2024-04-12T19:04:59Z","title":"Towards Sim-to-Real Industrial Parts Classification with Synthetic\n Dataset","summary":" This paper is about effectively utilizing synthetic data for training deep\nneural networks for industrial parts classification, in particular, by taking\ninto account the domain gap against real-world images. To this end, we\nintroduce a synthetic dataset that may serve as a preliminary testbed for the\nSim-to-Real challenge; it contains 17 objects of six industrial use cases,\nincluding isolated and assembled parts. A few subsets of objects exhibit large\nsimilarities in shape and albedo for reflecting challenging cases of industrial\nparts. All the sample images come with and without random backgrounds and\npost-processing for evaluating the importance of domain randomization. We call\nit Synthetic Industrial Parts dataset (SIP-17). We study the usefulness of\nSIP-17 through benchmarking the performance of five state-of-the-art deep\nnetwork models, supervised and self-supervised, trained only on the synthetic\ndata while testing them on real data. By analyzing the results, we deduce some\ninsights on the feasibility and challenges of using synthetic data for\nindustrial parts classification and for further developing larger-scale\nsynthetic datasets. 
Our dataset and code are publicly available.\n","authors":["Xiaomeng Zhu","Talha Bilal","Pär Mårtensson","Lars Hanson","Mårten Björkman","Atsuto Maki"],"pdf_url":"https://arxiv.org/pdf/2404.08778v1.pdf","comment":"Published in 2023 IEEE/CVF Conference on Computer Vision and Pattern\n Recognition Workshops (CVPRW)"},{"id":"http://arxiv.org/abs/2404.08767v1","updated":"2024-04-12T18:45:51Z","published":"2024-04-12T18:45:51Z","title":"LLM-Seg: Bridging Image Segmentation and Large Language Model Reasoning","summary":" Understanding human instructions to identify the target objects is vital for\nperception systems. In recent years, the advancements of Large Language Models\n(LLMs) have introduced new possibilities for image segmentation. In this work,\nwe delve into reasoning segmentation, a novel task that enables segmentation\nsystem to reason and interpret implicit user intention via large language model\nreasoning and then segment the corresponding target. Our work on reasoning\nsegmentation contributes on both the methodological design and dataset\nlabeling. For the model, we propose a new framework named LLM-Seg. LLM-Seg\neffectively connects the current foundational Segmentation Anything Model and\nthe LLM by mask proposals selection. For the dataset, we propose an automatic\ndata generation pipeline and construct a new reasoning segmentation dataset\nnamed LLM-Seg40K. Experiments demonstrate that our LLM-Seg exhibits competitive\nperformance compared with existing methods. Furthermore, our proposed pipeline\ncan efficiently produce high-quality reasoning segmentation datasets. The\nLLM-Seg40K dataset, developed through this pipeline, serves as a new benchmark\nfor training and evaluating various reasoning segmentation approaches. Our\ncode, models and dataset are at https://github.com/wangjunchi/LLMSeg.\n","authors":["Junchi Wang","Lei Ke"],"pdf_url":"https://arxiv.org/pdf/2404.08767v1.pdf","comment":"Github: https://github.com/wangjunchi/LLMSeg"},{"id":"http://arxiv.org/abs/2404.08761v1","updated":"2024-04-12T18:37:00Z","published":"2024-04-12T18:37:00Z","title":"`Eyes of a Hawk and Ears of a Fox': Part Prototype Network for\n Generalized Zero-Shot Learning","summary":" Current approaches in Generalized Zero-Shot Learning (GZSL) are built upon\nbase models which consider only a single class attribute vector representation\nover the entire image. This is an oversimplification of the process of novel\ncategory recognition, where different regions of the image may have properties\nfrom different seen classes and thus have different predominant attributes.\nWith this in mind, we take a fundamentally different approach: a pre-trained\nVision-Language detector (VINVL) sensitive to attribute information is employed\nto efficiently obtain region features. A learned function maps the region\nfeatures to region-specific attribute attention used to construct class part\nprototypes. We conduct experiments on a popular GZSL benchmark consisting of\nthe CUB, SUN, and AWA2 datasets where our proposed Part Prototype Network (PPN)\nachieves promising results when compared with other popular base models.\nCorresponding ablation studies and analysis show that our approach is highly\npractical and has a distinct advantage over global attribute attention when\nlocalized proposals are available.\n","authors":["Joshua Feinglass","Jayaraman J. Thiagarajan","Rushil Anirudh","T. S. 
Jayram","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2404.08761v1.pdf","comment":"Accepted to the CVPR 2024 LIMIT Workshop"},{"id":"http://arxiv.org/abs/2312.04552v2","updated":"2024-04-12T18:34:31Z","published":"2023-12-07T18:59:20Z","title":"Generating Illustrated Instructions","summary":" We introduce the new task of generating Illustrated Instructions, i.e.,\nvisual instructions customized to a user's needs. We identify desiderata unique\nto this task, and formalize it through a suite of automatic and human\nevaluation metrics, designed to measure the validity, consistency, and efficacy\nof the generations. We combine the power of large language models (LLMs)\ntogether with strong text-to-image generation diffusion models to propose a\nsimple approach called StackedDiffusion, which generates such illustrated\ninstructions given text as input. The resulting model strongly outperforms\nbaseline approaches and state-of-the-art multimodal LLMs; and in 30% of cases,\nusers even prefer it to human-generated articles. Most notably, it enables\nvarious new and exciting applications far beyond what static articles on the\nweb can provide, such as personalized instructions complete with intermediate\nsteps and pictures in response to a user's individual situation.\n","authors":["Sachit Menon","Ishan Misra","Rohit Girdhar"],"pdf_url":"https://arxiv.org/pdf/2312.04552v2.pdf","comment":"Accepted to CVPR 2024. Project website:\n http://facebookresearch.github.io/IllustratedInstructions. Code reproduction:\n https://github.com/sachit-menon/generating-illustrated-instructions-reproduction"},{"id":"http://arxiv.org/abs/2404.08756v1","updated":"2024-04-12T18:29:10Z","published":"2024-04-12T18:29:10Z","title":"SCOUT+: Towards Practical Task-Driven Drivers' Gaze Prediction","summary":" Accurate prediction of drivers' gaze is an important component of\nvision-based driver monitoring and assistive systems. Of particular interest\nare safety-critical episodes, such as performing maneuvers or crossing\nintersections. In such scenarios, drivers' gaze distribution changes\nsignificantly and becomes difficult to predict, especially if the task and\ncontext information is represented implicitly, as is common in many\nstate-of-the-art models. However, explicit modeling of top-down factors\naffecting drivers' attention often requires additional information and\nannotations that may not be readily available.\n In this paper, we address the challenge of effective modeling of task and\ncontext with common sources of data for use in practical systems. To this end,\nwe introduce SCOUT+, a task- and context-aware model for drivers' gaze\nprediction, which leverages route and map information inferred from commonly\navailable GPS data. We evaluate our model on two datasets, DR(eye)VE and BDD-A,\nand demonstrate that using maps improves results compared to bottom-up models\nand reaches performance comparable to the top-down model SCOUT which relies on\nprivileged ground truth information. Code is available at\nhttps://github.com/ykotseruba/SCOUT.\n","authors":["Iuliia Kotseruba","John K. 
Tsotsos"],"pdf_url":"https://arxiv.org/pdf/2404.08756v1.pdf","comment":"Accepted at IEEE Intelligent Vehicles Symposium (IV), 2024"},{"id":"http://arxiv.org/abs/2404.08755v1","updated":"2024-04-12T18:28:44Z","published":"2024-04-12T18:28:44Z","title":"Training a Vision Language Model as Smartphone Assistant","summary":" Addressing the challenge of a digital assistant capable of executing a wide\narray of user tasks, our research focuses on the realm of instruction-based\nmobile device control. We leverage recent advancements in large language models\n(LLMs) and present a visual language model (VLM) that can fulfill diverse tasks\non mobile devices. Our model functions by interacting solely with the user\ninterface (UI). It uses the visual input from the device screen and mimics\nhuman-like interactions, encompassing gestures such as tapping and swiping.\nThis generality in the input and output space allows our agent to interact with\nany application on the device. Unlike previous methods, our model operates not\nonly on a single screen image but on vision-language sentences created from\nsequences of past screenshots along with corresponding actions. Evaluating our\nmethod on the challenging Android in the Wild benchmark demonstrates its\npromising efficacy and potential.\n","authors":["Nicolai Dorka","Janusz Marecki","Ammar Anwar"],"pdf_url":"https://arxiv.org/pdf/2404.08755v1.pdf","comment":"ICLR 2024 workshop on Generative Models for Decision Making"},{"id":"http://arxiv.org/abs/2404.08749v1","updated":"2024-04-12T18:23:00Z","published":"2024-04-12T18:23:00Z","title":"Data Limitations for Modeling Top-Down Effects on Drivers' Attention","summary":" Driving is a visuomotor task, i.e., there is a connection between what\ndrivers see and what they do. While some models of drivers' gaze account for\ntop-down effects of drivers' actions, the majority learn only bottom-up\ncorrelations between human gaze and driving footage. The crux of the problem is\nlack of public data with annotations that could be used to train top-down\nmodels and evaluate how well models of any kind capture effects of task on\nattention. As a result, top-down models are trained and evaluated on private\ndata and public benchmarks measure only the overall fit to human data.\n In this paper, we focus on data limitations by examining four large-scale\npublic datasets, DR(eye)VE, BDD-A, MAAD, and LBW, used to train and evaluate\nalgorithms for drivers' gaze prediction. We define a set of driving tasks\n(lateral and longitudinal maneuvers) and context elements (intersections and\nright-of-way) known to affect drivers' attention, augment the datasets with\nannotations based on the said definitions, and analyze the characteristics of\ndata recording and processing pipelines w.r.t. capturing what the drivers see\nand do. In sum, the contributions of this work are: 1) quantifying biases of\nthe public datasets, 2) examining performance of the SOTA bottom-up models on\nsubsets of the data involving non-trivial drivers' actions, 3) linking\nshortcomings of the bottom-up models to data limitations, and 4)\nrecommendations for future data collection and processing. The new annotations\nand code for reproducing the results is available at\nhttps://github.com/ykotseruba/SCOUT.\n","authors":["Iuliia Kotseruba","John K. 
Tsotsos"],"pdf_url":"https://arxiv.org/pdf/2404.08749v1.pdf","comment":"Accepted at IEEE Intelligent Vehicles Symposium (IV), 2024"},{"id":"http://arxiv.org/abs/2404.08748v1","updated":"2024-04-12T18:21:08Z","published":"2024-04-12T18:21:08Z","title":"Multi-Branch Generative Models for Multichannel Imaging with an\n Application to PET/CT Joint Reconstruction","summary":" This paper presents a proof-of-concept approach for learned synergistic\nreconstruction of medical images using multi-branch generative models.\nLeveraging variational autoencoders (VAEs) and generative adversarial networks\n(GANs), our models learn from pairs of images simultaneously, enabling\neffective denoising and reconstruction. Synergistic image reconstruction is\nachieved by incorporating the trained models in a regularizer that evaluates\nthe distance between the images and the model, in a similar fashion to\nmultichannel dictionary learning (DiL). We demonstrate the efficacy of our\napproach on both Modified National Institute of Standards and Technology\n(MNIST) and positron emission tomography (PET)/computed tomography (CT)\ndatasets, showcasing improved image quality and information sharing between\nmodalities. Despite challenges such as patch decomposition and model\nlimitations, our results underscore the potential of generative models for\nenhancing medical imaging reconstruction.\n","authors":["Noel Jeffrey Pinton","Alexandre Bousse","Catherine Cheze-Le-Rest","Dimitris Visvikis"],"pdf_url":"https://arxiv.org/pdf/2404.08748v1.pdf","comment":"12 pages, 16 figures, submitted to IEEE TRPMS"},{"id":"http://arxiv.org/abs/2310.09275v3","updated":"2024-04-12T18:10:51Z","published":"2023-10-13T17:38:41Z","title":"Understanding and Modeling the Effects of Task and Context on Drivers'\n Gaze Allocation","summary":" To further advance driver monitoring and assistance systems, it is important\nto understand how drivers allocate their attention, in other words, where do\nthey tend to look and why. Traditionally, factors affecting human visual\nattention have been divided into bottom-up (involuntary attraction to salient\nregions) and top-down (driven by the demands of the task being performed).\nAlthough both play a role in directing drivers' gaze, most of the existing\nmodels for drivers' gaze prediction apply techniques developed for bottom-up\nsaliency and do not consider influences of the drivers' actions explicitly.\nLikewise, common driving attention benchmarks lack relevant annotations for\ndrivers' actions and the context in which they are performed. Therefore, to\nenable analysis and modeling of these factors for drivers' gaze prediction, we\npropose the following: 1) we correct the data processing pipeline used in\nDR(eye)VE to reduce noise in the recorded gaze data; 2) we then add per-frame\nlabels for driving task and context; 3) we benchmark a number of baseline and\nSOTA models for saliency and driver gaze prediction and use new annotations to\nanalyze how their performance changes in scenarios involving different tasks;\nand, lastly, 4) we develop a novel model that modulates drivers' gaze\nprediction with explicit action and context information. 
While reducing noise\nin the DR(eye)VE gaze data improves results of all models, we show that using\ntask information in our proposed model boosts performance even further compared\nto bottom-up models on the cleaned up data, both overall (by 24% KLD and 89%\nNSS) and on scenarios that involve performing safety-critical maneuvers and\ncrossing intersections (by up to 10--30% KLD). Extended annotations and code\nare available at https://github.com/ykotseruba/SCOUT.\n","authors":["Iuliia Kotseruba","John K. Tsotsos"],"pdf_url":"https://arxiv.org/pdf/2310.09275v3.pdf","comment":"Accepted at IEEE Intelligent Vehicles Symposium (IV), 2024"},{"id":"http://arxiv.org/abs/2404.10534v1","updated":"2024-04-12T21:41:50Z","published":"2024-04-12T21:41:50Z","title":"Into the Fog: Evaluating Multiple Object Tracking Robustness","summary":" State-of-the-art (SOTA) trackers have shown remarkable Multiple Object\nTracking (MOT) performance when trained and evaluated on current benchmarks.\nHowever, these benchmarks primarily consist of clear scenarios, overlooking\nadverse atmospheric conditions such as fog, haze, smoke and dust. As a result,\nthe robustness of SOTA trackers remains underexplored. To address these\nlimitations, we propose a pipeline for physic-based volumetric fog simulation\nin arbitrary real-world MOT dataset utilizing frame-by-frame monocular depth\nestimation and a fog formation optical model. Moreover, we enhance our\nsimulation by rendering of both homogeneous and heterogeneous fog effects. We\npropose to use the dark channel prior method to estimate fog (smoke) color,\nwhich shows promising results even in night and indoor scenes. We present the\nleading tracking benchmark MOTChallenge (MOT17 dataset) overlaid by fog (smoke\nfor indoor scenes) of various intensity levels and conduct a comprehensive\nevaluation of SOTA MOT methods, revealing their limitations under fog and\nfog-similar challenges.\n","authors":["Nadezda Kirillova","M. Jehanzeb Mirza","Horst Possegger","Horst Bischof"],"pdf_url":"https://arxiv.org/pdf/2404.10534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10540v1","updated":"2024-04-12T20:40:12Z","published":"2024-04-12T20:40:12Z","title":"SEVD: Synthetic Event-based Vision Dataset for Ego and Fixed Traffic\n Perception","summary":" Recently, event-based vision sensors have gained attention for autonomous\ndriving applications, as conventional RGB cameras face limitations in handling\nchallenging dynamic conditions. However, the availability of real-world and\nsynthetic event-based vision datasets remains limited. In response to this gap,\nwe present SEVD, a first-of-its-kind multi-view ego, and fixed perception\nsynthetic event-based dataset using multiple dynamic vision sensors within the\nCARLA simulator. Data sequences are recorded across diverse lighting (noon,\nnighttime, twilight) and weather conditions (clear, cloudy, wet, rainy, foggy)\nwith domain shifts (discrete and continuous). SEVD spans urban, suburban,\nrural, and highway scenes featuring various classes of objects (car, truck,\nvan, bicycle, motorcycle, and pedestrian). Alongside event data, SEVD includes\nRGB imagery, depth maps, optical flow, semantic, and instance segmentation,\nfacilitating a comprehensive understanding of the scene. Furthermore, we\nevaluate the dataset using state-of-the-art event-based (RED, RVT) and\nframe-based (YOLOv8) methods for traffic participant detection tasks and\nprovide baseline benchmarks for assessment. 
Additionally, we conduct\nexperiments to assess the synthetic event-based dataset's generalization\ncapabilities. The dataset is available at\nhttps://eventbasedvision.github.io/SEVD\n","authors":["Manideep Reddy Aliminati","Bharatesh Chakravarthi","Aayush Atul Verma","Arpitsinh Vaghela","Hua Wei","Xuesong Zhou","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2404.10540v1.pdf","comment":null}]},"2024-04-15T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.09752v1","updated":"2024-04-15T12:53:48Z","published":"2024-04-15T12:53:48Z","title":"Can We Break Free from Strong Data Augmentations in Self-Supervised\n Learning?","summary":" Self-supervised learning (SSL) has emerged as a promising solution for\naddressing the challenge of limited labeled data in deep neural networks\n(DNNs), offering scalability potential. However, the impact of design\ndependencies within the SSL framework remains insufficiently investigated. In\nthis study, we comprehensively explore SSL behavior across a spectrum of\naugmentations, revealing their crucial role in shaping SSL model performance\nand learning mechanisms. Leveraging these insights, we propose a novel learning\napproach that integrates prior knowledge, with the aim of curtailing the need\nfor extensive data augmentations and thereby amplifying the efficacy of learned\nrepresentations. Notably, our findings underscore that SSL models imbued with\nprior knowledge exhibit reduced texture bias, diminished reliance on shortcuts\nand augmentations, and improved robustness against both natural and adversarial\ncorruptions. These findings not only illuminate a new direction in SSL\nresearch, but also pave the way for enhancing DNN performance while\nconcurrently alleviating the imperative for intensive data augmentation,\nthereby enhancing scalability and real-world problem-solving capabilities.\n","authors":["Shruthi Gowda","Elahe Arani","Bahram Zonooz"],"pdf_url":"https://arxiv.org/pdf/2404.09752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09748v1","updated":"2024-04-15T12:50:44Z","published":"2024-04-15T12:50:44Z","title":"LetsGo: Large-Scale Garage Modeling and Rendering via LiDAR-Assisted\n Gaussian Primitives","summary":" Large garages are ubiquitous yet intricate scenes in our daily lives, posing\nchallenges characterized by monotonous colors, repetitive patterns, reflective\nsurfaces, and transparent vehicle glass. Conventional Structure from Motion\n(SfM) methods for camera pose estimation and 3D reconstruction fail in these\nenvironments due to poor correspondence construction. To address these\nchallenges, this paper introduces LetsGo, a LiDAR-assisted Gaussian splatting\napproach for large-scale garage modeling and rendering. We develop a handheld\nscanner, Polar, equipped with IMU, LiDAR, and a fisheye camera, to facilitate\naccurate LiDAR and image data scanning. With this Polar device, we present a\nGarageWorld dataset consisting of five expansive garage scenes with diverse\ngeometric structures and will release the dataset to the community for further\nresearch. We demonstrate that the collected LiDAR point cloud by the Polar\ndevice enhances a suite of 3D Gaussian splatting algorithms for garage scene\nmodeling and rendering. We also propose a novel depth regularizer for 3D\nGaussian splatting algorithm training, effectively eliminating floating\nartifacts in rendered images, and a lightweight Level of Detail (LOD) Gaussian\nrenderer for real-time viewing on web-based devices. 
Additionally, we explore a\nhybrid representation that combines the advantages of traditional mesh in\ndepicting simple geometry and colors (e.g., walls and the ground) with modern\n3D Gaussian representations capturing complex details and high-frequency\ntextures. This strategy achieves an optimal balance between memory performance\nand rendering quality. Experimental results on our dataset, along with\nScanNet++ and KITTI-360, demonstrate the superiority of our method in rendering\nquality and resource efficiency.\n","authors":["Jiadi Cui","Junming Cao","Yuhui Zhong","Liao Wang","Fuqiang Zhao","Penghao Wang","Yifan Chen","Zhipeng He","Lan Xu","Yujiao Shi","Yingliang Zhang","Jingyi Yu"],"pdf_url":"https://arxiv.org/pdf/2404.09748v1.pdf","comment":"Project Page: https://jdtsui.github.io/letsgo/"},{"id":"http://arxiv.org/abs/2404.09736v1","updated":"2024-04-15T12:37:26Z","published":"2024-04-15T12:37:26Z","title":"FSRT: Facial Scene Representation Transformer for Face Reenactment from\n Factorized Appearance, Head-pose, and Facial Expression Features","summary":" The task of face reenactment is to transfer the head motion and facial\nexpressions from a driving video to the appearance of a source image, which may\nbe of a different person (cross-reenactment). Most existing methods are\nCNN-based and estimate optical flow from the source image to the current\ndriving frame, which is then inpainted and refined to produce the output\nanimation. We propose a transformer-based encoder for computing a set-latent\nrepresentation of the source image(s). We then predict the output color of a\nquery pixel using a transformer-based decoder, which is conditioned with\nkeypoints and a facial expression vector extracted from the driving frame.\nLatent representations of the source person are learned in a self-supervised\nmanner that factorize their appearance, head pose, and facial expressions.\nThus, they are perfectly suited for cross-reenactment. In contrast to most\nrelated work, our method naturally extends to multiple source images and can\nthus adapt to person-specific facial dynamics. We also propose data\naugmentation and regularization schemes that are necessary to prevent\noverfitting and support generalizability of the learned representations. We\nevaluated our approach in a randomized user study. The results indicate\nsuperior performance compared to the state-of-the-art in terms of motion\ntransfer quality and temporal consistency.\n","authors":["Andre Rochow","Max Schwarz","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2404.09736v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09735v1","updated":"2024-04-15T12:35:10Z","published":"2024-04-15T12:35:10Z","title":"Equipping Diffusion Models with Differentiable Spatial Entropy for\n Low-Light Image Enhancement","summary":" Image restoration, which aims to recover high-quality images from their\ncorrupted counterparts, often faces the challenge of being an ill-posed problem\nthat allows multiple solutions for a single input. 
However, most deep learning\nbased works simply employ l1 loss to train their network in a deterministic\nway, resulting in over-smoothed predictions with inferior perceptual quality.\nIn this work, we propose a novel method that shifts the focus from a\ndeterministic pixel-by-pixel comparison to a statistical perspective,\nemphasizing the learning of distributions rather than individual pixel values.\nThe core idea is to introduce spatial entropy into the loss function to measure\nthe distribution difference between predictions and targets. To make this\nspatial entropy differentiable, we employ kernel density estimation (KDE) to\napproximate the probabilities for specific intensity values of each pixel with\ntheir neighbor areas. Specifically, we equip the entropy with diffusion models\nand aim for superior accuracy and enhanced perceptual quality over l1 based\nnoise matching loss. In the experiments, we evaluate the proposed method for\nlow light enhancement on two datasets and the NTIRE challenge 2024. All these\nresults illustrate the effectiveness of our statistic-based entropy loss. Code\nis available at https://github.com/shermanlian/spatial-entropy-loss.\n","authors":["Wenyi Lian","Wenjing Lian","Ziwei Luo"],"pdf_url":"https://arxiv.org/pdf/2404.09735v1.pdf","comment":"CVPRW 2024, best LPIPS in the NTIRE low light enhancement challenge\n 2024"},{"id":"http://arxiv.org/abs/2404.09732v1","updated":"2024-04-15T12:34:21Z","published":"2024-04-15T12:34:21Z","title":"Photo-Realistic Image Restoration in the Wild with Controlled\n Vision-Language Models","summary":" Though diffusion models have been successfully applied to various image\nrestoration (IR) tasks, their performance is sensitive to the choice of\ntraining datasets. Typically, diffusion models trained in specific datasets\nfail to recover images that have out-of-distribution degradations. To address\nthis problem, this work leverages a capable vision-language model and a\nsynthetic degradation pipeline to learn image restoration in the wild (wild\nIR). More specifically, all low-quality images are simulated with a synthetic\ndegradation pipeline that contains multiple common degradations such as blur,\nresize, noise, and JPEG compression. Then we introduce robust training for a\ndegradation-aware CLIP model to extract enriched image content features to\nassist high-quality image restoration. Our base diffusion model is the image\nrestoration SDE (IR-SDE). Built upon it, we further present a posterior\nsampling strategy for fast noise-free image generation. We evaluate our model\non both synthetic and real-world degradation datasets. Moreover, experiments on\nthe unified image restoration task illustrate that the proposed posterior\nsampling improves image generation quality for various degradations.\n","authors":["Ziwei Luo","Fredrik K. Gustafsson","Zheng Zhao","Jens Sjölund","Thomas B. Schön"],"pdf_url":"https://arxiv.org/pdf/2404.09732v1.pdf","comment":"CVPRW 2024; Code: https://github.com/Algolzw/daclip-uir"},{"id":"http://arxiv.org/abs/2404.06913v2","updated":"2024-04-15T12:27:51Z","published":"2024-04-10T11:06:29Z","title":"Sparse Global Matching for Video Frame Interpolation with Large Motion","summary":" Large motion poses a critical challenge in Video Frame Interpolation (VFI)\ntask. 
Existing methods are often constrained by limited receptive fields,\nresulting in sub-optimal performance when handling scenarios with large motion.\nIn this paper, we introduce a new pipeline for VFI, which can effectively\nintegrate global-level information to alleviate issues associated with large\nmotion. Specifically, we first estimate a pair of initial intermediate flows\nusing a high-resolution feature map for extracting local details. Then, we\nincorporate a sparse global matching branch to compensate for flow estimation,\nwhich consists of identifying flaws in initial flows and generating sparse flow\ncompensation with a global receptive field. Finally, we adaptively merge the\ninitial flow estimation with global flow compensation, yielding a more accurate\nintermediate flow. To evaluate the effectiveness of our method in handling\nlarge motion, we carefully curate a more challenging subset from commonly used\nbenchmarks. Our method demonstrates the state-of-the-art performance on these\nVFI subsets with large motion.\n","authors":["Chunxu Liu","Guozhen Zhang","Rui Zhao","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06913v2.pdf","comment":"Accepted by CVPR 2024. Project page: https://sgm-vfi.github.io/.\n Fixed some typos in the supplementary material"},{"id":"http://arxiv.org/abs/2402.06611v2","updated":"2024-04-15T12:13:42Z","published":"2024-02-09T18:42:30Z","title":"Image-based Deep Learning for the time-dependent prediction of fresh\n concrete properties","summary":" Increasing the degree of digitisation and automation in the concrete\nproduction process can play a crucial role in reducing the CO$_2$ emissions\nthat are associated with the production of concrete. In this paper, a method is\npresented that makes it possible to predict the properties of fresh concrete\nduring the mixing process based on stereoscopic image sequences of the\nconcretes flow behaviour. A Convolutional Neural Network (CNN) is used for the\nprediction, which receives the images supported by information on the mix\ndesign as input. In addition, the network receives temporal information in the\nform of the time difference between the time at which the images are taken and\nthe time at which the reference values of the concretes are carried out. With\nthis temporal information, the network implicitly learns the time-dependent\nbehaviour of the concretes properties. The network predicts the slump flow\ndiameter, the yield stress and the plastic viscosity. The time-dependent\nprediction potentially opens up the pathway to determine the temporal\ndevelopment of the fresh concrete properties already during mixing. This\nprovides a huge advantage for the concrete industry. As a result,\ncountermeasures can be taken in a timely manner. It is shown that an approach\nbased on depth and optical flow images, supported by information of the mix\ndesign, achieves the best results.\n","authors":["Max Meyer","Amadeus Langer","Max Mehltretter","Dries Beyer","Max Coenen","Tobias Schack","Michael Haist","Christian Heipke"],"pdf_url":"https://arxiv.org/pdf/2402.06611v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08212v2","updated":"2024-04-15T12:08:41Z","published":"2024-01-16T08:56:52Z","title":"Human vs. 
LMMs: Exploring the Discrepancy in Emoji Interpretation and\n Usage in Digital Communication","summary":" Leveraging Large Multimodal Models (LMMs) to simulate human behaviors when\nprocessing multimodal information, especially in the context of social media,\nhas garnered immense interest due to its broad potential and far-reaching\nimplications. Emojis, as one of the most unique aspects of digital\ncommunication, are pivotal in enriching and often clarifying the emotional and\ntonal dimensions. Yet, there is a notable gap in understanding how these\nadvanced models, such as GPT-4V, interpret and employ emojis in the nuanced\ncontext of online interaction. This study intends to bridge this gap by\nexamining the behavior of GPT-4V in replicating human-like use of emojis. The\nfindings reveal a discernible discrepancy between human and GPT-4V behaviors,\nlikely due to the subjective nature of human interpretation and the limitations\nof GPT-4V's English-centric training, suggesting cultural biases and inadequate\nrepresentation of non-English cultures.\n","authors":["Hanjia Lyu","Weihong Qi","Zhongyu Wei","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2401.08212v2.pdf","comment":"Accepted for publication in ICWSM 2024"},{"id":"http://arxiv.org/abs/2404.09707v1","updated":"2024-04-15T12:06:00Z","published":"2024-04-15T12:06:00Z","title":"Adaptive Patching for High-resolution Image Segmentation with\n Transformers","summary":" Attention-based models are proliferating in the space of image analytics,\nincluding segmentation. The standard method of feeding images to transformer\nencoders is to divide the images into patches and then feed the patches to the\nmodel as a linear sequence of tokens. For high-resolution images, e.g.\nmicroscopic pathology images, the quadratic compute and memory cost prohibits\nthe use of an attention-based model, if we are to use smaller patch sizes that\nare favorable in segmentation. The solution is to either use custom complex\nmulti-resolution models or approximate attention schemes. We take inspiration\nfrom Adapative Mesh Refinement (AMR) methods in HPC by adaptively patching the\nimages, as a pre-processing step, based on the image details to reduce the\nnumber of patches being fed to the model, by orders of magnitude. This method\nhas a negligible overhead, and works seamlessly with any attention-based model,\ni.e. it is a pre-processing step that can be adopted by any attention-based\nmodel without friction. We demonstrate superior segmentation quality over SoTA\nsegmentation models for real-world pathology datasets while gaining a geomean\nspeedup of $6.9\\times$ for resolutions up to $64K^2$, on up to $2,048$ GPUs.\n","authors":["Enzhi Zhang","Isaac Lyngaas","Peng Chen","Xiao Wang","Jun Igarashi","Yuankai Huo","Mohamed Wahib","Masaharu Munetomo"],"pdf_url":"https://arxiv.org/pdf/2404.09707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09697v1","updated":"2024-04-15T11:59:19Z","published":"2024-04-15T11:59:19Z","title":"HSIDMamba: Exploring Bidirectional State-Space Models for Hyperspectral\n Denoising","summary":" Effectively discerning spatial-spectral dependencies in HSI denoising is\ncrucial, but prevailing methods using convolution or transformers still face\ncomputational efficiency limitations. Recently, the emerging Selective State\nSpace Model(Mamba) has risen with its nearly linear computational complexity in\nprocessing natural language sequences, which inspired us to explore its\npotential in handling long spectral sequences. 
In this paper, we propose\nHSIDMamba(HSDM), tailored to exploit the linear complexity for effectively\ncapturing spatial-spectral dependencies in HSI denoising. In particular, HSDM\ncomprises multiple Hyperspectral Continuous Scan Blocks, incorporating\nBCSM(Bidirectional Continuous Scanning Mechanism), scale residual, and spectral\nattention mechanisms to enhance the capture of long-range and local\nspatial-spectral information. BCSM strengthens spatial-spectral interactions by\nlinking forward and backward scans and enhancing information from eight\ndirections through SSM, significantly enhancing the perceptual capability of\nHSDM and improving denoising performance more effectively. Extensive\nevaluations against HSI denoising benchmarks validate the superior performance\nof HSDM, achieving state-of-the-art results in performance and surpassing the\nefficiency of the latest transformer architectures by $30\\%$.\n","authors":["Yang Liu","Jiahua Xiao","Yu Guo","Peilin Jiang","Haiwei Yang","Fei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.09697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09692v1","updated":"2024-04-15T11:46:24Z","published":"2024-04-15T11:46:24Z","title":"XoFTR: Cross-modal Feature Matching Transformer","summary":" We introduce, XoFTR, a cross-modal cross-view method for local feature\nmatching between thermal infrared (TIR) and visible images. Unlike visible\nimages, TIR images are less susceptible to adverse lighting and weather\nconditions but present difficulties in matching due to significant texture and\nintensity differences. Current hand-crafted and learning-based methods for\nvisible-TIR matching fall short in handling viewpoint, scale, and texture\ndiversities. To address this, XoFTR incorporates masked image modeling\npre-training and fine-tuning with pseudo-thermal image augmentation to handle\nthe modality differences. Additionally, we introduce a refined matching\npipeline that adjusts for scale discrepancies and enhances match reliability\nthrough sub-pixel level refinement. To validate our approach, we collect a\ncomprehensive visible-thermal dataset, and show that our method outperforms\nexisting methods on many benchmarks.\n","authors":["Önder Tuzcuoğlu","Aybora Köksal","Buğra Sofu","Sinan Kalkan","A. Aydın Alatan"],"pdf_url":"https://arxiv.org/pdf/2404.09692v1.pdf","comment":"CVPR Image Matching Workshop, 2024. 12 pages, 7 figures, 5 tables.\n Codes and dataset are available at https://github.com/OnderT/XoFTR"},{"id":"http://arxiv.org/abs/2404.09690v1","updated":"2024-04-15T11:45:30Z","published":"2024-04-15T11:45:30Z","title":"Harnessing GPT-4V(ision) for Insurance: A Preliminary Exploration","summary":" The emergence of Large Multimodal Models (LMMs) marks a significant milestone\nin the development of artificial intelligence. Insurance, as a vast and complex\ndiscipline, involves a wide variety of data forms in its operational processes,\nincluding text, images, and videos, thereby giving rise to diverse multimodal\ntasks. Despite this, there has been limited systematic exploration of\nmultimodal tasks specific to insurance, nor a thorough investigation into how\nLMMs can address these challenges. In this paper, we explore GPT-4V's\ncapabilities in the insurance domain. We categorize multimodal tasks by\nfocusing primarily on visual aspects based on types of insurance (e.g., auto,\nhousehold/commercial property, health, and agricultural insurance) and\ninsurance stages (e.g., risk assessment, risk monitoring, and claims\nprocessing). 
Our experiment reveals that GPT-4V exhibits remarkable abilities\nin insurance-related tasks, demonstrating not only a robust understanding of\nmultimodal content in the insurance domain but also a comprehensive knowledge\nof insurance scenarios. However, there are notable shortcomings: GPT-4V\nstruggles with detailed risk rating and loss assessment, suffers from\nhallucination in image understanding, and shows variable support for different\nlanguages. Through this work, we aim to bridge the insurance domain with\ncutting-edge LMM technology, facilitate interdisciplinary exchange and\ndevelopment, and provide a foundation for the continued advancement and\nevolution of future research endeavors.\n","authors":["Chenwei Lin","Hanjia Lyu","Jiebo Luo","Xian Xu"],"pdf_url":"https://arxiv.org/pdf/2404.09690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12796v3","updated":"2024-04-15T11:40:39Z","published":"2023-11-21T18:59:58Z","title":"Physics-guided Shape-from-Template: Monocular Video Perception through\n Neural Surrogate Models","summary":" 3D reconstruction of dynamic scenes is a long-standing problem in computer\ngraphics and increasingly difficult the less information is available.\nShape-from-Template (SfT) methods aim to reconstruct a template-based geometry\nfrom RGB images or video sequences, often leveraging just a single monocular\ncamera without depth information, such as regular smartphone recordings.\nUnfortunately, existing reconstruction methods are either unphysical and noisy\nor slow in optimization. To solve this problem, we propose a novel SfT\nreconstruction algorithm for cloth using a pre-trained neural surrogate model\nthat is fast to evaluate, stable, and produces smooth reconstructions due to a\nregularizing physics simulation. Differentiable rendering of the simulated mesh\nenables pixel-wise comparisons between the reconstruction and a target video\nsequence that can be used for a gradient-based optimization procedure to\nextract not only shape information but also physical parameters such as\nstretching, shearing, or bending stiffness of the cloth. This allows to retain\na precise, stable, and smooth reconstructed geometry while reducing the runtime\nby a factor of 400-500 compared to $\\phi$-SfT, a state-of-the-art physics-based\nSfT approach.\n","authors":["David Stotko","Nils Wandel","Reinhard Klein"],"pdf_url":"https://arxiv.org/pdf/2311.12796v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09683v1","updated":"2024-04-15T11:36:31Z","published":"2024-04-15T11:36:31Z","title":"Post-Training Network Compression for 3D Medical Image Segmentation:\n Reducing Computational Efforts via Tucker Decomposition","summary":" We address the computational barrier of deploying advanced deep learning\nsegmentation models in clinical settings by studying the efficacy of network\ncompression through tensor decomposition. We propose a post-training Tucker\nfactorization that enables the decomposition of pre-existing models to reduce\ncomputational requirements without impeding segmentation accuracy. We applied\nTucker decomposition to the convolutional kernels of the TotalSegmentator (TS)\nmodel, an nnU-Net model trained on a comprehensive dataset for automatic\nsegmentation of 117 anatomical structures. Our approach reduced the\nfloating-point operations (FLOPs) and memory required during inference,\noffering an adjustable trade-off between computational efficiency and\nsegmentation quality. 
This study utilized the publicly available TS dataset,\nemploying various downsampling factors to explore the relationship between\nmodel size, inference speed, and segmentation performance. The application of\nTucker decomposition to the TS model substantially reduced the model parameters\nand FLOPs across various compression rates, with limited loss in segmentation\naccuracy. We removed up to 88% of the model's parameters with no significant\nperformance changes in the majority of classes after fine-tuning. Practical\nbenefits varied across different graphics processing unit (GPU) architectures,\nwith more distinct speed-ups on less powerful hardware. Post-hoc network\ncompression via Tucker decomposition presents a viable strategy for reducing\nthe computational demand of medical image segmentation models without\nsubstantially sacrificing accuracy. This approach enables the broader adoption\nof advanced deep learning technologies in clinical practice, offering a way to\nnavigate the constraints of hardware capabilities.\n","authors":["Tobias Weber","Jakob Dexl","David Rügamer","Michael Ingrisch"],"pdf_url":"https://arxiv.org/pdf/2404.09683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00368v2","updated":"2024-04-15T11:18:00Z","published":"2024-03-30T13:41:57Z","title":"Towards Variable and Coordinated Holistic Co-Speech Motion Generation","summary":" This paper addresses the problem of generating lifelike holistic co-speech\nmotions for 3D avatars, focusing on two key aspects: variability and\ncoordination. Variability allows the avatar to exhibit a wide range of motions\neven with similar speech content, while coordination ensures a harmonious\nalignment among facial expressions, hand gestures, and body poses. We aim to\nachieve both with ProbTalk, a unified probabilistic framework designed to\njointly model facial, hand, and body movements in speech. ProbTalk builds on\nthe variational autoencoder (VAE) architecture and incorporates three core\ndesigns. First, we introduce product quantization (PQ) to the VAE, which\nenriches the representation of complex holistic motion. Second, we devise a\nnovel non-autoregressive model that embeds 2D positional encoding into the\nproduct-quantized representation, thereby preserving essential structure\ninformation of the PQ codes. Last, we employ a secondary stage to refine the\npreliminary prediction, further sharpening the high-frequency details. Coupling\nthese three designs enables ProbTalk to generate natural and diverse holistic\nco-speech motions, outperforming several state-of-the-art methods in\nqualitative and quantitative evaluations, particularly in terms of realism. Our\ncode and model will be released for research purposes at\nhttps://feifeifeiliu.github.io/probtalk/.\n","authors":["Yifei Liu","Qiong Cao","Yandong Wen","Huaiguang Jiang","Changxing Ding"],"pdf_url":"https://arxiv.org/pdf/2404.00368v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.00362v2","updated":"2024-04-15T11:03:06Z","published":"2023-12-01T05:59:08Z","title":"Dancing with Still Images: Video Distillation via Static-Dynamic\n Disentanglement","summary":" Recently, dataset distillation has paved the way towards efficient machine\nlearning, especially for image datasets. However, the distillation for videos,\ncharacterized by an exclusive temporal dimension, remains an underexplored\ndomain. In this work, we provide the first systematic study of video\ndistillation and introduce a taxonomy to categorize temporal compression. 
Our\ninvestigation reveals that the temporal information is usually not well learned\nduring distillation, and the temporal dimension of synthetic data contributes\nlittle. The observations motivate our unified framework of disentangling the\ndynamic and static information in the videos. It first distills the videos into\nstill images as static memory and then compensates the dynamic and motion\ninformation with a learnable dynamic memory block. Our method achieves\nstate-of-the-art on video datasets at different scales, with a notably smaller\nmemory storage budget. Our code is available at\nhttps://github.com/yuz1wan/video_distillation.\n","authors":["Ziyu Wang","Yue Xu","Cewu Lu","Yong-Lu Li"],"pdf_url":"https://arxiv.org/pdf/2312.00362v2.pdf","comment":"CVPR 2024, project page: https://mvig-rhos.com/video-distill"},{"id":"http://arxiv.org/abs/2404.09666v1","updated":"2024-04-15T10:57:16Z","published":"2024-04-15T10:57:16Z","title":"Deformable MRI Sequence Registration for AI-based Prostate Cancer\n Diagnosis","summary":" The PI-CAI (Prostate Imaging: Cancer AI) challenge led to expert-level\ndiagnostic algorithms for clinically significant prostate cancer detection. The\nalgorithms receive biparametric MRI scans as input, which consist of\nT2-weighted and diffusion-weighted scans. These scans can be misaligned due to\nmultiple factors in the scanning process. Image registration can alleviate this\nissue by predicting the deformation between the sequences. We investigate the\neffect of image registration on the diagnostic performance of AI-based prostate\ncancer diagnosis. First, the image registration algorithm, developed in\nMeVisLab, is analyzed using a dataset with paired lesion annotations. Second,\nthe effect on diagnosis is evaluated by comparing case-level cancer diagnosis\nperformance between using the original dataset, rigidly aligned\ndiffusion-weighted scans, or deformably aligned diffusion-weighted scans. Rigid\nregistration showed no improvement. Deformable registration demonstrated a\nsubstantial improvement in lesion overlap (+10% median Dice score) and a\npositive yet non-significant improvement in diagnostic performance (+0.3%\nAUROC, p=0.18). Our investigation shows that a substantial improvement in\nlesion alignment does not directly lead to a significant improvement in\ndiagnostic performance. Qualitative analysis indicated that jointly developing\nimage registration methods and diagnostic AI algorithms could enhance\ndiagnostic accuracy and patient outcomes.\n","authors":["Alessa Hering","Sarah de Boer","Anindo Saha","Jasper J. Twilt","Derya Yakar","Maarten de Rooij","Henkjan Huisman","Joeran S. Bosma"],"pdf_url":"https://arxiv.org/pdf/2404.09666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08285v2","updated":"2024-04-15T10:50:47Z","published":"2024-04-12T07:19:16Z","title":"A Survey of Neural Network Robustness Assessment in Image Recognition","summary":" In recent years, there has been significant attention given to the robustness\nassessment of neural networks. Robustness plays a critical role in ensuring\nreliable operation of artificial intelligence (AI) systems in complex and\nuncertain environments. Deep learning's robustness problem is particularly\nsignificant, highlighted by the discovery of adversarial attacks on image\nclassification models. 
Researchers have dedicated efforts to evaluate\nrobustness in diverse perturbation conditions for image recognition tasks.\nRobustness assessment encompasses two main techniques: robustness verification/\ncertification for deliberate adversarial attacks and robustness testing for\nrandom data corruptions. In this survey, we present a detailed examination of\nboth adversarial robustness (AR) and corruption robustness (CR) in neural\nnetwork assessment. Analyzing current research papers and standards, we provide\nan extensive overview of robustness assessment in image recognition. Three\nessential aspects are analyzed: concepts, metrics, and assessment methods. We\ninvestigate the perturbation metrics and range representations used to measure\nthe degree of perturbations on images, as well as the robustness metrics\nspecifically for the robustness conditions of classification models. The\nstrengths and limitations of the existing methods are also discussed, and some\npotential directions for future research are provided.\n","authors":["Jie Wang","Jun Ai","Minyan Lu","Haoran Su","Dan Yu","Yutao Zhang","Junda Zhu","Jingyu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.08285v2.pdf","comment":"Corrected typos and grammatical errors in Section 5"},{"id":"http://arxiv.org/abs/2404.09654v1","updated":"2024-04-15T10:42:22Z","published":"2024-04-15T10:42:22Z","title":"Do LLMs Understand Visual Anomalies? Uncovering LLM Capabilities in\n Zero-shot Anomaly Detection","summary":" Large vision-language models (LVLMs) are markedly proficient in deriving\nvisual representations guided by natural language. Recent explorations have\nutilized LVLMs to tackle zero-shot visual anomaly detection (VAD) challenges by\npairing images with textual descriptions indicative of normal and abnormal\nconditions, referred to as anomaly prompts. However, existing approaches depend\non static anomaly prompts that are prone to cross-semantic ambiguity, and\nprioritize global image-level representations over crucial local pixel-level\nimage-to-text alignment that is necessary for accurate anomaly localization. In\nthis paper, we present ALFA, a training-free approach designed to address these\nchallenges via a unified model. We propose a run-time prompt adaptation\nstrategy, which first generates informative anomaly prompts to leverage the\ncapabilities of a large language model (LLM). This strategy is enhanced by a\ncontextual scoring mechanism for per-image anomaly prompt adaptation and\ncross-semantic ambiguity mitigation. We further introduce a novel fine-grained\naligner to fuse local pixel-level semantics for precise anomaly localization,\nby projecting the image-text alignment from global to local semantic spaces.\nExtensive evaluations on the challenging MVTec and VisA datasets confirm ALFA's\neffectiveness in harnessing the language potential for zero-shot VAD, achieving\nsignificant PRO improvements of 12.1% on MVTec AD and 8.9% on VisA compared to\nstate-of-the-art zero-shot VAD approaches.\n","authors":["Jiaqi Zhu","Shaofeng Cai","Fang Deng","Junran Wu"],"pdf_url":"https://arxiv.org/pdf/2404.09654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03453v2","updated":"2024-04-15T10:28:44Z","published":"2023-09-07T02:28:04Z","title":"SyncDreamer: Generating Multiview-consistent Images from a Single-view\n Image","summary":" In this paper, we present a novel diffusion model called that generates\nmultiview-consistent images from a single-view image. 
Using pretrained\nlarge-scale 2D diffusion models, recent work Zero123 demonstrates the ability\nto generate plausible novel views from a single-view image of an object.\nHowever, maintaining consistency in geometry and colors for the generated\nimages remains a challenge. To address this issue, we propose a synchronized\nmultiview diffusion model that models the joint probability distribution of\nmultiview images, enabling the generation of multiview-consistent images in a\nsingle reverse process. SyncDreamer synchronizes the intermediate states of all\nthe generated images at every step of the reverse process through a 3D-aware\nfeature attention mechanism that correlates the corresponding features across\ndifferent views. Experiments show that SyncDreamer generates images with high\nconsistency across different views, thus making it well-suited for various 3D\ngeneration tasks such as novel-view-synthesis, text-to-3D, and image-to-3D.\n","authors":["Yuan Liu","Cheng Lin","Zijiao Zeng","Xiaoxiao Long","Lingjie Liu","Taku Komura","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2309.03453v2.pdf","comment":"ICLR 2024 Spotlight. Project page:\n https://liuyuan-pal.github.io/SyncDreamer/ Code:\n https://github.com/liuyuan-pal/SyncDreamer"},{"id":"http://arxiv.org/abs/2404.09645v1","updated":"2024-04-15T10:24:32Z","published":"2024-04-15T10:24:32Z","title":"Real-world Instance-specific Image Goal Navigation for Service Robots:\n Bridging the Domain Gap with Contrastive Learning","summary":" Improving instance-specific image goal navigation (InstanceImageNav), which\nlocates the identical object in a real-world environment from a query image, is\nessential for robotic systems to assist users in finding desired objects. The\nchallenge lies in the domain gap between low-quality images observed by the\nmoving robot, characterized by motion blur and low-resolution, and high-quality\nquery images provided by the user. Such domain gaps could significantly reduce\nthe task success rate but have not been the focus of previous work. To address\nthis, we propose a novel method called Few-shot Cross-quality Instance-aware\nAdaptation (CrossIA), which employs contrastive learning with an instance\nclassifier to align features between massive low- and few high-quality images.\nThis approach effectively reduces the domain gap by bringing the latent\nrepresentations of cross-quality images closer on an instance basis.\nAdditionally, the system integrates an object image collection with a\npre-trained deblurring model to enhance the observed image quality. Our method\nfine-tunes the SimSiam model, pre-trained on ImageNet, using CrossIA. We\nevaluated our method's effectiveness through an InstanceImageNav task with 20\ndifferent types of instances, where the robot identifies the same instance in a\nreal-world environment as a high-quality query image. Our experiments showed\nthat our method improves the task success rate by up to three times compared to\nthe baseline, a conventional approach based on SuperGlue. These findings\nhighlight the potential of leveraging contrastive learning and image\nenhancement techniques to bridge the domain gap and improve object localization\nin robotic applications. 
The project website is\nhttps://emergentsystemlabstudent.github.io/DomainBridgingNav/.\n","authors":["Taichi Sakaguchi","Akira Taniguchi","Yoshinobu Hagiwara","Lotfi El Hafi","Shoichi Hasegawa","Tadahiro Taniguchi"],"pdf_url":"https://arxiv.org/pdf/2404.09645v1.pdf","comment":"See website at\n https://emergentsystemlabstudent.github.io/DomainBridgingNav/. Submitted to\n IROS2024"},{"id":"http://arxiv.org/abs/2404.09640v1","updated":"2024-04-15T10:19:39Z","published":"2024-04-15T10:19:39Z","title":"CREST: Cross-modal Resonance through Evidential Deep Learning for\n Enhanced Zero-Shot Learning","summary":" Zero-shot learning (ZSL) enables the recognition of novel classes by\nleveraging semantic knowledge transfer from known to unknown categories. This\nknowledge, typically encapsulated in attribute descriptions, aids in\nidentifying class-specific visual features, thus facilitating visual-semantic\nalignment and improving ZSL performance. However, real-world challenges such as\ndistribution imbalances and attribute co-occurrence among instances often\nhinder the discernment of local variances in images, a problem exacerbated by\nthe scarcity of fine-grained, region-specific attribute annotations. Moreover,\nthe variability in visual presentation within categories can also skew\nattribute-category associations. In response, we propose a bidirectional\ncross-modal ZSL approach CREST. It begins by extracting representations for\nattribute and visual localization and employs Evidential Deep Learning (EDL) to\nmeasure underlying epistemic uncertainty, thereby enhancing the model's\nresilience against hard negatives. CREST incorporates dual learning pathways,\nfocusing on both visual-category and attribute-category alignments, to ensure\nrobust correlation between latent and observable spaces. Moreover, we introduce\nan uncertainty-informed cross-modal fusion technique to refine visual-attribute\ninference. Extensive experiments demonstrate our model's effectiveness and\nunique explainability across multiple datasets. Our code and data are available\nat: Comments: Ongoing work; 10 pages, 2 Tables, 9 Figures; Repo is available at\nhttps://github.com/JethroJames/CREST.\n","authors":["Haojian Huang","Xiaozhen Qiao","Zhuo Chen","Haodong Chen","Bingyu Li","Zhe Sun","Mulin Chen","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.09640v1.pdf","comment":"Ongoing work; 10 pages, 2 Tables, 9 Figures; Repo is available at\n https://github.com/JethroJames/CREST"},{"id":"http://arxiv.org/abs/2404.05468v2","updated":"2024-04-15T10:13:25Z","published":"2024-04-08T12:46:39Z","title":"Mind-to-Image: Projecting Visual Mental Imagination of the Brain from\n fMRI","summary":" The reconstruction of images observed by subjects from fMRI data collected\nduring visual stimuli has made significant strides in the past decade, thanks\nto the availability of extensive fMRI datasets and advancements in generative\nmodels for image generation. However, the application of visual reconstruction\nhas remained limited. Reconstructing visual imagination presents a greater\nchallenge, with potentially revolutionary applications ranging from aiding\nindividuals with disabilities to verifying witness accounts in court. The\nprimary hurdles in this field are the absence of data collection protocols for\nvisual imagery and the lack of datasets on the subject. 
Traditionally,\nfMRI-to-image relies on data collected from subjects exposed to visual stimuli,\nwhich poses issues for generating visual imagery based on the difference of\nbrain activity between visual stimulation and visual imagery. For the first\ntime, we have compiled a substantial dataset (around 6h of scans) on visual\nimagery along with a proposed data collection protocol. We then train a\nmodified version of an fMRI-to-image model and demonstrate the feasibility of\nreconstructing images from two modes of imagination: from memory and from pure\nimagination. This marks an important step towards creating a technology that\nallow direct reconstruction of visual imagery.\n","authors":["Hugo Caselles-Dupré","Charles Mellerio","Paul Hérent","Alizée Lopez-Persem","Benoit Béranger","Mathieu Soularue","Pierre Fautrel","Gauthier Vernier","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2404.05468v2.pdf","comment":"Pre-print to be updated"},{"id":"http://arxiv.org/abs/2403.16092v2","updated":"2024-04-15T10:06:41Z","published":"2024-03-24T11:09:41Z","title":"Are NeRFs ready for autonomous driving? Towards closing the\n real-to-simulation gap","summary":" Neural Radiance Fields (NeRFs) have emerged as promising tools for advancing\nautonomous driving (AD) research, offering scalable closed-loop simulation and\ndata augmentation capabilities. However, to trust the results achieved in\nsimulation, one needs to ensure that AD systems perceive real and rendered data\nin the same way. Although the performance of rendering methods is increasing,\nmany scenarios will remain inherently challenging to reconstruct faithfully. To\nthis end, we propose a novel perspective for addressing the real-to-simulated\ndata gap. Rather than solely focusing on improving rendering fidelity, we\nexplore simple yet effective methods to enhance perception model robustness to\nNeRF artifacts without compromising performance on real data. Moreover, we\nconduct the first large-scale investigation into the real-to-simulated data gap\nin an AD setting using a state-of-the-art neural rendering technique.\nSpecifically, we evaluate object detectors and an online mapping model on real\nand simulated data, and study the effects of different fine-tuning\nstrategies.Our results show notable improvements in model robustness to\nsimulated data, even improving real-world performance in some cases. Last, we\ndelve into the correlation between the real-to-simulated gap and image\nreconstruction metrics, identifying FID and LPIPS as strong indicators. See\nhttps://research.zenseact.com/publications/closing-real2sim-gap for our project\npage.\n","authors":["Carl Lindström","Georg Hess","Adam Lilja","Maryam Fatemi","Lars Hammarstrand","Christoffer Petersson","Lennart Svensson"],"pdf_url":"https://arxiv.org/pdf/2403.16092v2.pdf","comment":"Accepted at Workshop on Autonomous Driving, CVPR 2024"},{"id":"http://arxiv.org/abs/2312.02244v3","updated":"2024-04-15T10:06:19Z","published":"2023-12-04T12:30:07Z","title":"Geometrically-driven Aggregation for Zero-shot 3D Point Cloud\n Understanding","summary":" Zero-shot 3D point cloud understanding can be achieved via 2D Vision-Language\nModels (VLMs). Existing strategies directly map Vision-Language Models from 2D\npixels of rendered or captured views to 3D points, overlooking the inherent and\nexpressible point cloud geometric structure. Geometrically similar or close\nregions can be exploited for bolstering point cloud understanding as they are\nlikely to share semantic information. 
To this end, we introduce the first\ntraining-free aggregation technique that leverages the point cloud's 3D\ngeometric structure to improve the quality of the transferred Vision-Language\nModels. Our approach operates iteratively, performing local-to-global\naggregation based on geometric and semantic point-level reasoning. We benchmark\nour approach on three downstream tasks, including classification, part\nsegmentation, and semantic segmentation, with a variety of datasets\nrepresenting both synthetic/real-world, and indoor/outdoor scenarios. Our\napproach achieves new state-of-the-art results in all benchmarks. Our approach\noperates iteratively, performing local-to-global aggregation based on geometric\nand semantic point-level reasoning. Code and dataset are available at\nhttps://luigiriz.github.io/geoze-website/\n","authors":["Guofeng Mei","Luigi Riz","Yiming Wang","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2312.02244v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09633v1","updated":"2024-04-15T10:05:36Z","published":"2024-04-15T10:05:36Z","title":"In-Context Translation: Towards Unifying Image Recognition, Processing,\n and Generation","summary":" We propose In-Context Translation (ICT), a general learning framework to\nunify visual recognition (e.g., semantic segmentation), low-level image\nprocessing (e.g., denoising), and conditional image generation (e.g.,\nedge-to-image synthesis). Thanks to unification, ICT significantly reduces the\ninherent inductive bias that comes with designing models for specific tasks,\nand it maximizes mutual enhancement across similar tasks. However, the\nunification across a large number of tasks is non-trivial due to various data\nformats and training pipelines. To this end, ICT introduces two designs.\nFirstly, it standardizes input-output data of different tasks into RGB image\npairs, e.g., semantic segmentation data pairs an RGB image with its\nsegmentation mask in the same RGB format. This turns different tasks into a\ngeneral translation task between two RGB images. Secondly, it standardizes the\ntraining of different tasks into a general in-context learning, where\n\"in-context\" means the input comprises an example input-output pair of the\ntarget task and a query image. The learning objective is to generate the\n\"missing\" data paired with the query. The implicit translation process is thus\nbetween the query and the generated image. In experiments, ICT unifies ten\nvision tasks and showcases impressive performance on their respective\nbenchmarks. Notably, compared to its competitors, e.g., Painter and\nPromptDiffusion, ICT trained on only 4 RTX 3090 GPUs is shown to be more\nefficient and less costly in training.\n","authors":["Han Xue","Qianru Sun","Li Song","Wenjun Zhang","Zhiwu Huang"],"pdf_url":"https://arxiv.org/pdf/2404.09633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09632v1","updated":"2024-04-15T10:04:15Z","published":"2024-04-15T10:04:15Z","title":"Bridging Vision and Language Spaces with Assignment Prediction","summary":" This paper introduces VLAP, a novel approach that bridges pretrained vision\nmodels and large language models (LLMs) to make frozen LLMs understand the\nvisual world. VLAP transforms the embedding space of pretrained vision models\ninto the LLMs' word embedding space using a single linear layer for efficient\nand general-purpose visual and language understanding. Specifically, we harness\nwell-established word embeddings to bridge two modality embedding spaces. 
The\nvisual and text representations are simultaneously assigned to a set of word\nembeddings within pretrained LLMs by formulating the assigning procedure as an\noptimal transport problem. We predict the assignment of one modality from the\nrepresentation of another modality data, enforcing consistent assignments for\npaired multimodal data. This allows vision and language representations to\ncontain the same information, grounding the frozen LLMs' word embedding space\nin visual data. Moreover, a robust semantic taxonomy of LLMs can be preserved\nwith visual data since the LLMs interpret and reason linguistic information\nfrom correlations between word embeddings. Experimental results show that VLAP\nachieves substantial improvements over the previous linear transformation-based\napproaches across a range of vision-language tasks, including image captioning,\nvisual question answering, and cross-modal retrieval. We also demonstrate the\nlearned visual representations hold a semantic taxonomy of LLMs, making visual\nsemantic arithmetic possible.\n","authors":["Jungin Park","Jiyoung Lee","Kwanghoon Sohn"],"pdf_url":"https://arxiv.org/pdf/2404.09632v1.pdf","comment":"ICLR 2024 Camera-ready"},{"id":"http://arxiv.org/abs/2404.09624v1","updated":"2024-04-15T09:56:20Z","published":"2024-04-15T09:56:20Z","title":"AesExpert: Towards Multi-modality Foundation Model for Image Aesthetics\n Perception","summary":" The highly abstract nature of image aesthetics perception (IAP) poses\nsignificant challenge for current multimodal large language models (MLLMs). The\nlack of human-annotated multi-modality aesthetic data further exacerbates this\ndilemma, resulting in MLLMs falling short of aesthetics perception\ncapabilities. To address the above challenge, we first introduce a\ncomprehensively annotated Aesthetic Multi-Modality Instruction Tuning (AesMMIT)\ndataset, which serves as the footstone for building multi-modality aesthetics\nfoundation models. Specifically, to align MLLMs with human aesthetics\nperception, we construct a corpus-rich aesthetic critique database with 21,904\ndiverse-sourced images and 88K human natural language feedbacks, which are\ncollected via progressive questions, ranging from coarse-grained aesthetic\ngrades to fine-grained aesthetic descriptions. To ensure that MLLMs can handle\ndiverse queries, we further prompt GPT to refine the aesthetic critiques and\nassemble the large-scale aesthetic instruction tuning dataset, i.e. AesMMIT,\nwhich consists of 409K multi-typed instructions to activate stronger aesthetic\ncapabilities. Based on the AesMMIT database, we fine-tune the open-sourced\ngeneral foundation models, achieving multi-modality Aesthetic Expert models,\ndubbed AesExpert. Extensive experiments demonstrate that the proposed AesExpert\nmodels deliver significantly better aesthetic perception performances than the\nstate-of-the-art MLLMs, including the most advanced GPT-4V and\nGemini-Pro-Vision. 
Source data will be available at\nhttps://github.com/yipoh/AesExpert.\n","authors":["Yipo Huang","Xiangfei Sheng","Zhichao Yang","Quan Yuan","Zhichao Duan","Pengfei Chen","Leida Li","Weisi Lin","Guangming Shi"],"pdf_url":"https://arxiv.org/pdf/2404.09624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03778v3","updated":"2024-04-15T09:55:50Z","published":"2024-04-04T19:50:57Z","title":"Flattening the Parent Bias: Hierarchical Semantic Segmentation in the\n Poincaré Ball","summary":" Hierarchy is a natural representation of semantic taxonomies, including the\nones routinely used in image segmentation. Indeed, recent work on semantic\nsegmentation reports improved accuracy from supervised training leveraging\nhierarchical label structures. Encouraged by these results, we revisit the\nfundamental assumptions behind that work. We postulate and then empirically\nverify that the reasons for the observed improvement in segmentation accuracy\nmay be entirely unrelated to the use of the semantic hierarchy. To demonstrate\nthis, we design a range of cross-domain experiments with a representative\nhierarchical approach. We find that on the new testing domains, a flat\n(non-hierarchical) segmentation network, in which the parents are inferred from\nthe children, has superior segmentation accuracy to the hierarchical approach\nacross the board. Complementing these findings and inspired by the intrinsic\nproperties of hyperbolic spaces, we study a more principled approach to\nhierarchical segmentation using the Poincar\\'e ball model. The hyperbolic\nrepresentation largely outperforms the previous (Euclidean) hierarchical\napproach as well and is on par with our flat Euclidean baseline in terms of\nsegmentation accuracy. However, it additionally exhibits surprisingly strong\ncalibration quality of the parent nodes in the semantic hierarchy, especially\non the more challenging domains. Our combined analysis suggests that the\nestablished practice of hierarchical segmentation may be limited to in-domain\nsettings, whereas flat classifiers generalize substantially better, especially\nif they are modeled in the hyperbolic space.\n","authors":["Simon Weber","Barış Zöngür","Nikita Araslanov","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2404.03778v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08339v2","updated":"2024-04-15T09:51:15Z","published":"2023-10-12T13:57:32Z","title":"TTK is Getting MPI-Ready","summary":" This system paper documents the technical foundations for the extension of\nthe Topology ToolKit (TTK) to distributed-memory parallelism with the Message\nPassing Interface (MPI). While several recent papers introduced topology-based\napproaches for distributed-memory environments, these were reporting\nexperiments obtained with tailored, mono-algorithm implementations. In\ncontrast, we describe in this paper a versatile approach (supporting both\ntriangulated domains and regular grids) for the support of topological analysis\npipelines, i.e. a sequence of topological algorithms interacting together.\nWhile developing this extension, we faced several algorithmic and software\nengineering challenges, which we document in this paper. We describe an MPI\nextension of TTK's data structure for triangulation representation and\ntraversal, a central component to the global performance and generality of\nTTK's topological implementations. We also introduce an intermediate interface\nbetween TTK and MPI, both at the global pipeline level, and at the fine-grain\nalgorithmic level. 
We provide a taxonomy for the distributed-memory topological\nalgorithms supported by TTK, depending on their communication needs and provide\nexamples of hybrid MPI+thread parallelizations. Performance analyses show that\nparallel efficiencies range from 20% to 80% (depending on the algorithms), and\nthat the MPI-specific preconditioning introduced by our framework induces a\nnegligible computation time overhead. We illustrate the new distributed-memory\ncapabilities of TTK with an example of advanced analysis pipeline, combining\nmultiple algorithms, run on the largest publicly available dataset we have\nfound (120 billion vertices) on a cluster with 64 nodes (for a total of 1536\ncores). Finally, we provide a roadmap for the completion of TTK's MPI\nextension, along with generic recommendations for each algorithm communication\ncategory.\n","authors":["Eve Le Guillou","Michael Will","Pierre Guillou","Jonas Lukasczyk","Pierre Fortin","Christoph Garth","Julien Tierny"],"pdf_url":"https://arxiv.org/pdf/2310.08339v2.pdf","comment":"18 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.09619v1","updated":"2024-04-15T09:47:48Z","published":"2024-04-15T09:47:48Z","title":"UNIAA: A Unified Multi-modal Image Aesthetic Assessment Baseline and\n Benchmark","summary":" As an alternative to expensive expert evaluation, Image Aesthetic Assessment\n(IAA) stands out as a crucial task in computer vision. However, traditional IAA\nmethods are typically constrained to a single data source or task, restricting\nthe universality and broader application. In this work, to better align with\nhuman aesthetics, we propose a Unified Multi-modal Image Aesthetic Assessment\n(UNIAA) framework, including a Multi-modal Large Language Model (MLLM) named\nUNIAA-LLaVA and a comprehensive benchmark named UNIAA-Bench. We choose MLLMs\nwith both visual perception and language ability for IAA and establish a\nlow-cost paradigm for transforming the existing datasets into unified and\nhigh-quality visual instruction tuning data, from which the UNIAA-LLaVA is\ntrained. To further evaluate the IAA capability of MLLMs, we construct the\nUNIAA-Bench, which consists of three aesthetic levels: Perception, Description,\nand Assessment. Extensive experiments validate the effectiveness and\nrationality of UNIAA. UNIAA-LLaVA achieves competitive performance on all\nlevels of UNIAA-Bench, compared with existing MLLMs. Specifically, our model\nperforms better than GPT-4V in aesthetic perception and even approaches the\njunior-level human. We find MLLMs have great potential in IAA, yet there\nremains plenty of room for further improvement. The UNIAA-LLaVA and UNIAA-Bench\nwill be released.\n","authors":["Zhaokun Zhou","Qiulin Wang","Bin Lin","Yiwei Su","Rui Chen","Xin Tao","Amin Zheng","Li Yuan","Pengfei Wan","Di Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.09619v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09616v1","updated":"2024-04-15T09:40:44Z","published":"2024-04-15T09:40:44Z","title":"A Review and Efficient Implementation of Scene Graph Generation Metrics","summary":" Scene graph generation has emerged as a prominent research field in computer\nvision, witnessing significant advancements in the recent years. However,\ndespite these strides, precise and thorough definitions for the metrics used to\nevaluate scene graph generation models are lacking. In this paper, we address\nthis gap in the literature by providing a review and precise definition of\ncommonly used metrics in scene graph generation. 
Our comprehensive examination\nclarifies the underlying principles of these metrics and can serve as a\nreference or introduction to scene graph metrics.\n Furthermore, to facilitate the usage of these metrics, we introduce a\nstandalone Python package called SGBench that efficiently implements all\ndefined metrics, ensuring their accessibility to the research community.\nAdditionally, we present a scene graph benchmarking web service, that enables\nresearchers to compare scene graph generation methods and increase visibility\nof new methods in a central place.\n All of our code can be found at https://lorjul.github.io/sgbench/.\n","authors":["Julian Lorenz","Robin Schön","Katja Ludwig","Rainer Lienhart"],"pdf_url":"https://arxiv.org/pdf/2404.09616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11369v2","updated":"2024-04-15T09:33:21Z","published":"2023-06-20T08:19:51Z","title":"CrossKD: Cross-Head Knowledge Distillation for Object Detection","summary":" Knowledge Distillation (KD) has been validated as an effective model\ncompression technique for learning compact object detectors. Existing\nstate-of-the-art KD methods for object detection are mostly based on feature\nimitation. In this paper, we present a general and effective prediction\nmimicking distillation scheme, called CrossKD, which delivers the intermediate\nfeatures of the student's detection head to the teacher's detection head. The\nresulting cross-head predictions are then forced to mimic the teacher's\npredictions. This manner relieves the student's head from receiving\ncontradictory supervision signals from the annotations and the teacher's\npredictions, greatly improving the student's detection performance. Moreover,\nas mimicking the teacher's predictions is the target of KD, CrossKD offers more\ntask-oriented information in contrast with feature imitation. On MS COCO, with\nonly prediction mimicking losses applied, our CrossKD boosts the average\nprecision of GFL ResNet-50 with 1x training schedule from 40.2 to 43.7,\noutperforming all existing KD methods. In addition, our method also works well\nwhen distilling detectors with heterogeneous backbones. Code is available at\nhttps://github.com/jbwang1997/CrossKD.\n","authors":["Jiabao Wang","Yuming Chen","Zhaohui Zheng","Xiang Li","Ming-Ming Cheng","Qibin Hou"],"pdf_url":"https://arxiv.org/pdf/2306.11369v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17648v3","updated":"2024-04-15T09:31:17Z","published":"2023-05-28T06:44:33Z","title":"Z-GMOT: Zero-shot Generic Multiple Object Tracking","summary":" Despite recent significant progress, Multi-Object Tracking (MOT) faces\nlimitations such as reliance on prior knowledge and predefined categories and\nstruggles with unseen objects. To address these issues, Generic Multiple Object\nTracking (GMOT) has emerged as an alternative approach, requiring less prior\ninformation. However, current GMOT methods often rely on initial bounding boxes\nand struggle to handle variations in factors such as viewpoint, lighting,\nocclusion, and scale, among others. Our contributions commence with the\nintroduction of the \\textit{Referring GMOT dataset} a collection of videos,\neach accompanied by detailed textual descriptions of their attributes.\nSubsequently, we propose $\\mathtt{Z-GMOT}$, a cutting-edge tracking solution\ncapable of tracking objects from \\textit{never-seen categories} without the\nneed of initial bounding boxes or predefined categories. 
Within our\n$\\mathtt{Z-GMOT}$ framework, we introduce two novel components: (i)\n$\\mathtt{iGLIP}$, an improved Grounded language-image pretraining, for\naccurately detecting unseen objects with specific characteristics. (ii)\n$\\mathtt{MA-SORT}$, a novel object association approach that adeptly integrates\nmotion and appearance-based matching strategies to tackle the complex task of\ntracking objects with high similarity. Our contributions are benchmarked\nthrough extensive experiments conducted on the Referring GMOT dataset for GMOT\ntask. Additionally, to assess the generalizability of the proposed\n$\\mathtt{Z-GMOT}$, we conduct ablation studies on the DanceTrack and MOT20\ndatasets for the MOT task. Our dataset, code, and models are released at:\nhttps://fsoft-aic.github.io/Z-GMOT.\n","authors":["Kim Hoang Tran","Anh Duy Le Dinh","Tien Phat Nguyen","Thinh Phan","Pha Nguyen","Khoa Luu","Donald Adjeroh","Gianfranco Doretto","Ngan Hoang Le"],"pdf_url":"https://arxiv.org/pdf/2305.17648v3.pdf","comment":"Accepted to NAACL 2024"},{"id":"http://arxiv.org/abs/2307.03992v4","updated":"2024-04-15T09:19:01Z","published":"2023-07-08T14:59:41Z","title":"Stimulating the Diffusion Model for Image Denoising via Adaptive\n Embedding and Ensembling","summary":" Image denoising is a fundamental problem in computational photography, where\nachieving high perception with low distortion is highly demanding. Current\nmethods either struggle with perceptual quality or suffer from significant\ndistortion. Recently, the emerging diffusion model has achieved\nstate-of-the-art performance in various tasks and demonstrates great potential\nfor image denoising. However, stimulating diffusion models for image denoising\nis not straightforward and requires solving several critical problems. For one\nthing, the input inconsistency hinders the connection between diffusion models\nand image denoising. For another, the content inconsistency between the\ngenerated image and the desired denoised image introduces distortion. To tackle\nthese problems, we present a novel strategy called the Diffusion Model for\nImage Denoising (DMID) by understanding and rethinking the diffusion model from\na denoising perspective. Our DMID strategy includes an adaptive embedding\nmethod that embeds the noisy image into a pre-trained unconditional diffusion\nmodel and an adaptive ensembling method that reduces distortion in the denoised\nimage. Our DMID strategy achieves state-of-the-art performance on both\ndistortion-based and perception-based metrics, for both Gaussian and real-world\nimage denoising.The code is available at https://github.com/Li-Tong-621/DMID.\n","authors":["Tong Li","Hansen Feng","Lizhi Wang","Zhiwei Xiong","Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2307.03992v4.pdf","comment":"18 pages,15 figures"},{"id":"http://arxiv.org/abs/2404.09601v1","updated":"2024-04-15T09:16:49Z","published":"2024-04-15T09:16:49Z","title":"Reactive Model Correction: Mitigating Harm to Task-Relevant Features via\n Conditional Bias Suppression","summary":" Deep Neural Networks are prone to learning and relying on spurious\ncorrelations in the training data, which, for high-risk applications, can have\nfatal consequences. Various approaches to suppress model reliance on harmful\nfeatures have been proposed that can be applied post-hoc without additional\ntraining. Whereas those methods can be applied with efficiency, they also tend\nto harm model performance by globally shifting the distribution of latent\nfeatures. 
To mitigate unintended overcorrection of model behavior, we propose a\nreactive approach conditioned on model-derived knowledge and eXplainable\nArtificial Intelligence (XAI) insights. While the reactive approach can be\napplied to many post-hoc methods, we demonstrate the incorporation of\nreactivity in particular for P-ClArC (Projective Class Artifact Compensation),\nintroducing a new method called R-ClArC (Reactive Class Artifact Compensation).\nThrough rigorous experiments in controlled settings (FunnyBirds) and with a\nreal-world dataset (ISIC2019), we show that introducing reactivity can minimize\nthe detrimental effect of the applied correction while simultaneously ensuring\nlow reliance on spurious features.\n","authors":["Dilyara Bareeva","Maximilian Dreyer","Frederik Pahde","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2404.09601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11821v3","updated":"2024-04-15T09:10:56Z","published":"2024-03-18T14:24:20Z","title":"Evaluating Text-to-Image Synthesis: Survey and Taxonomy of Image Quality\n Metrics","summary":" Recent advances in text-to-image synthesis enabled through a combination of\nlanguage and vision foundation models have led to a proliferation of the tools\navailable and an increased attention to the field. When conducting\ntext-to-image synthesis, a central goal is to ensure that the content between\ntext and image is aligned. As such, there exist numerous evaluation metrics\nthat aim to mimic human judgement. However, it is often unclear which metric to\nuse for evaluating text-to-image synthesis systems as their evaluation is\nhighly nuanced. In this work, we provide a comprehensive overview of existing\ntext-to-image evaluation metrics. Based on our findings, we propose a new\ntaxonomy for categorizing these metrics. Our taxonomy is grounded in the\nassumption that there are two main quality criteria, namely compositionality\nand generality, which ideally map to human preferences. Ultimately, we derive\nguidelines for practitioners conducting text-to-image evaluation, discuss open\nchallenges of evaluation mechanisms, and surface limitations of current\nmetrics.\n","authors":["Sebastian Hartwig","Dominik Engel","Leon Sick","Hannah Kniesel","Tristan Payer","Poonam Poonam","Michael Glöckler","Alex Bäuerle","Timo Ropinski"],"pdf_url":"https://arxiv.org/pdf/2403.11821v3.pdf","comment":"preprint, 20 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2404.09591v1","updated":"2024-04-15T09:01:47Z","published":"2024-04-15T09:01:47Z","title":"3D Gaussian Splatting as Markov Chain Monte Carlo","summary":" While 3D Gaussian Splatting has recently become popular for neural rendering,\ncurrent methods rely on carefully engineered cloning and splitting strategies\nfor placing Gaussians, which does not always generalize and may lead to\npoor-quality renderings. In addition, for real-world scenes, they rely on a\ngood initial point cloud to perform well. In this work, we rethink 3D Gaussians\nas random samples drawn from an underlying probability distribution describing\nthe physical representation of the scene -- in other words, Markov Chain Monte\nCarlo (MCMC) samples. Under this view, we show that the 3D Gaussian updates are\nstrikingly similar to a Stochastic Langevin Gradient Descent (SGLD) update. As\nwith MCMC, samples are nothing but past visit locations, adding new Gaussians\nunder our framework can simply be realized without heuristics as placing\nGaussians at existing Gaussian locations. 
To encourage using fewer Gaussians\nfor efficiency, we introduce an L1-regularizer on the Gaussians. On various\nstandard evaluation scenes, we show that our method provides improved rendering\nquality, easy control over the number of Gaussians, and robustness to\ninitialization.\n","authors":["Shakiba Kheradmand","Daniel Rebain","Gopal Sharma","Weiwei Sun","Jeff Tseng","Hossam Isack","Abhishek Kar","Andrea Tagliasacchi","Kwang Moo Yi"],"pdf_url":"https://arxiv.org/pdf/2404.09591v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09586v1","updated":"2024-04-15T08:54:33Z","published":"2024-04-15T08:54:33Z","title":"Mitigating the Curse of Dimensionality for Certified Robustness via Dual\n Randomized Smoothing","summary":" Randomized Smoothing (RS) has been proven a promising method for endowing an\narbitrary image classifier with certified robustness. However, the substantial\nuncertainty inherent in the high-dimensional isotropic Gaussian noise imposes\nthe curse of dimensionality on RS. Specifically, the upper bound of ${\\ell_2}$\ncertified robustness radius provided by RS exhibits a diminishing trend with\nthe expansion of the input dimension $d$, proportionally decreasing at a rate\nof $1/\\sqrt{d}$. This paper explores the feasibility of providing ${\\ell_2}$\ncertified robustness for high-dimensional input through the utilization of dual\nsmoothing in the lower-dimensional space. The proposed Dual Randomized\nSmoothing (DRS) down-samples the input image into two sub-images and smooths\nthe two sub-images in lower dimensions. Theoretically, we prove that DRS\nguarantees a tight ${\\ell_2}$ certified robustness radius for the original\ninput and reveal that DRS attains a superior upper bound on the ${\\ell_2}$\nrobustness radius, which decreases proportionally at a rate of $(1/\\sqrt m +\n1/\\sqrt n )$ with $m+n=d$. Extensive experiments demonstrate the\ngeneralizability and effectiveness of DRS, which exhibits a notable capability\nto integrate with established methodologies, yielding substantial improvements\nin both accuracy and ${\\ell_2}$ certified robustness baselines of RS on the\nCIFAR-10 and ImageNet datasets. Code is available at\nhttps://github.com/xiasong0501/DRS.\n","authors":["Song Xia","Yu Yi","Xudong Jiang","Henghui Ding"],"pdf_url":"https://arxiv.org/pdf/2404.09586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09585v1","updated":"2024-04-15T08:52:51Z","published":"2024-04-15T08:52:51Z","title":"Pseudo-label Learning with Calibrated Confidence Using an Energy-based\n Model","summary":" In pseudo-labeling (PL), which is a type of semi-supervised learning,\npseudo-labels are assigned based on the confidence scores provided by the\nclassifier; therefore, accurate confidence is important for successful PL. In\nthis study, we propose a PL algorithm based on an energy-based model (EBM),\nwhich is referred to as the energy-based PL (EBPL). In EBPL, a neural\nnetwork-based classifier and an EBM are jointly trained by sharing their\nfeature extraction parts. This approach enables the model to learn both the\nclass decision boundary and input data distribution, enhancing confidence\ncalibration during network training. 
The experimental results demonstrate that\nEBPL outperforms the existing PL method in semi-supervised image classification\ntasks, with superior confidence calibration error and recognition accuracy.\n","authors":["Masahito Toba","Seiichi Uchida","Hideaki Hayashi"],"pdf_url":"https://arxiv.org/pdf/2404.09585v1.pdf","comment":"8 pages, 8 figures, Accepted at IJCNN 2024"},{"id":"http://arxiv.org/abs/2311.17955v2","updated":"2024-04-15T08:43:58Z","published":"2023-11-29T08:11:20Z","title":"PEAN: A Diffusion-Based Prior-Enhanced Attention Network for Scene Text\n Image Super-Resolution","summary":" Scene text image super-resolution (STISR) aims at simultaneously increasing\nthe resolution and readability of low-resolution scene text images, thus\nboosting the performance of the downstream recognition task. Two factors in\nscene text images, visual structure and semantic information, affect the\nrecognition performance significantly. To mitigate the effects from these\nfactors, this paper proposes a Prior-Enhanced Attention Network (PEAN).\nSpecifically, an attention-based modulation module is leveraged to understand\nscene text images by neatly perceiving the local and global dependence of\nimages, despite the shape of the text. Meanwhile, a diffusion-based module is\ndeveloped to enhance the text prior, hence offering better guidance for the SR\nnetwork to generate SR images with higher semantic accuracy. Additionally, a\nmulti-task learning paradigm is employed to optimize the network, enabling the\nmodel to generate legible SR images. As a result, PEAN establishes new SOTA\nresults on the TextZoom benchmark. Experiments are also conducted to analyze\nthe importance of the enhanced text prior as a means of improving the\nperformance of the SR network. Code will be made available at\nhttps://github.com/jdfxzzy/PEAN.\n","authors":["Zuoyan Zhao","Hui Xue","Pengfei Fang","Shipeng Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.17955v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.07662v3","updated":"2024-04-15T08:37:57Z","published":"2023-03-14T07:07:34Z","title":"Do More With What You Have: Transferring Depth-Scale from Labeled to\n Unlabeled Domains","summary":" Transferring the absolute depth prediction capabilities of an estimator to a\nnew domain is a task with significant real-world applications. This task is\nspecifically challenging when images from the new domain are collected without\nground-truth depth measurements, and possibly with sensors of different\nintrinsics. To overcome such limitations, a recent zero-shot solution was\ntrained on an extensive training dataset and encoded the various camera\nintrinsics. Other solutions generated synthetic data with depth labels that\nmatched the intrinsics of the new target data to enable depth-scale transfer\nbetween the domains.\n In this work we present an alternative solution that can utilize any existing\nsynthetic or real dataset, that has a small number of images annotated with\nground truth depth labels. Specifically, we show that self-supervised depth\nestimators result in up-to-scale predictions that are linearly correlated to\ntheir absolute depth values across the domain, a property that we model in this\nwork using a single scalar. In addition, aligning the field-of-view of two\ndatasets prior to training, results in a common linear relationship for both\ndomains. 
We use this observed property to transfer the depth-scale from source\ndatasets that have absolute depth labels to new target datasets that lack these\nmeasurements, enabling absolute depth predictions in the target domain.\n The suggested method was successfully demonstrated on the KITTI, DDAD and\nnuScenes datasets, while using other existing real or synthetic source\ndatasets, that have a different field-of-view, other image style or structural\ncontent, achieving comparable or better accuracy than other existing methods\nthat do not use target ground-truth depths.\n","authors":["Alexandra Dana","Nadav Carmel","Amit Shomer","Ofer Manela","Tomer Peleg"],"pdf_url":"https://arxiv.org/pdf/2303.07662v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09571v1","updated":"2024-04-15T08:32:41Z","published":"2024-04-15T08:32:41Z","title":"MTKD: Multi-Teacher Knowledge Distillation for Image Super-Resolution","summary":" Knowledge distillation (KD) has emerged as a promising technique in deep\nlearning, typically employed to enhance a compact student network through\nlearning from their high-performance but more complex teacher variant. When\napplied in the context of image super-resolution, most KD approaches are\nmodified versions of methods developed for other computer vision tasks, which\nare based on training strategies with a single teacher and simple loss\nfunctions. In this paper, we propose a novel Multi-Teacher Knowledge\nDistillation (MTKD) framework specifically for image super-resolution. It\nexploits the advantages of multiple teachers by combining and enhancing the\noutputs of these teacher models, which then guides the learning process of the\ncompact student network. To achieve more effective learning performance, we\nhave also developed a new wavelet-based loss function for MTKD, which can\nbetter optimize the training process by observing differences in both the\nspatial and frequency domains. We fully evaluate the effectiveness of the\nproposed method by comparing it to five commonly used KD methods for image\nsuper-resolution based on three popular network architectures. The results show\nthat the proposed MTKD method achieves evident improvements in super-resolution\nperformance, up to 0.46dB (based on PSNR), over state-of-the-art KD approaches\nacross different network structures. The source code of MTKD will be made\navailable here for public evaluation.\n","authors":["Yuxuan Jiang","Chen Feng","Fan Zhang","David Bull"],"pdf_url":"https://arxiv.org/pdf/2404.09571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09570v1","updated":"2024-04-15T08:32:18Z","published":"2024-04-15T08:32:18Z","title":"The revenge of BiSeNet: Efficient Multi-Task Image Segmentation","summary":" Recent advancements in image segmentation have focused on enhancing the\nefficiency of the models to meet the demands of real-time applications,\nespecially on edge devices. However, existing research has primarily\nconcentrated on single-task settings, especially on semantic segmentation,\nleading to redundant efforts and specialized architectures for different tasks.\nTo address this limitation, we propose a novel architecture for efficient\nmulti-task image segmentation, capable of handling various segmentation tasks\nwithout sacrificing efficiency or accuracy. We introduce BiSeNetFormer, that\nleverages the efficiency of two-stream semantic segmentation architectures and\nit extends them into a mask classification framework. 
Our approach maintains\nthe efficient spatial and context paths to capture detailed and semantic\ninformation, respectively, while leveraging an efficient transformed-based\nsegmentation head that computes the binary masks and class probabilities. By\nseamlessly supporting multiple tasks, namely semantic and panoptic\nsegmentation, BiSeNetFormer offers a versatile solution for multi-task\nsegmentation. We evaluate our approach on popular datasets, Cityscapes and\nADE20K, demonstrating impressive inference speeds while maintaining competitive\naccuracy compared to state-of-the-art architectures. Our results indicate that\nBiSeNetFormer represents a significant advancement towards fast, efficient, and\nmulti-task segmentation networks, bridging the gap between model efficiency and\ntask adaptability.\n","authors":["Gabriele Rosi","Claudia Cuttano","Niccolò Cavagnero","Giuseppe Averta","Fabio Cermelli"],"pdf_url":"https://arxiv.org/pdf/2404.09570v1.pdf","comment":"Accepted to ECV workshop at CVPR2024"},{"id":"http://arxiv.org/abs/2212.14855v3","updated":"2024-04-15T08:24:42Z","published":"2022-12-30T18:04:25Z","title":"Disentangled Explanations of Neural Network Predictions by Finding\n Relevant Subspaces","summary":" Explainable AI aims to overcome the black-box nature of complex ML models\nlike neural networks by generating explanations for their predictions.\nExplanations often take the form of a heatmap identifying input features (e.g.\npixels) that are relevant to the model's decision. These explanations, however,\nentangle the potentially multiple factors that enter into the overall complex\ndecision strategy. We propose to disentangle explanations by extracting at some\nintermediate layer of a neural network, subspaces that capture the multiple and\ndistinct activation patterns (e.g. visual concepts) that are relevant to the\nprediction. To automatically extract these subspaces, we propose two new\nanalyses, extending principles found in PCA or ICA to explanations. These novel\nanalyses, which we call principal relevant component analysis (PRCA) and\ndisentangled relevant subspace analysis (DRSA), maximize relevance instead of\ne.g. variance or kurtosis. This allows for a much stronger focus of the\nanalysis on what the ML model actually uses for predicting, ignoring\nactivations or concepts to which the model is invariant. Our approach is\ngeneral enough to work alongside common attribution techniques such as Shapley\nValue, Integrated Gradients, or LRP. Our proposed methods show to be\npractically useful and compare favorably to the state of the art as\ndemonstrated on benchmarks and three use cases.\n","authors":["Pattarawat Chormai","Jan Herrmann","Klaus-Robert Müller","Grégoire Montavon"],"pdf_url":"https://arxiv.org/pdf/2212.14855v3.pdf","comment":"17 pages + supplement"},{"id":"http://arxiv.org/abs/2303.14017v3","updated":"2024-04-15T08:22:49Z","published":"2023-03-24T14:18:40Z","title":"CF-Font: Content Fusion for Few-shot Font Generation","summary":" Content and style disentanglement is an effective way to achieve few-shot\nfont generation. It allows to transfer the style of the font image in a source\ndomain to the style defined with a few reference images in a target domain.\nHowever, the content feature extracted using a representative font might not be\noptimal. 
In light of this, we propose a content fusion module (CFM) to project\nthe content feature into a linear space defined by the content features of\nbasis fonts, which can take the variation of content features caused by\ndifferent fonts into consideration. Our method also allows to optimize the\nstyle representation vector of reference images through a lightweight iterative\nstyle-vector refinement (ISR) strategy. Moreover, we treat the 1D projection of\na character image as a probability distribution and leverage the distance\nbetween two distributions as the reconstruction loss (namely projected\ncharacter loss, PCL). Compared to L2 or L1 reconstruction loss, the\ndistribution distance pays more attention to the global shape of characters. We\nhave evaluated our method on a dataset of 300 fonts with 6.5k characters each.\nExperimental results verify that our method outperforms existing\nstate-of-the-art few-shot font generation methods by a large margin. The source\ncode can be found at https://github.com/wangchi95/CF-Font.\n","authors":["Chi Wang","Min Zhou","Tiezheng Ge","Yuning Jiang","Hujun Bao","Weiwei Xu"],"pdf_url":"https://arxiv.org/pdf/2303.14017v3.pdf","comment":"Accepted by CVPR 2023"},{"id":"http://arxiv.org/abs/2404.09556v1","updated":"2024-04-15T08:19:08Z","published":"2024-04-15T08:19:08Z","title":"nnU-Net Revisited: A Call for Rigorous Validation in 3D Medical Image\n Segmentation","summary":" The release of nnU-Net marked a paradigm shift in 3D medical image\nsegmentation, demonstrating that a properly configured U-Net architecture could\nstill achieve state-of-the-art results. Despite this, the pursuit of novel\narchitectures, and the respective claims of superior performance over the U-Net\nbaseline, continued. In this study, we demonstrate that many of these recent\nclaims fail to hold up when scrutinized for common validation shortcomings,\nsuch as the use of inadequate baselines, insufficient datasets, and neglected\ncomputational resources. By meticulously avoiding these pitfalls, we conduct a\nthorough and comprehensive benchmarking of current segmentation methods\nincluding CNN-based, Transformer-based, and Mamba-based approaches. In contrast\nto current beliefs, we find that the recipe for state-of-the-art performance is\n1) employing CNN-based U-Net models, including ResNet and ConvNeXt variants, 2)\nusing the nnU-Net framework, and 3) scaling models to modern hardware\nresources. These results indicate an ongoing innovation bias towards novel\narchitectures in the field and underscore the need for more stringent\nvalidation standards in the quest for scientific progress.\n","authors":["Fabian Isensee","Tassilo Wald","Constantin Ulrich","Michael Baumgartner","Saikat Roy","Klaus Maier-Hein","Paul F. Jaeger"],"pdf_url":"https://arxiv.org/pdf/2404.09556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09555v1","updated":"2024-04-15T08:18:38Z","published":"2024-04-15T08:18:38Z","title":"AI-KD: Towards Alignment Invariant Face Image Quality Assessment Using\n Knowledge Distillation","summary":" Face Image Quality Assessment (FIQA) techniques have seen steady improvements\nover recent years, but their performance still deteriorates if the input face\nsamples are not properly aligned. This alignment sensitivity comes from the\nfact that most FIQA techniques are trained or designed using a specific face\nalignment procedure. If the alignment technique changes, the performance of\nmost existing FIQA techniques quickly becomes suboptimal. 
To address this\nproblem, we present in this paper a novel knowledge distillation approach,\ntermed AI-KD that can extend on any existing FIQA technique, improving its\nrobustness to alignment variations and, in turn, performance with different\nalignment procedures. To validate the proposed distillation approach, we\nconduct comprehensive experiments on 6 face datasets with 4 recent face\nrecognition models and in comparison to 7 state-of-the-art FIQA techniques. Our\nresults show that AI-KD consistently improves performance of the initial FIQA\ntechniques not only with misaligned samples, but also with properly aligned\nfacial images. Furthermore, it leads to a new state-of-the-art, when used with\na competitive initial FIQA approach. The code for AI-KD is made publicly\navailable from: https://github.com/LSIbabnikz/AI-KD.\n","authors":["Žiga Babnik","Fadi Boutros","Naser Damer","Peter Peer","Vitomir Štruc"],"pdf_url":"https://arxiv.org/pdf/2404.09555v1.pdf","comment":"IEEE International Workshop on Biometrics and Forensics (IWBF) 2024,\n pp. 6"},{"id":"http://arxiv.org/abs/2404.09540v1","updated":"2024-04-15T08:04:44Z","published":"2024-04-15T08:04:44Z","title":"Text-Driven Diverse Facial Texture Generation via Progressive\n Latent-Space Refinement","summary":" Automatic 3D facial texture generation has gained significant interest\nrecently. Existing approaches may not support the traditional physically based\nrendering pipeline or rely on 3D data captured by Light Stage. Our key\ncontribution is a progressive latent space refinement approach that can\nbootstrap from 3D Morphable Models (3DMMs)-based texture maps generated from\nfacial images to generate high-quality and diverse PBR textures, including\nalbedo, normal, and roughness. It starts with enhancing Generative Adversarial\nNetworks (GANs) for text-guided and diverse texture generation. To this end, we\ndesign a self-supervised paradigm to overcome the reliance on ground truth 3D\ntextures and train the generative model with only entangled texture maps.\nBesides, we foster mutual enhancement between GANs and Score Distillation\nSampling (SDS). SDS boosts GANs with more generative modes, while GANs promote\nmore efficient optimization of SDS. Furthermore, we introduce an edge-aware SDS\nfor multi-view consistent facial structure. Experiments demonstrate that our\nmethod outperforms existing 3D texture generation methods regarding\nphoto-realistic quality, diversity, and efficiency.\n","authors":["Chi Wang","Junming Huang","Rong Zhang","Qi Wang","Haotian Yang","Haibin Huang","Chongyang Ma","Weiwei Xu"],"pdf_url":"https://arxiv.org/pdf/2404.09540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05941v2","updated":"2024-04-15T07:59:37Z","published":"2023-12-10T17:07:37Z","title":"ASH: Animatable Gaussian Splats for Efficient and Photoreal Human\n Rendering","summary":" Real-time rendering of photorealistic and controllable human avatars stands\nas a cornerstone in Computer Vision and Graphics. While recent advances in\nneural implicit rendering have unlocked unprecedented photorealism for digital\navatars, real-time performance has mostly been demonstrated for static scenes\nonly. To address this, we propose ASH, an animatable Gaussian splatting\napproach for photorealistic rendering of dynamic humans in real-time. We\nparameterize the clothed human as animatable 3D Gaussians, which can be\nefficiently splatted into image space to generate the final rendering. 
However,\nnaively learning the Gaussian parameters in 3D space poses a severe challenge\nin terms of compute. Instead, we attach the Gaussians onto a deformable\ncharacter model, and learn their parameters in 2D texture space, which allows\nleveraging efficient 2D convolutional architectures that easily scale with the\nrequired number of Gaussians. We benchmark ASH with competing methods on\npose-controllable avatars, demonstrating that our method outperforms existing\nreal-time methods by a large margin and shows comparable or even better results\nthan offline methods.\n","authors":["Haokai Pang","Heming Zhu","Adam Kortylewski","Christian Theobalt","Marc Habermann"],"pdf_url":"https://arxiv.org/pdf/2312.05941v2.pdf","comment":"For project page, see https://vcai.mpi-inf.mpg.de/projects/ash/"},{"id":"http://arxiv.org/abs/2401.03522v2","updated":"2024-04-15T07:59:03Z","published":"2024-01-07T15:47:19Z","title":"Text-Driven Traffic Anomaly Detection with Temporal High-Frequency\n Modeling in Driving Videos","summary":" Traffic anomaly detection (TAD) in driving videos is critical for ensuring\nthe safety of autonomous driving and advanced driver assistance systems.\nPrevious single-stage TAD methods primarily rely on frame prediction, making\nthem vulnerable to interference from dynamic backgrounds induced by the rapid\nmovement of the dashboard camera. While two-stage TAD methods appear to be a\nnatural solution to mitigate such interference by pre-extracting\nbackground-independent features (such as bounding boxes and optical flow) using\nperceptual algorithms, they are susceptible to the performance of first-stage\nperceptual algorithms and may result in error propagation. In this paper, we\nintroduce TTHF, a novel single-stage method aligning video clips with text\nprompts, offering a new perspective on traffic anomaly detection. Unlike\nprevious approaches, the supervised signal of our method is derived from\nlanguages rather than orthogonal one-hot vectors, providing a more\ncomprehensive representation. Further, concerning visual representation, we\npropose to model the high frequency of driving videos in the temporal domain.\nThis modeling captures the dynamic changes of driving scenes, enhances the\nperception of driving behavior, and significantly improves the detection of\ntraffic anomalies. In addition, to better perceive various types of traffic\nanomalies, we carefully design an attentive anomaly focusing mechanism that\nvisually and linguistically guides the model to adaptively focus on the visual\ncontext of interest, thereby facilitating the detection of traffic anomalies.\nIt is shown that our proposed TTHF achieves promising performance,\noutperforming state-of-the-art competitors by +5.4% AUC on the DoTA dataset and\nachieving high generalization on the DADA dataset.\n","authors":["Rongqin Liang","Yuanman Li","Jiantao Zhou","Xia Li"],"pdf_url":"https://arxiv.org/pdf/2401.03522v2.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.09533v1","updated":"2024-04-15T07:53:07Z","published":"2024-04-15T07:53:07Z","title":"WiTUnet: A U-Shaped Architecture Integrating CNN and Transformer for\n Improved Feature Alignment and Local Information Fusion","summary":" Low-dose computed tomography (LDCT) has become the technology of choice for\ndiagnostic medical imaging, given its lower radiation dose compared to standard\nCT, despite increasing image noise and potentially affecting diagnostic\naccuracy. 
To address this, advanced deep learning-based LDCT denoising\nalgorithms have been developed, primarily using Convolutional Neural Networks\n(CNNs) or Transformer Networks with the Unet architecture. This architecture\nenhances image detail by integrating feature maps from the encoder and decoder\nvia skip connections. However, current methods often overlook enhancements to\nthe Unet architecture itself, focusing instead on optimizing encoder and\ndecoder structures. This approach can be problematic due to the significant\ndifferences in feature map characteristics between the encoder and decoder,\nwhere simple fusion strategies may not effectively reconstruct images.In this\npaper, we introduce WiTUnet, a novel LDCT image denoising method that utilizes\nnested, dense skip pathways instead of traditional skip connections to improve\nfeature integration. WiTUnet also incorporates a windowed Transformer structure\nto process images in smaller, non-overlapping segments, reducing computational\nload. Additionally, the integration of a Local Image Perception Enhancement\n(LiPe) module in both the encoder and decoder replaces the standard multi-layer\nperceptron (MLP) in Transformers, enhancing local feature capture and\nrepresentation. Through extensive experimental comparisons, WiTUnet has\ndemonstrated superior performance over existing methods in key metrics such as\nPeak Signal-to-Noise Ratio (PSNR), Structural Similarity (SSIM), and Root Mean\nSquare Error (RMSE), significantly improving noise removal and image quality.\n","authors":["Bin Wang","Fei Deng","Peifan Jiang","Shuang Wang","Xiao Han","Hongjie Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.09533v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09532v1","updated":"2024-04-15T07:51:40Z","published":"2024-04-15T07:51:40Z","title":"TMPQ-DM: Joint Timestep Reduction and Quantization Precision Selection\n for Efficient Diffusion Models","summary":" Diffusion models have emerged as preeminent contenders in the realm of\ngenerative models. Distinguished by their distinctive sequential generative\nprocesses, characterized by hundreds or even thousands of timesteps, diffusion\nmodels progressively reconstruct images from pure Gaussian noise, with each\ntimestep necessitating full inference of the entire model. However, the\nsubstantial computational demands inherent to these models present challenges\nfor deployment, quantization is thus widely used to lower the bit-width for\nreducing the storage and computing overheads. Current quantization\nmethodologies primarily focus on model-side optimization, disregarding the\ntemporal dimension, such as the length of the timestep sequence, thereby\nallowing redundant timesteps to continue consuming computational resources,\nleaving substantial scope for accelerating the generative process. In this\npaper, we introduce TMPQ-DM, which jointly optimizes timestep reduction and\nquantization to achieve a superior performance-efficiency trade-off, addressing\nboth temporal and model optimization aspects. For timestep reduction, we devise\na non-uniform grouping scheme tailored to the non-uniform nature of the\ndenoising process, thereby mitigating the explosive combinations of timesteps.\nIn terms of quantization, we adopt a fine-grained layer-wise approach to\nallocate varying bit-widths to different layers based on their respective\ncontributions to the final generative performance, thus rectifying performance\ndegradation observed in prior studies. 
To expedite the evaluation of\nfine-grained quantization, we further devise a super-network to serve as a\nprecision solver by leveraging shared quantization results. These two design\ncomponents are seamlessly integrated within our framework, enabling rapid joint\nexploration of the exponentially large decision space via a gradient-free\nevolutionary search algorithm.\n","authors":["Haojun Sun","Chen Tang","Zhi Wang","Yuan Meng","Jingyan jiang","Xinzhu Ma","Wenwu Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.09532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09531v1","updated":"2024-04-15T07:51:29Z","published":"2024-04-15T07:51:29Z","title":"Oblique-MERF: Revisiting and Improving MERF for Oblique Photography","summary":" Neural implicit fields have established a new paradigm for scene\nrepresentation, with subsequent work achieving high-quality real-time\nrendering. However, reconstructing 3D scenes from oblique aerial photography\npresents unique challenges, such as varying spatial scale distributions and a\nconstrained range of tilt angles, often resulting in high memory consumption\nand reduced rendering quality at extrapolated viewpoints. In this paper, we\nenhance MERF to accommodate these data characteristics by introducing an\ninnovative adaptive occupancy plane optimized during the volume rendering\nprocess and a smoothness regularization term for view-dependent color to\naddress these issues. Our approach, termed Oblique-MERF, surpasses\nstate-of-the-art real-time methods by approximately 0.7 dB, reduces VRAM usage\nby about 40%, and achieves higher rendering frame rates with more realistic\nrendering outcomes across most viewpoints.\n","authors":["Xiaoyi Zeng","Kaiwen Song","Leyuan Yang","Bailin Deng","Juyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.09531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09530v1","updated":"2024-04-15T07:50:15Z","published":"2024-04-15T07:50:15Z","title":"RanLayNet: A Dataset for Document Layout Detection used for Domain\n Adaptation and Generalization","summary":" Large ground-truth datasets and recent advances in deep learning techniques\nhave been useful for layout detection. However, because of the restricted\nlayout diversity of these datasets, training on them requires a sizable number\nof annotated instances, which is both expensive and time-consuming. As a\nresult, differences between the source and target domains may significantly\nimpact how well these models function. To solve this problem, domain adaptation\napproaches have been developed that use a small quantity of labeled data to\nadjust the model to the target domain. In this research, we introduced a\nsynthetic document dataset called RanLayNet, enriched with automatically\nassigned labels denoting spatial positions, ranges, and types of layout\nelements. The primary aim of this endeavor is to develop a versatile dataset\ncapable of training models with robustness and adaptability to diverse document\nformats. Through empirical experimentation, we demonstrate that a deep layout\nidentification model trained on our dataset exhibits enhanced performance\ncompared to a model trained solely on actual documents. Moreover, we conduct a\ncomparative analysis by fine-tuning inference models using both PubLayNet and\nIIIT-AR-13K datasets on the Doclaynet dataset. 
Our findings emphasize that\nmodels enriched with our dataset are optimal for tasks such as achieving 0.398\nand 0.588 mAP95 score in the scientific document domain for the TABLE class.\n","authors":["Avinash Anand","Raj Jaiswal","Mohit Gupta","Siddhesh S Bangar","Pijush Bhuyan","Naman Lal","Rajeev Singh","Ritika Jha","Rajiv Ratn Shah","Shin'ichi Satoh"],"pdf_url":"https://arxiv.org/pdf/2404.09530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09516v1","updated":"2024-04-15T07:24:45Z","published":"2024-04-15T07:24:45Z","title":"State Space Model for New-Generation Network Alternative to\n Transformers: A Survey","summary":" In the post-deep learning era, the Transformer architecture has demonstrated\nits powerful performance across pre-trained big models and various downstream\ntasks. However, the enormous computational demands of this architecture have\ndeterred many researchers. To further reduce the complexity of attention\nmodels, numerous efforts have been made to design more efficient methods. Among\nthem, the State Space Model (SSM), as a possible replacement for the\nself-attention based Transformer model, has drawn more and more attention in\nrecent years. In this paper, we give the first comprehensive review of these\nworks and also provide experimental comparisons and analysis to better\ndemonstrate the features and advantages of SSM. Specifically, we first give a\ndetailed description of principles to help the readers quickly capture the key\nideas of SSM. After that, we dive into the reviews of existing SSMs and their\nvarious applications, including natural language processing, computer vision,\ngraph, multi-modal and multi-media, point cloud/event stream, time series data,\nand other domains. In addition, we give statistical comparisons and analysis of\nthese models and hope it helps the readers to understand the effectiveness of\ndifferent structures on various tasks. Then, we propose possible research\npoints in this direction to better promote the development of the theoretical\nmodel and application of SSM. More related works will be continuously updated\non the following GitHub:\nhttps://github.com/Event-AHU/Mamba_State_Space_Model_Paper_List.\n","authors":["Xiao Wang","Shiao Wang","Yuhe Ding","Yuehang Li","Wentao Wu","Yao Rong","Weizhe Kong","Ju Huang","Shihao Li","Haoxiang Yang","Ziwen Wang","Bo Jiang","Chenglong Li","Yaowei Wang","Yonghong Tian","Jin Tang"],"pdf_url":"https://arxiv.org/pdf/2404.09516v1.pdf","comment":"The First review of State Space Model (SSM)/Mamba and their\n applications in artificial intelligence, 33 pages"},{"id":"http://arxiv.org/abs/2404.09515v1","updated":"2024-04-15T07:20:09Z","published":"2024-04-15T07:20:09Z","title":"Deep image learning of quantitative structure-property relationships of\n cooper alloys via feature augmentation on Geodesic curve in shape space","summary":" Understanding how the structure of materials affects their properties is a\ncornerstone of materials science and engineering. However, traditional methods\nhave struggled to accurately describe the quantitative structure-property\nrelationships for complex structures. In our study, we bridge this gap by\nleveraging machine learning to analyze images of materials' microstructures,\nthus offering a novel way to understand and predict the properties of materials\nbased on their microstructures. We introduce a method known as FAGC (Feature\nAugmentation on Geodesic Curves), specifically demonstrated for Cu-Cr-Zr\nalloys. 
This approach utilizes machine learning to examine the shapes within\nimages of the alloys' microstructures and predict their mechanical and\nelectronic properties. This generative FAGC approach can effectively expand the\nrelatively small training datasets due to the limited availability of materials\nimages labeled with quantitative properties. The process begins with extracting\nfeatures from the images using neural networks. These features are then mapped\nonto the Pre-shape space to construct the Geodesic curves. Along these curves,\nnew features are generated, effectively increasing the dataset. Moreover, we\ndesign a pseudo-labeling mechanism for these newly generated features to\nfurther enhance the training dataset. Our FAGC method has shown remarkable\nresults, significantly improving the accuracy of predicting the electronic\nconductivity and hardness of Cu-Cr-Zr alloys, with R-squared values of 0.978\nand 0.998, respectively. These outcomes underscore the potential of FAGC to\naddress the challenge of limited image data in materials science, providing a\npowerful tool for establishing detailed and quantitative relationships between\ncomplex microstructures and material properties.\n","authors":["Yuexing Han","Guanxin Wan","Bing Wang","Yi Liu"],"pdf_url":"https://arxiv.org/pdf/2404.09515v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13783v2","updated":"2024-04-15T07:18:45Z","published":"2023-12-21T12:14:31Z","title":"Few Shot Part Segmentation Reveals Compositional Logic for Industrial\n Anomaly Detection","summary":" Logical anomalies (LA) refer to data violating underlying logical constraints\ne.g., the quantity, arrangement, or composition of components within an image.\nDetecting accurately such anomalies requires models to reason about various\ncomponent types through segmentation. However, curation of pixel-level\nannotations for semantic segmentation is both time-consuming and expensive.\nAlthough there are some prior few-shot or unsupervised co-part segmentation\nalgorithms, they often fail on images with industrial object. These images have\ncomponents with similar textures and shapes, and a precise differentiation\nproves challenging. In this study, we introduce a novel component segmentation\nmodel for LA detection that leverages a few labeled samples and unlabeled\nimages sharing logical constraints. To ensure consistent segmentation across\nunlabeled images, we employ a histogram matching loss in conjunction with an\nentropy loss. As segmentation predictions play a crucial role, we propose to\nenhance both local and global sample validity detection by capturing key\naspects from visual semantics via three memory banks: class histograms,\ncomponent composition embeddings and patch-level representations. For effective\nLA detection, we propose an adaptive scaling strategy to standardize anomaly\nscores from different memory banks in inference. Extensive experiments on the\npublic benchmark MVTec LOCO AD reveal our method achieves 98.1% AUROC in LA\ndetection vs. 89.6% from competing methods.\n","authors":["Soopil Kim","Sion An","Philip Chikontwe","Myeongkyun Kang","Ehsan Adeli","Kilian M. 
Pohl","Sang Hyun Park"],"pdf_url":"https://arxiv.org/pdf/2312.13783v2.pdf","comment":"Accepted in AAAI2024"},{"id":"http://arxiv.org/abs/2312.01897v2","updated":"2024-04-15T07:15:43Z","published":"2023-12-04T13:51:16Z","title":"Adapting Short-Term Transformers for Action Detection in Untrimmed\n Videos","summary":" Vision Transformer (ViT) has shown high potential in video recognition, owing\nto its flexible design, adaptable self-attention mechanisms, and the efficacy\nof masked pre-training. Yet, it remains unclear how to adapt these pre-trained\nshort-term ViTs for temporal action detection (TAD) in untrimmed videos. The\nexisting works treat them as off-the-shelf feature extractors for each\nshort-trimmed snippet without capturing the fine-grained relation among\ndifferent snippets in a broader temporal context. To mitigate this issue, this\npaper focuses on designing a new mechanism for adapting these pre-trained ViT\nmodels as a unified long-form video transformer to fully unleash its modeling\npower in capturing inter-snippet relation, while still keeping low computation\noverhead and memory consumption for efficient TAD. To this end, we design\neffective cross-snippet propagation modules to gradually exchange short-term\nvideo information among different snippets from two levels. For inner-backbone\ninformation propagation, we introduce a cross-snippet propagation strategy to\nenable multi-snippet temporal feature interaction inside the backbone.For\npost-backbone information propagation, we propose temporal transformer layers\nfor further clip-level modeling. With the plain ViT-B pre-trained with\nVideoMAE, our end-to-end temporal action detector (ViT-TAD) yields a very\ncompetitive performance to previous temporal action detectors, riching up to\n69.5 average mAP on THUMOS14, 37.40 average mAP on ActivityNet-1.3 and 17.20\naverage mAP on FineAction.\n","authors":["Min Yang","Huan Gao","Ping Guo","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.01897v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.09512v1","updated":"2024-04-15T07:15:39Z","published":"2024-04-15T07:15:39Z","title":"Magic Clothing: Controllable Garment-Driven Image Synthesis","summary":" We propose Magic Clothing, a latent diffusion model (LDM)-based network\narchitecture for an unexplored garment-driven image synthesis task. Aiming at\ngenerating customized characters wearing the target garments with diverse text\nprompts, the image controllability is the most critical issue, i.e., to\npreserve the garment details and maintain faithfulness to the text prompts. To\nthis end, we introduce a garment extractor to capture the detailed garment\nfeatures, and employ self-attention fusion to incorporate them into the\npretrained LDMs, ensuring that the garment details remain unchanged on the\ntarget character. Then, we leverage the joint classifier-free guidance to\nbalance the control of garment features and text prompts over the generated\nresults. Meanwhile, the proposed garment extractor is a plug-in module\napplicable to various finetuned LDMs, and it can be combined with other\nextensions like ControlNet and IP-Adapter to enhance the diversity and\ncontrollability of the generated characters. Furthermore, we design\nMatched-Points-LPIPS (MP-LPIPS), a robust metric for evaluating the consistency\nof the target image to the source garment. 
Extensive experiments demonstrate\nthat our Magic Clothing achieves state-of-the-art results under various\nconditional controls for garment-driven image synthesis. Our source code is\navailable at https://github.com/ShineChen1024/MagicClothing.\n","authors":["Weifeng Chen","Tao Gu","Yuhao Xu","Chengcai Chen"],"pdf_url":"https://arxiv.org/pdf/2404.09512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01238v2","updated":"2024-04-15T07:12:20Z","published":"2024-03-02T15:47:42Z","title":"On the Road to Portability: Compressing End-to-End Motion Planner for\n Autonomous Driving","summary":" End-to-end motion planning models equipped with deep neural networks have\nshown great potential for enabling full autonomous driving. However, the\noversized neural networks render them impractical for deployment on\nresource-constrained systems, which unavoidably requires more computational\ntime and resources during inference. To handle this, knowledge distillation\noffers a promising approach that compresses models by enabling a smaller\nstudent model to learn from a larger teacher model. Nevertheless, how to apply\nknowledge distillation to compress motion planners has not been explored so\nfar. In this paper, we propose PlanKD, the first knowledge distillation\nframework tailored for compressing end-to-end motion planners. First,\nconsidering that driving scenes are inherently complex, often containing\nplanning-irrelevant or even noisy information, transferring such information is\nnot beneficial for the student planner. Thus, we design an information\nbottleneck based strategy to only distill planning-relevant information, rather\nthan transfer all information indiscriminately. Second, different waypoints in\nan output planned trajectory may hold varying degrees of importance for motion\nplanning, where a slight deviation in certain crucial waypoints might lead to a\ncollision. Therefore, we devise a safety-aware waypoint-attentive distillation\nmodule that assigns adaptive weights to different waypoints based on the\nimportance, to encourage the student to accurately mimic more crucial\nwaypoints, thereby improving overall safety. Experiments demonstrate that our\nPlanKD can boost the performance of smaller planners by a large margin, and\nsignificantly reduce their inference time.\n","authors":["Kaituo Feng","Changsheng Li","Dongchun Ren","Ye Yuan","Guoren Wang"],"pdf_url":"https://arxiv.org/pdf/2403.01238v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2402.00015v2","updated":"2024-04-15T07:06:54Z","published":"2023-12-28T14:14:31Z","title":"Maintaining User Trust Through Multistage Uncertainty Aware Inference","summary":" This paper describes and evaluates a multistage approach to AI deployment.\nEach stage involves a more accurate method of inference, yet engaging each\ncomes with an increasing cost. In outlining the architecture, we present a\nmethod for quantifying model uncertainty that facilitates confident deferral\ndecisions. The architecture is currently under active deployment to thousands\nof cotton farmers across India. 
The broader idea however is applicable to a\ngrowing sector of AI deployments in challenging low resources settings.\n","authors":["Chandan Agrawal","Ashish Papanai","Jerome White"],"pdf_url":"https://arxiv.org/pdf/2402.00015v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09509v1","updated":"2024-04-15T07:05:14Z","published":"2024-04-15T07:05:14Z","title":"Fuse after Align: Improving Face-Voice Association Learning via\n Multimodal Encoder","summary":" Today, there have been many achievements in learning the association between\nvoice and face. However, most previous work models rely on cosine similarity or\nL2 distance to evaluate the likeness of voices and faces following contrastive\nlearning, subsequently applied to retrieval and matching tasks. This method\nonly considers the embeddings as high-dimensional vectors, utilizing a minimal\nscope of available information. This paper introduces a novel framework within\nan unsupervised setting for learning voice-face associations. By employing a\nmultimodal encoder after contrastive learning and addressing the problem\nthrough binary classification, we can learn the implicit information within the\nembeddings in a more effective and varied manner. Furthermore, by introducing\nan effective pair selection method, we enhance the learning outcomes of both\ncontrastive learning and the matching task. Empirical evidence demonstrates\nthat our framework achieves state-of-the-art results in voice-face matching,\nverification, and retrieval tasks, improving verification by approximately 3%,\nmatching by about 2.5%, and retrieval by around 1.3%.\n","authors":["Chong Peng","Liqiang He","Dan Su"],"pdf_url":"https://arxiv.org/pdf/2404.09509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09507v1","updated":"2024-04-15T06:58:09Z","published":"2024-04-15T06:58:09Z","title":"Clothes-Changing Person Re-Identification with Feasibility-Aware\n Intermediary Matching","summary":" Current clothes-changing person re-identification (re-id) approaches usually\nperform retrieval based on clothes-irrelevant features, while neglecting the\npotential of clothes-relevant features. However, we observe that relying solely\non clothes-irrelevant features for clothes-changing re-id is limited, since\nthey often lack adequate identity information and suffer from large intra-class\nvariations. On the contrary, clothes-relevant features can be used to discover\nsame-clothes intermediaries that possess informative identity clues. Based on\nthis observation, we propose a Feasibility-Aware Intermediary Matching (FAIM)\nframework to additionally utilize clothes-relevant features for retrieval.\nFirstly, an Intermediary Matching (IM) module is designed to perform an\nintermediary-assisted matching process. This process involves using\nclothes-relevant features to find informative intermediates, and then using\nclothes-irrelevant features of these intermediates to complete the matching.\nSecondly, in order to reduce the negative effect of low-quality intermediaries,\nan Intermediary-Based Feasibility Weighting (IBFW) module is designed to\nevaluate the feasibility of intermediary matching process by assessing the\nquality of intermediaries. 
Extensive experiments demonstrate that our method\noutperforms state-of-the-art methods on several widely-used clothes-changing\nre-id benchmarks.\n","authors":["Jiahe Zhao","Ruibing Hou","Hong Chang","Xinqian Gu","Bingpeng Ma","Shiguang Shan","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.09507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09504v1","updated":"2024-04-15T06:50:58Z","published":"2024-04-15T06:50:58Z","title":"Learning Tracking Representations from Single Point Annotations","summary":" Existing deep trackers are typically trained with largescale video frames\nwith annotated bounding boxes. However, these bounding boxes are expensive and\ntime-consuming to annotate, in particular for large scale datasets. In this\npaper, we propose to learn tracking representations from single point\nannotations (i.e., 4.5x faster to annotate than the traditional bounding box)\nin a weakly supervised manner. Specifically, we propose a soft contrastive\nlearning (SoCL) framework that incorporates target objectness prior into\nend-to-end contrastive learning. Our SoCL consists of adaptive positive and\nnegative sample generation, which is memory-efficient and effective for\nlearning tracking representations. We apply the learned representation of SoCL\nto visual tracking and show that our method can 1) achieve better performance\nthan the fully supervised baseline trained with box annotations under the same\nannotation time cost; 2) achieve comparable performance of the fully supervised\nbaseline by using the same number of training frames and meanwhile reducing\nannotation time cost by 78% and total fees by 85%; 3) be robust to annotation\nnoise.\n","authors":["Qiangqiang Wu","Antoni B. Chan"],"pdf_url":"https://arxiv.org/pdf/2404.09504v1.pdf","comment":"Accept to CVPR2024-L3DIVU"},{"id":"http://arxiv.org/abs/2403.13392v2","updated":"2024-04-15T06:46:04Z","published":"2024-03-20T08:33:40Z","title":"Robust image segmentation model based on binary level set","summary":" In order to improve the robustness of traditional image segmentation models\nto noise, this paper models the illumination term in intensity inhomogeneity\nimages. Additionally, to enhance the model's robustness to noisy images, we\nincorporate the binary level set model into the proposed model. Compared to the\ntraditional level set, the binary level set eliminates the need for continuous\nreinitialization. Moreover, by introducing the variational operator GL, our\nmodel demonstrates better capability in segmenting noisy images. Finally, we\nemploy the three-step splitting operator method for solving, and the\neffectiveness of the proposed model is demonstrated on various images.\n","authors":["Wenqi Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.13392v2.pdf","comment":"SCI"},{"id":"http://arxiv.org/abs/2404.09502v1","updated":"2024-04-15T06:45:06Z","published":"2024-04-15T06:45:06Z","title":"SparseOcc: Rethinking Sparse Latent Representation for Vision-Based\n Semantic Occupancy Prediction","summary":" Vision-based perception for autonomous driving requires an explicit modeling\nof a 3D space, where 2D latent representations are mapped and subsequent 3D\noperators are applied. However, operating on dense latent spaces introduces a\ncubic time and space complexity, which limits scalability in terms of\nperception range or spatial resolution. Existing approaches compress the dense\nrepresentation using projections like Bird's Eye View (BEV) or Tri-Perspective\nView (TPV). 
Although efficient, these projections result in information loss,\nespecially for tasks like semantic occupancy prediction. To address this, we\npropose SparseOcc, an efficient occupancy network inspired by sparse point\ncloud processing. It utilizes a lossless sparse latent representation with\nthree key innovations. Firstly, a 3D sparse diffuser performs latent completion\nusing spatially decomposed 3D sparse convolutional kernels. Secondly, a feature\npyramid and sparse interpolation enhance scales with information from others.\nFinally, the transformer head is redesigned as a sparse variant. SparseOcc\nachieves a remarkable 74.9% reduction on FLOPs over the dense baseline.\nInterestingly, it also improves accuracy, from 12.8% to 14.1% mIOU, which in\npart can be attributed to the sparse representation's ability to avoid\nhallucinations on empty voxels.\n","authors":["Pin Tang","Zhongdao Wang","Guoqing Wang","Jilai Zheng","Xiangxuan Ren","Bailan Feng","Chao Ma"],"pdf_url":"https://arxiv.org/pdf/2404.09502v1.pdf","comment":"10 pages, 4 figures, accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09499v1","updated":"2024-04-15T06:38:09Z","published":"2024-04-15T06:38:09Z","title":"Learning Human Motion from Monocular Videos via Cross-Modal Manifold\n Alignment","summary":" Learning 3D human motion from 2D inputs is a fundamental task in the realms\nof computer vision and computer graphics. Many previous methods grapple with\nthis inherently ambiguous task by introducing motion priors into the learning\nprocess. However, these approaches face difficulties in defining the complete\nconfigurations of such priors or training a robust model. In this paper, we\npresent the Video-to-Motion Generator (VTM), which leverages motion priors\nthrough cross-modal latent feature space alignment between 3D human motion and\n2D inputs, namely videos and 2D keypoints. To reduce the complexity of modeling\nmotion priors, we model the motion data separately for the upper and lower body\nparts. Additionally, we align the motion data with a scale-invariant virtual\nskeleton to mitigate the interference of human skeleton variations to the\nmotion priors. Evaluated on AIST++, the VTM showcases state-of-the-art\nperformance in reconstructing 3D human motion from monocular videos. Notably,\nour VTM exhibits the capabilities for generalization to unseen view angles and\nin-the-wild videos.\n","authors":["Shuaiying Hou","Hongyu Tao","Junheng Fang","Changqing Zou","Hujun Bao","Weiwei Xu"],"pdf_url":"https://arxiv.org/pdf/2404.09499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09498v1","updated":"2024-04-15T06:37:21Z","published":"2024-04-15T06:37:21Z","title":"FusionMamba: Dynamic Feature Enhancement for Multimodal Image Fusion\n with Mamba","summary":" Multi-modal image fusion aims to combine information from different modes to\ncreate a single image with comprehensive information and detailed textures.\nHowever, fusion models based on convolutional neural networks encounter\nlimitations in capturing global image features due to their focus on local\nconvolution operations. Transformer-based models, while excelling in global\nfeature modeling, confront computational challenges stemming from their\nquadratic complexity. 
Recently, the Selective Structured State Space Model has\nexhibited significant potential for long-range dependency modeling with linear\ncomplexity, offering a promising avenue to address the aforementioned dilemma.\nIn this paper, we propose FusionMamba, a novel dynamic feature enhancement\nmethod for multimodal image fusion with Mamba. Specifically, we devise an\nimproved efficient Mamba model for image fusion, integrating an efficient visual\nstate space model with dynamic convolution and channel attention. This refined\nmodel not only upholds the performance of Mamba and global modeling capability\nbut also diminishes channel redundancy while enhancing local enhancement\ncapability. Additionally, we devise a dynamic feature fusion module (DFFM)\ncomprising two dynamic feature enhancement modules (DFEM) and a cross modality\nfusion mamba module (CMFM). The former serves for dynamic texture enhancement\nand dynamic difference perception, whereas the latter enhances correlation\nfeatures between modes and suppresses redundant intermodal information.\nFusionMamba has yielded state-of-the-art (SOTA) performance across various\nmultimodal medical image fusion tasks (CT-MRI, PET-MRI, SPECT-MRI), infrared\nand visible image fusion task (IR-VIS) and multimodal biomedical image fusion\ndataset (GFP-PC), demonstrating that our model has generalization ability.\nThe code for FusionMamba is available at\nhttps://github.com/millieXie/FusionMamba.\n","authors":["Xinyu Xie","Yawen Cui","Chio-In Ieong","Tao Tan","Xiaozhi Zhang","Xubin Zheng","Zitong Yu"],"pdf_url":"https://arxiv.org/pdf/2404.09498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03883v2","updated":"2024-04-15T06:34:52Z","published":"2024-04-05T04:11:31Z","title":"LiDAR-Guided Cross-Attention Fusion for Hyperspectral Band Selection and\n Image Classification","summary":" The fusion of hyperspectral and LiDAR data has been an active research topic.\nExisting fusion methods have ignored the high-dimensionality and redundancy\nchallenges in hyperspectral images, despite that band selection methods have\nbeen intensively studied for hyperspectral image (HSI) processing. This paper\naddresses this significant gap by introducing a cross-attention mechanism from\nthe transformer architecture for the selection of HSI bands guided by LiDAR\ndata. LiDAR provides high-resolution vertical structural information, which can\nbe useful in distinguishing different types of land cover that may have similar\nspectral signatures but different structural profiles. In our approach, the\nLiDAR data are used as the \"query\" to search and identify the \"key\" from the\nHSI to choose the most pertinent bands for LiDAR. This method ensures that the\nselected HSI bands drastically reduce redundancy and computational requirements\nwhile working optimally with the LiDAR data. Extensive experiments have been\nundertaken on three paired HSI and LiDAR data sets: Houston 2013, Trento and\nMUUFL. The results highlight the superiority of the cross-attention mechanism,\nunderlining the enhanced classification accuracy of the identified HSI bands\nwhen fused with the LiDAR features. 
The results also show that the use of fewer\nbands combined with LiDAR surpasses the performance of state-of-the-art fusion\nmodels.\n","authors":["Judy X Yang","Jun Zhou","Jing Wang","Hui Tian","Alan Wee-Chung Liew"],"pdf_url":"https://arxiv.org/pdf/2404.03883v2.pdf","comment":"15 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.09496v1","updated":"2024-04-15T06:33:32Z","published":"2024-04-15T06:33:32Z","title":"Towards Collaborative Autonomous Driving: Simulation Platform and\n End-to-End System","summary":" Vehicle-to-everything-aided autonomous driving (V2X-AD) has a huge potential\nto provide a safer driving solution. Despite extensive researches in\ntransportation and communication to support V2X-AD, the actual utilization of\nthese infrastructures and communication resources in enhancing driving\nperformances remains largely unexplored. This highlights the necessity of\ncollaborative autonomous driving: a machine learning approach that optimizes\nthe information sharing strategy to improve the driving performance of each\nvehicle. This effort necessitates two key foundations: a platform capable of\ngenerating data to facilitate the training and testing of V2X-AD, and a\ncomprehensive system that integrates full driving-related functionalities with\nmechanisms for information sharing. From the platform perspective, we present\nV2Xverse, a comprehensive simulation platform for collaborative autonomous\ndriving. This platform provides a complete pipeline for collaborative driving.\nFrom the system perspective, we introduce CoDriving, a novel end-to-end\ncollaborative driving system that properly integrates V2X communication over\nthe entire autonomous pipeline, promoting driving with shared perceptual\ninformation. The core idea is a novel driving-oriented communication strategy.\nLeveraging this strategy, CoDriving improves driving performance while\noptimizing communication efficiency. We make comprehensive benchmarks with\nV2Xverse, analyzing both modular performance and closed-loop driving\nperformance. Experimental results show that CoDriving: i) significantly\nimproves the driving score by 62.49% and drastically reduces the pedestrian\ncollision rate by 53.50% compared to the SOTA end-to-end driving method, and\nii) achieves sustaining driving performance superiority over dynamic constraint\ncommunication conditions.\n","authors":["Genjia Liu","Yue Hu","Chenxin Xu","Weibo Mao","Junhao Ge","Zhengxiang Huang","Yifan Lu","Yinda Xu","Junkai Xia","Yafei Wang","Siheng Chen"],"pdf_url":"https://arxiv.org/pdf/2404.09496v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09490v1","updated":"2024-04-15T06:24:56Z","published":"2024-04-15T06:24:56Z","title":"Leveraging Temporal Contextualization for Video Action Recognition","summary":" Pretrained vision-language models have shown effectiveness in video\nunderstanding. However, recent studies have not sufficiently leveraged\nessential temporal information from videos, simply averaging frame-wise\nrepresentations or referencing consecutive frames. We introduce Temporally\nContextualized CLIP (TC-CLIP), a pioneering framework for video understanding\nthat effectively and efficiently leverages comprehensive video information. 
We\npropose Temporal Contextualization (TC), a novel layer-wise temporal\ninformation infusion mechanism for video that extracts core information from\neach frame, interconnects relevant information across the video to summarize\ninto context tokens, and ultimately leverages the context tokens during the\nfeature encoding process. Furthermore, our Video-conditional Prompting (VP)\nmodule manufactures context tokens to generate informative prompts in text\nmodality. We conduct extensive experiments in zero-shot, few-shot,\nbase-to-novel, and fully-supervised action recognition to validate the\nsuperiority of our TC-CLIP. Ablation studies for TC and VP guarantee our design\nchoices. Code is available at https://github.com/naver-ai/tc-clip\n","authors":["Minji Kim","Dongyoon Han","Taekyung Kim","Bohyung Han"],"pdf_url":"https://arxiv.org/pdf/2404.09490v1.pdf","comment":"24 pages, 10 figures, 12 tables"},{"id":"http://arxiv.org/abs/2404.09486v1","updated":"2024-04-15T06:15:46Z","published":"2024-04-15T06:15:46Z","title":"MMCode: Evaluating Multi-Modal Code Large Language Models with Visually\n Rich Programming Problems","summary":" Programming often involves converting detailed and complex specifications\ninto code, a process during which developers typically utilize visual aids to\nmore effectively convey concepts. While recent developments in Large Multimodal\nModels have demonstrated remarkable abilities in visual reasoning and\nmathematical tasks, there is little work on investigating whether these models\ncan effectively interpret visual elements for code generation. To this end, we\npresent MMCode, the first multi-modal coding dataset for evaluating algorithmic\nproblem-solving skills in visually rich contexts. MMCode contains 3,548\nquestions and 6,620 images collected from real-world programming challenges\nharvested from 10 code competition websites, presenting significant challenges\ndue to the extreme demand for reasoning abilities. Our experiment results show\nthat current state-of-the-art models struggle to solve these problems. The\nresults highlight the lack of powerful vision-code models, and we hope MMCode\ncan serve as an inspiration for future works in this domain. The data and code\nare publicly available at https://github.com/happylkx/MMCode.\n","authors":["Kaixin Li","Yuchen Tian","Qisheng Hu","Ziyang Luo","Jing Ma"],"pdf_url":"https://arxiv.org/pdf/2404.09486v1.pdf","comment":"46 pages, 21 figures and 6 tables"},{"id":"http://arxiv.org/abs/2311.12198v3","updated":"2024-04-15T06:04:55Z","published":"2023-11-20T21:34:52Z","title":"PhysGaussian: Physics-Integrated 3D Gaussians for Generative Dynamics","summary":" We introduce PhysGaussian, a new method that seamlessly integrates physically\ngrounded Newtonian dynamics within 3D Gaussians to achieve high-quality novel\nmotion synthesis. Employing a custom Material Point Method (MPM), our approach\nenriches 3D Gaussian kernels with physically meaningful kinematic deformation\nand mechanical stress attributes, all evolved in line with continuum mechanics\nprinciples. A defining characteristic of our method is the seamless integration\nbetween physical simulation and visual rendering: both components utilize the\nsame 3D Gaussian kernels as their discrete representations. 
This negates the\nnecessity for triangle/tetrahedron meshing, marching cubes, \"cage meshes,\" or\nany other geometry embedding, highlighting the principle of \"what you see is\nwhat you simulate (WS$^2$).\" Our method demonstrates exceptional versatility\nacross a wide variety of materials--including elastic entities, metals,\nnon-Newtonian fluids, and granular materials--showcasing its strong\ncapabilities in creating diverse visual content with novel viewpoints and\nmovements. Our project page is at: https://xpandora.github.io/PhysGaussian/\n","authors":["Tianyi Xie","Zeshun Zong","Yuxing Qiu","Xuan Li","Yutao Feng","Yin Yang","Chenfanfu Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.12198v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09476v1","updated":"2024-04-15T06:02:31Z","published":"2024-04-15T06:02:31Z","title":"FreqMamba: Viewing Mamba from a Frequency Perspective for Image\n Deraining","summary":" Images corrupted by rain streaks often lose vital frequency information for\nperception, and image deraining aims to solve this issue which relies on global\nand local degradation modeling. Recent studies have witnessed the effectiveness\nand efficiency of Mamba for perceiving global and local information based on\nits exploiting local correlation among patches, however, rarely attempts have\nbeen explored to extend it with frequency analysis for image deraining,\nlimiting its ability to perceive global degradation that is relevant to\nfrequency modeling (e.g. Fourier transform). In this paper, we propose\nFreqMamba, an effective and efficient paradigm that leverages the complementary\nbetween Mamba and frequency analysis for image deraining. The core of our\nmethod lies in extending Mamba with frequency analysis from two perspectives:\nextending it with frequency-band for exploiting frequency correlation, and\nconnecting it with Fourier transform for global degradation modeling.\nSpecifically, FreqMamba introduces complementary triple interaction structures\nincluding spatial Mamba, frequency band Mamba, and Fourier global modeling.\nFrequency band Mamba decomposes the image into sub-bands of different\nfrequencies to allow 2D scanning from the frequency dimension. Furthermore,\nleveraging Mamba's unique data-dependent properties, we use rainy images at\ndifferent scales to provide degradation priors to the network, thereby\nfacilitating efficient training. Extensive experiments show that our method\noutperforms state-of-the-art methods both visually and quantitatively.\n","authors":["Zou Zhen","Yu Hu","Zhao Feng"],"pdf_url":"https://arxiv.org/pdf/2404.09476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09475v1","updated":"2024-04-15T06:02:09Z","published":"2024-04-15T06:02:09Z","title":"Improving Weakly-Supervised Object Localization Using Adversarial\n Erasing and Pseudo Label","summary":" Weakly-supervised learning approaches have gained significant attention due\nto their ability to reduce the effort required for human annotations in\ntraining neural networks. This paper investigates a framework for\nweakly-supervised object localization, which aims to train a neural network\ncapable of predicting both the object class and its location using only images\nand their image-level class labels. The proposed framework consists of a shared\nfeature extractor, a classifier, and a localizer. The localizer predicts\npixel-level class probabilities, while the classifier predicts the object class\nat the image level. 
Since image-level class labels are insufficient for\ntraining the localizer, weakly-supervised object localization methods often\nencounter challenges in accurately localizing the entire object region. To\naddress this issue, the proposed method incorporates adversarial erasing and\npseudo labels to improve localization accuracy. Specifically, novel losses are\ndesigned to utilize adversarially erased foreground features and adversarially\nerased feature maps, reducing dependence on the most discriminative region.\nAdditionally, the proposed method employs pseudo labels to suppress activation\nvalues in the background while increasing them in the foreground. The proposed\nmethod is applied to two backbone networks (MobileNetV1 and InceptionV3) and is\nevaluated on three publicly available datasets (ILSVRC-2012, CUB-200-2011, and\nPASCAL VOC 2012). The experimental results demonstrate that the proposed method\noutperforms previous state-of-the-art methods across all evaluated metrics.\n","authors":["Byeongkeun Kang","Sinhae Cha","Yeejin Lee"],"pdf_url":"https://arxiv.org/pdf/2404.09475v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2404.09474v1","updated":"2024-04-15T06:01:48Z","published":"2024-04-15T06:01:48Z","title":"TCCT-Net: Two-Stream Network Architecture for Fast and Efficient\n Engagement Estimation via Behavioral Feature Signals","summary":" Engagement analysis finds various applications in healthcare, education,\nadvertisement, services. Deep Neural Networks, used for analysis, possess\ncomplex architecture and need large amounts of input data, computational power,\ninference time. These constraints challenge embedding systems into devices for\nreal-time use. To address these limitations, we present a novel two-stream\nfeature fusion \"Tensor-Convolution and Convolution-Transformer Network\"\n(TCCT-Net) architecture. To better learn the meaningful patterns in the\ntemporal-spatial domain, we design a \"CT\" stream that integrates a hybrid\nconvolutional-transformer. In parallel, to efficiently extract rich patterns\nfrom the temporal-frequency domain and boost processing speed, we introduce a\n\"TC\" stream that uses Continuous Wavelet Transform (CWT) to represent\ninformation in a 2D tensor form. Evaluated on the EngageNet dataset, the\nproposed method outperforms existing baselines, utilizing only two behavioral\nfeatures (head pose rotations) compared to the 98 used in baseline models.\nFurthermore, comparative analysis shows TCCT-Net's architecture offers an\norder-of-magnitude improvement in inference speed compared to state-of-the-art\nimage-based Recurrent Neural Network (RNN) methods. The code will be released\nat https://github.com/vedernikovphoto/TCCT_Net.\n","authors":["Alexander Vedernikov","Puneet Kumar","Haoyu Chen","Tapio Seppanen","Xiaobai Li"],"pdf_url":"https://arxiv.org/pdf/2404.09474v1.pdf","comment":"Accepted for the CVPR 2024 workshop (ABAW)"},{"id":"http://arxiv.org/abs/2404.09472v1","updated":"2024-04-15T05:53:26Z","published":"2024-04-15T05:53:26Z","title":"Q2A: Querying Implicit Fully Continuous Feature Pyramid to Align\n Features for Medical Image Segmentation","summary":" Recent medical image segmentation methods apply implicit neural\nrepresentation (INR) to the decoder for achieving a continuous coordinate\ndecoding to tackle the drawback of conventional discrete grid-based data\nrepresentations. 
However, the INR-based decoder cannot well handle the feature\nmisalignment problem brought about by the naive latent code acquisition\nstrategy in INR. Although there exist many feature alignment works, they all\nadopt a progressive multi-step aligning paradigm on a discrete feature pyramid,\nwhich is incompatible with the continuous one-step characteristics of INR-based\ndecoder, and thus fails to be the solution. Therefore, we propose Q2A, a novel\none-step query-based aligning paradigm, to solve the feature misalignment\nproblem in the INR-based decoder. Specifically, for each target coordinate, Q2A\nfirst generates several queries depicting the spatial offsets and the cell\nresolutions of the contextual features aligned to the coordinate, then\ncalculates the corresponding aligned features by feeding the queries into a\nnovel implicit fully continuous feature pyramid (FCFP), finally fuses the\naligned features to predict the class distribution. In FCFP, we further propose\na novel universal partition-and-aggregate strategy (P&A) to replace the naive\ninterpolation strategy for latent code acquisition in INR, which mitigates the\ninformation loss problem that occurs when the query cell resolution is\nrelatively large and achieves an effective feature decoding at arbitrary\ncontinuous resolution. We conduct extensive experiments on two medical\ndatasets, i.e. Glas and Synapse, and a universal dataset, i.e. Cityscapes, and\nthey show the superiority of the proposed Q2A.\n","authors":["Jiahao Yu","Li Chen"],"pdf_url":"https://arxiv.org/pdf/2404.09472v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.09469v1","updated":"2024-04-15T05:44:03Z","published":"2024-04-15T05:44:03Z","title":"Virtually Enriched NYU Depth V2 Dataset for Monocular Depth Estimation:\n Do We Need Artificial Augmentation?","summary":" We present ANYU, a new virtually augmented version of the NYU depth v2\ndataset, designed for monocular depth estimation. In contrast to the well-known\napproach where full 3D scenes of a virtual world are utilized to generate\nartificial datasets, ANYU was created by incorporating RGB-D representations of\nvirtual reality objects into the original NYU depth v2 images. We specifically\ndid not match each generated virtual object with an appropriate texture and a\nsuitable location within the real-world image. Instead, an assignment of\ntexture, location, lighting, and other rendering parameters was randomized to\nmaximize a diversity of the training data, and to show that it is randomness\nthat can improve the generalizing ability of a dataset. By conducting extensive\nexperiments with our virtually modified dataset and validating on the original\nNYU depth v2 and iBims-1 benchmarks, we show that ANYU improves the monocular\ndepth estimation performance and generalization of deep neural networks with\nconsiderably different architectures, especially for the current\nstate-of-the-art VPD model. To the best of our knowledge, this is the first\nwork that augments a real-world dataset with randomly generated virtual 3D\nobjects for monocular depth estimation. 
We make our ANYU dataset publicly\navailable in two training configurations with 10% and 100% additional\nsynthetically enriched RGB-D pairs of training images, respectively, for\nefficient training and empirical exploration of virtual augmentation at\nhttps://github.com/ABrain-One/ANYU\n","authors":["Dmitry Ignatov","Andrey Ignatov","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2404.09469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03124v2","updated":"2024-04-15T05:38:16Z","published":"2024-02-05T15:51:34Z","title":"Towards Eliminating Hard Label Constraints in Gradient Inversion Attacks","summary":" Gradient inversion attacks aim to reconstruct local training data from\nintermediate gradients exposed in the federated learning framework. Despite\nsuccessful attacks, all previous methods, starting from reconstructing a single\ndata point and then relaxing the single-image limit to batch level, are only\ntested under hard label constraints. Even for single-image reconstruction, we\nstill lack an analysis-based algorithm to recover augmented soft labels. In\nthis work, we change the focus from enlarging batchsize to investigating the\nhard label constraints, considering a more realistic circumstance where label\nsmoothing and mixup techniques are used in the training process. In particular,\nwe are the first to initiate a novel algorithm to simultaneously recover the\nground-truth augmented label and the input feature of the last fully-connected\nlayer from single-input gradients, and provide a necessary condition for any\nanalytical-based label recovery methods. Extensive experiments testify to the\nlabel recovery accuracy, as well as the benefits to the following image\nreconstruction. We believe soft labels in classification tasks are worth\nfurther attention in gradient inversion attacks.\n","authors":["Yanbo Wang","Jian Liang","Ran He"],"pdf_url":"https://arxiv.org/pdf/2402.03124v2.pdf","comment":"ICLR2024 poster"},{"id":"http://arxiv.org/abs/2404.09465v1","updated":"2024-04-15T05:29:23Z","published":"2024-04-15T05:29:23Z","title":"PhyScene: Physically Interactable 3D Scene Synthesis for Embodied AI","summary":" With recent developments in Embodied Artificial Intelligence (EAI) research,\nthere has been a growing demand for high-quality, large-scale interactive scene\ngeneration. While prior methods in scene synthesis have prioritized the\nnaturalness and realism of the generated scenes, the physical plausibility and\ninteractivity of scenes have been largely left unexplored. To address this\ndisparity, we introduce PhyScene, a novel method dedicated to generating\ninteractive 3D scenes characterized by realistic layouts, articulated objects,\nand rich physical interactivity tailored for embodied agents. Based on a\nconditional diffusion model for capturing scene layouts, we devise novel\nphysics- and interactivity-based guidance mechanisms that integrate constraints\nfrom object collision, room layout, and object reachability. Through extensive\nexperiments, we demonstrate that PhyScene effectively leverages these guidance\nfunctions for physically interactable scene synthesis, outperforming existing\nstate-of-the-art scene synthesis methods by a large margin. 
Our findings\nsuggest that the scenes generated by PhyScene hold considerable potential for\nfacilitating diverse skill acquisition among agents within interactive\nenvironments, thereby catalyzing further advancements in embodied AI research.\nProject website: http://physcene.github.io.\n","authors":["Yandan Yang","Baoxiong Jia","Peiyuan Zhi","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2404.09465v1.pdf","comment":"Accepted by CVPR 2024, 18 pages"},{"id":"http://arxiv.org/abs/2404.09461v1","updated":"2024-04-15T05:00:40Z","published":"2024-04-15T05:00:40Z","title":"Improved Object-Based Style Transfer with Single Deep Network","summary":" This research paper proposes a novel methodology for image-to-image style\ntransfer on objects utilizing a single deep convolutional neural network. The\nproposed approach leverages the You Only Look Once version 8 (YOLOv8)\nsegmentation model and the backbone neural network of YOLOv8 for style\ntransfer. The primary objective is to enhance the visual appeal of objects in\nimages by seamlessly transferring artistic styles while preserving the original\nobject characteristics. The proposed approach's novelty lies in combining\nsegmentation and style transfer in a single deep convolutional neural network.\nThis approach omits the need for multiple stages or models, thus resulting in\nsimpler training and deployment of the model for practical applications. The\nresults of this approach are shown on two content images by applying different\nstyle images. The paper also demonstrates the ability to apply style transfer\non multiple objects in the same image.\n","authors":["Harshmohan Kulkarni","Om Khare","Ninad Barve","Sunil Mane"],"pdf_url":"https://arxiv.org/pdf/2404.09461v1.pdf","comment":"In Proceedings of the Fourth International Conference on Innovations\n in Computational Intelligence and Computer Vision"},{"id":"http://arxiv.org/abs/2303.09792v3","updated":"2024-04-15T04:58:07Z","published":"2023-03-17T06:26:55Z","title":"Exploring Sparse Visual Prompt for Domain Adaptive Dense Prediction","summary":" The visual prompts have provided an efficient manner in addressing visual\ncross-domain problems. In previous works, Visual Domain Prompt (VDP) first\nintroduces domain prompts to tackle the classification Test-Time Adaptation\n(TTA) problem by warping image-level prompts on the input and fine-tuning\nprompts for each target domain. However, since the image-level prompts mask out\ncontinuous spatial details in the prompt-allocated region, it will suffer from\ninaccurate contextual information and limited domain knowledge extraction,\nparticularly when dealing with dense prediction TTA problems. To overcome these\nchallenges, we propose a novel Sparse Visual Domain Prompts (SVDP) approach,\nwhich holds minimal trainable parameters (e.g., 0.1\\%) in the image-level\nprompt and reserves more spatial information of the input. To better apply SVDP\nin extracting domain-specific knowledge, we introduce the Domain Prompt\nPlacement (DPP) method to adaptively allocates trainable parameters of SVDP on\nthe pixels with large distribution shifts. Furthermore, recognizing that each\ntarget domain sample exhibits a unique domain shift, we design Domain Prompt\nUpdating (DPU) strategy to optimize prompt parameters differently for each\nsample, facilitating efficient adaptation to the target domain. 
Extensive\nexperiments were conducted on widely-used TTA and continual TTA benchmarks, and\nour proposed method achieves state-of-the-art performance in both semantic\nsegmentation and depth estimation tasks.\n","authors":["Senqiao Yang","Jiarui Wu","Jiaming Liu","Xiaoqi Li","Qizhe Zhang","Mingjie Pan","Yulu Gan","Zehui Chen","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.09792v3.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2404.09458v1","updated":"2024-04-15T04:50:39Z","published":"2024-04-15T04:50:39Z","title":"CompGS: Efficient 3D Scene Representation via Compressed Gaussian\n Splatting","summary":" Gaussian splatting, renowned for its exceptional rendering quality and\nefficiency, has emerged as a prominent technique in 3D scene representation.\nHowever, the substantial data volume of Gaussian splatting impedes its\npractical utility in real-world applications. Herein, we propose an efficient\n3D scene representation, named Compressed Gaussian Splatting (CompGS), which\nharnesses compact Gaussian primitives for faithful 3D scene modeling with a\nremarkably reduced data size. To ensure the compactness of Gaussian primitives,\nwe devise a hybrid primitive structure that captures predictive relationships\nbetween each other. Then, we exploit a small set of anchor primitives for\nprediction, allowing the majority of primitives to be encapsulated into highly\ncompact residual forms. Moreover, we develop a rate-constrained optimization\nscheme to eliminate redundancies within such hybrid primitives, steering our\nCompGS towards an optimal trade-off between bitrate consumption and\nrepresentation efficacy. Experimental results show that the proposed CompGS\nsignificantly outperforms existing methods, achieving superior compactness in\n3D scene representation without compromising model accuracy and rendering\nquality. Our code will be released on GitHub for further research.\n","authors":["Xiangrui Liu","Xinju Wu","Pingping Zhang","Shiqi Wang","Zhu Li","Sam Kwong"],"pdf_url":"https://arxiv.org/pdf/2404.09458v1.pdf","comment":"Submitted to a conference"},{"id":"http://arxiv.org/abs/2404.09454v1","updated":"2024-04-15T04:43:53Z","published":"2024-04-15T04:43:53Z","title":"Utility-Fairness Trade-Offs and How to Find Them","summary":" When building classification systems with demographic fairness\nconsiderations, there are two objectives to satisfy: 1) maximizing utility for\nthe specific task and 2) ensuring fairness w.r.t. a known demographic\nattribute. These objectives often compete, so optimizing both can lead to a\ntrade-off between utility and fairness. While existing works acknowledge the\ntrade-offs and study their limits, two questions remain unanswered: 1) What are\nthe optimal trade-offs between utility and fairness? and 2) How can we\nnumerically quantify these trade-offs from data for a desired prediction task\nand demographic attribute of interest? This paper addresses these questions. We\nintroduce two utility-fairness trade-offs: the Data-Space and Label-Space\nTrade-off. The trade-offs reveal three regions within the utility-fairness\nplane, delineating what is fully and partially possible and impossible. We\npropose U-FaTE, a method to numerically quantify the trade-offs for a given\nprediction task and group fairness definition from data samples. Based on the\ntrade-offs, we introduce a new scheme for evaluating representations. 
An\nextensive evaluation of fair representation learning methods and\nrepresentations from over 1000 pre-trained models revealed that most current\napproaches are far from the estimated and achievable fairness-utility\ntrade-offs across multiple datasets and prediction tasks.\n","authors":["Sepehr Dehdashtian","Bashir Sadeghi","Vishnu Naresh Boddeti"],"pdf_url":"https://arxiv.org/pdf/2404.09454v1.pdf","comment":"IEEE/CVF Conference on Computer Vision and Pattern Recognition, 2024"},{"id":"http://arxiv.org/abs/2404.05317v3","updated":"2024-04-15T04:37:44Z","published":"2024-04-08T09:08:43Z","title":"WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A\n Conceptual Architecture","summary":" This work proposes a WebXR-based cross-platform conceptual architecture,\nleveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate\nthe development of an open, accessible, and interoperable metaverse. By\nintroducing the concept of spatial web app, this research contributes to the\ndiscourse on the metaverse, offering an architecture that democratizes access\nto virtual environments and extended reality through the web, and aligns with\nTim Berners-Lee's original vision of the World Wide Web as an open platform in\nthe digital realm.\n","authors":["Giuseppe Macario"],"pdf_url":"https://arxiv.org/pdf/2404.05317v3.pdf","comment":"updated section II-C (\"A-Frame\"), updated references"},{"id":"http://arxiv.org/abs/2404.09451v1","updated":"2024-04-15T04:31:24Z","published":"2024-04-15T04:31:24Z","title":"Contrastive Mean-Shift Learning for Generalized Category Discovery","summary":" We address the problem of generalized category discovery (GCD) that aims to\npartition a partially labeled collection of images; only a small part of the\ncollection is labeled and the total number of target classes is unknown. To\naddress this generalized image clustering problem, we revisit the mean-shift\nalgorithm, i.e., a classic, powerful technique for mode seeking, and\nincorporate it into a contrastive learning framework. The proposed method,\ndubbed Contrastive Mean-Shift (CMS) learning, trains an image encoder to\nproduce representations with better clustering properties by an iterative\nprocess of mean shift and contrastive update. Experiments demonstrate that our\nmethod, both in settings with and without the total number of clusters being\nknown, achieves state-of-the-art performance on six public GCD benchmarks\nwithout bells and whistles.\n","authors":["Sua Choi","Dahyun Kang","Minsu Cho"],"pdf_url":"https://arxiv.org/pdf/2404.09451v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09447v1","updated":"2024-04-15T04:20:01Z","published":"2024-04-15T04:20:01Z","title":"kNN-CLIP: Retrieval Enables Training-Free Segmentation on Continually\n Expanding Large Vocabularies","summary":" Rapid advancements in continual segmentation have yet to bridge the gap of\nscaling to large continually expanding vocabularies under compute-constrained\nscenarios. We discover that traditional continual training leads to\ncatastrophic forgetting under compute constraints, unable to outperform\nzero-shot segmentation methods. We introduce a novel strategy for semantic and\npanoptic segmentation with zero forgetting, capable of adapting to continually\ngrowing vocabularies without the need for retraining or large memory costs. 
Our\ntraining-free approach, kNN-CLIP, leverages a database of instance embeddings\nto enable open-vocabulary segmentation approaches to continually expand their\nvocabulary on any given domain with a single-pass through data, while only\nstoring embeddings minimizing both compute and memory costs. This method\nachieves state-of-the-art mIoU performance across large-vocabulary semantic and\npanoptic segmentation datasets. We hope kNN-CLIP represents a step forward in\nenabling more efficient and adaptable continual segmentation, paving the way\nfor advances in real-world large-vocabulary continual segmentation methods.\n","authors":["Zhongrui Gui","Shuyang Sun","Runjia Li","Jianhao Yuan","Zhaochong An","Karsten Roth","Ameya Prabhu","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2404.09447v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.09445v1","updated":"2024-04-15T04:14:42Z","published":"2024-04-15T04:14:42Z","title":"Exploring Text-to-Motion Generation with Human Preference","summary":" This paper presents an exploration of preference learning in text-to-motion\ngeneration. We find that current improvements in text-to-motion generation\nstill rely on datasets requiring expert labelers with motion capture systems.\nInstead, learning from human preference data does not require motion capture\nsystems; a labeler with no expertise simply compares two generated motions.\nThis is particularly efficient because evaluating the model's output is easier\nthan gathering the motion that performs a desired task (e.g. backflip). To\npioneer the exploration of this paradigm, we annotate 3,528 preference pairs\ngenerated by MotionGPT, marking the first effort to investigate various\nalgorithms for learning from preference data. In particular, our exploration\nhighlights important design choices when using preference data. Additionally,\nour experimental results show that preference learning has the potential to\ngreatly improve current text-to-motion generative models. Our code and dataset\nare publicly available at\nhttps://github.com/THU-LYJ-Lab/InstructMotion}{https://github.com/THU-LYJ-Lab/InstructMotion\nto further facilitate research in this area.\n","authors":["Jenny Sheng","Matthieu Lin","Andrew Zhao","Kevin Pruvost","Yu-Hui Wen","Yangguang Li","Gao Huang","Yong-Jin Liu"],"pdf_url":"https://arxiv.org/pdf/2404.09445v1.pdf","comment":"Accepted to CVPR 2024 HuMoGen Workshop"},{"id":"http://arxiv.org/abs/2402.09055v3","updated":"2024-04-15T03:23:07Z","published":"2024-02-14T10:05:19Z","title":"Comment-aided Video-Language Alignment via Contrastive Pre-training for\n Short-form Video Humor Detection","summary":" The growing importance of multi-modal humor detection within affective\ncomputing correlates with the expanding influence of short-form video sharing\non social media platforms. In this paper, we propose a novel two-branch\nhierarchical model for short-form video humor detection (SVHD), named\nComment-aided Video-Language Alignment (CVLA) via data-augmented multi-modal\ncontrastive pre-training. Notably, our CVLA not only operates on raw signals\nacross various modal channels but also yields an appropriate multi-modal\nrepresentation by aligning the video and language components within a\nconsistent semantic space. The experimental results on two humor detection\ndatasets, including DY11k and UR-FUNNY, demonstrate that CVLA dramatically\noutperforms state-of-the-art and several competitive baseline approaches. 
Our\ndataset, code and model release at https://github.com/yliu-cs/CVLA.\n","authors":["Yang Liu","Tongfei Shen","Dong Zhang","Qingying Sun","Shoushan Li","Guodong Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.09055v3.pdf","comment":"Accepted by ICMR 2024"},{"id":"http://arxiv.org/abs/2308.06603v3","updated":"2024-04-15T03:20:41Z","published":"2023-08-12T16:14:44Z","title":"LadleNet: A Two-Stage UNet for Infrared Image to Visible Image\n Translation Guided by Semantic Segmentation","summary":" The translation of thermal infrared (TIR) images into visible light (VI)\nimages plays a critical role in enhancing model performance and generalization\ncapability, particularly in various fields such as registration and fusion of\nTIR and VI images. However, current research in this field faces challenges of\ninsufficiently realistic image quality after translation and the difficulty of\nexisting models in adapting to unseen scenarios. In order to develop a more\ngeneralizable image translation architecture, we conducted an analysis of\nexisting translation architectures. By exploring the interpretability of\nintermediate modalities in existing translation architectures, we found that\nthe intermediate modality in the image translation process for street scene\nimages essentially performs semantic segmentation, distinguishing street images\nbased on background and foreground patterns before assigning color information.\nBased on these principles, we propose an improved algorithm based on U-net\ncalled LadleNet. This network utilizes a two-stage U-net concatenation\nstructure, consisting of Handle and Bowl modules. The Handle module is\nresponsible for constructing an abstract semantic space, while the Bowl module\ndecodes the semantic space to obtain the mapped VI image. Due to the\ncharacteristic of semantic segmentation, the Handle module has strong\nextensibility. Therefore, we also propose LadleNet+, which replaces the Handle\nmodule in LadleNet with a pre-trained DeepLabv3+ network, enabling the model to\nhave a more powerful capability in constructing semantic space. The proposed\nmethods were trained and tested on the KAIST dataset, followed by quantitative\nand qualitative analysis. Compared to existing methods, LadleNet and LadleNet+\nachieved an average improvement of 12.4% and 15.2% in SSIM metrics, and 37.9%\nand 50.6% in MS-SSIM metrics, respectively.\n","authors":["Tonghui Zou","Lei Chen"],"pdf_url":"https://arxiv.org/pdf/2308.06603v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09432v1","updated":"2024-04-15T03:12:17Z","published":"2024-04-15T03:12:17Z","title":"The 8th AI City Challenge","summary":" The eighth AI City Challenge highlighted the convergence of computer vision\nand artificial intelligence in areas like retail, warehouse settings, and\nIntelligent Traffic Systems (ITS), presenting significant research\nopportunities. The 2024 edition featured five tracks, attracting unprecedented\ninterest from 726 teams in 47 countries and regions. Track 1 dealt with\nmulti-target multi-camera (MTMC) people tracking, highlighting significant\nenhancements in camera count, character number, 3D annotation, and camera\nmatrices, alongside new rules for 3D tracking and online tracking algorithm\nencouragement. Track 2 introduced dense video captioning for traffic safety,\nfocusing on pedestrian accidents using multi-camera feeds to improve insights\nfor insurance and prevention. Track 3 required teams to classify driver actions\nin a naturalistic driving analysis. 
Track 4 explored fish-eye camera analytics\nusing the FishEye8K dataset. Track 5 focused on motorcycle helmet rule\nviolation detection. The challenge utilized two leaderboards to showcase\nmethods, with participants setting new benchmarks, some surpassing existing\nstate-of-the-art achievements.\n","authors":["Shuo Wang","David C. Anastasiu","Zheng Tang","Ming-Ching Chang","Yue Yao","Liang Zheng","Mohammed Shaiqur Rahman","Meenakshi S. Arya","Anuj Sharma","Pranamesh Chakraborty","Sanjita Prajapati","Quan Kong","Norimasa Kobori","Munkhjargal Gochoo","Munkh-Erdene Otgonbold","Fady Alnajjar","Ganzorig Batnasan","Ping-Yang Chen","Jun-Wei Hsieh","Xunlei Wu","Sameer Satish Pusegaonkar","Yizhou Wang","Sujit Biswas","Rama Chellappa"],"pdf_url":"https://arxiv.org/pdf/2404.09432v1.pdf","comment":"Summary of the 8th AI City Challenge Workshop in conjunction with\n CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09431v1","updated":"2024-04-15T03:12:12Z","published":"2024-04-15T03:12:12Z","title":"VFMM3D: Releasing the Potential of Image by Vision Foundation Model for\n Monocular 3D Object Detection","summary":" Due to its cost-effectiveness and widespread availability, monocular 3D\nobject detection, which relies solely on a single camera during inference,\nholds significant importance across various applications, including autonomous\ndriving and robotics. Nevertheless, directly predicting the coordinates of\nobjects in 3D space from monocular images poses challenges. Therefore, an\neffective solution involves transforming monocular images into LiDAR-like\nrepresentations and employing a LiDAR-based 3D object detector to predict the\n3D coordinates of objects. The key step in this method is accurately converting\nthe monocular image into a reliable point cloud form. In this paper, we present\nVFMM3D, an innovative approach that leverages the capabilities of Vision\nFoundation Models (VFMs) to accurately transform single-view images into LiDAR\npoint cloud representations. VFMM3D utilizes the Segment Anything Model (SAM)\nand Depth Anything Model (DAM) to generate high-quality pseudo-LiDAR data\nenriched with rich foreground information. Specifically, the Depth Anything\nModel (DAM) is employed to generate dense depth maps. Subsequently, the Segment\nAnything Model (SAM) is utilized to differentiate foreground and background\nregions by predicting instance masks. These predicted instance masks and depth\nmaps are then combined and projected into 3D space to generate pseudo-LiDAR\npoints. Finally, any object detectors based on point clouds can be utilized to\npredict the 3D coordinates of objects. Comprehensive experiments are conducted\non the challenging 3D object detection dataset KITTI. Our VFMM3D establishes a\nnew state-of-the-art performance. Additionally, experimental results\ndemonstrate the generality of VFMM3D, showcasing its seamless integration into\nvarious LiDAR-based 3D object detectors.\n","authors":["Bonan Ding","Jin Xie","Jing Nie","Jiale Cao"],"pdf_url":"https://arxiv.org/pdf/2404.09431v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.09220v2","updated":"2024-04-15T02:47:01Z","published":"2023-07-18T12:52:49Z","title":"A Survey on Open-Vocabulary Detection and Segmentation: Past, Present,\n and Future","summary":" As the most fundamental scene understanding tasks, object detection and\nsegmentation have made tremendous progress in deep learning era. 
Due to the\nexpensive manual labeling cost, the annotated categories in existing datasets\nare often small-scale and pre-defined, i.e., state-of-the-art fully-supervised\ndetectors and segmentors fail to generalize beyond the closed vocabulary. To\nresolve this limitation, in the last few years, the community has witnessed an\nincreasing attention toward Open-Vocabulary Detection (OVD) and Segmentation\n(OVS). By ``open-vocabulary'', we mean that the models can classify objects\nbeyond pre-defined categories. In this survey, we provide a comprehensive\nreview on recent developments of OVD and OVS. A taxonomy is first developed to\norganize different tasks and methodologies. We find that the permission and\nusage of weak supervision signals can well discriminate different\nmethodologies, including: visual-semantic space mapping, novel visual feature\nsynthesis, region-aware training, pseudo-labeling, knowledge distillation, and\ntransfer learning. The proposed taxonomy is universal across different tasks,\ncovering object detection, semantic/instance/panoptic segmentation, 3D and\nvideo understanding. The main design principles, key challenges, development\nroutes, methodology strengths, and weaknesses are thoroughly analyzed. In\naddition, we benchmark each task along with the vital components of each method\nin appendix and updated online at\nhttps://github.com/seanzhuh/awesome-open-vocabulary-detection-and-segmentation.\nFinally, several promising directions are provided and discussed to stimulate\nfuture research.\n","authors":["Chaoyang Zhu","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2307.09220v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09426v1","updated":"2024-04-15T02:44:23Z","published":"2024-04-15T02:44:23Z","title":"ViFu: Multiple 360$^\\circ$ Objects Reconstruction with Clean Background\n via Visible Part Fusion","summary":" In this paper, we propose a method to segment and recover a static, clean\nbackground and multiple 360$^\\circ$ objects from observations of scenes at\ndifferent timestamps. Recent works have used neural radiance fields to model 3D\nscenes and improved the quality of novel view synthesis, while few studies have\nfocused on modeling the invisible or occluded parts of the training images.\nThese under-reconstruction parts constrain both scene editing and rendering\nview selection, thereby limiting their utility for synthetic data generation\nfor downstream tasks. Our basic idea is that, by observing the same set of\nobjects in various arrangements, parts that are invisible in one scene\nmay become visible in others. By fusing the visible parts from each scene,\nocclusion-free rendering of both background and foreground objects can be\nachieved.\n We decompose the multi-scene fusion task into two main components: (1)\nobjects/background segmentation and alignment, where we leverage point\ncloud-based methods tailored to our novel problem formulation; (2) radiance\nfields fusion, where we introduce visibility field to quantify the visible\ninformation of radiance fields, and propose visibility-aware rendering for the\nfusion of series of scenes, ultimately obtaining clean background and\n360$^\\circ$ object rendering. 
Comprehensive experiments were conducted on\nsynthetic and real datasets, and the results demonstrate the effectiveness of\nour method.\n","authors":["Tianhan Xu","Takuya Ikeda","Koichi Nishiwaki"],"pdf_url":"https://arxiv.org/pdf/2404.09426v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2404.09425v1","updated":"2024-04-15T02:41:55Z","published":"2024-04-15T02:41:55Z","title":"Super-resolution of biomedical volumes with 2D supervision","summary":" Volumetric biomedical microscopy has the potential to increase the diagnostic\ninformation extracted from clinical tissue specimens and improve the diagnostic\naccuracy of both human pathologists and computational pathology models.\nUnfortunately, barriers to integrating 3-dimensional (3D) volumetric microscopy\ninto clinical medicine include long imaging times, poor depth / z-axis\nresolution, and an insufficient amount of high-quality volumetric data.\nLeveraging the abundance of high-resolution 2D microscopy data, we introduce\nmasked slice diffusion for super-resolution (MSDSR), which exploits the\ninherent equivalence in the data-generating distribution across all spatial\ndimensions of biological specimens. This intrinsic characteristic allows for\nsuper-resolution models trained on high-resolution images from one plane (e.g.,\nXY) to effectively generalize to others (XZ, YZ), overcoming the traditional\ndependency on orientation. We focus on the application of MSDSR to stimulated\nRaman histology (SRH), an optical imaging modality for biological specimen\nanalysis and intraoperative diagnosis, characterized by its rapid acquisition\nof high-resolution 2D images but slow and costly optical z-sectioning. To\nevaluate MSDSR's efficacy, we introduce a new performance metric, SliceFID, and\ndemonstrate MSDSR's superior performance over baseline models through extensive\nevaluations. Our findings reveal that MSDSR not only significantly enhances the\nquality and resolution of 3D volumetric data, but also addresses major\nobstacles hindering the broader application of 3D volumetric microscopy in\nclinical diagnostics and biomedical research.\n","authors":["Cheng Jiang","Alexander Gedeon","Yiwei Lyu","Eric Landgraf","Yufeng Zhang","Xinhai Hou","Akhil Kondepudi","Asadur Chowdury","Honglak Lee","Todd Hollon"],"pdf_url":"https://arxiv.org/pdf/2404.09425v1.pdf","comment":"CVPR Workshop on Computer Vision for Microscopy Image Analysis 2024"},{"id":"http://arxiv.org/abs/2404.07487v2","updated":"2024-04-15T02:25:22Z","published":"2024-04-11T05:51:06Z","title":"Fine-Grained Side Information Guided Dual-Prompts for Zero-Shot Skeleton\n Action Recognition","summary":" Skeleton-based zero-shot action recognition aims to recognize unknown human\nactions based on the learned priors of the known skeleton-based actions and a\nsemantic descriptor space shared by both known and unknown categories. However,\nprevious works focus on establishing the bridges between the known skeleton\nrepresentation space and semantic descriptions space at the coarse-grained\nlevel for recognizing unknown action categories, ignoring the fine-grained\nalignment of these two spaces, resulting in suboptimal performance in\ndistinguishing high-similarity action categories. 
To address these challenges,\nwe propose a novel method via Side information and dual-prompts learning for\nskeleton-based zero-shot action recognition (STAR) at the fine-grained level.\nSpecifically, 1) we decompose the skeleton into several parts based on its\ntopology structure and introduce the side information concerning multi-part\ndescriptions of human body movements for alignment between the skeleton and the\nsemantic space at the fine-grained level; 2) we design the visual-attribute and\nsemantic-part prompts to improve the intra-class compactness within the\nskeleton space and inter-class separability within the semantic space,\nrespectively, to distinguish the high-similarity actions. Extensive experiments\nshow that our method achieves state-of-the-art performance in ZSL and GZSL\nsettings on NTU RGB+D, NTU RGB+D 120, and PKU-MMD datasets.\n","authors":["Yang Chen","Jingcai Guo","Tian He","Ling Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07487v2.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.08449v2","updated":"2024-04-15T02:10:45Z","published":"2024-04-12T13:00:06Z","title":"OccGaussian: 3D Gaussian Splatting for Occluded Human Rendering","summary":" Rendering dynamic 3D humans from monocular videos is crucial for various\napplications such as virtual reality and digital entertainment. Most methods\nassume the person is in an unobstructed scene, while various objects may cause\nthe occlusion of body parts in real-life scenarios. Previous methods utilize\nNeRF for surface rendering to recover the occluded areas, but they require more\nthan one day to train and several seconds to render, failing to meet the\nrequirements of real-time interactive applications. To address these issues, we\npropose OccGaussian based on 3D Gaussian Splatting, which can be trained within\n6 minutes and produces high-quality human renderings up to 160 FPS with\noccluded input. OccGaussian initializes 3D Gaussian distributions in the\ncanonical space, and we perform an occlusion feature query at occluded regions;\nthe aggregated pixel-aligned feature is extracted to compensate for the missing\ninformation. Then we use a Gaussian Feature MLP to further process the feature\nalong with the occlusion-aware loss functions to better perceive the occluded\narea. Extensive experiments in both simulated and real-world occlusions\ndemonstrate that our method achieves comparable or even superior performance\ncompared to the state-of-the-art method. Moreover, we improve training and\ninference speeds by 250x and 800x, respectively. Our code will be available for\nresearch purposes.\n","authors":["Jingrui Ye","Zongkai Zhang","Yujiao Jiang","Qingmin Liao","Wenming Yang","Zongqing Lu"],"pdf_url":"https://arxiv.org/pdf/2404.08449v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09415v1","updated":"2024-04-15T02:02:15Z","published":"2024-04-15T02:02:15Z","title":"A Review on Machine Learning Algorithms for Dust Aerosol Detection using\n Satellite Data","summary":" Dust storms are associated with certain respiratory illnesses across\ndifferent areas in the world. Researchers have devoted time and resources to\nstudy the elements surrounding dust storm phenomena. This paper reviews the\nefforts of those who have investigated dust aerosols using sensors onboard\nsatellites using machine learning-based approaches. We have reviewed the most\ncommon issues surrounding dust aerosol modeling using different datasets and\ndifferent sensors from a historical perspective. 
Our findings suggest that\nmulti-spectral approaches based on linear and non-linear combinations of\nspectral bands are some of the most successful for visualization and\nquantitative analysis; however, when researchers have leveraged machine\nlearning, performance has been improved and new opportunities to solve unique\nproblems arise.\n","authors":["Nurul Rafi","Pablo Rivas"],"pdf_url":"https://arxiv.org/pdf/2404.09415v1.pdf","comment":"The 23rd International Conference on Artificial Intelligence (ICAI\n 2021)"},{"id":"http://arxiv.org/abs/2404.09412v1","updated":"2024-04-15T01:58:54Z","published":"2024-04-15T01:58:54Z","title":"DeferredGS: Decoupled and Editable Gaussian Splatting with Deferred\n Shading","summary":" Reconstructing and editing 3D objects and scenes both play crucial roles in\ncomputer graphics and computer vision. Neural radiance fields (NeRFs) can\nachieve realistic reconstruction and editing results but suffer from\ninefficiency in rendering. Gaussian splatting significantly accelerates\nrendering by rasterizing Gaussian ellipsoids. However, Gaussian splatting\nutilizes a single Spherical Harmonic (SH) function to model both texture and\nlighting, limiting independent editing capabilities of these components.\nRecently, attempts have been made to decouple texture and lighting with the\nGaussian splatting representation but may fail to produce plausible geometry\nand decomposition results on reflective scenes. Additionally, the forward\nshading technique they employ introduces noticeable blending artifacts during\nrelighting, as the geometry attributes of Gaussians are optimized under the\noriginal illumination and may not be suitable for novel lighting conditions. To\naddress these issues, we introduce DeferredGS, a method for decoupling and\nediting the Gaussian splatting representation using deferred shading. To\nachieve successful decoupling, we model the illumination with a learnable\nenvironment map and define additional attributes such as texture parameters and\nnormal direction on Gaussians, where the normal is distilled from a jointly\ntrained signed distance function. More importantly, we apply deferred shading,\nresulting in more realistic relighting effects compared to previous methods.\nBoth qualitative and quantitative experiments demonstrate the superior\nperformance of DeferredGS in novel view synthesis and editing tasks.\n","authors":["Tong Wu","Jia-Mu Sun","Yu-Kun Lai","Yuewen Ma","Leif Kobbelt","Lin Gao"],"pdf_url":"https://arxiv.org/pdf/2404.09412v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13313v2","updated":"2024-04-15T01:49:23Z","published":"2023-12-20T09:16:47Z","title":"ParamISP: Learned Forward and Inverse ISPs using Camera Parameters","summary":" RAW images are rarely shared mainly due to its excessive data size compared\nto their sRGB counterparts obtained by camera ISPs. Learning the forward and\ninverse processes of camera ISPs has been recently demonstrated, enabling\nphysically-meaningful RAW-level image processing on input sRGB images. However,\nexisting learning-based ISP methods fail to handle the large variations in the\nISP processes with respect to camera parameters such as ISO and exposure time,\nand have limitations when used for various applications. In this paper, we\npropose ParamISP, a learning-based method for forward and inverse conversion\nbetween sRGB and RAW images, that adopts a novel neural-network module to\nutilize camera parameters, which is dubbed as ParamNet. 
Given the camera\nparameters provided in the EXIF data, ParamNet converts them into a feature\nvector to control the ISP networks. Extensive experiments demonstrate that\nParamISP achieve superior RAW and sRGB reconstruction results compared to\nprevious methods and it can be effectively used for a variety of applications\nsuch as deblurring dataset synthesis, raw deblurring, HDR reconstruction, and\ncamera-to-camera transfer.\n","authors":["Woohyeok Kim","Geonu Kim","Junyong Lee","Seungyong Lee","Seung-Hwan Baek","Sunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2312.13313v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09406v1","updated":"2024-04-15T01:47:44Z","published":"2024-04-15T01:47:44Z","title":"Human-in-the-Loop Segmentation of Multi-species Coral Imagery","summary":" Broad-scale marine surveys performed by underwater vehicles significantly\nincrease the availability of coral reef imagery, however it is costly and\ntime-consuming for domain experts to label images. Point label propagation is\nan approach used to leverage existing image data labeled with sparse point\nlabels. The resulting augmented ground truth generated is then used to train a\nsemantic segmentation model. Here, we first demonstrate that recent advances in\nfoundation models enable generation of multi-species coral augmented ground\ntruth masks using denoised DINOv2 features and K-Nearest Neighbors (KNN),\nwithout the need for any pre-training or custom-designed algorithms. For\nextremely sparsely labeled images, we propose a labeling regime based on\nhuman-in-the-loop principles, resulting in significant improvement in\nannotation efficiency: If only 5 point labels per image are available, our\nproposed human-in-the-loop approach improves on the state-of-the-art by 17.3%\nfor pixel accuracy and 22.6% for mIoU; and by 10.6% and 19.1% when 10 point\nlabels per image are available. Even if the human-in-the-loop labeling regime\nis not used, the denoised DINOv2 features with a KNN outperforms the prior\nstate-of-the-art by 3.5% for pixel accuracy and 5.7% for mIoU (5 grid points).\nWe also provide a detailed analysis of how point labeling style and the\nquantity of points per image affects the point label propagation quality and\nprovide general recommendations on maximizing point label efficiency.\n","authors":["Scarlett Raine","Ross Marchant","Brano Kusy","Frederic Maire","Niko Suenderhauf","Tobias Fischer"],"pdf_url":"https://arxiv.org/pdf/2404.09406v1.pdf","comment":"10 pages, 6 figures, an additional 4 pages of supplementary material"},{"id":"http://arxiv.org/abs/2307.11259v2","updated":"2024-04-15T01:31:57Z","published":"2023-07-20T22:35:27Z","title":"Investigating Low Data, Confidence Aware Image Prediction on Smooth\n Repetitive Videos using Gaussian Processes","summary":" The ability to predict future states is crucial to informed decision-making\nwhile interacting with dynamic environments. With cameras providing a prevalent\nand information-rich sensing modality, the problem of predicting future states\nfrom image sequences has garnered a lot of attention. Current state-of-the-art\nmethods typically train large parametric models for their predictions. Though\noften able to predict with accuracy these models often fail to provide\ninterpretable confidence metrics around their predictions. Additionally these\nmethods are reliant on the availability of large training datasets to converge\nto useful solutions. 
In this paper, we focus on the problem of predicting\nfuture images of an image sequence with interpretable confidence bounds from\nvery little training data. To approach this problem, we use non-parametric\nmodels to take a probabilistic approach to image prediction. We generate\nprobability distributions over sequentially predicted images, and propagate\nuncertainty through time to generate a confidence metric for our predictions.\nGaussian Processes are used for their data efficiency and ability to readily\nincorporate new training data online. Our methods predictions are evaluated on\na smooth fluid simulation environment. We showcase the capabilities of our\napproach on real world data by predicting pedestrian flows and weather patterns\nfrom satellite imagery.\n","authors":["Nikhil U. Shinde","Xiao Liang","Florian Richter","Michael C. Yip"],"pdf_url":"https://arxiv.org/pdf/2307.11259v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09401v1","updated":"2024-04-15T01:27:07Z","published":"2024-04-15T01:27:07Z","title":"Watermark-embedded Adversarial Examples for Copyright Protection against\n Diffusion Models","summary":" Diffusion Models (DMs) have shown remarkable capabilities in various\nimage-generation tasks. However, there are growing concerns that DMs could be\nused to imitate unauthorized creations and thus raise copyright issues. To\naddress this issue, we propose a novel framework that embeds personal\nwatermarks in the generation of adversarial examples. Such examples can force\nDMs to generate images with visible watermarks and prevent DMs from imitating\nunauthorized images. We construct a generator based on conditional adversarial\nnetworks and design three losses (adversarial loss, GAN loss, and perturbation\nloss) to generate adversarial examples that have subtle perturbation but can\neffectively attack DMs to prevent copyright violations. Training a generator\nfor a personal watermark by our method only requires 5-10 samples within 2-3\nminutes, and once the generator is trained, it can generate adversarial\nexamples with that watermark significantly fast (0.2s per image). We conduct\nextensive experiments in various conditional image-generation scenarios.\nCompared to existing methods that generate images with chaotic textures, our\nmethod adds visible watermarks on the generated images, which is a more\nstraightforward way to indicate copyright violations. We also observe that our\nadversarial examples exhibit good transferability across unknown generative\nmodels. Therefore, this work provides a simple yet powerful way to protect\ncopyright from DM-based imitation.\n","authors":["Peifei Zhu","Tsubasa Takahashi","Hirokatsu Kataoka"],"pdf_url":"https://arxiv.org/pdf/2404.09401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00513v3","updated":"2024-04-15T01:15:34Z","published":"2024-03-31T01:20:16Z","title":"Transformer based Pluralistic Image Completion with Reduced Information\n Loss","summary":" Transformer based methods have achieved great success in image inpainting\nrecently. However, we find that these solutions regard each pixel as a token,\nthus suffering from an information loss issue from two aspects: 1) They\ndownsample the input image into much lower resolutions for efficiency\nconsideration. 2) They quantize $256^3$ RGB values to a small number (such as\n512) of quantized color values. The indices of quantized pixels are used as\ntokens for the inputs and prediction targets of the transformer. 
To mitigate\nthese issues, we propose a new transformer based framework called \"PUT\".\nSpecifically, to avoid input downsampling while maintaining computation\nefficiency, we design a patch-based auto-encoder P-VQVAE. The encoder converts\nthe masked image into non-overlapped patch tokens and the decoder recovers the\nmasked regions from the inpainted tokens while keeping the unmasked regions\nunchanged. To eliminate the information loss caused by input quantization, an\nUn-quantized Transformer is applied. It directly takes features from the\nP-VQVAE encoder as input without any quantization and only regards the\nquantized tokens as prediction targets. Furthermore, to make the inpainting\nprocess more controllable, we introduce semantic and structural conditions as\nextra guidance. Extensive experiments show that our method greatly outperforms\nexisting transformer based methods on image fidelity and achieves much higher\ndiversity and better fidelity than state-of-the-art pluralistic inpainting\nmethods on complex large-scale datasets (e.g., ImageNet). Codes are available\nat https://github.com/liuqk3/PUT.\n","authors":["Qiankun Liu","Yuqi Jiang","Zhentao Tan","Dongdong Chen","Ying Fu","Qi Chu","Gang Hua","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2404.00513v3.pdf","comment":"Accepted by TPAMI (2024). arXiv admin note: text overlap with\n arXiv:2205.05076"},{"id":"http://arxiv.org/abs/2104.00170v3","updated":"2024-04-15T01:03:11Z","published":"2021-04-01T00:14:45Z","title":"Are Bias Mitigation Techniques for Deep Learning Effective?","summary":" A critical problem in deep learning is that systems learn inappropriate\nbiases, resulting in their inability to perform well on minority groups. This\nhas led to the creation of multiple algorithms that endeavor to mitigate bias.\nHowever, it is not clear how effective these methods are. This is because study\nprotocols differ among papers, systems are tested on datasets that fail to test\nmany forms of bias, and systems have access to hidden knowledge or are tuned\nspecifically to the test set. To address this, we introduce an improved\nevaluation protocol, sensible metrics, and a new dataset, which enables us to\nask and answer critical questions about bias mitigation algorithms. We evaluate\nseven state-of-the-art algorithms using the same network architecture and\nhyperparameter selection policy across three benchmark datasets. We introduce a\nnew dataset called Biased MNIST that enables assessment of robustness to\nmultiple bias sources. We use Biased MNIST and a visual question answering\n(VQA) benchmark to assess robustness to hidden biases. Rather than only tuning\nto the test set distribution, we study robustness across different tuning\ndistributions, which is critical because for many applications the test\ndistribution may not be known during development. We find that algorithms\nexploit hidden biases, are unable to scale to multiple forms of bias, and are\nhighly sensitive to the choice of tuning set. Based on our findings, we implore\nthe community to adopt more rigorous assessment of future bias mitigation\nmethods. 
All data, code, and results are publicly available at:\nhttps://github.com/erobic/bias-mitigators.\n","authors":["Robik Shrestha","Kushal Kafle","Christopher Kanan"],"pdf_url":"https://arxiv.org/pdf/2104.00170v3.pdf","comment":"WACV 2022"},{"id":"http://arxiv.org/abs/2404.09389v1","updated":"2024-04-15T00:19:47Z","published":"2024-04-15T00:19:47Z","title":"Masked and Shuffled Blind Spot Denoising for Real-World Images","summary":" We introduce a novel approach to single image denoising based on the Blind\nSpot Denoising principle, which we call MAsked and SHuffled Blind Spot\nDenoising (MASH). We focus on the case of correlated noise, which often plagues\nreal images. MASH is the result of a careful analysis to determine the\nrelationships between the level of blindness (masking) of the input and the\n(unknown) noise correlation. Moreover, we introduce a shuffling technique to\nweaken the local correlation of noise, which in turn yields an additional\ndenoising performance improvement. We evaluate MASH via extensive experiments\non real-world noisy image datasets. We demonstrate on par or better results\ncompared to existing self-supervised denoising methods.\n","authors":["Hamadi Chihaoui","Paolo Favaro"],"pdf_url":"https://arxiv.org/pdf/2404.09389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09387v1","updated":"2024-04-15T00:12:27Z","published":"2024-04-15T00:12:27Z","title":"RankCLIP: Ranking-Consistent Language-Image Pretraining","summary":" Among the ever-evolving development of vision-language models, contrastive\nlanguage-image pretraining (CLIP) has set new benchmarks in many downstream\ntasks such as zero-shot classifications by leveraging self-supervised\ncontrastive learning on large amounts of text-image pairs. However, its\ndependency on rigid one-to-one mappings overlooks the complex and often\nmultifaceted relationships between and within texts and images. To this end, we\nintroduce RankCLIP, a novel pretraining method that extends beyond the rigid\none-to-one matching framework of CLIP and its variants. By leveraging both\nin-modal and cross-modal ranking consistency, RankCLIP improves the alignment\nprocess, enabling it to capture the nuanced many-to-many relationships between\nand within each modality. Through comprehensive experiments, we demonstrate the\nenhanced capability of RankCLIP to effectively improve performance across\nvarious downstream tasks, notably achieving significant gains in zero-shot\nclassifications over state-of-the-art methods, underscoring the potential of\nRankCLIP in further advancing vision-language pretraining.\n","authors":["Yiming Zhang","Zhuokai Zhao","Zhaorun Chen","Zhili Feng","Zenghui Ding","Yining Sun"],"pdf_url":"https://arxiv.org/pdf/2404.09387v1.pdf","comment":"10 pages, 3 figures, 6 tables. Code and model checkpoints are\n available at https://github.com/Jam1ezhang/RankCLIP"},{"id":"http://arxiv.org/abs/2404.08419v2","updated":"2024-04-15T15:30:32Z","published":"2024-04-12T12:08:06Z","title":"Direct May Not Be the Best: An Incremental Evolution View of Pose\n Generation","summary":" Pose diversity is an inherent representative characteristic of 2D images. Due\nto the 3D to 2D projection mechanism, there is evident content discrepancy\namong distinct pose images. This is the main obstacle bothering pose\ntransformation related researches. To deal with this challenge, we propose a\nfine-grained incremental evolution centered pose generation framework, rather\nthan traditional direct one-to-one in a rush. 
Since proposed approach actually\nbypasses the theoretical difficulty of directly modeling dramatic non-linear\nvariation, the incurred content distortion and blurring could be effectively\nconstrained, at the same time the various individual pose details, especially\nclothes texture, could be precisely maintained. In order to systematically\nguide the evolution course, both global and incremental evolution constraints\nare elaborately designed and merged into the overall framework. And a novel\ntriple-path knowledge fusion structure is worked out to take full advantage of\nall available valuable knowledge to conduct high-quality pose synthesis. In\naddition, our framework could generate a series of valuable byproducts, namely\nthe various intermediate poses. Extensive experiments have been conducted to\nverify the effectiveness of the proposed approach. Code is available at\nhttps://github.com/Xiaofei-CN/Incremental-Evolution-Pose-Generation.\n","authors":["Yuelong Li","Tengfei Xiao","Lei Geng","Jianming Wang"],"pdf_url":"https://arxiv.org/pdf/2404.08419v2.pdf","comment":"Accepted at AAAI2024"},{"id":"http://arxiv.org/abs/2404.00722v4","updated":"2024-04-15T17:53:44Z","published":"2024-03-31T15:34:45Z","title":"DRCT: Saving Image Super-resolution away from Information Bottleneck","summary":" In recent years, Vision Transformer-based approaches for low-level vision\ntasks have achieved widespread success. Unlike CNN-based models, Transformers\nare more adept at capturing long-range dependencies, enabling the\nreconstruction of images utilizing non-local information. In the domain of\nsuper-resolution, Swin-transformer-based models have become mainstream due to\ntheir capability of global spatial information modeling and their\nshifting-window attention mechanism that facilitates the interchange of\ninformation between different windows. Many researchers have enhanced model\nperformance by expanding the receptive fields or designing meticulous networks,\nyielding commendable results. However, we observed that it is a general\nphenomenon for the feature map intensity to be abruptly suppressed to small\nvalues towards the network's end. This implies an information bottleneck and a\ndiminishment of spatial information, implicitly limiting the model's potential.\nTo address this, we propose the Dense-residual-connected Transformer (DRCT),\naimed at mitigating the loss of spatial information and stabilizing the\ninformation flow through dense-residual connections between layers, thereby\nunleashing the model's potential and saving the model away from information\nbottleneck. Experiment results indicate that our approach surpasses\nstate-of-the-art methods on benchmark datasets and performs commendably at the\nNTIRE-2024 Image Super-Resolution (x4) Challenge. Our source code is available\nat https://github.com/ming053l/DRCT\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Yi-Shiuan Chou"],"pdf_url":"https://arxiv.org/pdf/2404.00722v4.pdf","comment":"Camera-ready version, NTIRE 2024 Image Super-resolution (x4)"},{"id":"http://arxiv.org/abs/2310.17347v3","updated":"2024-04-15T23:52:11Z","published":"2023-10-26T12:27:56Z","title":"CADS: Unleashing the Diversity of Diffusion Models through\n Condition-Annealed Sampling","summary":" While conditional diffusion models are known to have good coverage of the\ndata distribution, they still face limitations in output diversity,\nparticularly when sampled with a high classifier-free guidance scale for\noptimal image quality or when trained on small datasets. 
We attribute this\nproblem to the role of the conditioning signal in inference and offer an\nimproved sampling strategy for diffusion models that can increase generation\ndiversity, especially at high guidance scales, with minimal loss of sample\nquality. Our sampling strategy anneals the conditioning signal by adding\nscheduled, monotonically decreasing Gaussian noise to the conditioning vector\nduring inference to balance diversity and condition alignment. Our\nCondition-Annealed Diffusion Sampler (CADS) can be used with any pretrained\nmodel and sampling algorithm, and we show that it boosts the diversity of\ndiffusion models in various conditional generation tasks. Further, using an\nexisting pretrained diffusion model, CADS achieves a new state-of-the-art FID\nof 1.70 and 2.31 for class-conditional ImageNet generation at 256$\\times$256\nand 512$\\times$512 respectively.\n","authors":["Seyedmorteza Sadat","Jakob Buhmann","Derek Bradley","Otmar Hilliges","Romann M. Weber"],"pdf_url":"https://arxiv.org/pdf/2310.17347v3.pdf","comment":"Published as a conference paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2402.19481v3","updated":"2024-04-15T23:37:46Z","published":"2024-02-29T18:59:58Z","title":"DistriFusion: Distributed Parallel Inference for High-Resolution\n Diffusion Models","summary":" Diffusion models have achieved great success in synthesizing high-quality\nimages. However, generating high-resolution images with diffusion models is\nstill challenging due to the enormous computational costs, resulting in a\nprohibitive latency for interactive applications. In this paper, we propose\nDistriFusion to tackle this problem by leveraging parallelism across multiple\nGPUs. Our method splits the model input into multiple patches and assigns each\npatch to a GPU. However, naively implementing such an algorithm breaks the\ninteraction between patches and loses fidelity, while incorporating such an\ninteraction will incur tremendous communication overhead. To overcome this\ndilemma, we observe the high similarity between the input from adjacent\ndiffusion steps and propose displaced patch parallelism, which takes advantage\nof the sequential nature of the diffusion process by reusing the pre-computed\nfeature maps from the previous timestep to provide context for the current\nstep. Therefore, our method supports asynchronous communication, which can be\npipelined by computation. Extensive experiments show that our method can be\napplied to recent Stable Diffusion XL with no quality degradation and achieve\nup to a 6.1$\\times$ speedup on eight NVIDIA A100s compared to one. Our code is\npublicly available at https://github.com/mit-han-lab/distrifuser.\n","authors":["Muyang Li","Tianle Cai","Jiaxin Cao","Qinsheng Zhang","Han Cai","Junjie Bai","Yangqing Jia","Ming-Yu Liu","Kai Li","Song Han"],"pdf_url":"https://arxiv.org/pdf/2402.19481v3.pdf","comment":"CVPR 2024 Highlight Code: https://github.com/mit-han-lab/distrifuser\n Website: https://hanlab.mit.edu/projects/distrifusion Blog:\n https://hanlab.mit.edu/blog/distrifusion"},{"id":"http://arxiv.org/abs/2311.13602v4","updated":"2024-04-15T23:29:51Z","published":"2023-11-22T18:59:53Z","title":"Retrieval-Augmented Layout Transformer for Content-Aware Layout\n Generation","summary":" Content-aware graphic layout generation aims to automatically arrange visual\nelements along with a given content, such as an e-commerce product image. 
In\nthis paper, we argue that the current layout generation approaches suffer from\nthe limited training data for the high-dimensional layout structure. We show\nthat a simple retrieval augmentation can significantly improve the generation\nquality. Our model, which is named Retrieval-Augmented Layout Transformer\n(RALF), retrieves nearest neighbor layout examples based on an input image and\nfeeds these results into an autoregressive generator. Our model can apply\nretrieval augmentation to various controllable generation tasks and yield\nhigh-quality layouts within a unified architecture. Our extensive experiments\nshow that RALF successfully generates content-aware layouts in both constrained\nand unconstrained settings and significantly outperforms the baselines.\n","authors":["Daichi Horita","Naoto Inoue","Kotaro Kikuchi","Kota Yamaguchi","Kiyoharu Aizawa"],"pdf_url":"https://arxiv.org/pdf/2311.13602v4.pdf","comment":"Accepted to CVPR 2024 (Oral), Project website:\n https://udonda.github.io/RALF/ , GitHub:\n https://github.com/CyberAgentAILab/RALF"},{"id":"http://arxiv.org/abs/2404.10178v1","updated":"2024-04-15T23:23:31Z","published":"2024-04-15T23:23:31Z","title":"CryoMAE: Few-Shot Cryo-EM Particle Picking with Masked Autoencoders","summary":" Cryo-electron microscopy (cryo-EM) emerges as a pivotal technology for\ndetermining the architecture of cells, viruses, and protein assemblies at\nnear-atomic resolution. Traditional particle picking, a key step in cryo-EM,\nstruggles with manual effort and automated methods' sensitivity to low\nsignal-to-noise ratio (SNR) and varied particle orientations. Furthermore,\nexisting neural network (NN)-based approaches often require extensive labeled\ndatasets, limiting their practicality. To overcome these obstacles, we\nintroduce cryoMAE, a novel approach based on few-shot learning that harnesses\nthe capabilities of Masked Autoencoders (MAE) to enable efficient selection of\nsingle particles in cryo-EM images. Contrary to conventional NN-based\ntechniques, cryoMAE requires only a minimal set of positive particle images for\ntraining yet demonstrates high performance in particle detection. Furthermore,\nthe implementation of a self-cross similarity loss ensures distinct features\nfor particle and background regions, thereby enhancing the discrimination\ncapability of cryoMAE. Experiments on large-scale cryo-EM datasets show that\ncryoMAE outperforms existing state-of-the-art (SOTA) methods, improving 3D\nreconstruction resolution by up to 22.4%.\n","authors":["Chentianye Xu","Xueying Zhan","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2404.10178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10175v1","updated":"2024-04-15T23:06:58Z","published":"2024-04-15T23:06:58Z","title":"PD-L1 Classification of Weakly-Labeled Whole Slide Images of Breast\n Cancer","summary":" Specific and effective breast cancer therapy relies on the accurate\nquantification of PD-L1 positivity in tumors, which appears in the form of\nbrown stainings in high resolution whole slide images (WSIs). However, the\nretrieval and extensive labeling of PD-L1 stained WSIs is a time-consuming and\nchallenging task for pathologists, resulting in low reproducibility, especially\nfor borderline images. This study aims to develop and compare models able to\nclassify PD-L1 positivity of breast cancer samples based on WSI analysis,\nrelying only on WSI-level labels. 
The task consists of two phases: identifying\nregions of interest (ROI) and classifying tumors as PD-L1 positive or negative.\nFor the latter, two model categories were developed, with different feature\nextraction methodologies. The first encodes images based on the colour distance\nfrom a base color. The second uses a convolutional autoencoder to obtain\nembeddings of WSI tiles, and aggregates them into a WSI-level embedding. For\nboth model types, features are fed into downstream ML classifiers. Two datasets\nfrom different clinical centers were used in two different training\nconfigurations: (1) training on one dataset and testing on the other; (2)\ncombining the datasets. We also tested the performance with or without human\npreprocessing to remove brown artefacts. Colour distance based models achieve\nthe best performances on testing configuration (1) with artefact removal, while\nautoencoder-based models are superior in the remaining cases, which are prone\nto greater data variability.\n","authors":["Giacomo Cignoni","Cristian Scatena","Chiara Frascarelli","Nicola Fusco","Antonio Giuseppe Naccarato","Giuseppe Nicoló Fanelli","Alina Sîrbu"],"pdf_url":"https://arxiv.org/pdf/2404.10175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10172v1","updated":"2024-04-15T23:01:59Z","published":"2024-04-15T23:01:59Z","title":"Forensic Iris Image-Based Post-Mortem Interval Estimation","summary":" Post-mortem iris recognition is an emerging application of iris-based human\nidentification in a forensic setup. One factor that may be useful in\nconditioning iris recognition methods is the tissue decomposition level, which\nis correlated with the post-mortem interval (PMI), i.e., the number of hours\nthat have elapsed since death. PMI, however, is not always available, and its\nprecise estimation remains one of the core challenges in forensic examination.\nThis paper presents the first method known to us for PMI estimation directly\nfrom forensic iris images. To assess the feasibility of the iris-based PMI\nestimation, convolutional neural networks-based models (VGG19, DenseNet121,\nResNet152, and Inception_v3) were trained to predict the PMI from (a)\nnear-infrared (NIR), (b) visible (RGB), and (c) multispectral forensic iris\nimages. Models were evaluated following a 10-fold cross-validation in (S1)\nsample-disjoint, (S2) subject-disjoint, and (S3) cross-dataset scenarios. We\nfound that using the multispectral data offers a spectacularly low mean\nabsolute error (MAE) of approximately 3.5 hours in scenario (S1), a somewhat\nworse MAE of approximately 17.5 hours in scenario (S2), and an MAE of\napproximately 69.0 hours in scenario (S3). This suggests that if the environmental\nconditions are favorable (e.g., bodies are kept in low temperatures), forensic\niris images provide features that are indicative of the PMI and can be\nautomatically estimated. The source codes and model weights are made available\nwith the paper.\n","authors":["Rasel Ahmed Bhuiyan","Adam Czajka"],"pdf_url":"https://arxiv.org/pdf/2404.10172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10170v1","updated":"2024-04-15T22:49:37Z","published":"2024-04-15T22:49:37Z","title":"High-Resolution Detection of Earth Structural Heterogeneities from\n Seismic Amplitudes using Convolutional Neural Networks with Attention layers","summary":" Earth structural heterogeneities have a remarkable role in the petroleum\neconomy for both exploration and production projects. 
Automatic detection of\ndetailed structural heterogeneities is challenging when considering modern\nmachine learning techniques like deep neural networks. Typically, these\ntechniques can be an excellent tool for assisted interpretation of such\nheterogeneities, but they depend heavily on the amount of data available for\ntraining.\n We propose an efficient and cost-effective architecture for detecting seismic\nstructural heterogeneities using Convolutional Neural Networks (CNNs) combined\nwith Attention layers. The attention mechanism reduces costs and enhances\naccuracy, even in cases with relatively noisy data. Our model has half the\nparameters compared to the state-of-the-art, and it outperforms previous\nmethods in terms of Intersection over Union (IoU) by 0.6% and precision by\n0.4%. By leveraging synthetic data, we apply transfer learning to train and\nfine-tune the model, addressing the challenge of limited annotated data\navailability.\n","authors":["Luiz Schirmer","Guilherme Schardong","Vinícius da Silva","Rogério Santos","Hélio Lopes"],"pdf_url":"https://arxiv.org/pdf/2404.10170v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06394v2","updated":"2024-04-15T22:47:09Z","published":"2023-11-10T20:50:36Z","title":"A design of Convolutional Neural Network model for the Diagnosis of the\n COVID-19","summary":" With the spread of COVID-19 around the globe over the past year, the usage of\nartificial intelligence (AI) algorithms and image processing methods to analyze\nthe chest X-ray images of patients with COVID-19 has become essential.\nRecognition of the COVID-19 virus in the lung area of a patient is one of the\nbasic and essential needs of clinical centers and hospitals. Most research in\nthis field has been devoted to papers based on deep learning methods\nutilizing CNNs (Convolutional Neural Networks), which mainly deal with the\nscreening of sick and healthy people. In this study, a new structure of a\n19-layer CNN has been recommended for accurate recognition of COVID-19\nfrom chest X-ray images. The offered CNN is developed to serve as a\nprecise diagnosis system for a three-class (viral pneumonia, Normal, COVID) and\na four-class classification (Lung opacity, Normal, COVID-19, and pneumonia). A\ncomparison is conducted among the outcomes of the offered procedure and some\npopular pretrained networks, including Inception, Alexnet, ResNet50,\nSqueezenet, and VGG19, based on Specificity, Accuracy, Precision,\nSensitivity, Confusion Matrix, and F1-score. The experimental results of the\noffered CNN method specify its dominance over the existing published\nprocedures. This method can be a useful tool for clinicians in making proper\ndecisions about COVID-19.\n","authors":["Xinyuan Song"],"pdf_url":"https://arxiv.org/pdf/2311.06394v2.pdf","comment":"Important mistakes. Also, another author has contributed some to the\n revised version. So it is not appropriate for it to be with only my name"},{"id":"http://arxiv.org/abs/2404.10166v1","updated":"2024-04-15T22:32:50Z","published":"2024-04-15T22:32:50Z","title":"Self-Supervised Learning Featuring Small-Scale Image Dataset for\n Treatable Retinal Diseases Classification","summary":" Automated medical diagnosis through image-based neural networks has increased\nin popularity and matured over the years. 
Nevertheless, it is confined by the\nscarcity of medical images and the expensive cost of manual annotation.\nSelf-Supervised Learning (SSL) is a good alternative to Transfer Learning (TL)\nand is suitable for imbalanced image datasets. In this study, we assess four\npretrained SSL models and two TL models on treatable retinal disease\nclassification using small-scale Optical Coherence Tomography (OCT) images,\nwith training sets ranging from 125 to 4,000 images in balanced or imbalanced\ndistributions. The proposed SSL model achieves state-of-the-art accuracy of\n98.84% using only 4,000 training images. Our results suggest that the SSL models\nprovide superior performance under both the balanced and imbalanced training\nscenarios. The SSL model with the MoCo-v2 scheme performs consistently well under\nthe imbalanced scenario and, in particular, surpasses the other models when the\ntraining set contains fewer than 500 images.\n","authors":["Luffina C. Huang","Darren J. Chiu","Manish Mehta"],"pdf_url":"https://arxiv.org/pdf/2404.10166v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10163v1","updated":"2024-04-15T22:26:27Z","published":"2024-04-15T22:26:27Z","title":"EyeFormer: Predicting Personalized Scanpaths with Transformer-Guided\n Reinforcement Learning","summary":" From a visual perception perspective, modern graphical user interfaces (GUIs)\ncomprise a complex graphics-rich two-dimensional visuospatial arrangement of\ntext, images, and interactive objects such as buttons and menus. While existing\nmodels can accurately predict regions and objects that are likely to attract\nattention ``on average'', so far there is no scanpath model capable of\npredicting scanpaths for an individual. To close this gap, we introduce\nEyeFormer, which leverages a Transformer architecture as a policy network to\nguide a deep reinforcement learning algorithm that controls gaze locations. Our\nmodel has the unique capability of producing personalized predictions when\ngiven a few user scanpath samples. It can predict full scanpath information,\nincluding fixation positions and duration, across individuals and various\nstimulus types. Additionally, we demonstrate applications in GUI layout\noptimization driven by our model. Our software and models will be publicly\navailable.\n","authors":["Yue Jiang","Zixin Guo","Hamed Rezazadegan Tavakoli","Luis A. Leiva","Antti Oulasvirta"],"pdf_url":"https://arxiv.org/pdf/2404.10163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.02687v3","updated":"2024-04-15T22:13:39Z","published":"2022-12-06T01:10:31Z","title":"Vision Transformer Computation and Resilience for Dynamic Inference","summary":" State-of-the-art deep learning models for computer vision tasks are based on\nthe transformer architecture and often deployed in real-time applications. In\nthis scenario, the resources available for every inference can vary, so it is\nuseful to be able to dynamically adapt execution to trade accuracy for\nefficiency. To create dynamic models, we leverage the resilience of vision\ntransformers to pruning and switch between different scaled versions of a\nmodel. Surprisingly, we find that most FLOPs are generated by convolutions, not\nattention. These relative FLOP counts are not a good predictor of GPU\nperformance since GPUs have special optimizations for convolutions. Some models\nare fairly resilient and their model execution can be adapted without\nretraining, while all models achieve better accuracy with retraining\nalternative execution paths. 
These insights mean that we can leverage CNN\naccelerators and these alternative execution paths to enable efficient and\ndynamic vision transformer inference. Our analysis shows that leveraging this\ntype of dynamic execution can lead to saving 28\\% of energy with a 1.4\\%\naccuracy drop for SegFormer (63 GFLOPs), with no additional training, and 53\\%\nof energy for ResNet-50 (4 GFLOPs) with a 3.3\\% accuracy drop by switching\nbetween pretrained Once-For-All models.\n","authors":["Kavya Sreedhar","Jason Clemons","Rangharajan Venkatesan","Stephen W. Keckler","Mark Horowitz"],"pdf_url":"https://arxiv.org/pdf/2212.02687v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10157v1","updated":"2024-04-15T22:13:35Z","published":"2024-04-15T22:13:35Z","title":"Salient Object-Aware Background Generation using Text-Guided Diffusion\n Models","summary":" Generating background scenes for salient objects plays a crucial role across\nvarious domains including creative design and e-commerce, as it enhances the\npresentation and context of subjects by integrating them into tailored\nenvironments. Background generation can be framed as a task of text-conditioned\noutpainting, where the goal is to extend image content beyond a salient\nobject's boundaries on a blank background. Although popular diffusion models\nfor text-guided inpainting can also be used for outpainting by mask inversion,\nthey are trained to fill in missing parts of an image rather than to place an\nobject into a scene. Consequently, when used for background creation,\ninpainting models frequently extend the salient object's boundaries and thereby\nchange the object's identity, which is a phenomenon we call \"object expansion.\"\nThis paper introduces a model for adapting inpainting diffusion models to the\nsalient object outpainting task using Stable Diffusion and ControlNet\narchitectures. We present a series of qualitative and quantitative results\nacross models and datasets, including a newly proposed metric to measure object\nexpansion that does not require any human labeling. Compared to Stable\nDiffusion 2.0 Inpainting, our proposed approach reduces object expansion by\n3.6x on average with no degradation in standard visual metrics across multiple\ndatasets.\n","authors":["Amir Erfan Eshratifar","Joao V. B. Soares","Kapil Thadani","Shaunak Mishra","Mikhail Kuznetsov","Yueh-Ning Ku","Paloma de Juan"],"pdf_url":"https://arxiv.org/pdf/2404.10157v1.pdf","comment":"Accepted for publication at CVPR 2024's Generative Models for\n Computer Vision workshop"},{"id":"http://arxiv.org/abs/2404.10156v1","updated":"2024-04-15T22:12:05Z","published":"2024-04-15T22:12:05Z","title":"SegFormer3D: an Efficient Transformer for 3D Medical Image Segmentation","summary":" The adoption of Vision Transformers (ViTs) based architectures represents a\nsignificant advancement in 3D Medical Image (MI) segmentation, surpassing\ntraditional Convolutional Neural Network (CNN) models by enhancing global\ncontextual understanding. While this paradigm shift has significantly enhanced\n3D segmentation performance, state-of-the-art architectures require extremely\nlarge and complex architectures with large scale computing resources for\ntraining and deployment. Furthermore, in the context of limited datasets, often\nencountered in medical imaging, larger models can present hurdles in both model\ngeneralization and convergence. 
In response to these challenges and to\ndemonstrate that lightweight models are a valuable area of research in 3D\nmedical imaging, we present SegFormer3D, a hierarchical Transformer that\ncalculates attention across multiscale volumetric features. Additionally,\nSegFormer3D avoids complex decoders and uses an all-MLP decoder to aggregate\nlocal and global attention features to produce highly accurate segmentation\nmasks. The proposed memory-efficient Transformer preserves the performance\ncharacteristics of a significantly larger model in a compact design.\nSegFormer3D democratizes deep learning for 3D medical image segmentation by\noffering a model with 33x fewer parameters and a 13x reduction in GFLOPS\ncompared to the current state-of-the-art (SOTA). We benchmark SegFormer3D\nagainst the current SOTA models on three widely used datasets: Synapse, BRaTs,\nand ACDC, achieving competitive results. Code:\nhttps://github.com/OSUPCVLab/SegFormer3D.git\n","authors":["Shehan Perera","Pouyan Navard","Alper Yilmaz"],"pdf_url":"https://arxiv.org/pdf/2404.10156v1.pdf","comment":"Accepted at CVPR Workshop 2024"},{"id":"http://arxiv.org/abs/2404.10147v1","updated":"2024-04-15T21:33:45Z","published":"2024-04-15T21:33:45Z","title":"Eyes on the Streets: Leveraging Street-Level Imaging to Model Urban\n Crime Dynamics","summary":" This study addresses the challenge of urban safety in New York City by\nexamining the relationship between the built environment and crime rates using\nmachine learning and a comprehensive dataset of street view images. We aim to\nidentify how urban landscapes correlate with crime statistics, focusing on the\ncharacteristics of street views and their association with crime rates. The\nfindings offer insights for urban planning and crime prevention, highlighting\nthe potential of environmental design in enhancing public safety.\n","authors":["Zhixuan Qi","Huaiying Luo","Chen Chi"],"pdf_url":"https://arxiv.org/pdf/2404.10147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10146v1","updated":"2024-04-15T21:30:50Z","published":"2024-04-15T21:30:50Z","title":"Cross-Modal Self-Training: Aligning Images and Pointclouds to Learn\n Classification without Labels","summary":" Large-scale 2D vision language models, such as CLIP, can be aligned\nwith a 3D encoder to learn generalizable (open-vocabulary) 3D vision models.\nHowever, current methods require supervised pre-training for such alignment,\nand the performance of such 3D zero-shot models remains sub-optimal for\nreal-world adaptation. In this work, we propose an optimization framework:\nCross-MoST: Cross-Modal Self-Training, to improve the label-free classification\nperformance of a zero-shot 3D vision model by simply leveraging unlabeled 3D\ndata and their accompanying 2D views. We propose a student-teacher framework to\nsimultaneously process 2D views and 3D point clouds and generate joint pseudo\nlabels to train a classifier and guide cross-modal feature alignment. Thereby\nwe demonstrate that 2D vision language models such as CLIP can be used to\ncomplement 3D representation learning to improve classification performance\nwithout the need for expensive class annotations. 
Using synthetic and\nreal-world 3D datasets, we further demonstrate that Cross-MoST enables\nefficient cross-modal knowledge exchange resulting in both image and point\ncloud modalities learning from each other's rich representations.\n","authors":["Amaya Dharmasiri","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2404.10146v1.pdf","comment":"To be published in Workshop for Learning 3D with Multi-View\n Supervision (3DMV) at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.10141v1","updated":"2024-04-15T21:19:10Z","published":"2024-04-15T21:19:10Z","title":"ANCHOR: LLM-driven News Subject Conditioning for Text-to-Image Synthesis","summary":" Text-to-Image (T2I) Synthesis has made tremendous strides in enhancing\nsynthesized image quality, but current datasets evaluate model performance only\non descriptive, instruction-based prompts. Real-world news image captions take\na more pragmatic approach, providing high-level situational and Named-Entity\n(NE) information and limited physical object descriptions, making them\nabstractive. To evaluate the ability of T2I models to capture intended subjects\nfrom news captions, we introduce the Abstractive News Captions with High-level\ncOntext Representation (ANCHOR) dataset, containing 70K+ samples sourced from 5\ndifferent news media organizations. With Large Language Models (LLM) achieving\nsuccess in language and commonsense reasoning tasks, we explore the ability of\ndifferent LLMs to identify and understand key subjects from abstractive\ncaptions. Our proposed method Subject-Aware Finetuning (SAFE), selects and\nenhances the representation of key subjects in synthesized images by leveraging\nLLM-generated subject weights. It also adapts to the domain distribution of\nnews images and captions through custom Domain Fine-tuning, outperforming\ncurrent T2I baselines on ANCHOR. By launching the ANCHOR dataset, we hope to\nmotivate research in furthering the Natural Language Understanding (NLU)\ncapabilities of T2I models.\n","authors":["Aashish Anantha Ramakrishnan","Sharon X. Huang","Dongwon Lee"],"pdf_url":"https://arxiv.org/pdf/2404.10141v1.pdf","comment":"23 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.06129v2","updated":"2024-04-15T21:10:37Z","published":"2024-01-11T18:59:53Z","title":"Distilling Vision-Language Models on Millions of Videos","summary":" The recent advance in vision-language models is largely attributed to the\nabundance of image-text data. We aim to replicate this success for\nvideo-language models, but there simply is not enough human-curated video-text\ndata available. We thus resort to fine-tuning a video-language model from a\nstrong image-language baseline with synthesized instructional data. The\nresulting video model by video-instruction-tuning (VIIT) is then used to\nauto-label millions of videos to generate high-quality captions. We show the\nadapted video-language model performs well on a wide range of video-language\nbenchmarks. For instance, it surpasses the best prior result on open-ended\nNExT-QA by 2.8%. Besides, our model generates detailed descriptions for\npreviously unseen videos, which provide better textual supervision than\nexisting methods. Experiments show that a video-language dual-encoder model\ncontrastively trained on these auto-generated captions is 3.8% better than the\nstrongest baseline that also leverages vision-language models. Our best model\noutperforms state-of-the-art methods on MSR-VTT zero-shot text-to-video\nretrieval by 6%. 
As a side product, we generate the largest video caption\ndataset to date.\n","authors":["Yue Zhao","Long Zhao","Xingyi Zhou","Jialin Wu","Chun-Te Chu","Hui Miao","Florian Schroff","Hartwig Adam","Ting Liu","Boqing Gong","Philipp Krähenbühl","Liangzhe Yuan"],"pdf_url":"https://arxiv.org/pdf/2401.06129v2.pdf","comment":"CVPR 2024. Project page:\n https://zhaoyue-zephyrus.github.io/video-instruction-tuning"},{"id":"http://arxiv.org/abs/2403.15977v2","updated":"2024-04-15T21:08:05Z","published":"2024-03-24T01:20:08Z","title":"Towards Two-Stream Foveation-based Active Vision Learning","summary":" Deep neural network (DNN) based machine perception frameworks process the\nentire input in a one-shot manner to provide answers to both \"what object is\nbeing observed\" and \"where it is located\". In contrast, the \"two-stream\nhypothesis\" from neuroscience explains the neural processing in the human\nvisual cortex as an active vision system that utilizes two separate regions of\nthe brain to answer the what and the where questions. In this work, we propose\na machine learning framework inspired by the \"two-stream hypothesis\" and\nexplore the potential benefits that it offers. Specifically, the proposed\nframework models the following mechanisms: 1) ventral (what) stream focusing on\nthe input regions perceived by the fovea part of an eye (foveation), 2) dorsal\n(where) stream providing visual guidance, and 3) iterative processing of the\ntwo streams to calibrate visual focus and process the sequence of focused image\npatches. The training of the proposed framework is accomplished by label-based\nDNN training for the ventral stream model and reinforcement learning for the\ndorsal stream model. We show that the two-stream foveation-based learning is\napplicable to the challenging task of weakly-supervised object localization\n(WSOL), where the training data is limited to the object class or its\nattributes. The framework is capable of both predicting the properties of an\nobject and successfully localizing it by predicting its bounding box. We also\nshow that, due to the independent nature of the two streams, the dorsal model\ncan be applied on its own to unseen images to localize objects from different\ndatasets.\n","authors":["Timur Ibrayev","Amitangshu Mukherjee","Sai Aparna Aketi","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2403.15977v2.pdf","comment":"Accepted for publication at IEEE Transactions on Cognitive and\n Developmental Systems (IEEE TCDS), 18 pages, 14 figures"},{"id":"http://arxiv.org/abs/2404.10133v1","updated":"2024-04-15T20:48:33Z","published":"2024-04-15T20:48:33Z","title":"WB LUTs: Contrastive Learning for White Balancing Lookup Tables","summary":" Automatic white balancing (AWB), one of the first steps in an integrated\nsignal processing (ISP) pipeline, aims to correct the color cast induced by the\nscene illuminant. An incorrect white balance (WB) setting or AWB failure can\nlead to an undesired blue or red tint in the rendered sRGB image. To address\nthis, recent methods pose the post-capture WB correction problem as an\nimage-to-image translation task and train deep neural networks to learn the\nnecessary color adjustments at a lower resolution. These low resolution outputs\nare post-processed to generate high resolution WB corrected images, forming a\nbottleneck in the end-to-end run time. In this paper we present a 3D Lookup\nTable (LUT) based WB correction model called WB LUTs that can generate high\nresolution outputs in real time. 
We introduce a contrastive learning framework\nwith a novel hard sample mining strategy, which improves the WB correction\nquality of baseline 3D LUTs by 25.5%. Experimental results demonstrate that the\nproposed WB LUTs perform competitively against state-of-the-art models on two\nbenchmark datasets while being 300 times faster using 12.7 times less memory.\nOur model and code are available at https://github.com/skrmanne/3DLUT_sRGB_WB.\n","authors":["Sai Kumar Reddy Manne","Michael Wan"],"pdf_url":"https://arxiv.org/pdf/2404.10133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10130v1","updated":"2024-04-15T20:37:52Z","published":"2024-04-15T20:37:52Z","title":"NOISe: Nuclei-Aware Osteoclast Instance Segmentation for Mouse-to-Human\n Domain Transfer","summary":" Osteoclast cell image analysis plays a key role in osteoporosis research, but\nit typically involves extensive manual image processing and hand annotations by\na trained expert. In the last few years, a handful of machine learning\napproaches for osteoclast image analysis have been developed, but none have\naddressed the full instance segmentation task required to produce the same\noutput as that of the human expert led process. Furthermore, none of the prior,\nfully automated algorithms have publicly available code, pretrained models, or\nannotated datasets, inhibiting reproduction and extension of their work. We\npresent a new dataset with ~2*10^5 expert annotated mouse osteoclast masks,\ntogether with a deep learning instance segmentation method which works for both\nin vitro mouse osteoclast cells on plastic tissue culture plates and human\nosteoclast cells on bone chips. To our knowledge, this is the first work to\nautomate the full osteoclast instance segmentation task. Our method achieves a\nperformance of 0.82 mAP_0.5 (mean average precision at intersection-over-union\nthreshold of 0.5) in cross validation for mouse osteoclasts. We present a novel\nnuclei-aware osteoclast instance segmentation training strategy (NOISe) based\non the unique biology of osteoclasts, to improve the model's generalizability\nand boost the mAP_0.5 from 0.60 to 0.82 on human osteoclasts. We publish our\nannotated mouse osteoclast image dataset, instance segmentation models, and\ncode at github.com/michaelwwan/noise to enable reproducibility and to provide a\npublic tool to accelerate osteoporosis research.\n","authors":["Sai Kumar Reddy Manne","Brendan Martin","Tyler Roy","Ryan Neilson","Rebecca Peters","Meghana Chillara","Christine W. Lary","Katherine J. Motyl","Michael Wan"],"pdf_url":"https://arxiv.org/pdf/2404.10130v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01289v4","updated":"2024-04-15T20:35:03Z","published":"2023-06-02T06:15:36Z","title":"nnMobileNet: Rethinking CNN for Retinopathy Research","summary":" Over the past few decades, convolutional neural networks (CNNs) have been at\nthe forefront of the detection and tracking of various retinal diseases (RD).\nDespite their success, the emergence of vision transformers (ViT) in the 2020s\nhas shifted the trajectory of RD model development. The leading-edge\nperformance of ViT-based models in RD can be largely credited to their\nscalability-their ability to improve as more parameters are added. As a result,\nViT-based models tend to outshine traditional CNNs in RD applications, albeit\nat the cost of increased data and computational demands. 
ViTs also differ from\nCNNs in their approach to processing images, working with patches rather than\nlocal regions, which can complicate the precise localization of small, variably\npresented lesions in RD. In our study, we revisited and updated the\narchitecture of a CNN model, specifically MobileNet, to enhance its utility in\nRD diagnostics. We found that an optimized MobileNet, through selective\nmodifications, can surpass ViT-based models in various RD benchmarks, including\ndiabetic retinopathy grading, detection of multiple fundus diseases, and\nclassification of diabetic macular edema. The code is available at\nhttps://github.com/Retinal-Research/NN-MOBILENET\n","authors":["Wenhui Zhu","Peijie Qiu","Xiwen Chen","Xin Li","Natasha Lepore","Oana M. Dumitrascu","Yalin Wang"],"pdf_url":"https://arxiv.org/pdf/2306.01289v4.pdf","comment":"Accepted as a conference paper to 2024 CVPRW"},{"id":"http://arxiv.org/abs/2404.10124v1","updated":"2024-04-15T20:21:05Z","published":"2024-04-15T20:21:05Z","title":"Epistemic Uncertainty Quantification For Pre-trained Neural Network","summary":" Epistemic uncertainty quantification (UQ) identifies where models lack\nknowledge. Traditional UQ methods, often based on Bayesian neural networks, are\nnot suitable for pre-trained non-Bayesian models. Our study addresses\nquantifying epistemic uncertainty for any pre-trained model, which does not\nneed the original training data or model modifications and can ensure broad\napplicability regardless of network architectures or training techniques.\nSpecifically, we propose a gradient-based approach to assess epistemic\nuncertainty, analyzing the gradients of outputs relative to model parameters,\nand thereby indicating necessary model adjustments to accurately represent the\ninputs. We first explore theoretical guarantees of gradient-based methods for\nepistemic UQ, questioning the view that this uncertainty is only calculable\nthrough differences between multiple models. We further improve gradient-driven\nUQ by using class-specific weights for integrating gradients and emphasizing\ndistinct contributions from neural network layers. Additionally, we enhance UQ\naccuracy by combining gradient and perturbation methods to refine the\ngradients. We evaluate our approach on out-of-distribution detection,\nuncertainty calibration, and active learning, demonstrating its superiority\nover current state-of-the-art UQ methods for pre-trained models.\n","authors":["Hanjing Wang","Qiang Ji"],"pdf_url":"https://arxiv.org/pdf/2404.10124v1.pdf","comment":"Published at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.10108v1","updated":"2024-04-15T19:43:16Z","published":"2024-04-15T19:43:16Z","title":"GeoAI Reproducibility and Replicability: a computational and spatial\n perspective","summary":" GeoAI has emerged as an exciting interdisciplinary research area that\ncombines spatial theories and data with cutting-edge AI models to address\ngeospatial problems in a novel, data-driven manner. While GeoAI research has\nflourished in the GIScience literature, its reproducibility and replicability\n(R&R), fundamental principles that determine the reusability, reliability, and\nscientific rigor of research findings, have rarely been discussed. This paper\naims to provide an in-depth analysis of this topic from both computational and\nspatial perspectives. 
We first categorize the major goals for reproducing GeoAI\nresearch, namely, validation (repeatability), learning and adapting the method\nfor solving a similar or new problem (reproducibility), and examining the\ngeneralizability of the research findings (replicability). Each of these goals\nrequires different levels of understanding of GeoAI, as well as different\nmethods to ensure its success. We then discuss the factors that may cause the\nlack of R&R in GeoAI research, with an emphasis on (1) the selection and use of\ntraining data; (2) the uncertainty that resides in the GeoAI model design,\ntraining, deployment, and inference processes; and more importantly (3) the\ninherent spatial heterogeneity of geospatial data and processes. We use a deep\nlearning-based image analysis task as an example to demonstrate the results'\nuncertainty and spatial variance caused by different factors. The findings\nreiterate the importance of knowledge sharing, as well as the generation of a\n\"replicability map\" that incorporates spatial autocorrelation and spatial\nheterogeneity into consideration in quantifying the spatial replicability of\nGeoAI research.\n","authors":["Wenwen Lia","Chia-Yu Hsu","Sizhe Wang","Peter Kedron"],"pdf_url":"https://arxiv.org/pdf/2404.10108v1.pdf","comment":"Accepted by Annals of the American Association of Geographers"},{"id":"http://arxiv.org/abs/2004.05704v3","updated":"2024-04-15T19:09:39Z","published":"2020-04-12T21:45:23Z","title":"Visual Grounding Methods for VQA are Working for the Wrong Reasons!","summary":" Existing Visual Question Answering (VQA) methods tend to exploit dataset\nbiases and spurious statistical correlations, instead of producing right\nanswers for the right reasons. To address this issue, recent bias mitigation\nmethods for VQA propose to incorporate visual cues (e.g., human attention maps)\nto better ground the VQA models, showcasing impressive gains. However, we show\nthat the performance improvements are not a result of improved visual\ngrounding, but a regularization effect which prevents over-fitting to\nlinguistic priors. For instance, we find that it is not actually necessary to\nprovide proper, human-based cues; random, insensible cues also result in\nsimilar improvements. Based on this observation, we propose a simpler\nregularization scheme that does not require any external annotations and yet\nachieves near state-of-the-art performance on VQA-CPv2.\n","authors":["Robik Shrestha","Kushal Kafle","Christopher Kanan"],"pdf_url":"https://arxiv.org/pdf/2004.05704v3.pdf","comment":"ACL 2020"},{"id":"http://arxiv.org/abs/2402.10021v2","updated":"2024-04-15T19:07:07Z","published":"2024-02-15T15:39:46Z","title":"SAWEC: Sensing-Assisted Wireless Edge Computing","summary":" Emerging mobile virtual reality (VR) systems will require to continuously\nperform complex computer vision tasks on ultra-high-resolution video frames\nthrough the execution of deep neural networks (DNNs)-based algorithms. Since\nstate-of-the-art DNNs require computational power that is excessive for mobile\ndevices, techniques based on wireless edge computing (WEC) have been recently\nproposed. However, existing WEC methods require the transmission and processing\nof a high amount of video data which may ultimately saturate the wireless link.\nIn this paper, we propose a novel Sensing-Assisted Wireless Edge Computing\n(SAWEC) paradigm to address this issue. 
SAWEC leverages knowledge about the\nphysical environment to reduce the end-to-end latency and overall computational\nburden by transmitting to the edge server only the relevant data for the\ndelivery of the service. Our intuition is that the transmission of the portion\nof the video frames where there are no changes with respect to previous frames\ncan be avoided. Specifically, we leverage wireless sensing techniques to\nestimate the location of objects in the environment and obtain insights about\nthe environment dynamics. Hence, only the part of the frames where any\nenvironmental change is detected is transmitted and processed. We evaluated\nSAWEC by using a 10K 360$^{\\circ}$ with a Wi-Fi 6 sensing system operating at\n160 MHz and performing localization and tracking. We considered instance\nsegmentation and object detection as benchmarking tasks for performance\nevaluation. We carried out experiments in an anechoic chamber and an entrance\nhall with two human subjects in six different setups. Experimental results show\nthat SAWEC reduces both the channel occupation and end-to-end latency by more\nthan 90% while improving the instance segmentation and object detection\nperformance with respect to state-of-the-art WEC approaches.\n","authors":["Khandaker Foysal Haque","Francesca Meneghello","Md. Ebtidaul Karim","Francesco Restuccia"],"pdf_url":"https://arxiv.org/pdf/2402.10021v2.pdf","comment":"Submitted to ACM for possible publication"},{"id":"http://arxiv.org/abs/2404.10096v1","updated":"2024-04-15T19:06:58Z","published":"2024-04-15T19:06:58Z","title":"Vision Augmentation Prediction Autoencoder with Attention Design\n (VAPAAD)","summary":" Despite significant advancements in sequence prediction, current methods lack\nattention-based mechanisms for next-frame prediction. Our work introduces\nVAPAAD or Vision Augmentation Prediction Autoencoder with Attention Design, an\ninnovative model that enhances predictive performance by integrating attention\ndesigns, allowing for nuanced understanding and handling of temporal dynamics\nin video sequences. We demonstrate using the famous Moving MNIST dataset the\nrobust performance of the proposed model and potential applicability of such\ndesign in the literature.\n","authors":["Yiqiao Yin"],"pdf_url":"https://arxiv.org/pdf/2404.10096v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07329v2","updated":"2024-04-15T18:54:10Z","published":"2024-02-11T23:39:33Z","title":"The Bias of Harmful Label Associations in Vision-Language Models","summary":" Despite the remarkable performance of foundation vision-language models, the\nshared representation space for text and vision can also encode harmful label\nassociations detrimental to fairness. While prior work has uncovered bias in\nvision-language models' (VLMs) classification performance across geography,\nwork has been limited along the important axis of harmful label associations\ndue to a lack of rich, labeled data. In this work, we investigate harmful label\nassociations in the recently released Casual Conversations datasets containing\nmore than 70,000 videos. We study bias in the frequency of harmful label\nassociations across self-provided labels for age, gender, apparent skin tone,\nand physical adornments across several leading VLMs. We find that VLMs are\n$4-7$x more likely to harmfully classify individuals with darker skin tones. We\nalso find scaling transformer encoder model size leads to higher confidence in\nharmful predictions. 
Finally, we find improvements on standard vision tasks\nacross VLMs do not address disparities in harmful label associations.\n","authors":["Caner Hazirbas","Alicia Sun","Yonathan Efroni","Mark Ibrahim"],"pdf_url":"https://arxiv.org/pdf/2402.07329v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10078v1","updated":"2024-04-15T18:32:52Z","published":"2024-04-15T18:32:52Z","title":"Low-Light Image Enhancement Framework for Improved Object Detection in\n Fisheye Lens Datasets","summary":" This study addresses the evolving challenges in urban traffic monitoring\ndetection systems based on fisheye lens cameras by proposing a framework that\nimproves the efficacy and accuracy of these systems. In the context of urban\ninfrastructure and transportation management, advanced traffic monitoring\nsystems have become critical for managing the complexities of urbanization and\nincreasing vehicle density. Traditional monitoring methods, which rely on\nstatic cameras with narrow fields of view, are ineffective in dynamic urban\nenvironments, necessitating the installation of multiple cameras, which raises\ncosts. Fisheye lenses, which were recently introduced, provide wide and\nomnidirectional coverage in a single frame, making them a transformative\nsolution. However, issues such as distorted views and blurriness arise,\npreventing accurate object detection on these images. Motivated by these\nchallenges, this study proposes a novel approach that combines a\ntransformer-based image enhancement framework and ensemble learning technique to\naddress these challenges and improve traffic monitoring accuracy, making\nsignificant contributions to the future of intelligent traffic management\nsystems. Our proposed methodological framework won 5th place in the 2024 AI\nCity Challenge, Track 4, with an F1 score of 0.5965 on experimental validation\ndata. The experimental results demonstrate the effectiveness, efficiency, and\nrobustness of the proposed system. Our code is publicly available at\nhttps://github.com/daitranskku/AIC2024-TRACK4-TEAM15.\n","authors":["Dai Quoc Tran","Armstrong Aboah","Yuntae Jeon","Maged Shoman","Minsoo Park","Seunghee Park"],"pdf_url":"https://arxiv.org/pdf/2404.10078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10073v1","updated":"2024-04-15T18:26:03Z","published":"2024-04-15T18:26:03Z","title":"Explainable Light-Weight Deep Learning Pipeline for Improved Drought\n Stress","summary":" Early identification of drought stress in crops is vital for implementing\neffective mitigation measures and reducing yield loss. Non-invasive imaging\ntechniques hold immense potential by capturing subtle physiological changes in\nplants under water deficit. Sensor based imaging data serves as a rich source\nof information for machine learning and deep learning algorithms, facilitating\nfurther analysis aimed at identifying drought stress. While these approaches\nyield favorable results, real-time field applications require algorithms\nspecifically designed for the complexities of natural agricultural conditions.\nOur work proposes a novel deep learning framework for classifying drought\nstress in potato crops captured by UAVs in natural settings. The novelty lies\nin the synergistic combination of a pretrained network with carefully designed\ncustom layers. This architecture leverages feature extraction capabilities of\nthe pre-trained network while the custom layers enable targeted dimensionality\nreduction and enhanced regularization, ultimately leading to improved\nperformance. 
A key innovation of our work involves the integration of\nGradient-Class Activation Mapping (Grad-CAM), an explainability technique.\nGrad-CAM sheds light on the internal workings of the deep learning model,\ntypically referred to as a black box. By visualizing the focus areas of the\nmodel within the images, Grad-CAM fosters interpretability and builds trust in\nthe decision-making process of the model. Our proposed framework achieves\nsuperior performance, particularly with the DenseNet121 pre-trained network,\nreaching a precision of 98% to identify the stressed class with an overall\naccuracy of 90%. Comparative analysis of existing state-of-the-art object\ndetection algorithms reveals the superiority of our approach in significantly\nhigher precision and accuracy.\n","authors":["Aswini Kumar Patra","Lingaraj Sahoo"],"pdf_url":"https://arxiv.org/pdf/2404.10073v1.pdf","comment":"21 pages, 5 figures"},{"id":"http://arxiv.org/abs/2305.11443v2","updated":"2024-04-15T18:11:29Z","published":"2023-05-19T05:50:24Z","title":"Equivariant Multi-Modality Image Fusion","summary":" Multi-modality image fusion is a technique that combines information from\ndifferent sensors or modalities, enabling the fused image to retain\ncomplementary features from each modality, such as functional highlights and\ntexture details. However, effective training of such fusion models is\nchallenging due to the scarcity of ground truth fusion data. To tackle this\nissue, we propose the Equivariant Multi-Modality imAge fusion (EMMA) paradigm\nfor end-to-end self-supervised learning. Our approach is rooted in the prior\nknowledge that natural imaging responses are equivariant to certain\ntransformations. Consequently, we introduce a novel training paradigm that\nencompasses a fusion module, a pseudo-sensing module, and an equivariant fusion\nmodule. These components enable the net training to follow the principles of\nthe natural sensing-imaging process while satisfying the equivariant imaging\nprior. Extensive experiments confirm that EMMA yields high-quality fusion\nresults for infrared-visible and medical images, concurrently facilitating\ndownstream multi-modal segmentation and detection tasks. The code is available\nat https://github.com/Zhaozixiang1228/MMIF-EMMA.\n","authors":["Zixiang Zhao","Haowen Bai","Jiangshe Zhang","Yulun Zhang","Kai Zhang","Shuang Xu","Dongdong Chen","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2305.11443v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2310.06627v4","updated":"2024-04-15T18:03:26Z","published":"2023-10-10T13:45:59Z","title":"What If the TV Was Off? Examining Counterfactual Reasoning Abilities of\n Multi-modal Language Models","summary":" Counterfactual reasoning, a fundamental aspect of human cognition, involves\ncontemplating alternatives to established facts or past events, significantly\nenhancing our abilities in planning and decision-making. In light of the\nadvancements in current multi-modal large language models, we explore their\neffectiveness in counterfactual reasoning. To facilitate this investigation, we\nintroduce a novel dataset, C-VQA, specifically designed to test the\ncounterfactual reasoning capabilities of modern multi-modal large language\nmodels. This dataset is constructed by infusing original questions with\ncounterfactual presuppositions, spanning various types such as numerical and\nboolean queries. It encompasses a mix of real and synthetic data, representing\na wide range of difficulty levels. 
Our thorough evaluations of contemporary\nvision-language models using this dataset have revealed substantial performance\ndrops, with some models showing up to a 40% decrease, highlighting a\nsignificant gap between current models and human-like vision reasoning\ncapabilities. We hope our dataset will serve as a vital benchmark for\nevaluating the counterfactual reasoning capabilities of models. Code and\ndataset are publicly available at https://bzhao.me/C-VQA/.\n","authors":["Letian Zhang","Xiaotong Zhai","Zhongkai Zhao","Yongshuo Zong","Xin Wen","Bingchen Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.06627v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10054v1","updated":"2024-04-15T18:00:30Z","published":"2024-04-15T18:00:30Z","title":"AIGeN: An Adversarial Approach for Instruction Generation in VLN","summary":" In the last few years, the research interest in Vision-and-Language\nNavigation (VLN) has grown significantly. VLN is a challenging task that\ninvolves an agent following human instructions and navigating in a previously\nunknown environment to reach a specified goal. Recent work in literature\nfocuses on different ways to augment the available datasets of instructions for\nimproving navigation performance by exploiting synthetic training data. In this\nwork, we propose AIGeN, a novel architecture inspired by Generative Adversarial\nNetworks (GANs) that produces meaningful and well-formed synthetic instructions\nto improve navigation agents' performance. The model is composed of a\nTransformer decoder (GPT-2) and a Transformer encoder (BERT). During the\ntraining phase, the decoder generates sentences for a sequence of images\ndescribing the agent's path to a particular point while the encoder\ndiscriminates between real and fake instructions. Experimentally, we evaluate\nthe quality of the generated instructions and perform extensive ablation\nstudies. Additionally, we generate synthetic instructions for 217K trajectories\nusing AIGeN on Habitat-Matterport 3D Dataset (HM3D) and show an improvement in\nthe performance of an off-the-shelf VLN method. The validation analysis of our\nproposal is conducted on REVERIE and R2R and highlights the promising aspects\nof our proposal, achieving state-of-the-art performance.\n","authors":["Niyati Rawal","Roberto Bigazzi","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2404.10054v1.pdf","comment":"Accepted to 7th Multimodal Learning and Applications Workshop (MULA\n 2024) at the IEEE/CVF Conference on Computer Vision and Pattern Recognition\n 2024"},{"id":"http://arxiv.org/abs/2404.09995v1","updated":"2024-04-15T17:59:57Z","published":"2024-04-15T17:59:57Z","title":"Taming Latent Diffusion Model for Neural Radiance Field Inpainting","summary":" Neural Radiance Field (NeRF) is a representation for 3D reconstruction from\nmulti-view images. Despite some recent work showing preliminary success in\nediting a reconstructed NeRF with diffusion prior, they remain struggling to\nsynthesize reasonable geometry in completely uncovered regions. One major\nreason is the high diversity of synthetic contents from the diffusion model,\nwhich hinders the radiance field from converging to a crisp and deterministic\ngeometry. Moreover, applying latent diffusion models on real data often yields\na textural shift incoherent to the image condition due to auto-encoding errors.\nThese two problems are further reinforced with the use of pixel-distance\nlosses. 
To address these issues, we propose tempering the diffusion model's\nstochasticity with per-scene customization and mitigating the textural shift\nwith masked adversarial training. During the analyses, we also found the\ncommonly used pixel and perceptual losses are harmful in the NeRF inpainting\ntask. Through rigorous experiments, our framework yields state-of-the-art NeRF\ninpainting results on various real-world scenes. Project page:\nhttps://hubert0527.github.io/MALD-NeRF\n","authors":["Chieh Hubert Lin","Changil Kim","Jia-Bin Huang","Qinbo Li","Chih-Yao Ma","Johannes Kopf","Ming-Hsuan Yang","Hung-Yu Tseng"],"pdf_url":"https://arxiv.org/pdf/2404.09995v1.pdf","comment":"Project page: https://hubert0527.github.io/MALD-NeRF"},{"id":"http://arxiv.org/abs/2404.09993v1","updated":"2024-04-15T17:59:56Z","published":"2024-04-15T17:59:56Z","title":"No More Ambiguity in 360° Room Layout via Bi-Layout Estimation","summary":" Inherent ambiguity in layout annotations poses significant challenges to\ndeveloping accurate 360{\\deg} room layout estimation models. To address this\nissue, we propose a novel Bi-Layout model capable of predicting two distinct\nlayout types. One stops at ambiguous regions, while the other extends to\nencompass all visible areas. Our model employs two global context embeddings,\nwhere each embedding is designed to capture specific contextual information for\neach layout type. With our novel feature guidance module, the image feature\nretrieves relevant context from these embeddings, generating layout-aware\nfeatures for precise bi-layout predictions. A unique property of our Bi-Layout\nmodel is its ability to inherently detect ambiguous regions by comparing the\ntwo predictions. To circumvent the need for manual correction of ambiguous\nannotations during testing, we also introduce a new metric for disambiguating\nground truth layouts. Our method demonstrates superior performance on benchmark\ndatasets, notably outperforming leading approaches. Specifically, on the\nMatterportLayout dataset, it improves 3DIoU from 81.70% to 82.57% across the\nfull test set and notably from 54.80% to 59.97% in subsets with significant\nambiguity. Project page: https://liagm.github.io/Bi_Layout/\n","authors":["Yu-Ju Tsai","Jin-Cheng Jhang","Jingjing Zheng","Wei Wang","Albert Y. C. Chen","Min Sun","Cheng-Hao Kuo","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.09993v1.pdf","comment":"CVPR 2024, Project page: https://liagm.github.io/Bi_Layout/"},{"id":"http://arxiv.org/abs/2404.09992v1","updated":"2024-04-15T17:59:50Z","published":"2024-04-15T17:59:50Z","title":"MMInA: Benchmarking Multihop Multimodal Internet Agents","summary":" Autonomous embodied agents live on an Internet of multimedia websites. Can\nthey hop around multimodal websites to complete complex user tasks? Existing\nbenchmarks fail to assess them in a realistic, evolving environment for their\nembodiment across websites. To answer this question, we present MMInA, a\nmultihop and multimodal benchmark to evaluate the embodied agents for\ncompositional Internet tasks, with several appealing properties: 1) Evolving\nreal-world multimodal websites. Our benchmark uniquely operates on evolving\nreal-world websites, ensuring a high degree of realism and applicability to\nnatural user tasks. Our data includes 1,050 human-written tasks covering\nvarious domains such as shopping and travel, with each task requiring the agent\nto autonomously extract multimodal information from web pages as observations;\n2) Multihop web browsing. 
Our dataset features naturally compositional tasks\nthat require information from or actions on multiple websites to solve, to\nassess long-range reasoning capabilities on web tasks; 3) Holistic evaluation.\nWe propose a novel protocol for evaluating an agent's progress in completing\nmultihop tasks. We experiment with both standalone (multimodal) language models\nand heuristic-based web agents. Extensive experiments demonstrate that while\nlong-chain multihop web tasks are easy for humans, they remain challenging for\nstate-of-the-art web agents. We identify that agents are more likely to fail on\nthe early hops when solving tasks of more hops, which results in lower task\nsuccess rates. To address this issue, we propose a simple memory augmentation\napproach replaying past action trajectories to reflect. Our method\nsignificantly improved both the single-hop and multihop web browsing abilities\nof agents. See our code and data at https://mmina.cliangyu.com\n","authors":["Ziniu Zhang","Shulin Tian","Liangyu Chen","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2404.09992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09991v1","updated":"2024-04-15T17:59:47Z","published":"2024-04-15T17:59:47Z","title":"EgoPet: Egomotion and Interaction Data from an Animal's Perspective","summary":" Animals perceive the world to plan their actions and interact with other\nagents to accomplish complex tasks, demonstrating capabilities that are still\nunmatched by AI systems. To advance our understanding and reduce the gap\nbetween the capabilities of animals and AI systems, we introduce a dataset of\npet egomotion imagery with diverse examples of simultaneous egomotion and\nmulti-agent interaction. Current video datasets separately contain egomotion\nand interaction examples, but rarely both at the same time. In addition, EgoPet\noffers a radically distinct perspective from existing egocentric datasets of\nhumans or vehicles. We define two in-domain benchmark tasks that capture animal\nbehavior, and a third benchmark to assess the utility of EgoPet as a\npretraining resource to robotic quadruped locomotion, showing that models\ntrained from EgoPet outperform those trained from prior datasets.\n","authors":["Amir Bar","Arya Bakhtiar","Danny Tran","Antonio Loquercio","Jathushan Rajasegaran","Yann LeCun","Amir Globerson","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2404.09991v1.pdf","comment":"https://www.amirbar.net/egopet"},{"id":"http://arxiv.org/abs/2404.09990v1","updated":"2024-04-15T17:59:31Z","published":"2024-04-15T17:59:31Z","title":"HQ-Edit: A High-Quality Dataset for Instruction-based Image Editing","summary":" This study introduces HQ-Edit, a high-quality instruction-based image editing\ndataset with around 200,000 edits. Unlike prior approaches relying on attribute\nguidance or human feedback on building datasets, we devise a scalable data\ncollection pipeline leveraging advanced foundation models, namely GPT-4V and\nDALL-E 3. To ensure its high quality, diverse examples are first collected\nonline, expanded, and then used to create high-quality diptychs featuring input\nand output images with detailed text prompts, followed by precise alignment\nensured through post-processing. In addition, we propose two evaluation\nmetrics, Alignment and Coherence, to quantitatively assess the quality of image\nedit pairs using GPT-4V. 
HQ-Edit's high-resolution images, rich in detail and\naccompanied by comprehensive editing prompts, substantially enhance the\ncapabilities of existing image editing models. For example, an HQ-Edit\nfinetuned InstructPix2Pix can attain state-of-the-art image editing\nperformance, even surpassing those models fine-tuned with human-annotated data.\nThe project page is https://thefllood.github.io/HQEdit_web.\n","authors":["Mude Hui","Siwei Yang","Bingchen Zhao","Yichun Shi","Heng Wang","Peng Wang","Yuyin Zhou","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2404.09990v1.pdf","comment":"Project Page: https://thefllood.github.io/HQEdit_web"},{"id":"http://arxiv.org/abs/2404.09988v1","updated":"2024-04-15T17:59:04Z","published":"2024-04-15T17:59:04Z","title":"in2IN: Leveraging individual Information to Generate Human INteractions","summary":" Generating human-human motion interactions conditioned on textual\ndescriptions is a very useful application in many areas such as robotics,\ngaming, animation, and the metaverse. Alongside this utility also comes a great\ndifficulty in modeling the highly dimensional inter-personal dynamics. In\naddition, properly capturing the intra-personal diversity of interactions has a\nlot of challenges. Current methods generate interactions with limited diversity\nof intra-person dynamics due to the limitations of the available datasets and\nconditioning strategies. For this, we introduce in2IN, a novel diffusion model\nfor human-human motion generation which is conditioned not only on the textual\ndescription of the overall interaction but also on the individual descriptions\nof the actions performed by each person involved in the interaction. To train\nthis model, we use a large language model to extend the InterHuman dataset with\nindividual descriptions. As a result, in2IN achieves state-of-the-art\nperformance in the InterHuman dataset. Furthermore, in order to increase the\nintra-personal diversity on the existing interaction datasets, we propose\nDualMDM, a model composition technique that combines the motions generated with\nin2IN and the motions generated by a single-person motion prior pre-trained on\nHumanML3D. As a result, DualMDM generates motions with higher individual\ndiversity and improves control over the intra-person dynamics while maintaining\ninter-personal coherence.\n","authors":["Pablo Ruiz Ponce","German Barquero","Cristina Palmero","Sergio Escalera","Jose Garcia-Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2404.09988v1.pdf","comment":"Project page: https://pabloruizponce.github.io/in2IN/"},{"id":"http://arxiv.org/abs/2404.09987v1","updated":"2024-04-15T17:58:57Z","published":"2024-04-15T17:58:57Z","title":"OneChart: Purify the Chart Structural Extraction via One Auxiliary Token","summary":" Chart parsing poses a significant challenge due to the diversity of styles,\nvalues, texts, and so forth. Even advanced large vision-language models (LVLMs)\nwith billions of parameters struggle to handle such tasks satisfactorily. To\naddress this, we propose OneChart: a reliable agent specifically devised for\nthe structural extraction of chart information. Similar to popular LVLMs,\nOneChart incorporates an autoregressive main body. Uniquely, to enhance the\nreliability of the numerical parts of the output, we introduce an auxiliary\ntoken placed at the beginning of the total tokens along with an additional\ndecoder. 
The numerically optimized (auxiliary) token allows subsequent tokens\nfor chart parsing to capture enhanced numerical features through causal\nattention. Furthermore, with the aid of the auxiliary token, we have devised a\nself-evaluation mechanism that enables the model to gauge the reliability of\nits chart parsing results by providing confidence scores for the generated\ncontent. Compared to current state-of-the-art (SOTA) chart parsing models,\ne.g., DePlot, ChartVLM, ChartAst, OneChart significantly outperforms in Average\nPrecision (AP) for chart structural extraction across multiple public\nbenchmarks, despite enjoying only 0.2 billion parameters. Moreover, as a chart\nparsing agent, it also brings 10%+ accuracy gains for the popular LVLM\n(LLaVA-1.6) in the downstream ChartQA benchmark.\n","authors":["Jinyue Chen","Lingyu Kong","Haoran Wei","Chenglong Liu","Zheng Ge","Liang Zhao","Jianjian Sun","Chunrui Han","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.09987v1.pdf","comment":"14 pages, 9 figures and 6 tables"},{"id":"http://arxiv.org/abs/2404.09979v1","updated":"2024-04-15T17:56:05Z","published":"2024-04-15T17:56:05Z","title":"One-Click Upgrade from 2D to 3D: Sandwiched RGB-D Video Compression for\n Stereoscopic Teleconferencing","summary":" Stereoscopic video conferencing is still challenging due to the need to\ncompress stereo RGB-D video in real-time. Though hardware implementations of\nstandard video codecs such as H.264 / AVC and HEVC are widely available, they\nare not designed for stereoscopic videos and suffer from reduced quality and\nperformance. Specific multiview or 3D extensions of these codecs are complex\nand lack efficient implementations. In this paper, we propose a new approach to\nupgrade a 2D video codec to support stereo RGB-D video compression, by wrapping\nit with a neural pre- and post-processor pair. The neural networks are\nend-to-end trained with an image codec proxy, and shown to work with a more\nsophisticated video codec. We also propose a geometry-aware loss function to\nimprove rendering quality. We train the neural pre- and post-processors on a\nsynthetic 4D people dataset, and evaluate it on both synthetic and\nreal-captured stereo RGB-D videos. Experimental results show that the neural\nnetworks generalize well to unseen data and work out-of-box with various video\ncodecs. Our approach saves about 30% bit-rate compared to a conventional video\ncoding scheme and MV-HEVC at the same level of rendering quality from a novel\nview, without the need of a task-specific hardware upgrade.\n","authors":["Yueyu Hu","Onur G. Guleryuz","Philip A. Chou","Danhang Tang","Jonathan Taylor","Rus Maxham","Yao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.09979v1.pdf","comment":"Accepted by CVPR 2024 Workshop (AIS: Vision, Graphics and AI for\n Streaming https://ai4streaming-workshop.github.io )"},{"id":"http://arxiv.org/abs/2404.09977v1","updated":"2024-04-15T17:55:56Z","published":"2024-04-15T17:55:56Z","title":"MaxFusion: Plug&Play Multi-Modal Generation in Text-to-Image Diffusion\n Models","summary":" Large diffusion-based Text-to-Image (T2I) models have shown impressive\ngenerative powers for text-to-image generation as well as spatially conditioned\nimage generation. For most applications, we can train the model end-to-end with\npaired data to obtain photorealistic generation quality. However, to add an\nadditional task, one often needs to retrain the model from scratch using paired\ndata across all modalities to retain good generation performance. 
In this\npaper, we tackle this issue and propose a novel strategy to scale a generative\nmodel across new tasks with minimal compute. During our experiments, we\ndiscovered that the variance maps of intermediate feature maps of diffusion\nmodels capture the intensity of conditioning. Utilizing this prior information,\nwe propose MaxFusion, an efficient strategy to scale up text-to-image\ngeneration models to accommodate new modality conditions. Specifically, we\ncombine aligned features of multiple models, hence bringing a compositional\neffect. Our fusion strategy can be integrated into off-the-shelf models to\nenhance their generative prowess.\n","authors":["Nithin Gopalakrishnan Nair","Jeya Maria Jose Valanarasu","Vishal M Patel"],"pdf_url":"https://arxiv.org/pdf/2404.09977v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09976v1","updated":"2024-04-15T17:55:43Z","published":"2024-04-15T17:55:43Z","title":"Diffscaler: Enhancing the Generative Prowess of Diffusion Transformers","summary":" Recently, diffusion transformers have gained wide attention with their\nexcellent performance in text-to-image and text-to-video models, emphasizing\nthe need for transformers as a backbone for diffusion models. Transformer-based\nmodels have shown better generalization capability compared to CNN-based models\nfor general vision tasks. However, much less has been explored in the existing\nliterature regarding the capabilities of transformer-based diffusion backbones\nand expanding their generative prowess to other datasets. This paper focuses on\nenabling a single pre-trained diffusion transformer model to scale across\nmultiple datasets swiftly, allowing for the completion of diverse generative\ntasks using just one model. To this end, we propose DiffScaler, an efficient\nscaling strategy for diffusion models where we train a minimal amount of\nparameters to adapt to different tasks. In particular, we learn task-specific\ntransformations at each layer by incorporating the ability to utilize the\nlearned subspaces of the pre-trained model, as well as the ability to learn\nadditional task-specific subspaces, which may be absent in the pre-training\ndataset. As these parameters are independent, a single diffusion model with\nthese task-specific parameters can be used to perform multiple tasks\nsimultaneously. Moreover, we find that transformer-based diffusion models\nsignificantly outperform CNN-based diffusion models while performing\nfine-tuning over smaller datasets. We perform experiments on four unconditional\nimage generation datasets. We show that using our proposed method, a single\npre-trained model can scale up to perform these conditional and unconditional\ntasks, respectively, with minimal parameter tuning while performing as close as\nfine-tuning an entire diffusion model for that particular task.\n","authors":["Nithin Gopalakrishnan Nair","Jeya Maria Jose Valanarasu","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2404.09976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09967v1","updated":"2024-04-15T17:45:36Z","published":"2024-04-15T17:45:36Z","title":"Ctrl-Adapter: An Efficient and Versatile Framework for Adapting Diverse\n Controls to Any Diffusion Model","summary":" ControlNets are widely used for adding spatial control in image generation\nwith different conditions, such as depth maps, canny edges, and human poses.\nHowever, there are several challenges when leveraging the pretrained image\nControlNets for controlled video generation. 
First, pretrained ControlNet\ncannot be directly plugged into new backbone models due to the mismatch of\nfeature spaces, and the cost of training ControlNets for new backbones is a big\nburden. Second, ControlNet features for different frames might not effectively\nhandle the temporal consistency. To address these challenges, we introduce\nCtrl-Adapter, an efficient and versatile framework that adds diverse controls\nto any image/video diffusion models, by adapting pretrained ControlNets (and\nimproving temporal alignment for videos). Ctrl-Adapter provides diverse\ncapabilities including image control, video control, video control with sparse\nframes, multi-condition control, compatibility with different backbones,\nadaptation to unseen control conditions, and video editing. In Ctrl-Adapter, we\ntrain adapter layers that fuse pretrained ControlNet features to different\nimage/video diffusion models, while keeping the parameters of the ControlNets\nand the diffusion models frozen. Ctrl-Adapter consists of temporal and spatial\nmodules so that it can effectively handle the temporal consistency of videos.\nWe also propose latent skipping and inverse timestep sampling for robust\nadaptation and sparse control. Moreover, Ctrl-Adapter enables control from\nmultiple conditions by simply taking the (weighted) average of ControlNet\noutputs. With diverse image/video diffusion backbones (SDXL, Hotshot-XL,\nI2VGen-XL, and SVD), Ctrl-Adapter matches ControlNet for image control and\noutperforms all baselines for video control (achieving the SOTA accuracy on the\nDAVIS 2017 dataset) with significantly lower computational costs (less than 10\nGPU hours).\n","authors":["Han Lin","Jaemin Cho","Abhay Zala","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2404.09967v1.pdf","comment":"First two authors contributed equally; Project page:\n https://ctrl-adapter.github.io/"},{"id":"http://arxiv.org/abs/2404.09964v1","updated":"2024-04-15T17:40:23Z","published":"2024-04-15T17:40:23Z","title":"Design and Analysis of Efficient Attention in Transformers for Social\n Group Activity Recognition","summary":" Social group activity recognition is a challenging task extended from group\nactivity recognition, where social groups must be recognized with their\nactivities and group members. Existing methods tackle this task by leveraging\nregion features of individuals following existing group activity recognition\nmethods. However, the effectiveness of region features is susceptible to person\nlocalization and variable semantics of individual actions. To overcome these\nissues, we propose leveraging attention modules in transformers to generate\nsocial group features. In this method, multiple embeddings are used to\naggregate features for a social group, each of which is assigned to a group\nmember without duplication. Due to this non-duplicated assignment, the number\nof embeddings must be significant to avoid missing group members and thus\nrenders attention in transformers ineffective. To find optimal attention\ndesigns with a large number of embeddings, we explore several design choices of\nqueries for feature aggregation and self-attention modules in transformer\ndecoders. 
Extensive experimental results show that the proposed method achieves\nstate-of-the-art performance and verify that the proposed attention designs are\nhighly effective on social group activity recognition.\n","authors":["Masato Tamura"],"pdf_url":"https://arxiv.org/pdf/2404.09964v1.pdf","comment":"Accepted to IJCV, preprint version"},{"id":"http://arxiv.org/abs/2404.09961v1","updated":"2024-04-15T17:38:47Z","published":"2024-04-15T17:38:47Z","title":"Ti-Patch: Tiled Physical Adversarial Patch for no-reference video\n quality metrics","summary":" Objective no-reference image- and video-quality metrics are crucial in many\ncomputer vision tasks. However, state-of-the-art no-reference metrics have\nbecome learning-based and are vulnerable to adversarial attacks. The\nvulnerability of quality metrics imposes restrictions on using such metrics in\nquality control systems and comparing objective algorithms. Also, using\nvulnerable metrics as a loss for deep learning model training can mislead\ntraining to worsen visual quality. Because of that, quality metrics testing for\nvulnerability is a task of current interest. This paper proposes a new method\nfor testing quality metrics vulnerability in the physical space. To our\nknowledge, quality metrics were not previously tested for vulnerability to this\nattack; they were only tested in the pixel space. We applied a physical\nadversarial Ti-Patch (Tiled Patch) attack to quality metrics and did\nexperiments both in pixel and physical space. We also performed experiments on\nthe implementation of physical adversarial wallpaper. The proposed method can\nbe used as additional quality metrics in vulnerability evaluation,\ncomplementing traditional subjective comparison and vulnerability tests in the\npixel space. We made our code and adversarial videos available on GitHub:\nhttps://github.com/leonenkova/Ti-Patch.\n","authors":["Victoria Leonenkova","Ekaterina Shumitskaya","Anastasia Antsiferova","Dmitriy Vatolin"],"pdf_url":"https://arxiv.org/pdf/2404.09961v1.pdf","comment":"Accepted to WAIT AINL 2024"},{"id":"http://arxiv.org/abs/2404.09957v1","updated":"2024-04-15T17:31:32Z","published":"2024-04-15T17:31:32Z","title":"How to build the best medical image segmentation algorithm using\n foundation models: a comprehensive empirical study with Segment Anything\n Model","summary":" Automated segmentation is a fundamental medical image analysis task, which\nenjoys significant advances due to the advent of deep learning. While\nfoundation models have been useful in natural language processing and some\nvision tasks for some time, the foundation model developed with image\nsegmentation in mind - Segment Anything Model (SAM) - has been developed only\nrecently and has shown similar promise. However, there are still no systematic\nanalyses or ``best-practice'' guidelines for optimal fine-tuning of SAM for\nmedical image segmentation. This work summarizes existing fine-tuning\nstrategies with various backbone architectures, model components, and\nfine-tuning algorithms across 18 combinations, and evaluates them on 17\ndatasets covering all common radiology modalities. 
Our study reveals that (1)\nfine-tuning SAM leads to slightly better performance than previous segmentation\nmethods, (2) fine-tuning strategies that use parameter-efficient learning in\nboth the encoder and decoder are superior to other strategies, (3) network\narchitecture has a small impact on final performance, (4) further training SAM\nwith self-supervised learning can improve final model performance. We also\ndemonstrate the ineffectiveness of some methods popular in the literature and\nfurther expand our experiments into few-shot and prompt-based settings. Lastly,\nwe released our code and MRI-specific fine-tuned weights, which consistently\nobtained superior performance over the original SAM, at\nhttps://github.com/mazurowski-lab/finetune-SAM.\n","authors":["Hanxue Gu","Haoyu Dong","Jichen Yang","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2404.09957v1.pdf","comment":"Code available at https://github.com/mazurowski-lab/finetune-SAM"},{"id":"http://arxiv.org/abs/2404.10034v1","updated":"2024-04-15T17:25:21Z","published":"2024-04-15T17:25:21Z","title":"Realistic Model Selection for Weakly Supervised Object Localization","summary":" Weakly Supervised Object Localization (WSOL) allows for training deep\nlearning models for classification and localization, using only global\nclass-level labels. The lack of bounding box (bbox) supervision during training\nrepresents a considerable challenge for hyper-parameter search and model\nselection. Earlier WSOL works implicitly observed localization performance over\na test set which leads to biased performance evaluation. More recently, a\nbetter WSOL protocol has been proposed, where a validation set with bbox\nannotations is held out for model selection. Although it does not rely on the\ntest set, this protocol is unrealistic since bboxes are not available in\nreal-world applications, and when available, it is better to use them directly\nto fit model weights. Our initial empirical analysis shows that the\nlocalization performance of a model declines significantly when using only\nimage-class labels for model selection (compared to using bounding-box\nannotations). This suggests that adding bounding-box labels is preferable for\nselecting the best model for localization. In this paper, we introduce a new\nWSOL validation protocol that provides a localization signal without the need\nfor manual bbox annotations. In particular, we leverage noisy pseudo boxes from\nan off-the-shelf ROI proposal generator such as Selective-Search, CLIP, and RPN\npretrained models for model selection. Our experimental results with several\nWSOL methods on ILSVRC and CUB-200-2011 datasets show that our noisy boxes\nallow selecting models with performance close to those selected using ground\ntruth boxes, and better than models selected using only image-class labels.\n","authors":["Shakeeb Murtaza","Soufiane Belharbi","Marco Pedersoli","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2404.10034v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.09951v1","updated":"2024-04-15T17:24:57Z","published":"2024-04-15T17:24:57Z","title":"Unifying Global and Local Scene Entities Modelling for Precise Action\n Spotting","summary":" Sports videos pose complex challenges, including cluttered backgrounds,\ncamera angle changes, small action-representing objects, and imbalanced action\nclass distribution. 
Existing methods for detecting actions in sports videos\nheavily rely on global features, utilizing a backbone network as a black box\nthat encompasses the entire spatial frame. However, these approaches tend to\noverlook the nuances of the scene and struggle with detecting actions that\noccupy a small portion of the frame. In particular, they face difficulties when\ndealing with action classes involving small objects, such as balls or\nyellow/red cards in soccer, which only occupy a fraction of the screen space.\nTo address these challenges, we introduce a novel approach that analyzes and\nmodels scene entities using an adaptive attention mechanism. Particularly, our\nmodel disentangles the scene content into the global environment feature and\nlocal relevant scene entities feature. To efficiently extract environmental\nfeatures while considering temporal information with less computational cost,\nwe propose the use of a 2D backbone network with a time-shift mechanism. To\naccurately capture relevant scene entities, we employ a Vision-Language model\nin conjunction with the adaptive attention mechanism. Our model has\ndemonstrated outstanding performance, securing the 1st place in the\nSoccerNet-v2 Action Spotting, FineDiving, and FineGym challenge with a\nsubstantial performance improvement of 1.6, 2.0, and 1.3 points in avg-mAP\ncompared to the runner-up methods. Furthermore, our approach offers\ninterpretability capabilities in contrast to other deep learning models, which\nare often designed as black boxes. Our code and models are released at:\nhttps://github.com/Fsoft-AIC/unifying-global-local-feature.\n","authors":["Kim Hoang Tran","Phuc Vuong Do","Ngoc Quoc Ly","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2404.09951v1.pdf","comment":"Accepted to IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.09942v1","updated":"2024-04-15T17:11:25Z","published":"2024-04-15T17:11:25Z","title":"Knowledge-enhanced Visual-Language Pretraining for Computational\n Pathology","summary":" In this paper, we consider the problem of visual representation learning for\ncomputational pathology, by exploiting large-scale image-text pairs gathered\nfrom public resources, along with the domain specific knowledge in pathology.\nSpecifically, we make the following contributions: (i) We curate a pathology\nknowledge tree that consists of 50,470 informative attributes for 4,718\ndiseases requiring pathology diagnosis from 32 human tissues. 
To our knowledge,\nthis is the first comprehensive structured pathology knowledge base; (ii) We\ndevelop a knowledge-enhanced visual-language pretraining approach, where we\nfirst project pathology-specific knowledge into latent embedding space via\nlanguage model, and use it to guide the visual representation learning; (iii)\nWe conduct thorough experiments to validate the effectiveness of our proposed\ncomponents, demonstrating significant performance improvement on various\ndownstream tasks, including cross-modal retrieval, zero-shot classification on\npathology patches, and zero-shot tumor subtyping on whole slide images (WSIs).\nAll codes, models and the pathology knowledge tree will be released to the\nresearch community\n","authors":["Xiao Zhou","Xiaoman Zhang","Chaoyi Wu","Ya Zhang","Weidi Xie","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.09942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09941v1","updated":"2024-04-15T17:09:53Z","published":"2024-04-15T17:09:53Z","title":"Evolving Interpretable Visual Classifiers with Large Language Models","summary":" Multimodal pre-trained models, such as CLIP, are popular for zero-shot\nclassification due to their open-vocabulary flexibility and high performance.\nHowever, vision-language models, which compute similarity scores between images\nand class labels, are largely black-box, with limited interpretability, risk\nfor bias, and inability to discover new visual concepts not written down.\nMoreover, in practical settings, the vocabulary for class names and attributes\nof specialized concepts will not be known, preventing these methods from\nperforming well on images uncommon in large-scale vision-language datasets. To\naddress these limitations, we present a novel method that discovers\ninterpretable yet discriminative sets of attributes for visual recognition. We\nintroduce an evolutionary search algorithm that uses a large language model and\nits in-context learning abilities to iteratively mutate a concept bottleneck of\nattributes for classification. Our method produces state-of-the-art,\ninterpretable fine-grained classifiers. We outperform the latest baselines by\n18.4% on five fine-grained iNaturalist datasets and by 22.2% on two KikiBouba\ndatasets, despite the baselines having access to privileged information about\nclass names.\n","authors":["Mia Chiquier","Utkarsh Mall","Carl Vondrick"],"pdf_url":"https://arxiv.org/pdf/2404.09941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09940v1","updated":"2024-04-15T17:08:53Z","published":"2024-04-15T17:08:53Z","title":"eMotion-GAN: A Motion-based GAN for Photorealistic and Facial Expression\n Preserving Frontal View Synthesis","summary":" Many existing facial expression recognition (FER) systems encounter\nsubstantial performance degradation when faced with variations in head pose.\nNumerous frontalization methods have been proposed to enhance these systems'\nperformance under such conditions. However, they often introduce undesirable\ndeformations, rendering them less suitable for precise facial expression\nanalysis. In this paper, we present eMotion-GAN, a novel deep learning approach\ndesigned for frontal view synthesis while preserving facial expressions within\nthe motion domain. Considering the motion induced by head variation as noise\nand the motion induced by facial expression as the relevant information, our\nmodel is trained to filter out the noisy motion in order to retain only the\nmotion related to facial expression. 
The filtered motion is then mapped onto a\nneutral frontal face to generate the corresponding expressive frontal face. We\nconducted extensive evaluations using several widely recognized dynamic FER\ndatasets, which encompass sequences exhibiting various degrees of head pose\nvariations in both intensity and orientation. Our results demonstrate the\neffectiveness of our approach in significantly reducing the FER performance gap\nbetween frontal and non-frontal faces. Specifically, we achieved a FER\nimprovement of up to +5\\% for small pose variations and up to +20\\% improvement\nfor larger pose variations. Code available at\n\\url{https://github.com/o-ikne/eMotion-GAN.git}.\n","authors":["Omar Ikne","Benjamin Allaert","Ioan Marius Bilasco","Hazem Wannous"],"pdf_url":"https://arxiv.org/pdf/2404.09940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02692v2","updated":"2024-04-15T17:01:23Z","published":"2023-10-04T10:03:07Z","title":"Clustering-based Image-Text Graph Matching for Domain Generalization","summary":" Learning domain-invariant visual representations is important to train a\nmodel that can generalize well to unseen target task domains. Recent works\ndemonstrate that text descriptions contain high-level class-discriminative\ninformation and such auxiliary semantic cues can be used as effective pivot\nembedding for domain generalization problem. However, they use pivot embedding\nin global manner (i.e., aligning an image embedding with sentence-level text\nembedding), not fully utilizing the semantic cues of given text description. In\nthis work, we advocate for the use of local alignment between image regions and\ncorresponding textual descriptions. To this end, we first represent image and\ntext inputs with graphs. We subsequently cluster nodes in those graphs and\nmatch the graph-based image node features into textual graphs. This matching\nprocess is conducted globally and locally, tightly aligning visual and textual\nsemantic sub-structures. We experiment with large-scale public datasets, such\nas CUB-DG and DomainBed, and our model achieves matched or better\nstate-of-the-art performance on these datasets. Our code will be publicly\navailable upon publication.\n","authors":["Nokyung Park","Daewon Chae","Jeongyong Shim","Sangpil Kim","Eun-Sol Kim","Jinkyu Kim"],"pdf_url":"https://arxiv.org/pdf/2310.02692v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09933v1","updated":"2024-04-15T16:59:00Z","published":"2024-04-15T16:59:00Z","title":"HOI-Ref: Hand-Object Interaction Referral in Egocentric Vision","summary":" Large Vision Language Models (VLMs) are now the de facto state-of-the-art for\na number of tasks including visual question answering, recognising objects, and\nspatial referral. In this work, we propose the HOI-Ref task for egocentric\nimages that aims to understand interactions between hands and objects using\nVLMs. To enable HOI-Ref, we curate the HOI-QA dataset that consists of 3.9M\nquestion-answer pairs for training and evaluating VLMs. HOI-QA includes\nquestions relating to locating hands, objects, and critically their\ninteractions (e.g. referring to the object being manipulated by the hand). We\ntrain the first VLM for HOI-Ref on this dataset and call it VLM4HOI. Our\nresults demonstrate that VLMs trained for referral on third person images fail\nto recognise and refer hands and objects in egocentric images. 
When fine-tuned\non our egocentric HOI-QA dataset, performance improves by 27.9% for referring\nhands and objects, and by 26.7% for referring interactions.\n","authors":["Siddhant Bansal","Michael Wray","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2404.09933v1.pdf","comment":"Project Page: https://sid2697.github.io/hoi-ref/"},{"id":"http://arxiv.org/abs/2404.09931v1","updated":"2024-04-15T16:56:58Z","published":"2024-04-15T16:56:58Z","title":"Zero-shot detection of buildings in mobile LiDAR using Language Vision\n Model","summary":" Recent advances have demonstrated that Language Vision Models (LVMs) surpass\nthe existing State-of-the-Art (SOTA) in two-dimensional (2D) computer vision\ntasks, motivating attempts to apply LVMs to three-dimensional (3D) data. While\nLVMs are efficient and effective in addressing various downstream 2D vision\ntasks without training, they face significant challenges when it comes to point\nclouds, a representative format for representing 3D data. It is more difficult\nto extract features from 3D data and there are challenges due to large data\nsizes and the cost of the collection and labelling, resulting in a notably\nlimited availability of datasets. Moreover, constructing LVMs for point clouds\nis even more challenging due to the requirements for large amounts of data and\ntraining time. To address these issues, our research aims to 1) apply the\nGrounded SAM through Spherical Projection to transfer 3D to 2D, and 2)\nexperiment with synthetic data to evaluate its effectiveness in bridging the\ngap between synthetic and real-world data domains. Our approach exhibited high\nperformance with an accuracy of 0.96, an IoU of 0.85, precision of 0.92, recall\nof 0.91, and an F1 score of 0.92, confirming its potential. However, challenges\nsuch as occlusion problems and pixel-level overlaps of multi-label points\nduring spherical image generation remain to be addressed in future studies.\n","authors":["June Moh Goo","Zichao Zeng","Jan Boehm"],"pdf_url":"https://arxiv.org/pdf/2404.09931v1.pdf","comment":"7 pages, 6 figures, conference"},{"id":"http://arxiv.org/abs/2403.17192v2","updated":"2024-04-15T16:55:38Z","published":"2024-03-25T21:08:26Z","title":"Strategies to Improve Real-World Applicability of Laparoscopic Anatomy\n Segmentation Models","summary":" Accurate identification and localization of anatomical structures of varying\nsize and appearance in laparoscopic imaging are necessary to leverage the\npotential of computer vision techniques for surgical decision support.\nSegmentation performance of such models is traditionally reported using metrics\nof overlap such as IoU. However, imbalanced and unrealistic representation of\nclasses in the training data and suboptimal selection of reported metrics have\nthe potential to skew nominal segmentation performance and thereby ultimately\nlimit clinical translation. In this work, we systematically analyze the impact\nof class characteristics (i.e., organ size differences), training and test data\ncomposition (i.e., representation of positive and negative examples), and\nmodeling parameters (i.e., foreground-to-background class weight) on eight\nsegmentation metrics: accuracy, precision, recall, IoU, F1 score (Dice\nSimilarity Coefficient), specificity, Hausdorff Distance, and Average Symmetric\nSurface Distance. 
Our findings support two adjustments to account for data\nbiases in surgical data science: First, training on datasets that are similar\nto the clinical real-world scenarios in terms of class distribution, and\nsecond, class weight adjustments to optimize segmentation model performance\nwith regard to metrics of particular relevance in the respective clinical\nsetting.\n","authors":["Fiona R. Kolbinger","Jiangpeng He","Jinge Ma","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.17192v2.pdf","comment":"14 pages, 5 figures, 4 tables; accepted for the workshop \"Data\n Curation and Augmentation in Medical Imaging\" at CVPR 2024 (archival track)"},{"id":"http://arxiv.org/abs/2404.09921v1","updated":"2024-04-15T16:47:22Z","published":"2024-04-15T16:47:22Z","title":"Zero-shot Building Age Classification from Facade Image Using GPT-4","summary":" A building's age of construction is crucial for supporting many geospatial\napplications. Much current research focuses on estimating building age from\nfacade images using deep learning. However, building an accurate deep learning\nmodel requires a considerable amount of labelled training data, and the trained\nmodels often have geographical constraints. Recently, large pre-trained vision\nlanguage models (VLMs) such as GPT-4 Vision, which demonstrate significant\ngeneralisation capabilities, have emerged as potential training-free tools for\ndealing with specific vision tasks, but their applicability and reliability for\nbuilding information remain unexplored. In this study, a zero-shot building age\nclassifier for facade images is developed using prompts that include logical\ninstructions. Taking London as a test case, we introduce a new dataset,\nFI-London, comprising facade images and building age epochs. Although the\ntraining-free classifier achieved a modest accuracy of 39.69%, the mean\nabsolute error of 0.85 decades indicates that the model can predict building\nage epochs successfully albeit with a small bias. The ensuing discussion\nreveals that the classifier struggles to predict the age of very old buildings\nand is challenged by fine-grained predictions within 2 decades. Overall, the\nclassifier utilising GPT-4 Vision is capable of predicting the rough age epoch\nof a building from a single facade image without any training.\n","authors":["Zichao Zeng","June Moh Goo","Xinglei Wang","Bin Chi","Meihui Wang","Jan Boehm"],"pdf_url":"https://arxiv.org/pdf/2404.09921v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09918v1","updated":"2024-04-15T16:45:08Z","published":"2024-04-15T16:45:08Z","title":"EdgeRelight360: Text-Conditioned 360-Degree HDR Image Generation for\n Real-Time On-Device Video Portrait Relighting","summary":" In this paper, we present EdgeRelight360, an approach for real-time video\nportrait relighting on mobile devices, utilizing text-conditioned generation of\n360-degree high dynamic range image (HDRI) maps. Our method proposes a\ndiffusion-based text-to-360-degree image generation in the HDR domain, taking\nadvantage of the HDR10 standard. This technique facilitates the generation of\nhigh-quality, realistic lighting conditions from textual descriptions, offering\nflexibility and control in portrait video relighting task. Unlike the previous\nrelighting frameworks, our proposed system performs video relighting directly\non-device, enabling real-time inference with real 360-degree HDRI maps. 
This\non-device processing ensures both privacy and low runtime, providing\nan immediate response to changes in lighting conditions or user inputs. Our\napproach paves the way for new possibilities in real-time video applications,\nincluding video conferencing, gaming, and augmented reality, by allowing\ndynamic, text-based control of lighting conditions.\n","authors":["Min-Hui Lin","Mahesh Reddy","Guillaume Berger","Michel Sarkis","Fatih Porikli","Ning Bi"],"pdf_url":"https://arxiv.org/pdf/2404.09918v1.pdf","comment":"Camera-ready version (CVPR workshop - EDGE'24)"},{"id":"http://arxiv.org/abs/2311.01908v3","updated":"2024-04-15T16:43:57Z","published":"2023-11-03T13:38:42Z","title":"LLM-driven Multimodal Target Volume Contouring in Radiation Oncology","summary":" Target volume contouring for radiation therapy is considered significantly\nmore challenging than the normal organ segmentation tasks as it necessitates\nthe utilization of both image and text-based clinical information. Inspired by\nthe recent advancement of large language models (LLMs) that can facilitate the\nintegration of the textual information and images, here we present a novel\nLLM-driven multimodal AI, namely LLMSeg, that utilizes the clinical text\ninformation and is applicable to the challenging task of target volume\ncontouring for radiation therapy, and validate it within the context of breast\ncancer radiation therapy target volume contouring. Using external validation\nand data-insufficient environments, which are attributes highly conducive to\nreal-world applications, we demonstrate that the proposed model exhibits\nmarkedly improved performance compared to conventional unimodal AI models,\nparticularly exhibiting robust generalization performance and data efficiency.\nTo the best of our knowledge, this is the first LLM-driven multimodal AI model that\nintegrates the clinical text information into target volume delineation for\nradiation oncology.\n","authors":["Yujin Oh","Sangjoon Park","Hwa Kyung Byun","Yeona Cho","Ik Jae Lee","Jin Sung Kim","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2311.01908v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09917v1","updated":"2024-04-15T16:43:24Z","published":"2024-04-15T16:43:24Z","title":"Evaluating the Explainability of Attributes and Prototypes for a Medical\n Classification Model","summary":" Due to the sensitive nature of medicine, it is particularly important and\nhighly demanded that AI methods are explainable. This need has been recognised\nand there is great research interest in xAI solutions with medical\napplications. However, there is a lack of user-centred evaluation regarding the\nactual impact of the explanations. We evaluate attribute- and prototype-based\nexplanations with the Proto-Caps model. This xAI model reasons the target\nclassification with human-defined visual features of the target object in the\nform of scores and attribute-specific prototypes. The model thus provides a\nmultimodal explanation that is intuitively understandable to humans thanks to\npredefined attributes. A user study involving six radiologists shows that the\nexplanations are subjectively perceived as helpful, as they reflect their\ndecision-making process. The results of the model are considered a second\nopinion that radiologists can discuss using the model's explanations. However,\nit was shown that the inclusion and increased magnitude of model explanations\ncan objectively increase confidence in the model's predictions when the model\nis incorrect. 
We can conclude that attribute scores and visual prototypes\nenhance confidence in the model. However, additional development and repeated\nuser studies are needed to tailor the explanation to the respective use case.\n","authors":["Luisa Gallée","Catharina Silvia Lisson","Christoph Gerhard Lisson","Daniela Drees","Felix Weig","Daniel Vogele","Meinrad Beer","Michael Götz"],"pdf_url":"https://arxiv.org/pdf/2404.09917v1.pdf","comment":"Accepted at The 2nd World Conference on eXplainable Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2403.01505v2","updated":"2024-04-15T16:42:50Z","published":"2024-03-03T13:08:32Z","title":"SCott: Accelerating Diffusion Models with Stochastic Consistency\n Distillation","summary":" The iterative sampling procedure employed by diffusion models (DMs) often\nleads to significant inference latency. To address this, we propose Stochastic\nConsistency Distillation (SCott) to enable accelerated text-to-image\ngeneration, where high-quality generations can be achieved with just 1-2\nsampling steps, and further improvements can be obtained by adding additional\nsteps. In contrast to vanilla consistency distillation (CD) which distills the\nordinary differential equation solvers-based sampling process of a pretrained\nteacher model into a student, SCott explores the possibility and validates the\nefficacy of integrating stochastic differential equation (SDE) solvers into CD\nto fully unleash the potential of the teacher. SCott is augmented with\nelaborate strategies to control the noise strength and sampling process of the\nSDE solver. An adversarial loss is further incorporated to strengthen the\nsample quality with rare sampling steps. Empirically, on the MSCOCO-2017 5K\ndataset with a Stable Diffusion-V1.5 teacher, SCott achieves an FID (Frechet\nInceptio Distance) of 22.1, surpassing that (23.4) of the 1-step InstaFlow (Liu\net al., 2023) and matching that of 4-step UFOGen (Xue et al., 2023b). Moreover,\nSCott can yield more diverse samples than other consistency models for\nhigh-resolution image generation (Luo et al., 2023a), with up to 16%\nimprovement in a qualified metric. The code and checkpoints are coming soon.\n","authors":["Hongjian Liu","Qingsong Xie","Zhijie Deng","Chen Chen","Shixiang Tang","Fueyang Fu","Zheng-jun Zha","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2403.01505v2.pdf","comment":"22 pages, 16 figures"},{"id":"http://arxiv.org/abs/2401.17542v2","updated":"2024-04-15T16:33:38Z","published":"2024-01-31T02:09:21Z","title":"A Medical Data-Effective Learning Benchmark for Highly Efficient\n Pre-training of Foundation Models","summary":" Foundation models, pre-trained on massive datasets, have achieved\nunprecedented generalizability. However, is it truly necessary to involve such\nvast amounts of data in pre-training, consuming extensive computational\nresources? This paper introduces data-effective learning, aiming to use data in\nthe most impactful way to pre-train foundation models. This involves strategies\nthat focus on data quality rather than quantity, ensuring the data used for\ntraining has high informational value. Data-effective learning plays a profound\nrole in accelerating foundation model training, reducing computational costs,\nand saving data storage, which is very important as the volume of medical data\nin recent years has grown beyond many people's expectations. However, due to\nthe lack of standards and comprehensive benchmarks, research on medical\ndata-effective learning is poorly studied. 
To address this gap, our paper\nintroduces a comprehensive benchmark specifically for evaluating data-effective\nlearning in the medical field. This benchmark includes a dataset with millions\nof data samples from 31 medical centers (DataDEL), a baseline method for\ncomparison (MedDEL), and a new evaluation metric (NormDEL) to objectively\nmeasure data-effective learning performance. Our extensive experimental results\nshow the baseline MedDEL can achieve performance comparable to the original\nlarge dataset with only 5% of the data. Establishing such an open\ndata-effective learning benchmark is crucial for the medical foundation model\nresearch community because it facilitates efficient data use, promotes\ncollaborative breakthroughs, and fosters the development of cost-effective,\nscalable, and impactful healthcare solutions.\n","authors":["Wenxuan Yang","Weimin Tan","Yuqi Sun","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2401.17542v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04166v4","updated":"2024-04-15T16:31:05Z","published":"2023-06-07T05:36:45Z","title":"BAA-NGP: Bundle-Adjusting Accelerated Neural Graphics Primitives","summary":" Implicit neural representations have become pivotal in robotic perception,\nenabling robots to comprehend 3D environments from 2D images. Given a set of\ncamera poses and associated images, the models can be trained to synthesize\nnovel, unseen views. To successfully navigate and interact in dynamic settings,\nrobots require the understanding of their spatial surroundings driven by\nunassisted reconstruction of 3D scenes and camera poses from real-time video\nfootage. Existing approaches like COLMAP and bundle-adjusting neural radiance\nfield methods take hours to days to process due to the high computational\ndemands of feature matching, dense point sampling, and training of a\nmulti-layer perceptron structure with a large number of parameters. To address\nthese challenges, we propose a framework called bundle-adjusting accelerated\nneural graphics primitives (BAA-NGP) which leverages accelerated sampling and\nhash encoding to expedite automatic pose refinement/estimation and 3D scene\nreconstruction. Experimental results demonstrate 10 to 20 x speed improvement\ncompared to other bundle-adjusting neural radiance field methods without\nsacrificing the quality of pose estimation. The github repository can be found\nhere https://github.com/IntelLabs/baa-ngp.\n","authors":["Sainan Liu","Shan Lin","Jingpei Lu","Alexey Supikov","Michael Yip"],"pdf_url":"https://arxiv.org/pdf/2306.04166v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09886v1","updated":"2024-04-15T15:54:30Z","published":"2024-04-15T15:54:30Z","title":"ReffAKD: Resource-efficient Autoencoder-based Knowledge Distillation","summary":" In this research, we propose an innovative method to boost Knowledge\nDistillation efficiency without the need for resource-heavy teacher models.\nKnowledge Distillation trains a smaller ``student'' model with guidance from a\nlarger ``teacher'' model, which is computationally costly. However, the main\nbenefit comes from the soft labels provided by the teacher, helping the student\ngrasp nuanced class similarities. In our work, we propose an efficient method\nfor generating these soft labels, thereby eliminating the need for a large\nteacher model. We employ a compact autoencoder to extract essential features\nand calculate similarity scores between different classes. 
Afterward, we apply\nthe softmax function to these similarity scores to obtain a soft probability\nvector. This vector serves as valuable guidance during the training of the\nstudent model. Our extensive experiments on various datasets, including\nCIFAR-100, Tiny Imagenet, and Fashion MNIST, demonstrate the superior resource\nefficiency of our approach compared to traditional knowledge distillation\nmethods that rely on large teacher models. Importantly, our approach\nconsistently achieves similar or even superior performance in terms of model\naccuracy. We also perform a comparative study with various techniques recently\ndeveloped for knowledge distillation showing our approach achieves competitive\nperformance with using significantly less resources. We also show that our\napproach can be easily added to any logit based knowledge distillation method.\nThis research contributes to making knowledge distillation more accessible and\ncost-effective for practical applications, making it a promising avenue for\nimproving the efficiency of model training. The code for this work is available\nat, https://github.com/JEKimLab/ReffAKD.\n","authors":["Divyang Doshi","Jung-Eun Kim"],"pdf_url":"https://arxiv.org/pdf/2404.09886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09884v1","updated":"2024-04-15T15:53:23Z","published":"2024-04-15T15:53:23Z","title":"Map-Relative Pose Regression for Visual Re-Localization","summary":" Pose regression networks predict the camera pose of a query image relative to\na known environment. Within this family of methods, absolute pose regression\n(APR) has recently shown promising accuracy in the range of a few centimeters\nin position error. APR networks encode the scene geometry implicitly in their\nweights. To achieve high accuracy, they require vast amounts of training data\nthat, realistically, can only be created using novel view synthesis in a\ndays-long process. This process has to be repeated for each new scene again and\nagain. We present a new approach to pose regression, map-relative pose\nregression (marepo), that satisfies the data hunger of the pose regression\nnetwork in a scene-agnostic fashion. We condition the pose regressor on a\nscene-specific map representation such that its pose predictions are relative\nto the scene map. This allows us to train the pose regressor across hundreds of\nscenes to learn the generic relation between a scene-specific map\nrepresentation and the camera pose. Our map-relative pose regressor can be\napplied to new map representations immediately or after mere minutes of\nfine-tuning for the highest accuracy. Our approach outperforms previous pose\nregression methods by far on two public datasets, indoor and outdoor. Code is\navailable: https://nianticlabs.github.io/marepo\n","authors":["Shuai Chen","Tommaso Cavallari","Victor Adrian Prisacariu","Eric Brachmann"],"pdf_url":"https://arxiv.org/pdf/2404.09884v1.pdf","comment":"IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR)\n 2024, Highlight Paper"},{"id":"http://arxiv.org/abs/2308.04466v3","updated":"2024-04-15T15:52:41Z","published":"2023-08-08T05:46:47Z","title":"Backdoor Federated Learning by Poisoning Backdoor-Critical Layers","summary":" Federated learning (FL) has been widely deployed to enable machine learning\ntraining on sensitive data across distributed devices. However, the\ndecentralized learning paradigm and heterogeneity of FL further extend the\nattack surface for backdoor attacks. 
Existing FL attack and defense\nmethodologies typically focus on the whole model. None of them recognizes the\nexistence of backdoor-critical (BC) layers, a small subset of layers that\ndominate the model vulnerabilities. Attacking the BC layers achieves effects\nequivalent to attacking the whole model but at a far smaller chance of being\ndetected by state-of-the-art (SOTA) defenses. This paper proposes a general\nin-situ approach that identifies and verifies BC layers from the perspective of\nattackers. Based on the identified BC layers, we carefully craft a new backdoor\nattack methodology that adaptively seeks a fundamental balance between\nattacking effects and stealthiness under various defense strategies. Extensive\nexperiments show that our BC layer-aware backdoor attacks can successfully\nbackdoor FL under seven SOTA defenses with only 10% malicious clients and\noutperform the latest backdoor attack methods.\n","authors":["Haomin Zhuang","Mingxian Yu","Hao Wang","Yang Hua","Jian Li","Xu Yuan"],"pdf_url":"https://arxiv.org/pdf/2308.04466v3.pdf","comment":"Accepted to ICLR'24"},{"id":"http://arxiv.org/abs/2207.14624v2","updated":"2024-04-15T15:48:43Z","published":"2022-07-29T11:50:35Z","title":"Post-processing of coronary and myocardial spatial data","summary":" Numerical simulations of real-world phenomena are implemented with at least\ntwo parts: the computational scheme and the computational domain. In the\ncontext of hemodynamics, the computational domain of a simulation represents\nthe blood vessel network through which blood flows. Such blood vessel networks\ncan contain millions of individual vessels that are joined together in series\nand in parallel to form the network. It is computationally unfeasible to\nexplicitly simulate blood flow in all blood vessels. Here, from imaged data of\na single porcine left coronary arterial tree, we develop a data pipeline to\nobtain computational domains for hemodynamic simulations from a graph\nrepresenting the coronary vascular tree. Further, we develop a method to\nascertain which subregions of the left ventricle are most likely to be perfused\nvia a given artery, using a comparison with the American Heart Association\ndivision of the left ventricle as a sense check.\n","authors":["Jay Aodh Mackenzie","Megan Jeanne Miller","Nicholas Hill","Mette Olufsen"],"pdf_url":"https://arxiv.org/pdf/2207.14624v2.pdf","comment":"21 pages, 22 figures"},{"id":"http://arxiv.org/abs/2404.09872v1","updated":"2024-04-15T15:43:52Z","published":"2024-04-15T15:43:52Z","title":"Conditional Prototype Rectification Prompt Learning","summary":" Pre-trained large-scale vision-language models (VLMs) have acquired a profound\nunderstanding of general visual concepts. Recent advancements in efficient\ntransfer learning (ETL) have shown remarkable success in fine-tuning VLMs\nwithin the scenario of limited data, introducing only a few parameters to\nharness task-specific insights from VLMs. Despite significant progress, current\nleading ETL methods tend to overfit the narrow distributions of base classes\nseen during training and encounter two primary challenges: (i) only utilizing\nuni-modal information to model task-specific knowledge; and (ii) using\ncostly and time-consuming methods to supplement knowledge. To address these\nissues, we propose a Conditional Prototype Rectification Prompt Learning (CPR)\nmethod to correct the bias of base examples and augment limited data in an\neffective way. 
Specifically, we alleviate overfitting on base classes from two\naspects. First, each input image acquires knowledge from both textual and\nvisual prototypes, and then generates sample-conditional text tokens. Second,\nwe extract utilizable knowledge from unlabeled data to further refine the\nprototypes. These two strategies mitigate biases stemming from base classes,\nyielding a more effective classifier. Extensive experiments on 11 benchmark\ndatasets show that our CPR achieves state-of-the-art performance on both\nfew-shot classification and base-to-new generalization tasks. Our code is\navaliable at \\url{https://github.com/chenhaoxing/CPR}.\n","authors":["Haoxing Chen","Yaohui Li","Zizheng Huang","Yan Hong","Zhuoer Xu","Zhangxuan Gu","Jun Lan","Huijia Zhu","Weiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.09872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09870v1","updated":"2024-04-15T15:36:38Z","published":"2024-04-15T15:36:38Z","title":"Table tennis ball spin estimation with an event camera","summary":" Spin plays a pivotal role in ball-based sports. Estimating spin becomes a key\nskill due to its impact on the ball's trajectory and bouncing behavior. Spin\ncannot be observed directly, making it inherently challenging to estimate. In\ntable tennis, the combination of high velocity and spin renders traditional low\nframe rate cameras inadequate for quickly and accurately observing the ball's\nlogo to estimate the spin due to the motion blur. Event cameras do not suffer\nas much from motion blur, thanks to their high temporal resolution. Moreover,\nthe sparse nature of the event stream solves communication bandwidth\nlimitations many frame cameras face. To the best of our knowledge, we present\nthe first method for table tennis spin estimation using an event camera. We use\nordinal time surfaces to track the ball and then isolate the events generated\nby the logo on the ball. Optical flow is then estimated from the extracted\nevents to infer the ball's spin. We achieved a spin magnitude mean error of\n$10.7 \\pm 17.3$ rps and a spin axis mean error of $32.9 \\pm 38.2\\deg$ in real\ntime for a flying ball.\n","authors":["Thomas Gossard","Julian Krismer","Andreas Ziegler","Jonas Tebbe","Andreas Zell"],"pdf_url":"https://arxiv.org/pdf/2404.09870v1.pdf","comment":"Accepted to CVsport (CVPRW 2024)"},{"id":"http://arxiv.org/abs/2403.14534v2","updated":"2024-04-15T15:30:31Z","published":"2024-03-21T16:36:40Z","title":"Transfer Learning for Cross-dataset Isolated Sign Language Recognition\n in Under-Resourced Datasets","summary":" Sign language recognition (SLR) has recently achieved a breakthrough in\nperformance thanks to deep neural networks trained on large annotated sign\ndatasets. Of the many different sign languages, these annotated datasets are\nonly available for a select few. Since acquiring gloss-level labels on sign\nlanguage videos is difficult, learning by transferring knowledge from existing\nannotated sources is useful for recognition in under-resourced sign languages.\nThis study provides a publicly available cross-dataset transfer learning\nbenchmark from two existing public Turkish SLR datasets. We use a temporal\ngraph convolution-based sign language recognition approach to evaluate five\nsupervised transfer learning approaches and experiment with closed-set and\npartial-set cross-dataset transfer learning. 
Experiments demonstrate that\nimprovement over finetuning based transfer learning is possible with\nspecialized supervised transfer learning methods.\n","authors":["Ahmet Alp Kindiroglu","Ozgur Kara","Ogulcan Ozdemir","Lale Akarun"],"pdf_url":"https://arxiv.org/pdf/2403.14534v2.pdf","comment":"Accepted to The 18th IEEE International Conference on Automatic Face\n and Gesture Recognition 2024, Code available in\n https://github.com/alpk/tid-supervised-transfer-learning-dataset"},{"id":"http://arxiv.org/abs/2309.11711v2","updated":"2024-04-15T15:26:29Z","published":"2023-09-21T01:31:54Z","title":"MoDA: Leveraging Motion Priors from Videos for Advancing Unsupervised\n Domain Adaptation in Semantic Segmentation","summary":" Unsupervised domain adaptation (UDA) has been a potent technique to handle\nthe lack of annotations in the target domain, particularly in semantic\nsegmentation task. This study introduces a different UDA scenarios where the\ntarget domain contains unlabeled video frames. Drawing upon recent advancements\nof self-supervised learning of the object motion from unlabeled videos with\ngeometric constraint, we design a \\textbf{Mo}tion-guided \\textbf{D}omain\n\\textbf{A}daptive semantic segmentation framework (MoDA). MoDA harnesses the\nself-supervised object motion cues to facilitate cross-domain alignment for\nsegmentation task. First, we present an object discovery module to localize and\nsegment target moving objects using object motion information. Then, we propose\na semantic mining module that takes the object masks to refine the pseudo\nlabels in the target domain. Subsequently, these high-quality pseudo labels are\nused in the self-training loop to bridge the cross-domain gap. On domain\nadaptive video and image segmentation experiments, MoDA shows the effectiveness\nutilizing object motion as guidance for domain alignment compared with optical\nflow information. Moreover, MoDA exhibits versatility as it can complement\nexisting state-of-the-art UDA approaches. Code at\nhttps://github.com/feipanir/MoDA.\n","authors":["Fei Pan","Xu Yin","Seokju Lee","Axi Niu","Sungeui Yoon","In So Kweon"],"pdf_url":"https://arxiv.org/pdf/2309.11711v2.pdf","comment":"CVPR 2024 Workshop on Learning with Limited Labelled Data for Image\n and Video Understanding. Best Paper Award"},{"id":"http://arxiv.org/abs/2404.09857v1","updated":"2024-04-15T15:12:53Z","published":"2024-04-15T15:12:53Z","title":"Empowering Embodied Visual Tracking with Visual Foundation Models and\n Offline RL","summary":" Embodied visual tracking is to follow a target object in dynamic 3D\nenvironments using an agent's egocentric vision. This is a vital and\nchallenging skill for embodied agents. However, existing methods suffer from\ninefficient training and poor generalization. In this paper, we propose a novel\nframework that combines visual foundation models (VFM) and offline\nreinforcement learning (offline RL) to empower embodied visual tracking. We use\na pre-trained VFM, such as ``Tracking Anything\", to extract semantic\nsegmentation masks with text prompts. We then train a recurrent policy network\nwith offline RL, e.g., Conservative Q-Learning, to learn from the collected\ndemonstrations without online agent-environment interactions. To further\nimprove the robustness and generalization of the policy network, we also\nintroduce a mask re-targeting mechanism and a multi-level data collection\nstrategy. In this way, we can train a robust tracker within an hour on a\nconsumer-level GPU, e.g., Nvidia RTX 3090. 
Such efficiency is unprecedented for\nRL-based visual tracking methods. We evaluate our tracker on several\nhigh-fidelity environments with challenging situations, such as distraction and\nocclusion. The results show that our agent outperforms state-of-the-art methods\nin terms of sample efficiency, robustness to distractors, and generalization to\nunseen scenarios and targets. We also demonstrate the transferability of the\nlearned tracker from the virtual world to real-world scenarios.\n","authors":["Fangwei Zhong","Kui Wu","Hai Ci","Churan Wang","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.09857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01964v3","updated":"2024-04-15T15:00:49Z","published":"2023-12-04T15:23:49Z","title":"Semantics-aware Motion Retargeting with Vision-Language Models","summary":" Capturing and preserving motion semantics is essential to motion retargeting\nbetween animation characters. However, most of the previous works neglect the\nsemantic information or rely on human-designed joint-level representations.\nHere, we present a novel Semantics-aware Motion reTargeting (SMT) method with\nthe advantage of vision-language models to extract and maintain meaningful\nmotion semantics. We utilize a differentiable module to render 3D motions. Then\nthe high-level motion semantics are incorporated into the motion retargeting\nprocess by feeding the vision-language model with the rendered images and\naligning the extracted semantic embeddings. To ensure the preservation of\nfine-grained motion details and high-level semantics, we adopt a two-stage\npipeline consisting of skeleton-aware pre-training and fine-tuning with\nsemantics and geometry constraints. Experimental results show the effectiveness\nof the proposed method in producing high-quality motion retargeting results\nwhile accurately preserving motion semantics.\n","authors":["Haodong Zhang","ZhiKe Chen","Haocheng Xu","Lei Hao","Xiaofei Wu","Songcen Xu","Zhensong Zhang","Yue Wang","Rong Xiong"],"pdf_url":"https://arxiv.org/pdf/2312.01964v3.pdf","comment":"Accepted in CVPR2024"},{"id":"http://arxiv.org/abs/2404.09846v1","updated":"2024-04-15T14:55:43Z","published":"2024-04-15T14:55:43Z","title":"A Diffusion-based Data Generator for Training Object Recognition Models\n in Ultra-Range Distance","summary":" Object recognition, commonly performed by a camera, is a fundamental\nrequirement for robots to complete complex tasks. Some tasks require\nrecognizing objects far from the robot's camera. A challenging example is\nUltra-Range Gesture Recognition (URGR) in human-robot interaction where the\nuser exhibits directive gestures at a distance of up to 25~m from the robot.\nHowever, training a model to recognize hardly visible objects located in\nultra-range requires an exhaustive collection of a significant amount of\nlabeled samples. The generation of synthetic training datasets is a recent\nsolution to the lack of real-world data, while unable to properly replicate the\nrealistic visual characteristics of distant objects in images. In this letter,\nwe propose the Diffusion in Ultra-Range (DUR) framework based on a Diffusion\nmodel to generate labeled images of distant objects in various scenes. The DUR\ngenerator receives a desired distance and class (e.g., gesture) and outputs a\ncorresponding synthetic image. We apply DUR to train a URGR model with\ndirective gestures in which fine details of the gesturing hand are challenging\nto distinguish. 
DUR is compared to other types of generative models showcasing\nsuperiority both in fidelity and in recognition success rate when training a\nURGR model. More importantly, training a DUR model on a limited amount of real\ndata and then using it to generate synthetic data for training a URGR model\noutperforms directly training the URGR model on real data. The synthetic-based\nURGR model is also demonstrated in gesture-based direction of a ground robot.\n","authors":["Eran Bamani","Eden Nissinman","Lisa Koenigsberg","Inbar Meir","Avishai Sintov"],"pdf_url":"https://arxiv.org/pdf/2404.09846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09842v1","updated":"2024-04-15T14:52:02Z","published":"2024-04-15T14:52:02Z","title":"STMixer: A One-Stage Sparse Action Detector","summary":" Traditional video action detectors typically adopt the two-stage pipeline,\nwhere a person detector is first employed to generate actor boxes and then 3D\nRoIAlign is used to extract actor-specific features for classification. This\ndetection paradigm requires multi-stage training and inference, and the feature\nsampling is constrained inside the box, failing to effectively leverage richer\ncontext information outside. Recently, a few query-based action detectors have\nbeen proposed to predict action instances in an end-to-end manner. However,\nthey still lack adaptability in feature sampling and decoding, thus suffering\nfrom the issues of inferior performance or slower convergence. In this paper,\nwe propose two core designs for a more flexible one-stage sparse action\ndetector. First, we present a query-based adaptive feature sampling module,\nwhich endows the detector with the flexibility of mining a group of\ndiscriminative features from the entire spatio-temporal domain. Second, we\ndevise a decoupled feature mixing module, which dynamically attends to and\nmixes video features along the spatial and temporal dimensions respectively for\nbetter feature decoding. Based on these designs, we instantiate two detection\npipelines, that is, STMixer-K for keyframe action detection and STMixer-T for\naction tubelet detection. Without bells and whistles, our STMixer detectors\nobtain state-of-the-art results on five challenging spatio-temporal action\ndetection benchmarks for keyframe action detection or action tube detection.\n","authors":["Tao Wu","Mengqi Cao","Ziteng Gao","Gangshan Wu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.09842v1.pdf","comment":"Extended version of the paper arXiv:2303.15879 presented at CVPR\n 2023. Accepted by TPAMI 2024"},{"id":"http://arxiv.org/abs/2403.12075v2","updated":"2024-04-15T14:41:09Z","published":"2024-02-14T22:21:12Z","title":"Adversarial Nibbler: An Open Red-Teaming Method for Identifying Diverse\n Harms in Text-to-Image Generation","summary":" With the rise of text-to-image (T2I) generative AI models reaching wide\naudiences, it is critical to evaluate model robustness against non-obvious\nattacks to mitigate the generation of offensive images. By focusing on\n``implicitly adversarial'' prompts (those that trigger T2I models to generate\nunsafe images for non-obvious reasons), we isolate a set of difficult safety\nissues that human creativity is well-suited to uncover. To this end, we built\nthe Adversarial Nibbler Challenge, a red-teaming methodology for crowdsourcing\na diverse set of implicitly adversarial prompts. 
We have assembled a suite of\nstate-of-the-art T2I models, employed a simple user interface to identify and\nannotate harms, and engaged diverse populations to capture long-tail safety\nissues that may be overlooked in standard testing. The challenge is run in\nconsecutive rounds to enable a sustained discovery and analysis of safety\npitfalls in T2I models.\n In this paper, we present an in-depth account of our methodology, a\nsystematic study of novel attack strategies and discussion of safety failures\nrevealed by challenge participants. We also release a companion visualization\ntool for easy exploration and derivation of insights from the dataset. The\nfirst challenge round resulted in over 10k prompt-image pairs with machine\nannotations for safety. A subset of 1.5k samples contains rich human\nannotations of harm types and attack styles. We find that 14% of images that\nhumans consider harmful are mislabeled as ``safe'' by machines. We have\nidentified new attack strategies that highlight the complexity of ensuring T2I\nmodel robustness. Our findings emphasize the necessity of continual auditing\nand adaptation as new vulnerabilities emerge. We are confident that this work\nwill enable proactive, iterative safety assessments and promote responsible\ndevelopment of T2I models.\n","authors":["Jessica Quaye","Alicia Parrish","Oana Inel","Charvi Rastogi","Hannah Rose Kirk","Minsuk Kahng","Erin van Liemt","Max Bartolo","Jess Tsang","Justin White","Nathan Clement","Rafael Mosquera","Juan Ciro","Vijay Janapa Reddi","Lora Aroyo"],"pdf_url":"https://arxiv.org/pdf/2403.12075v2.pdf","comment":"15 pages, 6 figures"},{"id":"http://arxiv.org/abs/2212.00621v2","updated":"2024-04-15T14:39:19Z","published":"2022-12-01T16:15:54Z","title":"CONDA: Continual Unsupervised Domain Adaptation Learning in Visual\n Perception for Self-Driving Cars","summary":" Although unsupervised domain adaptation methods have achieved remarkable\nperformance in semantic scene segmentation in visual perception for\nself-driving cars, these approaches remain impractical in real-world use cases.\nIn practice, the segmentation models may encounter new data that have not been\nseen yet. Also, the previous data training of segmentation models may be\ninaccessible due to privacy problems. Therefore, to address these problems, in\nthis work, we propose a Continual Unsupervised Domain Adaptation (CONDA)\napproach that allows the model to continuously learn and adapt with respect to\nthe presence of the new data. Moreover, our proposed approach is designed\nwithout the requirement of accessing previous training data. To avoid the\ncatastrophic forgetting problem and maintain the performance of the\nsegmentation models, we present a novel Bijective Maximum Likelihood loss to\nimpose the constraint of predicted segmentation distribution shifts. 
The\nexperimental results on the benchmark of continual unsupervised domain\nadaptation have shown the advanced performance of the proposed CONDA method.\n","authors":["Thanh-Dat Truong","Pierce Helton","Ahmed Moustafa","Jackson David Cothren","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2212.00621v2.pdf","comment":"Accepted to CVPRW 2024"},{"id":"http://arxiv.org/abs/2402.11874v2","updated":"2024-04-15T14:37:57Z","published":"2024-02-19T06:32:23Z","title":"Language-guided Image Reflection Separation","summary":" This paper studies the problem of language-guided reflection separation,\nwhich aims at addressing the ill-posed reflection separation problem by\nintroducing language descriptions to provide layer content. We propose a\nunified framework to solve this problem, which leverages the cross-attention\nmechanism with contrastive learning strategies to construct the correspondence\nbetween language descriptions and image layers. A gated network design and a\nrandomized training strategy are employed to tackle the recognizable layer\nambiguity. The effectiveness of the proposed method is validated by the\nsignificant performance advantage over existing reflection separation methods\non both quantitative and qualitative comparisons.\n","authors":["Haofeng Zhong","Yuchen Hong","Shuchen Weng","Jinxiu Liang","Boxin Shi"],"pdf_url":"https://arxiv.org/pdf/2402.11874v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09833v1","updated":"2024-04-15T14:32:32Z","published":"2024-04-15T14:32:32Z","title":"Video2Game: Real-time, Interactive, Realistic and Browser-Compatible\n Environment from a Single Video","summary":" Creating high-quality and interactive virtual environments, such as games and\nsimulators, often involves complex and costly manual modeling processes. In\nthis paper, we present Video2Game, a novel approach that automatically converts\nvideos of real-world scenes into realistic and interactive game environments.\nAt the heart of our system are three core components:(i) a neural radiance\nfields (NeRF) module that effectively captures the geometry and visual\nappearance of the scene; (ii) a mesh module that distills the knowledge from\nNeRF for faster rendering; and (iii) a physics module that models the\ninteractions and physical dynamics among the objects. By following the\ncarefully designed pipeline, one can construct an interactable and actionable\ndigital replica of the real world. We benchmark our system on both indoor and\nlarge-scale outdoor scenes. We show that we can not only produce\nhighly-realistic renderings in real-time, but also build interactive games on\ntop.\n","authors":["Hongchi Xia","Zhi-Hao Lin","Wei-Chiu Ma","Shenlong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.09833v1.pdf","comment":"CVPR 2024. Project page (with code): https://video2game.github.io/"},{"id":"http://arxiv.org/abs/2404.09831v1","updated":"2024-04-15T14:29:47Z","published":"2024-04-15T14:29:47Z","title":"Digging into contrastive learning for robust depth estimation with\n diffusion models","summary":" Recently, diffusion-based depth estimation methods have drawn widespread\nattention due to their elegant denoising patterns and promising performance.\nHowever, they are typically unreliable under adverse conditions prevalent in\nreal-world scenarios, such as rainy, snowy, etc. In this paper, we propose a\nnovel robust depth estimation method called D4RD, featuring a custom\ncontrastive learning mode tailored for diffusion models to mitigate performance\ndegradation in complex environments. 
Concretely, we integrate the strength of\nknowledge distillation into contrastive learning, building the `trinity'\ncontrastive scheme. This scheme utilizes the sampled noise of the forward\ndiffusion process as a natural reference, guiding the predicted noise in\ndiverse scenes toward a more stable and precise optimum. Moreover, we extend\nnoise-level trinity to encompass more generic feature and image levels,\nestablishing a multi-level contrast to distribute the burden of robust\nperception across the overall network. Before addressing complex scenarios, we\nenhance the stability of the baseline diffusion model with three\nstraightforward yet effective improvements, which facilitate convergence and\nremove depth outliers. Extensive experiments demonstrate that D4RD surpasses\nexisting state-of-the-art solutions on synthetic corruption datasets and\nreal-world weather conditions. The code for D4RD will be made available for\nfurther exploration and adoption.\n","authors":["Jiyuan Wang","Chunyu Lin","Lang Nie","Kang Liao","Shuwei Shao","Yao Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.09831v1.pdf","comment":"8 pages,6 figures"},{"id":"http://arxiv.org/abs/2404.09828v1","updated":"2024-04-15T14:26:00Z","published":"2024-04-15T14:26:00Z","title":"Interaction as Explanation: A User Interaction-based Method for\n Explaining Image Classification Models","summary":" In computer vision, explainable AI (xAI) methods seek to mitigate the\n'black-box' problem by making the decision-making process of deep learning\nmodels more interpretable and transparent. Traditional xAI methods concentrate\non visualizing input features that influence model predictions, providing\ninsights primarily suited for experts. In this work, we present an\ninteraction-based xAI method that enhances user comprehension of image\nclassification models through their interaction. Thus, we developed a web-based\nprototype allowing users to modify images via painting and erasing, thereby\nobserving changes in classification results. Our approach enables users to\ndiscern critical features influencing the model's decision-making process,\naligning their mental models with the model's logic. Experiments conducted with\nfive images demonstrate the potential of the method to reveal feature\nimportance through user interaction. Our work contributes a novel perspective\nto xAI by centering on end-user engagement and understanding, paving the way\nfor more intuitive and accessible explainability in AI systems.\n","authors":["Hyeonggeun Yun"],"pdf_url":"https://arxiv.org/pdf/2404.09828v1.pdf","comment":"5 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2404.09826v1","updated":"2024-04-15T14:23:39Z","published":"2024-04-15T14:23:39Z","title":"A Recipe for CAC: Mosaic-based Generalized Loss for Improved\n Class-Agnostic Counting","summary":" Class agnostic counting (CAC) is a vision task that can be used to count the\ntotal occurrence number of any given reference objects in the query image. The\ntask is usually formulated as a density map estimation problem through\nsimilarity computation among a few image samples of the reference object and\nthe query image. In this paper, we point out a severe issue of the existing CAC\nframework: Given a multi-class setting, models don't consider reference images\nand instead blindly match all dominant objects in the query image. Moreover,\nthe current evaluation metrics and dataset cannot be used to faithfully assess\nthe model's generalization performance and robustness. 
To this end, we discover\nthat the combination of mosaic augmentation with generalized loss is essential\nfor addressing the aforementioned issue of CAC models counting majority objects\n(i.e. dominant objects) regardless of the references. Furthermore, we\nintroduce a new evaluation protocol and metrics for resolving the problem\nbehind the existing CAC evaluation scheme and better benchmarking CAC models in\na fairer manner. Besides, extensive evaluation results demonstrate that our\nproposed recipe can consistently improve the performance of different CAC\nmodels. The code will be released upon acceptance.\n","authors":["Tsung-Han Chou","Brian Wang","Wei-Chen Chiu","Jun-Cheng Chen"],"pdf_url":"https://arxiv.org/pdf/2404.09826v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09819v1","updated":"2024-04-15T14:20:07Z","published":"2024-04-15T14:20:07Z","title":"3D Face Tracking from 2D Video through Iterative Dense UV to Image Flow","summary":" When working with 3D facial data, improving fidelity and avoiding the uncanny\nvalley effect is critically dependent on accurate 3D facial performance\ncapture. Because such methods are expensive and due to the widespread\navailability of 2D videos, recent methods have focused on how to perform\nmonocular 3D face tracking. However, these methods often fall short in\ncapturing precise facial movements due to limitations in their network\narchitecture, training, and evaluation processes. Addressing these challenges,\nwe propose a novel face tracker, FlowFace, that introduces an innovative 2D\nalignment network for dense per-vertex alignment. Unlike prior work, FlowFace\nis trained on high-quality 3D scan annotations rather than weak supervision or\nsynthetic data. Our 3D model fitting module jointly fits a 3D face model from\none or many observations, integrating existing neutral shape priors for\nenhanced identity and expression disentanglement and per-vertex deformations\nfor detailed facial feature reconstruction. Additionally, we propose a novel\nmetric and benchmark for assessing tracking accuracy. Our method exhibits\nsuperior performance on both custom and publicly available benchmarks. We\nfurther validate the effectiveness of our tracker by generating high-quality 3D\ndata from 2D videos, which leads to performance gains on downstream tasks.\n","authors":["Felix Taubner","Prashant Raina","Mathieu Tuli","Eu Wern Teh","Chul Lee","Jinmiao Huang"],"pdf_url":"https://arxiv.org/pdf/2404.09819v1.pdf","comment":"22 pages, 25 figures, to be published in CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09809v1","updated":"2024-04-15T14:07:33Z","published":"2024-04-15T14:07:33Z","title":"Neighbour-level Message Interaction Encoding for Improved Representation\n Learning on Graphs","summary":" Message passing has become the dominant framework in graph representation\nlearning. The essential idea of the message-passing framework is to update node\nembeddings based on the information aggregated from local neighbours. However,\nmost existing aggregation methods have not encoded neighbour-level message\ninteractions into the aggregated message, resulting in information loss during\nembedding generation. This lost information can accumulate and become\nmore serious as more layers are added to the graph network model. To address\nthis issue, we propose a neighbour-level message interaction information\nencoding method for improving graph representation learning. 
For messages that\nare aggregated at a node, we explicitly generate an encoding between each\nmessage and the rest messages using an encoding function. Then we aggregate\nthese learned encodings and take the sum of the aggregated encoding and the\naggregated message to update the embedding for the node. By this way,\nneighbour-level message interaction information is integrated into the\ngenerated node embeddings. The proposed encoding method is a generic method\nwhich can be integrated into message-passing graph convolutional networks.\nExtensive experiments are conducted on six popular benchmark datasets across\nfour highly-demanded tasks. The results show that integrating neighbour-level\nmessage interactions achieves improved performance of the base models,\nadvancing the state of the art results for representation learning over graphs.\n","authors":["Haimin Zhang","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2404.09809v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.09807v1","updated":"2024-04-15T14:03:31Z","published":"2024-04-15T14:03:31Z","title":"A Universal Protocol to Benchmark Camera Calibration for Sports","summary":" Camera calibration is a crucial component in the realm of sports analytics,\nas it serves as the foundation to extract 3D information out of the broadcast\nimages. Despite the significance of camera calibration research in sports\nanalytics, progress is impeded by outdated benchmarking criteria. Indeed, the\nannotation data and evaluation metrics provided by most currently available\nbenchmarks strongly favor and incite the development of sports field\nregistration methods, i.e. methods estimating homographies that map the sports\nfield plane to the image plane. However, such homography-based methods are\ndoomed to overlook the broader capabilities of camera calibration in bridging\nthe 3D world to the image. In particular, real-world non-planar sports field\nelements (such as goals, corner flags, baskets, ...) and image distortion\ncaused by broadcast camera lenses are out of the scope of sports field\nregistration methods. To overcome these limitations, we designed a new\nbenchmarking protocol, named ProCC, based on two principles: (1) the protocol\nshould be agnostic to the camera model chosen for a camera calibration method,\nand (2) the protocol should fairly evaluate camera calibration methods using\nthe reprojection of arbitrary yet accurately known 3D objects. Indirectly, we\nalso provide insights into the metric used in SoccerNet-calibration, which\nsolely relies on image annotation data of viewed 3D objects as ground truth,\nthus implementing our protocol. With experiments on the World Cup 2014, CARWC,\nand SoccerNet datasets, we show that our benchmarking protocol provides fairer\nevaluations of camera calibration methods. By defining our requirements for\nproper benchmarking, we hope to pave the way for a new stage in camera\ncalibration for sports applications with high accuracy standards.\n","authors":["Floriane Magera","Thomas Hoyoux","Olivier Barnich","Marc Van Droogenbroeck"],"pdf_url":"https://arxiv.org/pdf/2404.09807v1.pdf","comment":"12 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.09797v1","updated":"2024-04-15T13:54:35Z","published":"2024-04-15T13:54:35Z","title":"TextCoT: Zoom In for Enhanced Multimodal Text-Rich Image Understanding","summary":" The advent of Large Multimodal Models (LMMs) has sparked a surge in research\naimed at harnessing their remarkable reasoning abilities. 
However, for\nunderstanding text-rich images, challenges persist in fully leveraging the\npotential of LMMs, and existing methods struggle with effectively processing\nhigh-resolution images. In this work, we propose TextCoT, a novel\nChain-of-Thought framework for text-rich image understanding. TextCoT utilizes\nthe captioning ability of LMMs to grasp the global context of the image and the\ngrounding capability to examine local textual regions. This allows for the\nextraction of both global and local visual information, facilitating more\naccurate question-answering. Technically, TextCoT consists of three stages,\nincluding image overview, coarse localization, and fine-grained observation.\nThe image overview stage provides a comprehensive understanding of the global\nscene information, and the coarse localization stage approximates the image\narea containing the answer based on the question asked. Then, integrating the\nobtained global image descriptions, the final stage further examines specific\nregions to provide accurate answers. Our method is free of extra training,\noffering immediate plug-and-play functionality. Extensive experiments are\nconducted on a series of text-rich image question-answering benchmark datasets\nbased on several advanced LMMs, and the results demonstrate the effectiveness\nand strong generalization ability of our method. Code is available at\nhttps://github.com/bzluan/TextCoT.\n","authors":["Bozhi Luan","Hao Feng","Hong Chen","Yonghui Wang","Wengang Zhou","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2404.09797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02469v2","updated":"2024-04-15T13:51:30Z","published":"2024-03-04T20:29:51Z","title":"Vision-Language Models for Medical Report Generation and Visual Question\n Answering: A Review","summary":" Medical vision-language models (VLMs) combine computer vision (CV) and\nnatural language processing (NLP) to analyze visual and textual medical data.\nOur paper reviews recent advancements in developing VLMs specialized for\nhealthcare, focusing on models designed for medical report generation and\nvisual question answering (VQA). We provide background on NLP and CV,\nexplaining how techniques from both fields are integrated into VLMs to enable\nlearning from multimodal data. Key areas we address include the exploration of\nmedical vision-language datasets, in-depth analyses of architectures and\npre-training strategies employed in recent noteworthy medical VLMs, and\ncomprehensive discussion on evaluation metrics for assessing VLMs' performance\nin medical report generation and VQA. We also highlight current challenges and\npropose future directions, including enhancing clinical validity and addressing\npatient privacy concerns. Overall, our review summarizes recent progress in\ndeveloping VLMs to harness multimodal medical data for improved healthcare\napplications.\n","authors":["Iryna Hartsock","Ghulam Rasool"],"pdf_url":"https://arxiv.org/pdf/2403.02469v2.pdf","comment":"43 pages; paper edited and restructured"},{"id":"http://arxiv.org/abs/2402.19159v2","updated":"2024-04-15T13:51:17Z","published":"2024-02-29T13:44:14Z","title":"Trajectory Consistency Distillation: Improved Latent Consistency\n Distillation by Semi-Linear Consistency Function with Trajectory Mapping","summary":" Latent Consistency Model (LCM) extends the Consistency Model to the latent\nspace and leverages the guided consistency distillation technique to achieve\nimpressive performance in accelerating text-to-image synthesis. 
However, we\nobserved that LCM struggles to generate images with both clarity and detailed\nintricacy. Consequently, we introduce Trajectory Consistency Distillation\n(TCD), which encompasses trajectory consistency function and strategic\nstochastic sampling. The trajectory consistency function diminishes the\nparameterisation and distillation errors by broadening the scope of the\nself-consistency boundary condition with trajectory mapping and endowing the\nTCD with the ability to accurately trace the entire trajectory of the\nProbability Flow ODE in semi-linear form with an Exponential Integrator.\nAdditionally, strategic stochastic sampling provides explicit control of\nstochastic and circumvents the accumulated errors inherent in multi-step\nconsistency sampling. Experiments demonstrate that TCD not only significantly\nenhances image quality at low NFEs but also yields more detailed results\ncompared to the teacher model at high NFEs.\n","authors":["Jianbin Zheng","Minghui Hu","Zhongyi Fan","Chaoyue Wang","Changxing Ding","Dacheng Tao","Tat-Jen Cham"],"pdf_url":"https://arxiv.org/pdf/2402.19159v2.pdf","comment":"Project Page: https://mhh0318.github.io/tcd"},{"id":"http://arxiv.org/abs/2402.19404v2","updated":"2024-04-15T13:47:31Z","published":"2024-02-29T18:03:00Z","title":"EAMA : Entity-Aware Multimodal Alignment Based Approach for News Image\n Captioning","summary":" News image captioning requires model to generate an informative caption rich\nin entities, with the news image and the associated news article. Though\nMultimodal Large Language Models (MLLMs) have demonstrated remarkable\ncapabilities in addressing various vision-language tasks, our research finds\nthat current MLLMs still bear limitations in handling entity information on\nnews image captioning task. Besides, while MLLMs have the ability to process\nlong inputs, generating high-quality news image captions still requires a\ntrade-off between sufficiency and conciseness of textual input information. To\nexplore the potential of MLLMs and address problems we discovered, we propose :\nan Entity-Aware Multimodal Alignment based approach for news image captioning.\nOur approach first aligns the MLLM through Balance Training Strategy with two\nextra alignment tasks: Entity-Aware Sentence Selection task and Entity\nSelection task, together with News Image Captioning task, to enhance its\ncapability in handling multimodal entity information. The aligned MLLM will\nutilizes the additional entity-related information it explicitly extract to\nsupplement its textual input while generating news image captions. Our approach\nachieves better results than all previous models in CIDEr score on GoodNews\ndataset (72.33 -> 88.39) and NYTimes800k dataset (70.83 -> 85.61).\n","authors":["Junzhe Zhang","Huixuan Zhang","Xunjian Yin","Xiaojun Wan"],"pdf_url":"https://arxiv.org/pdf/2402.19404v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09790v1","updated":"2024-04-15T13:45:48Z","published":"2024-04-15T13:45:48Z","title":"NTIRE 2024 Challenge on Image Super-Resolution ($\\times$4): Methods and\n Results","summary":" This paper reviews the NTIRE 2024 challenge on image super-resolution\n($\\times$4), highlighting the solutions proposed and the outcomes obtained. The\nchallenge involves generating corresponding high-resolution (HR) images,\nmagnified by a factor of four, from low-resolution (LR) inputs using prior\ninformation. The LR images originate from bicubic downsampling degradation. 
The\naim of the challenge is to obtain designs/solutions with the most advanced SR\nperformance, with no constraints on computational resources (e.g., model size\nand FLOPs) or training data. The track of this challenge assesses performance\nwith the PSNR metric on the DIV2K testing dataset. The competition attracted\n199 registrants, with 20 teams submitting valid entries. This collective\nendeavour not only pushes the boundaries of performance in single-image SR but\nalso offers a comprehensive overview of current trends in this field.\n","authors":["Zheng Chen","Zongwei Wu","Eduard Zamfir","Kai Zhang","Yulun Zhang","Radu Timofte","Xiaokang Yang","Hongyuan Yu","Cheng Wan","Yuxin Hong","Zhijuan Huang","Yajun Zou","Yuan Huang","Jiamin Lin","Bingnan Han","Xianyu Guan","Yongsheng Yu","Daoan Zhang","Xuanwu Yin","Kunlong Zuo","Jinhua Hao","Kai Zhao","Kun Yuan","Ming Sun","Chao Zhou","Hongyu An","Xinfeng Zhang","Zhiyuan Song","Ziyue Dong","Qing Zhao","Xiaogang Xu","Pengxu Wei","Zhi-chao Dou","Gui-ling Wang","Chih-Chung Hsu","Chia-Ming Lee","Yi-Shiuan Chou","Cansu Korkmaz","A. Murat Tekalp","Yubin Wei","Xiaole Yan","Binren Li","Haonan Chen","Siqi Zhang","Sihan Chen","Amogh Joshi","Nikhil Akalwadi","Sampada Malagi","Palani Yashaswini","Chaitra Desai","Ramesh Ashok Tabib","Ujwala Patil","Uma Mudenagudi","Anjali Sarvaiya","Pooja Choksy","Jagrit Joshi","Shubh Kawa","Kishor Upla","Sushrut Patwardhan","Raghavendra Ramachandra","Sadat Hossain","Geongi Park","S. M. Nadim Uddin","Hao Xu","Yanhui Guo","Aman Urumbekov","Xingzhuo Yan","Wei Hao","Minghan Fu","Isaac Orais","Samuel Smith","Ying Liu","Wangwang Jia","Qisheng Xu","Kele Xu","Weijun Yuan","Zhan Li","Wenqin Kuang","Ruijin Guan","Ruting Deng","Zhao Zhang","Bo Wang","Suiyi Zhao","Yan Luo","Yanyan Wei","Asif Hussain Khan","Christian Micheloni","Niki Martinel"],"pdf_url":"https://arxiv.org/pdf/2404.09790v1.pdf","comment":"NTIRE 2024 webpage: https://cvlai.net/ntire/2024. Code:\n https://github.com/zhengchen1999/NTIRE2024_ImageSR_x4"},{"id":"http://arxiv.org/abs/2309.05418v2","updated":"2024-04-15T13:42:13Z","published":"2023-09-11T12:35:17Z","title":"FlowIBR: Leveraging Pre-Training for Efficient Neural Image-Based\n Rendering of Dynamic Scenes","summary":" We introduce FlowIBR, a novel approach for efficient monocular novel view\nsynthesis of dynamic scenes. Existing techniques already show impressive\nrendering quality but tend to focus on optimization within a single scene\nwithout leveraging prior knowledge, resulting in long optimization times per\nscene. FlowIBR circumvents this limitation by integrating a neural image-based\nrendering method, pre-trained on a large corpus of widely available static\nscenes, with a per-scene optimized scene flow field. Utilizing this flow field,\nwe bend the camera rays to counteract the scene dynamics, thereby presenting\nthe dynamic scene as if it were static to the rendering network. The proposed\nmethod reduces per-scene optimization time by an order of magnitude, achieving\ncomparable rendering quality to existing methods -- all on a single\nconsumer-grade GPU.\n","authors":["Marcel Büsching","Josef Bengtson","David Nilsson","Mårten Björkman"],"pdf_url":"https://arxiv.org/pdf/2309.05418v2.pdf","comment":"Accepted to CVPR 2024 Workshop on Efficient Deep Learning for\n Computer Vision. 
Project page: https://flowibr.github.io"},{"id":"http://arxiv.org/abs/2404.09778v1","updated":"2024-04-15T13:30:34Z","published":"2024-04-15T13:30:34Z","title":"The Devil is in the Few Shots: Iterative Visual Knowledge Completion for\n Few-shot Learning","summary":" Contrastive Language-Image Pre-training (CLIP) has shown powerful zero-shot\nlearning performance. Few-shot learning aims to further enhance the transfer\ncapability of CLIP by giving few images in each class, aka 'few shots'. Most\nexisting methods either implicitly learn from the few shots by incorporating\nlearnable prompts or adapters, or explicitly embed them in a cache model for\ninference. However, the narrow distribution of few shots often contains\nincomplete class information, leading to biased visual knowledge with high risk\nof misclassification. To tackle this problem, recent methods propose to\nsupplement visual knowledge by generative models or extra databases, which can\nbe costly and time-consuming. In this paper, we propose an Iterative Visual\nKnowledge CompLetion (KCL) method to complement visual knowledge by properly\ntaking advantages of unlabeled samples without access to any auxiliary or\nsynthetic data. Specifically, KCL first measures the similarities between\nunlabeled samples and each category. Then, the samples with top confidence to\neach category is selected and collected by a designed confidence criterion.\nFinally, the collected samples are treated as labeled ones and added to few\nshots to jointly re-estimate the remaining unlabeled ones. The above procedures\nwill be repeated for a certain number of iterations with more and more samples\nbeing collected until convergence, ensuring a progressive and robust knowledge\ncompletion process. Extensive experiments on 11 benchmark datasets demonstrate\nthe effectiveness and efficiency of KCL as a plug-and-play module under both\nfew-shot and zero-shot learning settings. Code is available at\nhttps://github.com/Mark-Sky/KCL.\n","authors":["Yaohui Li","Qifeng Zhou","Haoxing Chen","Jianbing Zhang","Xinyu Dai","Hao Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.09778v1.pdf","comment":"26 pages, submitted to ECCV 2024"},{"id":"http://arxiv.org/abs/2303.16611v2","updated":"2024-04-15T13:29:47Z","published":"2023-03-29T11:50:21Z","title":"4D Facial Expression Diffusion Model","summary":" Facial expression generation is one of the most challenging and long-sought\naspects of character animation, with many interesting applications. The\nchallenging task, traditionally having relied heavily on digital craftspersons,\nremains yet to be explored. In this paper, we introduce a generative framework\nfor generating 3D facial expression sequences (i.e. 4D faces) that can be\nconditioned on different inputs to animate an arbitrary 3D face mesh. It is\ncomposed of two tasks: (1) Learning the generative model that is trained over a\nset of 3D landmark sequences, and (2) Generating 3D mesh sequences of an input\nfacial mesh driven by the generated landmark sequences. The generative model is\nbased on a Denoising Diffusion Probabilistic Model (DDPM), which has achieved\nremarkable success in generative tasks of other domains. While it can be\ntrained unconditionally, its reverse process can still be conditioned by\nvarious condition signals. This allows us to efficiently develop several\ndownstream tasks involving various conditional generation, by using expression\nlabels, text, partial sequences, or simply a facial geometry. 
To obtain the\nfull mesh deformation, we then develop a landmark-guided encoder-decoder to\napply the geometrical deformation embedded in landmarks on a given facial mesh.\nExperiments show that our model has learned to generate realistic, quality\nexpressions solely from the dataset of relatively small size, improving over\nthe state-of-the-art methods. Videos and qualitative comparisons with other\nmethods can be found at \\url{https://github.com/ZOUKaifeng/4DFM}.\n","authors":["Kaifeng Zou","Sylvain Faisan","Boyang Yu","Sébastien Valette","Hyewon Seo"],"pdf_url":"https://arxiv.org/pdf/2303.16611v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01369v2","updated":"2024-04-15T13:29:32Z","published":"2023-09-04T05:34:19Z","title":"Exploring Limits of Diffusion-Synthetic Training with Weakly Supervised\n Semantic Segmentation","summary":" The advance of generative models for images has inspired various training\ntechniques for image recognition utilizing synthetic images. In semantic\nsegmentation, one promising approach is extracting pseudo-masks from attention\nmaps in text-to-image diffusion models, which enables\nreal-image-and-annotation-free training. However, the pioneering training\nmethod using the diffusion-synthetic images and pseudo-masks, i.e., DiffuMask\nhas limitations in terms of mask quality, scalability, and ranges of applicable\ndomains. To overcome these limitations, this work introduces three techniques\nfor diffusion-synthetic semantic segmentation training. First,\nreliability-aware robust training, originally used in weakly supervised\nlearning, helps segmentation with insufficient synthetic mask quality. %Second,\nlarge-scale pretraining of whole segmentation models, not only backbones, on\nsynthetic ImageNet-1k-class images with pixel-labels benefits downstream\nsegmentation tasks. Second, we introduce prompt augmentation, data augmentation\nto the prompt text set to scale up and diversify training images with a limited\ntext resources. Finally, LoRA-based adaptation of Stable Diffusion enables the\ntransfer to a distant domain, e.g., auto-driving images. Experiments in PASCAL\nVOC, ImageNet-S, and Cityscapes show that our method effectively closes gap\nbetween real and synthetic training in semantic segmentation.\n","authors":["Ryota Yoshihashi","Yuya Otsuka","Kenji Doi","Tomohiro Tanaka","Hirokatsu Kataoka"],"pdf_url":"https://arxiv.org/pdf/2309.01369v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09774v1","updated":"2024-04-15T13:28:13Z","published":"2024-04-15T13:28:13Z","title":"RandAlign: A Parameter-Free Method for Regularizing Graph Convolutional\n Networks","summary":" Studies continually find that message-passing graph convolutional networks\nsuffer from the over-smoothing issue. Basically, the issue of over-smoothing\nrefers to the phenomenon that the learned embeddings for all nodes can become\nvery similar to one another and therefore are uninformative after repeatedly\napplying message passing iterations. Intuitively, we can expect the generated\nembeddings become smooth asymptotically layerwisely, that is each layer of\ngraph convolution generates a smoothed version of embeddings as compared to\nthat generated by the previous layer. Based on this intuition, we propose\nRandAlign, a stochastic regularization method for graph convolutional networks.\nThe idea of RandAlign is to randomly align the learned embedding for each node\nwith that of the previous layer using randomly interpolation in each graph\nconvolution layer. 
Through alignment, the smoothness of the generated\nembeddings is explicitly reduced. To better maintain the benefit yielded by the\ngraph convolution, in the alignment step we introduce to first scale the\nembedding of the previous layer to the same norm as the generated embedding and\nthen perform random interpolation for aligning the generated embedding.\nRandAlign is a parameter-free method and can be directly applied without\nintroducing additional trainable weights or hyper-parameters. We experimentally\nevaluate RandAlign on different graph domain tasks on seven benchmark datasets.\nThe experimental results show that RandAlign is a general method that improves\nthe generalization performance of various graph convolutional network models\nand also improves the numerical stability of optimization, advancing the state\nof the art performance for graph representation learning.\n","authors":["Haimin Zhang","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2404.09774v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2307.08199v3","updated":"2024-04-15T13:25:28Z","published":"2023-07-17T02:03:17Z","title":"Unbiased Image Synthesis via Manifold Guidance in Diffusion Models","summary":" Diffusion Models are a potent class of generative models capable of producing\nhigh-quality images. However, they often inadvertently favor certain data\nattributes, undermining the diversity of generated images. This issue is\nstarkly apparent in skewed datasets like CelebA, where the initial dataset\ndisproportionately favors females over males by 57.9%, this bias amplified in\ngenerated data where female representation outstrips males by 148%. In\nresponse, we propose a plug-and-play method named Manifold Guidance Sampling,\nwhich is also the first unsupervised method to mitigate bias issue in DDPMs.\nLeveraging the inherent structure of the data manifold, this method steers the\nsampling process towards a more uniform distribution, effectively dispersing\nthe clustering of biased data. Without the need for modifying the existing\nmodel or additional training, it significantly mitigates data bias and enhances\nthe quality and unbiasedness of the generated images.\n","authors":["Xingzhe Su","Daixi Jia","Fengge Wu","Junsuo Zhao","Changwen Zheng","Wenwen Qiang"],"pdf_url":"https://arxiv.org/pdf/2307.08199v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06819v2","updated":"2024-04-15T13:24:46Z","published":"2023-04-13T21:02:32Z","title":"Modeling Dense Multimodal Interactions Between Biological Pathways and\n Histology for Survival Prediction","summary":" Integrating whole-slide images (WSIs) and bulk transcriptomics for predicting\npatient survival can improve our understanding of patient prognosis. However,\nthis multimodal task is particularly challenging due to the different nature of\nthese data: WSIs represent a very high-dimensional spatial description of a\ntumor, while bulk transcriptomics represent a global description of gene\nexpression levels within that tumor. In this context, our work aims to address\ntwo key challenges: (1) how can we tokenize transcriptomics in a semantically\nmeaningful and interpretable way?, and (2) how can we capture dense multimodal\ninteractions between these two modalities? Specifically, we propose to learn\nbiological pathway tokens from transcriptomics that can encode specific\ncellular functions. 
Together with histology patch tokens that encode the\ndifferent morphological patterns in the WSI, we argue that they form\nappropriate reasoning units for downstream interpretability analyses. We\npropose fusing both modalities using a memory-efficient multimodal Transformer\nthat can model interactions between pathway and histology patch tokens. Our\nproposed model, SURVPATH, achieves state-of-the-art performance when evaluated\nagainst both unimodal and multimodal baselines on five datasets from The Cancer\nGenome Atlas. Our interpretability framework identifies key multimodal\nprognostic factors, and, as such, can provide valuable insights into the\ninteraction between genotype and phenotype, enabling a deeper understanding of\nthe underlying biological mechanisms at play. We make our code public at:\nhttps://github.com/ajv012/SurvPath.\n","authors":["Guillaume Jaume","Anurag Vaidya","Richard Chen","Drew Williamson","Paul Liang","Faisal Mahmood"],"pdf_url":"https://arxiv.org/pdf/2304.06819v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09768v1","updated":"2024-04-15T13:13:56Z","published":"2024-04-15T13:13:56Z","title":"Contrastive Pretraining for Visual Concept Explanations of Socioeconomic\n Outcomes","summary":" Predicting socioeconomic indicators from satellite imagery with deep learning\nhas become an increasingly popular research direction. Post-hoc concept-based\nexplanations can be an important step towards broader adoption of these models\nin policy-making as they enable the interpretation of socioeconomic outcomes\nbased on visual concepts that are intuitive to humans. In this paper, we study\nthe interplay between representation learning using an additional task-specific\ncontrastive loss and post-hoc concept explainability for socioeconomic studies.\nOur results on two different geographical locations and tasks indicate that the\ntask-specific pretraining imposes a continuous ordering of the latent space\nembeddings according to the socioeconomic outcomes. This improves the model's\ninterpretability as it enables the latent space of the model to associate urban\nconcepts with continuous intervals of socioeconomic outcomes. Further, we\nillustrate how analyzing the model's conceptual sensitivity for the intervals\nof socioeconomic outcomes can shed light on new insights for urban studies.\n","authors":["Ivica Obadic","Alex Levering","Lars Pennig","Dario Oliveira","Diego Marcos","Xiaoxiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.09768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09761v1","updated":"2024-04-15T13:03:42Z","published":"2024-04-15T13:03:42Z","title":"Deep Learning-Based Segmentation of Tumors in PET/CT Volumes: Benchmark\n of Different Architectures and Training Strategies","summary":" Cancer is one of the leading causes of death globally, and early diagnosis is\ncrucial for patient survival. Deep learning algorithms have great potential for\nautomatic cancer analysis. Artificial intelligence has achieved high\nperformance in recognizing and segmenting single lesions. However, diagnosing\nmultiple lesions remains a challenge. This study examines and compares various\nneural network architectures and training strategies for automatically\nsegmentation of cancer lesions using PET/CT images from the head, neck, and\nwhole body. The authors analyzed datasets from the AutoPET and HECKTOR\nchallenges, exploring popular single-step segmentation architectures and\npresenting a two-step approach. 
The results indicate that the V-Net and nnU-Net\nmodels were the most effective for their respective datasets. The results for\nthe HECKTOR dataset ranged from 0.75 to 0.76 for the aggregated Dice\ncoefficient. Eliminating cancer-free cases from the AutoPET dataset was found\nto improve the performance of most models. In the case of AutoPET data, the\naverage segmentation efficiency after training only on images containing cancer\nlesions increased from 0.55 to 0.66 for the classic Dice coefficient and from\n0.65 to 0.73 for the aggregated Dice coefficient. The research demonstrates the\npotential of artificial intelligence in precise oncological diagnostics and may\ncontribute to the development of more targeted and effective cancer assessment\ntechniques.\n","authors":["Monika Górka","Daniel Jaworek","Marek Wodzinski"],"pdf_url":"https://arxiv.org/pdf/2404.09761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16515v2","updated":"2024-04-15T12:58:26Z","published":"2023-09-28T15:22:02Z","title":"Latent Noise Segmentation: How Neural Noise Leads to the Emergence of\n Segmentation and Grouping","summary":" Humans are able to segment images effortlessly without supervision using\nperceptual grouping. In this work, we propose a counter-intuitive computational\napproach to solving unsupervised perceptual grouping and segmentation: that\nthey arise \\textit{because} of neural noise, rather than in spite of it. We (1)\nmathematically demonstrate that under realistic assumptions, neural noise can\nbe used to separate objects from each other; (2) that adding noise in a DNN\nenables the network to segment images even though it was never trained on any\nsegmentation labels; and (3) that segmenting objects using noise results in\nsegmentation performance that aligns with the perceptual grouping phenomena\nobserved in humans, and is sample-efficient. We introduce the Good Gestalt (GG)\ndatasets -- six datasets designed to specifically test perceptual grouping, and\nshow that our DNN models reproduce many important phenomena in human\nperception, such as illusory contours, closure, continuity, proximity, and\nocclusion. Finally, we (4) show that our model improves performance on our GG\ndatasets compared to other tested unsupervised models by $24.9\\%$. Together,\nour results suggest a novel unsupervised segmentation method requiring few\nassumptions, a new explanation for the formation of perceptual grouping, and a\nnovel potential benefit of neural noise.\n","authors":["Ben Lonnqvist","Zhengqing Wu","Michael H. Herzog"],"pdf_url":"https://arxiv.org/pdf/2309.16515v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08570v2","updated":"2024-04-15T12:27:13Z","published":"2024-01-16T18:57:50Z","title":"RoHM: Robust Human Motion Reconstruction via Diffusion","summary":" We propose RoHM, an approach for robust 3D human motion reconstruction from\nmonocular RGB(-D) videos in the presence of noise and occlusions. Most previous\napproaches either train neural networks to directly regress motion in 3D or\nlearn data-driven motion priors and combine them with optimization at test\ntime. The former do not recover globally coherent motion and fail under\nocclusions; the latter are time-consuming, prone to local minima, and require\nmanual tuning. To overcome these shortcomings, we exploit the iterative,\ndenoising nature of diffusion models. 
RoHM is a novel diffusion-based motion\nmodel that, conditioned on noisy and occluded input data, reconstructs\ncomplete, plausible motions in consistent global coordinates. Given the\ncomplexity of the problem -- requiring one to address different tasks\n(denoising and infilling) in different solution spaces (local and global\nmotion) -- we decompose it into two sub-tasks and learn two models, one for\nglobal trajectory and one for local motion. To capture the correlations between\nthe two, we then introduce a novel conditioning module, combining it with an\niterative inference scheme. We apply RoHM to a variety of tasks -- from motion\nreconstruction and denoising to spatial and temporal infilling. Extensive\nexperiments on three popular datasets show that our method outperforms\nstate-of-the-art approaches qualitatively and quantitatively, while being\nfaster at test time. The code is available at\nhttps://sanweiliti.github.io/ROHM/ROHM.html.\n","authors":["Siwei Zhang","Bharat Lal Bhatnagar","Yuanlu Xu","Alexander Winkler","Petr Kadlecek","Siyu Tang","Federica Bogo"],"pdf_url":"https://arxiv.org/pdf/2401.08570v2.pdf","comment":"With the appendix included"},{"id":"http://arxiv.org/abs/2109.14406v2","updated":"2024-04-15T06:19:32Z","published":"2021-09-29T13:10:46Z","title":"Neural Knitworks: Patched Neural Implicit Representation Networks","summary":" Coordinate-based Multilayer Perceptron (MLP) networks, despite being capable\nof learning neural implicit representations, are not performant for internal\nimage synthesis applications. Convolutional Neural Networks (CNNs) are\ntypically used instead for a variety of internal generative tasks, at the cost\nof a larger model. We propose Neural Knitwork, an architecture for neural\nimplicit representation learning of natural images that achieves image\nsynthesis by optimizing the distribution of image patches in an adversarial\nmanner and by enforcing consistency between the patch predictions. To the best\nof our knowledge, this is the first implementation of a coordinate-based MLP\ntailored for synthesis tasks such as image inpainting, super-resolution, and\ndenoising. We demonstrate the utility of the proposed technique by training on\nthese three tasks. The results show that modeling natural images using patches,\nrather than pixels, produces results of higher fidelity. The resulting model\nrequires 80% fewer parameters than alternative CNN-based solutions while\nachieving comparable performance and training time.\n","authors":["Mikolaj Czerkawski","Javier Cardona","Robert Atkinson","Craig Michie","Ivan Andonovic","Carmine Clemente","Christos Tachtatzis"],"pdf_url":"https://arxiv.org/pdf/2109.14406v2.pdf","comment":"Published in Pattern Recognition"},{"id":"http://arxiv.org/abs/2404.10147v1","updated":"2024-04-15T21:33:45Z","published":"2024-04-15T21:33:45Z","title":"Eyes on the Streets: Leveraging Street-Level Imaging to Model Urban\n Crime Dynamics","summary":" This study addresses the challenge of urban safety in New York City by\nexamining the relationship between the built environment and crime rates using\nmachine learning and a comprehensive dataset of street view images. We aim to\nidentify how urban landscapes correlate with crime statistics, focusing on the\ncharacteristics of street views and their association with crime rates. 
The\nfindings offer insights for urban planning and crime prevention, highlighting\nthe potential of environmental design in enhancing public safety.\n","authors":["Zhixuan Qi","Huaiying Luo","Chen Chi"],"pdf_url":"https://arxiv.org/pdf/2404.10147v1.pdf","comment":null}]},"2024-04-14T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2302.04871v4","updated":"2024-04-14T23:46:05Z","published":"2023-02-09T18:59:56Z","title":"In-N-Out: Faithful 3D GAN Inversion with Volumetric Decomposition for\n Face Editing","summary":" 3D-aware GANs offer new capabilities for view synthesis while preserving the\nediting functionalities of their 2D counterparts. GAN inversion is a crucial\nstep that seeks the latent code to reconstruct input images or videos,\nsubsequently enabling diverse editing tasks through manipulation of this latent\ncode. However, a model pre-trained on a particular dataset (e.g., FFHQ) often\nhas difficulty reconstructing images with out-of-distribution (OOD) objects\nsuch as faces with heavy make-up or occluding objects. We address this issue by\nexplicitly modeling OOD objects from the input in 3D-aware GANs. Our core idea\nis to represent the image using two individual neural radiance fields: one for\nthe in-distribution content and the other for the out-of-distribution object.\nThe final reconstruction is achieved by optimizing the composition of these two\nradiance fields with carefully designed regularization. We demonstrate that our\nexplicit decomposition alleviates the inherent trade-off between reconstruction\nfidelity and editability. We evaluate reconstruction accuracy and editability\nof our method on challenging real face images and videos and showcase favorable\nresults against other baselines.\n","authors":["Yiran Xu","Zhixin Shu","Cameron Smith","Seoung Wug Oh","Jia-Bin Huang"],"pdf_url":"https://arxiv.org/pdf/2302.04871v4.pdf","comment":"Project page: https://in-n-out-3d.github.io/"},{"id":"http://arxiv.org/abs/2404.09378v1","updated":"2024-04-14T23:30:35Z","published":"2024-04-14T23:30:35Z","title":"Orientation-conditioned Facial Texture Mapping for Video-based Facial\n Remote Photoplethysmography Estimation","summary":" Camera-based remote photoplethysmography (rPPG) enables contactless\nmeasurement of important physiological signals such as pulse rate (PR).\nHowever, dynamic and unconstrained subject motion introduces significant\nvariability into the facial appearance in video, confounding the ability of\nvideo-based methods to accurately extract the rPPG signal. In this study, we\nleverage the 3D facial surface to construct a novel orientation-conditioned\nfacial texture video representation which improves the motion robustness of\nexisting video-based facial rPPG estimation methods. Our proposed method\nachieves a significant 18.2% performance improvement in cross-dataset testing\non MMPD over our baseline using the PhysNet model trained on PURE, highlighting\nthe efficacy and generalization benefits of our designed video representation.\nWe demonstrate significant performance improvements of up to 29.6% in all\ntested motion scenarios in cross-dataset testing on MMPD, even in the presence\nof dynamic and unconstrained subject motion. Emphasizing the benefits the\nbenefits of disentangling motion through modeling the 3D facial surface for\nmotion robust facial rPPG estimation. We validate the efficacy of our design\ndecisions and the impact of different video processing steps through an\nablation study. 
Our findings illustrate the potential strengths of exploiting\nthe 3D facial surface as a general strategy for addressing dynamic and\nunconstrained subject motion in videos. The code is available at\nhttps://samcantrill.github.io/orientation-uv-rppg/.\n","authors":["Sam Cantrill","David Ahmedt-Aristizabal","Lars Petersson","Hanna Suominen","Mohammad Ali Armin"],"pdf_url":"https://arxiv.org/pdf/2404.09378v1.pdf","comment":"12 pages, 8 figures, 6 tables"},{"id":"http://arxiv.org/abs/2404.09376v1","updated":"2024-04-14T23:17:01Z","published":"2024-04-14T23:17:01Z","title":"\\textit{sweet} -- An Open Source Modular Platform for Contactless Hand\n Vascular Biometric Experiments","summary":" Current finger-vein or palm-vein recognition systems usually require direct\ncontact of the subject with the apparatus. This can be problematic in\nenvironments where hygiene is of primary importance. In this work we present a\ncontactless vascular biometrics sensor platform named \\sweet which can be used\nfor hand vascular biometrics studies (wrist-, palm- and finger-vein) and\nsurface features such as palmprint. It supports several acquisition modalities\nsuch as multi-spectral Near-Infrared (NIR), RGB-color, Stereo Vision (SV) and\nPhotometric Stereo (PS). Using this platform we collect a dataset consisting of\nthe fingers, palm and wrist vascular data of 120 subjects and develop a\npowerful 3D pipeline for the pre-processing of this data. We then present\nbiometric experimental results, focusing on Finger-Vein Recognition (FVR).\nFinally, we discuss fusion of multiple modalities, such palm-vein combined with\npalm-print biometrics. The acquisition software, parts of the hardware design,\nthe new FV dataset, as well as source-code for our experiments are publicly\navailable for research purposes.\n","authors":["David Geissbühler","Sushil Bhattacharjee","Ketan Kotwal","Guillaume Clivaz","Sébastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2404.09376v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06741v2","updated":"2024-04-14T22:33:27Z","published":"2023-12-11T18:19:04Z","title":"Gaussian Splatting SLAM","summary":" We present the first application of 3D Gaussian Splatting in monocular SLAM,\nthe most fundamental but the hardest setup for Visual SLAM. Our method, which\nruns live at 3fps, utilises Gaussians as the only 3D representation, unifying\nthe required representation for accurate, efficient tracking, mapping, and\nhigh-quality rendering. Designed for challenging monocular settings, our\napproach is seamlessly extendable to RGB-D SLAM when an external depth sensor\nis available. Several innovations are required to continuously reconstruct 3D\nscenes with high fidelity from a live camera. First, to move beyond the\noriginal 3DGS algorithm, which requires accurate poses from an offline\nStructure from Motion (SfM) system, we formulate camera tracking for 3DGS using\ndirect optimisation against the 3D Gaussians, and show that this enables fast\nand robust tracking with a wide basin of convergence. Second, by utilising the\nexplicit nature of the Gaussians, we introduce geometric verification and\nregularisation to handle the ambiguities occurring in incremental 3D dense\nreconstruction. Finally, we introduce a full SLAM system which not only\nachieves state-of-the-art results in novel view synthesis and trajectory\nestimation but also reconstruction of tiny and even transparent objects.\n","authors":["Hidenobu Matsuki","Riku Murai","Paul H. J. Kelly","Andrew J. 
Davison"],"pdf_url":"https://arxiv.org/pdf/2312.06741v2.pdf","comment":"CVPR2024 Highlight. First two authors contributed equally to this\n work. Project Page: https://rmurai.co.uk/projects/GaussianSplattingSLAM/"},{"id":"http://arxiv.org/abs/2310.08580v2","updated":"2024-04-14T22:23:18Z","published":"2023-10-12T17:59:38Z","title":"OmniControl: Control Any Joint at Any Time for Human Motion Generation","summary":" We present a novel approach named OmniControl for incorporating flexible\nspatial control signals into a text-conditioned human motion generation model\nbased on the diffusion process. Unlike previous methods that can only control\nthe pelvis trajectory, OmniControl can incorporate flexible spatial control\nsignals over different joints at different times with only one model.\nSpecifically, we propose analytic spatial guidance that ensures the generated\nmotion can tightly conform to the input control signals. At the same time,\nrealism guidance is introduced to refine all the joints to generate more\ncoherent motion. Both the spatial and realism guidance are essential and they\nare highly complementary for balancing control accuracy and motion realism. By\ncombining them, OmniControl generates motions that are realistic, coherent, and\nconsistent with the spatial constraints. Experiments on HumanML3D and KIT-ML\ndatasets show that OmniControl not only achieves significant improvement over\nstate-of-the-art methods on pelvis control but also shows promising results\nwhen incorporating the constraints over other joints.\n","authors":["Yiming Xie","Varun Jampani","Lei Zhong","Deqing Sun","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2310.08580v2.pdf","comment":"ICLR 2024. Project page: https://neu-vi.github.io/omnicontrol/"},{"id":"http://arxiv.org/abs/2404.09359v1","updated":"2024-04-14T21:14:47Z","published":"2024-04-14T21:14:47Z","title":"Exploring Feedback Generation in Automated Skeletal Movement Assessment:\n A Comprehensive Overview","summary":" The application of machine-learning solutions to movement assessment from\nskeleton videos has attracted significant research attention in recent years.\nThis advancement has made rehabilitation at home more accessible, utilizing\nmovement assessment algorithms that can operate on affordable equipment for\nhuman pose detection from 2D or 3D videos. While the primary objective of\nautomatic assessment tasks is to score movements, the automatic generation of\nfeedback highlighting key movement issues has the potential to significantly\nenhance and accelerate the rehabilitation process. In this study, we explain\nthe types of feedback that can be generated, review existing solutions for\nautomatic feedback generation, and discuss future research directions. To our\nknowledge, this is the first comprehensive review of feedback generation in\nskeletal movement assessment.\n","authors":["Tal Hakim"],"pdf_url":"https://arxiv.org/pdf/2404.09359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19001v3","updated":"2024-04-14T21:13:01Z","published":"2024-02-29T09:52:39Z","title":"Analysis of the Two-Step Heterogeneous Transfer Learning for Laryngeal\n Blood Vessel Classification: Issue and Improvement","summary":" Accurate classification of laryngeal vascular as benign or malignant is\ncrucial for early detection of laryngeal cancer. However, organizations with\nlimited access to laryngeal vascular images face challenges due to the lack of\nlarge and homogeneous public datasets for effective learning. 
Distinguished\nfrom the most familiar works, which directly transfer the ImageNet pre-trained\nmodels to the target domain for fine-tuning, this work pioneers exploring\ntwo-step heterogeneous transfer learning (THTL) for laryngeal lesion\nclassification with nine deep-learning models, utilizing the diabetic\nretinopathy color fundus images, semantically non-identical yet vascular\nimages, as the intermediate domain. Attention visualization technique, Layer\nClass Activate Map (LayerCAM), reveals a novel finding that yet the\nintermediate and the target domain both reflect vascular structure to a certain\nextent, the prevalent radial vascular pattern in the intermediate domain\nprevents learning the features of twisted and tangled vessels that distinguish\nthe malignant class in the target domain, summarizes a vital rule for laryngeal\nlesion classification using THTL. To address this, we introduce an enhanced\nfine-tuning strategy in THTL called Step-Wise Fine-Tuning (SWFT) and apply it\nto the ResNet models. SWFT progressively refines model performance by\naccumulating fine-tuning layers from back to front, guided by the visualization\nresults of LayerCAM. Comparison with the original THTL approach shows\nsignificant improvements. For ResNet18, the accuracy and malignant recall\nincreases by 26.1% and 79.8%, respectively, while for ResNet50, these\nindicators improve by 20.4% and 62.2%, respectively.\n","authors":["Xinyi Fang","Xu Yang","Chak Fong Chong","Kei Long Wong","Yapeng Wang","Tiankui Zhang","Sio-Kei Im"],"pdf_url":"https://arxiv.org/pdf/2402.19001v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09349v1","updated":"2024-04-14T20:14:38Z","published":"2024-04-14T20:14:38Z","title":"Adversarial Robustness Limits via Scaling-Law and Human-Alignment\n Studies","summary":" This paper revisits the simple, long-studied, yet still unsolved problem of\nmaking image classifiers robust to imperceptible perturbations. Taking CIFAR10\nas an example, SOTA clean accuracy is about $100$%, but SOTA robustness to\n$\\ell_{\\infty}$-norm bounded perturbations barely exceeds $70$%. To understand\nthis gap, we analyze how model size, dataset size, and synthetic data quality\naffect robustness by developing the first scaling laws for adversarial\ntraining. Our scaling laws reveal inefficiencies in prior art and provide\nactionable feedback to advance the field. For instance, we discovered that SOTA\nmethods diverge notably from compute-optimal setups, using excess compute for\ntheir level of robustness. Leveraging a compute-efficient setup, we surpass the\nprior SOTA with $20$% ($70$%) fewer training (inference) FLOPs. We trained\nvarious compute-efficient models, with our best achieving $74$% AutoAttack\naccuracy ($+3$% gain). However, our scaling laws also predict robustness slowly\ngrows then plateaus at $90$%: dwarfing our new SOTA by scaling is impractical,\nand perfect robustness is impossible. To better understand this predicted\nlimit, we carry out a small-scale human evaluation on the AutoAttack data that\nfools our top-performing model. Concerningly, we estimate that human\nperformance also plateaus near $90$%, which we show to be attributable to\n$\\ell_{\\infty}$-constrained attacks' generation of invalid images not\nconsistent with their original labels. Having characterized limiting\nroadblocks, we outline promising paths for future research.\n","authors":["Brian R. 
Bartoldson","James Diffenderfer","Konstantinos Parasyris","Bhavya Kailkhura"],"pdf_url":"https://arxiv.org/pdf/2404.09349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09342v1","updated":"2024-04-14T19:51:32Z","published":"2024-04-14T19:51:32Z","title":"Face-voice Association in Multilingual Environments (FAME) Challenge\n 2024 Evaluation Plan","summary":" The advancements of technology have led to the use of multimodal systems in\nvarious real-world applications. Among them, the audio-visual systems are one\nof the widely used multimodal systems. In the recent years, associating face\nand voice of a person has gained attention due to presence of unique\ncorrelation between them. The Face-voice Association in Multilingual\nEnvironments (FAME) Challenge 2024 focuses on exploring face-voice association\nunder a unique condition of multilingual scenario. This condition is inspired\nfrom the fact that half of the world's population is bilingual and most often\npeople communicate under multilingual scenario. The challenge uses a dataset\nnamely, Multilingual Audio-Visual (MAV-Celeb) for exploring face-voice\nassociation in multilingual environments. This report provides the details of\nthe challenge, dataset, baselines and task details for the FAME Challenge.\n","authors":["Muhammad Saad Saeed","Shah Nawaz","Muhammad Salman Tahir","Rohan Kumar Das","Muhammad Zaigham Zaheer","Marta Moscati","Markus Schedl","Muhammad Haris Khan","Karthik Nandakumar","Muhammad Haroon Yousaf"],"pdf_url":"https://arxiv.org/pdf/2404.09342v1.pdf","comment":"ACM Multimedia Conference - Grand Challenge"},{"id":"http://arxiv.org/abs/2404.09326v1","updated":"2024-04-14T18:57:38Z","published":"2024-04-14T18:57:38Z","title":"Weight Copy and Low-Rank Adaptation for Few-Shot Distillation of Vision\n Transformers","summary":" Few-shot knowledge distillation recently emerged as a viable approach to\nharness the knowledge of large-scale pre-trained models, using limited data and\ncomputational resources. In this paper, we propose a novel few-shot feature\ndistillation approach for vision transformers. Our approach is based on two key\nsteps. Leveraging the fact that vision transformers have a consistent\ndepth-wise structure, we first copy the weights from intermittent layers of\nexisting pre-trained vision transformers (teachers) into shallower\narchitectures (students), where the intermittence factor controls the\ncomplexity of the student transformer with respect to its teacher. Next, we\nemploy an enhanced version of Low-Rank Adaptation (LoRA) to distill knowledge\ninto the student in a few-shot scenario, aiming to recover the information\nprocessing carried out by the skipped teacher layers. We present comprehensive\nexperiments with supervised and self-supervised transformers as teachers, on\nfive data sets from various domains, including natural, medical and satellite\nimages. The empirical results confirm the superiority of our approach over\ncompetitive baselines. 
Moreover, the ablation results demonstrate the\nusefulness of each component of the proposed pipeline.\n","authors":["Diana-Nicoleta Grigore","Mariana-Iuliana Georgescu","Jon Alvarez Justo","Tor Johansen","Andreea Iuliana Ionescu","Radu Tudor Ionescu"],"pdf_url":"https://arxiv.org/pdf/2404.09326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05105v2","updated":"2024-04-14T18:27:41Z","published":"2024-04-07T23:10:26Z","title":"VMambaMorph: a Multi-Modality Deformable Image Registration Framework\n based on Visual State Space Model with Cross-Scan Module","summary":" Image registration, a critical process in medical imaging, involves aligning\ndifferent sets of medical imaging data into a single unified coordinate system.\nDeep learning networks, such as the Convolutional Neural Network (CNN)-based\nVoxelMorph, Vision Transformer (ViT)-based TransMorph, and State Space Model\n(SSM)-based MambaMorph, have demonstrated effective performance in this domain.\nThe recent Visual State Space Model (VMamba), which incorporates a cross-scan\nmodule with SSM, has exhibited promising improvements in modeling global-range\ndependencies with efficient computational cost in computer vision tasks. This\npaper hereby introduces an exploration of VMamba with image registration, named\nVMambaMorph. This novel hybrid VMamba-CNN network is designed specifically for\n3D image registration. Utilizing a U-shaped network architecture, VMambaMorph\ncomputes the deformation field based on target and source volumes. The\nVMamba-based block with 2D cross-scan module is redesigned for 3D volumetric\nfeature processing. To overcome the complex motion and structure on\nmulti-modality images, we further propose a fine-tune recursive registration\nframework. We validate VMambaMorph using a public benchmark brain MR-CT\nregistration dataset, comparing its performance against current\nstate-of-the-art methods. The results indicate that VMambaMorph achieves\ncompetitive registration quality. The code for VMambaMorph with all baseline\nmethods is available on GitHub.\n","authors":["Ziyang Wang","Jian-Qing Zheng","Chao Ma","Tao Guo"],"pdf_url":"https://arxiv.org/pdf/2404.05105v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16519v2","updated":"2024-04-14T17:56:49Z","published":"2023-12-27T10:57:03Z","title":"Image Restoration by Denoising Diffusion Models with Iteratively\n Preconditioned Guidance","summary":" Training deep neural networks has become a common approach for addressing\nimage restoration problems. An alternative for training a \"task-specific\"\nnetwork for each observation model is to use pretrained deep denoisers for\nimposing only the signal's prior within iterative algorithms, without\nadditional training. Recently, a sampling-based variant of this approach has\nbecome popular with the rise of diffusion/score-based generative models. Using\ndenoisers for general purpose restoration requires guiding the iterations to\nensure agreement of the signal with the observations. In low-noise settings,\nguidance that is based on back-projection (BP) has been shown to be a promising\nstrategy (used recently also under the names \"pseudoinverse\" or\n\"range/null-space\" guidance). However, the presence of noise in the\nobservations hinders the gains from this approach. 
In this paper, we propose a\nnovel guidance technique, based on preconditioning that allows traversing from\nBP-based guidance to least squares based guidance along the restoration scheme.\nThe proposed approach is robust to noise while still having much simpler\nimplementation than alternative methods (e.g., it does not require SVD or a\nlarge number of iterations). We use it within both an optimization scheme and a\nsampling-based scheme, and demonstrate its advantages over existing methods for\nimage deblurring and super-resolution.\n","authors":["Tomer Garber","Tom Tirer"],"pdf_url":"https://arxiv.org/pdf/2312.16519v2.pdf","comment":"CVPR 2024 (camera-ready). Code can be found at:\n https://github.com/tirer-lab/DDPG"},{"id":"http://arxiv.org/abs/2312.05239v3","updated":"2024-04-14T17:39:27Z","published":"2023-12-08T18:44:09Z","title":"SwiftBrush: One-Step Text-to-Image Diffusion Model with Variational\n Score Distillation","summary":" Despite their ability to generate high-resolution and diverse images from\ntext prompts, text-to-image diffusion models often suffer from slow iterative\nsampling processes. Model distillation is one of the most effective directions\nto accelerate these models. However, previous distillation methods fail to\nretain the generation quality while requiring a significant amount of images\nfor training, either from real data or synthetically generated by the teacher\nmodel. In response to this limitation, we present a novel image-free\ndistillation scheme named $\\textbf{SwiftBrush}$. Drawing inspiration from\ntext-to-3D synthesis, in which a 3D neural radiance field that aligns with the\ninput prompt can be obtained from a 2D text-to-image diffusion prior via a\nspecialized loss without the use of any 3D data ground-truth, our approach\nre-purposes that same loss for distilling a pretrained multi-step text-to-image\nmodel to a student network that can generate high-fidelity images with just a\nsingle inference step. In spite of its simplicity, our model stands as one of\nthe first one-step text-to-image generators that can produce images of\ncomparable quality to Stable Diffusion without reliance on any training image\ndata. Remarkably, SwiftBrush achieves an FID score of $\\textbf{16.67}$ and a\nCLIP score of $\\textbf{0.29}$ on the COCO-30K benchmark, achieving competitive\nresults or even substantially surpassing existing state-of-the-art distillation\ntechniques.\n","authors":["Thuan Hoang Nguyen","Anh Tran"],"pdf_url":"https://arxiv.org/pdf/2312.05239v3.pdf","comment":"Accepted to CVPR 2024; Project Page:\n https://thuanz123.github.io/swiftbrush/"},{"id":"http://arxiv.org/abs/2404.09308v1","updated":"2024-04-14T17:33:33Z","published":"2024-04-14T17:33:33Z","title":"In My Perspective, In My Hands: Accurate Egocentric 2D Hand Pose and\n Action Recognition","summary":" Action recognition is essential for egocentric video understanding, allowing\nautomatic and continuous monitoring of Activities of Daily Living (ADLs)\nwithout user effort. Existing literature focuses on 3D hand pose input, which\nrequires computationally intensive depth estimation networks or wearing an\nuncomfortable depth sensor. In contrast, there has been insufficient research\nin understanding 2D hand pose for egocentric action recognition, despite the\navailability of user-friendly smart glasses in the market capable of capturing\na single RGB image. 
Our study aims to fill this research gap by exploring the\nfield of 2D hand pose estimation for egocentric action recognition, making two\ncontributions. Firstly, we introduce two novel approaches for 2D hand pose\nestimation, namely EffHandNet for single-hand estimation and EffHandEgoNet,\ntailored for an egocentric perspective, capturing interactions between hands\nand objects. Both methods outperform state-of-the-art models on H2O and FPHA\npublic benchmarks. Secondly, we present a robust action recognition\narchitecture from 2D hand and object poses. This method incorporates\nEffHandEgoNet, and a transformer-based action recognition method. Evaluated on\nH2O and FPHA datasets, our architecture has a faster inference time and\nachieves an accuracy of 91.32% and 94.43%, respectively, surpassing state of\nthe art, including 3D-based methods. Our work demonstrates that using 2D\nskeletal data is a robust approach for egocentric action understanding.\nExtensive evaluation and ablation studies show the impact of the hand pose\nestimation approach, and how each input affects the overall performance.\n","authors":["Wiktor Mucha","Martin Kampel"],"pdf_url":"https://arxiv.org/pdf/2404.09308v1.pdf","comment":"Accepted at: The 18th IEEE International Conference on Automatic Face\n and Gesture Recognition"},{"id":"http://arxiv.org/abs/2309.07849v3","updated":"2024-04-14T17:29:46Z","published":"2023-09-14T16:48:31Z","title":"TFNet: Exploiting Temporal Cues for Fast and Accurate LiDAR Semantic\n Segmentation","summary":" LiDAR semantic segmentation plays a crucial role in enabling autonomous\ndriving and robots to understand their surroundings accurately and robustly. A\nmultitude of methods exist within this domain, including point-based,\nrange-image-based, polar-coordinate-based, and hybrid strategies. Among these,\nrange-image-based techniques have gained widespread adoption in practical\napplications due to their efficiency. However, they face a significant\nchallenge known as the ``many-to-one'' problem caused by the range image's\nlimited horizontal and vertical angular resolution. As a result, around 20% of\nthe 3D points can be occluded. In this paper, we present TFNet, a\nrange-image-based LiDAR semantic segmentation method that utilizes temporal\ninformation to address this issue. Specifically, we incorporate a temporal\nfusion layer to extract useful information from previous scans and integrate it\nwith the current scan. We then design a max-voting-based post-processing\ntechnique to correct false predictions, particularly those caused by the\n``many-to-one'' issue. We evaluated the approach on two benchmarks and\ndemonstrated that the plug-in post-processing technique is generic and can be\napplied to various networks.\n","authors":["Rong Li","ShiJie Li","Xieyuanli Chen","Teli Ma","Juergen Gall","Junwei Liang"],"pdf_url":"https://arxiv.org/pdf/2309.07849v3.pdf","comment":"accepted by CVPR2024 Workshop on Autonomous Driving"},{"id":"http://arxiv.org/abs/2404.09301v1","updated":"2024-04-14T16:55:23Z","published":"2024-04-14T16:55:23Z","title":"A Simple Strategy for Body Estimation from Partial-View Images","summary":" Virtual try-on and product personalization have become increasingly important\nin modern online shopping, highlighting the need for accurate body measurement\nestimation. 
Although previous research has advanced in estimating 3D body\nshapes from RGB images, the task is inherently ambiguous as the observed scale\nof human subjects in the images depends on two unknown factors: capture\ndistance and body dimensions. This ambiguity is particularly pronounced in\npartial-view scenarios. To address this challenge, we propose a modular and\nsimple height normalization solution. This solution relocates the subject\nskeleton to the desired position, thereby normalizing the scale and\ndisentangling the relationship between the two variables. Our experimental\nresults demonstrate that integrating this technique into state-of-the-art human\nmesh reconstruction models significantly enhances partial body measurement\nestimation. Additionally, we illustrate the applicability of this approach to\nmulti-view settings, showcasing its versatility.\n","authors":["Yafei Mao","Xuelu Li","Brandon Smith","Jinjin Li","Raja Bala"],"pdf_url":"https://arxiv.org/pdf/2404.09301v1.pdf","comment":"Accepted to CVPRW 2024 Computer Vision for Fashion, Art, and Design"},{"id":"http://arxiv.org/abs/2404.07191v2","updated":"2024-04-14T16:54:24Z","published":"2024-04-10T17:48:37Z","title":"InstantMesh: Efficient 3D Mesh Generation from a Single Image with\n Sparse-view Large Reconstruction Models","summary":" We present InstantMesh, a feed-forward framework for instant 3D mesh\ngeneration from a single image, featuring state-of-the-art generation quality\nand significant training scalability. By synergizing the strengths of an\noff-the-shelf multiview diffusion model and a sparse-view reconstruction model\nbased on the LRM architecture, InstantMesh is able to create diverse 3D assets\nwithin 10 seconds. To enhance the training efficiency and exploit more\ngeometric supervisions, e.g, depths and normals, we integrate a differentiable\niso-surface extraction module into our framework and directly optimize on the\nmesh representation. Experimental results on public datasets demonstrate that\nInstantMesh significantly outperforms other latest image-to-3D baselines, both\nqualitatively and quantitatively. We release all the code, weights, and demo of\nInstantMesh, with the intention that it can make substantial contributions to\nthe community of 3D generative AI and empower both researchers and content\ncreators.\n","authors":["Jiale Xu","Weihao Cheng","Yiming Gao","Xintao Wang","Shenghua Gao","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2404.07191v2.pdf","comment":"Technical report. Project: https://github.com/TencentARC/InstantMesh"},{"id":"http://arxiv.org/abs/2404.09293v1","updated":"2024-04-14T16:09:33Z","published":"2024-04-14T16:09:33Z","title":"A Novel State Space Model with Local Enhancement and State Sharing for\n Image Fusion","summary":" In image fusion tasks, images from different sources possess distinct\ncharacteristics. This has driven the development of numerous methods to explore\nbetter ways of fusing them while preserving their respective characteristics.\nMamba, as a state space model, has emerged in the field of natural language\nprocessing. Recently, many studies have attempted to extend Mamba to vision\ntasks. However, due to the nature of images different from casual language\nsequences, the limited state capacity of Mamba weakens its ability to model\nimage information. Additionally, the sequence modeling ability of Mamba is only\ncapable of spatial information and cannot effectively capture the rich spectral\ninformation in images. 
Motivated by these challenges, we customize and improve\nthe vision Mamba network designed for the image fusion task. Specifically, we\npropose the local-enhanced vision Mamba block, dubbed as LEVM. The LEVM block\ncan improve local information perception of the network and simultaneously\nlearn local and global spatial information. Furthermore, we propose the state\nsharing technique to enhance spatial details and integrate spatial and spectral\ninformation. Finally, the overall network is a multi-scale structure based on\nvision Mamba, called LE-Mamba. Extensive experiments show the proposed methods\nachieve state-of-the-art results on multispectral pansharpening and\nmultispectral and hyperspectral image fusion datasets, and demonstrate the\neffectiveness of the proposed approach. Code will be made available.\n","authors":["Zihan Cao","Xiao Wu","Liang-Jian Deng","Yu Zhong"],"pdf_url":"https://arxiv.org/pdf/2404.09293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09292v1","updated":"2024-04-14T15:58:35Z","published":"2024-04-14T15:58:35Z","title":"Bridging Data Islands: Geographic Heterogeneity-Aware Federated Learning\n for Collaborative Remote Sensing Semantic Segmentation","summary":" Remote sensing semantic segmentation (RSS) is an essential task in Earth\nObservation missions. Due to data privacy concerns, high-quality remote sensing\nimages with annotations cannot be well shared among institutions, making it\ndifficult to fully utilize RSS data to train a generalized model. Federated\nLearning (FL), a privacy-preserving collaborative learning technology, is a\npotential solution. However, the current research on how to effectively apply\nFL in RSS is still scarce and requires further investigation. Remote sensing\nimages in various institutions often exhibit strong geographical heterogeneity.\nMore specifically, it is reflected in terms of class-distribution heterogeneity\nand object-appearance heterogeneity. Unfortunately, most existing FL studies\nshow inadequate focus on geographical heterogeneity, thus leading to\nperformance degradation in the global model. Considering the aforementioned\nissues, we propose a novel Geographic Heterogeneity-Aware Federated Learning\n(GeoFed) framework to address privacy-preserving RSS. Through Global Feature\nExtension and Tail Regeneration modules, class-distribution heterogeneity is\nalleviated. Additionally, we design an Essential Feature Mining strategy to\nalleviate object-appearance heterogeneity by constructing essential features.\nExtensive experiments on three datasets (i.e., FBP, CASID, Inria) show that our\nGeoFed consistently outperforms the current state-of-the-art methods. The code\nwill be available publicly.\n","authors":["Jieyi Tan","Yansheng Li","Sergey A. Bartalev","Bo Dang","Wei Chen","Yongjun Zhang","Liangqi Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.09292v1.pdf","comment":"13 pages,9 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.09290v1","updated":"2024-04-14T15:50:10Z","published":"2024-04-14T15:50:10Z","title":"RoofDiffusion: Constructing Roofs from Severely Corrupted Point Data via\n Diffusion","summary":" Accurate completion and denoising of roof height maps are crucial to\nreconstructing high-quality 3D buildings. Repairing sparse points can enhance\nlow-cost sensor use and reduce UAV flight overlap. RoofDiffusion is a new\nend-to-end self-supervised diffusion technique for robustly completing, in\nparticular difficult, roof height maps. 
RoofDiffusion leverages\nwidely-available curated footprints and can so handle up to 99\\% point sparsity\nand 80\\% roof area occlusion (regional incompleteness). A variant, No-FP\nRoofDiffusion, simultaneously predicts building footprints and heights. Both\nquantitatively outperform state-of-the-art unguided depth completion and\nrepresentative inpainting methods for Digital Elevation Models (DEM), on both a\nroof-specific benchmark and the BuildingNet dataset. Qualitative assessments\nshow the effectiveness of RoofDiffusion for datasets with real-world scans\nincluding AHN3, Dales3D, and USGS 3DEP LiDAR. Tested with the leading City3D\nalgorithm, preprocessing height maps with RoofDiffusion noticeably improves 3D\nbuilding reconstruction. RoofDiffusion is complemented by a new dataset of 13k\ncomplex roof geometries, focusing on long-tail issues in remote sensing; a\nnovel simulation of tree occlusion; and a wide variety of large-area roof\ncut-outs for data augmentation and benchmarking.\n","authors":["Kyle Shih-Huang Lo","Jörg Peters","Eric Spellman"],"pdf_url":"https://arxiv.org/pdf/2404.09290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09277v1","updated":"2024-04-14T14:58:52Z","published":"2024-04-14T14:58:52Z","title":"SyntStereo2Real: Edge-Aware GAN for Remote Sensing Image-to-Image\n Translation while Maintaining Stereo Constraint","summary":" In the field of remote sensing, the scarcity of stereo-matched and\nparticularly lack of accurate ground truth data often hinders the training of\ndeep neural networks. The use of synthetically generated images as an\nalternative, alleviates this problem but suffers from the problem of domain\ngeneralization. Unifying the capabilities of image-to-image translation and\nstereo-matching presents an effective solution to address the issue of domain\ngeneralization. Current methods involve combining two networks, an unpaired\nimage-to-image translation network and a stereo-matching network, while jointly\noptimizing them. We propose an edge-aware GAN-based network that effectively\ntackles both tasks simultaneously. We obtain edge maps of input images from the\nSobel operator and use it as an additional input to the encoder in the\ngenerator to enforce geometric consistency during translation. We additionally\ninclude a warping loss calculated from the translated images to maintain the\nstereo consistency. We demonstrate that our model produces qualitatively and\nquantitatively superior results than existing models, and its applicability\nextends to diverse domains, including autonomous driving.\n","authors":["Vasudha Venkatesan","Daniel Panangian","Mario Fuentes Reyes","Ksenia Bittner"],"pdf_url":"https://arxiv.org/pdf/2404.09277v1.pdf","comment":"Accepted to IEEE Conference on Computer Vision and Pattern\n Recognition Workshop (CVPRW) EarthVision"},{"id":"http://arxiv.org/abs/2304.02649v3","updated":"2024-04-14T14:55:55Z","published":"2023-04-03T20:19:56Z","title":"Specialty-Oriented Generalist Medical AI for Chest CT Screening","summary":" Modern medical records include a vast amount of multimodal free text clinical\ndata and imaging data from radiology, cardiology, and digital pathology. Fully\nmining such big data requires multitasking; otherwise, occult but important\naspects may be overlooked, adversely affecting clinical management and\npopulation healthcare. 
Despite remarkable successes of AI in individual tasks\nwith single-modal data, the progress in developing generalist medical AI\nremains relatively slow to combine multimodal data for multitasks because of\nthe dual challenges of data curation and model architecture. The data challenge\ninvolves querying and curating multimodal structured and unstructured text,\nalphanumeric, and especially 3D tomographic scans on an individual patient\nlevel for real-time decisions and on a scale to estimate population health\nstatistics. The model challenge demands a scalable and adaptable network\narchitecture to integrate multimodal datasets for diverse clinical tasks. Here\nwe propose the first-of-its-kind medical multimodal-multitask foundation model\n(M3FM) with application in lung cancer screening and related tasks. After we\ncurated a comprehensive multimodal multitask dataset consisting of 49 clinical\ndata types including 163,725 chest CT series and 17 medical tasks involved in\nLCS, we develop a multimodal question-answering framework as a unified training\nand inference strategy to synergize multimodal information and perform multiple\ntasks via free-text prompting. M3FM consistently outperforms the\nstate-of-the-art single-modal task-specific models, identifies multimodal data\nelements informative for clinical tasks and flexibly adapts to new tasks with a\nsmall out-of-distribution dataset. As a specialty-oriented generalist medical\nAI model, M3FM paves the way for similar breakthroughs in other areas of\nmedicine, closing the gap between specialists and the generalist.\n","authors":["Chuang Niu","Qing Lyu","Christopher D. Carothers","Parisa Kaviani","Josh Tan","Pingkun Yan","Mannudeep K. Kalra","Christopher T. Whitlow","Ge Wang"],"pdf_url":"https://arxiv.org/pdf/2304.02649v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01029v3","updated":"2024-04-14T14:53:32Z","published":"2023-04-03T14:28:29Z","title":"Domain Generalization for Crop Segmentation with Standardized Ensemble\n Knowledge Distillation","summary":" In recent years, precision agriculture has gradually oriented farming closer\nto automation processes to support all the activities related to field\nmanagement. Service robotics plays a predominant role in this evolution by\ndeploying autonomous agents that can navigate fields while performing tasks\nsuch as monitoring, spraying, and harvesting without human intervention. To\nexecute these precise actions, mobile robots need a real-time perception system\nthat understands their surroundings and identifies their targets in the wild.\nExisting methods, however, often fall short in generalizing to new crops and\nenvironmental conditions. This limit is critical for practical applications\nwhere labeled samples are rarely available. In this paper, we investigate the\nproblem of crop segmentation and propose a novel approach to enhance domain\ngeneralization using knowledge distillation. In the proposed framework, we\ntransfer knowledge from a standardized ensemble of models individually trained\non source domains to a student model that can adapt to unseen realistic\nscenarios. To support the proposed method, we present a synthetic multi-domain\ndataset for crop segmentation containing plants of variegate species and\ncovering different terrain styles, weather conditions, and light scenarios for\nmore than 70,000 samples. We demonstrate significant improvements in\nperformance over state-of-the-art methods and superior sim-to-real\ngeneralization. 
Our approach provides a promising solution for domain\ngeneralization in crop segmentation and has the potential to enhance a wide\nvariety of agriculture applications.\n","authors":["Simone Angarano","Mauro Martini","Alessandro Navone","Marcello Chiaberge"],"pdf_url":"https://arxiv.org/pdf/2304.01029v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09275v1","updated":"2024-04-14T14:51:44Z","published":"2024-04-14T14:51:44Z","title":"TrafficVLM: A Controllable Visual Language Model for Traffic Video\n Captioning","summary":" Traffic video description and analysis have received much attention recently\ndue to the growing demand for efficient and reliable urban surveillance\nsystems. Most existing methods only focus on locating traffic event segments,\nwhich severely lack descriptive details related to the behaviour and context of\nall the subjects of interest in the events. In this paper, we present\nTrafficVLM, a novel multi-modal dense video captioning model for vehicle ego\ncamera view. TrafficVLM models traffic video events at different levels of\nanalysis, both spatially and temporally, and generates long fine-grained\ndescriptions for the vehicle and pedestrian at different phases of the event.\nWe also propose a conditional component for TrafficVLM to control the\ngeneration outputs and a multi-task fine-tuning paradigm to enhance\nTrafficVLM's learning capability. Experiments show that TrafficVLM performs\nwell on both vehicle and overhead camera views. Our solution achieved\noutstanding results in Track 2 of the AI City Challenge 2024, ranking us third\nin the challenge standings. Our code is publicly available at\nhttps://github.com/quangminhdinh/TrafficVLM.\n","authors":["Quang Minh Dinh","Minh Khoi Ho","Anh Quan Dang","Hung Phong Tran"],"pdf_url":"https://arxiv.org/pdf/2404.09275v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09271v1","updated":"2024-04-14T14:26:33Z","published":"2024-04-14T14:26:33Z","title":"VRS-NeRF: Visual Relocalization with Sparse Neural Radiance Field","summary":" Visual relocalization is a key technique to autonomous driving, robotics, and\nvirtual/augmented reality. After decades of explorations, absolute pose\nregression (APR), scene coordinate regression (SCR), and hierarchical methods\n(HMs) have become the most popular frameworks. However, in spite of high\nefficiency, APRs and SCRs have limited accuracy especially in large-scale\noutdoor scenes; HMs are accurate but need to store a large number of 2D\ndescriptors for matching, resulting in poor efficiency. In this paper, we\npropose an efficient and accurate framework, called VRS-NeRF, for visual\nrelocalization with sparse neural radiance field. Precisely, we introduce an\nexplicit geometric map (EGM) for 3D map representation and an implicit learning\nmap (ILM) for sparse patches rendering. In this localization process, EGP\nprovides priors of spare 2D points and ILM utilizes these sparse points to\nrender patches with sparse NeRFs for matching. This allows us to discard a\nlarge number of 2D descriptors so as to reduce the map size. Moreover,\nrendering patches only for useful points rather than all pixels in the whole\nimage reduces the rendering time significantly. This framework inherits the\naccuracy of HMs and discards their low efficiency. 
Experiments on 7Scenes,\nCambridgeLandmarks, and Aachen datasets show that our method gives much better\naccuracy than APRs and SCRs, and close performance to HMs but is much more\nefficient.\n","authors":["Fei Xue","Ignas Budvytis","Daniel Olmeda Reino","Roberto Cipolla"],"pdf_url":"https://arxiv.org/pdf/2404.09271v1.pdf","comment":"source code https://github.com/feixue94/vrs-nerf"},{"id":"http://arxiv.org/abs/2404.09269v1","updated":"2024-04-14T14:24:13Z","published":"2024-04-14T14:24:13Z","title":"PANet: A Physics-guided Parametric Augmentation Net for Image Dehazing\n by Hazing","summary":" Image dehazing faces challenges when dealing with hazy images in real-world\nscenarios. A huge domain gap between synthetic and real-world haze images\ndegrades dehazing performance in practical settings. However, collecting\nreal-world image datasets for training dehazing models is challenging since\nboth hazy and clean pairs must be captured under the same conditions. In this\npaper, we propose a Physics-guided Parametric Augmentation Network (PANet) that\ngenerates photo-realistic hazy and clean training pairs to effectively enhance\nreal-world dehazing performance. PANet comprises a Haze-to-Parameter Mapper\n(HPM) to project hazy images into a parameter space and a Parameter-to-Haze\nMapper (PHM) to map the resampled haze parameters back to hazy images. In the\nparameter space, we can pixel-wisely resample individual haze parameter maps to\ngenerate diverse hazy images with physically-explainable haze conditions unseen\nin the training set. Our experimental results demonstrate that PANet can\naugment diverse realistic hazy images to enrich existing hazy image benchmarks\nso as to effectively boost the performances of state-of-the-art image dehazing\nmodels.\n","authors":["Chih-Ling Chang","Fu-Jen Tsai","Zi-Ling Huang","Lin Gu","Chia-Wen Lin"],"pdf_url":"https://arxiv.org/pdf/2404.09269v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07564v2","updated":"2024-04-14T14:11:58Z","published":"2024-03-12T11:51:59Z","title":"RSBuilding: Towards General Remote Sensing Image Building Extraction and\n Change Detection with Foundation Model","summary":" The intelligent interpretation of buildings plays a significant role in urban\nplanning and management, macroeconomic analysis, population dynamics, etc.\nRemote sensing image building interpretation primarily encompasses building\nextraction and change detection. However, current methodologies often treat\nthese two tasks as separate entities, thereby failing to leverage shared\nknowledge. Moreover, the complexity and diversity of remote sensing image\nscenes pose additional challenges, as most algorithms are designed to model\nindividual small datasets, thus lacking cross-scene generalization. In this\npaper, we propose a comprehensive remote sensing image building understanding\nmodel, termed RSBuilding, developed from the perspective of the foundation\nmodel. RSBuilding is designed to enhance cross-scene generalization and task\nuniversality. Specifically, we extract image features based on the prior\nknowledge of the foundation model and devise a multi-level feature sampler to\naugment scale information. 
To unify task representation and integrate image\nspatiotemporal clues, we introduce a cross-attention decoder with task prompts.\nAddressing the current shortage of datasets that incorporate annotations for\nboth tasks, we have developed a federated training strategy to facilitate\nsmooth model convergence even when supervision for some tasks is missing,\nthereby bolstering the complementarity of different tasks. Our model was\ntrained on a dataset comprising up to 245,000 images and validated on multiple\nbuilding extraction and change detection datasets. The experimental results\nsubstantiate that RSBuilding can concurrently handle two structurally distinct\ntasks and exhibits robust zero-shot generalization capabilities.\n","authors":["Mingze Wang","Lili Su","Cilin Yan","Sheng Xu","Pengcheng Yuan","Xiaolong Jiang","Baochang Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.07564v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09263v1","updated":"2024-04-14T14:06:42Z","published":"2024-04-14T14:06:42Z","title":"Task-Driven Exploration: Decoupling and Inter-Task Feedback for Joint\n Moment Retrieval and Highlight Detection","summary":" Video moment retrieval and highlight detection are two highly valuable tasks\nin video understanding, but until recently they have been jointly studied.\nAlthough existing studies have made impressive advancement recently, they\npredominantly follow the data-driven bottom-up paradigm. Such paradigm\noverlooks task-specific and inter-task effects, resulting in poor model\nperformance. In this paper, we propose a novel task-driven top-down framework\nTaskWeave for joint moment retrieval and highlight detection. The framework\nintroduces a task-decoupled unit to capture task-specific and common\nrepresentations. To investigate the interplay between the two tasks, we propose\nan inter-task feedback mechanism, which transforms the results of one task as\nguiding masks to assist the other task. Different from existing methods, we\npresent a task-dependent joint loss function to optimize the model.\nComprehensive experiments and in-depth ablation studies on QVHighlights, TVSum,\nand Charades-STA datasets corroborate the effectiveness and flexibility of the\nproposed framework. Codes are available at\nhttps://github.com/EdenGabriel/TaskWeave.\n","authors":["Jin Yang","Ping Wei","Huan Li","Ziyang Ren"],"pdf_url":"https://arxiv.org/pdf/2404.09263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09259v1","updated":"2024-04-14T13:56:30Z","published":"2024-04-14T13:56:30Z","title":"FedCCL: Federated Dual-Clustered Feature Contrast Under Domain\n Heterogeneity","summary":" Federated learning (FL) facilitates a privacy-preserving neural network\ntraining paradigm through collaboration between edge clients and a central\nserver. One significant challenge is that the distributed data is not\nindependently and identically distributed (non-IID), typically including both\nintra-domain and inter-domain heterogeneity. However, recent research is\nlimited to simply using averaged signals as a form of regularization and only\nfocusing on one aspect of these non-IID challenges. Given these limitations,\nthis paper clarifies these two non-IID challenges and attempts to introduce\ncluster representation to address them from both local and global perspectives.\nSpecifically, we propose a dual-clustered feature contrast-based FL framework\nwith dual focuses. 
First, we employ clustering on the local representations of\neach client, aiming to capture intra-class information based on these local\nclusters at a high level of granularity. Then, we facilitate cross-client\nknowledge sharing by pulling the local representation closer to clusters shared\nby clients with similar semantics while pushing them away from clusters with\ndissimilar semantics. Second, since the sizes of local clusters belonging to\nthe same class may differ for each client, we further utilize clustering on the\nglobal side and conduct averaging to create a consistent global signal for\nguiding each local training in a contrastive manner. Experimental results on\nmultiple datasets demonstrate that our proposal achieves comparable or superior\nperformance gain under intra-domain and inter-domain heterogeneity.\n","authors":["Yu Qiao","Huy Q. Le","Mengchun Zhang","Apurba Adhikary","Chaoning Zhang","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2404.09259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09254v1","updated":"2024-04-14T13:39:02Z","published":"2024-04-14T13:39:02Z","title":"TEXT2TASTE: A Versatile Egocentric Vision System for Intelligent Reading\n Assistance Using Large Language Model","summary":" The ability to read, understand and find important information from written\ntext is a critical skill in our daily lives for our independence, comfort and\nsafety. However, a significant part of our society is affected by partial\nvision impairment, which leads to discomfort and dependency in daily\nactivities. To address the limitations of this part of society, we propose an\nintelligent reading assistant based on smart glasses with embedded RGB cameras\nand a Large Language Model (LLM), whose functionality goes beyond corrective\nlenses. The video recorded from the egocentric perspective of a person wearing\nthe glasses is processed to localise text information using object detection\nand optical character recognition methods. The LLM processes the data and\nallows the user to interact with the text and responds to a given query, thus\nextending the functionality of corrective lenses with the ability to find and\nsummarize knowledge from the text. To evaluate our method, we create a\nchat-based application that allows the user to interact with the system. The\nevaluation is conducted in a real-world setting, such as reading menus in a\nrestaurant, and involves four participants. The results show robust accuracy in\ntext retrieval. The system not only provides accurate meal suggestions but also\nachieves high user satisfaction, highlighting the potential of smart glasses\nand LLMs in assisting people with special needs.\n","authors":["Wiktor Mucha","Florin Cuconasu","Naome A. Etori","Valia Kalokyri","Giovanni Trappolini"],"pdf_url":"https://arxiv.org/pdf/2404.09254v1.pdf","comment":"Accepted at ICCHP 2024"},{"id":"http://arxiv.org/abs/2312.06709v4","updated":"2024-04-14T13:35:14Z","published":"2023-12-10T17:07:29Z","title":"AM-RADIO: Agglomerative Vision Foundation Model -- Reduce All Domains\n Into One","summary":" A handful of visual foundation models (VFMs) have recently emerged as the\nbackbones for numerous downstream tasks. VFMs like CLIP, DINOv2, SAM are\ntrained with distinct objectives, exhibiting unique characteristics for various\ndownstream tasks. We find that despite their conceptual differences, these\nmodels can be effectively merged into a unified model through multi-teacher\ndistillation. 
We name this approach AM-RADIO (Agglomerative Model -- Reduce All\nDomains Into One). This integrative approach not only surpasses the performance\nof individual teacher models but also amalgamates their distinctive features,\nsuch as zero-shot vision-language comprehension, detailed pixel-level\nunderstanding, and open vocabulary segmentation capabilities. In pursuit of the\nmost hardware-efficient backbone, we evaluated numerous architectures in our\nmulti-teacher distillation pipeline using the same training recipe. This led to\nthe development of a novel architecture (E-RADIO) that exceeds the performance\nof its predecessors and is at least 7x faster than the teacher models. Our\ncomprehensive benchmarking process covers downstream tasks including ImageNet\nclassification, ADE20k semantic segmentation, COCO object detection and\nLLaVa-1.5 framework.\n Code: https://github.com/NVlabs/RADIO\n","authors":["Mike Ranzinger","Greg Heinrich","Jan Kautz","Pavlo Molchanov"],"pdf_url":"https://arxiv.org/pdf/2312.06709v4.pdf","comment":"CVPR 2024 Version 3: CVPR Camera Ready, reconfigured full paper,\n table 1 is now more comprehensive Version 2: Added more acknowledgements and\n updated table 7 with more recent results. Ensured that the link in the\n abstract to our code is working properly Version 3: Fix broken hyperlinks"},{"id":"http://arxiv.org/abs/2404.07766v2","updated":"2024-04-14T13:14:54Z","published":"2024-04-11T14:05:37Z","title":"RMAFF-PSN: A Residual Multi-Scale Attention Feature Fusion Photometric\n Stereo Network","summary":" Predicting accurate normal maps of objects from two-dimensional images in\nregions of complex structure and spatial material variations is challenging\nusing photometric stereo methods due to the influence of surface reflection\nproperties caused by variations in object geometry and surface materials. To\naddress this issue, we propose a photometric stereo network called a RMAFF-PSN\nthat uses residual multiscale attentional feature fusion to handle the\n``difficult'' regions of the object. Unlike previous approaches that only use\nstacked convolutional layers to extract deep features from the input image, our\nmethod integrates feature information from different resolution stages and\nscales of the image. This approach preserves more physical information, such as\ntexture and geometry of the object in complex regions, through shallow-deep\nstage feature extraction, double branching enhancement, and attention\noptimization. To test the network structure under real-world conditions, we\npropose a new real dataset called Simple PS data, which contains multiple\nobjects with varying structures and materials. Experimental results on a\npublicly available benchmark dataset demonstrate that our method outperforms\nmost existing calibrated photometric stereo methods for the same number of\ninput images, especially in the case of highly non-convex object structures.\nOur method also obtains good results under sparse lighting conditions.\n","authors":["Kai Luo","Yakun Ju","Lin Qi","Kaixuan Wang","Junyu Dong"],"pdf_url":"https://arxiv.org/pdf/2404.07766v2.pdf","comment":"17 pages,12 figures"},{"id":"http://arxiv.org/abs/2404.09245v1","updated":"2024-04-14T13:14:13Z","published":"2024-04-14T13:14:13Z","title":"Arena: A Patch-of-Interest ViT Inference Acceleration System for\n Edge-Assisted Video Analytics","summary":" The advent of edge computing has made real-time intelligent video analytics\nfeasible. 
Previous works, based on traditional model architecture (e.g., CNN,\nRNN, etc.), employ various strategies to filter out non-region-of-interest\ncontent to minimize bandwidth and computation consumption but show inferior\nperformance in adverse environments. Recently, visual foundation models based\non transformers have shown great performance in adverse environments due to\ntheir amazing generalization capability. However, they require a large amount\nof computation power, which limits their applications in real-time intelligent\nvideo analytics. In this paper, we find visual foundation models like Vision\nTransformer (ViT) also have a dedicated acceleration mechanism for video\nanalytics. To this end, we introduce Arena, an end-to-end edge-assisted video\ninference acceleration system based on ViT. We leverage the capability of ViT\nthat can be accelerated through token pruning by only offloading and feeding\nPatches-of-Interest (PoIs) to the downstream models. Additionally, we employ\nprobability-based patch sampling, which provides a simple but efficient\nmechanism for determining PoIs where the probable locations of objects are in\nsubsequent frames. Through extensive evaluations on public datasets, our\nfindings reveal that Arena can boost inference speeds by up to $1.58\\times$ and\n$1.82\\times$ on average while consuming only 54% and 34% of the bandwidth,\nrespectively, all with high inference accuracy.\n","authors":["Haosong Peng","Wei Feng","Hao Li","Yufeng Zhan","Qihua Zhou","Yuanqing Xia"],"pdf_url":"https://arxiv.org/pdf/2404.09245v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12091v4","updated":"2024-04-14T13:03:26Z","published":"2023-03-21T09:07:15Z","title":"Adaptive Negative Evidential Deep Learning for Open-set Semi-supervised\n Learning","summary":" Semi-supervised learning (SSL) methods assume that labeled data, unlabeled\ndata and test data are from the same distribution. Open-set semi-supervised\nlearning (Open-set SSL) considers a more practical scenario, where unlabeled\ndata and test data contain new categories (outliers) not observed in labeled\ndata (inliers). Most previous works focused on outlier detection via binary\nclassifiers, which suffer from insufficient scalability and inability to\ndistinguish different types of uncertainty. In this paper, we propose a novel\nframework, Adaptive Negative Evidential Deep Learning (ANEDL) to tackle these\nlimitations. Concretely, we first introduce evidential deep learning (EDL) as\nan outlier detector to quantify different types of uncertainty, and design\ndifferent uncertainty metrics for self-training and inference. Furthermore, we\npropose a novel adaptive negative optimization strategy, making EDL more\ntailored to the unlabeled dataset containing both inliers and outliers. As\ndemonstrated empirically, our proposed method outperforms existing\nstate-of-the-art methods across four datasets.\n","authors":["Yang Yu","Danruo Deng","Furui Liu","Yueming Jin","Qi Dou","Guangyong Chen","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2303.12091v4.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2211.08089v4","updated":"2024-04-14T13:02:59Z","published":"2022-11-15T12:15:29Z","title":"DeS3: Adaptive Attention-driven Self and Soft Shadow Removal using ViT\n Similarity","summary":" Removing soft and self shadows that lack clear boundaries from a single image\nis still challenging. Self shadows are shadows that are cast on the object\nitself. 
Most existing methods rely on binary shadow masks, without considering\nthe ambiguous boundaries of soft and self shadows. In this paper, we present\nDeS3, a method that removes hard, soft and self shadows based on adaptive\nattention and ViT similarity. Our novel ViT similarity loss utilizes features\nextracted from a pre-trained Vision Transformer. This loss helps guide the\nreverse sampling towards recovering scene structures. Our adaptive attention is\nable to differentiate shadow regions from the underlying objects, as well as\nshadow regions from the object casting the shadow. This capability enables DeS3\nto better recover the structures of objects even when they are partially\noccluded by shadows. Different from existing methods that rely on constraints\nduring the training phase, we incorporate the ViT similarity during the\nsampling stage. Our method outperforms state-of-the-art methods on the SRD,\nAISTD, LRSS, USR and UIUC datasets, removing hard, soft, and self shadows\nrobustly. Specifically, our method outperforms the SOTA method by 16\\% of the\nRMSE of the whole image on the LRSS dataset. Our data and code is available at:\n\\url{https://github.com/jinyeying/DeS3_Deshadow}\n","authors":["Yeying Jin","Wei Ye","Wenhan Yang","Yuan Yuan","Robby T. Tan"],"pdf_url":"https://arxiv.org/pdf/2211.08089v4.pdf","comment":"Accepted to AAAI2024, diffusion shadow removal,\n \\url{https://github.com/jinyeying/DeS3_Deshadow}"},{"id":"http://arxiv.org/abs/2404.05238v2","updated":"2024-04-14T12:48:55Z","published":"2024-04-08T07:09:15Z","title":"Allowing humans to interactively guide machines where to look does not\n always improve human-AI team's classification accuracy","summary":" Via thousands of papers in Explainable AI (XAI), attention maps\n\\cite{vaswani2017attention} and feature attribution maps \\cite{bansal2020sam}\nhave been established as a common means for finding how important each input\nfeature is to an AI's decisions. It is an interesting, unexplored question\nwhether allowing users to edit the feature importance at test time would\nimprove a human-AI team's accuracy on downstream tasks. In this paper, we\naddress this question by leveraging CHM-Corr, a state-of-the-art, ante-hoc\nexplainable classifier \\cite{taesiri2022visual} that first predicts patch-wise\ncorrespondences between the input and training-set images, and then base on\nthem to make classification decisions. We build CHM-Corr++, an interactive\ninterface for CHM-Corr, enabling users to edit the feature attribution map\nprovided by CHM-Corr and observe updated model decisions. Via CHM-Corr++, users\ncan gain insights into if, when, and how the model changes its outputs,\nimproving their understanding beyond static explanations. However, our user\nstudy with 18 users who performed 1,400 decisions finds no statistical\nsignificance that our interactive approach improves user accuracy on CUB-200\nbird image classification over static explanations. This challenges the\nhypothesis that interactivity can boost human-AI team\naccuracy~\\cite{sokol2020one,sun2022exploring,shen2024towards,singh2024rethinking,mindlin2024beyond,lakkaraju2022rethinking,cheng2019explaining,liu2021understanding}\nand raises needs for future research. We open-source CHM-Corr++, an interactive\ntool for editing image classifier attention (see an interactive demo\n\\href{http://137.184.82.109:7080/}{here}). % , and it lays the groundwork for\nfuture research to enable effective human-AI interaction in computer vision. 
We\nrelease code and data on\n\\href{https://github.com/anguyen8/chm-corr-interactive}{github}.\n","authors":["Giang Nguyen","Mohammad Reza Taesiri","Sunnie S. Y. Kim","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05238v2.pdf","comment":"Accepted for presentation at the XAI4CV Workshop, part of the CVPR\n 2024 proceedings"},{"id":"http://arxiv.org/abs/2308.11949v2","updated":"2024-04-14T12:43:56Z","published":"2023-08-23T06:45:11Z","title":"High-quality Image Dehazing with Diffusion Model","summary":" Image dehazing is quite challenging in dense-haze scenarios, where quite less\noriginal information remains in the hazy image. Though previous methods have\nmade marvelous progress, they still suffer from information loss in content and\ncolor in dense-haze scenarios. The recently emerged Denoising Diffusion\nProbabilistic Model (DDPM) exhibits strong generation ability, showing\npotential for solving this problem. However, DDPM fails to consider the physics\nproperty of dehazing task, limiting its information completion capacity. In\nthis work, we propose DehazeDDPM: A DDPM-based and physics-aware image dehazing\nframework that applies to complex hazy scenarios. Specifically, DehazeDDPM\nworks in two stages. The former stage physically models the dehazing task with\nthe Atmospheric Scattering Model (ASM), pulling the distribution closer to the\nclear data and endowing DehazeDDPM with fog-aware ability. The latter stage\nexploits the strong generation ability of DDPM to compensate for the\nhaze-induced huge information loss, by working in conjunction with the physical\nmodelling. Extensive experiments demonstrate that our method attains\nstate-of-the-art performance on both synthetic and real-world hazy datasets.\n","authors":["Hu Yu","Jie Huang","Kaiwen Zheng","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.11949v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09231v1","updated":"2024-04-14T12:19:16Z","published":"2024-04-14T12:19:16Z","title":"Tri-modal Confluence with Temporal Dynamics for Scene Graph Generation\n in Operating Rooms","summary":" A comprehensive understanding of surgical scenes allows for monitoring of the\nsurgical process, reducing the occurrence of accidents and enhancing efficiency\nfor medical professionals. Semantic modeling within operating rooms, as a scene\ngraph generation (SGG) task, is challenging since it involves consecutive\nrecognition of subtle surgical actions over prolonged periods. To address this\nchallenge, we propose a Tri-modal (i.e., images, point clouds, and language)\nconfluence with Temporal dynamics framework, termed TriTemp-OR. Diverging from\nprevious approaches that integrated temporal information via memory graphs, our\nmethod embraces two advantages: 1) we directly exploit bi-modal temporal\ninformation from the video streaming for hierarchical feature interaction, and\n2) the prior knowledge from Large Language Models (LLMs) is embedded to\nalleviate the class-imbalance problem in the operating theatre. Specifically,\nour model performs temporal interactions across 2D frames and 3D point clouds,\nincluding a scale-adaptive multi-view temporal interaction (ViewTemp) and a\ngeometric-temporal point aggregation (PointTemp). Furthermore, we transfer\nknowledge from the biomedical LLM, LLaVA-Med, to deepen the comprehension of\nintraoperative relations. The proposed TriTemp-OR enables the aggregation of\ntri-modal features through relation-aware unification to predict relations so\nas to generate scene graphs. 
Experimental results on the 4D-OR benchmark\ndemonstrate the superior performance of our model for long-term OR streaming.\n","authors":["Diandian Guo","Manxi Lin","Jialun Pei","He Tang","Yueming Jin","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2404.09231v1.pdf","comment":"10 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.09227v1","updated":"2024-04-14T12:13:07Z","published":"2024-04-14T12:13:07Z","title":"DreamScape: 3D Scene Creation via Gaussian Splatting joint Correlation\n Modeling","summary":" Recent progress in text-to-3D creation has been propelled by integrating the\npotent prior of Diffusion Models from text-to-image generation into the 3D\ndomain. Nevertheless, generating 3D scenes characterized by multiple instances\nand intricate arrangements remains challenging. In this study, we present\nDreamScape, a method for creating highly consistent 3D scenes solely from\ntextual descriptions, leveraging the strong 3D representation capabilities of\nGaussian Splatting and the complex arrangement abilities of large language\nmodels (LLMs). Our approach involves a 3D Gaussian Guide ($3{DG^2}$) for scene\nrepresentation, consisting of semantic primitives (objects) and their spatial\ntransformations and relationships derived directly from text prompts using\nLLMs. This compositional representation allows for local-to-global optimization\nof the entire scene. A progressive scale control is tailored during local\nobject generation, ensuring that objects of different sizes and densities adapt\nto the scene, which addresses training instability issue arising from simple\nblending in the subsequent global optimization stage. To mitigate potential\nbiases of LLM priors, we model collision relationships between objects at the\nglobal level, enhancing physical correctness and overall realism. Additionally,\nto generate pervasive objects like rain and snow distributed extensively across\nthe scene, we introduce a sparse initialization and densification strategy.\nExperiments demonstrate that DreamScape offers high usability and\ncontrollability, enabling the generation of high-fidelity 3D scenes from only\ntext prompts and achieving state-of-the-art performance compared to other\nmethods.\n","authors":["Xuening Yuan","Hongyu Yang","Yueming Zhao","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2404.09227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09226v1","updated":"2024-04-14T12:09:47Z","published":"2024-04-14T12:09:47Z","title":"Breast Cancer Image Classification Method Based on Deep Transfer\n Learning","summary":" To address the issues of limited samples, time-consuming feature design, and\nlow accuracy in detection and classification of breast cancer pathological\nimages, a breast cancer image classification model algorithm combining deep\nlearning and transfer learning is proposed. This algorithm is based on the\nDenseNet structure of deep neural networks, and constructs a network model by\nintroducing attention mechanisms, and trains the enhanced dataset using\nmulti-level transfer learning. 
Experimental results demonstrate that the\nalgorithm achieves an efficiency of over 84.0\\% in the test set, with a\nsignificantly improved classification accuracy compared to previous models,\nmaking it applicable to medical breast cancer detection tasks.\n","authors":["Weimin Wang","Min Gao","Mingxuan Xiao","Xu Yan","Yufeng Li"],"pdf_url":"https://arxiv.org/pdf/2404.09226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09792v2","updated":"2024-04-14T11:57:55Z","published":"2024-03-14T18:24:55Z","title":"Images are Achilles' Heel of Alignment: Exploiting Visual\n Vulnerabilities for Jailbreaking Multimodal Large Language Models","summary":" In this paper, we study the harmlessness alignment problem of multimodal\nlarge language models (MLLMs). We conduct a systematic empirical analysis of\nthe harmlessness performance of representative MLLMs and reveal that the image\ninput poses the alignment vulnerability of MLLMs. Inspired by this, we propose\na novel jailbreak method named HADES, which hides and amplifies the harmfulness\nof the malicious intent within the text input, using meticulously crafted\nimages. Experimental results show that HADES can effectively jailbreak existing\nMLLMs, which achieves an average Attack Success Rate (ASR) of 90.26% for\nLLaVA-1.5 and 71.60% for Gemini Pro Vision. Our code and data will be publicly\nreleased.\n","authors":["Yifan Li","Hangyu Guo","Kun Zhou","Wayne Xin Zhao","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2403.09792v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2404.09216v1","updated":"2024-04-14T11:01:44Z","published":"2024-04-14T11:01:44Z","title":"DetCLIPv3: Towards Versatile Generative Open-vocabulary Object Detection","summary":" Existing open-vocabulary object detectors typically require a predefined set\nof categories from users, significantly confining their application scenarios.\nIn this paper, we introduce DetCLIPv3, a high-performing detector that excels\nnot only at both open-vocabulary object detection, but also generating\nhierarchical labels for detected objects. DetCLIPv3 is characterized by three\ncore designs: 1. Versatile model architecture: we derive a robust open-set\ndetection framework which is further empowered with generation ability via the\nintegration of a caption head. 2. High information density data: we develop an\nauto-annotation pipeline leveraging visual large language model to refine\ncaptions for large-scale image-text pairs, providing rich, multi-granular\nobject labels to enhance the training. 3. Efficient training strategy: we\nemploy a pre-training stage with low-resolution inputs that enables the object\ncaptioner to efficiently learn a broad spectrum of visual concepts from\nextensive image-text paired data. This is followed by a fine-tuning stage that\nleverages a small number of high-resolution samples to further enhance\ndetection performance. With these effective designs, DetCLIPv3 demonstrates\nsuperior open-vocabulary detection performance, \\eg, our Swin-T backbone model\nachieves a notable 47.0 zero-shot fixed AP on the LVIS minival benchmark,\noutperforming GLIPv2, GroundingDINO, and DetCLIPv2 by 18.0/19.6/6.6 AP,\nrespectively. 
DetCLIPv3 also achieves a state-of-the-art 19.7 AP in dense\ncaptioning task on VG dataset, showcasing its strong generative capability.\n","authors":["Lewei Yao","Renjie Pi","Jianhua Han","Xiaodan Liang","Hang Xu","Wei Zhang","Zhenguo Li","Dan Xu"],"pdf_url":"https://arxiv.org/pdf/2404.09216v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2404.03425v3","updated":"2024-04-14T10:41:40Z","published":"2024-04-04T13:06:25Z","title":"ChangeMamba: Remote Sensing Change Detection with Spatio-Temporal State\n Space Model","summary":" Convolutional neural networks (CNN) and Transformers have made impressive\nprogress in the field of remote sensing change detection (CD). However, both\narchitectures have inherent shortcomings. Recently, the Mamba architecture,\nbased on state space models, has shown remarkable performance in a series of\nnatural language processing tasks, which can effectively compensate for the\nshortcomings of the above two architectures. In this paper, we explore for the\nfirst time the potential of the Mamba architecture for remote sensing CD tasks.\nWe tailor the corresponding frameworks, called MambaBCD, MambaSCD, and\nMambaBDA, for binary change detection (BCD), semantic change detection (SCD),\nand building damage assessment (BDA), respectively. All three frameworks adopt\nthe cutting-edge Visual Mamba architecture as the encoder, which allows full\nlearning of global spatial contextual information from the input images. For\nthe change decoder, which is available in all three architectures, we propose\nthree spatio-temporal relationship modeling mechanisms, which can be naturally\ncombined with the Mamba architecture and fully utilize its attribute to achieve\nspatio-temporal interaction of multi-temporal features, thereby obtaining\naccurate change information. On five benchmark datasets, our proposed\nframeworks outperform current CNN- and Transformer-based approaches without\nusing any complex training strategies or tricks, fully demonstrating the\npotential of the Mamba architecture in CD tasks. Specifically, we obtained\n83.11%, 88.39% and 94.19% F1 scores on the three BCD datasets SYSU, LEVIR-CD+,\nand WHU-CD; on the SCD dataset SECOND, we obtained 24.11% SeK; and on the BDA\ndataset xBD, we obtained 81.41% overall F1 score. Further experiments show that\nour architecture is quite robust to degraded data. The source code will be\navailable in https://github.com/ChenHongruixuan/MambaCD\n","authors":["Hongruixuan Chen","Jian Song","Chengxi Han","Junshi Xia","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2404.03425v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09210v1","updated":"2024-04-14T10:23:30Z","published":"2024-04-14T10:23:30Z","title":"FedDistill: Global Model Distillation for Local Model De-Biasing in\n Non-IID Federated Learning","summary":" Federated Learning (FL) is a novel approach that allows for collaborative\nmachine learning while preserving data privacy by leveraging models trained on\ndecentralized devices. However, FL faces challenges due to non-uniformly\ndistributed (non-iid) data across clients, which impacts model performance and\nits generalization capabilities. 
To tackle the non-iid issue, recent efforts\nhave utilized the global model as a teaching mechanism for local models.\nHowever, our pilot study shows that their effectiveness is constrained by\nimbalanced data distribution, which induces biases in local models and leads to\na 'local forgetting' phenomenon, where the ability of models to generalize\ndegrades over time, particularly for underrepresented classes. This paper\nintroduces FedDistill, a framework enhancing the knowledge transfer from the\nglobal model to local models, focusing on the issue of imbalanced class\ndistribution. Specifically, FedDistill employs group distillation, segmenting\nclasses based on their frequency in local datasets to facilitate a focused\ndistillation process to classes with fewer samples. Additionally, FedDistill\ndissects the global model into a feature extractor and a classifier. This\nseparation empowers local models with more generalized data representation\ncapabilities and ensures more accurate classification across all classes.\nFedDistill mitigates the adverse effects of data imbalance, ensuring that local\nmodels do not forget underrepresented classes but instead become more adept at\nrecognizing and classifying them accurately. Our comprehensive experiments\ndemonstrate FedDistill's effectiveness, surpassing existing baselines in\naccuracy and convergence speed across several benchmark datasets.\n","authors":["Changlin Song","Divya Saxena","Jiannong Cao","Yuqing Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.09210v1.pdf","comment":"13 pages, 9 figures, 5 tables"},{"id":"http://arxiv.org/abs/2312.03441v5","updated":"2024-04-14T10:13:25Z","published":"2023-12-06T11:50:14Z","title":"UFineBench: Towards Text-based Person Retrieval with Ultra-fine\n Granularity","summary":" Existing text-based person retrieval datasets often have relatively\ncoarse-grained text annotations. This hinders the model to comprehend the\nfine-grained semantics of query texts in real scenarios. To address this\nproblem, we contribute a new benchmark named \\textbf{UFineBench} for text-based\nperson retrieval with ultra-fine granularity.\n Firstly, we construct a new \\textbf{dataset} named UFine6926. We collect a\nlarge number of person images and manually annotate each image with two\ndetailed textual descriptions, averaging 80.8 words each. The average word\ncount is three to four times that of the previous datasets. In addition of\nstandard in-domain evaluation, we also propose a special \\textbf{evaluation\nparadigm} more representative of real scenarios. It contains a new evaluation\nset with cross domains, cross textual granularity and cross textual styles,\nnamed UFine3C, and a new evaluation metric for accurately measuring retrieval\nability, named mean Similarity Distribution (mSD). Moreover, we propose CFAM, a\nmore efficient \\textbf{algorithm} especially designed for text-based person\nretrieval with ultra fine-grained texts. It achieves fine granularity mining by\nadopting a shared cross-modal granularity decoder and hard negative match\nmechanism.\n With standard in-domain evaluation, CFAM establishes competitive performance\nacross various datasets, especially on our ultra fine-grained UFine6926.\nFurthermore, by evaluating on UFine3C, we demonstrate that training on our\nUFine6926 significantly improves generalization to real scenarios compared with\nother coarse-grained datasets. 
The dataset and code will be made publicly\navailable at \\url{https://github.com/Zplusdragon/UFineBench}.\n","authors":["Jialong Zuo","Hanyu Zhou","Ying Nie","Feng Zhang","Tianyu Guo","Nong Sang","Yunhe Wang","Changxin Gao"],"pdf_url":"https://arxiv.org/pdf/2312.03441v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09204v1","updated":"2024-04-14T09:48:37Z","published":"2024-04-14T09:48:37Z","title":"TextHawk: Exploring Efficient Fine-Grained Perception of Multimodal\n Large Language Models","summary":" Multimodal Large Language Models (MLLMs) have shown impressive results on\nvarious multimodal tasks. However, most existing MLLMs are not well suited for\ndocument-oriented tasks, which require fine-grained image perception and\ninformation compression. In this paper, we present TextHawk, a MLLM that is\nspecifically designed for document-oriented tasks, while preserving the general\ncapabilities of MLLMs. TextHawk is aimed to explore efficient fine-grained\nperception by designing four dedicated components. Firstly, a ReSampling and\nReArrangement (ReSA) module is proposed to reduce the redundancy in the\ndocument texts and lower the computational cost of the MLLM. We explore\nencoding the positions of each local feature by presenting Scalable Positional\nEmbeddings (SPEs), which can preserve the scalability of various image sizes. A\nQuery Proposal Network (QPN) is then adopted to initialize the queries\ndynamically among different sub-images. To further enhance the fine-grained\nvisual perceptual ability of the MLLM, we design a Multi-Level Cross-Attention\n(MLCA) mechanism that captures the hierarchical structure and semantic\nrelations of document images. Furthermore, we create a new instruction-tuning\ndataset for document-oriented tasks by enriching the multimodal document data\nwith Gemini Pro. We conduct extensive experiments on both general and\ndocument-oriented MLLM benchmarks, and show that TextHawk outperforms the\nstate-of-the-art methods, demonstrating its effectiveness and superiority in\nfine-grained document perception and general abilities.\n","authors":["Ya-Qi Yu","Minghui Liao","Jihao Wu","Yongxin Liao","Xiaoyu Zheng","Wei Zeng"],"pdf_url":"https://arxiv.org/pdf/2404.09204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06075v2","updated":"2024-04-14T09:28:28Z","published":"2024-03-10T03:43:02Z","title":"Multisize Dataset Condensation","summary":" While dataset condensation effectively enhances training efficiency, its\napplication in on-device scenarios brings unique challenges. 1) Due to the\nfluctuating computational resources of these devices, there's a demand for a\nflexible dataset size that diverges from a predefined size. 2) The limited\ncomputational power on devices often prevents additional condensation\noperations. These two challenges connect to the \"subset degradation problem\" in\ntraditional dataset condensation: a subset from a larger condensed dataset is\noften unrepresentative compared to directly condensing the whole dataset to\nthat smaller size. In this paper, we propose Multisize Dataset Condensation\n(MDC) by compressing N condensation processes into a single condensation\nprocess to obtain datasets with multiple sizes. Specifically, we introduce an\n\"adaptive subset loss\" on top of the basic condensation loss to mitigate the\n\"subset degradation problem\". Our MDC method offers several benefits: 1) No\nadditional condensation process is required; 2) reduced storage requirement by\nreusing condensed images. 
Experiments validate our findings on networks\nincluding ConvNet, ResNet and DenseNet, and datasets including SVHN, CIFAR-10,\nCIFAR-100 and ImageNet. For example, we achieved 5.22%-6.40% average accuracy\ngains on condensing CIFAR-10 to ten images per class. Code is available at:\nhttps://github.com/he-y/Multisize-Dataset-Condensation.\n","authors":["Yang He","Lingao Xiao","Joey Tianyi Zhou","Ivor Tsang"],"pdf_url":"https://arxiv.org/pdf/2403.06075v2.pdf","comment":"Accepted by ICLR 2024 Oral"},{"id":"http://arxiv.org/abs/2404.06564v3","updated":"2024-04-14T09:14:23Z","published":"2024-04-09T18:28:55Z","title":"MambaAD: Exploring State Space Models for Multi-class Unsupervised\n Anomaly Detection","summary":" Recent advancements in anomaly detection have seen the efficacy of CNN- and\ntransformer-based approaches. However, CNNs struggle with long-range\ndependencies, while transformers are burdened by quadratic computational\ncomplexity. Mamba-based models, with their superior long-range modeling and\nlinear efficiency, have garnered substantial attention. This study pioneers the\napplication of Mamba to multi-class unsupervised anomaly detection, presenting\nMambaAD, which consists of a pre-trained encoder and a Mamba decoder featuring\n(Locality-Enhanced State Space) LSS modules at multi-scales. The proposed LSS\nmodule, integrating parallel cascaded (Hybrid State Space) HSS blocks and\nmulti-kernel convolutions operations, effectively captures both long-range and\nlocal information. The HSS block, utilizing (Hybrid Scanning) HS encoders,\nencodes feature maps into five scanning methods and eight directions, thereby\nstrengthening global connections through the (State Space Model) SSM. The use\nof Hilbert scanning and eight directions significantly improves feature\nsequence modeling. Comprehensive experiments on six diverse anomaly detection\ndatasets and seven metrics demonstrate state-of-the-art performance,\nsubstantiating the method's effectiveness.\n","authors":["Haoyang He","Yuhu Bai","Jiangning Zhang","Qingdong He","Hongxu Chen","Zhenye Gan","Chengjie Wang","Xiangtai Li","Guanzhong Tian","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2404.06564v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09193v1","updated":"2024-04-14T09:01:26Z","published":"2024-04-14T09:01:26Z","title":"FaceCat: Enhancing Face Recognition Security with a Unified Generative\n Model Framework","summary":" Face anti-spoofing (FAS) and adversarial detection (FAD) have been regarded\nas critical technologies to ensure the safety of face recognition systems. As a\nconsequence of their limited practicality and generalization, some existing\nmethods aim to devise a framework capable of concurrently detecting both\nthreats to address the challenge. Nevertheless, these methods still encounter\nchallenges of insufficient generalization and suboptimal robustness,\npotentially owing to the inherent drawback of discriminative models. Motivated\nby the rich structural and detailed features of face generative models, we\npropose FaceCat which utilizes the face generative model as a pre-trained model\nto improve the performance of FAS and FAD. Specifically, FaceCat elaborately\ndesigns a hierarchical fusion mechanism to capture rich face semantic features\nof the generative model. These features then serve as a robust foundation for a\nlightweight head, designed to execute FAS and FAD tasks simultaneously. 
As\nrelying solely on single-modality data often leads to suboptimal performance,\nwe further propose a novel text-guided multi-modal alignment strategy that\nutilizes text prompts to enrich feature representation, thereby enhancing\nperformance. For fair evaluations, we build a comprehensive protocol with a\nwide range of 28 attack types to benchmark the performance. Extensive\nexperiments validate that FaceCat generalizes significantly\nbetter and obtains excellent robustness against input transformations.\n","authors":["Jiawei Chen","Xiao Yang","Yinpeng Dong","Hang Su","Jianteng Peng","Zhaoxia Yin"],"pdf_url":"https://arxiv.org/pdf/2404.09193v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.06270v2","updated":"2024-04-14T08:40:51Z","published":"2024-04-09T12:47:30Z","title":"3D Geometry-aware Deformable Gaussian Splatting for Dynamic View\n Synthesis","summary":" In this paper, we propose a 3D geometry-aware deformable Gaussian Splatting\nmethod for dynamic view synthesis. Existing neural radiance fields (NeRF) based\nsolutions learn the deformation in an implicit manner, which cannot incorporate\n3D scene geometry. Therefore, the learned deformation is not necessarily\ngeometrically coherent, which results in unsatisfactory dynamic view synthesis\nand 3D dynamic reconstruction. Recently, 3D Gaussian Splatting provides a new\nrepresentation of the 3D scene, building upon which the 3D geometry could be\nexploited in learning the complex 3D deformation. Specifically, the scenes are\nrepresented as a collection of 3D Gaussians, where each 3D Gaussian is optimized\nto move and rotate over time to model the deformation. To enforce the 3D scene\ngeometry constraint during deformation, we explicitly extract 3D geometry\nfeatures and integrate them in learning the 3D deformation. In this way, our\nsolution achieves 3D geometry-aware deformation modeling, which enables\nimproved dynamic view synthesis and 3D dynamic reconstruction. Extensive\nexperimental results on both synthetic and real datasets prove the superiority\nof our solution, which achieves new state-of-the-art performance.\n The project is available at https://npucvr.github.io/GaGS/\n","authors":["Zhicheng Lu","Xiang Guo","Le Hui","Tianrui Chen","Min Yang","Xiao Tang","Feng Zhu","Yuchao Dai"],"pdf_url":"https://arxiv.org/pdf/2404.06270v2.pdf","comment":"Accepted by CVPR 2024. Project page: https://npucvr.github.io/GaGS/"},{"id":"http://arxiv.org/abs/2404.09179v1","updated":"2024-04-14T08:09:33Z","published":"2024-04-14T08:09:33Z","title":"Change Guiding Network: Incorporating Change Prior to Guide Change\n Detection in Remote Sensing Imagery","summary":" The rapid advancement of automated artificial intelligence algorithms and\nremote sensing instruments has benefited change detection (CD) tasks. However,\nthere is still much room for improvement in precise detection, especially regarding the\nedge integrity and internal-hole phenomena of change features. In order to\nsolve these problems, we design the Change Guiding Network (CGNet) to tackle\nthe insufficient expression problem of change features in the conventional\nU-Net structure adopted in previous methods, which causes inaccurate edge\ndetection and internal holes. 
Change maps from deep features with rich semantic\ninformation are generated and used as prior information to guide multi-scale\nfeature fusion, which can improve the expression ability of change features.\nMeanwhile, we propose a self-attention module named Change Guide Module (CGM),\nwhich can effectively capture the long-distance dependency among pixels and\neffectively overcome the problem of the insufficient receptive field of\ntraditional convolutional neural networks. On four major CD datasets, we verify\nthe usefulness and efficiency of the CGNet, and a large number of experiments\nand ablation studies demonstrate the effectiveness of CGNet. We will\nopen-source our code at https://github.com/ChengxiHAN/CGNet-CD.\n","authors":["Chengxi Han","Chen Wu","Haonan Guo","Meiqi Hu","Jiepan Li","Hongruixuan Chen"],"pdf_url":"https://arxiv.org/pdf/2404.09179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09178v1","updated":"2024-04-14T08:01:27Z","published":"2024-04-14T08:01:27Z","title":"HANet: A Hierarchical Attention Network for Change Detection With\n Bitemporal Very-High-Resolution Remote Sensing Images","summary":" Benefiting from the developments in deep learning technology,\ndeep-learning-based algorithms employing automatic feature extraction have\nachieved remarkable performance on the change detection (CD) task. However, the\nperformance of existing deep-learning-based CD methods is hindered by the\nimbalance between changed and unchanged pixels. To tackle this problem, a\nprogressive foreground-balanced sampling strategy on the basis of not adding\nchange information is proposed in this article to help the model accurately\nlearn the features of the changed pixels during the early training process and\nthereby improve detection performance. Furthermore, we design a discriminative\nSiamese network, hierarchical attention network (HANet), which can integrate\nmultiscale features and refine detailed features. The main part of HANet is the\nHAN module, which is a lightweight and effective self-attention mechanism.\nExtensive experiments and ablation studies on two CD datasets with extremely\nunbalanced labels validate the effectiveness and efficiency of the proposed\nmethod.\n","authors":["Chengxi Han","Chen Wu","Haonan Guo","Meiqi Hu","Hongruixuan Chen"],"pdf_url":"https://arxiv.org/pdf/2404.09178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04760v3","updated":"2024-04-14T07:44:19Z","published":"2023-07-10T17:58:17Z","title":"Learning Spatial Features from Audio-Visual Correspondence in Egocentric\n Videos","summary":" We propose a self-supervised method for learning representations based on\nspatial audio-visual correspondences in egocentric videos. Our method uses a\nmasked auto-encoding framework to synthesize masked binaural (multi-channel)\naudio through the synergy of audio and vision, thereby learning useful spatial\nrelationships between the two modalities. We use our pretrained features to\ntackle two downstream video tasks requiring spatial understanding in social\nscenarios: active speaker detection and spatial audio denoising. 
Through\nextensive experiments, we show that our features are generic enough to improve\nover multiple state-of-the-art baselines on both tasks on two challenging\negocentric video datasets that offer binaural audio, EgoCom and EasyCom.\nProject: http://vision.cs.utexas.edu/projects/ego_av_corr.\n","authors":["Sagnik Majumder","Ziad Al-Halah","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2307.04760v3.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09172v1","updated":"2024-04-14T07:36:18Z","published":"2024-04-14T07:36:18Z","title":"LoopAnimate: Loopable Salient Object Animation","summary":" Research on diffusion model-based video generation has advanced rapidly.\nHowever, limitations in object fidelity and generation length hinder its\npractical applications. Additionally, specific domains like animated wallpapers\nrequire seamless looping, where the first and last frames of the video match\nseamlessly. To address these challenges, this paper proposes LoopAnimate, a\nnovel method for generating videos with consistent start and end frames. To\nenhance object fidelity, we introduce a framework that decouples multi-level\nimage appearance and textual semantic information. Building upon an\nimage-to-image diffusion model, our approach incorporates both pixel-level and\nfeature-level information from the input image, injecting image appearance and\ntextual semantic embeddings at different positions of the diffusion model.\nExisting UNet-based video generation models require the entire video to be input\nduring training to encode temporal and positional information at once. However,\ndue to limitations in GPU memory, the number of frames is typically restricted\nto 16. To address this, this paper proposes a three-stage training strategy\nwith progressively increasing frame numbers and reducing fine-tuning modules.\nAdditionally, we introduce the Temporal Enhanced Motion Module (TEMM) to extend\nthe capacity for encoding temporal and positional information up to 36 frames.\nThe proposed LoopAnimate, for the first time, extends the single-pass\ngeneration length of UNet-based video generation models to 35 frames while\nmaintaining high-quality video generation. Experiments demonstrate that\nLoopAnimate achieves state-of-the-art performance in both objective metrics,\nsuch as fidelity and temporal consistency, and subjective evaluation results.\n","authors":["Fanyi Wang","Peng Liu","Haotian Hu","Dan Meng","Jingwen Su","Jinjin Xu","Yanhao Zhang","Xiaoming Ren","Zhiwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.09172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19473v3","updated":"2024-04-14T07:01:41Z","published":"2024-02-29T18:59:01Z","title":"Retrieval-Augmented Generation for AI-Generated Content: A Survey","summary":" Advancements in model algorithms, the growth of foundational models, and\naccess to high-quality datasets have propelled the evolution of Artificial\nIntelligence Generated Content (AIGC). Despite its notable successes, AIGC\nstill faces hurdles such as updating knowledge, handling long-tail data,\nmitigating data leakage, and managing high training and inference costs.\nRetrieval-Augmented Generation (RAG) has recently emerged as a paradigm to\naddress such challenges. In particular, RAG introduces the information\nretrieval process, which enhances the generation process by retrieving relevant\nobjects from available data stores, leading to higher accuracy and better\nrobustness. 
In this paper, we comprehensively review existing efforts that\nintegrate RAG technique into AIGC scenarios. We first classify RAG foundations\naccording to how the retriever augments the generator, distilling the\nfundamental abstractions of the augmentation methodologies for various\nretrievers and generators. This unified perspective encompasses all RAG\nscenarios, illuminating advancements and pivotal technologies that help with\npotential future progress. We also summarize additional enhancements methods\nfor RAG, facilitating effective engineering and implementation of RAG systems.\nThen from another view, we survey on practical applications of RAG across\ndifferent modalities and tasks, offering valuable references for researchers\nand practitioners. Furthermore, we introduce the benchmarks for RAG, discuss\nthe limitations of current RAG systems, and suggest potential directions for\nfuture research. Github: https://github.com/PKU-DAIR/RAG-Survey.\n","authors":["Penghao Zhao","Hailin Zhang","Qinhan Yu","Zhengren Wang","Yunteng Geng","Fangcheng Fu","Ling Yang","Wentao Zhang","Jie Jiang","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2402.19473v3.pdf","comment":"Citing 377 papers, 28 pages, 1 table, 12 figures. Project:\n https://github.com/PKU-DAIR/RAG-Survey"},{"id":"http://arxiv.org/abs/2401.03890v2","updated":"2024-04-14T06:50:24Z","published":"2024-01-08T13:42:59Z","title":"A Survey on 3D Gaussian Splatting","summary":" 3D Gaussian splatting (GS) has recently emerged as a transformative technique\nin the realm of explicit radiance field and computer graphics. This innovative\napproach, characterized by the utilization of millions of learnable 3D\nGaussians, represents a significant departure from mainstream neural radiance\nfield approaches, which predominantly use implicit, coordinate-based models to\nmap spatial coordinates to pixel values. 3D GS, with its explicit scene\nrepresentation and differentiable rendering algorithm, not only promises\nreal-time rendering capability but also introduces unprecedented levels of\neditability. This positions 3D GS as a potential game-changer for the next\ngeneration of 3D reconstruction and representation. In the present paper, we\nprovide the first systematic overview of the recent developments and critical\ncontributions in the domain of 3D GS. We begin with a detailed exploration of\nthe underlying principles and the driving forces behind the emergence of 3D GS,\nlaying the groundwork for understanding its significance. A focal point of our\ndiscussion is the practical applicability of 3D GS. By enabling unprecedented\nrendering speed, 3D GS opens up a plethora of applications, ranging from\nvirtual reality to interactive media and beyond. This is complemented by a\ncomparative analysis of leading 3D GS models, evaluated across various\nbenchmark tasks to highlight their performance and practical utility. The\nsurvey concludes by identifying current challenges and suggesting potential\navenues for future research in this domain. 
Through this survey, we aim to\nprovide a valuable resource for both newcomers and seasoned researchers,\nfostering further exploration and advancement in applicable and explicit\nradiance field representation.\n","authors":["Guikun Chen","Wenguan Wang"],"pdf_url":"https://arxiv.org/pdf/2401.03890v2.pdf","comment":"Ongoing project"},{"id":"http://arxiv.org/abs/2404.09161v1","updated":"2024-04-14T06:46:16Z","published":"2024-04-14T06:46:16Z","title":"Coreset Selection for Object Detection","summary":" Coreset selection is a method for selecting a small, representative subset of\nan entire dataset. It has been primarily researched in image classification,\nassuming there is only one object per image. However, coreset selection for\nobject detection is more challenging as an image can contain multiple objects.\nAs a result, much research has yet to be done on this topic. Therefore, we\nintroduce a new approach, Coreset Selection for Object Detection (CSOD). CSOD\ngenerates imagewise and classwise representative feature vectors for multiple\nobjects of the same class within each image. Subsequently, we adopt submodular\noptimization for considering both representativeness and diversity and utilize\nthe representative vectors in the submodular optimization process to select a\nsubset. When we evaluated CSOD on the Pascal VOC dataset, CSOD outperformed\nrandom selection by +6.4%p in AP$_{50}$ when selecting 200 images.\n","authors":["Hojun Lee","Suyoung Kim","Junhoo Lee","Jaeyoung Yoo","Nojun Kwak"],"pdf_url":"https://arxiv.org/pdf/2404.09161v1.pdf","comment":"Accepted by CVPR 2024: 1st Workshop on Dataset Distillation for\n Computer Vision"},{"id":"http://arxiv.org/abs/2404.09158v1","updated":"2024-04-14T06:19:46Z","published":"2024-04-14T06:19:46Z","title":"StreakNet-Arch: An Anti-scattering Network-based Architecture for\n Underwater Carrier LiDAR-Radar Imaging","summary":" In this paper, we introduce StreakNet-Arch, a novel signal processing\narchitecture designed for Underwater Carrier LiDAR-Radar (UCLR) imaging\nsystems, to address the limitations in scatter suppression and real-time\nimaging. StreakNet-Arch formulates the signal processing as a real-time,\nend-to-end binary classification task, enabling real-time image acquisition. To\nachieve this, we leverage Self-Attention networks and propose a novel Double\nBranch Cross Attention (DBC-Attention) mechanism that surpasses the performance\nof traditional methods. Furthermore, we present a method for embedding\nstreak-tube camera images into attention networks, effectively acting as a\nlearned bandpass filter. To facilitate further research, we contribute a\npublicly available streak-tube camera image dataset. The dataset contains\n2,695,168 real-world underwater 3D point cloud data. These advancements\nsignificantly improve UCLR capabilities, enhancing its performance and\napplicability in underwater imaging tasks. The source code and dataset can be\nfound at https://github.com/BestAnHongjun/StreakNet .\n","authors":["Xuelong Li","Hongjun An","Guangying Li","Xing Wang","Guanghua Cheng","Zhe Sun"],"pdf_url":"https://arxiv.org/pdf/2404.09158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09146v1","updated":"2024-04-14T05:28:46Z","published":"2024-04-14T05:28:46Z","title":"Fusion-Mamba for Cross-modality Object Detection","summary":" Cross-modality fusing complementary information from different modalities\neffectively improves object detection performance, making it more useful and\nrobust for a wider range of applications. 
Existing fusion strategies combine\ndifferent types of images or merge different backbone features through\nelaborated neural network modules. However, these methods neglect that modality\ndisparities affect cross-modality fusion performance, as different modalities\nwith different camera focal lengths, placements, and angles are hardly fused.\nIn this paper, we investigate cross-modality fusion by associating cross-modal\nfeatures in a hidden state space based on an improved Mamba with a gating\nmechanism. We design a Fusion-Mamba block (FMB) to map cross-modal features\ninto a hidden state space for interaction, thereby reducing disparities between\ncross-modal features and enhancing the representation consistency of fused\nfeatures. FMB contains two modules: the State Space Channel Swapping (SSCS)\nmodule facilitates shallow feature fusion, and the Dual State Space Fusion\n(DSSF) enables deep fusion in a hidden state space. Through extensive\nexperiments on public datasets, our proposed approach outperforms the\nstate-of-the-art methods on $m$AP with 5.9% on $M^3FD$ and 4.9% on FLIR-Aligned\ndatasets, demonstrating superior object detection performance. To the best of\nour knowledge, this is the first work to explore the potential of Mamba for\ncross-modal fusion and establish a new baseline for cross-modality object\ndetection.\n","authors":["Wenhao Dong","Haodong Zhu","Shaohui Lin","Xiaoyan Luo","Yunhang Shen","Xuhui Liu","Juan Zhang","Guodong Guo","Baochang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.09146v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17493v3","updated":"2024-04-14T05:20:10Z","published":"2023-05-27T15:10:41Z","title":"The Curse of Recursion: Training on Generated Data Makes Models Forget","summary":" Stable Diffusion revolutionised image creation from descriptive text. GPT-2,\nGPT-3(.5) and GPT-4 demonstrated astonishing performance across a variety of\nlanguage tasks. ChatGPT introduced such language models to the general public.\nIt is now clear that large language models (LLMs) are here to stay, and will\nbring about drastic change in the whole ecosystem of online text and images. In\nthis paper we consider what the future might hold. What will happen to GPT-{n}\nonce LLMs contribute much of the language found online? We find that use of\nmodel-generated content in training causes irreversible defects in the\nresulting models, where tails of the original content distribution disappear.\nWe refer to this effect as Model Collapse and show that it can occur in\nVariational Autoencoders, Gaussian Mixture Models and LLMs. We build\ntheoretical intuition behind the phenomenon and portray its ubiquity amongst\nall learned generative models. We demonstrate that it has to be taken seriously\nif we are to sustain the benefits of training from large-scale data scraped\nfrom the web. 
Indeed, the value of data collected about genuine human\ninteractions with systems will be increasingly valuable in the presence of\ncontent generated by LLMs in data crawled from the Internet.\n","authors":["Ilia Shumailov","Zakhar Shumaylov","Yiren Zhao","Yarin Gal","Nicolas Papernot","Ross Anderson"],"pdf_url":"https://arxiv.org/pdf/2305.17493v3.pdf","comment":"Fixed typos in eqn 4,5"},{"id":"http://arxiv.org/abs/2303.12307v3","updated":"2024-04-14T05:16:49Z","published":"2023-03-22T04:49:23Z","title":"Curvature-Balanced Feature Manifold Learning for Long-Tailed\n Classification","summary":" To address the challenges of long-tailed classification, researchers have\nproposed several approaches to reduce model bias, most of which assume that\nclasses with few samples are weak classes. However, recent studies have shown\nthat tail classes are not always hard to learn, and model bias has been\nobserved on sample-balanced datasets, suggesting the existence of other factors\nthat affect model bias. In this work, we systematically propose a series of\ngeometric measurements for perceptual manifolds in deep neural networks, and\nthen explore the effect of the geometric characteristics of perceptual\nmanifolds on classification difficulty and how learning shapes the geometric\ncharacteristics of perceptual manifolds. An unanticipated finding is that the\ncorrelation between the class accuracy and the separation degree of perceptual\nmanifolds gradually decreases during training, while the negative correlation\nwith the curvature gradually increases, implying that curvature imbalance leads\nto model bias. Therefore, we propose curvature regularization to facilitate the\nmodel to learn curvature-balanced and flatter perceptual manifolds. Evaluations\non multiple long-tailed and non-long-tailed datasets show the excellent\nperformance and exciting generality of our approach, especially in achieving\nsignificant performance improvements based on current state-of-the-art\ntechniques. Our work opens up a geometric analysis perspective on model bias\nand reminds researchers to pay attention to model bias on non-long-tailed and\neven sample-balanced datasets. The code and model will be made public.\n","authors":["Yanbiao Ma","Licheng Jiao","Fang Liu","Shuyuan Yang","Xu Liu","Lingling Li"],"pdf_url":"https://arxiv.org/pdf/2303.12307v3.pdf","comment":"20pages, Accepted by CVPR 2023"},{"id":"http://arxiv.org/abs/2301.00349v3","updated":"2024-04-14T03:59:35Z","published":"2023-01-01T05:02:46Z","title":"Towards Reliable Medical Image Segmentation by utilizing Evidential\n Calibrated Uncertainty","summary":" Medical image segmentation is critical for disease diagnosis and treatment\nassessment. However, concerns regarding the reliability of segmentation regions\npersist among clinicians, mainly attributed to the absence of confidence\nassessment, robustness, and calibration to accuracy. To address this, we\nintroduce DEviS, an easily implementable foundational model that seamlessly\nintegrates into various medical image segmentation networks. DEviS not only\nenhances the calibration and robustness of baseline segmentation accuracy but\nalso provides high-efficiency uncertainty estimation for reliable predictions.\nBy leveraging subjective logic theory, we explicitly model probability and\nuncertainty for the problem of medical image segmentation. Here, the Dirichlet\ndistribution parameterizes the distribution of probabilities for different\nclasses of the segmentation results. 
To generate calibrated predictions and\nuncertainty, we develop a trainable calibrated uncertainty penalty.\nFurthermore, DEviS incorporates an uncertainty-aware filtering module, which\nutilizes the metric of uncertainty-calibrated error to filter reliable data\nwithin the dataset. We conducted validation studies to assess both the accuracy\nand robustness of DEviS segmentation, along with evaluating the efficiency and\nreliability of uncertainty estimation. These evaluations were performed using\npublicly available datasets including ISIC2018, LiTS2017, and BraTS2019.\nAdditionally, two potential clinical trials are being conducted at Johns\nHopkins OCT, Duke-OCT-DME, and FIVES datasets to demonstrate their efficacy in\nfiltering high-quality or out-of-distribution data. Our code has been released\nin https://github.com/Cocofeat/DEviS.\n","authors":["Ke Zou","Yidi Chen","Ling Huang","Xuedong Yuan","Xiaojing Shen","Meng Wang","Rick Siow Mong Goh","Yong Liu","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2301.00349v3.pdf","comment":"34 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.09115v1","updated":"2024-04-14T01:51:11Z","published":"2024-04-14T01:51:11Z","title":"GCC: Generative Calibration Clustering","summary":" Deep clustering as an important branch of unsupervised representation\nlearning focuses on embedding semantically similar samples into the identical\nfeature space. This core demand inspires the exploration of contrastive\nlearning and subspace clustering. However, these solutions always rely on the\nbasic assumption that there are sufficient and category-balanced samples for\ngenerating valid high-level representation. This hypothesis actually is too\nstrict to be satisfied for real-world applications. To overcome such a\nchallenge, the natural strategy is utilizing generative models to augment\nconsiderable instances. How to use these novel samples to effectively fulfill\nclustering performance improvement is still difficult and under-explored. In\nthis paper, we propose a novel Generative Calibration Clustering (GCC) method\nto delicately incorporate feature learning and augmentation into clustering\nprocedure. First, we develop a discriminative feature alignment mechanism to\ndiscover intrinsic relationship across real and generated samples. Second, we\ndesign a self-supervised metric learning to generate more reliable cluster\nassignment to boost the conditional diffusion generation. Extensive\nexperimental results on three benchmarks validate the effectiveness and\nadvantage of our proposed method over the state-of-the-art methods.\n","authors":["Haifeng Xia","Hai Huang","Zhengming Ding"],"pdf_url":"https://arxiv.org/pdf/2404.09115v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09111v1","updated":"2024-04-14T01:23:19Z","published":"2024-04-14T01:23:19Z","title":"Exploring Generative AI for Sim2Real in Driving Data Synthesis","summary":" Datasets are essential for training and testing vehicle perception\nalgorithms. However, the collection and annotation of real-world images is\ntime-consuming and expensive. Driving simulators offer a solution by\nautomatically generating various driving scenarios with corresponding\nannotations, but the simulation-to-reality (Sim2Real) domain gap remains a\nchallenge. 
While most Generative Artificial Intelligence (AI) approaches follow\nthe de facto Generative Adversarial Nets (GANs)-based methods, the recent\nemerging diffusion probabilistic models have not been fully explored in\nmitigating Sim2Real challenges for driving data synthesis. To explore the\nperformance, this paper applied three different generative AI methods to\nleverage semantic label maps from a driving simulator as a bridge for the\ncreation of realistic datasets. A comparative analysis of these methods is\npresented from the perspective of image quality and perception. New synthetic\ndatasets, which include driving images and auto-generated high-quality\nannotations, are produced with low costs and high scene variability. The\nexperimental results show that although GAN-based methods are adept at\ngenerating high-quality images when provided with manually annotated labels,\nControlNet produces synthetic datasets with fewer artefacts and more structural\nfidelity when using simulator-generated labels. This suggests that the\ndiffusion-based approach may provide improved stability and an alternative\nmethod for addressing Sim2Real challenges.\n","authors":["Haonan Zhao","Yiting Wang","Thomas Bashford-Rogers","Valentina Donzella","Kurt Debattista"],"pdf_url":"https://arxiv.org/pdf/2404.09111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09105v1","updated":"2024-04-14T00:08:56Z","published":"2024-04-14T00:08:56Z","title":"EGGS: Edge Guided Gaussian Splatting for Radiance Fields","summary":" Gaussian splatting methods are becoming popular. However, their loss\nfunction only contains the $\ell_1$ norm and the structural similarity between\nthe rendered and input images, without considering the edges in these images.\nIt is well-known that the edges in an image provide important information.\nTherefore, in this paper, we propose an Edge Guided Gaussian Splatting (EGGS)\nmethod that leverages the edges in the input images. More specifically, we give\nthe edge region a higher weight than the flat region. With such edge guidance,\nthe resulting Gaussian particles focus more on the edges instead of the flat\nregions. Moreover, such edge guidance does not increase the computation cost\nduring the training and rendering stages. The experiments confirm that such a\nsimple edge-weighted loss function indeed improves results by about $1\sim2$ dB on several\ndifferent datasets. By simply plugging in the edge guidance, the proposed\nmethod can improve all Gaussian splatting methods in different scenarios, such\nas human head modeling, building 3D reconstruction, etc.\n","authors":["Yuanhao Gong"],"pdf_url":"https://arxiv.org/pdf/2404.09105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10539v1","updated":"2024-04-14T15:49:02Z","published":"2024-04-14T15:49:02Z","title":"VideoSAGE: Video Summarization with Graph Representation Learning","summary":" We propose a graph-based representation learning framework for video\nsummarization. First, we convert an input video to a graph where nodes\ncorrespond to each of the video frames. Then, we impose sparsity on the graph\nby connecting only those pairs of nodes that are within a specified temporal\ndistance. We then formulate the video summarization task as a binary node\nclassification problem, precisely classifying whether video frames should\nbelong to the output summary video. 
A graph constructed this way aims to\ncapture long-range interactions among video frames, and the sparsity ensures\nthe model trains without hitting the memory and compute bottleneck. Experiments\non two datasets (SumMe and TVSum) demonstrate the effectiveness of the proposed\nnimble model compared to existing state-of-the-art summarization approaches\nwhile being one order of magnitude more efficient in compute time and memory.\n","authors":["Jose M. Rojas Chaves","Subarna Tripathi"],"pdf_url":"https://arxiv.org/pdf/2404.10539v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2207.07783"}]},"2024-04-13T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.13659v4","updated":"2024-04-13T22:52:19Z","published":"2024-03-20T15:08:43Z","title":"Recursive Joint Cross-Modal Attention for Multimodal Fusion in\n Dimensional Emotion Recognition","summary":" Though multimodal emotion recognition has achieved significant progress over\nrecent years, the potential of rich synergic relationships across the\nmodalities is not fully exploited. In this paper, we introduce Recursive Joint\nCross-Modal Attention (RJCMA) to effectively capture both intra- and\ninter-modal relationships across audio, visual, and text modalities for\ndimensional emotion recognition. In particular, we compute the attention\nweights based on cross-correlation between the joint audio-visual-text feature\nrepresentations and the feature representations of individual modalities to\nsimultaneously capture intra- and intermodal relationships across the\nmodalities. The attended features of the individual modalities are again fed as\ninput to the fusion model in a recursive mechanism to obtain more refined\nfeature representations. We have also explored Temporal Convolutional Networks\n(TCNs) to improve the temporal modeling of the feature representations of\nindividual modalities. Extensive experiments are conducted to evaluate the\nperformance of the proposed fusion model on the challenging Affwild2 dataset.\nBy effectively capturing the synergic intra- and inter-modal relationships\nacross audio, visual, and text modalities, the proposed fusion model achieves a\nConcordance Correlation Coefficient (CCC) of 0.585 (0.542) and 0.674 (0.619)\nfor valence and arousal respectively on the validation set (test set). This\nshows a significant improvement over the baseline of 0.240 (0.211) and 0.200\n(0.191) for valence and arousal, respectively, in the validation set (test\nset), achieving second place in the valence-arousal challenge of the 6th\nAffective Behavior Analysis in-the-Wild (ABAW) competition.\n","authors":["R. Gnana Praveen","Jahangir Alam"],"pdf_url":"https://arxiv.org/pdf/2403.13659v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09081v1","updated":"2024-04-13T21:02:49Z","published":"2024-04-13T21:02:49Z","title":"Probabilistic Directed Distance Fields for Ray-Based Shape\n Representations","summary":" In modern computer vision, the optimal representation of 3D shape continues\nto be task-dependent. One fundamental operation applied to such representations\nis differentiable rendering, as it enables inverse graphics approaches in\nlearning frameworks. Standard explicit shape representations (voxels, point\nclouds, or meshes) are often easily rendered, but can suffer from limited\ngeometric fidelity, among other issues. 
On the other hand, implicit\nrepresentations (occupancy, distance, or radiance fields) preserve greater\nfidelity, but suffer from complex or inefficient rendering processes, limiting\nscalability. In this work, we devise Directed Distance Fields (DDFs), a novel\nneural shape representation that builds upon classical distance fields. The\nfundamental operation in a DDF maps an oriented point (position and direction)\nto surface visibility and depth. This enables efficient differentiable\nrendering, obtaining depth with a single forward pass per pixel, as well as\ndifferential geometric quantity extraction (e.g., surface normals), with only\nadditional backward passes. Using probabilistic DDFs (PDDFs), we show how to\nmodel inherent discontinuities in the underlying field. We then apply DDFs to\nseveral applications, including single-shape fitting, generative modelling, and\nsingle-image 3D reconstruction, showcasing strong performance with simple\narchitectural components via the versatility of our representation. Finally,\nsince the dimensionality of DDFs permits view-dependent geometric artifacts, we\nconduct a theoretical investigation of the constraints necessary for view\nconsistency. We find a small set of field properties that are sufficient to\nguarantee a DDF is consistent, without knowing, for instance, which shape the\nfield is expressing.\n","authors":["Tristan Aumentado-Armstrong","Stavros Tsogkas","Sven Dickinson","Allan Jepson"],"pdf_url":"https://arxiv.org/pdf/2404.09081v1.pdf","comment":"Extension of arXiv:2112.05300"},{"id":"http://arxiv.org/abs/2403.11376v3","updated":"2024-04-13T20:42:17Z","published":"2024-03-18T00:03:48Z","title":"ShapeFormer: Shape Prior Visible-to-Amodal Transformer-based Amodal\n Instance Segmentation","summary":" Amodal Instance Segmentation (AIS) presents a challenging task as it involves\npredicting both visible and occluded parts of objects within images. Existing\nAIS methods rely on a bidirectional approach, encompassing both the transition\nfrom amodal features to visible features (amodal-to-visible) and from visible\nfeatures to amodal features (visible-to-amodal). Our observation shows that the\nutilization of amodal features through the amodal-to-visible can confuse the\nvisible features due to the extra information of occluded/hidden segments not\npresented in visible display. Consequently, this compromised quality of visible\nfeatures during the subsequent visible-to-amodal transition. To tackle this\nissue, we introduce ShapeFormer, a decoupled Transformer-based model with a\nvisible-to-amodal transition. It facilitates the explicit relationship between\noutput segmentations and avoids the need for amodal-to-visible transitions.\nShapeFormer comprises three key modules: (i) Visible-Occluding Mask Head for\npredicting visible segmentation with occlusion awareness, (ii) Shape-Prior\nAmodal Mask Head for predicting amodal and occluded masks, and (iii)\nCategory-Specific Shape Prior Retriever aims to provide shape prior knowledge.\nComprehensive experiments and extensive ablation studies across various AIS\nbenchmarks demonstrate the effectiveness of our ShapeFormer. 
The code is\navailable at: https://github.com/UARK-AICV/ShapeFormer\n","authors":["Minh Tran","Winston Bounsavy","Khoa Vo","Anh Nguyen","Tri Nguyen","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2403.11376v3.pdf","comment":"Accepted to IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.09067v1","updated":"2024-04-13T19:34:14Z","published":"2024-04-13T19:34:14Z","title":"Exploring Explainability in Video Action Recognition","summary":" Image Classification and Video Action Recognition are perhaps the two most\nfoundational tasks in computer vision. Consequently, explaining the inner\nworkings of trained deep neural networks is of prime importance. While numerous\nefforts focus on explaining the decisions of trained deep neural networks in\nimage classification, exploration in the domain of its temporal version, video\naction recognition, has been scant. In this work, we take a deeper look at this\nproblem. We begin by revisiting Grad-CAM, one of the popular feature\nattribution methods for Image Classification, and its extension to Video Action\nRecognition tasks and examine the method's limitations. To address these, we\nintroduce Video-TCAV, by building on TCAV for Image Classification tasks, which\naims to quantify the importance of specific concepts in the decision-making\nprocess of Video Action Recognition models. As the scalable generation of\nconcepts is still an open problem, we propose a machine-assisted approach to\ngenerate spatial and spatiotemporal concepts relevant to Video Action\nRecognition for testing Video-TCAV. We then establish the importance of\ntemporally-varying concepts by demonstrating the superiority of dynamic\nspatiotemporal concepts over trivial spatial concepts. In conclusion, we\nintroduce a framework for investigating hypotheses in action recognition and\nquantitatively testing them, thus advancing research in the explainability of\ndeep neural networks used in video action recognition.\n","authors":["Avinab Saha","Shashank Gupta","Sravan Kumar Ankireddy","Karl Chahine","Joydeep Ghosh"],"pdf_url":"https://arxiv.org/pdf/2404.09067v1.pdf","comment":"6 pages, 10 figures, Accepted to the 3rd Explainable AI for Computer\n Vision (XAI4CV) Workshop at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05980v2","updated":"2024-04-13T18:10:00Z","published":"2024-04-09T03:24:10Z","title":"Tackling Structural Hallucination in Image Translation with Local\n Diffusion","summary":" Recent developments in diffusion models have advanced conditioned image\ngeneration, yet they struggle with reconstructing out-of-distribution (OOD)\nimages, such as unseen tumors in medical images, causing ``image\nhallucination'' and risking misdiagnosis. We hypothesize such hallucinations\nresult from local OOD regions in the conditional images. We verify that\npartitioning the OOD region and conducting separate image generations\nalleviates hallucinations in several applications. From this, we propose a\ntraining-free diffusion framework that reduces hallucination with multiple\nLocal Diffusion processes. Our approach involves OOD estimation followed by two\nmodules: a ``branching'' module generates locally both within and outside OOD\nregions, and a ``fusion'' module integrates these predictions into one. Our\nevaluation shows our method mitigates hallucination over baseline models\nquantitatively and qualitatively, reducing misdiagnosis by 40% and 25% in the\nreal-world medical and natural image datasets, respectively. 
It also\ndemonstrates compatibility with various pre-trained diffusion models.\n","authors":["Seunghoi Kim","Chen Jin","Tom Diethe","Matteo Figini","Henry F. J. Tregidgo","Asher Mullokandov","Philip Teare","Daniel C. Alexander"],"pdf_url":"https://arxiv.org/pdf/2404.05980v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09051v1","updated":"2024-04-13T17:31:11Z","published":"2024-04-13T17:31:11Z","title":"Rethinking Iterative Stereo Matching from Diffusion Bridge Model\n Perspective","summary":" Recently, iteration-based stereo matching has shown great potential. However,\nthese models optimize the disparity map using RNN variants. The discrete\noptimization process poses a challenge of information loss, which restricts the\nlevel of detail that can be expressed in the generated disparity map. In order\nto address these issues, we propose a novel training approach that incorporates\ndiffusion models into the iterative optimization process. We designed a\nTime-based Gated Recurrent Unit (T-GRU) to correlate temporal and disparity\noutputs. Unlike standard recurrent units, we employ Agent Attention to generate\nmore expressive features. We also designed an attention-based context network\nto capture a large amount of contextual information. Experiments on several\npublic benchmarks show that we have achieved competitive stereo matching\nperformance. Our model ranks first in the Scene Flow dataset, achieving over a\n7% improvement compared to competing methods, and requires only 8 iterations to\nachieve state-of-the-art results.\n","authors":["Yuguang Shi"],"pdf_url":"https://arxiv.org/pdf/2404.09051v1.pdf","comment":"tip. arXiv admin note: text overlap with arXiv:2303.06615 by other\n authors"},{"id":"http://arxiv.org/abs/2305.14882v2","updated":"2024-04-13T17:13:55Z","published":"2023-05-24T08:33:15Z","title":"Dynamic Clue Bottlenecks: Towards Interpretable-by-Design Visual\n Question Answering","summary":" Recent advances in multimodal large language models (LLMs) have shown extreme\neffectiveness in visual question answering (VQA). However, the design nature of\nthese end-to-end models prevents them from being interpretable to humans,\nundermining trust and applicability in critical domains. While post-hoc\nrationales offer certain insight into understanding model behavior, these\nexplanations are not guaranteed to be faithful to the model. In this paper, we\naddress these shortcomings by introducing an interpretable-by-design model that\nfactors model decisions into intermediate human-legible explanations, and\nallows people to easily understand why a model fails or succeeds. We propose\nthe Dynamic Clue Bottleneck Model (DCLUB), a method that is designed towards\nan inherently interpretable VQA system. DCLUB provides an explainable\nintermediate space before the VQA decision and is faithful from the beginning,\nwhile maintaining comparable performance to black-box systems. Given a\nquestion, DCLUB first returns a set of visual clues: natural language\nstatements of visually salient evidence from the image, and then generates the\noutput based solely on the visual clues. To supervise and evaluate the\ngeneration of VQA explanations within DCLUB, we collect a dataset of 1.7k\nreasoning-focused questions with visual clues. 
Evaluations show that our\ninherently interpretable system can improve 4.64% over a comparable black-box\nsystem in reasoning-focused questions while preserving 99.43% of performance on\nVQA-v2.\n","authors":["Xingyu Fu","Ben Zhou","Sihao Chen","Mark Yatskar","Dan Roth"],"pdf_url":"https://arxiv.org/pdf/2305.14882v2.pdf","comment":"Multimodal, Visual Question Answering, Vision and Language"},{"id":"http://arxiv.org/abs/2212.12043v2","updated":"2024-04-13T17:02:25Z","published":"2022-12-22T21:27:12Z","title":"When are Lemons Purple? The Concept Association Bias of Vision-Language\n Models","summary":" Large-scale vision-language models such as CLIP have shown impressive\nperformance on zero-shot image classification and image-to-text retrieval.\nHowever, such performance does not realize in tasks that require a\nfiner-grained correspondence between vision and language, such as Visual\nQuestion Answering (VQA). As a potential cause of the difficulty of applying\nthese models to VQA and similar tasks, we report an interesting phenomenon of\nvision-language models, which we call the Concept Association Bias (CAB). We\nfind that models with CAB tend to treat input as a bag of concepts and attempt\nto fill in the other missing concept crossmodally, leading to an unexpected\nzero-shot prediction. We demonstrate CAB by showing that CLIP's zero-shot\nclassification performance greatly suffers when there is a strong concept\nassociation between an object (e.g. eggplant) and an attribute (e.g. color\npurple). We also show that the strength of CAB predicts the performance on VQA.\nWe observe that CAB is prevalent in vision-language models trained with\ncontrastive losses, even when autoregressive losses are jointly employed.\nHowever, a model that solely relies on autoregressive loss seems to exhibit\nminimal or no signs of CAB.\n","authors":["Yutaro Yamada","Yingtian Tang","Yoyo Zhang","Ilker Yildirim"],"pdf_url":"https://arxiv.org/pdf/2212.12043v2.pdf","comment":"EMNLP 2023 main"},{"id":"http://arxiv.org/abs/2404.09042v1","updated":"2024-04-13T16:57:37Z","published":"2024-04-13T16:57:37Z","title":"Improving Personalisation in Valence and Arousal Prediction using Data\n Augmentation","summary":" In the field of emotion recognition and Human-Machine Interaction (HMI),\npersonalised approaches have exhibited their efficacy in capturing\nindividual-specific characteristics and enhancing affective prediction\naccuracy. However, personalisation techniques often face the challenge of\nlimited data for target individuals. This paper presents our work on an\nenhanced personalisation strategy, that leverages data augmentation to develop\ntailored models for continuous valence and arousal prediction. Our proposed\napproach, Distance Weighting Augmentation (DWA), employs a weighting-based\naugmentation method that expands a target individual's dataset, leveraging\ndistance metrics to identify similar samples at the segment-level. Experimental\nresults on the MuSe-Personalisation 2023 Challenge dataset demonstrate that our\nmethod significantly improves the performance of features sets which have low\nbaseline performance, on the test set. This improvement in poor-performing\nfeatures comes without sacrificing performance on high-performing features. In\nparticular, our method achieves a maximum combined testing CCC of 0.78,\ncompared to the reported baseline score of 0.76 (reproduced at 0.72). 
It also\nachieved a peak arousal and valence scores of 0.81 and 0.76, compared to\nreproduced baseline scores of 0.76 and 0.67 respectively. Through this work, we\nmake significant contributions to the advancement of personalised affective\ncomputing models, enhancing the practicality and adaptability of data-level\npersonalisation in real world contexts.\n","authors":["Munachiso Nwadike","Jialin Li","Hanan Salam"],"pdf_url":"https://arxiv.org/pdf/2404.09042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09126v2","updated":"2024-04-13T16:43:01Z","published":"2024-01-17T11:02:52Z","title":"Objects With Lighting: A Real-World Dataset for Evaluating\n Reconstruction and Rendering for Object Relighting","summary":" Reconstructing an object from photos and placing it virtually in a new\nenvironment goes beyond the standard novel view synthesis task as the\nappearance of the object has to not only adapt to the novel viewpoint but also\nto the new lighting conditions and yet evaluations of inverse rendering methods\nrely on novel view synthesis data or simplistic synthetic datasets for\nquantitative analysis. This work presents a real-world dataset for measuring\nthe reconstruction and rendering of objects for relighting. To this end, we\ncapture the environment lighting and ground truth images of the same objects in\nmultiple environments allowing to reconstruct the objects from images taken in\none environment and quantify the quality of the rendered views for the unseen\nlighting environments. Further, we introduce a simple baseline composed of\noff-the-shelf methods and test several state-of-the-art methods on the\nrelighting task and show that novel view synthesis is not a reliable proxy to\nmeasure performance. Code and dataset are available at\nhttps://github.com/isl-org/objects-with-lighting .\n","authors":["Benjamin Ummenhofer","Sanskar Agrawal","Rene Sepulveda","Yixing Lao","Kai Zhang","Tianhang Cheng","Stephan Richter","Shenlong Wang","German Ros"],"pdf_url":"https://arxiv.org/pdf/2401.09126v2.pdf","comment":"Accepted at 3DV 2024, Oral presentation. For the project page see\n https://github.com/isl-org/objects-with-lighting"},{"id":"http://arxiv.org/abs/2310.11890v3","updated":"2024-04-13T14:57:15Z","published":"2023-10-18T11:19:32Z","title":"IRAD: Implicit Representation-driven Image Resampling against\n Adversarial Attacks","summary":" We introduce a novel approach to counter adversarial attacks, namely, image\nresampling. Image resampling transforms a discrete image into a new one,\nsimulating the process of scene recapturing or rerendering as specified by a\ngeometrical transformation. The underlying rationale behind our idea is that\nimage resampling can alleviate the influence of adversarial perturbations while\npreserving essential semantic information, thereby conferring an inherent\nadvantage in defending against adversarial attacks. To validate this concept,\nwe present a comprehensive study on leveraging image resampling to defend\nagainst adversarial attacks. We have developed basic resampling methods that\nemploy interpolation strategies and coordinate shifting magnitudes. Our\nanalysis reveals that these basic methods can partially mitigate adversarial\nattacks. However, they come with apparent limitations: the accuracy of clean\nimages noticeably decreases, while the improvement in accuracy on adversarial\nexamples is not substantial. We propose implicit representation-driven image\nresampling (IRAD) to overcome these limitations. 
First, we construct an\nimplicit continuous representation that enables us to represent any input image\nwithin a continuous coordinate space. Second, we introduce SampleNet, which\nautomatically generates pixel-wise shifts for resampling in response to\ndifferent inputs. Furthermore, we can extend our approach to the\nstate-of-the-art diffusion-based method, accelerating it with fewer time steps\nwhile preserving its defense capability. Extensive experiments demonstrate that\nour method significantly enhances the adversarial robustness of diverse deep\nmodels against various attacks while maintaining high accuracy on clean images.\n","authors":["Yue Cao","Tianlin Li","Xiaofeng Cao","Ivor Tsang","Yang Liu","Qing Guo"],"pdf_url":"https://arxiv.org/pdf/2310.11890v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06791v6","updated":"2024-04-13T14:39:51Z","published":"2023-08-13T15:30:02Z","title":"PV-SSD: A Multi-Modal Point Cloud Feature Fusion Method for Projection\n Features and Variable Receptive Field Voxel Features","summary":" LiDAR-based 3D object detection and classification is crucial for autonomous\ndriving. However, real-time inference from extremely sparse 3D data is a\nformidable challenge. To address this problem, a typical class of approaches\ntransforms the point cloud cast into a regular data representation (voxels or\nprojection maps). Then, it performs feature extraction with convolutional\nneural networks. However, such methods often result in a certain degree of\ninformation loss due to down-sampling or over-compression of feature\ninformation. This paper proposes a multi-modal point cloud feature fusion\nmethod for projection features and variable receptive field voxel features\n(PV-SSD) based on projection and variable voxelization to solve the information\nloss problem. We design a two-branch feature extraction structure with a 2D\nconvolutional neural network to extract the point cloud's projection features\nin bird's-eye view to focus on the correlation between local features. A voxel\nfeature extraction branch is used to extract local fine-grained features.\nMeanwhile, we propose a voxel feature extraction method with variable sensory\nfields to reduce the information loss of voxel branches due to downsampling. It\navoids missing critical point information by selecting more useful feature\npoints based on feature point weights for the detection task. In addition, we\npropose a multi-modal feature fusion module for point clouds. To validate the\neffectiveness of our method, we tested it on the KITTI dataset and ONCE\ndataset.\n","authors":["Yongxin Shao","Aihong Tan","Zhetao Sun","Enhui Zheng","Tianhong Yan","Peng Liao"],"pdf_url":"https://arxiv.org/pdf/2308.06791v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09016v1","updated":"2024-04-13T14:08:56Z","published":"2024-04-13T14:08:56Z","title":"Theoretical research on generative diffusion models: an overview","summary":" Generative diffusion models showed high success in many fields with a\npowerful theoretical background. They convert the data distribution to noise\nand remove the noise back to obtain a similar distribution. Many existing\nreviews focused on the specific application areas without concentrating on the\nresearch about the algorithm. Unlike them we investigated the theoretical\ndevelopments of the generative diffusion models. These approaches mainly divide\ninto two: training-based and sampling-based. 
Recognizing this allowed us to provide a\nclear and understandable categorization for the researchers who will make new\ndevelopments in the future.\n","authors":["Melike Nur Yeğin","Mehmet Fatih Amasyalı"],"pdf_url":"https://arxiv.org/pdf/2404.09016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06773v2","updated":"2024-04-13T13:58:29Z","published":"2024-04-10T06:30:08Z","title":"Adapting LLaMA Decoder to Vision Transformer","summary":" This work examines whether decoder-only Transformers such as LLaMA, which\nwere originally designed for large language models (LLMs), can be adapted to\nthe computer vision field. We first \"LLaMAfy\" a standard ViT step-by-step to\nalign with LLaMA's architecture, and find that directly applying a causal mask\nto the self-attention brings an attention collapse issue, resulting in the\nfailure of network training. We suggest repositioning the class token\nbehind the image tokens with a post-sequence class token technique to overcome\nthis challenge, enabling causal self-attention to efficiently capture the\nentire image's information. Additionally, we develop a soft mask strategy that\ngradually introduces a causal mask to the self-attention at the onset of\ntraining to facilitate the optimization behavior. The tailored model, dubbed as\nimage LLaMA (iLLaMA), is akin to LLaMA in architecture and enables direct\nsupervised learning. Its causal self-attention boosts computational efficiency\nand learns complex representation by elevating attention map ranks. iLLaMA\nrivals the performance of its encoder-only counterparts, achieving 75.1%\nImageNet top-1 accuracy with only 5.7M parameters. Scaling the model to ~310M\nand pre-training on ImageNet-21K further enhances the accuracy to 86.0%.\nExtensive experiments demonstrate iLLaMA's reliable properties: calibration,\nshape-texture bias, quantization compatibility, ADE20K segmentation and CIFAR\ntransfer learning. We hope our study can kindle fresh views on visual model\ndesign in the wave of LLMs. Pre-trained models and codes are available here.\n","authors":["Jiahao Wang","Wenqi Shao","Mengzhao Chen","Chengyue Wu","Yong Liu","Kaipeng Zhang","Songyang Zhang","Kai Chen","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2404.06773v2.pdf","comment":"22 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.07922v2","updated":"2024-04-13T13:57:51Z","published":"2024-04-11T17:09:28Z","title":"LaVy: Vietnamese Multimodal Large Language Model","summary":" Large Language Models (LLMs) and Multimodal Large language models (MLLMs)\nhave taken the world by storm with impressive abilities in complex reasoning\nand linguistic comprehension. While there is a plethora of works related to\nVietnamese Large Language Models, the lack of high-quality resources in\nmultimodality limits the progress of Vietnamese MLLMs. In this paper, we\npioneer in addressing this by introducing LaVy, a state-of-the-art Vietnamese\nMLLM, and we also introduce LaVy-Bench, a benchmark designed for evaluating\nMLLMs's understanding on Vietnamese visual language tasks. 
All code and model\nweights are public at https://github.com/baochi0212/LaVy\n","authors":["Chi Tran","Huong Le Thanh"],"pdf_url":"https://arxiv.org/pdf/2404.07922v2.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2404.09011v1","updated":"2024-04-13T13:41:13Z","published":"2024-04-13T13:41:13Z","title":"PracticalDG: Perturbation Distillation on Vision-Language Models for\n Hybrid Domain Generalization","summary":" Domain Generalization (DG) aims to resolve distribution shifts between source\nand target domains, and current DG methods are default to the setting that data\nfrom source and target domains share identical categories. Nevertheless, there\nexists unseen classes from target domains in practical scenarios. To address\nthis issue, Open Set Domain Generalization (OSDG) has emerged and several\nmethods have been exclusively proposed. However, most existing methods adopt\ncomplex architectures with slight improvement compared with DG methods.\nRecently, vision-language models (VLMs) have been introduced in DG following\nthe fine-tuning paradigm, but consume huge training overhead with large vision\nmodels. Therefore, in this paper, we innovate to transfer knowledge from VLMs\nto lightweight vision models and improve the robustness by introducing\nPerturbation Distillation (PD) from three perspectives, including Score, Class\nand Instance (SCI), named SCI-PD. Moreover, previous methods are oriented by\nthe benchmarks with identical and fixed splits, ignoring the divergence between\nsource domains. These methods are revealed to suffer from sharp performance\ndecay with our proposed new benchmark Hybrid Domain Generalization (HDG) and a\nnovel metric $H^{2}$-CV, which construct various splits to comprehensively\nassess the robustness of algorithms. Extensive experiments demonstrate that our\nmethod outperforms state-of-the-art algorithms on multiple datasets, especially\nimproving the robustness when confronting data scarcity.\n","authors":["Zining Chen","Weiqiu Wang","Zhicheng Zhao","Fei Su","Aidong Men","Hongying Meng"],"pdf_url":"https://arxiv.org/pdf/2404.09011v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2403.14472v3","updated":"2024-04-13T13:39:50Z","published":"2024-03-21T15:18:30Z","title":"Detoxifying Large Language Models via Knowledge Editing","summary":" This paper investigates using knowledge editing techniques to detoxify Large\nLanguage Models (LLMs). We construct a benchmark, SafeEdit, which covers nine\nunsafe categories with various powerful attack prompts and equips comprehensive\nmetrics for systematic evaluation. We conduct experiments with several\nknowledge editing approaches, indicating that knowledge editing has the\npotential to efficiently detoxify LLMs with limited impact on general\nperformance. Then, we propose a simple yet effective baseline, dubbed\nDetoxifying with Intraoperative Neural Monitoring (DINM), to diminish the\ntoxicity of LLMs within a few tuning steps via only one instance. We further\nprovide an in-depth analysis of the internal mechanism for various detoxifying\napproaches, demonstrating that previous methods like SFT and DPO may merely\nsuppress the activations of toxic parameters, while DINM mitigates the toxicity\nof the toxic parameters to a certain extent, making permanent adjustments. We\nhope that these insights could shed light on future work of developing\ndetoxifying approaches and the underlying knowledge mechanisms of LLMs. 
Code\nand benchmark are available at https://github.com/zjunlp/EasyEdit.\n","authors":["Mengru Wang","Ningyu Zhang","Ziwen Xu","Zekun Xi","Shumin Deng","Yunzhi Yao","Qishen Zhang","Linyi Yang","Jindong Wang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2403.14472v3.pdf","comment":"Ongoing work. Project website:\n https://zjunlp.github.io/project/SafeEdit Add and update experimental results\n in Tables 1 and 3"},{"id":"http://arxiv.org/abs/2404.09010v1","updated":"2024-04-13T13:39:26Z","published":"2024-04-13T13:39:26Z","title":"MMA-DFER: MultiModal Adaptation of unimodal models for Dynamic Facial\n Expression Recognition in-the-wild","summary":" Dynamic Facial Expression Recognition (DFER) has received significant\ninterest in the recent years dictated by its pivotal role in enabling empathic\nand human-compatible technologies. Achieving robustness towards in-the-wild\ndata in DFER is particularly important for real-world applications. One of the\ndirections aimed at improving such models is multimodal emotion recognition\nbased on audio and video data. Multimodal learning in DFER increases the model\ncapabilities by leveraging richer, complementary data representations. Within\nthe field of multimodal DFER, recent methods have focused on exploiting\nadvances of self-supervised learning (SSL) for pre-training of strong\nmultimodal encoders. Another line of research has focused on adapting\npre-trained static models for DFER. In this work, we propose a different\nperspective on the problem and investigate the advancement of multimodal DFER\nperformance by adapting SSL-pre-trained disjoint unimodal encoders. We identify\nmain challenges associated with this task, namely, intra-modality adaptation,\ncross-modal alignment, and temporal adaptation, and propose solutions to each\nof them. As a result, we demonstrate improvement over current state-of-the-art\non two popular DFER benchmarks, namely DFEW and MFAW.\n","authors":["Kateryna Chumachenko","Alexandros Iosifidis","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2404.09010v1.pdf","comment":"accepted to CVPR 2024 ABAW Workshop"},{"id":"http://arxiv.org/abs/2404.09003v1","updated":"2024-04-13T13:08:57Z","published":"2024-04-13T13:08:57Z","title":"THQA: A Perceptual Quality Assessment Database for Talking Heads","summary":" In the realm of media technology, digital humans have gained prominence due\nto rapid advancements in computer technology. However, the manual modeling and\ncontrol required for the majority of digital humans pose significant obstacles\nto efficient development. The speech-driven methods offer a novel avenue for\nmanipulating the mouth shape and expressions of digital humans. Despite the\nproliferation of driving methods, the quality of many generated talking head\n(TH) videos remains a concern, impacting user visual experiences. To tackle\nthis issue, this paper introduces the Talking Head Quality Assessment (THQA)\ndatabase, featuring 800 TH videos generated through 8 diverse speech-driven\nmethods. Extensive experiments affirm the THQA database's richness in character\nand speech features. Subsequent subjective quality assessment experiments\nanalyze correlations between scoring results and speech-driven methods, ages,\nand genders. In addition, experimental results show that mainstream image and\nvideo quality assessment methods have limitations for the THQA database,\nunderscoring the imperative for further research to enhance TH video quality\nassessment. 
The THQA database is publicly accessible at\nhttps://github.com/zyj-2000/THQA.\n","authors":["Yingjie Zhou","Zicheng Zhang","Wei Sun","Xiaohong Liu","Xiongkuo Min","Zhihua Wang","Xiao-Ping Zhang","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2404.09003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09001v1","updated":"2024-04-13T13:03:59Z","published":"2024-04-13T13:03:59Z","title":"Smart Help: Strategic Opponent Modeling for Proactive and Adaptive Robot\n Assistance in Households","summary":" Despite the significant demand for assistive technology among vulnerable\ngroups (e.g., the elderly, children, and the disabled) in daily tasks, research\ninto advanced AI-driven assistive solutions that genuinely accommodate their\ndiverse needs remains sparse. Traditional human-machine interaction tasks often\nrequire machines to simply help without nuanced consideration of human\nabilities and feelings, such as their opportunity for practice and learning,\nsense of self-improvement, and self-esteem. Addressing this gap, we define a\npivotal and novel challenge Smart Help, which aims to provide proactive yet\nadaptive support to human agents with diverse disabilities and dynamic goals in\nvarious tasks and environments. To establish this challenge, we leverage\nAI2-THOR to build a new interactive 3D realistic household environment for the\nSmart Help task. We introduce an innovative opponent modeling module that\nprovides a nuanced understanding of the main agent's capabilities and goals, in\norder to optimize the assisting agent's helping policy. Rigorous experiments\nvalidate the efficacy of our model components and show the superiority of our\nholistic approach against established baselines. Our findings illustrate the\npotential of AI-imbued assistive robots in improving the well-being of\nvulnerable groups.\n","authors":["Zhihao Cao","Zidong Wang","Siwen Xie","Anji Liu","Lifeng Fan"],"pdf_url":"https://arxiv.org/pdf/2404.09001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09000v1","updated":"2024-04-13T13:03:19Z","published":"2024-04-13T13:03:19Z","title":"MaSkel: A Model for Human Whole-body X-rays Generation from Human\n Masking Images","summary":" The human whole-body X-rays could offer a valuable reference for various\napplications, including medical diagnostics, digital animation modeling, and\nergonomic design. The traditional method of obtaining X-ray information\nrequires the use of CT (Computed Tomography) scan machines, which emit\npotentially harmful radiation. Thus it faces a significant limitation for\nrealistic applications because it lacks adaptability and safety. In our work,\nWe proposed a new method to directly generate the 2D human whole-body X-rays\nfrom the human masking images. The predicted images will be similar to the real\nones with the same image style and anatomic structure. We employed a\ndata-driven strategy. By leveraging advanced generative techniques, our model\nMaSkel(Masking image to Skeleton X-rays) could generate a high-quality X-ray\nimage from a human masking image without the need for invasive and harmful\nradiation exposure, which not only provides a new path to generate highly\nanatomic and customized data but also reduces health risks. To our knowledge,\nour model MaSkel is the first work for predicting whole-body X-rays. In this\npaper, we did two parts of the work. 
The first one is to solve the data\nlimitation problem, the diffusion-based techniques are utilized to make a data\naugmentation, which provides two synthetic datasets for preliminary\npretraining. Then we designed a two-stage training strategy to train MaSkel. At\nlast, we make qualitative and quantitative evaluations of the generated X-rays.\nIn addition, we invite some professional doctors to assess our predicted data.\nThese evaluations demonstrate the MaSkel's superior ability to generate\nanatomic X-rays from human masking images. The related code and links of the\ndataset are available at https://github.com/2022yingjie/MaSkel.\n","authors":["Yingjie Xi","Boyuan Cheng","Jingyao Cai","Jian Jun Zhang","Xiaosong Yang"],"pdf_url":"https://arxiv.org/pdf/2404.09000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08995v1","updated":"2024-04-13T12:41:40Z","published":"2024-04-13T12:41:40Z","title":"Beyond Known Clusters: Probe New Prototypes for Efficient Generalized\n Class Discovery","summary":" Generalized Class Discovery (GCD) aims to dynamically assign labels to\nunlabelled data partially based on knowledge learned from labelled data, where\nthe unlabelled data may come from known or novel classes. The prevailing\napproach generally involves clustering across all data and learning conceptions\nby prototypical contrastive learning. However, existing methods largely hinge\non the performance of clustering algorithms and are thus subject to their\ninherent limitations. Firstly, the estimated cluster number is often smaller\nthan the ground truth, making the existing methods suffer from the lack of\nprototypes for comprehensive conception learning. To address this issue, we\npropose an adaptive probing mechanism that introduces learnable potential\nprototypes to expand cluster prototypes (centers). As there is no ground truth\nfor the potential prototype, we develop a self-supervised prototype learning\nframework to optimize the potential prototype in an end-to-end fashion.\nSecondly, clustering is computationally intensive, and the conventional\nstrategy of clustering both labelled and unlabelled instances exacerbates this\nissue. To counteract this inefficiency, we opt to cluster only the unlabelled\ninstances and subsequently expand the cluster prototypes with our introduced\npotential prototypes to fast explore novel classes. Despite the simplicity of\nour proposed method, extensive empirical analysis on a wide range of datasets\nconfirms that our method consistently delivers state-of-the-art results.\nSpecifically, our method surpasses the nearest competitor by a significant\nmargin of \\textbf{9.7}$\\%$ within the Stanford Cars dataset and\n\\textbf{12$\\times$} clustering efficiency within the Herbarium 19 dataset. We\nwill make the code and checkpoints publicly available at\n\\url{https://github.com/xjtuYW/PNP.git}.\n","authors":["Ye Wang","Yaxiong Wang","Yujiao Wu","Bingchen Zhao","Xueming Qian"],"pdf_url":"https://arxiv.org/pdf/2404.08995v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.08990v1","updated":"2024-04-13T12:28:40Z","published":"2024-04-13T12:28:40Z","title":"A Fourier-enhanced multi-modal 3D small object optical mark recognition\n and positioning method for percutaneous abdominal puncture surgical\n navigation","summary":" Navigation for thoracoabdominal puncture surgery is used to locate the needle\nentry point on the patient's body surface. 
With the traditional reflective-ball\nnavigation method, it is difficult to position the needle entry point on the soft,\nirregular, smooth chest and abdomen. Due to the lack of clear characteristic\npoints on the body surface, structured light technology also struggles\nto identify and locate arbitrary needle insertion points. To meet the high\nstability and high accuracy requirements of surgical navigation, this paper\nproposes a novel multi-modal 3D small object medical marker detection\nmethod, which identifies the center of a small single ring as the needle\ninsertion point. Moreover, this novel method leverages Fourier transform\nenhancement technology to augment the dataset, enrich image details, and\nenhance the network's capability. The method extracts the Region of Interest\n(ROI) of the feature image from both enhanced and original images, followed by\ngenerating a mask map. Subsequently, the point cloud of the ROI from the depth\nmap is obtained through the registration of ROI point cloud contour fitting. In\naddition, this method employs Tukey loss for optimal precision. The\nexperimental results show that the proposed method not only\nachieves high-precision and high-stability positioning, but also enables the\npositioning of any needle insertion point.\n","authors":["Zezhao Guo","Yanzhong Guo","Zhanfang Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.08990v1.pdf","comment":"19 pages, 6 figures,"},{"id":"http://arxiv.org/abs/2401.14387v2","updated":"2024-04-13T12:26:44Z","published":"2024-01-25T18:46:35Z","title":"Inconsistency Masks: Removing the Uncertainty from Input-Pseudo-Label\n Pairs","summary":" Efficiently generating sufficient labeled data remains a major bottleneck in\ndeep learning, particularly for image segmentation tasks where labeling\nrequires significant time and effort. This study tackles this issue in a\nresource-constrained environment, devoid of extensive datasets or pre-existing\nmodels. We introduce Inconsistency Masks (IM), a novel approach that filters\nuncertainty in image-pseudo-label pairs to substantially enhance segmentation\nquality, surpassing traditional semi-supervised learning techniques. Employing\nIM, we achieve strong segmentation results with as little as 10% labeled data,\nacross four diverse datasets, and it further benefits from integration with\nother techniques, indicating broad applicability. Notably, on the ISIC 2018\ndataset, three of our hybrid approaches even outperform models trained on the\nfully labeled dataset. We also present a detailed comparative analysis of\nprevalent semi-supervised learning strategies, all under uniform starting\nconditions, to underline our approach's effectiveness and robustness. The full\ncode is available at: https://github.com/MichaelVorndran/InconsistencyMasks\n","authors":["Michael R. H. Vorndran","Bernhard F. Roeck"],"pdf_url":"https://arxiv.org/pdf/2401.14387v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08981v1","updated":"2024-04-13T12:09:37Z","published":"2024-04-13T12:09:37Z","title":"Fast Fishing: Approximating BAIT for Efficient and Scalable Deep Active\n Image Classification","summary":" Deep active learning (AL) seeks to minimize the annotation costs for training\ndeep neural networks. 
BAIT, a recently proposed AL strategy based on the Fisher\nInformation, has demonstrated impressive performance across various datasets.\nHowever, BAIT's high computational and memory requirements hinder its\napplicability on large-scale classification tasks, resulting in current\nresearch neglecting BAIT in their evaluation. This paper introduces two methods\nto enhance BAIT's computational efficiency and scalability. Notably, we\nsignificantly reduce its time complexity by approximating the Fisher\nInformation. In particular, we adapt the original formulation by i) taking the\nexpectation over the most probable classes, and ii) constructing a binary\nclassification task, leading to an alternative likelihood for gradient\ncomputations. Consequently, this allows the efficient use of BAIT on\nlarge-scale datasets, including ImageNet. Our unified and comprehensive\nevaluation across a variety of datasets demonstrates that our approximations\nachieve strong performance with considerably reduced time complexity.\nFurthermore, we provide an extensive open-source toolbox that implements recent\nstate-of-the-art AL strategies, available at\nhttps://github.com/dhuseljic/dal-toolbox.\n","authors":["Denis Huseljic","Paul Hahn","Marek Herde","Lukas Rauch","Bernhard Sick"],"pdf_url":"https://arxiv.org/pdf/2404.08981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15706v2","updated":"2024-04-13T12:06:35Z","published":"2024-03-23T03:56:31Z","title":"G-ACIL: Analytic Learning for Exemplar-Free Generalized Class\n Incremental Learning","summary":" Class incremental learning (CIL) trains a network on sequential tasks with\nseparated categories but suffers from catastrophic forgetting, where models\nquickly lose previously learned knowledge when acquiring new tasks. The\ngeneralized CIL (GCIL) aims to address the CIL problem in a more real-world\nscenario, where incoming data have mixed data categories and unknown sample\nsize distribution, leading to intensified forgetting. Existing attempts for the\nGCIL either have poor performance, or invade data privacy by saving historical\nexemplars. To address this, in this paper, we propose an exemplar-free\ngeneralized analytic class incremental learning (G-ACIL). The G-ACIL adopts\nanalytic learning (a gradient-free training technique), and delivers an\nanalytical solution (i.e., closed-form) to the GCIL scenario. This solution is\nderived via decomposing the incoming data into exposed and unexposed classes,\nallowing an equivalence between the incremental learning and its joint\ntraining, i.e., the weight-invariant property. Such an equivalence is\ntheoretically validated through matrix analysis tools, and hence contributes\ninterpretability in GCIL. It is also empirically evidenced by experiments on\nvarious datasets and settings of GCIL. The results show that the G-ACIL\nexhibits leading performance with high robustness compared with existing\ncompetitive GCIL methods. Codes will be ready at\n\\url{https://github.com/ZHUANGHP/Analytic-continual-learning}.\n","authors":["Huiping Zhuang","Yizhu Chen","Di Fang","Run He","Kai Tong","Hongxin Wei","Ziqian Zeng","Cen Chen"],"pdf_url":"https://arxiv.org/pdf/2403.15706v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08979v1","updated":"2024-04-13T12:06:29Z","published":"2024-04-13T12:06:29Z","title":"BG-YOLO: A Bidirectional-Guided Method for Underwater Object Detection","summary":" Degraded underwater images decrease the accuracy of underwater object\ndetection. 
However, existing methods for underwater image enhancement mainly\nfocus on improving the indicators in visual aspects, which may not benefit the\ntasks of underwater image detection, and may lead to serious degradation in\nperformance. To alleviate this problem, we proposed a bidirectional-guided\nmethod for underwater object detection, referred to as BG-YOLO. In the proposed\nmethod, network is organized by constructing an enhancement branch and a\ndetection branch in a parallel way. The enhancement branch consists of a\ncascade of an image enhancement subnet and an object detection subnet. And the\ndetection branch only consists of a detection subnet. A feature guided module\nconnects the shallow convolution layer of the two branches. When training the\nenhancement branch, the object detection subnet in the enhancement branch\nguides the image enhancement subnet to be optimized towards the direction that\nis most conducive to the detection task. The shallow feature map of the trained\nenhancement branch will be output to the feature guided module, constraining\nthe optimization of detection branch through consistency loss and prompting\ndetection branch to learn more detailed information of the objects. And hence\nthe detection performance will be refined. During the detection tasks, only\ndetection branch will be reserved so that no additional cost of computation\nwill be introduced. Extensive experiments demonstrate that the proposed method\nshows significant improvement in performance of the detector in severely\ndegraded underwater scenes while maintaining a remarkable detection speed.\n","authors":["Jian Zhang","Ruiteng Zhang","Xinyue Yan","Xiting Zhuang","Ruicheng Cao"],"pdf_url":"https://arxiv.org/pdf/2404.08979v1.pdf","comment":"15 pages, 8 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.08968v1","updated":"2024-04-13T11:13:56Z","published":"2024-04-13T11:13:56Z","title":"MCPNet: An Interpretable Classifier via Multi-Level Concept Prototypes","summary":" Recent advancements in post-hoc and inherently interpretable methods have\nmarkedly enhanced the explanations of black box classifier models. These\nmethods operate either through post-analysis or by integrating concept learning\nduring model training. Although being effective in bridging the semantic gap\nbetween a model's latent space and human interpretation, these explanation\nmethods only partially reveal the model's decision-making process. The outcome\nis typically limited to high-level semantics derived from the last feature map.\nWe argue that the explanations lacking insights into the decision processes at\nlow and mid-level features are neither fully faithful nor useful. Addressing\nthis gap, we introduce the Multi-Level Concept Prototypes Classifier (MCPNet),\nan inherently interpretable model. MCPNet autonomously learns meaningful\nconcept prototypes across multiple feature map levels using Centered Kernel\nAlignment (CKA) loss and an energy-based weighted PCA mechanism, and it does so\nwithout reliance on predefined concept labels. Further, we propose a novel\nclassifier paradigm that learns and aligns multi-level concept prototype\ndistributions for classification purposes via Class-aware Concept Distribution\n(CCD) loss. Our experiments reveal that our proposed MCPNet while being\nadaptable to various model architectures, offers comprehensive multi-level\nexplanations while maintaining classification accuracy. 
Additionally, its\nconcept distribution-based classification approach shows improved\ngeneralization capabilities in few-shot classification scenarios.\n","authors":["Bor-Shiun Wang","Chien-Yi Wang","Wei-Chen Chiu"],"pdf_url":"https://arxiv.org/pdf/2404.08968v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2402.11248v3","updated":"2024-04-13T11:11:44Z","published":"2024-02-17T11:03:02Z","title":"CoLLaVO: Crayon Large Language and Vision mOdel","summary":" The remarkable success of Large Language Models (LLMs) and instruction tuning\ndrives the evolution of Vision Language Models (VLMs) towards a versatile\ngeneral-purpose model. Yet, it remains unexplored whether current VLMs\ngenuinely possess quality object-level image understanding capabilities\ndetermined from `what objects are in the image?' or `which object corresponds\nto a specified bounding box?'. Our findings reveal that the image understanding\ncapabilities of current VLMs are strongly correlated with their zero-shot\nperformance on vision language (VL) tasks. This suggests that prioritizing\nbasic image understanding is crucial for VLMs to excel at VL tasks. To enhance\nobject-level image understanding, we propose Crayon Large Language and Vision\nmOdel (CoLLaVO), which incorporates instruction tuning with Crayon Prompt as a\nnew visual prompt tuning scheme based on panoptic color maps. Furthermore, we\npresent a learning strategy of Dual QLoRA to preserve object-level image\nunderstanding without forgetting it during visual instruction tuning, thereby\nachieving a significant leap in numerous VL benchmarks in a zero-shot setting.\n","authors":["Byung-Kwan Lee","Beomchan Park","Chae Won Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2402.11248v3.pdf","comment":"Code available: https://github.com/ByungKwanLee/CoLLaVO"},{"id":"http://arxiv.org/abs/2404.08966v1","updated":"2024-04-13T11:07:53Z","published":"2024-04-13T11:07:53Z","title":"LoopGaussian: Creating 3D Cinemagraph with Multi-view Images via\n Eulerian Motion Field","summary":" Cinemagraph is a unique form of visual media that combines elements of still\nphotography and subtle motion to create a captivating experience. However, the\nmajority of videos generated by recent works lack depth information and are\nconfined to the constraints of 2D image space. In this paper, inspired by\nsignificant progress in the field of novel view synthesis (NVS) achieved by 3D\nGaussian Splatting (3D-GS), we propose LoopGaussian to elevate cinemagraph from\n2D image space to 3D space using 3D Gaussian modeling. To achieve this, we\nfirst employ the 3D-GS method to reconstruct 3D Gaussian point clouds from\nmulti-view images of static scenes,incorporating shape regularization terms to\nprevent blurring or artifacts caused by object deformation. We then adopt an\nautoencoder tailored for 3D Gaussian to project it into feature space. To\nmaintain the local continuity of the scene, we devise SuperGaussian for\nclustering based on the acquired features. By calculating the similarity\nbetween clusters and employing a two-stage estimation method, we derive an\nEulerian motion field to describe velocities across the entire scene. The 3D\nGaussian points then move within the estimated Eulerian motion field. Through\nbidirectional animation techniques, we ultimately generate a 3D Cinemagraph\nthat exhibits natural and seamlessly loopable dynamics. 
Experiment results\nvalidate the effectiveness of our approach, demonstrating high-quality and\nvisually appealing scene generation.\n","authors":["Jiyang Li","Lechao Cheng","Zhangye Wang","Tingting Mu","Jingxuan He"],"pdf_url":"https://arxiv.org/pdf/2404.08966v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.08965v1","updated":"2024-04-13T11:07:10Z","published":"2024-04-13T11:07:10Z","title":"Seeing Text in the Dark: Algorithm and Benchmark","summary":" Localizing text in low-light environments is challenging due to visual\ndegradations. Although a straightforward solution involves a two-stage pipeline\nwith low-light image enhancement (LLE) as the initial step followed by\ndetector, LLE is primarily designed for human vision instead of machine and can\naccumulate errors. In this work, we propose an efficient and effective\nsingle-stage approach for localizing text in dark that circumvents the need for\nLLE. We introduce a constrained learning module as an auxiliary mechanism\nduring the training stage of the text detector. This module is designed to\nguide the text detector in preserving textual spatial features amidst feature\nmap resizing, thus minimizing the loss of spatial information in texts under\nlow-light visual degradations. Specifically, we incorporate spatial\nreconstruction and spatial semantic constraints within this module to ensure\nthe text detector acquires essential positional and contextual range knowledge.\nOur approach enhances the original text detector's ability to identify text's\nlocal topological features using a dynamic snake feature pyramid network and\nadopts a bottom-up contour shaping strategy with a novel rectangular\naccumulation technique for accurate delineation of streamlined text features.\nIn addition, we present a comprehensive low-light dataset for arbitrary-shaped\ntext, encompassing diverse scenes and languages. Notably, our method achieves\nstate-of-the-art results on this low-light dataset and exhibits comparable\nperformance on standard normal light datasets. The code and dataset will be\nreleased.\n","authors":["Chengpei Xu","Hao Fu","Long Ma","Wenjing Jia","Chengqi Zhang","Feng Xia","Xiaoyu Ai","Binghao Li","Wenjie Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.08965v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08964v1","updated":"2024-04-13T11:06:49Z","published":"2024-04-13T11:06:49Z","title":"Understanding Multimodal Deep Neural Networks: A Concept Selection View","summary":" The multimodal deep neural networks, represented by CLIP, have generated rich\ndownstream applications owing to their excellent performance, thus making\nunderstanding the decision-making process of CLIP an essential research topic.\nDue to the complex structure and the massive pre-training data, it is often\nregarded as a black-box model that is too difficult to understand and\ninterpret. Concept-based models map the black-box visual representations\nextracted by deep neural networks onto a set of human-understandable concepts\nand use the concepts to make predictions, enhancing the transparency of the\ndecision-making process. However, these methods involve the datasets labeled\nwith fine-grained attributes by expert knowledge, which incur high costs and\nintroduce excessive human prior knowledge and bias. In this paper, we observe\nthe long-tail distribution of concepts, based on which we propose a two-stage\nConcept Selection Model (CSM) to mine core concepts without introducing any\nhuman priors. 
The concept greedy rough selection algorithm is applied to\nextract head concepts, and then the concept mask fine selection method performs\nthe extraction of core concepts. Experiments show that our approach achieves\ncomparable performance to end-to-end black-box models, and human evaluation\ndemonstrates that the concepts discovered by our method are interpretable and\ncomprehensible for humans.\n","authors":["Chenming Shang","Hengyuan Zhang","Hao Wen","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2404.08964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07853v2","updated":"2024-04-13T10:56:49Z","published":"2024-01-15T17:28:37Z","title":"VeCAF: Vision-language Collaborative Active Finetuning with Training\n Objective Awareness","summary":" Finetuning a pretrained vision model (PVM) is a common technique for learning\ndownstream vision tasks. However, the conventional finetuning process with\nrandomly sampled data points results in diminished training efficiency. To\naddress this drawback, we propose a novel approach, Vision-language\nCollaborative Active Finetuning (VeCAF). With the emerging availability of\nlabels and natural language annotations of images through web-scale crawling or\ncontrolled generation, VeCAF makes use of these information to perform\nparametric data selection for PVM finetuning. VeCAF incorporates the finetuning\nobjective to select significant data points that effectively guide the PVM\ntowards faster convergence to meet the performance goal. This process is\nassisted by the inherent semantic richness of the text embedding space which we\nuse to augment image features. Furthermore, the flexibility of text-domain\naugmentation allows VeCAF to handle out-of-distribution scenarios without\nexternal data. Extensive experiments show the leading performance and high\ncomputational efficiency of VeCAF that is superior to baselines in both\nin-distribution and out-of-distribution image classification tasks. On\nImageNet, VeCAF uses up to 3.3x less training batches to reach the target\nperformance compared to full finetuning, and achieves an accuracy improvement\nof 2.7% over the state-of-the-art active finetuning method with the same number\nof batches.\n","authors":["Rongyu Zhang","Zefan Cai","Huanrui Yang","Zidong Liu","Denis Gudovskiy","Tomoyuki Okuno","Yohei Nakata","Kurt Keutzer","Baobao Chang","Yuan Du","Li Du","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.07853v2.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2404.08958v1","updated":"2024-04-13T10:46:11Z","published":"2024-04-13T10:46:11Z","title":"AMU-Tuning: Effective Logit Bias for CLIP-based Few-shot Learning","summary":" Recently, pre-trained vision-language models (e.g., CLIP) have shown great\npotential in few-shot learning and attracted a lot of research interest.\nAlthough efforts have been made to improve few-shot ability of CLIP, key\nfactors on the effectiveness of existing methods have not been well studied,\nlimiting further exploration of CLIP's potential in few-shot learning. In this\npaper, we first introduce a unified formulation to analyze CLIP-based few-shot\nlearning methods from a perspective of logit bias, which encourages us to learn\nan effective logit bias for further improving performance of CLIP-based\nfew-shot learning methods. To this end, we disassemble three key components\ninvolved in computation of logit bias (i.e., logit features, logit predictor,\nand logit fusion) and empirically analyze the effect on performance of few-shot\nclassification. 
Based on analysis of key components, this paper proposes a\nnovel AMU-Tuning method to learn effective logit bias for CLIP-based few-shot\nclassification. Specifically, our AMU-Tuning predicts logit bias by exploiting\nthe appropriate $\\underline{\\textbf{A}}$uxiliary features, which are fed into\nan efficient feature-initialized linear classifier with\n$\\underline{\\textbf{M}}$ulti-branch training. Finally, an\n$\\underline{\\textbf{U}}$ncertainty-based fusion is developed to incorporate\nlogit bias into CLIP for few-shot classification. The experiments are conducted\non several widely used benchmarks, and the results show AMU-Tuning clearly\noutperforms its counterparts while achieving state-of-the-art performance of\nCLIP-based few-shot learning without bells and whistles.\n","authors":["Yuwei Tang","Zhenyi Lin","Qilong Wang","Pengfei Zhu","Qinghua Hu"],"pdf_url":"https://arxiv.org/pdf/2404.08958v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2312.05975v2","updated":"2024-04-13T10:45:47Z","published":"2023-12-10T19:33:40Z","title":"FM-G-CAM: A Holistic Approach for Explainable AI in Computer Vision","summary":" Explainability is an aspect of modern AI that is vital for impact and\nusability in the real world. The main objective of this paper is to emphasise\nthe need to understand the predictions of Computer Vision models, specifically\nConvolutional Neural Network (CNN) based models. Existing methods of explaining\nCNN predictions are mostly based on Gradient-weighted Class Activation Maps\n(Grad-CAM) and solely focus on a single target class. We show that from the\npoint of the target class selection, we make an assumption on the prediction\nprocess, hence neglecting a large portion of the predictor CNN model's thinking\nprocess. In this paper, we present an exhaustive methodology called Fused\nMulti-class Gradient-weighted Class Activation Map (FM-G-CAM) that considers\nmultiple top predicted classes, which provides a holistic explanation of the\npredictor CNN's thinking rationale. We also provide a detailed and\ncomprehensive mathematical and algorithmic description of our method.\nFurthermore, along with a concise comparison of existing methods, we compare\nFM-G-CAM with Grad-CAM, highlighting its benefits through real-world practical\nuse cases. Finally, we present an open-source Python library with FM-G-CAM\nimplementation to conveniently generate saliency maps for CNN-based model\npredictions.\n","authors":["Ravidu Suien Rammuni Silva","Jordan J. Bird"],"pdf_url":"https://arxiv.org/pdf/2312.05975v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08951v1","updated":"2024-04-13T10:15:51Z","published":"2024-04-13T10:15:51Z","title":"Constructing and Exploring Intermediate Domains in Mixed Domain\n Semi-supervised Medical Image Segmentation","summary":" Both limited annotation and domain shift are prevalent challenges in medical\nimage segmentation. Traditional semi-supervised segmentation and unsupervised\ndomain adaptation methods address one of these issues separately. However, the\ncoexistence of limited annotation and domain shift is quite common, which\nmotivates us to introduce a novel and challenging scenario: Mixed Domain\nSemi-supervised medical image Segmentation (MiDSS). In this scenario, we handle\ndata from multiple medical centers, with limited annotations available for a\nsingle domain and a large amount of unlabeled data from multiple domains. 
We\nfound that the key to solving the problem lies in how to generate reliable\npseudo labels for the unlabeled data in the presence of domain shift with\nlabeled data. To tackle this issue, we employ Unified Copy-Paste (UCP) between\nimages to construct intermediate domains, facilitating the knowledge transfer\nfrom the domain of labeled data to the domains of unlabeled data. To fully\nutilize the information within the intermediate domain, we propose a symmetric\nGuidance training strategy (SymGD), which additionally offers direct guidance\nto unlabeled data by merging pseudo labels from intermediate samples.\nSubsequently, we introduce a Training Process aware Random Amplitude MixUp\n(TP-RAM) to progressively incorporate style-transition components into\nintermediate samples. Compared with existing state-of-the-art approaches, our\nmethod achieves a notable 13.57% improvement in Dice score on Prostate dataset,\nas demonstrated on three public datasets. Our code is available at\nhttps://github.com/MQinghe/MiDSS .\n","authors":["Qinghe Ma","Jian Zhang","Lei Qi","Qian Yu","Yinghuan Shi","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2404.08951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03443v3","updated":"2024-04-13T10:00:28Z","published":"2024-04-04T13:43:11Z","title":"Part-Attention Based Model Make Occluded Person Re-Identification\n Stronger","summary":" The goal of occluded person re-identification (ReID) is to retrieve specific\npedestrians in occluded situations. However, occluded person ReID still suffers\nfrom background clutter and low-quality local feature representations, which\nlimits model performance. In our research, we introduce a new framework called\nPAB-ReID, which is a novel ReID model incorporating part-attention mechanisms\nto tackle the aforementioned issues effectively. Firstly, we introduce the\nhuman parsing label to guide the generation of more accurate human part\nattention maps. In addition, we propose a fine-grained feature focuser for\ngenerating fine-grained human local feature representations while suppressing\nbackground interference. Moreover, We also design a part triplet loss to\nsupervise the learning of human local features, which optimizes\nintra/inter-class distance. We conducted extensive experiments on specialized\nocclusion and regular ReID datasets, showcasing that our approach outperforms\nthe existing state-of-the-art methods.\n","authors":["Zhihao Chen","Yiyuan Ge"],"pdf_url":"https://arxiv.org/pdf/2404.03443v3.pdf","comment":"Accepted By International Joint Conference on Neural Networks 2024"},{"id":"http://arxiv.org/abs/2401.06614v2","updated":"2024-04-13T09:23:21Z","published":"2024-01-12T15:05:08Z","title":"Motion2VecSets: 4D Latent Vector Set Diffusion for Non-rigid Shape\n Reconstruction and Tracking","summary":" We introduce Motion2VecSets, a 4D diffusion model for dynamic surface\nreconstruction from point cloud sequences. While existing state-of-the-art\nmethods have demonstrated success in reconstructing non-rigid objects using\nneural field representations, conventional feed-forward networks encounter\nchallenges with ambiguous observations from noisy, partial, or sparse point\nclouds. To address these challenges, we introduce a diffusion model that\nexplicitly learns the shape and motion distribution of non-rigid objects\nthrough an iterative denoising process of compressed latent representations.\nThe diffusion-based priors enable more plausible and probabilistic\nreconstructions when handling ambiguous inputs. 
We parameterize 4D dynamics\nwith latent sets instead of using global latent codes. This novel 4D\nrepresentation allows us to learn local shape and deformation patterns, leading\nto more accurate non-linear motion capture and significantly improving\ngeneralizability to unseen motions and identities. For more temporally-coherent\nobject tracking, we synchronously denoise deformation latent sets and exchange\ninformation across multiple frames. To avoid computational overhead, we\ndesigned an interleaved space and time attention block to alternately aggregate\ndeformation latents along spatial and temporal domains. Extensive comparisons\nagainst state-of-the-art methods demonstrate the superiority of our\nMotion2VecSets in 4D reconstruction from various imperfect observations. More\ndetailed information can be found at\nhttps://vveicao.github.io/projects/Motion2VecSets/.\n","authors":["Wei Cao","Chang Luo","Biao Zhang","Matthias Nießner","Jiapeng Tang"],"pdf_url":"https://arxiv.org/pdf/2401.06614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08937v1","updated":"2024-04-13T09:17:51Z","published":"2024-04-13T09:17:51Z","title":"ChimpVLM: Ethogram-Enhanced Chimpanzee Behaviour Recognition","summary":" We show that chimpanzee behaviour understanding from camera traps can be\nenhanced by providing visual architectures with access to an embedding of text\ndescriptions that detail species behaviours. In particular, we present a\nvision-language model which employs multi-modal decoding of visual features\nextracted directly from camera trap videos to process query tokens representing\nbehaviours and output class predictions. Query tokens are initialised using a\nstandardised ethogram of chimpanzee behaviour, rather than using random or\nname-based initialisations. In addition, the effect of initialising query\ntokens using a masked language model fine-tuned on a text corpus of known\nbehavioural patterns is explored. We evaluate our system on the PanAf500 and\nPanAf20K datasets and demonstrate the performance benefits of our multi-modal\ndecoding approach and query initialisation strategy on multi-class and\nmulti-label recognition tasks, respectively. Results and ablations corroborate\nperformance improvements. We achieve state-of-the-art performance over vision\nand vision-language models in top-1 accuracy (+6.34%) on PanAf500 and overall\n(+1.1%) and tail-class (+2.26%) mean average precision on PanAf20K. We share\ncomplete source code and network weights for full reproducibility of results\nand easy utilisation.\n","authors":["Otto Brookes","Majid Mirmehdi","Hjalmar Kuhl","Tilo Burghardt"],"pdf_url":"https://arxiv.org/pdf/2404.08937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08936v1","updated":"2024-04-13T09:10:33Z","published":"2024-04-13T09:10:33Z","title":"Shifting Spotlight for Co-supervision: A Simple yet Efficient\n Single-branch Network to See Through Camouflage","summary":" Efficient and accurate camouflaged object detection (COD) poses a challenge\nin the field of computer vision. Recent approaches explored the utility of edge\ninformation for network co-supervision, achieving notable advancements.\nHowever, these approaches introduce an extra branch for complex edge\nextraction, complicate the model architecture and increases computational\ndemands. 
Addressing this issue, our work replicates the effect that animal's\ncamouflage can be easily revealed under a shifting spotlight, and leverages it\nfor network co-supervision to form a compact yet efficient single-branch\nnetwork, the Co-Supervised Spotlight Shifting Network (CS$^3$Net). The\nspotlight shifting strategy allows CS$^3$Net to learn additional prior within a\nsingle-branch framework, obviating the need for resource demanding multi-branch\ndesign. To leverage the prior of spotlight shifting co-supervision, we propose\nShadow Refinement Module (SRM) and Projection Aware Attention (PAA) for feature\nrefinement and enhancement. To ensure the continuity of multi-scale features\naggregation, we utilize the Extended Neighbor Connection Decoder (ENCD) for\ngenerating the final predictions. Empirical evaluations on public datasets\nconfirm that our CS$^3$Net offers an optimal balance between efficiency and\nperformance: it accomplishes a 32.13% reduction in Multiply-Accumulate (MACs)\noperations compared to leading efficient COD models, while also delivering\nsuperior performance.\n","authors":["Yang Hu","Jinxia Zhang","Kaihua Zhang","Yin Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.08936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09486v3","updated":"2024-04-13T09:00:35Z","published":"2023-12-15T01:52:35Z","title":"Unraveling Batch Normalization for Realistic Test-Time Adaptation","summary":" While recent test-time adaptations exhibit efficacy by adjusting batch\nnormalization to narrow domain disparities, their effectiveness diminishes with\nrealistic mini-batches due to inaccurate target estimation. As previous\nattempts merely introduce source statistics to mitigate this issue, the\nfundamental problem of inaccurate target estimation still persists, leaving the\nintrinsic test-time domain shifts unresolved. This paper delves into the\nproblem of mini-batch degradation. By unraveling batch normalization, we\ndiscover that the inexact target statistics largely stem from the substantially\nreduced class diversity in batch. Drawing upon this insight, we introduce a\nstraightforward tool, Test-time Exponential Moving Average (TEMA), to bridge\nthe class diversity gap between training and testing batches. Importantly, our\nTEMA adaptively extends the scope of typical methods beyond the current batch\nto incorporate a diverse set of class information, which in turn boosts an\naccurate target estimation. Built upon this foundation, we further design a\nnovel layer-wise rectification strategy to consistently promote test-time\nperformance. Our proposed method enjoys a unique advantage as it requires\nneither training nor tuning parameters, offering a truly hassle-free solution.\nIt significantly enhances model robustness against shifted domains and\nmaintains resilience in diverse real-world scenarios with various batch sizes,\nachieving state-of-the-art performance on several major benchmarks. 
Code is\navailable at \\url{https://github.com/kiwi12138/RealisticTTA}.\n","authors":["Zixian Su","Jingwei Guo","Kai Yao","Xi Yang","Qiufeng Wang","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2312.09486v3.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2404.04880v2","updated":"2024-04-13T08:53:28Z","published":"2024-04-07T08:51:31Z","title":"GauU-Scene V2: Assessing the Reliability of Image-Based Metrics with\n Expansive Lidar Image Dataset Using 3DGS and NeRF","summary":" We introduce a novel, multimodal large-scale scene reconstruction benchmark\nthat utilizes newly developed 3D representation approaches: Gaussian Splatting\nand Neural Radiance Fields (NeRF). Our expansive U-Scene dataset surpasses any\npreviously existing real large-scale outdoor LiDAR and image dataset in both\narea and point count. GauU-Scene encompasses over 6.5 square kilometers and\nfeatures a comprehensive RGB dataset coupled with LiDAR ground truth.\nAdditionally, we are the first to propose a LiDAR and image alignment method\nfor a drone-based dataset. Our assessment of GauU-Scene includes a detailed\nanalysis across various novel viewpoints, employing image-based metrics such as\nSSIM, LPIPS, and PSNR on NeRF and Gaussian Splatting based methods. This\nanalysis reveals contradictory results when applying geometric-based metrics\nlike Chamfer distance. The experimental results on our multimodal dataset\nhighlight the unreliability of current image-based metrics and reveal\nsignificant drawbacks in geometric reconstruction using the current Gaussian\nSplatting-based method, further illustrating the necessity of our dataset for\nassessing geometry reconstruction tasks. We also provide detailed supplementary\ninformation on data collection protocols and make the dataset available on the\nfollowing anonymous project page\n","authors":["Butian Xiong","Nanjun Zheng","Junhua Liu","Zhen Li"],"pdf_url":"https://arxiv.org/pdf/2404.04880v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09625v2","updated":"2024-04-13T08:51:33Z","published":"2023-12-15T09:08:14Z","title":"Weakly-Supervised 3D Visual Grounding based on Visual Linguistic\n Alignment","summary":" Learning to ground natural language queries to target objects or regions in\n3D point clouds is quite essential for 3D scene understanding. Nevertheless,\nexisting 3D visual grounding approaches require a substantial number of\nbounding box annotations for text queries, which is time-consuming and\nlabor-intensive to obtain. In this paper, we propose \\textbf{3D-VLA}, a weakly\nsupervised approach for \\textbf{3D} visual grounding based on \\textbf{V}isual\n\\textbf{L}inguistic \\textbf{A}lignment. Our 3D-VLA exploits the superior\nability of current large-scale vision-language models (VLMs) on aligning the\nsemantics between texts and 2D images, as well as the naturally existing\ncorrespondences between 2D images and 3D point clouds, and thus implicitly\nconstructs correspondences between texts and 3D point clouds with no need for\nfine-grained box annotations in the training procedure. During the inference\nstage, the learned text-3D correspondence will help us ground the text queries\nto the 3D target objects even without 2D images. 
To the best of our knowledge,\nthis is the first work to investigate 3D visual grounding in a weakly\nsupervised manner by involving large scale vision-language models, and\nextensive experiments on ReferIt3D and ScanRefer datasets demonstrate that our\n3D-VLA achieves comparable and even superior results over the fully supervised\nmethods.\n","authors":["Xiaoxu Xu","Yitian Yuan","Qiudan Zhang","Wenhui Wu","Zequn Jie","Lin Ma","Xu Wang"],"pdf_url":"https://arxiv.org/pdf/2312.09625v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08931v1","updated":"2024-04-13T08:49:17Z","published":"2024-04-13T08:49:17Z","title":"Label-free Anomaly Detection in Aerial Agricultural Images with Masked\n Image Modeling","summary":" Detecting various types of stresses (nutritional, water, nitrogen, etc.) in\nagricultural fields is critical for farmers to ensure maximum productivity.\nHowever, stresses show up in different shapes and sizes across different crop\ntypes and varieties. Hence, this is posed as an anomaly detection task in\nagricultural images. Accurate anomaly detection in agricultural UAV images is\nvital for early identification of field irregularities. Traditional supervised\nlearning faces challenges in adapting to diverse anomalies, necessitating\nextensive annotated data. In this work, we overcome this limitation with\nself-supervised learning using a masked image modeling approach. Masked\nAutoencoders (MAE) extract meaningful normal features from unlabeled image\nsamples which produces high reconstruction error for the abnormal pixels during\nreconstruction. To remove the need of using only ``normal\" data while training,\nwe use an anomaly suppression loss mechanism that effectively minimizes the\nreconstruction of anomalous pixels and allows the model to learn anomalous\nareas without explicitly separating ``normal\" images for training. Evaluation\non the Agriculture-Vision data challenge shows a mIOU score improvement in\ncomparison to prior state of the art in unsupervised and self-supervised\nmethods. A single model generalizes across all the anomaly categories in the\nAgri-Vision Challenge Dataset\n","authors":["Sambal Shikhar","Anupam Sobti"],"pdf_url":"https://arxiv.org/pdf/2404.08931v1.pdf","comment":"The paper has been accepted to CVPR 2024 5th Workshop on Vision for\n Agriculture as an Oral Paper"},{"id":"http://arxiv.org/abs/2403.11134v2","updated":"2024-04-13T08:40:52Z","published":"2024-03-17T07:57:08Z","title":"Recent Advances in 3D Gaussian Splatting","summary":" The emergence of 3D Gaussian Splatting (3DGS) has greatly accelerated the\nrendering speed of novel view synthesis. Unlike neural implicit representations\nlike Neural Radiance Fields (NeRF) that represent a 3D scene with position and\nviewpoint-conditioned neural networks, 3D Gaussian Splatting utilizes a set of\nGaussian ellipsoids to model the scene so that efficient rendering can be\naccomplished by rasterizing Gaussian ellipsoids into images. Apart from the\nfast rendering speed, the explicit representation of 3D Gaussian Splatting\nfacilitates editing tasks like dynamic reconstruction, geometry editing, and\nphysical simulation. Considering the rapid change and growing number of works\nin this field, we present a literature review of recent 3D Gaussian Splatting\nmethods, which can be roughly classified into 3D reconstruction, 3D editing,\nand other downstream applications by functionality. 
Traditional point-based\nrendering methods and the rendering formulation of 3D Gaussian Splatting are\nalso illustrated for a better understanding of this technique. This survey aims\nto help beginners get into this field quickly and provide experienced\nresearchers with a comprehensive overview, which can stimulate the future\ndevelopment of the 3D Gaussian Splatting representation.\n","authors":["Tong Wu","Yu-Jie Yuan","Ling-Xiao Zhang","Jie Yang","Yan-Pei Cao","Ling-Qi Yan","Lin Gao"],"pdf_url":"https://arxiv.org/pdf/2403.11134v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08928v1","updated":"2024-04-13T08:36:13Z","published":"2024-04-13T08:36:13Z","title":"DeDoDe v2: Analyzing and Improving the DeDoDe Keypoint Detector","summary":" In this paper, we analyze and improve into the recently proposed DeDoDe\nkeypoint detector. We focus our analysis on some key issues. First, we find\nthat DeDoDe keypoints tend to cluster together, which we fix by performing\nnon-max suppression on the target distribution of the detector during training.\nSecond, we address issues related to data augmentation. In particular, the\nDeDoDe detector is sensitive to large rotations. We fix this by including\n90-degree rotations as well as horizontal flips. Finally, the decoupled nature\nof the DeDoDe detector makes evaluation of downstream usefulness problematic.\nWe fix this by matching the keypoints with a pretrained dense matcher (RoMa)\nand evaluating two-view pose estimates. We find that the original long training\nis detrimental to performance, and therefore propose a much shorter training\nschedule. We integrate all these improvements into our proposed detector DeDoDe\nv2 and evaluate it with the original DeDoDe descriptor on the MegaDepth-1500\nand IMC2022 benchmarks. Our proposed detector significantly increases pose\nestimation results, notably from 75.9 to 78.3 mAA on the IMC2022 challenge.\nCode and weights are available at https://github.com/Parskatt/DeDoDe\n","authors":["Johan Edstedt","Georg Bökman","Zhenjun Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.08928v1.pdf","comment":"Accepted to Sixth Workshop on Image Matching - CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.08926v1","updated":"2024-04-13T08:27:10Z","published":"2024-04-13T08:27:10Z","title":"Diffusion Models Meet Remote Sensing: Principles, Methods, and\n Perspectives","summary":" As a newly emerging advance in deep generative models, diffusion models have\nachieved state-of-the-art results in many fields, including computer vision,\nnatural language processing, and molecule design. The remote sensing community\nhas also noticed the powerful ability of diffusion models and quickly applied\nthem to a variety of tasks for image processing. Given the rapid increase in\nresearch on diffusion models in the field of remote sensing, it is necessary to\nconduct a comprehensive review of existing diffusion model-based remote sensing\npapers, to help researchers recognize the potential of diffusion models and\nprovide some directions for further exploration. Specifically, this paper first\nintroduces the theoretical background of diffusion models, and then\nsystematically reviews the applications of diffusion models in remote sensing,\nincluding image generation, enhancement, and interpretation. 
Finally, the\nlimitations of existing remote sensing diffusion models and worthy research\ndirections for further exploration are discussed and summarized.\n","authors":["Yidan Liu","Jun Yue","Shaobo Xia","Pedram Ghamisi","Weiying Xie","Leyuan Fang"],"pdf_url":"https://arxiv.org/pdf/2404.08926v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08923v1","updated":"2024-04-13T08:15:57Z","published":"2024-04-13T08:15:57Z","title":"Trustworthy Multimodal Fusion for Sentiment Analysis in Ordinal\n Sentiment Space","summary":" Multimodal video sentiment analysis aims to integrate multiple modal\ninformation to analyze the opinions and attitudes of speakers. Most previous\nwork focuses on exploring the semantic interactions of intra- and\ninter-modality. However, these works ignore the reliability of multimodality,\ni.e., modalities tend to contain noise, semantic ambiguity, missing modalities,\netc. In addition, previous multimodal approaches treat different modalities\nequally, largely ignoring their different contributions. Furthermore, existing\nmultimodal sentiment analysis methods directly regress sentiment scores without\nconsidering ordinal relationships within sentiment categories, with limited\nperformance. To address the aforementioned problems, we propose a trustworthy\nmultimodal sentiment ordinal network (TMSON) to improve performance in\nsentiment analysis. Specifically, we first devise a unimodal feature extractor\nfor each modality to obtain modality-specific features. Then, an uncertainty\ndistribution estimation network is customized, which estimates the unimodal\nuncertainty distributions. Next, Bayesian fusion is performed on the learned\nunimodal distributions to obtain multimodal distributions for sentiment\nprediction. Finally, an ordinal-aware sentiment space is constructed, where\nordinal regression is used to constrain the multimodal distributions. Our\nproposed TMSON outperforms baselines on multimodal sentiment analysis tasks,\nand empirical results demonstrate that TMSON is capable of reducing uncertainty\nto obtain more robust predictions.\n","authors":["Zhuyang Xie","Yan Yang","Jie Wang","Xiaorong Liu","Xiaofan Li"],"pdf_url":"https://arxiv.org/pdf/2404.08923v1.pdf","comment":"14 pages, 9 figures, Accepted by IEEE Transactions on Circuits and\n Systems for Video Technology"},{"id":"http://arxiv.org/abs/2310.03420v2","updated":"2024-04-13T08:07:05Z","published":"2023-10-05T09:57:23Z","title":"FreeReg: Image-to-Point Cloud Registration Leveraging Pretrained\n Diffusion Models and Monocular Depth Estimators","summary":" Matching cross-modality features between images and point clouds is a\nfundamental problem for image-to-point cloud registration. However, due to the\nmodality difference between images and points, it is difficult to learn robust\nand discriminative cross-modality features by existing metric learning methods\nfor feature matching. Instead of applying metric learning on cross-modality\ndata, we propose to unify the modality between images and point clouds by\npretrained large-scale models first, and then establish robust correspondence\nwithin the same modality. We show that the intermediate features, called\ndiffusion features, extracted by depth-to-image diffusion models are\nsemantically consistent between images and point clouds, which enables the\nbuilding of coarse but robust cross-modality correspondences. We further\nextract geometric features on depth maps produced by the monocular depth\nestimator. 
By matching such geometric features, we significantly improve the\naccuracy of the coarse correspondences produced by diffusion features.\nExtensive experiments demonstrate that without any task-specific training,\ndirect utilization of both features produces accurate image-to-point cloud\nregistration. On three public indoor and outdoor benchmarks, the proposed\nmethod averagely achieves a 20.6 percent improvement in Inlier Ratio, a\nthree-fold higher Inlier Number, and a 48.6 percent improvement in Registration\nRecall than existing state-of-the-arts.\n","authors":["Haiping Wang","Yuan Liu","Bing Wang","Yujing Sun","Zhen Dong","Wenping Wang","Bisheng Yang"],"pdf_url":"https://arxiv.org/pdf/2310.03420v2.pdf","comment":"CameraReady version for ICLR 2024. Project Page:\n https://whu-usi3dv.github.io/FreeReg/"},{"id":"http://arxiv.org/abs/2404.08921v1","updated":"2024-04-13T07:50:17Z","published":"2024-04-13T07:50:17Z","title":"PNeRV: Enhancing Spatial Consistency via Pyramidal Neural Representation\n for Videos","summary":" The primary focus of Neural Representation for Videos (NeRV) is to\neffectively model its spatiotemporal consistency. However, current NeRV systems\noften face a significant issue of spatial inconsistency, leading to decreased\nperceptual quality. To address this issue, we introduce the Pyramidal Neural\nRepresentation for Videos (PNeRV), which is built on a multi-scale information\nconnection and comprises a lightweight rescaling operator, Kronecker\nFully-connected layer (KFc), and a Benign Selective Memory (BSM) mechanism. The\nKFc, inspired by the tensor decomposition of the vanilla Fully-connected layer,\nfacilitates low-cost rescaling and global correlation modeling. BSM merges\nhigh-level features with granular ones adaptively. Furthermore, we provide an\nanalysis based on the Universal Approximation Theory of the NeRV system and\nvalidate the effectiveness of the proposed PNeRV.We conducted comprehensive\nexperiments to demonstrate that PNeRV surpasses the performance of contemporary\nNeRV models, achieving the best results in video regression on UVG and DAVIS\nunder various metrics (PSNR, SSIM, LPIPS, and FVD). Compared to vanilla NeRV,\nPNeRV achieves a +4.49 dB gain in PSNR and a 231% increase in FVD on UVG, along\nwith a +3.28 dB PSNR and 634% FVD increase on DAVIS.\n","authors":["Qi Zhao","M. Salman Asif","Zhan Ma"],"pdf_url":"https://arxiv.org/pdf/2404.08921v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03270v4","updated":"2024-04-13T07:33:57Z","published":"2023-10-05T02:51:53Z","title":"EfficientDM: Efficient Quantization-Aware Fine-Tuning of Low-Bit\n Diffusion Models","summary":" Diffusion models have demonstrated remarkable capabilities in image synthesis\nand related generative tasks. Nevertheless, their practicality for real-world\napplications is constrained by substantial computational costs and latency\nissues. Quantization is a dominant way to compress and accelerate diffusion\nmodels, where post-training quantization (PTQ) and quantization-aware training\n(QAT) are two main approaches, each bearing its own properties. While PTQ\nexhibits efficiency in terms of both time and data usage, it may lead to\ndiminished performance in low bit-width. On the other hand, QAT can alleviate\nperformance degradation but comes with substantial demands on computational and\ndata resources. 
In this paper, we introduce a data-free and parameter-efficient\nfine-tuning framework for low-bit diffusion models, dubbed EfficientDM, to\nachieve QAT-level performance with PTQ-like efficiency. Specifically, we\npropose a quantization-aware variant of the low-rank adapter (QALoRA) that can\nbe merged with model weights and jointly quantized to low bit-width. The\nfine-tuning process distills the denoising capabilities of the full-precision\nmodel into its quantized counterpart, eliminating the requirement for training\ndata. We also introduce scale-aware optimization and temporal learned step-size\nquantization to further enhance performance. Extensive experimental results\ndemonstrate that our method significantly outperforms previous PTQ-based\ndiffusion models while maintaining similar time and data efficiency.\nSpecifically, there is only a 0.05 sFID increase when quantizing both weights\nand activations of LDM-4 to 4-bit on ImageNet 256x256. Compared to QAT-based\nmethods, our EfficientDM also boasts a 16.2x faster quantization speed with\ncomparable generation quality. Code is available at\n\\href{https://github.com/ThisisBillhe/EfficientDM}{this url}.\n","authors":["Yefei He","Jing Liu","Weijia Wu","Hong Zhou","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2310.03270v4.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2404.08917v1","updated":"2024-04-13T07:30:17Z","published":"2024-04-13T07:30:17Z","title":"MAProtoNet: A Multi-scale Attentive Interpretable Prototypical Part\n Network for 3D Magnetic Resonance Imaging Brain Tumor Classification","summary":" Automated diagnosis with artificial intelligence has emerged as a promising\narea in the realm of medical imaging, while the interpretability of the\nintroduced deep neural networks still remains an urgent concern. Although\ncontemporary works, such as XProtoNet and MProtoNet, have sought to design\ninterpretable prediction models for the issue, the localization precision of\ntheir resulting attribution maps can be further improved. To this end, we\npropose a Multi-scale Attentive Prototypical part Network, termed MAProtoNet,\nto provide more precise maps for attribution. Specifically, we introduce a\nconcise multi-scale module to merge attentive features from quadruplet\nattention layers and produce attribution maps. The proposed quadruplet\nattention layers can enhance the existing online class activation mapping loss\nvia capturing interactions between the spatial and channel dimensions, while the\nmulti-scale module then fuses both fine-grained and coarse-grained information\nfor precise map generation. We also apply a novel multi-scale mapping loss for\nsupervision on the proposed multi-scale module. Compared to existing\ninterpretable prototypical part networks in medical imaging, MAProtoNet can\nachieve state-of-the-art performance in localization on brain tumor\nsegmentation (BraTS) datasets, resulting in approximately 4% overall\nimprovement in activation precision score (with a best score of 85.8%), without\nusing additional annotated labels of segmentation. 
Our code will be released in\nhttps://github.com/TUAT-Novice/maprotonet.\n","authors":["Binghua Li","Jie Mao","Zhe Sun","Chao Li","Qibin Zhao","Toshihisa Tanaka"],"pdf_url":"https://arxiv.org/pdf/2404.08917v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08916v1","updated":"2024-04-13T07:30:16Z","published":"2024-04-13T07:30:16Z","title":"Meply: A Large-scale Dataset and Baseline Evaluations for Metastatic\n Perirectal Lymph Node Detection and Segmentation","summary":" Accurate segmentation of metastatic lymph nodes in rectal cancer is crucial\nfor the staging and treatment of rectal cancer. However, existing segmentation\napproaches face challenges due to the absence of pixel-level annotated datasets\ntailored for lymph nodes around the rectum. Additionally, metastatic lymph\nnodes are characterized by their relatively small size, irregular shapes, and\nlower contrast compared to the background, further complicating the\nsegmentation task. To address these challenges, we present the first\nlarge-scale perirectal metastatic lymph node CT image dataset called Meply,\nwhich encompasses pixel-level annotations of 269 patients diagnosed with rectal\ncancer. Furthermore, we introduce a novel lymph-node segmentation model named\nCoSAM. The CoSAM utilizes sequence-based detection to guide the segmentation of\nmetastatic lymph nodes in rectal cancer, contributing to improved localization\nperformance for the segmentation model. It comprises three key components:\nsequence-based detection module, segmentation module, and collaborative\nconvergence unit. To evaluate the effectiveness of CoSAM, we systematically\ncompare its performance with several popular segmentation methods using the\nMeply dataset. Our code and dataset will be publicly available at:\nhttps://github.com/kanydao/CoSAM.\n","authors":["Weidong Guo","Hantao Zhang","Shouhong Wan","Bingbing Zou","Wanqin Wang","Chenyang Qiu","Jun Li","Peiquan Jin"],"pdf_url":"https://arxiv.org/pdf/2404.08916v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2404.08915v1","updated":"2024-04-13T07:27:06Z","published":"2024-04-13T07:27:06Z","title":"PM2: A New Prompting Multi-modal Model Paradigm for Few-shot Medical\n Image Classification","summary":" Few-shot learning has been successfully applied to medical image\nclassification as only very few medical examples are available for training.\nDue to the challenging problem of limited number of annotated medical images,\nimage representations should not be solely derived from a single image modality\nwhich is insufficient for characterizing concept classes. In this paper, we\npropose a new prompting multi-modal model paradigm on medical image\nclassification based on multi-modal foundation models, called PM2. Besides\nimage modality,PM2 introduces another supplementary text input, known as\nprompt, to further describe corresponding image or concept classes and\nfacilitate few-shot learning across diverse modalities. To better explore the\npotential of prompt engineering, we empirically investigate five distinct\nprompt schemes under the new paradigm. Furthermore, linear probing in\nmulti-modal models acts as a linear classification head taking as input only\nclass token, which ignores completely merits of rich statistics inherent in\nhigh-level visual tokens. Thus, we alternatively perform a linear\nclassification on feature distribution of visual tokens and class token\nsimultaneously. 
To effectively mine such rich statistics, a global covariance\npooling with efficient matrix power normalization is used to aggregate visual\ntokens. Then we study and combine two classification heads. One is shared for\nthe class token of the image from the vision encoder and the prompt representation encoded by the\ntext encoder. The other performs classification on the feature distribution of visual\ntokens from the vision encoder. Extensive experiments on three medical datasets\nshow that our PM2 significantly outperforms counterparts regardless of prompt\nschemes and achieves state-of-the-art performance.\n","authors":["Zhenwei Wang","Qiule Sun","Bingbing Zhang","Pengfei Wang","Jianxin Zhang","Qiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.08915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.03331v2","updated":"2024-04-13T07:20:03Z","published":"2023-01-09T13:35:03Z","title":"A Specific Task-oriented Semantic Image Communication System for\n substation patrol inspection","summary":" Intelligent inspection robots are widely used in substation patrol\ninspection, which can help check potential safety hazards by patrolling the\nsubstation and sending back scene images. However, when patrolling some\nmarginal areas with weak signal, the scene images cannot be successfully\ntransmitted to be used for hidden danger elimination, which greatly reduces\nthe quality of the robots' daily work. To solve this problem, a Specific\nTask-oriented Semantic Communication System for Image (STSCI) is designed, which\ninvolves semantic feature extraction, transmission, restoration and\nenhancement to get clearer images sent by intelligent robots under weak\nsignals. Inspired by the fact that only some specific details of the image are needed in\nsuch a substation patrol inspection task, we propose a new paradigm of semantic\nenhancement for this specific task to ensure the clarity of key semantic\ninformation when facing a lower bit rate or a low signal-to-noise ratio\nsituation. In reality-based simulations, experiments show our STSCI can\ngenerally surpass traditional image-compression-based, channel-coding-based,\nand other semantic communication systems in the substation patrol inspection task\nwith a lower bit rate even under a low signal-to-noise ratio situation.\n","authors":["Senran Fan","Haotai Liang","Chen Dong","Xiaodong Xu","Geng Liu"],"pdf_url":"https://arxiv.org/pdf/2301.03331v2.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2305.06061v2","updated":"2024-04-13T06:14:42Z","published":"2023-05-10T11:26:36Z","title":"Visual Tuning","summary":" Fine-tuning visual models has been widely shown to deliver promising performance on many\ndownstream visual tasks. With the surprising development of pre-trained visual\nfoundation models, visual tuning jumped out of the standard modus operandi that\nfine-tunes the whole pre-trained model or just the fully connected layer.\nInstead, recent advances can achieve superior performance to full-tuning the\nwhole set of pre-trained parameters by updating far fewer parameters, enabling edge\ndevices and downstream applications to reuse the increasingly large foundation\nmodels deployed on the cloud. With the aim of helping researchers get the full\npicture and future directions of visual tuning, this survey characterizes a\nlarge and thoughtful selection of recent works, providing a systematic and\ncomprehensive overview of existing work and models. 
Specifically, it provides a\ndetailed background of visual tuning and categorizes recent visual tuning\ntechniques into five groups: prompt tuning, adapter tuning, parameter tuning,\nand remapping tuning. Meanwhile, it offers some exciting research directions\nfor prospective pre-training and various interactions in visual tuning.\n","authors":["Bruce X. B. Yu","Jianlong Chang","Haixin Wang","Lingbo Liu","Shijie Wang","Zhiyu Wang","Junfan Lin","Lingxi Xie","Haojie Li","Zhouchen Lin","Qi Tian","Chang Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2305.06061v2.pdf","comment":"37 pages. Accepted to ACM CSUR"},{"id":"http://arxiv.org/abs/2401.03749v2","updated":"2024-04-13T05:56:09Z","published":"2024-01-08T09:20:46Z","title":"The Method of Detecting Flying Birds in Surveillance Video Based on\n Their Characteristics","summary":" Aiming at the characteristics of the flying bird object in surveillance\nvideo, such as the single frame image feature is not obvious, the size is small\nin most cases, and asymmetric, this paper proposes a Flying Bird Object\nDetection method in Surveillance Video (FBOD-SV). Firstly, a new feature\naggregation module, the Correlation Attention Feature Aggregation\n(Co-Attention-FA) module, is designed to aggregate the features of the flying\nbird object according to the bird object's correlation on multiple consecutive\nframes of images. Secondly, a Flying Bird Object Detection Network (FBOD-Net)\nwith down-sampling and then up-sampling is designed, which uses a large feature\nlayer that fuses fine spatial information and large receptive field information\nto detect special multi-scale (mostly small-scale) bird objects. Finally, the\nSimOTA dynamic label allocation method is applied to One-Category object\ndetection, and the SimOTA-OC dynamic label strategy is proposed to solve the\ndifficult problem of label allocation caused by irregular flying bird objects.\nIn this paper, the algorithm's performance is verified by the experimental data\nset of the surveillance video of the flying bird object of the traction\nsubstation. The experimental results show that the surveillance video flying\nbird object detection method proposed in this paper effectively improves the\ndetection performance of flying bird objects.\n","authors":["Ziwei Sun","Zexi Hua","Hengchao Li","Yan Li"],"pdf_url":"https://arxiv.org/pdf/2401.03749v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10942v4","updated":"2024-04-13T05:52:04Z","published":"2023-10-17T02:38:09Z","title":"UNK-VQA: A Dataset and a Probe into the Abstention Ability of\n Multi-modal Large Models","summary":" Teaching Visual Question Answering (VQA) models to refrain from answering\nunanswerable questions is necessary for building a trustworthy AI system.\nExisting studies, though have explored various aspects of VQA but somewhat\nignored this particular attribute. This paper aims to bridge the research gap\nby contributing a comprehensive dataset, called UNK-VQA. The dataset is\nspecifically designed to address the challenge of questions that models do not\nknow. To this end, we first augment the existing data via deliberate\nperturbations on either the image or question. In specific, we carefully ensure\nthat the question-image semantics remain close to the original unperturbed\ndistribution. By this means, the identification of unanswerable questions\nbecomes challenging, setting our dataset apart from others that involve mere\nimage replacement. 
We then extensively evaluate the zero- and few-shot\nperformance of several emerging multi-modal large models and discover their\nsignificant limitations when applied to our dataset. Additionally, we also\npropose a straightforward method to tackle these unanswerable questions. This\ndataset, we believe, will serve as a valuable benchmark for enhancing the\nabstention capability of VQA models, thereby leading to increased\ntrustworthiness of AI systems. We have made the dataset\n(https://github.com/guoyang9/UNK-VQA) available to facilitate further\nexploration in this area.\n","authors":["Yangyang Guo","Fangkai Jiao","Zhiqi Shen","Liqiang Nie","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2310.10942v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17963v2","updated":"2024-04-13T04:16:18Z","published":"2023-11-29T11:30:33Z","title":"M$^{2}$Chat: Empowering VLM for Multimodal LLM Interleaved Text-Image\n Generation","summary":" While current LLM chatbots like GPT-4V bridge the gap between human\ninstructions and visual representations to enable text-image generations, they\nstill lack efficient alignment methods for high-fidelity performance on\nmultiple downstream tasks. In this paper, we propose \\textbf{$M^{2}Chat$}, a\nnovel unified multimodal LLM framework for generating interleaved text-image\nconversation across various scenarios. Specifically, we propose an\n$M^{3}Adapter$ that efficiently integrates granular low-level visual\ninformation and high-level semantic features from multi-modality prompts. Upon\nthe well-aligned fused feature, $M^{3}Adapter$ tailors a learnable gating\nstrategy to balance the model creativity and consistency across various tasks\nadaptively. Moreover, to further enhance the effectiveness of $M^{3}Adapter$\nwhile preserving the coherence of semantic context comprehension, we introduce\na two-stage $M^{3}FT$ fine-tuning strategy. This strategy optimizes disjoint\ngroups of parameters for image-text alignment and visual-instruction\nrespectively. Extensive experiments demonstrate our $M^{2}Chat$ surpasses\nstate-of-the-art counterparts across diverse benchmarks, showcasing its prowess\nin interleaving generation, storytelling, and multimodal dialogue systems. The\ndemo and code are available at\n\\red{https://mattie-e.github.io/M2Chat.github.io}.\n","authors":["Xiaowei Chi","Rongyu Zhang","Zhengkai Jiang","Yijiang Liu","Yatian Wang","Xingqun Qi","Wenhan Luo","Peng Gao","Shanghang Zhang","Qifeng Liu","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2311.17963v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08894v1","updated":"2024-04-13T04:01:35Z","published":"2024-04-13T04:01:35Z","title":"HEAT: Head-level Parameter Efficient Adaptation of Vision Transformers\n with Taylor-expansion Importance Scores","summary":" Prior computer vision research extensively explores adapting pre-trained\nvision transformers (ViT) to downstream tasks. However, the substantial number\nof parameters requiring adaptation has led to a focus on Parameter Efficient\nTransfer Learning (PETL) as an approach to efficiently adapt large pre-trained\nmodels by training only a subset of parameters, achieving both parameter and\nstorage efficiency. Although the significantly reduced parameters have shown\npromising performance under transfer learning scenarios, the structural\nredundancy inherent in the model still leaves room for improvement, which\nwarrants further investigation. 
In this paper, we propose Head-level Efficient\nAdaptation with Taylor-expansion importance score (HEAT): a simple method that\nefficiently fine-tuning ViTs at head levels. In particular, the first-order\nTaylor expansion is employed to calculate each head's importance score, termed\nTaylor-expansion Importance Score (TIS), indicating its contribution to\nspecific tasks. Additionally, three strategies for calculating TIS have been\nemployed to maximize the effectiveness of TIS. These strategies calculate TIS\nfrom different perspectives, reflecting varying contributions of parameters.\nBesides ViT, HEAT has also been applied to hierarchical transformers such as\nSwin Transformer, demonstrating its versatility across different transformer\narchitectures. Through extensive experiments, HEAT has demonstrated superior\nperformance over state-of-the-art PETL methods on the VTAB-1K benchmark.\n","authors":["Yibo Zhong","Yao Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.08894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05773v3","updated":"2024-04-13T03:56:11Z","published":"2024-02-08T16:00:25Z","title":"UAV-Rain1k: A Benchmark for Raindrop Removal from UAV Aerial Imagery","summary":" Raindrops adhering to the lens of UAVs can obstruct visibility of the\nbackground scene and degrade image quality. Despite recent progress in image\nderaining methods and datasets, there is a lack of focus on raindrop removal\nfrom UAV aerial imagery due to the unique challenges posed by varying angles\nand rapid movement during drone flight. To fill the gap in this research, we\nfirst construct a new benchmark dataset for removing raindrops from UAV images,\ncalled UAV-Rain1k. In this letter, we provide a dataset generation pipeline,\nwhich includes modeling raindrop shapes using Blender, collecting background\nimages from various UAV angles, random sampling of rain masks and etc. Based on\nthe proposed benchmark, we further present a comprehensive evaluation of\nexisting representative image deraining algorithms, and reveal future research\nopportunities worth exploring. The proposed dataset is publicly available at\nhttps://github.com/cschenxiang/UAV-Rain1k.\n","authors":["Wenhui Chang","Hongming Chen","Xin He","Xiang Chen","Liangduo Shen"],"pdf_url":"https://arxiv.org/pdf/2402.05773v3.pdf","comment":"Accepted by IEEE/CVF Conference on Computer Vision and Pattern\n Recognition Workshops (CVPRW) 2024"},{"id":"http://arxiv.org/abs/2404.08892v1","updated":"2024-04-13T03:46:35Z","published":"2024-04-13T03:46:35Z","title":"ChangeAnywhere: Sample Generation for Remote Sensing Change Detection\n via Semantic Latent Diffusion Model","summary":" Remote sensing change detection (CD) is a pivotal technique that pinpoints\nchanges on a global scale based on multi-temporal images. With the recent\nexpansion of deep learning, supervised deep learning-based CD models have shown\nsatisfactory performance. However, CD sample labeling is very time-consuming as\nit is densely labeled and requires expert knowledge. To alleviate this problem,\nwe introduce ChangeAnywhere, a novel CD sample generation method using the\nsemantic latent diffusion model and single-temporal images. Specifically,\nChangeAnywhere leverages the relative ease of acquiring large single-temporal\nsemantic datasets to generate large-scale, diverse, and semantically annotated\nbi-temporal CD datasets. 
ChangeAnywhere captures the two essentials of CD\nsamples, i.e., change implies semantically different, and non-change implies\nreasonable change under the same semantic constraints. We generated\nChangeAnywhere-100K, the largest synthesis CD dataset with 100,000 pairs of CD\nsamples based on the proposed method. The ChangeAnywhere-100K significantly\nimproved both zero-shot and few-shot performance on two CD benchmark datasets\nfor various deep learning-based CD models, as demonstrated by transfer\nexperiments. This paper delineates the enormous potential of ChangeAnywhere for\nCD sample generation and demonstrates the subsequent enhancement of model\nperformance. Therefore, ChangeAnywhere offers a potent tool for remote sensing\nCD. All codes and pre-trained models will be available at\nhttps://github.com/tangkai-RS/ChangeAnywhere.\n","authors":["Kai Tang","Jin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.08892v1.pdf","comment":"Concise manuscript version of ChangeAnywhere"},{"id":"http://arxiv.org/abs/2209.14145v3","updated":"2024-04-13T03:36:29Z","published":"2022-09-28T14:49:28Z","title":"Multi-scale Attention Network for Single Image Super-Resolution","summary":" ConvNets can compete with transformers in high-level tasks by exploiting\nlarger receptive fields. To unleash the potential of ConvNet in\nsuper-resolution, we propose a multi-scale attention network (MAN), by coupling\nclassical multi-scale mechanism with emerging large kernel attention. In\nparticular, we proposed multi-scale large kernel attention (MLKA) and gated\nspatial attention unit (GSAU). Through our MLKA, we modify large kernel\nattention with multi-scale and gate schemes to obtain the abundant attention\nmap at various granularity levels, thereby aggregating global and local\ninformation and avoiding potential blocking artifacts. In GSAU, we integrate\ngate mechanism and spatial attention to remove the unnecessary linear layer and\naggregate informative spatial context. To confirm the effectiveness of our\ndesigns, we evaluate MAN with multiple complexities by simply stacking\ndifferent numbers of MLKA and GSAU. Experimental results illustrate that our\nMAN can perform on par with SwinIR and achieve varied trade-offs between\nstate-of-the-art performance and computations.\n","authors":["Yan Wang","Yusen Li","Gang Wang","Xiaoguang Liu"],"pdf_url":"https://arxiv.org/pdf/2209.14145v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08886v1","updated":"2024-04-13T03:15:56Z","published":"2024-04-13T03:15:56Z","title":"EIVEN: Efficient Implicit Attribute Value Extraction using Multimodal\n LLM","summary":" In e-commerce, accurately extracting product attribute values from multimodal\ndata is crucial for improving user experience and operational efficiency of\nretailers. However, previous approaches to multimodal attribute value\nextraction often struggle with implicit attribute values embedded in images or\ntext, rely heavily on extensive labeled data, and can easily confuse similar\nattribute values. To address these issues, we introduce EIVEN, a data- and\nparameter-efficient generative framework that pioneers the use of multimodal\nLLM for implicit attribute value extraction. EIVEN leverages the rich inherent\nknowledge of a pre-trained LLM and vision encoder to reduce reliance on labeled\ndata. We also introduce a novel Learning-by-Comparison technique to reduce\nmodel confusion by enforcing attribute value comparison and difference\nidentification. 
Additionally, we construct initial open-source datasets for\nmultimodal implicit attribute value extraction. Our extensive experiments\nreveal that EIVEN significantly outperforms existing methods in extracting\nimplicit attribute values while requiring less labeled data.\n","authors":["Henry Peng Zou","Gavin Heqing Yu","Ziwei Fan","Dan Bu","Han Liu","Peng Dai","Dongmei Jia","Cornelia Caragea"],"pdf_url":"https://arxiv.org/pdf/2404.08886v1.pdf","comment":"Accepted by NAACL 2024 Industry Track"},{"id":"http://arxiv.org/abs/2401.00094v2","updated":"2024-04-13T02:21:10Z","published":"2023-12-29T23:04:00Z","title":"Generating Enhanced Negatives for Training Language-Based Object\n Detectors","summary":" The recent progress in language-based open-vocabulary object detection can be\nlargely attributed to finding better ways of leveraging large-scale data with\nfree-form text annotations. Training such models with a discriminative\nobjective function has proven successful, but requires good positive and\nnegative samples. However, the free-form nature and the open vocabulary of\nobject descriptions make the space of negatives extremely large. Prior works\nrandomly sample negatives or use rule-based techniques to build them. In\ncontrast, we propose to leverage the vast knowledge built into modern\ngenerative models to automatically build negatives that are more relevant to\nthe original data. Specifically, we use large-language-models to generate\nnegative text descriptions, and text-to-image diffusion models to also generate\ncorresponding negative images. Our experimental analysis confirms the relevance\nof the generated negative data, and its use in language-based detectors\nimproves performance on two complex benchmarks. Code is available at\n\\url{https://github.com/xiaofeng94/Gen-Enhanced-Negs}.\n","authors":["Shiyu Zhao","Long Zhao","Vijay Kumar B. G","Yumin Suh","Dimitris N. Metaxas","Manmohan Chandraker","Samuel Schulter"],"pdf_url":"https://arxiv.org/pdf/2401.00094v2.pdf","comment":"Accepted to CVPR 2024. The supplementary document included"},{"id":"http://arxiv.org/abs/2404.00292v3","updated":"2024-04-13T02:01:50Z","published":"2024-03-30T08:51:23Z","title":"LAKE-RED: Camouflaged Images Generation by Latent Background Knowledge\n Retrieval-Augmented Diffusion","summary":" Camouflaged vision perception is an important vision task with numerous\npractical applications. Due to the expensive collection and labeling costs,\nthis community struggles with a major bottleneck that the species category of\nits datasets is limited to a small number of object species. However, the\nexisting camouflaged generation methods require specifying the background\nmanually, thus failing to extend the camouflaged sample diversity in a low-cost\nmanner. In this paper, we propose a Latent Background Knowledge\nRetrieval-Augmented Diffusion (LAKE-RED) for camouflaged image generation. To\nour knowledge, our contributions mainly include: (1) For the first time, we\npropose a camouflaged generation paradigm that does not need to receive any\nbackground inputs. (2) Our LAKE-RED is the first knowledge retrieval-augmented\nmethod with interpretability for camouflaged generation, in which we propose an\nidea that knowledge retrieval and reasoning enhancement are separated\nexplicitly, to alleviate the task-specific challenges. 
Moreover, our method is\nnot restricted to specific foreground targets or backgrounds, offering a\npotential for extending camouflaged vision perception to more diverse domains.\n(3) Experimental results demonstrate that our method outperforms the existing\napproaches, generating more realistic camouflage images.\n","authors":["Pancheng Zhao","Peng Xu","Pengda Qin","Deng-Ping Fan","Zhicheng Zhang","Guoli Jia","Bowen Zhou","Jufeng Yang"],"pdf_url":"https://arxiv.org/pdf/2404.00292v3.pdf","comment":"Accepted by CVPR 2024, Fig.3 revised"},{"id":"http://arxiv.org/abs/2308.06412v3","updated":"2024-04-13T01:40:03Z","published":"2023-08-11T23:03:50Z","title":"Taming Self-Training for Open-Vocabulary Object Detection","summary":" Recent studies have shown promising performance in open-vocabulary object\ndetection (OVD) by utilizing pseudo labels (PLs) from pretrained vision and\nlanguage models (VLMs). However, teacher-student self-training, a powerful and\nwidely used paradigm to leverage PLs, is rarely explored for OVD. This work\nidentifies two challenges of using self-training in OVD: noisy PLs from VLMs\nand frequent distribution changes of PLs. To address these challenges, we\npropose SAS-Det that tames self-training for OVD from two key perspectives.\nFirst, we present a split-and-fusion (SAF) head that splits a standard\ndetection into an open-branch and a closed-branch. This design can reduce noisy\nsupervision from pseudo boxes. Moreover, the two branches learn complementary\nknowledge from different training data, significantly enhancing performance\nwhen fused together. Second, in our view, unlike in closed-set tasks, the PL\ndistributions in OVD are solely determined by the teacher model. We introduce a\nperiodic update strategy to decrease the number of updates to the teacher,\nthereby decreasing the frequency of changes in PL distributions, which\nstabilizes the training process. Extensive experiments demonstrate SAS-Det is\nboth efficient and effective. SAS-Det outperforms recent models of the same\nscale by a clear margin and achieves 37.4 AP50 and 29.1 APr on novel categories\nof the COCO and LVIS benchmarks, respectively. Code is available at\n\\url{https://github.com/xiaofeng94/SAS-Det}.\n","authors":["Shiyu Zhao","Samuel Schulter","Long Zhao","Zhixing Zhang","Vijay Kumar B. G","Yumin Suh","Manmohan Chandraker","Dimitris N. Metaxas"],"pdf_url":"https://arxiv.org/pdf/2308.06412v3.pdf","comment":"Accepted to CVPR 2024. The supplementary document included"},{"id":"http://arxiv.org/abs/2403.16051v3","updated":"2024-04-13T01:19:39Z","published":"2024-03-24T07:36:38Z","title":"Segment Anything Model for Road Network Graph Extraction","summary":" We propose SAM-Road, an adaptation of the Segment Anything Model (SAM) for\nextracting large-scale, vectorized road network graphs from satellite imagery.\nTo predict graph geometry, we formulate it as a dense semantic segmentation\ntask, leveraging the inherent strengths of SAM. The image encoder of SAM is\nfine-tuned to produce probability masks for roads and intersections, from which\nthe graph vertices are extracted via simple non-maximum suppression. To predict\ngraph topology, we designed a lightweight transformer-based graph neural\nnetwork, which leverages the SAM image embeddings to estimate the edge\nexistence probabilities between vertices. 
Our approach directly predicts the\ngraph vertices and edges for large regions without expensive and complex\npost-processing heuristics, and is capable of building complete road network\ngraphs spanning multiple square kilometers in a matter of seconds. With its\nsimple, straightforward, and minimalist design, SAM-Road achieves comparable\naccuracy with the state-of-the-art method RNGDet++, while being 40 times faster\non the City-scale dataset. We thus demonstrate the power of a foundational\nvision model when applied to a graph learning task. The code is available at\nhttps://github.com/htcr/sam_road.\n","authors":["Congrui Hetang","Haoru Xue","Cindy Le","Tianwei Yue","Wenping Wang","Yihui He"],"pdf_url":"https://arxiv.org/pdf/2403.16051v3.pdf","comment":"Accepted by IEEE/CVF Computer Vision and Pattern Recognition\n Conference (CVPR) 2024, 2nd Workshop on Scene Graphs and Graph Representation\n Learning"},{"id":"http://arxiv.org/abs/2211.13854v5","updated":"2024-04-13T00:14:03Z","published":"2022-11-25T01:37:48Z","title":"ComCLIP: Training-Free Compositional Image and Text Matching","summary":" Contrastive Language-Image Pretraining (CLIP) has demonstrated great\nzero-shot performance for matching images and text. However, it is still\nchallenging to adapt vision-language pretrained models like CLIP to\ncompositional image and text matching -- a more challenging image and text\nmatching task requiring the model's understanding of compositional word concepts\nand visual components. Towards better compositional generalization in zero-shot\nimage and text matching, in this paper, we study the problem from a causal\nperspective: the erroneous semantics of individual entities are essentially\nconfounders that cause the matching failure. Therefore, we propose a novel\n\\textbf{\\textit{training-free}} compositional CLIP model (ComCLIP). ComCLIP\ndisentangles input images into subjects, objects, and action sub-images and\ncomposes CLIP's vision encoder and text encoder to perform evolving matching\nover compositional text embedding and sub-image embeddings. In this way,\nComCLIP can mitigate spurious correlations introduced by the pretrained CLIP\nmodels and dynamically evaluate the importance of each component. Experiments\non four compositional image-text matching datasets: SVO, ComVG, Winoground, and\nVL-checklist, and two general image-text retrieval datasets: Flickr30K and\nMSCOCO demonstrate the effectiveness of our plug-and-play method, which boosts\nthe \\textbf{\\textit{zero-shot}} inference ability of CLIP, SLIP, and BLIP2 even\nwithout further training or fine-tuning. Our code can be found at\nhttps://github.com/eric-ai-lab/ComCLIP.\n","authors":["Kenan Jiang","Xuehai He","Ruize Xu","Xin Eric Wang"],"pdf_url":"https://arxiv.org/pdf/2211.13854v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08858v1","updated":"2024-04-13T00:13:20Z","published":"2024-04-13T00:13:20Z","title":"A Lightweight Spatiotemporal Network for Online Eye Tracking with Event\n Camera","summary":" Event-based data are commonly encountered in edge computing environments\nwhere efficiency and low latency are critical. To interface with such data and\nleverage their rich temporal features, we propose a causal spatiotemporal\nconvolutional network. 
This solution targets efficient implementation on\nedge-appropriate hardware with limited resources in three ways: 1) deliberately\ntargets a simple architecture and set of operations (convolutions, ReLU\nactivations) 2) can be configured to perform online inference efficiently via\nbuffering of layer outputs 3) can achieve more than 90% activation sparsity\nthrough regularization during training, enabling very significant efficiency\ngains on event-based processors. In addition, we propose a general affine\naugmentation strategy acting directly on the events, which alleviates the\nproblem of dataset scarcity for event-based systems. We apply our model on the\nAIS 2024 event-based eye tracking challenge, reaching a score of 0.9916 p10\naccuracy on the Kaggle private testset.\n","authors":["Yan Ru Pei","Sasskia Brüers","Sébastien Crouzet","Douglas McLelland","Olivier Coenen"],"pdf_url":"https://arxiv.org/pdf/2404.08858v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2207.13316v2","updated":"2024-04-13T07:39:29Z","published":"2022-07-27T06:25:47Z","title":"NICEST: Noisy Label Correction and Training for Robust Scene Graph\n Generation","summary":" Nearly all existing scene graph generation (SGG) models have overlooked the\nground-truth annotation qualities of mainstream SGG datasets, i.e., they\nassume: 1) all the manually annotated positive samples are equally correct; 2)\nall the un-annotated negative samples are absolutely background. In this paper,\nwe argue that neither of the assumptions applies to SGG: there are numerous\nnoisy ground-truth predicate labels that break these two assumptions and harm\nthe training of unbiased SGG models. To this end, we propose a novel NoIsy\nlabel CorrEction and Sample Training strategy for SGG: NICEST. Specifically, it\nconsists of two parts: NICE and NIST, which rule out these noisy label issues\nby generating high-quality samples and the effective training strategy,\nrespectively. NICE first detects noisy samples and then reassigns them more\nhigh-quality soft predicate labels. NIST is a multi-teacher knowledge\ndistillation based training strategy, which enables the model to learn unbiased\nfusion knowledge. And a dynamic trade-off weighting strategy in NIST is\ndesigned to penalize the bias of different teachers. Due to the model-agnostic\nnature of both NICE and NIST, our NICEST can be seamlessly incorporated into\nany SGG architecture to boost its performance on different predicate\ncategories. In addition, to better evaluate the generalization of SGG models,\nwe further propose a new benchmark VG-OOD, by re-organizing the prevalent VG\ndataset and deliberately making the predicate distributions of the training and\ntest sets as different as possible for each subject-object category pair. This\nnew benchmark helps disentangle the influence of subject-object category based\nfrequency biases. Extensive ablations and results on different backbones and\ntasks have attested to the effectiveness and generalization ability of each\ncomponent of NICEST.\n","authors":["Lin Li","Jun Xiao","Hanrong Shi","Hanwang Zhang","Yi Yang","Wei Liu","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2207.13316v2.pdf","comment":"Extension of CVPR'22 work (The Devil is in the Labels: Noisy Label\n Correction for Robust Scene Graph Generation). 
arXiv admin note: substantial\n text overlap with arXiv:2206.03014"},{"id":"http://arxiv.org/abs/2404.10790v1","updated":"2024-04-13T01:31:25Z","published":"2024-04-13T01:31:25Z","title":"Multimodal Attack Detection for Action Recognition Models","summary":" Adversarial machine learning attacks on video action recognition models is a\ngrowing research area and many effective attacks were introduced in recent\nyears. These attacks show that action recognition models can be breached in\nmany ways. Hence using these models in practice raises significant security\nconcerns. However, there are very few works which focus on defending against or\ndetecting attacks. In this work, we propose a novel universal detection method\nwhich is compatible with any action recognition model. In our extensive\nexperiments, we show that our method consistently detects various attacks\nagainst different target models with high true positive rates while satisfying\nvery low false positive rates. Tested against four state-of-the-art attacks\ntargeting four action recognition models, the proposed detector achieves an\naverage AUC of 0.911 over 16 test cases while the best performance achieved by\nthe existing detectors is 0.645 average AUC. This 41.2% improvement is enabled\nby the robustness of the proposed detector to varying attack methods and target\nmodels. The lowest AUC achieved by our detector across the 16 test cases is\n0.837 while the competing detector's performance drops as low as 0.211. We also\nshow that the proposed detector is robust to varying attack strengths. In\naddition, we analyze our method's real-time performance with different hardware\nsetups to demonstrate its potential as a practical defense mechanism.\n","authors":["Furkan Mumcu","Yasin Yilmaz"],"pdf_url":"https://arxiv.org/pdf/2404.10790v1.pdf","comment":null}]},"2024-04-16T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2402.16846v2","updated":"2024-04-16T17:59:53Z","published":"2024-02-26T18:59:33Z","title":"GROUNDHOG: Grounding Large Language Models to Holistic Segmentation","summary":" Most multimodal large language models (MLLMs) learn language-to-object\ngrounding through causal language modeling where grounded objects are captured\nby bounding boxes as sequences of location tokens. This paradigm lacks\npixel-level representations that are important for fine-grained visual\nunderstanding and diagnosis. In this work, we introduce GROUNDHOG, an MLLM\ndeveloped by grounding Large Language Models to holistic segmentation.\nGROUNDHOG incorporates a masked feature extractor and converts extracted\nfeatures into visual entity tokens for the MLLM backbone, which then connects\ngroundable phrases to unified grounding masks by retrieving and merging the\nentity masks. To train GROUNDHOG, we carefully curated M3G2, a grounded visual\ninstruction tuning dataset with Multi-Modal Multi-Grained Grounding, by\nharvesting a collection of segmentation-grounded datasets with rich\nannotations. Our experimental results show that GROUNDHOG achieves superior\nperformance on various language grounding tasks without task-specific\nfine-tuning, and significantly reduces object hallucination. GROUNDHOG also\ndemonstrates better grounding towards complex forms of visual input and\nprovides easy-to-understand diagnosis in failure cases.\n","authors":["Yichi Zhang","Ziqiao Ma","Xiaofeng Gao","Suhaila Shakiah","Qiaozi Gao","Joyce Chai"],"pdf_url":"https://arxiv.org/pdf/2402.16846v2.pdf","comment":"Accepted to CVPR 2024. 
Website: https://groundhog-mllm.github.io/"},{"id":"http://arxiv.org/abs/2404.10775v1","updated":"2024-04-16T17:59:11Z","published":"2024-04-16T17:59:11Z","title":"COMBO: Compositional World Models for Embodied Multi-Agent Cooperation","summary":" In this paper, we investigate the problem of embodied multi-agent\ncooperation, where decentralized agents must cooperate given only partial\negocentric views of the world. To effectively plan in this setting, in contrast\nto learning world dynamics in a single-agent scenario, we must simulate world\ndynamics conditioned on an arbitrary number of agents' actions given only\npartial egocentric visual observations of the world. To address this issue of\npartial observability, we first train generative models to estimate the overall\nworld state given partial egocentric observations. To enable accurate\nsimulation of multiple sets of actions on this world state, we then propose to\nlearn a compositional world model for multi-agent cooperation by factorizing\nthe naturally composable joint actions of multiple agents and compositionally\ngenerating the video. By leveraging this compositional world model, in\ncombination with Vision Language Models to infer the actions of other agents,\nwe can use a tree search procedure to integrate these modules and facilitate\nonline cooperative planning. To evaluate the efficacy of our methods, we create\ntwo challenging embodied multi-agent long-horizon cooperation tasks using the\nThreeDWorld simulator and conduct experiments with 2-4 agents. The results show\nour compositional world model is effective and the framework enables the\nembodied agents to cooperate efficiently with different agents across various\ntasks and an arbitrary number of agents, showing the promising future of our\nproposed framework. More videos can be found at\nhttps://vis-www.cs.umass.edu/combo/.\n","authors":["Hongxin Zhang","Zeyuan Wang","Qiushi Lyu","Zheyuan Zhang","Sunli Chen","Tianmin Shu","Yilun Du","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2404.10775v1.pdf","comment":"23 pages. The first three authors contributed equally"},{"id":"http://arxiv.org/abs/2404.10772v1","updated":"2024-04-16T17:57:19Z","published":"2024-04-16T17:57:19Z","title":"Gaussian Opacity Fields: Efficient and Compact Surface Reconstruction in\n Unbounded Scenes","summary":" Recently, 3D Gaussian Splatting (3DGS) has demonstrated impressive novel view\nsynthesis results, while allowing the rendering of high-resolution images in\nreal-time. However, leveraging 3D Gaussians for surface reconstruction poses\nsignificant challenges due to the explicit and disconnected nature of 3D\nGaussians. In this work, we present Gaussian Opacity Fields (GOF), a novel\napproach for efficient, high-quality, and compact surface reconstruction in\nunbounded scenes. Our GOF is derived from ray-tracing-based volume rendering of\n3D Gaussians, enabling direct geometry extraction from 3D Gaussians by\nidentifying its levelset, without resorting to Poisson reconstruction or TSDF\nfusion as in previous work. We approximate the surface normal of Gaussians as\nthe normal of the ray-Gaussian intersection plane, enabling the application of\nregularization that significantly enhances geometry. Furthermore, we develop an\nefficient geometry extraction method utilizing marching tetrahedra, where the\ntetrahedral grids are induced from 3D Gaussians and thus adapt to the scene's\ncomplexity. 
Our evaluations reveal that GOF surpasses existing 3DGS-based\nmethods in surface reconstruction and novel view synthesis. Further, it\ncompares favorably to, or even outperforms, neural implicit methods in both\nquality and speed.\n","authors":["Zehao Yu","Torsten Sattler","Andreas Geiger"],"pdf_url":"https://arxiv.org/pdf/2404.10772v1.pdf","comment":"Project page:\n https://niujinshuchong.github.io/gaussian-opacity-fields"},{"id":"http://arxiv.org/abs/2312.13150v2","updated":"2024-04-16T17:56:19Z","published":"2023-12-20T16:14:58Z","title":"Splatter Image: Ultra-Fast Single-View 3D Reconstruction","summary":" We introduce the Splatter Image, an ultra-efficient approach for monocular 3D object\nreconstruction. Splatter Image is based on Gaussian Splatting, which allows\nfast and high-quality reconstruction of 3D scenes from multiple images. We\napply Gaussian Splatting to monocular reconstruction by learning a neural\nnetwork that, at test time, performs reconstruction in a feed-forward manner,\nat 38 FPS. Our main innovation is the surprisingly straightforward design of\nthis network, which, using 2D operators, maps the input image to one 3D\nGaussian per pixel. The resulting set of Gaussians thus has the form of an image,\nthe Splatter Image. We further extend the method to take several images as input\nvia cross-view attention. Owing to the speed of the renderer (588 FPS), we use\na single GPU for training while generating entire images at each iteration to\noptimize perceptual metrics like LPIPS. On several synthetic, real,\nmulti-category and large-scale benchmark datasets, we achieve better results in\nterms of PSNR, LPIPS, and other metrics while training and evaluating much\nfaster than prior works. Code, models, demo and more results are available at\nhttps://szymanowiczs.github.io/splatter-image.\n","authors":["Stanislaw Szymanowicz","Christian Rupprecht","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2312.13150v2.pdf","comment":"CVPR 2024. Project page:\n https://szymanowiczs.github.io/splatter-image.html . Code:\n https://github.com/szymanowiczs/splatter-image , Demo:\n https://huggingface.co/spaces/szymanowiczs/splatter_image"},{"id":"http://arxiv.org/abs/2312.13752v2","updated":"2024-04-16T17:55:53Z","published":"2023-12-21T11:33:10Z","title":"Hunting imaging biomarkers in pulmonary fibrosis: Benchmarks of the\n AIIB23 challenge","summary":" Airway-related quantitative imaging biomarkers are crucial for examination,\ndiagnosis, and prognosis in pulmonary diseases. However, the manual delineation\nof airway trees remains prohibitively time-consuming. While significant efforts\nhave been made towards enhancing airway modelling, current publicly available\ndatasets concentrate on lung diseases with moderate morphological variations.\nThe intricate honeycombing patterns present in the lung tissues of fibrotic\nlung disease patients exacerbate the challenges, often leading to various\nprediction errors. To address this issue, the 'Airway-Informed Quantitative CT\nImaging Biomarker for Fibrotic Lung Disease 2023' (AIIB23) competition was\norganized in conjunction with the official 2023 International Conference on\nMedical Image Computing and Computer Assisted Intervention (MICCAI). The airway\nstructures were meticulously annotated by three experienced radiologists.\nCompetitors were encouraged to develop automatic airway segmentation models\nwith high robustness and generalization abilities, followed by exploring the\nmost correlated QIB of mortality prediction. 
A training set of 120\nhigh-resolution computerised tomography (HRCT) scans were publicly released\nwith expert annotations and mortality status. The online validation set\nincorporated 52 HRCT scans from patients with fibrotic lung disease and the\noffline test set included 140 cases from fibrosis and COVID-19 patients. The\nresults have shown that the capacity of extracting airway trees from patients\nwith fibrotic lung disease could be enhanced by introducing voxel-wise weighted\ngeneral union loss and continuity loss. In addition to the competitive image\nbiomarkers for prognosis, a strong airway-derived biomarker (Hazard ratio>1.5,\np<0.0001) was revealed for survival prognostication compared with existing\nclinical measurements, clinician assessment and AI-based biomarkers.\n","authors":["Yang Nan","Xiaodan Xing","Shiyi Wang","Zeyu Tang","Federico N Felder","Sheng Zhang","Roberta Eufrasia Ledda","Xiaoliu Ding","Ruiqi Yu","Weiping Liu","Feng Shi","Tianyang Sun","Zehong Cao","Minghui Zhang","Yun Gu","Hanxiao Zhang","Jian Gao","Pingyu Wang","Wen Tang","Pengxin Yu","Han Kang","Junqiang Chen","Xing Lu","Boyu Zhang","Michail Mamalakis","Francesco Prinzi","Gianluca Carlini","Lisa Cuneo","Abhirup Banerjee","Zhaohu Xing","Lei Zhu","Zacharia Mesbah","Dhruv Jain","Tsiry Mayet","Hongyu Yuan","Qing Lyu","Abdul Qayyum","Moona Mazher","Athol Wells","Simon LF Walsh","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2312.13752v2.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2404.10766v1","updated":"2024-04-16T17:50:09Z","published":"2024-04-16T17:50:09Z","title":"RapidVol: Rapid Reconstruction of 3D Ultrasound Volumes from Sensorless\n 2D Scans","summary":" Two-dimensional (2D) freehand ultrasonography is one of the most commonly\nused medical imaging modalities, particularly in obstetrics and gynaecology.\nHowever, it only captures 2D cross-sectional views of inherently 3D anatomies,\nlosing valuable contextual information. As an alternative to requiring costly\nand complex 3D ultrasound scanners, 3D volumes can be constructed from 2D scans\nusing machine learning. However this usually requires long computational time.\nHere, we propose RapidVol: a neural representation framework to speed up\nslice-to-volume ultrasound reconstruction. We use tensor-rank decomposition, to\ndecompose the typical 3D volume into sets of tri-planes, and store those\ninstead, as well as a small neural network. A set of 2D ultrasound scans, with\ntheir ground truth (or estimated) 3D position and orientation (pose) is all\nthat is required to form a complete 3D reconstruction. Reconstructions are\nformed from real fetal brain scans, and then evaluated by requesting novel\ncross-sectional views. When compared to prior approaches based on fully\nimplicit representation (e.g. neural radiance fields), our method is over 3x\nquicker, 46% more accurate, and if given inaccurate poses is more robust.\nFurther speed-up is also possible by reconstructing from a structural prior\nrather than from scratch.\n","authors":["Mark C. Eid","Pak-Hei Yeung","Madeleine K. Wyburd","João F. Henriques","Ana I. L. 
Namburete"],"pdf_url":"https://arxiv.org/pdf/2404.10766v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10765v1","updated":"2024-04-16T17:50:02Z","published":"2024-04-16T17:50:02Z","title":"RefFusion: Reference Adapted Diffusion Models for 3D Scene Inpainting","summary":" Neural reconstruction approaches are rapidly emerging as the preferred\nrepresentation for 3D scenes, but their limited editability is still posing a\nchallenge. In this work, we propose an approach for 3D scene inpainting -- the\ntask of coherently replacing parts of the reconstructed scene with desired\ncontent. Scene inpainting is an inherently ill-posed task as there exist many\nsolutions that plausibly replace the missing content. A good inpainting method\nshould therefore not only enable high-quality synthesis but also a high degree\nof control. Based on this observation, we focus on enabling explicit control\nover the inpainted content and leverage a reference image as an efficient means\nto achieve this goal. Specifically, we introduce RefFusion, a novel 3D\ninpainting method based on a multi-scale personalization of an image inpainting\ndiffusion model to the given reference view. The personalization effectively\nadapts the prior distribution to the target scene, resulting in a lower\nvariance of score distillation objective and hence significantly sharper\ndetails. Our framework achieves state-of-the-art results for object removal\nwhile maintaining high controllability. We further demonstrate the generality\nof our formulation on other downstream tasks such as object insertion, scene\noutpainting, and sparse view reconstruction.\n","authors":["Ashkan Mirzaei","Riccardo De Lutio","Seung Wook Kim","David Acuna","Jonathan Kelly","Sanja Fidler","Igor Gilitschenski","Zan Gojcic"],"pdf_url":"https://arxiv.org/pdf/2404.10765v1.pdf","comment":"Project page: https://reffusion.github.io"},{"id":"http://arxiv.org/abs/2404.10763v1","updated":"2024-04-16T17:47:16Z","published":"2024-04-16T17:47:16Z","title":"LaDiC: Are Diffusion Models Really Inferior to Autoregressive\n Counterparts for Image-to-Text Generation?","summary":" Diffusion models have exhibited remarkable capabilities in text-to-image\ngeneration. However, their performance in image-to-text generation,\nspecifically image captioning, has lagged behind Auto-Regressive (AR) models,\ncasting doubt on their applicability for such tasks. In this work, we revisit\ndiffusion models, highlighting their capacity for holistic context modeling and\nparallel decoding. With these benefits, diffusion models can alleviate the\ninherent limitations of AR methods, including their slow inference speed, error\npropagation, and unidirectional constraints. Furthermore, we identify the prior\nunderperformance of diffusion models stemming from the absence of an effective\nlatent space for image-text alignment, and the discrepancy between continuous\ndiffusion processes and discrete textual data. In response, we introduce a\nnovel architecture, LaDiC, which utilizes a split BERT to create a dedicated\nlatent space for captions and integrates a regularization module to manage\nvarying text lengths. Our framework also includes a diffuser for semantic\nimage-to-text conversion and a Back&Refine technique to enhance token\ninteractivity during inference. LaDiC achieves state-of-the-art performance for\ndiffusion-based methods on the MS COCO dataset with 38.2 BLEU@4 and 126.2\nCIDEr, demonstrating exceptional performance without pre-training or ancillary\nmodules. 
This indicates strong competitiveness with AR models, revealing the\npreviously untapped potential of diffusion models in image-to-text generation.\n","authors":["Yuchi Wang","Shuhuai Ren","Rundong Gao","Linli Yao","Qingyan Guo","Kaikai An","Jianhong Bai","Xu Sun"],"pdf_url":"https://arxiv.org/pdf/2404.10763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10760v1","updated":"2024-04-16T17:38:26Z","published":"2024-04-16T17:38:26Z","title":"Learning Feature Inversion for Multi-class Anomaly Detection under\n General-purpose COCO-AD Benchmark","summary":" Anomaly detection (AD) is often focused on detecting anomaly areas for\nindustrial quality inspection and medical lesion examination. However, due to\nthe specific scenario targets, the data scale for AD is relatively small, and\nevaluation metrics are still deficient compared to classic vision tasks, such\nas object detection and semantic segmentation. To fill these gaps, this work\nfirst constructs a large-scale and general-purpose COCO-AD dataset by extending\nCOCO to the AD field. This enables fair evaluation and sustainable development\nfor different methods on this challenging benchmark. Moreover, current metrics\nsuch as AU-ROC have nearly reached saturation on simple datasets, which\nprevents a comprehensive evaluation of different methods. Inspired by the\nmetrics in the segmentation field, we further propose several more practical\nthreshold-dependent AD-specific metrics, ie, m$F_1$$^{.2}_{.8}$,\nmAcc$^{.2}_{.8}$, mIoU$^{.2}_{.8}$, and mIoU-max. Motivated by GAN inversion's\nhigh-quality reconstruction capability, we propose a simple but more powerful\nInvAD framework to achieve high-quality feature reconstruction. Our method\nimproves the effectiveness of reconstruction-based methods on popular MVTec AD,\nVisA, and our newly proposed COCO-AD datasets under a multi-class unsupervised\nsetting, where only a single detection model is trained to detect anomalies\nfrom different classes. Extensive ablation experiments have demonstrated the\neffectiveness of each component of our InvAD. Full codes and models are\navailable at https://github.com/zhangzjn/ader.\n","authors":["Jiangning Zhang","Chengjie Wang","Xiangtai Li","Guanzhong Tian","Zhucun Xue","Yong Liu","Guansong Pang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2404.10760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10758v1","updated":"2024-04-16T17:35:35Z","published":"2024-04-16T17:35:35Z","title":"Watch Your Step: Optimal Retrieval for Continual Learning at Scale","summary":" One of the most widely used approaches in continual learning is referred to\nas replay. Replay methods support interleaved learning by storing past\nexperiences in a replay buffer. Although there are methods for selectively\nconstructing the buffer and reprocessing its contents, there is limited\nexploration of the problem of selectively retrieving samples from the buffer.\nCurrent solutions have been tested in limited settings and, more importantly,\nin isolation. Existing work has also not explored the impact of duplicate\nreplays on performance. In this work, we propose a framework for evaluating\nselective retrieval strategies, categorized by simple, independent class- and\nsample-selective primitives. We evaluated several combinations of existing\nstrategies for selective retrieval and present their performances. Furthermore,\nwe propose a set of strategies to prevent duplicate replays and explore whether\nnew samples with low loss values can be learned without replay. 
In an effort to\nmatch our problem setting to a realistic continual learning pipeline, we\nrestrict our experiments to a setting involving a large, pre-trained, open\nvocabulary object detection model, which is fully fine-tuned on a sequence of\n15 datasets.\n","authors":["Truman Hickok","Dhireesha Kudithipudi"],"pdf_url":"https://arxiv.org/pdf/2404.10758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17484v2","updated":"2024-04-16T16:55:35Z","published":"2024-01-30T22:37:24Z","title":"Pixel to Elevation: Learning to Predict Elevation Maps at Long Range\n using Images for Autonomous Offroad Navigation","summary":" Understanding terrain topology at long-range is crucial for the success of\noff-road robotic missions, especially when navigating at high-speeds. LiDAR\nsensors, which are currently heavily relied upon for geometric mapping, provide\nsparse measurements when mapping at greater distances. To address this\nchallenge, we present a novel learning-based approach capable of predicting\nterrain elevation maps at long-range using only onboard egocentric images in\nreal-time. Our proposed method is comprised of three main elements. First, a\ntransformer-based encoder is introduced that learns cross-view associations\nbetween the egocentric views and prior bird-eye-view elevation map predictions.\nSecond, an orientation-aware positional encoding is proposed to incorporate the\n3D vehicle pose information over complex unstructured terrain with multi-view\nvisual image features. Lastly, a history-augmented learn-able map embedding is\nproposed to achieve better temporal consistency between elevation map\npredictions to facilitate the downstream navigational tasks. We experimentally\nvalidate the applicability of our proposed approach for autonomous offroad\nrobotic navigation in complex and unstructured terrain using real-world offroad\ndriving data. Furthermore, the method is qualitatively and quantitatively\ncompared against the current state-of-the-art methods. Extensive field\nexperiments demonstrate that our method surpasses baseline models in accurately\npredicting terrain elevation while effectively capturing the overall terrain\ntopology at long-ranges. Finally, ablation studies are conducted to highlight\nand understand the effect of key components of the proposed approach and\nvalidate their suitability to improve offroad robotic navigation capabilities.\n","authors":["Chanyoung Chung","Georgios Georgakis","Patrick Spieler","Curtis Padgett","Shehryar Khattak"],"pdf_url":"https://arxiv.org/pdf/2401.17484v2.pdf","comment":"8 pages, 6 figures, Accepted in IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2301.13656v3","updated":"2024-04-16T16:52:18Z","published":"2023-01-31T14:18:19Z","title":"A Survey and Benchmark of Automatic Surface Reconstruction from Point\n Clouds","summary":" We present a comprehensive survey and benchmark of both traditional and\nlearning-based methods for surface reconstruction from point clouds. This task\nis particularly challenging for real-world acquisitions due to factors like\nnoise, outliers, non-uniform sampling, and missing data. Traditional approaches\noften simplify the problem by imposing handcrafted priors on either the input\npoint clouds or the resulting surface, a process that can necessitate tedious\nhyperparameter tuning. Conversely, deep learning models have the capability to\ndirectly learn the properties of input point clouds and desired surfaces from\ndata. 
We study the influence of these handcrafted and learned priors on the\nprecision and robustness of surface reconstruction techniques. We evaluate\nvarious time-tested and contemporary methods in a standardized manner. When\nboth trained and evaluated on point clouds with identical characteristics, the\nlearning-based models consistently produce superior surfaces compared to their\ntraditional counterparts$\\unicode{x2013}$even in scenarios involving novel\nshape categories. However, traditional methods demonstrate greater resilience\nto the diverse array of point cloud anomalies commonly found in real-world 3D\nacquisitions. For the benefit of the research community, we make our code and\ndatasets available, inviting further enhancements to learning-based surface\nreconstruction. This can be accessed at\nhttps://github.com/raphaelsulzer/dsr-benchmark .\n","authors":["Raphael Sulzer","Renaud Marlet","Bruno Vallet","Loic Landrieu"],"pdf_url":"https://arxiv.org/pdf/2301.13656v3.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2404.10718v1","updated":"2024-04-16T16:51:27Z","published":"2024-04-16T16:51:27Z","title":"GazeHTA: End-to-end Gaze Target Detection with Head-Target Association","summary":" We propose an end-to-end approach for gaze target detection: predicting a\nhead-target connection between individuals and the target image regions they\nare looking at. Most of the existing methods use independent components such as\noff-the-shelf head detectors or have problems in establishing associations\nbetween heads and gaze targets. In contrast, we investigate an end-to-end\nmulti-person Gaze target detection framework with Heads and Targets Association\n(GazeHTA), which predicts multiple head-target instances based solely on input\nscene image. GazeHTA addresses challenges in gaze target detection by (1)\nleveraging a pre-trained diffusion model to extract scene features for rich\nsemantic understanding, (2) re-injecting a head feature to enhance the head\npriors for improved head understanding, and (3) learning a connection map as\nthe explicit visual associations between heads and gaze targets. Our extensive\nexperimental results demonstrate that GazeHTA outperforms state-of-the-art gaze\ntarget detection methods and two adapted diffusion-based baselines on two\nstandard datasets.\n","authors":["Zhi-Yi Lin","Jouh Yeong Chew","Jan van Gemert","Xucong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.10718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10717v1","updated":"2024-04-16T16:51:12Z","published":"2024-04-16T16:51:12Z","title":"Mixed Prototype Consistency Learning for Semi-supervised Medical Image\n Segmentation","summary":" Recently, prototype learning has emerged in semi-supervised medical image\nsegmentation and achieved remarkable performance. However, the scarcity of\nlabeled data limits the expressiveness of prototypes in previous methods,\npotentially hindering the complete representation of prototypes for class\nembedding. To address this problem, we propose the Mixed Prototype Consistency\nLearning (MPCL) framework, which includes a Mean Teacher and an auxiliary\nnetwork. The Mean Teacher generates prototypes for labeled and unlabeled data,\nwhile the auxiliary network produces additional prototypes for mixed data\nprocessed by CutMix. Through prototype fusion, mixed prototypes provide extra\nsemantic information to both labeled and unlabeled prototypes. 
High-quality\nglobal prototypes for each class are formed by fusing two enhanced prototypes,\noptimizing the distribution of hidden embeddings used in consistency learning.\nExtensive experiments on the left atrium and type B aortic dissection datasets\ndemonstrate MPCL's superiority over previous state-of-the-art approaches,\nconfirming the effectiveness of our framework. The code will be released soon.\n","authors":["Lijian Li"],"pdf_url":"https://arxiv.org/pdf/2404.10717v1.pdf","comment":"15 pages, 2 figures"},{"id":"http://arxiv.org/abs/2404.10716v1","updated":"2024-04-16T16:50:35Z","published":"2024-04-16T16:50:35Z","title":"MOWA: Multiple-in-One Image Warping Model","summary":" While recent image warping approaches achieved remarkable success on existing\nbenchmarks, they still require training separate models for each specific task\nand cannot generalize well to different camera models or customized\nmanipulations. To address diverse types of warping in practice, we propose a\nMultiple-in-One image WArping model (named MOWA) in this work. Specifically, we\nmitigate the difficulty of multi-task learning by disentangling the motion\nestimation at both the region level and pixel level. To further enable dynamic\ntask-aware image warping, we introduce a lightweight point-based classifier\nthat predicts the task type, serving as prompts to modulate the feature maps\nfor better estimation. To our knowledge, this is the first work that solves\nmultiple practical warping tasks in one single model. Extensive experiments\ndemonstrate that our MOWA, which is trained on six tasks for multiple-in-one\nimage warping, outperforms state-of-the-art task-specific models across most\ntasks. Moreover, MOWA also exhibits promising potential to generalize into\nunseen scenes, as evidenced by cross-domain and zero-shot evaluations. The code\nwill be made publicly available.\n","authors":["Kang Liao","Zongsheng Yue","Zhonghua Wu","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2404.10716v1.pdf","comment":"Project page: https://kangliao929.github.io/projects/mowa/"},{"id":"http://arxiv.org/abs/2404.10714v1","updated":"2024-04-16T16:43:36Z","published":"2024-04-16T16:43:36Z","title":"AV-GAN: Attention-Based Varifocal Generative Adversarial Network for\n Uneven Medical Image Translation","summary":" Different types of staining highlight different structures in organs, thereby\nassisting in diagnosis. However, due to the impossibility of repeated staining,\nwe cannot obtain different types of stained slides of the same tissue area.\nTranslating the slide that is easy to obtain (e.g., H&E) to slides of staining\ntypes difficult to obtain (e.g., MT, PAS) is a promising way to solve this\nproblem. However, some regions are closely connected to other regions, and to\nmaintain this connection, they often have complex structures and are difficult\nto translate, which may lead to wrong translations. In this paper, we propose\nthe Attention-Based Varifocal Generative Adversarial Network (AV-GAN), which\nsolves multiple problems in pathologic image translation tasks, such as uneven\ntranslation difficulty in different regions, mutual interference of multiple\nresolution information, and nuclear deformation. Specifically, we develop an\nAttention-Based Key Region Selection Module, which can attend to regions with\nhigher translation difficulty. We then develop a Varifocal Module to translate\nthese regions at multiple resolutions. 
Experimental results show that our\nproposed AV-GAN outperforms existing image translation methods with two virtual\nkidney tissue staining tasks and improves FID values by 15.9 and 4.16\nrespectively in the H&E-MT and H&E-PAS tasks.\n","authors":["Zexin Li","Yiyang Lin","Zijie Fang","Shuyan Li","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2404.10714v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10713v1","updated":"2024-04-16T16:43:14Z","published":"2024-04-16T16:43:14Z","title":"A Plausibility Study of Using Augmented Reality in the\n Ventriculoperitoneal Shunt Operations","summary":" The field of augmented reality (AR) has undergone substantial growth, finding\ndiverse applications in the medical industry. This paper delves into various\ntechniques employed in medical surgeries, scrutinizing factors such as cost,\nimplementation, and accessibility. The focus of this exploration is on AR-based\nsolutions, with a particular emphasis on addressing challenges and proposing an\ninnovative solution for ventriculoperitoneal shunt (VP) operations. The\nproposed solution introduces a novel flow in the pre-surgery phase, aiming to\nsubstantially reduce setup time and operation duration by creating 3D models of\nthe skull and ventricles. Experiments are conducted where the models are\nvisualized on a 3D- printed skull through an AR device, specifically the\nMicrosoft HoloLens 2. The paper then conducts an in-depth analysis of this\nproposed solution, discussing its feasibility, advantages, limitations,and\nfuture implications.\n","authors":["Tandin Dorji","Pakinee Aimmanee","Vich Yindeedej"],"pdf_url":"https://arxiv.org/pdf/2404.10713v1.pdf","comment":"Accepted for the 2024 - 16th International Conference on Knowledge\n and Smart Technology (KST). To be published in IEEEXplore Digital Library\n (#61284), ISBN: 979-8-3503-7073-7"},{"id":"http://arxiv.org/abs/2404.10710v1","updated":"2024-04-16T16:36:50Z","published":"2024-04-16T16:36:50Z","title":"Dual Modalities of Text: Visual and Textual Generative Pre-training","summary":" Harnessing visual texts represents a burgeoning frontier in the evolution of\nlanguage modeling. In this paper, we introduce a novel pre-training framework\nfor a suite of pixel-based autoregressive language models, pre-training on a\ncorpus of over 400 million documents rendered as RGB images. Our approach is\ncharacterized by a dual-modality training regimen, engaging both visual data\nthrough next patch prediction with a regression head and textual data via next\ntoken prediction with a classification head. This study is particularly focused\non investigating the synergistic interplay between visual and textual\nmodalities of language. Our comprehensive evaluation across a diverse array of\nbenchmarks reveals that the confluence of visual and textual data substantially\naugments the efficacy of pixel-based language models. Notably, our findings\nshow that a unidirectional pixel-based model, devoid of textual data during\ntraining, can match the performance levels of advanced bidirectional\npixel-based models on various language understanding benchmarks. This work\nhighlights the considerable untapped potential of integrating visual and\ntextual information for language modeling purposes. 
We will release our code,\ndata, and checkpoints to inspire further research advancement.\n","authors":["Yekun Chai","Qingyi Liu","Jingwu Xiao","Shuohuan Wang","Yu Sun","Hua Wu"],"pdf_url":"https://arxiv.org/pdf/2404.10710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.11679v2","updated":"2024-04-16T16:28:42Z","published":"2021-12-22T06:05:02Z","title":"Ghost-dil-NetVLAD: A Lightweight Neural Network for Visual Place\n Recognition","summary":" Visual place recognition (VPR) is a challenging task with the unbalance\nbetween enormous computational cost and high recognition performance. Thanks to\nthe practical feature extraction ability of the lightweight convolution neural\nnetworks (CNNs) and the train-ability of the vector of locally aggregated\ndescriptors (VLAD) layer, we propose a lightweight weakly supervised end-to-end\nneural network consisting of a front-ended perception model called GhostCNN and\na learnable VLAD layer as a back-end. GhostCNN is based on Ghost modules that\nare lightweight CNN-based architectures. They can generate redundant feature\nmaps using linear operations instead of the traditional convolution process,\nmaking a good trade-off between computation resources and recognition accuracy.\nTo enhance our proposed lightweight model further, we add dilated convolutions\nto the Ghost module to get features containing more spatial semantic\ninformation, improving accuracy. Finally, rich experiments conducted on a\ncommonly used public benchmark and our private dataset validate that the\nproposed neural network reduces the FLOPs and parameters of VGG16-NetVLAD by\n99.04% and 80.16%, respectively. Besides, both models achieve similar accuracy.\n","authors":["Qingyuan Gong","Yu Liu","Liqiang Zhang","Renhe Liu"],"pdf_url":"https://arxiv.org/pdf/2112.11679v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16278v2","updated":"2024-04-16T16:26:35Z","published":"2023-11-27T19:34:04Z","title":"VehicleGAN: Pair-flexible Pose Guided Image Synthesis for Vehicle\n Re-identification","summary":" Vehicle Re-identification (Re-ID) has been broadly studied in the last\ndecade; however, the different camera view angle leading to confused\ndiscrimination in the feature subspace for the vehicles of various poses, is\nstill challenging for the Vehicle Re-ID models in the real world. To promote\nthe Vehicle Re-ID models, this paper proposes to synthesize a large number of\nvehicle images in the target pose, whose idea is to project the vehicles of\ndiverse poses into the unified target pose so as to enhance feature\ndiscrimination. Considering that the paired data of the same vehicles in\ndifferent traffic surveillance cameras might be not available in the real\nworld, we propose the first Pair-flexible Pose Guided Image Synthesis method\nfor Vehicle Re-ID, named as VehicleGAN in this paper, which works for both\nsupervised and unsupervised settings without the knowledge of geometric 3D\nmodels. Because of the feature distribution difference between real and\nsynthetic data, simply training a traditional metric learning based Re-ID model\nwith data-level fusion (i.e., data augmentation) is not satisfactory, therefore\nwe propose a new Joint Metric Learning (JML) via effective feature-level fusion\nfrom both real and synthetic data. 
Intensive experimental results on the public\nVeRi-776 and VehicleID datasets prove the accuracy and effectiveness of our\nproposed VehicleGAN and JML.\n","authors":["Baolu Li","Ping Liu","Lan Fu","Jinlong Li","Jianwu Fang","Zhigang Xu","Hongkai Yu"],"pdf_url":"https://arxiv.org/pdf/2311.16278v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10700v1","updated":"2024-04-16T16:17:48Z","published":"2024-04-16T16:17:48Z","title":"Rawformer: Unpaired Raw-to-Raw Translation for Learnable Camera ISPs","summary":" Modern smartphone camera quality heavily relies on the image signal processor\n(ISP) to enhance captured raw images, utilizing carefully designed modules to\nproduce final output images encoded in a standard color space (e.g., sRGB).\nNeural-based end-to-end learnable ISPs offer promising advancements,\npotentially replacing traditional ISPs with their ability to adapt without\nrequiring extensive tuning for each new camera model, as is often the case for\nnearly every module in traditional ISPs. However, the key challenge with the\nrecent learning-based ISPs is the urge to collect large paired datasets for\neach distinct camera model due to the influence of intrinsic camera\ncharacteristics on the formation of input raw images. This paper tackles this\nchallenge by introducing a novel method for unpaired learning of raw-to-raw\ntranslation across diverse cameras. Specifically, we propose Rawformer, an\nunsupervised Transformer-based encoder-decoder method for raw-to-raw\ntranslation. It accurately maps raw images captured by a certain camera to the\ntarget camera, facilitating the generalization of learnable ISPs to new unseen\ncameras. Our method demonstrates superior performance on real camera datasets,\nachieving higher accuracy compared to previous state-of-the-art techniques, and\npreserving a more robust correlation between the original and translated raw\nimages.\n","authors":["Georgy Perevozchikov","Nancy Mehta","Mahmoud Afifi","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2404.10700v1.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.10699v1","updated":"2024-04-16T16:16:40Z","published":"2024-04-16T16:16:40Z","title":"ECLAIR: A High-Fidelity Aerial LiDAR Dataset for Semantic Segmentation","summary":" We introduce ECLAIR (Extended Classification of Lidar for AI Recognition), a\nnew outdoor large-scale aerial LiDAR dataset designed specifically for\nadvancing research in point cloud semantic segmentation. As the most extensive\nand diverse collection of its kind to date, the dataset covers a total area of\n10$km^2$ with close to 600 million points and features eleven distinct object\ncategories. To guarantee the dataset's quality and utility, we have thoroughly\ncurated the point labels through an internal team of experts, ensuring accuracy\nand consistency in semantic labeling. The dataset is engineered to move forward\nthe fields of 3D urban modeling, scene understanding, and utility\ninfrastructure management by presenting new challenges and potential\napplications. 
As a benchmark, we report qualitative and quantitative analysis\nof a voxel-based point cloud segmentation approach based on the Minkowski\nEngine.\n","authors":["Iaroslav Melekhov","Anand Umashankar","Hyeong-Jin Kim","Vladislav Serkov","Dusty Argyle"],"pdf_url":"https://arxiv.org/pdf/2404.10699v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.10690v1","updated":"2024-04-16T16:10:23Z","published":"2024-04-16T16:10:23Z","title":"MathWriting: A Dataset For Handwritten Mathematical Expression\n Recognition","summary":" We introduce MathWriting, the largest online handwritten mathematical\nexpression dataset to date. It consists of 230k human-written samples and an\nadditional 400k synthetic ones. MathWriting can also be used for offline HME\nrecognition and is larger than all existing offline HME datasets like\nIM2LATEX-100K. We introduce a benchmark based on MathWriting data in order to\nadvance research on both online and offline HME recognition.\n","authors":["Philippe Gervais","Asya Fadeeva","Andrii Maksai"],"pdf_url":"https://arxiv.org/pdf/2404.10690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10688v1","updated":"2024-04-16T16:08:59Z","published":"2024-04-16T16:08:59Z","title":"Efficient Conditional Diffusion Model with Probability Flow Sampling for\n Image Super-resolution","summary":" Image super-resolution is a fundamentally ill-posed problem because multiple\nvalid high-resolution images exist for one low-resolution image.\nSuper-resolution methods based on diffusion probabilistic models can deal with\nthe ill-posed nature by learning the distribution of high-resolution images\nconditioned on low-resolution images, avoiding the problem of blurry images in\nPSNR-oriented methods. However, existing diffusion-based super-resolution\nmethods have high time consumption with the use of iterative sampling, while\nthe quality and consistency of generated images are less than ideal due to\nproblems like color shifting. In this paper, we propose Efficient Conditional\nDiffusion Model with Probability Flow Sampling (ECDP) for image\nsuper-resolution. To reduce the time consumption, we design a continuous-time\nconditional diffusion model for image super-resolution, which enables the use\nof probability flow sampling for efficient generation. Additionally, to improve\nthe consistency of generated images, we propose a hybrid parametrization for\nthe denoiser network, which interpolates between the data-predicting\nparametrization and the noise-predicting parametrization for different noise\nscales. Moreover, we design an image quality loss as a complement to the score\nmatching loss of diffusion models, further improving the consistency and\nquality of super-resolution. Extensive experiments on DIV2K, ImageNet, and\nCelebA demonstrate that our method achieves higher super-resolution quality\nthan existing diffusion-based image super-resolution methods while having lower\ntime consumption. Our code is available at https://github.com/Yuan-Yutao/ECDP.\n","authors":["Yutao Yuan","Chun Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.10688v1.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2404.10685v1","updated":"2024-04-16T16:04:38Z","published":"2024-04-16T16:04:38Z","title":"Generating Human Interaction Motions in Scenes with Text Control","summary":" We present TeSMo, a method for text-controlled scene-aware motion generation\nbased on denoising diffusion models. 
Previous text-to-motion methods focus on\ncharacters in isolation without considering scenes due to the limited\navailability of datasets that include motion, text descriptions, and\ninteractive scenes. Our approach begins with pre-training a scene-agnostic\ntext-to-motion diffusion model, emphasizing goal-reaching constraints on\nlarge-scale motion-capture datasets. We then enhance this model with a\nscene-aware component, fine-tuned using data augmented with detailed scene\ninformation, including ground plane and object shapes. To facilitate training,\nwe embed annotated navigation and interaction motions within scenes. The\nproposed method produces realistic and diverse human-object interactions, such\nas navigation and sitting, in different scenes with various object shapes,\norientations, initial body positions, and poses. Extensive experiments\ndemonstrate that our approach surpasses prior techniques in terms of the\nplausibility of human-scene interactions, as well as the realism and variety of\nthe generated motions. Code will be released upon publication of this work at\nhttps://research.nvidia.com/labs/toronto-ai/tesmo.\n","authors":["Hongwei Yi","Justus Thies","Michael J. Black","Xue Bin Peng","Davis Rempe"],"pdf_url":"https://arxiv.org/pdf/2404.10685v1.pdf","comment":"Project Page: https://research.nvidia.com/labs/toronto-ai/tesmo/"},{"id":"http://arxiv.org/abs/2404.10681v1","updated":"2024-04-16T15:58:49Z","published":"2024-04-16T15:58:49Z","title":"StyleCity: Large-Scale 3D Urban Scenes Stylization with Vision-and-Text\n Reference via Progressive Optimization","summary":" Creating large-scale virtual urban scenes with variant styles is inherently\nchallenging. To facilitate prototypes of virtual production and bypass the need\nfor complex materials and lighting setups, we introduce the first\nvision-and-text-driven texture stylization system for large-scale urban scenes,\nStyleCity. Taking an image and text as references, StyleCity stylizes a 3D\ntextured mesh of a large-scale urban scene in a semantics-aware fashion and\ngenerates a harmonic omnidirectional sky background. To achieve that, we\npropose to stylize a neural texture field by transferring 2D vision-and-text\npriors to 3D globally and locally. During 3D stylization, we progressively\nscale the planned training views of the input 3D scene at different levels in\norder to preserve high-quality scene content. We then optimize the scene style\nglobally by adapting the scale of the style image with the scale of the\ntraining views. Moreover, we enhance local semantics consistency by the\nsemantics-aware style loss which is crucial for photo-realistic stylization.\nBesides texture stylization, we further adopt a generative diffusion model to\nsynthesize a style-consistent omnidirectional sky image, which offers a more\nimmersive atmosphere and assists the semantic stylization process. The stylized\nneural texture field can be baked into an arbitrary-resolution texture,\nenabling seamless integration into conventional rendering pipelines and\nsignificantly easing the virtual production prototyping process. 
Extensive\nexperiments demonstrate our stylized scenes' superiority in qualitative and\nquantitative performance and user preferences.\n","authors":["Yingshu Chen","Huajian Huang","Tuan-Anh Vu","Ka Chun Shum","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2404.10681v1.pdf","comment":"project page: https://chenyingshu.github.io/stylecity3d/"},{"id":"http://arxiv.org/abs/2312.02126v3","updated":"2024-04-16T15:50:56Z","published":"2023-12-04T18:53:24Z","title":"SplaTAM: Splat, Track & Map 3D Gaussians for Dense RGB-D SLAM","summary":" Dense simultaneous localization and mapping (SLAM) is crucial for robotics\nand augmented reality applications. However, current methods are often hampered\nby the non-volumetric or implicit way they represent a scene. This work\nintroduces SplaTAM, an approach that, for the first time, leverages explicit\nvolumetric representations, i.e., 3D Gaussians, to enable high-fidelity\nreconstruction from a single unposed RGB-D camera, surpassing the capabilities\nof existing methods. SplaTAM employs a simple online tracking and mapping\nsystem tailored to the underlying Gaussian representation. It utilizes a\nsilhouette mask to elegantly capture the presence of scene density. This\ncombination enables several benefits over prior representations, including fast\nrendering and dense optimization, quickly determining if areas have been\npreviously mapped, and structured map expansion by adding more Gaussians.\nExtensive experiments show that SplaTAM achieves up to 2x superior performance\nin camera pose estimation, map construction, and novel-view synthesis over\nexisting methods, paving the way for more immersive high-fidelity SLAM\napplications.\n","authors":["Nikhil Keetha","Jay Karhade","Krishna Murthy Jatavallabhula","Gengshan Yang","Sebastian Scherer","Deva Ramanan","Jonathon Luiten"],"pdf_url":"https://arxiv.org/pdf/2312.02126v3.pdf","comment":"CVPR 2024. Website: https://spla-tam.github.io/"},{"id":"http://arxiv.org/abs/2404.10667v1","updated":"2024-04-16T15:43:22Z","published":"2024-04-16T15:43:22Z","title":"VASA-1: Lifelike Audio-Driven Talking Faces Generated in Real Time","summary":" We introduce VASA, a framework for generating lifelike talking faces with\nappealing visual affective skills (VAS) given a single static image and a\nspeech audio clip. Our premiere model, VASA-1, is capable of not only producing\nlip movements that are exquisitely synchronized with the audio, but also\ncapturing a large spectrum of facial nuances and natural head motions that\ncontribute to the perception of authenticity and liveliness. The core\ninnovations include a holistic facial dynamics and head movement generation\nmodel that works in a face latent space, and the development of such an\nexpressive and disentangled face latent space using videos. Through extensive\nexperiments including evaluation on a set of new metrics, we show that our\nmethod significantly outperforms previous methods along various dimensions\ncomprehensively. Our method not only delivers high video quality with realistic\nfacial and head dynamics but also supports the online generation of 512x512\nvideos at up to 40 FPS with negligible starting latency. It paves the way for\nreal-time engagements with lifelike avatars that emulate human conversational\nbehaviors.\n","authors":["Sicheng Xu","Guojun Chen","Yu-Xiao Guo","Jiaolong Yang","Chong Li","Zhenyu Zang","Yizhong Zhang","Xin Tong","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2404.10667v1.pdf","comment":"Tech Report. 
Project webpage:\n https://www.microsoft.com/en-us/research/project/vasa-1/"},{"id":"http://arxiv.org/abs/2404.10664v1","updated":"2024-04-16T15:40:18Z","published":"2024-04-16T15:40:18Z","title":"Assessing The Impact of CNN Auto Encoder-Based Image Denoising on Image\n Classification Tasks","summary":" Images captured from the real world are often affected by different types of\nnoise, which can significantly impact the performance of Computer Vision\nsystems and the quality of visual data. This study presents a novel approach\nfor defect detection in noisy images of casting products, specifically focusing on\nsubmersible pump impellers. The methodology involves utilizing deep learning\nmodels such as VGG16, InceptionV3, and other models in both the spatial and\nfrequency domains to identify noise types and defect status. The research\nprocess begins with preprocessing images, followed by applying denoising\ntechniques tailored to specific noise categories. The goal is to enhance the\naccuracy and robustness of defect detection by integrating noise detection and\ndenoising into the classification pipeline. The study achieved remarkable\nresults using VGG16 for noise type classification in the frequency domain,\nachieving an accuracy of over 99%. Removal of salt and pepper noise resulted in\nan average SSIM of 87.9, while Gaussian noise removal had an average SSIM of\n64.0, and periodic noise removal yielded an average SSIM of 81.6. This\ncomprehensive approach showcases the effectiveness of the deep AutoEncoder\nmodel and the median filter as denoising strategies in real-world industrial\napplications. Finally, our study reports significant improvements in binary\nclassification accuracy for defect detection compared to previous methods. For\nthe VGG16 classifier, accuracy increased from 94.6% to 97.0%, demonstrating the\neffectiveness of the proposed noise detection and denoising approach.\nSimilarly, for the InceptionV3 classifier, accuracy improved from 84.7% to\n90.0%, further validating the benefits of integrating noise analysis into the\nclassification pipeline.\n","authors":["Mohsen Hami","Mahdi JameBozorg"],"pdf_url":"https://arxiv.org/pdf/2404.10664v1.pdf","comment":"13 pages, 13 figures, 13th International conference on innovative\n technologies in the field of science, engineering and technology"},{"id":"http://arxiv.org/abs/2404.07922v3","updated":"2024-04-16T15:33:45Z","published":"2024-04-11T17:09:28Z","title":"LaVy: Vietnamese Multimodal Large Language Model","summary":" Large Language Models (LLMs) and Multimodal Large Language Models (MLLMs)\nhave taken the world by storm with impressive abilities in complex reasoning\nand linguistic comprehension. While there is a plethora of works related to\nVietnamese Large Language Models, the lack of high-quality resources in\nmultimodality limits the progress of Vietnamese MLLMs. In this paper, we\npioneer in addressing this by introducing LaVy, a state-of-the-art Vietnamese\nMLLM, and we also introduce the LaVy-Bench benchmark designed for evaluating\nMLLMs' understanding of Vietnamese visual language tasks. 
Our project is\npublic at https://github.com/baochi0212/LaVy\n","authors":["Chi Tran","Huong Le Thanh"],"pdf_url":"https://arxiv.org/pdf/2404.07922v3.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2404.10633v1","updated":"2024-04-16T15:04:55Z","published":"2024-04-16T15:04:55Z","title":"Contextrast: Contextual Contrastive Learning for Semantic Segmentation","summary":" Despite great improvements in semantic segmentation, challenges persist\nbecause of the lack of local/global contexts and the relationship between them.\nIn this paper, we propose Contextrast, a contrastive learning-based semantic\nsegmentation method that allows to capture local/global contexts and comprehend\ntheir relationships. Our proposed method comprises two parts: a) contextual\ncontrastive learning (CCL) and b) boundary-aware negative (BANE) sampling.\nContextual contrastive learning obtains local/global context from multi-scale\nfeature aggregation and inter/intra-relationship of features for better\ndiscrimination capabilities. Meanwhile, BANE sampling selects embedding\nfeatures along the boundaries of incorrectly predicted regions to employ them\nas harder negative samples on our contrastive learning, resolving segmentation\nissues along the boundary region by exploiting fine-grained details. We\ndemonstrate that our Contextrast substantially enhances the performance of\nsemantic segmentation networks, outperforming state-of-the-art contrastive\nlearning approaches on diverse public datasets, e.g. Cityscapes, CamVid,\nPASCAL-C, COCO-Stuff, and ADE20K, without an increase in computational cost\nduring inference.\n","authors":["Changki Sung","Wanhee Kim","Jungho An","Wooju Lee","Hyungtae Lim","Hyun Myung"],"pdf_url":"https://arxiv.org/pdf/2404.10633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09172v2","updated":"2024-04-16T14:56:32Z","published":"2024-04-14T07:36:18Z","title":"LoopAnimate: Loopable Salient Object Animation","summary":" Research on diffusion model-based video generation has advanced rapidly.\nHowever, limitations in object fidelity and generation length hinder its\npractical applications. Additionally, specific domains like animated wallpapers\nrequire seamless looping, where the first and last frames of the video match\nseamlessly. To address these challenges, this paper proposes LoopAnimate, a\nnovel method for generating videos with consistent start and end frames. To\nenhance object fidelity, we introduce a framework that decouples multi-level\nimage appearance and textual semantic information. Building upon an\nimage-to-image diffusion model, our approach incorporates both pixel-level and\nfeature-level information from the input image, injecting image appearance and\ntextual semantic embeddings at different positions of the diffusion model.\nExisting UNet-based video generation models require to input the entire videos\nduring training to encode temporal and positional information at once. However,\ndue to limitations in GPU memory, the number of frames is typically restricted\nto 16. 
To address this, this paper proposes a three-stage training strategy\nwith progressively increasing frame numbers and reducing fine-tuning modules.\nAdditionally, we introduce the Temporal Enhanced Motion Module (TEMM) to extend\nthe capacity for encoding temporal and positional information up to 36 frames.\nThe proposed LoopAnimate thus, for the first time, extends the single-pass\ngeneration length of UNet-based video generation models to 35 frames while\nmaintaining high-quality video generation. Experiments demonstrate that\nLoopAnimate achieves state-of-the-art performance in both objective metrics,\nsuch as fidelity and temporal consistency, and subjective evaluation results.\n","authors":["Fanyi Wang","Peng Liu","Haotian Hu","Dan Meng","Jingwen Su","Jinjin Xu","Yanhao Zhang","Xiaoming Ren","Zhiwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.09172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10626v1","updated":"2024-04-16T14:52:15Z","published":"2024-04-16T14:52:15Z","title":"Exploring selective image matching methods for zero-shot and few-sample\n unsupervised domain adaptation of urban canopy prediction","summary":" We explore simple methods for adapting a trained multi-task UNet which\npredicts canopy cover and height to a new geographic setting using remotely\nsensed data, without the need to train a domain-adaptive classifier or perform\nextensive fine-tuning. Extending previous research, we followed a selective\nalignment process to identify similar images in the two geographical domains\nand then tested an array of data-based unsupervised domain adaptation\napproaches in a zero-shot setting as well as with a small amount of\nfine-tuning. We find that the selectively aligned data-based image matching\nmethods produce promising results in a zero-shot setting, and even more so with\na small amount of fine-tuning. These methods outperform both an untransformed\nbaseline and a popular data-based image-to-image translation model. The best\nperforming methods were pixel distribution adaptation and Fourier domain\nadaptation on the canopy cover and height tasks, respectively.\n","authors":["John Francis","Stephen Law"],"pdf_url":"https://arxiv.org/pdf/2404.10626v1.pdf","comment":"ICLR 2024 Machine Learning for Remote Sensing (ML4RS) Workshop"},{"id":"http://arxiv.org/abs/2404.10625v1","updated":"2024-04-16T14:48:40Z","published":"2024-04-16T14:48:40Z","title":"Gaussian Splatting Decoder for 3D-aware Generative Adversarial Networks","summary":" NeRF-based 3D-aware Generative Adversarial Networks (GANs) like EG3D or\nGIRAFFE have shown very high rendering quality under large representational\nvariety. However, rendering with Neural Radiance Fields poses challenges for 3D\napplications: First, the significant computational demands of NeRF rendering\npreclude its use on low-power devices, such as mobiles and VR/AR headsets.\nSecond, implicit representations based on neural networks are difficult to\nincorporate into explicit 3D scenes, such as VR environments or video games. 3D\nGaussian Splatting (3DGS) overcomes these limitations by providing an explicit\n3D representation that can be rendered efficiently at high frame rates. In this\nwork, we present a novel approach that combines the high rendering quality of\nNeRF-based 3D-aware GANs with the flexibility and computational advantages of\n3DGS. 
By training a decoder that maps implicit NeRF representations to explicit\n3D Gaussian Splatting attributes, we can integrate the representational\ndiversity and quality of 3D GANs into the ecosystem of 3D Gaussian Splatting\nfor the first time. Additionally, our approach allows for a high resolution GAN\ninversion and real-time GAN editing with 3D Gaussian Splatting scenes.\n","authors":["Florian Barthel","Arian Beckmann","Wieland Morgenstern","Anna Hilsmann","Peter Eisert"],"pdf_url":"https://arxiv.org/pdf/2404.10625v1.pdf","comment":"CVPRW"},{"id":"http://arxiv.org/abs/2403.08801v5","updated":"2024-04-16T14:48:34Z","published":"2024-02-05T12:33:37Z","title":"CoBra: Complementary Branch Fusing Class and Semantic Knowledge for\n Robust Weakly Supervised Semantic Segmentation","summary":" Leveraging semantically precise pseudo masks derived from image-level class\nknowledge for segmentation, namely image-level Weakly Supervised Semantic\nSegmentation (WSSS), still remains challenging. While Class Activation Maps\n(CAMs) using CNNs have steadily been contributing to the success of WSSS, the\nresulting activation maps often narrowly focus on class-specific parts (e.g.,\nonly face of human). On the other hand, recent works based on vision\ntransformers (ViT) have shown promising results based on their self-attention\nmechanism to capture the semantic parts but fail in capturing complete\nclass-specific details (e.g., entire body parts of human but also with a dog\nnearby). In this work, we propose Complementary Branch (CoBra), a novel dual\nbranch framework consisting of two distinct architectures which provide\nvaluable complementary knowledge of class (from CNN) and semantic (from ViT) to\neach branch. In particular, we learn Class-Aware Projection (CAP) for the CNN\nbranch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly\nfuse their complementary knowledge and facilitate a new type of extra\npatch-level supervision. Our model, through CoBra, fuses CNN and ViT's\ncomplementary outputs to create robust pseudo masks that integrate both class\nand semantic information effectively. Extensive experiments qualitatively and\nquantitatively investigate how CNN and ViT complement each other on the PASCAL\nVOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not\nonly the masks generated by our model, but also the segmentation results\nderived from utilizing these masks as pseudo labels.\n","authors":["Woojung Han","Seil Kang","Kyobin Choo","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.08801v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14045v2","updated":"2024-04-16T14:45:44Z","published":"2024-02-21T13:06:48Z","title":"A Systematic Review of Low-Rank and Local Low-Rank Matrix Approximation\n in Big Data Medical Imaging","summary":" The large volume and complexity of medical imaging datasets are bottlenecks\nfor storage, transmission, and processing. To tackle these challenges, the\napplication of low-rank matrix approximation (LRMA) and its derivative, local\nLRMA (LLRMA) has demonstrated potential.\n A detailed analysis of the literature identifies LRMA and LLRMA methods\napplied to various imaging modalities, and the challenges and limitations\nassociated with existing LRMA and LLRMA methods are addressed.\n We note a significant shift towards a preference for LLRMA in the medical\nimaging field since 2015, demonstrating its potential and effectiveness in\ncapturing complex structures in medical data compared to LRMA. 
Acknowledging\nthe limitations of shallow similarity methods used with LLRMA, we suggest\nadvanced semantic image segmentation for similarity measure, explaining in\ndetail how it can measure similar patches and their feasibility.\n We note that LRMA and LLRMA are mainly applied to unstructured medical data,\nand we propose extending their application to different medical data types,\nincluding structured and semi-structured. This paper also discusses how LRMA\nand LLRMA can be applied to regular data with missing entries and the impact of\ninaccuracies in predicting missing values and their effects. We discuss the\nimpact of patch size and propose the use of random search (RS) to determine the\noptimal patch size. To enhance feasibility, a hybrid approach using Bayesian\noptimization and RS is proposed, which could improve the application of LRMA\nand LLRMA in medical imaging.\n","authors":["Sisipho Hamlomo","Marcellin Atemkeng","Yusuf Brima","Chuneeta Nunhokee","Jeremy Baxter"],"pdf_url":"https://arxiv.org/pdf/2402.14045v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10620v1","updated":"2024-04-16T14:43:33Z","published":"2024-04-16T14:43:33Z","title":"PyTorchGeoNodes: Enabling Differentiable Shape Programs for 3D Shape\n Reconstruction","summary":" We propose PyTorchGeoNodes, a differentiable module for reconstructing 3D\nobjects from images using interpretable shape programs. In comparison to\ntraditional CAD model retrieval methods, the use of shape programs for 3D\nreconstruction allows for reasoning about the semantic properties of\nreconstructed objects, editing, low memory footprint, etc. However, the\nutilization of shape programs for 3D scene understanding has been largely\nneglected in past works. As our main contribution, we enable gradient-based\noptimization by introducing a module that translates shape programs designed in\nBlender, for example, into efficient PyTorch code. We also provide a method\nthat relies on PyTorchGeoNodes and is inspired by Monte Carlo Tree Search\n(MCTS) to jointly optimize discrete and continuous parameters of shape programs\nand reconstruct 3D objects for input scenes. In our experiments, we apply our\nalgorithm to reconstruct 3D objects in the ScanNet dataset and evaluate our\nresults against CAD model retrieval-based reconstructions. Our experiments\nindicate that our reconstructions match well the input scenes while enabling\nsemantic reasoning about reconstructed objects.\n","authors":["Sinisa Stekovic","Stefan Ainetter","Mattia D'Urso","Friedrich Fraundorfer","Vincent Lepetit"],"pdf_url":"https://arxiv.org/pdf/2404.10620v1.pdf","comment":"In Submission"},{"id":"http://arxiv.org/abs/2404.10618v1","updated":"2024-04-16T14:42:49Z","published":"2024-04-16T14:42:49Z","title":"Private Attribute Inference from Images with Vision-Language Models","summary":" As large language models (LLMs) become ubiquitous in our daily tasks and\ndigital interactions, associated privacy risks are increasingly in focus. While\nLLM privacy research has primarily focused on the leakage of model training\ndata, it has recently been shown that the increase in models' capabilities has\nenabled LLMs to make accurate privacy-infringing inferences from previously\nunseen texts. With the rise of multimodal vision-language models (VLMs),\ncapable of understanding both images and text, a pertinent question is whether\nsuch results transfer to the previously unexplored domain of benign images\nposted online. 
To investigate the risks associated with the image reasoning\ncapabilities of newly emerging VLMs, we compile an image dataset with\nhuman-annotated labels of the image owner's personal attributes. In order to\nunderstand the additional privacy risk posed by VLMs beyond traditional human\nattribute recognition, our dataset consists of images where the inferable\nprivate attributes do not stem from direct depictions of humans. On this\ndataset, we evaluate the inferential capabilities of 7 state-of-the-art VLMs,\nfinding that they can infer various personal attributes at up to 77.6%\naccuracy. Concerningly, we observe that accuracy scales with the general\ncapabilities of the models, implying that future models can be misused as\nstronger adversaries, establishing an imperative for the development of\nadequate defenses.\n","authors":["Batuhan Tömekçe","Mark Vero","Robin Staab","Martin Vechev"],"pdf_url":"https://arxiv.org/pdf/2404.10618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10121v3","updated":"2024-04-16T14:35:13Z","published":"2023-11-16T10:45:46Z","title":"Slide-SAM: Medical SAM Meets Sliding Window","summary":" The Segment Anything Model (SAM) has achieved notable success in\ntwo-dimensional image segmentation of natural images. However, the substantial\ngap between medical and natural images hinders its direct application to\nmedical image segmentation tasks. Particularly in 3D medical images, SAM\nstruggles to learn contextual relationships between slices, limiting its\npractical applicability. Moreover, applying 2D SAM to 3D images requires\nprompting the entire volume, which is time- and label-consuming. To address\nthese problems, we propose Slide-SAM, which treats a stack of three adjacent\nslices as a prediction window. It first takes three slices from a 3D volume\nand point- or bounding box prompts on the central slice as inputs to predict\nsegmentation masks for all three slices. Subsequently, the masks of the top and\nbottom slices are used to generate new prompts for adjacent slices.\nFinally, step-wise prediction can be achieved by sliding the prediction window\nforward or backward through the entire volume. Our model is trained on multiple\npublic and private medical datasets and demonstrates its effectiveness through\nextensive 3D segmentation experiments with the help of minimal prompts. Code\nis available at \url{https://github.com/Curli-quan/Slide-SAM}.\n","authors":["Quan Quan","Fenghe Tang","Zikang Xu","Heqin Zhu","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.10121v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10603v1","updated":"2024-04-16T14:28:57Z","published":"2024-04-16T14:28:57Z","title":"Enhancing 3D Fidelity of Text-to-3D using Cross-View Correspondences","summary":" Leveraging multi-view diffusion models as priors for 3D optimization has\nalleviated the problem of 3D consistency, e.g., the Janus face problem or the\ncontent drift problem, in zero-shot text-to-3D models. However, the 3D\ngeometric fidelity of the output remains an unresolved issue; albeit the\nrendered 2D views are realistic, the underlying geometry may contain errors\nsuch as unreasonable concavities. In this work, we propose CorrespondentDream,\nan effective method to leverage annotation-free, cross-view correspondences\nyielded from the diffusion U-Net to provide an additional 3D prior for the NeRF\noptimization process. 
We find that these correspondences are strongly\nconsistent with human perception, and by adopting them in our loss design, we\nare able to produce NeRF models with geometries that are more coherent with\ncommon sense, e.g., smoother object surfaces, yielding higher 3D fidelity. We\ndemonstrate the efficacy of our approach through various comparative\nqualitative results and a solid user study.\n","authors":["Seungwook Kim","Kejie Li","Xueqing Deng","Yichun Shi","Minsu Cho","Peng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.10603v1.pdf","comment":"25 pages, 22 figures, accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.10600v1","updated":"2024-04-16T14:26:55Z","published":"2024-04-16T14:26:55Z","title":"Intra-operative tumour margin evaluation in breast-conserving surgery\n with deep learning","summary":" A positive margin may result in an increased risk of local recurrences after\nbreast-conserving surgery for any malignant tumour. Providing surgeons with\nreal-time intra-operative information on the presence of positive resection\nmargins would help reduce the number of positive margins. This study aims to\ndesign an intra-operative tumour margin evaluation scheme using specimen\nmammography in breast-conserving surgery. A total of 30 cases were evaluated\nand compared with contours manually determined by experienced physicians and\nwith the pathology reports. The proposed method utilizes image thresholding to\nextract regions of interest and then applies a deep learning model, i.e.\nSegNet, to segment tumour tissue. The width of the normal-tissue margin\nsurrounding the tumour is then evaluated. The desired margin size around the\ntumour was set to 10 mm. The smallest average difference from the manually\nsketched margin was 6.53 mm +- 5.84 mm. In all cases, the SegNet architecture\nwas utilized to obtain the tissue specimen boundary and the tumour contour,\nrespectively. The simulation results indicated that this technology is helpful\nin discriminating positive from negative margins in the intra-operative\nsetting. The proposed scheme is intended as a potential procedure for an\nintra-operative measurement system. The experimental results reveal that deep\nlearning techniques can produce results that are consistent with pathology\nreports.\n","authors":["Wei-Chung Shia","Yu-Len Huang","Yi-Chun Chen","Hwa-Koon Wu","Dar-Ren Chen"],"pdf_url":"https://arxiv.org/pdf/2404.10600v1.pdf","comment":"1 page, 6 figures and 2 tables"},{"id":"http://arxiv.org/abs/2404.10595v1","updated":"2024-04-16T14:20:55Z","published":"2024-04-16T14:20:55Z","title":"Automated Evaluation of Large Vision-Language Models on Self-driving\n Corner Cases","summary":" Large Vision-Language Models (LVLMs), due to their remarkable visual\nreasoning ability to understand images and videos, have received widespread\nattention in the autonomous driving domain, which significantly advances the\ndevelopment of interpretable end-to-end autonomous driving. However, current\nevaluations of LVLMs primarily focus on the multi-faceted capabilities in\ncommon scenarios, lacking quantifiable and automated assessment in autonomous\ndriving contexts, let alone severe road corner cases that even the\nstate-of-the-art autonomous driving perception systems struggle to handle. In\nthis paper, we propose CODA-LM, a novel vision-language benchmark for\nself-driving, which provides the first automatic and quantitative evaluation of\nLVLMs for interpretable autonomous driving including general perception,\nregional perception, and driving suggestions. 
CODA-LM utilizes the texts to describe the road images,\nexploiting powerful text-only large language models (LLMs) without image inputs\nto assess the capabilities of LVLMs in autonomous driving scenarios, which\nreveals stronger alignment with human preferences than LVLM judges. Experiments\ndemonstrate that even the closed-sourced commercial LVLMs like GPT-4V cannot\ndeal with road corner cases well, suggesting that we are still far from a\nstrong LVLM-powered intelligent driving agent, and we hope our CODA-LM can\nbecome the catalyst to promote future development.\n","authors":["Yanze Li","Wenhua Zhang","Kai Chen","Yanxin Liu","Pengxiang Li","Ruiyuan Gao","Lanqing Hong","Meng Tian","Xinhai Zhao","Zhenguo Li","Dit-Yan Yeung","Huchuan Lu","Xu Jia"],"pdf_url":"https://arxiv.org/pdf/2404.10595v1.pdf","comment":"Project Page: https://coda-dataset.github.io/coda-lm/"},{"id":"http://arxiv.org/abs/2404.08814v2","updated":"2024-04-16T14:17:51Z","published":"2024-04-12T21:14:20Z","title":"E3: Ensemble of Expert Embedders for Adapting Synthetic Image Detectors\n to New Generators Using Limited Data","summary":" As generative AI progresses rapidly, new synthetic image generators continue\nto emerge at a swift pace. Traditional detection methods face two main\nchallenges in adapting to these generators: the forensic traces of synthetic\nimages from new techniques can vastly differ from those learned during\ntraining, and access to data for these new generators is often limited. To\naddress these issues, we introduce the Ensemble of Expert Embedders (E3), a\nnovel continual learning framework for updating synthetic image detectors. E3\nenables the accurate detection of images from newly emerged generators using\nminimal training data. Our approach does this by first employing transfer\nlearning to develop a suite of expert embedders, each specializing in the\nforensic traces of a specific generator. Then, all embeddings are jointly\nanalyzed by an Expert Knowledge Fusion Network to produce accurate and reliable\ndetection decisions. Our experiments demonstrate that E3 outperforms existing\ncontinual learning methods, including those developed specifically for\nsynthetic image detection.\n","authors":["Aref Azizpour","Tai D. Nguyen","Manil Shrestha","Kaidi Xu","Edward Kim","Matthew C. Stamm"],"pdf_url":"https://arxiv.org/pdf/2404.08814v2.pdf","comment":"11 pages, 4 figures, To be published in CVPRWMF24"},{"id":"http://arxiv.org/abs/2403.14421v2","updated":"2024-04-16T14:16:48Z","published":"2024-03-21T14:17:28Z","title":"DP-RDM: Adapting Diffusion Models to Private Domains Without Fine-Tuning","summary":" Text-to-image diffusion models have been shown to suffer from sample-level\nmemorization, possibly reproducing near-perfect replica of images that they are\ntrained on, which may be undesirable. To remedy this issue, we develop the\nfirst differentially private (DP) retrieval-augmented generation algorithm that\nis capable of generating high-quality image samples while providing provable\nprivacy guarantees. Specifically, we assume access to a text-to-image diffusion\nmodel trained on a small amount of public data, and design a DP retrieval\nmechanism to augment the text prompt with samples retrieved from a private\nretrieval dataset. 
Our \\emph{differentially private retrieval-augmented\ndiffusion model} (DP-RDM) requires no fine-tuning on the retrieval dataset to\nadapt to another domain, and can use state-of-the-art generative models to\ngenerate high-quality image samples while satisfying rigorous DP guarantees.\nFor instance, when evaluated on MS-COCO, our DP-RDM can generate samples with a\nprivacy budget of $\\epsilon=10$, while providing a $3.5$ point improvement in\nFID compared to public-only retrieval for up to $10,000$ queries.\n","authors":["Jonathan Lebensold","Maziar Sanjabi","Pietro Astolfi","Adriana Romero-Soriano","Kamalika Chaudhuri","Mike Rabbat","Chuan Guo"],"pdf_url":"https://arxiv.org/pdf/2403.14421v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08966v2","updated":"2024-04-16T14:16:40Z","published":"2024-04-13T11:07:53Z","title":"LoopGaussian: Creating 3D Cinemagraph with Multi-view Images via\n Eulerian Motion Field","summary":" Cinemagraph is a unique form of visual media that combines elements of still\nphotography and subtle motion to create a captivating experience. However, the\nmajority of videos generated by recent works lack depth information and are\nconfined to the constraints of 2D image space. In this paper, inspired by\nsignificant progress in the field of novel view synthesis (NVS) achieved by 3D\nGaussian Splatting (3D-GS), we propose LoopGaussian to elevate cinemagraph from\n2D image space to 3D space using 3D Gaussian modeling. To achieve this, we\nfirst employ the 3D-GS method to reconstruct 3D Gaussian point clouds from\nmulti-view images of static scenes,incorporating shape regularization terms to\nprevent blurring or artifacts caused by object deformation. We then adopt an\nautoencoder tailored for 3D Gaussian to project it into feature space. To\nmaintain the local continuity of the scene, we devise SuperGaussian for\nclustering based on the acquired features. By calculating the similarity\nbetween clusters and employing a two-stage estimation method, we derive an\nEulerian motion field to describe velocities across the entire scene. The 3D\nGaussian points then move within the estimated Eulerian motion field. Through\nbidirectional animation techniques, we ultimately generate a 3D Cinemagraph\nthat exhibits natural and seamlessly loopable dynamics. Experiment results\nvalidate the effectiveness of our approach, demonstrating high-quality and\nvisually appealing scene generation. The project is available at\nhttps://pokerlishao.github.io/LoopGaussian/.\n","authors":["Jiyang Li","Lechao Cheng","Zhangye Wang","Tingting Mu","Jingxuan He"],"pdf_url":"https://arxiv.org/pdf/2404.08966v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2202.13588v3","updated":"2024-04-16T14:15:40Z","published":"2022-02-28T07:44:59Z","title":"Using Multi-scale SwinTransformer-HTC with Data augmentation in CoNIC\n Challenge","summary":" Colorectal cancer is one of the most common cancers worldwide, so early\npathological examination is very important. However, it is time-consuming and\nlabor-intensive to identify the number and type of cells on H&E images in\nclinical. Therefore, automatic segmentation and classification task and\ncounting the cellular composition of H&E images from pathological sections is\nproposed by CoNIC Challenge 2022. We proposed a multi-scale Swin transformer\nwith HTC for this challenge, and also applied the known normalization methods\nto generate more augmentation data. 
Finally, our results showed that the\nmulti-scale design played a crucial role in identifying features at different\nscales and that the data augmentation improved the recognition performance of\nthe model.\n","authors":["Chia-Yen Lee","Hsiang-Chin Chien","Ching-Ping Wang","Hong Yen","Kai-Wen Zhen","Hong-Kun Lin"],"pdf_url":"https://arxiv.org/pdf/2202.13588v3.pdf","comment":"Errors have been identified in the analysis"},{"id":"http://arxiv.org/abs/2404.10588v1","updated":"2024-04-16T14:13:44Z","published":"2024-04-16T14:13:44Z","title":"Do Counterfactual Examples Complicate Adversarial Training?","summary":" We leverage diffusion models to study the robustness-performance tradeoff of\nrobust classifiers. Our approach introduces a simple, pretrained diffusion\nmethod to generate low-norm counterfactual examples (CEs): semantically altered\ndata which results in different true class membership. We report that the\nconfidence and accuracy of robust models on their clean training data are\nassociated with the proximity of the data to their CEs. Moreover, robust models\nperform very poorly when evaluated on the CEs directly, as they become\nincreasingly invariant to the low-norm, semantic changes brought by CEs. The\nresults indicate a significant overlap between non-robust and semantic\nfeatures, countering the common assumption that non-robust features are not\ninterpretable.\n","authors":["Eric Yeats","Cameron Darwin","Eduardo Ortega","Frank Liu","Hai Li"],"pdf_url":"https://arxiv.org/pdf/2404.10588v1.pdf","comment":"Accepted as a short paper to the GCV Workshop at CVPR'24"},{"id":"http://arxiv.org/abs/2404.10584v1","updated":"2024-04-16T14:10:42Z","published":"2024-04-16T14:10:42Z","title":"ReWiTe: Realistic Wide-angle and Telephoto Dual Camera Fusion Dataset\n via Beam Splitter Camera Rig","summary":" The fusion of images from dual camera systems featuring a wide-angle and a\ntelephoto camera has recently become a popular research problem. By integrating\nsimultaneously captured wide-angle and telephoto images from these systems, the\nresulting fused image achieves a wide field of view (FOV) coupled with\nhigh-definition quality. Existing approaches are mostly deep learning methods,\nand predominantly rely on supervised learning, where the training dataset plays\na pivotal role. However, current datasets typically adopt a data synthesis\napproach to generate input pairs of wide-angle and telephoto images alongside\nground-truth images. Notably, the wide-angle inputs are synthesized rather than\ncaptured using real wide-angle cameras, and the ground-truth image is captured\nby a wide-angle camera whose quality is substantially lower than that of the\ninput telephoto images captured by telephoto cameras. To address these\nlimitations, we introduce a novel hardware setup utilizing a beam splitter to\nsimultaneously capture three images, i.e. input pairs and ground-truth images,\nfrom two authentic cellphones equipped with wide-angle and telephoto dual\ncameras. Specifically, the wide-angle and telephoto images captured by\ncellphone 2 serve as the input pair, while the telephoto image captured by\ncellphone 1, which is calibrated to match the optical path of the wide-angle\nimage from cellphone 2, serves as the ground-truth image, maintaining quality\non par with the input telephoto image. 
Experiments validate that our newly introduced\ndataset, named ReWiTe, significantly enhances the performance of various\nexisting methods for real-world wide-angle and telephoto dual image fusion\ntasks.\n","authors":["Chunli Peng","Xuan Dong","Tiantian Cao","Zhengqing Li","Kun Dong","Weixin Li"],"pdf_url":"https://arxiv.org/pdf/2404.10584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15368v3","updated":"2024-04-16T14:08:03Z","published":"2023-03-27T16:35:28Z","title":"2S-UDF: A Novel Two-stage UDF Learning Method for Robust Non-watertight\n Model Reconstruction from Multi-view Images","summary":" Recently, building on the foundation of neural radiance field, various\ntechniques have emerged to learn unsigned distance fields (UDF) to reconstruct\n3D non-watertight models from multi-view images. Yet, a central challenge in\nUDF-based volume rendering is formulating a proper way to convert unsigned\ndistance values into volume density, ensuring that the resulting weight\nfunction remains unbiased and sensitive to occlusions. Falling short on these\nrequirements often results in incorrect topology or large reconstruction errors\nin resulting models. This paper addresses this challenge by presenting a novel\ntwo-stage algorithm, 2S-UDF, for learning a high-quality UDF from multi-view\nimages. Initially, the method applies an easily trainable density function\nthat, while slightly biased and transparent, aids in coarse reconstruction. The\nsubsequent stage then refines the geometry and appearance of the object to\nachieve a high-quality reconstruction by directly adjusting the weight function\nused in volume rendering to ensure that it is unbiased and occlusion-aware.\nDecoupling density and weight in two stages makes our training stable and\nrobust, distinguishing our technique from existing UDF learning approaches.\nEvaluations on the DeepFashion3D, DTU, and BlendedMVS datasets validate the\nrobustness and effectiveness of our proposed approach. In both quantitative\nmetrics and visual quality, the results indicate our superior performance over\nother UDF learning techniques in reconstructing 3D non-watertight models from\nmulti-view images. Our code is available at\nhttps://bitbucket.org/jkdeng/2sudf/.\n","authors":["Junkai Deng","Fei Hou","Xuhui Chen","Wencheng Wang","Ying He"],"pdf_url":"https://arxiv.org/pdf/2303.15368v3.pdf","comment":"accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.10575v1","updated":"2024-04-16T13:53:58Z","published":"2024-04-16T13:53:58Z","title":"EMC$^2$: Efficient MCMC Negative Sampling for Contrastive Learning with\n Global Convergence","summary":" A key challenge in contrastive learning is to generate negative samples from\na large sample set to contrast with positive samples, for learning better\nencoding of the data. These negative samples often follow a softmax\ndistribution which is dynamically updated during the training process.\nHowever, sampling from this distribution is non-trivial due to the high\ncomputational costs in computing the partition function. In this paper, we\npropose an Efficient Markov Chain Monte Carlo negative sampling method for\nContrastive learning (EMC$^2$). We follow the global contrastive learning loss\nas introduced in SogCLR, and propose EMC$^2$ which utilizes an adaptive\nMetropolis-Hastings subroutine to generate hardness-aware negative samples in\nan online fashion during the optimization. 
We prove that EMC$^2$ finds an\n$\\mathcal{O}(1/\\sqrt{T})$-stationary point of the global contrastive loss in\n$T$ iterations. Compared to prior works, EMC$^2$ is the first algorithm that\nexhibits global convergence (to stationarity) regardless of the choice of batch\nsize while exhibiting low computation and memory cost. Numerical experiments\nvalidate that EMC$^2$ is effective with small batch training and achieves\ncomparable or better performance than baseline algorithms. We report the\nresults for pre-training image encoders on STL-10 and Imagenet-100.\n","authors":["Chung-Yiu Yau","Hoi-To Wai","Parameswaran Raman","Soumajyoti Sarkar","Mingyi Hong"],"pdf_url":"https://arxiv.org/pdf/2404.10575v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2404.10574v1","updated":"2024-04-16T13:52:00Z","published":"2024-04-16T13:52:00Z","title":"Uncertainty-guided Open-Set Source-Free Unsupervised Domain Adaptation\n with Target-private Class Segregation","summary":" Standard Unsupervised Domain Adaptation (UDA) aims to transfer knowledge from\na labeled source domain to an unlabeled target but usually requires\nsimultaneous access to both source and target data. Moreover, UDA approaches\ncommonly assume that source and target domains share the same labels space.\nYet, these two assumptions are hardly satisfied in real-world scenarios. This\npaper considers the more challenging Source-Free Open-set Domain Adaptation\n(SF-OSDA) setting, where both assumptions are dropped. We propose a novel\napproach for SF-OSDA that exploits the granularity of target-private categories\nby segregating their samples into multiple unknown classes. Starting from an\ninitial clustering-based assignment, our method progressively improves the\nsegregation of target-private samples by refining their pseudo-labels with the\nguide of an uncertainty-based sample selection module. Additionally, we propose\na novel contrastive loss, named NL-InfoNCELoss, that, integrating negative\nlearning into self-supervised contrastive learning, enhances the model\nrobustness to noisy pseudo-labels. Extensive experiments on benchmark datasets\ndemonstrate the superiority of the proposed method over existing approaches,\nestablishing new state-of-the-art performance. Notably, additional analyses\nshow that our method is able to learn the underlying semantics of novel\nclasses, opening the possibility to perform novel class discovery.\n","authors":["Mattia Litrico","Davide Talon","Sebastiano Battiato","Alessio Del Bue","Mario Valerio Giuffrida","Pietro Morerio"],"pdf_url":"https://arxiv.org/pdf/2404.10574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10572v1","updated":"2024-04-16T13:47:27Z","published":"2024-04-16T13:47:27Z","title":"Label merge-and-split: A graph-colouring approach for memory-efficient\n brain parcellation","summary":" Whole brain parcellation requires inferring hundreds of segmentation labels\nin large image volumes and thus presents significant practical challenges for\ndeep learning approaches. We introduce label merge-and-split, a method that\nfirst greatly reduces the effective number of labels required for\nlearning-based whole brain parcellation and then recovers original labels.\nUsing a greedy graph colouring algorithm, our method automatically groups and\nmerges multiple spatially separate labels prior to model training and\ninference. The merged labels may be semantically unrelated. A deep learning\nmodel is trained to predict merged labels. 
At inference time, original labels\nare restored using atlas-based influence regions. In our experiments, the\nproposed approach reduces the number of labels by up to 68% while achieving\nsegmentation accuracy comparable to the baseline method without label merging\nand splitting. Moreover, model training and inference times as well as GPU\nmemory requirements were reduced significantly. The proposed method can be\napplied to all semantic segmentation tasks with a large number of spatially\nseparate classes within an atlas-based prior.\n","authors":["Aaron Kujawa","Reuben Dorent","Sebastien Ourselin","Tom Vercauteren"],"pdf_url":"https://arxiv.org/pdf/2404.10572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10571v1","updated":"2024-04-16T13:47:21Z","published":"2024-04-16T13:47:21Z","title":"CMU-Flownet: Exploring Point Cloud Scene Flow Estimation in Occluded\n Scenario","summary":" Occlusions hinder point cloud frame alignment in LiDAR data, a challenge\ninadequately addressed by scene flow models tested mainly on occlusion-free\ndatasets. Attempts to integrate occlusion handling within networks often suffer\naccuracy issues due to two main limitations: a) the inadequate use of occlusion\ninformation, often merging it with flow estimation without an effective\nintegration strategy, and b) reliance on distance-weighted upsampling that\nfalls short in correcting occlusion-related errors. To address these\nchallenges, we introduce the Correlation Matrix Upsampling Flownet\n(CMU-Flownet), incorporating an occlusion estimation module within its cost\nvolume layer, alongside an Occlusion-aware Cost Volume (OCV) mechanism.\nSpecifically, we propose an enhanced upsampling approach that expands the\nsensory field of the sampling process which integrates a Correlation Matrix\ndesigned to evaluate point-level similarity. Meanwhile, our model robustly\nintegrates occlusion data within the context of scene flow, deploying this\ninformation strategically during the refinement phase of the flow estimation.\nThe efficacy of this approach is demonstrated through subsequent experimental\nvalidation. Empirical assessments reveal that CMU-Flownet establishes\nstate-of-the-art performance within the realms of occluded Flyingthings3D and\nKITTY datasets, surpassing previous methodologies across a majority of\nevaluated metrics.\n","authors":["Jingze Chen","Junfeng Yao","Qiqin Lin","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2404.10571v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2308.16215v6","updated":"2024-04-16T13:32:25Z","published":"2023-08-30T16:44:38Z","title":"Deep Video Codec Control for Vision Models","summary":" Standardized lossy video coding is at the core of almost all real-world video\nprocessing pipelines. Rate control is used to enable standard codecs to adapt\nto different network bandwidth conditions or storage constraints. However,\nstandard video codecs (e.g., H.264) and their rate control modules aim to\nminimize video distortion w.r.t. human quality assessment. We demonstrate\nempirically that standard-coded videos vastly deteriorate the performance of\ndeep vision models. To overcome the deterioration of vision performance, this\npaper presents the first end-to-end learnable deep video codec control that\nconsiders both bandwidth constraints and downstream deep vision performance,\nwhile adhering to existing standardization. 
We demonstrate that our approach\nbetter preserves downstream deep vision performance than traditional standard\nvideo coding.\n","authors":["Christoph Reich","Biplob Debnath","Deep Patel","Tim Prangemeier","Daniel Cremers","Srimat Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2308.16215v6.pdf","comment":"Accepted at CVPR 2024 Workshop on AI for Streaming (AIS)"},{"id":"http://arxiv.org/abs/2404.00724v2","updated":"2024-04-16T13:28:22Z","published":"2024-03-31T15:50:52Z","title":"Absolute-Unified Multi-Class Anomaly Detection via Class-Agnostic\n Distribution Alignment","summary":" Conventional unsupervised anomaly detection (UAD) methods build separate\nmodels for each object category. Recent studies have proposed to train a\nunified model for multiple classes, namely model-unified UAD. However, such\nmethods still implement the unified model separately on each class during\ninference with respective anomaly decision thresholds, which hinders their\napplication when the image categories are entirely unavailable. In this work,\nwe present a simple yet powerful method to address multi-class anomaly\ndetection without any class information, namely \\textit{absolute-unified} UAD.\nWe target the crux of prior works in this challenging setting: different\nobjects have mismatched anomaly score distributions. We propose Class-Agnostic\nDistribution Alignment (CADA) to align the mismatched score distribution of\neach implicit class without knowing class information, which enables unified\nanomaly detection for all classes and samples. The essence of CADA is to\npredict each class's score distribution of normal samples given any image,\nnormal or anomalous, of this class. As a general component, CADA can activate\nthe potential of nearly all UAD methods under absolute-unified setting. Our\napproach is extensively evaluated under the proposed setting on two popular UAD\nbenchmark datasets, MVTec AD and VisA, where we exceed previous\nstate-of-the-art by a large margin.\n","authors":["Jia Guo","Haonan Han","Shuai Lu","Weihang Zhang","Huiqi Li"],"pdf_url":"https://arxiv.org/pdf/2404.00724v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.00218v3","updated":"2024-04-16T13:22:08Z","published":"2022-11-01T02:00:32Z","title":"Pixel-Wise Contrastive Distillation","summary":" We present a simple but effective pixel-level self-supervised distillation\nframework friendly to dense prediction tasks. Our method, called Pixel-Wise\nContrastive Distillation (PCD), distills knowledge by attracting the\ncorresponding pixels from student's and teacher's output feature maps. PCD\nincludes a novel design called SpatialAdaptor which ``reshapes'' a part of the\nteacher network while preserving the distribution of its output features. Our\nablation experiments suggest that this reshaping behavior enables more\ninformative pixel-to-pixel distillation. Moreover, we utilize a plug-in\nmulti-head self-attention module that explicitly relates the pixels of\nstudent's feature maps to enhance the effective receptive field, leading to a\nmore competitive student. PCD \\textbf{outperforms} previous self-supervised\ndistillation methods on various dense prediction tasks. A backbone of\n\\mbox{ResNet-18-FPN} distilled by PCD achieves $37.4$ AP$^\\text{bbox}$ and\n$34.0$ AP$^\\text{mask}$ on COCO dataset using the detector of \\mbox{Mask\nR-CNN}. 
We hope our study will inspire future research on how to pre-train a\nsmall model friendly to dense prediction tasks in a self-supervised fashion.\n","authors":["Junqiang Huang","Zichao Guo"],"pdf_url":"https://arxiv.org/pdf/2211.00218v3.pdf","comment":"ICCV 2023 camera-ready"},{"id":"http://arxiv.org/abs/2304.08272v4","updated":"2024-04-16T13:20:44Z","published":"2023-04-17T13:33:23Z","title":"About latent roles in forecasting players in team sports","summary":" Forecasting players in sports has grown in popularity due to the potential\nfor a tactical advantage and the applicability of such research to multi-agent\ninteraction systems. Team sports contain a significant social component that\ninfluences interactions between teammates and opponents. However, it still\nneeds to be fully exploited. In this work, we hypothesize that each participant\nhas a specific function in each action and that role-based interaction is\ncritical for predicting players' future moves. We create RolFor, a novel\nend-to-end model for Role-based Forecasting. RolFor uses a new module we\ndeveloped called Ordering Neural Networks (OrderNN) to permute the order of the\nplayers such that each player is assigned to a latent role. The latent role is\nthen modeled with a RoleGCN. Thanks to its graph representation, it provides a\nfully learnable adjacency matrix that captures the relationships between roles\nand is subsequently used to forecast the players' future trajectories.\nExtensive experiments on a challenging NBA basketball dataset back up the\nimportance of roles and justify our goal of modeling them using optimizable\nmodels. When an oracle provides roles, the proposed RolFor compares favorably\nto the current state-of-the-art (it ranks first in terms of ADE and second in\nterms of FDE errors). However, training the end-to-end RolFor incurs the issues\nof differentiability of permutation methods, which we experimentally review.\nFinally, this work restates differentiable ranking as a difficult open problem\nand its great potential in conjunction with graph-based interaction models.\nProject is available at: https://www.pinlab.org/aboutlatentroles\n","authors":["Luca Scofano","Alessio Sampieri","Giuseppe Re","Matteo Almanza","Alessandro Panconesi","Fabio Galasso"],"pdf_url":"https://arxiv.org/pdf/2304.08272v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10548v1","updated":"2024-04-16T13:18:02Z","published":"2024-04-16T13:18:02Z","title":"Classification of Prostate Cancer in 3D Magnetic Resonance Imaging Data\n based on Convolutional Neural Networks","summary":" Prostate cancer is a commonly diagnosed cancerous disease among men\nworld-wide. Even with modern technology such as multi-parametric magnetic\nresonance tomography and guided biopsies, the process for diagnosing prostate\ncancer remains time consuming and requires highly trained professionals. In\nthis paper, different convolutional neural networks (CNN) are evaluated on\ntheir abilities to reliably classify whether an MRI sequence contains malignant\nlesions. Implementations of a ResNet, a ConvNet and a ConvNeXt for 3D image\ndata are trained and evaluated. The models are trained using different data\naugmentation techniques, learning rates, and optimizers. The data is taken from\na private dataset, provided by Cantonal Hospital Aarau. 
The best result was\nachieved by a ResNet3D, yielding an average precision score of 0.4583 and AUC\nROC score of 0.6214.\n","authors":["Malte Rippa","Ruben Schulze","Marian Himstedt","Felice Burn"],"pdf_url":"https://arxiv.org/pdf/2404.10548v1.pdf","comment":"Previous version published in Buzug T.M., Handels H., M\\\"uller S.,\n H\\\"ubner C., Mertins A., Rostalski P.: Student Conference Proceedings 2023,\n Infinite Science Publishing, 2023 (ISBN/EAN 978-3-945954-72-0). 7 pages, 2\n figures"},{"id":"http://arxiv.org/abs/2311.15658v2","updated":"2024-04-16T12:58:57Z","published":"2023-11-27T09:40:14Z","title":"Regularization by Texts for Latent Diffusion Inverse Solvers","summary":" The recent advent of diffusion models has led to significant progress in\nsolving inverse problems, leveraging these models as effective generative\npriors. Nonetheless, there remain challenges related to the ill-posed nature of\nsuch problems, often due to inherent ambiguities in measurements or intrinsic\nsystem symmetries. To address this, drawing inspiration from the human ability\nto resolve visual ambiguities through perceptual biases, here we introduce a\nnovel latent diffusion inverse solver by regularization by texts (TReg).\nSpecifically, TReg applies the textual description of the preconception of the\nsolution during the reverse diffusion sampling, of which the description is\ndynamically reinforced through null-text optimization for adaptive negation.\nOur comprehensive experimental results demonstrate that TReg successfully\nmitigates ambiguity in the inverse problems, enhancing their effectiveness and\naccuracy.\n","authors":["Jeongsol Kim","Geon Yeong Park","Hyungjin Chung","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2311.15658v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10527v1","updated":"2024-04-16T12:55:15Z","published":"2024-04-16T12:55:15Z","title":"SPVLoc: Semantic Panoramic Viewport Matching for 6D Camera Localization\n in Unseen Environments","summary":" In this paper, we present SPVLoc, a global indoor localization method that\naccurately determines the six-dimensional (6D) camera pose of a query image and\nrequires minimal scene-specific prior knowledge and no scene-specific training.\nOur approach employs a novel matching procedure to localize the perspective\ncamera's viewport, given as an RGB image, within a set of panoramic semantic\nlayout representations of the indoor environment. The panoramas are rendered\nfrom an untextured 3D reference model, which only comprises approximate\nstructural information about room shapes, along with door and window\nannotations. We demonstrate that a straightforward convolutional network\nstructure can successfully achieve image-to-panorama and ultimately\nimage-to-model matching. Through a viewport classification score, we rank\nreference panoramas and select the best match for the query image. Then, a 6D\nrelative pose is estimated between the chosen panorama and query image. Our\nexperiments demonstrate that this approach not only efficiently bridges the\ndomain gap but also generalizes well to previously unseen scenes that are not\npart of the training data. Moreover, it achieves superior localization accuracy\ncompared to the state of the art methods and also estimates more degrees of\nfreedom of the camera pose. 
We will make our source code publicly available at\nhttps://github.com/fraunhoferhhi/spvloc .\n","authors":["Niklas Gard","Anna Hilsmann","Peter Eisert"],"pdf_url":"https://arxiv.org/pdf/2404.10527v1.pdf","comment":"This submission includes the paper and supplementary material. 24\n pages, 11 figures"},{"id":"http://arxiv.org/abs/2312.02155v3","updated":"2024-04-16T12:43:35Z","published":"2023-12-04T18:59:55Z","title":"GPS-Gaussian: Generalizable Pixel-wise 3D Gaussian Splatting for\n Real-time Human Novel View Synthesis","summary":" We present a new approach, termed GPS-Gaussian, for synthesizing novel views\nof a character in a real-time manner. The proposed method enables 2K-resolution\nrendering under a sparse-view camera setting. Unlike the original Gaussian\nSplatting or neural implicit rendering methods that necessitate per-subject\noptimizations, we introduce Gaussian parameter maps defined on the source views\nand regress directly Gaussian Splatting properties for instant novel view\nsynthesis without any fine-tuning or optimization. To this end, we train our\nGaussian parameter regression module on a large amount of human scan data,\njointly with a depth estimation module to lift 2D parameter maps to 3D space.\nThe proposed framework is fully differentiable and experiments on several\ndatasets demonstrate that our method outperforms state-of-the-art methods while\nachieving an exceeding rendering speed.\n","authors":["Shunyuan Zheng","Boyao Zhou","Ruizhi Shao","Boning Liu","Shengping Zhang","Liqiang Nie","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2312.02155v3.pdf","comment":"Accepted by CVPR 2024 (Highlight). Project page:\n https://shunyuanzheng.github.io/GPS-Gaussian"},{"id":"http://arxiv.org/abs/2404.10518v1","updated":"2024-04-16T12:41:25Z","published":"2024-04-16T12:41:25Z","title":"MobileNetV4 - Universal Models for the Mobile Ecosystem","summary":" We present the latest generation of MobileNets, known as MobileNetV4 (MNv4),\nfeaturing universally efficient architecture designs for mobile devices. At its\ncore, we introduce the Universal Inverted Bottleneck (UIB) search block, a\nunified and flexible structure that merges Inverted Bottleneck (IB), ConvNext,\nFeed Forward Network (FFN), and a novel Extra Depthwise (ExtraDW) variant.\nAlongside UIB, we present Mobile MQA, an attention block tailored for mobile\naccelerators, delivering a significant 39% speedup. An optimized neural\narchitecture search (NAS) recipe is also introduced which improves MNv4 search\neffectiveness. The integration of UIB, Mobile MQA and the refined NAS recipe\nresults in a new suite of MNv4 models that are mostly Pareto optimal across\nmobile CPUs, DSPs, GPUs, as well as specialized accelerators like Apple Neural\nEngine and Google Pixel EdgeTPU - a characteristic not found in any other\nmodels tested. Finally, to further boost accuracy, we introduce a novel\ndistillation technique. 
Enhanced by this technique, our MNv4-Hybrid-Large model\ndelivers 87% ImageNet-1K accuracy, with a Pixel 8 EdgeTPU runtime of just\n3.8ms.\n","authors":["Danfeng Qin","Chas Leichner","Manolis Delakis","Marco Fornoni","Shixin Luo","Fan Yang","Weijun Wang","Colby Banbury","Chengxi Ye","Berkin Akin","Vaibhav Aggarwal","Tenghui Zhu","Daniele Moro","Andrew Howard"],"pdf_url":"https://arxiv.org/pdf/2404.10518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14950v2","updated":"2024-04-16T12:40:41Z","published":"2022-11-27T22:01:47Z","title":"Leveraging Image Matching Toward End-to-End Relative Camera Pose\n Regression","summary":" This paper proposes a generalizable, end-to-end deep learning-based method\nfor relative pose regression between two images. Given two images of the same\nscene captured from different viewpoints, our method predicts the relative\nrotation and translation (including direction and scale) between the two\nrespective cameras. Inspired by the classical pipeline, our method leverages\nImage Matching (IM) as a pre-trained task for relative pose regression.\nSpecifically, we use LoFTR, an architecture that utilizes an attention-based\nnetwork pre-trained on Scannet, to extract semi-dense feature maps, which are\nthen warped and fed into a pose regression network. Notably, we use a loss\nfunction that utilizes separate terms to account for the translation direction\nand scale. We believe such a separation is important because translation\ndirection is determined by point correspondences while the scale is inferred\nfrom prior on shape sizes. Our ablations further support this choice. We\nevaluate our method on several datasets and show that it outperforms previous\nend-to-end methods. The method also generalizes well to unseen datasets.\n","authors":["Fadi Khatib","Yuval Margalit","Meirav Galun","Ronen Basri"],"pdf_url":"https://arxiv.org/pdf/2211.14950v2.pdf","comment":"Project webpage: https://fadikhatib.github.io/GRelPose"},{"id":"http://arxiv.org/abs/2404.10501v1","updated":"2024-04-16T12:19:54Z","published":"2024-04-16T12:19:54Z","title":"Self-Supervised Visual Preference Alignment","summary":" This paper makes the first attempt towards unsupervised preference alignment\nin Vision-Language Models (VLMs). We generate chosen and rejected responses\nwith regard to the original and augmented image pairs, and conduct preference\nalignment with direct preference optimization. It is based on a core idea:\nproperly designed augmentation to the image input will induce VLM to generate\nfalse but hard negative responses, which helps the model to learn from and\nproduce more robust and powerful answers. The whole pipeline no longer hinges\non supervision from GPT4 or human involvement during alignment, and is highly\nefficient with few lines of code. With only 8k randomly sampled unsupervised\ndata, it achieves 90\\% relative score to GPT-4 on complex reasoning in\nLLaVA-Bench, and improves LLaVA-7B/13B by 6.7\\%/5.6\\% score on complex\nmulti-modal benchmark MM-Vet. Visualizations shows its improved ability to\nalign with user-intentions. A series of ablations are firmly conducted to\nreveal the latent mechanism of the approach, which also indicates its potential\ntowards further scaling. 
Code will be available.\n","authors":["Ke Zhu","Liang Zhao","Zheng Ge","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.10501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10499v1","updated":"2024-04-16T12:18:08Z","published":"2024-04-16T12:18:08Z","title":"Robust Noisy Label Learning via Two-Stream Sample Distillation","summary":" Noisy label learning aims to learn robust networks under the supervision of\nnoisy labels, which plays a critical role in deep learning. Existing work\neither conducts sample selection or label correction to deal with noisy labels\nduring the model training process. In this paper, we design a simple yet\neffective sample selection framework, termed Two-Stream Sample Distillation\n(TSSD), for noisy label learning, which can extract more high-quality samples\nwith clean labels to improve the robustness of network training. Firstly, a\nnovel Parallel Sample Division (PSD) module is designed to generate a certain\ntraining set with sufficient reliable positive and negative samples by jointly\nconsidering the sample structure in feature space and the human prior in loss\nspace. Secondly, a novel Meta Sample Purification (MSP) module is further\ndesigned to mine adequate semi-hard samples from the remaining uncertain\ntraining set by learning a strong meta classifier with extra golden data. As a\nresult, more and more high-quality samples will be distilled from the noisy\ntraining set to train networks robustly in every iteration. Extensive\nexperiments on four benchmark datasets, including CIFAR-10, CIFAR-100,\nTiny-ImageNet, and Clothing-1M, show that our method has achieved\nstate-of-the-art results over its competitors.\n","authors":["Sihan Bai","Sanping Zhou","Zheng Qin","Le Wang","Nanning Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.10499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10498v1","updated":"2024-04-16T12:12:06Z","published":"2024-04-16T12:12:06Z","title":"LAECIPS: Large Vision Model Assisted Adaptive Edge-Cloud Collaboration\n for IoT-based Perception System","summary":" Recent large vision models (e.g., SAM) enjoy great potential to facilitate\nintelligent perception with high accuracy. Yet, the resource constraints in the\nIoT environment tend to limit such large vision models to be locally deployed,\nincurring considerable inference latency thereby making it difficult to support\nreal-time applications, such as autonomous driving and robotics. Edge-cloud\ncollaboration with large-small model co-inference offers a promising approach\nto achieving high inference accuracy and low latency. However, existing\nedge-cloud collaboration methods are tightly coupled with the model\narchitecture and cannot adapt to the dynamic data drifts in heterogeneous IoT\nenvironments. To address the issues, we propose LAECIPS, a new edge-cloud\ncollaboration framework. In LAECIPS, both the large vision model on the cloud\nand the lightweight model on the edge are plug-and-play. We design an\nedge-cloud collaboration strategy based on hard input mining, optimized for\nboth high accuracy and low latency. We propose to update the edge model and its\ncollaboration strategy with the cloud under the supervision of the large vision\nmodel, so as to adapt to the dynamic IoT data streams. Theoretical analysis of\nLAECIPS proves its feasibility. 
Experiments conducted in a robotic semantic\nsegmentation system using real-world datasets show that LAECIPS outperforms its\nstate-of-the-art competitors in accuracy, latency, and communication overhead\nwhile having better adaptability to dynamic environments.\n","authors":["Shijing Hu","Ruijun Deng","Xin Du","Zhihui Lu","Qiang Duan","Yi He","Shih-Chia Huang","Jie Wu"],"pdf_url":"https://arxiv.org/pdf/2404.10498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06665v2","updated":"2024-04-16T12:09:24Z","published":"2024-04-10T00:25:09Z","title":"Deep Generative Data Assimilation in Multimodal Setting","summary":" Robust integration of physical knowledge and data is key to improve\ncomputational simulations, such as Earth system models. Data assimilation is\ncrucial for achieving this goal because it provides a systematic framework to\ncalibrate model outputs with observations, which can include remote sensing\nimagery and ground station measurements, with uncertainty quantification.\nConventional methods, including Kalman filters and variational approaches,\ninherently rely on simplifying linear and Gaussian assumptions, and can be\ncomputationally expensive. Nevertheless, with the rapid adoption of data-driven\nmethods in many areas of computational sciences, we see the potential of\nemulating traditional data assimilation with deep learning, especially\ngenerative models. In particular, the diffusion-based probabilistic framework\nhas large overlaps with data assimilation principles: both allows for\nconditional generation of samples with a Bayesian inverse framework. These\nmodels have shown remarkable success in text-conditioned image generation or\nimage-controlled video synthesis. Likewise, one can frame data assimilation as\nobservation-conditioned state calibration. In this work, we propose SLAMS:\nScore-based Latent Assimilation in Multimodal Setting. Specifically, we\nassimilate in-situ weather station data and ex-situ satellite imagery to\ncalibrate the vertical temperature profiles, globally. Through extensive\nablation, we demonstrate that SLAMS is robust even in low-resolution, noisy,\nand sparse data settings. To our knowledge, our work is the first to apply deep\ngenerative framework for multimodal data assimilation using real-world\ndatasets; an important step for building robust computational simulators,\nincluding the next-generation Earth system models. Our code is available at:\nhttps://github.com/yongquan-qu/SLAMS\n","authors":["Yongquan Qu","Juan Nathaniel","Shuolin Li","Pierre Gentine"],"pdf_url":"https://arxiv.org/pdf/2404.06665v2.pdf","comment":"CVPR2024 EarthVision"},{"id":"http://arxiv.org/abs/2312.07039v2","updated":"2024-04-16T12:05:55Z","published":"2023-12-12T07:52:33Z","title":"Open-Pose 3D Zero-Shot Learning: Benchmark and Challenges","summary":" With the explosive 3D data growth, the urgency of utilizing zero-shot\nlearning to facilitate data labeling becomes evident. Recently, methods\ntransferring language or language-image pre-training models like Contrastive\nLanguage-Image Pre-training (CLIP) to 3D vision have made significant progress\nin the 3D zero-shot classification task. These methods primarily focus on 3D\nobject classification with an aligned pose; such a setting is, however, rather\nrestrictive, which overlooks the recognition of 3D objects with open poses\ntypically encountered in real-world scenarios, such as an overturned chair or a\nlying teddy bear. 
To this end, we propose a more realistic and challenging\nscenario named open-pose 3D zero-shot classification, focusing on the\nrecognition of 3D objects regardless of their orientation. First, we revisit\nthe current research on 3D zero-shot classification, and propose two benchmark\ndatasets specifically designed for the open-pose setting. We empirically\nvalidate many of the most popular methods in the proposed open-pose benchmark.\nOur investigations reveal that most current 3D zero-shot classification models\nsuffer from poor performance, indicating a substantial exploration room towards\nthe new direction. Furthermore, we study a concise pipeline with an iterative\nangle refinement mechanism that automatically optimizes one ideal angle to\nclassify these open-pose 3D objects. In particular, to make validation more\ncompelling and not just limited to existing CLIP-based methods, we also pioneer\nthe exploration of knowledge transfer based on Diffusion models. While the\nproposed solutions can serve as a new benchmark for open-pose 3D zero-shot\nclassification, we discuss the complexities and challenges of this scenario\nthat remain for further research development. The code is available publicly at\nhttps://github.com/weiguangzhao/Diff-OP3D.\n","authors":["Weiguang Zhao","Guanyu Yang","Rui Zhang","Chenru Jiang","Chaolong Yang","Yuyao Yan","Amir Hussain","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2312.07039v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04016v2","updated":"2024-04-16T12:04:01Z","published":"2023-12-07T03:10:03Z","title":"PartDistill: 3D Shape Part Segmentation by Vision-Language Model\n Distillation","summary":" This paper proposes a cross-modal distillation framework, PartDistill, which\ntransfers 2D knowledge from vision-language models (VLMs) to facilitate 3D\nshape part segmentation. PartDistill addresses three major challenges in this\ntask: the lack of 3D segmentation in invisible or undetected regions in the 2D\nprojections, inconsistent 2D predictions by VLMs, and the lack of knowledge\naccumulation across different 3D shapes. PartDistill consists of a teacher\nnetwork that uses a VLM to make 2D predictions and a student network that\nlearns from the 2D predictions while extracting geometrical features from\nmultiple 3D shapes to carry out 3D part segmentation. A bi-directional\ndistillation, including forward and backward distillations, is carried out\nwithin the framework, where the former forward distills the 2D predictions to\nthe student network, and the latter improves the quality of the 2D predictions,\nwhich subsequently enhances the final 3D segmentation. Moreover, PartDistill\ncan exploit generative models that facilitate effortless 3D shape creation for\ngenerating knowledge sources to be distilled. Through extensive experiments,\nPartDistill boosts the existing methods with substantial margins on widely used\nShapeNetPart and PartNetE datasets, by more than 15% and 12% higher mIoU\nscores, respectively. 
The code for this work is available at\nhttps://github.com/ardianumam/PartDistill.\n","authors":["Ardian Umam","Cheng-Kun Yang","Min-Hung Chen","Jen-Hui Chuang","Yen-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2312.04016v2.pdf","comment":"CVPR 2024 Accepted"},{"id":"http://arxiv.org/abs/2404.10490v1","updated":"2024-04-16T11:57:03Z","published":"2024-04-16T11:57:03Z","title":"Teaching Chinese Sign Language with Feedback in Mixed Reality","summary":" Traditional sign language teaching methods face challenges such as limited\nfeedback and diverse learning scenarios. Although 2D resources lack real-time\nfeedback, classroom teaching is constrained by a scarcity of teacher. Methods\nbased on VR and AR have relatively primitive interaction feedback mechanisms.\nThis study proposes an innovative teaching model that uses real-time monocular\nvision and mixed reality technology. First, we introduce an improved\nhand-posture reconstruction method to achieve sign language semantic retention\nand real-time feedback. Second, a ternary system evaluation algorithm is\nproposed for a comprehensive assessment, maintaining good consistency with\nexperts in sign language. Furthermore, we use mixed reality technology to\nconstruct a scenario-based 3D sign language classroom and explore the user\nexperience of scenario teaching. Overall, this paper presents a novel teaching\nmethod that provides an immersive learning experience, advanced posture\nreconstruction, and precise feedback, achieving positive feedback on user\nexperience and learning effectiveness.\n","authors":["Hongli Wen","Yang Xu","Lin Li","Xudong Ru"],"pdf_url":"https://arxiv.org/pdf/2404.10490v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.17701v3","updated":"2024-04-16T11:46:39Z","published":"2024-03-26T13:40:18Z","title":"Rotate to Scan: UNet-like Mamba with Triplet SSM Module for Medical\n Image Segmentation","summary":" Image segmentation holds a vital position in the realms of diagnosis and\ntreatment within the medical domain. Traditional convolutional neural networks\n(CNNs) and Transformer models have made significant advancements in this realm,\nbut they still encounter challenges because of limited receptive field or high\ncomputing complexity. Recently, State Space Models (SSMs), particularly Mamba\nand its variants, have demonstrated notable performance in the field of vision.\nHowever, their feature extraction methods may not be sufficiently effective and\nretain some redundant structures, leaving room for parameter reduction.\nMotivated by previous spatial and channel attention methods, we propose Triplet\nMamba-UNet. The method leverages residual VSS Blocks to extract intensive\ncontextual features, while Triplet SSM is employed to fuse features across\nspatial and channel dimensions. 
We conducted experiments on ISIC17, ISIC18,\nCVC-300, CVC-ClinicDB, Kvasir-SEG, CVC-ColonDB, and Kvasir-Instrument datasets,\ndemonstrating the superior segmentation performance of our proposed TM-UNet.\nAdditionally, compared to the previous VM-UNet, our model achieves a one-third\nreduction in parameters.\n","authors":["Hao Tang","Lianglun Cheng","Guoheng Huang","Zhengguang Tan","Junhao Lu","Kaihong Wu"],"pdf_url":"https://arxiv.org/pdf/2403.17701v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10484v1","updated":"2024-04-16T11:44:12Z","published":"2024-04-16T11:44:12Z","title":"AbsGS: Recovering Fine Details for 3D Gaussian Splatting","summary":" The 3D Gaussian Splatting (3D-GS) technique couples 3D Gaussian primitives\nwith differentiable rasterization to achieve high-quality novel view synthesis\nresults while providing advanced real-time rendering performance. However, due\nto a flaw in its adaptive density control strategy, 3D-GS frequently suffers\nfrom an over-reconstruction issue in intricate scenes containing high-frequency\ndetails, leading to blurry rendered images. The underlying reason for the flaw\nremains under-explored. In this work, we present a comprehensive analysis of\nthe cause of the aforementioned artifacts, namely gradient collision, which\nprevents large Gaussians in over-reconstructed regions from splitting. To\naddress this issue, we propose the novel homodirectional view-space positional\ngradient as the criterion for densification. Our strategy efficiently\nidentifies large Gaussians in over-reconstructed regions and recovers fine\ndetails by splitting. We evaluate our proposed method on various challenging\ndatasets. The experimental results indicate that our approach achieves the best\nrendering quality with reduced or similar memory consumption. Our method is\neasy to implement and can be incorporated into a wide variety of recent\nGaussian Splatting-based methods. We will open source our code upon formal\npublication. Our project page is available at:\nhttps://ty424.github.io/AbsGS.github.io/\n","authors":["Zongxin Ye","Wenyu Li","Sidun Liu","Peng Qiao","Yong Dou"],"pdf_url":"https://arxiv.org/pdf/2404.10484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10476v1","updated":"2024-04-16T11:38:44Z","published":"2024-04-16T11:38:44Z","title":"Efficient optimal dispersed Haar-like filters for face detection","summary":" This paper introduces a new dispersed Haar-like filter for efficient face\ndetection. The basic idea for finding the filter is to maximise between-class\nvariance while minimising within-class variance. The proposed filters can be\nconsidered an optimal configuration of dispersed Haar-like filters, i.e.,\nfilters with disjoint black and white parts.\n","authors":["Zeinab Sedaghatjoo","Hossein Hosseinzadeh","Ahmad shirzadi"],"pdf_url":"https://arxiv.org/pdf/2404.10476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02612v2","updated":"2024-04-16T11:35:37Z","published":"2023-11-05T10:01:18Z","title":"GPT-4V-AD: Exploring Grounding Potential of VQA-oriented GPT-4V for\n Zero-shot Anomaly Detection","summary":" Large Multimodal Model (LMM) GPT-4V(ision) endows GPT-4 with visual grounding\ncapabilities, making it possible to handle certain tasks through the Visual\nQuestion Answering (VQA) paradigm. This paper explores the potential of\nVQA-oriented GPT-4V in the recently popular visual Anomaly Detection (AD) and\nis the first to conduct qualitative and quantitative evaluations on the popular\nMVTec AD and VisA datasets. 
Considering that this task requires both\nimage-/pixel-level evaluations, the proposed GPT-4V-AD framework contains three\ncomponents: \\textbf{\\textit{1)}} Granular Region Division, \\textbf{\\textit{2)}}\nPrompt Designing, \\textbf{\\textit{3)}} Text2Segmentation for easy quantitative\nevaluation, and have made some different attempts for comparative analysis. The\nresults show that GPT-4V can achieve certain results in the zero-shot AD task\nthrough a VQA paradigm, such as achieving image-level 77.1/88.0 and pixel-level\n68.0/76.6 AU-ROCs on MVTec AD and VisA datasets, respectively. However, its\nperformance still has a certain gap compared to the state-of-the-art zero-shot\nmethod, \\eg, WinCLIP and CLIP-AD, and further researches are needed. This study\nprovides a baseline reference for the research of VQA-oriented LMM in the\nzero-shot AD task, and we also post several possible future works. Code is\navailable at \\url{https://github.com/zhangzjn/GPT-4V-AD}.\n","authors":["Jiangning Zhang","Haoyang He","Xuhai Chen","Zhucun Xue","Yabiao Wang","Chengjie Wang","Lei Xie","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2311.02612v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10474v1","updated":"2024-04-16T11:29:43Z","published":"2024-04-16T11:29:43Z","title":"Toward a Realistic Benchmark for Out-of-Distribution Detection","summary":" Deep neural networks are increasingly used in a wide range of technologies\nand services, but remain highly susceptible to out-of-distribution (OOD)\nsamples, that is, drawn from a different distribution than the original\ntraining set. A common approach to address this issue is to endow deep neural\nnetworks with the ability to detect OOD samples. Several benchmarks have been\nproposed to design and validate OOD detection techniques. However, many of them\nare based on far-OOD samples drawn from very different distributions, and thus\nlack the complexity needed to capture the nuances of real-world scenarios. In\nthis work, we introduce a comprehensive benchmark for OOD detection, based on\nImageNet and Places365, that assigns individual classes as in-distribution or\nout-of-distribution depending on the semantic similarity with the training set.\nSeveral techniques can be used to determine which classes should be considered\nin-distribution, yielding benchmarks with varying properties. Experimental\nresults on different OOD detection techniques show how their measured efficacy\ndepends on the selected benchmark and how confidence-based techniques may\noutperform classifier-based ones on near-OOD samples.\n","authors":["Pietro Recalcati","Fabio Garcea","Luca Piano","Fabrizio Lamberti","Lia Morra"],"pdf_url":"https://arxiv.org/pdf/2404.10474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11029v4","updated":"2024-04-16T11:24:36Z","published":"2023-06-19T15:46:41Z","title":"RemoteCLIP: A Vision Language Foundation Model for Remote Sensing","summary":" General-purpose foundation models have led to recent breakthroughs in\nartificial intelligence. In remote sensing, self-supervised learning (SSL) and\nMasked Image Modeling (MIM) have been adopted to build foundation models.\nHowever, these models primarily learn low-level features and require annotated\ndata for fine-tuning. Moreover, they are inapplicable for retrieval and\nzero-shot applications due to the lack of language understanding. 
To address\nthese limitations, we propose RemoteCLIP, the first vision-language foundation\nmodel for remote sensing that aims to learn robust visual features with rich\nsemantics and aligned text embeddings for seamless downstream application. To\naddress the scarcity of pre-training data, we leverage data scaling which\nconverts heterogeneous annotations into a unified image-caption data format\nbased on Box-to-Caption (B2C) and Mask-to-Box (M2B) conversion. By further\nincorporating UAV imagery, we produce a 12 $\\times$ larger pretraining dataset\nthan the combination of all available datasets. RemoteCLIP can be applied to a\nvariety of downstream tasks, including zero-shot image classification, linear\nprobing, $\\textit{k}$-NN classification, few-shot classification, image-text\nretrieval, and object counting in remote sensing images. Evaluation on 16\ndatasets, including a newly introduced RemoteCount benchmark to test the object\ncounting ability, shows that RemoteCLIP consistently outperforms baseline\nfoundation models across different model scales. Impressively, RemoteCLIP beats\nthe state-of-the-art method by 9.14% mean recall on the RSITMD dataset and\n8.92% on the RSICD dataset. For zero-shot classification, our RemoteCLIP\noutperforms the CLIP baseline by up to 6.39% average accuracy on 12 downstream\ndatasets. Project website: https://github.com/ChenDelong1999/RemoteCLIP\n","authors":["Fan Liu","Delong Chen","Zhangqingyun Guan","Xiaocong Zhou","Jiale Zhu","Qiaolin Ye","Liyong Fu","Jun Zhou"],"pdf_url":"https://arxiv.org/pdf/2306.11029v4.pdf","comment":"Accepted by IEEE Transactions on Geoscience and Remote Sensing (TGRS)"},{"id":"http://arxiv.org/abs/2404.10454v1","updated":"2024-04-16T10:50:16Z","published":"2024-04-16T10:50:16Z","title":"A Computer Vision-Based Quality Assessment Technique for the automatic\n control of consumables for analytical laboratories","summary":" The rapid growth of the Industry 4.0 paradigm is increasing the pressure to\ndevelop effective automated monitoring systems. Artificial Intelligence (AI) is\na convenient tool to improve the efficiency of industrial processes while\nreducing errors and waste. In fact, it allows the use of real-time data to\nincrease the effectiveness of monitoring systems, minimize errors, make the\nproduction process more sustainable, and save costs. In this paper, a novel\nautomatic monitoring system is proposed in the context of production process of\nplastic consumables used in analysis laboratories, with the aim to increase the\neffectiveness of the control process currently performed by a human operator.\nIn particular, we considered the problem of classifying the presence or absence\nof a transparent anticoagulant substance inside test tubes. Specifically, a\nhand-designed deep network model is used and compared with some\nstate-of-the-art models for its ability to categorize different images of vials\nthat can be either filled with the anticoagulant or empty. Collected results\nindicate that the proposed approach is competitive with state-of-the-art models\nin terms of accuracy. Furthermore, we increased the complexity of the task by\ntraining the models on the ability to discriminate not only the presence or\nabsence of the anticoagulant inside the vial, but also the size of the test\ntube. The analysis performed in the latter scenario confirms the\ncompetitiveness of our approach. 
Moreover, our model is remarkably superior in\nterms of its generalization ability and requires significantly fewer resources.\nThese results suggest the possibility of successfully implementing such a model\nin the production process of a plastic consumables company.\n","authors":["Meriam Zribi","Paolo Pagliuca","Francesca Pitolli"],"pdf_url":"https://arxiv.org/pdf/2404.10454v1.pdf","comment":"31 pages, 13 figures, 10 tables"},{"id":"http://arxiv.org/abs/2404.09342v2","updated":"2024-04-16T10:33:36Z","published":"2024-04-14T19:51:32Z","title":"Face-voice Association in Multilingual Environments (FAME) Challenge\n 2024 Evaluation Plan","summary":" The advancements of technology have led to the use of multimodal systems in\nvarious real-world applications. Among them, the audio-visual systems are one\nof the widely used multimodal systems. In the recent years, associating face\nand voice of a person has gained attention due to presence of unique\ncorrelation between them. The Face-voice Association in Multilingual\nEnvironments (FAME) Challenge 2024 focuses on exploring face-voice association\nunder a unique condition of multilingual scenario. This condition is inspired\nfrom the fact that half of the world's population is bilingual and most often\npeople communicate under multilingual scenario. The challenge uses a dataset\nnamely, Multilingual Audio-Visual (MAV-Celeb) for exploring face-voice\nassociation in multilingual environments. This report provides the details of\nthe challenge, dataset, baselines and task details for the FAME Challenge.\n","authors":["Muhammad Saad Saeed","Shah Nawaz","Muhammad Salman Tahir","Rohan Kumar Das","Muhammad Zaigham Zaheer","Marta Moscati","Markus Schedl","Muhammad Haris Khan","Karthik Nandakumar","Muhammad Haroon Yousaf"],"pdf_url":"https://arxiv.org/pdf/2404.09342v2.pdf","comment":"ACM Multimedia Conference - Grand Challenge"},{"id":"http://arxiv.org/abs/2404.10441v1","updated":"2024-04-16T10:26:57Z","published":"2024-04-16T10:26:57Z","title":"1st Place Solution for ICCV 2023 OmniObject3D Challenge: Sparse-View\n Reconstruction","summary":" In this report, we present the 1st place solution for ICCV 2023 OmniObject3D\nChallenge: Sparse-View Reconstruction. The challenge aims to evaluate\napproaches for novel view synthesis and surface reconstruction using only a few\nposed images of each object. We utilize Pixel-NeRF as the basic model, and\napply depth supervision as well as coarse-to-fine positional encoding. The\nexperiments demonstrate the effectiveness of our approach in improving\nsparse-view reconstruction quality. We ranked first in the final test with a\nPSNR of 25.44614.\n","authors":["Hang Du","Yaping Xue","Weidong Dai","Xuejun Yan","Jingjing Wang"],"pdf_url":"https://arxiv.org/pdf/2404.10441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08504v2","updated":"2024-04-16T10:18:56Z","published":"2024-04-12T14:34:24Z","title":"3D Human Scan With A Moving Event Camera","summary":" Capturing a 3D human body is one of the important tasks in computer vision\nwith a wide range of applications such as virtual reality and sports analysis.\nHowever, conventional frame cameras are limited by their temporal resolution\nand dynamic range, which imposes constraints in real-world application setups.\nEvent cameras have the advantages of high temporal resolution and high dynamic\nrange (HDR), but the development of event-based methods is necessary to handle\ndata with different characteristics. 
This paper proposes a novel event-based\nmethod for 3D pose estimation and human mesh recovery. Prior work on\nevent-based human mesh recovery require frames (images) as well as event data.\nThe proposed method solely relies on events; it carves 3D voxels by moving the\nevent camera around a stationary body, reconstructs the human pose and mesh by\nattenuated rays, and fit statistical body models, preserving high-frequency\ndetails. The experimental results show that the proposed method outperforms\nconventional frame-based methods in the estimation accuracy of both pose and\nbody mesh. We also demonstrate results in challenging situations where a\nconventional camera has motion blur. This is the first to demonstrate\nevent-only human mesh recovery, and we hope that it is the first step toward\nachieving robust and accurate 3D human body scanning from vision sensors.\nhttps://florpeng.github.io/event-based-human-scan/\n","authors":["Kai Kohyama","Shintaro Shiba","Yoshimitsu Aoki"],"pdf_url":"https://arxiv.org/pdf/2404.08504v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10438v1","updated":"2024-04-16T10:04:38Z","published":"2024-04-16T10:04:38Z","title":"The Unreasonable Effectiveness of Pre-Trained Features for Camera Pose\n Refinement","summary":" Pose refinement is an interesting and practically relevant research\ndirection. Pose refinement can be used to (1) obtain a more accurate pose\nestimate from an initial prior (e.g., from retrieval), (2) as pre-processing,\ni.e., to provide a better starting point to a more expensive pose estimator,\n(3) as post-processing of a more accurate localizer. Existing approaches focus\non learning features / scene representations for the pose refinement task. This\ninvolves training an implicit scene representation or learning features while\noptimizing a camera pose-based loss. A natural question is whether training\nspecific features / representations is truly necessary or whether similar\nresults can be already achieved with more generic features. In this work, we\npresent a simple approach that combines pre-trained features with a particle\nfilter and a renderable representation of the scene. Despite its simplicity, it\nachieves state-of-the-art results, demonstrating that one can easily build a\npose refiner without the need for specific training. The code is at\nhttps://github.com/ga1i13o/mcloc_poseref\n","authors":["Gabriele Trivigno","Carlo Masone","Barbara Caputo","Torsten Sattler"],"pdf_url":"https://arxiv.org/pdf/2404.10438v1.pdf","comment":"Accepted to CVPR2024 (Highlight)"},{"id":"http://arxiv.org/abs/2404.05468v3","updated":"2024-04-16T10:02:17Z","published":"2024-04-08T12:46:39Z","title":"Mind-to-Image: Projecting Visual Mental Imagination of the Brain from\n fMRI","summary":" The reconstruction of images observed by subjects from fMRI data collected\nduring visual stimuli has made significant strides in the past decade, thanks\nto the availability of extensive fMRI datasets and advancements in generative\nmodels for image generation. However, the application of visual reconstruction\nhas remained limited. Reconstructing visual imagination presents a greater\nchallenge, with potentially revolutionary applications ranging from aiding\nindividuals with disabilities to verifying witness accounts in court. The\nprimary hurdles in this field are the absence of data collection protocols for\nvisual imagery and the lack of datasets on the subject. 
Traditionally,\nfMRI-to-image relies on data collected from subjects exposed to visual stimuli,\nwhich poses issues for generating visual imagery based on the difference of\nbrain activity between visual stimulation and visual imagery. For the first\ntime, we have compiled a substantial dataset (around 6h of scans) on visual\nimagery along with a proposed data collection protocol. We then train a\nmodified version of an fMRI-to-image model and demonstrate the feasibility of\nreconstructing images from two modes of imagination: from memory and from pure\nimagination. This marks an important step towards creating a technology that\nallow direct reconstruction of visual imagery.\n","authors":["Hugo Caselles-Dupré","Charles Mellerio","Paul Hérent","Alizée Lopez-Persem","Benoit Béranger","Mathieu Soularue","Pierre Fautrel","Gauthier Vernier","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2404.05468v3.pdf","comment":"Pre-print to be updated. Work in progress"},{"id":"http://arxiv.org/abs/2404.10433v1","updated":"2024-04-16T09:56:08Z","published":"2024-04-16T09:56:08Z","title":"Explainable concept mappings of MRI: Revealing the mechanisms underlying\n deep learning-based brain disease classification","summary":" Motivation. While recent studies show high accuracy in the classification of\nAlzheimer's disease using deep neural networks, the underlying learned concepts\nhave not been investigated.\n Goals. To systematically identify changes in brain regions through concepts\nlearned by the deep neural network for model validation.\n Approach. Using quantitative R2* maps we separated Alzheimer's patients\n(n=117) from normal controls (n=219) by using a convolutional neural network\nand systematically investigated the learned concepts using Concept Relevance\nPropagation and compared these results to a conventional region of\ninterest-based analysis.\n Results. In line with established histological findings and the region of\ninterest-based analyses, highly relevant concepts were primarily found in and\nadjacent to the basal ganglia.\n Impact. The identification of concepts learned by deep neural networks for\ndisease classification enables validation of the models and could potentially\nimprove reliability.\n","authors":["Christian Tinauer","Anna Damulina","Maximilian Sackl","Martin Soellradl","Reduan Achtibat","Maximilian Dreyer","Frederik Pahde","Sebastian Lapuschkin","Reinhold Schmidt","Stefan Ropele","Wojciech Samek","Christian Langkammer"],"pdf_url":"https://arxiv.org/pdf/2404.10433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18975v2","updated":"2024-04-16T09:49:15Z","published":"2024-02-29T09:27:40Z","title":"Theoretically Achieving Continuous Representation of Oriented Bounding\n Boxes","summary":" Considerable efforts have been devoted to Oriented Object Detection (OOD).\nHowever, one lasting issue regarding the discontinuity in Oriented Bounding Box\n(OBB) representation remains unresolved, which is an inherent bottleneck for\nextant OOD methods. This paper endeavors to completely solve this issue in a\ntheoretically guaranteed manner and puts an end to the ad-hoc efforts in this\ndirection. Prior studies typically can only address one of the two cases of\ndiscontinuity: rotation and aspect ratio, and often inadvertently introduce\ndecoding discontinuity, e.g. Decoding Incompleteness (DI) and Decoding\nAmbiguity (DA) as discussed in literature. 
Specifically, we propose a novel\nrepresentation method called Continuous OBB (COBB), which can be readily\nintegrated into existing detectors e.g. Faster-RCNN as a plugin. It can\ntheoretically ensure continuity in bounding box regression which to our best\nknowledge, has not been achieved in literature for rectangle-based object\nrepresentation. For fairness and transparency of experiments, we have developed\na modularized benchmark based on the open-source deep learning framework\nJittor's detection toolbox JDet for OOD evaluation. On the popular DOTA\ndataset, by integrating Faster-RCNN as the same baseline model, our new method\noutperforms the peer method Gliding Vertex by 1.13% mAP50 (relative improvement\n1.54%), and 2.46% mAP75 (relative improvement 5.91%), without any tricks.\n","authors":["Zi-Kai Xiao","Guo-Ye Yang","Xue Yang","Tai-Jiang Mu","Junchi Yan","Shi-min Hu"],"pdf_url":"https://arxiv.org/pdf/2402.18975v2.pdf","comment":"17 pages, 12 tables, 8 figures. Accepted by CVPR'24. Code:\n https://github.com/514flowey/JDet-COBB"},{"id":"http://arxiv.org/abs/2404.10411v1","updated":"2024-04-16T09:28:54Z","published":"2024-04-16T09:28:54Z","title":"Camera clustering for scalable stream-based active distillation","summary":" We present a scalable framework designed to craft efficient lightweight\nmodels for video object detection utilizing self-training and knowledge\ndistillation techniques. We scrutinize methodologies for the ideal selection of\ntraining images from video streams and the efficacy of model sharing across\nnumerous cameras. By advocating for a camera clustering methodology, we aim to\ndiminish the requisite number of models for training while augmenting the\ndistillation dataset. The findings affirm that proper camera clustering notably\namplifies the accuracy of distilled models, eclipsing the methodologies that\nemploy distinct models for each camera or a universal model trained on the\naggregate camera data.\n","authors":["Dani Manjah","Davide Cacciarelli","Christophe De Vleeschouwer","Benoit Macq"],"pdf_url":"https://arxiv.org/pdf/2404.10411v1.pdf","comment":"This manuscript is currently under review at IEEE Transactions on\n Circuits and Systems for Video Technology"},{"id":"http://arxiv.org/abs/2404.10408v1","updated":"2024-04-16T09:19:23Z","published":"2024-04-16T09:19:23Z","title":"Adversarial Identity Injection for Semantic Face Image Synthesis","summary":" Nowadays, deep learning models have reached incredible performance in the\ntask of image generation. Plenty of literature works address the task of face\ngeneration and editing, with human and automatic systems that struggle to\ndistinguish what's real from generated. Whereas most systems reached excellent\nvisual generation quality, they still face difficulties in preserving the\nidentity of the starting input subject. Among all the explored techniques,\nSemantic Image Synthesis (SIS) methods, whose goal is to generate an image\nconditioned on a semantic segmentation mask, are the most promising, even\nthough preserving the perceived identity of the input subject is not their main\nconcern. Therefore, in this paper, we investigate the problem of identity\npreservation in face image generation and present an SIS architecture that\nexploits a cross-attention mechanism to merge identity, style, and semantic\nfeatures to generate faces whose identities are as similar as possible to the\ninput ones. 
Experimental results reveal that the proposed method is not only\nsuitable for preserving the identity but is also effective in the face\nrecognition adversarial attack, i.e. hiding a second identity in the generated\nfaces.\n","authors":["Giuseppe Tarollo","Tomaso Fontanini","Claudio Ferrari","Guido Borghi","Andrea Prati"],"pdf_url":"https://arxiv.org/pdf/2404.10408v1.pdf","comment":"Paper accepted at CVPR 2024 Biometrics Workshop"},{"id":"http://arxiv.org/abs/2404.10407v1","updated":"2024-04-16T09:19:11Z","published":"2024-04-16T09:19:11Z","title":"Comprehensive Survey of Model Compression and Speed up for Vision\n Transformers","summary":" Vision Transformers (ViT) have marked a paradigm shift in computer vision,\noutperforming state-of-the-art models across diverse tasks. However, their\npractical deployment is hampered by high computational and memory demands. This\nstudy addresses the challenge by evaluating four primary model compression\ntechniques: quantization, low-rank approximation, knowledge distillation, and\npruning. We methodically analyze and compare the efficacy of these techniques\nand their combinations in optimizing ViTs for resource-constrained\nenvironments. Our comprehensive experimental evaluation demonstrates that these\nmethods facilitate a balanced compromise between model accuracy and\ncomputational efficiency, paving the way for wider application in edge\ncomputing devices.\n","authors":["Feiyang Chen","Ziqian Luo","Lisang Zhou","Xueting Pan","Ying Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.10407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10405v1","updated":"2024-04-16T09:12:16Z","published":"2024-04-16T09:12:16Z","title":"Integration of Self-Supervised BYOL in Semi-Supervised Medical Image\n Recognition","summary":" Image recognition techniques heavily rely on abundant labeled data,\nparticularly in medical contexts. Addressing the challenges associated with\nobtaining labeled data has led to the prominence of self-supervised learning\nand semi-supervised learning, especially in scenarios with limited annotated\ndata. In this paper, we proposed an innovative approach by integrating\nself-supervised learning into semi-supervised models to enhance medical image\nrecognition. Our methodology commences with pre-training on unlabeled data\nutilizing the BYOL method. Subsequently, we merge pseudo-labeled and labeled\ndatasets to construct a neural network classifier, refining it through\niterative fine-tuning. Experimental results on three different datasets\ndemonstrate that our approach optimally leverages unlabeled data, outperforming\nexisting methods in terms of accuracy for medical image recognition.\n","authors":["Hao Feng","Yuanzhe Jia","Ruijia Xu","Mukesh Prasad","Ali Anaissi","Ali Braytee"],"pdf_url":"https://arxiv.org/pdf/2404.10405v1.pdf","comment":"Accepted by ICCS 2024"},{"id":"http://arxiv.org/abs/2205.10120v7","updated":"2024-04-16T09:03:32Z","published":"2022-05-17T14:00:58Z","title":"Privacy Preserving Image Registration","summary":" Image registration is a key task in medical imaging applications, allowing to\nrepresent medical images in a common spatial reference frame. Current\napproaches to image registration are generally based on the assumption that the\ncontent of the images is usually accessible in clear form, from which the\nspatial transformation is subsequently estimated. 
This common assumption may\nnot be met in practical applications, since the sensitive nature of medical\nimages may ultimately require their analysis under privacy constraints,\npreventing to openly share the image content.In this work, we formulate the\nproblem of image registration under a privacy preserving regime, where images\nare assumed to be confidential and cannot be disclosed in clear. We derive our\nprivacy preserving image registration framework by extending classical\nregistration paradigms to account for advanced cryptographic tools, such as\nsecure multi-party computation and homomorphic encryption, that enable the\nexecution of operations without leaking the underlying data. To overcome the\nproblem of performance and scalability of cryptographic tools in high\ndimensions, we propose several techniques to optimize the image registration\noperations by using gradient approximations, and by revisiting the use of\nhomomorphic encryption trough packing, to allow the efficient encryption and\nmultiplication of large matrices. We demonstrate our privacy preserving\nframework in linear and non-linear registration problems, evaluating its\naccuracy and scalability with respect to standard, non-private counterparts.\nOur results show that privacy preserving image registration is feasible and can\nbe adopted in sensitive medical imaging applications.\n","authors":["Riccardo Taiello","Melek Önen","Francesco Capano","Olivier Humbert","Marco Lorenzi"],"pdf_url":"https://arxiv.org/pdf/2205.10120v7.pdf","comment":"v4 Accepted at Medical Image Computing and Computer Assisted\n Intervention (2022) 130-140"},{"id":"http://arxiv.org/abs/2404.10394v1","updated":"2024-04-16T08:52:42Z","published":"2024-04-16T08:52:42Z","title":"Portrait3D: Text-Guided High-Quality 3D Portrait Generation Using\n Pyramid Representation and GANs Prior","summary":" Existing neural rendering-based text-to-3D-portrait generation methods\ntypically make use of human geometry prior and diffusion models to obtain\nguidance. However, relying solely on geometry information introduces issues\nsuch as the Janus problem, over-saturation, and over-smoothing. We present\nPortrait3D, a novel neural rendering-based framework with a novel joint\ngeometry-appearance prior to achieve text-to-3D-portrait generation that\novercomes the aforementioned issues. To accomplish this, we train a 3D portrait\ngenerator, 3DPortraitGAN-Pyramid, as a robust prior. This generator is capable\nof producing 360{\\deg} canonical 3D portraits, serving as a starting point for\nthe subsequent diffusion-based generation process. To mitigate the \"grid-like\"\nartifact caused by the high-frequency information in the feature-map-based 3D\nrepresentation commonly used by most 3D-aware GANs, we integrate a novel\npyramid tri-grid 3D representation into 3DPortraitGAN-Pyramid. To generate 3D\nportraits from text, we first project a randomly generated image aligned with\nthe given prompt into the pre-trained 3DPortraitGAN-Pyramid's latent space. The\nresulting latent code is then used to synthesize a pyramid tri-grid. Beginning\nwith the obtained pyramid tri-grid, we use score distillation sampling to\ndistill the diffusion model's knowledge into the pyramid tri-grid. Following\nthat, we utilize the diffusion model to refine the rendered images of the 3D\nportrait and then use these refined images as training data to further optimize\nthe pyramid tri-grid, effectively eliminating issues with unrealistic color and\nunnatural artifacts. 
Our experimental results show that Portrait3D can produce\nrealistic, high-quality, and canonical 3D portraits that align with the prompt.\n","authors":["Yiqian Wu","Hao Xu","Xiangjun Tang","Xien Chen","Siyu Tang","Zhebin Zhang","Chen Li","Xiaogang Jin"],"pdf_url":"https://arxiv.org/pdf/2404.10394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10387v1","updated":"2024-04-16T08:39:29Z","published":"2024-04-16T08:39:29Z","title":"CNN-based explanation ensembling for dataset, representation and\n explanations evaluation","summary":" Explainable Artificial Intelligence has gained significant attention due to\nthe widespread use of complex deep learning models in high-stake domains such\nas medicine, finance, and autonomous cars. However, different explanations\noften present different aspects of the model's behavior. In this research\nmanuscript, we explore the potential of ensembling explanations generated by\ndeep classification models using convolutional model. Through experimentation\nand analysis, we aim to investigate the implications of combining explanations\nto uncover a more coherent and reliable patterns of the model's behavior,\nleading to the possibility of evaluating the representation learned by the\nmodel. With our method, we can uncover problems of under-representation of\nimages in a certain class. Moreover, we discuss other side benefits like\nfeatures' reduction by replacing the original image with its explanations\nresulting in the removal of some sensitive information. Through the use of\ncarefully selected evaluation metrics from the Quantus library, we demonstrated\nthe method's superior performance in terms of Localisation and Faithfulness,\ncompared to individual explanations.\n","authors":["Weronika Hryniewska-Guzik","Luca Longo","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2404.10387v1.pdf","comment":"accepted at 2nd World Conference on eXplainable Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2404.10383v1","updated":"2024-04-16T08:25:36Z","published":"2024-04-16T08:25:36Z","title":"Learning to Score Sign Language with Two-stage Method","summary":" Human action recognition and performance assessment have been hot research\ntopics in recent years. Recognition problems have mature solutions in the field\nof sign language, but past research in performance analysis has focused on\ncompetitive sports and medical training, overlooking the scoring assessment\n,which is an important part of sign language teaching digitalization. In this\npaper, we analyze the existing technologies for performance assessment and\nadopt methods that perform well in human pose reconstruction tasks combined\nwith motion rotation embedded expressions, proposing a two-stage sign language\nperformance evaluation pipeline. 
Our analysis shows that choosing\nreconstruction tasks in the first stage can provide more expressive features,\nand using smoothing methods can provide an effective reference for assessment.\nExperiments show that our method provides good score feedback mechanisms and\nhigh consistency with professional assessments compared to end-to-end\nevaluations.\n","authors":["Wen Hongli","Xu Yang"],"pdf_url":"https://arxiv.org/pdf/2404.10383v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.10378v1","updated":"2024-04-16T08:15:10Z","published":"2024-04-16T08:15:10Z","title":"Second Edition FRCSyn Challenge at CVPR 2024: Face Recognition Challenge\n in the Era of Synthetic Data","summary":" Synthetic data is gaining increasing relevance for training machine learning\nmodels. This is mainly motivated due to several factors such as the lack of\nreal data and intra-class variability, time and errors produced in manual\nlabeling, and in some cases privacy concerns, among others. This paper presents\nan overview of the 2nd edition of the Face Recognition Challenge in the Era of\nSynthetic Data (FRCSyn) organized at CVPR 2024. FRCSyn aims to investigate the\nuse of synthetic data in face recognition to address current technological\nlimitations, including data privacy concerns, demographic biases,\ngeneralization to novel scenarios, and performance constraints in challenging\nsituations such as aging, pose variations, and occlusions. Unlike the 1st\nedition, in which synthetic data from DCFace and GANDiffFace methods was only\nallowed to train face recognition systems, in this 2nd edition we propose new\nsub-tasks that allow participants to explore novel face generative methods. The\noutcomes of the 2nd FRCSyn Challenge, along with the proposed experimental\nprotocol and benchmarking contribute significantly to the application of\nsynthetic data to face recognition.\n","authors":["Ivan DeAndres-Tame","Ruben Tolosana","Pietro Melzi","Ruben Vera-Rodriguez","Minchul Kim","Christian Rathgeb","Xiaoming Liu","Aythami Morales","Julian Fierrez","Javier Ortega-Garcia","Zhizhou Zhong","Yuge Huang","Yuxi Mi","Shouhong Ding","Shuigeng Zhou","Shuai He","Lingzhi Fu","Heng Cong","Rongyu Zhang","Zhihong Xiao","Evgeny Smirnov","Anton Pimenov","Aleksei Grigorev","Denis Timoshenko","Kaleb Mesfin Asfaw","Cheng Yaw Low","Hao Liu","Chuyi Wang","Qing Zuo","Zhixiang He","Hatef Otroshi Shahreza","Anjith George","Alexander Unnervik","Parsa Rahimi","Sébastien Marcel","Pedro C. Neto","Marco Huber","Jan Niklas Kolf","Naser Damer","Fadi Boutros","Jaime S. Cardoso","Ana F. Sequeira","Andrea Atzori","Gianni Fenu","Mirko Marras","Vitomir Štruc","Jiang Yu","Zhangjie Li","Jichun Li","Weisong Zhao","Zhen Lei","Xiangyu Zhu","Xiao-Yu Zhang","Bernardo Biesseck","Pedro Vidal","Luiz Coelho","Roger Granada","David Menotti"],"pdf_url":"https://arxiv.org/pdf/2404.10378v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2311.10476"},{"id":"http://arxiv.org/abs/2404.10370v1","updated":"2024-04-16T08:08:47Z","published":"2024-04-16T08:08:47Z","title":"Know Yourself Better: Diverse Discriminative Feature Learning Improves\n Open Set Recognition","summary":" Open set recognition (OSR) is a critical aspect of machine learning,\naddressing the challenge of detecting novel classes during inference. 
Within\nthe realm of deep learning, neural classifiers trained on a closed set of data\ntypically struggle to identify novel classes, leading to erroneous predictions.\nTo address this issue, various heuristic methods have been proposed, allowing\nmodels to express uncertainty by stating \"I don't know.\" However, a gap in the\nliterature remains, as there has been limited exploration of the underlying\nmechanisms of these methods. In this paper, we conduct an analysis of open set\nrecognition methods, focusing on the aspect of feature diversity. Our research\nreveals a significant correlation between learning diverse discriminative\nfeatures and enhancing OSR performance. Building on this insight, we propose a\nnovel OSR approach that leverages the advantages of feature diversity. The\nefficacy of our method is substantiated through rigorous evaluation on a\nstandard OSR testbench, demonstrating a substantial improvement over\nstate-of-the-art methods.\n","authors":["Jiawen Xu"],"pdf_url":"https://arxiv.org/pdf/2404.10370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08514v2","updated":"2024-04-16T07:56:01Z","published":"2024-04-12T14:54:26Z","title":"NIR-Assisted Image Denoising: A Selective Fusion Approach and A\n Real-World Benchmark Datase","summary":" Despite the significant progress in image denoising, it is still challenging\nto restore fine-scale details while removing noise, especially in extremely\nlow-light environments. Leveraging near-infrared (NIR) images to assist visible\nRGB image denoising shows the potential to address this issue, becoming a\npromising technology. Nonetheless, existing works still struggle with taking\nadvantage of NIR information effectively for real-world image denoising, due to\nthe content inconsistency between NIR-RGB images and the scarcity of real-world\npaired datasets. To alleviate the problem, we propose an efficient Selective\nFusion Module (SFM), which can be plug-and-played into the advanced denoising\nnetworks to merge the deep NIR-RGB features. Specifically, we sequentially\nperform the global and local modulation for NIR and RGB features, and then\nintegrate the two modulated features. Furthermore, we present a Real-world\nNIR-Assisted Image Denoising (Real-NAID) dataset, which covers diverse\nscenarios as well as various noise levels. Extensive experiments on both\nsynthetic and our real-world datasets demonstrate that the proposed method\nachieves better results than state-of-the-art ones.\n","authors":["Rongjian Xu","Zhilu Zhang","Renlong Wu","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.08514v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2311.12815v2","updated":"2024-04-16T07:51:12Z","published":"2023-09-24T02:57:56Z","title":"Proposing an intelligent mesh smoothing method with graph neural\n networks","summary":" In CFD, mesh smoothing methods are commonly utilized to refine the mesh\nquality to achieve high-precision numerical simulations. Specifically,\noptimization-based smoothing is used for high-quality mesh smoothing, but it\nincurs significant computational overhead. Pioneer works improve its smoothing\nefficiency by adopting supervised learning to learn smoothing methods from\nhigh-quality meshes. However, they pose difficulty in smoothing the mesh nodes\nwith varying degrees and also need data augmentation to address the node input\nsequence problem. Additionally, the required labeled high-quality meshes\nfurther limit the applicability of the proposed method. 
In this paper, we\npresent GMSNet, a lightweight neural network model for intelligent mesh\nsmoothing. GMSNet adopts graph neural networks to extract features of the\nnode's neighbors and output the optimal node position. During smoothing, we\nalso introduce a fault-tolerance mechanism to prevent GMSNet from generating\nnegative volume elements. With a lightweight model, GMSNet can effectively\nsmoothing mesh nodes with varying degrees and remain unaffected by the order of\ninput data. A novel loss function, MetricLoss, is also developed to eliminate\nthe need for high-quality meshes, which provides a stable and rapid convergence\nduring training. We compare GMSNet with commonly used mesh smoothing methods on\ntwo-dimensional triangle meshes. The experimental results show that GMSNet\nachieves outstanding mesh smoothing performances with 5% model parameters of\nthe previous model, and attains 13.56 times faster than optimization-based\nsmoothing.\n","authors":["Zhichao Wang","Xinhai Chen","Junjun Yan","Jie Liu"],"pdf_url":"https://arxiv.org/pdf/2311.12815v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16016v3","updated":"2024-04-16T07:47:19Z","published":"2023-06-28T08:44:00Z","title":"Positive Label Is All You Need for Multi-Label Classification","summary":" Multi-label classification (MLC) faces challenges from label noise in\ntraining data due to annotating diverse semantic labels for each image. Current\nmethods mainly target identifying and correcting label mistakes using trained\nMLC models, but still struggle with persistent noisy labels during training,\nresulting in imprecise recognition and reduced performance. Our paper addresses\nlabel noise in MLC by introducing a positive and unlabeled multi-label\nclassification (PU-MLC) method. To counteract noisy labels, we directly discard\nnegative labels, focusing on the abundance of negative labels and the origin of\nmost noisy labels. PU-MLC employs positive-unlabeled learning, training the\nmodel with only positive labels and unlabeled data. The method incorporates\nadaptive re-balance factors and temperature coefficients in the loss function\nto address label distribution imbalance and prevent over-smoothing of\nprobabilities during training. Additionally, we introduce a local-global\nconvolution module to capture both local and global dependencies in the image\nwithout requiring backbone retraining. PU-MLC proves effective on MLC and MLC\nwith partial labels (MLC-PL) tasks, demonstrating significant improvements on\nMS-COCO and PASCAL VOC datasets with fewer annotations. Code is available at:\nhttps://github.com/TAKELAMAG/PU-MLC.\n","authors":["Zhixiang Yuan","Kaixin Zhang","Tao Huang"],"pdf_url":"https://arxiv.org/pdf/2306.16016v3.pdf","comment":"ICME 2024"},{"id":"http://arxiv.org/abs/2404.10358v1","updated":"2024-04-16T07:46:55Z","published":"2024-04-16T07:46:55Z","title":"Improving Bracket Image Restoration and Enhancement with Flow-guided\n Alignment and Enhanced Feature Aggregation","summary":" In this paper, we address the Bracket Image Restoration and Enhancement\n(BracketIRE) task using a novel framework, which requires restoring a\nhigh-quality high dynamic range (HDR) image from a sequence of noisy, blurred,\nand low dynamic range (LDR) multi-exposure RAW inputs. To overcome this\nchallenge, we present the IREANet, which improves the multiple exposure\nalignment and aggregation with a Flow-guide Feature Alignment Module (FFAM) and\nan Enhanced Feature Aggregation Module (EFAM). 
Specifically, the proposed FFAM\nincorporates the inter-frame optical flow as guidance to facilitate the\ndeformable alignment and spatial attention modules for better feature\nalignment. The EFAM further employs the proposed Enhanced Residual Block (ERB)\nas a foundational component, wherein a unidirectional recurrent network\naggregates the aligned temporal features to better reconstruct the results. To\nimprove model generalization and performance, we additionally employ the Bayer\npreserving augmentation (BayerAug) strategy to augment the multi-exposure RAW\ninputs. Our experimental evaluations demonstrate that the proposed IREANet\nshows state-of-the-art performance compared with previous methods.\n","authors":["Wenjie Lin","Zhen Liu","Chengzhi Jiang","Mingyan Han","Ting Jiang","Shuaicheng Liu"],"pdf_url":"https://arxiv.org/pdf/2404.10358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10357v1","updated":"2024-04-16T07:44:52Z","published":"2024-04-16T07:44:52Z","title":"Optimization of Prompt Learning via Multi-Knowledge Representation for\n Vision-Language Models","summary":" Vision-Language Models (VLMs), such as CLIP, play a foundational role in\nvarious cross-modal applications. To fully leverage VLMs' potential in adapting\nto downstream tasks, context optimization methods like Prompt Tuning are\nessential. However, one key limitation is the lack of diversity in prompt\ntemplates, whether they are hand-crafted or learned through additional modules.\nThis limitation restricts the capabilities of pretrained VLMs and can result in\nincorrect predictions in downstream tasks. To address this challenge, we\npropose Context Optimization with Multi-Knowledge Representation (CoKnow), a\nframework that enhances Prompt Learning for VLMs with rich contextual\nknowledge. To facilitate CoKnow during inference, we trained lightweight\nsemantic knowledge mappers, which are capable of generating Multi-Knowledge\nRepresentation for an input image without requiring additional priors.\nExperimentally, We conducted extensive experiments on 11 publicly available\ndatasets, demonstrating that CoKnow outperforms a series of previous methods.\nWe will make all resources open-source: https://github.com/EMZucas/CoKnow.\n","authors":["Enming Zhang","Bingke zhu","Yingying Chen","Qinghai Miao","Ming Tang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.10357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06207v2","updated":"2024-04-16T07:41:49Z","published":"2024-04-09T10:56:46Z","title":"Leveraging edge detection and neural networks for better UAV\n localization","summary":" We propose a novel method for geolocalizing Unmanned Aerial Vehicles (UAVs)\nin environments lacking Global Navigation Satellite Systems (GNSS). Current\nstate-of-the-art techniques employ an offline-trained encoder to generate a\nvector representation (embedding) of the UAV's current view, which is then\ncompared with pre-computed embeddings of geo-referenced images to determine the\nUAV's position. Here, we demonstrate that the performance of these methods can\nbe significantly enhanced by preprocessing the images to extract their edges,\nwhich exhibit robustness to seasonal and illumination variations. Furthermore,\nwe establish that utilizing edges enhances resilience to orientation and\naltitude inaccuracies. Additionally, we introduce a confidence criterion for\nlocalization. 
Our findings are substantiated through synthetic experiments.\n","authors":["Theo Di Piazza","Enric Meinhardt-Llopis","Gabriele Facciolo","Benedicte Bascle","Corentin Abgrall","Jean-Clement Devaux"],"pdf_url":"https://arxiv.org/pdf/2404.06207v2.pdf","comment":"Accepted for publication in IGARSS2024. 4 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2401.09450v2","updated":"2024-04-16T07:35:41Z","published":"2023-12-22T11:15:16Z","title":"Joining Forces for Pathology Diagnostics with AI Assistance: The EMPAIA\n Initiative","summary":" Over the past decade, artificial intelligence (AI) methods in pathology have\nadvanced substantially. However, integration into routine clinical practice has\nbeen slow due to numerous challenges, including technical and regulatory\nhurdles in translating research results into clinical diagnostic products and\nthe lack of standardized interfaces. The open and vendor-neutral EMPAIA\ninitiative addresses these challenges. Here, we provide an overview of EMPAIA's\nachievements and lessons learned. EMPAIA integrates various stakeholders of the\npathology AI ecosystem, i.e., pathologists, computer scientists, and industry.\nIn close collaboration, we developed technical interoperability standards,\nrecommendations for AI testing and product development, and explainability\nmethods. We implemented the modular and open-source EMPAIA platform and\nsuccessfully integrated 14 AI-based image analysis apps from 8 different\nvendors, demonstrating how different apps can use a single standardized\ninterface. We prioritized requirements and evaluated the use of AI in real\nclinical settings with 14 different pathology laboratories in Europe and Asia.\nIn addition to technical developments, we created a forum for all stakeholders\nto share information and experiences on digital pathology and AI. Commercial,\nclinical, and academic stakeholders can now adopt EMPAIA's common open-source\ninterfaces, providing a unique opportunity for large-scale standardization and\nstreamlining of processes. Further efforts are needed to effectively and\nbroadly establish AI assistance in routine laboratory use. To this end, a\nsustainable infrastructure, the non-profit association EMPAIA International,\nhas been established to continue standardization and support broad\nimplementation and advocacy for an AI-assisted digital pathology future.\n","authors":["Norman Zerbe","Lars Ole Schwen","Christian Geißler","Katja Wiesemann","Tom Bisson","Peter Boor","Rita Carvalho","Michael Franz","Christoph Jansen","Tim-Rasmus Kiehl","Björn Lindequist","Nora Charlotte Pohlan","Sarah Schmell","Klaus Strohmenger","Falk Zakrzewski","Markus Plass","Michael Takla","Tobias Küster","André Homeyer","Peter Hufnagl"],"pdf_url":"https://arxiv.org/pdf/2401.09450v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10343v1","updated":"2024-04-16T07:26:20Z","published":"2024-04-16T07:26:20Z","title":"The Ninth NTIRE 2024 Efficient Super-Resolution Challenge Report","summary":" This paper provides a comprehensive review of the NTIRE 2024 challenge,\nfocusing on efficient single-image super-resolution (ESR) solutions and their\noutcomes. The task of this challenge is to super-resolve an input image with a\nmagnification factor of x4 based on pairs of low and corresponding\nhigh-resolution images. 
The primary objective is to develop networks that\noptimize various aspects such as runtime, parameters, and FLOPs, while still\nmaintaining a peak signal-to-noise ratio (PSNR) of approximately 26.90 dB on\nthe DIV2K_LSDIR_valid dataset and 26.99 dB on the DIV2K_LSDIR_test dataset. In\naddition, this challenge has 4 tracks including the main track (overall\nperformance), sub-track 1 (runtime), sub-track 2 (FLOPs), and sub-track 3\n(parameters). In the main track, all three metrics (ie runtime, FLOPs, and\nparameter count) were considered. The ranking of the main track is calculated\nbased on a weighted sum-up of the scores of all other sub-tracks. In sub-track\n1, the practical runtime performance of the submissions was evaluated, and the\ncorresponding score was used to determine the ranking. In sub-track 2, the\nnumber of FLOPs was considered. The score calculated based on the corresponding\nFLOPs was used to determine the ranking. In sub-track 3, the number of\nparameters was considered. The score calculated based on the corresponding\nparameters was used to determine the ranking. RLFN is set as the baseline for\nefficiency measurement. The challenge had 262 registered participants, and 34\nteams made valid submissions. They gauge the state-of-the-art in efficient\nsingle-image super-resolution. To facilitate the reproducibility of the\nchallenge and enable other researchers to build upon these findings, the code\nand the pre-trained model of validated solutions are made publicly available at\nhttps://github.com/Amazingren/NTIRE2024_ESR/.\n","authors":["Bin Ren","Yawei Li","Nancy Mehta","Radu Timofte","Hongyuan Yu","Cheng Wan","Yuxin Hong","Bingnan Han","Zhuoyuan Wu","Yajun Zou","Yuqing Liu","Jizhe Li","Keji He","Chao Fan","Heng Zhang","Xiaolin Zhang","Xuanwu Yin","Kunlong Zuo","Bohao Liao","Peizhe Xia","Long Peng","Zhibo Du","Xin Di","Wangkai Li","Yang Wang","Wei Zhai","Renjing Pei","Jiaming Guo","Songcen Xu","Yang Cao","Zhengjun Zha","Yan Wang","Yi Liu","Qing Wang","Gang Zhang","Liou Zhang","Shijie Zhao","Long Sun","Jinshan Pan","Jiangxin Dong","Jinhui Tang","Xin Liu","Min Yan","Qian Wang","Menghan Zhou","Yiqiang Yan","Yixuan Liu","Wensong Chan","Dehua Tang","Dong Zhou","Li Wang","Lu Tian","Barsoum Emad","Bohan Jia","Junbo Qiao","Yunshuai Zhou","Yun Zhang","Wei Li","Shaohui Lin","Shenglong Zhou","Binbin Chen","Jincheng Liao","Suiyi Zhao","Zhao Zhang","Bo Wang","Yan Luo","Yanyan Wei","Feng Li","Mingshen Wang","Yawei Li","Jinhan Guan","Dehua Hu","Jiawei Yu","Qisheng Xu","Tao Sun","Long Lan","Kele Xu","Xin Lin","Jingtong Yue","Lehan Yang","Shiyi Du","Lu Qi","Chao Ren","Zeyu Han","Yuhan Wang","Chaolin Chen","Haobo Li","Mingjun Zheng","Zhongbao Yang","Lianhong Song","Xingzhuo Yan","Minghan Fu","Jingyi Zhang","Baiang Li","Qi Zhu","Xiaogang Xu","Dan Guo","Chunle Guo","Jiadi Chen","Huanhuan Long","Chunjiang Duanmu","Xiaoyan Lei","Jie Liu","Weilin Jia","Weifeng Cao","Wenlong Zhang","Yanyu Mao","Ruilong Guo","Nihao Zhang","Qian Wang","Manoj Pandey","Maksym Chernozhukov","Giang Le","Shuli Cheng","Hongyuan Wang","Ziyan Wei","Qingting Tang","Liejun Wang","Yongming Li","Yanhui Guo","Hao Xu","Akram Khatami-Rizi","Ahmad Mahmoudi-Aznaveh","Chih-Chung Hsu","Chia-Ming Lee","Yi-Shiuan Chou","Amogh Joshi","Nikhil Akalwadi","Sampada Malagi","Palani Yashaswini","Chaitra Desai","Ramesh Ashok Tabib","Ujwala Patil","Uma Mudenagudi"],"pdf_url":"https://arxiv.org/pdf/2404.10343v1.pdf","comment":"The report paper of NTIRE2024 Efficient Super-resolution, accepted by\n 
CVPRW2024"},{"id":"http://arxiv.org/abs/2404.10342v1","updated":"2024-04-16T07:25:17Z","published":"2024-04-16T07:25:17Z","title":"Referring Flexible Image Restoration","summary":" In reality, images often exhibit multiple degradations, such as rain and fog\nat night (triple degradations). However, in many cases, individuals may not\nwant to remove all degradations, for instance, a blurry lens revealing a\nbeautiful snowy landscape (double degradations). In such scenarios, people may\nonly desire to deblur. These situations and requirements shed light on a new\nchallenge in image restoration, where a model must perceive and remove specific\ndegradation types specified by human commands in images with multiple\ndegradations. We term this task Referring Flexible Image Restoration (RFIR). To\naddress this, we first construct a large-scale synthetic dataset called RFIR,\ncomprising 153,423 samples with the degraded image, text prompt for specific\ndegradation removal and restored image. RFIR consists of five basic degradation\ntypes: blur, rain, haze, low light and snow while six main sub-categories are\nincluded for varying degrees of degradation removal. To tackle the challenge,\nwe propose a novel transformer-based multi-task model named TransRFIR, which\nsimultaneously perceives degradation types in the degraded image and removes\nspecific degradation upon text prompt. TransRFIR is based on two devised\nattention modules, Multi-Head Agent Self-Attention (MHASA) and Multi-Head Agent\nCross Attention (MHACA), where MHASA and MHACA introduce the agent token and\nreach the linear complexity, achieving lower computation cost than vanilla\nself-attention and cross-attention and obtaining competitive performances. Our\nTransRFIR achieves state-of-the-art performances compared with other\ncounterparts and is proven as an effective architecture for image restoration.\nWe release our project at https://github.com/GuanRunwei/FIR-CP.\n","authors":["Runwei Guan","Rongsheng Hu","Zhuhao Zhou","Tianlang Xue","Ka Lok Man","Jeremy Smith","Eng Gee Lim","Weiping Ding","Yutao Yue"],"pdf_url":"https://arxiv.org/pdf/2404.10342v1.pdf","comment":"15 pages, 19 figures"},{"id":"http://arxiv.org/abs/2404.10335v1","updated":"2024-04-16T07:19:52Z","published":"2024-04-16T07:19:52Z","title":"Efficiently Adversarial Examples Generation for Visual-Language Models\n under Targeted Transfer Scenarios using Diffusion Models","summary":" Targeted transfer-based attacks involving adversarial examples pose a\nsignificant threat to large visual-language models (VLMs). However, the\nstate-of-the-art (SOTA) transfer-based attacks incur high costs due to\nexcessive iteration counts. Furthermore, the generated adversarial examples\nexhibit pronounced adversarial noise and demonstrate limited efficacy in\nevading defense methods such as DiffPure. To address these issues, inspired by\nscore matching, we introduce AdvDiffVLM, which utilizes diffusion models to\ngenerate natural, unrestricted adversarial examples. Specifically, AdvDiffVLM\nemploys Adaptive Ensemble Gradient Estimation to modify the score during the\ndiffusion model's reverse generation process, ensuring the adversarial examples\nproduced contain natural adversarial semantics and thus possess enhanced\ntransferability. Simultaneously, to enhance the quality of adversarial examples\nfurther, we employ the GradCAM-guided Mask method to disperse adversarial\nsemantics throughout the image, rather than concentrating them in a specific\narea. 
Experimental results demonstrate that our method achieves a speedup\nranging from 10X to 30X compared to existing transfer-based attack methods,\nwhile maintaining superior quality of adversarial examples. Additionally, the\ngenerated adversarial examples possess strong transferability and exhibit\nincreased robustness against adversarial defense methods. Notably, AdvDiffVLM\ncan successfully attack commercial VLMs, including GPT-4V, in a black-box\nmanner.\n","authors":["Qi Guo","Shanmin Pang","Xiaojun Jia","Qing Guo"],"pdf_url":"https://arxiv.org/pdf/2404.10335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10332v1","updated":"2024-04-16T07:14:32Z","published":"2024-04-16T07:14:32Z","title":"Prescribing the Right Remedy: Mitigating Hallucinations in Large\n Vision-Language Models via Targeted Instruction Tuning","summary":" Despite achieving outstanding performance on various cross-modal tasks,\ncurrent large vision-language models (LVLMs) still suffer from hallucination\nissues, manifesting as inconsistencies between their generated responses and\nthe corresponding images. Prior research has implicated that the low quality of\ninstruction data, particularly the skewed balance between positive and negative\nsamples, is a significant contributor to model hallucinations. Recently,\nresearchers have proposed high-quality instruction datasets, such as\nLRV-Instruction, to mitigate model hallucination. Nonetheless, our\ninvestigation reveals that hallucinatory concepts from different LVLMs exhibit\nspecificity, i.e. the distribution of hallucinatory concepts varies\nsignificantly across models. Existing datasets did not consider the\nhallucination specificity of different models in the design processes, thereby\ndiminishing their efficacy in mitigating model hallucination. In this paper, we\npropose a targeted instruction data generation framework named DFTG that\ntailored to the hallucination specificity of different models. Concretely, DFTG\nconsists of two stages: hallucination diagnosis, which extracts the necessary\ninformation from the model's responses and images for hallucination diagnosis;\nand targeted data generation, which generates targeted instruction data based\non diagnostic results. The experimental results on hallucination benchmarks\ndemonstrate that the targeted instruction data generated by our method are more\neffective in mitigating hallucinations compared to previous datasets.\n","authors":["Rui Hu","Yahan Tu","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2404.10332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10322v1","updated":"2024-04-16T07:07:40Z","published":"2024-04-16T07:07:40Z","title":"Domain-Rectifying Adapter for Cross-Domain Few-Shot Segmentation","summary":" Few-shot semantic segmentation (FSS) has achieved great success on segmenting\nobjects of novel classes, supported by only a few annotated samples. However,\nexisting FSS methods often underperform in the presence of domain shifts,\nespecially when encountering new domain styles that are unseen during training.\nIt is suboptimal to directly adapt or generalize the entire model to new\ndomains in the few-shot scenario. Instead, our key idea is to adapt a small\nadapter for rectifying diverse target domain styles to the source domain.\nConsequently, the rectified target domain features can fittingly benefit from\nthe well-optimized source domain segmentation model, which is intently trained\non sufficient source domain data. 
Training domain-rectifying adapter requires\nsufficiently diverse target domains. We thus propose a novel local-global style\nperturbation method to simulate diverse potential target domains by\nperturbating the feature channel statistics of the individual images and\ncollective statistics of the entire source domain, respectively. Additionally,\nwe propose a cyclic domain alignment module to facilitate the adapter\neffectively rectifying domains using a reverse domain rectification\nsupervision. The adapter is trained to rectify the image features from diverse\nsynthesized target domains to align with the source domain. During testing on\ntarget domains, we start by rectifying the image features and then conduct\nfew-shot segmentation on the domain-rectified features. Extensive experiments\ndemonstrate the effectiveness of our method, achieving promising results on\ncross-domain few-shot semantic segmentation tasks. Our code is available at\nhttps://github.com/Matt-Su/DR-Adapter.\n","authors":["Jiapeng Su","Qi Fan","Guangming Lu","Fanglin Chen","Wenjie Pei"],"pdf_url":"https://arxiv.org/pdf/2404.10322v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.10319v1","updated":"2024-04-16T06:59:26Z","published":"2024-04-16T06:59:26Z","title":"Application of Deep Learning Methods to Processing of Noisy Medical\n Video Data","summary":" Cells count become a challenging problem when the cells move in a continuous\nstream, and their boundaries are difficult for visual detection. To resolve\nthis problem we modified the training and decision making processes using\ncurriculum learning and multi-view predictions techniques, respectively.\n","authors":["Danil Afonchikov","Elena Kornaeva","Irina Makovik","Alexey Kornaev"],"pdf_url":"https://arxiv.org/pdf/2404.10319v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10318v1","updated":"2024-04-16T06:58:30Z","published":"2024-04-16T06:58:30Z","title":"SRGS: Super-Resolution 3D Gaussian Splatting","summary":" Recently, 3D Gaussian Splatting (3DGS) has gained popularity as a novel\nexplicit 3D representation. This approach relies on the representation power of\nGaussian primitives to provide a high-quality rendering. However, primitives\noptimized at low resolution inevitably exhibit sparsity and texture deficiency,\nposing a challenge for achieving high-resolution novel view synthesis (HRNVS).\nTo address this problem, we propose Super-Resolution 3D Gaussian Splatting\n(SRGS) to perform the optimization in a high-resolution (HR) space. The\nsub-pixel constraint is introduced for the increased viewpoints in HR space,\nexploiting the sub-pixel cross-view information of the multiple low-resolution\n(LR) views. The gradient accumulated from more viewpoints will facilitate the\ndensification of primitives. Furthermore, a pre-trained 2D super-resolution\nmodel is integrated with the sub-pixel constraint, enabling these dense\nprimitives to learn faithful texture features. In general, our method focuses\non densification and texture learning to effectively enhance the representation\nability of primitives. Experimentally, our method achieves high rendering\nquality on HRNVS only with LR inputs, outperforming state-of-the-art methods on\nchallenging datasets such as Mip-NeRF 360 and Tanks & Temples. 
Related codes\nwill be released upon acceptance.\n","authors":["Xiang Feng","Yongbo He","Yubo Wang","Yan Yang","Zhenzhong Kuang","Yu Jun","Jianping Fan","Jiajun ding"],"pdf_url":"https://arxiv.org/pdf/2404.10318v1.pdf","comment":"submit ACM MM 2024"},{"id":"http://arxiv.org/abs/2404.10314v1","updated":"2024-04-16T06:40:51Z","published":"2024-04-16T06:40:51Z","title":"Awareness of uncertainty in classification using a multivariate model\n and multi-views","summary":" One of the ways to make artificial intelligence more natural is to give it\nsome room for doubt. Two main questions should be resolved in that way. First,\nhow to train a model to estimate uncertainties of its own predictions? And\nthen, what to do with the uncertain predictions if they appear? First, we\nproposed an uncertainty-aware negative log-likelihood loss for the case of\nN-dimensional multivariate normal distribution with spherical variance matrix\nto the solution of N-classes classification tasks. The loss is similar to the\nheteroscedastic regression loss. The proposed model regularizes uncertain\npredictions, and trains to calculate both the predictions and their uncertainty\nestimations. The model fits well with the label smoothing technique. Second, we\nexpanded the limits of data augmentation at the training and test stages, and\nmade the trained model to give multiple predictions for a given number of\naugmented versions of each test sample. Given the multi-view predictions\ntogether with their uncertainties and confidences, we proposed several methods\nto calculate final predictions, including mode values and bin counts with soft\nand hard weights. For the latter method, we formalized the model tuning task in\nthe form of multimodal optimization with non-differentiable criteria of maximum\naccuracy, and applied particle swarm optimization to solve the tuning task. The\nproposed methodology was tested using CIFAR-10 dataset with clean and noisy\nlabels and demonstrated good results in comparison with other uncertainty\nestimation methods related to sample selection, co-teaching, and label\nsmoothing.\n","authors":["Alexey Kornaev","Elena Kornaeva","Oleg Ivanov","Ilya Pershin","Danis Alukaev"],"pdf_url":"https://arxiv.org/pdf/2404.10314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10312v1","updated":"2024-04-16T06:39:37Z","published":"2024-04-16T06:39:37Z","title":"OmniSSR: Zero-shot Omnidirectional Image Super-Resolution using Stable\n Diffusion Model","summary":" Omnidirectional images (ODIs) are commonly used in real-world visual tasks,\nand high-resolution ODIs help improve the performance of related visual tasks.\nMost existing super-resolution methods for ODIs use end-to-end learning\nstrategies, resulting in inferior realness of generated images and a lack of\neffective out-of-domain generalization capabilities in training methods. Image\ngeneration methods represented by diffusion model provide strong priors for\nvisual tasks and have been proven to be effectively applied to image\nrestoration tasks. Leveraging the image priors of the Stable Diffusion (SD)\nmodel, we achieve omnidirectional image super-resolution with both fidelity and\nrealness, dubbed as OmniSSR. Firstly, we transform the equirectangular\nprojection (ERP) images into tangent projection (TP) images, whose distribution\napproximates the planar image domain. Then, we use SD to iteratively sample\ninitial high-resolution results. 
At each denoising iteration, we further\ncorrect and update the initial results using the proposed Octadecaplex Tangent\nInformation Interaction (OTII) and Gradient Decomposition (GD) technique to\nensure better consistency. Finally, the TP images are transformed back to\nobtain the final high-resolution results. Our method is zero-shot, requiring no\ntraining or fine-tuning. Experiments of our method on two benchmark datasets\ndemonstrate the effectiveness of our proposed method.\n","authors":["Runyi Li","Xuhan Sheng","Weiqi Li","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.10312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15406v2","updated":"2024-04-16T06:33:09Z","published":"2023-12-24T04:49:06Z","title":"Objects as volumes: A stochastic geometry view of opaque solids","summary":" We develop a theory for the representation of opaque solids as volumes.\nStarting from a stochastic representation of opaque solids as random indicator\nfunctions, we prove the conditions under which such solids can be modeled using\nexponential volumetric transport. We also derive expressions for the volumetric\nattenuation coefficient as a functional of the probability distributions of the\nunderlying indicator functions. We generalize our theory to account for\nisotropic and anisotropic scattering at different parts of the solid, and for\nrepresentations of opaque solids as stochastic implicit surfaces. We derive our\nvolumetric representation from first principles, which ensures that it\nsatisfies physical constraints such as reciprocity and reversibility. We use\nour theory to explain, compare, and correct previous volumetric\nrepresentations, as well as propose meaningful extensions that lead to improved\nperformance in 3D reconstruction tasks.\n","authors":["Bailey Miller","Hanyu Chen","Alice Lai","Ioannis Gkioulekas"],"pdf_url":"https://arxiv.org/pdf/2312.15406v2.pdf","comment":"project page: https://imaging.cs.cmu.edu/volumetric_opaque_solids"},{"id":"http://arxiv.org/abs/2404.10307v1","updated":"2024-04-16T06:33:08Z","published":"2024-04-16T06:33:08Z","title":"Learnable Prompt for Few-Shot Semantic Segmentation in Remote Sensing\n Domain","summary":" Few-shot segmentation is a task to segment objects or regions of novel\nclasses within an image given only a few annotated examples. In the generalized\nsetting, the task extends to segment both the base and the novel classes. The\nmain challenge is how to train the model such that the addition of novel\nclasses does not hurt the base classes performance, also known as catastrophic\nforgetting. To mitigate this issue, we use SegGPT as our base model and train\nit on the base classes. Then, we use separate learnable prompts to handle\npredictions for each novel class. To handle various object sizes which\ntypically present in remote sensing domain, we perform patch-based prediction.\nTo address the discontinuities along patch boundaries, we propose a\npatch-and-stitch technique by re-framing the problem as an image inpainting\ntask. During inference, we also utilize image similarity search over image\nembeddings for prompt selection and novel class filtering to reduce false\npositive predictions. 
Based on our experiments, our proposed method boosts the\nweighted mIoU of a simple fine-tuned SegGPT from 15.96 to 35.08 on the\nvalidation set of the few-shot OpenEarthMap dataset given in the challenge.\n","authors":["Steve Andreas Immanuel","Hagai Raja Sinulingga"],"pdf_url":"https://arxiv.org/pdf/2404.10307v1.pdf","comment":"Accepted to CVPRW 2024"},{"id":"http://arxiv.org/abs/2303.16242v4","updated":"2024-04-16T06:26:46Z","published":"2023-03-28T18:36:19Z","title":"CuNeRF: Cube-Based Neural Radiance Field for Zero-Shot Medical Image\n Arbitrary-Scale Super Resolution","summary":" Medical image arbitrary-scale super-resolution (MIASSR) has recently gained\nwidespread attention, aiming to super-sample medical volumes at arbitrary\nscales via a single model. However, existing MIASSR methods face two major\nlimitations: (i) reliance on high-resolution (HR) volumes and (ii) limited\ngeneralization ability, which restricts their application in various scenarios.\nTo overcome these limitations, we propose Cube-based Neural Radiance Field\n(CuNeRF), a zero-shot MIASSR framework that can yield medical images at\narbitrary scales and viewpoints in a continuous domain. Unlike existing MIASSR\nmethods that fit the mapping between low-resolution (LR) and HR volumes, CuNeRF\nfocuses on building a coordinate-intensity continuous representation from LR\nvolumes without the need for HR references. This is achieved by the proposed\ndifferentiable modules, including cube-based sampling, isotropic volume\nrendering, and cube-based hierarchical rendering. Through extensive experiments\non magnetic resonance imaging (MRI) and computed tomography (CT) modalities, we\ndemonstrate that CuNeRF outperforms state-of-the-art MIASSR methods. CuNeRF\nyields better visual verisimilitude and reduces aliasing artifacts at various\nupsampling factors. Moreover, our CuNeRF does not need any LR-HR training\npairs, which makes it more flexible and easier to use than others. Our code is\nreleased at https://github.com/NarcissusEx/CuNeRF.\n","authors":["Zixuan Chen","Jian-Huang Lai","Lingxiao Yang","Xiaohua Xie"],"pdf_url":"https://arxiv.org/pdf/2303.16242v4.pdf","comment":"This paper is accepted by the International Conference on Computer\n Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2404.10305v1","updated":"2024-04-16T06:24:53Z","published":"2024-04-16T06:24:53Z","title":"TC-OCR: TableCraft OCR for Efficient Detection & Recognition of Table\n Structure & Content","summary":" The automatic recognition of tabular data in document images presents a\nsignificant challenge due to the diverse range of table styles and complex\nstructures. Tables offer valuable content representation, enhancing the\npredictive capabilities of various systems such as search engines and Knowledge\nGraphs. Addressing the two main problems, namely table detection (TD) and table\nstructure recognition (TSR), has traditionally been approached independently.\nIn this research, we propose an end-to-end pipeline that integrates deep\nlearning models, including DETR, CascadeTabNet, and PP OCR v2, to achieve\ncomprehensive image-based table recognition. This integrated approach\neffectively handles diverse table styles, complex structures, and image\ndistortions, resulting in improved accuracy and efficiency compared to existing\nmethods like Table Transformers. 
Our system achieves simultaneous table\ndetection (TD), table structure recognition (TSR), and table content\nrecognition (TCR), preserving table structures and accurately extracting\ntabular data from document images. The integration of multiple models addresses\nthe intricacies of table recognition, making our approach a promising solution\nfor image-based table understanding, data extraction, and information retrieval\napplications. Our proposed approach achieves an IOU of 0.96 and an OCR Accuracy\nof 78%, showcasing a remarkable improvement of approximately 25% in the OCR\nAccuracy compared to the previous Table Transformer approach.\n","authors":["Avinash Anand","Raj Jaiswal","Pijush Bhuyan","Mohit Gupta","Siddhesh Bangar","Md. Modassir Imam","Rajiv Ratn Shah","Shin'ichi Satoh"],"pdf_url":"https://arxiv.org/pdf/2404.10305v1.pdf","comment":"8 pages, 2 figures, Workshop of 1st MMIR Deep Multimodal Learning for\n Information Retrieval"},{"id":"http://arxiv.org/abs/2404.09406v2","updated":"2024-04-16T05:58:39Z","published":"2024-04-15T01:47:44Z","title":"Human-in-the-Loop Segmentation of Multi-species Coral Imagery","summary":" Broad-scale marine surveys performed by underwater vehicles significantly\nincrease the availability of coral reef imagery, however it is costly and\ntime-consuming for domain experts to label images. Point label propagation is\nan approach used to leverage existing image data labeled with sparse point\nlabels. The resulting augmented ground truth generated is then used to train a\nsemantic segmentation model. Here, we first demonstrate that recent advances in\nfoundation models enable generation of multi-species coral augmented ground\ntruth masks using denoised DINOv2 features and K-Nearest Neighbors (KNN),\nwithout the need for any pre-training or custom-designed algorithms. For\nextremely sparsely labeled images, we propose a labeling regime based on\nhuman-in-the-loop principles, resulting in significant improvement in\nannotation efficiency: If only 5 point labels per image are available, our\nproposed human-in-the-loop approach improves on the state-of-the-art by 17.3%\nfor pixel accuracy and 22.6% for mIoU; and by 10.6% and 19.1% when 10 point\nlabels per image are available. Even if the human-in-the-loop labeling regime\nis not used, the denoised DINOv2 features with a KNN outperforms the prior\nstate-of-the-art by 3.5% for pixel accuracy and 5.7% for mIoU (5 grid points).\nWe also provide a detailed analysis of how point labeling style and the\nquantity of points per image affects the point label propagation quality and\nprovide general recommendations on maximizing point label efficiency.\n","authors":["Scarlett Raine","Ross Marchant","Brano Kusy","Frederic Maire","Niko Suenderhauf","Tobias Fischer"],"pdf_url":"https://arxiv.org/pdf/2404.09406v2.pdf","comment":"Accepted at the CVPR2024 3rd Workshop on Learning with Limited\n Labelled Data for Image and Video Understanding (L3D-IVU), 10 pages, 6\n figures, an additional 4 pages of supplementary material"},{"id":"http://arxiv.org/abs/2404.10292v1","updated":"2024-04-16T05:29:14Z","published":"2024-04-16T05:29:14Z","title":"From Data Deluge to Data Curation: A Filtering-WoRA Paradigm for\n Efficient Text-based Person Search","summary":" In text-based person search endeavors, data generation has emerged as a\nprevailing practice, addressing concerns over privacy preservation and the\narduous task of manual annotation. 
Although the amount of synthesized data can\nbe infinite in theory, the scientific conundrum persists of how much\ngenerated data optimally fuels subsequent model training. We observe that only\na subset of the data in these constructed datasets plays a decisive role.\nTherefore, we introduce a new Filtering-WoRA paradigm, which contains a\nfiltering algorithm to identify this crucial data subset and a WoRA (Weighted\nLow-Rank Adaptation) learning strategy for light fine-tuning. The filtering\nalgorithm is based on cross-modality relevance to remove the many coarsely\nmatched synthesis pairs. As the amount of data decreases, we do not need to\nfine-tune the entire model. Therefore, we propose a WoRA learning strategy to\nefficiently update a minimal portion of model parameters. WoRA streamlines the\nlearning process, enabling heightened efficiency in extracting knowledge from\nfewer, yet potent, data instances. Extensive experimentation validates the\nefficacy of pretraining, where our model achieves advanced and efficient\nretrieval performance on challenging real-world benchmarks. Notably, on the\nCUHK-PEDES dataset, we have achieved a competitive mAP of 67.02% while reducing\nmodel training time by 19.82%.\n","authors":["Jintao Sun","Zhedong Zheng","Gangyi Ding"],"pdf_url":"https://arxiv.org/pdf/2404.10292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10290v1","updated":"2024-04-16T05:28:07Z","published":"2024-04-16T05:28:07Z","title":"NeuroMorphix: A Novel Brain MRI Asymmetry-specific Feature Construction\n Approach For Seizure Recurrence Prediction","summary":" Seizure recurrence is an important concern after an initial unprovoked\nseizure; without drug treatment, it occurs within 2 years in 40-50% of cases.\nThe decision to treat currently relies on predictors of seizure recurrence risk\nthat are inaccurate, resulting in unnecessary, possibly harmful, treatment in\nsome patients and potentially preventable seizures in others. Because of the\nlink between brain lesions and seizure recurrence, we developed a recurrence\nprediction tool using machine learning and clinical 3T brain MRI. We developed\nNeuroMorphix, a feature construction approach based on MRI brain anatomy. Each\nof seven NeuroMorphix features measures the absolute or relative difference\nbetween corresponding regions in each cerebral hemisphere. FreeSurfer was used\nto segment brain regions and to generate values for morphometric parameters (8\nfor each cortical region and 5 for each subcortical region). The parameters\nwere then mapped to whole brain NeuroMorphix features, yielding a total of 91\nfeatures per subject. Features were generated for a first seizure patient\ncohort (n = 169) categorised into seizure recurrence and non-recurrence\nsubgroups. State-of-the-art classification algorithms were trained and tested\nusing NeuroMorphix features to predict seizure recurrence. Classification\nmodels using the top 5 features, ranked by sequential forward selection,\ndemonstrated excellent performance in predicting seizure recurrence, with area\nunder the ROC curve of 88-93%, accuracy of 83-89%, and F1 score of 83-90%.\nHighly ranked features aligned with structural alterations known to be\nassociated with epilepsy. 
This study highlights the potential for targeted,\ndata-driven approaches to aid clinical decision-making in brain disorders.\n","authors":["Soumen Ghosh","Viktor Vegh","Shahrzad Moinian","Hamed Moradi","Alice-Ann Sullivan","John Phamnguyen","David Reutens"],"pdf_url":"https://arxiv.org/pdf/2404.10290v1.pdf","comment":"This work has been submitted to the IEEE TMI for possible publication"},{"id":"http://arxiv.org/abs/2404.10282v1","updated":"2024-04-16T04:52:41Z","published":"2024-04-16T04:52:41Z","title":"Tripod: Three Complementary Inductive Biases for Disentangled\n Representation Learning","summary":" Inductive biases are crucial in disentangled representation learning for\nnarrowing down an underspecified solution set. In this work, we consider\nendowing a neural network autoencoder with three select inductive biases from\nthe literature: data compression into a grid-like latent space via\nquantization, collective independence amongst latents, and minimal functional\ninfluence of any latent on how other latents determine data generation. In\nprinciple, these inductive biases are deeply complementary: they most directly\nspecify properties of the latent space, encoder, and decoder, respectively. In\npractice, however, naively combining existing techniques instantiating these\ninductive biases fails to yield significant benefits. To address this, we\npropose adaptations to the three techniques that simplify the learning problem,\nequip key regularization terms with stabilizing invariances, and quash\ndegenerate incentives. The resulting model, Tripod, achieves state-of-the-art\nresults on a suite of four image disentanglement benchmarks. We also verify\nthat Tripod significantly improves upon its naive incarnation and that all\nthree of its \"legs\" are necessary for best performance.\n","authors":["Kyle Hsu","Jubayer Ibn Hamid","Kaylee Burns","Chelsea Finn","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2404.10282v1.pdf","comment":"22 pages, 10 figures, code available at\n https://github.com/kylehkhsu/tripod"},{"id":"http://arxiv.org/abs/2404.10279v1","updated":"2024-04-16T04:44:16Z","published":"2024-04-16T04:44:16Z","title":"EucliDreamer: Fast and High-Quality Texturing for 3D Models with\n Depth-Conditioned Stable Diffusion","summary":" We present EucliDreamer, a simple and effective method to generate textures\nfor 3D models given text prompts and meshes. The texture is parametrized as an\nimplicit function on the 3D surface, which is optimized with the Score\nDistillation Sampling (SDS) process and differentiable rendering. To generate\nhigh-quality textures, we leverage a depth-conditioned Stable Diffusion model\nguided by the depth image rendered from the mesh. We test our approach on 3D\nmodels in Objaverse and conducted a user study, which shows its superior\nquality compared to existing texturing methods like Text2Tex. In addition, our\nmethod converges 2 times faster than DreamFusion. Through text prompting,\ntextures of diverse art styles can be produced. 
We hope EucliDreamer provides a\nviable solution to automate a labor-intensive stage in 3D content creation.\n","authors":["Cindy Le","Congrui Hetang","Chendi Lin","Ang Cao","Yihui He"],"pdf_url":"https://arxiv.org/pdf/2404.10279v1.pdf","comment":"Short version of arXiv:2311.15573"},{"id":"http://arxiv.org/abs/2403.14987v2","updated":"2024-04-16T04:15:32Z","published":"2024-03-22T06:45:45Z","title":"Generative Active Learning for Image Synthesis Personalization","summary":" This paper presents a pilot study that explores the application of active\nlearning, traditionally studied in the context of discriminative models, to\ngenerative models. We specifically focus on image synthesis personalization\ntasks. The primary challenge in conducting active learning on generative models\nlies in the open-ended nature of querying, which differs from the closed form\nof querying in discriminative models that typically target a single concept. We\nintroduce the concept of anchor directions to transform the querying process\ninto a semi-open problem. We propose a direction-based uncertainty sampling\nstrategy to enable generative active learning and tackle the\nexploitation-exploration dilemma. Extensive experiments are conducted to\nvalidate the effectiveness of our approach, demonstrating that an open-source\nmodel can achieve superior performance compared to closed-source models\ndeveloped by large companies, such as Google's StyleDrop. The source code is\navailable at https://github.com/zhangxulu1996/GAL4Personalization.\n","authors":["Xulu Zhang","Wengyu Zhang","Xiao-Yong Wei","Jinlin Wu","Zhaoxiang Zhang","Zhen Lei","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2403.14987v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10272v1","updated":"2024-04-16T04:05:33Z","published":"2024-04-16T04:05:33Z","title":"Plug-and-Play Acceleration of Occupancy Grid-based NeRF Rendering using\n VDB Grid and Hierarchical Ray Traversal","summary":" Transmittance estimators such as Occupancy Grid (OG) can accelerate the\ntraining and rendering of Neural Radiance Field (NeRF) by predicting important\nsamples that contribute much to the generated image. However, OG manages\noccupied regions in the form of a dense binary grid, in which there are many\nblocks with the same values that cause redundant examination of voxels'\nemptiness in ray-tracing. In our work, we introduce two techniques to improve\nthe efficiency of ray-tracing in trained OG without fine-tuning. First, we\nreplace the dense grids with VDB grids to reduce the spatial redundancy.\nSecond, we use a hierarchical digital differential analyzer (HDDA) to efficiently\ntrace voxels in the VDB grids. 
Our experiments on NeRF-Synthetic and Mip-NeRF\n360 datasets show that our proposed method successfully accelerates rendering of the\nNeRF-Synthetic dataset by 12% on average and the Mip-NeRF 360 dataset by 4% on\naverage, compared to a fast implementation of OG, NerfAcc, without losing the\nquality of rendered images.\n","authors":["Yoshio Kato","Shuhei Tarashima"],"pdf_url":"https://arxiv.org/pdf/2404.10272v1.pdf","comment":"Short paper for CVPR Neural Rendering Intelligence Workshop 2024.\n Code: https://github.com/Yosshi999/faster-occgrid"},{"id":"http://arxiv.org/abs/2305.00635v2","updated":"2024-04-16T03:46:03Z","published":"2023-05-01T02:51:38Z","title":"Learning Self-Prior for Mesh Inpainting Using Self-Supervised Graph\n Convolutional Networks","summary":" In this paper, we present a self-prior-based mesh inpainting framework that\nrequires only an incomplete mesh as input, without the need for any training\ndatasets. Additionally, our method maintains the polygonal mesh format\nthroughout the inpainting process without converting the shape format to an\nintermediate one, such as a voxel grid, a point cloud, or an implicit function,\nwhich are typically considered easier for deep neural networks to process. To\nachieve this goal, we introduce two graph convolutional networks (GCNs):\nsingle-resolution GCN (SGCN) and multi-resolution GCN (MGCN), both trained in a\nself-supervised manner. Our approach refines a watertight mesh obtained from\nthe initial hole filling to generate a complete output mesh. Specifically, we\ntrain the GCNs to deform an oversmoothed version of the input mesh into the\nexpected complete shape. The deformation is described by vertex displacements,\nand the GCNs are supervised to obtain accurate displacements at vertices in\nreal holes. To this end, we specify several connected regions of the mesh as\nfake holes, thereby generating meshes with various sets of fake holes. The\ncorrect displacements of vertices are known in these fake holes, thus enabling\ntraining GCNs with loss functions that assess the accuracy of vertex\ndisplacements. We demonstrate that our method outperforms traditional\ndataset-independent approaches and exhibits greater robustness compared with\nother deep-learning-based methods for shapes that infrequently appear in shape\ndatasets. Our code and test data are available at\nhttps://github.com/astaka-pe/SeMIGCN.\n","authors":["Shota Hattori","Tatsuya Yatagawa","Yutaka Ohtake","Hiromasa Suzuki"],"pdf_url":"https://arxiv.org/pdf/2305.00635v2.pdf","comment":"18 pages, 18 figures, 8 tables"},{"id":"http://arxiv.org/abs/2404.10267v1","updated":"2024-04-16T03:45:45Z","published":"2024-04-16T03:45:45Z","title":"OneActor: Consistent Character Generation via Cluster-Conditioned\n Guidance","summary":" Text-to-image diffusion models benefit artists with high-quality image\ngeneration. Yet their stochastic nature prevents artists from creating consistent\nimages of the same character. Existing methods try to tackle this challenge and\ngenerate consistent content in various ways. However, they either depend on\nexternal data or require expensive tuning of the diffusion model. To address this\nissue, we argue that a lightweight but intricate guidance is enough to\nfunction. Aiming at this, we lead the way to formalize the objective of\nconsistent generation, derive a clustering-based score function and propose a\nnovel paradigm, OneActor. 
We design a cluster-conditioned model which\nincorporates posterior samples to guide the denoising trajectories towards the\ntarget cluster. To overcome the overfitting challenge shared by one-shot tuning\npipelines, we devise auxiliary components to simultaneously augment the tuning\nand regulate the inference. This technique is later verified to significantly\nenhance the content diversity of generated images. Comprehensive experiments\nshow that our method outperforms a variety of baselines with satisfactory\ncharacter consistency, superior prompt conformity as well as high image\nquality. Moreover, our method is at least 4 times faster than tuning-based baselines.\nFurthermore, to the best of our knowledge, we are the first to prove that the semantic space has\nthe same interpolation property as the latent space does. This property can\nserve as another promising tool for fine generation control.\n","authors":["Jiahao Wang","Caixia Yan","Haonan Lin","Weizhan Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.10267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13127v4","updated":"2024-04-16T03:43:43Z","published":"2023-11-22T03:31:31Z","title":"MetaCloak: Preventing Unauthorized Subject-driven Text-to-image\n Diffusion-based Synthesis via Meta-learning","summary":" Text-to-image diffusion models allow seamless generation of personalized\nimages from scant reference photos. Yet, these tools, in the wrong hands, can\nfabricate misleading or harmful content, endangering individuals. To address\nthis problem, existing poisoning-based approaches perturb user images in an\nimperceptible way to render them \"unlearnable\" from malicious uses. We identify\ntwo limitations of these defending approaches: i) sub-optimal due to the\nhand-crafted heuristics for solving the intractable bilevel optimization and\nii) lack of robustness against simple data transformations like Gaussian\nfiltering. To solve these challenges, we propose MetaCloak, which solves the\nbi-level poisoning problem with a meta-learning framework with an additional\ntransformation sampling process to craft transferable and robust perturbation.\nSpecifically, we employ a pool of surrogate diffusion models to craft\ntransferable and model-agnostic perturbation. Furthermore, by incorporating an\nadditional transformation process, we design a simple denoising-error\nmaximization loss that is sufficient for causing transformation-robust semantic\ndistortion and degradation in a personalized generation. Extensive experiments\non the VGGFace2 and CelebA-HQ datasets show that MetaCloak outperforms existing\napproaches. Notably, MetaCloak can successfully fool online training services\nlike Replicate, in a black-box manner, demonstrating the effectiveness of\nMetaCloak in real-world scenarios. Our code is available at\nhttps://github.com/liuyixin-louis/MetaCloak.\n","authors":["Yixin Liu","Chenrui Fan","Yutong Dai","Xun Chen","Pan Zhou","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2311.13127v4.pdf","comment":"Accepted to CVPR 2024 (Oral)"},{"id":"http://arxiv.org/abs/2404.09640v2","updated":"2024-04-16T03:43:11Z","published":"2024-04-15T10:19:39Z","title":"CREST: Cross-modal Resonance through Evidential Deep Learning for\n Enhanced Zero-Shot Learning","summary":" Zero-shot learning (ZSL) enables the recognition of novel classes by\nleveraging semantic knowledge transfer from known to unknown categories. 
This\nknowledge, typically encapsulated in attribute descriptions, aids in\nidentifying class-specific visual features, thus facilitating visual-semantic\nalignment and improving ZSL performance. However, real-world challenges such as\ndistribution imbalances and attribute co-occurrence among instances often\nhinder the discernment of local variances in images, a problem exacerbated by\nthe scarcity of fine-grained, region-specific attribute annotations. Moreover,\nthe variability in visual presentation within categories can also skew\nattribute-category associations. In response, we propose a bidirectional\ncross-modal ZSL approach CREST. It begins by extracting representations for\nattribute and visual localization and employs Evidential Deep Learning (EDL) to\nmeasure underlying epistemic uncertainty, thereby enhancing the model's\nresilience against hard negatives. CREST incorporates dual learning pathways,\nfocusing on both visual-category and attribute-category alignments, to ensure\nrobust correlation between latent and observable spaces. Moreover, we introduce\nan uncertainty-informed cross-modal fusion technique to refine visual-attribute\ninference. Extensive experiments demonstrate our model's effectiveness and\nunique explainability across multiple datasets. Our code and data are available\nat: https://github.com/JethroJames/CREST.\n","authors":["Haojian Huang","Xiaozhen Qiao","Zhuo Chen","Haodong Chen","Bingyu Li","Zhe Sun","Mulin Chen","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.09640v2.pdf","comment":"Ongoing work; 10 pages, 2 Tables, 9 Figures; Repo is available at:\n https://github.com/JethroJames/CREST"},{"id":"http://arxiv.org/abs/2404.09378v2","updated":"2024-04-16T03:39:27Z","published":"2024-04-14T23:30:35Z","title":"Orientation-conditioned Facial Texture Mapping for Video-based Facial\n Remote Photoplethysmography Estimation","summary":" Camera-based remote photoplethysmography (rPPG) enables contactless\nmeasurement of important physiological signals such as pulse rate (PR).\nHowever, dynamic and unconstrained subject motion introduces significant\nvariability into the facial appearance in video, confounding the ability of\nvideo-based methods to accurately extract the rPPG signal. In this study, we\nleverage the 3D facial surface to construct a novel orientation-conditioned\nfacial texture video representation which improves the motion robustness of\nexisting video-based facial rPPG estimation methods. Our proposed method\nachieves a significant 18.2% performance improvement in cross-dataset testing\non MMPD over our baseline using the PhysNet model trained on PURE, highlighting\nthe efficacy and generalization benefits of our designed video representation.\nWe demonstrate significant performance improvements of up to 29.6% in all\ntested motion scenarios in cross-dataset testing on MMPD, even in the presence\nof dynamic and unconstrained subject motion, emphasizing the benefits of\ndisentangling motion through modeling the 3D facial surface for motion robust\nfacial rPPG estimation. We validate the efficacy of our design decisions and\nthe impact of different video processing steps through an ablation study. Our\nfindings illustrate the potential strengths of exploiting the 3D facial surface\nas a general strategy for addressing dynamic and unconstrained subject motion\nin videos. 
The code is available at\nhttps://samcantrill.github.io/orientation-uv-rppg/.\n","authors":["Sam Cantrill","David Ahmedt-Aristizabal","Lars Petersson","Hanna Suominen","Mohammad Ali Armin"],"pdf_url":"https://arxiv.org/pdf/2404.09378v2.pdf","comment":"12 pages, 8 figures, 6 tables; corrected abstract typo"},{"id":"http://arxiv.org/abs/2404.00231v2","updated":"2024-04-16T03:38:31Z","published":"2024-03-30T03:23:52Z","title":"Attention-based Shape-Deformation Networks for Artifact-Free Geometry\n Reconstruction of Lumbar Spine from MR Images","summary":" Lumbar disc degeneration, a progressive structural wear and tear of lumbar\nintervertebral disc, is regarded as an essential role on low back pain, a\nsignificant global health concern. Automated lumbar spine geometry\nreconstruction from MR images will enable fast measurement of medical\nparameters to evaluate the lumbar status, in order to determine a suitable\ntreatment. Existing image segmentation-based techniques often generate\nerroneous segments or unstructured point clouds, unsuitable for medical\nparameter measurement. In this work, we present TransDeformer: a novel\nattention-based deep learning approach that reconstructs the geometry of the\nlumbar spine with high spatial accuracy and mesh correspondence across\npatients, and we also present a variant of TransDeformer for error estimation.\nSpecially, we devise new attention modules with a new attention formula, which\nintegrate image features and tokenized contour features to predict the\ndisplacements of the points on a shape template without the need for image\nsegmentation. The deformed template reveals the lumbar spine geometry in an\nimage. Experiment results show that our TransDeformer generates artifact-free\ngeometry outputs, and its variant predicts the error of a reconstructed\ngeometry. Our code is available at\nhttps://github.com/linchenq/TransDeformer-Mesh.\n","authors":["Linchen Qian","Jiasong Chen","Linhai Ma","Timur Urakov","Weiyong Gu","Liang Liang"],"pdf_url":"https://arxiv.org/pdf/2404.00231v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10263v1","updated":"2024-04-16T03:34:35Z","published":"2024-04-16T03:34:35Z","title":"PreGSU-A Generalized Traffic Scene Understanding Model for Autonomous\n Driving based on Pre-trained Graph Attention Network","summary":" Scene understanding, defined as learning, extraction, and representation of\ninteractions among traffic elements, is one of the critical challenges toward\nhigh-level autonomous driving (AD). Current scene understanding methods mainly\nfocus on one concrete single task, such as trajectory prediction and risk level\nevaluation. Although they perform well on specific metrics, the generalization\nability is insufficient to adapt to the real traffic complexity and downstream\ndemand diversity. In this study, we propose PreGSU, a generalized pre-trained\nscene understanding model based on graph attention network to learn the\nuniversal interaction and reasoning of traffic scenes to support various\ndownstream tasks. After the feature engineering and sub-graph module, all\nelements are embedded as nodes to form a dynamic weighted graph. Then, four\ngraph attention layers are applied to learn the relationships among agents and\nlanes. In the pre-train phase, the understanding model is trained on two\nself-supervised tasks: Virtual Interaction Force (VIF) modeling and Masked Road\nModeling (MRM). 
Based on the artificial potential field theory, VIF modeling\nenables PreGSU to capture the agent-to-agent interactions while MRM extracts\nagent-to-road connections. In the fine-tuning process, the pre-trained\nparameters are loaded to derive detailed understanding outputs. We conduct\nvalidation experiments on two downstream tasks, i.e., trajectory prediction in\nurban scenario, and intention recognition in highway scenario, to verify the\ngeneralized ability and understanding ability. Results show that compared with\nthe baselines, PreGSU achieves better accuracy on both tasks, indicating the\npotential to be generalized to various scenes and targets. Ablation study shows\nthe effectiveness of pre-train task design.\n","authors":["Yuning Wang","Zhiyuan Liu","Haotian Lin","Junkai Jiang","Shaobing Xu","Jianqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.10263v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2404.09301v2","updated":"2024-04-16T03:27:00Z","published":"2024-04-14T16:55:23Z","title":"A Simple Strategy for Body Estimation from Partial-View Images","summary":" Virtual try-on and product personalization have become increasingly important\nin modern online shopping, highlighting the need for accurate body measurement\nestimation. Although previous research has advanced in estimating 3D body\nshapes from RGB images, the task is inherently ambiguous as the observed scale\nof human subjects in the images depends on two unknown factors: capture\ndistance and body dimensions. This ambiguity is particularly pronounced in\npartial-view scenarios. To address this challenge, we propose a modular and\nsimple height normalization solution. This solution relocates the subject\nskeleton to the desired position, thereby normalizing the scale and\ndisentangling the relationship between the two variables. Our experimental\nresults demonstrate that integrating this technique into state-of-the-art human\nmesh reconstruction models significantly enhances partial body measurement\nestimation. Additionally, we illustrate the applicability of this approach to\nmulti-view settings, showcasing its versatility.\n","authors":["Yafei Mao","Xuelu Li","Brandon Smith","Jinjin Li","Raja Bala"],"pdf_url":"https://arxiv.org/pdf/2404.09301v2.pdf","comment":"Accepted to CVPRW 2024 Computer Vision for Fashion, Art, and Design"},{"id":"http://arxiv.org/abs/2401.15914v2","updated":"2024-04-16T03:25:25Z","published":"2024-01-29T06:57:48Z","title":"Overcoming the Pitfalls of Vision-Language Model Finetuning for OOD\n Generalization","summary":" Existing vision-language models exhibit strong generalization on a variety of\nvisual domains and tasks. However, such models mainly perform zero-shot\nrecognition in a closed-set manner, and thus struggle to handle open-domain\nvisual concepts by design. There are recent finetuning methods, such as prompt\nlearning, that not only study the discrimination between in-distribution (ID)\nand out-of-distribution (OOD) samples, but also show some improvements in both\nID and OOD accuracies. In this paper, we first demonstrate that vision-language\nmodels, after long enough finetuning but without proper regularization, tend to\noverfit the known classes in the given dataset, with degraded performance on\nunknown classes. 
Then we propose a novel approach OGEN to address this pitfall,\nwith the main focus on improving the OOD GENeralization of finetuned models.\nSpecifically, a class-conditional feature generator is introduced to synthesize\nOOD features using just the class name of any unknown class. Such synthesized\nfeatures will provide useful knowledge about unknowns and help regularize the\ndecision boundary between ID and OOD data when optimized jointly. Equally\nimportant is our adaptive self-distillation mechanism to regularize our feature\ngeneration model during joint optimization, i.e., adaptively transferring\nknowledge between model states to further prevent overfitting. Experiments\nvalidate that our method yields convincing gains in OOD generalization\nperformance in different settings. Code: https://github.com/apple/ml-ogen.\n","authors":["Yuhang Zang","Hanlin Goh","Josh Susskind","Chen Huang"],"pdf_url":"https://arxiv.org/pdf/2401.15914v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2312.06797v2","updated":"2024-04-16T03:10:34Z","published":"2023-12-11T19:13:38Z","title":"Improving the Robustness of 3D Human Pose Estimation: A Benchmark and\n Learning from Noisy Input","summary":" Despite the promising performance of current 3D human pose estimation\ntechniques, understanding and enhancing their generalization on challenging\nin-the-wild videos remain an open problem. In this work, we focus on the\nrobustness of 2D-to-3D pose lifters. To this end, we develop two benchmark\ndatasets, namely Human3.6M-C and HumanEva-I-C, to examine the robustness of\nvideo-based 3D pose lifters to a wide range of common video corruptions\nincluding temporary occlusion, motion blur, and pixel-level noise. We observe\nthe poor generalization of state-of-the-art 3D pose lifters in the presence of\ncorruption and establish two techniques to tackle this issue. First, we\nintroduce Temporal Additive Gaussian Noise (TAGN) as a simple yet effective 2D\ninput pose data augmentation. Additionally, to incorporate the confidence\nscores output by the 2D pose detectors, we design a confidence-aware\nconvolution (CA-Conv) block. Extensively tested on corrupted videos, the\nproposed strategies consistently boost the robustness of 3D pose lifters and\nserve as new baselines for future research.\n","authors":["Trung-Hieu Hoang","Mona Zehni","Huy Phan","Duc Minh Vo","Minh N. Do"],"pdf_url":"https://arxiv.org/pdf/2312.06797v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17521v2","updated":"2024-04-16T03:02:04Z","published":"2024-02-27T14:05:05Z","title":"AVS-Net: Point Sampling with Adaptive Voxel Size for 3D Scene\n Understanding","summary":" The recent advancements in point cloud learning have enabled intelligent\nvehicles and robots to comprehend 3D environments better. However, processing\nlarge-scale 3D scenes remains a challenging problem, such that efficient\ndownsampling methods play a crucial role in point cloud learning. Existing\ndownsampling methods either require a huge computational burden or sacrifice\nfine-grained geometric information. For such purpose, this paper presents an\nadvanced sampler that achieves both high accuracy and efficiency. The proposed\nmethod utilizes voxel centroid sampling as a foundation but effectively\naddresses the challenges regarding voxel size determination and the\npreservation of critical geometric cues. Specifically, we propose a Voxel\nAdaptation Module that adaptively adjusts voxel sizes with the reference of\npoint-based downsampling ratio. 
This ensures that the sampling results exhibit\na favorable distribution for comprehending various 3D objects or scenes.\nMeanwhile, we introduce a network compatible with arbitrary voxel sizes for\nsampling and feature extraction while maintaining high efficiency. The proposed\napproach is demonstrated with 3D object detection and 3D semantic segmentation.\nCompared to existing state-of-the-art methods, our approach achieves better\naccuracy on outdoor and indoor large-scale datasets, e.g. Waymo and ScanNet,\nwith promising efficiency.\n","authors":["Hongcheng Yang","Dingkang Liang","Dingyuan Zhang","Zhe Liu","Zhikang Zou","Xingyu Jiang","Yingying Zhu"],"pdf_url":"https://arxiv.org/pdf/2402.17521v2.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.10242v1","updated":"2024-04-16T02:42:06Z","published":"2024-04-16T02:42:06Z","title":"Masked Autoencoders for Microscopy are Scalable Learners of Cellular\n Biology","summary":" Featurizing microscopy images for use in biological research remains a\nsignificant challenge, especially for large-scale experiments spanning millions\nof images. This work explores the scaling properties of weakly supervised\nclassifiers and self-supervised masked autoencoders (MAEs) when training with\nincreasingly larger model backbones and microscopy datasets. Our results show\nthat ViT-based MAEs outperform weakly supervised classifiers on a variety of\ntasks, achieving as much as a 11.5% relative improvement when recalling known\nbiological relationships curated from public databases. Additionally, we\ndevelop a new channel-agnostic MAE architecture (CA-MAE) that allows for\ninputting images of different numbers and orders of channels at inference time.\nWe demonstrate that CA-MAEs effectively generalize by inferring and evaluating\non a microscopy image dataset (JUMP-CP) generated under different experimental\nconditions with a different channel structure than our pretraining data\n(RPI-93M). Our findings motivate continued research into scaling\nself-supervised learning on microscopy data in order to create powerful\nfoundation models of cellular biology that have the potential to catalyze\nadvancements in drug discovery and beyond.\n","authors":["Oren Kraus","Kian Kenyon-Dean","Saber Saberian","Maryam Fallah","Peter McLean","Jess Leung","Vasudev Sharma","Ayla Khan","Jia Balakrishnan","Safiye Celik","Dominique Beaini","Maciej Sypetkowski","Chi Vicky Cheng","Kristen Morse","Maureen Makes","Ben Mabey","Berton Earnshaw"],"pdf_url":"https://arxiv.org/pdf/2404.10242v1.pdf","comment":"CVPR 2024 Highlight. arXiv admin note: text overlap with\n arXiv:2309.16064"},{"id":"http://arxiv.org/abs/2404.10241v1","updated":"2024-04-16T02:40:35Z","published":"2024-04-16T02:40:35Z","title":"Vision-and-Language Navigation via Causal Learning","summary":" In the pursuit of robust and generalizable environment perception and\nlanguage understanding, the ubiquitous challenge of dataset bias continues to\nplague vision-and-language navigation (VLN) agents, hindering their performance\nin unseen environments. This paper introduces the generalized cross-modal\ncausal transformer (GOAT), a pioneering solution rooted in the paradigm of\ncausal inference. By delving into both observable and unobservable confounders\nwithin vision, language, and history, we propose the back-door and front-door\nadjustment causal learning (BACL and FACL) modules to promote unbiased learning\nby comprehensively mitigating potential spurious correlations. 
Additionally, to\ncapture global confounder features, we propose a cross-modal feature pooling\n(CFP) module supervised by contrastive learning, which is also shown to be\neffective in improving cross-modal representations during pre-training.\nExtensive experiments across multiple VLN datasets (R2R, REVERIE, RxR, and\nSOON) underscore the superiority of our proposed method over previous\nstate-of-the-art approaches. Code is available at\nhttps://github.com/CrystalSixone/VLN-GOAT.\n","authors":["Liuyi Wang","Zongtao He","Ronghao Dang","Mengjiao Shen","Chengju Liu","Qijun Chen"],"pdf_url":"https://arxiv.org/pdf/2404.10241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08123v3","updated":"2024-04-16T02:37:47Z","published":"2023-07-16T18:42:01Z","title":"Solving Inverse Problems with Latent Diffusion Models via Hard Data\n Consistency","summary":" Diffusion models have recently emerged as powerful generative priors for\nsolving inverse problems. However, training diffusion models in the pixel space\nare both data-intensive and computationally demanding, which restricts their\napplicability as priors for high-dimensional real-world data such as medical\nimages. Latent diffusion models, which operate in a much lower-dimensional\nspace, offer a solution to these challenges. However, incorporating latent\ndiffusion models to solve inverse problems remains a challenging problem due to\nthe nonlinearity of the encoder and decoder. To address these issues, we\npropose \\textit{ReSample}, an algorithm that can solve general inverse problems\nwith pre-trained latent diffusion models. Our algorithm incorporates data\nconsistency by solving an optimization problem during the reverse sampling\nprocess, a concept that we term as hard data consistency. Upon solving this\noptimization problem, we propose a novel resampling scheme to map the\nmeasurement-consistent sample back onto the noisy data manifold and\ntheoretically demonstrate its benefits. Lastly, we apply our algorithm to solve\na wide range of linear and nonlinear inverse problems in both natural and\nmedical images, demonstrating that our approach outperforms existing\nstate-of-the-art approaches, including those based on pixel-space diffusion\nmodels.\n","authors":["Bowen Song","Soo Min Kwon","Zecheng Zhang","Xinyu Hu","Qing Qu","Liyue Shen"],"pdf_url":"https://arxiv.org/pdf/2307.08123v3.pdf","comment":"27 pages, 20 figures"},{"id":"http://arxiv.org/abs/2404.10237v1","updated":"2024-04-16T02:35:17Z","published":"2024-04-16T02:35:17Z","title":"MoE-TinyMed: Mixture of Experts for Tiny Medical Large Vision-Language\n Models","summary":" Mixture of Expert Tuning (MoE-Tuning) has effectively enhanced the\nperformance of general MLLMs with fewer parameters, yet its application in\nresource-limited medical settings has not been fully explored. To address this\ngap, we developed MoE-TinyMed, a model tailored for medical applications that\nsignificantly lowers parameter demands. In evaluations on the VQA-RAD, SLAKE,\nand Path-VQA datasets, MoE-TinyMed outperformed LLaVA-Med in all Med-VQA closed\nsettings with just 3.6B parameters. 
Additionally, a streamlined version with 2B\nparameters surpassed LLaVA-Med's performance in PathVQA, showcasing its\neffectiveness in resource-limited healthcare settings.\n","authors":["Songtao Jiang","Tuo Zheng","Yan Zhang","Yeying Jin","Zuozhu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.10237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10234v1","updated":"2024-04-16T02:29:00Z","published":"2024-04-16T02:29:00Z","title":"Compressible and Searchable: AI-native Multi-Modal Retrieval System with\n Learned Image Compression","summary":" The burgeoning volume of digital content across diverse modalities\nnecessitates efficient storage and retrieval methods. Conventional approaches\nstruggle to cope with the escalating complexity and scale of multimedia data.\nIn this paper, we propose a framework that addresses this challenge by fusing\nAI-native multi-modal search capabilities with neural image compression. First,\nwe analyze the intricate relationship between compressibility and\nsearchability, recognizing the pivotal role each plays in the efficiency of\nstorage and retrieval systems. We then use a simple adapter to bridge\nthe features of Learned Image Compression (LIC) and Contrastive Language-Image\nPretraining (CLIP) while retaining semantic fidelity and enabling retrieval of\nmulti-modal data. Experimental evaluations on the Kodak dataset demonstrate the\nefficacy of our approach, showcasing significant enhancements in compression\nefficiency and search accuracy compared to existing methodologies. Our work\nmarks a significant advancement towards scalable and efficient multi-modal\nsearch systems in the era of big data.\n","authors":["Jixiang Luo"],"pdf_url":"https://arxiv.org/pdf/2404.10234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10227v1","updated":"2024-04-16T02:18:18Z","published":"2024-04-16T02:18:18Z","title":"MS-MANO: Enabling Hand Pose Tracking with Biomechanical Constraints","summary":" This work proposes a novel learning framework for visual hand dynamics\nanalysis that takes into account the physiological aspects of hand motion. The\nexisting models, which are simplified joint-actuated systems, often produce\nunnatural motions. To address this, we integrate a musculoskeletal system with\na learnable parametric hand model, MANO, to create a new model, MS-MANO. This\nmodel emulates the dynamics of muscles and tendons to drive the skeletal\nsystem, imposing physiologically realistic constraints on the resulting torque\ntrajectories. We further propose a simulation-in-the-loop pose refinement\nframework, BioPR, that refines the initial estimated pose through a multi-layer\nperceptron (MLP) network. Our evaluation of the accuracy of MS-MANO and the\nefficacy of BioPR is conducted in two separate parts. The accuracy of\nMS-MANO is compared with MyoSuite, while the efficacy of BioPR is benchmarked\nagainst two large-scale public datasets and two recent state-of-the-art\nmethods. 
The results demonstrate that our approach consistently improves the\nbaseline methods both quantitatively and qualitatively.\n","authors":["Pengfei Xie","Wenqiang Xu","Tutian Tang","Zhenjun Yu","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2404.10227v1.pdf","comment":"11 pages, 5 figures; CVPR 2024"},{"id":"http://arxiv.org/abs/2307.03157v2","updated":"2024-04-16T02:12:11Z","published":"2023-07-06T17:32:38Z","title":"Achieving Reliable and Fair Skin Lesion Diagnosis via Unsupervised\n Domain Adaptation","summary":" The development of reliable and fair diagnostic systems is often constrained\nby the scarcity of labeled data. To address this challenge, our work explores\nthe feasibility of unsupervised domain adaptation (UDA) to integrate large\nexternal datasets for developing reliable classifiers. The adoption of UDA with\nmultiple sources can simultaneously enrich the training set and bridge the\ndomain gap between different skin lesion datasets, which vary due to distinct\nacquisition protocols. Particularly, UDA shows practical promise for improving\ndiagnostic reliability when training with a custom skin lesion dataset, where\nonly limited labeled data are available from the target domain. In this study,\nwe investigate three UDA training schemes based on source data utilization:\nsingle-source, combined-source, and multi-source UDA. Our findings demonstrate\nthe effectiveness of applying UDA on multiple sources for binary and\nmulti-class classification. A strong correlation between test error and label\nshift in multi-class tasks has been observed in the experiment. Crucially, our\nstudy shows that UDA can effectively mitigate bias against minority groups and\nenhance fairness in diagnostic systems, while maintaining superior\nclassification performance. This is achieved even without directly implementing\nfairness-focused techniques. This success is potentially attributed to the\nincreased and well-adapted demographic information obtained from multiple\nsources.\n","authors":["Janet Wang","Yunbei Zhang","Zhengming Ding","Jihun Hamm"],"pdf_url":"https://arxiv.org/pdf/2307.03157v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10226v1","updated":"2024-04-16T02:11:46Z","published":"2024-04-16T02:11:46Z","title":"Find The Gap: Knowledge Base Reasoning For Visual Question Answering","summary":" We analyze knowledge-based visual question answering, for which given a\nquestion, the models need to ground it into the visual modality and retrieve\nthe relevant knowledge from a given large knowledge base (KB) to be able to\nanswer. Our analysis has two folds, one based on designing neural architectures\nand training them from scratch, and another based on large pre-trained language\nmodels (LLMs). Our research questions are: 1) Can we effectively augment models\nby explicit supervised retrieval of the relevant KB information to solve the\nKB-VQA problem? 2) How do task-specific and LLM-based models perform in the\nintegration of visual and external knowledge, and multi-hop reasoning over both\nsources of information? 3) Is the implicit knowledge of LLMs sufficient for\nKB-VQA and to what extent it can replace the explicit KB? Our results\ndemonstrate the positive impact of empowering task-specific and LLM models with\nsupervised external and visual knowledge retrieval models. 
Our findings show\nthat though LLMs are stronger in 1-hop reasoning, they suffer in 2-hop\nreasoning in comparison with our fine-tuned NN model even if the relevant\ninformation from both modalities is available to the model. Moreover, we\nobserved that LLMs outperform the NN model for KB-related questions, which\nconfirms the effectiveness of implicit knowledge in LLMs; however, they do not\nalleviate the need for an external KB.\n","authors":["Elham J. Barezi","Parisa Kordjamshidi"],"pdf_url":"https://arxiv.org/pdf/2404.10226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10220v1","updated":"2024-04-16T02:01:56Z","published":"2024-04-16T02:01:56Z","title":"Closed-Loop Open-Vocabulary Mobile Manipulation with GPT-4V","summary":" Autonomous robot navigation and manipulation in open environments require\nreasoning and replanning with closed-loop feedback. We present COME-robot, the\nfirst closed-loop framework utilizing the GPT-4V vision-language foundation\nmodel for open-ended reasoning and adaptive planning in real-world scenarios.\nWe meticulously construct a library of action primitives for robot exploration,\nnavigation, and manipulation, serving as callable execution modules for GPT-4V\nin task planning. On top of these modules, GPT-4V serves as the brain that can\naccomplish multimodal reasoning, generate action policy with code, verify the\ntask progress, and provide feedback for replanning. Such a design enables\nCOME-robot to (i) actively perceive the environments, (ii) perform situated\nreasoning, and (iii) recover from failures. Through comprehensive experiments\ninvolving 8 challenging real-world tabletop and manipulation tasks, COME-robot\ndemonstrates a significant improvement in task success rate (~25%) compared to\nstate-of-the-art baseline methods. We further conduct comprehensive analyses to\nelucidate how COME-robot's design facilitates failure recovery, free-form\ninstruction following, and long-horizon task planning.\n","authors":["Peiyuan Zhi","Zhiyuan Zhang","Muzhi Han","Zeyu Zhang","Zhitian Li","Ziyuan Jiao","Baoxiong Jia","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2404.10220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16654v2","updated":"2024-04-16T01:55:19Z","published":"2023-06-29T03:31:46Z","title":"Self-Supervised MRI Reconstruction with Unrolled Diffusion Models","summary":" Magnetic Resonance Imaging (MRI) produces excellent soft tissue contrast,\nthough it is an inherently slow imaging modality. Promising deep learning\nmethods have recently been proposed to reconstruct accelerated MRI scans.\nHowever, existing methods still suffer from various limitations regarding image\nfidelity, contextual sensitivity, and reliance on fully-sampled acquisitions\nfor model training. To comprehensively address these limitations, we propose a\nnovel self-supervised deep reconstruction model, named Self-Supervised\nDiffusion Reconstruction (SSDiffRecon). SSDiffRecon expresses a conditional\ndiffusion process as an unrolled architecture that interleaves cross-attention\ntransformers for reverse diffusion steps with data-consistency blocks for\nphysics-driven processing. Unlike recent diffusion methods for MRI\nreconstruction, a self-supervision strategy is adopted to train SSDiffRecon\nusing only undersampled k-space data. Comprehensive experiments on public brain\nMR datasets demonstrate the superiority of SSDiffRecon against\nstate-of-the-art supervised and self-supervised baselines in terms of\nreconstruction speed and quality. 
Implementation will be available at\nhttps://github.com/yilmazkorkmaz1/SSDiffRecon.\n","authors":["Yilmaz Korkmaz","Tolga Cukur","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2306.16654v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.12308v5","updated":"2024-04-16T01:52:00Z","published":"2023-04-24T17:57:15Z","title":"Segment Anything in 3D with Radiance Fields","summary":" The Segment Anything Model (SAM) emerges as a powerful vision foundation\nmodel to generate high-quality 2D segmentation results. This paper aims to\ngeneralize SAM to segment 3D objects. Rather than replicating the data\nacquisition and annotation procedure which is costly in 3D, we design an\nefficient solution, leveraging the radiance field as a cheap and off-the-shelf\nprior that connects multi-view 2D images to the 3D space. We refer to the\nproposed solution as SA3D, short for Segment Anything in 3D. With SA3D, the\nuser is only required to provide a 2D segmentation prompt (e.g., rough points)\nfor the target object in a single view, which is used to generate its\ncorresponding 2D mask with SAM. Next, SA3D alternately performs mask inverse\nrendering and cross-view self-prompting across various views to iteratively\nrefine the 3D mask of the target object. For one view, mask inverse rendering\nprojects the 2D mask obtained by SAM into the 3D space with guidance of the\ndensity distribution learned by the radiance field for 3D mask refinement;\nThen, cross-view self-prompting extracts reliable prompts automatically as the\ninput to SAM from the rendered 2D mask of the inaccurate 3D mask for a new\nview. We show in experiments that SA3D adapts to various scenes and achieves 3D\nsegmentation within seconds. Our research reveals a potential methodology to\nlift the ability of a 2D segmentation model to 3D. Our code is available at\nhttps://github.com/Jumpat/SegmentAnythingin3D.\n","authors":["Jiazhong Cen","Jiemin Fang","Zanwei Zhou","Chen Yang","Lingxi Xie","Xiaopeng Zhang","Wei Shen","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2304.12308v5.pdf","comment":"Extension version of SA3D (NeurIPS 2023). Project page:\n https://jumpat.github.io/SA3D/"},{"id":"http://arxiv.org/abs/2404.10213v1","updated":"2024-04-16T01:50:10Z","published":"2024-04-16T01:50:10Z","title":"GaitPoint+: A Gait Recognition Network Incorporating Point Cloud\n Analysis and Recycling","summary":" Gait is a behavioral biometric modality that can be used to recognize\nindividuals by the way they walk from a far distance. Most existing gait\nrecognition approaches rely on either silhouettes or skeletons, while their\njoint use is underexplored. Features from silhouettes and skeletons can provide\ncomplementary information for more robust recognition against appearance\nchanges or pose estimation errors. To exploit the benefits of both silhouette\nand skeleton features, we propose a new gait recognition network, referred to\nas the GaitPoint+. Our approach models skeleton key points as a 3D point cloud,\nand employs a computational complexity-conscious 3D point processing approach\nto extract skeleton features, which are then combined with silhouette features\nfor improved accuracy. Since silhouette- or CNN-based methods already require\nconsiderable amount of computational resources, it is preferable that the key\npoint learning module is faster and more lightweight. 
We present a detailed\nanalysis of the utilization of every human key point after the use of\ntraditional max-pooling, and show that while elbow and ankle points are used\nmost commonly, many useful points are discarded by max-pooling. Thus, we\npresent a method to recycle some of the discarded points by a Recycling\nMax-Pooling module, during processing of skeleton point clouds, and achieve\nfurther performance improvement. We provide a comprehensive set of experimental\nresults showing that (i) incorporating skeleton features obtained by a\npoint-based 3D point cloud processing approach boosts the performance of three\ndifferent state-of-the-art silhouette- and CNN-based baselines; (ii) recycling\nthe discarded points increases the accuracy further. Ablation studies are also\nprovided to show the effectiveness and contribution of different components of\nour approach.\n","authors":["Huantao Ren","Jiajing Chen","Senem Velipasalar"],"pdf_url":"https://arxiv.org/pdf/2404.10213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10212v1","updated":"2024-04-16T01:49:35Z","published":"2024-04-16T01:49:35Z","title":"LWIRPOSE: A novel LWIR Thermal Image Dataset and Benchmark","summary":" Human pose estimation faces hurdles in real-world applications due to factors\nlike lighting changes, occlusions, and cluttered environments. We introduce a\nunique RGB-Thermal Nearly Paired and Annotated 2D Pose Dataset, comprising over\n2,400 high-quality LWIR (thermal) images. Each image is meticulously annotated\nwith 2D human poses, offering a valuable resource for researchers and\npractitioners. This dataset, captured from seven actors performing diverse\neveryday activities like sitting, eating, and walking, facilitates pose\nestimation on occlusion and other challenging scenarios. We benchmark\nstate-of-the-art pose estimation methods on the dataset to showcase its\npotential, establishing a strong baseline for future research. Our results\ndemonstrate the dataset's effectiveness in promoting advancements in pose\nestimation for various applications, including surveillance, healthcare, and\nsports analytics. The dataset and code are available at\nhttps://github.com/avinres/LWIRPOSE\n","authors":["Avinash Upadhyay","Bhipanshu Dhupar","Manoj Sharma","Ankit Shukla","Ajith Abraham"],"pdf_url":"https://arxiv.org/pdf/2404.10212v1.pdf","comment":"Submitted in ICIP2024"},{"id":"http://arxiv.org/abs/2404.10210v1","updated":"2024-04-16T01:41:22Z","published":"2024-04-16T01:41:22Z","title":"MK-SGN: A Spiking Graph Convolutional Network with Multimodal Fusion and\n Knowledge Distillation for Skeleton-based Action Recognition","summary":" In recent years, skeleton-based action recognition, leveraging multimodal\nGraph Convolutional Networks (GCN), has achieved remarkable results. However,\ndue to their deep structure and reliance on continuous floating-point\noperations, GCN-based methods are energy-intensive. To address this issue, we\npropose an innovative Spiking Graph Convolutional Network with Multimodal\nFusion and Knowledge Distillation (MK-SGN). By merging the energy efficiency of\nSpiking Neural Network (SNN) with the graph representation capability of GCN,\nthe proposed MK-SGN reduces energy consumption while maintaining recognition\naccuracy. Firstly, we convert GCN into Spiking Graph Convolutional Network\n(SGN) and construct a foundational Base-SGN for skeleton-based action\nrecognition, establishing a new benchmark and paving the way for future\nresearch exploration. 
Secondly, we further propose a Spiking Multimodal Fusion\nmodule (SMF), leveraging mutual information to process multimodal data more\nefficiently. Additionally, we introduce a spiking attention mechanism and\ndesign a Spatio Graph Convolution module with a Spatial Global Spiking\nAttention mechanism (SA-SGC), enhancing feature learning capability.\nFurthermore, we delve into knowledge distillation methods from multimodal GCN\nto SGN and propose a novel, integrated method that simultaneously focuses on\nboth intermediate layer distillation and soft label distillation to improve the\nperformance of SGN. On two challenging datasets for skeleton-based action\nrecognition, MK-SGN outperforms the state-of-the-art GCN-like frameworks in\nreducing computational load and energy consumption. In contrast, typical GCN\nmethods typically consume more than 35mJ per action sample, while MK-SGN\nreduces energy consumption by more than 98%.\n","authors":["Naichuan Zheng","Hailun Xia","Zeyu Liang"],"pdf_url":"https://arxiv.org/pdf/2404.10210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.10523v3","updated":"2024-04-16T01:34:34Z","published":"2023-04-20T17:52:58Z","title":"GenCorres: Consistent Shape Matching via Coupled Implicit-Explicit Shape\n Generative Models","summary":" This paper introduces GenCorres, a novel unsupervised joint shape matching\n(JSM) approach. Our key idea is to learn a mesh generator to fit an unorganized\ndeformable shape collection while constraining deformations between adjacent\nsynthetic shapes to preserve geometric structures such as local rigidity and\nlocal conformality. GenCorres presents three appealing advantages over existing\nJSM techniques. First, GenCorres performs JSM among a synthetic shape\ncollection whose size is much bigger than the input shapes and fully leverages\nthe datadriven power of JSM. Second, GenCorres unifies consistent shape\nmatching and pairwise matching (i.e., by enforcing deformation priors between\nadjacent synthetic shapes). Third, the generator provides a concise encoding of\nconsistent shape correspondences. However, learning a mesh generator from an\nunorganized shape collection is challenging, requiring a good initialization.\nGenCorres addresses this issue by learning an implicit generator from the input\nshapes, which provides intermediate shapes between two arbitrary shapes. We\nintroduce a novel approach for computing correspondences between adjacent\nimplicit surfaces, which we use to regularize the implicit generator. Synthetic\nshapes of the implicit generator then guide initial fittings (i.e., via\ntemplate-based deformation) for learning the mesh generator. Experimental\nresults show that GenCorres considerably outperforms state-of-the-art JSM\ntechniques. The synthetic shapes of GenCorres also achieve salient performance\ngains against state-of-the-art deformable shape generators.\n","authors":["Haitao Yang","Xiangru Huang","Bo Sun","Chandrajit Bajaj","Qixing Huang"],"pdf_url":"https://arxiv.org/pdf/2304.10523v3.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2404.08197v2","updated":"2024-04-16T01:13:35Z","published":"2024-04-12T02:04:34Z","title":"Scaling (Down) CLIP: A Comprehensive Analysis of Data, Architecture, and\n Training Strategies","summary":" This paper investigates the performance of the Contrastive Language-Image\nPre-training (CLIP) when scaled down to limited computation budgets. We explore\nCLIP along three dimensions: data, architecture, and training strategies. 
With\nregards to data, we demonstrate the significance of high-quality training data\nand show that a smaller dataset of high-quality data can outperform a larger\ndataset with lower quality. We also examine how model performance varies with\ndifferent dataset sizes, suggesting that smaller ViT models are better suited\nfor smaller datasets, while larger models perform better on larger datasets\nwith fixed compute. Additionally, we provide guidance on when to choose a\nCNN-based architecture or a ViT-based architecture for CLIP training. We\ncompare four CLIP training strategies - SLIP, FLIP, CLIP, and CLIP+Data\nAugmentation - and show that the choice of training strategy depends on the\navailable compute resource. Our analysis reveals that CLIP+Data Augmentation\ncan achieve comparable performance to CLIP using only half of the training\ndata. This work provides practical insights into how to effectively train and\ndeploy CLIP models, making them more accessible and affordable for practical\nuse in various applications.\n","authors":["Zichao Li","Cihang Xie","Ekin Dogus Cubuk"],"pdf_url":"https://arxiv.org/pdf/2404.08197v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10193v1","updated":"2024-04-16T00:28:26Z","published":"2024-04-16T00:28:26Z","title":"Consistency and Uncertainty: Identifying Unreliable Responses From\n Black-Box Vision-Language Models for Selective Visual Question Answering","summary":" The goal of selective prediction is to allow an a model to abstain when it\nmay not be able to deliver a reliable prediction, which is important in\nsafety-critical contexts. Existing approaches to selective prediction typically\nrequire access to the internals of a model, require retraining a model or study\nonly unimodal models. However, the most powerful models (e.g. GPT-4) are\ntypically only available as black boxes with inaccessible internals, are not\nretrainable by end-users, and are frequently used for multimodal tasks. We\nstudy the possibility of selective prediction for vision-language models in a\nrealistic, black-box setting. We propose using the principle of\n\\textit{neighborhood consistency} to identify unreliable responses from a\nblack-box vision-language model in question answering tasks. We hypothesize\nthat given only a visual question and model response, the consistency of the\nmodel's responses over the neighborhood of a visual question will indicate\nreliability. It is impossible to directly sample neighbors in feature space in\na black-box setting. Instead, we show that it is possible to use a smaller\nproxy model to approximately sample from the neighborhood. We find that\nneighborhood consistency can be used to identify model responses to visual\nquestions that are likely unreliable, even in adversarial settings or settings\nthat are out-of-distribution to the proxy model.\n","authors":["Zaid Khan","Yun Fu"],"pdf_url":"https://arxiv.org/pdf/2404.10193v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.10947v1","updated":"2024-04-16T23:05:17Z","published":"2024-04-16T23:05:17Z","title":"Residual Connections Harm Self-Supervised Abstract Feature Learning","summary":" We demonstrate that adding a weighting factor to decay the strength of\nidentity shortcuts within residual networks substantially improves semantic\nfeature learning in the state-of-the-art self-supervised masked autoencoding\n(MAE) paradigm. 
Our modification to the identity shortcuts within a VIT-B/16\nbackbone of an MAE boosts linear probing accuracy on ImageNet from 67.3% to\n72.3%. This significant gap suggests that, while residual connection structure\nserves an essential role in facilitating gradient propagation, it may have a\nharmful side effect of reducing capacity for abstract learning by virtue of\ninjecting an echo of shallower representations into deeper layers. We\nameliorate this downside via a fixed formula for monotonically decreasing the\ncontribution of identity connections as layer depth increases. Our design\npromotes the gradual development of feature abstractions, without impacting\nnetwork trainability. Analyzing the representations learned by our modified\nresidual networks, we find correlation between low effective feature rank and\ndownstream task performance.\n","authors":["Xiao Zhang","Ruoxi Jiang","William Gao","Rebecca Willett","Michael Maire"],"pdf_url":"https://arxiv.org/pdf/2404.10947v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10940v1","updated":"2024-04-16T22:44:29Z","published":"2024-04-16T22:44:29Z","title":"Neuromorphic Vision-based Motion Segmentation with Graph Transformer\n Neural Network","summary":" Moving object segmentation is critical to interpret scene dynamics for\nrobotic navigation systems in challenging environments. Neuromorphic vision\nsensors are tailored for motion perception due to their asynchronous nature,\nhigh temporal resolution, and reduced power consumption. However, their\nunconventional output requires novel perception paradigms to leverage their\nspatially sparse and temporally dense nature. In this work, we propose a novel\nevent-based motion segmentation algorithm using a Graph Transformer Neural\nNetwork, dubbed GTNN. Our proposed algorithm processes event streams as 3D\ngraphs by a series of nonlinear transformations to unveil local and global\nspatiotemporal correlations between events. Based on these correlations, events\nbelonging to moving objects are segmented from the background without prior\nknowledge of the dynamic scene geometry. The algorithm is trained on publicly\navailable datasets including MOD, EV-IMO, and \\textcolor{black}{EV-IMO2} using\nthe proposed training scheme to facilitate efficient training on extensive\ndatasets. Moreover, we introduce the Dynamic Object Mask-aware Event Labeling\n(DOMEL) approach for generating approximate ground-truth labels for event-based\nmotion segmentation datasets. We use DOMEL to label our own recorded Event\ndataset for Motion Segmentation (EMS-DOMEL), which we release to the public for\nfurther research and benchmarking. Rigorous experiments are conducted on\nseveral unseen publicly-available datasets where the results revealed that GTNN\noutperforms state-of-the-art methods in the presence of dynamic background\nvariations, motion patterns, and multiple dynamic objects with varying sizes\nand velocities. 
GTNN achieves significant performance gains with an average\nincrease of 9.4% and 4.5% in terms of motion segmentation accuracy (IoU%) and\ndetection rate (DR%), respectively.\n","authors":["Yusra Alkendi","Rana Azzam","Sajid Javed","Lakmal Seneviratne","Yahya Zweiri"],"pdf_url":"https://arxiv.org/pdf/2404.10940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17121v2","updated":"2024-04-16T22:41:26Z","published":"2023-11-28T13:44:33Z","title":"ScribbleGen: Generative Data Augmentation Improves Scribble-supervised\n Semantic Segmentation","summary":" Recent advances in generative models, such as diffusion models, have made\ngenerating high-quality synthetic images widely accessible. Prior works have\nshown that training on synthetic images improves many perception tasks, such as\nimage classification, object detection, and semantic segmentation. We are the\nfirst to explore generative data augmentations for scribble-supervised semantic\nsegmentation. We propose ScribbleGen, a generative data augmentation method\nthat leverages a ControlNet diffusion model conditioned on semantic scribbles\nto produce high-quality training data. However, naive implementations of\ngenerative data augmentations may inadvertently harm the performance of the\ndownstream segmentor rather than improve it. We leverage classifier-free\ndiffusion guidance to enforce class consistency and introduce encode ratios to\ntrade off data diversity for data realism. Using the guidance scale and encode\nratio, we can generate a spectrum of high-quality training images. We propose\nmultiple augmentation schemes and find that these schemes significantly impact\nmodel performance, especially in the low-data regime. Our framework further\nreduces the gap between the performance of scribble-supervised segmentation and\nthat of fully-supervised segmentation. We also show that our framework\nsignificantly improves segmentation performance on small datasets, even\nsurpassing fully-supervised segmentation. The code is available at\nhttps://github.com/mengtang-lab/scribblegen.\n","authors":["Jacob Schnell","Jieke Wang","Lu Qi","Vincent Tao Hu","Meng Tang"],"pdf_url":"https://arxiv.org/pdf/2311.17121v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13938v2","updated":"2024-04-16T22:29:17Z","published":"2023-07-26T03:30:28Z","title":"Improving Semi-Supervised Semantic Segmentation with Dual-Level Siamese\n Structure Network","summary":" Semi-supervised semantic segmentation (SSS) is an important task that\nutilizes both labeled and unlabeled data to reduce expenses on labeling\ntraining examples. However, the effectiveness of SSS algorithms is limited by\nthe difficulty of fully exploiting the potential of unlabeled data. To address\nthis, we propose a dual-level Siamese structure network (DSSN) for pixel-wise\ncontrastive learning. By aligning positive pairs with a pixel-wise contrastive\nloss using strong augmented views in both low-level image space and high-level\nfeature space, the proposed DSSN is designed to maximize the utilization of\navailable unlabeled data. Additionally, we introduce a novel class-aware\npseudo-label selection strategy for weak-to-strong supervision, which addresses\nthe limitations of most existing methods that do not perform selection or apply\na predefined threshold for all classes. Specifically, our strategy selects the\ntop high-confidence prediction of the weak view for each class to generate\npseudo labels that supervise the strong augmented views. 
This strategy is\ncapable of taking into account the class imbalance and improving the\nperformance of long-tailed classes. Our proposed method achieves\nstate-of-the-art results on two datasets, PASCAL VOC 2012 and Cityscapes,\noutperforming other SSS algorithms by a significant margin. The source code is\navailable at https://github.com/kunzhan/DSSN.\n","authors":["Zhibo Tain","Xiaolin Zhang","Peng Zhang","Kun Zhan"],"pdf_url":"https://arxiv.org/pdf/2307.13938v2.pdf","comment":"ACM MM 2023"},{"id":"http://arxiv.org/abs/2402.18771v2","updated":"2024-04-16T22:15:58Z","published":"2024-02-29T00:25:26Z","title":"NARUTO: Neural Active Reconstruction from Uncertain Target Observations","summary":" We present NARUTO, a neural active reconstruction system that combines a\nhybrid neural representation with uncertainty learning, enabling high-fidelity\nsurface reconstruction. Our approach leverages a multi-resolution hash-grid as\nthe mapping backbone, chosen for its exceptional convergence speed and capacity\nto capture high-frequency local features.The centerpiece of our work is the\nincorporation of an uncertainty learning module that dynamically quantifies\nreconstruction uncertainty while actively reconstructing the environment. By\nharnessing learned uncertainty, we propose a novel uncertainty aggregation\nstrategy for goal searching and efficient path planning. Our system\nautonomously explores by targeting uncertain observations and reconstructs\nenvironments with remarkable completeness and fidelity. We also demonstrate the\nutility of this uncertainty-aware approach by enhancing SOTA neural SLAM\nsystems through an active ray sampling strategy. Extensive evaluations of\nNARUTO in various environments, using an indoor scene simulator, confirm its\nsuperior performance and state-of-the-art status in active reconstruction, as\nevidenced by its impressive results on benchmark datasets like Replica and\nMP3D.\n","authors":["Ziyue Feng","Huangying Zhan","Zheng Chen","Qingan Yan","Xiangyu Xu","Changjiang Cai","Bing Li","Qilun Zhu","Yi Xu"],"pdf_url":"https://arxiv.org/pdf/2402.18771v2.pdf","comment":"Accepted to CVPR2024. Project page:\n https://oppo-us-research.github.io/NARUTO-website/. Code:\n https://github.com/oppo-us-research/NARUTO"},{"id":"http://arxiv.org/abs/2403.09799v2","updated":"2024-04-16T22:03:16Z","published":"2024-03-14T18:37:46Z","title":"BOP Challenge 2023 on Detection, Segmentation and Pose Estimation of\n Seen and Unseen Rigid Objects","summary":" We present the evaluation methodology, datasets and results of the BOP\nChallenge 2023, the fifth in a series of public competitions organized to\ncapture the state of the art in model-based 6D object pose estimation from an\nRGB/RGB-D image and related tasks. Besides the three tasks from 2022\n(model-based 2D detection, 2D segmentation, and 6D localization of objects seen\nduring training), the 2023 challenge introduced new variants of these tasks\nfocused on objects unseen during training. In the new tasks, methods were\nrequired to learn new objects during a short onboarding stage (max 5 minutes, 1\nGPU) from provided 3D object models. The best 2023 method for 6D localization\nof unseen objects (GenFlow) notably reached the accuracy of the best 2020\nmethod for seen objects (CosyPose), although being noticeably slower. The best\n2023 method for seen objects (GPose) achieved a moderate accuracy improvement\nbut a significant 43% run-time improvement compared to the best 2022\ncounterpart (GDRNPP). 
Since 2017, the accuracy of 6D localization of seen\nobjects has improved by more than 50% (from 56.9 to 85.6 AR_C). The online\nevaluation system stays open and is available at: http://bop.felk.cvut.cz/.\n","authors":["Tomas Hodan","Martin Sundermeyer","Yann Labbe","Van Nguyen Nguyen","Gu Wang","Eric Brachmann","Bertram Drost","Vincent Lepetit","Carsten Rother","Jiri Matas"],"pdf_url":"https://arxiv.org/pdf/2403.09799v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2302.13075"},{"id":"http://arxiv.org/abs/2404.10927v1","updated":"2024-04-16T21:57:58Z","published":"2024-04-16T21:57:58Z","title":"A Concise Tiling Strategy for Preserving Spatial Context in Earth\n Observation Imagery","summary":" We propose a new tiling strategy, Flip-n-Slide, which has been developed for\nspecific use with large Earth observation satellite images when the location of\nobjects-of-interest (OoI) is unknown and spatial context can be necessary for\nclass disambiguation. Flip-n-Slide is a concise and minimalistic approach that\nallows OoI to be represented at multiple tile positions and orientations. This\nstrategy introduces multiple views of spatio-contextual information, without\nintroducing redundancies into the training set. By maintaining distinct\ntransformation permutations for each tile overlap, we enhance the\ngeneralizability of the training set without misrepresenting the true data\ndistribution. Our experiments validate the effectiveness of Flip-n-Slide in the\ntask of semantic segmentation, a necessary data product in geophysical studies.\nWe find that Flip-n-Slide outperforms the previous state-of-the-art\naugmentation routines for tiled data in all evaluation metrics. For\nunderrepresented classes, Flip-n-Slide increases precision by as much as 15.8%.\n","authors":["Ellianna Abrahams","Tasha Snow","Matthew R. Siegfried","Fernando Pérez"],"pdf_url":"https://arxiv.org/pdf/2404.10927v1.pdf","comment":"Accepted to the Machine Learning for Remote Sensing (ML4RS) Workshop\n at ICLR 2024"},{"id":"http://arxiv.org/abs/2110.14553v4","updated":"2024-04-16T21:52:28Z","published":"2021-10-27T16:24:39Z","title":"GenURL: A General Framework for Unsupervised Representation Learning","summary":" Unsupervised representation learning (URL), which learns compact embeddings\nof high-dimensional data without supervision, has made remarkable progress\nrecently. However, the development of URLs for different requirements is\nindependent, which limits the generalization of the algorithms and becomes\nespecially prohibitive as the number of tasks grows. For example, dimension\nreduction methods such as t-SNE and UMAP optimize pair-wise data relationships\nby preserving the global geometric structure, while self-supervised learning\nmethods such as SimCLR and BYOL focus on mining the local statistics of\ninstances under specific augmentations. To address this dilemma, we summarize\nand propose a unified similarity-based URL framework, GenURL, which can\nsmoothly adapt to various URL tasks. In this paper, we regard URL tasks as\ndifferent implicit constraints on the data geometric structure that help to\nseek optimal low-dimensional representations that boil down to data structural\nmodeling (DSM) and low-dimensional transformation (LDT). Specifically, DSM\nprovides a structure-based submodule to describe the global structures, and\nLDT learns compact low-dimensional embeddings with given pretext tasks.\nMoreover, an objective function, General Kullback-Leibler divergence (GKL), is\nproposed to connect DSM and LDT naturally. 
Comprehensive experiments demonstrate that\nGenURL achieves consistent state-of-the-art performance in self-supervised\nvisual learning, unsupervised knowledge distillation (KD), graph embeddings\n(GE), and dimension reduction.\n","authors":["Siyuan Li","Zicheng Liu","Zelin Zang","Di Wu","Zhiyuan Chen","Stan Z. Li"],"pdf_url":"https://arxiv.org/pdf/2110.14553v4.pdf","comment":"TNNLS 2024 version with 13 pages and 14 figures"},{"id":"http://arxiv.org/abs/2402.01203v2","updated":"2024-04-16T21:44:32Z","published":"2024-02-02T08:13:18Z","title":"Neural Language of Thought Models","summary":" The Language of Thought Hypothesis suggests that human cognition operates on\na structured, language-like system of mental representations. While neural\nlanguage models can naturally benefit from the compositional structure\ninherently and explicitly expressed in language data, learning such\nrepresentations from non-linguistic general observations, like images, remains\na challenge. In this work, we introduce the Neural Language of Thought Model\n(NLoTM), a novel approach for unsupervised learning of LoTH-inspired\nrepresentation and generation. NLoTM comprises two key components: (1) the\nSemantic Vector-Quantized Variational Autoencoder, which learns hierarchical,\ncomposable discrete representations aligned with objects and their properties,\nand (2) the Autoregressive LoT Prior, an autoregressive transformer that learns\nto generate semantic concept tokens compositionally, capturing the underlying\ndata distribution. We evaluate NLoTM on several 2D and 3D image datasets,\ndemonstrating superior performance in downstream tasks, out-of-distribution\ngeneralization, and image generation quality compared to patch-based VQ-VAE and\ncontinuous object-centric representations. Our work presents a significant step\ntowards creating neural networks exhibiting more human-like understanding by\ndeveloping LoT-like representations and offers insights into the intersection\nof cognitive science and machine learning.\n","authors":["Yi-Fu Wu","Minseung Lee","Sungjin Ahn"],"pdf_url":"https://arxiv.org/pdf/2402.01203v2.pdf","comment":"Accepted in ICLR 2024"},{"id":"http://arxiv.org/abs/2310.07687v2","updated":"2024-04-16T21:13:12Z","published":"2023-10-11T17:36:17Z","title":"Orbital Polarimetric Tomography of a Flare Near the Sagittarius A*\n Supermassive Black Hole","summary":" The interaction between the supermassive black hole at the center of the\nMilky Way, Sagittarius A*, and its accretion disk occasionally produces\nhigh-energy flares seen in X-ray, infrared, and radio. One proposed mechanism\nthat produces flares is the formation of compact, bright regions that appear\nwithin the accretion disk and close to the event horizon. Understanding these\nflares provides a window into accretion processes. Although sophisticated\nsimulations predict the formation of these flares, their structure has yet to\nbe recovered by observations. Here we show the first three-dimensional (3D)\nreconstruction of an emission flare recovered from ALMA light curves observed\non April 11, 2017. Our recovery shows compact, bright regions at a distance of\nroughly six times the event horizon. Moreover, it suggests a clockwise rotation\nin a low-inclination orbital plane, consistent with prior studies by GRAVITY\nand EHT. To recover this emission structure, we solve an ill-posed tomography\nproblem by integrating a neural 3D representation with a gravitational model\nfor black holes. 
Although the recovery is subject to, and sometimes sensitive\nto, the model assumptions, under physically motivated choices, our results are\nstable, and our approach is successful on simulated data.\n","authors":["Aviad Levis","Andrew A. Chael","Katherine L. Bouman","Maciek Wielgus","Pratul P. Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2310.07687v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16168v2","updated":"2024-04-16T21:05:24Z","published":"2023-12-26T18:56:49Z","title":"Social-Transmotion: Promptable Human Trajectory Prediction","summary":" Accurate human trajectory prediction is crucial for applications such as\nautonomous vehicles, robotics, and surveillance systems. Yet, existing models\noften fail to fully leverage the non-verbal social cues human subconsciously\ncommunicate when navigating the space. To address this, we introduce\nSocial-Transmotion, a generic Transformer-based model that exploits diverse and\nnumerous visual cues to predict human behavior. We translate the idea of a\nprompt from Natural Language Processing (NLP) to the task of human trajectory\nprediction, where a prompt can be a sequence of x-y coordinates on the ground,\nbounding boxes in the image plane, or body pose keypoints in either 2D or 3D.\nThis, in turn, augments trajectory data, leading to enhanced human trajectory\nprediction. Using masking technique, our model exhibits flexibility and\nadaptability by capturing spatiotemporal interactions between agents based on\nthe available visual cues. We delve into the merits of using 2D versus 3D\nposes, and a limited set of poses. Additionally, we investigate the spatial and\ntemporal attention map to identify which keypoints and time-steps in the\nsequence are vital for optimizing human trajectory prediction. Our approach is\nvalidated on multiple datasets, including JTA, JRDB, Pedestrians and Cyclists\nin Road Traffic, and ETH-UCY. The code is publicly available:\nhttps://github.com/vita-epfl/social-transmotion.\n","authors":["Saeed Saadatnejad","Yang Gao","Kaouther Messaoud","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2312.16168v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2307.03798v3","updated":"2024-04-16T20:57:35Z","published":"2023-07-07T18:54:11Z","title":"Fooling Contrastive Language-Image Pre-trained Models with\n CLIPMasterPrints","summary":" Models leveraging both visual and textual data such as Contrastive\nLanguage-Image Pre-training (CLIP), are the backbone of many recent advances in\nartificial intelligence. In this work, we show that despite their versatility,\nsuch models are vulnerable to what we refer to as fooling master images.\nFooling master images are capable of maximizing the confidence score of a CLIP\nmodel for a significant number of widely varying prompts, while being either\nunrecognizable or unrelated to the attacked prompts for humans. The existence\nof such images is problematic as it could be used by bad actors to maliciously\ninterfere with CLIP-trained image retrieval models in production with\ncomparably small effort as a single image can attack many different prompts. We\ndemonstrate how fooling master images for CLIP (CLIPMasterPrints) can be mined\nusing stochastic gradient descent, projected gradient descent, or blackbox\noptimization. Contrary to many common adversarial attacks, the blackbox\noptimization approach allows us to mine CLIPMasterPrints even when the weights\nof the model are not accessible. 
We investigate the properties of the mined\nimages, and find that images trained on a small number of image captions\ngeneralize to a much larger number of semantically related captions. We\nevaluate possible mitigation strategies, where we increase the robustness of\nthe model and introduce an approach to automatically detect CLIPMasterPrints to\nsanitize the input of vulnerable models. Finally, we find that vulnerability to\nCLIPMasterPrints is related to a modality gap in contrastive pre-trained\nmulti-modal networks. Code available at\nhttps://github.com/matfrei/CLIPMasterPrints.\n","authors":["Matthias Freiberger","Peter Kun","Christian Igel","Anders Sundnes Løvlie","Sebastian Risi"],"pdf_url":"https://arxiv.org/pdf/2307.03798v3.pdf","comment":"This work was supported by a research grant (40575) from VILLUM\n FONDEN"},{"id":"http://arxiv.org/abs/2404.10904v1","updated":"2024-04-16T20:51:36Z","published":"2024-04-16T20:51:36Z","title":"Multi-Task Multi-Modal Self-Supervised Learning for Facial Expression\n Recognition","summary":" Human communication is multi-modal; e.g., face-to-face interaction involves\nauditory signals (speech) and visual signals (face movements and hand\ngestures). Hence, it is essential to exploit multiple modalities when designing\nmachine learning-based facial expression recognition systems. In addition,\ngiven the ever-growing quantities of video data that capture human facial\nexpressions, such systems should utilize raw unlabeled videos without requiring\nexpensive annotations. Therefore, in this work, we employ a multitask\nmulti-modal self-supervised learning method for facial expression recognition\nfrom in-the-wild video data. Our model combines three self-supervised objective\nfunctions: First, a multi-modal contrastive loss, that pulls diverse data\nmodalities of the same video together in the representation space. Second, a\nmulti-modal clustering loss that preserves the semantic structure of input data\nin the representation space. Finally, a multi-modal data reconstruction loss.\nWe conduct a comprehensive study on this multimodal multi-task self-supervised\nlearning method on three facial expression recognition benchmarks. To that end,\nwe examine the performance of learning through different combinations of\nself-supervised tasks on the facial expression recognition downstream task. Our\nmodel ConCluGen outperforms several multi-modal self-supervised and fully\nsupervised baselines on the CMU-MOSEI dataset. Our results generally show that\nmulti-modal self-supervision tasks offer large performance gains for\nchallenging tasks such as facial expression recognition, while also reducing\nthe amount of manual annotations required. 
We release our pre-trained models as\nwell as source code publicly\n","authors":["Marah Halawa","Florian Blume","Pia Bideau","Martin Maier","Rasha Abdel Rahman","Olaf Hellwich"],"pdf_url":"https://arxiv.org/pdf/2404.10904v1.pdf","comment":"The paper will appear in the CVPR 2024 workshops proceedings"},{"id":"http://arxiv.org/abs/2404.10896v1","updated":"2024-04-16T20:37:54Z","published":"2024-04-16T20:37:54Z","title":"From a Lossless (~1.5:1) Compression Algorithm for Llama2 7B Weights to\n Variable Precision, Variable Range, Compressed Numeric Data Types for CNNs\n and LLMs","summary":" This paper starts with a simple lossless ~1.5:1 compression algorithm for the\nweights of the Large Language Model (LLM) Llama2 7B [1] that can be implemented\nin ~200 LUTs in AMD FPGAs, processing over 800 million bfloat16 numbers per\nsecond. This framework is then extended to variable precision, variable range,\ncompressed numerical data types that are a user defined super set of both\nfloats and posits [2]. The paper then discusses a simple hardware\nimplementation of such format based on ANS (Asymmetrical Numeral Systems) [3]\nthat acts as a bridge between this flexible data format and a computational\nengine while, at the same time, achieving bandwidth reduction. An example of a\ntoken factory using weight compression and sharing is also given.\n","authors":["Vincenzo Liguori"],"pdf_url":"https://arxiv.org/pdf/2404.10896v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10894v1","updated":"2024-04-16T20:37:14Z","published":"2024-04-16T20:37:14Z","title":"Semantics-Aware Attention Guidance for Diagnosing Whole Slide Images","summary":" Accurate cancer diagnosis remains a critical challenge in digital pathology,\nlargely due to the gigapixel size and complex spatial relationships present in\nwhole slide images. Traditional multiple instance learning (MIL) methods often\nstruggle with these intricacies, especially in preserving the necessary context\nfor accurate diagnosis. In response, we introduce a novel framework named\nSemantics-Aware Attention Guidance (SAG), which includes 1) a technique for\nconverting diagnostically relevant entities into attention signals, and 2) a\nflexible attention loss that efficiently integrates various semantically\nsignificant information, such as tissue anatomy and cancerous regions. Our\nexperiments on two distinct cancer datasets demonstrate consistent improvements\nin accuracy, precision, and recall with two state-of-the-art baseline models.\nQualitative analysis further reveals that the incorporation of heuristic\nguidance enables the model to focus on regions critical for diagnosis. SAG is\nnot only effective for the models discussed here, but its adaptability extends\nto any attention-based diagnostic model. This opens up exciting possibilities\nfor further improving the accuracy and efficiency of cancer diagnostics.\n","authors":["Kechun Liu","Wenjun Wu","Joann G. Elmore","Linda G. Shapiro"],"pdf_url":"https://arxiv.org/pdf/2404.10894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10892v1","updated":"2024-04-16T20:30:16Z","published":"2024-04-16T20:30:16Z","title":"Automatic classification of prostate MR series type using image content\n and metadata","summary":" With the wealth of medical image data, efficient curation is essential.\nAssigning the sequence type to magnetic resonance images is necessary for\nscientific studies and artificial intelligence-based analysis. However,\nincomplete or missing metadata prevents effective automation. 
We therefore\npropose a deep-learning method for classification of prostate cancer scanning\nsequences based on a combination of image data and DICOM metadata. We\ndemonstrate superior results compared to metadata or image data alone, and make\nour code publicly available at\nhttps://github.com/deepakri201/DICOMScanClassification.\n","authors":["Deepa Krishnaswamy","Bálint Kovács","Stefan Denner","Steve Pieper","David Clunie","Christopher P. Bridge","Tina Kapur","Klaus H. Maier-Hein","Andrey Fedorov"],"pdf_url":"https://arxiv.org/pdf/2404.10892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10880v1","updated":"2024-04-16T19:59:21Z","published":"2024-04-16T19:59:21Z","title":"HumMUSS: Human Motion Understanding using State Space Models","summary":" Understanding human motion from video is essential for a range of\napplications, including pose estimation, mesh recovery and action recognition.\nWhile state-of-the-art methods predominantly rely on transformer-based\narchitectures, these approaches have limitations in practical scenarios.\nTransformers are slower when sequentially predicting on a continuous stream of\nframes in real-time, and do not generalize to new frame rates. In light of\nthese constraints, we propose a novel attention-free spatiotemporal model for\nhuman motion understanding building upon recent advancements in state space\nmodels. Our model not only matches the performance of transformer-based models\nin various motion understanding tasks but also brings added benefits like\nadaptability to different video frame rates and enhanced training speed when\nworking with longer sequence of keypoints. Moreover, the proposed model\nsupports both offline and real-time applications. For real-time sequential\nprediction, our model is both memory efficient and several times faster than\ntransformer-based approaches while maintaining their high accuracy.\n","authors":["Arnab Kumar Mondal","Stefano Alletto","Denis Tome"],"pdf_url":"https://arxiv.org/pdf/2404.10880v1.pdf","comment":"CVPR 24"},{"id":"http://arxiv.org/abs/2404.10865v1","updated":"2024-04-16T19:29:27Z","published":"2024-04-16T19:29:27Z","title":"OSR-ViT: A Simple and Modular Framework for Open-Set Object Detection\n and Discovery","summary":" An object detector's ability to detect and flag \\textit{novel} objects during\nopen-world deployments is critical for many real-world applications.\nUnfortunately, much of the work in open object detection today is disjointed\nand fails to adequately address applications that prioritize unknown object\nrecall \\textit{in addition to} known-class accuracy. To close this gap, we\npresent a new task called Open-Set Object Detection and Discovery (OSODD) and\nas a solution propose the Open-Set Regions with ViT features (OSR-ViT)\ndetection framework. OSR-ViT combines a class-agnostic proposal network with a\npowerful ViT-based classifier. Its modular design simplifies optimization and\nallows users to easily swap proposal solutions and feature extractors to best\nsuit their application. Using our multifaceted evaluation protocol, we show\nthat OSR-ViT obtains performance levels that far exceed state-of-the-art\nsupervised methods. 
Our method also excels in low-data settings, outperforming\nsupervised baselines using a fraction of the training data.\n","authors":["Matthew Inkawhich","Nathan Inkawhich","Hao Yang","Jingyang Zhang","Randolph Linderman","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2404.10865v1.pdf","comment":"28 pages, 8 figures, 7 tables"},{"id":"http://arxiv.org/abs/2404.10864v1","updated":"2024-04-16T19:27:21Z","published":"2024-04-16T19:27:21Z","title":"Vocabulary-free Image Classification and Semantic Segmentation","summary":" Large vision-language models revolutionized image classification and semantic\nsegmentation paradigms. However, they typically assume a pre-defined set of\ncategories, or vocabulary, at test time for composing textual prompts. This\nassumption is impractical in scenarios with unknown or evolving semantic\ncontext. Here, we address this issue and introduce the Vocabulary-free Image\nClassification (VIC) task, which aims to assign a class from an unconstrained\nlanguage-induced semantic space to an input image without needing a known\nvocabulary. VIC is challenging due to the vastness of the semantic space, which\ncontains millions of concepts, including fine-grained categories. To address\nVIC, we propose Category Search from External Databases (CaSED), a\ntraining-free method that leverages a pre-trained vision-language model and an\nexternal database. CaSED first extracts the set of candidate categories from\nthe most semantically similar captions in the database and then assigns the\nimage to the best-matching candidate category according to the same\nvision-language model. Furthermore, we demonstrate that CaSED can be applied\nlocally to generate a coarse segmentation mask that classifies image regions,\nintroducing the task of Vocabulary-free Semantic Segmentation. CaSED and its\nvariants outperform other more complex vision-language models, on\nclassification and semantic segmentation benchmarks, while using much fewer\nparameters.\n","authors":["Alessandro Conti","Enrico Fini","Massimiliano Mancini","Paolo Rota","Yiming Wang","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2404.10864v1.pdf","comment":"Under review, 22 pages, 10 figures, code is available at\n https://github.com/altndrr/vicss. arXiv admin note: text overlap with\n arXiv:2306.00917"},{"id":"http://arxiv.org/abs/2208.11050v3","updated":"2024-04-16T19:16:34Z","published":"2022-08-23T15:57:19Z","title":"Tunable Hybrid Proposal Networks for the Open World","summary":" Current state-of-the-art object proposal networks are trained with a\nclosed-world assumption, meaning they learn to only detect objects of the\ntraining classes. These models fail to provide high recall in open-world\nenvironments where important novel objects may be encountered. While a handful\nof recent works attempt to tackle this problem, they fail to consider that the\noptimal behavior of a proposal network can vary significantly depending on the\ndata and application. Our goal is to provide a flexible proposal solution that\ncan be easily tuned to suit a variety of open-world settings. To this end, we\ndesign a Tunable Hybrid Proposal Network (THPN) that leverages an adjustable\nhybrid architecture, a novel self-training procedure, and dynamic loss\ncomponents to optimize the tradeoff between known and unknown object detection\nperformance. To thoroughly evaluate our method, we devise several new\nchallenges which invoke varying degrees of label bias by altering known class\ndiversity and label count. 
We find that in every task, THPN easily outperforms\nexisting baselines (e.g., RPN, OLN). Our method is also highly data efficient,\nsurpassing baseline recall with a fraction of the labeled data.\n","authors":["Matthew Inkawhich","Nathan Inkawhich","Hai Li","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2208.11050v3.pdf","comment":"Published in WACV 2024. 22 pages, 9 figures, 12 tables"},{"id":"http://arxiv.org/abs/2404.10856v1","updated":"2024-04-16T19:10:40Z","published":"2024-04-16T19:10:40Z","title":"UruDendro, a public dataset of cross-section images of Pinus taeda","summary":" The automatic detection of tree-ring boundaries and other anatomical features\nusing image analysis has progressed substantially over the past decade with\nadvances in machine learning and imagery technology, as well as increasing\ndemands from the dendrochronology community. This paper presents a publicly\navailable database of 64 scanned images of transverse sections of commercially\ngrown Pinus taeda trees from northern Uruguay, ranging from 17 to 24 years old.\nThe collection contains several challenging features for automatic ring\ndetection, including illumination and surface preparation variation, fungal\ninfection (blue stains), knot formation, missing cortex or interruptions in\nouter rings, and radial cracking. This dataset can be used to develop and test\nautomatic tree ring detection algorithms. This paper presents to the\ndendrochronology community one such method, Cross-Section Tree-Ring Detection\n(CS-TRD), which identifies and marks complete annual rings in cross-sections\nfor tree species presenting a clear definition between early and latewood. We\ncompare the CS-TRD performance against the ground truth manual delineation of\nall rings over the UruDendro dataset. The CS-TRD software identified rings with\nan average F-score of 89% and RMSE error of 5.27px for the entire database in\nless than 20 seconds per image. Finally, we propose a robust measure of the\nring growth using the \\emph{equivalent radius} of a circle having the same area\nenclosed by the detected tree ring. Overall, this study contributes to the\ndendrochronologist's toolbox of fast and low-cost methods to automatically\ndetect rings in conifer species, particularly for measuring diameter growth\nrates and stem transverse area using entire cross-sections.\n","authors":["Henry Marichal","Diego Passarella","Christine Lucas","Ludmila Profumo","Verónica Casaravilla","María Noel Rocha Galli","Serrana Ambite","Gregory Randall"],"pdf_url":"https://arxiv.org/pdf/2404.10856v1.pdf","comment":"Submitted to Dendrochronologia. arXiv admin note: text overlap with\n arXiv:2305.10809"},{"id":"http://arxiv.org/abs/2402.02286v2","updated":"2024-04-16T19:07:06Z","published":"2024-02-03T22:51:17Z","title":"Multi-Level Feature Aggregation and Recursive Alignment Network for\n Real-Time Semantic Segmentation","summary":" Real-time semantic segmentation is a crucial research for real-world\napplications. However, many methods lay particular emphasis on reducing the\ncomputational complexity and model size, while largely sacrificing the\naccuracy. To tackle this problem, we propose a parallel inference network\ncustomized for semantic segmentation tasks to achieve a good trade-off between\nspeed and accuracy. We employ a shallow backbone to ensure real-time speed, and\npropose three core components to compensate for the reduced model capacity to\nimprove accuracy. 
Specifically, we first design a dual-pyramidal path\narchitecture (Multi-level Feature Aggregation Module, MFAM) to aggregate\nmulti-level features from the encoder to each scale, providing hierarchical\nclues for subsequent spatial alignment and corresponding in-network inference.\nThen, we build Recursive Alignment Module (RAM) by combining the flow-based\nalignment module with recursive upsampling architecture for accurate spatial\nalignment between multi-scale feature maps with half the computational\ncomplexity of the straightforward alignment method. Finally, we perform\nindependent parallel inference on the aligned features to obtain multi-scale\nscores, and adaptively fuse them through an attention-based Adaptive Scores\nFusion Module (ASFM) so that the final prediction can favor objects of multiple\nscales. Our framework shows a better balance between speed and accuracy than\nstate-of-the-art real-time methods on Cityscapes and CamVid datasets. We also\nconducted systematic ablation studies to gain insight into our motivation and\narchitectural design. Code is available at:\nhttps://github.com/Yanhua-Zhang/MFARANet.\n","authors":["Yanhua Zhang","Ke Zhang","Jingyu Wang","Yulin Wu","Wuwei Wang"],"pdf_url":"https://arxiv.org/pdf/2402.02286v2.pdf","comment":"15 pages, 9 figures and 12 Tables. Manuscript completed on April 30,\n 2022"},{"id":"http://arxiv.org/abs/2404.10841v1","updated":"2024-04-16T18:38:23Z","published":"2024-04-16T18:38:23Z","title":"Gasformer: A Transformer-based Architecture for Segmenting Methane\n Emissions from Livestock in Optical Gas Imaging","summary":" Methane emissions from livestock, particularly cattle, significantly\ncontribute to climate change. Effective methane emission mitigation strategies\nare crucial as the global population and demand for livestock products\nincrease. We introduce Gasformer, a novel semantic segmentation architecture\nfor detecting low-flow rate methane emissions from livestock, and controlled\nrelease experiments using optical gas imaging. We present two unique datasets\ncaptured with a FLIR GF77 OGI camera. Gasformer leverages a Mix Vision\nTransformer encoder and a Light-Ham decoder to generate multi-scale features\nand refine segmentation maps. Gasformer outperforms other state-of-the-art\nmodels on both datasets, demonstrating its effectiveness in detecting and\nsegmenting methane plumes in controlled and real-world scenarios. On the\nlivestock dataset, Gasformer achieves mIoU of 88.56%, surpassing other\nstate-of-the-art models. Materials are available at:\ngithub.com/toqitahamid/Gasformer.\n","authors":["Toqi Tahamid Sarker","Mohamed G Embaby","Khaled R Ahmed","Amer AbuGhazaleh"],"pdf_url":"https://arxiv.org/pdf/2404.10841v1.pdf","comment":"9 pages, 5 figures, this paper has been submitted and accepted for\n publication at CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.10838v1","updated":"2024-04-16T18:22:49Z","published":"2024-04-16T18:22:49Z","title":"Dynamic Self-adaptive Multiscale Distillation from Pre-trained\n Multimodal Large Model for Efficient Cross-modal Representation Learning","summary":" In recent years, pre-trained multimodal large models have attracted\nwidespread attention due to their outstanding performance in various multimodal\napplications. Nonetheless, the extensive computational resources and vast\ndatasets required for their training present significant hurdles for deployment\nin environments with limited computational resources. 
To address this\nchallenge, we propose, for the first time, a novel dynamic self-adaptive\nmultiscale distillation from a pre-trained multimodal large model for efficient\ncross-modal representation learning. Unlike existing distillation methods, our\nstrategy employs a multiscale perspective, enabling the extraction of\nstructural knowledge from the pre-trained multimodal large model and ensuring\nthat the student model inherits a comprehensive and nuanced understanding of\nthe teacher's knowledge. To optimize each distillation loss in a balanced and\nefficient manner, we propose a dynamic self-adaptive distillation loss\nbalancer, a novel component that eliminates the need for manual loss weight\nadjustments and dynamically balances each loss item during the distillation\nprocess. Our methodology streamlines pre-trained multimodal large models using\nonly their output features and original image-level information, requiring\nminimal computational resources. This efficient approach is suited for various\napplications and allows the deployment of advanced multimodal technologies even\nin resource-limited settings. Extensive experiments have demonstrated that our\nmethod maintains high performance while significantly reducing model complexity\nand training costs. Moreover, our distilled student model utilizes only\nimage-level information to achieve state-of-the-art performance on cross-modal\nretrieval tasks, surpassing previous methods that relied on region-level\ninformation.\n","authors":["Zhengyang Liang","Meiyu Liang","Wei Huang","Yawen Li","Zhe Xue"],"pdf_url":"https://arxiv.org/pdf/2404.10838v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.10836v1","updated":"2024-04-16T18:15:57Z","published":"2024-04-16T18:15:57Z","title":"Semantic-Based Active Perception for Humanoid Visual Tasks with Foveal\n Sensors","summary":" The aim of this work is to establish how accurately a recent semantic-based\nfoveal active perception model is able to complete visual tasks that are\nregularly performed by humans, namely, scene exploration and visual search.\nThis model exploits the ability of current object detectors to localize and\nclassify a large number of object classes and to update a semantic description\nof a scene across multiple fixations. It has been used previously in scene\nexploration tasks. In this paper, we revisit the model and extend its\napplication to visual search tasks. To illustrate the benefits of using\nsemantic information in scene exploration and visual search tasks, we compare\nits performance against traditional saliency-based models. In the task of scene\nexploration, the semantic-based method demonstrates superior performance\ncompared to the traditional saliency-based model in accurately representing the\nsemantic information present in the visual scene. In visual search experiments,\nwhere the task is to find instances of a target class in a visual field\ncontaining multiple distractors, the semantic-based method again shows superior\nperformance compared to the saliency-driven model and a random gaze selection\nalgorithm. 
Our results demonstrate that semantic\ninformation, from the top-down, influences visual exploration and search tasks\nsignificantly, suggesting a potential area of research for integrating it with\ntraditional bottom-up cues.\n","authors":["João Luzio","Alexandre Bernardino","Plinio Moreno"],"pdf_url":"https://arxiv.org/pdf/2404.10836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18635v2","updated":"2024-04-16T18:07:51Z","published":"2023-11-30T15:43:13Z","title":"DiffusionAvatars: Deferred Diffusion for High-fidelity 3D Head Avatars","summary":" DiffusionAvatars synthesizes a high-fidelity 3D head avatar of a person,\noffering intuitive control over both pose and expression. We propose a\ndiffusion-based neural renderer that leverages generic 2D priors to produce\ncompelling images of faces. For coarse guidance of the expression and head\npose, we render a neural parametric head model (NPHM) from the target\nviewpoint, which acts as a proxy geometry of the person. Additionally, to\nenhance the modeling of intricate facial expressions, we condition\nDiffusionAvatars directly on the expression codes obtained from NPHM via\ncross-attention. Finally, to synthesize consistent surface details across\ndifferent viewpoints and expressions, we rig learnable spatial features to the\nhead's surface via TriPlane lookup in NPHM's canonical space. We train\nDiffusionAvatars on RGB videos and corresponding fitted NPHM meshes of a person\nand test the obtained avatars in both self-reenactment and animation scenarios.\nOur experiments demonstrate that DiffusionAvatars generates temporally\nconsistent and visually appealing videos for novel poses and expressions of a\nperson, outperforming existing approaches.\n","authors":["Tobias Kirschstein","Simon Giebenhain","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2311.18635v2.pdf","comment":"Project Page: https://tobias-kirschstein.github.io/diffusion-avatars/\n , Video: https://youtu.be/nSjDiiTnp2E"},{"id":"http://arxiv.org/abs/2404.10518v1","updated":"2024-04-16T12:41:25Z","published":"2024-04-16T12:41:25Z","title":"MobileNetV4 -- Universal Models for the Mobile Ecosystem","summary":" We present the latest generation of MobileNets, known as MobileNetV4 (MNv4),\nfeaturing universally efficient architecture designs for mobile devices. At its\ncore, we introduce the Universal Inverted Bottleneck (UIB) search block, a\nunified and flexible structure that merges Inverted Bottleneck (IB), ConvNext,\nFeed Forward Network (FFN), and a novel Extra Depthwise (ExtraDW) variant.\nAlongside UIB, we present Mobile MQA, an attention block tailored for mobile\naccelerators, delivering a significant 39% speedup. An optimized neural\narchitecture search (NAS) recipe is also introduced which improves MNv4 search\neffectiveness. The integration of UIB, Mobile MQA and the refined NAS recipe\nresults in a new suite of MNv4 models that are mostly Pareto optimal across\nmobile CPUs, DSPs, GPUs, as well as specialized accelerators like Apple Neural\nEngine and Google Pixel EdgeTPU - a characteristic not found in any other\nmodels tested. Finally, to further boost accuracy, we introduce a novel\ndistillation technique. 
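The MobileNetV4 summary above introduces the Universal Inverted Bottleneck with an Extra Depthwise variant. Below is a rough sketch of an inverted-bottleneck block with an optional extra depthwise convolution to convey the idea; the channel sizes, layer ordering, and activation choices are assumptions, not the official MNv4 definition.

```python
# Sketch of an inverted-bottleneck block with an optional extra depthwise conv,
# approximating the UIB / ExtraDW idea summarized above (illustrative only).
import torch
import torch.nn as nn

class InvertedBottleneck(nn.Module):
    def __init__(self, c_in, c_out, expand=4, extra_dw=True):
        super().__init__()
        c_mid = c_in * expand
        layers = []
        if extra_dw:  # optional depthwise conv before the pointwise expansion
            layers += [nn.Conv2d(c_in, c_in, 3, padding=1, groups=c_in),
                       nn.BatchNorm2d(c_in), nn.ReLU()]
        layers += [nn.Conv2d(c_in, c_mid, 1), nn.BatchNorm2d(c_mid), nn.ReLU(),
                   nn.Conv2d(c_mid, c_mid, 3, padding=1, groups=c_mid),
                   nn.BatchNorm2d(c_mid), nn.ReLU(),
                   nn.Conv2d(c_mid, c_out, 1), nn.BatchNorm2d(c_out)]
        self.block = nn.Sequential(*layers)
        self.residual = c_in == c_out

    def forward(self, x):
        y = self.block(x)
        return x + y if self.residual else y

out = InvertedBottleneck(32, 32)(torch.randn(1, 32, 56, 56))
```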
Enhanced by this technique, our MNv4-Hybrid-Large model\ndelivers 87% ImageNet-1K accuracy, with a Pixel 8 EdgeTPU runtime of just\n3.8ms.\n","authors":["Danfeng Qin","Chas Leichner","Manolis Delakis","Marco Fornoni","Shixin Luo","Fan Yang","Weijun Wang","Colby Banbury","Chengxi Ye","Berkin Akin","Vaibhav Aggarwal","Tenghui Zhu","Daniele Moro","Andrew Howard"],"pdf_url":"https://arxiv.org/pdf/2404.10518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12407v1","updated":"2024-04-16T17:47:45Z","published":"2024-04-16T17:47:45Z","title":"TV100: A TV Series Dataset that Pre-Trained CLIP Has Not Seen","summary":" The era of pre-trained models has ushered in a wealth of new insights for the\nmachine learning community. Among the myriad of questions that arise, one of\nparamount importance is: 'Do pre-trained models possess comprehensive\nknowledge?' This paper seeks to address this crucial inquiry. In line with our\nobjective, we have made publicly available a novel dataset comprised of images\nfrom TV series released post-2021. This dataset holds significant potential for\nuse in various research areas, including the evaluation of incremental\nlearning, novel class discovery, and long-tailed learning, among others.\nProject page: https://tv-100.github.io/\n","authors":["Da-Wei Zhou","Zhi-Hong Qi","Han-Jia Ye","De-Chuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2404.12407v1.pdf","comment":"Project page: https://tv-100.github.io/"}]},"2024-04-17T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.11615v1","updated":"2024-04-17T17:59:59Z","published":"2024-04-17T17:59:59Z","title":"Factorized Diffusion: Perceptual Illusions by Noise Decomposition","summary":" Given a factorization of an image into a sum of linear components, we present\na zero-shot method to control each individual component through diffusion model\nsampling. For example, we can decompose an image into low and high spatial\nfrequencies and condition these components on different text prompts. This\nproduces hybrid images, which change appearance depending on viewing distance.\nBy decomposing an image into three frequency subbands, we can generate hybrid\nimages with three prompts. We also use a decomposition into grayscale and color\ncomponents to produce images whose appearance changes when they are viewed in\ngrayscale, a phenomena that naturally occurs under dim lighting. And we explore\na decomposition by a motion blur kernel, which produces images that change\nappearance under motion blurring. Our method works by denoising with a\ncomposite noise estimate, built from the components of noise estimates\nconditioned on different prompts. We also show that for certain decompositions,\nour method recovers prior approaches to compositional generation and spatial\ncontrol. Finally, we show that we can extend our approach to generate hybrid\nimages from real images. We do this by holding one component fixed and\ngenerating the remaining components, effectively solving an inverse problem.\n","authors":["Daniel Geng","Inbum Park","Andrew Owens"],"pdf_url":"https://arxiv.org/pdf/2404.11615v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11614v1","updated":"2024-04-17T17:59:55Z","published":"2024-04-17T17:59:55Z","title":"Dynamic Typography: Bringing Words to Life","summary":" Text animation serves as an expressive medium, transforming static\ncommunication into dynamic experiences by infusing words with motion to evoke\nemotions, emphasize meanings, and construct compelling narratives. 
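The Factorized Diffusion abstract above denoises with a composite noise estimate assembled from prompt-conditioned components, e.g. low frequencies from one prompt and high frequencies from another. The sketch below shows that combination step only; the Gaussian-blur decomposition, kernel size, and sigma are assumptions for illustration.

```python
# Minimal sketch of a composite noise estimate built from two prompt-conditioned
# estimates via a low/high spatial-frequency split (hybrid-image style).
import torch
import torchvision.transforms.functional as TF

def composite_noise(eps_low_prompt, eps_high_prompt, kernel_size=31, sigma=3.0):
    # Low frequencies come from the first estimate, high frequencies from the second.
    low = TF.gaussian_blur(eps_low_prompt, kernel_size=kernel_size, sigma=sigma)
    high = eps_high_prompt - TF.gaussian_blur(eps_high_prompt,
                                              kernel_size=kernel_size, sigma=sigma)
    return low + high

# Toy usage with random tensors standing in for the two conditioned estimates.
eps = composite_noise(torch.randn(1, 3, 64, 64), torch.randn(1, 3, 64, 64))
```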
Crafting\nanimations that are semantically aware poses significant challenges, demanding\nexpertise in graphic design and animation. We present an automated text\nanimation scheme, termed \"Dynamic Typography\", which combines two challenging\ntasks. It deforms letters to convey semantic meaning and infuses them with\nvibrant movements based on user prompts. Our technique harnesses vector\ngraphics representations and an end-to-end optimization-based framework. This\nframework employs neural displacement fields to convert letters into base\nshapes and applies per-frame motion, encouraging coherence with the intended\ntextual concept. Shape preservation techniques and perceptual loss\nregularization are employed to maintain legibility and structural integrity\nthroughout the animation process. We demonstrate the generalizability of our\napproach across various text-to-video models and highlight the superiority of\nour end-to-end methodology over baseline methods, which might comprise separate\ntasks. Through quantitative and qualitative evaluations, we demonstrate the\neffectiveness of our framework in generating coherent text animations that\nfaithfully interpret user prompts while maintaining readability. Our code is\navailable at: https://animate-your-word.github.io/demo/.\n","authors":["Zichen Liu","Yihao Meng","Hao Ouyang","Yue Yu","Bolin Zhao","Daniel Cohen-Or","Huamin Qu"],"pdf_url":"https://arxiv.org/pdf/2404.11614v1.pdf","comment":"Our demo page is available at:\n https://animate-your-word.github.io/demo/"},{"id":"http://arxiv.org/abs/2404.11613v1","updated":"2024-04-17T17:59:53Z","published":"2024-04-17T17:59:53Z","title":"InFusion: Inpainting 3D Gaussians via Learning Depth Completion from\n Diffusion Prior","summary":" 3D Gaussians have recently emerged as an efficient representation for novel\nview synthesis. This work studies its editability with a particular focus on\nthe inpainting task, which aims to supplement an incomplete set of 3D Gaussians\nwith additional points for visually harmonious rendering. Compared to 2D\ninpainting, the crux of inpainting 3D Gaussians is to figure out the\nrendering-relevant properties of the introduced points, whose optimization\nlargely benefits from their initial 3D positions. To this end, we propose to\nguide the point initialization with an image-conditioned depth completion\nmodel, which learns to directly restore the depth map based on the observed\nimage. Such a design allows our model to fill in depth values at an aligned\nscale with the original depth, and also to harness strong generalizability from\nlargescale diffusion prior. Thanks to the more accurate depth completion, our\napproach, dubbed InFusion, surpasses existing alternatives with sufficiently\nbetter fidelity and efficiency under various complex scenarios. 
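The InFusion summary above initializes new 3D Gaussians from a completed depth map. A generic step in that direction is unprojecting depth to camera-frame points with pinhole intrinsics, sketched below; the intrinsics and the flat depth map are illustrative assumptions, not the paper's pipeline.

```python
# Generic sketch of unprojecting a (completed) depth map into 3D points,
# the kind of step used to initialize inpainted points from predicted depth.
import numpy as np

def unproject_depth(depth, fx, fy, cx, cy):
    h, w = depth.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h))  # pixel coordinates
    x = (u - cx) * depth / fx
    y = (v - cy) * depth / fy
    # Return (H*W, 3) camera-frame points.
    return np.stack([x, y, depth], axis=-1).reshape(-1, 3)

points = unproject_depth(np.full((480, 640), 2.0),
                         fx=525.0, fy=525.0, cx=320.0, cy=240.0)
```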
We further\ndemonstrate the effectiveness of InFusion with several practical applications,\nsuch as inpainting with user-specific texture or with novel object insertion.\n","authors":["Zhiheng Liu","Hao Ouyang","Qiuyu Wang","Ka Leong Cheng","Jie Xiao","Kai Zhu","Nan Xue","Yu Liu","Yujun Shen","Yang Cao"],"pdf_url":"https://arxiv.org/pdf/2404.11613v1.pdf","comment":"Project page: https://johanan528.github.io/Infusion"},{"id":"http://arxiv.org/abs/2311.16278v3","updated":"2024-04-17T17:58:59Z","published":"2023-11-27T19:34:04Z","title":"VehicleGAN: Pair-flexible Pose Guided Image Synthesis for Vehicle\n Re-identification","summary":" Vehicle Re-identification (Re-ID) has been broadly studied in the last\ndecade; however, the different camera view angle leading to confused\ndiscrimination in the feature subspace for the vehicles of various poses, is\nstill challenging for the Vehicle Re-ID models in the real world. To promote\nthe Vehicle Re-ID models, this paper proposes to synthesize a large number of\nvehicle images in the target pose, whose idea is to project the vehicles of\ndiverse poses into the unified target pose so as to enhance feature\ndiscrimination. Considering that the paired data of the same vehicles in\ndifferent traffic surveillance cameras might be not available in the real\nworld, we propose the first Pair-flexible Pose Guided Image Synthesis method\nfor Vehicle Re-ID, named as VehicleGAN in this paper, which works for both\nsupervised and unsupervised settings without the knowledge of geometric 3D\nmodels. Because of the feature distribution difference between real and\nsynthetic data, simply training a traditional metric learning based Re-ID model\nwith data-level fusion (i.e., data augmentation) is not satisfactory, therefore\nwe propose a new Joint Metric Learning (JML) via effective feature-level fusion\nfrom both real and synthetic data. Intensive experimental results on the public\nVeRi-776 and VehicleID datasets prove the accuracy and effectiveness of our\nproposed VehicleGAN and JML.\n","authors":["Baolu Li","Ping Liu","Lan Fu","Jinlong Li","Jianwu Fang","Zhigang Xu","Hongkai Yu"],"pdf_url":"https://arxiv.org/pdf/2311.16278v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11605v1","updated":"2024-04-17T17:54:49Z","published":"2024-04-17T17:54:49Z","title":"VG4D: Vision-Language Model Goes 4D Video Recognition","summary":" Understanding the real world through point cloud video is a crucial aspect of\nrobotics and autonomous driving systems. However, prevailing methods for 4D\npoint cloud recognition have limitations due to sensor resolution, which leads\nto a lack of detailed information. Recent advances have shown that\nVision-Language Models (VLM) pre-trained on web-scale text-image datasets can\nlearn fine-grained visual concepts that can be transferred to various\ndownstream tasks. However, effectively integrating VLM into the domain of 4D\npoint clouds remains an unresolved problem. In this work, we propose the\nVision-Language Models Goes 4D (VG4D) framework to transfer VLM knowledge from\nvisual-text pre-trained models to a 4D point cloud network. Our approach\ninvolves aligning the 4D encoder's representation with a VLM to learn a shared\nvisual and text space from training on large-scale image-text pairs. By\ntransferring the knowledge of the VLM to the 4D encoder and combining the VLM,\nour VG4D achieves improved recognition performance. 
To enhance the 4D encoder,\nwe modernize the classic dynamic point cloud backbone and propose an improved\nversion of PSTNet, im-PSTNet, which can efficiently model point cloud videos.\nExperiments demonstrate that our method achieves state-of-the-art performance\nfor action recognition on both the NTU RGB+D 60 dataset and the NTU RGB+D 120\ndataset. Code is available at \\url{https://github.com/Shark0-0/VG4D}.\n","authors":["Zhichao Deng","Xiangtai Li","Xia Li","Yunhai Tong","Shen Zhao","Mengyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2404.11605v1.pdf","comment":"ICRA 2024"},{"id":"http://arxiv.org/abs/2112.06979v2","updated":"2024-04-17T17:50:54Z","published":"2021-12-13T19:25:16Z","title":"The Brain Tumor Sequence Registration (BraTS-Reg) Challenge:\n Establishing Correspondence Between Pre-Operative and Follow-up MRI Scans of\n Diffuse Glioma Patients","summary":" Registration of longitudinal brain MRI scans containing pathologies is\nchallenging due to dramatic changes in tissue appearance. Although there has\nbeen progress in developing general-purpose medical image registration\ntechniques, they have not yet attained the requisite precision and reliability\nfor this task, highlighting its inherent complexity. Here we describe the Brain\nTumor Sequence Registration (BraTS-Reg) challenge, as the first public\nbenchmark environment for deformable registration algorithms focusing on\nestimating correspondences between pre-operative and follow-up scans of the\nsame patient diagnosed with a diffuse brain glioma. The BraTS-Reg data comprise\nde-identified multi-institutional multi-parametric MRI (mpMRI) scans, curated\nfor size and resolution according to a canonical anatomical template, and\ndivided into training, validation, and testing sets. Clinical experts annotated\nground truth (GT) landmark points of anatomical locations distinct across the\ntemporal domain. Quantitative evaluation and ranking were based on the Median\nEuclidean Error (MEE), Robustness, and the determinant of the Jacobian of the\ndisplacement field. The top-ranked methodologies yielded similar performance\nacross all evaluation metrics and shared several methodological commonalities,\nincluding pre-alignment, deep neural networks, inverse consistency analysis,\nand test-time instance optimization per-case basis as a post-processing step.\nThe top-ranked method attained the MEE at or below that of the inter-rater\nvariability for approximately 60% of the evaluated landmarks, underscoring the\nscope for further accuracy and robustness improvements, especially relative to\nhuman experts. The aim of BraTS-Reg is to continue to serve as an active\nresource for research, with the data and online evaluation tools accessible at\nhttps://bratsreg.github.io/.\n","authors":["Bhakti Baheti","Satrajit Chakrabarty","Hamed Akbari","Michel Bilello","Benedikt Wiestler","Julian Schwarting","Evan Calabrese","Jeffrey Rudie","Syed Abidi","Mina Mousa","Javier Villanueva-Meyer","Brandon K. K. Fields","Florian Kofler","Russell Takeshi Shinohara","Juan Eugenio Iglesias","Tony C. W. Mok","Albert C. S. Chung","Marek Wodzinski","Artur Jurgas","Niccolo Marini","Manfredo Atzori","Henning Muller","Christoph Grobroehmer","Hanna Siebert","Lasse Hansen","Mattias P. Heinrich","Luca Canalini","Jan Klein","Annika Gerken","Stefan Heldmann","Alessa Hering","Horst K. Hahn","Mingyuan Meng","Lei Bi","Dagan Feng","Jinman Kim","Ramy A. Zeineldin","Mohamed E. 
Karar","Franziska Mathis-Ullrich","Oliver Burgert","Javid Abderezaei","Aymeric Pionteck","Agamdeep Chopra","Mehmet Kurt","Kewei Yan","Yonghong Yan","Zhe Tang","Jianqiang Ma","Sahar Almahfouz Nasser","Nikhil Cherian Kurian","Mohit Meena","Saqib Shamsi","Amit Sethi","Nicholas J. Tustison","Brian B. Avants","Philip Cook","James C. Gee","Lin Tian","Hastings Greer","Marc Niethammer","Andrew Hoopes","Malte Hoffmann","Adrian V. Dalca","Stergios Christodoulidis","Theo Estiene","Maria Vakalopoulou","Nikos Paragios","Daniel S. Marcus","Christos Davatzikos","Aristeidis Sotiras","Bjoern Menze","Spyridon Bakas","Diana Waldmannstetter"],"pdf_url":"https://arxiv.org/pdf/2112.06979v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11599v1","updated":"2024-04-17T17:50:24Z","published":"2024-04-17T17:50:24Z","title":"Variational Bayesian Last Layers","summary":" We introduce a deterministic variational formulation for training Bayesian\nlast layer neural networks. This yields a sampling-free, single-pass model and\nloss that effectively improves uncertainty estimation. Our variational Bayesian\nlast layer (VBLL) can be trained and evaluated with only quadratic complexity\nin last layer width, and is thus (nearly) computationally free to add to\nstandard architectures. We experimentally investigate VBLLs, and show that they\nimprove predictive accuracy, calibration, and out of distribution detection\nover baselines across both regression and classification. Finally, we\ninvestigate combining VBLL layers with variational Bayesian feature learning,\nyielding a lower variance collapsed variational inference method for Bayesian\nneural networks.\n","authors":["James Harrison","John Willes","Jasper Snoek"],"pdf_url":"https://arxiv.org/pdf/2404.11599v1.pdf","comment":"International Conference on Learning Representations (ICLR) 2024"},{"id":"http://arxiv.org/abs/2404.11593v1","updated":"2024-04-17T17:45:08Z","published":"2024-04-17T17:45:08Z","title":"IntrinsicAnything: Learning Diffusion Priors for Inverse Rendering Under\n Unknown Illumination","summary":" This paper aims to recover object materials from posed images captured under\nan unknown static lighting condition. Recent methods solve this task by\noptimizing material parameters through differentiable physically based\nrendering. However, due to the coupling between object geometry, materials, and\nenvironment lighting, there is inherent ambiguity during the inverse rendering\nprocess, preventing previous methods from obtaining accurate results. To\novercome this ill-posed problem, our key idea is to learn the material prior\nwith a generative model for regularizing the optimization process. We observe\nthat the general rendering equation can be split into diffuse and specular\nshading terms, and thus formulate the material prior as diffusion models of\nalbedo and specular. Thanks to this design, our model can be trained using the\nexisting abundant 3D object data, and naturally acts as a versatile tool to\nresolve the ambiguity when recovering material representations from RGB images.\nIn addition, we develop a coarse-to-fine training strategy that leverages\nestimated materials to guide diffusion models to satisfy multi-view consistent\nconstraints, leading to more stable and accurate results. Extensive experiments\non real-world and synthetic datasets demonstrate that our approach achieves\nstate-of-the-art performance on material recovery. 
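The BraTS-Reg summary above ranks methods by Median Euclidean Error and Robustness over annotated landmarks. The sketch below computes these two quantities; the robustness definition used here (fraction of landmarks whose error improves over the initial, pre-registration error) is an assumption about the convention, and the landmark arrays are toy data.

```python
# Sketch of landmark-based registration metrics named in the BraTS-Reg summary.
import numpy as np

def median_euclidean_error(pred_pts, gt_pts):
    # Median distance between warped landmarks and ground-truth landmarks.
    return float(np.median(np.linalg.norm(pred_pts - gt_pts, axis=1)))

def robustness(pred_pts, init_pts, gt_pts):
    # Assumed convention: share of landmarks whose error improves vs. no registration.
    err_pred = np.linalg.norm(pred_pts - gt_pts, axis=1)
    err_init = np.linalg.norm(init_pts - gt_pts, axis=1)
    return float(np.mean(err_pred < err_init))

gt = np.random.rand(20, 3) * 100
init = gt + np.random.randn(20, 3) * 5
pred = gt + np.random.randn(20, 3) * 2
print(median_euclidean_error(pred, gt), robustness(pred, init, gt))
```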
The code will be available\nat https://zju3dv.github.io/IntrinsicAnything.\n","authors":["Xi Chen","Sida Peng","Dongchen Yang","Yuan Liu","Bowen Pan","Chengfei Lv","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.11593v1.pdf","comment":"Project page: https://zju3dv.github.io/IntrinsicAnything"},{"id":"http://arxiv.org/abs/2312.02255v2","updated":"2024-04-17T17:44:44Z","published":"2023-12-04T18:56:08Z","title":"Re-Nerfing: Improving Novel Views Synthesis through Novel Views\n Synthesis","summary":" Neural Radiance Fields (NeRFs) have shown remarkable novel view synthesis\ncapabilities even in large-scale, unbounded scenes, albeit requiring hundreds\nof views or introducing artifacts in sparser settings. Their optimization\nsuffers from shape-radiance ambiguities wherever only a small visual overlap is\navailable. This leads to erroneous scene geometry and artifacts. In this paper,\nwe propose Re-Nerfing, a simple and general multi-stage data augmentation\napproach that leverages NeRF's own view synthesis ability to address these\nlimitations. With Re-Nerfing, we enhance the geometric consistency of novel\nviews as follows: First, we train a NeRF with the available views. Then, we use\nthe optimized NeRF to synthesize pseudo-views around the original ones with a\nview selection strategy to improve coverage and preserve view quality. Finally,\nwe train a second NeRF with both the original images and the pseudo views\nmasking out uncertain regions. Extensive experiments applying Re-Nerfing on\nvarious pipelines on the mip-NeRF 360 dataset, including Gaussian Splatting,\nprovide valuable insights into the improvements achievable without external\ndata or supervision, on denser and sparser input scenarios. Project page:\nhttps://renerfing.github.io\n","authors":["Felix Tristram","Stefano Gasperini","Nassir Navab","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2312.02255v2.pdf","comment":"Code will be released upon acceptance"},{"id":"http://arxiv.org/abs/2310.01040v3","updated":"2024-04-17T17:44:24Z","published":"2023-10-02T09:33:54Z","title":"Segmenting the motion components of a video: A long-term unsupervised\n model","summary":" Human beings have the ability to continuously analyze a video and immediately\nextract the motion components. We want to adopt this paradigm to provide a\ncoherent and stable motion segmentation over the video sequence. In this\nperspective, we propose a novel long-term spatio-temporal model operating in a\ntotally unsupervised way. It takes as input the volume of consecutive optical\nflow (OF) fields, and delivers a volume of segments of coherent motion over the\nvideo. More specifically, we have designed a transformer-based network, where\nwe leverage a mathematically well-founded framework, the Evidence Lower Bound\n(ELBO), to derive the loss function. The loss function combines a flow\nreconstruction term involving spatio-temporal parametric motion models\ncombining, in a novel way, polynomial (quadratic) motion models for the spatial\ndimensions and B-splines for the time dimension of the video sequence, and a\nregularization term enforcing temporal consistency on the segments. We report\nexperiments on four VOS benchmarks, demonstrating competitive quantitative\nresults, while performing motion segmentation on a whole sequence in one go. 
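The unsupervised motion-segmentation abstract above reconstructs optical flow with quadratic (polynomial) spatial motion models. The sketch below fits such a model to a flow field by least squares; the per-segment weighting and the temporal B-spline dimension are omitted, so this is an illustrative simplification rather than the paper's loss.

```python
# Sketch: least-squares fit of a quadratic parametric motion model to a flow field.
import numpy as np

def fit_quadratic_motion(flow):
    # flow: (H, W, 2) optical flow; returns 12 parameters (6 per flow component).
    h, w, _ = flow.shape
    y, x = np.mgrid[0:h, 0:w]
    x, y = x.ravel().astype(np.float64), y.ravel().astype(np.float64)
    A = np.stack([np.ones_like(x), x, y, x * x, x * y, y * y], axis=1)
    params_u, _, _, _ = np.linalg.lstsq(A, flow[..., 0].ravel(), rcond=None)
    params_v, _, _, _ = np.linalg.lstsq(A, flow[..., 1].ravel(), rcond=None)
    recon = np.stack([A @ params_u, A @ params_v], axis=1).reshape(h, w, 2)
    return np.concatenate([params_u, params_v]), recon

params, recon = fit_quadratic_motion(np.random.randn(64, 64, 2))
```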
We\nalso highlight through visual results the key contributions on temporal\nconsistency brought by our method.\n","authors":["Etienne Meunier","Patrick Bouthemy"],"pdf_url":"https://arxiv.org/pdf/2310.01040v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11590v1","updated":"2024-04-17T17:39:59Z","published":"2024-04-17T17:39:59Z","title":"A Subspace-Constrained Tyler's Estimator and its Applications to\n Structure from Motion","summary":" We present the subspace-constrained Tyler's estimator (STE) designed for\nrecovering a low-dimensional subspace within a dataset that may be highly\ncorrupted with outliers. STE is a fusion of the Tyler's M-estimator (TME) and a\nvariant of the fast median subspace. Our theoretical analysis suggests that,\nunder a common inlier-outlier model, STE can effectively recover the underlying\nsubspace, even when it contains a smaller fraction of inliers relative to other\nmethods in the field of robust subspace recovery. We apply STE in the context\nof Structure from Motion (SfM) in two ways: for robust estimation of the\nfundamental matrix and for the removal of outlying cameras, enhancing the\nrobustness of the SfM pipeline. Numerical experiments confirm the\nstate-of-the-art performance of our method in these applications. This research\nmakes significant contributions to the field of robust subspace recovery,\nparticularly in the context of computer vision and 3D reconstruction.\n","authors":["Feng Yu","Teng Zhang","Gilad Lerman"],"pdf_url":"https://arxiv.org/pdf/2404.11590v1.pdf","comment":"23 pages, accepted by CVPR 24"},{"id":"http://arxiv.org/abs/2404.11589v1","updated":"2024-04-17T17:38:56Z","published":"2024-04-17T17:38:56Z","title":"Prompt Optimizer of Text-to-Image Diffusion Models for Abstract Concept\n Understanding","summary":" The rapid evolution of text-to-image diffusion models has opened the door of\ngenerative AI, enabling the translation of textual descriptions into visually\ncompelling images with remarkable quality. However, a persistent challenge\nwithin this domain is the optimization of prompts to effectively convey\nabstract concepts into concrete objects. For example, text encoders can hardly\nexpress \"peace\", while can easily illustrate olive branches and white doves.\nThis paper introduces a novel approach named Prompt Optimizer for Abstract\nConcepts (POAC) specifically designed to enhance the performance of\ntext-to-image diffusion models in interpreting and generating images from\nabstract concepts. We propose a Prompt Language Model (PLM), which is\ninitialized from a pre-trained language model, and then fine-tuned with a\ncurated dataset of abstract concept prompts. The dataset is created with GPT-4\nto extend the abstract concept to a scene and concrete objects. Our framework\nemploys a Reinforcement Learning (RL)-based optimization strategy, focusing on\nthe alignment between the generated images by a stable diffusion model and\noptimized prompts. Through extensive experiments, we demonstrate that our\nproposed POAC significantly improves the accuracy and aesthetic quality of\ngenerated images, particularly in the description of abstract concepts and\nalignment with optimized prompts. 
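The STE abstract above builds on Tyler's M-estimator. For reference, the sketch below runs the classical TME fixed-point iteration on centered data; it is the plain estimator, not the subspace-constrained variant, and the tolerance and trace normalization are assumptions.

```python
# Sketch of the classical Tyler's M-estimator fixed-point iteration.
import numpy as np

def tyler_m_estimator(X, n_iter=100, tol=1e-6):
    # X: (n, d) centered data points.
    n, d = X.shape
    sigma = np.eye(d)
    for _ in range(n_iter):
        inv = np.linalg.inv(sigma)
        # Per-point weights 1 / (x_i^T Sigma^{-1} x_i).
        q = np.einsum('ij,jk,ik->i', X, inv, X)
        sigma_new = (d / n) * (X.T * (1.0 / q)) @ X
        sigma_new /= np.trace(sigma_new)  # fix the scale (TME is scale-invariant)
        if np.linalg.norm(sigma_new - sigma) < tol:
            return sigma_new
        sigma = sigma_new
    return sigma

scatter = tyler_m_estimator(np.random.randn(500, 5))
```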
We also present a comprehensive analysis of\nour model's performance across diffusion models under different settings,\nshowcasing its versatility and effectiveness in enhancing abstract concept\nrepresentation.\n","authors":["Zezhong Fan","Xiaohan Li","Chenhao Fang","Topojoy Biswas","Kaushiki Nag","Jianpeng Xu","Kannan Achan"],"pdf_url":"https://arxiv.org/pdf/2404.11589v1.pdf","comment":"WWW 2024 Companion"},{"id":"http://arxiv.org/abs/2206.10177v3","updated":"2024-04-17T17:36:19Z","published":"2022-06-21T08:16:08Z","title":"TCJA-SNN: Temporal-Channel Joint Attention for Spiking Neural Networks","summary":" Spiking Neural Networks (SNNs) are attracting widespread interest due to\ntheir biological plausibility, energy efficiency, and powerful spatio-temporal\ninformation representation ability. Given the critical role of attention\nmechanisms in enhancing neural network performance, the integration of SNNs and\nattention mechanisms exhibits potential to deliver energy-efficient and\nhigh-performance computing paradigms. We present a novel Temporal-Channel Joint\nAttention mechanism for SNNs, referred to as TCJA-SNN. The proposed TCJA-SNN\nframework can effectively assess the significance of spike sequence from both\nspatial and temporal dimensions. More specifically, our essential technical\ncontribution lies on: 1) We employ the squeeze operation to compress the spike\nstream into an average matrix. Then, we leverage two local attention mechanisms\nbased on efficient 1D convolutions to facilitate comprehensive feature\nextraction at the temporal and channel levels independently. 2) We introduce\nthe Cross Convolutional Fusion (CCF) layer as a novel approach to model the\ninter-dependencies between the temporal and channel scopes. This layer breaks\nthe independence of these two dimensions and enables the interaction between\nfeatures. Experimental results demonstrate that the proposed TCJA-SNN\noutperforms SOTA by up to 15.7% accuracy on standard static and neuromorphic\ndatasets, including Fashion-MNIST, CIFAR10-DVS, N-Caltech 101, and DVS128\nGesture. Furthermore, we apply the TCJA-SNN framework to image generation tasks\nby leveraging a variation autoencoder. To the best of our knowledge, this study\nis the first instance where the SNN-attention mechanism has been employed for\nimage classification and generation tasks. Notably, our approach has achieved\nSOTA performance in both domains, establishing a significant advancement in the\nfield. Codes are available at https://github.com/ridgerchu/TCJA.\n","authors":["Rui-Jie Zhu","Malu Zhang","Qihang Zhao","Haoyu Deng","Yule Duan","Liang-Jian Deng"],"pdf_url":"https://arxiv.org/pdf/2206.10177v3.pdf","comment":"Accepted by IEEE Transactions on Neural Networks and Learning Systems"},{"id":"http://arxiv.org/abs/2404.11576v1","updated":"2024-04-17T17:19:48Z","published":"2024-04-17T17:19:48Z","title":"State-space Decomposition Model for Video Prediction Considering\n Long-term Motion Trend","summary":" Stochastic video prediction enables the consideration of uncertainty in\nfuture motion, thereby providing a better reflection of the dynamic nature of\nthe environment. Stochastic video prediction methods based on image\nauto-regressive recurrent models need to feed their predictions back into the\nlatent space. Conversely, the state-space models, which decouple frame\nsynthesis and temporal prediction, proves to be more efficient. 
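The TCJA-SNN abstract above applies 1D convolutions along the temporal and channel axes of a squeezed spike tensor. The sketch below implements a simplified version of that attention; replacing the Cross Convolutional Fusion layer with a plain elementwise product is an assumption made for brevity, and the shapes are illustrative.

```python
# Simplified temporal- and channel-wise attention via 1D convolutions over a
# squeezed spike tensor, in the spirit of the TCJA mechanism described above.
import torch
import torch.nn as nn

class TemporalChannelAttention(nn.Module):
    def __init__(self, channels: int, timesteps: int, k: int = 3):
        super().__init__()
        self.conv_t = nn.Conv1d(channels, channels, k, padding=k // 2)   # along time
        self.conv_c = nn.Conv1d(timesteps, timesteps, k, padding=k // 2)  # along channels

    def forward(self, spikes):
        # spikes: (B, T, C, H, W); squeeze spatial dims to an average matrix (B, T, C).
        z = spikes.mean(dim=(-2, -1))
        attn_t = torch.sigmoid(self.conv_t(z.transpose(1, 2)))  # (B, C, T)
        attn_c = torch.sigmoid(self.conv_c(z))                  # (B, T, C)
        attn = attn_t.transpose(1, 2) * attn_c                  # fused gate (B, T, C)
        return spikes * attn.unsqueeze(-1).unsqueeze(-1)

out = TemporalChannelAttention(channels=64, timesteps=8)(torch.randn(2, 8, 64, 16, 16))
```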
However,\ninferring long-term temporal information about motion and generalizing to\ndynamic scenarios under non-stationary assumptions remains an unresolved\nchallenge. In this paper, we propose a state-space decomposition stochastic\nvideo prediction model that decomposes the overall video frame generation into\ndeterministic appearance prediction and stochastic motion prediction. Through\nadaptive decomposition, the model's generalization capability to dynamic\nscenarios is enhanced. In the context of motion prediction, obtaining a prior\non the long-term trend of future motion is crucial. Thus, in the stochastic\nmotion prediction branch, we infer the long-term motion trend from conditional\nframes to guide the generation of future frames that exhibit high consistency\nwith the conditional frames. Experimental results demonstrate that our model\noutperforms baselines on multiple datasets.\n","authors":["Fei Cui","Jiaojiao Fang","Xiaojiang Wu","Zelong Lai","Mengke Yang","Menghan Jia","Guizhong Liu"],"pdf_url":"https://arxiv.org/pdf/2404.11576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11569v1","updated":"2024-04-17T17:11:47Z","published":"2024-04-17T17:11:47Z","title":"Simple Image Signal Processing using Global Context Guidance","summary":" In modern smartphone cameras, the Image Signal Processor (ISP) is the core\nelement that converts the RAW readings from the sensor into perceptually\npleasant RGB images for the end users. The ISP is typically proprietary and\nhandcrafted and consists of several blocks such as white balance, color\ncorrection, and tone mapping. Deep learning-based ISPs aim to transform RAW\nimages into DSLR-like RGB images using deep neural networks. However, most\nlearned ISPs are trained using patches (small regions) due to computational\nlimitations. Such methods lack global context, which limits their efficacy on\nfull-resolution images and harms their ability to capture global properties\nsuch as color constancy or illumination. First, we propose a novel module that\ncan be integrated into any neural ISP to capture the global context information\nfrom the full RAW images. Second, we propose an efficient and simple neural ISP\nthat utilizes our proposed module. Our model achieves state-of-the-art results\non different benchmarks using diverse and real smartphone images.\n","authors":["Omar Elezabi","Marcos V. Conde","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2404.11569v1.pdf","comment":"Preprint under review"},{"id":"http://arxiv.org/abs/2404.11565v1","updated":"2024-04-17T17:08:05Z","published":"2024-04-17T17:08:05Z","title":"MoA: Mixture-of-Attention for Subject-Context Disentanglement in\n Personalized Image Generation","summary":" We introduce a new architecture for personalization of text-to-image\ndiffusion models, coined Mixture-of-Attention (MoA). Inspired by the\nMixture-of-Experts mechanism utilized in large language models (LLMs), MoA\ndistributes the generation workload between two attention pathways: a\npersonalized branch and a non-personalized prior branch. MoA is designed to\nretain the original model's prior by fixing its attention layers in the prior\nbranch, while minimally intervening in the generation process with the\npersonalized branch that learns to embed subjects in the layout and context\ngenerated by the prior branch. A novel routing mechanism manages the\ndistribution of pixels in each layer across these branches to optimize the\nblend of personalized and generic content creation. 
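The Simple ISP abstract above adds a module that injects full-image context into a patch-based neural ISP. One hedged way to realize this, sketched below, is to summarize a downsampled copy of the full RAW frame into a global vector that scales and shifts patch features (FiLM-style); the architecture details are assumptions, not the authors' module.

```python
# Hedged sketch: global-context conditioning for a patch-based neural ISP.
import torch
import torch.nn as nn

class GlobalContext(nn.Module):
    def __init__(self, raw_channels=4, feat_channels=64):
        super().__init__()
        # Tiny encoder that summarizes the downsampled full RAW frame.
        self.encoder = nn.Sequential(
            nn.Conv2d(raw_channels, 32, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(32, 64, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1))
        self.to_scale_shift = nn.Linear(64, 2 * feat_channels)

    def forward(self, patch_feats, full_raw):
        # patch_feats: (B, F, h, w); full_raw: (B, 4, H, W) downsampled full image.
        g = self.encoder(full_raw).flatten(1)                 # (B, 64) global vector
        scale, shift = self.to_scale_shift(g).chunk(2, dim=1)
        return patch_feats * (1 + scale[:, :, None, None]) + shift[:, :, None, None]

out = GlobalContext()(torch.randn(2, 64, 32, 32), torch.randn(2, 4, 128, 128))
```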
Once trained, MoA\nfacilitates the creation of high-quality, personalized images featuring\nmultiple subjects with compositions and interactions as diverse as those\ngenerated by the original model. Crucially, MoA enhances the distinction\nbetween the model's pre-existing capability and the newly augmented\npersonalized intervention, thereby offering a more disentangled subject-context\ncontrol that was previously unattainable. Project page:\nhttps://snap-research.github.io/mixture-of-attention\n","authors":[" Kuan-Chieh"," Wang","Daniil Ostashev","Yuwei Fang","Sergey Tulyakov","Kfir Aberman"],"pdf_url":"https://arxiv.org/pdf/2404.11565v1.pdf","comment":"Project Website: https://snap-research.github.io/mixture-of-attention"},{"id":"http://arxiv.org/abs/2404.11554v1","updated":"2024-04-17T16:56:31Z","published":"2024-04-17T16:56:31Z","title":"Predicting Long-horizon Futures by Conditioning on Geometry and Time","summary":" Our work explores the task of generating future sensor observations\nconditioned on the past. We are motivated by `predictive coding' concepts from\nneuroscience as well as robotic applications such as self-driving vehicles.\nPredictive video modeling is challenging because the future may be multi-modal\nand learning at scale remains computationally expensive for video processing.\nTo address both challenges, our key insight is to leverage the large-scale\npretraining of image diffusion models which can handle multi-modality. We\nrepurpose image models for video prediction by conditioning on new frame\ntimestamps. Such models can be trained with videos of both static and dynamic\nscenes. To allow them to be trained with modestly-sized datasets, we introduce\ninvariances by factoring out illumination and texture by forcing the model to\npredict (pseudo) depth, readily obtained for in-the-wild videos via\noff-the-shelf monocular depth networks. In fact, we show that simply modifying\nnetworks to predict grayscale pixels already improves the accuracy of video\nprediction. Given the extra controllability with timestamp conditioning, we\npropose sampling schedules that work better than the traditional autoregressive\nand hierarchical sampling strategies. Motivated by probabilistic metrics from\nthe object forecasting literature, we create a benchmark for video prediction\non a diverse set of videos spanning indoor and outdoor scenes and a large\nvocabulary of objects. Our experiments illustrate the effectiveness of learning\nto condition on timestamps, and show the importance of predicting the future\nwith invariant modalities.\n","authors":["Tarasha Khurana","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2404.11554v1.pdf","comment":"Project page: http://www.cs.cmu.edu/~tkhurana/depthforecasting/"},{"id":"http://arxiv.org/abs/2403.11376v4","updated":"2024-04-17T16:46:02Z","published":"2024-03-18T00:03:48Z","title":"ShapeFormer: Shape Prior Visible-to-Amodal Transformer-based Amodal\n Instance Segmentation","summary":" Amodal Instance Segmentation (AIS) presents a challenging task as it involves\npredicting both visible and occluded parts of objects within images. Existing\nAIS methods rely on a bidirectional approach, encompassing both the transition\nfrom amodal features to visible features (amodal-to-visible) and from visible\nfeatures to amodal features (visible-to-amodal). 
Our observation shows that the\nutilization of amodal features through the amodal-to-visible transition can confuse the\nvisible features, due to the extra information from occluded/hidden segments that is not\npresent in the visible display. Consequently, this compromises the quality of the visible\nfeatures during the subsequent visible-to-amodal transition. To tackle this\nissue, we introduce ShapeFormer, a decoupled Transformer-based model with a\nvisible-to-amodal transition. It facilitates the explicit relationship between\noutput segmentations and avoids the need for amodal-to-visible transitions.\nShapeFormer comprises three key modules: (i) a Visible-Occluding Mask Head for\npredicting visible segmentation with occlusion awareness, (ii) a Shape-Prior\nAmodal Mask Head for predicting amodal and occluded masks, and (iii) a\nCategory-Specific Shape Prior Retriever that provides shape prior knowledge.\nComprehensive experiments and extensive ablation studies across various AIS\nbenchmarks demonstrate the effectiveness of our ShapeFormer. The code is\navailable at: \\url{https://github.com/UARK-AICV/ShapeFormer}\n","authors":["Minh Tran","Winston Bounsavy","Khoa Vo","Anh Nguyen","Tri Nguyen","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2403.11376v4.pdf","comment":"Accepted to IJCNN2024"},{"id":"http://arxiv.org/abs/2312.03678v2","updated":"2024-04-17T16:37:44Z","published":"2023-12-06T18:41:01Z","title":"Hybrid Functional Maps for Crease-Aware Non-Isometric Shape Matching","summary":" Non-isometric shape correspondence remains a fundamental challenge in\ncomputer vision. Traditional methods using Laplace-Beltrami operator (LBO)\neigenmodes face limitations in characterizing high-frequency extrinsic shape\nchanges like bending and creases. We propose a novel approach of combining the\nnon-orthogonal extrinsic basis of eigenfunctions of the elastic thin-shell\nhessian with the intrinsic ones of the LBO, creating a hybrid spectral space in\nwhich we construct functional maps. To this end, we present a theoretical\nframework to effectively integrate non-orthogonal basis functions into\ndescriptor- and learning-based functional map methods. Our approach can be\nincorporated easily into existing functional map pipelines across varying\napplications and is able to handle complex deformations beyond isometries. We\nshow extensive evaluations across various supervised and unsupervised settings\nand demonstrate significant improvements. Notably, our approach achieves up to\n15% better mean geodesic error for non-isometric correspondence settings and up\nto 45% improvement in scenarios with topological noise.\n","authors":["Lennart Bastian","Yizheng Xie","Nassir Navab","Zorah Lähner"],"pdf_url":"https://arxiv.org/pdf/2312.03678v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.11537v1","updated":"2024-04-17T16:30:56Z","published":"2024-04-17T16:30:56Z","title":"SSDiff: Spatial-spectral Integrated Diffusion Model for Remote Sensing\n Pansharpening","summary":" Pansharpening is a significant image fusion technique that merges the spatial\ncontent and spectral characteristics of remote sensing images to generate\nhigh-resolution multispectral images. Recently, denoising diffusion\nprobabilistic models have been gradually applied to visual tasks, enhancing\ncontrollable image generation through low-rank adaptation (LoRA). 
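The SSDiff abstract above refers to LoRA-like branch-wise fine-tuning. For readers unfamiliar with low-rank adaptation, the sketch below wraps a frozen linear layer with a minimal LoRA update; the rank and scaling are assumptions, and this is not the SSDiff fine-tuning code.

```python
# Minimal LoRA-style adapter around a frozen linear layer.
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, rank: int = 4, alpha: float = 8.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():  # keep the pre-trained weights frozen
            p.requires_grad = False
        # Low-rank factors: B starts at zero so the adapted layer equals the base layer.
        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))
        self.scale = alpha / rank

    def forward(self, x):
        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)

layer = LoRALinear(nn.Linear(256, 256))
y = layer(torch.randn(8, 256))
```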
In this\npaper, we introduce a spatial-spectral integrated diffusion model for the\nremote sensing pansharpening task, called SSDiff, which considers the\npansharpening process as the fusion process of spatial and spectral components\nfrom the perspective of subspace decomposition. Specifically, SSDiff utilizes\nspatial and spectral branches to learn spatial details and spectral features\nseparately, then employs a designed alternating projection fusion module (APFM)\nto accomplish the fusion. Furthermore, we propose a frequency modulation\ninter-branch module (FMIM) to modulate the frequency distribution between\nbranches. The two components of SSDiff can perform favorably against the APFM\nwhen utilizing a LoRA-like branch-wise alternative fine-tuning method. It\nrefines SSDiff to capture component-discriminating features more sufficiently.\nFinally, extensive experiments on four commonly used datasets, i.e.,\nWorldView-3, WorldView-2, GaoFen-2, and QuickBird, demonstrate the superiority\nof SSDiff both visually and quantitatively. The code will be made open source\nafter possible acceptance.\n","authors":["Yu Zhong","Xiao Wu","Liang-Jian Deng","Zihan Cao"],"pdf_url":"https://arxiv.org/pdf/2404.11537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11525v1","updated":"2024-04-17T16:16:12Z","published":"2024-04-17T16:16:12Z","title":"JointViT: Modeling Oxygen Saturation Levels with Joint Supervision on\n Long-Tailed OCTA","summary":" The oxygen saturation level in the blood (SaO2) is crucial for health,\nparticularly in relation to sleep-related breathing disorders. However,\ncontinuous monitoring of SaO2 is time-consuming and highly variable depending\non patients' conditions. Recently, optical coherence tomography angiography\n(OCTA) has shown promising development in rapidly and effectively screening\neye-related lesions, offering the potential for diagnosing sleep-related\ndisorders. To bridge this gap, our paper presents three key contributions.\nFirstly, we propose JointViT, a novel model based on the Vision Transformer\narchitecture, incorporating a joint loss function for supervision. Secondly, we\nintroduce a balancing augmentation technique during data preprocessing to\nimprove the model's performance, particularly on the long-tail distribution\nwithin the OCTA dataset. Lastly, through comprehensive experiments on the OCTA\ndataset, our proposed method significantly outperforms other state-of-the-art\nmethods, achieving improvements of up to 12.28% in overall accuracy. This\nadvancement lays the groundwork for the future utilization of OCTA in\ndiagnosing sleep-related disorders. See project website\nhttps://steve-zeyu-zhang.github.io/JointViT\n","authors":["Zeyu Zhang","Xuyin Qi","Mingxi Chen","Guangxi Li","Ryan Pham","Ayub Zuhair","Ella Berry","Zhibin Liao","Owen Siggs","Robert Mclaughlin","Jamie Craig","Minh-Son To"],"pdf_url":"https://arxiv.org/pdf/2404.11525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05889v2","updated":"2024-04-17T16:13:22Z","published":"2023-12-10T13:44:03Z","title":"SuperPrimitive: Scene Reconstruction at a Primitive Level","summary":" Joint camera pose and dense geometry estimation from a set of images or a\nmonocular video remains a challenging problem due to its computational\ncomplexity and inherent visual ambiguities. Most dense incremental\nreconstruction systems operate directly on image pixels and solve for their 3D\npositions using multi-view geometry cues. 
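The JointViT abstract above mentions a balancing augmentation for the long-tailed OCTA distribution. The paper's exact scheme is not reproduced here; the sketch below shows one common way to counter a long tail, inverse-frequency oversampling with a weighted sampler, on toy labels.

```python
# Generic sketch of inverse-frequency oversampling for a long-tailed dataset.
import torch
from torch.utils.data import WeightedRandomSampler, TensorDataset, DataLoader
from collections import Counter

labels = [0] * 900 + [1] * 80 + [2] * 20           # long-tailed toy labels
counts = Counter(labels)
weights = [1.0 / counts[y] for y in labels]         # rare classes sampled more often
sampler = WeightedRandomSampler(weights, num_samples=len(labels), replacement=True)

dataset = TensorDataset(torch.randn(len(labels), 8), torch.tensor(labels))
loader = DataLoader(dataset, batch_size=32, sampler=sampler)
```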
Such pixel-level approaches suffer\nfrom ambiguities or violations of multi-view consistency (e.g. caused by\ntextureless or specular surfaces).\n We address this issue with a new image representation which we call a\nSuperPrimitive. SuperPrimitives are obtained by splitting images into\nsemantically correlated local regions and enhancing them with estimated surface\nnormal directions, both of which are predicted by state-of-the-art single image\nneural networks. This provides a local geometry estimate per SuperPrimitive,\nwhile their relative positions are adjusted based on multi-view observations.\n We demonstrate the versatility of our new representation by addressing three\n3D reconstruction tasks: depth completion, few-view structure from motion, and\nmonocular dense visual odometry.\n","authors":["Kirill Mazur","Gwangbin Bae","Andrew J. Davison"],"pdf_url":"https://arxiv.org/pdf/2312.05889v2.pdf","comment":"CVPR2024. Project Page: https://makezur.github.io/SuperPrimitive/"},{"id":"http://arxiv.org/abs/2404.11511v1","updated":"2024-04-17T16:06:29Z","published":"2024-04-17T16:06:29Z","title":"Event Cameras Meet SPADs for High-Speed, Low-Bandwidth Imaging","summary":" Traditional cameras face a trade-off between low-light performance and\nhigh-speed imaging: longer exposure times to capture sufficient light results\nin motion blur, whereas shorter exposures result in Poisson-corrupted noisy\nimages. While burst photography techniques help mitigate this tradeoff,\nconventional cameras are fundamentally limited in their sensor noise\ncharacteristics. Event cameras and single-photon avalanche diode (SPAD) sensors\nhave emerged as promising alternatives to conventional cameras due to their\ndesirable properties. SPADs are capable of single-photon sensitivity with\nmicrosecond temporal resolution, and event cameras can measure brightness\nchanges up to 1 MHz with low bandwidth requirements. We show that these\nproperties are complementary, and can help achieve low-light, high-speed image\nreconstruction with low bandwidth requirements. We introduce a sensor fusion\nframework to combine SPADs with event cameras to improves the reconstruction of\nhigh-speed, low-light scenes while reducing the high bandwidth cost associated\nwith using every SPAD frame. Our evaluation, on both synthetic and real sensor\ndata, demonstrates significant enhancements ( > 5 dB PSNR) in reconstructing\nlow-light scenes at high temporal resolution (100 kHz) compared to conventional\ncameras. Event-SPAD fusion shows great promise for real-world applications,\nsuch as robotics or medical imaging.\n","authors":["Manasi Muglikar","Siddharth Somasundaram","Akshat Dave","Edoardo Charbon","Ramesh Raskar","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2404.11511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06567v3","updated":"2024-04-17T15:58:36Z","published":"2024-03-11T10:06:45Z","title":"Leveraging Foundation Models for Content-Based Medical Image Retrieval\n in Radiology","summary":" Content-based image retrieval (CBIR) has the potential to significantly\nimprove diagnostic aid and medical research in radiology. Current CBIR systems\nface limitations due to their specialization to certain pathologies, limiting\ntheir utility. In response, we propose using vision foundation models as\npowerful and versatile off-the-shelf feature extractors for content-based\nmedical image retrieval. 
By benchmarking these models on a comprehensive\ndataset of 1.6 million 2D radiological images spanning four modalities and 161\npathologies, we identify weakly-supervised models as superior, achieving a P@1\nof up to 0.594. This performance not only competes with a specialized model but\ndoes so without the need for fine-tuning. Our analysis further explores the\nchallenges in retrieving pathological versus anatomical structures, indicating\nthat accurate retrieval of pathological features presents greater difficulty.\nDespite these challenges, our research underscores the vast potential of\nfoundation models for CBIR in radiology, proposing a shift towards versatile,\ngeneral-purpose medical image retrieval systems that do not require specific\ntuning.\n","authors":["Stefan Denner","David Zimmerer","Dimitrios Bounias","Markus Bujotzek","Shuhan Xiao","Lisa Kausch","Philipp Schader","Tobias Penzkofer","Paul F. Jäger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2403.06567v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11492v1","updated":"2024-04-17T15:47:26Z","published":"2024-04-17T15:47:26Z","title":"arcjetCV: an open-source software to analyze material ablation","summary":" arcjetCV is an open-source Python software designed to automate time-resolved\nmeasurements of heatshield material recession and recession rates from arcjet\ntest video footage. This new automated and accessible capability greatly\nexceeds previous manual extraction methods, enabling rapid and detailed\ncharacterization of material recession for any sample with a profile video.\narcjetCV automates the video segmentation process using machine learning\nmodels, including a one-dimensional (1D) Convolutional Neural Network (CNN) to\ninfer the time-window of interest, a two-dimensional (2D) CNN for image and\nedge segmentation, and a Local Outlier Factor (LOF) for outlier filtering. A\ngraphical user interface (GUI) simplifies the user experience and an\napplication programming interface (API) allows users to call the core functions\nfrom scripts, enabling video batch processing. arcjetCV's capability to measure\ntime-resolved recession in turn enables characterization of non-linear\nprocesses (shrinkage, swelling, melt flows, etc.), contributing to higher\nfidelity validation and improved modeling of heatshield material performance.\nThe source code associated with this article can be found at\nhttps://github.com/magnus-haw/arcjetCV.\n","authors":["Alexandre Quintart","Magnus Haw","Federico Semeraro"],"pdf_url":"https://arxiv.org/pdf/2404.11492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11488v1","updated":"2024-04-17T15:45:49Z","published":"2024-04-17T15:45:49Z","title":"Multi-resolution Rescored ByteTrack for Video Object Detection on\n Ultra-low-power Embedded Systems","summary":" This paper introduces Multi-Resolution Rescored Byte-Track (MR2-ByteTrack), a\nnovel video object detection framework for ultra-low-power embedded processors.\nThis method reduces the average compute load of an off-the-shelf Deep Neural\nNetwork (DNN) based object detector by up to 2.25$\\times$ by alternating the\nprocessing of high-resolution images (320$\\times$320 pixels) with multiple\ndown-sized frames (192$\\times$192 pixels). To tackle the accuracy degradation\ndue to the reduced image input size, MR2-ByteTrack correlates the output\ndetections over time using the ByteTrack tracker and corrects potential\nmisclassification using a novel probabilistic Rescore algorithm. 
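The CBIR study above reports retrieval quality as P@1 over off-the-shelf foundation-model embeddings. The sketch below computes precision-at-1 with cosine similarity; random vectors stand in for real image features, and the label setup is a toy assumption.

```python
# Sketch of the precision-at-1 (P@1) retrieval metric over image embeddings.
import numpy as np

def precision_at_1(embeddings, labels):
    x = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    sim = x @ x.T                      # cosine similarity matrix
    np.fill_diagonal(sim, -np.inf)     # exclude the query itself
    nearest = sim.argmax(axis=1)       # index of the top-1 retrieved item
    return float(np.mean(labels[nearest] == labels))

emb = np.random.randn(200, 512)
lbl = np.random.randint(0, 10, size=200)
print(precision_at_1(emb, lbl))
```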
By\ninterleaving two down-sized images for every high-resolution one as the input\nof different state-of-the-art DNN object detectors with our MR2-ByteTrack, we\ndemonstrate an average accuracy increase of 2.16% and a latency reduction of\n43% on the GAP9 microcontroller compared to a baseline frame-by-frame inference\nscheme using exclusively full-resolution images. Code available at:\nhttps://github.com/Bomps4/Multi_Resolution_Rescored_ByteTrack\n","authors":["Luca Bompani","Manuele Rusci","Daniele Palossi","Francesco Conti","Luca Benini"],"pdf_url":"https://arxiv.org/pdf/2404.11488v1.pdf","comment":"9 pages, 3 figures Accepted for publication at the Embedded Vision\n Workshop of the Computer Vision and Pattern Recognition conference, Seattle,\n 2024"},{"id":"http://arxiv.org/abs/2404.11475v1","updated":"2024-04-17T15:31:06Z","published":"2024-04-17T15:31:06Z","title":"AdaIR: Exploiting Underlying Similarities of Image Restoration Tasks\n with Adapters","summary":" Existing image restoration approaches typically employ extensive networks\nspecifically trained for designated degradations. Despite being effective, such\nmethods inevitably entail considerable storage costs and computational\noverheads due to the reliance on task-specific networks. In this work, we go\nbeyond this well-established framework and exploit the inherent commonalities\namong image restoration tasks. The primary objective is to identify components\nthat are shareable across restoration tasks and augment the shared components\nwith modules specifically trained for individual tasks. Towards this goal, we\npropose AdaIR, a novel framework that enables low storage cost and efficient\ntraining without sacrificing performance. Specifically, a generic restoration\nnetwork is first constructed through self-supervised pre-training using\nsynthetic degradations. Subsequent to the pre-training phase, adapters are\ntrained to adapt the pre-trained network to specific degradations. AdaIR\nrequires solely the training of lightweight, task-specific modules, ensuring a\nmore efficient storage and training regimen. We have conducted extensive\nexperiments to validate the effectiveness of AdaIR and analyze the influence of\nthe pre-training strategy on discovering shareable components. Extensive\nexperimental results show that AdaIR achieves outstanding results on multi-task\nrestoration while utilizing significantly fewer parameters (1.9 MB) and less\ntraining time (7 hours) for each restoration task. The source codes and trained\nmodels will be released.\n","authors":["Hao-Wei Chen","Yu-Syuan Xu","Kelvin C. K. Chan","Hsien-Kai Kuo","Chun-Yi Lee","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.11475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11474v1","updated":"2024-04-17T15:28:53Z","published":"2024-04-17T15:28:53Z","title":"Towards Highly Realistic Artistic Style Transfer via Stable Diffusion\n with Step-aware and Layer-aware Prompt","summary":" Artistic style transfer aims to transfer the learned artistic style onto an\narbitrary content image, generating artistic stylized images. Existing\ngenerative adversarial network-based methods fail to generate highly realistic\nstylized images and always introduce obvious artifacts and disharmonious\npatterns. Recently, large-scale pre-trained diffusion models opened up a new\nway for generating highly realistic artistic stylized images. 
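The AdaIR abstract above specializes a shared pre-trained restoration network with lightweight task-specific adapters. The sketch below is a generic residual bottleneck adapter of that kind; the reduction ratio, placement, and zero-initialization are illustrative assumptions rather than the AdaIR configuration.

```python
# Generic residual bottleneck adapter for specializing a frozen backbone.
import torch
import torch.nn as nn

class ConvAdapter(nn.Module):
    def __init__(self, channels: int, reduction: int = 8):
        super().__init__()
        hidden = max(channels // reduction, 4)
        self.down = nn.Conv2d(channels, hidden, kernel_size=1)
        self.act = nn.GELU()
        self.up = nn.Conv2d(hidden, channels, kernel_size=1)
        # Zero-init the up-projection so the adapter starts as an identity,
        # preserving the pre-trained behavior before fine-tuning.
        nn.init.zeros_(self.up.weight)
        nn.init.zeros_(self.up.bias)

    def forward(self, x):
        return x + self.up(self.act(self.down(x)))

out = ConvAdapter(64)(torch.randn(1, 64, 32, 32))
```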
However,\ndiffusion model-based methods generally fail to preserve the content structure\nof input content images well, introducing some undesired content structure and\nstyle patterns. To address the above problems, we propose a novel pre-trained\ndiffusion-based artistic style transfer method, called LSAST, which can\ngenerate highly realistic artistic stylized images while preserving the content\nstructure of input content images well, without bringing obvious artifacts and\ndisharmonious style patterns. Specifically, we introduce a Step-aware and\nLayer-aware Prompt Space, a set of learnable prompts, which can learn the style\ninformation from the collection of artworks and dynamically adjusts the input\nimages' content structure and style pattern. To train our prompt space, we\npropose a novel inversion method, called Step-ware and Layer-aware Prompt\nInversion, which allows the prompt space to learn the style information of the\nartworks collection. In addition, we inject a pre-trained conditional branch of\nControlNet into our LSAST, which further improved our framework's ability to\nmaintain content structure. Extensive experiments demonstrate that our proposed\nmethod can generate more highly realistic artistic stylized images than the\nstate-of-the-art artistic style transfer methods.\n","authors":["Zhanjie Zhang","Quanwei Zhang","Huaizhong Lin","Wei Xing","Juncheng Mo","Shuaicheng Huang","Jinheng Xie","Guangyuan Li","Junsheng Luan","Lei Zhao","Dalong Zhang","Lixia Chen"],"pdf_url":"https://arxiv.org/pdf/2404.11474v1.pdf","comment":"Accepted by IJCAI2024"},{"id":"http://arxiv.org/abs/2303.12054v5","updated":"2024-04-17T15:12:29Z","published":"2023-03-21T17:45:38Z","title":"Influencer Backdoor Attack on Semantic Segmentation","summary":" When a small number of poisoned samples are injected into the training\ndataset of a deep neural network, the network can be induced to exhibit\nmalicious behavior during inferences, which poses potential threats to\nreal-world applications. While they have been intensively studied in\nclassification, backdoor attacks on semantic segmentation have been largely\noverlooked. Unlike classification, semantic segmentation aims to classify every\npixel within a given image. In this work, we explore backdoor attacks on\nsegmentation models to misclassify all pixels of a victim class by injecting a\nspecific trigger on non-victim pixels during inferences, which is dubbed\nInfluencer Backdoor Attack (IBA). IBA is expected to maintain the\nclassification accuracy of non-victim pixels and mislead classifications of all\nvictim pixels in every single inference and could be easily applied to\nreal-world scenes. Based on the context aggregation ability of segmentation\nmodels, we proposed a simple, yet effective, Nearest-Neighbor trigger injection\nstrategy. We also introduce an innovative Pixel Random Labeling strategy which\nmaintains optimal performance even when the trigger is placed far from the\nvictim pixels. 
Our extensive experiments reveal that current segmentation\nmodels do suffer from backdoor attacks, demonstrate IBA real-world\napplicability, and show that our proposed techniques can further increase\nattack performance.\n","authors":["Haoheng Lan","Jindong Gu","Philip Torr","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2303.12054v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11461v1","updated":"2024-04-17T15:09:31Z","published":"2024-04-17T15:09:31Z","title":"Using Game Engines and Machine Learning to Create Synthetic Satellite\n Imagery for a Tabletop Verification Exercise","summary":" Satellite imagery is regarded as a great opportunity for citizen-based\nmonitoring of activities of interest. Relevant imagery may however not be\navailable at sufficiently high resolution, quality, or cadence -- let alone be\nuniformly accessible to open-source analysts. This limits an assessment of the\ntrue long-term potential of citizen-based monitoring of nuclear activities\nusing publicly available satellite imagery. In this article, we demonstrate how\nmodern game engines combined with advanced machine-learning techniques can be\nused to generate synthetic imagery of sites of interest with the ability to\nchoose relevant parameters upon request; these include time of day, cloud\ncover, season, or level of activity onsite. At the same time, resolution and\noff-nadir angle can be adjusted to simulate different characteristics of the\nsatellite. While there are several possible use-cases for synthetic imagery,\nhere we focus on its usefulness to support tabletop exercises in which simple\nmonitoring scenarios can be examined to better understand verification\ncapabilities enabled by new satellite constellations and very short revisit\ntimes.\n","authors":["Johannes Hoster","Sara Al-Sayed","Felix Biessmann","Alexander Glaser","Kristian Hildebrand","Igor Moric","Tuong Vy Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.11461v1.pdf","comment":"Annual Meeting of the Institute of Nuclear Materials Management\n (INMM), Vienna"},{"id":"http://arxiv.org/abs/2404.11459v1","updated":"2024-04-17T15:07:06Z","published":"2024-04-17T15:07:06Z","title":"Octopus v3: Technical Report for On-device Sub-billion Multimodal AI\n Agent","summary":" A multimodal AI agent is characterized by its ability to process and learn\nfrom various types of data, including natural language, visual, and audio\ninputs, to inform its actions. Despite advancements in large language models\nthat incorporate visual data, such as GPT-4V, effectively translating\nimage-based data into actionable outcomes for AI agents continues to be\nchallenging. In this paper, we introduce a multimodal model that incorporates\nthe concept of functional token specifically designed for AI agent\napplications. To ensure compatibility with edge devices, our model is optimized\nto a compact size of less than 1B parameters. Like GPT-4, our model can process\nboth English and Chinese. 
We demonstrate that this model is capable of\noperating efficiently on a wide range of edge devices, including as constrained\nas a Raspberry Pi.\n","authors":["Wei Chen","Zhiyuan Li"],"pdf_url":"https://arxiv.org/pdf/2404.11459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.05309v2","updated":"2024-04-17T15:04:14Z","published":"2023-02-10T15:12:40Z","title":"The LuViRA Dataset: Measurement Description","summary":" We present a dataset to evaluate localization algorithms, which utilizes\nvision, audio, and radio sensors: the Lund University Vision, Radio, and Audio\n(LuViRA) Dataset. The dataset includes RGB images, corresponding depth maps,\nIMU readings, channel response between a massive MIMO channel sounder and a\nuser equipment, audio recorded by 12 microphones, and 0.5 mm accurate 6DoF pose\nground truth. We synchronize these sensors to make sure that all data are\nrecorded simultaneously. A camera, speaker, and transmit antenna are placed on\ntop of a slowly moving service robot and 88 trajectories are recorded. Each\ntrajectory includes 20 to 50 seconds of recorded sensor data and ground truth\nlabels. The data from different sensors can be used separately or jointly to\nconduct localization tasks and a motion capture system is used to verify the\nresults obtained by the localization algorithms. The main aim of this dataset\nis to enable research on fusing the most commonly used sensors for localization\ntasks. However, the full dataset or some parts of it can also be used for other\nresearch areas such as channel estimation, image classification, etc. Fusing\nsensor data can lead to increased localization accuracy and reliability, as\nwell as decreased latency and power consumption. The created dataset will be\nmade public at a later date.\n","authors":["Ilayda Yaman","Guoda Tian","Martin Larsson","Patrik Persson","Michiel Sandra","Alexander Dürr","Erik Tegler","Nikhil Challa","Henrik Garde","Fredrik Tufvesson","Kalle Åström","Ove Edfors","Steffen Malkowsky","Liang Liu"],"pdf_url":"https://arxiv.org/pdf/2302.05309v2.pdf","comment":"7 pages, 7 figures, Accepted to ICRA 2024"},{"id":"http://arxiv.org/abs/2305.15964v5","updated":"2024-04-17T15:01:39Z","published":"2023-05-25T12:03:31Z","title":"ChatCAD+: Towards a Universal and Reliable Interactive CAD using LLMs","summary":" The integration of Computer-Aided Diagnosis (CAD) with Large Language Models\n(LLMs) presents a promising frontier in clinical applications, notably in\nautomating diagnostic processes akin to those performed by radiologists and\nproviding consultations similar to a virtual family doctor. Despite the\npromising potential of this integration, current works face at least two\nlimitations: (1) From the perspective of a radiologist, existing studies\ntypically have a restricted scope of applicable imaging domains, failing to\nmeet the diagnostic needs of different patients. Also, the insufficient\ndiagnostic capability of LLMs further undermine the quality and reliability of\nthe generated medical reports. (2) Current LLMs lack the requisite depth in\nmedical expertise, rendering them less effective as virtual family doctors due\nto the potential unreliability of the advice provided during patient\nconsultations. To address these limitations, we introduce ChatCAD+, to be\nuniversal and reliable. Specifically, it is featured by two main modules: (1)\nReliable Report Generation and (2) Reliable Interaction. 
The Reliable Report\nGeneration module is capable of interpreting medical images from diverse\ndomains and generate high-quality medical reports via our proposed hierarchical\nin-context learning. Concurrently, the interaction module leverages up-to-date\ninformation from reputable medical websites to provide reliable medical advice.\nTogether, these designed modules synergize to closely align with the expertise\nof human medical professionals, offering enhanced consistency and reliability\nfor interpretation and advice. The source code is available at\nhttps://github.com/zhaozh10/ChatCAD.\n","authors":["Zihao Zhao","Sheng Wang","Jinchen Gu","Yitao Zhu","Lanzhuju Mei","Zixu Zhuang","Zhiming Cui","Qian Wang","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2305.15964v5.pdf","comment":"Authors Zihao Zhao, Sheng Wang, Jinchen Gu, Yitao Zhu contributed\n equally to this work and should be considered co-first authors"},{"id":"http://arxiv.org/abs/2403.18807v4","updated":"2024-04-17T14:59:51Z","published":"2024-03-27T17:53:30Z","title":"ECoDepth: Effective Conditioning of Diffusion Models for Monocular Depth\n Estimation","summary":" In the absence of parallax cues, a learning-based single image depth\nestimation (SIDE) model relies heavily on shading and contextual cues in the\nimage. While this simplicity is attractive, it is necessary to train such\nmodels on large and varied datasets, which are difficult to capture. It has\nbeen shown that using embeddings from pre-trained foundational models, such as\nCLIP, improves zero shot transfer in several applications. Taking inspiration\nfrom this, in our paper we explore the use of global image priors generated\nfrom a pre-trained ViT model to provide more detailed contextual information.\nWe argue that the embedding vector from a ViT model, pre-trained on a large\ndataset, captures greater relevant information for SIDE than the usual route of\ngenerating pseudo image captions, followed by CLIP based text embeddings. Based\non this idea, we propose a new SIDE model using a diffusion backbone which is\nconditioned on ViT embeddings. Our proposed design establishes a new\nstate-of-the-art (SOTA) for SIDE on NYUv2 dataset, achieving Abs Rel error of\n0.059 (14% improvement) compared to 0.069 by the current SOTA (VPD). And on\nKITTI dataset, achieving Sq Rel error of 0.139 (2% improvement) compared to\n0.142 by the current SOTA (GEDepth). For zero-shot transfer with a model\ntrained on NYUv2, we report mean relative improvement of (20%, 23%, 81%, 25%)\nover NeWCRFs on (Sun-RGBD, iBims1, DIODE, HyperSim) datasets, compared to (16%,\n18%, 45%, 9%) by ZoeDepth. The project page is available at\nhttps://ecodepth-iitd.github.io\n","authors":["Suraj Patni","Aradhye Agarwal","Chetan Arora"],"pdf_url":"https://arxiv.org/pdf/2403.18807v4.pdf","comment":"IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)\n 2024"},{"id":"http://arxiv.org/abs/2402.00186v2","updated":"2024-04-17T14:54:56Z","published":"2024-01-31T21:28:40Z","title":"Distance and Collision Probability Estimation from Gaussian Surface\n Models","summary":" This paper describes continuous-space methodologies to estimate the collision\nprobability, Euclidean distance and gradient between an ellipsoidal robot model\nand an environment surface modeled as a set of Gaussian distributions.\nContinuous-space collision probability estimation is critical for\nuncertainty-aware motion planning. 
Most collision detection and avoidance\napproaches assume the robot is modeled as a sphere, but ellipsoidal\nrepresentations provide tighter approximations and enable navigation in\ncluttered and narrow spaces. State-of-the-art methods derive the Euclidean\ndistance and gradient by processing raw point clouds, which is computationally\nexpensive for large workspaces. Recent advances in Gaussian surface modeling\n(e.g. mixture models, splatting) enable compressed and high-fidelity surface\nrepresentations. Few methods exist to estimate continuous-space occupancy from\nsuch models. They require Gaussians to model free space and are unable to\nestimate the collision probability, Euclidean distance and gradient for an\nellipsoidal robot. The proposed methods bridge this gap by extending prior work\nin ellipsoid-to-ellipsoid Euclidean distance and collision probability\nestimation to Gaussian surface models. A geometric blending approach is also\nproposed to improve collision probability estimation. The approaches are\nevaluated with numerical 2D and 3D experiments using real-world point cloud\ndata. Methods for efficient calculation of these quantities are demonstrated to\nexecute within a few microseconds per ellipsoid pair using a single-thread on\nlow-power CPUs of modern embedded computers\n","authors":["Kshitij Goel","Wennie Tabib"],"pdf_url":"https://arxiv.org/pdf/2402.00186v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11429v1","updated":"2024-04-17T14:34:56Z","published":"2024-04-17T14:34:56Z","title":"CarcassFormer: An End-to-end Transformer-based Framework for\n Simultaneous Localization, Segmentation and Classification of Poultry Carcass\n Defect","summary":" In the food industry, assessing the quality of poultry carcasses during\nprocessing is a crucial step. This study proposes an effective approach for\nautomating the assessment of carcass quality without requiring skilled labor or\ninspector involvement. The proposed system is based on machine learning (ML)\nand computer vision (CV) techniques, enabling automated defect detection and\ncarcass quality assessment. To this end, an end-to-end framework called\nCarcassFormer is introduced. It is built upon a Transformer-based architecture\ndesigned to effectively extract visual representations while simultaneously\ndetecting, segmenting, and classifying poultry carcass defects. Our proposed\nframework is capable of analyzing imperfections resulting from production and\ntransport welfare issues, as well as processing plant stunner, scalder, picker,\nand other equipment malfunctions. To benchmark the framework, a dataset of\n7,321 images was initially acquired, which contained both single and multiple\ncarcasses per image. In this study, the performance of the CarcassFormer system\nis compared with other state-of-the-art (SOTA) approaches for both\nclassification, detection, and segmentation tasks. Through extensive\nquantitative experiments, our framework consistently outperforms existing\nmethods, demonstrating remarkable improvements across various evaluation\nmetrics such as AP, AP@50, and AP@75. Furthermore, the qualitative results\nhighlight the strengths of CarcassFormer in capturing fine details, including\nfeathers, and accurately localizing and segmenting carcasses with high\nprecision. 
To facilitate further research and collaboration, the pre-trained\nmodel and source code of CarcassFormer is available for research purposes at:\n\\url{https://github.com/UARK-AICV/CarcassFormer}.\n","authors":["Minh Tran","Sang Truong","Arthur F. A. Fernandes","Michael T. Kidd","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2404.11429v1.pdf","comment":"Accepted to Poultry Science Journal"},{"id":"http://arxiv.org/abs/2404.11428v1","updated":"2024-04-17T14:34:35Z","published":"2024-04-17T14:34:35Z","title":"Explainable Lung Disease Classification from Chest X-Ray Images\n Utilizing Deep Learning and XAI","summary":" Lung diseases remain a critical global health concern, and it's crucial to\nhave accurate and quick ways to diagnose them. This work focuses on classifying\ndifferent lung diseases into five groups: viral pneumonia, bacterial pneumonia,\nCOVID, tuberculosis, and normal lungs. Employing advanced deep learning\ntechniques, we explore a diverse range of models including CNN, hybrid models,\nensembles, transformers, and Big Transfer. The research encompasses\ncomprehensive methodologies such as hyperparameter tuning, stratified k-fold\ncross-validation, and transfer learning with fine-tuning.Remarkably, our\nfindings reveal that the Xception model, fine-tuned through 5-fold\ncross-validation, achieves the highest accuracy of 96.21\\%. This success shows\nthat our methods work well in accurately identifying different lung diseases.\nThe exploration of explainable artificial intelligence (XAI) methodologies\nfurther enhances our understanding of the decision-making processes employed by\nthese models, contributing to increased trust in their clinical applications.\n","authors":["Tanzina Taher Ifty","Saleh Ahmed Shafin","Shoeb Mohammad Shahriar","Tashfia Towhid"],"pdf_url":"https://arxiv.org/pdf/2404.11428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11426v1","updated":"2024-04-17T14:33:41Z","published":"2024-04-17T14:33:41Z","title":"SPAMming Labels: Efficient Annotations for the Trackers of Tomorrow","summary":" Increasing the annotation efficiency of trajectory annotations from videos\nhas the potential to enable the next generation of data-hungry tracking\nalgorithms to thrive on large-scale datasets. Despite the importance of this\ntask, there are currently very few works exploring how to efficiently label\ntracking datasets comprehensively. In this work, we introduce SPAM, a tracking\ndata engine that provides high-quality labels with minimal human intervention.\nSPAM is built around two key insights: i) most tracking scenarios can be easily\nresolved. To take advantage of this, we utilize a pre-trained model to generate\nhigh-quality pseudo-labels, reserving human involvement for a smaller subset of\nmore difficult instances; ii) handling the spatiotemporal dependencies of track\nannotations across time can be elegantly and efficiently formulated through\ngraphs. Therefore, we use a unified graph formulation to address the annotation\nof both detections and identity association for tracks across time. Based on\nthese insights, SPAM produces high-quality annotations with a fraction of\nground truth labeling cost. We demonstrate that trackers trained on SPAM labels\nachieve comparable performance to those trained on human annotations while\nrequiring only 3-20% of the human labeling effort. Hence, SPAM paves the way\ntowards highly efficient labeling of large-scale tracking datasets. 
Our code\nand models will be available upon acceptance.\n","authors":["Orcun Cetintas","Tim Meinhardt","Guillem Brasó","Laura Leal-Taixé"],"pdf_url":"https://arxiv.org/pdf/2404.11426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11419v1","updated":"2024-04-17T14:23:28Z","published":"2024-04-17T14:23:28Z","title":"SLAIM: Robust Dense Neural SLAM for Online Tracking and Mapping","summary":" We present SLAIM - Simultaneous Localization and Implicit Mapping. We propose\na novel coarse-to-fine tracking model tailored for Neural Radiance Field SLAM\n(NeRF-SLAM) to achieve state-of-the-art tracking performance. Notably, existing\nNeRF-SLAM systems consistently exhibit inferior tracking performance compared\nto traditional SLAM algorithms. NeRF-SLAM methods solve camera tracking via\nimage alignment and photometric bundle-adjustment. Such optimization processes\nare difficult to optimize due to the narrow basin of attraction of the\noptimization loss in image space (local minima) and the lack of initial\ncorrespondences. We mitigate these limitations by implementing a Gaussian\npyramid filter on top of NeRF, facilitating a coarse-to-fine tracking\noptimization strategy. Furthermore, NeRF systems encounter challenges in\nconverging to the right geometry with limited input views. While prior\napproaches use a Signed-Distance Function (SDF)-based NeRF and directly\nsupervise SDF values by approximating ground truth SDF through depth\nmeasurements, this often results in suboptimal geometry. In contrast, our\nmethod employs a volume density representation and introduces a novel KL\nregularizer on the ray termination distribution, constraining scene geometry to\nconsist of empty space and opaque surfaces. Our solution implements both local\nand global bundle-adjustment to produce a robust (coarse-to-fine) and accurate\n(KL regularizer) SLAM solution. We conduct experiments on multiple datasets\n(ScanNet, TUM, Replica) showing state-of-the-art results in tracking and in\nreconstruction accuracy.\n","authors":["Vincent Cartillier","Grant Schindler","Irfan Essa"],"pdf_url":"https://arxiv.org/pdf/2404.11419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11416v1","updated":"2024-04-17T14:17:05Z","published":"2024-04-17T14:17:05Z","title":"Neural Shrödinger Bridge Matching for Pansharpening","summary":" Recent diffusion probabilistic models (DPM) in the field of pansharpening\nhave been gradually gaining attention and have achieved state-of-the-art (SOTA)\nperformance. In this paper, we identify shortcomings in directly applying DPMs\nto the task of pansharpening as an inverse problem: 1) initiating sampling\ndirectly from Gaussian noise neglects the low-resolution multispectral image\n(LRMS) as a prior; 2) low sampling efficiency often necessitates a higher\nnumber of sampling steps. We first reformulate pansharpening into the\nstochastic differential equation (SDE) form of an inverse problem. Building\nupon this, we propose a Schr\\\"odinger bridge matching method that addresses\nboth issues.\n We design an efficient deep neural network architecture tailored for the\nproposed SB matching.\n In comparison to the well-established DL-regressive-based framework and the\nrecent DPM framework, our method demonstrates SOTA performance with fewer\nsampling steps. 
Moreover, we discuss the relationship between SB matching and\nother methods based on SDEs and ordinary differential equations (ODEs), as well\nas its connection with optimal transport.\n Code will be available.\n","authors":["Zihan Cao","Xiao Wu","Liang-Jian Deng"],"pdf_url":"https://arxiv.org/pdf/2404.11416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11401v1","updated":"2024-04-17T14:07:22Z","published":"2024-04-17T14:07:22Z","title":"RainyScape: Unsupervised Rainy Scene Reconstruction using Decoupled\n Neural Rendering","summary":" We propose RainyScape, an unsupervised framework for reconstructing clean\nscenes from a collection of multi-view rainy images. RainyScape consists of two\nmain modules: a neural rendering module and a rain-prediction module that\nincorporates a predictor network and a learnable latent embedding that captures\nthe rain characteristics of the scene. Specifically, based on the spectral bias\nproperty of neural networks, we first optimize the neural rendering pipeline to\nobtain a low-frequency scene representation. Subsequently, we jointly optimize\nthe two modules, driven by the proposed adaptive direction-sensitive\ngradient-based reconstruction loss, which encourages the network to distinguish\nbetween scene details and rain streaks, facilitating the propagation of\ngradients to the relevant components. Extensive experiments on both the classic\nneural radiance field and the recently proposed 3D Gaussian splatting\ndemonstrate the superiority of our method in effectively eliminating rain\nstreaks and rendering clean images, achieving state-of-the-art performance. The\nconstructed high-quality dataset and source code will be publicly available.\n","authors":["Xianqiang Lyu","Hui Liu","Junhui Hou"],"pdf_url":"https://arxiv.org/pdf/2404.11401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16749v3","updated":"2024-04-17T14:06:28Z","published":"2024-02-26T17:11:11Z","title":"MISC: Ultra-low Bitrate Image Semantic Compression Driven by Large\n Multimodal Model","summary":" With the evolution of storage and communication protocols, ultra-low bitrate\nimage compression has become a highly demanding topic. However, existing\ncompression algorithms must sacrifice either consistency with the ground truth\nor perceptual quality at ultra-low bitrate. In recent years, the rapid\ndevelopment of the Large Multimodal Model (LMM) has made it possible to balance\nthese two goals. To solve this problem, this paper proposes a method called\nMultimodal Image Semantic Compression (MISC), which consists of an LMM encoder\nfor extracting the semantic information of the image, a map encoder to locate\nthe region corresponding to the semantic, an image encoder generates an\nextremely compressed bitstream, and a decoder reconstructs the image based on\nthe above information. Experimental results show that our proposed MISC is\nsuitable for compressing both traditional Natural Sense Images (NSIs) and\nemerging AI-Generated Images (AIGIs) content. It can achieve optimal\nconsistency and perception results while saving 50% bitrate, which has strong\npotential applications in the next generation of storage and communication. 
The\ncode will be released on https://github.com/lcysyzxdxc/MISC.\n","authors":["Chunyi Li","Guo Lu","Donghui Feng","Haoning Wu","Zicheng Zhang","Xiaohong Liu","Guangtao Zhai","Weisi Lin","Wenjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.16749v3.pdf","comment":"13 page, 11 figures, 4 tables"},{"id":"http://arxiv.org/abs/2312.06722v2","updated":"2024-04-17T13:56:06Z","published":"2023-12-11T03:35:58Z","title":"EgoPlan-Bench: Benchmarking Egocentric Embodied Planning with Multimodal\n Large Language Models","summary":" Multimodal Large Language Models, combining the remarkable reasoning and\ngeneralization capabilities of Large Language Models (LLMs) with the ability to\ncomprehend visual inputs, have opened up new avenues for embodied task\nplanning. Given diverse environmental inputs, including real-time task\nprogress, visual observations, and open-form language instructions, a\nproficient task planner is expected to predict feasible actions, which is a\nfeat inherently achievable by Multimodal Large Language Models (MLLMs). In this\npaper, we aim to quantitatively investigate the potential of MLLMs as embodied\ntask planners in real-world scenarios by introducing a benchmark with human\nannotations named EgoPlan-Bench. Our benchmark is distinguished by realistic\ntasks derived from real-world videos, a diverse set of actions involving\ninteractions with hundreds of different objects, and complex visual\nobservations from varied scenes. We evaluate a wide range of MLLMs, revealing\nthat these models have not yet evolved into embodied planning generalists (even\nGPT-4V). We further construct an instruction-tuning dataset EgoPlan-IT from\nvideos with human-object interactions, to facilitate the learning of high-level\ntask planning in intricate real-world situations. The experiment results\ndemonstrate that the model tuned on EgoPlan-IT not only significantly improves\nperformance on our benchmark, but can also be applied as a task planner for\nguiding embodied agents in simulations.\n","authors":["Yi Chen","Yuying Ge","Yixiao Ge","Mingyu Ding","Bohao Li","Rui Wang","Ruifeng Xu","Ying Shan","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2312.06722v2.pdf","comment":"Project released at: https://github.com/ChenYi99/EgoPlan"},{"id":"http://arxiv.org/abs/2310.20621v2","updated":"2024-04-17T13:41:07Z","published":"2023-10-31T16:54:14Z","title":"Deepfake detection by exploiting surface anomalies: the SurFake approach","summary":" The ever-increasing use of synthetically generated content in different\nsectors of our everyday life, one for all media information, poses a strong\nneed for deepfake detection tools in order to avoid the proliferation of\naltered messages. The process to identify manipulated content, in particular\nimages and videos, is basically performed by looking for the presence of some\ninconsistencies and/or anomalies specifically due to the fake generation\nprocess. Different techniques exist in the scientific literature that exploit\ndiverse ad-hoc features in order to highlight possible modifications. In this\npaper, we propose to investigate how deepfake creation can impact on the\ncharacteristics that the whole scene had at the time of the acquisition. In\nparticular, when an image (video) is captured the overall geometry of the scene\n(e.g. surfaces) and the acquisition process (e.g. 
illumination) determine a\nunivocal environment that is directly represented by the image pixel values;\nall these intrinsic relations are possibly changed by the deepfake generation\nprocess. By resorting to the analysis of the characteristics of the surfaces\ndepicted in the image it is possible to obtain a descriptor usable to train a\nCNN for deepfake detection: we refer to such an approach as SurFake.\nExperimental results carried out on the FF++ dataset for different kinds of\ndeepfake forgeries and diverse deep learning models confirm that such a feature\ncan be adopted to discriminate between pristine and altered images;\nfurthermore, experiments witness that it can also be combined with visual data\nto provide a certain improvement in terms of detection accuracy.\n","authors":["Andrea Ciamarra","Roberto Caldelli","Federico Becattini","Lorenzo Seidenari","Alberto Del Bimbo"],"pdf_url":"https://arxiv.org/pdf/2310.20621v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11375v1","updated":"2024-04-17T13:33:09Z","published":"2024-04-17T13:33:09Z","title":"Text-controlled Motion Mamba: Text-Instructed Temporal Grounding of\n Human Motion","summary":" Human motion understanding is a fundamental task with diverse practical\napplications, facilitated by the availability of large-scale motion capture\ndatasets. Recent studies focus on text-motion tasks, such as text-based motion\ngeneration, editing and question answering. In this study, we introduce the\nnovel task of text-based human motion grounding (THMG), aimed at precisely\nlocalizing temporal segments corresponding to given textual descriptions within\nuntrimmed motion sequences. Capturing global temporal information is crucial\nfor the THMG task. However, transformer-based models that rely on global\ntemporal self-attention face challenges when handling long untrimmed sequences\ndue to the quadratic computational cost. We address these challenges by\nproposing Text-controlled Motion Mamba (TM-Mamba), a unified model that\nintegrates temporal global context, language query control, and spatial graph\ntopology with only linear memory cost. The core of the model is a\ntext-controlled selection mechanism which dynamically incorporates global\ntemporal information based on text query. The model is further enhanced to be\ntopology-aware through the integration of relational embeddings. For\nevaluation, we introduce BABEL-Grounding, the first text-motion dataset that\nprovides detailed textual descriptions of human actions along with their\ncorresponding temporal segments. Extensive evaluations demonstrate the\neffectiveness of TM-Mamba on BABEL-Grounding.\n","authors":["Xinghan Wang","Zixi Kang","Yadong Mu"],"pdf_url":"https://arxiv.org/pdf/2404.11375v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13756v3","updated":"2024-04-17T13:32:15Z","published":"2024-02-21T12:34:31Z","title":"High-throughput Visual Nano-drone to Nano-drone Relative Localization\n using Onboard Fully Convolutional Networks","summary":" Relative drone-to-drone localization is a fundamental building block for any\nswarm operations. We address this task in the context of miniaturized\nnano-drones, i.e., 10cm in diameter, which show an ever-growing interest due to\nnovel use cases enabled by their reduced form factor. The price for their\nversatility comes with limited onboard resources, i.e., sensors, processing\nunits, and memory, which limits the complexity of the onboard algorithms. 
A\ntraditional solution to overcome these limitations is represented by\nlightweight deep learning models directly deployed aboard nano-drones. This\nwork tackles the challenging relative pose estimation between nano-drones using\nonly a gray-scale low-resolution camera and an ultra-low-power System-on-Chip\n(SoC) hosted onboard. We present a vertically integrated system based on a\nnovel vision-based fully convolutional neural network (FCNN), which runs at\n39Hz within 101mW onboard a Crazyflie nano-drone extended with the GWT GAP8\nSoC. We compare our FCNN against three State-of-the-Art (SoA) systems.\nConsidering the best-performing SoA approach, our model results in an R-squared\nimprovement from 32 to 47% on the horizontal image coordinate and from 18 to\n55% on the vertical image coordinate, on a real-world dataset of 30k images.\nFinally, our in-field tests show a reduction of the average tracking error of\n37% compared to a previous SoA work and an endurance performance up to the\nentire battery lifetime of 4 minutes.\n","authors":["Luca Crupi","Alessandro Giusti","Daniele Palossi"],"pdf_url":"https://arxiv.org/pdf/2402.13756v3.pdf","comment":"ICRA 2024, IEEE Conference"},{"id":"http://arxiv.org/abs/2401.11470v2","updated":"2024-04-17T13:25:38Z","published":"2024-01-21T11:55:42Z","title":"Exploring Missing Modality in Multimodal Egocentric Datasets","summary":" Multimodal video understanding is crucial for analyzing egocentric videos,\nwhere integrating multiple sensory signals significantly enhances action\nrecognition and moment localization. However, practical applications often\ngrapple with incomplete modalities due to factors like privacy concerns,\nefficiency demands, or hardware malfunctions. Addressing this, our study delves\ninto the impact of missing modalities on egocentric action recognition,\nparticularly within transformer-based models. We introduce a novel concept\n-Missing Modality Token (MMT)-to maintain performance even when modalities are\nabsent, a strategy that proves effective in the Ego4D, Epic-Kitchens, and\nEpic-Sounds datasets. Our method mitigates the performance loss, reducing it\nfrom its original $\\sim 30\\%$ drop to only $\\sim 10\\%$ when half of the test\nset is modal-incomplete. Through extensive experimentation, we demonstrate the\nadaptability of MMT to different training scenarios and its superiority in\nhandling missing modalities compared to current methods. Our research\ncontributes a comprehensive analysis and an innovative approach, opening\navenues for more resilient multimodal systems in real-world settings.\n","authors":["Merey Ramazanova","Alejandro Pardo","Humam Alwassel","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2401.11470v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10247v2","updated":"2024-04-17T13:25:35Z","published":"2023-03-17T20:54:04Z","title":"Video shutter angle estimation using optical flow and linear blur","summary":" We present a method for estimating the shutter angle, a.k.a. exposure\nfraction - the ratio of the exposure time and the reciprocal of frame rate - of\nvideoclips containing motion. The approach exploits the relation of the\nexposure fraction, optical flow, and linear motion blur. Robustness is achieved\nby selecting image patches where both the optical flow and blur estimates are\nreliable, checking their consistency. The method was evaluated on the publicly\navailable Beam-Splitter Dataset with a range of exposure fractions from 0.015\nto 0.36. 
The best achieved mean absolute error of estimates was 0.039. We\nsuccessfully test the suitability of the method for a forensic application of\ndetection of video tampering by frame removal or insertion\n","authors":["David Korcak","Jiri Matas"],"pdf_url":"https://arxiv.org/pdf/2303.10247v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11361v1","updated":"2024-04-17T13:18:39Z","published":"2024-04-17T13:18:39Z","title":"Boosting Medical Image Segmentation Performance with Adaptive\n Convolution Layer","summary":" Medical image segmentation plays a vital role in various clinical\napplications, enabling accurate delineation and analysis of anatomical\nstructures or pathological regions. Traditional CNNs have achieved remarkable\nsuccess in this field. However, they often rely on fixed kernel sizes, which\ncan limit their performance and adaptability in medical images where features\nexhibit diverse scales and configurations due to variability in equipment,\ntarget sizes, and expert interpretations.\n In this paper, we propose an adaptive layer placed ahead of leading\ndeep-learning models such as UCTransNet, which dynamically adjusts the kernel\nsize based on the local context of the input image.\n By adaptively capturing and fusing features at multiple scales, our approach\nenhances the network's ability to handle diverse anatomical structures and\nsubtle image details, even for recently performing architectures that\ninternally implement intra-scale modules, such as UCTransnet.\n Extensive experiments are conducted on\n benchmark medical image datasets to evaluate the effectiveness of our\nproposal. It consistently outperforms traditional \\glspl{CNN} with fixed kernel\nsizes with a similar number of parameters, achieving superior segmentation\nAccuracy, Dice, and IoU in popular datasets such as SegPC2021 and ISIC2018. The\nmodel and data are published in the open-source repository, ensuring\ntransparency and reproducibility of our promising results.\n","authors":["Seyed M. R. Modaresi","Aomar Osmani","Mohammadreza Razzazi","Abdelghani Chibani"],"pdf_url":"https://arxiv.org/pdf/2404.11361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11358v1","updated":"2024-04-17T13:14:52Z","published":"2024-04-17T13:14:52Z","title":"DeblurGS: Gaussian Splatting for Camera Motion Blur","summary":" Although significant progress has been made in reconstructing sharp 3D scenes\nfrom motion-blurred images, a transition to real-world applications remains\nchallenging. The primary obstacle stems from the severe blur which leads to\ninaccuracies in the acquisition of initial camera poses through\nStructure-from-Motion, a critical aspect often overlooked by previous\napproaches. To address this challenge, we propose DeblurGS, a method to\noptimize sharp 3D Gaussian Splatting from motion-blurred images, even with the\nnoisy camera pose initialization. We restore a fine-grained sharp scene by\nleveraging the remarkable reconstruction capability of 3D Gaussian Splatting.\nOur approach estimates the 6-Degree-of-Freedom camera motion for each blurry\nobservation and synthesizes corresponding blurry renderings for the\noptimization process. Furthermore, we propose Gaussian Densification Annealing\nstrategy to prevent the generation of inaccurate Gaussians at erroneous\nlocations during the early training stages when camera motion is still\nimprecise. 
Comprehensive experiments demonstrate that our DeblurGS achieves\nstate-of-the-art performance in deblurring and novel view synthesis for\nreal-world and synthetic benchmark datasets, as well as field-captured blurry\nsmartphone videos.\n","authors":["Jeongtaek Oh","Jaeyoung Chung","Dongwoo Lee","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2404.11358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11357v1","updated":"2024-04-17T13:12:14Z","published":"2024-04-17T13:12:14Z","title":"Detector Collapse: Backdooring Object Detection to Catastrophic Overload\n or Blindness","summary":" Object detection tasks, crucial in safety-critical systems like autonomous\ndriving, focus on pinpointing object locations. These detectors are known to be\nsusceptible to backdoor attacks. However, existing backdoor techniques have\nprimarily been adapted from classification tasks, overlooking deeper\nvulnerabilities specific to object detection. This paper is dedicated to\nbridging this gap by introducing Detector Collapse} (DC), a brand-new backdoor\nattack paradigm tailored for object detection. DC is designed to instantly\nincapacitate detectors (i.e., severely impairing detector's performance and\nculminating in a denial-of-service). To this end, we develop two innovative\nattack schemes: Sponge for triggering widespread misidentifications and\nBlinding for rendering objects invisible. Remarkably, we introduce a novel\npoisoning strategy exploiting natural objects, enabling DC to act as a\npractical backdoor in real-world environments. Our experiments on different\ndetectors across several benchmarks show a significant improvement\n($\\sim$10\\%-60\\% absolute and $\\sim$2-7$\\times$ relative) in attack efficacy\nover state-of-the-art attacks.\n","authors":["Hangtao Zhang","Shengshan Hu","Yichen Wang","Leo Yu Zhang","Ziqi Zhou","Xianlong Wang","Yanjun Zhang","Chao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.11357v1.pdf","comment":"Accepted by IJCAI-24"},{"id":"http://arxiv.org/abs/2404.11355v1","updated":"2024-04-17T13:09:44Z","published":"2024-04-17T13:09:44Z","title":"Consisaug: A Consistency-based Augmentation for Polyp Detection in\n Endoscopy Image Analysis","summary":" Colorectal cancer (CRC), which frequently originates from initially benign\npolyps, remains a significant contributor to global cancer-related mortality.\nEarly and accurate detection of these polyps via colonoscopy is crucial for CRC\nprevention. However, traditional colonoscopy methods depend heavily on the\noperator's experience, leading to suboptimal polyp detection rates. Besides,\nthe public database are limited in polyp size and shape diversity. To enhance\nthe available data for polyp detection, we introduce Consisaug, an innovative\nand effective methodology to augment data that leverages deep learning. We\nutilize the constraint that when the image is flipped the class label should be\nequal and the bonding boxes should be consistent. We implement our Consisaug on\nfive public polyp datasets and at three backbones, and the results show the\neffectiveness of our method.\n","authors":["Ziyu Zhou","Wenyuan Shen","Chang Liu"],"pdf_url":"https://arxiv.org/pdf/2404.11355v1.pdf","comment":"MLMI 2023"},{"id":"http://arxiv.org/abs/2404.11339v1","updated":"2024-04-17T13:00:05Z","published":"2024-04-17T13:00:05Z","title":"Best Practices for a Handwritten Text Recognition System","summary":" Handwritten text recognition has been developed rapidly in the recent years,\nfollowing the rise of deep learning and its applications. 
Though deep learning\nmethods provide notable boost in performance concerning text recognition,\nnon-trivial deviation in performance can be detected even when small\npre-processing or architectural/optimization elements are changed. This work\nfollows a ``best practice'' rationale; highlight simple yet effective empirical\npractices that can further help training and provide well-performing\nhandwritten text recognition systems. Specifically, we considered three basic\naspects of a deep HTR system and we proposed simple yet effective solutions: 1)\nretain the aspect ratio of the images in the preprocessing step, 2) use\nmax-pooling for converting the 3D feature map of CNN output into a sequence of\nfeatures and 3) assist the training procedure via an additional CTC loss which\nacts as a shortcut on the max-pooled sequential features. Using these proposed\nsimple modifications, one can attain close to state-of-the-art results, while\nconsidering a basic convolutional-recurrent (CNN+LSTM) architecture, for both\nIAM and RIMES datasets. Code is available at\nhttps://github.com/georgeretsi/HTR-best-practices/.\n","authors":["George Retsinas","Giorgos Sfikas","Basilis Gatos","Christophoros Nikou"],"pdf_url":"https://arxiv.org/pdf/2404.11339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11336v1","updated":"2024-04-17T12:53:57Z","published":"2024-04-17T12:53:57Z","title":"Vision-based control for landing an aerial vehicle on a marine vessel","summary":" This work addresses the landing problem of an aerial vehicle, exemplified by\na simple quadrotor, on a moving platform using image-based visual servo\ncontrol. First, the mathematical model of the quadrotor aircraft is introduced,\nfollowed by the design of the inner-loop control. At the second stage, the\nimage features on the textured target plane are exploited to derive a\nvision-based control law. The image of the spherical centroid of a set of\nlandmarks present in the landing target is used as a position measurement,\nwhereas the translational optical flow is used as velocity measurement. The\nkinematics of the vision-based system is expressed in terms of the observable\nfeatures, and the proposed control law guarantees convergence without\nestimating the unknown distance between the vision system and the target, which\nis also guaranteed to remain strictly positive, avoiding undesired collisions.\nThe performance of the proposed control law is evaluated in MATLAB and 3-D\nsimulation software Gazebo. Simulation results for a quadrotor UAV are provided\nfor different velocity profiles of the moving target, showcasing the robustness\nof the proposed controller.\n","authors":["Haohua Dong"],"pdf_url":"https://arxiv.org/pdf/2404.11336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11335v1","updated":"2024-04-17T12:53:45Z","published":"2024-04-17T12:53:45Z","title":"SoccerNet Game State Reconstruction: End-to-End Athlete Tracking and\n Identification on a Minimap","summary":" Tracking and identifying athletes on the pitch holds a central role in\ncollecting essential insights from the game, such as estimating the total\ndistance covered by players or understanding team tactics. This tracking and\nidentification process is crucial for reconstructing the game state, defined by\nthe athletes' positions and identities on a 2D top-view of the pitch, (i.e. a\nminimap). However, reconstructing the game state from videos captured by a\nsingle camera is challenging. 
It requires understanding the position of the\nathletes and the viewpoint of the camera to localize and identify players\nwithin the field. In this work, we formalize the task of Game State\nReconstruction and introduce SoccerNet-GSR, a novel Game State Reconstruction\ndataset focusing on football videos. SoccerNet-GSR is composed of 200 video\nsequences of 30 seconds, annotated with 9.37 million line points for pitch\nlocalization and camera calibration, as well as over 2.36 million athlete\npositions on the pitch with their respective role, team, and jersey number.\nFurthermore, we introduce GS-HOTA, a novel metric to evaluate game state\nreconstruction methods. Finally, we propose and release an end-to-end baseline\nfor game state reconstruction, bootstrapping the research on this task. Our\nexperiments show that GSR is a challenging novel task, which opens the field\nfor future research. Our dataset and codebase are publicly available at\nhttps://github.com/SoccerNet/sn-gamestate.\n","authors":["Vladimir Somers","Victor Joos","Anthony Cioppa","Silvio Giancola","Seyed Abolfazl Ghasemzadeh","Floriane Magera","Baptiste Standaert","Amir Mohammad Mansourian","Xin Zhou","Shohreh Kasaei","Bernard Ghanem","Alexandre Alahi","Marc Van Droogenbroeck","Christophe De Vleeschouwer"],"pdf_url":"https://arxiv.org/pdf/2404.11335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11327v1","updated":"2024-04-17T12:39:48Z","published":"2024-04-17T12:39:48Z","title":"Following the Human Thread in Social Navigation","summary":" The success of collaboration between humans and robots in shared environments\nrelies on the robot's real-time adaptation to human motion. Specifically, in\nSocial Navigation, the agent should be close enough to assist but ready to back\nup to let the human move freely, avoiding collisions. Human trajectories emerge\nas crucial cues in Social Navigation, but they are partially observable from\nthe robot's egocentric view and computationally complex to process.\n We propose the first Social Dynamics Adaptation model (SDA) based on the\nrobot's state-action history to infer the social dynamics. We propose a\ntwo-stage Reinforcement Learning framework: the first learns to encode the\nhuman trajectories into social dynamics and learns a motion policy conditioned\non this encoded information, the current status, and the previous action. Here,\nthe trajectories are fully visible, i.e., assumed as privileged information. In\nthe second stage, the trained policy operates without direct access to\ntrajectories. Instead, the model infers the social dynamics solely from the\nhistory of previous actions and statuses in real-time. Tested on the novel\nHabitat 3.0 platform, SDA sets a novel state of the art (SoA) performance in\nfinding and following humans.\n","authors":["Luca Scofano","Alessio Sampieri","Tommaso Campari","Valentino Sacco","Indro Spinelli","Lamberto Ballan","Fabio Galasso"],"pdf_url":"https://arxiv.org/pdf/2404.11327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11326v1","updated":"2024-04-17T12:38:58Z","published":"2024-04-17T12:38:58Z","title":"Single-temporal Supervised Remote Change Detection for Domain\n Generalization","summary":" Change detection is widely applied in remote sensing image analysis. Existing\nmethods require training models separately for each dataset, which leads to\npoor domain generalization. Moreover, these methods rely heavily on large\namounts of high-quality pair-labelled data for training, which is expensive and\nimpractical. 
In this paper, we propose a multimodal contrastive learning\n(ChangeCLIP) based on visual-language pre-training for change detection domain\ngeneralization. Additionally, we propose a dynamic context optimization for\nprompt learning. Meanwhile, to address the data dependency issue of existing\nmethods, we introduce a single-temporal and controllable AI-generated training\nstrategy (SAIN). This allows us to train the model using a large number of\nsingle-temporal images without image pairs in the real world, achieving\nexcellent generalization. Extensive experiments on series of real change\ndetection datasets validate the superiority and strong generalization of\nChangeCLIP, outperforming state-of-the-art change detection methods. Code will\nbe available.\n","authors":["Qiangang Du","Jinlong Peng","Xu Chen","Qingdong He","Qiang Nie","Wenbing Zhu","Mingmin Chi","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01431v2","updated":"2024-04-17T12:36:06Z","published":"2023-12-03T15:40:10Z","title":"D$^2$ST-Adapter: Disentangled-and-Deformable Spatio-Temporal Adapter for\n Few-shot Action Recognition","summary":" Adapting large pre-trained image models to few-shot action recognition has\nproven to be an effective and efficient strategy for learning robust feature\nextractors, which is essential for few-shot learning. Typical fine-tuning based\nadaptation paradigm is prone to overfitting in the few-shot learning scenarios\nand offers little modeling flexibility for learning temporal features in video\ndata. In this work we present the Disentangled-and-Deformable Spatio-Temporal\nAdapter (D$^2$ST-Adapter), which is a novel adapter tuning framework\nwell-suited for few-shot action recognition due to lightweight design and low\nparameter-learning overhead. It is designed in a dual-pathway architecture to\nencode spatial and temporal features in a disentangled manner. In particular,\nwe devise the anisotropic Deformable Spatio-Temporal Attention module as the\ncore component of D$^2$ST-Adapter, which can be tailored with anisotropic\nsampling densities along spatial and temporal domains to learn spatial and\ntemporal features specifically in corresponding pathways, allowing our\nD$^2$ST-Adapter to encode features in a global view in 3D spatio-temporal space\nwhile maintaining a lightweight design. Extensive experiments with\ninstantiations of our method on both pre-trained ResNet and ViT demonstrate the\nsuperiority of our method over state-of-the-art methods for few-shot action\nrecognition. Our method is particularly well-suited to challenging scenarios\nwhere temporal dynamics are critical for action recognition.\n","authors":["Wenjie Pei","Qizhong Tan","Guangming Lu","Jiandong Tian"],"pdf_url":"https://arxiv.org/pdf/2312.01431v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11322v1","updated":"2024-04-17T12:34:49Z","published":"2024-04-17T12:34:49Z","title":"VBR: A Vision Benchmark in Rome","summary":" This paper presents a vision and perception research dataset collected in\nRome, featuring RGB data, 3D point clouds, IMU, and GPS data. We introduce a\nnew benchmark targeting visual odometry and SLAM, to advance the research in\nautonomous robotics and computer vision. This work complements existing\ndatasets by simultaneously addressing several issues, such as environment\ndiversity, motion patterns, and sensor frequency. 
It uses up-to-date devices\nand presents effective procedures to accurately calibrate the intrinsic and\nextrinsic of the sensors while addressing temporal synchronization. During\nrecording, we cover multi-floor buildings, gardens, urban and highway\nscenarios. Combining handheld and car-based data collections, our setup can\nsimulate any robot (quadrupeds, quadrotors, autonomous vehicles). The dataset\nincludes an accurate 6-dof ground truth based on a novel methodology that\nrefines the RTK-GPS estimate with LiDAR point clouds through Bundle Adjustment.\nAll sequences divided in training and testing are accessible through our\nwebsite.\n","authors":["Leonardo Brizi","Emanuele Giacomini","Luca Di Giammarino","Simone Ferrari","Omar Salem","Lorenzo De Rebotti","Giorgio Grisetti"],"pdf_url":"https://arxiv.org/pdf/2404.11322v1.pdf","comment":"Accepted at IEEE ICRA 2024 Website:\n https://rvp-group.net/datasets/slam.html"},{"id":"http://arxiv.org/abs/2404.11318v1","updated":"2024-04-17T12:32:10Z","published":"2024-04-17T12:32:10Z","title":"Leveraging Fine-Grained Information and Noise Decoupling for Remote\n Sensing Change Detection","summary":" Change detection aims to identify remote sense object changes by analyzing\ndata between bitemporal image pairs. Due to the large temporal and spatial span\nof data collection in change detection image pairs, there are often a\nsignificant amount of task-specific and task-agnostic noise. Previous effort\nhas focused excessively on denoising, with this goes a great deal of loss of\nfine-grained information. In this paper, we revisit the importance of\nfine-grained features in change detection and propose a series of operations\nfor fine-grained information compensation and noise decoupling (FINO). First,\nthe context is utilized to compensate for the fine-grained information in the\nfeature space. Next, a shape-aware and a brightness-aware module are designed\nto improve the capacity for representation learning. The shape-aware module\nguides the backbone for more precise shape estimation, guiding the backbone\nnetwork in extracting object shape features. The brightness-aware module learns\na overall brightness estimation to improve the model's robustness to\ntask-agnostic noise. Finally, a task-specific noise decoupling structure is\ndesigned as a way to improve the model's ability to separate noise interference\nfrom feature similarity. With these training schemes, our proposed method\nachieves new state-of-the-art (SOTA) results in multiple change detection\nbenchmarks. The code will be made available.\n","authors":["Qiangang Du","Jinlong Peng","Changan Wang","Xu Chen","Qingdong He","Wenbing Zhu","Mingmin Chi","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11318v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11317v1","updated":"2024-04-17T12:30:54Z","published":"2024-04-17T12:30:54Z","title":"Improving Composed Image Retrieval via Contrastive Learning with Scaling\n Positives and Negatives","summary":" The Composed Image Retrieval (CIR) task aims to retrieve target images using\na composed query consisting of a reference image and a modified text. Advanced\nmethods often utilize contrastive learning as the optimization objective, which\nbenefits from adequate positive and negative examples. However, the triplet for\nCIR incurs high manual annotation costs, resulting in limited positive\nexamples. 
Furthermore, existing methods commonly use in-batch negative\nsampling, which reduces the negative number available for the model. To address\nthe problem of lack of positives, we propose a data generation method by\nleveraging a multi-modal large language model to construct triplets for CIR. To\nintroduce more negatives during fine-tuning, we design a two-stage fine-tuning\nframework for CIR, whose second stage introduces plenty of static\nrepresentations of negatives to optimize the representation space rapidly. The\nabove two improvements can be effectively stacked and designed to be\nplug-and-play, easily applied to existing CIR models without changing their\noriginal architectures. Extensive experiments and ablation analysis demonstrate\nthat our method effectively scales positives and negatives and achieves\nstate-of-the-art results on both FashionIQ and CIRR datasets. In addition, our\nmethods also perform well in zero-shot composed image retrieval, providing a\nnew CIR solution for the low-resources scenario.\n","authors":["Zhangchi Feng","Richong Zhang","Zhijie Nie"],"pdf_url":"https://arxiv.org/pdf/2404.11317v1.pdf","comment":"12 pages, 11 figures"},{"id":"http://arxiv.org/abs/2309.11930v2","updated":"2024-04-17T12:27:25Z","published":"2023-09-21T09:44:39Z","title":"Bridging the Gap: Learning Pace Synchronization for Open-World\n Semi-Supervised Learning","summary":" In open-world semi-supervised learning, a machine learning model is tasked\nwith uncovering novel categories from unlabeled data while maintaining\nperformance on seen categories from labeled data. The central challenge is the\nsubstantial learning gap between seen and novel categories, as the model learns\nthe former faster due to accurate supervisory information. Moreover, capturing\nthe semantics of unlabeled novel category samples is also challenging due to\nthe missing label information. To address the above issues, we introduce 1) the\nadaptive synchronizing marginal loss which imposes class-specific negative\nmargins to alleviate the model bias towards seen classes, and 2) the\npseudo-label contrastive clustering which exploits pseudo-labels predicted by\nthe model to group unlabeled data from the same category together in the output\nspace. Extensive experiments on benchmark datasets demonstrate that previous\napproaches may significantly hinder novel class learning, whereas our method\nstrikingly balances the learning pace between seen and novel classes, achieving\na remarkable 3% average accuracy increase on the ImageNet dataset. Importantly,\nwe find that fine-tuning the self-supervised pre-trained model significantly\nboosts the performance, which is overlooked in prior literature. Our code is\navailable at https://github.com/yebo0216best/LPS-main.\n","authors":["Bo Ye","Kai Gan","Tong Wei","Min-Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.11930v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11309v1","updated":"2024-04-17T12:21:57Z","published":"2024-04-17T12:21:57Z","title":"Achieving Rotation Invariance in Convolution Operations: Shifting from\n Data-Driven to Mechanism-Assured","summary":" Achieving rotation invariance in deep neural networks without relying on data\nhas always been a hot research topic. Intrinsic rotation invariance can enhance\nthe model's feature representation capability, enabling better performance in\ntasks such as multi-orientation object recognition and detection. 
Based on\nvarious types of non-learnable operators, including gradient, sort, local\nbinary pattern, maximum, etc., this paper designs a set of new convolution\noperations that are naturally invariant to arbitrary rotations. Unlike most\nprevious studies, these rotation-invariant convolutions (RIConvs) have the same\nnumber of learnable parameters and a similar computational process as\nconventional convolution operations, allowing them to be interchangeable. Using\nthe MNIST-Rot dataset, we first verify the invariance of these RIConvs under\nvarious rotation angles and compare their performance with previous\nrotation-invariant convolutional neural networks (RI-CNNs). Two types of\nRIConvs based on gradient operators achieve state-of-the-art results.\nSubsequently, we combine RIConvs with different types and depths of classic CNN\nbackbones. Using the OuTex_00012, MTARSI, and NWPU-RESISC-45 datasets, we test\ntheir performance on texture recognition, aircraft type recognition, and remote\nsensing image classification tasks. The results show that RIConvs significantly\nimprove the accuracy of these CNN backbones, especially when the training data\nis limited. Furthermore, we find that even with data augmentation, RIConvs can\nfurther enhance model performance.\n","authors":["Hanlin Mo","Guoying Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.11309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11302v1","updated":"2024-04-17T12:13:18Z","published":"2024-04-17T12:13:18Z","title":"A Semantic Segmentation-guided Approach for Ground-to-Aerial Image\n Matching","summary":" Nowadays, the accurate geo-localization of ground-view images has an important\nrole across domains as diverse as journalism, forensic analysis, transportation,\nand Earth Observation. This work addresses the problem of matching a query\nground-view image with the corresponding satellite image without GPS data. This\nis done by comparing the features from a ground-view image and a satellite one,\ninnovatively leveraging the latter's corresponding segmentation mask through a\nthree-stream Siamese-like network. The proposed method, Semantic Align Net\n(SAN), focuses on limited Field-of-View (FoV) and ground panorama images\n(images with a FoV of 360{\\deg}). The novelty lies in the fusion of satellite\nimages in combination with their semantic segmentation masks, aimed at ensuring\nthat the model can extract useful features and focus on the significant parts\nof the images. This work shows how SAN, through semantic analysis of images,\nimproves the performance on the unlabelled CVUSA dataset for all the tested\nFoVs.\n","authors":["Francesco Pro","Nikolaos Dionelis","Luca Maiano","Bertrand Le Saux","Irene Amerini"],"pdf_url":"https://arxiv.org/pdf/2404.11302v1.pdf","comment":"6 pages, 2 figures, 2 tables, Submitted to IGARSS 2024"},{"id":"http://arxiv.org/abs/2404.11299v1","updated":"2024-04-17T12:12:48Z","published":"2024-04-17T12:12:48Z","title":"Learning from Unlabelled Data with Transformers: Domain Adaptation for\n Semantic Segmentation of High Resolution Aerial Images","summary":" Data from satellites or aerial vehicles are most of the time unlabelled.\nAnnotating such data accurately is difficult, requires expertise, and is costly\nin terms of time. Even if Earth Observation (EO) data were correctly labelled,\nlabels might change over time. Learning from unlabelled data within a\nsemi-supervised learning framework for segmentation of aerial images is\nchallenging. 
In this paper, we develop a new model for semantic segmentation of\nunlabelled images, the Non-annotated Earth Observation Semantic Segmentation\n(NEOS) model. NEOS performs domain adaptation as the target domain does not\nhave ground truth semantic segmentation masks. The distribution inconsistencies\nbetween the target and source domains are due to differences in acquisition\nscenes, environment conditions, sensors, and times. Our model aligns the\nlearned representations of the different domains to make them coincide. The\nevaluation results show that NEOS is successful and outperforms other models\nfor semantic segmentation of unlabelled data.\n","authors":["Nikolaos Dionelis","Francesco Pro","Luca Maiano","Irene Amerini","Bertrand Le Saux"],"pdf_url":"https://arxiv.org/pdf/2404.11299v1.pdf","comment":"6 pages, 7 figures, Submitted to IGARSS 2024"},{"id":"http://arxiv.org/abs/2404.10588v2","updated":"2024-04-17T12:09:17Z","published":"2024-04-16T14:13:44Z","title":"Do Counterfactual Examples Complicate Adversarial Training?","summary":" We leverage diffusion models to study the robustness-performance tradeoff of\nrobust classifiers. Our approach introduces a simple, pretrained diffusion\nmethod to generate low-norm counterfactual examples (CEs): semantically altered\ndata which results in different true class membership. We report that the\nconfidence and accuracy of robust models on their clean training data are\nassociated with the proximity of the data to their CEs. Moreover, robust models\nperform very poorly when evaluated on the CEs directly, as they become\nincreasingly invariant to the low-norm, semantic changes brought by CEs. The\nresults indicate a significant overlap between non-robust and semantic\nfeatures, countering the common assumption that non-robust features are not\ninterpretable.\n","authors":["Eric Yeats","Cameron Darwin","Eduardo Ortega","Frank Liu","Hai Li"],"pdf_url":"https://arxiv.org/pdf/2404.10588v2.pdf","comment":"Accepted as a short paper to the GCV Workshop at CVPR'24"},{"id":"http://arxiv.org/abs/2403.00303v2","updated":"2024-04-17T12:05:28Z","published":"2024-03-01T06:13:53Z","title":"ODM: A Text-Image Further Alignment Pre-training Approach for Scene Text\n Detection and Spotting","summary":" In recent years, text-image joint pre-training techniques have shown\npromising results in various tasks. However, in Optical Character Recognition\n(OCR) tasks, aligning text instances with their corresponding text regions in\nimages poses a challenge, as it requires effective alignment between text and\nOCR-Text (referring to the text in images as OCR-Text to distinguish from the\ntext in natural language) rather than a holistic understanding of the overall\nimage content. In this paper, we propose a new pre-training method called\nOCR-Text Destylization Modeling (ODM) that transfers diverse styles of text\nfound in images to a uniform style based on the text prompt. With ODM, we\nachieve better alignment between text and OCR-Text and enable pre-trained\nmodels to adapt to the complex and diverse styles of scene text detection and\nspotting tasks. Additionally, we have designed a new labeling generation method\nspecifically for ODM and combined it with our proposed Text-Controller module\nto address the challenge of annotation costs in OCR tasks, allowing a larger\namount of unlabeled data to participate in pre-training. 
Extensive experiments\non multiple public datasets demonstrate that our method significantly improves\nperformance and outperforms current pre-training methods in scene text\ndetection and spotting tasks. Code is available at\nhttps://github.com/PriNing/ODM.\n","authors":["Chen Duan","Pei Fu","Shan Guo","Qianyi Jiang","Xiaoming Wei"],"pdf_url":"https://arxiv.org/pdf/2403.00303v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.11291v1","updated":"2024-04-17T11:55:45Z","published":"2024-04-17T11:55:45Z","title":"Closely Interactive Human Reconstruction with Proxemics and\n Physics-Guided Adaption","summary":" Existing multi-person human reconstruction approaches mainly focus on\nrecovering accurate poses or avoiding penetration, but overlook the modeling of\nclose interactions. In this work, we tackle the task of reconstructing closely\ninteractive humans from a monocular video. The main challenge of this task\ncomes from insufficient visual information caused by depth ambiguity and severe\ninter-person occlusion. In view of this, we propose to leverage knowledge from\nproxemic behavior and physics to compensate the lack of visual information.\nThis is based on the observation that human interaction has specific patterns\nfollowing the social proxemics. Specifically, we first design a latent\nrepresentation based on Vector Quantised-Variational AutoEncoder (VQ-VAE) to\nmodel human interaction. A proxemics and physics guided diffusion model is then\nintroduced to denoise the initial distribution. We design the diffusion model\nas dual branch with each branch representing one individual such that the\ninteraction can be modeled via cross attention. With the learned priors of\nVQ-VAE and physical constraint as the additional information, our proposed\napproach is capable of estimating accurate poses that are also proxemics and\nphysics plausible. Experimental results on Hi4D, 3DPW, and CHI3D demonstrate\nthat our method outperforms existing approaches. The code is available at\n\\url{https://github.com/boycehbz/HumanInteraction}.\n","authors":["Buzhen Huang","Chen Li","Chongyang Xu","Liang Pan","Yangang Wang","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2404.11291v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2306.16533v2","updated":"2024-04-17T11:38:12Z","published":"2023-06-28T20:06:36Z","title":"ICSVR: Investigating Compositional and Syntactic Understanding in Video\n Retrieval Models","summary":" Video retrieval (VR) involves retrieving the ground truth video from the\nvideo database given a text caption or vice-versa. The two important components\nof compositionality: objects & attributes and actions are joined using correct\nsyntax to form a proper text query. These components (objects & attributes,\nactions and syntax) each play an important role to help distinguish among\nvideos and retrieve the correct ground truth video. However, it is unclear what\nis the effect of these components on the video retrieval performance. We\ntherefore, conduct a systematic study to evaluate the compositional and\nsyntactic understanding of video retrieval models on standard benchmarks such\nas MSRVTT, MSVD and DIDEMO. The study is performed on two categories of video\nretrieval models: (i) which are pre-trained on video-text pairs and fine-tuned\non downstream video retrieval datasets (Eg. Frozen-in-Time, Violet, MCQ etc.)\n(ii) which adapt pre-trained image-text representations like CLIP for video\nretrieval (Eg. CLIP4Clip, XCLIP, CLIP2Video etc.). 
Our experiments reveal that\nactions and syntax play a minor role compared to objects & attributes in video\nunderstanding. Moreover, video retrieval models that use pre-trained image-text\nrepresentations (CLIP) have better syntactic and compositional understanding as\ncompared to models pre-trained on video-text data. The code is available at\nhttps://github.com/IntelLabs/multimodal_cognitive_ai/tree/main/ICSVR\n","authors":["Avinash Madasu","Vasudev Lal"],"pdf_url":"https://arxiv.org/pdf/2306.16533v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11273v1","updated":"2024-04-17T11:25:19Z","published":"2024-04-17T11:25:19Z","title":"Training Transformer Models by Wavelet Losses Improves Quantitative and\n Visual Performance in Single Image Super-Resolution","summary":" Transformer-based models have achieved remarkable results in low-level vision\ntasks including image super-resolution (SR). However, early Transformer-based\napproaches that rely on self-attention within non-overlapping windows encounter\nchallenges in acquiring global information. To activate more input pixels\nglobally, hybrid attention models have been proposed. Moreover, training by\nsolely minimizing pixel-wise RGB losses, such as L1, has been found inadequate\nfor capturing essential high-frequency details. This paper presents two\ncontributions: i) We introduce convolutional non-local sparse attention (NLSA)\nblocks to extend the hybrid transformer architecture in order to further\nenhance its receptive field. ii) We employ wavelet losses to train Transformer\nmodels to improve quantitative and subjective performance. While wavelet losses\nhave been explored previously, showing their power in training\nTransformer-based SR models is novel. Our experimental results demonstrate that\nthe proposed model provides state-of-the-art PSNR results as well as superior\nvisual performance across various benchmark datasets.\n","authors":["Cansu Korkmaz","A. Murat Tekalp"],"pdf_url":"https://arxiv.org/pdf/2404.11273v1.pdf","comment":"total of 10 pages including references, 5 tables and 5 figures,\n accepted for NTIRE 2024 Single Image Super Resolution (x4) challenge"},{"id":"http://arxiv.org/abs/2404.11266v1","updated":"2024-04-17T11:17:12Z","published":"2024-04-17T11:17:12Z","title":"Criteria for Uncertainty-based Corner Cases Detection in Instance\n Segmentation","summary":" The operating environment of a highly automated vehicle is subject to change,\ne.g., weather, illumination, or the scenario containing different objects and\nother participants in which the highly automated vehicle has to navigate its\npassengers safely. These situations must be considered when developing and\nvalidating highly automated driving functions. This already poses a problem for\ntraining and evaluating deep learning models: without the costly\nlabeling of thousands of recordings, it is unknown whether the data contains\nrelevant, interesting samples for further model training, and it remains a guess\nunder which conditions and situations the model performs poorly. For this purpose, we\npresent corner case criteria based on the predictive uncertainty. With our\ncorner case criteria, we are able to detect uncertainty-based corner cases of\nan object instance segmentation model without relying on ground truth (GT)\ndata. We evaluated each corner case criterion using the COCO and the NuImages\ndatasets to analyze the potential of our approach. 
We also provide a corner case\ndecision function that allows us to distinguish each object into True Positive\n(TP), localization and/or classification corner case, or False Positive (FP).\nWe also present our first results of an iterative training cycle that\noutperforms the baseline and where the data added to the training dataset is\nselected based on the corner case decision function.\n","authors":["Florian Heidecker","Ahmad El-Khateeb","Maarten Bieshaar","Bernhard Sick"],"pdf_url":"https://arxiv.org/pdf/2404.11266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11265v1","updated":"2024-04-17T11:15:58Z","published":"2024-04-17T11:15:58Z","title":"The Victim and The Beneficiary: Exploiting a Poisoned Model to Train a\n Clean Model on Poisoned Data","summary":" Recently, backdoor attacks have posed a serious security threat to the\ntraining process of deep neural networks (DNNs). The attacked model behaves\nnormally on benign samples but outputs a specific result when the trigger is\npresent. However, compared with the rocketing progress of backdoor attacks,\nexisting defenses are difficult to deal with these threats effectively or\nrequire benign samples to work, which may be unavailable in real scenarios. In\nthis paper, we find that the poisoned samples and benign samples can be\ndistinguished with prediction entropy. This inspires us to propose a novel\ndual-network training framework: The Victim and The Beneficiary (V&B), which\nexploits a poisoned model to train a clean model without extra benign samples.\nFirstly, we sacrifice the Victim network to be a powerful poisoned sample\ndetector by training on suspicious samples. Secondly, we train the Beneficiary\nnetwork on the credible samples selected by the Victim to inhibit backdoor\ninjection. Thirdly, a semi-supervised suppression strategy is adopted for\nerasing potential backdoors and improving model performance. Furthermore, to\nbetter inhibit missed poisoned samples, we propose a strong data augmentation\nmethod, AttentionMix, which works well with our proposed V&B framework.\nExtensive experiments on two widely used datasets against 6 state-of-the-art\nattacks demonstrate that our framework is effective in preventing backdoor\ninjection and robust to various attacks while maintaining the performance on\nbenign samples. Our code is available at https://github.com/Zixuan-Zhu/VaB.\n","authors":["Zixuan Zhu","Rui Wang","Cong Zou","Lihua Jing"],"pdf_url":"https://arxiv.org/pdf/2404.11265v1.pdf","comment":"13 pages, 6 figures, published to ICCV"},{"id":"http://arxiv.org/abs/2402.17187v3","updated":"2024-04-17T11:08:02Z","published":"2024-02-27T03:53:27Z","title":"PE-MVCNet: Multi-view and Cross-modal Fusion Network for Pulmonary\n Embolism Prediction","summary":" The early detection of a pulmonary embolism (PE) is critical for enhancing\npatient survival rates. Both image-based and non-image-based features are of\nutmost importance in medical classification tasks. In a clinical setting,\nphysicians tend to rely on the contextual information provided by Electronic\nMedical Records (EMR) to interpret medical imaging. However, very few models\neffectively integrate clinical information with imaging data. To address this\nshortcoming, we suggest a multimodal fusion methodology, termed PE-MVCNet,\nwhich capitalizes on Computed Tomography Pulmonary Angiography imaging and EMR\ndata. 
This method comprises the Image-only module with an integrated multi-view\nblock, the EMR-only module, and the Cross-modal Attention Fusion (CMAF) module.\nThese modules cooperate to extract comprehensive features that subsequently\ngenerate predictions for PE. We conducted experiments using the publicly\naccessible Stanford University Medical Center dataset, achieving an AUROC of\n94.1%, an accuracy rate of 90.2%, and an F1 score of 90.6%. Our proposed model\noutperforms existing methodologies, corroborating that our multimodal fusion\nmodel excels compared to models that use a single data modality. Our source\ncode is available at https://github.com/LeavingStarW/PE-MVCNET.\n","authors":["Zhaoxin Guo","Zhipeng Wang","Ruiquan Ge","Jianxun Yu","Feiwei Qin","Yuan Tian","Yuqing Peng","Yonghong Li","Changmiao Wang"],"pdf_url":"https://arxiv.org/pdf/2402.17187v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11256v1","updated":"2024-04-17T11:06:42Z","published":"2024-04-17T11:06:42Z","title":"MMCBE: Multi-modality Dataset for Crop Biomass Estimation and Beyond","summary":" Crop biomass, a critical indicator of plant growth, health, and productivity,\nis invaluable for crop breeding programs and agronomic research. However, the\naccurate and scalable quantification of crop biomass remains inaccessible due\nto limitations in existing measurement methods. One of the obstacles impeding\nthe advancement of current crop biomass prediction methodologies is the\nscarcity of publicly available datasets. Addressing this gap, we introduce a\nnew dataset in this domain, i.e. Multi-modality dataset for crop biomass\nestimation (MMCBE). Comprising 216 sets of multi-view drone images, coupled\nwith LiDAR point clouds, and hand-labelled ground truth, MMCBE represents the\nfirst multi-modality one in the field. This dataset aims to establish benchmark\nmethods for crop biomass quantification and foster the development of\nvision-based approaches. We have rigorously evaluated state-of-the-art crop\nbiomass estimation methods using MMCBE and ventured into additional potential\napplications, such as 3D crop reconstruction from drone imagery and novel-view\nrendering. With this publication, we are making our comprehensive dataset\navailable to the broader community.\n","authors":["Xuesong Li","Zeeshan Hayder","Ali Zia","Connor Cassidy","Shiming Liu","Warwick Stiller","Eric Stone","Warren Conaty","Lars Petersson","Vivien Rolland"],"pdf_url":"https://arxiv.org/pdf/2404.11256v1.pdf","comment":"10 pages, 10 figures, 3 tables"},{"id":"http://arxiv.org/abs/2305.10300v5","updated":"2024-04-17T11:04:57Z","published":"2023-05-17T15:37:47Z","title":"One-Prompt to Segment All Medical Images","summary":" Large foundation models, known for their strong zero-shot generalization,\nhave excelled in visual and language applications. However, applying them to\nmedical image segmentation, a domain with diverse imaging types and target\nlabels, remains an open challenge. Current approaches, such as adapting\ninteractive segmentation models like Segment Anything Model (SAM), require user\nprompts for each sample during inference. Alternatively, transfer learning\nmethods like few/one-shot models demand labeled samples, leading to high costs.\nThis paper introduces a new paradigm toward the universal medical image\nsegmentation, termed 'One-Prompt Segmentation.' One-Prompt Segmentation\ncombines the strengths of one-shot and interactive methods. 
In the inference\nstage, with just \\textbf{one prompted sample}, it can adeptly handle the unseen\ntask in a single forward pass. We train One-Prompt Model on 64 open-source\nmedical datasets, accompanied by the collection of over 3,000 clinician-labeled\nprompts. Tested on 14 previously unseen datasets, the One-Prompt Model\nshowcases superior zero-shot segmentation capabilities, outperforming a wide\nrange of related methods. The code and data is released as\nhttps://github.com/KidsWithTokens/one-prompt.\n","authors":["Junde Wu","Jiayuan Zhu","Yueming Jin","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2305.10300v5.pdf","comment":"arXiv admin note: text overlap with arXiv:2304.12620"},{"id":"http://arxiv.org/abs/2404.11249v1","updated":"2024-04-17T10:56:06Z","published":"2024-04-17T10:56:06Z","title":"A Progressive Framework of Vision-language Knowledge Distillation and\n Alignment for Multilingual Scene","summary":" Pre-trained vision-language (V-L) models such as CLIP have shown excellent\nperformance in many downstream cross-modal tasks. However, most of them are\nonly applicable to the English context. Subsequent research has focused on this\nproblem and proposed improved models, such as CN-CLIP and AltCLIP, to\nfacilitate their applicability to Chinese and even other languages.\nNevertheless, these models suffer from high latency and a large memory\nfootprint in inference, which limits their further deployment on\nresource-constrained edge devices. In this work, we propose a conceptually\nsimple yet effective multilingual CLIP Compression framework and train a\nlightweight multilingual vision-language model, called DC-CLIP, for both\nChinese and English context. In this framework, we collect high-quality Chinese\nand English text-image pairs and design two training stages, including\nmultilingual vision-language feature distillation and alignment. During the\nfirst stage, lightweight image/text student models are designed to learn robust\nvisual/multilingual textual feature representation ability from corresponding\nteacher models, respectively. Subsequently, the multilingual vision-language\nalignment stage enables effective alignment of visual and multilingual textual\nfeatures to further improve the model's multilingual performance. Comprehensive\nexperiments in zero-shot image classification, conducted based on the ELEVATER\nbenchmark, showcase that DC-CLIP achieves superior performance in the English\ncontext and competitive performance in the Chinese context, even with less\ntraining data, when compared to existing models of similar parameter magnitude.\nThe evaluation demonstrates the effectiveness of our designed training\nmechanism.\n","authors":["Wenbo Zhang","Yifan Zhang","Jianfeng Lin","Binqiang Huang","Jinlu Zhang","Wenhao Yu"],"pdf_url":"https://arxiv.org/pdf/2404.11249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11243v1","updated":"2024-04-17T10:49:00Z","published":"2024-04-17T10:49:00Z","title":"Optical Image-to-Image Translation Using Denoising Diffusion Models:\n Heterogeneous Change Detection as a Use Case","summary":" We introduce an innovative deep learning-based method that uses a denoising\ndiffusion-based model to translate low-resolution images to high-resolution\nones from different optical sensors while preserving the contents and avoiding\nundesired artifacts. The proposed method is trained and tested on a large and\ndiverse data set of paired Sentinel-II and Planet Dove images. 
We show that it\ncan solve serious image generation issues observed when the popular\nclassifier-free guided Denoising Diffusion Implicit Model (DDIM) framework is\nused in the task of Image-to-Image Translation of multi-sensor optical remote\nsensing images and that it can generate large images with highly consistent\npatches, both in colors and in features. Moreover, we demonstrate how our\nmethod improves heterogeneous change detection results in two urban areas:\nBeirut, Lebanon, and Austin, USA. Our contributions are: i) a new training and\ntesting algorithm based on denoising diffusion models for optical image\ntranslation; ii) a comprehensive image quality evaluation and ablation study;\niii) a comparison with the classifier-free guided DDIM framework; and iv)\nchange detection experiments on heterogeneous data.\n","authors":["João Gabriel Vinholi","Marco Chini","Anis Amziane","Renato Machado","Danilo Silva","Patrick Matgen"],"pdf_url":"https://arxiv.org/pdf/2404.11243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14065v7","updated":"2024-04-17T10:42:06Z","published":"2023-09-25T11:57:16Z","title":"AsymFormer: Asymmetrical Cross-Modal Representation Learning for Mobile\n Platform Real-Time RGB-D Semantic Segmentation","summary":" Understanding indoor scenes is crucial for urban studies. Considering the\ndynamic nature of indoor environments, effective semantic segmentation requires\nboth real-time operation and high accuracy.To address this, we propose\nAsymFormer, a novel network that improves real-time semantic segmentation\naccuracy using RGB-D multi-modal information without substantially increasing\nnetwork complexity. AsymFormer uses an asymmetrical backbone for multimodal\nfeature extraction, reducing redundant parameters by optimizing computational\nresource distribution. To fuse asymmetric multimodal features, a Local\nAttention-Guided Feature Selection (LAFS) module is used to selectively fuse\nfeatures from different modalities by leveraging their dependencies.\nSubsequently, a Cross-Modal Attention-Guided Feature Correlation Embedding\n(CMA) module is introduced to further extract cross-modal representations. The\nAsymFormer demonstrates competitive results with 54.1% mIoU on NYUv2 and 49.1%\nmIoU on SUNRGBD. Notably, AsymFormer achieves an inference speed of 65 FPS (79\nFPS after implementing mixed precision quantization) on RTX3090, demonstrating\nthat AsymFormer can strike a balance between high accuracy and efficiency.\n","authors":["Siqi Du","Weixi Wang","Renzhong Guo","Ruisheng Wang","Yibin Tian","Shengjun Tang"],"pdf_url":"https://arxiv.org/pdf/2309.14065v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11236v1","updated":"2024-04-17T10:38:51Z","published":"2024-04-17T10:38:51Z","title":"ONOT: a High-Quality ICAO-compliant Synthetic Mugshot Dataset","summary":" Nowadays, state-of-the-art AI-based generative models represent a viable\nsolution to overcome privacy issues and biases in the collection of datasets\ncontaining personal information, such as faces. Following this intuition, in\nthis paper we introduce ONOT, a synthetic dataset specifically focused on the\ngeneration of high-quality faces in adherence to the requirements of the\nISO/IEC 39794-5 standards that, following the guidelines of the International\nCivil Aviation Organization (ICAO), defines the interchange formats of face\nimages in electronic Machine-Readable Travel Documents (eMRTD). 
The strictly\ncontrolled and varied mugshot images included in ONOT are useful in research\nfields related to the analysis of face images in eMRTD, such as Morphing Attack\nDetection and Face Quality Assessment. The dataset is publicly released, in\ncombination with the generation procedure details, in order to improve\nreproducibility and enable future extensions.\n","authors":["Nicolò Di Domenico","Guido Borghi","Annalisa Franco","Davide Maltoni"],"pdf_url":"https://arxiv.org/pdf/2404.11236v1.pdf","comment":"Paper accepted in IEEE FG 2024"},{"id":"http://arxiv.org/abs/2404.11230v1","updated":"2024-04-17T10:26:49Z","published":"2024-04-17T10:26:49Z","title":"Energy-Efficient Uncertainty-Aware Biomass Composition Prediction at the\n Edge","summary":" Clover fixes nitrogen from the atmosphere into the ground, making\ngrass-clover mixtures highly desirable to reduce external nitrogen\nfertilization. Herbage containing clover additionally promotes higher food\nintake, resulting in higher milk production. Herbage probing, however, remains\nlargely unused as it requires a time-intensive manual laboratory analysis.\nWithout this information, farmers are unable to perform localized clover sowing\nor take targeted fertilization decisions. Deep learning algorithms have been\nproposed with the goal of estimating the dry biomass composition from images of\nthe grass directly in the fields. The energy-intensive nature of deep learning,\nhowever, limits deployment to practical edge devices such as smartphones. This\npaper proposes to fill this gap by applying filter pruning to reduce the energy\nrequirement of existing deep learning solutions. We report that although pruned\nnetworks are accurate on controlled, high-quality images of the grass, they\nstruggle to generalize to real-world smartphone images that are blurry or taken\nfrom challenging angles. We address this challenge by training filter-pruned\nmodels using a variance attenuation loss so they can predict the uncertainty of\ntheir predictions. When the uncertainty exceeds a threshold, we re-infer using\na more accurate unpruned model. This hybrid approach allows us to reduce energy\nconsumption while retaining a high accuracy. We evaluate our algorithm on two\ndatasets, the GrassClover and the Irish clover, using an NVIDIA Jetson Nano edge\ndevice. We find that we reduce energy consumption with respect to\nstate-of-the-art solutions by 50% on average with only 4% accuracy loss.\n","authors":["Muhammad Zawish","Paul Albert","Flavio Esposito","Steven Davy","Lizy Abraham"],"pdf_url":"https://arxiv.org/pdf/2404.11230v1.pdf","comment":"The paper has been accepted to CVPR 2024 5th Workshop on Vision for\n Agriculture"},{"id":"http://arxiv.org/abs/2404.11226v1","updated":"2024-04-17T10:20:16Z","published":"2024-04-17T10:20:16Z","title":"Simple In-place Data Augmentation for Surveillance Object Detection","summary":" Motivated by the need to improve model performance in traffic monitoring\ntasks with limited labeled samples, we propose a straightforward augmentation\ntechnique tailored for object detection datasets, specifically designed for\nstationary camera-based applications. Our approach focuses on placing objects\nin the same positions as the originals to ensure its effectiveness. 
By applying\nin-place augmentation on objects from the same camera input image, we address\nthe challenge of overlapping with original and previously selected objects.\nThrough extensive testing on two traffic monitoring datasets, we illustrate the\nefficacy of our augmentation strategy in improving model performance,\nparticularly in scenarios with limited labeled samples and imbalanced class\ndistributions. Notably, our method achieves comparable performance to models\ntrained on the entire dataset while utilizing only 8.5 percent of the original\ndata. Moreover, we report significant improvements, with mAP@.5 increasing from\n0.4798 to 0.5025, and the mAP@.5:.95 rising from 0.29 to 0.3138 on the\nFishEye8K dataset. These results highlight the potential of our augmentation\napproach in enhancing object detection models for traffic monitoring\napplications.\n","authors":["Munkh-Erdene Otgonbold","Ganzorig Batnasan","Munkhjargal Gochoo"],"pdf_url":"https://arxiv.org/pdf/2404.11226v1.pdf","comment":"CVPR Workshop 2024"},{"id":"http://arxiv.org/abs/2404.11214v1","updated":"2024-04-17T09:58:53Z","published":"2024-04-17T09:58:53Z","title":"Feature Corrective Transfer Learning: End-to-End Solutions to Object\n Detection in Non-Ideal Visual Conditions","summary":" A significant challenge in the field of object detection lies in the system's\nperformance under non-ideal imaging conditions, such as rain, fog, low\nillumination, or raw Bayer images that lack ISP processing. Our study\nintroduces \"Feature Corrective Transfer Learning\", a novel approach that\nleverages transfer learning and a bespoke loss function to facilitate the\nend-to-end detection of objects in these challenging scenarios without the need\nto convert non-ideal images into their RGB counterparts. In our methodology, we\ninitially train a comprehensive model on a pristine RGB image dataset.\nSubsequently, non-ideal images are processed by comparing their feature maps\nagainst those from the initial ideal RGB model. This comparison employs the\nExtended Area Novel Structural Discrepancy Loss (EANSDL), a novel loss function\ndesigned to quantify similarities and integrate them into the detection loss.\nThis approach refines the model's ability to perform object detection across\nvarying conditions through direct feature map correction, encapsulating the\nessence of Feature Corrective Transfer Learning. Experimental validation on\nvariants of the KITTI dataset demonstrates a significant improvement in mean\nAverage Precision (mAP), resulting in a 3.8-8.1% relative enhancement in\ndetection under non-ideal conditions compared to the baseline model, and a less\nmarginal performance difference within 1.3% of the mAP@[0.5:0.95] achieved\nunder ideal conditions by the standard Faster RCNN algorithm.\n","authors":["Chuheng Wei","Guoyuan Wu","Matthew J. Barth"],"pdf_url":"https://arxiv.org/pdf/2404.11214v1.pdf","comment":"10 pages, 3 figures, accepted by 2024 CVPR UG2 Workshop"},{"id":"http://arxiv.org/abs/2311.10339v2","updated":"2024-04-17T09:50:25Z","published":"2023-11-17T05:49:50Z","title":"A2XP: Towards Private Domain Generalization","summary":" Deep Neural Networks (DNNs) have become pivotal in various fields, especially\nin computer vision, outperforming previous methodologies. A critical challenge\nin their deployment is the bias inherent in data across different domains, such\nas image style and environmental conditions, leading to domain gaps. 
This\nnecessitates techniques for learning general representations from biased\ntraining data, known as domain generalization. This paper presents Attend to\neXpert Prompts (A2XP), a novel approach for domain generalization that\npreserves the privacy and integrity of the network architecture. A2XP consists\nof two phases: Expert Adaptation and Domain Generalization. In the first phase,\nprompts for each source domain are optimized to guide the model towards the\noptimal direction. In the second phase, two embedder networks are trained to\neffectively amalgamate these expert prompts, aiming for an optimal output. Our\nextensive experiments demonstrate that A2XP achieves state-of-the-art results\nover existing non-private domain generalization methods. The experimental\nresults validate that the proposed approach not only tackles the domain\ngeneralization challenge in DNNs but also offers a privacy-preserving,\nefficient solution to the broader field of computer vision.\n","authors":["Geunhyeok Yu","Hyoseok Hwang"],"pdf_url":"https://arxiv.org/pdf/2311.10339v2.pdf","comment":"Accepted to CVPR 2024. Our code is available at\n https://github.com/AIRLABkhu/A2XP"},{"id":"http://arxiv.org/abs/2404.11209v1","updated":"2024-04-17T09:45:43Z","published":"2024-04-17T09:45:43Z","title":"Prompt-Guided Generation of Structured Chest X-Ray Report Using a\n Pre-trained LLM","summary":" Medical report generation automates radiology descriptions from images,\neasing the burden on physicians and minimizing errors. However, current methods\nlack structured outputs and physician interactivity for clear, clinically\nrelevant reports. Our method introduces a prompt-guided approach to generate\nstructured chest X-ray reports using a pre-trained large language model (LLM).\nFirst, we identify anatomical regions in chest X-rays to generate focused\nsentences that center on key visual elements, thereby establishing a structured\nreport foundation with anatomy-based sentences. We also convert the detected\nanatomy into textual prompts conveying anatomical comprehension to the LLM.\nAdditionally, the clinical context prompts guide the LLM to emphasize\ninteractivity and clinical requirements. By integrating anatomy-focused\nsentences and anatomy/clinical prompts, the pre-trained LLM can generate\nstructured chest X-ray reports tailored to prompted anatomical regions and\nclinical contexts. We evaluate using language generation and clinical\neffectiveness metrics, demonstrating strong performance.\n","authors":["Hongzhao Li","Hongyu Wang","Xia Sun","Hua He","Jun Feng"],"pdf_url":"https://arxiv.org/pdf/2404.11209v1.pdf","comment":"Accepted by IEEE Conference on Multimedia Expo 2024"},{"id":"http://arxiv.org/abs/2404.11207v1","updated":"2024-04-17T09:39:07Z","published":"2024-04-17T09:39:07Z","title":"Exploring the Transferability of Visual Prompting for Multimodal Large\n Language Models","summary":" Although Multimodal Large Language Models (MLLMs) have demonstrated promising\nversatile capabilities, their performance is still inferior to specialized\nmodels on downstream tasks, which makes adaptation necessary to enhance their\nutility. However, fine-tuning methods require independent training for every\nmodel, leading to huge computation and memory overheads. In this paper, we\npropose a novel setting where we aim to improve the performance of diverse\nMLLMs with a group of shared parameters optimized for a downstream task. 
To\nachieve this, we propose Transferable Visual Prompting (TVP), a simple and\neffective approach to generate visual prompts that can transfer to different\nmodels and improve their performance on downstream tasks after being trained on only\none model. We introduce two strategies to address the issue of cross-model\nfeature corruption of existing visual prompting methods and enhance the\ntransferability of the learned prompts, including 1) Feature Consistency\nAlignment, which imposes constraints on the prompted feature changes to\nmaintain task-agnostic knowledge; and 2) Task Semantics Enrichment, which\nencourages the prompted images to contain richer task-specific semantics with\nlanguage guidance. We validate the effectiveness of TVP through extensive\nexperiments with 6 modern MLLMs on a wide variety of tasks ranging from object\nrecognition and counting to multimodal reasoning and hallucination correction.\n","authors":["Yichi Zhang","Yinpeng Dong","Siyuan Zhang","Tianzan Min","Hang Su","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.11207v1.pdf","comment":"Accepted in CVPR 2024 as Poster (Highlight)"},{"id":"http://arxiv.org/abs/2404.11205v1","updated":"2024-04-17T09:37:25Z","published":"2024-04-17T09:37:25Z","title":"Kathakali Hand Gesture Recognition With Minimal Data","summary":" The Indian classical dance-drama Kathakali has a set of hand gestures called\nMudras, which form the fundamental units of all its dance moves and postures.\nRecognizing the depicted mudra becomes one of the first steps in its digital\nprocessing. The work treats the problem as a 24-class classification task and\nproposes a vector-similarity-based approach using pose estimation, eliminating\nthe need for further training or fine-tuning. This approach overcomes the\nchallenge of data scarcity that limits the application of AI in similar\ndomains. The method attains 92% accuracy, which is comparable to or better\nthan other model-training-based works existing in the domain, with\nthe added advantage that the method can still work with data sizes as small as\n1 or 5 samples with a slightly reduced performance. Working with images,\nvideos, and even real-time streams is possible. The system can work with\nhand-cropped or full-body images alike. We have developed and made public a\ndataset for Kathakali Mudra Recognition as part of this work.\n","authors":["Kavitha Raju","Nandini J. Warrier"],"pdf_url":"https://arxiv.org/pdf/2404.11205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11202v1","updated":"2024-04-17T09:33:31Z","published":"2024-04-17T09:33:31Z","title":"GhostNetV3: Exploring the Training Strategies for Compact Models","summary":" Compact neural networks are specially designed for applications on edge\ndevices with faster inference speed yet modest performance. However, training\nstrategies of compact models are borrowed from those of conventional models at\npresent, which ignores their difference in model capacity and thus may impede\nthe performance of compact models. In this paper, by systematically\ninvestigating the impact of different training ingredients, we introduce a\nstrong training strategy for compact models. We find that the appropriate\ndesigns of re-parameterization and knowledge distillation are crucial for\ntraining high-performance compact models, while some commonly used data\naugmentations for training conventional models, such as Mixup and CutMix, lead\nto worse performance. 
Our experiments on ImageNet-1K dataset demonstrate that\nour specialized training strategy for compact models is applicable to various\narchitectures, including GhostNetV2, MobileNetV2 and ShuffleNetV2.\nSpecifically, equipped with our strategy, GhostNetV3 1.3$\\times$ achieves a\ntop-1 accuracy of 79.1% with only 269M FLOPs and a latency of 14.46ms on mobile\ndevices, surpassing its ordinarily trained counterpart by a large margin.\nMoreover, our observation can also be extended to object detection scenarios.\nPyTorch code and checkpoints can be found at\nhttps://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/ghostnetv3_pytorch.\n","authors":["Zhenhua Liu","Zhiwei Hao","Kai Han","Yehui Tang","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02977v2","updated":"2024-04-17T09:09:17Z","published":"2023-10-04T17:12:18Z","title":"T$^3$Bench: Benchmarking Current Progress in Text-to-3D Generation","summary":" Recent methods in text-to-3D leverage powerful pretrained diffusion models to\noptimize NeRF. Notably, these methods are able to produce high-quality 3D\nscenes without training on 3D data. Due to the open-ended nature of the task,\nmost studies evaluate their results with subjective case studies and user\nexperiments, thereby presenting a challenge in quantitatively addressing the\nquestion: How has current progress in Text-to-3D gone so far? In this paper, we\nintroduce T$^3$Bench, the first comprehensive text-to-3D benchmark containing\ndiverse text prompts of three increasing complexity levels that are specially\ndesigned for 3D generation. To assess both the subjective quality and the text\nalignment, we propose two automatic metrics based on multi-view images produced\nby the 3D contents. The quality metric combines multi-view text-image scores\nand regional convolution to detect quality and view inconsistency. The\nalignment metric uses multi-view captioning and GPT-4 evaluation to measure\ntext-3D consistency. Both metrics closely correlate with different dimensions\nof human judgments, providing a paradigm for efficiently evaluating text-to-3D\nmodels. The benchmarking results, shown in Fig. 1, reveal performance\ndifferences among an extensive 10 prevalent text-to-3D methods. Our analysis\nfurther highlights the common struggles for current methods on generating\nsurroundings and multi-object scenes, as well as the bottleneck of leveraging\n2D guidance for 3D generation. Our project page is available at:\nhttps://t3bench.com.\n","authors":["Yuze He","Yushi Bai","Matthieu Lin","Wang Zhao","Yubin Hu","Jenny Sheng","Ran Yi","Juanzi Li","Yong-Jin Liu"],"pdf_url":"https://arxiv.org/pdf/2310.02977v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2311.18402v2","updated":"2024-04-17T08:57:35Z","published":"2023-11-30T09:51:53Z","title":"MV-CLIP: Multi-View CLIP for Zero-shot 3D Shape Recognition","summary":" Large-scale pre-trained models have demonstrated impressive performance in\nvision and language tasks within open-world scenarios. Due to the lack of\ncomparable pre-trained models for 3D shapes, recent methods utilize\nlanguage-image pre-training to realize zero-shot 3D shape recognition. However,\ndue to the modality gap, pretrained language-image models are not confident\nenough in the generalization to 3D shape recognition. 
Consequently, this paper\naims to improve the confidence with view selection and hierarchical prompts.\nLeveraging the CLIP model as an example, we employ view selection on the vision\nside by identifying views with high prediction confidence from multiple\nrendered views of a 3D shape. On the textual side, the strategy of hierarchical\nprompts is proposed for the first time. The first layer prompts several\nclassification candidates with traditional class-level descriptions, while the\nsecond layer refines the prediction based on function-level descriptions or\nfurther distinctions between the candidates. Remarkably, without the need for\nadditional training, our proposed method achieves impressive zero-shot 3D\nclassification accuracies of 84.44%, 91.51%, and 66.17% on ModelNet40,\nModelNet10, and ShapeNet Core55, respectively. Furthermore, we will make the\ncode publicly available to facilitate reproducibility and further research in\nthis area.\n","authors":["Dan Song","Xinwei Fu","Weizhi Nie","Wenhui Li","Lanjun Wang","You Yang","Anan Liu"],"pdf_url":"https://arxiv.org/pdf/2311.18402v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10710v2","updated":"2024-04-17T08:44:30Z","published":"2024-04-16T16:36:50Z","title":"Dual Modalities of Text: Visual and Textual Generative Pre-training","summary":" Harnessing visual texts represents a burgeoning frontier in the evolution of\nlanguage modeling. In this paper, we introduce a novel pre-training framework\nfor a suite of pixel-based autoregressive language models, pre-training on a\ncorpus of over 400 million documents rendered as RGB images. Our approach is\ncharacterized by a dual-modality training regimen, engaging both visual data\nthrough next patch prediction with a regression head and textual data via next\ntoken prediction with a classification head. This study is particularly focused\non investigating the synergistic interplay between visual and textual\nmodalities of language. Our comprehensive evaluation across a diverse array of\nbenchmarks reveals that the confluence of visual and textual data substantially\naugments the efficacy of pixel-based language models. Notably, our findings\nshow that a unidirectional pixel-based model, devoid of textual data during\ntraining, can match the performance levels of advanced bidirectional\npixel-based models on various language understanding benchmarks. This work\nhighlights the considerable untapped potential of integrating visual and\ntextual information for language modeling purposes. We will release our code,\ndata, and checkpoints to inspire further research advancement.\n","authors":["Yekun Chai","Qingyi Liu","Jingwu Xiao","Shuohuan Wang","Yu Sun","Hua Wu"],"pdf_url":"https://arxiv.org/pdf/2404.10710v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00311v3","updated":"2024-04-17T08:40:57Z","published":"2023-12-01T03:05:21Z","title":"3D Face Reconstruction with the Geometric Guidance of Facial Part\n Segmentation","summary":" 3D Morphable Models (3DMMs) provide promising 3D face reconstructions in\nvarious applications. However, existing methods struggle to reconstruct faces\nwith extreme expressions due to deficiencies in supervisory signals, such as\nsparse or inaccurate landmarks. Segmentation information contains effective\ngeometric contexts for face reconstruction. 
Certain attempts intuitively depend\non differentiable renderers to compare the rendered silhouettes of\nreconstruction with segmentation, which is prone to issues like local optima\nand gradient instability. In this paper, we fully utilize the facial part\nsegmentation geometry by introducing Part Re-projection Distance Loss (PRDL).\nSpecifically, PRDL transforms facial part segmentation into 2D points and\nre-projects the reconstruction onto the image plane. Subsequently, by\nintroducing grid anchors and computing different statistical distances from\nthese anchors to the point sets, PRDL establishes geometry descriptors to\noptimize the distribution of the point sets for face reconstruction. PRDL\nexhibits a clear gradient compared to the renderer-based methods and presents\nstate-of-the-art reconstruction performance in extensive quantitative and\nqualitative experiments. Our project is available at\nhttps://github.com/wang-zidu/3DDFA-V3 .\n","authors":["Zidu Wang","Xiangyu Zhu","Tianshuo Zhang","Baiqin Wang","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2312.00311v3.pdf","comment":"CVPR2024 (Highlight)"},{"id":"http://arxiv.org/abs/2312.08555v2","updated":"2024-04-17T08:38:54Z","published":"2023-12-13T23:00:48Z","title":"KDAS: Knowledge Distillation via Attention Supervision Framework for\n Polyp Segmentation","summary":" Polyp segmentation, a contentious issue in medical imaging, has seen numerous\nproposed methods aimed at improving the quality of segmented masks. While\ncurrent state-of-the-art techniques yield impressive results, the size and\ncomputational cost of these models create challenges for practical industry\napplications. To address this challenge, we present KDAS, a Knowledge\nDistillation framework that incorporates attention supervision, and our\nproposed Symmetrical Guiding Module. This framework is designed to facilitate a\ncompact student model with fewer parameters, allowing it to learn the strengths\nof the teacher model and mitigate the inconsistency between teacher features\nand student features, a common challenge in Knowledge Distillation, via the\nSymmetrical Guiding Module. Through extensive experiments, our compact models\ndemonstrate their strength by achieving competitive results with\nstate-of-the-art methods, offering a promising approach to creating compact\nmodels with high accuracy for polyp segmentation and in the medical imaging\nfield. The implementation is available on https://github.com/huyquoctrinh/KDAS.\n","authors":["Quoc-Huy Trinh","Minh-Van Nguyen","Phuoc-Thao Vo Thi"],"pdf_url":"https://arxiv.org/pdf/2312.08555v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11161v1","updated":"2024-04-17T08:21:02Z","published":"2024-04-17T08:21:02Z","title":"Pre-processing matters: A segment search method for WSI classification","summary":" Pre-processing for whole slide images can affect classification performance\nboth in the training and inference stages. Our study analyzes the impact of\npre-processing parameters on inference and training across single- and\nmultiple-domain datasets. However, searching for an optimal parameter set is\ntime-consuming. To overcome this, we propose a novel Similarity-based Simulated\nAnnealing approach for fast parameter tuning to enhance inference performance\non single-domain data. Our method demonstrates significant performance\nimprovements in accuracy, which raise accuracy from 0.512 to 0.847 in a single\ndomain. 
We further extend our insight into training performance in multi-domain\ndata by employing a novel Bayesian optimization to search optimal\npre-processing parameters, resulting in a high AUC of 0.967. We highlight that\nbetter pre-processing for WSI can contribute to further accuracy improvement in\nthe histology area.\n","authors":["Jun Wang","Yufei Cui","Yu Mao","Nan Guan","Chun Jason Xue"],"pdf_url":"https://arxiv.org/pdf/2404.11161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11159v1","updated":"2024-04-17T08:15:25Z","published":"2024-04-17T08:15:25Z","title":"Deep Portrait Quality Assessment. A NTIRE 2024 Challenge Survey","summary":" This paper reviews the NTIRE 2024 Portrait Quality Assessment Challenge,\nhighlighting the proposed solutions and results. This challenge aims to obtain\nan efficient deep neural network capable of estimating the perceptual quality\nof real portrait photos. The methods must generalize to diverse scenes and\ndiverse lighting conditions (indoor, outdoor, low-light), movement, blur, and\nother challenging conditions. In the challenge, 140 participants registered,\nand 35 submitted results during the challenge period. The performance of the\ntop 5 submissions is reviewed and provided here as a gauge for the current\nstate-of-the-art in Portrait Quality Assessment.\n","authors":["Nicolas Chahine","Marcos V. Conde","Daniela Carfora","Gabriel Pacianotto","Benoit Pochon","Sira Ferradans","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2404.11159v1.pdf","comment":"CVPRW - NTIRE 2024"},{"id":"http://arxiv.org/abs/2404.11156v1","updated":"2024-04-17T08:09:25Z","published":"2024-04-17T08:09:25Z","title":"Learning SO(3)-Invariant Semantic Correspondence via Local Shape\n Transform","summary":" Establishing accurate 3D correspondences between shapes stands as a pivotal\nchallenge with profound implications for computer vision and robotics. However,\nexisting self-supervised methods for this problem assume perfect input shape\nalignment, restricting their real-world applicability. In this work, we\nintroduce a novel self-supervised Rotation-Invariant 3D correspondence learner\nwith Local Shape Transform, dubbed RIST, that learns to establish dense\ncorrespondences between shapes even under challenging intra-class variations\nand arbitrary orientations. Specifically, RIST learns to dynamically formulate\nan SO(3)-invariant local shape transform for each point, which maps the\nSO(3)-equivariant global shape descriptor of the input shape to a local shape\ndescriptor. These local shape descriptors are provided as inputs to our decoder\nto facilitate point cloud self- and cross-reconstruction. Our proposed\nself-supervised training pipeline encourages semantically corresponding points\nfrom different shapes to be mapped to similar local shape descriptors, enabling\nRIST to establish dense point-wise correspondences. 
RIST demonstrates\nstate-of-the-art performances on 3D part label transfer and semantic keypoint\ntransfer given arbitrarily rotated point cloud pairs, outperforming existing\nmethods by significant margins.\n","authors":["Chunghyun Park","Seungwook Sim","Jaesik Park","Minsu Cho"],"pdf_url":"https://arxiv.org/pdf/2404.11156v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.11155v1","updated":"2024-04-17T08:08:34Z","published":"2024-04-17T08:08:34Z","title":"HybriMap: Hybrid Clues Utilization for Effective Vectorized HD Map\n Construction","summary":" Constructing vectorized high-definition maps from surround-view cameras has\ngarnered significant attention in recent years. However, the commonly employed\nmulti-stage sequential workflow in prevailing approaches often leads to the\nloss of early-stage information, particularly in perspective-view features.\nUsually, such loss is observed as an instance missing or shape mismatching in\nthe final birds-eye-view predictions. To address this concern, we propose a\nnovel approach, namely \\textbf{HybriMap}, which effectively exploits clues from\nhybrid features to ensure the delivery of valuable information. Specifically,\nwe design the Dual Enhancement Module, to enable both explicit integration and\nimplicit modification under the guidance of hybrid features. Additionally, the\nperspective keypoints are utilized as supervision, further directing the\nfeature enhancement process. Extensive experiments conducted on existing\nbenchmarks have demonstrated the state-of-the-art performance of our proposed\napproach.\n","authors":["Chi Zhang","Qi Song","Feifei Li","Yongquan Chen","Rui Huang"],"pdf_url":"https://arxiv.org/pdf/2404.11155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07773v2","updated":"2024-04-17T08:06:51Z","published":"2024-04-11T14:08:45Z","title":"ConsistencyDet: A Robust Object Detector with a Denoising Paradigm of\n Consistency Model","summary":" Object detection, a quintessential task in the realm of perceptual computing,\ncan be tackled using a generative methodology. In the present study, we\nintroduce a novel framework designed to articulate object detection as a\ndenoising diffusion process, which operates on the perturbed bounding boxes of\nannotated entities. This framework, termed ConsistencyDet, leverages an\ninnovative denoising concept known as the Consistency Model. The hallmark of\nthis model is its self-consistency feature, which empowers the model to map\ndistorted information from any temporal stage back to its pristine state,\nthereby realizing a \"one-step denoising\" mechanism. Such an attribute markedly\nelevates the operational efficiency of the model, setting it apart from the\nconventional Diffusion Model. Throughout the training phase, ConsistencyDet\ninitiates the diffusion sequence with noise-infused boxes derived from the\nground-truth annotations and conditions the model to perform the denoising\ntask. Subsequently, in the inference stage, the model employs a denoising\nsampling strategy that commences with bounding boxes randomly sampled from a\nnormal distribution. Through iterative refinement, the model transforms an\nassortment of arbitrarily generated boxes into definitive detections.\nComprehensive evaluations employing standard benchmarks, such as MS-COCO and\nLVIS, corroborate that ConsistencyDet surpasses other leading-edge detectors in\nperformance metrics. 
Our code is available at\nhttps://github.com/Tankowa/ConsistencyDet.\n","authors":["Lifan Jiang","Zhihui Wang","Changmiao Wang","Ming Li","Jiaxu Leng","Xindong Wu"],"pdf_url":"https://arxiv.org/pdf/2404.07773v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11152v1","updated":"2024-04-17T08:05:04Z","published":"2024-04-17T08:05:04Z","title":"Multi-target and multi-stage liver lesion segmentation and detection in\n multi-phase computed tomography scans","summary":" Multi-phase computed tomography (CT) scans use contrast agents to highlight\ndifferent anatomical structures within the body to improve the probability of\nidentifying and detecting anatomical structures of interest and abnormalities\nsuch as liver lesions. Yet, detecting these lesions remains a challenging task\nas these lesions vary significantly in their size, shape, texture, and contrast\nwith respect to surrounding tissue. Therefore, radiologists need to have\nextensive experience to be able to identify and detect these lesions.\nSegmentation-based neural networks can assist radiologists with this task.\nCurrent state-of-the-art lesion segmentation networks use the encoder-decoder\ndesign paradigm based on the UNet architecture where the multi-phase CT scan\nvolume is fed to the network as a multi-channel input. Although this approach\nutilizes information from all the phases and outperforms single-phase\nsegmentation networks, we demonstrate that their performance is not optimal and\ncan be further improved by incorporating the learning from models trained on\neach single phase individually. Our approach comprises three stages. The first\nstage identifies the regions within the liver where there might be lesions at\nthree different scales (4, 8, and 16 mm). The second stage includes the main\nsegmentation model trained using all the phases as well as a segmentation model\ntrained on each of the phases individually. The third stage uses the\nmulti-phase CT volumes together with the predictions from each of the\nsegmentation models to generate the final segmentation map. Overall, our\napproach improves relative liver lesion segmentation performance by 1.6% while\nreducing performance variability across subjects by 8% when compared to the\ncurrent state-of-the-art models.\n","authors":["Abdullah F. Al-Battal","Soan T. M. Duong","Van Ha Tang","Quang Duc Tran","Steven Q. H. Truong","Chien Phan","Truong Q. Nguyen","Cheolhong An"],"pdf_url":"https://arxiv.org/pdf/2404.11152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11151v1","updated":"2024-04-17T08:01:55Z","published":"2024-04-17T08:01:55Z","title":"REACTO: Reconstructing Articulated Objects from a Single Video","summary":" In this paper, we address the challenge of reconstructing general articulated\n3D objects from a single video. Existing works employing dynamic neural\nradiance fields have advanced the modeling of articulated objects like humans\nand animals from videos, but face challenges with piece-wise rigid general\narticulated objects due to limitations in their deformation models. To tackle\nthis, we propose Quasi-Rigid Blend Skinning, a novel deformation model that\nenhances the rigidity of each part while maintaining flexible deformation of\nthe joints. 
Our primary insight combines three distinct approaches: 1) an\nenhanced bone rigging system for improved component modeling, 2) the use of\nquasi-sparse skinning weights to boost part rigidity and reconstruction\nfidelity, and 3) the application of geodesic point assignment for precise\nmotion and seamless deformation. Our method outperforms previous works in\nproducing higher-fidelity 3D reconstructions of general articulated objects, as\ndemonstrated on both real and synthetic datasets. Project page:\nhttps://chaoyuesong.github.io/REACTO.\n","authors":["Chaoyue Song","Jiacheng Wei","Chuan-Sheng Foo","Guosheng Lin","Fayao Liu"],"pdf_url":"https://arxiv.org/pdf/2404.11151v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09326v2","updated":"2024-04-17T07:46:28Z","published":"2024-04-14T18:57:38Z","title":"Weight Copy and Low-Rank Adaptation for Few-Shot Distillation of Vision\n Transformers","summary":" Few-shot knowledge distillation recently emerged as a viable approach to\nharness the knowledge of large-scale pre-trained models, using limited data and\ncomputational resources. In this paper, we propose a novel few-shot feature\ndistillation approach for vision transformers. Our approach is based on two key\nsteps. Leveraging the fact that vision transformers have a consistent\ndepth-wise structure, we first copy the weights from intermittent layers of\nexisting pre-trained vision transformers (teachers) into shallower\narchitectures (students), where the intermittence factor controls the\ncomplexity of the student transformer with respect to its teacher. Next, we\nemploy an enhanced version of Low-Rank Adaptation (LoRA) to distill knowledge\ninto the student in a few-shot scenario, aiming to recover the information\nprocessing carried out by the skipped teacher layers. We present comprehensive\nexperiments with supervised and self-supervised transformers as teachers, on\nfive data sets from various domains, including natural, medical and satellite\nimages. The empirical results confirm the superiority of our approach over\ncompetitive baselines. Moreover, the ablation results demonstrate the\nusefulness of each component of the proposed pipeline.\n","authors":["Diana-Nicoleta Grigore","Mariana-Iuliana Georgescu","Jon Alvarez Justo","Tor Johansen","Andreea Iuliana Ionescu","Radu Tudor Ionescu"],"pdf_url":"https://arxiv.org/pdf/2404.09326v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03788v2","updated":"2024-04-17T07:41:48Z","published":"2024-01-08T10:08:48Z","title":"Low-light Image Enhancement via CLIP-Fourier Guided Wavelet Diffusion","summary":" Low-light image enhancement techniques have significantly progressed, but\nunstable image quality recovery and unsatisfactory visual perception are still\nsignificant challenges. To solve these problems, we propose a novel and robust\nlow-light image enhancement method via CLIP-Fourier Guided Wavelet Diffusion,\nabbreviated as CFWD. Specifically, CFWD leverages multimodal visual-language\ninformation in the frequency domain space created by multiple wavelet\ntransforms to guide the enhancement process. Multi-scale supervision across\ndifferent modalities facilitates the alignment of image features with semantic\nfeatures during the wavelet diffusion process, effectively bridging the gap\nbetween degraded and normal domains. 
Moreover, to further promote the effective\nrecovery of the image details, we combine the Fourier transform based on the\nwavelet transform and construct a Hybrid High Frequency Perception Module\n(HFPM) with a significant perception of the detailed features. This module\navoids the diversity confusion of the wavelet diffusion process by guiding the\nfine-grained structure recovery of the enhancement results to achieve\nfavourable metric and perceptually oriented enhancement. Extensive quantitative\nand qualitative experiments on publicly available real-world benchmarks show\nthat our approach outperforms existing state-of-the-art methods, achieving\nsignificant progress in image quality and noise suppression. The project code\nis available at https://github.com/hejh8/CFWD.\n","authors":["Minglong Xue","Jinhong He","Wenhai Wang","Mingliang Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.03788v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08926v2","updated":"2024-04-17T07:38:32Z","published":"2024-04-13T08:27:10Z","title":"Diffusion Models Meet Remote Sensing: Principles, Methods, and\n Perspectives","summary":" As a newly emerging advance in deep generative models, diffusion models have\nachieved state-of-the-art results in many fields, including computer vision,\nnatural language processing, and molecule design. The remote sensing community\nhas also noticed the powerful ability of diffusion models and quickly applied\nthem to a variety of tasks for image processing. Given the rapid increase in\nresearch on diffusion models in the field of remote sensing, it is necessary to\nconduct a comprehensive review of existing diffusion model-based remote sensing\npapers, to help researchers recognize the potential of diffusion models and\nprovide some directions for further exploration. Specifically, this paper first\nintroduces the theoretical background of diffusion models, and then\nsystematically reviews the applications of diffusion models in remote sensing,\nincluding image generation, enhancement, and interpretation. Finally, the\nlimitations of existing remote sensing diffusion models and worthy research\ndirections for further exploration are discussed and summarized.\n","authors":["Yidan Liu","Jun Yue","Shaobo Xia","Pedram Ghamisi","Weiying Xie","Leyuan Fang"],"pdf_url":"https://arxiv.org/pdf/2404.08926v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11139v1","updated":"2024-04-17T07:34:21Z","published":"2024-04-17T07:34:21Z","title":"GeoReF: Geometric Alignment Across Shape Variation for Category-level\n Object Pose Refinement","summary":" Object pose refinement is essential for robust object pose estimation.\nPrevious work has made significant progress towards instance-level object pose\nrefinement. Yet, category-level pose refinement is a more challenging problem\ndue to large shape variations within a category and the discrepancies between\nthe target object and the shape prior. To address these challenges, we\nintroduce a novel architecture for category-level object pose refinement. Our\napproach integrates an HS-layer and learnable affine transformations, which\naims to enhance the extraction and alignment of geometric information.\nAdditionally, we introduce a cross-cloud transformation mechanism that\nefficiently merges diverse data sources. Finally, we push the limits of our\nmodel by incorporating the shape prior information for translation and size\nerror prediction. We conducted extensive experiments to demonstrate the\neffectiveness of the proposed framework. 
Through extensive quantitative\nexperiments, we demonstrate significant improvement over the baseline method by\na large margin across all metrics.\n","authors":["Linfang Zheng","Tze Ho Elden Tse","Chen Wang","Yinghan Sun","Hua Chen","Ales Leonardis","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.11139v1.pdf","comment":"The IEEE/CVF Conference on Computer Vision and Pattern Recognition\n 2024"},{"id":"http://arxiv.org/abs/2404.11129v1","updated":"2024-04-17T07:20:56Z","published":"2024-04-17T07:20:56Z","title":"Fact :Teaching MLLMs with Faithful, Concise and Transferable Rationales","summary":" The remarkable performance of Multimodal Large Language Models (MLLMs) has\nunequivocally demonstrated their proficient understanding capabilities in\nhandling a wide array of visual tasks. Nevertheless, the opaque nature of their\nblack-box reasoning processes persists as an enigma, rendering them\nuninterpretable and struggling with hallucination. Their ability to execute\nintricate compositional reasoning tasks is also constrained, culminating in a\nstagnation of learning progression for these models. In this work, we introduce\nFact, a novel paradigm designed to generate multimodal rationales that are\nfaithful, concise, and transferable for teaching MLLMs. This paradigm utilizes\nverifiable visual programming to generate executable code guaranteeing\nfaithfulness and precision. Subsequently, through a series of operations\nincluding pruning, merging, and bridging, the rationale enhances its\nconciseness. Furthermore, we filter rationales that can be transferred to\nend-to-end paradigms from programming paradigms to guarantee transferability.\nEmpirical evidence from experiments demonstrates the superiority of our method\nacross models of varying parameter sizes, significantly enhancing their\ncompositional reasoning and generalization ability. Our approach also reduces\nhallucinations owing to its high correlation between images and text.\n","authors":["Minghe Gao","Shuang Chen","Liang Pang","Yuan Yao","Jisheng Dang","Wenqiao Zhang","Juncheng Li","Siliang Tang","Yueting Zhuang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2404.11129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11127v1","updated":"2024-04-17T07:17:47Z","published":"2024-04-17T07:17:47Z","title":"D-Aug: Enhancing Data Augmentation for Dynamic LiDAR Scenes","summary":" Creating large LiDAR datasets with pixel-level labeling poses significant\nchallenges. While numerous data augmentation methods have been developed to\nreduce the reliance on manual labeling, these methods predominantly focus on\nstatic scenes and they overlook the importance of data augmentation for dynamic\nscenes, which is critical for autonomous driving. To address this issue, we\npropose D-Aug, a LiDAR data augmentation method tailored for augmenting dynamic\nscenes. D-Aug extracts objects and inserts them into dynamic scenes,\nconsidering the continuity of these objects across consecutive frames. For\nseamless insertion into dynamic scenes, we propose a reference-guided method\nthat involves dynamic collision detection and rotation alignment. Additionally,\nwe present a pixel-level road identification strategy to efficiently determine\nsuitable insertion positions. We validated our method using the nuScenes\ndataset with various 3D detection and tracking methods. 
Comparative experiments\ndemonstrate the superiority of D-Aug.\n","authors":["Jiaxing Zhao","Peng Zheng","Rui Ma"],"pdf_url":"https://arxiv.org/pdf/2404.11127v1.pdf","comment":"4pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.02562v2","updated":"2024-04-17T07:13:27Z","published":"2024-04-03T08:33:08Z","title":"Representation Alignment Contrastive Regularization for Multi-Object\n Tracking","summary":" Achieving high-performance in multi-object tracking algorithms heavily relies\non modeling spatio-temporal relationships during the data association stage.\nMainstream approaches encompass rule-based and deep learning-based methods for\nspatio-temporal relationship modeling. While the former relies on physical\nmotion laws, offering wider applicability but yielding suboptimal results for\ncomplex object movements, the latter, though achieving high-performance, lacks\ninterpretability and involves complex module designs. This work aims to\nsimplify deep learning-based spatio-temporal relationship models and introduce\ninterpretability into features for data association. Specifically, a\nlightweight single-layer transformer encoder is utilized to model\nspatio-temporal relationships. To make features more interpretative, two\ncontrastive regularization losses based on representation alignment are\nproposed, derived from spatio-temporal consistency rules. By applying weighted\nsummation to affinity matrices, the aligned features can seamlessly integrate\ninto the data association stage of the original tracking workflow. Experimental\nresults showcase that our model enhances the majority of existing tracking\nnetworks' performance without excessive complexity, with minimal increase in\ntraining overhead and nearly negligible computational and storage costs.\n","authors":["Zhonglin Liu","Shujie Chen","Jianfeng Dong","Xun Wang","Di Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.02562v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11120v1","updated":"2024-04-17T07:08:38Z","published":"2024-04-17T07:08:38Z","title":"TiNO-Edit: Timestep and Noise Optimization for Robust Diffusion-Based\n Image Editing","summary":" Despite many attempts to leverage pre-trained text-to-image models (T2I) like\nStable Diffusion (SD) for controllable image editing, producing good\npredictable results remains a challenge. Previous approaches have focused on\neither fine-tuning pre-trained T2I models on specific datasets to generate\ncertain kinds of images (e.g., with a specific object or person), or on\noptimizing the weights, text prompts, and/or learning features for each input\nimage in an attempt to coax the image generator to produce the desired result.\nHowever, these approaches all have shortcomings and fail to produce good\nresults in a predictable and controllable manner. To address this problem, we\npresent TiNO-Edit, an SD-based method that focuses on optimizing the noise\npatterns and diffusion timesteps during editing, something previously\nunexplored in the literature. With this simple change, we are able to generate\nresults that both better align with the original images and reflect the desired\nresult. Furthermore, we propose a set of new loss functions that operate in the\nlatent domain of SD, greatly speeding up the optimization when compared to\nprior approaches, which operate in the pixel domain. Our method can be easily\napplied to variations of SD including Textual Inversion and DreamBooth that\nencode new concepts and incorporate them into the edited results. 
We present a\nhost of image-editing capabilities enabled by our approach. Our code is\npublicly available at https://github.com/SherryXTChen/TiNO-Edit.\n","authors":["Sherry X. Chen","Yaron Vaxman","Elad Ben Baruch","David Asulin","Aviad Moreshet","Kuo-Chin Lien","Misha Sra","Pradeep Sen"],"pdf_url":"https://arxiv.org/pdf/2404.11120v1.pdf","comment":"Conference on Computer Vision and Pattern Recognition (CVPR) 2024"},{"id":"http://arxiv.org/abs/2404.11118v1","updated":"2024-04-17T07:06:22Z","published":"2024-04-17T07:06:22Z","title":"MHLR: Moving Haar Learning Rate Scheduler for Large-scale Face\n Recognition Training with One GPU","summary":" Face recognition (FR) has seen significant advancements due to the\nutilization of large-scale datasets. Training deep FR models on large-scale\ndatasets with multiple GPUs is now a common practice. In fact, computing power\nhas evolved into a foundational and indispensable resource in the area of deep\nlearning. It is nearly impossible to train a deep FR model without holding\nadequate hardware resources. Recognizing this challenge, some FR approaches\nhave started exploring ways to reduce the time complexity of the\nfully-connected layer in FR models. Unlike other approaches, this paper\nintroduces a simple yet highly effective approach, Moving Haar Learning Rate\n(MHLR) scheduler, for scheduling the learning rate promptly and accurately in\nthe training process. MHLR supports large-scale FR training with only one GPU,\nwhich is able to accelerate the model to 1/4 of its original training time\nwithout sacrificing more than 1% accuracy. More specifically, MHLR only needs\n$30$ hours to train the model ResNet100 on the dataset WebFace12M containing\nmore than 12M face images with 0.6M identities. Extensive experiments validate\nthe efficiency and effectiveness of MHLR.\n","authors":["Xueyuan Gong","Yain-whar Si","Zheng Zhang","Xiaochen Yuan","Ke Wang","Xinyuan Zhang","Cong Lin","Xiaoxiang Liu"],"pdf_url":"https://arxiv.org/pdf/2404.11118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11111v1","updated":"2024-04-17T06:57:57Z","published":"2024-04-17T06:57:57Z","title":"CorrNet+: Sign Language Recognition and Translation via Spatial-Temporal\n Correlation","summary":" In sign language, the conveyance of human body trajectories predominantly\nrelies upon the coordinated movements of hands and facial expressions across\nsuccessive frames. Despite the recent advancements of sign language\nunderstanding methods, they often solely focus on individual frames, inevitably\noverlooking the inter-frame correlations that are essential for effectively\nmodeling human body trajectories. To address this limitation, this paper\nintroduces a spatial-temporal correlation network, denoted as CorrNet+, which\nexplicitly identifies body trajectories across multiple frames. In specific,\nCorrNet+ employs a correlation module and an identification module to build\nhuman body trajectories. Afterwards, a temporal attention module is followed to\nadaptively evaluate the contributions of different frames. The resultant\nfeatures offer a holistic perspective on human body movements, facilitating a\ndeeper understanding of sign language. As a unified model, CorrNet+ achieves\nnew state-of-the-art performance on two extensive sign language understanding\ntasks, including continuous sign language recognition (CSLR) and sign language\ntranslation (SLT). 
Especially, CorrNet+ surpasses previous methods equipped\nwith resource-intensive pose-estimation networks or pre-extracted heatmaps for\nhand and facial feature extraction. Compared with CorrNet, CorrNet+ achieves a\nsignificant performance boost across all benchmarks while halving the\ncomputational overhead. A comprehensive comparison with previous\nspatial-temporal reasoning methods verifies the superiority of CorrNet+. Code\nis available at https://github.com/hulianyuyy/CorrNet_Plus.\n","authors":["Lianyu Hu","Wei Feng","Liqing Gao","Zekang Liu","Liang Wan"],"pdf_url":"https://arxiv.org/pdf/2404.11111v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.03202"},{"id":"http://arxiv.org/abs/2404.11108v1","updated":"2024-04-17T06:47:17Z","published":"2024-04-17T06:47:17Z","title":"LADDER: An Efficient Framework for Video Frame Interpolation","summary":" Video Frame Interpolation (VFI) is a crucial technique in various\napplications such as slow-motion generation, frame rate conversion, video frame\nrestoration etc. This paper introduces an efficient video frame interpolation\nframework that aims to strike a favorable balance between efficiency and\nquality. Our framework follows a general paradigm consisting of a flow\nestimator and a refinement module, while incorporating carefully designed\ncomponents. First of all, we adopt depth-wise convolution with large kernels in\nthe flow estimator that simultaneously reduces the parameters and enhances the\nreceptive field for encoding rich context and handling complex motion.\nSecondly, diverging from a common design for the refinement module with a\nUNet-structure (encoder-decoder structure), which we find redundant, our\ndecoder-only refinement module directly enhances the result from coarse to fine\nfeatures, offering a more efficient process. In addition, to address the\nchallenge of handling high-definition frames, we also introduce an innovative\nHD-aware augmentation strategy during training, leading to consistent\nenhancement on HD images. Extensive experiments are conducted on diverse\ndatasets, Vimeo90K, UCF101, Xiph and SNU-FILM. The results demonstrate that our\napproach achieves state-of-the-art performance with clear improvement while\nrequiring much less FLOPs and parameters, reaching to a better spot for\nbalancing efficiency and quality.\n","authors":["Tong Shen","Dong Li","Ziheng Gao","Lu Tian","Emad Barsoum"],"pdf_url":"https://arxiv.org/pdf/2404.11108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11104v1","updated":"2024-04-17T06:40:47Z","published":"2024-04-17T06:40:47Z","title":"Object Remover Performance Evaluation Methods using Class-wise Object\n Removal Images","summary":" Object removal refers to the process of erasing designated objects from an\nimage while preserving the overall appearance, and it is one area where image\ninpainting is widely used in real-world applications. The performance of an\nobject remover is quantitatively evaluated by measuring the quality of object\nremoval results, similar to how the performance of an image inpainter is\ngauged. Current works reporting quantitative performance evaluations utilize\noriginal images as references. In this letter, to validate the current\nevaluation methods cannot properly evaluate the performance of an object\nremover, we create a dataset with object removal ground truth and compare the\nevaluations made by the current methods using original images to those\nutilizing object removal ground truth images. 
The disparities between two\nevaluation sets validate that the current methods are not suitable for\nmeasuring the performance of an object remover. Additionally, we propose new\nevaluation methods tailored to gauge the performance of an object remover. The\nproposed methods evaluate the performance through class-wise object removal\nresults and utilize images without the target class objects as a comparison\nset. We confirm that the proposed methods can make judgments consistent with\nhuman evaluators in the COCO dataset, and that they can produce measurements\naligning with those using object removal ground truth in the self-acquired\ndataset.\n","authors":["Changsuk Oh","Dongseok Shim","Taekbeom Lee","H. Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2404.11104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11100v1","updated":"2024-04-17T06:36:17Z","published":"2024-04-17T06:36:17Z","title":"Synthesizing Realistic Data for Table Recognition","summary":" To overcome the limitations and challenges of current automatic table data\nannotation methods and random table data synthesis approaches, we propose a\nnovel method for synthesizing annotation data specifically designed for table\nrecognition. This method utilizes the structure and content of existing complex\ntables, facilitating the efficient creation of tables that closely replicate\nthe authentic styles found in the target domain. By leveraging the actual\nstructure and content of tables from Chinese financial announcements, we have\ndeveloped the first extensive table annotation dataset in this domain. We used\nthis dataset to train several recent deep learning-based end-to-end table\nrecognition models. Additionally, we have established the inaugural benchmark\nfor real-world complex tables in the Chinese financial announcement domain,\nusing it to assess the performance of models trained on our synthetic data,\nthereby effectively validating our method's practicality and effectiveness.\nFurthermore, we applied our synthesis method to augment the FinTabNet dataset,\nextracted from English financial announcements, by increasing the proportion of\ntables with multiple spanning cells to introduce greater complexity. Our\nexperiments show that models trained on this augmented dataset achieve\ncomprehensive improvements in performance, especially in the recognition of\ntables with multiple spanning cells.\n","authors":["Qiyu Hou","Jun Wang","Meixuan Qiao","Lujun Tian"],"pdf_url":"https://arxiv.org/pdf/2404.11100v1.pdf","comment":"ICDAR 2024"},{"id":"http://arxiv.org/abs/2404.11098v1","updated":"2024-04-17T06:32:42Z","published":"2024-04-17T06:32:42Z","title":"LAPTOP-Diff: Layer Pruning and Normalized Distillation for Compressing\n Diffusion Models","summary":" In the era of AIGC, the demand for low-budget or even on-device applications\nof diffusion models emerged. In terms of compressing the Stable Diffusion\nmodels (SDMs), several approaches have been proposed, and most of them\nleveraged the handcrafted layer removal methods to obtain smaller U-Nets, along\nwith knowledge distillation to recover the network performance. However, such a\nhandcrafting manner of layer removal is inefficient and lacks scalability and\ngeneralization, and the feature distillation employed in the retraining phase\nfaces an imbalance issue that a few numerically significant feature loss terms\ndominate over others throughout the retraining process. 
To this end, we\npropose the layer pruning and normalized distillation for compressing\ndiffusion models (LAPTOP-Diff). We 1) introduce the layer pruning method to\ncompress SDM's U-Net automatically and propose an effective one-shot pruning\ncriterion whose one-shot performance is guaranteed by its good additivity\nproperty, surpassing other layer pruning and handcrafted layer removal methods,\nand 2) propose the normalized feature distillation for retraining, which alleviates the\nimbalance issue. Using the proposed LAPTOP-Diff, we compressed the U-Nets of\nSDXL and SDM-v1.5 for the most advanced performance, achieving a minimal 4.0%\ndecline in PickScore at a pruning ratio of 50% while the comparative methods'\nminimal PickScore decline is 8.2%. We will release our code.\n","authors":["Dingkun Zhang","Sijia Li","Chen Chen","Qingsong Xie","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2404.11098v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10312v2","updated":"2024-04-17T06:30:00Z","published":"2024-04-16T06:39:37Z","title":"OmniSSR: Zero-shot Omnidirectional Image Super-Resolution using Stable\n Diffusion Model","summary":" Omnidirectional images (ODIs) are commonly used in real-world visual tasks,\nand high-resolution ODIs help improve the performance of related visual tasks.\nMost existing super-resolution methods for ODIs use end-to-end learning\nstrategies, resulting in inferior realness of generated images and a lack of\neffective out-of-domain generalization capabilities in training methods. Image\ngeneration methods represented by diffusion models provide strong priors for\nvisual tasks and have been proven to be effectively applied to image\nrestoration tasks. Leveraging the image priors of the Stable Diffusion (SD)\nmodel, we achieve omnidirectional image super-resolution with both fidelity and\nrealness, dubbed OmniSSR. Firstly, we transform the equirectangular\nprojection (ERP) images into tangent projection (TP) images, whose distribution\napproximates the planar image domain. Then, we use SD to iteratively sample\ninitial high-resolution results. At each denoising iteration, we further\ncorrect and update the initial results using the proposed Octadecaplex Tangent\nInformation Interaction (OTII) and Gradient Decomposition (GD) technique to\nensure better consistency. Finally, the TP images are transformed back to\nobtain the final high-resolution results. Our method is zero-shot, requiring no\ntraining or fine-tuning. Experiments of our method on two benchmark datasets\ndemonstrate the effectiveness of our proposed method.\n","authors":["Runyi Li","Xuhan Sheng","Weiqi Li","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.10312v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00425v2","updated":"2024-04-17T06:26:04Z","published":"2023-12-01T08:47:56Z","title":"Retina : Low-Power Eye Tracking with Event Camera and Spiking Hardware","summary":" This paper introduces a neuromorphic methodology for eye tracking, harnessing\npure event data captured by a Dynamic Vision Sensor (DVS) camera. The framework\nintegrates a directly trained Spiking Neuron Network (SNN) regression model and\nleverages a state-of-the-art low power edge neuromorphic processor - Speck,\ncollectively aiming to advance the precision and efficiency of eye-tracking\nsystems. First, we introduce a representative event-based eye-tracking dataset,\n\"Ini-30\", which was collected with two glass-mounted DVS cameras from thirty\nvolunteers. 
Then, an SNN model based on Integrate And Fire (IAF) neurons, named\n\"Retina\", is described, featuring only 64k parameters (6.63x fewer than the\nlatest) and achieving a pupil tracking error of only 3.24 pixels in a 64x64 DVS\ninput. The continuous regression output is obtained by means of convolution\nusing a non-spiking temporal 1D filter slid across the output spiking layer.\nFinally, we evaluate Retina on the neuromorphic processor, showing an\nend-to-end power between 2.89-4.8 mW and a latency of 5.57-8.01 ms dependent on\nthe time window. We also benchmark our model against the latest event-based\neye-tracking method, \"3ET\", which was built upon event frames. Results show\nthat Retina achieves superior precision with 1.24px less pupil centroid error\nand reduced computational complexity with 35 times fewer MAC operations. We\nhope this work will open avenues for further investigation of closed-loop\nneuromorphic solutions and true event-based training pursuing edge performance.\n","authors":["Pietro Bonazzi","Sizhen Bian","Giovanni Lippolis","Yawei Li","Sadique Sheik","Michele Magno"],"pdf_url":"https://arxiv.org/pdf/2312.00425v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09831v2","updated":"2024-04-17T05:55:33Z","published":"2024-04-15T14:29:47Z","title":"Digging into contrastive learning for robust depth estimation with\n diffusion models","summary":" Recently, diffusion-based depth estimation methods have drawn widespread\nattention due to their elegant denoising patterns and promising performance.\nHowever, they are typically unreliable under adverse conditions prevalent in\nreal-world scenarios, such as rainy, snowy, etc. In this paper, we propose a\nnovel robust depth estimation method called D4RD, featuring a custom\ncontrastive learning mode tailored for diffusion models to mitigate performance\ndegradation in complex environments. Concretely, we integrate the strength of\nknowledge distillation into contrastive learning, building the `trinity'\ncontrastive scheme. This scheme utilizes the sampled noise of the forward\ndiffusion process as a natural reference, guiding the predicted noise in\ndiverse scenes toward a more stable and precise optimum. Moreover, we extend\nnoise-level trinity to encompass more generic feature and image levels,\nestablishing a multi-level contrast to distribute the burden of robust\nperception across the overall network. Before addressing complex scenarios, we\nenhance the stability of the baseline diffusion model with three\nstraightforward yet effective improvements, which facilitate convergence and\nremove depth outliers. Extensive experiments demonstrate that D4RD surpasses\nexisting state-of-the-art solutions on synthetic corruption datasets and\nreal-world weather conditions. The code for D4RD will be made available for\nfurther exploration and adoption.\n","authors":["Jiyuan Wang","Chunyu Lin","Lang Nie","Kang Liao","Shuwei Shao","Yao Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.09831v2.pdf","comment":"8 pages,6 figures"},{"id":"http://arxiv.org/abs/2402.19474v3","updated":"2024-04-17T05:55:04Z","published":"2024-02-29T18:59:17Z","title":"The All-Seeing Project V2: Towards General Relation Comprehension of the\n Open World","summary":" We present the All-Seeing Project V2: a new model and dataset designed for\nunderstanding object relations in images. 
Specifically, we propose the\nAll-Seeing Model V2 (ASMv2) that integrates the formulation of text generation,\nobject localization, and relation comprehension into a relation conversation\n(ReC) task. Leveraging this unified task, our model excels not only in\nperceiving and recognizing all objects within the image but also in grasping\nthe intricate relation graph between them, diminishing the relation\nhallucination often encountered by Multi-modal Large Language Models (MLLMs).\nTo facilitate training and evaluation of MLLMs in relation understanding, we\ncreated the first high-quality ReC dataset (AS-V2), which is aligned with the\nformat of standard instruction tuning data. In addition, we design a new\nbenchmark, termed Circular-based Relation Probing Evaluation (CRPE), for\ncomprehensively evaluating the relation comprehension capabilities of MLLMs.\nNotably, our ASMv2 achieves an overall accuracy of 52.04 on this relation-aware\nbenchmark, surpassing the 43.14 of LLaVA-1.5 by a large margin. We hope that\nour work can inspire more future research and contribute to the evolution\ntowards artificial general intelligence. Our project is released at\nhttps://github.com/OpenGVLab/all-seeing.\n","authors":["Weiyun Wang","Yiming Ren","Haowen Luo","Tiantong Li","Chenxiang Yan","Zhe Chen","Wenhai Wang","Qingyun Li","Lewei Lu","Xizhou Zhu","Yu Qiao","Jifeng Dai"],"pdf_url":"https://arxiv.org/pdf/2402.19474v3.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2404.08968v2","updated":"2024-04-17T05:42:52Z","published":"2024-04-13T11:13:56Z","title":"MCPNet: An Interpretable Classifier via Multi-Level Concept Prototypes","summary":" Recent advancements in post-hoc and inherently interpretable methods have\nmarkedly enhanced the explanations of black box classifier models. These\nmethods operate either through post-analysis or by integrating concept learning\nduring model training. Although effective in bridging the semantic gap\nbetween a model's latent space and human interpretation, these explanation\nmethods only partially reveal the model's decision-making process. The outcome\nis typically limited to high-level semantics derived from the last feature map.\nWe argue that explanations lacking insights into the decision processes at\nlow- and mid-level features are neither fully faithful nor useful. Addressing\nthis gap, we introduce the Multi-Level Concept Prototypes Classifier (MCPNet),\nan inherently interpretable model. MCPNet autonomously learns meaningful\nconcept prototypes across multiple feature map levels using Centered Kernel\nAlignment (CKA) loss and an energy-based weighted PCA mechanism, and it does so\nwithout reliance on predefined concept labels. Further, we propose a novel\nclassifier paradigm that learns and aligns multi-level concept prototype\ndistributions for classification purposes via Class-aware Concept Distribution\n(CCD) loss. Our experiments reveal that our proposed MCPNet, while being\nadaptable to various model architectures, offers comprehensive multi-level\nexplanations while maintaining classification accuracy. 
Additionally, its\nconcept distribution-based classification approach shows improved\ngeneralization capabilities in few-shot classification scenarios.\n","authors":["Bor-Shiun Wang","Chien-Yi Wang","Wei-Chen Chiu"],"pdf_url":"https://arxiv.org/pdf/2404.08968v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.11070v1","updated":"2024-04-17T04:59:36Z","published":"2024-04-17T04:59:36Z","title":"Sky-GVIO: an enhanced GNSS/INS/Vision navigation with FCN-based\n sky-segmentation in urban canyon","summary":" Accurate, continuous, and reliable positioning is a critical component of\nachieving autonomous driving. However, in complex urban canyon environments,\nthe vulnerability of a stand-alone sensor and non-line-of-sight (NLOS) caused\nby high buildings, trees, and elevated structures seriously affect positioning\nresults. To address these challenges, a sky-view images segmentation algorithm\nbased on Fully Convolutional Network (FCN) is proposed for GNSS NLOS detection.\nBuilding upon this, a novel NLOS detection and mitigation algorithm (named\nS-NDM) is extended to the tightly coupled Global Navigation Satellite Systems\n(GNSS), Inertial Measurement Units (IMU), and visual feature system which is\ncalled Sky-GVIO, with the aim of achieving continuous and accurate positioning\nin urban canyon environments. Furthermore, the system harmonizes Single Point\nPositioning (SPP) with Real-Time Kinematic (RTK) methodologies to bolster its\noperational versatility and resilience. In urban canyon environments, the\npositioning performance of S-NDM algorithm proposed in this paper is evaluated\nunder different tightly coupled SPP-related and RTK-related models. The results\nexhibit that Sky-GVIO system achieves meter-level accuracy under SPP mode and\nsub-decimeter precision with RTK, surpassing the performance of GNSS/INS/Vision\nframeworks devoid of S-NDM. Additionally, the sky-view image dataset, inclusive\nof training and evaluation subsets, has been made publicly accessible for\nscholarly exploration at https://github.com/whuwangjr/sky-view-images .\n","authors":["Jingrong Wang","Bo Xu","Ronghe Jin","Shoujian Zhang","Kefu Gao","Jingnan Liu"],"pdf_url":"https://arxiv.org/pdf/2404.11070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11064v1","updated":"2024-04-17T04:46:27Z","published":"2024-04-17T04:46:27Z","title":"Rethinking 3D Dense Caption and Visual Grounding in A Unified Framework\n through Prompt-based Localization","summary":" 3D Visual Grounding (3DVG) and 3D Dense Captioning (3DDC) are two crucial\ntasks in various 3D applications, which require both shared and complementary\ninformation in localization and visual-language relationships. Therefore,\nexisting approaches adopt the two-stage \"detect-then-describe/discriminate\"\npipeline, which relies heavily on the performance of the detector, resulting in\nsuboptimal performance. Inspired by DETR, we propose a unified framework,\n3DGCTR, to jointly solve these two distinct but closely related tasks in an\nend-to-end fashion. The key idea is to reconsider the prompt-based localization\nability of the 3DVG model. In this way, the 3DVG model with a well-designed\nprompt as input can assist the 3DDC task by extracting localization information\nfrom the prompt. 
In terms of implementation, we integrate a Lightweight Caption\nHead into the existing 3DVG network with a Caption Text Prompt as a connection,\neffectively harnessing the existing 3DVG model's inherent localization\ncapacity, thereby boosting 3DDC capability. This integration facilitates\nsimultaneous multi-task training on both tasks, mutually enhancing their\nperformance. Extensive experimental results demonstrate the effectiveness of\nthis approach. Specifically, on the ScanRefer dataset, 3DGCTR surpasses the\nstate-of-the-art 3DDC method by 4.3% in CIDEr@0.5IoU in MLE training and\nimproves upon the SOTA 3DVG method by 3.16% in Acc@0.25IoU.\n","authors":["Yongdong Luo","Haojia Lin","Xiawu Zheng","Yigeng Jiang","Fei Chao","Jie Hu","Guannan Jiang","Songan Zhang","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2404.11064v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03492v2","updated":"2024-04-17T04:07:47Z","published":"2023-06-06T08:19:30Z","title":"Efficient Anomaly Detection with Budget Annotation Using Semi-Supervised\n Residual Transformer","summary":" Anomaly Detection is challenging as usually only the normal samples are seen\nduring training and the detector needs to discover anomalies on-the-fly. The\nrecently proposed deep-learning-based approaches could somehow alleviate the\nproblem but there is still a long way to go in obtaining an industrial-class\nanomaly detector for real-world applications. On the other hand, in some\nparticular AD tasks, a few anomalous samples are labeled manually for achieving\nhigher accuracy. However, this performance gain is at the cost of considerable\nannotation efforts, which can be intractable in many practical scenarios.\n In this work, the above two problems are addressed in a unified framework.\nFirstly, inspired by the success of the patch-matching-based AD algorithms, we\ntrain a sliding vision transformer over the residuals generated by a novel\nposition-constrained patch-matching. Secondly, the conventional pixel-wise\nsegmentation problem is cast into a block-wise classification problem. Thus the\nsliding transformer can attain even higher accuracy with much less annotation\nlabor. Thirdly, to further reduce the labeling cost, we propose to label the\nanomalous regions using only bounding boxes. The unlabeled regions caused by\nthe weak labels are effectively exploited using a highly-customized\nsemi-supervised learning scheme equipped with two novel data augmentation\nmethods. The proposed method outperforms all the state-of-the-art approaches\nusing all the evaluation metrics in both the unsupervised and supervised\nscenarios. On the popular MVTec-AD dataset, our SemiREST algorithm obtains the\nAverage Precision (AP) of 81.2% in the unsupervised condition and 84.4% AP for\nsupervised anomaly detection. Surprisingly, with the bounding-box-based\nsemi-supervisions, SemiREST still outperforms the SOTA methods with full\nsupervision (83.8% AP) on MVTec-AD.\n","authors":["Hanxi Li","Jingqi Wu","Hao Chen","Mingwen Wang","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2306.03492v2.pdf","comment":"20 pages,6 figures"},{"id":"http://arxiv.org/abs/2404.11054v1","updated":"2024-04-17T03:56:28Z","published":"2024-04-17T03:56:28Z","title":"Multilateral Temporal-view Pyramid Transformer for Video Inpainting\n Detection","summary":" The task of video inpainting detection is to expose the pixel-level inpainted\nregions within a video sequence. Existing methods usually focus on leveraging\nspatial and temporal inconsistencies. 
However, these methods typically employ\nfixed operations to combine spatial and temporal clues, limiting their\napplicability in different scenarios. In this paper, we introduce a novel\nMultilateral Temporal-view Pyramid Transformer ({\\em MumPy}) that collaborates\nspatial-temporal clues flexibly. Our method utilizes a newly designed\nmultilateral temporal-view encoder to extract various collaborations of\nspatial-temporal clues and introduces a deformable window-based temporal-view\ninteraction module to enhance the diversity of these collaborations.\nSubsequently, we develop a multi-pyramid decoder to aggregate the various types\nof features and generate detection maps. By adjusting the contribution strength\nof spatial and temporal clues, our method can effectively identify inpainted\nregions. We validate our method on existing datasets and also introduce a new\nchallenging and large-scale Video Inpainting dataset based on the YouTube-VOS\ndataset, which employs several more recent inpainting methods. The results\ndemonstrate the superiority of our method in both in-domain and cross-domain\nevaluation scenarios.\n","authors":["Ying Zhang","Bo Peng","Jiaran Zhou","Huiyu Zhou","Junyu Dong","Yuezun Li"],"pdf_url":"https://arxiv.org/pdf/2404.11054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11052v1","updated":"2024-04-17T03:51:55Z","published":"2024-04-17T03:51:55Z","title":"Supervised Contrastive Vision Transformer for Breast Histopathological\n Image Classification","summary":" Invasive ductal carcinoma (IDC) is the most prevalent form of breast cancer.\nBreast tissue histopathological examination is critical in diagnosing and\nclassifying breast cancer. Although existing methods have shown promising\nresults, there is still room for improvement in the classification accuracy and\ngeneralization of IDC using histopathology images. We present a novel approach,\nSupervised Contrastive Vision Transformer (SupCon-ViT), for improving the\nclassification of invasive ductal carcinoma in terms of accuracy and\ngeneralization by leveraging the inherent strengths and advantages of both\ntransfer learning, i.e., pre-trained vision transformer, and supervised\ncontrastive learning. Our results on a benchmark breast cancer dataset\ndemonstrate that SupCon-Vit achieves state-of-the-art performance in IDC\nclassification, with an F1-score of 0.8188, precision of 0.7692, and\nspecificity of 0.8971, outperforming existing methods. In addition, the\nproposed model demonstrates resilience in scenarios with minimal labeled data,\nmaking it highly efficient in real-world clinical settings where labelled data\nis limited. Our findings suggest that supervised contrastive learning in\nconjunction with pre-trained vision transformers appears to be a viable\nstrategy for an accurate classification of IDC, thus paving the way for a more\nefficient and reliable diagnosis of breast cancer through histopathological\nimage analysis.\n","authors":["Mohammad Shiri","Jiangwen Sun"],"pdf_url":"https://arxiv.org/pdf/2404.11052v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.11051v1","updated":"2024-04-17T03:51:24Z","published":"2024-04-17T03:51:24Z","title":"WPS-Dataset: A benchmark for wood plate segmentation in bark removal\n processing","summary":" Using deep learning methods is a promising approach to improving bark removal\nefficiency and enhancing the quality of wood products. 
However, the lack of\npublicly available datasets for wood plate segmentation in bark removal\nprocessing poses challenges for researchers in this field. To address this\nissue, a benchmark for wood plate segmentation in bark removal processing named\nWPS-dataset is proposed in this study, which consists of 4863 images. We\ndesigned an image acquisition device and assembled it on a bark removal\nequipment to capture images in real industrial settings. We evaluated the\nWPS-dataset using six typical segmentation models. The models effectively learn\nand understand the WPS-dataset characteristics during training, resulting in\nhigh performance and accuracy in wood plate segmentation tasks. We believe that\nour dataset can lay a solid foundation for future research in bark removal\nprocessing and contribute to advancements in this field.\n","authors":["Rijun Wang","Guanghao Zhang","Fulong Liang","Bo Wang","Xiangwei Mou","Yesheng Chen"],"pdf_url":"https://arxiv.org/pdf/2404.11051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11046v1","updated":"2024-04-17T03:42:48Z","published":"2024-04-17T03:42:48Z","title":"Lightweight Unsupervised Federated Learning with Pretrained Vision\n Language Model","summary":" Federated learning aims to tackle the ``isolated data island\" problem, where\nit trains a collective model from physically isolated clients while\nsafeguarding the privacy of users' data. However, supervised federated learning\nnecessitates that each client labels their data for training, which can be both\ntime-consuming and resource-intensive, and may even be impractical for edge\ndevices. Moreover, the training and transmission of deep models present\nchallenges to the computation and communication capabilities of the clients. To\naddress these two inherent challenges in supervised federated learning, we\npropose a novel lightweight unsupervised federated learning approach that\nleverages unlabeled data on each client to perform lightweight model training\nand communication by harnessing pretrained vision-language models, such as\nCLIP. By capitalizing on the zero-shot prediction capability and the\nwell-trained image encoder of the pre-trained CLIP model, we have carefully\ncrafted an efficient and resilient self-training approach. This method refines\nthe initial zero-shot predicted pseudo-labels of unlabeled instances through\nthe sole training of a linear classifier on top of the fixed image encoder.\nAdditionally, to address data heterogeneity within each client, we propose a\nclass-balanced text feature sampling strategy for generating synthetic\ninstances in the feature space to support local training. Experiments are\nconducted on multiple benchmark datasets. The experimental results demonstrate\nthat our proposed method greatly enhances model performance in comparison to\nCLIP's zero-shot predictions and even outperforms supervised federated learning\nbenchmark methods given limited computational and communication overhead.\n","authors":["Hao Yan","Yuhong Guo"],"pdf_url":"https://arxiv.org/pdf/2404.11046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07922v4","updated":"2024-04-17T03:23:33Z","published":"2024-04-11T17:09:28Z","title":"LaVy: Vietnamese Multimodal Large Language Model","summary":" Large Language Models (LLMs) and Multimodal Large language models (MLLMs)\nhave taken the world by storm with impressive abilities in complex reasoning\nand linguistic comprehension. 
While there is a plethora of works related to\nVietnamese Large Language Models, the lack of high-quality resources in\nmultimodality limits the progress of Vietnamese MLLMs. In this paper, we\npioneer in addressing this by introducing LaVy, a state-of-the-art Vietnamese\nMLLM, and we also introduce the LaVy-Bench benchmark, designed for evaluating\nMLLMs' understanding of Vietnamese visual language tasks. Our project is\npublic at https://github.com/baochi0212/LaVy\n","authors":["Chi Tran","Huong Le Thanh"],"pdf_url":"https://arxiv.org/pdf/2404.07922v4.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2306.08251v3","updated":"2024-04-17T03:14:21Z","published":"2023-06-14T05:34:02Z","title":"GBSD: Generative Bokeh with Stage Diffusion","summary":" The bokeh effect is an artistic technique that blurs out-of-focus areas in a\nphotograph and has gained interest due to recent developments in text-to-image\nsynthesis and the ubiquity of smart-phone cameras and photo-sharing apps. Prior\nwork on rendering bokeh effects has focused on post hoc image manipulation to\nproduce similar blurring effects in existing photographs using classical\ncomputer graphics or neural rendering techniques, but either suffers from depth\ndiscontinuity artifacts or is restricted to reproducing bokeh effects that are\npresent in the training data. More recent diffusion-based models can synthesize\nimages with an artistic style, but either require the generation of\nhigh-dimensional masks, expensive fine-tuning, or affect global image\ncharacteristics. In this paper, we present GBSD, the first generative\ntext-to-image model that synthesizes photorealistic images with a bokeh style.\nMotivated by how image synthesis occurs progressively in diffusion models, our\napproach combines latent diffusion models with a 2-stage conditioning algorithm\nto render bokeh effects on semantically defined objects. Since we can focus the\neffect on objects, this semantic bokeh effect is more versatile than classical\nrendering techniques. We evaluate GBSD both quantitatively and qualitatively\nand demonstrate its ability to be applied in both text-to-image and\nimage-to-image settings.\n","authors":["Jieren Deng","Xin Zhou","Hao Tian","Zhihong Pan","Derek Aguiar"],"pdf_url":"https://arxiv.org/pdf/2306.08251v3.pdf","comment":"Short Version is accepted by International Conference on Acoustics,\n Speech, and Signal Processing (ICASSP) 2024"},{"id":"http://arxiv.org/abs/2401.03907v2","updated":"2024-04-17T03:14:00Z","published":"2024-01-08T14:10:24Z","title":"RoboFusion: Towards Robust Multi-Modal 3D Object Detection via SAM","summary":" Multi-modal 3D object detectors are dedicated to exploring secure and\nreliable perception systems for autonomous driving (AD). However, while\nachieving state-of-the-art (SOTA) performance on clean benchmark datasets, they\ntend to overlook the complexity and harsh conditions of real-world\nenvironments. Meanwhile, with the emergence of visual foundation models (VFMs),\nopportunities and challenges are presented for improving the robustness and\ngeneralization of multi-modal 3D object detection in autonomous driving.\nTherefore, we propose RoboFusion, a robust framework that leverages VFMs like\nSAM to tackle out-of-distribution (OOD) noise scenarios. We first adapt the\noriginal SAM for autonomous driving scenarios, named SAM-AD. To align SAM or\nSAM-AD with multi-modal methods, we then introduce AD-FPN for upsampling the\nimage features extracted by SAM. 
We employ wavelet decomposition to denoise the\ndepth-guided images, further reducing noise and weather interference.\nLastly, we employ self-attention mechanisms to adaptively reweight the fused\nfeatures, enhancing informative features while suppressing excess noise. In\nsummary, our RoboFusion gradually reduces noise by leveraging the\ngeneralization and robustness of VFMs, thereby enhancing the resilience of\nmulti-modal 3D object detection. Consequently, our RoboFusion achieves\nstate-of-the-art performance in noisy scenarios, as demonstrated by the KITTI-C\nand nuScenes-C benchmarks.\n","authors":["Ziying Song","Guoxing Zhang","Lin Liu","Lei Yang","Shaoqing Xu","Caiyan Jia","Feiyang Jia","Li Wang"],"pdf_url":"https://arxiv.org/pdf/2401.03907v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11031v1","updated":"2024-04-17T03:13:58Z","published":"2024-04-17T03:13:58Z","title":"TaCOS: Task-Specific Camera Optimization with Simulation","summary":" The performance of robots in their applications heavily depends on the\nquality of sensory input. However, designing sensor payloads and their\nparameters for specific robotic tasks is an expensive process that requires\nwell-established sensor knowledge and extensive experiments with physical\nhardware. With cameras playing a pivotal role in robotic perception, we\nintroduce a novel end-to-end optimization approach for co-designing a camera\nwith specific robotic tasks by combining derivative-free and gradient-based\noptimizers. The proposed method leverages recent computer graphics techniques\nand physical camera characteristics to prototype the camera in software,\nsimulate operational environments and tasks for robots, and optimize the camera\ndesign based on the desired tasks in a cost-effective way. We validate the\naccuracy of our camera simulation by comparing it with physical cameras, and\ndemonstrate the design of cameras with stronger performance than common\noff-the-shelf alternatives. Our approach supports the optimization of both\ncontinuous and discrete camera parameters, manufacturing constraints, and can\nbe generalized to a broad range of camera design scenarios including multiple\ncameras and unconventional cameras. This work advances the fully automated\ndesign of cameras for specific robotics tasks.\n","authors":["Chengyang Yan","Donald Dansereau"],"pdf_url":"https://arxiv.org/pdf/2404.11031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09276v3","updated":"2024-04-17T03:02:38Z","published":"2023-10-13T17:38:45Z","title":"Transformer-based Multimodal Change Detection with Multitask Consistency\n Constraints","summary":" Change detection plays a fundamental role in Earth observation for analyzing\ntemporal iterations over time. However, recent studies have largely neglected\nthe utilization of multimodal data that presents significant practical and\ntechnical advantages compared to single-modal approaches. This research focuses\non leveraging pre-event digital surface model (DSM) data and post-event\ndigital aerial images captured at different times for detecting change beyond\n2D. We observe that the current change detection methods struggle with the\nmultitask conflicts between semantic and height change detection tasks. To\naddress this challenge, we propose an efficient Transformer-based network that\nlearns a shared representation between cross-dimensional inputs through\ncross-attention. It adopts a consistency constraint to establish the\nmultimodal relationship. 
Initially, pseudo-changes are derived by employing\nheight change thresholding. Subsequently, the $L2$ distance between semantic\nand pseudo-changes within their overlapping regions is minimized. This\nexplicitly endows the height change detection (regression task) and semantic\nchange detection (classification task) with representation consistency.} A\nDSM-to-image multimodal dataset encompassing three cities in the Netherlands\nwas constructed. It lays a new foundation for beyond-2D change detection from\ncross-dimensional inputs. Compared to five state-of-the-art change detection\nmethods, our model demonstrates consistent multitask superiority in terms of\nsemantic and height change detection. Furthermore, the consistency strategy can\nbe seamlessly adapted to the other methods, yielding promising improvements.\n","authors":["Biyuan Liu","Huaixin Chen","Kun Li","Michael Ying Yang"],"pdf_url":"https://arxiv.org/pdf/2310.09276v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11025v1","updated":"2024-04-17T03:01:47Z","published":"2024-04-17T03:01:47Z","title":"Spatial-Aware Image Retrieval: A Hyperdimensional Computing Approach for\n Efficient Similarity Hashing","summary":" In the face of burgeoning image data, efficiently retrieving similar images\nposes a formidable challenge. Past research has focused on refining hash\nfunctions to distill images into compact indicators of resemblance. Initial\nattempts used shallow models, evolving to attention mechanism-based\narchitectures from Convolutional Neural Networks (CNNs) to advanced models.\nRecognizing limitations in gradient-based models for spatial information\nembedding, we propose an innovative image hashing method, NeuroHash leveraging\nHyperdimensional Computing (HDC). HDC symbolically encodes spatial information\ninto high-dimensional vectors, reshaping image representation. Our approach\ncombines pre-trained large vision models with HDC operations, enabling\nspatially encoded feature representations. Hashing with locality-sensitive\nhashing (LSH) ensures swift and efficient image retrieval. Notably, our\nframework allows dynamic hash manipulation for conditional image retrieval. Our\nwork introduces a transformative image hashing framework enabling spatial-aware\nconditional retrieval. By seamlessly combining DNN-based neural and HDC-based\nsymbolic models, our methodology breaks from traditional training, offering\nflexible and conditional image retrieval. Performance evaluations signify a\nparadigm shift in image-hashing methodologies, demonstrating enhanced retrieval\naccuracy.\n","authors":["Sanggeon Yun","Ryozo Masukawa","SungHeon Jeong","Mohsen Imani"],"pdf_url":"https://arxiv.org/pdf/2404.11025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16627v2","updated":"2024-04-17T02:57:58Z","published":"2024-03-25T11:16:23Z","title":"SDXS: Real-Time One-Step Latent Diffusion Models with Image Conditions","summary":" Recent advancements in diffusion models have positioned them at the forefront\nof image generation. Despite their superior performance, diffusion models are\nnot without drawbacks; they are characterized by complex architectures and\nsubstantial computational demands, resulting in significant latency due to\ntheir iterative sampling process. To mitigate these limitations, we introduce a\ndual approach involving model miniaturization and a reduction in sampling\nsteps, aimed at significantly decreasing model latency. 
Our methodology\nleverages knowledge distillation to streamline the U-Net and image decoder\narchitectures, and introduces an innovative one-step DM training technique that\nutilizes feature matching and score distillation. We present two models,\nSDXS-512 and SDXS-1024, achieving inference speeds of approximately 100 FPS\n(30x faster than SD v1.5) and 30 FPS (60x faster than SDXL) on a single GPU,\nrespectively. Moreover, our training approach offers promising applications in\nimage-conditioned control, facilitating efficient image-to-image translation.\n","authors":["Yuda Song","Zehao Sun","Xuanwu Yin"],"pdf_url":"https://arxiv.org/pdf/2403.16627v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10357v2","updated":"2024-04-17T02:48:49Z","published":"2024-04-16T07:44:52Z","title":"Optimization of Prompt Learning via Multi-Knowledge Representation for\n Vision-Language Models","summary":" Vision-Language Models (VLMs), such as CLIP, play a foundational role in\nvarious cross-modal applications. To fully leverage VLMs' potential in adapting\nto downstream tasks, context optimization methods like Prompt Tuning are\nessential. However, one key limitation is the lack of diversity in prompt\ntemplates, whether they are hand-crafted or learned through additional modules.\nThis limitation restricts the capabilities of pretrained VLMs and can result in\nincorrect predictions in downstream tasks. To address this challenge, we\npropose Context Optimization with Multi-Knowledge Representation (CoKnow), a\nframework that enhances Prompt Learning for VLMs with rich contextual\nknowledge. To facilitate CoKnow during inference, we trained lightweight\nsemantic knowledge mappers, which are capable of generating Multi-Knowledge\nRepresentation for an input image without requiring additional priors.\nExperimentally, we conducted extensive experiments on 11 publicly available\ndatasets, demonstrating that CoKnow outperforms a series of previous methods.\nWe will make all resources open-source: https://github.com/EMZucas/CoKnow.\n","authors":["Enming Zhang","Bingke Zhu","Yingying Chen","Qinghai Miao","Ming Tang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.10357v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11016v1","updated":"2024-04-17T02:47:39Z","published":"2024-04-17T02:47:39Z","title":"MaeFuse: Transferring Omni Features with Pretrained Masked Autoencoders\n for Infrared and Visible Image Fusion via Guided Training","summary":" In this research, we introduce MaeFuse, a novel autoencoder model designed\nfor infrared and visible image fusion (IVIF). The existing approaches for image\nfusion often rely on training combined with downstream tasks to obtain\nhigh-level visual information, which is effective in emphasizing target objects\nand delivering impressive results in visual quality and task-specific\napplications. MaeFuse, however, deviates from the norm. Instead of being driven\nby downstream tasks, our model utilizes a pretrained encoder from Masked\nAutoencoders (MAE), which facilitates omni-feature extraction for low-level\nreconstruction and high-level vision tasks, to obtain perception-friendly\nfeatures at a low cost. In order to eliminate the domain gap of different\nmodal features and the block effect caused by the MAE encoder, we further\ndevelop a guided training strategy. This strategy is meticulously crafted to\nensure that the fusion layer seamlessly adjusts to the feature space of the\nencoder, gradually enhancing the fusion effect. 
It facilitates the\ncomprehensive integration of feature vectors from both infrared and visible\nmodalities, preserving the rich details inherent in each. MaeFuse not only\nintroduces a novel perspective in the realm of fusion techniques but also\nstands out with impressive performance across various public datasets.\n","authors":["Jiayang Li","Junjun Jiang","Pengwei Liang","Jiayi Ma"],"pdf_url":"https://arxiv.org/pdf/2404.11016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.10971v2","updated":"2024-04-17T02:39:19Z","published":"2022-07-22T09:37:48Z","title":"Kinematics Modeling Network for Video-based Human Pose Estimation","summary":" Estimating human poses from videos is critical in human-computer interaction.\nJoints cooperate rather than move independently during human movement. There\nare both spatial and temporal correlations between joints. Despite the positive\nresults of previous approaches, most focus on modeling the spatial correlation\nbetween joints while only straightforwardly integrating features along the\ntemporal dimension, ignoring the temporal correlation between joints. In this\nwork, we propose a plug-and-play kinematics modeling module (KMM) to explicitly\nmodel temporal correlations between joints across different frames by\ncalculating their temporal similarity. In this way, KMM can capture motion cues\nof the current joint relative to all joints in different time. Besides, we\nformulate video-based human pose estimation as a Markov Decision Process and\ndesign a novel kinematics modeling network (KIMNet) to simulate the Markov\nChain, allowing KIMNet to locate joints recursively. Our approach achieves\nstate-of-the-art results on two challenging benchmarks. In particular, KIMNet\nshows robustness to the occlusion. The code will be released at\nhttps://github.com/YHDang/KIMNet.\n","authors":["Yonghao Dang","Jianqin Yin","Shaojie Zhang","Jiping Liu","Yanzhu Hu"],"pdf_url":"https://arxiv.org/pdf/2207.10971v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11008v1","updated":"2024-04-17T02:36:02Z","published":"2024-04-17T02:36:02Z","title":"AKGNet: Attribute Knowledge-Guided Unsupervised Lung-Infected Area\n Segmentation","summary":" Lung-infected area segmentation is crucial for assessing the severity of lung\ndiseases. However, existing image-text multi-modal methods typically rely on\nlabour-intensive annotations for model training, posing challenges regarding\ntime and expertise. To address this issue, we propose a novel attribute\nknowledge-guided framework for unsupervised lung-infected area segmentation\n(AKGNet), which achieves segmentation solely based on image-text data without\nany mask annotation. AKGNet facilitates text attribute knowledge learning,\nattribute-image cross-attention fusion, and high-confidence-based pseudo-label\nexploration simultaneously. It can learn statistical information and capture\nspatial correlations between image and text attributes in the embedding space,\niteratively refining the mask to enhance segmentation. Specifically, we\nintroduce a text attribute knowledge learning module by extracting attribute\nknowledge and incorporating it into feature representations, enabling the model\nto learn statistical information and adapt to different attributes. 
Moreover,\nwe devise an attribute-image cross-attention module by calculating the\ncorrelation between attributes and images in the embedding space to capture\nspatial dependency information, thus selectively focusing on relevant regions\nwhile filtering irrelevant areas. Finally, a self-training mask improvement\nprocess is employed by generating pseudo-labels using high-confidence\npredictions to iteratively enhance the mask and segmentation. Experimental\nresults on a benchmark medical image dataset demonstrate the superior\nperformance of our method compared to state-of-the-art segmentation techniques\nin unsupervised scenarios.\n","authors":["Qing En","Yuhong Guo"],"pdf_url":"https://arxiv.org/pdf/2404.11008v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11003v1","updated":"2024-04-17T02:29:44Z","published":"2024-04-17T02:29:44Z","title":"InfoMatch: Entropy Neural Estimation for Semi-Supervised Image\n Classification","summary":" Semi-supervised image classification, leveraging pseudo supervision and\nconsistency regularization, has demonstrated remarkable success. However, the\nongoing challenge lies in fully exploiting the potential of unlabeled data. To\naddress this, we employ information entropy neural estimation to harness the\npotential of unlabeled samples. Inspired by contrastive learning, the entropy\nis estimated by maximizing a lower bound on mutual information across different\naugmented views. Moreover, we theoretically analyze that the information\nentropy of the posterior of an image classifier is approximated by maximizing\nthe likelihood function of the softmax predictions. Guided by these insights,\nwe optimize our model from both perspectives to ensure that the predicted\nprobability distribution closely aligns with the ground-truth distribution.\nGiven the theoretical connection to information entropy, we name our method\n\\textit{InfoMatch}. Through extensive experiments, we show its superior\nperformance.\n","authors":["Qi Han","Zhibo Tian","Chengwei Xia","Kun Zhan"],"pdf_url":"https://arxiv.org/pdf/2404.11003v1.pdf","comment":"IJCAI 2024"},{"id":"http://arxiv.org/abs/2308.13072v3","updated":"2024-04-17T02:09:54Z","published":"2023-08-24T20:29:09Z","title":"Full-dose Whole-body PET Synthesis from Low-dose PET Using\n High-efficiency Denoising Diffusion Probabilistic Model: PET Consistency\n Model","summary":" Objective: Positron Emission Tomography (PET) has been a commonly used\nimaging modality in broad clinical applications. One of the most important\ntradeoffs in PET imaging is between image quality and radiation dose: high\nimage quality comes with high radiation exposure. Improving image quality is\ndesirable for all clinical applications while minimizing radiation exposure is\nneeded to reduce risk to patients. Approach: We introduce PET Consistency Model\n(PET-CM), an efficient diffusion-based method for generating high-quality\nfull-dose PET images from low-dose PET images. It employs a two-step process,\nadding Gaussian noise to full-dose PET images in the forward diffusion, and\nthen denoising them using a PET Shifted-window Vision Transformer (PET-VIT)\nnetwork in the reverse diffusion. The PET-VIT network learns a consistency\nfunction that enables direct denoising of Gaussian noise into clean full-dose\nPET images. PET-CM achieves state-of-the-art image quality while requiring\nsignificantly less computation time than other methods. 
Results: In experiments\ncomparing eighth-dose to full-dose images, PET-CM demonstrated impressive\nperformance with NMAE of 1.278+/-0.122%, PSNR of 33.783+/-0.824dB, SSIM of\n0.964+/-0.009, NCC of 0.968+/-0.011, HRS of 4.543, and SUV Error of\n0.255+/-0.318%, with an average generation time of 62 seconds per patient. This\nis a significant improvement compared to the state-of-the-art diffusion-based\nmodel with PET-CM reaching this result 12x faster. Similarly, in the\nquarter-dose to full-dose image experiments, PET-CM delivered competitive\noutcomes, achieving an NMAE of 0.973+/-0.066%, PSNR of 36.172+/-0.801dB, SSIM\nof 0.984+/-0.004, NCC of 0.990+/-0.005, HRS of 4.428, and SUV Error of\n0.151+/-0.192% using the same generation process, underlining its high\nquantitative and clinical precision in both denoising scenarios.\n","authors":["Shaoyan Pan","Elham Abouei","Junbo Peng","Joshua Qian","Jacob F Wynne","Tonghe Wang","Chih-Wei Chang","Justin Roper","Jonathon A Nye","Hui Mao","Xiaofeng Yang"],"pdf_url":"https://arxiv.org/pdf/2308.13072v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10992v1","updated":"2024-04-17T02:05:05Z","published":"2024-04-17T02:05:05Z","title":"How to deal with glare for improved perception of Autonomous Vehicles","summary":" Vision sensors are versatile and can capture a wide range of visual cues,\nsuch as color, texture, shape, and depth. This versatility, along with the\nrelatively inexpensive availability of machine vision cameras, played an\nimportant role in adopting vision-based environment perception systems in\nautonomous vehicles (AVs). However, vision-based perception systems can be\neasily affected by glare in the presence of a bright source of light, such as\nthe sun or the headlights of the oncoming vehicle at night or simply by light\nreflecting off snow or ice-covered surfaces; scenarios encountered frequently\nduring driving. In this paper, we investigate various glare reduction\ntechniques, including the proposed saturated pixel-aware glare reduction\ntechnique for improved performance of the computer vision (CV) tasks employed\nby the perception layer of AVs. We evaluate these glare reduction methods based\non various performance metrics of the CV algorithms used by the perception\nlayer. Specifically, we considered object detection, object recognition, object\ntracking, depth estimation, and lane detection, which are crucial for autonomous\ndriving. The experimental findings validate the efficacy of the proposed glare\nreduction approach, showcasing enhanced performance across diverse perception\ntasks and remarkable resilience against varying levels of glare.\n","authors":["Muhammad Z. Alam","Zeeshan Kaleem","Sousso Kelouwani"],"pdf_url":"https://arxiv.org/pdf/2404.10992v1.pdf","comment":"14 pages, 9 figures, Accepted IEEE TIV"},{"id":"http://arxiv.org/abs/2404.10096v2","updated":"2024-04-17T02:02:33Z","published":"2024-04-15T19:06:58Z","title":"Vision Augmentation Prediction Autoencoder with Attention Design\n (VAPAAD)","summary":" Recent advancements in sequence prediction have significantly improved the\naccuracy of video data interpretation; however, existing models often overlook\nthe potential of attention-based mechanisms for next-frame prediction. This\nstudy introduces the Vision Augmentation Prediction Autoencoder with Attention\nDesign (VAPAAD), an innovative approach that integrates attention mechanisms\ninto sequence prediction, enabling nuanced analysis and understanding of\ntemporal dynamics in video sequences. 
Utilizing the Moving MNIST dataset, we\ndemonstrate VAPAAD's robust performance and superior handling of complex\ntemporal data compared to traditional methods. VAPAAD combines data\naugmentation, ConvLSTM2D layers, and a custom-built self-attention mechanism to\neffectively focus on salient features within a sequence, enhancing predictive\naccuracy and context-aware analysis. This methodology not only adheres to human\ncognitive processes during video interpretation but also addresses limitations\nin conventional models, which often struggle with the variability inherent in\nvideo sequences. The experimental results confirm that VAPAAD outperforms\nexisting models, especially in integrating attention mechanisms, which\nsignificantly improve predictive performance.\n","authors":["Yiqiao Yin"],"pdf_url":"https://arxiv.org/pdf/2404.10096v2.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.10989v1","updated":"2024-04-17T01:53:03Z","published":"2024-04-17T01:53:03Z","title":"FairSSD: Understanding Bias in Synthetic Speech Detectors","summary":" Methods that can generate synthetic speech which is perceptually\nindistinguishable from speech recorded by a human speaker, are easily\navailable. Several incidents report misuse of synthetic speech generated from\nthese methods to commit fraud. To counter such misuse, many methods have been\nproposed to detect synthetic speech. Some of these detectors are more\ninterpretable, can generalize to detect synthetic speech in the wild and are\nrobust to noise. However, limited work has been done on understanding bias in\nthese detectors. In this work, we examine bias in existing synthetic speech\ndetectors to determine if they will unfairly target a particular gender, age\nand accent group. We also inspect whether these detectors will have a higher\nmisclassification rate for bona fide speech from speech-impaired speakers w.r.t\nfluent speakers. Extensive experiments on 6 existing synthetic speech detectors\nusing more than 0.9 million speech signals demonstrate that most detectors are\ngender, age and accent biased, and future work is needed to ensure fairness. To\nsupport future research, we release our evaluation dataset, models used in our\nstudy and source code at https://gitlab.com/viper-purdue/fairssd.\n","authors":["Amit Kumar Singh Yadav","Kratika Bhagtani","Davide Salvi","Paolo Bestagini","Edward J. Delp"],"pdf_url":"https://arxiv.org/pdf/2404.10989v1.pdf","comment":"Accepted at CVPR 2024 (WMF)"},{"id":"http://arxiv.org/abs/2404.02155v2","updated":"2024-04-17T01:41:59Z","published":"2024-04-02T17:58:57Z","title":"Alpha Invariance: On Inverse Scaling Between Distance and Volume Density\n in Neural Radiance Fields","summary":" Scale-ambiguity in 3D scene dimensions leads to magnitude-ambiguity of\nvolumetric densities in neural radiance fields, i.e., the densities double when\nscene size is halved, and vice versa. We call this property alpha invariance.\nFor NeRFs to better maintain alpha invariance, we recommend 1) parameterizing\nboth distance and volume densities in log space, and 2) a\ndiscretization-agnostic initialization strategy to guarantee high ray\ntransmittance. We revisit a few popular radiance field models and find that\nthese systems use various heuristics to deal with issues arising from scene\nscaling. We test their behaviors and show our recipe to be more robust.\n","authors":["Joshua Ahn","Haochen Wang","Raymond A. 
Yeh","Greg Shakhnarovich"],"pdf_url":"https://arxiv.org/pdf/2404.02155v2.pdf","comment":"CVPR 2024. project page https://pals.ttic.edu/p/alpha-invariance"},{"id":"http://arxiv.org/abs/2404.10985v1","updated":"2024-04-17T01:35:52Z","published":"2024-04-17T01:35:52Z","title":"Pixel-Wise Symbol Spotting via Progressive Points Location for Parsing\n CAD Images","summary":" Parsing Computer-Aided Design (CAD) drawings is a fundamental step for CAD\nrevision, semantic-based management, and the generation of 3D prototypes in\nboth the architecture and engineering industries. Labeling symbols from a CAD\ndrawing is a challenging yet notorious task from a practical point of view. In\nthis work, we propose to label and spot symbols from CAD images that are\nconverted from CAD drawings. The advantage of spotting symbols from CAD images\nlies in the low requirement of labelers and the low-cost annotation. However,\npixel-wise spotting symbols from CAD images is challenging work. We propose a\npixel-wise point location via Progressive Gaussian Kernels (PGK) to balance\nbetween training efficiency and location accuracy. Besides, we introduce a\nlocal offset to the heatmap-based point location method. Based on the keypoints\ndetection, we propose a symbol grouping method to redraw the rectangle symbols\nin CAD images. We have released a dataset containing CAD images of equipment\nrooms from telecommunication industrial CAD drawings. Extensive experiments on\nthis real-world dataset show that the proposed method has good generalization\nability.\n","authors":["Junbiao Pang","Zailin Dong","Jiaxin Deng","Mengyuan Zhu","Yunwei Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.10985v1.pdf","comment":"10 pages, 10 figures,6 tables"},{"id":"http://arxiv.org/abs/2404.10980v1","updated":"2024-04-17T01:26:15Z","published":"2024-04-17T01:26:15Z","title":"Hyper Evidential Deep Learning to Quantify Composite Classification\n Uncertainty","summary":" Deep neural networks (DNNs) have been shown to perform well on exclusive,\nmulti-class classification tasks. However, when different classes have similar\nvisual features, it becomes challenging for human annotators to differentiate\nthem. This scenario necessitates the use of composite class labels. In this\npaper, we propose a novel framework called Hyper-Evidential Neural Network\n(HENN) that explicitly models predictive uncertainty due to composite class\nlabels in training data in the context of the belief theory called Subjective\nLogic (SL). By placing a grouped Dirichlet distribution on the class\nprobabilities, we treat predictions of a neural network as parameters of\nhyper-subjective opinions and learn the network that collects both single and\ncomposite evidence leading to these hyper-opinions by a deterministic DNN from\ndata. We introduce a new uncertainty type called vagueness originally designed\nfor hyper-opinions in SL to quantify composite classification uncertainty for\nDNNs. Our results demonstrate that HENN outperforms its state-of-the-art\ncounterparts based on four image datasets. The code and datasets are available\nat: https://github.com/Hugo101/HyperEvidentialNN.\n","authors":["Changbin Li","Kangshuo Li","Yuzhe Ou","Lance M. 
Kaplan","Audun Jøsang","Jin-Hee Cho","Dong Hyun Jeong","Feng Chen"],"pdf_url":"https://arxiv.org/pdf/2404.10980v1.pdf","comment":"In Proceedings of The Twelfth International Conference on Learning\n Representations, ICLR 2024"},{"id":"http://arxiv.org/abs/2404.10978v1","updated":"2024-04-17T01:23:49Z","published":"2024-04-17T01:23:49Z","title":"Leveraging 3D LiDAR Sensors to Enable Enhanced Urban Safety and Public\n Health: Pedestrian Monitoring and Abnormal Activity Detection","summary":" The integration of Light Detection and Ranging (LiDAR) and Internet of Things\n(IoT) technologies offers transformative opportunities for public health\ninformatics in urban safety and pedestrian well-being. This paper proposes a\nnovel framework utilizing these technologies for enhanced 3D object detection\nand activity classification in urban traffic scenarios. By employing elevated\nLiDAR, we obtain detailed 3D point cloud data, enabling precise pedestrian\nactivity monitoring. To overcome urban data scarcity, we create a specialized\ndataset through simulated traffic environments in Blender, facilitating\ntargeted model training. Our approach employs a modified Point\nVoxel-Region-based Convolutional Neural Network (PV-RCNN) for robust 3D\ndetection and PointNet for classifying pedestrian activities, significantly\nbenefiting urban traffic management and public health by offering insights into\npedestrian behavior and promoting safer urban environments. Our dual-model\napproach not only enhances urban traffic management but also contributes\nsignificantly to public health by providing insights into pedestrian behavior\nand promoting safer urban environment.\n","authors":["Nawfal Guefrachi","Jian Shi","Hakim Ghazzai","Ahmad Alsharoa"],"pdf_url":"https://arxiv.org/pdf/2404.10978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02145v2","updated":"2024-04-17T01:10:28Z","published":"2024-04-02T17:57:31Z","title":"Iterated Learning Improves Compositionality in Large Vision-Language\n Models","summary":" A fundamental characteristic common to both human vision and natural language\nis their compositional nature. Yet, despite the performance gains contributed\nby large vision and language pretraining, recent investigations find that\nmost-if not all-our state-of-the-art vision-language models struggle at\ncompositionality. They are unable to distinguish between images of \" a girl in\nwhite facing a man in black\" and \"a girl in black facing a man in white\".\nMoreover, prior work suggests that compositionality doesn't arise with scale:\nlarger model sizes or training data don't help. This paper develops a new\niterated training algorithm that incentivizes compositionality. We draw on\ndecades of cognitive science research that identifies cultural transmission-the\nneed to teach a new generation-as a necessary inductive prior that incentivizes\nhumans to develop compositional languages. Specifically, we reframe\nvision-language contrastive learning as the Lewis Signaling Game between a\nvision agent and a language agent, and operationalize cultural transmission by\niteratively resetting one of the agent's weights during training. After every\niteration, this training paradigm induces representations that become \"easier\nto learn\", a property of compositional languages: e.g. 
our model trained on\nCC3M and CC12M improves standard CLIP by 4.7% and 4.0%, respectively, on the\nSugarCrepe benchmark.\n","authors":["Chenhao Zheng","Jieyu Zhang","Aniruddha Kembhavi","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.02145v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.10383v2","updated":"2024-04-17T01:05:07Z","published":"2024-04-16T08:25:36Z","title":"Learning to Score Sign Language with Two-stage Method","summary":" Human action recognition and performance assessment have been hot research\ntopics in recent years. Recognition problems have mature solutions in the field\nof sign language, but past research in performance analysis has focused on\ncompetitive sports and medical training, overlooking the scoring assessment,\nwhich is an important part of sign language teaching digitalization. In this\npaper, we analyze the existing technologies for performance assessment and\nadopt methods that perform well in human pose reconstruction tasks combined\nwith motion rotation embedded expressions, proposing a two-stage sign language\nperformance evaluation pipeline. Our analysis shows that choosing\nreconstruction tasks in the first stage can provide more expressive features,\nand using smoothing methods can provide an effective reference for assessment.\nExperiments show that our method provides good score feedback mechanisms and\nhigh consistency with professional assessments compared to end-to-end\nevaluations.\n","authors":["Hongli Wen","Yang Xu"],"pdf_url":"https://arxiv.org/pdf/2404.10383v2.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2310.11700v2","updated":"2024-04-17T01:04:07Z","published":"2023-10-18T04:15:39Z","title":"Runner re-identification from single-view running video in the\n open-world setting","summary":" In many sports, player re-identification is crucial for automatic video\nprocessing and analysis. However, most of the current studies on player\nre-identification in multi- or single-view sports videos focus on\nre-identification in the closed-world setting using labeled image datasets, and\nplayer re-identification in the open-world setting for automatic video analysis\nis not well developed. In this paper, we propose a runner re-identification\nsystem that directly processes single-view video to address the open-world\nsetting. In the open-world setting, we cannot use a labeled dataset and have to\nprocess video directly. The proposed system automatically processes raw video\nas input to identify runners, and it can identify runners even when they are\nframed out multiple times. For the automatic processing, we first detect the\nrunners in the video using the pre-trained YOLOv8 and the fine-tuned\nEfficientNet. We then track the runners using ByteTrack and detect their shoes\nwith the fine-tuned YOLOv8. Finally, we extract the image features of the\nrunners using an unsupervised method with the gated recurrent unit autoencoder\nand global and local features mixing. To improve the accuracy of runner\nre-identification, we use shoe images as local image features and dynamic\nfeatures of running sequence images. We evaluated the system on a running\npractice video dataset and showed that the proposed method identified runners\nwith higher accuracy than some state-of-the-art models in unsupervised\nre-identification. We also showed that our proposed local image feature and\nrunning dynamic feature were effective for runner re-identification. 
Our runner\nre-identification system can be useful for the automatic analysis of running\nvideos.\n","authors":["Tomohiro Suzuki","Kazushi Tsutsui","Kazuya Takeda","Keisuke Fujii"],"pdf_url":"https://arxiv.org/pdf/2310.11700v2.pdf","comment":"20 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.03557v2","updated":"2024-04-17T01:01:17Z","published":"2024-02-05T22:15:55Z","title":"Robust Analysis of Multi-Task Learning Efficiency: New Benchmarks on\n Light-Weighed Backbones and Effective Measurement of Multi-Task Learning\n Challenges by Feature Disentanglement","summary":" One of the main motivations of MTL is to develop neural networks capable of\ninferring multiple tasks simultaneously. While countless methods have been\nproposed in the past decade investigating robust model architectures and\nefficient training algorithms, there is still a lack of understanding of these\nmethods when applied on smaller feature extraction backbones, the\ngeneralizability of the commonly used fast approximation technique of replacing\nparameter-level gradients with feature-level gradients, and a lack of\ncomprehensive understanding of MTL challenges and how one can efficiently and\neffectively identify the challenges. In this paper, we focus on the\naforementioned efficiency aspects of existing MTL methods. We first carry out\nlarge-scale experiments of the methods with smaller backbones and on the\nMetaGraspNet dataset as a new test ground. We also compare the existing methods\nwith and without using the fast gradient surrogate and empirically study the\ngeneralizability of this technique. Lastly, we propose Feature Disentanglement\nmeasure as a novel and efficient identifier of the challenges in MTL, and\npropose Ranking Similarity score as an evaluation metric for different\nidentifiers to prove the faithfulness of our method.\n","authors":["Dayou Mao","Yuhao Chen","Yifan Wu","Maximilian Gilles","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2402.03557v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10966v1","updated":"2024-04-17T00:21:36Z","published":"2024-04-17T00:21:36Z","title":"Domain-Specific Block Selection and Paired-View Pseudo-Labeling for\n Online Test-Time Adaptation","summary":" Test-time adaptation (TTA) aims to adapt a pre-trained model to a new test\ndomain without access to source data after deployment. Existing approaches\ntypically rely on self-training with pseudo-labels since ground-truth cannot be\nobtained from test data. Although the quality of pseudo labels is important for\nstable and accurate long-term adaptation, it has not been previously addressed.\nIn this work, we propose DPLOT, a simple yet effective TTA framework that\nconsists of two components: (1) domain-specific block selection and (2)\npseudo-label generation using paired-view images. Specifically, we select\nblocks that involve domain-specific feature extraction and train these blocks\nby entropy minimization. After blocks are adjusted for the current test domain, we\ngenerate pseudo-labels by averaging given test images and corresponding flipped\ncounterparts. By simply using flip augmentation, we prevent a decrease in the\nquality of the pseudo-labels, which can be caused by the domain gap resulting\nfrom strong augmentation. Our experimental results demonstrate that DPLOT\noutperforms previous TTA methods in CIFAR10-C, CIFAR100-C, and ImageNet-C\nbenchmarks, reducing error by up to 5.4%, 9.1%, and 2.9%, respectively. 
Also,\nwe provide an extensive analysis to demonstrate effectiveness of our framework.\nCode is available at\nhttps://github.com/gist-ailab/domain-specific-block-selection-and-paired-view-pseudo-labeling-for-online-TTA.\n","authors":["Yeonguk Yu","Sungho Shin","Seunghyeok Back","Minhwan Ko","Sangjun Noh","Kyoobin Lee"],"pdf_url":"https://arxiv.org/pdf/2404.10966v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.11803v1","updated":"2024-04-17T23:49:00Z","published":"2024-04-17T23:49:00Z","title":"TempBEV: Improving Learned BEV Encoders with Combined Image and BEV\n Space Temporal Aggregation","summary":" Autonomous driving requires an accurate representation of the environment. A\nstrategy toward high accuracy is to fuse data from several sensors. Learned\nBird's-Eye View (BEV) encoders can achieve this by mapping data from individual\nsensors into one joint latent space. For cost-efficient camera-only systems,\nthis provides an effective mechanism to fuse data from multiple cameras with\ndifferent views. Accuracy can further be improved by aggregating sensor\ninformation over time. This is especially important in monocular camera systems\nto account for the lack of explicit depth and velocity measurements. Thereby,\nthe effectiveness of developed BEV encoders crucially depends on the operators\nused to aggregate temporal information and on the used latent representation\nspaces. We analyze BEV encoders proposed in the literature and compare their\neffectiveness, quantifying the effects of aggregation operators and latent\nrepresentations. While most existing approaches aggregate temporal information\neither in image or in BEV latent space, our analyses and performance\ncomparisons suggest that these latent representations exhibit complementary\nstrengths. Therefore, we develop a novel temporal BEV encoder, TempBEV, which\nintegrates aggregated temporal information from both latent spaces. We consider\nsubsequent image frames as stereo through time and leverage methods from\noptical flow estimation for temporal stereo encoding. Empirical evaluation on\nthe NuScenes dataset shows a significant improvement by TempBEV over the\nbaseline for 3D object detection and BEV segmentation. The ablation uncovers a\nstrong synergy of joint temporal aggregation in the image and BEV latent space.\nThese results indicate the overall effectiveness of our approach and make a\nstrong case for aggregating temporal information in both image and BEV latent\nspaces.\n","authors":["Thomas Monninger","Vandana Dokkadi","Md Zafar Anwar","Steffen Staab"],"pdf_url":"https://arxiv.org/pdf/2404.11803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11798v1","updated":"2024-04-17T23:33:34Z","published":"2024-04-17T23:33:34Z","title":"Establishing a Baseline for Gaze-driven Authentication Performance in\n VR: A Breadth-First Investigation on a Very Large Dataset","summary":" This paper performs the crucial work of establishing a baseline for\ngaze-driven authentication performance to begin answering fundamental research\nquestions using a very large dataset of gaze recordings from 9202 people with a\nlevel of eye tracking (ET) signal quality equivalent to modern consumer-facing\nvirtual reality (VR) platforms. 
The size of the employed dataset is at least an\norder-of-magnitude larger than any other dataset from previous related work.\nBinocular estimates of the optical and visual axes of the eyes and a minimum\nduration for enrollment and verification are required for our model to achieve\na false rejection rate (FRR) of below 3% at a false acceptance rate (FAR) of 1\nin 50,000. In terms of identification accuracy which decreases with gallery\nsize, we estimate that our model would fall below chance-level accuracy for\ngallery sizes of 148,000 or more. Our major findings indicate that gaze\nauthentication can be as accurate as required by the FIDO standard when driven\nby a state-of-the-art machine learning architecture and a sufficiently large\ntraining dataset.\n","authors":["Dillon Lohr","Michael J. Proulx","Oleg Komogortsev"],"pdf_url":"https://arxiv.org/pdf/2404.11798v1.pdf","comment":"28 pages, 18 figures, 5 tables, includes supplementary material"},{"id":"http://arxiv.org/abs/2404.11797v1","updated":"2024-04-17T23:30:48Z","published":"2024-04-17T23:30:48Z","title":"When are Foundation Models Effective? Understanding the Suitability for\n Pixel-Level Classification Using Multispectral Imagery","summary":" Foundation models, i.e., very large deep learning models, have demonstrated\nimpressive performances in various language and vision tasks that are otherwise\ndifficult to reach using smaller-size models. The major success of GPT-type of\nlanguage models is particularly exciting and raises expectations on the\npotential of foundation models in other domains including satellite remote\nsensing. In this context, great efforts have been made to build foundation\nmodels to test their capabilities in broader applications, and examples include\nPrithvi by NASA-IBM, Segment-Anything-Model, ViT, etc. This leads to an\nimportant question: Are foundation models always a suitable choice for\ndifferent remote sensing tasks, and when or when not? This work aims to enhance\nthe understanding of the status and suitability of foundation models for\npixel-level classification using multispectral imagery at moderate resolution,\nthrough comparisons with traditional machine learning (ML) and regular-size\ndeep learning models. Interestingly, the results reveal that in many scenarios\ntraditional ML models still have similar or better performance compared to\nfoundation models, especially for tasks where texture is less useful for\nclassification. On the other hand, deep learning models did show more promising\nresults for tasks where labels partially depend on texture (e.g., burn scar),\nwhile the difference in performance between foundation models and deep learning\nmodels is not obvious. 
The results conform with our analysis: the suitability\nof foundation models depends on the alignment between the self-supervised\nlearning tasks and the real downstream tasks, and the typical masked\nautoencoder paradigm is not necessarily suitable for many remote sensing\nproblems.\n","authors":["Yiqun Xie","Zhihao Wang","Weiye Chen","Zhili Li","Xiaowei Jia","Yanhua Li","Ruichen Wang","Kangyang Chai","Ruohan Li","Sergii Skakun"],"pdf_url":"https://arxiv.org/pdf/2404.11797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10913v2","updated":"2024-04-17T23:27:02Z","published":"2023-08-20T22:29:16Z","title":"Automated mapping of virtual environments with visual predictive coding","summary":" Humans construct internal cognitive maps of their environment directly from\nsensory inputs without access to a system of explicit coordinates or distance\nmeasurements. While machine learning algorithms like SLAM utilize specialized\nvisual inference procedures to identify visual features and construct spatial\nmaps from visual and odometry data, the general nature of cognitive maps in the\nbrain suggests a unified mapping algorithmic strategy that can generalize to\nauditory, tactile, and linguistic inputs. Here, we demonstrate that predictive\ncoding provides a natural and versatile neural network algorithm for\nconstructing spatial maps using sensory data. We introduce a framework in which\nan agent navigates a virtual environment while engaging in visual predictive\ncoding using a self-attention-equipped convolutional neural network. While\nlearning a next image prediction task, the agent automatically constructs an\ninternal representation of the environment that quantitatively reflects\ndistances. The internal map enables the agent to pinpoint its location relative\nto landmarks using only visual information. The predictive coding network\ngenerates a vectorized encoding of the environment that supports vector\nnavigation where individual latent space units delineate localized, overlapping\nneighborhoods in the environment. Broadly, our work introduces predictive\ncoding as a unified algorithmic framework for constructing cognitive maps that\ncan naturally extend to the mapping of auditory, sensorimotor, and linguistic\ninputs.\n","authors":["James Gornet","Matthew Thomson"],"pdf_url":"https://arxiv.org/pdf/2308.10913v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11795v1","updated":"2024-04-17T23:10:11Z","published":"2024-04-17T23:10:11Z","title":"Prompt-Driven Feature Diffusion for Open-World Semi-Supervised Learning","summary":" In this paper, we present a novel approach termed Prompt-Driven Feature\nDiffusion (PDFD) within a semi-supervised learning framework for Open World\nSemi-Supervised Learning (OW-SSL). At its core, PDFD deploys an efficient\nfeature-level diffusion model with the guidance of class-specific prompts to\nsupport discriminative feature representation learning and feature generation,\ntackling the challenge of the non-availability of labeled data for unseen\nclasses in OW-SSL. In particular, PDFD utilizes class prototypes as prompts in\nthe diffusion model, leveraging their class-discriminative and semantic\ngeneralization ability to condition and guide the diffusion process across all\nthe seen and unseen classes. Furthermore, PDFD incorporates a class-conditional\nadversarial loss for diffusion model training, ensuring that the features\ngenerated via the diffusion process can be discriminatively aligned with the\nclass-conditional features of the real data. 
Additionally, the class prototypes\nof the unseen classes are computed using only unlabeled instances with\nconfident predictions within a semi-supervised learning framework. We conduct\nextensive experiments to evaluate the proposed PDFD. The empirical results show\nPDFD exhibits remarkable performance enhancements over many state-of-the-art\nexisting methods.\n","authors":["Marzi Heidari","Hanping Zhang","Yuhong Guo"],"pdf_url":"https://arxiv.org/pdf/2404.11795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.03338v2","updated":"2024-04-17T23:02:39Z","published":"2022-12-06T21:42:05Z","title":"Framework-agnostic Semantically-aware Global Reasoning for Segmentation","summary":" Recent advances in pixel-level tasks (e.g. segmentation) illustrate the\nbenefit of long-range interactions between aggregated region-based\nrepresentations that can enhance local features. However, such aggregated\nrepresentations, often in the form of attention, fail to model the underlying\nsemantics of the scene (e.g. individual objects and, by extension, their\ninteractions). In this work, we address the issue by proposing a component that\nlearns to project image features into latent representations and reason between\nthem using a transformer encoder to generate contextualized and\nscene-consistent representations which are fused with original image features.\nOur design encourages the latent regions to represent semantic concepts by\nensuring that the activated regions are spatially disjoint and the union of\nsuch regions corresponds to a connected object segment. The proposed semantic\nglobal reasoning (SGR) component is end-to-end trainable and can be easily\nadded to a wide variety of backbones (CNN or transformer-based) and\nsegmentation heads (per-pixel or mask classification) to consistently improve\nthe segmentation results on different datasets. In addition, our latent tokens\nare semantically interpretable and diverse and provide a rich set of features\nthat can be transferred to downstream tasks like object detection and\nsegmentation, with improved performance. Furthermore, we also propose metrics\nto quantify the semantics of latent tokens at both class \\& instance level.\n","authors":["Mir Rayat Imtiaz Hossain","Leonid Sigal","James J. Little"],"pdf_url":"https://arxiv.org/pdf/2212.03338v2.pdf","comment":"Published in WACV 2024"},{"id":"http://arxiv.org/abs/2403.14115v2","updated":"2024-04-17T22:38:14Z","published":"2024-03-21T04:01:26Z","title":"Training point-based deep learning networks for forest segmentation with\n synthetic data","summary":" Remote sensing through unmanned aerial systems (UAS) has been increasing in\nforestry in recent years, along with using machine learning for data\nprocessing. Deep learning architectures, extensively applied in natural\nlanguage and image processing, have recently been extended to the point cloud\ndomain. However, the availability of point cloud datasets for training and\ntesting remains limited. Creating forested environment point cloud datasets is\nexpensive, requires high-precision sensors, and is time-consuming as manual\npoint classification is required. Moreover, forest areas could be inaccessible\nor dangerous for humans, further complicating data collection. Then, a question\narises as to whether it is possible to use synthetic data to train deep learning\nnetworks without the need to rely on large volumes of real forest data. 
To\nanswer this question, we developed a realistic simulator that procedurally\ngenerates synthetic forest scenes. Thanks to this, we have conducted a\ncomparative study of different state-of-the-art point-based deep learning\nnetworks for forest segmentation. Using created datasets, we determined the\nfeasibility of using synthetic data to train deep learning networks to classify\npoint clouds from real forest datasets. Both the simulator and the datasets are\nreleased as part of this work.\n","authors":["Francisco Raverta Capua","Juan Schandin","Pablo De Cristóforis"],"pdf_url":"https://arxiv.org/pdf/2403.14115v2.pdf","comment":"15 pages, 4 figures. Submitted to the International Conference on\n Pattern Recognition (ICPR) 2024"},{"id":"http://arxiv.org/abs/2404.11778v1","updated":"2024-04-17T22:02:22Z","published":"2024-04-17T22:02:22Z","title":"CU-Mamba: Selective State Space Models with Channel Learning for Image\n Restoration","summary":" Reconstructing degraded images is a critical task in image processing.\nAlthough CNN and Transformer-based models are prevalent in this field, they\nexhibit inherent limitations, such as inadequate long-range dependency modeling\nand high computational costs. To overcome these issues, we introduce the\nChannel-Aware U-Shaped Mamba (CU-Mamba) model, which incorporates a dual State\nSpace Model (SSM) framework into the U-Net architecture. CU-Mamba employs a\nSpatial SSM module for global context encoding and a Channel SSM component to\npreserve channel correlation features, both in linear computational complexity\nrelative to the feature map size. Extensive experimental results validate\nCU-Mamba's superiority over existing state-of-the-art methods, underscoring the\nimportance of integrating both spatial and channel contexts in image\nrestoration.\n","authors":["Rui Deng","Tianpei Gu"],"pdf_url":"https://arxiv.org/pdf/2404.11778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11776v1","updated":"2024-04-17T21:57:29Z","published":"2024-04-17T21:57:29Z","title":"3D object quality prediction for Metal Jet Printer with Multimodal\n thermal encoder","summary":" With the advancements in 3D printing technologies, it is extremely important\nthat the quality of 3D printed objects, and dimensional accuracies should meet\nthe customer's specifications. Various factors during metal printing affect the\nprinted parts' quality, including the power quality, the printing stage\nparameters, the print part's location inside the print bed, the curing stage\nparameters, and the metal sintering process. With the large data gathered from\nHP's MetJet printing process, AI techniques can be used to analyze, learn, and\neffectively infer the printed part quality metrics, as well as assist in\nimproving the print yield. In-situ thermal sensing data captured by\nprinter-installed thermal sensors contains the part thermal signature of fusing\nlayers. Such part thermal signature contains a convoluted impact from various\nfactors. 
In this paper, we use a multimodal thermal encoder network to fuse\ndata of a different nature including the video data vectorized printer control\ndata, and exact part thermal signatures with a trained encoder-decoder module.\nWe explored the data fusing techniques and stages for data fusing, the\noptimized end-to-end model architecture indicates an improved part quality\nprediction accuracy.\n","authors":[" Rachel"," Chen","Wenjia Zheng","Sandeep Jalui","Pavan Suri","Jun Zeng"],"pdf_url":"https://arxiv.org/pdf/2404.11776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11770v1","updated":"2024-04-17T21:53:01Z","published":"2024-04-17T21:53:01Z","title":"Event-Based Eye Tracking. AIS 2024 Challenge Survey","summary":" This survey reviews the AIS 2024 Event-Based Eye Tracking (EET) Challenge.\nThe task of the challenge focuses on processing eye movement recorded with\nevent cameras and predicting the pupil center of the eye. The challenge\nemphasizes efficient eye tracking with event cameras to achieve good task\naccuracy and efficiency trade-off. During the challenge period, 38 participants\nregistered for the Kaggle competition, and 8 teams submitted a challenge\nfactsheet. The novel and diverse methods from the submitted factsheets are\nreviewed and analyzed in this survey to advance future event-based eye tracking\nresearch.\n","authors":["Zuowen Wang","Chang Gao","Zongwei Wu","Marcos V. Conde","Radu Timofte","Shih-Chii Liu","Qinyu Chen","Zheng-jun Zha","Wei Zhai","Han Han","Bohao Liao","Yuliang Wu","Zengyu Wan","Zhong Wang","Yang Cao","Ganchao Tan","Jinze Chen","Yan Ru Pei","Sasskia Brüers","Sébastien Crouzet","Douglas McLelland","Oliver Coenen","Baoheng Zhang","Yizhao Gao","Jingyuan Li","Hayden Kwok-Hay So","Philippe Bich","Chiara Boretti","Luciano Prono","Mircea Lică","David Dinucu-Jianu","Cătălin Grîu","Xiaopeng Lin","Hongwei Ren","Bojun Cheng","Xinan Zhang","Valentin Vial","Anthony Yezzi","James Tsai"],"pdf_url":"https://arxiv.org/pdf/2404.11770v1.pdf","comment":"Qinyu Chen is the corresponding author"},{"id":"http://arxiv.org/abs/2404.11769v1","updated":"2024-04-17T21:52:21Z","published":"2024-04-17T21:52:21Z","title":"QGen: On the Ability to Generalize in Quantization Aware Training","summary":" Quantization lowers memory usage, computational requirements, and latency by\nutilizing fewer bits to represent model weights and activations. In this work,\nwe investigate the generalization properties of quantized neural networks, a\ncharacteristic that has received little attention despite its implications on\nmodel performance. In particular, first, we develop a theoretical model for\nquantization in neural networks and demonstrate how quantization functions as a\nform of regularization. Second, motivated by recent work connecting the\nsharpness of the loss landscape and generalization, we derive an approximate\nbound for the generalization of quantized models conditioned on the amount of\nquantization noise. 
We then validate our hypothesis by experimenting with over\n2000 models trained on CIFAR-10, CIFAR-100, and ImageNet datasets on\nconvolutional and transformer-based models.\n","authors":["MohammadHossein AskariHemmat","Ahmadreza Jeddi","Reyhane Askari Hemmat","Ivan Lazarevich","Alexander Hoffman","Sudhakar Sah","Ehsan Saboori","Yvon Savaria","Jean-Pierre David"],"pdf_url":"https://arxiv.org/pdf/2404.11769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11764v1","updated":"2024-04-17T21:47:45Z","published":"2024-04-17T21:47:45Z","title":"Multimodal 3D Object Detection on Unseen Domains","summary":" LiDAR datasets for autonomous driving exhibit biases in properties such as\npoint cloud density, range, and object dimensions. As a result, object\ndetection networks trained and evaluated in different environments often\nexperience performance degradation. Domain adaptation approaches assume access\nto unannotated samples from the test distribution to address this problem.\nHowever, in the real world, the exact conditions of deployment and access to\nsamples representative of the test dataset may be unavailable while training.\nWe argue that the more realistic and challenging formulation is to require\nrobustness in performance to unseen target domains. We propose to address this\nproblem in a two-pronged manner. First, we leverage paired LiDAR-image data\npresent in most autonomous driving datasets to perform multimodal object\ndetection. We suggest that working with multimodal features by leveraging both\nimages and LiDAR point clouds for scene understanding tasks results in object\ndetectors more robust to unseen domain shifts. Second, we train a 3D object\ndetector to learn multimodal object features across different distributions and\npromote feature invariance across these source domains to improve\ngeneralizability to unseen target domains. To this end, we propose\nCLIX$^\\text{3D}$, a multimodal fusion and supervised contrastive learning\nframework for 3D object detection that performs alignment of object features\nfrom same-class samples of different domains while pushing the features from\ndifferent classes apart. We show that CLIX$^\\text{3D}$ yields state-of-the-art\ndomain generalization performance under multiple dataset shifts.\n","authors":["Deepti Hegde","Suhas Lohit","Kuan-Chuan Peng","Michael J. Jones","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2404.11764v1.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2404.11762v1","updated":"2024-04-17T21:43:43Z","published":"2024-04-17T21:43:43Z","title":"IrrNet: Advancing Irrigation Mapping with Incremental Patch Size\n Training on Remote Sensing Imagery","summary":" Irrigation mapping plays a crucial role in effective water management,\nessential for preserving both water quality and quantity, and is key to\nmitigating the global issue of water scarcity. The complexity of agricultural\nfields, adorned with diverse irrigation practices, especially when multiple\nsystems coexist in close quarters, poses a unique challenge. This complexity is\nfurther compounded by the nature of Landsat's remote sensing data, where each\npixel is rich with densely packed information, complicating the task of\naccurate irrigation mapping. In this study, we introduce an innovative approach\nthat employs a progressive training method, which strategically increases patch\nsizes throughout the training process, utilizing datasets from Landsat 5 and 7,\nlabeled with the WRLU dataset for precise labeling. 
This initial focus allows\nthe model to capture detailed features, progressively shifting to broader, more\ngeneral features as the patch size enlarges. Remarkably, our method enhances\nthe performance of existing state-of-the-art models by approximately 20%.\nFurthermore, our analysis delves into the significance of incorporating various\nspectral bands into the model, assessing their impact on performance. The\nfindings reveal that additional bands are instrumental in enabling the model to\ndiscern finer details more effectively. This work sets a new standard for\nleveraging remote sensing imagery in irrigation mapping.\n","authors":["Oishee Bintey Hoque","Samarth Swarup","Abhijin Adiga","Sayjro Kossi Nouwakpo","Madhav Marathe"],"pdf_url":"https://arxiv.org/pdf/2404.11762v1.pdf","comment":"Full version of the paper will be appearing in Proceedings of the\n IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)\n Workshops, 2024"},{"id":"http://arxiv.org/abs/2302.04143v2","updated":"2024-04-17T21:20:14Z","published":"2023-02-08T15:41:21Z","title":"Predicting Thrombectomy Recanalization from CT Imaging Using Deep\n Learning Models","summary":" For acute ischemic stroke (AIS) patients with large vessel occlusions,\nclinicians must decide if the benefit of mechanical thrombectomy (MTB)\noutweighs the risks and potential complications following an invasive\nprocedure. Pre-treatment computed tomography (CT) and angiography (CTA) are\nwidely used to characterize occlusions in the brain vasculature. If a patient\nis deemed eligible, a modified treatment in cerebral ischemia (mTICI) score\nwill be used to grade how well blood flow is reestablished throughout and\nfollowing the MTB procedure. An estimation of the likelihood of successful\nrecanalization can support treatment decision-making. In this study, we\nproposed a fully automated prediction of a patient's recanalization score using\npre-treatment CT and CTA imaging. We designed a spatial cross attention network\n(SCANet) that utilizes vision transformers to localize to pertinent slices and\nbrain regions. Our top model achieved an average cross-validated ROC-AUC of\n77.33 $\\pm$ 3.9\\%. This is a promising result that supports future applications\nof deep learning on CT and CTA for the identification of eligible AIS patients\nfor MTB.\n","authors":["Haoyue Zhang","Jennifer S. Polson","Eric J. Yang","Kambiz Nael","William Speier","Corey W. Arnold"],"pdf_url":"https://arxiv.org/pdf/2302.04143v2.pdf","comment":"Medical Imaging with Deep Learning 2022 accepted short paper Jun 2022"},{"id":"http://arxiv.org/abs/2210.12100v2","updated":"2024-04-17T21:16:56Z","published":"2022-10-21T16:52:16Z","title":"Boomerang: Local sampling on image manifolds using diffusion models","summary":" The inference stage of diffusion models can be seen as running a reverse-time\ndiffusion stochastic differential equation, where samples from a Gaussian\nlatent distribution are transformed into samples from a target distribution\nthat usually reside on a low-dimensional manifold, e.g., an image manifold. The\nintermediate values between the initial latent space and the image manifold can\nbe interpreted as noisy images, with the amount of noise determined by the\nforward diffusion process noise schedule. We utilize this interpretation to\npresent Boomerang, an approach for local sampling of image manifolds. 
As\nimplied by its name, Boomerang local sampling involves adding noise to an input\nimage, moving it closer to the latent space, and then mapping it back to the\nimage manifold through a partial reverse diffusion process. Thus, Boomerang\ngenerates images on the manifold that are ``similar,'' but nonidentical, to the\noriginal input image. We can control the proximity of the generated images to\nthe original by adjusting the amount of noise added. Furthermore, due to the\nstochastic nature of the reverse diffusion process in Boomerang, the generated\nimages display a certain degree of stochasticity, allowing us to obtain local\nsamples from the manifold without encountering any duplicates. Boomerang offers\nthe flexibility to work seamlessly with any pretrained diffusion model, such as\nStable Diffusion, without necessitating any adjustments to the reverse\ndiffusion process. We present three applications for Boomerang. First, we\nprovide a framework for constructing privacy-preserving datasets having\ncontrollable degrees of anonymity. Second, we show that using Boomerang for\ndata augmentation increases generalization performance and outperforms\nstate-of-the-art synthetic data augmentation. Lastly, we introduce a perceptual\nimage enhancement framework, which enables resolution enhancement.\n","authors":["Lorenzo Luzi","Paul M Mayer","Josue Casco-Rodriguez","Ali Siahkoohi","Richard G. Baraniuk"],"pdf_url":"https://arxiv.org/pdf/2210.12100v2.pdf","comment":"Published in Transactions on Machine Learning Research"},{"id":"http://arxiv.org/abs/2404.11741v1","updated":"2024-04-17T20:48:19Z","published":"2024-04-17T20:48:19Z","title":"Diffusion Schrödinger Bridge Models for High-Quality MR-to-CT\n Synthesis for Head and Neck Proton Treatment Planning","summary":" In recent advancements in proton therapy, MR-based treatment planning is\ngaining momentum to minimize additional radiation exposure compared to\ntraditional CT-based methods. This transition highlights the critical need for\naccurate MR-to-CT image synthesis, which is essential for precise proton dose\ncalculations. Our research introduces the Diffusion Schr\\\"odinger Bridge Models\n(DSBM), an innovative approach for high-quality MR-to-CT synthesis. DSBM learns\nthe nonlinear diffusion processes between MR and CT data distributions. This\nmethod improves upon traditional diffusion models by initiating synthesis from\nthe prior distribution rather than the Gaussian distribution, enhancing both\ngeneration quality and efficiency. We validated the effectiveness of DSBM on a\nhead and neck cancer dataset, demonstrating its superiority over traditional\nimage synthesis methods through both image-level and dosimetric-level\nevaluations. The effectiveness of DSBM in MR-based proton treatment planning\nhighlights its potential as a valuable tool in various clinical scenarios.\n","authors":["Muheng Li","Xia Li","Sairos Safai","Damien Weber","Antony Lomax","Ye Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.11741v1.pdf","comment":"International Conference on the use of Computers in Radiation therapy\n (ICCR)"},{"id":"http://arxiv.org/abs/2404.11737v1","updated":"2024-04-17T20:41:49Z","published":"2024-04-17T20:41:49Z","title":"Equivariant Spatio-Temporal Self-Supervision for LiDAR Object Detection","summary":" Popular representation learning methods encourage feature invariance under\ntransformations applied at the input. 
However, in 3D perception tasks like\nobject localization and segmentation, outputs are naturally equivariant to some\ntransformations, such as rotation. Using pre-training loss functions that\nencourage equivariance of features under certain transformations provides a\nstrong self-supervision signal while also retaining information of geometric\nrelationships between transformed feature representations. This can enable\nimproved performance in downstream tasks that are equivariant to such\ntransformations. In this paper, we propose a spatio-temporal equivariant\nlearning framework by considering both spatial and temporal augmentations\njointly. Our experiments show that the best performance arises with a\npre-training approach that encourages equivariance to translation, scaling, and\nflip, rotation and scene flow. For spatial augmentations, we find that\ndepending on the transformation, either a contrastive objective or an\nequivariance-by-classification objective yields best results. To leverage\nreal-world object deformations and motion, we consider sequential LiDAR scene\npairs and develop a novel 3D scene flow-based equivariance objective that leads\nto improved performance overall. We show our pre-training method for 3D object\ndetection which outperforms existing equivariant and invariant approaches in\nmany settings.\n","authors":["Deepti Hegde","Suhas Lohit","Kuan-Chuan Peng","Michael J. Jones","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2404.11737v1.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2404.11735v1","updated":"2024-04-17T20:37:29Z","published":"2024-04-17T20:37:29Z","title":"Learning with 3D rotations, a hitchhiker's guide to SO(3)","summary":" Many settings in machine learning require the selection of a rotation\nrepresentation. However, choosing a suitable representation from the many\navailable options is challenging. This paper acts as a survey and guide through\nrotation representations. We walk through their properties that harm or benefit\ndeep learning with gradient-based optimization. By consolidating insights from\nrotation-based learning, we provide a comprehensive overview of learning\nfunctions with rotation representations. We provide guidance on selecting\nrepresentations based on whether rotations are in the model's input or output\nand whether the data primarily comprises small angles.\n","authors":["A. René Geist","Jonas Frey","Mikel Zobro","Anna Levina","Georg Martius"],"pdf_url":"https://arxiv.org/pdf/2404.11735v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11732v1","updated":"2024-04-17T20:35:00Z","published":"2024-04-17T20:35:00Z","title":"Visual Prompting for Generalized Few-shot Segmentation: A Multi-scale\n Approach","summary":" The emergence of attention-based transformer models has led to their\nextensive use in various tasks, due to their superior generalization and\ntransfer properties. Recent research has demonstrated that such models, when\nprompted appropriately, are excellent for few-shot inference. However, such\ntechniques are under-explored for dense prediction tasks like semantic\nsegmentation. In this work, we examine the effectiveness of prompting a\ntransformer-decoder with learned visual prompts for the generalized few-shot\nsegmentation (GFSS) task. Our goal is to achieve strong performance not only on\nnovel categories with limited examples, but also to retain performance on base\ncategories. We propose an approach to learn visual prompts with limited\nexamples. 
These learned visual prompts are used to prompt a multiscale\ntransformer decoder to facilitate accurate dense predictions. Additionally, we\nintroduce a unidirectional causal attention mechanism between the novel\nprompts, learned with limited examples, and the base prompts, learned with\nabundant data. This mechanism enriches the novel prompts without deteriorating\nthe base class performance. Overall, this form of prompting helps us achieve\nstate-of-the-art performance for GFSS on two different benchmark datasets:\nCOCO-$20^i$ and Pascal-$5^i$, without the need for test-time optimization (or\ntransduction). Furthermore, test-time optimization leveraging unlabelled test\ndata can be used to improve the prompts, which we refer to as transductive\nprompt tuning.\n","authors":["Mir Rayat Imtiaz Hossain","Mennatullah Siam","Leonid Sigal","James J. Little"],"pdf_url":"https://arxiv.org/pdf/2404.11732v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2401.06341v2","updated":"2024-04-17T20:33:56Z","published":"2024-01-12T03:21:02Z","title":"AffordanceLLM: Grounding Affordance from Vision Language Models","summary":" Affordance grounding refers to the task of finding the area of an object with\nwhich one can interact. It is a fundamental but challenging task, as a\nsuccessful solution requires the comprehensive understanding of a scene in\nmultiple aspects including detection, localization, and recognition of objects\nwith their parts, of geo-spatial configuration/layout of the scene, of 3D\nshapes and physics, as well as of the functionality and potential interaction\nof the objects and humans. Much of the knowledge is hidden and beyond the image\ncontent with the supervised labels from a limited training set. In this paper,\nwe make an attempt to improve the generalization capability of the current\naffordance grounding by taking the advantage of the rich world, abstract, and\nhuman-object-interaction knowledge from pretrained large-scale vision language\nmodels. Under the AGD20K benchmark, our proposed model demonstrates a\nsignificant performance gain over the competing methods for in-the-wild object\naffordance grounding. We further demonstrate it can ground affordance for\nobjects from random Internet images, even if both objects and actions are\nunseen during training. Project site: https://jasonqsy.github.io/AffordanceLLM/\n","authors":["Shengyi Qian","Weifeng Chen","Min Bai","Xiong Zhou","Zhuowen Tu","Li Erran Li"],"pdf_url":"https://arxiv.org/pdf/2401.06341v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11727v1","updated":"2024-04-17T20:28:15Z","published":"2024-04-17T20:28:15Z","title":"Deep Learning for Video-Based Assessment of Endotracheal Intubation\n Skills","summary":" Endotracheal intubation (ETI) is an emergency procedure performed in civilian\nand combat casualty care settings to establish an airway. Objective and\nautomated assessment of ETI skills is essential for the training and\ncertification of healthcare providers. However, the current approach is based\non manual feedback by an expert, which is subjective, time- and\nresource-intensive, and is prone to poor inter-rater reliability and halo\neffects. This work proposes a framework to evaluate ETI skills using single and\nmulti-view videos. The framework consists of two stages. First, a 2D\nconvolutional autoencoder (AE) and a pre-trained self-supervision network\nextract features from videos. 
Second, a 1D convolutional enhanced with a\ncross-view attention module takes the features from the AE as input and outputs\npredictions for skill evaluation. The ETI datasets were collected in two\nphases. In the first phase, ETI is performed by two subject cohorts: Experts\nand Novices. In the second phase, novice subjects perform ETI under time\npressure, and the outcome is either Successful or Unsuccessful. A third dataset\nof videos from a single head-mounted camera for Experts and Novices is also\nanalyzed. The study achieved an accuracy of 100% in identifying Expert/Novice\ntrials in the initial phase. In the second phase, the model showed 85% accuracy\nin classifying Successful/Unsuccessful procedures. Using head-mounted cameras\nalone, the model showed a 96% accuracy on Expert and Novice classification\nwhile maintaining an accuracy of 85% on classifying successful and\nunsuccessful. In addition, GradCAMs are presented to explain the differences\nbetween Expert and Novice behavior and Successful and Unsuccessful trials. The\napproach offers a reliable and objective method for automated assessment of ETI\nskills.\n","authors":["Jean-Paul Ainam","Erim Yanik","Rahul Rahul","Taylor Kunkes","Lora Cavuoto","Brian Clemency","Kaori Tanaka","Matthew Hackett","Jack Norfleet","Suvranu De"],"pdf_url":"https://arxiv.org/pdf/2404.11727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11725v1","updated":"2024-04-17T20:23:07Z","published":"2024-04-17T20:23:07Z","title":"Postoperative glioblastoma segmentation: Development of a fully\n automated pipeline using deep convolutional neural networks and comparison\n with currently available models","summary":" Accurately assessing tumor removal is paramount in the management of\nglioblastoma. We developed a pipeline using MRI scans and neural networks to\nsegment tumor subregions and the surgical cavity in postoperative images. Our\nmodel excels in accurately classifying the extent of resection, offering a\nvaluable tool for clinicians in assessing treatment effectiveness.\n","authors":["Santiago Cepeda","Roberto Romero","Daniel Garcia-Perez","Guillermo Blasco","Luigi Tommaso Luppino","Samuel Kuttner","Ignacio Arrese","Ole Solheim","Live Eikenes","Anna Karlberg","Angel Perez-Nunez","Trinidad Escudero","Roberto Hornero","Rosario Sarabia"],"pdf_url":"https://arxiv.org/pdf/2404.11725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.14176v2","updated":"2024-04-17T20:18:45Z","published":"2023-03-24T17:38:45Z","title":"A Hybrid ANN-SNN Architecture for Low-Power and Low-Latency Visual\n Perception","summary":" Spiking Neural Networks (SNN) are a class of bio-inspired neural networks\nthat promise to bring low-power and low-latency inference to edge devices\nthrough asynchronous and sparse processing. However, being temporal models,\nSNNs depend heavily on expressive states to generate predictions on par with\nclassical artificial neural networks (ANNs). These states converge only after\nlong transient periods, and quickly decay without input data, leading to higher\nlatency, power consumption, and lower accuracy. This work addresses this issue\nby initializing the state with an auxiliary ANN running at a low rate. The SNN\nthen uses the state to generate predictions with high temporal resolution until\nthe next initialization phase. 
Our hybrid ANN-SNN model thus combines the best\nof both worlds: It does not suffer from long state transients and state decay\nthanks to the ANN, and can generate predictions with high temporal resolution,\nlow latency, and low power thanks to the SNN. We show for the task of\nevent-based 2D and 3D human pose estimation that our method consumes 88% less\npower with only a 4% decrease in performance compared to its fully ANN\ncounterparts when run at the same inference rate. Moreover, when compared to\nSNNs, our method achieves a 74% lower error. This research thus provides a new\nunderstanding of how ANNs and SNNs can be used to maximize their respective\nbenefits.\n","authors":["Asude Aydin","Mathias Gehrig","Daniel Gehrig","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2303.14176v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08888v2","updated":"2024-04-17T19:32:47Z","published":"2023-12-13T13:11:44Z","title":"Read Between the Layers: Leveraging Intra-Layer Representations for\n Rehearsal-Free Continual Learning with Pre-Trained Models","summary":" We address the Continual Learning (CL) problem, wherein a model must learn a\nsequence of tasks from non-stationary distributions while preserving prior\nknowledge upon encountering new experiences. With the advancement of foundation\nmodels, CL research has pivoted from the initial learning-from-scratch paradigm\ntowards utilizing generic features from large-scale pre-training. However,\nexisting approaches to CL with pre-trained models primarily focus on separating\nclass-specific features from the final representation layer and neglect the\npotential of intermediate representations to capture low- and mid-level\nfeatures, which are more invariant to domain shifts. In this work, we propose\nLayUP, a new prototype-based approach to continual learning that leverages\nsecond-order feature statistics from multiple intermediate layers of a\npre-trained network. Our method is conceptually simple, does not require access\nto prior data, and works out of the box with any foundation model. LayUP\nsurpasses the state of the art in four of the seven class-incremental learning\nbenchmarks, all three domain-incremental learning benchmarks and in six of the\nseven online continual learning benchmarks, while significantly reducing memory\nand computational requirements compared to existing baselines. Our results\ndemonstrate that fully exhausting the representational capacities of\npre-trained models in CL goes well beyond their final embeddings.\n","authors":["Kyra Ahrens","Hans Hergen Lehmann","Jae Hee Lee","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2312.08888v2.pdf","comment":"Preprint under review"},{"id":"http://arxiv.org/abs/2402.17177v3","updated":"2024-04-17T18:41:39Z","published":"2024-02-27T03:30:58Z","title":"Sora: A Review on Background, Technology, Limitations, and Opportunities\n of Large Vision Models","summary":" Sora is a text-to-video generative AI model, released by OpenAI in February\n2024. The model is trained to generate videos of realistic or imaginative\nscenes from text instructions and show potential in simulating the physical\nworld. Based on public technical reports and reverse engineering, this paper\npresents a comprehensive review of the model's background, related\ntechnologies, applications, remaining challenges, and future directions of\ntext-to-video AI models. We first trace Sora's development and investigate the\nunderlying technologies used to build this \"world simulator\". 
Then, we describe\nin detail the applications and potential impact of Sora in multiple industries\nranging from film-making and education to marketing. We discuss the main\nchallenges and limitations that need to be addressed to widely deploy Sora,\nsuch as ensuring safe and unbiased video generation. Lastly, we discuss the\nfuture development of Sora and video generation models in general, and how\nadvancements in the field could enable new ways of human-AI interaction,\nboosting productivity and creativity of video generation.\n","authors":["Yixin Liu","Kai Zhang","Yuan Li","Zhiling Yan","Chujie Gao","Ruoxi Chen","Zhengqing Yuan","Yue Huang","Hanchi Sun","Jianfeng Gao","Lifang He","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2402.17177v3.pdf","comment":"37 pages, 18 figures; GitHub:\n https://github.com/lichao-sun/SoraReview"},{"id":"http://arxiv.org/abs/2404.11683v1","updated":"2024-04-17T18:29:32Z","published":"2024-04-17T18:29:32Z","title":"Unifying Scene Representation and Hand-Eye Calibration with 3D\n Foundation Models","summary":" Representing the environment is a central challenge in robotics, and is\nessential for effective decision-making. Traditionally, before capturing images\nwith a manipulator-mounted camera, users need to calibrate the camera using a\nspecific external marker, such as a checkerboard or AprilTag. However, recent\nadvances in computer vision have led to the development of \\emph{3D foundation\nmodels}. These are large, pre-trained neural networks that can establish fast\nand accurate multi-view correspondences with very few images, even in the\nabsence of rich visual features. This paper advocates for the integration of 3D\nfoundation models into scene representation approaches for robotic systems\nequipped with manipulator-mounted RGB cameras. Specifically, we propose the\nJoint Calibration and Representation (JCR) method. JCR uses RGB images,\ncaptured by a manipulator-mounted camera, to simultaneously construct an\nenvironmental representation and calibrate the camera relative to the robot's\nend-effector, in the absence of specific calibration markers. The resulting 3D\nenvironment representation is aligned with the robot's coordinate frame and\nmaintains physically accurate scales. We demonstrate that JCR can build\neffective scene representations using a low-cost RGB camera attached to a\nmanipulator, without prior calibration.\n","authors":["Weiming Zhi","Haozhan Tang","Tianyi Zhang","Matthew Johnson-Roberson"],"pdf_url":"https://arxiv.org/pdf/2404.11683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11669v1","updated":"2024-04-17T18:08:00Z","published":"2024-04-17T18:08:00Z","title":"Factorized Motion Fields for Fast Sparse Input Dynamic View Synthesis","summary":" Designing a 3D representation of a dynamic scene for fast optimization and\nrendering is a challenging task. While recent explicit representations enable\nfast learning and rendering of dynamic radiance fields, they require a dense\nset of input viewpoints. In this work, we focus on learning a fast\nrepresentation for dynamic radiance fields with sparse input viewpoints.\nHowever, the optimization with sparse input is under-constrained and\nnecessitates the use of motion priors to constrain the learning. Existing fast\ndynamic scene models do not explicitly model the motion, making them difficult\nto be constrained with motion priors. 
We design an explicit motion model as a\nfactorized 4D representation that is fast and can exploit the spatio-temporal\ncorrelation of the motion field. We then introduce reliable flow priors\nincluding a combination of sparse flow priors across cameras and dense flow\npriors within cameras to regularize our motion model. Our model is fast,\ncompact and achieves very good performance on popular multi-view dynamic scene\ndatasets with sparse input viewpoints. The source code for our model can be\nfound on our project page:\nhttps://nagabhushansn95.github.io/publications/2024/RF-DeRF.html.\n","authors":["Nagabhushan Somraj","Kapil Choudhary","Sai Harsha Mupparaju","Rajiv Soundararajan"],"pdf_url":"https://arxiv.org/pdf/2404.11669v1.pdf","comment":"Accepted at SIGGRAPH 2024"},{"id":"http://arxiv.org/abs/2404.11667v1","updated":"2024-04-17T18:04:37Z","published":"2024-04-17T18:04:37Z","title":"Deep Dependency Networks and Advanced Inference Schemes for Multi-Label\n Classification","summary":" We present a unified framework called deep dependency networks (DDNs) that\ncombines dependency networks and deep learning architectures for multi-label\nclassification, with a particular emphasis on image and video data. The primary\nadvantage of dependency networks is their ease of training, in contrast to\nother probabilistic graphical models like Markov networks. In particular, when\ncombined with deep learning architectures, they provide an intuitive,\neasy-to-use loss function for multi-label classification. A drawback of DDNs\ncompared to Markov networks is their lack of advanced inference schemes,\nnecessitating the use of Gibbs sampling. To address this challenge, we propose\nnovel inference schemes based on local search and integer linear programming\nfor computing the most likely assignment to the labels given observations. We\nevaluate our novel methods on three video datasets (Charades, TACoS, Wetlab)\nand three image datasets (MS-COCO, PASCAL VOC, NUS-WIDE), comparing their\nperformance with (a) basic neural architectures and (b) neural architectures\ncombined with Markov networks equipped with advanced inference and learning\ntechniques. Our results demonstrate the superiority of our new DDN methods over\nthe two competing approaches.\n","authors":["Shivvrat Arya","Yu Xiang","Vibhav Gogate"],"pdf_url":"https://arxiv.org/pdf/2404.11667v1.pdf","comment":"Will appear in AISTATS 2024. arXiv admin note: substantial text\n overlap with arXiv:2302.00633"},{"id":"http://arxiv.org/abs/2404.12163v1","updated":"2024-04-17T17:38:54Z","published":"2024-04-17T17:38:54Z","title":"Unsupervised Microscopy Video Denoising","summary":" In this paper, we introduce a novel unsupervised network to denoise\nmicroscopy videos featured by image sequences captured by a fixed location\nmicroscopy camera. Specifically, we propose a DeepTemporal Interpolation\nmethod, leveraging a temporal signal filter integrated into the bottom CNN\nlayers, to restore microscopy videos corrupted by unknown noise types. Our\nunsupervised denoising architecture is distinguished by its ability to adapt to\nmultiple noise conditions without the need for pre-existing noise distribution\nknowledge, addressing a significant challenge in real-world medical\napplications. Furthermore, we evaluate our denoising framework using both real\nmicroscopy recordings and simulated data, validating our outperforming video\ndenoising performance across a broad spectrum of noise scenarios. 
Extensive\nexperiments demonstrate that our unsupervised model consistently outperforms\nstate-of-the-art supervised and unsupervised video denoising techniques,\nproving especially effective for microscopy videos.\n","authors":["Mary Aiyetigbo","Alexander Korte","Ethan Anderson","Reda Chalhoub","Peter Kalivas","Feng Luo","Nianyi Li"],"pdf_url":"https://arxiv.org/pdf/2404.12163v1.pdf","comment":"Accepted at CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.12142v1","updated":"2024-04-17T16:50:14Z","published":"2024-04-17T16:50:14Z","title":"SDIP: Self-Reinforcement Deep Image Prior Framework for Image Processing","summary":" Deep image prior (DIP) proposed in recent research has revealed the inherent\ntrait of convolutional neural networks (CNN) for capturing substantial\nlow-level image statistics priors. This framework efficiently addresses the\ninverse problems in image processing and has induced extensive applications in\nvarious domains. However, as the whole algorithm is initialized randomly, the\nDIP algorithm often lacks stability. Thus, this method still has space for\nfurther improvement. In this paper, we propose the self-reinforcement deep\nimage prior (SDIP) as an improved version of the original DIP. We observed that\nthe changes in the DIP networks' input and output are highly correlated during\neach iteration. SDIP efficiently utilizes this trait in a reinforcement\nlearning manner, where the current iteration's output is utilized by a steering\nalgorithm to update the network input for the next iteration, guiding the\nalgorithm toward improved results. Experimental results across multiple\napplications demonstrate that our proposed SDIP framework offers improvement\ncompared to the original DIP method and other state-of-the-art methods.\n","authors":["Ziyu Shu","Zhixin Pan"],"pdf_url":"https://arxiv.org/pdf/2404.12142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12144v1","updated":"2024-04-17T12:37:25Z","published":"2024-04-17T12:37:25Z","title":"Mushroom Segmentation and 3D Pose Estimation from Point Clouds using\n Fully Convolutional Geometric Features and Implicit Pose Encoding","summary":" Modern agricultural applications rely more and more on deep learning\nsolutions. However, training well-performing deep networks requires a large\namount of annotated data that may not be available and in the case of 3D\nannotation may not even be feasible for human annotators. In this work, we\ndevelop a deep learning approach to segment mushrooms and estimate their pose\non 3D data, in the form of point clouds acquired by depth sensors. To\ncircumvent the annotation problem, we create a synthetic dataset of mushroom\nscenes, where we are fully aware of 3D information, such as the pose of each\nmushroom. The proposed network has a fully convolutional backbone, that parses\nsparse 3D data, and predicts pose information that implicitly defines both\ninstance segmentation and pose estimation task. We have validated the\neffectiveness of the proposed implicit-based approach for a synthetic test set,\nas well as provided qualitative results for a small set of real acquired point\nclouds with depth sensors. 
Code is publicly available at\nhttps://github.com/georgeretsi/mushroom-pose.\n","authors":["George Retsinas","Niki Efthymiou","Petros Maragos"],"pdf_url":"https://arxiv.org/pdf/2404.12144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12415v1","updated":"2024-04-17T17:57:20Z","published":"2024-04-17T17:57:20Z","title":"Soil Fertility Prediction Using Combined USB-microscope Based Soil\n Image, Auxiliary Variables, and Portable X-Ray Fluorescence Spectrometry","summary":" This study explored the application of portable X-ray fluorescence (PXRF)\nspectrometry and soil image analysis to rapidly assess soil fertility, focusing\non critical parameters such as available B, organic carbon (OC), available Mn,\navailable S, and the sulfur availability index (SAI). Analyzing 1,133 soil\nsamples from various agro-climatic zones in Eastern India, the research\ncombined color and texture features from microscopic soil images, PXRF data,\nand auxiliary soil variables (AVs) using a Random Forest model. Results\nindicated that integrating image features (IFs) with auxiliary variables (AVs)\nsignificantly enhanced prediction accuracy for available B (R^2 = 0.80) and OC\n(R^2 = 0.88). A data fusion approach, incorporating IFs, AVs, and PXRF data,\nfurther improved predictions for available Mn and SAI with R^2 values of 0.72\nand 0.70, respectively. The study demonstrated how these integrated\ntechnologies have the potential to provide quick and affordable options for\nsoil testing, opening up access to more sophisticated prediction models and a\nbetter comprehension of the fertility and health of the soil. Future research\nshould focus on the application of deep learning models on a larger dataset of\nsoil images, developed using soils from a broader range of agro-climatic zones\nunder field condition.\n","authors":["Shubhadip Dasgupta","Satwik Pate","Divya Rathore","L. G. Divyanth","Ayan Das","Anshuman Nayak","Subhadip Dey","Asim Biswas","David C. Weindorf","Bin Li","Sergio Henrique Godinho Silva","Bruno Teixeira Ribeiro","Sanjay Srivastava","Somsubhra Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2404.12415v1.pdf","comment":"37 pages, 10 figures; manuscript under peer-review for publication in\n the jounral 'Computers and Electronics in Agriculture'"},{"id":"http://arxiv.org/abs/2404.11565v1","updated":"2024-04-17T17:08:05Z","published":"2024-04-17T17:08:05Z","title":"MoA: Mixture-of-Attention for Subject-Context Disentanglement in\n Personalized Image Generation","summary":" We introduce a new architecture for personalization of text-to-image\ndiffusion models, coined Mixture-of-Attention (MoA). Inspired by the\nMixture-of-Experts mechanism utilized in large language models (LLMs), MoA\ndistributes the generation workload between two attention pathways: a\npersonalized branch and a non-personalized prior branch. MoA is designed to\nretain the original model's prior by fixing its attention layers in the prior\nbranch, while minimally intervening in the generation process with the\npersonalized branch that learns to embed subjects in the layout and context\ngenerated by the prior branch. A novel routing mechanism manages the\ndistribution of pixels in each layer across these branches to optimize the\nblend of personalized and generic content creation. Once trained, MoA\nfacilitates the creation of high-quality, personalized images featuring\nmultiple subjects with compositions and interactions as diverse as those\ngenerated by the original model. 
Crucially, MoA enhances the distinction\nbetween the model's pre-existing capability and the newly augmented\npersonalized intervention, thereby offering a more disentangled subject-context\ncontrol that was previously unattainable. Project page:\nhttps://snap-research.github.io/mixture-of-attention\n","authors":["Kuan-Chieh Wang","Daniil Ostashev","Yuwei Fang","Sergey Tulyakov","Kfir Aberman"],"pdf_url":"https://arxiv.org/pdf/2404.11565v1.pdf","comment":"Project Website: https://snap-research.github.io/mixture-of-attention"}]},"2024-04-18T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.12391v1","updated":"2024-04-18T17:59:58Z","published":"2024-04-18T17:59:58Z","title":"On the Content Bias in Fréchet Video Distance","summary":" Fr\\'echet Video Distance (FVD), a prominent metric for evaluating video\ngeneration models, is known to conflict with human perception occasionally. In\nthis paper, we aim to explore the extent of FVD's bias toward per-frame quality\nover temporal realism and identify its sources. We first quantify the FVD's\nsensitivity to the temporal axis by decoupling the frame and motion quality and\nfind that the FVD increases only slightly with large temporal corruption. We\nthen analyze the generated videos and show that via careful sampling from a\nlarge set of generated videos that do not contain motions, one can drastically\ndecrease FVD without improving the temporal quality. Both studies suggest FVD's\nbias towards the quality of individual frames. We further observe that the bias\ncan be attributed to the features extracted from a supervised video classifier\ntrained on the content-biased dataset. We show that FVD with features extracted\nfrom the recent large-scale self-supervised video models is less biased toward\nimage quality. Finally, we revisit a few real-world examples to validate our\nhypothesis.\n","authors":["Songwei Ge","Aniruddha Mahapatra","Gaurav Parmar","Jun-Yan Zhu","Jia-Bin Huang"],"pdf_url":"https://arxiv.org/pdf/2404.12391v1.pdf","comment":"CVPR 2024. Project webpage: https://content-debiased-fvd.github.io/"},{"id":"http://arxiv.org/abs/2404.01300v2","updated":"2024-04-18T17:59:57Z","published":"2024-04-01T17:59:55Z","title":"NeRF-MAE: Masked AutoEncoders for Self-Supervised 3D Representation\n Learning for Neural Radiance Fields","summary":" Neural fields excel in computer vision and robotics due to their ability to\nunderstand the 3D visual world such as inferring semantics, geometry, and\ndynamics. Given the capabilities of neural fields in densely representing a 3D\nscene from 2D images, we ask the question: Can we scale their self-supervised\npretraining, specifically using masked autoencoders, to generate effective 3D\nrepresentations from posed RGB images. Owing to the astounding success of\nextending transformers to novel data modalities, we employ standard 3D Vision\nTransformers to suit the unique formulation of NeRFs. We leverage NeRF's\nvolumetric grid as a dense input to the transformer, contrasting it with other\n3D representations such as pointclouds where the information density can be\nuneven, and the representation is irregular. Due to the difficulty of applying\nmasked autoencoders to an implicit representation, such as NeRF, we opt for\nextracting an explicit representation that canonicalizes scenes across domains\nby employing the camera trajectory for sampling. 
Our goal is made possible by\nmasking random patches from NeRF's radiance and density grid and employing a\nstandard 3D Swin Transformer to reconstruct the masked patches. In doing so,\nthe model can learn the semantic and spatial structure of complete scenes. We\npretrain this representation at scale on our proposed curated posed-RGB data,\ntotaling over 1.6 million images. Once pretrained, the encoder is used for\neffective 3D transfer learning. Our novel self-supervised pretraining for\nNeRFs, NeRF-MAE, scales remarkably well and improves performance on various\nchallenging 3D tasks. Utilizing unlabeled posed 2D data for pretraining,\nNeRF-MAE significantly outperforms self-supervised 3D pretraining and NeRF\nscene understanding baselines on Front3D and ScanNet datasets with an absolute\nperformance improvement of over 20% AP50 and 8% AP25 for 3D object detection.\n","authors":["Muhammad Zubair Irshad","Sergey Zakahrov","Vitor Guizilini","Adrien Gaidon","Zsolt Kira","Rares Ambrus"],"pdf_url":"https://arxiv.org/pdf/2404.01300v2.pdf","comment":"29 pages, 13 figures. Project Page: https://nerf-mae.github.io/"},{"id":"http://arxiv.org/abs/2404.12390v1","updated":"2024-04-18T17:59:54Z","published":"2024-04-18T17:59:54Z","title":"BLINK: Multimodal Large Language Models Can See but Not Perceive","summary":" We introduce Blink, a new benchmark for multimodal language models (LLMs)\nthat focuses on core visual perception abilities not found in other\nevaluations. Most of the Blink tasks can be solved by humans \"within a blink\"\n(e.g., relative depth estimation, visual correspondence, forensics detection,\nand multi-view reasoning). However, we find these perception-demanding tasks\ncast significant challenges for current multimodal LLMs because they resist\nmediation through natural language. Blink reformats 14 classic computer vision\ntasks into 3,807 multiple-choice questions, paired with single or multiple\nimages and visual prompting. While humans get 95.70% accuracy on average, Blink\nis surprisingly challenging for existing multimodal LLMs: even the\nbest-performing GPT-4V and Gemini achieve accuracies of 51.26% and 45.72%, only\n13.17% and 7.63% higher than random guessing, indicating that such perception\nabilities have not \"emerged\" yet in recent multimodal LLMs. Our analysis also\nhighlights that specialist CV models could solve these problems much better,\nsuggesting potential pathways for future improvements. We believe Blink will\nstimulate the community to help multimodal LLMs catch up with human-level\nvisual perception.\n","authors":["Xingyu Fu","Yushi Hu","Bangzheng Li","Yu Feng","Haoyu Wang","Xudong Lin","Dan Roth","Noah A. Smith","Wei-Chiu Ma","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.12390v1.pdf","comment":"Multimodal Benchmark, Project Url: https://zeyofu.github.io/blink/"},{"id":"http://arxiv.org/abs/2404.12388v1","updated":"2024-04-18T17:59:53Z","published":"2024-04-18T17:59:53Z","title":"VideoGigaGAN: Towards Detail-rich Video Super-Resolution","summary":" Video super-resolution (VSR) approaches have shown impressive temporal\nconsistency in upsampled videos. However, these approaches tend to generate\nblurrier results than their image counterparts as they are limited in their\ngenerative capability. This raises a fundamental question: can we extend the\nsuccess of a generative image upsampler to the VSR task while preserving the\ntemporal consistency? 
We introduce VideoGigaGAN, a new generative VSR model\nthat can produce videos with high-frequency details and temporal consistency.\nVideoGigaGAN builds upon a large-scale image upsampler -- GigaGAN. Simply\ninflating GigaGAN to a video model by adding temporal modules produces severe\ntemporal flickering. We identify several key issues and propose techniques that\nsignificantly improve the temporal consistency of upsampled videos. Our\nexperiments show that, unlike previous VSR methods, VideoGigaGAN generates\ntemporally consistent videos with more fine-grained appearance details. We\nvalidate the effectiveness of VideoGigaGAN by comparing it with\nstate-of-the-art VSR models on public datasets and showcasing video results\nwith $8\\times$ super-resolution.\n","authors":["Yiran Xu","Taesung Park","Richard Zhang","Yang Zhou","Eli Shechtman","Feng Liu","Jia-Bin Huang","Difan Liu"],"pdf_url":"https://arxiv.org/pdf/2404.12388v1.pdf","comment":"project page: https://videogigagan.github.io/"},{"id":"http://arxiv.org/abs/2404.12389v1","updated":"2024-04-18T17:59:53Z","published":"2024-04-18T17:59:53Z","title":"Moving Object Segmentation: All You Need Is SAM (and Flow)","summary":" The objective of this paper is motion segmentation -- discovering and\nsegmenting the moving objects in a video. This is a much studied area with\nnumerous careful,and sometimes complex, approaches and training schemes\nincluding: self-supervised learning, learning from synthetic datasets,\nobject-centric representations, amodal representations, and many more. Our\ninterest in this paper is to determine if the Segment Anything model (SAM) can\ncontribute to this task. We investigate two models for combining SAM with\noptical flow that harness the segmentation power of SAM with the ability of\nflow to discover and group moving objects. In the first model, we adapt SAM to\ntake optical flow, rather than RGB, as an input. In the second, SAM takes RGB\nas an input, and flow is used as a segmentation prompt. These surprisingly\nsimple methods, without any further modifications, outperform all previous\napproaches by a considerable margin in both single and multi-object benchmarks.\nWe also extend these frame-level segmentations to sequence-level segmentations\nthat maintain object identity. Again, this simple model outperforms previous\nmethods on multiple video object segmentation benchmarks.\n","authors":["Junyu Xie","Charig Yang","Weidi Xie","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2404.12389v1.pdf","comment":"Project Page: https://www.robots.ox.ac.uk/~vgg/research/flowsam/"},{"id":"http://arxiv.org/abs/2404.12387v1","updated":"2024-04-18T17:59:48Z","published":"2024-04-18T17:59:48Z","title":"Reka Core, Flash, and Edge: A Series of Powerful Multimodal Language\n Models","summary":" We introduce Reka Core, Flash, and Edge, a series of powerful multimodal\nlanguage models trained from scratch by Reka. Reka models are able to process\nand reason with text, images, video, and audio inputs. This technical report\ndiscusses details of training some of these models and provides comprehensive\nevaluation results. We show that Reka Edge and Reka Flash are not only\nstate-of-the-art but also outperform many much larger models, delivering\noutsized values for their respective compute class. Meanwhile, our most capable\nand largest model, Reka Core, approaches the best frontier models on both\nautomatic evaluations and blind human evaluations. On image question answering\nbenchmarks (e.g. 
MMMU, VQAv2), Core performs competitively to GPT4-V.\nMeanwhile, on multimodal chat, Core ranks as the second most preferred model\nunder a blind third-party human evaluation setup, outperforming other models\nsuch as Claude 3 Opus. On text benchmarks, Core not only performs competitively\nto other frontier models on a set of well-established benchmarks (e.g. MMLU,\nGSM8K) but also outperforms GPT4-0613 on human evaluation. On video question\nanswering (Perception-Test), Core outperforms Gemini Ultra. Models are shipped\nin production at http://chat.reka.ai . A showcase of non cherry picked\nqualitative examples can also be found at http://showcase.reka.ai .\n","authors":["Aitor Ormazabal","Che Zheng","Cyprien de Masson d'Autume","Dani Yogatama","Deyu Fu","Donovan Ong","Eric Chen","Eugenie Lamprecht","Hai Pham","Isaac Ong","Kaloyan Aleksiev","Lei Li","Matthew Henderson","Max Bain","Mikel Artetxe","Nishant Relan","Piotr Padlewski","Qi Liu","Ren Chen","Samuel Phua","Yazheng Yang","Yi Tay","Yuqi Wang","Zhongkai Zhu","Zhihui Xie"],"pdf_url":"https://arxiv.org/pdf/2404.12387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12386v1","updated":"2024-04-18T17:59:46Z","published":"2024-04-18T17:59:46Z","title":"SOHES: Self-supervised Open-world Hierarchical Entity Segmentation","summary":" Open-world entity segmentation, as an emerging computer vision task, aims at\nsegmenting entities in images without being restricted by pre-defined classes,\noffering impressive generalization capabilities on unseen images and concepts.\nDespite its promise, existing entity segmentation methods like Segment Anything\nModel (SAM) rely heavily on costly expert annotators. This work presents\nSelf-supervised Open-world Hierarchical Entity Segmentation (SOHES), a novel\napproach that eliminates the need for human annotations. SOHES operates in\nthree phases: self-exploration, self-instruction, and self-correction. Given a\npre-trained self-supervised representation, we produce abundant high-quality\npseudo-labels through visual feature clustering. Then, we train a segmentation\nmodel on the pseudo-labels, and rectify the noises in pseudo-labels via a\nteacher-student mutual-learning procedure. Beyond segmenting entities, SOHES\nalso captures their constituent parts, providing a hierarchical understanding\nof visual entities. Using raw images as the sole training data, our method\nachieves unprecedented performance in self-supervised open-world segmentation,\nmarking a significant milestone towards high-quality open-world entity\nsegmentation in the absence of human-annotated masks. Project page:\nhttps://SOHES.github.io.\n","authors":["Shengcao Cao","Jiuxiang Gu","Jason Kuen","Hao Tan","Ruiyi Zhang","Handong Zhao","Ani Nenkova","Liang-Yan Gui","Tong Sun","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12386v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2404.12385v1","updated":"2024-04-18T17:59:41Z","published":"2024-04-18T17:59:41Z","title":"MeshLRM: Large Reconstruction Model for High-Quality Mesh","summary":" We propose MeshLRM, a novel LRM-based approach that can reconstruct a\nhigh-quality mesh from merely four input images in less than one second.\nDifferent from previous large reconstruction models (LRMs) that focus on\nNeRF-based reconstruction, MeshLRM incorporates differentiable mesh extraction\nand rendering within the LRM framework. 
This allows for end-to-end mesh\nreconstruction by fine-tuning a pre-trained NeRF LRM with mesh rendering.\nMoreover, we improve the LRM architecture by simplifying several complex\ndesigns in previous LRMs. MeshLRM's NeRF initialization is sequentially trained\nwith low- and high-resolution images; this new LRM training strategy enables\nsignificantly faster convergence and thereby leads to better quality with less\ncompute. Our approach achieves state-of-the-art mesh reconstruction from\nsparse-view inputs and also allows for many downstream applications, including\ntext-to-3D and single-image-to-3D generation. Project page:\nhttps://sarahweiii.github.io/meshlrm/\n","authors":["Xinyue Wei","Kai Zhang","Sai Bi","Hao Tan","Fujun Luan","Valentin Deschaintre","Kalyan Sunkavalli","Hao Su","Zexiang Xu"],"pdf_url":"https://arxiv.org/pdf/2404.12385v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12383v1","updated":"2024-04-18T17:59:28Z","published":"2024-04-18T17:59:28Z","title":"G-HOP: Generative Hand-Object Prior for Interaction Reconstruction and\n Grasp Synthesis","summary":" We propose G-HOP, a denoising diffusion based generative prior for\nhand-object interactions that allows modeling both the 3D object and a human\nhand, conditioned on the object category. To learn a 3D spatial diffusion model\nthat can capture this joint distribution, we represent the human hand via a\nskeletal distance field to obtain a representation aligned with the (latent)\nsigned distance field for the object. We show that this hand-object prior can\nthen serve as generic guidance to facilitate other tasks like reconstruction\nfrom interaction clip and human grasp synthesis. We believe that our model,\ntrained by aggregating seven diverse real-world interaction datasets spanning\nacross 155 categories, represents a first approach that allows jointly\ngenerating both hand and object. Our empirical evaluations demonstrate the\nbenefit of this joint prior in video-based reconstruction and human grasp\nsynthesis, outperforming current task-specific baselines.\n Project website: https://judyye.github.io/ghop-www\n","authors":["Yufei Ye","Abhinav Gupta","Kris Kitani","Shubham Tulsiani"],"pdf_url":"https://arxiv.org/pdf/2404.12383v1.pdf","comment":"accepted to CVPR2024; project page at\n https://judyye.github.io/ghop-www"},{"id":"http://arxiv.org/abs/2404.12382v1","updated":"2024-04-18T17:59:27Z","published":"2024-04-18T17:59:27Z","title":"Lazy Diffusion Transformer for Interactive Image Editing","summary":" We introduce a novel diffusion transformer, LazyDiffusion, that generates\npartial image updates efficiently. Our approach targets interactive image\nediting applications in which, starting from a blank canvas or an image, a user\nspecifies a sequence of localized image modifications using binary masks and\ntext prompts. Our generator operates in two phases. First, a context encoder\nprocesses the current canvas and user mask to produce a compact global context\ntailored to the region to generate. Second, conditioned on this context, a\ndiffusion-based transformer decoder synthesizes the masked pixels in a \"lazy\"\nfashion, i.e., it only generates the masked region. This contrasts with\nprevious works that either regenerate the full canvas, wasting time and\ncomputation, or confine processing to a tight rectangular crop around the mask,\nignoring the global image context altogether. Our decoder's runtime scales with\nthe mask size, which is typically small, while our encoder introduces\nnegligible overhead. 
We demonstrate that our approach is competitive with\nstate-of-the-art inpainting methods in terms of quality and fidelity while\nproviding a 10x speedup for typical user interactions, where the editing mask\nrepresents 10% of the image.\n","authors":["Yotam Nitzan","Zongze Wu","Richard Zhang","Eli Shechtman","Daniel Cohen-Or","Taesung Park","Michaël Gharbi"],"pdf_url":"https://arxiv.org/pdf/2404.12382v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12378v1","updated":"2024-04-18T17:58:16Z","published":"2024-04-18T17:58:16Z","title":"6Img-to-3D: Few-Image Large-Scale Outdoor Driving Scene Reconstruction","summary":" Current 3D reconstruction techniques struggle to infer unbounded scenes from\na few images faithfully. Specifically, existing methods have high computational\ndemands, require detailed pose information, and cannot reconstruct occluded\nregions reliably. We introduce 6Img-to-3D, an efficient, scalable\ntransformer-based encoder-renderer method for single-shot image to 3D\nreconstruction. Our method outputs a 3D-consistent parameterized triplane from\nonly six outward-facing input images for large-scale, unbounded outdoor driving\nscenarios. We take a step towards resolving existing shortcomings by combining\ncontracted custom cross- and self-attention mechanisms for triplane\nparameterization, differentiable volume rendering, scene contraction, and image\nfeature projection. We showcase that six surround-view vehicle images from a\nsingle timestamp without global pose information are enough to reconstruct\n360$^{\\circ}$ scenes during inference time, taking 395 ms. Our method allows,\nfor example, rendering third-person images and birds-eye views. Our code is\navailable at https://github.com/continental/6Img-to-3D, and more examples can\nbe found at our website here https://6Img-to-3D.GitHub.io/.\n","authors":["Théo Gieruc","Marius Kästingschäfer","Sebastian Bernhard","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2404.12378v1.pdf","comment":"Joint first authorship. Project page: https://6Img-to-3D.GitHub.io/\n Code https://github.com/continental/6Img-to-3D"},{"id":"http://arxiv.org/abs/2404.12379v1","updated":"2024-04-18T17:58:16Z","published":"2024-04-18T17:58:16Z","title":"Dynamic Gaussians Mesh: Consistent Mesh Reconstruction from Monocular\n Videos","summary":" Modern 3D engines and graphics pipelines require mesh as a memory-efficient\nrepresentation, which allows efficient rendering, geometry processing, texture\nediting, and many other downstream operations. However, it is still highly\ndifficult to obtain high-quality mesh in terms of structure and detail from\nmonocular visual observations. The problem becomes even more challenging for\ndynamic scenes and objects. To this end, we introduce Dynamic Gaussians Mesh\n(DG-Mesh), a framework to reconstruct a high-fidelity and time-consistent mesh\ngiven a single monocular video. Our work leverages the recent advancement in 3D\nGaussian Splatting to construct the mesh sequence with temporal consistency\nfrom a video. Building on top of this representation, DG-Mesh recovers\nhigh-quality meshes from the Gaussian points and can track the mesh vertices\nover time, which enables applications such as texture editing on dynamic\nobjects. We introduce the Gaussian-Mesh Anchoring, which encourages evenly\ndistributed Gaussians, resulting better mesh reconstruction through mesh-guided\ndensification and pruning on the deformed Gaussians. 
By applying\ncycle-consistent deformation between the canonical and the deformed space, we\ncan project the anchored Gaussian back to the canonical space and optimize\nGaussians across all time frames. During the evaluation on different datasets,\nDG-Mesh provides significantly better mesh reconstruction and rendering than\nbaselines.\n","authors":["Isabella Liu","Hao Su","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12379v1.pdf","comment":"Project page: https://www.liuisabella.com/DG-Mesh/"},{"id":"http://arxiv.org/abs/2404.12372v1","updated":"2024-04-18T17:53:19Z","published":"2024-04-18T17:53:19Z","title":"MedThink: Explaining Medical Visual Question Answering via Multimodal\n Decision-Making Rationale","summary":" Medical Visual Question Answering (MedVQA), which offers language responses\nto image-based medical inquiries, represents a challenging task and significant\nadvancement in healthcare. It assists medical experts to swiftly interpret\nmedical images, thereby enabling faster and more accurate diagnoses. However,\nthe model interpretability and transparency of existing MedVQA solutions are\noften limited, posing challenges in understanding their decision-making\nprocesses. To address this issue, we devise a semi-automated annotation process\nto streamlining data preparation and build new benchmark MedVQA datasets R-RAD\nand R-SLAKE. The R-RAD and R-SLAKE datasets provide intermediate medical\ndecision-making rationales generated by multimodal large language models and\nhuman annotations for question-answering pairs in existing MedVQA datasets,\ni.e., VQA-RAD and SLAKE. Moreover, we design a novel framework which finetunes\nlightweight pretrained generative models by incorporating medical\ndecision-making rationales into the training process. The framework includes\nthree distinct strategies to generate decision outcomes and corresponding\nrationales, thereby clearly showcasing the medical decision-making process\nduring reasoning. Extensive experiments demonstrate that our method can achieve\nan accuracy of 83.5% on R-RAD and 86.3% on R-SLAKE, significantly outperforming\nexisting state-of-the-art baselines. Dataset and code will be released.\n","authors":["Xiaotang Gai","Chenyi Zhou","Jiaxiang Liu","Yang Feng","Jian Wu","Zuozhu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.12372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12368v1","updated":"2024-04-18T17:50:23Z","published":"2024-04-18T17:50:23Z","title":"Gradient-Regularized Out-of-Distribution Detection","summary":" One of the challenges for neural networks in real-life applications is the\noverconfident errors these models make when the data is not from the original\ntraining distribution.\n Addressing this issue is known as Out-of-Distribution (OOD) detection.\n Many state-of-the-art OOD methods employ an auxiliary dataset as a surrogate\nfor OOD data during training to achieve improved performance.\n However, these methods fail to fully exploit the local information embedded\nin the auxiliary dataset.\n In this work, we propose the idea of leveraging the information embedded in\nthe gradient of the loss function during training to enable the network to not\nonly learn a desired OOD score for each sample but also to exhibit similar\nbehavior in a local neighborhood around each sample.\n We also develop a novel energy-based sampling method to allow the network to\nbe exposed to more informative OOD samples during the training phase. This is\nespecially important when the auxiliary dataset is large. 
We demonstrate the\neffectiveness of our method through extensive experiments on several OOD\nbenchmarks, improving the existing state-of-the-art FPR95 by 4% on our ImageNet\nexperiment.\n We further provide a theoretical analysis through the lens of certified\nrobustness and Lipschitz analysis to showcase the theoretical foundation of our\nwork. We will publicly release our code after the review process.\n","authors":["Sina Sharifi","Taha Entesari","Bardia Safaei","Vishal M. Patel","Mahyar Fazlyab"],"pdf_url":"https://arxiv.org/pdf/2404.12368v1.pdf","comment":"Under review for the 18th European Conference on Computer Vision\n (ECCV) 2024"},{"id":"http://arxiv.org/abs/2404.12359v1","updated":"2024-04-18T17:37:53Z","published":"2024-04-18T17:37:53Z","title":"Inverse Neural Rendering for Explainable Multi-Object Tracking","summary":" Today, most methods for image understanding tasks rely on feed-forward neural\nnetworks. While this approach has allowed for empirical accuracy, efficiency,\nand task adaptation via fine-tuning, it also comes with fundamental\ndisadvantages. Existing networks often struggle to generalize across different\ndatasets, even on the same task. By design, these networks ultimately reason\nabout high-dimensional scene features, which are challenging to analyze. This\nis true especially when attempting to predict 3D information based on 2D\nimages. We propose to recast 3D multi-object tracking from RGB cameras as an\n\\emph{Inverse Rendering (IR)} problem, by optimizing via a differentiable\nrendering pipeline over the latent space of pre-trained 3D object\nrepresentations and retrieve the latents that best represent object instances\nin a given input image. To this end, we optimize an image loss over generative\nlatent spaces that inherently disentangle shape and appearance properties. We\ninvestigate not only an alternate take on tracking but our method also enables\nexamining the generated objects, reasoning about failure situations, and\nresolving ambiguous cases. We validate the generalization and scaling\ncapabilities of our method by learning the generative prior exclusively from\nsynthetic data and assessing camera-based 3D tracking on the nuScenes and Waymo\ndatasets. Both these datasets are completely unseen to our method and do not\nrequire fine-tuning. Videos and code are available at\nhttps://light.princeton.edu/inverse-rendering-tracking/.\n","authors":["Julian Ost","Tanushree Banerjee","Mario Bijelic","Felix Heide"],"pdf_url":"https://arxiv.org/pdf/2404.12359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12353v1","updated":"2024-04-18T17:32:46Z","published":"2024-04-18T17:32:46Z","title":"V2Xum-LLM: Cross-Modal Video Summarization with Temporal Prompt\n Instruction Tuning","summary":" Video summarization aims to create short, accurate, and cohesive summaries of\nlonger videos. Despite the existence of various video summarization datasets, a\nnotable limitation is their limited amount of source videos, which hampers the\neffective fine-tuning of advanced large vision-language models (VLMs).\nAdditionally, most existing datasets are created for video-to-video\nsummarization, overlooking the contemporary need for multimodal video content\nsummarization. Recent efforts have been made to expand from unimodal to\nmultimodal video summarization, categorizing the task into three sub-tasks\nbased on the summary's modality: video-to-video (V2V), video-to-text (V2T), and\na combination of video and text summarization (V2VT). 
However, the textual\nsummaries in previous multimodal datasets are inadequate. To address these\nissues, we introduce Instruct-V2Xum, a cross-modal video summarization dataset\nfeaturing 30,000 diverse videos sourced from YouTube, with lengths ranging from\n40 to 940 seconds and an average summarization ratio of 16.39\\%. Each video\nsummary in Instruct-V2Xum is paired with a textual summary that references\nspecific frame indexes, facilitating the generation of aligned video and\ntextual summaries. In addition, we propose a new video summarization framework\nnamed V2Xum-LLM. V2Xum-LLM, specifically V2Xum-LLaMA in this study, is the\nfirst framework that unifies different video summarization tasks into one large\nlanguage model's (LLM) text decoder and achieves task-controllable video\nsummarization with temporal prompts and task instructions. Experiments show\nthat V2Xum-LLaMA outperforms strong baseline models on multiple video\nsummarization tasks. Furthermore, we propose an enhanced evaluation metric for\nV2V and V2VT summarization tasks.\n","authors":["Hang Hua","Yunlong Tang","Chenliang Xu","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2404.12353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12352v1","updated":"2024-04-18T17:32:32Z","published":"2024-04-18T17:32:32Z","title":"Point-In-Context: Understanding Point Cloud via In-Context Learning","summary":" With the emergence of large-scale models trained on diverse datasets,\nin-context learning has emerged as a promising paradigm for multitasking,\nnotably in natural language processing and image processing. However, its\napplication in 3D point cloud tasks remains largely unexplored. In this work,\nwe introduce Point-In-Context (PIC), a novel framework for 3D point cloud\nunderstanding via in-context learning. We address the technical challenge of\neffectively extending masked point modeling to 3D point clouds by introducing a\nJoint Sampling module and proposing a vanilla version of PIC called\nPoint-In-Context-Generalist (PIC-G). PIC-G is designed as a generalist model\nfor various 3D point cloud tasks, with inputs and outputs modeled as\ncoordinates. In this paradigm, the challenging segmentation task is achieved by\nassigning label points with XYZ coordinates for each category; the final\nprediction is then chosen based on the label point closest to the predictions.\nTo break the limitation by the fixed label-coordinate assignment, which has\npoor generalization upon novel classes, we propose two novel training\nstrategies, In-Context Labeling and In-Context Enhancing, forming an extended\nversion of PIC named Point-In-Context-Segmenter (PIC-S), targeting improving\ndynamic context labeling and model training. By utilizing dynamic in-context\nlabels and extra in-context pairs, PIC-S achieves enhanced performance and\ngeneralization capability in and across part segmentation datasets. PIC is a\ngeneral framework so that other tasks or datasets can be seamlessly introduced\ninto our PIC through a unified data format. We conduct extensive experiments to\nvalidate the versatility and adaptability of our proposed methods in handling a\nwide range of tasks and segmenting multi-datasets. Our PIC-S is capable of\ngeneralizing unseen datasets and performing novel part segmentation by\ncustomizing prompts.\n","authors":["Mengyuan Liu","Zhongbin Fang","Xia Li","Joachim M. 
Buhmann","Xiangtai Li","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2404.12352v1.pdf","comment":"Project page: https://fanglaosi.github.io/Point-In-Context_Pages.\n arXiv admin note: text overlap with arXiv:2306.08659"},{"id":"http://arxiv.org/abs/2404.08995v2","updated":"2024-04-18T17:26:30Z","published":"2024-04-13T12:41:40Z","title":"Beyond Known Clusters: Probe New Prototypes for Efficient Generalized\n Class Discovery","summary":" Generalized Class Discovery (GCD) aims to dynamically assign labels to\nunlabelled data partially based on knowledge learned from labelled data, where\nthe unlabelled data may come from known or novel classes. The prevailing\napproach generally involves clustering across all data and learning conceptions\nby prototypical contrastive learning. However, existing methods largely hinge\non the performance of clustering algorithms and are thus subject to their\ninherent limitations. Firstly, the estimated cluster number is often smaller\nthan the ground truth, making the existing methods suffer from the lack of\nprototypes for comprehensive conception learning. To address this issue, we\npropose an adaptive probing mechanism that introduces learnable potential\nprototypes to expand cluster prototypes (centers). As there is no ground truth\nfor the potential prototype, we develop a self-supervised prototype learning\nframework to optimize the potential prototype in an end-to-end fashion.\nSecondly, clustering is computationally intensive, and the conventional\nstrategy of clustering both labelled and unlabelled instances exacerbates this\nissue. To counteract this inefficiency, we opt to cluster only the unlabelled\ninstances and subsequently expand the cluster prototypes with our introduced\npotential prototypes to fast explore novel classes. Despite the simplicity of\nour proposed method, extensive empirical analysis on a wide range of datasets\nconfirms that our method consistently delivers state-of-the-art results.\nSpecifically, our method surpasses the nearest competitor by a significant\nmargin of \\textbf{9.7}$\\%$ within the Stanford Cars dataset and\n\\textbf{12$\\times$} clustering efficiency within the Herbarium 19 dataset. We\nwill make the code and checkpoints publicly available at\n\\url{https://github.com/xjtuYW/PNP.git}.\n","authors":["Ye Wang","Yaxiong Wang","Yujiao Wu","Bingchen Zhao","Xueming Qian"],"pdf_url":"https://arxiv.org/pdf/2404.08995v2.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.12347v1","updated":"2024-04-18T17:24:28Z","published":"2024-04-18T17:24:28Z","title":"AniClipart: Clipart Animation with Text-to-Video Priors","summary":" Clipart, a pre-made graphic art form, offers a convenient and efficient way\nof illustrating visual content. Traditional workflows to convert static clipart\nimages into motion sequences are laborious and time-consuming, involving\nnumerous intricate steps like rigging, key animation and in-betweening. Recent\nadvancements in text-to-video generation hold great potential in resolving this\nproblem. Nevertheless, direct application of text-to-video generation models\noften struggles to retain the visual identity of clipart images or generate\ncartoon-style motions, resulting in unsatisfactory animation outcomes. In this\npaper, we introduce AniClipart, a system that transforms static clipart images\ninto high-quality motion sequences guided by text-to-video priors. 
To generate\ncartoon-style and smooth motion, we first define B\\'{e}zier curves over\nkeypoints of the clipart image as a form of motion regularization. We then\nalign the motion trajectories of the keypoints with the provided text prompt by\noptimizing the Video Score Distillation Sampling (VSDS) loss, which encodes\nadequate knowledge of natural motion within a pretrained text-to-video\ndiffusion model. With a differentiable As-Rigid-As-Possible shape deformation\nalgorithm, our method can be end-to-end optimized while maintaining deformation\nrigidity. Experimental results show that the proposed AniClipart consistently\noutperforms existing image-to-video generation models, in terms of text-video\nalignment, visual identity preservation, and motion consistency. Furthermore,\nwe showcase the versatility of AniClipart by adapting it to generate a broader\narray of animation formats, such as layered animation, which allows topological\nchanges.\n","authors":["Ronghuan Wu","Wanchao Su","Kede Ma","Jing Liao"],"pdf_url":"https://arxiv.org/pdf/2404.12347v1.pdf","comment":"Project Page: https://aniclipart.github.io/"},{"id":"http://arxiv.org/abs/2404.12341v1","updated":"2024-04-18T17:10:18Z","published":"2024-04-18T17:10:18Z","title":"Measuring Feature Dependency of Neural Networks by Collapsing Feature\n Dimensions in the Data Manifold","summary":" This paper introduces a new technique to measure the feature dependency of\nneural network models. The motivation is to better understand a model by\nquerying whether it is using information from human-understandable features,\ne.g., anatomical shape, volume, or image texture. Our method is based on the\nprinciple that if a model is dependent on a feature, then removal of that\nfeature should significantly harm its performance. A targeted feature is\n\"removed\" by collapsing the dimension in the data distribution that corresponds\nto that feature. We perform this by moving data points along the feature\ndimension to a baseline feature value while staying on the data manifold, as\nestimated by a deep generative model. Then we observe how the model's\nperformance changes on the modified test data set, with the target feature\ndimension removed. We test our method on deep neural network models trained on\nsynthetic image data with known ground truth, an Alzheimer's disease prediction\ntask using MRI and hippocampus segmentations from the OASIS-3 dataset, and a\ncell nuclei classification task using the Lizard dataset.\n","authors":["Yinzhu Jin","Matthew B. Dwyer","P. Thomas Fletcher"],"pdf_url":"https://arxiv.org/pdf/2404.12341v1.pdf","comment":"Accepted and will be pulished in International Symposium on\n Biomedical Imaging (ISBI) 2024"},{"id":"http://arxiv.org/abs/2404.12339v1","updated":"2024-04-18T17:09:10Z","published":"2024-04-18T17:09:10Z","title":"SPOT: Point Cloud Based Stereo Visual Place Recognition for Similar and\n Opposing Viewpoints","summary":" Recognizing places from an opposing viewpoint during a return trip is a\ncommon experience for human drivers. However, the analogous robotics\ncapability, visual place recognition (VPR) with limited field of view cameras\nunder 180 degree rotations, has proven to be challenging to achieve. To address\nthis problem, this paper presents Same Place Opposing Trajectory (SPOT), a\ntechnique for opposing viewpoint VPR that relies exclusively on structure\nestimated through stereo visual odometry (VO). 
The method extends recent\nadvances in lidar descriptors and utilizes a novel double (similar and\nopposing) distance matrix sequence matching method. We evaluate SPOT on a\npublicly available dataset with 6.7-7.6 km routes driven in similar and\nopposing directions under various lighting conditions. The proposed algorithm\ndemonstrates remarkable improvement over the state-of-the-art, achieving up to\n91.7% recall at 100% precision in opposing viewpoint cases, while requiring\nless storage than all baselines tested and running faster than all but one.\nMoreover, the proposed method assumes no a priori knowledge of whether the\nviewpoint is similar or opposing, and also demonstrates competitive performance\nin similar viewpoint cases.\n","authors":["Spencer Carmichael","Rahul Agrawal","Ram Vasudevan","Katherine A. Skinner"],"pdf_url":"https://arxiv.org/pdf/2404.12339v1.pdf","comment":"Accepted to ICRA 2024, project website:\n https://umautobots.github.io/spot"},{"id":"http://arxiv.org/abs/2309.16208v2","updated":"2024-04-18T17:08:53Z","published":"2023-09-28T07:17:44Z","title":"Low-rank tensor completion via tensor joint rank with logarithmic\n composite norm","summary":" Low-rank tensor completion (LRTC) aims to recover a complete low-rank tensor\nfrom incomplete observed tensor, attracting extensive attention in various\npractical applications such as image processing and computer vision. However,\ncurrent methods often perform well only when there is a sufficient of observed\ninformation, and they perform poorly or may fail when the observed information\nis less than 5\\%. In order to improve the utilization of observed information,\na new method called the tensor joint rank with logarithmic composite norm\n(TJLC) method is proposed. This method simultaneously exploits two types of\ntensor low-rank structures, namely tensor Tucker rank and tubal rank, thereby\nenhancing the inherent correlations between known and missing elements. To\naddress the challenge of applying two tensor ranks with significantly different\ndirectly to LRTC, a new tensor Logarithmic composite norm is further proposed.\nSubsequently, the TJLC model and algorithm for the LRTC problem are proposed.\nAdditionally, theoretical convergence guarantees for the TJLC method are\nprovided. Experiments on various real datasets demonstrate that the proposed\nmethod outperforms state-of-the-art methods significantly. Particularly, the\nproposed method achieves satisfactory recovery even when the observed\ninformation is as low as 1\\%, and the recovery performance improves\nsignificantly as the observed information increases.\n","authors":["Hongbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.16208v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12333v1","updated":"2024-04-18T16:59:51Z","published":"2024-04-18T16:59:51Z","title":"Customizing Text-to-Image Diffusion with Camera Viewpoint Control","summary":" Model customization introduces new concepts to existing text-to-image models,\nenabling the generation of the new concept in novel contexts. However, such\nmethods lack accurate camera view control w.r.t the object, and users must\nresort to prompt engineering (e.g., adding \"top-view\") to achieve coarse view\ncontrol. In this work, we introduce a new task -- enabling explicit control of\ncamera viewpoint for model customization. This allows us to modify object\nproperties amongst various background scenes via text prompts, all while\nincorporating the target camera pose as additional control. 
This new task\npresents significant challenges in merging a 3D representation from the\nmulti-view images of the new concept with a general, 2D text-to-image model. To\nbridge this gap, we propose to condition the 2D diffusion process on rendered,\nview-dependent features of the new object. During training, we jointly adapt\nthe 2D diffusion modules and 3D feature predictions to reconstruct the object's\nappearance and geometry while reducing overfitting to the input multi-view\nimages. Our method outperforms existing image editing and model personalization\nbaselines in preserving the custom object's identity while following the input\ntext prompt and the object's camera pose.\n","authors":["Nupur Kumari","Grace Su","Richard Zhang","Taesung Park","Eli Shechtman","Jun-Yan Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.12333v1.pdf","comment":"project page: https://customdiffusion360.github.io"},{"id":"http://arxiv.org/abs/2404.12330v1","updated":"2024-04-18T16:58:05Z","published":"2024-04-18T16:58:05Z","title":"A Perspective on Deep Vision Performance with Standard Image and Video\n Codecs","summary":" Resource-constrained hardware, such as edge devices or cell phones, often\nrely on cloud servers to provide the required computational resources for\ninference in deep vision models. However, transferring image and video data\nfrom an edge or mobile device to a cloud server requires coding to deal with\nnetwork constraints. The use of standardized codecs, such as JPEG or H.264, is\nprevalent and required to ensure interoperability. This paper aims to examine\nthe implications of employing standardized codecs within deep vision pipelines.\nWe find that using JPEG and H.264 coding significantly deteriorates the\naccuracy across a broad range of vision tasks and models. For instance, strong\ncompression rates reduce semantic segmentation accuracy by more than 80% in\nmIoU. In contrast to previous findings, our analysis extends beyond image and\naction classification to localization and dense prediction tasks, thus\nproviding a more comprehensive perspective.\n","authors":["Christoph Reich","Oliver Hahn","Daniel Cremers","Stefan Roth","Biplob Debnath"],"pdf_url":"https://arxiv.org/pdf/2404.12330v1.pdf","comment":"Accepted at CVPR 2024 Workshop on AI for Streaming (AIS)"},{"id":"http://arxiv.org/abs/2404.12322v1","updated":"2024-04-18T16:53:08Z","published":"2024-04-18T16:53:08Z","title":"Generalizable Face Landmarking Guided by Conditional Face Warping","summary":" As a significant step for human face modeling, editing, and generation, face\nlandmarking aims at extracting facial keypoints from images. A generalizable\nface landmarker is required in practice because real-world facial images, e.g.,\nthe avatars in animations and games, are often stylized in various ways.\nHowever, achieving generalizable face landmarking is challenging due to the\ndiversity of facial styles and the scarcity of labeled stylized faces. In this\nstudy, we propose a simple but effective paradigm to learn a generalizable face\nlandmarker based on labeled real human faces and unlabeled stylized faces. Our\nmethod learns the face landmarker as the key module of a conditional face\nwarper. Given a pair of real and stylized facial images, the conditional face\nwarper predicts a warping field from the real face to the stylized one, in\nwhich the face landmarker predicts the ending points of the warping field and\nprovides us with high-quality pseudo landmarks for the corresponding stylized\nfacial images. 
Applying an alternating optimization strategy, we learn the face\nlandmarker to minimize $i)$ the discrepancy between the stylized faces and the\nwarped real ones and $ii)$ the prediction errors of both real and pseudo\nlandmarks. Experiments on various datasets show that our method outperforms\nexisting state-of-the-art domain adaptation methods in face landmarking tasks,\nleading to a face landmarker with better generalizability. Code is available at\nhttps://plustwo0.github.io/project-face-landmarker}{https://plustwo0.github.io/project-face-landmarker.\n","authors":["Jiayi Liang","Haotian Liu","Hongteng Xu","Dixin Luo"],"pdf_url":"https://arxiv.org/pdf/2404.12322v1.pdf","comment":"Accepted in CVPR 2024"},{"id":"http://arxiv.org/abs/2404.12309v1","updated":"2024-04-18T16:38:02Z","published":"2024-04-18T16:38:02Z","title":"iRAG: An Incremental Retrieval Augmented Generation System for Videos","summary":" Retrieval augmented generation (RAG) systems combine the strengths of\nlanguage generation and information retrieval to power many real-world\napplications like chatbots. Use of RAG for combined understanding of multimodal\ndata such as text, images and videos is appealing but two critical limitations\nexist: one-time, upfront capture of all content in large multimodal data as\ntext descriptions entails high processing times, and not all information in the\nrich multimodal data is typically in the text descriptions. Since the user\nqueries are not known apriori, developing a system for multimodal to text\nconversion and interactive querying of multimodal data is challenging.\n To address these limitations, we propose iRAG, which augments RAG with a\nnovel incremental workflow to enable interactive querying of large corpus of\nmultimodal data. Unlike traditional RAG, iRAG quickly indexes large\nrepositories of multimodal data, and in the incremental workflow, it uses the\nindex to opportunistically extract more details from select portions of the\nmultimodal data to retrieve context relevant to an interactive user query. Such\nan incremental workflow avoids long multimodal to text conversion times,\novercomes information loss issues by doing on-demand query-specific extraction\nof details in multimodal data, and ensures high quality of responses to\ninteractive user queries that are often not known apriori. To the best of our\nknowledge, iRAG is the first system to augment RAG with an incremental workflow\nto support efficient interactive querying of large, real-world multimodal data.\nExperimental results on real-world long videos demonstrate 23x to 25x faster\nvideo to text ingestion, while ensuring that quality of responses to\ninteractive user queries is comparable to responses from a traditional RAG\nwhere all video data is converted to text upfront before any querying.\n","authors":["Md Adnan Arefeen","Biplob Debnath","Md Yusuf Sarwar Uddin","Srimat Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2404.12309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12295v1","updated":"2024-04-18T16:18:41Z","published":"2024-04-18T16:18:41Z","title":"When Medical Imaging Met Self-Attention: A Love Story That Didn't Quite\n Work Out","summary":" A substantial body of research has focused on developing systems that assist\nmedical professionals during labor-intensive early screening processes, many\nbased on convolutional deep-learning architectures. Recently, multiple studies\nexplored the application of so-called self-attention mechanisms in the vision\ndomain. 
These studies often report empirical improvements over fully\nconvolutional approaches on various datasets and tasks. To evaluate this trend\nfor medical imaging, we extend two widely adopted convolutional architectures\nwith different self-attention variants on two different medical datasets. With\nthis, we aim to specifically evaluate the possible advantages of additional\nself-attention. We compare our models with similarly sized convolutional and\nattention-based baselines and evaluate performance gains statistically.\nAdditionally, we investigate how including such layers changes the features\nlearned by these models during the training. Following a hyperparameter search,\nand contrary to our expectations, we observe no significant improvement in\nbalanced accuracy over fully convolutional models. We also find that important\nfeatures, such as dermoscopic structures in skin lesion images, are still not\nlearned by employing self-attention. Finally, analyzing local explanations, we\nconfirm biased feature usage. We conclude that merely incorporating attention\nis insufficient to surpass the performance of existing fully convolutional\nmethods.\n","authors":["Tristan Piater","Niklas Penzel","Gideon Stein","Joachim Denzler"],"pdf_url":"https://arxiv.org/pdf/2404.12295v1.pdf","comment":"10 pages, 2 figures, 5 tables, presented at VISAPP 2024"},{"id":"http://arxiv.org/abs/2404.12292v1","updated":"2024-04-18T16:12:38Z","published":"2024-04-18T16:12:38Z","title":"Reducing Bias in Pre-trained Models by Tuning while Penalizing Change","summary":" Deep models trained on large amounts of data often incorporate implicit\nbiases present during training time. If later such a bias is discovered during\ninference or deployment, it is often necessary to acquire new data and retrain\nthe model. This behavior is especially problematic in critical areas such as\nautonomous driving or medical decision-making. In these scenarios, new data is\noften expensive and hard to come by. In this work, we present a method based on\nchange penalization that takes a pre-trained model and adapts the weights to\nmitigate a previously detected bias. We achieve this by tuning a\nzero-initialized copy of a frozen pre-trained network. Our method needs very\nfew, in extreme cases only a single, examples that contradict the bias to\nincrease performance. Additionally, we propose an early stopping criterion to\nmodify baselines and reduce overfitting. We evaluate our approach on a\nwell-known bias in skin lesion classification and three other datasets from the\ndomain shift literature. We find that our approach works especially well with\nvery few images. Simple fine-tuning combined with our early stopping also leads\nto performance benefits for a larger number of tuning samples.\n","authors":["Niklas Penzel","Gideon Stein","Joachim Denzler"],"pdf_url":"https://arxiv.org/pdf/2404.12292v1.pdf","comment":"12 pages, 12 figures, presented at VISAPP 2024"},{"id":"http://arxiv.org/abs/2404.12285v1","updated":"2024-04-18T16:04:14Z","published":"2024-04-18T16:04:14Z","title":"Performance Evaluation of Segment Anything Model with Variational\n Prompting for Application to Non-Visible Spectrum Imagery","summary":" The Segment Anything Model (SAM) is a deep neural network foundational model\ndesigned to perform instance segmentation which has gained significant\npopularity given its zero-shot segmentation ability. 
SAM operates by generating\nmasks based on various input prompts such as text, bounding boxes, points, or\nmasks, introducing a novel methodology to overcome the constraints posed by\ndataset-specific scarcity. While SAM is trained on an extensive dataset,\ncomprising ~11M images, it mostly consists of natural photographic images with\nonly very limited images from other modalities. Whilst the rapid progress in\nvisual infrared surveillance and X-ray security screening imaging technologies,\ndriven forward by advances in deep learning, has significantly enhanced the\nability to detect, classify and segment objects with high accuracy, it is not\nevident if the SAM zero-shot capabilities can be transferred to such\nmodalities. This work assesses SAM capabilities in segmenting objects of\ninterest in the X-ray/infrared modalities. Our approach reuses the pre-trained\nSAM with three different prompts: bounding box, centroid and random points. We\npresent quantitative/qualitative results to showcase the performance on\nselected datasets. Our results show that SAM can segment objects in the X-ray\nmodality when given a box prompt, but its performance varies for point prompts.\nSpecifically, SAM performs poorly in segmenting slender objects and organic\nmaterials, such as plastic bottles. We find that infrared objects are also\nchallenging to segment with point prompts given the low-contrast nature of this\nmodality. This study shows that while SAM demonstrates outstanding zero-shot\ncapabilities with box prompts, its performance ranges from moderate to poor for\npoint prompts, indicating that special consideration on the cross-modal\ngeneralisation of SAM is needed when considering use on X-ray/infrared imagery.\n","authors":["Yona Falinie A. Gaus","Neelanjan Bhowmik","Brian K. S. Isaac-Medina","Toby P. Breckon"],"pdf_url":"https://arxiv.org/pdf/2404.12285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08273v2","updated":"2024-04-18T15:55:56Z","published":"2024-04-12T06:52:40Z","title":"Struggle with Adversarial Defense? Try Diffusion","summary":" Adversarial attacks induce misclassification by introducing subtle\nperturbations. Recently, diffusion models are applied to the image classifiers\nto improve adversarial robustness through adversarial training or by purifying\nadversarial noise. However, diffusion-based adversarial training often\nencounters convergence challenges and high computational expenses.\nAdditionally, diffusion-based purification inevitably causes data shift and is\ndeemed susceptible to stronger adaptive attacks. To tackle these issues, we\npropose the Truth Maximization Diffusion Classifier (TMDC), a generative\nBayesian classifier that builds upon pre-trained diffusion models and the\nBayesian theorem. Unlike data-driven classifiers, TMDC, guided by Bayesian\nprinciples, utilizes the conditional likelihood from diffusion models to\ndetermine the class probabilities of input images, thereby insulating against\nthe influences of data shift and the limitations of adversarial training.\nMoreover, to enhance TMDC's resilience against more potent adversarial attacks,\nwe propose an optimization strategy for diffusion classifiers. 
This strategy\ninvolves post-training the diffusion model on perturbed datasets with\nground-truth labels as conditions, guiding the diffusion model to learn the\ndata distribution and maximizing the likelihood under the ground-truth labels.\nThe proposed method achieves state-of-the-art performance on the CIFAR10\ndataset against heavy white-box attacks and strong adaptive attacks.\nSpecifically, TMDC achieves robust accuracies of 82.81% against $l_{\\infty}$\nnorm-bounded perturbations and 86.05% against $l_{2}$ norm-bounded\nperturbations, respectively, with $\\epsilon=0.05$.\n","authors":["Yujie Li","Yanbin Wang","Haitao Xu","Bin Liu","Jianguo Sun","Zhenhao Guo","Wenrui Ma"],"pdf_url":"https://arxiv.org/pdf/2404.08273v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.04517v4","updated":"2024-04-18T15:50:37Z","published":"2023-01-11T15:31:15Z","title":"A new dataset for measuring the performance of blood vessel segmentation\n methods under distribution shifts","summary":" Creating a dataset for training supervised machine learning algorithms can be\na demanding task. This is especially true for medical image segmentation since\none or more specialists are usually required for image annotation, and creating\nground truth labels for just a single image can take up to several hours. In\naddition, it is paramount that the annotated samples represent well the\ndifferent conditions that might affect the imaged tissues as well as possible\nchanges in the image acquisition process. This can only be achieved by\nconsidering samples that are typical in the dataset as well as atypical, or\neven outlier, samples. We introduce VessMAP, a heterogeneous blood vessel\nsegmentation dataset acquired by carefully sampling relevant images from a\nlarger non-annotated dataset. A methodology was developed to select both\nprototypical and atypical samples from the base dataset, thus defining an\nassorted set of images that can be used for measuring the performance of\nsegmentation algorithms on samples that are highly distinct from each other. To\ndemonstrate the potential of the new dataset, we show that the validation\nperformance of a neural network changes significantly depending on the splits\nused for training the network.\n","authors":["Matheus Viana da Silva","Natália de Carvalho Santos","Julie Ouellette","Baptiste Lacoste","Cesar Henrique Comin"],"pdf_url":"https://arxiv.org/pdf/2301.04517v4.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2310.08475v5","updated":"2024-04-18T15:46:22Z","published":"2023-10-12T16:32:44Z","title":"Can We Edit Multimodal Large Language Models?","summary":" In this paper, we focus on editing Multimodal Large Language Models (MLLMs).\nCompared to editing single-modal LLMs, multimodal model editing is more\nchallenging, which demands a higher level of scrutiny and careful consideration\nin the editing process. To facilitate research in this area, we construct a new\nbenchmark, dubbed MMEdit, for editing multimodal LLMs and establishing a suite\nof innovative metrics for evaluation. We conduct comprehensive experiments\ninvolving various model editing baselines and analyze the impact of editing\ndifferent components for multimodal LLMs. 
Empirically, we notice that previous\nbaselines can implement editing multimodal LLMs to some extent, but the effect\nis still barely satisfactory, indicating the potential difficulty of this task.\nWe hope that our work can provide the NLP community with insights. Code and\ndataset are available in https://github.com/zjunlp/EasyEdit.\n","authors":["Siyuan Cheng","Bozhong Tian","Qingbin Liu","Xi Chen","Yongheng Wang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08475v5.pdf","comment":"EMNLP 2023. Add the Exact Match/Accuracy results of Reliability and\n T-Generality"},{"id":"http://arxiv.org/abs/2309.16388v2","updated":"2024-04-18T15:32:30Z","published":"2023-09-28T12:36:12Z","title":"Exposing Image Splicing Traces in Scientific Publications via\n Uncertainty-guided Refinement","summary":" Recently, a surge in scientific publications suspected of image manipulation\nhas led to numerous retractions, bringing the issue of image integrity into\nsharp focus. Although research on forensic detectors for image plagiarism and\nimage synthesis exists, the detection of image splicing traces in scientific\npublications remains unexplored. Compared to image duplication and synthesis,\nimage splicing detection is more challenging due to the lack of reference\nimages and the typically small tampered areas. Furthermore, disruptive factors\nin scientific images, such as artifacts from digital compression, abnormal\npatterns, and noise from physical operations, present misleading features like\nsplicing traces, significantly increasing the difficulty of this task.\nMoreover, the scarcity of high-quality datasets of spliced scientific images\nlimits potential advancements. In this work, we propose an Uncertainty-guided\nRefinement Network (URN) to mitigate the impact of these disruptive factors.\nOur URN can explicitly suppress the propagation of unreliable information flow\ncaused by disruptive factors between regions, thus obtaining robust splicing\nfeatures. Additionally, the URN is designed to concentrate improvements in\nuncertain prediction areas during the decoding phase. We also construct a\ndataset for image splicing detection (SciSp) containing 1,290 spliced images.\nCompared to existing datasets, SciSp includes the largest number of spliced\nimages and the most diverse sources. Comprehensive experiments conducted on\nthree benchmark datasets demonstrate the superiority of our approach. We also\nvalidate the URN's generalisability in resisting cross-dataset domain shifts\nand its robustness against various post-processing techniques, including\nadvanced deep-learning-based inpainting.\n","authors":["Xun Lin","Wenzhong Tang","Haoran Wang","Yizhong Liu","Yakun Ju","Shuai Wang","Zitong Yu"],"pdf_url":"https://arxiv.org/pdf/2309.16388v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15584v3","updated":"2024-04-18T15:29:14Z","published":"2024-02-23T19:51:55Z","title":"State Space Models for Event Cameras","summary":" Today, state-of-the-art deep neural networks that process event-camera data\nfirst convert a temporal window of events into dense, grid-like input\nrepresentations. As such, they exhibit poor generalizability when deployed at\nhigher inference frequencies (i.e., smaller temporal windows) than the ones\nthey were trained on. We address this challenge by introducing state-space\nmodels (SSMs) with learnable timescale parameters to event-based vision. This\ndesign adapts to varying frequencies without the need to retrain the network at\ndifferent frequencies. 
Additionally, we investigate two strategies to\ncounteract aliasing effects when deploying the model at higher frequencies. We\ncomprehensively evaluate our approach against existing methods based on RNN and\nTransformer architectures across various benchmarks, including Gen1 and 1 Mpx\nevent camera datasets. Our results demonstrate that SSM-based models train 33%\nfaster and also exhibit minimal performance degradation when tested at higher\nfrequencies than the training input. Traditional RNN and Transformer models\nexhibit performance drops of more than 20 mAP, with SSMs having a drop of 3.76\nmAP, highlighting the effectiveness of SSMs in event-based vision tasks.\n","authors":["Nikola Zubić","Mathias Gehrig","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2402.15584v3.pdf","comment":"18 pages, 5 figures, 6 tables, CVPR 2024 Camera Ready paper"},{"id":"http://arxiv.org/abs/2404.12260v1","updated":"2024-04-18T15:28:34Z","published":"2024-04-18T15:28:34Z","title":"Alleviating Catastrophic Forgetting in Facial Expression Recognition\n with Emotion-Centered Models","summary":" Facial expression recognition is a pivotal component in machine learning,\nfacilitating various applications. However, convolutional neural networks\n(CNNs) are often plagued by catastrophic forgetting, impeding their\nadaptability. The proposed method, emotion-centered generative replay (ECgr),\ntackles this challenge by integrating synthetic images from generative\nadversarial networks. Moreover, ECgr incorporates a quality assurance algorithm\nto ensure the fidelity of generated images. This dual approach enables CNNs to\nretain past knowledge while learning new tasks, enhancing their performance in\nemotion recognition. The experimental results on four diverse facial expression\ndatasets demonstrate that incorporating images generated by our\npseudo-rehearsal method enhances training on the targeted dataset and the\nsource dataset while making the CNN retain previously learned knowledge.\n","authors":["Israel A. Laurensi","Alceu de Souza Britto Jr.","Jean Paul Barddal","Alessandro Lameiras Koerich"],"pdf_url":"https://arxiv.org/pdf/2404.12260v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2404.12258v1","updated":"2024-04-18T15:25:59Z","published":"2024-04-18T15:25:59Z","title":"DeepLocalization: Using change point detection for Temporal Action\n Localization","summary":" In this study, we introduce DeepLocalization, an innovative framework devised\nfor the real-time localization of actions tailored explicitly for monitoring\ndriver behavior. Utilizing the power of advanced deep learning methodologies,\nour objective is to tackle the critical issue of distracted driving-a\nsignificant factor contributing to road accidents. Our strategy employs a dual\napproach: leveraging Graph-Based Change-Point Detection for pinpointing actions\nin time alongside a Video Large Language Model (Video-LLM) for precisely\ncategorizing activities. Through careful prompt engineering, we customize the\nVideo-LLM to adeptly handle driving activities' nuances, ensuring its\nclassification efficacy even with sparse data. Engineered to be lightweight,\nour framework is optimized for consumer-grade GPUs, making it vastly applicable\nin practical scenarios. We subjected our method to rigorous testing on the\nSynDD2 dataset, a complex benchmark for distracted driving behaviors, where it\ndemonstrated commendable performance-achieving 57.5% accuracy in event\nclassification and 51% in event detection. 
These outcomes underscore the\nsubstantial promise of DeepLocalization in accurately identifying diverse\ndriver behaviors and their temporal occurrences, all within the bounds of\nlimited computational resources.\n","authors":["Mohammed Shaiqur Rahman","Ibne Farabi Shihab","Lynna Chu","Anuj Sharma"],"pdf_url":"https://arxiv.org/pdf/2404.12258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12257v1","updated":"2024-04-18T15:23:37Z","published":"2024-04-18T15:23:37Z","title":"Food Portion Estimation via 3D Object Scaling","summary":" Image-based methods to analyze food images have alleviated the user burden\nand biases associated with traditional methods. However, accurate portion\nestimation remains a major challenge due to the loss of 3D information in the\n2D representation of foods captured by smartphone cameras or wearable devices.\nIn this paper, we propose a new framework to estimate both food volume and\nenergy from 2D images by leveraging the power of 3D food models and physical\nreference in the eating scene. Our method estimates the pose of the camera and\nthe food object in the input image and recreates the eating occasion by\nrendering an image of a 3D model of the food with the estimated poses. We also\nintroduce a new dataset, SimpleFood45, which contains 2D images of 45 food\nitems and associated annotations including food volume, weight, and energy. Our\nmethod achieves an average error of 31.10 kCal (17.67%) on this dataset,\noutperforming existing portion estimation methods.\n","authors":["Gautham Vinod","Jiangpeng He","Zeman Shao","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.12257v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12252v1","updated":"2024-04-18T15:20:59Z","published":"2024-04-18T15:20:59Z","title":"Deep Gaussian mixture model for unsupervised image segmentation","summary":" The recent emergence of deep learning has led to a great deal of work on\ndesigning supervised deep semantic segmentation algorithms. As in many tasks\nsufficient pixel-level labels are very difficult to obtain, we propose a method\nwhich combines a Gaussian mixture model (GMM) with unsupervised deep learning\ntechniques. In the standard GMM the pixel values with each sub-region are\nmodelled by a Gaussian distribution. In order to identify the different\nregions, the parameter vector that minimizes the negative log-likelihood (NLL)\nfunction regarding the GMM has to be approximated. For this task, usually\niterative optimization methods such as the expectation-maximization (EM)\nalgorithm are used. In this paper, we propose to estimate these parameters\ndirectly from the image using a convolutional neural network (CNN). We thus\nchange the iterative procedure in the EM algorithm replacing the\nexpectation-step by a gradient-step with regard to the networks parameters.\nThis means that the network is trained to minimize the NLL function of the GMM\nwhich comes with at least two advantages. As once trained, the network is able\nto predict label probabilities very quickly compared with time consuming\niterative optimization methods. Secondly, due to the deep image prior our\nmethod is able to partially overcome one of the main disadvantages of GMM,\nwhich is not taking into account correlation between neighboring pixels, as it\nassumes independence between them. 
We demonstrate the advantages of our method\nin various experiments on the example of myocardial infarct segmentation on\nmulti-sequence MRI images.\n","authors":["Matthias Schwab","Agnes Mayr","Markus Haltmeier"],"pdf_url":"https://arxiv.org/pdf/2404.12252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12251v1","updated":"2024-04-18T15:18:14Z","published":"2024-04-18T15:18:14Z","title":"Dynamic Modality and View Selection for Multimodal Emotion Recognition\n with Missing Modalities","summary":" The study of human emotions, traditionally a cornerstone in fields like\npsychology and neuroscience, has been profoundly impacted by the advent of\nartificial intelligence (AI). Multiple channels, such as speech (voice) and\nfacial expressions (image), are crucial in understanding human emotions.\nHowever, AI's journey in multimodal emotion recognition (MER) is marked by\nsubstantial technical challenges. One significant hurdle is how AI models\nmanage the absence of a particular modality - a frequent occurrence in\nreal-world situations. This study's central focus is assessing the performance\nand resilience of two strategies when confronted with the lack of one modality:\na novel multimodal dynamic modality and view selection and a cross-attention\nmechanism. Results on the RECOLA dataset show that dynamic selection-based\nmethods are a promising approach for MER. In the missing modalities scenarios,\nall dynamic selection-based methods outperformed the baseline. The study\nconcludes by emphasizing the intricate interplay between audio and video\nmodalities in emotion prediction, showcasing the adaptability of dynamic\nselection methods in handling missing modalities.\n","authors":["Luciana Trinkaus Menon","Luiz Carlos Ribeiro Neduziak","Jean Paul Barddal","Alessandro Lameiras Koerich","Alceu de Souza Britto Jr"],"pdf_url":"https://arxiv.org/pdf/2404.12251v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2404.12246v1","updated":"2024-04-18T15:11:02Z","published":"2024-04-18T15:11:02Z","title":"Blind Localization and Clustering of Anomalies in Textures","summary":" Anomaly detection and localization in images is a growing field in computer\nvision. In this area, a seemingly understudied problem is anomaly clustering,\ni.e., identifying and grouping different types of anomalies in a fully\nunsupervised manner. In this work, we propose a novel method for clustering\nanomalies in largely stationary images (textures) in a blind setting. That is,\nthe input consists of normal and anomalous images without distinction and\nwithout labels. What contributes to the difficulty of the task is that\nanomalous regions are often small and may present only subtle changes in\nappearance, which can be easily overshadowed by the genuine variance in the\ntexture. Moreover, each anomaly type may have a complex appearance\ndistribution. We introduce a novel scheme for solving this task using a\ncombination of blind anomaly localization and contrastive learning. By\nidentifying the anomalous regions with high fidelity, we can restrict our focus\nto those regions of interest; then, contrastive learning is employed to\nincrease the separability of different anomaly types and reduce the intra-class\nvariation. Our experiments show that the proposed solution yields significantly\nbetter results compared to prior work, setting a new state of the art. 
Project\npage: https://reality.tf.fau.de/pub/ardelean2024blind.html.\n","authors":["Andrei-Timotei Ardelean","Tim Weyrich"],"pdf_url":"https://arxiv.org/pdf/2404.12246v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11474v4","updated":"2024-04-18T15:10:47Z","published":"2023-05-19T06:55:04Z","title":"Reciprocal Attention Mixing Transformer for Lightweight Image\n Restoration","summary":" Although many recent works have made advancements in the image restoration\n(IR) field, they often suffer from an excessive number of parameters. Another\nissue is that most Transformer-based IR methods focus only on either local or\nglobal features, leading to limited receptive fields or deficient parameter\nissues. To address these problems, we propose a lightweight IR network,\nReciprocal Attention Mixing Transformer (RAMiT). It employs our proposed\ndimensional reciprocal attention mixing Transformer (D-RAMiT) blocks, which\ncompute bi-dimensional (spatial and channel) self-attentions in parallel with\ndifferent numbers of multi-heads. The bi-dimensional attentions help each other\nto complement their counterpart's drawbacks and are then mixed. Additionally,\nwe introduce a hierarchical reciprocal attention mixing (H-RAMi) layer that\ncompensates for pixel-level information losses and utilizes semantic\ninformation while maintaining an efficient hierarchical structure. Furthermore,\nwe revisit and modify MobileNet V1 and V2 to attach efficient convolutions to\nour proposed components. The experimental results demonstrate that RAMiT\nachieves state-of-the-art performance on multiple lightweight IR tasks,\nincluding super-resolution, color denoising, grayscale denoising, low-light\nenhancement, and deraining. Codes are available at\nhttps://github.com/rami0205/RAMiT.\n","authors":["Haram Choi","Cheolwoong Na","Jihyeon Oh","Seungjae Lee","Jinseop Kim","Subeen Choe","Jeongmin Lee","Taehoon Kim","Jihoon Yang"],"pdf_url":"https://arxiv.org/pdf/2305.11474v4.pdf","comment":"CVPR 2024 Workshop - NTIRE. Codes are available at\n https://github.com/rami0205/RAMiT"},{"id":"http://arxiv.org/abs/2404.09683v2","updated":"2024-04-18T14:51:55Z","published":"2024-04-15T11:36:31Z","title":"Post-Training Network Compression for 3D Medical Image Segmentation:\n Reducing Computational Efforts via Tucker Decomposition","summary":" We address the computational barrier of deploying advanced deep learning\nsegmentation models in clinical settings by studying the efficacy of network\ncompression through tensor decomposition. We propose a post-training Tucker\nfactorization that enables the decomposition of pre-existing models to reduce\ncomputational requirements without impeding segmentation accuracy. We applied\nTucker decomposition to the convolutional kernels of the TotalSegmentator (TS)\nmodel, an nnU-Net model trained on a comprehensive dataset for automatic\nsegmentation of 117 anatomical structures. Our approach reduced the\nfloating-point operations (FLOPs) and memory required during inference,\noffering an adjustable trade-off between computational efficiency and\nsegmentation quality. This study utilized the publicly available TS dataset,\nemploying various downsampling factors to explore the relationship between\nmodel size, inference speed, and segmentation performance. The application of\nTucker decomposition to the TS model substantially reduced the model parameters\nand FLOPs across various compression rates, with limited loss in segmentation\naccuracy. 
We removed up to 88% of the model's parameters with no significant\nperformance changes in the majority of classes after fine-tuning. Practical\nbenefits varied across different graphics processing unit (GPU) architectures,\nwith more distinct speed-ups on less powerful hardware. Post-hoc network\ncompression via Tucker decomposition presents a viable strategy for reducing\nthe computational demand of medical image segmentation models without\nsubstantially sacrificing accuracy. This approach enables the broader adoption\nof advanced deep learning technologies in clinical practice, offering a way to\nnavigate the constraints of hardware capabilities.\n","authors":["Tobias Weber","Jakob Dexl","David Rügamer","Michael Ingrisch"],"pdf_url":"https://arxiv.org/pdf/2404.09683v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12235v1","updated":"2024-04-18T14:51:42Z","published":"2024-04-18T14:51:42Z","title":"Beyond Average: Individualized Visual Scanpath Prediction","summary":" Understanding how attention varies across individuals has significant\nscientific and societal impacts. However, existing visual scanpath models treat\nattention uniformly, neglecting individual differences. To bridge this gap,\nthis paper focuses on individualized scanpath prediction (ISP), a new attention\nmodeling task that aims to accurately predict how different individuals shift\ntheir attention in diverse visual tasks. It proposes an ISP method featuring\nthree novel technical components: (1) an observer encoder to characterize and\nintegrate an observer's unique attention traits, (2) an observer-centric\nfeature integration approach that holistically combines visual features, task\nguidance, and observer-specific characteristics, and (3) an adaptive fixation\nprioritization mechanism that refines scanpath predictions by dynamically\nprioritizing semantic feature maps based on individual observers' attention\ntraits. These novel components allow scanpath models to effectively address the\nattention variations across different observers. Our method is generally\napplicable to different datasets, model architectures, and visual tasks,\noffering a comprehensive tool for transforming general scanpath models into\nindividualized ones. Comprehensive evaluations using value-based and\nranking-based metrics verify the method's effectiveness and generalizability.\n","authors":["Xianyu Chen","Ming Jiang","Qi Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.12235v1.pdf","comment":"To appear in CVPR2024"},{"id":"http://arxiv.org/abs/2404.12216v1","updated":"2024-04-18T14:20:30Z","published":"2024-04-18T14:20:30Z","title":"ProTA: Probabilistic Token Aggregation for Text-Video Retrieval","summary":" Text-video retrieval aims to find the most relevant cross-modal samples for a\ngiven query. Recent methods focus on modeling the whole spatial-temporal\nrelations. However, since video clips contain more diverse content than\ncaptions, the model aligning these asymmetric video-text pairs has a high risk\nof retrieving many false positive results. In this paper, we propose\nProbabilistic Token Aggregation (\\textit{ProTA}) to handle cross-modal\ninteraction with content asymmetry. Specifically, we propose dual\npartial-related aggregation to disentangle and re-aggregate token\nrepresentations in both low-dimension and high-dimension spaces. We propose\ntoken-based probabilistic alignment to generate token-level probabilistic\nrepresentation and maintain the feature representation diversity. 
In addition,\nan adaptive contrastive loss is proposed to learn compact cross-modal\ndistribution space. Based on extensive experiments, \\textit{ProTA} achieves\nsignificant improvements on MSR-VTT (50.9%), LSMDC (25.8%), and DiDeMo (47.2%).\n","authors":["Han Fang","Xianghao Zang","Chao Ban","Zerun Feng","Lanxiang Zhou","Zhongjiang He","Yongxiang Li","Hao Sun"],"pdf_url":"https://arxiv.org/pdf/2404.12216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12210v1","updated":"2024-04-18T14:14:44Z","published":"2024-04-18T14:14:44Z","title":"Observation, Analysis, and Solution: Exploring Strong Lightweight Vision\n Transformers via Masked Image Modeling Pre-Training","summary":" Masked image modeling (MIM) pre-training for large-scale vision transformers\n(ViTs) in computer vision has enabled promising downstream performance on top\nof the learned self-supervised ViT features. In this paper, we question if the\nextremely simple ViTs' fine-tuning performance with a small-scale architecture\ncan also benefit from this pre-training paradigm, which is considerably less\nstudied yet in contrast to the well-established lightweight architecture design\nmethodology with sophisticated components introduced. By carefully adapting\nvarious typical MIM pre-training methods to this lightweight regime and\ncomparing them with the contrastive learning (CL) pre-training on various\ndownstream image classification and dense prediction tasks, we systematically\nobserve different behaviors between MIM and CL with respect to the downstream\nfine-tuning data scales. Furthermore, we analyze the frozen features under\nlinear probing evaluation and also the layer representation similarities and\nattention maps across the obtained models, which clearly show the inferior\nlearning of MIM pre-training on higher layers, leading to unsatisfactory\nfine-tuning performance on data-insufficient downstream tasks. This finding is\nnaturally a guide to choosing appropriate distillation strategies during\npre-training to solve the above deterioration problem. Extensive experiments on\nvarious vision tasks demonstrate the effectiveness of our\nobservation-analysis-solution flow. In particular, our pre-training with\ndistillation on pure lightweight ViTs with vanilla/hierarchical design\n(5.7M/6.5M) can achieve 79.4%/78.9% top-1 accuracy on ImageNet-1K. It also\nenables SOTA performance on the ADE20K semantic segmentation task (42.8% mIoU)\nand LaSOT visual tracking task (66.1% AUC) in the lightweight regime. The\nlatter even surpasses all the current SOTA lightweight CPU-realtime trackers.\n","authors":["Jin Gao","Shubo Lin","Shaoru Wang","Yutong Kou","Zeming Li","Liang Li","Congxuan Zhang","Xiaoqin Zhang","Yizheng Wang","Weiming Hu"],"pdf_url":"https://arxiv.org/pdf/2404.12210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12209v1","updated":"2024-04-18T14:14:07Z","published":"2024-04-18T14:14:07Z","title":"Partial-to-Partial Shape Matching with Geometric Consistency","summary":" Finding correspondences between 3D shapes is an important and long-standing\nproblem in computer vision, graphics and beyond. A prominent challenge are\npartial-to-partial shape matching settings, which occur when the shapes to\nmatch are only observed incompletely (e.g. from 3D scanning). Although\npartial-to-partial matching is a highly relevant setting in practice, it is\nrarely explored. 
Our work bridges the gap between existing (rather artificial)\n3D full shape matching and partial-to-partial real-world settings by exploiting\ngeometric consistency as a strong constraint. We demonstrate that it is indeed\npossible to solve this challenging problem in a variety of settings. For the\nfirst time, we achieve geometric consistency for partial-to-partial matching,\nwhich is realized by a novel integer non-linear program formalism building on\ntriangle product spaces, along with a new pruning algorithm based on linear\ninteger programming. Further, we generate a new inter-class dataset for\npartial-to-partial shape-matching. We show that our method outperforms current\nSOTA methods on both an established intra-class dataset and our novel\ninter-class dataset.\n","authors":["Viktoria Ehm","Maolin Gao","Paul Roetzer","Marvin Eisenberger","Daniel Cremers","Florian Bernard"],"pdf_url":"https://arxiv.org/pdf/2404.12209v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12203v1","updated":"2024-04-18T14:07:08Z","published":"2024-04-18T14:07:08Z","title":"GraFIQs: Face Image Quality Assessment Using Gradient Magnitudes","summary":" Face Image Quality Assessment (FIQA) estimates the utility of face images for\nautomated face recognition (FR) systems. We propose in this work a novel\napproach to assess the quality of face images based on inspecting the required\nchanges in the pre-trained FR model weights to minimize differences between\ntesting samples and the distribution of the FR training dataset. To achieve\nthat, we propose quantifying the discrepancy in Batch Normalization statistics\n(BNS), including mean and variance, between those recorded during FR training\nand those obtained by processing testing samples through the pretrained FR\nmodel. We then generate gradient magnitudes of pretrained FR weights by\nbackpropagating the BNS through the pretrained model. The cumulative absolute\nsum of these gradient magnitudes serves as the FIQ for our approach. Through\ncomprehensive experimentation, we demonstrate the effectiveness of our\ntraining-free and quality labeling-free approach, achieving competitive\nperformance to recent state-of-theart FIQA approaches without relying on\nquality labeling, the need to train regression networks, specialized\narchitectures, or designing and optimizing specific loss functions.\n","authors":["Jan Niklas Kolf","Naser Damer","Fadi Boutros"],"pdf_url":"https://arxiv.org/pdf/2404.12203v1.pdf","comment":"Accepted at CVPR Workshop 2024"},{"id":"http://arxiv.org/abs/2404.12192v1","updated":"2024-04-18T13:56:03Z","published":"2024-04-18T13:56:03Z","title":"Aligning Actions and Walking to LLM-Generated Textual Descriptions","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities in\nvarious domains, including data augmentation and synthetic data generation.\nThis work explores the use of LLMs to generate rich textual descriptions for\nmotion sequences, encompassing both actions and walking patterns. We leverage\nthe expressive power of LLMs to align motion representations with high-level\nlinguistic cues, addressing two distinct tasks: action recognition and\nretrieval of walking sequences based on appearance attributes. For action\nrecognition, we employ LLMs to generate textual descriptions of actions in the\nBABEL-60 dataset, facilitating the alignment of motion sequences with\nlinguistic representations. 
In the domain of gait analysis, we investigate the\nimpact of appearance attributes on walking patterns by generating textual\ndescriptions of motion sequences from the DenseGait dataset using LLMs. These\ndescriptions capture subtle variations in walking styles influenced by factors\nsuch as clothing choices and footwear. Our approach demonstrates the potential\nof LLMs in augmenting structured motion attributes and aligning multi-modal\nrepresentations. The findings contribute to the advancement of comprehensive\nmotion understanding and open up new avenues for leveraging LLMs in multi-modal\nalignment and data augmentation for motion analysis. We make the code publicly\navailable at https://github.com/Radu1999/WalkAndText\n","authors":["Radu Chivereanu","Adrian Cosma","Andy Catruna","Razvan Rughinis","Emilian Radoi"],"pdf_url":"https://arxiv.org/pdf/2404.12192v1.pdf","comment":"Accepted at 2nd Workshop on Learning with Few or without Annotated\n Face, Body and Gesture Data"},{"id":"http://arxiv.org/abs/2404.12183v1","updated":"2024-04-18T13:46:16Z","published":"2024-04-18T13:46:16Z","title":"Gait Recognition from Highly Compressed Videos","summary":" Surveillance footage represents a valuable resource and opportunities for\nconducting gait analysis. However, the typical low quality and high noise\nlevels in such footage can severely impact the accuracy of pose estimation\nalgorithms, which are foundational for reliable gait analysis. Existing\nliterature suggests a direct correlation between the efficacy of pose\nestimation and the subsequent gait analysis results. A common mitigation\nstrategy involves fine-tuning pose estimation models on noisy data to improve\nrobustness. However, this approach may degrade the downstream model's\nperformance on the original high-quality data, leading to a trade-off that is\nundesirable in practice. We propose a processing pipeline that incorporates a\ntask-targeted artifact correction model specifically designed to pre-process\nand enhance surveillance footage before pose estimation. Our artifact\ncorrection model is optimized to work alongside a state-of-the-art pose\nestimation network, HRNet, without requiring repeated fine-tuning of the pose\nestimation model. Furthermore, we propose a simple and robust method for\nobtaining low quality videos that are annotated with poses in an automatic\nmanner with the purpose of training the artifact correction model. We\nsystematically evaluate the performance of our artifact correction model\nagainst a range of noisy surveillance data and demonstrate that our approach\nnot only achieves improved pose estimation on low-quality surveillance footage,\nbut also preserves the integrity of the pose estimation on high resolution\nfootage. Our experiments show a clear enhancement in gait analysis performance,\nsupporting the viability of the proposed method as a superior alternative to\ndirect fine-tuning strategies. 
Our contributions pave the way for more reliable\ngait analysis using surveillance data in real-world applications, regardless of\ndata quality.\n","authors":["Andrei Niculae","Andy Catruna","Adrian Cosma","Daniel Rosner","Emilian Radoi"],"pdf_url":"https://arxiv.org/pdf/2404.12183v1.pdf","comment":"Accepted at 2nd Workshop on Learning with Few or without Annotated\n Face, Body and Gesture Data"},{"id":"http://arxiv.org/abs/2404.10335v2","updated":"2024-04-18T13:34:08Z","published":"2024-04-16T07:19:52Z","title":"Efficiently Adversarial Examples Generation for Visual-Language Models\n under Targeted Transfer Scenarios using Diffusion Models","summary":" Targeted transfer-based attacks involving adversarial examples pose a\nsignificant threat to large visual-language models (VLMs). However, the\nstate-of-the-art (SOTA) transfer-based attacks incur high costs due to\nexcessive iteration counts. Furthermore, the generated adversarial examples\nexhibit pronounced adversarial noise and demonstrate limited efficacy in\nevading defense methods such as DiffPure. To address these issues, inspired by\nscore matching, we introduce AdvDiffVLM, which utilizes diffusion models to\ngenerate natural, unrestricted adversarial examples. Specifically, AdvDiffVLM\nemploys Adaptive Ensemble Gradient Estimation to modify the score during the\ndiffusion model's reverse generation process, ensuring the adversarial examples\nproduced contain natural adversarial semantics and thus possess enhanced\ntransferability. Simultaneously, to enhance the quality of adversarial examples\nfurther, we employ the GradCAM-guided Mask method to disperse adversarial\nsemantics throughout the image, rather than concentrating them in a specific\narea. Experimental results demonstrate that our method achieves a speedup\nranging from 10X to 30X compared to existing transfer-based attack methods,\nwhile maintaining superior quality of adversarial examples. Additionally, the\ngenerated adversarial examples possess strong transferability and exhibit\nincreased robustness against adversarial defense methods. Notably, AdvDiffVLM\ncan successfully attack commercial VLMs, including GPT-4V, in a black-box\nmanner.\n","authors":["Qi Guo","Shanmin Pang","Xiaojun Jia","Qing Guo"],"pdf_url":"https://arxiv.org/pdf/2404.10335v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02286v3","updated":"2024-04-18T13:33:32Z","published":"2024-02-03T22:51:17Z","title":"Multi-Level Aggregation and Recursive Alignment Architecture for\n Efficient Parallel Inference Segmentation Network","summary":" Real-time semantic segmentation is a crucial research for real-world\napplications. However, many methods lay particular emphasis on reducing the\ncomputational complexity and model size, while largely sacrificing the\naccuracy. To tackle this problem, we propose a parallel inference network\ncustomized for semantic segmentation tasks to achieve a good trade-off between\nspeed and accuracy. We employ a shallow backbone to ensure real-time speed, and\npropose three core components to compensate for the reduced model capacity to\nimprove accuracy. 
Specifically, we first design a dual-pyramidal path\narchitecture (Multi-level Feature Aggregation Module, MFAM) to aggregate\nmulti-level features from the encoder to each scale, providing hierarchical\nclues for subsequent spatial alignment and corresponding in-network inference.\nThen, we build Recursive Alignment Module (RAM) by combining the flow-based\nalignment module with recursive upsampling architecture for accurate spatial\nalignment between multi-scale feature maps with half the computational\ncomplexity of the straightforward alignment method. Finally, we perform\nindependent parallel inference on the aligned features to obtain multi-scale\nscores, and adaptively fuse them through an attention-based Adaptive Scores\nFusion Module (ASFM) so that the final prediction can favor objects of multiple\nscales. Our framework shows a better balance between speed and accuracy than\nstate-of-the-art real-time methods on Cityscapes and CamVid datasets. We also\nconducted systematic ablation studies to gain insight into our motivation and\narchitectural design. Code is available at:\nhttps://github.com/Yanhua-Zhang/MFARANet.\n","authors":["Yanhua Zhang","Ke Zhang","Jingyu Wang","Yulin Wu","Wuwei Wang"],"pdf_url":"https://arxiv.org/pdf/2402.02286v3.pdf","comment":"15 pages, 9 figures and 12 Tables. Manuscript completed on April 30,\n 2022"},{"id":"http://arxiv.org/abs/2404.12172v1","updated":"2024-04-18T13:27:29Z","published":"2024-04-18T13:27:29Z","title":"How to Benchmark Vision Foundation Models for Semantic Segmentation?","summary":" Recent vision foundation models (VFMs) have demonstrated proficiency in\nvarious tasks but require supervised fine-tuning to perform the task of\nsemantic segmentation effectively. Benchmarking their performance is essential\nfor selecting current models and guiding future model developments for this\ntask. The lack of a standardized benchmark complicates comparisons. Therefore,\nthe primary objective of this paper is to study how VFMs should be benchmarked\nfor semantic segmentation. To do so, various VFMs are fine-tuned under various\nsettings, and the impact of individual settings on the performance ranking and\ntraining time is assessed. Based on the results, the recommendation is to\nfine-tune the ViT-B variants of VFMs with a 16x16 patch size and a linear\ndecoder, as these settings are representative of using a larger model, more\nadvanced decoder and smaller patch size, while reducing training time by more\nthan 13 times. Using multiple datasets for training and evaluation is also\nrecommended, as the performance ranking across datasets and domain shifts\nvaries. Linear probing, a common practice for some VFMs, is not recommended, as\nit is not representative of end-to-end fine-tuning. The benchmarking setup\nrecommended in this paper enables a performance analysis of VFMs for semantic\nsegmentation. The findings of such an analysis reveal that pretraining with\npromptable segmentation is not beneficial, whereas masked image modeling (MIM)\nwith abstract representations is crucial, even more important than the type of\nsupervision used. 
The code for efficiently fine-tuning VFMs for semantic\nsegmentation can be accessed through the project page at:\nhttps://tue-mps.github.io/benchmark-vfm-ss/.\n","authors":["Tommie Kerssies","Daan de Geus","Gijs Dubbelman"],"pdf_url":"https://arxiv.org/pdf/2404.12172v1.pdf","comment":"CVPR 2024 Workshop Proceedings for the Second Workshop on Foundation\n Models"},{"id":"http://arxiv.org/abs/2404.12168v1","updated":"2024-04-18T13:22:56Z","published":"2024-04-18T13:22:56Z","title":"Real-World Efficient Blind Motion Deblurring via Blur Pixel\n Discretization","summary":" As recent advances in mobile camera technology have enabled the capability to\ncapture high-resolution images, such as 4K images, the demand for an efficient\ndeblurring model handling large motion has increased. In this paper, we\ndiscover that the image residual errors, i.e., blur-sharp pixel differences,\ncan be grouped into some categories according to their motion blur type and how\ncomplex their neighboring pixels are. Inspired by this, we decompose the\ndeblurring (regression) task into blur pixel discretization (pixel-level blur\nclassification) and discrete-to-continuous conversion (regression with blur\nclass map) tasks. Specifically, we generate the discretized image residual\nerrors by identifying the blur pixels and then transform them to a continuous\nform, which is computationally more efficient than naively solving the original\nregression problem with continuous values. Here, we found that the\ndiscretization result, i.e., blur segmentation map, remarkably exhibits visual\nsimilarity with the image residual errors. As a result, our efficient model\nshows comparable performance to state-of-the-art methods in realistic\nbenchmarks, while our method is up to 10 times computationally more efficient.\n","authors":["Insoo Kim","Jae Seok Choi","Geonseok Seo","Kinam Kwon","Jinwoo Shin","Hyong-Euk Lee"],"pdf_url":"https://arxiv.org/pdf/2404.12168v1.pdf","comment":"CVPR2024 Camera-Ready"},{"id":"http://arxiv.org/abs/2311.17116v4","updated":"2024-04-18T13:03:44Z","published":"2023-11-28T12:14:22Z","title":"REF$^2$-NeRF: Reflection and Refraction aware Neural Radiance Field","summary":" Recently, significant progress has been made in the study of methods for 3D\nreconstruction from multiple images using implicit neural representations,\nexemplified by the neural radiance field (NeRF) method. Such methods, which are\nbased on volume rendering, can model various light phenomena, and various\nextended methods have been proposed to accommodate different scenes and\nsituations. However, when handling scenes with multiple glass objects, e.g.,\nobjects in a glass showcase, modeling the target scene accurately has been\nchallenging due to the presence of multiple reflection and refraction effects.\nThus, this paper proposes a NeRF-based modeling method for scenes containing a\nglass case. In the proposed method, refraction and reflection are modeled using\nelements that are dependent and independent of the viewer's perspective. This\napproach allows us to estimate the surfaces where refraction occurs, i.e.,\nglass surfaces, and enables the separation and modeling of both direct and\nreflected light components. The proposed method requires predetermined camera\nposes, but accurately estimating these poses in scenes with glass objects is\ndifficult. Therefore, we used a robotic arm with an attached camera to acquire\nimages with known poses. 
Compared to existing methods, the proposed method\nenables more accurate modeling of both glass refraction and the overall scene.\n","authors":["Wooseok Kim","Taiki Fukiage","Takeshi Oishi"],"pdf_url":"https://arxiv.org/pdf/2311.17116v4.pdf","comment":"10 pages, 8 figures, 2 tables"},{"id":"http://arxiv.org/abs/2404.12154v1","updated":"2024-04-18T12:58:55Z","published":"2024-04-18T12:58:55Z","title":"StyleBooth: Image Style Editing with Multimodal Instruction","summary":" Given an original image, image editing aims to generate an image that aligns\nwith the provided instruction. The challenges are accepting multimodal inputs\nas instructions and the scarcity of high-quality training data, including crucial\ntriplets of source/target image pairs and multimodal (text and image)\ninstructions. In this paper, we focus on image style editing and present\nStyleBooth, a method that proposes a comprehensive framework for image editing\nand a feasible strategy for building a high-quality style editing dataset. We\nintegrate encoded textual instruction and image exemplar as a unified condition\nfor the diffusion model, enabling the editing of the original image following\nmultimodal instructions. Furthermore, by iterative style-destyle tuning and\nediting and usability filtering, the StyleBooth dataset provides\ncontent-consistent stylized/plain image pairs in various categories of styles.\nTo show the flexibility of StyleBooth, we conduct experiments on diverse tasks,\nsuch as text-based style editing, exemplar-based style editing and\ncompositional style editing. The results demonstrate that the quality and\nvariety of training data significantly enhance the ability to preserve content\nand improve the overall quality of generated images in editing tasks. Project\npage can be found at https://ali-vilab.github.io/stylebooth-page/.\n","authors":["Zhen Han","Chaojie Mao","Zeyinzi Jiang","Yulin Pan","Jingfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.12154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15260v3","updated":"2024-04-18T12:44:56Z","published":"2023-11-26T10:27:22Z","title":"NeuRAD: Neural Rendering for Autonomous Driving","summary":" Neural radiance fields (NeRFs) have gained popularity in the autonomous\ndriving (AD) community. Recent methods show NeRFs' potential for closed-loop\nsimulation, enabling testing of AD systems, and as an advanced training data\naugmentation technique. However, existing methods often require long training\ntimes, dense semantic supervision, or lack generalizability. This, in turn,\nhinders the application of NeRFs for AD at scale. In this paper, we propose\nNeuRAD, a robust novel view synthesis method tailored to dynamic AD data. Our\nmethod features simple network design, extensive sensor modeling for both\ncamera and lidar -- including rolling shutter, beam divergence and ray dropping\n-- and is applicable to multiple datasets out of the box. We verify its\nperformance on five popular AD datasets, achieving state-of-the-art performance\nacross the board. To encourage further development, we will openly release the\nNeuRAD source code. 
See https://github.com/georghess/NeuRAD .\n","authors":["Adam Tonderski","Carl Lindström","Georg Hess","William Ljungbergh","Lennart Svensson","Christoffer Petersson"],"pdf_url":"https://arxiv.org/pdf/2311.15260v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12139v1","updated":"2024-04-18T12:41:33Z","published":"2024-04-18T12:41:33Z","title":"Omniview-Tuning: Boosting Viewpoint Invariance of Vision-Language\n Pre-training Models","summary":" Vision-Language Pre-training (VLP) models like CLIP have achieved remarkable\nsuccess in computer vision and particularly demonstrated superior robustness to\ndistribution shifts of 2D images. However, their robustness under 3D viewpoint\nvariations is still limited, which can hinder the development for real-world\napplications. This paper successfully addresses this concern while keeping\nVLPs' original performance by breaking through two primary obstacles: 1) the\nscarcity of training data and 2) the suboptimal fine-tuning paradigms. To\ncombat data scarcity, we build the Multi-View Caption (MVCap) dataset -- a\ncomprehensive collection of over four million multi-view image-text pairs\nacross more than 100K objects, providing more potential for VLP models to\ndevelop generalizable viewpoint-invariant representations. To address the\nlimitations of existing paradigms in performance trade-offs and training\nefficiency, we design a novel fine-tuning framework named Omniview-Tuning\n(OVT). Specifically, OVT introduces a Cross-Viewpoint Alignment objective\nthrough a minimax-like optimization strategy, which effectively aligns\nrepresentations of identical objects from diverse viewpoints without causing\noverfitting. Additionally, OVT fine-tunes VLP models in a parameter-efficient\nmanner, leading to minimal computational cost. Extensive experiments on various\nVLP models with different architectures validate that OVT significantly\nimproves the models' resilience to viewpoint shifts and keeps the original\nperformance, establishing a pioneering standard for boosting the viewpoint\ninvariance of VLP models.\n","authors":["Shouwei Ruan","Yinpeng Dong","Hanqing Liu","Yao Huang","Hang Su","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2404.12139v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2311.06634v2","updated":"2024-04-18T12:39:15Z","published":"2023-11-11T18:32:06Z","title":"Back to Basics: Fast Denoising Iterative Algorithm","summary":" We introduce Back to Basics (BTB), a fast iterative algorithm for noise\nreduction. Our method is computationally efficient, does not require training\nor ground truth data, and can be applied in the presence of independent noise,\nas well as correlated (coherent) noise, where the noise level is unknown. We\nexamine three study cases: natural image denoising in the presence of additive\nwhite Gaussian noise, Poisson-distributed image denoising, and speckle\nsuppression in optical coherence tomography (OCT). Experimental results\ndemonstrate that the proposed approach can effectively improve image quality,\nin challenging noise settings. 
Theoretical guarantees are provided for\nconvergence stability.\n","authors":["Deborah Pereg"],"pdf_url":"https://arxiv.org/pdf/2311.06634v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12130v1","updated":"2024-04-18T12:31:48Z","published":"2024-04-18T12:31:48Z","title":"One-Shot Sequential Federated Learning for Non-IID Data by Enhancing\n Local Model Diversity","summary":" Traditional federated learning mainly focuses on parallel settings (PFL),\nwhich can suffer significant communication and computation costs. In contrast,\none-shot and sequential federated learning (SFL) have emerged as innovative\nparadigms to alleviate these costs. However, the issue of non-IID (Independent\nand Identically Distributed) data persists as a significant challenge in\none-shot and SFL settings, exacerbated by the restricted communication between\nclients. In this paper, we improve the one-shot sequential federated learning\nfor non-IID data by proposing a local model diversity-enhancing strategy.\nSpecifically, to leverage the potential of local model diversity for improving\nmodel performance, we introduce a local model pool for each client that\ncomprises diverse models generated during local training, and propose two\ndistance measurements to further enhance the model diversity and mitigate the\neffect of non-IID data. Consequently, our proposed framework can improve the\nglobal model performance while maintaining low communication costs. Extensive\nexperiments demonstrate that our method exhibits superior performance to\nexisting one-shot PFL methods and achieves better accuracy compared with\nstate-of-the-art one-shot SFL methods on both label-skew and domain-shift tasks\n(e.g., 6%+ accuracy improvement on the CIFAR-10 dataset).\n","authors":["Naibo Wang","Yuchen Deng","Wenjie Feng","Shichen Fan","Jianwei Yin","See-Kiong Ng"],"pdf_url":"https://arxiv.org/pdf/2404.12130v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12120v1","updated":"2024-04-18T12:13:09Z","published":"2024-04-18T12:13:09Z","title":"Fortify the Guardian, Not the Treasure: Resilient Adversarial Detectors","summary":" This paper presents RADAR-Robust Adversarial Detection via Adversarial\nRetraining-an approach designed to enhance the robustness of adversarial\ndetectors against adaptive attacks, while maintaining classifier performance.\nAn adaptive attack is one where the attacker is aware of the defenses and\nadapts their strategy accordingly. Our proposed method leverages adversarial\ntraining to reinforce the ability to detect attacks, without compromising clean\naccuracy. During the training phase, we integrate into the dataset adversarial\nexamples, which were optimized to fool both the classifier and the adversarial\ndetector, enabling the adversarial detector to learn and adapt to potential\nattack scenarios. 
Experimental evaluations on the CIFAR-10 and SVHN datasets\ndemonstrate that our proposed algorithm significantly improves a detector's\nability to accurately identify adaptive adversarial attacks -- without\nsacrificing clean accuracy.\n","authors":["Raz Lapid","Almog Dubin","Moshe Sipper"],"pdf_url":"https://arxiv.org/pdf/2404.12120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08182v2","updated":"2024-04-18T11:57:49Z","published":"2023-10-12T10:17:40Z","title":"XIMAGENET-12: An Explainable AI Benchmark Dataset for Model Robustness\n Evaluation","summary":" Despite the promising performance of existing visual models on public\nbenchmarks, the critical assessment of their robustness for real-world\napplications remains an ongoing challenge. To bridge this gap, we propose an\nexplainable visual dataset, XIMAGENET-12, to evaluate the robustness of visual\nmodels. XIMAGENET-12 consists of over 200K images with 15,410 manual semantic\nannotations. Specifically, we deliberately selected 12 categories from\nImageNet, representing objects commonly encountered in practical life. To\nsimulate real-world situations, we incorporated six diverse scenarios, such as\noverexposure, blurring, and color changes, etc. We further develop a\nquantitative criterion for robustness assessment, allowing for a nuanced\nunderstanding of how visual models perform under varying conditions, notably in\nrelation to the background. We make the XIMAGENET-12 dataset and its\ncorresponding code openly accessible at\n\\url{https://sites.google.com/view/ximagenet-12/home}. We expect the\nintroduction of the XIMAGENET-12 dataset will empower researchers to thoroughly\nevaluate the robustness of their visual models under challenging conditions.\n","authors":["Qiang Li","Dan Zhang","Shengzhao Lei","Xun Zhao","Porawit Kamnoedboon","WeiWei Li","Junhao Dong","Shuyan Li"],"pdf_url":"https://arxiv.org/pdf/2310.08182v2.pdf","comment":"Paper accepted by Synthetic Data for Computer Vision Workshop @ IEEE\n CVPR 2024"},{"id":"http://arxiv.org/abs/2404.06211v2","updated":"2024-04-18T11:52:11Z","published":"2024-04-09T11:00:11Z","title":"Unified Physical-Digital Attack Detection Challenge","summary":" Face Anti-Spoofing (FAS) is crucial to safeguard Face Recognition (FR)\nSystems. In real-world scenarios, FRs are confronted with both physical and\ndigital attacks. However, existing algorithms often address only one type of\nattack at a time, which poses significant limitations in real-world scenarios\nwhere FR systems face hybrid physical-digital threats. To facilitate the\nresearch of Unified Attack Detection (UAD) algorithms, a large-scale\nUniAttackData dataset has been collected. UniAttackData is the largest public\ndataset for Unified Attack Detection, with a total of 28,706 videos, where each\nunique identity encompasses all advanced attack types. Based on this dataset,\nwe organized a Unified Physical-Digital Face Attack Detection Challenge to\nboost the research in Unified Attack Detections. It attracted 136 teams for the\ndevelopment phase, with 13 qualifying for the final round. The results\nre-verified by the organizing team were used for the final ranking. This paper\ncomprehensively reviews the challenge, detailing the dataset introduction,\nprotocol definition, evaluation criteria, and a summary of published results.\nFinally, we focus on the detailed analysis of the highest-performing algorithms\nand offer potential directions for unified physical-digital attack detection\ninspired by this competition. 
Challenge Website:\nhttps://sites.google.com/view/face-anti-spoofing-challenge/welcome/challengecvpr2024.\n","authors":["Haocheng Yuan","Ajian Liu","Junze Zheng","Jun Wan","Jiankang Deng","Sergio Escalera","Hugo Jair Escalante","Isabelle Guyon","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2404.06211v2.pdf","comment":"11 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.12104v1","updated":"2024-04-18T11:38:25Z","published":"2024-04-18T11:38:25Z","title":"Ethical-Lens: Curbing Malicious Usages of Open-Source Text-to-Image\n Models","summary":" The burgeoning landscape of text-to-image models, exemplified by innovations\nsuch as Midjourney and DALLE 3, has revolutionized content creation across\ndiverse sectors. However, these advancements bring forth critical ethical\nconcerns, particularly with the misuse of open-source models to generate\ncontent that violates societal norms. Addressing this, we introduce\nEthical-Lens, a framework designed to facilitate the value-aligned usage of\ntext-to-image tools without necessitating internal model revision. Ethical-Lens\nensures value alignment in text-to-image models across toxicity and bias\ndimensions by refining user commands and rectifying model outputs. Systematic\nevaluation metrics, combining GPT4-V, HEIM, and FairFace scores, assess\nalignment capability. Our experiments reveal that Ethical-Lens enhances\nalignment capabilities to levels comparable with or superior to commercial\nmodels like DALLE 3, ensuring user-generated content adheres to ethical\nstandards while maintaining image quality. This study indicates the potential\nof Ethical-Lens to ensure the sustainable development of open-source\ntext-to-image tools and their beneficial integration into society. Our code is\navailable at https://github.com/yuzhu-cai/Ethical-Lens.\n","authors":["Yuzhu Cai","Sheng Yin","Yuxi Wei","Chenxin Xu","Weibo Mao","Felix Juefei-Xu","Siheng Chen","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12104v1.pdf","comment":"42 pages, 17 figures, 29 tables"},{"id":"http://arxiv.org/abs/2404.12103v1","updated":"2024-04-18T11:36:37Z","published":"2024-04-18T11:36:37Z","title":"S3R-Net: A Single-Stage Approach to Self-Supervised Shadow Removal","summary":" In this paper we present S3R-Net, the Self-Supervised Shadow Removal Network.\nThe two-branch WGAN model achieves self-supervision relying on the\nunify-and-adapt phenomenon - it unifies the style of the output data and infers\nits characteristics from a database of unaligned shadow-free reference images.\nThis approach stands in contrast to the large body of supervised frameworks.\nS3R-Net also differentiates itself from the few existing self-supervised models\noperating in a cycle-consistent manner, as it is a non-cyclic, unidirectional\nsolution. The proposed framework achieves comparable numerical scores to recent\nself-supervised shadow removal models while exhibiting superior qualitative\nperformance and keeping the computational cost low.\n","authors":["Nikolina Kubiak","Armin Mustafa","Graeme Phillipson","Stephen Jolly","Simon Hadfield"],"pdf_url":"https://arxiv.org/pdf/2404.12103v1.pdf","comment":"NTIRE workshop @ CVPR 2024. 
Code & models available at\n https://github.com/n-kubiak/S3R-Net"},{"id":"http://arxiv.org/abs/2303.13959v4","updated":"2024-04-18T11:31:00Z","published":"2023-03-24T12:33:44Z","title":"Bridging Stereo Geometry and BEV Representation with Reliable Mutual\n Interaction for Semantic Scene Completion","summary":" 3D semantic scene completion (SSC) is an ill-posed perception task that\nrequires inferring a dense 3D scene from limited observations. Previous\ncamera-based methods struggle to predict accurate semantic scenes due to\ninherent geometric ambiguity and incomplete observations. In this paper, we\nresort to stereo matching technique and bird's-eye-view (BEV) representation\nlearning to address such issues in SSC. Complementary to each other, stereo\nmatching mitigates geometric ambiguity with epipolar constraint while BEV\nrepresentation enhances the hallucination ability for invisible regions with\nglobal semantic context. However, due to the inherent representation gap\nbetween stereo geometry and BEV features, it is non-trivial to bridge them for\ndense prediction task of SSC. Therefore, we further develop a unified\noccupancy-based framework dubbed BRGScene, which effectively bridges these two\nrepresentations with dense 3D volumes for reliable semantic scene completion.\nSpecifically, we design a novel Mutual Interactive Ensemble (MIE) block for\npixel-level reliable aggregation of stereo geometry and BEV features. Within\nthe MIE block, a Bi-directional Reliable Interaction (BRI) module, enhanced\nwith confidence re-weighting, is employed to encourage fine-grained interaction\nthrough mutual guidance. Besides, a Dual Volume Ensemble (DVE) module is\nintroduced to facilitate complementary aggregation through channel-wise\nrecalibration and multi-group voting. Our method outperforms all published\ncamera-based methods on SemanticKITTI for semantic scene completion. Our code\nis available on \\url{https://github.com/Arlo0o/StereoScene}.\n","authors":["Bohan Li","Yasheng Sun","Zhujin Liang","Dalong Du","Zhuanghui Zhang","Xiaofeng Wang","Yunnan Wang","Xin Jin","Wenjun Zeng"],"pdf_url":"https://arxiv.org/pdf/2303.13959v4.pdf","comment":"IJCAI2024"},{"id":"http://arxiv.org/abs/2404.12091v1","updated":"2024-04-18T11:20:53Z","published":"2024-04-18T11:20:53Z","title":"Harnessing Joint Rain-/Detail-aware Representations to Eliminate\n Intricate Rains","summary":" Recent advances in image deraining have focused on training powerful models\non mixed multiple datasets comprising diverse rain types and backgrounds.\nHowever, this approach tends to overlook the inherent differences among rainy\nimages, leading to suboptimal results. To overcome this limitation, we focus on\naddressing various rainy images by delving into meaningful representations that\nencapsulate both the rain and background components. Leveraging these\nrepresentations as instructive guidance, we put forth a Context-based\nInstance-level Modulation (CoI-M) mechanism adept at efficiently modulating\nCNN- or Transformer-based models. Furthermore, we devise a rain-/detail-aware\ncontrastive learning strategy to help extract joint rain-/detail-aware\nrepresentations. By integrating CoI-M with the rain-/detail-aware Contrastive\nlearning, we develop CoIC, an innovative and potent algorithm tailored for\ntraining models on mixed datasets. 
Moreover, CoIC offers insight into modeling\nrelationships of datasets, quantitatively assessing the impact of rain and\ndetails on restoration, and unveiling distinct behaviors of models given\ndiverse inputs. Extensive experiments validate the efficacy of CoIC in boosting\nthe deraining ability of CNN and Transformer models. CoIC also enhances the\nderaining prowess remarkably when real-world dataset is included.\n","authors":["Wu Ran","Peirong Ma","Zhiquan He","Hao Ren","Hong Lu"],"pdf_url":"https://arxiv.org/pdf/2404.12091v1.pdf","comment":"21 pages, 14 figures"},{"id":"http://arxiv.org/abs/2404.12083v1","updated":"2024-04-18T11:09:25Z","published":"2024-04-18T11:09:25Z","title":"MambaPupil: Bidirectional Selective Recurrent model for Event-based Eye\n tracking","summary":" Event-based eye tracking has shown great promise with the high temporal\nresolution and low redundancy provided by the event camera. However, the\ndiversity and abruptness of eye movement patterns, including blinking,\nfixating, saccades, and smooth pursuit, pose significant challenges for eye\nlocalization. To achieve a stable event-based eye-tracking system, this paper\nproposes a bidirectional long-term sequence modeling and time-varying state\nselection mechanism to fully utilize contextual temporal information in\nresponse to the variability of eye movements. Specifically, the MambaPupil\nnetwork is proposed, which consists of the multi-layer convolutional encoder to\nextract features from the event representations, a bidirectional Gated\nRecurrent Unit (GRU), and a Linear Time-Varying State Space Module (LTV-SSM),\nto selectively capture contextual correlation from the forward and backward\ntemporal relationship. Furthermore, the Bina-rep is utilized as a compact event\nrepresentation, and the tailor-made data augmentation, called as Event-Cutout,\nis proposed to enhance the model's robustness by applying spatial random\nmasking to the event image. The evaluation on the ThreeET-plus benchmark shows\nthe superior performance of the MambaPupil, which secured the 1st place in\nCVPR'2024 AIS Event-based Eye Tracking challenge.\n","authors":["Zhong Wang","Zengyu Wan","Han Han","Bohao Liao","Yuliang Wu","Wei Zhai","Yang Cao","Zheng-jun Zha"],"pdf_url":"https://arxiv.org/pdf/2404.12083v1.pdf","comment":"Accepted by CVPR 2024 Workshop (AIS: Vision, Graphics and AI for\n Streaming), top solution of challenge Event-based Eye Tracking, see\n https://www.kaggle.com/competitions/event-based-eye-tracking-ais2024"},{"id":"http://arxiv.org/abs/2404.12081v1","updated":"2024-04-18T11:05:15Z","published":"2024-04-18T11:05:15Z","title":"MaskCD: A Remote Sensing Change Detection Network Based on Mask\n Classification","summary":" Change detection (CD) from remote sensing (RS) images using deep learning has\nbeen widely investigated in the literature. It is typically regarded as a\npixel-wise labeling task that aims to classify each pixel as changed or\nunchanged. Although per-pixel classification networks in encoder-decoder\nstructures have shown dominance, they still suffer from imprecise boundaries\nand incomplete object delineation at various scenes. For high-resolution RS\nimages, partly or totally changed objects are more worthy of attention rather\nthan a single pixel. Therefore, we revisit the CD task from the mask prediction\nand classification perspective and propose MaskCD to detect changed areas by\nadaptively generating categorized masks from input image pairs. 
Specifically,\nit utilizes a cross-level change representation perceiver (CLCRP) to learn\nmultiscale change-aware representations and capture spatiotemporal relations\nfrom encoded features by exploiting deformable multihead self-attention\n(DeformMHSA). Subsequently, a masked-attention-based detection transformer\n(MA-DETR) decoder is developed to accurately locate and identify changed\nobjects based on masked attention and self-attention mechanisms. It\nreconstructs the desired changed objects by decoding the pixel-wise\nrepresentations into learnable mask proposals and making final predictions from\nthese candidates. Experimental results on five benchmark datasets demonstrate\nthe proposed approach outperforms other state-of-the-art models. Codes and\npretrained models are available online (https://github.com/EricYu97/MaskCD).\n","authors":["Weikang Yu","Xiaokang Zhang","Samiran Das","Xiao Xiang Zhu","Pedram Ghamisi"],"pdf_url":"https://arxiv.org/pdf/2404.12081v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15663v2","updated":"2024-04-18T10:48:15Z","published":"2024-01-28T13:59:58Z","title":"Low-resolution Prior Equilibrium Network for CT Reconstruction","summary":" The unrolling method has been investigated for learning variational models in\nX-ray computed tomography. However, it has been observed that directly\nunrolling the regularization model through gradient descent does not produce\nsatisfactory results. In this paper, we present a novel deep learning-based CT\nreconstruction model, where the low-resolution image is introduced to obtain an\neffective regularization term for improving the network's robustness. Our\napproach involves constructing the backbone network architecture by algorithm\nunrolling that is realized using the deep equilibrium architecture. We\ntheoretically discuss the convergence of the proposed low-resolution prior\nequilibrium model and provide the conditions to guarantee convergence.\nExperimental results on both sparse-view and limited-angle reconstruction\nproblems are provided, demonstrating that our end-to-end low-resolution prior\nequilibrium model outperforms other state-of-the-art methods in terms of noise\nreduction, contrast-to-noise ratio, and preservation of edge details.\n","authors":["Yijie Yang","Qifeng Gao","Yuping Duan"],"pdf_url":"https://arxiv.org/pdf/2401.15663v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04519v3","updated":"2024-04-18T10:40:35Z","published":"2023-12-07T18:38:39Z","title":"Bootstrapping Autonomous Driving Radars with Self-Supervised Learning","summary":" The perception of autonomous vehicles using radars has attracted increased\nresearch interest due to its ability to operate in fog and bad weather. However,\ntraining radar models is hindered by the cost and difficulty of annotating\nlarge-scale radar data. To overcome this bottleneck, we propose a\nself-supervised learning framework to leverage the large amount of unlabeled\nradar data to pre-train radar-only embeddings for self-driving perception\ntasks. The proposed method combines radar-to-radar and radar-to-vision\ncontrastive losses to learn a general representation from unlabeled radar\nheatmaps paired with their corresponding camera images. When used for\ndownstream object detection, we demonstrate that the proposed self-supervision\nframework can improve the accuracy of state-of-the-art supervised baselines by\n$5.8\%$ in mAP. 
Code is available at \\url{https://github.com/yiduohao/Radical}.\n","authors":["Yiduo Hao","Sohrab Madani","Junfeng Guan","Mohammed Alloulah","Saurabh Gupta","Haitham Hassanieh"],"pdf_url":"https://arxiv.org/pdf/2312.04519v3.pdf","comment":"12 pages, 5 figures, to be published in Proceedings of the IEEE/CVF\n Conference on Computer Vision and Pattern Recognition 2024"},{"id":"http://arxiv.org/abs/2404.12064v1","updated":"2024-04-18T10:23:10Z","published":"2024-04-18T10:23:10Z","title":"PureForest: A Large-scale Aerial Lidar and Aerial Imagery Dataset for\n Tree Species Classification in Monospecific Forests","summary":" Knowledge of tree species distribution is fundamental to managing forests.\nNew deep learning approaches promise significant accuracy gains for forest\nmapping, and are becoming a critical tool for mapping multiple tree species at\nscale. To advance the field, deep learning researchers need large benchmark\ndatasets with high-quality annotations. To this end, we present the PureForest\ndataset: a large-scale, open, multimodal dataset designed for tree species\nclassification from both Aerial Lidar Scanning (ALS) point clouds and Very High\nResolution (VHR) aerial images. Most current public Lidar datasets for tree\nspecies classification have low diversity as they only span a small area of a\nfew dozen annotated hectares at most. In contrast, PureForest has 18 tree\nspecies grouped into 13 semantic classes, and spans 339 km$^2$ across 449\ndistinct monospecific forests, and is to date the largest and most\ncomprehensive Lidar dataset for the identification of tree species. By making\nPureForest publicly available, we hope to provide a challenging benchmark\ndataset to support the development of deep learning approaches for tree species\nidentification from Lidar and/or aerial imagery. In this data paper, we\ndescribe the annotation workflow, the dataset, the recommended evaluation\nmethodology, and establish a baseline performance from both 3D and 2D\nmodalities.\n","authors":["Charles Gaydon","Floryne Roche"],"pdf_url":"https://arxiv.org/pdf/2404.12064v1.pdf","comment":"14 pages | 5 figures | Dataset is available at\n http://huggingface.co/datasets/IGNF/PureForest"},{"id":"http://arxiv.org/abs/2404.12062v1","updated":"2024-04-18T10:20:37Z","published":"2024-04-18T10:20:37Z","title":"MIDGET: Music Conditioned 3D Dance Generation","summary":" In this paper, we introduce a MusIc conditioned 3D Dance GEneraTion model,\nnamed MIDGET based on Dance motion Vector Quantised Variational AutoEncoder\n(VQ-VAE) model and Motion Generative Pre-Training (GPT) model to generate\nvibrant and highquality dances that match the music rhythm. To tackle\nchallenges in the field, we introduce three new components: 1) a pre-trained\nmemory codebook based on the Motion VQ-VAE model to store different human pose\ncodes, 2) employing Motion GPT model to generate pose codes with music and\nmotion Encoders, 3) a simple framework for music feature extraction. We compare\nwith existing state-of-the-art models and perform ablation experiments on\nAIST++, the largest publicly available music-dance dataset. 
Experiments\ndemonstrate that our proposed framework achieves state-of-the-art performance\non motion quality and its alignment with the music.\n","authors":["Jinwu Wang","Wei Mao","Miaomiao Liu"],"pdf_url":"https://arxiv.org/pdf/2404.12062v1.pdf","comment":"12 pages, 6 figures Published in AI 2023: Advances in Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2312.16867v2","updated":"2024-04-18T10:14:31Z","published":"2023-12-28T07:37:11Z","title":"DualFluidNet: an Attention-based Dual-pipeline Network for FLuid\n Simulation","summary":" Fluid motion can be considered as a point cloud transformation when using the\nSPH method. Compared to traditional numerical analysis methods, using machine\nlearning techniques to learn physics simulations can achieve near-accurate\nresults, while significantly increasing efficiency. In this paper, we propose\nan innovative approach for 3D fluid simulations utilizing an Attention-based\nDual-pipeline Network, which employs a dual-pipeline architecture, seamlessly\nintegrated with an Attention-based Feature Fusion Module. Unlike previous\nmethods, which often make difficult trade-offs between global fluid control and\nphysical law constraints, we find a way to achieve a better balance between\nthese two crucial aspects with a well-designed dual-pipeline approach.\nAdditionally, we design a Type-aware Input Module to adaptively recognize\nparticles of different types and perform feature fusion afterward, such that\nfluid-solid coupling issues can be better dealt with. Furthermore, we propose a\nnew dataset, Tank3D, to further explore the network's ability to handle more\ncomplicated scenes. The experiments demonstrate that our approach not only\nattains a quantitative enhancement in various metrics, surpassing the\nstate-of-the-art methods but also signifies a qualitative leap in neural\nnetwork-based simulation by faithfully adhering to the physical laws. Code and\nvideo demonstrations are available at\nhttps://github.com/chenyu-xjtu/DualFluidNet.\n","authors":["Yu Chen","Shuai Zheng","Menglong Jin","Yan Chang","Nianyi Wang"],"pdf_url":"https://arxiv.org/pdf/2312.16867v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2404.12055v1","updated":"2024-04-18T10:10:56Z","published":"2024-04-18T10:10:56Z","title":"Improving the perception of visual fiducial markers in the field using\n Adaptive Active Exposure Control","summary":" Accurate localization is fundamental for autonomous underwater vehicles\n(AUVs) to carry out precise tasks, such as manipulation and construction.\nVision-based solutions using fiducial markers are promising, but extremely\nchallenging underwater because of harsh lighting conditions. This\npaper introduces a gradient-based active camera exposure control method to\ntackle sharp lighting variations during image acquisition, which can establish a\nbetter foundation for subsequent image enhancement procedures. Considering a\ntypical scenario for underwater operations where visual tags are used, we\nconducted several experiments comparing our method with other state-of-the-art\nexposure control methods, including Active Exposure Control (AEC) and\nGradient-based Exposure Control (GEC). Results show a significant improvement\nin the accuracy of robot localization. 
This method is an important component\nthat can be used in visual-based state estimation pipeline to improve the\noverall localization accuracy.\n","authors":["Ziang Ren","Samuel Lensgraf","Alberto Quattrini Li"],"pdf_url":"https://arxiv.org/pdf/2404.12055v1.pdf","comment":"Paper accepted by ISER 2023"},{"id":"http://arxiv.org/abs/2404.09624v2","updated":"2024-04-18T10:10:00Z","published":"2024-04-15T09:56:20Z","title":"AesExpert: Towards Multi-modality Foundation Model for Image Aesthetics\n Perception","summary":" The highly abstract nature of image aesthetics perception (IAP) poses\nsignificant challenge for current multimodal large language models (MLLMs). The\nlack of human-annotated multi-modality aesthetic data further exacerbates this\ndilemma, resulting in MLLMs falling short of aesthetics perception\ncapabilities. To address the above challenge, we first introduce a\ncomprehensively annotated Aesthetic Multi-Modality Instruction Tuning (AesMMIT)\ndataset, which serves as the footstone for building multi-modality aesthetics\nfoundation models. Specifically, to align MLLMs with human aesthetics\nperception, we construct a corpus-rich aesthetic critique database with 21,904\ndiverse-sourced images and 88K human natural language feedbacks, which are\ncollected via progressive questions, ranging from coarse-grained aesthetic\ngrades to fine-grained aesthetic descriptions. To ensure that MLLMs can handle\ndiverse queries, we further prompt GPT to refine the aesthetic critiques and\nassemble the large-scale aesthetic instruction tuning dataset, i.e. AesMMIT,\nwhich consists of 409K multi-typed instructions to activate stronger aesthetic\ncapabilities. Based on the AesMMIT database, we fine-tune the open-sourced\ngeneral foundation models, achieving multi-modality Aesthetic Expert models,\ndubbed AesExpert. Extensive experiments demonstrate that the proposed AesExpert\nmodels deliver significantly better aesthetic perception performances than the\nstate-of-the-art MLLMs, including the most advanced GPT-4V and\nGemini-Pro-Vision. Source data will be available at\nhttps://github.com/yipoh/AesExpert.\n","authors":["Yipo Huang","Xiangfei Sheng","Zhichao Yang","Quan Yuan","Zhichao Duan","Pengfei Chen","Leida Li","Weisi Lin","Guangming Shi"],"pdf_url":"https://arxiv.org/pdf/2404.09624v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12037v1","updated":"2024-04-18T09:44:56Z","published":"2024-04-18T09:44:56Z","title":"Data-free Knowledge Distillation for Fine-grained Visual Categorization","summary":" Data-free knowledge distillation (DFKD) is a promising approach for\naddressing issues related to model compression, security privacy, and\ntransmission restrictions. Although the existing methods exploiting DFKD have\nachieved inspiring achievements in coarse-grained classification, in practical\napplications involving fine-grained classification tasks that require more\ndetailed distinctions between similar categories, sub-optimal results are\nobtained. To address this issue, we propose an approach called DFKD-FGVC that\nextends DFKD to fine-grained visual categorization~(FGVC) tasks. Our approach\nutilizes an adversarial distillation framework with attention generator, mixed\nhigh-order attention distillation, and semantic feature contrast learning.\nSpecifically, we introduce a spatial-wise attention mechanism to the generator\nto synthesize fine-grained images with more details of discriminative parts. 
We\nalso utilize the mixed high-order attention mechanism to capture complex\ninteractions among parts and the subtle differences among discriminative\nfeatures of the fine-grained categories, paying attention to both local\nfeatures and semantic context relationships. Moreover, we leverage the teacher\nand student models of the distillation framework to contrast high-level\nsemantic feature maps in the hyperspace, comparing variances of different\ncategories. We evaluate our approach on three widely-used FGVC benchmarks\n(Aircraft, Cars196, and CUB200) and demonstrate its superior performance.\n","authors":["Renrong Shao","Wei Zhang","Jianhua Yin","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08277v2","updated":"2024-04-18T09:43:26Z","published":"2024-04-12T07:04:56Z","title":"FaceFilterSense: A Filter-Resistant Face Recognition and Facial\n Attribute Analysis Framework","summary":" With the advent of social media, fun selfie filters have come into tremendous\nmainstream use affecting the functioning of facial biometric systems as well as\nimage recognition systems. These filters vary from beautification filters and\nAugmented Reality (AR)-based filters to filters that modify facial landmarks.\nHence, there is a need to assess the impact of such filters on the performance\nof existing face recognition systems. The limitation associated with existing\nsolutions is that these solutions focus more on the beautification filters.\nHowever, the current AR-based filters and filters which distort facial key\npoints are in vogue recently and make the faces highly unrecognizable even to\nthe naked eye. Also, the filters considered are mostly obsolete with limited\nvariations. To mitigate these limitations, we aim to perform a holistic impact\nanalysis of the latest filters and propose a user recognition model with the\nfiltered images. We have utilized a benchmark dataset for baseline images, and\napplied the latest filters over them to generate a beautified/filtered dataset.\nNext, we have introduced a model FaceFilterNet for beautified user recognition.\nIn this framework, we also utilize our model to comment on various attributes\nof the person including age, gender, and ethnicity. In addition, we have also\npresented a filter-wise impact analysis on face recognition, age estimation,\ngender, and ethnicity prediction. The proposed method affirms the efficacy of\nour dataset with an accuracy of 87.25% and an optimal accuracy for facial\nattribute analysis.\n","authors":["Shubham Tiwari","Yash Sethia","Ritesh Kumar","Ashwani Tanwar","Rudresh Dwivedi"],"pdf_url":"https://arxiv.org/pdf/2404.08277v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12031v1","updated":"2024-04-18T09:31:03Z","published":"2024-04-18T09:31:03Z","title":"MLS-Track: Multilevel Semantic Interaction in RMOT","summary":" The new trend in the multi-object tracking task is to track objects of interest\nusing natural language. However, the scarcity of paired prompt-instance data\nhinders its progress. To address this challenge, we propose a high-quality yet\nlow-cost data generation method based on Unreal Engine 5 and construct a\nbrand-new benchmark dataset, named Refer-UE-City, which primarily includes\nscenes from intersection surveillance videos, detailing the appearance and\nactions of people and vehicles. Specifically, it provides 14 videos with a\ntotal of 714 expressions, and is comparable in scale to the Refer-KITTI\ndataset. 
Additionally, we propose a multi-level semantic-guided multi-object\nframework called MLS-Track, where the interaction between the model and text is\nenhanced layer by layer through the introduction of Semantic Guidance Module\n(SGM) and Semantic Correlation Branch (SCB). Extensive experiments on\nRefer-UE-City and Refer-KITTI datasets demonstrate the effectiveness of our\nproposed framework, which achieves state-of-the-art performance. Code and\ndatasets will be available.\n","authors":["Zeliang Ma","Song Yang","Zhe Cui","Zhicheng Zhao","Fei Su","Delong Liu","Jingyu Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12031v1.pdf","comment":"17 pages 8 figures"},{"id":"http://arxiv.org/abs/2404.12024v1","updated":"2024-04-18T09:21:16Z","published":"2024-04-18T09:21:16Z","title":"Meta-Auxiliary Learning for Micro-Expression Recognition","summary":" Micro-expressions (MEs) are involuntary movements revealing people's hidden\nfeelings, which have attracted considerable interest for their objectivity in emotion\ndetection. However, despite its wide applications in various scenarios,\nmicro-expression recognition (MER) remains a challenging problem in real life\ndue to three reasons, including (i) data-level: lack of data and imbalanced\nclasses, (ii) feature-level: subtle, rapid changing, and complex features of\nMEs, and (iii) decision-making-level: impact of individual differences. To\naddress these issues, we propose a dual-branch meta-auxiliary learning method,\ncalled LightmanNet, for fast and robust micro-expression recognition.\nSpecifically, LightmanNet learns general MER knowledge from limited data\nthrough a dual-branch bi-level optimization process: (i) In the first level, it\nobtains task-specific MER knowledge by learning in two branches, where the\nfirst branch is for learning MER features via primary MER tasks, while the\nother branch is for guiding the model to obtain discriminative features via\nauxiliary tasks, i.e., image alignment between micro-expressions and\nmacro-expressions, given their resemblance in both spatial and temporal\nbehavioral patterns. The two branches of learning jointly constrain the model\nto learn meaningful task-specific MER knowledge while avoiding learning\nnoise or superficial connections between MEs and emotions that may damage its\ngeneralization ability. (ii) In the second level, LightmanNet further refines\nthe learned task-specific knowledge, improving model generalization and\nefficiency. Extensive experiments on various benchmark datasets demonstrate the\nsuperior robustness and efficiency of LightmanNet.\n","authors":["Jingyao Wang","Yunhan Tian","Yuxuan Yang","Xiaoxin Chen","Changwen Zheng","Wenwen Qiang"],"pdf_url":"https://arxiv.org/pdf/2404.12024v1.pdf","comment":"10 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.12020v1","updated":"2024-04-18T09:16:02Z","published":"2024-04-18T09:16:02Z","title":"Look, Listen, and Answer: Overcoming Biases for Audio-Visual Question\n Answering","summary":" Audio-Visual Question Answering (AVQA) is a complex multi-modal reasoning\ntask, demanding intelligent systems to accurately respond to natural language\nqueries based on audio-video input pairs. Nevertheless, prevalent AVQA\napproaches are prone to overlearning dataset biases, resulting in poor\nrobustness. Furthermore, current datasets may not provide a precise diagnostic\nfor these methods. 
To tackle these challenges, firstly, we propose a novel\ndataset, \\textit{MUSIC-AVQA-R}, crafted in two steps: rephrasing questions\nwithin the test split of a public dataset (\\textit{MUSIC-AVQA}) and\nsubsequently introducing distribution shifts to split questions. The former\nleads to a large, diverse test space, while the latter results in a\ncomprehensive robustness evaluation on rare, frequent, and overall questions.\nSecondly, we propose a robust architecture that utilizes a multifaceted cycle\ncollaborative debiasing strategy to overcome bias learning. Experimental\nresults show that this architecture achieves state-of-the-art performance on\nboth datasets, especially obtaining a significant improvement of 9.68\\% on the\nproposed dataset. Extensive ablation experiments are conducted on these two\ndatasets to validate the effectiveness of the debiasing strategy. Additionally,\nwe highlight the limited robustness of existing multi-modal QA methods through\nthe evaluation on our dataset.\n","authors":["Jie Ma","Min Hu","Pinghui Wang","Wangchun Sun","Lingyun Song","Hongbin Pei","Jun Liu","Youtian Du"],"pdf_url":"https://arxiv.org/pdf/2404.12020v1.pdf","comment":"16 pages, 9 figures,5 Tables"},{"id":"http://arxiv.org/abs/2404.12015v1","updated":"2024-04-18T09:06:05Z","published":"2024-04-18T09:06:05Z","title":"What does CLIP know about peeling a banana?","summary":" Humans show an innate capability to identify tools to support specific\nactions. The association between objects parts and the actions they facilitate\nis usually named affordance. Being able to segment objects parts depending on\nthe tasks they afford is crucial to enable intelligent robots to use objects of\ndaily living. Traditional supervised learning methods for affordance\nsegmentation require costly pixel-level annotations, while weakly supervised\napproaches, though less demanding, still rely on object-interaction examples\nand support a closed set of actions. These limitations hinder scalability, may\nintroduce biases, and usually restrict models to a limited set of predefined\nactions. This paper proposes AffordanceCLIP, to overcome these limitations by\nleveraging the implicit affordance knowledge embedded within large pre-trained\nVision-Language models like CLIP. We experimentally demonstrate that CLIP,\nalthough not explicitly trained for affordances detection, retains valuable\ninformation for the task. Our AffordanceCLIP achieves competitive zero-shot\nperformance compared to methods with specialized training, while offering\nseveral advantages: i) it works with any action prompt, not just a predefined\nset; ii) it requires training only a small number of additional parameters\ncompared to existing solutions and iii) eliminates the need for direct\nsupervision on action-object pairs, opening new perspectives for\nfunctionality-based reasoning of models.\n","authors":["Claudia Cuttano","Gabriele Rosi","Gabriele Trivigno","Giuseppe Averta"],"pdf_url":"https://arxiv.org/pdf/2404.12015v1.pdf","comment":"Accepted to MAR Workshop at CVPR2024"},{"id":"http://arxiv.org/abs/2311.09104v2","updated":"2024-04-18T09:03:04Z","published":"2023-11-15T16:51:18Z","title":"Cross-view and Cross-pose Completion for 3D Human Understanding","summary":" Human perception and understanding is a major domain of computer vision\nwhich, like many other vision subdomains recently, stands to gain from the use\nof large models pre-trained on large datasets. 
We hypothesize that the most\ncommon pre-training strategy of relying on general purpose, object-centric\nimage datasets such as ImageNet, is limited by an important domain shift. On\nthe other hand, collecting domain-specific ground truth such as 2D or 3D labels\ndoes not scale well. Therefore, we propose a pre-training approach based on\nself-supervised learning that works on human-centric data using only images.\nOur method uses pairs of images of humans: the first is partially masked and\nthe model is trained to reconstruct the masked parts given the visible ones and\na second image. It relies on both stereoscopic (cross-view) pairs, and temporal\n(cross-pose) pairs taken from videos, in order to learn priors about 3D as well\nas human motion. We pre-train a model for body-centric tasks and one for\nhand-centric tasks. With a generic transformer architecture, these models\noutperform existing self-supervised pre-training methods on a wide set of\nhuman-centric downstream tasks, and obtain state-of-the-art performance for\ninstance when fine-tuning for model-based and model-free human mesh recovery.\n","authors":["Matthieu Armando","Salma Galaaoui","Fabien Baradel","Thomas Lucas","Vincent Leroy","Romain Brégier","Philippe Weinzaepfel","Grégory Rogez"],"pdf_url":"https://arxiv.org/pdf/2311.09104v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2303.12307v4","updated":"2024-04-18T08:54:01Z","published":"2023-03-22T04:49:23Z","title":"Predicting and Enhancing the Fairness of DNNs with the Curvature of\n Perceptual Manifolds","summary":" To address the challenges of long-tailed classification, researchers have\nproposed several approaches to reduce model bias, most of which assume that\nclasses with few samples are weak classes. However, recent studies have shown\nthat tail classes are not always hard to learn, and model bias has been\nobserved on sample-balanced datasets, suggesting the existence of other factors\nthat affect model bias. In this work, we first establish a geometric\nperspective for analyzing model fairness and then systematically propose a\nseries of geometric measurements for perceptual manifolds in deep neural\nnetworks. Subsequently, we comprehensively explore the effect of the geometric\ncharacteristics of perceptual manifolds on classification difficulty and how\nlearning shapes the geometric characteristics of perceptual manifolds. An\nunanticipated finding is that the correlation between the class accuracy and\nthe separation degree of perceptual manifolds gradually decreases during\ntraining, while the negative correlation with the curvature gradually\nincreases, implying that curvature imbalance leads to model bias.Building upon\nthese observations, we propose curvature regularization to facilitate the model\nto learn curvature-balanced and flatter perceptual manifolds. Evaluations on\nmultiple long-tailed and non-long-tailed datasets show the excellent\nperformance and exciting generality of our approach, especially in achieving\nsignificant performance improvements based on current state-of-the-art\ntechniques. 
Our work opens up a geometric analysis perspective on model bias\nand reminds researchers to pay attention to model bias on non-long-tailed and\neven sample-balanced datasets.\n","authors":["Yanbiao Ma","Licheng Jiao","Fang Liu","Maoji Wen","Lingling Li","Wenping Ma","Shuyuan Yang","Xu Liu","Puhua Chen"],"pdf_url":"https://arxiv.org/pdf/2303.12307v4.pdf","comment":"17pages, Accepted by CVPR 2023, Submitted to TPAMI"},{"id":"http://arxiv.org/abs/2311.09590v2","updated":"2024-04-18T08:49:03Z","published":"2023-11-16T06:02:03Z","title":"MARformer: An Efficient Metal Artifact Reduction Transformer for Dental\n CBCT Images","summary":" Cone Beam Computed Tomography (CBCT) plays a key role in dental diagnosis and\nsurgery. However, the metal teeth implants could bring annoying metal artifacts\nduring the CBCT imaging process, interfering diagnosis and downstream\nprocessing such as tooth segmentation. In this paper, we develop an efficient\nTransformer to perform metal artifacts reduction (MAR) from dental CBCT images.\nThe proposed MAR Transformer (MARformer) reduces computation complexity in the\nmultihead self-attention by a new Dimension-Reduced Self-Attention (DRSA)\nmodule, based on that the CBCT images have globally similar structure. A\nPatch-wise Perceptive Feed Forward Network (P2FFN) is also proposed to perceive\nlocal image information for fine-grained restoration. Experimental results on\nCBCT images with synthetic and real-world metal artifacts show that our\nMARformer is efficient and outperforms previous MAR methods and two restoration\nTransformers.\n","authors":["Yuxuan Shi","Jun Xu","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2311.09590v2.pdf","comment":"under consideration of Computer Vision and Image Understanding\n journal"},{"id":"http://arxiv.org/abs/2404.11998v1","updated":"2024-04-18T08:46:12Z","published":"2024-04-18T08:46:12Z","title":"Curriculum Point Prompting for Weakly-Supervised Referring Image\n Segmentation","summary":" Referring image segmentation (RIS) aims to precisely segment referents in\nimages through corresponding natural language expressions, yet relying on\ncost-intensive mask annotations. Weakly supervised RIS thus learns from\nimage-text pairs to pixel-level semantics, which is challenging for segmenting\nfine-grained masks. A natural approach to enhancing segmentation precision is\nto empower weakly supervised RIS with the image segmentation foundation model\nSAM. Nevertheless, we observe that simply integrating SAM yields limited\nbenefits and can even lead to performance regression due to the inevitable\nnoise issues and challenges in excessive focus on object parts. In this paper,\nwe present an innovative framework, Point PrompTing (PPT), incorporated with\nthe proposed multi-source curriculum learning strategy to address these\nchallenges. Specifically, the core of PPT is a point generator that not only\nharnesses CLIP's text-image alignment capability and SAM's powerful mask\ngeneration ability but also generates negative point prompts to address the\nnoisy and excessive focus issues inherently and effectively. In addition, we\nintroduce a curriculum learning strategy with object-centric images to help PPT\ngradually learn from simpler yet precise semantic alignment to more complex\nRIS. 
Experiments demonstrate that our PPT significantly and consistently\noutperforms prior weakly supervised techniques on mIoU by 11.34%, 14.14%, and\n6.97% across RefCOCO, RefCOCO+, and G-Ref, respectively.\n","authors":["Qiyuan Dai","Sibei Yang"],"pdf_url":"https://arxiv.org/pdf/2404.11998v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.15182v2","updated":"2024-04-18T08:40:58Z","published":"2024-03-22T13:11:26Z","title":"PDE-CNNs: Axiomatic Derivations and Applications","summary":" PDE-based Group Convolutional Neural Networks (PDE-G-CNNs) utilize solvers of\ngeometrically meaningful evolution PDEs as substitutes for the conventional\ncomponents in G-CNNs. PDE-G-CNNs offer several key benefits all at once: fewer\nparameters, inherent equivariance, better performance, data efficiency, and\ngeometric interpretability.\n In this article we focus on Euclidean equivariant PDE-G-CNNs where the\nfeature maps are two dimensional throughout. We call this variant of the\nframework a PDE-CNN.\n From a machine learning perspective, we list several practically desirable\naxioms and derive from these which PDEs should be used in a PDE-CNN. Here our\napproach to geometric learning via PDEs is inspired by the axioms of classical\nlinear and morphological scale-space theory, which we generalize by introducing\nsemifield-valued signals.\n Furthermore, we experimentally confirm for small networks that PDE-CNNs offer\nfewer parameters, increased performance, and better data efficiency when\ncompared to CNNs. We also investigate what effect the use of different\nsemifields has on the performance of the models.\n","authors":["Gijs Bellaard","Sei Sakata","Bart M. N. Smets","Remco Duits"],"pdf_url":"https://arxiv.org/pdf/2403.15182v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04265v5","updated":"2024-04-18T08:33:37Z","published":"2023-12-07T12:43:00Z","title":"Stronger, Fewer, & Superior: Harnessing Vision Foundation Models for\n Domain Generalized Semantic Segmentation","summary":" In this paper, we first assess and harness various Vision Foundation Models\n(VFMs) in the context of Domain Generalized Semantic Segmentation (DGSS).\nDriven by the motivation that Leveraging Stronger pre-trained models and Fewer\ntrainable parameters for Superior generalizability, we introduce a robust\nfine-tuning approach, namely Rein, to parameter-efficiently harness VFMs for\nDGSS. Built upon a set of trainable tokens, each linked to distinct instances,\nRein precisely refines and forwards the feature maps from each layer to the\nnext layer within the backbone. This process produces diverse refinements for\ndifferent categories within a single image. With fewer trainable parameters,\nRein efficiently fine-tunes VFMs for DGSS tasks, surprisingly surpassing full\nparameter fine-tuning. 
Extensive experiments across various settings\ndemonstrate that Rein significantly outperforms state-of-the-art methods.\nRemarkably, with just an extra 1% of trainable parameters within the frozen\nbackbone, Rein achieves a mIoU of 78.4% on the Cityscapes, without accessing\nany real urban-scene datasets.Code is available at\nhttps://github.com/w1oves/Rein.git.\n","authors":["Zhixiang Wei","Lin Chen","Yi Jin","Xiaoxiao Ma","Tianle Liu","Pengyang Ling","Ben Wang","Huaian Chen","Jinjin Zheng"],"pdf_url":"https://arxiv.org/pdf/2312.04265v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05317v4","updated":"2024-04-18T08:29:48Z","published":"2024-04-08T09:08:43Z","title":"WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A\n Conceptual Architecture","summary":" This work proposes a WebXR-based cross-platform conceptual architecture,\nleveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate\nthe development of an open, accessible, and interoperable metaverse. By\nintroducing the concept of spatial web app, this research contributes to the\ndiscourse on the metaverse, offering an architecture that democratizes access\nto virtual environments and extended reality through the web, and aligns with\nTim Berners-Lee's original vision of the World Wide Web as an open platform in\nthe digital realm.\n","authors":["Giuseppe Macario"],"pdf_url":"https://arxiv.org/pdf/2404.05317v4.pdf","comment":"minor fixes/rephrasing"},{"id":"http://arxiv.org/abs/2404.11987v1","updated":"2024-04-18T08:29:29Z","published":"2024-04-18T08:29:29Z","title":"MultiPhys: Multi-Person Physics-aware 3D Motion Estimation","summary":" We introduce MultiPhys, a method designed for recovering multi-person motion\nfrom monocular videos. Our focus lies in capturing coherent spatial placement\nbetween pairs of individuals across varying degrees of engagement. MultiPhys,\nbeing physically aware, exhibits robustness to jittering and occlusions, and\neffectively eliminates penetration issues between the two individuals. We\ndevise a pipeline in which the motion estimated by a kinematic-based method is\nfed into a physics simulator in an autoregressive manner. We introduce distinct\ncomponents that enable our model to harness the simulator's properties without\ncompromising the accuracy of the kinematic estimates. This results in final\nmotion estimates that are both kinematically coherent and physically compliant.\nExtensive evaluations on three challenging datasets characterized by\nsubstantial inter-person interaction show that our method significantly reduces\nerrors associated with penetration and foot skating, while performing\ncompetitively with the state-of-the-art on motion accuracy and smoothness.\nResults and code can be found on our project page\n(http://www.iri.upc.edu/people/nugrinovic/multiphys/).\n","authors":["Nicolas Ugrinovic","Boxiao Pan","Georgios Pavlakos","Despoina Paschalidou","Bokui Shen","Jordi Sanchez-Riera","Francesc Moreno-Noguer","Leonidas Guibas"],"pdf_url":"https://arxiv.org/pdf/2404.11987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11981v1","updated":"2024-04-18T08:23:24Z","published":"2024-04-18T08:23:24Z","title":"Tendency-driven Mutual Exclusivity for Weakly Supervised Incremental\n Semantic Segmentation","summary":" Weakly Incremental Learning for Semantic Segmentation (WILSS) leverages a\npre-trained segmentation model to segment new classes using cost-effective and\nreadily available image-level labels. 
A prevailing way to solve WILSS is the\ngeneration of seed areas for each new class, serving as a form of pixel-level\nsupervision. However, a scenario usually arises where a pixel is concurrently\npredicted as an old class by the pre-trained segmentation model and a new class\nby the seed areas. Such a scenario becomes particularly problematic in WILSS,\nas the lack of pixel-level annotations on new classes makes it intractable to\nascertain whether the pixel pertains to the new class or not. To surmount this\nissue, we propose an innovative, tendency-driven relationship of mutual\nexclusivity, meticulously tailored to govern the behavior of the seed areas and\nthe predictions generated by the pre-trained segmentation model. This\nrelationship stipulates that predictions for the new and old classes must not\nconflict whilst prioritizing the preservation of predictions for the old\nclasses, which not only addresses the conflicting prediction issue but also\neffectively mitigates the inherent challenge of incremental learning -\ncatastrophic forgetting. Furthermore, under the auspices of this\ntendency-driven mutual exclusivity relationship, we generate pseudo masks for\nthe new classes, allowing for concurrent execution with model parameter\nupdating via the resolution of a bi-level optimization problem. Extensive\nexperiments substantiate the effectiveness of our framework, resulting in the\nestablishment of new benchmarks and paving the way for further research in this\nfield.\n","authors":["Chongjie Si","Xuehui Wang","Xiaokang Yang","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2404.11981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11525v2","updated":"2024-04-18T08:23:05Z","published":"2024-04-17T16:16:12Z","title":"JointViT: Modeling Oxygen Saturation Levels with Joint Supervision on\n Long-Tailed OCTA","summary":" The oxygen saturation level in the blood (SaO2) is crucial for health,\nparticularly in relation to sleep-related breathing disorders. However,\ncontinuous monitoring of SaO2 is time-consuming and highly variable depending\non patients' conditions. Recently, optical coherence tomography angiography\n(OCTA) has shown promising development in rapidly and effectively screening\neye-related lesions, offering the potential for diagnosing sleep-related\ndisorders. To bridge this gap, our paper presents three key contributions.\nFirstly, we propose JointViT, a novel model based on the Vision Transformer\narchitecture, incorporating a joint loss function for supervision. Secondly, we\nintroduce a balancing augmentation technique during data preprocessing to\nimprove the model's performance, particularly on the long-tail distribution\nwithin the OCTA dataset. Lastly, through comprehensive experiments on the OCTA\ndataset, our proposed method significantly outperforms other state-of-the-art\nmethods, achieving improvements of up to 12.28% in overall accuracy. This\nadvancement lays the groundwork for the future utilization of OCTA in\ndiagnosing sleep-related disorders. 
See project website\nhttps://steve-zeyu-zhang.github.io/JointViT\n","authors":["Zeyu Zhang","Xuyin Qi","Mingxi Chen","Guangxi Li","Ryan Pham","Ayub Qassim","Ella Berry","Zhibin Liao","Owen Siggs","Robert Mclaughlin","Jamie Craig","Minh-Son To"],"pdf_url":"https://arxiv.org/pdf/2404.11525v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11979v1","updated":"2024-04-18T08:16:56Z","published":"2024-04-18T08:16:56Z","title":"MTGA: Multi-view Temporal Granularity aligned Aggregation for\n Event-based Lip-reading","summary":" Lip-reading is to utilize the visual information of the speaker's lip\nmovements to recognize words and sentences. Existing event-based lip-reading\nsolutions integrate different frame rate branches to learn spatio-temporal\nfeatures of varying granularities. However, aggregating events into event\nframes inevitably leads to the loss of fine-grained temporal information within\nframes. To remedy this drawback, we propose a novel framework termed Multi-view\nTemporal Granularity aligned Aggregation (MTGA). Specifically, we first present\na novel event representation method, namely time-segmented voxel graph list,\nwhere the most significant local voxels are temporally connected into a graph\nlist. Then we design a spatio-temporal fusion module based on temporal\ngranularity alignment, where the global spatial features extracted from event\nframes, together with the local relative spatial and temporal features\ncontained in voxel graph list are effectively aligned and integrated. Finally,\nwe design a temporal aggregation module that incorporates positional encoding,\nwhich enables the capture of local absolute spatial and global temporal\ninformation. Experiments demonstrate that our method outperforms both the\nevent-based and video-based lip-reading counterparts. Our code will be publicly\navailable.\n","authors":["Wenhao Zhang","Jun Wang","Yong Luo","Lei Yu","Wei Yu","Zheng He"],"pdf_url":"https://arxiv.org/pdf/2404.11979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06244v2","updated":"2024-04-18T08:08:45Z","published":"2024-02-09T08:33:48Z","title":"Quantifying and Enhancing Multi-modal Robustness with Modality\n Preference","summary":" Multi-modal models have shown a promising capability to effectively integrate\ninformation from various sources, yet meanwhile, they are found vulnerable to\npervasive perturbations, such as uni-modal attacks and missing conditions. To\ncounter these perturbations, robust multi-modal representations are highly\nexpected, which are positioned well away from the discriminative multi-modal\ndecision boundary. In this paper, different from conventional empirical\nstudies, we focus on a commonly used joint multi-modal framework and\ntheoretically discover that larger uni-modal representation margins and more\nreliable integration for modalities are essential components for achieving\nhigher robustness. This discovery can further explain the limitation of\nmulti-modal robustness and the phenomenon that multi-modal models are often\nvulnerable to attacks on the specific modality. 
Moreover, our analysis reveals\nhow the widespread issue, that the model has different preferences for\nmodalities, limits the multi-modal robustness by influencing the essential\ncomponents and could lead to attacks on the specific modality highly effective.\nInspired by our theoretical finding, we introduce a training procedure called\nCertifiable Robust Multi-modal Training (CRMT), which can alleviate this\ninfluence from modality preference and explicitly regulate essential components\nto significantly improve robustness in a certifiable manner. Our method\ndemonstrates substantial improvements in performance and robustness compared\nwith existing methods. Furthermore, our training procedure can be easily\nextended to enhance other robust training strategies, highlighting its\ncredibility and flexibility.\n","authors":["Zequn Yang","Yake Wei","Ce Liang","Di Hu"],"pdf_url":"https://arxiv.org/pdf/2402.06244v2.pdf","comment":"Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2404.11974v1","updated":"2024-04-18T08:05:23Z","published":"2024-04-18T08:05:23Z","title":"Device (In)Dependence of Deep Learning-based Image Age Approximation","summary":" The goal of temporal image forensic is to approximate the age of a digital\nimage relative to images from the same device. Usually, this is based on traces\nleft during the image acquisition pipeline. For example, several methods exist\nthat exploit the presence of in-field sensor defects for this purpose. In\naddition to these 'classical' methods, there is also an approach in which a\nConvolutional Neural Network (CNN) is trained to approximate the image age. One\nadvantage of a CNN is that it independently learns the age features used. This\nwould make it possible to exploit other (different) age traces in addition to\nthe known ones (i.e., in-field sensor defects). In a previous work, we have\nshown that the presence of strong in-field sensor defects is irrelevant for a\nCNN to predict the age class. Based on this observation, the question arises\nhow device (in)dependent the learned features are. In this work, we empirically\nasses this by training a network on images from a single device and then apply\nthe trained model to images from different devices. This evaluation is\nperformed on 14 different devices, including 10 devices from the publicly\navailable 'Northumbria Temporal Image Forensics' database. These 10 different\ndevices are based on five different device pairs (i.e., with the identical\ncamera model).\n","authors":["Robert Jöchl","Andreas Uhl"],"pdf_url":"https://arxiv.org/pdf/2404.11974v1.pdf","comment":"This work was accepted and presented in: 2022 ICPR-Workshop on\n Artificial Intelligence for Multimedia Forensics and Disinformation\n Detection. Montreal, Quebec, Canada. However, due to a technical issue on the\n publishing companies' side, the work does not appear in the workshop\n proceedings"},{"id":"http://arxiv.org/abs/2305.00220v2","updated":"2024-04-18T08:01:26Z","published":"2023-04-29T10:10:25Z","title":"Relaxed forced choice improves performance of visual quality assessment\n methods","summary":" In image quality assessment, a collective visual quality score for an image\nor video is obtained from the individual ratings of many subjects. One commonly\nused format for these experiments is the two-alternative forced choice method.\nTwo stimuli with the same content but differing visual quality are presented\nsequentially or side-by-side. 
Subjects are asked to select the one of better\nquality, and when uncertain, they are required to guess. The relaxed\nalternative forced choice format aims to reduce the cognitive load and the\nnoise in the responses due to the guessing by providing a third response\noption, namely, ``not sure''. This work presents a large and comprehensive\ncrowdsourcing experiment to compare these two response formats: the one with\nthe ``not sure'' option and the one without it. To provide unambiguous ground\ntruth for quality evaluation, subjects were shown pairs of images with\ndiffering numbers of dots and asked each time to choose the one with more dots.\nOur crowdsourcing study involved 254 participants and was conducted using a\nwithin-subject design. Each participant was asked to respond to 40 pair\ncomparisons with and without the ``not sure'' response option and completed a\nquestionnaire to evaluate their cognitive load for each testing condition. The\nexperimental results show that the inclusion of the ``not sure'' response\noption in the forced choice method reduced mental load and led to models with\nbetter data fit and correspondence to ground truth. We also tested for the\nequivalence of the models and found that they were different. The dataset is\navailable at http://database.mmsp-kn.de/cogvqa-database.html.\n","authors":["Mohsen Jenadeleh","Johannes Zagermann","Harald Reiterer","Ulf-Dietrich Reips","Raouf Hamzaoui","Dietmar Saupe"],"pdf_url":"https://arxiv.org/pdf/2305.00220v2.pdf","comment":"6 pages, 3 figures, accepted at the 2023 15th International\n Conference on Quality of Multimedia Experience (QoMEX). Database is publicly\n accessible at http://database.mmsp-kn.de/cogvqa-database.html"},{"id":"http://arxiv.org/abs/2404.11962v1","updated":"2024-04-18T07:48:00Z","published":"2024-04-18T07:48:00Z","title":"©Plug-in Authorization for Human Content Copyright Protection\n in Text-to-Image Model","summary":" This paper addresses the contentious issue of copyright infringement in\nimages generated by text-to-image models, sparking debates among AI developers,\ncontent creators, and legal entities. State-of-the-art models create\nhigh-quality content without crediting original creators, causing concern in\nthe artistic community. To mitigate this, we propose the \\copyright Plug-in\nAuthorization framework, introducing three operations: addition, extraction,\nand combination. Addition involves training a \\copyright plug-in for specific\ncopyright, facilitating proper credit attribution. Extraction allows creators\nto reclaim copyright from infringing models, and combination enables users to\nmerge different \\copyright plug-ins. These operations act as permits,\nincentivizing fair use and providing flexibility in authorization. We present\ninnovative approaches,\"Reverse LoRA\" for extraction and \"EasyMerge\" for\nseamless combination. 
Experiments in artist-style replication and cartoon IP\nrecreation demonstrate \\copyright plug-ins' effectiveness, offering a valuable\nsolution for human copyright protection in the age of generative AIs.\n","authors":["Chao Zhou","Huishuai Zhang","Jiang Bian","Weiming Zhang","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2404.11962v1.pdf","comment":"20 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.01188v2","updated":"2024-04-18T07:42:19Z","published":"2023-11-02T12:34:23Z","title":"Terrain-Informed Self-Supervised Learning: Enhancing Building Footprint\n Extraction from LiDAR Data with Limited Annotations","summary":" Estimating building footprint maps from geospatial data is of paramount\nimportance in urban planning, development, disaster management, and various\nother applications. Deep learning methodologies have gained prominence in\nbuilding segmentation maps, offering the promise of precise footprint\nextraction without extensive post-processing. However, these methods face\nchallenges in generalization and label efficiency, particularly in remote\nsensing, where obtaining accurate labels can be both expensive and\ntime-consuming. To address these challenges, we propose terrain-aware\nself-supervised learning, tailored to remote sensing, using digital elevation\nmodels from LiDAR data. We propose to learn a model to differentiate between\nbare Earth and superimposed structures enabling the network to implicitly learn\ndomain-relevant features without the need for extensive pixel-level\nannotations. We test the effectiveness of our approach by evaluating building\nsegmentation performance on test datasets with varying label fractions.\nRemarkably, with only 1% of the labels (equivalent to 25 labeled examples), our\nmethod improves over ImageNet pre-training, showing the advantage of leveraging\nunlabeled data for feature extraction in the domain of remote sensing. The\nperformance improvement is more pronounced in few-shot scenarios and gradually\ncloses the gap with ImageNet pre-training as the label fraction increases. We\ntest on a dataset characterized by substantial distribution shifts and labeling\nerrors to demonstrate the generalizability of our approach. When compared to\nother baselines, including ImageNet pretraining and more complex architectures,\nour approach consistently performs better, demonstrating the efficiency and\neffectiveness of self-supervised terrain-aware feature learning.\n","authors":["Anuja Vats","David Völgyes","Martijn Vermeer","Marius Pedersen","Kiran Raja","Daniele S. M. Fantin","Jacob Alexander Hay"],"pdf_url":"https://arxiv.org/pdf/2311.01188v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11459v2","updated":"2024-04-18T07:32:52Z","published":"2024-04-17T15:07:06Z","title":"Octopus v3: Technical Report for On-device Sub-billion Multimodal AI\n Agent","summary":" A multimodal AI agent is characterized by its ability to process and learn\nfrom various types of data, including natural language, visual, and audio\ninputs, to inform its actions. Despite advancements in large language models\nthat incorporate visual data, such as GPT-4V, effectively translating\nimage-based data into actionable outcomes for AI agents continues to be\nchallenging. In this paper, we introduce a multimodal model that incorporates\nthe concept of functional token specifically designed for AI agent\napplications. To ensure compatibility with edge devices, our model is optimized\nto a compact size of less than 1B parameters. 
Like GPT-4, our model can process\nboth English and Chinese. We demonstrate that this model is capable of\noperating efficiently on a wide range of edge devices, including as constrained\nas a Raspberry Pi.\n","authors":["Wei Chen","Zhiyuan Li"],"pdf_url":"https://arxiv.org/pdf/2404.11459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11958v1","updated":"2024-04-18T07:25:59Z","published":"2024-04-18T07:25:59Z","title":"Not All Voxels Are Equal: Hardness-Aware Semantic Scene Completion with\n Self-Distillation","summary":" Semantic scene completion, also known as semantic occupancy prediction, can\nprovide dense geometric and semantic information for autonomous vehicles, which\nattracts the increasing attention of both academia and industry. Unfortunately,\nexisting methods usually formulate this task as a voxel-wise classification\nproblem and treat each voxel equally in 3D space during training. As the hard\nvoxels have not been paid enough attention, the performance in some challenging\nregions is limited. The 3D dense space typically contains a large number of\nempty voxels, which are easy to learn but require amounts of computation due to\nhandling all the voxels uniformly for the existing models. Furthermore, the\nvoxels in the boundary region are more challenging to differentiate than those\nin the interior. In this paper, we propose HASSC approach to train the semantic\nscene completion model with hardness-aware design. The global hardness from the\nnetwork optimization process is defined for dynamical hard voxel selection.\nThen, the local hardness with geometric anisotropy is adopted for voxel-wise\nrefinement. Besides, self-distillation strategy is introduced to make training\nprocess stable and consistent. Extensive experiments show that our HASSC scheme\ncan effectively promote the accuracy of the baseline model without incurring\nthe extra inference cost. Source code is available at:\nhttps://github.com/songw-zju/HASSC.\n","authors":["Song Wang","Jiawei Yu","Wentong Li","Wenyu Liu","Xiaolu Liu","Junbo Chen","Jianke Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.11958v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.11957v1","updated":"2024-04-18T07:22:38Z","published":"2024-04-18T07:22:38Z","title":"The devil is in the object boundary: towards annotation-free instance\n segmentation using Foundation Models","summary":" Foundation models, pre-trained on a large amount of data have demonstrated\nimpressive zero-shot capabilities in various downstream tasks. However, in\nobject detection and instance segmentation, two fundamental computer vision\ntasks heavily reliant on extensive human annotations, foundation models such as\nSAM and DINO struggle to achieve satisfactory performance. In this study, we\nreveal that the devil is in the object boundary, \\textit{i.e.}, these\nfoundation models fail to discern boundaries between individual objects. For\nthe first time, we probe that CLIP, which has never accessed any instance-level\nannotations, can provide a highly beneficial and strong instance-level boundary\nprior in the clustering results of its particular intermediate layer. Following\nthis surprising observation, we propose $\\textbf{Zip}$ which $\\textbf{Z}$ips up\nCL$\\textbf{ip}$ and SAM in a novel classification-first-then-discovery\npipeline, enabling annotation-free, complex-scene-capable, open-vocabulary\nobject detection and instance segmentation. 
Our Zip significantly boosts SAM's\nmask AP on COCO dataset by 12.5% and establishes state-of-the-art performance\nin various settings, including training-free, self-training, and\nlabel-efficient finetuning. Furthermore, annotation-free Zip even achieves\ncomparable performance to the best-performing open-vocabulary object detecters\nusing base annotations. Code is released at\nhttps://github.com/ChengShiest/Zip-Your-CLIP\n","authors":["Cheng Shi","Sibei Yang"],"pdf_url":"https://arxiv.org/pdf/2404.11957v1.pdf","comment":"ICLR2024, Code is released at\n https://github.com/ChengShiest/Zip-Your-CLIP"},{"id":"http://arxiv.org/abs/2404.11949v1","updated":"2024-04-18T07:07:38Z","published":"2024-04-18T07:07:38Z","title":"Sketch-guided Image Inpainting with Partial Discrete Diffusion Process","summary":" In this work, we study the task of sketch-guided image inpainting. Unlike the\nwell-explored natural language-guided image inpainting, which excels in\ncapturing semantic details, the relatively less-studied sketch-guided\ninpainting offers greater user control in specifying the object's shape and\npose to be inpainted. As one of the early solutions to this task, we introduce\na novel partial discrete diffusion process (PDDP). The forward pass of the PDDP\ncorrupts the masked regions of the image and the backward pass reconstructs\nthese masked regions conditioned on hand-drawn sketches using our proposed\nsketch-guided bi-directional transformer. The proposed novel transformer module\naccepts two inputs -- the image containing the masked region to be inpainted\nand the query sketch to model the reverse diffusion process. This strategy\neffectively addresses the domain gap between sketches and natural images,\nthereby, enhancing the quality of inpainting results. In the absence of a\nlarge-scale dataset specific to this task, we synthesize a dataset from the\nMS-COCO to train and extensively evaluate our proposed framework against\nvarious competent approaches in the literature. The qualitative and\nquantitative results and user studies establish that the proposed method\ninpaints realistic objects that fit the context in terms of the visual\nappearance of the provided sketch. To aid further research, we have made our\ncode publicly available at https://github.com/vl2g/Sketch-Inpainting .\n","authors":["Nakul Sharma","Aditay Tripathi","Anirban Chakraborty","Anand Mishra"],"pdf_url":"https://arxiv.org/pdf/2404.11949v1.pdf","comment":"Accepted to NTIRE Workshop @ CVPR 2024"},{"id":"http://arxiv.org/abs/2404.11947v1","updated":"2024-04-18T06:59:40Z","published":"2024-04-18T06:59:40Z","title":"VCC-INFUSE: Towards Accurate and Efficient Selection of Unlabeled\n Examples in Semi-supervised Learning","summary":" Despite the progress of Semi-supervised Learning (SSL), existing methods fail\nto utilize unlabeled data effectively and efficiently. Many pseudo-label-based\nmethods select unlabeled examples based on inaccurate confidence scores from\nthe classifier. Most prior work also uses all available unlabeled data without\npruning, making it difficult to handle large amounts of unlabeled data. To\naddress these issues, we propose two methods: Variational Confidence\nCalibration (VCC) and Influence-Function-based Unlabeled Sample Elimination\n(INFUSE). VCC is an universal plugin for SSL confidence calibration, using a\nvariational autoencoder to select more accurate pseudo labels based on three\ntypes of consistency scores. 
INFUSE is a data pruning method that constructs a\ncore dataset of unlabeled examples under SSL. Our methods are effective in\nmultiple datasets and settings, reducing classification errors rates and saving\ntraining time. Together, VCC-INFUSE reduces the error rate of FlexMatch on the\nCIFAR-100 dataset by 1.08% while saving nearly half of the training time.\n","authors":["Shijie Fang","Qianhan Feng","Tong Lin"],"pdf_url":"https://arxiv.org/pdf/2404.11947v1.pdf","comment":"Accepted paper of IJCAI 2024. Shijie Fang and Qianhan Feng\n contributed equally to this paper"},{"id":"http://arxiv.org/abs/2404.11946v1","updated":"2024-04-18T06:58:02Z","published":"2024-04-18T06:58:02Z","title":"S4TP: Social-Suitable and Safety-Sensitive Trajectory Planning for\n Autonomous Vehicles","summary":" In public roads, autonomous vehicles (AVs) face the challenge of frequent\ninteractions with human-driven vehicles (HDVs), which render uncertain driving\nbehavior due to varying social characteristics among humans. To effectively\nassess the risks prevailing in the vicinity of AVs in social interactive\ntraffic scenarios and achieve safe autonomous driving, this article proposes a\nsocial-suitable and safety-sensitive trajectory planning (S4TP) framework.\nSpecifically, S4TP integrates the Social-Aware Trajectory Prediction (SATP) and\nSocial-Aware Driving Risk Field (SADRF) modules. SATP utilizes Transformers to\neffectively encode the driving scene and incorporates an AV's planned\ntrajectory during the prediction decoding process. SADRF assesses the expected\nsurrounding risk degrees during AVs-HDVs interactions, each with different\nsocial characteristics, visualized as two-dimensional heat maps centered on the\nAV. SADRF models the driving intentions of the surrounding HDVs and predicts\ntrajectories based on the representation of vehicular interactions. S4TP\nemploys an optimization-based approach for motion planning, utilizing the\npredicted HDVs'trajectories as input. With the integration of SADRF, S4TP\nexecutes real-time online optimization of the planned trajectory of AV within\nlowrisk regions, thus improving the safety and the interpretability of the\nplanned trajectory. We have conducted comprehensive tests of the proposed\nmethod using the SMARTS simulator. Experimental results in complex social\nscenarios, such as unprotected left turn intersections, merging, cruising, and\novertaking, validate the superiority of our proposed S4TP in terms of safety\nand rationality. S4TP achieves a pass rate of 100% across all scenarios,\nsurpassing the current state-of-the-art methods Fanta of 98.25% and\nPredictive-Decision of 94.75%.\n","authors":["Xiao Wang","Ke Tang","Xingyuan Dai","Jintao Xu","Quancheng Du","Rui Ai","Yuxiao Wang","Weihao Gu"],"pdf_url":"https://arxiv.org/pdf/2404.11946v1.pdf","comment":"12 pages,4 figures, published to IEEE Transactions on Intelligent\n Vehicles"},{"id":"http://arxiv.org/abs/2401.16158v2","updated":"2024-04-18T06:53:38Z","published":"2024-01-29T13:46:37Z","title":"Mobile-Agent: Autonomous Multi-Modal Mobile Device Agent with Visual\n Perception","summary":" Mobile device agent based on Multimodal Large Language Models (MLLM) is\nbecoming a popular application. In this paper, we introduce Mobile-Agent, an\nautonomous multi-modal mobile device agent. Mobile-Agent first leverages visual\nperception tools to accurately identify and locate both the visual and textual\nelements within the app's front-end interface. 
Based on the perceived vision\ncontext, it then autonomously plans and decomposes the complex operation task,\nand navigates the mobile Apps through operations step by step. Different from\nprevious solutions that rely on XML files of Apps or mobile system metadata,\nMobile-Agent allows for greater adaptability across diverse mobile operating\nenvironments in a vision-centric way, thereby eliminating the necessity for\nsystem-specific customizations. To assess the performance of Mobile-Agent, we\nintroduced Mobile-Eval, a benchmark for evaluating mobile device operations.\nBased on Mobile-Eval, we conducted a comprehensive evaluation of Mobile-Agent.\nThe experimental results indicate that Mobile-Agent achieved remarkable\naccuracy and completion rates. Even with challenging instructions, such as\nmulti-app operations, Mobile-Agent can still complete the requirements. Code\nand model will be open-sourced at https://github.com/X-PLUG/MobileAgent.\n","authors":["Junyang Wang","Haiyang Xu","Jiabo Ye","Ming Yan","Weizhou Shen","Ji Zhang","Fei Huang","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2401.16158v2.pdf","comment":"Accepted by ICLR 2024 Workshop in Large Language Model (LLM) Agents"},{"id":"http://arxiv.org/abs/2404.11936v1","updated":"2024-04-18T06:35:37Z","published":"2024-04-18T06:35:37Z","title":"LD-Pruner: Efficient Pruning of Latent Diffusion Models using\n Task-Agnostic Insights","summary":" Latent Diffusion Models (LDMs) have emerged as powerful generative models,\nknown for delivering remarkable results under constrained computational\nresources. However, deploying LDMs on resource-limited devices remains a\ncomplex issue, presenting challenges such as memory consumption and inference\nspeed. To address this issue, we introduce LD-Pruner, a novel\nperformance-preserving structured pruning method for compressing LDMs.\nTraditional pruning methods for deep neural networks are not tailored to the\nunique characteristics of LDMs, such as the high computational cost of training\nand the absence of a fast, straightforward and task-agnostic method for\nevaluating model performance. Our method tackles these challenges by leveraging\nthe latent space during the pruning process, enabling us to effectively\nquantify the impact of pruning on model performance, independently of the task\nat hand. This targeted pruning of components with minimal impact on the output\nallows for faster convergence during training, as the model has less\ninformation to re-learn, thereby addressing the high computational cost of\ntraining. Consequently, our approach achieves a compressed model that offers\nimproved inference speed and reduced parameter count, while maintaining minimal\nperformance degradation. We demonstrate the effectiveness of our approach on\nthree different tasks: text-to-image (T2I) generation, Unconditional Image\nGeneration (UIG) and Unconditional Audio Generation (UAG). Notably, we reduce\nthe inference time of Stable Diffusion (SD) by 34.9% while simultaneously\nimproving its FID by 5.2% on MS-COCO T2I benchmark. 
This work paves the way for\nmore efficient pruning methods for LDMs, enhancing their applicability.\n","authors":["Thibault Castells","Hyoung-Kyu Song","Bo-Kyeong Kim","Shinkook Choi"],"pdf_url":"https://arxiv.org/pdf/2404.11936v1.pdf","comment":"8 pages, accepted to CVPR24 First Workshop on Efficient and On-Device\n Generation (EDGE)"},{"id":"http://arxiv.org/abs/2404.11929v1","updated":"2024-04-18T06:18:48Z","published":"2024-04-18T06:18:48Z","title":"A Symmetric Regressor for MRI-Based Assessment of Striatal Dopamine\n Transporter Uptake in Parkinson's Disease","summary":" Dopamine transporter (DAT) imaging is commonly used for monitoring\nParkinson's disease (PD), where striatal DAT uptake amount is computed to\nassess PD severity. However, DAT imaging has a high cost and the risk of\nradiance exposure and is not available in general clinics. Recently, MRI patch\nof the nigral region has been proposed as a safer and easier alternative. This\npaper proposes a symmetric regressor for predicting the DAT uptake amount from\nthe nigral MRI patch. Acknowledging the symmetry between the right and left\nnigrae, the proposed regressor incorporates a paired input-output model that\nsimultaneously predicts the DAT uptake amounts for both the right and left\nstriata. Moreover, it employs a symmetric loss that imposes a constraint on the\ndifference between right-to-left predictions, resembling the high correlation\nin DAT uptake amounts in the two lateral sides. Additionally, we propose a\nsymmetric Monte-Carlo (MC) dropout method for providing a fruitful uncertainty\nestimate of the DAT uptake prediction, which utilizes the above symmetry. We\nevaluated the proposed approach on 734 nigral patches, which demonstrated\nsignificantly improved performance of the symmetric regressor compared with the\nstandard regressors while giving better explainability and feature\nrepresentation. The symmetric MC dropout also gave precise uncertainty ranges\nwith a high probability of including the true DAT uptake amounts within the\nrange.\n","authors":["Walid Abdullah Al","Il Dong Yun","Yun Jung Bae"],"pdf_url":"https://arxiv.org/pdf/2404.11929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08513v3","updated":"2024-04-18T06:12:57Z","published":"2023-09-15T16:19:09Z","title":"SCT: A Simple Baseline for Parameter-Efficient Fine-Tuning via Salient\n Channels","summary":" Pre-trained vision transformers have strong representation benefits to\nvarious downstream tasks. Recently, many parameter-efficient fine-tuning (PEFT)\nmethods have been proposed, and their experiments demonstrate that tuning only\n1% of extra parameters could surpass full fine-tuning in low-data resource\nscenarios. However, these methods overlook the task-specific information when\nfine-tuning diverse downstream tasks. In this paper, we propose a simple yet\neffective method called \"Salient Channel Tuning\" (SCT) to leverage the\ntask-specific information by forwarding the model with the task images to\nselect partial channels in a feature map that enables us to tune only 1/8\nchannels leading to significantly lower parameter costs. Experiments outperform\nfull fine-tuning on 18 out of 19 tasks in the VTAB-1K benchmark by adding only\n0.11M parameters of the ViT-B, which is 780x fewer than its full fine-tuning\ncounterpart. 
Furthermore, experiments on domain generalization and few-shot\nlearning surpass other PEFT methods with lower parameter costs, demonstrating\nour proposed tuning technique's strong capability and effectiveness in the\nlow-data regime.\n","authors":["Henry Hengyuan Zhao","Pichao Wang","Yuyang Zhao","Hao Luo","Fan Wang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2309.08513v3.pdf","comment":"This work has been accepted by IJCV2023"},{"id":"http://arxiv.org/abs/2310.05886v2","updated":"2024-04-18T06:11:43Z","published":"2023-10-09T17:28:35Z","title":"Streaming Anchor Loss: Augmenting Supervision with Temporal Significance","summary":" Streaming neural network models for fast frame-wise responses to various\nspeech and sensory signals are widely adopted on resource-constrained\nplatforms. Hence, increasing the learning capacity of such streaming models\n(i.e., by adding more parameters) to improve the predictive power may not be\nviable for real-world tasks. In this work, we propose a new loss, Streaming\nAnchor Loss (SAL), to better utilize the given learning capacity by encouraging\nthe model to learn more from essential frames. More specifically, our SAL and\nits focal variations dynamically modulate the frame-wise cross entropy loss\nbased on the importance of the corresponding frames so that a higher loss\npenalty is assigned for frames within the temporal proximity of semantically\ncritical events. Therefore, our loss ensures that the model training focuses on\npredicting the relatively rare but task-relevant frames. Experimental results\nwith standard lightweight convolutional and recurrent streaming networks on\nthree different speech based detection tasks demonstrate that SAL enables the\nmodel to learn the overall task more effectively with improved accuracy and\nlatency, without any additional data, model parameters, or architectural\nchanges.\n","authors":["Utkarsh Oggy Sarawgi","John Berkowitz","Vineet Garg","Arnav Kundu","Minsik Cho","Sai Srujana Buddi","Saurabh Adya","Ahmed Tewfik"],"pdf_url":"https://arxiv.org/pdf/2310.05886v2.pdf","comment":"Published at IEEE ICASSP 2024, please see\n https://ieeexplore.ieee.org/abstract/document/10447222"},{"id":"http://arxiv.org/abs/2404.11614v2","updated":"2024-04-18T06:06:29Z","published":"2024-04-17T17:59:55Z","title":"Dynamic Typography: Bringing Text to Life via Video Diffusion Prior","summary":" Text animation serves as an expressive medium, transforming static\ncommunication into dynamic experiences by infusing words with motion to evoke\nemotions, emphasize meanings, and construct compelling narratives. Crafting\nanimations that are semantically aware poses significant challenges, demanding\nexpertise in graphic design and animation. We present an automated text\nanimation scheme, termed \"Dynamic Typography\", which combines two challenging\ntasks. It deforms letters to convey semantic meaning and infuses them with\nvibrant movements based on user prompts. Our technique harnesses vector\ngraphics representations and an end-to-end optimization-based framework. This\nframework employs neural displacement fields to convert letters into base\nshapes and applies per-frame motion, encouraging coherence with the intended\ntextual concept. Shape preservation techniques and perceptual loss\nregularization are employed to maintain legibility and structural integrity\nthroughout the animation process. 
We demonstrate the generalizability of our\napproach across various text-to-video models and highlight the superiority of\nour end-to-end methodology over baseline methods, which might comprise separate\ntasks. Through quantitative and qualitative evaluations, we demonstrate the\neffectiveness of our framework in generating coherent text animations that\nfaithfully interpret user prompts while maintaining readability. Our code is\navailable at: https://animate-your-word.github.io/demo/.\n","authors":["Zichen Liu","Yihao Meng","Hao Ouyang","Yue Yu","Bolin Zhao","Daniel Cohen-Or","Huamin Qu"],"pdf_url":"https://arxiv.org/pdf/2404.11614v2.pdf","comment":"Our demo page is available at:\n https://animate-your-word.github.io/demo/"},{"id":"http://arxiv.org/abs/2404.11925v1","updated":"2024-04-18T06:02:54Z","published":"2024-04-18T06:02:54Z","title":"EdgeFusion: On-Device Text-to-Image Generation","summary":" The intensive computational burden of Stable Diffusion (SD) for text-to-image\ngeneration poses a significant hurdle for its practical application. To tackle\nthis challenge, recent research focuses on methods to reduce sampling steps,\nsuch as Latent Consistency Model (LCM), and on employing architectural\noptimizations, including pruning and knowledge distillation. Diverging from\nexisting approaches, we uniquely start with a compact SD variant, BK-SDM. We\nobserve that directly applying LCM to BK-SDM with commonly used crawled\ndatasets yields unsatisfactory results. It leads us to develop two strategies:\n(1) leveraging high-quality image-text pairs from leading generative models and\n(2) designing an advanced distillation process tailored for LCM. Through our\nthorough exploration of quantization, profiling, and on-device deployment, we\nachieve rapid generation of photo-realistic, text-aligned images in just two\nsteps, with latency under one second on resource-limited edge devices.\n","authors":["Thibault Castells","Hyoung-Kyu Song","Tairen Piao","Shinkook Choi","Bo-Kyeong Kim","Hanyoung Yim","Changgwun Lee","Jae Gon Kim","Tae-Ho Kim"],"pdf_url":"https://arxiv.org/pdf/2404.11925v1.pdf","comment":"4 pages, accepted to CVPR24 First Workshop on Efficient and On-Device\n Generation (EDGE)"},{"id":"http://arxiv.org/abs/2403.17924v2","updated":"2024-04-18T05:11:54Z","published":"2024-03-26T17:57:05Z","title":"AID: Attention Interpolation of Text-to-Image Diffusion","summary":" Conditional diffusion models can create unseen images in various settings,\naiding image interpolation. Interpolation in latent spaces is well-studied, but\ninterpolation with specific conditions like text or poses is less understood.\nSimple approaches, such as linear interpolation in the space of conditions,\noften result in images that lack consistency, smoothness, and fidelity. To that\nend, we introduce a novel training-free technique named Attention Interpolation\nvia Diffusion (AID). Our key contributions include 1) proposing an inner/outer\ninterpolated attention layer; 2) fusing the interpolated attention with\nself-attention to boost fidelity; and 3) applying beta distribution to\nselection to increase smoothness. We also present a variant, Prompt-guided\nAttention Interpolation via Diffusion (PAID), that considers interpolation as a\ncondition-dependent generative process. This method enables the creation of new\nimages with greater consistency, smoothness, and efficiency, and offers control\nover the exact path of interpolation. 
Our approach demonstrates effectiveness\nfor conceptual and spatial interpolation. Code and demo are available at\nhttps://github.com/QY-H00/attention-interpolation-diffusion.\n","authors":["Qiyuan He","Jinghao Wang","Ziwei Liu","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2403.17924v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17893v2","updated":"2024-04-18T05:09:04Z","published":"2024-03-26T17:29:26Z","title":"A Survey on 3D Egocentric Human Pose Estimation","summary":" Egocentric human pose estimation aims to estimate human body poses and\ndevelop body representations from a first-person camera perspective. It has\ngained vast popularity in recent years because of its wide range of\napplications in sectors like XR-technologies, human-computer interaction, and\nfitness tracking. However, to the best of our knowledge, there is no systematic\nliterature review based on the proposed solutions regarding egocentric 3D human\npose estimation. To that end, the aim of this survey paper is to provide an\nextensive overview of the current state of egocentric pose estimation research.\nIn this paper, we categorize and discuss the popular datasets and the different\npose estimation models, highlighting the strengths and weaknesses of different\nmethods by comparative analysis. This survey can be a valuable resource for\nboth researchers and practitioners in the field, offering insights into key\nconcepts and cutting-edge solutions in egocentric pose estimation, its\nwide-ranging applications, as well as the open problems with future scope.\n","authors":["Md Mushfiqur Azam","Kevin Desai"],"pdf_url":"https://arxiv.org/pdf/2403.17893v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11903v1","updated":"2024-04-18T05:06:12Z","published":"2024-04-18T05:06:12Z","title":"Simultaneous Detection and Interaction Reasoning for Object-Centric\n Action Recognition","summary":" The interactions between human and objects are important for recognizing\nobject-centric actions. Existing methods usually adopt a two-stage pipeline,\nwhere object proposals are first detected using a pretrained detector, and then\nare fed to an action recognition model for extracting video features and\nlearning the object relations for action recognition. However, since the action\nprior is unknown in the object detection stage, important objects could be\neasily overlooked, leading to inferior action recognition performance. In this\npaper, we propose an end-to-end object-centric action recognition framework\nthat simultaneously performs Detection And Interaction Reasoning in one stage.\nParticularly, after extracting video features with a base network, we create\nthree modules for concurrent object detection and interaction reasoning. First,\na Patch-based Object Decoder generates proposals from video patch tokens. Then,\nan Interactive Object Refining and Aggregation identifies important objects for\naction recognition, adjusts proposal scores based on position and appearance,\nand aggregates object-level info into a global video representation. Lastly, an\nObject Relation Modeling module encodes object relations. These three modules\ntogether with the video feature extractor can be trained jointly in an\nend-to-end fashion, thus avoiding the heavy reliance on an off-the-shelf object\ndetector, and reducing the multi-stage training burden. 
We conduct experiments\non two datasets, Something-Else and Ikea-Assembly, to evaluate the performance\nof our proposed approach on conventional, compositional, and few-shot action\nrecognition tasks. Through in-depth experimental analysis, we show the crucial\nrole of interactive objects in learning for action recognition, and we can\noutperform state-of-the-art methods on both datasets.\n","authors":["Xunsong Li","Pengzhan Sun","Yangcen Liu","Lixin Duan","Wen Li"],"pdf_url":"https://arxiv.org/pdf/2404.11903v1.pdf","comment":"12 pages, 5 figures, submitted to IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2404.11897v1","updated":"2024-04-18T04:54:28Z","published":"2024-04-18T04:54:28Z","title":"AG-NeRF: Attention-guided Neural Radiance Fields for Multi-height\n Large-scale Outdoor Scene Rendering","summary":" Existing neural radiance fields (NeRF)-based novel view synthesis methods for\nlarge-scale outdoor scenes are mainly built on a single altitude. Moreover,\nthey often require a priori camera shooting height and scene scope, leading to\ninefficient and impractical applications when camera altitude changes. In this\nwork, we propose an end-to-end framework, termed AG-NeRF, and seek to reduce\nthe training cost of building good reconstructions by synthesizing\nfree-viewpoint images based on varying altitudes of scenes. Specifically, to\ntackle the detail variation problem from low altitude (drone-level) to high\naltitude (satellite-level), a source image selection method and an\nattention-based feature fusion approach are developed to extract and fuse the\nmost relevant features of target view from multi-height images for\nhigh-fidelity rendering. Extensive experiments demonstrate that AG-NeRF\nachieves SOTA performance on 56 Leonard and Transamerica benchmarks and only\nrequires a half hour of training time to reach the competitive PSNR as compared\nto the latest BungeeNeRF.\n","authors":["Jingfeng Guo","Xiaohan Zhang","Baozhu Zhao","Qi Liu"],"pdf_url":"https://arxiv.org/pdf/2404.11897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11895v1","updated":"2024-04-18T04:47:28Z","published":"2024-04-18T04:47:28Z","title":"FreeDiff: Progressive Frequency Truncation for Image Editing with\n Diffusion Models","summary":" Precise image editing with text-to-image models has attracted increasing\ninterest due to their remarkable generative capabilities and user-friendly\nnature. However, such attempts face the pivotal challenge of misalignment\nbetween the intended precise editing target regions and the broader area\nimpacted by the guidance in practice. Despite excellent methods leveraging\nattention mechanisms that have been developed to refine the editing guidance,\nthese approaches necessitate modifications through complex network architecture\nand are limited to specific editing tasks. In this work, we re-examine the\ndiffusion process and misalignment problem from a frequency perspective,\nrevealing that, due to the power law of natural images and the decaying noise\nschedule, the denoising network primarily recovers low-frequency image\ncomponents during the earlier timesteps and thus brings excessive low-frequency\nsignals for editing. Leveraging this insight, we introduce a novel fine-tuning\nfree approach that employs progressive $\\textbf{Fre}$qu$\\textbf{e}$ncy\ntruncation to refine the guidance of $\\textbf{Diff}$usion models for universal\nediting tasks ($\\textbf{FreeDiff}$). 
Our method achieves comparable results\nwith state-of-the-art methods across a variety of editing tasks and on a\ndiverse set of images, highlighting its potential as a versatile tool in image\nediting applications.\n","authors":["Wei Wu","Qingnan Fan","Shuai Qin","Hong Gu","Ruoyu Zhao","Antoni B. Chan"],"pdf_url":"https://arxiv.org/pdf/2404.11895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06118v2","updated":"2024-04-18T04:33:53Z","published":"2024-02-09T01:00:14Z","title":"ViGoR: Improving Visual Grounding of Large Vision Language Models with\n Fine-Grained Reward Modeling","summary":" By combining natural language understanding, generation capabilities, and\nbreadth of knowledge of large language models with image perception, recent\nlarge vision language models (LVLMs) have shown unprecedented visual reasoning\ncapabilities. However, the generated text often suffers from inaccurate\ngrounding in the visual input, resulting in errors such as hallucination of\nnonexistent scene elements, missing significant parts of the scene, and\ninferring incorrect attributes of and relationships between objects. To address\nthese issues, we introduce a novel framework, ViGoR(Visual Grounding Through\nFine-Grained Reward Modeling) that utilizes fine-grained reward modeling to\nsignificantly enhance the visual grounding of LVLMs over pre-trained baselines.\nThis improvement is efficiently achieved using much cheaper human evaluations\ninstead of full supervisions, as well as automated methods. We show the\neffectiveness of our approach through a variety of evaluation methods and\nbenchmarks. Additionally, we plan to release our human annotation comprising\napproximately 16,000 images and generated text pairs with fine-grained\nevaluations to contribute to related research in the community.\n","authors":["Siming Yan","Min Bai","Weifeng Chen","Xiong Zhou","Qixing Huang","Li Erran Li"],"pdf_url":"https://arxiv.org/pdf/2402.06118v2.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.11889v1","updated":"2024-04-18T04:25:56Z","published":"2024-04-18T04:25:56Z","title":"Multi-view X-ray Image Synthesis with Multiple Domain Disentanglement\n from CT Scans","summary":" X-ray images play a vital role in the intraoperative processes due to their\nhigh resolution and fast imaging speed and greatly promote the subsequent\nsegmentation, registration and reconstruction. However, over-dosed X-rays\nsuperimpose potential risks to human health to some extent. Data-driven\nalgorithms from volume scans to X-ray images are restricted by the scarcity of\npaired X-ray and volume data. Existing methods are mainly realized by modelling\nthe whole X-ray imaging procedure. In this study, we propose a learning-based\napproach termed CT2X-GAN to synthesize the X-ray images in an end-to-end manner\nusing the content and style disentanglement from three different image domains.\nOur method decouples the anatomical structure information from CT scans and\nstyle information from unpaired real X-ray images/ digital reconstructed\nradiography (DRR) images via a series of decoupling encoders. Additionally, we\nintroduce a novel consistency regularization term to improve the stylistic\nresemblance between synthesized X-ray images and real X-ray images. Meanwhile,\nwe also impose a supervised process by computing the similarity of computed\nreal DRR and synthesized DRR images. 
We further develop a pose attention module\nto fully strengthen the comprehensive information in the decoupled content code\nfrom CT scans, facilitating high-quality multi-view image synthesis in the\nlower 2D space. Extensive experiments were conducted on the publicly available\nCTSpine1K dataset and achieved 97.8350, 0.0842 and 3.0938 in terms of FID, KID\nand defined user-scored X-ray similarity, respectively. In comparison with\n3D-aware methods ($\\pi$-GAN, EG3D), CT2X-GAN is superior in improving the\nsynthesis quality and realistic to the real X-ray images.\n","authors":["Lixing Tan","Shuang Song","Kangneng Zhou","Chengbo Duan","Lanying Wang","Huayang Ren","Linlin Liu","Wei Zhang","Ruoxiu Xiao"],"pdf_url":"https://arxiv.org/pdf/2404.11889v1.pdf","comment":"13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.11326v2","updated":"2024-04-18T04:22:07Z","published":"2024-04-17T12:38:58Z","title":"Single-temporal Supervised Remote Change Detection for Domain\n Generalization","summary":" Change detection is widely applied in remote sensing image analysis. Existing\nmethods require training models separately for each dataset, which leads to\npoor domain generalization. Moreover, these methods rely heavily on large\namounts of high-quality pair-labelled data for training, which is expensive and\nimpractical. In this paper, we propose a multimodal contrastive learning\n(ChangeCLIP) based on visual-language pre-training for change detection domain\ngeneralization. Additionally, we propose a dynamic context optimization for\nprompt learning. Meanwhile, to address the data dependency issue of existing\nmethods, we introduce a single-temporal and controllable AI-generated training\nstrategy (SAIN). This allows us to train the model using a large number of\nsingle-temporal images without image pairs in the real world, achieving\nexcellent generalization. Extensive experiments on series of real change\ndetection datasets validate the superiority and strong generalization of\nChangeCLIP, outperforming state-of-the-art change detection methods. Code will\nbe available.\n","authors":["Qiangang Du","Jinlong Peng","Xu Chen","Qingdong He","Liren He","Qiang Nie","Wenbing Zhu","Mingmin Chi","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11326v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16512v4","updated":"2024-04-18T04:12:32Z","published":"2023-08-31T07:49:06Z","title":"MVDream: Multi-view Diffusion for 3D Generation","summary":" We introduce MVDream, a diffusion model that is able to generate consistent\nmulti-view images from a given text prompt. Learning from both 2D and 3D data,\na multi-view diffusion model can achieve the generalizability of 2D diffusion\nmodels and the consistency of 3D renderings. We demonstrate that such a\nmulti-view diffusion model is implicitly a generalizable 3D prior agnostic to\n3D representations. It can be applied to 3D generation via Score Distillation\nSampling, significantly enhancing the consistency and stability of existing\n2D-lifting methods. 
It can also learn new concepts from a few 2D examples, akin\nto DreamBooth, but for 3D generation.\n","authors":["Yichun Shi","Peng Wang","Jianglong Ye","Mai Long","Kejie Li","Xiao Yang"],"pdf_url":"https://arxiv.org/pdf/2308.16512v4.pdf","comment":"Reorganized for arXiv; Our project page is https://MV-Dream.github.io"},{"id":"http://arxiv.org/abs/2404.11884v1","updated":"2024-04-18T03:58:27Z","published":"2024-04-18T03:58:27Z","title":"Seeing Motion at Nighttime with an Event Camera","summary":" We focus on a very challenging task: imaging at nighttime dynamic scenes.\nMost previous methods rely on the low-light enhancement of a conventional RGB\ncamera. However, they would inevitably face a dilemma between the long exposure\ntime of nighttime and the motion blur of dynamic scenes. Event cameras react to\ndynamic changes with higher temporal resolution (microsecond) and higher\ndynamic range (120dB), offering an alternative solution. In this work, we\npresent a novel nighttime dynamic imaging method with an event camera.\nSpecifically, we discover that the event at nighttime exhibits temporal\ntrailing characteristics and spatial non-stationary distribution. Consequently,\nwe propose a nighttime event reconstruction network (NER-Net) which mainly\nincludes a learnable event timestamps calibration module (LETC) to align the\ntemporal trailing events and a non-uniform illumination aware module (NIAM) to\nstabilize the spatiotemporal distribution of events. Moreover, we construct a\npaired real low-light event dataset (RLED) through a co-axial imaging system,\nincluding 64,200 spatially and temporally aligned image GTs and low-light\nevents. Extensive experiments demonstrate that the proposed method outperforms\nstate-of-the-art methods in terms of visual quality and generalization ability\non real-world nighttime datasets. The project are available at:\nhttps://github.com/Liu-haoyue/NER-Net.\n","authors":["Haoyue Liu","Shihan Peng","Lin Zhu","Yi Chang","Hanyu Zhou","Luxin Yan"],"pdf_url":"https://arxiv.org/pdf/2404.11884v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2402.01858v3","updated":"2024-04-18T03:54:39Z","published":"2024-02-02T19:28:33Z","title":"Explaining latent representations of generative models with large\n multimodal models","summary":" Learning interpretable representations of data generative latent factors is\nan important topic for the development of artificial intelligence. With the\nrise of the large multimodal model, it can align images with text to generate\nanswers. In this work, we propose a framework to comprehensively explain each\nlatent variable in the generative models using a large multimodal model. We\nfurther measure the uncertainty of our generated explanations, quantitatively\nevaluate the performance of explanation generation among multiple large\nmultimodal models, and qualitatively visualize the variations of each latent\nvariable to learn the disentanglement effects of different generative models on\nexplanations. 
Finally, we discuss the explanatory capabilities and limitations\nof state-of-the-art large multimodal models.\n","authors":["Mengdan Zhu","Zhenke Liu","Bo Pan","Abhinav Angirekula","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.01858v3.pdf","comment":"ICLR 2024 Workshop on Reliable and Responsible Foundation Models"},{"id":"http://arxiv.org/abs/2404.05673v2","updated":"2024-04-18T03:36:58Z","published":"2024-04-08T16:55:39Z","title":"CoReS: Orchestrating the Dance of Reasoning and Segmentation","summary":" The reasoning segmentation task, which demands a nuanced comprehension of\nintricate queries to accurately pinpoint object regions, is attracting\nincreasing attention. However, Multi-modal Large Language Models (MLLM) often\nfind it difficult to accurately localize the objects described in complex\nreasoning contexts. We believe that the act of reasoning segmentation should\nmirror the cognitive stages of human visual search, where each step is a\nprogressive refinement of thought toward the final object. Thus we introduce\nthe Chains of Reasoning and Segmenting (CoReS) and find this top-down visual\nhierarchy indeed enhances the visual search process. Specifically, we propose a\ndual-chain structure that generates multi-modal, chain-like outputs to aid the\nsegmentation process. Furthermore, to steer the MLLM's outputs into this\nintended hierarchy, we incorporate in-context inputs as guidance. Extensive\nexperiments demonstrate the superior performance of our CoReS, which surpasses\nthe state-of-the-art method by 7.1\\% on the ReasonSeg dataset. Project:\nhttps://chain-of-reasoning-and-segmentation.github.io/.\n","authors":["Xiaoyi Bao","Siyang Sun","Shuailei Ma","Kecheng Zheng","Yuxin Guo","Guosheng Zhao","Yun Zheng","Xingang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05673v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11630v1","updated":"2024-04-18T03:21:28Z","published":"2024-04-18T03:21:28Z","title":"SNP: Structured Neuron-level Pruning to Preserve Attention Scores","summary":" Multi-head self-attention (MSA) is a key component of Vision Transformers\n(ViTs), which have achieved great success in various vision tasks. However,\ntheir high computational cost and memory footprint hinder their deployment on\nresource-constrained devices. Conventional pruning approaches can only compress\nand accelerate the MSA module using head pruning, although the head is not an\natomic unit. To address this issue, we propose a novel graph-aware neuron-level\npruning method, Structured Neuron-level Pruning (SNP). SNP prunes neurons with\nless informative attention scores and eliminates redundancy among heads.\nSpecifically, it prunes graphically connected query and key layers having the\nleast informative attention scores while preserving the overall attention\nscores. Value layers, which can be pruned independently, are pruned to\neliminate inter-head redundancy. Our proposed method effectively compresses and\naccelerates Transformer-based models for both edge devices and server\nprocessors. For instance, the DeiT-Small with SNP runs 3.1$\\times$ faster than\nthe original model and achieves performance that is 21.94\\% faster and 1.12\\%\nhigher than the DeiT-Tiny. Additionally, SNP combine successfully with\nconventional head or block pruning approaches. 
SNP with head pruning could\ncompress the DeiT-Base by 80\\% of the parameters and computational costs and\nachieve 3.85$\\times$ faster inference speed on RTX3090 and 4.93$\\times$ on\nJetson Nano.\n","authors":["Kyunghwan Shim","Jaewoong Yun","Shinkook Choi"],"pdf_url":"https://arxiv.org/pdf/2404.11630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11358v2","updated":"2024-04-18T03:18:36Z","published":"2024-04-17T13:14:52Z","title":"DeblurGS: Gaussian Splatting for Camera Motion Blur","summary":" Although significant progress has been made in reconstructing sharp 3D scenes\nfrom motion-blurred images, a transition to real-world applications remains\nchallenging. The primary obstacle stems from the severe blur which leads to\ninaccuracies in the acquisition of initial camera poses through\nStructure-from-Motion, a critical aspect often overlooked by previous\napproaches. To address this challenge, we propose DeblurGS, a method to\noptimize sharp 3D Gaussian Splatting from motion-blurred images, even with the\nnoisy camera pose initialization. We restore a fine-grained sharp scene by\nleveraging the remarkable reconstruction capability of 3D Gaussian Splatting.\nOur approach estimates the 6-Degree-of-Freedom camera motion for each blurry\nobservation and synthesizes corresponding blurry renderings for the\noptimization process. Furthermore, we propose Gaussian Densification Annealing\nstrategy to prevent the generation of inaccurate Gaussians at erroneous\nlocations during the early training stages when camera motion is still\nimprecise. Comprehensive experiments demonstrate that our DeblurGS achieves\nstate-of-the-art performance in deblurring and novel view synthesis for\nreal-world and synthetic benchmark datasets, as well as field-captured blurry\nsmartphone videos.\n","authors":["Jeongtaek Oh","Jaeyoung Chung","Dongwoo Lee","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2404.11358v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11871v1","updated":"2024-04-18T03:10:04Z","published":"2024-04-18T03:10:04Z","title":"Group-On: Boosting One-Shot Segmentation with Supportive Query","summary":" One-shot semantic segmentation aims to segment query images given only ONE\nannotated support image of the same class. This task is challenging because\ntarget objects in the support and query images can be largely different in\nappearance and pose (i.e., intra-class variation). Prior works suggested that\nincorporating more annotated support images in few-shot settings boosts\nperformances but increases costs due to additional manual labeling. In this\npaper, we propose a novel approach for ONE-shot semantic segmentation, called\nGroup-On, which packs multiple query images in batches for the benefit of\nmutual knowledge support within the same category. Specifically, after coarse\nsegmentation masks of the batch of queries are predicted, query-mask pairs act\nas pseudo support data to enhance mask predictions mutually, under the guidance\nof a simple Group-On Voting module. Comprehensive experiments on three standard\nbenchmarks show that, in the ONE-shot setting, our Group-On approach\nsignificantly outperforms previous works by considerable margins. For example,\non the COCO-20i dataset, we increase mIoU scores by 8.21% and 7.46% on ASNet\nand HSNet baselines, respectively. 
With only one support image, Group-On can be\neven competitive with the counterparts using 5 annotated support images.\n","authors":["Hanjing Zhou","Mingze Yin","JinTai Chen","Danny Chen","Jian Wu"],"pdf_url":"https://arxiv.org/pdf/2404.11871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11868v1","updated":"2024-04-18T02:59:48Z","published":"2024-04-18T02:59:48Z","title":"OPTiML: Dense Semantic Invariance Using Optimal Transport for\n Self-Supervised Medical Image Representation","summary":" Self-supervised learning (SSL) has emerged as a promising technique for\nmedical image analysis due to its ability to learn without annotations.\nHowever, despite the promising potential, conventional SSL methods encounter\nlimitations, including challenges in achieving semantic alignment and capturing\nsubtle details. This leads to suboptimal representations, which fail to\naccurately capture the underlying anatomical structures and pathological\ndetails. In response to these constraints, we introduce a novel SSL framework\nOPTiML, employing optimal transport (OT), to capture the dense semantic\ninvariance and fine-grained details, thereby enhancing the overall\neffectiveness of SSL in medical image representation learning. The core idea is\nto integrate OT with a cross-viewpoint semantics infusion module (CV-SIM),\nwhich effectively captures complex, fine-grained details inherent in medical\nimages across different viewpoints. In addition to the CV-SIM module, OPTiML\nimposes the variance and covariance regularizations within OT framework to\nforce the model focus on clinically relevant information while discarding less\ninformative features. Through these, the proposed framework demonstrates its\ncapacity to learn semantically rich representations that can be applied to\nvarious medical imaging tasks. To validate its effectiveness, we conduct\nexperimental studies on three publicly available datasets from chest X-ray\nmodality. Our empirical results reveal OPTiML's superiority over\nstate-of-the-art methods across all evaluated tasks.\n","authors":["Azad Singh","Vandan Gorade","Deepak Mishra"],"pdf_url":"https://arxiv.org/pdf/2404.11868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11865v1","updated":"2024-04-18T02:43:37Z","published":"2024-04-18T02:43:37Z","title":"From Image to Video, what do we need in multimodal LLMs?","summary":" Multimodal Large Language Models (MLLMs) have demonstrated profound\ncapabilities in understanding multimodal information, covering from Image LLMs\nto the more complex Video LLMs. Numerous studies have illustrated their\nexceptional cross-modal comprehension. Recently, integrating video foundation\nmodels with large language models to build a comprehensive video understanding\nsystem has been proposed to overcome the limitations of specific pre-defined\nvision tasks. However, the current advancements in Video LLMs tend to overlook\nthe foundational contributions of Image LLMs, often opting for more complicated\nstructures and a wide variety of multimodal data for pre-training. This\napproach significantly increases the costs associated with these methods.In\nresponse to these challenges, this work introduces an efficient method that\nstrategically leverages the priors of Image LLMs, facilitating a\nresource-efficient transition from Image to Video LLMs. 
We propose RED-VILLM, a\nResource-Efficient Development pipeline for Video LLMs from Image LLMs, which\nutilizes a temporal adaptation plug-and-play structure within the image fusion\nmodule of Image LLMs. This adaptation extends their understanding capabilities\nto include temporal information, enabling the development of Video LLMs that\nnot only surpass baseline performances but also do so with minimal\ninstructional data and training resources. Our approach highlights the\npotential for a more cost-effective and scalable advancement in multimodal\nmodels, effectively building upon the foundational work of Image LLMs.\n","authors":["Suyuan Huang","Haoxin Zhang","Yan Gao","Yao Hu","Zengchang Qin"],"pdf_url":"https://arxiv.org/pdf/2404.11865v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11864v1","updated":"2024-04-18T02:40:31Z","published":"2024-04-18T02:40:31Z","title":"Progressive Multi-modal Conditional Prompt Tuning","summary":" Pre-trained vision-language models (VLMs) have shown remarkable\ngeneralization capabilities via prompting, which leverages VLMs as knowledge\nbases to extract information beneficial for downstream tasks. However, existing\nmethods primarily employ uni-modal prompting, which only engages a uni-modal\nbranch, failing to simultaneously adjust vision-language (V-L) features.\nAdditionally, the one-pass forward pipeline in VLM encoding struggles to align\nV-L features that have a huge gap. Confronting these challenges, we propose a\nnovel method, Progressive Multi-modal conditional Prompt Tuning (ProMPT).\nProMPT exploits a recurrent structure, optimizing and aligning V-L features by\niteratively utilizing image and current encoding information. It comprises an\ninitialization and a multi-modal iterative evolution (MIE) module.\nInitialization is responsible for encoding image and text using a VLM, followed\nby a feature filter that selects text features similar to image. MIE then\nfacilitates multi-modal prompting through class-conditional vision prompting,\ninstance-conditional text prompting, and feature filtering. In each MIE\niteration, vision prompts are obtained from the filtered text features via a\nvision generator, promoting image features to focus more on target object\nduring vision prompting. The encoded image features are fed into a text\ngenerator to produce text prompts that are more robust to class shift. Thus,\nV-L features are progressively aligned, enabling advance from coarse to exact\nclassifications. Extensive experiments are conducted in three settings to\nevaluate the efficacy of ProMPT. The results indicate that ProMPT outperforms\nexisting methods on average across all settings, demonstrating its superior\ngeneralization.\n","authors":["Xiaoyu Qiu","Hao Feng","Yuechen Wang","Wengang Zhou","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2404.11864v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11052v2","updated":"2024-04-18T01:59:27Z","published":"2024-04-17T03:51:55Z","title":"Supervised Contrastive Vision Transformer for Breast Histopathological\n Image Classification","summary":" Invasive ductal carcinoma (IDC) is the most prevalent form of breast cancer.\nBreast tissue histopathological examination is critical in diagnosing and\nclassifying breast cancer. Although existing methods have shown promising\nresults, there is still room for improvement in the classification accuracy and\ngeneralization of IDC using histopathology images. 
We present a novel approach,\nSupervised Contrastive Vision Transformer (SupCon-ViT), for improving the\nclassification of invasive ductal carcinoma in terms of accuracy and\ngeneralization by leveraging the inherent strengths and advantages of both\ntransfer learning, i.e., pre-trained vision transformer, and supervised\ncontrastive learning. Our results on a benchmark breast cancer dataset\ndemonstrate that SupCon-Vit achieves state-of-the-art performance in IDC\nclassification, with an F1-score of 0.8188, precision of 0.7692, and\nspecificity of 0.8971, outperforming existing methods. In addition, the\nproposed model demonstrates resilience in scenarios with minimal labeled data,\nmaking it highly efficient in real-world clinical settings where labelled data\nis limited. Our findings suggest that supervised contrastive learning in\nconjunction with pre-trained vision transformers appears to be a viable\nstrategy for an accurate classification of IDC, thus paving the way for a more\nefficient and reliable diagnosis of breast cancer through histopathological\nimage analysis.\n","authors":["Mohammad Shiri","Monalika Padma Reddy","Jiangwen Sun"],"pdf_url":"https://arxiv.org/pdf/2404.11052v2.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.11098v2","updated":"2024-04-18T01:58:07Z","published":"2024-04-17T06:32:42Z","title":"LAPTOP-Diff: Layer Pruning and Normalized Distillation for Compressing\n Diffusion Models","summary":" In the era of AIGC, the demand for low-budget or even on-device applications\nof diffusion models emerged. In terms of compressing the Stable Diffusion\nmodels (SDMs), several approaches have been proposed, and most of them\nleveraged the handcrafted layer removal methods to obtain smaller U-Nets, along\nwith knowledge distillation to recover the network performance. However, such a\nhandcrafting manner of layer removal is inefficient and lacks scalability and\ngeneralization, and the feature distillation employed in the retraining phase\nfaces an imbalance issue that a few numerically significant feature loss terms\ndominate over others throughout the retraining process. To this end, we\nproposed the layer pruning and normalized distillation for compressing\ndiffusion models (LAPTOP-Diff). We, 1) introduced the layer pruning method to\ncompress SDM's U-Net automatically and proposed an effective one-shot pruning\ncriterion whose one-shot performance is guaranteed by its good additivity\nproperty, surpassing other layer pruning and handcrafted layer removal methods,\n2) proposed the normalized feature distillation for retraining, alleviated the\nimbalance issue. Using the proposed LAPTOP-Diff, we compressed the U-Nets of\nSDXL and SDM-v1.5 for the most advanced performance, achieving a minimal 4.0%\ndecline in PickScore at a pruning ratio of 50% while the comparative methods'\nminimal PickScore decline is 8.2%. We will release our code.\n","authors":["Dingkun Zhang","Sijia Li","Chen Chen","Qingsong Xie","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2404.11098v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11848v1","updated":"2024-04-18T01:55:44Z","published":"2024-04-18T01:55:44Z","title":"Partial Large Kernel CNNs for Efficient Super-Resolution","summary":" Recently, in the super-resolution (SR) domain, transformers have outperformed\nCNNs with fewer FLOPs and fewer parameters since they can deal with long-range\ndependency and adaptively adjust weights based on instance. 
In this paper, we\ndemonstrate that CNNs, although less focused on in the current SR domain,\nsurpass Transformers in direct efficiency measures. By incorporating the\nadvantages of Transformers into CNNs, we aim to achieve both computational\nefficiency and enhanced performance. However, using a large kernel in the SR\ndomain, which mainly processes large images, incurs a large computational\noverhead. To overcome this, we propose novel approaches to employing the large\nkernel, which can reduce latency by 86\\% compared to the naive large kernel,\nand leverage an Element-wise Attention module to imitate instance-dependent\nweights. As a result, we introduce Partial Large Kernel CNNs for Efficient\nSuper-Resolution (PLKSR), which achieves state-of-the-art performance on four\ndatasets at a scale of $\\times$4, with reductions of 68.1\\% in latency and\n80.2\\% in maximum GPU memory occupancy compared to SRFormer-light.\n","authors":["Dongheon Lee","Seokju Yun","Youngmin Ro"],"pdf_url":"https://arxiv.org/pdf/2404.11848v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11843v1","updated":"2024-04-18T01:46:31Z","published":"2024-04-18T01:46:31Z","title":"Computer-Aided Diagnosis of Thoracic Diseases in Chest X-rays using\n hybrid CNN-Transformer Architecture","summary":" Medical imaging has been used for diagnosis of various conditions, making it\none of the most powerful resources for effective patient care. Due to\nwidespread availability, low cost, and low radiation, chest X-ray is one of the\nmost sought after radiology examination for the diagnosis of various thoracic\ndiseases. Due to advancements in medical imaging technologies and increasing\npatient load, current radiology workflow faces various challenges including\nincreasing backlogs, working long hours, and increase in diagnostic errors. An\nautomated computer-aided diagnosis system that can interpret chest X-rays to\naugment radiologists by providing actionable insights has potential to provide\nsecond opinion to radiologists, highlight relevant regions in the image, in\nturn expediting clinical workflow, reducing diagnostic errors, and improving\npatient care. In this study, we applied a novel architecture augmenting the\nDenseNet121 Convolutional Neural Network (CNN) with multi-head self-attention\nmechanism using transformer, namely SA-DenseNet121, that can identify multiple\nthoracic diseases in chest X-rays. We conducted experiments on four of the\nlargest chest X-ray datasets, namely, ChestX-ray14, CheXpert, MIMIC-CXR-JPG,\nand IU-CXR. Experimental results in terms of area under the receiver operating\ncharacteristics (AUC-ROC) shows that augmenting CNN with self-attention has\npotential in diagnosing different thoracic diseases from chest X-rays. The\nproposed methodology has the potential to support the reading workflow, improve\nefficiency, and reduce diagnostic errors.\n","authors":["Sonit Singh"],"pdf_url":"https://arxiv.org/pdf/2404.11843v1.pdf","comment":"24 pages, 13 Figures, 13 Tables. arXiv admin note: text overlap with\n arXiv:1904.09925 by other authors"},{"id":"http://arxiv.org/abs/2401.12451v2","updated":"2024-04-18T01:37:42Z","published":"2024-01-23T02:30:16Z","title":"Methods and strategies for improving the novel view synthesis quality of\n neural radiation field","summary":" Neural Radiation Field (NeRF) technology can learn a 3D implicit model of a\nscene from 2D images and synthesize realistic novel view images. 
This\ntechnology has received widespread attention from the industry and has good\napplication prospects. In response to the problem that the rendering quality of\nNeRF images needs to be improved, many researchers have proposed various\nmethods to improve the rendering quality in the past three years. The latest\nrelevant papers are classified and reviewed, the technical principles behind\nquality improvement are analyzed, and the future evolution direction of quality\nimprovement methods is discussed. This study can help researchers quickly\nunderstand the current state and evolutionary context of technology in this\nfield, which is helpful in inspiring the development of more efficient\nalgorithms and promoting the application of NeRF technology in related fields.\n","authors":["Shun Fang","Ming Cui","Xing Feng","Yanna Lv"],"pdf_url":"https://arxiv.org/pdf/2401.12451v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11031v2","updated":"2024-04-18T01:10:44Z","published":"2024-04-17T03:13:58Z","title":"TaCOS: Task-Specific Camera Optimization with Simulation","summary":" The performance of robots in their applications heavily depends on the\nquality of sensory input. However, designing sensor payloads and their\nparameters for specific robotic tasks is an expensive process that requires\nwell-established sensor knowledge and extensive experiments with physical\nhardware. With cameras playing a pivotal role in robotic perception, we\nintroduce a novel end-to-end optimization approach for co-designing a camera\nwith specific robotic tasks by combining derivative-free and gradient-based\noptimizers. The proposed method leverages recent computer graphics techniques\nand physical camera characteristics to prototype the camera in software,\nsimulate operational environments and tasks for robots, and optimize the camera\ndesign based on the desired tasks in a cost-effective way. We validate the\naccuracy of our camera simulation by comparing it with physical cameras, and\ndemonstrate the design of cameras with stronger performance than common\noff-the-shelf alternatives. Our approach supports the optimization of both\ncontinuous and discrete camera parameters, manufacturing constraints, and can\nbe generalized to a broad range of camera design scenarios including multiple\ncameras and unconventional cameras. This work advances the fully automated\ndesign of cameras for specific robotics tasks.\n","authors":["Chengyang Yan","Donald G. Dansereau"],"pdf_url":"https://arxiv.org/pdf/2404.11031v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11824v1","updated":"2024-04-18T01:10:24Z","published":"2024-04-18T01:10:24Z","title":"TextCenGen: Attention-Guided Text-Centric Background Adaptation for\n Text-to-Image Generation","summary":" Recent advancements in Text-to-image (T2I) generation have witnessed a shift\nfrom adapting text to fixed backgrounds to creating images around text.\nTraditional approaches are often limited to generate layouts within static\nimages for effective text placement. Our proposed approach, TextCenGen,\nintroduces a dynamic adaptation of the blank region for text-friendly image\ngeneration, emphasizing text-centric design and visual harmony generation. Our\nmethod employs force-directed attention guidance in T2I models to generate\nimages that strategically reserve whitespace for pre-defined text areas, even\nfor text or icons at the golden ratio. 
Observing how cross-attention maps\naffect object placement, we detect and repel conflicting objects using a\nforce-directed graph approach, combined with a Spatial Excluding\nCross-Attention Constraint for smooth attention in whitespace areas. As a novel\ntask in graphic design, experiments indicate that TextCenGen outperforms\nexisting methods with more harmonious compositions. Furthermore, our method\nsignificantly enhances T2I model outcomes on our specially collected prompt\ndatasets, catering to varied text positions. These results demonstrate the\nefficacy of TextCenGen in creating more harmonious and integrated text-image\ncompositions.\n","authors":["Tianyi Liang","Jiangqi Liu","Sicheng Song","Shiqi Jiang","Yifei Huang","Changbo Wang","Chenhui Li"],"pdf_url":"https://arxiv.org/pdf/2404.11824v1.pdf","comment":"7 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.11819v1","updated":"2024-04-18T00:41:32Z","published":"2024-04-18T00:41:32Z","title":"Utilizing Adversarial Examples for Bias Mitigation and Accuracy\n Enhancement","summary":" We propose a novel approach to mitigate biases in computer vision models by\nutilizing counterfactual generation and fine-tuning. While counterfactuals have\nbeen used to analyze and address biases in DNN models, the counterfactuals\nthemselves are often generated from biased generative models, which can\nintroduce additional biases or spurious correlations. To address this issue, we\npropose using adversarial images, that is images that deceive a deep neural\nnetwork but not humans, as counterfactuals for fair model training.\n Our approach leverages a curriculum learning framework combined with a\nfine-grained adversarial loss to fine-tune the model using adversarial\nexamples. By incorporating adversarial images into the training data, we aim to\nprevent biases from propagating through the pipeline. We validate our approach\nthrough both qualitative and quantitative assessments, demonstrating improved\nbias mitigation and accuracy compared to existing methods. Qualitatively, our\nresults indicate that post-training, the decisions made by the model are less\ndependent on the sensitive attribute and our model better disentangles the\nrelationship between sensitive attributes and classification variables.\n","authors":["Pushkar Shukla","Dhruv Srikanth","Lee Cohen","Matthew Turk"],"pdf_url":"https://arxiv.org/pdf/2404.11819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11812v1","updated":"2024-04-18T00:18:07Z","published":"2024-04-18T00:18:07Z","title":"Cross-model Mutual Learning for Exemplar-based Medical Image\n Segmentation","summary":" Medical image segmentation typically demands extensive dense annotations for\nmodel training, which is both time-consuming and skill-intensive. To mitigate\nthis burden, exemplar-based medical image segmentation methods have been\nintroduced to achieve effective training with only one annotated image. In this\npaper, we introduce a novel Cross-model Mutual learning framework for\nExemplar-based Medical image Segmentation (CMEMS), which leverages two models\nto mutually excavate implicit information from unlabeled data at multiple\ngranularities. CMEMS can eliminate confirmation bias and enable collaborative\ntraining to learn complementary information by enforcing consistency at\ndifferent granularities across models. 
Concretely, cross-model image\nperturbation based mutual learning is devised by using weakly perturbed images\nto generate high-confidence pseudo-labels, supervising predictions of strongly\nperturbed images across models. This approach enables joint pursuit of\nprediction consistency at the image granularity. Moreover, cross-model\nmulti-level feature perturbation based mutual learning is designed by letting\npseudo-labels supervise predictions from perturbed multi-level features with\ndifferent resolutions, which can broaden the perturbation space and enhance the\nrobustness of our framework. CMEMS is jointly trained using exemplar data,\nsynthetic data, and unlabeled data in an end-to-end manner. Experimental\nresults on two medical image datasets indicate that the proposed CMEMS\noutperforms the state-of-the-art segmentation methods with extremely limited\nsupervision.\n","authors":["Qing En","Yuhong Guo"],"pdf_url":"https://arxiv.org/pdf/2404.11812v1.pdf","comment":"AISTATS 2024"},{"id":"http://arxiv.org/abs/2107.09847v2","updated":"2024-04-18T08:11:49Z","published":"2021-07-21T02:33:37Z","title":"CogME: A Cognition-Inspired Multi-Dimensional Evaluation Metric for\n Story Understanding","summary":" We introduce CogME, a cognition-inspired, multi-dimensional evaluation metric\ndesigned for AI models focusing on story understanding. CogME is a framework\ngrounded in human thinking strategies and story elements that involve story\nunderstanding. With a specific breakdown of the questions, this approach\nprovides a nuanced assessment revealing not only AI models' particular\nstrengths and weaknesses but also the characteristics of the benchmark dataset.\nOur case study with the DramaQA dataset demonstrates a refined analysis of the\nmodel and the benchmark dataset. We argue the need for metrics based on\nunderstanding the nature of tasks and designed to align closely with human\ncognitive processes. This approach provides insights beyond traditional overall\nscores and paves the way for more sophisticated AI development targeting higher\ncognitive functions.\n","authors":["Minjung Shin","Seongho Choi","Yu-Jung Heo","Minsu Lee","Byoung-Tak Zhang","Jeh-Kwang Ryu"],"pdf_url":"https://arxiv.org/pdf/2107.09847v2.pdf","comment":"9 pages with 4 figures and 3 tables. This work has been accepted for\n presentation at CogSci 2024 and is currently under revision"},{"id":"http://arxiv.org/abs/2404.12547v1","updated":"2024-04-18T23:52:42Z","published":"2024-04-18T23:52:42Z","title":"Does Gaussian Splatting need SFM Initialization?","summary":" 3D Gaussian Splatting has recently been embraced as a versatile and effective\nmethod for scene reconstruction and novel view synthesis, owing to its\nhigh-quality results and compatibility with hardware rasterization. Despite its\nadvantages, Gaussian Splatting's reliance on high-quality point cloud\ninitialization by Structure-from-Motion (SFM) algorithms is a significant\nlimitation to be overcome. To this end, we investigate various initialization\nstrategies for Gaussian Splatting and delve into how volumetric reconstructions\nfrom Neural Radiance Fields (NeRF) can be utilized to bypass the dependency on\nSFM data. 
Our findings demonstrate that random initialization can perform much\nbetter if carefully designed and that by employing a combination of improved\ninitialization strategies and structure distillation from low-cost NeRF models,\nit is possible to achieve equivalent results, or at times even superior, to\nthose obtained from SFM initialization.\n","authors":["Yalda Foroutan","Daniel Rebain","Kwang Moo Yi","Andrea Tagliasacchi"],"pdf_url":"https://arxiv.org/pdf/2404.12547v1.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.04725v2","updated":"2024-04-18T23:40:16Z","published":"2023-08-09T06:03:07Z","title":"Self-supervised Learning of Rotation-invariant 3D Point Set Features\n using Transformer and its Self-distillation","summary":" Invariance against rotations of 3D objects is an important property in\nanalyzing 3D point set data. Conventional 3D point set DNNs having rotation\ninvariance typically obtain accurate 3D shape features via supervised learning\nby using labeled 3D point sets as training samples. However, due to the rapid\nincrease in 3D point set data and the high cost of labeling, a framework to\nlearn rotation-invariant 3D shape features from numerous unlabeled 3D point\nsets is required. This paper proposes a novel self-supervised learning\nframework for acquiring accurate and rotation-invariant 3D point set features\nat object-level. Our proposed lightweight DNN architecture decomposes an input\n3D point set into multiple global-scale regions, called tokens, that preserve\nthe spatial layout of partial shapes composing the 3D object. We employ a\nself-attention mechanism to refine the tokens and aggregate them into an\nexpressive rotation-invariant feature per 3D point set. Our DNN is effectively\ntrained by using pseudo-labels generated by a self-distillation framework. To\nfacilitate the learning of accurate features, we propose to combine multi-crop\nand cut-mix data augmentation techniques to diversify 3D point sets for\ntraining. Through a comprehensive evaluation, we empirically demonstrate that,\n(1) existing rotation-invariant DNN architectures designed for supervised\nlearning do not necessarily learn accurate 3D shape features under a\nself-supervised learning scenario, and (2) our proposed algorithm learns\nrotation-invariant 3D point set features that are more accurate than those\nlearned by existing algorithms. Code is available at\nhttps://github.com/takahikof/RIPT_SDMM\n","authors":["Takahiko Furuya","Zhoujie Chen","Ryutarou Ohbuchi","Zhenzhong Kuang"],"pdf_url":"https://arxiv.org/pdf/2308.04725v2.pdf","comment":"Accepted to the CVIU journal"},{"id":"http://arxiv.org/abs/2404.12541v1","updated":"2024-04-18T23:25:27Z","published":"2024-04-18T23:25:27Z","title":"GenVideo: One-shot Target-image and Shape Aware Video Editing using T2I\n Diffusion Models","summary":" Video editing methods based on diffusion models that rely solely on a text\nprompt for the edit are hindered by the limited expressive power of text\nprompts. Thus, incorporating a reference target image as a visual guide becomes\ndesirable for precise control over edit. Also, most existing methods struggle\nto accurately edit a video when the shape and size of the object in the target\nimage differ from the source object. To address these challenges, we propose\n\"GenVideo\" for editing videos leveraging target-image aware T2I models. 
Our\napproach handles edits with target objects of varying shapes and sizes while\nmaintaining the temporal consistency of the edit using our novel target and\nshape aware InvEdit masks. Further, we propose a novel target-image aware\nlatent noise correction strategy during inference to improve the temporal\nconsistency of the edits. Experimental analyses indicate that GenVideo can\neffectively handle edits with objects of varying shapes, where existing\napproaches fail.\n","authors":["Sai Sree Harsha","Ambareesh Revanur","Dhwanit Agarwal","Shradha Agrawal"],"pdf_url":"https://arxiv.org/pdf/2404.12541v1.pdf","comment":"CVPRw 2024"},{"id":"http://arxiv.org/abs/2404.12538v1","updated":"2024-04-18T23:12:46Z","published":"2024-04-18T23:12:46Z","title":"TrACT: A Training Dynamics Aware Contrastive Learning Framework for\n Long-tail Trajectory Prediction","summary":" As a safety critical task, autonomous driving requires accurate predictions\nof road users' future trajectories for safe motion planning, particularly under\nchallenging conditions. Yet, many recent deep learning methods suffer from a\ndegraded performance on the challenging scenarios, mainly because these\nscenarios appear less frequently in the training data. To address such a\nlong-tail issue, existing methods force challenging scenarios closer together\nin the feature space during training to trigger information sharing among them\nfor more robust learning. These methods, however, primarily rely on the motion\npatterns to characterize scenarios, omitting more informative contextual\ninformation, such as interactions and scene layout. We argue that exploiting\nsuch information not only improves prediction accuracy but also scene\ncompliance of the generated trajectories. In this paper, we propose to\nincorporate richer training dynamics information into a prototypical\ncontrastive learning framework. More specifically, we propose a two-stage\nprocess. First, we generate rich contextual features using a baseline\nencoder-decoder framework. These features are split into clusters based on the\nmodel's output errors, using the training dynamics information, and a prototype\nis computed within each cluster. Second, we retrain the model using the\nprototypes in a contrastive learning framework. We conduct empirical\nevaluations of our approach using two large-scale naturalistic datasets and\nshow that our method achieves state-of-the-art performance by improving\naccuracy and scene compliance on the long-tail samples. Furthermore, we perform\nexperiments on a subset of the clusters to highlight the additional benefit of\nour approach in reducing training bias.\n","authors":["Junrui Zhang","Mozhgan Pourkeshavarz","Amir Rasouli"],"pdf_url":"https://arxiv.org/pdf/2404.12538v1.pdf","comment":"2024 IEEE Intelligent Vehicles Symposium (IV)"},{"id":"http://arxiv.org/abs/2404.12526v1","updated":"2024-04-18T22:01:56Z","published":"2024-04-18T22:01:56Z","title":"Adaptive Memory Replay for Continual Learning","summary":" Foundation Models (FMs) have become the hallmark of modern AI, however, these\nmodels are trained on massive data, leading to financially expensive training.\nUpdating FMs as new data becomes available is important, however, can lead to\n`catastrophic forgetting', where models underperform on tasks related to data\nsub-populations observed too long ago. This continual learning (CL) phenomenon\nhas been extensively studied, but primarily in a setting where only a small\namount of past data can be stored. 
We advocate for the paradigm where memory is\nabundant, allowing us to keep all previous data, but computational resources\nare limited. In this setting, traditional replay-based CL approaches are\noutperformed by a simple baseline which replays past data selected uniformly at\nrandom, indicating that this setting necessitates a new approach. We address\nthis by introducing a framework of adaptive memory replay for continual\nlearning, where sampling of past data is phrased as a multi-armed bandit\nproblem. We utilize Bolzmann sampling to derive a method which dynamically\nselects past data for training conditioned on the current task, assuming full\ndata access and emphasizing training efficiency. Through extensive evaluations\non both vision and language pre-training tasks, we demonstrate the\neffectiveness of our approach, which maintains high performance while reducing\nforgetting by up to 10% at no training efficiency cost.\n","authors":["James Seale Smith","Lazar Valkov","Shaunak Halbe","Vyshnavi Gutta","Rogerio Feris","Zsolt Kira","Leonid Karlinsky"],"pdf_url":"https://arxiv.org/pdf/2404.12526v1.pdf","comment":"CVPR-W 2024 (Spotlight)"},{"id":"http://arxiv.org/abs/2404.12524v1","updated":"2024-04-18T21:55:23Z","published":"2024-04-18T21:55:23Z","title":"DoughNet: A Visual Predictive Model for Topological Manipulation of\n Deformable Objects","summary":" Manipulation of elastoplastic objects like dough often involves topological\nchanges such as splitting and merging. The ability to accurately predict these\ntopological changes that a specific action might incur is critical for planning\ninteractions with elastoplastic objects. We present DoughNet, a\nTransformer-based architecture for handling these challenges, consisting of two\ncomponents. First, a denoising autoencoder represents deformable objects of\nvarying topology as sets of latent codes. Second, a visual predictive model\nperforms autoregressive set prediction to determine long-horizon geometrical\ndeformation and topological changes purely in latent space. Given a partial\ninitial state and desired manipulation trajectories, it infers all resulting\nobject geometries and topologies at each step. DoughNet thereby allows to plan\nrobotic manipulation; selecting a suited tool, its pose and opening width to\nrecreate robot- or human-made goals. Our experiments in simulated and real\nenvironments show that DoughNet is able to significantly outperform related\napproaches that consider deformation only as geometrical change.\n","authors":["Dominik Bauer","Zhenjia Xu","Shuran Song"],"pdf_url":"https://arxiv.org/pdf/2404.12524v1.pdf","comment":"Under review. 17 pages, 14 figures"},{"id":"http://arxiv.org/abs/2402.14371v2","updated":"2024-04-18T21:29:39Z","published":"2024-02-22T08:21:46Z","title":"HR-APR: APR-agnostic Framework with Uncertainty Estimation and\n Hierarchical Refinement for Camera Relocalisation","summary":" Absolute Pose Regressors (APRs) directly estimate camera poses from monocular\nimages, but their accuracy is unstable for different queries. Uncertainty-aware\nAPRs provide uncertainty information on the estimated pose, alleviating the\nimpact of these unreliable predictions. However, existing uncertainty modelling\ntechniques are often coupled with a specific APR architecture, resulting in\nsuboptimal performance compared to state-of-the-art (SOTA) APR methods. 
This\nwork introduces a novel APR-agnostic framework, HR-APR, that formulates\nuncertainty estimation as cosine similarity estimation between the query and\ndatabase features. It does not rely on or affect APR network architecture,\nwhich is flexible and computationally efficient. In addition, we take advantage\nof the uncertainty for pose refinement to enhance the performance of APR. The\nextensive experiments demonstrate the effectiveness of our framework, reducing\n27.4\\% and 15.2\\% of computational overhead on the 7Scenes and Cambridge\nLandmarks datasets while maintaining the SOTA accuracy in single-image APRs.\n","authors":["Changkun Liu","Shuai Chen","Yukun Zhao","Huajian Huang","Victor Prisacariu","Tristan Braud"],"pdf_url":"https://arxiv.org/pdf/2402.14371v2.pdf","comment":"Accepted in in 2024 IEEE International Conference on Robotics and\n Automation (ICRA). Code: https://github.com/lck666666/HR-APR"},{"id":"http://arxiv.org/abs/2404.12509v1","updated":"2024-04-18T21:09:34Z","published":"2024-04-18T21:09:34Z","title":"Compositional Neural Textures","summary":" Texture plays a vital role in enhancing visual richness in both real\nphotographs and computer-generated imagery. However, the process of editing\ntextures often involves laborious and repetitive manual adjustments of textons,\nwhich are the small, recurring local patterns that define textures. In this\nwork, we introduce a fully unsupervised approach for representing textures\nusing a compositional neural model that captures individual textons. We\nrepresent each texton as a 2D Gaussian function whose spatial support\napproximates its shape, and an associated feature that encodes its detailed\nappearance. By modeling a texture as a discrete composition of Gaussian\ntextons, the representation offers both expressiveness and ease of editing.\nTextures can be edited by modifying the compositional Gaussians within the\nlatent space, and new textures can be efficiently synthesized by feeding the\nmodified Gaussians through a generator network in a feed-forward manner. This\napproach enables a wide range of applications, including transferring\nappearance from an image texture to another image, diversifying textures,\ntexture interpolation, revealing/modifying texture variations, edit\npropagation, texture animation, and direct texton manipulation. The proposed\napproach contributes to advancing texture analysis, modeling, and editing\ntechniques, and opens up new possibilities for creating visually appealing\nimages with controllable textures.\n","authors":["Peihan Tu","Li-Yi Wei","Matthias Zwicker"],"pdf_url":"https://arxiv.org/pdf/2404.12509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12501v1","updated":"2024-04-18T20:43:33Z","published":"2024-04-18T20:43:33Z","title":"SPIdepth: Strengthened Pose Information for Self-supervised Monocular\n Depth Estimation","summary":" Self-supervised monocular depth estimation has garnered considerable\nattention for its applications in autonomous driving and robotics. While recent\nmethods have made strides in leveraging techniques like the Self Query Layer\n(SQL) to infer depth from motion, they often overlook the potential of\nstrengthening pose information. In this paper, we introduce SPIdepth, a novel\napproach that prioritizes enhancing the pose network for improved depth\nestimation. Building upon the foundation laid by SQL, SPIdepth emphasizes the\nimportance of pose information in capturing fine-grained scene structures. 
By\nenhancing the pose network's capabilities, SPIdepth achieves remarkable\nadvancements in scene understanding and depth estimation. Experimental results\non benchmark datasets such as KITTI and Cityscapes showcase SPIdepth's\nstate-of-the-art performance, surpassing previous methods by significant\nmargins. Notably, SPIdepth's performance exceeds that of unsupervised models\nand, after finetuning on metric data, outperforms all existing methods.\nRemarkably, SPIdepth achieves these results using only a single image for\ninference, surpassing even methods that utilize video sequences for inference,\nthus demonstrating its efficacy and efficiency in real-world applications. Our\napproach represents a significant leap forward in self-supervised monocular\ndepth estimation, underscoring the importance of strengthening pose information\nfor advancing scene understanding in real-world applications.\n","authors":["Mykola Lavreniuk"],"pdf_url":"https://arxiv.org/pdf/2404.12501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12488v1","updated":"2024-04-18T20:03:56Z","published":"2024-04-18T20:03:56Z","title":"Global Counterfactual Directions","summary":" Despite increasing progress in development of methods for generating visual\ncounterfactual explanations, especially with the recent rise of Denoising\nDiffusion Probabilistic Models, previous works consider them as an entirely\nlocal technique. In this work, we take the first step at globalizing them.\nSpecifically, we discover that the latent space of Diffusion Autoencoders\nencodes the inference process of a given classifier in the form of global\ndirections. We propose a novel proxy-based approach that discovers two types of\nthese directions with the use of only single image in an entirely black-box\nmanner. Precisely, g-directions allow for flipping the decision of a given\nclassifier on an entire dataset of images, while h-directions further increase\nthe diversity of explanations. We refer to them in general as Global\nCounterfactual Directions (GCDs). Moreover, we show that GCDs can be naturally\ncombined with Latent Integrated Gradients resulting in a new black-box\nattribution method, while simultaneously enhancing the understanding of\ncounterfactual explanations. We validate our approach on existing benchmarks\nand show that it generalizes to real-world use-cases.\n","authors":["Bartlomiej Sobieski","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2404.12488v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2404.12487v1","updated":"2024-04-18T20:02:52Z","published":"2024-04-18T20:02:52Z","title":"Advancing Applications of Satellite Photogrammetry: Novel Approaches for\n Built-up Area Modeling and Natural Environment Monitoring using\n Stereo/Multi-view Satellite Image-derived 3D Data","summary":" With the development of remote sensing technology in recent decades,\nspaceborne sensors with sub-meter and meter spatial resolution (Worldview and\nPlanetScope) have achieved a considerable image quality to generate 3D\ngeospatial data via a stereo matching pipeline. These achievements have\nsignificantly increased the data accessibility in 3D, necessitating adapting\nthese 3D geospatial data to analyze human and natural environments. 
This\ndissertation explores several novel approaches based on stereo and multi-view\nsatellite image-derived 3D geospatial data, to deal with remote sensing\napplication issues for built-up area modeling and natural environment\nmonitoring, including building model 3D reconstruction, glacier dynamics\ntracking, and lake algae monitoring. Specifically, the dissertation introduces\nfour parts of novel approaches that deal with the spatial and temporal\nchallenges with satellite-derived 3D data. The first study advances LoD-2\nbuilding modeling from satellite-derived Orthophoto and DSMs with a novel\napproach employing a model-driven workflow that generates building rectangular\n3D geometry models. Secondly, we further enhanced our building reconstruction\nframework for dense urban areas and non-rectangular purposes, we implemented\ndeep learning for unit-level segmentation and introduced a gradient-based\ncircle reconstruction for circular buildings to develop a polygon composition\ntechnique for advanced building LoD2 reconstruction. Our third study utilizes\nhigh-spatiotemporal resolution PlanetScope satellite imagery for glacier\ntracking at 3D level in mid-latitude regions. Finally, we proposed a term as\n\"Algal Behavior Function\" to refine the quantification of chlorophyll-a\nconcentrations from satellite imagery in water quality monitoring, addressing\nalgae fluctuations and timing discrepancies between satellite observations and\nfield measurements, thus enhancing the precision of underwater algae volume\nestimates. Overall, this dissertation demonstrates the extensive potential of\nsatellite photogrammetry applications in addressing urban and environmental\nchallenges. It further showcases innovative analytical methodologies that\nenhance the applicability of adapting stereo and multi-view very\nhigh-resolution satellite-derived 3D data. (See full abstract in the document)\n","authors":["Shengxi Gui"],"pdf_url":"https://arxiv.org/pdf/2404.12487v1.pdf","comment":"Ph.D. Dissertation, Geospatial Data Analytics Lab, The Ohio State\n University, 2024, offical version is available in OhioLINK"},{"id":"http://arxiv.org/abs/2403.12459v2","updated":"2024-04-18T19:55:22Z","published":"2024-03-19T05:30:50Z","title":"Non-negative Contrastive Learning","summary":" Deep representations have shown promising performance when transferred to\ndownstream tasks in a black-box manner. Yet, their inherent lack of\ninterpretability remains a significant challenge, as these features are often\nopaque to human understanding. In this paper, we propose Non-negative\nContrastive Learning (NCL), a renaissance of Non-negative Matrix Factorization\n(NMF) aimed at deriving interpretable features. The power of NCL lies in its\nenforcement of non-negativity constraints on features, reminiscent of NMF's\ncapability to extract features that align closely with sample clusters. NCL not\nonly aligns mathematically well with an NMF objective but also preserves NMF's\ninterpretability attributes, resulting in a more sparse and disentangled\nrepresentation compared to standard contrastive learning (CL). Theoretically,\nwe establish guarantees on the identifiability and downstream generalization of\nNCL. Empirically, we show that these advantages enable NCL to outperform CL\nsignificantly on feature disentanglement, feature selection, as well as\ndownstream classification tasks. 
At last, we show that NCL can be easily\nextended to other learning scenarios and benefit supervised learning as well.\nCode is available at https://github.com/PKU-ML/non_neg.\n","authors":["Yifei Wang","Qi Zhang","Yaoyu Guo","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2403.12459v2.pdf","comment":"22 pages. Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2402.17986v2","updated":"2024-04-18T19:44:53Z","published":"2024-02-28T02:06:11Z","title":"PolyOculus: Simultaneous Multi-view Image-based Novel View Synthesis","summary":" This paper considers the problem of generative novel view synthesis (GNVS),\ngenerating novel, plausible views of a scene given a limited number of known\nviews. Here, we propose a set-based generative model that can simultaneously\ngenerate multiple, self-consistent new views, conditioned on any number of\nviews. Our approach is not limited to generating a single image at a time and\ncan condition on a variable number of views. As a result, when generating a\nlarge number of views, our method is not restricted to a low-order\nautoregressive generation approach and is better able to maintain generated\nimage quality over large sets of images. We evaluate our model on standard NVS\ndatasets and show that it outperforms the state-of-the-art image-based GNVS\nbaselines. Further, we show that the model is capable of generating sets of\nviews that have no natural sequential ordering, like loops and binocular\ntrajectories, and significantly outperforms other methods on such tasks.\n","authors":["Jason J. Yu","Tristan Aumentado-Armstrong","Fereshteh Forghani","Konstantinos G. Derpanis","Marcus A. Brubaker"],"pdf_url":"https://arxiv.org/pdf/2402.17986v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07531v2","updated":"2024-04-18T19:43:25Z","published":"2023-12-12T18:57:46Z","title":"WHAM: Reconstructing World-grounded Humans with Accurate 3D Motion","summary":" The estimation of 3D human motion from video has progressed rapidly but\ncurrent methods still have several key limitations. First, most methods\nestimate the human in camera coordinates. Second, prior work on estimating\nhumans in global coordinates often assumes a flat ground plane and produces\nfoot sliding. Third, the most accurate methods rely on computationally\nexpensive optimization pipelines, limiting their use to offline applications.\nFinally, existing video-based methods are surprisingly less accurate than\nsingle-frame methods. We address these limitations with WHAM (World-grounded\nHumans with Accurate Motion), which accurately and efficiently reconstructs 3D\nhuman motion in a global coordinate system from video. WHAM learns to lift 2D\nkeypoint sequences to 3D using motion capture data and fuses this with video\nfeatures, integrating motion context and visual information. WHAM exploits\ncamera angular velocity estimated from a SLAM method together with human motion\nto estimate the body's global trajectory. We combine this with a contact-aware\ntrajectory refinement method that lets WHAM capture human motion in diverse\nconditions, such as climbing stairs. WHAM outperforms all existing 3D human\nmotion recovery methods across multiple in-the-wild benchmarks. Code will be\navailable for research purposes at http://wham.is.tue.mpg.de/\n","authors":["Soyong Shin","Juyong Kim","Eni Halilaj","Michael J. 
Black"],"pdf_url":"https://arxiv.org/pdf/2312.07531v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16094v2","updated":"2024-04-18T19:38:18Z","published":"2023-11-27T18:59:02Z","title":"Street TryOn: Learning In-the-Wild Virtual Try-On from Unpaired Person\n Images","summary":" Most existing methods for virtual try-on focus on studio person images with a\nlimited range of poses and clean backgrounds. They can achieve plausible\nresults for this studio try-on setting by learning to warp a garment image to\nfit a person's body from paired training data, i.e., garment images paired with\nimages of people wearing the same garment. Such data is often collected from\ncommercial websites, where each garment is demonstrated both by itself and on\nseveral models. By contrast, it is hard to collect paired data for in-the-wild\nscenes, and therefore, virtual try-on for casual images of people with more\ndiverse poses against cluttered backgrounds is rarely studied.\n In this work, we fill the gap by introducing a StreetTryOn benchmark to\nevaluate in-the-wild virtual try-on performance and proposing a novel method\nthat can learn it without paired data, from a set of in-the-wild person images\ndirectly. Our method achieves robust performance across shop and street domains\nusing a novel DensePose warping correction method combined with diffusion-based\nconditional inpainting. Our experiments show competitive performance for\nstandard studio try-on tasks and SOTA performance for street try-on and\ncross-domain try-on tasks.\n","authors":["Aiyu Cui","Jay Mahajan","Viraj Shah","Preeti Gomathinayagam","Chang Liu","Svetlana Lazebnik"],"pdf_url":"https://arxiv.org/pdf/2311.16094v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08514v3","updated":"2024-04-18T19:30:49Z","published":"2024-04-12T14:54:26Z","title":"NIR-Assisted Image Denoising: A Selective Fusion Approach and A\n Real-World Benchmark Dataset","summary":" Despite the significant progress in image denoising, it is still challenging\nto restore fine-scale details while removing noise, especially in extremely\nlow-light environments. Leveraging near-infrared (NIR) images to assist visible\nRGB image denoising shows the potential to address this issue, becoming a\npromising technology. Nonetheless, existing works still struggle with taking\nadvantage of NIR information effectively for real-world image denoising, due to\nthe content inconsistency between NIR-RGB images and the scarcity of real-world\npaired datasets. To alleviate the problem, we propose an efficient Selective\nFusion Module (SFM), which can be plug-and-played into the advanced denoising\nnetworks to merge the deep NIR-RGB features. Specifically, we sequentially\nperform the global and local modulation for NIR and RGB features, and then\nintegrate the two modulated features. Furthermore, we present a Real-world\nNIR-Assisted Image Denoising (Real-NAID) dataset, which covers diverse\nscenarios as well as various noise levels. Extensive experiments on both\nsynthetic and our real-world datasets demonstrate that the proposed method\nachieves better results than state-of-the-art ones. 
The dataset, codes, and\npre-trained models will be publicly available at\nhttps://github.com/ronjonxu/NAID.\n","authors":["Rongjian Xu","Zhilu Zhang","Renlong Wu","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.08514v3.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.00815v2","updated":"2024-04-18T19:22:37Z","published":"2024-03-31T22:18:56Z","title":"Towards Realistic Scene Generation with LiDAR Diffusion Models","summary":" Diffusion models (DMs) excel in photo-realistic image synthesis, but their\nadaptation to LiDAR scene generation poses a substantial hurdle. This is\nprimarily because DMs operating in the point space struggle to preserve the\ncurve-like patterns and 3D geometry of LiDAR scenes, which consumes much of\ntheir representation power. In this paper, we propose LiDAR Diffusion Models\n(LiDMs) to generate LiDAR-realistic scenes from a latent space tailored to\ncapture the realism of LiDAR scenes by incorporating geometric priors into the\nlearning pipeline. Our method targets three major desiderata: pattern realism,\ngeometry realism, and object realism. Specifically, we introduce curve-wise\ncompression to simulate real-world LiDAR patterns, point-wise coordinate\nsupervision to learn scene geometry, and patch-wise encoding for a full 3D\nobject context. With these three core designs, our method achieves competitive\nperformance on unconditional LiDAR generation in 64-beam scenario and state of\nthe art on conditional LiDAR generation, while maintaining high efficiency\ncompared to point-based DMs (up to 107$\\times$ faster). Furthermore, by\ncompressing LiDAR scenes into a latent space, we enable the controllability of\nDMs with various conditions such as semantic maps, camera views, and text\nprompts.\n","authors":["Haoxi Ran","Vitor Guizilini","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00815v2.pdf","comment":"CVPR 2024. Project link: https://lidar-diffusion.github.io"},{"id":"http://arxiv.org/abs/2404.12467v1","updated":"2024-04-18T19:04:27Z","published":"2024-04-18T19:04:27Z","title":"Towards Multi-modal Transformers in Federated Learning","summary":" Multi-modal transformers mark significant progress in different domains, but\nsiloed high-quality data hinders their further improvement. To remedy this,\nfederated learning (FL) has emerged as a promising privacy-preserving paradigm\nfor training models without direct access to the raw data held by different\nclients. Despite its potential, a considerable research direction regarding the\nunpaired uni-modal clients and the transformer architecture in FL remains\nunexplored. To fill this gap, this paper explores a transfer multi-modal\nfederated learning (MFL) scenario within the vision-language domain, where\nclients possess data of various modalities distributed across different\ndatasets. We systematically evaluate the performance of existing methods when a\ntransformer architecture is utilized and introduce a novel framework called\nFederated modality complementary and collaboration (FedCola) by addressing the\nin-modality and cross-modality gaps among clients. 
Through extensive\nexperiments across various FL settings, FedCola demonstrates superior\nperformance over previous approaches, offering new perspectives on future\nfederated training of multi-modal transformers.\n","authors":["Guangyu Sun","Matias Mendieta","Aritra Dutta","Xin Li","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.12467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09611v4","updated":"2024-04-18T18:51:04Z","published":"2024-03-14T17:51:32Z","title":"MM1: Methods, Analysis & Insights from Multimodal LLM Pre-training","summary":" In this work, we discuss building performant Multimodal Large Language Models\n(MLLMs). In particular, we study the importance of various architecture\ncomponents and data choices. Through careful and comprehensive ablations of the\nimage encoder, the vision language connector, and various pre-training data\nchoices, we identified several crucial design lessons. For example, we\ndemonstrate that for large-scale multimodal pre-training using a careful mix of\nimage-caption, interleaved image-text, and text-only data is crucial for\nachieving state-of-the-art (SOTA) few-shot results across multiple benchmarks,\ncompared to other published pre-training results. Further, we show that the\nimage encoder together with image resolution and the image token count has\nsubstantial impact, while the vision-language connector design is of\ncomparatively negligible importance. By scaling up the presented recipe, we\nbuild MM1, a family of multimodal models up to 30B parameters, including both\ndense models and mixture-of-experts (MoE) variants, that are SOTA in\npre-training metrics and achieve competitive performance after supervised\nfine-tuning on a range of established multimodal benchmarks. Thanks to\nlarge-scale pre-training, MM1 enjoys appealing properties such as enhanced\nin-context learning, and multi-image reasoning, enabling few-shot\nchain-of-thought prompting.\n","authors":["Brandon McKinzie","Zhe Gan","Jean-Philippe Fauconnier","Sam Dodge","Bowen Zhang","Philipp Dufter","Dhruti Shah","Xianzhi Du","Futang Peng","Floris Weers","Anton Belyi","Haotian Zhang","Karanjeet Singh","Doug Kang","Ankur Jain","Hongyu Hè","Max Schwarzer","Tom Gunter","Xiang Kong","Aonan Zhang","Jianyu Wang","Chong Wang","Nan Du","Tao Lei","Sam Wiseman","Guoli Yin","Mark Lee","Zirui Wang","Ruoming Pang","Peter Grasch","Alexander Toshev","Yinfei Yang"],"pdf_url":"https://arxiv.org/pdf/2403.09611v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07398v2","updated":"2024-04-18T18:49:38Z","published":"2024-01-15T00:27:41Z","title":"Cross Domain Early Crop Mapping using CropSTGAN","summary":" Driven by abundant satellite imagery, machine learning-based approaches have\nrecently been promoted to generate high-resolution crop cultivation maps to\nsupport many agricultural applications. One of the major challenges faced by\nthese approaches is the limited availability of ground truth labels. In the\nabsence of ground truth, existing work usually adopts the \"direct transfer\nstrategy\" that trains a classifier using historical labels collected from other\nregions and then applies the trained model to the target region. Unfortunately,\nthe spectral features of crops exhibit inter-region and inter-annual\nvariability due to changes in soil composition, climate conditions, and crop\nprogress, the resultant models perform poorly on new and unseen regions or\nyears. 
Despite recent efforts, such as the application of the deep adaptation\nneural network (DANN) model structure in the deep adaptation crop\nclassification network (DACCN), to tackle the above cross-domain challenges,\ntheir effectiveness diminishes significantly when there is a large\ndissimilarity between the source and target regions. This paper introduces the\nCrop Mapping Spectral-temporal Generative Adversarial Neural Network\n(CropSTGAN), a novel solution for cross-domain challenges, that doesn't require\ntarget domain labels. CropSTGAN learns to transform the target domain's\nspectral features to those of the source domain, effectively bridging large\ndissimilarities. Additionally, it employs an identity loss to maintain the\nintrinsic local structure of the data. Comprehensive experiments across various\nregions and years demonstrate the benefits and effectiveness of the proposed\napproach. In experiments, CropSTGAN is benchmarked against various\nstate-of-the-art (SOTA) methods. Notably, CropSTGAN significantly outperforms\nthese methods in scenarios with large data distribution dissimilarities between\nthe target and source domains.\n","authors":["Yiqun Wang","Hui Huang","Radu State"],"pdf_url":"https://arxiv.org/pdf/2401.07398v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15969v2","updated":"2024-04-18T18:48:31Z","published":"2024-01-29T08:58:07Z","title":"Routers in Vision Mixture of Experts: An Empirical Study","summary":" Mixture-of-Experts (MoE) models are a promising way to scale up model\ncapacity without significantly increasing computational cost. A key component\nof MoEs is the router, which decides which subset of parameters (experts)\nprocess which feature embeddings (tokens). In this paper, we present a\ncomprehensive study of routers in MoEs for computer vision tasks. We introduce\na unified MoE formulation that subsumes different MoEs with two parametric\nrouting tensors. This formulation covers both sparse MoE, which uses a binary\nor hard assignment between experts and tokens, and soft MoE, which uses a soft\nassignment between experts and weighted combinations of tokens. Routers for\nsparse MoEs can be further grouped into two variants: Token Choice, which\nmatches experts to each token, and Expert Choice, which matches tokens to each\nexpert. We conduct head-to-head experiments with 6 different routers, including\nexisting routers from prior work and new ones we introduce. We show that (i)\nmany routers originally developed for language modeling can be adapted to\nperform strongly in vision tasks, (ii) in sparse MoE, Expert Choice routers\ngenerally outperform Token Choice routers, and (iii) soft MoEs generally\noutperform sparse MoEs with a fixed compute budget. These results provide new\ninsights regarding the crucial role of routers in vision MoE models.\n","authors":["Tianlin Liu","Mathieu Blondel","Carlos Riquelme","Joan Puigcerver"],"pdf_url":"https://arxiv.org/pdf/2401.15969v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03452v5","updated":"2024-04-18T18:26:39Z","published":"2024-03-06T04:36:43Z","title":"D4C Glove-train: Solving the RPM and Bongard-logo Problem by\n Circumscribing and Building Distribution for Concepts","summary":" This paper achieves noteworthy progress in the realm of abstract reasoning,\nparticularly in addressing Raven's Progressive Matrices (RPM) and Bongard-Logo\nchallenges. Initially, we introduce Lico-Net, a novel baseline model that\nresolves RPM problems with remarkable accuracy. 
Leveraging this foundation, we\nadvance with the D3C approach, which advocates representing the underlying\nconcepts in abstract reasoning problems through distributions. This perspective\nenhances the performance of both Lico-Net and a baseline model excelling in\nBongard-Logo tasks. To bolster the computational efficiency of D3C, we present\nthe D3C-cos variant, offering a streamlined yet precise solution. Furthermore,\nwe propose the D2C method, redefining conceptual boundaries within these\ndomains and bridging the divide between high-level abstractions and their\nlower-dimensional counterparts. Finally, we extend our methodology to D4C,\nemploying adversarial techniques to refine conceptual boundaries further and\ndemonstrate substantial improvements in both RPM and Bongard-Logo challenges.\nOverall, our contributions present a fresh outlook and practical advancements\nin the field of abstract reasoning.\n","authors":["Ruizhuo Song","Beiming Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.03452v5.pdf","comment":"18 pages, 19 figures, 6 tables"},{"id":"http://arxiv.org/abs/2404.12450v1","updated":"2024-04-18T18:25:00Z","published":"2024-04-18T18:25:00Z","title":"Enhancing AI Diagnostics: Autonomous Lesion Masking via Semi-Supervised\n Deep Learning","summary":" This study presents an unsupervised domain adaptation method aimed at\nautonomously generating image masks outlining regions of interest (ROIs) for\ndifferentiating breast lesions in breast ultrasound (US) imaging. Our\nsemi-supervised learning approach utilizes a primitive model trained on a small\npublic breast US dataset with true annotations. This model is then iteratively\nrefined for the domain adaptation task, generating pseudo-masks for our\nprivate, unannotated breast US dataset. The dataset, twice the size of the\npublic one, exhibits considerable variability in image acquisition perspectives\nand demographic representation, posing a domain-shift challenge. Unlike typical\ndomain adversarial training, we employ downstream classification outcomes as a\nbenchmark to guide the updating of pseudo-masks in subsequent iterations. We\nfound the classification precision to be highly correlated with the\ncompleteness of the generated ROIs, which promotes the explainability of the\ndeep learning classification model. Preliminary findings demonstrate the\nefficacy and reliability of this approach in streamlining the ROI annotation\nprocess, thereby enhancing the classification and localization of breast\nlesions for more precise and interpretable diagnoses.\n","authors":["Ting-Ruen Wei","Michele Hell","Dang Bich Thuy Le","Aren Vierra","Ran Pang","Mahesh Patel","Young Kang","Yuling Yan"],"pdf_url":"https://arxiv.org/pdf/2404.12450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12440v1","updated":"2024-04-18T18:01:15Z","published":"2024-04-18T18:01:15Z","title":"Spot-Compose: A Framework for Open-Vocabulary Object Retrieval and\n Drawer Manipulation in Point Clouds","summary":" In recent years, modern techniques in deep learning and large-scale datasets\nhave led to impressive progress in 3D instance segmentation, grasp pose\nestimation, and robotics. This allows for accurate detection directly in 3D\nscenes, object- and environment-aware grasp prediction, as well as robust and\nrepeatable robotic manipulation. This work aims to integrate these recent\nmethods into a comprehensive framework for robotic interaction and manipulation\nin human-centric environments. 
Specifically, we leverage 3D reconstructions\nfrom a commodity 3D scanner for open-vocabulary instance segmentation,\nalongside grasp pose estimation, to demonstrate dynamic picking of objects, and\nopening of drawers. We show the performance and robustness of our model in two\nsets of real-world experiments including dynamic object retrieval and drawer\nopening, reporting a 51% and 82% success rate respectively. Code of our\nframework as well as videos are available on: https://spot-compose.github.io/.\n","authors":["Oliver Lemke","Zuria Bauer","René Zurbrügg","Marc Pollefeys","Francis Engelmann","Hermann Blum"],"pdf_url":"https://arxiv.org/pdf/2404.12440v1.pdf","comment":"Accepted at ICRA 2024 Workshops. Code and videos available at\n https://spot-compose.github.io/"},{"id":"http://arxiv.org/abs/2404.12500v1","updated":"2024-04-18T20:43:08Z","published":"2024-04-18T20:43:08Z","title":"UIClip: A Data-driven Model for Assessing User Interface Design","summary":" User interface (UI) design is a difficult yet important task for ensuring the\nusability, accessibility, and aesthetic qualities of applications. In our\npaper, we develop a machine-learned model, UIClip, for assessing the design\nquality and visual relevance of a UI given its screenshot and natural language\ndescription. To train UIClip, we used a combination of automated crawling,\nsynthetic augmentation, and human ratings to construct a large-scale dataset of\nUIs, collated by description and ranked by design quality. Through training on\nthe dataset, UIClip implicitly learns properties of good and bad designs by i)\nassigning a numerical score that represents a UI design's relevance and quality\nand ii) providing design suggestions. In an evaluation that compared the\noutputs of UIClip and other baselines to UIs rated by 12 human designers, we\nfound that UIClip achieved the highest agreement with ground-truth rankings.\nFinally, we present three example applications that demonstrate how UIClip can\nfacilitate downstream applications that rely on instantaneous assessment of UI\ndesign quality: i) UI code generation, ii) UI design tips generation, and iii)\nquality-aware UI example search.\n","authors":["Jason Wu","Yi-Hao Peng","Amanda Li","Amanda Swearngin","Jeffrey P. Bigham","Jeffrey Nichols"],"pdf_url":"https://arxiv.org/pdf/2404.12500v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05809v2","updated":"2024-04-18T09:47:48Z","published":"2024-02-08T16:47:43Z","title":"You Only Need One Color Space: An Efficient Network for Low-light Image\n Enhancement","summary":" Low-Light Image Enhancement (LLIE) task tends to restore the details and\nvisual information from corrupted low-light images. Most existing methods learn\nthe mapping function between low/normal-light images by Deep Neural Networks\n(DNNs) on sRGB and HSV color space. Nevertheless, enhancement involves\namplifying image signals, and applying these color spaces to low-light images\nwith a low signal-to-noise ratio can introduce sensitivity and instability into\nthe enhancement process. Consequently, this results in the presence of color\nartifacts and brightness artifacts in the enhanced images. To alleviate this\nproblem, we propose a novel trainable color space, named\nHorizontal/Vertical-Intensity (HVI). It not only decouples brightness and color\nfrom RGB channels to mitigate the instability during enhancement but also\nadapts to low-light images in different illumination ranges due to the\ntrainable parameters. 
Further, we design a novel Color and Intensity Decoupling\nNetwork (CIDNet) with two branches dedicated to processing the decoupled image\nbrightness and color in the HVI space. Within CIDNet, we introduce the\nLightweight Cross-Attention (LCA) module to facilitate interaction between\nimage structure and content information in both branches, while also\nsuppressing noise in low-light images. Finally, we conducted 22 quantitative\nand qualitative experiments to show that the proposed CIDNet outperforms the\nstate-of-the-art methods on 11 datasets. The code is available at\nhttps://github.com/Fediory/HVI-CIDNet.\n","authors":["Yixu Feng","Cheng Zhang","Pei Wang","Peng Wu","Qingsen Yan","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.05809v2.pdf","comment":null}]},"2024-04-19T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.13046v1","updated":"2024-04-19T17:59:48Z","published":"2024-04-19T17:59:48Z","title":"MoVA: Adapting Mixture of Vision Experts to Multimodal Context","summary":" As the key component in multimodal large language models (MLLMs), the ability\nof the visual encoder greatly affects MLLM's understanding on diverse image\ncontent. Although some large-scale pretrained vision encoders such as vision\nencoders in CLIP and DINOv2 have brought promising performance, we found that\nthere is still no single vision encoder that can dominate various image content\nunderstanding, e.g., the CLIP vision encoder leads to outstanding results on\ngeneral image understanding but poor performance on document or chart content.\nTo alleviate the bias of CLIP vision encoder, we first delve into the inherent\nbehavior of different pre-trained vision encoders and then propose the MoVA, a\npowerful and novel MLLM, adaptively routing and fusing task-specific vision\nexperts with a coarse-to-fine mechanism. In the coarse-grained stage, we design\na context-aware expert routing strategy to dynamically select the most suitable\nvision experts according to the user instruction, input image, and expertise of\nvision experts. This benefits from the powerful model function understanding\nability of the large language model (LLM) equipped with expert-routing low-rank\nadaptation (LoRA). In the fine-grained stage, we elaborately conduct the\nmixture-of-vision-expert adapter (MoV-Adapter) to extract and fuse\ntask-specific knowledge from various experts. This coarse-to-fine paradigm\neffectively leverages representations from experts based on multimodal context\nand model expertise, further enhancing the generalization ability. We conduct\nextensive experiments to evaluate the effectiveness of the proposed approach.\nWithout any bells and whistles, MoVA can achieve significant performance gains\nover current state-of-the-art methods in a wide range of challenging multimodal\nbenchmarks. Codes and models will be available at\nhttps://github.com/TempleX98/MoVA.\n","authors":["Zhuofan Zong","Bingqi Ma","Dazhong Shen","Guanglu Song","Hao Shao","Dongzhi Jiang","Hongsheng Li","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.13046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13044v1","updated":"2024-04-19T17:58:04Z","published":"2024-04-19T17:58:04Z","title":"Unified Scene Representation and Reconstruction for 3D Large Language\n Models","summary":" Enabling Large Language Models (LLMs) to interact with 3D environments is\nchallenging. Existing approaches extract point clouds either from ground truth\n(GT) geometry or 3D scenes reconstructed by auxiliary models. 
Text-image\naligned 2D features from CLIP are then lifted to point clouds, which serve as\ninputs for LLMs. However, this solution lacks the establishment of 3D\npoint-to-point connections, leading to a deficiency of spatial structure\ninformation. Concurrently, the absence of integration and unification between\nthe geometric and semantic representations of the scene culminates in a\ndiminished level of 3D scene understanding. In this paper, we demonstrate the\nimportance of having a unified scene representation and reconstruction\nframework, which is essential for LLMs in 3D scenes. Specifically, we introduce\nUni3DR^2 extracts 3D geometric and semantic aware representation features via\nthe frozen pre-trained 2D foundation models (e.g., CLIP and SAM) and a\nmulti-scale aggregate 3D decoder. Our learned 3D representations not only\ncontribute to the reconstruction process but also provide valuable knowledge\nfor LLMs. Experimental results validate that our Uni3DR^2 yields convincing\ngains over the baseline on the 3D reconstruction dataset ScanNet (increasing\nF-Score by +1.8\\%). When applied to LLMs, our Uni3DR^2-LLM exhibits superior\nperformance over the baseline on the 3D vision-language understanding dataset\nScanQA (increasing BLEU-1 by +4.0\\% and +4.2\\% on the val set and test set,\nrespectively). Furthermore, it outperforms the state-of-the-art method that\nuses additional GT point clouds on both ScanQA and 3DMV-VQA.\n","authors":["Tao Chu","Pan Zhang","Xiaoyi Dong","Yuhang Zang","Qiong Liu","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.13044v1.pdf","comment":"Project Page: https://chtsy.github.io/uni3drr-page/"},{"id":"http://arxiv.org/abs/2404.13043v1","updated":"2024-04-19T17:57:29Z","published":"2024-04-19T17:57:29Z","title":"Data Alignment for Zero-Shot Concept Generation in Dermatology AI","summary":" AI in dermatology is evolving at a rapid pace but the major limitation to\ntraining trustworthy classifiers is the scarcity of data with ground-truth\nconcept level labels, which are meta-labels semantically meaningful to humans.\nFoundation models like CLIP providing zero-shot capabilities can help alleviate\nthis challenge by leveraging vast amounts of image-caption pairs available on\nthe internet. CLIP can be fine-tuned using domain specific image-caption pairs\nto improve classification performance. However, CLIP's pre-training data is not\nwell-aligned with the medical jargon that clinicians use to perform diagnoses.\nThe development of large language models (LLMs) in recent years has led to the\npossibility of leveraging the expressive nature of these models to generate\nrich text. Our goal is to use these models to generate caption text that aligns\nwell with both the clinical lexicon and with the natural human language used in\nCLIP's pre-training data. Starting with captions used for images in PubMed\narticles, we extend them by passing the raw captions through an LLM fine-tuned\non the field's several textbooks. 
We find that using captions generated by an\nexpressive fine-tuned LLM like GPT-3.5 improves downstream zero-shot concept\nclassification performance.\n","authors":["Soham Gadgil","Mahtab Bigverdi"],"pdf_url":"https://arxiv.org/pdf/2404.13043v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13040v1","updated":"2024-04-19T17:53:43Z","published":"2024-04-19T17:53:43Z","title":"Analysis of Classifier-Free Guidance Weight Schedulers","summary":" Classifier-Free Guidance (CFG) enhances the quality and condition adherence\nof text-to-image diffusion models. It operates by combining the conditional and\nunconditional predictions using a fixed weight. However, recent works vary the\nweights throughout the diffusion process, reporting superior results but\nwithout providing any rationale or analysis. By conducting comprehensive\nexperiments, this paper provides insights into CFG weight schedulers. Our\nfindings suggest that simple, monotonically increasing weight schedulers\nconsistently lead to improved performances, requiring merely a single line of\ncode. In addition, more complex parametrized schedulers can be optimized for\nfurther improvement, but do not generalize across different models and tasks.\n","authors":["Xi Wang","Nicolas Dufour","Nefeli Andreou","Marie-Paule Cani","Victoria Fernandez Abrevaya","David Picard","Vicky Kalogeiton"],"pdf_url":"https://arxiv.org/pdf/2404.13040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13039v1","updated":"2024-04-19T17:51:52Z","published":"2024-04-19T17:51:52Z","title":"LaPA: Latent Prompt Assist Model For Medical Visual Question Answering","summary":" Medical visual question answering (Med-VQA) aims to automate the prediction\nof correct answers for medical images and questions, thereby assisting\nphysicians in reducing repetitive tasks and alleviating their workload.\nExisting approaches primarily focus on pre-training models using additional and\ncomprehensive datasets, followed by fine-tuning to enhance performance in\ndownstream tasks. However, there is also significant value in exploring\nexisting models to extract clinically relevant information. In this paper, we\npropose the Latent Prompt Assist model (LaPA) for medical visual question\nanswering. Firstly, we design a latent prompt generation module to generate the\nlatent prompt with the constraint of the target answer. Subsequently, we\npropose a multi-modal fusion block with latent prompt fusion module that\nutilizes the latent prompt to extract clinical-relevant information from\nuni-modal and multi-modal features. Additionally, we introduce a prior\nknowledge fusion module to integrate the relationship between diseases and\norgans with the clinical-relevant information. Finally, we combine the final\nintegrated information with image-language cross-modal information to predict\nthe final answers. Experimental results on three publicly available Med-VQA\ndatasets demonstrate that LaPA outperforms the state-of-the-art model ARL,\nachieving improvements of 1.83%, 0.63%, and 1.80% on VQA-RAD, SLAKE, and\nVQA-2019, respectively. 
The code is publicly available at\nhttps://github.com/GaryGuTC/LaPA_model.\n","authors":["Tiancheng Gu","Kaicheng Yang","Dongnan Liu","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2404.13039v1.pdf","comment":"10 pages, 4 figures, Accepted by CVPRW2024"},{"id":"http://arxiv.org/abs/2404.13026v1","updated":"2024-04-19T17:41:05Z","published":"2024-04-19T17:41:05Z","title":"PhysDreamer: Physics-Based Interaction with 3D Objects via Video\n Generation","summary":" Realistic object interactions are crucial for creating immersive virtual\nexperiences, yet synthesizing realistic 3D object dynamics in response to novel\ninteractions remains a significant challenge. Unlike unconditional or\ntext-conditioned dynamics generation, action-conditioned dynamics requires\nperceiving the physical material properties of objects and grounding the 3D\nmotion prediction on these properties, such as object stiffness. However,\nestimating physical material properties is an open problem due to the lack of\nmaterial ground-truth data, as measuring these properties for real objects is\nhighly difficult. We present PhysDreamer, a physics-based approach that endows\nstatic 3D objects with interactive dynamics by leveraging the object dynamics\npriors learned by video generation models. By distilling these priors,\nPhysDreamer enables the synthesis of realistic object responses to novel\ninteractions, such as external forces or agent manipulations. We demonstrate\nour approach on diverse examples of elastic objects and evaluate the realism of\nthe synthesized interactions through a user study. PhysDreamer takes a step\ntowards more engaging and realistic virtual experiences by enabling static 3D\nobjects to dynamically respond to interactive stimuli in a physically plausible\nmanner. See our project page at https://physdreamer.github.io/.\n","authors":["Tianyuan Zhang","Hong-Xing Yu","Rundi Wu","Brandon Y. Feng","Changxi Zheng","Noah Snavely","Jiajun Wu","William T. Freeman"],"pdf_url":"https://arxiv.org/pdf/2404.13026v1.pdf","comment":"Project website at: https://physdreamer.github.io/"},{"id":"http://arxiv.org/abs/2404.13024v1","updated":"2024-04-19T17:39:50Z","published":"2024-04-19T17:39:50Z","title":"BANF: Band-limited Neural Fields for Levels of Detail Reconstruction","summary":" Largely due to their implicit nature, neural fields lack a direct mechanism\nfor filtering, as Fourier analysis from discrete signal processing is not\ndirectly applicable to these representations. Effective filtering of neural\nfields is critical to enable level-of-detail processing in downstream\napplications, and support operations that involve sampling the field on regular\ngrids (e.g. marching cubes). Existing methods that attempt to decompose neural\nfields in the frequency domain either resort to heuristics or require extensive\nmodifications to the neural field architecture. We show that via a simple\nmodification, one can obtain neural fields that are low-pass filtered, and in\nturn show how this can be exploited to obtain a frequency decomposition of the\nentire signal. 
We demonstrate the validity of our technique by investigating\nlevel-of-detail reconstruction, and showing how coarser representations can be\ncomputed effectively.\n","authors":["Ahan Shabanov","Shrisudhan Govindarajan","Cody Reading","Lily Goli","Daniel Rebain","Kwang Moo Yi","Andrea Tagliasacchi"],"pdf_url":"https://arxiv.org/pdf/2404.13024v1.pdf","comment":"Project Page: https://theialab.github.io/banf"},{"id":"http://arxiv.org/abs/2404.13016v1","updated":"2024-04-19T17:25:43Z","published":"2024-04-19T17:25:43Z","title":"Optimizing Calibration by Gaining Aware of Prediction Correctness","summary":" Model calibration aims to align confidence with prediction correctness. The\nCross-Entropy CE) loss is widely used for calibrator training, which enforces\nthe model to increase confidence on the ground truth class. However, we find\nthe CE loss has intrinsic limitations. For example, for a narrow\nmisclassification, a calibrator trained by the CE loss often produces high\nconfidence on the wrongly predicted class (e.g., a test sample is wrongly\nclassified and its softmax score on the ground truth class is around 0.4),\nwhich is undesirable. In this paper, we propose a new post-hoc calibration\nobjective derived from the aim of calibration. Intuitively, the proposed\nobjective function asks that the calibrator decrease model confidence on\nwrongly predicted samples and increase confidence on correctly predicted\nsamples. Because a sample itself has insufficient ability to indicate\ncorrectness, we use its transformed versions (e.g., rotated, greyscaled and\ncolor-jittered) during calibrator training. Trained on an in-distribution\nvalidation set and tested with isolated, individual test samples, our method\nachieves competitive calibration performance on both in-distribution and\nout-of-distribution test sets compared with the state of the art. Further, our\nanalysis points out the difference between our method and commonly used\nobjectives such as CE loss and mean square error loss, where the latters\nsometimes deviates from the calibration aim.\n","authors":["Yuchi Liu","Lei Wang","Yuli Zou","James Zou","Liang Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.13016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13013v1","updated":"2024-04-19T17:22:51Z","published":"2024-04-19T17:22:51Z","title":"Groma: Localized Visual Tokenization for Grounding Multimodal Large\n Language Models","summary":" We introduce Groma, a Multimodal Large Language Model (MLLM) with grounded\nand fine-grained visual perception ability. Beyond holistic image\nunderstanding, Groma is adept at region-level tasks such as region captioning\nand visual grounding. Such capabilities are built upon a localized visual\ntokenization mechanism, where an image input is decomposed into regions of\ninterest and subsequently encoded into region tokens. By integrating region\ntokens into user instructions and model responses, we seamlessly enable Groma\nto understand user-specified region inputs and ground its textual output to\nimages. Besides, to enhance the grounded chat ability of Groma, we curate a\nvisually grounded instruction dataset by leveraging the powerful GPT-4V and\nvisual prompting techniques. 
Compared with MLLMs that rely on the language\nmodel or external module for localization, Groma consistently demonstrates\nsuperior performances in standard referring and grounding benchmarks,\nhighlighting the advantages of embedding localization into image tokenization.\nProject page: https://groma-mllm.github.io/.\n","authors":["Chuofan Ma","Yi Jiang","Jiannan Wu","Zehuan Yuan","Xiaojuan Qi"],"pdf_url":"https://arxiv.org/pdf/2404.13013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13002v1","updated":"2024-04-19T16:59:04Z","published":"2024-04-19T16:59:04Z","title":"Towards Robust Ferrous Scrap Material Classification with Deep Learning\n and Conformal Prediction","summary":" In the steel production domain, recycling ferrous scrap is essential for\nenvironmental and economic sustainability, as it reduces both energy\nconsumption and greenhouse gas emissions. However, the classification of scrap\nmaterials poses a significant challenge, requiring advancements in automation\ntechnology. Additionally, building trust among human operators is a major\nobstacle. Traditional approaches often fail to quantify uncertainty and lack\nclarity in model decision-making, which complicates acceptance. In this\narticle, we describe how conformal prediction can be employed to quantify\nuncertainty and add robustness in scrap classification. We have adapted the\nSplit Conformal Prediction technique to seamlessly integrate with\nstate-of-the-art computer vision models, such as the Vision Transformer (ViT),\nSwin Transformer, and ResNet-50, while also incorporating Explainable\nArtificial Intelligence (XAI) methods. We evaluate the approach using a\ncomprehensive dataset of 8147 images spanning nine ferrous scrap classes. The\napplication of the Split Conformal Prediction method allowed for the\nquantification of each model's uncertainties, which enhanced the understanding\nof predictions and increased the reliability of the results. Specifically, the\nSwin Transformer model demonstrated more reliable outcomes than the others, as\nevidenced by its smaller average size of prediction sets and achieving an\naverage classification accuracy exceeding 95%. Furthermore, the Score-CAM\nmethod proved highly effective in clarifying visual features, significantly\nenhancing the explainability of the classification decisions.\n","authors":["Paulo Henrique dos Santos","Valéria de Carvalho Santos","Eduardo José da Silva Luz"],"pdf_url":"https://arxiv.org/pdf/2404.13002v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13000v1","updated":"2024-04-19T16:55:12Z","published":"2024-04-19T16:55:12Z","title":"RadRotator: 3D Rotation of Radiographs with Diffusion Models","summary":" Transforming two-dimensional (2D) images into three-dimensional (3D) volumes\nis a well-known yet challenging problem for the computer vision community. In\nthe medical domain, a few previous studies attempted to convert two or more\ninput radiographs into computed tomography (CT) volumes. Following their\neffort, we introduce a diffusion model-based technology that can rotate the\nanatomical content of any input radiograph in 3D space, potentially enabling\nthe visualization of the entire anatomical content of the radiograph from any\nviewpoint in 3D. Similar to previous studies, we used CT volumes to create\nDigitally Reconstructed Radiographs (DRRs) as the training data for our model.\nHowever, we addressed two significant limitations encountered in previous\nstudies: 1. 
We utilized conditional diffusion models with classifier-free\nguidance instead of Generative Adversarial Networks (GANs) to achieve higher\nmode coverage and improved output image quality, with the only trade-off being\nslower inference time, which is often less critical in medical applications;\nand 2. We demonstrated that the unreliable output of style transfer deep\nlearning (DL) models, such as Cycle-GAN, to transfer the style of actual\nradiographs to DRRs could be replaced with a simple yet effective training\ntransformation that randomly changes the pixel intensity histograms of the\ninput and ground-truth imaging data during training. This transformation makes\nthe diffusion model agnostic to any distribution variations of the input data\npixel intensity, enabling the reliable training of a DL model on input DRRs and\napplying the exact same model to conventional radiographs (or DRRs) during\ninference.\n","authors":["Pouria Rouzrokh","Bardia Khosravi","Shahriar Faghani","Kellen L. Mulford","Michael J. Taunton","Bradley J. Erickson","Cody C. Wyles"],"pdf_url":"https://arxiv.org/pdf/2404.13000v1.pdf","comment":"Website: https://pouriarouzrokh.github.io/RadRotator Online demo:\n https://huggingface.co/spaces/Pouriarouzrokh/RadRotator Article information:\n 16 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.11769v2","updated":"2024-04-19T16:50:05Z","published":"2024-04-17T21:52:21Z","title":"QGen: On the Ability to Generalize in Quantization Aware Training","summary":" Quantization lowers memory usage, computational requirements, and latency by\nutilizing fewer bits to represent model weights and activations. In this work,\nwe investigate the generalization properties of quantized neural networks, a\ncharacteristic that has received little attention despite its implications on\nmodel performance. In particular, first, we develop a theoretical model for\nquantization in neural networks and demonstrate how quantization functions as a\nform of regularization. Second, motivated by recent work connecting the\nsharpness of the loss landscape and generalization, we derive an approximate\nbound for the generalization of quantized models conditioned on the amount of\nquantization noise. We then validate our hypothesis by experimenting with over\n2000 models trained on CIFAR-10, CIFAR-100, and ImageNet datasets on\nconvolutional and transformer-based models.\n","authors":["MohammadHossein AskariHemmat","Ahmadreza Jeddi","Reyhane Askari Hemmat","Ivan Lazarevich","Alexander Hoffman","Sudhakar Sah","Ehsan Saboori","Yvon Savaria","Jean-Pierre David"],"pdf_url":"https://arxiv.org/pdf/2404.11769v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12986v1","updated":"2024-04-19T16:36:21Z","published":"2024-04-19T16:36:21Z","title":"Nuclei Instance Segmentation of Cryosectioned H&E Stained Histological\n Images using Triple U-Net Architecture","summary":" Nuclei instance segmentation is crucial in oncological diagnosis and cancer\npathology research. H&E stained images are commonly used for medical diagnosis,\nbut pre-processing is necessary before using them for image processing tasks.\nTwo principal pre-processing methods are formalin-fixed paraffin-embedded\nsamples (FFPE) and frozen tissue samples (FS). While FFPE is widely used, it is\ntime-consuming, while FS samples can be processed quickly. Analyzing H&E\nstained images derived from fast sample preparation, staining, and scanning can\npose difficulties due to the swift process, which can result in the degradation\nof image quality. 
This paper proposes a method that leverages the unique\noptical characteristics of H&E stained images. A three-branch U-Net\narchitecture has been implemented, where each branch contributes to the final\nsegmentation results. The process includes applying watershed algorithm to\nseparate overlapping regions and enhance accuracy. The Triple U-Net\narchitecture comprises an RGB branch, a Hematoxylin branch, and a Segmentation\nbranch. This study focuses on a novel dataset named CryoNuSeg. The results\nobtained through robust experiments outperform the state-of-the-art results\nacross various metrics. The benchmark score for this dataset is AJI 52.5 and PQ\n47.7, achieved through the implementation of U-Net Architecture. However, the\nproposed Triple U-Net architecture achieves an AJI score of 67.41 and PQ of\n50.56. The proposed architecture improves more on AJI than other evaluation\nmetrics, which further justifies the superiority of the Triple U-Net\narchitecture over the baseline U-Net model, as AJI is a more strict evaluation\nmetric. The use of the three-branch U-Net model, followed by watershed\npost-processing, significantly surpasses the benchmark scores, showing\nsubstantial improvement in the AJI score\n","authors":["Zarif Ahmed","Chowdhury Nur E Alam Siddiqi","Fardifa Fathmiul Alam","Tasnim Ahmed","Tareque Mohmud Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2404.12986v1.pdf","comment":"To be published in \"6th IVPR & 11th ICIEV\""},{"id":"http://arxiv.org/abs/2301.00812v5","updated":"2024-04-19T16:10:40Z","published":"2022-12-16T01:04:52Z","title":"One-shot skill assessment in high-stakes domains with limited data via\n meta learning","summary":" Deep Learning (DL) has achieved robust competency assessment in various\nhigh-stakes fields. However, the applicability of DL models is often hampered\nby their substantial data requirements and confinement to specific training\ndomains. This prevents them from transitioning to new tasks where data is\nscarce. Therefore, domain adaptation emerges as a critical element for the\npractical implementation of DL in real-world scenarios. Herein, we introduce\nA-VBANet, a novel meta-learning model capable of delivering domain-agnostic\nskill assessment via one-shot learning. Our methodology has been tested by\nassessing surgical skills on five laparoscopic and robotic simulators and\nreal-life laparoscopic cholecystectomy. Our model successfully adapted with\naccuracies up to 99.5% in one-shot and 99.9% in few-shot settings for simulated\ntasks and 89.7% for laparoscopic cholecystectomy. This study marks the first\ninstance of a domain-agnostic methodology for skill assessment in critical\nfields setting a precedent for the broad application of DL across diverse\nreal-life domains with limited data.\n","authors":["Erim Yanik","Steven Schwaitzberg","Gene Yang","Xavier Intes","Jack Norfleet","Matthew Hackett","Suvranu De"],"pdf_url":"https://arxiv.org/pdf/2301.00812v5.pdf","comment":"23 pages (Main Manuscript + Supplementary Materials + Arxiv Logs), 4\n figures (+2 Supplementary Figures), 2 tables (+5 Supplementary Tables)"},{"id":"http://arxiv.org/abs/2404.12973v1","updated":"2024-04-19T16:01:00Z","published":"2024-04-19T16:01:00Z","title":"Cross-modal Diffusion Modelling for Super-resolved Spatial\n Transcriptomics","summary":" The recent advancement of spatial transcriptomics (ST) allows to characterize\nspatial gene expression within tissue for discovery research. 
However, current\nST platforms suffer from low resolution, hindering in-depth understanding of\nspatial gene expression. Super-resolution approaches promise to enhance ST maps\nby integrating histology images with gene expressions of profiled tissue spots.\nHowever, current super-resolution methods are limited by restoration\nuncertainty and mode collapse. Although diffusion models have shown promise in\ncapturing complex interactions between multi-modal conditions, it remains a\nchallenge to integrate histology images and gene expression for super-resolved\nST maps. This paper proposes a cross-modal conditional diffusion model for\nsuper-resolving ST maps with the guidance of histology images. Specifically, we\ndesign a multi-modal disentangling network with cross-modal adaptive modulation\nto utilize complementary information from histology images and spatial gene\nexpression. Moreover, we propose a dynamic cross-attention modelling strategy\nto extract hierarchical cell-to-tissue information from histology images.\nLastly, we propose a co-expression-based gene-correlation graph network to\nmodel the co-expression relationship of multiple genes. Experiments show that\nour method outperforms other state-of-the-art methods in ST super-resolution on\nthree public datasets.\n","authors":["Xiaofei Wang","Xingxu Huang","Stephen J. Price","Chao Li"],"pdf_url":"https://arxiv.org/pdf/2404.12973v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12966v1","updated":"2024-04-19T15:53:27Z","published":"2024-04-19T15:53:27Z","title":"Eyes Can Deceive: Benchmarking Counterfactual Reasoning Abilities of\n Multi-modal Large Language Models","summary":" Counterfactual reasoning, as a crucial manifestation of human intelligence,\nrefers to making presuppositions based on established facts and extrapolating\npotential outcomes. Existing multimodal large language models (MLLMs) have\nexhibited impressive cognitive and reasoning capabilities, which have been\nexamined across a wide range of Visual Question Answering (VQA) benchmarks.\nNevertheless, how will existing MLLMs perform when faced with counterfactual\nquestions? To answer this question, we first curate a novel\n\\textbf{C}ounter\\textbf{F}actual \\textbf{M}ulti\\textbf{M}odal reasoning\nbenchmark, abbreviated as \\textbf{CFMM}, to systematically assess the\ncounterfactual reasoning capabilities of MLLMs. Our CFMM comprises six\nchallenging tasks, each including hundreds of carefully human-labeled\ncounterfactual questions, to evaluate MLLM's counterfactual reasoning\ncapabilities across diverse aspects. Through experiments, interestingly, we\nfind that existing MLLMs prefer to believe what they see, but ignore the\ncounterfactual presuppositions presented in the question, thereby leading to\ninaccurate responses. Furthermore, we evaluate a wide range of prevalent MLLMs\non our proposed CFMM. The significant gap between their performance on our CFMM\nand that on several VQA benchmarks indicates that there is still considerable\nroom for improvement in existing MLLMs toward approaching human-level\nintelligence. 
On the other hand, through boosting MLLMs performances on our\nCFMM in the future, potential avenues toward developing MLLMs with advanced\nintelligence can be explored.\n","authors":["Yian Li","Wentao Tian","Yang Jiao","Jingjing Chen","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.12966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12958v1","updated":"2024-04-19T15:40:47Z","published":"2024-04-19T15:40:47Z","title":"Improving Pediatric Pneumonia Diagnosis with Adult Chest X-ray Images\n Utilizing Contrastive Learning and Embedding Similarity","summary":" Despite the advancement of deep learning-based computer-aided diagnosis (CAD)\nmethods for pneumonia from adult chest x-ray (CXR) images, the performance of\nCAD methods applied to pediatric images remains suboptimal, mainly due to the\nlack of large-scale annotated pediatric imaging datasets. Establishing a proper\nframework to leverage existing adult large-scale CXR datasets can thus enhance\npediatric pneumonia detection performance. In this paper, we propose a\nthree-branch parallel path learning-based framework that utilizes both adult\nand pediatric datasets to improve the performance of deep learning models on\npediatric test datasets. The paths are trained with pediatric only, adult only,\nand both types of CXRs, respectively. Our proposed framework utilizes the\nmulti-positive contrastive loss to cluster the classwise embeddings and the\nembedding similarity loss among these three parallel paths to make the\nclasswise embeddings as close as possible to reduce the effect of domain shift.\nExperimental evaluations on open-access adult and pediatric CXR datasets show\nthat the proposed method achieves a superior AUROC score of 0.8464 compared to\n0.8348 obtained using the conventional approach of join training on both\ndatasets. The proposed approach thus paves the way for generalized CAD models\nthat are effective for both adult and pediatric age groups.\n","authors":["Mohammad Zunaed","Anwarul Hasan","Taufiq Hasan"],"pdf_url":"https://arxiv.org/pdf/2404.12958v1.pdf","comment":"Accepted to International Conference of IEEE Engineering in Medicine\n and Biology Society (EMBC), 2024"},{"id":"http://arxiv.org/abs/2404.04876v2","updated":"2024-04-19T15:33:44Z","published":"2024-04-07T08:46:06Z","title":"HiLo: Detailed and Robust 3D Clothed Human Reconstruction with High-and\n Low-Frequency Information of Parametric Models","summary":" Reconstructing 3D clothed human involves creating a detailed geometry of\nindividuals in clothing, with applications ranging from virtual try-on, movies,\nto games. To enable practical and widespread applications, recent advances\npropose to generate a clothed human from an RGB image. However, they struggle\nto reconstruct detailed and robust avatars simultaneously. We empirically find\nthat the high-frequency (HF) and low-frequency (LF) information from a\nparametric model has the potential to enhance geometry details and improve\nrobustness to noise, respectively. Based on this, we propose HiLo, namely\nclothed human reconstruction with high- and low-frequency information, which\ncontains two components. 1) To recover detailed geometry using HF information,\nwe propose a progressive HF Signed Distance Function to enhance the detailed 3D\ngeometry of a clothed human. We analyze that our progressive learning manner\nalleviates large gradients that hinder model convergence. 
2) To achieve robust\nreconstruction against inaccurate estimation of the parametric model by using\nLF information, we propose a spatial interaction implicit function. This\nfunction effectively exploits the complementary spatial information from a\nlow-resolution voxel grid of the parametric model. Experimental results\ndemonstrate that HiLo outperforms the state-of-the-art methods by 10.43% and\n9.54% in terms of Chamfer distance on the Thuman2.0 and CAPE datasets,\nrespectively. Additionally, HiLo demonstrates robustness to noise from the\nparametric model, challenging poses, and various clothing styles.\n","authors":["Yifan Yang","Dong Liu","Shuhai Zhang","Zeshuai Deng","Zixiong Huang","Mingkui Tan"],"pdf_url":"https://arxiv.org/pdf/2404.04876v2.pdf","comment":"CVPR 2024 Accepted Paper"},{"id":"http://arxiv.org/abs/2404.12948v1","updated":"2024-04-19T15:26:36Z","published":"2024-04-19T15:26:36Z","title":"Next Generation Loss Function for Image Classification","summary":" Neural networks are trained by minimizing a loss function that defines the\ndiscrepancy between the predicted model output and the target value. The\nselection of the loss function is crucial to achieve task-specific behaviour\nand highly influences the capability of the model. A variety of loss functions\nhave been proposed for a wide range of tasks affecting training and model\nperformance. For classification tasks, the cross entropy is the de-facto\nstandard and usually the first choice. Here, we try to experimentally challenge\nthe well-known loss functions, including cross entropy (CE) loss, by utilizing\nthe genetic programming (GP) approach, a population-based evolutionary\nalgorithm. GP constructs loss functions from a set of operators and leaf nodes\nand these functions are repeatedly recombined and mutated to find an optimal\nstructure. Experiments were carried out on different small-sized datasets\nCIFAR-10, CIFAR-100 and Fashion-MNIST using an Inception model. The 5 best\nfunctions found were evaluated for different model architectures on a set of\nstandard datasets ranging from 2 to 102 classes and very different sizes. One\nfunction, denoted as Next Generation Loss (NGL), clearly stood out showing same\nor better performance for all tested datasets compared to CE. To evaluate the\nNGL function on a large-scale dataset, we tested its performance on the\nImagenet-1k dataset where it showed improved top-1 accuracy compared to models\ntrained with identical settings and other losses. Finally, the NGL was trained\non a segmentation downstream task for Pascal VOC 2012 and COCO-Stuff164k\ndatasets improving the underlying model performance.\n","authors":["Shakhnaz Akhmedova","Nils Körber"],"pdf_url":"https://arxiv.org/pdf/2404.12948v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00132v3","updated":"2024-04-19T15:23:43Z","published":"2023-09-29T20:48:44Z","title":"QDFormer: Towards Robust Audiovisual Segmentation in Complex\n Environments with Quantization-based Semantic Decomposition","summary":" Audiovisual segmentation (AVS) is a challenging task that aims to segment\nvisual objects in videos according to their associated acoustic cues. With\nmultiple sound sources and background disturbances involved, establishing\nrobust correspondences between audio and visual contents poses unique\nchallenges due to (1) complex entanglement across sound sources and (2)\nfrequent changes in the occurrence of distinct sound events. 
Assuming sound\nevents occur independently, the multi-source semantic space can be represented\nas the Cartesian product of single-source sub-spaces. We are motivated to\ndecompose the multi-source audio semantics into single-source semantics for\nmore effective interactions with visual content. We propose a semantic\ndecomposition method based on product quantization, where the multi-source\nsemantics can be decomposed and represented by several disentangled and\nnoise-suppressed single-source semantics. Furthermore, we introduce a\nglobal-to-local quantization mechanism, which distills knowledge from stable\nglobal (clip-level) features into local (frame-level) ones, to handle frequent\nchanges in audio semantics. Extensive experiments demonstrate that our\nsemantically decomposed audio representation significantly improves AVS\nperformance, e.g., +21.2% mIoU on the challenging AVS-Semantic benchmark with\nResNet50 backbone. https://github.com/lxa9867/QSD.\n","authors":["Xiang Li","Jinglu Wang","Xiaohao Xu","Xiulian Peng","Rita Singh","Yan Lu","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2310.00132v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12942v1","updated":"2024-04-19T15:16:04Z","published":"2024-04-19T15:16:04Z","title":"Purposer: Putting Human Motion Generation in Context","summary":" We present a novel method to generate human motion to populate 3D indoor\nscenes. It can be controlled with various combinations of conditioning signals\nsuch as a path in a scene, target poses, past motions, and scenes represented\nas 3D point clouds. State-of-the-art methods are either models specialized to\none single setting, require vast amounts of high-quality and diverse training\ndata, or are unconditional models that do not integrate scene or other\ncontextual information. As a consequence, they have limited applicability and\nrely on costly training data. To address these limitations, we propose a new\nmethod ,dubbed Purposer, based on neural discrete representation learning. Our\nmodel is capable of exploiting, in a flexible manner, different types of\ninformation already present in open access large-scale datasets such as AMASS.\nFirst, we encode unconditional human motion into a discrete latent space.\nSecond, an autoregressive generative model, conditioned with key contextual\ninformation, either with prompting or additive tokens, and trained for\nnext-step prediction in this space, synthesizes sequences of latent indices. We\nfurther design a novel conditioning block to handle future conditioning\ninformation in such a causal model by using a network with two branches to\ncompute separate stacks of features. In this manner, Purposer can generate\nrealistic motion sequences in diverse test scenes. Through exhaustive\nevaluation, we demonstrate that our multi-contextual solution outperforms\nexisting specialized approaches for specific contextual information, both in\nterms of quality and diversity. 
Our model is trained with short sequences, but\na byproduct of being able to use various conditioning signals is that at test\ntime different combinations can be used to chain short sequences together and\ngenerate long motions within a context scene.\n","authors":["Nicolas Ugrinovic","Thomas Lucas","Fabien Baradel","Philippe Weinzaepfel","Gregory Rogez","Francesc Moreno-Noguer"],"pdf_url":"https://arxiv.org/pdf/2404.12942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12940v1","updated":"2024-04-19T15:10:54Z","published":"2024-04-19T15:10:54Z","title":"Neural Flow Diffusion Models: Learnable Forward Process for Improved\n Diffusion Modelling","summary":" Conventional diffusion models typically rely on a fixed forward process,\nwhich implicitly defines complex marginal distributions over latent variables.\nThis can often complicate the reverse process' task in learning generative\ntrajectories, and results in costly inference for diffusion models. To address\nthese limitations, we introduce Neural Flow Diffusion Models (NFDM), a novel\nframework that enhances diffusion models by supporting a broader range of\nforward processes beyond the fixed linear Gaussian. We also propose a novel\nparameterization technique for learning the forward process. Our framework\nprovides an end-to-end, simulation-free optimization objective, effectively\nminimizing a variational upper bound on the negative log-likelihood.\nExperimental results demonstrate NFDM's strong performance, evidenced by\nstate-of-the-art likelihood estimation. Furthermore, we investigate NFDM's\ncapacity for learning generative dynamics with specific characteristics, such\nas deterministic straight-line trajectories. This exploration underscores\nNFDM's versatility and its potential for a wide range of applications.\n","authors":["Grigory Bartosh","Dmitry Vetrov","Christian A. Naesseth"],"pdf_url":"https://arxiv.org/pdf/2404.12940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12925v1","updated":"2024-04-19T14:52:25Z","published":"2024-04-19T14:52:25Z","title":"A Hybrid Generative and Discriminative PointNet on Unordered Point Sets","summary":" As point clouds provide a natural and flexible representation usable in\nmyriad applications (e.g., robotics and self-driving cars), the ability to\nsynthesize point clouds for analysis becomes crucial. Recently, Xie et al.\npropose a generative model for unordered point sets in the form of an\nenergy-based model (EBM). Despite the model achieving an impressive performance\nfor point cloud generation, one separate model needs to be trained for each\ncategory to capture the complex point set distributions. Besides, their method\nis unable to classify point clouds directly and requires additional fine-tuning\nfor classification. One interesting question is: Can we train a single network\nfor a hybrid generative and discriminative model of point clouds? A similar\nquestion has recently been answered in the affirmative for images, introducing\nthe framework of Joint Energy-based Model (JEM), which achieves high\nperformance in image classification and generation simultaneously. This paper\nproposes GDPNet, the first hybrid Generative and Discriminative PointNet that\nextends JEM for point cloud classification and generation. 
Our GDPNet retains\nstrong discriminative power of modern PointNet classifiers, while generating\npoint cloud samples rivaling state-of-the-art generative approaches.\n","authors":["Yang Ye","Shihao Ji"],"pdf_url":"https://arxiv.org/pdf/2404.12925v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12922v1","updated":"2024-04-19T14:45:27Z","published":"2024-04-19T14:45:27Z","title":"Is Retain Set All You Need in Machine Unlearning? Restoring Performance\n of Unlearned Models with Out-Of-Distribution Images","summary":" In this paper, we introduce Selective-distillation for Class and\nArchitecture-agnostic unleaRning (SCAR), a novel approximate unlearning method.\nSCAR efficiently eliminates specific information while preserving the model's\ntest accuracy without using a retain set, which is a key component in\nstate-of-the-art approximate unlearning algorithms. Our approach utilizes a\nmodified Mahalanobis distance to guide the unlearning of the feature vectors of\nthe instances to be forgotten, aligning them to the nearest wrong class\ndistribution. Moreover, we propose a distillation-trick mechanism that distills\nthe knowledge of the original model into the unlearning model with\nout-of-distribution images for retaining the original model's test performance\nwithout using any retain set. Importantly, we propose a self-forget version of\nSCAR that unlearns without having access to the forget set. We experimentally\nverified the effectiveness of our method, on three public datasets, comparing\nit with state-of-the-art methods. Our method obtains performance higher than\nmethods that operate without the retain set and comparable w.r.t the best\nmethods that rely on the retain set.\n","authors":["Jacopo Bonato","Marco Cotogni","Luigi Sabetta"],"pdf_url":"https://arxiv.org/pdf/2404.12922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12920v1","updated":"2024-04-19T14:43:48Z","published":"2024-04-19T14:43:48Z","title":"Zero-Shot Medical Phrase Grounding with Off-the-shelf Diffusion Models","summary":" Localizing the exact pathological regions in a given medical scan is an\nimportant imaging problem that requires a large amount of bounding box ground\ntruth annotations to be accurately solved. However, there exist alternative,\npotentially weaker, forms of supervision, such as accompanying free-text\nreports, which are readily available. The task of performing localization with\ntextual guidance is commonly referred to as phrase grounding. In this work, we\nuse a publicly available Foundation Model, namely the Latent Diffusion Model,\nto solve this challenging task. This choice is supported by the fact that the\nLatent Diffusion Model, despite being generative in nature, contains mechanisms\n(cross-attention) that implicitly align visual and textual features, thus\nleading to intermediate representations that are suitable for the task at hand.\nIn addition, we aim to perform this task in a zero-shot manner, i.e., without\nany further training on target data, meaning that the model's weights remain\nfrozen. To this end, we devise strategies to select features and also refine\nthem via post-processing without extra learnable parameters. 
We compare our\nproposed method with state-of-the-art approaches which explicitly enforce\nimage-text alignment in a joint embedding space via contrastive learning.\nResults on a popular chest X-ray benchmark indicate that our method is\ncompetitive with SOTA on different types of pathology, and even outperforms them\non average in terms of two metrics (mean IoU and AUC-ROC). Source code will be\nreleased upon acceptance.\n","authors":["Konstantinos Vilouras","Pedro Sanchez","Alison Q. O'Neil","Sotirios A. Tsaftaris"],"pdf_url":"https://arxiv.org/pdf/2404.12920v1.pdf","comment":"8 pages, 3 figures, submitted to IEEE J-BHI Special Issue on\n Foundation Models in Medical Imaging"},{"id":"http://arxiv.org/abs/2404.12917v1","updated":"2024-04-19T14:42:42Z","published":"2024-04-19T14:42:42Z","title":"Zero-Shot Stitching in Reinforcement Learning using Relative\n Representations","summary":" Visual Reinforcement Learning is a popular and powerful framework that takes\nfull advantage of the Deep Learning breakthrough. However, it is also known\nthat variations in the input (e.g., different colors of the panorama due to the\nseason of the year) or the task (e.g., changing the speed limit for a car to\nrespect) could require complete retraining of the agents. In this work, we\nleverage recent developments in unifying latent representations to demonstrate\nthat it is possible to combine the components of an agent, rather than retrain\nit from scratch. We build upon the recent relative representations framework\nand adapt it for Visual RL. This allows us to create completely new agents\ncapable of handling environment-task combinations never seen during training.\nOur work paves the road toward a more accessible and flexible use of\nreinforcement learning.\n","authors":["Antonio Pio Ricciardi","Valentino Maiorca","Luca Moschella","Riccardo Marin","Emanuele Rodolà"],"pdf_url":"https://arxiv.org/pdf/2404.12917v1.pdf","comment":"13 pages, 10 figures, 4 tables"},{"id":"http://arxiv.org/abs/2311.12871v2","updated":"2024-04-19T14:36:15Z","published":"2023-11-18T01:21:38Z","title":"An Embodied Generalist Agent in 3D World","summary":" Leveraging massive knowledge and learning schemes from large language models\n(LLMs), recent machine learning models show notable successes in building\ngeneralist agents that exhibit the capability of general-purpose task solving\nin diverse domains, including natural language processing, computer vision, and\nrobotics. However, a significant challenge remains as these models exhibit\nlimited ability in understanding and interacting with the 3D world. We argue\nthis limitation significantly hinders the current models from performing\nreal-world tasks and further achieving general intelligence. To this end, we\nintroduce an embodied multi-modal and multi-task generalist agent that excels\nin perceiving, grounding, reasoning, planning, and acting in the 3D world. Our\nproposed agent, referred to as LEO, is trained with shared LLM-based model\narchitectures, objectives, and weights in two stages: (i) 3D vision-language\nalignment and (ii) 3D vision-language-action instruction tuning. To facilitate\nthe training, we meticulously curate and generate an extensive dataset\ncomprising object-level and scene-level multi-modal tasks with exceeding scale\nand complexity, necessitating a deep understanding of and interaction with the\n3D world. 
Through rigorous experiments, we demonstrate LEO's remarkable\nproficiency across a wide spectrum of tasks, including 3D captioning, question\nanswering, embodied reasoning, embodied navigation, and robotic manipulation.\nOur ablation results further provide valuable insights for the development of\nfuture embodied generalist agents.\n","authors":["Jiangyong Huang","Silong Yong","Xiaojian Ma","Xiongkun Linghu","Puhao Li","Yan Wang","Qing Li","Song-Chun Zhu","Baoxiong Jia","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2311.12871v2.pdf","comment":"The first four authors contribute equally. Project page:\n https://embodied-generalist.github.io"},{"id":"http://arxiv.org/abs/2404.12908v1","updated":"2024-04-19T14:30:41Z","published":"2024-04-19T14:30:41Z","title":"Robust CLIP-Based Detector for Exposing Diffusion Model-Generated Images","summary":" Diffusion models (DMs) have revolutionized image generation, producing\nhigh-quality images with applications spanning various fields. However, their\nability to create hyper-realistic images poses significant challenges in\ndistinguishing between real and synthetic content, raising concerns about\ndigital authenticity and potential misuse in creating deepfakes. This work\nintroduces a robust detection framework that integrates image and text features\nextracted by CLIP model with a Multilayer Perceptron (MLP) classifier. We\npropose a novel loss that can improve the detector's robustness and handle\nimbalanced datasets. Additionally, we flatten the loss landscape during the\nmodel training to improve the detector's generalization capabilities. The\neffectiveness of our method, which outperforms traditional detection\ntechniques, is demonstrated through extensive experiments, underscoring its\npotential to set a new state-of-the-art approach in DM-generated image\ndetection. The code is available at\nhttps://github.com/Purdue-M2/Robust_DM_Generated_Image_Detection.\n","authors":[" Santosh","Li Lin","Irene Amerini","Xin Wang","Shu Hu"],"pdf_url":"https://arxiv.org/pdf/2404.12908v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03631v3","updated":"2024-04-19T14:29:02Z","published":"2023-12-06T17:28:03Z","title":"Mitigating Open-Vocabulary Caption Hallucinations","summary":" While recent years have seen rapid progress in image-conditioned text\ngeneration, image captioning still suffers from the fundamental issue of\nhallucinations, namely, the generation of spurious details that cannot be\ninferred from the given image. Existing methods largely use closed-vocabulary\nobject lists to mitigate or evaluate hallucinations in image captioning,\nignoring the long-tailed nature of hallucinations that occur in practice. To\nthis end, we propose a framework for addressing hallucinations in image\ncaptioning in the open-vocabulary setting. Our framework includes a new\nbenchmark, OpenCHAIR, that leverages generative foundation models to evaluate\nopen-vocabulary object hallucinations for image captioning, surpassing the\npopular and similarly-sized CHAIR benchmark in both diversity and accuracy.\nFurthermore, to mitigate open-vocabulary hallucinations without using a closed\nobject list, we propose MOCHa, an approach harnessing advancements in\nreinforcement learning. Our multi-objective reward function explicitly targets\nthe trade-off between fidelity and adequacy in generations without requiring\nany strong supervision. MOCHa improves a large variety of image captioning\nmodels, as captured by our OpenCHAIR benchmark and other existing metrics. 
We\nwill release our code and models.\n","authors":["Assaf Ben-Kish","Moran Yanuka","Morris Alper","Raja Giryes","Hadar Averbuch-Elor"],"pdf_url":"https://arxiv.org/pdf/2312.03631v3.pdf","comment":"Website Link: https://assafbk.github.io/mocha/"},{"id":"http://arxiv.org/abs/2404.11214v2","updated":"2024-04-19T14:26:06Z","published":"2024-04-17T09:58:53Z","title":"Feature Corrective Transfer Learning: End-to-End Solutions to Object\n Detection in Non-Ideal Visual Conditions","summary":" A significant challenge in the field of object detection lies in the system's\nperformance under non-ideal imaging conditions, such as rain, fog, low\nillumination, or raw Bayer images that lack ISP processing. Our study\nintroduces \"Feature Corrective Transfer Learning\", a novel approach that\nleverages transfer learning and a bespoke loss function to facilitate the\nend-to-end detection of objects in these challenging scenarios without the need\nto convert non-ideal images into their RGB counterparts. In our methodology, we\ninitially train a comprehensive model on a pristine RGB image dataset.\nSubsequently, non-ideal images are processed by comparing their feature maps\nagainst those from the initial ideal RGB model. This comparison employs the\nExtended Area Novel Structural Discrepancy Loss (EANSDL), a novel loss function\ndesigned to quantify similarities and integrate them into the detection loss.\nThis approach refines the model's ability to perform object detection across\nvarying conditions through direct feature map correction, encapsulating the\nessence of Feature Corrective Transfer Learning. Experimental validation on\nvariants of the KITTI dataset demonstrates a significant improvement in mean\nAverage Precision (mAP), resulting in a 3.8-8.1% relative enhancement in\ndetection under non-ideal conditions compared to the baseline model, and a less\nmarginal performance difference within 1.3% of the mAP@[0.5:0.95] achieved\nunder ideal conditions by the standard Faster RCNN algorithm.\n","authors":["Chuheng Wei","Guoyuan Wu","Matthew J. Barth"],"pdf_url":"https://arxiv.org/pdf/2404.11214v2.pdf","comment":"2024 CVPR UG2+ Workshop"},{"id":"http://arxiv.org/abs/2312.09780v2","updated":"2024-04-19T14:16:46Z","published":"2023-12-15T13:33:09Z","title":"RANRAC: Robust Neural Scene Representations via Random Ray Consensus","summary":" Learning-based scene representations such as neural radiance fields or light\nfield networks, that rely on fitting a scene model to image observations,\ncommonly encounter challenges in the presence of inconsistencies within the\nimages caused by occlusions, inaccurately estimated camera parameters or\neffects like lens flare. To address this challenge, we introduce RANdom RAy\nConsensus (RANRAC), an efficient approach to eliminate the effect of\ninconsistent data, thereby taking inspiration from classical RANSAC based\noutlier detection for model fitting. In contrast to the down-weighting of the\neffect of outliers based on robust loss formulations, our approach reliably\ndetects and excludes inconsistent perspectives, resulting in clean images\nwithout floating artifacts. For this purpose, we formulate a fuzzy adaption of\nthe RANSAC paradigm, enabling its application to large scale models. We\ninterpret the minimal number of samples to determine the model parameters as a\ntunable hyperparameter, investigate the generation of hypotheses with\ndata-driven models, and analyze the validation of hypotheses in noisy\nenvironments. 
We demonstrate the compatibility and potential of our solution\nfor both photo-realistic robust multi-view reconstruction from real-world\nimages based on neural radiance fields and for single-shot reconstruction based\non light-field networks. In particular, the results indicate significant\nimprovements compared to state-of-the-art robust methods for novel-view\nsynthesis on both synthetic and captured scenes with various inconsistencies\nincluding occlusions, noisy camera pose estimates, and unfocused perspectives.\nThe results further indicate significant improvements for single-shot\nreconstruction from occluded images. Project Page:\nhttps://bennobuschmann.com/ranrac/\n","authors":["Benno Buschmann","Andreea Dogaru","Elmar Eisemann","Michael Weinmann","Bernhard Egger"],"pdf_url":"https://arxiv.org/pdf/2312.09780v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00639v3","updated":"2024-04-19T14:14:59Z","published":"2023-12-01T14:59:43Z","title":"RefinedFields: Radiance Fields Refinement for Unconstrained Scenes","summary":" Modeling large scenes from unconstrained images has proven to be a major\nchallenge in computer vision. Existing methods tackling in-the-wild scene\nmodeling operate in closed-world settings, where no conditioning on priors\nacquired from real-world images is present. We propose RefinedFields, which is,\nto the best of our knowledge, the first method leveraging pre-trained models to\nimprove in-the-wild scene modeling. We employ pre-trained networks to refine\nK-Planes representations via optimization guidance using an alternating\ntraining procedure. We carry out extensive experiments and verify the merit of\nour method on synthetic data and real tourism photo collections. RefinedFields\nenhances rendered scenes with richer details and improves upon its base\nrepresentation on the task of novel view synthesis in the wild. Our project\npage can be found at https://refinedfields.github.io.\n","authors":["Karim Kassab","Antoine Schnepf","Jean-Yves Franceschi","Laurent Caraffa","Jeremie Mary","Valérie Gouet-Brunet"],"pdf_url":"https://arxiv.org/pdf/2312.00639v3.pdf","comment":"Corrected Table 2, where some comparisons were done among models\n trained at different resolutions"},{"id":"http://arxiv.org/abs/2404.12900v1","updated":"2024-04-19T14:13:46Z","published":"2024-04-19T14:13:46Z","title":"Training-and-prompt-free General Painterly Harmonization Using\n Image-wise Attention Sharing","summary":" Painterly Image Harmonization aims at seamlessly blending disparate visual\nelements within a single coherent image. However, previous approaches often\nencounter significant limitations due to training data constraints, the need\nfor time-consuming fine-tuning, or reliance on additional prompts. To surmount\nthese hurdles, we design a Training-and-prompt-Free General Painterly\nHarmonization method using image-wise attention sharing (TF-GPH), which\nintegrates a novel \"share-attention module\". This module redefines the\ntraditional self-attention mechanism by allowing for comprehensive image-wise\nattention, facilitating the use of a state-of-the-art pretrained latent\ndiffusion model without the typical training data limitations. Additionally, we\nfurther introduce a \"similarity reweighting\" mechanism that enhances performance by\neffectively harnessing cross-image information, surpassing the capabilities of\nfine-tuning or prompt-based approaches. 
At last, we recognize the deficiencies\nin existing benchmarks and propose the \"General Painterly Harmonization\nBenchmark\", which employs range-based evaluation metrics to more accurately\nreflect real-world application. Extensive experiments demonstrate the superior\nefficacy of our method across various benchmarks. The code and web demo are\navailable at https://github.com/BlueDyee/TF-GPH.\n","authors":["Teng-Fang Hsiao","Bo-Kai Ruan","Hong-Han Shuai"],"pdf_url":"https://arxiv.org/pdf/2404.12900v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.07976v5","updated":"2024-04-19T14:13:26Z","published":"2022-03-15T15:05:40Z","title":"On the Pitfalls of Batch Normalization for End-to-End Video Learning: A\n Study on Surgical Workflow Analysis","summary":" Batch Normalization's (BN) unique property of depending on other samples in a\nbatch is known to cause problems in several tasks, including sequence modeling.\nYet, BN-related issues are hardly studied for long video understanding, despite\nthe ubiquitous use of BN in CNNs (Convolutional Neural Networks) for feature\nextraction. Especially in surgical workflow analysis, where the lack of\npretrained feature extractors has led to complex, multi-stage training\npipelines, limited awareness of BN issues may have hidden the benefits of\ntraining CNNs and temporal models end to end. In this paper, we analyze\npitfalls of BN in video learning, including issues specific to online tasks\nsuch as a 'cheating' effect in anticipation. We observe that BN's properties\ncreate major obstacles for end-to-end learning. However, using BN-free\nbackbones, even simple CNN-LSTMs beat the state of the art\non three surgical workflow benchmarks by utilizing\nadequate end-to-end training strategies which maximize temporal context. We\nconclude that awareness of BN's pitfalls is crucial for effective end-to-end\nlearning in surgical tasks. By reproducing results on natural-video datasets,\nwe hope our insights will benefit other areas of video learning as well. Code\nis available at: \\url{https://gitlab.com/nct_tso_public/pitfalls_bn}\n","authors":["Dominik Rivoir","Isabel Funke","Stefanie Speidel"],"pdf_url":"https://arxiv.org/pdf/2203.07976v5.pdf","comment":"Accepted at Medical Image Analysis (MedIA). Publication link:\n https://www.sciencedirect.com/science/article/pii/S1361841524000513"},{"id":"http://arxiv.org/abs/2211.07440v4","updated":"2024-04-19T14:05:03Z","published":"2022-11-14T15:14:50Z","title":"Leveraging Automatic Personalised Nutrition: Food Image Recognition\n Benchmark and Dataset based on Nutrition Taxonomy","summary":" Maintaining a healthy lifestyle has become increasingly challenging in\ntoday's sedentary society marked by poor eating habits. To address this issue,\nboth national and international organisations have made numerous efforts to\npromote healthier diets and increased physical activity. However, implementing\nthese recommendations in daily life can be difficult, as they are often generic\nand not tailored to individuals. This study presents the AI4Food-NutritionDB\ndatabase, the first nutrition database that incorporates food images and a\nnutrition taxonomy based on recommendations by national and international\nhealth authorities. The database offers a multi-level categorisation,\ncomprising 6 nutritional levels, 19 main categories (e.g., \"Meat\"), 73\nsubcategories (e.g., \"White Meat\"), and 893 specific food products (e.g.,\n\"Chicken\"). 
The AI4Food-NutritionDB opens the doors to new food computing\napproaches in terms of food intake frequency, quality, and categorisation.\nAlso, we present a standardised experimental protocol and benchmark including\nthree tasks based on the nutrition taxonomy (i.e., category, subcategory, and\nfinal product recognition). These resources are available to the research\ncommunity, including our deep learning models trained on AI4Food-NutritionDB,\nwhich can serve as pre-trained models, achieving accurate recognition results\nfor challenging food image databases.\n","authors":["Sergio Romero-Tapiador","Ruben Tolosana","Aythami Morales","Julian Fierrez","Ruben Vera-Rodriguez","Isabel Espinosa-Salinas","Gala Freixer","Enrique Carrillo de Santa Pau","Ana Ramírez de Molina","Javier Ortega-Garcia"],"pdf_url":"https://arxiv.org/pdf/2211.07440v4.pdf","comment":"12 pages, 4 figures, 4 tables"},{"id":"http://arxiv.org/abs/2401.02044v3","updated":"2024-04-19T14:02:26Z","published":"2024-01-04T03:09:39Z","title":"Multi-modal vision-language model for generalizable annotation-free\n pathological lesions localization and clinical diagnosis","summary":" Defining pathologies automatically from medical images aids the understanding\nof the emergence and progression of diseases, and such an ability is crucial in\nclinical diagnostics. However, existing deep learning models heavily rely on\nexpert annotations and lack generalization capabilities in open clinical\nenvironments. In this study, we present a generalizable vision-language\npre-training model for Annotation-Free pathological lesions Localization\n(AFLoc). The core strength of AFLoc lies in its extensive multi-level semantic\nstructure-based contrastive learning, which comprehensively aligns\nmulti-granularity medical concepts from reports with abundant image features,\nto adapt to the diverse expressions of pathologies and unseen pathologies\nwithout the reliance on image annotations from experts. We demonstrate the\nproof of concept on CXR images, with extensive experimental validation across 4\ndistinct external datasets, encompassing 11 types of chest pathologies. The\nresults demonstrate that AFLoc surpasses state-of-the-art methods in\npathological lesions localization and disease classification, and even\noutperforms the human benchmark in locating 5 different pathologies.\nAdditionally, we further verify its generalization ability by applying it to\nretinal fundus images. Our approach showcases AFLoc's versatility and\nunderscores its suitability for clinical diagnoses in complex clinical\nenvironments.\n","authors":["Hao Yang","Hong-Yu Zhou","Zhihuan Li","Yuanxu Gao","Cheng Li","Weijian Huang","Jiarun Liu","Hairong Zheng","Kang Zhang","Shanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2401.02044v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12888v1","updated":"2024-04-19T13:45:14Z","published":"2024-04-19T13:45:14Z","title":"Learn2Talk: 3D Talking Face Learns from 2D Talking Face","summary":" Speech-driven facial animation methods usually contain two main classes, 3D\nand 2D talking face, both of which attract considerable research attention in\nrecent years. However, to the best of our knowledge, the research on 3D talking\nface has not gone as deep as that on 2D talking face in the aspects of\nlip-synchronization (lip-sync) and speech perception. 
To bridge the gap between\nthe two sub-fields, we propose a learning framework named Learn2Talk, which can\nconstruct a better 3D talking face network by exploiting two expertise points\nfrom the field of 2D talking face. Firstly, inspired by the audio-video sync\nnetwork, a 3D sync-lip expert model is devised for the pursuit of lip-sync\nbetween audio and 3D facial motion. Secondly, a teacher model selected from 2D\ntalking face methods is used to guide the training of the audio-to-3D motions\nregression network to yield more 3D vertex accuracy. Extensive experiments show\nthe advantages of the proposed framework in terms of lip-sync, vertex accuracy\nand speech perception, compared with the state of the art. Finally, we show two\napplications of the proposed framework: audio-visual speech recognition and\nspeech-driven 3D Gaussian Splatting based avatar animation.\n","authors":["Yixiang Zhuang","Baoping Cheng","Yao Cheng","Yuntao Jin","Renshuai Liu","Chengyang Li","Xuan Cheng","Jing Liao","Juncong Lin"],"pdf_url":"https://arxiv.org/pdf/2404.12888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12887v1","updated":"2024-04-19T13:43:14Z","published":"2024-04-19T13:43:14Z","title":"3D Multi-frame Fusion for Video Stabilization","summary":" In this paper, we present RStab, a novel framework for video stabilization\nthat integrates 3D multi-frame fusion through volume rendering. Departing from\nconventional methods, we introduce a 3D multi-frame perspective to generate\nstabilized images, addressing the challenge of full-frame generation while\npreserving structure. The core of our RStab framework lies in Stabilized Rendering\n(SR), a volume rendering module that fuses multi-frame information in 3D space,\nextending beyond image fusion by incorporating feature fusion. Specifically, SR involves warping features and colors\nfrom multiple frames by projection, fusing them into descriptors to render the\nstabilized image. However, the precision of warped information depends on the\nprojection accuracy, a factor significantly influenced by dynamic regions. In\nresponse, we introduce the Adaptive Ray Range (ARR) module to integrate depth\npriors, adaptively defining the sampling range for the projection process.\nAdditionally, we propose Color Correction (CC) assisting geometric constraints\nwith optical flow for accurate color aggregation. Thanks to the three modules,\nour RStab demonstrates superior performance compared with previous stabilizers\nin the field of view (FOV), image quality, and video stability across various\ndatasets.\n","authors":["Zhan Peng","Xinyi Ye","Weiyue Zhao","Tianqi Liu","Huiqiang Sun","Baopu Li","Zhiguo Cao"],"pdf_url":"https://arxiv.org/pdf/2404.12887v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.12886v1","updated":"2024-04-19T13:40:25Z","published":"2024-04-19T13:40:25Z","title":"MCM: Multi-condition Motion Synthesis Framework","summary":" Conditional human motion synthesis (HMS) aims to generate human motion\nsequences that conform to specific conditions. Text and audio represent the two\npredominant modalities employed as HMS control conditions. While existing\nresearch has primarily focused on single conditions, the multi-condition human\nmotion synthesis remains underexplored. In this study, we propose a\nmulti-condition HMS framework, termed MCM, based on a dual-branch structure\ncomposed of a main branch and a control branch. 
This framework effectively\nextends the applicability of the diffusion model, which is initially predicated\nsolely on textual conditions, to auditory conditions. This extension\nencompasses both music-to-dance and co-speech HMS while preserving the\nintrinsic quality of motion and the capabilities for semantic association\ninherent in the original model. Furthermore, we propose the implementation of a\nTransformer-based diffusion model, designated as MWNet, as the main branch.\nThis model adeptly apprehends the spatial intricacies and inter-joint\ncorrelations inherent in motion sequences, facilitated by the integration of\nmulti-wise self-attention modules. Extensive experiments show that our method\nachieves competitive results in single-condition and multi-condition HMS tasks.\n","authors":["Zeyu Ling","Bo Han","Yongkang Wongkan","Han Lin","Mohan Kankanhalli","Weidong Geng"],"pdf_url":"https://arxiv.org/pdf/2404.12886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18453v5","updated":"2024-04-19T13:37:18Z","published":"2023-05-29T04:14:38Z","title":"Conditional Diffusion Models for Semantic 3D Brain MRI Synthesis","summary":" Artificial intelligence (AI) in healthcare, especially in medical imaging,\nfaces challenges due to data scarcity and privacy concerns. Addressing these,\nwe introduce Med-DDPM, a diffusion model designed for 3D semantic brain MRI\nsynthesis. This model effectively tackles data scarcity and privacy issues by\nintegrating semantic conditioning. This involves the channel-wise concatenation\nof a conditioning image to the model input, enabling control in image\ngeneration. Med-DDPM demonstrates superior stability and performance compared\nto existing 3D brain imaging synthesis methods. It generates diverse,\nanatomically coherent images with high visual fidelity. In terms of dice score\naccuracy in the tumor segmentation task, Med-DDPM achieves 0.6207, close to the\n0.6531 accuracy of real images, and outperforms baseline models. Combined with\nreal images, it further increases segmentation accuracy to 0.6675, showing the\npotential of our proposed method for data augmentation. This model represents\nthe first use of a diffusion model in 3D semantic brain MRI synthesis,\nproducing high-quality images. Its semantic conditioning feature also shows\npotential for image anonymization in biomedical imaging, addressing data and\nprivacy issues. We provide the code and model weights for Med-DDPM on our\nGitHub repository (https://github.com/mobaidoctor/med-ddpm/) to support\nreproducibility.\n","authors":["Zolnamar Dorjsembe","Hsing-Kuo Pao","Sodtavilan Odonchimed","Furen Xiao"],"pdf_url":"https://arxiv.org/pdf/2305.18453v5.pdf","comment":"This document is a preprint and has been accepted for publication in\n the IEEE Journal of Biomedical and Health Informatics. The final, published\n version can be accessed using the following DOI: 10.1109/JBHI.2024.3385504.\n Copyright for this article has been transferred to IEEE"},{"id":"http://arxiv.org/abs/2211.11424v2","updated":"2024-04-19T13:31:47Z","published":"2022-11-21T13:10:19Z","title":"Modeling Hierarchical Structural Distance for Unsupervised Domain\n Adaptation","summary":" Unsupervised domain adaptation (UDA) aims to estimate a transferable model\nfor unlabeled target domains by exploiting labeled source data. Optimal\nTransport (OT) based methods have recently been proven to be a promising\nsolution for UDA with a solid theoretical foundation and competitive\nperformance. 
However, most of these methods solely focus on domain-level OT\nalignment by leveraging the geometry of domains for domain-invariant features\nbased on the global embeddings of images. However, global representations of\nimages may destroy image structure, leading to the loss of local details that\noffer category-discriminative information. This study proposes an end-to-end\nDeep Hierarchical Optimal Transport method (DeepHOT), which aims to learn both\ndomain-invariant and category-discriminative representations by mining\nhierarchical structural relations among domains. The main idea is to\nincorporate a domain-level OT and image-level OT into a unified OT framework,\nhierarchical optimal transport, to model the underlying geometry in both domain\nspace and image space. In DeepHOT framework, an image-level OT serves as the\nground distance metric for the domain-level OT, leading to the hierarchical\nstructural distance. Compared with the ground distance of the conventional\ndomain-level OT, the image-level OT captures structural associations among\nlocal regions of images that are beneficial to classification. In this way,\nDeepHOT, a unified OT framework, not only aligns domains by domain-level OT,\nbut also enhances the discriminative power through image-level OT. Moreover, to\novercome the limitation of high computational complexity, we propose a robust\nand efficient implementation of DeepHOT by approximating origin OT with sliced\nWasserstein distance in image-level OT and accomplishing the mini-batch\nunbalanced domain-level OT.\n","authors":["Yingxue Xu","Guihua Wen","Yang Hu","Pei Yang"],"pdf_url":"https://arxiv.org/pdf/2211.11424v2.pdf","comment":"accepted by TCVST, code: https://github.com/Innse/DeepHOT"},{"id":"http://arxiv.org/abs/2404.12876v1","updated":"2024-04-19T13:25:27Z","published":"2024-04-19T13:25:27Z","title":"A Large-scale Medical Visual Task Adaptation Benchmark","summary":" Visual task adaptation has been demonstrated to be effective in adapting\npre-trained Vision Transformers (ViTs) to general downstream visual tasks using\nspecialized learnable layers or tokens. However, there is yet a large-scale\nbenchmark to fully explore the effect of visual task adaptation on the\nrealistic and important medical domain, particularly across diverse medical\nvisual modalities, such as color images, X-ray, and CT. To close this gap, we\npresent Med-VTAB, a large-scale Medical Visual Task Adaptation Benchmark\nconsisting of 1.68 million medical images for diverse organs, modalities, and\nadaptation approaches. Based on Med-VTAB, we explore the scaling law of medical\nprompt tuning concerning tunable parameters and the generalizability of medical\nvisual adaptation using non-medical/medical pre-train weights. Besides, we\nstudy the impact of patient ID out-of-distribution on medical visual\nadaptation, which is a real and challenging scenario. Furthermore, results from\nMed-VTAB indicate that a single pre-trained model falls short in medical task\nadaptation. 
Therefore, we introduce GMoE-Adapter, a novel method that combines\nmedical and general pre-training weights through a gated mixture-of-experts\nadapter, achieving state-of-the-art results in medical visual task adaptation.\n","authors":["Shentong Mo","Xufang Luo","Yansen Wang","Dongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.12876v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06860v2","updated":"2024-04-19T13:18:46Z","published":"2024-04-10T09:35:50Z","title":"Monocular 3D lane detection for Autonomous Driving: Recent Achievements,\n Challenges, and Outlooks","summary":" 3D lane detection is essential in autonomous driving as it extracts\nstructural and traffic information from the road in three-dimensional space,\naiding self-driving cars in logical, safe, and comfortable path planning and\nmotion control. Given the cost of sensors and the advantages of visual data in\ncolor information, 3D lane detection based on monocular vision is an important\nresearch direction in the realm of autonomous driving, increasingly gaining\nattention in both industry and academia. Regrettably, recent advancements in\nvisual perception seem inadequate for the development of fully reliable 3D lane\ndetection algorithms, which also hampers the progress of vision-based fully\nautonomous vehicles. We believe that there is still considerable room for\nimprovement in 3D lane detection algorithms for autonomous vehicles using\nvisual sensors, and significant enhancements are needed. This review looks back\nand analyzes the current state of achievements in the field of 3D lane\ndetection research. It covers all current monocular-based 3D lane detection\nprocesses, discusses the performance of these cutting-edge algorithms, analyzes\nthe time complexity of various algorithms, and highlights the main achievements\nand limitations of ongoing research efforts. The survey also includes a\ncomprehensive discussion of available 3D lane detection datasets and the\nchallenges that researchers face but have not yet resolved. Finally, our work\noutlines future research directions and invites researchers and practitioners\nto join this exciting field.\n","authors":["Fulong Ma","Weiqing Qi","Guoyang Zhao","Linwei Zheng","Sheng Wang","Yuxuan Liu","Ming Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06860v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12867v1","updated":"2024-04-19T13:08:43Z","published":"2024-04-19T13:08:43Z","title":"FipTR: A Simple yet Effective Transformer Framework for Future Instance\n Prediction in Autonomous Driving","summary":" The future instance prediction from a Bird's Eye View(BEV) perspective is a\nvital component in autonomous driving, which involves future instance\nsegmentation and instance motion prediction. Existing methods usually rely on a\nredundant and complex pipeline which requires multiple auxiliary outputs and\npost-processing procedures. Moreover, estimated errors on each of the auxiliary\npredictions will lead to degradation of the prediction performance. In this\npaper, we propose a simple yet effective fully end-to-end framework named\nFuture Instance Prediction Transformer(FipTR), which views the task as BEV\ninstance segmentation and prediction for future frames. We propose to adopt\ninstance queries representing specific traffic participants to directly\nestimate the corresponding future occupied masks, and thus get rid of complex\npost-processing procedures. 
Besides, we devise a flow-aware BEV predictor for\nfuture BEV feature prediction composed of a flow-aware deformable attention\nthat takes backward flow guiding the offset sampling. A novel future instance\nmatching strategy is also proposed to further improve the temporal coherence.\nExtensive experiments demonstrate the superiority of FipTR and its\neffectiveness under different temporal BEV encoders.\n","authors":["Xingtai Gui","Tengteng Huang","Haonan Shao","Haotian Yao","Chi Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.12867v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12866v1","updated":"2024-04-19T13:05:37Z","published":"2024-04-19T13:05:37Z","title":"How Does the Textual Information Affect the Retrieval of Multimodal\n In-Context Learning?","summary":" The increase in parameter size of multimodal large language models (MLLMs)\nintroduces significant capabilities, particularly in-context learning, where\nMLLMs enhance task performance without updating pre-trained parameters. This\neffectiveness, however, hinges on the appropriate selection of in-context\nexamples, a process that is currently biased towards visual data, overlooking\ntextual information. Furthermore, the area of supervised retrievers for MLLMs,\ncrucial for optimal in-context example selection, continues to be\nuninvestigated. Our study offers an in-depth evaluation of the impact of\ntextual information on the unsupervised selection of in-context examples in\nmultimodal contexts, uncovering a notable sensitivity of retriever performance\nto the employed modalities. Responding to this, we introduce a novel supervised\nMLLM-retriever MSIER that employs a neural network to select examples that\nenhance multimodal in-context learning efficiency. This approach is validated\nthrough extensive testing across three distinct tasks, demonstrating the\nmethod's effectiveness. Additionally, we investigate the influence of\nmodalities on our supervised retrieval method's training and pinpoint factors\ncontributing to our model's success. This exploration paves the way for future\nadvancements, highlighting the potential for refined in-context learning in\nMLLMs through the strategic use of multimodal data.\n","authors":["Yang Luo","Zangwei Zheng","Zirui Zhu","Yang You"],"pdf_url":"https://arxiv.org/pdf/2404.12866v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12861v1","updated":"2024-04-19T13:01:30Z","published":"2024-04-19T13:01:30Z","title":"Foundation Model assisted Weakly Supervised LiDAR Semantic Segmentation","summary":" Current point cloud semantic segmentation has achieved great advances when\ngiven sufficient labels. However, the dense annotation of LiDAR point clouds\nremains prohibitively expensive and time-consuming, unable to keep up with the\ncontinuously growing volume of data. In this paper, we propose annotating\nimages with scattered points, followed by utilizing SAM (a Foundation model) to\ngenerate semantic segmentation labels for the images. Finally, by mapping the\nsegmentation labels of the images to the LiDAR space using the intrinsic and\nextrinsic parameters of the camera and LiDAR, we obtain labels for point cloud\nsemantic segmentation, and release Scatter-KITTI and Scatter-nuScenes, which\nare the first works to utilize image segmentation-based SAM for weakly\nsupervised point cloud semantic segmentation. 
Furthermore, to mitigate the\ninfluence of erroneous pseudo labels obtained from sparse annotations on point\ncloud features, we propose a multi-modal weakly supervised network for LiDAR\nsemantic segmentation, called MM-ScatterNet. This network combines features\nfrom both point cloud and image modalities, enhancing the representation\nlearning of point clouds by introducing consistency constraints between\nmulti-modal features and point cloud features. On the SemanticKITTI dataset, we\nachieve 66\\% of fully supervised performance using only 0.02% of annotated\ndata, and on the NuScenes dataset, we achieve 95% of fully supervised\nperformance using only 0.1% labeled points.\n","authors":["Yilong Chen","Zongyi Xu","xiaoshui Huang","Ruicheng Zhang","Xinqi Jiang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2404.12861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12856v1","updated":"2024-04-19T12:50:43Z","published":"2024-04-19T12:50:43Z","title":"Language-Driven Active Learning for Diverse Open-Set 3D Object Detection","summary":" Object detection is crucial for ensuring safe autonomous driving. However,\ndata-driven approaches face challenges when encountering minority or novel\nobjects in the 3D driving scene. In this paper, we propose VisLED, a\nlanguage-driven active learning framework for diverse open-set 3D Object\nDetection. Our method leverages active learning techniques to query diverse and\ninformative data samples from an unlabeled pool, enhancing the model's ability\nto detect underrepresented or novel objects. Specifically, we introduce the\nVision-Language Embedding Diversity Querying (VisLED-Querying) algorithm, which\noperates in both open-world exploring and closed-world mining settings. In\nopen-world exploring, VisLED-Querying selects data points most novel relative\nto existing data, while in closed-world mining, it mines new instances of known\nclasses. We evaluate our approach on the nuScenes dataset and demonstrate its\neffectiveness compared to random sampling and entropy-querying methods. Our\nresults show that VisLED-Querying consistently outperforms random sampling and\noffers competitive performance compared to entropy-querying despite the\nlatter's model-optimality, highlighting the potential of VisLED for improving\nobject detection in autonomous driving scenarios.\n","authors":["Ross Greer","Bjørk Antoniussen","Andreas Møgelmose","Mohan Trivedi"],"pdf_url":"https://arxiv.org/pdf/2404.12856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11669v2","updated":"2024-04-19T12:46:03Z","published":"2024-04-17T18:08:00Z","title":"Factorized Motion Fields for Fast Sparse Input Dynamic View Synthesis","summary":" Designing a 3D representation of a dynamic scene for fast optimization and\nrendering is a challenging task. While recent explicit representations enable\nfast learning and rendering of dynamic radiance fields, they require a dense\nset of input viewpoints. In this work, we focus on learning a fast\nrepresentation for dynamic radiance fields with sparse input viewpoints.\nHowever, the optimization with sparse input is under-constrained and\nnecessitates the use of motion priors to constrain the learning. Existing fast\ndynamic scene models do not explicitly model the motion, making them difficult\nto be constrained with motion priors. We design an explicit motion model as a\nfactorized 4D representation that is fast and can exploit the spatio-temporal\ncorrelation of the motion field. 
We then introduce reliable flow priors\nincluding a combination of sparse flow priors across cameras and dense flow\npriors within cameras to regularize our motion model. Our model is fast,\ncompact and achieves very good performance on popular multi-view dynamic scene\ndatasets with sparse input viewpoints. The source code for our model can be\nfound on our project page:\nhttps://nagabhushansn95.github.io/publications/2024/RF-DeRF.html.\n","authors":["Nagabhushan Somraj","Kapil Choudhary","Sai Harsha Mupparaju","Rajiv Soundararajan"],"pdf_url":"https://arxiv.org/pdf/2404.11669v2.pdf","comment":"Accepted at SIGGRAPH 2024"},{"id":"http://arxiv.org/abs/2404.12852v1","updated":"2024-04-19T12:42:31Z","published":"2024-04-19T12:42:31Z","title":"LSP Framework: A Compensatory Model for Defeating Trigger Reverse\n Engineering via Label Smoothing Poisoning","summary":" Deep neural networks are vulnerable to backdoor attacks. Among the existing\nbackdoor defense methods, trigger reverse engineering based approaches, which\nreconstruct the backdoor triggers via optimizations, are the most versatile and\neffective ones compared to other types of methods. In this paper, we summarize\nand construct a generic paradigm for the typical trigger reverse engineering\nprocess. Based on this paradigm, we propose a new perspective to defeat trigger\nreverse engineering by manipulating the classification confidence of backdoor\nsamples. To determine the specific modifications of classification confidence,\nwe propose a compensatory model to compute the lower bound of the modification.\nWith proper modifications, the backdoor attack can easily bypass the trigger\nreverse engineering based methods. To achieve this objective, we propose a\nLabel Smoothing Poisoning (LSP) framework, which leverages label smoothing to\nspecifically manipulate the classification confidences of backdoor samples.\nExtensive experiments demonstrate that the proposed work can defeat the\nstate-of-the-art trigger reverse engineering based methods, and possess good\ncompatibility with a variety of existing backdoor attacks.\n","authors":["Beichen Li","Yuanfang Guo","Heqi Peng","Yangxi Li","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12462v4","updated":"2024-04-19T12:39:09Z","published":"2023-08-23T22:55:45Z","title":"Overcoming Generic Knowledge Loss with Selective Parameter Update","summary":" Foundation models encompass an extensive knowledge base and offer remarkable\ntransferability. However, this knowledge becomes outdated or insufficient over\ntime. The challenge lies in continuously updating foundation models to\naccommodate novel information while retaining their original capabilities.\nLeveraging the fact that foundation models have initial knowledge on various\ntasks and domains, we propose a novel approach that, instead of updating all\nparameters equally, localizes the updates to a sparse set of parameters\nrelevant to the task being learned. We strike a balance between efficiency and\nnew task performance, while maintaining the transferability and\ngeneralizability of foundation models. We extensively evaluate our method on\nfoundational vision-language models with a diverse spectrum of continual\nlearning tasks. 
Our method achieves accuracy improvements of up to 7% on the newly\nlearned tasks while preserving the pretraining knowledge with a\nnegligible decrease of 0.9% on a representative control set accuracy.\n","authors":["Wenxuan Zhang","Paul Janson","Rahaf Aljundi","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2308.12462v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04346v2","updated":"2024-04-19T12:30:07Z","published":"2024-04-05T18:33:04Z","title":"Koala: Key frame-conditioned long video-LLM","summary":" Long video question answering is a challenging task that involves recognizing\nshort-term activities and reasoning about their fine-grained relationships.\nState-of-the-art video Large Language Models (vLLMs) hold promise as a viable\nsolution due to their demonstrated emergent capabilities on new tasks. However,\ndespite being trained on millions of short seconds-long videos, vLLMs are\nunable to understand minutes-long videos and accurately answer questions about\nthem. To address this limitation, we propose a lightweight and self-supervised\napproach, Key frame-conditioned long video-LLM (Koala), that introduces\nlearnable spatiotemporal queries to adapt pretrained vLLMs for generalizing to\nlonger videos. Our approach introduces two new tokenizers that condition on\nvisual tokens computed from sparse video key frames for understanding short and\nlong video moments. We train our proposed approach on HowTo100M and demonstrate\nits effectiveness on zero-shot long video understanding benchmarks, where it\noutperforms state-of-the-art large models by 3 - 6% in absolute accuracy across\nall tasks. Surprisingly, we also empirically show that our approach not only\nhelps a pretrained vLLM to understand long videos but also improves its\naccuracy on short-term action recognition.\n","authors":["Reuben Tan","Ximeng Sun","Ping Hu","Jui-hsien Wang","Hanieh Deilamsalehy","Bryan A. Plummer","Bryan Russell","Kate Saenko"],"pdf_url":"https://arxiv.org/pdf/2404.04346v2.pdf","comment":"Accepted at CVPR 2024 as a poster highlight"},{"id":"http://arxiv.org/abs/2306.08386v2","updated":"2024-04-19T12:29:50Z","published":"2023-06-14T09:21:48Z","title":"Efficient Backdoor Attacks for Deep Neural Networks in Real-world\n Scenarios","summary":" Recent deep neural networks (DNNs) have come to rely on vast amounts of\ntraining data, providing an opportunity for malicious attackers to exploit and\ncontaminate the data to carry out backdoor attacks. However, existing backdoor\nattack methods make unrealistic assumptions, assuming that all training data\ncomes from a single source and that attackers have full access to the training\ndata. In this paper, we introduce a more realistic attack scenario where\nvictims collect data from multiple sources, and attackers cannot access the\ncomplete training data. We refer to this scenario as data-constrained backdoor\nattacks. In such cases, previous attack methods suffer from severe efficiency\ndegradation due to the entanglement between benign and poisoning features\nduring the backdoor injection process. To tackle this problem, we introduce\nthree CLIP-based technologies from two distinct streams: Clean Feature\nSuppression and Poisoning Feature Augmentation, which together provide an\neffective solution for data-constrained backdoor attacks. The results demonstrate remarkable\nimprovements, with some settings achieving over 100% improvement compared to\nexisting attacks in data-constrained scenarios. 
Code is available at\nhttps://github.com/sunh1113/Efficient-backdoor-attacks-for-deep-neural-networks-in-real-world-scenarios\n","authors":["Ziqiang Li","Hong Sun","Pengfei Xia","Heng Li","Beihao Xia","Yi Wu","Bin Li"],"pdf_url":"https://arxiv.org/pdf/2306.08386v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2404.12841v1","updated":"2024-04-19T12:21:27Z","published":"2024-04-19T12:21:27Z","title":"Explainable Deepfake Video Detection using Convolutional Neural Network\n and CapsuleNet","summary":" Deepfake technology, derived from deep learning, seamlessly inserts\nindividuals into digital media, irrespective of their actual participation. Its\nfoundation lies in machine learning and Artificial Intelligence (AI).\nInitially, deepfakes served research, industry, and entertainment. While the\nconcept has existed for decades, recent advancements render deepfakes nearly\nindistinguishable from reality. Accessibility has soared, empowering even\nnovices to create convincing deepfakes. However, this accessibility raises\nsecurity concerns.The primary deepfake creation algorithm, GAN (Generative\nAdversarial Network), employs machine learning to craft realistic images or\nvideos. Our objective is to utilize CNN (Convolutional Neural Network) and\nCapsuleNet with LSTM to differentiate between deepfake-generated frames and\noriginals. Furthermore, we aim to elucidate our model's decision-making process\nthrough Explainable AI, fostering transparent human-AI relationships and\noffering practical examples for real-life scenarios.\n","authors":["Gazi Hasin Ishrak","Zalish Mahmud","MD. Zami Al Zunaed Farabe","Tahera Khanom Tinni","Tanzim Reza","Mohammad Zavid Parvez"],"pdf_url":"https://arxiv.org/pdf/2404.12841v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12839v1","updated":"2024-04-19T12:20:49Z","published":"2024-04-19T12:20:49Z","title":"ECOR: Explainable CLIP for Object Recognition","summary":" Large Vision Language Models (VLMs), such as CLIP, have significantly\ncontributed to various computer vision tasks, including object recognition and\nobject detection. Their open vocabulary feature enhances their value. However,\ntheir black-box nature and lack of explainability in predictions make them less\ntrustworthy in critical domains. Recently, some work has been done to force\nVLMs to provide reasonable rationales for object recognition, but this often\ncomes at the expense of classification accuracy. In this paper, we first\npropose a mathematical definition of explainability in the object recognition\ntask based on the joint probability distribution of categories and rationales,\nthen leverage this definition to fine-tune CLIP in an explainable manner.\nThrough evaluations of different datasets, our method demonstrates\nstate-of-the-art performance in explainable classification. Notably, it excels\nin zero-shot settings, showcasing its adaptability. 
This advancement improves\nexplainable object recognition, enhancing trust across diverse applications.\nThe code will be made available online upon publication.\n","authors":["Ali Rasekh","Sepehr Kazemi Ranjbar","Milad Heidari","Wolfgang Nejdl"],"pdf_url":"https://arxiv.org/pdf/2404.12839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12832v1","updated":"2024-04-19T12:09:49Z","published":"2024-04-19T12:09:49Z","title":"COIN: Counterfactual inpainting for weakly supervised semantic\n segmentation for medical images","summary":" Deep learning is dramatically transforming the field of medical imaging and\nradiology, enabling the identification of pathologies in medical images,\nincluding computed tomography (CT) and X-ray scans. However, the performance of\ndeep learning models, particularly in segmentation tasks, is often limited by\nthe need for extensive annotated datasets. To address this challenge, the\ncapabilities of weakly supervised semantic segmentation are explored through\nthe lens of Explainable AI and the generation of counterfactual explanations.\nThe scope of this research is development of a novel counterfactual inpainting\napproach (COIN) that flips the predicted classification label from abnormal to\nnormal by using a generative model. For instance, if the classifier deems an\ninput medical image X as abnormal, indicating the presence of a pathology, the\ngenerative model aims to inpaint the abnormal region, thus reversing the\nclassifier's original prediction label. The approach enables us to produce\nprecise segmentations for pathologies without depending on pre-existing\nsegmentation masks. Crucially, image-level labels are utilized, which are\nsubstantially easier to acquire than creating detailed segmentation masks. The\neffectiveness of the method is demonstrated by segmenting synthetic targets and\nactual kidney tumors from CT images acquired from Tartu University Hospital in\nEstonia. The findings indicate that COIN greatly surpasses established\nattribution methods, such as RISE, ScoreCAM, and LayerCAM, as well as an\nalternative counterfactual explanation method introduced by Singla et al. This\nevidence suggests that COIN is a promising approach for semantic segmentation\nof tumors in CT images, and presents a step forward in making deep learning\napplications more accessible and effective in healthcare, where annotated data\nis scarce.\n","authors":["Dmytro Shvetsov","Joonas Ariva","Marharyta Domnich","Raul Vicente","Dmytro Fishman"],"pdf_url":"https://arxiv.org/pdf/2404.12832v1.pdf","comment":"This work has been accepted to be presented to The 2nd World\n Conference on eXplainable Artificial Intelligence (xAI 2024), July 17-19,\n 2024 - Valletta, Malta"},{"id":"http://arxiv.org/abs/2404.12819v1","updated":"2024-04-19T11:56:29Z","published":"2024-04-19T11:56:29Z","title":"Unveiling the Ambiguity in Neural Inverse Rendering: A Parameter\n Compensation Analysis","summary":" Inverse rendering aims to reconstruct the scene properties of objects solely\nfrom multiview images. However, it is an ill-posed problem prone to producing\nambiguous estimations deviating from physically accurate representations. In\nthis paper, we utilize Neural Microfacet Fields (NMF), a state-of-the-art\nneural inverse rendering method to illustrate the inherent ambiguity. 
We\npropose an evaluation framework to assess the degree of compensation or\ninteraction between the estimated scene properties, aiming to explore the\nmechanisms behind this ill-posed problem and potential mitigation strategies.\nSpecifically, we introduce artificial perturbations to one scene property and\nexamine how adjusting another property can compensate for these perturbations.\nTo facilitate such experiments, we introduce a disentangled NMF where material\nproperties are independent. The experimental findings underscore the intrinsic\nambiguity present in neural inverse rendering and highlight the importance of\nproviding additional guidance through geometry, material, and illumination\npriors.\n","authors":["Georgios Kouros","Minye Wu","Sushruth Nagesh","Xianling Zhang","Tinne Tuytelaars"],"pdf_url":"https://arxiv.org/pdf/2404.12819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12814v1","updated":"2024-04-19T11:49:01Z","published":"2024-04-19T11:49:01Z","title":"Generative Modelling with High-Order Langevin Dynamics","summary":" Diffusion generative modelling (DGM) based on stochastic\n differential equations (SDEs) with\n score matching has achieved unprecedented results in data\n generation.\n In this paper, we propose a novel fast high-quality\n generative modelling method\n based on high-order\n Langevin dynamics (HOLD) with score matching.\n This motive is proved by third-order\n Langevin dynamics. By augmenting the\n previous SDEs, e.g.\n variance exploding or variance preserving SDEs\n for single-data variable processes, HOLD can simultaneously\n model position, velocity, and\n acceleration, thereby improving the quality\n and speed of the data\n generation at the same time.\n HOLD is composed of one Ornstein-Uhlenbeck process\n and two Hamiltonians,\n which reduce the mixing time by two orders of magnitude.\n Empirical experiments for unconditional image generation on the\n public data set CIFAR-10 and CelebA-HQ show that the effect is significant in\n both Frechet inception distance (FID) and negative log-likelihood,\n and achieves the\n state-of-the-art FID of 1.85 on CIFAR-10.\n","authors":["Ziqiang Shi","Rujie Liu"],"pdf_url":"https://arxiv.org/pdf/2404.12814v1.pdf","comment":"Some of the results in this paper have been published or accepted at\n conferences such as wacv2024, icassp2024, and icme2024"},{"id":"http://arxiv.org/abs/2404.11981v2","updated":"2024-04-19T11:43:39Z","published":"2024-04-18T08:23:24Z","title":"Tendency-driven Mutual Exclusivity for Weakly Supervised Incremental\n Semantic Segmentation","summary":" Weakly Incremental Learning for Semantic Segmentation (WILSS) leverages a\npre-trained segmentation model to segment new classes using cost-effective and\nreadily available image-level labels. A prevailing way to solve WILSS is the\ngeneration of seed areas for each new class, serving as a form of pixel-level\nsupervision. However, a scenario usually arises where a pixel is concurrently\npredicted as an old class by the pre-trained segmentation model and a new class\nby the seed areas. Such a scenario becomes particularly problematic in WILSS,\nas the lack of pixel-level annotations on new classes makes it intractable to\nascertain whether the pixel pertains to the new class or not. To surmount this\nissue, we propose an innovative, tendency-driven relationship of mutual\nexclusivity, meticulously tailored to govern the behavior of the seed areas and\nthe predictions generated by the pre-trained segmentation model. 
This\nrelationship stipulates that predictions for the new and old classes must not\nconflict whilst prioritizing the preservation of predictions for the old\nclasses, which not only addresses the conflicting prediction issue but also\neffectively mitigates the inherent challenge of incremental learning -\ncatastrophic forgetting. Furthermore, under the auspices of this\ntendency-driven mutual exclusivity relationship, we generate pseudo masks for\nthe new classes, allowing for concurrent execution with model parameter\nupdating via the resolution of a bi-level optimization problem. Extensive\nexperiments substantiate the effectiveness of our framework, resulting in the\nestablishment of new benchmarks and paving the way for further research in this\nfield.\n","authors":["Chongjie Si","Xuehui Wang","Xiaokang Yang","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2404.11981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12804v1","updated":"2024-04-19T11:38:34Z","published":"2024-04-19T11:38:34Z","title":"Linearly-evolved Transformer for Pan-sharpening","summary":" Vision transformer family has dominated the satellite pan-sharpening field\ndriven by the global-wise spatial information modeling mechanism from the core\nself-attention ingredient. The standard modeling rules within these promising\npan-sharpening methods are to roughly stack the transformer variants in a\ncascaded manner. Despite the remarkable advancement, their success may be at\nthe huge cost of model parameters and FLOPs, thus preventing its application\nover low-resource satellites.To address this challenge between favorable\nperformance and expensive computation, we tailor an efficient linearly-evolved\ntransformer variant and employ it to construct a lightweight pan-sharpening\nframework. In detail, we deepen into the popular cascaded transformer modeling\nwith cutting-edge methods and develop the alternative 1-order linearly-evolved\ntransformer variant with the 1-dimensional linear convolution chain to achieve\nthe same function. In this way, our proposed method is capable of benefiting\nthe cascaded modeling rule while achieving favorable performance in the\nefficient manner. Extensive experiments over multiple satellite datasets\nsuggest that our proposed method achieves competitive performance against other\nstate-of-the-art with fewer computational resources. Further, the consistently\nfavorable performance has been verified over the hyper-spectral image fusion\ntask. Our main focus is to provide an alternative global modeling framework\nwith an efficient structure. The code will be publicly available.\n","authors":["Junming Hou","Zihan Cao","Naishan Zheng","Xuan Li","Xiaoyu Chen","Xinyang Liu","Xiaofeng Cong","Man Zhou","Danfeng Hong"],"pdf_url":"https://arxiv.org/pdf/2404.12804v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.12803v1","updated":"2024-04-19T11:38:08Z","published":"2024-04-19T11:38:08Z","title":"TextSquare: Scaling up Text-Centric Visual Instruction Tuning","summary":" Text-centric visual question answering (VQA) has made great strides with the\ndevelopment of Multimodal Large Language Models (MLLMs), yet open-source models\nstill fall short of leading models like GPT4V and Gemini, partly due to a lack\nof extensive, high-quality instruction tuning data. To this end, we introduce a\nnew approach for creating a massive, high-quality instruction-tuning dataset,\nSquare-10M, which is generated using closed-source MLLMs. 
The data construction\nprocess, termed Square, consists of four steps: Self-Questioning, Answering,\nReasoning, and Evaluation. Our experiments with Square-10M led to three key\nfindings: 1) Our model, TextSquare, considerably surpasses open-source previous\nstate-of-the-art Text-centric MLLMs and sets a new standard on OCRBench(62.2%).\nIt even outperforms top-tier models like GPT4V and Gemini in 6 of 10\ntext-centric benchmarks. 2) Additionally, we demonstrate the critical role of\nVQA reasoning data in offering comprehensive contextual insights for specific\nquestions. This not only improves accuracy but also significantly mitigates\nhallucinations. Specifically, TextSquare scores an average of 75.1% across four\ngeneral VQA and hallucination evaluation datasets, outperforming previous\nstate-of-the-art models. 3) Notably, the phenomenon observed in scaling\ntext-centric VQA datasets reveals a vivid pattern: the exponential increase of\ninstruction tuning data volume is directly proportional to the improvement in\nmodel performance, thereby validating the necessity of the dataset scale and\nthe high quality of Square-10M.\n","authors":["Jingqun Tang","Chunhui Lin","Zhen Zhao","Shu Wei","Binghong Wu","Qi Liu","Hao Feng","Yang Li","Siqi Wang","Lei Liao","Wei Shi","Yuliang Liu","Hao Liu","Yuan Xie","Xiang Bai","Can Huang"],"pdf_url":"https://arxiv.org/pdf/2404.12803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12798v1","updated":"2024-04-19T11:24:34Z","published":"2024-04-19T11:24:34Z","title":"A Point-Based Approach to Efficient LiDAR Multi-Task Perception","summary":" Multi-task networks can potentially improve performance and computational\nefficiency compared to single-task networks, facilitating online deployment.\nHowever, current multi-task architectures in point cloud perception combine\nmultiple task-specific point cloud representations, each requiring a separate\nfeature encoder and making the network structures bulky and slow. We propose\nPAttFormer, an efficient multi-task architecture for joint semantic\nsegmentation and object detection in point clouds that only relies on a\npoint-based representation. The network builds on transformer-based feature\nencoders using neighborhood attention and grid-pooling and a query-based\ndetection decoder using a novel 3D deformable-attention detection head design.\nUnlike other LiDAR-based multi-task architectures, our proposed PAttFormer does\nnot require separate feature encoders for multiple task-specific point cloud\nrepresentations, resulting in a network that is 3x smaller and 1.4x faster\nwhile achieving competitive performance on the nuScenes and KITTI benchmarks\nfor autonomous driving perception. Our extensive evaluations show substantial\ngains from multi-task learning, improving LiDAR semantic segmentation by +1.7%\nin mIou and 3D object detection by +1.7% in mAP on the nuScenes benchmark\ncompared to the single-task models.\n","authors":["Christopher Lang","Alexander Braun","Lars Schillingmann","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2404.12798v1.pdf","comment":"8 pages, 3 figures, 8 tables"},{"id":"http://arxiv.org/abs/2404.12794v1","updated":"2024-04-19T11:17:35Z","published":"2024-04-19T11:17:35Z","title":"MambaMOS: LiDAR-based 3D Moving Object Segmentation with Motion-aware\n State Space Model","summary":" LiDAR-based Moving Object Segmentation (MOS) aims to locate and segment\nmoving objects in point clouds of the current scan using motion information\nfrom previous scans. 
Despite the promising results achieved by previous MOS\nmethods, several key issues, such as the weak coupling of temporal and spatial\ninformation, still need further study. In this paper, we propose a novel\nLiDAR-based 3D Moving Object Segmentation with Motion-aware State Space Model,\ntermed MambaMOS. Firstly, we develop a novel embedding module, the Time Clue\nBootstrapping Embedding (TCBE), to enhance the coupling of temporal and spatial\ninformation in point clouds and alleviate the issue of overlooked temporal\nclues. Secondly, we introduce the Motion-aware State Space Model (MSSM) to\nendow the model with the capacity to understand the temporal correlations of\nthe same object across different time steps. Specifically, MSSM emphasizes the\nmotion states of the same object at different time steps through two distinct\ntemporal modeling and correlation steps. We utilize an improved state space\nmodel to represent these motion differences, thereby modeling the motion\nstates more effectively. Finally, extensive experiments on the SemanticKITTI-MOS and KITTI-Road\nbenchmarks demonstrate that the proposed MambaMOS achieves state-of-the-art\nperformance. The source code of this work will be made publicly available at\nhttps://github.com/Terminal-K/MambaMOS.\n","authors":["Kang Zeng","Hao Shi","Jiacheng Lin","Siyu Li","Jintao Cheng","Kaiwei Wang","Zhiyong Li","Kailun Yang"],"pdf_url":"https://arxiv.org/pdf/2404.12794v1.pdf","comment":"The source code will be made publicly available at\n https://github.com/Terminal-K/MambaMOS"},{"id":"http://arxiv.org/abs/2404.12784v1","updated":"2024-04-19T10:47:53Z","published":"2024-04-19T10:47:53Z","title":"Contrastive Gaussian Clustering: Weakly Supervised 3D Scene Segmentation","summary":" We introduce Contrastive Gaussian Clustering, a novel approach capable of\nproviding segmentation masks from any viewpoint and of enabling 3D segmentation\nof the scene. Recent works in novel-view synthesis have shown how to model the\nappearance of a scene via a cloud of 3D Gaussians, and how to generate accurate\nimages from a given viewpoint by projecting on it the Gaussians before $\\alpha$\nblending their color. Following this example, we train a model to include also\na segmentation feature vector for each Gaussian. These can then be used for 3D\nscene segmentation, by clustering Gaussians according to their feature vectors;\nand to generate 2D segmentation masks, by projecting the Gaussians on a plane\nand $\\alpha$ blending over their segmentation features. Using a combination of\ncontrastive learning and spatial regularization, our method can be trained on\ninconsistent 2D segmentation masks, and still learn to generate segmentation\nmasks consistent across all views. Moreover, the resulting model is extremely\naccurate, improving the IoU accuracy of the predicted masks by $+8\\%$ over the\nstate of the art. Code and trained models will be released soon.\n","authors":["Myrna C. Silva","Mahtab Dahaghin","Matteo Toso","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2404.12784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12782v1","updated":"2024-04-19T10:43:25Z","published":"2024-04-19T10:43:25Z","title":"Sentiment-oriented Transformer-based Variational Autoencoder Network for\n Live Video Commenting","summary":" Automatic live video commenting is attracting increasing attention due to its\nsignificance in narration generation, topic explanation, etc. However, the\ndiverse sentiment consideration of the generated comments is missing from the\ncurrent methods. 
Sentimental factors are critical in interactive commenting,\nand lack of research so far. Thus, in this paper, we propose a\nSentiment-oriented Transformer-based Variational Autoencoder (So-TVAE) network\nwhich consists of a sentiment-oriented diversity encoder module and a batch\nattention module, to achieve diverse video commenting with multiple sentiments\nand multiple semantics. Specifically, our sentiment-oriented diversity encoder\nelegantly combines VAE and random mask mechanism to achieve semantic diversity\nunder sentiment guidance, which is then fused with cross-modal features to\ngenerate live video comments. Furthermore, a batch attention module is also\nproposed in this paper to alleviate the problem of missing sentimental samples,\ncaused by the data imbalance, which is common in live videos as the popularity\nof videos varies. Extensive experiments on Livebot and VideoIC datasets\ndemonstrate that the proposed So-TVAE outperforms the state-of-the-art methods\nin terms of the quality and diversity of generated comments. Related code is\navailable at https://github.com/fufy1024/So-TVAE.\n","authors":["Fengyi Fu","Shancheng Fang","Weidong Chen","Zhendong Mao"],"pdf_url":"https://arxiv.org/pdf/2404.12782v1.pdf","comment":"27 pages, 10 figures, ACM Transactions on Multimedia Computing,\n Communications and Applications, 2024"},{"id":"http://arxiv.org/abs/2404.12777v1","updated":"2024-04-19T10:32:30Z","published":"2024-04-19T10:32:30Z","title":"EfficientGS: Streamlining Gaussian Splatting for Large-Scale\n High-Resolution Scene Representation","summary":" In the domain of 3D scene representation, 3D Gaussian Splatting (3DGS) has\nemerged as a pivotal technology. However, its application to large-scale,\nhigh-resolution scenes (exceeding 4k$\\times$4k pixels) is hindered by the\nexcessive computational requirements for managing a large number of Gaussians.\nAddressing this, we introduce 'EfficientGS', an advanced approach that\noptimizes 3DGS for high-resolution, large-scale scenes. We analyze the\ndensification process in 3DGS and identify areas of Gaussian\nover-proliferation. We propose a selective strategy, limiting Gaussian increase\nto key primitives, thereby enhancing the representational efficiency.\nAdditionally, we develop a pruning mechanism to remove redundant Gaussians,\nthose that are merely auxiliary to adjacent ones. For further enhancement, we\nintegrate a sparse order increment for Spherical Harmonics (SH), designed to\nalleviate storage constraints and reduce training overhead. Our empirical\nevaluations, conducted on a range of datasets including extensive 4K+ aerial\nimages, demonstrate that 'EfficientGS' not only expedites training and\nrendering times but also achieves this with a model size approximately tenfold\nsmaller than conventional 3DGS while maintaining high rendering fidelity.\n","authors":["Wenkai Liu","Tao Guan","Bin Zhu","Lili Ju","Zikai Song","Dan Li","Yuesong Wang","Wei Yang"],"pdf_url":"https://arxiv.org/pdf/2404.12777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.09325v2","updated":"2024-04-19T10:28:03Z","published":"2022-06-19T04:49:35Z","title":"EATFormer: Improving Vision Transformer Inspired by Evolutionary\n Algorithm","summary":" Motivated by biological evolution, this paper explains the rationality of\nVision Transformer by analogy with the proven practical Evolutionary Algorithm\n(EA) and derives that both have consistent mathematical formulation. 
Then\ninspired by effective EA variants, we propose a novel pyramid EATFormer\nbackbone that only contains the proposed \\emph{EA-based Transformer} (EAT)\nblock, which consists of three residual parts, i.e., \\emph{Multi-Scale Region\nAggregation} (MSRA), \\emph{Global and Local Interaction} (GLI), and\n\\emph{Feed-Forward Network} (FFN) modules, to model multi-scale, interactive,\nand individual information separately. Moreover, we design a \\emph{Task-Related\nHead} (TRH) docked with transformer backbone to complete final information\nfusion more flexibly and \\emph{improve} a \\emph{Modulated Deformable MSA}\n(MD-MSA) to dynamically model irregular locations. Massive quantitative and\nquantitative experiments on image classification, downstream tasks, and\nexplanatory experiments demonstrate the effectiveness and superiority of our\napproach over State-Of-The-Art (SOTA) methods. \\Eg, our Mobile (1.8M), Tiny\n(6.1M), Small (24.3M), and Base (49.0M) models achieve 69.4, 78.4, 83.1, and\n83.9 Top-1 only trained on ImageNet-1K with naive training recipe;\nEATFormer-Tiny/Small/Base armed Mask-R-CNN obtain 45.4/47.4/49.0 box AP and\n41.4/42.9/44.2 mask AP on COCO detection, surpassing contemporary MPViT-T,\nSwin-T, and Swin-S by 0.6/1.4/0.5 box AP and 0.4/1.3/0.9 mask AP separately\nwith less FLOPs; Our EATFormer-Small/Base achieve 47.3/49.3 mIoU on ADE20K by\nUpernet that exceeds Swin-T/S by 2.8/1.7. Code is available at\n\\url{https://github.com/zhangzjn/EATFormer}.\n","authors":["Jiangning Zhang","Xiangtai Li","Yabiao Wang","Chengjie Wang","Yibo Yang","Yong Liu","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2206.09325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12770v1","updated":"2024-04-19T10:21:33Z","published":"2024-04-19T10:21:33Z","title":"Camera Agnostic Two-Head Network for Ego-Lane Inference","summary":" Vision-based ego-lane inference using High-Definition (HD) maps is essential\nin autonomous driving and advanced driver assistance systems. The traditional\napproach necessitates well-calibrated cameras, which confines variation of\ncamera configuration, as the algorithm relies on intrinsic and extrinsic\ncalibration. In this paper, we propose a learning-based ego-lane inference by\ndirectly estimating the ego-lane index from a single image. To enhance robust\nperformance, our model incorporates the two-head structure inferring ego-lane\nin two perspectives simultaneously. Furthermore, we utilize an attention\nmechanism guided by vanishing point-and-line to adapt to changes in viewpoint\nwithout requiring accurate calibration. The high adaptability of our model was\nvalidated in diverse environments, devices, and camera mounting points and\norientations.\n","authors":["Chaehyeon Song","Sungho Yoon","Minhyeok Heo","Ayoung Kim","Sujung Kim"],"pdf_url":"https://arxiv.org/pdf/2404.12770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12768v1","updated":"2024-04-19T10:17:10Z","published":"2024-04-19T10:17:10Z","title":"MixLight: Borrowing the Best of both Spherical Harmonics and Gaussian\n Models","summary":" Accurately estimating scene lighting is critical for applications such as\nmixed reality. Existing works estimate illumination by generating illumination\nmaps or regressing illumination parameters. However, the method of generating\nillumination maps has poor generalization performance and parametric models\nsuch as Spherical Harmonic (SH) and Spherical Gaussian (SG) fall short in\ncapturing high-frequency or low-frequency components. 
This paper presents\nMixLight, a joint model that utilizes the complementary characteristics of SH\nand SG to achieve a more complete illumination representation, which uses SH\nand SG to capture low-frequency ambient and high-frequency light sources\nrespectively. In addition, a special spherical light source sparsemax\n(SLSparsemax) module that refers to the position and brightness relationship\nbetween spherical light sources is designed to improve their sparsity, which is\nsignificant but omitted by prior works. Extensive experiments demonstrate that\nMixLight surpasses state-of-the-art (SOTA) methods on multiple metrics. In\naddition, experiments on Web Dataset also show that MixLight as a parametric\nmethod has better generalization performance than non-parametric methods.\n","authors":["Xinlong Ji","Fangneng Zhan","Shijian Lu","Shi-Sheng Huang","Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2404.12768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12766v1","updated":"2024-04-19T10:10:39Z","published":"2024-04-19T10:10:39Z","title":"Continual Learning on a Diet: Learning from Sparsely Labeled Streams\n Under Constrained Computation","summary":" We propose and study a realistic Continual Learning (CL) setting where\nlearning algorithms are granted a restricted computational budget per time step\nwhile training. We apply this setting to large-scale semi-supervised Continual\nLearning scenarios with sparse label rates. Previous proficient CL methods\nperform very poorly in this challenging setting. Overfitting to the sparse\nlabeled data and insufficient computational budget are the two main culprits\nfor such a poor performance. Our new setting encourages learning methods to\neffectively and efficiently utilize the unlabeled data during training. To that\nend, we propose a simple but highly effective baseline, DietCL, which utilizes\nboth unlabeled and labeled data jointly. DietCL meticulously allocates\ncomputational budget for both types of data. We validate our baseline, at\nscale, on several datasets, e.g., CLOC, ImageNet10K, and CGLM, under constraint\nbudget setups. DietCL outperforms, by a large margin, all existing supervised\nCL algorithms as well as more recent continual semi-supervised methods. Our\nextensive analysis and ablations demonstrate that DietCL is stable under a full\nspectrum of label sparsity, computational budget, and various other ablations.\n","authors":["Wenxuan Zhang","Youssef Mohamed","Bernard Ghanem","Philip H. S. Torr","Adel Bibi","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2404.12766v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12739v1","updated":"2024-04-19T09:32:16Z","published":"2024-04-19T09:32:16Z","title":"The Solution for the CVPR2024 NICE Image Captioning Challenge","summary":" This report introduces a solution to the Topic 1 Zero-shot Image Captioning\nof 2024 NICE : New frontiers for zero-shot Image Captioning Evaluation. In\ncontrast to NICE 2023 datasets, this challenge involves new annotations by\nhumans with significant differences in caption style and content. Therefore, we\nenhance image captions effectively through retrieval augmentation and caption\ngrading methods. At the data level, we utilize high-quality captions generated\nby image caption models as training data to address the gap in text styles. 
At\nthe model level, we employ OFA (a large-scale visual-language pre-training\nmodel based on handcrafted templates) to perform the image captioning task.\nSubsequently, we propose caption-level strategy for the high-quality caption\ndata generated by the image caption models and integrate them with retrieval\naugmentation strategy into the template to compel the model to generate higher\nquality, more matching, and semantically enriched captions based on the\nretrieval augmentation prompts. Our approach ranks first on the leaderboard,\nachieving a CIDEr score of 234.11 and 1st in all other metrics.\n","authors":["Longfei Huang","Shupeng Zhong","Xiangyu Wu","Ruoxuan Li","Qingguo Chen","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2404.12739v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12734v1","updated":"2024-04-19T09:28:16Z","published":"2024-04-19T09:28:16Z","title":"DLoRA-TrOCR: Mixed Text Mode Optical Character Recognition Based On\n Transformer","summary":" With the continuous development of OCR technology and the expansion of\napplication fields, text recognition in complex scenes has become a key\nchallenge. Factors such as multiple fonts, mixed scenes and complex layouts\nseriously affect the recognition accuracy of traditional OCR models. Although\nOCR models based on deep learning have performed well in specific fields or\nsimilar data sets in recent years, the generalization ability and robustness of\nthe model are still a big challenge when facing complex environments with\nmultiple scenes. Furthermore, training an OCR model from scratch or fine-tuning\nall parameters is very demanding on computing resources and inference time,\nwhich limits the flexibility of its application. This study focuses on a\nfundamental aspect of mixed text recognition in response to the challenges\nmentioned above, which involves effectively fine-tuning the pre-trained basic\nOCR model to demonstrate exceptional performance across various downstream\ntasks. To this end, we propose a parameter-efficient hybrid text recognition\nmethod based on pre-trained OCR Transformer, namely DLoRA-TrOCR. This method\nembeds DoRA into the image encoder and LoRA into the internal structure of the\ntext decoder, enabling efficient parameter fine-tuning for downstream tasks.\nExperimental results show that compared to similar parameter adjustment\nmethods, our model DLoRA-TrOCR has the smallest number of parameters and\nperforms better. It can achieve state-of-the-art performance on complex scene\ndata sets involving simultaneous recognition of mixed handwritten, printed and\nstreet view texts.\n","authors":["Da Chang","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2404.12734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12730v1","updated":"2024-04-19T09:22:20Z","published":"2024-04-19T09:22:20Z","title":"PATE-TripleGAN: Privacy-Preserving Image Synthesis with Gaussian\n Differential Privacy","summary":" Conditional Generative Adversarial Networks (CGANs) exhibit significant\npotential in supervised learning model training by virtue of their ability to\ngenerate realistic labeled images. However, numerous studies have indicated the\nprivacy leakage risk in CGANs models. The solution DPCGAN, incorporating the\ndifferential privacy framework, faces challenges such as heavy reliance on\nlabeled data for model training and potential disruptions to original gradient\ninformation due to excessive gradient clipping, making it difficult to ensure\nmodel accuracy. 
To address these challenges, we present a privacy-preserving\ntraining framework called PATE-TripleGAN. This framework incorporates a\nclassifier to pre-classify unlabeled data, establishing a three-party min-max\ngame to reduce dependence on labeled data. Furthermore, we present a hybrid\ngradient desensitization algorithm based on the Private Aggregation of Teacher\nEnsembles (PATE) framework and Differential Private Stochastic Gradient Descent\n(DPSGD) method. This algorithm allows the model to retain gradient information\nmore effectively while ensuring privacy protection, thereby enhancing the\nmodel's utility. Privacy analysis and extensive experiments affirm that the\nPATE-TripleGAN model can generate a higher quality labeled image dataset while\nensuring the privacy of the training data.\n","authors":["Zepeng Jiang","Weiwei Ni","Yifan Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.12730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16050v2","updated":"2024-04-19T09:22:06Z","published":"2024-03-24T07:33:08Z","title":"Heterogeneous Federated Learning with Splited Language Model","summary":" Federated Split Learning (FSL) is a promising distributed learning paradigm\nin practice, which gathers the strengths of both Federated Learning (FL) and\nSplit Learning (SL) paradigms, to ensure model privacy while diminishing the\nresource overhead of each client, especially on large transformer models in a\nresource-constrained environment, e.g., Internet of Things (IoT). However,\nalmost all works merely investigate the performance with simple neural network\nmodels in FSL. Despite the minor efforts focusing on incorporating Vision\nTransformers (ViT) as model architectures, they train ViT from scratch, thereby\nleading to enormous training overhead in each device with limited resources.\nTherefore, in this paper, we harness Pre-trained Image Transformers (PITs) as\nthe initial model, coined FedV, to accelerate the training process and improve\nmodel robustness. Furthermore, we propose FedVZ to hinder the gradient\ninversion attack, especially having the capability compatible with black-box\nscenarios, where the gradient information is unavailable. Concretely, FedVZ\napproximates the server gradient by utilizing a zeroth-order (ZO) optimization,\nwhich replaces the backward propagation with just one forward process.\nEmpirically, we are the first to provide a systematic evaluation of FSL methods\nwith PITs in real-world datasets, different partial device participations, and\nheterogeneous data splits. Our experiments verify the effectiveness of our\nalgorithms.\n","authors":["Yifan Shi","Yuhui Zhang","Ziyue Huang","Xiaofeng Yang","Li Shen","Wei Chen","Xueqian Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16050v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12725v1","updated":"2024-04-19T09:08:44Z","published":"2024-04-19T09:08:44Z","title":"Separate in the Speech Chain: Cross-Modal Conditional Audio-Visual\n Target Speech Extraction","summary":" The integration of visual cues has revitalized the performance of the target\nspeech extraction task, elevating it to the forefront of the field.\nNevertheless, this multi-modal learning paradigm often encounters the challenge\nof modality imbalance. In audio-visual target speech extraction tasks, the\naudio modality tends to dominate, potentially overshadowing the importance of\nvisual guidance. To tackle this issue, we propose AVSepChain, drawing\ninspiration from the speech chain concept. 
Our approach partitions the\naudio-visual target speech extraction task into two stages: speech perception\nand speech production. In the speech perception stage, audio serves as the\ndominant modality, while visual information acts as the conditional modality.\nConversely, in the speech production stage, the roles are reversed. This\ntransformation of modality status aims to alleviate the problem of modality\nimbalance. Additionally, we introduce a contrastive semantic matching loss to\nensure that the semantic information conveyed by the generated speech aligns\nwith the semantic information conveyed by lip movements during the speech\nproduction stage. Through extensive experiments conducted on multiple benchmark\ndatasets for audio-visual target speech extraction, we showcase the superior\nperformance achieved by our proposed method.\n","authors":["Zhaoxi Mu","Xinyu Yang"],"pdf_url":"https://arxiv.org/pdf/2404.12725v1.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2404.12721v1","updated":"2024-04-19T09:01:58Z","published":"2024-04-19T09:01:58Z","title":"Generalized Few-Shot Meets Remote Sensing: Discovering Novel Classes in\n Land Cover Mapping via Hybrid Semantic Segmentation Framework","summary":" Land-cover mapping is one of the vital applications in Earth observation,\naiming at classifying each pixel's land-cover type of remote-sensing images. As\nnatural and human activities change the landscape, the land-cover map needs to\nbe rapidly updated. However, discovering newly appeared land-cover types in\nexisting classification systems is still a non-trivial task hindered by various\nscales of complex land objects and insufficient labeled data over a wide-span\ngeographic area. In this paper, we propose a generalized few-shot\nsegmentation-based framework, named SegLand, to update novel classes in\nhigh-resolution land-cover mapping. Specifically, the proposed framework is\ndesigned in three parts: (a) Data pre-processing: the base training set and the\nfew-shot support sets of novel classes are analyzed and augmented; (b) Hybrid\nsegmentation structure; Multiple base learners and a modified Projection onto\nOrthogonal Prototypes (POP) network are combined to enhance the base-class\nrecognition and to dig novel classes from insufficient labels data; (c)\nUltimate fusion: the semantic segmentation results of the base learners and POP\nnetwork are reasonably fused. The proposed framework has won first place in the\nleaderboard of the OpenEarthMap Land Cover Mapping Few-Shot Challenge.\nExperiments demonstrate the superiority of the framework for automatically\nupdating novel land-cover classes with limited labeled data.\n","authors":["Zhuohong Li","Fangxiao Lu","Jiaqi Zou","Lei Hu","Hongyan Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.12721v1.pdf","comment":"11 pages, 11 figures, accepted by CVPR 2024 L3D-IVU Workshop"},{"id":"http://arxiv.org/abs/2404.12720v1","updated":"2024-04-19T09:00:05Z","published":"2024-04-19T09:00:05Z","title":"PDF-MVQA: A Dataset for Multimodal Information Retrieval in PDF-based\n Visual Question Answering","summary":" Document Question Answering (QA) presents a challenge in understanding\nvisually-rich documents (VRD), particularly those dominated by lengthy textual\ncontent like research journal articles. Existing studies primarily focus on\nreal-world documents with sparse text, while challenges persist in\ncomprehending the hierarchical semantic relations among multiple pages to\nlocate multimodal components. 
To address this gap, we propose PDF-MVQA, which\nis tailored for research journal articles, encompassing multiple pages and\nmultimodal information retrieval. Unlike traditional machine reading\ncomprehension (MRC) tasks, our approach aims to retrieve entire paragraphs\ncontaining answers or visually rich document entities like tables and figures.\nOur contributions include the introduction of a comprehensive PDF Document VQA\ndataset, allowing the examination of semantically hierarchical layout\nstructures in text-dominant documents. We also present new VRD-QA frameworks\ndesigned to grasp textual contents and relations among document layouts\nsimultaneously, extending page-level understanding to the entire multi-page\ndocument. Through this work, we aim to enhance the capabilities of existing\nvision-and-language models in handling challenges posed by text-dominant\ndocuments in VRD-QA.\n","authors":["Yihao Ding","Kaixuan Ren","Jiabin Huang","Siwen Luo","Soyeon Caren Han"],"pdf_url":"https://arxiv.org/pdf/2404.12720v1.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2307.01004v2","updated":"2024-04-19T08:59:37Z","published":"2023-07-03T13:40:20Z","title":"Joint Coordinate Regression and Association For Multi-Person Pose\n Estimation, A Pure Neural Network Approach","summary":" We introduce a novel one-stage end-to-end multi-person 2D pose estimation\nalgorithm, known as Joint Coordinate Regression and Association (JCRA), that\nproduces human pose joints and associations without requiring any\npost-processing. The proposed algorithm is fast, accurate, effective, and\nsimple. The one-stage end-to-end network architecture significantly improves\nthe inference speed of JCRA. Meanwhile, we devised a symmetric network\nstructure for both the encoder and decoder, which ensures high accuracy in\nidentifying keypoints. It follows an architecture that directly outputs part\npositions via a transformer network, resulting in a significant improvement in\nperformance. Extensive experiments on the MS COCO and CrowdPose benchmarks\ndemonstrate that JCRA outperforms state-of-the-art approaches in both accuracy\nand efficiency. Moreover, JCRA demonstrates 69.2 mAP and is 78\\% faster at\ninference acceleration than previous state-of-the-art bottom-up algorithms. The\ncode for this algorithm will be publicly available.\n","authors":["Dongyang Yu","Yunshi Xie","Wangpeng An","Li Zhang","Yufeng Yao"],"pdf_url":"https://arxiv.org/pdf/2307.01004v2.pdf","comment":"This paper has been accepted by MMasia 2023 and is an oral\n presentation"},{"id":"http://arxiv.org/abs/2404.12718v1","updated":"2024-04-19T08:58:53Z","published":"2024-04-19T08:58:53Z","title":"Improving Prediction Accuracy of Semantic Segmentation Methods Using\n Convolutional Autoencoder Based Pre-processing Layers","summary":" In this paper, we propose a method to improve prediction accuracy of semantic\nsegmentation methods as follows: (1) construct a neural network that has\npre-processing layers based on a convolutional autoencoder ahead of a semantic\nsegmentation network, and (2) train the entire network initialized by the\nweights of the pre-trained autoencoder. We applied this method to the fully\nconvolutional network (FCN) and experimentally compared its prediction accuracy\non the cityscapes dataset. The Mean IoU of the proposed target model with the\nHe normal initialization is 18.7% higher than that of FCN with the He normal\ninitialization. 
In addition, those of the modified models of the target model\nare significantly higher than that of FCN with the He normal initialization.\nThe accuracy and loss curves during the training showed that these are\nresulting from the improvement of the generalization ability. All of these\nresults provide strong evidence that the proposed method is significantly\neffective in improving the prediction accuracy of FCN. The proposed method has\nthe following features: it is comparatively simple, whereas the effect on\nimproving the generalization ability and prediction accuracy of FCN is\nsignificant; the increase in the number of parameters by using it is very\nsmall, and that in the computation time is substantially large. In principle,\nthe proposed method can be applied to other semantic segmentation methods. For\nsemantic segmentation, at present, there is no effective way to improve the\nprediction accuracy of existing methods. None have published a method which is\nthe same as or similar to our method and none have used such a method in\npractice. Therefore, we believe that our method is useful in practice and\nworthy of being widely known and used.\n","authors":["Hisashi Shimodaira"],"pdf_url":"https://arxiv.org/pdf/2404.12718v1.pdf","comment":"13 pages, 8 figures, 7 tables"},{"id":"http://arxiv.org/abs/2312.04861v2","updated":"2024-04-19T08:55:34Z","published":"2023-12-08T06:31:19Z","title":"Exploring Radar Data Representations in Autonomous Driving: A\n Comprehensive Review","summary":" With the rapid advancements of sensor technology and deep learning,\nautonomous driving systems are providing safe and efficient access to\nintelligent vehicles as well as intelligent transportation. Among these\nequipped sensors, the radar sensor plays a crucial role in providing robust\nperception information in diverse environmental conditions. This review focuses\non exploring different radar data representations utilized in autonomous\ndriving systems. Firstly, we introduce the capabilities and limitations of the\nradar sensor by examining the working principles of radar perception and signal\nprocessing of radar measurements. Then, we delve into the generation process of\nfive radar representations, including the ADC signal, radar tensor, point\ncloud, grid map, and micro-Doppler signature. For each radar representation, we\nexamine the related datasets, methods, advantages and limitations. Furthermore,\nwe discuss the challenges faced in these data representations and propose\npotential research directions. Above all, this comprehensive review offers an\nin-depth insight into how these representations enhance autonomous system\ncapabilities, providing guidance for radar perception researchers. To\nfacilitate retrieval and comparison of different data representations, datasets\nand methods, we provide an interactive website at\nhttps://radar-camera-fusion.github.io/radar.\n","authors":["Shanliang Yao","Runwei Guan","Zitian Peng","Chenhang Xu","Yilu Shi","Weiping Ding","Eng Gee Lim","Yong Yue","Hyungjoon Seo","Ka Lok Man","Jieming Ma","Xiaohui Zhu","Yutao Yue"],"pdf_url":"https://arxiv.org/pdf/2312.04861v2.pdf","comment":"24 pages, 10 figures, 5 tables. 
arXiv admin note: text overlap with\n arXiv:2304.10410"},{"id":"http://arxiv.org/abs/2311.15727v2","updated":"2024-04-19T08:51:58Z","published":"2023-11-27T11:24:25Z","title":"MARIS: Referring Image Segmentation via Mutual-Aware Attention Features","summary":" Referring image segmentation (RIS) aims to segment a particular region based\non a language expression prompt. Existing methods incorporate linguistic\nfeatures into visual features and obtain multi-modal features for mask\ndecoding. However, these methods may segment the visually salient entity\ninstead of the correct referring region, as the multi-modal features are\ndominated by the abundant visual context. In this paper, we propose MARIS, a\nreferring image segmentation method that leverages the Segment Anything Model\n(SAM) and introduces a mutual-aware attention mechanism to enhance the\ncross-modal fusion via two parallel branches. Specifically, our mutual-aware\nattention mechanism consists of Vision-Guided Attention and Language-Guided\nAttention, which bidirectionally model the relationship between visual and\nlinguistic features. Correspondingly, we design a Mask Decoder to enable\nexplicit linguistic guidance for more consistent segmentation with the language\nexpression. To this end, a multi-modal query token is proposed to integrate\nlinguistic information and interact with visual information simultaneously.\nExtensive experiments on three benchmark datasets show that our method\noutperforms the state-of-the-art RIS methods. Our code will be publicly\navailable.\n","authors":["Mengxi Zhang","Yiming Liu","Xiangjun Yin","Huanjing Yue","Jingyu Yang"],"pdf_url":"https://arxiv.org/pdf/2311.15727v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12712v1","updated":"2024-04-19T08:46:33Z","published":"2024-04-19T08:46:33Z","title":"uTRAND: Unsupervised Anomaly Detection in Traffic Trajectories","summary":" Deep learning-based approaches have achieved significant improvements on\npublic video anomaly datasets, but often do not perform well in real-world\napplications. This paper addresses two issues: the lack of labeled data and the\ndifficulty of explaining the predictions of a neural network. To this end, we\npresent a framework called uTRAND, that shifts the problem of anomalous\ntrajectory prediction from the pixel space to a semantic-topological domain.\nThe framework detects and tracks all types of traffic agents in bird's-eye-view\nvideos of traffic cameras mounted at an intersection. By conceptualizing the\nintersection as a patch-based graph, it is shown that the framework learns and\nmodels the normal behaviour of traffic agents without costly manual labeling.\nFurthermore, uTRAND allows to formulate simple rules to classify anomalous\ntrajectories in a way suited for human interpretation. We show that uTRAND\noutperforms other state-of-the-art approaches on a dataset of anomalous\ntrajectories collected in a real-world setting, while producing explainable\ndetection results.\n","authors":["Giacomo D'Amicantonio","Egor Bondarau","Peter H. N. de With"],"pdf_url":"https://arxiv.org/pdf/2404.12712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12711v1","updated":"2024-04-19T08:40:52Z","published":"2024-04-19T08:40:52Z","title":"Dynamic Temperature Knowledge Distillation","summary":" Temperature plays a pivotal role in moderating label softness in the realm of\nknowledge distillation (KD). 
Traditional approaches often employ a static\ntemperature throughout the KD process, which fails to address the nuanced\ncomplexities of samples with varying levels of difficulty and overlooks the\ndistinct capabilities of different teacher-student pairings. This leads to a\nless-than-ideal transfer of knowledge. To improve the process of knowledge\npropagation, we propose Dynamic Temperature Knowledge Distillation (DTKD),\nwhich introduces a dynamic, cooperative temperature control for both teacher\nand student models simultaneously within each training iteration. In\nparticular, we propose \"\\textbf{sharpness}\" as a metric to quantify the\nsmoothness of a model's output distribution. By minimizing the sharpness\ndifference between the teacher and the student, we can derive sample-specific\ntemperatures for them respectively. Extensive experiments on CIFAR-100 and\nImageNet-2012 demonstrate that DTKD performs comparably to leading KD\ntechniques, with added robustness in Target Class KD and None-target Class KD\nscenarios. The code is available at https://github.com/JinYu1998/DTKD.\n","authors":["Yukang Wei","Yu Bai"],"pdf_url":"https://arxiv.org/pdf/2404.12711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07739v3","updated":"2024-04-19T08:36:25Z","published":"2024-02-12T15:57:31Z","title":"Task-conditioned adaptation of visual features in multi-task policy\n learning","summary":" Successfully addressing a wide variety of tasks is a core ability of\nautonomous agents, requiring flexibly adapting the underlying decision-making\nstrategies and, as we argue in this work, also adapting the perception modules.\nAn analogical argument would be the human visual system, which uses top-down\nsignals to focus attention determined by the current task. Similarly, we adapt\npre-trained large vision models conditioned on specific downstream tasks in the\ncontext of multi-task policy learning. We introduce task-conditioned adapters\nthat do not require finetuning any pre-trained weights, combined with a single\npolicy trained with behavior cloning and capable of addressing multiple tasks.\nWe condition the visual adapters on task embeddings, which can be selected at\ninference if the task is known, or alternatively inferred from a set of example\ndemonstrations. To this end, we propose a new optimization-based estimator. We\nevaluate the method on a wide variety of tasks from the CortexBench benchmark\nand show that, compared to existing work, it can be addressed with a single\npolicy. In particular, we demonstrate that adapting visual features is a key\ndesign choice and that the method generalizes to unseen tasks given a few\ndemonstrations.\n","authors":["Pierre Marza","Laetitia Matignon","Olivier Simonin","Christian Wolf"],"pdf_url":"https://arxiv.org/pdf/2402.07739v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09084v3","updated":"2024-04-19T08:24:06Z","published":"2023-08-17T16:23:52Z","title":"MovePose: A High-performance Human Pose Estimation Algorithm on Mobile\n and Edge Devices","summary":" We present MovePose, an optimized lightweight convolutional neural network\ndesigned specifically for real-time body pose estimation on CPU-based mobile\ndevices. The current solutions do not provide satisfactory accuracy and speed\nfor human posture estimation, and MovePose addresses this gap. It aims to\nmaintain real-time performance while improving the accuracy of human posture\nestimation for mobile devices. 
Our MovePose algorithm has attained a Mean\nAverage Precision (mAP) score of 68.0 on the COCO \cite{cocodata} validation\ndataset. The MovePose algorithm displayed efficiency with a performance of 69+\nframes per second (fps) when run on an Intel i9-10920x CPU. Additionally, it\nshowcased an increased performance of 452+ fps on an NVIDIA RTX3090 GPU. On an\nAndroid phone equipped with a Snapdragon 8 + 4G processor, the fps reached\nabove 11. To enhance accuracy, we incorporated three techniques: deconvolution,\nlarge kernel convolution, and coordinate classification methods. Compared to\nbasic upsampling, deconvolution is trainable, improves model capacity, and\nenhances the receptive field. Large kernel convolution strengthens these\nproperties at a decreased computational cost. In summary, MovePose provides\nhigh accuracy and real-time performance, making it a potential tool for a\nvariety of applications, including those focused on mobile-side human posture\nestimation. The code and models for this algorithm will be made publicly\naccessible.\n","authors":["Dongyang Yu","Haoyue Zhang","Ruisheng Zhao","Guoqi Chen","Wangpeng An","Yanhong Yang"],"pdf_url":"https://arxiv.org/pdf/2308.09084v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12702v1","updated":"2024-04-19T08:20:18Z","published":"2024-04-19T08:20:18Z","title":"Modeling Multi-Granularity Context Information Flow for Pavement Crack\n Detection","summary":" Crack detection has become an indispensable, interesting yet challenging task\nin the computer vision community. Specifically, pavement cracks have a highly\ncomplex spatial structure, a low contrasting background and a weak spatial\ncontinuity, posing a significant challenge to an effective crack detection\nmethod. In this paper, we address these problems from a view that utilizes\ncontexts of the cracks and propose an end-to-end deep learning method to model\nthe context information flow. To precisely localize cracks in an image, it is\ncritical to effectively extract and aggregate multi-granularity context,\nincluding the fine-grained local context around the cracks (in spatial-level)\nand the coarse-grained semantics (in segment-level). Concretely, in\nConvolutional Neural Network (CNN), low-level features extracted by the shallow\nlayers represent the local information, while the deep layers extract the\nsemantic features. Additionally, a second main insight in this work is that the\nsemantic context should be a guidance to the local context features. Based on the above\ninsights, in the proposed method we first apply the dilated convolution as the\nbackbone feature extractor to model local context, and then build a context\nguidance module to leverage semantic context to guide local feature extraction\nat multiple stages. To handle label alignment between stages, we apply the\nMultiple Instance Learning (MIL) strategy to align the high-level features to\nthe low-level ones in the stage-wise context flow. In addition, compared with\nexisting public crack datasets, to the best of our knowledge, we release the largest,\nmost complex and most challenging Bitumen Pavement Crack (BPC) dataset. 
The\nexperimental results on the three crack datasets demonstrate that the proposed\nmethod performs well and outperforms the current state-of-the-art methods.\n","authors":["Junbiao Pang","Baocheng Xiong","Jiaqi Wu"],"pdf_url":"https://arxiv.org/pdf/2404.12702v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12694v1","updated":"2024-04-19T07:50:13Z","published":"2024-04-19T07:50:13Z","title":"ESC: Evolutionary Stitched Camera Calibration in the Wild","summary":" This work introduces a novel end-to-end approach for estimating extrinsic\nparameters of cameras in multi-camera setups on real-life sports fields. We\nidentify the source of significant calibration errors in multi-camera\nenvironments and address the limitations of existing calibration methods,\nparticularly the disparity between theoretical models and actual sports field\ncharacteristics. We propose the Evolutionary Stitched Camera calibration (ESC)\nalgorithm to bridge this gap. It consists of image segmentation followed by\nevolutionary optimization of a novel loss function, providing a unified and\naccurate multi-camera calibration solution with high visual fidelity. The\noutcome allows the creation of virtual stitched views from multiple video\nsources, being as important for practical applications as numerical accuracy.\nWe demonstrate the superior performance of our approach compared to\nstate-of-the-art methods across diverse real-life football fields with varying\nphysical characteristics.\n","authors":["Grzegorz Rypeść","Grzegorz Kurzejamski"],"pdf_url":"https://arxiv.org/pdf/2404.12694v1.pdf","comment":"Accepted for IEEE CEC 2024"},{"id":"http://arxiv.org/abs/2404.12693v1","updated":"2024-04-19T07:47:23Z","published":"2024-04-19T07:47:23Z","title":"Improving Chinese Character Representation with Formation Tree","summary":" Learning effective representations for Chinese characters presents unique\nchallenges, primarily due to the vast number of characters and their continuous\ngrowth, which requires models to handle an expanding category space.\nAdditionally, the inherent sparsity of character usage complicates the\ngeneralization of learned representations. Prior research has explored\nradical-based sequences to overcome these issues, achieving progress in\nrecognizing unseen characters. However, these approaches fail to fully exploit\nthe inherent tree structure of such sequences. To address these limitations and\nleverage established data properties, we propose Formation Tree-CLIP (FT-CLIP).\nThis model utilizes formation trees to represent characters and incorporates a\ndedicated tree encoder, significantly improving performance in both seen and\nunseen character recognition tasks. We further introduce masking for both\ncharacter images and tree nodes, enabling efficient and effective training.\nThis approach accelerates training significantly (by a factor of 2 or more)\nwhile enhancing accuracy. 
Extensive experiments show that processing characters\nthrough formation trees aligns better with their inherent properties than\ndirect sequential methods, significantly enhancing the generality and usability\nof the representations.\n","authors":["Yang Hong","Yinfei Li","Xiaojun Qiao","Rui Li","Junsong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.12693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00394v2","updated":"2024-04-19T07:43:52Z","published":"2024-03-01T09:29:41Z","title":"List-Mode PET Image Reconstruction Using Dykstra-Like Splitting","summary":" Convergence of the block iterative method in image reconstruction for\npositron emission tomography (PET) requires careful control of relaxation\nparameters, which is a challenging task. The automatic determination of\nrelaxation parameters for list-mode reconstructions also remains challenging.\nTherefore, a different approach would be desirable. In this study, we propose a\nlist-mode maximum likelihood Dykstra-like splitting PET reconstruction\n(LM-MLDS). LM-MLDS converges the list-mode block iterative method by adding the\ndistance from an initial image as a penalty term into an objective function.\nLM-MLDS takes a two-step approach because its performance depends on the\nquality of the initial image. The first step uses a uniform image as the\ninitial image, and then the second step uses a reconstructed image after one\nmain iteration as the initial image. In a simulation study, LM-MLDS provided a\nbetter tradeoff curve between noise and contrast than the other methods. In a\nclinical study, LM-MLDS removed the false hotspots at the edge of the axial\nfield of view and improved the image quality of slices covering the top of the\nhead to the cerebellum. List-mode proximal splitting reconstruction is useful\nnot only for optimizing nondifferentiable functions but also for converging block\niterative methods without controlling relaxation parameters.\n","authors":["Kibo Ote","Fumio Hashimoto","Yuya Onishi","Yasuomi Ouchi"],"pdf_url":"https://arxiv.org/pdf/2403.00394v2.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.12680v1","updated":"2024-04-19T07:30:36Z","published":"2024-04-19T07:30:36Z","title":"VoxAtnNet: A 3D Point Clouds Convolutional Neural Network for\n Generalizable Face Presentation Attack Detection","summary":" Facial biometrics are an essential component of smartphones to ensure\nreliable and trustworthy authentication. However, face biometric systems are\nvulnerable to Presentation Attacks (PAs), and the availability of more\nsophisticated presentation attack instruments such as 3D silicone face masks\nwill allow attackers to deceive face recognition systems easily. In this work,\nwe propose a novel Presentation Attack Detection (PAD) algorithm based on 3D\npoint clouds captured using the frontal camera of a smartphone to detect\npresentation attacks. The proposed PAD algorithm, VoxAtnNet, processes 3D point\nclouds to obtain voxelization to preserve the spatial structure. Then, the\nvoxelized 3D samples were trained using the novel convolutional attention\nnetwork to detect PAs on the smartphone. Extensive experiments were carried out\non the newly constructed 3D face point cloud dataset comprising bona fide and\ntwo different 3D PAIs (3D silicone face mask and wrap photo mask), resulting in\n3480 samples. The performance of the proposed method was compared with existing\nmethods to benchmark the detection performance using three different evaluation\nprotocols. 
The experimental results demonstrate the improved performance of the\nproposed method in detecting both known and unknown face presentation attacks.\n","authors":["Raghavendra Ramachandra","Narayan Vetrekar","Sushma Venkatesh","Savita Nageshker","Jag Mohan Singh","R. S. Gad"],"pdf_url":"https://arxiv.org/pdf/2404.12680v1.pdf","comment":"Accepted in 2024 18th International Conference on Automatic Face and\n Gesture Recognition (FG)"},{"id":"http://arxiv.org/abs/2404.12679v1","updated":"2024-04-19T07:26:30Z","published":"2024-04-19T07:26:30Z","title":"MLSD-GAN -- Generating Strong High Quality Face Morphing Attacks using\n Latent Semantic Disentanglement","summary":" Face-morphing attacks are a growing concern for biometric researchers, as\nthey can be used to fool face recognition systems (FRS). These attacks can be\ngenerated at the image level (supervised) or representation level\n(unsupervised). Previous unsupervised morphing attacks have relied on\ngenerative adversarial networks (GANs). More recently, researchers have used\nlinear interpolation of StyleGAN-encoded images to generate morphing attacks.\nIn this paper, we propose a new method for generating high-quality morphing\nattacks using StyleGAN disentanglement. Our approach, called MLSD-GAN,\nspherically interpolates the disentangled latents to produce realistic and\ndiverse morphing attacks. We evaluate the vulnerability of MLSD-GAN on two\ndeep-learning-based FRS techniques. The results show that MLSD-GAN poses a\nsignificant threat to FRS, as it can generate morphing attacks that are highly\neffective at fooling these systems.\n","authors":["Aravinda Reddy PN","Raghavendra Ramachandra","Krothapalli Sreenivasa Rao","Pabitra Mitra"],"pdf_url":"https://arxiv.org/pdf/2404.12679v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12678v1","updated":"2024-04-19T07:24:32Z","published":"2024-04-19T07:24:32Z","title":"Exploring Interactive Semantic Alignment for Efficient HOI Detection\n with Vision-language Model","summary":" Human-Object Interaction (HOI) detection aims to localize human-object pairs\nand comprehend their interactions. Recently, two-stage transformer-based\nmethods have demonstrated competitive performance. However, these methods\nfrequently focus on object appearance features and ignore global contextual\ninformation. Besides, vision-language model CLIP which effectively aligns\nvisual and text embeddings has shown great potential in zero-shot HOI\ndetection. Based on the former facts, We introduce a novel HOI detector named\nISA-HOI, which extensively leverages knowledge from CLIP, aligning interactive\nsemantics between visual and textual features. We first extract global context\nof image and local features of object to Improve interaction Features in images\n(IF). On the other hand, we propose a Verb Semantic Improvement (VSI) module to\nenhance textual features of verb labels via cross-modal fusion. 
Ultimately, our\nmethod achieves competitive results on the HICO-DET and V-COCO benchmarks with\nmuch fewer training epochs, and outperforms the state-of-the-art under\nzero-shot settings.\n","authors":["Jihao Dong","Renjie Pan","Hua Yang"],"pdf_url":"https://arxiv.org/pdf/2404.12678v1.pdf","comment":"Accepted by ICME2024"},{"id":"http://arxiv.org/abs/2404.12667v1","updated":"2024-04-19T07:07:36Z","published":"2024-04-19T07:07:36Z","title":"Detecting Out-Of-Distribution Earth Observation Images with Diffusion\n Models","summary":" Earth Observation imagery can capture rare and unusual events, such as\ndisasters and major landscape changes, whose visual appearance contrasts with\nthe usual observations. Deep models trained on common remote sensing data will\noutput drastically different features for these out-of-distribution samples,\ncompared to those closer to their training dataset. Detecting them could\ntherefore help anticipate changes in the observations, either geographical or\nenvironmental. In this work, we show that the reconstruction error of diffusion\nmodels can effectively serve as unsupervised out-of-distribution detectors for\nremote sensing images, using them as a plausibility score. Moreover, we\nintroduce ODEED, a novel reconstruction-based scorer using the probability-flow\nODE of diffusion models. We validate it experimentally on SpaceNet 8 with\nvarious scenarios, such as classical OOD detection with geographical shift and\nnear-OOD setups: pre/post-flood and non-flooded/flooded image recognition. We\nshow that our ODEED scorer significantly outperforms other diffusion-based and\ndiscriminative baselines on the more challenging near-OOD scenarios of flood\nimage detection, where OOD images are close to the distribution tail. We aim to\npave the way towards better use of generative models for anomaly detection in\nremote sensing.\n","authors":["Georges Le Bellier","Nicolas Audebert"],"pdf_url":"https://arxiv.org/pdf/2404.12667v1.pdf","comment":"EARTHVISION 2024 IEEE/CVF CVPR Workshop. Large Scale Computer Vision\n for Remote Sensing Imagery, Jun 2024, Seattle, United States"},{"id":"http://arxiv.org/abs/2403.08511v2","updated":"2024-04-19T06:48:52Z","published":"2024-03-13T13:16:26Z","title":"A Multimodal Fusion Network For Student Emotion Recognition Based on\n Transformer and Tensor Product","summary":" This paper introduces a new multi-modal model based on the Transformer\narchitecture and tensor product fusion strategy, combining BERT's text vectors\nand ViT's image vectors to classify students' psychological conditions, with an\naccuracy of 93.65%. The purpose of the study is to accurately analyze the\nmental health status of students from various data sources. This paper\ndiscusses modal fusion methods, including early, late and intermediate fusion,\nto overcome the challenges of integrating multi-modal information. Ablation\nstudies compare the performance of different models and fusion techniques,\nshowing that the proposed model outperforms existing methods such as CLIP and\nViLBERT in terms of accuracy and inference speed. 
Conclusions indicate that\nwhile this model has significant advantages in emotion recognition, its\npotential to incorporate other data modalities provides areas for future\nresearch.\n","authors":["Ao Xiang","Zongqing Qi","Han Wang","Qin Yang","Danqing Ma"],"pdf_url":"https://arxiv.org/pdf/2403.08511v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09530v2","updated":"2024-04-19T06:44:18Z","published":"2024-04-15T07:50:15Z","title":"RanLayNet: A Dataset for Document Layout Detection used for Domain\n Adaptation and Generalization","summary":" Large ground-truth datasets and recent advances in deep learning techniques\nhave been useful for layout detection. However, because of the restricted\nlayout diversity of these datasets, training on them requires a sizable number\nof annotated instances, which is both expensive and time-consuming. As a\nresult, differences between the source and target domains may significantly\nimpact how well these models function. To solve this problem, domain adaptation\napproaches have been developed that use a small quantity of labeled data to\nadjust the model to the target domain. In this research, we introduced a\nsynthetic document dataset called RanLayNet, enriched with automatically\nassigned labels denoting spatial positions, ranges, and types of layout\nelements. The primary aim of this endeavor is to develop a versatile dataset\ncapable of training models with robustness and adaptability to diverse document\nformats. Through empirical experimentation, we demonstrate that a deep layout\nidentification model trained on our dataset exhibits enhanced performance\ncompared to a model trained solely on actual documents. Moreover, we conduct a\ncomparative analysis by fine-tuning inference models using both PubLayNet and\nIIIT-AR-13K datasets on the Doclaynet dataset. Our findings emphasize that\nmodels enriched with our dataset are optimal for tasks such as achieving 0.398\nand 0.588 mAP95 score in the scientific document domain for the TABLE class.\n","authors":["Avinash Anand","Raj Jaiswal","Mohit Gupta","Siddhesh S Bangar","Pijush Bhuyan","Naman Lal","Rajeev Singh","Ritika Jha","Rajiv Ratn Shah","Shin'ichi Satoh"],"pdf_url":"https://arxiv.org/pdf/2404.09530v2.pdf","comment":"8 pages, 6 figures, MMAsia 2023 Proceedings of the 5th ACM\n International Conference on Multimedia in Asia"},{"id":"http://arxiv.org/abs/2404.12652v1","updated":"2024-04-19T06:41:32Z","published":"2024-04-19T06:41:32Z","title":"Pre-trained Vision-Language Models Learn Discoverable Visual Concepts","summary":" Do vision-language models (VLMs) pre-trained to caption an image of a\n\"durian\" learn visual concepts such as \"brown\" (color) and \"spiky\" (texture) at\nthe same time? We aim to answer this question as visual concepts learned \"for\nfree\" would enable wide applications such as neuro-symbolic reasoning or\nhuman-interpretable object classification. We assume that the visual concepts,\nif captured by pre-trained VLMs, can be extracted by their vision-language\ninterface with text-based concept prompts. We observe that recent works\nprompting VLMs with concepts often differ in their strategies to define and\nevaluate the visual concepts, leading to conflicting conclusions. We propose a\nnew concept definition strategy based on two observations: First, certain\nconcept prompts include shortcuts that recognize correct concepts for wrong\nreasons; Second, multimodal information (e.g. 
visual discriminativeness, and\ntextual knowledge) should be leveraged when selecting the concepts. Our\nproposed concept discovery and learning (CDL) framework is thus designed to\nidentify a diverse list of generic visual concepts (e.g. \"spiky\" as opposed to\n\"spiky durian\"), which are ranked and selected based on visual and language\nmutual information. We carefully design quantitative and human evaluations of\nthe discovered concepts on six diverse visual recognition datasets, which\nconfirm that pre-trained VLMs do learn visual concepts that provide accurate\nand thorough descriptions for the recognized objects. All code and models are\npublicly released.\n","authors":["Yuan Zang","Tian Yun","Hao Tan","Trung Bui","Chen Sun"],"pdf_url":"https://arxiv.org/pdf/2404.12652v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12650v1","updated":"2024-04-19T06:32:21Z","published":"2024-04-19T06:32:21Z","title":"F2FLDM: Latent Diffusion Models with Histopathology Pre-Trained\n Embeddings for Unpaired Frozen Section to FFPE Translation","summary":" The Frozen Section (FS) technique is a rapid and efficient method, taking\nonly 15-30 minutes to prepare slides for pathologists' evaluation during\nsurgery, enabling immediate decisions on further surgical interventions.\nHowever, FS process often introduces artifacts and distortions like folds and\nice-crystal effects. In contrast, these artifacts and distortions are absent in\nthe higher-quality formalin-fixed paraffin-embedded (FFPE) slides, which\nrequire 2-3 days to prepare. While Generative Adversarial Network (GAN)-based\nmethods have been used to translate FS to FFPE images (F2F), they may leave\nmorphological inaccuracies with remaining FS artifacts or introduce new\nartifacts, reducing the quality of these translations for clinical assessments.\nIn this study, we benchmark recent generative models, focusing on GANs and\nLatent Diffusion Models (LDMs), to overcome these limitations. We introduce a\nnovel approach that combines LDMs with Histopathology Pre-Trained Embeddings to\nenhance restoration of FS images. Our framework leverages LDMs conditioned by\nboth text and pre-trained embeddings to learn meaningful features of FS and\nFFPE histopathology images. Through diffusion and denoising techniques, our\napproach not only preserves essential diagnostic attributes like color staining\nand tissue morphology but also proposes an embedding translation mechanism to\nbetter predict the targeted FFPE representation of input FS images. As a\nresult, this work achieves a significant improvement in classification\nperformance, with the Area Under the Curve rising from 81.99% to 94.64%,\naccompanied by an advantageous CaseFD. This work establishes a new benchmark\nfor FS to FFPE image translation quality, promising enhanced reliability and\naccuracy in histopathology FS image analysis. Our work is available at\nhttps://minhmanho.github.io/f2f_ldm/.\n","authors":["Man M. Ho","Shikha Dubey","Yosep Chong","Beatrice Knudsen","Tolga Tasdizen"],"pdf_url":"https://arxiv.org/pdf/2404.12650v1.pdf","comment":"Preprint. 
Our work is available at\n https://minhmanho.github.io/f2f_ldm/"},{"id":"http://arxiv.org/abs/2404.10305v2","updated":"2024-04-19T06:23:20Z","published":"2024-04-16T06:24:53Z","title":"TC-OCR: TableCraft OCR for Efficient Detection & Recognition of Table\n Structure & Content","summary":" The automatic recognition of tabular data in document images presents a\nsignificant challenge due to the diverse range of table styles and complex\nstructures. Tables offer valuable content representation, enhancing the\npredictive capabilities of various systems such as search engines and Knowledge\nGraphs. Addressing the two main problems, namely table detection (TD) and table\nstructure recognition (TSR), has traditionally been approached independently.\nIn this research, we propose an end-to-end pipeline that integrates deep\nlearning models, including DETR, CascadeTabNet, and PP OCR v2, to achieve\ncomprehensive image-based table recognition. This integrated approach\neffectively handles diverse table styles, complex structures, and image\ndistortions, resulting in improved accuracy and efficiency compared to existing\nmethods like Table Transformers. Our system achieves simultaneous table\ndetection (TD), table structure recognition (TSR), and table content\nrecognition (TCR), preserving table structures and accurately extracting\ntabular data from document images. The integration of multiple models addresses\nthe intricacies of table recognition, making our approach a promising solution\nfor image-based table understanding, data extraction, and information retrieval\napplications. Our proposed approach achieves an IOU of 0.96 and an OCR Accuracy\nof 78%, showcasing a remarkable improvement of approximately 25% in the OCR\nAccuracy compared to the previous Table Transformer approach.\n","authors":["Avinash Anand","Raj Jaiswal","Pijush Bhuyan","Mohit Gupta","Siddhesh Bangar","Md. Modassir Imam","Rajiv Ratn Shah","Shin'ichi Satoh"],"pdf_url":"https://arxiv.org/pdf/2404.10305v2.pdf","comment":"8 pages, 2 figures, Workshop of 1st MMIR Deep Multimodal Learning for\n Information Retrieval"},{"id":"http://arxiv.org/abs/2404.06883v2","updated":"2024-04-19T06:07:22Z","published":"2024-04-10T10:13:37Z","title":"Research on Detection of Floating Objects in River and Lake Based on AI\n Intelligent Image Recognition","summary":" With the rapid advancement of artificial intelligence technology, AI-enabled\nimage recognition has emerged as a potent tool for addressing challenges in\ntraditional environmental monitoring. This study focuses on the detection of\nfloating objects in river and lake environments, exploring an innovative\napproach based on deep learning. By intricately analyzing the technical\npathways for detecting static and dynamic features and considering the\ncharacteristics of river and lake debris, a comprehensive image acquisition and\nprocessing workflow has been developed. The study highlights the application\nand performance comparison of three mainstream deep learning models -SSD,\nFaster-RCNN, and YOLOv5- in debris identification. Additionally, a detection\nsystem for floating objects has been designed and implemented, encompassing\nboth hardware platform construction and software framework development. 
Through\nrigorous experimental validation, the proposed system has demonstrated its\nability to significantly enhance the accuracy and efficiency of debris\ndetection, thus offering a new technological avenue for water quality\nmonitoring in rivers and lakes\n","authors":["Jingyu Zhang","Ao Xiang","Yu Cheng","Qin Yang","Liyang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06883v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07918v3","updated":"2024-04-19T05:55:00Z","published":"2023-09-14T17:59:49Z","title":"Unified Human-Scene Interaction via Prompted Chain-of-Contacts","summary":" Human-Scene Interaction (HSI) is a vital component of fields like embodied AI\nand virtual reality. Despite advancements in motion quality and physical\nplausibility, two pivotal factors, versatile interaction control and the\ndevelopment of a user-friendly interface, require further exploration before\nthe practical application of HSI. This paper presents a unified HSI framework,\nUniHSI, which supports unified control of diverse interactions through language\ncommands. This framework is built upon the definition of interaction as Chain\nof Contacts (CoC): steps of human joint-object part pairs, which is inspired by\nthe strong correlation between interaction types and human-object contact\nregions. Based on the definition, UniHSI constitutes a Large Language Model\n(LLM) Planner to translate language prompts into task plans in the form of CoC,\nand a Unified Controller that turns CoC into uniform task execution. To\nfacilitate training and evaluation, we collect a new dataset named ScenePlan\nthat encompasses thousands of task plans generated by LLMs based on diverse\nscenarios. Comprehensive experiments demonstrate the effectiveness of our\nframework in versatile task execution and generalizability to real scanned\nscenes. The project page is at https://github.com/OpenRobotLab/UniHSI .\n","authors":["Zeqi Xiao","Tai Wang","Jingbo Wang","Jinkun Cao","Wenwei Zhang","Bo Dai","Dahua Lin","Jiangmiao Pang"],"pdf_url":"https://arxiv.org/pdf/2309.07918v3.pdf","comment":"A unified Human-Scene Interaction framework that supports versatile\n interactions through language commands.Project URL:\n https://xizaoqu.github.io/unihsi/ . Code:\n https://github.com/OpenRobotLab/UniHSI"},{"id":"http://arxiv.org/abs/2404.12642v1","updated":"2024-04-19T05:48:09Z","published":"2024-04-19T05:48:09Z","title":"Cooperative Sentiment Agents for Multimodal Sentiment Analysis","summary":" In this paper, we propose a new Multimodal Representation Learning (MRL)\nmethod for Multimodal Sentiment Analysis (MSA), which facilitates the adaptive\ninteraction between modalities through Cooperative Sentiment Agents, named\nCo-SA. Co-SA comprises two critical components: the Sentiment Agents\nEstablishment (SAE) phase and the Sentiment Agents Cooperation (SAC) phase.\nDuring the SAE phase, each sentiment agent deals with an unimodal signal and\nhighlights explicit dynamic sentiment variations within the modality via the\nModality-Sentiment Disentanglement (MSD) and Deep Phase Space Reconstruction\n(DPSR) modules. Subsequently, in the SAC phase, Co-SA meticulously designs\ntask-specific interaction mechanisms for sentiment agents so that coordinating\nmultimodal signals to learn the joint representation. Specifically, Co-SA\nequips an independent policy model for each sentiment agent that captures\nsignificant properties within the modality. These policies are optimized\nmutually through the unified reward adaptive to downstream tasks. 
Benefitting\nfrom the rewarding mechanism, Co-SA transcends the limitation of pre-defined\nfusion modes and adaptively captures unimodal properties for MRL in the\nmultimodal interaction setting. To demonstrate the effectiveness of Co-SA, we\napply it to address Multimodal Sentiment Analysis (MSA) and Multimodal Emotion\nRecognition (MER) tasks. Our comprehensive experimental results demonstrate\nthat Co-SA excels at discovering diverse cross-modal features, encompassing\nboth common and complementary aspects. The code can be available at\nhttps://github.com/smwanghhh/Co-SA.\n","authors":["Shanmin Wang","Hui Shuai","Qingshan Liu","Fei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03781v3","updated":"2024-04-19T05:45:25Z","published":"2023-12-06T09:39:38Z","title":"Lite-Mind: Towards Efficient and Robust Brain Representation Network","summary":" The limited data availability and the low signal-to-noise ratio of fMRI\nsignals lead to the challenging task of fMRI-to-image retrieval.\nState-of-the-art MindEye remarkably improves fMRI-to-image retrieval\nperformance by leveraging a large model, i.e., a 996M MLP Backbone per subject,\nto align fMRI embeddings to the final hidden layer of CLIP's Vision Transformer\n(ViT). However, significant individual variations exist among subjects, even\nunder identical experimental setups, mandating the training of large\nsubject-specific models. The substantial parameters pose significant challenges\nin deploying fMRI decoding on practical devices. To this end, we propose\nLite-Mind, a lightweight, efficient, and robust brain representation learning\nparadigm based on Discrete Fourier Transform (DFT), which efficiently aligns\nfMRI voxels to fine-grained information of CLIP. We elaborately design a DFT\nbackbone with Spectrum Compression and Frequency Projector modules to learn\ninformative and robust voxel embeddings. Our experiments demonstrate that\nLite-Mind achieves an impressive 94.6% fMRI-to-image retrieval accuracy on the\nNSD dataset for Subject 1, with 98.7% fewer parameters than MindEye. Lite-Mind\nis also proven to be able to be migrated to smaller fMRI datasets and\nestablishes a new state-of-the-art for zero-shot classification on the GOD\ndataset.\n","authors":["Zixuan Gong","Qi Zhang","Guangyin Bao","Lei Zhu","Yu Zhang","Ke Liu","Liang Hu","Duoqian Miao"],"pdf_url":"https://arxiv.org/pdf/2312.03781v3.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2404.12635v1","updated":"2024-04-19T05:32:37Z","published":"2024-04-19T05:32:37Z","title":"AED-PADA:Improving Generalizability of Adversarial Example Detection via\n Principal Adversarial Domain Adaptation","summary":" Adversarial example detection, which can be conveniently applied in many\nscenarios, is important in the area of adversarial defense. Unfortunately,\nexisting detection methods suffer from poor generalization performance, because\ntheir training process usually relies on the examples generated from a single\nknown adversarial attack and there exists a large discrepancy between the\ntraining and unseen testing adversarial examples. To address this issue, we\npropose a novel method, named Adversarial Example Detection via Principal\nAdversarial Domain Adaptation (AED-PADA). Specifically, our approach identifies\nthe Principal Adversarial Domains (PADs), i.e., a combination of features of\nthe adversarial examples from different attacks, which possesses large coverage\nof the entire adversarial feature space. 
Then, we pioneer the exploitation of\nmulti-source domain adaptation in adversarial example detection with PADs as\nsource domains. Experiments demonstrate the superior generalization ability of\nour proposed AED-PADA. Note that this superiority is particularly achieved in\nchallenging scenarios characterized by employing the minimal magnitude\nconstraint for the perturbations.\n","authors":["Heqi Peng","Yunhong Wang","Ruijie Yang","Beichen Li","Rui Wang","Yuanfang Guo"],"pdf_url":"https://arxiv.org/pdf/2404.12635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12634v1","updated":"2024-04-19T05:31:37Z","published":"2024-04-19T05:31:37Z","title":"Transformer-Based Classification Outcome Prediction for Multimodal\n Stroke Treatment","summary":" This study proposes a multi-modal fusion framework Multitrans based on the\nTransformer architecture and self-attention mechanism. This architecture\ncombines the study of non-contrast computed tomography (NCCT) images and\ndischarge diagnosis reports of patients undergoing stroke treatment, using a\nvariety of methods based on the Transformer architecture to predict\nfunctional outcomes of stroke treatment. The results show that the performance\nof single-modal text classification is significantly better than single-modal\nimage classification, but the effect of multi-modal combination is better than\nany single modality. Although the Transformer model performs worse on\nimaging data alone, when combined with clinical meta-diagnostic information, both can\nlearn better complementary information and make good contributions to\naccurately predicting stroke treatment effects.\n","authors":["Danqing Ma","Meng Wang","Ao Xiang","Zongqing Qi","Qin Yang"],"pdf_url":"https://arxiv.org/pdf/2404.12634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09401v2","updated":"2024-04-19T05:26:28Z","published":"2024-04-15T01:27:07Z","title":"Watermark-embedded Adversarial Examples for Copyright Protection against\n Diffusion Models","summary":" Diffusion Models (DMs) have shown remarkable capabilities in various\nimage-generation tasks. However, there are growing concerns that DMs could be\nused to imitate unauthorized creations and thus raise copyright issues. To\naddress this issue, we propose a novel framework that embeds personal\nwatermarks in the generation of adversarial examples. Such examples can force\nDMs to generate images with visible watermarks and prevent DMs from imitating\nunauthorized images. We construct a generator based on conditional adversarial\nnetworks and design three losses (adversarial loss, GAN loss, and perturbation\nloss) to generate adversarial examples that have subtle perturbation but can\neffectively attack DMs to prevent copyright violations. Training a generator\nfor a personal watermark by our method only requires 5-10 samples within 2-3\nminutes, and once the generator is trained, it can generate adversarial\nexamples with that watermark significantly fast (0.2s per image). We conduct\nextensive experiments in various conditional image-generation scenarios.\nCompared to existing methods that generate images with chaotic textures, our\nmethod adds visible watermarks on the generated images, which is a more\nstraightforward way to indicate copyright violations. We also observe that our\nadversarial examples exhibit good transferability across unknown generative\nmodels. 
Therefore, this work provides a simple yet powerful way to protect\ncopyright from DM-based imitation.\n","authors":["Peifei Zhu","Tsubasa Takahashi","Hirokatsu Kataoka"],"pdf_url":"https://arxiv.org/pdf/2404.09401v2.pdf","comment":"updated references"},{"id":"http://arxiv.org/abs/2404.12630v1","updated":"2024-04-19T05:12:04Z","published":"2024-04-19T05:12:04Z","title":"MindTuner: Cross-Subject Visual Decoding with Visual Fingerprint and\n Semantic Correction","summary":" Decoding natural visual scenes from brain activity has flourished, with\nextensive research in single-subject tasks and, however, less in cross-subject\ntasks. Reconstructing high-quality images in cross-subject tasks is a\nchallenging problem due to profound individual differences between subjects and\nthe scarcity of data annotation. In this work, we proposed MindTuner for\ncross-subject visual decoding, which achieves high-quality and rich-semantic\nreconstructions using only 1 hour of fMRI training data benefiting from the\nphenomena of visual fingerprint in the human visual system and a novel\nfMRI-to-text alignment paradigm. Firstly, we pre-train a multi-subject model\namong 7 subjects and fine-tune it with scarce data on new subjects, where LoRAs\nwith Skip-LoRAs are utilized to learn the visual fingerprint. Then, we take the\nimage modality as the intermediate pivot modality to achieve fMRI-to-text\nalignment, which achieves impressive fMRI-to-text retrieval performance and\ncorrects fMRI-to-image reconstruction with fine-tuned semantics. The results of\nboth qualitative and quantitative analyses demonstrate that MindTuner surpasses\nstate-of-the-art cross-subject visual decoding models on the Natural Scenes\nDataset (NSD), whether using training data of 1 hour or 40 hours.\n","authors":["Zixuan Gong","Qi Zhang","Guangyin Bao","Lei Zhu","Ke Liu","Liang Hu","Duoqian Miao"],"pdf_url":"https://arxiv.org/pdf/2404.12630v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2312.11911v2","updated":"2024-04-19T05:08:13Z","published":"2023-12-19T07:39:45Z","title":"EVI-SAM: Robust, Real-time, Tightly-coupled Event-Visual-Inertial State\n Estimation and 3D Dense Mapping","summary":" Event cameras are bio-inspired, motion-activated sensors that demonstrate\nsubstantial potential in handling challenging situations, such as motion blur\nand high-dynamic range. In this paper, we proposed EVI-SAM to tackle the\nproblem of 6 DoF pose tracking and 3D reconstruction using monocular event\ncamera. A novel event-based hybrid tracking framework is designed to estimate\nthe pose, leveraging the robustness of feature matching and the precision of\ndirect alignment. Specifically, we develop an event-based 2D-2D alignment to\nconstruct the photometric constraint, and tightly integrate it with the\nevent-based reprojection constraint. The mapping module recovers the dense and\ncolorful depth of the scene through the image-guided event-based mapping\nmethod. Subsequently, the appearance, texture, and surface mesh of the 3D scene\ncan be reconstructed by fusing the dense depth map from multiple viewpoints\nusing truncated signed distance function (TSDF) fusion. To the best of our\nknowledge, this is the first non-learning work to realize event-based dense\nmapping. Numerical evaluations are performed on both publicly available and\nself-collected datasets, which qualitatively and quantitatively demonstrate the\nsuperior performance of our method. 
Our EVI-SAM effectively balances accuracy\nand robustness while maintaining computational efficiency, showcasing superior\npose tracking and dense mapping performance in challenging scenarios. Video\nDemo: https://youtu.be/Nn40U4e5Si8.\n","authors":["Weipeng Guan","Peiyu Chen","Huibin Zhao","Yu Wang","Peng Lu"],"pdf_url":"https://arxiv.org/pdf/2312.11911v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12625v1","updated":"2024-04-19T04:51:18Z","published":"2024-04-19T04:51:18Z","title":"SkelFormer: Markerless 3D Pose and Shape Estimation using Skeletal\n Transformers","summary":" We introduce SkelFormer, a novel markerless motion capture pipeline for\nmulti-view human pose and shape estimation. Our method first uses off-the-shelf\n2D keypoint estimators, pre-trained on large-scale in-the-wild data, to obtain\n3D joint positions. Next, we design a regression-based inverse-kinematic\nskeletal transformer that maps the joint positions to pose and shape\nrepresentations from heavily noisy observations. This module integrates prior\nknowledge about pose space and infers the full pose state at runtime.\nSeparating the 3D keypoint detection and inverse-kinematic problems, along with\nthe expressive representations learned by our skeletal transformer, enhances the\ngeneralization of our method to unseen noisy data. We evaluate our method on\nthree public datasets in both in-distribution and out-of-distribution settings,\nand observe strong performance with respect to prior\nworks. Moreover, ablation experiments demonstrate the impact of each of the\nmodules of our architecture. Finally, we study the performance of our method in\ndealing with noise and heavy occlusions and find considerable robustness with\nrespect to other solutions.\n","authors":["Vandad Davoodnia","Saeed Ghorbani","Alexandre Messier","Ali Etemad"],"pdf_url":"https://arxiv.org/pdf/2404.12625v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.12624v1","updated":"2024-04-19T04:49:28Z","published":"2024-04-19T04:49:28Z","title":"Dragtraffic: A Non-Expert Interactive and Point-Based Controllable\n Traffic Scene Generation Framework","summary":" The evaluation and training of autonomous driving systems require diverse and\nscalable corner cases. However, most existing scene generation methods lack\ncontrollability, accuracy, and versatility, resulting in unsatisfactory\ngeneration results. To address this problem, we propose Dragtraffic, a\ngeneralized, point-based, and controllable traffic scene generation framework\nbased on conditional diffusion. Dragtraffic enables non-experts to generate a\nvariety of realistic driving scenarios for different types of traffic agents\nthrough an adaptive mixture expert architecture. We use a regression model to\nprovide a general initial solution and a refinement process based on the\nconditional diffusion model to ensure diversity. User-customized context is\nintroduced through cross-attention to ensure high controllability. 
Experiments\non a real-world driving dataset show that Dragtraffic outperforms existing\nmethods in terms of authenticity, diversity, and freedom.\n","authors":["Sheng Wang","Ge Sun","Fulong Ma","Tianshuai Hu","Yongkang Song","Lei Zhu","Ming Liu"],"pdf_url":"https://arxiv.org/pdf/2404.12624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02352v2","updated":"2024-04-19T04:22:04Z","published":"2024-02-04T05:33:04Z","title":"Region-Based Representations Revisited","summary":" We investigate whether region-based representations are effective for\nrecognition. Regions were once a mainstay in recognition approaches, but pixel\nand patch-based features are now used almost exclusively. We show that recent\nclass-agnostic segmenters like SAM can be effectively combined with strong\nunsupervised representations like DINOv2 and used for a wide variety of tasks,\nincluding semantic segmentation, object-based image retrieval, and multi-image\nanalysis. Once the masks and features are extracted, these representations,\neven with linear decoders, enable competitive performance, making them well\nsuited to applications that require custom queries. The compactness of the\nrepresentation also makes it well-suited to video analysis and other problems\nrequiring inference across many images.\n","authors":["Michal Shlapentokh-Rothman","Ansel Blume","Yao Xiao","Yuqun Wu","Sethuraman T V","Heyi Tao","Jae Yong Lee","Wilfredo Torres","Yu-Xiong Wang","Derek Hoiem"],"pdf_url":"https://arxiv.org/pdf/2402.02352v2.pdf","comment":"CVPR 2024 Camera Ready"},{"id":"http://arxiv.org/abs/2404.12612v1","updated":"2024-04-19T03:51:46Z","published":"2024-04-19T03:51:46Z","title":"SA-Attack: Speed-adaptive stealthy adversarial attack on trajectory\n prediction","summary":" Trajectory prediction is critical for the safe planning and navigation of\nautomated vehicles. The trajectory prediction models based on the neural\nnetworks are vulnerable to adversarial attacks. Previous attack methods have\nachieved high attack success rates but overlook the adaptability to realistic\nscenarios and the concealment of the deceits. To address this problem, we\npropose a speed-adaptive stealthy adversarial attack method named SA-Attack.\nThis method searches the sensitive region of trajectory prediction models and\ngenerates the adversarial trajectories by using the vehicle-following method\nand incorporating information about forthcoming trajectories. Our method has\nthe ability to adapt to different speed scenarios by reconstructing the\ntrajectory from scratch. Fusing future trajectory trends and curvature\nconstraints can guarantee the smoothness of adversarial trajectories, further\nensuring the stealthiness of attacks. The empirical study on the datasets of\nnuScenes and Apolloscape demonstrates the attack performance of our proposed\nmethod. Finally, we also demonstrate the adaptability and stealthiness of\nSA-Attack for different speed scenarios. 
Our code is available at the\nrepository: https://github.com/eclipse-bot/SA-Attack.\n","authors":["Huilin Yin","Jiaxiang Li","Pengju Zhen","Jun Yan"],"pdf_url":"https://arxiv.org/pdf/2404.12612v1.pdf","comment":"This work is published in IEEE IV Symposium"},{"id":"http://arxiv.org/abs/2310.03624v2","updated":"2024-04-19T03:48:13Z","published":"2023-10-05T16:01:29Z","title":"High-Degrees-of-Freedom Dynamic Neural Fields for Robot Self-Modeling\n and Motion Planning","summary":" A robot self-model is a task-agnostic representation of the robot's physical\nmorphology that can be used for motion planning tasks in the absence of a\nclassical geometric kinematic model. In particular, when the latter is hard to\nengineer or the robot's kinematics change unexpectedly, human-free\nself-modeling is a necessary feature of truly autonomous agents. In this work,\nwe leverage neural fields to allow a robot to self-model its kinematics as a\nneural-implicit query model learned only from 2D images annotated with camera\nposes and configurations. This enables significantly greater applicability than\nexisting approaches which have been dependent on depth images or geometry\nknowledge. To this end, alongside a curricular data sampling strategy, we\npropose a new encoder-based neural density field architecture for dynamic\nobject-centric scenes conditioned on high numbers of degrees of freedom (DOFs).\nIn a 7-DOF robot test setup, the learned self-model achieves a Chamfer-L2\ndistance of 2% of the robot's workspace dimension. We demonstrate the\ncapabilities of this model on motion planning tasks as an exemplary downstream\napplication.\n","authors":["Lennart Schulze","Hod Lipson"],"pdf_url":"https://arxiv.org/pdf/2310.03624v2.pdf","comment":"International Conference on Robotics and Automation (ICRA) 2024; ICCV\n 2023 Workshop on Neural Fields for Autonomous Driving and Robotics (oral)"},{"id":"http://arxiv.org/abs/2404.12611v1","updated":"2024-04-19T03:45:12Z","published":"2024-04-19T03:45:12Z","title":"Rethinking Clothes Changing Person ReID: Conflicts, Synthesis, and\n Optimization","summary":" Clothes-changing person re-identification (CC-ReID) aims to retrieve images\nof the same person wearing different outfits. Mainstream researches focus on\ndesigning advanced model structures and strategies to capture identity\ninformation independent of clothing. However, the same-clothes discrimination\nas the standard ReID learning objective in CC-ReID is persistently ignored in\nprevious researches. In this study, we dive into the relationship between\nstandard and clothes-changing~(CC) learning objectives, and bring the inner\nconflicts between these two objectives to the fore. We try to magnify the\nproportion of CC training pairs by supplementing high-fidelity clothes-varying\nsynthesis, produced by our proposed Clothes-Changing Diffusion model. By\nincorporating the synthetic images into CC-ReID model training, we observe a\nsignificant improvement under CC protocol. However, such improvement sacrifices\nthe performance under the standard protocol, caused by the inner conflict\nbetween standard and CC. For conflict mitigation, we decouple these objectives\nand re-formulate CC-ReID learning as a multi-objective optimization (MOO)\nproblem. By effectively regularizing the gradient curvature across multiple\nobjectives and introducing preference restrictions, our MOO solution surpasses\nthe single-task training paradigm. 
Our framework is model-agnostic, and\ndemonstrates superior performance under both CC and standard ReID protocols.\n","authors":["Junjie Li","Guanshuo Wang","Fufu Yu","Yichao Yan","Qiong Jia","Shouhong Ding","Xingdong Sheng","Yunhui Liu","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2404.12611v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00567v2","updated":"2024-04-19T03:17:16Z","published":"2024-03-01T14:44:41Z","title":"Flatten Long-Range Loss Landscapes for Cross-Domain Few-Shot Learning","summary":" Cross-domain few-shot learning (CDFSL) aims to acquire knowledge from limited\ntraining data in the target domain by leveraging prior knowledge transferred\nfrom source domains with abundant training samples. CDFSL faces challenges in\ntransferring knowledge across dissimilar domains and fine-tuning models with\nlimited training data. To address these challenges, we initially extend the\nanalysis of loss landscapes from the parameter space to the representation\nspace, which allows us to simultaneously interpret the transferring and\nfine-tuning difficulties of CDFSL models. We observe that sharp minima in the\nloss landscapes of the representation space result in representations that are\nhard to transfer and fine-tune. Moreover, existing flatness-based methods have\nlimited generalization ability due to their short-range flatness. To enhance\nthe transferability and facilitate fine-tuning, we introduce a simple yet\neffective approach to achieve long-range flattening of the minima in the loss\nlandscape. This approach considers representations that are differently\nnormalized as minima in the loss landscape and flattens the high-loss region in\nthe middle by randomly sampling interpolated representations. We implement this\nmethod as a new normalization layer that replaces the original one in both CNNs\nand ViTs. This layer is simple and lightweight, introducing only a minimal\nnumber of additional parameters. Experimental results on 8 datasets demonstrate\nthat our approach outperforms state-of-the-art methods in terms of average\naccuracy. Moreover, our method achieves performance improvements of up to 9\\%\ncompared to the current best approaches on individual datasets. Our code will\nbe released.\n","authors":["Yixiong Zou","Yicong Liu","Yiman Hu","Yuhua Li","Ruixuan Li"],"pdf_url":"https://arxiv.org/pdf/2403.00567v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12606v1","updated":"2024-04-19T03:16:08Z","published":"2024-04-19T03:16:08Z","title":"ELEV-VISION-SAM: Integrated Vision Language and Foundation Model for\n Automated Estimation of Building Lowest Floor Elevation","summary":" Street view imagery, aided by advancements in image quality and\naccessibility, has emerged as a valuable resource for urban analytics research.\nRecent studies have explored its potential for estimating lowest floor\nelevation (LFE), offering a scalable alternative to traditional on-site\nmeasurements, crucial for assessing properties' flood risk and damage extent.\nWhile existing methods rely on object detection, the introduction of image\nsegmentation has broadened street view images' utility for LFE estimation,\nalthough challenges still remain in segmentation quality and capability to\ndistinguish front doors from other doors. To address these challenges in LFE\nestimation, this study integrates the Segment Anything model, a segmentation\nfoundation model, with vision language models to conduct text-prompt image\nsegmentation on street view images for LFE estimation. 
By evaluating various\nvision language models, integration methods, and text prompts, we identify the\nmost suitable model for street view image analytics and LFE estimation tasks,\nthereby improving the availability of the current LFE estimation model based on\nimage segmentation from 33% to 56% of properties. Remarkably, our proposed\nmethod significantly enhances the availability of LFE estimation to almost all\nproperties in which the front door is visible in the street view image. Also\nthe findings present the first baseline and comparison of various vision models\nof street view image-based LFE estimation. The model and findings not only\ncontribute to advancing street view image segmentation for urban analytics but\nalso provide a novel approach for image segmentation tasks for other civil\nengineering and infrastructure analytics tasks.\n","authors":["Yu-Hsuan Ho","Longxiang Li","Ali Mostafavi"],"pdf_url":"https://arxiv.org/pdf/2404.12606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12602v1","updated":"2024-04-19T03:12:17Z","published":"2024-04-19T03:12:17Z","title":"A visualization method for data domain changes in CNN networks and the\n optimization method for selecting thresholds in classification tasks","summary":" In recent years, Face Anti-Spoofing (FAS) has played a crucial role in\npreserving the security of face recognition technology. With the rise of\ncounterfeit face generation techniques, the challenge posed by digitally edited\nfaces to face anti-spoofing is escalating. Existing FAS technologies primarily\nfocus on intercepting physically forged faces and lack a robust solution for\ncross-domain FAS challenges. Moreover, determining an appropriate threshold to\nachieve optimal deployment results remains an issue for intra-domain FAS. To\naddress these issues, we propose a visualization method that intuitively\nreflects the training outcomes of models by visualizing the prediction results\non datasets. Additionally, we demonstrate that employing data augmentation\ntechniques, such as downsampling and Gaussian blur, can effectively enhance\nperformance on cross-domain tasks. Building upon our data visualization\napproach, we also introduce a methodology for setting threshold values based on\nthe distribution of the training dataset. Ultimately, our methods secured us\nsecond place in both the Unified Physical-Digital Face Attack Detection\ncompetition and the Snapshot Spectral Imaging Face Anti-spoofing contest. The\ntraining code is available at https://github.com/SeaRecluse/CVPRW2024.\n","authors":["Minzhe Huang","Changwei Nie","Weihong Zhong"],"pdf_url":"https://arxiv.org/pdf/2404.12602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12599v1","updated":"2024-04-19T03:06:50Z","published":"2024-04-19T03:06:50Z","title":"QUTE: Quantifying Uncertainty in TinyML models with Early-exit-assisted\n ensembles","summary":" Existing methods for uncertainty quantification incur massive memory and\ncompute overhead, often requiring multiple models/inferences. Hence they are\nimpractical on ultra-low-power KB-sized TinyML devices. To reduce overhead,\nprior works have proposed the use of early-exit networks as ensembles to\nquantify uncertainty in a single forward-pass. However, they still have a\nprohibitive cost for tinyML. To address these challenges, we propose QUTE, a\nnovel resource-efficient early-exit-assisted ensemble architecture optimized\nfor tinyML models. 
QUTE adds additional output blocks at the final exit of the\nbase network and distills the knowledge of early-exits into these blocks to\ncreate a diverse and lightweight ensemble architecture. Our results show that\nQUTE outperforms popular prior works, and improves the quality of uncertainty\nestimates by 6% with 3.1x lower model size on average compared to the most\nrelevant prior work. Furthermore, we demonstrate that QUTE is also effective in\ndetecting co-variate shifted and out-of-distribution inputs, and shows\ncompetitive performance relative to G-ODIN, a state-of-the-art generalized OOD\ndetector.\n","authors":["Nikhil P Ghanathe","Steve Wilton"],"pdf_url":"https://arxiv.org/pdf/2404.12599v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11326v3","updated":"2024-04-19T03:00:21Z","published":"2024-04-17T12:38:58Z","title":"Single-temporal Supervised Remote Change Detection for Domain\n Generalization","summary":" Change detection is widely applied in remote sensing image analysis. Existing\nmethods require training models separately for each dataset, which leads to\npoor domain generalization. Moreover, these methods rely heavily on large\namounts of high-quality pair-labelled data for training, which is expensive and\nimpractical. In this paper, we propose a multimodal contrastive learning\n(ChangeCLIP) based on visual-language pre-training for change detection domain\ngeneralization. Additionally, we propose a dynamic context optimization for\nprompt learning. Meanwhile, to address the data dependency issue of existing\nmethods, we introduce a single-temporal and controllable AI-generated training\nstrategy (SAIN). This allows us to train the model using a large number of\nsingle-temporal images without image pairs in the real world, achieving\nexcellent generalization. Extensive experiments on series of real change\ndetection datasets validate the superiority and strong generalization of\nChangeCLIP, outperforming state-of-the-art change detection methods. Code will\nbe available.\n","authors":["Qiangang Du","Jinlong Peng","Xu Chen","Qingdong He","Liren He","Qiang Nie","Wenbing Zhu","Mingmin Chi","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11326v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11098v3","updated":"2024-04-19T02:55:54Z","published":"2024-04-17T06:32:42Z","title":"LAPTOP-Diff: Layer Pruning and Normalized Distillation for Compressing\n Diffusion Models","summary":" In the era of AIGC, the demand for low-budget or even on-device applications\nof diffusion models emerged. In terms of compressing the Stable Diffusion\nmodels (SDMs), several approaches have been proposed, and most of them\nleveraged the handcrafted layer removal methods to obtain smaller U-Nets, along\nwith knowledge distillation to recover the network performance. However, such a\nhandcrafting manner of layer removal is inefficient and lacks scalability and\ngeneralization, and the feature distillation employed in the retraining phase\nfaces an imbalance issue that a few numerically significant feature loss terms\ndominate over others throughout the retraining process. To this end, we\nproposed the layer pruning and normalized distillation for compressing\ndiffusion models (LAPTOP-Diff). 
We, 1) introduced the layer pruning method to\ncompress SDM's U-Net automatically and proposed an effective one-shot pruning\ncriterion whose one-shot performance is guaranteed by its good additivity\nproperty, surpassing other layer pruning and handcrafted layer removal methods,\n2) proposed the normalized feature distillation for retraining, alleviated the\nimbalance issue. Using the proposed LAPTOP-Diff, we compressed the U-Nets of\nSDXL and SDM-v1.5 for the most advanced performance, achieving a minimal 4.0%\ndecline in PickScore at a pruning ratio of 50% while the comparative methods'\nminimal PickScore decline is 8.2%. We will release our code.\n","authors":["Dingkun Zhang","Sijia Li","Chen Chen","Qingsong Xie","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2404.11098v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12235v2","updated":"2024-04-19T02:42:24Z","published":"2024-04-18T14:51:42Z","title":"Beyond Average: Individualized Visual Scanpath Prediction","summary":" Understanding how attention varies across individuals has significant\nscientific and societal impacts. However, existing visual scanpath models treat\nattention uniformly, neglecting individual differences. To bridge this gap,\nthis paper focuses on individualized scanpath prediction (ISP), a new attention\nmodeling task that aims to accurately predict how different individuals shift\ntheir attention in diverse visual tasks. It proposes an ISP method featuring\nthree novel technical components: (1) an observer encoder to characterize and\nintegrate an observer's unique attention traits, (2) an observer-centric\nfeature integration approach that holistically combines visual features, task\nguidance, and observer-specific characteristics, and (3) an adaptive fixation\nprioritization mechanism that refines scanpath predictions by dynamically\nprioritizing semantic feature maps based on individual observers' attention\ntraits. These novel components allow scanpath models to effectively address the\nattention variations across different observers. Our method is generally\napplicable to different datasets, model architectures, and visual tasks,\noffering a comprehensive tool for transforming general scanpath models into\nindividualized ones. Comprehensive evaluations using value-based and\nranking-based metrics verify the method's effectiveness and generalizability.\n","authors":["Xianyu Chen","Ming Jiang","Qi Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.12235v2.pdf","comment":"To appear in CVPR2024"},{"id":"http://arxiv.org/abs/2404.12588v1","updated":"2024-04-19T02:33:23Z","published":"2024-04-19T02:33:23Z","title":"Cross-Modal Adapter: Parameter-Efficient Transfer Learning Approach for\n Vision-Language Models","summary":" Adapter-based parameter-efficient transfer learning has achieved exciting\nresults in vision-language models. Traditional adapter methods often require\ntraining or fine-tuning, facing challenges such as insufficient samples or\nresource limitations. While some methods overcome the need for training by\nleveraging image modality cache and retrieval, they overlook the text\nmodality's importance and cross-modal cues for the efficient adaptation of\nparameters in visual-language models. This work introduces a cross-modal\nparameter-efficient approach named XMAdapter. XMAdapter establishes cache\nmodels for both text and image modalities. It then leverages retrieval through\nvisual-language bimodal information to gather clues for inference. 
By\ndynamically adjusting the affinity ratio, it achieves cross-modal fusion,\ndecoupling different modal similarities to assess their respective\ncontributions. Additionally, it explores hard samples based on differences in\ncross-modal affinity and enhances model performance through adaptive adjustment\nof sample learning intensity. Extensive experimental results on benchmark\ndatasets demonstrate that XMAdapter outperforms previous adapter-based methods\nsignificantly regarding accuracy, generalization, and efficiency.\n","authors":["Juncheng Yang","Zuchao Li","Shuai Xie","Weiping Zhu","Wei Yu","Shijun Li"],"pdf_url":"https://arxiv.org/pdf/2404.12588v1.pdf","comment":"This paper is accepted to ICME 2024"},{"id":"http://arxiv.org/abs/2404.09778v2","updated":"2024-04-19T02:19:19Z","published":"2024-04-15T13:30:34Z","title":"The Devil is in the Few Shots: Iterative Visual Knowledge Completion for\n Few-shot Learning","summary":" Contrastive Language-Image Pre-training (CLIP) has shown powerful zero-shot\nlearning performance. Few-shot learning aims to further enhance the transfer\ncapability of CLIP by giving few images in each class, aka 'few shots'. Most\nexisting methods either implicitly learn from the few shots by incorporating\nlearnable prompts or adapters, or explicitly embed them in a cache model for\ninference. However, the narrow distribution of few shots often contains\nincomplete class information, leading to biased visual knowledge with high risk\nof misclassification. To tackle this problem, recent methods propose to\nsupplement visual knowledge by generative models or extra databases, which can\nbe costly and time-consuming. In this paper, we propose an Iterative Visual\nKnowledge CompLetion (KCL) method to complement visual knowledge by properly\ntaking advantages of unlabeled samples without access to any auxiliary or\nsynthetic data. Specifically, KCL first measures the similarities between\nunlabeled samples and each category. Then, the samples with top confidence to\neach category is selected and collected by a designed confidence criterion.\nFinally, the collected samples are treated as labeled ones and added to few\nshots to jointly re-estimate the remaining unlabeled ones. The above procedures\nwill be repeated for a certain number of iterations with more and more samples\nbeing collected until convergence, ensuring a progressive and robust knowledge\ncompletion process. Extensive experiments on 11 benchmark datasets demonstrate\nthe effectiveness and efficiency of KCL as a plug-and-play module under both\nfew-shot and zero-shot learning settings. Code is available at\nhttps://github.com/Mark-Sky/KCL.\n","authors":["Yaohui Li","Qifeng Zhou","Haoxing Chen","Jianbing Zhang","Xinyu Dai","Hao Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.09778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16108v4","updated":"2024-04-19T02:05:02Z","published":"2023-09-28T02:20:59Z","title":"Channel Vision Transformers: An Image Is Worth 1 x 16 x 16 Words","summary":" Vision Transformer (ViT) has emerged as a powerful architecture in the realm\nof modern computer vision. However, its application in certain imaging fields,\nsuch as microscopy and satellite imaging, presents unique challenges. In these\ndomains, images often contain multiple channels, each carrying semantically\ndistinct and independent information. Furthermore, the model must demonstrate\nrobustness to sparsity in input channels, as they may not be densely available\nduring training or testing. 
In this paper, we propose a modification to the ViT\narchitecture that enhances reasoning across the input channels and introduce\nHierarchical Channel Sampling (HCS) as an additional regularization technique\nto ensure robustness when only partial channels are presented during test time.\nOur proposed model, ChannelViT, constructs patch tokens independently from each\ninput channel and utilizes a learnable channel embedding that is added to the\npatch tokens, similar to positional embeddings. We evaluate the performance of\nChannelViT on ImageNet, JUMP-CP (microscopy cell imaging), and So2Sat\n(satellite imaging). Our results show that ChannelViT outperforms ViT on\nclassification tasks and generalizes well, even when a subset of input channels\nis used during testing. Across our experiments, HCS proves to be a powerful\nregularizer, independent of the architecture employed, suggesting itself as a\nstraightforward technique for robust ViT training. Lastly, we find that\nChannelViT generalizes effectively even when there is limited access to all\nchannels during training, highlighting its potential for multi-channel imaging\nunder real-world conditions with sparse sensors. Our code is available at\nhttps://github.com/insitro/ChannelViT.\n","authors":["Yujia Bao","Srinivasan Sivanandan","Theofanis Karaletsos"],"pdf_url":"https://arxiv.org/pdf/2309.16108v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11843v2","updated":"2024-04-19T01:45:02Z","published":"2024-04-18T01:46:31Z","title":"Computer-Aided Diagnosis of Thoracic Diseases in Chest X-rays using\n hybrid CNN-Transformer Architecture","summary":" Medical imaging has been used for diagnosis of various conditions, making it\none of the most powerful resources for effective patient care. Due to\nwidespread availability, low cost, and low radiation, chest X-ray is one of the\nmost sought after radiology examination for the diagnosis of various thoracic\ndiseases. Due to advancements in medical imaging technologies and increasing\npatient load, current radiology workflow faces various challenges including\nincreasing backlogs, working long hours, and increase in diagnostic errors. An\nautomated computer-aided diagnosis system that can interpret chest X-rays to\naugment radiologists by providing actionable insights has potential to provide\nsecond opinion to radiologists, highlight relevant regions in the image, in\nturn expediting clinical workflow, reducing diagnostic errors, and improving\npatient care. In this study, we applied a novel architecture augmenting the\nDenseNet121 Convolutional Neural Network (CNN) with multi-head self-attention\nmechanism using transformer, namely SA-DenseNet121, that can identify multiple\nthoracic diseases in chest X-rays. We conducted experiments on four of the\nlargest chest X-ray datasets, namely, ChestX-ray14, CheXpert, MIMIC-CXR-JPG,\nand IU-CXR. Experimental results in terms of area under the receiver operating\ncharacteristics (AUC-ROC) shows that augmenting CNN with self-attention has\npotential in diagnosing different thoracic diseases from chest X-rays. The\nproposed methodology has the potential to support the reading workflow, improve\nefficiency, and reduce diagnostic errors.\n","authors":["Sonit Singh"],"pdf_url":"https://arxiv.org/pdf/2404.11843v2.pdf","comment":"24 pages, 13 Figures, 13 Tables. This article heavily draws from\n arXiv:1904.09925 where authors originally proposed attention-augmented\n convolutional network. 
arXiv admin note: text overlap with arXiv:1904.09925\n by other authors"},{"id":"http://arxiv.org/abs/2404.09515v2","updated":"2024-04-19T01:43:56Z","published":"2024-04-15T07:20:09Z","title":"Revealing the structure-property relationships of copper alloys with\n FAGC","summary":" Understanding how the structure of materials affects their properties is a\ncornerstone of materials science and engineering. However, traditional methods\nhave struggled to accurately describe the quantitative structure-property\nrelationships for complex structures. In our study, we bridge this gap by\nleveraging machine learning to analyze images of materials' microstructures,\nthus offering a novel way to understand and predict the properties of materials\nbased on their microstructures. We introduce a method known as FAGC (Feature\nAugmentation on Geodesic Curves), specifically demonstrated for Cu-Cr-Zr\nalloys. This approach utilizes machine learning to examine the shapes within\nimages of the alloys' microstructures and predict their mechanical and\nelectronic properties. This generative FAGC approach can effectively expand the\nrelatively small training datasets due to the limited availability of materials\nimages labeled with quantitative properties. The process begins with extracting\nfeatures from the images using neural networks. These features are then mapped\nonto the Pre-shape space to construct the Geodesic curves. Along these curves,\nnew features are generated, effectively increasing the dataset. Moreover, we\ndesign a pseudo-labeling mechanism for these newly generated features to\nfurther enhance the training dataset. Our FAGC method has shown remarkable\nresults, significantly improving the accuracy of predicting the electronic\nconductivity and hardness of Cu-Cr-Zr alloys, with R-squared values of 0.978\nand 0.998, respectively. These outcomes underscore the potential of FAGC to\naddress the challenge of limited image data in materials science, providing a\npowerful tool for establishing detailed and quantitative relationships between\ncomplex microstructures and material properties.\n","authors":["Yuexing Han","Guanxin Wan","Tao Han","Bing Wang","Yi Liu"],"pdf_url":"https://arxiv.org/pdf/2404.09515v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10718v2","updated":"2024-04-19T01:19:25Z","published":"2024-04-16T16:51:27Z","title":"GazeHTA: End-to-end Gaze Target Detection with Head-Target Association","summary":" We propose an end-to-end approach for gaze target detection: predicting a\nhead-target connection between individuals and the target image regions they\nare looking at. Most of the existing methods use independent components such as\noff-the-shelf head detectors or have problems in establishing associations\nbetween heads and gaze targets. In contrast, we investigate an end-to-end\nmulti-person Gaze target detection framework with Heads and Targets Association\n(GazeHTA), which predicts multiple head-target instances based solely on input\nscene image. GazeHTA addresses challenges in gaze target detection by (1)\nleveraging a pre-trained diffusion model to extract scene features for rich\nsemantic understanding, (2) re-injecting a head feature to enhance the head\npriors for improved head understanding, and (3) learning a connection map as\nthe explicit visual associations between heads and gaze targets. 
Our extensive\nexperimental results demonstrate that GazeHTA outperforms state-of-the-art gaze\ntarget detection methods and two adapted diffusion-based baselines on two\nstandard datasets.\n","authors":["Zhi-Yi Lin","Jouh Yeong Chew","Jan van Gemert","Xucong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.10718v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.11593v2","updated":"2024-04-19T00:35:35Z","published":"2022-03-22T10:21:11Z","title":"Unified Negative Pair Generation toward Well-discriminative Feature\n Space for Face Recognition","summary":" The goal of face recognition (FR) can be viewed as a pair similarity\noptimization problem, maximizing a similarity set $\\mathcal{S}^p$ over positive\npairs, while minimizing similarity set $\\mathcal{S}^n$ over negative pairs.\nIdeally, it is expected that FR models form a well-discriminative feature space\n(WDFS) that satisfies $\\inf{\\mathcal{S}^p} > \\sup{\\mathcal{S}^n}$. With regard\nto WDFS, the existing deep feature learning paradigms (i.e., metric and\nclassification losses) can be expressed as a unified perspective on different\npair generation (PG) strategies. Unfortunately, in the metric loss (ML), it is\ninfeasible to generate negative pairs taking all classes into account in each\niteration because of the limited mini-batch size. In contrast, in\nclassification loss (CL), it is difficult to generate extremely hard negative\npairs owing to the convergence of the class weight vectors to their center.\nThis leads to a mismatch between the two similarity distributions of the\nsampled pairs and all negative pairs. Thus, this paper proposes a unified\nnegative pair generation (UNPG) by combining two PG strategies (i.e., MLPG and\nCLPG) from a unified perspective to alleviate the mismatch. UNPG introduces\nuseful information about negative pairs using MLPG to overcome the CLPG\ndeficiency. Moreover, it includes filtering the similarities of noisy negative\npairs to guarantee reliable convergence and improved performance. Exhaustive\nexperiments show the superiority of UNPG by achieving state-of-the-art\nperformance across recent loss functions on public benchmark datasets. Our code\nand pretrained models are publicly available.\n","authors":["Junuk Jung","Seonhoon Lee","Heung-Seon Oh","Yongjun Park","Joochan Park","Sungbin Son"],"pdf_url":"https://arxiv.org/pdf/2203.11593v2.pdf","comment":"9 pages, 6 figures, Published at BMVC22"},{"id":"http://arxiv.org/abs/2310.10404v7","updated":"2024-04-19T00:00:45Z","published":"2023-10-16T13:49:46Z","title":"LLM4SGG: Large Language Models for Weakly Supervised Scene Graph\n Generation","summary":" Weakly-Supervised Scene Graph Generation (WSSGG) research has recently\nemerged as an alternative to the fully-supervised approach that heavily relies\non costly annotations. In this regard, studies on WSSGG have utilized image\ncaptions to obtain unlocalized triplets while primarily focusing on grounding\nthe unlocalized triplets over image regions. 
However, they have overlooked the\ntwo issues involved in the triplet formation process from the captions: 1)\nSemantic over-simplification issue arises when extracting triplets from\ncaptions, where fine-grained predicates in captions are undesirably converted\ninto coarse-grained predicates, resulting in a long-tailed predicate\ndistribution, and 2) Low-density scene graph issue arises when aligning the\ntriplets in the caption with entity/predicate classes of interest, where many\ntriplets are discarded and not used in training, leading to insufficient\nsupervision. To tackle the two issues, we propose a new approach, i.e., Large\nLanguage Model for weakly-supervised SGG (LLM4SGG), where we mitigate the two\nissues by leveraging the LLM's in-depth understanding of language and reasoning\nability during the extraction of triplets from captions and alignment of\nentity/predicate classes with target data. To further engage the LLM in these\nprocesses, we adopt the idea of Chain-of-Thought and the in-context few-shot\nlearning strategy. To validate the effectiveness of LLM4SGG, we conduct\nextensive experiments on Visual Genome and GQA datasets, showing significant\nimprovements in both Recall@K and mean Recall@K compared to the\nstate-of-the-art WSSGG methods. A further appeal is that LLM4SGG is\ndata-efficient, enabling effective model training with a small amount of\ntraining images.\n","authors":["Kibum Kim","Kanghoon Yoon","Jaehyeong Jeon","Yeonjun In","Jinyoung Moon","Donghyun Kim","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2310.10404v7.pdf","comment":"8 pages; CVPR 2024"},{"id":"http://arxiv.org/abs/2312.07509v2","updated":"2024-04-19T22:38:48Z","published":"2023-12-12T18:43:05Z","title":"PEEKABOO: Interactive Video Generation via Masked-Diffusion","summary":" Modern video generation models like Sora have achieved remarkable success in\nproducing high-quality videos. However, a significant limitation is their\ninability to offer interactive control to users, a feature that promises to\nopen up unprecedented applications and creativity. In this work, we introduce\nthe first solution to equip diffusion-based video generation models with\nspatio-temporal control. We present Peekaboo, a novel masked attention module,\nwhich seamlessly integrates with current video generation models offering\ncontrol without the need for additional training or inference overhead. To\nfacilitate future research, we also introduce a comprehensive benchmark for\ninteractive video generation. This benchmark offers a standardized framework\nfor the community to assess the efficacy of emerging interactive video\ngeneration models. Our extensive qualitative and quantitative assessments\nreveal that Peekaboo achieves up to a 3.8x improvement in mIoU over baseline\nmodels, all while maintaining the same latency. Code and benchmark are\navailable on the webpage.\n","authors":["Yash Jain","Anshul Nasery","Vibhav Vineet","Harkirat Behl"],"pdf_url":"https://arxiv.org/pdf/2312.07509v2.pdf","comment":"Project webpage - https://jinga-lala.github.io/projects/Peekaboo/"},{"id":"http://arxiv.org/abs/2404.13194v1","updated":"2024-04-19T21:54:20Z","published":"2024-04-19T21:54:20Z","title":"Privacy-Preserving Debiasing using Data Augmentation and Machine\n Unlearning","summary":" Data augmentation is widely used to mitigate data bias in the training\ndataset. However, data augmentation exposes machine learning models to privacy\nattacks, such as membership inference attacks. 
In this paper, we propose an\neffective combination of data augmentation and machine unlearning, which can\nreduce data bias while providing a provable defense against known attacks.\nSpecifically, we maintain the fairness of the trained model with\ndiffusion-based data augmentation, and then utilize multi-shard unlearning to\nremove identifying information of original data from the ML model for\nprotection against privacy attacks. Experimental evaluation across diverse\ndatasets demonstrates that our approach can achieve significant improvements in\nbias reduction as well as robustness against state-of-the-art privacy attacks.\n","authors":["Zhixin Pan","Emma Andrews","Laura Chang","Prabhat Mishra"],"pdf_url":"https://arxiv.org/pdf/2404.13194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13185v1","updated":"2024-04-19T21:21:36Z","published":"2024-04-19T21:21:36Z","title":"Unlocking Robust Segmentation Across All Age Groups via Continual\n Learning","summary":" Most deep learning models in medical imaging are trained on adult data with\nunclear performance on pediatric images. In this work, we aim to address this\nchallenge in the context of automated anatomy segmentation in whole-body\nComputed Tomography (CT). We evaluate the performance of CT organ segmentation\nalgorithms trained on adult data when applied to pediatric CT volumes and\nidentify substantial age-dependent underperformance. We subsequently propose\nand evaluate strategies, including data augmentation and continual learning\napproaches, to achieve good segmentation accuracy across all age groups. Our\nbest-performing model, trained using continual learning, achieves high\nsegmentation accuracy on both adult and pediatric data (Dice scores of 0.90 and\n0.84 respectively).\n","authors":["Chih-Ying Liu","Jeya Maria Jose Valanarasu","Camila Gonzalez","Curtis Langlotz","Andrew Ng","Sergios Gatidis"],"pdf_url":"https://arxiv.org/pdf/2404.13185v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08876v6","updated":"2024-04-19T21:13:41Z","published":"2024-01-16T23:19:30Z","title":"Evaluating the Utility of Conformal Prediction Sets for AI-Advised Image\n Labeling","summary":" As deep neural networks are more commonly deployed in high-stakes domains,\ntheir black-box nature makes uncertainty quantification challenging. We\ninvestigate the effects of presenting conformal prediction sets--a\ndistribution-free class of methods for generating prediction sets with\nspecified coverage--to express uncertainty in AI-advised decision-making.\nThrough a large online experiment, we compare the utility of conformal\nprediction sets to displays of Top-1 and Top-k predictions for AI-advised image\nlabeling. In a pre-registered analysis, we find that the utility of prediction\nsets for accuracy varies with the difficulty of the task: while they result in\naccuracy on par with or less than Top-1 and Top-k displays for easy images,\nprediction sets excel at assisting humans in labeling out-of-distribution (OOD)\nimages, especially when the set size is small. Our results empirically pinpoint\npractical challenges of conformal prediction sets and provide implications on\nhow to incorporate them for real-world decision-making.\n","authors":["Dongping Zhang","Angelos Chatzimparmpas","Negar Kamali","Jessica Hullman"],"pdf_url":"https://arxiv.org/pdf/2401.08876v6.pdf","comment":"19 pages, 11 figures, 10 tables. 
Accepted by ACM CHI 2024"},{"id":"http://arxiv.org/abs/2404.10540v2","updated":"2024-04-19T20:15:45Z","published":"2024-04-12T20:40:12Z","title":"SEVD: Synthetic Event-based Vision Dataset for Ego and Fixed Traffic\n Perception","summary":" Recently, event-based vision sensors have gained attention for autonomous\ndriving applications, as conventional RGB cameras face limitations in handling\nchallenging dynamic conditions. However, the availability of real-world and\nsynthetic event-based vision datasets remains limited. In response to this gap,\nwe present SEVD, a first-of-its-kind multi-view ego, and fixed perception\nsynthetic event-based dataset using multiple dynamic vision sensors within the\nCARLA simulator. Data sequences are recorded across diverse lighting (noon,\nnighttime, twilight) and weather conditions (clear, cloudy, wet, rainy, foggy)\nwith domain shifts (discrete and continuous). SEVD spans urban, suburban,\nrural, and highway scenes featuring various classes of objects (car, truck,\nvan, bicycle, motorcycle, and pedestrian). Alongside event data, SEVD includes\nRGB imagery, depth maps, optical flow, semantic, and instance segmentation,\nfacilitating a comprehensive understanding of the scene. Furthermore, we\nevaluate the dataset using state-of-the-art event-based (RED, RVT) and\nframe-based (YOLOv8) methods for traffic participant detection tasks and\nprovide baseline benchmarks for assessment. Additionally, we conduct\nexperiments to assess the synthetic event-based dataset's generalization\ncapabilities. The dataset is available at\nhttps://eventbasedvision.github.io/SEVD\n","authors":["Manideep Reddy Aliminati","Bharatesh Chakravarthi","Aayush Atul Verma","Arpitsinh Vaghela","Hua Wei","Xuesong Zhou","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2404.10540v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13159v1","updated":"2024-04-19T19:55:15Z","published":"2024-04-19T19:55:15Z","title":"Equivariant Imaging for Self-supervised Hyperspectral Image Inpainting","summary":" Hyperspectral imaging (HSI) is a key technology for earth observation,\nsurveillance, medical imaging and diagnostics, astronomy and space exploration.\nThe conventional technology for HSI in remote sensing applications is based on\nthe push-broom scanning approach in which the camera records the spectral image\nof a stripe of the scene at a time, while the image is generated by the\naggregation of measurements through time. In real-world airborne and spaceborne\nHSI instruments, some empty stripes would appear at certain locations, because\nplatforms do not always maintain a constant programmed attitude, or have access\nto accurate digital elevation maps (DEM), and the travelling track is not\nnecessarily aligned with the hyperspectral cameras at all times. This makes the\nenhancement of the acquired HS images from incomplete or corrupted observations\nan essential task. We introduce a novel HSI inpainting algorithm here, called\nHyperspectral Equivariant Imaging (Hyper-EI). Hyper-EI is a self-supervised\nlearning-based method which does not require training on extensive datasets or\naccess to a pre-trained model. 
Experimental results show that the proposed\nmethod achieves state-of-the-art inpainting performance compared to the\nexisting methods.\n","authors":["Shuo Li","Mike Davies","Mehrdad Yaghoobi"],"pdf_url":"https://arxiv.org/pdf/2404.13159v1.pdf","comment":"5 Pages, 4 Figures, 2 Tables"},{"id":"http://arxiv.org/abs/2404.13153v1","updated":"2024-04-19T19:44:24Z","published":"2024-04-19T19:44:24Z","title":"Motion-adaptive Separable Collaborative Filters for Blind Motion\n Deblurring","summary":" Eliminating image blur produced by various kinds of motion has been a\nchallenging problem. Dominant approaches rely heavily on model capacity to\nremove blurring by reconstructing residual from blurry observation in feature\nspace. These practices not only prevent the capture of spatially variable\nmotion in the real world but also ignore the tailored handling of various\nmotions in image space. In this paper, we propose a novel real-world deblurring\nfiltering model called the Motion-adaptive Separable Collaborative (MISC)\nFilter. In particular, we use a motion estimation network to capture motion\ninformation from neighborhoods, thereby adaptively estimating spatially-variant\nmotion flow, mask, kernels, weights, and offsets to obtain the MISC Filter. The\nMISC Filter first aligns the motion-induced blurring patterns to the motion\nmiddle along the predicted flow direction, and then collaboratively filters the\naligned image through the predicted kernels, weights, and offsets to generate\nthe output. This design can handle more generalized and complex motion in a\nspatially differentiated manner. Furthermore, we analyze the relationships\nbetween the motion estimation network and the residual reconstruction network.\nExtensive experiments on four widely used benchmarks demonstrate that our\nmethod provides an effective solution for real-world motion blur removal and\nachieves state-of-the-art performance. Code is available at\nhttps://github.com/ChengxuLiu/MISCFilter\n","authors":["Chengxu Liu","Xuan Wang","Xiangyu Xu","Ruhao Tian","Shuai Li","Xueming Qian","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.13153v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.13148v1","updated":"2024-04-19T19:25:26Z","published":"2024-04-19T19:25:26Z","title":"BACS: Background Aware Continual Semantic Segmentation","summary":" Semantic segmentation plays a crucial role in enabling comprehensive scene\nunderstanding for robotic systems. However, generating annotations is\nchallenging, requiring labels for every pixel in an image. In scenarios like\nautonomous driving, there's a need to progressively incorporate new classes as\nthe operating environment of the deployed agent becomes more complex. For\nenhanced annotation efficiency, ideally, only pixels belonging to new classes\nwould be annotated. This approach is known as Continual Semantic Segmentation\n(CSS). Besides the common problem of classical catastrophic forgetting in the\ncontinual learning setting, CSS suffers from the inherent ambiguity of the\nbackground, a phenomenon we refer to as the \"background shift'', since pixels\nlabeled as background could correspond to future classes (forward background\nshift) or previous classes (backward background shift). As a result, continual\nlearning approaches tend to fail. 
This paper proposes a Backward Background\nShift Detector (BACS) to detect previously observed classes based on their\ndistance in the latent space from the foreground centroids of previous steps.\nMoreover, we propose a modified version of the cross-entropy loss function,\nincorporating the BACS detector to down-weight background pixels associated\nwith formerly observed classes. To combat catastrophic forgetting, we employ\nmasked feature distillation alongside dark experience replay. Additionally, our\napproach includes a transformer decoder capable of adjusting to new classes\nwithout necessitating an additional classification head. We validate BACS's\nsuperior performance over existing state-of-the-art methods on standard CSS\nbenchmarks.\n","authors":["Mostafa ElAraby","Ali Harakeh","Liam Paull"],"pdf_url":"https://arxiv.org/pdf/2404.13148v1.pdf","comment":"8 pages, 4 figures, CRV 2024"},{"id":"http://arxiv.org/abs/2404.13146v1","updated":"2024-04-19T19:24:20Z","published":"2024-04-19T19:24:20Z","title":"DeepFake-O-Meter v2.0: An Open Platform for DeepFake Detection","summary":" Deepfakes, as AI-generated media, have increasingly threatened media\nintegrity and personal privacy with realistic yet fake digital content. In this\nwork, we introduce an open-source and user-friendly online platform,\nDeepFake-O-Meter v2.0, that integrates state-of-the-art methods for detecting\nDeepfake images, videos, and audio. Built upon DeepFake-O-Meter v1.0, we have\nmade significant upgrades and improvements in platform architecture design,\nincluding user interaction, detector integration, job balancing, and security\nmanagement. The platform aims to offer everyday users a convenient service for\nanalyzing DeepFake media using multiple state-of-the-art detection algorithms.\nIt ensures secure and private delivery of the analysis results. Furthermore, it\nserves as an evaluation and benchmarking platform for researchers in digital\nmedia forensics to compare the performance of multiple algorithms on the same\ninput. We have also conducted detailed usage analysis based on the collected\ndata to gain deeper insights into our platform's statistics. This involves\nanalyzing two-month trends in user activity and evaluating the processing\nefficiency of each detector.\n","authors":["Shuwei Hou","Yan Ju","Chengzhe Sun","Shan Jia","Lipeng Ke","Riky Zhou","Anita Nikolich","Siwei Lyu"],"pdf_url":"https://arxiv.org/pdf/2404.13146v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13651v2","updated":"2024-04-19T19:17:03Z","published":"2023-08-25T19:40:56Z","title":"PCNN: Probable-Class Nearest-Neighbor Explanations Improve Fine-Grained\n Image Classification Accuracy for AIs and Humans","summary":" Nearest neighbors (NN) are traditionally used to compute final decisions,\ne.g., in Support Vector Machines or k-NN classifiers, and to provide users with\nexplanations for the model's decision. In this paper, we show a novel utility\nof nearest neighbors: To improve predictions of a frozen, pretrained classifier\nC. We leverage an image comparator S that (1) compares the input image with NN\nimages from the top-K most probable classes; and (2) uses S's output scores to\nweight the confidence scores of C. 
Our method consistently improves\nfine-grained image classification accuracy on CUB-200, Cars-196, and Dogs-120.\nAlso, a human study finds that showing lay users our probable-class nearest\nneighbors (PCNN) improves their decision accuracy over prior work which only\nshows only the top-1 class examples.\n","authors":["Giang Nguyen","Valerie Chen","Mohammad Reza Taesiri","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.13651v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02899v2","updated":"2024-04-19T18:53:41Z","published":"2024-04-03T17:57:15Z","title":"MatAtlas: Text-driven Consistent Geometry Texturing and Material\n Assignment","summary":" We present MatAtlas, a method for consistent text-guided 3D model texturing.\nFollowing recent progress we leverage a large scale text-to-image generation\nmodel (e.g., Stable Diffusion) as a prior to texture a 3D model. We carefully\ndesign an RGB texturing pipeline that leverages a grid pattern diffusion,\ndriven by depth and edges. By proposing a multi-step texture refinement\nprocess, we significantly improve the quality and 3D consistency of the\ntexturing output. To further address the problem of baked-in lighting, we move\nbeyond RGB colors and pursue assigning parametric materials to the assets.\nGiven the high-quality initial RGB texture, we propose a novel material\nretrieval method capitalized on Large Language Models (LLM), enabling\neditabiliy and relightability. We evaluate our method on a wide variety of\ngeometries and show that our method significantly outperform prior arts. We\nalso analyze the role of each component through a detailed ablation study.\n","authors":["Duygu Ceylan","Valentin Deschaintre","Thibault Groueix","Rosalie Martin","Chun-Hao Huang","Romain Rouffet","Vladimir Kim","Gaëtan Lassagne"],"pdf_url":"https://arxiv.org/pdf/2404.02899v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13134v1","updated":"2024-04-19T18:52:07Z","published":"2024-04-19T18:52:07Z","title":"Deep Learning-based Text-in-Image Watermarking","summary":" In this work, we introduce a novel deep learning-based approach to\ntext-in-image watermarking, a method that embeds and extracts textual\ninformation within images to enhance data security and integrity. Leveraging\nthe capabilities of deep learning, specifically through the use of\nTransformer-based architectures for text processing and Vision Transformers for\nimage feature extraction, our method sets new benchmarks in the domain. The\nproposed method represents the first application of deep learning in\ntext-in-image watermarking that improves adaptivity, allowing the model to\nintelligently adjust to specific image characteristics and emerging threats.\nThrough testing and evaluation, our method has demonstrated superior robustness\ncompared to traditional watermarking techniques, achieving enhanced\nimperceptibility that ensures the watermark remains undetectable across various\nimage contents.\n","authors":["Bishwa Karki","Chun-Hua Tsai","Pei-Chi Huang","Xin Zhong"],"pdf_url":"https://arxiv.org/pdf/2404.13134v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09673v2","updated":"2024-04-19T18:48:01Z","published":"2024-01-18T01:18:59Z","title":"Artwork Protection Against Neural Style Transfer Using Locally Adaptive\n Adversarial Color Attack","summary":" Neural style transfer (NST) generates new images by combining the style of\none image with the content of another. 
However, unauthorized NST can exploit\nartwork, raising concerns about artists' rights and motivating the development\nof proactive protection methods. We propose Locally Adaptive Adversarial Color\nAttack (LAACA), empowering artists to protect their artwork from unauthorized\nstyle transfer by processing before public release. By delving into the\nintricacies of human visual perception and the role of different frequency\ncomponents, our method strategically introduces frequency-adaptive\nperturbations in the image. These perturbations significantly degrade the\ngeneration quality of NST while maintaining an acceptable level of visual\nchange in the original image, ensuring that potential infringers are\ndiscouraged from using the protected artworks, because of its bad NST\ngeneration quality. Additionally, existing metrics often overlook the\nimportance of color fidelity in evaluating color-mattered tasks, such as the\nquality of NST-generated images, which is crucial in the context of artistic\nworks. To comprehensively assess the color-mattered tasks, we propose the\nAdversarial Color Distance Metric (ACDM), designed to quantify the color\ndifference of images pre- and post-manipulations. Experimental results confirm\nthat attacking NST using LAACA results in visually inferior style transfer, and\nthe ACDM can efficiently measure color-mattered tasks. By providing artists\nwith a tool to safeguard their intellectual property, our work relieves the\nsocio-technical challenges posed by the misuse of NST in the art community.\n","authors":["Zhongliang Guo","Junhao Dong","Yifei Qian","Kaixuan Wang","Weiye Li","Ziheng Guo","Yuheng Wang","Yanli Li","Ognjen Arandjelović","Lei Fang"],"pdf_url":"https://arxiv.org/pdf/2401.09673v2.pdf","comment":"9 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.13130v1","updated":"2024-04-19T18:34:52Z","published":"2024-04-19T18:34:52Z","title":"On-board classification of underwater images using hybrid\n classical-quantum CNN based method","summary":" Underwater images taken from autonomous underwater vehicles (AUV's) often\nsuffer from low light, high turbidity, poor contrast, motion-blur and excessive\nlight scattering and hence require image enhancement techniques for object\nrecognition. Machine learning methods are being increasingly used for object\nrecognition under such adverse conditions. These enhanced object recognition\nmethods of images taken from AUV's has potential applications in underwater\npipeline and optical fibre surveillance, ocean bed resource extraction, ocean\nfloor mapping, underwater species exploration, etc. While the classical machine\nlearning methods are very efficient in terms of accuracy, they require large\ndatasets and high computational time for image classification. In the current\nwork, we use quantum-classical hybrid machine learning methods for real-time\nunder-water object recognition on-board an AUV for the first time. We use\nreal-time motion-blurred and low-light images taken from an on-board camera of\nAUV built in-house and apply existing hybrid machine learning methods for\nobject recognition. Our hybrid methods consist of quantum encoding and\nflattening of classical images using quantum circuits and sending them to\nclassical neural networks for image classification. 
The results of hybrid\nmethods carried out using Pennylane based quantum simulators both on GPU and\nusing pre-trained models on an on-board NVIDIA GPU chipset are compared with\nresults from corresponding classical machine learning methods. We observe that\nthe hybrid quantum machine learning methods show an efficiency greater than\n65\\% and reduction in run-time by one-thirds and require 50\\% smaller dataset\nsizes for training the models compared to classical machine learning methods.\nWe hope that our work opens up further possibilities in quantum enhanced\nreal-time computer vision in autonomous vehicles.\n","authors":["Sreeraj Rajan Warrier","D Sri Harshavardhan Reddy","Sriya Bada","Rohith Achampeta","Sebastian Uppapalli","Jayasri Dontabhaktuni"],"pdf_url":"https://arxiv.org/pdf/2404.13130v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14435v1","updated":"2024-04-19T16:40:24Z","published":"2024-04-19T16:40:24Z","title":"FreSeg: Frenet-Frame-based Part Segmentation for 3D Curvilinear\n Structures","summary":" Part segmentation is a crucial task for 3D curvilinear structures like neuron\ndendrites and blood vessels, enabling the analysis of dendritic spines and\naneurysms with scientific and clinical significance. However, their diversely\nwinded morphology poses a generalization challenge to existing deep learning\nmethods, which leads to labor-intensive manual correction. In this work, we\npropose FreSeg, a framework of part segmentation tasks for 3D curvilinear\nstructures. With Frenet-Frame-based point cloud transformation, it enables the\nmodels to learn more generalizable features and have significant performance\nimprovements on tasks involving elongated and curvy geometries. We evaluate\nFreSeg on 2 datasets: 1) DenSpineEM, an in-house dataset for dendritic spine\nsegmentation, and 2) IntrA, a public 3D dataset for intracranial aneurysm\nsegmentation. Further, we will release the DenSpineEM dataset, which includes\nroughly 6,000 spines from 69 dendrites from 3 public electron microscopy (EM)\ndatasets, to foster the development of effective dendritic spine instance\nextraction methods and, consequently, large-scale connectivity analysis to\nbetter understand mammalian brains.\n","authors":["Shixuan Gu","Jason Ken Adhinarta","Mikhail Bessmeltsev","Jiancheng Yang","Jessica Zhang","Daniel Berger","Jeff W. Lichtman","Hanspeter Pfister","Donglai Wei"],"pdf_url":"https://arxiv.org/pdf/2404.14435v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.13108v1","updated":"2024-04-19T16:19:30Z","published":"2024-04-19T16:19:30Z","title":"RegWSI: Whole Slide Image Registration using Combined Deep Feature- and\n Intensity-Based Methods: Winner of the ACROBAT 2023 Challenge","summary":" The automatic registration of differently stained whole slide images (WSIs)\nis crucial for improving diagnosis and prognosis by fusing complementary\ninformation emerging from different visible structures. It is also useful to\nquickly transfer annotations between consecutive or restained slides, thus\nsignificantly reducing the annotation time and associated costs. Nevertheless,\nthe slide preparation is different for each stain and the tissue undergoes\ncomplex and large deformations. Therefore, a robust, efficient, and accurate\nregistration method is highly desired by the scientific community and hospitals\nspecializing in digital pathology. 
We propose a two-step hybrid method\nconsisting of (i) deep learning- and feature-based initial alignment algorithm,\nand (ii) intensity-based nonrigid registration using the instance optimization.\nThe proposed method does not require any fine-tuning to a particular dataset\nand can be used directly for any desired tissue type and stain. The method\nscored 1st place in the ACROBAT 2023 challenge. We evaluated using three open\ndatasets: (i) ANHIR, (ii) ACROBAT, and (iii) HyReCo, and performed several\nablation studies concerning the resolution used for registration and the\ninitial alignment robustness and stability. The method achieves the most\naccurate results for the ACROBAT dataset, the cell-level registration accuracy\nfor the restained slides from the HyReCo dataset, and is among the best methods\nevaluated on the ANHIR dataset. The method does not require any fine-tuning to\na new datasets and can be used out-of-the-box for other types of microscopic\nimages. The method is incorporated into the DeeperHistReg framework, allowing\nothers to directly use it to register, transform, and save the WSIs at any\ndesired pyramid level. The proposed method is a significant contribution to the\nWSI registration, thus advancing the field of digital pathology.\n","authors":["Marek Wodzinski","Niccolò Marini","Manfredo Atzori","Henning Müller"],"pdf_url":"https://arxiv.org/pdf/2404.13108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14434v1","updated":"2024-04-19T15:25:06Z","published":"2024-04-19T15:25:06Z","title":"DeeperHistReg: Robust Whole Slide Images Registration Framework","summary":" DeeperHistReg is a software framework dedicated to registering whole slide\nimages (WSIs) acquired using multiple stains. It allows one to perform the\npreprocessing, initial alignment, and nonrigid registration of WSIs acquired\nusing multiple stains (e.g. hematoxylin \\& eosin, immunochemistry). The\nframework implements several state-of-the-art registration algorithms and\nprovides an interface to operate on arbitrary resolution of the WSIs (up to\n200k x 200k). The framework is extensible and new algorithms can be easily\nintegrated by other researchers. The framework is available both as a PyPI\npackage and as a Docker container.\n","authors":["Marek Wodzinski","Niccolò Marini","Manfredo Atzori","Henning Müller"],"pdf_url":"https://arxiv.org/pdf/2404.14434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13106v1","updated":"2024-04-19T14:43:43Z","published":"2024-04-19T14:43:43Z","title":"Automatic Cranial Defect Reconstruction with Self-Supervised Deep\n Deformable Masked Autoencoders","summary":" Thousands of people suffer from cranial injuries every year. They require\npersonalized implants that need to be designed and manufactured before the\nreconstruction surgery. The manual design is expensive and time-consuming\nleading to searching for algorithms whose goal is to automatize the process.\nThe problem can be formulated as volumetric shape completion and solved by deep\nneural networks dedicated to supervised image segmentation. However, such an\napproach requires annotating the ground-truth defects which is costly and\ntime-consuming. Usually, the process is replaced with synthetic defect\ngeneration. However, even the synthetic ground-truth generation is\ntime-consuming and limits the data heterogeneity, thus the deep models'\ngeneralizability. In our work, we propose an alternative and simple approach to\nuse a self-supervised masked autoencoder to solve the problem. 
This approach by\ndesign increases the heterogeneity of the training set and can be seen as a\nform of data augmentation. We compare the proposed method with several\nstate-of-the-art deep neural networks and show both the quantitative and\nqualitative improvement on the SkullBreak and SkullFix datasets. The proposed\nmethod can be used to efficiently reconstruct the cranial defects in real time.\n","authors":["Marek Wodzinski","Daria Hemmerling","Mateusz Daniol"],"pdf_url":"https://arxiv.org/pdf/2404.13106v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13105v1","updated":"2024-04-19T13:50:30Z","published":"2024-04-19T13:50:30Z","title":"On-Demand Earth System Data Cubes","summary":" Advancements in Earth system science have seen a surge in diverse datasets.\nEarth System Data Cubes (ESDCs) have been introduced to efficiently handle this\ninflux of high-dimensional data. ESDCs offer a structured, intuitive framework\nfor data analysis, organising information within spatio-temporal grids. The\nstructured nature of ESDCs unlocks significant opportunities for Artificial\nIntelligence (AI) applications. By providing well-organised data, ESDCs are\nideally suited for a wide range of sophisticated AI-driven tasks. An automated\nframework for creating AI-focused ESDCs with minimal user input could\nsignificantly accelerate the generation of task-specific training data. Here we\nintroduce cubo, an open-source Python tool designed for easy generation of\nAI-focused ESDCs. Utilising collections in SpatioTemporal Asset Catalogs (STAC)\nthat are stored as Cloud Optimised GeoTIFFs (COGs), cubo efficiently creates\nESDCs, requiring only central coordinates, spatial resolution, edge size, and\ntime range.\n","authors":["David Montero","César Aybar","Chaonan Ji","Guido Kraemer","Maximilian Söchting","Khalil Teber","Miguel D. Mahecha"],"pdf_url":"https://arxiv.org/pdf/2404.13105v1.pdf","comment":"Accepted at IGARSS24"},{"id":"http://arxiv.org/abs/2404.13103v1","updated":"2024-04-19T11:27:56Z","published":"2024-04-19T11:27:56Z","title":"ToNNO: Tomographic Reconstruction of a Neural Network's Output for\n Weakly Supervised Segmentation of 3D Medical Images","summary":" Annotating lots of 3D medical images for training segmentation models is\ntime-consuming. The goal of weakly supervised semantic segmentation is to train\nsegmentation models without using any ground truth segmentation masks. Our work\naddresses the case where only image-level categorical labels, indicating the\npresence or absence of a particular region of interest (such as tumours or\nlesions), are available. Most existing methods rely on class activation mapping\n(CAM). We propose a novel approach, ToNNO, which is based on the Tomographic\nreconstruction of a Neural Network's Output. Our technique extracts stacks of\nslices with different angles from the input 3D volume, feeds these slices to a\n2D encoder, and applies the inverse Radon transform in order to reconstruct a\n3D heatmap of the encoder's predictions. This generic method allows to perform\ndense prediction tasks on 3D volumes using any 2D image encoder. We apply it to\nweakly supervised medical image segmentation by training the 2D encoder to\noutput high values for slices containing the regions of interest. We test it on\nfour large scale medical image datasets and outperform 2D CAM methods. 
We then\nextend ToNNO by combining tomographic reconstruction with CAM methods,\nproposing Averaged CAM and Tomographic CAM, which obtain even better results.\n","authors":["Marius Schmidt-Mengin","Alexis Benichoux","Shibeshih Belachew","Nikos Komodakis","Nikos Paragios"],"pdf_url":"https://arxiv.org/pdf/2404.13103v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.13102v1","updated":"2024-04-19T10:19:18Z","published":"2024-04-19T10:19:18Z","title":"Single-sample image-fusion upsampling of fluorescence lifetime images","summary":" Fluorescence lifetime imaging microscopy (FLIM) provides detailed information\nabout molecular interactions and biological processes. A major bottleneck for\nFLIM is image resolution at high acquisition speeds, due to the engineering and\nsignal-processing limitations of time-resolved imaging technology. Here we\npresent single-sample image-fusion upsampling (SiSIFUS), a data-fusion approach\nto computational FLIM super-resolution that combines measurements from a\nlow-resolution time-resolved detector (that measures photon arrival time) and a\nhigh-resolution camera (that measures intensity only). To solve this otherwise\nill-posed inverse retrieval problem, we introduce statistically informed priors\nthat encode local and global dependencies between the two single-sample\nmeasurements. This bypasses the risk of out-of-distribution hallucination as in\ntraditional data-driven approaches and delivers enhanced images compared for\nexample to standard bilinear interpolation. The general approach laid out by\nSiSIFUS can be applied to other image super-resolution problems where two\ndifferent datasets are available.\n","authors":["Valentin Kapitány","Areeba Fatima","Vytautas Zickus","Jamie Whitelaw","Ewan McGhee","Robert Insall","Laura Machesky","Daniele Faccio"],"pdf_url":"https://arxiv.org/pdf/2404.13102v1.pdf","comment":"18 pages, 11 figures. To be published in Science Advances"},{"id":"http://arxiv.org/abs/2404.13101v1","updated":"2024-04-19T09:52:32Z","published":"2024-04-19T09:52:32Z","title":"DensePANet: An improved generative adversarial network for photoacoustic\n tomography image reconstruction from sparse data","summary":" Image reconstruction is an essential step of every medical imaging method,\nincluding Photoacoustic Tomography (PAT), which is a promising modality of\nimaging, that unites the benefits of both ultrasound and optical imaging\nmethods. Reconstruction of PAT images using conventional methods results in\nrough artifacts, especially when applied directly to sparse PAT data. In recent\nyears, generative adversarial networks (GANs) have shown a powerful performance\nin image generation as well as translation, rendering them a smart choice to be\napplied to reconstruction tasks. In this study, we proposed an end-to-end\nmethod called DensePANet to solve the problem of PAT image reconstruction from\nsparse data. The proposed model employs a novel modification of UNet in its\ngenerator, called FD-UNet++, which considerably improves the reconstruction\nperformance. 
We evaluated the method on various in-vivo and simulated datasets.\nQuantitative and qualitative results show the better performance of our model\nover other prevalent deep learning techniques.\n","authors":["Hesam hakimnejad","Zohreh Azimifar","Narjes Goshtasbi"],"pdf_url":"https://arxiv.org/pdf/2404.13101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13097v1","updated":"2024-04-19T06:52:57Z","published":"2024-04-19T06:52:57Z","title":"DISC: Latent Diffusion Models with Self-Distillation from Separated\n Conditions for Prostate Cancer Grading","summary":" Latent Diffusion Models (LDMs) can generate high-fidelity images from noise,\noffering a promising approach for augmenting histopathology images for training\ncancer grading models. While previous works successfully generated\nhigh-fidelity histopathology images using LDMs, the generation of image tiles\nto improve prostate cancer grading has not yet been explored. Additionally,\nLDMs face challenges in accurately generating admixtures of multiple cancer\ngrades in a tile when conditioned by a tile mask. In this study, we train\nspecific LDMs to generate synthetic tiles that contain multiple Gleason Grades\n(GGs) by leveraging pixel-wise annotations in input tiles. We introduce a novel\nframework named Self-Distillation from Separated Conditions (DISC) that\ngenerates GG patterns guided by GG masks. Finally, we deploy a training\nframework for pixel-level and slide-level prostate cancer grading, where\nsynthetic tiles are effectively utilized to improve the cancer grading\nperformance of existing models. As a result, this work surpasses previous works\nin two domains: 1) our LDMs enhanced with DISC produce more accurate tiles in\nterms of GG patterns, and 2) our training scheme, incorporating synthetic data,\nsignificantly improves the generalization of the baseline model for prostate\ncancer grading, particularly in challenging cases of rare GG5, demonstrating\nthe potential of generative models to enhance cancer grading when data is\nlimited.\n","authors":["Man M. Ho","Elham Ghelichkhan","Yosep Chong","Yufei Zhou","Beatrice Knudsen","Tolga Tasdizen"],"pdf_url":"https://arxiv.org/pdf/2404.13097v1.pdf","comment":"Abstract accepted for ISBI 2024. Extended version to be presented at\n SynData4CV @ CVPR 2024. See more at https://minhmanho.github.io/disc/"}]},"2024-04-22T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.14412v1","updated":"2024-04-22T17:59:57Z","published":"2024-04-22T17:59:57Z","title":"AutoAD III: The Prequel -- Back to the Pixels","summary":" Generating Audio Description (AD) for movies is a challenging task that\nrequires fine-grained visual understanding and an awareness of the characters\nand their names. Currently, visual language models for AD generation are\nlimited by a lack of suitable training data, and also their evaluation is\nhampered by using performance measures not specialized to the AD domain. In\nthis paper, we make three contributions: (i) We propose two approaches for\nconstructing AD datasets with aligned video data, and build training and\nevaluation datasets using these. These datasets will be publicly released; (ii)\nWe develop a Q-former-based architecture which ingests raw video and generates\nAD, using frozen pre-trained visual encoders and large language models; and\n(iii) We provide new evaluation metrics to benchmark AD quality that are\nwell-matched to human performance. 
Taken together, we improve the state of the\nart on AD generation.\n","authors":["Tengda Han","Max Bain","Arsha Nagrani","Gül Varol","Weidi Xie","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2404.14412v1.pdf","comment":"CVPR2024. Project page:\n https://www.robots.ox.ac.uk/~vgg/research/autoad/"},{"id":"http://arxiv.org/abs/2404.14410v1","updated":"2024-04-22T17:59:50Z","published":"2024-04-22T17:59:50Z","title":"Guess The Unseen: Dynamic 3D Scene Reconstruction from Partial 2D\n Glimpses","summary":" In this paper, we present a method to reconstruct the world and multiple\ndynamic humans in 3D from a monocular video input. As a key idea, we represent\nboth the world and multiple humans via the recently emerging 3D Gaussian\nSplatting (3D-GS) representation, enabling to conveniently and efficiently\ncompose and render them together. In particular, we address the scenarios with\nseverely limited and sparse observations in 3D human reconstruction, a common\nchallenge encountered in the real world. To tackle this challenge, we introduce\na novel approach to optimize the 3D-GS representation in a canonical space by\nfusing the sparse cues in the common space, where we leverage a pre-trained 2D\ndiffusion model to synthesize unseen views while keeping the consistency with\nthe observed 2D appearances. We demonstrate our method can reconstruct\nhigh-quality animatable 3D humans in various challenging examples, in the\npresence of occlusion, image crops, few-shot, and extremely sparse\nobservations. After reconstruction, our method is capable of not only rendering\nthe scene in any novel views at arbitrary time instances, but also editing the\n3D scene by removing individual humans or applying different motions for each\nhuman. Through various experiments, we demonstrate the quality and efficiency\nof our methods over alternative existing approaches.\n","authors":["Inhee Lee","Byungjun Kim","Hanbyul Joo"],"pdf_url":"https://arxiv.org/pdf/2404.14410v1.pdf","comment":"The project page is available at https://snuvclab.github.io/gtu/"},{"id":"http://arxiv.org/abs/2404.14409v1","updated":"2024-04-22T17:59:36Z","published":"2024-04-22T17:59:36Z","title":"CrossScore: Towards Multi-View Image Evaluation and Scoring","summary":" We introduce a novel cross-reference image quality assessment method that\neffectively fills the gap in the image assessment landscape, complementing the\narray of established evaluation schemes -- ranging from full-reference metrics\nlike SSIM, no-reference metrics such as NIQE, to general-reference metrics\nincluding FID, and Multi-modal-reference metrics, e.g., CLIPScore. Utilising a\nneural network with the cross-attention mechanism and a unique data collection\npipeline from NVS optimisation, our method enables accurate image quality\nassessment without requiring ground truth references. By comparing a query\nimage against multiple views of the same scene, our method addresses the\nlimitations of existing metrics in novel view synthesis (NVS) and similar tasks\nwhere direct reference images are unavailable. 
Experimental results show that\nour method is closely correlated to the full-reference metric SSIM, while not\nrequiring ground truth references.\n","authors":["Zirui Wang","Wenjing Bian","Omkar Parkhi","Yuheng Ren","Victor Adrian Prisacariu"],"pdf_url":"https://arxiv.org/pdf/2404.14409v1.pdf","comment":"Project page see https://crossscore.active.vision"},{"id":"http://arxiv.org/abs/2404.12379v2","updated":"2024-04-22T17:59:27Z","published":"2024-04-18T17:58:16Z","title":"Dynamic Gaussians Mesh: Consistent Mesh Reconstruction from Monocular\n Videos","summary":" Modern 3D engines and graphics pipelines require mesh as a memory-efficient\nrepresentation, which allows efficient rendering, geometry processing, texture\nediting, and many other downstream operations. However, it is still highly\ndifficult to obtain high-quality mesh in terms of structure and detail from\nmonocular visual observations. The problem becomes even more challenging for\ndynamic scenes and objects. To this end, we introduce Dynamic Gaussians Mesh\n(DG-Mesh), a framework to reconstruct a high-fidelity and time-consistent mesh\ngiven a single monocular video. Our work leverages the recent advancement in 3D\nGaussian Splatting to construct the mesh sequence with temporal consistency\nfrom a video. Building on top of this representation, DG-Mesh recovers\nhigh-quality meshes from the Gaussian points and can track the mesh vertices\nover time, which enables applications such as texture editing on dynamic\nobjects. We introduce the Gaussian-Mesh Anchoring, which encourages evenly\ndistributed Gaussians, resulting in better mesh reconstruction through mesh-guided\ndensification and pruning on the deformed Gaussians. By applying\ncycle-consistent deformation between the canonical and the deformed space, we\ncan project the anchored Gaussian back to the canonical space and optimize\nGaussians across all time frames. During the evaluation on different datasets,\nDG-Mesh provides significantly better mesh reconstruction and rendering than\nbaselines. Project page: https://www.liuisabella.com/DG-Mesh/\n","authors":["Isabella Liu","Hao Su","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12379v2.pdf","comment":"Project page: https://www.liuisabella.com/DG-Mesh/"},{"id":"http://arxiv.org/abs/2404.14406v1","updated":"2024-04-22T17:59:18Z","published":"2024-04-22T17:59:18Z","title":"Hyp-OC: Hyperbolic One Class Classification for Face Anti-Spoofing","summary":" Face recognition technology has become an integral part of modern security\nsystems and user authentication processes. However, these systems are\nvulnerable to spoofing attacks and can easily be circumvented. Most prior\nresearch in face anti-spoofing (FAS) approaches it as a two-class\nclassification task where models are trained on real samples and known spoof\nattacks and tested for detection performance on unknown spoof attacks. However,\nin practice, FAS should be treated as a one-class classification task where,\nwhile training, one cannot assume any knowledge regarding the spoof samples a\npriori. In this paper, we reformulate the face anti-spoofing task from a\none-class perspective and propose a novel hyperbolic one-class classification\nframework. 
To train our network, we use a pseudo-negative class sampled from\nthe Gaussian distribution with a weighted running mean and propose two novel\nloss functions: (1) Hyp-PC: Hyperbolic Pairwise Confusion loss, and (2) Hyp-CE:\nHyperbolic Cross Entropy loss, which operate in the hyperbolic space.\nAdditionally, we employ Euclidean feature clipping and gradient clipping to\nstabilize the training in the hyperbolic space. To the best of our knowledge,\nthis is the first work extending hyperbolic embeddings for face anti-spoofing\nin a one-class manner. With extensive experiments on five benchmark datasets:\nRose-Youtu, MSU-MFSD, CASIA-MFSD, Idiap Replay-Attack, and OULU-NPU, we\ndemonstrate that our method significantly outperforms the state-of-the-art,\nachieving better spoof detection performance.\n","authors":["Kartik Narayan","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2404.14406v1.pdf","comment":"Accepted in FG2024, Project Page -\n https://kartik-3004.github.io/hyp-oc/"},{"id":"http://arxiv.org/abs/2404.14403v1","updated":"2024-04-22T17:58:36Z","published":"2024-04-22T17:58:36Z","title":"GeoDiffuser: Geometry-Based Image Editing with Diffusion Models","summary":" The success of image generative models has enabled us to build methods that\ncan edit images based on text or other user input. However, these methods are\nbespoke, imprecise, require additional information, or are limited to only 2D\nimage edits. We present GeoDiffuser, a zero-shot optimization-based method that\nunifies common 2D and 3D image-based object editing capabilities into a single\nmethod. Our key insight is to view image editing operations as geometric\ntransformations. We show that these transformations can be directly\nincorporated into the attention layers in diffusion models to implicitly\nperform editing operations. Our training-free optimization method uses an\nobjective function that seeks to preserve object style but generate plausible\nimages, for instance with accurate lighting and shadows. It also inpaints\ndisoccluded parts of the image where the object was originally located. Given a\nnatural image and user input, we segment the foreground object using SAM and\nestimate a corresponding transform which is used by our optimization approach\nfor editing. GeoDiffuser can perform common 2D and 3D edits like object\ntranslation, 3D rotation, and removal. We present quantitative results,\nincluding a perceptual study, that shows how our approach is better than\nexisting methods. Visit https://ivl.cs.brown.edu/research/geodiffuser.html for\nmore information.\n","authors":["Rahul Sajnani","Jeroen Vanbaar","Jie Min","Kapil Katyal","Srinath Sridhar"],"pdf_url":"https://arxiv.org/pdf/2404.14403v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14396v1","updated":"2024-04-22T17:56:09Z","published":"2024-04-22T17:56:09Z","title":"SEED-X: Multimodal Models with Unified Multi-granularity Comprehension\n and Generation","summary":" The rapid evolution of multimodal foundation model has demonstrated\nsignificant progresses in vision-language understanding and generation, e.g.,\nour previous work SEED-LLaMA. However, there remains a gap between its\ncapability and the real-world applicability, primarily due to the model's\nlimited capacity to effectively respond to various user instructions and\ninteract with diverse visual data. 
In this work, we focus on bridging this gap\nthrough integrating two enhanced features: (1) comprehending images of\narbitrary sizes and ratios, and (2) enabling multi-granularity image\ngeneration. We present a unified and versatile foundation model, namely,\nSEED-X, which is able to model multi-granularity visual semantics for\ncomprehension and generation tasks. Besides the competitive results on public\nbenchmarks, SEED-X demonstrates its effectiveness in handling real-world\napplications across various domains after instruction tuning. We hope that our\nwork will inspire future research into what can be achieved by versatile\nmultimodal foundation models in real-world applications. The models, codes, and\ndatasets will be released in https://github.com/AILab-CVC/SEED-X.\n","authors":["Yuying Ge","Sijie Zhao","Jinguo Zhu","Yixiao Ge","Kun Yi","Lin Song","Chen Li","Xiaohan Ding","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2404.14396v1.pdf","comment":"Project released at: https://github.com/AILab-CVC/SEED-X"},{"id":"http://arxiv.org/abs/2404.14394v1","updated":"2024-04-22T17:55:11Z","published":"2024-04-22T17:55:11Z","title":"A Multimodal Automated Interpretability Agent","summary":" This paper describes MAIA, a Multimodal Automated Interpretability Agent.\nMAIA is a system that uses neural models to automate neural model understanding\ntasks like feature interpretation and failure mode discovery. It equips a\npre-trained vision-language model with a set of tools that support iterative\nexperimentation on subcomponents of other models to explain their behavior.\nThese include tools commonly used by human interpretability researchers: for\nsynthesizing and editing inputs, computing maximally activating exemplars from\nreal-world datasets, and summarizing and describing experimental results.\nInterpretability experiments proposed by MAIA compose these tools to describe\nand explain system behavior. We evaluate applications of MAIA to computer\nvision models. We first characterize MAIA's ability to describe (neuron-level)\nfeatures in learned representations of images. Across several trained models\nand a novel dataset of synthetic vision neurons with paired ground-truth\ndescriptions, MAIA produces descriptions comparable to those generated by\nexpert human experimenters. We then show that MAIA can aid in two additional\ninterpretability tasks: reducing sensitivity to spurious features, and\nautomatically identifying inputs likely to be mis-classified.\n","authors":["Tamar Rott Shaham","Sarah Schwettmann","Franklin Wang","Achyuta Rajaram","Evan Hernandez","Jacob Andreas","Antonio Torralba"],"pdf_url":"https://arxiv.org/pdf/2404.14394v1.pdf","comment":"25 pages, 13 figures"},{"id":"http://arxiv.org/abs/2402.18673v2","updated":"2024-04-22T17:54:17Z","published":"2024-02-28T19:35:30Z","title":"Trends, Applications, and Challenges in Human Attention Modelling","summary":" Human attention modelling has proven, in recent years, to be particularly\nuseful not only for understanding the cognitive processes underlying visual\nexploration, but also for providing support to artificial intelligence models\nthat aim to solve problems in various domains, including image and video\nprocessing, vision-and-language applications, and language modelling. This\nsurvey offers a reasoned overview of recent efforts to integrate human\nattention mechanisms into contemporary deep learning models and discusses\nfuture research directions and challenges. 
For a comprehensive overview on the\nongoing research refer to our dedicated repository available at\nhttps://github.com/aimagelab/awesome-human-visual-attention.\n","authors":["Giuseppe Cartella","Marcella Cornia","Vittorio Cuculo","Alessandro D'Amelio","Dario Zanca","Giuseppe Boccignone","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2402.18673v2.pdf","comment":"Accepted at IJCAI 2024 Survey Track"},{"id":"http://arxiv.org/abs/2404.10108v2","updated":"2024-04-22T17:53:08Z","published":"2024-04-15T19:43:16Z","title":"GeoAI Reproducibility and Replicability: a computational and spatial\n perspective","summary":" GeoAI has emerged as an exciting interdisciplinary research area that\ncombines spatial theories and data with cutting-edge AI models to address\ngeospatial problems in a novel, data-driven manner. While GeoAI research has\nflourished in the GIScience literature, its reproducibility and replicability\n(R&R), fundamental principles that determine the reusability, reliability, and\nscientific rigor of research findings, have rarely been discussed. This paper\naims to provide an in-depth analysis of this topic from both computational and\nspatial perspectives. We first categorize the major goals for reproducing GeoAI\nresearch, namely, validation (repeatability), learning and adapting the method\nfor solving a similar or new problem (reproducibility), and examining the\ngeneralizability of the research findings (replicability). Each of these goals\nrequires different levels of understanding of GeoAI, as well as different\nmethods to ensure its success. We then discuss the factors that may cause the\nlack of R&R in GeoAI research, with an emphasis on (1) the selection and use of\ntraining data; (2) the uncertainty that resides in the GeoAI model design,\ntraining, deployment, and inference processes; and more importantly (3) the\ninherent spatial heterogeneity of geospatial data and processes. We use a deep\nlearning-based image analysis task as an example to demonstrate the results'\nuncertainty and spatial variance caused by different factors. The findings\nreiterate the importance of knowledge sharing, as well as the generation of a\n\"replicability map\" that incorporates spatial autocorrelation and spatial\nheterogeneity into consideration in quantifying the spatial replicability of\nGeoAI research.\n","authors":["Wenwen Li","Chia-Yu Hsu","Sizhe Wang","Peter Kedron"],"pdf_url":"https://arxiv.org/pdf/2404.10108v2.pdf","comment":"Accepted by Annals of the American Association of Geographers"},{"id":"http://arxiv.org/abs/2404.14388v1","updated":"2024-04-22T17:46:29Z","published":"2024-04-22T17:46:29Z","title":"STROOBnet Optimization via GPU-Accelerated Proximal Recurrence\n Strategies","summary":" Spatiotemporal networks' observational capabilities are crucial for accurate\ndata gathering and informed decisions across multiple sectors. This study\nfocuses on the Spatiotemporal Ranged Observer-Observable Bipartite Network\n(STROOBnet), linking observational nodes (e.g., surveillance cameras) to events\nwithin defined geographical regions, enabling efficient monitoring. Using data\nfrom Real-Time Crime Camera (RTCC) systems and Calls for Service (CFS) in New\nOrleans, where RTCC combats rising crime amidst reduced police presence, we\naddress the network's initial observational imbalances. Aiming for uniform\nobservational efficacy, we propose the Proximal Recurrence approach. 
It\noutperformed traditional clustering methods like k-means and DBSCAN by offering\nholistic event frequency and spatial consideration, enhancing observational\ncoverage.\n","authors":["Ted Edward Holmberg","Mahdi Abdelguerfi","Elias Ioup"],"pdf_url":"https://arxiv.org/pdf/2404.14388v1.pdf","comment":"10 pages, 17 figures, 2023 IEEE International Conference on Big Data\n (BigData)"},{"id":"http://arxiv.org/abs/2404.14381v1","updated":"2024-04-22T17:36:03Z","published":"2024-04-22T17:36:03Z","title":"TAVGBench: Benchmarking Text to Audible-Video Generation","summary":" The Text to Audible-Video Generation (TAVG) task involves generating videos\nwith accompanying audio based on text descriptions. Achieving this requires\nskillful alignment of both audio and video elements. To support research in\nthis field, we have developed a comprehensive Text to Audible-Video Generation\nBenchmark (TAVGBench), which contains over 1.7 million clips with a total\nduration of 11.8 thousand hours. We propose an automatic annotation pipeline to\nensure each audible video has detailed descriptions for both its audio and\nvideo contents. We also introduce the Audio-Visual Harmoni score (AVHScore) to\nprovide a quantitative measure of the alignment between the generated audio and\nvideo modalities. Additionally, we present a baseline model for TAVG called\nTAVDiffusion, which uses a two-stream latent diffusion model to provide a\nfundamental starting point for further research in this area. We achieve the\nalignment of audio and video by employing cross-attention and contrastive\nlearning. Through extensive experiments and evaluations on TAVGBench, we\ndemonstrate the effectiveness of our proposed model under both conventional\nmetrics and our proposed metrics.\n","authors":["Yuxin Mao","Xuyang Shen","Jing Zhang","Zhen Qin","Jinxing Zhou","Mochu Xiang","Yiran Zhong","Yuchao Dai"],"pdf_url":"https://arxiv.org/pdf/2404.14381v1.pdf","comment":"Technical Report. Project\n page:https://github.com/OpenNLPLab/TAVGBench"},{"id":"http://arxiv.org/abs/2404.12547v2","updated":"2024-04-22T17:35:33Z","published":"2024-04-18T23:52:42Z","title":"Does Gaussian Splatting need SFM Initialization?","summary":" 3D Gaussian Splatting has recently been embraced as a versatile and effective\nmethod for scene reconstruction and novel view synthesis, owing to its\nhigh-quality results and compatibility with hardware rasterization. Despite its\nadvantages, Gaussian Splatting's reliance on high-quality point cloud\ninitialization by Structure-from-Motion (SFM) algorithms is a significant\nlimitation to be overcome. To this end, we investigate various initialization\nstrategies for Gaussian Splatting and delve into how volumetric reconstructions\nfrom Neural Radiance Fields (NeRF) can be utilized to bypass the dependency on\nSFM data. 
Our findings demonstrate that random initialization can perform much\nbetter if carefully designed and that by employing a combination of improved\ninitialization strategies and structure distillation from low-cost NeRF models,\nit is possible to achieve equivalent results, or at times even superior, to\nthose obtained from SFM initialization.\n","authors":["Yalda Foroutan","Daniel Rebain","Kwang Moo Yi","Andrea Tagliasacchi"],"pdf_url":"https://arxiv.org/pdf/2404.12547v2.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.05664v2","updated":"2024-04-22T17:28:30Z","published":"2023-12-09T20:06:29Z","title":"CoGS: Controllable Gaussian Splatting","summary":" Capturing and re-animating the 3D structure of articulated objects present\nsignificant barriers. On one hand, methods requiring extensively calibrated\nmulti-view setups are prohibitively complex and resource-intensive, limiting\ntheir practical applicability. On the other hand, while single-camera Neural\nRadiance Fields (NeRFs) offer a more streamlined approach, they have excessive\ntraining and rendering costs. 3D Gaussian Splatting would be a suitable\nalternative but for two reasons. Firstly, existing methods for 3D dynamic\nGaussians require synchronized multi-view cameras, and secondly, the lack of\ncontrollability in dynamic scenarios. We present CoGS, a method for\nControllable Gaussian Splatting, that enables the direct manipulation of scene\nelements, offering real-time control of dynamic scenes without the prerequisite\nof pre-computing control signals. We evaluated CoGS using both synthetic and\nreal-world datasets that include dynamic objects that differ in degree of\ndifficulty. In our evaluations, CoGS consistently outperformed existing dynamic\nand controllable neural representations in terms of visual fidelity.\n","authors":["Heng Yu","Joel Julin","Zoltán Á. Milacski","Koichiro Niinuma","László A. Jeni"],"pdf_url":"https://arxiv.org/pdf/2312.05664v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.14368v1","updated":"2024-04-22T17:20:38Z","published":"2024-04-22T17:20:38Z","title":"Graphic Design with Large Multimodal Model","summary":" In the field of graphic design, automating the integration of design elements\ninto a cohesive multi-layered artwork not only boosts productivity but also\npaves the way for the democratization of graphic design. One existing practice\nis Graphic Layout Generation (GLG), which aims to layout sequential design\nelements. It has been constrained by the necessity for a predefined correct\nsequence of layers, thus limiting creative potential and increasing user\nworkload. In this paper, we present Hierarchical Layout Generation (HLG) as a\nmore flexible and pragmatic setup, which creates graphic composition from\nunordered sets of design elements. To tackle the HLG task, we introduce\nGraphist, the first layout generation model based on large multimodal models.\nGraphist efficiently reframes the HLG as a sequence generation problem,\nutilizing RGB-A images as input, outputs a JSON draft protocol, indicating the\ncoordinates, size, and order of each element. We develop new evaluation metrics\nfor HLG. Graphist outperforms prior arts and establishes a strong baseline for\nthis field. 
Project homepage: https://github.com/graphic-design-ai/graphist\n","authors":["Yutao Cheng","Zhao Zhang","Maoke Yang","Hui Nie","Chunyuan Li","Xinglong Wu","Jie Shao"],"pdf_url":"https://arxiv.org/pdf/2404.14368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01497v2","updated":"2024-04-22T17:13:48Z","published":"2024-03-03T12:17:49Z","title":"Learning A Physical-aware Diffusion Model Based on Transformer for\n Underwater Image Enhancement","summary":" Underwater visuals undergo various complex degradations, inevitably\ninfluencing the efficiency of underwater vision tasks. Recently, diffusion\nmodels were employed to underwater image enhancement (UIE) tasks, and gained\nSOTA performance. However, these methods fail to consider the physical\nproperties and underwater imaging mechanisms in the diffusion process, limiting\ninformation completion capacity of diffusion models. In this paper, we\nintroduce a novel UIE framework, named PA-Diff, designed to exploiting the\nknowledge of physics to guide the diffusion process.\n PA-Diff consists of Physics Prior Generation (PPG) Branch, Implicit Neural\nReconstruction (INR) Branch, and Physics-aware Diffusion Transformer (PDT)\nBranch. Our designed PPG branch aims to produce the prior knowledge of physics.\nWith utilizing the physics prior knowledge to guide the diffusion process, PDT\nbranch can obtain underwater-aware ability and model the complex distribution\nin real-world underwater scenes. INR Branch can learn robust feature\nrepresentations from diverse underwater image via implicit neural\nrepresentation, which reduces the difficulty of restoration for PDT branch.\nExtensive experiments prove that our method achieves best performance on UIE\ntasks.\n","authors":["Chen Zhao","Chenyu Dong","Weiling Cai"],"pdf_url":"https://arxiv.org/pdf/2403.01497v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14351v1","updated":"2024-04-22T17:02:33Z","published":"2024-04-22T17:02:33Z","title":"Scene Coordinate Reconstruction: Posing of Image Collections via\n Incremental Learning of a Relocalizer","summary":" We address the task of estimating camera parameters from a set of images\ndepicting a scene. Popular feature-based structure-from-motion (SfM) tools\nsolve this task by incremental reconstruction: they repeat triangulation of\nsparse 3D points and registration of more camera views to the sparse point\ncloud. We re-interpret incremental structure-from-motion as an iterated\napplication and refinement of a visual relocalizer, that is, of a method that\nregisters new views to the current state of the reconstruction. This\nperspective allows us to investigate alternative visual relocalizers that are\nnot rooted in local feature matching. We show that scene coordinate regression,\na learning-based relocalization approach, allows us to build implicit, neural\nscene representations from unposed images. Different from other learning-based\nreconstruction methods, we do not require pose priors nor sequential inputs,\nand we optimize efficiently over thousands of images. Our method, ACE0 (ACE\nZero), estimates camera poses to an accuracy comparable to feature-based SfM,\nas demonstrated by novel view synthesis. 
Project page:\nhttps://nianticlabs.github.io/acezero/\n","authors":["Eric Brachmann","Jamie Wynn","Shuai Chen","Tommaso Cavallari","Áron Monszpart","Daniyar Turmukhambetov","Victor Adrian Prisacariu"],"pdf_url":"https://arxiv.org/pdf/2404.14351v1.pdf","comment":"Project page: https://nianticlabs.github.io/acezero/"},{"id":"http://arxiv.org/abs/2404.14349v1","updated":"2024-04-22T17:00:57Z","published":"2024-04-22T17:00:57Z","title":"Automatic Discovery of Visual Circuits","summary":" To date, most discoveries of network subcomponents that implement\nhuman-interpretable computations in deep vision models have involved close\nstudy of single units and large amounts of human labor. We explore scalable\nmethods for extracting the subgraph of a vision model's computational graph\nthat underlies recognition of a specific visual concept. We introduce a new\nmethod for identifying these subgraphs: specifying a visual concept using a few\nexamples, and then tracing the interdependence of neuron activations across\nlayers, or their functional connectivity. We find that our approach extracts\ncircuits that causally affect model output, and that editing these circuits can\ndefend large pretrained models from adversarial attacks.\n","authors":["Achyuta Rajaram","Neil Chowdhury","Antonio Torralba","Jacob Andreas","Sarah Schwettmann"],"pdf_url":"https://arxiv.org/pdf/2404.14349v1.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.14344v1","updated":"2024-04-22T16:59:43Z","published":"2024-04-22T16:59:43Z","title":"On-the-Fly Point Annotation for Fast Medical Video Labeling","summary":" Purpose: In medical research, deep learning models rely on high-quality\nannotated data, a process often laborious and time-consuming. This is\nparticularly true for detection tasks where bounding box annotations are\nrequired. The need to adjust two corners makes the process inherently\nframe-by-frame. Given the scarcity of experts' time, efficient annotation\nmethods suitable for clinicians are needed. Methods: We propose an on-the-fly\nmethod for live video annotation to enhance the annotation efficiency. In this\napproach, a continuous single-point annotation is maintained by keeping the\ncursor on the object in a live video, mitigating the need for tedious pausing\nand repetitive navigation inherent in traditional annotation methods. This\nnovel annotation paradigm inherits the point annotation's ability to generate\npseudo-labels using a point-to-box teacher model. We empirically evaluate this\napproach by developing a dataset and comparing on-the-fly annotation time\nagainst the traditional annotation method. Results: Using our method, annotation\nspeed was 3.2x faster than the traditional annotation technique. We achieved a\nmean improvement of 6.51 +- 0.98 AP@50 over the conventional method at equivalent\nannotation budgets on the developed dataset. Conclusion: Without bells and\nwhistles, our approach offers a significant speed-up in annotation tasks. It\ncan be easily implemented on any annotation platform to accelerate the\nintegration of deep learning in video-based medical research.\n","authors":["Meyer Adrien","Mazellier Jean-Paul","Jeremy Dana","Nicolas Padoy"],"pdf_url":"https://arxiv.org/pdf/2404.14344v1.pdf","comment":"7 pages, 5 figures. 
Int J CARS (2024)"},{"id":"http://arxiv.org/abs/2404.14343v1","updated":"2024-04-22T16:58:37Z","published":"2024-04-22T16:58:37Z","title":"Heterogeneous Face Recognition Using Domain Invariant Units","summary":" Heterogeneous Face Recognition (HFR) aims to expand the applicability of Face\nRecognition (FR) systems to challenging scenarios, enabling the matching of\nface images across different domains, such as matching thermal images to\nvisible spectra. However, the development of HFR systems is challenging because\nof the significant domain gap between modalities and the lack of availability\nof large-scale paired multi-channel data. In this work, we leverage a\npretrained face recognition model as a teacher network to learn domaininvariant\nnetwork layers called Domain-Invariant Units (DIU) to reduce the domain gap.\nThe proposed DIU can be trained effectively even with a limited amount of\npaired training data, in a contrastive distillation framework. This proposed\napproach has the potential to enhance pretrained models, making them more\nadaptable to a wider range of variations in data. We extensively evaluate our\napproach on multiple challenging benchmarks, demonstrating superior performance\ncompared to state-of-the-art methods.\n","authors":["Anjith George","Sebastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2404.14343v1.pdf","comment":"6 pages, Accepted ICASSP 2024"},{"id":"http://arxiv.org/abs/2206.04406v2","updated":"2024-04-22T16:41:38Z","published":"2022-06-09T10:39:44Z","title":"Unsupervised Learning of the Total Variation Flow","summary":" The total variation (TV) flow generates a scale-space representation of an\nimage based on the TV functional. This gradient flow observes desirable\nfeatures for images, such as sharp edges and enables spectral, scale, and\ntexture analysis. Solving the TV flow is challenging; one reason is the the\nnon-uniqueness of the subgradients. The standard numerical approach for TV flow\nrequires solving multiple non-smooth optimisation problems. Even with\nstate-of-the-art convex optimisation techniques, this is often prohibitively\nexpensive and strongly motivates the use of alternative, faster approaches.\nInspired by and extending the framework of physics-informed neural networks\n(PINNs), we propose the TVflowNET, an unsupervised neural network approach, to\napproximate the solution of the TV flow given an initial image and a time\ninstance. The TVflowNET requires no ground truth data but rather makes use of\nthe PDE for optimisation of the network parameters. We circumvent the\nchallenges related to the non-uniqueness of the subgradients by additionally\nlearning the related diffusivity term. Our approach significantly speeds up the\ncomputation time and we show that the TVflowNET approximates the TV flow\nsolution with high fidelity for different image sizes and image types.\nAdditionally, we give a full comparison of different network architecture\ndesigns as well as training regimes to underscore the effectiveness of our\napproach.\n","authors":["Tamara G. 
Grossmann","Sören Dittmer","Yury Korolev","Carola-Bibiane Schönlieb"],"pdf_url":"https://arxiv.org/pdf/2206.04406v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14329v1","updated":"2024-04-22T16:40:11Z","published":"2024-04-22T16:40:11Z","title":"X-Ray: A Sequential 3D Representation for Generation","summary":" In this paper, we introduce X-Ray, an innovative approach to 3D generation\nthat employs a new sequential representation, drawing inspiration from the\ndepth-revealing capabilities of X-Ray scans to meticulously capture both the\nexternal and internal features of objects. Central to our method is the\nutilization of ray casting techniques originating from the camera's viewpoint,\nmeticulously recording the geometric and textural details encountered across\nall intersected surfaces. This process efficiently condenses complete objects\nor scenes into a multi-frame format, just like videos. Such a structure ensures\nthe 3D representation is composed solely of critical surface information.\nHighlighting the practicality and adaptability of our X-Ray representation, we\nshowcase its utility in synthesizing 3D objects, employing a network\narchitecture akin to that used in video diffusion models. The outcomes reveal\nour representation's superior performance in enhancing both the accuracy and\nefficiency of 3D synthesis, heralding new directions for ongoing research and\npractical implementations in the field.\n","authors":["Tao Hu","Wenhang Ge","Yuyang Zhao","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2404.14329v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14326v1","updated":"2024-04-22T16:38:41Z","published":"2024-04-22T16:38:41Z","title":"Machine Learning Techniques for MRI Data Processing at Expanding Scale","summary":" Imaging sites around the world generate growing amounts of medical scan data\nwith ever more versatile and affordable technology. Large-scale studies acquire\nMRI for tens of thousands of participants, together with metadata ranging from\nlifestyle questionnaires to biochemical assays, genetic analyses and more.\nThese large datasets encode substantial information about human health and hold\nconsiderable potential for machine learning training and analysis. This chapter\nexamines ongoing large-scale studies and the challenge of distribution shifts\nbetween them. Transfer learning for overcoming such shifts is discussed,\ntogether with federated learning for safe access to distributed training data\nsecurely held at multiple institutions. Finally, representation learning is\nreviewed as a methodology for encoding embeddings that express abstract\nrelationships in multi-modal input formats.\n","authors":["Taro Langner"],"pdf_url":"https://arxiv.org/pdf/2404.14326v1.pdf","comment":"Book chapter pre-print"},{"id":"http://arxiv.org/abs/2404.14322v1","updated":"2024-04-22T16:33:06Z","published":"2024-04-22T16:33:06Z","title":"A Novel Approach to Chest X-ray Lung Segmentation Using U-net and\n Modified Convolutional Block Attention Module","summary":" Lung segmentation in chest X-ray images is of paramount importance as it\nplays a crucial role in the diagnosis and treatment of various lung diseases.\nThis paper presents a novel approach for lung segmentation in chest X-ray\nimages by integrating U-net with attention mechanisms. 
The proposed method\nenhances the U-net architecture by incorporating a Convolutional Block\nAttention Module (CBAM), which unifies three distinct attention mechanisms:\nchannel attention, spatial attention, and pixel attention. The channel\nattention mechanism enables the model to concentrate on the most informative\nfeatures across various channels. The spatial attention mechanism enhances the\nmodel's precision in localization by focusing on significant spatial locations.\nLastly, the pixel attention mechanism empowers the model to focus on individual\npixels, further refining the model's focus and thereby improving the accuracy\nof segmentation. The adoption of the proposed CBAM in conjunction with the\nU-net architecture marks a significant advancement in the field of medical\nimaging, with potential implications for improving diagnostic precision and\npatient outcomes. The efficacy of this method is validated against contemporary\nstate-of-the-art techniques, showcasing its superiority in segmentation\nperformance.\n","authors":["Mohammad Ali Labbaf Khaniki","Mohammad Manthouri"],"pdf_url":"https://arxiv.org/pdf/2404.14322v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00816v3","updated":"2024-04-22T16:26:37Z","published":"2023-06-01T15:42:06Z","title":"Versatile Backdoor Attack with Visible, Semantic, Sample-Specific, and\n Compatible Triggers","summary":" Deep neural networks (DNNs) can be manipulated to exhibit specific behaviors\nwhen exposed to specific trigger patterns, without affecting their performance\non benign samples, dubbed \\textit{backdoor attack}. Currently, implementing\nbackdoor attacks in physical scenarios still faces significant challenges.\nPhysical attacks are labor-intensive and time-consuming, and the triggers are\nselected in a manual and heuristic way. Moreover, expanding digital attacks to\nphysical scenarios faces many challenges due to their sensitivity to visual\ndistortions and the absence of counterparts in the real world. To address these\nchallenges, we define a novel trigger called the \\textbf{V}isible,\n\\textbf{S}emantic, \\textbf{S}ample-Specific, and \\textbf{C}ompatible (VSSC)\ntrigger, to achieve effective, stealthy and robust simultaneously, which can\nalso be effectively deployed in the physical scenario using corresponding\nobjects. To implement the VSSC trigger, we propose an automated pipeline\ncomprising three modules: a trigger selection module that systematically\nidentifies suitable triggers leveraging large language models, a trigger\ninsertion module that employs generative models to seamlessly integrate\ntriggers into images, and a quality assessment module that ensures the natural\nand successful insertion of triggers through vision-language models. Extensive\nexperimental results and analysis validate the effectiveness, stealthiness, and\nrobustness of the VSSC trigger. It can not only maintain robustness under\nvisual distortions but also demonstrates strong practicality in the physical\nscenario. 
We hope that the proposed VSSC trigger and implementation approach\ncould inspire future studies on designing more practical triggers in backdoor\nattacks.\n","authors":["Ruotong Wang","Hongrui Chen","Zihao Zhu","Li Liu","Baoyuan Wu"],"pdf_url":"https://arxiv.org/pdf/2306.00816v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06687v2","updated":"2024-04-22T16:18:53Z","published":"2024-03-11T13:04:21Z","title":"Advancing Graph Neural Networks with HL-HGAT: A Hodge-Laplacian and\n Attention Mechanism Approach for Heterogeneous Graph-Structured Data","summary":" Graph neural networks (GNNs) have proven effective in capturing relationships\namong nodes in a graph. This study introduces a novel perspective by\nconsidering a graph as a simplicial complex, encompassing nodes, edges,\ntriangles, and $k$-simplices, enabling the definition of graph-structured data\non any $k$-simplices. Our contribution is the Hodge-Laplacian heterogeneous\ngraph attention network (HL-HGAT), designed to learn heterogeneous signal\nrepresentations across $k$-simplices. The HL-HGAT incorporates three key\ncomponents: HL convolutional filters (HL-filters), simplicial projection (SP),\nand simplicial attention pooling (SAP) operators, applied to $k$-simplices.\nHL-filters leverage the unique topology of $k$-simplices encoded by the\nHodge-Laplacian (HL) operator, operating within the spectral domain of the\n$k$-th HL operator. To address computation challenges, we introduce a\npolynomial approximation for HL-filters, exhibiting spatial localization\nproperties. Additionally, we propose a pooling operator to coarsen\n$k$-simplices, combining features through simplicial attention mechanisms of\nself-attention and cross-attention via transformers and SP operators, capturing\ntopological interconnections across multiple dimensions of simplices. The\nHL-HGAT is comprehensively evaluated across diverse graph applications,\nincluding NP-hard problems, graph multi-label and classification challenges,\nand graph regression tasks in logistics, computer vision, biology, chemistry,\nand neuroscience. The results demonstrate the model's efficacy and versatility\nin handling a wide range of graph-based scenarios.\n","authors":["Jinghan Huang","Qiufeng Chen","Yijun Bian","Pengli Zhu","Nanguang Chen","Moo K. Chung","Anqi Qiu"],"pdf_url":"https://arxiv.org/pdf/2403.06687v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11631v2","updated":"2024-04-22T16:18:38Z","published":"2024-02-18T16:17:25Z","title":"Neuromorphic Face Analysis: a Survey","summary":" Neuromorphic sensors, also known as event cameras, are a class of imaging\ndevices mimicking the function of biological visual systems. Unlike traditional\nframe-based cameras, which capture fixed images at discrete intervals,\nneuromorphic sensors continuously generate events that represent changes in\nlight intensity or motion in the visual field with high temporal resolution and\nlow latency. These properties have proven to be interesting in modeling human\nfaces, both from an effectiveness and a privacy-preserving point of view.\nNeuromorphic face analysis however is still a raw and unstructured field of\nresearch, with several attempts at addressing different tasks with no clear\nstandard or benchmark. 
This survey paper presents a comprehensive overview of\ncapabilities, challenges and emerging applications in the domain of\nneuromorphic face analysis, to outline promising directions and open issues.\nAfter discussing the fundamental working principles of neuromorphic vision and\npresenting an in-depth overview of the related research, we explore the current\nstate of available data, standard data representations, emerging challenges,\nand limitations that require further investigation. This paper aims to\nhighlight the recent progress in this evolving field to provide to both\nexperienced and newly come researchers an all-encompassing analysis of the\nstate of the art along with its problems and shortcomings.\n","authors":["Federico Becattini","Lorenzo Berlincioni","Luca Cultrera","Alberto Del Bimbo"],"pdf_url":"https://arxiv.org/pdf/2402.11631v2.pdf","comment":"Submitted to Pattern Recognition Letters"},{"id":"http://arxiv.org/abs/2404.14309v1","updated":"2024-04-22T16:10:38Z","published":"2024-04-22T16:10:38Z","title":"Towards Better Adversarial Purification via Adversarial Denoising\n Diffusion Training","summary":" Recently, diffusion-based purification (DBP) has emerged as a promising\napproach for defending against adversarial attacks. However, previous studies\nhave used questionable methods to evaluate the robustness of DBP models, and their\nexplanations of DBP robustness also lack experimental support. We re-examine\nDBP robustness using precise gradient, and discuss the impact of stochasticity\non DBP robustness. To better explain DBP robustness, we assess DBP robustness\nunder a novel attack setting, Deterministic White-box, and pinpoint\nstochasticity as the main factor in DBP robustness. Our results suggest that\nDBP models rely on stochasticity to evade the most effective attack direction,\nrather than directly countering adversarial perturbations. To improve the\nrobustness of DBP models, we propose Adversarial Denoising Diffusion Training\n(ADDT). This technique uses Classifier-Guided Perturbation Optimization (CGPO)\nto generate adversarial perturbation through guidance from a pre-trained\nclassifier, and uses Rank-Based Gaussian Mapping (RBGM) to convert adversarial\nperturbation into a normal Gaussian distribution. Empirical results show that\nADDT improves the robustness of DBP models. Further experiments confirm that\nADDT equips DBP models with the ability to directly counter adversarial\nperturbations.\n","authors":["Yiming Liu","Kezhao Liu","Yao Xiao","Ziyi Dong","Xiaogang Xu","Pengxu Wei","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2404.14309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14281v1","updated":"2024-04-22T15:29:28Z","published":"2024-04-22T15:29:28Z","title":"Fast and Robust Normal Estimation for Sparse LiDAR Scans","summary":" Light Detection and Ranging (LiDAR) technology has proven to be an important\npart of many robotics systems. Surface normals estimated from LiDAR data are\ncommonly used for a variety of tasks in such systems. As most of today's\nmechanical LiDAR sensors produce sparse data, estimating normals from a single\nscan in a robust manner poses difficulties.\n In this paper, we address the problem of estimating normals for sparse LiDAR\ndata avoiding the typical issues of smoothing out the normals in high curvature\nareas.\n Mechanical LiDARs rotate a set of rigidly mounted lasers. 
One firing of such\na set of lasers produces an array of points where each point's neighbor is\nknown due to the known firing pattern of the scanner. We use this knowledge to\nconnect these points to their neighbors and label them using the angles of the\nlines connecting them. When estimating normals at these points, we only\nconsider points with the same label as neighbors. This allows us to avoid\nestimating normals in high curvature areas.\n We evaluate our approach on various data, both self-recorded and publicly\navailable, acquired using various sparse LiDAR sensors. We show that using our\nmethod for normal estimation leads to normals that are more robust in areas\nwith high curvature which leads to maps of higher quality. We also show that\nour method only incurs a constant factor runtime overhead with respect to a\nlightweight baseline normal estimation procedure and is therefore suited for\noperation in computationally demanding environments.\n","authors":["Igor Bogoslavskyi","Konstantinos Zampogiannis","Raymond Phan"],"pdf_url":"https://arxiv.org/pdf/2404.14281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14280v1","updated":"2024-04-22T15:29:19Z","published":"2024-04-22T15:29:19Z","title":"RESFM: Robust Equivariant Multiview Structure from Motion","summary":" Multiview Structure from Motion is a fundamental and challenging computer\nvision problem. A recent deep-based approach was proposed utilizing matrix\nequivariant architectures for the simultaneous recovery of camera pose and 3D\nscene structure from large image collections. This work however made the\nunrealistic assumption that the point tracks given as input are clean of\noutliers. Here we propose an architecture suited to dealing with outliers by\nadding an inlier/outlier classifying module that respects the model\nequivariance and by adding a robust bundle adjustment step. Experiments\ndemonstrate that our method can be successfully applied in realistic settings\nthat include large image collections and point tracks extracted with common\nheuristics and include many outliers.\n","authors":["Fadi Khatib","Yoni Kasten","Dror Moran","Meirav Galun","Ronen Basri"],"pdf_url":"https://arxiv.org/pdf/2404.14280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14279v1","updated":"2024-04-22T15:28:42Z","published":"2024-04-22T15:28:42Z","title":"Co-designing a Sub-millisecond Latency Event-based Eye Tracking System\n with Submanifold Sparse CNN","summary":" Eye-tracking technology is integral to numerous consumer electronics\napplications, particularly in the realm of virtual and augmented reality\n(VR/AR). These applications demand solutions that excel in three crucial\naspects: low-latency, low-power consumption, and precision. Yet, achieving\noptimal performance across all these fronts presents a formidable challenge,\nnecessitating a balance between sophisticated algorithms and efficient backend\nhardware implementations. In this study, we tackle this challenge through a\nsynergistic software/hardware co-design of the system with an event camera.\nLeveraging the inherent sparsity of event-based input data, we integrate a\nnovel sparse FPGA dataflow accelerator customized for submanifold sparse\nconvolution neural networks (SCNN). The SCNN implemented on the accelerator can\nefficiently extract the embedding feature vector from each representation of\nevent slices by only processing the non-zero activations. 
Subsequently, these\nvectors undergo further processing by a gated recurrent unit (GRU) and a fully\nconnected layer on the host CPU to generate the eye centers. Deployment and\nevaluation of our system reveal outstanding performance metrics. On the\nEvent-based Eye-Tracking-AIS2024 dataset, our system achieves 81% p5 accuracy,\n99.5% p10 accuracy, and 3.71 Mean Euclidean Distance with 0.7 ms latency while\nonly consuming 2.29 mJ per inference. Notably, our solution opens up\nopportunities for future eye-tracking systems. Code is available at\nhttps://github.com/CASR-HKU/ESDA/tree/eye_tracking.\n","authors":["Baoheng Zhang","Yizhao Gao","Jingyuan Li","Hayden Kwok-Hay So"],"pdf_url":"https://arxiv.org/pdf/2404.14279v1.pdf","comment":"Accepted to CVPR 2024 workshop, AIS: Vision, Graphics, and AI for\n Streaming"},{"id":"http://arxiv.org/abs/2312.13328v2","updated":"2024-04-22T15:05:18Z","published":"2023-12-20T17:18:44Z","title":"NeLF-Pro: Neural Light Field Probes for Multi-Scale Novel View Synthesis","summary":" We present NeLF-Pro, a novel representation to model and reconstruct light\nfields in diverse natural scenes that vary in extent and spatial granularity.\nIn contrast to previous fast reconstruction methods that represent the 3D scene\nglobally, we model the light field of a scene as a set of local light field\nfeature probes, parameterized with position and multi-channel 2D feature maps.\nOur central idea is to bake the scene's light field into spatially varying\nlearnable representations and to query point features by weighted blending of\nprobes close to the camera - allowing for mipmap representation and rendering.\nWe introduce a novel vector-matrix-matrix (VMM) factorization technique that\neffectively represents the light field feature probes as products of core\nfactors (i.e., VM) shared among local feature probes, and a basis factor (i.e.,\nM) - efficiently encoding internal relationships and patterns within the scene.\nExperimentally, we demonstrate that NeLF-Pro significantly boosts the\nperformance of feature grid-based representations, and achieves fast\nreconstruction with better rendering quality while maintaining compact\nmodeling. Project webpage https://sinoyou.github.io/nelf-pro/.\n","authors":["Zinuo You","Andreas Geiger","Anpei Chen"],"pdf_url":"https://arxiv.org/pdf/2312.13328v2.pdf","comment":"CVPR 2024 Conference Paper, Camera Ready Version"},{"id":"http://arxiv.org/abs/2404.14249v1","updated":"2024-04-22T15:01:32Z","published":"2024-04-22T15:01:32Z","title":"CLIP-GS: CLIP-Informed Gaussian Splatting for Real-time and\n View-consistent 3D Semantic Understanding","summary":" The recent 3D Gaussian Splatting (GS) exhibits high-quality and real-time\nsynthesis of novel views in 3D scenes. Currently, it primarily focuses on\ngeometry and appearance modeling, while lacking the semantic understanding of\nscenes. To bridge this gap, we present CLIP-GS, which integrates semantics from\nContrastive Language-Image Pre-Training (CLIP) into Gaussian Splatting to\nefficiently comprehend 3D environments without annotated semantic data. In\nspecific, rather than straightforwardly learning and rendering high-dimensional\nsemantic features of 3D Gaussians, which significantly diminishes the\nefficiency, we propose a Semantic Attribute Compactness (SAC) approach. SAC\nexploits the inherent unified semantics within objects to learn compact yet\neffective semantic representations of 3D Gaussians, enabling highly efficient\nrendering (>100 FPS). 
Additionally, to address the semantic ambiguity, caused\nby utilizing view-inconsistent 2D CLIP semantics to supervise Gaussians, we\nintroduce a 3D Coherent Self-training (3DCS) strategy, resorting to the\nmulti-view consistency originated from the 3D model. 3DCS imposes cross-view\nsemantic consistency constraints by leveraging refined, self-predicted\npseudo-labels derived from the trained 3D Gaussian model, thereby enhancing\nprecise and view-consistent segmentation results. Extensive experiments\ndemonstrate that our method remarkably outperforms existing state-of-the-art\napproaches, achieving improvements of 17.29% and 20.81% in mIoU metric on\nReplica and ScanNet datasets, respectively, while maintaining real-time\nrendering speed. Furthermore, our approach exhibits superior performance even\nwith sparse input data, verifying the robustness of our method.\n","authors":["Guibiao Liao","Jiankun Li","Zhenyu Bao","Xiaoqing Ye","Jingdong Wang","Qing Li","Kanglin Liu"],"pdf_url":"https://arxiv.org/pdf/2404.14249v1.pdf","comment":"https://github.com/gbliao/CLIP-GS"},{"id":"http://arxiv.org/abs/2404.14248v1","updated":"2024-04-22T15:01:12Z","published":"2024-04-22T15:01:12Z","title":"NTIRE 2024 Challenge on Low Light Image Enhancement: Methods and Results","summary":" This paper reviews the NTIRE 2024 low light image enhancement challenge,\nhighlighting the proposed solutions and results. The aim of this challenge is\nto discover an effective network design or solution capable of generating\nbrighter, clearer, and visually appealing results when dealing with a variety\nof conditions, including ultra-high resolution (4K and beyond), non-uniform\nillumination, backlighting, extreme darkness, and night scenes. A notable total\nof 428 participants registered for the challenge, with 22 teams ultimately\nmaking valid submissions. This paper meticulously evaluates the\nstate-of-the-art advancements in enhancing low-light images, reflecting the\nsignificant progress and creativity in this field.\n","authors":["Xiaoning Liu","Zongwei Wu","Ao Li","Florin-Alexandru Vasluianu","Yulun Zhang","Shuhang Gu","Le Zhang","Ce Zhu","Radu Timofte","Zhi Jin","Hongjun Wu","Chenxi Wang","Haitao Ling","Yuanhao Cai","Hao Bian","Yuxin Zheng","Jing Lin","Alan Yuille","Ben Shao","Jin Guo","Tianli Liu","Mohao Wu","Yixu Feng","Shuo Hou","Haotian Lin","Yu Zhu","Peng Wu","Wei Dong","Jinqiu Sun","Yanning Zhang","Qingsen Yan","Wenbin Zou","Weipeng Yang","Yunxiang Li","Qiaomu Wei","Tian Ye","Sixiang Chen","Zhao Zhang","Suiyi Zhao","Bo Wang","Yan Luo","Zhichao Zuo","Mingshen Wang","Junhu Wang","Yanyan Wei","Xiaopeng Sun","Yu Gao","Jiancheng Huang","Hongming Chen","Xiang Chen","Hui Tang","Yuanbin Chen","Yuanbo Zhou","Xinwei Dai","Xintao Qiu","Wei Deng","Qinquan Gao","Tong Tong","Mingjia Li","Jin Hu","Xinyu He","Xiaojie Guo"," Sabarinathan","K Uma","A Sasithradevi","B Sathya Bama","S. Mohamed Mansoor Roomi","V. Srivatsav","Jinjuan Wang","Long Sun","Qiuying Chen","Jiahong Shao","Yizhi Zhang","Marcos V. Conde","Daniel Feijoo","Juan C. 
Benito","Alvaro García","Jaeho Lee","Seongwan Kim","Sharif S M A","Nodirkhuja Khujaev","Roman Tsoy","Ali Murtaza","Uswah Khairuddin","Ahmad 'Athif Mohd Faudzi","Sampada Malagi","Amogh Joshi","Nikhil Akalwadi","Chaitra Desai","Ramesh Ashok Tabib","Uma Mudenagudi","Wenyi Lian","Wenjing Lian","Jagadeesh Kalyanshetti","Vijayalaxmi Ashok Aralikatti","Palani Yashaswini","Nitish Upasi","Dikshit Hegde","Ujwala Patil","Sujata C","Xingzhuo Yan","Wei Hao","Minghan Fu","Pooja choksy","Anjali Sarvaiya","Kishor Upla","Kiran Raja","Hailong Yan","Yunkai Zhang","Baiang Li","Jingyi Zhang","Huan Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.14248v1.pdf","comment":"NTIRE 2024 Challenge Report"},{"id":"http://arxiv.org/abs/2404.14247v1","updated":"2024-04-22T15:00:51Z","published":"2024-04-22T15:00:51Z","title":"From Modalities to Styles: Rethinking the Domain Gap in Heterogeneous\n Face Recognition","summary":" Heterogeneous Face Recognition (HFR) focuses on matching faces from different\ndomains, for instance, thermal to visible images, making Face Recognition (FR)\nsystems more versatile for challenging scenarios. However, the domain gap\nbetween these domains and the limited large-scale datasets in the target HFR\nmodalities make it challenging to develop robust HFR models from scratch. In\nour work, we view different modalities as distinct styles and propose a method\nto modulate feature maps of the target modality to address the domain gap. We\npresent a new Conditional Adaptive Instance Modulation (CAIM ) module that\nseamlessly fits into existing FR networks, turning them into HFR-ready systems.\nThe CAIM block modulates intermediate feature maps, efficiently adapting to the\nstyle of the source modality and bridging the domain gap. Our method enables\nend-to-end training using a small set of paired samples. We extensively\nevaluate the proposed approach on various challenging HFR benchmarks, showing\nthat it outperforms state-of-the-art methods. The source code and protocols for\nreproducing the findings will be made publicly available\n","authors":["Anjith George","Sebastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2404.14247v1.pdf","comment":"Accepted for publication in IEEE TBIOM"},{"id":"http://arxiv.org/abs/2404.14241v1","updated":"2024-04-22T14:53:27Z","published":"2024-04-22T14:53:27Z","title":"UrbanCross: Enhancing Satellite Image-Text Retrieval with Cross-Domain\n Adaptation","summary":" Urbanization challenges underscore the necessity for effective satellite\nimage-text retrieval methods to swiftly access specific information enriched\nwith geographic semantics for urban applications. However, existing methods\noften overlook significant domain gaps across diverse urban landscapes,\nprimarily focusing on enhancing retrieval performance within single domains. To\ntackle this issue, we present UrbanCross, a new framework for cross-domain\nsatellite image-text retrieval. UrbanCross leverages a high-quality,\ncross-domain dataset enriched with extensive geo-tags from three countries to\nhighlight domain diversity. It employs the Large Multimodal Model (LMM) for\ntextual refinement and the Segment Anything Model (SAM) for visual\naugmentation, achieving a fine-grained alignment of images, segments and texts,\nyielding a 10% improvement in retrieval performance. Additionally, UrbanCross\nincorporates an adaptive curriculum-based source sampler and a weighted\nadversarial cross-domain fine-tuning module, progressively enhancing\nadaptability across various domains. 
Extensive experiments confirm UrbanCross's\nsuperior efficiency in retrieval and adaptation to new urban environments,\ndemonstrating an average performance increase of 15% over its version without\ndomain adaptation mechanisms, effectively bridging the domain gap.\n","authors":["Siru Zhong","Xixuan Hao","Yibo Yan","Ying Zhang","Yangqiu Song","Yuxuan Liang"],"pdf_url":"https://arxiv.org/pdf/2404.14241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18985v2","updated":"2024-04-22T14:49:36Z","published":"2024-03-27T20:07:39Z","title":"Robustness and Visual Explanation for Black Box Image, Video, and ECG\n Signal Classification with Reinforcement Learning","summary":" We present a generic Reinforcement Learning (RL) framework optimized for\ncrafting adversarial attacks on different model types spanning from ECG signal\nanalysis (1D), image classification (2D), and video classification (3D). The\nframework focuses on identifying sensitive regions and inducing\nmisclassifications with minimal distortions and various distortion types. The\nnovel RL method outperforms state-of-the-art methods for all three\napplications, proving its efficiency. Our RL approach produces superior\nlocalization masks, enhancing interpretability for image classification and ECG\nanalysis models. For applications such as ECG analysis, our platform highlights\ncritical ECG segments for clinicians while ensuring resilience against\nprevalent distortions. This comprehensive tool aims to bolster both resilience\nwith adversarial training and transparency across varied applications and data\ntypes.\n","authors":["Soumyendu Sarkar","Ashwin Ramesh Babu","Sajad Mousavi","Vineet Gundecha","Avisek Naug","Sahand Ghorbanpour"],"pdf_url":"https://arxiv.org/pdf/2403.18985v2.pdf","comment":"AAAI Proceedings reference:\n https://ojs.aaai.org/index.php/AAAI/article/view/30579"},{"id":"http://arxiv.org/abs/2404.14239v1","updated":"2024-04-22T14:47:54Z","published":"2024-04-22T14:47:54Z","title":"MultiBooth: Towards Generating All Your Concepts in an Image from Text","summary":" This paper introduces MultiBooth, a novel and efficient technique for\nmulti-concept customization in image generation from text. Despite the\nsignificant advancements in customized generation methods, particularly with\nthe success of diffusion models, existing methods often struggle with\nmulti-concept scenarios due to low concept fidelity and high inference cost.\nMultiBooth addresses these issues by dividing the multi-concept generation\nprocess into two phases: a single-concept learning phase and a multi-concept\nintegration phase. During the single-concept learning phase, we employ a\nmulti-modal image encoder and an efficient concept encoding technique to learn\na concise and discriminative representation for each concept. In the\nmulti-concept integration phase, we use bounding boxes to define the generation\narea for each concept within the cross-attention map. This method enables the\ncreation of individual concepts within their specified regions, thereby\nfacilitating the formation of multi-concept images. 
This strategy not only\nimproves concept fidelity but also reduces additional inference cost.\nMultiBooth surpasses various baselines in both qualitative and quantitative\nevaluations, showcasing its superior performance and computational efficiency.\nProject Page: https://multibooth.github.io/\n","authors":["Chenyang Zhu","Kai Li","Yue Ma","Chunming He","Li Xiu"],"pdf_url":"https://arxiv.org/pdf/2404.14239v1.pdf","comment":"Project Page: https://multibooth.github.io/ . Github Page:\n https://github.com/chenyangzhu1/MultiBooth"},{"id":"http://arxiv.org/abs/2404.14233v1","updated":"2024-04-22T14:46:10Z","published":"2024-04-22T14:46:10Z","title":"Detecting and Mitigating Hallucination in Large Vision Language Models\n via Fine-Grained AI Feedback","summary":" The rapidly developing Large Vision Language Models (LVLMs) have shown\nnotable capabilities on a range of multi-modal tasks, but still face the\nhallucination phenomena where the generated texts do not align with the given\ncontexts, significantly restricting the usages of LVLMs. Most previous work\ndetects and mitigates hallucination at the coarse-grained level or requires\nexpensive annotation (e.g., labeling by proprietary models or human experts).\nTo address these issues, we propose detecting and mitigating hallucinations in\nLVLMs via fine-grained AI feedback. The basic idea is that we generate a\nsmall-size sentence-level hallucination annotation dataset by proprietary\nmodels, whereby we train a hallucination detection model which can perform\nsentence-level hallucination detection, covering primary hallucination types\n(i.e., object, attribute, and relationship). Then, we propose a\ndetect-then-rewrite pipeline to automatically construct preference dataset for\ntraining hallucination mitigating model. Furthermore, we propose\ndifferentiating the severity of hallucinations, and introducing a Hallucination\nSeverity-Aware Direct Preference Optimization (HSA-DPO) for mitigating\nhallucination in LVLMs by incorporating the severity of hallucinations into\npreference learning. Extensive experiments demonstrate the effectiveness of our\nmethod.\n","authors":["Wenyi Xiao","Ziwei Huang","Leilei Gan","Wanggui He","Haoyuan Li","Zhelun Yu","Hao Jiang","Fei Wu","Linchao Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.14233v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16368v2","updated":"2024-04-22T14:44:45Z","published":"2024-02-26T07:45:14Z","title":"SPINEPS -- Automatic Whole Spine Segmentation of T2-weighted MR images\n using a Two-Phase Approach to Multi-class Semantic and Instance Segmentation","summary":" Purpose. To present SPINEPS, an open-source deep learning approach for\nsemantic and instance segmentation of 14 spinal structures (ten vertebra\nsubstructures, intervertebral discs, spinal cord, spinal canal, and sacrum) in\nwhole body T2w MRI.\n Methods. During this HIPPA-compliant, retrospective study, we utilized the\npublic SPIDER dataset (218 subjects, 63% female) and a subset of the German\nNational Cohort (1423 subjects, mean age 53, 49% female) for training and\nevaluation. We combined CT and T2w segmentations to train models that segment\n14 spinal structures in T2w sagittal scans both semantically and instance-wise.\nPerformance evaluation metrics included Dice similarity coefficient, average\nsymmetrical surface distance, panoptic quality, segmentation quality, and\nrecognition quality. Statistical significance was assessed using the Wilcoxon\nsigned-rank test. 
An in-house dataset was used to qualitatively evaluate\nout-of-distribution samples.\n Results. On the public dataset, our approach outperformed the baseline\n(instance-wise vertebra dice score 0.929 vs. 0.907, p-value<0.001). Training on\nauto-generated annotations and evaluating on manually corrected test data from\nthe GNC yielded global dice scores of 0.900 for vertebrae, 0.960 for\nintervertebral discs, and 0.947 for the spinal canal. Incorporating the SPIDER\ndataset during training increased these scores to 0.920, 0.967, 0.958,\nrespectively.\n Conclusions. The proposed segmentation approach offers robust segmentation of\n14 spinal structures in T2w sagittal images, including the spinal cord, spinal\ncanal, intervertebral discs, endplate, sacrum, and vertebrae. The approach\nyields both a semantic and instance mask as output, thus being easy to utilize.\nThis marks the first publicly available algorithm for whole spine segmentation\nin sagittal T2w MR imaging.\n","authors":["Hendrik Möller","Robert Graf","Joachim Schmitt","Benjamin Keinert","Matan Atad","Anjany Sekuboyina","Felix Streckenbach","Hanna Schön","Florian Kofler","Thomas Kroencke","Stefanie Bette","Stefan Willich","Thomas Keil","Thoralf Niendorf","Tobias Pischon","Beate Endemann","Bjoern Menze","Daniel Rueckert","Jan S. Kirschke"],"pdf_url":"https://arxiv.org/pdf/2402.16368v2.pdf","comment":"https://github.com/Hendrik-code/spineps"},{"id":"http://arxiv.org/abs/2404.00257v2","updated":"2024-04-22T14:38:25Z","published":"2024-03-30T06:17:39Z","title":"YOLOOC: YOLO-based Open-Class Incremental Object Detection with Novel\n Class Discovery","summary":" Because of its use in practice, open-world object detection (OWOD) has gotten\na lot of attention recently. The challenge is how can a model detect novel\nclasses and then incrementally learn them without forgetting previously known\nclasses. Previous approaches hinge on strongly-supervised or weakly-supervised\nnovel-class data for novel-class detection, which may not apply to real\napplications. We construct a new benchmark that novel classes are only\nencountered at the inference stage. And we propose a new OWOD detector YOLOOC,\nbased on the YOLO architecture yet for the Open-Class setup. We introduce label\nsmoothing to prevent the detector from over-confidently mapping novel classes\nto known classes and to discover novel classes. Extensive experiments conducted\non our more realistic setup demonstrate the effectiveness of our method for\ndiscovering novel classes in our new benchmark.\n","authors":["Qian Wan","Xiang Xiang","Qinhao Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.00257v2.pdf","comment":"Withdrawn because it was submitted without consent of the first\n author. In addition, this submission has some errors"},{"id":"http://arxiv.org/abs/2207.04934v3","updated":"2024-04-22T14:11:18Z","published":"2022-07-11T15:15:33Z","title":"Multilevel Geometric Optimization for Regularised Constrained Linear\n Inverse Problems","summary":" We present a geometric multilevel optimization approach that smoothly\nincorporates box constraints. Given a box constrained optimization problem, we\nconsider a hierarchy of models with varying discretization levels. Finer models\nare accurate but expensive to compute, while coarser models are less accurate\nbut cheaper to compute. When working at the fine level, multilevel optimisation\ncomputes the search direction based on a coarser model which speeds up updates\nat the fine level. 
Moreover, exploiting geometry induced by the hierarchy the\nfeasibility of the updates is preserved. In particular, our approach extends\nclassical components of multigrid methods like restriction and prolongation to\nthe Riemannian structure of our constraints.\n","authors":["Sebastian Müller","Stefania Petra","Matthias Zisler"],"pdf_url":"https://arxiv.org/pdf/2207.04934v3.pdf","comment":"25 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.14199v1","updated":"2024-04-22T14:09:53Z","published":"2024-04-22T14:09:53Z","title":"Generalizable Neural Human Renderer","summary":" While recent advancements in animatable human rendering have achieved\nremarkable results, they require test-time optimization for each subject which\ncan be a significant limitation for real-world applications. To address this,\nwe tackle the challenging task of learning a Generalizable Neural Human\nRenderer (GNH), a novel method for rendering animatable humans from monocular\nvideo without any test-time optimization. Our core method focuses on\ntransferring appearance information from the input video to the output image\nplane by utilizing explicit body priors and multi-view geometry. To render the\nsubject in the intended pose, we utilize a straightforward CNN-based image\nrenderer, foregoing the more common ray-sampling or rasterizing-based rendering\nmodules. Our GNH achieves remarkable generalizable, photorealistic rendering\nwith unseen subjects with a three-stage process. We quantitatively and\nqualitatively demonstrate that GNH significantly surpasses current\nstate-of-the-art methods, notably achieving a 31.3% improvement in LPIPS.\n","authors":["Mana Masuda","Jinhyung Park","Shun Iwase","Rawal Khirodkar","Kris Kitani"],"pdf_url":"https://arxiv.org/pdf/2404.14199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14198v1","updated":"2024-04-22T14:07:42Z","published":"2024-04-22T14:07:42Z","title":"BCFPL: Binary classification ConvNet based Fast Parking space\n recognition with Low resolution image","summary":" The automobile plays an important role in the economic activities of mankind,\nespecially in the metropolis. Under the circumstances, the demand of quick\nsearch for available parking spaces has become a major concern for the\nautomobile drivers. Meanwhile, the public sense of privacy is also awaking, the\nimage-based parking space recognition methods lack the attention of privacy\nprotection. In this paper, we proposed a binary convolutional neural network\nwith lightweight design structure named BCFPL, which can be used to train with\nlow-resolution parking space images and offer a reasonable recognition result.\nThe images of parking space were collected from various complex environments,\nincluding different weather, occlusion conditions, and various camera angles.\nWe conducted the training and testing progresses among different datasets and\npartial subsets. The experimental results show that the accuracy of BCFPL does\nnot decrease compared with the original resolution image directly, and can\nreach the average level of the existing mainstream method. 
BCFPL also has low\nhardware requirements and fast recognition speed while meeting the privacy\nrequirements, so it has application potential in intelligent city construction\nand automatic driving field.\n","authors":["Shuo Zhang","Xin Chen","Zixuan Wang"],"pdf_url":"https://arxiv.org/pdf/2404.14198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04661v3","updated":"2024-04-22T14:04:55Z","published":"2024-03-07T17:07:51Z","title":"Dynamic Cross Attention for Audio-Visual Person Verification","summary":" Although person or identity verification has been predominantly explored\nusing individual modalities such as face and voice, audio-visual fusion has\nrecently shown immense potential to outperform unimodal approaches. Audio and\nvisual modalities are often expected to pose strong complementary\nrelationships, which plays a crucial role in effective audio-visual fusion.\nHowever, they may not always strongly complement each other, they may also\nexhibit weak complementary relationships, resulting in poor audio-visual\nfeature representations. In this paper, we propose a Dynamic Cross-Attention\n(DCA) model that can dynamically select the cross-attended or unattended\nfeatures on the fly based on the strong or weak complementary relationships,\nrespectively, across audio and visual modalities. In particular, a conditional\ngating layer is designed to evaluate the contribution of the cross-attention\nmechanism and choose cross-attended features only when they exhibit strong\ncomplementary relationships, otherwise unattended features. Extensive\nexperiments are conducted on the Voxceleb1 dataset to demonstrate the\nrobustness of the proposed model. Results indicate that the proposed model\nconsistently improves the performance on multiple variants of cross-attention\nwhile outperforming the state-of-the-art methods.\n","authors":["R. Gnana Praveen","Jahangir Alam"],"pdf_url":"https://arxiv.org/pdf/2403.04661v3.pdf","comment":"Accepted to FG2024"},{"id":"http://arxiv.org/abs/2404.07600v2","updated":"2024-04-22T13:49:54Z","published":"2024-04-11T09:39:58Z","title":"Implicit and Explicit Language Guidance for Diffusion-based Visual\n Perception","summary":" Text-to-image diffusion models have shown powerful ability on conditional\nimage synthesis. With large-scale vision-language pre-training, diffusion\nmodels are able to generate high-quality images with rich texture and\nreasonable structure under different text prompts. However, it is an open\nproblem to adapt the pre-trained diffusion model for visual perception. In this\npaper, we propose an implicit and explicit language guidance framework for\ndiffusion-based perception, named IEDP. Our IEDP comprises an implicit language\nguidance branch and an explicit language guidance branch. The implicit branch\nemploys frozen CLIP image encoder to directly generate implicit text embeddings\nthat are fed to diffusion model, without using explicit text prompts. The\nexplicit branch utilizes the ground-truth labels of corresponding images as\ntext prompts to condition feature extraction of diffusion model. During\ntraining, we jointly train diffusion model by sharing the model weights of\nthese two branches. As a result, implicit and explicit branches can jointly\nguide feature learning. During inference, we only employ implicit branch for\nfinal prediction, which does not require any ground-truth labels. Experiments\nare performed on two typical perception tasks, including semantic segmentation\nand depth estimation. 
Our IEDP achieves promising performance on both tasks.\nFor semantic segmentation, our IEDP has the mIoU$^\\text{ss}$ score of 55.9% on\nAD20K validation set, which outperforms the baseline method VPD by 2.2%. For\ndepth estimation, our IEDP outperforms the baseline method VPD with a relative\ngain of 11.0%.\n","authors":["Hefeng Wang","Jiale Cao","Jin Xie","Aiping Yang","Yanwei Pang"],"pdf_url":"https://arxiv.org/pdf/2404.07600v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14177v1","updated":"2024-04-22T13:49:42Z","published":"2024-04-22T13:49:42Z","title":"Face2Face: Label-driven Facial Retouching Restoration","summary":" With the popularity of social media platforms such as Instagram and TikTok,\nand the widespread availability and convenience of retouching tools, an\nincreasing number of individuals are utilizing these tools to beautify their\nfacial photographs. This poses challenges for fields that place high demands on\nthe authenticity of photographs, such as identity verification and social\nmedia. By altering facial images, users can easily create deceptive images,\nleading to the dissemination of false information. This may pose challenges to\nthe reliability of identity verification systems and social media, and even\nlead to online fraud. To address this issue, some work has proposed makeup\nremoval methods, but they still lack the ability to restore images involving\ngeometric deformations caused by retouching. To tackle the problem of facial\nretouching restoration, we propose a framework, dubbed Face2Face, which\nconsists of three components: a facial retouching detector, an image\nrestoration model named FaceR, and a color correction module called\nHierarchical Adaptive Instance Normalization (H-AdaIN). Firstly, the facial\nretouching detector predicts a retouching label containing three integers,\nindicating the retouching methods and their corresponding degrees. Then FaceR\nrestores the retouched image based on the predicted retouching label. Finally,\nH-AdaIN is applied to address the issue of color shift arising from diffusion\nmodels. Extensive experiments demonstrate the effectiveness of our framework\nand each module.\n","authors":["Guanhua Zhao","Yu Gu","Xuhan Sheng","Yujie Hu","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.14177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13534v2","updated":"2024-04-22T13:30:40Z","published":"2023-12-21T02:28:41Z","title":"SE(3)-Equivariant and Noise-Invariant 3D Rigid Motion Tracking in Brain\n MRI","summary":" Rigid motion tracking is paramount in many medical imaging applications where\nmovements need to be detected, corrected, or accounted for. Modern strategies\nrely on convolutional neural networks (CNN) and pose this problem as rigid\nregistration. Yet, CNNs do not exploit natural symmetries in this task, as they\nare equivariant to translations (their outputs shift with their inputs) but not\nto rotations. Here we propose EquiTrack, the first method that uses recent\nsteerable SE(3)-equivariant CNNs (E-CNN) for motion tracking. While steerable\nE-CNNs can extract corresponding features across different poses, testing them\non noisy medical images reveals that they do not have enough learning capacity\nto learn noise invariance. Thus, we introduce a hybrid architecture that pairs\na denoiser with an E-CNN to decouple the processing of anatomically irrelevant\nintensity features from the extraction of equivariant spatial features. Rigid\ntransforms are then estimated in closed-form. 
EquiTrack outperforms\nstate-of-the-art learning and optimisation methods for motion tracking in adult\nbrain MRI and fetal MRI time series. Our code is available at\nhttps://github.com/BBillot/EquiTrack.\n","authors":["Benjamin Billot","Neel Dey","Daniel Moyer","Malte Hoffmann","Esra Abaci Turk","Borjan Gagoski","Ellen Grant","Polina Golland"],"pdf_url":"https://arxiv.org/pdf/2312.13534v2.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2404.14162v1","updated":"2024-04-22T13:21:09Z","published":"2024-04-22T13:21:09Z","title":"FLDM-VTON: Faithful Latent Diffusion Model for Virtual Try-on","summary":" Despite their impressive generative performance, latent diffusion model-based\nvirtual try-on (VTON) methods lack faithfulness to crucial details of the\nclothes, such as style, pattern, and text. To alleviate these issues caused by\nthe diffusion stochastic nature and latent supervision, we propose a novel\nFaithful Latent Diffusion Model for VTON, termed FLDM-VTON. FLDM-VTON improves\nthe conventional latent diffusion process in three major aspects. First, we\npropose incorporating warped clothes as both the starting point and local\ncondition, supplying the model with faithful clothes priors. Second, we\nintroduce a novel clothes flattening network to constrain generated try-on\nimages, providing clothes-consistent faithful supervision. Third, we devise a\nclothes-posterior sampling for faithful inference, further enhancing the model\nperformance over conventional clothes-agnostic Gaussian sampling. Extensive\nexperimental results on the benchmark VITON-HD and Dress Code datasets\ndemonstrate that our FLDM-VTON outperforms state-of-the-art baselines and is\nable to generate photo-realistic try-on images with faithful clothing details.\n","authors":["Chenhui Wang","Tao Chen","Zhihao Chen","Zhizhong Huang","Taoran Jiang","Qi Wang","Hongming Shan"],"pdf_url":"https://arxiv.org/pdf/2404.14162v1.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2312.02567v2","updated":"2024-04-22T13:11:56Z","published":"2023-12-05T08:32:27Z","title":"Think Twice Before Selection: Federated Evidential Active Learning for\n Medical Image Analysis with Domain Shifts","summary":" Federated learning facilitates the collaborative learning of a global model\nacross multiple distributed medical institutions without centralizing data.\nNevertheless, the expensive cost of annotation on local clients remains an\nobstacle to effectively utilizing local data. To mitigate this issue, federated\nactive learning methods suggest leveraging local and global model predictions\nto select a relatively small amount of informative local data for annotation.\nHowever, existing methods mainly focus on all local data sampled from the same\ndomain, making them unreliable in realistic medical scenarios with domain\nshifts among different clients. In this paper, we make the first attempt to\nassess the informativeness of local data derived from diverse domains and\npropose a novel methodology termed Federated Evidential Active Learning (FEAL)\nto calibrate the data evaluation under domain shift. Specifically, we introduce\na Dirichlet prior distribution in both local and global models to treat the\nprediction as a distribution over the probability simplex and capture both\naleatoric and epistemic uncertainties by using the Dirichlet-based evidential\nmodel. Then we employ the epistemic uncertainty to calibrate the aleatoric\nuncertainty. 
Afterward, we design a diversity relaxation strategy to reduce\ndata redundancy and maintain data diversity. Extensive experiments and analysis\non five real multi-center medical image datasets demonstrate the superiority of\nFEAL over the state-of-the-art active learning methods in federated scenarios\nwith domain shifts. The code will be available at\nhttps://github.com/JiayiChen815/FEAL.\n","authors":["Jiayi Chen","Benteng Ma","Hengfei Cui","Yong Xia"],"pdf_url":"https://arxiv.org/pdf/2312.02567v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.14135v1","updated":"2024-04-22T12:39:12Z","published":"2024-04-22T12:39:12Z","title":"Text in the Dark: Extremely Low-Light Text Image Enhancement","summary":" Extremely low-light text images are common in natural scenes, making scene\ntext detection and recognition challenging. One solution is to enhance these\nimages using low-light image enhancement methods before text extraction.\nHowever, previous methods often do not try to particularly address the\nsignificance of low-level features, which are crucial for optimal performance\non downstream scene text tasks. Further research is also hindered by the lack\nof extremely low-light text datasets. To address these limitations, we propose\na novel encoder-decoder framework with an edge-aware attention module to focus\non scene text regions during enhancement. Our proposed method uses novel text\ndetection and edge reconstruction losses to emphasize low-level scene text\nfeatures, leading to successful text extraction. Additionally, we present a\nSupervised Deep Curve Estimation (Supervised-DCE) model to synthesize extremely\nlow-light images based on publicly available scene text datasets such as\nICDAR15 (IC15). We also labeled texts in the extremely low-light See In the\nDark (SID) and ordinary LOw-Light (LOL) datasets to allow for objective\nassessment of extremely low-light image enhancement through scene text tasks.\nExtensive experiments show that our model outperforms state-of-the-art methods\nin terms of both image quality and scene text metrics on the widely-used LOL,\nSID, and synthetic IC15 datasets. Code and dataset will be released publicly at\nhttps://github.com/chunchet-ng/Text-in-the-Dark.\n","authors":["Che-Tsung Lin","Chun Chet Ng","Zhi Qin Tan","Wan Jun Nah","Xinyu Wang","Jie Long Kew","Pohao Hsu","Shang Hong Lai","Chee Seng Chan","Christopher Zach"],"pdf_url":"https://arxiv.org/pdf/2404.14135v1.pdf","comment":"The first two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2404.14132v1","updated":"2024-04-22T12:33:18Z","published":"2024-04-22T12:33:18Z","title":"CRNet: A Detail-Preserving Network for Unified Image Restoration and\n Enhancement Task","summary":" In real-world scenarios, images captured often suffer from blurring, noise,\nand other forms of image degradation, and due to sensor limitations, people\nusually can only obtain low dynamic range images. To achieve high-quality\nimages, researchers have attempted various image restoration and enhancement\noperations on photographs, including denoising, deblurring, and high dynamic\nrange imaging. However, merely performing a single type of image enhancement\nstill cannot yield satisfactory images. In this paper, to deal with the\nchallenge above, we propose the Composite Refinement Network (CRNet) to address\nthis issue using multiple exposure images. By fully integrating\ninformation-rich multiple exposure inputs, CRNet can perform unified image\nrestoration and enhancement. 
To improve the quality of image details, CRNet\nexplicitly separates and strengthens high and low-frequency information through\npooling layers, using specially designed Multi-Branch Blocks for effective\nfusion of these frequencies. To increase the receptive field and fully\nintegrate input features, CRNet employs the High-Frequency Enhancement Module,\nwhich includes large kernel convolutions and an inverted bottleneck ConvFFN.\nOur model secured third place in the first track of the Bracketing Image\nRestoration and Enhancement Challenge, surpassing previous SOTA models in both\ntesting metrics and visual quality.\n","authors":["Kangzhen Yang","Tao Hu","Kexin Dai","Genggeng Chen","Yu Cao","Wei Dong","Peng Wu","Yanning Zhang","Qingsen Yan"],"pdf_url":"https://arxiv.org/pdf/2404.14132v1.pdf","comment":"This paper is accepted by CVPR2024 Workshop, Code:\n https://github.com/CalvinYang0/CRNet"},{"id":"http://arxiv.org/abs/2404.14117v1","updated":"2024-04-22T12:07:10Z","published":"2024-04-22T12:07:10Z","title":"Hierarchical localization with panoramic views and triplet loss\n functions","summary":" The main objective of this paper is to address the mobile robot localization\nproblem with Triplet Convolutional Neural Networks and test their robustness\nagainst changes of the lighting conditions. We have used omnidirectional images\nfrom real indoor environments captured in dynamic conditions that have been\nconverted to panoramic format. Two approaches are proposed to address\nlocalization by means of triplet neural networks. First, hierarchical\nlocalization, which consists in estimating the robot position in two stages: a\ncoarse localization, which involves a room retrieval task, and a fine\nlocalization is addressed by means of image retrieval in the previously\nselected room. Second, global localization, which consists in estimating the\nposition of the robot inside the entire map in a unique step. Besides, an\nexhaustive study of the loss function influence on the network learning process\nhas been made. The experimental section proves that triplet neural networks are\nan efficient and robust tool to address the localization of mobile robots in\nindoor environments, considering real operation conditions.\n","authors":["Marcos Alfaro","Juan José Cabrera","Luis Miguel Jiménez","Óscar Reinoso","Luis Payá"],"pdf_url":"https://arxiv.org/pdf/2404.14117v1.pdf","comment":"This work has been submitted to the Artificial Intelligence Journal\n (Ed. Elsevier) for possible publication. Copyright may be transferred without\n notice, after which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2404.14109v1","updated":"2024-04-22T11:52:40Z","published":"2024-04-22T11:52:40Z","title":"CKD: Contrastive Knowledge Distillation from A Sample-wise Perspective","summary":" In this paper, we present a simple yet effective contrastive knowledge\ndistillation approach, which can be formulated as a sample-wise alignment\nproblem with intra- and inter-sample constraints. Unlike traditional knowledge\ndistillation methods that concentrate on maximizing feature similarities or\npreserving class-wise semantic correlations between teacher and student\nfeatures, our method attempts to recover the \"dark knowledge\" by aligning\nsample-wise teacher and student logits. Specifically, our method first\nminimizes logit differences within the same sample by considering their\nnumerical values, thus preserving intra-sample similarities. 
Next, we bridge\nsemantic disparities by leveraging dissimilarities across different samples.\nNote that constraints on intra-sample similarities and inter-sample\ndissimilarities can be efficiently and effectively reformulated into a\ncontrastive learning framework with newly designed positive and negative pairs.\nThe positive pair consists of the teacher's and student's logits derived from\nan identical sample, while the negative pairs are formed by using logits from\ndifferent samples. With this formulation, our method benefits from the\nsimplicity and efficiency of contrastive learning through the optimization of\nInfoNCE, yielding a run-time complexity that is far less than $O(n^2)$, where\n$n$ represents the total number of training samples. Furthermore, our method\ncan eliminate the need for hyperparameter tuning, particularly related to\ntemperature parameters and large batch sizes. We conduct comprehensive\nexperiments on three datasets including CIFAR-100, ImageNet-1K, and MS COCO.\nExperimental results clearly confirm the effectiveness of the proposed method\non both image classification and object detection tasks. Our source codes will\nbe publicly available at https://github.com/wencheng-zhu/CKD.\n","authors":["Wencheng Zhu","Xin Zhou","Pengfei Zhu","Yu Wang","Qinghua Hu"],"pdf_url":"https://arxiv.org/pdf/2404.14109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14099v1","updated":"2024-04-22T11:37:35Z","published":"2024-04-22T11:37:35Z","title":"DynaMMo: Dynamic Model Merging for Efficient Class Incremental Learning\n for Medical Images","summary":" Continual learning, the ability to acquire knowledge from new data while\nretaining previously learned information, is a fundamental challenge in machine\nlearning. Various approaches, including memory replay, knowledge distillation,\nmodel regularization, and dynamic network expansion, have been proposed to\naddress this issue. Thus far, dynamic network expansion methods have achieved\nstate-of-the-art performance at the cost of incurring significant computational\noverhead. This is due to the need for additional model buffers, which makes it\nless feasible in resource-constrained settings, particularly in the medical\ndomain. To overcome this challenge, we propose Dynamic Model Merging, DynaMMo,\na method that merges multiple networks at different stages of model training to\nachieve better computational efficiency. Specifically, we employ lightweight\nlearnable modules for each task and combine them into a unified model to\nminimize computational overhead. DynaMMo achieves this without compromising\nperformance, offering a cost-effective solution for continual learning in\nmedical applications. We evaluate DynaMMo on three publicly available datasets,\ndemonstrating its effectiveness compared to existing approaches. DynaMMo offers\naround 10-fold reduction in GFLOPS with a small drop of 2.76 in average\naccuracy when compared to state-of-the-art dynamic-based approaches. 
The code\nimplementation of this work will be available upon the acceptance of this work\nat https://github.com/BioMedIA-MBZUAI/DynaMMo.\n","authors":["Mohammad Areeb Qazi","Ibrahim Almakky","Anees Ur Rehman Hashmi","Santosh Sanjeev","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2404.14099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02611v2","updated":"2024-04-22T11:15:46Z","published":"2024-03-05T02:59:35Z","title":"A Unified Framework for Microscopy Defocus Deblur with Multi-Pyramid\n Transformer and Contrastive Learning","summary":" Defocus blur is a persistent problem in microscope imaging that poses harm to\npathology interpretation and medical intervention in cell microscopy and\nmicroscope surgery. To address this problem, a unified framework including the\nmulti-pyramid transformer (MPT) and extended frequency contrastive\nregularization (EFCR) is proposed to tackle two outstanding challenges in\nmicroscopy deblur: longer attention span and data deficiency. The MPT employs\nan explicit pyramid structure at each network stage that integrates the\ncross-scale window attention (CSWA), the intra-scale channel attention (ISCA),\nand the feature-enhancing feed-forward network (FEFN) to capture long-range\ncross-scale spatial interaction and global channel context. The EFCR addresses\nthe data deficiency problem by exploring latent deblur signals from different\nfrequency bands. It also enables deblur knowledge transfer to learn\ncross-domain information from extra data, improving deblur performance for\nlabeled and unlabeled data. Extensive experiments and downstream task\nvalidation show the framework achieves state-of-the-art performance across\nmultiple datasets. Project page: https://github.com/PieceZhang/MPT-CataBlur.\n","authors":["Yuelin Zhang","Pengyu Zheng","Wanquan Yan","Chengyu Fang","Shing Shin Cheng"],"pdf_url":"https://arxiv.org/pdf/2403.02611v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09359v2","updated":"2024-04-22T10:52:32Z","published":"2024-04-14T21:14:47Z","title":"Exploring Feedback Generation in Automated Skeletal Movement Assessment:\n A Comprehensive Overview","summary":" The application of machine-learning solutions to movement assessment from\nskeleton videos has attracted significant research attention in recent years.\nThis advancement has made rehabilitation at home more accessible, utilizing\nmovement assessment algorithms that can operate on affordable equipment for\nhuman pose detection and analysis from 2D or 3D videos. While the primary\nobjective of automatic assessment tasks is to score movements, the automatic\ngeneration of feedback highlighting key movement issues has the potential to\nsignificantly enhance and accelerate the rehabilitation process. While numerous\nresearch works exist in the field of automatic movement assessment, only a\nhandful address feedback generation. In this study, we explain the types of\nfeedback that can be generated, review existing solutions for automatic\nfeedback generation, and discuss future research directions. 
To our knowledge,\nthis is the first comprehensive review of feedback generation in skeletal\nmovement assessment.\n","authors":["Tal Hakim"],"pdf_url":"https://arxiv.org/pdf/2404.09359v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14077v1","updated":"2024-04-22T10:49:46Z","published":"2024-04-22T10:49:46Z","title":"Research on Robot Path Planning Based on Reinforcement Learning","summary":" This project has conducted research on robot path planning based on Visual\nSLAM. The main work of this project is as follows: (1) Construction of Visual\nSLAM system. Research has been conducted on the basic architecture of Visual\nSLAM. A Visual SLAM system is developed based on ORB-SLAM3 system, which can\nconduct dense point cloud mapping. (2) The map suitable for two-dimensional\npath planning is obtained through map conversion. This part converts the dense\npoint cloud map obtained by Visual SLAM system into an octomap and then\nperforms projection transformation to the grid map. The map conversion converts\nthe dense point cloud map containing a large amount of redundant map\ninformation into an extremely lightweight grid map suitable for path planning.\n(3) Research on path planning algorithm based on reinforcement learning. This\nproject has conducted experimental comparisons between the Q-learning\nalgorithm, the DQN algorithm, and the SARSA algorithm, and found that DQN is\nthe algorithm with the fastest convergence and best performance in\nhigh-dimensional complex environments. This project has conducted experimental\nverification of the Visual SLAM system in a simulation environment. The\nexperimental results obtained based on open-source dataset and self-made\ndataset prove the feasibility and effectiveness of the designed Visual SLAM\nsystem. At the same time, this project has also conducted comparative\nexperiments on the three reinforcement learning algorithms under the same\nexperimental condition to obtain the optimal algorithm under the experimental\ncondition.\n","authors":["Wang Ruiqi"],"pdf_url":"https://arxiv.org/pdf/2404.14077v1.pdf","comment":"My undergrad final year project report, 44 pages and 15 figures"},{"id":"http://arxiv.org/abs/2404.14076v1","updated":"2024-04-22T10:45:59Z","published":"2024-04-22T10:45:59Z","title":"Noise contrastive estimation with soft targets for conditional models","summary":" Soft targets combined with the cross-entropy loss have shown to improve\ngeneralization performance of deep neural networks on supervised classification\ntasks. The standard cross-entropy loss however assumes data to be categorically\ndistributed, which may often not be the case in practice. In contrast, InfoNCE\ndoes not rely on such an explicit assumption but instead implicitly estimates\nthe true conditional through negative sampling. Unfortunately, it cannot be\ncombined with soft targets in its standard formulation, hindering its use in\ncombination with sophisticated training strategies. In this paper, we address\nthis limitation by proposing a principled loss function that is compatible with\nprobabilistic targets. Our new soft target InfoNCE loss is conceptually simple,\nefficient to compute, and can be derived within the framework of noise\ncontrastive estimation. Using a toy example, we demonstrate shortcomings of the\ncategorical distribution assumption of cross-entropy, and discuss implications\nof sampling from soft distributions. 
We observe that soft target InfoNCE\nperforms on par with strong soft target cross-entropy baselines and outperforms\nhard target NLL and InfoNCE losses on popular benchmarks, including ImageNet.\nFinally, we provide a simple implementation of our loss, geared towards\nsupervised classification and fully compatible with deep classification model\ntrained with cross-entropy.\n","authors":["Johannes Hugger","Virginie Uhlmann"],"pdf_url":"https://arxiv.org/pdf/2404.14076v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17469v2","updated":"2024-04-22T10:40:50Z","published":"2023-06-30T08:34:08Z","title":"Manga109Dialog: A Large-scale Dialogue Dataset for Comics Speaker\n Detection","summary":" The expanding market for e-comics has spurred interest in the development of\nautomated methods to analyze comics. For further understanding of comics, an\nautomated approach is needed to link text in comics to characters speaking the\nwords. Comics speaker detection research has practical applications, such as\nautomatic character assignment for audiobooks, automatic translation according\nto characters' personalities, and inference of character relationships and\nstories.\n To deal with the problem of insufficient speaker-to-text annotations, we\ncreated a new annotation dataset Manga109Dialog based on Manga109.\nManga109Dialog is the world's largest comics speaker annotation dataset,\ncontaining 132,692 speaker-to-text pairs. We further divided our dataset into\ndifferent levels by prediction difficulties to evaluate speaker detection\nmethods more appropriately. Unlike existing methods mainly based on distances,\nwe propose a deep learning-based method using scene graph generation models.\nDue to the unique features of comics, we enhance the performance of our\nproposed model by considering the frame reading order. We conducted experiments\nusing Manga109Dialog and other datasets. Experimental results demonstrate that\nour scene-graph-based approach outperforms existing methods, achieving a\nprediction accuracy of over 75%.\n","authors":["Yingxuan Li","Kiyoharu Aizawa","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2306.17469v2.pdf","comment":"Accepted to ICME2024"},{"id":"http://arxiv.org/abs/2403.01644v3","updated":"2024-04-22T10:34:09Z","published":"2024-03-03T23:46:06Z","title":"OccFusion: A Straightforward and Effective Multi-Sensor Fusion Framework\n for 3D Occupancy Prediction","summary":" This paper introduces OccFusion, a straightforward and efficient sensor\nfusion framework for predicting 3D occupancy. A comprehensive understanding of\n3D scenes is crucial in autonomous driving, and recent models for 3D semantic\noccupancy prediction have successfully addressed the challenge of describing\nreal-world objects with varied shapes and classes. However, existing methods\nfor 3D occupancy prediction heavily rely on surround-view camera images, making\nthem susceptible to changes in lighting and weather conditions. By integrating\nfeatures from additional sensors, such as lidar and surround view radars, our\nframework enhances the accuracy and robustness of occupancy prediction,\nresulting in top-tier performance on the nuScenes benchmark. Furthermore,\nextensive experiments conducted on the nuScenes dataset, including challenging\nnight and rainy scenarios, confirm the superior performance of our sensor\nfusion strategy across various perception ranges. 
The code for this framework\nwill be made available at https://github.com/DanielMing123/OCCFusion.\n","authors":["Zhenxing Ming","Julie Stephany Berrio","Mao Shan","Stewart Worrall"],"pdf_url":"https://arxiv.org/pdf/2403.01644v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14066v1","updated":"2024-04-22T10:23:59Z","published":"2024-04-22T10:23:59Z","title":"SHE-Net: Syntax-Hierarchy-Enhanced Text-Video Retrieval","summary":" The user base of short video apps has experienced unprecedented growth in\nrecent years, resulting in a significant demand for video content analysis. In\nparticular, text-video retrieval, which aims to find the top matching videos\ngiven text descriptions from a vast video corpus, is an essential function, the\nprimary challenge of which is to bridge the modality gap. Nevertheless, most\nexisting approaches treat texts merely as discrete tokens and neglect their\nsyntax structures. Moreover, the abundant spatial and temporal clues in videos\nare often underutilized due to the lack of interaction with text. To address\nthese issues, we argue that using texts as guidance to focus on relevant\ntemporal frames and spatial regions within videos is beneficial. In this paper,\nwe propose a novel Syntax-Hierarchy-Enhanced text-video retrieval method\n(SHE-Net) that exploits the inherent semantic and syntax hierarchy of texts to\nbridge the modality gap from two perspectives. First, to facilitate a more\nfine-grained integration of visual content, we employ the text syntax\nhierarchy, which reveals the grammatical structure of text descriptions, to\nguide the visual representations. Second, to further enhance the multi-modal\ninteraction and alignment, we also utilize the syntax hierarchy to guide the\nsimilarity calculation. We evaluated our method on four public text-video\nretrieval datasets of MSR-VTT, MSVD, DiDeMo, and ActivityNet. The experimental\nresults and ablation studies confirm the advantages of our proposed method.\n","authors":["Xuzheng Yu","Chen Jiang","Xingning Dong","Tian Gan","Ming Yang","Qingpei Guo"],"pdf_url":"https://arxiv.org/pdf/2404.14066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14064v1","updated":"2024-04-22T10:21:41Z","published":"2024-04-22T10:21:41Z","title":"Multi-view Disentanglement for Reinforcement Learning with Multiple\n Cameras","summary":" The performance of image-based Reinforcement Learning (RL) agents can vary\ndepending on the position of the camera used to capture the images. Training on\nmultiple cameras simultaneously, including a first-person egocentric camera,\ncan leverage information from different camera perspectives to improve the\nperformance of RL. However, hardware constraints may limit the availability of\nmultiple cameras in real-world deployment. Additionally, cameras may become\ndamaged in the real-world preventing access to all cameras that were used\nduring training. To overcome these hardware constraints, we propose Multi-View\nDisentanglement (MVD), which uses multiple cameras to learn a policy that\nachieves zero-shot generalisation to any single camera from the training set.\nOur approach is a self-supervised auxiliary task for RL that learns a\ndisentangled representation from multiple cameras, with a shared representation\nthat is aligned across all cameras to allow generalisation to a single camera,\nand a private representation that is camera-specific. 
We show experimentally\nthat an RL agent trained on a single third-person camera is unable to learn an\noptimal policy in many control tasks; but, our approach, benefiting from\nmultiple cameras during training, is able to solve the task using only the same\nsingle third-person camera.\n","authors":["Mhairi Dunion","Stefano V. Albrecht"],"pdf_url":"https://arxiv.org/pdf/2404.14064v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14062v1","updated":"2024-04-22T10:19:16Z","published":"2024-04-22T10:19:16Z","title":"GatedLexiconNet: A Comprehensive End-to-End Handwritten Paragraph Text\n Recognition System","summary":" The Handwritten Text Recognition problem has been a challenge for researchers\nfor the last few decades, especially in the domain of computer vision, a\nsubdomain of pattern recognition. Variability of texts amongst writers,\ncursiveness, and different font styles of handwritten texts with degradation of\nhistorical text images make it a challenging problem. Recognizing scanned\ndocument images in neural network-based systems typically involves a two-step\napproach: segmentation and recognition. However, this method has several\ndrawbacks. These shortcomings encompass challenges in identifying text regions,\nanalyzing layout diversity within pages, and establishing accurate ground truth\nsegmentation. Consequently, these processes are prone to errors, leading to\nbottlenecks in achieving high recognition accuracies. Thus, in this study, we\npresent an end-to-end paragraph recognition system that incorporates internal\nline segmentation and gated convolutional layers based encoder. The gating is a\nmechanism that controls the flow of information and allows to adaptively\nselection of the more relevant features in handwritten text recognition models.\nThe attention module plays an important role in performing internal line\nsegmentation, allowing the page to be processed line-by-line. During the\ndecoding step, we have integrated a connectionist temporal classification-based\nword beam search decoder as a post-processing step. In this work, we have\nextended existing LexiconNet by carefully applying and utilizing gated\nconvolutional layers in the existing deep neural network. Our results at line\nand page levels also favour our new GatedLexiconNet. This study reported\ncharacter error rates of 2.27% on IAM, 0.9% on RIMES, and 2.13% on READ-16, and\nword error rates of 5.73% on IAM, 2.76% on RIMES, and 6.52% on READ-2016\ndatasets.\n","authors":["Lalita Kumari","Sukhdeep Singh","Vaibhav Varish Singh Rathore","Anuj Sharma"],"pdf_url":"https://arxiv.org/pdf/2404.14062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14055v1","updated":"2024-04-22T10:11:31Z","published":"2024-04-22T10:11:31Z","title":"RingID: Rethinking Tree-Ring Watermarking for Enhanced Multi-Key\n Identification","summary":" We revisit Tree-Ring Watermarking, a recent diffusion model watermarking\nmethod that demonstrates great robustness to various attacks. We conduct an\nin-depth study on it and reveal that the distribution shift unintentionally\nintroduced by the watermarking process, apart from watermark pattern matching,\ncontributes to its exceptional robustness. Our investigation further exposes\ninherent flaws in its original design, particularly in its ability to identify\nmultiple distinct keys, where distribution shift offers no assistance. Based on\nthese findings and analysis, we present RingID for enhanced multi-key\nidentification. 
It consists of a novel multi-channel heterogeneous watermarking\napproach designed to seamlessly amalgamate distinctive advantages from diverse\nwatermarks. Coupled with a series of suggested enhancements, RingID exhibits\nsubstantial advancements in multi-key identification.\n","authors":["Hai Ci","Pei Yang","Yiren Song","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2404.14055v1.pdf","comment":"25 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.08965v2","updated":"2024-04-22T10:08:54Z","published":"2024-04-13T11:07:10Z","title":"Seeing Text in the Dark: Algorithm and Benchmark","summary":" Localizing text in low-light environments is challenging due to visual\ndegradations. Although a straightforward solution involves a two-stage pipeline\nwith low-light image enhancement (LLE) as the initial step followed by\ndetector, LLE is primarily designed for human vision instead of machine and can\naccumulate errors. In this work, we propose an efficient and effective\nsingle-stage approach for localizing text in dark that circumvents the need for\nLLE. We introduce a constrained learning module as an auxiliary mechanism\nduring the training stage of the text detector. This module is designed to\nguide the text detector in preserving textual spatial features amidst feature\nmap resizing, thus minimizing the loss of spatial information in texts under\nlow-light visual degradations. Specifically, we incorporate spatial\nreconstruction and spatial semantic constraints within this module to ensure\nthe text detector acquires essential positional and contextual range knowledge.\nOur approach enhances the original text detector's ability to identify text's\nlocal topological features using a dynamic snake feature pyramid network and\nadopts a bottom-up contour shaping strategy with a novel rectangular\naccumulation technique for accurate delineation of streamlined text features.\nIn addition, we present a comprehensive low-light dataset for arbitrary-shaped\ntext, encompassing diverse scenes and languages. Notably, our method achieves\nstate-of-the-art results on this low-light dataset and exhibits comparable\nperformance on standard normal light datasets. The code and dataset will be\nreleased.\n","authors":["Chengpei Xu","Hao Fu","Long Ma","Wenjing Jia","Chengqi Zhang","Feng Xia","Xiaoyu Ai","Binghao Li","Wenjie Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.08965v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13959v5","updated":"2024-04-22T10:00:27Z","published":"2023-03-24T12:33:44Z","title":"Bridging Stereo Geometry and BEV Representation with Reliable Mutual\n Interaction for Semantic Scene Completion","summary":" 3D semantic scene completion (SSC) is an ill-posed perception task that\nrequires inferring a dense 3D scene from limited observations. Previous\ncamera-based methods struggle to predict accurate semantic scenes due to\ninherent geometric ambiguity and incomplete observations. In this paper, we\nresort to stereo matching technique and bird's-eye-view (BEV) representation\nlearning to address such issues in SSC. Complementary to each other, stereo\nmatching mitigates geometric ambiguity with epipolar constraint while BEV\nrepresentation enhances the hallucination ability for invisible regions with\nglobal semantic context. However, due to the inherent representation gap\nbetween stereo geometry and BEV features, it is non-trivial to bridge them for\ndense prediction task of SSC. 
Therefore, we further develop a unified\noccupancy-based framework dubbed BRGScene, which effectively bridges these two\nrepresentations with dense 3D volumes for reliable semantic scene completion.\nSpecifically, we design a novel Mutual Interactive Ensemble (MIE) block for\npixel-level reliable aggregation of stereo geometry and BEV features. Within\nthe MIE block, a Bi-directional Reliable Interaction (BRI) module, enhanced\nwith confidence re-weighting, is employed to encourage fine-grained interaction\nthrough mutual guidance. Besides, a Dual Volume Ensemble (DVE) module is\nintroduced to facilitate complementary aggregation through channel-wise\nrecalibration and multi-group voting. Our method outperforms all published\ncamera-based methods on SemanticKITTI for semantic scene completion. Our code\nis available on \\url{https://github.com/Arlo0o/StereoScene}.\n","authors":["Bohan Li","Yasheng Sun","Zhujin Liang","Dalong Du","Zhuanghui Zhang","Xiaofeng Wang","Yunnan Wang","Xin Jin","Wenjun Zeng"],"pdf_url":"https://arxiv.org/pdf/2303.13959v5.pdf","comment":"IJCAI2024 (https://github.com/Arlo0o/StereoScene)"},{"id":"http://arxiv.org/abs/2404.14044v1","updated":"2024-04-22T09:57:53Z","published":"2024-04-22T09:57:53Z","title":"HashPoint: Accelerated Point Searching and Sampling for Neural Rendering","summary":" In this paper, we address the problem of efficient point searching and\nsampling for volume neural rendering. Within this realm, two typical approaches\nare employed: rasterization and ray tracing. The rasterization-based methods\nenable real-time rendering at the cost of increased memory and lower fidelity.\nIn contrast, the ray-tracing-based methods yield superior quality but demand\nlonger rendering time. We solve this problem by our HashPoint method combining\nthese two strategies, leveraging rasterization for efficient point searching\nand sampling, and ray marching for rendering. Our method optimizes point\nsearching by rasterizing points within the camera's view, organizing them in a\nhash table, and facilitating rapid searches. Notably, we accelerate the\nrendering process by adaptive sampling on the primary surface encountered by\nthe ray. Our approach yields substantial speed-up for a range of\nstate-of-the-art ray-tracing-based methods, maintaining equivalent or superior\naccuracy across synthetic and real test datasets. The code will be available at\nhttps://jiahao-ma.github.io/hashpoint/.\n","authors":["Jiahao Ma","Miaomiao Liu","David Ahmedt-Aristizaba","Chuong Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.14044v1.pdf","comment":"CVPR2024 Highlight"},{"id":"http://arxiv.org/abs/2404.14042v1","updated":"2024-04-22T09:55:50Z","published":"2024-04-22T09:55:50Z","title":"CloudFort: Enhancing Robustness of 3D Point Cloud Classification Against\n Backdoor Attacks via Spatial Partitioning and Ensemble Prediction","summary":" The increasing adoption of 3D point cloud data in various applications, such\nas autonomous vehicles, robotics, and virtual reality, has brought about\nsignificant advancements in object recognition and scene understanding.\nHowever, this progress is accompanied by new security challenges, particularly\nin the form of backdoor attacks. These attacks involve inserting malicious\ninformation into the training data of machine learning models, potentially\ncompromising the model's behavior. In this paper, we propose CloudFort, a novel\ndefense mechanism designed to enhance the robustness of 3D point cloud\nclassifiers against backdoor attacks. 
CloudFort leverages spatial partitioning\nand ensemble prediction techniques to effectively mitigate the impact of\nbackdoor triggers while preserving the model's performance on clean data. We\nevaluate the effectiveness of CloudFort through extensive experiments,\ndemonstrating its strong resilience against the Point Cloud Backdoor Attack\n(PCBA). Our results show that CloudFort significantly enhances the security of\n3D point cloud classification models without compromising their accuracy on\nbenign samples. Furthermore, we explore the limitations of CloudFort and\ndiscuss potential avenues for future research in the field of 3D point cloud\nsecurity. The proposed defense mechanism represents a significant step towards\nensuring the trustworthiness and reliability of point-cloud-based systems in\nreal-world applications.\n","authors":["Wenhao Lan","Yijun Yang","Haihua Shen","Shan Li"],"pdf_url":"https://arxiv.org/pdf/2404.14042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14040v1","updated":"2024-04-22T09:53:55Z","published":"2024-04-22T09:53:55Z","title":"Surgical-DeSAM: Decoupling SAM for Instrument Segmentation in Robotic\n Surgery","summary":" Purpose: The recent Segment Anything Model (SAM) has demonstrated impressive\nperformance with point, text or bounding box prompts, in various applications.\nHowever, in safety-critical surgical tasks, prompting is not possible due to\n(i) the lack of per-frame prompts for supervised learning, (ii) it is\nunrealistic to prompt frame-by-frame in a real-time tracking application, and\n(iii) it is expensive to annotate prompts for offline applications.\n Methods: We develop Surgical-DeSAM to generate automatic bounding box prompts\nfor decoupling SAM to obtain instrument segmentation in real-time robotic\nsurgery. We utilise a commonly used detection architecture, DETR, and\nfine-tuned it to obtain bounding box prompt for the instruments. We then\nempolyed decoupling SAM (DeSAM) by replacing the image encoder with DETR\nencoder and fine-tune prompt encoder and mask decoder to obtain instance\nsegmentation for the surgical instruments. To improve detection performance, we\nadopted the Swin-transformer to better feature representation.\n Results: The proposed method has been validated on two publicly available\ndatasets from the MICCAI surgical instruments segmentation challenge EndoVis\n2017 and 2018. The performance of our method is also compared with SOTA\ninstrument segmentation methods and demonstrated significant improvements with\ndice metrics of 89.62 and 90.70 for the EndoVis 2017 and 2018.\n Conclusion: Our extensive experiments and validations demonstrate that\nSurgical-DeSAM enables real-time instrument segmentation without any additional\nprompting and outperforms other SOTA segmentation methods.\n","authors":["Yuyang Sheng","Sophia Bano","Matthew J. Clarkson","Mobarakol Islam"],"pdf_url":"https://arxiv.org/pdf/2404.14040v1.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2404.14037v1","updated":"2024-04-22T09:51:43Z","published":"2024-04-22T09:51:43Z","title":"GaussianTalker: Speaker-specific Talking Head Synthesis via 3D Gaussian\n Splatting","summary":" Recent works on audio-driven talking head synthesis using Neural Radiance\nFields (NeRF) have achieved impressive results. However, due to inadequate pose\nand expression control caused by NeRF implicit representation, these methods\nstill have some limitations, such as unsynchronized or unnatural lip movements,\nand visual jitter and artifacts. 
In this paper, we propose GaussianTalker, a\nnovel method for audio-driven talking head synthesis based on 3D Gaussian\nSplatting. With the explicit representation property of 3D Gaussians, intuitive\ncontrol of the facial motion is achieved by binding Gaussians to 3D facial\nmodels. GaussianTalker consists of two modules, Speaker-specific Motion\nTranslator and Dynamic Gaussian Renderer. Speaker-specific Motion Translator\nachieves accurate lip movements specific to the target speaker through\nuniversalized audio feature extraction and customized lip motion generation.\nDynamic Gaussian Renderer introduces Speaker-specific BlendShapes to enhance\nfacial detail representation via a latent pose, delivering stable and realistic\nrendered videos. Extensive experimental results suggest that GaussianTalker\noutperforms existing state-of-the-art methods in talking head synthesis,\ndelivering precise lip synchronization and exceptional visual quality. Our\nmethod achieves rendering speeds of 130 FPS on NVIDIA RTX4090 GPU,\nsignificantly exceeding the threshold for real-time rendering performance, and\ncan potentially be deployed on other hardware platforms.\n","authors":["Hongyun Yu","Zhan Qu","Qihang Yu","Jianchuan Chen","Zhonghua Jiang","Zhiwen Chen","Shengyu Zhang","Jimin Xu","Fei Wu","Chengfei Lv","Gang Yu"],"pdf_url":"https://arxiv.org/pdf/2404.14037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14034v1","updated":"2024-04-22T09:50:12Z","published":"2024-04-22T09:50:12Z","title":"PointDifformer: Robust Point Cloud Registration With Neural Diffusion\n and Transformer","summary":" Point cloud registration is a fundamental technique in 3-D computer vision\nwith applications in graphics, autonomous driving, and robotics. However,\nregistration tasks under challenging conditions, under which noise or\nperturbations are prevalent, can be difficult. We propose a robust point cloud\nregistration approach that leverages graph neural partial differential\nequations (PDEs) and heat kernel signatures. Our method first uses graph neural\nPDE modules to extract high dimensional features from point clouds by\naggregating information from the 3-D point neighborhood, thereby enhancing the\nrobustness of the feature representations. Then, we incorporate heat kernel\nsignatures into an attention mechanism to efficiently obtain corresponding\nkeypoints. Finally, a singular value decomposition (SVD) module with learnable\nweights is used to predict the transformation between two point clouds.\nEmpirical experiments on a 3-D point cloud dataset demonstrate that our\napproach not only achieves state-of-the-art performance for point cloud\nregistration but also exhibits better robustness to additive noise or 3-D shape\nperturbations.\n","authors":["Rui She","Qiyu Kang","Sijie Wang","Wee Peng Tay","Kai Zhao","Yang Song","Tianyu Geng","Yi Xu","Diego Navarro Navarro","Andreas Hartmannsgruber"],"pdf_url":"https://arxiv.org/pdf/2404.14034v1.pdf","comment":"Accepted by IEEE Transactions on Geoscience and Remote Sensing"},{"id":"http://arxiv.org/abs/2404.14032v1","updated":"2024-04-22T09:50:05Z","published":"2024-04-22T09:50:05Z","title":"1st Place Solution to the 1st SkatingVerse Challenge","summary":" This paper presents the winning solution for the 1st SkatingVerse Challenge.\nWe propose a method that involves several steps. To begin, we leverage the DINO\nframework to extract the Region of Interest (ROI) and perform precise cropping\nof the raw video footage. 
Subsequently, we employ three distinct models, namely\nUnmasked Teacher, UniformerV2, and InfoGCN, to capture different aspects of the\ndata. By ensembling the prediction results based on logits, our solution\nattains an impressive leaderboard score of 95.73%.\n","authors":["Tao Sun","Yuanzi Fu","Kaicheng Yang","Jian Wu","Ziyong Feng"],"pdf_url":"https://arxiv.org/pdf/2404.14032v1.pdf","comment":"3 pages, 1st SkatingVerse Challenge, 18th IEEE International\n Conference on Automatic Face and Gesture Recognition workshop"},{"id":"http://arxiv.org/abs/2404.14027v1","updated":"2024-04-22T09:43:03Z","published":"2024-04-22T09:43:03Z","title":"OccFeat: Self-supervised Occupancy Feature Prediction for Pretraining\n BEV Segmentation Networks","summary":" We introduce a self-supervised pretraining method, called OcFeat, for\ncamera-only Bird's-Eye-View (BEV) segmentation networks. With OccFeat, we\npretrain a BEV network via occupancy prediction and feature distillation tasks.\nOccupancy prediction provides a 3D geometric understanding of the scene to the\nmodel. However, the geometry learned is class-agnostic. Hence, we add semantic\ninformation to the model in the 3D space through distillation from a\nself-supervised pretrained image foundation model. Models pretrained with our\nmethod exhibit improved BEV semantic segmentation performance, particularly in\nlow-data scenarios. Moreover, empirical results affirm the efficacy of\nintegrating feature distillation with 3D occupancy prediction in our\npretraining approach.\n","authors":["Sophia Sirko-Galouchenko","Alexandre Boulch","Spyros Gidaris","Andrei Bursuc","Antonin Vobecky","Patrick Pérez","Renaud Marlet"],"pdf_url":"https://arxiv.org/pdf/2404.14027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14025v1","updated":"2024-04-22T09:41:03Z","published":"2024-04-22T09:41:03Z","title":"DHRNet: A Dual-Path Hierarchical Relation Network for Multi-Person Pose\n Estimation","summary":" Multi-person pose estimation (MPPE) presents a formidable yet crucial\nchallenge in computer vision. Most existing methods predominantly concentrate\non isolated interaction either between instances or joints, which is inadequate\nfor scenarios demanding concurrent localization of both instances and joints.\nThis paper introduces a novel CNN-based single-stage method, named Dual-path\nHierarchical Relation Network (DHRNet), to extract instance-to-joint and\njoint-to-instance interactions concurrently. Specifically, we design a\ndual-path interaction modeling module (DIM) that strategically organizes\ncross-instance and cross-joint interaction modeling modules in two\ncomplementary orders, enriching interaction information by integrating merits\nfrom different correlation modeling branches. Notably, DHRNet excels in joint\nlocalization by leveraging information from other instances and joints.\nExtensive evaluations on challenging datasets, including COCO, CrowdPose, and\nOCHuman datasets, showcase DHRNet's state-of-the-art performance. 
The code will\nbe released at https://github.com/YHDang/dhrnet-multi-pose-estimation.\n","authors":["Yonghao Dang","Jianqin Yin","Liyuan Liu","Yuan Sun","Yanzhu Hu","Pengxiang Ding"],"pdf_url":"https://arxiv.org/pdf/2404.14025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14022v1","updated":"2024-04-22T09:36:17Z","published":"2024-04-22T09:36:17Z","title":"Collaborative Perception Datasets in Autonomous Driving: A Survey","summary":" This survey offers a comprehensive examination of collaborative perception\ndatasets in the context of Vehicle-to-Infrastructure (V2I), Vehicle-to-Vehicle\n(V2V), and Vehicle-to-Everything (V2X). It highlights the latest developments\nin large-scale benchmarks that accelerate advancements in perception tasks for\nautonomous vehicles. The paper systematically analyzes a variety of datasets,\ncomparing them based on aspects such as diversity, sensor setup, quality,\npublic availability, and their applicability to downstream tasks. It also\nhighlights the key challenges such as domain shift, sensor setup limitations,\nand gaps in dataset diversity and availability. The importance of addressing\nprivacy and security concerns in the development of datasets is emphasized,\nregarding data sharing and dataset creation. The conclusion underscores the\nnecessity for comprehensive, globally accessible datasets and collaborative\nefforts from both technological and research communities to overcome these\nchallenges and fully harness the potential of autonomous driving.\n","authors":["Melih Yazgan","Mythra Varun Akkanapragada","J. Marius Zoellner"],"pdf_url":"https://arxiv.org/pdf/2404.14022v1.pdf","comment":"8 pages,3 figures"},{"id":"http://arxiv.org/abs/2404.14019v1","updated":"2024-04-22T09:33:44Z","published":"2024-04-22T09:33:44Z","title":"A Multimodal Feature Distillation with CNN-Transformer Network for Brain\n Tumor Segmentation with Incomplete Modalities","summary":" Existing brain tumor segmentation methods usually utilize multiple Magnetic\nResonance Imaging (MRI) modalities in brain tumor images for segmentation,\nwhich can achieve better segmentation performance. However, in clinical\napplications, some modalities are missing due to resource constraints, leading\nto severe degradation in the performance of methods applying complete modality\nsegmentation. In this paper, we propose a Multimodal feature distillation with\nConvolutional Neural Network (CNN)-Transformer hybrid network (MCTSeg) for\naccurate brain tumor segmentation with missing modalities. We first design a\nMultimodal Feature Distillation (MFD) module to distill feature-level\nmultimodal knowledge into different unimodality to extract complete modality\ninformation. We further develop a Unimodal Feature Enhancement (UFE) module to\nmodel the relationship between global and local information semantically.\nFinally, we build a Cross-Modal Fusion (CMF) module to explicitly align the\nglobal correlations among different modalities even when some modalities are\nmissing. Complementary features within and across different modalities are\nrefined via the CNN-Transformer hybrid architectures in both the UFE and CMF\nmodules, where local and global dependencies are both captured. Our ablation\nstudy demonstrates the importance of the proposed modules with CNN-Transformer\nnetworks and the convolutional blocks in Transformer for improving the\nperformance of brain tumor segmentation with missing modalities. 
Extensive\nexperiments on the BraTS2018 and BraTS2020 datasets show that the proposed\nMCTSeg framework outperforms the state-of-the-art methods in missing modalities\ncases. Our code is available at: https://github.com/mkang315/MCTSeg.\n","authors":["Ming Kang","Fung Fung Ting","Raphaël C. -W. Phan","Zongyuan Ge","Chee-Ming Ting"],"pdf_url":"https://arxiv.org/pdf/2404.14019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14016v1","updated":"2024-04-22T09:29:14Z","published":"2024-04-22T09:29:14Z","title":"Ungeneralizable Examples","summary":" The training of contemporary deep learning models heavily relies on publicly\navailable data, posing a risk of unauthorized access to online data and raising\nconcerns about data privacy. Current approaches to creating unlearnable data\ninvolve incorporating small, specially designed noises, but these methods\nstrictly limit data usability, overlooking its potential usage in authorized\nscenarios. In this paper, we extend the concept of unlearnable data to\nconditional data learnability and introduce \\textbf{U}n\\textbf{G}eneralizable\n\\textbf{E}xamples (UGEs). UGEs exhibit learnability for authorized users while\nmaintaining unlearnability for potential hackers. The protector defines the\nauthorized network and optimizes UGEs to match the gradients of the original\ndata and its ungeneralizable version, ensuring learnability. To prevent\nunauthorized learning, UGEs are trained by maximizing a designated distance\nloss in a common feature space. Additionally, to further safeguard the\nauthorized side from potential attacks, we introduce additional undistillation\noptimization. Experimental results on multiple datasets and various networks\ndemonstrate that the proposed UGEs framework preserves data usability while\nreducing training performance on hacker networks, even under different types of\nattacks.\n","authors":["Jingwen Ye","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.14016v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.14007v1","updated":"2024-04-22T09:16:25Z","published":"2024-04-22T09:16:25Z","title":"Infusion: Preventing Customized Text-to-Image Diffusion from Overfitting","summary":" Text-to-image (T2I) customization aims to create images that embody specific\nvisual concepts delineated in textual descriptions. However, existing works\nstill face a main challenge, concept overfitting. To tackle this challenge, we\nfirst analyze overfitting, categorizing it into concept-agnostic overfitting,\nwhich undermines non-customized concept knowledge, and concept-specific\noverfitting, which is confined to customize on limited modalities, i.e,\nbackgrounds, layouts, styles. To evaluate the overfitting degree, we further\nintroduce two metrics, i.e, Latent Fisher divergence and Wasserstein metric to\nmeasure the distribution changes of non-customized and customized concept\nrespectively. Drawing from the analysis, we propose Infusion, a T2I\ncustomization method that enables the learning of target concepts to avoid\nbeing constrained by limited training modalities, while preserving\nnon-customized knowledge. Remarkably, Infusion achieves this feat with\nremarkable efficiency, requiring a mere 11KB of trained parameters. 
Extensive\nexperiments also demonstrate that our approach outperforms state-of-the-art\nmethods in both single and multi-concept customized generation.\n","authors":["Weili Zeng","Yichao Yan","Qi Zhu","Zhuo Chen","Pengzhi Chu","Weiming Zhao","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2404.14007v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.14006v1","updated":"2024-04-22T09:16:14Z","published":"2024-04-22T09:16:14Z","title":"Distilled Datamodel with Reverse Gradient Matching","summary":" The proliferation of large-scale AI models trained on extensive datasets has\nrevolutionized machine learning. With these models taking on increasingly\ncentral roles in various applications, the need to understand their behavior\nand enhance interpretability has become paramount. To investigate the impact of\nchanges in training data on a pre-trained model, a common approach is\nleave-one-out retraining. This entails systematically altering the training\ndataset by removing specific samples to observe resulting changes within the\nmodel. However, retraining the model for each altered dataset presents a\nsignificant computational challenge, given the need to perform this operation\nfor every dataset variation. In this paper, we introduce an efficient framework\nfor assessing data impact, comprising offline training and online evaluation\nstages. During the offline training phase, we approximate the influence of\ntraining data on the target model through a distilled synset, formulated as a\nreversed gradient matching problem. For online evaluation, we expedite the\nleave-one-out process using the synset, which is then utilized to compute the\nattribution matrix based on the evaluation objective. Experimental evaluations,\nincluding training data attribution and assessments of data quality,\ndemonstrate that our proposed method achieves comparable model behavior\nevaluation while significantly speeding up the process compared to the direct\nretraining method.\n","authors":["Jingwen Ye","Ruonan Yu","Songhua Liu","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.14006v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.11120v2","updated":"2024-04-22T09:06:54Z","published":"2024-03-17T07:02:55Z","title":"Unifying Feature and Cost Aggregation with Transformers for Semantic and\n Visual Correspondence","summary":" This paper introduces a Transformer-based integrative feature and cost\naggregation network designed for dense matching tasks. In the context of dense\nmatching, many works benefit from one of two forms of aggregation: feature\naggregation, which pertains to the alignment of similar features, or cost\naggregation, a procedure aimed at instilling coherence in the flow estimates\nacross neighboring pixels. In this work, we first show that feature aggregation\nand cost aggregation exhibit distinct characteristics and reveal the potential\nfor substantial benefits stemming from the judicious use of both aggregation\nprocesses. We then introduce a simple yet effective architecture that harnesses\nself- and cross-attention mechanisms to show that our approach unifies feature\naggregation and cost aggregation and effectively harnesses the strengths of\nboth techniques. Within the proposed attention layers, the features and cost\nvolume both complement each other, and the attention layers are interleaved\nthrough a coarse-to-fine design to further promote accurate correspondence\nestimation. 
Finally at inference, our network produces multi-scale predictions,\ncomputes their confidence scores, and selects the most confident flow for final\nprediction. Our framework is evaluated on standard benchmarks for semantic\nmatching, and also applied to geometric matching, where we show that our\napproach achieves significant improvements compared to existing methods.\n","authors":["Sunghwan Hong","Seokju Cho","Seungryong Kim","Stephen Lin"],"pdf_url":"https://arxiv.org/pdf/2403.11120v2.pdf","comment":"Accepted by ICLR'24"},{"id":"http://arxiv.org/abs/2404.13999v1","updated":"2024-04-22T09:03:21Z","published":"2024-04-22T09:03:21Z","title":"CoFInAl: Enhancing Action Quality Assessment with Coarse-to-Fine\n Instruction Alignment","summary":" Action Quality Assessment (AQA) is pivotal for quantifying actions across\ndomains like sports and medical care. Existing methods often rely on\npre-trained backbones from large-scale action recognition datasets to boost\nperformance on smaller AQA datasets. However, this common strategy yields\nsuboptimal results due to the inherent struggle of these backbones to capture\nthe subtle cues essential for AQA. Moreover, fine-tuning on smaller datasets\nrisks overfitting. To address these issues, we propose Coarse-to-Fine\nInstruction Alignment (CoFInAl). Inspired by recent advances in large language\nmodel tuning, CoFInAl aligns AQA with broader pre-trained tasks by\nreformulating it as a coarse-to-fine classification task. Initially, it learns\ngrade prototypes for coarse assessment and then utilizes fixed sub-grade\nprototypes for fine-grained assessment. This hierarchical approach mirrors the\njudging process, enhancing interpretability within the AQA framework.\nExperimental results on two long-term AQA datasets demonstrate CoFInAl achieves\nstate-of-the-art performance with significant correlation gains of 5.49% and\n3.55% on Rhythmic Gymnastics and Fis-V, respectively. Our code is available at\nhttps://github.com/ZhouKanglei/CoFInAl_AQA.\n","authors":["Kanglei Zhou","Junlin Li","Ruizhi Cai","Liyuan Wang","Xingxing Zhang","Xiaohui Liang"],"pdf_url":"https://arxiv.org/pdf/2404.13999v1.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2404.13996v1","updated":"2024-04-22T09:01:14Z","published":"2024-04-22T09:01:14Z","title":"Challenges in automatic and selective plant-clearing","summary":" With the advent of multispectral imagery and AI, there have been numerous\nworks on automatic plant segmentation for purposes such as counting, picking,\nhealth monitoring, localized pesticide delivery, etc. In this paper, we tackle\nthe related problem of automatic and selective plant-clearing in a sustainable\nforestry context, where an autonomous machine has to detect and avoid specific\nplants while clearing any weeds which may compete with the species being\ncultivated. Such an autonomous system requires a high level of robustness to\nweather conditions, plant variability, terrain and weeds while remaining cheap\nand easy to maintain. 
We notably discuss the lack of robustness of spectral\nimagery, investigate the impact of the reference database's size and discuss\nissues specific to AI systems operating in uncontrolled environments.\n","authors":["Fabrice Mayran de Chamisso","Loïc Cotten","Valentine Dhers","Thomas Lompech","Florian Seywert","Arnaud Susset"],"pdf_url":"https://arxiv.org/pdf/2404.13996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13993v1","updated":"2024-04-22T08:59:35Z","published":"2024-04-22T08:59:35Z","title":"Zero-Shot Character Identification and Speaker Prediction in Comics via\n Iterative Multimodal Fusion","summary":" Recognizing characters and predicting speakers of dialogue are critical for\ncomic processing tasks, such as voice generation or translation. However,\nbecause characters vary by comic title, supervised learning approaches like\ntraining character classifiers which require specific annotations for each\ncomic title are infeasible. This motivates us to propose a novel zero-shot\napproach, allowing machines to identify characters and predict speaker names\nbased solely on unannotated comic images. In spite of their importance in\nreal-world applications, these task have largely remained unexplored due to\nchallenges in story comprehension and multimodal integration. Recent large\nlanguage models (LLMs) have shown great capability for text understanding and\nreasoning, while their application to multimodal content analysis is still an\nopen problem. To address this problem, we propose an iterative multimodal\nframework, the first to employ multimodal information for both character\nidentification and speaker prediction tasks. Our experiments demonstrate the\neffectiveness of the proposed framework, establishing a robust baseline for\nthese tasks. Furthermore, since our method requires no training data or\nannotations, it can be used as-is on any comic series.\n","authors":["Yingxuan Li","Ryota Hinami","Kiyoharu Aizawa","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2404.13993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13992v1","updated":"2024-04-22T08:58:57Z","published":"2024-04-22T08:58:57Z","title":"Dynamic Proxy Domain Generalizes the Crowd Localization by Better Binary\n Segmentation","summary":" Crowd localization targets on predicting each instance precise location\nwithin an image. Current advanced methods propose the pixel-wise binary\nclassification to tackle the congested prediction, in which the pixel-level\nthresholds binarize the prediction confidence of being the pedestrian head.\nSince the crowd scenes suffer from extremely varying contents, counts and\nscales, the confidence-threshold learner is fragile and under-generalized\nencountering domain knowledge shift. Moreover, at the most time, the target\ndomain is agnostic in training. Hence, it is imperative to exploit how to\nenhance the generalization of confidence-threshold locator to the latent target\ndomain. In this paper, we propose a Dynamic Proxy Domain (DPD) method to\ngeneralize the learner under domain shift. Concretely, based on the theoretical\nanalysis to the generalization error risk upper bound on the latent target\ndomain to a binary classifier, we propose to introduce a generated proxy domain\nto facilitate generalization. 
Then, based on the theory, we design a DPD\nalgorithm which is composed by a training paradigm and proxy domain generator\nto enhance the domain generalization of the confidence-threshold learner.\nBesides, we conduct our method on five kinds of domain shift scenarios,\ndemonstrating the effectiveness on generalizing the crowd localization. Our\ncode will be available at https://github.com/zhangda1018/DPD.\n","authors":["Junyu Gao","Da Zhang","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.13992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13984v1","updated":"2024-04-22T08:44:34Z","published":"2024-04-22T08:44:34Z","title":"RHanDS: Refining Malformed Hands for Generated Images with Decoupled\n Structure and Style Guidance","summary":" Although diffusion models can generate high-quality human images, their\napplications are limited by the instability in generating hands with correct\nstructures. Some previous works mitigate the problem by considering hand\nstructure yet struggle to maintain style consistency between refined malformed\nhands and other image regions. In this paper, we aim to solve the problem of\ninconsistency regarding hand structure and style. We propose a conditional\ndiffusion-based framework RHanDS to refine the hand region with the help of\ndecoupled structure and style guidance. Specifically, the structure guidance is\nthe hand mesh reconstructed from the malformed hand, serving to correct the\nhand structure. The style guidance is a hand image, e.g., the malformed hand\nitself, and is employed to furnish the style reference for hand refining. In\norder to suppress the structure leakage when referencing hand style and\neffectively utilize hand data to improve the capability of the model, we build\na multi-style hand dataset and introduce a twostage training strategy. In the\nfirst stage, we use paired hand images for training to generate hands with the\nsame style as the reference. In the second stage, various hand images generated\nbased on the human mesh are used for training to enable the model to gain\ncontrol over the hand structure. We evaluate our method and counterparts on the\ntest dataset of the proposed multi-style hand dataset. The experimental results\nshow that RHanDS can effectively refine hands structure- and style- correctly\ncompared with previous methods. The codes and datasets will be available soon.\n","authors":["Chengrui Wang","Pengfei Liu","Min Zhou","Ming Zeng","Xubin Li","Tiezheng Ge","Bo zheng"],"pdf_url":"https://arxiv.org/pdf/2404.13984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13983v1","updated":"2024-04-22T08:44:10Z","published":"2024-04-22T08:44:10Z","title":"Structure-Aware Human Body Reshaping with Adaptive Affinity-Graph\n Network","summary":" Given a source portrait, the automatic human body reshaping task aims at\nediting it to an aesthetic body shape. As the technology has been widely used\nin media, several methods have been proposed mainly focusing on generating\noptical flow to warp the body shape. However, those previous works only\nconsider the local transformation of different body parts (arms, torso, and\nlegs), ignoring the global affinity, and limiting the capacity to ensure\nconsistency and quality across the entire body. In this paper, we propose a\nnovel Adaptive Affinity-Graph Network (AAGN), which extracts the global\naffinity between different body parts to enhance the quality of the generated\noptical flow. 
Specifically, our AAGN primarily introduces the following\ndesigns: (1) we propose an Adaptive Affinity-Graph (AAG) Block that leverages\nthe characteristic of a fully connected graph. AAG represents different body\nparts as nodes in an adaptive fully connected graph and captures all the\naffinities between nodes to obtain a global affinity map. The design could\nbetter improve the consistency between body parts. (2) Besides, for\nhigh-frequency details are crucial for photo aesthetics, a Body Shape\nDiscriminator (BSD) is designed to extract information from both high-frequency\nand spatial domain. Particularly, an SRM filter is utilized to extract\nhigh-frequency details, which are combined with spatial features as input to\nthe BSD. With this design, BSD guides the Flow Generator (FG) to pay attention\nto various fine details rather than rigid pixel-level fitting. Extensive\nexperiments conducted on the BR-5K dataset demonstrate that our framework\nsignificantly enhances the aesthetic appeal of reshaped photos, marginally\nsurpassing all previous work to achieve state-of-the-art in all evaluation\nmetrics.\n","authors":["Qiwen Deng","Yangcen Liu","Wen Li","Guoqing Wang"],"pdf_url":"https://arxiv.org/pdf/2404.13983v1.pdf","comment":"11 pages;"},{"id":"http://arxiv.org/abs/2404.09105v2","updated":"2024-04-22T08:40:43Z","published":"2024-04-14T00:08:56Z","title":"EGGS: Edge Guided Gaussian Splatting for Radiance Fields","summary":" The Gaussian splatting methods are getting popular. However, their loss\nfunction only contains the $\\ell_1$ norm and the structural similarity between\nthe rendered and input images, without considering the edges in these images.\nIt is well-known that the edges in an image provide important information.\nTherefore, in this paper, we propose an Edge Guided Gaussian Splatting (EGGS)\nmethod that leverages the edges in the input images. More specifically, we give\nthe edge region a higher weight than the flat region. With such edge guidance,\nthe resulting Gaussian particles focus more on the edges instead of the flat\nregions. Moreover, such edge guidance does not crease the computation cost\nduring the training and rendering stage. The experiments confirm that such\nsimple edge-weighted loss function indeed improves about $1\\sim2$ dB on several\ndifference data sets. With simply plugging in the edge guidance, the proposed\nmethod can improve all Gaussian splatting methods in different scenarios, such\nas human head modeling, building 3D reconstruction, etc.\n","authors":["Yuanhao Gong"],"pdf_url":"https://arxiv.org/pdf/2404.09105v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13972v1","updated":"2024-04-22T08:28:41Z","published":"2024-04-22T08:28:41Z","title":"Non-Uniform Exposure Imaging via Neuromorphic Shutter Control","summary":" By leveraging the blur-noise trade-off, imaging with non-uniform exposures\nlargely extends the image acquisition flexibility in harsh environments.\nHowever, the limitation of conventional cameras in perceiving intra-frame\ndynamic information prevents existing methods from being implemented in the\nreal-world frame acquisition for real-time adaptive camera shutter control. To\naddress this challenge, we propose a novel Neuromorphic Shutter Control (NSC)\nsystem to avoid motion blurs and alleviate instant noises, where the extremely\nlow latency of events is leveraged to monitor the real-time motion and\nfacilitate the scene-adaptive exposure. 
Furthermore, to stabilize the\ninconsistent Signal-to-Noise Ratio (SNR) caused by the non-uniform exposure\ntimes, we propose an event-based image denoising network within a\nself-supervised learning paradigm, i.e., SEID, exploring the statistics of\nimage noises and inter-frame motion information of events to obtain artificial\nsupervision signals for high-quality imaging in real-world scenes. To\nillustrate the effectiveness of the proposed NSC, we implement it in hardware\nby building a hybrid-camera imaging prototype system, with which we collect a\nreal-world dataset containing well-synchronized frames and events in diverse\nscenarios with different target scenes and motion patterns. Experiments on the\nsynthetic and real-world datasets demonstrate the superiority of our method\nover state-of-the-art approaches.\n","authors":["Mingyuan Lin","Jian Liu","Chi Zhang","Zibo Zhao","Chu He","Lei Yu"],"pdf_url":"https://arxiv.org/pdf/2404.13972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13953v1","updated":"2024-04-22T07:54:53Z","published":"2024-04-22T07:54:53Z","title":"360VOTS: Visual Object Tracking and Segmentation in Omnidirectional\n Videos","summary":" Visual object tracking and segmentation in omnidirectional videos are\nchallenging due to the wide field-of-view and large spherical distortion\nbrought by 360{\\deg} images. To alleviate these problems, we introduce a novel\nrepresentation, extended bounding field-of-view (eBFoV), for target\nlocalization and use it as the foundation of a general 360 tracking framework\nwhich is applicable for both omnidirectional visual object tracking and\nsegmentation tasks. Building upon our previous work on omnidirectional visual\nobject tracking (360VOT), we propose a comprehensive dataset and benchmark that\nincorporates a new component called omnidirectional video object segmentation\n(360VOS). The 360VOS dataset includes 290 sequences accompanied by dense\npixel-wise masks and covers a broader range of target categories. To support\nboth the development and evaluation of algorithms in this domain, we divide the\ndataset into a training subset with 170 sequences and a testing subset with 120\nsequences. Furthermore, we tailor evaluation metrics for both omnidirectional\ntracking and segmentation to ensure rigorous assessment. Through extensive\nexperiments, we benchmark state-of-the-art approaches and demonstrate the\neffectiveness of our proposed 360 tracking framework and training dataset.\nHomepage: https://360vots.hkustvgd.com/\n","authors":["Yinzhe Xu","Huajian Huang","Yingshu Chen","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2404.13953v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13949v1","updated":"2024-04-22T07:50:24Z","published":"2024-04-22T07:50:24Z","title":"PeLiCal: Targetless Extrinsic Calibration via Penetrating Lines for\n RGB-D Cameras with Limited Co-visibility","summary":" RGB-D cameras are crucial in robotic perception, given their ability to\nproduce images augmented with depth data. However, their limited FOV often\nrequires multiple cameras to cover a broader area. In multi-camera RGB-D\nsetups, the goal is typically to reduce camera overlap, optimizing spatial\ncoverage with as few cameras as possible. The extrinsic calibration of these\nsystems introduces additional complexities. Existing methods for extrinsic\ncalibration either necessitate specific tools or highly depend on the accuracy\nof camera motion estimation. 
To address these issues, we present PeLiCal, a\nnovel line-based calibration approach for RGB-D camera systems exhibiting\nlimited overlap. Our method leverages long line features from surroundings, and\nfilters out outliers with a novel convergence voting algorithm, achieving\ntargetless, real-time, and outlier-robust performance compared to existing\nmethods. We open source our implementation on\n\\url{https://github.com/joomeok/PeLiCal.git}.\n","authors":["Jaeho Shin","Seungsang Yun","Ayoung Kim"],"pdf_url":"https://arxiv.org/pdf/2404.13949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07762v3","updated":"2024-04-22T07:48:26Z","published":"2024-04-11T14:03:16Z","title":"NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous\n Driving","summary":" We present a versatile NeRF-based simulator for testing autonomous driving\n(AD) software systems, designed with a focus on sensor-realistic closed-loop\nevaluation and the creation of safety-critical scenarios. The simulator learns\nfrom sequences of real-world driving sensor data and enables reconfigurations\nand renderings of new, unseen scenarios. In this work, we use our simulator to\ntest the responses of AD models to safety-critical scenarios inspired by the\nEuropean New Car Assessment Programme (Euro NCAP). Our evaluation reveals that,\nwhile state-of-the-art end-to-end planners excel in nominal driving scenarios\nin an open-loop setting, they exhibit critical flaws when navigating our\nsafety-critical scenarios in a closed-loop setting. This highlights the need\nfor advancements in the safety and real-world usability of end-to-end planners.\nBy publicly releasing our simulator and scenarios as an easy-to-run evaluation\nsuite, we invite the research community to explore, refine, and validate their\nAD models in controlled, yet highly configurable and challenging\nsensor-realistic environments. Code and instructions can be found at\nhttps://github.com/atonderski/neuro-ncap\n","authors":["William Ljungbergh","Adam Tonderski","Joakim Johnander","Holger Caesar","Kalle Åström","Michael Felsberg","Christoffer Petersson"],"pdf_url":"https://arxiv.org/pdf/2404.07762v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12734v2","updated":"2024-04-22T07:45:18Z","published":"2024-04-19T09:28:16Z","title":"DLoRA-TrOCR: Mixed Text Mode Optical Character Recognition Based On\n Transformer","summary":" With the continuous development of OCR technology and the expansion of\napplication fields, text recognition in complex scenes has become a key\nchallenge. Factors such as multiple fonts, mixed scenes and complex layouts\nseriously affect the recognition accuracy of traditional OCR models. Although\nOCR models based on deep learning have performed well in specific fields or\nsimilar datasets in recent years, the generalization ability and robustness of\nthe model are still a big challenge when facing complex environments with\nmultiple scenes. Furthermore, training an OCR model from scratch or fine-tuning\nall parameters is very demanding on computing resources and inference time,\nwhich limits the flexibility of its application. This study focuses on a\nfundamental aspect of mixed text recognition in response to the challenges\nmentioned above, which involves effectively fine-tuning the pre-trained basic\nOCR model to demonstrate exceptional performance across various downstream\ntasks. To this end, we propose a parameter-efficient mixed text recognition\nmethod based on pre-trained OCR Transformer, namely DLoRA-TrOCR. 
This method\nembeds DoRA into the image encoder and LoRA into the internal structure of the\ntext decoder, enabling efficient parameter fine-tuning for downstream tasks.\nExperimental results show that compared to similar parameter adjustment\nmethods, our model DLoRA-TrOCR has the smallest number of parameters and\nperforms better. It can achieve state-of-the-art performance on complex scene\ndatasets involving simultaneous recognition of mixed handwritten, printed and\nstreet view texts.\n","authors":["Da Chang","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2404.12734v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13947v1","updated":"2024-04-22T07:44:20Z","published":"2024-04-22T07:44:20Z","title":"Boter: Bootstrapping Knowledge Selection and Question Answering for\n Knowledge-based VQA","summary":" Knowledge-based Visual Question Answering (VQA) requires models to\nincorporate external knowledge to respond to questions about visual content.\nPrevious methods mostly follow the \"retrieve and generate\" paradigm. Initially,\nthey utilize a pre-trained retriever to fetch relevant knowledge documents,\nsubsequently employing them to generate answers. While these methods have\ndemonstrated commendable performance in the task, they possess limitations: (1)\nthey employ an independent retriever to acquire knowledge solely based on the\nsimilarity between the query and knowledge embeddings, without assessing\nwhether the knowledge document is truly conducive to helping answer the\nquestion; (2) they convert the image into text and then conduct retrieval and\nanswering in natural language space, which may not ensure comprehensive\nacquisition of all image information. To address these limitations, we propose\nBoter, a novel framework designed to bootstrap knowledge selection and question\nanswering by leveraging the robust multimodal perception capabilities of the\nMultimodal Large Language Model (MLLM). The framework consists of two modules:\nSelector and Answerer, where both are initialized by the MLLM and\nparameter-efficiently finetuned in a simple cycle: find key knowledge in the\nretrieved knowledge documents using the Selector, and then use them to finetune\nthe Answerer to predict answers; obtain the pseudo-labels of key knowledge\ndocuments based on the predictions of the Answerer and weak supervision labels,\nand then finetune the Selector to select key knowledge; repeat. Our framework\nsignificantly enhances the performance of the baseline on the challenging\nopen-domain Knowledge-based VQA benchmark, OK-VQA, achieving a state-of-the-art\naccuracy of 62.83%.\n","authors":["Dongze Hao","Qunbo Wang","Longteng Guo","Jie Jiang","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2404.13947v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05817v2","updated":"2024-04-22T07:43:09Z","published":"2024-03-09T06:48:19Z","title":"SAFDNet: A Simple and Effective Network for Fully Sparse 3D Object\n Detection","summary":" LiDAR-based 3D object detection plays an essential role in autonomous\ndriving. Existing high-performing 3D object detectors usually build dense\nfeature maps in the backbone network and prediction head. However, the\ncomputational costs introduced by the dense feature maps grow quadratically as\nthe perception range increases, making these models hard to scale up to\nlong-range detection. 
Some recent works have attempted to construct fully\nsparse detectors to solve this issue; nevertheless, the resulting models either\nrely on a complex multi-stage pipeline or exhibit inferior performance. In this\nwork, we propose SAFDNet, a straightforward yet highly effective architecture,\ntailored for fully sparse 3D object detection. In SAFDNet, an adaptive feature\ndiffusion strategy is designed to address the center feature missing problem.\nWe conducted extensive experiments on Waymo Open, nuScenes, and Argoverse2\ndatasets. SAFDNet performed slightly better than the previous SOTA on the first\ntwo datasets but much better on the last dataset, which features long-range\ndetection, verifying the efficacy of SAFDNet in scenarios where long-range\ndetection is required. Notably, on Argoverse2, SAFDNet surpassed the previous\nbest hybrid detector HEDNet by 2.6% mAP while being 2.1x faster, and yielded\n2.1% mAP gains over the previous best sparse detector FSDv2 while being 1.3x\nfaster. The code will be available at https://github.com/zhanggang001/HEDNet.\n","authors":["Gang Zhang","Junnan Chen","Guohuan Gao","Jianmin Li","Si Liu","Xiaolin Hu"],"pdf_url":"https://arxiv.org/pdf/2403.05817v2.pdf","comment":"Accepted by CVPR 2024 (Oral)"},{"id":"http://arxiv.org/abs/2404.13944v1","updated":"2024-04-22T07:40:53Z","published":"2024-04-22T07:40:53Z","title":"Gorgeous: Create Your Desired Character Facial Makeup from Any Ideas","summary":" Contemporary makeup transfer methods primarily focus on replicating makeup\nfrom one face to another, considerably limiting their use in creating diverse\nand creative character makeup essential for visual storytelling. Such methods\ntypically fail to address the need for uniqueness and contextual relevance,\nspecifically aligning with character and story settings as they depend heavily\non existing facial makeup in reference images. This approach also presents a\nsignificant challenge when attempting to source a perfectly matched facial\nmakeup style, further complicating the creation of makeup designs inspired by\nvarious story elements, such as theme, background, and props that do not\nnecessarily feature faces. To address these limitations, we introduce\n$Gorgeous$, a novel diffusion-based makeup application method that goes beyond\nsimple transfer by innovatively crafting unique and thematic facial makeup.\nUnlike traditional methods, $Gorgeous$ does not require the presence of a face\nin the reference images. Instead, it draws artistic inspiration from a minimal\nset of three to five images, which can be of any type, and transforms these\nelements into practical makeup applications directly on the face. Our\ncomprehensive experiments demonstrate that $Gorgeous$ can effectively generate\ndistinctive character facial makeup inspired by the chosen thematic reference\nimages. 
This approach opens up new possibilities for integrating broader story\nelements into character makeup, thereby enhancing the narrative depth and\nvisual impact in storytelling.\n","authors":["Jia Wei Sii","Chee Seng Chan"],"pdf_url":"https://arxiv.org/pdf/2404.13944v1.pdf","comment":"Project page: https://github.com/JiaWeiSii/gorgeous/"},{"id":"http://arxiv.org/abs/2308.04956v2","updated":"2024-04-22T07:22:39Z","published":"2023-08-09T13:41:30Z","title":"Improved cryo-EM Pose Estimation and 3D Classification through\n Latent-Space Disentanglement","summary":" Due to the extremely low signal-to-noise ratio (SNR) and unknown poses\n(projection angles and image shifts) in cryo-electron microscopy (cryo-EM)\nexperiments, reconstructing 3D volumes from 2D images is very challenging. In\naddition to these challenges, heterogeneous cryo-EM reconstruction requires\nconformational classification. In popular cryo-EM reconstruction algorithms,\nposes and conformation classification labels must be predicted for every input\ncryo-EM image, which can be computationally costly for large datasets. An\nemerging class of methods adopted the amortized inference approach. In these\nmethods, only a subset of the input dataset is needed to train neural networks\nfor the estimation of poses and conformations. Once trained, these neural\nnetworks can make pose/conformation predictions and 3D reconstructions at low\ncost for the entire dataset during inference. Unfortunately, when facing\nheterogeneous reconstruction tasks, it is hard for current\namortized-inference-based methods to effectively estimate the conformational\ndistribution and poses from entangled latent variables. Here, we propose a\nself-supervised variational autoencoder architecture called \"HetACUMN\" based on\namortized inference. We employed an auxiliary conditional pose prediction task\nby inverting the order of encoder-decoder to explicitly enforce the\ndisentanglement of conformation and pose predictions. Results on simulated\ndatasets show that HetACUMN generated more accurate conformational\nclassifications than other amortized or non-amortized methods. Furthermore, we\nshow that HetACUMN is capable of performing heterogeneous 3D reconstructions of\na real experimental dataset.\n","authors":["Weijie Chen","Yuhang Wang","Lin Yao"],"pdf_url":"https://arxiv.org/pdf/2308.04956v2.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2404.13929v1","updated":"2024-04-22T07:08:13Z","published":"2024-04-22T07:08:13Z","title":"Exploring Kinetic Curves Features for the Classification of Benign and\n Malignant Breast Lesions in DCE-MRI","summary":" Breast cancer is the most common malignant tumor among women and the second\ncause of cancer-related death. Early diagnosis in clinical practice is crucial\nfor timely treatment and prognosis. Dynamic contrast-enhanced magnetic\nresonance imaging (DCE-MRI) has revealed great usability in the preoperative\ndiagnosis and assessing therapy effects thanks to its capability to reflect the\nmorphology and dynamic characteristics of breast lesions. However, most\nexisting computer-assisted diagnosis algorithms only consider conventional\nradiomic features when classifying benign and malignant lesions in DCE-MRI. In\nthis study, we propose to fully leverage the dynamic characteristics from the\nkinetic curves as well as the radiomic features to boost the classification\naccuracy of benign and malignant breast lesions. 
The proposed method is a fully\nautomated solution by directly analyzing the 3D features from the DCE-MRI. The\nproposed method is evaluated on an in-house dataset including 200 DCE-MRI scans\nwith 298 breast tumors (172 benign and 126 malignant tumors), achieving\nfavorable classification accuracy with an area under curve (AUC) of 0.94. By\nsimultaneously considering the dynamic and radiomic features, it is beneficial\nto effectively distinguish between benign and malignant breast lesions.\n","authors":["Zixian Li","Yuming Zhong","Yi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.13929v1.pdf","comment":"6 pages, 8 figures, conference"},{"id":"http://arxiv.org/abs/2404.13923v1","updated":"2024-04-22T07:00:17Z","published":"2024-04-22T07:00:17Z","title":"MaterialSeg3D: Segmenting Dense Materials from 2D Priors for 3D Assets","summary":" Driven by powerful image diffusion models, recent research has achieved the\nautomatic creation of 3D objects from textual or visual guidance. By performing\nscore distillation sampling (SDS) iteratively across different views, these\nmethods succeed in lifting 2D generative prior to the 3D space. However, such a\n2D generative image prior bakes the effect of illumination and shadow into the\ntexture. As a result, material maps optimized by SDS inevitably involve\nspurious correlated components. The absence of precise material definition\nmakes it infeasible to relight the generated assets reasonably in novel scenes,\nwhich limits their application in downstream scenarios. In contrast, humans can\neffortlessly circumvent this ambiguity by deducing the material of the object\nfrom its appearance and semantics. Motivated by this insight, we propose\nMaterialSeg3D, a 3D asset material generation framework to infer underlying\nmaterial from the 2D semantic prior. Based on such a prior model, we devise a\nmechanism to parse material in 3D space. We maintain a UV stack, each map of\nwhich is unprojected from a specific viewpoint. After traversing all\nviewpoints, we fuse the stack through a weighted voting scheme and then employ\nregion unification to ensure the coherence of the object parts. To fuel the\nlearning of semantics prior, we collect a material dataset, named Materialized\nIndividual Objects (MIO), which features abundant images, diverse categories,\nand accurate annotations. Extensive quantitative and qualitative experiments\ndemonstrate the effectiveness of our method.\n","authors":["Zeyu Li","Ruitong Gan","Chuanchen Luo","Yuxi Wang","Jiaheng Liu","Ziwei Zhu Man Zhang","Qing Li","Xucheng Yin","Zhaoxiang Zhang","Junran Peng"],"pdf_url":"https://arxiv.org/pdf/2404.13923v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13921v1","updated":"2024-04-22T06:59:03Z","published":"2024-04-22T06:59:03Z","title":"NeRF-DetS: Enhancing Multi-View 3D Object Detection with\n Sampling-adaptive Network of Continuous NeRF-based Representation","summary":" As a preliminary work, NeRF-Det unifies the tasks of novel view synthesis and\n3D perception, demonstrating that perceptual tasks can benefit from novel view\nsynthesis methods like NeRF, significantly improving the performance of indoor\nmulti-view 3D object detection. Using the geometry MLP of NeRF to direct the\nattention of detection head to crucial parts and incorporating self-supervised\nloss from novel view rendering contribute to the achieved improvement. 
To\nbetter leverage the notable advantages of the continuous representation through\nneural rendering in space, we introduce a novel 3D perception network\nstructure, NeRF-DetS. The key component of NeRF-DetS is the Multi-level\nSampling-Adaptive Network, making the sampling process adaptively from coarse\nto fine. Also, we propose a superior multi-view information fusion method,\nknown as Multi-head Weighted Fusion. This fusion approach efficiently addresses\nthe challenge of losing multi-view information when using arithmetic mean,\nwhile keeping low computational costs. NeRF-DetS outperforms competitive\nNeRF-Det on the ScanNetV2 dataset, by achieving +5.02% and +5.92% improvement\nin mAP@.25 and mAP@.50, respectively.\n","authors":["Chi Huang","Xinyang Li","Shengchuan Zhang","Liujuan Cao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2404.13921v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03537v2","updated":"2024-04-22T23:15:32Z","published":"2024-04-04T15:45:25Z","title":"If It's Not Enough, Make It So: Reducing Authentic Data Demand in Face\n Recognition through Synthetic Faces","summary":" Recent advances in deep face recognition have spurred a growing demand for\nlarge, diverse, and manually annotated face datasets. Acquiring authentic,\nhigh-quality data for face recognition has proven to be a challenge, primarily\ndue to privacy concerns. Large face datasets are primarily sourced from\nweb-based images, lacking explicit user consent. In this paper, we examine\nwhether and how synthetic face data can be used to train effective face\nrecognition models with reduced reliance on authentic images, thereby\nmitigating data collection concerns. First, we explored the performance gap\namong recent state-of-the-art face recognition models, trained with synthetic\ndata only and authentic (scarce) data only. Then, we deepened our analysis by\ntraining a state-of-the-art backbone with various combinations of synthetic and\nauthentic data, gaining insights into optimizing the limited use of the latter\nfor verification accuracy. Finally, we assessed the effectiveness of data\naugmentation approaches on synthetic and authentic data, with the same goal in\nmind. Our results highlighted the effectiveness of FR trained on combined\ndatasets, particularly when combined with appropriate augmentation techniques.\n","authors":["Andrea Atzori","Fadi Boutros","Naser Damer","Gianni Fenu","Mirko Marras"],"pdf_url":"https://arxiv.org/pdf/2404.03537v2.pdf","comment":"Accepted as full paper at FG 2024 main track"},{"id":"http://arxiv.org/abs/2401.08396v3","updated":"2024-04-22T23:04:41Z","published":"2024-01-16T14:41:20Z","title":"Hidden Flaws Behind Expert-Level Accuracy of GPT-4 Vision in Medicine","summary":" Recent studies indicate that Generative Pre-trained Transformer 4 with Vision\n(GPT-4V) outperforms human physicians in medical challenge tasks. However,\nthese evaluations primarily focused on the accuracy of multi-choice questions\nalone. Our study extends the current scope by conducting a comprehensive\nanalysis of GPT-4V's rationales of image comprehension, recall of medical\nknowledge, and step-by-step multimodal reasoning when solving New England\nJournal of Medicine (NEJM) Image Challenges - an imaging quiz designed to test\nthe knowledge and diagnostic capabilities of medical professionals. Evaluation\nresults confirmed that GPT-4V performs comparatively to human physicians\nregarding multi-choice accuracy (81.6% vs. 77.8%). 
GPT-4V also performs well in\ncases where physicians incorrectly answer, with over 78% accuracy. However, we\ndiscovered that GPT-4V frequently presents flawed rationales in cases where it\nmakes the correct final choices (35.5%), most prominent in image comprehension\n(27.2%). Regardless of GPT-4V's high accuracy in multi-choice questions, our\nfindings emphasize the necessity for further in-depth evaluations of its\nrationales before integrating such multimodal AI models into clinical\nworkflows.\n","authors":["Qiao Jin","Fangyuan Chen","Yiliang Zhou","Ziyang Xu","Justin M. Cheung","Robert Chen","Ronald M. Summers","Justin F. Rousseau","Peiyun Ni","Marc J Landsman","Sally L. Baxter","Subhi J. Al'Aref","Yijia Li","Alex Chen","Josef A. Brejt","Michael F. Chiang","Yifan Peng","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2401.08396v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2303.06797v3","updated":"2024-04-22T22:39:12Z","published":"2023-03-13T01:07:32Z","title":"Multichannel Orthogonal Transform-Based Perceptron Layers for Efficient\n ResNets","summary":" In this paper, we propose a set of transform-based neural network layers as\nan alternative to the $3\\times3$ Conv2D layers in Convolutional Neural Networks\n(CNNs). The proposed layers can be implemented based on orthogonal transforms\nsuch as the Discrete Cosine Transform (DCT), Hadamard transform (HT), and\nbiorthogonal Block Wavelet Transform (BWT). Furthermore, by taking advantage of\nthe convolution theorems, convolutional filtering operations are performed in\nthe transform domain using element-wise multiplications. Trainable\nsoft-thresholding layers, that remove noise in the transform domain, bring\nnonlinearity to the transform domain layers. Compared to the Conv2D layer,\nwhich is spatial-agnostic and channel-specific, the proposed layers are\nlocation-specific and channel-specific. Moreover, these proposed layers reduce\nthe number of parameters and multiplications significantly while improving the\naccuracy results of regular ResNets on the ImageNet-1K classification task.\nFurthermore, they can be inserted with a batch normalization layer before the\nglobal average pooling layer in the conventional ResNets as an additional layer\nto improve classification accuracy.\n","authors":["Hongyi Pan","Emadeldeen Hamdan","Xin Zhu","Salih Atici","Ahmet Enis Cetin"],"pdf_url":"https://arxiv.org/pdf/2303.06797v3.pdf","comment":"This work is accepted to IEEE Transactions on Neural Networks and\n Learning Systems. The initial title is \"Orthogonal Transform Domain\n Approaches for the Convolutional Layer\". We changed it to \"Multichannel\n Orthogonal Transform-Based Perceptron Layers for Efficient ResNets\" based on\n reviewer's comment. arXiv admin note: text overlap with arXiv:2211.08577"},{"id":"http://arxiv.org/abs/2404.14606v1","updated":"2024-04-22T22:02:19Z","published":"2024-04-22T22:02:19Z","title":"Cross-Task Multi-Branch Vision Transformer for Facial Expression and\n Mask Wearing Classification","summary":" With wearing masks becoming a new cultural norm, facial expression\nrecognition (FER) while taking masks into account has become a significant\nchallenge. In this paper, we propose a unified multi-branch vision transformer\nfor facial expression recognition and mask wearing classification tasks. Our\napproach extracts shared features for both tasks using a dual-branch\narchitecture that obtains multi-scale feature representations. 
Furthermore, we\npropose a cross-task fusion phase that processes tokens for each task with\nseparate branches, while exchanging information using a cross attention module.\nOur proposed framework reduces the overall complexity compared with using\nseparate networks for both tasks by the simple yet effective cross-task fusion\nphase. Extensive experiments demonstrate that our proposed model performs\nbetter than or on par with different state-of-the-art methods on both facial\nexpression recognition and facial mask wearing classification task.\n","authors":["Armando Zhu","Keqin Li","Tong Wu","Peng Zhao","Wenjing Zhou","Bo Hong"],"pdf_url":"https://arxiv.org/pdf/2404.14606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14588v1","updated":"2024-04-22T21:30:11Z","published":"2024-04-22T21:30:11Z","title":"Brain-Inspired Continual Learning-Robust Feature Distillation and\n Re-Consolidation for Class Incremental Learning","summary":" Artificial intelligence (AI) and neuroscience share a rich history, with\nadvancements in neuroscience shaping the development of AI systems capable of\nhuman-like knowledge retention. Leveraging insights from neuroscience and\nexisting research in adversarial and continual learning, we introduce a novel\nframework comprising two core concepts: feature distillation and\nre-consolidation. Our framework, named Robust Rehearsal, addresses the\nchallenge of catastrophic forgetting inherent in continual learning (CL)\nsystems by distilling and rehearsing robust features. Inspired by the mammalian\nbrain's memory consolidation process, Robust Rehearsal aims to emulate the\nrehearsal of distilled experiences during learning tasks. Additionally, it\nmimics memory re-consolidation, where new experiences influence the integration\nof past experiences to mitigate forgetting. Extensive experiments conducted on\nCIFAR10, CIFAR100, and real-world helicopter attitude datasets showcase the\nsuperior performance of CL models trained with Robust Rehearsal compared to\nbaseline methods. Furthermore, examining different optimization training\nobjectives-joint, continual, and adversarial learning-we highlight the crucial\nrole of feature learning in model performance. This underscores the\nsignificance of rehearsing CL-robust samples in mitigating catastrophic\nforgetting. In conclusion, aligning CL approaches with neuroscience insights\noffers promising solutions to the challenge of catastrophic forgetting, paving\nthe way for more robust and human-like AI systems.\n","authors":["Hikmat Khan","Nidhal Carla Bouaynaya","Ghulam Rasool"],"pdf_url":"https://arxiv.org/pdf/2404.14588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12459v3","updated":"2024-04-22T21:28:17Z","published":"2024-03-19T05:30:50Z","title":"Non-negative Contrastive Learning","summary":" Deep representations have shown promising performance when transferred to\ndownstream tasks in a black-box manner. Yet, their inherent lack of\ninterpretability remains a significant challenge, as these features are often\nopaque to human understanding. In this paper, we propose Non-negative\nContrastive Learning (NCL), a renaissance of Non-negative Matrix Factorization\n(NMF) aimed at deriving interpretable features. The power of NCL lies in its\nenforcement of non-negativity constraints on features, reminiscent of NMF's\ncapability to extract features that align closely with sample clusters. 
NCL not\nonly aligns mathematically well with an NMF objective but also preserves NMF's\ninterpretability attributes, resulting in a more sparse and disentangled\nrepresentation compared to standard contrastive learning (CL). Theoretically,\nwe establish guarantees on the identifiability and downstream generalization of\nNCL. Empirically, we show that these advantages enable NCL to outperform CL\nsignificantly on feature disentanglement, feature selection, as well as\ndownstream classification tasks. At last, we show that NCL can be easily\nextended to other learning scenarios and benefit supervised learning as well.\nCode is available at https://github.com/PKU-ML/non_neg.\n","authors":["Yifei Wang","Qi Zhang","Yaoyu Guo","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2403.12459v3.pdf","comment":"22 pages. Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2404.14581v1","updated":"2024-04-22T21:00:13Z","published":"2024-04-22T21:00:13Z","title":"The Adversarial AI-Art: Understanding, Generation, Detection, and\n Benchmarking","summary":" Generative AI models can produce high-quality images based on text prompts.\nThe generated images often appear indistinguishable from images generated by\nconventional optical photography devices or created by human artists (i.e.,\nreal images). While the outstanding performance of such generative models is\ngenerally well received, security concerns arise. For instance, such image\ngenerators could be used to facilitate fraud or scam schemes, generate and\nspread misinformation, or produce fabricated artworks. In this paper, we\npresent a systematic attempt at understanding and detecting AI-generated images\n(AI-art) in adversarial scenarios. First, we collect and share a dataset of\nreal images and their corresponding artificial counterparts generated by four\npopular AI image generators. The dataset, named ARIA, contains over 140K images\nin five categories: artworks (painting), social media images, news photos,\ndisaster scenes, and anime pictures. This dataset can be used as a foundation\nto support future research on adversarial AI-art. Next, we present a user study\nthat employs the ARIA dataset to evaluate if real-world users can distinguish\nwith or without reference images. In a benchmarking study, we further evaluate\nif state-of-the-art open-source and commercial AI image detectors can\neffectively identify the images in the ARIA dataset. Finally, we present a\nResNet-50 classifier and evaluate its accuracy and transferability on the ARIA\ndataset.\n","authors":["Yuying Li","Zeyan Liu","Junyi Zhao","Liangqin Ren","Fengjun Li","Jiebo Luo","Bo Luo"],"pdf_url":"https://arxiv.org/pdf/2404.14581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08974v2","updated":"2024-04-22T20:58:52Z","published":"2024-03-13T21:43:24Z","title":"Representing Anatomical Trees by Denoising Diffusion of Implicit Neural\n Fields","summary":" Anatomical trees play a central role in clinical diagnosis and treatment\nplanning. However, accurately representing anatomical trees is challenging due\nto their varying and complex topology and geometry. Traditional methods for\nrepresenting tree structures, captured using medical imaging, while invaluable\nfor visualizing vascular and bronchial networks, exhibit drawbacks in terms of\nlimited resolution, flexibility, and efficiency. Recently, implicit neural\nrepresentations (INRs) have emerged as a powerful tool for representing shapes\naccurately and efficiently. 
We propose a novel approach for representing\nanatomical trees using INR, while also capturing the distribution of a set of\ntrees via denoising diffusion in the space of INRs. We accurately capture the\nintricate geometries and topologies of anatomical trees at any desired\nresolution. Through extensive qualitative and quantitative evaluation, we\ndemonstrate high-fidelity tree reconstruction with arbitrary resolution yet\ncompact storage, and versatility across anatomical sites and tree complexities.\n","authors":["Ashish Sinha","Ghassan Hamarneh"],"pdf_url":"https://arxiv.org/pdf/2403.08974v2.pdf","comment":"Preprint. In review. Code: https://github.com/sinAshish/TreeDiffusion"},{"id":"http://arxiv.org/abs/2311.00259v2","updated":"2024-04-22T20:43:55Z","published":"2023-11-01T03:15:10Z","title":"Solutions to Elliptic and Parabolic Problems via Finite Difference Based\n Unsupervised Small Linear Convolutional Neural Networks","summary":" In recent years, there has been a growing interest in leveraging deep\nlearning and neural networks to address scientific problems, particularly in\nsolving partial differential equations (PDEs). However, many neural\nnetwork-based methods like PINNs rely on auto differentiation and sampling\ncollocation points, leading to a lack of interpretability and lower accuracy\nthan traditional numerical methods. As a result, we propose a fully\nunsupervised approach, requiring no training data, to estimate finite\ndifference solutions for PDEs directly via small linear convolutional neural\nnetworks. Our proposed approach uses substantially fewer parameters than\nsimilar finite difference-based approaches while also demonstrating comparable\naccuracy to the true solution for several selected elliptic and parabolic\nproblems compared to the finite difference method.\n","authors":["Adrian Celaya","Keegan Kirk","David Fuentes","Beatrice Riviere"],"pdf_url":"https://arxiv.org/pdf/2311.00259v2.pdf","comment":"Submitted to CMA, under review"},{"id":"http://arxiv.org/abs/2312.01117v3","updated":"2024-04-22T20:38:05Z","published":"2023-12-02T12:23:07Z","title":"Paved2Paradise: Cost-Effective and Scalable LiDAR Simulation by\n Factoring the Real World","summary":" To achieve strong real world performance, neural networks must be trained on\nlarge, diverse datasets; however, obtaining and annotating such datasets is\ncostly and time-consuming, particularly for 3D point clouds. In this paper, we\ndescribe Paved2Paradise, a simple, cost-effective approach for generating fully\nlabeled, diverse, and realistic lidar datasets from scratch, all while\nrequiring minimal human annotation. Our key insight is that, by deliberately\ncollecting separate \"background\" and \"object\" datasets (i.e., \"factoring the\nreal world\"), we can intelligently combine them to produce a combinatorially\nlarge and diverse training set. The Paved2Paradise pipeline thus consists of\nfour steps: (1) collecting copious background data, (2) recording individuals\nfrom the desired object class(es) performing different behaviors in an isolated\nenvironment (like a parking lot), (3) bootstrapping labels for the object\ndataset, and (4) generating samples by placing objects at arbitrary locations\nin backgrounds. To demonstrate the utility of Paved2Paradise, we generated\nsynthetic datasets for two tasks: (1) human detection in orchards (a task for\nwhich no public data exists) and (2) pedestrian detection in urban\nenvironments. 
Qualitatively, we find that a model trained exclusively on\nPaved2Paradise synthetic data is highly effective at detecting humans in\norchards, including when individuals are heavily occluded by tree branches.\nQuantitatively, a model trained on Paved2Paradise data that sources backgrounds\nfrom KITTI performs comparably to a model trained on the actual dataset. These\nresults suggest the Paved2Paradise synthetic data pipeline can help accelerate\npoint cloud model development in sectors where acquiring lidar datasets has\npreviously been cost-prohibitive.\n","authors":["Michael A. Alcorn","Noah Schwartz"],"pdf_url":"https://arxiv.org/pdf/2312.01117v3.pdf","comment":"Accepted to the Synthetic Data for Computer Vision workshop at CVPR\n 2024"},{"id":"http://arxiv.org/abs/2402.13251v2","updated":"2024-04-22T20:35:38Z","published":"2024-02-20T18:59:00Z","title":"FlashTex: Fast Relightable Mesh Texturing with LightControlNet","summary":" Manually creating textures for 3D meshes is time-consuming, even for expert\nvisual content creators. We propose a fast approach for automatically texturing\nan input 3D mesh based on a user-provided text prompt. Importantly, our\napproach disentangles lighting from surface material/reflectance in the\nresulting texture so that the mesh can be properly relit and rendered in any\nlighting environment. We introduce LightControlNet, a new text-to-image model\nbased on the ControlNet architecture, which allows the specification of the\ndesired lighting as a conditioning image to the model. Our text-to-texture\npipeline then constructs the texture in two stages. The first stage produces a\nsparse set of visually consistent reference views of the mesh using\nLightControlNet. The second stage applies a texture optimization based on Score\nDistillation Sampling (SDS) that works with LightControlNet to increase the\ntexture quality while disentangling surface material from lighting. Our\nalgorithm is significantly faster than previous text-to-texture methods, while\nproducing high-quality and relightable textures.\n","authors":["Kangle Deng","Timothy Omernick","Alexander Weiss","Deva Ramanan","Jun-Yan Zhu","Tinghui Zhou","Maneesh Agrawala"],"pdf_url":"https://arxiv.org/pdf/2402.13251v2.pdf","comment":"Project page: https://flashtex.github.io/"},{"id":"http://arxiv.org/abs/2404.14568v1","updated":"2024-04-22T20:30:45Z","published":"2024-04-22T20:30:45Z","title":"UVMap-ID: A Controllable and Personalized UV Map Generative Model","summary":" Recently, diffusion models have made significant strides in synthesizing\nrealistic 2D human images based on provided text prompts. Building upon this,\nresearchers have extended 2D text-to-image diffusion models into the 3D domain\nfor generating human textures (UV Maps). However, some important problems about\nUV Map Generative models are still not solved, i.e., how to generate\npersonalized texture maps for any given face image, and how to define and\nevaluate the quality of these generated texture maps. To solve the above\nproblems, we introduce a novel method, UVMap-ID, which is a controllable and\npersonalized UV Map generative model. Unlike traditional large-scale training\nmethods in 2D, we propose to fine-tune a pre-trained text-to-image diffusion\nmodel which is integrated with a face fusion module for achieving ID-driven\ncustomized generation. To support the finetuning strategy, we introduce a\nsmall-scale attribute-balanced training dataset, including high-quality\ntextures with labeled text and Face ID. 
Additionally, we introduce some metrics\nto evaluate the multiple aspects of the textures. Finally, both quantitative\nand qualitative analyses demonstrate the effectiveness of our method in\ncontrollable and personalized UV Map generation. Code is publicly available via\nhttps://github.com/twowwj/UVMap-ID.\n","authors":["Weijie Wang","Jichao Zhang","Chang Liu","Xia Li","Xingqian Xu","Humphrey Shi","Nicu Sebe","Bruno Lepri"],"pdf_url":"https://arxiv.org/pdf/2404.14568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14565v1","updated":"2024-04-22T20:21:32Z","published":"2024-04-22T20:21:32Z","title":"\"Where am I?\" Scene Retrieval with Language","summary":" Natural language interfaces to embodied AI are becoming more ubiquitous in\nour daily lives. This opens further opportunities for language-based\ninteraction with embodied agents, such as a user instructing an agent to\nexecute some task in a specific location. For example, \"put the bowls back in\nthe cupboard next to the fridge\" or \"meet me at the intersection under the red\nsign.\" As such, we need methods that interface between natural language and map\nrepresentations of the environment. To this end, we explore the question of\nwhether we can use an open-set natural language query to identify a scene\nrepresented by a 3D scene graph. We define this task as \"language-based\nscene-retrieval\" and it is closely related to \"coarse-localization,\" but we are\ninstead searching for a match from a collection of disjoint scenes and not\nnecessarily a large-scale continuous map. Therefore, we present\nText2SceneGraphMatcher, a \"scene-retrieval\" pipeline that learns joint\nembeddings between text descriptions and scene graphs to determine if they are\nmatched. The code, trained models, and datasets will be made public.\n","authors":["Jiaqi Chen","Daniel Barath","Iro Armeni","Marc Pollefeys","Hermann Blum"],"pdf_url":"https://arxiv.org/pdf/2404.14565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14560v1","updated":"2024-04-22T20:15:43Z","published":"2024-04-22T20:15:43Z","title":"Adaptive Local Binary Pattern: A Novel Feature Descriptor for Enhanced\n Analysis of Kidney Abnormalities in CT Scan Images using ensemble based\n Machine Learning Approach","summary":" The shortage of nephrologists and the growing public health concern over\nrenal failure have spurred the demand for AI systems capable of autonomously\ndetecting kidney abnormalities. Renal failure, marked by a gradual decline in\nkidney function, can result from factors like cysts, stones, and tumors.\nChronic kidney disease may go unnoticed initially, leading to untreated cases\nuntil they reach an advanced stage. The dataset, comprising 12,427 images from\nmultiple hospitals in Dhaka, was categorized into four groups: cyst, tumor,\nstone, and normal. Our methodology aims to enhance CT scan image quality using\nCropping, Resizing, and CALHE techniques, followed by feature extraction with\nour proposed Adaptive Local Binary Pattern (A-LBP) feature extraction method\ncompared with the state-of-the-art local binary pattern (LBP) method. Our\nproposed features fed into classifiers such as Random Forest, Decision Tree,\nNaive Bayes, K-Nearest Neighbor, and SVM. We explored an ensemble model with\nsoft voting to get a more robust model for our task. 
We got the highest of more\nthan 99% in accuracy using our feature descriptor and ensembling five\nclassifiers (Random Forest, Decision Tree, Naive Bayes, K-Nearest Neighbor,\nSupport Vector Machine) with the soft voting method.\n","authors":["Tahmim Hossain","Faisal Sayed","Solehin Islam"],"pdf_url":"https://arxiv.org/pdf/2404.14560v1.pdf","comment":"17 pages, 5 tables, 4 figures"},{"id":"http://arxiv.org/abs/2312.09067v2","updated":"2024-04-22T20:06:03Z","published":"2023-12-14T16:04:14Z","title":"Holodeck: Language Guided Generation of 3D Embodied AI Environments","summary":" 3D simulated environments play a critical role in Embodied AI, but their\ncreation requires expertise and extensive manual effort, restricting their\ndiversity and scope. To mitigate this limitation, we present Holodeck, a system\nthat generates 3D environments to match a user-supplied prompt fully\nautomatedly. Holodeck can generate diverse scenes, e.g., arcades, spas, and\nmuseums, adjust the designs for styles, and can capture the semantics of\ncomplex queries such as \"apartment for a researcher with a cat\" and \"office of\na professor who is a fan of Star Wars\". Holodeck leverages a large language\nmodel (i.e., GPT-4) for common sense knowledge about what the scene might look\nlike and uses a large collection of 3D assets from Objaverse to populate the\nscene with diverse objects. To address the challenge of positioning objects\ncorrectly, we prompt GPT-4 to generate spatial relational constraints between\nobjects and then optimize the layout to satisfy those constraints. Our\nlarge-scale human evaluation shows that annotators prefer Holodeck over\nmanually designed procedural baselines in residential scenes and that Holodeck\ncan produce high-quality outputs for diverse scene types. We also demonstrate\nan exciting application of Holodeck in Embodied AI, training agents to navigate\nin novel scenes like music rooms and daycares without human-constructed data,\nwhich is a significant step forward in developing general-purpose embodied\nagents.\n","authors":["Yue Yang","Fan-Yun Sun","Luca Weihs","Eli VanderBilt","Alvaro Herrasti","Winson Han","Jiajun Wu","Nick Haber","Ranjay Krishna","Lingjie Liu","Chris Callison-Burch","Mark Yatskar","Aniruddha Kembhavi","Christopher Clark"],"pdf_url":"https://arxiv.org/pdf/2312.09067v2.pdf","comment":"Published in CVPR 2024, 21 pages, 27 figures, 2 tables"},{"id":"http://arxiv.org/abs/2404.14542v1","updated":"2024-04-22T19:29:12Z","published":"2024-04-22T19:29:12Z","title":"UVEB: A Large-scale Benchmark and Baseline Towards Real-World Underwater\n Video Enhancement","summary":" Learning-based underwater image enhancement (UIE) methods have made great\nprogress. However, the lack of large-scale and high-quality paired training\nsamples has become the main bottleneck hindering the development of UIE. The\ninter-frame information in underwater videos can accelerate or optimize the UIE\nprocess. Thus, we constructed the first large-scale high-resolution underwater\nvideo enhancement benchmark (UVEB) to promote the development of underwater\nvision.It contains 1,308 pairs of video sequences and more than 453,000\nhigh-resolution with 38\\% Ultra-High-Definition (UHD) 4K frame pairs. UVEB\ncomes from multiple countries, containing various scenes and video degradation\ntypes to adapt to diverse and complex underwater environments. We also propose\nthe first supervised underwater video enhancement method, UVE-Net. 
UVE-Net\nconverts the current frame information into convolutional kernels and passes\nthem to adjacent frames for efficient inter-frame information exchange. By\nfully utilizing the redundant degraded information of underwater videos,\nUVE-Net completes video enhancement better. Experiments show the effective\nnetwork design and good performance of UVE-Net.\n","authors":["Yaofeng Xie","Lingwei Kong","Kai Chen","Ziqiang Zheng","Xiao Yu","Zhibin Yu","Bing Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.14542v1.pdf","comment":"10 pages,CVPR2024 accept"},{"id":"http://arxiv.org/abs/2403.08755v2","updated":"2024-04-22T19:17:49Z","published":"2024-03-13T17:53:47Z","title":"DAM: Dynamic Adapter Merging for Continual Video QA Learning","summary":" We present a parameter-efficient method for continual video\nquestion-answering (VidQA) learning. Our method, named DAM, uses the proposed\nDynamic Adapter Merging to (i) mitigate catastrophic forgetting, (ii) enable\nefficient adaptation to continually arriving datasets, (iii) handle inputs from\nunknown datasets during inference, and (iv) enable knowledge sharing across\nsimilar dataset domains. Given a set of continually streaming VidQA datasets,\nwe sequentially train dataset-specific adapters for each dataset while freezing\nthe parameters of a large pretrained video-language backbone. During inference,\ngiven a video-question sample from an unknown domain, our method first uses the\nproposed non-parametric router function to compute a probability for each\nadapter, reflecting how relevant that adapter is to the current video-question\ninput instance. Subsequently, the proposed dynamic adapter merging scheme\naggregates all the adapter weights into a new adapter instance tailored for\nthat particular test sample to compute the final VidQA prediction, mitigating\nthe impact of inaccurate router predictions and facilitating knowledge sharing\nacross domains. Our DAM model outperforms prior state-of-the-art continual\nlearning approaches by 9.1% while exhibiting 1.9% less forgetting on 6 VidQA\ndatasets spanning various domains. We further extend DAM to continual image\nclassification and image QA and outperform prior methods by a large margin. The\ncode is publicly available at: https://github.com/klauscc/DAM\n","authors":["Feng Cheng","Ziyang Wang","Yi-Lin Sung","Yan-Bo Lin","Mohit Bansal","Gedas Bertasius"],"pdf_url":"https://arxiv.org/pdf/2403.08755v2.pdf","comment":"The first two authors contribute equally"},{"id":"http://arxiv.org/abs/2404.14533v1","updated":"2024-04-22T19:01:18Z","published":"2024-04-22T19:01:18Z","title":"SwinFuSR: an image fusion-inspired model for RGB-guided thermal image\n super-resolution","summary":" Thermal imaging plays a crucial role in various applications, but the\ninherent low resolution of commonly available infrared (IR) cameras limits its\neffectiveness. Conventional super-resolution (SR) methods often struggle with\nthermal images due to their lack of high-frequency details. Guided SR leverages\ninformation from a high-resolution image, typically in the visible spectrum, to\nenhance the reconstruction of a high-res IR image from the low-res input.\nInspired by SwinFusion, we propose SwinFuSR, a guided SR architecture based on\nSwin transformers. In real world scenarios, however, the guiding modality (e.g.\nRBG image) may be missing, so we propose a training method that improves the\nrobustness of the model in this case. 
Our method has few parameters and\noutperforms state of the art models in terms of Peak Signal to Noise Ratio\n(PSNR) and Structural SIMilarity (SSIM). In Track 2 of the PBVS 2024 Thermal\nImage Super-Resolution Challenge, it achieves 3rd place in the PSNR metric. Our\ncode and pretained weights are available at\nhttps://github.com/VisionICLab/SwinFuSR.\n","authors":["Cyprien Arnold","Philippe Jouvet","Lama Seoud"],"pdf_url":"https://arxiv.org/pdf/2404.14533v1.pdf","comment":"Accepted at 20th IEEE Workshop on Perception Beyond the Visible\n Spectrum, CVPR 2024"},{"id":"http://arxiv.org/abs/2404.11593v2","updated":"2024-04-22T18:21:24Z","published":"2024-04-17T17:45:08Z","title":"IntrinsicAnything: Learning Diffusion Priors for Inverse Rendering Under\n Unknown Illumination","summary":" This paper aims to recover object materials from posed images captured under\nan unknown static lighting condition. Recent methods solve this task by\noptimizing material parameters through differentiable physically based\nrendering. However, due to the coupling between object geometry, materials, and\nenvironment lighting, there is inherent ambiguity during the inverse rendering\nprocess, preventing previous methods from obtaining accurate results. To\novercome this ill-posed problem, our key idea is to learn the material prior\nwith a generative model for regularizing the optimization process. We observe\nthat the general rendering equation can be split into diffuse and specular\nshading terms, and thus formulate the material prior as diffusion models of\nalbedo and specular. Thanks to this design, our model can be trained using the\nexisting abundant 3D object data, and naturally acts as a versatile tool to\nresolve the ambiguity when recovering material representations from RGB images.\nIn addition, we develop a coarse-to-fine training strategy that leverages\nestimated materials to guide diffusion models to satisfy multi-view consistent\nconstraints, leading to more stable and accurate results. Extensive experiments\non real-world and synthetic datasets demonstrate that our approach achieves\nstate-of-the-art performance on material recovery. The code will be available\nat https://zju3dv.github.io/IntrinsicAnything.\n","authors":["Xi Chen","Sida Peng","Dongchen Yang","Yuan Liu","Bowen Pan","Chengfei Lv","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.11593v2.pdf","comment":"Project page: https://zju3dv.github.io/IntrinsicAnything"},{"id":"http://arxiv.org/abs/2404.14507v1","updated":"2024-04-22T18:18:41Z","published":"2024-04-22T18:18:41Z","title":"Align Your Steps: Optimizing Sampling Schedules in Diffusion Models","summary":" Diffusion models (DMs) have established themselves as the state-of-the-art\ngenerative modeling approach in the visual domain and beyond. A crucial\ndrawback of DMs is their slow sampling speed, relying on many sequential\nfunction evaluations through large neural networks. Sampling from DMs can be\nseen as solving a differential equation through a discretized set of noise\nlevels known as the sampling schedule. While past works primarily focused on\nderiving efficient solvers, little attention has been given to finding optimal\nsampling schedules, and the entire literature relies on hand-crafted\nheuristics. In this work, for the first time, we propose a general and\nprincipled approach to optimizing the sampling schedules of DMs for\nhigh-quality outputs, called $\\textit{Align Your Steps}$. 
We leverage methods\nfrom stochastic calculus and find optimal schedules specific to different\nsolvers, trained DMs and datasets. We evaluate our novel approach on several\nimage, video as well as 2D toy data synthesis benchmarks, using a variety of\ndifferent samplers, and observe that our optimized schedules outperform\nprevious hand-crafted schedules in almost all experiments. Our method\ndemonstrates the untapped potential of sampling schedule optimization,\nespecially in the few-step synthesis regime.\n","authors":["Amirmojtaba Sabour","Sanja Fidler","Karsten Kreis"],"pdf_url":"https://arxiv.org/pdf/2404.14507v1.pdf","comment":"Project page:\n https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/"},{"id":"http://arxiv.org/abs/2404.14471v1","updated":"2024-04-22T17:55:07Z","published":"2024-04-22T17:55:07Z","title":"Narrative Action Evaluation with Prompt-Guided Multimodal Interaction","summary":" In this paper, we investigate a new problem called narrative action\nevaluation (NAE). NAE aims to generate professional commentary that evaluates\nthe execution of an action. Unlike traditional tasks such as score-based action\nquality assessment and video captioning involving superficial sentences, NAE\nfocuses on creating detailed narratives in natural language. These narratives\nprovide intricate descriptions of actions along with objective evaluations. NAE\nis a more challenging task because it requires both narrative flexibility and\nevaluation rigor. One existing possible solution is to use multi-task learning,\nwhere narrative language and evaluative information are predicted separately.\nHowever, this approach results in reduced performance for individual tasks\nbecause of variations between tasks and differences in modality between\nlanguage information and evaluation information. To address this, we propose a\nprompt-guided multimodal interaction framework. This framework utilizes a pair\nof transformers to facilitate the interaction between different modalities of\ninformation. It also uses prompts to transform the score regression task into a\nvideo-text matching task, thus enabling task interactivity. To support further\nresearch in this field, we re-annotate the MTL-AQA and FineGym datasets with\nhigh-quality and comprehensive action narration. Additionally, we establish\nbenchmarks for NAE. Extensive experiment results prove that our method\noutperforms separate learning methods and naive multi-task learning methods.\nData and code are released at\n\\href{https://github.com/shiyi-zh0408/NAE_CVPR2024 }{here}.\n","authors":["Shiyi Zhang","Sule Bai","Guangyi Chen","Lei Chen","Jiwen Lu","Junle Wang","Yansong Tang"],"pdf_url":"https://arxiv.org/pdf/2404.14471v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.13911v1","updated":"2024-04-22T06:43:18Z","published":"2024-04-22T06:43:18Z","title":"Global OpenBuildingMap -- Unveiling the Mystery of Global Buildings","summary":" Understanding how buildings are distributed globally is crucial to revealing\nthe human footprint on our home planet. This built environment affects local\nclimate, land surface albedo, resource distribution, and many other key factors\nthat influence well-being and human health. Despite this, quantitative and\ncomprehensive data on the distribution and properties of buildings worldwide is\nlacking. 
To this end, by using a big data analytics approach and nearly 800,000\nsatellite images, we generated the highest resolution and highest accuracy\nbuilding map ever created: the Global OpenBuildingMap (Global OBM). A joint\nanalysis of building maps and solar potentials indicates that rooftop solar\nenergy can supply the global energy consumption need at a reasonable cost.\nSpecifically, if solar panels were placed on the roofs of all buildings, they\ncould supply 1.1-3.3 times -- depending on the efficiency of the solar device\n-- the global energy consumption in 2020, which is the year with the highest\nconsumption on record. We also identified a clear geospatial correlation\nbetween building areas and key socioeconomic variables, which indicates our\nglobal building map can serve as an important input to modeling global\nsocioeconomic needs and drivers.\n","authors":["Xiao Xiang Zhu","Qingyu Li","Yilei Shi","Yuanyuan Wang","Adam Stewart","Jonathan Prexl"],"pdf_url":"https://arxiv.org/pdf/2404.13911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13904v1","updated":"2024-04-22T06:28:41Z","published":"2024-04-22T06:28:41Z","title":"Deep Regression Representation Learning with Topology","summary":" Most works studying representation learning focus only on classification and\nneglect regression. Yet, the learning objectives and therefore the\nrepresentation topologies of the two tasks are fundamentally different:\nclassification targets class separation, leading to disconnected\nrepresentations, whereas regression requires ordinality with respect to the\ntarget, leading to continuous representations. We thus wonder how the\neffectiveness of a regression representation is influenced by its topology,\nwith evaluation based on the Information Bottleneck (IB) principle.\n The IB principle is an important framework that provides principles for\nlearning effectiveness representations. We establish two connections between it\nand the topology of regression representations. The first connection reveals\nthat a lower intrinsic dimension of the feature space implies a reduced\ncomplexity of the representation Z. This complexity can be quantified as the\nconditional entropy of Z on the target space Y and serves as an upper bound on\nthe generalization error. The second connection suggests learning a feature\nspace that is topologically similar to the target space will better align with\nthe IB principle. Based on these two connections, we introduce PH-Reg, a\nregularizer specific to regression that matches the intrinsic dimension and\ntopology of the feature space with the target space. Experiments on synthetic\nand real-world regression tasks demonstrate the benefits of PH-Reg.\n","authors":["Shihao Zhang","kenji kawaguchi","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2404.13904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08513v4","updated":"2024-04-22T05:52:44Z","published":"2023-09-15T16:19:09Z","title":"SCT: A Simple Baseline for Parameter-Efficient Fine-Tuning via Salient\n Channels","summary":" Pre-trained vision transformers have strong representation benefits to\nvarious downstream tasks. Recently, many parameter-efficient fine-tuning (PEFT)\nmethods have been proposed, and their experiments demonstrate that tuning only\n1% of extra parameters could surpass full fine-tuning in low-data resource\nscenarios. However, these methods overlook the task-specific information when\nfine-tuning diverse downstream tasks. 
In this paper, we propose a simple yet\neffective method called \"Salient Channel Tuning\" (SCT) to leverage the\ntask-specific information by forwarding the model with the task images to\nselect partial channels in a feature map that enables us to tune only 1/8\nchannels leading to significantly lower parameter costs. Experiments outperform\nfull fine-tuning on 18 out of 19 tasks in the VTAB-1K benchmark by adding only\n0.11M parameters of the ViT-B, which is 780x fewer than its full fine-tuning\ncounterpart. Furthermore, experiments on domain generalization and few-shot\nlearning surpass other PEFT methods with lower parameter costs, demonstrating\nour proposed tuning technique's strong capability and effectiveness in the\nlow-data regime.\n","authors":["Henry Hengyuan Zhao","Pichao Wang","Yuyang Zhao","Hao Luo","Fan Wang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2309.08513v4.pdf","comment":"This work has been accepted by IJCV2023"},{"id":"http://arxiv.org/abs/2106.14490v5","updated":"2024-04-22T05:24:45Z","published":"2021-06-28T09:09:14Z","title":"Making Images Real Again: A Comprehensive Survey on Deep Image\n Composition","summary":" As a common image editing operation, image composition aims to combine the\nforeground from one image and another background image, resulting in a\ncomposite image. However, there are many issues that could make the composite\nimages unrealistic. These issues can be summarized as the inconsistency between\nforeground and background, which includes appearance inconsistency (e.g.,\nincompatible illumination), geometry inconsistency (e.g., unreasonable size),\nand semantic inconsistency (e.g., mismatched semantic context). Image\ncomposition task could be decomposed into multiple sub-tasks, in which each\nsub-task targets at one or more issues. Specifically, object placement aims to\nfind reasonable scale, location, and shape for the foreground. Image blending\naims to address the unnatural boundary between foreground and background. Image\nharmonization aims to adjust the illumination statistics of foreground. Shadow\ngeneration aims to generate plausible shadow for the foreground. These\nsub-tasks can be executed sequentially or parallelly to acquire realistic\ncomposite images. To the best of our knowledge, there is no previous survey on\nimage composition. In this paper, we conduct comprehensive survey over the\nsub-tasks and combinatorial task of image composition. For each one, we\nsummarize the existing methods, available datasets, and common evaluation\nmetrics. Datasets and codes for image composition are summarized at\nhttps://github.com/bcmi/Awesome-Image-Composition. 
We have also contributed the\nfirst image composition toolbox: libcom https://github.com/bcmi/libcom, which\nassembles 10+ image composition related functions (e.g., image blending, image\nharmonization, object placement, shadow generation, generative composition).\nThe ultimate goal of this toolbox is solving all the problems related to image\ncomposition with simple `import libcom'.\n","authors":["Li Niu","Wenyan Cong","Liu Liu","Yan Hong","Bo Zhang","Jing Liang","Liqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2106.14490v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13884v1","updated":"2024-04-22T05:12:11Z","published":"2024-04-22T05:12:11Z","title":"MambaUIE&SR: Unraveling the Ocean's Secrets with Only 2.8 FLOPs","summary":" Underwater Image Enhancement (UIE) techniques aim to address the problem of\nunderwater image degradation due to light absorption and scattering. In recent\nyears, both Convolution Neural Network (CNN)-based and Transformer-based\nmethods have been widely explored. In addition, combining CNN and Transformer\ncan effectively combine global and local information for enhancement. However,\nthis approach is still affected by the secondary complexity of the Transformer\nand cannot maximize the performance. Recently, the state-space model (SSM)\nbased architecture Mamba has been proposed, which excels in modeling long\ndistances while maintaining linear complexity. This paper explores the\npotential of this SSM-based model for UIE from both efficiency and\neffectiveness perspectives. However, the performance of directly applying Mamba\nis poor because local fine-grained features, which are crucial for image\nenhancement, cannot be fully utilized. Specifically, we customize the MambaUIE\narchitecture for efficient UIE. Specifically, we introduce visual state space\n(VSS) blocks to capture global contextual information at the macro level while\nmining local information at the micro level. Also, for these two kinds of\ninformation, we propose a Dynamic Interaction Block (DIB) and Spatial\nfeed-forward Network (SGFN) for intra-block feature aggregation. MambaUIE is\nable to efficiently synthesize global and local information and maintains a\nvery small number of parameters with high accuracy. Experiments on UIEB\ndatasets show that our method reduces GFLOPs by 67.4% (2.715G) relative to the\nSOTA method. To the best of our knowledge, this is the first UIE model\nconstructed based on SSM that breaks the limitation of FLOPs on accuracy in\nUIE. The official repository of MambaUIE at\nhttps://github.com/1024AILab/MambaUIE.\n","authors":["Zhihao Chen","Yiyuan Ge"],"pdf_url":"https://arxiv.org/pdf/2404.13884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15902v2","updated":"2024-04-22T05:10:57Z","published":"2024-01-29T06:06:45Z","title":"A Concise but High-performing Network for Image Guided Depth Completion\n in Autonomous Driving","summary":" Depth completion is a crucial task in autonomous driving, aiming to convert a\nsparse depth map into a dense depth prediction. Due to its potentially rich\nsemantic information, RGB image is commonly fused to enhance the completion\neffect. Image-guided depth completion involves three key challenges: 1) how to\neffectively fuse the two modalities; 2) how to better recover depth\ninformation; and 3) how to achieve real-time prediction for practical\nautonomous driving. 
To solve the above problems, we propose a concise but\neffective network, named CENet, to achieve high-performance depth completion\nwith a simple and elegant structure. Firstly, we use a fast guidance module to\nfuse the two sensor features, utilizing abundant auxiliary features extracted\nfrom the color space. Unlike other commonly used complicated guidance modules,\nour approach is intuitive and low-cost. In addition, we find and analyze the\noptimization inconsistency problem for observed and unobserved positions, and a\ndecoupled depth prediction head is proposed to alleviate the issue. The\nproposed decoupled head can better output the depth of valid and invalid\npositions with very few extra inference time. Based on the simple structure of\ndual-encoder and single-decoder, our CENet can achieve superior balance between\naccuracy and efficiency. In the KITTI depth completion benchmark, our CENet\nattains competitive performance and inference speed compared with the\nstate-of-the-art methods. To validate the generalization of our method, we also\nevaluate on indoor NYUv2 dataset, and our CENet still achieve impressive\nresults. The code of this work will be available at\nhttps://github.com/lmomoy/CHNet.\n","authors":["Moyun Liu","Bing Chen","Youping Chen","Jingming Xie","Lei Yao","Yang Zhang","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.15902v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13880v1","updated":"2024-04-22T05:07:02Z","published":"2024-04-22T05:07:02Z","title":"Regional Style and Color Transfer","summary":" This paper presents a novel contribution to the field of regional style\ntransfer. Existing methods often suffer from the drawback of applying style\nhomogeneously across the entire image, leading to stylistic inconsistencies or\nforeground object twisted when applied to image with foreground elements such\nas person figures. To address this limitation, we propose a new approach that\nleverages a segmentation network to precisely isolate foreground objects within\nthe input image. Subsequently, style transfer is applied exclusively to the\nbackground region. The isolated foreground objects are then carefully\nreintegrated into the style-transferred background. To enhance the visual\ncoherence between foreground and background, a color transfer step is employed\non the foreground elements prior to their rein-corporation. Finally, we utilize\nfeathering techniques to achieve a seamless amalgamation of foreground and\nbackground, resulting in a visually unified and aesthetically pleasing final\ncomposition. Extensive evaluations demonstrate that our proposed approach\nyields significantly more natural stylistic transformations compared to\nconventional methods.\n","authors":["Zhicheng Ding","Panfeng Li","Qikai Yang","Xinyu Shen","Siyang Li","Qingtian Gong"],"pdf_url":"https://arxiv.org/pdf/2404.13880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13874v1","updated":"2024-04-22T04:49:22Z","published":"2024-04-22T04:49:22Z","title":"VALOR-EVAL: Holistic Coverage and Faithfulness Evaluation of Large\n Vision-Language Models","summary":" Large Vision-Language Models (LVLMs) suffer from hallucination issues,\nwherein the models generate plausible-sounding but factually incorrect outputs,\nundermining their reliability. A comprehensive quantitative evaluation is\nnecessary to identify and understand the extent of hallucinations in these\nmodels. However, existing benchmarks are often limited in scope, focusing\nmainly on object hallucinations. 
Furthermore, current evaluation methods\nstruggle to effectively address the subtle semantic distinctions between model\noutputs and reference data, as well as the balance between hallucination and\ninformativeness. To address these issues, we introduce a multi-dimensional\nbenchmark covering objects, attributes, and relations, with challenging images\nselected based on associative biases. Moreover, we propose an large language\nmodel (LLM)-based two-stage evaluation framework that generalizes the popular\nCHAIR metric and incorporates both faithfulness and coverage into the\nevaluation. Experiments on 10 established LVLMs demonstrate that our evaluation\nmetric is more comprehensive and better correlated with humans than existing\nwork when evaluating on our challenging human annotated benchmark dataset. Our\nwork also highlights the critical balance between faithfulness and coverage of\nmodel outputs, and encourages future works to address hallucinations in LVLMs\nwhile keeping their outputs informative.\n","authors":["Haoyi Qiu","Wenbo Hu","Zi-Yi Dou","Nanyun Peng"],"pdf_url":"https://arxiv.org/pdf/2404.13874v1.pdf","comment":"Work in process"},{"id":"http://arxiv.org/abs/2404.13873v1","updated":"2024-04-22T04:47:52Z","published":"2024-04-22T04:47:52Z","title":"Texture-aware and Shape-guided Transformer for Sequential DeepFake\n Detection","summary":" Sequential DeepFake detection is an emerging task that aims to predict the\nmanipulation sequence in order. Existing methods typically formulate it as an\nimage-to-sequence problem, employing conventional Transformer architectures for\ndetection. However, these methods lack dedicated design and consequently result\nin limited performance. In this paper, we propose a novel Texture-aware and\nShape-guided Transformer to enhance detection performance. Our method features\nfour major improvements. Firstly, we describe a texture-aware branch that\neffectively captures subtle manipulation traces with the Diversiform Pixel\nDifference Attention module. Then we introduce a Bidirectional Interaction\nCross-attention module that seeks deep correlations among spatial and\nsequential features, enabling effective modeling of complex manipulation\ntraces. To further enhance the cross-attention, we describe a Shape-guided\nGaussian mapping strategy, providing initial priors of the manipulation shape.\nFinally, observing that the latter manipulation in a sequence may influence\ntraces left in the earlier one, we intriguingly invert the prediction order\nfrom forward to backward, leading to notable gains as expected. Extensive\nexperimental results demonstrate that our method outperforms others by a large\nmargin, highlighting the superiority of our method.\n","authors":["Yunfei Li","Jiaran Zhou","Xin Wang","Junyu Dong","Yuezun Li"],"pdf_url":"https://arxiv.org/pdf/2404.13873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13872v1","updated":"2024-04-22T04:41:42Z","published":"2024-04-22T04:41:42Z","title":"FreqBlender: Enhancing DeepFake Detection by Blending Frequency\n Knowledge","summary":" Generating synthetic fake faces, known as pseudo-fake faces, is an effective\nway to improve the generalization of DeepFake detection. Existing methods\ntypically generate these faces by blending real or fake faces in color space.\nWhile these methods have shown promise, they overlook the simulation of\nfrequency distribution in pseudo-fake faces, limiting the learning of generic\nforgery traces in-depth. 
To address this, this paper introduces {\\em\nFreqBlender}, a new method that can generate pseudo-fake faces by blending\nfrequency knowledge. Specifically, we investigate the major frequency\ncomponents and propose a Frequency Parsing Network to adaptively partition\nfrequency components related to forgery traces. Then we blend this frequency\nknowledge from fake faces into real faces to generate pseudo-fake faces. Since\nthere is no ground truth for frequency components, we describe a dedicated\ntraining strategy by leveraging the inner correlations among different\nfrequency knowledge to instruct the learning process. Experimental results\ndemonstrate the effectiveness of our method in enhancing DeepFake detection,\nmaking it a potential plug-and-play strategy for other methods.\n","authors":["Hanzhe Li","Jiaran Zhou","Bin Li","Junyu Dong","Yuezun Li"],"pdf_url":"https://arxiv.org/pdf/2404.13872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13868v1","updated":"2024-04-22T04:33:40Z","published":"2024-04-22T04:33:40Z","title":"TeamTrack: A Dataset for Multi-Sport Multi-Object Tracking in Full-pitch\n Videos","summary":" Multi-object tracking (MOT) is a critical and challenging task in computer\nvision, particularly in situations involving objects with similar appearances\nbut diverse movements, as seen in team sports. Current methods, largely reliant\non object detection and appearance, often fail to track targets in such complex\nscenarios accurately. This limitation is further exacerbated by the lack of\ncomprehensive and diverse datasets covering the full view of sports pitches.\nAddressing these issues, we introduce TeamTrack, a pioneering benchmark dataset\nspecifically designed for MOT in sports. TeamTrack is an extensive collection\nof full-pitch video data from various sports, including soccer, basketball, and\nhandball. Furthermore, we perform a comprehensive analysis and benchmarking\neffort to underscore TeamTrack's utility and potential impact. Our work\nsignifies a crucial step forward, promising to elevate the precision and\neffectiveness of MOT in complex, dynamic settings such as team sports. The\ndataset, project code and competition is released at:\nhttps://atomscott.github.io/TeamTrack/.\n","authors":["Atom Scott","Ikuma Uchida","Ning Ding","Rikuhei Umemoto","Rory Bunker","Ren Kobayashi","Takeshi Koyama","Masaki Onishi","Yoshinari Kameda","Keisuke Fujii"],"pdf_url":"https://arxiv.org/pdf/2404.13868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13866v1","updated":"2024-04-22T04:31:09Z","published":"2024-04-22T04:31:09Z","title":"Plug-and-Play Algorithm Convergence Analysis From The Standpoint of\n Stochastic Differential Equation","summary":" The Plug-and-Play (PnP) algorithm is popular for inverse image\nproblem-solving. However, this algorithm lacks theoretical analysis of its\nconvergence with more advanced plug-in denoisers. We demonstrate that discrete\nPnP iteration can be described by a continuous stochastic differential equation\n(SDE). We can also achieve this transformation through Markov process\nformulation of PnP. Then, we can take a higher standpoint of PnP algorithms\nfrom stochastic differential equations, and give a unified framework for the\nconvergence property of PnP according to the solvability condition of its\ncorresponding SDE. 
We reveal that a much weaker condition, bounded denoiser\nwith Lipschitz continuous measurement function would be enough for its\nconvergence guarantee, instead of previous Lipschitz continuous denoiser\ncondition.\n","authors":["Zhongqi Wang","Bingnan Wang","Maosheng Xiang"],"pdf_url":"https://arxiv.org/pdf/2404.13866v1.pdf","comment":"17pages, Preprint, Under review"},{"id":"http://arxiv.org/abs/2404.13863v1","updated":"2024-04-22T04:25:02Z","published":"2024-04-22T04:25:02Z","title":"PM-VIS: High-Performance Box-Supervised Video Instance Segmentation","summary":" Labeling pixel-wise object masks in videos is a resource-intensive and\nlaborious process. Box-supervised Video Instance Segmentation (VIS) methods\nhave emerged as a viable solution to mitigate the labor-intensive annotation\nprocess. . In practical applications, the two-step approach is not only more\nflexible but also exhibits a higher recognition accuracy. Inspired by the\nrecent success of Segment Anything Model (SAM), we introduce a novel approach\nthat aims at harnessing instance box annotations from multiple perspectives to\ngenerate high-quality instance pseudo masks, thus enriching the information\ncontained in instance annotations. We leverage ground-truth boxes to create\nthree types of pseudo masks using the HQ-SAM model, the box-supervised VIS\nmodel (IDOL-BoxInst), and the VOS model (DeAOT) separately, along with three\ncorresponding optimization mechanisms. Additionally, we introduce two\nground-truth data filtering methods, assisted by high-quality pseudo masks, to\nfurther enhance the training dataset quality and improve the performance of\nfully supervised VIS methods. To fully capitalize on the obtained high-quality\nPseudo Masks, we introduce a novel algorithm, PM-VIS, to integrate mask losses\ninto IDOL-BoxInst. Our PM-VIS model, trained with high-quality pseudo mask\nannotations, demonstrates strong ability in instance mask prediction, achieving\nstate-of-the-art performance on the YouTube-VIS 2019, YouTube-VIS 2021, and\nOVIS validation sets, notably narrowing the gap between box-supervised and\nfully supervised VIS methods.\n","authors":["Zhangjing Yang","Dun Liu","Wensheng Cheng","Jinqiao Wang","Yi Wu"],"pdf_url":"https://arxiv.org/pdf/2404.13863v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13862v1","updated":"2024-04-22T04:22:30Z","published":"2024-04-22T04:22:30Z","title":"PGAHum: Prior-Guided Geometry and Appearance Learning for High-Fidelity\n Animatable Human Reconstruction","summary":" Recent techniques on implicit geometry representation learning and neural\nrendering have shown promising results for 3D clothed human reconstruction from\nsparse video inputs. However, it is still challenging to reconstruct detailed\nsurface geometry and even more difficult to synthesize photorealistic novel\nviews with animated human poses. In this work, we introduce PGAHum, a\nprior-guided geometry and appearance learning framework for high-fidelity\nanimatable human reconstruction. We thoroughly exploit 3D human priors in three\nkey modules of PGAHum to achieve high-quality geometry reconstruction with\nintricate details and photorealistic view synthesis on unseen poses. First, a\nprior-based implicit geometry representation of 3D human, which contains a\ndelta SDF predicted by a tri-plane network and a base SDF derived from the\nprior SMPL model, is proposed to model the surface details and the body shape\nin a disentangled manner. 
Second, we introduce a novel prior-guided sampling\nstrategy that fully leverages the prior information of the human pose and body\nto sample the query points within or near the body surface. By avoiding\nunnecessary learning in the empty 3D space, the neural rendering can recover\nmore appearance details. Last, we propose a novel iterative backward\ndeformation strategy to progressively find the correspondence for the query\npoint in observation space. A skinning weights prediction model is learned\nbased on the prior provided by the SMPL model to achieve the iterative backward\nLBS deformation. Extensive quantitative and qualitative comparisons on various\ndatasets are conducted and the results demonstrate the superiority of our\nframework. Ablation studies also verify the effectiveness of each scheme for\ngeometry and appearance learning.\n","authors":["Hao Wang","Qingshan Xu","Hongyuan Chen","Rui Ma"],"pdf_url":"https://arxiv.org/pdf/2404.13862v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13859v1","updated":"2024-04-22T04:16:40Z","published":"2024-04-22T04:16:40Z","title":"Unveiling and Mitigating Generalized Biases of DNNs through the\n Intrinsic Dimensions of Perceptual Manifolds","summary":" Building fair deep neural networks (DNNs) is a crucial step towards achieving\ntrustworthy artificial intelligence. Delving into deeper factors that affect\nthe fairness of DNNs is paramount and serves as the foundation for mitigating\nmodel biases. However, current methods are limited in accurately predicting DNN\nbiases, relying solely on the number of training samples and lacking more\nprecise measurement tools. Here, we establish a geometric perspective for\nanalyzing the fairness of DNNs, comprehensively exploring how DNNs internally\nshape the intrinsic geometric characteristics of datasets-the intrinsic\ndimensions (IDs) of perceptual manifolds, and the impact of IDs on the fairness\nof DNNs. Based on multiple findings, we propose Intrinsic Dimension\nRegularization (IDR), which enhances the fairness and performance of models by\npromoting the learning of concise and ID-balanced class perceptual manifolds.\nIn various image recognition benchmark tests, IDR significantly mitigates model\nbias while improving its performance.\n","authors":["Yanbiao Ma","Licheng Jiao","Fang Liu","Lingling Li","Wenping Ma","Shuyuan Yang","Xu Liu","Puhua Chen"],"pdf_url":"https://arxiv.org/pdf/2404.13859v1.pdf","comment":"8pages, 6figures, Submitted to TPAMI"},{"id":"http://arxiv.org/abs/2403.01693v2","updated":"2024-04-22T03:53:51Z","published":"2024-03-04T03:00:22Z","title":"HanDiffuser: Text-to-Image Generation With Realistic Hand Appearances","summary":" Text-to-image generative models can generate high-quality humans, but realism\nis lost when generating hands. Common artifacts include irregular hand poses,\nshapes, incorrect numbers of fingers, and physically implausible finger\norientations. To generate images with realistic hands, we propose a novel\ndiffusion-based architecture called HanDiffuser that achieves realism by\ninjecting hand embeddings in the generative process. HanDiffuser consists of\ntwo components: a Text-to-Hand-Params diffusion model to generate SMPL-Body and\nMANO-Hand parameters from input text prompts, and a Text-Guided\nHand-Params-to-Image diffusion model to synthesize images by conditioning on\nthe prompts and hand parameters generated by the previous component. 
We\nincorporate multiple aspects of hand representation, including 3D shapes and\njoint-level finger positions, orientations and articulations, for robust\nlearning and reliable performance during inference. We conduct extensive\nquantitative and qualitative experiments and perform user studies to\ndemonstrate the efficacy of our method in generating images with high-quality\nhands.\n","authors":["Supreeth Narasimhaswamy","Uttaran Bhattacharya","Xiang Chen","Ishita Dasgupta","Saayan Mitra","Minh Hoai"],"pdf_url":"https://arxiv.org/pdf/2403.01693v2.pdf","comment":"Revisions: 1. Added a link to project page in the abstract, 2.\n Updated references and related work, 3. Fixed some grammatical errors"},{"id":"http://arxiv.org/abs/2404.13854v1","updated":"2024-04-22T03:39:03Z","published":"2024-04-22T03:39:03Z","title":"Self-Supervised Monocular Depth Estimation in the Dark: Towards Data\n Distribution Compensation","summary":" Nighttime self-supervised monocular depth estimation has received increasing\nattention in recent years. However, using night images for self-supervision is\nunreliable because the photometric consistency assumption is usually violated\nin the videos taken under complex lighting conditions. Even with domain\nadaptation or photometric loss repair, performance is still limited by the poor\nsupervision of night images on trainable networks. In this paper, we propose a\nself-supervised nighttime monocular depth estimation method that does not use\nany night images during training. Our framework utilizes day images as a stable\nsource for self-supervision and applies physical priors (e.g., wave optics,\nreflection model and read-shot noise model) to compensate for some key\nday-night differences. With day-to-night data distribution compensation, our\nframework can be trained in an efficient one-stage self-supervised manner.\nThough no nighttime images are considered during training, qualitative and\nquantitative results demonstrate that our method achieves SoTA depth estimating\nresults on the challenging nuScenes-Night and RobotCar-Night compared with\nexisting methods.\n","authors":["Haolin Yang","Chaoqiang Zhao","Lu Sheng","Yang Tang"],"pdf_url":"https://arxiv.org/pdf/2404.13854v1.pdf","comment":"Accepted by IJCAI2024"},{"id":"http://arxiv.org/abs/2311.15145v3","updated":"2024-04-22T03:32:18Z","published":"2023-11-26T00:06:12Z","title":"Choosing Wisely and Learning Deeply: Selective Cross-Modality\n Distillation via CLIP for Domain Generalization","summary":" Domain Generalization (DG), a crucial research area, seeks to train models\nacross multiple domains and test them on unseen ones. In this paper, we\nintroduce a novel approach, namely, Selective Cross-Modality Distillation for\nDomain Generalization (SCMD). SCMD leverages the capabilities of large\nvision-language models, specifically CLIP, to train a more efficient model,\nensuring it acquires robust generalization capabilities across unseen domains.\nOur primary contribution is a unique selection framework strategically designed\nto identify hard-to-learn samples for distillation. In parallel, we introduce a\nnovel cross-modality module that seamlessly combines the projected features of\nthe student model with the text embeddings from CLIP, ensuring the alignment of\nsimilarity distributions. We assess SCMD's performance on various benchmarks,\nwhere it empowers a ResNet50 to deliver state-of-the-art performance,\nsurpassing existing domain generalization methods. 
Furthermore, we provide a\ntheoretical analysis of our selection strategy, offering deeper insight into\nits effectiveness and potential in the field of DG.\n","authors":["Jixuan Leng","Yijiang Li","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15145v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13848v1","updated":"2024-04-22T03:15:42Z","published":"2024-04-22T03:15:42Z","title":"DSDRNet: Disentangling Representation and Reconstruct Network for Domain\n Generalization","summary":" Domain generalization faces challenges due to the distribution shift between\ntraining and testing sets, and the presence of unseen target domains. Common\nsolutions include domain alignment, meta-learning, data augmentation, or\nensemble learning, all of which rely on domain labels or domain adversarial\ntechniques. In this paper, we propose a Dual-Stream Separation and\nReconstruction Network, dubbed DSDRNet. It is a disentanglement-reconstruction\napproach that integrates features of both inter-instance and intra-instance\nthrough dual-stream fusion. The method introduces novel supervised signals by\ncombining inter-instance semantic distance and intra-instance similarity.\nIncorporating Adaptive Instance Normalization (AdaIN) into a two-stage cyclic\nreconstruction process enhances self-disentangled reconstruction signals to\nfacilitate model convergence. Extensive experiments on four benchmark datasets\ndemonstrate that DSDRNet outperforms other popular methods in terms of domain\ngeneralization capabilities.\n","authors":["Juncheng Yang","Zuchao Li","Shuai Xie","Wei Yu","Shijun Li"],"pdf_url":"https://arxiv.org/pdf/2404.13848v1.pdf","comment":"This paper is accepted to IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.13847v1","updated":"2024-04-22T03:05:32Z","published":"2024-04-22T03:05:32Z","title":"EventLens: Leveraging Event-Aware Pretraining and Cross-modal Linking\n Enhances Visual Commonsense Reasoning","summary":" Visual Commonsense Reasoning (VCR) is a cognitive task, challenging models to\nanswer visual questions requiring human commonsense, and to provide rationales\nexplaining why the answers are correct. With emergence of Large Language Models\n(LLMs), it is natural and imperative to explore their applicability to VCR.\nHowever, VCR task demands more external knowledge to tackle its challenging\nquestions, necessitating special designs to activate LLMs' commonsense\nreasoning abilities. Also, most existing Multimodal LLMs adopted an abstraction\nof entire input image, which makes it difficult to comprehend VCR's unique\nco-reference tags between image regions and text, posing challenges for\nfine-grained alignment. To address these issues, we propose EventLens that\nleverages Event-Aware Pretraining and Cross-modal Linking and EnhanceS VCR.\nFirst, by emulating the cognitive process of human reasoning, an Event-Aware\nPretraining auxiliary task is introduced to better activate LLM's global\ncomprehension of intricate scenarios. Second, during fine-tuning, we further\nutilize reference tags to bridge RoI features with texts, while preserving both\nmodality semantics. Finally, we use instruct-style prompts to narrow the gap\nbetween pretraining and fine-tuning, and task-specific adapters to better\nintegrate LLM's inherent knowledge with new commonsense. 
Experimental results\nshow the effectiveness of our proposed auxiliary task and fine-grained linking\nstrategy.\n","authors":["Mingjie Ma","Zhihuan Yu","Yichao Ma","Guohui Li"],"pdf_url":"https://arxiv.org/pdf/2404.13847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11202v2","updated":"2024-04-22T02:46:44Z","published":"2024-04-17T09:33:31Z","title":"GhostNetV3: Exploring the Training Strategies for Compact Models","summary":" Compact neural networks are specially designed for applications on edge\ndevices with faster inference speed yet modest performance. However, training\nstrategies of compact models are borrowed from that of conventional models at\npresent, which ignores their difference in model capacity and thus may impede\nthe performance of compact models. In this paper, by systematically\ninvestigating the impact of different training ingredients, we introduce a\nstrong training strategy for compact models. We find that the appropriate\ndesigns of re-parameterization and knowledge distillation are crucial for\ntraining high-performance compact models, while some commonly used data\naugmentations for training conventional models, such as Mixup and CutMix, lead\nto worse performance. Our experiments on ImageNet-1K dataset demonstrate that\nour specialized training strategy for compact models is applicable to various\narchitectures, including GhostNetV2, MobileNetV2 and ShuffleNetV2.\nSpecifically, equipped with our strategy, GhostNetV3 1.3$\\times$ achieves a\ntop-1 accuracy of 79.1% with only 269M FLOPs and a latency of 14.46ms on mobile\ndevices, surpassing its ordinarily trained counterpart by a large margin.\nMoreover, our observation can also be extended to object detection scenarios.\nPyTorch code and checkpoints can be found at\nhttps://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/ghostnetv3_pytorch.\n","authors":["Zhenhua Liu","Zhiwei Hao","Kai Han","Yehui Tang","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11202v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13842v1","updated":"2024-04-22T02:42:32Z","published":"2024-04-22T02:42:32Z","title":"On Support Relations Inference and Scene Hierarchy Graph Construction\n from Point Cloud in Clustered Environments","summary":" Over the years, scene understanding has attracted a growing interest in\ncomputer vision, providing the semantic and physical scene information\nnecessary for robots to complete some particular tasks autonomously. In 3D\nscenes, rich spatial geometric and topological information are often ignored by\nRGB-based approaches for scene understanding. In this study, we develop a\nbottom-up approach for scene understanding that infers support relations\nbetween objects from a point cloud. Our approach utilizes the spatial topology\ninformation of the plane pairs in the scene, consisting of three major steps.\n1) Detection of pairwise spatial configuration: dividing primitive pairs into\nlocal support connection and local inner connection; 2) primitive\nclassification: a combinatorial optimization method applied to classify\nprimitives; and 3) support relations inference and hierarchy graph\nconstruction: bottom-up support relations inference and scene hierarchy graph\nconstruction containing primitive level and object level. Through experiments,\nwe demonstrate that the algorithm achieves excellent performance in primitive\nclassification and support relations inference. 
Additionally, we show that the\nscene hierarchy graph contains rich geometric and topological information of\nobjects, and it possesses great scalability for scene understanding.\n","authors":["Gang Ma","Hui Wei"],"pdf_url":"https://arxiv.org/pdf/2404.13842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13838v1","updated":"2024-04-22T02:34:50Z","published":"2024-04-22T02:34:50Z","title":"C2F-SemiCD: A Coarse-to-Fine Semi-Supervised Change Detection Method\n Based on Consistency Regularization in High-Resolution Remote Sensing Images","summary":" A high-precision feature extraction model is crucial for change detection\n(CD). In the past, many deep learning-based supervised CD methods learned to\nrecognize change feature patterns from a large number of labelled bi-temporal\nimages, whereas labelling bi-temporal remote sensing images is very expensive\nand often time-consuming; therefore, we propose a coarse-to-fine\nsemi-supervised CD method based on consistency regularization (C2F-SemiCD),\nwhich includes a coarse-to-fine CD network with a multiscale attention\nmechanism (C2FNet) and a semi-supervised update method. Among them, the C2FNet\nnetwork gradually completes the extraction of change features from\ncoarse-grained to fine-grained through multiscale feature fusion, channel\nattention mechanism, spatial attention mechanism, global context module,\nfeature refine module, initial aggregation module, and final aggregation\nmodule. The semi-supervised update method uses the mean teacher method. The\nparameters of the student model are updated to the parameters of the teacher\nModel by using the exponential moving average (EMA) method. Through extensive\nexperiments on three datasets and meticulous ablation studies, including\ncrossover experiments across datasets, we verify the significant effectiveness\nand efficiency of the proposed C2F-SemiCD method. The code will be open at:\nhttps://github.com/ChengxiHAN/C2F-SemiCDand-C2FNet.\n","authors":["Chengxi Han","Chen Wu","Meiqi Hu","Jiepan Li","Hongruixuan Chen"],"pdf_url":"https://arxiv.org/pdf/2404.13838v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06973v2","updated":"2024-04-22T02:13:32Z","published":"2024-03-11T17:55:53Z","title":"Bayesian Diffusion Models for 3D Shape Reconstruction","summary":" We present Bayesian Diffusion Models (BDM), a prediction algorithm that\nperforms effective Bayesian inference by tightly coupling the top-down (prior)\ninformation with the bottom-up (data-driven) procedure via joint diffusion\nprocesses. We show the effectiveness of BDM on the 3D shape reconstruction\ntask. Compared to prototypical deep learning data-driven approaches trained on\npaired (supervised) data-labels (e.g. image-point clouds) datasets, our BDM\nbrings in rich prior information from standalone labels (e.g. point clouds) to\nimprove the bottom-up 3D reconstruction. As opposed to the standard Bayesian\nframeworks where explicit prior and likelihood are required for the inference,\nBDM performs seamless information fusion via coupled diffusion processes with\nlearned gradient computation networks. The specialty of our BDM lies in its\ncapability to engage the active and effective information exchange and fusion\nof the top-down and bottom-up processes where each itself is a diffusion\nprocess. 
We demonstrate state-of-the-art results on both synthetic and\nreal-world benchmarks for 3D shape reconstruction.\n","authors":["Haiyang Xu","Yu Lei","Zeyuan Chen","Xiang Zhang","Yue Zhao","Yilin Wang","Zhuowen Tu"],"pdf_url":"https://arxiv.org/pdf/2403.06973v2.pdf","comment":"Accepted to CVPR 2024; Project Page: https://mlpc-ucsd.github.io/BDM/"},{"id":"http://arxiv.org/abs/2404.13830v1","updated":"2024-04-22T02:05:15Z","published":"2024-04-22T02:05:15Z","title":"A Comprehensive Survey and Taxonomy on Point Cloud Registration Based on\n Deep Learning","summary":" Point cloud registration (PCR) involves determining a rigid transformation\nthat aligns one point cloud to another. Despite the plethora of outstanding\ndeep learning (DL)-based registration methods proposed, comprehensive and\nsystematic studies on DL-based PCR techniques are still lacking. In this paper,\nwe present a comprehensive survey and taxonomy of recently proposed PCR\nmethods. Firstly, we conduct a taxonomy of commonly utilized datasets and\nevaluation metrics. Secondly, we classify the existing research into two main\ncategories: supervised and unsupervised registration, providing insights into\nthe core concepts of various influential PCR models. Finally, we highlight open\nchallenges and potential directions for future research. A curated collection\nof valuable resources is made available at https://github.com/yxzhang15/PCR.\n","authors":["Yu-Xin Zhang","Jie Gui","Xiaofeng Cong","Xin Gong","Wenbing Tao"],"pdf_url":"https://arxiv.org/pdf/2404.13830v1.pdf","comment":"This paper is accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2310.19540v2","updated":"2024-04-22T02:03:02Z","published":"2023-10-30T13:47:46Z","title":"IterInv: Iterative Inversion for Pixel-Level T2I Models","summary":" Large-scale text-to-image diffusion models have been a ground-breaking\ndevelopment in generating convincing images following an input text prompt. The\ngoal of image editing research is to give users control over the generated\nimages by modifying the text prompt. Current image editing techniques\npredominantly hinge on DDIM inversion as a prevalent practice rooted in Latent\nDiffusion Models (LDM). However, the large pretrained T2I models working on the\nlatent space suffer from losing details due to the first compression stage with\nan autoencoder mechanism. Instead, other mainstream T2I pipeline working on the\npixel level, such as Imagen and DeepFloyd-IF, circumvents the above problem.\nThey are commonly composed of multiple stages, typically starting with a\ntext-to-image stage and followed by several super-resolution stages. In this\npipeline, the DDIM inversion fails to find the initial noise and generate the\noriginal image given that the super-resolution diffusion models are not\ncompatible with the DDIM technique. According to our experimental findings,\niteratively concatenating the noisy image as the condition is the root of this\nproblem. Based on this observation, we develop an iterative inversion (IterInv)\ntechnique for this category of T2I models and verify IterInv with the\nopen-source DeepFloyd-IF model.Specifically, IterInv employ NTI as the\ninversion and reconstruction of low-resolution image generation. In stages 2\nand 3, we update the latent variance at each timestep to find the deterministic\ninversion trace and promote the reconstruction process. By combining our method\nwith a popular image editing method, we prove the application prospects of\nIterInv. The code will be released upon acceptance. 
The code is available at\n\\url{https://github.com/Tchuanm/IterInv.git}.\n","authors":["Chuanming Tang","Kai Wang","Joost van de Weijer"],"pdf_url":"https://arxiv.org/pdf/2310.19540v2.pdf","comment":"Accepted paper at ICME 2024"},{"id":"http://arxiv.org/abs/2404.13827v1","updated":"2024-04-22T01:59:48Z","published":"2024-04-22T01:59:48Z","title":"Swap It Like Its Hot: Segmentation-based spoof attacks on eye-tracking\n images","summary":" Video-based eye trackers capture the iris biometric and enable authentication\nto secure user identity. However, biometric authentication is susceptible to\nspoofing another user's identity through physical or digital manipulation. The\ncurrent standard to identify physical spoofing attacks on eye-tracking sensors\nuses liveness detection. Liveness detection classifies gaze data as real or\nfake, which is sufficient to detect physical presentation attacks. However,\nsuch defenses cannot detect a spoofing attack when real eye image inputs are\ndigitally manipulated to swap the iris pattern of another person. We propose\nIrisSwap as a novel attack on gaze-based liveness detection. IrisSwap allows\nattackers to segment and digitally swap in a victim's iris pattern to fool iris\nauthentication. Both offline and online attacks produce gaze data that deceives\nthe current state-of-the-art defense models at rates up to 58% and motivates\nthe need to develop more advanced authentication methods for eye trackers.\n","authors":["Anish S. Narkar","Brendan David-John"],"pdf_url":"https://arxiv.org/pdf/2404.13827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12621v4","updated":"2024-04-22T01:58:18Z","published":"2023-05-22T01:14:30Z","title":"DermSynth3D: Synthesis of in-the-wild Annotated Dermatology Images","summary":" In recent years, deep learning (DL) has shown great potential in the field of\ndermatological image analysis. However, existing datasets in this domain have\nsignificant limitations, including a small number of image samples, limited\ndisease conditions, insufficient annotations, and non-standardized image\nacquisitions. To address these shortcomings, we propose a novel framework\ncalled DermSynth3D. DermSynth3D blends skin disease patterns onto 3D textured\nmeshes of human subjects using a differentiable renderer and generates 2D\nimages from various camera viewpoints under chosen lighting conditions in\ndiverse background scenes. Our method adheres to top-down rules that constrain\nthe blending and rendering process to create 2D images with skin conditions\nthat mimic in-the-wild acquisitions, ensuring more meaningful results. The\nframework generates photo-realistic 2D dermoscopy images and the corresponding\ndense annotations for semantic segmentation of the skin, skin conditions, body\nparts, bounding boxes around lesions, depth maps, and other 3D scene\nparameters, such as camera position and lighting conditions. DermSynth3D allows\nfor the creation of custom datasets for various dermatology tasks. We\ndemonstrate the effectiveness of data generated using DermSynth3D by training\nDL models on synthetic data and evaluating them on various dermatology tasks\nusing real 2D dermatological images. 
We make our code publicly available at\nhttps://github.com/sfu-mial/DermSynth3D.\n","authors":["Ashish Sinha","Jeremy Kawahara","Arezou Pakzad","Kumar Abhishek","Matthieu Ruthven","Enjie Ghorbel","Anis Kacem","Djamila Aouada","Ghassan Hamarneh"],"pdf_url":"https://arxiv.org/pdf/2305.12621v4.pdf","comment":"Accepted to Medical Image Analysis (MedIA) 2024"},{"id":"http://arxiv.org/abs/2306.16927v2","updated":"2024-04-22T01:46:43Z","published":"2023-06-29T14:17:24Z","title":"End-to-end Autonomous Driving: Challenges and Frontiers","summary":" The autonomous driving community has witnessed a rapid growth in approaches\nthat embrace an end-to-end algorithm framework, utilizing raw sensor input to\ngenerate vehicle motion plans, instead of concentrating on individual tasks\nsuch as detection and motion prediction. End-to-end systems, in comparison to\nmodular pipelines, benefit from joint feature optimization for perception and\nplanning. This field has flourished due to the availability of large-scale\ndatasets, closed-loop evaluation, and the increasing need for autonomous\ndriving algorithms to perform effectively in challenging scenarios. In this\nsurvey, we provide a comprehensive analysis of more than 270 papers, covering\nthe motivation, roadmap, methodology, challenges, and future trends in\nend-to-end autonomous driving. We delve into several critical challenges,\nincluding multi-modality, interpretability, causal confusion, robustness, and\nworld models, amongst others. Additionally, we discuss current advancements in\nfoundation models and visual pre-training, as well as how to incorporate these\ntechniques within the end-to-end driving framework. we maintain an active\nrepository that contains up-to-date literature and open-source projects at\nhttps://github.com/OpenDriveLab/End-to-end-Autonomous-Driving.\n","authors":["Li Chen","Penghao Wu","Kashyap Chitta","Bernhard Jaeger","Andreas Geiger","Hongyang Li"],"pdf_url":"https://arxiv.org/pdf/2306.16927v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13819v1","updated":"2024-04-22T01:42:45Z","published":"2024-04-22T01:42:45Z","title":"HOIST-Former: Hand-held Objects Identification, Segmentation, and\n Tracking in the Wild","summary":" We address the challenging task of identifying, segmenting, and tracking\nhand-held objects, which is crucial for applications such as human action\nsegmentation and performance evaluation. This task is particularly challenging\ndue to heavy occlusion, rapid motion, and the transitory nature of objects\nbeing hand-held, where an object may be held, released, and subsequently picked\nup again. To tackle these challenges, we have developed a novel\ntransformer-based architecture called HOIST-Former. HOIST-Former is adept at\nspatially and temporally segmenting hands and objects by iteratively pooling\nfeatures from each other, ensuring that the processes of identification,\nsegmentation, and tracking of hand-held objects depend on the hands' positions\nand their contextual appearance. We further refine HOIST-Former with a contact\nloss that focuses on areas where hands are in contact with objects. Moreover,\nwe also contribute an in-the-wild video dataset called HOIST, which comprises\n4,125 videos complete with bounding boxes, segmentation masks, and tracking IDs\nfor hand-held objects. 
Through experiments on the HOIST dataset and two\nadditional public datasets, we demonstrate the efficacy of HOIST-Former in\nsegmenting and tracking hand-held objects.\n","authors":["Supreeth Narasimhaswamy","Huy Anh Nguyen","Lihan Huang","Minh Hoai"],"pdf_url":"https://arxiv.org/pdf/2404.13819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05321v4","updated":"2024-04-22T01:39:07Z","published":"2022-09-12T15:26:13Z","title":"Deep Feature Statistics Mapping for Generalized Screen Content Image\n Quality Assessment","summary":" The statistical regularities of natural images, referred to as natural scene\nstatistics, play an important role in no-reference image quality assessment.\nHowever, it has been widely acknowledged that screen content images (SCIs),\nwhich are typically computer generated, do not hold such statistics. Here we\nmake the first attempt to learn the statistics of SCIs, based upon which the\nquality of SCIs can be effectively determined. The underlying mechanism of the\nproposed approach is based upon the mild assumption that the SCIs, which are\nnot physically acquired, still obey certain statistics that could be understood\nin a learning fashion. We empirically show that the statistics deviation could\nbe effectively leveraged in quality assessment, and the proposed method is\nsuperior when evaluated in different settings. Extensive experimental results\ndemonstrate the Deep Feature Statistics based SCI Quality Assessment (DFSS-IQA)\nmodel delivers promising performance compared with existing NR-IQA models and\nshows a high generalization capability in the cross-dataset settings. The\nimplementation of our method is publicly available at\nhttps://github.com/Baoliang93/DFSS-IQA.\n","authors":["Baoliang Chen","Hanwei Zhu","Lingyu Zhu","Shiqi Wang","Sam Kwong"],"pdf_url":"https://arxiv.org/pdf/2209.05321v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13816v1","updated":"2024-04-22T01:36:50Z","published":"2024-04-22T01:36:50Z","title":"Neural Radiance Field in Autonomous Driving: A Survey","summary":" Neural Radiance Field (NeRF) has garnered significant attention from both\nacademia and industry due to its intrinsic advantages, particularly its\nimplicit representation and novel view synthesis capabilities. With the rapid\nadvancements in deep learning, a multitude of methods have emerged to explore\nthe potential applications of NeRF in the domain of Autonomous Driving (AD).\nHowever, a conspicuous void is apparent within the current literature. To\nbridge this gap, this paper conducts a comprehensive survey of NeRF's\napplications in the context of AD. Our survey is structured to categorize\nNeRF's applications in Autonomous Driving (AD), specifically encompassing\nperception, 3D reconstruction, simultaneous localization and mapping (SLAM),\nand simulation. We delve into in-depth analysis and summarize the findings for\neach application category, and conclude by providing insights and discussions\non future directions in this field. We hope this paper serves as a\ncomprehensive reference for researchers in this domain. 
To the best of our\nknowledge, this is the first survey specifically focused on the applications of\nNeRF in the Autonomous Driving domain.\n","authors":["Lei He","Leheng Li","Wenchao Sun","Zeyu Han","Yichen Liu","Sifa Zheng","Jianqiang Wang","Keqiang Li"],"pdf_url":"https://arxiv.org/pdf/2404.13816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12814v2","updated":"2024-04-22T01:14:11Z","published":"2024-04-19T11:49:01Z","title":"Generative Modelling with High-Order Langevin Dynamics","summary":" Diffusion generative modelling (DGM) based on stochastic differential\nequations (SDEs) with score matching has achieved unprecedented results in data\ngeneration. In this paper, we propose a novel fast high-quality generative\nmodelling method based on high-order Langevin dynamics (HOLD) with score\nmatching. This motive is proved by third-order Langevin dynamics. By augmenting\nthe previous SDEs, e.g. variance exploding or variance preserving SDEs for\nsingle-data variable processes, HOLD can simultaneously model position,\nvelocity, and acceleration, thereby improving the quality and speed of the data\ngeneration at the same time. HOLD is composed of one Ornstein-Uhlenbeck process\nand two Hamiltonians, which reduce the mixing time by two orders of magnitude.\nEmpirical experiments for unconditional image generation on the public data set\nCIFAR-10 and CelebA-HQ show that the effect is significant in both Frechet\ninception distance (FID) and negative log-likelihood, and achieves the\nstate-of-the-art FID of 1.85 on CIFAR-10.\n","authors":["Ziqiang Shi","Rujie Liu"],"pdf_url":"https://arxiv.org/pdf/2404.12814v2.pdf","comment":"Some of the results in this paper have been published or accepted at\n conferences such as wacv2024, icassp2024, and icme2024"},{"id":"http://arxiv.org/abs/2404.13807v1","updated":"2024-04-22T00:44:13Z","published":"2024-04-22T00:44:13Z","title":"FaceFolds: Meshed Radiance Manifolds for Efficient Volumetric Rendering\n of Dynamic Faces","summary":" 3D rendering of dynamic face captures is a challenging problem, and it\ndemands improvements on several fronts$\\unicode{x2014}$photorealism,\nefficiency, compatibility, and configurability. We present a novel\nrepresentation that enables high-quality volumetric rendering of an actor's\ndynamic facial performances with minimal compute and memory footprint. It runs\nnatively on commodity graphics soft- and hardware, and allows for a graceful\ntrade-off between quality and efficiency. Our method utilizes recent advances\nin neural rendering, particularly learning discrete radiance manifolds to\nsparsely sample the scene to model volumetric effects. We achieve efficient\nmodeling by learning a single set of manifolds for the entire dynamic sequence,\nwhile implicitly modeling appearance changes as temporal canonical texture. We\nexport a single layered mesh and view-independent RGBA texture video that is\ncompatible with legacy graphics renderers without additional ML integration. We\ndemonstrate our method by rendering dynamic face captures of real actors in a\ngame engine, at comparable photorealism to state-of-the-art neural rendering\ntechniques at previously unseen frame rates.\n","authors":["Safa C. Medin","Gengyan Li","Ruofei Du","Stephan Garbin","Philip Davidson","Gregory W. 
Wornell","Thabo Beeler","Abhimitra Meka"],"pdf_url":"https://arxiv.org/pdf/2404.13807v1.pdf","comment":"In Proceedings of the ACM in Computer Graphics and Interactive\n Techniques, 2024"},{"id":"http://arxiv.org/abs/2312.15320v2","updated":"2024-04-22T00:41:34Z","published":"2023-12-23T18:40:25Z","title":"GestaltMML: Enhancing Rare Genetic Disease Diagnosis through Multimodal\n Machine Learning Combining Facial Images and Clinical Texts","summary":" Individuals with suspected rare genetic disorders often undergo multiple\nclinical evaluations, imaging studies, laboratory tests and genetic tests, to\nfind a possible answer over a prolonged period of time. Addressing this\n\"diagnostic odyssey\" thus has substantial clinical, psychosocial, and economic\nbenefits. Many rare genetic diseases have distinctive facial features, which\ncan be used by artificial intelligence algorithms to facilitate clinical\ndiagnosis, in prioritizing candidate diseases to be further examined by lab\ntests or genetic assays, or in helping the phenotype-driven reinterpretation of\ngenome/exome sequencing data. Existing methods using frontal facial photos were\nbuilt on conventional Convolutional Neural Networks (CNNs), rely exclusively on\nfacial images, and cannot capture non-facial phenotypic traits and demographic\ninformation essential for guiding accurate diagnoses. Here we introduce\nGestaltMML, a multimodal machine learning (MML) approach solely based on the\nTransformer architecture. It integrates facial images, demographic information\n(age, sex, ethnicity), and clinical notes (optionally, a list of Human\nPhenotype Ontology terms) to improve prediction accuracy. Furthermore, we also\nevaluated GestaltMML on a diverse range of datasets, including 528 diseases\nfrom the GestaltMatcher Database, several in-house datasets of\nBeckwith-Wiedemann syndrome (BWS, over-growth syndrome with distinct facial\nfeatures), Sotos syndrome (overgrowth syndrome with overlapping features with\nBWS), NAA10-related neurodevelopmental syndrome, Cornelia de Lange syndrome\n(multiple malformation syndrome), and KBG syndrome (multiple malformation\nsyndrome). Our results suggest that GestaltMML effectively incorporates\nmultiple modalities of data, greatly narrowing candidate genetic diagnoses of\nrare diseases and may facilitate the reinterpretation of genome/exome\nsequencing data.\n","authors":["Da Wu","Jingye Yang","Cong Liu","Tzung-Chien Hsieh","Elaine Marchi","Justin Blair","Peter Krawitz","Chunhua Weng","Wendy Chung","Gholson J. Lyon","Ian D. Krantz","Jennifer M. Kalish","Kai Wang"],"pdf_url":"https://arxiv.org/pdf/2312.15320v2.pdf","comment":"Significant revisions"}]},"2024-04-23T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.15276v1","updated":"2024-04-23T17:59:59Z","published":"2024-04-23T17:59:59Z","title":"SMPLer: Taming Transformers for Monocular 3D Human Shape and Pose\n Estimation","summary":" Existing Transformers for monocular 3D human shape and pose estimation\ntypically have a quadratic computation and memory complexity with respect to\nthe feature length, which hinders the exploitation of fine-grained information\nin high-resolution features that is beneficial for accurate reconstruction. In\nthis work, we propose an SMPL-based Transformer framework (SMPLer) to address\nthis issue. 
SMPLer incorporates two key ingredients: a decoupled attention\noperation and an SMPL-based target representation, which allow effective\nutilization of high-resolution features in the Transformer. In addition, based\non these two designs, we also introduce several novel modules including a\nmulti-scale attention and a joint-aware attention to further boost the\nreconstruction performance. Extensive experiments demonstrate the effectiveness\nof SMPLer against existing 3D human shape and pose estimation methods both\nquantitatively and qualitatively. Notably, the proposed algorithm achieves an\nMPJPE of 45.2 mm on the Human3.6M dataset, improving upon Mesh Graphormer by\nmore than 10% with fewer than one-third of the parameters. Code and pretrained\nmodels are available at https://github.com/xuxy09/SMPLer.\n","authors":["Xiangyu Xu","Lijuan Liu","Shuicheng Yan"],"pdf_url":"https://arxiv.org/pdf/2404.15276v1.pdf","comment":"Published at TPAMI 2024"},{"id":"http://arxiv.org/abs/2404.15275v1","updated":"2024-04-23T17:59:43Z","published":"2024-04-23T17:59:43Z","title":"ID-Animator: Zero-Shot Identity-Preserving Human Video Generation","summary":" Generating high fidelity human video with specified identities has attracted\nsignificant attention in the content generation community. However, existing\ntechniques struggle to strike a balance between training efficiency and\nidentity preservation, either requiring tedious case-by-case finetuning or\nusually missing the identity details in video generation process. In this\nstudy, we present ID-Animator, a zero-shot human-video generation approach that\ncan perform personalized video generation given single reference facial image\nwithout further training. ID-Animator inherits existing diffusion-based video\ngeneration backbones with a face adapter to encode the ID-relevant embeddings\nfrom learnable facial latent queries. To facilitate the extraction of identity\ninformation in video generation, we introduce an ID-oriented dataset\nconstruction pipeline, which incorporates decoupled human attribute and action\ncaptioning technique from a constructed facial image pool. Based on this\npipeline, a random face reference training method is further devised to\nprecisely capture the ID-relevant embeddings from reference images, thus\nimproving the fidelity and generalization capacity of our model for ID-specific\nvideo generation. Extensive experiments demonstrate the superiority of\nID-Animator to generate personalized human videos over previous models.\nMoreover, our method is highly compatible with popular pre-trained T2V models\nlike animatediff and various community backbone models, showing high\nextendability in real-world applications for video generation where identity\npreservation is highly desired. Our codes and checkpoints will be released at\nhttps://github.com/ID-Animator/ID-Animator.\n","authors":["Xuanhua He","Quande Liu","Shengju Qian","Xin Wang","Tao Hu","Ke Cao","Keyu Yan","Man Zhou","Jie Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.15275v1.pdf","comment":"Project Page: https://id-animator.github.io/"},{"id":"http://arxiv.org/abs/2312.07530v2","updated":"2024-04-23T17:59:25Z","published":"2023-12-12T18:57:25Z","title":"Weakly Supervised 3D Object Detection via Multi-Level Visual Guidance","summary":" Weakly supervised 3D object detection aims to learn a 3D detector with lower\nannotation cost, e.g., 2D labels. 
Unlike prior work which still relies on few\naccurate 3D annotations, we propose a framework to study how to leverage\nconstraints between 2D and 3D domains without requiring any 3D labels.\nSpecifically, we employ visual data from three perspectives to establish\nconnections between 2D and 3D domains. First, we design a feature-level\nconstraint to align LiDAR and image features based on object-aware regions.\nSecond, the output-level constraint is developed to enforce the overlap between\n2D and projected 3D box estimations. Finally, the training-level constraint is\nutilized by producing accurate and consistent 3D pseudo-labels that align with\nthe visual data. We conduct extensive experiments on the KITTI dataset to\nvalidate the effectiveness of the proposed three constraints. Without using any\n3D labels, our method achieves favorable performance against state-of-the-art\napproaches and is competitive with the method that uses 500-frame 3D\nannotations. Code and models will be made publicly available at\nhttps://github.com/kuanchihhuang/VG-W3D.\n","authors":["Kuan-Chih Huang","Yi-Hsuan Tsai","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2312.07530v2.pdf","comment":"Project page: https://github.com/kuanchihhuang/VG-W3D"},{"id":"http://arxiv.org/abs/2404.15274v1","updated":"2024-04-23T17:59:12Z","published":"2024-04-23T17:59:12Z","title":"Metric-guided Image Reconstruction Bounds via Conformal Prediction","summary":" Recent advancements in machine learning have led to novel imaging systems and\nalgorithms that address ill-posed problems. Assessing their trustworthiness and\nunderstanding how to deploy them safely at test time remains an important and\nopen problem. We propose a method that leverages conformal prediction to\nretrieve upper/lower bounds and statistical inliers/outliers of reconstructions\nbased on the prediction intervals of downstream metrics. We apply our method to\nsparse-view CT for downstream radiotherapy planning and show 1) that\nmetric-guided bounds have valid coverage for downstream metrics while\nconventional pixel-wise bounds do not and 2) anatomical differences of\nupper/lower bounds between metric-guided and pixel-wise methods. Our work paves\nthe way for more meaningful reconstruction bounds. Code available at\nhttps://github.com/matthewyccheung/conformal-metric\n","authors":["Matt Y Cheung","Tucker J Netherton","Laurence E Court","Ashok Veeraraghavan","Guha Balakrishnan"],"pdf_url":"https://arxiv.org/pdf/2404.15274v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15272v1","updated":"2024-04-23T17:59:01Z","published":"2024-04-23T17:59:01Z","title":"CT-GLIP: 3D Grounded Language-Image Pretraining with CT Scans and\n Radiology Reports for Full-Body Scenarios","summary":" Medical Vision-Language Pretraining (Med-VLP) establishes a connection\nbetween visual content from medical images and the relevant textual\ndescriptions. Existing Med-VLP methods primarily focus on 2D images depicting a\nsingle body part, notably chest X-rays. In this paper, we extend the scope of\nMed-VLP to encompass 3D images, specifically targeting full-body scenarios, by\nusing a multimodal dataset of CT images and reports. Compared with the 2D\ncounterpart, 3D VLP is required to effectively capture essential semantics from\nsignificantly sparser representation in 3D imaging. 
In this paper, we introduce\nCT-GLIP (Grounded Language-Image Pretraining with CT scans), a novel method\nthat constructs organ-level image-text pairs to enhance multimodal contrastive\nlearning, aligning grounded visual features with precise diagnostic text.\nAdditionally, we developed an abnormality dictionary to augment contrastive\nlearning with diverse negative samples. Our method, trained on a multimodal CT\ndataset comprising 44,011 organ-level vision-text pairs from 17,702 patients\nacross 104 organs, demonstrates it can identify organs and abnormalities in a\nzero-shot manner using natural languages. The performance of CT-GLIP is\nvalidated on a separate test set of 1,130 patients, focusing on the 16 most\nfrequent abnormalities across 7 organs. The experimental results show our\nmodel's superior performance over the standard CLIP framework across zero-shot\nand fine-tuning scenarios, using both CNN and ViT architectures.\n","authors":["Jingyang Lin","Yingda Xia","Jianpeng Zhang","Ke Yan","Le Lu","Jiebo Luo","Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.15272v1.pdf","comment":"12 pages, 5 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.15271v1","updated":"2024-04-23T17:58:33Z","published":"2024-04-23T17:58:33Z","title":"Automatic Layout Planning for Visually-Rich Documents with\n Instruction-Following Models","summary":" Recent advancements in instruction-following models have made user\ninteractions with models more user-friendly and efficient, broadening their\napplicability. In graphic design, non-professional users often struggle to\ncreate visually appealing layouts due to limited skills and resources. In this\nwork, we introduce a novel multimodal instruction-following framework for\nlayout planning, allowing users to easily arrange visual elements into tailored\nlayouts by specifying canvas size and design purpose, such as for book covers,\nposters, brochures, or menus. We developed three layout reasoning tasks to\ntrain the model in understanding and executing layout instructions. Experiments\non two benchmarks show that our method not only simplifies the design process\nfor non-professionals but also surpasses the performance of few-shot GPT-4V\nmodels, with mIoU higher by 12% on Crello. This progress highlights the\npotential of multimodal instruction-following models to automate and simplify\nthe design process, providing an approachable solution for a wide range of\ndesign tasks on visually-rich documents.\n","authors":["Wanrong Zhu","Jennifer Healey","Ruiyi Zhang","William Yang Wang","Tong Sun"],"pdf_url":"https://arxiv.org/pdf/2404.15271v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15267v1","updated":"2024-04-23T17:56:08Z","published":"2024-04-23T17:56:08Z","title":"From Parts to Whole: A Unified Reference Framework for Controllable\n Human Image Generation","summary":" Recent advancements in controllable human image generation have led to\nzero-shot generation using structural signals (e.g., pose, depth) or facial\nappearance. Yet, generating human images conditioned on multiple parts of human\nappearance remains challenging. Addressing this, we introduce Parts2Whole, a\nnovel framework designed for generating customized portraits from multiple\nreference images, including pose images and various aspects of human\nappearance. 
To achieve this, we first develop a semantic-aware appearance\nencoder to retain details of different human parts, which processes each image\nbased on its textual label to a series of multi-scale feature maps rather than\none image token, preserving the image dimension. Second, our framework supports\nmulti-image conditioned generation through a shared self-attention mechanism\nthat operates across reference and target features during the diffusion\nprocess. We enhance the vanilla attention mechanism by incorporating mask\ninformation from the reference human images, allowing for the precise selection\nof any part. Extensive experiments demonstrate the superiority of our approach\nover existing alternatives, offering advanced capabilities for multi-part\ncontrollable human image customization. See our project page at\nhttps://huanngzh.github.io/Parts2Whole/.\n","authors":["Zehuan Huang","Hongxing Fan","Lipeng Wang","Lu Sheng"],"pdf_url":"https://arxiv.org/pdf/2404.15267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12060v3","updated":"2024-04-23T17:55:37Z","published":"2023-03-21T17:51:23Z","title":"VideoXum: Cross-modal Visual and Textural Summarization of Videos","summary":" Video summarization aims to distill the most important information from a\nsource video to produce either an abridged clip or a textual narrative.\nTraditionally, different methods have been proposed depending on whether the\noutput is a video or text, thus ignoring the correlation between the two\nsemantically related tasks of visual summarization and textual summarization.\nWe propose a new joint video and text summarization task. The goal is to\ngenerate both a shortened video clip along with the corresponding textual\nsummary from a long video, collectively referred to as a cross-modal summary.\nThe generated shortened video clip and text narratives should be semantically\nwell aligned. To this end, we first build a large-scale human-annotated dataset\n-- VideoXum (X refers to different modalities). The dataset is reannotated\nbased on ActivityNet. After we filter out the videos that do not meet the\nlength requirements, 14,001 long videos remain in our new dataset. Each video\nin our reannotated dataset has human-annotated video summaries and the\ncorresponding narrative summaries. We then design a novel end-to-end model --\nVTSUM-BILP to address the challenges of our proposed task. Moreover, we propose\na new metric called VT-CLIPScore to help evaluate the semantic consistency of\ncross-modality summary. The proposed model achieves promising performance on\nthis new task and establishes a benchmark for future research.\n","authors":["Jingyang Lin","Hang Hua","Ming Chen","Yikang Li","Jenhao Hsiao","Chiuman Ho","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2303.12060v3.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.15264v1","updated":"2024-04-23T17:55:07Z","published":"2024-04-23T17:55:07Z","title":"TalkingGaussian: Structure-Persistent 3D Talking Head Synthesis via\n Gaussian Splatting","summary":" Radiance fields have demonstrated impressive performance in synthesizing\nlifelike 3D talking heads. However, due to the difficulty in fitting steep\nappearance changes, the prevailing paradigm that presents facial motions by\ndirectly modifying point appearance may lead to distortions in dynamic regions.\nTo tackle this challenge, we introduce TalkingGaussian, a deformation-based\nradiance fields framework for high-fidelity talking head synthesis. 
Leveraging\nthe point-based Gaussian Splatting, facial motions can be represented in our\nmethod by applying smooth and continuous deformations to persistent Gaussian\nprimitives, without requiring to learn the difficult appearance change like\nprevious methods. Due to this simplification, precise facial motions can be\nsynthesized while keeping a highly intact facial feature. Under such a\ndeformation paradigm, we further identify a face-mouth motion inconsistency\nthat would affect the learning of detailed speaking motions. To address this\nconflict, we decompose the model into two branches separately for the face and\ninside mouth areas, therefore simplifying the learning tasks to help\nreconstruct more accurate motion and structure of the mouth region. Extensive\nexperiments demonstrate that our method renders high-quality lip-synchronized\ntalking head videos, with better facial fidelity and higher efficiency compared\nwith previous methods.\n","authors":["Jiahe Li","Jiawei Zhang","Xiao Bai","Jin Zheng","Xin Ning","Jun Zhou","Lin Gu"],"pdf_url":"https://arxiv.org/pdf/2404.15264v1.pdf","comment":"Project page: https://fictionarry.github.io/TalkingGaussian/"},{"id":"http://arxiv.org/abs/2404.15263v1","updated":"2024-04-23T17:55:05Z","published":"2024-04-23T17:55:05Z","title":"Multi-Session SLAM with Differentiable Wide-Baseline Pose Optimization","summary":" We introduce a new system for Multi-Session SLAM, which tracks camera motion\nacross multiple disjoint videos under a single global reference. Our approach\ncouples the prediction of optical flow with solver layers to estimate camera\npose. The backbone is trained end-to-end using a novel differentiable solver\nfor wide-baseline two-view pose. The full system can connect disjoint\nsequences, perform visual odometry, and global optimization. Compared to\nexisting approaches, our design is accurate and robust to catastrophic\nfailures. Code is available at github.com/princeton-vl/MultiSlam_DiffPose\n","authors":["Lahav Lipson","Jia Deng"],"pdf_url":"https://arxiv.org/pdf/2404.15263v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.15259v1","updated":"2024-04-23T17:46:50Z","published":"2024-04-23T17:46:50Z","title":"FlowMap: High-Quality Camera Poses, Intrinsics, and Depth via Gradient\n Descent","summary":" This paper introduces FlowMap, an end-to-end differentiable method that\nsolves for precise camera poses, camera intrinsics, and per-frame dense depth\nof a video sequence. Our method performs per-video gradient-descent\nminimization of a simple least-squares objective that compares the optical flow\ninduced by depth, intrinsics, and poses against correspondences obtained via\noff-the-shelf optical flow and point tracking. Alongside the use of point\ntracks to encourage long-term geometric consistency, we introduce\ndifferentiable re-parameterizations of depth, intrinsics, and pose that are\namenable to first-order optimization. We empirically show that camera\nparameters and dense depth recovered by our method enable photo-realistic novel\nview synthesis on 360-degree trajectories using Gaussian Splatting. 
Our method\nnot only far outperforms prior gradient-descent based bundle adjustment\nmethods, but surprisingly performs on par with COLMAP, the state-of-the-art SfM\nmethod, on the downstream task of 360-degree novel view synthesis (even though\nour method is purely gradient-descent based, fully differentiable, and presents\na complete departure from conventional SfM).\n","authors":["Cameron Smith","David Charatan","Ayush Tewari","Vincent Sitzmann"],"pdf_url":"https://arxiv.org/pdf/2404.15259v1.pdf","comment":"Project website: https://cameronosmith.github.io/flowmap/"},{"id":"http://arxiv.org/abs/2404.15256v1","updated":"2024-04-23T17:42:45Z","published":"2024-04-23T17:42:45Z","title":"TOP-Nav: Legged Navigation Integrating Terrain, Obstacle and\n Proprioception Estimation","summary":" Legged navigation is typically examined within open-world, off-road, and\nchallenging environments. In these scenarios, estimating external disturbances\nrequires a complex synthesis of multi-modal information. This underlines a\nmajor limitation in existing works that primarily focus on avoiding obstacles.\nIn this work, we propose TOP-Nav, a novel legged navigation framework that\nintegrates a comprehensive path planner with Terrain awareness, Obstacle\navoidance and close-loop Proprioception. TOP-Nav underscores the synergies\nbetween vision and proprioception in both path and motion planning. Within the\npath planner, we present and integrate a terrain estimator that enables the\nrobot to select waypoints on terrains with higher traversability while\neffectively avoiding obstacles. In the motion planning level, we not only\nimplement a locomotion controller to track the navigation commands, but also\nconstruct a proprioception advisor to provide motion evaluations for the path\nplanner. Based on the close-loop motion feedback, we make online corrections\nfor the vision-based terrain and obstacle estimations. Consequently, TOP-Nav\nachieves open-world navigation that the robot can handle terrains or\ndisturbances beyond the distribution of prior knowledge and overcomes\nconstraints imposed by visual conditions. Building upon extensive experiments\nconducted in both simulation and real-world environments, TOP-Nav demonstrates\nsuperior performance in open-world navigation compared to existing methods.\n","authors":["Junli Ren","Yikai Liu","Yingru Dai","Guijin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.15256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15254v1","updated":"2024-04-23T17:39:27Z","published":"2024-04-23T17:39:27Z","title":"UniMERNet: A Universal Network for Real-World Mathematical Expression\n Recognition","summary":" This paper presents the UniMER dataset to provide the first study on\nMathematical Expression Recognition (MER) towards complex real-world scenarios.\nThe UniMER dataset consists of a large-scale training set UniMER-1M offering an\nunprecedented scale and diversity with one million training instances and a\nmeticulously designed test set UniMER-Test that reflects a diverse range of\nformula distributions prevalent in real-world scenarios. Therefore, the UniMER\ndataset enables the training of a robust and high-accuracy MER model and\ncomprehensive evaluation of model performance. Moreover, we introduce the\nUniversal Mathematical Expression Recognition Network (UniMERNet), an\ninnovative framework designed to enhance MER in practical scenarios. 
UniMERNet\nincorporates a Length-Aware Module to process formulas of varied lengths\nefficiently, thereby enabling the model to handle complex mathematical\nexpressions with greater accuracy. In addition, UniMERNet employs our UniMER-1M\ndata and image augmentation techniques to improve the model's robustness under\ndifferent noise conditions. Our extensive experiments demonstrate that\nUniMERNet outperforms existing MER models, setting a new benchmark in various\nscenarios and ensuring superior recognition quality in real-world applications.\nThe dataset and model are available at\nhttps://github.com/opendatalab/UniMERNet.\n","authors":["Bin Wang","Zhuangcheng Gu","Chao Xu","Bo Zhang","Botian Shi","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2404.15254v1.pdf","comment":"17 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.15252v1","updated":"2024-04-23T17:39:06Z","published":"2024-04-23T17:39:06Z","title":"Source-free Domain Adaptation for Video Object Detection Under Adverse\n Image Conditions","summary":" When deploying pre-trained video object detectors in real-world scenarios,\nthe domain gap between training and testing data caused by adverse image\nconditions often leads to performance degradation. Addressing this issue\nbecomes particularly challenging when only the pre-trained model and degraded\nvideos are available. Although various source-free domain adaptation (SFDA)\nmethods have been proposed for single-frame object detectors, SFDA for video\nobject detection (VOD) remains unexplored. Moreover, most unsupervised domain\nadaptation works for object detection rely on two-stage detectors, while SFDA\nfor one-stage detectors, which are more vulnerable to fine-tuning, is not well\naddressed in the literature. In this paper, we propose Spatial-Temporal\nAlternate Refinement with Mean Teacher (STAR-MT), a simple yet effective SFDA\nmethod for VOD. Specifically, we aim to improve the performance of the\none-stage VOD method, YOLOV, under adverse image conditions, including noise,\nair turbulence, and haze. Extensive experiments on the ImageNetVOD dataset and\nits degraded versions demonstrate that our method consistently improves video\nobject detection performance in challenging imaging conditions, showcasing its\npotential for real-world applications.\n","authors":["Xingguang Zhang","Chih-Hsien Chou"],"pdf_url":"https://arxiv.org/pdf/2404.15252v1.pdf","comment":"accepted by the UG2+ workshop at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.15244v1","updated":"2024-04-23T17:26:34Z","published":"2024-04-23T17:26:34Z","title":"Efficient Transformer Encoders for Mask2Former-style models","summary":" Vision transformer based models bring significant improvements for image\nsegmentation tasks. Although these architectures offer powerful capabilities\nirrespective of specific segmentation tasks, their use of computational\nresources can be taxing on deployed devices. One way to overcome this challenge\nis by adapting the computation level to the specific needs of the input image\nrather than the current one-size-fits-all approach. To this end, we introduce\nECO-M2F or EffiCient TransfOrmer Encoders for Mask2Former-style models. Noting\nthat the encoder module of M2F-style models incur high resource-intensive\ncomputations, ECO-M2F provides a strategy to self-select the number of hidden\nlayers in the encoder, conditioned on the input image. To enable this\nself-selection ability for providing a balance between performance and\ncomputational efficiency, we present a three step recipe. 
The first step is to\ntrain the parent architecture to enable early exiting from the encoder. The\nsecond step is to create a derived dataset of the ideal number of encoder\nlayers required for each training example. The third step is to use the\naforementioned derived dataset to train a gating network that predicts the\nnumber of encoder layers to be used, conditioned on the input image.\nAdditionally, to change the computational-accuracy tradeoff, only steps two and\nthree need to be repeated which significantly reduces retraining time.\nExperiments on the public datasets show that the proposed approach reduces\nexpected encoder computational cost while maintaining performance, adapts to\nvarious user compute resources, is flexible in architecture configurations, and\ncan be extended beyond the segmentation task to object detection.\n","authors":["Manyi Yao","Abhishek Aich","Yumin Suh","Amit Roy-Chowdhury","Christian Shelton","Manmohan Chandraker"],"pdf_url":"https://arxiv.org/pdf/2404.15244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15234v1","updated":"2024-04-23T17:10:49Z","published":"2024-04-23T17:10:49Z","title":"Massively Annotated Datasets for Assessment of Synthetic and Real Data\n in Face Recognition","summary":" Face recognition applications have grown in parallel with the size of\ndatasets, complexity of deep learning models and computational power. However,\nwhile deep learning models evolve to become more capable and computational\npower keeps increasing, the datasets available are being retracted and removed\nfrom public access. Privacy and ethical concerns are relevant topics within\nthese domains. Through generative artificial intelligence, researchers have put\nefforts into the development of completely synthetic datasets that can be used\nto train face recognition systems. Nonetheless, the recent advances have not\nbeen sufficient to achieve performance comparable to the state-of-the-art\nmodels trained on real data. To study the drift between the performance of\nmodels trained on real and synthetic datasets, we leverage a massive attribute\nclassifier (MAC) to create annotations for four datasets: two real and two\nsynthetic. From these annotations, we conduct studies on the distribution of\neach attribute within all four datasets. Additionally, we further inspect the\ndifferences between real and synthetic datasets on the attribute set. When\ncomparing through the Kullback-Leibler divergence we have found differences\nbetween real and synthetic samples. Interestingly enough, we have verified that\nwhile real samples suffice to explain the synthetic distribution, the opposite\ncould not be further from being true.\n","authors":["Pedro C. Neto","Rafael M. Mamede","Carolina Albuquerque","Tiago Gonçalves","Ana F. Sequeira"],"pdf_url":"https://arxiv.org/pdf/2404.15234v1.pdf","comment":"Accepted at FG 2024"},{"id":"http://arxiv.org/abs/2404.15228v1","updated":"2024-04-23T16:59:02Z","published":"2024-04-23T16:59:02Z","title":"Re-Thinking Inverse Graphics With Large Language Models","summary":" Inverse graphics -- the task of inverting an image into physical variables\nthat, when rendered, enable reproduction of the observed scene -- is a\nfundamental challenge in computer vision and graphics. Disentangling an image\ninto its constituent elements, such as the shape, color, and material\nproperties of the objects of the 3D scene that produced it, requires a\ncomprehensive understanding of the environment. 
This requirement limits the\nability of existing carefully engineered approaches to generalize across\ndomains. Inspired by the zero-shot ability of large language models (LLMs) to\ngeneralize to novel contexts, we investigate the possibility of leveraging the\nbroad world knowledge encoded in such models in solving inverse-graphics\nproblems. To this end, we propose the Inverse-Graphics Large Language Model\n(IG-LLM), an inverse-graphics framework centered around an LLM, that\nautoregressively decodes a visual embedding into a structured, compositional\n3D-scene representation. We incorporate a frozen pre-trained visual encoder and\na continuous numeric head to enable end-to-end training. Through our\ninvestigation, we demonstrate the potential of LLMs to facilitate inverse\ngraphics through next-token prediction, without the use of image-space\nsupervision. Our analysis opens up new possibilities for precise spatial\nreasoning about images that exploit the visual knowledge of LLMs. We will\nrelease our code and data to ensure the reproducibility of our investigation\nand to facilitate future research at https://ig-llm.is.tue.mpg.de/\n","authors":["Peter Kulits","Haiwen Feng","Weiyang Liu","Victoria Abrevaya","Michael J. Black"],"pdf_url":"https://arxiv.org/pdf/2404.15228v1.pdf","comment":"31 pages; project page: https://ig-llm.is.tue.mpg.de/"},{"id":"http://arxiv.org/abs/2404.15224v1","updated":"2024-04-23T16:54:31Z","published":"2024-04-23T16:54:31Z","title":"Deep Models for Multi-View 3D Object Recognition: A Review","summary":" Human decision-making often relies on visual information from multiple\nperspectives or views. In contrast, machine learning-based object recognition\nutilizes information from a single image of the object. However, the\ninformation conveyed by a single image may not be sufficient for accurate\ndecision-making, particularly in complex recognition problems. The utilization\nof multi-view 3D representations for object recognition has thus far\ndemonstrated the most promising results for achieving state-of-the-art\nperformance. This review paper comprehensively covers recent progress in\nmulti-view 3D object recognition methods for 3D classification and retrieval\ntasks. Specifically, we focus on deep learning-based and transformer-based\ntechniques, as they are widely utilized and have achieved state-of-the-art\nperformance. We provide detailed information about existing deep learning-based\nand transformer-based multi-view 3D object recognition models, including the\nmost commonly used 3D datasets, camera configurations and number of views, view\nselection strategies, pre-trained CNN architectures, fusion strategies, and\nrecognition performance on 3D classification and 3D retrieval tasks.\nAdditionally, we examine various computer vision applications that use\nmulti-view classification. 
Finally, we highlight key findings and future\ndirections for developing multi-view 3D object recognition methods to provide\nreaders with a comprehensive understanding of the field.\n","authors":["Mona Alzahrani","Muhammad Usman","Salma Kammoun","Saeed Anwar","Tarek Helmy"],"pdf_url":"https://arxiv.org/pdf/2404.15224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15194v1","updated":"2024-04-23T16:33:28Z","published":"2024-04-23T16:33:28Z","title":"Closed Loop Interactive Embodied Reasoning for Robot Manipulation","summary":" Embodied reasoning systems integrate robotic hardware and cognitive processes\nto perform complex tasks typically in response to a natural language query\nabout a specific physical environment. This usually involves changing the\nbelief about the scene or physically interacting and changing the scene (e.g.\n'Sort the objects from lightest to heaviest'). In order to facilitate the\ndevelopment of such systems we introduce a new simulating environment that\nmakes use of MuJoCo physics engine and high-quality renderer Blender to provide\nrealistic visual observations that are also accurate to the physical state of\nthe scene. Together with the simulator we propose a new benchmark composed of\n10 classes of multi-step reasoning scenarios that require simultaneous visual\nand physical measurements. Finally, we develop a new modular Closed Loop\nInteractive Reasoning (CLIER) approach that takes into account the measurements\nof non-visual object properties, changes in the scene caused by external\ndisturbances as well as uncertain outcomes of robotic actions. We extensively\nevaluate our reasoning approach in simulation and in the real world\nmanipulation tasks with a success rate above 76% and 64%, respectively.\n","authors":["Michal Nazarczuk","Jan Kristof Behrens","Karla Stepanova","Matej Hoffmann","Krystian Mikolajczyk"],"pdf_url":"https://arxiv.org/pdf/2404.15194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15174v1","updated":"2024-04-23T16:14:20Z","published":"2024-04-23T16:14:20Z","title":"Fourier-enhanced Implicit Neural Fusion Network for Multispectral and\n Hyperspectral Image Fusion","summary":" Recently, implicit neural representations (INR) have made significant strides\nin various vision-related domains, providing a novel solution for Multispectral\nand Hyperspectral Image Fusion (MHIF) tasks. However, INR is prone to losing\nhigh-frequency information and is confined to the lack of global perceptual\ncapabilities. To address these issues, this paper introduces a Fourier-enhanced\nImplicit Neural Fusion Network (FeINFN) specifically designed for MHIF task,\ntargeting the following phenomena: The Fourier amplitudes of the HR-HSI latent\ncode and LR-HSI are remarkably similar; however, their phases exhibit different\npatterns. In FeINFN, we innovatively propose a spatial and frequency implicit\nfusion function (Spa-Fre IFF), helping INR capture high-frequency information\nand expanding the receptive field. Besides, a new decoder employing a complex\nGabor wavelet activation function, called Spatial-Frequency Interactive Decoder\n(SFID), is invented to enhance the interaction of INR features. Especially, we\nfurther theoretically prove that the Gabor wavelet activation possesses a\ntime-frequency tightness property that favors learning the optimal bandwidths\nin the decoder. Experiments on two benchmark MHIF datasets verify the\nstate-of-the-art (SOTA) performance of the proposed method, both visually and\nquantitatively. 
Also, ablation studies demonstrate the mentioned contributions.\nThe code will be available on Anonymous GitHub\n(https://anonymous.4open.science/r/FeINFN-15C9/) after possible acceptance.\n","authors":["Yu-Jie Liang","Zihan Cao","Liang-Jian Deng","Xiao Wu"],"pdf_url":"https://arxiv.org/pdf/2404.15174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15163v1","updated":"2024-04-23T16:02:33Z","published":"2024-04-23T16:02:33Z","title":"Adaptive Mixed-Scale Feature Fusion Network for Blind AI-Generated Image\n Quality Assessment","summary":" With the increasing maturity of the text-to-image and image-to-image\ngenerative models, AI-generated images (AGIs) have shown great application\npotential in advertisement, entertainment, education, social media, etc.\nAlthough remarkable advancements have been achieved in generative models, very\nfew efforts have been paid to design relevant quality assessment models. In\nthis paper, we propose a novel blind image quality assessment (IQA) network,\nnamed AMFF-Net, for AGIs. AMFF-Net evaluates AGI quality from three dimensions,\ni.e., \"visual quality\", \"authenticity\", and \"consistency\". Specifically,\ninspired by the characteristics of the human visual system and motivated by the\nobservation that \"visual quality\" and \"authenticity\" are characterized by both\nlocal and global aspects, AMFF-Net scales the image up and down and takes the\nscaled images and original-sized image as the inputs to obtain multi-scale\nfeatures. After that, an Adaptive Feature Fusion (AFF) block is used to\nadaptively fuse the multi-scale features with learnable weights. In addition,\nconsidering the correlation between the image and prompt, AMFF-Net compares the\nsemantic features from text encoder and image encoder to evaluate the\ntext-to-image alignment. We carry out extensive experiments on three AGI\nquality assessment databases, and the experimental results show that our\nAMFF-Net obtains better performance than nine state-of-the-art blind IQA\nmethods. The results of ablation experiments further demonstrate the\neffectiveness of the proposed multi-scale input strategy and AFF block.\n","authors":["Tianwei Zhou","Songbai Tan","Wei Zhou","Yu Luo","Yuan-Gen Wang","Guanghui Yue"],"pdf_url":"https://arxiv.org/pdf/2404.15163v1.pdf","comment":"IEEE Transactions on Broadcasting (TBC)"},{"id":"http://arxiv.org/abs/2404.15161v1","updated":"2024-04-23T16:01:33Z","published":"2024-04-23T16:01:33Z","title":"Combating Missing Modalities in Egocentric Videos at Test Time","summary":" Understanding videos that contain multiple modalities is crucial, especially\nin egocentric videos, where combining various sensory inputs significantly\nimproves tasks like action recognition and moment localization. However,\nreal-world applications often face challenges with incomplete modalities due to\nprivacy concerns, efficiency needs, or hardware issues. Current methods, while\neffective, often necessitate retraining the model entirely to handle missing\nmodalities, making them computationally intensive, particularly with large\ntraining datasets. In this study, we propose a novel approach to address this\nissue at test time without requiring retraining. We frame the problem as a\ntest-time adaptation task, where the model adjusts to the available unlabeled\ndata at test time. 
Our method, MiDl~(Mutual information with\nself-Distillation), encourages the model to be insensitive to the specific\nmodality source present during testing by minimizing the mutual information\nbetween the prediction and the available modality. Additionally, we incorporate\nself-distillation to maintain the model's original performance when both\nmodalities are available. MiDl represents the first self-supervised, online\nsolution for handling missing modalities exclusively at test time. Through\nexperiments with various pretrained models and datasets, MiDl demonstrates\nsubstantial performance improvement without the need for retraining.\n","authors":["Merey Ramazanova","Alejandro Pardo","Bernard Ghanem","Motasem Alfarra"],"pdf_url":"https://arxiv.org/pdf/2404.15161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06623v4","updated":"2024-04-23T15:56:34Z","published":"2023-11-11T17:52:06Z","title":"VT-Former: An Exploratory Study on Vehicle Trajectory Prediction for\n Highway Surveillance through Graph Isomorphism and Transformer","summary":" Enhancing roadway safety has become an essential computer vision focus area\nfor Intelligent Transportation Systems (ITS). As a part of ITS, Vehicle\nTrajectory Prediction (VTP) aims to forecast a vehicle's future positions based\non its past and current movements. VTP is a pivotal element for road safety,\naiding in applications such as traffic management, accident prevention,\nwork-zone safety, and energy optimization. While most works in this field focus\non autonomous driving, with the growing number of surveillance cameras, another\nsub-field emerges for surveillance VTP with its own set of challenges. In this\npaper, we introduce VT-Former, a novel transformer-based VTP approach for\nhighway safety and surveillance. In addition to utilizing transformers to\ncapture long-range temporal patterns, a new Graph Attentive Tokenization (GAT)\nmodule has been proposed to capture intricate social interactions among\nvehicles. This study seeks to explore both the advantages and the limitations\ninherent in combining transformer architecture with graphs for VTP. Our\ninvestigation, conducted across three benchmark datasets from diverse\nsurveillance viewpoints, showcases the State-of-the-Art (SotA) or comparable\nperformance of VT-Former in predicting vehicle trajectories. This study\nunderscores the potential of VT-Former and its architecture, opening new\navenues for future research and exploration.\n","authors":["Armin Danesh Pazho","Ghazal Alinezhad Noghre","Vinit Katariya","Hamed Tabkhi"],"pdf_url":"https://arxiv.org/pdf/2311.06623v4.pdf","comment":"Completely updated based on the reviews received for the paper"},{"id":"http://arxiv.org/abs/2404.15141v1","updated":"2024-04-23T15:47:58Z","published":"2024-04-23T15:47:58Z","title":"CutDiffusion: A Simple, Fast, Cheap, and Strong Diffusion Extrapolation\n Method","summary":" Transforming large pre-trained low-resolution diffusion models to cater to\nhigher-resolution demands, i.e., diffusion extrapolation, significantly\nimproves diffusion adaptability. We propose tuning-free CutDiffusion, aimed at\nsimplifying and accelerating the diffusion extrapolation process, making it\nmore affordable and improving performance. CutDiffusion abides by the existing\npatch-wise extrapolation but cuts a standard patch diffusion process into an\ninitial phase focused on comprehensive structure denoising and a subsequent\nphase dedicated to specific detail refinement. 
Comprehensive experiments\nhighlight the numerous almighty advantages of CutDiffusion: (1) simple method\nconstruction that enables a concise higher-resolution diffusion process without\nthird-party engagement; (2) fast inference speed achieved through a single-step\nhigher-resolution diffusion process, and fewer inference patches required; (3)\ncheap GPU cost resulting from patch-wise inference and fewer patches during the\ncomprehensive structure denoising; (4) strong generation performance, stemming\nfrom the emphasis on specific detail refinement.\n","authors":["Mingbao Lin","Zhihang Lin","Wengyi Zhan","Liujuan Cao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2404.15141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15129v1","updated":"2024-04-23T15:29:02Z","published":"2024-04-23T15:29:02Z","title":"Gallbladder Cancer Detection in Ultrasound Images based on YOLO and\n Faster R-CNN","summary":" Medical image analysis is a significant application of artificial\nintelligence for disease diagnosis. A crucial step in this process is the\nidentification of regions of interest within the images. This task can be\nautomated using object detection algorithms. YOLO and Faster R-CNN are renowned\nfor such algorithms, each with its own strengths and weaknesses. This study\naims to explore the advantages of both techniques to select more accurate\nbounding boxes for gallbladder detection from ultrasound images, thereby\nenhancing gallbladder cancer classification. A fusion method that leverages the\nbenefits of both techniques is presented in this study. The proposed method\ndemonstrated superior classification performance, with an accuracy of 92.62%,\ncompared to the individual use of Faster R-CNN and YOLOv8, which yielded\naccuracies of 90.16% and 82.79%, respectively.\n","authors":["Sara Dadjouy","Hedieh Sajedi"],"pdf_url":"https://arxiv.org/pdf/2404.15129v1.pdf","comment":"Published in 2024 10th International Conference on Artificial\n Intelligence and Robotics (QICAR)"},{"id":"http://arxiv.org/abs/2404.15127v1","updated":"2024-04-23T15:27:19Z","published":"2024-04-23T15:27:19Z","title":"MedDr: Diagnosis-Guided Bootstrapping for Large-Scale Medical\n Vision-Language Learning","summary":" The rapid advancement of large-scale vision-language models has showcased\nremarkable capabilities across various tasks. However, the lack of extensive\nand high-quality image-text data in medicine has greatly hindered the\ndevelopment of large-scale medical vision-language models. In this work, we\npresent a diagnosis-guided bootstrapping strategy that exploits both image and\nlabel information to construct vision-language datasets. Based on the\nconstructed dataset, we developed MedDr, a generalist foundation model for\nhealthcare capable of handling diverse medical data modalities, including\nradiology, pathology, dermatology, retinography, and endoscopy. 
Moreover,\nduring inference, we propose a simple but effective retrieval-augmented medical\ndiagnosis strategy, which enhances the model's generalization ability.\nExtensive experiments on visual question answering, medical report generation,\nand medical image diagnosis demonstrate the superiority of our method.\n","authors":["Sunan He","Yuxiang Nie","Zhixuan Chen","Zhiyuan Cai","Hongmei Wang","Shu Yang","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.15127v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15121v1","updated":"2024-04-23T15:20:17Z","published":"2024-04-23T15:20:17Z","title":"Taming Diffusion Probabilistic Models for Character Control","summary":" We present a novel character control framework that effectively utilizes\nmotion diffusion probabilistic models to generate high-quality and diverse\ncharacter animations, responding in real-time to a variety of dynamic\nuser-supplied control signals. At the heart of our method lies a\ntransformer-based Conditional Autoregressive Motion Diffusion Model (CAMDM),\nwhich takes as input the character's historical motion and can generate a range\nof diverse potential future motions conditioned on high-level, coarse user\ncontrol. To meet the demands for diversity, controllability, and computational\nefficiency required by a real-time controller, we incorporate several key\nalgorithmic designs. These include separate condition tokenization,\nclassifier-free guidance on past motion, and heuristic future trajectory\nextension, all designed to address the challenges associated with taming motion\ndiffusion probabilistic models for character control. As a result, our work\nrepresents the first model that enables real-time generation of high-quality,\ndiverse character animations based on user interactive control, supporting\nanimating the character in multiple styles with a single unified model. We\nevaluate our method on a diverse set of locomotion skills, demonstrating the\nmerits of our method over existing character controllers. Project page and\nsource codes: https://aiganimation.github.io/CAMDM/\n","authors":["Rui Chen","Mingyi Shi","Shaoli Huang","Ping Tan","Taku Komura","Xuelin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.15121v1.pdf","comment":"Accepted by SIGGRAPH 2024 (Conference Track). Project page and source\n codes: https://aiganimation.github.io/CAMDM/"},{"id":"http://arxiv.org/abs/2308.10680v2","updated":"2024-04-23T15:19:17Z","published":"2023-08-21T12:27:18Z","title":"Co-Speech Gesture Detection through Multi-Phase Sequence Labeling","summary":" Gestures are integral components of face-to-face communication. They unfold\nover time, often following predictable movement phases of preparation, stroke,\nand retraction. Yet, the prevalent approach to automatic gesture detection\ntreats the problem as binary classification, classifying a segment as either\ncontaining a gesture or not, thus failing to capture its inherently sequential\nand contextual nature. To address this, we introduce a novel framework that\nreframes the task as a multi-phase sequence labeling problem rather than binary\nclassification. Our model processes sequences of skeletal movements over time\nwindows, uses Transformer encoders to learn contextual embeddings, and\nleverages Conditional Random Fields to perform sequence labeling. We evaluate\nour proposal on a large dataset of diverse co-speech gestures in task-oriented\nface-to-face dialogues. 
The results consistently demonstrate that our method\nsignificantly outperforms strong baseline models in detecting gesture strokes.\nFurthermore, applying Transformer encoders to learn contextual embeddings from\nmovement sequences substantially improves gesture unit detection. These results\nhighlight our framework's capacity to capture the fine-grained dynamics of\nco-speech gesture phases, paving the way for more nuanced and accurate gesture\ndetection and analysis.\n","authors":["Esam Ghaleb","Ilya Burenko","Marlou Rasenberg","Wim Pouw","Peter Uhrig","Judith Holler","Ivan Toni","Aslı Özyürek","Raquel Fernández"],"pdf_url":"https://arxiv.org/pdf/2308.10680v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15100v1","updated":"2024-04-23T14:53:15Z","published":"2024-04-23T14:53:15Z","title":"Multimodal Large Language Model is a Human-Aligned Annotator for\n Text-to-Image Generation","summary":" Recent studies have demonstrated the exceptional potentials of leveraging\nhuman preference datasets to refine text-to-image generative models, enhancing\nthe alignment between generated images and textual prompts. Despite these\nadvances, current human preference datasets are either prohibitively expensive\nto construct or suffer from a lack of diversity in preference dimensions,\nresulting in limited applicability for instruction tuning in open-source\ntext-to-image generative models and hinder further exploration. To address\nthese challenges and promote the alignment of generative models through\ninstruction tuning, we leverage multimodal large language models to create\nVisionPrefer, a high-quality and fine-grained preference dataset that captures\nmultiple preference aspects. We aggregate feedback from AI annotators across\nfour aspects: prompt-following, aesthetic, fidelity, and harmlessness to\nconstruct VisionPrefer. To validate the effectiveness of VisionPrefer, we train\na reward model VP-Score over VisionPrefer to guide the training of\ntext-to-image generative models and the preference prediction accuracy of\nVP-Score is comparable to human annotators. Furthermore, we use two\nreinforcement learning methods to supervised fine-tune generative models to\nevaluate the performance of VisionPrefer, and extensive experimental results\ndemonstrate that VisionPrefer significantly improves text-image alignment in\ncompositional image generation across diverse aspects, e.g., aesthetic, and\ngeneralizes better than previous human-preference metrics across various image\ndistributions. Moreover, VisionPrefer indicates that the integration of\nAI-generated synthetic data as a supervisory signal is a promising avenue for\nachieving improved alignment with human preferences in vision generative\nmodels.\n","authors":["Xun Wu","Shaohan Huang","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2404.15100v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00110v2","updated":"2024-04-23T14:49:45Z","published":"2023-11-30T18:19:47Z","title":"CLIP-QDA: An Explainable Concept Bottleneck Model","summary":" In this paper, we introduce an explainable algorithm designed from a\nmulti-modal foundation model, that performs fast and explainable image\nclassification. Drawing inspiration from CLIP-based Concept Bottleneck Models\n(CBMs), our method creates a latent space where each neuron is linked to a\nspecific word. Observing that this latent space can be modeled with simple\ndistributions, we use a Mixture of Gaussians (MoG) formalism to enhance the\ninterpretability of this latent space. 
Then, we introduce CLIP-QDA, a\nclassifier that only uses statistical values to infer labels from the concepts.\nIn addition, this formalism allows for both local and global explanations.\nThese explanations come from the inner design of our architecture, our work is\npart of a new family of greybox models, combining performances of opaque\nfoundation models and the interpretability of transparent models. Our empirical\nfindings show that in instances where the MoG assumption holds, CLIP-QDA\nachieves similar accuracy with state-of-the-art methods CBMs. Our explanations\ncompete with existing XAI methods while being faster to compute.\n","authors":["Rémi Kazmierczak","Eloïse Berthier","Goran Frehse","Gianni Franchi"],"pdf_url":"https://arxiv.org/pdf/2312.00110v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13576v2","updated":"2024-04-23T14:37:57Z","published":"2024-02-21T07:16:06Z","title":"Improving Video Corpus Moment Retrieval with Partial Relevance\n Enhancement","summary":" Video Corpus Moment Retrieval (VCMR) is a new video retrieval task aimed at\nretrieving a relevant moment from a large corpus of untrimmed videos using a\ntext query. The relevance between the video and query is partial, mainly\nevident in two aspects:~(1)~Scope: The untrimmed video contains many frames,\nbut not all are relevant to the query. Strong relevance is typically observed\nonly within the relevant moment.~(2)~Modality: The relevance of the query\nvaries with different modalities. Action descriptions align more with visual\nelements, while character conversations are more related to textual\ninformation.Existing methods often treat all video contents equally, leading to\nsub-optimal moment retrieval. We argue that effectively capturing the partial\nrelevance between the query and video is essential for the VCMR task. To this\nend, we propose a Partial Relevance Enhanced Model~(PREM) to improve VCMR. VCMR\ninvolves two sub-tasks: video retrieval and moment localization. To align with\ntheir distinct objectives, we implement specialized partial relevance\nenhancement strategies. For video retrieval, we introduce a multi-modal\ncollaborative video retriever, generating different query representations for\nthe two modalities by modality-specific pooling, ensuring a more effective\nmatch. For moment localization, we propose the focus-then-fuse moment\nlocalizer, utilizing modality-specific gates to capture essential content. We\nalso introduce relevant content-enhanced training methods for both retriever\nand localizer to enhance the ability of model to capture relevant content.\nExperimental results on TVR and DiDeMo datasets show that the proposed model\noutperforms the baselines, achieving a new state-of-the-art of VCMR. The code\nis available at \\url{https://github.com/hdy007007/PREM}.\n","authors":["Danyang Hou","Liang Pang","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2402.13576v2.pdf","comment":"camera-ready version of ACM ICMR 2024"},{"id":"http://arxiv.org/abs/2404.15082v1","updated":"2024-04-23T14:31:44Z","published":"2024-04-23T14:31:44Z","title":"Harnessing Optical Imaging Limit through Atmospheric Scattering Media","summary":" Recording and identifying faint objects through atmospheric scattering media\nby an optical system are fundamentally interesting and technologically\nimportant. 
In this work, we introduce a comprehensive model that incorporates\ncontributions from target characteristics, atmospheric effects, imaging system,\ndigital processing, and visual perception to assess the ultimate perceptible\nlimit of geometrical imaging, specifically the angular resolution at the\nboundary of visible distance. The model allows to reevaluate the effectiveness\nof conventional imaging recording, processing, and perception and to analyze\nthe limiting factors that constrain image recognition capabilities in\natmospheric media. The simulations were compared with the experimental results\nmeasured in a fog chamber and outdoor settings. The results reveal general good\nagreement between analysis and experimental, pointing out the way to harnessing\nthe physical limit for optical imaging in scattering media. An immediate\napplication of the study is the extension of the image range by an amount of\n1.2 times with noise reduction via multi-frame averaging, hence greatly\nenhancing the capability of optical imaging in the atmosphere.\n","authors":["Libang Chen","Jun Yang","Lingye Chen","Yuyang Shui","Yikun Liu","Jianying Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.15082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15081v1","updated":"2024-04-23T14:31:15Z","published":"2024-04-23T14:31:15Z","title":"Perturbing Attention Gives You More Bang for the Buck: Subtle Imaging\n Perturbations That Efficiently Fool Customized Diffusion Models","summary":" Diffusion models (DMs) embark a new era of generative modeling and offer more\nopportunities for efficient generating high-quality and realistic data samples.\nHowever, their widespread use has also brought forth new challenges in model\nsecurity, which motivates the creation of more effective adversarial attackers\non DMs to understand its vulnerability. We propose CAAT, a simple but generic\nand efficient approach that does not require costly training to effectively\nfool latent diffusion models (LDMs). The approach is based on the observation\nthat cross-attention layers exhibits higher sensitivity to gradient change,\nallowing for leveraging subtle perturbations on published images to\nsignificantly corrupt the generated images. We show that a subtle perturbation\non an image can significantly impact the cross-attention layers, thus changing\nthe mapping between text and image during the fine-tuning of customized\ndiffusion models. Extensive experiments demonstrate that CAAT is compatible\nwith diverse diffusion models and outperforms baseline attack methods in a more\neffective (more noise) and efficient (twice as fast as Anti-DreamBooth and\nMist) manner.\n","authors":["Jingyao Xu","Yuetong Lu","Yandong Li","Siyang Lu","Dongdong Wang","Xiang Wei"],"pdf_url":"https://arxiv.org/pdf/2404.15081v1.pdf","comment":"Published at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.15041v1","updated":"2024-04-23T13:43:33Z","published":"2024-04-23T13:43:33Z","title":"LEAF: Unveiling Two Sides of the Same Coin in Semi-supervised Facial\n Expression Recognition","summary":" Semi-supervised learning has emerged as a promising approach to tackle the\nchallenge of label scarcity in facial expression recognition (FER) task.\nHowever, current state-of-the-art methods primarily focus on one side of the\ncoin, i.e., generating high-quality pseudo-labels, while overlooking the other\nside: enhancing expression-relevant representations. 
In this paper, we unveil\nboth sides of the coin by proposing a unified framework termed hierarchicaL\ndEcoupling And Fusing (LEAF) to coordinate expression-relevant representations\nand pseudo-labels for semi-supervised FER. LEAF introduces a hierarchical\nexpression-aware aggregation strategy that operates at three levels: semantic,\ninstance, and category. (1) At the semantic and instance levels, LEAF decouples\nrepresentations into expression-agnostic and expression-relevant components,\nand adaptively fuses them using learnable gating weights. (2) At the category\nlevel, LEAF assigns ambiguous pseudo-labels by decoupling predictions into\npositive and negative parts, and employs a consistency loss to ensure agreement\nbetween two augmented views of the same image. Extensive experiments on\nbenchmark datasets demonstrate that by unveiling and harmonizing both sides of\nthe coin, LEAF outperforms state-of-the-art semi-supervised FER methods,\neffectively leveraging both labeled and unlabeled data. Moreover, the proposed\nexpression-aware aggregation strategy can be seamlessly integrated into\nexisting semi-supervised frameworks, leading to significant performance gains.\n","authors":["Fan Zhang","Zhi-Qi Cheng","Jian Zhao","Xiaojiang Peng","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.15041v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15037v1","updated":"2024-04-23T13:42:12Z","published":"2024-04-23T13:42:12Z","title":"DP-Net: Learning Discriminative Parts for image recognition","summary":" This paper presents Discriminative Part Network (DP-Net), a deep architecture\nwith strong interpretation capabilities, which exploits a pretrained\nConvolutional Neural Network (CNN) combined with a part-based recognition\nmodule. This system learns and detects parts in the images that are\ndiscriminative among categories, without the need for fine-tuning the CNN,\nmaking it more scalable than other part-based models. While part-based\napproaches naturally offer interpretable representations, we propose\nexplanations at image and category levels and introduce specific constraints on\nthe part learning process to make them more discriminative.\n","authors":["Ronan Sicre","Hanwei Zhang","Julien Dejasmin","Chiheb Daaloul","Stéphane Ayache","Thierry Artières"],"pdf_url":"https://arxiv.org/pdf/2404.15037v1.pdf","comment":"IEEE ICIP 2023"},{"id":"http://arxiv.org/abs/2402.14327v2","updated":"2024-04-23T13:41:47Z","published":"2024-02-22T06:47:44Z","title":"Subobject-level Image Tokenization","summary":" Transformer-based vision models typically tokenize images into fixed-size\nsquare patches as input units, which lacks the adaptability to image content\nand overlooks the inherent pixel grouping structure. Inspired by the subword\ntokenization widely adopted in language models, we propose an image tokenizer\nat a subobject level, where the subobjects are represented by semantically\nmeaningful image segments obtained by segmentation models (e.g., segment\nanything models). To implement a learning system based on subobject\ntokenization, we first introduced a Direct Segment Anything Model (DirectSAM)\nthat efficiently produces comprehensive segmentation of subobjects, then embed\nsubobjects into compact latent vectors and fed them into a large language model\nfor vision language learning. 
Empirical results demonstrated that our\nsubobject-level tokenization significantly facilitates efficient learning of\ntranslating images into object and attribute descriptions compared to the\ntraditional patch-level tokenization. Codes and models are open-sourced at\nhttps://github.com/ChenDelong1999/subobjects.\n","authors":["Delong Chen","Samuel Cahyawijaya","Jianfeng Liu","Baoyuan Wang","Pascale Fung"],"pdf_url":"https://arxiv.org/pdf/2402.14327v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2004.05704v4","updated":"2024-04-23T13:38:36Z","published":"2020-04-12T21:45:23Z","title":"Visual Grounding Methods for VQA are Working for the Wrong Reasons!","summary":" Existing Visual Question Answering (VQA) methods tend to exploit dataset\nbiases and spurious statistical correlations, instead of producing right\nanswers for the right reasons. To address this issue, recent bias mitigation\nmethods for VQA propose to incorporate visual cues (e.g., human attention maps)\nto better ground the VQA models, showcasing impressive gains. However, we show\nthat the performance improvements are not a result of improved visual\ngrounding, but a regularization effect which prevents over-fitting to\nlinguistic priors. For instance, we find that it is not actually necessary to\nprovide proper, human-based cues; random, insensible cues also result in\nsimilar improvements. Based on this observation, we propose a simpler\nregularization scheme that does not require any external annotations and yet\nachieves near state-of-the-art performance on VQA-CPv2.\n","authors":["Robik Shrestha","Kushal Kafle","Christopher Kanan"],"pdf_url":"https://arxiv.org/pdf/2004.05704v4.pdf","comment":"Published in ACL 2020 under the title \"A negative case analysis of\n visual grounding methods for VQA\""},{"id":"http://arxiv.org/abs/2404.15033v1","updated":"2024-04-23T13:38:01Z","published":"2024-04-23T13:38:01Z","title":"IPAD: Industrial Process Anomaly Detection Dataset","summary":" Video anomaly detection (VAD) is a challenging task aiming to recognize\nanomalies in video frames, and existing large-scale VAD researches primarily\nfocus on road traffic and human activity scenes. In industrial scenes, there\nare often a variety of unpredictable anomalies, and the VAD method can play a\nsignificant role in these scenarios. However, there is a lack of applicable\ndatasets and methods specifically tailored for industrial production scenarios\ndue to concerns regarding privacy and security. To bridge this gap, we propose\na new dataset, IPAD, specifically designed for VAD in industrial scenarios. The\nindustrial processes in our dataset are chosen through on-site factory research\nand discussions with engineers. This dataset covers 16 different industrial\ndevices and contains over 6 hours of both synthetic and real-world video\nfootage. Moreover, we annotate the key feature of the industrial process, ie,\nperiodicity. Based on the proposed dataset, we introduce a period memory module\nand a sliding window inspection mechanism to effectively investigate the\nperiodic information in a basic reconstruction model. Our framework leverages\nLoRA adapter to explore the effective migration of pretrained models, which are\ninitially trained using synthetic data, into real-world scenarios. 
Our proposed\ndataset and method will fill the gap in the field of industrial video anomaly\ndetection and drive the process of video understanding tasks as well as smart\nfactory deployment.\n","authors":["Jinfan Liu","Yichao Yan","Junjie Li","Weiming Zhao","Pengzhi Chu","Xingdong Sheng","Yunhui Liu","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2404.15033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15028v1","updated":"2024-04-23T13:34:52Z","published":"2024-04-23T13:34:52Z","title":"PRISM: A Promptable and Robust Interactive Segmentation Model with\n Visual Prompts","summary":" In this paper, we present PRISM, a Promptable and Robust Interactive\nSegmentation Model, aiming for precise segmentation of 3D medical images. PRISM\naccepts various visual inputs, including points, boxes, and scribbles as sparse\nprompts, as well as masks as dense prompts. Specifically, PRISM is designed\nwith four principles to achieve robustness: (1) Iterative learning. The model\nproduces segmentations by using visual prompts from previous iterations to\nachieve progressive improvement. (2) Confidence learning. PRISM employs\nmultiple segmentation heads per input image, each generating a continuous map\nand a confidence score to optimize predictions. (3) Corrective learning.\nFollowing each segmentation iteration, PRISM employs a shallow corrective\nrefinement network to reassign mislabeled voxels. (4) Hybrid design. PRISM\nintegrates hybrid encoders to better capture both the local and global\ninformation. Comprehensive validation of PRISM is conducted using four public\ndatasets for tumor segmentation in the colon, pancreas, liver, and kidney,\nhighlighting challenges caused by anatomical variations and ambiguous\nboundaries in accurate tumor identification. Compared to state-of-the-art\nmethods, both with and without prompt engineering, PRISM significantly improves\nperformance, achieving results that are close to human levels. The code is\npublicly available at https://github.com/MedICL-VU/PRISM.\n","authors":["Hao Li","Han Liu","Dewei Hu","Jiacheng Wang","Ipek Oguz"],"pdf_url":"https://arxiv.org/pdf/2404.15028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07633v2","updated":"2024-04-23T13:33:26Z","published":"2023-10-11T16:28:24Z","title":"Attention-Map Augmentation for Hypercomplex Breast Cancer Classification","summary":" Breast cancer is the most widespread neoplasm among women and early detection\nof this disease is critical. Deep learning techniques have become of great\ninterest to improve diagnostic performance. However, distinguishing between\nmalignant and benign masses in whole mammograms poses a challenge, as they\nappear nearly identical to an untrained eye, and the region of interest (ROI)\nconstitutes only a small fraction of the entire image. In this paper, we\npropose a framework, parameterized hypercomplex attention maps (PHAM), to\novercome these problems. Specifically, we deploy an augmentation step based on\ncomputing attention maps. Then, the attention maps are used to condition the\nclassification step by constructing a multi-dimensional input comprised of the\noriginal breast cancer image and the corresponding attention map. In this step,\na parameterized hypercomplex neural network (PHNN) is employed to perform\nbreast cancer classification. The framework offers two main advantages. First,\nattention maps provide critical information regarding the ROI and allow the\nneural model to concentrate on it. 
Second, the hypercomplex architecture has\nthe ability to model local relations between input dimensions thanks to\nhypercomplex algebra rules, thus properly exploiting the information provided\nby the attention map. We demonstrate the efficacy of the proposed framework on\nboth mammography images as well as histopathological ones. We surpass\nattention-based state-of-the-art networks and the real-valued counterpart of\nour approach. The code of our work is available at\nhttps://github.com/ispamm/AttentionBCS.\n","authors":["Eleonora Lopez","Filippo Betello","Federico Carmignani","Eleonora Grassucci","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2310.07633v2.pdf","comment":"Published in Elsevier Pattern Recognition Letters"},{"id":"http://arxiv.org/abs/2404.15024v1","updated":"2024-04-23T13:32:29Z","published":"2024-04-23T13:32:29Z","title":"A Learning Paradigm for Interpretable Gradients","summary":" This paper studies interpretability of convolutional networks by means of\nsaliency maps. Most approaches based on Class Activation Maps (CAM) combine\ninformation from fully connected layers and gradient through variants of\nbackpropagation. However, it is well understood that gradients are noisy and\nalternatives like guided backpropagation have been proposed to obtain better\nvisualization at inference. In this work, we present a novel training approach\nto improve the quality of gradients for interpretability. In particular, we\nintroduce a regularization loss such that the gradient with respect to the\ninput image obtained by standard backpropagation is similar to the gradient\nobtained by guided backpropagation. We find that the resulting gradient is\nqualitatively less noisy and improves quantitatively the interpretability\nproperties of different networks, using several interpretability methods.\n","authors":["Felipe Torres Figueroa","Hanwei Zhang","Ronan Sicre","Yannis Avrithis","Stephane Ayache"],"pdf_url":"https://arxiv.org/pdf/2404.15024v1.pdf","comment":"VISAPP 2024"},{"id":"http://arxiv.org/abs/2404.15022v1","updated":"2024-04-23T13:31:18Z","published":"2024-04-23T13:31:18Z","title":"A review of deep learning-based information fusion techniques for\n multimodal medical image classification","summary":" Multimodal medical imaging plays a pivotal role in clinical diagnosis and\nresearch, as it combines information from various imaging modalities to provide\na more comprehensive understanding of the underlying pathology. Recently, deep\nlearning-based multimodal fusion techniques have emerged as powerful tools for\nimproving medical image classification. This review offers a thorough analysis\nof the developments in deep learning-based multimodal fusion for medical\nclassification tasks. We explore the complementary relationships among\nprevalent clinical modalities and outline three main fusion schemes for\nmultimodal classification networks: input fusion, intermediate fusion\n(encompassing single-level fusion, hierarchical fusion, and attention-based\nfusion), and output fusion. By evaluating the performance of these fusion\ntechniques, we provide insight into the suitability of different network\narchitectures for various multimodal fusion scenarios and application domains.\nFurthermore, we delve into challenges related to network architecture\nselection, handling incomplete multimodal data management, and the potential\nlimitations of multimodal fusion. 
Finally, we spotlight the promising future of\nTransformer-based multimodal fusion techniques and give recommendations for\nfuture research in this rapidly evolving field.\n","authors":["Yihao Li","Mostafa El Habib Daho","Pierre-Henri Conze","Rachid Zeghlache","Hugo Le Boité","Ramin Tadayoni","Béatrice Cochener","Mathieu Lamard","Gwenolé Quellec"],"pdf_url":"https://arxiv.org/pdf/2404.15022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13388v2","updated":"2024-04-23T13:25:01Z","published":"2024-04-20T14:15:25Z","title":"Diagnosis of Multiple Fundus Disorders Amidst a Scarcity of Medical\n Experts Via Self-supervised Machine Learning","summary":" Fundus diseases are major causes of visual impairment and blindness\nworldwide, especially in underdeveloped regions, where the shortage of\nophthalmologists hinders timely diagnosis. AI-assisted fundus image analysis\nhas several advantages, such as high accuracy, reduced workload, and improved\naccessibility, but it requires a large amount of expert-annotated data to build\nreliable models. To address this dilemma, we propose a general self-supervised\nmachine learning framework that can handle diverse fundus diseases from\nunlabeled fundus images. Our method's AUC surpasses existing supervised\napproaches by 15.7%, and even exceeds performance of a single human expert.\nFurthermore, our model adapts well to various datasets from different regions,\nraces, and heterogeneous image sources or qualities from multiple cameras or\ndevices. Our method offers a label-free general framework to diagnose fundus\ndiseases, which could potentially benefit telehealth programs for early\nscreening of people at risk of vision loss.\n","authors":["Yong Liu","Mengtian Kang","Shuo Gao","Chi Zhang","Ying Liu","Shiming Li","Yue Qi","Arokia Nathan","Wenjun Xu","Chenyu Tang","Edoardo Occhipinti","Mayinuer Yusufu","Ningli Wang","Weiling Bai","Luigi Occhipinti"],"pdf_url":"https://arxiv.org/pdf/2404.13388v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15014v1","updated":"2024-04-23T13:20:09Z","published":"2024-04-23T13:20:09Z","title":"OccGen: Generative Multi-modal 3D Occupancy Prediction for Autonomous\n Driving","summary":" Existing solutions for 3D semantic occupancy prediction typically treat the\ntask as a one-shot 3D voxel-wise segmentation perception problem. These\ndiscriminative methods focus on learning the mapping between the inputs and\noccupancy map in a single step, lacking the ability to gradually refine the\noccupancy map and the reasonable scene imaginative capacity to complete the\nlocal regions somewhere. In this paper, we introduce OccGen, a simple yet\npowerful generative perception model for the task of 3D semantic occupancy\nprediction. OccGen adopts a ''noise-to-occupancy'' generative paradigm,\nprogressively inferring and refining the occupancy map by predicting and\neliminating noise originating from a random Gaussian distribution. OccGen\nconsists of two main components: a conditional encoder that is capable of\nprocessing multi-modal inputs, and a progressive refinement decoder that\napplies diffusion denoising using the multi-modal features as conditions. A key\ninsight of this generative pipeline is that the diffusion denoising process is\nnaturally able to model the coarse-to-fine refinement of the dense 3D occupancy\nmap, therefore producing more detailed predictions. Extensive experiments on\nseveral occupancy benchmarks demonstrate the effectiveness of the proposed\nmethod compared to the state-of-the-art methods. 
For instance, OccGen\nrelatively enhances the mIoU by 9.5%, 6.3%, and 13.3% on nuScenes-Occupancy\ndataset under the multi-modal, LiDAR-only, and camera-only settings,\nrespectively. Moreover, as a generative perception model, OccGen exhibits\ndesirable properties that discriminative models cannot achieve, such as\nproviding uncertainty estimates alongside its multiple-step predictions.\n","authors":["Guoqing Wang","Zhongdao Wang","Pin Tang","Jilai Zheng","Xiangxuan Ren","Bailan Feng","Chao Ma"],"pdf_url":"https://arxiv.org/pdf/2404.15014v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15010v1","updated":"2024-04-23T13:15:35Z","published":"2024-04-23T13:15:35Z","title":"X-3D: Explicit 3D Structure Modeling for Point Cloud Recognition","summary":" Numerous prior studies predominantly emphasize constructing relation vectors\nfor individual neighborhood points and generating dynamic kernels for each\nvector and embedding these into high-dimensional spaces to capture implicit\nlocal structures. However, we contend that such implicit high-dimensional\nstructure modeling approach inadequately represents the local geometric\nstructure of point clouds due to the absence of explicit structural\ninformation. Hence, we introduce X-3D, an explicit 3D structure modeling\napproach. X-3D functions by capturing the explicit local structural information\nwithin the input 3D space and employing it to produce dynamic kernels with\nshared weights for all neighborhood points within the current local region.\nThis modeling approach introduces effective geometric prior and significantly\ndiminishes the disparity between the local structure of the embedding space and\nthe original input point cloud, thereby improving the extraction of local\nfeatures. Experiments show that our method can be used on a variety of methods\nand achieves state-of-the-art performance on segmentation, classification,\ndetection tasks with lower extra computational cost, such as \textbf{90.7\%} on\nScanObjectNN for classification, \textbf{79.2\%} on S3DIS 6 fold and\n\textbf{74.3\%} on S3DIS Area 5 for segmentation, \textbf{76.3\%} on ScanNetV2\nfor segmentation and \textbf{64.5\%} mAP , \textbf{46.9\%} mAP on SUN RGB-D and\n\textbf{69.0\%} mAP , \textbf{51.1\%} mAP on ScanNetV2 . Our code is available\nat\n\href{https://github.com/sunshuofeng/X-3D}{https://github.com/sunshuofeng/X-3D}.\n","authors":["Shuofeng Sun","Yongming Rao","Jiwen Lu","Haibin Yan"],"pdf_url":"https://arxiv.org/pdf/2404.15010v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15009v1","updated":"2024-04-23T13:15:22Z","published":"2024-04-23T13:15:22Z","title":"The Brain Tumor Segmentation in Pediatrics (BraTS-PEDs) Challenge: Focus\n on Pediatrics (CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs)","summary":" Pediatric tumors of the central nervous system are the most common cause of\ncancer-related death in children. The five-year survival rate for high-grade\ngliomas in children is less than 20%. Due to their rarity, the diagnosis of\nthese entities is often delayed, their treatment is mainly based on historic\ntreatment concepts, and clinical trials require multi-institutional\ncollaborations. Here we present the CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs\nchallenge, focused on pediatric brain tumors with data acquired across multiple\ninternational consortia dedicated to pediatric neuro-oncology and clinical\ntrials. 
The CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs challenge brings together\nclinicians and AI/imaging scientists to lead to faster development of automated\nsegmentation techniques that could benefit clinical trials, and ultimately the\ncare of children with brain tumors.\n","authors":["Anahita Fathi Kazerooni","Nastaran Khalili","Deep Gandhi","Xinyang Liu","Zhifan Jiang","Syed Muhammed Anwar","Jake Albrecht","Maruf Adewole","Udunna Anazodo","Hannah Anderson","Sina Bagheri","Ujjwal Baid","Timothy Bergquist","Austin J. Borja","Evan Calabrese","Verena Chung","Gian-Marco Conte","Farouk Dako","James Eddy","Ivan Ezhov","Ariana Familiar","Keyvan Farahani","Anurag Gottipati","Debanjan Haldar","Shuvanjan Haldar","Juan Eugenio Iglesias","Anastasia Janas","Elaine Johansen","Blaise V Jones","Neda Khalili","Florian Kofler","Dominic LaBella","Hollie Anne Lai","Koen Van Leemput","Hongwei Bran Li","Nazanin Maleki","Aaron S McAllister","Zeke Meier","Bjoern Menze","Ahmed W Moawad","Khanak K Nandolia","Julija Pavaine","Marie Piraud","Tina Poussaint","Sanjay P Prabhu","Zachary Reitman","Andres Rodriguez","Jeffrey D Rudie","Mariana Sanchez-Montano","Ibraheem Salman Shaikh","Lubdha M. Shah","Nakul Sheth","Russel Taki Shinohara","Wenxin Tu","Karthik Viswanathan","Chunhao Wang","Jeffrey B Ware","Benedikt Wiestler","Walter Wiggins","Anna Zapaishchykova","Mariam Aboian","Miriam Bornhorst","Peter de Blank","Michelle Deutsch","Maryam Fouladi","Lindsey Hoffman","Benjamin Kann","Margot Lazow","Leonie Mikael","Ali Nabavizadeh","Roger Packer","Spyridon Bakas","Adam Resnick","Brian Rood","Arastoo Vossough","Marius George Linguraru"],"pdf_url":"https://arxiv.org/pdf/2404.15009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15008v1","updated":"2024-04-23T13:15:07Z","published":"2024-04-23T13:15:07Z","title":"External Prompt Features Enhanced Parameter-efficient Fine-tuning for\n Salient Object Detection","summary":" Salient object detection (SOD) aims at finding the most salient objects in\nimages and outputs pixel-level binary masks. Transformer-based methods achieve\npromising performance due to their global semantic understanding, crucial for\nidentifying salient objects. However, these models tend to be large and require\nnumerous training parameters. To better harness the potential of transformers\nfor SOD, we propose a novel parameter-efficient fine-tuning method aimed at\nreducing the number of training parameters while enhancing the salient object\ndetection capability. Our model, termed EXternal Prompt features Enhanced\nadapteR Tuning (ExPert), features an encoder-decoder structure with adapters\nand injectors interspersed between the layers of a frozen transformer encoder.\nThe adapter modules adapt the pre-trained backbone to SOD while the injector\nmodules incorporate external prompt features to enhance the awareness of\nsalient objects. Comprehensive experiments demonstrate the superiority of our\nmethod. Surpassing former state-of-the-art (SOTA) models across five SOD\ndatasets, ExPert achieves 0.215 mean absolute error (MAE) in ECSSD dataset with\n80.2M trained parameters, 21% better than transformer-based SOTA model and 47%\nbetter than CNN-based SOTA model.\n","authors":["Wen Liang","Peipei Ran","Mengchao Bai","Xiao Liu","P. 
Bilha Githinji","Wei Zhao","Peiwu Qin"],"pdf_url":"https://arxiv.org/pdf/2404.15008v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14055v2","updated":"2024-04-23T13:04:44Z","published":"2024-04-22T10:11:31Z","title":"RingID: Rethinking Tree-Ring Watermarking for Enhanced Multi-Key\n Identification","summary":" We revisit Tree-Ring Watermarking, a recent diffusion model watermarking\nmethod that demonstrates great robustness to various attacks. We conduct an\nin-depth study on it and reveal that the distribution shift unintentionally\nintroduced by the watermarking process, apart from watermark pattern matching,\ncontributes to its exceptional robustness. Our investigation further exposes\ninherent flaws in its original design, particularly in its ability to identify\nmultiple distinct keys, where distribution shift offers no assistance. Based on\nthese findings and analysis, we present RingID for enhanced multi-key\nidentification. It consists of a novel multi-channel heterogeneous watermarking\napproach designed to seamlessly amalgamate distinctive advantages from diverse\nwatermarks. Coupled with a series of suggested enhancements, RingID exhibits\nsubstantial advancements in multi-key identification. Github Page:\nhttps://github.com/showlab/RingID\n","authors":["Hai Ci","Pei Yang","Yiren Song","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2404.14055v2.pdf","comment":"25 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.13896v2","updated":"2024-04-23T13:02:37Z","published":"2024-04-22T06:07:06Z","title":"CT-NeRF: Incremental Optimizing Neural Radiance Field and Poses with\n Complex Trajectory","summary":" Neural radiance field (NeRF) has achieved impressive results in high-quality\n3D scene reconstruction. However, NeRF heavily relies on precise camera poses.\nWhile recent works like BARF have introduced camera pose optimization within\nNeRF, their applicability is limited to simple trajectory scenes. Existing\nmethods struggle while tackling complex trajectories involving large rotations.\nTo address this limitation, we propose CT-NeRF, an incremental reconstruction\noptimization pipeline using only RGB images without pose and depth input. In\nthis pipeline, we first propose a local-global bundle adjustment under a pose\ngraph connecting neighboring frames to enforce the consistency between poses to\nescape the local minima caused by only pose consistency with the scene\nstructure. Further, we instantiate the consistency between poses as a\nreprojected geometric image distance constraint resulting from pixel-level\ncorrespondences between input image pairs. Through the incremental\nreconstruction, CT-NeRF enables the recovery of both camera poses and scene\nstructure and is capable of handling scenes with complex trajectories. We\nevaluate the performance of CT-NeRF on two real-world datasets, NeRFBuster and\nFree-Dataset, which feature complex trajectories. Results show CT-NeRF\noutperforms existing methods in novel view synthesis and pose estimation\naccuracy.\n","authors":["Yunlong Ran","Yanxu Li","Qi Ye","Yuchi Huo","Zechun Bai","Jiahao Sun","Jiming Chen"],"pdf_url":"https://arxiv.org/pdf/2404.13896v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14996v1","updated":"2024-04-23T12:57:35Z","published":"2024-04-23T12:57:35Z","title":"CA-Stream: Attention-based pooling for interpretable image recognition","summary":" Explanations obtained from transformer-based architectures in the form of raw\nattention, can be seen as a class-agnostic saliency map. 
Additionally,\nattention-based pooling serves as a form of masking in the feature space.\nMotivated by this observation, we design an attention-based pooling mechanism\nintended to replace Global Average Pooling (GAP) at inference. This mechanism,\ncalled Cross-Attention Stream (CA-Stream), comprises a stream of cross\nattention blocks interacting with features at different network depths.\nCA-Stream enhances interpretability in models, while preserving recognition\nperformance.\n","authors":["Felipe Torres","Hanwei Zhang","Ronan Sicre","Stéphane Ayache","Yannis Avrithis"],"pdf_url":"https://arxiv.org/pdf/2404.14996v1.pdf","comment":"CVPR XAI4CV workshop 2024"},{"id":"http://arxiv.org/abs/2401.03907v4","updated":"2024-04-23T12:48:23Z","published":"2024-01-08T14:10:24Z","title":"RoboFusion: Towards Robust Multi-Modal 3D Object Detection via SAM","summary":" Multi-modal 3D object detectors are dedicated to exploring secure and\nreliable perception systems for autonomous driving (AD). Although achieving\nstate-of-the-art (SOTA) performance on clean benchmark datasets, they tend to\noverlook the complexity and harsh conditions of real-world environments. With\nthe emergence of visual foundation models (VFMs), opportunities and challenges\nare presented for improving the robustness and generalization of multi-modal 3D\nobject detection in AD. Therefore, we propose RoboFusion, a robust framework\nthat leverages VFMs like SAM to tackle out-of-distribution (OOD) noise\nscenarios. We first adapt the original SAM for AD scenarios named SAM-AD. To\nalign SAM or SAM-AD with multi-modal methods, we then introduce AD-FPN for\nupsampling the image features extracted by SAM. We employ wavelet decomposition\nto denoise the depth-guided images for further noise reduction and weather\ninterference. At last, we employ self-attention mechanisms to adaptively\nreweight the fused features, enhancing informative features while suppressing\nexcess noise. In summary, RoboFusion significantly reduces noise by leveraging\nthe generalization and robustness of VFMs, thereby enhancing the resilience of\nmulti-modal 3D object detection. Consequently, RoboFusion achieves SOTA\nperformance in noisy scenarios, as demonstrated by the KITTI-C and nuScenes-C\nbenchmarks. Code is available at https://github.com/adept-thu/RoboFusion.\n","authors":["Ziying Song","Guoxing Zhang","Lin Liu","Lei Yang","Shaoqing Xu","Caiyan Jia","Feiyang Jia","Li Wang"],"pdf_url":"https://arxiv.org/pdf/2401.03907v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08671v2","updated":"2024-04-23T12:46:34Z","published":"2024-02-13T18:53:13Z","title":"Are Semi-Dense Detector-Free Methods Good at Matching Local Features?","summary":" Semi-dense detector-free approaches (SDF), such as LoFTR, are currently among\nthe most popular image matching methods. While SDF methods are trained to\nestablish correspondences between two images, their performances are almost\nexclusively evaluated using relative pose estimation metrics. Thus, the link\nbetween their ability to establish correspondences and the quality of the\nresulting estimated pose has thus far received little attention. This paper is\na first attempt to study this link. We start with proposing a novel structured\nattention-based image matching architecture (SAM). 
It allows us to show a\ncounter-intuitive result on two datasets (MegaDepth and HPatches): on the one\nhand SAM either outperforms or is on par with SDF methods in terms of\npose/homography estimation metrics, but on the other hand SDF approaches are\nsignificantly better than SAM in terms of matching accuracy. We then propose to\nlimit the computation of the matching accuracy to textured regions, and show\nthat in this case SAM often surpasses SDF methods. Our findings highlight a\nstrong correlation between the ability to establish accurate correspondences in\ntextured regions and the accuracy of the resulting estimated pose/homography.\nOur code will be made available.\n","authors":["Matthieu Vilain","Rémi Giraud","Hugo Germain","Guillaume Bourmaud"],"pdf_url":"https://arxiv.org/pdf/2402.08671v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14985v1","updated":"2024-04-23T12:42:07Z","published":"2024-04-23T12:42:07Z","title":"Other Tokens Matter: Exploring Global and Local Features of Vision\n Transformers for Object Re-Identification","summary":" Object Re-Identification (Re-ID) aims to identify and retrieve specific\nobjects from images captured at different places and times. Recently, object\nRe-ID has achieved great success with the advances of Vision Transformers\n(ViT). However, the effects of the global-local relation have not been fully\nexplored in Transformers for object Re-ID. In this work, we first explore the\ninfluence of global and local features of ViT and then further propose a novel\nGlobal-Local Transformer (GLTrans) for high-performance object Re-ID. We find\nthat the features from last few layers of ViT already have a strong\nrepresentational ability, and the global and local information can mutually\nenhance each other. Based on this fact, we propose a Global Aggregation Encoder\n(GAE) to utilize the class tokens of the last few Transformer layers and learn\ncomprehensive global features effectively. Meanwhile, we propose the Local\nMulti-layer Fusion (LMF) which leverages both the global cues from GAE and\nmulti-layer patch tokens to explore the discriminative local representations.\nExtensive experiments demonstrate that our proposed method achieves superior\nperformance on four object Re-ID benchmarks.\n","authors":["Yingquan Wang","Pingping Zhang","Dong Wang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2404.14985v1.pdf","comment":"Accepted by CVIU2024. More modifications may be performed"},{"id":"http://arxiv.org/abs/2404.14979v1","updated":"2024-04-23T12:36:24Z","published":"2024-04-23T12:36:24Z","title":"SGFormer: Spherical Geometry Transformer for 360 Depth Estimation","summary":" Panoramic distortion poses a significant challenge in 360 depth estimation,\nparticularly pronounced at the north and south poles. Existing methods either\nadopt a bi-projection fusion strategy to remove distortions or model long-range\ndependencies to capture global structures, which can result in either unclear\nstructure or insufficient local perception. In this paper, we propose a\nspherical geometry transformer, named SGFormer, to address the above issues,\nwith an innovative step to integrate spherical geometric priors into vision\ntransformers. To this end, we retarget the transformer decoder to a spherical\nprior decoder (termed SPDecoder), which endeavors to uphold the integrity of\nspherical structures during decoding. 
Concretely, we leverage bipolar\nre-projection, circular rotation, and curve local embedding to preserve the\nspherical characteristics of equidistortion, continuity, and surface distance,\nrespectively. Furthermore, we present a query-based global conditional position\nembedding to compensate for spatial structure at varying resolutions. It not\nonly boosts the global perception of spatial position but also sharpens the\ndepth structure across different patches. Finally, we conduct extensive\nexperiments on popular benchmarks, demonstrating our superiority over\nstate-of-the-art solutions.\n","authors":["Junsong Zhang","Zisong Chen","Chunyu Lin","Lang Nie","Zhijie Shen","Junda Huang","Yao Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.14979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14975v1","updated":"2024-04-23T12:30:17Z","published":"2024-04-23T12:30:17Z","title":"CAGE: Circumplex Affect Guided Expression Inference","summary":" Understanding emotions and expressions is a task of interest across multiple\ndisciplines, especially for improving user experiences. Contrary to the common\nperception, it has been shown that emotions are not discrete entities but\ninstead exist along a continuum. People understand discrete emotions\ndifferently due to a variety of factors, including cultural background,\nindividual experiences, and cognitive biases. Therefore, most approaches to\nexpression understanding, particularly those relying on discrete categories,\nare inherently biased. In this paper, we present a comparative in-depth\nanalysis of two common datasets (AffectNet and EMOTIC) equipped with the\ncomponents of the circumplex model of affect. Further, we propose a model for\nthe prediction of facial expressions tailored for lightweight applications.\nUsing a small-scaled MaxViT-based model architecture, we evaluate the impact of\ndiscrete expression category labels in training with the continuous valence and\narousal labels. We show that considering valence and arousal in addition to\ndiscrete category labels helps to significantly improve expression inference.\nThe proposed model outperforms the current state-of-the-art models on\nAffectNet, establishing it as the best-performing model for inferring valence\nand arousal achieving a 7% lower RMSE. Training scripts and trained weights to\nreproduce our results can be found here:\nhttps://github.com/wagner-niklas/CAGE_expression_inference.\n","authors":["Niklas Wagner","Felix Mätzler","Samed R. Vossberg","Helen Schneider","Svetlana Pavlitska","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2404.14975v1.pdf","comment":"Accepted for publication at ABAW Workshop at CVPR2024"},{"id":"http://arxiv.org/abs/2311.12631v3","updated":"2024-04-23T12:24:58Z","published":"2023-11-21T14:24:37Z","title":"GPT4Motion: Scripting Physical Motions in Text-to-Video Generation via\n Blender-Oriented GPT Planning","summary":" Recent advances in text-to-video generation have harnessed the power of\ndiffusion models to create visually compelling content conditioned on text\nprompts. However, they usually encounter high computational costs and often\nstruggle to produce videos with coherent physical motions. 
To tackle these\nissues, we propose GPT4Motion, a training-free framework that leverages the\nplanning capability of large language models such as GPT, the physical\nsimulation strength of Blender, and the excellent image generation ability of\ntext-to-image diffusion models to enhance the quality of video synthesis.\nSpecifically, GPT4Motion employs GPT-4 to generate a Blender script based on a\nuser textual prompt, which commands Blender's built-in physics engine to craft\nfundamental scene components that encapsulate coherent physical motions across\nframes. Then these components are inputted into Stable Diffusion to generate a\nvideo aligned with the textual prompt. Experimental results on three basic\nphysical motion scenarios, including rigid object drop and collision, cloth\ndraping and swinging, and liquid flow, demonstrate that GPT4Motion can generate\nhigh-quality videos efficiently in maintaining motion coherency and entity\nconsistency. GPT4Motion offers new insights in text-to-video research,\nenhancing its quality and broadening its horizon for further explorations.\n","authors":["Jiaxi Lv","Yi Huang","Mingfu Yan","Jiancheng Huang","Jianzhuang Liu","Yifan Liu","Yafei Wen","Xiaoxin Chen","Shifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2311.12631v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14968v1","updated":"2024-04-23T12:23:42Z","published":"2024-04-23T12:23:42Z","title":"CenterArt: Joint Shape Reconstruction and 6-DoF Grasp Estimation of\n Articulated Objects","summary":" Precisely grasping and reconstructing articulated objects is key to enabling\ngeneral robotic manipulation. In this paper, we propose CenterArt, a novel\napproach for simultaneous 3D shape reconstruction and 6-DoF grasp estimation of\narticulated objects. CenterArt takes RGB-D images of the scene as input and\nfirst predicts the shape and joint codes through an encoder. The decoder then\nleverages these codes to reconstruct 3D shapes and estimate 6-DoF grasp poses\nof the objects. We further develop a mechanism for generating a dataset of\n6-DoF grasp ground truth poses for articulated objects. CenterArt is trained on\nrealistic scenes containing multiple articulated objects with randomized\ndesigns, textures, lighting conditions, and realistic depths. We perform\nextensive experiments demonstrating that CenterArt outperforms existing methods\nin accuracy and robustness.\n","authors":["Sassan Mokhtar","Eugenio Chisari","Nick Heppert","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2404.14968v1.pdf","comment":"4 pages, 2 figures, accepted to the ICRA 2024 Workshop on 3D Visual\n Representations for Robot Manipulation"},{"id":"http://arxiv.org/abs/2404.14967v1","updated":"2024-04-23T12:22:32Z","published":"2024-04-23T12:22:32Z","title":"CoARF: Controllable 3D Artistic Style Transfer for Radiance Fields","summary":" Creating artistic 3D scenes can be time-consuming and requires specialized\nknowledge. To address this, recent works such as ARF, use a radiance\nfield-based approach with style constraints to generate 3D scenes that resemble\na style image provided by the user. However, these methods lack fine-grained\ncontrol over the resulting scenes. In this paper, we introduce Controllable\nArtistic Radiance Fields (CoARF), a novel algorithm for controllable 3D scene\nstylization. CoARF enables style transfer for specified objects, compositional\n3D style transfer and semantic-aware style transfer. 
We achieve controllability\nusing segmentation masks with different label-dependent loss functions. We also\npropose a semantic-aware nearest neighbor matching algorithm to improve the\nstyle transfer quality. Our extensive experiments demonstrate that CoARF\nprovides user-specified controllability of style transfer and superior style\ntransfer quality with more precise feature matching.\n","authors":["Deheng Zhang","Clara Fernandez-Labrador","Christopher Schroers"],"pdf_url":"https://arxiv.org/pdf/2404.14967v1.pdf","comment":"International Conference on 3D Vision 2024"},{"id":"http://arxiv.org/abs/2404.14966v1","updated":"2024-04-23T12:20:27Z","published":"2024-04-23T12:20:27Z","title":"Mamba3D: Enhancing Local Features for 3D Point Cloud Analysis via State\n Space Model","summary":" Existing Transformer-based models for point cloud analysis suffer from\nquadratic complexity, leading to compromised point cloud resolution and\ninformation loss. In contrast, the newly proposed Mamba model, based on state\nspace models (SSM), outperforms Transformer in multiple areas with only linear\ncomplexity. However, the straightforward adoption of Mamba does not achieve\nsatisfactory performance on point cloud tasks. In this work, we present\nMamba3D, a state space model tailored for point cloud learning to enhance local\nfeature extraction, achieving superior performance, high efficiency, and\nscalability potential. Specifically, we propose a simple yet effective Local\nNorm Pooling (LNP) block to extract local geometric features. Additionally, to\nobtain better global features, we introduce a bidirectional SSM (bi-SSM) with\nboth a token forward SSM and a novel backward SSM that operates on the feature\nchannel. Extensive experimental results show that Mamba3D surpasses\nTransformer-based counterparts and concurrent works in multiple tasks, with or\nwithout pre-training. Notably, Mamba3D achieves multiple SoTA, including an\noverall accuracy of 92.6% (train from scratch) on the ScanObjectNN and 95.1%\n(with single-modal pre-training) on the ModelNet40 classification task, with\nonly linear complexity.\n","authors":["Xu Han","Yuan Tang","Zhaoxuan Wang","Xianzhi Li"],"pdf_url":"https://arxiv.org/pdf/2404.14966v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.14956v1","updated":"2024-04-23T12:01:21Z","published":"2024-04-23T12:01:21Z","title":"DAWN: Domain-Adaptive Weakly Supervised Nuclei Segmentation via\n Cross-Task Interactions","summary":" Weakly supervised segmentation methods have gained significant attention due\nto their ability to reduce the reliance on costly pixel-level annotations\nduring model training. However, the current weakly supervised nuclei\nsegmentation approaches typically follow a two-stage pseudo-label generation\nand network training process. The performance of the nuclei segmentation\nheavily relies on the quality of the generated pseudo-labels, thereby limiting\nits effectiveness. This paper introduces a novel domain-adaptive weakly\nsupervised nuclei segmentation framework using cross-task interaction\nstrategies to overcome the challenge of pseudo-label generation. Specifically,\nwe utilize weakly annotated data to train an auxiliary detection task, which\nassists the domain adaptation of the segmentation network. To enhance the\nefficiency of domain adaptation, we design a consistent feature constraint\nmodule integrating prior knowledge from the source domain. 
Furthermore, we\ndevelop pseudo-label optimization and interactive training methods to improve\nthe domain transfer capability. To validate the effectiveness of our proposed\nmethod, we conduct extensive comparative and ablation experiments on six\ndatasets. The results demonstrate the superiority of our approach over existing\nweakly supervised approaches. Remarkably, our method achieves comparable or\neven better performance than fully supervised methods. Our code will be\nreleased in https://github.com/zhangye-zoe/DAWN.\n","authors":["Ye Zhang","Yifeng Wang","Zijie Fang","Hao Bian","Linghan Cai","Ziyue Wang","Yongbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.14956v1.pdf","comment":"13 pages, 11 figures, 8 tables"},{"id":"http://arxiv.org/abs/2404.14955v1","updated":"2024-04-23T12:00:20Z","published":"2024-04-23T12:00:20Z","title":"Traditional to Transformers: A Survey on Current Trends and Future\n Prospects for Hyperspectral Image Classification","summary":" Hyperspectral image classification is a challenging task due to the high\ndimensionality and complex nature of hyperspectral data. In recent years, deep\nlearning techniques have emerged as powerful tools for addressing these\nchallenges. This survey provides a comprehensive overview of the current trends\nand future prospects in hyperspectral image classification, focusing on the\nadvancements from deep learning models to the emerging use of transformers. We\nreview the key concepts, methodologies, and state-of-the-art approaches in deep\nlearning for hyperspectral image classification. Additionally, we discuss the\npotential of transformer-based models in this field and highlight the\nadvantages and challenges associated with these approaches. Comprehensive\nexperimental results have been undertaken using three Hyperspectral datasets to\nverify the efficacy of various conventional deep-learning models and\nTransformers. Finally, we outline future research directions and potential\napplications that can further enhance the accuracy and efficiency of\nhyperspectral image classification.\n The Source code is available at\nhttps://github.com/mahmad00/Conventional-to-Transformer-for-Hyperspectral-Image-Classification-Survey-2024.\n","authors":["Muhammad Ahmad","Salvatore Distifano","Manuel Mazzara","Adil Mehmood Khan"],"pdf_url":"https://arxiv.org/pdf/2404.14955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14952v1","updated":"2024-04-23T11:54:05Z","published":"2024-04-23T11:54:05Z","title":"Leveraging Speech for Gesture Detection in Multimodal Communication","summary":" Gestures are inherent to human interaction and often complement speech in\nface-to-face communication, forming a multimodal communication system. An\nimportant task in gesture analysis is detecting a gesture's beginning and end.\nResearch on automatic gesture detection has primarily focused on visual and\nkinematic information to detect a limited set of isolated or silent gestures\nwith low variability, neglecting the integration of speech and vision signals\nto detect gestures that co-occur with speech. This work addresses this gap by\nfocusing on co-speech gesture detection, emphasising the synchrony between\nspeech and co-speech hand gestures. We address three main challenges: the\nvariability of gesture forms, the temporal misalignment between gesture and\nspeech onsets, and differences in sampling rate between modalities. 
We\ninvestigate extended speech time windows and employ separate backbone models\nfor each modality to address the temporal misalignment and sampling rate\ndifferences. We utilize Transformer encoders in cross-modal and early fusion\ntechniques to effectively align and integrate speech and skeletal sequences.\nThe study results show that combining visual and speech information\nsignificantly enhances gesture detection performance. Our findings indicate\nthat expanding the speech buffer beyond visual time segments improves\nperformance and that multimodal integration using cross-modal and early fusion\ntechniques outperforms baseline methods using unimodal and late fusion methods.\nAdditionally, we find a correlation between the models' gesture prediction\nconfidence and low-level speech frequency features potentially associated with\ngestures. Overall, the study provides a better understanding and detection\nmethods for co-speech gestures, facilitating the analysis of multimodal\ncommunication.\n","authors":["Esam Ghaleb","Ilya Burenko","Marlou Rasenberg","Wim Pouw","Ivan Toni","Peter Uhrig","Anna Wilson","Judith Holler","Aslı Özyürek","Raquel Fernández"],"pdf_url":"https://arxiv.org/pdf/2404.14952v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14951v1","updated":"2024-04-23T11:53:51Z","published":"2024-04-23T11:53:51Z","title":"Streamlining the Image Stitching Pipeline: Integrating Fusion and\n Rectangling into a Unified Model","summary":" Learning-based image stitching techniques typically involve three distinct\nstages: registration, fusion, and rectangling. These stages are often performed\nsequentially, each trained independently, leading to potential cascading error\npropagation and complex parameter tuning challenges. In rethinking the\nmathematical modeling of the fusion and rectangling stages, we discovered that\nthese processes can be effectively combined into a single, variety-intensity\ninpainting problem. Therefore, we propose the Simple and Robust Stitcher\n(SRStitcher), an efficient training-free image stitching method that merges the\nfusion and rectangling stages into a unified model. By employing the weighted\nmask and large-scale generative model, SRStitcher can solve the fusion and\nrectangling problems in a single inference, without additional training or\nfine-tuning of other models. Our method not only simplifies the stitching\npipeline but also enhances fault tolerance towards misregistration errors.\nExtensive experiments demonstrate that SRStitcher outperforms state-of-the-art\n(SOTA) methods in both quantitative assessments and qualitative evaluations.\nThe code is released at https://github.com/yayoyo66/SRStitcher\n","authors":["Ziqi Xie"],"pdf_url":"https://arxiv.org/pdf/2404.14951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14949v1","updated":"2024-04-23T11:45:32Z","published":"2024-04-23T11:45:32Z","title":"Multi-Modal Prompt Learning on Blind Image Quality Assessment","summary":" Image Quality Assessment (IQA) models benefit significantly from semantic\ninformation, which allows them to treat different types of objects distinctly.\nCurrently, leveraging semantic information to enhance IQA is a crucial research\ndirection. Traditional methods, hindered by a lack of sufficiently annotated\ndata, have employed the CLIP image-text pretraining model as their backbone to\ngain semantic awareness. However, the generalist nature of these pre-trained\nVision-Language (VL) models often renders them suboptimal for IQA-specific\ntasks. 
Recent approaches have attempted to address this mismatch using prompt\ntechnology, but these solutions have shortcomings. Existing prompt-based VL\nmodels overly focus on incremental semantic information from text, neglecting\nthe rich insights available from visual data analysis. This imbalance limits\ntheir performance improvements in IQA tasks. This paper introduces an\ninnovative multi-modal prompt-based methodology for IQA. Our approach employs\ncarefully crafted prompts that synergistically mine incremental semantic\ninformation from both visual and linguistic data. Specifically, in the visual\nbranch, we introduce a multi-layer prompt structure to enhance the VL model's\nadaptability. In the text branch, we deploy a dual-prompt scheme that steers\nthe model to recognize and differentiate between scene category and distortion\ntype, thereby refining the model's capacity to assess image quality. Our\nexperimental findings underscore the effectiveness of our method over existing\nBlind Image Quality Assessment (BIQA) approaches. Notably, it demonstrates\ncompetitive performance across various datasets. Our method achieves Spearman\nRank Correlation Coefficient (SRCC) values of 0.961(surpassing 0.946 in CSIQ)\nand 0.941 (exceeding 0.930 in KADID), illustrating its robustness and accuracy\nin diverse contexts.\n","authors":["Wensheng Pan","Timin Gao","Yan Zhang","Runze Hu","Xiawu Zheng","Enwei Zhang","Yuting Gao","Yutao Liu","Yunhang Shen","Ke Li","Shengchuan Zhang","Liujuan Cao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2404.14949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09158v2","updated":"2024-04-23T11:45:29Z","published":"2024-04-14T06:19:46Z","title":"StreakNet-Arch: An Anti-scattering Network-based Architecture for\n Underwater Carrier LiDAR-Radar Imaging","summary":" In this paper, we introduce StreakNet-Arch, a novel signal processing\narchitecture designed for Underwater Carrier LiDAR-Radar (UCLR) imaging\nsystems, to address the limitations in scatter suppression and real-time\nimaging. StreakNet-Arch formulates the signal processing as a real-time,\nend-to-end binary classification task, enabling real-time image acquisition. To\nachieve this, we leverage Self-Attention networks and propose a novel Double\nBranch Cross Attention (DBC-Attention) mechanism that surpasses the performance\nof traditional methods. Furthermore, we present a method for embedding\nstreak-tube camera images into attention networks, effectively acting as a\nlearned bandpass filter. To facilitate further research, we contribute a\npublicly available streak-tube camera image dataset. The dataset contains\n2,695,168 real-world underwater 3D point cloud data. These advancements\nsignificantly improve UCLR capabilities, enhancing its performance and\napplicability in underwater imaging tasks. The source code and dataset can be\nfound at https://github.com/BestAnHongjun/StreakNet .\n","authors":["Xuelong Li","Hongjun An","Guangying Li","Xing Wang","Guanghua Cheng","Zhe Sun"],"pdf_url":"https://arxiv.org/pdf/2404.09158v2.pdf","comment":"Reduce the number of pages to 13"},{"id":"http://arxiv.org/abs/2404.14945v1","updated":"2024-04-23T11:41:19Z","published":"2024-04-23T11:41:19Z","title":"Pyramid Hierarchical Transformer for Hyperspectral Image Classification","summary":" The traditional Transformer model encounters challenges with variable-length\ninput sequences, particularly in Hyperspectral Image Classification (HSIC),\nleading to efficiency and scalability concerns. 
To overcome this, we propose a\npyramid-based hierarchical transformer (PyFormer). This innovative approach\norganizes input data hierarchically into segments, each representing distinct\nabstraction levels, thereby enhancing processing efficiency for lengthy\nsequences. At each level, a dedicated transformer module is applied,\neffectively capturing both local and global context. Spatial and spectral\ninformation flow within the hierarchy facilitates communication and abstraction\npropagation. Integration of outputs from different levels culminates in the\nfinal input representation. Experimental results underscore the superiority of\nthe proposed method over traditional approaches. Additionally, the\nincorporation of disjoint samples augments robustness and reliability, thereby\nhighlighting the potential of our approach in advancing HSIC.\n The source code is available at https://github.com/mahmad00/PyFormer.\n","authors":["Muhammad Ahmad","Muhammad Hassaan Farooq Butt","Manuel Mazzara","Salvatore Distifano"],"pdf_url":"https://arxiv.org/pdf/2404.14945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14944v1","updated":"2024-04-23T11:40:52Z","published":"2024-04-23T11:40:52Z","title":"Importance of Disjoint Sampling in Conventional and Transformer Models\n for Hyperspectral Image Classification","summary":" Disjoint sampling is critical for rigorous and unbiased evaluation of\nstate-of-the-art (SOTA) models. When training, validation, and test sets\noverlap or share data, it introduces a bias that inflates performance metrics\nand prevents accurate assessment of a model's true ability to generalize to new\nexamples. This paper presents an innovative disjoint sampling approach for\ntraining SOTA models on Hyperspectral image classification (HSIC) tasks. By\nseparating training, validation, and test data without overlap, the proposed\nmethod facilitates a fairer evaluation of how well a model can classify pixels\nit was not exposed to during training or validation. Experiments demonstrate\nthe approach significantly improves a model's generalization compared to\nalternatives that include training and validation data in test data. By\neliminating data leakage between sets, disjoint sampling provides reliable\nmetrics for benchmarking progress in HSIC. Researchers can have confidence that\nreported performance truly reflects a model's capabilities for classifying new\nscenes, not just memorized pixels. This rigorous methodology is critical for\nadvancing SOTA models and their real-world application to large-scale land\nmapping with Hyperspectral sensors.\n The source code is available at\nhttps://github.com/mahmad00/Disjoint-Sampling-for-Hyperspectral-Image-Classification.\n","authors":["Muhammad Ahmad","Manuel Mazzara","Salvatore Distifano"],"pdf_url":"https://arxiv.org/pdf/2404.14944v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14934v1","updated":"2024-04-23T11:22:59Z","published":"2024-04-23T11:22:59Z","title":"G3R: Generating Rich and Fine-grained mmWave Radar Data from 2D Videos\n for Generalized Gesture Recognition","summary":" Millimeter wave radar is gaining traction recently as a promising modality\nfor enabling pervasive and privacy-preserving gesture recognition. However, the\nlack of rich and fine-grained radar datasets hinders progress in developing\ngeneralized deep learning models for gesture recognition across various user\npostures (e.g., standing, sitting), positions, and scenes. 
To remedy this, we\nresort to designing a software pipeline that exploits wealthy 2D videos to\ngenerate realistic radar data, but it needs to address the challenge of\nsimulating diversified and fine-grained reflection properties of user gestures.\nTo this end, we design G3R with three key components: (i) a gesture reflection\npoint generator expands the arm's skeleton points to form human reflection\npoints; (ii) a signal simulation model simulates the multipath reflection and\nattenuation of radar signals to output the human intensity map; (iii) an\nencoder-decoder model combines a sampling module and a fitting module to\naddress the differences in number and distribution of points between generated\nand real-world radar data for generating realistic radar data. We implement and\nevaluate G3R using 2D videos from public data sources and self-collected\nreal-world radar data, demonstrating its superiority over other\nstate-of-the-art approaches for gesture recognition.\n","authors":["Kaikai Deng","Dong Zhao","Wenxin Zheng","Yue Ling","Kangwen Yin","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2404.14934v1.pdf","comment":"18 pages, 29 figures"},{"id":"http://arxiv.org/abs/2404.14908v1","updated":"2024-04-23T10:51:15Z","published":"2024-04-23T10:51:15Z","title":"Mining Supervision for Dynamic Regions in Self-Supervised Monocular\n Depth Estimation","summary":" This paper focuses on self-supervised monocular depth estimation in dynamic\nscenes trained on monocular videos. Existing methods jointly estimate\npixel-wise depth and motion, relying mainly on an image reconstruction loss.\nDynamic regions1 remain a critical challenge for these methods due to the\ninherent ambiguity in depth and motion estimation, resulting in inaccurate\ndepth estimation. This paper proposes a self-supervised training framework\nexploiting pseudo depth labels for dynamic regions from training data. The key\ncontribution of our framework is to decouple depth estimation for static and\ndynamic regions of images in the training data. We start with an unsupervised\ndepth estimation approach, which provides reliable depth estimates for static\nregions and motion cues for dynamic regions and allows us to extract moving\nobject information at the instance level. In the next stage, we use an object\nnetwork to estimate the depth of those moving objects assuming rigid motions.\nThen, we propose a new scale alignment module to address the scale ambiguity\nbetween estimated depths for static and dynamic regions. We can then use the\ndepth labels generated to train an end-to-end depth estimation network and\nimprove its performance. Extensive experiments on the Cityscapes and KITTI\ndatasets show that our self-training strategy consistently outperforms existing\nself/unsupervised depth estimation methods.\n","authors":["Hoang Chuong Nguyen","Tianyu Wang","Jose M. Alvarez","Miaomiao Liu"],"pdf_url":"https://arxiv.org/pdf/2404.14908v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2401.16386v2","updated":"2024-04-23T10:44:01Z","published":"2024-01-29T18:27:52Z","title":"Continual Learning with Pre-Trained Models: A Survey","summary":" Nowadays, real-world applications often face streaming data, which requires\nthe learning system to absorb new knowledge as data evolves. Continual Learning\n(CL) aims to achieve this goal and meanwhile overcome the catastrophic\nforgetting of former knowledge when learning new ones. Typical CL methods build\nthe model from scratch to grow with incoming data. 
However, the advent of the\npre-trained model (PTM) era has sparked immense research interest, particularly\nin leveraging PTMs' robust representational capabilities. This paper presents a\ncomprehensive survey of the latest advancements in PTM-based CL. We categorize\nexisting methodologies into three distinct groups, providing a comparative\nanalysis of their similarities, differences, and respective advantages and\ndisadvantages. Additionally, we offer an empirical study contrasting various\nstate-of-the-art methods to highlight concerns regarding fairness in\ncomparisons. The source code to reproduce these evaluations is available at:\nhttps://github.com/sun-hailong/LAMDA-PILOT\n","authors":["Da-Wei Zhou","Hai-Long Sun","Jingyi Ning","Han-Jia Ye","De-Chuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2401.16386v2.pdf","comment":"Accepted to IJCAI 2024. Code is available at:\n https://github.com/sun-hailong/LAMDA-PILOT"},{"id":"http://arxiv.org/abs/2404.14906v1","updated":"2024-04-23T10:42:24Z","published":"2024-04-23T10:42:24Z","title":"Driver Activity Classification Using Generalizable Representations from\n Vision-Language Models","summary":" Driver activity classification is crucial for ensuring road safety, with\napplications ranging from driver assistance systems to autonomous vehicle\ncontrol transitions. In this paper, we present a novel approach leveraging\ngeneralizable representations from vision-language models for driver activity\nclassification. Our method employs a Semantic Representation Late Fusion Neural\nNetwork (SRLF-Net) to process synchronized video frames from multiple\nperspectives. Each frame is encoded using a pretrained vision-language encoder,\nand the resulting embeddings are fused to generate class probability\npredictions. By leveraging contrastively-learned vision-language\nrepresentations, our approach achieves robust performance across diverse driver\nactivities. We evaluate our method on the Naturalistic Driving Action\nRecognition Dataset, demonstrating strong accuracy across many classes. Our\nresults suggest that vision-language representations offer a promising avenue\nfor driver monitoring systems, providing both accuracy and interpretability\nthrough natural language descriptors.\n","authors":["Ross Greer","Mathias Viborg Andersen","Andreas Møgelmose","Mohan Trivedi"],"pdf_url":"https://arxiv.org/pdf/2404.14906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14890v1","updated":"2024-04-23T10:17:42Z","published":"2024-04-23T10:17:42Z","title":"DENOISER: Rethinking the Robustness for Open-Vocabulary Action\n Recognition","summary":" As one of the fundamental video tasks in computer vision, Open-Vocabulary\nAction Recognition (OVAR) recently gains increasing attention, with the\ndevelopment of vision-language pre-trainings. To enable generalization of\narbitrary classes, existing methods treat class labels as text descriptions,\nthen formulate OVAR as evaluating embedding similarity between visual samples\nand textual classes. However, one crucial issue is completely ignored: the\nclass descriptions given by users may be noisy, e.g., misspellings and typos,\nlimiting the real-world practicality of vanilla OVAR. To fill the research gap,\nthis paper pioneers to evaluate existing methods by simulating multi-level\nnoises of various types, and reveals their poor robustness. To tackle the noisy\nOVAR task, we further propose one novel DENOISER framework, covering two parts:\ngeneration and discrimination. 
Concretely, the generative part denoises noisy\nclass-text names via one decoding process, i.e., propose text candidates, then\nutilize inter-modal and intra-modal information to vote for the best. At the\ndiscriminative part, we use vanilla OVAR models to assign visual samples to\nclass-text names, thus obtaining more semantics. For optimization, we\nalternately iterate between generative and discriminative parts for progressive\nrefinements. The denoised text classes help OVAR models classify visual samples\nmore accurately; in return, classified visual samples help better denoising. On\nthree datasets, we carry out extensive experiments to show our superior\nrobustness, and thorough ablations to dissect the effectiveness of each\ncomponent.\n","authors":["Haozhe Cheng","Cheng Ju","Haicheng Wang","Jinxiang Liu","Mengting Chen","Qiang Hu","Xiaoyun Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.14890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14885v1","updated":"2024-04-23T10:13:31Z","published":"2024-04-23T10:13:31Z","title":"Domain adaptive pose estimation via multi-level alignment","summary":" Domain adaptive pose estimation aims to enable deep models trained on source\ndomain (synthesized) datasets to produce similar results on the target domain\n(real-world) datasets. The existing methods have made significant progress by\nconducting image-level or feature-level alignment. However, only aligning at a\nsingle level is not sufficient to fully bridge the domain gap and achieve\nexcellent domain adaptive results. In this paper, we propose a multi-level\ndomain adaptation approach, which aligns different domains at the image,\nfeature, and pose levels. Specifically, we first utilize image style transfer to\nensure that images from the source and target domains have a similar\ndistribution. Subsequently, at the feature level, we employ adversarial\ntraining to make the features from the source and target domains preserve\ndomain-invariant characteristics as much as possible. Finally, at the pose\nlevel, a self-supervised approach is utilized to enable the model to learn\ndiverse knowledge, implicitly addressing the domain gap. Experimental results\ndemonstrate that significant improvement can be achieved by the proposed\nmulti-level alignment method in pose estimation, which outperforms previous\nstate-of-the-art in human pose by up to 2.4% and animal pose estimation by up\nto 3.1% for dogs and 1.4% for sheep.\n","authors":["Yugan Chen","Lin Zhao","Yalong Xu","Honglei Zu","Xiaoqi An","Guangyu Li"],"pdf_url":"https://arxiv.org/pdf/2404.14885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14882v1","updated":"2024-04-23T10:09:32Z","published":"2024-04-23T10:09:32Z","title":"A sensitivity analysis to quantify the impact of neuroimaging\n preprocessing strategies on subsequent statistical analyses","summary":" Even though novel imaging techniques have been successful in studying brain\nstructure and function, the measured biological signals are often contaminated\nby multiple sources of noise, arising due to e.g. head movements of the\nindividual being scanned, limited spatial/temporal resolution, or other issues\nspecific to each imaging technology. Data preprocessing (e.g. denoising) is\ntherefore critical. Preprocessing pipelines have become increasingly complex\nover the years, but also more flexible, and this flexibility can have a\nsignificant impact on the final results and conclusions of a given study. 
This\nlarge parameter space is often referred to as multiverse analyses. Here, we\nprovide conceptual and practical tools for statistical analyses that can\naggregate multiple pipeline results along with a new sensitivity analysis\ntesting for hypotheses across pipelines such as \"no effect across all\npipelines\" or \"at least one pipeline with no effect\". The proposed framework is\ngeneric and can be applied to any multiverse scenario, but we illustrate its\nuse based on positron emission tomography data.\n","authors":["Brize Ozenne","Martin Norgaard","Cyril Pernet","Melanie Ganz"],"pdf_url":"https://arxiv.org/pdf/2404.14882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09965v3","updated":"2024-04-23T10:03:59Z","published":"2023-10-15T21:54:45Z","title":"ProteusNeRF: Fast Lightweight NeRF Editing using 3D-Aware Image Context","summary":" Neural Radiance Fields (NeRFs) have recently emerged as a popular option for\nphoto-realistic object capture due to their ability to faithfully capture\nhigh-fidelity volumetric content even from handheld video input. Although much\nresearch has been devoted to efficient optimization leading to real-time\ntraining and rendering, options for interactive editing NeRFs remain limited.\nWe present a very simple but effective neural network architecture that is fast\nand efficient while maintaining a low memory footprint. This architecture can\nbe incrementally guided through user-friendly image-based edits. Our\nrepresentation allows straightforward object selection via semantic feature\ndistillation at the training stage. More importantly, we propose a local\n3D-aware image context to facilitate view-consistent image editing that can\nthen be distilled into fine-tuned NeRFs, via geometric and appearance\nadjustments. We evaluate our setup on a variety of examples to demonstrate\nappearance and geometric edits and report 10-30x speedup over concurrent work\nfocusing on text-guided NeRF editing. Video results can be seen on our project\nwebpage at https://proteusnerf.github.io.\n","authors":["Binglun Wang","Niladri Shekhar Dutt","Niloy J. Mitra"],"pdf_url":"https://arxiv.org/pdf/2310.09965v3.pdf","comment":"Accepted at I3D'24 (ACM SIGGRAPH SYMPOSIUM ON INTERACTIVE 3D GRAPHICS\n AND GAMES)"},{"id":"http://arxiv.org/abs/2403.10558v2","updated":"2024-04-23T09:59:42Z","published":"2024-03-14T02:17:57Z","title":"Adaptive Hybrid Masking Strategy for Privacy-Preserving Face Recognition\n Against Model Inversion Attack","summary":" The utilization of personal sensitive data in training face recognition (FR)\nmodels poses significant privacy concerns, as adversaries can employ model\ninversion attacks (MIA) to infer the original training data. Existing defense\nmethods, such as data augmentation and differential privacy, have been employed\nto mitigate this issue. However, these methods often fail to strike an optimal\nbalance between privacy and accuracy. To address this limitation, this paper\nintroduces an adaptive hybrid masking algorithm against MIA. Specifically, face\nimages are masked in the frequency domain using an adaptive MixUp strategy.\nUnlike the traditional MixUp algorithm, which is predominantly used for data\naugmentation, our modified approach incorporates frequency domain mixing.\nPrevious studies have shown that increasing the number of images mixed in MixUp\ncan enhance privacy preservation but at the expense of reduced face recognition\naccuracy. 
To overcome this trade-off, we develop an enhanced adaptive MixUp\nstrategy based on reinforcement learning, which enables us to mix a larger\nnumber of images while maintaining satisfactory recognition accuracy. To\noptimize privacy protection, we propose maximizing the reward function (i.e.,\nthe loss function of the FR system) during the training of the strategy\nnetwork. While the loss function of the FR network is minimized in the phase of\ntraining the FR network. The strategy network and the face recognition network\ncan be viewed as antagonistic entities in the training process, ultimately\nreaching a more balanced trade-off. Experimental results demonstrate that our\nproposed hybrid masking scheme outperforms existing defense algorithms in terms\nof privacy preservation and recognition accuracy against MIA.\n","authors":["Yinggui Wang","Yuanqing Huang","Jianshu Li","Le Yang","Kai Song","Lei Wang"],"pdf_url":"https://arxiv.org/pdf/2403.10558v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09812v2","updated":"2024-04-23T09:53:42Z","published":"2024-02-15T09:21:16Z","title":"DreamMatcher: Appearance Matching Self-Attention for\n Semantically-Consistent Text-to-Image Personalization","summary":" The objective of text-to-image (T2I) personalization is to customize a\ndiffusion model to a user-provided reference concept, generating diverse images\nof the concept aligned with the target prompts. Conventional methods\nrepresenting the reference concepts using unique text embeddings often fail to\naccurately mimic the appearance of the reference. To address this, one solution\nmay be explicitly conditioning the reference images into the target denoising\nprocess, known as key-value replacement. However, prior works are constrained\nto local editing since they disrupt the structure path of the pre-trained T2I\nmodel. To overcome this, we propose a novel plug-in method, called\nDreamMatcher, which reformulates T2I personalization as semantic matching.\nSpecifically, DreamMatcher replaces the target values with reference values\naligned by semantic matching, while leaving the structure path unchanged to\npreserve the versatile capability of pre-trained T2I models for generating\ndiverse structures. We also introduce a semantic-consistent masking strategy to\nisolate the personalized concept from irrelevant regions introduced by the\ntarget prompts. Compatible with existing T2I models, DreamMatcher shows\nsignificant improvements in complex scenarios. Intensive analyses demonstrate\nthe effectiveness of our approach.\n","authors":["Jisu Nam","Heesu Kim","DongJae Lee","Siyoon Jin","Seungryong Kim","Seunggyu Chang"],"pdf_url":"https://arxiv.org/pdf/2402.09812v2.pdf","comment":"Project page is available at https://ku-cvlab.github.io/DreamMatcher/"},{"id":"http://arxiv.org/abs/2402.15300v2","updated":"2024-04-23T09:32:25Z","published":"2024-02-23T12:57:16Z","title":"Seeing is Believing: Mitigating Hallucination in Large Vision-Language\n Models via CLIP-Guided Decoding","summary":" Large Vision-Language Models (LVLMs) are susceptible to object\nhallucinations, an issue in which their generated text contains non-existent\nobjects, greatly limiting their reliability and practicality. Current\napproaches often rely on the model's token likelihoods or other internal\ninformation, instruction tuning on additional datasets, or incorporating\ncomplex external tools. 
We first perform empirical analysis on sentence-level\nLVLM hallucination, finding that CLIP similarity to the image acts as a\nstronger and more robust indicator of hallucination compared to token\nlikelihoods. Motivated by this, we introduce our CLIP-Guided Decoding (CGD)\napproach, a straightforward but effective training-free approach to reduce\nobject hallucination at decoding time. CGD uses CLIP to guide the model's\ndecoding process by enhancing visual grounding of generated text with the\nimage. Experiments demonstrate that CGD effectively mitigates object\nhallucination across multiple LVLM families while preserving the utility of\ntext generation. Codes are available at\nhttps://github.com/d-ailin/CLIP-Guided-Decoding.\n","authors":["Ailin Deng","Zhirui Chen","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2402.15300v2.pdf","comment":"Code URL: https://github.com/d-ailin/CLIP-Guided-Decoding"},{"id":"http://arxiv.org/abs/2401.01454v2","updated":"2024-04-23T09:08:11Z","published":"2024-01-02T22:35:33Z","title":"A Survey on Autonomous Driving Datasets: Statistics, Annotation Quality,\n and a Future Outlook","summary":" Autonomous driving has rapidly developed and shown promising performance due\nto recent advances in hardware and deep learning techniques. High-quality\ndatasets are fundamental for developing reliable autonomous driving algorithms.\nPrevious dataset surveys either focused on a limited number or lacked detailed\ninvestigation of dataset characteristics. To this end, we present an exhaustive\nstudy of 265 autonomous driving datasets from multiple perspectives, including\nsensor modalities, data size, tasks, and contextual conditions. We introduce a\nnovel metric to evaluate the impact of datasets, which can also be a guide for\ncreating new datasets. Besides, we analyze the annotation processes, existing\nlabeling tools, and the annotation quality of datasets, showing the importance\nof establishing a standard annotation pipeline. On the other hand, we\nthoroughly analyze the impact of geographical and adversarial environmental\nconditions on the performance of autonomous driving systems. Moreover, we\nexhibit the data distribution of several vital datasets and discuss their pros\nand cons accordingly. Finally, we discuss the current challenges and the\ndevelopment trend of the future autonomous driving datasets.\n","authors":["Mingyu Liu","Ekim Yurtsever","Jonathan Fossaert","Xingcheng Zhou","Walter Zimmer","Yuning Cui","Bare Luka Zagar","Alois C. Knoll"],"pdf_url":"https://arxiv.org/pdf/2401.01454v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14852v1","updated":"2024-04-23T09:07:04Z","published":"2024-04-23T09:07:04Z","title":"Ultrasound Nodule Segmentation Using Asymmetric Learning with Simple\n Clinical Annotation","summary":" Recent advances in deep learning have greatly facilitated the automated\nsegmentation of ultrasound images, which is essential for nodule morphological\nanalysis. Nevertheless, most existing methods depend on extensive and precise\nannotations by domain experts, which are labor-intensive and time-consuming. In\nthis study, we suggest using simple aspect ratio annotations directly from\nultrasound clinical diagnoses for automated nodule segmentation. 
Especially, an\nasymmetric learning framework is developed by extending the aspect ratio\nannotations with two types of pseudo labels, i.e., conservative labels and\nradical labels, to train two asymmetric segmentation networks simultaneously.\nSubsequently, a conservative-radical-balance strategy (CRBS) strategy is\nproposed to complementally combine radical and conservative labels. An\ninconsistency-aware dynamically mixed pseudo-labels supervision (IDMPS) module\nis introduced to address the challenges of over-segmentation and\nunder-segmentation caused by the two types of labels. To further leverage the\nspatial prior knowledge provided by clinical annotations, we also present a\nnovel loss function namely the clinical anatomy prior loss. Extensive\nexperiments on two clinically collected ultrasound datasets (thyroid and\nbreast) demonstrate the superior performance of our proposed method, which can\nachieve comparable and even better performance than fully supervised methods\nusing ground truth annotations.\n","authors":["Xingyue Zhao","Zhongyu Li","Xiangde Luo","Peiqi Li","Peng Huang","Jianwei Zhu","Yang Liu","Jihua Zhu","Meng Yang","Shi Chang","Jun Dong"],"pdf_url":"https://arxiv.org/pdf/2404.14852v1.pdf","comment":"Accepted by TCSVT"},{"id":"http://arxiv.org/abs/2312.04233v3","updated":"2024-04-23T08:59:25Z","published":"2023-12-07T11:39:11Z","title":"Fine-tuning vision foundation model for crack segmentation in civil\n infrastructures","summary":" Large-scale foundation models have become the mainstream deep learning\nmethod, while in civil engineering, the scale of AI models is strictly limited.\nIn this work, a vision foundation model is introduced for crack segmentation.\nTwo parameter-efficient fine-tuning methods, adapter and low-rank adaptation,\nare adopted to fine-tune the foundation model in semantic segmentation: the\nSegment Anything Model (SAM). The fine-tuned CrackSAM shows excellent\nperformance on different scenes and materials. To test the zero-shot\nperformance of the proposed method, two unique datasets related to road and\nexterior wall cracks are collected, annotated and open-sourced, for a total of\n810 images. Comparative experiments are conducted with twelve mature semantic\nsegmentation models. On datasets with artificial noise and previously unseen\ndatasets, the performance of CrackSAM far exceeds that of all state-of-the-art\nmodels. CrackSAM exhibits remarkable superiority, particularly under\nchallenging conditions such as dim lighting, shadows, road markings,\nconstruction joints, and other interference factors. These cross-scenario\nresults demonstrate the outstanding zero-shot capability of foundation models\nand provide new ideas for developing vision models in civil engineering.\n","authors":["Kang Ge","Chen Wang","Yutao Guo","Yansong Tang","Zhenzhong Hu","Hongbing Chen"],"pdf_url":"https://arxiv.org/pdf/2312.04233v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14837v1","updated":"2024-04-23T08:43:32Z","published":"2024-04-23T08:43:32Z","title":"Ultrasound SAM Adapter: Adapting SAM for Breast Lesion Segmentation in\n Ultrasound Images","summary":" Segment Anything Model (SAM) has recently achieved amazing results in the\nfield of natural image segmentation. However, it is not effective for medical\nimage segmentation, owing to the large domain gap between natural and medical\nimages. In this paper, we mainly focus on ultrasound image segmentation. 
As we\nknow that it is very difficult to train a foundation model for ultrasound image\ndata due to the lack of large-scale annotated ultrasound image data. To address\nthese issues, in this paper, we develop a novel Breast Ultrasound SAM Adapter,\ntermed Breast Ultrasound Segment Anything Model (BUSSAM), which migrates the\nSAM to the field of breast ultrasound image segmentation by using the adapter\ntechnique. To be specific, we first design a novel CNN image encoder, which is\nfully trained on the BUS dataset. Our CNN image encoder is more lightweight,\nand focuses more on features of local receptive field, which provides the\ncomplementary information to the ViT branch in SAM. Then, we design a novel\nCross-Branch Adapter to allow the CNN image encoder to fully interact with the\nViT image encoder in SAM module. Finally, we add both of the Position Adapter\nand the Feature Adapter to the ViT branch to fine-tune the original SAM. The\nexperimental results on AMUBUS and BUSI datasets demonstrate that our proposed\nmodel outperforms other medical image segmentation models significantly. Our\ncode will be available at: https://github.com/bscs12/BUSSAM.\n","authors":["Zhengzheng Tu","Le Gu","Xixi Wang","Bo Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.14837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14835v1","updated":"2024-04-23T08:41:50Z","published":"2024-04-23T08:41:50Z","title":"Semi-supervised 2D Human Pose Estimation via Adaptive Keypoint Masking","summary":" Human pose estimation is a fundamental and challenging task in computer\nvision. Larger-scale and more accurate keypoint annotations, while helpful for\nimproving the accuracy of supervised pose estimation, are often expensive and\ndifficult to obtain. Semi-supervised pose estimation tries to leverage a large\namount of unlabeled data to improve model performance, which can alleviate the\nproblem of insufficient labeled samples. The latest semi-supervised learning\nusually adopts a strong and weak data augmented teacher-student learning\nframework to deal with the challenge of \"Human postural diversity and its\nlong-tailed distribution\". Appropriate data augmentation method is one of the\nkey factors affecting the accuracy and generalization of semi-supervised\nmodels. Aiming at the problem that the difference of sample learning is not\nconsidered in the fixed keypoint masking augmentation method, this paper\nproposes an adaptive keypoint masking method, which can fully mine the\ninformation in the samples and obtain better estimation performance. In order\nto further improve the generalization and robustness of the model, this paper\nproposes a dual-branch data augmentation scheme, which can perform Mixup on\nsamples and features on the basis of adaptive keypoint masking. 
The\neffectiveness of the proposed method is verified on COCO and MPII,\noutperforming the state-of-the-art semi-supervised pose estimation by 5.2% and\n0.3%, respectively.\n","authors":["Kexin Meng","Ruirui Li","Daguang Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.14835v1.pdf","comment":"China Multimedia 2023"},{"id":"http://arxiv.org/abs/2403.08216v2","updated":"2024-04-23T08:41:47Z","published":"2024-03-13T03:28:39Z","title":"PaddingFlow: Improving Normalizing Flows with Padding-Dimensional Noise","summary":" Normalizing flow is a generative modeling approach with efficient sampling.\nHowever, Flow-based models suffer two issues: 1) If the target distribution is\nmanifold, due to the unmatch between the dimensions of the latent target\ndistribution and the data distribution, flow-based models might perform badly.\n2) Discrete data might make flow-based models collapse into a degenerate\nmixture of point masses. To sidestep such two issues, we propose PaddingFlow, a\nnovel dequantization method, which improves normalizing flows with\npadding-dimensional noise. To implement PaddingFlow, only the dimension of\nnormalizing flows needs to be modified. Thus, our method is easy to implement\nand computationally cheap. Moreover, the padding-dimensional noise is only\nadded to the padding dimension, which means PaddingFlow can dequantize without\nchanging data distributions. Implementing existing dequantization methods needs\nto change data distributions, which might degrade performance. We validate our\nmethod on the main benchmarks of unconditional density estimation, including\nfive tabular datasets and four image datasets for Variational Autoencoder (VAE)\nmodels, and the Inverse Kinematics (IK) experiments which are conditional\ndensity estimation. The results show that PaddingFlow can perform better in all\nexperiments in this paper, which means PaddingFlow is widely suitable for\nvarious tasks. The code is available at:\nhttps://github.com/AdamQLMeng/PaddingFlow.\n","authors":["Qinglong Meng","Chongkun Xia","Xueqian Wang"],"pdf_url":"https://arxiv.org/pdf/2403.08216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.05180v2","updated":"2024-04-23T08:36:00Z","published":"2023-01-12T18:04:51Z","title":"Effective Decision Boundary Learning for Class Incremental Learning","summary":" Rehearsal approaches in class incremental learning (CIL) suffer from decision\nboundary overfitting to new classes, which is mainly caused by two factors:\ninsufficiency of old classes data for knowledge distillation and imbalanced\ndata learning between the learned and new classes because of the limited\nstorage memory. In this work, we present a simple but effective approach to\ntackle these two factors. First, we employ a re-sampling strategy and Mixup\nKnowledge Distillation (Re-MKD) to improve the performances of KD, which\nwould greatly alleviate the overfitting problem. Specifically, we combine mixup\nand re-sampling strategies to synthesize adequate data used in KD training that\nare more consistent with the latent distribution between the learned and new\nclasses. Second, we propose a novel incremental influence balance (IIB) method\nfor CIL to tackle the classification of imbalanced data by extending the\ninfluence balance method into the CIL setting, which re-weights samples by\ntheir influences to create a proper decision boundary. 
With these two\nimprovements, we present the effective decision boundary learning algorithm\n(EDBL) which improves the performance of KD and deals with the imbalanced data\nlearning simultaneously. Experiments show that the proposed EDBL achieves\nstate-of-the-art performances on several CIL benchmarks.\n","authors":["Chaoyue Ding","Kunchi Li","Jun Wan","Shan Yu"],"pdf_url":"https://arxiv.org/pdf/2301.05180v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14830v1","updated":"2024-04-23T08:32:38Z","published":"2024-04-23T08:32:38Z","title":"CoProNN: Concept-based Prototypical Nearest Neighbors for Explaining\n Vision Models","summary":" Mounting evidence in explainability for artificial intelligence (XAI)\nresearch suggests that good explanations should be tailored to individual tasks\nand should relate to concepts relevant to the task. However, building task\nspecific explanations is time consuming and requires domain expertise which can\nbe difficult to integrate into generic XAI methods. A promising approach\ntowards designing useful task specific explanations with domain experts is\nbased on compositionality of semantic concepts. Here, we present a novel\napproach that enables domain experts to quickly create concept-based\nexplanations for computer vision tasks intuitively via natural language.\nLeveraging recent progress in deep generative methods we propose to generate\nvisual concept-based prototypes via text-to-image methods. These prototypes are\nthen used to explain predictions of computer vision models via a simple\nk-Nearest-Neighbors routine. The modular design of CoProNN is simple to\nimplement, it is straightforward to adapt to novel tasks and allows for\nreplacing the classification and text-to-image models as more powerful models\nare released. The approach can be evaluated offline against the ground-truth of\npredefined prototypes that can be easily communicated also to domain experts as\nthey are based on visual concepts. We show that our strategy competes very well\nwith other concept-based XAI approaches on coarse grained image classification\ntasks and may even outperform those methods on more demanding fine grained\ntasks. We demonstrate the effectiveness of our method for human-machine\ncollaboration settings in qualitative and quantitative user studies. All code\nand experimental data can be found in our GitHub\n$\\href{https://github.com/TeodorChiaburu/beexplainable}{repository}$.\n","authors":["Teodor Chiaburu","Frank Haußer","Felix Bießmann"],"pdf_url":"https://arxiv.org/pdf/2404.14830v1.pdf","comment":"24 pages, 9 figures, 2 tables, accepted at WCXAI 2024 Valletta"},{"id":"http://arxiv.org/abs/2312.11035v3","updated":"2024-04-23T08:32:03Z","published":"2023-12-18T09:11:28Z","title":"Towards Effective Multi-Moving-Camera Tracking: A New Dataset and\n Lightweight Link Model","summary":" Ensuring driving safety for autonomous vehicles has become increasingly\ncrucial, highlighting the need for systematic tracking of on-road pedestrians.\nMost vehicles are equipped with visual sensors, however, the large-scale visual\ndata has not been well studied yet. Multi-target multi-camera (MTMC) tracking\nsystems are composed of two modules: single-camera tracking (SCT) and\ninter-camera tracking (ICT). To reliably coordinate between them, MTMC tracking\nhas been a very complicated task, while tracking across multiple moving cameras\nmakes it even more challenging. 
In this paper, we focus on multi-target\nmulti-moving-camera (MTMMC) tracking, which is attracting increasing attention\nfrom the research community. Observing there are few datasets for MTMMC\ntracking, we collect a new dataset, called Multi-Moving-Camera Track (MMCT),\nwhich contains sequences under various driving scenarios. To address the common\nproblems of identity switch easily faced by most existing SCT trackers,\nespecially for moving cameras due to ego-motion between the camera and targets,\na lightweight appearance-free global link model, called Linker, is proposed to\nmitigate the identity switch by associating two disjoint tracklets of the same\ntarget into a complete trajectory within the same camera. Incorporated with\nLinker, existing SCT trackers generally obtain a significant improvement.\nMoreover, to alleviate the impact of the image style variations caused by\ndifferent cameras, a color transfer module is effectively incorporated to\nextract cross-camera consistent appearance features for pedestrian association\nacross moving cameras for ICT, resulting in a much improved MTMMC tracking\nsystem, which can constitute a step further towards coordinated mining of\nmultiple moving cameras. The project page is available at\nhttps://dhu-mmct.github.io/.\n","authors":["Yanting Zhang","Shuanghong Wang","Qingxiang Wang","Cairong Yan","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2312.11035v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14829v1","updated":"2024-04-23T08:31:55Z","published":"2024-04-23T08:31:55Z","title":"Revisiting Neural Networks for Continual Learning: An Architectural\n Perspective","summary":" Efforts to overcome catastrophic forgetting have primarily centered around\ndeveloping more effective Continual Learning (CL) methods. In contrast, less\nattention was devoted to analyzing the role of network architecture design\n(e.g., network depth, width, and components) in contributing to CL. This paper\nseeks to bridge this gap between network architecture design and CL, and to\npresent a holistic study on the impact of network architectures on CL. This\nwork considers architecture design at the network scaling level, i.e., width\nand depth, and also at the network components, i.e., skip connections, global\npooling layers, and down-sampling. In both cases, we first derive insights\nthrough systematically exploring how architectural designs affect CL. Then,\ngrounded in these insights, we craft a specialized search space for CL and\nfurther propose a simple yet effective ArchCraft method to steer a CL-friendly\narchitecture, namely, this method recrafts AlexNet/ResNet into AlexAC/ResAC.\nExperimental validation across various CL settings and scenarios demonstrates\nthat improved architectures are parameter-efficient, achieving state-of-the-art\nperformance of CL while being 86%, 61%, and 97% more compact in terms of\nparameters than the naive CL architecture in Class IL and Task IL. 
Code is\navailable at https://github.com/byyx666/ArchCraft.\n","authors":["Aojun Lu","Tao Feng","Hangjie Yuan","Xiaotian Song","Yanan Sun"],"pdf_url":"https://arxiv.org/pdf/2404.14829v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02772v2","updated":"2024-04-23T08:29:53Z","published":"2023-12-05T14:01:43Z","title":"FG-MDM: Towards Zero-Shot Human Motion Generation via Fine-Grained\n Descriptions","summary":" Recently, significant progress has been made in text-based motion generation,\nenabling the generation of diverse and high-quality human motions that conform\nto textual descriptions. However, generating motions beyond the distribution of\noriginal datasets remains challenging, i.e., zero-shot generation. By adopting\na divide-and-conquer strategy, we propose a new framework named Fine-Grained\nHuman Motion Diffusion Model (FG-MDM) for zero-shot human motion generation.\nSpecifically, we first parse previous vague textual annotations into\nfine-grained descriptions of different body parts by leveraging a large\nlanguage model. We then use these fine-grained descriptions to guide a\ntransformer-based diffusion model, which further adopts a design of part\ntokens. FG-MDM can generate human motions beyond the scope of original datasets\nowing to descriptions that are closer to motion essence. Our experimental\nresults demonstrate the superiority of FG-MDM over previous methods in\nzero-shot settings. We will release our fine-grained textual annotations for\nHumanML3D and KIT.\n","authors":["Xu Shi","Wei Yao","Chuanchen Luo","Junran Peng","Hongwen Zhang","Yunlian Sun"],"pdf_url":"https://arxiv.org/pdf/2312.02772v2.pdf","comment":"Project Page: https://sx0207.github.io/fg-mdm/"},{"id":"http://arxiv.org/abs/2404.14822v1","updated":"2024-04-23T08:19:08Z","published":"2024-04-23T08:19:08Z","title":"CNN2GNN: How to Bridge CNN with GNN","summary":" Although the convolutional neural network (CNN) has achieved excellent\nperformance in vision tasks by extracting the intra-sample representation, it\nwill take a higher training expense because of stacking numerous convolutional\nlayers. Recently, as the bilinear models, graph neural networks (GNN) have\nsucceeded in exploring the underlying topological relationship among the graph\ndata with a few graph neural layers. Unfortunately, it cannot be directly\nutilized on non-graph data due to the lack of graph structure and has high\ninference latency on large-scale scenarios. Inspired by these complementary\nstrengths and weaknesses, \\textit{we discuss a natural question, how to bridge\nthese two heterogeneous networks?} In this paper, we propose a novel CNN2GNN\nframework to unify CNN and GNN together via distillation. Firstly, to break the\nlimitations of GNN, a differentiable sparse graph learning module is designed\nas the head of networks to dynamically learn the graph for inductive learning.\nThen, a response-based distillation is introduced to transfer the knowledge\nfrom CNN to GNN and bridge these two heterogeneous networks. 
Notably, due to\nextracting the intra-sample representation of a single instance and the\ntopological relationship among the datasets simultaneously, the performance of\ndistilled ``boosted'' two-layer GNN on Mini-ImageNet is much higher than CNN\ncontaining dozens of layers such as ResNet152.\n","authors":["Ziheng Jiao","Hongyuan Zhang","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.14822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07518v2","updated":"2024-04-23T08:02:23Z","published":"2024-04-11T07:22:14Z","title":"Remembering Transformer for Continual Learning","summary":" Neural networks encounter the challenge of Catastrophic Forgetting (CF) in\ncontinual learning, where new task knowledge interferes with previously learned\nknowledge. We propose Remembering Transformer, inspired by the brain's\nComplementary Learning Systems (CLS), to tackle this issue. Remembering\nTransformer employs a mixture-of-adapters and a generative model-based routing\nmechanism to alleviate CF by dynamically routing task data to relevant\nadapters. Our approach demonstrated a new SOTA performance in various vision\ncontinual learning tasks and great parameter efficiency.\n","authors":["Yuwei Sun","Ippei Fujisawa","Arthur Juliani","Jun Sakuma","Ryota Kanai"],"pdf_url":"https://arxiv.org/pdf/2404.07518v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14808v1","updated":"2024-04-23T07:39:09Z","published":"2024-04-23T07:39:09Z","title":"Visual-Augmented Dynamic Semantic Prototype for Generative Zero-Shot\n Learning","summary":" Generative Zero-shot learning (ZSL) learns a generator to synthesize visual\nsamples for unseen classes, which is an effective way to advance ZSL. However,\nexisting generative methods rely on the conditions of Gaussian noise and the\npredefined semantic prototype, which limit the generator only optimized on\nspecific seen classes rather than characterizing each visual instance,\nresulting in poor generalizations (\\textit{e.g.}, overfitting to seen classes).\nTo address this issue, we propose a novel Visual-Augmented Dynamic Semantic\nprototype method (termed VADS) to boost the generator to learn accurate\nsemantic-visual mapping by fully exploiting the visual-augmented knowledge into\nsemantic conditions. In detail, VADS consists of two modules: (1) Visual-aware\nDomain Knowledge Learning module (VDKL) learns the local bias and global prior\nof the visual features (referred to as domain visual knowledge), which replace\npure Gaussian noise to provide richer prior noise information; (2)\nVision-Oriented Semantic Updation module (VOSU) updates the semantic prototype\naccording to the visual representations of the samples. Ultimately, we\nconcatenate their output as a dynamic semantic prototype, which serves as the\ncondition of the generator. 
Extensive experiments demonstrate that our VADS\nachieves superior CZSL and GZSL performances on three prominent datasets and\noutperforms other state-of-the-art methods with averaging increases by 6.4\\%,\n5.9\\% and 4.2\\% on SUN, CUB and AWA2, respectively.\n","authors":["Wenjin Hou","Shiming Chen","Shuhuang Chen","Ziming Hong","Yan Wang","Xuetao Feng","Salman Khan","Fahad Shahbaz Khan","Xinge You"],"pdf_url":"https://arxiv.org/pdf/2404.14808v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14807v1","updated":"2024-04-23T07:37:43Z","published":"2024-04-23T07:37:43Z","title":"Reference-Free Multi-Modality Volume Registration of X-Ray Microscopy\n and Light-Sheet Fluorescence Microscopy","summary":" Recently, X-ray microscopy (XRM) and light-sheet fluorescence microscopy\n(LSFM) have emerged as two pivotal imaging tools in preclinical research on\nbone remodeling diseases, offering micrometer-level resolution. Integrating\nthese complementary modalities provides a holistic view of bone\nmicrostructures, facilitating function-oriented volume analysis across\ndifferent disease cycles. However, registering such independently acquired\nlarge-scale volumes is extremely challenging under real and reference-free\nscenarios. This paper presents a fast two-stage pipeline for volume\nregistration of XRM and LSFM. The first stage extracts the surface features and\nemploys two successive point cloud-based methods for coarse alignment. The\nsecond stage fine-tunes the initial alignment using a modified\ncross-correlation method, ensuring precise volumetric registration. Moreover,\nwe propose residual similarity as a novel metric to assess the alignment of two\ncomplementary modalities. The results imply robust gradual improvement across\nthe stages. In the end, all correlating microstructures, particularly lacunae\nin XRM and bone cells in LSFM, are precisely matched, enabling new insights\ninto bone diseases like osteoporosis which are a substantial burden in aging\nsocieties.\n","authors":["Siyuan Mei","Fuxin Fan","Mareike Thies","Mingxuan Gu","Fabian Wagner","Oliver Aust","Ina Erceg","Zeynab Mirzaei","Georgiana Neag","Yipeng Sun","Yixing Huang","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2404.14807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02238v3","updated":"2024-04-23T07:35:14Z","published":"2023-12-04T09:19:38Z","title":"X-Adapter: Adding Universal Compatibility of Plugins for Upgraded\n Diffusion Model","summary":" We introduce X-Adapter, a universal upgrader to enable the pretrained\nplug-and-play modules (e.g., ControlNet, LoRA) to work directly with the\nupgraded text-to-image diffusion model (e.g., SDXL) without further retraining.\nWe achieve this goal by training an additional network to control the frozen\nupgraded model with the new text-image data pairs. In detail, X-Adapter keeps a\nfrozen copy of the old model to preserve the connectors of different plugins.\nAdditionally, X-Adapter adds trainable mapping layers that bridge the decoders\nfrom models of different versions for feature remapping. The remapped features\nwill be used as guidance for the upgraded model. To enhance the guidance\nability of X-Adapter, we employ a null-text training strategy for the upgraded\nmodel. After training, we also introduce a two-stage denoising strategy to\nalign the initial latents of X-Adapter and the upgraded model. 
Thanks to our\nstrategies, X-Adapter demonstrates universal compatibility with various plugins\nand also enables plugins of different versions to work together, thereby\nexpanding the functionalities of diffusion community. To verify the\neffectiveness of the proposed method, we conduct extensive experiments and the\nresults show that X-Adapter may facilitate wider application in the upgraded\nfoundational diffusion model.\n","authors":["Lingmin Ran","Xiaodong Cun","Jia-Wei Liu","Rui Zhao","Song Zijie","Xintao Wang","Jussi Keppo","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2312.02238v3.pdf","comment":"Project page: https://showlab.github.io/X-Adapter/"},{"id":"http://arxiv.org/abs/2404.14801v1","updated":"2024-04-23T07:31:19Z","published":"2024-04-23T07:31:19Z","title":"DesignProbe: A Graphic Design Benchmark for Multimodal Large Language\n Models","summary":" A well-executed graphic design typically achieves harmony in two levels, from\nthe fine-grained design elements (color, font and layout) to the overall\ndesign. This complexity makes the comprehension of graphic design challenging,\nfor it needs the capability to both recognize the design elements and\nunderstand the design. With the rapid development of Multimodal Large Language\nModels (MLLMs), we establish the DesignProbe, a benchmark to investigate the\ncapability of MLLMs in design. Our benchmark includes eight tasks in total,\nacross both the fine-grained element level and the overall design level. At\ndesign element level, we consider both the attribute recognition and semantic\nunderstanding tasks. At overall design level, we include style and metaphor. 9\nMLLMs are tested and we apply GPT-4 as evaluator. Besides, further experiments\nindicates that refining prompts can enhance the performance of MLLMs. We first\nrewrite the prompts by different LLMs and found increased performances appear\nin those who self-refined by their own LLMs. We then add extra task knowledge\nin two different ways (text descriptions and image examples), finding that\nadding images boost much more performance over texts.\n","authors":["Jieru Lin","Danqing Huang","Tiejun Zhao","Dechen Zhan","Chin-Yew Lin"],"pdf_url":"https://arxiv.org/pdf/2404.14801v1.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2404.07762v4","updated":"2024-04-23T07:29:18Z","published":"2024-04-11T14:03:16Z","title":"NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous\n Driving","summary":" We present a versatile NeRF-based simulator for testing autonomous driving\n(AD) software systems, designed with a focus on sensor-realistic closed-loop\nevaluation and the creation of safety-critical scenarios. The simulator learns\nfrom sequences of real-world driving sensor data and enables reconfigurations\nand renderings of new, unseen scenarios. In this work, we use our simulator to\ntest the responses of AD models to safety-critical scenarios inspired by the\nEuropean New Car Assessment Programme (Euro NCAP). Our evaluation reveals that,\nwhile state-of-the-art end-to-end planners excel in nominal driving scenarios\nin an open-loop setting, they exhibit critical flaws when navigating our\nsafety-critical scenarios in a closed-loop setting. 
This highlights the need\nfor advancements in the safety and real-world usability of end-to-end planners.\nBy publicly releasing our simulator and scenarios as an easy-to-run evaluation\nsuite, we invite the research community to explore, refine, and validate their\nAD models in controlled, yet highly configurable and challenging\nsensor-realistic environments. Code and instructions can be found at\nhttps://github.com/atonderski/neuro-ncap\n","authors":["William Ljungbergh","Adam Tonderski","Joakim Johnander","Holger Caesar","Kalle Åström","Michael Felsberg","Christoffer Petersson"],"pdf_url":"https://arxiv.org/pdf/2404.07762v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13558v2","updated":"2024-04-23T07:17:03Z","published":"2024-04-21T07:13:56Z","title":"LASER: Tuning-Free LLM-Driven Attention Control for Efficient\n Text-conditioned Image-to-Animation","summary":" Revolutionary advancements in text-to-image models have unlocked new\ndimensions for sophisticated content creation, e.g., text-conditioned image\nediting, allowing us to edit the diverse images that convey highly complex\nvisual concepts according to the textual guidance. Despite being promising,\nexisting methods focus on texture- or non-rigid-based visual manipulation,\nwhich struggles to produce the fine-grained animation of smooth\ntext-conditioned image morphing without fine-tuning, i.e., due to their highly\nunstructured latent space. In this paper, we introduce a tuning-free LLM-driven\nattention control framework, encapsulated by the progressive process of LLM\nplanning, prompt-Aware editing, StablE animation geneRation, abbreviated as\nLASER. LASER employs a large language model (LLM) to refine coarse descriptions\ninto detailed prompts, guiding pre-trained text-to-image models for subsequent\nimage generation. We manipulate the model's spatial features and self-attention\nmechanisms to maintain animation integrity and enable seamless morphing\ndirectly from text prompts, eliminating the need for additional fine-tuning or\nannotations. Our meticulous control over spatial features and self-attention\nensures structural consistency in the images. This paper presents a novel\nframework integrating LLMs with text-to-image models to create high-quality\nanimations from a single text input. We also propose a Text-conditioned\nImage-to-Animation Benchmark to validate the effectiveness and efficacy of\nLASER. Extensive experiments demonstrate that LASER produces impressive,\nconsistent, and efficient results in animation generation, positioning it as a\npowerful tool for advanced digital content creation.\n","authors":["Haoyu Zheng","Wenqiao Zhang","Yaoke Wang","Hao Zhou","Jiang Liu","Juncheng Li","Zheqi Lv","Siliang Tang","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2404.13558v2.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.08968v3","updated":"2024-04-23T07:13:30Z","published":"2024-04-13T11:13:56Z","title":"MCPNet: An Interpretable Classifier via Multi-Level Concept Prototypes","summary":" Recent advancements in post-hoc and inherently interpretable methods have\nmarkedly enhanced the explanations of black box classifier models. These\nmethods operate either through post-analysis or by integrating concept learning\nduring model training. Although being effective in bridging the semantic gap\nbetween a model's latent space and human interpretation, these explanation\nmethods only partially reveal the model's decision-making process. 
The outcome\nis typically limited to high-level semantics derived from the last feature map.\nWe argue that the explanations lacking insights into the decision processes at\nlow and mid-level features are neither fully faithful nor useful. Addressing\nthis gap, we introduce the Multi-Level Concept Prototypes Classifier (MCPNet),\nan inherently interpretable model. MCPNet autonomously learns meaningful\nconcept prototypes across multiple feature map levels using Centered Kernel\nAlignment (CKA) loss and an energy-based weighted PCA mechanism, and it does so\nwithout reliance on predefined concept labels. Further, we propose a novel\nclassifier paradigm that learns and aligns multi-level concept prototype\ndistributions for classification purposes via Class-aware Concept Distribution\n(CCD) loss. Our experiments reveal that our proposed MCPNet while being\nadaptable to various model architectures, offers comprehensive multi-level\nexplanations while maintaining classification accuracy. Additionally, its\nconcept distribution-based classification approach shows improved\ngeneralization capabilities in few-shot classification scenarios.\n","authors":["Bor-Shiun Wang","Chien-Yi Wang","Wei-Chen Chiu"],"pdf_url":"https://arxiv.org/pdf/2404.08968v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.14780v1","updated":"2024-04-23T06:37:54Z","published":"2024-04-23T06:37:54Z","title":"ContextualFusion: Context-Based Multi-Sensor Fusion for 3D Object\n Detection in Adverse Operating Conditions","summary":" The fusion of multimodal sensor data streams such as camera images and lidar\npoint clouds plays an important role in the operation of autonomous vehicles\n(AVs). Robust perception across a range of adverse weather and lighting\nconditions is specifically required for AVs to be deployed widely. While\nmulti-sensor fusion networks have been previously developed for perception in\nsunny and clear weather conditions, these methods show a significant\ndegradation in performance under night-time and poor weather conditions. In\nthis paper, we propose a simple yet effective technique called ContextualFusion\nto incorporate the domain knowledge about cameras and lidars behaving\ndifferently across lighting and weather variations into 3D object detection\nmodels. Specifically, we design a Gated Convolutional Fusion (GatedConv)\napproach for the fusion of sensor streams based on the operational context. To\naid in our evaluation, we use the open-source simulator CARLA to create a\nmultimodal adverse-condition dataset called AdverseOp3D to address the\nshortcomings of existing datasets being biased towards daytime and good-weather\nconditions. Our ContextualFusion approach yields an mAP improvement of 6.2%\nover state-of-the-art methods on our context-balanced synthetic dataset.\nFinally, our method enhances state-of-the-art 3D objection performance at night\non the real-world NuScenes dataset with a significant mAP improvement of 11.7%.\n","authors":["Shounak Sural","Nishad Sahu"," Ragunathan"," Rajkumar"],"pdf_url":"https://arxiv.org/pdf/2404.14780v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.14768v1","updated":"2024-04-23T06:10:43Z","published":"2024-04-23T06:10:43Z","title":"Enhancing Prompt Following with Visual Control Through Training-Free\n Mask-Guided Diffusion","summary":" Recently, integrating visual controls into text-to-image~(T2I) models, such\nas ControlNet method, has received significant attention for finer control\ncapabilities. 
While various training-free methods make efforts to enhance\nprompt following in T2I models, the issue with visual control is still rarely\nstudied, especially in the scenario that visual controls are misaligned with\ntext prompts. In this paper, we address the challenge of ``Prompt Following\nWith Visual Control\" and propose a training-free approach named Mask-guided\nPrompt Following (MGPF). Object masks are introduced to distinct aligned and\nmisaligned parts of visual controls and prompts. Meanwhile, a network, dubbed\nas Masked ControlNet, is designed to utilize these object masks for object\ngeneration in the misaligned visual control region. Further, to improve\nattribute matching, a simple yet efficient loss is designed to align the\nattention maps of attributes with object regions constrained by ControlNet and\nobject masks. The efficacy and superiority of MGPF are validated through\ncomprehensive quantitative and qualitative experiments.\n","authors":["Hongyu Chen","Yiqi Gao","Min Zhou","Peng Wang","Xubin Li","Tiezheng Ge","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.14768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14759v1","updated":"2024-04-23T05:50:02Z","published":"2024-04-23T05:50:02Z","title":"Unified Unsupervised Salient Object Detection via Knowledge Transfer","summary":" Recently, unsupervised salient object detection (USOD) has gained increasing\nattention due to its annotation-free nature. However, current methods mainly\nfocus on specific tasks such as RGB and RGB-D, neglecting the potential for\ntask migration. In this paper, we propose a unified USOD framework for generic\nUSOD tasks. Firstly, we propose a Progressive Curriculum Learning-based\nSaliency Distilling (PCL-SD) mechanism to extract saliency cues from a\npre-trained deep network. This mechanism starts with easy samples and\nprogressively moves towards harder ones, to avoid initial interference caused\nby hard samples. Afterwards, the obtained saliency cues are utilized to train a\nsaliency detector, and we employ a Self-rectify Pseudo-label Refinement (SPR)\nmechanism to improve the quality of pseudo-labels. Finally, an adapter-tuning\nmethod is devised to transfer the acquired saliency knowledge, leveraging\nshared knowledge to attain superior transferring performance on the target\ntasks. Extensive experiments on five representative SOD tasks confirm the\neffectiveness and feasibility of our proposed method. Code and supplement\nmaterials are available at https://github.com/I2-Multimedia-Lab/A2S-v3.\n","authors":["Yao Yuan","Wutao Liu","Pan Gao","Qun Dai","Jie Qin"],"pdf_url":"https://arxiv.org/pdf/2404.14759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12113v5","updated":"2024-04-23T05:46:01Z","published":"2023-08-23T13:06:59Z","title":"Advancements in Point Cloud Data Augmentation for Deep Learning: A\n Survey","summary":" Deep learning (DL) has become one of the mainstream and effective methods for\npoint cloud analysis tasks such as detection, segmentation and classification.\nTo reduce overfitting during training DL models and improve model performance\nespecially when the amount and/or diversity of training data are limited,\naugmentation is often crucial. 
Although various point cloud data augmentation\nmethods have been widely used in different point cloud processing tasks, there\nare currently no published systematic surveys or reviews of these methods.\nTherefore, this article surveys these methods, categorizing them into a\ntaxonomy framework that comprises basic and specialized point cloud data\naugmentation methods. Through a comprehensive evaluation of these augmentation\nmethods, this article identifies their potentials and limitations, serving as a\nuseful reference for choosing appropriate augmentation methods. In addition,\npotential directions for future research are recommended. This survey\ncontributes to providing a holistic overview of the current state of point\ncloud data augmentation, promoting its wider application and development.\n","authors":["Qinfeng Zhu","Lei Fan","Ningxin Weng"],"pdf_url":"https://arxiv.org/pdf/2308.12113v5.pdf","comment":"Accepted by Pattern Recognition"},{"id":"http://arxiv.org/abs/2404.14755v1","updated":"2024-04-23T05:36:33Z","published":"2024-04-23T05:36:33Z","title":"SkinGEN: an Explainable Dermatology Diagnosis-to-Generation Framework\n with Interactive Vision-Language Models","summary":" With the continuous advancement of vision language models (VLMs) technology,\nremarkable research achievements have emerged in the dermatology field, the\nfourth most prevalent human disease category. However, despite these\nadvancements, VLM still faces \"hallucination\" in dermatological diagnosis, and\ndue to the inherent complexity of dermatological conditions, existing tools\noffer relatively limited support for user comprehension. We propose SkinGEN, a\ndiagnosis-to-generation framework that leverages the stable diffusion (SD)\nmethod to generate reference demonstrations from diagnosis results provided by\nVLM, thereby enhancing the visual explainability for users. Through extensive\nexperiments with Low-Rank Adaptation (LoRA), we identify optimal strategies for\nskin condition image generation. We conduct a user study with 32 participants\nevaluating both the system performance and explainability. Results demonstrate\nthat SkinGEN significantly improves users' comprehension of VLM predictions and\nfosters increased trust in the diagnostic process. This work paves the way for\nmore transparent and user-centric VLM applications in dermatology and beyond.\n","authors":["Bo Lin","Yingjing Xu","Xuanwen Bao","Zhou Zhao","Zuyong Zhang","Zhouyang Wang","Jie Zhang","Shuiguang Deng","Jianwei Yin"],"pdf_url":"https://arxiv.org/pdf/2404.14755v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12734v3","updated":"2024-04-23T05:36:31Z","published":"2024-04-19T09:28:16Z","title":"DLoRA-TrOCR: Mixed Text Mode Optical Character Recognition Based On\n Transformer","summary":" With the continuous development of Optical Character Recognition (OCR) and\nthe expansion of application fields, text recognition in complex scenes has\nbecome a key challenge. Factors such as multiple fonts, mixed scenes and\ncomplex layouts seriously affect the recognition accuracy of traditional OCR\nmodels. Although OCR models based on deep learning have performed well in\nspecific fields or similar datasets in recent years, the generalization ability\nand robustness of the model are still a big challenge when facing complex\nenvironments with multiple scenes. 
Furthermore, training an OCR model from\nscratch or fine-tuning all parameters is very demanding on computing resources\nand inference time, which limits the flexibility of its application. This study\nfocuses on a fundamental aspect of mixed text recognition in response to the\nchallenges mentioned above, which involves effectively fine-tuning the\npre-trained basic OCR model to demonstrate exceptional performance across\nvarious downstream tasks. To this end, we propose a parameter-efficient mixed\ntext recognition method based on pre-trained OCR Transformer, namely\nDLoRA-TrOCR. This method embeds DoRA into the image encoder and LoRA into the\ninternal structure of the text decoder, enabling efficient parameter\nfine-tuning for downstream tasks. Experiments show that compared to similar\nparameter adjustment methods, our model DLoRA-TrOCR has the smallest number of\nparameters and performs better. It can achieve state-of-the-art performance on\ncomplex scene datasets involving simultaneous recognition of mixed handwritten,\nprinted and street view texts.\n","authors":["Da Chang","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2404.12734v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17387v2","updated":"2024-04-23T05:34:50Z","published":"2024-03-26T05:12:18Z","title":"Decoupled Pseudo-labeling for Semi-Supervised Monocular 3D Object\n Detection","summary":" We delve into pseudo-labeling for semi-supervised monocular 3D object\ndetection (SSM3OD) and discover two primary issues: a misalignment between the\nprediction quality of 3D and 2D attributes and the tendency of depth\nsupervision derived from pseudo-labels to be noisy, leading to significant\noptimization conflicts with other reliable forms of supervision. We introduce a\nnovel decoupled pseudo-labeling (DPL) approach for SSM3OD. Our approach\nfeatures a Decoupled Pseudo-label Generation (DPG) module, designed to\nefficiently generate pseudo-labels by separately processing 2D and 3D\nattributes. This module incorporates a unique homography-based method for\nidentifying dependable pseudo-labels in BEV space, specifically for 3D\nattributes. Additionally, we present a DepthGradient Projection (DGP) module to\nmitigate optimization conflicts caused by noisy depth supervision of\npseudo-labels, effectively decoupling the depth gradient and removing\nconflicting gradients. This dual decoupling strategy-at both the pseudo-label\ngeneration and gradient levels-significantly improves the utilization of\npseudo-labels in SSM3OD. Our comprehensive experiments on the KITTI benchmark\ndemonstrate the superiority of our method over existing approaches.\n","authors":["Jiacheng Zhang","Jiaming Li","Xiangru Lin","Wei Zhang","Xiao Tan","Junyu Han","Errui Ding","Jingdong Wang","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2403.17387v2.pdf","comment":"To appear in CVPR2024"},{"id":"http://arxiv.org/abs/2404.14750v1","updated":"2024-04-23T05:16:24Z","published":"2024-04-23T05:16:24Z","title":"Grounded Knowledge-Enhanced Medical VLP for Chest X-Ray","summary":" Medical vision-language pre-training has emerged as a promising approach for\nlearning domain-general representations of medical image and text. Current\nalgorithms that exploit the global and local alignment between medical image\nand text could however be marred by the redundant information in medical data.\nTo address this issue, we propose a grounded knowledge-enhanced medical\nvision-language pre-training (GK-MVLP) framework for chest X-ray. 
In this\nframework, medical knowledge is grounded to the appropriate anatomical regions\nby using a transformer-based grounded knowledge-enhanced module for\nfine-grained alignment between anatomical region-level visual features and the\ntextural features of medical knowledge. The performance of GK-MVLP is\ncompetitive with or exceeds the state of the art on downstream chest X-ray\ndisease classification, disease localization, report generation, and medical\nvisual question-answering tasks. Our results show the advantage of\nincorporating grounding mechanism to remove biases and improve the alignment\nbetween chest X-ray image and radiology report.\n","authors":["Qiao Deng","Zhongzhen Huang","Yunqi Wang","Zhichuan Wang","Zhao Wang","Xiaofan Zhang","Qi Dou","Yeung Yu Hui","Edward S. Hui"],"pdf_url":"https://arxiv.org/pdf/2404.14750v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11318v2","updated":"2024-04-23T05:09:40Z","published":"2024-04-17T12:32:10Z","title":"Leveraging Fine-Grained Information and Noise Decoupling for Remote\n Sensing Change Detection","summary":" Change detection aims to identify remote sense object changes by analyzing\ndata between bitemporal image pairs. Due to the large temporal and spatial span\nof data collection in change detection image pairs, there are often a\nsignificant amount of task-specific and task-agnostic noise. Previous effort\nhas focused excessively on denoising, with this goes a great deal of loss of\nfine-grained information. In this paper, we revisit the importance of\nfine-grained features in change detection and propose a series of operations\nfor fine-grained information compensation and noise decoupling (FINO). First,\nthe context is utilized to compensate for the fine-grained information in the\nfeature space. Next, a shape-aware and a brightness-aware module are designed\nto improve the capacity for representation learning. The shape-aware module\nguides the backbone for more precise shape estimation, guiding the backbone\nnetwork in extracting object shape features. The brightness-aware module learns\na overall brightness estimation to improve the model's robustness to\ntask-agnostic noise. Finally, a task-specific noise decoupling structure is\ndesigned as a way to improve the model's ability to separate noise interference\nfrom feature similarity. With these training schemes, our proposed method\nachieves new state-of-the-art (SOTA) results in multiple change detection\nbenchmarks. The code will be made available.\n","authors":["Qiangang Du","Jinlong Peng","Changan Wang","Xu Chen","Qingdong He","Wenbing Zhu","Mingmin Chi","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11318v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05108v3","updated":"2024-04-23T05:06:10Z","published":"2023-10-08T10:44:05Z","title":"Enhancing Representations through Heterogeneous Self-Supervised Learning","summary":" Incorporating heterogeneous representations from different architectures has\nfacilitated various vision tasks, e.g., some hybrid networks combine\ntransformers and convolutions. However, complementarity between such\nheterogeneous architectures has not been well exploited in self-supervised\nlearning. Thus, we propose Heterogeneous Self-Supervised Learning (HSSL), which\nenforces a base model to learn from an auxiliary head whose architecture is\nheterogeneous from the base model. In this process, HSSL endows the base model\nwith new characteristics in a representation learning way without structural\nchanges. 
To comprehensively understand the HSSL, we conduct experiments on\nvarious heterogeneous pairs containing a base model and an auxiliary head. We\ndiscover that the representation quality of the base model moves up as their\narchitecture discrepancy grows. This observation motivates us to propose a\nsearch strategy that quickly determines the most suitable auxiliary head for a\nspecific base model to learn and several simple but effective methods to\nenlarge the model discrepancy. The HSSL is compatible with various\nself-supervised methods, achieving superior performances on various downstream\ntasks, including image classification, semantic segmentation, instance\nsegmentation, and object detection. Our source code will be made publicly\navailable.\n","authors":["Zhong-Yu Li","Bo-Wen Yin","Yongxiang Liu","Li Liu","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2310.05108v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11326v4","updated":"2024-04-23T05:04:23Z","published":"2024-04-17T12:38:58Z","title":"Single-temporal Supervised Remote Change Detection for Domain\n Generalization","summary":" Change detection is widely applied in remote sensing image analysis. Existing\nmethods require training models separately for each dataset, which leads to\npoor domain generalization. Moreover, these methods rely heavily on large\namounts of high-quality pair-labelled data for training, which is expensive and\nimpractical. In this paper, we propose a multimodal contrastive learning\n(ChangeCLIP) based on visual-language pre-training for change detection domain\ngeneralization. Additionally, we propose a dynamic context optimization for\nprompt learning. Meanwhile, to address the data dependency issue of existing\nmethods, we introduce a single-temporal and controllable AI-generated training\nstrategy (SAIN). This allows us to train the model using a large number of\nsingle-temporal images without image pairs in the real world, achieving\nexcellent generalization. Extensive experiments on series of real change\ndetection datasets validate the superiority and strong generalization of\nChangeCLIP, outperforming state-of-the-art change detection methods. Code will\nbe available.\n","authors":["Qiangang Du","Jinlong Peng","Xu Chen","Qingdong He","Liren He","Qiang Nie","Wenbing Zhu","Mingmin Chi","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11326v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14747v1","updated":"2024-04-23T04:59:34Z","published":"2024-04-23T04:59:34Z","title":"Differentiable Score-Based Likelihoods: Learning CT Motion Compensation\n From Clean Images","summary":" Motion artifacts can compromise the diagnostic value of computed tomography\n(CT) images. Motion correction approaches require a per-scan estimation of\npatient-specific motion patterns. In this work, we train a score-based model to\nact as a probability density estimator for clean head CT images. Given the\ntrained model, we quantify the deviation of a given motion-affected CT image\nfrom the ideal distribution through likelihood computation. We demonstrate that\nthe likelihood can be utilized as a surrogate metric for motion artifact\nseverity in the CT image facilitating the application of an iterative,\ngradient-based motion compensation algorithm. 
By optimizing the underlying\nmotion parameters to maximize likelihood, our method effectively reduces motion\nartifacts, bringing the image closer to the distribution of motion-free scans.\nOur approach achieves comparable performance to state-of-the-art methods while\neliminating the need for a representative data set of motion-affected samples.\nThis is particularly advantageous in real-world applications, where patient\nmotion patterns may exhibit unforeseen variability, ensuring robustness without\nimplicit assumptions about recoverable motion types.\n","authors":["Mareike Thies","Noah Maul","Siyuan Mei","Laura Pfaff","Nastassia Vysotskaya","Mingxuan Gu","Jonas Utz","Dennis Possart","Lukas Folle","Fabian Wagner","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2404.14747v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04517v2","updated":"2024-04-23T04:54:51Z","published":"2024-04-06T06:15:07Z","title":"Latent-based Diffusion Model for Long-tailed Recognition","summary":" Long-tailed imbalance distribution is a common issue in practical computer\nvision applications. Previous works proposed methods to address this problem,\nwhich can be categorized into several classes: re-sampling, re-weighting,\ntransfer learning, and feature augmentation. In recent years, diffusion models\nhave shown an impressive generation ability in many sub-problems of deep\ncomputer vision. However, its powerful generation has not been explored in\nlong-tailed problems. We propose a new approach, the Latent-based Diffusion\nModel for Long-tailed Recognition (LDMLR), as a feature augmentation method to\ntackle the issue. First, we encode the imbalanced dataset into features using\nthe baseline model. Then, we train a Denoising Diffusion Implicit Model (DDIM)\nusing these encoded features to generate pseudo-features. Finally, we train the\nclassifier using the encoded and pseudo-features from the previous two steps.\nThe model's accuracy shows an improvement on the CIFAR-LT and ImageNet-LT\ndatasets by using the proposed method.\n","authors":["Pengxiao Han","Changkun Ye","Jieming Zhou","Jing Zhang","Jie Hong","Xuesong Li"],"pdf_url":"https://arxiv.org/pdf/2404.04517v2.pdf","comment":"8 pages, 3 figures. Accepted by L3DIVU-CVPR2024"},{"id":"http://arxiv.org/abs/2404.14745v1","updated":"2024-04-23T04:54:32Z","published":"2024-04-23T04:54:32Z","title":"TAAT: Think and Act from Arbitrary Texts in Text2Motion","summary":" Text2Motion aims to generate human motions from texts. Existing datasets rely\non the assumption that texts include action labels (such as \"walk, bend, and\npick up\"), which is not flexible for practical scenarios. This paper redefines\nthis problem with a more realistic assumption that the texts are arbitrary.\nSpecifically, arbitrary texts include existing action texts composed of action\nlabels (e.g., A person walks and bends to pick up something), and introduce\nscene texts without explicit action labels (e.g., A person notices his wallet\non the ground ahead).\n To bridge the gaps between this realistic setting and existing datasets, we\nexpand the action texts on the HumanML3D dataset to more scene texts, thereby\ncreating a new HumanML3D++ dataset including arbitrary texts. 
In this\nchallenging dataset, we benchmark existing state-of-the-art methods and propose\na novel two-stage framework to extract action labels from arbitrary texts by\nthe Large Language Model (LLM) and then generate motions from action labels.\nExtensive experiments are conducted under different application scenarios to\nvalidate the effectiveness of the proposed framework on existing and proposed\ndatasets. The results indicate that Text2Motion in this realistic setting is\nvery challenging, fostering new research in this practical direction. Our\ndataset and code will be released.\n","authors":["Runqi Wang","Caoyuan Ma","GuoPeng Li","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.14745v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13949v2","updated":"2024-04-23T04:48:47Z","published":"2024-04-22T07:50:24Z","title":"PeLiCal: Targetless Extrinsic Calibration via Penetrating Lines for\n RGB-D Cameras with Limited Co-visibility","summary":" RGB-D cameras are crucial in robotic perception, given their ability to\nproduce images augmented with depth data. However, their limited FOV often\nrequires multiple cameras to cover a broader area. In multi-camera RGB-D\nsetups, the goal is typically to reduce camera overlap, optimizing spatial\ncoverage with as few cameras as possible. The extrinsic calibration of these\nsystems introduces additional complexities. Existing methods for extrinsic\ncalibration either necessitate specific tools or highly depend on the accuracy\nof camera motion estimation. To address these issues, we present PeLiCal, a\nnovel line-based calibration approach for RGB-D camera systems exhibiting\nlimited overlap. Our method leverages long line features from surroundings, and\nfilters out outliers with a novel convergence voting algorithm, achieving\ntargetless, real-time, and outlier-robust performance compared to existing\nmethods. We open source our implementation on\nhttps://github.com/joomeok/PeLiCal.git.\n","authors":["Jaeho Shin","Seungsang Yun","Ayoung Kim"],"pdf_url":"https://arxiv.org/pdf/2404.13949v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14739v1","updated":"2024-04-23T04:45:23Z","published":"2024-04-23T04:45:23Z","title":"BMapOpt: Optimization of Brain Tissue Probability Maps using a\n Differentiable MRI Simulator","summary":" Reconstructing digital brain phantoms in the form of multi-channeled brain\ntissue probability maps for individual subjects is essential for capturing\nbrain anatomical variability, understanding neurological diseases, as well as\nfor testing image processing methods. We demonstrate the first framework that\noptimizes brain tissue probability maps (Gray Matter - GM, White Matter - WM,\nand Cerebrospinal fluid - CSF) with the help of a Physics-based differentiable\nMRI simulator that models the magnetization signal at each voxel in the image.\nGiven an observed $T_1$/$T_2$-weighted MRI scan, the corresponding clinical MRI\nsequence, and the MRI differentiable simulator, we optimize the simulator's\ninput probability maps by back-propagating the L2 loss between the simulator's\noutput and the $T_1$/$T_2$-weighted scan. This approach has the significant\nadvantage of not relying on any training data, and instead uses the strong\ninductive bias of the MRI simulator. 
We tested the model on 20 scans from the\nBrainWeb database and demonstrate a highly accurate reconstruction of GM, WM,\nand CSF.\n","authors":["Utkarsh Gupta","Emmanouil Nikolakakis","Moritz Zaiss","Razvan Marinescu"],"pdf_url":"https://arxiv.org/pdf/2404.14739v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15569v2","updated":"2024-04-23T04:08:54Z","published":"2023-07-28T14:04:54Z","title":"Point Clouds Are Specialized Images: A Knowledge Transfer Approach for\n 3D Understanding","summary":" Self-supervised representation learning (SSRL) has gained increasing\nattention in point cloud understanding, in addressing the challenges posed by\n3D data scarcity and high annotation costs. This paper presents PCExpert, a\nnovel SSRL approach that reinterprets point clouds as \"specialized images\".\nThis conceptual shift allows PCExpert to leverage knowledge derived from\nlarge-scale image modality in a more direct and deeper manner, via extensively\nsharing the parameters with a pre-trained image encoder in a multi-way\nTransformer architecture. The parameter sharing strategy, combined with a novel\npretext task for pre-training, i.e., transformation estimation, empowers\nPCExpert to outperform the state of the arts in a variety of tasks, with a\nremarkable reduction in the number of trainable parameters. Notably, PCExpert's\nperformance under LINEAR fine-tuning (e.g., yielding a 90.02% overall accuracy\non ScanObjectNN) has already approached the results obtained with FULL model\nfine-tuning (92.66%), demonstrating its effective and robust representation\ncapability.\n","authors":["Jiachen Kang","Wenjing Jia","Xiangjian He","Kin Man Lam"],"pdf_url":"https://arxiv.org/pdf/2307.15569v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16279v3","updated":"2024-04-23T03:54:27Z","published":"2023-10-25T01:24:12Z","title":"TransPose: 6D Object Pose Estimation with Geometry-Aware Transformer","summary":" Estimating the 6D object pose is an essential task in many applications. Due\nto the lack of depth information, existing RGB-based methods are sensitive to\nocclusion and illumination changes. How to extract and utilize the geometry\nfeatures in depth information is crucial to achieve accurate predictions. To\nthis end, we propose TransPose, a novel 6D pose framework that exploits\nTransformer Encoder with geometry-aware module to develop better learning of\npoint cloud feature representations. Specifically, we first uniformly sample\npoint cloud and extract local geometry features with the designed local feature\nextractor base on graph convolution network. To improve robustness to\nocclusion, we adopt Transformer to perform the exchange of global information,\nmaking each local feature contains global information. Finally, we introduce\ngeometry-aware module in Transformer Encoder, which to form an effective\nconstrain for point cloud feature learning and makes the global information\nexchange more tightly coupled with point cloud tasks. 
Extensive experiments\nindicate the effectiveness of TransPose, our pose estimation pipeline achieves\ncompetitive results on three benchmark datasets.\n","authors":["Xiao Lin","Deming Wang","Guangliang Zhou","Chengju Liu","Qijun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.16279v3.pdf","comment":"Accepted by NEUROCOMPUTING"},{"id":"http://arxiv.org/abs/2404.14716v1","updated":"2024-04-23T03:42:48Z","published":"2024-04-23T03:42:48Z","title":"Bayesian Example Selection Improves In-Context Learning for Speech,\n Text, and Visual Modalities","summary":" Large language models (LLMs) can adapt to new tasks through in-context\nlearning (ICL) based on a few examples presented in dialogue history without\nany model parameter update. Despite such convenience, the performance of ICL\nheavily depends on the quality of the in-context examples presented, which\nmakes the in-context example selection approach a critical choice. This paper\nproposes a novel Bayesian in-Context example Selection method (ByCS) for ICL.\nExtending the inference probability conditioned on in-context examples based on\nBayes' theorem, ByCS focuses on the inverse inference conditioned on test\ninput. Following the assumption that accurate inverse inference probability\n(likelihood) will result in accurate inference probability (posterior),\nin-context examples are selected based on their inverse inference results.\nDiverse and extensive cross-tasking and cross-modality experiments are\nperformed with speech, text, and image examples. Experimental results show the\nefficacy and robustness of our ByCS method on various models, tasks and\nmodalities.\n","authors":["Siyin Wang","Chao-Han Huck Yang","Ji Wu","Chao Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.14716v1.pdf","comment":"16 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.14715v1","updated":"2024-04-23T03:42:14Z","published":"2024-04-23T03:42:14Z","title":"FINEMATCH: Aspect-based Fine-grained Image and Text Mismatch Detection\n and Correction","summary":" Recent progress in large-scale pre-training has led to the development of\nadvanced vision-language models (VLMs) with remarkable proficiency in\ncomprehending and generating multimodal content. Despite the impressive ability\nto perform complex reasoning for VLMs, current models often struggle to\neffectively and precisely capture the compositional information on both the\nimage and text sides. To address this, we propose FineMatch, a new aspect-based\nfine-grained text and image matching benchmark, focusing on text and image\nmismatch detection and correction. This benchmark introduces a novel task for\nboosting and evaluating the VLMs' compositionality for aspect-based\nfine-grained text and image matching. In this task, models are required to\nidentify mismatched aspect phrases within a caption, determine the aspect's\nclass, and propose corrections for an image-text pair that may contain between\n0 and 3 mismatches. To evaluate the models' performance on this new task, we\npropose a new evaluation metric named ITM-IoU for which our experiments show a\nhigh correlation to human evaluation. In addition, we also provide a\ncomprehensive experimental analysis of existing mainstream VLMs, including\nfully supervised learning and in-context learning settings. We have found that\nmodels trained on FineMatch demonstrate enhanced proficiency in detecting\nfine-grained text and image mismatches. 
Moreover, models (e.g., GPT-4V, Gemini\nPro Vision) with strong abilities to perform multimodal in-context learning are\nnot as skilled at fine-grained compositional image and text matching analysis.\nWith FineMatch, we are able to build a system for text-to-image generation\nhallucination detection and correction.\n","authors":["Hang Hua","Jing Shi","Kushal Kafle","Simon Jenni","Daoan Zhang","John Collomosse","Scott Cohen","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2404.14715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06375v3","updated":"2024-04-23T03:35:53Z","published":"2024-03-11T01:58:04Z","title":"FlowVQTalker: High-Quality Emotional Talking Face Generation through\n Normalizing Flow and Quantization","summary":" Generating emotional talking faces is a practical yet challenging endeavor.\nTo create a lifelike avatar, we draw upon two critical insights from a human\nperspective: 1) The connection between audio and the non-deterministic facial\ndynamics, encompassing expressions, blinks, poses, should exhibit synchronous\nand one-to-many mapping. 2) Vibrant expressions are often accompanied by\nemotion-aware high-definition (HD) textures and finely detailed teeth. However,\nboth aspects are frequently overlooked by existing methods. To this end, this\npaper proposes using normalizing Flow and Vector-Quantization modeling to\nproduce emotional talking faces that satisfy both insights concurrently\n(FlowVQTalker). Specifically, we develop a flow-based coefficient generator\nthat encodes the dynamics of facial emotion into a multi-emotion-class latent\nspace represented as a mixture distribution. The generation process commences\nwith random sampling from the modeled distribution, guided by the accompanying\naudio, enabling both lip-synchronization and the uncertain nonverbal facial\ncues generation. Furthermore, our designed vector-quantization image generator\ntreats the creation of expressive facial images as a code query task, utilizing\na learned codebook to provide rich, high-quality textures that enhance the\nemotional perception of the results. Extensive experiments are conducted to\nshowcase the effectiveness of our approach.\n","authors":["Shuai Tan","Bin Ji","Ye Pan"],"pdf_url":"https://arxiv.org/pdf/2403.06375v3.pdf","comment":"11 pages, 11 figures, conference"},{"id":"http://arxiv.org/abs/2404.14709v1","updated":"2024-04-23T03:35:27Z","published":"2024-04-23T03:35:27Z","title":"SC-HVPPNet: Spatial and Channel Hybrid-Attention Video Post-Processing\n Network with CNN and Transformer","summary":" Convolutional Neural Network (CNN) and Transformer have attracted much\nattention recently for video post-processing (VPP). However, the interaction\nbetween CNN and Transformer in existing VPP methods is not fully explored,\nleading to inefficient communication between the local and global extracted\nfeatures. In this paper, we explore the interaction between CNN and Transformer\nin the task of VPP, and propose a novel Spatial and Channel Hybrid-Attention\nVideo Post-Processing Network (SC-HVPPNet), which can cooperatively exploit the\nimage priors in both spatial and channel domains. Specifically, in the spatial\ndomain, a novel spatial attention fusion module is designed, in which two\nattention weights are generated to fuse the local and global representations\ncollaboratively. In the channel domain, a novel channel attention fusion module\nis developed, which can blend the deep representations at the channel dimension\ndynamically. 
Extensive experiments show that SC-HVPPNet notably boosts video\nrestoration quality, with average bitrate savings of 5.29%, 12.42%, and 13.09%\nfor Y, U, and V components in the VTM-11.0-NNVC RA configuration.\n","authors":["Tong Zhang","Wenxue Cui","Shaohui Liu","Feng Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.14709v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00391v2","updated":"2024-04-23T03:31:23Z","published":"2023-11-01T09:34:15Z","title":"Fixation-based Self-calibration for Eye Tracking in VR Headsets","summary":" This study proposes a novel self-calibration method for eye tracking in a\nvirtual reality (VR) headset. The proposed method is based on the assumptions\nthat the user's viewpoint can freely move and that the points of regard (PoRs)\nfrom different viewpoints are distributed within a small area on an object\nsurface during visual fixation. In the method, fixations are first detected\nfrom the time-series data of uncalibrated gaze directions using an extension of\nthe I-VDT (velocity and dispersion threshold identification) algorithm to a\nthree-dimensional (3D) scene. Then, the calibration parameters are optimized by\nminimizing the sum of a dispersion metrics of the PoRs. The proposed method can\npotentially identify the optimal calibration parameters representing the\nuser-dependent offset from the optical axis to the visual axis without explicit\nuser calibration, image processing, or marker-substitute objects. For the gaze\ndata of 18 participants walking in two VR environments with many occlusions,\nthe proposed method achieved an accuracy of 2.1$^\\circ$, which was\nsignificantly lower than the average offset. Our method is the first\nself-calibration method with an average error lower than 3$^\\circ$ in 3D\nenvironments. Further, the accuracy of the proposed method can be improved by\nup to 1.2$^\\circ$ by refining the fixation detection or optimization algorithm.\n","authors":["Ryusei Uramune","Sei Ikeda","Hiroki Ishizuka","Osamu Oshiro"],"pdf_url":"https://arxiv.org/pdf/2311.00391v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14572v2","updated":"2024-04-23T03:24:02Z","published":"2023-06-26T10:33:45Z","title":"Feature Imitating Networks Enhance The Performance, Reliability And\n Speed Of Deep Learning On Biomedical Image Processing Tasks","summary":" Feature-Imitating-Networks (FINs) are neural networks that are first trained\nto approximate closed-form statistical features (e.g. Entropy), and then\nembedded into other networks to enhance their performance. In this work, we\nperform the first evaluation of FINs for biomedical image processing tasks. We\nbegin by training a set of FINs to imitate six common radiomics features, and\nthen compare the performance of larger networks (with and without embedding the\nFINs) for three experimental tasks: COVID-19 detection from CT scans, brain\ntumor classification from MRI scans, and brain-tumor segmentation from MRI\nscans. We found that models embedded with FINs provided enhanced performance\nfor all three tasks when compared to baseline networks without FINs, even when\nthose baseline networks had more parameters. Additionally, we found that models\nembedded with FINs converged faster and more consistently compared to baseline\nnetworks with similar or greater representational capacity. The results of our\nexperiments provide evidence that FINs may offer state-of-the-art performance\nfor a variety of other biomedical image processing tasks.\n","authors":["Shangyang Min","Hassan B. 
Ebadian","Tuka Alhanai","Mohammad Mahdi Ghassemi"],"pdf_url":"https://arxiv.org/pdf/2306.14572v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.00893v2","updated":"2024-04-23T03:23:10Z","published":"2022-06-02T06:46:12Z","title":"Leveraging Systematic Knowledge of 2D Transformations","summary":" The existing deep learning models suffer from out-of-distribution (o.o.d.)\nperformance drop in computer vision tasks. In comparison, humans have a\nremarkable ability to interpret images, even if the scenes in the images are\nrare, thanks to the systematicity of acquired knowledge. This work focuses on\n1) the acquisition of systematic knowledge of 2D transformations, and 2)\narchitectural components that can leverage the learned knowledge in image\nclassification tasks in an o.o.d. setting. With a new training methodology\nbased on synthetic datasets that are constructed under the causal framework,\nthe deep neural networks acquire knowledge from semantically different domains\n(e.g. even from noise), and exhibit certain level of systematicity in parameter\nestimation experiments. Based on this, a novel architecture is devised\nconsisting of a classifier, an estimator and an identifier (abbreviated as\n\"CED\"). By emulating the \"hypothesis-verification\" process in human visual\nperception, CED improves the classification accuracy significantly on test sets\nunder covariate shift.\n","authors":["Jiachen Kang","Wenjing Jia","Xiangjian He"],"pdf_url":"https://arxiv.org/pdf/2206.00893v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14705v1","updated":"2024-04-23T03:22:06Z","published":"2024-04-23T03:22:06Z","title":"Think-Program-reCtify: 3D Situated Reasoning with Large Language Models","summary":" This work addresses the 3D situated reasoning task which aims to answer\nquestions given egocentric observations in a 3D environment. The task remains\nchallenging as it requires comprehensive 3D perception and complex reasoning\nskills. End-to-end models trained on supervised data for 3D situated reasoning\nsuffer from data scarcity and generalization ability. Inspired by the recent\nsuccess of leveraging large language models (LLMs) for visual reasoning, we\npropose LLM-TPC, a novel framework that leverages the planning, tool usage, and\nreflection capabilities of LLMs through a ThinkProgram-reCtify loop. The Think\nphase first decomposes the compositional question into a sequence of steps, and\nthen the Program phase grounds each step to a piece of code and calls carefully\ndesigned 3D visual perception modules. Finally, the Rectify phase adjusts the\nplan and code if the program fails to execute. Experiments and analysis on the\nSQA3D benchmark demonstrate the effectiveness, interpretability and robustness\nof our method. Our code is publicly available at\nhttps://qingrongh.github.io/LLM-TPC/.\n","authors":["Qingrong He","Kejun Lin","Shizhe Chen","Anwen Hu","Qin Jin"],"pdf_url":"https://arxiv.org/pdf/2404.14705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14704v1","updated":"2024-04-23T03:17:36Z","published":"2024-04-23T03:17:36Z","title":"Unsupervised Domain Adaptation Architecture Search with Self-Training\n for Land Cover Mapping","summary":" Unsupervised domain adaptation (UDA) is a challenging open problem in land\ncover mapping. Previous studies show encouraging progress in addressing\ncross-domain distribution shifts on remote sensing benchmarks for land cover\nmapping. 
The existing works are mainly built on large neural network\narchitectures, which makes them resource-hungry systems, limiting their\npractical impact for many real-world applications in resource-constrained\nenvironments. Thus, we proposed a simple yet effective framework to search for\nlightweight neural networks automatically for land cover mapping tasks under\ndomain shifts. This is achieved by integrating Markov random field neural\narchitecture search (MRF-NAS) into a self-training UDA framework to search for\nefficient and effective networks under a limited computation budget. This is\nthe first attempt to combine NAS with self-training UDA as a single framework\nfor land cover mapping. We also investigate two different pseudo-labelling\napproaches (confidence-based and energy-based) in self-training scheme.\nExperimental results on two recent datasets (OpenEarthMap & FLAIR #1) for\nremote sensing UDA demonstrate a satisfactory performance. With only less than\n2M parameters and 30.16 GFLOPs, the best-discovered lightweight network reaches\nstate-of-the-art performance on the regional target domain of OpenEarthMap\n(59.38% mIoU) and the considered target domain of FLAIR #1 (51.19% mIoU). The\ncode is at\nhttps://github.com/cliffbb/UDA-NAS}{https://github.com/cliffbb/UDA-NAS.\n","authors":["Clifford Broni-Bediako","Junshi Xia","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2404.14704v1.pdf","comment":"Accepted at CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.13677v2","updated":"2024-04-23T03:12:24Z","published":"2024-04-21T14:36:57Z","title":"A Dataset and Model for Realistic License Plate Deblurring","summary":" Vehicle license plate recognition is a crucial task in intelligent traffic\nmanagement systems. However, the challenge of achieving accurate recognition\npersists due to motion blur from fast-moving vehicles. Despite the widespread\nuse of image synthesis approaches in existing deblurring and recognition\nalgorithms, their effectiveness in real-world scenarios remains unproven. To\naddress this, we introduce the first large-scale license plate deblurring\ndataset named License Plate Blur (LPBlur), captured by a dual-camera system and\nprocessed through a post-processing pipeline to avoid misalignment issues.\nThen, we propose a License Plate Deblurring Generative Adversarial Network\n(LPDGAN) to tackle the license plate deblurring: 1) a Feature Fusion Module to\nintegrate multi-scale latent codes; 2) a Text Reconstruction Module to restore\nstructure through textual modality; 3) a Partition Discriminator Module to\nenhance the model's perception of details in each letter. Extensive experiments\nvalidate the reliability of the LPBlur dataset for both model training and\ntesting, showcasing that our proposed model outperforms other state-of-the-art\nmotion deblurring methods in realistic license plate deblurring scenarios. 
The\ndataset and code are available at https://github.com/haoyGONG/LPDGAN.\n","authors":["Haoyan Gong","Yuzheng Feng","Zhenrong Zhang","Xianxu Hou","Jingxin Liu","Siqi Huang","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2404.13677v2.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2404.14696v1","updated":"2024-04-23T02:54:12Z","published":"2024-04-23T02:54:12Z","title":"Adaptive Prompt Learning with Negative Textual Semantics and Uncertainty\n Modeling for Universal Multi-Source Domain Adaptation","summary":" Universal Multi-source Domain Adaptation (UniMDA) transfers knowledge from\nmultiple labeled source domains to an unlabeled target domain under domain\nshifts (different data distribution) and class shifts (unknown target classes).\nExisting solutions focus on excavating image features to detect unknown\nsamples, ignoring abundant information contained in textual semantics. In this\npaper, we propose an Adaptive Prompt learning with Negative textual semantics\nand uncErtainty modeling method based on Contrastive Language-Image\nPre-training (APNE-CLIP) for UniMDA classification tasks. Concretely, we\nutilize the CLIP with adaptive prompts to leverage textual information of class\nsemantics and domain representations, helping the model identify unknown\nsamples and address domain shifts. Additionally, we design a novel global\ninstance-level alignment objective by utilizing negative textual semantics to\nachieve more precise image-text pair alignment. Furthermore, we propose an\nenergy-based uncertainty modeling strategy to enlarge the margin distance\nbetween known and unknown samples. Extensive experiments demonstrate the\nsuperiority of our proposed method.\n","authors":["Yuxiang Yang","Lu Wen","Yuanyuan Xu","Jiliu Zhou","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2404.14696v1.pdf","comment":"Accepted by ICME2024"},{"id":"http://arxiv.org/abs/2308.04956v3","updated":"2024-04-23T02:51:28Z","published":"2023-08-09T13:41:30Z","title":"Improved Cryo-EM Pose Estimation and 3D Classification through\n Latent-Space Disentanglement","summary":" Due to the extremely low signal-to-noise ratio (SNR) and unknown poses\n(projection angles and image shifts) in cryo-electron microscopy (cryo-EM)\nexperiments, reconstructing 3D volumes from 2D images is very challenging. In\naddition to these challenges, heterogeneous cryo-EM reconstruction requires\nconformational classification. In popular cryo-EM reconstruction algorithms,\nposes and conformation classification labels must be predicted for every input\ncryo-EM image, which can be computationally costly for large datasets. An\nemerging class of methods adopted the amortized inference approach. In these\nmethods, only a subset of the input dataset is needed to train neural networks\nfor the estimation of poses and conformations. Once trained, these neural\nnetworks can make pose/conformation predictions and 3D reconstructions at low\ncost for the entire dataset during inference. Unfortunately, when facing\nheterogeneous reconstruction tasks, it is hard for current\namortized-inference-based methods to effectively estimate the conformational\ndistribution and poses from entangled latent variables. Here, we propose a\nself-supervised variational autoencoder architecture called \"HetACUMN\" based on\namortized inference. We employed an auxiliary conditional pose prediction task\nby inverting the order of encoder-decoder to explicitly enforce the\ndisentanglement of conformation and pose predictions. 
Results on simulated\ndatasets show that HetACUMN generated more accurate conformational\nclassifications than other amortized or non-amortized methods. Furthermore, we\nshow that HetACUMN is capable of performing heterogeneous 3D reconstructions of\na real experimental dataset.\n","authors":["Weijie Chen","Yuhang Wang","Lin Yao"],"pdf_url":"https://arxiv.org/pdf/2308.04956v3.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2404.14693v1","updated":"2024-04-23T02:50:38Z","published":"2024-04-23T02:50:38Z","title":"Double Privacy Guard: Robust Traceable Adversarial Watermarking against\n Face Recognition","summary":" The wide deployment of Face Recognition (FR) systems poses risks of privacy\nleakage. One countermeasure to address this issue is adversarial attacks, which\ndeceive malicious FR searches but simultaneously interfere the normal identity\nverification of trusted authorizers. In this paper, we propose the first Double\nPrivacy Guard (DPG) scheme based on traceable adversarial watermarking. DPG\nemploys a one-time watermark embedding to deceive unauthorized FR models and\nallows authorizers to perform identity verification by extracting the\nwatermark. Specifically, we propose an information-guided adversarial attack\nagainst FR models. The encoder embeds an identity-specific watermark into the\ndeep feature space of the carrier, guiding recognizable features of the image\nto deviate from the source identity. We further adopt a collaborative\nmeta-optimization strategy compatible with sub-tasks, which regularizes the\njoint optimization direction of the encoder and decoder. This strategy enhances\nthe representation of universal carrier features, mitigating multi-objective\noptimization conflicts in watermarking. Experiments confirm that DPG achieves\nsignificant attack success rates and traceability accuracy on state-of-the-art\nFR models, exhibiting remarkable robustness that outperforms the existing\nprivacy protection methods using adversarial attacks and deep watermarking, or\nsimple combinations of the two. Our work potentially opens up new insights into\nproactive protection for FR privacy.\n","authors":["Yunming Zhang","Dengpan Ye","Sipeng Shen","Caiyun Xie","Ziyi Liu","Jiacheng Deng","Long Tang"],"pdf_url":"https://arxiv.org/pdf/2404.14693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13903v2","updated":"2024-04-23T02:33:48Z","published":"2024-04-22T06:25:17Z","title":"Accelerating Image Generation with Sub-path Linear Approximation Model","summary":" Diffusion models have significantly advanced the state of the art in image,\naudio, and video generation tasks. However, their applications in practical\nscenarios are hindered by slow inference speed. Drawing inspiration from the\napproximation strategies utilized in consistency models, we propose the\nSub-path Linear Approximation Model (SLAM), which accelerates diffusion models\nwhile maintaining high-quality image generation. SLAM treats the PF-ODE\ntrajectory as a series of PF-ODE sub-paths divided by sampled points, and\nharnesses sub-path linear (SL) ODEs to form a progressive and continuous error\nestimation along each individual PF-ODE sub-path. The optimization on such\nSL-ODEs allows SLAM to construct denoising mappings with smaller cumulative\napproximated errors. An efficient distillation method is also developed to\nfacilitate the incorporation of more advanced diffusion models, such as latent\ndiffusion models. 
Our extensive experimental results demonstrate that SLAM\nachieves an efficient training regimen, requiring only 6 A100 GPU days to\nproduce a high-quality generative model capable of 2 to 4-step generation with\nhigh performance. Comprehensive evaluations on LAION, MS COCO 2014, and MS COCO\n2017 datasets also illustrate that SLAM surpasses existing acceleration methods\nin few-step generation tasks, achieving state-of-the-art performance both on\nFID and the quality of the generated images.\n","authors":["Chen Xu","Tianhui Song","Weixin Feng","Xubin Li","Tiezheng Ge","Bo Zheng","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.13903v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14687v1","updated":"2024-04-23T02:32:57Z","published":"2024-04-23T02:32:57Z","title":"Pegasus-v1 Technical Report","summary":" This technical report introduces Pegasus-1, a multimodal language model\nspecialized in video content understanding and interaction through natural\nlanguage. Pegasus-1 is designed to address the unique challenges posed by video\ndata, such as interpreting spatiotemporal information, to offer nuanced video\ncontent comprehension across various lengths. This technical report overviews\nPegasus-1's architecture, training strategies, and its performance in\nbenchmarks on video conversation, zero-shot video question answering, and video\nsummarization. We also explore qualitative characteristics of Pegasus-1 ,\ndemonstrating its capabilities as well as its limitations, in order to provide\nreaders a balanced view of its current state and its future direction.\n","authors":["Raehyuk Jung","Hyojun Go","Jaehyuk Yi","Jiho Jang","Daniel Kim","Jay Suh","Aiden Lee","Cooper Han","Jae Lee","Jeff Kim","Jin-Young Kim","Junwan Kim","Kyle Park","Lucas Lee","Mars Ha","Minjoon Seo","Abraham Jo","Ed Park","Hassan Kianinejad","SJ Kim","Tony Moon","Wade Jeong","Andrei Popescu","Esther Kim","EK Yoon","Genie Heo","Henry Choi","Jenna Kang","Kevin Han","Noah Seo","Sunny Nguyen","Ryan Won","Yeonhoo Park","Anthony Giuliani","Dave Chung","Hans Yoon","James Le","Jenny Ahn","June Lee","Maninder Saini","Meredith Sanders","Soyoung Lee","Sue Kim","Travis Couture"],"pdf_url":"https://arxiv.org/pdf/2404.14687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14678v1","updated":"2024-04-23T02:06:10Z","published":"2024-04-23T02:06:10Z","title":"3DBench: A Scalable 3D Benchmark and Instruction-Tuning Dataset","summary":" Evaluating the performance of Multi-modal Large Language Models (MLLMs),\nintegrating both point cloud and language, presents significant challenges. The\nlack of a comprehensive assessment hampers determining whether these models\ntruly represent advancements, thereby impeding further progress in the field.\nCurrent evaluations heavily rely on classification and caption tasks, falling\nshort in providing a thorough assessment of MLLMs. A pressing need exists for a\nmore sophisticated evaluation method capable of thoroughly analyzing the\nspatial understanding and expressive capabilities of these models. To address\nthese issues, we introduce a scalable 3D benchmark, accompanied by a\nlarge-scale instruction-tuning dataset known as 3DBench, providing an\nextensible platform for a comprehensive evaluation of MLLMs. Specifically, we\nestablish the benchmark that spans a wide range of spatial and semantic scales,\nfrom object-level to scene-level, addressing both perception and planning\ntasks. 
Furthermore, we present a rigorous pipeline for automatically\nconstructing scalable 3D instruction-tuning datasets, covering 10 diverse\nmulti-modal tasks with more than 0.23 million QA pairs generated in total.\nThorough experiments evaluating trending MLLMs, comparisons against existing\ndatasets, and variations of training protocols demonstrate the superiority of\n3DBench, offering valuable insights into current limitations and potential\nresearch directions.\n","authors":["Junjie Zhang","Tianci Hu","Xiaoshui Huang","Yongshun Gong","Dan Zeng"],"pdf_url":"https://arxiv.org/pdf/2404.14678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14676v1","updated":"2024-04-23T02:04:53Z","published":"2024-04-23T02:04:53Z","title":"DreamPBR: Text-driven Generation of High-resolution SVBRDF with\n Multi-modal Guidance","summary":" Prior material creation methods had limitations in producing diverse results\nmainly because reconstruction-based methods relied on real-world measurements\nand generation-based methods were trained on relatively small material\ndatasets. To address these challenges, we propose DreamPBR, a novel\ndiffusion-based generative framework designed to create spatially-varying\nappearance properties guided by text and multi-modal controls, providing high\ncontrollability and diversity in material generation. Key to achieving diverse\nand high-quality PBR material generation lies in integrating the capabilities\nof recent large-scale vision-language models trained on billions of text-image\npairs, along with material priors derived from hundreds of PBR material\nsamples. We utilize a novel material Latent Diffusion Model (LDM) to establish\nthe mapping between albedo maps and the corresponding latent space. The latent\nrepresentation is then decoded into full SVBRDF parameter maps using a\nrendering-aware PBR decoder. Our method supports tileable generation through\nconvolution with circular padding. Furthermore, we introduce a multi-modal\nguidance module, which includes pixel-aligned guidance, style image guidance,\nand 3D shape guidance, to enhance the control capabilities of the material LDM.\nWe demonstrate the effectiveness of DreamPBR in material creation, showcasing\nits versatility and user-friendliness on a wide range of controllable\ngeneration and editing applications.\n","authors":["Linxuan Xin","Zheng Zhang","Jinfu Wei","Ge Li","Duan Gao"],"pdf_url":"https://arxiv.org/pdf/2404.14676v1.pdf","comment":"16 pages, 17 figures"},{"id":"http://arxiv.org/abs/2404.14674v1","updated":"2024-04-23T02:00:58Z","published":"2024-04-23T02:00:58Z","title":"HOIN: High-Order Implicit Neural Representations","summary":" Implicit neural representations (INR) suffer from worsening spectral bias,\nwhich results in overly smooth solutions to the inverse problem. To deal with\nthis problem, we propose a universal framework for processing inverse problems\ncalled \\textbf{High-Order Implicit Neural Representations (HOIN)}. By refining\nthe traditional cascade structure to foster high-order interactions among\nfeatures, HOIN enhances the model's expressive power and mitigates spectral\nbias through its neural tangent kernel's (NTK) strong diagonal properties,\naccelerating and optimizing inverse problem resolution. By analyzing the\nmodel's expression space, high-order derivatives, and the NTK matrix, we\ntheoretically validate the feasibility of HOIN. 
HOIN realizes 1 to 3 dB\nimprovements in most inverse problems, establishing a new state-of-the-art\nrecovery quality and training efficiency, thus providing a new general paradigm\nfor INR and paving the way for it to solve the inverse problem.\n","authors":["Yang Chen","Ruituo Wu","Yipeng Liu","Ce Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.14674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14671v1","updated":"2024-04-23T01:55:09Z","published":"2024-04-23T01:55:09Z","title":"LaneCorrect: Self-supervised Lane Detection","summary":" Lane detection has evolved highly functional autonomous driving system to\nunderstand driving scenes even under complex environments. In this paper, we\nwork towards developing a generalized computer vision system able to detect\nlanes without using any annotation. We make the following contributions: (i) We\nillustrate how to perform unsupervised 3D lane segmentation by leveraging the\ndistinctive intensity of lanes on the LiDAR point cloud frames, and then obtain\nthe noisy lane labels in the 2D plane by projecting the 3D points; (ii) We\npropose a novel self-supervised training scheme, dubbed LaneCorrect, that\nautomatically corrects the lane label by learning geometric consistency and\ninstance awareness from the adversarial augmentations; (iii) With the\nself-supervised pre-trained model, we distill to train a student network for\narbitrary target lane (e.g., TuSimple) detection without any human labels; (iv)\nWe thoroughly evaluate our self-supervised method on four major lane detection\nbenchmarks (including TuSimple, CULane, CurveLanes and LLAMAS) and demonstrate\nexcellent performance compared with existing supervised counterpart, whilst\nshowing more effective results on alleviating the domain gap, i.e., training on\nCULane and test on TuSimple.\n","authors":["Ming Nie","Xinyue Cai","Hang Xu","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.14671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14667v1","updated":"2024-04-23T01:51:58Z","published":"2024-04-23T01:51:58Z","title":"3DFlowRenderer: One-shot Face Re-enactment via Dense 3D Facial Flow\n Estimation","summary":" Performing facial expression transfer under one-shot setting has been\nincreasing in popularity among research community with a focus on precise\ncontrol of expressions. Existing techniques showcase compelling results in\nperceiving expressions, but they lack robustness with extreme head poses. They\nalso struggle to accurately reconstruct background details, thus hindering the\nrealism. In this paper, we propose a novel warping technology which integrates\nthe advantages of both 2D and 3D methods to achieve robust face re-enactment.\nWe generate dense 3D facial flow fields in feature space to warp an input image\nbased on target expressions without depth information. This enables explicit 3D\ngeometric control for re-enacting misaligned source and target faces. We\nregularize the motion estimation capability of the 3D flow prediction network\nthrough proposed \"Cyclic warp loss\" by converting warped 3D features back into\n2D RGB space. To ensure the generation of finer facial region with\nnatural-background, our framework only renders the facial foreground region\nfirst and learns to inpaint the blank area which needs to be filled due to\nsource face translation, thus reconstructing the detailed background without\nany unwanted pixel motion. 
Extensive evaluation reveals that our method\noutperforms state-of-the-art techniques in rendering artifact-free facial\nimages.\n","authors":["Siddharth Nijhawan","Takuya Yashima","Tamaki Kojima"],"pdf_url":"https://arxiv.org/pdf/2404.14667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02162v2","updated":"2024-04-23T01:48:32Z","published":"2023-04-04T23:27:02Z","title":"Learning to Recover Spectral Reflectance from RGB Images","summary":" This paper tackles spectral reflectance recovery (SRR) from RGB images. Since\ncapturing ground-truth spectral reflectance and camera spectral sensitivity are\nchallenging and costly, most existing approaches are trained on synthetic\nimages and utilize the same parameters for all unseen testing images, which are\nsuboptimal especially when the trained models are tested on real images because\nthey never exploit the internal information of the testing images. To address\nthis issue, we adopt a self-supervised meta-auxiliary learning (MAXL) strategy\nthat fine-tunes the well-trained network parameters with each testing image to\ncombine external with internal information. To the best of our knowledge, this\nis the first work that successfully adapts the MAXL strategy to this problem.\nInstead of relying on naive end-to-end training, we also propose a novel\narchitecture that integrates the physical relationship between the spectral\nreflectance and the corresponding RGB images into the network based on our\nmathematical analysis. Besides, since the spectral reflectance of a scene is\nindependent to its illumination while the corresponding RGB images are not, we\nrecover the spectral reflectance of a scene from its RGB images captured under\nmultiple illuminations to further reduce the unknown. Qualitative and\nquantitative evaluations demonstrate the effectiveness of our proposed network\nand of the MAXL. Our code and data are available at\nhttps://github.com/Dong-Huo/SRR-MAXL.\n","authors":["Dong Huo","Jian Wang","Yiming Qian","Yee-Hong Yang"],"pdf_url":"https://arxiv.org/pdf/2304.02162v2.pdf","comment":"IEEE Transactions on Image Processing (TIP), 2024"},{"id":"http://arxiv.org/abs/2404.14661v1","updated":"2024-04-23T01:45:55Z","published":"2024-04-23T01:45:55Z","title":"First Mapping the Canopy Height of Primeval Forests in the Tallest Tree\n Area of Asia","summary":" We have developed the world's first canopy height map of the distribution\narea of world-level giant trees. This mapping is crucial for discovering more\nindividual and community world-level giant trees, and for analyzing and\nquantifying the effectiveness of biodiversity conservation measures in the\nYarlung Tsangpo Grand Canyon (YTGC) National Nature Reserve. We proposed a\nmethod to map the canopy height of the primeval forest within the world-level\ngiant tree distribution area by using a spaceborne LiDAR fusion satellite\nimagery (Global Ecosystem Dynamics Investigation (GEDI), ICESat-2, and\nSentinel-2) driven deep learning modeling. And we customized a pyramid\nreceptive fields depth separable CNN (PRFXception). PRFXception, a CNN\narchitecture specifically customized for mapping primeval forest canopy height\nto infer the canopy height at the footprint level of GEDI and ICESat-2 from\nSentinel-2 optical imagery with a 10-meter spatial resolution. We conducted a\nfield survey of 227 permanent plots using a stratified sampling method and\nmeasured several giant trees using UAV-LS. 
The predicted canopy height was\ncompared with ICESat-2 and GEDI validation data (RMSE =7.56 m, MAE=6.07 m,\nME=-0.98 m, R^2=0.58 m), UAV-LS point clouds (RMSE =5.75 m, MAE =3.72 m, ME =\n0.82 m, R^2= 0.65 m), and ground survey data (RMSE = 6.75 m, MAE = 5.56 m, ME=\n2.14 m, R^2=0.60 m). We mapped the potential distribution map of world-level\ngiant trees and discovered two previously undetected giant tree communities\nwith an 89% probability of having trees 80-100 m tall, potentially taller than\nAsia's tallest tree. This paper provides scientific evidence confirming\nsoutheastern Tibet--northwestern Yunnan as the fourth global distribution\ncenter of world-level giant trees initiatives and promoting the inclusion of\nthe YTGC giant tree distribution area within the scope of China's national park\nconservation.\n","authors":["Guangpeng Fan","Fei Yan","Xiangquan Zeng","Qingtao Xu","Ruoyoulan Wang","Binghong Zhang","Jialing Zhou","Liangliang Nan","Jinhu Wang","Zhiwei Zhang","Jia Wang"],"pdf_url":"https://arxiv.org/pdf/2404.14661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15841v4","updated":"2024-04-23T01:40:57Z","published":"2023-11-27T14:07:13Z","title":"Learning Disentangled Identifiers for Action-Customized Text-to-Image\n Generation","summary":" This study focuses on a novel task in text-to-image (T2I) generation, namely\naction customization. The objective of this task is to learn the co-existing\naction from limited data and generalize it to unseen humans or even animals.\nExperimental results show that existing subject-driven customization methods\nfail to learn the representative characteristics of actions and struggle in\ndecoupling actions from context features, including appearance. To overcome the\npreference for low-level features and the entanglement of high-level features,\nwe propose an inversion-based method Action-Disentangled Identifier (ADI) to\nlearn action-specific identifiers from the exemplar images. ADI first expands\nthe semantic conditioning space by introducing layer-wise identifier tokens,\nthereby increasing the representational richness while distributing the\ninversion across different features. Then, to block the inversion of\naction-agnostic features, ADI extracts the gradient invariance from the\nconstructed sample triples and masks the updates of irrelevant channels. To\ncomprehensively evaluate the task, we present an ActionBench that includes a\nvariety of actions, each accompanied by meticulously selected samples. Both\nquantitative and qualitative results show that our ADI outperforms existing\nbaselines in action-customized T2I generation. Our project page is at\nhttps://adi-t2i.github.io/ADI.\n","authors":["Siteng Huang","Biao Gong","Yutong Feng","Xi Chen","Yuqian Fu","Yu Liu","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15841v4.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.14657v1","updated":"2024-04-23T01:34:20Z","published":"2024-04-23T01:34:20Z","title":"Progressive Token Length Scaling in Transformer Encoders for Efficient\n Universal Segmentation","summary":" A powerful architecture for universal segmentation relies on transformers\nthat encode multi-scale image features and decode object queries into mask\npredictions. With efficiency being a high priority for scaling such models, we\nobserved that the state-of-the-art method Mask2Former uses ~50% of its compute\nonly on the transformer encoder. 
This is due to the retention of a full-length\ntoken-level representation of all backbone feature scales at each encoder\nlayer. With this observation, we propose a strategy termed PROgressive Token\nLength SCALing for Efficient transformer encoders (PRO-SCALE) that can be\nplugged-in to the Mask2Former-style segmentation architectures to significantly\nreduce the computational cost. The underlying principle of PRO-SCALE is:\nprogressively scale the length of the tokens with the layers of the encoder.\nThis allows PRO-SCALE to reduce computations by a large margin with minimal\nsacrifice in performance (~52% GFLOPs reduction with no drop in performance on\nCOCO dataset). We validate our framework on multiple public benchmarks.\n","authors":["Abhishek Aich","Yumin Suh","Samuel Schulter","Manmohan Chandraker"],"pdf_url":"https://arxiv.org/pdf/2404.14657v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12368v2","updated":"2024-04-23T01:21:58Z","published":"2024-04-18T17:50:23Z","title":"Gradient-Regularized Out-of-Distribution Detection","summary":" One of the challenges for neural networks in real-life applications is the\noverconfident errors these models make when the data is not from the original\ntraining distribution.\n Addressing this issue is known as Out-of-Distribution (OOD) detection.\n Many state-of-the-art OOD methods employ an auxiliary dataset as a surrogate\nfor OOD data during training to achieve improved performance.\n However, these methods fail to fully exploit the local information embedded\nin the auxiliary dataset.\n In this work, we propose the idea of leveraging the information embedded in\nthe gradient of the loss function during training to enable the network to not\nonly learn a desired OOD score for each sample but also to exhibit similar\nbehavior in a local neighborhood around each sample.\n We also develop a novel energy-based sampling method to allow the network to\nbe exposed to more informative OOD samples during the training phase. This is\nespecially important when the auxiliary dataset is large. We demonstrate the\neffectiveness of our method through extensive experiments on several OOD\nbenchmarks, improving the existing state-of-the-art FPR95 by 4% on our ImageNet\nexperiment.\n We further provide a theoretical analysis through the lens of certified\nrobustness and Lipschitz analysis to showcase the theoretical foundation of our\nwork. We will publicly release our code after the review process.\n","authors":["Sina Sharifi","Taha Entesari","Bardia Safaei","Vishal M. Patel","Mahyar Fazlyab"],"pdf_url":"https://arxiv.org/pdf/2404.12368v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.14653v1","updated":"2024-04-23T01:19:19Z","published":"2024-04-23T01:19:19Z","title":"Machine Vision Based Assessment of Fall Color Changes in Apple Trees:\n Exploring Relationship with Leaf Nitrogen Concentration","summary":" Apple trees being deciduous trees, shed leaves each year which is preceded by\nthe change in color of leaves from green to yellow (also known as senescence)\nduring the fall season. The rate and timing of color change are affected by the\nnumber of factors including nitrogen (N) deficiencies. The green color of\nleaves is highly dependent on the chlorophyll content, which in turn depends on\nthe nitrogen concentration in the leaves. The assessment of the leaf color can\ngive vital information on the nutrient status of the tree. 
The use of a machine\nvision based system to capture and quantify these timings and changes in leaf\ncolor can be a great tool for that purpose.\n \\par This study is based on data collected during the fall of 2021 and 2023\nat a commercial orchard using a ground-based stereo-vision sensor for five\nweeks. The point cloud obtained from the sensor was segmented to get just the\ntree in the foreground. The study involved the segmentation of the trees in a\nnatural background using point cloud data and quantification of the color using\na custom-defined metric, \\textit{yellowness index}, varying from $-1$ to $+1$\n($-1$ being completely green and $+1$ being completely yellow), which gives the\nproportion of yellow leaves on a tree. The performance of K-means based\nalgorithm and gradient boosting algorithm were compared for \\textit{yellowness\nindex} calculation. The segmentation method proposed in the study was able to\nestimate the \\textit{yellowness index} on the trees with $R^2 = 0.72$. The\nresults showed that the metric was able to capture the gradual color transition\nfrom green to yellow over the study duration. It was also observed that the\ntrees with lower nitrogen showed the color transition to yellow earlier than\nthe trees with higher nitrogen. The onset of color transition during both years\naligned with the $29^{th}$ week post-full bloom.\n","authors":["Achyut Paudel","Jostan Brown","Priyanka Upadhyaya","Atif Bilal Asad","Safal Kshetri","Manoj Karkee","Joseph R. Davidson","Cindy Grimm","Ashley Thompson"],"pdf_url":"https://arxiv.org/pdf/2404.14653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14634v1","updated":"2024-04-23T00:18:00Z","published":"2024-04-23T00:18:00Z","title":"UPose3D: Uncertainty-Aware 3D Human Pose Estimation with Cross-View and\n Temporal Cues","summary":" We introduce UPose3D, a novel approach for multi-view 3D human pose\nestimation, addressing challenges in accuracy and scalability. Our method\nadvances existing pose estimation frameworks by improving robustness and\nflexibility without requiring direct 3D annotations. At the core of our method,\na pose compiler module refines predictions from a 2D keypoints estimator that\noperates on a single image by leveraging temporal and cross-view information.\nOur novel cross-view fusion strategy is scalable to any number of cameras,\nwhile our synthetic data generation strategy ensures generalization across\ndiverse actors, scenes, and viewpoints. Finally, UPose3D leverages the\nprediction uncertainty of both the 2D keypoint estimator and the pose compiler\nmodule. This provides robustness to outliers and noisy data, resulting in\nstate-of-the-art performance in out-of-distribution settings. 
In addition, for\nin-distribution settings, UPose3D yields a performance rivaling methods that\nrely on 3D annotated data, while being the state-of-the-art among methods\nrelying only on 2D supervision.\n","authors":["Vandad Davoodnia","Saeed Ghorbani","Marc-André Carbonneau","Alexandre Messier","Ali Etemad"],"pdf_url":"https://arxiv.org/pdf/2404.14634v1.pdf","comment":"18 pages, 12 figures"}]},"2024-04-21T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.13798v1","updated":"2024-04-21T23:34:45Z","published":"2024-04-21T23:34:45Z","title":"Enforcing Conditional Independence for Fair Representation Learning and\n Causal Image Generation","summary":" Conditional independence (CI) constraints are critical for defining and\nevaluating fairness in machine learning, as well as for learning unconfounded\nor causal representations. Traditional methods for ensuring fairness either\nblindly learn invariant features with respect to a protected variable (e.g.,\nrace when classifying sex from face images) or enforce CI relative to the\nprotected attribute only on the model output (e.g., the sex label). Neither of\nthese methods are effective in enforcing CI in high-dimensional feature spaces.\nIn this paper, we focus on a nascent approach characterizing the CI constraint\nin terms of two Jensen-Shannon divergence terms, and we extend it to\nhigh-dimensional feature spaces using a novel dynamic sampling strategy. In\ndoing so, we introduce a new training paradigm that can be applied to any\nencoder architecture. We are able to enforce conditional independence of the\ndiffusion autoencoder latent representation with respect to any protected\nattribute under the equalized odds constraint and show that this approach\nenables causal image generation with controllable latent spaces. Our\nexperimental results demonstrate that our approach can achieve high accuracy on\ndownstream tasks while upholding equality of odds.\n","authors":["Jensen Hwa","Qingyu Zhao","Aditya Lahiri","Adnan Masood","Babak Salimi","Ehsan Adeli"],"pdf_url":"https://arxiv.org/pdf/2404.13798v1.pdf","comment":"To appear at the 2024 IEEE CVPR Workshop on Fair, Data-Efficient, and\n Trusted Computer Vision"},{"id":"http://arxiv.org/abs/2404.13791v1","updated":"2024-04-21T23:01:08Z","published":"2024-04-21T23:01:08Z","title":"Universal Fingerprint Generation: Controllable Diffusion Model with\n Multimodal Conditions","summary":" The utilization of synthetic data for fingerprint recognition has garnered\nincreased attention due to its potential to alleviate privacy concerns\nsurrounding sensitive biometric data. However, current methods for generating\nfingerprints have limitations in creating impressions of the same finger with\nuseful intra-class variations. To tackle this challenge, we present GenPrint, a\nframework to produce fingerprint images of various types while maintaining\nidentity and offering humanly understandable control over different appearance\nfactors such as fingerprint class, acquisition type, sensor device, and quality\nlevel. Unlike previous fingerprint generation approaches, GenPrint is not\nconfined to replicating style characteristics from the training dataset alone:\nit enables the generation of novel styles from unseen devices without requiring\nadditional fine-tuning. To accomplish these objectives, we developed GenPrint\nusing latent diffusion models with multimodal conditions (text and image) for\nconsistent generation of style and identity. 
Our experiments leverage a variety\nof publicly available datasets for training and evaluation. Results demonstrate\nthe benefits of GenPrint in terms of identity preservation, explainable\ncontrol, and universality of generated images. Importantly, the\nGenPrint-generated images yield comparable or even superior accuracy to models\ntrained solely on real data and further enhances performance when augmenting\nthe diversity of existing real fingerprint datasets.\n","authors":["Steven A. Grosz","Anil K. Jain"],"pdf_url":"https://arxiv.org/pdf/2404.13791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13788v1","updated":"2024-04-21T22:33:57Z","published":"2024-04-21T22:33:57Z","title":"AnyPattern: Towards In-context Image Copy Detection","summary":" This paper explores in-context learning for image copy detection (ICD), i.e.,\nprompting an ICD model to identify replicated images with new tampering\npatterns without the need for additional training. The prompts (or the\ncontexts) are from a small set of image-replica pairs that reflect the new\npatterns and are used at inference time. Such in-context ICD has good realistic\nvalue, because it requires no fine-tuning and thus facilitates fast reaction\nagainst the emergence of unseen patterns. To accommodate the \"seen\n$\\rightarrow$ unseen\" generalization scenario, we construct the first\nlarge-scale pattern dataset named AnyPattern, which has the largest number of\ntamper patterns ($90$ for training and $10$ for testing) among all the existing\nones. We benchmark AnyPattern with popular ICD methods and reveal that existing\nmethods barely generalize to novel tamper patterns. We further propose a simple\nin-context ICD method named ImageStacker. ImageStacker learns to select the\nmost representative image-replica pairs and employs them as the pattern prompts\nin a stacking manner (rather than the popular concatenation manner).\nExperimental results show (1) training with our large-scale dataset\nsubstantially benefits pattern generalization ($+26.66 \\%$ $\\mu AP$), (2) the\nproposed ImageStacker facilitates effective in-context ICD (another round of\n$+16.75 \\%$ $\\mu AP$), and (3) AnyPattern enables in-context ICD, i.e. without\nsuch a large-scale dataset, in-context learning does not emerge even with our\nImageStacker. The project (including the proposed dataset AnyPattern and the\ncode for ImageStacker) is publicly available at https://anypattern.github.io\nunder the MIT Licence.\n","authors":["Wenhao Wang","Yifan Sun","Zhentao Tan","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2404.13788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13784v1","updated":"2024-04-21T21:30:17Z","published":"2024-04-21T21:30:17Z","title":"Iteratively Prompting Multimodal LLMs to Reproduce Natural and\n AI-Generated Images","summary":" With the digital imagery landscape rapidly evolving, image stocks and\nAI-generated image marketplaces have become central to visual media.\nTraditional stock images now exist alongside innovative platforms that trade in\nprompts for AI-generated visuals, driven by sophisticated APIs like DALL-E 3\nand Midjourney. This paper studies the possibility of employing multi-modal\nmodels with enhanced visual understanding to mimic the outputs of these\nplatforms, introducing an original attack strategy. 
Our method leverages\nfine-tuned CLIP models, a multi-label classifier, and the descriptive\ncapabilities of GPT-4V to create prompts that generate images similar to those\navailable in marketplaces and from premium stock image providers, yet at a\nmarkedly lower expense. In presenting this strategy, we aim to spotlight a new\nclass of economic and security considerations within the realm of digital\nimagery. Our findings, supported by both automated metrics and human\nassessment, reveal that comparable visual content can be produced for a\nfraction of the prevailing market prices ($0.23 - $0.27 per image), emphasizing\nthe need for awareness and strategic discussions about the integrity of digital\nmedia in an increasingly AI-integrated landscape. Our work also contributes to\nthe field by assembling a dataset consisting of approximately 19 million\nprompt-image pairs generated by the popular Midjourney platform, which we plan\nto release publicly.\n","authors":["Ali Naseh","Katherine Thai","Mohit Iyyer","Amir Houmansadr"],"pdf_url":"https://arxiv.org/pdf/2404.13784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13770v1","updated":"2024-04-21T20:45:18Z","published":"2024-04-21T20:45:18Z","title":"EncodeNet: A Framework for Boosting DNN Accuracy with Entropy-driven\n Generalized Converting Autoencoder","summary":" Image classification is a fundamental task in computer vision, and the quest\nto enhance DNN accuracy without inflating model size or latency remains a\npressing concern. We make a couple of advances in this regard, leading to a\nnovel EncodeNet design and training framework. The first advancement involves\nConverting Autoencoders, a novel approach that transforms images into an\neasy-to-classify image of its class. Our prior work that applied the Converting\nAutoencoder and a simple classifier in tandem achieved moderate accuracy over\nsimple datasets, such as MNIST and FMNIST. However, on more complex datasets\nlike CIFAR-10, the Converting Autoencoder has a large reconstruction loss,\nmaking it unsuitable for enhancing DNN accuracy. To address these limitations,\nwe generalize the design of Converting Autoencoders by leveraging a larger\nclass of DNNs, those with architectures comprising feature extraction layers\nfollowed by classification layers. We incorporate a generalized algorithmic\ndesign of the Converting Autoencoder and intraclass clustering to identify\nrepresentative images, leading to optimized image feature learning. Next, we\ndemonstrate the effectiveness of our EncodeNet design and training framework,\nimproving the accuracy of well-trained baseline DNNs while maintaining the\noverall model size. EncodeNet's building blocks comprise the trained encoder\nfrom our generalized Converting Autoencoders transferring knowledge to a\nlightweight classifier network - also extracted from the baseline DNN. Our\nexperimental results demonstrate that EncodeNet improves the accuracy of VGG16\nfrom 92.64% to 94.05% on CIFAR-10 and RestNet20 from 74.56% to 76.04% on\nCIFAR-100. It outperforms state-of-the-art techniques that rely on knowledge\ndistillation and attention mechanisms, delivering higher accuracy for models of\ncomparable size.\n","authors":["Hasanul Mahmud","Kevin Desai","Palden Lama","Sushil K. 
Prasad"],"pdf_url":"https://arxiv.org/pdf/2404.13770v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2404.13767v1","updated":"2024-04-21T20:32:02Z","published":"2024-04-21T20:32:02Z","title":"Autonomous Robot for Disaster Mapping and Victim Localization","summary":" In response to the critical need for effective reconnaissance in disaster\nscenarios, this research article presents the design and implementation of a\ncomplete autonomous robot system using the Turtlebot3 with Robotic Operating\nSystem (ROS) Noetic. Upon deployment in closed, initially unknown environments,\nthe system aims to generate a comprehensive map and identify any present\n'victims' using AprilTags as stand-ins. We discuss our solution for search and\nrescue missions, while additionally exploring more advanced algorithms to\nimprove search and rescue functionalities. We introduce a Cubature Kalman\nFilter to help reduce the mean squared error [m] for AprilTag localization and\nan information-theoretic exploration algorithm to expedite exploration in\nunknown environments. Just like turtles, our system takes it slow and steady,\nbut when it's time to save the day, it moves at ninja-like speed! Despite\nDonatello's shell, he's no slowpoke - he zips through obstacles with the\nagility of a teenage mutant ninja turtle. So, hang on tight to your shells and\nget ready for a whirlwind of reconnaissance!\n Full pipeline code https://github.com/rzhao5659/MRProject/tree/main\n Exploration code https://github.com/rzhao5659/MRProject/tree/main\n","authors":["Michael Potter","Rahil Bhowal","Richard Zhao","Anuj Patel","Jingming Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.13767v1.pdf","comment":"Class final project for Northeastern University EECE 5550 Mobile\n Robotics Course"},{"id":"http://arxiv.org/abs/2403.20260v2","updated":"2024-04-21T20:29:17Z","published":"2024-03-29T16:08:59Z","title":"Prototype-based Interpretable Breast Cancer Prediction Models: Analysis\n and Challenges","summary":" Deep learning models have achieved high performance in medical applications,\nhowever, their adoption in clinical practice is hindered due to their black-box\nnature. Self-explainable models, like prototype-based models, can be especially\nbeneficial as they are interpretable by design. However, if the learnt\nprototypes are of low quality then the prototype-based models are as good as\nblack-box. Having high quality prototypes is a pre-requisite for a truly\ninterpretable model. In this work, we propose a prototype evaluation framework\nfor coherence (PEF-C) for quantitatively evaluating the quality of the\nprototypes based on domain knowledge. We show the use of PEF-C in the context\nof breast cancer prediction using mammography. Existing works on\nprototype-based models on breast cancer prediction using mammography have\nfocused on improving the classification performance of prototype-based models\ncompared to black-box models and have evaluated prototype quality through\nanecdotal evidence. We are the first to go beyond anecdotal evidence and\nevaluate the quality of the mammography prototypes systematically using our\nPEF-C. Specifically, we apply three state-of-the-art prototype-based models,\nProtoPNet, BRAIxProtoPNet++ and PIP-Net on mammography images for breast cancer\nprediction and evaluate these models w.r.t. i) classification performance, and\nii) quality of the prototypes, on three public datasets. 
Our results show that\nprototype-based models are competitive with black-box models in terms of\nclassification performance, and achieve a higher score in detecting ROIs.\nHowever, the quality of the prototypes are not yet sufficient and can be\nimproved in aspects of relevance, purity and learning a variety of prototypes.\nWe call the XAI community to systematically evaluate the quality of the\nprototypes to check their true usability in high stake decisions and improve\nsuch models further.\n","authors":["Shreyasi Pathak","Jörg Schlötterer","Jeroen Veltman","Jeroen Geerdink","Maurice van Keulen","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2403.20260v2.pdf","comment":"Accepted at World Conference on Explainable Artificial Intelligence;\n 21 pages, 5 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.13766v1","updated":"2024-04-21T20:26:46Z","published":"2024-04-21T20:26:46Z","title":"Object-Attribute Binding in Text-to-Image Generation: Evaluation and\n Control","summary":" Current diffusion models create photorealistic images given a text prompt as\ninput but struggle to correctly bind attributes mentioned in the text to the\nright objects in the image. This is evidenced by our novel image-graph\nalignment model called EPViT (Edge Prediction Vision Transformer) for the\nevaluation of image-text alignment. To alleviate the above problem, we propose\nfocused cross-attention (FCA) that controls the visual attention maps by\nsyntactic constraints found in the input sentence. Additionally, the syntax\nstructure of the prompt helps to disentangle the multimodal CLIP embeddings\nthat are commonly used in T2I generation. The resulting DisCLIP embeddings and\nFCA are easily integrated in state-of-the-art diffusion models without\nadditional training of these models. We show substantial improvements in T2I\ngeneration and especially its attribute-object binding on several\ndatasets.\\footnote{Code and data will be made available upon acceptance.\n","authors":["Maria Mihaela Trusca","Wolf Nuyts","Jonathan Thomm","Robert Honig","Thomas Hofmann","Tinne Tuytelaars","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2404.13766v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19966v2","updated":"2024-04-21T20:16:41Z","published":"2024-03-29T04:02:51Z","title":"Multi-task Magnetic Resonance Imaging Reconstruction using Meta-learning","summary":" Using single-task deep learning methods to reconstruct Magnetic Resonance\nImaging (MRI) data acquired with different imaging sequences is inherently\nchallenging. The trained deep learning model typically lacks generalizability,\nand the dissimilarity among image datasets with different types of contrast\nleads to suboptimal learning performance. This paper proposes a meta-learning\napproach to efficiently learn image features from multiple MR image datasets.\nOur algorithm can perform multi-task learning to simultaneously reconstruct MR\nimages acquired using different imaging sequences with different image\ncontrasts. 
The experiment results demonstrate the ability of our new\nmeta-learning reconstruction method to successfully reconstruct\nhighly-undersampled k-space data from multiple MRI datasets simultaneously,\noutperforming other compelling reconstruction methods previously developed for\nsingle-task learning.\n","authors":["Wanyu Bian","Albert Jang","Fang Liu"],"pdf_url":"https://arxiv.org/pdf/2403.19966v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.11720v2","updated":"2024-04-21T20:09:17Z","published":"2022-07-24T11:26:53Z","title":"Progressive Feature Learning for Realistic Cloth-Changing Gait\n Recognition","summary":" Gait recognition is instrumental in crime prevention and social security, for\nit can be conducted at a long distance to figure out the identity of persons.\nHowever, existing datasets and methods cannot satisfactorily deal with the most\nchallenging cloth-changing problem in practice. Specifically, the practical\ngait models are usually trained on automatically labeled data, in which the\nsequences' views and cloth conditions of each person have some restrictions. To\nbe concrete, the cross-view sub-dataset only has normal walking condition\nwithout cloth-changing, while the cross-cloth sub-dataset has cloth-changing\nsequences but only in front views. As a result, the cloth-changing accuracy\ncannot meet practical requirements. In this work, we formulate the problem as\nRealistic Cloth-Changing Gait Recognition (abbreviated as RCC-GR) and we\nconstruct two benchmarks: CASIA-BN-RCC and OUMVLP-RCC, to simulate the above\nsetting. Furthermore, we propose a new framework called Progressive Feature\nLearning that can be applied with off-the-shelf backbones to improve their\nperformance in RCC-GR. Specifically, in our framework, we design Progressive\nMapping and Progressive Uncertainty to extract cross-view features and then\nextract cross-cloth features on the basis. In this way, the feature from the\ncross-view sub-dataset can first dominate the feature space and relieve the\nuneven distribution caused by the adverse effect from the cross-cloth\nsub-dataset. The experiments on our benchmarks show that our framework can\neffectively improve recognition performance, especially in the cloth-changing\nconditions.\n","authors":["Xuqian Ren","Saihui Hou","Chunshui Cao","Xu Liu","Yongzhen Huang"],"pdf_url":"https://arxiv.org/pdf/2207.11720v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10772v2","updated":"2024-04-21T19:51:26Z","published":"2023-03-19T21:34:20Z","title":"Unsupervised Gait Recognition with Selective Fusion","summary":" Previous gait recognition methods primarily trained on labeled datasets,\nwhich require painful labeling effort. However, using a pre-trained model on a\nnew dataset without fine-tuning can lead to significant performance\ndegradation. So to make the pre-trained gait recognition model able to be\nfine-tuned on unlabeled datasets, we propose a new task: Unsupervised Gait\nRecognition (UGR). We introduce a new cluster-based baseline to solve UGR with\ncluster-level contrastive learning. But we further find more challenges this\ntask meets. First, sequences of the same person in different clothes tend to\ncluster separately due to the significant appearance changes. Second, sequences\ntaken from 0{\\deg} and 180{\\deg} views lack walking postures and do not cluster\nwith sequences taken from other views. 
To address these challenges, we propose\na Selective Fusion method, which includes Selective Cluster Fusion (SCF) and\nSelective Sample Fusion (SSF). With SCF, we merge matched clusters of the same\nperson wearing different clothes by updating the cluster-level memory bank with\na multi-cluster update strategy. And in SSF, we merge sequences taken from\nfront/back views gradually with curriculum learning. Extensive experiments show\nthe effectiveness of our method in improving the rank-1 accuracy in walking\nwith different coats condition and front/back views conditions.\n","authors":["Xuqian Ren","Shaopeng Yang","Saihui Hou","Chunshui Cao","Xu Liu","Yongzhen Huang"],"pdf_url":"https://arxiv.org/pdf/2303.10772v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13756v1","updated":"2024-04-21T19:42:28Z","published":"2024-04-21T19:42:28Z","title":"BC-MRI-SEG: A Breast Cancer MRI Tumor Segmentation Benchmark","summary":" Binary breast cancer tumor segmentation with Magnetic Resonance Imaging (MRI)\ndata is typically trained and evaluated on private medical data, which makes\ncomparing deep learning approaches difficult. We propose a benchmark\n(BC-MRI-SEG) for binary breast cancer tumor segmentation based on publicly\navailable MRI datasets. The benchmark consists of four datasets in total, where\ntwo datasets are used for supervised training and evaluation, and two are used\nfor zero-shot evaluation. Additionally we compare state-of-the-art (SOTA)\napproaches on our benchmark and provide an exhaustive list of available public\nbreast cancer MRI datasets. The source code has been made available at\nhttps://irulenot.github.io/BC_MRI_SEG_Benchmark.\n","authors":["Anthony Bilic","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.13756v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13745v1","updated":"2024-04-21T19:02:38Z","published":"2024-04-21T19:02:38Z","title":"A Nasal Cytology Dataset for Object Detection and Deep Learning","summary":" Nasal Cytology is a new and efficient clinical technique to diagnose rhinitis\nand allergies that is not much widespread due to the time-consuming nature of\ncell counting; that is why AI-aided counting could be a turning point for the\ndiffusion of this technique. In this article we present the first dataset of\nrhino-cytological field images: the NCD (Nasal Cytology Dataset), aimed to\ntrain and deploy Object Detection models to support physicians and biologists\nduring clinical practice. The real distribution of the cytotypes, populating\nthe nasal mucosa has been replicated, sampling images from slides of clinical\npatients, and manually annotating each cell found on them. The correspondent\nobject detection task presents non'trivial issues associated with the strong\nclass imbalancement, involving the rarest cell types. 
This work contributes to\nsome of open challenges by presenting a novel machine learning-based approach\nto aid the automated detection and classification of nasal mucosa cells: the\nDETR and YOLO models shown good performance in detecting cells and classifying\nthem correctly, revealing great potential to accelerate the work of rhinology\nexperts.\n","authors":["Mauro Camporeale","Giovanni Dimauro","Matteo Gelardi","Giorgia Iacobellis","Mattia Sebastiano Ladisa","Sergio Latrofa","Nunzia Lomonte"],"pdf_url":"https://arxiv.org/pdf/2404.13745v1.pdf","comment":"Pre Print almost ready to be submitted"},{"id":"http://arxiv.org/abs/2403.06098v2","updated":"2024-04-21T18:42:44Z","published":"2024-03-10T05:40:12Z","title":"VidProM: A Million-scale Real Prompt-Gallery Dataset for Text-to-Video\n Diffusion Models","summary":" The arrival of Sora marks a new era for text-to-video diffusion models,\nbringing significant advancements in video generation and potential\napplications. However, Sora, along with other text-to-video diffusion models,\nis highly reliant on prompts, and there is no publicly available dataset that\nfeatures a study of text-to-video prompts. In this paper, we introduce VidProM,\nthe first large-scale dataset comprising 1.67 Million unique text-to-Video\nPrompts from real users. Additionally, this dataset includes 6.69 million\nvideos generated by four state-of-the-art diffusion models, alongside some\nrelated data. We initially discuss the curation of this large-scale dataset, a\nprocess that is both time-consuming and costly. Subsequently, we underscore the\nneed for a new prompt dataset specifically designed for text-to-video\ngeneration by illustrating how VidProM differs from DiffusionDB, a large-scale\nprompt-gallery dataset for image generation. Our extensive and diverse dataset\nalso opens up many exciting new research areas. For instance, we suggest\nexploring text-to-video prompt engineering, efficient video generation, and\nvideo copy detection for diffusion models to develop better, more efficient,\nand safer models. The project (including the collected dataset VidProM and\nrelated code) is publicly available at https://vidprom.github.io under the\nCC-BY-NC 4.0 License.\n","authors":["Wenhao Wang","Yifan Sun","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2403.06098v2.pdf","comment":"The project (including the collected dataset VidProM and related\n code) is publicly available at https://vidprom.github.io under the CC-BY-NC\n 4.0 License"},{"id":"http://arxiv.org/abs/2404.14990v1","updated":"2024-04-21T18:32:08Z","published":"2024-04-21T18:32:08Z","title":"Interpreting COVID Lateral Flow Tests' Results with Foundation Models","summary":" Lateral flow tests (LFTs) enable rapid, low-cost testing for health\nconditions including Covid, pregnancy, HIV, and malaria. Automated readers of\nLFT results can yield many benefits including empowering blind people to\nindependently learn about their health and accelerating data entry for\nlarge-scale monitoring (e.g., for pandemics such as Covid) by using only a\nsingle photograph per LFT test. Accordingly, we explore the abilities of modern\nfoundation vision language models (VLMs) in interpreting such tests. To enable\nthis analysis, we first create a new labeled dataset with hierarchical\nsegmentations of each LFT test and its nested test result window. We call this\ndataset LFT-Grounding. Next, we benchmark eight modern VLMs in zero-shot\nsettings for analyzing these images. 
We demonstrate that current VLMs\nfrequently fail to correctly identify the type of LFT test, interpret the test\nresults, locate the nested result window of the LFT tests, and recognize LFT\ntests when they partially obfuscated. To facilitate community-wide progress\ntowards automated LFT reading, we publicly release our dataset at\nhttps://iamstuti.github.io/lft_grounding_foundation_models/.\n","authors":["Stuti Pandey","Josh Myers-Dean","Jarek Reynolds","Danna Gurari"],"pdf_url":"https://arxiv.org/pdf/2404.14990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13733v1","updated":"2024-04-21T18:19:27Z","published":"2024-04-21T18:19:27Z","title":"Elucidating the Design Space of Dataset Condensation","summary":" Dataset condensation, a concept within data-centric learning, efficiently\ntransfers critical attributes from an original dataset to a synthetic version,\nmaintaining both diversity and realism. This approach significantly improves\nmodel training efficiency and is adaptable across multiple application areas.\nPrevious methods in dataset condensation have faced challenges: some incur high\ncomputational costs which limit scalability to larger datasets (e.g., MTT,\nDREAM, and TESLA), while others are restricted to less optimal design spaces,\nwhich could hinder potential improvements, especially in smaller datasets\n(e.g., SRe2L, G-VBSM, and RDED). To address these limitations, we propose a\ncomprehensive design framework that includes specific, effective strategies\nlike implementing soft category-aware matching and adjusting the learning rate\nschedule. These strategies are grounded in empirical evidence and theoretical\nbacking. Our resulting approach, Elucidate Dataset Condensation (EDC),\nestablishes a benchmark for both small and large-scale dataset condensation. In\nour testing, EDC achieves state-of-the-art accuracy, reaching 48.6% on\nImageNet-1k with a ResNet-18 model at an IPC of 10, which corresponds to a\ncompression ratio of 0.78%. This performance exceeds those of SRe2L, G-VBSM,\nand RDED by margins of 27.3%, 17.2%, and 6.6%, respectively.\n","authors":["Shitong Shao","Zikai Zhou","Huanran Chen","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2404.13733v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13711v1","updated":"2024-04-21T16:45:35Z","published":"2024-04-21T16:45:35Z","title":"ArtNeRF: A Stylized Neural Field for 3D-Aware Cartoonized Face Synthesis","summary":" Recent advances in generative visual models and neural radiance fields have\ngreatly boosted 3D-aware image synthesis and stylization tasks. However,\nprevious NeRF-based work is limited to single scene stylization, training a\nmodel to generate 3D-aware cartoon faces with arbitrary styles remains\nunsolved. We propose ArtNeRF, a novel face stylization framework derived from\n3D-aware GAN to tackle this problem. In this framework, we utilize an\nexpressive generator to synthesize stylized faces and a triple-branch\ndiscriminator module to improve the visual quality and style consistency of the\ngenerated faces. Specifically, a style encoder based on contrastive learning is\nleveraged to extract robust low-dimensional embeddings of style images,\nempowering the generator with the knowledge of various styles. To smooth the\ntraining process of cross-domain transfer learning, we propose an adaptive\nstyle blending module which helps inject style information and allows users to\nfreely tune the level of stylization. 
We further introduce a neural rendering\nmodule to achieve efficient real-time rendering of images with higher\nresolutions. Extensive experiments demonstrate that ArtNeRF is versatile in\ngenerating high-quality 3D-aware cartoon faces with arbitrary styles.\n","authors":["Zichen Tang","Hongyu Yang"],"pdf_url":"https://arxiv.org/pdf/2404.13711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13710v1","updated":"2024-04-21T16:44:52Z","published":"2024-04-21T16:44:52Z","title":"SVGEditBench: A Benchmark Dataset for Quantitative Assessment of LLM's\n SVG Editing Capabilities","summary":" Text-to-image models have shown progress in recent years. Along with this\nprogress, generating vector graphics from text has also advanced. SVG is a\npopular format for vector graphics, and SVG represents a scene with XML text.\nTherefore, Large Language Models can directly process SVG code. Taking this\ninto account, we focused on editing SVG with LLMs. For quantitative evaluation\nof LLMs' ability to edit SVG, we propose SVGEditBench. SVGEditBench is a\nbenchmark for assessing the LLMs' ability to edit SVG code. We also show the\nGPT-4 and GPT-3.5 results when evaluated on the proposed benchmark. In the\nexperiments, GPT-4 showed superior performance to GPT-3.5 both quantitatively\nand qualitatively. The dataset is available at\nhttps://github.com/mti-lab/SVGEditBench.\n","authors":["Kunato Nishina","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2404.13710v1.pdf","comment":"Accepted to Workshop on Graphic Design Understanding and Generation\n (GDUG), a CVPR2024 workshop. Dataset: https://github.com/mti-lab/SVGEditBench"},{"id":"http://arxiv.org/abs/2310.18737v2","updated":"2024-04-21T16:43:36Z","published":"2023-10-28T15:42:07Z","title":"Pre-training with Random Orthogonal Projection Image Modeling","summary":" Masked Image Modeling (MIM) is a powerful self-supervised strategy for visual\npre-training without the use of labels. MIM applies random crops to input\nimages, processes them with an encoder, and then recovers the masked inputs\nwith a decoder, which encourages the network to capture and learn structural\ninformation about objects and scenes. The intermediate feature representations\nobtained from MIM are suitable for fine-tuning on downstream tasks. In this\npaper, we propose an Image Modeling framework based on random orthogonal\nprojection instead of binary masking as in MIM. Our proposed Random Orthogonal\nProjection Image Modeling (ROPIM) reduces spatially-wise token information\nunder guaranteed bound on the noise variance and can be considered as masking\nentire spatial image area under locally varying masking degrees. Since ROPIM\nuses a random subspace for the projection that realizes the masking step, the\nreadily available complement of the subspace can be used during unmasking to\npromote recovery of removed information. In this paper, we show that using\nrandom orthogonal projection leads to superior performance compared to\ncrop-based masking. We demonstrate state-of-the-art results on several popular\nbenchmarks.\n","authors":["Maryam Haghighat","Peyman Moghadam","Shaheer Mohamed","Piotr Koniusz"],"pdf_url":"https://arxiv.org/pdf/2310.18737v2.pdf","comment":"Published as a conference paper at the International Conference on\n Learning Representations (ICLR) 2024. 
19 pages"},{"id":"http://arxiv.org/abs/2404.13706v1","updated":"2024-04-21T16:35:16Z","published":"2024-04-21T16:35:16Z","title":"Concept Arithmetics for Circumventing Concept Inhibition in Diffusion\n Models","summary":" Motivated by ethical and legal concerns, the scientific community is actively\ndeveloping methods to limit the misuse of Text-to-Image diffusion models for\nreproducing copyrighted, violent, explicit, or personal information in the\ngenerated images. Simultaneously, researchers put these newly developed safety\nmeasures to the test by assuming the role of an adversary to find\nvulnerabilities and backdoors in them. We use compositional property of\ndiffusion models, which allows to leverage multiple prompts in a single image\ngeneration. This property allows us to combine other concepts, that should not\nhave been affected by the inhibition, to reconstruct the vector, responsible\nfor target concept generation, even though the direct computation of this\nvector is no longer accessible. We provide theoretical and empirical evidence\nwhy the proposed attacks are possible and discuss the implications of these\nfindings for safe model deployment. We argue that it is essential to consider\nall possible approaches to image generation with diffusion models that can be\nemployed by an adversary. Our work opens up the discussion about the\nimplications of concept arithmetics and compositional inference for safety\nmechanisms in diffusion models.\n Content Advisory: This paper contains discussions and model-generated content\nthat may be considered offensive. Reader discretion is advised.\n Project page: https://cs-people.bu.edu/vpetsiuk/arc\n","authors":["Vitali Petsiuk","Kate Saenko"],"pdf_url":"https://arxiv.org/pdf/2404.13706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13704v1","updated":"2024-04-21T16:29:49Z","published":"2024-04-21T16:29:49Z","title":"PEMMA: Parameter-Efficient Multi-Modal Adaptation for Medical Image\n Segmentation","summary":" Imaging modalities such as Computed Tomography (CT) and Positron Emission\nTomography (PET) are key in cancer detection, inspiring Deep Neural Networks\n(DNN) models that merge these scans for tumor segmentation. When both CT and\nPET scans are available, it is common to combine them as two channels of the\ninput to the segmentation model. However, this method requires both scan types\nduring training and inference, posing a challenge due to the limited\navailability of PET scans, thereby sometimes limiting the process to CT scans\nonly. Hence, there is a need to develop a flexible DNN architecture that can be\ntrained/updated using only CT scans but can effectively utilize PET scans when\nthey become available. In this work, we propose a parameter-efficient\nmulti-modal adaptation (PEMMA) framework for lightweight upgrading of a\ntransformer-based segmentation model trained only on CT scans to also\nincorporate PET scans. The benefits of the proposed approach are two-fold.\nFirstly, we leverage the inherent modularity of the transformer architecture\nand perform low-rank adaptation (LoRA) of the attention weights to achieve\nparameter-efficient adaptation. Secondly, since the PEMMA framework attempts to\nminimize cross modal entanglement, it is possible to subsequently update the\ncombined model using only one modality, without causing catastrophic forgetting\nof the other modality. 
Our proposed method achieves comparable results with the\nperformance of early fusion techniques with just 8% of the trainable\nparameters, especially with a remarkable +28% improvement on the average dice\nscore on PET scans when trained on a single modality.\n","authors":["Nada Saadi","Numan Saeed","Mohammad Yaqub","Karthik Nandakumar"],"pdf_url":"https://arxiv.org/pdf/2404.13704v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13701v1","updated":"2024-04-21T16:05:38Z","published":"2024-04-21T16:05:38Z","title":"Semantic-Rearrangement-Based Multi-Level Alignment for Domain\n Generalized Segmentation","summary":" Domain generalized semantic segmentation is an essential computer vision\ntask, for which models only leverage source data to learn the capability of\ngeneralized semantic segmentation towards the unseen target domains. Previous\nworks typically address this challenge by global style randomization or feature\nregularization. In this paper, we argue that given the observation that\ndifferent local semantic regions perform different visual characteristics from\nthe source domain to the target domain, methods focusing on global operations\nare hard to capture such regional discrepancies, thus failing to construct\ndomain-invariant representations with the consistency from local to global\nlevel. Therefore, we propose the Semantic-Rearrangement-based Multi-Level\nAlignment (SRMA) to overcome this problem. SRMA first incorporates a Semantic\nRearrangement Module (SRM), which conducts semantic region randomization to\nenhance the diversity of the source domain sufficiently. A Multi-Level\nAlignment module (MLA) is subsequently proposed with the help of such diversity\nto establish the global-regional-local consistent domain-invariant\nrepresentations. By aligning features across randomized samples with\ndomain-neutral knowledge at multiple levels, SRMA provides a more robust way to\nhandle the source-target domain gap. Extensive experiments demonstrate the\nsuperiority of SRMA over the current state-of-the-art works on various\nbenchmarks.\n","authors":["Guanlong Jiao","Chenyangguang Zhang","Haonan Yin","Yu Mo","Biqing Huang","Hui Pan","Yi Luo","Jingxian Liu"],"pdf_url":"https://arxiv.org/pdf/2404.13701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13693v1","updated":"2024-04-21T15:42:56Z","published":"2024-04-21T15:42:56Z","title":"PV-S3: Advancing Automatic Photovoltaic Defect Detection using\n Semi-Supervised Semantic Segmentation of Electroluminescence Images","summary":" Photovoltaic (PV) systems allow us to tap into all abundant solar energy,\nhowever they require regular maintenance for high efficiency and to prevent\ndegradation. Traditional manual health check, using Electroluminescence (EL)\nimaging, is expensive and logistically challenging making automated defect\ndetection essential. Current automation approaches require extensive manual\nexpert labeling, which is time-consuming, expensive, and prone to errors. We\npropose PV-S3 (Photovoltaic-Semi Supervised Segmentation), a Semi-Supervised\nLearning approach for semantic segmentation of defects in EL images that\nreduces reliance on extensive labeling. PV-S3 is a Deep learning model trained\nusing a few labeled images along with numerous unlabeled images. We introduce a\nnovel Semi Cross-Entropy loss function to train PV-S3 which addresses the\nchallenges specific to automated PV defect detection, such as diverse defect\ntypes and class imbalance. 
We evaluate PV-S3 on multiple datasets and\ndemonstrate its effectiveness and adaptability. With merely 20% labeled\nsamples, we achieve an absolute improvement of 9.7% in IoU, 29.9% in Precision,\n12.75% in Recall, and 20.42% in F1-Score over prior state-of-the-art supervised\nmethod (which uses 100% labeled samples) on UCF-EL dataset (largest dataset\navailable for semantic segmentation of EL images) showing improvement in\nperformance while reducing the annotation costs by 80%.\n","authors":["Abhishek Jha","Yogesh Rawat","Shruti Vyas"],"pdf_url":"https://arxiv.org/pdf/2404.13693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13692v1","updated":"2024-04-21T15:40:41Z","published":"2024-04-21T15:40:41Z","title":"A sustainable development perspective on urban-scale roof greening\n priorities and benefits","summary":" Greenspaces are tightly linked to human well-being. Yet, rapid urbanization\nhas exacerbated greenspace exposure inequality and declining human life\nquality. Roof greening has been recognized as an effective strategy to mitigate\nthese negative impacts. Understanding priorities and benefits is crucial to\npromoting green roofs. Here, using geospatial big data, we conduct an\nurban-scale assessment of roof greening at a single building level in Hong Kong\nfrom a sustainable development perspective. We identify that 85.3\\% of\nbuildings reveal potential and urgent demand for roof greening. We further find\ngreen roofs could increase greenspace exposure by \\textasciitilde61\\% and\nproduce hundreds of millions (HK\\$) in economic benefits annually but play a\nsmall role in urban heat mitigation (\\textasciitilde0.15\\degree{C}) and annual\ncarbon emission offsets (\\textasciitilde0.8\\%). Our study offers a\ncomprehensive assessment of roof greening, which could provide reference for\nsustainable development in cities worldwide, from data utilization to solutions\nand findings.\n","authors":["Jie Shao","Wei Yao","Lei Luo","Linzhou Zeng","Zhiyi He","Puzuo Wang","Huadong Guo"],"pdf_url":"https://arxiv.org/pdf/2404.13692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13691v1","updated":"2024-04-21T15:40:32Z","published":"2024-04-21T15:40:32Z","title":"A Complete System for Automated 3D Semantic-Geometric Mapping of\n Corrosion in Industrial Environments","summary":" Corrosion, a naturally occurring process leading to the deterioration of\nmetallic materials, demands diligent detection for quality control and the\npreservation of metal-based objects, especially within industrial contexts.\nTraditional techniques for corrosion identification, including ultrasonic\ntesting, radio-graphic testing, and magnetic flux leakage, necessitate the\ndeployment of expensive and bulky equipment on-site for effective data\nacquisition. An unexplored alternative involves employing lightweight,\nconventional camera systems, and state-of-the-art computer vision methods for\nits identification.\n In this work, we propose a complete system for semi-automated corrosion\nidentification and mapping in industrial environments. We leverage recent\nadvances in LiDAR-based methods for localization and mapping, with vision-based\nsemantic segmentation deep learning techniques, in order to build\nsemantic-geometric maps of industrial environments. 
Unlike previous corrosion\nidentification systems available in the literature, our designed multi-modal\nsystem is low-cost, portable, semi-autonomous and allows collecting large\ndatasets by untrained personnel.\n A set of experiments in an indoor laboratory environment, demonstrate\nquantitatively the high accuracy of the employed LiDAR based 3D mapping and\nlocalization system, with less then $0.05m$ and 0.02m average absolute and\nrelative pose errors. Also, our data-driven semantic segmentation model,\nachieves around 70\\% precision when trained with our pixel-wise manually\nannotated dataset.\n","authors":["Rui Pimentel de Figueiredo","Stefan Nordborg Eriksen","Ignacio Rodriguez","Simon Bøgh"],"pdf_url":"https://arxiv.org/pdf/2404.13691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02439v3","updated":"2024-04-21T15:20:15Z","published":"2023-12-05T02:41:57Z","title":"Let's Think Outside the Box: Exploring Leap-of-Thought in Large Language\n Models with Creative Humor Generation","summary":" Chain-of-Thought (CoT) guides large language models (LLMs) to reason\nstep-by-step, and can motivate their logical reasoning ability. While effective\nfor logical tasks, CoT is not conducive to creative problem-solving which often\nrequires out-of-box thoughts and is crucial for innovation advancements. In\nthis paper, we explore the Leap-of-Thought (LoT) abilities within LLMs -- a\nnon-sequential, creative paradigm involving strong associations and knowledge\nleaps. To this end, we study LLMs on the popular Oogiri game which needs\nparticipants to have good creativity and strong associative thinking for\nresponding unexpectedly and humorously to the given image, text, or both, and\nthus is suitable for LoT study. Then to investigate LLMs' LoT ability in the\nOogiri game, we first build a multimodal and multilingual Oogiri-GO dataset\nwhich contains over 130,000 samples from the Oogiri game, and observe the\ninsufficient LoT ability or failures of most existing LLMs on the Oogiri game.\nAccordingly, we introduce a creative Leap-of-Thought (CLoT) paradigm to improve\nLLM's LoT ability. CLoT first formulates the Oogiri-GO dataset into\nLoT-oriented instruction tuning data to train pretrained LLM for achieving\ncertain LoT humor generation and discrimination abilities. Then CLoT designs an\nexplorative self-refinement that encourages the LLM to generate more creative\nLoT data via exploring parallels between seemingly unrelated concepts and\nselects high-quality data to train itself for self-refinement. CLoT not only\nexcels in humor generation in the Oogiri game but also boosts creative\nabilities in various tasks like cloud guessing game and divergent association\ntask. These findings advance our understanding and offer a pathway to improve\nLLMs' creative capacities for innovative applications across domains. 
The\ndataset, code, and models will be released online.\nhttps://zhongshsh.github.io/CLoT/.\n","authors":["Shanshan Zhong","Zhongzhan Huang","Shanghua Gao","Wushao Wen","Liang Lin","Marinka Zitnik","Pan Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.02439v3.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2404.13686v1","updated":"2024-04-21T15:16:05Z","published":"2024-04-21T15:16:05Z","title":"Hyper-SD: Trajectory Segmented Consistency Model for Efficient Image\n Synthesis","summary":" Recently, a series of diffusion-aware distillation algorithms have emerged to\nalleviate the computational overhead associated with the multi-step inference\nprocess of Diffusion Models (DMs). Current distillation techniques often\ndichotomize into two distinct aspects: i) ODE Trajectory Preservation; and ii)\nODE Trajectory Reformulation. However, these approaches suffer from severe\nperformance degradation or domain shifts. To address these limitations, we\npropose Hyper-SD, a novel framework that synergistically amalgamates the\nadvantages of ODE Trajectory Preservation and Reformulation, while maintaining\nnear-lossless performance during step compression. Firstly, we introduce\nTrajectory Segmented Consistency Distillation to progressively perform\nconsistent distillation within pre-defined time-step segments, which\nfacilitates the preservation of the original ODE trajectory from a higher-order\nperspective. Secondly, we incorporate human feedback learning to boost the\nperformance of the model in a low-step regime and mitigate the performance loss\nincurred by the distillation process. Thirdly, we integrate score distillation\nto further improve the low-step generation capability of the model and offer\nthe first attempt to leverage a unified LoRA to support the inference process\nat all steps. Extensive experiments and user studies demonstrate that Hyper-SD\nachieves SOTA performance from 1 to 8 inference steps for both SDXL and SD1.5.\nFor example, Hyper-SDXL surpasses SDXL-Lightning by +0.68 in CLIP Score and\n+0.51 in Aes Score in the 1-step inference.\n","authors":["Yuxi Ren","Xin Xia","Yanzuo Lu","Jiacheng Zhang","Jie Wu","Pan Xie","Xing Wang","Xuefeng Xiao"],"pdf_url":"https://arxiv.org/pdf/2404.13686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13680v1","updated":"2024-04-21T14:43:31Z","published":"2024-04-21T14:43:31Z","title":"PoseAnimate: Zero-shot high fidelity pose controllable character\n animation","summary":" Image-to-video(I2V) generation aims to create a video sequence from a single\nimage, which requires high temporal coherence and visual fidelity with the\nsource image.However, existing approaches suffer from character appearance\ninconsistency and poor preservation of fine details. 
Moreover, they require a\nlarge amount of video data for training, which can be computationally\ndemanding.To address these limitations,we propose PoseAnimate, a novel\nzero-shot I2V framework for character animation.PoseAnimate contains three key\ncomponents: 1) Pose-Aware Control Module (PACM) incorporates diverse pose\nsignals into conditional embeddings, to preserve character-independent content\nand maintain precise alignment of actions.2) Dual Consistency Attention Module\n(DCAM) enhances temporal consistency, and retains character identity and\nintricate background details.3) Mask-Guided Decoupling Module (MGDM) refines\ndistinct feature perception, improving animation fidelity by decoupling the\ncharacter and background.We also propose a Pose Alignment Transition Algorithm\n(PATA) to ensure smooth action transition.Extensive experiment results\ndemonstrate that our approach outperforms the state-of-the-art training-based\nmethods in terms of character consistency and detail fidelity. Moreover, it\nmaintains a high level of temporal coherence throughout the generated\nanimations.\n","authors":["Bingwen Zhu","Fanyi Wang","Tianyi Lu","Peng Liu","Jingwen Su","Jinxiu Liu","Yanhao Zhang","Zuxuan Wu","Yu-Gang Jiang","Guo-Jun Qi"],"pdf_url":"https://arxiv.org/pdf/2404.13680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13679v1","updated":"2024-04-21T14:42:10Z","published":"2024-04-21T14:42:10Z","title":"GScream: Learning 3D Geometry and Feature Consistent Gaussian Splatting\n for Object Removal","summary":" This paper tackles the intricate challenge of object removal to update the\nradiance field using the 3D Gaussian Splatting. The main challenges of this\ntask lie in the preservation of geometric consistency and the maintenance of\ntexture coherence in the presence of the substantial discrete nature of\nGaussian primitives. We introduce a robust framework specifically designed to\novercome these obstacles. The key insight of our approach is the enhancement of\ninformation exchange among visible and invisible areas, facilitating content\nrestoration in terms of both geometry and texture. Our methodology begins with\noptimizing the positioning of Gaussian primitives to improve geometric\nconsistency across both removed and visible areas, guided by an online\nregistration process informed by monocular depth estimation. Following this, we\nemploy a novel feature propagation mechanism to bolster texture coherence,\nleveraging a cross-attention design that bridges sampling Gaussians from both\nuncertain and certain areas. This innovative approach significantly refines the\ntexture coherence within the final radiance field. Extensive experiments\nvalidate that our method not only elevates the quality of novel view synthesis\nfor scenes undergoing object removal but also showcases notable efficiency\ngains in training and rendering speeds.\n","authors":["Yuxin Wang","Qianyi Wu","Guofeng Zhang","Dan Xu"],"pdf_url":"https://arxiv.org/pdf/2404.13679v1.pdf","comment":"Project Page: https://w-ted.github.io/publications/gscream"},{"id":"http://arxiv.org/abs/2404.13671v1","updated":"2024-04-21T14:22:04Z","published":"2024-04-21T14:22:04Z","title":"FiLo: Zero-Shot Anomaly Detection by Fine-Grained Description and\n High-Quality Localization","summary":" Zero-shot anomaly detection (ZSAD) methods entail detecting anomalies\ndirectly without access to any known normal or abnormal samples within the\ntarget item categories. 
Existing approaches typically rely on the robust\ngeneralization capabilities of multimodal pretrained models, computing\nsimilarities between manually crafted textual features representing \"normal\" or\n\"abnormal\" semantics and image features to detect anomalies and localize\nanomalous patches. However, the generic descriptions of \"abnormal\" often fail\nto precisely match diverse types of anomalies across different object\ncategories. Additionally, computing feature similarities for single patches\nstruggles to pinpoint specific locations of anomalies with various sizes and\nscales. To address these issues, we propose a novel ZSAD method called FiLo,\ncomprising two components: adaptively learned Fine-Grained Description (FG-Des)\nand position-enhanced High-Quality Localization (HQ-Loc). FG-Des introduces\nfine-grained anomaly descriptions for each category using Large Language Models\n(LLMs) and employs adaptively learned textual templates to enhance the accuracy\nand interpretability of anomaly detection. HQ-Loc, utilizing Grounding DINO for\npreliminary localization, position-enhanced text prompts, and Multi-scale\nMulti-shape Cross-modal Interaction (MMCI) module, facilitates more accurate\nlocalization of anomalies of different sizes and shapes. Experimental results\non datasets like MVTec and VisA demonstrate that FiLo significantly improves\nthe performance of ZSAD in both detection and localization, achieving\nstate-of-the-art performance with an image-level AUC of 83.9% and a pixel-level\nAUC of 95.9% on the VisA dataset.\n","authors":["Zhaopeng Gu","Bingke Zhu","Guibo Zhu","Yingying Chen","Hao Li","Ming Tang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.13671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13667v1","updated":"2024-04-21T14:03:34Z","published":"2024-04-21T14:03:34Z","title":"MathNet: A Data-Centric Approach for Printed Mathematical Expression\n Recognition","summary":" Printed mathematical expression recognition (MER) models are usually trained\nand tested using LaTeX-generated mathematical expressions (MEs) as input and\nthe LaTeX source code as ground truth. As the same ME can be generated by\nvarious different LaTeX source codes, this leads to unwanted variations in the\nground truth data that bias test performance results and hinder efficient\nlearning. In addition, the use of only one font to generate the MEs heavily\nlimits the generalization of the reported results to realistic scenarios. We\npropose a data-centric approach to overcome this problem, and present\nconvincing experimental results: Our main contribution is an enhanced LaTeX\nnormalization to map any LaTeX ME to a canonical form. Based on this process,\nwe developed an improved version of the benchmark dataset im2latex-100k,\nfeaturing 30 fonts instead of one. Second, we introduce the real-world dataset\nrealFormula, with MEs extracted from papers. Third, we developed a MER model,\nMathNet, based on a convolutional vision transformer, with superior results on\nall four test sets (im2latex-100k, im2latexv2, realFormula, and InftyMDB-1),\noutperforming the previous state of the art by up to 88.3%.\n","authors":["Felix M. Schmitt-Koopmann","Elaine M. 
Huang","Hans-Peter Hutter","Thilo Stadelmann","Alireza Darvishy"],"pdf_url":"https://arxiv.org/pdf/2404.13667v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.13659v1","updated":"2024-04-21T13:29:42Z","published":"2024-04-21T13:29:42Z","title":"LMFNet: An Efficient Multimodal Fusion Approach for Semantic\n Segmentation in High-Resolution Remote Sensing","summary":" Despite the rapid evolution of semantic segmentation for land cover\nclassification in high-resolution remote sensing imagery, integrating multiple\ndata modalities such as Digital Surface Model (DSM), RGB, and Near-infrared\n(NIR) remains a challenge. Current methods often process only two types of\ndata, missing out on the rich information that additional modalities can\nprovide. Addressing this gap, we propose a novel \\textbf{L}ightweight\n\\textbf{M}ultimodal data \\textbf{F}usion \\textbf{Net}work (LMFNet) to\naccomplish the tasks of fusion and semantic segmentation of multimodal remote\nsensing images. LMFNet uniquely accommodates various data types simultaneously,\nincluding RGB, NirRG, and DSM, through a weight-sharing, multi-branch vision\ntransformer that minimizes parameter count while ensuring robust feature\nextraction. Our proposed multimodal fusion module integrates a\n\\textit{Multimodal Feature Fusion Reconstruction Layer} and \\textit{Multimodal\nFeature Self-Attention Fusion Layer}, which can reconstruct and fuse multimodal\nfeatures. Extensive testing on public datasets such as US3D, ISPRS Potsdam, and\nISPRS Vaihingen demonstrates the effectiveness of LMFNet. Specifically, it\nachieves a mean Intersection over Union ($mIoU$) of 85.09\\% on the US3D\ndataset, marking a significant improvement over existing methods. Compared to\nunimodal approaches, LMFNet shows a 10\\% enhancement in $mIoU$ with only a 0.5M\nincrease in parameter count. Furthermore, against bimodal methods, our approach\nwith trilateral inputs enhances $mIoU$ by 0.46 percentage points.\n","authors":["Tong Wang","Guanzhou Chen","Xiaodong Zhang","Chenxi Liu","Xiaoliang Tan","Jiaqi Wang","Chanjuan He","Wenlin Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.13659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13657v1","updated":"2024-04-21T13:25:46Z","published":"2024-04-21T13:25:46Z","title":"MLP: Motion Label Prior for Temporal Sentence Localization in Untrimmed\n 3D Human Motions","summary":" In this paper, we address the unexplored question of temporal sentence\nlocalization in human motions (TSLM), aiming to locate a target moment from a\n3D human motion that semantically corresponds to a text query. Considering that\n3D human motions are captured using specialized motion capture devices, motions\nwith only a few joints lack complex scene information like objects and\nlighting. Due to this character, motion data has low contextual richness and\nsemantic ambiguity between frames, which limits the accuracy of predictions\nmade by current video localization frameworks extended to TSLM to only a rough\nlevel. To refine this, we devise two novel label-prior-assisted training\nschemes: one embed prior knowledge of foreground and background to highlight\nthe localization chances of target moments, and the other forces the originally\nrough predictions to overlap with the more accurate predictions obtained from\nthe flipped start/end prior label sequences during recovery training. We show\nthat injecting label-prior knowledge into the model is crucial for improving\nperformance at high IoU. 
In our constructed TSLM benchmark, our model termed\nMLP achieves a recall of 44.13 at IoU@0.7 on the BABEL dataset and 71.17 on\nHumanML3D (Restore), outperforming prior works. Finally, we showcase the\npotential of our approach in corpus-level moment retrieval. Our source code is\nopenly accessible at https://github.com/eanson023/mlp.\n","authors":["Sheng Yan","Mengyuan Liu","Yong Wang","Yang Liu","Chen Chen","Hong Liu"],"pdf_url":"https://arxiv.org/pdf/2404.13657v1.pdf","comment":"13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.13648v1","updated":"2024-04-21T12:50:38Z","published":"2024-04-21T12:50:38Z","title":"Data-independent Module-aware Pruning for Hierarchical Vision\n Transformers","summary":" Hierarchical vision transformers (ViTs) have two advantages over conventional\nViTs. First, hierarchical ViTs achieve linear computational complexity with\nrespect to image size by local self-attention. Second, hierarchical ViTs create\nhierarchical feature maps by merging image patches in deeper layers for dense\nprediction. However, existing pruning methods ignore the unique properties of\nhierarchical ViTs and use the magnitude value as the weight importance. This\napproach leads to two main drawbacks. First, the \"local\" attention weights are\ncompared at a \"global\" level, which may cause some \"locally\" important weights\nto be pruned due to their relatively small magnitude \"globally\". The second\nissue with magnitude pruning is that it fails to consider the distinct weight\ndistributions of the network, which are essential for extracting coarse to\nfine-grained features at various hierarchical levels.\n To solve the aforementioned issues, we have developed a Data-independent\nModule-Aware Pruning method (DIMAP) to compress hierarchical ViTs. To ensure\nthat \"local\" attention weights at different hierarchical levels are compared\nfairly in terms of their contribution, we treat them as a module and examine\ntheir contribution by analyzing their information distortion. Furthermore, we\nintroduce a novel weight metric that is solely based on weights and does not\nrequire input images, thereby eliminating the dependence on the patch merging\nprocess. Our method validates its usefulness and strengths on Swin Transformers\nof different sizes on ImageNet-1k classification. Notably, the top-5 accuracy\ndrop is only 0.07% when we remove 52.5% FLOPs and 52.7% parameters of Swin-B.\nWhen we reduce 33.2% FLOPs and 33.2% parameters of Swin-S, we can even achieve\na 0.8% higher relative top-5 accuracy than the original model. Code is\navailable at: https://github.com/he-y/Data-independent-Module-Aware-Pruning\n","authors":["Yang He","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.13648v1.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2404.13640v1","updated":"2024-04-21T12:33:07Z","published":"2024-04-21T12:33:07Z","title":"Beyond Alignment: Blind Video Face Restoration via Parsing-Guided\n Temporal-Coherent Transformer","summary":" Multiple complex degradations are coupled in low-quality video faces in the\nreal world. Therefore, blind video face restoration is a highly challenging\nill-posed problem, requiring not only hallucinating high-fidelity details but\nalso enhancing temporal coherence across diverse pose variations. 
Restoring\neach frame independently in a naive manner inevitably introduces temporal\nincoherence and artifacts from pose changes and keypoint localization errors.\nTo address this, we propose the first blind video face restoration approach\nwith a novel parsing-guided temporal-coherent transformer (PGTFormer) without\npre-alignment. PGTFormer leverages semantic parsing guidance to select optimal\nface priors for generating temporally coherent artifact-free results.\nSpecifically, we pre-train a temporal-spatial vector quantized auto-encoder on\nhigh-quality video face datasets to extract expressive context-rich priors.\nThen, the temporal parse-guided codebook predictor (TPCP) restores faces in\ndifferent poses based on face parsing context cues without performing face\npre-alignment. This strategy reduces artifacts and mitigates jitter caused by\ncumulative errors from face pre-alignment. Finally, the temporal fidelity\nregulator (TFR) enhances fidelity through temporal feature interaction and\nimproves video temporal consistency. Extensive experiments on face videos show\nthat our method outperforms previous face restoration baselines. The code will\nbe released on\n\\href{https://github.com/kepengxu/PGTFormer}{https://github.com/kepengxu/PGTFormer}.\n","authors":["Kepeng Xu","Li Xu","Gang He","Wenxin Yu","Yunsong Li"],"pdf_url":"https://arxiv.org/pdf/2404.13640v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2404.13621v1","updated":"2024-04-21T11:21:27Z","published":"2024-04-21T11:21:27Z","title":"Attack on Scene Flow using Point Clouds","summary":" Deep neural networks have made significant advancements in accurately\nestimating scene flow using point clouds, which is vital for many applications\nlike video analysis, action recognition, and navigation. Robustness of these\ntechniques, however, remains a concern, particularly in the face of adversarial\nattacks that have been proven to deceive state-of-the-art deep neural networks\nin many domains. Surprisingly, the robustness of scene flow networks against\nsuch attacks has not been thoroughly investigated. To address this problem, the\nproposed approach aims to bridge this gap by introducing adversarial white-box\nattacks specifically tailored for scene flow networks. Experimental results\nshow that the generated adversarial examples obtain up to 33.7 relative\ndegradation in average end-point error on the KITTI and FlyingThings3D\ndatasets. The study also reveals the significant impact that attacks targeting\npoint clouds in only one dimension or color channel have on average end-point\nerror. Analyzing the success and failure of these attacks on the scene flow\nnetworks and their 2D optical flow network variants show a higher vulnerability\nfor the optical flow networks.\n","authors":["Haniyeh Ehsani Oskouie","Mohammad-Shahram Moin","Shohreh Kasaei"],"pdf_url":"https://arxiv.org/pdf/2404.13621v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13611v1","updated":"2024-04-21T10:41:04Z","published":"2024-04-21T10:41:04Z","title":"Video sentence grounding with temporally global textual knowledge","summary":" Temporal sentence grounding involves the retrieval of a video moment with a\nnatural language query. Many existing works directly incorporate the given\nvideo and temporally localized query for temporal grounding, overlooking the\ninherent domain gap between different modalities. 
In this paper, we utilize\npseudo-query features containing extensive temporally global textual knowledge\nsourced from the same video-query pair, to enhance the bridging of domain gaps\nand attain a heightened level of similarity between multi-modal features.\nSpecifically, we propose a Pseudo-query Intermediary Network (PIN) to achieve\nan improved alignment of visual and comprehensive pseudo-query features within\nthe feature space through contrastive learning. Subsequently, we utilize\nlearnable prompts to encapsulate the knowledge of pseudo-queries, propagating\nthem into the textual encoder and multi-modal fusion module, further enhancing\nthe feature alignment between visual and language for better temporal\ngrounding. Extensive experiments conducted on the Charades-STA and\nActivityNet-Captions datasets demonstrate the effectiveness of our method.\n","authors":["Cai Chen","Runzhong Zhang","Jianjun Gao","Kejun Wu","Kim-Hui Yap","Yi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.13611v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13605v1","updated":"2024-04-21T10:28:34Z","published":"2024-04-21T10:28:34Z","title":"Turb-Seg-Res: A Segment-then-Restore Pipeline for Dynamic Videos with\n Atmospheric Turbulence","summary":" Tackling image degradation due to atmospheric turbulence, particularly in\ndynamic environment, remains a challenge for long-range imaging systems.\nExisting techniques have been primarily designed for static scenes or scenes\nwith small motion. This paper presents the first segment-then-restore pipeline\nfor restoring the videos of dynamic scenes in turbulent environment. We\nleverage mean optical flow with an unsupervised motion segmentation method to\nseparate dynamic and static scene components prior to restoration. After camera\nshake compensation and segmentation, we introduce foreground/background\nenhancement leveraging the statistics of turbulence strength and a transformer\nmodel trained on a novel noise-based procedural turbulence generator for fast\ndataset augmentation. Benchmarked against existing restoration methods, our\napproach restores most of the geometric distortion and enhances sharpness for\nvideos. We make our code, simulator, and data publicly available to advance the\nfield of video restoration from turbulence: riponcs.github.io/TurbSegRes\n","authors":["Ripon Kumar Saha","Dehao Qin","Nianyi Li","Jinwei Ye","Suren Jayasuriya"],"pdf_url":"https://arxiv.org/pdf/2404.13605v1.pdf","comment":"CVPR 2024 Paper"},{"id":"http://arxiv.org/abs/2404.08544v2","updated":"2024-04-21T10:24:45Z","published":"2024-04-12T15:37:53Z","title":"Analyzing Decades-Long Environmental Changes in Namibia Using Archival\n Aerial Photography and Deep Learning","summary":" This study explores object detection in historical aerial photographs of\nNamibia to identify long-term environmental changes. Specifically, we aim to\nidentify key objects -- Waterholes, Omuti homesteads, and Big trees -- around\nOshikango in Namibia using sub-meter gray-scale aerial imagery from 1943 and\n1972. In this work, we propose a workflow for analyzing historical aerial\nimagery using a deep semantic segmentation model on sparse hand-labels. To this\nend, we employ a number of strategies including class-weighting,\npseudo-labeling and empirical p-value-based filtering to balance skewed and\nsparse representations of objects in the ground truth data. 
Results demonstrate\nthe benefits of these different training strategies resulting in an average\n$F_1=0.661$ and $F_1=0.755$ over the three objects of interest for the 1943 and\n1972 imagery, respectively. We also identified that the average size of\nWaterhole and Big trees increased while the average size of Omuti homesteads\ndecreased between 1943 and 1972 reflecting some of the local effects of the\nmassive post-Second World War economic, agricultural, demographic, and\nenvironmental changes. This work also highlights the untapped potential of\nhistorical aerial photographs in understanding long-term environmental changes\nbeyond Namibia (and Africa). With the lack of adequate satellite technology in\nthe past, archival aerial photography offers a great alternative to uncover\ndecades-long environmental changes.\n","authors":["Girmaw Abebe Tadesse","Caleb Robinson","Gilles Quentin Hacheme","Akram Zaytar","Rahul Dodhia","Tsering Wangyal Shawa","Juan M. Lavista Ferres","Emmanuel H. Kreike"],"pdf_url":"https://arxiv.org/pdf/2404.08544v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06629v3","updated":"2024-04-21T10:05:06Z","published":"2023-10-10T13:48:18Z","title":"EViT: An Eagle Vision Transformer with Bi-Fovea Self-Attention","summary":" Thanks to the advancement of deep learning technology, vision transformers\nhas demonstrated competitive performance in various computer vision tasks.\nUnfortunately, vision transformers still faces some challenges such as high\ncomputational complexity and absence of desirable inductive bias. To alleviate\nthese issues, we propose a novel Bi-Fovea Self-Attention (BFSA) inspired by the\nphysiological structure and visual properties of eagle eyes. This BFSA is used\nto simulate the shallow and deep fovea of eagle vision, prompting the network\nto learn the feature representation of targets from coarse to fine.\nAdditionally, we design a Bionic Eagle Vision (BEV) block based on BFSA. It\ncombines the advantages of convolution and introduces a novel Bi-Fovea\nFeedforward Network (BFFN) to mimic the working way of biological visual cortex\nprocesses information in hierarchically and parallel. Furthermore, we develop a\nunified and efficient pyramid backbone network family called Eagle Vision\nTransformers (EViTs) by stacking BEV blocks. Experimental results show that\nEViTs exhibit highly competitive performance in various computer vision tasks\nsuch as image classification, object detection and semantic segmentation.\nEspecially in terms of performance and computational efficiency, EViTs show\nsignificant advantages compared with other counterparts. Code is available at\nhttps://github.com/nkusyl/EViT\n","authors":["Yulong Shi","Mingwei Sun","Yongshuai Wang","Jiahao Ma","Zengqiang Chen"],"pdf_url":"https://arxiv.org/pdf/2310.06629v3.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2402.09181v2","updated":"2024-04-21T09:51:58Z","published":"2024-02-14T13:51:56Z","title":"OmniMedVQA: A New Large-Scale Comprehensive Evaluation Benchmark for\n Medical LVLM","summary":" Large Vision-Language Models (LVLMs) have demonstrated remarkable\ncapabilities in various multimodal tasks. However, their potential in the\nmedical domain remains largely unexplored. 
A significant challenge arises from\nthe scarcity of diverse medical images spanning various modalities and\nanatomical regions, which is essential in real-world medical applications. To\nsolve this problem, in this paper, we introduce OmniMedVQA, a novel\ncomprehensive medical Visual Question Answering (VQA) benchmark. This benchmark\nis collected from 73 different medical datasets, including 12 different\nmodalities and covering more than 20 distinct anatomical regions. Importantly,\nall images in this benchmark are sourced from authentic medical scenarios,\nensuring alignment with the requirements of the medical field and suitability\nfor evaluating LVLMs. Through our extensive experiments, we have found that\nexisting LVLMs struggle to address these medical VQA problems effectively.\nMoreover, what surprises us is that medical-specialized LVLMs even exhibit\ninferior performance to those general-domain models, calling for a more\nversatile and robust LVLM in the biomedical field. The evaluation results not\nonly reveal the current limitations of LVLM in understanding real medical\nimages but also highlight our dataset's significance. Our code with dataset are\navailable at https://github.com/OpenGVLab/Multi-Modality-Arena.\n","authors":["Yutao Hu","Tianbin Li","Quanfeng Lu","Wenqi Shao","Junjun He","Yu Qiao","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2402.09181v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13594v1","updated":"2024-04-21T09:23:36Z","published":"2024-04-21T09:23:36Z","title":"Lost in Space: Probing Fine-grained Spatial Understanding in Vision and\n Language Resamplers","summary":" An effective method for combining frozen large language models (LLM) and\nvisual encoders involves a resampler module that creates a `visual prompt'\nwhich is provided to the LLM, along with the textual prompt. While this\napproach has enabled impressive performance across many coarse-grained tasks\nlike image captioning and visual question answering, more fine-grained tasks\nthat require spatial understanding have not been thoroughly examined. In this\npaper, we use \\textit{diagnostic classifiers} to measure the extent to which\nthe visual prompt produced by the resampler encodes spatial information. Our\nresults show that this information is largely absent from the resampler output\nwhen kept frozen during training of the classifiers. However, when the\nresampler and classifier are trained jointly, we observe a significant\nperformance boost. This shows that the compression achieved by the resamplers\ncan in principle encode the requisite spatial information, but that more\nobject-aware objectives are needed at the pretraining stage to facilitate this\ncapability\n","authors":["Georgios Pantazopoulos","Alessandro Suglia","Oliver Lemon","Arash Eshghi"],"pdf_url":"https://arxiv.org/pdf/2404.13594v1.pdf","comment":"NAACL 2024"},{"id":"http://arxiv.org/abs/2404.13591v1","updated":"2024-04-21T09:15:02Z","published":"2024-04-21T09:15:02Z","title":"MARVEL: Multidimensional Abstraction and Reasoning through Visual\n Evaluation and Learning","summary":" While multi-modal large language models (MLLMs) have shown significant\nprogress on many popular visual reasoning benchmarks, whether they possess\nabstract visual reasoning abilities remains an open question. 
Similar to the\nSudoku puzzles, abstract visual reasoning (AVR) problems require finding\nhigh-level patterns (e.g., repetition constraints) that control the input\nshapes (e.g., digits) in a specific task configuration (e.g., matrix). However,\nexisting AVR benchmarks only considered a limited set of patterns (addition,\nconjunction), input shapes (rectangle, square), and task configurations (3 by 3\nmatrices). To evaluate MLLMs' reasoning abilities comprehensively, we introduce\nMARVEL, a multidimensional AVR benchmark with 770 puzzles composed of six core\nknowledge patterns, geometric and abstract shapes, and five different task\nconfigurations. To inspect whether the model accuracy is grounded in perception\nand reasoning, MARVEL complements the general AVR question with perception\nquestions in a hierarchical evaluation framework. We conduct comprehensive\nexperiments on MARVEL with nine representative MLLMs in zero-shot and few-shot\nsettings. Our experiments reveal that all models show near-random performance\non the AVR question, with significant performance gaps (40%) compared to humans\nacross all patterns and task configurations. Further analysis of perception\nquestions reveals that MLLMs struggle to comprehend the visual features\n(near-random performance) and even count the panels in the puzzle ( <45%),\nhindering their ability for abstract reasoning. We release our entire code and\ndataset.\n","authors":["Yifan Jiang","Jiarui Zhang","Kexuan Sun","Zhivar Sourati","Kian Ahrabian","Kaixin Ma","Filip Ilievski","Jay Pujara"],"pdf_url":"https://arxiv.org/pdf/2404.13591v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14593v2","updated":"2024-04-21T09:02:36Z","published":"2023-07-27T02:36:13Z","title":"FakeTracer: Catching Face-swap DeepFakes via Implanting Traces in\n Training","summary":" Face-swap DeepFake is an emerging AI-based face forgery technique that can\nreplace the original face in a video with a generated face of the target\nidentity while retaining consistent facial attributes such as expression and\norientation. Due to the high privacy of faces, the misuse of this technique can\nraise severe social concerns, drawing tremendous attention to defend against\nDeepFakes recently. In this paper, we describe a new proactive defense method\ncalled FakeTracer to expose face-swap DeepFakes via implanting traces in\ntraining. Compared to general face-synthesis DeepFake, the face-swap DeepFake\nis more complex as it involves identity change, is subjected to the\nencoding-decoding process, and is trained unsupervised, increasing the\ndifficulty of implanting traces into the training phase. To effectively defend\nagainst face-swap DeepFake, we design two types of traces, sustainable trace\n(STrace) and erasable trace (ETrace), to be added to training faces. During the\ntraining, these manipulated faces affect the learning of the face-swap DeepFake\nmodel, enabling it to generate faces that only contain sustainable traces. In\nlight of these two traces, our method can effectively expose DeepFakes by\nidentifying them. 
Extensive experiments corroborate the efficacy of our method\non defending against face-swap DeepFake.\n","authors":["Pu Sun","Honggang Qi","Yuezun Li","Siwei Lyu"],"pdf_url":"https://arxiv.org/pdf/2307.14593v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13584v1","updated":"2024-04-21T08:52:22Z","published":"2024-04-21T08:52:22Z","title":"Rethink Arbitrary Style Transfer with Transformer and Contrastive\n Learning","summary":" Arbitrary style transfer holds widespread attention in research and boasts\nnumerous practical applications. The existing methods, which either employ\ncross-attention to incorporate deep style attributes into content attributes or\nuse adaptive normalization to adjust content features, fail to generate\nhigh-quality stylized images. In this paper, we introduce an innovative\ntechnique to improve the quality of stylized images. Firstly, we propose Style\nConsistency Instance Normalization (SCIN), a method to refine the alignment\nbetween content and style features. In addition, we have developed an\nInstance-based Contrastive Learning (ICL) approach designed to understand the\nrelationships among various styles, thereby enhancing the quality of the\nresulting stylized images. Recognizing that VGG networks are more adept at\nextracting classification features and need to be better suited for capturing\nstyle features, we have also introduced the Perception Encoder (PE) to capture\nstyle features. Extensive experiments demonstrate that our proposed method\ngenerates high-quality stylized images and effectively prevents artifacts\ncompared with the existing state-of-the-art methods.\n","authors":["Zhanjie Zhang","Jiakai Sun","Guangyuan Li","Lei Zhao","Quanwei Zhang","Zehua Lan","Haolin Yin","Wei Xing","Huaizhong Lin","Zhiwen Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.13584v1.pdf","comment":"Accepted by CVIU"},{"id":"http://arxiv.org/abs/2404.13579v1","updated":"2024-04-21T08:37:43Z","published":"2024-04-21T08:37:43Z","title":"LTOS: Layout-controllable Text-Object Synthesis via Adaptive\n Cross-attention Fusions","summary":" Controllable text-to-image generation synthesizes visual text and objects in\nimages with certain conditions, which are frequently applied to emoji and\nposter generation. Visual text rendering and layout-to-image generation tasks\nhave been popular in controllable text-to-image generation. However, each of\nthese tasks typically focuses on single modality generation or rendering,\nleaving yet-to-be-bridged gaps between the approaches correspondingly designed\nfor each of the tasks. In this paper, we combine text rendering and\nlayout-to-image generation tasks into a single task: layout-controllable\ntext-object synthesis (LTOS) task, aiming at synthesizing images with object\nand visual text based on predefined object layout and text contents. As\ncompliant datasets are not readily available for our LTOS task, we construct a\nlayout-aware text-object synthesis dataset, containing elaborate well-aligned\nlabels of visual text and object information. Based on the dataset, we propose\na layout-controllable text-object adaptive fusion (TOF) framework, which\ngenerates images with clear, legible visual text and plausible objects. We\nconstruct a visual-text rendering module to synthesize text and employ an\nobject-layout control module to generate objects while integrating the two\nmodules to harmoniously generate and integrate text content and objects in\nimages. 
To better the image-text integration, we propose a self-adaptive\ncross-attention fusion module that helps the image generation to attend more to\nimportant text information. Within such a fusion module, we use a self-adaptive\nlearnable factor to learn to flexibly control the influence of cross-attention\noutputs on image generation. Experimental results show that our method\noutperforms the state-of-the-art in LTOS, text rendering, and layout-to-image\ntasks, enabling harmonious visual text rendering and object generation.\n","authors":["Xiaoran Zhao","Tianhao Wu","Yu Lai","Zhiliang Tian","Zhen Huang","Yahui Liu","Zejiang He","Dongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.13579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12322v2","updated":"2024-04-21T08:37:09Z","published":"2024-04-18T16:53:08Z","title":"Generalizable Face Landmarking Guided by Conditional Face Warping","summary":" As a significant step for human face modeling, editing, and generation, face\nlandmarking aims at extracting facial keypoints from images. A generalizable\nface landmarker is required in practice because real-world facial images, e.g.,\nthe avatars in animations and games, are often stylized in various ways.\nHowever, achieving generalizable face landmarking is challenging due to the\ndiversity of facial styles and the scarcity of labeled stylized faces. In this\nstudy, we propose a simple but effective paradigm to learn a generalizable face\nlandmarker based on labeled real human faces and unlabeled stylized faces. Our\nmethod learns the face landmarker as the key module of a conditional face\nwarper. Given a pair of real and stylized facial images, the conditional face\nwarper predicts a warping field from the real face to the stylized one, in\nwhich the face landmarker predicts the ending points of the warping field and\nprovides us with high-quality pseudo landmarks for the corresponding stylized\nfacial images. Applying an alternating optimization strategy, we learn the face\nlandmarker to minimize $i)$ the discrepancy between the stylized faces and the\nwarped real ones and $ii)$ the prediction errors of both real and pseudo\nlandmarks. Experiments on various datasets show that our method outperforms\nexisting state-of-the-art domain adaptation methods in face landmarking tasks,\nleading to a face landmarker with better generalizability. Code is available at\nhttps://plustwo0.github.io/project-face-landmarker.\n","authors":["Jiayi Liang","Haotian Liu","Hongteng Xu","Dixin Luo"],"pdf_url":"https://arxiv.org/pdf/2404.12322v2.pdf","comment":"Accepted in CVPR 2024"},{"id":"http://arxiv.org/abs/2404.13576v1","updated":"2024-04-21T08:28:52Z","published":"2024-04-21T08:28:52Z","title":"I2CANSAY:Inter-Class Analogical Augmentation and Intra-Class\n Significance Analysis for Non-Exemplar Online Task-Free Continual Learning","summary":" Online task-free continual learning (OTFCL) is a more challenging variant of\ncontinual learning which emphasizes the gradual shift of task boundaries and\nlearns in an online mode. Existing methods rely on a memory buffer composed of\nold samples to prevent forgetting. However,the use of memory buffers not only\nraises privacy concerns but also hinders the efficient learning of new samples.\nTo address this problem, we propose a novel framework called I2CANSAY that gets\nrid of the dependence on memory buffers and efficiently learns the knowledge of\nnew data from one-shot samples. Concretely, our framework comprises two main\nmodules. 
Firstly, the Inter-Class Analogical Augmentation (ICAN) module\ngenerates diverse pseudo-features for old classes based on the inter-class\nanalogy of feature distributions for different new classes, serving as a\nsubstitute for the memory buffer. Secondly, the Intra-Class Significance\nAnalysis (ISAY) module analyzes the significance of attributes for each class\nvia its distribution standard deviation, and generates the importance vector as\na correction bias for the linear classifier, thereby enhancing the capability\nof learning from new samples. We run our experiments on four popular image\nclassification datasets: CoRe50, CIFAR-10, CIFAR-100, and CUB-200, our approach\noutperforms the prior state-of-the-art by a large margin.\n","authors":["Songlin Dong","Yingjie Chen","Yuhang He","Yuhan Jin","Alex C. Kot","Yihong Gong"],"pdf_url":"https://arxiv.org/pdf/2404.13576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13573v1","updated":"2024-04-21T08:27:20Z","published":"2024-04-21T08:27:20Z","title":"Exploring AIGC Video Quality: A Focus on Visual Harmony, Video-Text\n Consistency and Domain Distribution Gap","summary":" The recent advancements in Text-to-Video Artificial Intelligence Generated\nContent (AIGC) have been remarkable. Compared with traditional videos, the\nassessment of AIGC videos encounters various challenges: visual inconsistency\nthat defy common sense, discrepancies between content and the textual prompt,\nand distribution gap between various generative models, etc. Target at these\nchallenges, in this work, we categorize the assessment of AIGC video quality\ninto three dimensions: visual harmony, video-text consistency, and domain\ndistribution gap. For each dimension, we design specific modules to provide a\ncomprehensive quality assessment of AIGC videos. Furthermore, our research\nidentifies significant variations in visual quality, fluidity, and style among\nvideos generated by different text-to-video models. Predicting the source\ngenerative model can make the AIGC video features more discriminative, which\nenhances the quality assessment performance. The proposed method was used in\nthe third-place winner of the NTIRE 2024 Quality Assessment for AI-Generated\nContent - Track 2 Video, demonstrating its effectiveness.\n","authors":["Bowen Qu","Xiaoyu Liang","Shangkun Sun","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2404.13573v1.pdf","comment":"9 pages, 3 figures, 3 tables. Accepted by CVPR2024 Workshop (3rd\n place of NTIRE2024 Quality Assessment for AI-Generated Content - Track 2\n Video)"},{"id":"http://arxiv.org/abs/2404.15190v1","updated":"2024-04-21T08:10:20Z","published":"2024-04-21T08:10:20Z","title":"Socratic Planner: Inquiry-Based Zero-Shot Planning for Embodied\n Instruction Following","summary":" Embodied Instruction Following (EIF) is the task of executing natural\nlanguage instructions by navigating and interacting with objects in 3D\nenvironments. One of the primary challenges in EIF is compositional task\nplanning, which is often addressed with supervised or in-context learning with\nlabeled data. To this end, we introduce the Socratic Planner, the first\nzero-shot planning method that infers without the need for any training data.\nSocratic Planner first decomposes the instructions into substructural\ninformation of the task through self-questioning and answering, translating it\ninto a high-level plan, i.e., a sequence of subgoals. 
Subgoals are executed\nsequentially, with our visually grounded re-planning mechanism adjusting plans\ndynamically through a dense visual feedback. We also introduce an evaluation\nmetric of high-level plans, RelaxedHLP, for a more comprehensive evaluation.\nExperiments demonstrate the effectiveness of the Socratic Planner, achieving\ncompetitive performance on both zero-shot and few-shot task planning in the\nALFRED benchmark, particularly excelling in tasks requiring higher-dimensional\ninference. Additionally, a precise adjustments in the plan were achieved by\nincorporating environmental visual information.\n","authors":["Suyeon Shin","Sujin jeon","Junghyun Kim","Gi-Cheon Kang","Byoung-Tak Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.15190v1.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.13565v1","updated":"2024-04-21T07:34:44Z","published":"2024-04-21T07:34:44Z","title":"Exploring Diverse Methods in Visual Question Answering","summary":" This study explores innovative methods for improving Visual Question\nAnswering (VQA) using Generative Adversarial Networks (GANs), autoencoders, and\nattention mechanisms. Leveraging a balanced VQA dataset, we investigate three\ndistinct strategies. Firstly, GAN-based approaches aim to generate answer\nembeddings conditioned on image and question inputs, showing potential but\nstruggling with more complex tasks. Secondly, autoencoder-based techniques\nfocus on learning optimal embeddings for questions and images, achieving\ncomparable results with GAN due to better ability on complex questions. Lastly,\nattention mechanisms, incorporating Multimodal Compact Bilinear pooling (MCB),\naddress language priors and attention modeling, albeit with a\ncomplexity-performance trade-off. This study underscores the challenges and\nopportunities in VQA and suggests avenues for future research, including\nalternative GAN formulations and attentional mechanisms.\n","authors":["Panfeng Li","Qikai Yang","Xieming Geng","Wenjing Zhou","Zhicheng Ding","Yi Nian"],"pdf_url":"https://arxiv.org/pdf/2404.13565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13564v1","updated":"2024-04-21T07:26:09Z","published":"2024-04-21T07:26:09Z","title":"Masked Latent Transformer with the Random Masking Ratio to Advance the\n Diagnosis of Dental Fluorosis","summary":" Dental fluorosis is a chronic disease caused by long-term overconsumption of\nfluoride, which leads to changes in the appearance of tooth enamel. It is an\nimportant basis for early non-invasive diagnosis of endemic fluorosis. However,\neven dental professionals may not be able to accurately distinguish dental\nfluorosis and its severity based on tooth images. Currently, there is still a\ngap in research on applying deep learning to diagnosing dental fluorosis.\nTherefore, we construct the first open-source dental fluorosis image dataset\n(DFID), laying the foundation for deep learning research in this field. To\nadvance the diagnosis of dental fluorosis, we propose a pioneering deep\nlearning model called masked latent transformer with the random masking ratio\n(MLTrMR). MLTrMR introduces a mask latent modeling scheme based on Vision\nTransformer to enhance contextual learning of dental fluorosis lesion\ncharacteristics. 
Consisting of a latent embedder, encoder, and decoder, MLTrMR\nemploys the latent embedder to extract latent tokens from the original image,\nwhereas the encoder and decoder comprising the latent transformer (LT) block\nare used to process unmasked tokens and predict masked tokens, respectively. To\nmitigate the lack of inductive bias in Vision Transformer, which may result in\nperformance degradation, the LT block introduces latent tokens to enhance the\nlearning capacity of latent lesion features. Furthermore, we design an\nauxiliary loss function to constrain the parameter update direction of the\nmodel. MLTrMR achieves 80.19% accuracy, 75.79% F1, and 81.28% quadratic\nweighted kappa on DFID, making it state-of-the-art (SOTA).\n","authors":["Yun Wu","Hao Xu","Maohua Gu","Zhongchuan Jiang","Jun Xu","Youliang Tian"],"pdf_url":"https://arxiv.org/pdf/2404.13564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05175v2","updated":"2024-04-21T07:15:01Z","published":"2023-06-08T13:14:35Z","title":"Large-scale Dataset Pruning with Dynamic Uncertainty","summary":" The state of the art of many learning tasks, e.g., image classification, is\nadvanced by collecting larger datasets and then training larger models on them.\nAs the outcome, the increasing computational cost is becoming unaffordable. In\nthis paper, we investigate how to prune the large-scale datasets, and thus\nproduce an informative subset for training sophisticated deep models with\nnegligible performance drop. We propose a simple yet effective dataset pruning\nmethod by exploring both the prediction uncertainty and training dynamics. We\nstudy dataset pruning by measuring the variation of predictions during the\nwhole training process on large-scale datasets, i.e., ImageNet-1K and\nImageNet-21K, and advanced models, i.e., Swin Transformer and ConvNeXt.\nExtensive experimental results indicate that our method outperforms the state\nof the art and achieves 25% lossless pruning ratio on both ImageNet-1K and\nImageNet-21K. The code and pruned datasets are available at\nhttps://github.com/BAAI-DCAI/Dataset-Pruning.\n","authors":["Muyang He","Shuo Yang","Tiejun Huang","Bo Zhao"],"pdf_url":"https://arxiv.org/pdf/2306.05175v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09069v2","updated":"2024-04-21T07:08:15Z","published":"2023-12-14T16:04:34Z","title":"PI3D: Efficient Text-to-3D Generation with Pseudo-Image Diffusion","summary":" Diffusion models trained on large-scale text-image datasets have demonstrated\na strong capability of controllable high-quality image generation from\narbitrary text prompts. However, the generation quality and generalization\nability of 3D diffusion models is hindered by the scarcity of high-quality and\nlarge-scale 3D datasets. In this paper, we present PI3D, a framework that fully\nleverages the pre-trained text-to-image diffusion models' ability to generate\nhigh-quality 3D shapes from text prompts in minutes. The core idea is to\nconnect the 2D and 3D domains by representing a 3D shape as a set of Pseudo RGB\nImages. We fine-tune an existing text-to-image diffusion model to produce such\npseudo-images using a small number of text-3D pairs. Surprisingly, we find that\nit can already generate meaningful and consistent 3D shapes given complex text\ndescriptions. We further take the generated shapes as the starting point for a\nlightweight iterative refinement using score distillation sampling to achieve\nhigh-quality generation under a low budget. 
PI3D generates a single 3D shape\nfrom text in only 3 minutes and the quality is validated to outperform existing\n3D generative models by a large margin.\n","authors":["Ying-Tian Liu","Yuan-Chen Guo","Guan Luo","Heyi Sun","Wei Yin","Song-Hai Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.09069v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.13555v1","updated":"2024-04-21T07:03:48Z","published":"2024-04-21T07:03:48Z","title":"Cell Phone Image-Based Persian Rice Detection and Classification Using\n Deep Learning Techniques","summary":" This study introduces an innovative approach to classifying various types of\nPersian rice using image-based deep learning techniques, highlighting the\npractical application of everyday technology in food categorization.\nRecognizing the diversity of Persian rice and its culinary significance, we\nleveraged the capabilities of convolutional neural networks (CNNs),\nspecifically by fine-tuning a ResNet model for accurate identification of\ndifferent rice varieties and employing a U-Net architecture for precise\nsegmentation of rice grains in bulk images. This dual-methodology framework\nallows for both individual grain classification and comprehensive analysis of\nbulk rice samples, addressing two crucial aspects of rice quality assessment.\nUtilizing images captured with consumer-grade cell phones reflects a realistic\nscenario in which individuals can leverage this technology for assistance with\ngrocery shopping and meal preparation. The dataset, comprising various rice\ntypes photographed under natural conditions without professional lighting or\nequipment, presents a challenging yet practical classification problem. Our\nfindings demonstrate the feasibility of using non-professional images for food\nclassification and the potential of deep learning models, like ResNet and\nU-Net, to adapt to the nuances of everyday objects and textures. This study\ncontributes to the field by providing insights into the applicability of\nimage-based deep learning in daily life, specifically for enhancing consumer\nexperiences and knowledge in food selection. Furthermore, it opens avenues for\nextending this approach to other food categories and practical applications,\nemphasizing the role of accessible technology in bridging the gap between\nsophisticated computational methods and everyday tasks.\n","authors":["Mahmood Saeedi kelishami","Amin Saeidi Kelishami","Sajjad Saeedi Kelishami"],"pdf_url":"https://arxiv.org/pdf/2404.13555v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.04727v2","updated":"2024-04-21T06:53:31Z","published":"2024-01-09T18:58:40Z","title":"Revisiting Adversarial Training at Scale","summary":" The machine learning community has witnessed a drastic change in the training\npipeline, pivoted by those ''foundation models'' with unprecedented scales.\nHowever, the field of adversarial training is lagging behind, predominantly\ncentered around small model sizes like ResNet-50, and tiny and low-resolution\ndatasets like CIFAR-10. To bridge this transformation gap, this paper provides\na modern re-examination with adversarial training, investigating its potential\nbenefits when applied at scale. Additionally, we introduce an efficient and\neffective training strategy to enable adversarial training with giant models\nand web-scale data at an affordable computing cost. 
We denote this newly\nintroduced framework as AdvXL.\n Empirical results demonstrate that AdvXL establishes new state-of-the-art\nrobust accuracy records under AutoAttack on ImageNet-1K. For example, by\ntraining on DataComp-1B dataset, our AdvXL empowers a vanilla ViT-g model to\nsubstantially surpass the previous records of $l_{\\infty}$-, $l_{2}$-, and\n$l_{1}$-robust accuracy by margins of 11.4%, 14.2% and 12.9%, respectively.\nThis achievement posits AdvXL as a pioneering approach, charting a new\ntrajectory for the efficient training of robust visual representations at\nsignificantly larger scales. Our code is available at\nhttps://github.com/UCSC-VLAA/AdvXL.\n","authors":["Zeyu Wang","Xianhang Li","Hongru Zhu","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2401.04727v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.11947v2","updated":"2024-04-21T06:36:08Z","published":"2024-04-18T06:59:40Z","title":"VCC-INFUSE: Towards Accurate and Efficient Selection of Unlabeled\n Examples in Semi-supervised Learning","summary":" Despite the progress of Semi-supervised Learning (SSL), existing methods fail\nto utilize unlabeled data effectively and efficiently. Many pseudo-label-based\nmethods select unlabeled examples based on inaccurate confidence scores from\nthe classifier. Most prior work also uses all available unlabeled data without\npruning, making it difficult to handle large amounts of unlabeled data. To\naddress these issues, we propose two methods: Variational Confidence\nCalibration (VCC) and Influence-Function-based Unlabeled Sample Elimination\n(INFUSE). VCC is an universal plugin for SSL confidence calibration, using a\nvariational autoencoder to select more accurate pseudo labels based on three\ntypes of consistency scores. INFUSE is a data pruning method that constructs a\ncore dataset of unlabeled examples under SSL. Our methods are effective in\nmultiple datasets and settings, reducing classification errors rates and saving\ntraining time. Together, VCC-INFUSE reduces the error rate of FlexMatch on the\nCIFAR-100 dataset by 1.08% while saving nearly half of the training time.\n","authors":["Shijie Fang","Qianhan Feng","Tong Lin"],"pdf_url":"https://arxiv.org/pdf/2404.11947v2.pdf","comment":"Accepted paper of IJCAI 2024. Shijie Fang and Qianhan Feng\n contributed equally to this paper. New version, some problems and typos are\n fixed"},{"id":"http://arxiv.org/abs/2404.13550v1","updated":"2024-04-21T06:31:29Z","published":"2024-04-21T06:31:29Z","title":"Pointsoup: High-Performance and Extremely Low-Decoding-Latency Learned\n Geometry Codec for Large-Scale Point Cloud Scenes","summary":" Despite considerable progress being achieved in point cloud geometry\ncompression, there still remains a challenge in effectively compressing\nlarge-scale scenes with sparse surfaces. Another key challenge lies in reducing\ndecoding latency, a crucial requirement in real-world application. In this\npaper, we propose Pointsoup, an efficient learning-based geometry codec that\nattains high-performance and extremely low-decoding-latency simultaneously.\nInspired by conventional Trisoup codec, a point model-based strategy is devised\nto characterize local surfaces. Specifically, skin features are embedded from\nlocal windows via an attention-based encoder, and dilated windows are\nintroduced as cross-scale priors to infer the distribution of quantized\nfeatures in parallel. 
During decoding, features undergo fast refinement,\nfollowed by a folding-based point generator that reconstructs point coordinates\nwith fairly fast speed. Experiments show that Pointsoup achieves\nstate-of-the-art performance on multiple benchmarks with significantly lower\ndecoding complexity, i.e., up to 90$\\sim$160$\\times$ faster than the G-PCCv23\nTrisoup decoder on a comparatively low-end platform (e.g., one RTX 2080Ti).\nFurthermore, it offers variable-rate control with a single neural model\n(2.9MB), which is attractive for industrial practitioners.\n","authors":["Kang You","Kai Liu","Li Yu","Pan Gao","Dandan Ding"],"pdf_url":"https://arxiv.org/pdf/2404.13550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13541v1","updated":"2024-04-21T05:39:44Z","published":"2024-04-21T05:39:44Z","title":"Generalizable Novel-View Synthesis using a Stereo Camera","summary":" In this paper, we propose the first generalizable view synthesis approach\nthat specifically targets multi-view stereo-camera images. Since recent stereo\nmatching has demonstrated accurate geometry prediction, we introduce stereo\nmatching into novel-view synthesis for high-quality geometry reconstruction. To\nthis end, this paper proposes a novel framework, dubbed StereoNeRF, which\nintegrates stereo matching into a NeRF-based generalizable view synthesis\napproach. StereoNeRF is equipped with three key components to effectively\nexploit stereo matching in novel-view synthesis: a stereo feature extractor, a\ndepth-guided plane-sweeping, and a stereo depth loss. Moreover, we propose the\nStereoNVS dataset, the first multi-view dataset of stereo-camera images,\nencompassing a wide variety of both real and synthetic scenes. Our experimental\nresults demonstrate that StereoNeRF surpasses previous approaches in\ngeneralizable view synthesis.\n","authors":["Haechan Lee","Wonjoon Jin","Seung-Hwan Baek","Sunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2404.13541v1.pdf","comment":"Accepted to CVPR 2024. Project page URL:\n https://jinwonjoon.github.io/stereonerf/"},{"id":"http://arxiv.org/abs/2404.13537v1","updated":"2024-04-21T05:11:37Z","published":"2024-04-21T05:11:37Z","title":"Bracketing Image Restoration and Enhancement with High-Low Frequency\n Decomposition","summary":" In real-world scenarios, due to a series of image degradations, obtaining\nhigh-quality, clear content photos is challenging. While significant progress\nhas been made in synthesizing high-quality images, previous methods for image\nrestoration and enhancement often overlooked the characteristics of different\ndegradations. They applied the same structure to address various types of\ndegradation, resulting in less-than-ideal restoration outcomes. Inspired by the\nnotion that high/low frequency information is applicable to different\ndegradations, we introduce HLNet, a Bracketing Image Restoration and\nEnhancement method based on high-low frequency decomposition. Specifically, we\nemploy two modules for feature extraction: shared weight modules and non-shared\nweight modules. In the shared weight modules, we use SCConv to extract common\nfeatures from different degradations. In the non-shared weight modules, we\nintroduce the High-Low Frequency Decomposition Block (HLFDB), which employs\ndifferent methods to handle high-low frequency information, enabling the model\nto address different degradations more effectively. 
Compared to other networks,\nour method takes into account the characteristics of different degradations,\nthus achieving higher-quality image restoration.\n","authors":["Genggeng Chen","Kexin Dai","Kangzhen Yang","Tao Hu","Xiangyu Chen","Yongqing Yang","Wei Dong","Peng Wu","Yanning Zhang","Qingsen Yan"],"pdf_url":"https://arxiv.org/pdf/2404.13537v1.pdf","comment":"This paper is accepted by CVPR 2024 Workshop"},{"id":"http://arxiv.org/abs/2404.13534v1","updated":"2024-04-21T05:09:56Z","published":"2024-04-21T05:09:56Z","title":"Motion-aware Latent Diffusion Models for Video Frame Interpolation","summary":" With the advancement of AIGC, video frame interpolation (VFI) has become a\ncrucial component in existing video generation frameworks, attracting\nwidespread research interest. For the VFI task, the motion estimation between\nneighboring frames plays a crucial role in avoiding motion ambiguity. However,\nexisting VFI methods always struggle to accurately predict the motion\ninformation between consecutive frames, and this imprecise estimation leads to\nblurred and visually incoherent interpolated frames. In this paper, we propose\na novel diffusion framework, motion-aware latent diffusion models (MADiff),\nwhich is specifically designed for the VFI task. By incorporating motion priors\nbetween the conditional neighboring frames with the target interpolated frame\npredicted throughout the diffusion sampling procedure, MADiff progressively\nrefines the intermediate outcomes, culminating in generating both visually\nsmooth and realistic results. Extensive experiments conducted on benchmark\ndatasets demonstrate that our method achieves state-of-the-art performance\nsignificantly outperforming existing approaches, especially under challenging\nscenarios involving dynamic textures with complex motion.\n","authors":["Zhilin Huang","Yijie Yu","Ling Yang","Chujun Qin","Bing Zheng","Xiawu Zheng","Zikun Zhou","Yaowei Wang","Wenming Yang"],"pdf_url":"https://arxiv.org/pdf/2404.13534v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.09508 by\n other authors"},{"id":"http://arxiv.org/abs/2404.13530v1","updated":"2024-04-21T04:55:13Z","published":"2024-04-21T04:55:13Z","title":"Listen Then See: Video Alignment with Speaker Attention","summary":" Video-based Question Answering (Video QA) is a challenging task and becomes\neven more intricate when addressing Socially Intelligent Question Answering\n(SIQA). SIQA requires context understanding, temporal reasoning, and the\nintegration of multimodal information, but in addition, it requires processing\nnuanced human behavior. Furthermore, the complexities involved are exacerbated\nby the dominance of the primary modality (text) over the others. Thus, there is\na need to help the task's secondary modalities to work in tandem with the\nprimary modality. In this work, we introduce a cross-modal alignment and\nsubsequent representation fusion approach that achieves state-of-the-art\nresults (82.06\\% accuracy) on the Social IQ 2.0 dataset for SIQA. Our approach\nexhibits an improved ability to leverage the video modality by using the audio\nmodality as a bridge with the language modality. 
This leads to enhanced\nperformance by reducing the prevalent issue of language overfitting and\nresultant video modality bypassing encountered by current existing techniques.\nOur code and models are publicly available at\nhttps://github.com/sts-vlcc/sts-vlcc\n","authors":["Aviral Agrawal","Carlos Mateo Samudio Lezcano","Iqui Balam Heredia-Marin","Prabhdeep Singh Sethi"],"pdf_url":"https://arxiv.org/pdf/2404.13530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13521v1","updated":"2024-04-21T04:06:09Z","published":"2024-04-21T04:06:09Z","title":"Graph4GUI: Graph Neural Networks for Representing Graphical User\n Interfaces","summary":" Present-day graphical user interfaces (GUIs) exhibit diverse arrangements of\ntext, graphics, and interactive elements such as buttons and menus, but\nrepresentations of GUIs have not kept up. They do not encapsulate both semantic\nand visuo-spatial relationships among elements. To seize machine learning's\npotential for GUIs more efficiently, Graph4GUI exploits graph neural networks\nto capture individual elements' properties and their semantic-visuo-spatial\nconstraints in a layout. The learned representation demonstrated its\neffectiveness in multiple tasks, especially generating designs in a challenging\nGUI autocompletion task, which involved predicting the positions of remaining\nunplaced elements in a partially completed GUI. The new model's suggestions\nshowed alignment and visual appeal superior to the baseline method and received\nhigher subjective ratings for preference. Furthermore, we demonstrate the\npractical benefits and efficiency advantages designers perceive when utilizing\nour model as an autocompletion plug-in.\n","authors":["Yue Jiang","Changkong Zhou","Vikas Garg","Antti Oulasvirta"],"pdf_url":"https://arxiv.org/pdf/2404.13521v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2309.04891v2","updated":"2024-04-21T03:42:47Z","published":"2023-09-09T23:03:50Z","title":"How to Evaluate Semantic Communications for Images with ViTScore Metric?","summary":" Semantic communications (SC) have been expected to be a new paradigm shifting\nto catalyze the next generation communication, whose main concerns shift from\naccurate bit transmission to effective semantic information exchange in\ncommunications. However, the previous and widely-used metrics for images are\nnot applicable to evaluate the image semantic similarity in SC. Classical\nmetrics to measure the similarity between two images usually rely on the pixel\nlevel or the structural level, such as the PSNR and the MS-SSIM.\nStraightforwardly using some tailored metrics based on deep-learning methods in\nCV community, such as the LPIPS, is infeasible for SC. To tackle this, inspired\nby BERTScore in NLP community, we propose a novel metric for evaluating image\nsemantic similarity, named Vision Transformer Score (ViTScore). We prove\ntheoretically that ViTScore has 3 important properties, including symmetry,\nboundedness, and normalization, which make ViTScore convenient and intuitive\nfor image measurement. To evaluate the performance of ViTScore, we compare\nViTScore with 3 typical metrics (PSNR, MS-SSIM, and LPIPS) through 4 classes of\nexperiments: (i) correlation with BERTScore through evaluation of image caption\ndownstream CV task, (ii) evaluation in classical image communications, (iii)\nevaluation in image semantic communication systems, and (iv) evaluation in\nimage semantic communication systems with semantic attack. 
Experimental results\ndemonstrate that ViTScore is robust and efficient in evaluating the semantic\nsimilarity of images. Particularly, ViTScore outperforms the other 3 typical\nmetrics in evaluating the image semantic changes by semantic attack, such as\nimage inverse with Generative Adversarial Networks (GANs). This indicates that\nViTScore is an effective performance metric when deployed in SC scenarios.\n","authors":["Tingting Zhu","Bo Peng","Jifan Liang","Tingchen Han","Hai Wan","Jingqiao Fu","Junjie Chen"],"pdf_url":"https://arxiv.org/pdf/2309.04891v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03641v2","updated":"2024-04-21T03:39:55Z","published":"2023-09-07T11:21:10Z","title":"Spiking Structured State Space Model for Monaural Speech Enhancement","summary":" Speech enhancement seeks to extract clean speech from noisy signals.\nTraditional deep learning methods face two challenges: efficiently using\ninformation in long speech sequences and high computational costs. To address\nthese, we introduce the Spiking Structured State Space Model (Spiking-S4). This\napproach merges the energy efficiency of Spiking Neural Networks (SNN) with the\nlong-range sequence modeling capabilities of Structured State Space Models\n(S4), offering a compelling solution. Evaluation on the DNS Challenge and\nVoiceBank+Demand Datasets confirms that Spiking-S4 rivals existing Artificial\nNeural Network (ANN) methods but with fewer computational resources, as\nevidenced by reduced parameters and Floating Point Operations (FLOPs).\n","authors":["Yu Du","Xu Liu","Yansong Chua"],"pdf_url":"https://arxiv.org/pdf/2309.03641v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11395v3","updated":"2024-04-21T03:26:27Z","published":"2024-01-21T04:13:58Z","title":"UniM-OV3D: Uni-Modality Open-Vocabulary 3D Scene Understanding with\n Fine-Grained Feature Representation","summary":" 3D open-vocabulary scene understanding aims to recognize arbitrary novel\ncategories beyond the base label space. However, existing works not only fail\nto fully utilize all the available modal information in the 3D domain but also\nlack sufficient granularity in representing the features of each modality. In\nthis paper, we propose a unified multimodal 3D open-vocabulary scene\nunderstanding network, namely UniM-OV3D, which aligns point clouds with image,\nlanguage and depth. To better integrate global and local features of the point\nclouds, we design a hierarchical point cloud feature extraction module that\nlearns comprehensive fine-grained feature representations. Further, to\nfacilitate the learning of coarse-to-fine point-semantic representations from\ncaptions, we propose the utilization of hierarchical 3D caption pairs,\ncapitalizing on geometric constraints across various viewpoints of 3D scenes.\nExtensive experimental results demonstrate the effectiveness and superiority of\nour method in open-vocabulary semantic and instance segmentation, which\nachieves state-of-the-art performance on both indoor and outdoor benchmarks\nsuch as ScanNet, ScanNet200, S3IDS and nuScenes. 
Code is available at\nhttps://github.com/hithqd/UniM-OV3D.\n","authors":["Qingdong He","Jinlong Peng","Zhengkai Jiang","Kai Wu","Xiaozhong Ji","Jiangning Zhang","Yabiao Wang","Chengjie Wang","Mingang Chen","Yunsheng Wu"],"pdf_url":"https://arxiv.org/pdf/2401.11395v3.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2404.10163v2","updated":"2024-04-21T03:17:23Z","published":"2024-04-15T22:26:27Z","title":"EyeFormer: Predicting Personalized Scanpaths with Transformer-Guided\n Reinforcement Learning","summary":" From a visual perception perspective, modern graphical user interfaces (GUIs)\ncomprise a complex graphics-rich two-dimensional visuospatial arrangement of\ntext, images, and interactive objects such as buttons and menus. While existing\nmodels can accurately predict regions and objects that are likely to attract\nattention ``on average'', so far there is no scanpath model capable of\npredicting scanpaths for an individual. To close this gap, we introduce\nEyeFormer, which leverages a Transformer architecture as a policy network to\nguide a deep reinforcement learning algorithm that controls gaze locations. Our\nmodel has the unique capability of producing personalized predictions when\ngiven a few user scanpath samples. It can predict full scanpath information,\nincluding fixation positions and duration, across individuals and various\nstimulus types. Additionally, we demonstrate applications in GUI layout\noptimization driven by our model. Our software and models will be publicly\navailable.\n","authors":["Yue Jiang","Zixin Guo","Hamed Rezazadegan Tavakoli","Luis A. Leiva","Antti Oulasvirta"],"pdf_url":"https://arxiv.org/pdf/2404.10163v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09457v3","updated":"2024-04-21T03:06:09Z","published":"2023-10-14T00:32:11Z","title":"UCM-Net: A Lightweight and Efficient Solution for Skin Lesion\n Segmentation using MLP and CNN","summary":" Skin cancer poses a significant public health challenge, necessitating\nefficient diagnostic tools. We introduce UCM-Net, a novel skin lesion\nsegmentation model combining Multi-Layer Perceptrons (MLP) and Convolutional\nNeural Networks (CNN). This lightweight, efficient architecture, deviating from\ntraditional UNet designs, dramatically reduces computational demands, making it\nideal for mobile health applications. Evaluated on PH2, ISIC 2017, and ISIC\n2018 datasets, UCM-Net demonstrates robust performance with fewer than 50KB\nparameters and requires less than 0.05 Giga Operations Per Second (GLOPs).\nMoreover, its minimal memory requirement is just 1.19MB in CPU environment\npositions. It is a potential benchmark for efficiency in skin lesion\nsegmentation, suitable for deployment in resource-constrained settings. In\norder to facilitate accessibility and further research in the field, the\nUCM-Net source code is https://github.com/chunyuyuan/UCM-Net.\n","authors":["Chunyu Yuan","Dongfang Zhao","Sos S. Agaian"],"pdf_url":"https://arxiv.org/pdf/2310.09457v3.pdf","comment":"17 pages, under review"},{"id":"http://arxiv.org/abs/2403.05146v2","updated":"2024-04-21T02:44:55Z","published":"2024-03-08T08:31:46Z","title":"Motion-Guided Dual-Camera Tracker for Low-Cost Skill Evaluation of\n Gastric Endoscopy","summary":" Gastric simulators with objective educational feedback have been proven\nuseful for endoscopy training. Existing electronic simulators with feedback are\nhowever not commonly adopted due to their high cost. 
In this work, a\nmotion-guided dual-camera tracker is proposed to provide reliable endoscope tip\nposition feedback at a low cost inside a mechanical simulator for endoscopy\nskill evaluation, tackling several unique challenges. To address the issue of\nsignificant appearance variation of the endoscope tip while keeping dual-camera\ntracking consistency, the cross-camera mutual template strategy (CMT) is\nproposed to introduce dynamic transient mutual templates to dual-camera\ntracking. To alleviate disturbance from large occlusion and distortion by the\nlight source from the endoscope tip, the Mamba-based motion-guided prediction\nhead (MMH) is presented to aggregate historical motion with visual tracking. It\nis the first application of Mamba for object tracking. The proposed tracker was\nevaluated on datasets captured by low-cost camera pairs during endoscopy\nprocedures performed inside the mechanical simulator. The tracker achieves SOTA\nperformance with robust and consistent tracking on dual cameras. Further\ndownstream evaluation proves that the 3D tip position determined by the\nproposed tracker enables reliable skill differentiation. The code and dataset\nare available at https://github.com/PieceZhang/MotionDCTrack\n","authors":["Yuelin Zhang","Wanquan Yan","Kim Yan","Chun Ping Lam","Yufu Qiu","Pengyu Zheng","Raymond Shing-Yan Tang","Shing Shin Cheng"],"pdf_url":"https://arxiv.org/pdf/2403.05146v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13505v1","updated":"2024-04-21T02:21:30Z","published":"2024-04-21T02:21:30Z","title":"Dynamic in Static: Hybrid Visual Correspondence for Self-Supervised\n Video Object Segmentation","summary":" Conventional video object segmentation (VOS) methods usually necessitate a\nsubstantial volume of pixel-level annotated video data for fully supervised\nlearning. In this paper, we present HVC, a \\textbf{h}ybrid static-dynamic\n\\textbf{v}isual \\textbf{c}orrespondence framework for self-supervised VOS. HVC\nextracts pseudo-dynamic signals from static images, enabling an efficient and\nscalable VOS model. Our approach utilizes a minimalist fully-convolutional\narchitecture to capture static-dynamic visual correspondence in image-cropped\nviews. To achieve this objective, we present a unified self-supervised approach\nto learn visual representations of static-dynamic feature similarity. Firstly,\nwe establish static correspondence by utilizing a priori coordinate information\nbetween cropped views to guide the formation of consistent static feature\nrepresentations. Subsequently, we devise a concise convolutional layer to\ncapture the forward / backward pseudo-dynamic signals between two views,\nserving as cues for dynamic representations. Finally, we propose a hybrid\nvisual correspondence loss to learn joint static and dynamic consistency\nrepresentations. Our approach, without bells and whistles, necessitates only\none training session using static image data, significantly reducing memory\nconsumption ($\\sim$16GB) and training time ($\\sim$\\textbf{2h}). 
Moreover, HVC\nachieves state-of-the-art performance in several self-supervised VOS benchmarks\nand additional video label propagation tasks.\n","authors":["Gensheng Pei","Yazhou Yao","Jianbo Jiao","Wenguan Wang","Liqiang Nie","Jinhui Tang"],"pdf_url":"https://arxiv.org/pdf/2404.13505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.03095v2","updated":"2024-04-21T00:39:30Z","published":"2022-11-30T15:55:40Z","title":"Interpretation of Neural Networks is Susceptible to Universal\n Adversarial Perturbations","summary":" Interpreting neural network classifiers using gradient-based saliency maps\nhas been extensively studied in the deep learning literature. While the\nexisting algorithms manage to achieve satisfactory performance in application\nto standard image recognition datasets, recent works demonstrate the\nvulnerability of widely-used gradient-based interpretation schemes to\nnorm-bounded perturbations adversarially designed for every individual input\nsample. However, such adversarial perturbations are commonly designed using the\nknowledge of an input sample, and hence perform sub-optimally in application to\nan unknown or constantly changing data point. In this paper, we show the\nexistence of a Universal Perturbation for Interpretation (UPI) for standard\nimage datasets, which can alter a gradient-based feature map of neural networks\nover a significant fraction of test samples. To design such a UPI, we propose a\ngradient-based optimization method as well as a principal component analysis\n(PCA)-based approach to compute a UPI which can effectively alter a neural\nnetwork's gradient-based interpretation on different samples. We support the\nproposed UPI approaches by presenting several numerical results of their\nsuccessful applications to standard image datasets.\n","authors":["Haniyeh Ehsani Oskouie","Farzan Farnia"],"pdf_url":"https://arxiv.org/pdf/2212.03095v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13493v1","updated":"2024-04-21T00:14:03Z","published":"2024-04-21T00:14:03Z","title":"Authentic Emotion Mapping: Benchmarking Facial Expressions in Real News","summary":" In this paper, we present a novel benchmark for Emotion Recognition using\nfacial landmarks extracted from realistic news videos. Traditional methods\nrelying on RGB images are resource-intensive, whereas our approach with Facial\nLandmark Emotion Recognition (FLER) offers a simplified yet effective\nalternative. By leveraging Graph Neural Networks (GNNs) to analyze the\ngeometric and spatial relationships of facial landmarks, our method enhances\nthe understanding and accuracy of emotion recognition. We discuss the\nadvancements and challenges in deep learning techniques for emotion\nrecognition, particularly focusing on Graph Neural Networks (GNNs) and\nTransformers. Our experimental results demonstrate the viability and potential\nof our dataset as a benchmark, setting a new direction for future research in\nemotion recognition technologies. 
The codes and models are at:\nhttps://github.com/wangzhifengharrison/benchmark_real_news\n","authors":["Qixuan Zhang","Zhifeng Wang","Yang Liu","Zhenyue Qin","Kaihao Zhang","Sabrina Caldwell","Tom Gedeon"],"pdf_url":"https://arxiv.org/pdf/2404.13493v1.pdf","comment":null}]},"2024-04-20T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2205.06265v3","updated":"2024-04-20T23:40:53Z","published":"2022-05-12T17:59:56Z","title":"ELODI: Ensemble Logit Difference Inhibition for Positive-Congruent\n Training","summary":" Negative flips are errors introduced in a classification system when a legacy\nmodel is updated. Existing methods to reduce the negative flip rate (NFR)\neither do so at the expense of overall accuracy by forcing a new model to\nimitate the old models, or use ensembles, which multiply inference cost\nprohibitively. We analyze the role of ensembles in reducing NFR and observe\nthat they remove negative flips that are typically not close to the decision\nboundary, but often exhibit large deviations in the distance among their\nlogits. Based on the observation, we present a method, called Ensemble Logit\nDifference Inhibition (ELODI), to train a classification system that achieves\nparagon performance in both error rate and NFR, at the inference cost of a\nsingle model. The method distills a homogeneous ensemble to a single student\nmodel which is used to update the classification system. ELODI also introduces\na generalized distillation objective, Logit Difference Inhibition (LDI), which\nonly penalizes the logit difference of a subset of classes with the highest\nlogit values. On multiple image classification benchmarks, model updates with\nELODI demonstrate superior accuracy retention and NFR reduction.\n","authors":["Yue Zhao","Yantao Shen","Yuanjun Xiong","Shuo Yang","Wei Xia","Zhuowen Tu","Bernt Schiele","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2205.06265v3.pdf","comment":"Accepted as a Regular Paper in TPAMI. Code is at\n https://github.com/amazon-science/regression-constraint-model-upgrade"},{"id":"http://arxiv.org/abs/2403.16967v3","updated":"2024-04-20T23:22:02Z","published":"2024-03-25T17:26:08Z","title":"Visual Whole-Body Control for Legged Loco-Manipulation","summary":" We study the problem of mobile manipulation using legged robots equipped with\nan arm, namely legged loco-manipulation. The robot legs, while usually utilized\nfor mobility, offer an opportunity to amplify the manipulation capabilities by\nconducting whole-body control. That is, the robot can control the legs and the\narm at the same time to extend its workspace. We propose a framework that can\nconduct the whole-body control autonomously with visual observations. Our\napproach, namely Visual Whole-Body Control(VBC), is composed of a low-level\npolicy using all degrees of freedom to track the end-effector manipulator\nposition and a high-level policy proposing the end-effector position based on\nvisual inputs. We train both levels of policies in simulation and perform\nSim2Real transfer for real robot deployment. We perform extensive experiments\nand show significant improvements over baselines in picking up diverse objects\nin different configurations (heights, locations, orientations) and\nenvironments. Project page: https://wholebody-b1.github.io\n","authors":["Minghuan Liu","Zixuan Chen","Xuxin Cheng","Yandong Ji","Rizhao Qiu","Ruihan Yang","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16967v3.pdf","comment":"Add more details. 
The first two authors contribute equally. Project\n page: https://wholebody-b1.github.io"},{"id":"http://arxiv.org/abs/2310.10352v3","updated":"2024-04-20T23:18:59Z","published":"2023-10-16T12:42:43Z","title":"Semi-Supervised Crowd Counting with Contextual Modeling: Facilitating\n Holistic Understanding of Crowd Scenes","summary":" To alleviate the heavy annotation burden for training a reliable crowd\ncounting model and thus make the model more practicable and accurate by being\nable to benefit from more data, this paper presents a new semi-supervised\nmethod based on the mean teacher framework. When there is a scarcity of labeled\ndata available, the model is prone to overfit local patches. Within such\ncontexts, the conventional approach of solely improving the accuracy of local\npatch predictions through unlabeled data proves inadequate. Consequently, we\npropose a more nuanced approach: fostering the model's intrinsic 'subitizing'\ncapability. This ability allows the model to accurately estimate the count in\nregions by leveraging its understanding of the crowd scenes, mirroring the\nhuman cognitive process. To achieve this goal, we apply masking on unlabeled\ndata, guiding the model to make predictions for these masked patches based on\nthe holistic cues. Furthermore, to help with feature learning, herein we\nincorporate a fine-grained density classification task. Our method is general\nand applicable to most existing crowd counting methods as it doesn't have\nstrict structural or loss constraints. In addition, we observe that the model\ntrained with our framework exhibits a 'subitizing'-like behavior. It accurately\npredicts low-density regions with only a 'glance', while incorporating local\ndetails to predict high-density regions. Our method achieves the\nstate-of-the-art performance, surpassing previous approaches by a large margin\non challenging benchmarks such as ShanghaiTech A and UCF-QNRF. The code is\navailable at: https://github.com/cha15yq/MRC-Crowd.\n","authors":["Yifei Qian","Xiaopeng Hong","Zhongliang Guo","Ognjen Arandjelović","Carl R. Donovan"],"pdf_url":"https://arxiv.org/pdf/2310.10352v3.pdf","comment":"Accepted by TCSVT"},{"id":"http://arxiv.org/abs/2404.13484v1","updated":"2024-04-20T23:02:57Z","published":"2024-04-20T23:02:57Z","title":"Joint Quality Assessment and Example-Guided Image Processing by\n Disentangling Picture Appearance from Content","summary":" The deep learning revolution has strongly impacted low-level image processing\ntasks such as style/domain transfer, enhancement/restoration, and visual\nquality assessments. Despite often being treated separately, the aforementioned\ntasks share a common theme of understanding, editing, or enhancing the\nappearance of input images without modifying the underlying content. We\nleverage this observation to develop a novel disentangled representation\nlearning method that decomposes inputs into content and appearance features.\nThe model is trained in a self-supervised manner and we use the learned\nfeatures to develop a new quality prediction model named DisQUE. We demonstrate\nthrough extensive evaluations that DisQUE achieves state-of-the-art accuracy\nacross quality prediction tasks and distortion types. Moreover, we demonstrate\nthat the same features may also be used for image processing tasks such as HDR\ntone mapping, where the desired output characteristics may be tuned using\nexample input-output pairs.\n","authors":["Abhinau K. 
Venkataramanan","Cosmin Stejerean","Ioannis Katsavounidis","Hassene Tmar","Alan C. Bovik"],"pdf_url":"https://arxiv.org/pdf/2404.13484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13478v1","updated":"2024-04-20T22:16:56Z","published":"2024-04-20T22:16:56Z","title":"Deep SE(3)-Equivariant Geometric Reasoning for Precise Placement Tasks","summary":" Many robot manipulation tasks can be framed as geometric reasoning tasks,\nwhere an agent must be able to precisely manipulate an object into a position\nthat satisfies the task from a set of initial conditions. Often, task success\nis defined based on the relationship between two objects - for instance,\nhanging a mug on a rack. In such cases, the solution should be equivariant to\nthe initial position of the objects as well as the agent, and invariant to the\npose of the camera. This poses a challenge for learning systems which attempt\nto solve this task by learning directly from high-dimensional demonstrations:\nthe agent must learn to be both equivariant as well as precise, which can be\nchallenging without any inductive biases about the problem. In this work, we\npropose a method for precise relative pose prediction which is provably\nSE(3)-equivariant, can be learned from only a few demonstrations, and can\ngeneralize across variations in a class of objects. We accomplish this by\nfactoring the problem into learning an SE(3) invariant task-specific\nrepresentation of the scene and then interpreting this representation with\nnovel geometric reasoning layers which are provably SE(3) equivariant. We\ndemonstrate that our method can yield substantially more precise placement\npredictions in simulated placement tasks than previous methods trained with the\nsame amount of data, and can accurately represent relative placement\nrelationships data collected from real-world demonstrations. Supplementary\ninformation and videos can be found at\nhttps://sites.google.com/view/reldist-iclr-2023.\n","authors":["Ben Eisner","Yi Yang","Todor Davchev","Mel Vecerik","Jonathan Scholz","David Held"],"pdf_url":"https://arxiv.org/pdf/2404.13478v1.pdf","comment":"Published at International Conference on Representation Learning\n (ICLR 2024)"},{"id":"http://arxiv.org/abs/2404.06605v2","updated":"2024-04-20T22:10:37Z","published":"2024-04-09T20:24:29Z","title":"RoadBEV: Road Surface Reconstruction in Bird's Eye View","summary":" Road surface conditions, especially geometry profiles, enormously affect\ndriving performance of autonomous vehicles. Vision-based online road\nreconstruction promisingly captures road information in advance. Existing\nsolutions like monocular depth estimation and stereo matching suffer from\nmodest performance. The recent technique of Bird's-Eye-View (BEV) perception\nprovides immense potential to more reliable and accurate reconstruction. This\npaper uniformly proposes two simple yet effective models for road elevation\nreconstruction in BEV named RoadBEV-mono and RoadBEV-stereo, which estimate\nroad elevation with monocular and stereo images, respectively. The former\ndirectly fits elevation values based on voxel features queried from image view,\nwhile the latter efficiently recognizes road elevation patterns based on BEV\nvolume representing discrepancy between left and right voxel features.\nInsightful analyses reveal their consistence and difference with perspective\nview. Experiments on real-world dataset verify the models' effectiveness and\nsuperiority. 
Elevation errors of RoadBEV-mono and RoadBEV-stereo achieve 1.83cm\nand 0.50cm, respectively. The estimation performance improves by 50\\% in BEV\nbased on monocular image. Our models are promising for practical applications,\nproviding valuable references for vision-based BEV perception in autonomous\ndriving. The code is released at https://github.com/ztsrxh/RoadBEV.\n","authors":["Tong Zhao","Lei Yang","Yichen Xie","Mingyu Ding","Masayoshi Tomizuka","Yintao Wei"],"pdf_url":"https://arxiv.org/pdf/2404.06605v2.pdf","comment":"Dataset page: https://thu-rsxd.com/rsrd Code:\n https://github.com/ztsrxh/RoadBEV"},{"id":"http://arxiv.org/abs/2312.14494v2","updated":"2024-04-20T22:00:41Z","published":"2023-12-22T07:42:00Z","title":"Revisiting Few-Shot Object Detection with Vision-Language Models","summary":" Few-shot object detection (FSOD) benchmarks have advanced techniques for\ndetecting new categories with limited annotations. Existing benchmarks\nrepurpose well-established datasets like COCO by partitioning categories into\nbase and novel classes for pre-training and fine-tuning respectively. However,\nthese benchmarks do not reflect how FSOD is deployed in practice. Rather than\nonly pre-training on a small number of base categories, we argue that it is\nmore practical to fine-tune a foundation model (e.g., a vision-language model\n(VLM) pre-trained on web-scale data) for a target domain. Surprisingly, we find\nthat zero-shot inference from VLMs like GroundingDINO significantly outperforms\nthe state-of-the-art (48.3 vs. 33.1 AP) on COCO. However, such zero-shot models\ncan still be misaligned to target concepts of interest. For example, trailers\non the web may be different from trailers in the context of autonomous\nvehicles. In this work, we propose Foundational FSOD, a new benchmark protocol\nthat evaluates detectors pre-trained on any external datasets and fine-tuned on\nK-shots per target class. Further, we note that current FSOD benchmarks are\nactually federated datasets containing exhaustive annotations for each category\non a subset of the data. We leverage this insight to propose simple strategies\nfor fine-tuning VLMs with federated losses. We demonstrate the effectiveness of\nour approach on LVIS and nuImages, improving over prior work by 5.9 AP. Our\ncode is available at https://github.com/anishmadan23/foundational_fsod\n","authors":["Anish Madan","Neehar Peri","Shu Kong","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2312.14494v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13474v1","updated":"2024-04-20T21:51:15Z","published":"2024-04-20T21:51:15Z","title":"Composing Pre-Trained Object-Centric Representations for Robotics From\n \"What\" and \"Where\" Foundation Models","summary":" There have recently been large advances both in pre-training visual\nrepresentations for robotic control and segmenting unknown category objects in\ngeneral images. To leverage these for improved robot learning, we propose\n$\\textbf{POCR}$, a new framework for building pre-trained object-centric\nrepresentations for robotic control. Building on theories of \"what-where\"\nrepresentations in psychology and computer vision, we use segmentations from a\npre-trained model to stably locate across timesteps, various entities in the\nscene, capturing \"where\" information. To each such segmented entity, we apply\nother pre-trained models that build vector descriptions suitable for robotic\ncontrol tasks, thus capturing \"what\" the entity is. 
Thus, our pre-trained\nobject-centric representations for control are constructed by appropriately\ncombining the outputs of off-the-shelf pre-trained models, with no new\ntraining. On various simulated and real robotic tasks, we show that imitation\npolicies for robotic manipulators trained on POCR achieve better performance\nand systematic generalization than state of the art pre-trained representations\nfor robotics, as well as prior object-centric representations that are\ntypically trained from scratch.\n","authors":["Junyao Shi","Jianing Qian","Yecheng Jason Ma","Dinesh Jayaraman"],"pdf_url":"https://arxiv.org/pdf/2404.13474v1.pdf","comment":"ICRA 2024. Project website: https://sites.google.com/view/pocr"},{"id":"http://arxiv.org/abs/2312.02914v4","updated":"2024-04-20T21:28:24Z","published":"2023-12-05T17:39:19Z","title":"Unsupervised Video Domain Adaptation with Masked Pre-Training and\n Collaborative Self-Training","summary":" In this work, we tackle the problem of unsupervised domain adaptation (UDA)\nfor video action recognition. Our approach, which we call UNITE, uses an image\nteacher model to adapt a video student model to the target domain. UNITE first\nemploys self-supervised pre-training to promote discriminative feature learning\non target domain videos using a teacher-guided masked distillation objective.\nWe then perform self-training on masked target data, using the video student\nmodel and image teacher model together to generate improved pseudolabels for\nunlabeled target videos. Our self-training process successfully leverages the\nstrengths of both models to achieve strong transfer performance across domains.\nWe evaluate our approach on multiple video domain adaptation benchmarks and\nobserve significant improvements upon previously reported results.\n","authors":["Arun Reddy","William Paul","Corban Rivera","Ketul Shah","Celso M. de Melo","Rama Chellappa"],"pdf_url":"https://arxiv.org/pdf/2312.02914v4.pdf","comment":"Accepted at CVPR 2024. 13 pages, 4 figures. Approved for public\n release: distribution unlimited"},{"id":"http://arxiv.org/abs/2401.17484v3","updated":"2024-04-20T21:14:15Z","published":"2024-01-30T22:37:24Z","title":"Pixel to Elevation: Learning to Predict Elevation Maps at Long Range\n using Images for Autonomous Offroad Navigation","summary":" Understanding terrain topology at long-range is crucial for the success of\noff-road robotic missions, especially when navigating at high-speeds. LiDAR\nsensors, which are currently heavily relied upon for geometric mapping, provide\nsparse measurements when mapping at greater distances. To address this\nchallenge, we present a novel learning-based approach capable of predicting\nterrain elevation maps at long-range using only onboard egocentric images in\nreal-time. Our proposed method is comprised of three main elements. First, a\ntransformer-based encoder is introduced that learns cross-view associations\nbetween the egocentric views and prior bird-eye-view elevation map predictions.\nSecond, an orientation-aware positional encoding is proposed to incorporate the\n3D vehicle pose information over complex unstructured terrain with multi-view\nvisual image features. Lastly, a history-augmented learn-able map embedding is\nproposed to achieve better temporal consistency between elevation map\npredictions to facilitate the downstream navigational tasks. 
We experimentally\nvalidate the applicability of our proposed approach for autonomous offroad\nrobotic navigation in complex and unstructured terrain using real-world offroad\ndriving data. Furthermore, the method is qualitatively and quantitatively\ncompared against the current state-of-the-art methods. Extensive field\nexperiments demonstrate that our method surpasses baseline models in accurately\npredicting terrain elevation while effectively capturing the overall terrain\ntopology at long-ranges. Finally, ablation studies are conducted to highlight\nand understand the effect of key components of the proposed approach and\nvalidate their suitability to improve offroad robotic navigation capabilities.\n","authors":["Chanyoung Chung","Georgios Georgakis","Patrick Spieler","Curtis Padgett","Ali Agha","Shehryar Khattak"],"pdf_url":"https://arxiv.org/pdf/2401.17484v3.pdf","comment":"8 pages, 6 figures, Accepted in IEEE Robotics and Automation Letters\n (RA-L)"},{"id":"http://arxiv.org/abs/2403.15977v3","updated":"2024-04-20T20:19:11Z","published":"2024-03-24T01:20:08Z","title":"Towards Two-Stream Foveation-based Active Vision Learning","summary":" Deep neural network (DNN) based machine perception frameworks process the\nentire input in a one-shot manner to provide answers to both \"what object is\nbeing observed\" and \"where it is located\". In contrast, the \"two-stream\nhypothesis\" from neuroscience explains the neural processing in the human\nvisual cortex as an active vision system that utilizes two separate regions of\nthe brain to answer the what and the where questions. In this work, we propose\na machine learning framework inspired by the \"two-stream hypothesis\" and\nexplore the potential benefits that it offers. Specifically, the proposed\nframework models the following mechanisms: 1) ventral (what) stream focusing on\nthe input regions perceived by the fovea part of an eye (foveation), 2) dorsal\n(where) stream providing visual guidance, and 3) iterative processing of the\ntwo streams to calibrate visual focus and process the sequence of focused image\npatches. The training of the proposed framework is accomplished by label-based\nDNN training for the ventral stream model and reinforcement learning for the\ndorsal stream model. We show that the two-stream foveation-based learning is\napplicable to the challenging task of weakly-supervised object localization\n(WSOL), where the training data is limited to the object class or its\nattributes. The framework is capable of both predicting the properties of an\nobject and successfully localizing it by predicting its bounding box. We also\nshow that, due to the independent nature of the two streams, the dorsal model\ncan be applied on its own to unseen images to localize objects from different\ndatasets.\n","authors":["Timur Ibrayev","Amitangshu Mukherjee","Sai Aparna Aketi","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2403.15977v3.pdf","comment":"Accepted version of the article, 18 pages, 14 figures"},{"id":"http://arxiv.org/abs/2311.17241v2","updated":"2024-04-20T19:30:38Z","published":"2023-11-28T21:31:04Z","title":"End-to-End Temporal Action Detection with 1B Parameters Across 1000\n Frames","summary":" Recently, temporal action detection (TAD) has seen significant performance\nimprovement with end-to-end training. However, due to the memory bottleneck,\nonly models with limited scales and limited data volumes can afford end-to-end\ntraining, which inevitably restricts TAD performance. 
In this paper, we reduce\nthe memory consumption for end-to-end training, and manage to scale up the TAD\nbackbone to 1 billion parameters and the input video to 1,536 frames, leading\nto significant detection performance. The key to our approach lies in our\nproposed temporal-informative adapter (TIA), which is a novel lightweight\nmodule that reduces training memory. Using TIA, we free the humongous backbone\nfrom learning to adapt to the TAD task by only updating the parameters in TIA.\nTIA also leads to better TAD representation by temporally aggregating context\nfrom adjacent frames throughout the backbone. We evaluate our model across four\nrepresentative datasets. Owing to our efficient design, we are able to train\nend-to-end on VideoMAEv2-giant and achieve 75.4% mAP on THUMOS14, being the\nfirst end-to-end model to outperform the best feature-based methods. Code is\navailable at https://github.com/sming256/AdaTAD.\n","authors":["Shuming Liu","Chen-Lin Zhang","Chen Zhao","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2311.17241v2.pdf","comment":"Accepted to CVPR 2024. Camera-Ready Version"},{"id":"http://arxiv.org/abs/2404.13452v1","updated":"2024-04-20T19:29:51Z","published":"2024-04-20T19:29:51Z","title":"Cut-FUNQUE: An Objective Quality Model for Compressed Tone-Mapped High\n Dynamic Range Videos","summary":" High Dynamic Range (HDR) videos have enjoyed a surge in popularity in recent\nyears due to their ability to represent a wider range of contrast and color\nthan Standard Dynamic Range (SDR) videos. Although HDR video capture has seen\nincreasing popularity because of recent flagship mobile phones such as Apple\niPhones, Google Pixels, and Samsung Galaxy phones, a broad swath of consumers\nstill utilize legacy SDR displays that are unable to display HDR videos. As\nresult, HDR videos must be processed, i.e., tone-mapped, before streaming to a\nlarge section of SDR-capable video consumers. However, server-side tone-mapping\ninvolves automating decisions regarding the choices of tone-mapping operators\n(TMOs) and their parameters to yield high-fidelity outputs. Moreover, these\nchoices must be balanced against the effects of lossy compression, which is\nubiquitous in streaming scenarios. In this work, we develop a novel, efficient\nmodel of objective video quality named Cut-FUNQUE that is able to accurately\npredict the visual quality of tone-mapped and compressed HDR videos. Finally,\nwe evaluate Cut-FUNQUE on a large-scale crowdsourced database of such videos\nand show that it achieves state-of-the-art accuracy.\n","authors":["Abhinau K. Venkataramanan","Cosmin Stejerean","Ioannis Katsavounidis","Hassene Tmar","Alan C. Bovik"],"pdf_url":"https://arxiv.org/pdf/2404.13452v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13449v1","updated":"2024-04-20T19:17:40Z","published":"2024-04-20T19:17:40Z","title":"SiNC+: Adaptive Camera-Based Vitals with Unsupervised Learning of\n Periodic Signals","summary":" Subtle periodic signals, such as blood volume pulse and respiration, can be\nextracted from RGB video, enabling noncontact health monitoring at low cost.\nAdvancements in remote pulse estimation -- or remote photoplethysmography\n(rPPG) -- are currently driven by deep learning solutions. However, modern\napproaches are trained and evaluated on benchmark datasets with ground truth\nfrom contact-PPG sensors. We present the first non-contrastive unsupervised\nlearning framework for signal regression to mitigate the need for labelled\nvideo data. 
With minimal assumptions of periodicity and finite bandwidth, our\napproach discovers the blood volume pulse directly from unlabelled videos. We\nfind that encouraging sparse power spectra within normal physiological\nbandlimits and variance over batches of power spectra is sufficient for\nlearning visual features of periodic signals. We perform the first experiments\nutilizing unlabelled video data not specifically created for rPPG to train\nrobust pulse rate estimators. Given the limited inductive biases, we\nsuccessfully applied the same approach to camera-based respiration by changing\nthe bandlimits of the target signal. This shows that the approach is general\nenough for unsupervised learning of bandlimited quasi-periodic signals from\ndifferent domains. Furthermore, we show that the framework is effective for\nfinetuning models on unlabelled video from a single subject, allowing for\npersonalized and adaptive signal regressors.\n","authors":["Jeremy Speth","Nathan Vance","Patrick Flynn","Adam Czajka"],"pdf_url":"https://arxiv.org/pdf/2404.13449v1.pdf","comment":"Extension of CVPR2023 highlight paper. arXiv admin note: substantial\n text overlap with arXiv:2303.07944"},{"id":"http://arxiv.org/abs/2404.13445v1","updated":"2024-04-20T18:52:51Z","published":"2024-04-20T18:52:51Z","title":"DMesh: A Differentiable Representation for General Meshes","summary":" We present a differentiable representation, DMesh, for general 3D triangular\nmeshes. DMesh considers both the geometry and connectivity information of a\nmesh. In our design, we first get a set of convex tetrahedra that compactly\ntessellates the domain based on Weighted Delaunay Triangulation (WDT), and\nformulate probability of faces to exist on our desired mesh in a differentiable\nmanner based on the WDT. This enables DMesh to represent meshes of various\ntopology in a differentiable way, and allows us to reconstruct the mesh under\nvarious observations, such as point cloud and multi-view images using\ngradient-based optimization. The source code and full paper is available at:\nhttps://sonsang.github.io/dmesh-project.\n","authors":["Sanghyun Son","Matheus Gadelha","Yang Zhou","Zexiang Xu","Ming C. Lin","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.13445v1.pdf","comment":"17 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.13443v1","updated":"2024-04-20T18:50:57Z","published":"2024-04-20T18:50:57Z","title":"FisheyeDetNet: Object Detection on Fisheye Surround View Camera Systems\n for Automated Driving","summary":" Object detection is a mature problem in autonomous driving with pedestrian\ndetection being one of the first deployed algorithms. It has been\ncomprehensively studied in the literature. However, object detection is\nrelatively less explored for fisheye cameras used for surround-view near field\nsensing. The standard bounding box representation fails in fisheye cameras due\nto heavy radial distortion, particularly in the periphery. To mitigate this, we\nexplore extending the standard object detection output representation of\nbounding box. We design rotated bounding boxes, ellipse, generic polygon as\npolar arc/angle representations and define an instance segmentation mIOU metric\nto analyze these representations. The proposed model FisheyeDetNet with polygon\noutperforms others and achieves a mAP score of 49.5 % on Valeo fisheye\nsurround-view dataset for automated driving applications. This dataset has 60K\nimages captured from 4 surround-view cameras across Europe, North America and\nAsia. 
To the best of our knowledge, this is the first detailed study on object\ndetection on fisheye cameras for autonomous driving scenarios.\n","authors":["Ganesh Sistu","Senthil Yogamani"],"pdf_url":"https://arxiv.org/pdf/2404.13443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11156v2","updated":"2024-04-20T18:10:34Z","published":"2024-04-17T08:09:25Z","title":"Learning SO(3)-Invariant Semantic Correspondence via Local Shape\n Transform","summary":" Establishing accurate 3D correspondences between shapes stands as a pivotal\nchallenge with profound implications for computer vision and robotics. However,\nexisting self-supervised methods for this problem assume perfect input shape\nalignment, restricting their real-world applicability. In this work, we\nintroduce a novel self-supervised Rotation-Invariant 3D correspondence learner\nwith Local Shape Transform, dubbed RIST, that learns to establish dense\ncorrespondences between shapes even under challenging intra-class variations\nand arbitrary orientations. Specifically, RIST learns to dynamically formulate\nan SO(3)-invariant local shape transform for each point, which maps the\nSO(3)-equivariant global shape descriptor of the input shape to a local shape\ndescriptor. These local shape descriptors are provided as inputs to our decoder\nto facilitate point cloud self- and cross-reconstruction. Our proposed\nself-supervised training pipeline encourages semantically corresponding points\nfrom different shapes to be mapped to similar local shape descriptors, enabling\nRIST to establish dense point-wise correspondences. RIST demonstrates\nstate-of-the-art performances on 3D part label transfer and semantic keypoint\ntransfer given arbitrarily rotated point cloud pairs, outperforming existing\nmethods by significant margins.\n","authors":["Chunghyun Park","Seungwook Kim","Jaesik Park","Minsu Cho"],"pdf_url":"https://arxiv.org/pdf/2404.11156v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.13437v1","updated":"2024-04-20T18:06:26Z","published":"2024-04-20T18:06:26Z","title":"High-fidelity Endoscopic Image Synthesis by Utilizing Depth-guided\n Neural Surfaces","summary":" In surgical oncology, screening colonoscopy plays a pivotal role in providing\ndiagnostic assistance, such as biopsy, and facilitating surgical navigation,\nparticularly in polyp detection. Computer-assisted endoscopic surgery has\nrecently gained attention and amalgamated various 3D computer vision\ntechniques, including camera localization, depth estimation, surface\nreconstruction, etc. Neural Radiance Fields (NeRFs) and Neural Implicit\nSurfaces (NeuS) have emerged as promising methodologies for deriving accurate\n3D surface models from sets of registered images, addressing the limitations of\nexisting colon reconstruction approaches stemming from constrained camera\nmovement.\n However, the inadequate tissue texture representation and confused scale\nproblem in monocular colonoscopic image reconstruction still impede the\nprogress of the final rendering results. In this paper, we introduce a novel\nmethod for colon section reconstruction by leveraging NeuS applied to\nendoscopic images, supplemented by a single frame of depth map. Notably, we\npioneered the exploration of utilizing only one frame depth map in\nphotorealistic reconstruction and neural rendering applications while this\nsingle depth map can be easily obtainable from other monocular depth estimation\nnetworks with an object scale. 
Through rigorous experimentation and validation\non phantom imagery, our approach demonstrates exceptional accuracy in\ncompletely rendering colon sections, even capturing unseen portions of the\nsurface. This breakthrough opens avenues for achieving stable and consistently\nscaled reconstructions, promising enhanced quality in cancer screening\nprocedures and treatment interventions.\n","authors":["Baoru Huang","Yida Wang","Anh Nguyen","Daniel Elson","Francisco Vasconcelos","Danail Stoyanov"],"pdf_url":"https://arxiv.org/pdf/2404.13437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13434v1","updated":"2024-04-20T17:56:14Z","published":"2024-04-20T17:56:14Z","title":"Nested-TNT: Hierarchical Vision Transformers with Multi-Scale Feature\n Processing","summary":" Transformer has been applied in the field of computer vision due to its\nexcellent performance in natural language processing, surpassing traditional\nconvolutional neural networks and achieving new state-of-the-art. ViT divides\nan image into several local patches, known as \"visual sentences\". However, the\ninformation contained in the image is vast and complex, and focusing only on\nthe features at the \"visual sentence\" level is not enough. The features between\nlocal patches should also be taken into consideration. In order to achieve\nfurther improvement, the TNT model is proposed, whose algorithm further divides\nthe image into smaller patches, namely \"visual words,\" achieving more accurate\nresults. The core of Transformer is the Multi-Head Attention mechanism, and\ntraditional attention mechanisms ignore interactions across different attention\nheads. In order to reduce redundancy and improve utilization, we introduce the\nnested algorithm and apply the Nested-TNT to image classification tasks. The\nexperiment confirms that the proposed model has achieved better classification\nperformance over ViT and TNT, exceeding 2.25%, 1.1% on dataset CIFAR10 and\n2.78%, 0.25% on dataset FLOWERS102 respectively.\n","authors":["Yuang Liu","Zhiheng Qiu","Xiaokai Qin"],"pdf_url":"https://arxiv.org/pdf/2404.13434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13425v1","updated":"2024-04-20T17:19:54Z","published":"2024-04-20T17:19:54Z","title":"AdvLoRA: Adversarial Low-Rank Adaptation of Vision-Language Models","summary":" Vision-Language Models (VLMs) are a significant technique for Artificial\nGeneral Intelligence (AGI). With the fast growth of AGI, the security problem\nbecome one of the most important challenges for VLMs. In this paper, through\nextensive experiments, we demonstrate the vulnerability of the conventional\nadaptation methods for VLMs, which may bring significant security risks. In\naddition, as the size of the VLMs increases, performing conventional\nadversarial adaptation techniques on VLMs results in high computational costs.\nTo solve these problems, we propose a parameter-efficient\n\\underline{Adv}ersarial adaptation method named \\underline{AdvLoRA} by\n\\underline{Lo}w-\\underline{R}ank \\underline{A}daptation. At first, we\ninvestigate and reveal the intrinsic low-rank property during the adversarial\nadaptation for VLMs. Different from LoRA, we improve the efficiency and\nrobustness of adversarial adaptation by designing a novel reparameterizing\nmethod based on parameter clustering and parameter alignment. In addition, an\nadaptive parameter update strategy is proposed to further improve the\nrobustness. 
By these settings, our proposed AdvLoRA alleviates the model\nsecurity and high resource waste problems. Extensive experiments demonstrate\nthe effectiveness and efficiency of the AdvLoRA.\n","authors":["Yuheng Ji","Yue Liu","Zhicheng Zhang","Zhao Zhang","Yuting Zhao","Gang Zhou","Xingwei Zhang","Xinwang Liu","Xiaolong Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.13425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09498v2","updated":"2024-04-20T16:42:42Z","published":"2024-04-15T06:37:21Z","title":"FusionMamba: Dynamic Feature Enhancement for Multimodal Image Fusion\n with Mamba","summary":" Multi-modal image fusion aims to combine information from different modes to\ncreate a single image with comprehensive information and detailed textures.\nHowever, fusion models based on convolutional neural networks encounter\nlimitations in capturing global image features due to their focus on local\nconvolution operations. Transformer-based models, while excelling in global\nfeature modeling, confront computational challenges stemming from their\nquadratic complexity. Recently, the Selective Structured State Space Model has\nexhibited significant potential for long-range dependency modeling with linear\ncomplexity, offering a promising avenue to address the aforementioned dilemma.\nIn this paper, we propose FusionMamba, a novel dynamic feature enhancement\nmethod for multimodal image fusion with Mamba. Specifically, we devise an\nimproved efficient Mamba model for image fusion, integrating efficient visual\nstate space model with dynamic convolution and channel attention. This refined\nmodel not only upholds the performance of Mamba and global modeling capability\nbut also diminishes channel redundancy while enhancing local enhancement\ncapability. Additionally, we devise a dynamic feature fusion module (DFFM)\ncomprising two dynamic feature enhancement modules (DFEM) and a cross modality\nfusion mamba module (CMFM). The former serves for dynamic texture enhancement\nand dynamic difference perception, whereas the latter enhances correlation\nfeatures between modes and suppresses redundant intermodal information.\nFusionMamba has yielded state-of-the-art (SOTA) performance across various\nmultimodal medical image fusion tasks (CT-MRI, PET-MRI, SPECT-MRI), infrared\nand visible image fusion task (IR-VIS) and multimodal biomedical image fusion\ndataset (GFP-PC), which is proved that our model has generalization ability.\nThe code for FusionMamba is available at\nhttps://github.com/millieXie/FusionMamba.\n","authors":["Xinyu Xie","Yawen Cui","Chio-In Ieong","Tao Tan","Xiaozhi Zhang","Xubin Zheng","Zitong Yu"],"pdf_url":"https://arxiv.org/pdf/2404.09498v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13420v1","updated":"2024-04-20T16:36:24Z","published":"2024-04-20T16:36:24Z","title":"NeurCADRecon: Neural Representation for Reconstructing CAD Surfaces by\n Enforcing Zero Gaussian Curvature","summary":" Despite recent advances in reconstructing an organic model with the neural\nsigned distance function (SDF), the high-fidelity reconstruction of a CAD model\ndirectly from low-quality unoriented point clouds remains a significant\nchallenge. 
In this paper, we address this challenge based on the prior\nobservation that the surface of a CAD model is generally composed of piecewise\nsurface patches, each approximately developable even around the feature line.\nOur approach, named NeurCADRecon, is self-supervised, and its loss includes a\ndevelopability term to encourage the Gaussian curvature toward 0 while ensuring\nfidelity to the input points. Noticing that the Gaussian curvature is non-zero\nat tip points, we introduce a double-trough curve to tolerate the existence of\nthese tip points. Furthermore, we develop a dynamic sampling strategy to deal\nwith situations where the given points are incomplete or too sparse. Since our\nresulting neural SDFs can clearly manifest sharp feature points/lines, one can\neasily extract the feature-aligned triangle mesh from the SDF and then\ndecompose it into smooth surface patches, greatly reducing the difficulty of\nrecovering the parametric CAD design. A comprehensive comparison with existing\nstate-of-the-art methods shows the significant advantage of our approach in\nreconstructing faithful CAD shapes.\n","authors":["Qiujie Dong","Rui Xu","Pengfei Wang","Shuangmin Chen","Shiqing Xin","Xiaohong Jia","Wenping Wang","Changhe Tu"],"pdf_url":"https://arxiv.org/pdf/2404.13420v1.pdf","comment":"ACM Transactions on Graphics (SIGGRAPH 2024)"},{"id":"http://arxiv.org/abs/2403.10488v3","updated":"2024-04-20T16:24:44Z","published":"2024-03-15T17:23:38Z","title":"Joint Multimodal Transformer for Emotion Recognition in the Wild","summary":" Multimodal emotion recognition (MMER) systems typically outperform unimodal\nsystems by leveraging the inter- and intra-modal relationships between, e.g.,\nvisual, textual, physiological, and auditory modalities. This paper proposes an\nMMER method that relies on a joint multimodal transformer (JMT) for fusion with\nkey-based cross-attention. This framework can exploit the complementary nature\nof diverse modalities to improve predictive accuracy. Separate backbones\ncapture intra-modal spatiotemporal dependencies within each modality over video\nsequences. Subsequently, our JMT fusion architecture integrates the individual\nmodality embeddings, allowing the model to effectively capture inter- and\nintra-modal relationships. Extensive experiments on two challenging expression\nrecognition tasks -- (1) dimensional emotion recognition on the Affwild2\ndataset (with face and voice) and (2) pain estimation on the Biovid dataset\n(with face and biosensors) -- indicate that our JMT fusion can provide a\ncost-effective solution for MMER. Empirical results show that MMER systems with\nour proposed fusion allow us to outperform relevant baseline and\nstate-of-the-art methods.\n","authors":["Paul Waligora","Haseeb Aslam","Osama Zeeshan","Soufiane Belharbi","Alessandro Lameiras Koerich","Marco Pedersoli","Simon Bacon","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2403.10488v3.pdf","comment":"10 pages, 4 figures, 6 tables, CVPRw 2024"},{"id":"http://arxiv.org/abs/2404.05238v3","updated":"2024-04-20T16:15:53Z","published":"2024-04-08T07:09:15Z","title":"Allowing humans to interactively guide machines where to look does not\n always improve human-AI team's classification accuracy","summary":" Via thousands of papers in Explainable AI (XAI), attention maps\n\\cite{vaswani2017attention} and feature importance maps \\cite{bansal2020sam}\nhave been established as a common means for finding how important each input\nfeature is to an AI's decisions. 
It is an interesting, unexplored question\nwhether allowing users to edit the feature importance at test time would\nimprove a human-AI team's accuracy on downstream tasks. In this paper, we\naddress this question by leveraging CHM-Corr, a state-of-the-art, ante-hoc\nexplainable classifier \\cite{taesiri2022visual} that first predicts patch-wise\ncorrespondences between the input and training-set images, and then bases on\nthem to make classification decisions. We build CHM-Corr++, an interactive\ninterface for CHM-Corr, enabling users to edit the feature importance map\nprovided by CHM-Corr and observe updated model decisions. Via CHM-Corr++, users\ncan gain insights into if, when, and how the model changes its outputs,\nimproving their understanding beyond static explanations. However, our study\nwith 18 expert users who performed 1,400 decisions finds no statistical\nsignificance that our interactive approach improves user accuracy on CUB-200\nbird image classification over static explanations. This challenges the\nhypothesis that interactivity can boost human-AI team accuracy and raises needs\nfor future research. We open-source CHM-Corr++, an interactive tool for editing\nimage classifier attention (see an interactive demo here:\nhttp://137.184.82.109:7080/). We release code and data on github:\nhttps://github.com/anguyen8/chm-corr-interactive.\n","authors":["Giang Nguyen","Mohammad Reza Taesiri","Sunnie S. Y. Kim","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05238v3.pdf","comment":"Accepted for presentation at the XAI4CV Workshop, part of the CVPR\n 2024 proceedings"},{"id":"http://arxiv.org/abs/2404.13417v1","updated":"2024-04-20T16:11:47Z","published":"2024-04-20T16:11:47Z","title":"Efficient and Concise Explanations for Object Detection with\n Gaussian-Class Activation Mapping Explainer","summary":" To address the challenges of providing quick and plausible explanations in\nExplainable AI (XAI) for object detection models, we introduce the Gaussian\nClass Activation Mapping Explainer (G-CAME). Our method efficiently generates\nconcise saliency maps by utilizing activation maps from selected layers and\napplying a Gaussian kernel to emphasize critical image regions for the\npredicted object. Compared with other Region-based approaches, G-CAME\nsignificantly reduces explanation time to 0.5 seconds without compromising the\nquality. Our evaluation of G-CAME, using Faster-RCNN and YOLOX on the MS-COCO\n2017 dataset, demonstrates its ability to offer highly plausible and faithful\nexplanations, especially in reducing the bias on tiny object detection.\n","authors":["Quoc Khanh Nguyen","Truong Thanh Hung Nguyen","Vo Thanh Khang Nguyen","Van Binh Truong","Tuong Phan","Hung Cao"],"pdf_url":"https://arxiv.org/pdf/2404.13417v1.pdf","comment":"Canadian AI 2024"},{"id":"http://arxiv.org/abs/2401.09630v3","updated":"2024-04-20T16:10:04Z","published":"2024-01-17T22:44:18Z","title":"CT Liver Segmentation via PVT-based Encoding and Refined Decoding","summary":" Accurate liver segmentation from CT scans is essential for effective\ndiagnosis and treatment planning. Computer-aided diagnosis systems promise to\nimprove the precision of liver disease diagnosis, disease progression, and\ntreatment planning. In response to the need, we propose a novel deep learning\napproach, \\textit{\\textbf{PVTFormer}}, that is built upon a pretrained pyramid\nvision transformer (PVT v2) combined with advanced residual upsampling and\ndecoder block. 
By integrating a refined feature channel approach with a\nhierarchical decoding strategy, PVTFormer generates high quality segmentation\nmasks by enhancing semantic features. Rigorous evaluation of the proposed\nmethod on Liver Tumor Segmentation Benchmark (LiTS) 2017 demonstrates that our\nproposed architecture not only achieves a high dice coefficient of 86.78\\%,\nmIoU of 78.46\\%, but also obtains a low HD of 3.50. The results underscore\nPVTFormer's efficacy in setting a new benchmark for state-of-the-art liver\nsegmentation methods. The source code of the proposed PVTFormer is available at\n\\url{https://github.com/DebeshJha/PVTFormer}.\n","authors":["Debesh Jha","Nikhil Kumar Tomar","Koushik Biswas","Gorkem Durak","Alpay Medetalibeyoglu","Matthew Antalek","Yury Velichko","Daniela Ladner","Amir Borhani","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2401.09630v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17948v2","updated":"2024-04-20T15:29:00Z","published":"2023-11-29T05:28:05Z","title":"Action-slot: Visual Action-centric Representations for Multi-label\n Atomic Activity Recognition in Traffic Scenes","summary":" In this paper, we study multi-label atomic activity recognition. Despite the\nnotable progress in action recognition, it is still challenging to recognize\natomic activities due to a deficiency in a holistic understanding of both\nmultiple road users' motions and their contextual information. In this paper,\nwe introduce Action-slot, a slot attention-based approach that learns visual\naction-centric representations, capturing both motion and contextual\ninformation. Our key idea is to design action slots that are capable of paying\nattention to regions where atomic activities occur, without the need for\nexplicit perception guidance. To further enhance slot attention, we introduce a\nbackground slot that competes with action slots, aiding the training process in\navoiding unnecessary focus on background regions devoid of activities. Yet, the\nimbalanced class distribution in the existing dataset hampers the assessment of\nrare activities. To address the limitation, we collect a synthetic dataset\ncalled TACO, which is four times larger than OATS and features a balanced\ndistribution of atomic activities. To validate the effectiveness of our method,\nwe conduct comprehensive experiments and ablation studies against various\naction recognition baselines. We also show that the performance of multi-label\natomic activity recognition on real-world datasets can be improved by\npretraining representations on TACO. We will release our source code and\ndataset. See the videos of visualization on the project page:\nhttps://hcis-lab.github.io/Action-slot/\n","authors":["Chi-Hsi Kung","Shu-Wei Lu","Yi-Hsuan Tsai","Yi-Ting Chen"],"pdf_url":"https://arxiv.org/pdf/2311.17948v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13408v1","updated":"2024-04-20T15:23:15Z","published":"2024-04-20T15:23:15Z","title":"AMMUNet: Multi-Scale Attention Map Merging for Remote Sensing Image\n Segmentation","summary":" The advancement of deep learning has driven notable progress in remote\nsensing semantic segmentation. Attention mechanisms, while enabling global\nmodeling and utilizing contextual information, face challenges of high\ncomputational costs and require window-based operations that weaken capturing\nlong-range dependencies, hindering their effectiveness for remote sensing image\nprocessing. 
In this letter, we propose AMMUNet, a UNet-based framework that\nemploys multi-scale attention map merging, comprising two key innovations: the\ngranular multi-head self-attention (GMSA) module and the attention map merging\nmechanism (AMMM). GMSA efficiently acquires global information while\nsubstantially mitigating computational costs in contrast to global multi-head\nself-attention mechanism. This is accomplished through the strategic\nutilization of dimension correspondence to align granularity and the reduction\nof relative position bias parameters, thereby optimizing computational\nefficiency. The proposed AMMM effectively combines multi-scale attention maps\ninto a unified representation using a fixed mask template, enabling the\nmodeling of global attention mechanism. Experimental evaluations highlight the\nsuperior performance of our approach, achieving remarkable mean intersection\nover union (mIoU) scores of 75.48\\% on the challenging Vaihingen dataset and an\nexceptional 77.90\\% on the Potsdam dataset, demonstrating the superiority of\nour method in precise remote sensing semantic segmentation. Codes are available\nat https://github.com/interpretty/AMMUNet.\n","authors":["Yang Yang","Shunyi Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.13408v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17132v3","updated":"2024-04-20T15:20:14Z","published":"2023-11-28T18:03:27Z","title":"TransNeXt: Robust Foveal Visual Perception for Vision Transformers","summary":" Due to the depth degradation effect in residual connections, many efficient\nVision Transformers models that rely on stacking layers for information\nexchange often fail to form sufficient information mixing, leading to unnatural\nvisual perception. To address this issue, in this paper, we propose Aggregated\nAttention, a biomimetic design-based token mixer that simulates biological\nfoveal vision and continuous eye movement while enabling each token on the\nfeature map to have a global perception. Furthermore, we incorporate learnable\ntokens that interact with conventional queries and keys, which further\ndiversifies the generation of affinity matrices beyond merely relying on the\nsimilarity between queries and keys. Our approach does not rely on stacking for\ninformation exchange, thus effectively avoiding depth degradation and achieving\nnatural visual perception. Additionally, we propose Convolutional GLU, a\nchannel mixer that bridges the gap between GLU and SE mechanism, which empowers\neach token to have channel attention based on its nearest neighbor image\nfeatures, enhancing local modeling capability and model robustness. We combine\naggregated attention and convolutional GLU to create a new visual backbone\ncalled TransNeXt. Extensive experiments demonstrate that our TransNeXt achieves\nstate-of-the-art performance across multiple model sizes. At a resolution of\n$224^2$, TransNeXt-Tiny attains an ImageNet accuracy of 84.0%, surpassing\nConvNeXt-B with 69% fewer parameters. Our TransNeXt-Base achieves an ImageNet\naccuracy of 86.2% and an ImageNet-A accuracy of 61.6% at a resolution of\n$384^2$, a COCO object detection mAP of 57.1, and an ADE20K semantic\nsegmentation mIoU of 54.7.\n","authors":["Dai Shi"],"pdf_url":"https://arxiv.org/pdf/2311.17132v3.pdf","comment":"CVPR 2024 Camera-ready Version. 
Project Page:\n https://github.com/DaiShiResearch/TransNeXt"},{"id":"http://arxiv.org/abs/2404.09640v3","updated":"2024-04-20T15:18:03Z","published":"2024-04-15T10:19:39Z","title":"CREST: Cross-modal Resonance through Evidential Deep Learning for\n Enhanced Zero-Shot Learning","summary":" Zero-shot learning (ZSL) enables the recognition of novel classes by\nleveraging semantic knowledge transfer from known to unknown categories. This\nknowledge, typically encapsulated in attribute descriptions, aids in\nidentifying class-specific visual features, thus facilitating visual-semantic\nalignment and improving ZSL performance. However, real-world challenges such as\ndistribution imbalances and attribute co-occurrence among instances often\nhinder the discernment of local variances in images, a problem exacerbated by\nthe scarcity of fine-grained, region-specific attribute annotations. Moreover,\nthe variability in visual presentation within categories can also skew\nattribute-category associations. In response, we propose a bidirectional\ncross-modal ZSL approach CREST. It begins by extracting representations for\nattribute and visual localization and employs Evidential Deep Learning (EDL) to\nmeasure underlying epistemic uncertainty, thereby enhancing the model's\nresilience against hard negatives. CREST incorporates dual learning pathways,\nfocusing on both visual-category and attribute-category alignments, to ensure\nrobust correlation between latent and observable spaces. Moreover, we introduce\nan uncertainty-informed cross-modal fusion technique to refine visual-attribute\ninference. Extensive experiments demonstrate our model's effectiveness and\nunique explainability across multiple datasets. Our code and data are available\nat: https://github.com/JethroJames/CREST\n","authors":["Haojian Huang","Xiaozhen Qiao","Zhuo Chen","Haodong Chen","Bingyu Li","Zhe Sun","Mulin Chen","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.09640v3.pdf","comment":"Ongoing work; 10 pages, 2 Tables, 9 Figures; Repo is available at:\n https://github.com/JethroJames/CREST"},{"id":"http://arxiv.org/abs/2212.11152v2","updated":"2024-04-20T15:15:13Z","published":"2022-12-10T13:01:18Z","title":"OpenPack: A Large-scale Dataset for Recognizing Packaging Works in\n IoT-enabled Logistic Environments","summary":" Unlike human daily activities, existing publicly available sensor datasets\nfor work activity recognition in industrial domains are limited by difficulties\nin collecting realistic data as close collaboration with industrial sites is\nrequired. This also limits research on and development of methods for\nindustrial applications. To address these challenges and contribute to research\non machine recognition of work activities in industrial domains, in this study,\nwe introduce a new large-scale dataset for packaging work recognition called\nOpenPack. OpenPack contains 53.8 hours of multimodal sensor data, including\nacceleration data, keypoints, depth images, and readings from IoT-enabled\ndevices (e.g., handheld barcode scanners), collected from 16 distinct subjects\nwith different levels of packaging work experience. We apply state-of-the-art\nhuman activity recognition techniques to the dataset and provide future\ndirections of complex work activity recognition studies in the pervasive\ncomputing community based on the results. We believe that OpenPack will\ncontribute to the sensor-based action/activity recognition community by\nproviding challenging tasks. 
The OpenPack dataset is available at\nhttps://open-pack.github.io.\n","authors":["Naoya Yoshimura","Jaime Morales","Takuya Maekawa","Takahiro Hara"],"pdf_url":"https://arxiv.org/pdf/2212.11152v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13400v1","updated":"2024-04-20T14:57:31Z","published":"2024-04-20T14:57:31Z","title":"HiVG: Hierarchical Multimodal Fine-grained Modulation for Visual\n Grounding","summary":" Visual grounding, which aims to ground a visual region via natural language,\nis a task that heavily relies on cross-modal alignment. Existing works utilized\nuni-modal pre-trained models to transfer visual/linguistic knowledge separately\nwhile ignoring the multimodal corresponding information. Motivated by recent\nadvancements in contrastive language-image pre-training and low-rank adaptation\n(LoRA) methods, we aim to solve the grounding task based on multimodal\npre-training. However, there exists significant task gaps between pre-training\nand grounding. Therefore, to address these gaps, we propose a concise and\nefficient hierarchical multimodal fine-grained modulation framework, namely\nHiVG. Specifically, HiVG consists of a multi-layer adaptive cross-modal bridge\nand a hierarchical multimodal low-rank adaptation (Hi LoRA) paradigm. The\ncross-modal bridge can address the inconsistency between visual features and\nthose required for grounding, and establish a connection between multi-level\nvisual and text features. Hi LoRA prevents the accumulation of perceptual\nerrors by adapting the cross-modal features from shallow to deep layers in a\nhierarchical manner. Experimental results on five datasets demonstrate the\neffectiveness of our approach and showcase the significant grounding\ncapabilities as well as promising energy efficiency advantages. The project\npage: https://github.com/linhuixiao/HiVG.\n","authors":["Linhui Xiao","Xiaoshan Yang","Fang Peng","Yaowei Wang","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2404.13400v1.pdf","comment":"The project page: https://github.com/linhuixiao/HiVG"},{"id":"http://arxiv.org/abs/2404.02148v2","updated":"2024-04-20T14:45:54Z","published":"2024-04-02T17:58:03Z","title":"Diffusion$^2$: Dynamic 3D Content Generation via Score Composition of\n Orthogonal Diffusion Models","summary":" Recent advancements in 3D generation are predominantly propelled by\nimprovements in 3D-aware image diffusion models which are pretrained on\nInternet-scale image data and fine-tuned on massive 3D data, offering the\ncapability of producing highly consistent multi-view images. However, due to\nthe scarcity of synchronized multi-view video data, it is impractical to adapt\nthis paradigm to 4D generation directly. Despite that, the available video and\n3D data are adequate for training video and multi-view diffusion models that\ncan provide satisfactory dynamic and geometric priors respectively. In this\npaper, we present Diffusion$^2$, a novel framework for dynamic 3D content\ncreation that leverages the knowledge about geometric consistency and temporal\nsmoothness from these models to directly sample dense multi-view and\nmulti-frame images which can be employed to optimize continuous 4D\nrepresentation. Specifically, we design a simple yet effective denoising\nstrategy via score composition of video and multi-view diffusion models based\non the probability structure of the images to be generated. 
Owing to the high\nparallelism of the image generation and the efficiency of the modern 4D\nreconstruction pipeline, our framework can generate 4D content within few\nminutes. Furthermore, our method circumvents the reliance on 4D data, thereby\nhaving the potential to benefit from the scalability of the foundation video\nand multi-view diffusion models. Extensive experiments demonstrate the efficacy\nof our proposed framework and its capability to flexibly adapt to various types\nof prompts.\n","authors":["Zeyu Yang","Zijie Pan","Chun Gu","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.02148v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2304.02970v6","updated":"2024-04-20T14:16:48Z","published":"2023-04-06T09:54:06Z","title":"Unraveling Instance Associations: A Closer Look for Audio-Visual\n Segmentation","summary":" Audio-visual segmentation (AVS) is a challenging task that involves\naccurately segmenting sounding objects based on audio-visual cues. The\neffectiveness of audio-visual learning critically depends on achieving accurate\ncross-modal alignment between sound and visual objects. Successful audio-visual\nlearning requires two essential components: 1) a challenging dataset with\nhigh-quality pixel-level multi-class annotated images associated with audio\nfiles, and 2) a model that can establish strong links between audio information\nand its corresponding visual object. However, these requirements are only\npartially addressed by current methods, with training sets containing biased\naudio-visual data, and models that generalise poorly beyond this biased\ntraining set. In this work, we propose a new cost-effective strategy to build\nchallenging and relatively unbiased high-quality audio-visual segmentation\nbenchmarks. We also propose a new informative sample mining method for\naudio-visual supervised contrastive learning to leverage discriminative\ncontrastive samples to enforce cross-modal understanding. We show empirical\nresults that demonstrate the effectiveness of our benchmark. Furthermore,\nexperiments conducted on existing AVS datasets and on our new benchmark show\nthat our method achieves state-of-the-art (SOTA) segmentation accuracy.\n","authors":["Yuanhong Chen","Yuyuan Liu","Hu Wang","Fengbei Liu","Chong Wang","Helen Frazer","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2304.02970v6.pdf","comment":"Code is available at https://github.com/cyh-0/CAVP"},{"id":"http://arxiv.org/abs/2312.01431v3","updated":"2024-04-20T14:15:36Z","published":"2023-12-03T15:40:10Z","title":"D$^2$ST-Adapter: Disentangled-and-Deformable Spatio-Temporal Adapter for\n Few-shot Action Recognition","summary":" Adapting large pre-trained image models to few-shot action recognition has\nproven to be an effective and efficient strategy for learning robust feature\nextractors, which is essential for few-shot learning. Typical fine-tuning based\nadaptation paradigm is prone to overfitting in the few-shot learning scenarios\nand offers little modeling flexibility for learning temporal features in video\ndata. In this work we present the Disentangled-and-Deformable Spatio-Temporal\nAdapter (D$^2$ST-Adapter), which is a novel adapter tuning framework\nwell-suited for few-shot action recognition due to lightweight design and low\nparameter-learning overhead. It is designed in a dual-pathway architecture to\nencode spatial and temporal features in a disentangled manner. 
In particular,\nwe devise the anisotropic Deformable Spatio-Temporal Attention module as the\ncore component of D$^2$ST-Adapter, which can be tailored with anisotropic\nsampling densities along spatial and temporal domains to learn spatial and\ntemporal features specifically in corresponding pathways, allowing our\nD$^2$ST-Adapter to encode features in a global view in 3D spatio-temporal space\nwhile maintaining a lightweight design. Extensive experiments with\ninstantiations of our method on both pre-trained ResNet and ViT demonstrate the\nsuperiority of our method over state-of-the-art methods for few-shot action\nrecognition. Our method is particularly well-suited to challenging scenarios\nwhere temporal dynamics are critical for action recognition.\n","authors":["Wenjie Pei","Qizhong Tan","Guangming Lu","Jiandong Tian"],"pdf_url":"https://arxiv.org/pdf/2312.01431v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13386v1","updated":"2024-04-20T14:06:04Z","published":"2024-04-20T14:06:04Z","title":"SSVT: Self-Supervised Vision Transformer For Eye Disease Diagnosis Based\n On Fundus Images","summary":" Machine learning-based fundus image diagnosis technologies trigger worldwide\ninterest owing to their benefits such as reducing medical resource power and\nproviding objective evaluation results. However, current methods are commonly\nbased on supervised learning, bringing a heavy workload to biomedical staff and\nhence hindering the expansion of effective databases. To address this issue, in\nthis article, we established a label-free method, named 'SSVT', which can\nautomatically analyze unlabeled fundus images and achieve a high evaluation\naccuracy of 97.0% on four main eye diseases based on six public datasets and\ntwo datasets collected by Beijing Tongren Hospital. The promising results\nshowcase the effectiveness of the proposed unsupervised learning method, and\nthe strong application potential in biomedical resource shortage regions to\nimprove global eye health.\n","authors":["Jiaqi Wang","Mengtian Kang","Yong Liu","Chi Zhang","Ying Liu","Shiming Li","Yue Qi","Wenjun Xu","Chenyu Tang","Edoardo Occhipinti","Mayinuer Yusufu","Ningli Wang","Weiling Bai","Shuo Gao","Luigi G. Occhipinti"],"pdf_url":"https://arxiv.org/pdf/2404.13386v1.pdf","comment":"ISBI 2024"},{"id":"http://arxiv.org/abs/2401.13516v2","updated":"2024-04-20T13:56:32Z","published":"2024-01-24T15:14:05Z","title":"Delocate: Detection and Localization for Deepfake Videos with\n Randomly-Located Tampered Traces","summary":" Deepfake videos are becoming increasingly realistic, showing subtle tampering\ntraces on facial areas that vary between frames. Consequently, many existing\nDeepfake detection methods struggle to detect unknown domain Deepfake videos\nwhile accurately locating the tampered region. To address this limitation, we\npropose Delocate, a novel Deepfake detection model that can both recognize\nand localize unknown domain Deepfake videos. Our method consists of two stages\nnamed recovering and localization. In the recovering stage, the model randomly\nmasks regions of interest (ROIs) and reconstructs real faces without tampering\ntraces, resulting in a relatively good recovery effect for real faces and a poor\nrecovery effect for fake faces. In the localization stage, the output of the\nrecovery phase and the forgery ground truth mask serve as supervision to guide\nthe forgery localization process. 
This process strategically emphasizes the\nrecovery phase of fake faces with poor recovery, facilitating the localization\nof tampered regions. Our extensive experiments on four widely used benchmark\ndatasets demonstrate that Delocate not only excels in localizing tampered areas\nbut also enhances cross-domain detection performance.\n","authors":["Juan Hu","Xin Liao","Difei Gao","Satoshi Tsutsui","Qian Wang","Zheng Qin","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2401.13516v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2308.09921,\n arXiv:2305.05943"},{"id":"http://arxiv.org/abs/2404.13372v1","updated":"2024-04-20T13:19:08Z","published":"2024-04-20T13:19:08Z","title":"HybridFlow: Infusing Continuity into Masked Codebook for Extreme\n Low-Bitrate Image Compression","summary":" This paper investigates the challenging problem of learned image compression\n(LIC) with extremely low bitrates. Previous LIC methods based on transmitting\nquantized continuous features often yield blurry and noisy reconstruction due\nto the severe quantization loss, while previous LIC methods based on learned\ncodebooks that discretize visual space usually give poor-fidelity\nreconstruction due to the insufficient representation power of limited\ncodewords in capturing faithful details. We propose a novel dual-stream\nframework, HybridFlow, which combines the continuous-feature-based and\ncodebook-based streams to achieve both high perceptual quality and high\nfidelity under extremely low bitrates. The codebook-based stream benefits from\nthe high-quality learned codebook priors to provide high quality and clarity in\nreconstructed images. The continuous feature stream aims at maintaining\nfidelity details. To achieve an ultra-low bitrate, a masked token-based\ntransformer is further proposed, where we only transmit a masked portion of\ncodeword indices and recover the missing indices through token generation\nguided by information from the continuous feature stream. We also develop a\nbridging correction network to merge the two streams in pixel decoding for\nfinal image reconstruction, where the continuous stream features rectify biases\nof the codebook-based pixel decoder to restore fidelity details.\nExperimental results demonstrate superior performance across several datasets\nunder extremely low bitrates, compared with existing single-stream\ncodebook-based or continuous-feature-based LIC methods.\n","authors":["Lei Lu","Yanyue Xie","Wei Jiang","Wei Wang","Xue Lin","Yanzhi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.13372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13370v1","updated":"2024-04-20T13:15:27Z","published":"2024-04-20T13:15:27Z","title":"Movie101v2: Improved Movie Narration Benchmark","summary":" Automatic movie narration aims at creating video-aligned plot descriptions\nto assist visually impaired audiences. It differs from standard video\ncaptioning in that it requires not only describing key visual details but also\ninferring the plots developed across multiple movie shots, thus posing unique\nand ongoing challenges. To advance the development of automatic movie narrating\nsystems, we first revisit the limitations of existing datasets and develop a\nlarge-scale, bilingual movie narration dataset, Movie101v2. 
Second, taking into\naccount the essential difficulties in achieving applicable movie narration, we\nbreak the long-term goal into three progressive stages and tentatively focus on\nthe initial stages featuring understanding within individual clips. We also\nintroduce a new narration assessment to align with our staged task goals.\nThird, using our new dataset, we baseline several leading large vision-language\nmodels, including GPT-4V, and conduct in-depth investigations into the\nchallenges current models face for movie narration generation. Our findings\nreveal that achieving applicable movie narration generation is a fascinating\ngoal that requires thorough research.\n","authors":["Zihao Yue","Yepeng Zhang","Ziheng Wang","Qin Jin"],"pdf_url":"https://arxiv.org/pdf/2404.13370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06214v3","updated":"2024-04-20T13:15:21Z","published":"2023-10-10T00:07:25Z","title":"CoT3DRef: Chain-of-Thoughts Data-Efficient 3D Visual Grounding","summary":" 3D visual grounding is the ability to localize objects in 3D scenes\nconditioned on utterances. Most existing methods devote the referring head to\nlocalize the referred object directly, causing failure in complex scenarios. In\naddition, they do not illustrate how and why the network reaches the final\ndecision. In this paper, we address the question: Can we design an\ninterpretable 3D visual grounding framework that has the potential to mimic the\nhuman perception system? To this end, we formulate the 3D visual grounding\nproblem as a sequence-to-sequence (Seq2Seq) task by first predicting a chain of\nanchors and then the final target. Interpretability not only improves the\noverall performance but also helps us identify failure cases. Following the\nchain-of-thoughts approach enables us to decompose the referring task into\ninterpretable intermediate steps, boosting the performance and making our\nframework extremely data-efficient. Moreover, our proposed framework can be\neasily integrated into any existing architecture. We validate our approach\nthrough comprehensive experiments on the Nr3D, Sr3D, and Scanrefer benchmarks\nand show consistent performance gains compared to existing methods without\nrequiring manually annotated data. Furthermore, our proposed framework, dubbed\nCoT3DRef, is significantly data-efficient: on the Sr3D dataset, when trained on\nonly 10% of the data, we match the SOTA performance obtained by training on the\nentire data. The code is available at\nhttps://eslambakr.github.io/cot3dref.github.io/.\n","authors":["Eslam Mohamed Bakr","Mohamed Ayman","Mahmoud Ahmed","Habib Slim","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2310.06214v3.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2402.11677v3","updated":"2024-04-20T13:00:25Z","published":"2024-02-18T18:56:13Z","title":"MultiCorrupt: A Multi-Modal Robustness Dataset and Benchmark of\n LiDAR-Camera Fusion for 3D Object Detection","summary":" Multi-modal 3D object detection models for automated driving have\ndemonstrated exceptional performance on computer vision benchmarks like\nnuScenes. However, their reliance on densely sampled LiDAR point clouds and\nmeticulously calibrated sensor arrays poses challenges for real-world\napplications. Issues such as sensor misalignment, miscalibration, and disparate\nsampling frequencies lead to spatial and temporal misalignment in data from\nLiDAR and cameras. 
Additionally, the integrity of LiDAR and camera data is\noften compromised by adverse environmental conditions such as inclement\nweather, leading to occlusions and noise interference. To address this\nchallenge, we introduce MultiCorrupt, a comprehensive benchmark designed to\nevaluate the robustness of multi-modal 3D object detectors against ten distinct\ntypes of corruptions. We evaluate five state-of-the-art multi-modal detectors\non MultiCorrupt and analyze their performance in terms of their resistance\nability. Our results show that existing methods exhibit varying degrees of\nrobustness depending on the type of corruption and their fusion strategy. We\nprovide insights into which multi-modal design choices make such models robust\nagainst certain perturbations. The dataset generation code and benchmark are\nopen-sourced at https://github.com/ika-rwth-aachen/MultiCorrupt.\n","authors":["Till Beemelmanns","Quan Zhang","Christian Geller","Lutz Eckstein"],"pdf_url":"https://arxiv.org/pdf/2402.11677v3.pdf","comment":"Code: https://github.com/ika-rwth-aachen/MultiCorrupt"},{"id":"http://arxiv.org/abs/2307.06737v2","updated":"2024-04-20T11:53:13Z","published":"2023-07-13T13:17:50Z","title":"Improving 2D Human Pose Estimation in Rare Camera Views with Synthetic\n Data","summary":" Methods and datasets for human pose estimation focus predominantly on side-\nand front-view scenarios. We overcome the limitation by leveraging synthetic\ndata and introduce RePoGen (RarE POses GENerator), an SMPL-based method for\ngenerating synthetic humans with comprehensive control over pose and view.\nExperiments on top-view datasets and a new dataset of real images with diverse\nposes show that adding the RePoGen data to the COCO dataset outperforms\nprevious approaches to top- and bottom-view pose estimation without harming\nperformance on common views. An ablation study shows that anatomical\nplausibility, a property prior research focused on, is not a prerequisite for\neffective performance. The introduced dataset and the corresponding code are\navailable on https://mirapurkrabek.github.io/RePoGen-paper/ .\n","authors":["Miroslav Purkrabek","Jiri Matas"],"pdf_url":"https://arxiv.org/pdf/2307.06737v2.pdf","comment":"https://mirapurkrabek.github.io/RePoGen-paper/"},{"id":"http://arxiv.org/abs/2404.13353v1","updated":"2024-04-20T11:28:14Z","published":"2024-04-20T11:28:14Z","title":"Generating Daylight-driven Architectural Design via Diffusion Models","summary":" In recent years, the rapid development of large-scale models has made new\npossibilities for interdisciplinary fields such as architecture. In this paper,\nwe present a novel daylight-driven AI-aided architectural design method.\nFirstly, we formulate a method for generating massing models, producing\narchitectural massing models using random parameters quickly. Subsequently, we\nintegrate a daylight-driven facade design strategy, accurately determining\nwindow layouts and applying them to the massing models. Finally, we seamlessly\ncombine a large-scale language model with a text-to-image model, enhancing the\nefficiency of generating visual architectural design renderings. 
Experimental\nresults demonstrate that our approach supports architects' creative\ninspirations and pioneers novel avenues for architectural design development.\nProject page: https://zrealli.github.io/DDADesign/.\n","authors":["Pengzhi Li","Baijuan Li"],"pdf_url":"https://arxiv.org/pdf/2404.13353v1.pdf","comment":"Project page: https://zrealli.github.io/DDADesign/"},{"id":"http://arxiv.org/abs/2404.13342v1","updated":"2024-04-20T10:40:12Z","published":"2024-04-20T10:40:12Z","title":"Hyperspectral Anomaly Detection with Self-Supervised Anomaly Prior","summary":" The majority of existing hyperspectral anomaly detection (HAD) methods use\nthe low-rank representation (LRR) model to separate the background and anomaly\ncomponents, where the anomaly component is optimized by handcrafted sparse\npriors (e.g., $\\ell_{2,1}$-norm). However, this may not be ideal since such\npriors overlook the spatial structure present in anomalies and make the\ndetection result largely dependent on manually set sparsity. To tackle these\nproblems, we redefine the optimization criterion for the anomaly component in\nthe LRR model with a self-supervised network called self-supervised anomaly\nprior (SAP). This prior is obtained by the pretext task of self-supervised\nlearning, which is customized to learn the characteristics of hyperspectral\nanomalies. Specifically, this pretext task is a classification task to\ndistinguish the original hyperspectral image (HSI) and the pseudo-anomaly HSI,\nwhere the pseudo-anomaly is generated from the original HSI and designed as a\nprism with arbitrary polygon bases and arbitrary spectral bands. In addition, a\ndual-purified strategy is proposed to provide a more refined background\nrepresentation with an enriched background dictionary, facilitating the\nseparation of anomalies from complex backgrounds. Extensive experiments on\nvarious hyperspectral datasets demonstrate that the proposed SAP offers a more\naccurate and interpretable solution than other advanced HAD methods.\n","authors":["Yidan Liu","Weiying Xie","Kai Jiang","Jiaqing Zhang","Yunsong Li","Leyuan Fang"],"pdf_url":"https://arxiv.org/pdf/2404.13342v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13330v1","updated":"2024-04-20T09:27:05Z","published":"2024-04-20T09:27:05Z","title":"SEGSRNet for Stereo-Endoscopic Image Super-Resolution and Surgical\n Instrument Segmentation","summary":" SEGSRNet addresses the challenge of precisely identifying surgical\ninstruments in low-resolution stereo endoscopic images, a common issue in\nmedical imaging and robotic surgery. Our innovative framework enhances image\nclarity and segmentation accuracy by applying state-of-the-art super-resolution\ntechniques before segmentation. This ensures higher-quality inputs for more\nprecise segmentation. SEGSRNet combines advanced feature extraction and\nattention mechanisms with spatial processing to sharpen image details, which is\nsignificant for accurate tool identification in medical images. Our proposed\nmodel outperforms current models on metrics including Dice, IoU, PSNR, and\nSSIM, producing clearer and more accurate images for stereo endoscopic\nsurgical imaging. 
SEGSRNet can provide image resolution and precise\nsegmentation which can significantly enhance surgical accuracy and patient care\noutcomes.\n","authors":["Mansoor Hayat","Supavadee Aramvith","Titipat Achakulvisut"],"pdf_url":"https://arxiv.org/pdf/2404.13330v1.pdf","comment":"Paper accepted for Presentation in 46th Annual International\n Conference of the IEEE Engineering in Medicine and Biology Society (EMBS),\n Orlando, Florida, USA"},{"id":"http://arxiv.org/abs/2403.17881v3","updated":"2024-04-20T09:06:02Z","published":"2024-03-26T17:12:34Z","title":"Deepfake Generation and Detection: A Benchmark and Survey","summary":" Deepfake is a technology dedicated to creating highly realistic facial images\nand videos under specific conditions, which has significant application\npotential in fields such as entertainment, movie production, digital human\ncreation, to name a few. With the advancements in deep learning, techniques\nprimarily represented by Variational Autoencoders and Generative Adversarial\nNetworks have achieved impressive generation results. More recently, the\nemergence of diffusion models with powerful generation capabilities has sparked\na renewed wave of research. In addition to deepfake generation, corresponding\ndetection technologies continuously evolve to regulate the potential misuse of\ndeepfakes, such as for privacy invasion and phishing attacks. This survey\ncomprehensively reviews the latest developments in deepfake generation and\ndetection, summarizing and analyzing current state-of-the-arts in this rapidly\nevolving field. We first unify task definitions, comprehensively introduce\ndatasets and metrics, and discuss developing technologies. Then, we discuss the\ndevelopment of several related sub-fields and focus on researching four\nrepresentative deepfake fields: face swapping, face reenactment, talking face\ngeneration, and facial attribute editing, as well as forgery detection.\nSubsequently, we comprehensively benchmark representative methods on popular\ndatasets for each field, fully evaluating the latest and influential published\nworks. Finally, we analyze challenges and future research directions of the\ndiscussed fields.\n","authors":["Gan Pei","Jiangning Zhang","Menghan Hu","Zhenyu Zhang","Chengjie Wang","Yunsheng Wu","Guangtao Zhai","Jian Yang","Chunhua Shen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.17881v3.pdf","comment":"We closely follow the latest developments in\n https://github.com/flyingby/Awesome-Deepfake-Generation-and-Detection"},{"id":"http://arxiv.org/abs/2404.13324v1","updated":"2024-04-20T08:48:37Z","published":"2024-04-20T08:48:37Z","title":"Collaborative Visual Place Recognition through Federated Learning","summary":" Visual Place Recognition (VPR) aims to estimate the location of an image by\ntreating it as a retrieval problem. VPR uses a database of geo-tagged images\nand leverages deep neural networks to extract a global representation, called\ndescriptor, from each image. While the training data for VPR models often\noriginates from diverse, geographically scattered sources (geo-tagged images),\nthe training process itself is typically assumed to be centralized. This\nresearch revisits the task of VPR through the lens of Federated Learning (FL),\naddressing several key challenges associated with this adaptation. VPR data\ninherently lacks well-defined classes, and models are typically trained using\ncontrastive learning, which necessitates a data mining step on a centralized\ndatabase. 
Additionally, client devices in federated systems can be highly\nheterogeneous in terms of their processing capabilities. The proposed FedVPR\nframework not only presents a novel approach for VPR but also introduces a new,\nchallenging, and realistic task for FL research, paving the way to other image\nretrieval tasks in FL.\n","authors":["Mattia Dutto","Gabriele Berton","Debora Caldarola","Eros Fanì","Gabriele Trivigno","Carlo Masone"],"pdf_url":"https://arxiv.org/pdf/2404.13324v1.pdf","comment":"13 pages, 7 figures, CVPR - The 3rd International Workshop on\n Federated Learning for Computer Vision (FedVision-2024)"},{"id":"http://arxiv.org/abs/2404.13320v1","updated":"2024-04-20T08:28:43Z","published":"2024-04-20T08:28:43Z","title":"Pixel is a Barrier: Diffusion Models Are More Adversarially Robust Than\n We Think","summary":" Adversarial examples for diffusion models are widely used as solutions for\nsafety concerns. By adding adversarial perturbations to personal images,\nattackers can not edit or imitate them easily. However, it is essential to note\nthat all these protections target the latent diffusion model (LDMs), the\nadversarial examples for diffusion models in the pixel space (PDMs) are largely\noverlooked. This may mislead us to think that the diffusion models are\nvulnerable to adversarial attacks like most deep models. In this paper, we show\nnovel findings that: even though gradient-based white-box attacks can be used\nto attack the LDMs, they fail to attack PDMs. This finding is supported by\nextensive experiments of almost a wide range of attacking methods on various\nPDMs and LDMs with different model structures, which means diffusion models are\nindeed much more robust against adversarial attacks. We also find that PDMs can\nbe used as an off-the-shelf purifier to effectively remove the adversarial\npatterns that were generated on LDMs to protect the images, which means that\nmost protection methods nowadays, to some extent, cannot protect our images\nfrom malicious attacks. We hope that our insights will inspire the community to\nrethink the adversarial samples for diffusion models as protection methods and\nmove forward to more effective protection. Codes are available in\nhttps://github.com/xavihart/PDM-Pure.\n","authors":["Haotian Xue","Yongxin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.13320v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13311v1","updated":"2024-04-20T07:56:21Z","published":"2024-04-20T07:56:21Z","title":"STAT: Towards Generalizable Temporal Action Localization","summary":" Weakly-supervised temporal action localization (WTAL) aims to recognize and\nlocalize action instances with only video-level labels. Despite the significant\nprogress, existing methods suffer from severe performance degradation when\ntransferring to different distributions and thus may hardly adapt to real-world\nscenarios . To address this problem, we propose the Generalizable Temporal\nAction Localization task (GTAL), which focuses on improving the\ngeneralizability of action localization methods. We observed that the\nperformance decline can be primarily attributed to the lack of generalizability\nto different action scales. To address this problem, we propose STAT\n(Self-supervised Temporal Adaptive Teacher), which leverages a teacher-student\nstructure for iterative refinement. Our STAT features a refinement module and\nan alignment module. The former iteratively refines the model's output by\nleveraging contextual information and helps adapt to the target scale. 
The\nlatter improves the refinement process by promoting a consensus between student\nand teacher models. We conduct extensive experiments on three datasets,\nTHUMOS14, ActivityNet1.2, and HACS, and the results show that our method\nsignificantly improves the baseline methods under the cross-distribution\nevaluation setting, even approaching the same-distribution evaluation\nperformance.\n","authors":["Yangcen Liu","Ziyi Liu","Yuanhao Zhai","Wen Li","David Doerman","Junsong Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.13311v1.pdf","comment":"14 pages, LaTeX;"},{"id":"http://arxiv.org/abs/2404.04848v2","updated":"2024-04-20T07:54:18Z","published":"2024-04-07T07:42:04Z","title":"Task-Aware Encoder Control for Deep Video Compression","summary":" Prior research on deep video compression (DVC) for machine tasks typically\nnecessitates training a unique codec for each specific task, mandating a\ndedicated decoder per task. In contrast, traditional video codecs employ a\nflexible encoder controller, enabling the adaptation of a single codec to\ndifferent tasks through mechanisms like mode prediction. Drawing inspiration\nfrom this, we introduce an innovative encoder controller for deep video\ncompression for machines. This controller features a mode prediction and a\nGroup of Pictures (GoP) selection module. Our approach centralizes control at\nthe encoding stage, allowing for adaptable encoder adjustments across different\ntasks, such as detection and tracking, while maintaining compatibility with a\nstandard pre-trained DVC decoder. Empirical evidence demonstrates that our\nmethod is applicable across multiple tasks with various existing pre-trained\nDVCs. Moreover, extensive experiments demonstrate that our method outperforms\nprevious DVC methods by about 25% in bitrate for different tasks, with only one\npre-trained decoder.\n","authors":["Xingtong Ge","Jixiang Luo","Xinjie Zhang","Tongda Xu","Guo Lu","Dailan He","Jing Geng","Yan Wang","Jun Zhang","Hongwei Qin"],"pdf_url":"https://arxiv.org/pdf/2404.04848v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01643v2","updated":"2024-04-20T07:41:32Z","published":"2024-04-02T05:19:27Z","title":"A Closer Look at Spatial-Slice Features Learning for COVID-19 Detection","summary":" Conventional Computed Tomography (CT) imaging recognition faces two\nsignificant challenges: (1) There is often considerable variability in the\nresolution and size of each CT scan, necessitating strict requirements for the\ninput size and adaptability of models. (2) A CT scan contains a large number of\nout-of-distribution (OOD) slices. The crucial features may only be present in\nspecific spatial regions and slices of the entire CT scan. How can we\neffectively figure out where these are located? To deal with this, we introduce\nan enhanced Spatial-Slice Feature Learning (SSFL++) framework specifically\ndesigned for CT scans. It aims to filter out OOD data within the whole CT scan,\nenabling us to select crucial spatial slices for analysis by reducing\nredundancy by 70% in total. Meanwhile, we propose a Kernel-Density-based slice\nSampling (KDS) method to improve stability during the training and inference\nstages, thereby speeding up convergence and boosting performance. As a\nresult, the experiments demonstrate the promising performance of our approach\nusing a simple EfficientNet-2D (E2D) model, even with only 1% of the training\ndata. 
The efficacy of our approach has been validated on the COVID-19-CT-DB\ndatasets provided by the DEF-AI-MIA workshop, in conjunction with CVPR 2024.\nOur source code is available at https://github.com/ming053l/E2D\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Yang Fan Chiang","Yi-Shiuan Chou","Chih-Yu Jiang","Shen-Chieh Tai","Chi-Han Tsai"],"pdf_url":"https://arxiv.org/pdf/2404.01643v2.pdf","comment":"Camera-ready version, accepted by DEF-AI-MIA workshop, in conjunted\n with CVPR2024"},{"id":"http://arxiv.org/abs/2404.13306v1","updated":"2024-04-20T07:28:55Z","published":"2024-04-20T07:28:55Z","title":"FakeBench: Uncover the Achilles' Heels of Fake Images with Large\n Multimodal Models","summary":" Recently, fake images generated by artificial intelligence (AI) models have\nbecome indistinguishable from the real, exerting new challenges for fake image\ndetection models. To this extent, simple binary judgments of real or fake seem\nless convincing and credible due to the absence of human-understandable\nexplanations. Fortunately, Large Multimodal Models (LMMs) bring possibilities\nto materialize the judgment process while their performance remains\nundetermined. Therefore, we propose FakeBench, the first-of-a-kind benchmark\ntowards transparent defake, consisting of fake images with human language\ndescriptions on forgery signs. FakeBench gropes for two open questions of LMMs:\n(1) can LMMs distinguish fake images generated by AI, and (2) how do LMMs\ndistinguish fake images? In specific, we construct the FakeClass dataset with\n6k diverse-sourced fake and real images, each equipped with a Question&Answer\npair concerning the authenticity of images, which are utilized to benchmark the\ndetection ability. To examine the reasoning and interpretation abilities of\nLMMs, we present the FakeClue dataset, consisting of 15k pieces of descriptions\non the telltale clues revealing the falsification of fake images. Besides, we\nconstruct the FakeQA to measure the LMMs' open-question answering ability on\nfine-grained authenticity-relevant aspects. Our experimental results discover\nthat current LMMs possess moderate identification ability, preliminary\ninterpretation and reasoning ability, and passable open-question answering\nability for image defake. The FakeBench will be made publicly available soon.\n","authors":["Yixuan Li","Xuelin Liu","Xiaoyang Wang","Shiqi Wang","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2404.13306v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13299v1","updated":"2024-04-20T07:05:45Z","published":"2024-04-20T07:05:45Z","title":"PCQA: A Strong Baseline for AIGC Quality Assessment Based on Prompt\n Condition","summary":" The development of Large Language Models (LLM) and Diffusion Models brings\nthe boom of Artificial Intelligence Generated Content (AIGC). It is essential\nto build an effective quality assessment framework to provide a quantifiable\nevaluation of different images or videos based on the AIGC technologies. The\ncontent generated by AIGC methods is driven by the crafted prompts. Therefore,\nit is intuitive that the prompts can also serve as the foundation of the AIGC\nquality assessment. This study proposes an effective AIGC quality assessment\n(QA) framework. First, we propose a hybrid prompt encoding method based on a\ndual-source CLIP (Contrastive Language-Image Pre-Training) text encoder to\nunderstand and respond to the prompt conditions. 
Second, we propose an\nensemble-based feature mixer module to effectively blend the adapted prompt and\nvision features. The empirical study practices in two datasets: AIGIQA-20K\n(AI-Generated Image Quality Assessment database) and T2VQA-DB (Text-to-Video\nQuality Assessment DataBase), which validates the effectiveness of our proposed\nmethod: Prompt Condition Quality Assessment (PCQA). Our proposed simple and\nfeasible framework may promote research development in the multimodal\ngeneration field.\n","authors":["Xi Fang","Weigang Wang","Xiaoxin Lv","Jun Yan"],"pdf_url":"https://arxiv.org/pdf/2404.13299v1.pdf","comment":"Published in CVPR-2024's NTIRE: New Trends in Image Restoration and\n Enhancement workshop and challenges"},{"id":"http://arxiv.org/abs/2404.13288v1","updated":"2024-04-20T06:25:32Z","published":"2024-04-20T06:25:32Z","title":"PoseINN: Realtime Visual-based Pose Regression and Localization with\n Invertible Neural Networks","summary":" Estimating ego-pose from cameras is an important problem in robotics with\napplications ranging from mobile robotics to augmented reality. While SOTA\nmodels are becoming increasingly accurate, they can still be unwieldy due to\nhigh computational costs. In this paper, we propose to solve the problem by\nusing invertible neural networks (INN) to find the mapping between the latent\nspace of images and poses for a given scene. Our model achieves similar\nperformance to the SOTA while being faster to train and only requiring offline\nrendering of low-resolution synthetic data. By using normalizing flows, the\nproposed method also provides uncertainty estimation for the output. We also\ndemonstrated the efficiency of this method by deploying the model on a mobile\nrobot.\n","authors":["Zirui Zang","Ahmad Amine","Rahul Mangharam"],"pdf_url":"https://arxiv.org/pdf/2404.13288v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05060v2","updated":"2024-04-20T06:14:34Z","published":"2023-04-11T08:43:52Z","title":"SPIRiT-Diffusion: Self-Consistency Driven Diffusion Model for\n Accelerated MRI","summary":" Diffusion models have emerged as a leading methodology for image generation\nand have proven successful in the realm of magnetic resonance imaging (MRI)\nreconstruction. However, existing reconstruction methods based on diffusion\nmodels are primarily formulated in the image domain, making the reconstruction\nquality susceptible to inaccuracies in coil sensitivity maps (CSMs). k-space\ninterpolation methods can effectively address this issue but conventional\ndiffusion models are not readily applicable in k-space interpolation. To\novercome this challenge, we introduce a novel approach called SPIRiT-Diffusion,\nwhich is a diffusion model for k-space interpolation inspired by the iterative\nself-consistent SPIRiT method. Specifically, we utilize the iterative solver of\nthe self-consistent term (i.e., k-space physical prior) in SPIRiT to formulate\na novel stochastic differential equation (SDE) governing the diffusion process.\nSubsequently, k-space data can be interpolated by executing the diffusion\nprocess. This innovative approach highlights the optimization model's role in\ndesigning the SDE in diffusion models, enabling the diffusion process to align\nclosely with the physics inherent in the optimization model, a concept referred\nto as model-driven diffusion. We evaluated the proposed SPIRiT-Diffusion method\nusing a 3D joint intracranial and carotid vessel wall imaging dataset. 
The\nresults convincingly demonstrate its superiority over image-domain\nreconstruction methods, achieving high reconstruction quality even at a\nsubstantial acceleration rate of 10.\n","authors":["Zhuo-Xu Cui","Chentao Cao","Yue Wang","Sen Jia","Jing Cheng","Xin Liu","Hairong Zheng","Dong Liang","Yanjie Zhu"],"pdf_url":"https://arxiv.org/pdf/2304.05060v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13282v1","updated":"2024-04-20T06:01:09Z","published":"2024-04-20T06:01:09Z","title":"Wills Aligner: A Robust Multi-Subject Brain Representation Learner","summary":" Decoding visual information from human brain activity has seen remarkable\nadvancements in recent research. However, due to the significant variability in\ncortical parcellation and cognition patterns across subjects, current\napproaches personalized deep models for each subject, constraining the\npracticality of this technology in real-world contexts. To tackle the\nchallenges, we introduce Wills Aligner, a robust multi-subject brain\nrepresentation learner. Our Wills Aligner initially aligns different subjects'\nbrains at the anatomical level. Subsequently, it incorporates a mixture of\nbrain experts to learn individual cognition patterns. Additionally, it\ndecouples the multi-subject learning task into a two-stage training, propelling\nthe deep model and its plugin network to learn inter-subject commonality\nknowledge and various cognition patterns, respectively. Wills Aligner enables\nus to overcome anatomical differences and to efficiently leverage a single\nmodel for multi-subject brain representation learning. We meticulously evaluate\nthe performance of our approach across coarse-grained and fine-grained visual\ndecoding tasks. The experimental results demonstrate that our Wills Aligner\nachieves state-of-the-art performance.\n","authors":["Guangyin Bao","Zixuan Gong","Qi Zhang","Jialei Zhou","Wei Fan","Kun Yi","Usman Naseem","Liang Hu","Duoqian Miao"],"pdf_url":"https://arxiv.org/pdf/2404.13282v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2404.13277v1","updated":"2024-04-20T05:24:06Z","published":"2024-04-20T05:24:06Z","title":"Beyond Score Changes: Adversarial Attack on No-Reference Image Quality\n Assessment from Two Perspectives","summary":" Deep neural networks have demonstrated impressive success in No-Reference\nImage Quality Assessment (NR-IQA). However, recent researches highlight the\nvulnerability of NR-IQA models to subtle adversarial perturbations, leading to\ninconsistencies between model predictions and subjective ratings. Current\nadversarial attacks, however, focus on perturbing predicted scores of\nindividual images, neglecting the crucial aspect of inter-score correlation\nrelationships within an entire image set. Meanwhile, it is important to note\nthat the correlation, like ranking correlation, plays a significant role in\nNR-IQA tasks. To comprehensively explore the robustness of NR-IQA models, we\nintroduce a new framework of correlation-error-based attacks that perturb both\nthe correlation within an image set and score changes on individual images. Our\nresearch primarily focuses on ranking-related correlation metrics like\nSpearman's Rank-Order Correlation Coefficient (SROCC) and prediction\nerror-related metrics like Mean Squared Error (MSE). As an instantiation, we\npropose a practical two-stage SROCC-MSE-Attack (SMA) that initially optimizes\ntarget attack scores for the entire image set and then generates adversarial\nexamples guided by these scores. 
Experimental results demonstrate that our SMA\nmethod not only significantly disrupts the SROCC to negative values but also\nmaintains a considerable change in the scores of individual images. Meanwhile,\nit exhibits state-of-the-art performance across metrics with different\ncategories. Our method provides a new perspective on the robustness of NR-IQA\nmodels.\n","authors":["Chenxi Yang","Yujia Liu","Dingquan Li","Yan Zhong","Tingting Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.13277v1.pdf","comment":"Submitted to a conference"},{"id":"http://arxiv.org/abs/2404.13273v1","updated":"2024-04-20T05:13:56Z","published":"2024-04-20T05:13:56Z","title":"Multi-feature Reconstruction Network using Crossed-mask Restoration for\n Unsupervised Anomaly Detection","summary":" Unsupervised anomaly detection using only normal samples is of great\nsignificance for quality inspection in industrial manufacturing. Although\nexisting reconstruction-based methods have achieved promising results, they\nstill face two problems: poor distinguishable information in image\nreconstruction and well abnormal regeneration caused by model\nover-generalization ability. To overcome the above issues, we convert the image\nreconstruction into a combination of parallel feature restorations and propose\na multi-feature reconstruction network, MFRNet, using crossed-mask restoration\nin this paper. Specifically, a multi-scale feature aggregator is first\ndeveloped to generate more discriminative hierarchical representations of the\ninput images from a pre-trained model. Subsequently, a crossed-mask generator\nis adopted to randomly cover the extracted feature map, followed by a\nrestoration network based on the transformer structure for high-quality repair\nof the missing regions. Finally, a hybrid loss is equipped to guide model\ntraining and anomaly estimation, which gives consideration to both the pixel\nand structural similarity. Extensive experiments show that our method is highly\ncompetitive with or significantly outperforms other state-of-the-arts on four\npublic available datasets and one self-made dataset.\n","authors":["Junpu Wang","Guili Xu","Chunlei Li","Guangshuai Gao","Yuehua Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.13273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13270v1","updated":"2024-04-20T04:51:59Z","published":"2024-04-20T04:51:59Z","title":"StrideNET: Swin Transformer for Terrain Recognition with Dynamic\n Roughness Extraction","summary":" Advancements in deep learning are revolutionizing the classification of\nremote-sensing images. Transformer-based architectures, utilizing\nself-attention mechanisms, have emerged as alternatives to conventional\nconvolution methods, enabling the capture of long-range dependencies along with\nglobal relationships in the image. Motivated by these advancements, this paper\npresents StrideNET, a novel dual-branch architecture designed for terrain\nrecognition and implicit properties estimation. The terrain recognition branch\nutilizes the Swin Transformer, leveraging its hierarchical representation and\nlow computational cost to efficiently capture both local and global features.\nThe terrain properties branch focuses on the extraction of surface properties\nsuch as roughness and slipperiness using a statistical texture analysis method.\nBy computing surface terrain properties, an enhanced environmental perception\ncan be obtained. The StrideNET model is trained on a dataset comprising four\ntarget terrain classes: Grassy, Marshy, Sandy, and Rocky. 
StrideNET attains\ncompetitive performance compared to contemporary methods. The implications of\nthis work extend to various applications, including environmental monitoring,\nland use and land cover (LULC) classification, disaster response, precision\nagriculture, and much more.\n","authors":["Maitreya Shelare","Neha Shigvan","Atharva Satam","Poonam Sonar"],"pdf_url":"https://arxiv.org/pdf/2404.13270v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01123v2","updated":"2024-04-20T04:38:35Z","published":"2024-02-02T03:50:45Z","title":"A Single Simple Patch is All You Need for AI-generated Image Detection","summary":" The recent development of generative models unleashes the potential of\ngenerating hyper-realistic fake images. To prevent the malicious usage of fake\nimages, AI-generated image detection aims to distinguish fake images from real\nimages. However, existing method suffer from severe performance drop when\ndetecting images generated by unseen generators. We find that generative models\ntend to focus on generating the patches with rich textures to make the images\nmore realistic while neglecting the hidden noise caused by camera capture\npresent in simple patches. In this paper, we propose to exploit the noise\npattern of a single simple patch to identify fake images. Furthermore, due to\nthe performance decline when handling low-quality generated images, we\nintroduce an enhancement module and a perception module to remove the\ninterfering information. Extensive experiments demonstrate that our method can\nachieve state-of-the-art performance on public benchmarks.\n","authors":["Jiaxuan Chen","Jieteng Yao","Li Niu"],"pdf_url":"https://arxiv.org/pdf/2402.01123v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12216v2","updated":"2024-04-20T04:33:08Z","published":"2024-04-18T14:20:30Z","title":"ProTA: Probabilistic Token Aggregation for Text-Video Retrieval","summary":" Text-video retrieval aims to find the most relevant cross-modal samples for a\ngiven query. Recent methods focus on modeling the whole spatial-temporal\nrelations. However, since video clips contain more diverse content than\ncaptions, the model aligning these asymmetric video-text pairs has a high risk\nof retrieving many false positive results. In this paper, we propose\nProbabilistic Token Aggregation (ProTA) to handle cross-modal interaction with\ncontent asymmetry. Specifically, we propose dual partial-related aggregation to\ndisentangle and re-aggregate token representations in both low-dimension and\nhigh-dimension spaces. We propose token-based probabilistic alignment to\ngenerate token-level probabilistic representation and maintain the feature\nrepresentation diversity. In addition, an adaptive contrastive loss is proposed\nto learn compact cross-modal distribution space. 
Based on extensive\nexperiments, ProTA achieves significant improvements on MSR-VTT (50.9%), LSMDC\n(25.8%), and DiDeMo (47.2%).\n","authors":["Han Fang","Xianghao Zang","Chao Ban","Zerun Feng","Lanxiang Zhou","Zhongjiang He","Yongxiang Li","Hao Sun"],"pdf_url":"https://arxiv.org/pdf/2404.12216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13268v1","updated":"2024-04-20T04:30:38Z","published":"2024-04-20T04:30:38Z","title":"Multi-Cell Decoder and Mutual Learning for Table Structure and Character\n Recognition","summary":" Extracting table contents from documents such as scientific papers and\nfinancial reports and converting them into a format that can be processed by\nlarge language models is an important task in knowledge information processing.\nEnd-to-end approaches, which recognize not only table structure but also cell\ncontents, achieved performance comparable to state-of-the-art models using\nexternal character recognition systems, and have potential for further\nimprovements. In addition, these models can now recognize long tables with\nhundreds of cells by introducing local attention. However, the models recognize\ntable structure in one direction from the header to the footer, and cell\ncontent recognition is performed independently for each cell, so there is no\nopportunity to retrieve useful information from the neighbor cells. In this\npaper, we propose a multi-cell content decoder and bidirectional mutual\nlearning mechanism to improve the end-to-end approach. The effectiveness is\ndemonstrated on two large datasets, and the experimental results show\ncomparable performance to state-of-the-art models, even for long tables with\nlarge numbers of cells.\n","authors":["Takaya Kawakatsu"],"pdf_url":"https://arxiv.org/pdf/2404.13268v1.pdf","comment":"ICDAR 2024"},{"id":"http://arxiv.org/abs/2404.13263v1","updated":"2024-04-20T04:17:34Z","published":"2024-04-20T04:17:34Z","title":"FilterPrompt: Guiding Image Transfer in Diffusion Models","summary":" In controllable generation tasks, flexibly manipulating the generated images\nto attain a desired appearance or structure based on a single input image cue\nremains a critical and longstanding challenge. Achieving this requires the\neffective decoupling of key attributes within the input image data, aiming to\nget representations accurately. Previous research has predominantly\nconcentrated on disentangling image attributes within feature space. However,\nthe complex distribution present in real-world data often makes the application\nof such decoupling algorithms to other datasets challenging. Moreover, the\ngranularity of control over feature encoding frequently fails to meet specific\ntask requirements. Upon scrutinizing the characteristics of various generative\nmodels, we have observed that the input sensitivity and dynamic evolution\nproperties of the diffusion model can be effectively fused with the explicit\ndecomposition operation in pixel space. This integration enables the image\nprocessing operations performed in pixel space for a specific feature\ndistribution of the input image, and can achieve the desired control effect in\nthe generated results. Therefore, we propose FilterPrompt, an approach to\nenhance the model control effect. It can be universally applied to any\ndiffusion model, allowing users to adjust the representation of specific image\nfeatures in accordance with task requirements, thereby facilitating more\nprecise and controllable generation outcomes. 
In particular, our designed\nexperiments demonstrate that FilterPrompt optimizes feature correlation,\nmitigates content conflicts during the generation process, and enhances the\nmodel's control capability.\n","authors":["Xi Wang","Yichen Peng","Heng Fang","Haoran Xie","Xi Yang","Chuntao Li"],"pdf_url":"https://arxiv.org/pdf/2404.13263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13252v1","updated":"2024-04-20T03:39:54Z","published":"2024-04-20T03:39:54Z","title":"3D-Convolution Guided Spectral-Spatial Transformer for Hyperspectral\n Image Classification","summary":" In recent years, Vision Transformers (ViTs) have shown promising\nclassification performance over Convolutional Neural Networks (CNNs) due to\ntheir self-attention mechanism. Many researchers have incorporated ViTs for\nHyperspectral Image (HSI) classification. HSIs are characterised by narrow\ncontiguous spectral bands, providing rich spectral data. Although ViTs excel\nwith sequential data, they cannot extract spectral-spatial information like\nCNNs. Furthermore, to have high classification performance, there should be a\nstrong interaction between the HSI token and the class (CLS) token. To solve\nthese issues, we propose a 3D-Convolution guided Spectral-Spatial Transformer\n(3D-ConvSST) for HSI classification that utilizes a 3D-Convolution Guided\nResidual Module (CGRM) in-between encoders to \"fuse\" the local spatial and\nspectral information and to enhance the feature propagation. Furthermore, we\nforego the class token and instead apply Global Average Pooling, which\neffectively encodes more discriminative and pertinent high-level features for\nclassification. Extensive experiments have been conducted on three public HSI\ndatasets to show the superiority of the proposed model over state-of-the-art\ntraditional, convolutional, and Transformer models. The code is available at\nhttps://github.com/ShyamVarahagiri/3D-ConvSST.\n","authors":["Shyam Varahagiri","Aryaman Sinha","Shiv Ram Dubey","Satish Kumar Singh"],"pdf_url":"https://arxiv.org/pdf/2404.13252v1.pdf","comment":"Accepted in IEEE Conference on Artificial Intelligence, 2024"},{"id":"http://arxiv.org/abs/2404.13239v1","updated":"2024-04-20T02:40:49Z","published":"2024-04-20T02:40:49Z","title":"Beyond Pixel-Wise Supervision for Medical Image Segmentation: From\n Traditional Models to Foundation Models","summary":" Medical image segmentation plays an important role in many image-guided\nclinical approaches. However, existing segmentation algorithms mostly rely on\nthe availability of fully annotated images with pixel-wise annotations for\ntraining, which can be both labor-intensive and expertise-demanding, especially\nin the medical imaging domain where only experts can provide reliable and\naccurate annotations. To alleviate this challenge, there has been a growing\nfocus on developing segmentation methods that can train deep models with weak\nannotations, such as image-level labels, bounding boxes, scribbles, and points.\nThe emergence of vision foundation models, notably the Segment Anything Model\n(SAM), has introduced innovative capabilities for segmentation tasks using weak\nannotations for promptable segmentation enabled by large-scale pre-training.\nAdopting foundation models together with traditional learning methods has\nrecently gained increasing interest in the research community and shown\npotential for real-world applications. 
In this paper, we present a comprehensive survey of\nrecent progress on annotation-efficient learning for medical image segmentation\nutilizing weak annotations before and in the era of foundation models.\nFurthermore, we analyze and discuss several challenges of existing approaches,\nwhich we believe will provide valuable guidance for shaping the trajectory of\nfoundational models to further advance the field of medical image segmentation.\n","authors":["Yuyan Shi","Jialu Ma","Jin Yang","Shasha Wang","Yichi Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.13239v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00717v2","updated":"2024-04-20T02:32:17Z","published":"2024-03-31T15:22:11Z","title":"End-to-End Autonomous Driving through V2X Cooperation","summary":" Cooperatively utilizing both ego-vehicle and infrastructure sensor data via\nV2X communication has emerged as a promising approach for advanced autonomous\ndriving. However, current research mainly focuses on improving individual\nmodules, rather than taking end-to-end learning to optimize final planning\nperformance, resulting in underutilized data potential. In this paper, we\nintroduce UniV2X, a pioneering cooperative autonomous driving framework that\nseamlessly integrates all key driving modules across diverse views into a\nunified network. We propose a sparse-dense hybrid data transmission and fusion\nmechanism for effective vehicle-infrastructure cooperation, offering three\nadvantages: 1) Effective for simultaneously enhancing agent perception, online\nmapping, and occupancy prediction, ultimately improving planning performance.\n2) Transmission-friendly for practical and limited communication conditions. 3)\nReliable data fusion with interpretability of this hybrid data. We implement\nUniV2X, as well as reproducing several benchmark methods, on the challenging\nDAIR-V2X, the real-world cooperative driving dataset. Experimental results\ndemonstrate the effectiveness of UniV2X in significantly enhancing planning\nperformance, as well as all intermediate output performance. Code is at\nhttps://github.com/AIR-THU/UniV2X.\n","authors":["Haibao Yu","Wenxian Yang","Jiaru Zhong","Zhenwei Yang","Siqi Fan","Ping Luo","Zaiqing Nie"],"pdf_url":"https://arxiv.org/pdf/2404.00717v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13237v1","updated":"2024-04-20T02:25:46Z","published":"2024-04-20T02:25:46Z","title":"PAFedFV: Personalized and Asynchronous Federated Learning for Finger\n Vein Recognition","summary":" With the increasing emphasis on user privacy protection, biometric\nrecognition based on federated learning have become the latest research\nhotspot. However, traditional federated learning methods cannot be directly\napplied to finger vein recognition, due to heterogeneity of data and open-set\nverification. Therefore, only a few application cases have been proposed. And\nthese methods still have two drawbacks. (1) Uniform model results in poor\nperformance in some clients, as the finger vein data is highly heterogeneous\nand non-Independently Identically Distributed (non-IID). (2) On individual\nclient, a large amount of time is underutilized, such as the time to wait for\nreturning model from server. To address those problems, this paper proposes a\nPersonalized and Asynchronous Federated Learning for Finger Vein Recognition\n(PAFedFV) framework. PAFedFV designs personalized model aggregation method to\nsolve the heterogeneity among non-IID data. 
Meanwhile, it employs an\nasynchronized training module for clients to utilize their waiting time.\nFinally, extensive experiments on six finger vein datasets are conducted. Base\non these experiment results, the impact of non-IID finger vein data on\nperformance of federated learning are analyzed, and the superiority of PAFedFV\nin accuracy and robustness are demonstrated.\n","authors":["Hengyu Mu","Jian Guo","Chong Han","Lijuan Sun"],"pdf_url":"https://arxiv.org/pdf/2404.13237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10518v3","updated":"2024-04-20T02:01:11Z","published":"2024-03-15T17:59:33Z","title":"Lodge: A Coarse to Fine Diffusion Network for Long Dance Generation\n Guided by the Characteristic Dance Primitives","summary":" We propose Lodge, a network capable of generating extremely long dance\nsequences conditioned on given music. We design Lodge as a two-stage coarse to\nfine diffusion architecture, and propose the characteristic dance primitives\nthat possess significant expressiveness as intermediate representations between\ntwo diffusion models. The first stage is global diffusion, which focuses on\ncomprehending the coarse-level music-dance correlation and production\ncharacteristic dance primitives. In contrast, the second-stage is the local\ndiffusion, which parallelly generates detailed motion sequences under the\nguidance of the dance primitives and choreographic rules. In addition, we\npropose a Foot Refine Block to optimize the contact between the feet and the\nground, enhancing the physical realism of the motion. Our approach can\nparallelly generate dance sequences of extremely long length, striking a\nbalance between global choreographic patterns and local motion quality and\nexpressiveness. Extensive experiments validate the efficacy of our method.\n","authors":["Ronghui Li","YuXiang Zhang","Yachao Zhang","Hongwen Zhang","Jie Guo","Yan Zhang","Yebin Liu","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2403.10518v3.pdf","comment":"Accepted by CVPR2024, Project page:\n https://li-ronghui.github.io/lodge"},{"id":"http://arxiv.org/abs/2404.13222v1","updated":"2024-04-20T00:44:40Z","published":"2024-04-20T00:44:40Z","title":"Vim4Path: Self-Supervised Vision Mamba for Histopathology Images","summary":" Representation learning from Gigapixel Whole Slide Images (WSI) poses a\nsignificant challenge in computational pathology due to the complicated nature\nof tissue structures and the scarcity of labeled data. Multi-instance learning\nmethods have addressed this challenge, leveraging image patches to classify\nslides utilizing pretrained models using Self-Supervised Learning (SSL)\napproaches. The performance of both SSL and MIL methods relies on the\narchitecture of the feature encoder. This paper proposes leveraging the Vision\nMamba (Vim) architecture, inspired by state space models, within the DINO\nframework for representation learning in computational pathology. We evaluate\nthe performance of Vim against Vision Transformers (ViT) on the Camelyon16\ndataset for both patch-level and slide-level classification. Our findings\nhighlight Vim's enhanced performance compared to ViT, particularly at smaller\nscales, where Vim achieves an 8.21 increase in ROC AUC for models of similar\nsize. An explainability analysis further highlights Vim's capabilities, which\nreveals that Vim uniquely emulates the pathologist workflow-unlike ViT. 
This\nalignment with human expert analysis highlights Vim's potential in practical\ndiagnostic settings and contributes significantly to developing effective\nrepresentation-learning algorithms in computational pathology. We release the\ncodes and pretrained weights at\n\\url{https://github.com/AtlasAnalyticsLab/Vim4Path}.\n","authors":["Ali Nasiri-Sarvi","Vincent Quoc-Huy Trinh","Hassan Rivaz","Mahdi S. Hosseini"],"pdf_url":"https://arxiv.org/pdf/2404.13222v1.pdf","comment":"Accepted in CVPR2023 (9th Workshop on Computer Vision for Microscopy\n Image Analysis)"},{"id":"http://arxiv.org/abs/2404.14441v1","updated":"2024-04-20T00:21:06Z","published":"2024-04-20T00:21:06Z","title":"Optimizing Contrail Detection: A Deep Learning Approach with\n EfficientNet-b4 Encoding","summary":" In the pursuit of environmental sustainability, the aviation industry faces\nthe challenge of minimizing its ecological footprint. Among the key solutions\nis contrail avoidance, targeting the linear ice-crystal clouds produced by\naircraft exhaust. These contrails exacerbate global warming by trapping\natmospheric heat, necessitating precise segmentation and comprehensive analysis\nof contrail images to gauge their environmental impact. However, this\nsegmentation task is complex due to the varying appearances of contrails under\ndifferent atmospheric conditions and potential misalignment issues in\npredictive modeling. This paper presents an innovative deep-learning approach\nutilizing the efficient net-b4 encoder for feature extraction, seamlessly\nintegrating misalignment correction, soft labeling, and pseudo-labeling\ntechniques to enhance the accuracy and efficiency of contrail detection in\nsatellite imagery. The proposed methodology aims to redefine contrail image\nanalysis and contribute to the objectives of sustainable aviation by providing\na robust framework for precise contrail detection and analysis in satellite\nimagery, thus aiding in the mitigation of aviation's environmental impact.\n","authors":["Qunwei Lin","Qian Leng","Zhicheng Ding","Chao Yan","Xiaonan Xu"],"pdf_url":"https://arxiv.org/pdf/2404.14441v1.pdf","comment":null}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000..7f5166c Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 0000000..9ded9d9 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: 
var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: var(--color-primary-l); + + --summary: var(--color-info); + --summary-hover: var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 
auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + font-size: var(--font-size-m); +} + +.article-expander { + padding: 10px 4px; + border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 0000000..ba6167b --- /dev/null +++ b/index.html @@ -0,0 +1,185503 @@ + + + + + Yibo's arxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 143 + +
+
+
+ + ☆ SMPLer: Taming Transformers for Monocular 3D Human Shape and Pose + Estimation + + +
+ Existing Transformers for monocular 3D human shape and pose estimation +typically have a quadratic computation and memory complexity with respect to +the feature length, which hinders the exploitation of fine-grained information +in high-resolution features that is beneficial for accurate reconstruction. In +this work, we propose an SMPL-based Transformer framework (SMPLer) to address +this issue. SMPLer incorporates two key ingredients: a decoupled attention +operation and an SMPL-based target representation, which allow effective +utilization of high-resolution features in the Transformer. In addition, based +on these two designs, we also introduce several novel modules including a +multi-scale attention and a joint-aware attention to further boost the +reconstruction performance. Extensive experiments demonstrate the effectiveness +of SMPLer against existing 3D human shape and pose estimation methods both +quantitatively and qualitatively. Notably, the proposed algorithm achieves an +MPJPE of 45.2 mm on the Human3.6M dataset, improving upon Mesh Graphormer by +more than 10% with fewer than one-third of the parameters. Code and pretrained +models are available at https://github.com/xuxy09/SMPLer. + +
+
+ comment: Published at TPAMI 2024 +
+
+
+
+
+ + ☆ ID-Animator: Zero-Shot Identity-Preserving Human Video Generation + + +
+ Generating high fidelity human video with specified identities has attracted +significant attention in the content generation community. However, existing +techniques struggle to strike a balance between training efficiency and +identity preservation, either requiring tedious case-by-case finetuning or +usually missing the identity details in video generation process. In this +study, we present ID-Animator, a zero-shot human-video generation approach that +can perform personalized video generation given single reference facial image +without further training. ID-Animator inherits existing diffusion-based video +generation backbones with a face adapter to encode the ID-relevant embeddings +from learnable facial latent queries. To facilitate the extraction of identity +information in video generation, we introduce an ID-oriented dataset +construction pipeline, which incorporates decoupled human attribute and action +captioning technique from a constructed facial image pool. Based on this +pipeline, a random face reference training method is further devised to +precisely capture the ID-relevant embeddings from reference images, thus +improving the fidelity and generalization capacity of our model for ID-specific +video generation. Extensive experiments demonstrate the superiority of +ID-Animator to generate personalized human videos over previous models. +Moreover, our method is highly compatible with popular pre-trained T2V models +like animatediff and various community backbone models, showing high +extendability in real-world applications for video generation where identity +preservation is highly desired. Our codes and checkpoints will be released at +https://github.com/ID-Animator/ID-Animator. + +
+
+ comment: Project Page: https://id-animator.github.io/ +
+
+
+
+
+ + ☆ Metric-guided Image Reconstruction Bounds via Conformal Prediction + + +
+ Recent advancements in machine learning have led to novel imaging systems and +algorithms that address ill-posed problems. Assessing their trustworthiness and +understanding how to deploy them safely at test time remains an important and +open problem. We propose a method that leverages conformal prediction to +retrieve upper/lower bounds and statistical inliers/outliers of reconstructions +based on the prediction intervals of downstream metrics. We apply our method to +sparse-view CT for downstream radiotherapy planning and show 1) that +metric-guided bounds have valid coverage for downstream metrics while +conventional pixel-wise bounds do not and 2) anatomical differences of +upper/lower bounds between metric-guided and pixel-wise methods. Our work paves +the way for more meaningful reconstruction bounds. Code available at +https://github.com/matthewyccheung/conformal-metric + +
+
+
+
+
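+ As a rough, self-contained illustration of the split-conformal recipe referenced in the entry above (not the authors' released code; the function name, toy data, and alpha value are invented here), bounds on a scalar downstream metric can be obtained as follows:
+ import numpy as np
+
+ def conformal_metric_bounds(cal_pred, cal_true, test_pred, alpha=0.1):
+     """Split-conformal interval for a scalar downstream metric.
+     cal_pred/cal_true: metric predicted vs. measured on a held-out calibration set.
+     Returns (lower, upper) bounds for each test prediction with ~(1 - alpha) coverage."""
+     n = len(cal_pred)
+     scores = np.abs(np.asarray(cal_pred) - np.asarray(cal_true))   # nonconformity scores
+     k = int(np.ceil((n + 1) * (1 - alpha)))                        # finite-sample correction
+     q = np.sort(scores)[min(k, n) - 1]                             # conformal quantile
+     return test_pred - q, test_pred + q
+
+ # Toy usage with synthetic numbers standing in for a downstream metric value.
+ rng = np.random.default_rng(0)
+ truth = rng.normal(size=500)
+ pred = truth + rng.normal(scale=0.3, size=500)
+ lo, hi = conformal_metric_bounds(pred[:400], truth[:400], pred[400:])
+ print("empirical coverage:", np.mean((truth[400:] >= lo) & (truth[400:] <= hi)))
+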
+ + ☆ CT-GLIP: 3D Grounded Language-Image Pretraining with CT Scans and + Radiology Reports for Full-Body Scenarios + + +
+ Medical Vision-Language Pretraining (Med-VLP) establishes a connection +between visual content from medical images and the relevant textual +descriptions. Existing Med-VLP methods primarily focus on 2D images depicting a +single body part, notably chest X-rays. In this paper, we extend the scope of +Med-VLP to encompass 3D images, specifically targeting full-body scenarios, by +using a multimodal dataset of CT images and reports. Compared with the 2D +counterpart, 3D VLP is required to effectively capture essential semantics from +significantly sparser representation in 3D imaging. In this paper, we introduce +CT-GLIP (Grounded Language-Image Pretraining with CT scans), a novel method +that constructs organ-level image-text pairs to enhance multimodal contrastive +learning, aligning grounded visual features with precise diagnostic text. +Additionally, we developed an abnormality dictionary to augment contrastive +learning with diverse negative samples. Our method, trained on a multimodal CT +dataset comprising 44,011 organ-level vision-text pairs from 17,702 patients +across 104 organs, demonstrates it can identify organs and abnormalities in a +zero-shot manner using natural languages. The performance of CT-GLIP is +validated on a separate test set of 1,130 patients, focusing on the 16 most +frequent abnormalities across 7 organs. The experimental results show our +model's superior performance over the standard CLIP framework across zero-shot +and fine-tuning scenarios, using both CNN and ViT architectures. + +
+
+ comment: 12 pages, 5 figures, 3 tables +
+
+
+
+
+ + ☆ Automatic Layout Planning for Visually-Rich Documents with + Instruction-Following Models + + +
+ Recent advancements in instruction-following models have made user +interactions with models more user-friendly and efficient, broadening their +applicability. In graphic design, non-professional users often struggle to +create visually appealing layouts due to limited skills and resources. In this +work, we introduce a novel multimodal instruction-following framework for +layout planning, allowing users to easily arrange visual elements into tailored +layouts by specifying canvas size and design purpose, such as for book covers, +posters, brochures, or menus. We developed three layout reasoning tasks to +train the model in understanding and executing layout instructions. Experiments +on two benchmarks show that our method not only simplifies the design process +for non-professionals but also surpasses the performance of few-shot GPT-4V +models, with mIoU higher by 12% on Crello. This progress highlights the +potential of multimodal instruction-following models to automate and simplify +the design process, providing an approachable solution for a wide range of +design tasks on visually-rich documents. + +
+
+
+
+
+ + ☆ From Parts to Whole: A Unified Reference Framework for Controllable + Human Image Generation + + +
+ Recent advancements in controllable human image generation have led to +zero-shot generation using structural signals (e.g., pose, depth) or facial +appearance. Yet, generating human images conditioned on multiple parts of human +appearance remains challenging. Addressing this, we introduce Parts2Whole, a +novel framework designed for generating customized portraits from multiple +reference images, including pose images and various aspects of human +appearance. To achieve this, we first develop a semantic-aware appearance +encoder to retain details of different human parts, which processes each image +based on its textual label to a series of multi-scale feature maps rather than +one image token, preserving the image dimension. Second, our framework supports +multi-image conditioned generation through a shared self-attention mechanism +that operates across reference and target features during the diffusion +process. We enhance the vanilla attention mechanism by incorporating mask +information from the reference human images, allowing for the precise selection +of any part. Extensive experiments demonstrate the superiority of our approach +over existing alternatives, offering advanced capabilities for multi-part +controllable human image customization. See our project page at +https://huanngzh.github.io/Parts2Whole/. + +
+
+
+
+
+ + ☆ TalkingGaussian: Structure-Persistent 3D Talking Head Synthesis via + Gaussian Splatting + + +
+ Radiance fields have demonstrated impressive performance in synthesizing +lifelike 3D talking heads. However, due to the difficulty in fitting steep +appearance changes, the prevailing paradigm that presents facial motions by +directly modifying point appearance may lead to distortions in dynamic regions. +To tackle this challenge, we introduce TalkingGaussian, a deformation-based +radiance fields framework for high-fidelity talking head synthesis. Leveraging +the point-based Gaussian Splatting, facial motions can be represented in our +method by applying smooth and continuous deformations to persistent Gaussian +primitives, without requiring to learn the difficult appearance change like +previous methods. Due to this simplification, precise facial motions can be +synthesized while keeping a highly intact facial feature. Under such a +deformation paradigm, we further identify a face-mouth motion inconsistency +that would affect the learning of detailed speaking motions. To address this +conflict, we decompose the model into two branches separately for the face and +inside mouth areas, therefore simplifying the learning tasks to help +reconstruct more accurate motion and structure of the mouth region. Extensive +experiments demonstrate that our method renders high-quality lip-synchronized +talking head videos, with better facial fidelity and higher efficiency compared +with previous methods. + +
+
+ comment: Project page: https://fictionarry.github.io/TalkingGaussian/ +
+
+
+
+
+ + ☆ Multi-Session SLAM with Differentiable Wide-Baseline Pose Optimization CVPR 2024 + + +
+ We introduce a new system for Multi-Session SLAM, which tracks camera motion +across multiple disjoint videos under a single global reference. Our approach +couples the prediction of optical flow with solver layers to estimate camera +pose. The backbone is trained end-to-end using a novel differentiable solver +for wide-baseline two-view pose. The full system can connect disjoint +sequences, perform visual odometry, and global optimization. Compared to +existing approaches, our design is accurate and robust to catastrophic +failures. Code is available at github.com/princeton-vl/MultiSlam_DiffPose + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ FlowMap: High-Quality Camera Poses, Intrinsics, and Depth via Gradient + Descent + + +
+ This paper introduces FlowMap, an end-to-end differentiable method that +solves for precise camera poses, camera intrinsics, and per-frame dense depth +of a video sequence. Our method performs per-video gradient-descent +minimization of a simple least-squares objective that compares the optical flow +induced by depth, intrinsics, and poses against correspondences obtained via +off-the-shelf optical flow and point tracking. Alongside the use of point +tracks to encourage long-term geometric consistency, we introduce +differentiable re-parameterizations of depth, intrinsics, and pose that are +amenable to first-order optimization. We empirically show that camera +parameters and dense depth recovered by our method enable photo-realistic novel +view synthesis on 360-degree trajectories using Gaussian Splatting. Our method +not only far outperforms prior gradient-descent based bundle adjustment +methods, but surprisingly performs on par with COLMAP, the state-of-the-art SfM +method, on the downstream task of 360-degree novel view synthesis (even though +our method is purely gradient-descent based, fully differentiable, and presents +a complete departure from conventional SfM). + +
+
+ comment: Project website: https://cameronosmith.github.io/flowmap/ +
+
+
+
+
+ + ☆ TOP-Nav: Legged Navigation Integrating Terrain, Obstacle and + Proprioception Estimation + + +
+ Legged navigation is typically examined within open-world, off-road, and +challenging environments. In these scenarios, estimating external disturbances +requires a complex synthesis of multi-modal information. This underlines a +major limitation in existing works that primarily focus on avoiding obstacles. +In this work, we propose TOP-Nav, a novel legged navigation framework that +integrates a comprehensive path planner with Terrain awareness, Obstacle +avoidance and close-loop Proprioception. TOP-Nav underscores the synergies +between vision and proprioception in both path and motion planning. Within the +path planner, we present and integrate a terrain estimator that enables the +robot to select waypoints on terrains with higher traversability while +effectively avoiding obstacles. In the motion planning level, we not only +implement a locomotion controller to track the navigation commands, but also +construct a proprioception advisor to provide motion evaluations for the path +planner. Based on the close-loop motion feedback, we make online corrections +for the vision-based terrain and obstacle estimations. Consequently, TOP-Nav +achieves open-world navigation that the robot can handle terrains or +disturbances beyond the distribution of prior knowledge and overcomes +constraints imposed by visual conditions. Building upon extensive experiments +conducted in both simulation and real-world environments, TOP-Nav demonstrates +superior performance in open-world navigation compared to existing methods. + +
+
+
+
+
+ + ☆ UniMERNet: A Universal Network for Real-World Mathematical Expression + Recognition + + +
+ This paper presents the UniMER dataset to provide the first study on +Mathematical Expression Recognition (MER) towards complex real-world scenarios. +The UniMER dataset consists of a large-scale training set UniMER-1M offering an +unprecedented scale and diversity with one million training instances and a +meticulously designed test set UniMER-Test that reflects a diverse range of +formula distributions prevalent in real-world scenarios. Therefore, the UniMER +dataset enables the training of a robust and high-accuracy MER model and +comprehensive evaluation of model performance. Moreover, we introduce the +Universal Mathematical Expression Recognition Network (UniMERNet), an +innovative framework designed to enhance MER in practical scenarios. UniMERNet +incorporates a Length-Aware Module to process formulas of varied lengths +efficiently, thereby enabling the model to handle complex mathematical +expressions with greater accuracy. In addition, UniMERNet employs our UniMER-1M +data and image augmentation techniques to improve the model's robustness under +different noise conditions. Our extensive experiments demonstrate that +UniMERNet outperforms existing MER models, setting a new benchmark in various +scenarios and ensuring superior recognition quality in real-world applications. +The dataset and model are available at +https://github.com/opendatalab/UniMERNet. + +
+
+ comment: 17 pages, 5 figures +
+
+
+
+
+ + ☆ Source-free Domain Adaptation for Video Object Detection Under Adverse + Image Conditions CVPR 2024 + + +
+ When deploying pre-trained video object detectors in real-world scenarios, +the domain gap between training and testing data caused by adverse image +conditions often leads to performance degradation. Addressing this issue +becomes particularly challenging when only the pre-trained model and degraded +videos are available. Although various source-free domain adaptation (SFDA) +methods have been proposed for single-frame object detectors, SFDA for video +object detection (VOD) remains unexplored. Moreover, most unsupervised domain +adaptation works for object detection rely on two-stage detectors, while SFDA +for one-stage detectors, which are more vulnerable to fine-tuning, is not well +addressed in the literature. In this paper, we propose Spatial-Temporal +Alternate Refinement with Mean Teacher (STAR-MT), a simple yet effective SFDA +method for VOD. Specifically, we aim to improve the performance of the +one-stage VOD method, YOLOV, under adverse image conditions, including noise, +air turbulence, and haze. Extensive experiments on the ImageNetVOD dataset and +its degraded versions demonstrate that our method consistently improves video +object detection performance in challenging imaging conditions, showcasing its +potential for real-world applications. + +
+
+ comment: accepted by the UG2+ workshop at CVPR 2024 +
+
+
+
+
+ + ☆ Efficient Transformer Encoders for Mask2Former-style models + + +
+ Vision transformer based models bring significant improvements for image +segmentation tasks. Although these architectures offer powerful capabilities +irrespective of specific segmentation tasks, their use of computational +resources can be taxing on deployed devices. One way to overcome this challenge +is by adapting the computation level to the specific needs of the input image +rather than the current one-size-fits-all approach. To this end, we introduce +ECO-M2F or EffiCient TransfOrmer Encoders for Mask2Former-style models. Noting +that the encoder module of M2F-style models incurs resource-intensive +computations, ECO-M2F provides a strategy to self-select the number of hidden +layers in the encoder, conditioned on the input image. To enable this +self-selection ability for providing a balance between performance and +computational efficiency, we present a three-step recipe. The first step is to +train the parent architecture to enable early exiting from the encoder. The +second step is to create a derived dataset of the ideal number of encoder +layers required for each training example. The third step is to use the +aforementioned derived dataset to train a gating network that predicts the +number of encoder layers to be used, conditioned on the input image. +Additionally, to change the computational-accuracy tradeoff, only steps two and +three need to be repeated, which significantly reduces retraining time. +Experiments on public datasets show that the proposed approach reduces +expected encoder computational cost while maintaining performance, adapts to +various user compute resources, is flexible in architecture configurations, and +can be extended beyond the segmentation task to object detection. + +
+
+
+
+
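+ A minimal sketch of the self-selection idea described in the entry above, assuming a made-up gate design, layer count, and feature shapes rather than the actual ECO-M2F implementation:
+ import torch
+ import torch.nn as nn
+
+ class GatedEncoder(nn.Module):
+     """Runs only a prefix of its encoder layers, as chosen by a small gating network."""
+     def __init__(self, dim=256, max_layers=6):
+         super().__init__()
+         self.layers = nn.ModuleList(
+             nn.TransformerEncoderLayer(dim, nhead=8, batch_first=True)
+             for _ in range(max_layers))
+         self.gate = nn.Sequential(nn.Linear(dim, 64), nn.ReLU(),
+                                   nn.Linear(64, max_layers))  # scores for 1..max_layers
+
+     def forward(self, tokens):                       # tokens: (B, N, dim)
+         depth = int(self.gate(tokens.mean(dim=1)).argmax(dim=-1).max()) + 1
+         for layer in self.layers[:depth]:            # early exit after `depth` layers
+             tokens = layer(tokens)
+         return tokens, depth
+
+ enc = GatedEncoder()
+ out, used = enc(torch.randn(1, 100, 256))
+ print(out.shape, "encoder layers used:", used)
+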
+ + ☆ Massively Annotated Datasets for Assessment of Synthetic and Real Data + in Face Recognition + + +
+ Face recognition applications have grown in parallel with the size of +datasets, complexity of deep learning models and computational power. However, +while deep learning models evolve to become more capable and computational +power keeps increasing, the datasets available are being retracted and removed +from public access. Privacy and ethical concerns are relevant topics within +these domains. Through generative artificial intelligence, researchers have put +efforts into the development of completely synthetic datasets that can be used +to train face recognition systems. Nonetheless, the recent advances have not +been sufficient to achieve performance comparable to the state-of-the-art +models trained on real data. To study the drift between the performance of +models trained on real and synthetic datasets, we leverage a massive attribute +classifier (MAC) to create annotations for four datasets: two real and two +synthetic. From these annotations, we conduct studies on the distribution of +each attribute within all four datasets. Additionally, we further inspect the +differences between real and synthetic datasets on the attribute set. When +comparing through the Kullback-Leibler divergence we have found differences +between real and synthetic samples. Interestingly enough, we have verified that +while real samples suffice to explain the synthetic distribution, the opposite +could not be further from being true. + +
+
+ comment: Accepted at FG 2024 +
+
+
+
+
+ + ☆ Re-Thinking Inverse Graphics With Large Language Models + + +
+ Inverse graphics -- the task of inverting an image into physical variables +that, when rendered, enable reproduction of the observed scene -- is a +fundamental challenge in computer vision and graphics. Disentangling an image +into its constituent elements, such as the shape, color, and material +properties of the objects of the 3D scene that produced it, requires a +comprehensive understanding of the environment. This requirement limits the +ability of existing carefully engineered approaches to generalize across +domains. Inspired by the zero-shot ability of large language models (LLMs) to +generalize to novel contexts, we investigate the possibility of leveraging the +broad world knowledge encoded in such models in solving inverse-graphics +problems. To this end, we propose the Inverse-Graphics Large Language Model +(IG-LLM), an inverse-graphics framework centered around an LLM, that +autoregressively decodes a visual embedding into a structured, compositional +3D-scene representation. We incorporate a frozen pre-trained visual encoder and +a continuous numeric head to enable end-to-end training. Through our +investigation, we demonstrate the potential of LLMs to facilitate inverse +graphics through next-token prediction, without the use of image-space +supervision. Our analysis opens up new possibilities for precise spatial +reasoning about images that exploit the visual knowledge of LLMs. We will +release our code and data to ensure the reproducibility of our investigation +and to facilitate future research at https://ig-llm.is.tue.mpg.de/ + +
+
+ comment: 31 pages; project page: https://ig-llm.is.tue.mpg.de/ +
+
+
+
+
+ + ☆ Deep Models for Multi-View 3D Object Recognition: A Review + + +
+ Human decision-making often relies on visual information from multiple +perspectives or views. In contrast, machine learning-based object recognition +utilizes information from a single image of the object. However, the +information conveyed by a single image may not be sufficient for accurate +decision-making, particularly in complex recognition problems. The utilization +of multi-view 3D representations for object recognition has thus far +demonstrated the most promising results for achieving state-of-the-art +performance. This review paper comprehensively covers recent progress in +multi-view 3D object recognition methods for 3D classification and retrieval +tasks. Specifically, we focus on deep learning-based and transformer-based +techniques, as they are widely utilized and have achieved state-of-the-art +performance. We provide detailed information about existing deep learning-based +and transformer-based multi-view 3D object recognition models, including the +most commonly used 3D datasets, camera configurations and number of views, view +selection strategies, pre-trained CNN architectures, fusion strategies, and +recognition performance on 3D classification and 3D retrieval tasks. +Additionally, we examine various computer vision applications that use +multi-view classification. Finally, we highlight key findings and future +directions for developing multi-view 3D object recognition methods to provide +readers with a comprehensive understanding of the field. + +
+
+
+
+
+ + ☆ Closed Loop Interactive Embodied Reasoning for Robot Manipulation + + +
+ Embodied reasoning systems integrate robotic hardware and cognitive processes +to perform complex tasks typically in response to a natural language query +about a specific physical environment. This usually involves changing the +belief about the scene or physically interacting and changing the scene (e.g. +'Sort the objects from lightest to heaviest'). In order to facilitate the +development of such systems we introduce a new simulating environment that +makes use of MuJoCo physics engine and high-quality renderer Blender to provide +realistic visual observations that are also accurate to the physical state of +the scene. Together with the simulator we propose a new benchmark composed of +10 classes of multi-step reasoning scenarios that require simultaneous visual +and physical measurements. Finally, we develop a new modular Closed Loop +Interactive Reasoning (CLIER) approach that takes into account the measurements +of non-visual object properties, changes in the scene caused by external +disturbances as well as uncertain outcomes of robotic actions. We extensively +evaluate our reasoning approach in simulation and in the real world +manipulation tasks with a success rate above 76% and 64%, respectively. + +
+
+
+
+
+ + ☆ Fourier-enhanced Implicit Neural Fusion Network for Multispectral and + Hyperspectral Image Fusion + + +
+ Recently, implicit neural representations (INR) have made significant strides +in various vision-related domains, providing a novel solution for Multispectral +and Hyperspectral Image Fusion (MHIF) tasks. However, INR is prone to losing +high-frequency information and suffers from a lack of global perceptual +capability. To address these issues, this paper introduces a Fourier-enhanced +Implicit Neural Fusion Network (FeINFN) specifically designed for the MHIF task, +motivated by the following observation: the Fourier amplitudes of the HR-HSI latent +code and LR-HSI are remarkably similar; however, their phases exhibit different +patterns. In FeINFN, we propose a spatial and frequency implicit +fusion function (Spa-Fre IFF), helping INR capture high-frequency information +and expanding the receptive field. Besides, a new decoder employing a complex +Gabor wavelet activation function, called Spatial-Frequency Interactive Decoder +(SFID), is introduced to enhance the interaction of INR features. Notably, we +further theoretically prove that the Gabor wavelet activation possesses a +time-frequency tightness property that favors learning the optimal bandwidths +in the decoder. Experiments on two benchmark MHIF datasets verify the +state-of-the-art (SOTA) performance of the proposed method, both visually and +quantitatively. Ablation studies also demonstrate the effectiveness of the aforementioned components. +The code will be available on Anonymous GitHub +(https://anonymous.4open.science/r/FeINFN-15C9/) after possible acceptance. + +
+
+
+
+
+ + ☆ Adaptive Mixed-Scale Feature Fusion Network for Blind AI-Generated Image + Quality Assessment + + +
+ With the increasing maturity of the text-to-image and image-to-image +generative models, AI-generated images (AGIs) have shown great application +potential in advertisement, entertainment, education, social media, etc. +Although remarkable advancements have been achieved in generative models, very +few efforts have been paid to design relevant quality assessment models. In +this paper, we propose a novel blind image quality assessment (IQA) network, +named AMFF-Net, for AGIs. AMFF-Net evaluates AGI quality from three dimensions, +i.e., "visual quality", "authenticity", and "consistency". Specifically, +inspired by the characteristics of the human visual system and motivated by the +observation that "visual quality" and "authenticity" are characterized by both +local and global aspects, AMFF-Net scales the image up and down and takes the +scaled images and original-sized image as the inputs to obtain multi-scale +features. After that, an Adaptive Feature Fusion (AFF) block is used to +adaptively fuse the multi-scale features with learnable weights. In addition, +considering the correlation between the image and prompt, AMFF-Net compares the +semantic features from text encoder and image encoder to evaluate the +text-to-image alignment. We carry out extensive experiments on three AGI +quality assessment databases, and the experimental results show that our +AMFF-Net obtains better performance than nine state-of-the-art blind IQA +methods. The results of ablation experiments further demonstrate the +effectiveness of the proposed multi-scale input strategy and AFF block. + +
+
+ comment: IEEE Transactions on Broadcasting (TBC) +
+
+
+
+
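+ The learnable-weight fusion described in the entry above can be pictured with a small module like the one below; this is only a schematic guess (the real AFF design, pooling, and dimensions are not specified here):
+ import torch
+ import torch.nn as nn
+
+ class AdaptiveFeatureFusion(nn.Module):
+     """Fuses pooled features from several input scales with predicted softmax weights."""
+     def __init__(self, dim):
+         super().__init__()
+         self.score = nn.Linear(dim, 1)                        # one relevance score per scale
+
+     def forward(self, feats):                                 # feats: list of (B, dim) tensors
+         stacked = torch.stack(feats, dim=1)                   # (B, S, dim)
+         weights = torch.softmax(self.score(stacked), dim=1)   # (B, S, 1), sums to 1 over scales
+         return (weights * stacked).sum(dim=1)                 # (B, dim) fused representation
+
+ # Example: fuse features extracted from the down-scaled, original, and up-scaled image.
+ f_small, f_orig, f_large = (torch.randn(2, 256) for _ in range(3))
+ print(AdaptiveFeatureFusion(256)([f_small, f_orig, f_large]).shape)  # torch.Size([2, 256])
+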
+ + ☆ Combating Missing Modalities in Egocentric Videos at Test Time + + +
+ Understanding videos that contain multiple modalities is crucial, especially +in egocentric videos, where combining various sensory inputs significantly +improves tasks like action recognition and moment localization. However, +real-world applications often face challenges with incomplete modalities due to +privacy concerns, efficiency needs, or hardware issues. Current methods, while +effective, often necessitate retraining the model entirely to handle missing +modalities, making them computationally intensive, particularly with large +training datasets. In this study, we propose a novel approach to address this +issue at test time without requiring retraining. We frame the problem as a +test-time adaptation task, where the model adjusts to the available unlabeled +data at test time. Our method, MiDl~(Mutual information with +self-Distillation), encourages the model to be insensitive to the specific +modality source present during testing by minimizing the mutual information +between the prediction and the available modality. Additionally, we incorporate +self-distillation to maintain the model's original performance when both +modalities are available. MiDl represents the first self-supervised, online +solution for handling missing modalities exclusively at test time. Through +experiments with various pretrained models and datasets, MiDl demonstrates +substantial performance improvement without the need for retraining. + +
+
+
+
+
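+ One simple batch-level plug-in estimate of the mutual information between predictions and the available modality, offered only as an illustration of the objective in the entry above (not necessarily the estimator MiDl uses; the dummy inputs are invented):
+ import torch
+
+ def prediction_modality_mi(probs, modality, eps=1e-8):
+     """probs: (N, C) predicted class probabilities; modality: (N,) id of the modality present.
+     Returns a plug-in estimate of I(Y; M) = H(Y) - H(Y | M); minimizing it encourages
+     predictions that are insensitive to which modality was available."""
+     p_y = probs.mean(dim=0)
+     h_y = -(p_y * (p_y + eps).log()).sum()                    # H(Y)
+     h_y_given_m = probs.new_zeros(())
+     for m in modality.unique():
+         mask = modality == m
+         p_y_m = probs[mask].mean(dim=0)                       # label marginal within modality m
+         h_y_given_m = h_y_given_m - mask.float().mean() * (p_y_m * (p_y_m + eps).log()).sum()
+     return h_y - h_y_given_m                                  # >= 0 up to estimation noise
+
+ probs = torch.softmax(torch.randn(8, 10), dim=-1)             # dummy predictions for 8 clips
+ print(prediction_modality_mi(probs, torch.tensor([0, 1, 0, 1, 0, 1, 0, 1])))
+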
+ + ☆ CutDiffusion: A Simple, Fast, Cheap, and Strong Diffusion Extrapolation + Method + + +
+ Transforming large pre-trained low-resolution diffusion models to cater to +higher-resolution demands, i.e., diffusion extrapolation, significantly +improves diffusion adaptability. We propose tuning-free CutDiffusion, aimed at +simplifying and accelerating the diffusion extrapolation process, making it +more affordable and improving performance. CutDiffusion abides by the existing +patch-wise extrapolation but cuts a standard patch diffusion process into an +initial phase focused on comprehensive structure denoising and a subsequent +phase dedicated to specific detail refinement. Comprehensive experiments +highlight the numerous advantages of CutDiffusion: (1) simple method +construction that enables a concise higher-resolution diffusion process without +third-party engagement; (2) fast inference speed achieved through a single-step +higher-resolution diffusion process, and fewer inference patches required; (3) +low GPU cost resulting from patch-wise inference and fewer patches during the +comprehensive structure denoising; (4) strong generation performance, stemming +from the emphasis on specific detail refinement. + +
+
+
+
+
+ + ☆ Gallbladder Cancer Detection in Ultrasound Images based on YOLO and + Faster R-CNN + + +
+ Medical image analysis is a significant application of artificial +intelligence for disease diagnosis. A crucial step in this process is the +identification of regions of interest within the images. This task can be +automated using object detection algorithms. YOLO and Faster R-CNN are renowned +for such algorithms, each with its own strengths and weaknesses. This study +aims to explore the advantages of both techniques to select more accurate +bounding boxes for gallbladder detection from ultrasound images, thereby +enhancing gallbladder cancer classification. A fusion method that leverages the +benefits of both techniques is presented in this study. The proposed method +demonstrated superior classification performance, with an accuracy of 92.62%, +compared to the individual use of Faster R-CNN and YOLOv8, which yielded +accuracies of 90.16% and 82.79%, respectively. + +
+
+ comment: Published in 2024 10th International Conference on Artificial + Intelligence and Robotics (QICAR) +
+
+
+
+
+ + ☆ MedDr: Diagnosis-Guided Bootstrapping for Large-Scale Medical + Vision-Language Learning + + +
+ The rapid advancement of large-scale vision-language models has showcased +remarkable capabilities across various tasks. However, the lack of extensive +and high-quality image-text data in medicine has greatly hindered the +development of large-scale medical vision-language models. In this work, we +present a diagnosis-guided bootstrapping strategy that exploits both image and +label information to construct vision-language datasets. Based on the +constructed dataset, we developed MedDr, a generalist foundation model for +healthcare capable of handling diverse medical data modalities, including +radiology, pathology, dermatology, retinography, and endoscopy. Moreover, +during inference, we propose a simple but effective retrieval-augmented medical +diagnosis strategy, which enhances the model's generalization ability. +Extensive experiments on visual question answering, medical report generation, +and medical image diagnosis demonstrate the superiority of our method. + +
+
+
+
+
+ + ☆ Taming Diffusion Probabilistic Models for Character Control SIGGRAPH 2024 + + +
+ We present a novel character control framework that effectively utilizes +motion diffusion probabilistic models to generate high-quality and diverse +character animations, responding in real-time to a variety of dynamic +user-supplied control signals. At the heart of our method lies a +transformer-based Conditional Autoregressive Motion Diffusion Model (CAMDM), +which takes as input the character's historical motion and can generate a range +of diverse potential future motions conditioned on high-level, coarse user +control. To meet the demands for diversity, controllability, and computational +efficiency required by a real-time controller, we incorporate several key +algorithmic designs. These include separate condition tokenization, +classifier-free guidance on past motion, and heuristic future trajectory +extension, all designed to address the challenges associated with taming motion +diffusion probabilistic models for character control. As a result, our work +represents the first model that enables real-time generation of high-quality, +diverse character animations based on user interactive control, supporting +animating the character in multiple styles with a single unified model. We +evaluate our method on a diverse set of locomotion skills, demonstrating the +merits of our method over existing character controllers. Project page and +source codes: https://aiganimation.github.io/CAMDM/ + +
+
+ comment: Accepted by SIGGRAPH 2024 (Conference Track). Project page and source + codes: https://aiganimation.github.io/CAMDM/ +
+
+
+
+
+ + ☆ Multimodal Large Language Model is a Human-Aligned Annotator for + Text-to-Image Generation + + +
+ Recent studies have demonstrated the exceptional potentials of leveraging +human preference datasets to refine text-to-image generative models, enhancing +the alignment between generated images and textual prompts. Despite these +advances, current human preference datasets are either prohibitively expensive +to construct or suffer from a lack of diversity in preference dimensions, +resulting in limited applicability for instruction tuning in open-source +text-to-image generative models and hinder further exploration. To address +these challenges and promote the alignment of generative models through +instruction tuning, we leverage multimodal large language models to create +VisionPrefer, a high-quality and fine-grained preference dataset that captures +multiple preference aspects. We aggregate feedback from AI annotators across +four aspects: prompt-following, aesthetic, fidelity, and harmlessness to +construct VisionPrefer. To validate the effectiveness of VisionPrefer, we train +a reward model VP-Score over VisionPrefer to guide the training of +text-to-image generative models and the preference prediction accuracy of +VP-Score is comparable to human annotators. Furthermore, we use two +reinforcement learning methods to supervised fine-tune generative models to +evaluate the performance of VisionPrefer, and extensive experimental results +demonstrate that VisionPrefer significantly improves text-image alignment in +compositional image generation across diverse aspects, e.g., aesthetic, and +generalizes better than previous human-preference metrics across various image +distributions. Moreover, VisionPrefer indicates that the integration of +AI-generated synthetic data as a supervisory signal is a promising avenue for +achieving improved alignment with human preferences in vision generative +models. + +
+
+
+
+
+ + ☆ Harnessing Optical Imaging Limit through Atmospheric Scattering Media + + +
+ Recording and identifying faint objects through atmospheric scattering media +by an optical system are fundamentally interesting and technologically +important. In this work, we introduce a comprehensive model that incorporates +contributions from target characteristics, atmospheric effects, imaging system, +digital processing, and visual perception to assess the ultimate perceptible +limit of geometrical imaging, specifically the angular resolution at the +boundary of visible distance. The model allows us to reevaluate the effectiveness +of conventional imaging recording, processing, and perception and to analyze +the limiting factors that constrain image recognition capabilities in +atmospheric media. The simulations were compared with the experimental results +measured in a fog chamber and outdoor settings. The results reveal generally good +agreement between analysis and experiment, pointing the way to harnessing +the physical limit for optical imaging in scattering media. An immediate +application of the study is the extension of the image range by a factor of +1.2 with noise reduction via multi-frame averaging, hence greatly +enhancing the capability of optical imaging in the atmosphere. + +
+
+
+
+
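+ The multi-frame averaging mentioned at the end of the entry above is standard practice: averaging N independent exposures reduces the noise standard deviation by roughly a factor of sqrt(N). A toy check on synthetic numbers (not the paper's fog-chamber data):
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ clean = rng.uniform(0.4, 0.6, size=(64, 64))                   # faint target behind scattering
+ frames = clean + rng.normal(scale=0.2, size=(16, 64, 64))      # 16 noisy exposures
+
+ single_noise = (frames[0] - clean).std()
+ stacked_noise = (frames.mean(axis=0) - clean).std()            # drops by about sqrt(16) = 4x
+ print(f"noise std: single frame {single_noise:.3f}, 16-frame average {stacked_noise:.3f}")
+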
+ + ☆ Perturbing Attention Gives You More Bang for the Buck: Subtle Imaging + Perturbations That Efficiently Fool Customized Diffusion Models CVPR 2024 + + +
+ Diffusion models (DMs) usher in a new era of generative modeling and offer more +opportunities for efficiently generating high-quality and realistic data samples. +However, their widespread use has also brought forth new challenges in model +security, which motivates the creation of more effective adversarial attackers +on DMs to understand their vulnerability. We propose CAAT, a simple but generic +and efficient approach that does not require costly training to effectively +fool latent diffusion models (LDMs). The approach is based on the observation +that cross-attention layers exhibit higher sensitivity to gradient change, +allowing subtle perturbations on published images to +significantly corrupt the generated images. We show that a subtle perturbation +on an image can significantly impact the cross-attention layers, thus changing +the mapping between text and image during the fine-tuning of customized +diffusion models. Extensive experiments demonstrate that CAAT is compatible +with diverse diffusion models and outperforms baseline attack methods in a more +effective (more noise) and efficient (twice as fast as Anti-DreamBooth and +Mist) manner. + +
+
+ comment: Published at CVPR 2024 +
+
+
+
+
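+ CAAT itself perturbs images so as to disrupt cross-attention during customization fine-tuning; as a heavily simplified stand-in (toy encoder, made-up hyperparameters, not the paper's attack), a projected-gradient perturbation that pushes an image's features away from their clean values looks like:
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ def perturb(image, encoder, eps=4 / 255, steps=10, lr=1 / 255):
+     """Returns image + delta with ||delta||_inf <= eps, chosen to maximally shift the features."""
+     with torch.no_grad():
+         clean_feat = encoder(image)
+     delta = torch.zeros_like(image, requires_grad=True)
+     for _ in range(steps):
+         loss = -F.mse_loss(encoder(image + delta), clean_feat)
+         loss.backward()
+         with torch.no_grad():
+             delta -= lr * delta.grad.sign()    # gradient ascent on the feature distance
+             delta.clamp_(-eps, eps)            # keep the perturbation visually subtle
+             delta.grad.zero_()
+     return (image + delta).clamp(0, 1).detach()
+
+ toy_encoder = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(), nn.Flatten())
+ img = torch.rand(1, 3, 64, 64)
+ adv = perturb(img, toy_encoder)
+ print("max pixel change:", (adv - img).abs().max().item())
+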
+ + ☆ LEAF: Unveiling Two Sides of the Same Coin in Semi-supervised Facial + Expression Recognition + + +
+ Semi-supervised learning has emerged as a promising approach to tackle the +challenge of label scarcity in facial expression recognition (FER) task. +However, current state-of-the-art methods primarily focus on one side of the +coin, i.e., generating high-quality pseudo-labels, while overlooking the other +side: enhancing expression-relevant representations. In this paper, we unveil +both sides of the coin by proposing a unified framework termed hierarchicaL +dEcoupling And Fusing (LEAF) to coordinate expression-relevant representations +and pseudo-labels for semi-supervised FER. LEAF introduces a hierarchical +expression-aware aggregation strategy that operates at three levels: semantic, +instance, and category. (1) At the semantic and instance levels, LEAF decouples +representations into expression-agnostic and expression-relevant components, +and adaptively fuses them using learnable gating weights. (2) At the category +level, LEAF assigns ambiguous pseudo-labels by decoupling predictions into +positive and negative parts, and employs a consistency loss to ensure agreement +between two augmented views of the same image. Extensive experiments on +benchmark datasets demonstrate that by unveiling and harmonizing both sides of +the coin, LEAF outperforms state-of-the-art semi-supervised FER methods, +effectively leveraging both labeled and unlabeled data. Moreover, the proposed +expression-aware aggregation strategy can be seamlessly integrated into +existing semi-supervised frameworks, leading to significant performance gains. + +
+
+
+
+
+ + ☆ DP-Net: Learning Discriminative Parts for image recognition ICIP 2023 + + +
+ This paper presents the Discriminative Part Network (DP-Net), a deep
+architecture with strong interpretation capabilities, which exploits a
+pretrained Convolutional Neural Network (CNN) combined with a part-based
+recognition module. This system learns and detects parts in the images that are
+discriminative among categories, without the need for fine-tuning the CNN,
+making it more scalable than other part-based models. While part-based
+approaches naturally offer interpretable representations, we propose
+explanations at the image and category levels and introduce specific
+constraints on the part learning process to make them more discriminative.
+
+
+ comment: IEEE ICIP 2023 +
+
+
+
+
+ + ☆ IPAD: Industrial Process Anomaly Detection Dataset + + +
+ Video anomaly detection (VAD) is a challenging task aiming to recognize
+anomalies in video frames, and existing large-scale VAD research primarily
+focuses on road traffic and human activity scenes. In industrial scenes, there
+are often a variety of unpredictable anomalies, and VAD methods can play a
+significant role in these scenarios. However, there is a lack of applicable
+datasets and methods specifically tailored for industrial production scenarios
+due to concerns regarding privacy and security. To bridge this gap, we propose
+a new dataset, IPAD, specifically designed for VAD in industrial scenarios. The
+industrial processes in our dataset are chosen through on-site factory research
+and discussions with engineers. This dataset covers 16 different industrial
+devices and contains over 6 hours of both synthetic and real-world video
+footage. Moreover, we annotate the key feature of the industrial process, i.e.,
+periodicity. Based on the proposed dataset, we introduce a period memory module
+and a sliding window inspection mechanism to effectively investigate the
+periodic information in a basic reconstruction model. Our framework leverages a
+LoRA adapter to explore the effective migration of pretrained models, which are
+initially trained using synthetic data, into real-world scenarios. Our proposed
+dataset and method will fill the gap in the field of industrial video anomaly
+detection and drive the progress of video understanding tasks as well as smart
+factory deployment.
+
+
+
+
+
+ + ☆ PRISM: A Promptable and Robust Interactive Segmentation Model with + Visual Prompts + + +
+ In this paper, we present PRISM, a Promptable and Robust Interactive +Segmentation Model, aiming for precise segmentation of 3D medical images. PRISM +accepts various visual inputs, including points, boxes, and scribbles as sparse +prompts, as well as masks as dense prompts. Specifically, PRISM is designed +with four principles to achieve robustness: (1) Iterative learning. The model +produces segmentations by using visual prompts from previous iterations to +achieve progressive improvement. (2) Confidence learning. PRISM employs +multiple segmentation heads per input image, each generating a continuous map +and a confidence score to optimize predictions. (3) Corrective learning. +Following each segmentation iteration, PRISM employs a shallow corrective +refinement network to reassign mislabeled voxels. (4) Hybrid design. PRISM +integrates hybrid encoders to better capture both the local and global +information. Comprehensive validation of PRISM is conducted using four public +datasets for tumor segmentation in the colon, pancreas, liver, and kidney, +highlighting challenges caused by anatomical variations and ambiguous +boundaries in accurate tumor identification. Compared to state-of-the-art +methods, both with and without prompt engineering, PRISM significantly improves +performance, achieving results that are close to human levels. The code is +publicly available at https://github.com/MedICL-VU/PRISM. + +
+
+
+
+
+ + ☆ A Learning Paradigm for Interpretable Gradients + + +
+ This paper studies the interpretability of convolutional networks by means of
+saliency maps. Most approaches based on Class Activation Maps (CAM) combine
+information from fully connected layers and gradients through variants of
+backpropagation. However, it is well understood that gradients are noisy, and
+alternatives like guided backpropagation have been proposed to obtain better
+visualizations at inference. In this work, we present a novel training approach
+to improve the quality of gradients for interpretability. In particular, we
+introduce a regularization loss such that the gradient with respect to the
+input image obtained by standard backpropagation is similar to the gradient
+obtained by guided backpropagation. We find that the resulting gradient is
+qualitatively less noisy and quantitatively improves the interpretability
+properties of different networks, as measured by several interpretability
+methods.
+
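+ A rough PyTorch sketch of the kind of regularizer described above: the input
+gradient from standard backpropagation is pushed towards the (detached) guided
+backpropagation gradient via a cosine term added to the usual loss. The tiny
+CNN, the loss weight, and the hook-based guided backprop below are illustrative
+assumptions, not the paper's exact training recipe.
+
+```python
+import torch
+import torch.nn.functional as F
+
+model = torch.nn.Sequential(                       # any classifier works here
+    torch.nn.Conv2d(3, 16, 3, padding=1), torch.nn.ReLU(),
+    torch.nn.Conv2d(16, 32, 3, padding=1), torch.nn.ReLU(),
+    torch.nn.AdaptiveAvgPool2d(1), torch.nn.Flatten(),
+    torch.nn.Linear(32, 10),
+)
+
+def input_gradient(model, x, y, guided=False, create_graph=False):
+    """Gradient of the target-class score w.r.t. the input image."""
+    handles = []
+    if guided:
+        # guided backprop: additionally zero out negative gradients at ReLUs
+        def clamp_grad(module, grad_in, grad_out):
+            return (grad_in[0].clamp(min=0.0),)
+        handles = [m.register_full_backward_hook(clamp_grad)
+                   for m in model.modules() if isinstance(m, torch.nn.ReLU)]
+    x = x.clone().requires_grad_(True)
+    score = model(x).gather(1, y[:, None]).sum()
+    grad, = torch.autograd.grad(score, x, create_graph=create_graph)
+    for h in handles:
+        h.remove()
+    return grad
+
+x, y = torch.randn(4, 3, 32, 32), torch.randint(0, 10, (4,))
+ce = F.cross_entropy(model(x), y)
+g_std = input_gradient(model, x, y, create_graph=True)     # differentiable
+g_gbp = input_gradient(model, x, y, guided=True).detach()  # fixed target
+reg = 1 - F.cosine_similarity(g_std.flatten(1), g_gbp.flatten(1)).mean()
+(ce + 0.1 * reg).backward()    # 0.1 is an arbitrary illustrative weight
+```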
+
+ comment: VISAPP 2024 +
+
+
+
+
+ + ☆ A review of deep learning-based information fusion techniques for + multimodal medical image classification + + +
+ Multimodal medical imaging plays a pivotal role in clinical diagnosis and
+research, as it combines information from various imaging modalities to provide
+a more comprehensive understanding of the underlying pathology. Recently, deep
+learning-based multimodal fusion techniques have emerged as powerful tools for
+improving medical image classification. This review offers a thorough analysis
+of the developments in deep learning-based multimodal fusion for medical
+classification tasks. We explore the complementary relationships among
+prevalent clinical modalities and outline three main fusion schemes for
+multimodal classification networks: input fusion, intermediate fusion
+(encompassing single-level fusion, hierarchical fusion, and attention-based
+fusion), and output fusion. By evaluating the performance of these fusion
+techniques, we provide insight into the suitability of different network
+architectures for various multimodal fusion scenarios and application domains.
+Furthermore, we delve into challenges related to network architecture
+selection, handling incomplete multimodal data, and the potential limitations
+of multimodal fusion. Finally, we spotlight the promising future of
+Transformer-based multimodal fusion techniques and give recommendations for
+future research in this rapidly evolving field.
+
+
+
+
+
+ + ☆ OccGen: Generative Multi-modal 3D Occupancy Prediction for Autonomous + Driving + + +
+ Existing solutions for 3D semantic occupancy prediction typically treat the
+task as a one-shot 3D voxel-wise segmentation perception problem. These
+discriminative methods focus on learning the mapping between the inputs and the
+occupancy map in a single step, lacking the ability to gradually refine the
+occupancy map and the scene-imaginative capacity needed to complete local
+regions. In this paper, we introduce OccGen, a simple yet powerful generative
+perception model for the task of 3D semantic occupancy prediction. OccGen
+adopts a ''noise-to-occupancy'' generative paradigm, progressively inferring
+and refining the occupancy map by predicting and eliminating noise originating
+from a random Gaussian distribution. OccGen consists of two main components: a
+conditional encoder that is capable of processing multi-modal inputs, and a
+progressive refinement decoder that applies diffusion denoising using the
+multi-modal features as conditions. A key insight of this generative pipeline
+is that the diffusion denoising process is naturally able to model the
+coarse-to-fine refinement of the dense 3D occupancy map, therefore producing
+more detailed predictions. Extensive experiments on several occupancy
+benchmarks demonstrate the effectiveness of the proposed method compared to the
+state-of-the-art methods. For instance, OccGen relatively enhances the mIoU by
+9.5%, 6.3%, and 13.3% on the nuScenes-Occupancy dataset under the multi-modal,
+LiDAR-only, and camera-only settings, respectively. Moreover, as a generative
+perception model, OccGen exhibits desirable properties that discriminative
+models cannot achieve, such as providing uncertainty estimates alongside its
+multiple-step predictions.
+
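+ As a schematic of the ''noise-to-occupancy'' idea (not OccGen's actual
+decoder, schedule, or conditioning), the following sketch starts from Gaussian
+noise over the voxel grid and repeatedly refines it with a denoiser conditioned
+on fused multi-modal features; the class count, step count, and update rule are
+placeholders.
+
+```python
+import torch
+
+@torch.no_grad()
+def noise_to_occupancy(denoiser, cond_feats, shape=(1, 17, 50, 50, 4), steps=4):
+    """Iteratively refine a noisy voxel grid into semantic occupancy logits."""
+    x = torch.randn(shape)                       # pure-noise "occupancy"
+    for t in reversed(range(steps)):
+        t_batch = torch.full((shape[0],), t)
+        pred = denoiser(x, t_batch, cond_feats)  # predicted clean logits
+        alpha = t / steps                        # simple deterministic blend
+        x = alpha * x + (1 - alpha) * pred       # ends exactly at pred (t=0)
+    return x.argmax(dim=1)                       # per-voxel semantic class
+
+dummy = lambda x, t, c: torch.zeros_like(x)      # stands in for a trained decoder
+occ = noise_to_occupancy(dummy, cond_feats=None) # (1, 50, 50, 4) class map
+```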
+
+
+
+
+ + ☆ X-3D: Explicit 3D Structure Modeling for Point Cloud Recognition + + +
+ Numerous prior studies predominantly emphasize constructing relation vectors
+for individual neighborhood points, generating dynamic kernels for each vector,
+and embedding these into high-dimensional spaces to capture implicit local
+structures. However, we contend that such implicit high-dimensional structure
+modeling approaches inadequately represent the local geometric structure of
+point clouds due to the absence of explicit structural information. Hence, we
+introduce X-3D, an explicit 3D structure modeling approach. X-3D functions by
+capturing the explicit local structural information within the input 3D space
+and employing it to produce dynamic kernels with shared weights for all
+neighborhood points within the current local region. This modeling approach
+introduces effective geometric priors and significantly diminishes the
+disparity between the local structure of the embedding space and the original
+input point cloud, thereby improving the extraction of local features.
+Experiments show that our method can be applied to a variety of methods and
+achieves state-of-the-art performance on segmentation, classification, and
+detection tasks with lower extra computational cost, such as \textbf{90.7\%} on
+ScanObjectNN for classification, \textbf{79.2\%} on S3DIS 6-fold and
+\textbf{74.3\%} on S3DIS Area 5 for segmentation, \textbf{76.3\%} on ScanNetV2
+for segmentation, and \textbf{64.5\%} mAP / \textbf{46.9\%} mAP on SUN RGB-D
+and \textbf{69.0\%} mAP / \textbf{51.1\%} mAP on ScanNetV2 for detection. Our
+code is available at
+\href{https://github.com/sunshuofeng/X-3D}{https://github.com/sunshuofeng/X-3D}.
+
+
+
+
+
+ + ☆ The Brain Tumor Segmentation in Pediatrics (BraTS-PEDs) Challenge: Focus + on Pediatrics (CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs) + + +
+ Pediatric tumors of the central nervous system are the most common cause of +cancer-related death in children. The five-year survival rate for high-grade +gliomas in children is less than 20%. Due to their rarity, the diagnosis of +these entities is often delayed, their treatment is mainly based on historic +treatment concepts, and clinical trials require multi-institutional +collaborations. Here we present the CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs +challenge, focused on pediatric brain tumors with data acquired across multiple +international consortia dedicated to pediatric neuro-oncology and clinical +trials. The CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs challenge brings together +clinicians and AI/imaging scientists to lead to faster development of automated +segmentation techniques that could benefit clinical trials, and ultimately the +care of children with brain tumors. + +
+
+
+
+
+ + ☆ External Prompt Features Enhanced Parameter-efficient Fine-tuning for + Salient Object Detection + + +
+ Salient object detection (SOD) aims at finding the most salient objects in
+images and outputs pixel-level binary masks. Transformer-based methods achieve
+promising performance due to their global semantic understanding, crucial for
+identifying salient objects. However, these models tend to be large and require
+numerous training parameters. To better harness the potential of transformers
+for SOD, we propose a novel parameter-efficient fine-tuning method aimed at
+reducing the number of training parameters while enhancing the salient object
+detection capability. Our model, termed EXternal Prompt features Enhanced
+adapteR Tuning (ExPert), features an encoder-decoder structure with adapters
+and injectors interspersed between the layers of a frozen transformer encoder.
+The adapter modules adapt the pre-trained backbone to SOD while the injector
+modules incorporate external prompt features to enhance the awareness of
+salient objects. Comprehensive experiments demonstrate the superiority of our
+method. Surpassing former state-of-the-art (SOTA) models across five SOD
+datasets, ExPert achieves 0.215 mean absolute error (MAE) on the ECSSD dataset
+with 80.2M trained parameters, 21% better than the transformer-based SOTA model
+and 47% better than the CNN-based SOTA model.
+
+
+
+
+
+ + ☆ CA-Stream: Attention-based pooling for interpretable image recognition CVPR + + +
+ Explanations obtained from transformer-based architectures in the form of raw
+attention can be seen as a class-agnostic saliency map. Additionally,
+attention-based pooling serves as a form of masking in the feature space.
+Motivated by this observation, we design an attention-based pooling mechanism
+intended to replace Global Average Pooling (GAP) at inference. This mechanism,
+called Cross-Attention Stream (CA-Stream), comprises a stream of cross
+attention blocks interacting with features at different network depths.
+CA-Stream enhances interpretability in models, while preserving recognition
+performance.
+
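+ A compact sketch of replacing GAP with a single learned query that
+cross-attends to the spatial features; CA-Stream itself runs a stream of such
+blocks across several network depths, which is omitted here, and the head
+count and dimensions are arbitrary.
+
+```python
+import torch
+import torch.nn as nn
+
+class CrossAttentionPool(nn.Module):
+    """Pool a (B, C, H, W) feature map with cross-attention instead of GAP."""
+    def __init__(self, dim: int, num_heads: int = 8):
+        super().__init__()
+        self.query = nn.Parameter(torch.randn(1, 1, dim))   # learned class query
+        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
+
+    def forward(self, feats: torch.Tensor) -> torch.Tensor:
+        b, c, h, w = feats.shape
+        tokens = feats.flatten(2).transpose(1, 2)            # (B, H*W, C)
+        pooled, weights = self.attn(self.query.expand(b, -1, -1), tokens, tokens)
+        # `weights` has shape (B, 1, H*W) and can be reshaped to (B, H, W)
+        # to serve as the class-agnostic saliency map mentioned above.
+        return pooled.squeeze(1)                             # (B, C)
+
+pool = CrossAttentionPool(dim=512)
+out = pool(torch.randn(2, 512, 7, 7))   # (2, 512), feeds a linear classifier
+```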
+
+ comment: CVPR XAI4CV workshop 2024 +
+
+
+
+
+ + ☆ Other Tokens Matter: Exploring Global and Local Features of Vision + Transformers for Object Re-Identification + + +
+ Object Re-Identification (Re-ID) aims to identify and retrieve specific +objects from images captured at different places and times. Recently, object +Re-ID has achieved great success with the advances of Vision Transformers +(ViT). However, the effects of the global-local relation have not been fully +explored in Transformers for object Re-ID. In this work, we first explore the +influence of global and local features of ViT and then further propose a novel +Global-Local Transformer (GLTrans) for high-performance object Re-ID. We find +that the features from last few layers of ViT already have a strong +representational ability, and the global and local information can mutually +enhance each other. Based on this fact, we propose a Global Aggregation Encoder +(GAE) to utilize the class tokens of the last few Transformer layers and learn +comprehensive global features effectively. Meanwhile, we propose the Local +Multi-layer Fusion (LMF) which leverages both the global cues from GAE and +multi-layer patch tokens to explore the discriminative local representations. +Extensive experiments demonstrate that our proposed method achieves superior +performance on four object Re-ID benchmarks. + +
+
+ comment: Accepted by CVIU2024. More modifications may be performed +
+
+
+
+
+ + ☆ SGFormer: Spherical Geometry Transformer for 360 Depth Estimation + + +
+ Panoramic distortion poses a significant challenge in 360 depth estimation, +particularly pronounced at the north and south poles. Existing methods either +adopt a bi-projection fusion strategy to remove distortions or model long-range +dependencies to capture global structures, which can result in either unclear +structure or insufficient local perception. In this paper, we propose a +spherical geometry transformer, named SGFormer, to address the above issues, +with an innovative step to integrate spherical geometric priors into vision +transformers. To this end, we retarget the transformer decoder to a spherical +prior decoder (termed SPDecoder), which endeavors to uphold the integrity of +spherical structures during decoding. Concretely, we leverage bipolar +re-projection, circular rotation, and curve local embedding to preserve the +spherical characteristics of equidistortion, continuity, and surface distance, +respectively. Furthermore, we present a query-based global conditional position +embedding to compensate for spatial structure at varying resolutions. It not +only boosts the global perception of spatial position but also sharpens the +depth structure across different patches. Finally, we conduct extensive +experiments on popular benchmarks, demonstrating our superiority over +state-of-the-art solutions. + +
+
+
+
+
+ + ☆ CAGE: Circumplex Affect Guided Expression Inference CVPR2024 + + +
+ Understanding emotions and expressions is a task of interest across multiple
+disciplines, especially for improving user experiences. Contrary to the common
+perception, it has been shown that emotions are not discrete entities but
+instead exist along a continuum. People understand discrete emotions
+differently due to a variety of factors, including cultural background,
+individual experiences, and cognitive biases. Therefore, most approaches to
+expression understanding, particularly those relying on discrete categories,
+are inherently biased. In this paper, we present a comparative in-depth
+analysis of two common datasets (AffectNet and EMOTIC) equipped with the
+components of the circumplex model of affect. Further, we propose a model for
+the prediction of facial expressions tailored for lightweight applications.
+Using a small-scale MaxViT-based model architecture, we evaluate the impact of
+training with discrete expression category labels alongside the continuous
+valence and arousal labels. We show that considering valence and arousal in
+addition to discrete category labels helps to significantly improve expression
+inference. The proposed model outperforms the current state-of-the-art models
+on AffectNet, establishing it as the best-performing model for inferring
+valence and arousal, achieving a 7% lower RMSE. Training scripts and trained
+weights to reproduce our results can be found here:
+https://github.com/wagner-niklas/CAGE_expression_inference.
+
+
+ comment: Accepted for publication at ABAW Workshop at CVPR2024 +
+
+
+
+
+ + ☆ CenterArt: Joint Shape Reconstruction and 6-DoF Grasp Estimation of + Articulated Objects ICRA 2024 + + +
+ Precisely grasping and reconstructing articulated objects is key to enabling +general robotic manipulation. In this paper, we propose CenterArt, a novel +approach for simultaneous 3D shape reconstruction and 6-DoF grasp estimation of +articulated objects. CenterArt takes RGB-D images of the scene as input and +first predicts the shape and joint codes through an encoder. The decoder then +leverages these codes to reconstruct 3D shapes and estimate 6-DoF grasp poses +of the objects. We further develop a mechanism for generating a dataset of +6-DoF grasp ground truth poses for articulated objects. CenterArt is trained on +realistic scenes containing multiple articulated objects with randomized +designs, textures, lighting conditions, and realistic depths. We perform +extensive experiments demonstrating that CenterArt outperforms existing methods +in accuracy and robustness. + +
+
+ comment: 4 pages, 2 figures, accepted to the ICRA 2024 Workshop on 3D Visual + Representations for Robot Manipulation +
+
+
+
+
+ + ☆ CoARF: Controllable 3D Artistic Style Transfer for Radiance Fields + + +
+ Creating artistic 3D scenes can be time-consuming and requires specialized +knowledge. To address this, recent works such as ARF, use a radiance +field-based approach with style constraints to generate 3D scenes that resemble +a style image provided by the user. However, these methods lack fine-grained +control over the resulting scenes. In this paper, we introduce Controllable +Artistic Radiance Fields (CoARF), a novel algorithm for controllable 3D scene +stylization. CoARF enables style transfer for specified objects, compositional +3D style transfer and semantic-aware style transfer. We achieve controllability +using segmentation masks with different label-dependent loss functions. We also +propose a semantic-aware nearest neighbor matching algorithm to improve the +style transfer quality. Our extensive experiments demonstrate that CoARF +provides user-specified controllability of style transfer and superior style +transfer quality with more precise feature matching. + +
+
+ comment: International Conference on 3D Vision 2024 +
+
+
+
+
+ + ☆ Mamba3D: Enhancing Local Features for 3D Point Cloud Analysis via State + Space Model + + +
+ Existing Transformer-based models for point cloud analysis suffer from +quadratic complexity, leading to compromised point cloud resolution and +information loss. In contrast, the newly proposed Mamba model, based on state +space models (SSM), outperforms Transformer in multiple areas with only linear +complexity. However, the straightforward adoption of Mamba does not achieve +satisfactory performance on point cloud tasks. In this work, we present +Mamba3D, a state space model tailored for point cloud learning to enhance local +feature extraction, achieving superior performance, high efficiency, and +scalability potential. Specifically, we propose a simple yet effective Local +Norm Pooling (LNP) block to extract local geometric features. Additionally, to +obtain better global features, we introduce a bidirectional SSM (bi-SSM) with +both a token forward SSM and a novel backward SSM that operates on the feature +channel. Extensive experimental results show that Mamba3D surpasses +Transformer-based counterparts and concurrent works in multiple tasks, with or +without pre-training. Notably, Mamba3D achieves multiple SoTA, including an +overall accuracy of 92.6% (train from scratch) on the ScanObjectNN and 95.1% +(with single-modal pre-training) on the ModelNet40 classification task, with +only linear complexity. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ DAWN: Domain-Adaptive Weakly Supervised Nuclei Segmentation via + Cross-Task Interactions + + +
+ Weakly supervised segmentation methods have gained significant attention due +to their ability to reduce the reliance on costly pixel-level annotations +during model training. However, the current weakly supervised nuclei +segmentation approaches typically follow a two-stage pseudo-label generation +and network training process. The performance of the nuclei segmentation +heavily relies on the quality of the generated pseudo-labels, thereby limiting +its effectiveness. This paper introduces a novel domain-adaptive weakly +supervised nuclei segmentation framework using cross-task interaction +strategies to overcome the challenge of pseudo-label generation. Specifically, +we utilize weakly annotated data to train an auxiliary detection task, which +assists the domain adaptation of the segmentation network. To enhance the +efficiency of domain adaptation, we design a consistent feature constraint +module integrating prior knowledge from the source domain. Furthermore, we +develop pseudo-label optimization and interactive training methods to improve +the domain transfer capability. To validate the effectiveness of our proposed +method, we conduct extensive comparative and ablation experiments on six +datasets. The results demonstrate the superiority of our approach over existing +weakly supervised approaches. Remarkably, our method achieves comparable or +even better performance than fully supervised methods. Our code will be +released in https://github.com/zhangye-zoe/DAWN. + +
+
+ comment: 13 pages, 11 figures, 8 tables +
+
+
+
+
+ + ☆ Traditional to Transformers: A Survey on Current Trends and Future + Prospects for Hyperspectral Image Classification + + +
+ Hyperspectral image classification is a challenging task due to the high
+dimensionality and complex nature of hyperspectral data. In recent years, deep
+learning techniques have emerged as powerful tools for addressing these
+challenges. This survey provides a comprehensive overview of the current trends
+and future prospects in hyperspectral image classification, focusing on the
+advancements from deep learning models to the emerging use of transformers. We
+review the key concepts, methodologies, and state-of-the-art approaches in deep
+learning for hyperspectral image classification. Additionally, we discuss the
+potential of transformer-based models in this field and highlight the
+advantages and challenges associated with these approaches. Comprehensive
+experiments have been conducted on three hyperspectral datasets to verify the
+efficacy of various conventional deep-learning models and Transformers.
+Finally, we outline future research directions and potential applications that
+can further enhance the accuracy and efficiency of hyperspectral image
+classification.
+ The source code is available at
+https://github.com/mahmad00/Conventional-to-Transformer-for-Hyperspectral-Image-Classification-Survey-2024.
+
+
+
+
+
+ + ☆ Leveraging Speech for Gesture Detection in Multimodal Communication + + +
+ Gestures are inherent to human interaction and often complement speech in +face-to-face communication, forming a multimodal communication system. An +important task in gesture analysis is detecting a gesture's beginning and end. +Research on automatic gesture detection has primarily focused on visual and +kinematic information to detect a limited set of isolated or silent gestures +with low variability, neglecting the integration of speech and vision signals +to detect gestures that co-occur with speech. This work addresses this gap by +focusing on co-speech gesture detection, emphasising the synchrony between +speech and co-speech hand gestures. We address three main challenges: the +variability of gesture forms, the temporal misalignment between gesture and +speech onsets, and differences in sampling rate between modalities. We +investigate extended speech time windows and employ separate backbone models +for each modality to address the temporal misalignment and sampling rate +differences. We utilize Transformer encoders in cross-modal and early fusion +techniques to effectively align and integrate speech and skeletal sequences. +The study results show that combining visual and speech information +significantly enhances gesture detection performance. Our findings indicate +that expanding the speech buffer beyond visual time segments improves +performance and that multimodal integration using cross-modal and early fusion +techniques outperforms baseline methods using unimodal and late fusion methods. +Additionally, we find a correlation between the models' gesture prediction +confidence and low-level speech frequency features potentially associated with +gestures. Overall, the study provides a better understanding and detection +methods for co-speech gestures, facilitating the analysis of multimodal +communication. + +
+
+
+
+
+ + ☆ Streamlining the Image Stitching Pipeline: Integrating Fusion and + Rectangling into a Unified Model + + +
+ Learning-based image stitching techniques typically involve three distinct +stages: registration, fusion, and rectangling. These stages are often performed +sequentially, each trained independently, leading to potential cascading error +propagation and complex parameter tuning challenges. In rethinking the +mathematical modeling of the fusion and rectangling stages, we discovered that +these processes can be effectively combined into a single, variety-intensity +inpainting problem. Therefore, we propose the Simple and Robust Stitcher +(SRStitcher), an efficient training-free image stitching method that merges the +fusion and rectangling stages into a unified model. By employing the weighted +mask and large-scale generative model, SRStitcher can solve the fusion and +rectangling problems in a single inference, without additional training or +fine-tuning of other models. Our method not only simplifies the stitching +pipeline but also enhances fault tolerance towards misregistration errors. +Extensive experiments demonstrate that SRStitcher outperforms state-of-the-art +(SOTA) methods in both quantitative assessments and qualitative evaluations. +The code is released at https://github.com/yayoyo66/SRStitcher + +
+
+
+
+
+ + ☆ Multi-Modal Prompt Learning on Blind Image Quality Assessment + + +
+ Image Quality Assessment (IQA) models benefit significantly from semantic
+information, which allows them to treat different types of objects distinctly.
+Currently, leveraging semantic information to enhance IQA is a crucial research
+direction. Traditional methods, hindered by a lack of sufficiently annotated
+data, have employed the CLIP image-text pretraining model as their backbone to
+gain semantic awareness. However, the generalist nature of these pre-trained
+Vision-Language (VL) models often renders them suboptimal for IQA-specific
+tasks. Recent approaches have attempted to address this mismatch using prompt
+technology, but these solutions have shortcomings. Existing prompt-based VL
+models overly focus on incremental semantic information from text, neglecting
+the rich insights available from visual data analysis. This imbalance limits
+their performance improvements in IQA tasks. This paper introduces an
+innovative multi-modal prompt-based methodology for IQA. Our approach employs
+carefully crafted prompts that synergistically mine incremental semantic
+information from both visual and linguistic data. Specifically, in the visual
+branch, we introduce a multi-layer prompt structure to enhance the VL model's
+adaptability. In the text branch, we deploy a dual-prompt scheme that steers
+the model to recognize and differentiate between scene category and distortion
+type, thereby refining the model's capacity to assess image quality. Our
+experimental findings underscore the effectiveness of our method over existing
+Blind Image Quality Assessment (BIQA) approaches. Notably, it demonstrates
+competitive performance across various datasets. Our method achieves Spearman
+Rank Correlation Coefficient (SRCC) values of 0.961 (surpassing 0.946 on CSIQ)
+and 0.941 (exceeding 0.930 on KADID), illustrating its robustness and accuracy
+in diverse contexts.
+
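+ For reference, the SRCC numbers quoted above are rank correlations between
+predicted quality scores and human mean opinion scores; a minimal computation
+with SciPy (toy data, not the paper's results):
+
+```python
+import numpy as np
+from scipy.stats import spearmanr
+
+def srcc(predicted: np.ndarray, mos: np.ndarray) -> float:
+    """Spearman Rank Correlation Coefficient between model scores and human
+    mean opinion scores (MOS); rank-based, so it rewards monotonic agreement
+    rather than linear fit."""
+    rho, _ = spearmanr(predicted, mos)
+    return float(rho)
+
+pred = np.array([0.81, 0.34, 0.55, 0.92, 0.10])
+mos = np.array([4.2, 2.1, 3.0, 4.8, 1.3])
+print(srcc(pred, mos))   # 1.0 for this perfectly monotonic toy example
+```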
+
+
+
+
+ + ☆ Pyramid Hierarchical Transformer for Hyperspectral Image Classification + + +
+ The traditional Transformer model encounters challenges with variable-length +input sequences, particularly in Hyperspectral Image Classification (HSIC), +leading to efficiency and scalability concerns. To overcome this, we propose a +pyramid-based hierarchical transformer (PyFormer). This innovative approach +organizes input data hierarchically into segments, each representing distinct +abstraction levels, thereby enhancing processing efficiency for lengthy +sequences. At each level, a dedicated transformer module is applied, +effectively capturing both local and global context. Spatial and spectral +information flow within the hierarchy facilitates communication and abstraction +propagation. Integration of outputs from different levels culminates in the +final input representation. Experimental results underscore the superiority of +the proposed method over traditional approaches. Additionally, the +incorporation of disjoint samples augments robustness and reliability, thereby +highlighting the potential of our approach in advancing HSIC. + The source code is available at https://github.com/mahmad00/PyFormer. + +
+
+
+
+
+ + ☆ Importance of Disjoint Sampling in Conventional and Transformer Models + for Hyperspectral Image Classification + + +
+ Disjoint sampling is critical for rigorous and unbiased evaluation of +state-of-the-art (SOTA) models. When training, validation, and test sets +overlap or share data, it introduces a bias that inflates performance metrics +and prevents accurate assessment of a model's true ability to generalize to new +examples. This paper presents an innovative disjoint sampling approach for +training SOTA models on Hyperspectral image classification (HSIC) tasks. By +separating training, validation, and test data without overlap, the proposed +method facilitates a fairer evaluation of how well a model can classify pixels +it was not exposed to during training or validation. Experiments demonstrate +the approach significantly improves a model's generalization compared to +alternatives that include training and validation data in test data. By +eliminating data leakage between sets, disjoint sampling provides reliable +metrics for benchmarking progress in HSIC. Researchers can have confidence that +reported performance truly reflects a model's capabilities for classifying new +scenes, not just memorized pixels. This rigorous methodology is critical for +advancing SOTA models and their real-world application to large-scale land +mapping with Hyperspectral sensors. + The source code is available at +https://github.com/mahmad00/Disjoint-Sampling-for-Hyperspectral-Image-Classification. + +
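+ An illustrative per-class disjoint split of labeled pixel indices is sketched
+below; the split ratios, the treatment of the background label, and any
+additional spatial constraints are assumptions rather than the paper's exact
+protocol.
+
+```python
+import numpy as np
+
+def disjoint_split(labels: np.ndarray, train=0.1, val=0.1, seed=0):
+    """Split labeled pixel indices per class into disjoint train/val/test sets.
+    labels: (H, W) ground-truth map with 0 = unlabeled. No index is reused."""
+    rng = np.random.default_rng(seed)
+    splits = {"train": [], "val": [], "test": []}
+    for cls in np.unique(labels):
+        if cls == 0:                       # skip unlabeled pixels
+            continue
+        idx = np.flatnonzero(labels == cls)
+        rng.shuffle(idx)
+        n_tr, n_va = int(len(idx) * train), int(len(idx) * val)
+        splits["train"].append(idx[:n_tr])
+        splits["val"].append(idx[n_tr:n_tr + n_va])
+        splits["test"].append(idx[n_tr + n_va:])
+    return {k: np.concatenate(v) for k, v in splits.items()}
+
+gt = np.random.default_rng(1).integers(0, 5, size=(145, 145))
+s = disjoint_split(gt)
+assert not set(s["train"]) & set(s["test"])   # no train/test leakage
+```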
+
+
+
+
+ + ☆ G3R: Generating Rich and Fine-grained mmWave Radar Data from 2D Videos + for Generalized Gesture Recognition + + +
+ Millimeter wave radar is gaining traction recently as a promising modality +for enabling pervasive and privacy-preserving gesture recognition. However, the +lack of rich and fine-grained radar datasets hinders progress in developing +generalized deep learning models for gesture recognition across various user +postures (e.g., standing, sitting), positions, and scenes. To remedy this, we +resort to designing a software pipeline that exploits wealthy 2D videos to +generate realistic radar data, but it needs to address the challenge of +simulating diversified and fine-grained reflection properties of user gestures. +To this end, we design G3R with three key components: (i) a gesture reflection +point generator expands the arm's skeleton points to form human reflection +points; (ii) a signal simulation model simulates the multipath reflection and +attenuation of radar signals to output the human intensity map; (iii) an +encoder-decoder model combines a sampling module and a fitting module to +address the differences in number and distribution of points between generated +and real-world radar data for generating realistic radar data. We implement and +evaluate G3R using 2D videos from public data sources and self-collected +real-world radar data, demonstrating its superiority over other +state-of-the-art approaches for gesture recognition. + +
+
+ comment: 18 pages, 29 figures +
+
+
+
+
+ + ☆ Mining Supervision for Dynamic Regions in Self-Supervised Monocular + Depth Estimation CVPR2024 + + +
+ This paper focuses on self-supervised monocular depth estimation in dynamic
+scenes trained on monocular videos. Existing methods jointly estimate
+pixel-wise depth and motion, relying mainly on an image reconstruction loss.
+Dynamic regions remain a critical challenge for these methods due to the
+inherent ambiguity in depth and motion estimation, resulting in inaccurate
+depth estimation. This paper proposes a self-supervised training framework
+exploiting pseudo depth labels for dynamic regions from training data. The key
+contribution of our framework is to decouple depth estimation for static and
+dynamic regions of images in the training data. We start with an unsupervised
+depth estimation approach, which provides reliable depth estimates for static
+regions and motion cues for dynamic regions and allows us to extract moving
+object information at the instance level. In the next stage, we use an object
+network to estimate the depth of those moving objects assuming rigid motions.
+Then, we propose a new scale alignment module to address the scale ambiguity
+between estimated depths for static and dynamic regions. We can then use the
+generated depth labels to train an end-to-end depth estimation network and
+improve its performance. Extensive experiments on the Cityscapes and KITTI
+datasets show that our self-training strategy consistently outperforms existing
+self/unsupervised depth estimation methods.
+
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ Driver Activity Classification Using Generalizable Representations from + Vision-Language Models + + +
+ Driver activity classification is crucial for ensuring road safety, with +applications ranging from driver assistance systems to autonomous vehicle +control transitions. In this paper, we present a novel approach leveraging +generalizable representations from vision-language models for driver activity +classification. Our method employs a Semantic Representation Late Fusion Neural +Network (SRLF-Net) to process synchronized video frames from multiple +perspectives. Each frame is encoded using a pretrained vision-language encoder, +and the resulting embeddings are fused to generate class probability +predictions. By leveraging contrastively-learned vision-language +representations, our approach achieves robust performance across diverse driver +activities. We evaluate our method on the Naturalistic Driving Action +Recognition Dataset, demonstrating strong accuracy across many classes. Our +results suggest that vision-language representations offer a promising avenue +for driver monitoring systems, providing both accuracy and interpretability +through natural language descriptors. + +
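+ A schematic of late fusion over per-view vision-language embeddings follows;
+the frozen encoder is a stand-in for any CLIP-style image encoder, and the
+plain MLP head is an assumption rather than the paper's SRLF-Net architecture.
+
+```python
+import torch
+import torch.nn as nn
+
+class LateFusionHead(nn.Module):
+    """Fuse frozen per-view embeddings and predict driver-activity probabilities."""
+    def __init__(self, embed_dim: int, num_views: int, num_classes: int):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(embed_dim * num_views, 512), nn.ReLU(),
+            nn.Linear(512, num_classes),
+        )
+
+    def forward(self, view_embeddings: torch.Tensor) -> torch.Tensor:
+        # view_embeddings: (B, num_views, embed_dim) from a frozen VL encoder
+        fused = view_embeddings.flatten(1)       # concatenate the views
+        return self.mlp(fused).softmax(dim=-1)   # class probabilities
+
+head = LateFusionHead(embed_dim=512, num_views=3, num_classes=16)
+probs = head(torch.randn(4, 3, 512))             # (4, 16)
+```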
+
+
+
+
+ + ☆ DENOISER: Rethinking the Robustness for Open-Vocabulary Action + Recognition + + +
+ As one of the fundamental video tasks in computer vision, Open-Vocabulary +Action Recognition (OVAR) recently gains increasing attention, with the +development of vision-language pre-trainings. To enable generalization of +arbitrary classes, existing methods treat class labels as text descriptions, +then formulate OVAR as evaluating embedding similarity between visual samples +and textual classes. However, one crucial issue is completely ignored: the +class descriptions given by users may be noisy, e.g., misspellings and typos, +limiting the real-world practicality of vanilla OVAR. To fill the research gap, +this paper pioneers to evaluate existing methods by simulating multi-level +noises of various types, and reveals their poor robustness. To tackle the noisy +OVAR task, we further propose one novel DENOISER framework, covering two parts: +generation and discrimination. Concretely, the generative part denoises noisy +class-text names via one decoding process, i.e., propose text candidates, then +utilize inter-modal and intra-modal information to vote for the best. At the +discriminative part, we use vanilla OVAR models to assign visual samples to +class-text names, thus obtaining more semantics. For optimization, we +alternately iterate between generative and discriminative parts for progressive +refinements. The denoised text classes help OVAR models classify visual samples +more accurately; in return, classified visual samples help better denoising. On +three datasets, we carry out extensive experiments to show our superior +robustness, and thorough ablations to dissect the effectiveness of each +component. + +
+
+
+
+
+ + ☆ Domain adaptive pose estimation via multi-level alignment + + +
+ Domain adaptive pose estimation aims to enable deep models trained on source
+domain (synthesized) datasets to produce similar results on target domain
+(real-world) datasets. Existing methods have made significant progress by
+conducting image-level or feature-level alignment. However, aligning at only a
+single level is not sufficient to fully bridge the domain gap and achieve
+excellent domain adaptive results. In this paper, we propose a multi-level
+domain adaptation approach, which aligns different domains at the image,
+feature, and pose levels. Specifically, we first utilize image style transfer
+to ensure that images from the source and target domains have a similar
+distribution. Subsequently, at the feature level, we employ adversarial
+training to make the features from the source and target domains preserve
+domain-invariant characteristics as much as possible. Finally, at the pose
+level, a self-supervised approach is utilized to enable the model to learn
+diverse knowledge, implicitly addressing the domain gap. Experimental results
+demonstrate that significant improvement can be achieved by the proposed
+multi-level alignment method in pose estimation, which outperforms the previous
+state-of-the-art in human pose estimation by up to 2.4% and in animal pose
+estimation by up to 3.1% for dogs and 1.4% for sheep.
+
+
+
+
+
+ + ☆ A sensitivity analysis to quantify the impact of neuroimaging + preprocessing strategies on subsequent statistical analyses + + +
+ Even though novel imaging techniques have been successful in studying brain
+structure and function, the measured biological signals are often contaminated
+by multiple sources of noise, arising due to e.g. head movements of the
+individual being scanned, limited spatial/temporal resolution, or other issues
+specific to each imaging technology. Data preprocessing (e.g. denoising) is
+therefore critical. Preprocessing pipelines have become increasingly complex
+over the years, but also more flexible, and this flexibility can have a
+significant impact on the final results and conclusions of a given study.
+Exploring this large parameter space is often referred to as multiverse
+analysis. Here, we provide conceptual and practical tools for statistical
+analyses that can aggregate multiple pipeline results, along with a new
+sensitivity analysis testing for hypotheses across pipelines such as "no effect
+across all pipelines" or "at least one pipeline with no effect". The proposed
+framework is generic and can be applied to any multiverse scenario, but we
+illustrate its use based on positron emission tomography data.
+
+
+
+
+
+ + ☆ Ultrasound Nodule Segmentation Using Asymmetric Learning with Simple + Clinical Annotation + + +
+ Recent advances in deep learning have greatly facilitated the automated
+segmentation of ultrasound images, which is essential for nodule morphological
+analysis. Nevertheless, most existing methods depend on extensive and precise
+annotations by domain experts, which are labor-intensive and time-consuming. In
+this study, we suggest using simple aspect ratio annotations directly from
+ultrasound clinical diagnoses for automated nodule segmentation. Specifically,
+an asymmetric learning framework is developed by extending the aspect ratio
+annotations with two types of pseudo labels, i.e., conservative labels and
+radical labels, to train two asymmetric segmentation networks simultaneously.
+Subsequently, a conservative-radical-balance strategy (CRBS) is proposed to
+complementarily combine radical and conservative labels. An inconsistency-aware
+dynamically mixed pseudo-label supervision (IDMPS) module is introduced to
+address the challenges of over-segmentation and under-segmentation caused by
+the two types of labels. To further leverage the spatial prior knowledge
+provided by clinical annotations, we also present a novel loss function, namely
+the clinical anatomy prior loss. Extensive experiments on two clinically
+collected ultrasound datasets (thyroid and breast) demonstrate the superior
+performance of our proposed method, which can achieve comparable and even
+better performance than fully supervised methods using ground truth
+annotations.
+
+
+ comment: Accepted by TCSVT +
+
+
+
+
+ + ☆ Ultrasound SAM Adapter: Adapting SAM for Breast Lesion Segmentation in + Ultrasound Images + + +
+ The Segment Anything Model (SAM) has recently achieved amazing results in the
+field of natural image segmentation. However, it is not effective for medical
+image segmentation, owing to the large domain gap between natural and medical
+images. In this paper, we mainly focus on ultrasound image segmentation. It is
+very difficult to train a foundation model on ultrasound image data due to the
+lack of large-scale annotated ultrasound images. To address these issues, we
+develop a novel Breast Ultrasound SAM Adapter, termed Breast Ultrasound Segment
+Anything Model (BUSSAM), which migrates SAM to the field of breast ultrasound
+image segmentation by using the adapter technique. To be specific, we first
+design a novel CNN image encoder, which is fully trained on the BUS dataset.
+Our CNN image encoder is more lightweight and focuses more on features of the
+local receptive field, which provides complementary information to the ViT
+branch in SAM. Then, we design a novel Cross-Branch Adapter to allow the CNN
+image encoder to fully interact with the ViT image encoder in the SAM module.
+Finally, we add both a Position Adapter and a Feature Adapter to the ViT branch
+to fine-tune the original SAM. The experimental results on the AMUBUS and BUSI
+datasets demonstrate that our proposed model significantly outperforms other
+medical image segmentation models. Our code will be available at:
+https://github.com/bscs12/BUSSAM.
+
+
+
+
+
+ + ☆ Semi-supervised 2D Human Pose Estimation via Adaptive Keypoint Masking + + +
+ Human pose estimation is a fundamental and challenging task in computer
+vision. Larger-scale and more accurate keypoint annotations, while helpful for
+improving the accuracy of supervised pose estimation, are often expensive and
+difficult to obtain. Semi-supervised pose estimation tries to leverage a large
+amount of unlabeled data to improve model performance, which can alleviate the
+problem of insufficient labeled samples. The latest semi-supervised learning
+methods usually adopt a strong-weak data augmentation teacher-student learning
+framework to deal with the challenge of human postural diversity and its
+long-tailed distribution. An appropriate data augmentation method is one of the
+key factors affecting the accuracy and generalization of semi-supervised
+models. To address the fact that fixed keypoint masking augmentation ignores
+differences in how well individual samples are learned, this paper proposes an
+adaptive keypoint masking method, which can fully mine the information in the
+samples and obtain better estimation performance. To further improve the
+generalization and robustness of the model, this paper also proposes a
+dual-branch data augmentation scheme, which can perform Mixup on samples and
+features on the basis of adaptive keypoint masking. The effectiveness of the
+proposed method is verified on COCO and MPII, outperforming the
+state-of-the-art semi-supervised pose estimation methods by 5.2% and 0.3%,
+respectively.
+
+
+ comment: China Multimedia 2023 +
+
+
+
+
+ + ☆ CoProNN: Concept-based Prototypical Nearest Neighbors for Explaining + Vision Models + + +
+ Mounting evidence in explainability for artificial intelligence (XAI) +research suggests that good explanations should be tailored to individual tasks +and should relate to concepts relevant to the task. However, building task +specific explanations is time consuming and requires domain expertise which can +be difficult to integrate into generic XAI methods. A promising approach +towards designing useful task specific explanations with domain experts is +based on compositionality of semantic concepts. Here, we present a novel +approach that enables domain experts to quickly create concept-based +explanations for computer vision tasks intuitively via natural language. +Leveraging recent progress in deep generative methods we propose to generate +visual concept-based prototypes via text-to-image methods. These prototypes are +then used to explain predictions of computer vision models via a simple +k-Nearest-Neighbors routine. The modular design of CoProNN is simple to +implement, it is straightforward to adapt to novel tasks and allows for +replacing the classification and text-to-image models as more powerful models +are released. The approach can be evaluated offline against the ground-truth of +predefined prototypes that can be easily communicated also to domain experts as +they are based on visual concepts. We show that our strategy competes very well +with other concept-based XAI approaches on coarse grained image classification +tasks and may even outperform those methods on more demanding fine grained +tasks. We demonstrate the effectiveness of our method for human-machine +collaboration settings in qualitative and quantitative user studies. All code +and experimental data can be found in our GitHub +$\href{https://github.com/TeodorChiaburu/beexplainable}{repository}$. + +
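+ A minimal sketch of the k-nearest-prototypes step: class prototypes are
+assumed to have been generated with a text-to-image model and embedded with
+some feature extractor (both outside this snippet); the nearest prototypes then
+provide both the prediction and the visual explanation.
+
+```python
+import numpy as np
+from collections import Counter
+
+def knn_explain(query_emb, proto_embs, proto_labels, k=5):
+    """Classify an image embedding by cosine similarity to concept prototypes.
+    proto_embs: (N, D) prototype embeddings; proto_labels: concept per prototype.
+    Returns the majority class among the k nearest prototypes plus the
+    neighbours themselves, which double as the explanation."""
+    q = query_emb / np.linalg.norm(query_emb)
+    p = proto_embs / np.linalg.norm(proto_embs, axis=1, keepdims=True)
+    sims = p @ q
+    top = np.argsort(-sims)[:k]
+    votes = Counter(proto_labels[i] for i in top)
+    pred = votes.most_common(1)[0][0]
+    return pred, [(proto_labels[i], float(sims[i])) for i in top]
+
+rng = np.random.default_rng(0)
+protos = rng.normal(size=(30, 128))              # stand-in prototype embeddings
+labels = np.array(["striped"] * 10 + ["spotted"] * 10 + ["plain"] * 10)
+pred, neighbours = knn_explain(rng.normal(size=128), protos, labels)
+```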
+
+ comment: 24 pages, 9 figures, 2 tables, accepted at WCXAI 2024 Valletta +
+
+
+
+
+ + ☆ Revisiting Neural Networks for Continual Learning: An Architectural + Perspective + + +
+ Efforts to overcome catastrophic forgetting have primarily centered around +developing more effective Continual Learning (CL) methods. In contrast, less +attention was devoted to analyzing the role of network architecture design +(e.g., network depth, width, and components) in contributing to CL. This paper +seeks to bridge this gap between network architecture design and CL, and to +present a holistic study on the impact of network architectures on CL. This +work considers architecture design at the network scaling level, i.e., width +and depth, and also at the network components, i.e., skip connections, global +pooling layers, and down-sampling. In both cases, we first derive insights +through systematically exploring how architectural designs affect CL. Then, +grounded in these insights, we craft a specialized search space for CL and +further propose a simple yet effective ArchCraft method to steer a CL-friendly +architecture, namely, this method recrafts AlexNet/ResNet into AlexAC/ResAC. +Experimental validation across various CL settings and scenarios demonstrates +that improved architectures are parameter-efficient, achieving state-of-the-art +performance of CL while being 86%, 61%, and 97% more compact in terms of +parameters than the naive CL architecture in Class IL and Task IL. Code is +available at https://github.com/byyx666/ArchCraft. + +
+
+
+
+
+ + ☆ CNN2GNN: How to Bridge CNN with GNN + + +
+ Although the convolutional neural network (CNN) has achieved excellent
+performance in vision tasks by extracting intra-sample representations, it
+incurs a high training cost because of the many stacked convolutional layers.
+Recently, as bilinear models, graph neural networks (GNN) have succeeded in
+exploring the underlying topological relationships among graph data with only a
+few graph neural layers. Unfortunately, they cannot be directly applied to
+non-graph data due to the lack of graph structure and have high inference
+latency in large-scale scenarios. Inspired by these complementary strengths and
+weaknesses, \textit{we discuss a natural question: how to bridge these two
+heterogeneous networks?} In this paper, we propose a novel CNN2GNN framework to
+unify CNN and GNN together via distillation. Firstly, to break the limitations
+of GNN, a differentiable sparse graph learning module is designed as the head
+of the networks to dynamically learn the graph for inductive learning. Then, a
+response-based distillation is introduced to transfer the knowledge from CNN to
+GNN and bridge these two heterogeneous networks. Notably, by extracting the
+intra-sample representation of a single instance and the topological
+relationships among the dataset simultaneously, the distilled ``boosted''
+two-layer GNN achieves much higher performance on Mini-ImageNet than CNNs
+containing dozens of layers, such as ResNet152.
+
+
+
+
+
+ + ☆ Visual-Augmented Dynamic Semantic Prototype for Generative Zero-Shot + Learning + + +
+ Generative Zero-shot learning (ZSL) learns a generator to synthesize visual +samples for unseen classes, which is an effective way to advance ZSL. However, +existing generative methods rely on the conditions of Gaussian noise and the +predefined semantic prototype, which limit the generator only optimized on +specific seen classes rather than characterizing each visual instance, +resulting in poor generalizations (\textit{e.g.}, overfitting to seen classes). +To address this issue, we propose a novel Visual-Augmented Dynamic Semantic +prototype method (termed VADS) to boost the generator to learn accurate +semantic-visual mapping by fully exploiting the visual-augmented knowledge into +semantic conditions. In detail, VADS consists of two modules: (1) Visual-aware +Domain Knowledge Learning module (VDKL) learns the local bias and global prior +of the visual features (referred to as domain visual knowledge), which replace +pure Gaussian noise to provide richer prior noise information; (2) +Vision-Oriented Semantic Updation module (VOSU) updates the semantic prototype +according to the visual representations of the samples. Ultimately, we +concatenate their output as a dynamic semantic prototype, which serves as the +condition of the generator. Extensive experiments demonstrate that our VADS +achieves superior CZSL and GZSL performances on three prominent datasets and +outperforms other state-of-the-art methods with averaging increases by 6.4\%, +5.9\% and 4.2\% on SUN, CUB and AWA2, respectively. + +
+
+
+
+
+ + ☆ Reference-Free Multi-Modality Volume Registration of X-Ray Microscopy + and Light-Sheet Fluorescence Microscopy + + +
+ Recently, X-ray microscopy (XRM) and light-sheet fluorescence microscopy +(LSFM) have emerged as two pivotal imaging tools in preclinical research on +bone remodeling diseases, offering micrometer-level resolution. Integrating +these complementary modalities provides a holistic view of bone +microstructures, facilitating function-oriented volume analysis across +different disease cycles. However, registering such independently acquired +large-scale volumes is extremely challenging under real and reference-free +scenarios. This paper presents a fast two-stage pipeline for volume +registration of XRM and LSFM. The first stage extracts the surface features and +employs two successive point cloud-based methods for coarse alignment. The +second stage fine-tunes the initial alignment using a modified +cross-correlation method, ensuring precise volumetric registration. Moreover, +we propose residual similarity as a novel metric to assess the alignment of two +complementary modalities. The results imply robust gradual improvement across +the stages. In the end, all correlating microstructures, particularly lacunae +in XRM and bone cells in LSFM, are precisely matched, enabling new insights +into bone diseases like osteoporosis which are a substantial burden in aging +societies. + +
+
+
+
+
+ + ☆ DesignProbe: A Graphic Design Benchmark for Multimodal Large Language + Models + + +
+ A well-executed graphic design typically achieves harmony on two levels, from
+the fine-grained design elements (color, font, and layout) to the overall
+design. This complexity makes the comprehension of graphic design challenging,
+for it requires the capability to both recognize the design elements and
+understand the design. With the rapid development of Multimodal Large Language
+Models (MLLMs), we establish DesignProbe, a benchmark to investigate the
+capability of MLLMs in design. Our benchmark includes eight tasks in total,
+across both the fine-grained element level and the overall design level. At the
+design element level, we consider both attribute recognition and semantic
+understanding tasks. At the overall design level, we include style and
+metaphor. Nine MLLMs are tested, and we apply GPT-4 as the evaluator. Besides,
+further experiments indicate that refining prompts can enhance the performance
+of MLLMs. We first rewrite the prompts with different LLMs and find that
+performance increases when a prompt is self-refined by the MLLM's own LLM. We
+then add extra task knowledge in two different ways (text descriptions and
+image examples), finding that adding images boosts performance much more than
+adding text.
+
+
+ comment: work in progress +
+
+
+
+
+ + ☆ ContextualFusion: Context-Based Multi-Sensor Fusion for 3D Object + Detection in Adverse Operating Conditions + + +
+ The fusion of multimodal sensor data streams such as camera images and lidar
+point clouds plays an important role in the operation of autonomous vehicles
+(AVs). Robust perception across a range of adverse weather and lighting
+conditions is specifically required for AVs to be deployed widely. While
+multi-sensor fusion networks have been previously developed for perception in
+sunny and clear weather conditions, these methods show a significant
+degradation in performance under night-time and poor weather conditions. In
+this paper, we propose a simple yet effective technique called ContextualFusion
+to incorporate the domain knowledge about cameras and lidars behaving
+differently across lighting and weather variations into 3D object detection
+models. Specifically, we design a Gated Convolutional Fusion (GatedConv)
+approach for the fusion of sensor streams based on the operational context. To
+aid in our evaluation, we use the open-source simulator CARLA to create a
+multimodal adverse-condition dataset called AdverseOp3D to address the bias of
+existing datasets towards daytime and good-weather conditions. Our
+ContextualFusion approach yields an mAP improvement of 6.2% over
+state-of-the-art methods on our context-balanced synthetic dataset. Finally,
+our method enhances state-of-the-art 3D object detection performance at night
+on the real-world NuScenes dataset with a significant mAP improvement of 11.7%.
+
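+ A hedged sketch of context-gated sensor fusion in PyTorch: per-channel gates
+predicted from an operational-context vector (e.g. night/rain flags) reweight
+camera and lidar features before a fusion convolution. The gate design and
+feature shapes are assumptions; the paper's GatedConv module may differ.
+
+import torch
+import torch.nn as nn
+
+class GatedFusion(nn.Module):
+    def __init__(self, cam_ch, lidar_ch, ctx_dim=2):
+        super().__init__()
+        # context = e.g. [is_night, is_rain]; predicts per-channel gates per stream
+        self.gate = nn.Sequential(nn.Linear(ctx_dim, cam_ch + lidar_ch), nn.Sigmoid())
+        self.fuse = nn.Conv2d(cam_ch + lidar_ch, cam_ch, kernel_size=3, padding=1)
+
+    def forward(self, cam_feat, lidar_feat, context):
+        g = self.gate(context)[..., None, None]              # (B, C_cam+C_lidar, 1, 1)
+        g_cam, g_lidar = g.split([cam_feat.size(1), lidar_feat.size(1)], dim=1)
+        fused = torch.cat([cam_feat * g_cam, lidar_feat * g_lidar], dim=1)
+        return self.fuse(fused)
+
+cam = torch.randn(2, 64, 128, 128)             # camera BEV features
+lidar = torch.randn(2, 64, 128, 128)           # lidar BEV features
+ctx = torch.tensor([[1.0, 0.0], [0.0, 1.0]])   # night/clear and day/rain contexts
+out = GatedFusion(64, 64)(cam, lidar, ctx)     # fused features for the detection head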
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ☆ Enhancing Prompt Following with Visual Control Through Training-Free + Mask-Guided Diffusion + + +
+ Recently, integrating visual controls into text-to-image (T2I) models, such
+as the ControlNet method, has received significant attention for finer control
+capabilities. While various training-free methods make efforts to enhance
+prompt following in T2I models, the issue with visual control is still rarely
+studied, especially in the scenario where visual controls are misaligned with
+text prompts. In this paper, we address the challenge of ``Prompt Following
+With Visual Control'' and propose a training-free approach named Mask-guided
+Prompt Following (MGPF). Object masks are introduced to distinguish the aligned
+and misaligned parts of visual controls and prompts. Meanwhile, a network,
+dubbed Masked ControlNet, is designed to utilize these object masks for object
+generation in the misaligned visual control region. Further, to improve
+attribute matching, a simple yet efficient loss is designed to align the
+attention maps of attributes with object regions constrained by ControlNet and
+object masks. The efficacy and superiority of MGPF are validated through
+comprehensive quantitative and qualitative experiments.
+
+
+
+
+
+ + ☆ Unified Unsupervised Salient Object Detection via Knowledge Transfer + + +
+ Recently, unsupervised salient object detection (USOD) has gained increasing +attention due to its annotation-free nature. However, current methods mainly +focus on specific tasks such as RGB and RGB-D, neglecting the potential for +task migration. In this paper, we propose a unified USOD framework for generic +USOD tasks. Firstly, we propose a Progressive Curriculum Learning-based +Saliency Distilling (PCL-SD) mechanism to extract saliency cues from a +pre-trained deep network. This mechanism starts with easy samples and +progressively moves towards harder ones, to avoid initial interference caused +by hard samples. Afterwards, the obtained saliency cues are utilized to train a +saliency detector, and we employ a Self-rectify Pseudo-label Refinement (SPR) +mechanism to improve the quality of pseudo-labels. Finally, an adapter-tuning +method is devised to transfer the acquired saliency knowledge, leveraging +shared knowledge to attain superior transferring performance on the target +tasks. Extensive experiments on five representative SOD tasks confirm the +effectiveness and feasibility of our proposed method. Code and supplement +materials are available at https://github.com/I2-Multimedia-Lab/A2S-v3. + +
+
+
+
+
+ + ☆ SkinGEN: an Explainable Dermatology Diagnosis-to-Generation Framework + with Interactive Vision-Language Models + + +
+ With the continuous advancement of vision language models (VLMs) technology, +remarkable research achievements have emerged in the dermatology field, the +fourth most prevalent human disease category. However, despite these +advancements, VLM still faces "hallucination" in dermatological diagnosis, and +due to the inherent complexity of dermatological conditions, existing tools +offer relatively limited support for user comprehension. We propose SkinGEN, a +diagnosis-to-generation framework that leverages the stable diffusion (SD) +method to generate reference demonstrations from diagnosis results provided by +VLM, thereby enhancing the visual explainability for users. Through extensive +experiments with Low-Rank Adaptation (LoRA), we identify optimal strategies for +skin condition image generation. We conduct a user study with 32 participants +evaluating both the system performance and explainability. Results demonstrate +that SkinGEN significantly improves users' comprehension of VLM predictions and +fosters increased trust in the diagnostic process. This work paves the way for +more transparent and user-centric VLM applications in dermatology and beyond. + +
+
+
+
+
+ + ☆ Grounded Knowledge-Enhanced Medical VLP for Chest X-Ray + + +
+ Medical vision-language pre-training has emerged as a promising approach for
+learning domain-general representations of medical images and text. Current
+algorithms that exploit the global and local alignment between medical images
+and text can, however, be marred by the redundant information in medical data.
+To address this issue, we propose a grounded knowledge-enhanced medical
+vision-language pre-training (GK-MVLP) framework for chest X-ray. In this
+framework, medical knowledge is grounded to the appropriate anatomical regions
+by using a transformer-based grounded knowledge-enhanced module for
+fine-grained alignment between anatomical region-level visual features and the
+textual features of medical knowledge. The performance of GK-MVLP is
+competitive with or exceeds the state of the art on downstream chest X-ray
+disease classification, disease localization, report generation, and medical
+visual question-answering tasks. Our results show the advantage of
+incorporating a grounding mechanism to remove biases and improve the alignment
+between chest X-ray images and radiology reports.
+
+
+
+
+
+ + ☆ Differentiable Score-Based Likelihoods: Learning CT Motion Compensation + From Clean Images + + +
+ Motion artifacts can compromise the diagnostic value of computed tomography +(CT) images. Motion correction approaches require a per-scan estimation of +patient-specific motion patterns. In this work, we train a score-based model to +act as a probability density estimator for clean head CT images. Given the +trained model, we quantify the deviation of a given motion-affected CT image +from the ideal distribution through likelihood computation. We demonstrate that +the likelihood can be utilized as a surrogate metric for motion artifact +severity in the CT image facilitating the application of an iterative, +gradient-based motion compensation algorithm. By optimizing the underlying +motion parameters to maximize likelihood, our method effectively reduces motion +artifacts, bringing the image closer to the distribution of motion-free scans. +Our approach achieves comparable performance to state-of-the-art methods while +eliminating the need for a representative data set of motion-affected samples. +This is particularly advantageous in real-world applications, where patient +motion patterns may exhibit unforeseen variability, ensuring robustness without +implicit assumptions about recoverable motion types. + +
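+ A conceptual PyTorch sketch of the optimization loop described above: motion
+parameters are updated by gradient ascent on a likelihood surrogate. Both
+reconstruct and log_likelihood below are toy stand-ins for the differentiable
+motion-compensated CT reconstruction and the trained score-based density
+estimator, neither of which is specified in the abstract.
+
+import torch
+
+def reconstruct(sinogram, motion):
+    # Stand-in for a differentiable, motion-compensated reconstruction:
+    # here we simply subtract a per-view shift encoded in `motion`.
+    return sinogram - motion.view(-1, 1)
+
+def log_likelihood(image):
+    # Stand-in for the score-based density estimate of clean head CT images;
+    # a real system would evaluate the trained score model's likelihood.
+    return -(image ** 2).mean()
+
+sinogram = torch.randn(16, 64) + 0.5           # toy motion-affected measurements
+motion = torch.zeros(16, requires_grad=True)   # one motion parameter per view
+opt = torch.optim.Adam([motion], lr=0.05)
+for _ in range(200):
+    opt.zero_grad()
+    loss = -log_likelihood(reconstruct(sinogram, motion))   # maximize likelihood
+    loss.backward()
+    opt.step()
+print(motion.detach()[:4])                      # estimated per-view motion parameters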
+
+
+
+
+ + ☆ TAAT: Think and Act from Arbitrary Texts in Text2Motion + + +
+ Text2Motion aims to generate human motions from texts. Existing datasets rely
+on the assumption that texts include action labels (such as "walk, bend, and
+pick up"), which is not flexible for practical scenarios. This paper redefines
+this problem with a more realistic assumption that the texts are arbitrary.
+Specifically, arbitrary texts include existing action texts composed of action
+labels (e.g., A person walks and bends to pick up something) and newly
+introduced scene texts without explicit action labels (e.g., A person notices
+his wallet on the ground ahead).
+ To bridge the gap between this realistic setting and existing datasets, we
+expand the action texts of the HumanML3D dataset with more scene texts, thereby
+creating a new HumanML3D++ dataset including arbitrary texts. On this
+challenging dataset, we benchmark existing state-of-the-art methods and propose
+a novel two-stage framework that extracts action labels from arbitrary texts
+with a Large Language Model (LLM) and then generates motions from the action
+labels. Extensive experiments are conducted under different application
+scenarios to validate the effectiveness of the proposed framework on the
+existing and proposed datasets. The results indicate that Text2Motion in this
+realistic setting is very challenging, fostering new research in this practical
+direction. Our dataset and code will be released.
+
+
+
+
+
+ + ☆ BMapOpt: Optimization of Brain Tissue Probability Maps using a + Differentiable MRI Simulator + + +
+ Reconstructing digital brain phantoms in the form of multi-channeled brain +tissue probability maps for individual subjects is essential for capturing +brain anatomical variability, understanding neurological diseases, as well as +for testing image processing methods. We demonstrate the first framework that +optimizes brain tissue probability maps (Gray Matter - GM, White Matter - WM, +and Cerebrospinal fluid - CSF) with the help of a Physics-based differentiable +MRI simulator that models the magnetization signal at each voxel in the image. +Given an observed $T_1$/$T_2$-weighted MRI scan, the corresponding clinical MRI +sequence, and the MRI differentiable simulator, we optimize the simulator's +input probability maps by back-propagating the L2 loss between the simulator's +output and the $T_1$/$T_2$-weighted scan. This approach has the significant +advantage of not relying on any training data, and instead uses the strong +inductive bias of the MRI simulator. We tested the model on 20 scans from the +BrainWeb database and demonstrate a highly accurate reconstruction of GM, WM, +and CSF. + +
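+ A hedged sketch of the optimization idea: tissue probability maps
+(parameterized by a softmax) are updated by back-propagating an L2 loss through
+a differentiable signal model. The toy spin-echo-style signal equation and
+timing values stand in for the physics-based simulator; a real setup would
+simulate the actual clinical sequence (and typically several contrasts) to
+disambiguate the tissues.
+
+import torch
+
+TR, TE = 0.5, 0.03                                    # assumed sequence timing (s)
+T1 = torch.tensor([1.3, 0.8, 3.0])                    # toy GM, WM, CSF T1 values (s)
+T2 = torch.tensor([0.11, 0.08, 1.8])                  # toy GM, WM, CSF T2 values (s)
+
+def simulate(prob_maps):
+    # spin-echo-like steady-state signal per tissue, mixed by the probability maps
+    s = (1 - torch.exp(-TR / T1)) * torch.exp(-TE / T2)     # (3,)
+    return (prob_maps * s.view(3, 1, 1)).sum(dim=0)         # (H, W) simulated image
+
+H = W = 64
+target = simulate(torch.softmax(torch.randn(3, H, W), dim=0))  # pretend observed scan
+logits = torch.zeros(3, H, W, requires_grad=True)              # optimized tissue maps
+opt = torch.optim.Adam([logits], lr=0.1)
+for _ in range(300):
+    opt.zero_grad()
+    loss = ((simulate(torch.softmax(logits, dim=0)) - target) ** 2).mean()  # L2 loss
+    loss.backward()                                  # gradients flow through the simulator
+    opt.step()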
+
+
+
+
+ + ☆ Bayesian Example Selection Improves In-Context Learning for Speech, + Text, and Visual Modalities + + +
+ Large language models (LLMs) can adapt to new tasks through in-context +learning (ICL) based on a few examples presented in dialogue history without +any model parameter update. Despite such convenience, the performance of ICL +heavily depends on the quality of the in-context examples presented, which +makes the in-context example selection approach a critical choice. This paper +proposes a novel Bayesian in-Context example Selection method (ByCS) for ICL. +Extending the inference probability conditioned on in-context examples based on +Bayes' theorem, ByCS focuses on the inverse inference conditioned on test +input. Following the assumption that accurate inverse inference probability +(likelihood) will result in accurate inference probability (posterior), +in-context examples are selected based on their inverse inference results. +Diverse and extensive cross-tasking and cross-modality experiments are +performed with speech, text, and image examples. Experimental results show the +efficacy and robustness of our ByCS method on various models, tasks and +modalities. + +
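+ A minimal sketch of the selection loop implied above: each candidate example
+is scored by how well the model recovers its known label when the test input is
+used as context (inverse inference), and the top-scoring examples are kept.
+lm_label_logprob is a random stand-in for a real LLM scoring call.
+
+import math
+import random
+
+def lm_label_logprob(context, x, y):
+    # Stand-in for log P(y | context, x); a real system would query an LLM here.
+    random.seed(hash((context, x, y)) % 10_000)
+    return math.log(random.uniform(0.05, 1.0))
+
+def select_examples(test_input, candidates, k=4):
+    # candidates: list of (x_i, y_i) labelled examples
+    scored = []
+    for x_i, y_i in candidates:
+        # inverse inference: condition on the test input, recover the candidate's label
+        scored.append((lm_label_logprob(test_input, x_i, y_i), (x_i, y_i)))
+    scored.sort(key=lambda t: t[0], reverse=True)
+    return [ex for _, ex in scored[:k]]
+
+pool = [(f"sentence {i}", "pos" if i % 2 else "neg") for i in range(20)]
+print(select_examples("the movie was great", pool, k=4))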
+
+ comment: 16 pages, 6 figures +
+
+
+
+
+ + ☆ FINEMATCH: Aspect-based Fine-grained Image and Text Mismatch Detection + and Correction + + +
+ Recent progress in large-scale pre-training has led to the development of +advanced vision-language models (VLMs) with remarkable proficiency in +comprehending and generating multimodal content. Despite the impressive ability +to perform complex reasoning for VLMs, current models often struggle to +effectively and precisely capture the compositional information on both the +image and text sides. To address this, we propose FineMatch, a new aspect-based +fine-grained text and image matching benchmark, focusing on text and image +mismatch detection and correction. This benchmark introduces a novel task for +boosting and evaluating the VLMs' compositionality for aspect-based +fine-grained text and image matching. In this task, models are required to +identify mismatched aspect phrases within a caption, determine the aspect's +class, and propose corrections for an image-text pair that may contain between +0 and 3 mismatches. To evaluate the models' performance on this new task, we +propose a new evaluation metric named ITM-IoU for which our experiments show a +high correlation to human evaluation. In addition, we also provide a +comprehensive experimental analysis of existing mainstream VLMs, including +fully supervised learning and in-context learning settings. We have found that +models trained on FineMatch demonstrate enhanced proficiency in detecting +fine-grained text and image mismatches. Moreover, models (e.g., GPT-4V, Gemini +Pro Vision) with strong abilities to perform multimodal in-context learning are +not as skilled at fine-grained compositional image and text matching analysis. +With FineMatch, we are able to build a system for text-to-image generation +hallucination detection and correction. + +
+
+
+
+
+ + ☆ SC-HVPPNet: Spatial and Channel Hybrid-Attention Video Post-Processing + Network with CNN and Transformer + + +
+ Convolutional Neural Network (CNN) and Transformer have attracted much +attention recently for video post-processing (VPP). However, the interaction +between CNN and Transformer in existing VPP methods is not fully explored, +leading to inefficient communication between the local and global extracted +features. In this paper, we explore the interaction between CNN and Transformer +in the task of VPP, and propose a novel Spatial and Channel Hybrid-Attention +Video Post-Processing Network (SC-HVPPNet), which can cooperatively exploit the +image priors in both spatial and channel domains. Specifically, in the spatial +domain, a novel spatial attention fusion module is designed, in which two +attention weights are generated to fuse the local and global representations +collaboratively. In the channel domain, a novel channel attention fusion module +is developed, which can blend the deep representations at the channel dimension +dynamically. Extensive experiments show that SC-HVPPNet notably boosts video +restoration quality, with average bitrate savings of 5.29%, 12.42%, and 13.09% +for Y, U, and V components in the VTM-11.0-NNVC RA configuration. + +
+
+
+
+
+ + ☆ Think-Program-reCtify: 3D Situated Reasoning with Large Language Models + + +
+ This work addresses the 3D situated reasoning task, which aims to answer
+questions given egocentric observations in a 3D environment. The task remains
+challenging as it requires comprehensive 3D perception and complex reasoning
+skills. End-to-end models trained on supervised data for 3D situated reasoning
+suffer from data scarcity and limited generalization ability. Inspired by the
+recent success of leveraging large language models (LLMs) for visual reasoning,
+we propose LLM-TPC, a novel framework that leverages the planning, tool usage,
+and reflection capabilities of LLMs through a Think-Program-reCtify loop. The
+Think phase first decomposes the compositional question into a sequence of
+steps, and then the Program phase grounds each step to a piece of code and
+calls carefully designed 3D visual perception modules. Finally, the Rectify
+phase adjusts the plan and code if the program fails to execute. Experiments
+and analysis on the SQA3D benchmark demonstrate the effectiveness,
+interpretability and robustness of our method. Our code is publicly available
+at https://qingrongh.github.io/LLM-TPC/.
+
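+ A minimal sketch of a Think-Program-reCtify loop. The llm function and the
+single count_objects tool are toy stand-ins; the real framework prompts an LLM
+and exposes carefully designed 3D visual perception modules as callable tools.
+
+def llm(prompt):
+    # stand-in for a chat-completion call returning a plan or a code string
+    return "result = count_objects('chair')" if "Program" in prompt else "Step 1: count chairs"
+
+TOOLS = {"count_objects": lambda category: 3}    # toy 3D perception module
+
+def think_program_rectify(question, max_rounds=3):
+    plan = llm(f"Think: decompose the question into steps.\nQ: {question}")
+    feedback = ""
+    for _ in range(max_rounds):
+        code = llm(f"Program: write code for the plan.\nPlan: {plan}\n{feedback}")
+        try:
+            scope = dict(TOOLS)
+            exec(code, scope)                    # ground each step in perception tools
+            return scope.get("result")
+        except Exception as err:                 # Rectify: revise on execution failure
+            feedback = f"Previous program failed with: {err}. Revise it."
+    return None
+
+print(think_program_rectify("How many chairs are behind me?"))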
+
+
+
+
+ + ☆ Unsupervised Domain Adaptation Architecture Search with Self-Training + for Land Cover Mapping CVPR + + +
+ Unsupervised domain adaptation (UDA) is a challenging open problem in land
+cover mapping. Previous studies show encouraging progress in addressing
+cross-domain distribution shifts on remote sensing benchmarks for land cover
+mapping. The existing works are mainly built on large neural network
+architectures, which makes them resource-hungry systems, limiting their
+practical impact for many real-world applications in resource-constrained
+environments. Thus, we propose a simple yet effective framework to search for
+lightweight neural networks automatically for land cover mapping tasks under
+domain shifts. This is achieved by integrating Markov random field neural
+architecture search (MRF-NAS) into a self-training UDA framework to search for
+efficient and effective networks under a limited computation budget. This is
+the first attempt to combine NAS with self-training UDA as a single framework
+for land cover mapping. We also investigate two different pseudo-labelling
+approaches (confidence-based and energy-based) in the self-training scheme.
+Experimental results on two recent datasets (OpenEarthMap & FLAIR #1) for
+remote sensing UDA demonstrate satisfactory performance. With fewer than
+2M parameters and 30.16 GFLOPs, the best-discovered lightweight network reaches
+state-of-the-art performance on the regional target domain of OpenEarthMap
+(59.38% mIoU) and the considered target domain of FLAIR #1 (51.19% mIoU). The
+code is at https://github.com/cliffbb/UDA-NAS.
+
+
+ comment: Accepted at CVPRW 2024 +
+
+
+
+
+ + ☆ Adaptive Prompt Learning with Negative Textual Semantics and Uncertainty + Modeling for Universal Multi-Source Domain Adaptation ICME2024 + + +
+ Universal Multi-source Domain Adaptation (UniMDA) transfers knowledge from +multiple labeled source domains to an unlabeled target domain under domain +shifts (different data distribution) and class shifts (unknown target classes). +Existing solutions focus on excavating image features to detect unknown +samples, ignoring abundant information contained in textual semantics. In this +paper, we propose an Adaptive Prompt learning with Negative textual semantics +and uncErtainty modeling method based on Contrastive Language-Image +Pre-training (APNE-CLIP) for UniMDA classification tasks. Concretely, we +utilize the CLIP with adaptive prompts to leverage textual information of class +semantics and domain representations, helping the model identify unknown +samples and address domain shifts. Additionally, we design a novel global +instance-level alignment objective by utilizing negative textual semantics to +achieve more precise image-text pair alignment. Furthermore, we propose an +energy-based uncertainty modeling strategy to enlarge the margin distance +between known and unknown samples. Extensive experiments demonstrate the +superiority of our proposed method. + +
+
+ comment: Accepted by ICME2024 +
+
+
+
+
+ + ☆ Double Privacy Guard: Robust Traceable Adversarial Watermarking against + Face Recognition + + +
+ The wide deployment of Face Recognition (FR) systems poses risks of privacy
+leakage. One countermeasure to address this issue is adversarial attacks, which
+deceive malicious FR searches but simultaneously interfere with the normal
+identity verification of trusted authorizers. In this paper, we propose the
+first Double Privacy Guard (DPG) scheme based on traceable adversarial
+watermarking. DPG employs a one-time watermark embedding to deceive
+unauthorized FR models and allows authorizers to perform identity verification
+by extracting the watermark. Specifically, we propose an information-guided
+adversarial attack against FR models. The encoder embeds an identity-specific
+watermark into the deep feature space of the carrier, guiding recognizable
+features of the image to deviate from the source identity. We further adopt a
+collaborative meta-optimization strategy compatible with sub-tasks, which
+regularizes the joint optimization direction of the encoder and decoder. This
+strategy enhances the representation of universal carrier features, mitigating
+multi-objective optimization conflicts in watermarking. Experiments confirm
+that DPG achieves significant attack success rates and traceability accuracy on
+state-of-the-art FR models, exhibiting remarkable robustness that outperforms
+existing privacy protection methods using adversarial attacks and deep
+watermarking, or simple combinations of the two. Our work potentially opens up
+new insights into proactive protection for FR privacy.
+
+
+
+
+
+ + ☆ Pegasus-v1 Technical Report + + +
+ This technical report introduces Pegasus-1, a multimodal language model +specialized in video content understanding and interaction through natural +language. Pegasus-1 is designed to address the unique challenges posed by video +data, such as interpreting spatiotemporal information, to offer nuanced video +content comprehension across various lengths. This technical report overviews +Pegasus-1's architecture, training strategies, and its performance in +benchmarks on video conversation, zero-shot video question answering, and video +summarization. We also explore qualitative characteristics of Pegasus-1 , +demonstrating its capabilities as well as its limitations, in order to provide +readers a balanced view of its current state and its future direction. + +
+
+
+
+
+ + ☆ 3DBench: A Scalable 3D Benchmark and Instruction-Tuning Dataset + + +
+ Evaluating the performance of Multi-modal Large Language Models (MLLMs), +integrating both point cloud and language, presents significant challenges. The +lack of a comprehensive assessment hampers determining whether these models +truly represent advancements, thereby impeding further progress in the field. +Current evaluations heavily rely on classification and caption tasks, falling +short in providing a thorough assessment of MLLMs. A pressing need exists for a +more sophisticated evaluation method capable of thoroughly analyzing the +spatial understanding and expressive capabilities of these models. To address +these issues, we introduce a scalable 3D benchmark, accompanied by a +large-scale instruction-tuning dataset known as 3DBench, providing an +extensible platform for a comprehensive evaluation of MLLMs. Specifically, we +establish the benchmark that spans a wide range of spatial and semantic scales, +from object-level to scene-level, addressing both perception and planning +tasks. Furthermore, we present a rigorous pipeline for automatically +constructing scalable 3D instruction-tuning datasets, covering 10 diverse +multi-modal tasks with more than 0.23 million QA pairs generated in total. +Thorough experiments evaluating trending MLLMs, comparisons against existing +datasets, and variations of training protocols demonstrate the superiority of +3DBench, offering valuable insights into current limitations and potential +research directions. + +
+
+
+
+
+ + ☆ DreamPBR: Text-driven Generation of High-resolution SVBRDF with + Multi-modal Guidance + + +
+ Prior material creation methods had limitations in producing diverse results +mainly because reconstruction-based methods relied on real-world measurements +and generation-based methods were trained on relatively small material +datasets. To address these challenges, we propose DreamPBR, a novel +diffusion-based generative framework designed to create spatially-varying +appearance properties guided by text and multi-modal controls, providing high +controllability and diversity in material generation. Key to achieving diverse +and high-quality PBR material generation lies in integrating the capabilities +of recent large-scale vision-language models trained on billions of text-image +pairs, along with material priors derived from hundreds of PBR material +samples. We utilize a novel material Latent Diffusion Model (LDM) to establish +the mapping between albedo maps and the corresponding latent space. The latent +representation is then decoded into full SVBRDF parameter maps using a +rendering-aware PBR decoder. Our method supports tileable generation through +convolution with circular padding. Furthermore, we introduce a multi-modal +guidance module, which includes pixel-aligned guidance, style image guidance, +and 3D shape guidance, to enhance the control capabilities of the material LDM. +We demonstrate the effectiveness of DreamPBR in material creation, showcasing +its versatility and user-friendliness on a wide range of controllable +generation and editing applications. + +
+
+ comment: 16 pages, 17 figures +
+
+
+
+
+ + ☆ HOIN: High-Order Implicit Neural Representations + + +
+ Implicit neural representations (INR) suffer from worsening spectral bias, +which results in overly smooth solutions to the inverse problem. To deal with +this problem, we propose a universal framework for processing inverse problems +called \textbf{High-Order Implicit Neural Representations (HOIN)}. By refining +the traditional cascade structure to foster high-order interactions among +features, HOIN enhances the model's expressive power and mitigates spectral +bias through its neural tangent kernel's (NTK) strong diagonal properties, +accelerating and optimizing inverse problem resolution. By analyzing the +model's expression space, high-order derivatives, and the NTK matrix, we +theoretically validate the feasibility of HOIN. HOIN realizes 1 to 3 dB +improvements in most inverse problems, establishing a new state-of-the-art +recovery quality and training efficiency, thus providing a new general paradigm +for INR and paving the way for it to solve the inverse problem. + +
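+ One plausible reading of "high-order interactions among features" is a
+multiplicative path in each block, so that stacking blocks yields progressively
+higher-order terms in the input encoding. The sketch below illustrates that
+idea only; the block design, activation and depth are assumptions rather than
+the paper's architecture.
+
+import torch
+import torch.nn as nn
+
+class HighOrderBlock(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.linear = nn.Linear(dim, dim)
+    def forward(self, h):
+        return torch.relu(self.linear(h)) * h     # multiplicative (high-order) path
+
+class HighOrderINR(nn.Module):
+    def __init__(self, in_dim=2, dim=128, depth=4):
+        super().__init__()
+        self.stem = nn.Linear(in_dim, dim)
+        self.blocks = nn.ModuleList(HighOrderBlock(dim) for _ in range(depth))
+        self.head = nn.Linear(dim, 3)              # e.g. RGB value at each coordinate
+    def forward(self, coords):
+        h = torch.sin(self.stem(coords))           # sinusoidal encoding of coordinates
+        for blk in self.blocks:
+            h = blk(h)
+        return self.head(h)
+
+coords = torch.rand(1024, 2)                       # image coordinates in [0, 1]^2
+print(HighOrderINR()(coords).shape)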
+
+
+
+
+ + ☆ LaneCorrect: Self-supervised Lane Detection + + +
+ Lane detection has evolved into a highly functional component of autonomous
+driving systems for understanding driving scenes, even in complex environments.
+In this paper, we work towards developing a generalized computer vision system
+able to detect lanes without using any annotation. We make the following
+contributions: (i) We illustrate how to perform unsupervised 3D lane
+segmentation by leveraging the distinctive intensity of lanes on the LiDAR
+point cloud frames, and then obtain the noisy lane labels in the 2D plane by
+projecting the 3D points; (ii) We propose a novel self-supervised training
+scheme, dubbed LaneCorrect, that automatically corrects the lane labels by
+learning geometric consistency and instance awareness from the adversarial
+augmentations; (iii) With the self-supervised pre-trained model, we distill it
+to train a student network for an arbitrary target lane benchmark (e.g.,
+TuSimple) without any human labels; (iv) We thoroughly evaluate our
+self-supervised method on four major lane detection benchmarks (including
+TuSimple, CULane, CurveLanes and LLAMAS) and demonstrate excellent performance
+compared with existing supervised counterparts, whilst showing more effective
+results in alleviating the domain gap, i.e., training on CULane and testing on
+TuSimple.
+
+
+
+
+
+ + ☆ 3DFlowRenderer: One-shot Face Re-enactment via Dense 3D Facial Flow + Estimation + + +
+ Performing facial expression transfer under one-shot setting has been +increasing in popularity among research community with a focus on precise +control of expressions. Existing techniques showcase compelling results in +perceiving expressions, but they lack robustness with extreme head poses. They +also struggle to accurately reconstruct background details, thus hindering the +realism. In this paper, we propose a novel warping technology which integrates +the advantages of both 2D and 3D methods to achieve robust face re-enactment. +We generate dense 3D facial flow fields in feature space to warp an input image +based on target expressions without depth information. This enables explicit 3D +geometric control for re-enacting misaligned source and target faces. We +regularize the motion estimation capability of the 3D flow prediction network +through proposed "Cyclic warp loss" by converting warped 3D features back into +2D RGB space. To ensure the generation of finer facial region with +natural-background, our framework only renders the facial foreground region +first and learns to inpaint the blank area which needs to be filled due to +source face translation, thus reconstructing the detailed background without +any unwanted pixel motion. Extensive evaluation reveals that our method +outperforms state-of-the-art techniques in rendering artifact-free facial +images. + +
+
+
+
+
+ + ☆ First Mapping the Canopy Height of Primeval Forests in the Tallest Tree + Area of Asia + + +
+ We have developed the world's first canopy height map of the distribution
+area of world-level giant trees. This mapping is crucial for discovering more
+individual and community world-level giant trees, and for analyzing and
+quantifying the effectiveness of biodiversity conservation measures in the
+Yarlung Tsangpo Grand Canyon (YTGC) National Nature Reserve. We propose a
+method to map the canopy height of the primeval forest within the world-level
+giant tree distribution area using deep learning driven by the fusion of
+spaceborne LiDAR (Global Ecosystem Dynamics Investigation (GEDI) and ICESat-2)
+with Sentinel-2 satellite imagery. We customized a pyramid receptive field
+depth-separable CNN (PRFXception), an architecture tailored to mapping primeval
+forest canopy height, to infer the canopy height at the footprint level of GEDI
+and ICESat-2 from Sentinel-2 optical imagery with a 10-meter spatial
+resolution. We conducted a field survey of 227 permanent plots using a
+stratified sampling method and measured several giant trees using UAV-LS. The
+predicted canopy height was compared with ICESat-2 and GEDI validation data
+(RMSE = 7.56 m, MAE = 6.07 m, ME = -0.98 m, R^2 = 0.58), UAV-LS point clouds
+(RMSE = 5.75 m, MAE = 3.72 m, ME = 0.82 m, R^2 = 0.65), and ground survey data
+(RMSE = 6.75 m, MAE = 5.56 m, ME = 2.14 m, R^2 = 0.60). We mapped the potential
+distribution of world-level giant trees and discovered two previously
+undetected giant tree communities with an 89% probability of having trees
+80-100 m tall, potentially taller than Asia's tallest tree. This paper provides
+scientific evidence confirming southeastern Tibet--northwestern Yunnan as the
+fourth global distribution center of world-level giant trees, and promotes the
+inclusion of the YTGC giant tree distribution area within the scope of China's
+national park conservation initiatives.
+
+
+
+
+
+ + ☆ Progressive Token Length Scaling in Transformer Encoders for Efficient + Universal Segmentation + + +
+ A powerful architecture for universal segmentation relies on transformers +that encode multi-scale image features and decode object queries into mask +predictions. With efficiency being a high priority for scaling such models, we +observed that the state-of-the-art method Mask2Former uses ~50% of its compute +only on the transformer encoder. This is due to the retention of a full-length +token-level representation of all backbone feature scales at each encoder +layer. With this observation, we propose a strategy termed PROgressive Token +Length SCALing for Efficient transformer encoders (PRO-SCALE) that can be +plugged-in to the Mask2Former-style segmentation architectures to significantly +reduce the computational cost. The underlying principle of PRO-SCALE is: +progressively scale the length of the tokens with the layers of the encoder. +This allows PRO-SCALE to reduce computations by a large margin with minimal +sacrifice in performance (~52% GFLOPs reduction with no drop in performance on +COCO dataset). We validate our framework on multiple public benchmarks. + +
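+ A hedged sketch of progressively scaling the token length across encoder
+layers: early layers attend only over the coarsest backbone scale, and finer
+scales are appended at later layers. The schedule and layer design below are
+assumptions; PRO-SCALE's exact mechanism may differ.
+
+import torch
+import torch.nn as nn
+
+embed_dim, num_layers = 256, 6
+layers = nn.ModuleList(
+    nn.TransformerEncoderLayer(embed_dim, nhead=8, batch_first=True)
+    for _ in range(num_layers)
+)
+
+# multi-scale backbone features flattened into tokens: coarse -> fine
+scales = [torch.randn(1, n, embed_dim) for n in (100, 400, 1600)]
+
+tokens = scales[0]                        # start from the coarsest (cheapest) scale
+introduce_at = {2: 1, 4: 2}               # assumed schedule: layer index -> scale index
+for i, layer in enumerate(layers):
+    if i in introduce_at:                 # progressively grow the token length
+        tokens = torch.cat([tokens, scales[introduce_at[i]]], dim=1)
+    tokens = layer(tokens)
+print(tokens.shape)                       # full-length tokens only in the later layers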
+
+
+
+
+ + ☆ Machine Vision Based Assessment of Fall Color Changes in Apple Trees: + Exploring Relationship with Leaf Nitrogen Concentration + + +
+ Apple trees, being deciduous, shed their leaves each year, which is preceded
+by the change in leaf color from green to yellow (also known as senescence)
+during the fall season. The rate and timing of color change are affected by a
+number of factors, including nitrogen (N) deficiencies. The green color of
+leaves is highly dependent on the chlorophyll content, which in turn depends on
+the nitrogen concentration in the leaves. The assessment of leaf color can give
+vital information on the nutrient status of the tree. The use of a machine
+vision based system to capture and quantify these timings and changes in leaf
+color can be a great tool for that purpose.
+ This study is based on data collected during the fall of 2021 and 2023
+at a commercial orchard using a ground-based stereo-vision sensor for five
+weeks. The point cloud obtained from the sensor was segmented to get just the
+tree in the foreground. The study involved the segmentation of the trees in a
+natural background using point cloud data and quantification of the color using
+a custom-defined metric, \textit{yellowness index}, varying from $-1$ to $+1$
+($-1$ being completely green and $+1$ being completely yellow), which gives the
+proportion of yellow leaves on a tree. The performance of a K-means-based
+algorithm and a gradient boosting algorithm was compared for \textit{yellowness
+index} calculation. The segmentation method proposed in the study was able to
+estimate the \textit{yellowness index} on the trees with $R^2 = 0.72$. The
+results showed that the metric was able to capture the gradual color transition
+from green to yellow over the study duration. It was also observed that the
+trees with lower nitrogen showed the color transition to yellow earlier than
+the trees with higher nitrogen. The onset of color transition during both years
+aligned with the $29^{th}$ week post-full bloom.
+
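+ A hedged sketch of one plausible yellowness index in [-1, +1]: classify
+segmented leaf points as green or yellow from their RGB values and report
+(yellow - green) / (yellow + green). The color thresholds are assumptions; the
+paper defines its own metric over the segmented tree point cloud.
+
+import numpy as np
+
+def yellowness_index(rgb):
+    """rgb: (N, 3) array of leaf-point colors in [0, 1]."""
+    r, g, b = rgb[:, 0], rgb[:, 1], rgb[:, 2]
+    yellow = (r > 0.4) & (g > 0.4) & (b < 0.35)     # assumed color thresholds
+    green = (g > r) & (g > b) & ~yellow
+    n_y, n_g = yellow.sum(), green.sum()
+    return (n_y - n_g) / max(n_y + n_g, 1)          # -1 all green, +1 all yellow
+
+points = np.random.rand(10_000, 3)                  # stand-in for segmented leaf colors
+print(round(float(yellowness_index(points)), 3))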
+
+
+
+
+ + ☆ UPose3D: Uncertainty-Aware 3D Human Pose Estimation with Cross-View and + Temporal Cues + + +
+ We introduce UPose3D, a novel approach for multi-view 3D human pose +estimation, addressing challenges in accuracy and scalability. Our method +advances existing pose estimation frameworks by improving robustness and +flexibility without requiring direct 3D annotations. At the core of our method, +a pose compiler module refines predictions from a 2D keypoints estimator that +operates on a single image by leveraging temporal and cross-view information. +Our novel cross-view fusion strategy is scalable to any number of cameras, +while our synthetic data generation strategy ensures generalization across +diverse actors, scenes, and viewpoints. Finally, UPose3D leverages the +prediction uncertainty of both the 2D keypoint estimator and the pose compiler +module. This provides robustness to outliers and noisy data, resulting in +state-of-the-art performance in out-of-distribution settings. In addition, for +in-distribution settings, UPose3D yields a performance rivaling methods that +rely on 3D annotated data, while being the state-of-the-art among methods +relying only on 2D supervision. + +
+
+ comment: 18 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Weakly Supervised 3D Object Detection via Multi-Level Visual Guidance + + +
+ Weakly supervised 3D object detection aims to learn a 3D detector with lower +annotation cost, e.g., 2D labels. Unlike prior work which still relies on few +accurate 3D annotations, we propose a framework to study how to leverage +constraints between 2D and 3D domains without requiring any 3D labels. +Specifically, we employ visual data from three perspectives to establish +connections between 2D and 3D domains. First, we design a feature-level +constraint to align LiDAR and image features based on object-aware regions. +Second, the output-level constraint is developed to enforce the overlap between +2D and projected 3D box estimations. Finally, the training-level constraint is +utilized by producing accurate and consistent 3D pseudo-labels that align with +the visual data. We conduct extensive experiments on the KITTI dataset to +validate the effectiveness of the proposed three constraints. Without using any +3D labels, our method achieves favorable performance against state-of-the-art +approaches and is competitive with the method that uses 500-frame 3D +annotations. Code and models will be made publicly available at +https://github.com/kuanchihhuang/VG-W3D. + +
+
+ comment: Project page: https://github.com/kuanchihhuang/VG-W3D +
+
+
+
+
+ + ♻ ☆ VideoXum: Cross-modal Visual and Textural Summarization of Videos + + +
+ Video summarization aims to distill the most important information from a +source video to produce either an abridged clip or a textual narrative. +Traditionally, different methods have been proposed depending on whether the +output is a video or text, thus ignoring the correlation between the two +semantically related tasks of visual summarization and textual summarization. +We propose a new joint video and text summarization task. The goal is to +generate both a shortened video clip along with the corresponding textual +summary from a long video, collectively referred to as a cross-modal summary. +The generated shortened video clip and text narratives should be semantically +well aligned. To this end, we first build a large-scale human-annotated dataset +-- VideoXum (X refers to different modalities). The dataset is reannotated +based on ActivityNet. After we filter out the videos that do not meet the +length requirements, 14,001 long videos remain in our new dataset. Each video +in our reannotated dataset has human-annotated video summaries and the +corresponding narrative summaries. We then design a novel end-to-end model -- +VTSUM-BILP to address the challenges of our proposed task. Moreover, we propose +a new metric called VT-CLIPScore to help evaluate the semantic consistency of +cross-modality summary. The proposed model achieves promising performance on +this new task and establishes a benchmark for future research. + +
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ VT-Former: An Exploratory Study on Vehicle Trajectory Prediction for + Highway Surveillance through Graph Isomorphism and Transformer + + +
+ Enhancing roadway safety has become an essential computer vision focus area +for Intelligent Transportation Systems (ITS). As a part of ITS, Vehicle +Trajectory Prediction (VTP) aims to forecast a vehicle's future positions based +on its past and current movements. VTP is a pivotal element for road safety, +aiding in applications such as traffic management, accident prevention, +work-zone safety, and energy optimization. While most works in this field focus +on autonomous driving, with the growing number of surveillance cameras, another +sub-field emerges for surveillance VTP with its own set of challenges. In this +paper, we introduce VT-Former, a novel transformer-based VTP approach for +highway safety and surveillance. In addition to utilizing transformers to +capture long-range temporal patterns, a new Graph Attentive Tokenization (GAT) +module has been proposed to capture intricate social interactions among +vehicles. This study seeks to explore both the advantages and the limitations +inherent in combining transformer architecture with graphs for VTP. Our +investigation, conducted across three benchmark datasets from diverse +surveillance viewpoints, showcases the State-of-the-Art (SotA) or comparable +performance of VT-Former in predicting vehicle trajectories. This study +underscores the potential of VT-Former and its architecture, opening new +avenues for future research and exploration. + +
+
+ comment: Completely updated based on the reviews received for the paper +
+
+
+
+
+ + ♻ ☆ Co-Speech Gesture Detection through Multi-Phase Sequence Labeling + + +
+ Gestures are integral components of face-to-face communication. They unfold +over time, often following predictable movement phases of preparation, stroke, +and retraction. Yet, the prevalent approach to automatic gesture detection +treats the problem as binary classification, classifying a segment as either +containing a gesture or not, thus failing to capture its inherently sequential +and contextual nature. To address this, we introduce a novel framework that +reframes the task as a multi-phase sequence labeling problem rather than binary +classification. Our model processes sequences of skeletal movements over time +windows, uses Transformer encoders to learn contextual embeddings, and +leverages Conditional Random Fields to perform sequence labeling. We evaluate +our proposal on a large dataset of diverse co-speech gestures in task-oriented +face-to-face dialogues. The results consistently demonstrate that our method +significantly outperforms strong baseline models in detecting gesture strokes. +Furthermore, applying Transformer encoders to learn contextual embeddings from +movement sequences substantially improves gesture unit detection. These results +highlight our framework's capacity to capture the fine-grained dynamics of +co-speech gesture phases, paving the way for more nuanced and accurate gesture +detection and analysis. + +
+
+
+
+
+ + ♻ ☆ CLIP-QDA: An Explainable Concept Bottleneck Model + + +
+ In this paper, we introduce an explainable algorithm, built on a multi-modal
+foundation model, that performs fast and explainable image classification.
+Drawing inspiration from CLIP-based Concept Bottleneck Models (CBMs), our
+method creates a latent space where each neuron is linked to a specific word.
+Observing that this latent space can be modeled with simple distributions, we
+use a Mixture of Gaussians (MoG) formalism to enhance the interpretability of
+this latent space. Then, we introduce CLIP-QDA, a classifier that only uses
+statistical values to infer labels from the concepts. In addition, this
+formalism allows for both local and global explanations. Because these
+explanations come from the inner design of our architecture, our work is part
+of a new family of greybox models, combining the performance of opaque
+foundation models with the interpretability of transparent models. Our
+empirical findings show that in instances where the MoG assumption holds,
+CLIP-QDA achieves accuracy similar to that of state-of-the-art CBM methods. Our
+explanations compete with existing XAI methods while being faster to compute.
+
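+ A small sketch of the classification idea: represent each image by its
+similarity to a list of concept words and fit a per-class Gaussian, so labels
+are inferred by quadratic discriminant analysis over the concept space. CLIP
+scoring is replaced by random features here; only the statistical
+classification step is illustrated.
+
+import numpy as np
+from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
+
+rng = np.random.default_rng(0)
+n_concepts, n_train = 16, 200                       # e.g. 16 concept words
+
+# stand-in for CLIP image-to-concept cosine similarities of two classes
+X = np.vstack([rng.normal(0.2, 0.05, (n_train, n_concepts)),
+               rng.normal(0.3, 0.05, (n_train, n_concepts))])
+y = np.array([0] * n_train + [1] * n_train)
+
+qda = QuadraticDiscriminantAnalysis(store_covariance=True)
+qda.fit(X, y)                                       # per-class Gaussian in concept space
+print(qda.predict(rng.normal(0.3, 0.05, (1, n_concepts))))   # statistical inference only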
+
+
+
+
+ + ♻ ☆ Improving Video Corpus Moment Retrieval with Partial Relevance + Enhancement ICMR 2024 + + +
+ Video Corpus Moment Retrieval (VCMR) is a new video retrieval task aimed at
+retrieving a relevant moment from a large corpus of untrimmed videos using a
+text query. The relevance between the video and query is partial, mainly
+evident in two aspects: (1) Scope: The untrimmed video contains many frames,
+but not all are relevant to the query. Strong relevance is typically observed
+only within the relevant moment. (2) Modality: The relevance of the query
+varies with different modalities. Action descriptions align more with visual
+elements, while character conversations are more related to textual
+information. Existing methods often treat all video contents equally, leading
+to sub-optimal moment retrieval. We argue that effectively capturing the
+partial relevance between the query and video is essential for the VCMR task.
+To this end, we propose a Partial Relevance Enhanced Model (PREM) to improve
+VCMR. VCMR involves two sub-tasks: video retrieval and moment localization. To
+align with their distinct objectives, we implement specialized partial
+relevance enhancement strategies. For video retrieval, we introduce a
+multi-modal collaborative video retriever, generating different query
+representations for the two modalities by modality-specific pooling, ensuring a
+more effective match. For moment localization, we propose the focus-then-fuse
+moment localizer, utilizing modality-specific gates to capture essential
+content. We also introduce relevant content-enhanced training methods for both
+the retriever and localizer to enhance the ability of the model to capture
+relevant content. Experimental results on the TVR and DiDeMo datasets show that
+the proposed model outperforms the baselines, achieving a new state of the art
+for VCMR. The code is available at \url{https://github.com/hdy007007/PREM}.
+
+
+ comment: camera-ready version of ACM ICMR 2024 +
+
+
+
+
+ + ♻ ☆ Subobject-level Image Tokenization + + +
+ Transformer-based vision models typically tokenize images into fixed-size
+square patches as input units, which lacks adaptability to image content and
+overlooks the inherent pixel grouping structure. Inspired by the subword
+tokenization widely adopted in language models, we propose an image tokenizer
+at a subobject level, where the subobjects are represented by semantically
+meaningful image segments obtained by segmentation models (e.g., segment
+anything models). To implement a learning system based on subobject
+tokenization, we first introduced a Direct Segment Anything Model (DirectSAM)
+that efficiently produces comprehensive segmentations of subobjects, then
+embedded the subobjects into compact latent vectors and fed them into a large
+language model for vision-language learning. Empirical results demonstrate that
+our subobject-level tokenization significantly facilitates efficient learning
+of translating images into object and attribute descriptions compared to the
+traditional patch-level tokenization. Codes and models are open-sourced at
+https://github.com/ChenDelong1999/subobjects.
+
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Visual Grounding Methods for VQA are Working for the Wrong Reasons! ACL 2020 + + +
+ Existing Visual Question Answering (VQA) methods tend to exploit dataset +biases and spurious statistical correlations, instead of producing right +answers for the right reasons. To address this issue, recent bias mitigation +methods for VQA propose to incorporate visual cues (e.g., human attention maps) +to better ground the VQA models, showcasing impressive gains. However, we show +that the performance improvements are not a result of improved visual +grounding, but a regularization effect which prevents over-fitting to +linguistic priors. For instance, we find that it is not actually necessary to +provide proper, human-based cues; random, insensible cues also result in +similar improvements. Based on this observation, we propose a simpler +regularization scheme that does not require any external annotations and yet +achieves near state-of-the-art performance on VQA-CPv2. + +
+
+ comment: Published in ACL 2020 under the title "A negative case analysis of + visual grounding methods for VQA" +
+
+
+
+
+ + ♻ ☆ Attention-Map Augmentation for Hypercomplex Breast Cancer Classification + + +
+ Breast cancer is the most widespread neoplasm among women and early detection +of this disease is critical. Deep learning techniques have become of great +interest to improve diagnostic performance. However, distinguishing between +malignant and benign masses in whole mammograms poses a challenge, as they +appear nearly identical to an untrained eye, and the region of interest (ROI) +constitutes only a small fraction of the entire image. In this paper, we +propose a framework, parameterized hypercomplex attention maps (PHAM), to +overcome these problems. Specifically, we deploy an augmentation step based on +computing attention maps. Then, the attention maps are used to condition the +classification step by constructing a multi-dimensional input comprised of the +original breast cancer image and the corresponding attention map. In this step, +a parameterized hypercomplex neural network (PHNN) is employed to perform +breast cancer classification. The framework offers two main advantages. First, +attention maps provide critical information regarding the ROI and allow the +neural model to concentrate on it. Second, the hypercomplex architecture has +the ability to model local relations between input dimensions thanks to +hypercomplex algebra rules, thus properly exploiting the information provided +by the attention map. We demonstrate the efficacy of the proposed framework on +both mammography images as well as histopathological ones. We surpass +attention-based state-of-the-art networks and the real-valued counterpart of +our approach. The code of our work is available at +https://github.com/ispamm/AttentionBCS. + +
+
+ comment: Published in Elsevier Pattern Recognition Letters +
+
+
+
+
+ + ♻ ☆ Diagnosis of Multiple Fundus Disorders Amidst a Scarcity of Medical + Experts Via Self-supervised Machine Learning + + +
+ Fundus diseases are major causes of visual impairment and blindness +worldwide, especially in underdeveloped regions, where the shortage of +ophthalmologists hinders timely diagnosis. AI-assisted fundus image analysis +has several advantages, such as high accuracy, reduced workload, and improved +accessibility, but it requires a large amount of expert-annotated data to build +reliable models. To address this dilemma, we propose a general self-supervised +machine learning framework that can handle diverse fundus diseases from +unlabeled fundus images. Our method's AUC surpasses existing supervised +approaches by 15.7%, and even exceeds performance of a single human expert. +Furthermore, our model adapts well to various datasets from different regions, +races, and heterogeneous image sources or qualities from multiple cameras or +devices. Our method offers a label-free general framework to diagnose fundus +diseases, which could potentially benefit telehealth programs for early +screening of people at risk of vision loss. + +
+
+
+
+
+ + ♻ ☆ RingID: Rethinking Tree-Ring Watermarking for Enhanced Multi-Key + Identification + + +
+ We revisit Tree-Ring Watermarking, a recent diffusion model watermarking +method that demonstrates great robustness to various attacks. We conduct an +in-depth study on it and reveal that the distribution shift unintentionally +introduced by the watermarking process, apart from watermark pattern matching, +contributes to its exceptional robustness. Our investigation further exposes +inherent flaws in its original design, particularly in its ability to identify +multiple distinct keys, where distribution shift offers no assistance. Based on +these findings and analysis, we present RingID for enhanced multi-key +identification. It consists of a novel multi-channel heterogeneous watermarking +approach designed to seamlessly amalgamate distinctive advantages from diverse +watermarks. Coupled with a series of suggested enhancements, RingID exhibits +substantial advancements in multi-key identification. Github Page: +https://github.com/showlab/RingID + +
+
+ comment: 25 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ CT-NeRF: Incremental Optimizing Neural Radiance Field and Poses with + Complex Trajectory + + +
+ Neural radiance field (NeRF) has achieved impressive results in high-quality +3D scene reconstruction. However, NeRF heavily relies on precise camera poses. +While recent works like BARF have introduced camera pose optimization within +NeRF, their applicability is limited to simple trajectory scenes. Existing +methods struggle while tackling complex trajectories involving large rotations. +To address this limitation, we propose CT-NeRF, an incremental reconstruction +optimization pipeline using only RGB images without pose and depth input. In +this pipeline, we first propose a local-global bundle adjustment under a pose +graph connecting neighboring frames to enforce the consistency between poses to +escape the local minima caused by only pose consistency with the scene +structure. Further, we instantiate the consistency between poses as a +reprojected geometric image distance constraint resulting from pixel-level +correspondences between input image pairs. Through the incremental +reconstruction, CT-NeRF enables the recovery of both camera poses and scene +structure and is capable of handling scenes with complex trajectories. We +evaluate the performance of CT-NeRF on two real-world datasets, NeRFBuster and +Free-Dataset, which feature complex trajectories. Results show CT-NeRF +outperforms existing methods in novel view synthesis and pose estimation +accuracy. + +
+
+
+
+
+ + ♻ ☆ RoboFusion: Towards Robust Multi-Modal 3D Object Detection via SAM + + +
+ Multi-modal 3D object detectors are dedicated to exploring secure and
+reliable perception systems for autonomous driving (AD). Although achieving
+state-of-the-art (SOTA) performance on clean benchmark datasets, they tend to
+overlook the complexity and harsh conditions of real-world environments. With
+the emergence of visual foundation models (VFMs), opportunities and challenges
+are presented for improving the robustness and generalization of multi-modal 3D
+object detection in AD. Therefore, we propose RoboFusion, a robust framework
+that leverages VFMs like SAM to tackle out-of-distribution (OOD) noise
+scenarios. We first adapt the original SAM for AD scenarios, yielding SAM-AD.
+To align SAM or SAM-AD with multi-modal methods, we then introduce AD-FPN for
+upsampling the image features extracted by SAM. We employ wavelet decomposition
+to denoise the depth-guided images, further reducing noise and weather
+interference. Finally, we employ self-attention mechanisms to adaptively
+reweight the fused features, enhancing informative features while suppressing
+excess noise. In summary, RoboFusion significantly reduces noise by leveraging
+the generalization and robustness of VFMs, thereby enhancing the resilience of
+multi-modal 3D object detection. Consequently, RoboFusion achieves SOTA
+performance in noisy scenarios, as demonstrated by the KITTI-C and nuScenes-C
+benchmarks. Code is available at https://github.com/adept-thu/RoboFusion.
+
+
+
+
+
+ + ♻ ☆ Are Semi-Dense Detector-Free Methods Good at Matching Local Features? + + +
+ Semi-dense detector-free approaches (SDF), such as LoFTR, are currently among +the most popular image matching methods. While SDF methods are trained to +establish correspondences between two images, their performances are almost +exclusively evaluated using relative pose estimation metrics. Thus, the link +between their ability to establish correspondences and the quality of the +resulting estimated pose has thus far received little attention. This paper is +a first attempt to study this link. We start with proposing a novel structured +attention-based image matching architecture (SAM). It allows us to show a +counter-intuitive result on two datasets (MegaDepth and HPatches): on the one +hand SAM either outperforms or is on par with SDF methods in terms of +pose/homography estimation metrics, but on the other hand SDF approaches are +significantly better than SAM in terms of matching accuracy. We then propose to +limit the computation of the matching accuracy to textured regions, and show +that in this case SAM often surpasses SDF methods. Our findings highlight a +strong correlation between the ability to establish accurate correspondences in +textured regions and the accuracy of the resulting estimated pose/homography. +Our code will be made available. + +
+
+
+
+
+ + ♻ ☆ GPT4Motion: Scripting Physical Motions in Text-to-Video Generation via + Blender-Oriented GPT Planning + + +
+ Recent advances in text-to-video generation have harnessed the power of
+diffusion models to create visually compelling content conditioned on text
+prompts. However, they usually encounter high computational costs and often
+struggle to produce videos with coherent physical motions. To tackle these
+issues, we propose GPT4Motion, a training-free framework that leverages the
+planning capability of large language models such as GPT, the physical
+simulation strength of Blender, and the excellent image generation ability of
+text-to-image diffusion models to enhance the quality of video synthesis.
+Specifically, GPT4Motion employs GPT-4 to generate a Blender script based on a
+user textual prompt, which commands Blender's built-in physics engine to craft
+fundamental scene components that encapsulate coherent physical motions across
+frames. These components are then fed into Stable Diffusion to generate a
+video aligned with the textual prompt. Experimental results on three basic
+physical motion scenarios, including rigid object drop and collision, cloth
+draping and swinging, and liquid flow, demonstrate that GPT4Motion can
+efficiently generate high-quality videos while maintaining motion coherency and
+entity consistency. GPT4Motion offers new insights into text-to-video research,
+enhancing its quality and broadening its horizons for further exploration.
+
+
+
+
+
+ + ♻ ☆ StreakNet-Arch: An Anti-scattering Network-based Architecture for + Underwater Carrier LiDAR-Radar Imaging + + +
+ In this paper, we introduce StreakNet-Arch, a novel signal processing
+architecture designed for Underwater Carrier LiDAR-Radar (UCLR) imaging
+systems, to address the limitations in scatter suppression and real-time
+imaging. StreakNet-Arch formulates the signal processing as a real-time,
+end-to-end binary classification task, enabling real-time image acquisition. To
+achieve this, we leverage Self-Attention networks and propose a novel Double
+Branch Cross Attention (DBC-Attention) mechanism that surpasses the performance
+of traditional methods. Furthermore, we present a method for embedding
+streak-tube camera images into attention networks, effectively acting as a
+learned bandpass filter. To facilitate further research, we contribute a
+publicly available streak-tube camera image dataset. The dataset contains
+2,695,168 real-world underwater 3D point cloud samples. These advancements
+significantly improve UCLR capabilities, enhancing its performance and
+applicability in underwater imaging tasks. The source code and dataset can be
+found at https://github.com/BestAnHongjun/StreakNet.
+
+
+
+ comment: Reduce the number of pages to 13 +
+
+
+
+
+ + ♻ ☆ Continual Learning with Pre-Trained Models: A Survey IJCAI 2024 + + +
+ Nowadays, real-world applications often face streaming data, which requires
+the learning system to absorb new knowledge as data evolves. Continual Learning
+(CL) aims to achieve this goal while overcoming catastrophic forgetting of
+former knowledge when learning new tasks. Typical CL methods build
+the model from scratch to grow with incoming data. However, the advent of the
+pre-trained model (PTM) era has sparked immense research interest, particularly
+in leveraging PTMs' robust representational capabilities. This paper presents a
+comprehensive survey of the latest advancements in PTM-based CL. We categorize
+existing methodologies into three distinct groups, providing a comparative
+analysis of their similarities, differences, and respective advantages and
+disadvantages. Additionally, we offer an empirical study contrasting various
+state-of-the-art methods to highlight concerns regarding fairness in
+comparisons. The source code to reproduce these evaluations is available at:
+https://github.com/sun-hailong/LAMDA-PILOT
+
+
+
+ comment: Accepted to IJCAI 2024. Code is available at: + https://github.com/sun-hailong/LAMDA-PILOT +
+
+
+
+
+ + ♻ ☆ ProteusNeRF: Fast Lightweight NeRF Editing using 3D-Aware Image Context SIGGRAPH + + +
+ Neural Radiance Fields (NeRFs) have recently emerged as a popular option for
+photo-realistic object capture due to their ability to faithfully capture
+high-fidelity volumetric content even from handheld video input. Although much
+research has been devoted to efficient optimization leading to real-time
+training and rendering, options for interactively editing NeRFs remain limited.
+We present a very simple but effective neural network architecture that is fast
+and efficient while maintaining a low memory footprint. This architecture can
+be incrementally guided through user-friendly image-based edits. Our
+representation allows straightforward object selection via semantic feature
+distillation at the training stage. More importantly, we propose a local
+3D-aware image context to facilitate view-consistent image editing that can
+then be distilled into fine-tuned NeRFs, via geometric and appearance
+adjustments. We evaluate our setup on a variety of examples to demonstrate
+appearance and geometric edits and report 10-30x speedup over concurrent work
+focusing on text-guided NeRF editing. Video results can be seen on our project
+webpage at https://proteusnerf.github.io.
+
+
+
+ comment: Accepted at I3D'24 (ACM SIGGRAPH SYMPOSIUM ON INTERACTIVE 3D GRAPHICS + AND GAMES) +
+
+
+
+
+ + ♻ ☆ Adaptive Hybrid Masking Strategy for Privacy-Preserving Face Recognition + Against Model Inversion Attack + + +
+ The utilization of personal sensitive data in training face recognition (FR)
+models poses significant privacy concerns, as adversaries can employ model
+inversion attacks (MIA) to infer the original training data. Existing defense
+methods, such as data augmentation and differential privacy, have been employed
+to mitigate this issue. However, these methods often fail to strike an optimal
+balance between privacy and accuracy. To address this limitation, this paper
+introduces an adaptive hybrid masking algorithm against MIA. Specifically, face
+images are masked in the frequency domain using an adaptive MixUp strategy.
+Unlike the traditional MixUp algorithm, which is predominantly used for data
+augmentation, our modified approach incorporates frequency domain mixing.
+Previous studies have shown that increasing the number of images mixed in MixUp
+can enhance privacy preservation but at the expense of reduced face recognition
+accuracy. To overcome this trade-off, we develop an enhanced adaptive MixUp
+strategy based on reinforcement learning, which enables us to mix a larger
+number of images while maintaining satisfactory recognition accuracy. To
+optimize privacy protection, we propose maximizing the reward function (i.e.,
+the loss function of the FR system) during the training of the strategy
+network, while the loss function of the FR network is minimized when training
+the FR network itself. The strategy network and the face recognition network
+can be viewed as antagonistic entities in the training process, ultimately
+reaching a more balanced trade-off. Experimental results demonstrate that our
+proposed hybrid masking scheme outperforms existing defense algorithms in terms
+of privacy preservation and recognition accuracy against MIA.
+
+
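+ The following is not the paper's masking algorithm; it is a minimal sketch of generic
+frequency-domain mixing of two image batches with torch.fft. The function name, the choice of
+mixing amplitudes while keeping the first image's phase, and the coefficient lam are assumptions
+for illustration only.
+
+import torch
+
+def frequency_mixup(x1, x2, lam=0.7):
+    """Mix two image batches in the frequency domain (illustrative sketch)."""
+    f1 = torch.fft.fft2(x1)            # x1, x2: (B, C, H, W)
+    f2 = torch.fft.fft2(x2)
+    # Blend amplitude spectra but keep the phase of the first image, a common way
+    # to perturb appearance while preserving structure.
+    amp = lam * f1.abs() + (1.0 - lam) * f2.abs()
+    phase = torch.angle(f1)
+    mixed = amp * torch.exp(1j * phase)
+    return torch.fft.ifft2(mixed).real
+
+imgs_a, imgs_b = torch.rand(4, 3, 112, 112), torch.rand(4, 3, 112, 112)
+mixed = frequency_mixup(imgs_a, imgs_b)   # (4, 3, 112, 112)
+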
+
+
+
+
+ + ♻ ☆ DreamMatcher: Appearance Matching Self-Attention for + Semantically-Consistent Text-to-Image Personalization + + +
+ The objective of text-to-image (T2I) personalization is to customize a +diffusion model to a user-provided reference concept, generating diverse images +of the concept aligned with the target prompts. Conventional methods +representing the reference concepts using unique text embeddings often fail to +accurately mimic the appearance of the reference. To address this, one solution +may be explicitly conditioning the reference images into the target denoising +process, known as key-value replacement. However, prior works are constrained +to local editing since they disrupt the structure path of the pre-trained T2I +model. To overcome this, we propose a novel plug-in method, called +DreamMatcher, which reformulates T2I personalization as semantic matching. +Specifically, DreamMatcher replaces the target values with reference values +aligned by semantic matching, while leaving the structure path unchanged to +preserve the versatile capability of pre-trained T2I models for generating +diverse structures. We also introduce a semantic-consistent masking strategy to +isolate the personalized concept from irrelevant regions introduced by the +target prompts. Compatible with existing T2I models, DreamMatcher shows +significant improvements in complex scenarios. Intensive analyses demonstrate +the effectiveness of our approach. + +
+
+ comment: Project page is available at https://ku-cvlab.github.io/DreamMatcher/ +
+
+
+
+
+ + ♻ ☆ Seeing is Believing: Mitigating Hallucination in Large Vision-Language + Models via CLIP-Guided Decoding + + +
+ Large Vision-Language Models (LVLMs) are susceptible to object
+hallucinations, an issue in which their generated text contains non-existent
+objects, greatly limiting their reliability and practicality. Current
+approaches often rely on the model's token likelihoods or other internal
+information, on instruction tuning with additional datasets, or on complex
+external tools. We first perform empirical analysis on sentence-level
+LVLM hallucination, finding that CLIP similarity to the image acts as a
+stronger and more robust indicator of hallucination compared to token
+likelihoods. Motivated by this, we introduce our CLIP-Guided Decoding (CGD)
+approach, a straightforward but effective training-free approach to reduce
+object hallucination at decoding time. CGD uses CLIP to guide the model's
+decoding process by enhancing visual grounding of generated text with the
+image. Experiments demonstrate that CGD effectively mitigates object
+hallucination across multiple LVLM families while preserving the utility of
+text generation. Code is available at
+https://github.com/d-ailin/CLIP-Guided-Decoding.
+
+
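+ As a rough illustration of the underlying signal (not the CGD implementation from the linked
+repository), the sketch below scores candidate sentences by CLIP image-text similarity using the
+Hugging Face transformers API; the function name and the re-ranking use are assumptions.
+
+import torch
+from PIL import Image
+from transformers import CLIPModel, CLIPProcessor
+
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+def rank_by_clip(image, candidates):
+    """Score candidate sentences by CLIP similarity to the image (higher = better grounded)."""
+    inputs = processor(text=candidates, images=image, return_tensors="pt", padding=True)
+    with torch.no_grad():
+        sims = model(**inputs).logits_per_image.squeeze(0)   # (num_candidates,)
+    order = sims.argsort(descending=True)
+    return [(candidates[int(i)], sims[int(i)].item()) for i in order]
+
+# Example: keep the candidate continuation best supported by the image.
+# ranked = rank_by_clip(Image.open("photo.jpg"), ["a dog on a sofa", "a cat on a sofa"])
+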
+
+ comment: Code URL: https://github.com/d-ailin/CLIP-Guided-Decoding +
+
+
+
+
+ + ♻ ☆ A Survey on Autonomous Driving Datasets: Statistics, Annotation Quality, + and a Future Outlook + + +
+ Autonomous driving has rapidly developed and shown promising performance due
+to recent advances in hardware and deep learning techniques. High-quality
+datasets are fundamental for developing reliable autonomous driving algorithms.
+Previous dataset surveys either focused on a limited number of datasets or
+lacked a detailed investigation of dataset characteristics. To this end, we
+present an exhaustive study of 265 autonomous driving datasets from multiple
+perspectives, including sensor modalities, data size, tasks, and contextual
+conditions. We introduce a novel metric to evaluate the impact of datasets,
+which can also be a guide for creating new datasets. Besides, we analyze the
+annotation processes, existing labeling tools, and the annotation quality of
+datasets, showing the importance of establishing a standard annotation
+pipeline. On the other hand, we thoroughly analyze the impact of geographical
+and adversarial environmental conditions on the performance of autonomous
+driving systems. Moreover, we exhibit the data distribution of several vital
+datasets and discuss their pros and cons accordingly. Finally, we discuss the
+current challenges and the development trend of future autonomous driving
+datasets.
+
+
+
+
+
+
+ + ♻ ☆ Fine-tuning vision foundation model for crack segmentation in civil + infrastructures + + +
+ Large-scale foundation models have become the mainstream deep learning +method, while in civil engineering, the scale of AI models is strictly limited. +In this work, a vision foundation model is introduced for crack segmentation. +Two parameter-efficient fine-tuning methods, adapter and low-rank adaptation, +are adopted to fine-tune the foundation model in semantic segmentation: the +Segment Anything Model (SAM). The fine-tuned CrackSAM shows excellent +performance on different scenes and materials. To test the zero-shot +performance of the proposed method, two unique datasets related to road and +exterior wall cracks are collected, annotated and open-sourced, for a total of +810 images. Comparative experiments are conducted with twelve mature semantic +segmentation models. On datasets with artificial noise and previously unseen +datasets, the performance of CrackSAM far exceeds that of all state-of-the-art +models. CrackSAM exhibits remarkable superiority, particularly under +challenging conditions such as dim lighting, shadows, road markings, +construction joints, and other interference factors. These cross-scenario +results demonstrate the outstanding zero-shot capability of foundation models +and provide new ideas for developing vision models in civil engineering. + +
+
+
+
+
+ + ♻ ☆ PaddingFlow: Improving Normalizing Flows with Padding-Dimensional Noise + + +
+ Normalizing flow is a generative modeling approach with efficient sampling.
+However, flow-based models suffer from two issues: 1) If the target
+distribution lies on a manifold, the mismatch between the dimensions of the
+latent target distribution and the data distribution might make flow-based
+models perform badly. 2) Discrete data might make flow-based models collapse
+into a degenerate mixture of point masses. To sidestep these two issues, we
+propose PaddingFlow, a novel dequantization method, which improves normalizing
+flows with padding-dimensional noise. To implement PaddingFlow, only the
+dimension of normalizing flows needs to be modified. Thus, our method is easy
+to implement and computationally cheap. Moreover, the padding-dimensional noise
+is only added to the padding dimension, which means PaddingFlow can dequantize
+without changing data distributions. Existing dequantization methods, in
+contrast, need to change the data distribution, which might degrade
+performance. We validate our method on the main benchmarks of unconditional
+density estimation, including five tabular datasets and four image datasets for
+Variational Autoencoder (VAE) models, and on Inverse Kinematics (IK)
+experiments, which involve conditional density estimation. The results show
+that PaddingFlow performs better in all experiments in this paper, indicating
+that it is widely suitable for various tasks. The code is available at:
+https://github.com/AdamQLMeng/PaddingFlow.
+
+
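+ A minimal sketch of the padding idea described above (not the authors' code): extra
+noise-only dimensions are appended so the flow operates in a higher-dimensional space while the
+original data columns stay untouched. The function name, padding width, and noise scale are
+illustrative assumptions.
+
+import torch
+
+def pad_with_noise(x, pad_dims=2, sigma=0.01):
+    """Append noise-only padding dimensions; the original data distribution is unchanged."""
+    noise = sigma * torch.randn(x.shape[0], pad_dims, device=x.device)
+    return torch.cat([x, noise], dim=1)
+
+x = torch.rand(128, 2)          # e.g. 2-D data concentrated near a 1-D manifold
+x_padded = pad_with_noise(x)    # (128, 4): the normalizing flow is trained on this
+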
+
+
+
+
+ + ♻ ☆ Effective Decision Boundary Learning for Class Incremental Learning + + +
+ Rehearsal approaches in class incremental learning (CIL) suffer from decision
+boundary overfitting to new classes, which is mainly caused by two factors:
+insufficient old-class data for knowledge distillation (KD) and imbalanced
+data learning between the learned and new classes because of the limited
+storage memory. In this work, we present a simple but effective approach to
+tackle these two factors. First, we employ a re-sampling strategy and Mixup
+Knowledge Distillation (Re-MKD) to improve the performance of KD, which
+greatly alleviates the overfitting problem. Specifically, we combine mixup
+and re-sampling strategies to synthesize adequate data used in KD training that
+are more consistent with the latent distribution between the learned and new
+classes. Second, we propose a novel incremental influence balance (IIB) method
+for CIL to tackle the classification of imbalanced data by extending the
+influence balance method into the CIL setting, which re-weights samples by
+their influences to create a proper decision boundary. With these two
+improvements, we present the effective decision boundary learning algorithm
+(EDBL), which improves the performance of KD and deals with the imbalanced data
+learning simultaneously. Experiments show that the proposed EDBL achieves
+state-of-the-art performance on several CIL benchmarks.
+
+
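+ Not the EDBL implementation; a minimal sketch of a generic mixup-based knowledge distillation
+loss of the kind hinted at above, assuming a frozen teacher, a trainable student, and batches of
+old-class and new-class samples. Function name, Beta parameter, and temperature are assumptions.
+
+import torch
+import torch.nn.functional as F
+
+def mixup_kd_loss(student, teacher, x_old, x_new, alpha=0.4, T=2.0):
+    """Distill the frozen teacher into the student on samples mixed between old and new classes."""
+    lam = torch.distributions.Beta(alpha, alpha).sample().item()
+    x_mix = lam * x_old + (1.0 - lam) * x_new          # synthesized rehearsal samples
+    with torch.no_grad():
+        t_logits = teacher(x_mix)
+    s_logits = student(x_mix)
+    return F.kl_div(F.log_softmax(s_logits / T, dim=1),
+                    F.softmax(t_logits / T, dim=1),
+                    reduction="batchmean") * (T * T)   # standard temperature scaling
+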
+
+
+
+
+ + ♻ ☆ Towards Effective Multi-Moving-Camera Tracking: A New Dataset and + Lightweight Link Model + + +
+ Ensuring driving safety for autonomous vehicles has become increasingly
+crucial, highlighting the need for systematic tracking of on-road pedestrians.
+Most vehicles are equipped with visual sensors; however, the large-scale visual
+data they produce has not been well studied yet. Multi-target multi-camera
+(MTMC) tracking systems are composed of two modules: single-camera tracking
+(SCT) and inter-camera tracking (ICT). Reliably coordinating between the two
+modules makes MTMC tracking a very complicated task, and tracking across
+multiple moving cameras makes it even more challenging. In this paper, we focus
+on multi-target multi-moving-camera (MTMMC) tracking, which is attracting
+increasing attention from the research community. Observing there are few
+datasets for MTMMC tracking, we collect a new dataset, called
+Multi-Moving-Camera Track (MMCT), which contains sequences under various
+driving scenarios. To address the identity-switch problem commonly faced by
+most existing SCT trackers, especially for moving cameras due to ego-motion
+between the camera and targets, a lightweight appearance-free global link
+model, called Linker, is proposed to mitigate identity switches by associating
+two disjoint tracklets of the same target into a complete trajectory within the
+same camera. Incorporated with Linker, existing SCT trackers generally obtain a
+significant improvement. Moreover, to alleviate the impact of the image style
+variations caused by different cameras, a color transfer module is effectively
+incorporated to extract cross-camera consistent appearance features for
+pedestrian association across moving cameras for ICT, resulting in a much
+improved MTMMC tracking system, which can constitute a step further towards
+coordinated mining of multiple moving cameras. The project page is available at
+https://dhu-mmct.github.io/.
+
+
+
+
+
+
+ + ♻ ☆ FG-MDM: Towards Zero-Shot Human Motion Generation via Fine-Grained + Descriptions + + +
+ Recently, significant progress has been made in text-based motion generation, +enabling the generation of diverse and high-quality human motions that conform +to textual descriptions. However, generating motions beyond the distribution of +original datasets remains challenging, i.e., zero-shot generation. By adopting +a divide-and-conquer strategy, we propose a new framework named Fine-Grained +Human Motion Diffusion Model (FG-MDM) for zero-shot human motion generation. +Specifically, we first parse previous vague textual annotations into +fine-grained descriptions of different body parts by leveraging a large +language model. We then use these fine-grained descriptions to guide a +transformer-based diffusion model, which further adopts a design of part +tokens. FG-MDM can generate human motions beyond the scope of original datasets +owing to descriptions that are closer to motion essence. Our experimental +results demonstrate the superiority of FG-MDM over previous methods in +zero-shot settings. We will release our fine-grained textual annotations for +HumanML3D and KIT. + +
+
+ comment: Project Page: https://sx0207.github.io/fg-mdm/ +
+
+
+
+
+ + ♻ ☆ Remembering Transformer for Continual Learning + + +
+ Neural networks encounter the challenge of Catastrophic Forgetting (CF) in
+continual learning, where new task knowledge interferes with previously learned
+knowledge. We propose Remembering Transformer, inspired by the brain's
+Complementary Learning Systems (CLS), to tackle this issue. Remembering
+Transformer employs a mixture-of-adapters and a generative model-based routing
+mechanism to alleviate CF by dynamically routing task data to relevant
+adapters. Our approach achieves new SOTA performance on various vision
+continual learning tasks with great parameter efficiency.
+
+
+
+
+
+
+ + ♻ ☆ X-Adapter: Adding Universal Compatibility of Plugins for Upgraded + Diffusion Model + + +
+ We introduce X-Adapter, a universal upgrader to enable the pretrained
+plug-and-play modules (e.g., ControlNet, LoRA) to work directly with the
+upgraded text-to-image diffusion model (e.g., SDXL) without further retraining.
+We achieve this goal by training an additional network to control the frozen
+upgraded model with the new text-image data pairs. In detail, X-Adapter keeps a
+frozen copy of the old model to preserve the connectors of different plugins.
+Additionally, X-Adapter adds trainable mapping layers that bridge the decoders
+from models of different versions for feature remapping. The remapped features
+will be used as guidance for the upgraded model. To enhance the guidance
+ability of X-Adapter, we employ a null-text training strategy for the upgraded
+model. After training, we also introduce a two-stage denoising strategy to
+align the initial latents of X-Adapter and the upgraded model. Thanks to our
+strategies, X-Adapter demonstrates universal compatibility with various plugins
+and also enables plugins of different versions to work together, thereby
+expanding the functionalities of the diffusion community. To verify the
+effectiveness of the proposed method, we conduct extensive experiments, and the
+results show that X-Adapter can facilitate wider application of plugins with
+upgraded foundational diffusion models.
+
+
+
+ comment: Project page: https://showlab.github.io/X-Adapter/ +
+
+
+
+
+ + ♻ ☆ NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous + Driving + + +
+ We present a versatile NeRF-based simulator for testing autonomous driving +(AD) software systems, designed with a focus on sensor-realistic closed-loop +evaluation and the creation of safety-critical scenarios. The simulator learns +from sequences of real-world driving sensor data and enables reconfigurations +and renderings of new, unseen scenarios. In this work, we use our simulator to +test the responses of AD models to safety-critical scenarios inspired by the +European New Car Assessment Programme (Euro NCAP). Our evaluation reveals that, +while state-of-the-art end-to-end planners excel in nominal driving scenarios +in an open-loop setting, they exhibit critical flaws when navigating our +safety-critical scenarios in a closed-loop setting. This highlights the need +for advancements in the safety and real-world usability of end-to-end planners. +By publicly releasing our simulator and scenarios as an easy-to-run evaluation +suite, we invite the research community to explore, refine, and validate their +AD models in controlled, yet highly configurable and challenging +sensor-realistic environments. Code and instructions can be found at +https://github.com/atonderski/neuro-ncap + +
+
+
+
+
+ + ♻ ☆ LASER: Tuning-Free LLM-Driven Attention Control for Efficient + Text-conditioned Image-to-Animation + + +
+ Revolutionary advancements in text-to-image models have unlocked new +dimensions for sophisticated content creation, e.g., text-conditioned image +editing, allowing us to edit the diverse images that convey highly complex +visual concepts according to the textual guidance. Despite being promising, +existing methods focus on texture- or non-rigid-based visual manipulation, +which struggles to produce the fine-grained animation of smooth +text-conditioned image morphing without fine-tuning, i.e., due to their highly +unstructured latent space. In this paper, we introduce a tuning-free LLM-driven +attention control framework, encapsulated by the progressive process of LLM +planning, prompt-Aware editing, StablE animation geneRation, abbreviated as +LASER. LASER employs a large language model (LLM) to refine coarse descriptions +into detailed prompts, guiding pre-trained text-to-image models for subsequent +image generation. We manipulate the model's spatial features and self-attention +mechanisms to maintain animation integrity and enable seamless morphing +directly from text prompts, eliminating the need for additional fine-tuning or +annotations. Our meticulous control over spatial features and self-attention +ensures structural consistency in the images. This paper presents a novel +framework integrating LLMs with text-to-image models to create high-quality +animations from a single text input. We also propose a Text-conditioned +Image-to-Animation Benchmark to validate the effectiveness and efficacy of +LASER. Extensive experiments demonstrate that LASER produces impressive, +consistent, and efficient results in animation generation, positioning it as a +powerful tool for advanced digital content creation. + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ MCPNet: An Interpretable Classifier via Multi-Level Concept Prototypes CVPR 2024 + + +
+ Recent advancements in post-hoc and inherently interpretable methods have
+markedly enhanced the explanations of black box classifier models. These
+methods operate either through post-analysis or by integrating concept learning
+during model training. Although effective in bridging the semantic gap
+between a model's latent space and human interpretation, these explanation
+methods only partially reveal the model's decision-making process. The outcome
+is typically limited to high-level semantics derived from the last feature map.
+We argue that explanations lacking insights into the decision processes at
+low and mid-level features are neither fully faithful nor useful. Addressing
+this gap, we introduce the Multi-Level Concept Prototypes Classifier (MCPNet),
+an inherently interpretable model. MCPNet autonomously learns meaningful
+concept prototypes across multiple feature map levels using Centered Kernel
+Alignment (CKA) loss and an energy-based weighted PCA mechanism, and it does so
+without reliance on predefined concept labels. Further, we propose a novel
+classifier paradigm that learns and aligns multi-level concept prototype
+distributions for classification purposes via Class-aware Concept Distribution
+(CCD) loss. Our experiments reveal that the proposed MCPNet, while being
+adaptable to various model architectures, offers comprehensive multi-level
+explanations while maintaining classification accuracy. Additionally, its
+concept distribution-based classification approach shows improved
+generalization capabilities in few-shot classification scenarios.
+
+
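+ For reference, below is a small sketch of the standard linear Centered Kernel Alignment
+measure that the CKA loss builds on (Kornblith et al.'s formulation), not the MCPNet loss itself;
+the function name and feature shapes are assumptions.
+
+import torch
+
+def linear_cka(X, Y):
+    """Linear CKA between two activation matrices X: (n, d1), Y: (n, d2); returns a value in [0, 1]."""
+    X = X - X.mean(dim=0, keepdim=True)                 # center features per dimension
+    Y = Y - Y.mean(dim=0, keepdim=True)
+    hsic = (Y.t() @ X).norm(p="fro") ** 2
+    return hsic / ((X.t() @ X).norm(p="fro") * (Y.t() @ Y).norm(p="fro"))
+
+a, b = torch.randn(64, 512), torch.randn(64, 256)
+print(linear_cka(a, b))   # near 0 for unrelated features, 1.0 for identical representations
+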
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Advancements in Point Cloud Data Augmentation for Deep Learning: A + Survey + + +
+ Deep learning (DL) has become one of the mainstream and effective methods for
+point cloud analysis tasks such as detection, segmentation and classification.
+To reduce overfitting when training DL models and to improve model performance,
+especially when the amount and/or diversity of training data is limited,
+augmentation is often crucial. Although various point cloud data augmentation
+methods have been widely used in different point cloud processing tasks, there
+are currently no published systematic surveys or reviews of these methods.
+Therefore, this article surveys these methods, categorizing them into a
+taxonomy framework that comprises basic and specialized point cloud data
+augmentation methods. Through a comprehensive evaluation of these augmentation
+methods, this article identifies their potentials and limitations, serving as a
+useful reference for choosing appropriate augmentation methods. In addition,
+potential directions for future research are recommended. This survey
+contributes to providing a holistic overview of the current state of point
+cloud data augmentation, promoting its wider application and development.
+
+
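+ As a concrete illustration of the "basic" augmentations such a taxonomy typically covers
+(not code from the survey), the sketch below applies a random z-axis rotation, global scaling,
+and Gaussian jitter to a point cloud; the function name and parameter ranges are assumptions.
+
+import numpy as np
+
+def augment_point_cloud(points, angle_range=np.pi, scale_range=(0.9, 1.1), sigma=0.01):
+    """Basic point cloud augmentations: random z-rotation, scaling and jitter; points: (N, 3)."""
+    theta = np.random.uniform(-angle_range, angle_range)
+    rot = np.array([[np.cos(theta), -np.sin(theta), 0.0],
+                    [np.sin(theta),  np.cos(theta), 0.0],
+                    [0.0,            0.0,           1.0]])
+    scale = np.random.uniform(*scale_range)
+    jitter = np.random.normal(0.0, sigma, points.shape)
+    return points @ rot.T * scale + jitter
+
+cloud = np.random.rand(2048, 3)
+aug = augment_point_cloud(cloud)
+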
+
+ comment: Accepted by Pattern Recognition +
+
+
+
+
+ + ♻ ☆ DLoRA-TrOCR: Mixed Text Mode Optical Character Recognition Based On + Transformer + + +
+ With the continuous development of Optical Character Recognition (OCR) and
+the expansion of application fields, text recognition in complex scenes has
+become a key challenge. Factors such as multiple fonts, mixed scenes and
+complex layouts seriously affect the recognition accuracy of traditional OCR
+models. Although OCR models based on deep learning have performed well in
+specific fields or similar datasets in recent years, the generalization ability
+and robustness of the model are still a big challenge when facing complex
+environments with multiple scenes. Furthermore, training an OCR model from
+scratch or fine-tuning all parameters is very demanding on computing resources
+and inference time, which limits the flexibility of its application. In
+response to the challenges mentioned above, this study focuses on a fundamental
+aspect of mixed text recognition: effectively fine-tuning a pre-trained base
+OCR model so that it achieves exceptional performance across various downstream
+tasks. To this end, we propose a parameter-efficient mixed text recognition
+method based on a pre-trained OCR Transformer, namely DLoRA-TrOCR. This method
+embeds DoRA into the image encoder and LoRA into the internal structure of the
+text decoder, enabling efficient parameter fine-tuning for downstream tasks.
+Experiments show that compared to similar parameter adjustment methods, our
+model DLoRA-TrOCR has the smallest number of parameters and performs better. It
+can achieve state-of-the-art performance on complex scene datasets involving
+simultaneous recognition of mixed handwritten, printed and street view texts.
+
+
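+ To make the LoRA idea concrete, here is a minimal, generic low-rank adapter around a frozen
+linear layer (not the DLoRA-TrOCR code, and DoRA's weight decomposition is not shown); the class
+name, rank, and scaling are assumptions.
+
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    """Wrap a frozen linear layer with a trainable low-rank update: W x + (B A) x * scale."""
+    def __init__(self, base: nn.Linear, r=8, alpha=16):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():
+            p.requires_grad = False                     # only the adapter is trained
+        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
+        self.B = nn.Parameter(torch.zeros(base.out_features, r))  # zero init: no change at start
+        self.scale = alpha / r
+
+    def forward(self, x):
+        return self.base(x) + (x @ self.A.t() @ self.B.t()) * self.scale
+
+layer = LoRALinear(nn.Linear(768, 768))
+y = layer(torch.randn(4, 16, 768))                      # same output shape as the base layer
+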
+
+
+
+
+ + ♻ ☆ Decoupled Pseudo-labeling for Semi-Supervised Monocular 3D Object + Detection CVPR2024 + + +
+ We delve into pseudo-labeling for semi-supervised monocular 3D object
+detection (SSM3OD) and discover two primary issues: a misalignment between the
+prediction quality of 3D and 2D attributes and the tendency of depth
+supervision derived from pseudo-labels to be noisy, leading to significant
+optimization conflicts with other reliable forms of supervision. We introduce a
+novel decoupled pseudo-labeling (DPL) approach for SSM3OD. Our approach
+features a Decoupled Pseudo-label Generation (DPG) module, designed to
+efficiently generate pseudo-labels by separately processing 2D and 3D
+attributes. This module incorporates a unique homography-based method for
+identifying dependable pseudo-labels in BEV space, specifically for 3D
+attributes. Additionally, we present a Depth Gradient Projection (DGP) module
+to mitigate optimization conflicts caused by noisy depth supervision of
+pseudo-labels, effectively decoupling the depth gradient and removing
+conflicting gradients. This dual decoupling strategy, at both the pseudo-label
+generation and gradient levels, significantly improves the utilization of
+pseudo-labels in SSM3OD. Our comprehensive experiments on the KITTI benchmark
+demonstrate the superiority of our method over existing approaches.
+
+
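+ The generic idea of removing a conflicting gradient component (in the spirit of PCGrad-style
+projection, not necessarily the paper's exact DGP rule) can be sketched as follows; the function
+name and the flattened-gradient interface are assumptions.
+
+import torch
+
+def project_conflicting(g_main, g_depth):
+    """If the depth gradient opposes the main gradient, project out the conflicting component."""
+    dot = torch.dot(g_depth, g_main)
+    if dot < 0:
+        g_depth = g_depth - dot / (g_main.norm() ** 2 + 1e-12) * g_main
+    return g_depth
+
+g1 = torch.tensor([1.0, 0.0])
+g2 = torch.tensor([-1.0, 1.0])
+print(project_conflicting(g1, g2))  # tensor([0., 1.]): the opposing part is removed
+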
+
+ comment: To appear in CVPR2024 +
+
+
+
+
+ + ♻ ☆ Leveraging Fine-Grained Information and Noise Decoupling for Remote + Sensing Change Detection + + +
+ Change detection aims to identify changes in remote sensing objects by
+analyzing bitemporal image pairs. Due to the large temporal and spatial span
+of data collection in change detection image pairs, there is often a
+significant amount of task-specific and task-agnostic noise. Previous efforts
+have focused excessively on denoising, at the cost of losing a great deal of
+fine-grained information. In this paper, we revisit the importance of
+fine-grained features in change detection and propose a series of operations
+for fine-grained information compensation and noise decoupling (FINO). First,
+the context is utilized to compensate for the fine-grained information in the
+feature space. Next, a shape-aware and a brightness-aware module are designed
+to improve the capacity for representation learning. The shape-aware module
+guides the backbone network toward more precise shape estimation and the
+extraction of object shape features. The brightness-aware module learns an
+overall brightness estimation to improve the model's robustness to
+task-agnostic noise. Finally, a task-specific noise decoupling structure is
+designed as a way to improve the model's ability to separate noise interference
+from feature similarity. With these training schemes, our proposed method
+achieves new state-of-the-art (SOTA) results on multiple change detection
+benchmarks. The code will be made available.
+
+
+
+
+
+
+ + ♻ ☆ Enhancing Representations through Heterogeneous Self-Supervised Learning + + +
+ Incorporating heterogeneous representations from different architectures has +facilitated various vision tasks, e.g., some hybrid networks combine +transformers and convolutions. However, complementarity between such +heterogeneous architectures has not been well exploited in self-supervised +learning. Thus, we propose Heterogeneous Self-Supervised Learning (HSSL), which +enforces a base model to learn from an auxiliary head whose architecture is +heterogeneous from the base model. In this process, HSSL endows the base model +with new characteristics in a representation learning way without structural +changes. To comprehensively understand the HSSL, we conduct experiments on +various heterogeneous pairs containing a base model and an auxiliary head. We +discover that the representation quality of the base model moves up as their +architecture discrepancy grows. This observation motivates us to propose a +search strategy that quickly determines the most suitable auxiliary head for a +specific base model to learn and several simple but effective methods to +enlarge the model discrepancy. The HSSL is compatible with various +self-supervised methods, achieving superior performances on various downstream +tasks, including image classification, semantic segmentation, instance +segmentation, and object detection. Our source code will be made publicly +available. + +
+
+
+
+
+ + ♻ ☆ Single-temporal Supervised Remote Change Detection for Domain + Generalization + + +
+ Change detection is widely applied in remote sensing image analysis. Existing
+methods require training models separately for each dataset, which leads to
+poor domain generalization. Moreover, these methods rely heavily on large
+amounts of high-quality pair-labelled data for training, which is expensive and
+impractical. In this paper, we propose a multimodal contrastive learning method
+(ChangeCLIP) based on visual-language pre-training for change detection domain
+generalization. Additionally, we propose a dynamic context optimization for
+prompt learning. Meanwhile, to address the data dependency issue of existing
+methods, we introduce a single-temporal and controllable AI-generated training
+strategy (SAIN). This allows us to train the model using a large number of
+single-temporal images without image pairs in the real world, achieving
+excellent generalization. Extensive experiments on a series of real change
+detection datasets validate the superiority and strong generalization of
+ChangeCLIP, outperforming state-of-the-art change detection methods. Code will
+be available.
+
+
+
+
+
+
+ + ♻ ☆ Latent-based Diffusion Model for Long-tailed Recognition CVPR2024 + + +
+ Long-tailed imbalance distribution is a common issue in practical computer
+vision applications. Previous works proposed methods to address this problem,
+which can be categorized into several classes: re-sampling, re-weighting,
+transfer learning, and feature augmentation. In recent years, diffusion models
+have shown an impressive generation ability in many sub-problems of deep
+computer vision. However, their powerful generative ability has not been
+explored in long-tailed problems. We propose a new approach, the Latent-based
+Diffusion Model for Long-tailed Recognition (LDMLR), as a feature augmentation
+method to tackle the issue. First, we encode the imbalanced dataset into
+features using the baseline model. Then, we train a Denoising Diffusion
+Implicit Model (DDIM) using these encoded features to generate pseudo-features.
+Finally, we train the classifier using the encoded and pseudo-features from the
+previous two steps. With the proposed method, the model's accuracy improves on
+the CIFAR-LT and ImageNet-LT datasets.
+
+
+
+ comment: 8 pages, 3 figures. Accepted by L3DIVU-CVPR2024 +
+
+
+
+
+ + ♻ ☆ PeLiCal: Targetless Extrinsic Calibration via Penetrating Lines for + RGB-D Cameras with Limited Co-visibility + + +
+ RGB-D cameras are crucial in robotic perception, given their ability to +produce images augmented with depth data. However, their limited FOV often +requires multiple cameras to cover a broader area. In multi-camera RGB-D +setups, the goal is typically to reduce camera overlap, optimizing spatial +coverage with as few cameras as possible. The extrinsic calibration of these +systems introduces additional complexities. Existing methods for extrinsic +calibration either necessitate specific tools or highly depend on the accuracy +of camera motion estimation. To address these issues, we present PeLiCal, a +novel line-based calibration approach for RGB-D camera systems exhibiting +limited overlap. Our method leverages long line features from surroundings, and +filters out outliers with a novel convergence voting algorithm, achieving +targetless, real-time, and outlier-robust performance compared to existing +methods. We open source our implementation on +https://github.com/joomeok/PeLiCal.git. + +
+
+
+
+
+ + ♻ ☆ Point Clouds Are Specialized Images: A Knowledge Transfer Approach for + 3D Understanding + + +
+ Self-supervised representation learning (SSRL) has gained increasing +attention in point cloud understanding, in addressing the challenges posed by +3D data scarcity and high annotation costs. This paper presents PCExpert, a +novel SSRL approach that reinterprets point clouds as "specialized images". +This conceptual shift allows PCExpert to leverage knowledge derived from +large-scale image modality in a more direct and deeper manner, via extensively +sharing the parameters with a pre-trained image encoder in a multi-way +Transformer architecture. The parameter sharing strategy, combined with a novel +pretext task for pre-training, i.e., transformation estimation, empowers +PCExpert to outperform the state of the arts in a variety of tasks, with a +remarkable reduction in the number of trainable parameters. Notably, PCExpert's +performance under LINEAR fine-tuning (e.g., yielding a 90.02% overall accuracy +on ScanObjectNN) has already approached the results obtained with FULL model +fine-tuning (92.66%), demonstrating its effective and robust representation +capability. + +
+
+
+
+
+ + ♻ ☆ TransPose: 6D Object Pose Estimation with Geometry-Aware Transformer + + +
+ Estimating the 6D object pose is an essential task in many applications. Due
+to the lack of depth information, existing RGB-based methods are sensitive to
+occlusion and illumination changes. How to extract and utilize the geometry
+features in depth information is crucial to achieve accurate predictions. To
+this end, we propose TransPose, a novel 6D pose framework that exploits a
+Transformer encoder with a geometry-aware module to better learn point cloud
+feature representations. Specifically, we first uniformly sample the point
+cloud and extract local geometry features with a designed local feature
+extractor based on a graph convolutional network. To improve robustness to
+occlusion, we adopt a Transformer to perform the exchange of global
+information, making each local feature contain global information. Finally, we
+introduce a geometry-aware module into the Transformer encoder, which forms an
+effective constraint for point cloud feature learning and makes the global
+information exchange more tightly coupled with point cloud tasks. Extensive
+experiments indicate the effectiveness of TransPose; our pose estimation
+pipeline achieves competitive results on three benchmark datasets.
+
+
+
+ comment: Accepted by NEUROCOMPUTING +
+
+
+
+
+ + ♻ ☆ FlowVQTalker: High-Quality Emotional Talking Face Generation through + Normalizing Flow and Quantization + + +
+ Generating emotional talking faces is a practical yet challenging endeavor. +To create a lifelike avatar, we draw upon two critical insights from a human +perspective: 1) The connection between audio and the non-deterministic facial +dynamics, encompassing expressions, blinks, poses, should exhibit synchronous +and one-to-many mapping. 2) Vibrant expressions are often accompanied by +emotion-aware high-definition (HD) textures and finely detailed teeth. However, +both aspects are frequently overlooked by existing methods. To this end, this +paper proposes using normalizing Flow and Vector-Quantization modeling to +produce emotional talking faces that satisfy both insights concurrently +(FlowVQTalker). Specifically, we develop a flow-based coefficient generator +that encodes the dynamics of facial emotion into a multi-emotion-class latent +space represented as a mixture distribution. The generation process commences +with random sampling from the modeled distribution, guided by the accompanying +audio, enabling both lip-synchronization and the uncertain nonverbal facial +cues generation. Furthermore, our designed vector-quantization image generator +treats the creation of expressive facial images as a code query task, utilizing +a learned codebook to provide rich, high-quality textures that enhance the +emotional perception of the results. Extensive experiments are conducted to +showcase the effectiveness of our approach. + +
+
+ comment: 11 pages, 11 figures, conference +
+
+
+
+
+ + ♻ ☆ Fixation-based Self-calibration for Eye Tracking in VR Headsets + + +
+ This study proposes a novel self-calibration method for eye tracking in a
+virtual reality (VR) headset. The proposed method is based on the assumptions
+that the user's viewpoint can freely move and that the points of regard (PoRs)
+from different viewpoints are distributed within a small area on an object
+surface during visual fixation. In the method, fixations are first detected
+from the time-series data of uncalibrated gaze directions using an extension of
+the I-VDT (velocity and dispersion threshold identification) algorithm to a
+three-dimensional (3D) scene. Then, the calibration parameters are optimized by
+minimizing the sum of a dispersion metric of the PoRs. The proposed method can
+potentially identify the optimal calibration parameters representing the
+user-dependent offset from the optical axis to the visual axis without explicit
+user calibration, image processing, or marker-substitute objects. For the gaze
+data of 18 participants walking in two VR environments with many occlusions,
+the proposed method achieved an accuracy of 2.1$^\circ$, which was
+significantly lower than the average offset. Our method is the first
+self-calibration method with an average error lower than 3$^\circ$ in 3D
+environments. Further, the accuracy of the proposed method can be improved by
+up to 1.2$^\circ$ by refining the fixation detection or optimization algorithm.
+
+
+
+
+
+
+ + ♻ ☆ Feature Imitating Networks Enhance The Performance, Reliability And + Speed Of Deep Learning On Biomedical Image Processing Tasks + + +
+ Feature-Imitating-Networks (FINs) are neural networks that are first trained +to approximate closed-form statistical features (e.g. Entropy), and then +embedded into other networks to enhance their performance. In this work, we +perform the first evaluation of FINs for biomedical image processing tasks. We +begin by training a set of FINs to imitate six common radiomics features, and +then compare the performance of larger networks (with and without embedding the +FINs) for three experimental tasks: COVID-19 detection from CT scans, brain +tumor classification from MRI scans, and brain-tumor segmentation from MRI +scans. We found that models embedded with FINs provided enhanced performance +for all three tasks when compared to baseline networks without FINs, even when +those baseline networks had more parameters. Additionally, we found that models +embedded with FINs converged faster and more consistently compared to baseline +networks with similar or greater representational capacity. The results of our +experiments provide evidence that FINs may offer state-of-the-art performance +for a variety of other biomedical image processing tasks. + +
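+ A toy illustration of the FIN pre-training step described above (not the paper's radiomics
+features or architecture): a small MLP is trained to imitate the closed-form Shannon entropy of
+normalized 16-bin histograms before being embedded into a larger network. All names and sizes
+are assumptions.
+
+import torch
+import torch.nn as nn
+
+def entropy(p, eps=1e-8):
+    """Shannon entropy of probability vectors, the closed-form feature to imitate."""
+    return -(p * (p + eps).log()).sum(dim=1, keepdim=True)
+
+fin = nn.Sequential(nn.Linear(16, 64), nn.ReLU(),
+                    nn.Linear(64, 64), nn.ReLU(),
+                    nn.Linear(64, 1))
+opt = torch.optim.Adam(fin.parameters(), lr=1e-3)
+
+for step in range(2000):
+    hist = torch.rand(256, 16)
+    hist = hist / hist.sum(dim=1, keepdim=True)        # random probability vectors
+    loss = nn.functional.mse_loss(fin(hist), entropy(hist))
+    opt.zero_grad(); loss.backward(); opt.step()
+
+# After pre-training, `fin` can be embedded (frozen or fine-tuned) inside a larger model.
+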
+
+
+
+
+ + ♻ ☆ Leveraging Systematic Knowledge of 2D Transformations + + +
+ Existing deep learning models suffer from an out-of-distribution (o.o.d.)
+performance drop in computer vision tasks. In comparison, humans have a
+remarkable ability to interpret images, even if the scenes in the images are
+rare, thanks to the systematicity of acquired knowledge. This work focuses on
+1) the acquisition of systematic knowledge of 2D transformations, and 2)
+architectural components that can leverage the learned knowledge in image
+classification tasks in an o.o.d. setting. With a new training methodology
+based on synthetic datasets that are constructed under the causal framework,
+the deep neural networks acquire knowledge from semantically different domains
+(e.g. even from noise), and exhibit a certain level of systematicity in
+parameter estimation experiments. Based on this, a novel architecture is
+devised consisting of a classifier, an estimator and an identifier (abbreviated
+as "CED"). By emulating the "hypothesis-verification" process in human visual
+perception, CED improves the classification accuracy significantly on test sets
+under covariate shift.
+
+
+
+
+
+
+ + ♻ ☆ A Dataset and Model for Realistic License Plate Deblurring IJCAI 2024 + + +
+ Vehicle license plate recognition is a crucial task in intelligent traffic +management systems. However, the challenge of achieving accurate recognition +persists due to motion blur from fast-moving vehicles. Despite the widespread +use of image synthesis approaches in existing deblurring and recognition +algorithms, their effectiveness in real-world scenarios remains unproven. To +address this, we introduce the first large-scale license plate deblurring +dataset named License Plate Blur (LPBlur), captured by a dual-camera system and +processed through a post-processing pipeline to avoid misalignment issues. +Then, we propose a License Plate Deblurring Generative Adversarial Network +(LPDGAN) to tackle the license plate deblurring: 1) a Feature Fusion Module to +integrate multi-scale latent codes; 2) a Text Reconstruction Module to restore +structure through textual modality; 3) a Partition Discriminator Module to +enhance the model's perception of details in each letter. Extensive experiments +validate the reliability of the LPBlur dataset for both model training and +testing, showcasing that our proposed model outperforms other state-of-the-art +motion deblurring methods in realistic license plate deblurring scenarios. The +dataset and code are available at https://github.com/haoyGONG/LPDGAN. + +
+
+ comment: Accepted by IJCAI 2024 +
+
+
+
+
+ + ♻ ☆ Improved Cryo-EM Pose Estimation and 3D Classification through + Latent-Space Disentanglement + + +
+ Due to the extremely low signal-to-noise ratio (SNR) and unknown poses +(projection angles and image shifts) in cryo-electron microscopy (cryo-EM) +experiments, reconstructing 3D volumes from 2D images is very challenging. In +addition to these challenges, heterogeneous cryo-EM reconstruction requires +conformational classification. In popular cryo-EM reconstruction algorithms, +poses and conformation classification labels must be predicted for every input +cryo-EM image, which can be computationally costly for large datasets. An +emerging class of methods adopted the amortized inference approach. In these +methods, only a subset of the input dataset is needed to train neural networks +for the estimation of poses and conformations. Once trained, these neural +networks can make pose/conformation predictions and 3D reconstructions at low +cost for the entire dataset during inference. Unfortunately, when facing +heterogeneous reconstruction tasks, it is hard for current +amortized-inference-based methods to effectively estimate the conformational +distribution and poses from entangled latent variables. Here, we propose a +self-supervised variational autoencoder architecture called "HetACUMN" based on +amortized inference. We employed an auxiliary conditional pose prediction task +by inverting the order of encoder-decoder to explicitly enforce the +disentanglement of conformation and pose predictions. Results on simulated +datasets show that HetACUMN generated more accurate conformational +classifications than other amortized or non-amortized methods. Furthermore, we +show that HetACUMN is capable of performing heterogeneous 3D reconstructions of +a real experimental dataset. + +
+
+ comment: 21 pages +
+
+
+
+
+ + ♻ ☆ Accelerating Image Generation with Sub-path Linear Approximation Model + + +
+ Diffusion models have significantly advanced the state of the art in image, +audio, and video generation tasks. However, their applications in practical +scenarios are hindered by slow inference speed. Drawing inspiration from the +approximation strategies utilized in consistency models, we propose the +Sub-path Linear Approximation Model (SLAM), which accelerates diffusion models +while maintaining high-quality image generation. SLAM treats the PF-ODE +trajectory as a series of PF-ODE sub-paths divided by sampled points, and +harnesses sub-path linear (SL) ODEs to form a progressive and continuous error +estimation along each individual PF-ODE sub-path. The optimization on such +SL-ODEs allows SLAM to construct denoising mappings with smaller cumulative +approximated errors. An efficient distillation method is also developed to +facilitate the incorporation of more advanced diffusion models, such as latent +diffusion models. Our extensive experimental results demonstrate that SLAM +achieves an efficient training regimen, requiring only 6 A100 GPU days to +produce a high-quality generative model capable of 2 to 4-step generation with +high performance. Comprehensive evaluations on LAION, MS COCO 2014, and MS COCO +2017 datasets also illustrate that SLAM surpasses existing acceleration methods +in few-step generation tasks, achieving state-of-the-art performance both on +FID and the quality of the generated images. + +
+
+
+
+
+ + ♻ ☆ Learning to Recover Spectral Reflectance from RGB Images + + +
+ This paper tackles spectral reflectance recovery (SRR) from RGB images. Since +capturing ground-truth spectral reflectance and camera spectral sensitivity are +challenging and costly, most existing approaches are trained on synthetic +images and utilize the same parameters for all unseen testing images, which are +suboptimal especially when the trained models are tested on real images because +they never exploit the internal information of the testing images. To address +this issue, we adopt a self-supervised meta-auxiliary learning (MAXL) strategy +that fine-tunes the well-trained network parameters with each testing image to +combine external with internal information. To the best of our knowledge, this +is the first work that successfully adapts the MAXL strategy to this problem. +Instead of relying on naive end-to-end training, we also propose a novel +architecture that integrates the physical relationship between the spectral +reflectance and the corresponding RGB images into the network based on our +mathematical analysis. Besides, since the spectral reflectance of a scene is +independent to its illumination while the corresponding RGB images are not, we +recover the spectral reflectance of a scene from its RGB images captured under +multiple illuminations to further reduce the unknown. Qualitative and +quantitative evaluations demonstrate the effectiveness of our proposed network +and of the MAXL. Our code and data are available at +https://github.com/Dong-Huo/SRR-MAXL. + +
+
+ comment: IEEE Transactions on Image Processing (TIP), 2024 +
+
+
+
+
+ + ♻ ☆ Learning Disentangled Identifiers for Action-Customized Text-to-Image + Generation CVPR 2024 + + +
+ This study focuses on a novel task in text-to-image (T2I) generation, namely +action customization. The objective of this task is to learn the co-existing +action from limited data and generalize it to unseen humans or even animals. +Experimental results show that existing subject-driven customization methods +fail to learn the representative characteristics of actions and struggle in +decoupling actions from context features, including appearance. To overcome the +preference for low-level features and the entanglement of high-level features, +we propose an inversion-based method Action-Disentangled Identifier (ADI) to +learn action-specific identifiers from the exemplar images. ADI first expands +the semantic conditioning space by introducing layer-wise identifier tokens, +thereby increasing the representational richness while distributing the +inversion across different features. Then, to block the inversion of +action-agnostic features, ADI extracts the gradient invariance from the +constructed sample triples and masks the updates of irrelevant channels. To +comprehensively evaluate the task, we present an ActionBench that includes a +variety of actions, each accompanied by meticulously selected samples. Both +quantitative and qualitative results show that our ADI outperforms existing +baselines in action-customized T2I generation. Our project page is at +https://adi-t2i.github.io/ADI. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Gradient-Regularized Out-of-Distribution Detection + + +
+ One of the challenges for neural networks in real-life applications is the
+overconfident errors these models make when the data is not from the original
+training distribution. Addressing this issue is known as Out-of-Distribution
+(OOD) detection. Many state-of-the-art OOD methods employ an auxiliary dataset
+as a surrogate for OOD data during training to achieve improved performance.
+However, these methods fail to fully exploit the local information embedded in
+the auxiliary dataset. In this work, we propose the idea of leveraging the
+information embedded in the gradient of the loss function during training to
+enable the network to not only learn a desired OOD score for each sample but
+also to exhibit similar behavior in a local neighborhood around each sample. We
+also develop a novel energy-based sampling method to allow the network to be
+exposed to more informative OOD samples during the training phase. This is
+especially important when the auxiliary dataset is large. We demonstrate the
+effectiveness of our method through extensive experiments on several OOD
+benchmarks, improving the existing state-of-the-art FPR95 by 4% on our ImageNet
+experiment. We further provide a theoretical analysis through the lens of
+certified robustness and Lipschitz analysis to showcase the theoretical
+foundation of our work. We will publicly release our code after the review
+process.
+
+
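+ For context, the standard energy-based OOD score that such methods typically build on can be
+computed as below; this is the common logsumexp formulation, not the paper's gradient-regularized
+objective or its energy-based sampling scheme. Names and thresholds are assumptions.
+
+import torch
+
+def energy_score(logits, T=1.0):
+    """Energy-based OOD score from classifier logits; lower energy suggests in-distribution."""
+    return -T * torch.logsumexp(logits / T, dim=1)
+
+logits = torch.randn(8, 1000)           # (batch, num_classes)
+scores = energy_score(logits)
+is_ood = scores > scores.mean()         # in practice, the threshold is set e.g. at 95% TPR on ID data
+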
+
+ comment: Under review +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 156 + +
+
+
+ + ☆ AutoAD III: The Prequel -- Back to the Pixels CVPR2024 + + +
+ Generating Audio Description (AD) for movies is a challenging task that
+requires fine-grained visual understanding and an awareness of the characters
+and their names. Currently, visual language models for AD generation are
+limited by a lack of suitable training data, and their evaluation is hampered
+by performance measures that are not specialized to the AD domain. In
+this paper, we make three contributions: (i) We propose two approaches for
+constructing AD datasets with aligned video data, and build training and
+evaluation datasets using these. These datasets will be publicly released; (ii)
+We develop a Q-former-based architecture which ingests raw video and generates
+AD, using frozen pre-trained visual encoders and large language models; and
+(iii) We provide new evaluation metrics to benchmark AD quality that are
+well-matched to human performance. Taken together, we improve the state of the
+art on AD generation.
+
+
+
+ comment: CVPR2024. Project page: + https://www.robots.ox.ac.uk/~vgg/research/autoad/ +
+
+
+
+
+ + ☆ Guess The Unseen: Dynamic 3D Scene Reconstruction from Partial 2D + Glimpses + + +
+ In this paper, we present a method to reconstruct the world and multiple
+dynamic humans in 3D from a monocular video input. As a key idea, we represent
+both the world and multiple humans via the recently emerging 3D Gaussian
+Splatting (3D-GS) representation, enabling us to conveniently and efficiently
+compose and render them together. In particular, we address the scenarios with
+severely limited and sparse observations in 3D human reconstruction, a common
+challenge encountered in the real world. To tackle this challenge, we introduce
+a novel approach to optimize the 3D-GS representation in a canonical space by
+fusing the sparse cues in the common space, where we leverage a pre-trained 2D
+diffusion model to synthesize unseen views while keeping the consistency with
+the observed 2D appearances. We demonstrate our method can reconstruct
+high-quality animatable 3D humans in various challenging examples, in the
+presence of occlusion, image crops, few-shot, and extremely sparse
+observations. After reconstruction, our method is capable of not only rendering
+the scene from any novel view at arbitrary time instances, but also editing the
+3D scene by removing individual humans or applying different motions for each
+human. Through various experiments, we demonstrate the quality and efficiency
+of our method over alternative existing approaches.
+
+
+
+ comment: The project page is available at https://snuvclab.github.io/gtu/ +
+
+
+
+
+ + ☆ CrossScore: Towards Multi-View Image Evaluation and Scoring + + +
+ We introduce a novel cross-reference image quality assessment method that +effectively fills the gap in the image assessment landscape, complementing the +array of established evaluation schemes -- ranging from full-reference metrics +like SSIM, no-reference metrics such as NIQE, to general-reference metrics +including FID, and Multi-modal-reference metrics, e.g., CLIPScore. Utilising a +neural network with the cross-attention mechanism and a unique data collection +pipeline from NVS optimisation, our method enables accurate image quality +assessment without requiring ground truth references. By comparing a query +image against multiple views of the same scene, our method addresses the +limitations of existing metrics in novel view synthesis (NVS) and similar tasks +where direct reference images are unavailable. Experimental results show that +our method is closely correlated to the full-reference metric SSIM, while not +requiring ground truth references. + +
+
+ comment: Project page see https://crossscore.active.vision +
+
+
+
+
+ + ☆ Hyp-OC: Hyperbolic One Class Classification for Face Anti-Spoofing + + +
+ Face recognition technology has become an integral part of modern security +systems and user authentication processes. However, these systems are +vulnerable to spoofing attacks and can easily be circumvented. Most prior +research in face anti-spoofing (FAS) approaches it as a two-class +classification task where models are trained on real samples and known spoof +attacks and tested for detection performance on unknown spoof attacks. However, +in practice, FAS should be treated as a one-class classification task where, +while training, one cannot assume any knowledge regarding the spoof samples a +priori. In this paper, we reformulate the face anti-spoofing task from a +one-class perspective and propose a novel hyperbolic one-class classification +framework. To train our network, we use a pseudo-negative class sampled from +the Gaussian distribution with a weighted running mean and propose two novel +loss functions: (1) Hyp-PC: Hyperbolic Pairwise Confusion loss, and (2) Hyp-CE: +Hyperbolic Cross Entropy loss, which operate in the hyperbolic space. +Additionally, we employ Euclidean feature clipping and gradient clipping to +stabilize the training in the hyperbolic space. To the best of our knowledge, +this is the first work extending hyperbolic embeddings for face anti-spoofing +in a one-class manner. With extensive experiments on five benchmark datasets: +Rose-Youtu, MSU-MFSD, CASIA-MFSD, Idiap Replay-Attack, and OULU-NPU, we +demonstrate that our method significantly outperforms the state-of-the-art, +achieving better spoof detection performance. + +
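To make the pseudo-negative idea above concrete, here is a minimal sketch in PyTorch of sampling pseudo-negatives from a Gaussian centred on a weighted running mean of the real features; the momentum, feature dimension, and standard deviation are illustrative assumptions, not the paper's settings, and the hyperbolic Hyp-PC/Hyp-CE losses themselves are not reproduced.

```python
import torch

def sample_pseudo_negatives(real_feats, running_mean, momentum=0.9, std=1.0):
    """Sample a pseudo-negative batch from a Gaussian centred on a weighted
    running mean of the real (bona fide) features, as sketched in the abstract.
    `momentum` and `std` are illustrative values, not the paper's settings."""
    # Update the weighted running mean with the current batch statistics.
    batch_mean = real_feats.mean(dim=0)
    running_mean = momentum * running_mean + (1.0 - momentum) * batch_mean
    # Draw pseudo-negative features around the running mean.
    noise = torch.randn_like(real_feats) * std
    pseudo_neg = running_mean.unsqueeze(0) + noise
    return pseudo_neg, running_mean

# Usage: features of shape (batch, dim) from the encoder of real faces.
feats = torch.randn(32, 256)
mean = torch.zeros(256)
neg, mean = sample_pseudo_negatives(feats, mean)
```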
+
+ comment: Accepted in FG2024, Project Page - + https://kartik-3004.github.io/hyp-oc/ +
+
+
+
+
+ + ☆ GeoDiffuser: Geometry-Based Image Editing with Diffusion Models + + +
+ The success of image generative models has enabled us to build methods that +can edit images based on text or other user input. However, these methods are +bespoke, imprecise, require additional information, or are limited to only 2D +image edits. We present GeoDiffuser, a zero-shot optimization-based method that +unifies common 2D and 3D image-based object editing capabilities into a single +method. Our key insight is to view image editing operations as geometric +transformations. We show that these transformations can be directly +incorporated into the attention layers in diffusion models to implicitly +perform editing operations. Our training-free optimization method uses an +objective function that seeks to preserve object style but generate plausible +images, for instance with accurate lighting and shadows. It also inpaints +disoccluded parts of the image where the object was originally located. Given a +natural image and user input, we segment the foreground object using SAM and +estimate a corresponding transform which is used by our optimization approach +for editing. GeoDiffuser can perform common 2D and 3D edits like object +translation, 3D rotation, and removal. We present quantitative results, +including a perceptual study, that shows how our approach is better than +existing methods. Visit https://ivl.cs.brown.edu/research/geodiffuser.html for +more information. + +
+
+
+
+
+ + ☆ SEED-X: Multimodal Models with Unified Multi-granularity Comprehension + and Generation + + +
+ The rapid evolution of multimodal foundation models has demonstrated +significant progress in vision-language understanding and generation, e.g., +our previous work SEED-LLaMA. However, there remains a gap between its +capability and real-world applicability, primarily due to the model's +limited capacity to effectively respond to various user instructions and +interact with diverse visual data. In this work, we focus on bridging this gap +through integrating two enhanced features: (1) comprehending images of +arbitrary sizes and ratios, and (2) enabling multi-granularity image +generation. We present a unified and versatile foundation model, namely, +SEED-X, which is able to model multi-granularity visual semantics for +comprehension and generation tasks. Besides the competitive results on public +benchmarks, SEED-X demonstrates its effectiveness in handling real-world +applications across various domains after instruction tuning. We hope that our +work will inspire future research into what can be achieved by versatile +multimodal foundation models in real-world applications. The models, code, and +datasets will be released at https://github.com/AILab-CVC/SEED-X. + +
+
+ comment: Project released at: https://github.com/AILab-CVC/SEED-X +
+
+
+
+
+ + ☆ A Multimodal Automated Interpretability Agent + + +
+ This paper describes MAIA, a Multimodal Automated Interpretability Agent. +MAIA is a system that uses neural models to automate neural model understanding +tasks like feature interpretation and failure mode discovery. It equips a +pre-trained vision-language model with a set of tools that support iterative +experimentation on subcomponents of other models to explain their behavior. +These include tools commonly used by human interpretability researchers: for +synthesizing and editing inputs, computing maximally activating exemplars from +real-world datasets, and summarizing and describing experimental results. +Interpretability experiments proposed by MAIA compose these tools to describe +and explain system behavior. We evaluate applications of MAIA to computer +vision models. We first characterize MAIA's ability to describe (neuron-level) +features in learned representations of images. Across several trained models +and a novel dataset of synthetic vision neurons with paired ground-truth +descriptions, MAIA produces descriptions comparable to those generated by +expert human experimenters. We then show that MAIA can aid in two additional +interpretability tasks: reducing sensitivity to spurious features, and +automatically identifying inputs likely to be mis-classified. + +
+
+ comment: 25 pages, 13 figures +
+
+
+
+
+ + ☆ STROOBnet Optimization via GPU-Accelerated Proximal Recurrence + Strategies + + +
+ Spatiotemporal networks' observational capabilities are crucial for accurate +data gathering and informed decisions across multiple sectors. This study +focuses on the Spatiotemporal Ranged Observer-Observable Bipartite Network +(STROOBnet), linking observational nodes (e.g., surveillance cameras) to events +within defined geographical regions, enabling efficient monitoring. Using data +from Real-Time Crime Camera (RTCC) systems and Calls for Service (CFS) in New +Orleans, where RTCC combats rising crime amidst reduced police presence, we +address the network's initial observational imbalances. Aiming for uniform +observational efficacy, we propose the Proximal Recurrence approach. It +outperformed traditional clustering methods like k-means and DBSCAN by offering +holistic event frequency and spatial consideration, enhancing observational +coverage. + +
+
+ comment: 10 pages, 17 figures, 2023 IEEE International Conference on Big Data + (BigData) +
+
+
+
+
+ + ☆ TAVGBench: Benchmarking Text to Audible-Video Generation + + +
+ The Text to Audible-Video Generation (TAVG) task involves generating videos +with accompanying audio based on text descriptions. Achieving this requires +skillful alignment of both audio and video elements. To support research in +this field, we have developed a comprehensive Text to Audible-Video Generation +Benchmark (TAVGBench), which contains over 1.7 million clips with a total +duration of 11.8 thousand hours. We propose an automatic annotation pipeline to +ensure each audible video has detailed descriptions for both its audio and +video contents. We also introduce the Audio-Visual Harmoni score (AVHScore) to +provide a quantitative measure of the alignment between the generated audio and +video modalities. Additionally, we present a baseline model for TAVG called +TAVDiffusion, which uses a two-stream latent diffusion model to provide a +fundamental starting point for further research in this area. We achieve the +alignment of audio and video by employing cross-attention and contrastive +learning. Through extensive experiments and evaluations on TAVGBench, we +demonstrate the effectiveness of our proposed model under both conventional +metrics and our proposed metrics. + +
+
+ comment: Technical Report. Project + page:https://github.com/OpenNLPLab/TAVGBench +
+
+
+
+
+ + ☆ Graphic Design with Large Multimodal Model + + +
+ In the field of graphic design, automating the integration of design elements +into a cohesive multi-layered artwork not only boosts productivity but also +paves the way for the democratization of graphic design. One existing practice +is Graphic Layout Generation (GLG), which aims to lay out sequential design +elements. It has been constrained by the necessity for a predefined correct +sequence of layers, thus limiting creative potential and increasing user +workload. In this paper, we present Hierarchical Layout Generation (HLG) as a +more flexible and pragmatic setup, which creates graphic compositions from +unordered sets of design elements. To tackle the HLG task, we introduce +Graphist, the first layout generation model based on large multimodal models. +Graphist efficiently reframes HLG as a sequence generation problem, taking +RGB-A images as input and outputting a JSON draft protocol that indicates the +coordinates, size, and order of each element. We develop new evaluation metrics +for HLG. Graphist outperforms prior art and establishes a strong baseline for +this field. Project homepage: https://github.com/graphic-design-ai/graphist + +
+
+
+
+
+ + ☆ Scene Coordinate Reconstruction: Posing of Image Collections via + Incremental Learning of a Relocalizer + + +
+ We address the task of estimating camera parameters from a set of images +depicting a scene. Popular feature-based structure-from-motion (SfM) tools +solve this task by incremental reconstruction: they repeat triangulation of +sparse 3D points and registration of more camera views to the sparse point +cloud. We re-interpret incremental structure-from-motion as an iterated +application and refinement of a visual relocalizer, that is, of a method that +registers new views to the current state of the reconstruction. This +perspective allows us to investigate alternative visual relocalizers that are +not rooted in local feature matching. We show that scene coordinate regression, +a learning-based relocalization approach, allows us to build implicit, neural +scene representations from unposed images. Different from other learning-based +reconstruction methods, we do not require pose priors nor sequential inputs, +and we optimize efficiently over thousands of images. Our method, ACE0 (ACE +Zero), estimates camera poses to an accuracy comparable to feature-based SfM, +as demonstrated by novel view synthesis. Project page: +https://nianticlabs.github.io/acezero/ + +
+
+ comment: Project page: https://nianticlabs.github.io/acezero/ +
+
+
+
+
+ + ☆ Automatic Discovery of Visual Circuits + + +
+ To date, most discoveries of network subcomponents that implement +human-interpretable computations in deep vision models have involved close +study of single units and large amounts of human labor. We explore scalable +methods for extracting the subgraph of a vision model's computational graph +that underlies recognition of a specific visual concept. We introduce a new +method for identifying these subgraphs: specifying a visual concept using a few +examples, and then tracing the interdependence of neuron activations across +layers, or their functional connectivity. We find that our approach extracts +circuits that causally affect model output, and that editing these circuits can +defend large pretrained models from adversarial attacks. + +
+
+ comment: 14 pages, 11 figures +
+
+
+
+
+ + ☆ On-the-Fly Point Annotation for Fast Medical Video Labeling + + +
+ Purpose: In medical research, deep learning models rely on high-quality +annotated data, and annotation is often a laborious and time-consuming process. This is +particularly true for detection tasks where bounding box annotations are +required. The need to adjust two corners makes the process inherently +frame-by-frame. Given the scarcity of experts' time, efficient annotation +methods suitable for clinicians are needed. Methods: We propose an on-the-fly +method for live video annotation to enhance annotation efficiency. In this +approach, a continuous single-point annotation is maintained by keeping the +cursor on the object in a live video, mitigating the need for tedious pausing +and repetitive navigation inherent in traditional annotation methods. This +novel annotation paradigm inherits the point annotation's ability to generate +pseudo-labels using a point-to-box teacher model. We empirically evaluate this +approach by developing a dataset and comparing on-the-fly annotation time +against the traditional annotation method. Results: Using our method, annotation +speed was 3.2x faster than the traditional annotation technique. We achieved a +mean improvement of 6.51 +- 0.98 AP@50 over the conventional method at equivalent +annotation budgets on the developed dataset. Conclusion: Without bells and +whistles, our approach offers a significant speed-up in annotation tasks. It +can be easily implemented on any annotation platform to accelerate the +integration of deep learning in video-based medical research. + +
+
+ comment: 7 pages, 5 figures. Int J CARS (2024) +
+
+
+
+
+ + ☆ Heterogeneous Face Recognition Using Domain Invariant Units ICASSP 2024 + + +
+ Heterogeneous Face Recognition (HFR) aims to expand the applicability of Face +Recognition (FR) systems to challenging scenarios, enabling the matching of +face images across different domains, such as matching thermal images to +visible spectra. However, the development of HFR systems is challenging because +of the significant domain gap between modalities and the lack of availability +of large-scale paired multi-channel data. In this work, we leverage a +pretrained face recognition model as a teacher network to learn domain-invariant +network layers called Domain-Invariant Units (DIU) to reduce the domain gap. +The proposed DIU can be trained effectively even with a limited amount of +paired training data, in a contrastive distillation framework. The proposed +approach has the potential to enhance pretrained models, making them more +adaptable to a wider range of variations in data. We extensively evaluate our +approach on multiple challenging benchmarks, demonstrating superior performance +compared to state-of-the-art methods. + +
+
+ comment: 6 pages, Accepted ICASSP 2024 +
+
+
+
+
+ + ☆ X-Ray: A Sequential 3D Representation for Generation + + +
+ In this paper, we introduce X-Ray, an innovative approach to 3D generation +that employs a new sequential representation, drawing inspiration from the +depth-revealing capabilities of X-Ray scans to meticulously capture both the +external and internal features of objects. Central to our method is the +utilization of ray casting techniques originating from the camera's viewpoint, +meticulously recording the geometric and textural details encountered across +all intersected surfaces. This process efficiently condenses complete objects +or scenes into a multi-frame format, just like videos. Such a structure ensures +the 3D representation is composed solely of critical surface information. +Highlighting the practicality and adaptability of our X-Ray representation, we +showcase its utility in synthesizing 3D objects, employing a network +architecture akin to that used in video diffusion models. The outcomes reveal +our representation's superior performance in enhancing both the accuracy and +efficiency of 3D synthesis, heralding new directions for ongoing research and +practical implementations in the field. + +
+
+
+
+
+ + ☆ Machine Learning Techniques for MRI Data Processing at Expanding Scale + + +
+ Imaging sites around the world generate growing amounts of medical scan data +with ever more versatile and affordable technology. Large-scale studies acquire +MRI for tens of thousands of participants, together with metadata ranging from +lifestyle questionnaires to biochemical assays, genetic analyses and more. +These large datasets encode substantial information about human health and hold +considerable potential for machine learning training and analysis. This chapter +examines ongoing large-scale studies and the challenge of distribution shifts +between them. Transfer learning for overcoming such shifts is discussed, +together with federated learning for safe access to distributed training data +securely held at multiple institutions. Finally, representation learning is +reviewed as a methodology for encoding embeddings that express abstract +relationships in multi-modal input formats. + +
+
+ comment: Book chapter pre-print +
+
+
+
+
+ + ☆ A Novel Approach to Chest X-ray Lung Segmentation Using U-net and + Modified Convolutional Block Attention Module + + +
+ Lung segmentation in chest X-ray images is of paramount importance as it +plays a crucial role in the diagnosis and treatment of various lung diseases. +This paper presents a novel approach for lung segmentation in chest X-ray +images by integrating U-net with attention mechanisms. The proposed method +enhances the U-net architecture by incorporating a Convolutional Block +Attention Module (CBAM), which unifies three distinct attention mechanisms: +channel attention, spatial attention, and pixel attention. The channel +attention mechanism enables the model to concentrate on the most informative +features across various channels. The spatial attention mechanism enhances the +model's precision in localization by focusing on significant spatial locations. +Lastly, the pixel attention mechanism empowers the model to focus on individual +pixels, further refining the model's focus and thereby improving the accuracy +of segmentation. The adoption of the proposed CBAM in conjunction with the +U-net architecture marks a significant advancement in the field of medical +imaging, with potential implications for improving diagnostic precision and +patient outcomes. The efficacy of this method is validated against contemporary +state-of-the-art techniques, showcasing its superiority in segmentation +performance. + +
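For reference, the two standard CBAM branches mentioned above can be sketched as follows in PyTorch; this is the conventional channel and spatial attention formulation, while the pixel-attention branch and its integration into U-net are specific to the paper and not reproduced here. The reduction ratio and kernel size are common defaults, assumed rather than taken from the paper.

```python
import torch
import torch.nn as nn

class ChannelAttention(nn.Module):
    """Standard CBAM channel attention: pool spatially, re-weight each channel."""
    def __init__(self, channels, reduction=16):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(channels, channels // reduction), nn.ReLU(),
            nn.Linear(channels // reduction, channels))

    def forward(self, x):
        b, c, _, _ = x.shape
        avg = self.mlp(x.mean(dim=(2, 3)))          # average-pooled descriptor
        mx = self.mlp(x.amax(dim=(2, 3)))           # max-pooled descriptor
        return x * torch.sigmoid(avg + mx).view(b, c, 1, 1)

class SpatialAttention(nn.Module):
    """Standard CBAM spatial attention over pooled channel maps."""
    def __init__(self, kernel_size=7):
        super().__init__()
        self.conv = nn.Conv2d(2, 1, kernel_size, padding=kernel_size // 2)

    def forward(self, x):
        pooled = torch.cat([x.mean(dim=1, keepdim=True),
                            x.amax(dim=1, keepdim=True)], dim=1)
        return x * torch.sigmoid(self.conv(pooled))

x = torch.randn(1, 64, 32, 32)
out = SpatialAttention()(ChannelAttention(64)(x))   # CBAM-style refinement
```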
+
+
+
+
+ + ☆ Towards Better Adversarial Purification via Adversarial Denoising + Diffusion Training + + +
+ Recently, diffusion-based purification (DBP) has emerged as a promising +approach for defending against adversarial attacks. However, previous studies +have used questionable methods to evaluate the robustness of DBP models, and their +explanations of DBP robustness also lack experimental support. We re-examine +DBP robustness using precise gradients, and discuss the impact of stochasticity +on DBP robustness. To better explain DBP robustness, we assess DBP robustness +under a novel attack setting, Deterministic White-box, and pinpoint +stochasticity as the main factor in DBP robustness. Our results suggest that +DBP models rely on stochasticity to evade the most effective attack direction, +rather than directly countering adversarial perturbations. To improve the +robustness of DBP models, we propose Adversarial Denoising Diffusion Training +(ADDT). This technique uses Classifier-Guided Perturbation Optimization (CGPO) +to generate adversarial perturbations through guidance from a pre-trained +classifier, and uses Rank-Based Gaussian Mapping (RBGM) to convert adversarial +perturbations into a normal Gaussian distribution. Empirical results show that +ADDT improves the robustness of DBP models. Further experiments confirm that +ADDT equips DBP models with the ability to directly counter adversarial +perturbations. + +
+
+
+
+
+ + ☆ Fast and Robust Normal Estimation for Sparse LiDAR Scans + + +
+ Light Detection and Ranging (LiDAR) technology has proven to be an important +part of many robotics systems. Surface normals estimated from LiDAR data are +commonly used for a variety of tasks in such systems. As most of today's +mechanical LiDAR sensors produce sparse data, estimating normals from a single +scan in a robust manner poses difficulties. + In this paper, we address the problem of estimating normals for sparse LiDAR +data while avoiding the typical issue of smoothing out the normals in high-curvature +areas. + Mechanical LiDARs rotate a set of rigidly mounted lasers. One firing of such +a set of lasers produces an array of points where each point's neighbor is +known due to the known firing pattern of the scanner. We use this knowledge to +connect these points to their neighbors and label them using the angles of the +lines connecting them. When estimating normals at these points, we only +consider points with the same label as neighbors. This allows us to avoid +smoothing normals across high-curvature areas. + We evaluate our approach on various data, both self-recorded and publicly +available, acquired using various sparse LiDAR sensors. We show that using our +method for normal estimation leads to normals that are more robust in areas +with high curvature, which leads to maps of higher quality. We also show that +our method only incurs a constant-factor runtime overhead with respect to a +lightweight baseline normal estimation procedure and is therefore suited for +operation in computationally demanding environments. + +
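A rough NumPy sketch of the labelling idea described above, for a single firing of ordered points: neighbouring points are connected, the connecting-line angles are compared, and normals are estimated only within same-label segments. The planar angle computation and the angle threshold are simplifying assumptions for illustration, not the paper's exact procedure.

```python
import numpy as np

def label_by_line_angle(points, angle_thresh_deg=20.0):
    """Group consecutive points of one laser firing by the direction of the
    line connecting each point to its known neighbour; a new label starts
    whenever the direction changes by more than the (hypothetical) threshold."""
    diffs = np.diff(points, axis=0)
    angles = np.arctan2(diffs[:, 1], diffs[:, 0])            # planar line angles
    jumps = np.abs(np.diff(angles)) > np.deg2rad(angle_thresh_deg)
    labels = np.concatenate([[0], np.cumsum(jumps)])
    return np.concatenate([labels, [labels[-1]]])            # one label per point

def segment_normals(points, labels):
    """Estimate a normal per segment via PCA, using only same-label neighbours,
    so normals are not smoothed across high-curvature boundaries."""
    normals = np.zeros_like(points)
    for lab in np.unique(labels):
        seg = points[labels == lab]
        if len(seg) < 3:
            continue
        _, _, vt = np.linalg.svd(seg - seg.mean(axis=0))
        normals[labels == lab] = vt[-1]                      # smallest-variance direction
    return normals

pts = np.random.rand(100, 3)
nrm = segment_normals(pts, label_by_line_angle(pts))
```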
+
+
+
+
+ + ☆ RESFM: Robust Equivariant Multiview Structure from Motion + + +
+ Multiview Structure from Motion is a fundamental and challenging computer +vision problem. A recent deep-based approach was proposed utilizing matrix +equivariant architectures for the simultaneous recovery of camera pose and 3D +scene structure from large image collections. This work however made the +unrealistic assumption that the point tracks given as input are clean of +outliers. Here we propose an architecture suited to dealing with outliers by +adding an inlier/outlier classifying module that respects the model +equivariance and by adding a robust bundle adjustment step. Experiments +demonstrate that our method can be successfully applied in realistic settings +that include large image collections and point tracks extracted with common +heuristics and include many outliers. + +
+
+
+
+
+ + ☆ Co-designing a Sub-millisecond Latency Event-based Eye Tracking System + with Submanifold Sparse CNN CVPR 2024 + + +
+ Eye-tracking technology is integral to numerous consumer electronics +applications, particularly in the realm of virtual and augmented reality +(VR/AR). These applications demand solutions that excel in three crucial +aspects: low-latency, low-power consumption, and precision. Yet, achieving +optimal performance across all these fronts presents a formidable challenge, +necessitating a balance between sophisticated algorithms and efficient backend +hardware implementations. In this study, we tackle this challenge through a +synergistic software/hardware co-design of the system with an event camera. +Leveraging the inherent sparsity of event-based input data, we integrate a +novel sparse FPGA dataflow accelerator customized for submanifold sparse +convolution neural networks (SCNN). The SCNN implemented on the accelerator can +efficiently extract the embedding feature vector from each representation of +event slices by only processing the non-zero activations. Subsequently, these +vectors undergo further processing by a gated recurrent unit (GRU) and a fully +connected layer on the host CPU to generate the eye centers. Deployment and +evaluation of our system reveal outstanding performance metrics. On the +Event-based Eye-Tracking-AIS2024 dataset, our system achieves 81% p5 accuracy, +99.5% p10 accuracy, and 3.71 Mean Euclidean Distance with 0.7 ms latency while +only consuming 2.29 mJ per inference. Notably, our solution opens up +opportunities for future eye-tracking systems. Code is available at +https://github.com/CASR-HKU/ESDA/tree/eye_tracking. + +
+
+ comment: Accepted to CVPR 2024 workshop, AIS: Vision, Graphics, and AI for + Streaming +
+
+
+
+
+ + ☆ CLIP-GS: CLIP-Informed Gaussian Splatting for Real-time and + View-consistent 3D Semantic Understanding + + +
+ The recent 3D Gaussian Splatting (GS) exhibits high-quality and real-time +synthesis of novel views in 3D scenes. Currently, it primarily focuses on +geometry and appearance modeling, while lacking the semantic understanding of +scenes. To bridge this gap, we present CLIP-GS, which integrates semantics from +Contrastive Language-Image Pre-Training (CLIP) into Gaussian Splatting to +efficiently comprehend 3D environments without annotated semantic data. Specifically, rather than straightforwardly learning and rendering high-dimensional +semantic features of 3D Gaussians, which significantly diminishes the +efficiency, we propose a Semantic Attribute Compactness (SAC) approach. SAC +exploits the inherent unified semantics within objects to learn compact yet +effective semantic representations of 3D Gaussians, enabling highly efficient +rendering (>100 FPS). Additionally, to address the semantic ambiguity caused +by utilizing view-inconsistent 2D CLIP semantics to supervise Gaussians, we +introduce a 3D Coherent Self-training (3DCS) strategy, resorting to the +multi-view consistency originating from the 3D model. 3DCS imposes cross-view +semantic consistency constraints by leveraging refined, self-predicted +pseudo-labels derived from the trained 3D Gaussian model, thereby enhancing +precise and view-consistent segmentation results. Extensive experiments +demonstrate that our method remarkably outperforms existing state-of-the-art +approaches, achieving improvements of 17.29% and 20.81% in the mIoU metric on the +Replica and ScanNet datasets, respectively, while maintaining real-time +rendering speed. Furthermore, our approach exhibits superior performance even +with sparse input data, verifying the robustness of our method. + +
+
+ comment: https://github.com/gbliao/CLIP-GS +
+
+
+
+
+ + ☆ NTIRE 2024 Challenge on Low Light Image Enhancement: Methods and Results + + +
+ This paper reviews the NTIRE 2024 low light image enhancement challenge, +highlighting the proposed solutions and results. The aim of this challenge is +to discover an effective network design or solution capable of generating +brighter, clearer, and visually appealing results when dealing with a variety +of conditions, including ultra-high resolution (4K and beyond), non-uniform +illumination, backlighting, extreme darkness, and night scenes. A notable total +of 428 participants registered for the challenge, with 22 teams ultimately +making valid submissions. This paper meticulously evaluates the +state-of-the-art advancements in enhancing low-light images, reflecting the +significant progress and creativity in this field. + +
+
+ comment: NTIRE 2024 Challenge Report +
+
+
+
+
+ + ☆ From Modalities to Styles: Rethinking the Domain Gap in Heterogeneous + Face Recognition + + +
+ Heterogeneous Face Recognition (HFR) focuses on matching faces from different +domains, for instance, thermal to visible images, making Face Recognition (FR) +systems more versatile for challenging scenarios. However, the domain gap +between these domains and the limited large-scale datasets in the target HFR +modalities make it challenging to develop robust HFR models from scratch. In +our work, we view different modalities as distinct styles and propose a method +to modulate feature maps of the target modality to address the domain gap. We +present a new Conditional Adaptive Instance Modulation (CAIM) module that +seamlessly fits into existing FR networks, turning them into HFR-ready systems. +The CAIM block modulates intermediate feature maps, efficiently adapting to the +style of the source modality and bridging the domain gap. Our method enables +end-to-end training using a small set of paired samples. We extensively +evaluate the proposed approach on various challenging HFR benchmarks, showing +that it outperforms state-of-the-art methods. The source code and protocols for +reproducing the findings will be made publicly available. + +
+
+ comment: Accepted for publication in IEEE TBIOM +
+
+
+
+
+ + ☆ UrbanCross: Enhancing Satellite Image-Text Retrieval with Cross-Domain + Adaptation + + +
+ Urbanization challenges underscore the necessity for effective satellite +image-text retrieval methods to swiftly access specific information enriched +with geographic semantics for urban applications. However, existing methods +often overlook significant domain gaps across diverse urban landscapes, +primarily focusing on enhancing retrieval performance within single domains. To +tackle this issue, we present UrbanCross, a new framework for cross-domain +satellite image-text retrieval. UrbanCross leverages a high-quality, +cross-domain dataset enriched with extensive geo-tags from three countries to +highlight domain diversity. It employs the Large Multimodal Model (LMM) for +textual refinement and the Segment Anything Model (SAM) for visual +augmentation, achieving a fine-grained alignment of images, segments and texts, +yielding a 10% improvement in retrieval performance. Additionally, UrbanCross +incorporates an adaptive curriculum-based source sampler and a weighted +adversarial cross-domain fine-tuning module, progressively enhancing +adaptability across various domains. Extensive experiments confirm UrbanCross's +superior efficiency in retrieval and adaptation to new urban environments, +demonstrating an average performance increase of 15% over its version without +domain adaptation mechanisms, effectively bridging the domain gap. + +
+
+
+
+
+ + ☆ MultiBooth: Towards Generating All Your Concepts in an Image from Text + + +
+ This paper introduces MultiBooth, a novel and efficient technique for +multi-concept customization in image generation from text. Despite the +significant advancements in customized generation methods, particularly with +the success of diffusion models, existing methods often struggle with +multi-concept scenarios due to low concept fidelity and high inference cost. +MultiBooth addresses these issues by dividing the multi-concept generation +process into two phases: a single-concept learning phase and a multi-concept +integration phase. During the single-concept learning phase, we employ a +multi-modal image encoder and an efficient concept encoding technique to learn +a concise and discriminative representation for each concept. In the +multi-concept integration phase, we use bounding boxes to define the generation +area for each concept within the cross-attention map. This method enables the +creation of individual concepts within their specified regions, thereby +facilitating the formation of multi-concept images. This strategy not only +improves concept fidelity but also reduces additional inference cost. +MultiBooth surpasses various baselines in both qualitative and quantitative +evaluations, showcasing its superior performance and computational efficiency. +Project Page: https://multibooth.github.io/ + +
+
+ comment: Project Page: https://multibooth.github.io/ . Github Page: + https://github.com/chenyangzhu1/MultiBooth +
+
+
+
+
+ + ☆ Detecting and Mitigating Hallucination in Large Vision Language Models + via Fine-Grained AI Feedback + + +
+ The rapidly developing Large Vision Language Models (LVLMs) have shown +notable capabilities on a range of multi-modal tasks, but still face the +hallucination phenomenon, where the generated texts do not align with the given +contexts, significantly restricting the usage of LVLMs. Most previous work +detects and mitigates hallucination at the coarse-grained level or requires +expensive annotation (e.g., labeling by proprietary models or human experts). +To address these issues, we propose detecting and mitigating hallucinations in +LVLMs via fine-grained AI feedback. The basic idea is that we generate a +small-scale sentence-level hallucination annotation dataset using proprietary +models, on which we train a hallucination detection model that performs +sentence-level hallucination detection, covering the primary hallucination types +(i.e., object, attribute, and relationship). Then, we propose a +detect-then-rewrite pipeline to automatically construct a preference dataset for +training a hallucination-mitigating model. Furthermore, we propose +differentiating the severity of hallucinations and introduce Hallucination +Severity-Aware Direct Preference Optimization (HSA-DPO), which mitigates +hallucination in LVLMs by incorporating the severity of hallucinations into +preference learning. Extensive experiments demonstrate the effectiveness of our +method. + +
+
+
+
+
+ + ☆ Generalizable Neural Human Renderer + + +
+ While recent advancements in animatable human rendering have achieved +remarkable results, they require test-time optimization for each subject, which +can be a significant limitation for real-world applications. To address this, +we tackle the challenging task of learning a Generalizable Neural Human +Renderer (GNH), a novel method for rendering animatable humans from monocular +video without any test-time optimization. Our core method focuses on +transferring appearance information from the input video to the output image +plane by utilizing explicit body priors and multi-view geometry. To render the +subject in the intended pose, we utilize a straightforward CNN-based image +renderer, foregoing the more common ray-sampling or rasterizing-based rendering +modules. Our GNH achieves remarkably generalizable, photorealistic rendering +of unseen subjects through a three-stage process. We quantitatively and +qualitatively demonstrate that GNH significantly surpasses current +state-of-the-art methods, notably achieving a 31.3% improvement in LPIPS. + +
+
+
+
+
+ + ☆ BCFPL: Binary classification ConvNet based Fast Parking space + recognition with Low resolution image + + +
+ Automobiles play an important role in economic activity, especially in +metropolitan areas, and the demand for quickly finding available parking spaces +has become a major concern for drivers. Meanwhile, public awareness of privacy +is growing, yet image-based parking space recognition methods pay little +attention to privacy protection. In this paper, we propose BCFPL, a binary +convolutional neural network with a lightweight design that can be trained on +low-resolution parking space images and still offer reasonable recognition +results. The parking space images were collected from various complex +environments, including different weather, occlusion conditions, and various +camera angles. We conducted training and testing across different datasets and +partial subsets. The experimental results show that the accuracy of BCFPL does +not decrease compared with using original-resolution images directly, and it +reaches the average level of existing mainstream methods. BCFPL also has low +hardware requirements and fast recognition speed while meeting privacy +requirements, so it has application potential in intelligent city construction +and the autonomous driving field. + +
+
+
+
+
+ + ☆ Face2Face: Label-driven Facial Retouching Restoration + + +
+ With the popularity of social media platforms such as Instagram and TikTok, +and the widespread availability and convenience of retouching tools, an +increasing number of individuals are utilizing these tools to beautify their +facial photographs. This poses challenges for fields that place high demands on +the authenticity of photographs, such as identity verification and social +media. By altering facial images, users can easily create deceptive images, +leading to the dissemination of false information. This may pose challenges to +the reliability of identity verification systems and social media, and even +lead to online fraud. To address this issue, some work has proposed makeup +removal methods, but they still lack the ability to restore images involving +geometric deformations caused by retouching. To tackle the problem of facial +retouching restoration, we propose a framework, dubbed Face2Face, which +consists of three components: a facial retouching detector, an image +restoration model named FaceR, and a color correction module called +Hierarchical Adaptive Instance Normalization (H-AdaIN). Firstly, the facial +retouching detector predicts a retouching label containing three integers, +indicating the retouching methods and their corresponding degrees. Then FaceR +restores the retouched image based on the predicted retouching label. Finally, +H-AdaIN is applied to address the issue of color shift arising from diffusion +models. Extensive experiments demonstrate the effectiveness of our framework +and each module. + +
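As background for the colour-correction module above, plain adaptive instance normalization can be sketched as below; the hierarchical variant (H-AdaIN) proposed in the paper builds on this idea and is not reproduced here, and the tensor shapes are illustrative.

```python
import torch

def adain(content, reference, eps=1e-5):
    """Plain adaptive instance normalization: re-normalise the content image's
    per-channel statistics to match the reference, a common way to correct
    colour shift. The hierarchical H-AdaIN in the abstract extends this idea."""
    c_mean = content.mean(dim=(2, 3), keepdim=True)
    c_std = content.std(dim=(2, 3), keepdim=True) + eps
    r_mean = reference.mean(dim=(2, 3), keepdim=True)
    r_std = reference.std(dim=(2, 3), keepdim=True) + eps
    return (content - c_mean) / c_std * r_std + r_mean

restored = torch.rand(1, 3, 256, 256)   # output of the restoration model
original = torch.rand(1, 3, 256, 256)   # colour reference (e.g. the input photo)
corrected = adain(restored, original)
```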
+
+
+
+
+ + ☆ FLDM-VTON: Faithful Latent Diffusion Model for Virtual Try-on IJCAI 2024 + + +
+ Despite their impressive generative performance, latent diffusion model-based +virtual try-on (VTON) methods lack faithfulness to crucial details of the +clothes, such as style, pattern, and text. To alleviate these issues caused by +the diffusion stochastic nature and latent supervision, we propose a novel +Faithful Latent Diffusion Model for VTON, termed FLDM-VTON. FLDM-VTON improves +the conventional latent diffusion process in three major aspects. First, we +propose incorporating warped clothes as both the starting point and local +condition, supplying the model with faithful clothes priors. Second, we +introduce a novel clothes flattening network to constrain generated try-on +images, providing clothes-consistent faithful supervision. Third, we devise a +clothes-posterior sampling for faithful inference, further enhancing the model +performance over conventional clothes-agnostic Gaussian sampling. Extensive +experimental results on the benchmark VITON-HD and Dress Code datasets +demonstrate that our FLDM-VTON outperforms state-of-the-art baselines and is +able to generate photo-realistic try-on images with faithful clothing details. + +
+
+ comment: Accepted by IJCAI 2024 +
+
+
+
+
+ + ☆ Text in the Dark: Extremely Low-Light Text Image Enhancement + + +
+ Extremely low-light text images are common in natural scenes, making scene +text detection and recognition challenging. One solution is to enhance these +images using low-light image enhancement methods before text extraction. +However, previous methods often do not try to particularly address the +significance of low-level features, which are crucial for optimal performance +on downstream scene text tasks. Further research is also hindered by the lack +of extremely low-light text datasets. To address these limitations, we propose +a novel encoder-decoder framework with an edge-aware attention module to focus +on scene text regions during enhancement. Our proposed method uses novel text +detection and edge reconstruction losses to emphasize low-level scene text +features, leading to successful text extraction. Additionally, we present a +Supervised Deep Curve Estimation (Supervised-DCE) model to synthesize extremely +low-light images based on publicly available scene text datasets such as +ICDAR15 (IC15). We also labeled texts in the extremely low-light See In the +Dark (SID) and ordinary LOw-Light (LOL) datasets to allow for objective +assessment of extremely low-light image enhancement through scene text tasks. +Extensive experiments show that our model outperforms state-of-the-art methods +in terms of both image quality and scene text metrics on the widely-used LOL, +SID, and synthetic IC15 datasets. Code and dataset will be released publicly at +https://github.com/chunchet-ng/Text-in-the-Dark. + +
+
+ comment: The first two authors contributed equally to this work +
+
+
+
+
+ + ☆ CRNet: A Detail-Preserving Network for Unified Image Restoration and + Enhancement Task CVPR2024 + + +
+ In real-world scenarios, images captured often suffer from blurring, noise, +and other forms of image degradation, and due to sensor limitations, people +usually can only obtain low dynamic range images. To achieve high-quality +images, researchers have attempted various image restoration and enhancement +operations on photographs, including denoising, deblurring, and high dynamic +range imaging. However, merely performing a single type of image enhancement +still cannot yield satisfactory images. In this paper, to deal with the +challenge above, we propose the Composite Refinement Network (CRNet) to address +this issue using multiple exposure images. By fully integrating +information-rich multiple exposure inputs, CRNet can perform unified image +restoration and enhancement. To improve the quality of image details, CRNet +explicitly separates and strengthens high and low-frequency information through +pooling layers, using specially designed Multi-Branch Blocks for effective +fusion of these frequencies. To increase the receptive field and fully +integrate input features, CRNet employs the High-Frequency Enhancement Module, +which includes large kernel convolutions and an inverted bottleneck ConvFFN. +Our model secured third place in the first track of the Bracketing Image +Restoration and Enhancement Challenge, surpassing previous SOTA models in both +testing metrics and visual quality. + +
+
+ comment: This paper is accepted by CVPR2024 Workshop, Code: + https://github.com/CalvinYang0/CRNet +
+
+
+
+
+ + ☆ Hierarchical localization with panoramic views and triplet loss + functions + + +
+ The main objective of this paper is to address the mobile robot localization +problem with Triplet Convolutional Neural Networks and test their robustness +against changes in lighting conditions. We have used omnidirectional images +from real indoor environments captured in dynamic conditions that have been +converted to panoramic format. Two approaches are proposed to address +localization by means of triplet neural networks. First, hierarchical +localization, which consists of estimating the robot position in two stages: a +coarse localization, which involves a room retrieval task, and a fine +localization, addressed by means of image retrieval in the previously +selected room. Second, global localization, which consists of estimating the +position of the robot inside the entire map in a single step. In addition, an +exhaustive study of the influence of the loss function on the network learning process +has been made. The experimental section shows that triplet neural networks are +an efficient and robust tool to address the localization of mobile robots in +indoor environments, considering real operation conditions. + +
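A minimal PyTorch sketch of the triplet training signal used for a coarse (room retrieval) stage of this kind, assuming a trivial embedding network, a margin of 1.0, and panoramic tensors of arbitrary size; none of these choices are taken from the paper.

```python
import torch
import torch.nn as nn

# Triplet margin loss over embeddings of panoramic images: for room retrieval
# the positive is a view from the same room as the anchor and the negative a
# view from a different room. Margin, embedding size, and image resolution are
# illustrative values, not the paper's configuration.
embed = nn.Sequential(nn.Flatten(), nn.Linear(3 * 64 * 256, 128))
criterion = nn.TripletMarginLoss(margin=1.0)

anchor = torch.randn(8, 3, 64, 256)     # panoramic query images
positive = torch.randn(8, 3, 64, 256)   # captured in the same room
negative = torch.randn(8, 3, 64, 256)   # captured in a different room
loss = criterion(embed(anchor), embed(positive), embed(negative))
loss.backward()
```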
+
+ comment: This work has been submitted to the Artificial Intelligence Journal + (Ed. Elsevier) for possible publication. Copyright may be transferred without + notice, after which this version may no longer be accessible +
+
+
+
+
+ + ☆ CKD: Contrastive Knowledge Distillation from A Sample-wise Perspective + + +
+ In this paper, we present a simple yet effective contrastive knowledge +distillation approach, which can be formulated as a sample-wise alignment +problem with intra- and inter-sample constraints. Unlike traditional knowledge +distillation methods that concentrate on maximizing feature similarities or +preserving class-wise semantic correlations between teacher and student +features, our method attempts to recover the "dark knowledge" by aligning +sample-wise teacher and student logits. Specifically, our method first +minimizes logit differences within the same sample by considering their +numerical values, thus preserving intra-sample similarities. Next, we bridge +semantic disparities by leveraging dissimilarities across different samples. +Note that constraints on intra-sample similarities and inter-sample +dissimilarities can be efficiently and effectively reformulated into a +contrastive learning framework with newly designed positive and negative pairs. +The positive pair consists of the teacher's and student's logits derived from +an identical sample, while the negative pairs are formed by using logits from +different samples. With this formulation, our method benefits from the +simplicity and efficiency of contrastive learning through the optimization of +InfoNCE, yielding a run-time complexity that is far less than $O(n^2)$, where +$n$ represents the total number of training samples. Furthermore, our method +can eliminate the need for hyperparameter tuning, particularly related to +temperature parameters and large batch sizes. We conduct comprehensive +experiments on three datasets including CIFAR-100, ImageNet-1K, and MS COCO. +Experimental results clearly confirm the effectiveness of the proposed method +on both image classification and object detection tasks. Our source codes will +be publicly available at https://github.com/wencheng-zhu/CKD. + +
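A minimal sketch of the sample-wise contrastive formulation described above, assuming PyTorch, cosine similarity, and an illustrative temperature: the teacher and student logits of the same sample form the positive pair, other samples' teacher logits act as negatives, and the objective is optimized with InfoNCE.

```python
import torch
import torch.nn.functional as F

def sample_wise_infonce(student_logits, teacher_logits, tau=0.1):
    """Contrastive distillation over logits as described in the abstract:
    each student logit vector should match its own teacher logit vector
    (positive) and differ from other samples' teacher logits (negatives).
    The temperature and cosine similarity are illustrative choices."""
    s = F.normalize(student_logits, dim=1)
    t = F.normalize(teacher_logits, dim=1)
    sim = s @ t.t() / tau                      # (n, n) similarity matrix
    targets = torch.arange(s.size(0), device=s.device)
    return F.cross_entropy(sim, targets)       # InfoNCE with diagonal positives

student = torch.randn(64, 100)   # per-sample class logits from the student
teacher = torch.randn(64, 100)   # per-sample class logits from the teacher
loss = sample_wise_infonce(student, teacher)
```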
+
+
+
+
+ + ☆ DynaMMo: Dynamic Model Merging for Efficient Class Incremental Learning + for Medical Images + + +
+ Continual learning, the ability to acquire knowledge from new data while +retaining previously learned information, is a fundamental challenge in machine +learning. Various approaches, including memory replay, knowledge distillation, +model regularization, and dynamic network expansion, have been proposed to +address this issue. Thus far, dynamic network expansion methods have achieved +state-of-the-art performance at the cost of incurring significant computational +overhead. This is due to the need for additional model buffers, which makes it +less feasible in resource-constrained settings, particularly in the medical +domain. To overcome this challenge, we propose Dynamic Model Merging, DynaMMo, +a method that merges multiple networks at different stages of model training to +achieve better computational efficiency. Specifically, we employ lightweight +learnable modules for each task and combine them into a unified model to +minimize computational overhead. DynaMMo achieves this without compromising +performance, offering a cost-effective solution for continual learning in +medical applications. We evaluate DynaMMo on three publicly available datasets, +demonstrating its effectiveness compared to existing approaches. DynaMMo offers +around 10-fold reduction in GFLOPS with a small drop of 2.76 in average +accuracy when compared to state-of-the-art dynamic-based approaches. The code +implementation of this work will be available upon the acceptance of this work +at https://github.com/BioMedIA-MBZUAI/DynaMMo. + +
+
+
+
+
+ + ☆ Research on Robot Path Planning Based on Reinforcement Learning + + +
+ This project conducted research on robot path planning based on Visual SLAM. +The main work is as follows: (1) Construction of a Visual SLAM system. The +basic architecture of Visual SLAM was studied, and a Visual SLAM system capable +of dense point cloud mapping was developed based on the ORB-SLAM3 system. (2) A +map suitable for two-dimensional path planning is obtained through map +conversion. This part converts the dense point cloud map produced by the Visual +SLAM system into an octomap and then projects it onto a grid map, turning a +dense point cloud map containing a large amount of redundant information into +an extremely lightweight grid map suitable for path planning. (3) Research on +path planning algorithms based on reinforcement learning. Experimental +comparisons between the Q-learning, DQN, and SARSA algorithms found that DQN +converges fastest and performs best in high-dimensional complex environments. +The Visual SLAM system was experimentally verified in a simulation environment; +results on both an open-source dataset and a self-made dataset demonstrate the +feasibility and effectiveness of the designed system. Comparative experiments +on the three reinforcement learning algorithms under the same experimental +conditions identified the best-performing algorithm for those conditions. + +
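For illustration, a tabular Q-learning baseline on a 2D grid map of the kind produced by the map conversion step might look as follows; the grid, rewards, and hyperparameters are made up for the sketch, and DQN would replace the Q table with a neural network.

```python
import numpy as np

# Tabular Q-learning on a 2D occupancy grid (0 = free, 1 = obstacle).
# Grid size, rewards, and hyperparameters are illustrative, not the report's.
grid = np.zeros((10, 10), dtype=int)
grid[4, 2:8] = 1                                   # a wall
actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]       # up, down, left, right
Q = np.zeros((10, 10, 4))
goal, alpha, gamma, eps = (9, 9), 0.1, 0.95, 0.1

def step(state, a):
    r, c = state[0] + actions[a][0], state[1] + actions[a][1]
    if not (0 <= r < 10 and 0 <= c < 10) or grid[r, c] == 1:
        return state, -1.0, False                  # blocked move
    return (r, c), (10.0 if (r, c) == goal else -0.1), (r, c) == goal

for _ in range(2000):                              # training episodes
    s = (0, 0)
    for _ in range(200):
        a = np.random.randint(4) if np.random.rand() < eps else int(np.argmax(Q[s]))
        s2, r, done = step(s, a)
        Q[s][a] += alpha * (r + gamma * np.max(Q[s2]) - Q[s][a])  # Bellman update
        s = s2
        if done:
            break
```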
+
+ comment: My undergrad final year project report, 44 pages and 15 figures +
+
+
+
+
+ + ☆ Noise contrastive estimation with soft targets for conditional models + + +
+ Soft targets combined with the cross-entropy loss have been shown to improve +the generalization performance of deep neural networks on supervised classification +tasks. The standard cross-entropy loss, however, assumes data to be categorically +distributed, which may often not be the case in practice. In contrast, InfoNCE +does not rely on such an explicit assumption but instead implicitly estimates +the true conditional through negative sampling. Unfortunately, it cannot be +combined with soft targets in its standard formulation, hindering its use in +combination with sophisticated training strategies. In this paper, we address +this limitation by proposing a principled loss function that is compatible with +probabilistic targets. Our new soft target InfoNCE loss is conceptually simple, +efficient to compute, and can be derived within the framework of noise +contrastive estimation. Using a toy example, we demonstrate shortcomings of the +categorical distribution assumption of cross-entropy, and discuss implications +of sampling from soft distributions. We observe that soft target InfoNCE +performs on par with strong soft target cross-entropy baselines and outperforms +hard target NLL and InfoNCE losses on popular benchmarks, including ImageNet. +Finally, we provide a simple implementation of our loss, geared towards +supervised classification and fully compatible with deep classification models +trained with cross-entropy. + +
+
+
+
+
+ + ☆ SHE-Net: Syntax-Hierarchy-Enhanced Text-Video Retrieval + + +
+ The user base of short video apps has experienced unprecedented growth in +recent years, resulting in a significant demand for video content analysis. In +particular, text-video retrieval, which aims to find the top matching videos +given text descriptions from a vast video corpus, is an essential function, the +primary challenge of which is to bridge the modality gap. Nevertheless, most +existing approaches treat texts merely as discrete tokens and neglect their +syntax structures. Moreover, the abundant spatial and temporal clues in videos +are often underutilized due to the lack of interaction with text. To address +these issues, we argue that using texts as guidance to focus on relevant +temporal frames and spatial regions within videos is beneficial. In this paper, +we propose a novel Syntax-Hierarchy-Enhanced text-video retrieval method +(SHE-Net) that exploits the inherent semantic and syntax hierarchy of texts to +bridge the modality gap from two perspectives. First, to facilitate a more +fine-grained integration of visual content, we employ the text syntax +hierarchy, which reveals the grammatical structure of text descriptions, to +guide the visual representations. Second, to further enhance the multi-modal +interaction and alignment, we also utilize the syntax hierarchy to guide the +similarity calculation. We evaluated our method on four public text-video +retrieval datasets of MSR-VTT, MSVD, DiDeMo, and ActivityNet. The experimental +results and ablation studies confirm the advantages of our proposed method. + +
+
+
+
+
+ + ☆ Multi-view Disentanglement for Reinforcement Learning with Multiple + Cameras + + +
+ The performance of image-based Reinforcement Learning (RL) agents can vary +depending on the position of the camera used to capture the images. Training on +multiple cameras simultaneously, including a first-person egocentric camera, +can leverage information from different camera perspectives to improve the +performance of RL. However, hardware constraints may limit the availability of +multiple cameras in real-world deployment. Additionally, cameras may become +damaged in the real world, preventing access to all cameras that were used +during training. To overcome these hardware constraints, we propose Multi-View +Disentanglement (MVD), which uses multiple cameras to learn a policy that +achieves zero-shot generalisation to any single camera from the training set. +Our approach is a self-supervised auxiliary task for RL that learns a +disentangled representation from multiple cameras, with a shared representation +that is aligned across all cameras to allow generalisation to a single camera, +and a private representation that is camera-specific. We show experimentally +that an RL agent trained on a single third-person camera is unable to learn an +optimal policy in many control tasks, but our approach, benefiting from +multiple cameras during training, is able to solve the task using only the same +single third-person camera. + +
+
+
+
+
+ + ☆ GatedLexiconNet: A Comprehensive End-to-End Handwritten Paragraph Text + Recognition System + + +
+ The Handwritten Text Recognition problem has been a challenge for researchers +for the last few decades, especially in the domain of computer vision, a +subdomain of pattern recognition. Variability of texts amongst writers, +cursiveness, and different font styles of handwritten texts, together with the degradation of +historical text images, make it a challenging problem. Recognizing scanned +document images in neural network-based systems typically involves a two-step +approach: segmentation and recognition. However, this method has several +drawbacks. These shortcomings encompass challenges in identifying text regions, +analyzing layout diversity within pages, and establishing accurate ground truth +segmentation. Consequently, these processes are prone to errors, leading to +bottlenecks in achieving high recognition accuracies. Thus, in this study, we +present an end-to-end paragraph recognition system that incorporates internal +line segmentation and an encoder based on gated convolutional layers. The gating is a +mechanism that controls the flow of information and allows adaptive +selection of the most relevant features in handwritten text recognition models. +The attention module plays an important role in performing internal line +segmentation, allowing the page to be processed line-by-line. During the +decoding step, we have integrated a connectionist temporal classification-based +word beam search decoder as a post-processing step. In this work, we have +extended the existing LexiconNet by carefully applying and utilizing gated +convolutional layers in the existing deep neural network. Our results at line +and page levels also favour our new GatedLexiconNet. This study reported +character error rates of 2.27% on IAM, 0.9% on RIMES, and 2.13% on READ-2016, and +word error rates of 5.73% on IAM, 2.76% on RIMES, and 6.52% on READ-2016 +datasets. + +
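A gated convolutional layer of the kind referred to above can be sketched in PyTorch as below; the tanh/sigmoid pairing and the layer sizes are common conventions assumed for illustration, not the GatedLexiconNet configuration.

```python
import torch
import torch.nn as nn

class GatedConv2d(nn.Module):
    """A gated convolution: one branch computes features, the other a sigmoid
    gate that adaptively selects which features flow forward. Kernel size and
    channel counts are illustrative, not the GatedLexiconNet configuration."""
    def __init__(self, in_ch, out_ch, kernel_size=3):
        super().__init__()
        pad = kernel_size // 2
        self.feature = nn.Conv2d(in_ch, out_ch, kernel_size, padding=pad)
        self.gate = nn.Conv2d(in_ch, out_ch, kernel_size, padding=pad)

    def forward(self, x):
        return torch.tanh(self.feature(x)) * torch.sigmoid(self.gate(x))

x = torch.randn(1, 1, 64, 256)     # a greyscale text-line image
y = GatedConv2d(1, 32)(x)
```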
+
+
+
+
+ + ☆ RingID: Rethinking Tree-Ring Watermarking for Enhanced Multi-Key + Identification + + +
+ We revisit Tree-Ring Watermarking, a recent diffusion model watermarking +method that demonstrates great robustness to various attacks. We conduct an +in-depth study on it and reveal that the distribution shift unintentionally +introduced by the watermarking process, apart from watermark pattern matching, +contributes to its exceptional robustness. Our investigation further exposes +inherent flaws in its original design, particularly in its ability to identify +multiple distinct keys, where distribution shift offers no assistance. Based on +these findings and analysis, we present RingID for enhanced multi-key +identification. It consists of a novel multi-channel heterogeneous watermarking +approach designed to seamlessly amalgamate distinctive advantages from diverse +watermarks. Coupled with a series of suggested enhancements, RingID exhibits +substantial advancements in multi-key identification. + +
+
+ comment: 25 pages, 8 figures +
+
+
+
+
+ + ☆ HashPoint: Accelerated Point Searching and Sampling for Neural Rendering CVPR2024 + + +
+ In this paper, we address the problem of efficient point searching and +sampling for volume neural rendering. Within this realm, two typical approaches +are employed: rasterization and ray tracing. The rasterization-based methods +enable real-time rendering at the cost of increased memory and lower fidelity. +In contrast, the ray-tracing-based methods yield superior quality but demand +longer rendering time. We solve this problem by our HashPoint method combining +these two strategies, leveraging rasterization for efficient point searching +and sampling, and ray marching for rendering. Our method optimizes point +searching by rasterizing points within the camera's view, organizing them in a +hash table, and facilitating rapid searches. Notably, we accelerate the +rendering process by adaptive sampling on the primary surface encountered by +the ray. Our approach yields substantial speed-up for a range of +state-of-the-art ray-tracing-based methods, maintaining equivalent or superior +accuracy across synthetic and real test datasets. The code will be available at +https://jiahao-ma.github.io/hashpoint/. + +
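A toy sketch of the rasterize-then-hash lookup described above: projected points are bucketed into screen-space cells keyed in a hash table, so the candidate points for a ray can be fetched from its pixel's cell in constant time. The cell size and the plain-Python dictionary are illustrative simplifications, not the paper's implementation.

```python
import numpy as np
from collections import defaultdict

def build_point_hash(points_2d, cell=8):
    """Rasterise projected points into screen-space cells and key a hash table
    by cell index, so candidate points near a pixel can be fetched in O(1).
    The cell size is an illustrative parameter."""
    table = defaultdict(list)
    keys = np.floor(points_2d / cell).astype(int)
    for idx, (kx, ky) in enumerate(keys.tolist()):
        table[(kx, ky)].append(idx)
    return table

def query(table, pixel, cell=8):
    """Return indices of points whose projection falls in the pixel's cell."""
    kx, ky = int(pixel[0] // cell), int(pixel[1] // cell)
    return table[(kx, ky)]

pts = np.random.rand(10000, 2) * 512       # points projected into a 512x512 view
tbl = build_point_hash(pts)
candidates = query(tbl, (100.5, 240.0))    # points to sample along this ray
```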
+
+ comment: CVPR2024 Highlight +
+
+
+
+
+ + ☆ CloudFort: Enhancing Robustness of 3D Point Cloud Classification Against + Backdoor Attacks via Spatial Partitioning and Ensemble Prediction + + +
+ The increasing adoption of 3D point cloud data in various applications, such +as autonomous vehicles, robotics, and virtual reality, has brought about +significant advancements in object recognition and scene understanding. +However, this progress is accompanied by new security challenges, particularly +in the form of backdoor attacks. These attacks involve inserting malicious +information into the training data of machine learning models, potentially +compromising the model's behavior. In this paper, we propose CloudFort, a novel +defense mechanism designed to enhance the robustness of 3D point cloud +classifiers against backdoor attacks. CloudFort leverages spatial partitioning +and ensemble prediction techniques to effectively mitigate the impact of +backdoor triggers while preserving the model's performance on clean data. We +evaluate the effectiveness of CloudFort through extensive experiments, +demonstrating its strong resilience against the Point Cloud Backdoor Attack +(PCBA). Our results show that CloudFort significantly enhances the security of +3D point cloud classification models without compromising their accuracy on +benign samples. Furthermore, we explore the limitations of CloudFort and +discuss potential avenues for future research in the field of 3D point cloud +security. The proposed defense mechanism represents a significant step towards +ensuring the trustworthiness and reliability of point-cloud-based systems in +real-world applications. + +
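+ The abstract does not spell out the partitioning or voting scheme; a minimal
+sketch of the general pattern (classify spatially ablated copies of the cloud
+and aggregate votes so a localized trigger cannot dominate) might look as
+follows, with the octant split and majority vote as illustrative assumptions.
+
+import numpy as np
+
+def octant_labels(points):
+    """Assign each point to one of 8 octants around the cloud centroid."""
+    signs = (points > points.mean(axis=0)).astype(int)
+    return signs[:, 0] * 4 + signs[:, 1] * 2 + signs[:, 2]
+
+def ensemble_predict(points, classifier, num_classes):
+    """Classify 8 copies of the cloud, each with one octant removed,
+    and return the majority-vote label."""
+    octs = octant_labels(points)
+    votes = np.zeros(num_classes)
+    for o in range(8):
+        kept = points[octs != o]
+        votes[classifier(kept)] += 1
+    return int(votes.argmax())
+
+# toy usage with a stand-in classifier
+dummy = lambda pts: int(pts[:, 2].mean() > 0)  # pretend 2-class model
+cloud = np.random.randn(1024, 3)
+print(ensemble_predict(cloud, dummy, num_classes=2))
+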
+
+
+
+
+ + ☆ Surgical-DeSAM: Decoupling SAM for Instrument Segmentation in Robotic + Surgery + + +
+ Purpose: The recent Segment Anything Model (SAM) has demonstrated impressive
+performance with point, text or bounding box prompts, in various applications.
+However, in safety-critical surgical tasks, prompting is not possible due to
+(i) the lack of per-frame prompts for supervised learning, (ii) the
+impracticality of prompting frame-by-frame in a real-time tracking
+application, and (iii) the expense of annotating prompts for offline
+applications.
+ Methods: We develop Surgical-DeSAM to generate automatic bounding box prompts
+for decoupling SAM to obtain instrument segmentation in real-time robotic
+surgery. We utilise a commonly used detection architecture, DETR, and
+fine-tune it to obtain bounding box prompts for the instruments. We then
+employ decoupling SAM (DeSAM) by replacing the image encoder with the DETR
+encoder and fine-tune the prompt encoder and mask decoder to obtain instance
+segmentation for the surgical instruments. To improve detection performance,
+we adopt the Swin Transformer for better feature representation.
+ Results: The proposed method has been validated on two publicly available
+datasets from the MICCAI surgical instruments segmentation challenge, EndoVis
+2017 and 2018. The performance of our method is also compared with SOTA
+instrument segmentation methods and demonstrates significant improvements,
+with dice metrics of 89.62 and 90.70 for EndoVis 2017 and 2018, respectively.
+ Conclusion: Our extensive experiments and validations demonstrate that
+Surgical-DeSAM enables real-time instrument segmentation without any
+additional prompting and outperforms other SOTA segmentation methods.
+
+
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+ + ☆ GaussianTalker: Speaker-specific Talking Head Synthesis via 3D Gaussian + Splatting + + +
+ Recent works on audio-driven talking head synthesis using Neural Radiance +Fields (NeRF) have achieved impressive results. However, due to inadequate pose +and expression control caused by NeRF implicit representation, these methods +still have some limitations, such as unsynchronized or unnatural lip movements, +and visual jitter and artifacts. In this paper, we propose GaussianTalker, a +novel method for audio-driven talking head synthesis based on 3D Gaussian +Splatting. With the explicit representation property of 3D Gaussians, intuitive +control of the facial motion is achieved by binding Gaussians to 3D facial +models. GaussianTalker consists of two modules, Speaker-specific Motion +Translator and Dynamic Gaussian Renderer. Speaker-specific Motion Translator +achieves accurate lip movements specific to the target speaker through +universalized audio feature extraction and customized lip motion generation. +Dynamic Gaussian Renderer introduces Speaker-specific BlendShapes to enhance +facial detail representation via a latent pose, delivering stable and realistic +rendered videos. Extensive experimental results suggest that GaussianTalker +outperforms existing state-of-the-art methods in talking head synthesis, +delivering precise lip synchronization and exceptional visual quality. Our +method achieves rendering speeds of 130 FPS on NVIDIA RTX4090 GPU, +significantly exceeding the threshold for real-time rendering performance, and +can potentially be deployed on other hardware platforms. + +
+
+
+
+
+ + ☆ PointDifformer: Robust Point Cloud Registration With Neural Diffusion + and Transformer + + +
+ Point cloud registration is a fundamental technique in 3-D computer vision +with applications in graphics, autonomous driving, and robotics. However, +registration tasks under challenging conditions, under which noise or +perturbations are prevalent, can be difficult. We propose a robust point cloud +registration approach that leverages graph neural partial differential +equations (PDEs) and heat kernel signatures. Our method first uses graph neural +PDE modules to extract high dimensional features from point clouds by +aggregating information from the 3-D point neighborhood, thereby enhancing the +robustness of the feature representations. Then, we incorporate heat kernel +signatures into an attention mechanism to efficiently obtain corresponding +keypoints. Finally, a singular value decomposition (SVD) module with learnable +weights is used to predict the transformation between two point clouds. +Empirical experiments on a 3-D point cloud dataset demonstrate that our +approach not only achieves state-of-the-art performance for point cloud +registration but also exhibits better robustness to additive noise or 3-D shape +perturbations. + +
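+ The final SVD step builds on the classical weighted Kabsch/Procrustes
+solution for recovering a rigid transform from weighted correspondences; a
+minimal sketch with fixed weights standing in for the learned ones:
+
+import numpy as np
+
+def weighted_kabsch(src, dst, w):
+    """Rigid transform (R, t) minimizing sum_i w_i ||R @ src_i + t - dst_i||^2."""
+    w = w / w.sum()
+    mu_s = (w[:, None] * src).sum(axis=0)
+    mu_d = (w[:, None] * dst).sum(axis=0)
+    H = (src - mu_s).T @ (w[:, None] * (dst - mu_d))   # weighted cross-covariance
+    U, _, Vt = np.linalg.svd(H)
+    d = np.sign(np.linalg.det(Vt.T @ U.T))              # avoid reflections
+    R = Vt.T @ np.diag([1.0, 1.0, d]) @ U.T
+    t = mu_d - R @ mu_s
+    return R, t
+
+# toy usage: recover a known rotation and translation
+src = np.random.randn(100, 3)
+theta = 0.3
+R_true = np.array([[np.cos(theta), -np.sin(theta), 0.0],
+                   [np.sin(theta),  np.cos(theta), 0.0],
+                   [0.0, 0.0, 1.0]])
+dst = src @ R_true.T + np.array([0.5, -0.2, 1.0])
+R, t = weighted_kabsch(src, dst, np.ones(100))
+print(np.allclose(R, R_true, atol=1e-6))
+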
+
+ comment: Accepted by IEEE Transactions on Geoscience and Remote Sensing +
+
+
+
+
+ + ☆ 1st Place Solution to the 1st SkatingVerse Challenge + + +
+ This paper presents the winning solution for the 1st SkatingVerse Challenge. +We propose a method that involves several steps. To begin, we leverage the DINO +framework to extract the Region of Interest (ROI) and perform precise cropping +of the raw video footage. Subsequently, we employ three distinct models, namely +Unmasked Teacher, UniformerV2, and InfoGCN, to capture different aspects of the +data. By ensembling the prediction results based on logits, our solution +attains an impressive leaderboard score of 95.73%. + +
+
+ comment: 3 pages, 1st SkatingVerse Challenge, 18th IEEE International + Conference on Automatic Face and Gesture Recognition workshop +
+
+
+
+
+ + ☆ OccFeat: Self-supervised Occupancy Feature Prediction for Pretraining + BEV Segmentation Networks + + +
+ We introduce a self-supervised pretraining method, called OccFeat, for
+camera-only Bird's-Eye-View (BEV) segmentation networks. With OccFeat, we
+pretrain a BEV network via occupancy prediction and feature distillation
+tasks. Occupancy prediction provides a 3D geometric understanding of the scene
+to the model. However, the geometry learned is class-agnostic. Hence, we add
+semantic information to the model in the 3D space through distillation from a
+self-supervised pretrained image foundation model. Models pretrained with our
+method exhibit improved BEV semantic segmentation performance, particularly in
+low-data scenarios. Moreover, empirical results affirm the efficacy of
+integrating feature distillation with 3D occupancy prediction in our
+pretraining approach.
+
+
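+ A minimal sketch of how such a combined pretraining objective (occupancy
+prediction plus feature distillation toward a frozen teacher) might be
+assembled; the tensor shapes, loss choices, and weighting are assumptions, not
+OccFeat's exact formulation.
+
+import torch
+import torch.nn.functional as F
+
+def pretraining_loss(occ_logits, occ_target, student_feat, teacher_feat, w_distill=1.0):
+    """Occupancy prediction + feature distillation, as in BEV pretraining setups.
+
+    occ_logits, occ_target: (B, X, Y, Z) voxel occupancy prediction / 0-1 target
+    student_feat, teacher_feat: (B, N, C) features at sampled 3D locations
+    """
+    loss_occ = F.binary_cross_entropy_with_logits(occ_logits, occ_target)
+    # distillation: pull student features toward the frozen teacher's features
+    loss_distill = 1.0 - F.cosine_similarity(student_feat, teacher_feat.detach(), dim=-1).mean()
+    return loss_occ + w_distill * loss_distill
+
+# toy usage
+occ_logits = torch.randn(2, 64, 64, 8)
+occ_target = (torch.rand(2, 64, 64, 8) > 0.8).float()
+s = torch.randn(2, 512, 256)
+t = torch.randn(2, 512, 256)
+print(pretraining_loss(occ_logits, occ_target, s, t).item())
+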
+
+
+
+
+ + ☆ DHRNet: A Dual-Path Hierarchical Relation Network for Multi-Person Pose + Estimation + + +
+ Multi-person pose estimation (MPPE) presents a formidable yet crucial +challenge in computer vision. Most existing methods predominantly concentrate +on isolated interaction either between instances or joints, which is inadequate +for scenarios demanding concurrent localization of both instances and joints. +This paper introduces a novel CNN-based single-stage method, named Dual-path +Hierarchical Relation Network (DHRNet), to extract instance-to-joint and +joint-to-instance interactions concurrently. Specifically, we design a +dual-path interaction modeling module (DIM) that strategically organizes +cross-instance and cross-joint interaction modeling modules in two +complementary orders, enriching interaction information by integrating merits +from different correlation modeling branches. Notably, DHRNet excels in joint +localization by leveraging information from other instances and joints. +Extensive evaluations on challenging datasets, including COCO, CrowdPose, and +OCHuman datasets, showcase DHRNet's state-of-the-art performance. The code will +be released at https://github.com/YHDang/dhrnet-multi-pose-estimation. + +
+
+
+
+
+ + ☆ Collaborative Perception Datasets in Autonomous Driving: A Survey + + +
+ This survey offers a comprehensive examination of collaborative perception
+datasets in the context of Vehicle-to-Infrastructure (V2I), Vehicle-to-Vehicle
+(V2V), and Vehicle-to-Everything (V2X). It highlights the latest developments
+in large-scale benchmarks that accelerate advancements in perception tasks for
+autonomous vehicles. The paper systematically analyzes a variety of datasets,
+comparing them based on aspects such as diversity, sensor setup, quality,
+public availability, and their applicability to downstream tasks. It also
+highlights key challenges such as domain shift, sensor setup limitations, and
+gaps in dataset diversity and availability. The importance of addressing
+privacy and security concerns in data sharing and dataset creation is also
+emphasized. The conclusion underscores the necessity for comprehensive,
+globally accessible datasets and collaborative efforts from both technological
+and research communities to overcome these challenges and fully harness the
+potential of autonomous driving.
+
+
+
+ comment: 8 pages,3 figures +
+
+
+
+
+ + ☆ A Multimodal Feature Distillation with CNN-Transformer Network for Brain + Tumor Segmentation with Incomplete Modalities + + +
+ Existing brain tumor segmentation methods usually utilize multiple Magnetic
+Resonance Imaging (MRI) modalities in brain tumor images for segmentation,
+which can achieve better segmentation performance. However, in clinical
+applications, some modalities are missing due to resource constraints, leading
+to severe degradation in the performance of methods that assume complete
+modalities. In this paper, we propose a Multimodal feature distillation with
+Convolutional Neural Network (CNN)-Transformer hybrid network (MCTSeg) for
+accurate brain tumor segmentation with missing modalities. We first design a
+Multimodal Feature Distillation (MFD) module to distill feature-level
+multimodal knowledge into each unimodal branch so that complete modality
+information can be extracted. We further develop a Unimodal Feature
+Enhancement (UFE) module to model the relationship between global and local
+information semantically. Finally, we build a Cross-Modal Fusion (CMF) module
+to explicitly align the global correlations among different modalities even
+when some modalities are missing. Complementary features within and across
+different modalities are refined via the CNN-Transformer hybrid architectures
+in both the UFE and CMF modules, where local and global dependencies are both
+captured. Our ablation study demonstrates the importance of the proposed
+modules with CNN-Transformer networks and the convolutional blocks in
+Transformer for improving the performance of brain tumor segmentation with
+missing modalities. Extensive experiments on the BraTS2018 and BraTS2020
+datasets show that the proposed MCTSeg framework outperforms the
+state-of-the-art methods in missing modality cases. Our code is available at:
+https://github.com/mkang315/MCTSeg.
+
+
+
+
+
+
+ + ☆ Ungeneralizable Examples CVPR2024 + + +
+ The training of contemporary deep learning models heavily relies on publicly
+available data, posing a risk of unauthorized access to online data and
+raising concerns about data privacy. Current approaches to creating
+unlearnable data involve incorporating small, specially designed noises, but
+these methods strictly limit data usability, overlooking its potential usage
+in authorized scenarios. In this paper, we extend the concept of unlearnable
+data to conditional data learnability and introduce UnGeneralizable Examples
+(UGEs). UGEs exhibit learnability for authorized users while maintaining
+unlearnability for potential hackers. The protector defines the authorized
+network and optimizes UGEs to match the gradients of the original data and its
+ungeneralizable version, ensuring learnability. To prevent unauthorized
+learning, UGEs are trained by maximizing a designated distance loss in a
+common feature space. Additionally, to further safeguard the authorized side
+from potential attacks, we introduce additional undistillation optimization.
+Experimental results on multiple datasets and various networks demonstrate
+that the proposed UGEs framework preserves data usability while reducing
+training performance on hacker networks, even under different types of
+attacks.
+
+
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Infusion: Preventing Customized Text-to-Image Diffusion from Overfitting + + +
+ Text-to-image (T2I) customization aims to create images that embody specific
+visual concepts delineated in textual descriptions. However, existing works
+still face a main challenge, concept overfitting. To tackle this challenge, we
+first analyze overfitting, categorizing it into concept-agnostic overfitting,
+which undermines non-customized concept knowledge, and concept-specific
+overfitting, which is confined to customization on limited modalities, i.e.,
+backgrounds, layouts, styles. To evaluate the overfitting degree, we further
+introduce two metrics, i.e., Latent Fisher divergence and Wasserstein metric,
+to measure the distribution changes of non-customized and customized concepts,
+respectively. Drawing from the analysis, we propose Infusion, a T2I
+customization method that enables the learning of target concepts to avoid
+being constrained by limited training modalities, while preserving
+non-customized knowledge. Remarkably, Infusion achieves this feat with
+remarkable efficiency, requiring a mere 11KB of trained parameters. Extensive
+experiments also demonstrate that our approach outperforms state-of-the-art
+methods in both single and multi-concept customized generation.
+
+
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Distilled Datamodel with Reverse Gradient Matching CVPR2024 + + +
+ The proliferation of large-scale AI models trained on extensive datasets has +revolutionized machine learning. With these models taking on increasingly +central roles in various applications, the need to understand their behavior +and enhance interpretability has become paramount. To investigate the impact of +changes in training data on a pre-trained model, a common approach is +leave-one-out retraining. This entails systematically altering the training +dataset by removing specific samples to observe resulting changes within the +model. However, retraining the model for each altered dataset presents a +significant computational challenge, given the need to perform this operation +for every dataset variation. In this paper, we introduce an efficient framework +for assessing data impact, comprising offline training and online evaluation +stages. During the offline training phase, we approximate the influence of +training data on the target model through a distilled synset, formulated as a +reversed gradient matching problem. For online evaluation, we expedite the +leave-one-out process using the synset, which is then utilized to compute the +attribution matrix based on the evaluation objective. Experimental evaluations, +including training data attribution and assessments of data quality, +demonstrate that our proposed method achieves comparable model behavior +evaluation while significantly speeding up the process compared to the direct +retraining method. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ CoFInAl: Enhancing Action Quality Assessment with Coarse-to-Fine + Instruction Alignment IJCAI 2024 + + +
+ Action Quality Assessment (AQA) is pivotal for quantifying actions across +domains like sports and medical care. Existing methods often rely on +pre-trained backbones from large-scale action recognition datasets to boost +performance on smaller AQA datasets. However, this common strategy yields +suboptimal results due to the inherent struggle of these backbones to capture +the subtle cues essential for AQA. Moreover, fine-tuning on smaller datasets +risks overfitting. To address these issues, we propose Coarse-to-Fine +Instruction Alignment (CoFInAl). Inspired by recent advances in large language +model tuning, CoFInAl aligns AQA with broader pre-trained tasks by +reformulating it as a coarse-to-fine classification task. Initially, it learns +grade prototypes for coarse assessment and then utilizes fixed sub-grade +prototypes for fine-grained assessment. This hierarchical approach mirrors the +judging process, enhancing interpretability within the AQA framework. +Experimental results on two long-term AQA datasets demonstrate CoFInAl achieves +state-of-the-art performance with significant correlation gains of 5.49% and +3.55% on Rhythmic Gymnastics and Fis-V, respectively. Our code is available at +https://github.com/ZhouKanglei/CoFInAl_AQA. + +
+
+ comment: Accepted by IJCAI 2024 +
+
+
+
+
+ + ☆ Challenges in automatic and selective plant-clearing + + +
+ With the advent of multispectral imagery and AI, there have been numerous +works on automatic plant segmentation for purposes such as counting, picking, +health monitoring, localized pesticide delivery, etc. In this paper, we tackle +the related problem of automatic and selective plant-clearing in a sustainable +forestry context, where an autonomous machine has to detect and avoid specific +plants while clearing any weeds which may compete with the species being +cultivated. Such an autonomous system requires a high level of robustness to +weather conditions, plant variability, terrain and weeds while remaining cheap +and easy to maintain. We notably discuss the lack of robustness of spectral +imagery, investigate the impact of the reference database's size and discuss +issues specific to AI systems operating in uncontrolled environments. + +
+
+
+
+
+ + ☆ Zero-Shot Character Identification and Speaker Prediction in Comics via + Iterative Multimodal Fusion + + +
+ Recognizing characters and predicting speakers of dialogue are critical for
+comic processing tasks, such as voice generation or translation. However,
+because characters vary by comic title, supervised learning approaches like
+training character classifiers which require specific annotations for each
+comic title are infeasible. This motivates us to propose a novel zero-shot
+approach, allowing machines to identify characters and predict speaker names
+based solely on unannotated comic images. In spite of their importance in
+real-world applications, these tasks have largely remained unexplored due to
+challenges in story comprehension and multimodal integration. Recent large
+language models (LLMs) have shown great capability for text understanding and
+reasoning, while their application to multimodal content analysis is still an
+open problem. To address this problem, we propose an iterative multimodal
+framework, the first to employ multimodal information for both character
+identification and speaker prediction tasks. Our experiments demonstrate the
+effectiveness of the proposed framework, establishing a robust baseline for
+these tasks. Furthermore, since our method requires no training data or
+annotations, it can be used as-is on any comic series.
+
+
+
+
+
+
+ + ☆ Dynamic Proxy Domain Generalizes the Crowd Localization by Better Binary + Segmentation + + +
+ Crowd localization aims to predict the precise location of each instance
+within an image. Current advanced methods formulate it as pixel-wise binary
+classification to tackle congested prediction, in which pixel-level thresholds
+binarize the confidence of a prediction being a pedestrian head. Since crowd
+scenes have extremely varied contents, counts and scales, the
+confidence-threshold learner is fragile and generalizes poorly when
+encountering a domain knowledge shift. Moreover, in most cases the target
+domain is unknown during training. Hence, it is imperative to explore how to
+enhance the generalization of the confidence-threshold locator to the latent
+target domain. In this paper, we propose a Dynamic Proxy Domain (DPD) method
+to generalize the learner under domain shift. Concretely, based on a
+theoretical analysis of the upper bound on the generalization error risk of a
+binary classifier on the latent target domain, we propose to introduce a
+generated proxy domain to facilitate generalization. Then, based on this
+theory, we design a DPD algorithm composed of a training paradigm and a proxy
+domain generator to enhance the domain generalization of the
+confidence-threshold learner. Besides, we evaluate our method on five kinds of
+domain shift scenarios, demonstrating its effectiveness in generalizing crowd
+localization. Our code will be available at
+https://github.com/zhangda1018/DPD.
+
+
+
+
+
+
+ + ☆ RHanDS: Refining Malformed Hands for Generated Images with Decoupled + Structure and Style Guidance + + +
+ Although diffusion models can generate high-quality human images, their
+applications are limited by the instability in generating hands with correct
+structures. Some previous works mitigate the problem by considering hand
+structure yet struggle to maintain style consistency between refined malformed
+hands and other image regions. In this paper, we aim to solve the problem of
+inconsistency regarding hand structure and style. We propose a conditional
+diffusion-based framework, RHanDS, to refine the hand region with the help of
+decoupled structure and style guidance. Specifically, the structure guidance
+is the hand mesh reconstructed from the malformed hand, serving to correct the
+hand structure. The style guidance is a hand image, e.g., the malformed hand
+itself, and is employed to furnish the style reference for hand refining. In
+order to suppress the structure leakage when referencing hand style and
+effectively utilize hand data to improve the capability of the model, we build
+a multi-style hand dataset and introduce a two-stage training strategy. In the
+first stage, we use paired hand images for training to generate hands with the
+same style as the reference. In the second stage, various hand images
+generated based on the human mesh are used for training to enable the model to
+gain control over the hand structure. We evaluate our method and counterparts
+on the test dataset of the proposed multi-style hand dataset. The experimental
+results show that RHanDS can effectively refine hands with correct structure
+and style compared with previous methods. The codes and datasets will be
+available soon.
+
+
+
+
+
+
+ + ☆ Structure-Aware Human Body Reshaping with Adaptive Affinity-Graph + Network + + +
+ Given a source portrait, the automatic human body reshaping task aims at
+editing it to an aesthetic body shape. As the technology has been widely used
+in media, several methods have been proposed mainly focusing on generating
+optical flow to warp the body shape. However, those previous works only
+consider the local transformation of different body parts (arms, torso, and
+legs), ignoring the global affinity, and limiting the capacity to ensure
+consistency and quality across the entire body. In this paper, we propose a
+novel Adaptive Affinity-Graph Network (AAGN), which extracts the global
+affinity between different body parts to enhance the quality of the generated
+optical flow. Specifically, our AAGN primarily introduces the following
+designs: (1) we propose an Adaptive Affinity-Graph (AAG) Block that leverages
+the characteristic of a fully connected graph. AAG represents different body
+parts as nodes in an adaptive fully connected graph and captures all the
+affinities between nodes to obtain a global affinity map. The design could
+better improve the consistency between body parts. (2) Besides, since
+high-frequency details are crucial for photo aesthetics, a Body Shape
+Discriminator (BSD) is designed to extract information from both the
+high-frequency and spatial domains. Particularly, an SRM filter is utilized to
+extract high-frequency details, which are combined with spatial features as
+input to the BSD. With this design, BSD guides the Flow Generator (FG) to pay
+attention to various fine details rather than rigid pixel-level fitting.
+Extensive experiments conducted on the BR-5K dataset demonstrate that our
+framework significantly enhances the aesthetic appeal of reshaped photos,
+surpassing all previous work to achieve state-of-the-art in all evaluation
+metrics.
+
+
+
+ comment: 11 pages; +
+
+
+
+
+ + ☆ Non-Uniform Exposure Imaging via Neuromorphic Shutter Control + + +
+ By leveraging the blur-noise trade-off, imaging with non-uniform exposures +largely extends the image acquisition flexibility in harsh environments. +However, the limitation of conventional cameras in perceiving intra-frame +dynamic information prevents existing methods from being implemented in the +real-world frame acquisition for real-time adaptive camera shutter control. To +address this challenge, we propose a novel Neuromorphic Shutter Control (NSC) +system to avoid motion blurs and alleviate instant noises, where the extremely +low latency of events is leveraged to monitor the real-time motion and +facilitate the scene-adaptive exposure. Furthermore, to stabilize the +inconsistent Signal-to-Noise Ratio (SNR) caused by the non-uniform exposure +times, we propose an event-based image denoising network within a +self-supervised learning paradigm, i.e., SEID, exploring the statistics of +image noises and inter-frame motion information of events to obtain artificial +supervision signals for high-quality imaging in real-world scenes. To +illustrate the effectiveness of the proposed NSC, we implement it in hardware +by building a hybrid-camera imaging prototype system, with which we collect a +real-world dataset containing well-synchronized frames and events in diverse +scenarios with different target scenes and motion patterns. Experiments on the +synthetic and real-world datasets demonstrate the superiority of our method +over state-of-the-art approaches. + +
+
+
+
+
+ + ☆ 360VOTS: Visual Object Tracking and Segmentation in Omnidirectional + Videos + + +
+ Visual object tracking and segmentation in omnidirectional videos are
+challenging due to the wide field-of-view and large spherical distortion
+brought by 360° images. To alleviate these problems, we introduce a novel
+representation, extended bounding field-of-view (eBFoV), for target
+localization and use it as the foundation of a general 360 tracking framework
+which is applicable for both omnidirectional visual object tracking and
+segmentation tasks. Building upon our previous work on omnidirectional visual
+object tracking (360VOT), we propose a comprehensive dataset and benchmark
+that incorporates a new component called omnidirectional video object
+segmentation (360VOS). The 360VOS dataset includes 290 sequences accompanied
+by dense pixel-wise masks and covers a broader range of target categories. To
+support both the development and evaluation of algorithms in this domain, we
+divide the dataset into a training subset with 170 sequences and a testing
+subset with 120 sequences. Furthermore, we tailor evaluation metrics for both
+omnidirectional tracking and segmentation to ensure rigorous assessment.
+Through extensive experiments, we benchmark state-of-the-art approaches and
+demonstrate the effectiveness of our proposed 360 tracking framework and
+training dataset. Homepage: https://360vots.hkustvgd.com/
+
+
+
+
+
+
+ + ☆ PeLiCal: Targetless Extrinsic Calibration via Penetrating Lines for + RGB-D Cameras with Limited Co-visibility + + +
+ RGB-D cameras are crucial in robotic perception, given their ability to
+produce images augmented with depth data. However, their limited FOV often
+requires multiple cameras to cover a broader area. In multi-camera RGB-D
+setups, the goal is typically to reduce camera overlap, optimizing spatial
+coverage with as few cameras as possible. The extrinsic calibration of these
+systems introduces additional complexities. Existing methods for extrinsic
+calibration either necessitate specific tools or highly depend on the accuracy
+of camera motion estimation. To address these issues, we present PeLiCal, a
+novel line-based calibration approach for RGB-D camera systems exhibiting
+limited overlap. Our method leverages long line features from surroundings,
+and filters out outliers with a novel convergence voting algorithm, achieving
+targetless, real-time, and outlier-robust performance compared to existing
+methods. We open source our implementation at
+https://github.com/joomeok/PeLiCal.git.
+
+
+
+
+
+
+ + ☆ Boter: Bootstrapping Knowledge Selection and Question Answering for + Knowledge-based VQA + + +
+ Knowledge-based Visual Question Answering (VQA) requires models to +incorporate external knowledge to respond to questions about visual content. +Previous methods mostly follow the "retrieve and generate" paradigm. Initially, +they utilize a pre-trained retriever to fetch relevant knowledge documents, +subsequently employing them to generate answers. While these methods have +demonstrated commendable performance in the task, they possess limitations: (1) +they employ an independent retriever to acquire knowledge solely based on the +similarity between the query and knowledge embeddings, without assessing +whether the knowledge document is truly conducive to helping answer the +question; (2) they convert the image into text and then conduct retrieval and +answering in natural language space, which may not ensure comprehensive +acquisition of all image information. To address these limitations, we propose +Boter, a novel framework designed to bootstrap knowledge selection and question +answering by leveraging the robust multimodal perception capabilities of the +Multimodal Large Language Model (MLLM). The framework consists of two modules: +Selector and Answerer, where both are initialized by the MLLM and +parameter-efficiently finetuned in a simple cycle: find key knowledge in the +retrieved knowledge documents using the Selector, and then use them to finetune +the Answerer to predict answers; obtain the pseudo-labels of key knowledge +documents based on the predictions of the Answerer and weak supervision labels, +and then finetune the Selector to select key knowledge; repeat. Our framework +significantly enhances the performance of the baseline on the challenging +open-domain Knowledge-based VQA benchmark, OK-VQA, achieving a state-of-the-art +accuracy of 62.83%. + +
+
+
+
+
+ + ☆ Gorgeous: Create Your Desired Character Facial Makeup from Any Ideas + + +
+ Contemporary makeup transfer methods primarily focus on replicating makeup
+from one face to another, considerably limiting their use in creating diverse
+and creative character makeup essential for visual storytelling. Such methods
+typically fail to address the need for uniqueness and contextual relevance,
+specifically aligning with character and story settings, as they depend
+heavily on existing facial makeup in reference images. This approach also
+presents a significant challenge when attempting to source a perfectly matched
+facial makeup style, further complicating the creation of makeup designs
+inspired by various story elements, such as theme, background, and props that
+do not necessarily feature faces. To address these limitations, we introduce
+Gorgeous, a novel diffusion-based makeup application method that goes beyond
+simple transfer by innovatively crafting unique and thematic facial makeup.
+Unlike traditional methods, Gorgeous does not require the presence of a face
+in the reference images. Instead, it draws artistic inspiration from a minimal
+set of three to five images, which can be of any type, and transforms these
+elements into practical makeup applications directly on the face. Our
+comprehensive experiments demonstrate that Gorgeous can effectively generate
+distinctive character facial makeup inspired by the chosen thematic reference
+images. This approach opens up new possibilities for integrating broader story
+elements into character makeup, thereby enhancing the narrative depth and
+visual impact in storytelling.
+
+
+
+ comment: Project page: https://github.com/JiaWeiSii/gorgeous/ +
+
+
+
+
+ + ☆ Exploring Kinetic Curves Features for the Classification of Benign and + Malignant Breast Lesions in DCE-MRI + + +
+ Breast cancer is the most common malignant tumor among women and the second +cause of cancer-related death. Early diagnosis in clinical practice is crucial +for timely treatment and prognosis. Dynamic contrast-enhanced magnetic +resonance imaging (DCE-MRI) has revealed great usability in the preoperative +diagnosis and assessing therapy effects thanks to its capability to reflect the +morphology and dynamic characteristics of breast lesions. However, most +existing computer-assisted diagnosis algorithms only consider conventional +radiomic features when classifying benign and malignant lesions in DCE-MRI. In +this study, we propose to fully leverage the dynamic characteristics from the +kinetic curves as well as the radiomic features to boost the classification +accuracy of benign and malignant breast lesions. The proposed method is a fully +automated solution by directly analyzing the 3D features from the DCE-MRI. The +proposed method is evaluated on an in-house dataset including 200 DCE-MRI scans +with 298 breast tumors (172 benign and 126 malignant tumors), achieving +favorable classification accuracy with an area under curve (AUC) of 0.94. By +simultaneously considering the dynamic and radiomic features, it is beneficial +to effectively distinguish between benign and malignant breast lesions. + +
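+ The overall recipe, concatenating kinetic-curve descriptors with radiomic
+features and scoring a classifier by AUC, can be sketched as below; the
+placeholder features and the choice of a random forest are assumptions for
+illustration only, not the paper's actual feature definitions or model.
+
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import cross_val_predict
+from sklearn.metrics import roc_auc_score
+
+rng = np.random.default_rng(0)
+n_lesions = 298
+kinetic = rng.normal(size=(n_lesions, 12))    # stand-in kinetic-curve descriptors
+radiomic = rng.normal(size=(n_lesions, 100))  # stand-in 3D radiomic features
+y = rng.integers(0, 2, size=n_lesions)        # 0 = benign, 1 = malignant
+
+X = np.hstack([kinetic, radiomic])            # fuse both feature families
+probs = cross_val_predict(RandomForestClassifier(random_state=0), X, y,
+                          cv=5, method="predict_proba")[:, 1]
+print("AUC:", roc_auc_score(y, probs))
+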
+
+ comment: 6 pages, 8 figures, conference +
+
+
+
+
+ + ☆ MaterialSeg3D: Segmenting Dense Materials from 2D Priors for 3D Assets + + +
+ Driven by powerful image diffusion models, recent research has achieved the +automatic creation of 3D objects from textual or visual guidance. By performing +score distillation sampling (SDS) iteratively across different views, these +methods succeed in lifting 2D generative prior to the 3D space. However, such a +2D generative image prior bakes the effect of illumination and shadow into the +texture. As a result, material maps optimized by SDS inevitably involve +spurious correlated components. The absence of precise material definition +makes it infeasible to relight the generated assets reasonably in novel scenes, +which limits their application in downstream scenarios. In contrast, humans can +effortlessly circumvent this ambiguity by deducing the material of the object +from its appearance and semantics. Motivated by this insight, we propose +MaterialSeg3D, a 3D asset material generation framework to infer underlying +material from the 2D semantic prior. Based on such a prior model, we devise a +mechanism to parse material in 3D space. We maintain a UV stack, each map of +which is unprojected from a specific viewpoint. After traversing all +viewpoints, we fuse the stack through a weighted voting scheme and then employ +region unification to ensure the coherence of the object parts. To fuel the +learning of semantics prior, we collect a material dataset, named Materialized +Individual Objects (MIO), which features abundant images, diverse categories, +and accurate annotations. Extensive quantitative and qualitative experiments +demonstrate the effectiveness of our method. + +
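+ The multi-view fusion step (a UV stack of per-view material predictions
+merged by weighted voting) can be pictured with the following sketch; the
+confidence weights and label set are illustrative assumptions.
+
+import numpy as np
+
+def fuse_uv_stack(label_maps, weight_maps, num_materials):
+    """Weighted voting over per-view material label maps in UV space.
+
+    label_maps:  (V, H, W) integer material labels unprojected from each view
+    weight_maps: (V, H, W) per-texel confidence/visibility weights
+    """
+    V, H, W = label_maps.shape
+    votes = np.zeros((num_materials, H, W))
+    for v in range(V):
+        for m in range(num_materials):
+            votes[m] += weight_maps[v] * (label_maps[v] == m)
+    return votes.argmax(axis=0)  # (H, W) fused material map
+
+# toy usage: 4 views, 3 material classes, 128x128 UV map
+labels = np.random.randint(0, 3, size=(4, 128, 128))
+weights = np.random.rand(4, 128, 128)
+fused = fuse_uv_stack(labels, weights, num_materials=3)
+print(fused.shape, fused.max())
+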
+
+
+
+
+ + ☆ NeRF-DetS: Enhancing Multi-View 3D Object Detection with + Sampling-adaptive Network of Continuous NeRF-based Representation + + +
+ As a preliminary work, NeRF-Det unifies the tasks of novel view synthesis and +3D perception, demonstrating that perceptual tasks can benefit from novel view +synthesis methods like NeRF, significantly improving the performance of indoor +multi-view 3D object detection. Using the geometry MLP of NeRF to direct the +attention of detection head to crucial parts and incorporating self-supervised +loss from novel view rendering contribute to the achieved improvement. To +better leverage the notable advantages of the continuous representation through +neural rendering in space, we introduce a novel 3D perception network +structure, NeRF-DetS. The key component of NeRF-DetS is the Multi-level +Sampling-Adaptive Network, making the sampling process adaptively from coarse +to fine. Also, we propose a superior multi-view information fusion method, +known as Multi-head Weighted Fusion. This fusion approach efficiently addresses +the challenge of losing multi-view information when using arithmetic mean, +while keeping low computational costs. NeRF-DetS outperforms competitive +NeRF-Det on the ScanNetV2 dataset, by achieving +5.02% and +5.92% improvement +in mAP@.25 and mAP@.50, respectively. + +
+
+
+
+
+ + ☆ Cross-Task Multi-Branch Vision Transformer for Facial Expression and + Mask Wearing Classification + + +
+ With wearing masks becoming a new cultural norm, facial expression
+recognition (FER) while taking masks into account has become a significant
+challenge. In this paper, we propose a unified multi-branch vision transformer
+for facial expression recognition and mask wearing classification tasks. Our
+approach extracts shared features for both tasks using a dual-branch
+architecture that obtains multi-scale feature representations. Furthermore, we
+propose a cross-task fusion phase that processes tokens for each task with
+separate branches, while exchanging information using a cross attention
+module. Our proposed framework reduces the overall complexity compared with
+using separate networks for both tasks by the simple yet effective cross-task
+fusion phase. Extensive experiments demonstrate that our proposed model
+performs better than or on par with different state-of-the-art methods on both
+the facial expression recognition and facial mask wearing classification
+tasks.
+
+
+
+
+
+
+ + ☆ Brain-Inspired Continual Learning-Robust Feature Distillation and + Re-Consolidation for Class Incremental Learning + + +
+ Artificial intelligence (AI) and neuroscience share a rich history, with +advancements in neuroscience shaping the development of AI systems capable of +human-like knowledge retention. Leveraging insights from neuroscience and +existing research in adversarial and continual learning, we introduce a novel +framework comprising two core concepts: feature distillation and +re-consolidation. Our framework, named Robust Rehearsal, addresses the +challenge of catastrophic forgetting inherent in continual learning (CL) +systems by distilling and rehearsing robust features. Inspired by the mammalian +brain's memory consolidation process, Robust Rehearsal aims to emulate the +rehearsal of distilled experiences during learning tasks. Additionally, it +mimics memory re-consolidation, where new experiences influence the integration +of past experiences to mitigate forgetting. Extensive experiments conducted on +CIFAR10, CIFAR100, and real-world helicopter attitude datasets showcase the +superior performance of CL models trained with Robust Rehearsal compared to +baseline methods. Furthermore, examining different optimization training +objectives-joint, continual, and adversarial learning-we highlight the crucial +role of feature learning in model performance. This underscores the +significance of rehearsing CL-robust samples in mitigating catastrophic +forgetting. In conclusion, aligning CL approaches with neuroscience insights +offers promising solutions to the challenge of catastrophic forgetting, paving +the way for more robust and human-like AI systems. + +
+
+
+
+
+ + ☆ The Adversarial AI-Art: Understanding, Generation, Detection, and + Benchmarking + + +
+ Generative AI models can produce high-quality images based on text prompts.
+The generated images often appear indistinguishable from images generated by
+conventional optical photography devices or created by human artists (i.e.,
+real images). While the outstanding performance of such generative models is
+generally well received, security concerns arise. For instance, such image
+generators could be used to facilitate fraud or scam schemes, generate and
+spread misinformation, or produce fabricated artworks. In this paper, we
+present a systematic attempt at understanding and detecting AI-generated
+images (AI-art) in adversarial scenarios. First, we collect and share a
+dataset of real images and their corresponding artificial counterparts
+generated by four popular AI image generators. The dataset, named ARIA,
+contains over 140K images in five categories: artworks (painting), social
+media images, news photos, disaster scenes, and anime pictures. This dataset
+can be used as a foundation to support future research on adversarial AI-art.
+Next, we present a user study that employs the ARIA dataset to evaluate
+whether real-world users can distinguish real images from AI-generated ones,
+with or without reference images. In a benchmarking study, we further evaluate
+whether state-of-the-art open-source and commercial AI image detectors can
+effectively identify the images in the ARIA dataset. Finally, we present a
+ResNet-50 classifier and evaluate its accuracy and transferability on the ARIA
+dataset.
+
+
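+ A ResNet-50 real-vs-AI classifier of the kind used as the final baseline can
+be set up with torchvision roughly as follows; the folder layout, transforms,
+and hyperparameters are placeholders, not the paper's training recipe.
+
+import torch
+import torch.nn as nn
+from torchvision import models, transforms, datasets
+
+# binary classifier: real image (0) vs AI-generated image (1)
+model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
+model.fc = nn.Linear(model.fc.in_features, 2)
+
+tf = transforms.Compose([
+    transforms.Resize(256),
+    transforms.CenterCrop(224),
+    transforms.ToTensor(),
+])
+# hypothetical folder layout: aria/train/{real,ai}/*.jpg
+train_set = datasets.ImageFolder("aria/train", transform=tf)
+loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)
+
+opt = torch.optim.AdamW(model.parameters(), lr=1e-4)
+loss_fn = nn.CrossEntropyLoss()
+model.train()
+for images, labels in loader:
+    opt.zero_grad()
+    loss = loss_fn(model(images), labels)
+    loss.backward()
+    opt.step()
+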
+
+
+
+
+ + ☆ UVMap-ID: A Controllable and Personalized UV Map Generative Model + + +
+ Recently, diffusion models have made significant strides in synthesizing +realistic 2D human images based on provided text prompts. Building upon this, +researchers have extended 2D text-to-image diffusion models into the 3D domain +for generating human textures (UV Maps). However, some important problems about +UV Map Generative models are still not solved, i.e., how to generate +personalized texture maps for any given face image, and how to define and +evaluate the quality of these generated texture maps. To solve the above +problems, we introduce a novel method, UVMap-ID, which is a controllable and +personalized UV Map generative model. Unlike traditional large-scale training +methods in 2D, we propose to fine-tune a pre-trained text-to-image diffusion +model which is integrated with a face fusion module for achieving ID-driven +customized generation. To support the finetuning strategy, we introduce a +small-scale attribute-balanced training dataset, including high-quality +textures with labeled text and Face ID. Additionally, we introduce some metrics +to evaluate the multiple aspects of the textures. Finally, both quantitative +and qualitative analyses demonstrate the effectiveness of our method in +controllable and personalized UV Map generation. Code is publicly available via +https://github.com/twowwj/UVMap-ID. + +
+
+
+
+
+ + ☆ "Where am I?" Scene Retrieval with Language + + +
+ Natural language interfaces to embodied AI are becoming more ubiquitous in +our daily lives. This opens further opportunities for language-based +interaction with embodied agents, such as a user instructing an agent to +execute some task in a specific location. For example, "put the bowls back in +the cupboard next to the fridge" or "meet me at the intersection under the red +sign." As such, we need methods that interface between natural language and map +representations of the environment. To this end, we explore the question of +whether we can use an open-set natural language query to identify a scene +represented by a 3D scene graph. We define this task as "language-based +scene-retrieval" and it is closely related to "coarse-localization," but we are +instead searching for a match from a collection of disjoint scenes and not +necessarily a large-scale continuous map. Therefore, we present +Text2SceneGraphMatcher, a "scene-retrieval" pipeline that learns joint +embeddings between text descriptions and scene graphs to determine if they are +matched. The code, trained models, and datasets will be made public. + +
+
+
+
+
+ + ☆ Adaptive Local Binary Pattern: A Novel Feature Descriptor for Enhanced + Analysis of Kidney Abnormalities in CT Scan Images using ensemble based + Machine Learning Approach + + +
+ The shortage of nephrologists and the growing public health concern over
+renal failure have spurred the demand for AI systems capable of autonomously
+detecting kidney abnormalities. Renal failure, marked by a gradual decline in
+kidney function, can result from factors like cysts, stones, and tumors.
+Chronic kidney disease may go unnoticed initially, leading to untreated cases
+until they reach an advanced stage. The dataset, comprising 12,427 images from
+multiple hospitals in Dhaka, was categorized into four groups: cyst, tumor,
+stone, and normal. Our methodology aims to enhance CT scan image quality using
+cropping, resizing, and CLAHE techniques, followed by feature extraction with
+our proposed Adaptive Local Binary Pattern (A-LBP) method, which we compare
+with the state-of-the-art local binary pattern (LBP) method. Our proposed
+features are fed into classifiers such as Random Forest, Decision Tree, Naive
+Bayes, K-Nearest Neighbor, and SVM. We explored an ensemble model with soft
+voting to get a more robust model for our task. We achieved the highest
+accuracy of more than 99% using our feature descriptor and an ensemble of five
+classifiers (Random Forest, Decision Tree, Naive Bayes, K-Nearest Neighbor,
+Support Vector Machine) with the soft voting method.
+
+
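+ The soft-voting ensemble over the five listed classifiers maps directly onto
+scikit-learn's VotingClassifier; a minimal sketch on synthetic placeholder
+features (the A-LBP descriptor itself is not reproduced here):
+
+from sklearn.ensemble import RandomForestClassifier, VotingClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.datasets import make_classification
+from sklearn.model_selection import train_test_split
+
+# placeholder feature matrix standing in for A-LBP descriptors of CT images
+X, y = make_classification(n_samples=2000, n_features=64, n_classes=4,
+                           n_informative=16, random_state=0)
+X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
+
+ensemble = VotingClassifier(
+    estimators=[
+        ("rf", RandomForestClassifier(n_estimators=200, random_state=0)),
+        ("dt", DecisionTreeClassifier(random_state=0)),
+        ("nb", GaussianNB()),
+        ("knn", KNeighborsClassifier(n_neighbors=5)),
+        ("svm", SVC(probability=True, random_state=0)),  # probabilities needed for soft voting
+    ],
+    voting="soft",
+)
+ensemble.fit(X_tr, y_tr)
+print("accuracy:", ensemble.score(X_te, y_te))
+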
+
+ comment: 17 pages, 5 tables, 4 figures +
+
+
+
+
+ + ☆ UVEB: A Large-scale Benchmark and Baseline Towards Real-World Underwater + Video Enhancement CVPR2024 + + +
+ Learning-based underwater image enhancement (UIE) methods have made great
+progress. However, the lack of large-scale and high-quality paired training
+samples has become the main bottleneck hindering the development of UIE. The
+inter-frame information in underwater videos can accelerate or optimize the
+UIE process. Thus, we constructed the first large-scale high-resolution
+underwater video enhancement benchmark (UVEB) to promote the development of
+underwater vision. It contains 1,308 pairs of video sequences and more than
+453,000 high-resolution frame pairs, 38% of which are Ultra-High-Definition
+(UHD) 4K. UVEB comes from multiple countries, containing various scenes and
+video degradation types to adapt to diverse and complex underwater
+environments. We also propose the first supervised underwater video
+enhancement method, UVE-Net. UVE-Net converts the current frame information
+into convolutional kernels and passes them to adjacent frames for efficient
+inter-frame information exchange. By fully utilizing the redundant degraded
+information of underwater videos, UVE-Net completes video enhancement better.
+Experiments show the effective network design and good performance of UVE-Net.
+
+
+
+ comment: 10 pages, accepted by CVPR2024
+
+
+
+
+
+ + ☆ SwinFuSR: an image fusion-inspired model for RGB-guided thermal image + super-resolution CVPR 2024 + + +
+ Thermal imaging plays a crucial role in various applications, but the
+inherent low resolution of commonly available infrared (IR) cameras limits its
+effectiveness. Conventional super-resolution (SR) methods often struggle with
+thermal images due to their lack of high-frequency details. Guided SR
+leverages information from a high-resolution image, typically in the visible
+spectrum, to enhance the reconstruction of a high-res IR image from the
+low-res input. Inspired by SwinFusion, we propose SwinFuSR, a guided SR
+architecture based on Swin transformers. In real-world scenarios, however, the
+guiding modality (e.g. RGB image) may be missing, so we propose a training
+method that improves the robustness of the model in this case. Our method has
+few parameters and outperforms state-of-the-art models in terms of Peak Signal
+to Noise Ratio (PSNR) and Structural SIMilarity (SSIM). In Track 2 of the PBVS
+2024 Thermal Image Super-Resolution Challenge, it achieves 3rd place in the
+PSNR metric. Our code and pretrained weights are available at
+https://github.com/VisionICLab/SwinFuSR.
+
+
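+ For reference, the two reported metrics can be computed as below for a
+super-resolved image against its ground truth; this is only the standard
+PSNR/SSIM definition, not the challenge's evaluation code.
+
+import numpy as np
+from skimage.metrics import structural_similarity
+
+def psnr(pred, gt, max_val=1.0):
+    """Peak Signal-to-Noise Ratio for images scaled to [0, max_val]."""
+    mse = np.mean((pred - gt) ** 2)
+    return 10.0 * np.log10(max_val ** 2 / mse)
+
+# toy usage: ground-truth IR image and a slightly noisy "reconstruction"
+gt = np.random.rand(240, 320)
+pred = np.clip(gt + 0.02 * np.random.randn(240, 320), 0.0, 1.0)
+print("PSNR:", psnr(pred, gt))
+print("SSIM:", structural_similarity(pred, gt, data_range=1.0))
+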
+
+ comment: Accepted at 20th IEEE Workshop on Perception Beyond the Visible + Spectrum, CVPR 2024 +
+
+
+
+
+ + ☆ Align Your Steps: Optimizing Sampling Schedules in Diffusion Models + + +
+ Diffusion models (DMs) have established themselves as the state-of-the-art
+generative modeling approach in the visual domain and beyond. A crucial
+drawback of DMs is their slow sampling speed, relying on many sequential
+function evaluations through large neural networks. Sampling from DMs can be
+seen as solving a differential equation through a discretized set of noise
+levels known as the sampling schedule. While past works primarily focused on
+deriving efficient solvers, little attention has been given to finding optimal
+sampling schedules, and the entire literature relies on hand-crafted
+heuristics. In this work, for the first time, we propose a general and
+principled approach to optimizing the sampling schedules of DMs for
+high-quality outputs, called Align Your Steps. We leverage methods from
+stochastic calculus and find optimal schedules specific to different solvers,
+trained DMs and datasets. We evaluate our novel approach on several image,
+video as well as 2D toy data synthesis benchmarks, using a variety of
+different samplers, and observe that our optimized schedules outperform
+previous hand-crafted schedules in almost all experiments. Our method
+demonstrates the untapped potential of sampling schedule optimization,
+especially in the few-step synthesis regime.
+
+
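+ For context, the hand-crafted heuristics mentioned above are schedules such
+as the widely used Karras-style noise levels sketched below; this only
+illustrates what a sampling schedule is, while the optimized schedules
+themselves come from the paper's stochastic-calculus procedure.
+
+import numpy as np
+
+def karras_schedule(n_steps, sigma_min=0.002, sigma_max=80.0, rho=7.0):
+    """Widely used hand-crafted noise schedule (Karras et al., 2022)."""
+    i = np.linspace(0, 1, n_steps)
+    inv_rho = 1.0 / rho
+    sigmas = (sigma_max ** inv_rho + i * (sigma_min ** inv_rho - sigma_max ** inv_rho)) ** rho
+    return np.append(sigmas, 0.0)  # final step denoises to sigma = 0
+
+print(karras_schedule(10))
+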
+
+ comment: Project page: + https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/ +
+
+
+
+
+ + ☆ Narrative Action Evaluation with Prompt-Guided Multimodal Interaction CVPR 2024 + + +
+ In this paper, we investigate a new problem called narrative action
+evaluation (NAE). NAE aims to generate professional commentary that evaluates
+the execution of an action. Unlike traditional tasks such as score-based
+action quality assessment and video captioning involving superficial
+sentences, NAE focuses on creating detailed narratives in natural language.
+These narratives provide intricate descriptions of actions along with
+objective evaluations. NAE is a more challenging task because it requires both
+narrative flexibility and evaluation rigor. One existing possible solution is
+to use multi-task learning, where narrative language and evaluative
+information are predicted separately. However, this approach results in
+reduced performance for individual tasks because of variations between tasks
+and differences in modality between language information and evaluation
+information. To address this, we propose a prompt-guided multimodal
+interaction framework. This framework utilizes a pair of transformers to
+facilitate the interaction between different modalities of information. It
+also uses prompts to transform the score regression task into a video-text
+matching task, thus enabling task interactivity. To support further research
+in this field, we re-annotate the MTL-AQA and FineGym datasets with
+high-quality and comprehensive action narration. Additionally, we establish
+benchmarks for NAE. Extensive experiment results prove that our method
+outperforms separate learning methods and naive multi-task learning methods.
+Data and code are released at https://github.com/shiyi-zh0408/NAE_CVPR2024.
+
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Global OpenBuildingMap -- Unveiling the Mystery of Global Buildings + + +
+ Understanding how buildings are distributed globally is crucial to revealing +the human footprint on our home planet. This built environment affects local +climate, land surface albedo, resource distribution, and many other key factors +that influence well-being and human health. Despite this, quantitative and +comprehensive data on the distribution and properties of buildings worldwide is +lacking. To this end, by using a big data analytics approach and nearly 800,000 +satellite images, we generated the highest resolution and highest accuracy +building map ever created: the Global OpenBuildingMap (Global OBM). A joint +analysis of building maps and solar potentials indicates that rooftop solar +energy can supply the global energy consumption need at a reasonable cost. +Specifically, if solar panels were placed on the roofs of all buildings, they +could supply 1.1-3.3 times -- depending on the efficiency of the solar device +-- the global energy consumption in 2020, which is the year with the highest +consumption on record. We also identified a clear geospatial correlation +between building areas and key socioeconomic variables, which indicates our +global building map can serve as an important input to modeling global +socioeconomic needs and drivers. + +
+
+
+
+
+ + ☆ Deep Regression Representation Learning with Topology + + +
+ Most works studying representation learning focus only on classification and
+neglect regression. Yet, the learning objectives and therefore the
+representation topologies of the two tasks are fundamentally different:
+classification targets class separation, leading to disconnected
+representations, whereas regression requires ordinality with respect to the
+target, leading to continuous representations. We thus wonder how the
+effectiveness of a regression representation is influenced by its topology,
+with evaluation based on the Information Bottleneck (IB) principle.
+ The IB principle is an important framework that provides principles for
+learning effective representations. We establish two connections between it
+and the topology of regression representations. The first connection reveals
+that a lower intrinsic dimension of the feature space implies a reduced
+complexity of the representation Z. This complexity can be quantified as the
+conditional entropy of Z on the target space Y and serves as an upper bound on
+the generalization error. The second connection suggests that learning a
+feature space that is topologically similar to the target space will better
+align with the IB principle. Based on these two connections, we introduce
+PH-Reg, a regularizer specific to regression that matches the intrinsic
+dimension and topology of the feature space with the target space. Experiments
+on synthetic and real-world regression tasks demonstrate the benefits of
+PH-Reg.
+
+
+
+
+
+
+ + ☆ MambaUIE&SR: Unraveling the Ocean's Secrets with Only 2.8 FLOPs + + +
+ Underwater Image Enhancement (UIE) techniques aim to address the problem of
+underwater image degradation due to light absorption and scattering. In recent
+years, both Convolution Neural Network (CNN)-based and Transformer-based
+methods have been widely explored. In addition, combining CNN and Transformer
+can effectively combine global and local information for enhancement. However,
+this approach is still affected by the quadratic complexity of the Transformer
+and cannot maximize the performance. Recently, the state-space model (SSM)
+based architecture Mamba has been proposed, which excels in modeling long
+distances while maintaining linear complexity. This paper explores the
+potential of this SSM-based model for UIE from both efficiency and
+effectiveness perspectives. However, the performance of directly applying
+Mamba is poor because local fine-grained features, which are crucial for image
+enhancement, cannot be fully utilized. To this end, we customize the MambaUIE
+architecture for efficient UIE. Specifically, we introduce visual state space
+(VSS) blocks to capture global contextual information at the macro level while
+mining local information at the micro level. Also, for these two kinds of
+information, we propose a Dynamic Interaction Block (DIB) and Spatial
+feed-forward Network (SGFN) for intra-block feature aggregation. MambaUIE is
+able to efficiently synthesize global and local information and maintains a
+very small number of parameters with high accuracy. Experiments on UIEB
+datasets show that our method reduces GFLOPs by 67.4% (2.715G) relative to the
+SOTA method. To the best of our knowledge, this is the first UIE model
+constructed based on SSM that breaks the limitation of FLOPs on accuracy in
+UIE. The official repository of MambaUIE is at
+https://github.com/1024AILab/MambaUIE.
+
+
+
+
+
+
+ + ☆ Regional Style and Color Transfer + + +
+ This paper presents a novel contribution to the field of regional style +transfer. Existing methods often suffer from the drawback of applying style +homogeneously across the entire image, leading to stylistic inconsistencies or +distorted foreground objects when applied to images with foreground elements such +as human figures. To address this limitation, we propose a new approach that +leverages a segmentation network to precisely isolate foreground objects within +the input image. Subsequently, style transfer is applied exclusively to the +background region. The isolated foreground objects are then carefully +reintegrated into the style-transferred background. To enhance the visual +coherence between foreground and background, a color transfer step is employed +on the foreground elements prior to their reincorporation. Finally, we utilize +feathering techniques to achieve a seamless amalgamation of foreground and +background, resulting in a visually unified and aesthetically pleasing final +composition. Extensive evaluations demonstrate that our proposed approach +yields significantly more natural stylistic transformations compared to +conventional methods. + +
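+
+ A minimal sketch of this kind of pipeline, assuming hypothetical segment_person and stylize
+ callables in place of the paper's segmentation and style-transfer networks; only the generic
+ Reinhard-style color transfer and the feathered compositing are implemented concretely, and
+ they are illustrative rather than the paper's exact formulation.
+
+ ```python
+ import cv2
+ import numpy as np
+
+ def reinhard_color_transfer(src_bgr, ref_bgr):
+     """Match per-channel Lab mean/std of src to ref (generic color transfer)."""
+     src = cv2.cvtColor(src_bgr, cv2.COLOR_BGR2LAB).astype(np.float32)
+     ref = cv2.cvtColor(ref_bgr, cv2.COLOR_BGR2LAB).astype(np.float32)
+     for c in range(3):
+         s_mu, s_std = src[..., c].mean(), src[..., c].std() + 1e-6
+         r_mu, r_std = ref[..., c].mean(), ref[..., c].std() + 1e-6
+         src[..., c] = (src[..., c] - s_mu) * (r_std / s_std) + r_mu
+     return cv2.cvtColor(np.clip(src, 0, 255).astype(np.uint8), cv2.COLOR_LAB2BGR)
+
+ def feathered_composite(fg_bgr, bg_bgr, mask, ksize=21):
+     """Alpha-blend foreground onto background with a blurred (feathered) mask in [0, 1]."""
+     alpha = np.clip(cv2.GaussianBlur(mask.astype(np.float32), (ksize, ksize), 0), 0, 1)[..., None]
+     out = alpha * fg_bgr.astype(np.float32) + (1.0 - alpha) * bg_bgr.astype(np.float32)
+     return out.astype(np.uint8)
+
+ def regional_style_transfer(image_bgr, segment_person, stylize):
+     """segment_person and stylize are assumed, user-supplied callables (not from the paper)."""
+     mask = segment_person(image_bgr)     # HxW float foreground mask, values in [0, 1]
+     stylized = stylize(image_bgr)        # stylized frame; the foreground is pasted back below
+     recolored_fg = reinhard_color_transfer(image_bgr, stylized)
+     return feathered_composite(recolored_fg, stylized, mask)
+ ```
+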
+
+
+
+
+ + ☆ VALOR-EVAL: Holistic Coverage and Faithfulness Evaluation of Large + Vision-Language Models + + +
+ Large Vision-Language Models (LVLMs) suffer from hallucination issues, +wherein the models generate plausible-sounding but factually incorrect outputs, +undermining their reliability. A comprehensive quantitative evaluation is +necessary to identify and understand the extent of hallucinations in these +models. However, existing benchmarks are often limited in scope, focusing +mainly on object hallucinations. Furthermore, current evaluation methods +struggle to effectively address the subtle semantic distinctions between model +outputs and reference data, as well as the balance between hallucination and +informativeness. To address these issues, we introduce a multi-dimensional +benchmark covering objects, attributes, and relations, with challenging images +selected based on associative biases. Moreover, we propose a large language +model (LLM)-based two-stage evaluation framework that generalizes the popular +CHAIR metric and incorporates both faithfulness and coverage into the +evaluation. Experiments on 10 established LVLMs demonstrate that our evaluation +metric is more comprehensive and better correlated with human judgments than existing +work when evaluated on our challenging, human-annotated benchmark dataset. Our +work also highlights the critical balance between faithfulness and coverage of +model outputs, and encourages future work to address hallucinations in LVLMs +while keeping their outputs informative. + +
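+
+ As background, the CHAIR metric that this framework generalizes is commonly defined at the
+ instance and sentence level as shown below; these are the standard definitions from the image
+ captioning literature, not the paper's new metric:
+ \[ \mathrm{CHAIR}_i = \frac{|\{\text{hallucinated objects}\}|}{|\{\text{objects mentioned}\}|},
+ \qquad \mathrm{CHAIR}_s = \frac{|\{\text{sentences with a hallucinated object}\}|}{|\{\text{sentences}\}|}. \]
+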
+
+ comment: Work in process +
+
+
+
+
+ + ☆ Texture-aware and Shape-guided Transformer for Sequential DeepFake + Detection + + +
+ Sequential DeepFake detection is an emerging task that aims to predict the +manipulation sequence in order. Existing methods typically formulate it as an +image-to-sequence problem, employing conventional Transformer architectures for +detection. However, these methods lack a dedicated design and consequently result +in limited performance. In this paper, we propose a novel Texture-aware and +Shape-guided Transformer to enhance detection performance. Our method features +four major improvements. Firstly, we describe a texture-aware branch that +effectively captures subtle manipulation traces with the Diversiform Pixel +Difference Attention module. Then we introduce a Bidirectional Interaction +Cross-attention module that seeks deep correlations between spatial and +sequential features, enabling effective modeling of complex manipulation +traces. To further enhance the cross-attention, we describe a Shape-guided +Gaussian mapping strategy, providing initial priors of the manipulation shape. +Finally, observing that later manipulations in a sequence may influence +traces left by earlier ones, we intriguingly invert the prediction order +from forward to backward, leading to notable gains as expected. Extensive +experimental results demonstrate that our method outperforms others by a large +margin, highlighting its superiority. + +
+
+
+
+
+ + ☆ FreqBlender: Enhancing DeepFake Detection by Blending Frequency + Knowledge + + +
+ Generating synthetic fake faces, known as pseudo-fake faces, is an effective +way to improve the generalization of DeepFake detection. Existing methods +typically generate these faces by blending real or fake faces in color space. +While these methods have shown promise, they overlook the simulation of +frequency distribution in pseudo-fake faces, limiting the learning of generic +forgery traces in-depth. To address this, this paper introduces {\em +FreqBlender}, a new method that can generate pseudo-fake faces by blending +frequency knowledge. Specifically, we investigate the major frequency +components and propose a Frequency Parsing Network to adaptively partition +frequency components related to forgery traces. Then we blend this frequency +knowledge from fake faces into real faces to generate pseudo-fake faces. Since +there is no ground truth for frequency components, we describe a dedicated +training strategy by leveraging the inner correlations among different +frequency knowledge to instruct the learning process. Experimental results +demonstrate the effectiveness of our method in enhancing DeepFake detection, +making it a potential plug-and-play strategy for other methods. + +
+
+
+
+
+ + ☆ TeamTrack: A Dataset for Multi-Sport Multi-Object Tracking in Full-pitch + Videos + + +
+ Multi-object tracking (MOT) is a critical and challenging task in computer +vision, particularly in situations involving objects with similar appearances +but diverse movements, as seen in team sports. Current methods, largely reliant +on object detection and appearance, often fail to track targets in such complex +scenarios accurately. This limitation is further exacerbated by the lack of +comprehensive and diverse datasets covering the full view of sports pitches. +Addressing these issues, we introduce TeamTrack, a pioneering benchmark dataset +specifically designed for MOT in sports. TeamTrack is an extensive collection +of full-pitch video data from various sports, including soccer, basketball, and +handball. Furthermore, we perform a comprehensive analysis and benchmarking +effort to underscore TeamTrack's utility and potential impact. Our work +signifies a crucial step forward, promising to elevate the precision and +effectiveness of MOT in complex, dynamic settings such as team sports. The +dataset, project code, and competition are released at: +https://atomscott.github.io/TeamTrack/. + +
+
+
+
+
+ + ☆ Plug-and-Play Algorithm Convergence Analysis From The Standpoint of + Stochastic Differential Equation + + +
+ The Plug-and-Play (PnP) algorithm is popular for solving inverse imaging +problems. However, this algorithm lacks theoretical analysis of its +convergence with more advanced plug-in denoisers. We demonstrate that the discrete +PnP iteration can be described by a continuous stochastic differential equation +(SDE). This transformation can also be obtained through a Markov process +formulation of PnP. This allows us to view PnP algorithms from the higher standpoint +of stochastic differential equations and to give a unified framework for the +convergence of PnP based on the solvability condition of the +corresponding SDE. We reveal that a much weaker condition, namely a bounded denoiser +with a Lipschitz continuous measurement function, is enough to guarantee +convergence, in place of the previously required Lipschitz continuous denoiser +condition. + +
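+
+ For context, the discrete iteration such analyses concern is, in its simplest
+ proximal-gradient form, x_{k+1} = D(x_k - eta * grad f(x_k)) with a plug-in denoiser D. A toy
+ sketch under assumed stand-ins (a Gaussian blur for both the forward operator and the
+ denoiser; neither comes from the paper):
+
+ ```python
+ import numpy as np
+ from scipy.ndimage import gaussian_filter
+
+ def pnp_pgd(y, A, At, denoise, step=1.0, iters=50):
+     """Generic Plug-and-Play proximal gradient: x <- D(x - step * A^T (A x - y))."""
+     x = At(y)
+     for _ in range(iters):
+         grad = At(A(x) - y)           # gradient of the data term 0.5 * ||A x - y||^2
+         x = denoise(x - step * grad)  # the plug-in denoiser replaces the proximal step
+     return x
+
+ # Toy deblurring example; the Gaussian blur is treated as self-adjoint for simplicity.
+ A = lambda x: gaussian_filter(x, sigma=2.0)
+ At = A
+ denoise = lambda x: gaussian_filter(x, sigma=0.5)   # mild smoother standing in for a denoiser
+
+ clean = np.zeros((64, 64)); clean[24:40, 24:40] = 1.0
+ y = A(clean) + 0.01 * np.random.randn(64, 64)
+ x_hat = pnp_pgd(y, A, At, denoise)
+ ```
+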
+
+ comment: 17 pages, Preprint, Under review +
+
+
+
+
+ + ☆ PM-VIS: High-Performance Box-Supervised Video Instance Segmentation + + +
+ Labeling pixel-wise object masks in videos is a resource-intensive and +laborious process. Box-supervised Video Instance Segmentation (VIS) methods +have emerged as a viable solution to mitigate the labor-intensive annotation +process. In practical applications, the two-step approach is not only more +flexible but also exhibits a higher recognition accuracy. Inspired by the +recent success of the Segment Anything Model (SAM), we introduce a novel approach +that aims at harnessing instance box annotations from multiple perspectives to +generate high-quality instance pseudo masks, thus enriching the information +contained in instance annotations. We leverage ground-truth boxes to create +three types of pseudo masks using the HQ-SAM model, the box-supervised VIS +model (IDOL-BoxInst), and the VOS model (DeAOT) separately, along with three +corresponding optimization mechanisms. Additionally, we introduce two +ground-truth data filtering methods, assisted by high-quality pseudo masks, to +further enhance the training dataset quality and improve the performance of +fully supervised VIS methods. To fully capitalize on the obtained high-quality +pseudo masks, we introduce a novel algorithm, PM-VIS, to integrate mask losses +into IDOL-BoxInst. Our PM-VIS model, trained with high-quality pseudo mask +annotations, demonstrates a strong ability in instance mask prediction, achieving +state-of-the-art performance on the YouTube-VIS 2019, YouTube-VIS 2021, and +OVIS validation sets, notably narrowing the gap between box-supervised and +fully supervised VIS methods. + +
+
+
+
+
+ + ☆ PGAHum: Prior-Guided Geometry and Appearance Learning for High-Fidelity + Animatable Human Reconstruction + + +
+ Recent techniques on implicit geometry representation learning and neural +rendering have shown promising results for 3D clothed human reconstruction from +sparse video inputs. However, it is still challenging to reconstruct detailed +surface geometry and even more difficult to synthesize photorealistic novel +views with animated human poses. In this work, we introduce PGAHum, a +prior-guided geometry and appearance learning framework for high-fidelity +animatable human reconstruction. We thoroughly exploit 3D human priors in three +key modules of PGAHum to achieve high-quality geometry reconstruction with +intricate details and photorealistic view synthesis on unseen poses. First, a +prior-based implicit geometry representation of 3D human, which contains a +delta SDF predicted by a tri-plane network and a base SDF derived from the +prior SMPL model, is proposed to model the surface details and the body shape +in a disentangled manner. Second, we introduce a novel prior-guided sampling +strategy that fully leverages the prior information of the human pose and body +to sample the query points within or near the body surface. By avoiding +unnecessary learning in the empty 3D space, the neural rendering can recover +more appearance details. Last, we propose a novel iterative backward +deformation strategy to progressively find the correspondence for the query +point in observation space. A skinning weights prediction model is learned +based on the prior provided by the SMPL model to achieve the iterative backward +LBS deformation. Extensive quantitative and qualitative comparisons on various +datasets are conducted and the results demonstrate the superiority of our +framework. Ablation studies also verify the effectiveness of each scheme for +geometry and appearance learning. + +
+
+
+
+
+ + ☆ Unveiling and Mitigating Generalized Biases of DNNs through the + Intrinsic Dimensions of Perceptual Manifolds + + +
+ Building fair deep neural networks (DNNs) is a crucial step towards achieving +trustworthy artificial intelligence. Delving into deeper factors that affect +the fairness of DNNs is paramount and serves as the foundation for mitigating +model biases. However, current methods are limited in accurately predicting DNN +biases, relying solely on the number of training samples and lacking more +precise measurement tools. Here, we establish a geometric perspective for +analyzing the fairness of DNNs, comprehensively exploring how DNNs internally +shape the intrinsic geometric characteristics of datasets, namely the intrinsic +dimensions (IDs) of perceptual manifolds, and the impact of IDs on the fairness +of DNNs. Based on multiple findings, we propose Intrinsic Dimension +Regularization (IDR), which enhances the fairness and performance of models by +promoting the learning of concise and ID-balanced class perceptual manifolds. +In various image recognition benchmark tests, IDR significantly mitigates model +bias while improving its performance. + +
+
+ comment: 8 pages, 6 figures, Submitted to TPAMI +
+
+
+
+
+ + ☆ Self-Supervised Monocular Depth Estimation in the Dark: Towards Data + Distribution Compensation IJCAI2024 + + +
+ Nighttime self-supervised monocular depth estimation has received increasing +attention in recent years. However, using night images for self-supervision is +unreliable because the photometric consistency assumption is usually violated +in videos taken under complex lighting conditions. Even with domain +adaptation or photometric loss repair, performance is still limited by the poor +supervision of night images on trainable networks. In this paper, we propose a +self-supervised nighttime monocular depth estimation method that does not use +any night images during training. Our framework utilizes day images as a stable +source for self-supervision and applies physical priors (e.g., wave optics, +reflection model and read-shot noise model) to compensate for some key +day-night differences. With day-to-night data distribution compensation, our +framework can be trained in an efficient one-stage self-supervised manner. +Though no nighttime images are considered during training, qualitative and +quantitative results demonstrate that our method achieves SoTA depth estimation +results on the challenging nuScenes-Night and RobotCar-Night benchmarks compared with +existing methods. + +
+
+ comment: Accepted by IJCAI2024 +
+
+
+
+
+ + ☆ DSDRNet: Disentangling Representation and Reconstruct Network for Domain + Generalization IJCNN 2024 + + +
+ Domain generalization faces challenges due to the distribution shift between +training and testing sets, and the presence of unseen target domains. Common +solutions include domain alignment, meta-learning, data augmentation, or +ensemble learning, all of which rely on domain labels or domain adversarial +techniques. In this paper, we propose a Dual-Stream Separation and +Reconstruction Network, dubbed DSDRNet. It is a disentanglement-reconstruction +approach that integrates features of both inter-instance and intra-instance +through dual-stream fusion. The method introduces novel supervised signals by +combining inter-instance semantic distance and intra-instance similarity. +Incorporating Adaptive Instance Normalization (AdaIN) into a two-stage cyclic +reconstruction process enhances self-disentangled reconstruction signals to +facilitate model convergence. Extensive experiments on four benchmark datasets +demonstrate that DSDRNet outperforms other popular methods in terms of domain +generalization capabilities. + +
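+
+ Since the method builds on Adaptive Instance Normalization, the standard AdaIN operation is
+ recalled below in PyTorch for reference; how DSDRNet wires it into the two-stage cyclic
+ reconstruction is specific to the paper and not reproduced here.
+
+ ```python
+ import torch
+
+ def adain(content: torch.Tensor, style: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
+     """Re-normalize (N, C, H, W) content features with the style features' channel statistics."""
+     c_mean = content.mean(dim=(2, 3), keepdim=True)
+     c_std = content.std(dim=(2, 3), keepdim=True) + eps
+     s_mean = style.mean(dim=(2, 3), keepdim=True)
+     s_std = style.std(dim=(2, 3), keepdim=True) + eps
+     return s_std * (content - c_mean) / c_std + s_mean
+ ```
+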
+
+ comment: This paper is accepted to IJCNN 2024 +
+
+
+
+
+ + ☆ EventLens: Leveraging Event-Aware Pretraining and Cross-modal Linking + Enhances Visual Commonsense Reasoning + + +
+ Visual Commonsense Reasoning (VCR) is a cognitive task, challenging models to +answer visual questions requiring human commonsense, and to provide rationales +explaining why the answers are correct. With the emergence of Large Language Models +(LLMs), it is natural and imperative to explore their applicability to VCR. +However, the VCR task demands more external knowledge to tackle its challenging +questions, necessitating special designs to activate LLMs' commonsense +reasoning abilities. Also, most existing Multimodal LLMs adopt an abstraction +of the entire input image, which makes it difficult to comprehend VCR's unique +co-reference tags between image regions and text, posing challenges for +fine-grained alignment. To address these issues, we propose EventLens that +leverages Event-Aware Pretraining and Cross-modal Linking and EnhanceS VCR. +First, by emulating the cognitive process of human reasoning, an Event-Aware +Pretraining auxiliary task is introduced to better activate the LLM's global +comprehension of intricate scenarios. Second, during fine-tuning, we further +utilize reference tags to bridge RoI features with texts, while preserving both +modality semantics. Finally, we use instruct-style prompts to narrow the gap +between pretraining and fine-tuning, and task-specific adapters to better +integrate the LLM's inherent knowledge with new commonsense. Experimental results +show the effectiveness of our proposed auxiliary task and fine-grained linking +strategy. + +
+
+
+
+
+ + ☆ On Support Relations Inference and Scene Hierarchy Graph Construction + from Point Cloud in Clustered Environments + + +
+ Over the years, scene understanding has attracted growing interest in +computer vision, providing the semantic and physical scene information +necessary for robots to complete particular tasks autonomously. In 3D +scenes, rich spatial geometric and topological information is often ignored by +RGB-based approaches for scene understanding. In this study, we develop a +bottom-up approach for scene understanding that infers support relations +between objects from a point cloud. Our approach utilizes the spatial topology +information of the plane pairs in the scene, consisting of three major steps. +1) Detection of pairwise spatial configuration: dividing primitive pairs into +local support connection and local inner connection; 2) primitive +classification: a combinatorial optimization method applied to classify +primitives; and 3) support relations inference and hierarchy graph +construction: bottom-up support relations inference and scene hierarchy graph +construction containing primitive level and object level. Through experiments, +we demonstrate that the algorithm achieves excellent performance in primitive +classification and support relations inference. Additionally, we show that the +scene hierarchy graph contains rich geometric and topological information of +objects, and it possesses great scalability for scene understanding. + +
+
+
+
+
+ + ☆ C2F-SemiCD: A Coarse-to-Fine Semi-Supervised Change Detection Method + Based on Consistency Regularization in High-Resolution Remote Sensing Images + + +
+ A high-precision feature extraction model is crucial for change detection +(CD). In the past, many deep learning-based supervised CD methods learned to +recognize change feature patterns from a large number of labelled bi-temporal +images, whereas labelling bi-temporal remote sensing images is very expensive +and often time-consuming; therefore, we propose a coarse-to-fine +semi-supervised CD method based on consistency regularization (C2F-SemiCD), +which includes a coarse-to-fine CD network with a multiscale attention +mechanism (C2FNet) and a semi-supervised update method. Among them, the C2FNet +network gradually completes the extraction of change features from +coarse-grained to fine-grained through multiscale feature fusion, channel +attention mechanism, spatial attention mechanism, global context module, +feature refinement module, initial aggregation module, and final aggregation +module. The semi-supervised update method uses the mean teacher method. The +parameters of the teacher model are updated as an exponential moving average +(EMA) of the student model's parameters. Through extensive +experiments on three datasets and meticulous ablation studies, including +crossover experiments across datasets, we verify the significant effectiveness +and efficiency of the proposed C2F-SemiCD method. The code will be released at: +https://github.com/ChengxiHAN/C2F-SemiCDand-C2FNet. + +
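+
+ The mean-teacher update used here is the standard exponential moving average of student
+ weights into the teacher; a minimal PyTorch sketch, with an illustrative decay value that is
+ not taken from the paper:
+
+ ```python
+ import torch
+
+ @torch.no_grad()
+ def ema_update(teacher: torch.nn.Module, student: torch.nn.Module, decay: float = 0.99):
+     """theta_teacher <- decay * theta_teacher + (1 - decay) * theta_student."""
+     for t_param, s_param in zip(teacher.parameters(), student.parameters()):
+         t_param.mul_(decay).add_(s_param, alpha=1.0 - decay)
+ ```
+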
+
+
+
+
+ + ☆ A Comprehensive Survey and Taxonomy on Point Cloud Registration Based on + Deep Learning IJCAI 2024 + + +
+ Point cloud registration (PCR) involves determining a rigid transformation +that aligns one point cloud to another. Despite the plethora of outstanding +deep learning (DL)-based registration methods proposed, comprehensive and +systematic studies on DL-based PCR techniques are still lacking. In this paper, +we present a comprehensive survey and taxonomy of recently proposed PCR +methods. Firstly, we conduct a taxonomy of commonly utilized datasets and +evaluation metrics. Secondly, we classify the existing research into two main +categories: supervised and unsupervised registration, providing insights into +the core concepts of various influential PCR models. Finally, we highlight open +challenges and potential directions for future research. A curated collection +of valuable resources is made available at https://github.com/yxzhang15/PCR. + +
+
+ comment: This paper is accepted by IJCAI 2024 +
+
+
+
+
+ + ☆ Swap It Like Its Hot: Segmentation-based spoof attacks on eye-tracking + images + + +
+ Video-based eye trackers capture the iris biometric and enable authentication +to secure user identity. However, biometric authentication is susceptible to +spoofing another user's identity through physical or digital manipulation. The +current standard to identify physical spoofing attacks on eye-tracking sensors +uses liveness detection. Liveness detection classifies gaze data as real or +fake, which is sufficient to detect physical presentation attacks. However, +such defenses cannot detect a spoofing attack when real eye image inputs are +digitally manipulated to swap the iris pattern of another person. We propose +IrisSwap as a novel attack on gaze-based liveness detection. IrisSwap allows +attackers to segment and digitally swap in a victim's iris pattern to fool iris +authentication. Both offline and online attacks produce gaze data that deceives +the current state-of-the-art defense models at rates up to 58% and motivates +the need to develop more advanced authentication methods for eye trackers. + +
+
+
+
+
+ + ☆ HOIST-Former: Hand-held Objects Identification, Segmentation, and + Tracking in the Wild + + +
+ We address the challenging task of identifying, segmenting, and tracking +hand-held objects, which is crucial for applications such as human action +segmentation and performance evaluation. This task is particularly challenging +due to heavy occlusion, rapid motion, and the transitory nature of objects +being hand-held, where an object may be held, released, and subsequently picked +up again. To tackle these challenges, we have developed a novel +transformer-based architecture called HOIST-Former. HOIST-Former is adept at +spatially and temporally segmenting hands and objects by iteratively pooling +features from each other, ensuring that the processes of identification, +segmentation, and tracking of hand-held objects depend on the hands' positions +and their contextual appearance. We further refine HOIST-Former with a contact +loss that focuses on areas where hands are in contact with objects. Moreover, +we also contribute an in-the-wild video dataset called HOIST, which comprises +4,125 videos complete with bounding boxes, segmentation masks, and tracking IDs +for hand-held objects. Through experiments on the HOIST dataset and two +additional public datasets, we demonstrate the efficacy of HOIST-Former in +segmenting and tracking hand-held objects. + +
+
+
+
+
+ + ☆ Neural Radiance Field in Autonomous Driving: A Survey + + +
+ Neural Radiance Field (NeRF) has garnered significant attention from both +academia and industry due to its intrinsic advantages, particularly its +implicit representation and novel view synthesis capabilities. With the rapid +advancements in deep learning, a multitude of methods have emerged to explore +the potential applications of NeRF in the domain of Autonomous Driving (AD). +However, a conspicuous void is apparent within the current literature. To +bridge this gap, this paper conducts a comprehensive survey of NeRF's +applications in the context of AD. Our survey is structured to categorize +NeRF's applications in Autonomous Driving (AD), specifically encompassing +perception, 3D reconstruction, simultaneous localization and mapping (SLAM), +and simulation. We delve into in-depth analysis and summarize the findings for +each application category, and conclude by providing insights and discussions +on future directions in this field. We hope this paper serves as a +comprehensive reference for researchers in this domain. To the best of our +knowledge, this is the first survey specifically focused on the applications of +NeRF in the Autonomous Driving domain. + +
+
+
+
+
+ + ☆ FaceFolds: Meshed Radiance Manifolds for Efficient Volumetric Rendering + of Dynamic Faces + + +
+ 3D rendering of dynamic face captures is a challenging problem, and it +demands improvements on several fronts: photorealism, +efficiency, compatibility, and configurability. We present a novel +representation that enables high-quality volumetric rendering of an actor's +dynamic facial performances with a minimal compute and memory footprint. It runs +natively on commodity graphics software and hardware, and allows for a graceful +trade-off between quality and efficiency. Our method utilizes recent advances +in neural rendering, particularly learning discrete radiance manifolds to +sparsely sample the scene to model volumetric effects. We achieve efficient +modeling by learning a single set of manifolds for the entire dynamic sequence, +while implicitly modeling appearance changes as a temporal canonical texture. We +export a single layered mesh and view-independent RGBA texture video that is +compatible with legacy graphics renderers without additional ML integration. We +demonstrate our method by rendering dynamic face captures of real actors in a +game engine, at comparable photorealism to state-of-the-art neural rendering +techniques at previously unseen frame rates. + +
+
+ comment: In Proceedings of the ACM in Computer Graphics and Interactive + Techniques, 2024 +
+
+
+
+
+ + ♻ ☆ Dynamic Gaussians Mesh: Consistent Mesh Reconstruction from Monocular + Videos + + +
+ Modern 3D engines and graphics pipelines require meshes as a memory-efficient +representation, which allows efficient rendering, geometry processing, texture +editing, and many other downstream operations. However, it is still highly +difficult to obtain high-quality meshes in terms of structure and detail from +monocular visual observations. The problem becomes even more challenging for +dynamic scenes and objects. To this end, we introduce Dynamic Gaussians Mesh +(DG-Mesh), a framework to reconstruct a high-fidelity and time-consistent mesh +given a single monocular video. Our work leverages the recent advancement in 3D +Gaussian Splatting to construct the mesh sequence with temporal consistency +from a video. Building on top of this representation, DG-Mesh recovers +high-quality meshes from the Gaussian points and can track the mesh vertices +over time, which enables applications such as texture editing on dynamic +objects. We introduce Gaussian-Mesh Anchoring, which encourages evenly +distributed Gaussians, resulting in better mesh reconstruction through mesh-guided +densification and pruning on the deformed Gaussians. By applying +cycle-consistent deformation between the canonical and the deformed space, we +can project the anchored Gaussians back to the canonical space and optimize +Gaussians across all time frames. During the evaluation on different datasets, +DG-Mesh provides significantly better mesh reconstruction and rendering than +baselines. Project page: https://www.liuisabella.com/DG-Mesh/ + +
+
+ comment: Project page: https://www.liuisabella.com/DG-Mesh/ +
+
+
+
+
+ + ♻ ☆ Trends, Applications, and Challenges in Human Attention Modelling IJCAI 2024 + + +
+ Human attention modelling has proven, in recent years, to be particularly +useful not only for understanding the cognitive processes underlying visual +exploration, but also for providing support to artificial intelligence models +that aim to solve problems in various domains, including image and video +processing, vision-and-language applications, and language modelling. This +survey offers a reasoned overview of recent efforts to integrate human +attention mechanisms into contemporary deep learning models and discusses +future research directions and challenges. For a comprehensive overview on the +ongoing research refer to our dedicated repository available at +https://github.com/aimagelab/awesome-human-visual-attention. + +
+
+ comment: Accepted at IJCAI 2024 Survey Track +
+
+
+
+
+ + ♻ ☆ GeoAI Reproducibility and Replicability: a computational and spatial + perspective + + +
+ GeoAI has emerged as an exciting interdisciplinary research area that +combines spatial theories and data with cutting-edge AI models to address +geospatial problems in a novel, data-driven manner. While GeoAI research has +flourished in the GIScience literature, its reproducibility and replicability +(R&R), fundamental principles that determine the reusability, reliability, and +scientific rigor of research findings, have rarely been discussed. This paper +aims to provide an in-depth analysis of this topic from both computational and +spatial perspectives. We first categorize the major goals for reproducing GeoAI +research, namely, validation (repeatability), learning and adapting the method +for solving a similar or new problem (reproducibility), and examining the +generalizability of the research findings (replicability). Each of these goals +requires different levels of understanding of GeoAI, as well as different +methods to ensure its success. We then discuss the factors that may cause the +lack of R&R in GeoAI research, with an emphasis on (1) the selection and use of +training data; (2) the uncertainty that resides in the GeoAI model design, +training, deployment, and inference processes; and more importantly (3) the +inherent spatial heterogeneity of geospatial data and processes. We use a deep +learning-based image analysis task as an example to demonstrate the results' +uncertainty and spatial variance caused by different factors. The findings +reiterate the importance of knowledge sharing, as well as the generation of a +"replicability map" that incorporates spatial autocorrelation and spatial +heterogeneity into consideration in quantifying the spatial replicability of +GeoAI research. + +
+
+ comment: Accepted by Annals of the American Association of Geographers +
+
+
+
+
+ + ♻ ☆ Does Gaussian Splatting need SFM Initialization? + + +
+ 3D Gaussian Splatting has recently been embraced as a versatile and effective +method for scene reconstruction and novel view synthesis, owing to its +high-quality results and compatibility with hardware rasterization. Despite its +advantages, Gaussian Splatting's reliance on high-quality point cloud +initialization by Structure-from-Motion (SFM) algorithms is a significant +limitation to be overcome. To this end, we investigate various initialization +strategies for Gaussian Splatting and delve into how volumetric reconstructions +from Neural Radiance Fields (NeRF) can be utilized to bypass the dependency on +SFM data. Our findings demonstrate that random initialization can perform much +better if carefully designed and that by employing a combination of improved +initialization strategies and structure distillation from low-cost NeRF models, +it is possible to achieve equivalent results, or at times even superior, to +those obtained from SFM initialization. + +
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ CoGS: Controllable Gaussian Splatting CVPR 2024 + + +
+ Capturing and re-animating the 3D structure of articulated objects present +significant barriers. On one hand, methods requiring extensively calibrated +multi-view setups are prohibitively complex and resource-intensive, limiting +their practical applicability. On the other hand, while single-camera Neural +Radiance Fields (NeRFs) offer a more streamlined approach, they have excessive +training and rendering costs. 3D Gaussian Splatting would be a suitable +alternative but for two limitations: firstly, existing methods for 3D dynamic +Gaussians require synchronized multi-view cameras, and secondly, they lack +controllability in dynamic scenarios. We present CoGS, a method for +Controllable Gaussian Splatting that enables the direct manipulation of scene +elements, offering real-time control of dynamic scenes without the prerequisite +of pre-computing control signals. We evaluated CoGS using both synthetic and +real-world datasets that include dynamic objects that differ in degree of +difficulty. In our evaluations, CoGS consistently outperformed existing dynamic +and controllable neural representations in terms of visual fidelity. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Learning A Physical-aware Diffusion Model Based on Transformer for + Underwater Image Enhancement + + +
+ Underwater visuals undergo various complex degradations, inevitably +influencing the efficiency of underwater vision tasks. Recently, diffusion +models have been applied to underwater image enhancement (UIE) tasks and have +achieved SOTA performance. However, these methods fail to consider the physical +properties and underwater imaging mechanisms in the diffusion process, limiting +the information completion capacity of diffusion models. In this paper, we +introduce a novel UIE framework, named PA-Diff, designed to exploit the +knowledge of physics to guide the diffusion process. + PA-Diff consists of a Physics Prior Generation (PPG) branch, an Implicit Neural +Reconstruction (INR) branch, and a Physics-aware Diffusion Transformer (PDT) +branch. The PPG branch aims to produce physics prior knowledge. +By utilizing this physics prior knowledge to guide the diffusion process, the PDT +branch obtains underwater-aware ability and models the complex distribution +of real-world underwater scenes. The INR branch learns robust feature +representations from diverse underwater images via implicit neural +representation, which reduces the difficulty of restoration for the PDT branch. +Extensive experiments prove that our method achieves the best performance on UIE +tasks. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Learning of the Total Variation Flow + + +
+ The total variation (TV) flow generates a scale-space representation of an +image based on the TV functional. This gradient flow exhibits desirable +features for images, such as sharp edges, and enables spectral, scale, and +texture analysis. Solving the TV flow is challenging; one reason is the +non-uniqueness of the subgradients. The standard numerical approach for TV flow +requires solving multiple non-smooth optimisation problems. Even with +state-of-the-art convex optimisation techniques, this is often prohibitively +expensive and strongly motivates the use of alternative, faster approaches. +Inspired by and extending the framework of physics-informed neural networks +(PINNs), we propose the TVflowNET, an unsupervised neural network approach, to +approximate the solution of the TV flow given an initial image and a time +instance. The TVflowNET requires no ground truth data but rather makes use of +the PDE for optimisation of the network parameters. We circumvent the +challenges related to the non-uniqueness of the subgradients by additionally +learning the related diffusivity term. Our approach significantly speeds up the +computation time, and we show that the TVflowNET approximates the TV flow +solution with high fidelity for different image sizes and image types. +Additionally, we give a full comparison of different network architecture +designs as well as training regimes to underscore the effectiveness of our +approach. + +
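+
+ For reference, the TV flow being approximated is the formal gradient flow of the total
+ variation functional started from the input image f; the quotient is undefined wherever the
+ gradient vanishes, which is exactly where the subgradient is non-unique and where the learned
+ diffusivity term helps:
+ \[ \partial_t u = \operatorname{div}\!\left( \frac{\nabla u}{|\nabla u|} \right), \qquad u(\cdot, 0) = f. \]
+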
+
+
+
+
+ + ♻ ☆ Versatile Backdoor Attack with Visible, Semantic, Sample-Specific, and + Compatible Triggers + + +
+ Deep neural networks (DNNs) can be manipulated to exhibit specific behaviors +when exposed to specific trigger patterns, without affecting their performance +on benign samples, dubbed \textit{backdoor attack}. Currently, implementing +backdoor attacks in physical scenarios still faces significant challenges. +Physical attacks are labor-intensive and time-consuming, and the triggers are +selected in a manual and heuristic way. Moreover, expanding digital attacks to +physical scenarios faces many challenges due to their sensitivity to visual +distortions and the absence of counterparts in the real world. To address these +challenges, we define a novel trigger called the \textbf{V}isible, +\textbf{S}emantic, \textbf{S}ample-Specific, and \textbf{C}ompatible (VSSC) +trigger, to achieve effectiveness, stealthiness, and robustness simultaneously, which can +also be effectively deployed in the physical scenario using corresponding +objects. To implement the VSSC trigger, we propose an automated pipeline +comprising three modules: a trigger selection module that systematically +identifies suitable triggers leveraging large language models, a trigger +insertion module that employs generative models to seamlessly integrate +triggers into images, and a quality assessment module that ensures the natural +and successful insertion of triggers through vision-language models. Extensive +experimental results and analysis validate the effectiveness, stealthiness, and +robustness of the VSSC trigger. It can not only maintain robustness under +visual distortions but also demonstrate strong practicality in the physical +scenario. We hope that the proposed VSSC trigger and implementation approach +could inspire future studies on designing more practical triggers in backdoor +attacks. + +
+
+
+
+
+ + ♻ ☆ Advancing Graph Neural Networks with HL-HGAT: A Hodge-Laplacian and + Attention Mechanism Approach for Heterogeneous Graph-Structured Data + + +
+ Graph neural networks (GNNs) have proven effective in capturing relationships +among nodes in a graph. This study introduces a novel perspective by +considering a graph as a simplicial complex, encompassing nodes, edges, +triangles, and $k$-simplices, enabling the definition of graph-structured data +on any $k$-simplices. Our contribution is the Hodge-Laplacian heterogeneous +graph attention network (HL-HGAT), designed to learn heterogeneous signal +representations across $k$-simplices. The HL-HGAT incorporates three key +components: HL convolutional filters (HL-filters), simplicial projection (SP), +and simplicial attention pooling (SAP) operators, applied to $k$-simplices. +HL-filters leverage the unique topology of $k$-simplices encoded by the +Hodge-Laplacian (HL) operator, operating within the spectral domain of the +$k$-th HL operator. To address computation challenges, we introduce a +polynomial approximation for HL-filters, exhibiting spatial localization +properties. Additionally, we propose a pooling operator to coarsen +$k$-simplices, combining features through simplicial attention mechanisms of +self-attention and cross-attention via transformers and SP operators, capturing +topological interconnections across multiple dimensions of simplices. The +HL-HGAT is comprehensively evaluated across diverse graph applications, +including NP-hard problems, graph multi-label and classification challenges, +and graph regression tasks in logistics, computer vision, biology, chemistry, +and neuroscience. The results demonstrate the model's efficacy and versatility +in handling a wide range of graph-based scenarios. + +
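+
+ The polynomial approximation of the HL-filters follows the usual spectral-filtering recipe of
+ applying a low-order polynomial of the k-th Hodge-Laplacian directly to signals on
+ k-simplices; the generic form is shown below, while the exact basis and order used by HL-HGAT
+ are specified in the paper:
+ \[ h_\theta(L_k)\, x \;=\; \sum_{j=0}^{J} \theta_j\, L_k^{\,j}\, x. \]
+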
+
+
+
+
+ + ♻ ☆ Neuromorphic Face Analysis: a Survey + + +
+ Neuromorphic sensors, also known as event cameras, are a class of imaging +devices mimicking the function of biological visual systems. Unlike traditional +frame-based cameras, which capture fixed images at discrete intervals, +neuromorphic sensors continuously generate events that represent changes in +light intensity or motion in the visual field with high temporal resolution and +low latency. These properties have proven to be interesting in modeling human +faces, both from an effectiveness and a privacy-preserving point of view. +Neuromorphic face analysis, however, is still a raw and unstructured field of +research, with several attempts at addressing different tasks with no clear +standard or benchmark. This survey paper presents a comprehensive overview of +capabilities, challenges and emerging applications in the domain of +neuromorphic face analysis, to outline promising directions and open issues. +After discussing the fundamental working principles of neuromorphic vision and +presenting an in-depth overview of the related research, we explore the current +state of available data, standard data representations, emerging challenges, +and limitations that require further investigation. This paper aims to +highlight the recent progress in this evolving field to provide both +experienced and new researchers with an all-encompassing analysis of the +state of the art, along with its problems and shortcomings. + +
+
+ comment: Submitted to Pattern Recognition Letters +
+
+
+
+
+ + ♻ ☆ NeLF-Pro: Neural Light Field Probes for Multi-Scale Novel View Synthesis CVPR 2024 + + +
+ We present NeLF-Pro, a novel representation to model and reconstruct light +fields in diverse natural scenes that vary in extent and spatial granularity. +In contrast to previous fast reconstruction methods that represent the 3D scene +globally, we model the light field of a scene as a set of local light field +feature probes, parameterized with position and multi-channel 2D feature maps. +Our central idea is to bake the scene's light field into spatially varying +learnable representations and to query point features by weighted blending of +probes close to the camera - allowing for mipmap representation and rendering. +We introduce a novel vector-matrix-matrix (VMM) factorization technique that +effectively represents the light field feature probes as products of core +factors (i.e., VM) shared among local feature probes, and a basis factor (i.e., +M) - efficiently encoding internal relationships and patterns within the scene. +Experimentally, we demonstrate that NeLF-Pro significantly boosts the +performance of feature grid-based representations, and achieves fast +reconstruction with better rendering quality while maintaining compact +modeling. Project webpage https://sinoyou.github.io/nelf-pro/. + +
+
+ comment: CVPR 2024 Conference Paper, Camera Ready Version +
+
+
+
+
+ + ♻ ☆ Robustness and Visual Explanation for Black Box Image, Video, and ECG + Signal Classification with Reinforcement Learning AAAI + + +
+ We present a generic Reinforcement Learning (RL) framework optimized for +crafting adversarial attacks on different model types spanning ECG signal +analysis (1D), image classification (2D), and video classification (3D). The +framework focuses on identifying sensitive regions and inducing +misclassifications with minimal distortions and various distortion types. The +novel RL method outperforms state-of-the-art methods for all three +applications, proving its efficiency. Our RL approach produces superior +localization masks, enhancing interpretability for image classification and ECG +analysis models. For applications such as ECG analysis, our platform highlights +critical ECG segments for clinicians while ensuring resilience against +prevalent distortions. This comprehensive tool aims to bolster both resilience, +via adversarial training, and transparency across varied applications and data +types. + +
+
+ comment: AAAI Proceedings reference: + https://ojs.aaai.org/index.php/AAAI/article/view/30579 +
+
+
+
+
+ + ♻ ☆ SPINEPS -- Automatic Whole Spine Segmentation of T2-weighted MR images + using a Two-Phase Approach to Multi-class Semantic and Instance Segmentation + + +
+ Purpose. To present SPINEPS, an open-source deep learning approach for +semantic and instance segmentation of 14 spinal structures (ten vertebra +substructures, intervertebral discs, spinal cord, spinal canal, and sacrum) in +whole body T2w MRI. + Methods. During this HIPAA-compliant, retrospective study, we utilized the +public SPIDER dataset (218 subjects, 63% female) and a subset of the German +National Cohort (1423 subjects, mean age 53, 49% female) for training and +evaluation. We combined CT and T2w segmentations to train models that segment +14 spinal structures in T2w sagittal scans both semantically and instance-wise. +Performance evaluation metrics included Dice similarity coefficient, average +symmetrical surface distance, panoptic quality, segmentation quality, and +recognition quality. Statistical significance was assessed using the Wilcoxon +signed-rank test. An in-house dataset was used to qualitatively evaluate +out-of-distribution samples. + Results. On the public dataset, our approach outperformed the baseline +(instance-wise vertebra dice score 0.929 vs. 0.907, p-value<0.001). Training on +auto-generated annotations and evaluating on manually corrected test data from +the GNC yielded global dice scores of 0.900 for vertebrae, 0.960 for +intervertebral discs, and 0.947 for the spinal canal. Incorporating the SPIDER +dataset during training increased these scores to 0.920, 0.967, and 0.958, +respectively. + Conclusions. The proposed segmentation approach offers robust segmentation of +14 spinal structures in T2w sagittal images, including the spinal cord, spinal +canal, intervertebral discs, endplate, sacrum, and vertebrae. The approach +yields both a semantic and an instance mask as output, thus being easy to utilize. +This marks the first publicly available algorithm for whole spine segmentation +in sagittal T2w MR imaging. + +
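+
+ Among the reported metrics, the Dice similarity coefficient for a predicted mask P and a
+ reference mask G is the standard 2|P∩G| / (|P| + |G|); a generic NumPy version for binary
+ masks, not the authors' evaluation code:
+
+ ```python
+ import numpy as np
+
+ def dice(pred: np.ndarray, gt: np.ndarray, eps: float = 1e-8) -> float:
+     """Dice similarity coefficient for binary masks."""
+     pred, gt = pred.astype(bool), gt.astype(bool)
+     return float(2.0 * np.logical_and(pred, gt).sum() / (pred.sum() + gt.sum() + eps))
+ ```
+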
+
+ comment: https://github.com/Hendrik-code/spineps +
+
+
+
+
+ + ♻ ☆ YOLOOC: YOLO-based Open-Class Incremental Object Detection with Novel + Class Discovery + + +
+ Because of its practical relevance, open-world object detection (OWOD) has received +considerable attention recently. The challenge is how a model can detect novel +classes and then incrementally learn them without forgetting previously known +classes. Previous approaches hinge on strongly-supervised or weakly-supervised +novel-class data for novel-class detection, which may not apply to real +applications. We construct a new benchmark in which novel classes are only +encountered at the inference stage. We also propose a new OWOD detector, YOLOOC, +based on the YOLO architecture yet for the Open-Class setup. We introduce label +smoothing to prevent the detector from over-confidently mapping novel classes +to known classes and to discover novel classes. Extensive experiments conducted +on our more realistic setup demonstrate the effectiveness of our method for +discovering novel classes in our new benchmark. + +
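+
+ Label smoothing as referenced here is the standard recipe of mixing the one-hot target with a
+ uniform distribution; a generic sketch follows, where the smoothing factor and the way YOLOOC
+ applies it to detection outputs are the paper's own choices and are not shown:
+
+ ```python
+ import torch
+
+ def smooth_labels(targets: torch.Tensor, num_classes: int, eps: float = 0.1) -> torch.Tensor:
+     """Convert integer class targets into smoothed one-hot distributions."""
+     one_hot = torch.nn.functional.one_hot(targets, num_classes).float()
+     return one_hot * (1.0 - eps) + eps / num_classes
+ ```
+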
+
+ comment: Withdrawn because it was submitted without consent of the first + author. In addition, this submission has some errors +
+
+
+
+
+ + ♻ ☆ Multilevel Geometric Optimization for Regularised Constrained Linear + Inverse Problems + + +
+ We present a geometric multilevel optimization approach that smoothly +incorporates box constraints. Given a box-constrained optimization problem, we +consider a hierarchy of models with varying discretization levels. Finer models +are accurate but expensive to compute, while coarser models are less accurate +but cheaper to compute. When working at the fine level, multilevel optimisation +computes the search direction based on a coarser model, which speeds up updates +at the fine level. Moreover, by exploiting the geometry induced by the hierarchy, the +feasibility of the updates is preserved. In particular, our approach extends +classical components of multigrid methods like restriction and prolongation to +the Riemannian structure of our constraints. + +
+
+ comment: 25 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Dynamic Cross Attention for Audio-Visual Person Verification + + +
+ Although person or identity verification has been predominantly explored +using individual modalities such as face and voice, audio-visual fusion has +recently shown immense potential to outperform unimodal approaches. Audio and +visual modalities are often expected to exhibit strong complementary +relationships, which plays a crucial role in effective audio-visual fusion. +However, they may not always strongly complement each other; they may also +exhibit weak complementary relationships, resulting in poor audio-visual +feature representations. In this paper, we propose a Dynamic Cross-Attention +(DCA) model that can dynamically select the cross-attended or unattended +features on the fly based on the strong or weak complementary relationships, +respectively, across audio and visual modalities. In particular, a conditional +gating layer is designed to evaluate the contribution of the cross-attention +mechanism and to choose cross-attended features only when they exhibit strong +complementary relationships, and unattended features otherwise. Extensive +experiments are conducted on the Voxceleb1 dataset to demonstrate the +robustness of the proposed model. Results indicate that the proposed model +consistently improves the performance on multiple variants of cross-attention +while outperforming the state-of-the-art methods. + +
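+
+ An illustrative soft-gating layer of the kind described, mixing cross-attended and unattended
+ features; this is a generic stand-in, not the authors' exact conditional gating design:
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class ConditionalGate(nn.Module):
+     """Soft gate that mixes cross-attended and unattended feature vectors."""
+     def __init__(self, dim: int):
+         super().__init__()
+         self.gate = nn.Sequential(nn.Linear(2 * dim, 1), nn.Sigmoid())
+
+     def forward(self, attended: torch.Tensor, unattended: torch.Tensor) -> torch.Tensor:
+         g = self.gate(torch.cat([attended, unattended], dim=-1))  # (..., 1), values in (0, 1)
+         return g * attended + (1.0 - g) * unattended
+ ```
+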
+
+ comment: Accepted to FG2024 +
+
+
+
+
+ + ♻ ☆ Implicit and Explicit Language Guidance for Diffusion-based Visual + Perception + + +
+ Text-to-image diffusion models have shown powerful abilities in conditional +image synthesis. With large-scale vision-language pre-training, diffusion +models are able to generate high-quality images with rich texture and +reasonable structure under different text prompts. However, it is an open +problem to adapt the pre-trained diffusion model for visual perception. In this +paper, we propose an implicit and explicit language guidance framework for +diffusion-based perception, named IEDP. Our IEDP comprises an implicit language +guidance branch and an explicit language guidance branch. The implicit branch +employs a frozen CLIP image encoder to directly generate implicit text embeddings +that are fed to the diffusion model, without using explicit text prompts. The +explicit branch utilizes the ground-truth labels of corresponding images as +text prompts to condition feature extraction of the diffusion model. During +training, we jointly train the diffusion model by sharing the model weights of +these two branches. As a result, implicit and explicit branches can jointly +guide feature learning. During inference, we only employ the implicit branch for +final prediction, which does not require any ground-truth labels. Experiments +are performed on two typical perception tasks, including semantic segmentation +and depth estimation. Our IEDP achieves promising performance on both tasks. +For semantic segmentation, our IEDP achieves an mIoU$^\text{ss}$ score of 55.9% on +the ADE20K validation set, which outperforms the baseline method VPD by 2.2%. For +depth estimation, our IEDP outperforms the baseline method VPD with a relative +gain of 11.0%. + +
+
+
+
+
+ + ♻ ☆ SE(3)-Equivariant and Noise-Invariant 3D Rigid Motion Tracking in Brain + MRI + + +
+ Rigid motion tracking is paramount in many medical imaging applications where +movements need to be detected, corrected, or accounted for. Modern strategies +rely on convolutional neural networks (CNN) and pose this problem as rigid +registration. Yet, CNNs do not exploit natural symmetries in this task, as they +are equivariant to translations (their outputs shift with their inputs) but not +to rotations. Here we propose EquiTrack, the first method that uses recent +steerable SE(3)-equivariant CNNs (E-CNN) for motion tracking. While steerable +E-CNNs can extract corresponding features across different poses, testing them +on noisy medical images reveals that they do not have enough learning capacity +to learn noise invariance. Thus, we introduce a hybrid architecture that pairs +a denoiser with an E-CNN to decouple the processing of anatomically irrelevant +intensity features from the extraction of equivariant spatial features. Rigid +transforms are then estimated in closed-form. EquiTrack outperforms +state-of-the-art learning and optimisation methods for motion tracking in adult +brain MRI and fetal MRI time series. Our code is available at +https://github.com/BBillot/EquiTrack. + +
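+
+ The closed-form rigid-transform estimation mentioned at the end is, in its standard form, the
+ Kabsch/Procrustes solution from corresponding points; a NumPy sketch of that generic step (the
+ equivariant feature extraction that produces the correspondences is the paper's contribution
+ and is not shown):
+
+ ```python
+ import numpy as np
+
+ def rigid_fit(P: np.ndarray, Q: np.ndarray):
+     """Closed-form R, t minimizing sum ||R @ P_i + t - Q_i||^2 for (N, 3) point arrays."""
+     p_bar, q_bar = P.mean(axis=0), Q.mean(axis=0)
+     H = (P - p_bar).T @ (Q - q_bar)
+     U, _, Vt = np.linalg.svd(H)
+     d = np.sign(np.linalg.det(Vt.T @ U.T))          # guard against reflections
+     R = Vt.T @ np.diag([1.0, 1.0, d]) @ U.T
+     t = q_bar - R @ p_bar
+     return R, t
+ ```
+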
+
+ comment: under review +
+
+
+
+
+ + ♻ ☆ Think Twice Before Selection: Federated Evidential Active Learning for + Medical Image Analysis with Domain Shifts CVPR 2024 + + +
+ Federated learning facilitates the collaborative learning of a global model +across multiple distributed medical institutions without centralizing data. +Nevertheless, the expensive cost of annotation on local clients remains an +obstacle to effectively utilizing local data. To mitigate this issue, federated +active learning methods suggest leveraging local and global model predictions +to select a relatively small amount of informative local data for annotation. +However, existing methods mainly focus on all local data sampled from the same +domain, making them unreliable in realistic medical scenarios with domain +shifts among different clients. In this paper, we make the first attempt to +assess the informativeness of local data derived from diverse domains and +propose a novel methodology termed Federated Evidential Active Learning (FEAL) +to calibrate the data evaluation under domain shift. Specifically, we introduce +a Dirichlet prior distribution in both local and global models to treat the +prediction as a distribution over the probability simplex and capture both +aleatoric and epistemic uncertainties by using the Dirichlet-based evidential +model. Then we employ the epistemic uncertainty to calibrate the aleatoric +uncertainty. Afterward, we design a diversity relaxation strategy to reduce +data redundancy and maintain data diversity. Extensive experiments and analysis +on five real multi-center medical image datasets demonstrate the superiority of +FEAL over the state-of-the-art active learning methods in federated scenarios +with domain shifts. The code will be available at +https://github.com/JiayiChen815/FEAL. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ A Unified Framework for Microscopy Defocus Deblur with Multi-Pyramid + Transformer and Contrastive Learning CVPR 2024 + + +
+ Defocus blur is a persistent problem in microscope imaging that poses harm to +pathology interpretation and medical intervention in cell microscopy and +microscope surgery. To address this problem, a unified framework including the +multi-pyramid transformer (MPT) and extended frequency contrastive +regularization (EFCR) is proposed to tackle two outstanding challenges in +microscopy deblur: longer attention span and data deficiency. The MPT employs +an explicit pyramid structure at each network stage that integrates the +cross-scale window attention (CSWA), the intra-scale channel attention (ISCA), +and the feature-enhancing feed-forward network (FEFN) to capture long-range +cross-scale spatial interaction and global channel context. The EFCR addresses +the data deficiency problem by exploring latent deblur signals from different +frequency bands. It also enables deblur knowledge transfer to learn +cross-domain information from extra data, improving deblur performance for +labeled and unlabeled data. Extensive experiments and downstream task +validation show the framework achieves state-of-the-art performance across +multiple datasets. Project page: https://github.com/PieceZhang/MPT-CataBlur. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Exploring Feedback Generation in Automated Skeletal Movement Assessment: + A Comprehensive Overview + + +
+ The application of machine-learning solutions to movement assessment from +skeleton videos has attracted significant research attention in recent years. +This advancement has made rehabilitation at home more accessible, utilizing +movement assessment algorithms that can operate on affordable equipment for +human pose detection and analysis from 2D or 3D videos. While the primary +objective of automatic assessment tasks is to score movements, the automatic +generation of feedback highlighting key movement issues has the potential to +significantly enhance and accelerate the rehabilitation process. While numerous +research works exist in the field of automatic movement assessment, only a +handful address feedback generation. In this study, we explain the types of +feedback that can be generated, review existing solutions for automatic +feedback generation, and discuss future research directions. To our knowledge, +this is the first comprehensive review of feedback generation in skeletal +movement assessment. + +
+
+
+
+
+ + ♻ ☆ Manga109Dialog: A Large-scale Dialogue Dataset for Comics Speaker + Detection ICME2024 + + +
+ The expanding market for e-comics has spurred interest in the development of +automated methods to analyze comics. For further understanding of comics, an +automated approach is needed to link text in comics to characters speaking the +words. Comics speaker detection research has practical applications, such as +automatic character assignment for audiobooks, automatic translation according +to characters' personalities, and inference of character relationships and +stories. + To deal with the problem of insufficient speaker-to-text annotations, we +created a new annotation dataset Manga109Dialog based on Manga109. +Manga109Dialog is the world's largest comics speaker annotation dataset, +containing 132,692 speaker-to-text pairs. We further divided our dataset into +different levels by prediction difficulties to evaluate speaker detection +methods more appropriately. Unlike existing methods mainly based on distances, +we propose a deep learning-based method using scene graph generation models. +Due to the unique features of comics, we enhance the performance of our +proposed model by considering the frame reading order. We conducted experiments +using Manga109Dialog and other datasets. Experimental results demonstrate that +our scene-graph-based approach outperforms existing methods, achieving a +prediction accuracy of over 75%. + +
+
+ comment: Accepted to ICME2024 +
+
+
+
+
+ + ♻ ☆ OccFusion: A Straightforward and Effective Multi-Sensor Fusion Framework + for 3D Occupancy Prediction + + +
+ This paper introduces OccFusion, a straightforward and efficient sensor +fusion framework for predicting 3D occupancy. A comprehensive understanding of +3D scenes is crucial in autonomous driving, and recent models for 3D semantic +occupancy prediction have successfully addressed the challenge of describing +real-world objects with varied shapes and classes. However, existing methods +for 3D occupancy prediction heavily rely on surround-view camera images, making +them susceptible to changes in lighting and weather conditions. By integrating +features from additional sensors, such as lidar and surround view radars, our +framework enhances the accuracy and robustness of occupancy prediction, +resulting in top-tier performance on the nuScenes benchmark. Furthermore, +extensive experiments conducted on the nuScenes dataset, including challenging +night and rainy scenarios, confirm the superior performance of our sensor +fusion strategy across various perception ranges. The code for this framework +will be made available at https://github.com/DanielMing123/OCCFusion. + +
+
+
+
+
+ + ♻ ☆ Seeing Text in the Dark: Algorithm and Benchmark + + +
+ Localizing text in low-light environments is challenging due to visual
+degradations. Although a straightforward solution involves a two-stage pipeline
+with low-light image enhancement (LLE) as the initial step followed by a
+detector, LLE is primarily designed for human vision rather than machine vision
+and can accumulate errors. In this work, we propose an efficient and effective
+single-stage approach for localizing text in the dark that circumvents the need
+for LLE. We introduce a constrained learning module as an auxiliary mechanism
+during the training stage of the text detector. This module is designed to
+guide the text detector in preserving textual spatial features amidst feature
+map resizing, thus minimizing the loss of spatial information in texts under
+low-light visual degradations. Specifically, we incorporate spatial
+reconstruction and spatial semantic constraints within this module to ensure
+the text detector acquires essential positional and contextual range knowledge.
+Our approach enhances the original text detector's ability to identify the
+local topological features of text using a dynamic snake feature pyramid
+network and adopts a bottom-up contour shaping strategy with a novel
+rectangular accumulation technique for accurate delineation of streamlined text
+features. In addition, we present a comprehensive low-light dataset for
+arbitrary-shaped text, encompassing diverse scenes and languages. Notably, our
+method achieves state-of-the-art results on this low-light dataset and exhibits
+comparable performance on standard normal-light datasets. The code and dataset
+will be released.
+
+
+
+
+
+
+ + ♻ ☆ Bridging Stereo Geometry and BEV Representation with Reliable Mutual + Interaction for Semantic Scene Completion IJCAI2024 + + +
+ 3D semantic scene completion (SSC) is an ill-posed perception task that +requires inferring a dense 3D scene from limited observations. Previous +camera-based methods struggle to predict accurate semantic scenes due to +inherent geometric ambiguity and incomplete observations. In this paper, we +resort to stereo matching technique and bird's-eye-view (BEV) representation +learning to address such issues in SSC. Complementary to each other, stereo +matching mitigates geometric ambiguity with epipolar constraint while BEV +representation enhances the hallucination ability for invisible regions with +global semantic context. However, due to the inherent representation gap +between stereo geometry and BEV features, it is non-trivial to bridge them for +dense prediction task of SSC. Therefore, we further develop a unified +occupancy-based framework dubbed BRGScene, which effectively bridges these two +representations with dense 3D volumes for reliable semantic scene completion. +Specifically, we design a novel Mutual Interactive Ensemble (MIE) block for +pixel-level reliable aggregation of stereo geometry and BEV features. Within +the MIE block, a Bi-directional Reliable Interaction (BRI) module, enhanced +with confidence re-weighting, is employed to encourage fine-grained interaction +through mutual guidance. Besides, a Dual Volume Ensemble (DVE) module is +introduced to facilitate complementary aggregation through channel-wise +recalibration and multi-group voting. Our method outperforms all published +camera-based methods on SemanticKITTI for semantic scene completion. Our code +is available on \url{https://github.com/Arlo0o/StereoScene}. + +
+
+ comment: IJCAI2024 (https://github.com/Arlo0o/StereoScene) +
+
+
+
+
+ + ♻ ☆ Unifying Feature and Cost Aggregation with Transformers for Semantic and + Visual Correspondence ICLR'24 + + +
+ This paper introduces a Transformer-based integrative feature and cost +aggregation network designed for dense matching tasks. In the context of dense +matching, many works benefit from one of two forms of aggregation: feature +aggregation, which pertains to the alignment of similar features, or cost +aggregation, a procedure aimed at instilling coherence in the flow estimates +across neighboring pixels. In this work, we first show that feature aggregation +and cost aggregation exhibit distinct characteristics and reveal the potential +for substantial benefits stemming from the judicious use of both aggregation +processes. We then introduce a simple yet effective architecture that harnesses +self- and cross-attention mechanisms to show that our approach unifies feature +aggregation and cost aggregation and effectively harnesses the strengths of +both techniques. Within the proposed attention layers, the features and cost +volume both complement each other, and the attention layers are interleaved +through a coarse-to-fine design to further promote accurate correspondence +estimation. Finally at inference, our network produces multi-scale predictions, +computes their confidence scores, and selects the most confident flow for final +prediction. Our framework is evaluated on standard benchmarks for semantic +matching, and also applied to geometric matching, where we show that our +approach achieves significant improvements compared to existing methods. + +
+
+ comment: Accepted by ICLR'24 +
+
+
+
+
+ + ♻ ☆ EGGS: Edge Guided Gaussian Splatting for Radiance Fields + + +
+ Gaussian splatting methods are becoming increasingly popular. However, their
+loss function only contains the $\ell_1$ norm and the structural similarity
+between the rendered and input images, without considering the edges in these
+images. It is well known that the edges in an image provide important
+information. Therefore, in this paper, we propose an Edge Guided Gaussian
+Splatting (EGGS) method that leverages the edges in the input images. More
+specifically, we give the edge region a higher weight than the flat region.
+With such edge guidance, the resulting Gaussian particles focus more on the
+edges instead of the flat regions. Moreover, such edge guidance does not
+increase the computation cost during the training and rendering stages. The
+experiments confirm that such a simple edge-weighted loss function indeed
+improves results by about $1\sim2$ dB on several different datasets. By simply
+plugging in the edge guidance, the proposed method can improve all Gaussian
+splatting methods in different scenarios, such as human head modeling, building
+3D reconstruction, etc.
+
+
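+ To make the edge-weighting concrete, here is a minimal sketch of a
+photometric loss that up-weights edge pixels using a Sobel gradient of the
+target image; the `edge_boost` parameter and the choice of edge detector are
+illustrative assumptions, not the paper's exact formulation (which also keeps
+the structural-similarity term).
+
+```python
+import torch
+import torch.nn.functional as F
+
+def edge_weighted_l1(rendered, target, edge_boost=2.0):
+    """Illustrative edge-guided L1 loss in the spirit of EGGS.
+
+    `rendered` / `target`: (B, 3, H, W) images in [0, 1]. Edge pixels of the
+    target get a weight of 1 + edge_boost, flat regions keep weight 1.
+    """
+    gray = target.mean(dim=1, keepdim=True)                 # (B, 1, H, W)
+    kx = torch.tensor([[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]],
+                      device=target.device).view(1, 1, 3, 3)
+    ky = kx.transpose(-1, -2)                               # Sobel-y kernel
+    gx = F.conv2d(gray, kx, padding=1)
+    gy = F.conv2d(gray, ky, padding=1)
+    edges = torch.sqrt(gx ** 2 + gy ** 2)
+    edges = edges / (edges.amax(dim=(-2, -1), keepdim=True) + 1e-8)
+
+    weight = 1.0 + edge_boost * edges                       # >1 near edges
+    return (weight * (rendered - target).abs()).mean()
+```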
+
+
+
+
+ + ♻ ☆ NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous + Driving + + +
+ We present a versatile NeRF-based simulator for testing autonomous driving +(AD) software systems, designed with a focus on sensor-realistic closed-loop +evaluation and the creation of safety-critical scenarios. The simulator learns +from sequences of real-world driving sensor data and enables reconfigurations +and renderings of new, unseen scenarios. In this work, we use our simulator to +test the responses of AD models to safety-critical scenarios inspired by the +European New Car Assessment Programme (Euro NCAP). Our evaluation reveals that, +while state-of-the-art end-to-end planners excel in nominal driving scenarios +in an open-loop setting, they exhibit critical flaws when navigating our +safety-critical scenarios in a closed-loop setting. This highlights the need +for advancements in the safety and real-world usability of end-to-end planners. +By publicly releasing our simulator and scenarios as an easy-to-run evaluation +suite, we invite the research community to explore, refine, and validate their +AD models in controlled, yet highly configurable and challenging +sensor-realistic environments. Code and instructions can be found at +https://github.com/atonderski/neuro-ncap + +
+
+
+
+
+ + ♻ ☆ DLoRA-TrOCR: Mixed Text Mode Optical Character Recognition Based On + Transformer + + +
+ With the continuous development of OCR technology and the expansion of +application fields, text recognition in complex scenes has become a key +challenge. Factors such as multiple fonts, mixed scenes and complex layouts +seriously affect the recognition accuracy of traditional OCR models. Although +OCR models based on deep learning have performed well in specific fields or +similar datasets in recent years, the generalization ability and robustness of +the model are still a big challenge when facing complex environments with +multiple scenes. Furthermore, training an OCR model from scratch or fine-tuning +all parameters is very demanding on computing resources and inference time, +which limits the flexibility of its application. This study focuses on a +fundamental aspect of mixed text recognition in response to the challenges +mentioned above, which involves effectively fine-tuning the pre-trained basic +OCR model to demonstrate exceptional performance across various downstream +tasks. To this end, we propose a parameter-efficient mixed text recognition +method based on pre-trained OCR Transformer, namely DLoRA-TrOCR. This method +embeds DoRA into the image encoder and LoRA into the internal structure of the +text decoder, enabling efficient parameter fine-tuning for downstream tasks. +Experimental results show that compared to similar parameter adjustment +methods, our model DLoRA-TrOCR has the smallest number of parameters and +performs better. It can achieve state-of-the-art performance on complex scene +datasets involving simultaneous recognition of mixed handwritten, printed and +street view texts. + +
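+ For readers unfamiliar with the low-rank adaptation mentioned above, the
+snippet below sketches a generic LoRA wrapper around a frozen linear layer;
+the class name, rank, and scaling are assumptions, and the DoRA variant used in
+the image encoder is not shown.
+
+```python
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    """Minimal LoRA adapter: y = W x + (alpha / r) * B A x with W frozen."""
+
+    def __init__(self, base: nn.Linear, rank: int = 8, alpha: float = 16.0):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():        # freeze pre-trained weights
+            p.requires_grad = False
+        self.lora_a = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
+        self.lora_b = nn.Parameter(torch.zeros(base.out_features, rank))
+        self.scaling = alpha / rank
+
+    def forward(self, x):
+        return self.base(x) + self.scaling * (x @ self.lora_a.T @ self.lora_b.T)
+
+layer = LoRALinear(nn.Linear(768, 768))
+trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
+print(trainable)                                # only the low-rank factors
+```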
+
+
+
+
+ + ♻ ☆ SAFDNet: A Simple and Effective Network for Fully Sparse 3D Object + Detection CVPR 2024 + + +
+ LiDAR-based 3D object detection plays an essential role in autonomous +driving. Existing high-performing 3D object detectors usually build dense +feature maps in the backbone network and prediction head. However, the +computational costs introduced by the dense feature maps grow quadratically as +the perception range increases, making these models hard to scale up to +long-range detection. Some recent works have attempted to construct fully +sparse detectors to solve this issue; nevertheless, the resulting models either +rely on a complex multi-stage pipeline or exhibit inferior performance. In this +work, we propose SAFDNet, a straightforward yet highly effective architecture, +tailored for fully sparse 3D object detection. In SAFDNet, an adaptive feature +diffusion strategy is designed to address the center feature missing problem. +We conducted extensive experiments on Waymo Open, nuScenes, and Argoverse2 +datasets. SAFDNet performed slightly better than the previous SOTA on the first +two datasets but much better on the last dataset, which features long-range +detection, verifying the efficacy of SAFDNet in scenarios where long-range +detection is required. Notably, on Argoverse2, SAFDNet surpassed the previous +best hybrid detector HEDNet by 2.6% mAP while being 2.1x faster, and yielded +2.1% mAP gains over the previous best sparse detector FSDv2 while being 1.3x +faster. The code will be available at https://github.com/zhanggang001/HEDNet. + +
+
+ comment: Accepted by CVPR 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ Improved cryo-EM Pose Estimation and 3D Classification through + Latent-Space Disentanglement + + +
+ Due to the extremely low signal-to-noise ratio (SNR) and unknown poses +(projection angles and image shifts) in cryo-electron microscopy (cryo-EM) +experiments, reconstructing 3D volumes from 2D images is very challenging. In +addition to these challenges, heterogeneous cryo-EM reconstruction requires +conformational classification. In popular cryo-EM reconstruction algorithms, +poses and conformation classification labels must be predicted for every input +cryo-EM image, which can be computationally costly for large datasets. An +emerging class of methods adopted the amortized inference approach. In these +methods, only a subset of the input dataset is needed to train neural networks +for the estimation of poses and conformations. Once trained, these neural +networks can make pose/conformation predictions and 3D reconstructions at low +cost for the entire dataset during inference. Unfortunately, when facing +heterogeneous reconstruction tasks, it is hard for current +amortized-inference-based methods to effectively estimate the conformational +distribution and poses from entangled latent variables. Here, we propose a +self-supervised variational autoencoder architecture called "HetACUMN" based on +amortized inference. We employed an auxiliary conditional pose prediction task +by inverting the order of encoder-decoder to explicitly enforce the +disentanglement of conformation and pose predictions. Results on simulated +datasets show that HetACUMN generated more accurate conformational +classifications than other amortized or non-amortized methods. Furthermore, we +show that HetACUMN is capable of performing heterogeneous 3D reconstructions of +a real experimental dataset. + +
+
+ comment: 21 pages +
+
+
+
+
+ + ♻ ☆ If It's Not Enough, Make It So: Reducing Authentic Data Demand in Face + Recognition through Synthetic Faces + + +
+ Recent advances in deep face recognition have spurred a growing demand for +large, diverse, and manually annotated face datasets. Acquiring authentic, +high-quality data for face recognition has proven to be a challenge, primarily +due to privacy concerns. Large face datasets are primarily sourced from +web-based images, lacking explicit user consent. In this paper, we examine +whether and how synthetic face data can be used to train effective face +recognition models with reduced reliance on authentic images, thereby +mitigating data collection concerns. First, we explored the performance gap +among recent state-of-the-art face recognition models, trained with synthetic +data only and authentic (scarce) data only. Then, we deepened our analysis by +training a state-of-the-art backbone with various combinations of synthetic and +authentic data, gaining insights into optimizing the limited use of the latter +for verification accuracy. Finally, we assessed the effectiveness of data +augmentation approaches on synthetic and authentic data, with the same goal in +mind. Our results highlighted the effectiveness of FR trained on combined +datasets, particularly when combined with appropriate augmentation techniques. + +
+
+ comment: Accepted as full paper at FG 2024 main track +
+
+
+
+
+ + ♻ ☆ Hidden Flaws Behind Expert-Level Accuracy of GPT-4 Vision in Medicine + + +
+ Recent studies indicate that Generative Pre-trained Transformer 4 with Vision
+(GPT-4V) outperforms human physicians in medical challenge tasks. However,
+these evaluations primarily focused on the accuracy of multi-choice questions
+alone. Our study extends the current scope by conducting a comprehensive
+analysis of GPT-4V's rationales of image comprehension, recall of medical
+knowledge, and step-by-step multimodal reasoning when solving New England
+Journal of Medicine (NEJM) Image Challenges - an imaging quiz designed to test
+the knowledge and diagnostic capabilities of medical professionals. Evaluation
+results confirmed that GPT-4V performs comparably to human physicians
+regarding multi-choice accuracy (81.6% vs. 77.8%). GPT-4V also performs well in
+cases that physicians answer incorrectly, with over 78% accuracy. However, we
+discovered that GPT-4V frequently presents flawed rationales in cases where it
+makes the correct final choices (35.5%), most prominently in image
+comprehension (27.2%). Despite GPT-4V's high accuracy in multi-choice
+questions, our findings emphasize the necessity for further in-depth
+evaluations of its rationales before integrating such multimodal AI models into
+clinical workflows.
+
+
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Multichannel Orthogonal Transform-Based Perceptron Layers for Efficient + ResNets + + +
+ In this paper, we propose a set of transform-based neural network layers as +an alternative to the $3\times3$ Conv2D layers in Convolutional Neural Networks +(CNNs). The proposed layers can be implemented based on orthogonal transforms +such as the Discrete Cosine Transform (DCT), Hadamard transform (HT), and +biorthogonal Block Wavelet Transform (BWT). Furthermore, by taking advantage of +the convolution theorems, convolutional filtering operations are performed in +the transform domain using element-wise multiplications. Trainable +soft-thresholding layers, that remove noise in the transform domain, bring +nonlinearity to the transform domain layers. Compared to the Conv2D layer, +which is spatial-agnostic and channel-specific, the proposed layers are +location-specific and channel-specific. Moreover, these proposed layers reduce +the number of parameters and multiplications significantly while improving the +accuracy results of regular ResNets on the ImageNet-1K classification task. +Furthermore, they can be inserted with a batch normalization layer before the +global average pooling layer in the conventional ResNets as an additional layer +to improve classification accuracy. + +
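+ A hypothetical toy version of such a transform-domain layer is sketched
+below: a 2D orthonormal DCT over the spatial dimensions, trainable
+element-wise weights, and a trainable soft-threshold before the inverse
+transform. The block size, transform choice, and channel handling are
+assumptions and may differ from the paper's layers.
+
+```python
+import math
+import torch
+import torch.nn as nn
+
+def dct_matrix(n: int) -> torch.Tensor:
+    """Orthonormal DCT-II matrix C, so y = C @ x and x = C.T @ y."""
+    k = torch.arange(n).unsqueeze(1).float()
+    i = torch.arange(n).unsqueeze(0).float()
+    c = torch.cos(math.pi * (i + 0.5) * k / n) * math.sqrt(2.0 / n)
+    c[0] /= math.sqrt(2.0)
+    return c
+
+class DCTPerceptron2d(nn.Module):
+    """Transform-domain stand-in for a 3x3 conv (sketch only)."""
+
+    def __init__(self, channels: int, size: int):
+        super().__init__()
+        self.register_buffer("C", dct_matrix(size))
+        self.weight = nn.Parameter(torch.ones(channels, size, size))
+        self.threshold = nn.Parameter(torch.full((channels, 1, 1), 0.01))
+
+    def forward(self, x):                  # x: (B, C, H, W) with H = W = size
+        y = self.C @ x @ self.C.T          # 2D DCT of each channel
+        y = y * self.weight                # element-wise "filtering"
+        y = torch.sign(y) * torch.relu(y.abs() - self.threshold)  # soft-threshold
+        return self.C.T @ y @ self.C       # inverse DCT
+
+print(DCTPerceptron2d(64, 8)(torch.randn(2, 64, 8, 8)).shape)
+```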
+
+ comment: This work is accepted to IEEE Transactions on Neural Networks and + Learning Systems. The initial title is "Orthogonal Transform Domain + Approaches for the Convolutional Layer". We changed it to "Multichannel + Orthogonal Transform-Based Perceptron Layers for Efficient ResNets" based on + reviewer's comment. arXiv admin note: text overlap with arXiv:2211.08577 +
+
+
+
+
+ + ♻ ☆ Non-negative Contrastive Learning ICLR 2024 + + +
+ Deep representations have shown promising performance when transferred to +downstream tasks in a black-box manner. Yet, their inherent lack of +interpretability remains a significant challenge, as these features are often +opaque to human understanding. In this paper, we propose Non-negative +Contrastive Learning (NCL), a renaissance of Non-negative Matrix Factorization +(NMF) aimed at deriving interpretable features. The power of NCL lies in its +enforcement of non-negativity constraints on features, reminiscent of NMF's +capability to extract features that align closely with sample clusters. NCL not +only aligns mathematically well with an NMF objective but also preserves NMF's +interpretability attributes, resulting in a more sparse and disentangled +representation compared to standard contrastive learning (CL). Theoretically, +we establish guarantees on the identifiability and downstream generalization of +NCL. Empirically, we show that these advantages enable NCL to outperform CL +significantly on feature disentanglement, feature selection, as well as +downstream classification tasks. At last, we show that NCL can be easily +extended to other learning scenarios and benefit supervised learning as well. +Code is available at https://github.com/PKU-ML/non_neg. + +
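+ To illustrate the core non-negativity idea in code, here is a minimal
+InfoNCE-style loss in which features pass through a ReLU before normalization;
+this is only a sketch of the constraint described above, and the released code
+at the link may parameterize the objective differently.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def nncl_loss(h1, h2, temperature=0.5):
+    """Contrastive loss over two views with non-negative features.
+
+    `h1`, `h2`: (B, D) embeddings of two augmentations of the same batch.
+    """
+    z1 = F.normalize(torch.relu(h1), dim=-1)    # non-negative, unit-norm
+    z2 = F.normalize(torch.relu(h2), dim=-1)
+    logits = z1 @ z2.T / temperature            # (B, B) similarity matrix
+    labels = torch.arange(z1.size(0), device=z1.device)
+    # Positives sit on the diagonal; symmetrize over the two views.
+    return 0.5 * (F.cross_entropy(logits, labels) +
+                  F.cross_entropy(logits.T, labels))
+```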
+
+ comment: 22 pages. Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Representing Anatomical Trees by Denoising Diffusion of Implicit Neural + Fields + + +
+ Anatomical trees play a central role in clinical diagnosis and treatment +planning. However, accurately representing anatomical trees is challenging due +to their varying and complex topology and geometry. Traditional methods for +representing tree structures, captured using medical imaging, while invaluable +for visualizing vascular and bronchial networks, exhibit drawbacks in terms of +limited resolution, flexibility, and efficiency. Recently, implicit neural +representations (INRs) have emerged as a powerful tool for representing shapes +accurately and efficiently. We propose a novel approach for representing +anatomical trees using INR, while also capturing the distribution of a set of +trees via denoising diffusion in the space of INRs. We accurately capture the +intricate geometries and topologies of anatomical trees at any desired +resolution. Through extensive qualitative and quantitative evaluation, we +demonstrate high-fidelity tree reconstruction with arbitrary resolution yet +compact storage, and versatility across anatomical sites and tree complexities. + +
+
+ comment: Preprint. In review. Code: https://github.com/sinAshish/TreeDiffusion +
+
+
+
+
+ + ♻ ☆ Solutions to Elliptic and Parabolic Problems via Finite Difference Based + Unsupervised Small Linear Convolutional Neural Networks + + +
+ In recent years, there has been a growing interest in leveraging deep +learning and neural networks to address scientific problems, particularly in +solving partial differential equations (PDEs). However, many neural +network-based methods like PINNs rely on auto differentiation and sampling +collocation points, leading to a lack of interpretability and lower accuracy +than traditional numerical methods. As a result, we propose a fully +unsupervised approach, requiring no training data, to estimate finite +difference solutions for PDEs directly via small linear convolutional neural +networks. Our proposed approach uses substantially fewer parameters than +similar finite difference-based approaches while also demonstrating comparable +accuracy to the true solution for several selected elliptic and parabolic +problems compared to the finite difference method. + +
+
+ comment: Submitted to CMA, under review +
+
+
+
+
+ + ♻ ☆ Paved2Paradise: Cost-Effective and Scalable LiDAR Simulation by + Factoring the Real World CVPR + 2024 + + +
+ To achieve strong real world performance, neural networks must be trained on +large, diverse datasets; however, obtaining and annotating such datasets is +costly and time-consuming, particularly for 3D point clouds. In this paper, we +describe Paved2Paradise, a simple, cost-effective approach for generating fully +labeled, diverse, and realistic lidar datasets from scratch, all while +requiring minimal human annotation. Our key insight is that, by deliberately +collecting separate "background" and "object" datasets (i.e., "factoring the +real world"), we can intelligently combine them to produce a combinatorially +large and diverse training set. The Paved2Paradise pipeline thus consists of +four steps: (1) collecting copious background data, (2) recording individuals +from the desired object class(es) performing different behaviors in an isolated +environment (like a parking lot), (3) bootstrapping labels for the object +dataset, and (4) generating samples by placing objects at arbitrary locations +in backgrounds. To demonstrate the utility of Paved2Paradise, we generated +synthetic datasets for two tasks: (1) human detection in orchards (a task for +which no public data exists) and (2) pedestrian detection in urban +environments. Qualitatively, we find that a model trained exclusively on +Paved2Paradise synthetic data is highly effective at detecting humans in +orchards, including when individuals are heavily occluded by tree branches. +Quantitatively, a model trained on Paved2Paradise data that sources backgrounds +from KITTI performs comparably to a model trained on the actual dataset. These +results suggest the Paved2Paradise synthetic data pipeline can help accelerate +point cloud model development in sectors where acquiring lidar datasets has +previously been cost-prohibitive. + +
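+ Step (4) of this pipeline, dropping labeled objects into background scans,
+can be sketched in a few lines; the function below is a hypothetical
+simplification that omits the ground alignment, occlusion handling, and
+sensor-consistent resampling described in the paper.
+
+```python
+import numpy as np
+
+def place_object(background: np.ndarray, obj: np.ndarray,
+                 xy: np.ndarray, yaw: float) -> np.ndarray:
+    """Insert an object point cloud into a background scan.
+
+    `background`: (N, 3) xyz points, `obj`: (M, 3) xyz points centered at the
+    origin, `xy`: (2,) planar target location, `yaw`: heading in radians.
+    """
+    c, s = np.cos(yaw), np.sin(yaw)
+    rot = np.array([[c, -s, 0.0], [s, c, 0.0], [0.0, 0.0, 1.0]])
+    moved = obj @ rot.T                     # rotate about the vertical axis
+    moved[:, :2] += xy                      # translate in the ground plane
+    return np.vstack([background, moved])   # composite scan
+```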
+
+ comment: Accepted to the Synthetic Data for Computer Vision workshop at CVPR + 2024 +
+
+
+
+
+ + ♻ ☆ FlashTex: Fast Relightable Mesh Texturing with LightControlNet + + +
+ Manually creating textures for 3D meshes is time-consuming, even for expert +visual content creators. We propose a fast approach for automatically texturing +an input 3D mesh based on a user-provided text prompt. Importantly, our +approach disentangles lighting from surface material/reflectance in the +resulting texture so that the mesh can be properly relit and rendered in any +lighting environment. We introduce LightControlNet, a new text-to-image model +based on the ControlNet architecture, which allows the specification of the +desired lighting as a conditioning image to the model. Our text-to-texture +pipeline then constructs the texture in two stages. The first stage produces a +sparse set of visually consistent reference views of the mesh using +LightControlNet. The second stage applies a texture optimization based on Score +Distillation Sampling (SDS) that works with LightControlNet to increase the +texture quality while disentangling surface material from lighting. Our +algorithm is significantly faster than previous text-to-texture methods, while +producing high-quality and relightable textures. + +
+
+ comment: Project page: https://flashtex.github.io/ +
+
+
+
+
+ + ♻ ☆ Holodeck: Language Guided Generation of 3D Embodied AI Environments CVPR 2024 + + +
+ 3D simulated environments play a critical role in Embodied AI, but their
+creation requires expertise and extensive manual effort, restricting their
+diversity and scope. To mitigate this limitation, we present Holodeck, a system
+that generates 3D environments to match a user-supplied prompt fully
+automatically. Holodeck can generate diverse scenes, e.g., arcades, spas, and
+museums, adjust the designs for styles, and can capture the semantics of
+complex queries such as "apartment for a researcher with a cat" and "office of
+a professor who is a fan of Star Wars". Holodeck leverages a large language
+model (i.e., GPT-4) for common sense knowledge about what the scene might look
+like and uses a large collection of 3D assets from Objaverse to populate the
+scene with diverse objects. To address the challenge of positioning objects
+correctly, we prompt GPT-4 to generate spatial relational constraints between
+objects and then optimize the layout to satisfy those constraints. Our
+large-scale human evaluation shows that annotators prefer Holodeck over
+manually designed procedural baselines in residential scenes and that Holodeck
+can produce high-quality outputs for diverse scene types. We also demonstrate
+an exciting application of Holodeck in Embodied AI, training agents to navigate
+in novel scenes like music rooms and daycares without human-constructed data,
+which is a significant step forward in developing general-purpose embodied
+agents.
+
+
+
+ comment: Published in CVPR 2024, 21 pages, 27 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ DAM: Dynamic Adapter Merging for Continual Video QA Learning + + +
+ We present a parameter-efficient method for continual video +question-answering (VidQA) learning. Our method, named DAM, uses the proposed +Dynamic Adapter Merging to (i) mitigate catastrophic forgetting, (ii) enable +efficient adaptation to continually arriving datasets, (iii) handle inputs from +unknown datasets during inference, and (iv) enable knowledge sharing across +similar dataset domains. Given a set of continually streaming VidQA datasets, +we sequentially train dataset-specific adapters for each dataset while freezing +the parameters of a large pretrained video-language backbone. During inference, +given a video-question sample from an unknown domain, our method first uses the +proposed non-parametric router function to compute a probability for each +adapter, reflecting how relevant that adapter is to the current video-question +input instance. Subsequently, the proposed dynamic adapter merging scheme +aggregates all the adapter weights into a new adapter instance tailored for +that particular test sample to compute the final VidQA prediction, mitigating +the impact of inaccurate router predictions and facilitating knowledge sharing +across domains. Our DAM model outperforms prior state-of-the-art continual +learning approaches by 9.1% while exhibiting 1.9% less forgetting on 6 VidQA +datasets spanning various domains. We further extend DAM to continual image +classification and image QA and outperform prior methods by a large margin. The +code is publicly available at: https://github.com/klauscc/DAM + +
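+ The merging step can be illustrated with a small sketch: a non-parametric
+router scores each dataset-specific adapter against the test sample's feature,
+and the adapter weights are averaged with those probabilities. The
+cosine-similarity router and the centroid inputs are assumptions for
+illustration, not necessarily DAM's exact router.
+
+```python
+import torch
+
+def merge_adapters(query_feat, centroids, adapter_weights, temperature=0.1):
+    """Merge K adapters for one test sample.
+
+    `query_feat`: (D,) feature of the video-question pair.
+    `centroids`: (K, D) per-dataset feature centroids (router statistics).
+    `adapter_weights`: list of K state dicts sharing the same keys.
+    """
+    sims = torch.nn.functional.cosine_similarity(
+        query_feat.unsqueeze(0), centroids, dim=-1)          # (K,)
+    probs = torch.softmax(sims / temperature, dim=0)         # router output
+
+    merged = {}
+    for name in adapter_weights[0]:
+        stacked = torch.stack([w[name] for w in adapter_weights])   # (K, ...)
+        shape = (-1,) + (1,) * (stacked.dim() - 1)
+        merged[name] = (probs.view(shape) * stacked).sum(dim=0)
+    return merged
+```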
+
+ comment: The first two authors contribute equally +
+
+
+
+
+ + ♻ ☆ IntrinsicAnything: Learning Diffusion Priors for Inverse Rendering Under + Unknown Illumination + + +
+ This paper aims to recover object materials from posed images captured under +an unknown static lighting condition. Recent methods solve this task by +optimizing material parameters through differentiable physically based +rendering. However, due to the coupling between object geometry, materials, and +environment lighting, there is inherent ambiguity during the inverse rendering +process, preventing previous methods from obtaining accurate results. To +overcome this ill-posed problem, our key idea is to learn the material prior +with a generative model for regularizing the optimization process. We observe +that the general rendering equation can be split into diffuse and specular +shading terms, and thus formulate the material prior as diffusion models of +albedo and specular. Thanks to this design, our model can be trained using the +existing abundant 3D object data, and naturally acts as a versatile tool to +resolve the ambiguity when recovering material representations from RGB images. +In addition, we develop a coarse-to-fine training strategy that leverages +estimated materials to guide diffusion models to satisfy multi-view consistent +constraints, leading to more stable and accurate results. Extensive experiments +on real-world and synthetic datasets demonstrate that our approach achieves +state-of-the-art performance on material recovery. The code will be available +at https://zju3dv.github.io/IntrinsicAnything. + +
+
+ comment: Project page: https://zju3dv.github.io/IntrinsicAnything +
+
+
+
+
+ + ♻ ☆ SCT: A Simple Baseline for Parameter-Efficient Fine-Tuning via Salient + Channels + + +
+ Pre-trained vision transformers provide strong representations that benefit
+various downstream tasks. Recently, many parameter-efficient fine-tuning (PEFT)
+methods have been proposed, and their experiments demonstrate that tuning only
+1% of extra parameters can surpass full fine-tuning in low-data resource
+scenarios. However, these methods overlook the task-specific information when
+fine-tuning diverse downstream tasks. In this paper, we propose a simple yet
+effective method called "Salient Channel Tuning" (SCT) to leverage the
+task-specific information by forwarding the model on the task images to select
+a subset of channels in a feature map, which enables us to tune only 1/8 of the
+channels, leading to significantly lower parameter costs. SCT outperforms full
+fine-tuning on 18 out of 19 tasks in the VTAB-1K benchmark while adding only
+0.11M parameters to the ViT-B backbone, which is 780x fewer than its full
+fine-tuning counterpart. Furthermore, on domain generalization and few-shot
+learning, SCT surpasses other PEFT methods with lower parameter costs,
+demonstrating our proposed tuning technique's strong capability and
+effectiveness in the low-data regime.
+
+
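+ A rough sketch of salient-channel selection and tuning is given below,
+assuming ViT-style (B, N, C) token features; the scoring rule (mean absolute
+activation) and the per-channel linear update are illustrative stand-ins for
+the paper's actual selection criterion and tuning module.
+
+```python
+import torch
+import torch.nn as nn
+
+@torch.no_grad()
+def select_salient_channels(features: torch.Tensor, frac: float = 0.125):
+    """Rank channels of (B, N, C) features and keep the top fraction."""
+    score = features.abs().mean(dim=(0, 1))                  # (C,)
+    return score.topk(max(1, int(frac * score.numel()))).indices
+
+class SalientChannelTuner(nn.Module):
+    """Tune only the selected channels; all other channels pass through."""
+
+    def __init__(self, salient_idx: torch.Tensor):
+        super().__init__()
+        self.register_buffer("idx", salient_idx)
+        k = salient_idx.numel()
+        self.tune = nn.Linear(k, k)
+        nn.init.zeros_(self.tune.weight)     # start as an identity update
+        nn.init.zeros_(self.tune.bias)
+
+    def forward(self, feat):                                  # (B, N, C)
+        out = feat.clone()
+        out[..., self.idx] = feat[..., self.idx] + self.tune(feat[..., self.idx])
+        return out
+
+feat = torch.randn(2, 197, 768)
+tuner = SalientChannelTuner(select_salient_channels(feat))
+print(tuner(feat).shape)
+```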
+
+ comment: This work has been accepted by IJCV2023 +
+
+
+
+
+ + ♻ ☆ Making Images Real Again: A Comprehensive Survey on Deep Image + Composition + + +
+ As a common image editing operation, image composition aims to combine the
+foreground from one image with another background image, resulting in a
+composite image. However, there are many issues that could make the composite
+images unrealistic. These issues can be summarized as the inconsistency between
+foreground and background, which includes appearance inconsistency (e.g.,
+incompatible illumination), geometry inconsistency (e.g., unreasonable size),
+and semantic inconsistency (e.g., mismatched semantic context). The image
+composition task can be decomposed into multiple sub-tasks, each of which
+targets one or more of these issues. Specifically, object placement aims to
+find a reasonable scale, location, and shape for the foreground. Image blending
+aims to address the unnatural boundary between foreground and background. Image
+harmonization aims to adjust the illumination statistics of the foreground.
+Shadow generation aims to generate plausible shadows for the foreground. These
+sub-tasks can be executed sequentially or in parallel to acquire realistic
+composite images. To the best of our knowledge, there is no previous survey on
+image composition. In this paper, we conduct a comprehensive survey of the
+sub-tasks and the combined task of image composition. For each one, we
+summarize the existing methods, available datasets, and common evaluation
+metrics. Datasets and codes for image composition are summarized at
+https://github.com/bcmi/Awesome-Image-Composition. We have also contributed the
+first image composition toolbox: libcom https://github.com/bcmi/libcom, which
+assembles 10+ image composition related functions (e.g., image blending, image
+harmonization, object placement, shadow generation, generative composition).
+The ultimate goal of this toolbox is solving all the problems related to image
+composition with a simple `import libcom'.
+
+
+
+
+
+
+ + ♻ ☆ A Concise but High-performing Network for Image Guided Depth Completion + in Autonomous Driving + + +
+ Depth completion is a crucial task in autonomous driving, aiming to convert a
+sparse depth map into a dense depth prediction. Due to their potentially rich
+semantic information, RGB images are commonly fused to enhance the completion
+effect. Image-guided depth completion involves three key challenges: 1) how to
+effectively fuse the two modalities; 2) how to better recover depth
+information; and 3) how to achieve real-time prediction for practical
+autonomous driving. To solve the above problems, we propose a concise but
+effective network, named CENet, to achieve high-performance depth completion
+with a simple and elegant structure. Firstly, we use a fast guidance module to
+fuse the two sensor features, utilizing abundant auxiliary features extracted
+from the color space. Unlike other commonly used complicated guidance modules,
+our approach is intuitive and low-cost. In addition, we find and analyze the
+optimization inconsistency problem for observed and unobserved positions, and a
+decoupled depth prediction head is proposed to alleviate the issue. The
+proposed decoupled head can better output the depth of valid and invalid
+positions with very little extra inference time. Based on the simple structure
+of dual-encoder and single-decoder, our CENet achieves a superior balance
+between accuracy and efficiency. On the KITTI depth completion benchmark, our
+CENet attains competitive performance and inference speed compared with the
+state-of-the-art methods. To validate the generalization of our method, we also
+evaluate it on the indoor NYUv2 dataset, where CENet still achieves impressive
+results. The code of this work will be available at
+https://github.com/lmomoy/CHNet.
+
+
+
+
+
+
+ + ♻ ☆ HanDiffuser: Text-to-Image Generation With Realistic Hand Appearances + + +
+ Text-to-image generative models can generate high-quality humans, but realism +is lost when generating hands. Common artifacts include irregular hand poses, +shapes, incorrect numbers of fingers, and physically implausible finger +orientations. To generate images with realistic hands, we propose a novel +diffusion-based architecture called HanDiffuser that achieves realism by +injecting hand embeddings in the generative process. HanDiffuser consists of +two components: a Text-to-Hand-Params diffusion model to generate SMPL-Body and +MANO-Hand parameters from input text prompts, and a Text-Guided +Hand-Params-to-Image diffusion model to synthesize images by conditioning on +the prompts and hand parameters generated by the previous component. We +incorporate multiple aspects of hand representation, including 3D shapes and +joint-level finger positions, orientations and articulations, for robust +learning and reliable performance during inference. We conduct extensive +quantitative and qualitative experiments and perform user studies to +demonstrate the efficacy of our method in generating images with high-quality +hands. + +
+
+ comment: Revisions: 1. Added a link to project page in the abstract, 2. + Updated references and related work, 3. Fixed some grammatical errors +
+
+
+
+
+ + ♻ ☆ Choosing Wisely and Learning Deeply: Selective Cross-Modality + Distillation via CLIP for Domain Generalization + + +
+ Domain Generalization (DG), a crucial research area, seeks to train models +across multiple domains and test them on unseen ones. In this paper, we +introduce a novel approach, namely, Selective Cross-Modality Distillation for +Domain Generalization (SCMD). SCMD leverages the capabilities of large +vision-language models, specifically CLIP, to train a more efficient model, +ensuring it acquires robust generalization capabilities across unseen domains. +Our primary contribution is a unique selection framework strategically designed +to identify hard-to-learn samples for distillation. In parallel, we introduce a +novel cross-modality module that seamlessly combines the projected features of +the student model with the text embeddings from CLIP, ensuring the alignment of +similarity distributions. We assess SCMD's performance on various benchmarks, +where it empowers a ResNet50 to deliver state-of-the-art performance, +surpassing existing domain generalization methods. Furthermore, we provide a +theoretical analysis of our selection strategy, offering deeper insight into +its effectiveness and potential in the field of DG. + +
+
+
+
+
+ + ♻ ☆ GhostNetV3: Exploring the Training Strategies for Compact Models + + +
+ Compact neural networks are specially designed for applications on edge
+devices with faster inference speed yet modest performance. However, training
+strategies of compact models are currently borrowed from those of conventional
+models, which ignores their difference in model capacity and thus may impede
+the performance of compact models. In this paper, by systematically
+investigating the impact of different training ingredients, we introduce a
+strong training strategy for compact models. We find that appropriate designs
+of re-parameterization and knowledge distillation are crucial for training
+high-performance compact models, while some data augmentations commonly used
+for training conventional models, such as Mixup and CutMix, lead to worse
+performance. Our experiments on the ImageNet-1K dataset demonstrate that our
+specialized training strategy for compact models is applicable to various
+architectures, including GhostNetV2, MobileNetV2 and ShuffleNetV2.
+Specifically, equipped with our strategy, GhostNetV3 1.3$\times$ achieves a
+top-1 accuracy of 79.1% with only 269M FLOPs and a latency of 14.46ms on mobile
+devices, surpassing its ordinarily trained counterpart by a large margin.
+Moreover, our observations also extend to object detection scenarios. PyTorch
+code and checkpoints can be found at
+https://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/ghostnetv3_pytorch.
+
+
+
+
+
+
+ + ♻ ☆ Bayesian Diffusion Models for 3D Shape Reconstruction CVPR 2024 + + +
+ We present Bayesian Diffusion Models (BDM), a prediction algorithm that +performs effective Bayesian inference by tightly coupling the top-down (prior) +information with the bottom-up (data-driven) procedure via joint diffusion +processes. We show the effectiveness of BDM on the 3D shape reconstruction +task. Compared to prototypical deep learning data-driven approaches trained on +paired (supervised) data-labels (e.g. image-point clouds) datasets, our BDM +brings in rich prior information from standalone labels (e.g. point clouds) to +improve the bottom-up 3D reconstruction. As opposed to the standard Bayesian +frameworks where explicit prior and likelihood are required for the inference, +BDM performs seamless information fusion via coupled diffusion processes with +learned gradient computation networks. The specialty of our BDM lies in its +capability to engage the active and effective information exchange and fusion +of the top-down and bottom-up processes where each itself is a diffusion +process. We demonstrate state-of-the-art results on both synthetic and +real-world benchmarks for 3D shape reconstruction. + +
+
+ comment: Accepted to CVPR 2024; Project Page: https://mlpc-ucsd.github.io/BDM/ +
+
+
+
+
+ + ♻ ☆ IterInv: Iterative Inversion for Pixel-Level T2I Models ICME 2024 + + +
+ Large-scale text-to-image diffusion models have been a ground-breaking
+development in generating convincing images following an input text prompt. The
+goal of image editing research is to give users control over the generated
+images by modifying the text prompt. Current image editing techniques
+predominantly hinge on DDIM inversion as a prevalent practice rooted in Latent
+Diffusion Models (LDM). However, large pretrained T2I models that work in the
+latent space lose details due to the first compression stage with an
+autoencoder mechanism. In contrast, other mainstream T2I pipelines that work at
+the pixel level, such as Imagen and DeepFloyd-IF, circumvent the above problem.
+They are commonly composed of multiple stages, typically starting with a
+text-to-image stage followed by several super-resolution stages. In this
+pipeline, DDIM inversion fails to find the initial noise and generate the
+original image, given that the super-resolution diffusion models are not
+compatible with the DDIM technique. According to our experimental findings,
+iteratively concatenating the noisy image as the condition is the root of this
+problem. Based on this observation, we develop an iterative inversion (IterInv)
+technique for this category of T2I models and verify IterInv with the
+open-source DeepFloyd-IF model. Specifically, IterInv employs NTI for the
+inversion and reconstruction of the low-resolution image generation stage. In
+stages 2 and 3, we update the latent variance at each timestep to find the
+deterministic inversion trace and promote the reconstruction process. By
+combining our method with a popular image editing method, we demonstrate the
+application prospects of IterInv. The code is available at
+\url{https://github.com/Tchuanm/IterInv.git}.
+
+
+
+ comment: Accepted paper at ICME 2024 +
+
+
+
+
+ + ♻ ☆ DermSynth3D: Synthesis of in-the-wild Annotated Dermatology Images + + +
+ In recent years, deep learning (DL) has shown great potential in the field of +dermatological image analysis. However, existing datasets in this domain have +significant limitations, including a small number of image samples, limited +disease conditions, insufficient annotations, and non-standardized image +acquisitions. To address these shortcomings, we propose a novel framework +called DermSynth3D. DermSynth3D blends skin disease patterns onto 3D textured +meshes of human subjects using a differentiable renderer and generates 2D +images from various camera viewpoints under chosen lighting conditions in +diverse background scenes. Our method adheres to top-down rules that constrain +the blending and rendering process to create 2D images with skin conditions +that mimic in-the-wild acquisitions, ensuring more meaningful results. The +framework generates photo-realistic 2D dermoscopy images and the corresponding +dense annotations for semantic segmentation of the skin, skin conditions, body +parts, bounding boxes around lesions, depth maps, and other 3D scene +parameters, such as camera position and lighting conditions. DermSynth3D allows +for the creation of custom datasets for various dermatology tasks. We +demonstrate the effectiveness of data generated using DermSynth3D by training +DL models on synthetic data and evaluating them on various dermatology tasks +using real 2D dermatological images. We make our code publicly available at +https://github.com/sfu-mial/DermSynth3D. + +
+
+ comment: Accepted to Medical Image Analysis (MedIA) 2024 +
+
+
+
+
+ + ♻ ☆ End-to-end Autonomous Driving: Challenges and Frontiers + + +
+ The autonomous driving community has witnessed a rapid growth in approaches
+that embrace an end-to-end algorithm framework, utilizing raw sensor input to
+generate vehicle motion plans, instead of concentrating on individual tasks
+such as detection and motion prediction. End-to-end systems, in comparison to
+modular pipelines, benefit from joint feature optimization for perception and
+planning. This field has flourished due to the availability of large-scale
+datasets, closed-loop evaluation, and the increasing need for autonomous
+driving algorithms to perform effectively in challenging scenarios. In this
+survey, we provide a comprehensive analysis of more than 270 papers, covering
+the motivation, roadmap, methodology, challenges, and future trends in
+end-to-end autonomous driving. We delve into several critical challenges,
+including multi-modality, interpretability, causal confusion, robustness, and
+world models, amongst others. Additionally, we discuss current advancements in
+foundation models and visual pre-training, as well as how to incorporate these
+techniques within the end-to-end driving framework. We maintain an active
+repository that contains up-to-date literature and open-source projects at
+https://github.com/OpenDriveLab/End-to-end-Autonomous-Driving.
+
+
+
+
+
+
+ + ♻ ☆ Deep Feature Statistics Mapping for Generalized Screen Content Image + Quality Assessment + + +
+ The statistical regularities of natural images, referred to as natural scene +statistics, play an important role in no-reference image quality assessment. +However, it has been widely acknowledged that screen content images (SCIs), +which are typically computer generated, do not hold such statistics. Here we +make the first attempt to learn the statistics of SCIs, based upon which the +quality of SCIs can be effectively determined. The underlying mechanism of the +proposed approach is based upon the mild assumption that the SCIs, which are +not physically acquired, still obey certain statistics that could be understood +in a learning fashion. We empirically show that the statistics deviation could +be effectively leveraged in quality assessment, and the proposed method is +superior when evaluated in different settings. Extensive experimental results +demonstrate the Deep Feature Statistics based SCI Quality Assessment (DFSS-IQA) +model delivers promising performance compared with existing NR-IQA models and +shows a high generalization capability in the cross-dataset settings. The +implementation of our method is publicly available at +https://github.com/Baoliang93/DFSS-IQA. + +
+
+
+
+
+ + ♻ ☆ Generative Modelling with High-Order Langevin Dynamics + + +
+ Diffusion generative modelling (DGM) based on stochastic differential
+equations (SDEs) with score matching has achieved unprecedented results in data
+generation. In this paper, we propose a novel fast high-quality generative
+modelling method based on high-order Langevin dynamics (HOLD) with score
+matching. We instantiate this idea with third-order Langevin dynamics. By
+augmenting the previous SDEs, e.g., variance-exploding or variance-preserving
+SDEs that model only a single data variable, HOLD can simultaneously model
+position, velocity, and acceleration, thereby improving the quality and speed
+of data generation at the same time. HOLD is composed of one Ornstein-Uhlenbeck
+process and two Hamiltonians, which reduce the mixing time by two orders of
+magnitude. Empirical experiments on unconditional image generation with the
+public CIFAR-10 and CelebA-HQ datasets show significant gains in both Fréchet
+inception distance (FID) and negative log-likelihood, including a
+state-of-the-art FID of 1.85 on CIFAR-10.
+
+
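+ For intuition, one common way to write a third-order
+(position-velocity-acceleration) Langevin system is shown below, with noise
+entering only the highest-order variable; the drift and diffusion coefficients
+and the potential $U$ are placeholders and need not match HOLD's exact
+parameterization.
+
+```latex
+% Illustrative third-order Langevin system (coefficients are placeholders).
+\begin{aligned}
+\mathrm{d}x_t &= v_t \,\mathrm{d}t, \\
+\mathrm{d}v_t &= a_t \,\mathrm{d}t, \\
+\mathrm{d}a_t &= -\bigl(\gamma a_t + \xi v_t + \nabla_x U(x_t)\bigr)\,\mathrm{d}t
+                 + \sqrt{2\gamma}\,\mathrm{d}B_t .
+\end{aligned}
+```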
+
+ comment: Some of the results in this paper have been published or accepted at + conferences such as wacv2024, icassp2024, and icme2024 +
+
+
+
+
+ + ♻ ☆ GestaltMML: Enhancing Rare Genetic Disease Diagnosis through Multimodal + Machine Learning Combining Facial Images and Clinical Texts + + +
+ Individuals with suspected rare genetic disorders often undergo multiple +clinical evaluations, imaging studies, laboratory tests and genetic tests, to +find a possible answer over a prolonged period of time. Addressing this +"diagnostic odyssey" thus has substantial clinical, psychosocial, and economic +benefits. Many rare genetic diseases have distinctive facial features, which +can be used by artificial intelligence algorithms to facilitate clinical +diagnosis, in prioritizing candidate diseases to be further examined by lab +tests or genetic assays, or in helping the phenotype-driven reinterpretation of +genome/exome sequencing data. Existing methods using frontal facial photos were +built on conventional Convolutional Neural Networks (CNNs), rely exclusively on +facial images, and cannot capture non-facial phenotypic traits and demographic +information essential for guiding accurate diagnoses. Here we introduce +GestaltMML, a multimodal machine learning (MML) approach solely based on the +Transformer architecture. It integrates facial images, demographic information +(age, sex, ethnicity), and clinical notes (optionally, a list of Human +Phenotype Ontology terms) to improve prediction accuracy. Furthermore, we also +evaluated GestaltMML on a diverse range of datasets, including 528 diseases +from the GestaltMatcher Database, several in-house datasets of +Beckwith-Wiedemann syndrome (BWS, over-growth syndrome with distinct facial +features), Sotos syndrome (overgrowth syndrome with overlapping features with +BWS), NAA10-related neurodevelopmental syndrome, Cornelia de Lange syndrome +(multiple malformation syndrome), and KBG syndrome (multiple malformation +syndrome). Our results suggest that GestaltMML effectively incorporates +multiple modalities of data, greatly narrowing candidate genetic diagnoses of +rare diseases and may facilitate the reinterpretation of genome/exome +sequencing data. + +
+
+ comment: Significant revisions +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 72 + +
+
+
+ + ☆ Enforcing Conditional Independence for Fair Representation Learning and + Causal Image Generation CVPR + + +
+ Conditional independence (CI) constraints are critical for defining and +evaluating fairness in machine learning, as well as for learning unconfounded +or causal representations. Traditional methods for ensuring fairness either +blindly learn invariant features with respect to a protected variable (e.g., +race when classifying sex from face images) or enforce CI relative to the +protected attribute only on the model output (e.g., the sex label). Neither of +these methods are effective in enforcing CI in high-dimensional feature spaces. +In this paper, we focus on a nascent approach characterizing the CI constraint +in terms of two Jensen-Shannon divergence terms, and we extend it to +high-dimensional feature spaces using a novel dynamic sampling strategy. In +doing so, we introduce a new training paradigm that can be applied to any +encoder architecture. We are able to enforce conditional independence of the +diffusion autoencoder latent representation with respect to any protected +attribute under the equalized odds constraint and show that this approach +enables causal image generation with controllable latent spaces. Our +experimental results demonstrate that our approach can achieve high accuracy on +downstream tasks while upholding equality of odds. + +
+
+ comment: To appear at the 2024 IEEE CVPR Workshop on Fair, Data-Efficient, and + Trusted Computer Vision +
+
+
+
+
+ + ☆ Universal Fingerprint Generation: Controllable Diffusion Model with + Multimodal Conditions + + +
+ The utilization of synthetic data for fingerprint recognition has garnered
+increased attention due to its potential to alleviate privacy concerns
+surrounding sensitive biometric data. However, current methods for generating
+fingerprints have limitations in creating impressions of the same finger with
+useful intra-class variations. To tackle this challenge, we present GenPrint, a
+framework to produce fingerprint images of various types while maintaining
+identity and offering human-understandable control over different appearance
+factors such as fingerprint class, acquisition type, sensor device, and quality
+level. Unlike previous fingerprint generation approaches, GenPrint is not
+confined to replicating style characteristics from the training dataset alone:
+it enables the generation of novel styles from unseen devices without requiring
+additional fine-tuning. To accomplish these objectives, we developed GenPrint
+using latent diffusion models with multimodal conditions (text and image) for
+consistent generation of style and identity. Our experiments leverage a variety
+of publicly available datasets for training and evaluation. Results demonstrate
+the benefits of GenPrint in terms of identity preservation, explainable
+control, and universality of generated images. Importantly, models trained on
+GenPrint-generated images achieve accuracy comparable or even superior to
+models trained solely on real data, and the generated images further enhance
+performance when used to augment the diversity of existing real fingerprint
+datasets.
+
+
+
+
+
+
+ + ☆ AnyPattern: Towards In-context Image Copy Detection + + +
+ This paper explores in-context learning for image copy detection (ICD), i.e., +prompting an ICD model to identify replicated images with new tampering +patterns without the need for additional training. The prompts (or the +contexts) are from a small set of image-replica pairs that reflect the new +patterns and are used at inference time. Such in-context ICD has good realistic +value, because it requires no fine-tuning and thus facilitates fast reaction +against the emergence of unseen patterns. To accommodate the "seen +$\rightarrow$ unseen" generalization scenario, we construct the first +large-scale pattern dataset named AnyPattern, which has the largest number of +tamper patterns ($90$ for training and $10$ for testing) among all the existing +ones. We benchmark AnyPattern with popular ICD methods and reveal that existing +methods barely generalize to novel tamper patterns. We further propose a simple +in-context ICD method named ImageStacker. ImageStacker learns to select the +most representative image-replica pairs and employs them as the pattern prompts +in a stacking manner (rather than the popular concatenation manner). +Experimental results show (1) training with our large-scale dataset +substantially benefits pattern generalization ($+26.66 \%$ $\mu AP$), (2) the +proposed ImageStacker facilitates effective in-context ICD (another round of +$+16.75 \%$ $\mu AP$), and (3) AnyPattern enables in-context ICD, i.e. without +such a large-scale dataset, in-context learning does not emerge even with our +ImageStacker. The project (including the proposed dataset AnyPattern and the +code for ImageStacker) is publicly available at https://anypattern.github.io +under the MIT Licence. + +
+
+
+
+
+ + ☆ Iteratively Prompting Multimodal LLMs to Reproduce Natural and + AI-Generated Images + + +
+ With the digital imagery landscape rapidly evolving, image stocks and +AI-generated image marketplaces have become central to visual media. +Traditional stock images now exist alongside innovative platforms that trade in +prompts for AI-generated visuals, driven by sophisticated APIs like DALL-E 3 +and Midjourney. This paper studies the possibility of employing multi-modal +models with enhanced visual understanding to mimic the outputs of these +platforms, introducing an original attack strategy. Our method leverages +fine-tuned CLIP models, a multi-label classifier, and the descriptive +capabilities of GPT-4V to create prompts that generate images similar to those +available in marketplaces and from premium stock image providers, yet at a +markedly lower expense. In presenting this strategy, we aim to spotlight a new +class of economic and security considerations within the realm of digital +imagery. Our findings, supported by both automated metrics and human +assessment, reveal that comparable visual content can be produced for a +fraction of the prevailing market prices ($0.23 - $0.27 per image), emphasizing +the need for awareness and strategic discussions about the integrity of digital +media in an increasingly AI-integrated landscape. Our work also contributes to +the field by assembling a dataset consisting of approximately 19 million +prompt-image pairs generated by the popular Midjourney platform, which we plan +to release publicly. + +
+
+
+
+
+ + ☆ EncodeNet: A Framework for Boosting DNN Accuracy with Entropy-driven + Generalized Converting Autoencoder + + +
+ Image classification is a fundamental task in computer vision, and the quest
+to enhance DNN accuracy without inflating model size or latency remains a
+pressing concern. We make a couple of advances in this regard, leading to a
+novel EncodeNet design and training framework. The first advancement involves
+Converting Autoencoders, a novel approach that transforms an image into an
+easy-to-classify image of its class. Our prior work that applied the Converting
+Autoencoder and a simple classifier in tandem achieved moderate accuracy over
+simple datasets, such as MNIST and FMNIST. However, on more complex datasets
+like CIFAR-10, the Converting Autoencoder has a large reconstruction loss,
+making it unsuitable for enhancing DNN accuracy. To address these limitations,
+we generalize the design of Converting Autoencoders by leveraging a larger
+class of DNNs, those with architectures comprising feature extraction layers
+followed by classification layers. We incorporate a generalized algorithmic
+design of the Converting Autoencoder and intraclass clustering to identify
+representative images, leading to optimized image feature learning. Next, we
+demonstrate the effectiveness of our EncodeNet design and training framework,
+improving the accuracy of well-trained baseline DNNs while maintaining the
+overall model size. EncodeNet's building blocks comprise the trained encoder
+from our generalized Converting Autoencoders transferring knowledge to a
+lightweight classifier network, also extracted from the baseline DNN. Our
+experimental results demonstrate that EncodeNet improves the accuracy of VGG16
+from 92.64% to 94.05% on CIFAR-10 and ResNet20 from 74.56% to 76.04% on
+CIFAR-100. It outperforms state-of-the-art techniques that rely on knowledge
+distillation and attention mechanisms, delivering higher accuracy for models of
+comparable size.
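+ A minimal sketch of the general pattern described here (a trained encoder
+transferring knowledge to a lightweight classifier head), assuming a frozen
+encoder and a CIFAR-sized input; the layer sizes are invented and this is not
+the EncodeNet architecture.
+
+import torch
+import torch.nn as nn
+
+# Hypothetical encoder: the feature-extraction half of a trained converting
+# autoencoder (in practice its weights would be loaded, not random).
+encoder = nn.Sequential(
+    nn.Conv2d(3, 32, 3, stride=2, padding=1), nn.ReLU(),
+    nn.Conv2d(32, 64, 3, stride=2, padding=1), nn.ReLU(),
+    nn.AdaptiveAvgPool2d(1), nn.Flatten(),
+)
+for p in encoder.parameters():
+    p.requires_grad = False          # keep the transferred encoder fixed
+
+classifier = nn.Linear(64, 10)       # lightweight classifier head
+
+x = torch.randn(4, 3, 32, 32)        # a toy CIFAR-10-sized batch
+logits = classifier(encoder(x))      # only the head would be trained
+print(logits.shape)                  # torch.Size([4, 10])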
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Autonomous Robot for Disaster Mapping and Victim Localization + + +
+ In response to the critical need for effective reconnaissance in disaster +scenarios, this research article presents the design and implementation of a +complete autonomous robot system using the Turtlebot3 with Robotic Operating +System (ROS) Noetic. Upon deployment in closed, initially unknown environments, +the system aims to generate a comprehensive map and identify any present +'victims' using AprilTags as stand-ins. We discuss our solution for search and +rescue missions, while additionally exploring more advanced algorithms to +improve search and rescue functionalities. We introduce a Cubature Kalman +Filter to help reduce the mean squared error [m] for AprilTag localization and +an information-theoretic exploration algorithm to expedite exploration in +unknown environments. Just like turtles, our system takes it slow and steady, +but when it's time to save the day, it moves at ninja-like speed! Despite +Donatello's shell, he's no slowpoke - he zips through obstacles with the +agility of a teenage mutant ninja turtle. So, hang on tight to your shells and +get ready for a whirlwind of reconnaissance! + Full pipeline code https://github.com/rzhao5659/MRProject/tree/main + Exploration code https://github.com/rzhao5659/MRProject/tree/main + +
+
+ comment: Class final project for Northeastern University EECE 5550 Mobile + Robotics Course +
+
+
+
+
+ + ☆ Object-Attribute Binding in Text-to-Image Generation: Evaluation and + Control + + +
+ Current diffusion models create photorealistic images given a text prompt as
+input but struggle to correctly bind attributes mentioned in the text to the
+right objects in the image. This is evidenced by our novel image-graph
+alignment model called EPViT (Edge Prediction Vision Transformer) for the
+evaluation of image-text alignment. To alleviate the above problem, we propose
+focused cross-attention (FCA) that controls the visual attention maps by
+syntactic constraints found in the input sentence. Additionally, the syntax
+structure of the prompt helps to disentangle the multimodal CLIP embeddings
+that are commonly used in T2I generation. The resulting DisCLIP embeddings and
+FCA are easily integrated in state-of-the-art diffusion models without
+additional training of these models. We show substantial improvements in T2I
+generation and especially its attribute-object binding on several
+datasets.\footnote{Code and data will be made available upon acceptance.}
+
+
+
+
+ + ☆ BC-MRI-SEG: A Breast Cancer MRI Tumor Segmentation Benchmark + + +
+ Binary breast cancer tumor segmentation with Magnetic Resonance Imaging (MRI) +data is typically trained and evaluated on private medical data, which makes +comparing deep learning approaches difficult. We propose a benchmark +(BC-MRI-SEG) for binary breast cancer tumor segmentation based on publicly +available MRI datasets. The benchmark consists of four datasets in total, where +two datasets are used for supervised training and evaluation, and two are used +for zero-shot evaluation. Additionally we compare state-of-the-art (SOTA) +approaches on our benchmark and provide an exhaustive list of available public +breast cancer MRI datasets. The source code has been made available at +https://irulenot.github.io/BC_MRI_SEG_Benchmark. + +
+
+
+
+
+ + ☆ A Nasal Cytology Dataset for Object Detection and Deep Learning + + +
+ Nasal Cytology is a new and efficient clinical technique to diagnose rhinitis
+and allergies that is not yet widespread due to the time-consuming nature of
+cell counting; that is why AI-aided counting could be a turning point for the
+wider adoption of this technique. In this article we present the first dataset
+of rhino-cytological field images: the NCD (Nasal Cytology Dataset), aimed at
+training and deploying Object Detection models to support physicians and
+biologists during clinical practice. The real distribution of the cytotypes
+populating the nasal mucosa has been replicated by sampling images from slides
+of clinical patients and manually annotating each cell found on them. The
+corresponding object detection task presents non-trivial issues associated with
+the strong class imbalance involving the rarest cell types. This work
+contributes to some of these open challenges by presenting a novel machine
+learning-based approach to aid the automated detection and classification of
+nasal mucosa cells: the DETR and YOLO models show good performance in detecting
+cells and classifying them correctly, revealing great potential to accelerate
+the work of rhinology experts.
+
+ comment: Pre Print almost ready to be submitted +
+
+
+
+
+ + ☆ Interpreting COVID Lateral Flow Tests' Results with Foundation Models + + +
+ Lateral flow tests (LFTs) enable rapid, low-cost testing for health
+conditions including Covid, pregnancy, HIV, and malaria. Automated readers of
+LFT results can yield many benefits including empowering blind people to
+independently learn about their health and accelerating data entry for
+large-scale monitoring (e.g., for pandemics such as Covid) by using only a
+single photograph per LFT test. Accordingly, we explore the abilities of modern
+foundation vision language models (VLMs) in interpreting such tests. To enable
+this analysis, we first create a new labeled dataset with hierarchical
+segmentations of each LFT test and its nested test result window. We call this
+dataset LFT-Grounding. Next, we benchmark eight modern VLMs in zero-shot
+settings for analyzing these images. We demonstrate that current VLMs
+frequently fail to correctly identify the type of LFT test, interpret the test
+results, locate the nested result window of the LFT tests, and recognize LFT
+tests when they are partially obfuscated. To facilitate community-wide progress
+towards automated LFT reading, we publicly release our dataset at
+https://iamstuti.github.io/lft_grounding_foundation_models/.
+
+
+
+
+ + ☆ Elucidating the Design Space of Dataset Condensation + + +
+ Dataset condensation, a concept within data-centric learning, efficiently +transfers critical attributes from an original dataset to a synthetic version, +maintaining both diversity and realism. This approach significantly improves +model training efficiency and is adaptable across multiple application areas. +Previous methods in dataset condensation have faced challenges: some incur high +computational costs which limit scalability to larger datasets (e.g., MTT, +DREAM, and TESLA), while others are restricted to less optimal design spaces, +which could hinder potential improvements, especially in smaller datasets +(e.g., SRe2L, G-VBSM, and RDED). To address these limitations, we propose a +comprehensive design framework that includes specific, effective strategies +like implementing soft category-aware matching and adjusting the learning rate +schedule. These strategies are grounded in empirical evidence and theoretical +backing. Our resulting approach, Elucidate Dataset Condensation (EDC), +establishes a benchmark for both small and large-scale dataset condensation. In +our testing, EDC achieves state-of-the-art accuracy, reaching 48.6% on +ImageNet-1k with a ResNet-18 model at an IPC of 10, which corresponds to a +compression ratio of 0.78%. This performance exceeds those of SRe2L, G-VBSM, +and RDED by margins of 27.3%, 17.2%, and 6.6%, respectively. + +
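+ A quick back-of-the-envelope check of the reported compression ratio,
+assuming the standard ImageNet-1k figures of 1,281,167 training images and
+1,000 classes (these counts are general knowledge, not taken from this
+abstract):
+
+ipc = 10                    # images per class in the condensed set
+classes = 1000              # ImageNet-1k classes
+train_images = 1_281_167    # standard ImageNet-1k training-set size
+
+condensed = ipc * classes                  # 10,000 synthetic images
+ratio = condensed / train_images           # ~0.0078
+print(f"compression ratio ~ {ratio:.2%}")  # -> ~0.78%, matching the abstract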
+
+
+
+
+ + ☆ ArtNeRF: A Stylized Neural Field for 3D-Aware Cartoonized Face Synthesis + + +
+ Recent advances in generative visual models and neural radiance fields have
+greatly boosted 3D-aware image synthesis and stylization tasks. However,
+previous NeRF-based work is limited to single-scene stylization; training a
+model to generate 3D-aware cartoon faces with arbitrary styles remains
+unsolved. We propose ArtNeRF, a novel face stylization framework derived from
+3D-aware GAN to tackle this problem. In this framework, we utilize an
+expressive generator to synthesize stylized faces and a triple-branch
+discriminator module to improve the visual quality and style consistency of the
+generated faces. Specifically, a style encoder based on contrastive learning is
+leveraged to extract robust low-dimensional embeddings of style images,
+empowering the generator with the knowledge of various styles. To smooth the
+training process of cross-domain transfer learning, we propose an adaptive
+style blending module which helps inject style information and allows users to
+freely tune the level of stylization. We further introduce a neural rendering
+module to achieve efficient real-time rendering of images with higher
+resolutions. Extensive experiments demonstrate that ArtNeRF is versatile in
+generating high-quality 3D-aware cartoon faces with arbitrary styles.
+
+
+
+
+ + ☆ SVGEditBench: A Benchmark Dataset for Quantitative Assessment of LLM's + SVG Editing Capabilities CVPR2024 + + +
+ Text-to-image models have shown progress in recent years. Along with this +progress, generating vector graphics from text has also advanced. SVG is a +popular format for vector graphics, and SVG represents a scene with XML text. +Therefore, Large Language Models can directly process SVG code. Taking this +into account, we focused on editing SVG with LLMs. For quantitative evaluation +of LLMs' ability to edit SVG, we propose SVGEditBench. SVGEditBench is a +benchmark for assessing the LLMs' ability to edit SVG code. We also show the +GPT-4 and GPT-3.5 results when evaluated on the proposed benchmark. In the +experiments, GPT-4 showed superior performance to GPT-3.5 both quantitatively +and qualitatively. The dataset is available at +https://github.com/mti-lab/SVGEditBench. + +
+
+ comment: Accepted to Workshop on Graphic Design Understanding and Generation + (GDUG), a CVPR2024 workshop. Dataset: https://github.com/mti-lab/SVGEditBench +
+
+
+
+
+ + ☆ Concept Arithmetics for Circumventing Concept Inhibition in Diffusion + Models + + +
+ Motivated by ethical and legal concerns, the scientific community is actively
+developing methods to limit the misuse of Text-to-Image diffusion models for
+reproducing copyrighted, violent, explicit, or personal information in the
+generated images. Simultaneously, researchers put these newly developed safety
+measures to the test by assuming the role of an adversary to find
+vulnerabilities and backdoors in them. We use the compositional property of
+diffusion models, which allows leveraging multiple prompts in a single image
+generation. This property allows us to combine other concepts that should not
+have been affected by the inhibition to reconstruct the vector responsible for
+target concept generation, even though the direct computation of this vector is
+no longer accessible. We provide theoretical and empirical evidence of why the
+proposed attacks are possible and discuss the implications of these findings
+for safe model deployment. We argue that it is essential to consider all
+possible approaches to image generation with diffusion models that can be
+employed by an adversary. Our work opens up the discussion about the
+implications of concept arithmetics and compositional inference for safety
+mechanisms in diffusion models.
+ Content Advisory: This paper contains discussions and model-generated content
+that may be considered offensive. Reader discretion is advised.
+ Project page: https://cs-people.bu.edu/vpetsiuk/arc
+
+
+
+
+ + ☆ PEMMA: Parameter-Efficient Multi-Modal Adaptation for Medical Image + Segmentation + + +
+ Imaging modalities such as Computed Tomography (CT) and Positron Emission
+Tomography (PET) are key in cancer detection, inspiring Deep Neural Network
+(DNN) models that merge these scans for tumor segmentation. When both CT and
+PET scans are available, it is common to combine them as two channels of the
+input to the segmentation model. However, this method requires both scan types
+during training and inference, posing a challenge due to the limited
+availability of PET scans, thereby sometimes limiting the process to CT scans
+only. Hence, there is a need to develop a flexible DNN architecture that can be
+trained/updated using only CT scans but can effectively utilize PET scans when
+they become available. In this work, we propose a parameter-efficient
+multi-modal adaptation (PEMMA) framework for lightweight upgrading of a
+transformer-based segmentation model trained only on CT scans to also
+incorporate PET scans. The benefits of the proposed approach are two-fold.
+Firstly, we leverage the inherent modularity of the transformer architecture
+and perform low-rank adaptation (LoRA) of the attention weights to achieve
+parameter-efficient adaptation. Secondly, since the PEMMA framework attempts to
+minimize cross-modal entanglement, it is possible to subsequently update the
+combined model using only one modality, without causing catastrophic forgetting
+of the other modality. Our proposed method matches the performance of early
+fusion techniques with just 8% of the trainable parameters, especially with a
+remarkable +28% improvement on the average dice score on PET scans when trained
+on a single modality.
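+ The low-rank adaptation idea mentioned above can be pictured with a generic
+LoRA wrapper around a linear projection (e.g., an attention projection). This
+is the textbook LoRA recipe, not the PEMMA implementation; the rank, scaling,
+and dimensions are arbitrary assumptions.
+
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    """Frozen base projection plus a trainable low-rank update W + B @ A."""
+    def __init__(self, base: nn.Linear, rank: int = 8, alpha: float = 16.0):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():
+            p.requires_grad = False   # original (e.g., CT-trained) weights stay fixed
+        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
+        self.B = nn.Parameter(torch.zeros(base.out_features, rank))
+        self.scale = alpha / rank
+
+    def forward(self, x):
+        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)
+
+# Wrap a hypothetical attention projection so only the low-rank factors train.
+proj = LoRALinear(nn.Linear(256, 256))
+out = proj(torch.randn(2, 16, 256))   # (batch, tokens, dim)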
+
+
+
+
+ + ☆ Semantic-Rearrangement-Based Multi-Level Alignment for Domain + Generalized Segmentation + + +
+ Domain generalized semantic segmentation is an essential computer vision
+task, in which models leverage only source data to learn generalized semantic
+segmentation for unseen target domains. Previous works typically address this
+challenge by global style randomization or feature regularization. In this
+paper, given the observation that different local semantic regions exhibit
+different visual characteristics from the source domain to the target domain,
+we argue that methods focusing on global operations struggle to capture such
+regional discrepancies and thus fail to construct domain-invariant
+representations with consistency from the local to the global level. Therefore,
+we propose the Semantic-Rearrangement-based Multi-Level Alignment (SRMA) to
+overcome this problem. SRMA first incorporates a Semantic Rearrangement Module
+(SRM), which conducts semantic region randomization to enhance the diversity of
+the source domain sufficiently. A Multi-Level Alignment module (MLA) is
+subsequently proposed with the help of such diversity to establish the
+global-regional-local consistent domain-invariant representations. By aligning
+features across randomized samples with domain-neutral knowledge at multiple
+levels, SRMA provides a more robust way to handle the source-target domain gap.
+Extensive experiments demonstrate the superiority of SRMA over the current
+state-of-the-art works on various benchmarks.
+
+
+
+
+ + ☆ PV-S3: Advancing Automatic Photovoltaic Defect Detection using + Semi-Supervised Semantic Segmentation of Electroluminescence Images + + +
+ Photovoltaic (PV) systems allow us to tap into abundant solar energy;
+however, they require regular maintenance for high efficiency and to prevent
+degradation. Traditional manual health checks, using Electroluminescence (EL)
+imaging, are expensive and logistically challenging, making automated defect
+detection essential. Current automation approaches require extensive manual
+expert labeling, which is time-consuming, expensive, and prone to errors. We
+propose PV-S3 (Photovoltaic-Semi Supervised Segmentation), a Semi-Supervised
+Learning approach for semantic segmentation of defects in EL images that
+reduces reliance on extensive labeling. PV-S3 is a deep learning model trained
+using a few labeled images along with numerous unlabeled images. We introduce a
+novel Semi Cross-Entropy loss function to train PV-S3 which addresses the
+challenges specific to automated PV defect detection, such as diverse defect
+types and class imbalance. We evaluate PV-S3 on multiple datasets and
+demonstrate its effectiveness and adaptability. With merely 20% labeled
+samples, we achieve an absolute improvement of 9.7% in IoU, 29.9% in Precision,
+12.75% in Recall, and 20.42% in F1-Score over the prior state-of-the-art
+supervised method (which uses 100% labeled samples) on the UCF-EL dataset (the
+largest dataset available for semantic segmentation of EL images), showing
+improvement in performance while reducing the annotation costs by 80%.
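+ For intuition only, a generic semi-supervised loss that combines a supervised
+cross-entropy term with a confidence-thresholded pseudo-label term is sketched
+below. It illustrates the general recipe such methods build on; it is not the
+paper's Semi Cross-Entropy formulation, and the threshold, weight, and
+classification-style logits (rather than per-pixel segmentation logits) are
+illustrative assumptions.
+
+import torch
+import torch.nn.functional as F
+
+def semi_supervised_loss(logits_l, labels_l, logits_u, threshold=0.9, weight=1.0):
+    # Supervised term on the few labeled images.
+    sup = F.cross_entropy(logits_l, labels_l)
+    # Pseudo-label term: keep only confident predictions on unlabeled images.
+    probs = logits_u.softmax(dim=1)
+    conf, pseudo = probs.max(dim=1)
+    mask = conf >= threshold
+    unsup = (F.cross_entropy(logits_u[mask], pseudo[mask])
+             if mask.any() else logits_u.new_zeros(()))
+    return sup + weight * unsup
+
+# Toy usage with 4 labeled and 8 unlabeled predictions over 3 defect classes.
+loss = semi_supervised_loss(torch.randn(4, 3), torch.randint(0, 3, (4,)),
+                            torch.randn(8, 3))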
+
+
+
+
+ + ☆ A sustainable development perspective on urban-scale roof greening + priorities and benefits + + +
+ Greenspaces are tightly linked to human well-being. Yet, rapid urbanization
+has exacerbated greenspace exposure inequality and declining human life
+quality. Roof greening has been recognized as an effective strategy to mitigate
+these negative impacts. Understanding priorities and benefits is crucial to
+promoting green roofs. Here, using geospatial big data, we conduct an
+urban-scale assessment of roof greening at a single building level in Hong Kong
+from a sustainable development perspective. We identify that 85.3% of
+buildings reveal potential and urgent demand for roof greening. We further find
+green roofs could increase greenspace exposure by ~61% and produce hundreds of
+millions (HK$) in economic benefits annually but play a small role in urban
+heat mitigation (~0.15°C) and annual carbon emission offsets (~0.8%). Our study
+offers a comprehensive assessment of roof greening, which could provide
+reference for sustainable development in cities worldwide, from data
+utilization to solutions and findings.
+
+
+
+
+ + ☆ A Complete System for Automated 3D Semantic-Geometric Mapping of + Corrosion in Industrial Environments + + +
+ Corrosion, a naturally occurring process leading to the deterioration of
+metallic materials, demands diligent detection for quality control and the
+preservation of metal-based objects, especially within industrial contexts.
+Traditional techniques for corrosion identification, including ultrasonic
+testing, radiographic testing, and magnetic flux leakage, necessitate the
+deployment of expensive and bulky equipment on-site for effective data
+acquisition. An unexplored alternative involves employing lightweight,
+conventional camera systems, and state-of-the-art computer vision methods for
+its identification.
+ In this work, we propose a complete system for semi-automated corrosion
+identification and mapping in industrial environments. We leverage recent
+advances in LiDAR-based methods for localization and mapping, with vision-based
+semantic segmentation deep learning techniques, in order to build
+semantic-geometric maps of industrial environments. Unlike previous corrosion
+identification systems available in the literature, our designed multi-modal
+system is low-cost, portable, semi-autonomous and allows collecting large
+datasets by untrained personnel.
+ A set of experiments in an indoor laboratory environment demonstrates
+quantitatively the high accuracy of the employed LiDAR-based 3D mapping and
+localization system, with less than 0.05m and 0.02m average absolute and
+relative pose errors, respectively. Also, our data-driven semantic segmentation
+model achieves around 70% precision when trained with our pixel-wise manually
+annotated dataset.
+
+
+
+
+ + ☆ Hyper-SD: Trajectory Segmented Consistency Model for Efficient Image + Synthesis + + +
+ Recently, a series of diffusion-aware distillation algorithms have emerged to +alleviate the computational overhead associated with the multi-step inference +process of Diffusion Models (DMs). Current distillation techniques often +dichotomize into two distinct aspects: i) ODE Trajectory Preservation; and ii) +ODE Trajectory Reformulation. However, these approaches suffer from severe +performance degradation or domain shifts. To address these limitations, we +propose Hyper-SD, a novel framework that synergistically amalgamates the +advantages of ODE Trajectory Preservation and Reformulation, while maintaining +near-lossless performance during step compression. Firstly, we introduce +Trajectory Segmented Consistency Distillation to progressively perform +consistent distillation within pre-defined time-step segments, which +facilitates the preservation of the original ODE trajectory from a higher-order +perspective. Secondly, we incorporate human feedback learning to boost the +performance of the model in a low-step regime and mitigate the performance loss +incurred by the distillation process. Thirdly, we integrate score distillation +to further improve the low-step generation capability of the model and offer +the first attempt to leverage a unified LoRA to support the inference process +at all steps. Extensive experiments and user studies demonstrate that Hyper-SD +achieves SOTA performance from 1 to 8 inference steps for both SDXL and SD1.5. +For example, Hyper-SDXL surpasses SDXL-Lightning by +0.68 in CLIP Score and ++0.51 in Aes Score in the 1-step inference. + +
+
+
+
+
+ + ☆ PoseAnimate: Zero-shot high fidelity pose controllable character + animation + + +
+ Image-to-video (I2V) generation aims to create a video sequence from a single
+image, which requires high temporal coherence and visual fidelity with the
+source image. However, existing approaches suffer from character appearance
+inconsistency and poor preservation of fine details. Moreover, they require a
+large amount of video data for training, which can be computationally
+demanding. To address these limitations, we propose PoseAnimate, a novel
+zero-shot I2V framework for character animation. PoseAnimate contains three key
+components: 1) the Pose-Aware Control Module (PACM) incorporates diverse pose
+signals into conditional embeddings to preserve character-independent content
+and maintain precise alignment of actions; 2) the Dual Consistency Attention
+Module (DCAM) enhances temporal consistency and retains character identity and
+intricate background details; 3) the Mask-Guided Decoupling Module (MGDM)
+refines distinct feature perception, improving animation fidelity by decoupling
+the character and background. We also propose a Pose Alignment Transition
+Algorithm (PATA) to ensure smooth action transitions. Extensive experiment
+results demonstrate that our approach outperforms the state-of-the-art
+training-based methods in terms of character consistency and detail fidelity.
+Moreover, it maintains a high level of temporal coherence throughout the
+generated animations.
+
+
+
+
+ + ☆ GScream: Learning 3D Geometry and Feature Consistent Gaussian Splatting + for Object Removal + + +
+ This paper tackles the intricate challenge of object removal to update the +radiance field using the 3D Gaussian Splatting. The main challenges of this +task lie in the preservation of geometric consistency and the maintenance of +texture coherence in the presence of the substantial discrete nature of +Gaussian primitives. We introduce a robust framework specifically designed to +overcome these obstacles. The key insight of our approach is the enhancement of +information exchange among visible and invisible areas, facilitating content +restoration in terms of both geometry and texture. Our methodology begins with +optimizing the positioning of Gaussian primitives to improve geometric +consistency across both removed and visible areas, guided by an online +registration process informed by monocular depth estimation. Following this, we +employ a novel feature propagation mechanism to bolster texture coherence, +leveraging a cross-attention design that bridges sampling Gaussians from both +uncertain and certain areas. This innovative approach significantly refines the +texture coherence within the final radiance field. Extensive experiments +validate that our method not only elevates the quality of novel view synthesis +for scenes undergoing object removal but also showcases notable efficiency +gains in training and rendering speeds. + +
+
+ comment: Project Page: https://w-ted.github.io/publications/gscream +
+
+
+
+
+ + ☆ FiLo: Zero-Shot Anomaly Detection by Fine-Grained Description and + High-Quality Localization + + +
+ Zero-shot anomaly detection (ZSAD) methods entail detecting anomalies +directly without access to any known normal or abnormal samples within the +target item categories. Existing approaches typically rely on the robust +generalization capabilities of multimodal pretrained models, computing +similarities between manually crafted textual features representing "normal" or +"abnormal" semantics and image features to detect anomalies and localize +anomalous patches. However, the generic descriptions of "abnormal" often fail +to precisely match diverse types of anomalies across different object +categories. Additionally, computing feature similarities for single patches +struggles to pinpoint specific locations of anomalies with various sizes and +scales. To address these issues, we propose a novel ZSAD method called FiLo, +comprising two components: adaptively learned Fine-Grained Description (FG-Des) +and position-enhanced High-Quality Localization (HQ-Loc). FG-Des introduces +fine-grained anomaly descriptions for each category using Large Language Models +(LLMs) and employs adaptively learned textual templates to enhance the accuracy +and interpretability of anomaly detection. HQ-Loc, utilizing Grounding DINO for +preliminary localization, position-enhanced text prompts, and Multi-scale +Multi-shape Cross-modal Interaction (MMCI) module, facilitates more accurate +localization of anomalies of different sizes and shapes. Experimental results +on datasets like MVTec and VisA demonstrate that FiLo significantly improves +the performance of ZSAD in both detection and localization, achieving +state-of-the-art performance with an image-level AUC of 83.9% and a pixel-level +AUC of 95.9% on the VisA dataset. + +
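+ The basic text-vs-image similarity scoring that this family of zero-shot
+anomaly detection methods builds on can be sketched as follows. The embeddings
+here are random stand-ins rather than real VLM features, and FiLo's
+fine-grained descriptions and localization modules are not reproduced; this is
+a generic illustration under those assumptions.
+
+import torch
+import torch.nn.functional as F
+
+def anomaly_score(image_feat, normal_text_feats, abnormal_text_feats, tau=0.07):
+    # Cosine similarities between one image (or patch) embedding and the
+    # pooled "normal"/"abnormal" prompt embeddings, softmaxed into a score.
+    img = F.normalize(image_feat, dim=-1)
+    normal = F.normalize(normal_text_feats.mean(dim=0), dim=-1)
+    abnormal = F.normalize(abnormal_text_feats.mean(dim=0), dim=-1)
+    sims = torch.stack([img @ normal, img @ abnormal]) / tau
+    return sims.softmax(dim=0)[1]          # probability mass on "abnormal"
+
+# Stand-in 512-d embeddings; in practice these come from a pretrained VLM.
+score = anomaly_score(torch.randn(512), torch.randn(4, 512), torch.randn(4, 512))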
+
+
+
+
+ + ☆ MathNet: A Data-Centric Approach for Printed Mathematical Expression + Recognition + + +
+ Printed mathematical expression recognition (MER) models are usually trained +and tested using LaTeX-generated mathematical expressions (MEs) as input and +the LaTeX source code as ground truth. As the same ME can be generated by +various different LaTeX source codes, this leads to unwanted variations in the +ground truth data that bias test performance results and hinder efficient +learning. In addition, the use of only one font to generate the MEs heavily +limits the generalization of the reported results to realistic scenarios. We +propose a data-centric approach to overcome this problem, and present +convincing experimental results: Our main contribution is an enhanced LaTeX +normalization to map any LaTeX ME to a canonical form. Based on this process, +we developed an improved version of the benchmark dataset im2latex-100k, +featuring 30 fonts instead of one. Second, we introduce the real-world dataset +realFormula, with MEs extracted from papers. Third, we developed a MER model, +MathNet, based on a convolutional vision transformer, with superior results on +all four test sets (im2latex-100k, im2latexv2, realFormula, and InftyMDB-1), +outperforming the previous state of the art by up to 88.3%. + +
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ LMFNet: An Efficient Multimodal Fusion Approach for Semantic + Segmentation in High-Resolution Remote Sensing + + +
+ Despite the rapid evolution of semantic segmentation for land cover +classification in high-resolution remote sensing imagery, integrating multiple +data modalities such as Digital Surface Model (DSM), RGB, and Near-infrared +(NIR) remains a challenge. Current methods often process only two types of +data, missing out on the rich information that additional modalities can +provide. Addressing this gap, we propose a novel \textbf{L}ightweight +\textbf{M}ultimodal data \textbf{F}usion \textbf{Net}work (LMFNet) to +accomplish the tasks of fusion and semantic segmentation of multimodal remote +sensing images. LMFNet uniquely accommodates various data types simultaneously, +including RGB, NirRG, and DSM, through a weight-sharing, multi-branch vision +transformer that minimizes parameter count while ensuring robust feature +extraction. Our proposed multimodal fusion module integrates a +\textit{Multimodal Feature Fusion Reconstruction Layer} and \textit{Multimodal +Feature Self-Attention Fusion Layer}, which can reconstruct and fuse multimodal +features. Extensive testing on public datasets such as US3D, ISPRS Potsdam, and +ISPRS Vaihingen demonstrates the effectiveness of LMFNet. Specifically, it +achieves a mean Intersection over Union ($mIoU$) of 85.09\% on the US3D +dataset, marking a significant improvement over existing methods. Compared to +unimodal approaches, LMFNet shows a 10\% enhancement in $mIoU$ with only a 0.5M +increase in parameter count. Furthermore, against bimodal methods, our approach +with trilateral inputs enhances $mIoU$ by 0.46 percentage points. + +
+
+
+
+
+ + ☆ MLP: Motion Label Prior for Temporal Sentence Localization in Untrimmed + 3D Human Motions + + +
+ In this paper, we address the unexplored question of temporal sentence
+localization in human motions (TSLM), aiming to locate a target moment from a
+3D human motion that semantically corresponds to a text query. Considering that
+3D human motions are captured using specialized motion capture devices, motions
+with only a few joints lack complex scene information like objects and
+lighting. Due to this characteristic, motion data has low contextual richness
+and semantic ambiguity between frames, which limits the accuracy of predictions
+made by current video localization frameworks extended to TSLM to only a rough
+level. To refine this, we devise two novel label-prior-assisted training
+schemes: one embeds prior knowledge of foreground and background to highlight
+the localization chances of target moments, and the other forces the originally
+rough predictions to overlap with the more accurate predictions obtained from
+the flipped start/end prior label sequences during recovery training. We show
+that injecting label-prior knowledge into the model is crucial for improving
+performance at high IoU. In our constructed TSLM benchmark, our model termed
+MLP achieves a recall of 44.13 at IoU@0.7 on the BABEL dataset and 71.17 on
+HumanML3D (Restore), outperforming prior works. Finally, we showcase the
+potential of our approach in corpus-level moment retrieval. Our source code is
+openly accessible at https://github.com/eanson023/mlp.
+
+ comment: 13 pages, 9 figures +
+
+
+
+
+ + ☆ Data-independent Module-aware Pruning for Hierarchical Vision + Transformers ICLR 2024 + + +
+ Hierarchical vision transformers (ViTs) have two advantages over conventional +ViTs. First, hierarchical ViTs achieve linear computational complexity with +respect to image size by local self-attention. Second, hierarchical ViTs create +hierarchical feature maps by merging image patches in deeper layers for dense +prediction. However, existing pruning methods ignore the unique properties of +hierarchical ViTs and use the magnitude value as the weight importance. This +approach leads to two main drawbacks. First, the "local" attention weights are +compared at a "global" level, which may cause some "locally" important weights +to be pruned due to their relatively small magnitude "globally". The second +issue with magnitude pruning is that it fails to consider the distinct weight +distributions of the network, which are essential for extracting coarse to +fine-grained features at various hierarchical levels. + To solve the aforementioned issues, we have developed a Data-independent +Module-Aware Pruning method (DIMAP) to compress hierarchical ViTs. To ensure +that "local" attention weights at different hierarchical levels are compared +fairly in terms of their contribution, we treat them as a module and examine +their contribution by analyzing their information distortion. Furthermore, we +introduce a novel weight metric that is solely based on weights and does not +require input images, thereby eliminating the dependence on the patch merging +process. Our method validates its usefulness and strengths on Swin Transformers +of different sizes on ImageNet-1k classification. Notably, the top-5 accuracy +drop is only 0.07% when we remove 52.5% FLOPs and 52.7% parameters of Swin-B. +When we reduce 33.2% FLOPs and 33.2% parameters of Swin-S, we can even achieve +a 0.8% higher relative top-5 accuracy than the original model. Code is +available at: https://github.com/he-y/Data-independent-Module-Aware-Pruning + +
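+ The contrast this abstract draws between global magnitude pruning and a
+per-module comparison can be sketched as below. The per-module thresholding
+shown here is a simple illustrative proxy and not DIMAP's information-distortion
+metric; module names, sparsity, and layer sizes are assumptions.
+
+import torch
+import torch.nn as nn
+
+def module_aware_masks(modules, sparsity=0.5):
+    """Prune each module against its own weights rather than a global threshold."""
+    masks = {}
+    for name, layer in modules.items():
+        w = layer.weight.detach().abs()
+        k = int(sparsity * w.numel())
+        thresh = w.flatten().kthvalue(k).values if k > 0 else w.min() - 1
+        masks[name] = (w > thresh).float()   # keep "locally" large weights
+    return masks
+
+# Two hypothetical attention projections from different hierarchical stages.
+mods = {"stage1.qkv": nn.Linear(96, 288), "stage3.qkv": nn.Linear(384, 1152)}
+masks = module_aware_masks(mods)
+print({n: float(m.mean()) for n, m in masks.items()})   # ~0.5 kept per module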
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ☆ Beyond Alignment: Blind Video Face Restoration via Parsing-Guided + Temporal-Coherent Transformer + + +
+ Multiple complex degradations are coupled in low-quality video faces in the +real world. Therefore, blind video face restoration is a highly challenging +ill-posed problem, requiring not only hallucinating high-fidelity details but +also enhancing temporal coherence across diverse pose variations. Restoring +each frame independently in a naive manner inevitably introduces temporal +incoherence and artifacts from pose changes and keypoint localization errors. +To address this, we propose the first blind video face restoration approach +with a novel parsing-guided temporal-coherent transformer (PGTFormer) without +pre-alignment. PGTFormer leverages semantic parsing guidance to select optimal +face priors for generating temporally coherent artifact-free results. +Specifically, we pre-train a temporal-spatial vector quantized auto-encoder on +high-quality video face datasets to extract expressive context-rich priors. +Then, the temporal parse-guided codebook predictor (TPCP) restores faces in +different poses based on face parsing context cues without performing face +pre-alignment. This strategy reduces artifacts and mitigates jitter caused by +cumulative errors from face pre-alignment. Finally, the temporal fidelity +regulator (TFR) enhances fidelity through temporal feature interaction and +improves video temporal consistency. Extensive experiments on face videos show +that our method outperforms previous face restoration baselines. The code will +be released on +\href{https://github.com/kepengxu/PGTFormer}{https://github.com/kepengxu/PGTFormer}. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Attack on Scene Flow using Point Clouds + + +
+ Deep neural networks have made significant advancements in accurately
+estimating scene flow using point clouds, which is vital for many applications
+like video analysis, action recognition, and navigation. Robustness of these
+techniques, however, remains a concern, particularly in the face of adversarial
+attacks that have been proven to deceive state-of-the-art deep neural networks
+in many domains. Surprisingly, the robustness of scene flow networks against
+such attacks has not been thoroughly investigated. To address this problem, the
+proposed approach aims to bridge this gap by introducing adversarial white-box
+attacks specifically tailored for scene flow networks. Experimental results
+show that the generated adversarial examples obtain up to 33.7 relative
+degradation in average end-point error on the KITTI and FlyingThings3D
+datasets. The study also reveals the significant impact that attacks targeting
+point clouds in only one dimension or color channel have on average end-point
+error. Analyzing the success and failure of these attacks on the scene flow
+networks and their 2D optical flow network variants shows a higher
+vulnerability for the optical flow networks.
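+ A generic one-step white-box (FGSM-style) perturbation of point coordinates,
+assuming some differentiable scene-flow model and an end-point-error loss, is
+sketched below. This is the standard attack template, not the exact attacks
+studied in the paper; the model, step size, and shapes are placeholders.
+
+import torch
+
+def fgsm_on_points(model, pc1, pc2, flow_gt, eps=0.01):
+    """One-step white-box perturbation of the first point cloud's coordinates."""
+    pc1 = pc1.clone().requires_grad_(True)
+    pred = model(pc1, pc2)                           # (N, 3) predicted flow
+    epe = torch.norm(pred - flow_gt, dim=-1).mean()  # end-point error
+    epe.backward()
+    return (pc1 + eps * pc1.grad.sign()).detach()    # adversarial points
+
+# Toy stand-in "model": predicts flow as a fixed linear map of the points.
+W = torch.randn(3, 3)
+model = lambda a, b: a @ W
+adv = fgsm_on_points(model, torch.randn(1024, 3), torch.randn(1024, 3),
+                     torch.zeros(1024, 3))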
+
+
+
+
+ + ☆ Video sentence grounding with temporally global textual knowledge + + +
+ Temporal sentence grounding involves the retrieval of a video moment with a +natural language query. Many existing works directly incorporate the given +video and temporally localized query for temporal grounding, overlooking the +inherent domain gap between different modalities. In this paper, we utilize +pseudo-query features containing extensive temporally global textual knowledge +sourced from the same video-query pair, to enhance the bridging of domain gaps +and attain a heightened level of similarity between multi-modal features. +Specifically, we propose a Pseudo-query Intermediary Network (PIN) to achieve +an improved alignment of visual and comprehensive pseudo-query features within +the feature space through contrastive learning. Subsequently, we utilize +learnable prompts to encapsulate the knowledge of pseudo-queries, propagating +them into the textual encoder and multi-modal fusion module, further enhancing +the feature alignment between visual and language for better temporal +grounding. Extensive experiments conducted on the Charades-STA and +ActivityNet-Captions datasets demonstrate the effectiveness of our method. + +
+
+
+
+
+ + ☆ Turb-Seg-Res: A Segment-then-Restore Pipeline for Dynamic Videos with + Atmospheric Turbulence CVPR 2024 + + +
+ Tackling image degradation due to atmospheric turbulence, particularly in
+dynamic environments, remains a challenge for long-range imaging systems.
+Existing techniques have been primarily designed for static scenes or scenes
+with small motion. This paper presents the first segment-then-restore pipeline
+for restoring the videos of dynamic scenes in turbulent environments. We
+leverage mean optical flow with an unsupervised motion segmentation method to
+separate dynamic and static scene components prior to restoration. After camera
+shake compensation and segmentation, we introduce foreground/background
+enhancement leveraging the statistics of turbulence strength and a transformer
+model trained on a novel noise-based procedural turbulence generator for fast
+dataset augmentation. Benchmarked against existing restoration methods, our
+approach restores most of the geometric distortion and enhances sharpness for
+videos. We make our code, simulator, and data publicly available to advance the
+field of video restoration from turbulence: riponcs.github.io/TurbSegRes
+
+ comment: CVPR 2024 Paper +
+
+
+
+
+ + ☆ Lost in Space: Probing Fine-grained Spatial Understanding in Vision and + Language Resamplers NAACL 2024 + + +
+ An effective method for combining frozen large language models (LLM) and
+visual encoders involves a resampler module that creates a `visual prompt'
+which is provided to the LLM, along with the textual prompt. While this
+approach has enabled impressive performance across many coarse-grained tasks
+like image captioning and visual question answering, more fine-grained tasks
+that require spatial understanding have not been thoroughly examined. In this
+paper, we use \textit{diagnostic classifiers} to measure the extent to which
+the visual prompt produced by the resampler encodes spatial information. Our
+results show that this information is largely absent from the resampler output
+when kept frozen during training of the classifiers. However, when the
+resampler and classifier are trained jointly, we observe a significant
+performance boost. This shows that the compression achieved by the resamplers
+can in principle encode the requisite spatial information, but that more
+object-aware objectives are needed at the pretraining stage to facilitate this
+capability.
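+ Diagnostic (probing) classifiers of the kind used here are typically just
+small classifiers trained on frozen representations. A minimal linear-probe
+sketch with made-up feature dimensions and random stand-in features follows;
+it is not the paper's probing setup.
+
+import torch
+import torch.nn as nn
+
+# Frozen visual prompts from a resampler (stand-in random features) and binary
+# spatial labels (e.g., "is object A left of object B") for probing.
+feats = torch.randn(256, 64)             # 256 examples, pooled 64-d prompt tokens
+labels = torch.randint(0, 2, (256,))
+
+probe = nn.Linear(64, 2)                 # the diagnostic classifier
+opt = torch.optim.Adam(probe.parameters(), lr=1e-2)
+for _ in range(100):
+    opt.zero_grad()
+    loss = nn.functional.cross_entropy(probe(feats), labels)
+    loss.backward()
+    opt.step()
+# Above-chance probe accuracy would indicate the frozen prompt encodes the property.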
+
+ comment: NAACL 2024 +
+
+
+
+
+ + ☆ MARVEL: Multidimensional Abstraction and Reasoning through Visual + Evaluation and Learning + + +
+ While multi-modal large language models (MLLMs) have shown significant +progress on many popular visual reasoning benchmarks, whether they possess +abstract visual reasoning abilities remains an open question. Similar to the +Sudoku puzzles, abstract visual reasoning (AVR) problems require finding +high-level patterns (e.g., repetition constraints) that control the input +shapes (e.g., digits) in a specific task configuration (e.g., matrix). However, +existing AVR benchmarks only considered a limited set of patterns (addition, +conjunction), input shapes (rectangle, square), and task configurations (3 by 3 +matrices). To evaluate MLLMs' reasoning abilities comprehensively, we introduce +MARVEL, a multidimensional AVR benchmark with 770 puzzles composed of six core +knowledge patterns, geometric and abstract shapes, and five different task +configurations. To inspect whether the model accuracy is grounded in perception +and reasoning, MARVEL complements the general AVR question with perception +questions in a hierarchical evaluation framework. We conduct comprehensive +experiments on MARVEL with nine representative MLLMs in zero-shot and few-shot +settings. Our experiments reveal that all models show near-random performance +on the AVR question, with significant performance gaps (40%) compared to humans +across all patterns and task configurations. Further analysis of perception +questions reveals that MLLMs struggle to comprehend the visual features +(near-random performance) and even count the panels in the puzzle ( <45%), +hindering their ability for abstract reasoning. We release our entire code and +dataset. + +
+
+
+
+
+ + ☆ Rethink Arbitrary Style Transfer with Transformer and Contrastive + Learning + + +
+ Arbitrary style transfer has attracted widespread attention in research and
+boasts numerous practical applications. The existing methods, which either
+employ cross-attention to incorporate deep style attributes into content
+attributes or use adaptive normalization to adjust content features, fail to
+generate high-quality stylized images. In this paper, we introduce an
+innovative technique to improve the quality of stylized images. Firstly, we
+propose Style Consistency Instance Normalization (SCIN), a method to refine the
+alignment between content and style features. In addition, we have developed an
+Instance-based Contrastive Learning (ICL) approach designed to understand the
+relationships among various styles, thereby enhancing the quality of the
+resulting stylized images. Recognizing that VGG networks are more adept at
+extracting classification features and less suited to capturing style features,
+we have also introduced the Perception Encoder (PE) to capture style features.
+Extensive experiments demonstrate that our proposed method generates
+high-quality stylized images and effectively prevents artifacts compared with
+the existing state-of-the-art methods.
+
+ comment: Accepted by CVIU +
+
+
+
+
+ + ☆ LTOS: Layout-controllable Text-Object Synthesis via Adaptive + Cross-attention Fusions + + +
+ Controllable text-to-image generation synthesizes visual text and objects in
+images with certain conditions, which are frequently applied to emoji and
+poster generation. Visual text rendering and layout-to-image generation tasks
+have been popular in controllable text-to-image generation. However, each of
+these tasks typically focuses on single modality generation or rendering,
+leaving yet-to-be-bridged gaps between the approaches correspondingly designed
+for each of the tasks. In this paper, we combine text rendering and
+layout-to-image generation tasks into a single task: the layout-controllable
+text-object synthesis (LTOS) task, aiming at synthesizing images with object
+and visual text based on predefined object layout and text contents. As
+compliant datasets are not readily available for our LTOS task, we construct a
+layout-aware text-object synthesis dataset, containing elaborate well-aligned
+labels of visual text and object information. Based on the dataset, we propose
+a layout-controllable text-object adaptive fusion (TOF) framework, which
+generates images with clear, legible visual text and plausible objects. We
+construct a visual-text rendering module to synthesize text and employ an
+object-layout control module to generate objects while integrating the two
+modules to harmoniously generate and integrate text content and objects in
+images. To improve image-text integration, we propose a self-adaptive
+cross-attention fusion module that helps the image generation attend more to
+important text information. Within such a fusion module, we use a self-adaptive
+learnable factor to learn to flexibly control the influence of cross-attention
+outputs on image generation. Experimental results show that our method
+outperforms the state-of-the-art in LTOS, text rendering, and layout-to-image
+tasks, enabling harmonious visual text rendering and object generation.
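+ The "self-adaptive learnable factor" can be pictured as a learnable gate on
+the cross-attention output before it is added back to the image stream. The
+module below is a generic sketch with invented dimensions, not the TOF
+implementation.
+
+import torch
+import torch.nn as nn
+
+class GatedCrossAttentionFusion(nn.Module):
+    """Image tokens attend to text tokens; a learnable scalar gates the update."""
+    def __init__(self, dim=320, heads=8):
+        super().__init__()
+        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+        self.gate = nn.Parameter(torch.zeros(1))   # starts with no text influence
+
+    def forward(self, img_tokens, text_tokens):
+        ctx, _ = self.attn(img_tokens, text_tokens, text_tokens)
+        return img_tokens + torch.tanh(self.gate) * ctx
+
+fuse = GatedCrossAttentionFusion()
+out = fuse(torch.randn(2, 64, 320), torch.randn(2, 16, 320))   # (2, 64, 320)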
+
+
+
+
+ + ☆ I2CANSAY:Inter-Class Analogical Augmentation and Intra-Class + Significance Analysis for Non-Exemplar Online Task-Free Continual Learning + + +
+ Online task-free continual learning (OTFCL) is a more challenging variant of +continual learning which emphasizes the gradual shift of task boundaries and +learns in an online mode. Existing methods rely on a memory buffer composed of +old samples to prevent forgetting. However,the use of memory buffers not only +raises privacy concerns but also hinders the efficient learning of new samples. +To address this problem, we propose a novel framework called I2CANSAY that gets +rid of the dependence on memory buffers and efficiently learns the knowledge of +new data from one-shot samples. Concretely, our framework comprises two main +modules. Firstly, the Inter-Class Analogical Augmentation (ICAN) module +generates diverse pseudo-features for old classes based on the inter-class +analogy of feature distributions for different new classes, serving as a +substitute for the memory buffer. Secondly, the Intra-Class Significance +Analysis (ISAY) module analyzes the significance of attributes for each class +via its distribution standard deviation, and generates the importance vector as +a correction bias for the linear classifier, thereby enhancing the capability +of learning from new samples. We run our experiments on four popular image +classification datasets: CoRe50, CIFAR-10, CIFAR-100, and CUB-200, our approach +outperforms the prior state-of-the-art by a large margin. + +
+
+
+
+
+ + ☆ Exploring AIGC Video Quality: A Focus on Visual Harmony, Video-Text + Consistency and Domain Distribution Gap CVPR2024 + + +
+ The recent advancements in Text-to-Video Artificial Intelligence Generated
+Content (AIGC) have been remarkable. Compared with traditional videos, the
+assessment of AIGC videos encounters various challenges: visual inconsistencies
+that defy common sense, discrepancies between content and the textual prompt,
+and the distribution gap between various generative models. Targeting these
+challenges, in this work we categorize the assessment of AIGC video quality
+into three dimensions: visual harmony, video-text consistency, and domain
+distribution gap. For each dimension, we design specific modules to provide a
+comprehensive quality assessment of AIGC videos. Furthermore, our research
+identifies significant variations in visual quality, fluidity, and style among
+videos generated by different text-to-video models. Predicting the source
+generative model can make the AIGC video features more discriminative, which
+enhances the quality assessment performance. The proposed method was used in
+the third-place winner of the NTIRE 2024 Quality Assessment for AI-Generated
+Content - Track 2 Video, demonstrating its effectiveness.
+
+ comment: 9 pages, 3 figures, 3 tables. Accepted by CVPR2024 Workshop (3rd + place of NTIRE2024 Quality Assessment for AI-Generated Content - Track 2 + Video) +
+
+
+
+
+ + ☆ Socratic Planner: Inquiry-Based Zero-Shot Planning for Embodied + Instruction Following + + +
+ Embodied Instruction Following (EIF) is the task of executing natural
+language instructions by navigating and interacting with objects in 3D
+environments. One of the primary challenges in EIF is compositional task
+planning, which is often addressed with supervised or in-context learning with
+labeled data. To this end, we introduce the Socratic Planner, the first
+zero-shot planning method that infers without the need for any training data.
+The Socratic Planner first decomposes the instructions into substructural
+information of the task through self-questioning and answering, translating it
+into a high-level plan, i.e., a sequence of subgoals. Subgoals are executed
+sequentially, with our visually grounded re-planning mechanism adjusting plans
+dynamically through dense visual feedback. We also introduce an evaluation
+metric for high-level plans, RelaxedHLP, for a more comprehensive evaluation.
+Experiments demonstrate the effectiveness of the Socratic Planner, achieving
+competitive performance on both zero-shot and few-shot task planning in the
+ALFRED benchmark, particularly excelling in tasks requiring higher-dimensional
+inference. Additionally, precise adjustments to the plan were achieved by
+incorporating environmental visual information.
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ☆ Exploring Diverse Methods in Visual Question Answering + + +
+ This study explores innovative methods for improving Visual Question
+Answering (VQA) using Generative Adversarial Networks (GANs), autoencoders, and
+attention mechanisms. Leveraging a balanced VQA dataset, we investigate three
+distinct strategies. Firstly, GAN-based approaches aim to generate answer
+embeddings conditioned on image and question inputs, showing potential but
+struggling with more complex tasks. Secondly, autoencoder-based techniques
+focus on learning optimal embeddings for questions and images, achieving
+results comparable to the GAN-based approach owing to better handling of
+complex questions. Lastly, attention mechanisms, incorporating Multimodal
+Compact Bilinear pooling (MCB), address language priors and attention modeling,
+albeit with a complexity-performance trade-off. This study underscores the
+challenges and opportunities in VQA and suggests avenues for future research,
+including alternative GAN formulations and attentional mechanisms.
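+ Multimodal Compact Bilinear (MCB) pooling, mentioned above, is usually
+implemented with the classic count-sketch-plus-FFT recipe (circular convolution
+of the sketched vectors). The sketch below follows that standard recipe; the
+output dimension and input sizes are arbitrary, and it is not tied to this
+study's implementation.
+
+import torch
+
+def count_sketch(x, h, s, d):
+    # Project x (dim n) to dimension d using hash indices h and signs s.
+    out = torch.zeros(d, dtype=x.dtype)
+    return out.index_add_(0, h, s * x)
+
+def mcb_pool(v_img, v_txt, d=1024, seed=0):
+    """Multimodal Compact Bilinear pooling of two feature vectors."""
+    g = torch.Generator().manual_seed(seed)
+    h1 = torch.randint(0, d, (v_img.numel(),), generator=g)
+    h2 = torch.randint(0, d, (v_txt.numel(),), generator=g)
+    s1 = torch.randint(0, 2, (v_img.numel(),), generator=g).float() * 2 - 1
+    s2 = torch.randint(0, 2, (v_txt.numel(),), generator=g).float() * 2 - 1
+    f1 = torch.fft.rfft(count_sketch(v_img, h1, s1, d))
+    f2 = torch.fft.rfft(count_sketch(v_txt, h2, s2, d))
+    return torch.fft.irfft(f1 * f2, n=d)   # circular convolution = compact bilinear
+
+fused = mcb_pool(torch.randn(2048), torch.randn(512))   # -> 1024-d joint feature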
+
+
+
+
+ + ☆ Masked Latent Transformer with the Random Masking Ratio to Advance the + Diagnosis of Dental Fluorosis + + +
+ Dental fluorosis is a chronic disease caused by long-term overconsumption of +fluoride, which leads to changes in the appearance of tooth enamel. It is an +important basis for early non-invasive diagnosis of endemic fluorosis. However, +even dental professionals may not be able to accurately distinguish dental +fluorosis and its severity based on tooth images. Currently, there is still a +gap in research on applying deep learning to diagnosing dental fluorosis. +Therefore, we construct the first open-source dental fluorosis image dataset +(DFID), laying the foundation for deep learning research in this field. To +advance the diagnosis of dental fluorosis, we propose a pioneering deep +learning model called masked latent transformer with the random masking ratio +(MLTrMR). MLTrMR introduces a mask latent modeling scheme based on Vision +Transformer to enhance contextual learning of dental fluorosis lesion +characteristics. Consisting of a latent embedder, encoder, and decoder, MLTrMR +employs the latent embedder to extract latent tokens from the original image, +whereas the encoder and decoder comprising the latent transformer (LT) block +are used to process unmasked tokens and predict masked tokens, respectively. To +mitigate the lack of inductive bias in Vision Transformer, which may result in +performance degradation, the LT block introduces latent tokens to enhance the +learning capacity of latent lesion features. Furthermore, we design an +auxiliary loss function to constrain the parameter update direction of the +model. MLTrMR achieves 80.19% accuracy, 75.79% F1, and 81.28% quadratic +weighted kappa on DFID, making it state-of-the-art (SOTA). + +
+
+
+
+
+ + ☆ Cell Phone Image-Based Persian Rice Detection and Classification Using + Deep Learning Techniques + + +
+ This study introduces an innovative approach to classifying various types of +Persian rice using image-based deep learning techniques, highlighting the +practical application of everyday technology in food categorization. +Recognizing the diversity of Persian rice and its culinary significance, we +leveraged the capabilities of convolutional neural networks (CNNs), +specifically by fine-tuning a ResNet model for accurate identification of +different rice varieties and employing a U-Net architecture for precise +segmentation of rice grains in bulk images. This dual-methodology framework +allows for both individual grain classification and comprehensive analysis of +bulk rice samples, addressing two crucial aspects of rice quality assessment. +Utilizing images captured with consumer-grade cell phones reflects a realistic +scenario in which individuals can leverage this technology for assistance with +grocery shopping and meal preparation. The dataset, comprising various rice +types photographed under natural conditions without professional lighting or +equipment, presents a challenging yet practical classification problem. Our +findings demonstrate the feasibility of using non-professional images for food +classification and the potential of deep learning models, like ResNet and +U-Net, to adapt to the nuances of everyday objects and textures. This study +contributes to the field by providing insights into the applicability of +image-based deep learning in daily life, specifically for enhancing consumer +experiences and knowledge in food selection. Furthermore, it opens avenues for +extending this approach to other food categories and practical applications, +emphasizing the role of accessible technology in bridging the gap between +sophisticated computational methods and everyday tasks. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ☆ Pointsoup: High-Performance and Extremely Low-Decoding-Latency Learned + Geometry Codec for Large-Scale Point Cloud Scenes + + +
+ Despite considerable progress being achieved in point cloud geometry compression, there still remains a challenge in effectively compressing large-scale scenes with sparse surfaces. Another key challenge lies in reducing decoding latency, a crucial requirement in real-world applications. In this paper, we propose Pointsoup, an efficient learning-based geometry codec that attains high performance and extremely low decoding latency simultaneously. Inspired by the conventional Trisoup codec, a point model-based strategy is devised to characterize local surfaces. Specifically, skin features are embedded from local windows via an attention-based encoder, and dilated windows are introduced as cross-scale priors to infer the distribution of quantized features in parallel. During decoding, features undergo fast refinement, followed by a folding-based point generator that reconstructs point coordinates at high speed. Experiments show that Pointsoup achieves state-of-the-art performance on multiple benchmarks with significantly lower decoding complexity, i.e., up to 90~160× faster than the G-PCCv23 Trisoup decoder on a comparatively low-end platform (e.g., one RTX 2080Ti). Furthermore, it offers variable-rate control with a single neural model (2.9 MB), which is attractive for industrial practitioners. + +
+
+
+
+
+ + ☆ Generalizable Novel-View Synthesis using a Stereo Camera CVPR 2024 + + +
+ In this paper, we propose the first generalizable view synthesis approach +that specifically targets multi-view stereo-camera images. Since recent stereo +matching has demonstrated accurate geometry prediction, we introduce stereo +matching into novel-view synthesis for high-quality geometry reconstruction. To +this end, this paper proposes a novel framework, dubbed StereoNeRF, which +integrates stereo matching into a NeRF-based generalizable view synthesis +approach. StereoNeRF is equipped with three key components to effectively +exploit stereo matching in novel-view synthesis: a stereo feature extractor, a +depth-guided plane-sweeping, and a stereo depth loss. Moreover, we propose the +StereoNVS dataset, the first multi-view dataset of stereo-camera images, +encompassing a wide variety of both real and synthetic scenes. Our experimental +results demonstrate that StereoNeRF surpasses previous approaches in +generalizable view synthesis. + +
+
+ comment: Accepted to CVPR 2024. Project page URL: + https://jinwonjoon.github.io/stereonerf/ +
+
+
+
+
+ + ☆ Bracketing Image Restoration and Enhancement with High-Low Frequency + Decomposition CVPR 2024 + + +
+ In real-world scenarios, due to a series of image degradations, obtaining +high-quality, clear content photos is challenging. While significant progress +has been made in synthesizing high-quality images, previous methods for image +restoration and enhancement often overlooked the characteristics of different +degradations. They applied the same structure to address various types of +degradation, resulting in less-than-ideal restoration outcomes. Inspired by the +notion that high/low frequency information is applicable to different +degradations, we introduce HLNet, a Bracketing Image Restoration and +Enhancement method based on high-low frequency decomposition. Specifically, we +employ two modules for feature extraction: shared weight modules and non-shared +weight modules. In the shared weight modules, we use SCConv to extract common +features from different degradations. In the non-shared weight modules, we +introduce the High-Low Frequency Decomposition Block (HLFDB), which employs +different methods to handle high-low frequency information, enabling the model +to address different degradations more effectively. Compared to other networks, +our method takes into account the characteristics of different degradations, +thus achieving higher-quality image restoration. + +
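+ The high-low frequency split mentioned above can be approximated in a few lines; the blur-and-subtract decomposition below is only an illustration of the general idea and is not the paper's HLFDB module.

```python
# Illustrative high/low-frequency feature split via blur-and-subtract; not HLNet code.
import torch
import torch.nn.functional as F

def high_low_split(x: torch.Tensor, kernel_size: int = 5):
    """x: (B, C, H, W). Low = local average (smooth structure), High = residual detail."""
    pad = kernel_size // 2
    low = F.avg_pool2d(x, kernel_size, stride=1, padding=pad)
    high = x - low
    return high, low

features = torch.randn(2, 64, 32, 32)
high, low = high_low_split(features)
print(high.shape, low.shape)   # both torch.Size([2, 64, 32, 32])
```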
+
+ comment: This paper is accepted by CVPR 2024 Workshop +
+
+
+
+
+ + ☆ Motion-aware Latent Diffusion Models for Video Frame Interpolation + + +
+ With the advancement of AIGC, video frame interpolation (VFI) has become a +crucial component in existing video generation frameworks, attracting +widespread research interest. For the VFI task, the motion estimation between +neighboring frames plays a crucial role in avoiding motion ambiguity. However, +existing VFI methods always struggle to accurately predict the motion +information between consecutive frames, and this imprecise estimation leads to +blurred and visually incoherent interpolated frames. In this paper, we propose +a novel diffusion framework, motion-aware latent diffusion models (MADiff), +which is specifically designed for the VFI task. By incorporating motion priors +between the conditional neighboring frames with the target interpolated frame +predicted throughout the diffusion sampling procedure, MADiff progressively +refines the intermediate outcomes, culminating in generating both visually +smooth and realistic results. Extensive experiments conducted on benchmark +datasets demonstrate that our method achieves state-of-the-art performance +significantly outperforming existing approaches, especially under challenging +scenarios involving dynamic textures with complex motion. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2303.09508 by + other authors +
+
+
+
+
+ + ☆ Listen Then See: Video Alignment with Speaker Attention + + +
+ Video-based Question Answering (Video QA) is a challenging task and becomes +even more intricate when addressing Socially Intelligent Question Answering +(SIQA). SIQA requires context understanding, temporal reasoning, and the +integration of multimodal information, but in addition, it requires processing +nuanced human behavior. Furthermore, the complexities involved are exacerbated +by the dominance of the primary modality (text) over the others. Thus, there is +a need to help the task's secondary modalities to work in tandem with the +primary modality. In this work, we introduce a cross-modal alignment and +subsequent representation fusion approach that achieves state-of-the-art +results (82.06\% accuracy) on the Social IQ 2.0 dataset for SIQA. Our approach +exhibits an improved ability to leverage the video modality by using the audio +modality as a bridge with the language modality. This leads to enhanced +performance by reducing the prevalent issue of language overfitting and +resultant video modality bypassing encountered by current existing techniques. +Our code and models are publicly available at +https://github.com/sts-vlcc/sts-vlcc + +
+
+
+
+
+ + ☆ Graph4GUI: Graph Neural Networks for Representing Graphical User + Interfaces + + +
+ Present-day graphical user interfaces (GUIs) exhibit diverse arrangements of +text, graphics, and interactive elements such as buttons and menus, but +representations of GUIs have not kept up. They do not encapsulate both semantic +and visuo-spatial relationships among elements. To seize machine learning's +potential for GUIs more efficiently, Graph4GUI exploits graph neural networks +to capture individual elements' properties and their semantic-visuo-spatial +constraints in a layout. The learned representation demonstrated its +effectiveness in multiple tasks, especially generating designs in a challenging +GUI autocompletion task, which involved predicting the positions of remaining +unplaced elements in a partially completed GUI. The new model's suggestions +showed alignment and visual appeal superior to the baseline method and received +higher subjective ratings for preference. Furthermore, we demonstrate the +practical benefits and efficiency advantages designers perceive when utilizing +our model as an autocompletion plug-in. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ Dynamic in Static: Hybrid Visual Correspondence for Self-Supervised + Video Object Segmentation + + +
+ Conventional video object segmentation (VOS) methods usually necessitate a substantial volume of pixel-level annotated video data for fully supervised learning. In this paper, we present HVC, a hybrid static-dynamic visual correspondence framework for self-supervised VOS. HVC extracts pseudo-dynamic signals from static images, enabling an efficient and scalable VOS model. Our approach utilizes a minimalist fully-convolutional architecture to capture static-dynamic visual correspondence in image-cropped views. To achieve this objective, we present a unified self-supervised approach to learn visual representations of static-dynamic feature similarity. Firstly, we establish static correspondence by utilizing a priori coordinate information between cropped views to guide the formation of consistent static feature representations. Subsequently, we devise a concise convolutional layer to capture the forward/backward pseudo-dynamic signals between two views, serving as cues for dynamic representations. Finally, we propose a hybrid visual correspondence loss to learn joint static and dynamic consistency representations. Our approach, without bells and whistles, necessitates only one training session using static image data, significantly reducing memory consumption (~16 GB) and training time (~2 h). Moreover, HVC achieves state-of-the-art performance in several self-supervised VOS benchmarks and additional video label propagation tasks. + +
+
+
+
+
+ + ☆ Authentic Emotion Mapping: Benchmarking Facial Expressions in Real News + + +
+ In this paper, we present a novel benchmark for Emotion Recognition using +facial landmarks extracted from realistic news videos. Traditional methods +relying on RGB images are resource-intensive, whereas our approach with Facial +Landmark Emotion Recognition (FLER) offers a simplified yet effective +alternative. By leveraging Graph Neural Networks (GNNs) to analyze the +geometric and spatial relationships of facial landmarks, our method enhances +the understanding and accuracy of emotion recognition. We discuss the +advancements and challenges in deep learning techniques for emotion +recognition, particularly focusing on Graph Neural Networks (GNNs) and +Transformers. Our experimental results demonstrate the viability and potential +of our dataset as a benchmark, setting a new direction for future research in +emotion recognition technologies. The codes and models are at: +https://github.com/wangzhifengharrison/benchmark_real_news + +
+
+
+
+
+ + ♻ ☆ Prototype-based Interpretable Breast Cancer Prediction Models: Analysis + and Challenges + + +
+ Deep learning models have achieved high performance in medical applications; however, their adoption in clinical practice is hindered by their black-box nature. Self-explainable models, like prototype-based models, can be especially beneficial as they are interpretable by design. However, if the learnt prototypes are of low quality, then prototype-based models are effectively as opaque as black-box models. Having high-quality prototypes is a prerequisite for a truly interpretable model. In this work, we propose a prototype evaluation framework for coherence (PEF-C) for quantitatively evaluating the quality of the prototypes based on domain knowledge. We show the use of PEF-C in the context of breast cancer prediction using mammography. Existing works on prototype-based models for breast cancer prediction using mammography have focused on improving the classification performance of prototype-based models compared to black-box models and have evaluated prototype quality only through anecdotal evidence. We are the first to go beyond anecdotal evidence and evaluate the quality of the mammography prototypes systematically using our PEF-C. Specifically, we apply three state-of-the-art prototype-based models, ProtoPNet, BRAIxProtoPNet++ and PIP-Net, on mammography images for breast cancer prediction and evaluate these models w.r.t. i) classification performance and ii) quality of the prototypes, on three public datasets. Our results show that prototype-based models are competitive with black-box models in terms of classification performance and achieve a higher score in detecting ROIs. However, the quality of the prototypes is not yet sufficient and can be improved in terms of relevance, purity, and the variety of prototypes learned. We call on the XAI community to systematically evaluate the quality of prototypes, to check their true usability in high-stakes decisions, and to improve such models further. + +
+
+ comment: Accepted at World Conference on Explainable Artificial Intelligence; + 21 pages, 5 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Multi-task Magnetic Resonance Imaging Reconstruction using Meta-learning + + +
+ Using single-task deep learning methods to reconstruct Magnetic Resonance +Imaging (MRI) data acquired with different imaging sequences is inherently +challenging. The trained deep learning model typically lacks generalizability, +and the dissimilarity among image datasets with different types of contrast +leads to suboptimal learning performance. This paper proposes a meta-learning +approach to efficiently learn image features from multiple MR image datasets. +Our algorithm can perform multi-task learning to simultaneously reconstruct MR +images acquired using different imaging sequences with different image +contrasts. The experiment results demonstrate the ability of our new +meta-learning reconstruction method to successfully reconstruct +highly-undersampled k-space data from multiple MRI datasets simultaneously, +outperforming other compelling reconstruction methods previously developed for +single-task learning. + +
+
+
+
+
+ + ♻ ☆ Progressive Feature Learning for Realistic Cloth-Changing Gait + Recognition + + +
+ Gait recognition is instrumental in crime prevention and public security, since it can identify persons at a long distance. However, existing datasets and methods cannot satisfactorily deal with the most challenging cloth-changing problem in practice. Specifically, practical gait models are usually trained on automatically labeled data, in which the views and clothing conditions of each person's sequences are restricted. Concretely, the cross-view sub-dataset only contains the normal walking condition without cloth changes, while the cross-cloth sub-dataset has cloth-changing sequences but only in front views. As a result, the cloth-changing accuracy cannot meet practical requirements. In this work, we formulate the problem as Realistic Cloth-Changing Gait Recognition (abbreviated as RCC-GR) and construct two benchmarks, CASIA-BN-RCC and OUMVLP-RCC, to simulate the above setting. Furthermore, we propose a new framework called Progressive Feature Learning that can be applied with off-the-shelf backbones to improve their performance in RCC-GR. Specifically, in our framework, we design Progressive Mapping and Progressive Uncertainty to extract cross-view features and then extract cross-cloth features on that basis. In this way, the features from the cross-view sub-dataset can first dominate the feature space and relieve the uneven distribution caused by the adverse effect of the cross-cloth sub-dataset. Experiments on our benchmarks show that our framework can effectively improve recognition performance, especially under cloth-changing conditions. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Gait Recognition with Selective Fusion + + +
+ Previous gait recognition methods have primarily been trained on labeled datasets, which require laborious labeling effort. However, using a pre-trained model on a new dataset without fine-tuning can lead to significant performance degradation. To enable a pre-trained gait recognition model to be fine-tuned on unlabeled datasets, we propose a new task: Unsupervised Gait Recognition (UGR). We introduce a new cluster-based baseline to solve UGR with cluster-level contrastive learning, but we find that this task presents further challenges. First, sequences of the same person in different clothes tend to cluster separately due to the significant appearance changes. Second, sequences taken from 0° and 180° views lack walking postures and do not cluster with sequences taken from other views. To address these challenges, we propose a Selective Fusion method, which includes Selective Cluster Fusion (SCF) and Selective Sample Fusion (SSF). With SCF, we merge matched clusters of the same person wearing different clothes by updating the cluster-level memory bank with a multi-cluster update strategy. In SSF, we gradually merge sequences taken from front/back views using curriculum learning. Extensive experiments show the effectiveness of our method in improving rank-1 accuracy under the walking-with-different-coats condition and the front/back-view conditions. + +
+
+
+
+
+ + ♻ ☆ VidProM: A Million-scale Real Prompt-Gallery Dataset for Text-to-Video + Diffusion Models + + +
+ The arrival of Sora marks a new era for text-to-video diffusion models, +bringing significant advancements in video generation and potential +applications. However, Sora, along with other text-to-video diffusion models, +is highly reliant on prompts, and there is no publicly available dataset that +features a study of text-to-video prompts. In this paper, we introduce VidProM, +the first large-scale dataset comprising 1.67 Million unique text-to-Video +Prompts from real users. Additionally, this dataset includes 6.69 million +videos generated by four state-of-the-art diffusion models, alongside some +related data. We initially discuss the curation of this large-scale dataset, a +process that is both time-consuming and costly. Subsequently, we underscore the +need for a new prompt dataset specifically designed for text-to-video +generation by illustrating how VidProM differs from DiffusionDB, a large-scale +prompt-gallery dataset for image generation. Our extensive and diverse dataset +also opens up many exciting new research areas. For instance, we suggest +exploring text-to-video prompt engineering, efficient video generation, and +video copy detection for diffusion models to develop better, more efficient, +and safer models. The project (including the collected dataset VidProM and +related code) is publicly available at https://vidprom.github.io under the +CC-BY-NC 4.0 License. + +
+
+ comment: The project (including the collected dataset VidProM and related + code) is publicly available at https://vidprom.github.io under the CC-BY-NC + 4.0 License +
+
+
+
+
+ + ♻ ☆ Pre-training with Random Orthogonal Projection Image Modeling ICLR + + +
+ Masked Image Modeling (MIM) is a powerful self-supervised strategy for visual pre-training without the use of labels. MIM applies random crops to input images, processes them with an encoder, and then recovers the masked inputs with a decoder, which encourages the network to capture and learn structural information about objects and scenes. The intermediate feature representations obtained from MIM are suitable for fine-tuning on downstream tasks. In this paper, we propose an Image Modeling framework based on random orthogonal projection instead of binary masking as in MIM. Our proposed Random Orthogonal Projection Image Modeling (ROPIM) reduces spatial token information under a guaranteed bound on the noise variance and can be considered as masking the entire spatial image area with locally varying masking degrees. Since ROPIM uses a random subspace for the projection that realizes the masking step, the readily available complement of the subspace can be used during unmasking to promote recovery of removed information. In this paper, we show that using random orthogonal projection leads to superior performance compared to crop-based masking. We demonstrate state-of-the-art results on several popular benchmarks. + +
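+ A rough sketch of what a random orthogonal projection of patch tokens could look like is given below; the token-space formulation and subspace dimension are assumptions made for illustration, not the authors' implementation.

```python
# Assumption-laden sketch: project patch tokens onto a random rank-k subspace
# (and keep its complement for "unmasking"); not the ROPIM reference code.
import torch

def random_orthogonal_projection(tokens: torch.Tensor, k: int):
    """tokens: (B, N, D). Projects along the token dimension onto a random rank-k subspace."""
    B, N, D = tokens.shape
    Q, _ = torch.linalg.qr(torch.randn(N, k))     # random orthonormal basis of a k-dim subspace
    P = Q @ Q.T                                   # (N, N) projection matrix, rank k
    projected = torch.einsum("nm,bmd->bnd", P, tokens)
    complement = torch.eye(N) - P                 # available to promote recovery when "unmasking"
    return projected, complement

tokens = torch.randn(4, 196, 768)                 # e.g. ViT-B patch tokens
masked_tokens, complement = random_orthogonal_projection(tokens, k=98)
```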
+
+ comment: Published as a conference paper at the International Conference on + Learning Representations (ICLR) 2024. 19 pages +
+
+
+
+
+ + ♻ ☆ Let's Think Outside the Box: Exploring Leap-of-Thought in Large Language + Models with Creative Humor Generation + + +
+ Chain-of-Thought (CoT) guides large language models (LLMs) to reason +step-by-step, and can motivate their logical reasoning ability. While effective +for logical tasks, CoT is not conducive to creative problem-solving which often +requires out-of-box thoughts and is crucial for innovation advancements. In +this paper, we explore the Leap-of-Thought (LoT) abilities within LLMs -- a +non-sequential, creative paradigm involving strong associations and knowledge +leaps. To this end, we study LLMs on the popular Oogiri game which needs +participants to have good creativity and strong associative thinking for +responding unexpectedly and humorously to the given image, text, or both, and +thus is suitable for LoT study. Then to investigate LLMs' LoT ability in the +Oogiri game, we first build a multimodal and multilingual Oogiri-GO dataset +which contains over 130,000 samples from the Oogiri game, and observe the +insufficient LoT ability or failures of most existing LLMs on the Oogiri game. +Accordingly, we introduce a creative Leap-of-Thought (CLoT) paradigm to improve +LLM's LoT ability. CLoT first formulates the Oogiri-GO dataset into +LoT-oriented instruction tuning data to train pretrained LLM for achieving +certain LoT humor generation and discrimination abilities. Then CLoT designs an +explorative self-refinement that encourages the LLM to generate more creative +LoT data via exploring parallels between seemingly unrelated concepts and +selects high-quality data to train itself for self-refinement. CLoT not only +excels in humor generation in the Oogiri game but also boosts creative +abilities in various tasks like cloud guessing game and divergent association +task. These findings advance our understanding and offer a pathway to improve +LLMs' creative capacities for innovative applications across domains. The +dataset, code, and models will be released online. +https://zhongshsh.github.io/CLoT/. + +
+
+ comment: Technical report +
+
+
+
+
+ + ♻ ☆ Analyzing Decades-Long Environmental Changes in Namibia Using Archival + Aerial Photography and Deep Learning + + +
+ This study explores object detection in historical aerial photographs of +Namibia to identify long-term environmental changes. Specifically, we aim to +identify key objects -- Waterholes, Omuti homesteads, and Big trees -- around +Oshikango in Namibia using sub-meter gray-scale aerial imagery from 1943 and +1972. In this work, we propose a workflow for analyzing historical aerial +imagery using a deep semantic segmentation model on sparse hand-labels. To this +end, we employ a number of strategies including class-weighting, +pseudo-labeling and empirical p-value-based filtering to balance skewed and +sparse representations of objects in the ground truth data. Results demonstrate +the benefits of these different training strategies resulting in an average +$F_1=0.661$ and $F_1=0.755$ over the three objects of interest for the 1943 and +1972 imagery, respectively. We also identified that the average size of +Waterhole and Big trees increased while the average size of Omuti homesteads +decreased between 1943 and 1972 reflecting some of the local effects of the +massive post-Second World War economic, agricultural, demographic, and +environmental changes. This work also highlights the untapped potential of +historical aerial photographs in understanding long-term environmental changes +beyond Namibia (and Africa). With the lack of adequate satellite technology in +the past, archival aerial photography offers a great alternative to uncover +decades-long environmental changes. + +
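+ Of the training strategies listed above, class weighting is the most generic; a minimal sketch is shown below with purely illustrative pixel counts and class order, not values from the study.

```python
# Illustrative class-weighted segmentation loss; the counts below are made up.
import torch
import torch.nn as nn

# inverse-frequency weights for [background, waterhole, omuti, big_tree]
pixel_counts = torch.tensor([9.6e7, 2.1e5, 3.4e5, 1.8e5])
weights = pixel_counts.sum() / (len(pixel_counts) * pixel_counts)

criterion = nn.CrossEntropyLoss(weight=weights, ignore_index=255)  # 255 = unlabeled pixels
logits = torch.randn(2, 4, 128, 128)               # (batch, classes, H, W)
target = torch.randint(0, 4, (2, 128, 128))
loss = criterion(logits, target)
print(float(loss))
```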
+
+
+
+
+ + ♻ ☆ EViT: An Eagle Vision Transformer with Bi-Fovea Self-Attention + + +
+ Thanks to the advancement of deep learning technology, vision transformers have demonstrated competitive performance in various computer vision tasks. Unfortunately, vision transformers still face some challenges, such as high computational complexity and the absence of a desirable inductive bias. To alleviate these issues, we propose a novel Bi-Fovea Self-Attention (BFSA) inspired by the physiological structure and visual properties of eagle eyes. This BFSA is used to simulate the shallow and deep foveae of eagle vision, prompting the network to learn feature representations of targets from coarse to fine. Additionally, we design a Bionic Eagle Vision (BEV) block based on BFSA. It combines the advantages of convolution and introduces a novel Bi-Fovea Feedforward Network (BFFN) to mimic the way the biological visual cortex processes information hierarchically and in parallel. Furthermore, we develop a unified and efficient pyramid backbone network family called Eagle Vision Transformers (EViTs) by stacking BEV blocks. Experimental results show that EViTs exhibit highly competitive performance in various computer vision tasks such as image classification, object detection and semantic segmentation. Especially in terms of performance and computational efficiency, EViTs show significant advantages compared with other counterparts. Code is available at https://github.com/nkusyl/EViT + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ OmniMedVQA: A New Large-Scale Comprehensive Evaluation Benchmark for + Medical LVLM + + +
+ Large Vision-Language Models (LVLMs) have demonstrated remarkable capabilities in various multimodal tasks. However, their potential in the medical domain remains largely unexplored. A significant challenge arises from the scarcity of diverse medical images spanning various modalities and anatomical regions, which is essential in real-world medical applications. To solve this problem, in this paper, we introduce OmniMedVQA, a novel comprehensive medical Visual Question Answering (VQA) benchmark. This benchmark is collected from 73 different medical datasets, including 12 different modalities and covering more than 20 distinct anatomical regions. Importantly, all images in this benchmark are sourced from authentic medical scenarios, ensuring alignment with the requirements of the medical field and suitability for evaluating LVLMs. Through our extensive experiments, we have found that existing LVLMs struggle to address these medical VQA problems effectively. Moreover, what surprises us is that medical-specialized LVLMs even exhibit inferior performance to general-domain models, calling for a more versatile and robust LVLM in the biomedical field. The evaluation results not only reveal the current limitations of LVLMs in understanding real medical images but also highlight our dataset's significance. Our code and dataset are available at https://github.com/OpenGVLab/Multi-Modality-Arena. + +
+
+
+
+
+ + ♻ ☆ FakeTracer: Catching Face-swap DeepFakes via Implanting Traces in + Training + + +
+ Face-swap DeepFake is an emerging AI-based face forgery technique that can +replace the original face in a video with a generated face of the target +identity while retaining consistent facial attributes such as expression and +orientation. Due to the high privacy of faces, the misuse of this technique can +raise severe social concerns, drawing tremendous attention to defend against +DeepFakes recently. In this paper, we describe a new proactive defense method +called FakeTracer to expose face-swap DeepFakes via implanting traces in +training. Compared to general face-synthesis DeepFake, the face-swap DeepFake +is more complex as it involves identity change, is subjected to the +encoding-decoding process, and is trained unsupervised, increasing the +difficulty of implanting traces into the training phase. To effectively defend +against face-swap DeepFake, we design two types of traces, sustainable trace +(STrace) and erasable trace (ETrace), to be added to training faces. During the +training, these manipulated faces affect the learning of the face-swap DeepFake +model, enabling it to generate faces that only contain sustainable traces. In +light of these two traces, our method can effectively expose DeepFakes by +identifying them. Extensive experiments corroborate the efficacy of our method +on defending against face-swap DeepFake. + +
+
+
+
+
+ + ♻ ☆ Generalizable Face Landmarking Guided by Conditional Face Warping CVPR 2024 + + +
+ As a significant step for human face modeling, editing, and generation, face +landmarking aims at extracting facial keypoints from images. A generalizable +face landmarker is required in practice because real-world facial images, e.g., +the avatars in animations and games, are often stylized in various ways. +However, achieving generalizable face landmarking is challenging due to the +diversity of facial styles and the scarcity of labeled stylized faces. In this +study, we propose a simple but effective paradigm to learn a generalizable face +landmarker based on labeled real human faces and unlabeled stylized faces. Our +method learns the face landmarker as the key module of a conditional face +warper. Given a pair of real and stylized facial images, the conditional face +warper predicts a warping field from the real face to the stylized one, in +which the face landmarker predicts the ending points of the warping field and +provides us with high-quality pseudo landmarks for the corresponding stylized +facial images. Applying an alternating optimization strategy, we learn the face +landmarker to minimize $i)$ the discrepancy between the stylized faces and the +warped real ones and $ii)$ the prediction errors of both real and pseudo +landmarks. Experiments on various datasets show that our method outperforms +existing state-of-the-art domain adaptation methods in face landmarking tasks, +leading to a face landmarker with better generalizability. Code is available at +https://plustwo0.github.io/project-face-landmarker. + +
+
+ comment: Accepted in CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Large-scale Dataset Pruning with Dynamic Uncertainty + + +
+ The state of the art of many learning tasks, e.g., image classification, is advanced by collecting larger datasets and then training larger models on them. As a result, the increasing computational cost is becoming unaffordable. In this paper, we investigate how to prune large-scale datasets and thus produce an informative subset for training sophisticated deep models with a negligible performance drop. We propose a simple yet effective dataset pruning method by exploring both the prediction uncertainty and the training dynamics. We study dataset pruning by measuring the variation of predictions during the whole training process on large-scale datasets, i.e., ImageNet-1K and ImageNet-21K, and advanced models, i.e., Swin Transformer and ConvNeXt. Extensive experimental results indicate that our method outperforms the state of the art and achieves a 25% lossless pruning ratio on both ImageNet-1K and ImageNet-21K. The code and pruned datasets are available at https://github.com/BAAI-DCAI/Dataset-Pruning. + +
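+ The general recipe of scoring examples by how much their predictions vary across training can be sketched as follows; the scoring rule and the keep ratio are illustrative assumptions rather than the authors' exact method.

```python
# Illustrative dynamic-uncertainty pruning score; not the paper's exact rule.
import numpy as np

# probs[t, i] = predicted probability of example i's true class at checkpoint t
probs = np.random.rand(10, 1000)                 # 10 checkpoints, 1000 examples (toy data)

uncertainty = probs.std(axis=0)                  # per-example variation over training
keep_ratio = 0.75                                # e.g. prune 25% of the examples
n_keep = int(keep_ratio * probs.shape[1])
keep_idx = np.argsort(uncertainty)[-n_keep:]     # keep the most "dynamic" examples
print(keep_idx.shape)                            # (750,)
```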
+
+
+
+
+ + ♻ ☆ PI3D: Efficient Text-to-3D Generation with Pseudo-Image Diffusion CVPR 2024 + + +
+ Diffusion models trained on large-scale text-image datasets have demonstrated +a strong capability of controllable high-quality image generation from +arbitrary text prompts. However, the generation quality and generalization +ability of 3D diffusion models is hindered by the scarcity of high-quality and +large-scale 3D datasets. In this paper, we present PI3D, a framework that fully +leverages the pre-trained text-to-image diffusion models' ability to generate +high-quality 3D shapes from text prompts in minutes. The core idea is to +connect the 2D and 3D domains by representing a 3D shape as a set of Pseudo RGB +Images. We fine-tune an existing text-to-image diffusion model to produce such +pseudo-images using a small number of text-3D pairs. Surprisingly, we find that +it can already generate meaningful and consistent 3D shapes given complex text +descriptions. We further take the generated shapes as the starting point for a +lightweight iterative refinement using score distillation sampling to achieve +high-quality generation under a low budget. PI3D generates a single 3D shape +from text in only 3 minutes and the quality is validated to outperform existing +3D generative models by a large margin. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Revisiting Adversarial Training at Scale CVPR 2024 + + +
+ The machine learning community has witnessed a drastic change in the training pipeline, pivoted by those "foundation models" with unprecedented scales. However, the field of adversarial training is lagging behind, predominantly centered around small model sizes like ResNet-50, and tiny and low-resolution datasets like CIFAR-10. To bridge this transformation gap, this paper provides a modern re-examination of adversarial training, investigating its potential benefits when applied at scale. Additionally, we introduce an efficient and effective training strategy to enable adversarial training with giant models and web-scale data at an affordable computing cost. We denote this newly introduced framework as AdvXL. Empirical results demonstrate that AdvXL establishes new state-of-the-art robust accuracy records under AutoAttack on ImageNet-1K. For example, by training on the DataComp-1B dataset, our AdvXL empowers a vanilla ViT-g model to substantially surpass the previous records of $l_{\infty}$-, $l_{2}$-, and $l_{1}$-robust accuracy by margins of 11.4%, 14.2% and 12.9%, respectively. This achievement posits AdvXL as a pioneering approach, charting a new trajectory for the efficient training of robust visual representations at significantly larger scales. Our code is available at https://github.com/UCSC-VLAA/AdvXL. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ VCC-INFUSE: Towards Accurate and Efficient Selection of Unlabeled + Examples in Semi-supervised Learning IJCAI 2024 + + +
+ Despite the progress of Semi-supervised Learning (SSL), existing methods fail to utilize unlabeled data effectively and efficiently. Many pseudo-label-based methods select unlabeled examples based on inaccurate confidence scores from the classifier. Most prior work also uses all available unlabeled data without pruning, making it difficult to handle large amounts of unlabeled data. To address these issues, we propose two methods: Variational Confidence Calibration (VCC) and Influence-Function-based Unlabeled Sample Elimination (INFUSE). VCC is a universal plugin for SSL confidence calibration, using a variational autoencoder to select more accurate pseudo labels based on three types of consistency scores. INFUSE is a data pruning method that constructs a core dataset of unlabeled examples under SSL. Our methods are effective in multiple datasets and settings, reducing classification error rates and saving training time. Together, VCC-INFUSE reduces the error rate of FlexMatch on the CIFAR-100 dataset by 1.08% while saving nearly half of the training time. + +
+
+ comment: Accepted paper of IJCAI 2024. Shijie Fang and Qianhan Feng + contributed equally to this paper. New version, some problems and typos are + fixed +
+
+
+
+
+ + ♻ ☆ How to Evaluate Semantic Communications for Images with ViTScore Metric? + + +
+ Semantic communications (SC) are expected to bring a new paradigm shift that catalyzes next-generation communication, whose main concern moves from accurate bit transmission to effective semantic information exchange. However, the previous and widely used metrics for images are not applicable for evaluating image semantic similarity in SC. Classical metrics that measure the similarity between two images usually rely on the pixel level or the structural level, such as the PSNR and the MS-SSIM. Straightforwardly using tailored metrics based on deep learning methods from the CV community, such as LPIPS, is infeasible for SC. To tackle this, inspired by BERTScore in the NLP community, we propose a novel metric for evaluating image semantic similarity, named Vision Transformer Score (ViTScore). We prove theoretically that ViTScore has 3 important properties, including symmetry, boundedness, and normalization, which make ViTScore convenient and intuitive for image measurement. To evaluate the performance of ViTScore, we compare ViTScore with 3 typical metrics (PSNR, MS-SSIM, and LPIPS) through 4 classes of experiments: (i) correlation with BERTScore through evaluation of the image captioning downstream CV task, (ii) evaluation in classical image communications, (iii) evaluation in image semantic communication systems, and (iv) evaluation in image semantic communication systems under semantic attack. Experimental results demonstrate that ViTScore is robust and efficient in evaluating the semantic similarity of images. Particularly, ViTScore outperforms the other 3 typical metrics in evaluating image semantic changes caused by semantic attacks, such as image inversion with Generative Adversarial Networks (GANs). This indicates that ViTScore is an effective performance metric when deployed in SC scenarios. + +
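+ A BERTScore-style greedy matching of ViT patch embeddings, which appears to be the core idea, can be sketched as below; this is an interpretation for illustration only, not the official ViTScore code.

```python
# Illustrative BERTScore-like similarity over patch embeddings; not the official ViTScore.
import torch
import torch.nn.functional as F

def vitscore_like(feats_ref: torch.Tensor, feats_cand: torch.Tensor) -> torch.Tensor:
    """feats_*: (N, D) ViT patch embeddings of the reference / candidate image."""
    a = F.normalize(feats_ref, dim=-1)
    b = F.normalize(feats_cand, dim=-1)
    sim = a @ b.T                               # (N, N) pairwise cosine similarities
    recall = sim.max(dim=1).values.mean()       # best match for each reference patch
    precision = sim.max(dim=0).values.mean()    # best match for each candidate patch
    return 2 * precision * recall / (precision + recall)   # F1-style aggregation

score = vitscore_like(torch.randn(196, 768), torch.randn(196, 768))
print(float(score))
```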
+
+
+
+
+ + ♻ ☆ Spiking Structured State Space Model for Monaural Speech Enhancement + + +
+ Speech enhancement seeks to extract clean speech from noisy signals. +Traditional deep learning methods face two challenges: efficiently using +information in long speech sequences and high computational costs. To address +these, we introduce the Spiking Structured State Space Model (Spiking-S4). This +approach merges the energy efficiency of Spiking Neural Networks (SNN) with the +long-range sequence modeling capabilities of Structured State Space Models +(S4), offering a compelling solution. Evaluation on the DNS Challenge and +VoiceBank+Demand Datasets confirms that Spiking-S4 rivals existing Artificial +Neural Network (ANN) methods but with fewer computational resources, as +evidenced by reduced parameters and Floating Point Operations (FLOPs). + +
+
+
+
+
+ + ♻ ☆ UniM-OV3D: Uni-Modality Open-Vocabulary 3D Scene Understanding with + Fine-Grained Feature Representation IJCAI 2024 + + +
+ 3D open-vocabulary scene understanding aims to recognize arbitrary novel categories beyond the base label space. However, existing works not only fail to fully utilize all the available modal information in the 3D domain but also lack sufficient granularity in representing the features of each modality. In this paper, we propose a unified multimodal 3D open-vocabulary scene understanding network, namely UniM-OV3D, which aligns point clouds with images, language and depth. To better integrate global and local features of the point clouds, we design a hierarchical point cloud feature extraction module that learns comprehensive fine-grained feature representations. Further, to facilitate the learning of coarse-to-fine point-semantic representations from captions, we propose the utilization of hierarchical 3D caption pairs, capitalizing on geometric constraints across various viewpoints of 3D scenes. Extensive experimental results demonstrate the effectiveness and superiority of our method in open-vocabulary semantic and instance segmentation, which achieves state-of-the-art performance on both indoor and outdoor benchmarks such as ScanNet, ScanNet200, S3DIS and nuScenes. Code is available at https://github.com/hithqd/UniM-OV3D. + +
+
+ comment: Accepted by IJCAI 2024 +
+
+
+
+
+ + ♻ ☆ EyeFormer: Predicting Personalized Scanpaths with Transformer-Guided + Reinforcement Learning + + +
+ From a visual perception perspective, modern graphical user interfaces (GUIs) +comprise a complex graphics-rich two-dimensional visuospatial arrangement of +text, images, and interactive objects such as buttons and menus. While existing +models can accurately predict regions and objects that are likely to attract +attention ``on average'', so far there is no scanpath model capable of +predicting scanpaths for an individual. To close this gap, we introduce +EyeFormer, which leverages a Transformer architecture as a policy network to +guide a deep reinforcement learning algorithm that controls gaze locations. Our +model has the unique capability of producing personalized predictions when +given a few user scanpath samples. It can predict full scanpath information, +including fixation positions and duration, across individuals and various +stimulus types. Additionally, we demonstrate applications in GUI layout +optimization driven by our model. Our software and models will be publicly +available. + +
+
+
+
+
+ + ♻ ☆ UCM-Net: A Lightweight and Efficient Solution for Skin Lesion + Segmentation using MLP and CNN + + +
+ Skin cancer poses a significant public health challenge, necessitating efficient diagnostic tools. We introduce UCM-Net, a novel skin lesion segmentation model combining Multi-Layer Perceptrons (MLP) and Convolutional Neural Networks (CNN). This lightweight, efficient architecture, deviating from traditional UNet designs, dramatically reduces computational demands, making it ideal for mobile health applications. Evaluated on the PH2, ISIC 2017, and ISIC 2018 datasets, UCM-Net demonstrates robust performance with fewer than 50KB of parameters and requires less than 0.05 giga floating-point operations (GFLOPs). Moreover, its memory requirement is just 1.19 MB in a CPU environment, positioning it as a potential benchmark for efficiency in skin lesion segmentation and making it suitable for deployment in resource-constrained settings. In order to facilitate accessibility and further research in the field, the UCM-Net source code is available at https://github.com/chunyuyuan/UCM-Net. + +
+
+ comment: 17 pages, under review +
+
+
+
+
+ + ♻ ☆ Motion-Guided Dual-Camera Tracker for Low-Cost Skill Evaluation of + Gastric Endoscopy + + +
+ Gastric simulators with objective educational feedback have been proven +useful for endoscopy training. Existing electronic simulators with feedback are +however not commonly adopted due to their high cost. In this work, a +motion-guided dual-camera tracker is proposed to provide reliable endoscope tip +position feedback at a low cost inside a mechanical simulator for endoscopy +skill evaluation, tackling several unique challenges. To address the issue of +significant appearance variation of the endoscope tip while keeping dual-camera +tracking consistency, the cross-camera mutual template strategy (CMT) is +proposed to introduce dynamic transient mutual templates to dual-camera +tracking. To alleviate disturbance from large occlusion and distortion by the +light source from the endoscope tip, the Mamba-based motion-guided prediction +head (MMH) is presented to aggregate historical motion with visual tracking. It +is the first application of Mamba for object tracking. The proposed tracker was +evaluated on datasets captured by low-cost camera pairs during endoscopy +procedures performed inside the mechanical simulator. The tracker achieves SOTA +performance with robust and consistent tracking on dual cameras. Further +downstream evaluation proves that the 3D tip position determined by the +proposed tracker enables reliable skill differentiation. The code and dataset +are available at https://github.com/PieceZhang/MotionDCTrack + +
+
+
+
+
+ + ♻ ☆ Interpretation of Neural Networks is Susceptible to Universal + Adversarial Perturbations + + +
+ Interpreting neural network classifiers using gradient-based saliency maps +has been extensively studied in the deep learning literature. While the +existing algorithms manage to achieve satisfactory performance in application +to standard image recognition datasets, recent works demonstrate the +vulnerability of widely-used gradient-based interpretation schemes to +norm-bounded perturbations adversarially designed for every individual input +sample. However, such adversarial perturbations are commonly designed using the +knowledge of an input sample, and hence perform sub-optimally in application to +an unknown or constantly changing data point. In this paper, we show the +existence of a Universal Perturbation for Interpretation (UPI) for standard +image datasets, which can alter a gradient-based feature map of neural networks +over a significant fraction of test samples. To design such a UPI, we propose a +gradient-based optimization method as well as a principal component analysis +(PCA)-based approach to compute a UPI which can effectively alter a neural +network's gradient-based interpretation on different samples. We support the +proposed UPI approaches by presenting several numerical results of their +successful applications to standard image datasets. + +
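+ The PCA-based construction mentioned above can be sketched roughly as follows, assuming per-sample perturbation directions have already been computed by some gradient-based attack; the shapes and the l-infinity budget are placeholders.

```python
# Rough PCA-style UPI sketch under stated assumptions; not the paper's exact procedure.
import numpy as np

# grads[i] = flattened per-sample direction that distorts the saliency map of example i
grads = np.random.randn(500, 3 * 32 * 32)        # toy stand-in for precomputed directions

grads = grads - grads.mean(axis=0, keepdims=True)
_, _, vt = np.linalg.svd(grads, full_matrices=False)
upi = vt[0]                                      # top principal direction shared across samples
epsilon = 8 / 255                                # illustrative l_inf budget
upi = epsilon * upi / np.abs(upi).max()          # rescale to the budget
print(upi.shape)                                 # (3072,)
```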
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 70 + +
+
+
+ + ☆ Joint Quality Assessment and Example-Guided Image Processing by + Disentangling Picture Appearance from Content + + +
+ The deep learning revolution has strongly impacted low-level image processing +tasks such as style/domain transfer, enhancement/restoration, and visual +quality assessments. Despite often being treated separately, the aforementioned +tasks share a common theme of understanding, editing, or enhancing the +appearance of input images without modifying the underlying content. We +leverage this observation to develop a novel disentangled representation +learning method that decomposes inputs into content and appearance features. +The model is trained in a self-supervised manner and we use the learned +features to develop a new quality prediction model named DisQUE. We demonstrate +through extensive evaluations that DisQUE achieves state-of-the-art accuracy +across quality prediction tasks and distortion types. Moreover, we demonstrate +that the same features may also be used for image processing tasks such as HDR +tone mapping, where the desired output characteristics may be tuned using +example input-output pairs. + +
+
+
+
+
+ + ☆ Deep SE(3)-Equivariant Geometric Reasoning for Precise Placement Tasks ICLR 2024 + + +
+ Many robot manipulation tasks can be framed as geometric reasoning tasks, +where an agent must be able to precisely manipulate an object into a position +that satisfies the task from a set of initial conditions. Often, task success +is defined based on the relationship between two objects - for instance, +hanging a mug on a rack. In such cases, the solution should be equivariant to +the initial position of the objects as well as the agent, and invariant to the +pose of the camera. This poses a challenge for learning systems which attempt +to solve this task by learning directly from high-dimensional demonstrations: +the agent must learn to be both equivariant as well as precise, which can be +challenging without any inductive biases about the problem. In this work, we +propose a method for precise relative pose prediction which is provably +SE(3)-equivariant, can be learned from only a few demonstrations, and can +generalize across variations in a class of objects. We accomplish this by +factoring the problem into learning an SE(3) invariant task-specific +representation of the scene and then interpreting this representation with +novel geometric reasoning layers which are provably SE(3) equivariant. We +demonstrate that our method can yield substantially more precise placement +predictions in simulated placement tasks than previous methods trained with the +same amount of data, and can accurately represent relative placement +relationships data collected from real-world demonstrations. Supplementary +information and videos can be found at +https://sites.google.com/view/reldist-iclr-2023. + +
+
+ comment: Published at International Conference on Representation Learning + (ICLR 2024) +
+
+
+
+
+ + ☆ Composing Pre-Trained Object-Centric Representations for Robotics From + "What" and "Where" Foundation Models ICRA 2024 + + +
+ There have recently been large advances both in pre-training visual +representations for robotic control and segmenting unknown category objects in +general images. To leverage these for improved robot learning, we propose +$\textbf{POCR}$, a new framework for building pre-trained object-centric +representations for robotic control. Building on theories of "what-where" +representations in psychology and computer vision, we use segmentations from a +pre-trained model to stably locate across timesteps, various entities in the +scene, capturing "where" information. To each such segmented entity, we apply +other pre-trained models that build vector descriptions suitable for robotic +control tasks, thus capturing "what" the entity is. Thus, our pre-trained +object-centric representations for control are constructed by appropriately +combining the outputs of off-the-shelf pre-trained models, with no new +training. On various simulated and real robotic tasks, we show that imitation +policies for robotic manipulators trained on POCR achieve better performance +and systematic generalization than state of the art pre-trained representations +for robotics, as well as prior object-centric representations that are +typically trained from scratch. + +
+
+ comment: ICRA 2024. Project website: https://sites.google.com/view/pocr +
+
+
+
+
+ + ☆ Cut-FUNQUE: An Objective Quality Model for Compressed Tone-Mapped High + Dynamic Range Videos + + +
+ High Dynamic Range (HDR) videos have enjoyed a surge in popularity in recent years due to their ability to represent a wider range of contrast and color than Standard Dynamic Range (SDR) videos. Although HDR video capture has seen increasing popularity because of recent flagship mobile phones such as Apple iPhones, Google Pixels, and Samsung Galaxy phones, a broad swath of consumers still utilize legacy SDR displays that are unable to display HDR videos. As a result, HDR videos must be processed, i.e., tone-mapped, before streaming to a large section of SDR-capable video consumers. However, server-side tone-mapping involves automating decisions regarding the choices of tone-mapping operators (TMOs) and their parameters to yield high-fidelity outputs. Moreover, these choices must be balanced against the effects of lossy compression, which is ubiquitous in streaming scenarios. In this work, we develop a novel, efficient model of objective video quality named Cut-FUNQUE that is able to accurately predict the visual quality of tone-mapped and compressed HDR videos. Finally, we evaluate Cut-FUNQUE on a large-scale crowdsourced database of such videos and show that it achieves state-of-the-art accuracy. + +
+
+
+
+
+ + ☆ SiNC+: Adaptive Camera-Based Vitals with Unsupervised Learning of + Periodic Signals CVPR2023 + + +
+ Subtle periodic signals, such as blood volume pulse and respiration, can be +extracted from RGB video, enabling noncontact health monitoring at low cost. +Advancements in remote pulse estimation -- or remote photoplethysmography +(rPPG) -- are currently driven by deep learning solutions. However, modern +approaches are trained and evaluated on benchmark datasets with ground truth +from contact-PPG sensors. We present the first non-contrastive unsupervised +learning framework for signal regression to mitigate the need for labelled +video data. With minimal assumptions of periodicity and finite bandwidth, our +approach discovers the blood volume pulse directly from unlabelled videos. We +find that encouraging sparse power spectra within normal physiological +bandlimits and variance over batches of power spectra is sufficient for +learning visual features of periodic signals. We perform the first experiments +utilizing unlabelled video data not specifically created for rPPG to train +robust pulse rate estimators. Given the limited inductive biases, we +successfully applied the same approach to camera-based respiration by changing +the bandlimits of the target signal. This shows that the approach is general +enough for unsupervised learning of bandlimited quasi-periodic signals from +different domains. Furthermore, we show that the framework is effective for +finetuning models on unlabelled video from a single subject, allowing for +personalized and adaptive signal regressors. + +
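+ The bandlimit and sparsity assumptions described above can be written as simple penalties on the predicted signal's power spectrum; the band edges and loss forms below are one plausible reading, not the authors' exact losses.

```python
# One plausible form of bandlimit/sparsity spectral penalties; not the authors' code.
import torch

def spectral_losses(pred: torch.Tensor, fps: float = 30.0,
                    low_hz: float = 0.66, high_hz: float = 3.0):
    """pred: (B, T) predicted pulse waveforms sampled at the video frame rate."""
    spectrum = torch.fft.rfft(pred, dim=-1).abs() ** 2
    freqs = torch.fft.rfftfreq(pred.shape[-1], d=1.0 / fps)
    in_band = (freqs >= low_hz) & (freqs <= high_hz)
    total = spectrum.sum(dim=-1) + 1e-8
    bandwidth_loss = spectrum[:, ~in_band].sum(dim=-1) / total       # energy outside the pulse band
    in_band_power = spectrum[:, in_band] / total.unsqueeze(-1)
    sparsity_loss = 1.0 - in_band_power.max(dim=-1).values           # reward one dominant peak
    return bandwidth_loss.mean(), sparsity_loss.mean()

bw, sp = spectral_losses(torch.randn(8, 300))                        # 10 s clip at 30 fps
```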
+
+ comment: Extension of CVPR2023 highlight paper. arXiv admin note: substantial + text overlap with arXiv:2303.07944 +
+
+
+
+
+ + ☆ DMesh: A Differentiable Representation for General Meshes + + +
+ We present a differentiable representation, DMesh, for general 3D triangular meshes. DMesh considers both the geometry and connectivity information of a mesh. In our design, we first obtain a set of convex tetrahedra that compactly tessellates the domain based on Weighted Delaunay Triangulation (WDT), and formulate the probability that each face exists on the desired mesh in a differentiable manner based on the WDT. This enables DMesh to represent meshes of various topologies in a differentiable way, and allows us to reconstruct the mesh under various observations, such as point clouds and multi-view images, using gradient-based optimization. The source code and full paper are available at: https://sonsang.github.io/dmesh-project. + +
+
+ comment: 17 pages, 9 figures +
+
+
+
+
+ + ☆ FisheyeDetNet: Object Detection on Fisheye Surround View Camera Systems + for Automated Driving + + +
+ Object detection is a mature problem in autonomous driving, with pedestrian detection being one of the first deployed algorithms. It has been comprehensively studied in the literature. However, object detection is relatively less explored for the fisheye cameras used for surround-view near-field sensing. The standard bounding box representation fails in fisheye cameras due to heavy radial distortion, particularly in the periphery. To mitigate this, we explore extending the standard bounding box object detection output representation. We design rotated bounding box, ellipse, and generic polygon (polar arc/angle) representations and define an instance segmentation mIOU metric to analyze these representations. The proposed model, FisheyeDetNet with the polygon representation, outperforms the others and achieves a mAP score of 49.5% on the Valeo fisheye surround-view dataset for automated driving applications. This dataset has 60K images captured from 4 surround-view cameras across Europe, North America and Asia. To the best of our knowledge, this is the first detailed study on object detection on fisheye cameras for autonomous driving scenarios. + +
+
+
+
+
+ + ☆ High-fidelity Endoscopic Image Synthesis by Utilizing Depth-guided + Neural Surfaces + + +
+ In surgical oncology, screening colonoscopy plays a pivotal role in providing diagnostic assistance, such as biopsy, and facilitating surgical navigation, particularly in polyp detection. Computer-assisted endoscopic surgery has recently gained attention and amalgamated various 3D computer vision techniques, including camera localization, depth estimation, surface reconstruction, etc. Neural Radiance Fields (NeRFs) and Neural Implicit Surfaces (NeuS) have emerged as promising methodologies for deriving accurate 3D surface models from sets of registered images, addressing the limitations of existing colon reconstruction approaches stemming from constrained camera movement. However, the inadequate tissue texture representation and the confused scale problem in monocular colonoscopic image reconstruction still impede the progress of the final rendering results. In this paper, we introduce a novel method for colon section reconstruction by leveraging NeuS applied to endoscopic images, supplemented by a single frame of depth map. Notably, we pioneered the exploration of utilizing only a single-frame depth map in photorealistic reconstruction and neural rendering applications, while this single depth map can be easily obtained from other monocular depth estimation networks with an object scale. Through rigorous experimentation and validation on phantom imagery, our approach demonstrates exceptional accuracy in completely rendering colon sections, even capturing unseen portions of the surface. This breakthrough opens avenues for achieving stable and consistently scaled reconstructions, promising enhanced quality in cancer screening procedures and treatment interventions. + +
+
+
+
+
+ + ☆ Nested-TNT: Hierarchical Vision Transformers with Multi-Scale Feature + Processing + + +
+ Transformer has been applied in the field of computer vision due to its excellent performance in natural language processing, surpassing traditional convolutional neural networks and achieving a new state of the art. ViT divides an image into several local patches, known as "visual sentences". However, the information contained in the image is vast and complex, and focusing only on the features at the "visual sentence" level is not enough. The features between local patches should also be taken into consideration. In order to achieve further improvement, the TNT model was proposed, whose algorithm further divides the image into smaller patches, namely "visual words", achieving more accurate results. The core of Transformer is the Multi-Head Attention mechanism, and traditional attention mechanisms ignore interactions across different attention heads. In order to reduce redundancy and improve utilization, we introduce the nested algorithm and apply the Nested-TNT to image classification tasks. The experiments confirm that the proposed model achieves better classification performance than ViT and TNT, exceeding them by 2.25% and 1.1% on the CIFAR10 dataset and by 2.78% and 0.25% on the FLOWERS102 dataset, respectively. + +
+
+
+
+
+ + ☆ AdvLoRA: Adversarial Low-Rank Adaptation of Vision-Language Models + + +
+ Vision-Language Models (VLMs) are a significant technique for Artificial +General Intelligence (AGI). With the fast growth of AGI, the security problem +becomes one of the most important challenges for VLMs. In this paper, through +extensive experiments, we demonstrate the vulnerability of the conventional +adaptation methods for VLMs, which may bring significant security risks. In +addition, as the size of the VLMs increases, performing conventional +adversarial adaptation techniques on VLMs results in high computational costs. +To solve these problems, we propose a parameter-efficient +\underline{Adv}ersarial adaptation method named \underline{AdvLoRA} by +\underline{Lo}w-\underline{R}ank \underline{A}daptation. First, we +investigate and reveal the intrinsic low-rank property of adversarial +adaptation for VLMs. Different from LoRA, we improve the efficiency and +robustness of adversarial adaptation by designing a novel reparameterizing +method based on parameter clustering and parameter alignment. In addition, an +adaptive parameter update strategy is proposed to further improve the +robustness. With these designs, our proposed AdvLoRA alleviates the problems of +model security and high resource consumption. Extensive experiments demonstrate +the effectiveness and efficiency of AdvLoRA. + +
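+ A minimal sketch of the vanilla low-rank adaptation (LoRA) that AdvLoRA starts
+ from; the clustering- and alignment-based reparameterization and the adaptive
+ update strategy described above are not reproduced, and the rank/alpha values
+ and layer choice are illustrative.
+
+   import torch
+   import torch.nn as nn
+
+   class LoRALinear(nn.Module):
+       """Frozen linear layer plus a trainable low-rank update: W x + (alpha/r) B A x."""
+       def __init__(self, base: nn.Linear, r: int = 8, alpha: float = 16.0):
+           super().__init__()
+           self.base = base
+           for p in self.base.parameters():
+               p.requires_grad = False                    # keep the pre-trained weight frozen
+           self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
+           self.B = nn.Parameter(torch.zeros(base.out_features, r))  # zero init -> no change at start
+           self.scale = alpha / r
+
+       def forward(self, x):
+           return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)
+
+   layer = LoRALinear(nn.Linear(768, 768))
+   out = layer(torch.randn(4, 768))                       # only A and B receive gradients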
+
+
+
+
+ + ☆ NeurCADRecon: Neural Representation for Reconstructing CAD Surfaces by + Enforcing Zero Gaussian Curvature SIGGRAPH 2024 + + +
+ Despite recent advances in reconstructing an organic model with the neural +signed distance function (SDF), the high-fidelity reconstruction of a CAD model +directly from low-quality unoriented point clouds remains a significant +challenge. In this paper, we address this challenge based on the prior +observation that the surface of a CAD model is generally composed of piecewise +surface patches, each approximately developable even around the feature line. +Our approach, named NeurCADRecon, is self-supervised, and its loss includes a +developability term to encourage the Gaussian curvature toward 0 while ensuring +fidelity to the input points. Noticing that the Gaussian curvature is non-zero +at tip points, we introduce a double-trough curve to tolerate the existence of +these tip points. Furthermore, we develop a dynamic sampling strategy to deal +with situations where the given points are incomplete or too sparse. Since our +resulting neural SDFs can clearly manifest sharp feature points/lines, one can +easily extract the feature-aligned triangle mesh from the SDF and then +decompose it into smooth surface patches, greatly reducing the difficulty of +recovering the parametric CAD design. A comprehensive comparison with existing +state-of-the-art methods shows the significant advantage of our approach in +reconstructing faithful CAD shapes. + +
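+ A minimal sketch of the developability idea above: penalize the Gaussian
+ curvature of a neural SDF's zero level set, using the implicit-surface identity
+ K = (g^T adj(H) g) / |g|^4 with g the SDF gradient and H its Hessian. The
+ paper's double-trough curve for tolerating tip points and its dynamic sampling
+ are not reproduced; `sdf` is any differentiable network mapping (N, 3) points to values.
+
+   import torch
+
+   def gaussian_curvature_of_sdf(sdf, x):
+       # K of the zero level set at points x, via gradient g and Hessian H of the SDF
+       x = x.clone().requires_grad_(True)
+       g = torch.autograd.grad(sdf(x).sum(), x, create_graph=True)[0]            # (N, 3)
+       H = torch.stack([torch.autograd.grad(g[:, i].sum(), x, create_graph=True)[0]
+                        for i in range(3)], dim=1)                               # (N, 3, 3)
+       r1, r2, r3 = H[:, 0], H[:, 1], H[:, 2]
+       # adjugate of a 3x3 matrix from cross products of its rows (no inverse needed)
+       adj = torch.stack([torch.cross(r2, r3, dim=-1),
+                          torch.cross(r3, r1, dim=-1),
+                          torch.cross(r1, r2, dim=-1)], dim=1).transpose(1, 2)
+       num = (g.unsqueeze(1) @ adj @ g.unsqueeze(-1)).squeeze(-1).squeeze(-1)
+       return num / g.norm(dim=-1).pow(4).clamp_min(1e-12)
+
+   def developability_loss(sdf, surface_points):
+       # naive L1 penalty driving K toward zero (the paper's tolerant double-trough
+       # penalty for tip points would replace this absolute value)
+       return gaussian_curvature_of_sdf(sdf, surface_points).abs().mean()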
+
+ comment: ACM Transactions on Graphics (SIGGRAPH 2024) +
+
+
+
+
+ + ☆ Efficient and Concise Explanations for Object Detection with + Gaussian-Class Activation Mapping Explainer + + +
+ To address the challenges of providing quick and plausible explanations in +Explainable AI (XAI) for object detection models, we introduce the Gaussian +Class Activation Mapping Explainer (G-CAME). Our method efficiently generates +concise saliency maps by utilizing activation maps from selected layers and +applying a Gaussian kernel to emphasize critical image regions for the +predicted object. Compared with other Region-based approaches, G-CAME +significantly reduces explanation time to 0.5 seconds without compromising the +quality. Our evaluation of G-CAME, using Faster-RCNN and YOLOX on the MS-COCO +2017 dataset, demonstrates its ability to offer highly plausible and faithful +explanations, especially in reducing the bias on tiny object detection. + +
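+ A minimal sketch of the core idea, pairing a Grad-CAM-style map with a Gaussian
+ kernel centered on the detected object; the layer selection, detector interface,
+ sigma and normalization used by G-CAME will differ from this simplified version.
+
+   import torch
+   import torch.nn.functional as F
+
+   def gaussian_weighted_cam(activations, gradients, box_center, image_size, sigma=40.0):
+       # Grad-CAM-style map from one layer, re-weighted by a Gaussian around the detection
+       weights = gradients.mean(dim=(2, 3), keepdim=True)                # (1, C, 1, 1)
+       cam = F.relu((weights * activations).sum(dim=1, keepdim=True))    # (1, 1, h, w)
+       cam = F.interpolate(cam, size=image_size, mode="bilinear", align_corners=False)[0, 0]
+       H, W = image_size
+       ys = torch.arange(H).view(-1, 1).float()
+       xs = torch.arange(W).view(1, -1).float()
+       cx, cy = box_center
+       gauss = torch.exp(-((xs - cx) ** 2 + (ys - cy) ** 2) / (2 * sigma ** 2))
+       cam = cam * gauss                                                 # suppress far-away responses
+       return cam / cam.max().clamp_min(1e-8)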
+
+ comment: Canadian AI 2024 +
+
+
+
+
+ + ☆ AMMUNet: Multi-Scale Attention Map Merging for Remote Sensing Image + Segmentation + + +
+ The advancement of deep learning has driven notable progress in remote +sensing semantic segmentation. Attention mechanisms, while enabling global +modeling and utilizing contextual information, face the challenge of high +computational costs and require window-based operations that weaken the capture of +long-range dependencies, hindering their effectiveness for remote sensing image +processing. In this letter, we propose AMMUNet, a UNet-based framework that +employs multi-scale attention map merging, comprising two key innovations: the +granular multi-head self-attention (GMSA) module and the attention map merging +mechanism (AMMM). GMSA efficiently acquires global information while +substantially mitigating computational costs in contrast to the global multi-head +self-attention mechanism. This is accomplished through the strategic +utilization of dimension correspondence to align granularity and the reduction +of relative position bias parameters, thereby optimizing computational +efficiency. The proposed AMMM effectively combines multi-scale attention maps +into a unified representation using a fixed mask template, enabling the +modeling of a global attention mechanism. Experimental evaluations highlight the +superior performance of our approach, achieving remarkable mean intersection +over union (mIoU) scores of 75.48\% on the challenging Vaihingen dataset and an +exceptional 77.90\% on the Potsdam dataset, demonstrating the superiority of +our method in precise remote sensing semantic segmentation. Codes are available +at https://github.com/interpretty/AMMUNet. + +
+
+
+
+
+ + ☆ HiVG: Hierarchical Multimodal Fine-grained Modulation for Visual + Grounding + + +
+ Visual grounding, which aims to ground a visual region via natural language, +is a task that heavily relies on cross-modal alignment. Existing works utilized +uni-modal pre-trained models to transfer visual/linguistic knowledge separately +while ignoring the multimodal corresponding information. Motivated by recent +advancements in contrastive language-image pre-training and low-rank adaptation +(LoRA) methods, we aim to solve the grounding task based on multimodal +pre-training. However, there exists significant task gaps between pre-training +and grounding. Therefore, to address these gaps, we propose a concise and +efficient hierarchical multimodal fine-grained modulation framework, namely +HiVG. Specifically, HiVG consists of a multi-layer adaptive cross-modal bridge +and a hierarchical multimodal low-rank adaptation (Hi LoRA) paradigm. The +cross-modal bridge can address the inconsistency between visual features and +those required for grounding, and establish a connection between multi-level +visual and text features. Hi LoRA prevents the accumulation of perceptual +errors by adapting the cross-modal features from shallow to deep layers in a +hierarchical manner. Experimental results on five datasets demonstrate the +effectiveness of our approach and showcase the significant grounding +capabilities as well as promising energy efficiency advantages. The project +page: https://github.com/linhuixiao/HiVG. + +
+
+ comment: The project page: https://github.com/linhuixiao/HiVG +
+
+
+
+
+ + ☆ SSVT: Self-Supervised Vision Transformer For Eye Disease Diagnosis Based + On Fundus Images + + +
+ Machine learning-based fundus image diagnosis technologies trigger worldwide +interest owing to their benefits, such as reducing the burden on medical resources and +providing objective evaluation results. However, current methods are commonly +supervised, imposing a heavy annotation workload on biomedical staff +and hence struggling to expand effective databases. To address this issue, in +this article, we established a label-free method, named 'SSVT', which can +automatically analyze unlabeled fundus images and achieve a high evaluation +accuracy of 97.0% on four main eye diseases, based on six public datasets and +two datasets collected by Beijing Tongren Hospital. The promising results +showcase the effectiveness of the proposed unsupervised learning method and +its strong application potential in regions with biomedical resource shortages to +improve global eye health. + +
+
+ comment: ISBI 2024 +
+
+
+
+
+ + ☆ HybridFlow: Infusing Continuity into Masked Codebook for Extreme + Low-Bitrate Image Compression + + +
+ This paper investigates the challenging problem of learned image compression +(LIC) with extremely low bitrates. Previous LIC methods based on transmitting +quantized continuous features often yield blurry and noisy reconstructions due +to the severe quantization loss, while previous LIC methods based on learned +codebooks that discretize the visual space usually give poor-fidelity +reconstructions due to the insufficient representation power of limited +codewords for capturing faithful details. We propose a novel dual-stream +framework, HybridFlow, which combines the continuous-feature-based and +codebook-based streams to achieve both high perceptual quality and high +fidelity under extremely low bitrates. The codebook-based stream benefits from +the high-quality learned codebook priors to provide high quality and clarity in +reconstructed images. The continuous-feature stream aims to maintain +fidelity details. To achieve the ultra-low bitrate, a masked token-based +transformer is further proposed, where we only transmit a masked portion of +codeword indices and recover the missing indices through token generation +guided by information from the continuous feature stream. We also develop a +bridging correction network to merge the two streams in pixel decoding for +final image reconstruction, where the continuous stream features rectify biases +of the codebook-based pixel decoder to recover fidelity details. +Experimental results demonstrate superior performance across several datasets +under extremely low bitrates, compared with existing single-stream +codebook-based or continuous-feature-based LIC methods. + +
+
+
+
+
+ + ☆ Movie101v2: Improved Movie Narration Benchmark + + +
+ Automatic movie narration aims to create video-aligned plot descriptions +to assist visually impaired audiences. It differs from standard video +captioning in that it requires not only describing key visual details but also +inferring the plots developed across multiple movie shots, thus posing unique +and ongoing challenges. To advance the development of automatic movie narrating +systems, we first revisit the limitations of existing datasets and develop a +large-scale, bilingual movie narration dataset, Movie101v2. Second, taking into +account the essential difficulties in achieving applicable movie narration, we +break the long-term goal into three progressive stages and tentatively focus on +the initial stages featuring understanding within individual clips. We also +introduce a new narration assessment to align with our staged task goals. +Third, using our new dataset, we benchmark several leading large vision-language +models, including GPT-4V, and conduct in-depth investigations into the +challenges current models face for movie narration generation. Our findings +reveal that achieving applicable movie narration generation is a fascinating +goal that requires thorough research. + +
+
+
+
+
+ + ☆ Generating Daylight-driven Architectural Design via Diffusion Models + + +
+ In recent years, the rapid development of large-scale models has opened up new +possibilities for interdisciplinary fields such as architecture. In this paper, +we present a novel daylight-driven AI-aided architectural design method. +Firstly, we formulate a massing-model generation method that quickly produces +architectural massing models from random parameters. Subsequently, we +integrate a daylight-driven facade design strategy, accurately determining +window layouts and applying them to the massing models. Finally, we seamlessly +combine a large-scale language model with a text-to-image model, enhancing the +efficiency of generating visual architectural design renderings. Experimental +results demonstrate that our approach supports architects' creative +inspiration and pioneers novel avenues for architectural design development. +Project page: https://zrealli.github.io/DDADesign/. + +
+
+ comment: Project page: https://zrealli.github.io/DDADesign/ +
+
+
+
+
+ + ☆ Hyperspectral Anomaly Detection with Self-Supervised Anomaly Prior + + +
+ The majority of existing hyperspectral anomaly detection (HAD) methods use +the low-rank representation (LRR) model to separate the background and anomaly +components, where the anomaly component is optimized by handcrafted sparse +priors (e.g., $\ell_{2,1}$-norm). However, this may not be ideal since they +overlook the spatial structure present in anomalies and make the detection +result largely dependent on manually set sparsity. To tackle these problems, we +redefine the optimization criterion for the anomaly component in the LRR model +with a self-supervised network called self-supervised anomaly prior (SAP). This +prior is obtained by the pretext task of self-supervised learning, which is +customized to learn the characteristics of hyperspectral anomalies. +Specifically, this pretext task is a classification task to distinguish the +original hyperspectral image (HSI) and the pseudo-anomaly HSI, where the +pseudo-anomaly is generated from the original HSI and designed as a prism with +arbitrary polygon bases and arbitrary spectral bands. In addition, a +dual-purified strategy is proposed to provide a more refined background +representation with an enriched background dictionary, facilitating the +separation of anomalies from complex backgrounds. Extensive experiments on +various hyperspectral datasets demonstrate that the proposed SAP offers a more +accurate and interpretable solution than other advanced HAD methods. + +
+
+
+
+
+ + ☆ SEGSRNet for Stereo-Endoscopic Image Super-Resolution and Surgical + Instrument Segmentation + + +
+ SEGSRNet addresses the challenge of precisely identifying surgical +instruments in low-resolution stereo endoscopic images, a common issue in +medical imaging and robotic surgery. Our innovative framework enhances image +clarity and segmentation accuracy by applying state-of-the-art super-resolution +techniques before segmentation. This ensures higher-quality inputs for more +precise segmentation. SEGSRNet combines advanced feature extraction and +attention mechanisms with spatial processing to sharpen image details, which is +significant for accurate tool identification in medical images. Our proposed +model outperforms current models on metrics including Dice, IoU, PSNR, and SSIM, +producing clearer and more accurate images for stereo endoscopic +surgical imaging. SEGSRNet can provide higher image resolution and precise +segmentation, which can significantly enhance surgical accuracy and patient care +outcomes. + +
+
+ comment: Paper accepted for Presentation in 46th Annual International + Conference of the IEEE Engineering in Medicine and Biology Society (EMBS), + Orlando, Florida, USA +
+
+
+
+
+ + ☆ Collaborative Visual Place Recognition through Federated Learning CVPR + + +
+ Visual Place Recognition (VPR) aims to estimate the location of an image by +treating it as a retrieval problem. VPR uses a database of geo-tagged images +and leverages deep neural networks to extract a global representation, called +descriptor, from each image. While the training data for VPR models often +originates from diverse, geographically scattered sources (geo-tagged images), +the training process itself is typically assumed to be centralized. This +research revisits the task of VPR through the lens of Federated Learning (FL), +addressing several key challenges associated with this adaptation. VPR data +inherently lacks well-defined classes, and models are typically trained using +contrastive learning, which necessitates a data mining step on a centralized +database. Additionally, client devices in federated systems can be highly +heterogeneous in terms of their processing capabilities. The proposed FedVPR +framework not only presents a novel approach for VPR but also introduces a new, +challenging, and realistic task for FL research, paving the way to other image +retrieval tasks in FL. + +
+
+ comment: 13 pages, 7 figures, CVPR - The 3rd International Workshop on + Federated Learning for Computer Vision (FedVision-2024) +
+
+
+
+
+ + ☆ Pixel is a Barrier: Diffusion Models Are More Adversarially Robust Than + We Think + + +
+ Adversarial examples for diffusion models are widely used as solutions for +safety concerns. By adding adversarial perturbations to personal images, +attackers cannot edit or imitate them easily. However, it is essential to note +that all these protections target latent diffusion models (LDMs), while +adversarial examples for diffusion models in the pixel space (PDMs) are largely +overlooked. This may mislead us to think that the diffusion models are +vulnerable to adversarial attacks like most deep models. In this paper, we show +a novel finding: even though gradient-based white-box attacks can be used +to attack LDMs, they fail to attack PDMs. This finding is supported by +extensive experiments with a wide range of attack methods on various +PDMs and LDMs with different model structures, which means that diffusion models are +indeed much more robust against adversarial attacks. We also find that PDMs can +be used as an off-the-shelf purifier to effectively remove the adversarial +patterns that were generated on LDMs to protect the images, which means that +most protection methods nowadays, to some extent, cannot protect our images +from malicious attacks. We hope that our insights will inspire the community to +rethink the adversarial samples for diffusion models as protection methods and +move forward to more effective protection. Codes are available at +https://github.com/xavihart/PDM-Pure. + +
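+ A minimal sketch of the kind of gradient-based attack being evaluated: L_inf PGD
+ that maximizes a pixel-space diffusion model's denoising loss. The attributes
+ `num_timesteps`, `q_sample` and `eps_model` are assumed interfaces of a
+ hypothetical DDPM wrapper, not a specific library API, and the step sizes are illustrative.
+
+   import torch
+
+   def pgd_on_pixel_diffusion(model, x, steps=40, eps=8 / 255, alpha=2 / 255):
+       # maximize the epsilon-prediction (denoising) loss w.r.t. the input image
+       x_adv = x.clone().detach()
+       for _ in range(steps):
+           x_adv.requires_grad_(True)
+           t = torch.randint(0, model.num_timesteps, (x.shape[0],), device=x.device)
+           noise = torch.randn_like(x_adv)
+           x_t = model.q_sample(x_adv, t, noise)               # forward diffusion (assumed API)
+           loss = torch.nn.functional.mse_loss(model.eps_model(x_t, t), noise)
+           grad = torch.autograd.grad(loss, x_adv)[0]
+           x_adv = (x_adv + alpha * grad.sign()).detach()
+           x_adv = (x + (x_adv - x).clamp(-eps, eps)).clamp(0, 1).detach()  # project to L_inf ball
+       return x_adv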
+
+
+
+
+ + ☆ STAT: Towards Generalizable Temporal Action Localization + + +
+ Weakly-supervised temporal action localization (WTAL) aims to recognize and +localize action instances with only video-level labels. Despite the significant +progress, existing methods suffer from severe performance degradation when +transferring to different distributions and thus may hardly adapt to real-world +scenarios. To address this problem, we propose the Generalizable Temporal +Action Localization task (GTAL), which focuses on improving the +generalizability of action localization methods. We observed that the +performance decline can be primarily attributed to the lack of generalizability +to different action scales. To address this problem, we propose STAT +(Self-supervised Temporal Adaptive Teacher), which leverages a teacher-student +structure for iterative refinement. Our STAT features a refinement module and +an alignment module. The former iteratively refines the model's output by +leveraging contextual information and helps adapt to the target scale. The +latter improves the refinement process by promoting a consensus between student +and teacher models. We conduct extensive experiments on three datasets, +THUMOS14, ActivityNet1.2, and HACS, and the results show that our method +significantly improves over the baseline methods under the cross-distribution +evaluation setting, even approaching the same-distribution evaluation +performance. + +
+
+ comment: 14 pages, LaTeX; +
+
+
+
+
+ + ☆ FakeBench: Uncover the Achilles' Heels of Fake Images with Large + Multimodal Models + + +
+ Recently, fake images generated by artificial intelligence (AI) models have +become indistinguishable from real ones, posing new challenges for fake image +detection models. In this context, simple binary judgments of real or fake seem +less convincing and credible due to the absence of human-understandable +explanations. Fortunately, Large Multimodal Models (LMMs) bring the possibility +of making the judgment process explicit, though their performance remains +undetermined. Therefore, we propose FakeBench, a first-of-its-kind benchmark +towards transparent defake, consisting of fake images with human-language +descriptions of forgery signs. FakeBench probes two open questions about LMMs: +(1) can LMMs distinguish fake images generated by AI, and (2) how do LMMs +distinguish fake images? Specifically, we construct the FakeClass dataset with +6k diverse-sourced fake and real images, each equipped with a Question&Answer +pair concerning the authenticity of images, which are utilized to benchmark the +detection ability. To examine the reasoning and interpretation abilities of +LMMs, we present the FakeClue dataset, consisting of 15k pieces of descriptions +of the telltale clues revealing the falsification of fake images. Besides, we +construct FakeQA to measure the LMMs' open-question answering ability on +fine-grained authenticity-relevant aspects. Our experimental results show +that current LMMs possess moderate identification ability, preliminary +interpretation and reasoning ability, and passable open-question answering +ability for image defake. The FakeBench will be made publicly available soon. + +
+
+
+
+
+ + ☆ PCQA: A Strong Baseline for AIGC Quality Assessment Based on Prompt + Condition CVPR-2024 + + +
+ The development of Large Language Models (LLMs) and Diffusion Models has driven +the boom of Artificial Intelligence Generated Content (AIGC). It is essential +to build an effective quality assessment framework to provide a quantifiable +evaluation of different images or videos based on the AIGC technologies. The +content generated by AIGC methods is driven by the crafted prompts. Therefore, +it is intuitive that the prompts can also serve as the foundation of the AIGC +quality assessment. This study proposes an effective AIGC quality assessment +(QA) framework. First, we propose a hybrid prompt encoding method based on a +dual-source CLIP (Contrastive Language-Image Pre-Training) text encoder to +understand and respond to the prompt conditions. Second, we propose an +ensemble-based feature mixer module to effectively blend the adapted prompt and +vision features. The empirical study is conducted on two datasets: AIGIQA-20K +(AI-Generated Image Quality Assessment database) and T2VQA-DB (Text-to-Video +Quality Assessment DataBase), which validates the effectiveness of our proposed +method: Prompt Condition Quality Assessment (PCQA). Our proposed simple and +feasible framework may promote research development in the multimodal +generation field. + +
+
+ comment: Published in CVPR-2024's NTIRE: New Trends in Image Restoration and + Enhancement workshop and challenges +
+
+
+
+
+ + ☆ PoseINN: Realtime Visual-based Pose Regression and Localization with + Invertible Neural Networks + + +
+ Estimating ego-pose from cameras is an important problem in robotics with +applications ranging from mobile robotics to augmented reality. While SOTA +models are becoming increasingly accurate, they can still be unwieldy due to +high computational costs. In this paper, we propose to solve the problem by +using invertible neural networks (INN) to find the mapping between the latent +space of images and poses for a given scene. Our model achieves similar +performance to the SOTA while being faster to train and only requiring offline +rendering of low-resolution synthetic data. By using normalizing flows, the +proposed method also provides uncertainty estimation for the output. We also +demonstrated the efficiency of this method by deploying the model on a mobile +robot. + +
+
+
+
+
+ + ☆ Wills Aligner: A Robust Multi-Subject Brain Representation Learner + + +
+ Decoding visual information from human brain activity has seen remarkable +advancements in recent research. However, due to the significant variability in +cortical parcellation and cognition patterns across subjects, current +approaches personalize deep models for each subject, constraining the +practicality of this technology in real-world contexts. To tackle these +challenges, we introduce Wills Aligner, a robust multi-subject brain +representation learner. Our Wills Aligner initially aligns different subjects' +brains at the anatomical level. Subsequently, it incorporates a mixture of +brain experts to learn individual cognition patterns. Additionally, it +decouples the multi-subject learning task into a two-stage training, propelling +the deep model and its plugin network to learn inter-subject commonality +knowledge and various cognition patterns, respectively. Wills Aligner enables +us to overcome anatomical differences and to efficiently leverage a single +model for multi-subject brain representation learning. We meticulously evaluate +the performance of our approach across coarse-grained and fine-grained visual +decoding tasks. The experimental results demonstrate that our Wills Aligner +achieves state-of-the-art performance. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Beyond Score Changes: Adversarial Attack on No-Reference Image Quality + Assessment from Two Perspectives + + +
+ Deep neural networks have demonstrated impressive success in No-Reference +Image Quality Assessment (NR-IQA). However, recent research highlights the +vulnerability of NR-IQA models to subtle adversarial perturbations, leading to +inconsistencies between model predictions and subjective ratings. Current +adversarial attacks, however, focus on perturbing predicted scores of +individual images, neglecting the crucial aspect of inter-score correlation +relationships within an entire image set. Meanwhile, it is important to note +that correlation measures, such as ranking correlation, play a significant role in +NR-IQA tasks. To comprehensively explore the robustness of NR-IQA models, we +introduce a new framework of correlation-error-based attacks that perturb both +the correlation within an image set and score changes on individual images. Our +research primarily focuses on ranking-related correlation metrics like +Spearman's Rank-Order Correlation Coefficient (SROCC) and prediction +error-related metrics like Mean Squared Error (MSE). As an instantiation, we +propose a practical two-stage SROCC-MSE-Attack (SMA) that initially optimizes +target attack scores for the entire image set and then generates adversarial +examples guided by these scores. Experimental results demonstrate that our SMA +method not only significantly disrupts the SROCC to negative values but also +maintains a considerable change in the scores of individual images. Meanwhile, +it exhibits state-of-the-art performance across metrics of different +categories. Our method provides a new perspective on the robustness of NR-IQA +models. + +
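+ A minimal sketch of the first stage described above: choosing per-image target
+ scores that drive the set-level SROCC with the subjective ratings negative.
+ The second stage (perturbing each image until the NR-IQA model outputs its
+ target score, e.g. with PGD on a squared-error objective) and SMA's actual
+ optimization are not reproduced.
+
+   import numpy as np
+   from scipy.stats import spearmanr
+
+   def choose_target_scores(pred_scores, mos):
+       # rank each image by its subjective rating (MOS), then hand the largest
+       # predicted score to the lowest-rated image, and so on -> reversed ranking
+       order = np.argsort(np.argsort(mos))
+       reversed_scores = np.sort(pred_scores)[::-1]
+       targets = reversed_scores[order]
+       print("target SROCC vs. MOS:", spearmanr(targets, mos).correlation)  # close to -1
+       return targets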
+
+ comment: Submitted to a conference +
+
+
+
+
+ + ☆ Multi-feature Reconstruction Network using Crossed-mask Restoration for + Unsupervised Anomaly Detection + + +
+ Unsupervised anomaly detection using only normal samples is of great +significance for quality inspection in industrial manufacturing. Although +existing reconstruction-based methods have achieved promising results, they +still face two problems: poorly distinguishable information in the image +reconstruction, and faithful regeneration of abnormal regions caused by the model's +over-generalization ability. To overcome the above issues, we convert the image +reconstruction into a combination of parallel feature restorations and propose +a multi-feature reconstruction network, MFRNet, using crossed-mask restoration +in this paper. Specifically, a multi-scale feature aggregator is first +developed to generate more discriminative hierarchical representations of the +input images from a pre-trained model. Subsequently, a crossed-mask generator +is adopted to randomly cover the extracted feature map, followed by a +restoration network based on the transformer structure for high-quality repair +of the missing regions. Finally, a hybrid loss that considers both pixel +and structural similarity is employed to guide model training and anomaly +estimation. Extensive experiments show that our method is highly +competitive with or significantly outperforms other state-of-the-art methods on four +publicly available datasets and one self-made dataset. + +
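+ A minimal sketch of the crossed-mask idea: two complementary patch-level masks
+ over a feature map, so every location is hidden in exactly one restoration pass.
+ The transformer-based restorer and the hybrid loss are not shown, and the patch
+ size and masking ratio are illustrative.
+
+   import torch
+
+   def crossed_masks(h, w, patch=8, ratio=0.5, device="cpu"):
+       # complementary binary masks defined on a coarse patch grid
+       gh, gw = h // patch, w // patch
+       keep = (torch.rand(gh, gw, device=device) < ratio)
+       m1 = keep.repeat_interleave(patch, 0).repeat_interleave(patch, 1).float()
+       return m1, 1.0 - m1
+
+   feat = torch.randn(1, 256, 32, 32)                 # features from a pre-trained backbone
+   m1, m2 = crossed_masks(32, 32)
+   masked_1, masked_2 = feat * m1, feat * m2          # each pass restores the other's hidden part
+   # restored = restorer(masked_1); the restoration error then serves as the anomaly score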
+
+
+
+
+ + ☆ StrideNET: Swin Transformer for Terrain Recognition with Dynamic + Roughness Extraction + + +
+ Advancements in deep learning are revolutionizing the classification of +remote-sensing images. Transformer-based architectures, utilizing +self-attention mechanisms, have emerged as alternatives to conventional +convolution methods, enabling the capture of long-range dependencies along with +global relationships in the image. Motivated by these advancements, this paper +presents StrideNET, a novel dual-branch architecture designed for terrain +recognition and implicit properties estimation. The terrain recognition branch +utilizes the Swin Transformer, leveraging its hierarchical representation and +low computational cost to efficiently capture both local and global features. +The terrain properties branch focuses on the extraction of surface properties +such as roughness and slipperiness using a statistical texture analysis method. +By computing surface terrain properties, an enhanced environmental perception +can be obtained. The StrideNET model is trained on a dataset comprising four +target terrain classes: Grassy, Marshy, Sandy, and Rocky. StrideNET attains +competitive performance compared to contemporary methods. The implications of +this work extend to various applications, including environmental monitoring, +land use and land cover (LULC) classification, disaster response, precision +agriculture, and much more. + +
+
+
+
+
+ + ☆ Multi-Cell Decoder and Mutual Learning for Table Structure and Character + Recognition ICDAR 2024 + + +
+ Extracting table contents from documents such as scientific papers and +financial reports and converting them into a format that can be processed by +large language models is an important task in knowledge information processing. +End-to-end approaches, which recognize not only table structure but also cell +contents, achieved performance comparable to state-of-the-art models using +external character recognition systems, and have potential for further +improvements. In addition, these models can now recognize long tables with +hundreds of cells by introducing local attention. However, the models recognize +table structure in one direction from the header to the footer, and cell +content recognition is performed independently for each cell, so there is no +opportunity to retrieve useful information from the neighbor cells. In this +paper, we propose a multi-cell content decoder and bidirectional mutual +learning mechanism to improve the end-to-end approach. The effectiveness is +demonstrated on two large datasets, and the experimental results show +comparable performance to state-of-the-art models, even for long tables with +large numbers of cells. + +
+
+ comment: ICDAR 2024 +
+
+
+
+
+ + ☆ FilterPrompt: Guiding Image Transfer in Diffusion Models + + +
+ In controllable generation tasks, flexibly manipulating the generated images +to attain a desired appearance or structure based on a single input image cue +remains a critical and longstanding challenge. Achieving this requires the +effective decoupling of key attributes within the input image data, aiming to +obtain accurate representations. Previous research has predominantly +concentrated on disentangling image attributes within feature space. However, +the complex distribution present in real-world data often makes the application +of such decoupling algorithms to other datasets challenging. Moreover, the +granularity of control over feature encoding frequently fails to meet specific +task requirements. Upon scrutinizing the characteristics of various generative +models, we have observed that the input sensitivity and dynamic evolution +properties of the diffusion model can be effectively fused with the explicit +decomposition operation in pixel space. This integration enables image +processing operations to be performed in pixel space on a specific feature +distribution of the input image, achieving the desired control effect in +the generated results. Therefore, we propose FilterPrompt, an approach to +enhance the model control effect. It can be universally applied to any +diffusion model, allowing users to adjust the representation of specific image +features in accordance with task requirements, thereby facilitating more +precise and controllable generation outcomes. In particular, our designed +experiments demonstrate that FilterPrompt optimizes feature correlation, +mitigates content conflicts during the generation process, and enhances the +model's control capability. + +
+
+
+
+
+ + ☆ 3D-Convolution Guided Spectral-Spatial Transformer for Hyperspectral + Image Classification + + +
+ In recent years, Vision Transformers (ViTs) have shown promising +classification performance over Convolutional Neural Networks (CNNs) due to +their self-attention mechanism. Many researchers have incorporated ViTs for +Hyperspectral Image (HSI) classification. HSIs are characterised by narrow +contiguous spectral bands, providing rich spectral data. Although ViTs excel +with sequential data, they cannot extract spectral-spatial information like +CNNs. Furthermore, to have high classification performance, there should be a +strong interaction between the HSI token and the class (CLS) token. To solve +these issues, we propose a 3D-Convolution guided Spectral-Spatial Transformer +(3D-ConvSST) for HSI classification that utilizes a 3D-Convolution Guided +Residual Module (CGRM) in-between encoders to "fuse" the local spatial and +spectral information and to enhance the feature propagation. Furthermore, we +forego the class token and instead apply Global Average Pooling, which +effectively encodes more discriminative and pertinent high-level features for +classification. Extensive experiments have been conducted on three public HSI +datasets to show the superiority of the proposed model over state-of-the-art +traditional, convolutional, and Transformer models. The code is available at +https://github.com/ShyamVarahagiri/3D-ConvSST. + +
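+ A minimal sketch of the head described above: classifying from Global Average
+ Pooling over patch tokens instead of a class token. Dimensions and the number of
+ classes are illustrative, and the 3D-Convolution Guided Residual Module is not shown.
+
+   import torch
+   import torch.nn as nn
+
+   class GAPHead(nn.Module):
+       """Classify from the mean of all patch tokens; no [CLS] token is used."""
+       def __init__(self, dim=256, num_classes=16):
+           super().__init__()
+           self.norm = nn.LayerNorm(dim)
+           self.fc = nn.Linear(dim, num_classes)
+
+       def forward(self, tokens):          # tokens: (B, N, dim) spectral-spatial tokens
+           return self.fc(self.norm(tokens.mean(dim=1)))
+
+   logits = GAPHead()(torch.randn(2, 64, 256))   # -> (2, 16)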
+
+ comment: Accepted in IEEE Conference on Artificial Intelligence, 2024 +
+
+
+
+
+ + ☆ Beyond Pixel-Wise Supervision for Medical Image Segmentation: From + Traditional Models to Foundation Models + + +
+ Medical image segmentation plays an important role in many image-guided +clinical approaches. However, existing segmentation algorithms mostly rely on +the availability of fully annotated images with pixel-wise annotations for +training, which can be both labor-intensive and expertise-demanding, especially +in the medical imaging domain where only experts can provide reliable and +accurate annotations. To alleviate this challenge, there has been a growing +focus on developing segmentation methods that can train deep models with weak +annotations, such as image-level labels, bounding boxes, scribbles, and points. The +emergence of vision foundation models, notably the Segment Anything Model +(SAM), has introduced innovative capabilities for segmentation tasks using weak +annotations, with promptable segmentation enabled by large-scale pre-training. +Adopting foundation models together with traditional learning methods has +increasingly gained interest in the research community and shown potential for +real-world applications. In this paper, we present a comprehensive survey of +recent progress on annotation-efficient learning for medical image segmentation +utilizing weak annotations before and in the era of foundation models. +Furthermore, we analyze and discuss several challenges of existing approaches, +which we believe will provide valuable guidance for shaping the trajectory of +foundational models to further advance the field of medical image segmentation. + +
+
+
+
+
+ + ☆ PAFedFV: Personalized and Asynchronous Federated Learning for Finger + Vein Recognition + + +
+ With the increasing emphasis on user privacy protection, biometric +recognition based on federated learning has become the latest research +hotspot. However, traditional federated learning methods cannot be directly +applied to finger vein recognition due to the heterogeneity of the data and the +open-set verification setting. Therefore, only a few application cases have been +proposed, and these methods still have two drawbacks. (1) A uniform model results in poor +performance on some clients, as the finger vein data is highly heterogeneous +and non-Independently Identically Distributed (non-IID). (2) On individual +clients, a large amount of time is underutilized, such as the time spent waiting for +the model to be returned from the server. To address these problems, this paper proposes a +Personalized and Asynchronous Federated Learning for Finger Vein Recognition +(PAFedFV) framework. PAFedFV designs a personalized model aggregation method to +address the heterogeneity of non-IID data. Meanwhile, it employs an +asynchronous training module so that clients can utilize their waiting time. +Finally, extensive experiments on six finger vein datasets are conducted. Based +on these experimental results, the impact of non-IID finger vein data on the +performance of federated learning is analyzed, and the superiority of PAFedFV +in accuracy and robustness is demonstrated. + +
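+ A minimal sketch of personalized aggregation in the federated setting: each
+ client receives its own weighted average of all clients' parameters instead of
+ one uniform global model. The weighting rule here is a placeholder assumption,
+ not PAFedFV's actual aggregation, and the asynchronous training module is not shown.
+
+   import copy
+   import torch
+
+   @torch.no_grad()
+   def personalized_aggregate(client_states, weights_per_client):
+       # client_states: list of state_dicts; weights_per_client: one weight vector
+       # per client (each summing to 1), e.g. emphasizing clients with similar data
+       personalized = []
+       for weights in weights_per_client:
+           agg = copy.deepcopy(client_states[0])
+           for key in agg:
+               agg[key] = sum(w * state[key].float()
+                              for w, state in zip(weights, client_states))
+           personalized.append(agg)                 # one aggregated model per client
+       return personalized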
+
+
+
+
+ + ☆ Vim4Path: Self-Supervised Vision Mamba for Histopathology Images CVPR2023 + + +
+ Representation learning from Gigapixel Whole Slide Images (WSI) poses a +significant challenge in computational pathology due to the complicated nature +of tissue structures and the scarcity of labeled data. Multi-instance learning +methods have addressed this challenge, leveraging image patches to classify +slides utilizing pretrained models using Self-Supervised Learning (SSL) +approaches. The performance of both SSL and MIL methods relies on the +architecture of the feature encoder. This paper proposes leveraging the Vision +Mamba (Vim) architecture, inspired by state space models, within the DINO +framework for representation learning in computational pathology. We evaluate +the performance of Vim against Vision Transformers (ViT) on the Camelyon16 +dataset for both patch-level and slide-level classification. Our findings +highlight Vim's enhanced performance compared to ViT, particularly at smaller +scales, where Vim achieves an 8.21 increase in ROC AUC for models of similar +size. An explainability analysis further highlights Vim's capabilities, which +reveals that Vim uniquely emulates the pathologist workflow-unlike ViT. This +alignment with human expert analysis highlights Vim's potential in practical +diagnostic settings and contributes significantly to developing effective +representation-learning algorithms in computational pathology. We release the +codes and pretrained weights at +\url{https://github.com/AtlasAnalyticsLab/Vim4Path}. + +
+
+ comment: Accepted in CVPR2023 (9th Workshop on Computer Vision for Microscopy + Image Analysis) +
+
+
+
+
+ + ☆ Optimizing Contrail Detection: A Deep Learning Approach with + EfficientNet-b4 Encoding + + +
+ In the pursuit of environmental sustainability, the aviation industry faces +the challenge of minimizing its ecological footprint. Among the key solutions +is contrail avoidance, targeting the linear ice-crystal clouds produced by +aircraft exhaust. These contrails exacerbate global warming by trapping +atmospheric heat, necessitating precise segmentation and comprehensive analysis +of contrail images to gauge their environmental impact. However, this +segmentation task is complex due to the varying appearances of contrails under +different atmospheric conditions and potential misalignment issues in +predictive modeling. This paper presents an innovative deep-learning approach +utilizing the EfficientNet-b4 encoder for feature extraction, seamlessly +integrating misalignment correction, soft labeling, and pseudo-labeling +techniques to enhance the accuracy and efficiency of contrail detection in +satellite imagery. The proposed methodology aims to redefine contrail image +analysis and contribute to the objectives of sustainable aviation by providing +a robust framework for precise contrail detection and analysis in satellite +imagery, thus aiding in the mitigation of aviation's environmental impact. + +
+
+
+
+
+ + ♻ ☆ ELODI: Ensemble Logit Difference Inhibition for Positive-Congruent + Training + + +
+ Negative flips are errors introduced in a classification system when a legacy +model is updated. Existing methods to reduce the negative flip rate (NFR) +either do so at the expense of overall accuracy by forcing a new model to +imitate the old models, or use ensembles, which multiply inference cost +prohibitively. We analyze the role of ensembles in reducing NFR and observe +that they remove negative flips that are typically not close to the decision +boundary, but often exhibit large deviations in the distance among their +logits. Based on the observation, we present a method, called Ensemble Logit +Difference Inhibition (ELODI), to train a classification system that achieves +paragon performance in both error rate and NFR, at the inference cost of a +single model. The method distills a homogeneous ensemble to a single student +model which is used to update the classification system. ELODI also introduces +a generalized distillation objective, Logit Difference Inhibition (LDI), which +only penalizes the logit difference of a subset of classes with the highest +logit values. On multiple image classification benchmarks, model updates with +ELODI demonstrate superior accuracy retention and NFR reduction. + +
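+ A minimal sketch of a Logit Difference Inhibition-style distillation term:
+ penalize student-vs-ensemble logit differences only on the classes with the
+ highest ensemble logits. This is a simplified reading of LDI; the full ELODI
+ training recipe and the choice of k are not reproduced.
+
+   import torch
+
+   def ldi_loss(student_logits, ensemble_logits, k=10):
+       # restrict the distillation penalty to the ensemble's top-k classes per sample
+       topk = ensemble_logits.topk(k, dim=1).indices                       # (B, k)
+       diff = student_logits.gather(1, topk) - ensemble_logits.gather(1, topk)
+       return diff.pow(2).mean()
+
+   loss = ldi_loss(torch.randn(8, 1000), torch.randn(8, 1000))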
+
+ comment: Accepted as a Regular Paper in TPAMI. Code is at + https://github.com/amazon-science/regression-constraint-model-upgrade +
+
+
+
+
+ + ♻ ☆ Visual Whole-Body Control for Legged Loco-Manipulation + + +
+ We study the problem of mobile manipulation using legged robots equipped with +an arm, namely legged loco-manipulation. The robot legs, while usually utilized +for mobility, offer an opportunity to amplify the manipulation capabilities by +conducting whole-body control. That is, the robot can control the legs and the +arm at the same time to extend its workspace. We propose a framework that can +conduct the whole-body control autonomously with visual observations. Our +approach, namely Visual Whole-Body Control(VBC), is composed of a low-level +policy using all degrees of freedom to track the end-effector manipulator +position and a high-level policy proposing the end-effector position based on +visual inputs. We train both levels of policies in simulation and perform +Sim2Real transfer for real robot deployment. We perform extensive experiments +and show significant improvements over baselines in picking up diverse objects +in different configurations (heights, locations, orientations) and +environments. Project page: https://wholebody-b1.github.io + +
+
+ comment: Add more details. The first two authors contribute equally. Project + page: https://wholebody-b1.github.io +
+
+
+
+
+ + ♻ ☆ Semi-Supervised Crowd Counting with Contextual Modeling: Facilitating + Holistic Understanding of Crowd Scenes + + +
+ To alleviate the heavy annotation burden for training a reliable crowd +counting model and thus make the model more practicable and accurate by being +able to benefit from more data, this paper presents a new semi-supervised +method based on the mean teacher framework. When there is a scarcity of labeled +data available, the model is prone to overfit local patches. Within such +contexts, the conventional approach of solely improving the accuracy of local +patch predictions through unlabeled data proves inadequate. Consequently, we +propose a more nuanced approach: fostering the model's intrinsic 'subitizing' +capability. This ability allows the model to accurately estimate the count in +regions by leveraging its understanding of the crowd scenes, mirroring the +human cognitive process. To achieve this goal, we apply masking on unlabeled +data, guiding the model to make predictions for these masked patches based on +the holistic cues. Furthermore, to help with feature learning, herein we +incorporate a fine-grained density classification task. Our method is general +and applicable to most existing crowd counting methods as it doesn't have +strict structural or loss constraints. In addition, we observe that the model +trained with our framework exhibits a 'subitizing'-like behavior. It accurately +predicts low-density regions with only a 'glance', while incorporating local +details to predict high-density regions. Our method achieves the +state-of-the-art performance, surpassing previous approaches by a large margin +on challenging benchmarks such as ShanghaiTech A and UCF-QNRF. The code is +available at: https://github.com/cha15yq/MRC-Crowd. + +
+
+ comment: Accepted by TCSVT +
+
+
+
+
+ + ♻ ☆ RoadBEV: Road Surface Reconstruction in Bird's Eye View + + +
+ Road surface conditions, especially geometry profiles, enormously affect the +driving performance of autonomous vehicles. Vision-based online road +reconstruction promisingly captures road information in advance. Existing +solutions like monocular depth estimation and stereo matching suffer from +modest performance. The recent technique of Bird's-Eye-View (BEV) perception +provides immense potential for more reliable and accurate reconstruction. This +paper uniformly proposes two simple yet effective models for road elevation +reconstruction in BEV named RoadBEV-mono and RoadBEV-stereo, which estimate +road elevation with monocular and stereo images, respectively. The former +directly fits elevation values based on voxel features queried from image view, +while the latter efficiently recognizes road elevation patterns based on a BEV +volume representing the discrepancy between left and right voxel features. +Insightful analyses reveal their consistency with, and differences from, the perspective +view. Experiments on a real-world dataset verify the models' effectiveness and +superiority. Elevation errors of RoadBEV-mono and RoadBEV-stereo achieve 1.83cm +and 0.50cm, respectively. The estimation performance improves by 50\% in BEV +based on monocular images. Our models are promising for practical applications, +providing valuable references for vision-based BEV perception in autonomous +driving. The code is released at https://github.com/ztsrxh/RoadBEV. + +
+
+ comment: Dataset page: https://thu-rsxd.com/rsrd Code: + https://github.com/ztsrxh/RoadBEV +
+
+
+
+
+ + ♻ ☆ Revisiting Few-Shot Object Detection with Vision-Language Models + + +
+ Few-shot object detection (FSOD) benchmarks have advanced techniques for +detecting new categories with limited annotations. Existing benchmarks +repurpose well-established datasets like COCO by partitioning categories into +base and novel classes for pre-training and fine-tuning respectively. However, +these benchmarks do not reflect how FSOD is deployed in practice. Rather than +only pre-training on a small number of base categories, we argue that it is +more practical to fine-tune a foundation model (e.g., a vision-language model +(VLM) pre-trained on web-scale data) for a target domain. Surprisingly, we find +that zero-shot inference from VLMs like GroundingDINO significantly outperforms +the state-of-the-art (48.3 vs. 33.1 AP) on COCO. However, such zero-shot models +can still be misaligned to target concepts of interest. For example, trailers +on the web may be different from trailers in the context of autonomous +vehicles. In this work, we propose Foundational FSOD, a new benchmark protocol +that evaluates detectors pre-trained on any external datasets and fine-tuned on +K-shots per target class. Further, we note that current FSOD benchmarks are +actually federated datasets containing exhaustive annotations for each category +on a subset of the data. We leverage this insight to propose simple strategies +for fine-tuning VLMs with federated losses. We demonstrate the effectiveness of +our approach on LVIS and nuImages, improving over prior work by 5.9 AP. Our +code is available at https://github.com/anishmadan23/foundational_fsod + +
+
+
+
+
+ + ♻ ☆ Unsupervised Video Domain Adaptation with Masked Pre-Training and + Collaborative Self-Training CVPR 2024 + + +
+ In this work, we tackle the problem of unsupervised domain adaptation (UDA) +for video action recognition. Our approach, which we call UNITE, uses an image +teacher model to adapt a video student model to the target domain. UNITE first +employs self-supervised pre-training to promote discriminative feature learning +on target domain videos using a teacher-guided masked distillation objective. +We then perform self-training on masked target data, using the video student +model and image teacher model together to generate improved pseudolabels for +unlabeled target videos. Our self-training process successfully leverages the +strengths of both models to achieve strong transfer performance across domains. +We evaluate our approach on multiple video domain adaptation benchmarks and +observe significant improvements upon previously reported results. + +
+
+ comment: Accepted at CVPR 2024. 13 pages, 4 figures. Approved for public + release: distribution unlimited +
+
+
+
+
+ + ♻ ☆ Pixel to Elevation: Learning to Predict Elevation Maps at Long Range + using Images for Autonomous Offroad Navigation + + +
+ Understanding terrain topology at long-range is crucial for the success of +off-road robotic missions, especially when navigating at high-speeds. LiDAR +sensors, which are currently heavily relied upon for geometric mapping, provide +sparse measurements when mapping at greater distances. To address this +challenge, we present a novel learning-based approach capable of predicting +terrain elevation maps at long-range using only onboard egocentric images in +real-time. Our proposed method is comprised of three main elements. First, a +transformer-based encoder is introduced that learns cross-view associations +between the egocentric views and prior bird-eye-view elevation map predictions. +Second, an orientation-aware positional encoding is proposed to incorporate the +3D vehicle pose information over complex unstructured terrain with multi-view +visual image features. Lastly, a history-augmented learn-able map embedding is +proposed to achieve better temporal consistency between elevation map +predictions to facilitate the downstream navigational tasks. We experimentally +validate the applicability of our proposed approach for autonomous offroad +robotic navigation in complex and unstructured terrain using real-world offroad +driving data. Furthermore, the method is qualitatively and quantitatively +compared against the current state-of-the-art methods. Extensive field +experiments demonstrate that our method surpasses baseline models in accurately +predicting terrain elevation while effectively capturing the overall terrain +topology at long-ranges. Finally, ablation studies are conducted to highlight +and understand the effect of key components of the proposed approach and +validate their suitability to improve offroad robotic navigation capabilities. + +
+
+ comment: 8 pages, 6 figures, Accepted in IEEE Robotics and Automation Letters + (RA-L) +
+
+
+
+
+ + ♻ ☆ Towards Two-Stream Foveation-based Active Vision Learning + + +
+ Deep neural network (DNN) based machine perception frameworks process the +entire input in a one-shot manner to provide answers to both "what object is +being observed" and "where it is located". In contrast, the "two-stream +hypothesis" from neuroscience explains the neural processing in the human +visual cortex as an active vision system that utilizes two separate regions of +the brain to answer the what and the where questions. In this work, we propose +a machine learning framework inspired by the "two-stream hypothesis" and +explore the potential benefits that it offers. Specifically, the proposed +framework models the following mechanisms: 1) ventral (what) stream focusing on +the input regions perceived by the fovea part of an eye (foveation), 2) dorsal +(where) stream providing visual guidance, and 3) iterative processing of the +two streams to calibrate visual focus and process the sequence of focused image +patches. The training of the proposed framework is accomplished by label-based +DNN training for the ventral stream model and reinforcement learning for the +dorsal stream model. We show that the two-stream foveation-based learning is +applicable to the challenging task of weakly-supervised object localization +(WSOL), where the training data is limited to the object class or its +attributes. The framework is capable of both predicting the properties of an +object and successfully localizing it by predicting its bounding box. We also +show that, due to the independent nature of the two streams, the dorsal model +can be applied on its own to unseen images to localize objects from different +datasets. + +
+
+ comment: Accepted version of the article, 18 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ End-to-End Temporal Action Detection with 1B Parameters Across 1000 + Frames CVPR 2024 + + +
+ Recently, temporal action detection (TAD) has seen significant performance +improvement with end-to-end training. However, due to the memory bottleneck, +only models with limited scales and limited data volumes can afford end-to-end +training, which inevitably restricts TAD performance. In this paper, we reduce +the memory consumption for end-to-end training, and manage to scale up the TAD +backbone to 1 billion parameters and the input video to 1,536 frames, leading +to significant detection performance. The key to our approach lies in our +proposed temporal-informative adapter (TIA), which is a novel lightweight +module that reduces training memory. Using TIA, we free the humongous backbone +from learning to adapt to the TAD task by only updating the parameters in TIA. +TIA also leads to better TAD representation by temporally aggregating context +from adjacent frames throughout the backbone. We evaluate our model across four +representative datasets. Owing to our efficient design, we are able to train +end-to-end on VideoMAEv2-giant and achieve 75.4% mAP on THUMOS14, being the +first end-to-end model to outperform the best feature-based methods. Code is +available at https://github.com/sming256/AdaTAD. + +
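+ A minimal sketch of a lightweight temporal adapter of the kind described above:
+ a bottleneck with a depthwise temporal convolution, added residually so that
+ only the adapter parameters are trained while the backbone block stays frozen.
+ Dimensions, kernel size and placement are illustrative, not the paper's exact TIA.
+
+   import torch
+   import torch.nn as nn
+
+   class TemporalAdapter(nn.Module):
+       """Bottleneck adapter that aggregates context from adjacent frames."""
+       def __init__(self, dim=768, bottleneck=64, kernel=3):
+           super().__init__()
+           self.down = nn.Linear(dim, bottleneck)
+           self.temporal = nn.Conv1d(bottleneck, bottleneck, kernel,
+                                     padding=kernel // 2, groups=bottleneck)  # depthwise over time
+           self.up = nn.Linear(bottleneck, dim)
+
+       def forward(self, x):               # x: (B, T, dim) frame-level features
+           h = self.down(x)
+           h = self.temporal(h.transpose(1, 2)).transpose(1, 2)
+           return x + self.up(torch.relu(h))   # residual: backbone features pass through unchanged
+
+   feats = TemporalAdapter()(torch.randn(2, 16, 768))   # same shape as the input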
+
+ comment: Accepted to CVPR 2024. Camera-Ready Version +
+
+
+
+
+ + ♻ ☆ Learning SO(3)-Invariant Semantic Correspondence via Local Shape + Transform CVPR 2024 + + +
+ Establishing accurate 3D correspondences between shapes stands as a pivotal +challenge with profound implications for computer vision and robotics. However, +existing self-supervised methods for this problem assume perfect input shape +alignment, restricting their real-world applicability. In this work, we +introduce a novel self-supervised Rotation-Invariant 3D correspondence learner +with Local Shape Transform, dubbed RIST, that learns to establish dense +correspondences between shapes even under challenging intra-class variations +and arbitrary orientations. Specifically, RIST learns to dynamically formulate +an SO(3)-invariant local shape transform for each point, which maps the +SO(3)-equivariant global shape descriptor of the input shape to a local shape +descriptor. These local shape descriptors are provided as inputs to our decoder +to facilitate point cloud self- and cross-reconstruction. Our proposed +self-supervised training pipeline encourages semantically corresponding points +from different shapes to be mapped to similar local shape descriptors, enabling +RIST to establish dense point-wise correspondences. RIST demonstrates +state-of-the-art performances on 3D part label transfer and semantic keypoint +transfer given arbitrarily rotated point cloud pairs, outperforming existing +methods by significant margins. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ FusionMamba: Dynamic Feature Enhancement for Multimodal Image Fusion + with Mamba + + +
+ Multi-modal image fusion aims to combine information from different modes to +create a single image with comprehensive information and detailed textures. +However, fusion models based on convolutional neural networks encounter +limitations in capturing global image features due to their focus on local +convolution operations. Transformer-based models, while excelling in global +feature modeling, confront computational challenges stemming from their +quadratic complexity. Recently, the Selective Structured State Space Model has +exhibited significant potential for long-range dependency modeling with linear +complexity, offering a promising avenue to address the aforementioned dilemma. +In this paper, we propose FusionMamba, a novel dynamic feature enhancement +method for multimodal image fusion with Mamba. Specifically, we devise an +improved efficient Mamba model for image fusion, integrating an efficient visual +state space model with dynamic convolution and channel attention. This refined +model not only upholds Mamba's performance and global modeling capability +but also diminishes channel redundancy while strengthening local feature +enhancement. Additionally, we devise a dynamic feature fusion module (DFFM) +comprising two dynamic feature enhancement modules (DFEM) and a cross modality +fusion mamba module (CMFM). The former serves for dynamic texture enhancement +and dynamic difference perception, whereas the latter enhances correlation +features between modes and suppresses redundant intermodal information. +FusionMamba has yielded state-of-the-art (SOTA) performance across various +multimodal medical image fusion tasks (CT-MRI, PET-MRI, SPECT-MRI), the infrared +and visible image fusion task (IR-VIS) and a multimodal biomedical image fusion +dataset (GFP-PC), which proves that our model has strong generalization ability. +The code for FusionMamba is available at +https://github.com/millieXie/FusionMamba. + +
+
+
+
+
+ + ♻ ☆ Joint Multimodal Transformer for Emotion Recognition in the Wild CVPR + + +
+ Multimodal emotion recognition (MMER) systems typically outperform unimodal +systems by leveraging the inter- and intra-modal relationships between, e.g., +visual, textual, physiological, and auditory modalities. This paper proposes an +MMER method that relies on a joint multimodal transformer (JMT) for fusion with +key-based cross-attention. This framework can exploit the complementary nature +of diverse modalities to improve predictive accuracy. Separate backbones +capture intra-modal spatiotemporal dependencies within each modality over video +sequences. Subsequently, our JMT fusion architecture integrates the individual +modality embeddings, allowing the model to effectively capture inter- and +intra-modal relationships. Extensive experiments on two challenging expression +recognition tasks -- (1) dimensional emotion recognition on the Affwild2 +dataset (with face and voice) and (2) pain estimation on the Biovid dataset +(with face and biosensors) -- indicate that our JMT fusion can provide a +cost-effective solution for MMER. Empirical results show that MMER systems with +our proposed fusion allow us to outperform relevant baseline and +state-of-the-art methods. + +
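+ The key-based cross-attention fusion described above can be illustrated with a minimal PyTorch sketch. This is an illustrative reading of the abstract, not the authors' implementation; the module name, dimensions, and mean-pooling step are assumptions.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class CrossModalFusion(nn.Module):
+     """Toy two-modality fusion: each modality queries the other via cross-attention."""
+     def __init__(self, dim=256, heads=4):
+         super().__init__()
+         self.attn_a2b = nn.MultiheadAttention(dim, heads, batch_first=True)
+         self.attn_b2a = nn.MultiheadAttention(dim, heads, batch_first=True)
+         self.proj = nn.Linear(2 * dim, dim)
+
+     def forward(self, feat_a, feat_b):
+         # feat_a, feat_b: (batch, seq_len, dim) token sequences from two modality backbones
+         a2b, _ = self.attn_a2b(query=feat_a, key=feat_b, value=feat_b)
+         b2a, _ = self.attn_b2a(query=feat_b, key=feat_a, value=feat_a)
+         fused = torch.cat([a2b.mean(1), b2a.mean(1)], dim=-1)
+         return self.proj(fused)  # joint embedding for the downstream prediction head
+
+ fusion = CrossModalFusion()
+ video_tokens, audio_tokens = torch.randn(2, 16, 256), torch.randn(2, 16, 256)
+ print(fusion(video_tokens, audio_tokens).shape)  # torch.Size([2, 256])
+ ```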
+
+ comment: 10 pages, 4 figures, 6 tables, CVPRw 2024 +
+
+
+
+
+ + ♻ ☆ Allowing humans to interactively guide machines where to look does not + always improve human-AI team's classification accuracy CVPR + 2024 + + +
+ Across thousands of papers in Explainable AI (XAI), attention maps \cite{vaswani2017attention} and feature importance maps \cite{bansal2020sam} have been established as a common means of showing how important each input feature is to an AI's decisions. It is an interesting, unexplored question whether allowing users to edit the feature importance at test time would improve a human-AI team's accuracy on downstream tasks. In this paper, we address this question by leveraging CHM-Corr, a state-of-the-art, ante-hoc explainable classifier \cite{taesiri2022visual} that first predicts patch-wise correspondences between the input and training-set images, and then bases its classification decisions on them. We build CHM-Corr++, an interactive interface for CHM-Corr, enabling users to edit the feature importance map provided by CHM-Corr and observe updated model decisions. Via CHM-Corr++, users can gain insights into whether, when, and how the model changes its outputs, improving their understanding beyond static explanations. However, our study with 18 expert users who performed 1,400 decisions finds no statistically significant evidence that our interactive approach improves user accuracy on CUB-200 bird image classification over static explanations. This challenges the hypothesis that interactivity can boost human-AI team accuracy and raises the need for future research. We open-source CHM-Corr++, an interactive tool for editing image classifier attention (see an interactive demo here: http://137.184.82.109:7080/). We release code and data on GitHub: https://github.com/anguyen8/chm-corr-interactive.
+
+ comment: Accepted for presentation at the XAI4CV Workshop, part of the CVPR + 2024 proceedings +
+
+
+
+
+ + ♻ ☆ CT Liver Segmentation via PVT-based Encoding and Refined Decoding + + +
+ Accurate liver segmentation from CT scans is essential for effective diagnosis and treatment planning. Computer-aided diagnosis systems promise to improve the precision of liver disease diagnosis, disease progression assessment, and treatment planning. In response to this need, we propose a novel deep learning approach, \textit{\textbf{PVTFormer}}, that is built upon a pretrained pyramid vision transformer (PVT v2) combined with advanced residual upsampling and decoder blocks. By integrating a refined feature channel approach with a hierarchical decoding strategy, PVTFormer generates high-quality segmentation masks by enhancing semantic features. Rigorous evaluation of the proposed method on the Liver Tumor Segmentation Benchmark (LiTS) 2017 demonstrates that our proposed architecture not only achieves a high Dice coefficient of 86.78\% and an mIoU of 78.46\%, but also obtains a low HD of 3.50. The results underscore PVTFormer's efficacy in setting a new benchmark for state-of-the-art liver segmentation methods. The source code of the proposed PVTFormer is available at \url{https://github.com/DebeshJha/PVTFormer}.
+
+
+
+
+ + ♻ ☆ Action-slot: Visual Action-centric Representations for Multi-label + Atomic Activity Recognition in Traffic Scenes + + +
+ In this paper, we study multi-label atomic activity recognition. Despite the +notable progress in action recognition, it is still challenging to recognize +atomic activities due to a deficiency in a holistic understanding of both +multiple road users' motions and their contextual information. In this paper, +we introduce Action-slot, a slot attention-based approach that learns visual +action-centric representations, capturing both motion and contextual +information. Our key idea is to design action slots that are capable of paying +attention to regions where atomic activities occur, without the need for +explicit perception guidance. To further enhance slot attention, we introduce a +background slot that competes with action slots, aiding the training process in +avoiding unnecessary focus on background regions devoid of activities. Yet, the +imbalanced class distribution in the existing dataset hampers the assessment of +rare activities. To address the limitation, we collect a synthetic dataset +called TACO, which is four times larger than OATS and features a balanced +distribution of atomic activities. To validate the effectiveness of our method, +we conduct comprehensive experiments and ablation studies against various +action recognition baselines. We also show that the performance of multi-label +atomic activity recognition on real-world datasets can be improved by +pretraining representations on TACO. We will release our source code and +dataset. See the videos of visualization on the project page: +https://hcis-lab.github.io/Action-slot/ + +
+
+
+
+
+ + ♻ ☆ TransNeXt: Robust Foveal Visual Perception for Vision Transformers CVPR 2024 + + +
+ Due to the depth degradation effect in residual connections, many efficient Vision Transformer models that rely on stacking layers for information exchange often fail to form sufficient information mixing, leading to unnatural visual perception. To address this issue, in this paper, we propose Aggregated Attention, a biomimetic design-based token mixer that simulates biological foveal vision and continuous eye movement while enabling each token on the feature map to have a global perception. Furthermore, we incorporate learnable tokens that interact with conventional queries and keys, which further diversifies the generation of affinity matrices beyond merely relying on the similarity between queries and keys. Our approach does not rely on stacking for information exchange, thus effectively avoiding depth degradation and achieving natural visual perception. Additionally, we propose Convolutional GLU, a channel mixer that bridges the gap between the GLU and SE mechanisms, which empowers each token to have channel attention based on its nearest neighbor image features, enhancing local modeling capability and model robustness. We combine aggregated attention and convolutional GLU to create a new visual backbone called TransNeXt. Extensive experiments demonstrate that our TransNeXt achieves state-of-the-art performance across multiple model sizes. At a resolution of $224^2$, TransNeXt-Tiny attains an ImageNet accuracy of 84.0%, surpassing ConvNeXt-B with 69% fewer parameters. Our TransNeXt-Base achieves an ImageNet accuracy of 86.2% and an ImageNet-A accuracy of 61.6% at a resolution of $384^2$, a COCO object detection mAP of 57.1, and an ADE20K semantic segmentation mIoU of 54.7.
+
+ comment: CVPR 2024 Camera-ready Version. Project Page: + https://github.com/DaiShiResearch/TransNeXt +
+
+
+
+
+ + ♻ ☆ CREST: Cross-modal Resonance through Evidential Deep Learning for + Enhanced Zero-Shot Learning + + +
+ Zero-shot learning (ZSL) enables the recognition of novel classes by +leveraging semantic knowledge transfer from known to unknown categories. This +knowledge, typically encapsulated in attribute descriptions, aids in +identifying class-specific visual features, thus facilitating visual-semantic +alignment and improving ZSL performance. However, real-world challenges such as +distribution imbalances and attribute co-occurrence among instances often +hinder the discernment of local variances in images, a problem exacerbated by +the scarcity of fine-grained, region-specific attribute annotations. Moreover, +the variability in visual presentation within categories can also skew +attribute-category associations. In response, we propose a bidirectional +cross-modal ZSL approach CREST. It begins by extracting representations for +attribute and visual localization and employs Evidential Deep Learning (EDL) to +measure underlying epistemic uncertainty, thereby enhancing the model's +resilience against hard negatives. CREST incorporates dual learning pathways, +focusing on both visual-category and attribute-category alignments, to ensure +robust correlation between latent and observable spaces. Moreover, we introduce +an uncertainty-informed cross-modal fusion technique to refine visual-attribute +inference. Extensive experiments demonstrate our model's effectiveness and +unique explainability across multiple datasets. Our code and data are available +at: https://github.com/JethroJames/CREST + +
+
+ comment: Ongoing work; 10 pages, 2 Tables, 9 Figures; Repo is available at: + https://github.com/JethroJames/CREST +
+
+
+
+
+ + ♻ ☆ OpenPack: A Large-scale Dataset for Recognizing Packaging Works in + IoT-enabled Logistic Environments + + +
+ Unlike human daily activities, existing publicly available sensor datasets +for work activity recognition in industrial domains are limited by difficulties +in collecting realistic data as close collaboration with industrial sites is +required. This also limits research on and development of methods for +industrial applications. To address these challenges and contribute to research +on machine recognition of work activities in industrial domains, in this study, +we introduce a new large-scale dataset for packaging work recognition called +OpenPack. OpenPack contains 53.8 hours of multimodal sensor data, including +acceleration data, keypoints, depth images, and readings from IoT-enabled +devices (e.g., handheld barcode scanners), collected from 16 distinct subjects +with different levels of packaging work experience. We apply state-of-the-art +human activity recognition techniques to the dataset and provide future +directions of complex work activity recognition studies in the pervasive +computing community based on the results. We believe that OpenPack will +contribute to the sensor-based action/activity recognition community by +providing challenging tasks. The OpenPack dataset is available at +https://open-pack.github.io. + +
+
+
+
+
+ + ♻ ☆ Diffusion$^2$: Dynamic 3D Content Generation via Score Composition of + Orthogonal Diffusion Models + + +
+ Recent advancements in 3D generation are predominantly propelled by +improvements in 3D-aware image diffusion models which are pretrained on +Internet-scale image data and fine-tuned on massive 3D data, offering the +capability of producing highly consistent multi-view images. However, due to +the scarcity of synchronized multi-view video data, it is impractical to adapt +this paradigm to 4D generation directly. Despite that, the available video and +3D data are adequate for training video and multi-view diffusion models that +can provide satisfactory dynamic and geometric priors respectively. In this +paper, we present Diffusion$^2$, a novel framework for dynamic 3D content +creation that leverages the knowledge about geometric consistency and temporal +smoothness from these models to directly sample dense multi-view and +multi-frame images which can be employed to optimize continuous 4D +representation. Specifically, we design a simple yet effective denoising +strategy via score composition of video and multi-view diffusion models based +on the probability structure of the images to be generated. Owing to the high +parallelism of the image generation and the efficiency of the modern 4D +reconstruction pipeline, our framework can generate 4D content within few +minutes. Furthermore, our method circumvents the reliance on 4D data, thereby +having the potential to benefit from the scalability of the foundation video +and multi-view diffusion models. Extensive experiments demonstrate the efficacy +of our proposed framework and its capability to flexibly adapt to various types +of prompts. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ Unraveling Instance Associations: A Closer Look for Audio-Visual + Segmentation + + +
+ Audio-visual segmentation (AVS) is a challenging task that involves +accurately segmenting sounding objects based on audio-visual cues. The +effectiveness of audio-visual learning critically depends on achieving accurate +cross-modal alignment between sound and visual objects. Successful audio-visual +learning requires two essential components: 1) a challenging dataset with +high-quality pixel-level multi-class annotated images associated with audio +files, and 2) a model that can establish strong links between audio information +and its corresponding visual object. However, these requirements are only +partially addressed by current methods, with training sets containing biased +audio-visual data, and models that generalise poorly beyond this biased +training set. In this work, we propose a new cost-effective strategy to build +challenging and relatively unbiased high-quality audio-visual segmentation +benchmarks. We also propose a new informative sample mining method for +audio-visual supervised contrastive learning to leverage discriminative +contrastive samples to enforce cross-modal understanding. We show empirical +results that demonstrate the effectiveness of our benchmark. Furthermore, +experiments conducted on existing AVS datasets and on our new benchmark show +that our method achieves state-of-the-art (SOTA) segmentation accuracy. + +
+
+ comment: Code is available at https://github.com/cyh-0/CAVP +
+
+
+
+
+ + ♻ ☆ D$^2$ST-Adapter: Disentangled-and-Deformable Spatio-Temporal Adapter for + Few-shot Action Recognition + + +
+ Adapting large pre-trained image models to few-shot action recognition has +proven to be an effective and efficient strategy for learning robust feature +extractors, which is essential for few-shot learning. Typical fine-tuning based +adaptation paradigm is prone to overfitting in the few-shot learning scenarios +and offers little modeling flexibility for learning temporal features in video +data. In this work we present the Disentangled-and-Deformable Spatio-Temporal +Adapter (D$^2$ST-Adapter), which is a novel adapter tuning framework +well-suited for few-shot action recognition due to lightweight design and low +parameter-learning overhead. It is designed in a dual-pathway architecture to +encode spatial and temporal features in a disentangled manner. In particular, +we devise the anisotropic Deformable Spatio-Temporal Attention module as the +core component of D$^2$ST-Adapter, which can be tailored with anisotropic +sampling densities along spatial and temporal domains to learn spatial and +temporal features specifically in corresponding pathways, allowing our +D$^2$ST-Adapter to encode features in a global view in 3D spatio-temporal space +while maintaining a lightweight design. Extensive experiments with +instantiations of our method on both pre-trained ResNet and ViT demonstrate the +superiority of our method over state-of-the-art methods for few-shot action +recognition. Our method is particularly well-suited to challenging scenarios +where temporal dynamics are critical for action recognition. + +
+
+
+
+
+ + ♻ ☆ Delocate: Detection and Localization for Deepfake Videos with + Randomly-Located Tampered Traces + + +
+ Deepfake videos are becoming increasingly realistic, showing subtle tampering traces on facial areas that vary between frames. Consequently, many existing Deepfake detection methods struggle to detect unknown domain Deepfake videos while accurately locating the tampered region. To address this limitation, we propose Delocate, a novel Deepfake detection model that can both recognize and localize unknown domain Deepfake videos. Our method consists of two stages named recovering and localization. In the recovering stage, the model randomly masks regions of interest (ROIs) and reconstructs real faces without tampering traces, resulting in a relatively good recovery effect for real faces and a poor recovery effect for fake faces. In the localization stage, the output of the recovery phase and the forgery ground truth mask serve as supervision to guide the forgery localization process. This process strategically emphasizes the recovery phase of fake faces with poor recovery, facilitating the localization of tampered regions. Our extensive experiments on four widely used benchmark datasets demonstrate that Delocate not only excels in localizing tampered areas but also enhances cross-domain detection performance.
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2308.09921, + arXiv:2305.05943 +
+
+
+
+
+ + ♻ ☆ CoT3DRef: Chain-of-Thoughts Data-Efficient 3D Visual Grounding ICLR 2024 + + +
+ 3D visual grounding is the ability to localize objects in 3D scenes conditioned on utterances. Most existing methods devote the referring head to localizing the referred object directly, causing failure in complex scenarios. In addition, such designs do not illustrate how and why the network reaches its final decision. In this paper, we address the question: Can we design an interpretable 3D visual grounding framework that has the potential to mimic the human perception system? To this end, we formulate the 3D visual grounding problem as a sequence-to-sequence (Seq2Seq) task by first predicting a chain of anchors and then the final target. Interpretability not only improves the overall performance but also helps us identify failure cases. Following the chain-of-thoughts approach enables us to decompose the referring task into interpretable intermediate steps, boosting the performance and making our framework extremely data-efficient. Moreover, our proposed framework can be easily integrated into any existing architecture. We validate our approach through comprehensive experiments on the Nr3D, Sr3D, and ScanRefer benchmarks and show consistent performance gains compared to existing methods without requiring manually annotated data. Furthermore, our proposed framework, dubbed CoT3DRef, is significantly data-efficient: on the Sr3D dataset, when trained on only 10% of the data, we match the SOTA performance of models trained on the entire dataset. The code is available at https://eslambakr.github.io/cot3dref.github.io/.
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ MultiCorrupt: A Multi-Modal Robustness Dataset and Benchmark of + LiDAR-Camera Fusion for 3D Object Detection + + +
+ Multi-modal 3D object detection models for automated driving have +demonstrated exceptional performance on computer vision benchmarks like +nuScenes. However, their reliance on densely sampled LiDAR point clouds and +meticulously calibrated sensor arrays poses challenges for real-world +applications. Issues such as sensor misalignment, miscalibration, and disparate +sampling frequencies lead to spatial and temporal misalignment in data from +LiDAR and cameras. Additionally, the integrity of LiDAR and camera data is +often compromised by adverse environmental conditions such as inclement +weather, leading to occlusions and noise interference. To address this +challenge, we introduce MultiCorrupt, a comprehensive benchmark designed to +evaluate the robustness of multi-modal 3D object detectors against ten distinct +types of corruptions. We evaluate five state-of-the-art multi-modal detectors +on MultiCorrupt and analyze their performance in terms of their resistance +ability. Our results show that existing methods exhibit varying degrees of +robustness depending on the type of corruption and their fusion strategy. We +provide insights into which multi-modal design choices make such models robust +against certain perturbations. The dataset generation code and benchmark are +open-sourced at https://github.com/ika-rwth-aachen/MultiCorrupt. + +
+
+ comment: Code: https://github.com/ika-rwth-aachen/MultiCorrupt +
+
+
+
+
+ + ♻ ☆ Improving 2D Human Pose Estimation in Rare Camera Views with Synthetic + Data + + +
+ Methods and datasets for human pose estimation focus predominantly on side- +and front-view scenarios. We overcome the limitation by leveraging synthetic +data and introduce RePoGen (RarE POses GENerator), an SMPL-based method for +generating synthetic humans with comprehensive control over pose and view. +Experiments on top-view datasets and a new dataset of real images with diverse +poses show that adding the RePoGen data to the COCO dataset outperforms +previous approaches to top- and bottom-view pose estimation without harming +performance on common views. An ablation study shows that anatomical +plausibility, a property prior research focused on, is not a prerequisite for +effective performance. The introduced dataset and the corresponding code are +available on https://mirapurkrabek.github.io/RePoGen-paper/ . + +
+
+ comment: https://mirapurkrabek.github.io/RePoGen-paper/ +
+
+
+
+
+ + ♻ ☆ Deepfake Generation and Detection: A Benchmark and Survey + + +
+ Deepfake is a technology dedicated to creating highly realistic facial images and videos under specific conditions, which has significant application potential in fields such as entertainment, movie production, and digital human creation, to name a few. With the advancements in deep learning, techniques primarily represented by Variational Autoencoders and Generative Adversarial Networks have achieved impressive generation results. More recently, the emergence of diffusion models with powerful generation capabilities has sparked a renewed wave of research. In addition to deepfake generation, corresponding detection technologies continuously evolve to regulate the potential misuse of deepfakes, such as for privacy invasion and phishing attacks. This survey comprehensively reviews the latest developments in deepfake generation and detection, summarizing and analyzing the current state of the art in this rapidly evolving field. We first unify task definitions, comprehensively introduce datasets and metrics, and discuss developing technologies. Then, we discuss the development of several related sub-fields and focus on four representative deepfake fields: face swapping, face reenactment, talking face generation, and facial attribute editing, as well as forgery detection. Subsequently, we comprehensively benchmark representative methods on popular datasets for each field, fully evaluating the latest and influential published works. Finally, we analyze the challenges and future research directions of the discussed fields.
+
+ comment: We closely follow the latest developments in + https://github.com/flyingby/Awesome-Deepfake-Generation-and-Detection +
+
+
+
+
+ + ♻ ☆ Task-Aware Encoder Control for Deep Video Compression CVPR 2024 + + +
+ Prior research on deep video compression (DVC) for machine tasks typically necessitates training a unique codec for each specific task, mandating a dedicated decoder per task. In contrast, traditional video codecs employ a flexible encoder controller, enabling the adaptation of a single codec to different tasks through mechanisms like mode prediction. Drawing inspiration from this, we introduce an innovative encoder controller for deep video compression for machines. This controller features a mode prediction module and a Group of Pictures (GoP) selection module. Our approach centralizes control at the encoding stage, allowing for adaptable encoder adjustments across different tasks, such as detection and tracking, while maintaining compatibility with a standard pre-trained DVC decoder. Empirical evidence demonstrates that our method is applicable across multiple tasks with various existing pre-trained DVCs. Moreover, extensive experiments demonstrate that our method outperforms previous DVC approaches by about 25% in bitrate across different tasks, with only one pre-trained decoder.
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ A Closer Look at Spatial-Slice Features Learning for COVID-19 Detection CVPR2024 + + +
+ Conventional Computed Tomography (CT) imaging recognition faces two significant challenges: (1) There is often considerable variability in the resolution and size of each CT scan, necessitating strict requirements for the input size and adaptability of models. (2) CT scans contain a large number of out-of-distribution (OOD) slices. The crucial features may only be present in specific spatial regions and slices of the entire CT scan. How can we effectively identify where these are located? To deal with this, we introduce an enhanced Spatial-Slice Feature Learning (SSFL++) framework specifically designed for CT scans. It aims to filter out OOD data within the whole CT scan, enabling us to select the crucial spatial slices for analysis and reducing overall redundancy by 70%. Meanwhile, we propose a Kernel-Density-based slice Sampling (KDS) method to improve stability during training and inference, thereby speeding up convergence and boosting performance. As a result, the experiments demonstrate the promising performance of our model using a simple EfficientNet-2D (E2D) model, even with only 1% of the training data. The efficacy of our approach has been validated on the COVID-19-CT-DB datasets provided by the DEF-AI-MIA workshop, in conjunction with CVPR 2024. Our source code is available at https://github.com/ming053l/E2D
+
+ comment: Camera-ready version, accepted by the DEF-AI-MIA workshop, in conjunction with CVPR2024
+
+
+
+
+ + ♻ ☆ SPIRiT-Diffusion: Self-Consistency Driven Diffusion Model for + Accelerated MRI + + +
+ Diffusion models have emerged as a leading methodology for image generation +and have proven successful in the realm of magnetic resonance imaging (MRI) +reconstruction. However, existing reconstruction methods based on diffusion +models are primarily formulated in the image domain, making the reconstruction +quality susceptible to inaccuracies in coil sensitivity maps (CSMs). k-space +interpolation methods can effectively address this issue but conventional +diffusion models are not readily applicable in k-space interpolation. To +overcome this challenge, we introduce a novel approach called SPIRiT-Diffusion, +which is a diffusion model for k-space interpolation inspired by the iterative +self-consistent SPIRiT method. Specifically, we utilize the iterative solver of +the self-consistent term (i.e., k-space physical prior) in SPIRiT to formulate +a novel stochastic differential equation (SDE) governing the diffusion process. +Subsequently, k-space data can be interpolated by executing the diffusion +process. This innovative approach highlights the optimization model's role in +designing the SDE in diffusion models, enabling the diffusion process to align +closely with the physics inherent in the optimization model, a concept referred +to as model-driven diffusion. We evaluated the proposed SPIRiT-Diffusion method +using a 3D joint intracranial and carotid vessel wall imaging dataset. The +results convincingly demonstrate its superiority over image-domain +reconstruction methods, achieving high reconstruction quality even at a +substantial acceleration rate of 10. + +
+
+
+
+
+ + ♻ ☆ A Single Simple Patch is All You Need for AI-generated Image Detection + + +
+ The recent development of generative models unleashes the potential of generating hyper-realistic fake images. To prevent the malicious usage of fake images, AI-generated image detection aims to distinguish fake images from real images. However, existing methods suffer from a severe performance drop when detecting images generated by unseen generators. We find that generative models tend to focus on generating patches with rich textures to make the images more realistic, while neglecting the hidden noise that camera capture leaves in simple patches. In this paper, we propose to exploit the noise pattern of a single simple patch to identify fake images. Furthermore, due to the performance decline when handling low-quality generated images, we introduce an enhancement module and a perception module to remove the interfering information. Extensive experiments demonstrate that our method can achieve state-of-the-art performance on public benchmarks.
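+ A rough sketch of the core idea (pick the least-textured patch, then keep only its high-frequency noise residual). This is an interpretation of the abstract under assumed choices (variance as the texture measure, a Gaussian high-pass residual), not the released pipeline.
+
+ ```python
+ import numpy as np
+ from scipy import ndimage
+
+ def simplest_patch(img_gray, patch=32):
+     """Return the non-overlapping patch with the least texture (lowest variance)."""
+     h, w = img_gray.shape
+     best, best_var = None, np.inf
+     for y in range(0, h - patch + 1, patch):
+         for x in range(0, w - patch + 1, patch):
+             p = img_gray[y:y + patch, x:x + patch]
+             if p.var() < best_var:
+                 best, best_var = p, p.var()
+     return best
+
+ def noise_residual(p):
+     """High-pass residual: patch minus a smoothed copy, exposing capture/generator noise."""
+     return p - ndimage.gaussian_filter(p, sigma=1.0)
+
+ img = np.random.rand(256, 256)                 # stand-in for a grayscale image
+ feat = noise_residual(simplest_patch(img))     # fed to a small binary real/fake classifier
+ print(feat.shape)                              # (32, 32)
+ ```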
+
+
+
+
+ + ♻ ☆ ProTA: Probabilistic Token Aggregation for Text-Video Retrieval + + +
+ Text-video retrieval aims to find the most relevant cross-modal samples for a +given query. Recent methods focus on modeling the whole spatial-temporal +relations. However, since video clips contain more diverse content than +captions, the model aligning these asymmetric video-text pairs has a high risk +of retrieving many false positive results. In this paper, we propose +Probabilistic Token Aggregation (ProTA) to handle cross-modal interaction with +content asymmetry. Specifically, we propose dual partial-related aggregation to +disentangle and re-aggregate token representations in both low-dimension and +high-dimension spaces. We propose token-based probabilistic alignment to +generate token-level probabilistic representation and maintain the feature +representation diversity. In addition, an adaptive contrastive loss is proposed +to learn compact cross-modal distribution space. Based on extensive +experiments, ProTA achieves significant improvements on MSR-VTT (50.9%), LSMDC +(25.8%), and DiDeMo (47.2%). + +
+
+
+
+
+ + ♻ ☆ End-to-End Autonomous Driving through V2X Cooperation + + +
+ Cooperatively utilizing both ego-vehicle and infrastructure sensor data via +V2X communication has emerged as a promising approach for advanced autonomous +driving. However, current research mainly focuses on improving individual +modules, rather than taking end-to-end learning to optimize final planning +performance, resulting in underutilized data potential. In this paper, we +introduce UniV2X, a pioneering cooperative autonomous driving framework that +seamlessly integrates all key driving modules across diverse views into a +unified network. We propose a sparse-dense hybrid data transmission and fusion +mechanism for effective vehicle-infrastructure cooperation, offering three +advantages: 1) Effective for simultaneously enhancing agent perception, online +mapping, and occupancy prediction, ultimately improving planning performance. +2) Transmission-friendly for practical and limited communication conditions. 3) +Reliable data fusion with interpretability of this hybrid data. We implement +UniV2X, as well as reproducing several benchmark methods, on the challenging +DAIR-V2X, the real-world cooperative driving dataset. Experimental results +demonstrate the effectiveness of UniV2X in significantly enhancing planning +performance, as well as all intermediate output performance. Code is at +https://github.com/AIR-THU/UniV2X. + +
+
+
+
+
+ + ♻ ☆ Lodge: A Coarse to Fine Diffusion Network for Long Dance Generation + Guided by the Characteristic Dance Primitives CVPR2024 + + +
+ We propose Lodge, a network capable of generating extremely long dance sequences conditioned on given music. We design Lodge as a two-stage coarse-to-fine diffusion architecture, and propose characteristic dance primitives, which possess significant expressiveness, as intermediate representations between the two diffusion models. The first stage is global diffusion, which focuses on comprehending the coarse-level music-dance correlation and producing characteristic dance primitives. In contrast, the second stage is local diffusion, which generates detailed motion sequences in parallel under the guidance of the dance primitives and choreographic rules. In addition, we propose a Foot Refine Block to optimize the contact between the feet and the ground, enhancing the physical realism of the motion. Our approach can generate extremely long dance sequences in parallel, striking a balance between global choreographic patterns and local motion quality and expressiveness. Extensive experiments validate the efficacy of our method.
+
+ comment: Accepted by CVPR2024, Project page: + https://li-ronghui.github.io/lodge +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 150 + +
+
+
+ + ☆ MoVA: Adapting Mixture of Vision Experts to Multimodal Context + + +
+ As the key component in multimodal large language models (MLLMs), the capability of the visual encoder greatly affects an MLLM's understanding of diverse image content. Although some large-scale pretrained vision encoders, such as those in CLIP and DINOv2, have brought promising performance, we find that there is still no single vision encoder that dominates across diverse types of image content; e.g., the CLIP vision encoder leads to outstanding results on general image understanding but poor performance on document or chart content. To alleviate the bias of the CLIP vision encoder, we first delve into the inherent behavior of different pre-trained vision encoders and then propose MoVA, a powerful and novel MLLM that adaptively routes and fuses task-specific vision experts with a coarse-to-fine mechanism. In the coarse-grained stage, we design a context-aware expert routing strategy to dynamically select the most suitable vision experts according to the user instruction, the input image, and the expertise of the vision experts. This benefits from the powerful model function understanding ability of the large language model (LLM) equipped with expert-routing low-rank adaptation (LoRA). In the fine-grained stage, we carefully design the mixture-of-vision-expert adapter (MoV-Adapter) to extract and fuse task-specific knowledge from various experts. This coarse-to-fine paradigm effectively leverages representations from experts based on multimodal context and model expertise, further enhancing the generalization ability. We conduct extensive experiments to evaluate the effectiveness of the proposed approach. Without any bells and whistles, MoVA achieves significant performance gains over current state-of-the-art methods on a wide range of challenging multimodal benchmarks. Codes and models will be available at https://github.com/TempleX98/MoVA.
+
+
+
+
+ + ☆ Unified Scene Representation and Reconstruction for 3D Large Language + Models + + +
+ Enabling Large Language Models (LLMs) to interact with 3D environments is challenging. Existing approaches extract point clouds either from ground truth (GT) geometry or from 3D scenes reconstructed by auxiliary models. Text-image aligned 2D features from CLIP are then lifted to point clouds, which serve as inputs for LLMs. However, this solution lacks the establishment of 3D point-to-point connections, leading to a deficiency of spatial structure information. Concurrently, the absence of integration and unification between the geometric and semantic representations of the scene culminates in a diminished level of 3D scene understanding. In this paper, we demonstrate the importance of having a unified scene representation and reconstruction framework, which is essential for LLMs in 3D scenes. Specifically, we introduce Uni3DR^2, which extracts 3D geometric and semantically aware representation features via frozen pre-trained 2D foundation models (e.g., CLIP and SAM) and a multi-scale aggregate 3D decoder. Our learned 3D representations not only contribute to the reconstruction process but also provide valuable knowledge for LLMs. Experimental results validate that our Uni3DR^2 yields convincing gains over the baseline on the 3D reconstruction dataset ScanNet (increasing F-Score by +1.8\%). When applied to LLMs, our Uni3DR^2-LLM exhibits superior performance over the baseline on the 3D vision-language understanding dataset ScanQA (increasing BLEU-1 by +4.0\% and +4.2\% on the val set and test set, respectively). Furthermore, it outperforms the state-of-the-art method that uses additional GT point clouds on both ScanQA and 3DMV-VQA.
+
+ comment: Project Page: https://chtsy.github.io/uni3drr-page/ +
+
+
+
+
+ + ☆ Data Alignment for Zero-Shot Concept Generation in Dermatology AI + + +
+ AI in dermatology is evolving at a rapid pace but the major limitation to +training trustworthy classifiers is the scarcity of data with ground-truth +concept level labels, which are meta-labels semantically meaningful to humans. +Foundation models like CLIP providing zero-shot capabilities can help alleviate +this challenge by leveraging vast amounts of image-caption pairs available on +the internet. CLIP can be fine-tuned using domain specific image-caption pairs +to improve classification performance. However, CLIP's pre-training data is not +well-aligned with the medical jargon that clinicians use to perform diagnoses. +The development of large language models (LLMs) in recent years has led to the +possibility of leveraging the expressive nature of these models to generate +rich text. Our goal is to use these models to generate caption text that aligns +well with both the clinical lexicon and with the natural human language used in +CLIP's pre-training data. Starting with captions used for images in PubMed +articles, we extend them by passing the raw captions through an LLM fine-tuned +on the field's several textbooks. We find that using captions generated by an +expressive fine-tuned LLM like GPT-3.5 improves downstream zero-shot concept +classification performance. + +
+
+
+
+
+ + ☆ Analysis of Classifier-Free Guidance Weight Schedulers + + +
+ Classifier-Free Guidance (CFG) enhances the quality and condition adherence +of text-to-image diffusion models. It operates by combining the conditional and +unconditional predictions using a fixed weight. However, recent works vary the +weights throughout the diffusion process, reporting superior results but +without providing any rationale or analysis. By conducting comprehensive +experiments, this paper provides insights into CFG weight schedulers. Our +findings suggest that simple, monotonically increasing weight schedulers +consistently lead to improved performances, requiring merely a single line of +code. In addition, more complex parametrized schedulers can be optimized for +further improvement, but do not generalize across different models and tasks. + +
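+ The guidance rule and a monotonically increasing weight schedule fit in a few lines. The linear shape and weight range below are illustrative assumptions, and `model` is a placeholder denoiser rather than an API from the paper.
+
+ ```python
+ import torch
+
+ def linear_cfg_weight(t, T, w_min=1.0, w_max=9.0):
+     """Guidance weight that grows linearly as sampling proceeds (t runs from T down to 0)."""
+     progress = 1.0 - t / T          # 0 at the start of sampling, 1 at the end
+     return w_min + (w_max - w_min) * progress
+
+ def guided_eps(model, x_t, t, T, cond):
+     eps_cond = model(x_t, t, cond)      # conditional prediction
+     eps_uncond = model(x_t, t, None)    # unconditional (null-condition) prediction
+     w = linear_cfg_weight(t, T)
+     # standard CFG combination, but with a time-dependent weight instead of a fixed one
+     return eps_uncond + w * (eps_cond - eps_uncond)
+
+ dummy = lambda x, t, c: torch.zeros_like(x)     # stand-in denoiser for illustration
+ print(guided_eps(dummy, torch.randn(1, 3, 8, 8), t=500, T=1000, cond="a cat").shape)
+ ```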
+
+
+
+
+ + ☆ LaPA: Latent Prompt Assist Model For Medical Visual Question Answering CVPR + + +
+ Medical visual question answering (Med-VQA) aims to automate the prediction +of correct answers for medical images and questions, thereby assisting +physicians in reducing repetitive tasks and alleviating their workload. +Existing approaches primarily focus on pre-training models using additional and +comprehensive datasets, followed by fine-tuning to enhance performance in +downstream tasks. However, there is also significant value in exploring +existing models to extract clinically relevant information. In this paper, we +propose the Latent Prompt Assist model (LaPA) for medical visual question +answering. Firstly, we design a latent prompt generation module to generate the +latent prompt with the constraint of the target answer. Subsequently, we +propose a multi-modal fusion block with latent prompt fusion module that +utilizes the latent prompt to extract clinical-relevant information from +uni-modal and multi-modal features. Additionally, we introduce a prior +knowledge fusion module to integrate the relationship between diseases and +organs with the clinical-relevant information. Finally, we combine the final +integrated information with image-language cross-modal information to predict +the final answers. Experimental results on three publicly available Med-VQA +datasets demonstrate that LaPA outperforms the state-of-the-art model ARL, +achieving improvements of 1.83%, 0.63%, and 1.80% on VQA-RAD, SLAKE, and +VQA-2019, respectively. The code is publicly available at +https://github.com/GaryGuTC/LaPA_model. + +
+
+ comment: 10 pages, 4 figures, Accepted by CVPRW2024 +
+
+
+
+
+ + ☆ PhysDreamer: Physics-Based Interaction with 3D Objects via Video + Generation + + +
+ Realistic object interactions are crucial for creating immersive virtual +experiences, yet synthesizing realistic 3D object dynamics in response to novel +interactions remains a significant challenge. Unlike unconditional or +text-conditioned dynamics generation, action-conditioned dynamics requires +perceiving the physical material properties of objects and grounding the 3D +motion prediction on these properties, such as object stiffness. However, +estimating physical material properties is an open problem due to the lack of +material ground-truth data, as measuring these properties for real objects is +highly difficult. We present PhysDreamer, a physics-based approach that endows +static 3D objects with interactive dynamics by leveraging the object dynamics +priors learned by video generation models. By distilling these priors, +PhysDreamer enables the synthesis of realistic object responses to novel +interactions, such as external forces or agent manipulations. We demonstrate +our approach on diverse examples of elastic objects and evaluate the realism of +the synthesized interactions through a user study. PhysDreamer takes a step +towards more engaging and realistic virtual experiences by enabling static 3D +objects to dynamically respond to interactive stimuli in a physically plausible +manner. See our project page at https://physdreamer.github.io/. + +
+
+ comment: Project website at: https://physdreamer.github.io/ +
+
+
+
+
+ + ☆ BANF: Band-limited Neural Fields for Levels of Detail Reconstruction + + +
+ Largely due to their implicit nature, neural fields lack a direct mechanism +for filtering, as Fourier analysis from discrete signal processing is not +directly applicable to these representations. Effective filtering of neural +fields is critical to enable level-of-detail processing in downstream +applications, and support operations that involve sampling the field on regular +grids (e.g. marching cubes). Existing methods that attempt to decompose neural +fields in the frequency domain either resort to heuristics or require extensive +modifications to the neural field architecture. We show that via a simple +modification, one can obtain neural fields that are low-pass filtered, and in +turn show how this can be exploited to obtain a frequency decomposition of the +entire signal. We demonstrate the validity of our technique by investigating +level-of-detail reconstruction, and showing how coarser representations can be +computed effectively. + +
+
+ comment: Project Page: https://theialab.github.io/banf +
+
+
+
+
+ + ☆ Optimizing Calibration by Gaining Aware of Prediction Correctness + + +
+ Model calibration aims to align confidence with prediction correctness. The Cross-Entropy (CE) loss is widely used for calibrator training, which forces the model to increase confidence on the ground-truth class. However, we find the CE loss has intrinsic limitations. For example, for a narrow misclassification, a calibrator trained by the CE loss often produces high confidence on the wrongly predicted class (e.g., a test sample is wrongly classified and its softmax score on the ground-truth class is around 0.4), which is undesirable. In this paper, we propose a new post-hoc calibration objective derived from the aim of calibration. Intuitively, the proposed objective function asks that the calibrator decrease model confidence on wrongly predicted samples and increase confidence on correctly predicted samples. Because a sample itself has insufficient ability to indicate correctness, we use its transformed versions (e.g., rotated, greyscaled, and color-jittered) during calibrator training. Trained on an in-distribution validation set and tested with isolated, individual test samples, our method achieves competitive calibration performance on both in-distribution and out-of-distribution test sets compared with the state of the art. Further, our analysis points out the difference between our method and commonly used objectives such as the CE loss and the mean square error loss, where the latter sometimes deviate from the calibration aim.
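+ One way to read the proposed objective is as a correctness-prediction loss on the calibrator's confidence. The sketch below (binary cross-entropy on the max softmax probability) is our paraphrase of that idea, not the authors' exact formulation; the transformed views mentioned in the abstract would be averaged over in the same way.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def correctness_aware_loss(logits, labels):
+     """Push confidence up on correctly predicted samples and down on wrong ones."""
+     probs = logits.softmax(dim=-1)
+     conf, pred = probs.max(dim=-1)            # model confidence and predicted class
+     target = (pred == labels).float()         # 1 if the prediction is correct, else 0
+     return F.binary_cross_entropy(conf, target)
+
+ logits = torch.randn(8, 10)
+ labels = torch.randint(0, 10, (8,))
+ print(correctness_aware_loss(logits, labels))
+ ```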
+
+
+
+
+ + ☆ Groma: Localized Visual Tokenization for Grounding Multimodal Large + Language Models + + +
+ We introduce Groma, a Multimodal Large Language Model (MLLM) with grounded +and fine-grained visual perception ability. Beyond holistic image +understanding, Groma is adept at region-level tasks such as region captioning +and visual grounding. Such capabilities are built upon a localized visual +tokenization mechanism, where an image input is decomposed into regions of +interest and subsequently encoded into region tokens. By integrating region +tokens into user instructions and model responses, we seamlessly enable Groma +to understand user-specified region inputs and ground its textual output to +images. Besides, to enhance the grounded chat ability of Groma, we curate a +visually grounded instruction dataset by leveraging the powerful GPT-4V and +visual prompting techniques. Compared with MLLMs that rely on the language +model or external module for localization, Groma consistently demonstrates +superior performances in standard referring and grounding benchmarks, +highlighting the advantages of embedding localization into image tokenization. +Project page: https://groma-mllm.github.io/. + +
+
+
+
+
+ + ☆ Towards Robust Ferrous Scrap Material Classification with Deep Learning + and Conformal Prediction + + +
+ In the steel production domain, recycling ferrous scrap is essential for +environmental and economic sustainability, as it reduces both energy +consumption and greenhouse gas emissions. However, the classification of scrap +materials poses a significant challenge, requiring advancements in automation +technology. Additionally, building trust among human operators is a major +obstacle. Traditional approaches often fail to quantify uncertainty and lack +clarity in model decision-making, which complicates acceptance. In this +article, we describe how conformal prediction can be employed to quantify +uncertainty and add robustness in scrap classification. We have adapted the +Split Conformal Prediction technique to seamlessly integrate with +state-of-the-art computer vision models, such as the Vision Transformer (ViT), +Swin Transformer, and ResNet-50, while also incorporating Explainable +Artificial Intelligence (XAI) methods. We evaluate the approach using a +comprehensive dataset of 8147 images spanning nine ferrous scrap classes. The +application of the Split Conformal Prediction method allowed for the +quantification of each model's uncertainties, which enhanced the understanding +of predictions and increased the reliability of the results. Specifically, the +Swin Transformer model demonstrated more reliable outcomes than the others, as +evidenced by its smaller average size of prediction sets and achieving an +average classification accuracy exceeding 95%. Furthermore, the Score-CAM +method proved highly effective in clarifying visual features, significantly +enhancing the explainability of the classification decisions. + +
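+ Split conformal prediction itself is simple to sketch. The nonconformity score (1 minus the softmax probability of the true class) and the 95% coverage level below are standard choices, not necessarily the exact configuration used in the paper.
+
+ ```python
+ import numpy as np
+
+ def split_conformal_threshold(cal_probs, cal_labels, alpha=0.05):
+     """Calibrate a score threshold so prediction sets cover the true class with prob >= 1 - alpha."""
+     n = len(cal_labels)
+     scores = 1.0 - cal_probs[np.arange(n), cal_labels]   # nonconformity: 1 - p(true class)
+     level = np.ceil((n + 1) * (1 - alpha)) / n
+     return np.quantile(scores, level, method="higher")
+
+ def prediction_set(test_probs, qhat):
+     return [np.where(1.0 - p <= qhat)[0] for p in test_probs]   # classes kept in each set
+
+ rng = np.random.default_rng(0)                           # toy softmax outputs, 9 scrap classes
+ cal_probs = rng.dirichlet(np.ones(9), size=500)
+ cal_labels = rng.integers(0, 9, size=500)
+ qhat = split_conformal_threshold(cal_probs, cal_labels)
+ print(prediction_set(rng.dirichlet(np.ones(9), size=3), qhat))
+ ```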
+
+
+
+
+ + ☆ RadRotator: 3D Rotation of Radiographs with Diffusion Models + + +
+ Transforming two-dimensional (2D) images into three-dimensional (3D) volumes +is a well-known yet challenging problem for the computer vision community. In +the medical domain, a few previous studies attempted to convert two or more +input radiographs into computed tomography (CT) volumes. Following their +effort, we introduce a diffusion model-based technology that can rotate the +anatomical content of any input radiograph in 3D space, potentially enabling +the visualization of the entire anatomical content of the radiograph from any +viewpoint in 3D. Similar to previous studies, we used CT volumes to create +Digitally Reconstructed Radiographs (DRRs) as the training data for our model. +However, we addressed two significant limitations encountered in previous +studies: 1. We utilized conditional diffusion models with classifier-free +guidance instead of Generative Adversarial Networks (GANs) to achieve higher +mode coverage and improved output image quality, with the only trade-off being +slower inference time, which is often less critical in medical applications; +and 2. We demonstrated that the unreliable output of style transfer deep +learning (DL) models, such as Cycle-GAN, to transfer the style of actual +radiographs to DRRs could be replaced with a simple yet effective training +transformation that randomly changes the pixel intensity histograms of the +input and ground-truth imaging data during training. This transformation makes +the diffusion model agnostic to any distribution variations of the input data +pixel intensity, enabling the reliable training of a DL model on input DRRs and +applying the exact same model to conventional radiographs (or DRRs) during +inference. + +
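+ The training transformation in point 2 could look roughly like the random intensity remapping below, applied identically to the input and ground-truth views during training. The gamma/window family is an assumption for illustration; the abstract does not commit to this exact mapping.
+
+ ```python
+ import numpy as np
+
+ def random_intensity_remap(img, rng):
+     """Randomly reshape the pixel-intensity histogram (img assumed scaled to [0, 1])."""
+     gamma = rng.uniform(0.5, 2.0)                       # random gamma curve
+     lo, hi = rng.uniform(0.0, 0.2), rng.uniform(0.8, 1.0)
+     out = np.clip(img, 0.0, 1.0) ** gamma               # non-linear remap
+     return lo + (hi - lo) * out                         # random window / contrast stretch
+
+ rng = np.random.default_rng(42)
+ drr = np.random.rand(256, 256)                          # stand-in for a DRR
+ aug = random_intensity_remap(drr, rng)                  # same draw applied to input and target
+ ```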
+
+ comment: Website: https://pouriarouzrokh.github.io/RadRotator Online demo: + https://huggingface.co/spaces/Pouriarouzrokh/RadRotator Article information: + 16 pages, 11 figures +
+
+
+
+
+ + ☆ Nuclei Instance Segmentation of Cryosectioned H&E Stained Histological + Images using Triple U-Net Architecture + + +
+ Nuclei instance segmentation is crucial in oncological diagnosis and cancer pathology research. H&E stained images are commonly used for medical diagnosis, but pre-processing is necessary before using them for image processing tasks. Two principal pre-processing methods are formalin-fixed paraffin-embedded samples (FFPE) and frozen tissue samples (FS). FFPE is widely used but time-consuming, whereas FS samples can be processed quickly. Analyzing H&E stained images derived from fast sample preparation, staining, and scanning can pose difficulties due to the swift process, which can result in the degradation of image quality. This paper proposes a method that leverages the unique optical characteristics of H&E stained images. A three-branch U-Net architecture has been implemented, where each branch contributes to the final segmentation results. The process includes applying the watershed algorithm to separate overlapping regions and enhance accuracy. The Triple U-Net architecture comprises an RGB branch, a Hematoxylin branch, and a Segmentation branch. This study focuses on a novel dataset named CryoNuSeg. The results obtained through robust experiments outperform the state-of-the-art results across various metrics. The benchmark score for this dataset is an AJI of 52.5 and a PQ of 47.7, achieved with a U-Net architecture. The proposed Triple U-Net architecture, however, achieves an AJI score of 67.41 and a PQ of 50.56. The proposed architecture improves more on AJI than on the other evaluation metrics, which further justifies the superiority of the Triple U-Net architecture over the baseline U-Net model, as AJI is a stricter evaluation metric. The use of the three-branch U-Net model, followed by watershed post-processing, significantly surpasses the benchmark scores, showing substantial improvement in the AJI score.
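+ The watershed post-processing step is standard and can be sketched with scikit-image; the threshold and peak distance below are illustrative, not the paper's tuned values.
+
+ ```python
+ import numpy as np
+ from scipy import ndimage as ndi
+ from skimage.feature import peak_local_max
+ from skimage.segmentation import watershed
+
+ def split_touching_nuclei(prob_map, threshold=0.5, min_distance=5):
+     """Turn a fused foreground probability map into instance labels via watershed."""
+     mask = prob_map > threshold
+     distance = ndi.distance_transform_edt(mask)
+     peaks = peak_local_max(distance, min_distance=min_distance, labels=mask)
+     markers = np.zeros_like(mask, dtype=int)
+     markers[tuple(peaks.T)] = np.arange(1, len(peaks) + 1)
+     return watershed(-distance, markers, mask=mask)     # one label per nucleus
+
+ instances = split_touching_nuclei(np.random.rand(128, 128))
+ print(instances.max(), "instances found")
+ ```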
+
+ comment: To be published in "6th IVPR & 11th ICIEV" +
+
+
+
+
+ + ☆ Cross-modal Diffusion Modelling for Super-resolved Spatial + Transcriptomics + + +
+ The recent advancement of spatial transcriptomics (ST) allows to characterize +spatial gene expression within tissue for discovery research. However, current +ST platforms suffer from low resolution, hindering in-depth understanding of +spatial gene expression. Super-resolution approaches promise to enhance ST maps +by integrating histology images with gene expressions of profiled tissue spots. +However, current super-resolution methods are limited by restoration +uncertainty and mode collapse. Although diffusion models have shown promise in +capturing complex interactions between multi-modal conditions, it remains a +challenge to integrate histology images and gene expression for super-resolved +ST maps. This paper proposes a cross-modal conditional diffusion model for +super-resolving ST maps with the guidance of histology images. Specifically, we +design a multi-modal disentangling network with cross-modal adaptive modulation +to utilize complementary information from histology images and spatial gene +expression. Moreover, we propose a dynamic cross-attention modelling strategy +to extract hierarchical cell-to-tissue information from histology images. +Lastly, we propose a co-expression-based gene-correlation graph network to +model the co-expression relationship of multiple genes. Experiments show that +our method outperforms other state-of-the-art methods in ST super-resolution on +three public datasets. + +
+
+
+
+
+ + ☆ Eyes Can Deceive: Benchmarking Counterfactual Reasoning Abilities of + Multi-modal Large Language Models + + +
+ Counterfactual reasoning, as a crucial manifestation of human intelligence, +refers to making presuppositions based on established facts and extrapolating +potential outcomes. Existing multimodal large language models (MLLMs) have +exhibited impressive cognitive and reasoning capabilities, which have been +examined across a wide range of Visual Question Answering (VQA) benchmarks. +Nevertheless, how will existing MLLMs perform when faced with counterfactual +questions? To answer this question, we first curate a novel +\textbf{C}ounter\textbf{F}actual \textbf{M}ulti\textbf{M}odal reasoning +benchmark, abbreviated as \textbf{CFMM}, to systematically assess the +counterfactual reasoning capabilities of MLLMs. Our CFMM comprises six +challenging tasks, each including hundreds of carefully human-labeled +counterfactual questions, to evaluate MLLM's counterfactual reasoning +capabilities across diverse aspects. Through experiments, interestingly, we +find that existing MLLMs prefer to believe what they see, but ignore the +counterfactual presuppositions presented in the question, thereby leading to +inaccurate responses. Furthermore, we evaluate a wide range of prevalent MLLMs +on our proposed CFMM. The significant gap between their performance on our CFMM +and that on several VQA benchmarks indicates that there is still considerable +room for improvement in existing MLLMs toward approaching human-level +intelligence. On the other hand, through boosting MLLMs performances on our +CFMM in the future, potential avenues toward developing MLLMs with advanced +intelligence can be explored. + +
+
+
+
+
+ + ☆ Improving Pediatric Pneumonia Diagnosis with Adult Chest X-ray Images + Utilizing Contrastive Learning and Embedding Similarity + + +
+ Despite the advancement of deep learning-based computer-aided diagnosis (CAD) methods for pneumonia from adult chest x-ray (CXR) images, the performance of CAD methods applied to pediatric images remains suboptimal, mainly due to the lack of large-scale annotated pediatric imaging datasets. Establishing a proper framework to leverage existing large-scale adult CXR datasets can thus enhance pediatric pneumonia detection performance. In this paper, we propose a three-branch parallel path learning-based framework that utilizes both adult and pediatric datasets to improve the performance of deep learning models on pediatric test datasets. The paths are trained with pediatric-only, adult-only, and both types of CXRs, respectively. Our proposed framework utilizes a multi-positive contrastive loss to cluster the class-wise embeddings, and an embedding similarity loss among these three parallel paths to bring the class-wise embeddings as close as possible and reduce the effect of domain shift. Experimental evaluations on open-access adult and pediatric CXR datasets show that the proposed method achieves a superior AUROC score of 0.8464, compared to 0.8348 obtained using the conventional approach of joint training on both datasets. The proposed approach thus paves the way for generalized CAD models that are effective for both adult and pediatric age groups.
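+ The multi-positive contrastive term can be sketched as a supervised-contrastive-style loss in which every same-class sample in the batch acts as a positive. This is a generic version of that idea, not necessarily the authors' exact implementation.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def multi_positive_contrastive(embeddings, labels, temperature=0.1):
+     """Pull together embeddings that share a class label, push apart the rest."""
+     z = F.normalize(embeddings, dim=1)
+     sim = z @ z.t() / temperature
+     self_mask = torch.eye(sim.size(0), dtype=torch.bool, device=sim.device)
+     exp_sim = sim.exp().masked_fill(self_mask, 0.0)     # denominator excludes self-similarity
+     log_prob = sim - exp_sim.sum(1, keepdim=True).log()
+     pos = ((labels.unsqueeze(0) == labels.unsqueeze(1)) & ~self_mask).float()
+     # average log-likelihood over each anchor's positives (anchors without positives contribute 0)
+     return -(log_prob * pos).sum(1).div(pos.sum(1).clamp(min=1)).mean()
+
+ emb = torch.randn(16, 128)                  # embeddings from the three parallel paths
+ lbl = torch.randint(0, 2, (16,))            # pneumonia vs. normal
+ print(multi_positive_contrastive(emb, lbl))
+ ```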
+
+ comment: Accepted to International Conference of IEEE Engineering in Medicine + and Biology Society (EMBC), 2024 +
+
+
+
+
+ + ☆ Next Generation Loss Function for Image Classification + + +
+ Neural networks are trained by minimizing a loss function that defines the
+discrepancy between the predicted model output and the target value. The
+selection of the loss function is crucial to achieve task-specific behaviour
+and highly influences the capability of the model. A variety of loss functions
+have been proposed for a wide range of tasks affecting training and model
+performance. For classification tasks, the cross entropy is the de-facto
+standard and usually the first choice. Here, we try to experimentally challenge
+the well-known loss functions, including cross entropy (CE) loss, by utilizing
+the genetic programming (GP) approach, a population-based evolutionary
+algorithm. GP constructs loss functions from a set of operators and leaf nodes
+and these functions are repeatedly recombined and mutated to find an optimal
+structure. Experiments were carried out on the small-sized datasets CIFAR-10,
+CIFAR-100 and Fashion-MNIST using an Inception model. The 5 best functions
+found were evaluated for different model architectures on a set of standard
+datasets ranging from 2 to 102 classes and of very different sizes. One
+function, denoted as Next Generation Loss (NGL), clearly stood out, showing the
+same or better performance for all tested datasets compared to CE. To evaluate
+the NGL function on a large-scale dataset, we tested its performance on the
+Imagenet-1k dataset, where it showed improved top-1 accuracy compared to models
+trained with identical settings and other losses. Finally, the NGL was used to
+train a downstream segmentation task on the Pascal VOC 2012 and COCO-Stuff164k
+datasets, improving the underlying model performance.
+
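+ The NGL formula itself is not given in the abstract; the toy sketch below
+(plain NumPy, synthetic 2-class data) only illustrates the kind of search the
+paper describes, scoring candidate loss functions assembled from a small
+operator set by the validation accuracy they induce. The operator set, the
+random-search stand-in for GP recombination/mutation, and every hyperparameter
+are assumptions.
+
+```python
+import random
+import numpy as np
+
+# Toy setup: a linear classifier on synthetic 2-class data (not the paper's setup).
+rng = np.random.default_rng(0)
+X = rng.normal(size=(200, 2))
+y = (X[:, 0] + X[:, 1] > 0).astype(float)
+
+def predict_prob(w, X):
+    return 1.0 / (1.0 + np.exp(-(X @ w)))
+
+# primitive building blocks mapping (p, y) -> per-sample loss
+PRIMITIVES = [
+    lambda p, y: -(y * np.log(p + 1e-8) + (1 - y) * np.log(1 - p + 1e-8)),  # CE
+    lambda p, y: (p - y) ** 2,                                              # MSE
+    lambda p, y: np.abs(p - y),                                             # MAE
+]
+COMBINERS = [lambda a, b: a + b, lambda a, b: a * b, lambda a, b: np.maximum(a, b)]
+
+def random_loss():
+    # stand-in for GP's recombine/mutate: randomly compose two primitives
+    f, g, c = random.choice(PRIMITIVES), random.choice(PRIMITIVES), random.choice(COMBINERS)
+    return lambda p, y: c(f(p, y), g(p, y))
+
+def fitness(loss_fn, steps=200, lr=0.5, eps=1e-4):
+    w = np.zeros(2)
+    for _ in range(steps):
+        base = loss_fn(predict_prob(w, X), y).mean()
+        grad = np.zeros(2)
+        for i in range(2):                       # numerical gradient keeps the toy generic
+            w2 = w.copy(); w2[i] += eps
+            grad[i] = (loss_fn(predict_prob(w2, X), y).mean() - base) / eps
+        w -= lr * grad
+    return (predict_prob(w, X).round() == y).mean()   # validation accuracy as fitness
+
+random.seed(0)
+population = [random_loss() for _ in range(8)]
+scores = [fitness(f) for f in population]
+print("best candidate accuracy:", max(scores))
+```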
+
+
+
+
+ + ☆ Purposer: Putting Human Motion Generation in Context + + +
+ We present a novel method to generate human motion to populate 3D indoor
+scenes. It can be controlled with various combinations of conditioning signals
+such as a path in a scene, target poses, past motions, and scenes represented
+as 3D point clouds. State-of-the-art methods are either models specialized to
+one single setting, require vast amounts of high-quality and diverse training
+data, or are unconditional models that do not integrate scene or other
+contextual information. As a consequence, they have limited applicability and
+rely on costly training data. To address these limitations, we propose a new
+method, dubbed Purposer, based on neural discrete representation learning. Our
+model is capable of exploiting, in a flexible manner, different types of
+information already present in open access large-scale datasets such as AMASS.
+First, we encode unconditional human motion into a discrete latent space.
+Second, an autoregressive generative model, conditioned with key contextual
+information, either with prompting or additive tokens, and trained for
+next-step prediction in this space, synthesizes sequences of latent indices. We
+further design a novel conditioning block to handle future conditioning
+information in such a causal model by using a network with two branches to
+compute separate stacks of features. In this manner, Purposer can generate
+realistic motion sequences in diverse test scenes. Through exhaustive
+evaluation, we demonstrate that our multi-contextual solution outperforms
+existing specialized approaches for specific contextual information, both in
+terms of quality and diversity. Our model is trained with short sequences, but
+a byproduct of being able to use various conditioning signals is that at test
+time different combinations can be used to chain short sequences together and
+generate long motions within a context scene.
+
+
+
+
+
+ + ☆ Neural Flow Diffusion Models: Learnable Forward Process for Improved + Diffusion Modelling + + +
+ Conventional diffusion models typically rely on a fixed forward process,
+which implicitly defines complex marginal distributions over latent variables.
+This can often complicate the reverse process's task in learning generative
+trajectories, and results in costly inference for diffusion models. To address
+these limitations, we introduce Neural Flow Diffusion Models (NFDM), a novel
+framework that enhances diffusion models by supporting a broader range of
+forward processes beyond the fixed linear Gaussian. We also propose a novel
+parameterization technique for learning the forward process. Our framework
+provides an end-to-end, simulation-free optimization objective, effectively
+minimizing a variational upper bound on the negative log-likelihood.
+Experimental results demonstrate NFDM's strong performance, evidenced by
+state-of-the-art likelihood estimation. Furthermore, we investigate NFDM's
+capacity for learning generative dynamics with specific characteristics, such
+as deterministic straight-line trajectories. This exploration underscores
+NFDM's versatility and its potential for a wide range of applications.
+
+
+
+
+
+ + ☆ A Hybrid Generative and Discriminative PointNet on Unordered Point Sets + + +
+ As point clouds provide a natural and flexible representation usable in
+myriad applications (e.g., robotics and self-driving cars), the ability to
+synthesize point clouds for analysis becomes crucial. Recently, Xie et al.
+proposed a generative model for unordered point sets in the form of an
+energy-based model (EBM). Despite the model achieving impressive performance
+for point cloud generation, one separate model needs to be trained for each
+category to capture the complex point set distributions. Besides, their method
+is unable to classify point clouds directly and requires additional fine-tuning
+for classification. One interesting question is: Can we train a single network
+for a hybrid generative and discriminative model of point clouds? A similar
+question has recently been answered in the affirmative for images, introducing
+the framework of Joint Energy-based Model (JEM), which achieves high
+performance in image classification and generation simultaneously. This paper
+proposes GDPNet, the first hybrid Generative and Discriminative PointNet that
+extends JEM for point cloud classification and generation. Our GDPNet retains
+the strong discriminative power of modern PointNet classifiers, while generating
+point cloud samples rivaling state-of-the-art generative approaches.
+
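+ The JEM reinterpretation referenced above treats classifier logits f(x) as an
+energy-based model, with E(x) = -logsumexp_y f(x)[y] serving as the
+unnormalized negative log-density while softmax(f(x)) still gives p(y|x). A
+minimal PyTorch sketch of that head follows; the flat MLP encoder is only a
+placeholder for a real (permutation-invariant) PointNet backbone, and the SGLD
+sampling used to train the generative part is not shown.
+
+```python
+import torch
+import torch.nn as nn
+
+class JEMHead(nn.Module):
+    """Treat a classifier's logits as an energy-based model (JEM-style)."""
+    def __init__(self, feat_dim=256, num_classes=10, num_points=1024):
+        super().__init__()
+        # placeholder encoder; a real PointNet would go here
+        self.encoder = nn.Sequential(nn.Linear(3 * num_points, feat_dim), nn.ReLU(),
+                                     nn.Linear(feat_dim, feat_dim), nn.ReLU())
+        self.classifier = nn.Linear(feat_dim, num_classes)
+
+    def forward(self, points):                      # points: (B, N, 3)
+        logits = self.classifier(self.encoder(points.flatten(1)))
+        energy = -torch.logsumexp(logits, dim=1)    # unnormalized -log p(x)
+        return logits, energy
+
+model = JEMHead()
+pts = torch.randn(4, 1024, 3)
+logits, energy = model(pts)
+print(logits.shape, energy.shape)   # torch.Size([4, 10]) torch.Size([4])
+```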
+
+
+
+
+ + ☆ Is Retain Set All You Need in Machine Unlearning? Restoring Performance + of Unlearned Models with Out-Of-Distribution Images + + +
+ In this paper, we introduce Selective-distillation for Class and
+Architecture-agnostic unleaRning (SCAR), a novel approximate unlearning method.
+SCAR efficiently eliminates specific information while preserving the model's
+test accuracy without using a retain set, which is a key component in
+state-of-the-art approximate unlearning algorithms. Our approach utilizes a
+modified Mahalanobis distance to guide the unlearning of the feature vectors of
+the instances to be forgotten, aligning them to the nearest wrong class
+distribution. Moreover, we propose a distillation-trick mechanism that distills
+the knowledge of the original model into the unlearning model with
+out-of-distribution images for retaining the original model's test performance
+without using any retain set. Importantly, we propose a self-forget version of
+SCAR that unlearns without having access to the forget set. We experimentally
+verified the effectiveness of our method on three public datasets, comparing
+it with state-of-the-art methods. Our method obtains higher performance than
+methods that operate without the retain set, and performance comparable to the
+best methods that rely on the retain set.
+
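+ A hedged sketch of the nearest-wrong-class assignment step under a
+Mahalanobis distance, assuming per-class feature means and a shared inverse
+covariance are available; the distillation trick and the actual SCAR losses
+are not reproduced, and the identity covariance below is a placeholder.
+
+```python
+import torch
+
+def nearest_wrong_class(features, true_labels, class_means, class_cov_inv):
+    """Assign each forget-sample to its closest *incorrect* class under the
+    Mahalanobis distance d(f, c) = (f - mu_c)^T Sigma^{-1} (f - mu_c)."""
+    diffs = features.unsqueeze(1) - class_means.unsqueeze(0)          # (N, C, D)
+    d2 = torch.einsum('ncd,de,nce->nc', diffs, class_cov_inv, diffs)  # (N, C)
+    d2.scatter_(1, true_labels.unsqueeze(1), float('inf'))            # exclude own class
+    return d2.argmin(dim=1)
+
+# toy usage: 5 classes, 64-dim features
+torch.manual_seed(0)
+feats = torch.randn(8, 64)
+labels = torch.randint(0, 5, (8,))
+means = torch.randn(5, 64)
+cov_inv = torch.eye(64)            # identity covariance as a placeholder
+targets = nearest_wrong_class(feats, labels, means, cov_inv)
+# an unlearning step could then pull `feats` toward means[targets], e.g.
+align_loss = ((feats - means[targets]) ** 2).mean()
+print(targets, align_loss.item())
+```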
+
+
+
+
+ + ☆ Zero-Shot Medical Phrase Grounding with Off-the-shelf Diffusion Models + + +
+ Localizing the exact pathological regions in a given medical scan is an
+important imaging problem that requires a large amount of bounding box ground
+truth annotations to be accurately solved. However, there exist alternative,
+potentially weaker, forms of supervision, such as accompanying free-text
+reports, which are readily available. The task of performing localization with
+textual guidance is commonly referred to as phrase grounding. In this work, we
+use a publicly available Foundation Model, namely the Latent Diffusion Model,
+to solve this challenging task. This choice is supported by the fact that the
+Latent Diffusion Model, despite being generative in nature, contains mechanisms
+(cross-attention) that implicitly align visual and textual features, thus
+leading to intermediate representations that are suitable for the task at hand.
+In addition, we aim to perform this task in a zero-shot manner, i.e., without
+any further training on target data, meaning that the model's weights remain
+frozen. To this end, we devise strategies to select features and also refine
+them via post-processing without extra learnable parameters. We compare our
+proposed method with state-of-the-art approaches which explicitly enforce
+image-text alignment in a joint embedding space via contrastive learning.
+Results on a popular chest X-ray benchmark indicate that our method is
+competitive with SOTA on different types of pathology, and even outperforms them
+on average in terms of two metrics (mean IoU and AUC-ROC). Source code will be
+released upon acceptance.
+
+
+ comment: 8 pages, 3 figures, submitted to IEEE J-BHI Special Issue on + Foundation Models in Medical Imaging +
+
+
+
+
+ + ☆ Zero-Shot Stitching in Reinforcement Learning using Relative + Representations + + +
+ Visual Reinforcement Learning is a popular and powerful framework that takes
+full advantage of the Deep Learning breakthrough. However, it is also known
+that variations in the input (e.g., different colors of the panorama due to the
+season of the year) or the task (e.g., changing the speed limit for a car to
+respect) could require complete retraining of the agents. In this work, we
+leverage recent developments in unifying latent representations to demonstrate
+that it is possible to combine the components of an agent, rather than retrain
+it from scratch. We build upon the recent relative representations framework
+and adapt it for Visual RL. This allows us to create completely new agents
+capable of handling environment-task combinations never seen during training.
+Our work paves the way toward a more accessible and flexible use of
+reinforcement learning.
+
+
+ comment: 13 pages, 10 figures, 4 tables +
+
+
+
+
+ + ☆ Robust CLIP-Based Detector for Exposing Diffusion Model-Generated Images + + +
+ Diffusion models (DMs) have revolutionized image generation, producing
+high-quality images with applications spanning various fields. However, their
+ability to create hyper-realistic images poses significant challenges in
+distinguishing between real and synthetic content, raising concerns about
+digital authenticity and potential misuse in creating deepfakes. This work
+introduces a robust detection framework that integrates image and text features
+extracted by the CLIP model with a Multilayer Perceptron (MLP) classifier. We
+propose a novel loss that can improve the detector's robustness and handle
+imbalanced datasets. Additionally, we flatten the loss landscape during the
+model training to improve the detector's generalization capabilities. The
+effectiveness of our method, which outperforms traditional detection
+techniques, is demonstrated through extensive experiments, underscoring its
+potential to set a new state of the art in DM-generated image
+detection. The code is available at
+https://github.com/Purdue-M2/Robust_DM_Generated_Image_Detection.
+
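+ Conceptually, the detector is an MLP over frozen CLIP embeddings. The sketch
+below assumes the image/text embeddings are produced elsewhere by a frozen
+CLIP model and passed in as tensors; the proposed robust loss and the
+loss-landscape flattening are not shown, and plain BCE is used instead.
+
+```python
+import torch
+import torch.nn as nn
+
+class CLIPFeatureDetector(nn.Module):
+    """MLP head over frozen CLIP embeddings for real-vs-generated detection.
+
+    The CLIP encoders themselves are assumed to be external and frozen; this
+    module only consumes their (image, text) embeddings.
+    """
+    def __init__(self, img_dim=512, txt_dim=512, hidden=256):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(img_dim + txt_dim, hidden), nn.ReLU(),
+            nn.Linear(hidden, hidden), nn.ReLU(),
+            nn.Linear(hidden, 1),               # logit: >0 means "generated"
+        )
+
+    def forward(self, image_emb, text_emb):
+        return self.mlp(torch.cat([image_emb, text_emb], dim=-1)).squeeze(-1)
+
+detector = CLIPFeatureDetector()
+img_emb, txt_emb = torch.randn(4, 512), torch.randn(4, 512)
+logits = detector(img_emb, txt_emb)
+loss = nn.functional.binary_cross_entropy_with_logits(logits, torch.ones(4))
+print(logits.shape, loss.item())
+```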
+
+
+
+
+ + ☆ Training-and-prompt-free General Painterly Harmonization Using + Image-wise Attention Sharing + + +
+ Painterly Image Harmonization aims at seamlessly blending disparate visual
+elements within a single coherent image. However, previous approaches often
+encounter significant limitations due to training data constraints, the need
+for time-consuming fine-tuning, or reliance on additional prompts. To surmount
+these hurdles, we design a Training-and-prompt-Free General Painterly
+Harmonization method using image-wise attention sharing (TF-GPH), which
+integrates a novel "share-attention module". This module redefines the
+traditional self-attention mechanism by allowing for comprehensive image-wise
+attention, facilitating the use of a state-of-the-art pretrained latent
+diffusion model without the typical training data limitations. Additionally, we
+introduce a "similarity reweighting" mechanism that enhances performance by
+effectively harnessing cross-image information, surpassing the capabilities of
+fine-tuning or prompt-based approaches. Finally, we recognize the deficiencies
+in existing benchmarks and propose the "General Painterly Harmonization
+Benchmark", which employs range-based evaluation metrics to more accurately
+reflect real-world applications. Extensive experiments demonstrate the superior
+efficacy of our method across various benchmarks. The code and web demo are
+available at https://github.com/BlueDyee/TF-GPH.
+
+
+
+
+
+ + ☆ Learn2Talk: 3D Talking Face Learns from 2D Talking Face + + +
+ Speech-driven facial animation methods usually contain two main classes, 3D
+and 2D talking face, both of which have attracted considerable research
+attention in recent years. However, to the best of our knowledge, research on
+3D talking faces has not gone as deep as that on 2D talking faces in terms of
+lip-synchronization (lip-sync) and speech perception. To bridge the gap between
+the two sub-fields, we propose a learning framework named Learn2Talk, which can
+construct a better 3D talking face network by exploiting two expertise points
+from the field of 2D talking face. Firstly, inspired by the audio-video sync
+network, a 3D sync-lip expert model is devised for the pursuit of lip-sync
+between audio and 3D facial motion. Secondly, a teacher model selected from 2D
+talking face methods is used to guide the training of the audio-to-3D motions
+regression network to yield higher 3D vertex accuracy. Extensive experiments
+show the advantages of the proposed framework in terms of lip-sync, vertex
+accuracy and speech perception, compared with the state of the art. Finally, we
+show two applications of the proposed framework: audio-visual speech
+recognition and speech-driven 3D Gaussian Splatting based avatar animation.
+
+
+
+
+
+ + ☆ 3D Multi-frame Fusion for Video Stabilization CVPR 2024 + + +
+ In this paper, we present RStab, a novel framework for video stabilization
+that integrates 3D multi-frame fusion through volume rendering. Departing from
+conventional methods, we introduce a 3D multi-frame perspective to generate
+stabilized images, addressing the challenge of full-frame generation while
+preserving structure. The core of our RStab framework lies in Stabilized
+Rendering (SR), a volume rendering module that extends beyond image fusion by
+incorporating feature fusion and fuses multi-frame information in 3D space.
+Specifically, SR involves warping features and colors from multiple frames by
+projection, fusing them into descriptors to render the stabilized image.
+However, the precision of warped information depends on the projection
+accuracy, a factor significantly influenced by dynamic regions. In response, we
+introduce the Adaptive Ray Range (ARR) module to integrate depth priors,
+adaptively defining the sampling range for the projection process.
+Additionally, we propose Color Correction (CC) assisting geometric constraints
+with optical flow for accurate color aggregation. Thanks to the three modules,
+our RStab demonstrates superior performance compared with previous stabilizers
+in the field of view (FOV), image quality, and video stability across various
+datasets.
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ MCM: Multi-condition Motion Synthesis Framework + + +
+ Conditional human motion synthesis (HMS) aims to generate human motion +sequences that conform to specific conditions. Text and audio represent the two +predominant modalities employed as HMS control conditions. While existing +research has primarily focused on single conditions, the multi-condition human +motion synthesis remains underexplored. In this study, we propose a +multi-condition HMS framework, termed MCM, based on a dual-branch structure +composed of a main branch and a control branch. This framework effectively +extends the applicability of the diffusion model, which is initially predicated +solely on textual conditions, to auditory conditions. This extension +encompasses both music-to-dance and co-speech HMS while preserving the +intrinsic quality of motion and the capabilities for semantic association +inherent in the original model. Furthermore, we propose the implementation of a +Transformer-based diffusion model, designated as MWNet, as the main branch. +This model adeptly apprehends the spatial intricacies and inter-joint +correlations inherent in motion sequences, facilitated by the integration of +multi-wise self-attention modules. Extensive experiments show that our method +achieves competitive results in single-condition and multi-condition HMS tasks. + +
+
+
+
+
+ + ☆ A Large-scale Medical Visual Task Adaptation Benchmark + + +
+ Visual task adaptation has been demonstrated to be effective in adapting
+pre-trained Vision Transformers (ViTs) to general downstream visual tasks using
+specialized learnable layers or tokens. However, there is not yet a large-scale
+benchmark to fully explore the effect of visual task adaptation on the
+realistic and important medical domain, particularly across diverse medical
+visual modalities, such as color images, X-ray, and CT. To close this gap, we
+present Med-VTAB, a large-scale Medical Visual Task Adaptation Benchmark
+consisting of 1.68 million medical images for diverse organs, modalities, and
+adaptation approaches. Based on Med-VTAB, we explore the scaling law of medical
+prompt tuning concerning tunable parameters and the generalizability of medical
+visual adaptation using non-medical/medical pre-trained weights. Besides, we
+study the impact of patient ID out-of-distribution on medical visual
+adaptation, which is a real and challenging scenario. Furthermore, results from
+Med-VTAB indicate that a single pre-trained model falls short in medical task
+adaptation. Therefore, we introduce GMoE-Adapter, a novel method that combines
+medical and general pre-training weights through a gated mixture-of-experts
+adapter, achieving state-of-the-art results in medical visual task adaptation.
+
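+ A minimal sketch of what a gated mixture-of-experts adapter could look like
+inside a ViT block, assuming two bottleneck experts (in the paper's setting,
+one carrying medical and one carrying general pre-training knowledge) mixed by
+a per-token softmax gate; the dimensions, the gating form and the placement in
+the block are assumptions.
+
+```python
+import torch
+import torch.nn as nn
+
+class GatedMoEAdapter(nn.Module):
+    """Two bottleneck adapters mixed by a per-token learned gate.
+
+    In the paper's setting one expert would be initialized from medical and the
+    other from general pre-training; here both are randomly initialized.
+    """
+    def __init__(self, dim=768, bottleneck=64, num_experts=2):
+        super().__init__()
+        self.experts = nn.ModuleList([
+            nn.Sequential(nn.Linear(dim, bottleneck), nn.GELU(),
+                          nn.Linear(bottleneck, dim))
+            for _ in range(num_experts)
+        ])
+        self.gate = nn.Linear(dim, num_experts)
+
+    def forward(self, x):                       # x: (B, N, D) ViT tokens
+        weights = self.gate(x).softmax(dim=-1)  # (B, N, E)
+        expert_out = torch.stack([e(x) for e in self.experts], dim=-1)  # (B, N, D, E)
+        mixed = (expert_out * weights.unsqueeze(2)).sum(-1)
+        return x + mixed                        # residual adapter
+
+tokens = torch.randn(2, 197, 768)
+print(GatedMoEAdapter()(tokens).shape)          # torch.Size([2, 197, 768])
+```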
+
+
+
+
+ + ☆ FipTR: A Simple yet Effective Transformer Framework for Future Instance + Prediction in Autonomous Driving + + +
+ The future instance prediction from a Bird's Eye View (BEV) perspective is a
+vital component in autonomous driving, which involves future instance
+segmentation and instance motion prediction. Existing methods usually rely on a
+redundant and complex pipeline which requires multiple auxiliary outputs and
+post-processing procedures. Moreover, estimation errors in each of the auxiliary
+predictions will lead to degradation of the prediction performance. In this
+paper, we propose a simple yet effective fully end-to-end framework named
+Future Instance Prediction Transformer (FipTR), which views the task as BEV
+instance segmentation and prediction for future frames. We propose to adopt
+instance queries representing specific traffic participants to directly
+estimate the corresponding future occupied masks, and thus get rid of complex
+post-processing procedures. Besides, we devise a flow-aware BEV predictor for
+future BEV feature prediction composed of a flow-aware deformable attention
+that takes the backward flow to guide the offset sampling. A novel future
+instance matching strategy is also proposed to further improve the temporal
+coherence. Extensive experiments demonstrate the superiority of FipTR and its
+effectiveness under different temporal BEV encoders.
+
+
+
+
+
+ + ☆ How Does the Textual Information Affect the Retrieval of Multimodal + In-Context Learning? + + +
+ The increase in parameter size of multimodal large language models (MLLMs) +introduces significant capabilities, particularly in-context learning, where +MLLMs enhance task performance without updating pre-trained parameters. This +effectiveness, however, hinges on the appropriate selection of in-context +examples, a process that is currently biased towards visual data, overlooking +textual information. Furthermore, the area of supervised retrievers for MLLMs, +crucial for optimal in-context example selection, continues to be +uninvestigated. Our study offers an in-depth evaluation of the impact of +textual information on the unsupervised selection of in-context examples in +multimodal contexts, uncovering a notable sensitivity of retriever performance +to the employed modalities. Responding to this, we introduce a novel supervised +MLLM-retriever MSIER that employs a neural network to select examples that +enhance multimodal in-context learning efficiency. This approach is validated +through extensive testing across three distinct tasks, demonstrating the +method's effectiveness. Additionally, we investigate the influence of +modalities on our supervised retrieval method's training and pinpoint factors +contributing to our model's success. This exploration paves the way for future +advancements, highlighting the potential for refined in-context learning in +MLLMs through the strategic use of multimodal data. + +
+
+
+
+
+ + ☆ Foundation Model assisted Weakly Supervised LiDAR Semantic Segmentation + + +
+ Current point cloud semantic segmentation has achieved great advances when
+given sufficient labels. However, the dense annotation of LiDAR point clouds
+remains prohibitively expensive and time-consuming, unable to keep up with the
+continuously growing volume of data. In this paper, we propose annotating
+images with scattered points, followed by utilizing SAM (a Foundation model) to
+generate semantic segmentation labels for the images. Finally, by mapping the
+segmentation labels of the images to the LiDAR space using the intrinsic and
+extrinsic parameters of the camera and LiDAR, we obtain labels for point cloud
+semantic segmentation, and release Scatter-KITTI and Scatter-nuScenes, which
+are, to our knowledge, the first datasets to utilize SAM-based image
+segmentation for weakly supervised point cloud semantic segmentation.
+Furthermore, to mitigate the influence of erroneous pseudo labels obtained from
+sparse annotations on point cloud features, we propose a multi-modal weakly
+supervised network for LiDAR semantic segmentation, called MM-ScatterNet. This
+network combines features from both point cloud and image modalities, enhancing
+the representation learning of point clouds by introducing consistency
+constraints between multi-modal features and point cloud features. On the
+SemanticKITTI dataset, we achieve 66% of fully supervised performance using
+only 0.02% of annotated data, and on the NuScenes dataset, we achieve 95% of
+fully supervised performance using only 0.1% labeled points.
+
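+ The label transfer step described above is a standard pinhole projection: put
+LiDAR points into the camera frame with the extrinsics, project with the
+intrinsics, and read the SAM mask at the resulting pixel. A sketch under those
+assumptions (label 0 for points outside the image or behind the camera; all
+matrices and conventions below are illustrative):
+
+```python
+import numpy as np
+
+def transfer_image_labels_to_points(points, seg_labels, K, T_cam_from_lidar):
+    """Project LiDAR points into the image and read off segmentation labels.
+
+    points: (N, 3) in the LiDAR frame; seg_labels: (H, W) integer mask (e.g. from SAM);
+    K: (3, 3) intrinsics; T_cam_from_lidar: (4, 4) extrinsics.
+    """
+    H, W = seg_labels.shape
+    pts_h = np.hstack([points, np.ones((points.shape[0], 1))])      # (N, 4)
+    cam = (T_cam_from_lidar @ pts_h.T).T[:, :3]                     # camera-frame points
+    valid = cam[:, 2] > 0.1                                         # in front of camera
+    uv = (K @ cam.T).T
+    uv = uv[:, :2] / uv[:, 2:3]                                     # perspective divide
+    u, v = np.round(uv[:, 0]).astype(int), np.round(uv[:, 1]).astype(int)
+    valid &= (u >= 0) & (u < W) & (v >= 0) & (v < H)
+    out = np.zeros(points.shape[0], dtype=seg_labels.dtype)
+    out[valid] = seg_labels[v[valid], u[valid]]
+    return out
+
+# toy usage with an identity extrinsic matrix
+pts = np.random.randn(100, 3) + np.array([0.0, 0.0, 5.0])
+mask = np.random.randint(0, 4, size=(480, 640))
+K = np.array([[500.0, 0, 320], [0, 500.0, 240], [0, 0, 1]])
+print(transfer_image_labels_to_points(pts, mask, K, np.eye(4))[:10])
+```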
+
+
+
+
+ + ☆ Language-Driven Active Learning for Diverse Open-Set 3D Object Detection + + +
+ Object detection is crucial for ensuring safe autonomous driving. However, +data-driven approaches face challenges when encountering minority or novel +objects in the 3D driving scene. In this paper, we propose VisLED, a +language-driven active learning framework for diverse open-set 3D Object +Detection. Our method leverages active learning techniques to query diverse and +informative data samples from an unlabeled pool, enhancing the model's ability +to detect underrepresented or novel objects. Specifically, we introduce the +Vision-Language Embedding Diversity Querying (VisLED-Querying) algorithm, which +operates in both open-world exploring and closed-world mining settings. In +open-world exploring, VisLED-Querying selects data points most novel relative +to existing data, while in closed-world mining, it mines new instances of known +classes. We evaluate our approach on the nuScenes dataset and demonstrate its +effectiveness compared to random sampling and entropy-querying methods. Our +results show that VisLED-Querying consistently outperforms random sampling and +offers competitive performance compared to entropy-querying despite the +latter's model-optimality, highlighting the potential of VisLED for improving +object detection in autonomous driving scenarios. + +
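+ As a loose illustration of diversity querying (not the exact VisLED-Querying
+algorithm), the sketch below greedily selects the unlabeled samples whose
+embeddings are farthest from everything already selected or labeled; the
+embedding space and the greedy farthest-point rule are assumptions standing in
+for the open-world exploring setting.
+
+```python
+import numpy as np
+
+def diversity_query(pool_embeddings, labeled_embeddings, budget):
+    """Greedily pick the pool samples farthest from everything selected so far.
+
+    A simple novelty/diversity heuristic in an embedding space, illustrating the
+    spirit of embedding-diversity querying.
+    """
+    selected = []
+    reference = labeled_embeddings.copy()
+    for _ in range(budget):
+        # distance of each pool item to its nearest reference embedding
+        d = np.linalg.norm(pool_embeddings[:, None, :] - reference[None, :, :], axis=-1)
+        nearest = d.min(axis=1)
+        nearest[selected] = -np.inf          # don't pick the same item twice
+        idx = int(nearest.argmax())
+        selected.append(idx)
+        reference = np.vstack([reference, pool_embeddings[idx:idx + 1]])
+    return selected
+
+rng = np.random.default_rng(0)
+pool = rng.normal(size=(500, 64))
+labeled = rng.normal(size=(50, 64))
+print(diversity_query(pool, labeled, budget=5))
+```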
+
+
+
+
+ + ☆ LSP Framework: A Compensatory Model for Defeating Trigger Reverse + Engineering via Label Smoothing Poisoning + + +
+ Deep neural networks are vulnerable to backdoor attacks. Among the existing +backdoor defense methods, trigger reverse engineering based approaches, which +reconstruct the backdoor triggers via optimizations, are the most versatile and +effective ones compared to other types of methods. In this paper, we summarize +and construct a generic paradigm for the typical trigger reverse engineering +process. Based on this paradigm, we propose a new perspective to defeat trigger +reverse engineering by manipulating the classification confidence of backdoor +samples. To determine the specific modifications of classification confidence, +we propose a compensatory model to compute the lower bound of the modification. +With proper modifications, the backdoor attack can easily bypass the trigger +reverse engineering based methods. To achieve this objective, we propose a +Label Smoothing Poisoning (LSP) framework, which leverages label smoothing to +specifically manipulate the classification confidences of backdoor samples. +Extensive experiments demonstrate that the proposed work can defeat the +state-of-the-art trigger reverse engineering based methods, and possess good +compatibility with a variety of existing backdoor attacks. + +
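+ The core mechanism, as described, is label smoothing applied selectively to
+backdoor samples so their classification confidence stays below what trigger
+reverse engineering expects. The sketch below shows that selective smoothing
+in PyTorch; the compensatory-model bound for choosing the smoothing strength
+is not reproduced, and the epsilon value is purely illustrative.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def lsp_style_loss(logits, labels, is_backdoor, epsilon=0.3):
+    """Cross-entropy with label smoothing applied only to poisoned samples.
+
+    Clean samples keep one-hot targets; backdoor samples get 1 - eps on the
+    true class and eps/(C-1) spread over the other classes.
+    """
+    num_classes = logits.size(1)
+    eps = is_backdoor.float() * epsilon                      # (B,), 0 for clean samples
+    off = (eps / (num_classes - 1)).unsqueeze(1)             # mass on wrong classes
+    targets = off.expand(-1, num_classes).clone()
+    targets.scatter_(1, labels.unsqueeze(1), (1.0 - eps).unsqueeze(1))
+    return -(targets * F.log_softmax(logits, dim=1)).sum(dim=1).mean()
+
+logits = torch.randn(8, 10)
+labels = torch.randint(0, 10, (8,))
+is_backdoor = torch.tensor([1, 0, 0, 1, 0, 0, 0, 1], dtype=torch.bool)
+print(lsp_style_loss(logits, labels, is_backdoor).item())
+```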
+
+
+
+
+ + ☆ Explainable Deepfake Video Detection using Convolutional Neural Network + and CapsuleNet + + +
+ Deepfake technology, derived from deep learning, seamlessly inserts
+individuals into digital media, irrespective of their actual participation. Its
+foundation lies in machine learning and Artificial Intelligence (AI).
+Initially, deepfakes served research, industry, and entertainment. While the
+concept has existed for decades, recent advancements render deepfakes nearly
+indistinguishable from reality. Accessibility has soared, empowering even
+novices to create convincing deepfakes. However, this accessibility raises
+security concerns. The primary deepfake creation algorithm, GAN (Generative
+Adversarial Network), employs machine learning to craft realistic images or
+videos. Our objective is to utilize CNN (Convolutional Neural Network) and
+CapsuleNet with LSTM to differentiate between deepfake-generated frames and
+originals. Furthermore, we aim to elucidate our model's decision-making process
+through Explainable AI, fostering transparent human-AI relationships and
+offering practical examples for real-life scenarios.
+
+
+
+
+
+ + ☆ ECOR: Explainable CLIP for Object Recognition + + +
+ Large Vision Language Models (VLMs), such as CLIP, have significantly +contributed to various computer vision tasks, including object recognition and +object detection. Their open vocabulary feature enhances their value. However, +their black-box nature and lack of explainability in predictions make them less +trustworthy in critical domains. Recently, some work has been done to force +VLMs to provide reasonable rationales for object recognition, but this often +comes at the expense of classification accuracy. In this paper, we first +propose a mathematical definition of explainability in the object recognition +task based on the joint probability distribution of categories and rationales, +then leverage this definition to fine-tune CLIP in an explainable manner. +Through evaluations of different datasets, our method demonstrates +state-of-the-art performance in explainable classification. Notably, it excels +in zero-shot settings, showcasing its adaptability. This advancement improves +explainable object recognition, enhancing trust across diverse applications. +The code will be made available online upon publication. + +
+
+
+
+
+ + ☆ COIN: Counterfactual inpainting for weakly supervised semantic + segmentation for medical images + + +
+ Deep learning is dramatically transforming the field of medical imaging and
+radiology, enabling the identification of pathologies in medical images,
+including computed tomography (CT) and X-ray scans. However, the performance of
+deep learning models, particularly in segmentation tasks, is often limited by
+the need for extensive annotated datasets. To address this challenge, the
+capabilities of weakly supervised semantic segmentation are explored through
+the lens of Explainable AI and the generation of counterfactual explanations.
+The scope of this research is the development of a novel counterfactual
+inpainting approach (COIN) that flips the predicted classification label from
+abnormal to normal by using a generative model. For instance, if the classifier
+deems an input medical image X as abnormal, indicating the presence of a
+pathology, the generative model aims to inpaint the abnormal region, thus
+reversing the classifier's original prediction label. The approach enables us
+to produce precise segmentations for pathologies without depending on
+pre-existing segmentation masks. Crucially, image-level labels are utilized,
+which are substantially easier to acquire than creating detailed segmentation
+masks. The effectiveness of the method is demonstrated by segmenting synthetic
+targets and actual kidney tumors from CT images acquired from Tartu University
+Hospital in Estonia. The findings indicate that COIN greatly surpasses
+established attribution methods, such as RISE, ScoreCAM, and LayerCAM, as well
+as an alternative counterfactual explanation method introduced by Singla et al.
+This evidence suggests that COIN is a promising approach for semantic
+segmentation of tumors in CT images, and presents a step forward in making deep
+learning applications more accessible and effective in healthcare, where
+annotated data is scarce.
+
+
+ comment: This work has been accepted to be presented to The 2nd World + Conference on eXplainable Artificial Intelligence (xAI 2024), July 17-19, + 2024 - Valletta, Malta +
+
+
+
+
+ + ☆ Unveiling the Ambiguity in Neural Inverse Rendering: A Parameter + Compensation Analysis + + +
+ Inverse rendering aims to reconstruct the scene properties of objects solely +from multiview images. However, it is an ill-posed problem prone to producing +ambiguous estimations deviating from physically accurate representations. In +this paper, we utilize Neural Microfacet Fields (NMF), a state-of-the-art +neural inverse rendering method to illustrate the inherent ambiguity. We +propose an evaluation framework to assess the degree of compensation or +interaction between the estimated scene properties, aiming to explore the +mechanisms behind this ill-posed problem and potential mitigation strategies. +Specifically, we introduce artificial perturbations to one scene property and +examine how adjusting another property can compensate for these perturbations. +To facilitate such experiments, we introduce a disentangled NMF where material +properties are independent. The experimental findings underscore the intrinsic +ambiguity present in neural inverse rendering and highlight the importance of +providing additional guidance through geometry, material, and illumination +priors. + +
+
+
+
+
+ + ☆ Generative Modelling with High-Order Langevin Dynamics + + +
+ Diffusion generative modelling (DGM) based on stochastic differential
+equations (SDEs) with score matching has achieved unprecedented results in data
+generation. In this paper, we propose a novel fast, high-quality generative
+modelling method based on high-order Langevin dynamics (HOLD) with score
+matching. This idea is realized with third-order Langevin dynamics. By
+augmenting the previous SDEs, e.g. variance exploding or variance preserving
+SDEs for single-data variable processes, HOLD can simultaneously model
+position, velocity, and acceleration, thereby improving the quality and speed
+of data generation at the same time. HOLD is composed of one Ornstein-Uhlenbeck
+process and two Hamiltonians, which reduce the mixing time by two orders of
+magnitude. Empirical experiments for unconditional image generation on the
+public datasets CIFAR-10 and CelebA-HQ show significant gains in both Frechet
+inception distance (FID) and negative log-likelihood, achieving a
+state-of-the-art FID of 1.85 on CIFAR-10.
+
+
+ comment: Some of the results in this paper have been published or accepted at + conferences such as wacv2024, icassp2024, and icme2024 +
+
+
+
+
+ + ☆ Linearly-evolved Transformer for Pan-sharpening + + +
+ The vision transformer family has dominated the satellite pan-sharpening field,
+driven by the global-wise spatial information modeling mechanism from the core
+self-attention ingredient. The standard modeling rules within these promising
+pan-sharpening methods are to roughly stack the transformer variants in a
+cascaded manner. Despite the remarkable advancement, their success may be at
+the huge cost of model parameters and FLOPs, thus preventing their application
+on low-resource satellites. To address this challenge between favorable
+performance and expensive computation, we tailor an efficient linearly-evolved
+transformer variant and employ it to construct a lightweight pan-sharpening
+framework. In detail, we delve into the popular cascaded transformer modeling
+used by cutting-edge methods and develop the alternative 1-order
+linearly-evolved transformer variant with a 1-dimensional linear convolution
+chain to achieve the same function. In this way, our proposed method is capable
+of benefiting from the cascaded modeling rule while achieving favorable
+performance in an efficient manner. Extensive experiments over multiple
+satellite datasets suggest that our proposed method achieves competitive
+performance against other state-of-the-art methods with fewer computational
+resources. Further, the consistently favorable performance has been verified
+over the hyper-spectral image fusion task. Our main focus is to provide an
+alternative global modeling framework with an efficient structure. The code
+will be publicly available.
+
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ TextSquare: Scaling up Text-Centric Visual Instruction Tuning + + +
+ Text-centric visual question answering (VQA) has made great strides with the +development of Multimodal Large Language Models (MLLMs), yet open-source models +still fall short of leading models like GPT4V and Gemini, partly due to a lack +of extensive, high-quality instruction tuning data. To this end, we introduce a +new approach for creating a massive, high-quality instruction-tuning dataset, +Square-10M, which is generated using closed-source MLLMs. The data construction +process, termed Square, consists of four steps: Self-Questioning, Answering, +Reasoning, and Evaluation. Our experiments with Square-10M led to three key +findings: 1) Our model, TextSquare, considerably surpasses open-source previous +state-of-the-art Text-centric MLLMs and sets a new standard on OCRBench(62.2%). +It even outperforms top-tier models like GPT4V and Gemini in 6 of 10 +text-centric benchmarks. 2) Additionally, we demonstrate the critical role of +VQA reasoning data in offering comprehensive contextual insights for specific +questions. This not only improves accuracy but also significantly mitigates +hallucinations. Specifically, TextSquare scores an average of 75.1% across four +general VQA and hallucination evaluation datasets, outperforming previous +state-of-the-art models. 3) Notably, the phenomenon observed in scaling +text-centric VQA datasets reveals a vivid pattern: the exponential increase of +instruction tuning data volume is directly proportional to the improvement in +model performance, thereby validating the necessity of the dataset scale and +the high quality of Square-10M. + +
+
+
+
+
+ + ☆ A Point-Based Approach to Efficient LiDAR Multi-Task Perception + + +
+ Multi-task networks can potentially improve performance and computational
+efficiency compared to single-task networks, facilitating online deployment.
+However, current multi-task architectures in point cloud perception combine
+multiple task-specific point cloud representations, each requiring a separate
+feature encoder and making the network structures bulky and slow. We propose
+PAttFormer, an efficient multi-task architecture for joint semantic
+segmentation and object detection in point clouds that only relies on a
+point-based representation. The network builds on transformer-based feature
+encoders using neighborhood attention and grid-pooling and a query-based
+detection decoder using a novel 3D deformable-attention detection head design.
+Unlike other LiDAR-based multi-task architectures, our proposed PAttFormer does
+not require separate feature encoders for multiple task-specific point cloud
+representations, resulting in a network that is 3x smaller and 1.4x faster
+while achieving competitive performance on the nuScenes and KITTI benchmarks
+for autonomous driving perception. Our extensive evaluations show substantial
+gains from multi-task learning, improving LiDAR semantic segmentation by +1.7%
+in mIoU and 3D object detection by +1.7% in mAP on the nuScenes benchmark
+compared to the single-task models.
+
+
+ comment: 8 pages, 3 figures, 8 tables +
+
+
+
+
+ + ☆ MambaMOS: LiDAR-based 3D Moving Object Segmentation with Motion-aware + State Space Model + + +
+ LiDAR-based Moving Object Segmentation (MOS) aims to locate and segment +moving objects in point clouds of the current scan using motion information +from previous scans. Despite the promising results achieved by previous MOS +methods, several key issues, such as the weak coupling of temporal and spatial +information, still need further study. In this paper, we propose a novel +LiDAR-based 3D Moving Object Segmentation with Motion-aware State Space Model, +termed MambaMOS. Firstly, we develop a novel embedding module, the Time Clue +Bootstrapping Embedding (TCBE), to enhance the coupling of temporal and spatial +information in point clouds and alleviate the issue of overlooked temporal +clues. Secondly, we introduce the Motion-aware State Space Model (MSSM) to +endow the model with the capacity to understand the temporal correlations of +the same object across different time steps. Specifically, MSSM emphasizes the +motion states of the same object at different time steps through two distinct +temporal modeling and correlation steps. We utilize an improved state space +model to represent these motion differences, significantly modeling the motion +states. Finally, extensive experiments on the SemanticKITTI-MOS and KITTI-Road +benchmarks demonstrate that the proposed MambaMOS achieves state-of-the-art +performance. The source code of this work will be made publicly available at +https://github.com/Terminal-K/MambaMOS. + +
+
+ comment: The source code will be made publicly available at + https://github.com/Terminal-K/MambaMOS +
+
+
+
+
+ + ☆ Contrastive Gaussian Clustering: Weakly Supervised 3D Scene Segmentation + + +
+ We introduce Contrastive Gaussian Clustering, a novel approach capable of
+providing segmentation masks from any viewpoint and of enabling 3D segmentation
+of the scene. Recent works in novel-view synthesis have shown how to model the
+appearance of a scene via a cloud of 3D Gaussians, and how to generate accurate
+images from a given viewpoint by projecting on it the Gaussians before $\alpha$
+blending their color. Following this example, we train a model to also include
+a segmentation feature vector for each Gaussian. These can then be used for 3D
+scene segmentation, by clustering Gaussians according to their feature vectors;
+and to generate 2D segmentation masks, by projecting the Gaussians on a plane
+and $\alpha$ blending over their segmentation features. Using a combination of
+contrastive learning and spatial regularization, our method can be trained on
+inconsistent 2D segmentation masks, and still learn to generate segmentation
+masks consistent across all views. Moreover, the resulting model is extremely
+accurate, improving the IoU accuracy of the predicted masks by $+8\%$ over the
+state of the art. Code and trained models will be released soon.
+
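+ The 2D mask generation described above mirrors how 3DGS composites colors:
+per-Gaussian segmentation features are alpha-blended front to back along each
+pixel ray. A sketch of that compositing step, assuming the rasterizer already
+provides the per-pixel alphas sorted front to back (everything below is an
+illustrative stand-in, not the released code):
+
+```python
+import torch
+
+def alpha_composite_features(features, alphas):
+    """Front-to-back alpha compositing of per-Gaussian feature vectors.
+
+    features: (K, D) segmentation features of the Gaussians hit by one pixel ray,
+    sorted front to back; alphas: (K,) their per-pixel opacities. Returns the
+    composited (D,) feature, mirroring how colors are alpha-blended in 3DGS.
+    """
+    transmittance = torch.cumprod(
+        torch.cat([torch.ones(1), 1.0 - alphas[:-1]]), dim=0)   # T_k = prod_{j<k}(1 - a_j)
+    weights = alphas * transmittance                            # (K,)
+    return (weights.unsqueeze(1) * features).sum(dim=0)
+
+feats = torch.randn(5, 16)          # 5 Gaussians along the ray, 16-dim features
+alphas = torch.tensor([0.3, 0.5, 0.2, 0.7, 0.1])
+pixel_feature = alpha_composite_features(feats, alphas)
+print(pixel_feature.shape)          # torch.Size([16])
+```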
+
+
+
+
+ + ☆ Sentiment-oriented Transformer-based Variational Autoencoder Network for + Live Video Commenting + + +
+ Automatic live video commenting has attracted increasing attention due to its
+significance in narration generation, topic explanation, etc. However,
+consideration of the diverse sentiments of the generated comments is missing
+from current methods. Sentimental factors are critical in interactive
+commenting, but have received little research attention so far. Thus, in this
+paper, we propose a Sentiment-oriented Transformer-based Variational
+Autoencoder (So-TVAE) network which consists of a sentiment-oriented diversity
+encoder module and a batch attention module, to achieve diverse video
+commenting with multiple sentiments and multiple semantics. Specifically, our
+sentiment-oriented diversity encoder elegantly combines a VAE and a random mask
+mechanism to achieve semantic diversity under sentiment guidance, which is then
+fused with cross-modal features to generate live video comments. Furthermore, a
+batch attention module is also proposed in this paper to alleviate the problem
+of missing sentimental samples, caused by the data imbalance, which is common
+in live videos as the popularity of videos varies. Extensive experiments on
+Livebot and VideoIC datasets demonstrate that the proposed So-TVAE outperforms
+the state-of-the-art methods in terms of the quality and diversity of generated
+comments. Related code is available at https://github.com/fufy1024/So-TVAE.
+
+
+ comment: 27 pages, 10 figures, ACM Transactions on Multimedia Computing, + Communications and Applications, 2024 +
+
+
+
+
+ + ☆ EfficientGS: Streamlining Gaussian Splatting for Large-Scale + High-Resolution Scene Representation + + +
+ In the domain of 3D scene representation, 3D Gaussian Splatting (3DGS) has +emerged as a pivotal technology. However, its application to large-scale, +high-resolution scenes (exceeding 4k$\times$4k pixels) is hindered by the +excessive computational requirements for managing a large number of Gaussians. +Addressing this, we introduce 'EfficientGS', an advanced approach that +optimizes 3DGS for high-resolution, large-scale scenes. We analyze the +densification process in 3DGS and identify areas of Gaussian +over-proliferation. We propose a selective strategy, limiting Gaussian increase +to key primitives, thereby enhancing the representational efficiency. +Additionally, we develop a pruning mechanism to remove redundant Gaussians, +those that are merely auxiliary to adjacent ones. For further enhancement, we +integrate a sparse order increment for Spherical Harmonics (SH), designed to +alleviate storage constraints and reduce training overhead. Our empirical +evaluations, conducted on a range of datasets including extensive 4K+ aerial +images, demonstrate that 'EfficientGS' not only expedites training and +rendering times but also achieves this with a model size approximately tenfold +smaller than conventional 3DGS while maintaining high rendering fidelity. + +
+
+
+
+
+ + ☆ Camera Agnostic Two-Head Network for Ego-Lane Inference + + +
+ Vision-based ego-lane inference using High-Definition (HD) maps is essential
+in autonomous driving and advanced driver assistance systems. The traditional
+approach necessitates well-calibrated cameras, which restricts variation in
+camera configuration, as the algorithm relies on intrinsic and extrinsic
+calibration. In this paper, we propose a learning-based ego-lane inference
+method that directly estimates the ego-lane index from a single image. To
+enhance robust performance, our model incorporates a two-head structure that
+infers the ego-lane from two perspectives simultaneously. Furthermore, we
+utilize an attention mechanism guided by vanishing point-and-line to adapt to
+changes in viewpoint without requiring accurate calibration. The high
+adaptability of our model was validated in diverse environments, devices, and
+camera mounting points and orientations.
+
+
+
+
+
+ + ☆ MixLight: Borrowing the Best of both Spherical Harmonics and Gaussian + Models + + +
+ Accurately estimating scene lighting is critical for applications such as +mixed reality. Existing works estimate illumination by generating illumination +maps or regressing illumination parameters. However, the method of generating +illumination maps has poor generalization performance and parametric models +such as Spherical Harmonic (SH) and Spherical Gaussian (SG) fall short in +capturing high-frequency or low-frequency components. This paper presents +MixLight, a joint model that utilizes the complementary characteristics of SH +and SG to achieve a more complete illumination representation, which uses SH +and SG to capture low-frequency ambient and high-frequency light sources +respectively. In addition, a special spherical light source sparsemax +(SLSparsemax) module that refers to the position and brightness relationship +between spherical light sources is designed to improve their sparsity, which is +significant but omitted by prior works. Extensive experiments demonstrate that +MixLight surpasses state-of-the-art (SOTA) methods on multiple metrics. In +addition, experiments on Web Dataset also show that MixLight as a parametric +method has better generalization performance than non-parametric methods. + +
+
+
+
+
+ + ☆ Continual Learning on a Diet: Learning from Sparsely Labeled Streams + Under Constrained Computation + + +
+ We propose and study a realistic Continual Learning (CL) setting where +learning algorithms are granted a restricted computational budget per time step +while training. We apply this setting to large-scale semi-supervised Continual +Learning scenarios with sparse label rates. Previous proficient CL methods +perform very poorly in this challenging setting. Overfitting to the sparse +labeled data and insufficient computational budget are the two main culprits +for such a poor performance. Our new setting encourages learning methods to +effectively and efficiently utilize the unlabeled data during training. To that +end, we propose a simple but highly effective baseline, DietCL, which utilizes +both unlabeled and labeled data jointly. DietCL meticulously allocates +computational budget for both types of data. We validate our baseline, at +scale, on several datasets, e.g., CLOC, ImageNet10K, and CGLM, under constraint +budget setups. DietCL outperforms, by a large margin, all existing supervised +CL algorithms as well as more recent continual semi-supervised methods. Our +extensive analysis and ablations demonstrate that DietCL is stable under a full +spectrum of label sparsity, computational budget, and various other ablations. + +
+
+
+
+
+ + ☆ The Solution for the CVPR2024 NICE Image Captioning Challenge + + +
+ This report introduces a solution to Topic 1 (Zero-shot Image Captioning) of
+the 2024 NICE challenge: New frontiers for zero-shot Image Captioning
+Evaluation. In contrast to the NICE 2023 datasets, this challenge involves new
+annotations by humans with significant differences in caption style and
+content. Therefore, we enhance image captions effectively through retrieval
+augmentation and caption grading methods. At the data level, we utilize
+high-quality captions generated by image caption models as training data to
+address the gap in text styles. At the model level, we employ OFA (a
+large-scale visual-language pre-training model based on handcrafted templates)
+to perform the image captioning task. Subsequently, we propose a caption-level
+strategy for the high-quality caption data generated by the image caption
+models and integrate it with the retrieval augmentation strategy into the
+template to compel the model to generate higher quality, more matching, and
+semantically enriched captions based on the retrieval augmentation prompts. Our
+approach ranks first on the leaderboard, achieving a CIDEr score of 234.11 and
+ranking first in all other metrics.
+
+
+
+
+
+ + ☆ DLoRA-TrOCR: Mixed Text Mode Optical Character Recognition Based On + Transformer + + +
+ With the continuous development of OCR technology and the expansion of +application fields, text recognition in complex scenes has become a key +challenge. Factors such as multiple fonts, mixed scenes and complex layouts +seriously affect the recognition accuracy of traditional OCR models. Although +OCR models based on deep learning have performed well in specific fields or +similar data sets in recent years, the generalization ability and robustness of +the model are still a big challenge when facing complex environments with +multiple scenes. Furthermore, training an OCR model from scratch or fine-tuning +all parameters is very demanding on computing resources and inference time, +which limits the flexibility of its application. This study focuses on a +fundamental aspect of mixed text recognition in response to the challenges +mentioned above, which involves effectively fine-tuning the pre-trained basic +OCR model to demonstrate exceptional performance across various downstream +tasks. To this end, we propose a parameter-efficient hybrid text recognition +method based on pre-trained OCR Transformer, namely DLoRA-TrOCR. This method +embeds DoRA into the image encoder and LoRA into the internal structure of the +text decoder, enabling efficient parameter fine-tuning for downstream tasks. +Experimental results show that compared to similar parameter adjustment +methods, our model DLoRA-TrOCR has the smallest number of parameters and +performs better. It can achieve state-of-the-art performance on complex scene +data sets involving simultaneous recognition of mixed handwritten, printed and +street view texts. + +
+
+
+
+
+ + ☆ PATE-TripleGAN: Privacy-Preserving Image Synthesis with Gaussian + Differential Privacy + + +
+ Conditional Generative Adversarial Networks (CGANs) exhibit significant +potential in supervised learning model training by virtue of their ability to +generate realistic labeled images. However, numerous studies have indicated the +privacy leakage risk in CGANs models. The solution DPCGAN, incorporating the +differential privacy framework, faces challenges such as heavy reliance on +labeled data for model training and potential disruptions to original gradient +information due to excessive gradient clipping, making it difficult to ensure +model accuracy. To address these challenges, we present a privacy-preserving +training framework called PATE-TripleGAN. This framework incorporates a +classifier to pre-classify unlabeled data, establishing a three-party min-max +game to reduce dependence on labeled data. Furthermore, we present a hybrid +gradient desensitization algorithm based on the Private Aggregation of Teacher +Ensembles (PATE) framework and Differential Private Stochastic Gradient Descent +(DPSGD) method. This algorithm allows the model to retain gradient information +more effectively while ensuring privacy protection, thereby enhancing the +model's utility. Privacy analysis and extensive experiments affirm that the +PATE-TripleGAN model can generate a higher quality labeled image dataset while +ensuring the privacy of the training data. + +
+
+
+
+
+ + ☆ Separate in the Speech Chain: Cross-Modal Conditional Audio-Visual + Target Speech Extraction IJCAI 2024 + + +
+ The integration of visual cues has revitalized the performance of the target +speech extraction task, elevating it to the forefront of the field. +Nevertheless, this multi-modal learning paradigm often encounters the challenge +of modality imbalance. In audio-visual target speech extraction tasks, the +audio modality tends to dominate, potentially overshadowing the importance of +visual guidance. To tackle this issue, we propose AVSepChain, drawing +inspiration from the speech chain concept. Our approach partitions the +audio-visual target speech extraction task into two stages: speech perception +and speech production. In the speech perception stage, audio serves as the +dominant modality, while visual information acts as the conditional modality. +Conversely, in the speech production stage, the roles are reversed. This +transformation of modality status aims to alleviate the problem of modality +imbalance. Additionally, we introduce a contrastive semantic matching loss to +ensure that the semantic information conveyed by the generated speech aligns +with the semantic information conveyed by lip movements during the speech +production stage. Through extensive experiments conducted on multiple benchmark +datasets for audio-visual target speech extraction, we showcase the superior +performance achieved by our proposed method. + +
+
+ comment: Accepted by IJCAI 2024 +
+
+
+
+
+ + ☆ Generalized Few-Shot Meets Remote Sensing: Discovering Novel Classes in + Land Cover Mapping via Hybrid Semantic Segmentation Framework CVPR 2024 + + +
+ Land-cover mapping is one of the vital applications in Earth observation,
+aiming at classifying the land-cover type of each pixel in remote-sensing
+images. As natural and human activities change the landscape, the land-cover
+map needs to be rapidly updated. However, discovering newly appeared land-cover
+types in existing classification systems is still a non-trivial task hindered
+by various scales of complex land objects and insufficient labeled data over a
+wide-span geographic area. In this paper, we propose a generalized few-shot
+segmentation-based framework, named SegLand, to update novel classes in
+high-resolution land-cover mapping. Specifically, the proposed framework is
+designed in three parts: (a) Data pre-processing: the base training set and the
+few-shot support sets of novel classes are analyzed and augmented; (b) Hybrid
+segmentation structure: multiple base learners and a modified Projection onto
+Orthogonal Prototypes (POP) network are combined to enhance base-class
+recognition and to discover novel classes from insufficient labeled data; (c)
+Ultimate fusion: the semantic segmentation results of the base learners and POP
+network are reasonably fused. The proposed framework won first place on the
+leaderboard of the OpenEarthMap Land Cover Mapping Few-Shot Challenge.
+Experiments demonstrate the superiority of the framework for automatically
+updating novel land-cover classes with limited labeled data.
+
+
+ comment: 11 pages, 11 figures, accepted by CVPR 2024 L3D-IVU Workshop +
+
+
+
+
+ + ☆ PDF-MVQA: A Dataset for Multimodal Information Retrieval in PDF-based + Visual Question Answering IJCAI 2024 + + +
+ Document Question Answering (QA) presents a challenge in understanding +visually-rich documents (VRD), particularly those dominated by lengthy textual +content like research journal articles. Existing studies primarily focus on +real-world documents with sparse text, while challenges persist in +comprehending the hierarchical semantic relations among multiple pages to +locate multimodal components. To address this gap, we propose PDF-MVQA, which +is tailored for research journal articles, encompassing multiple pages and +multimodal information retrieval. Unlike traditional machine reading +comprehension (MRC) tasks, our approach aims to retrieve entire paragraphs +containing answers or visually rich document entities like tables and figures. +Our contributions include the introduction of a comprehensive PDF Document VQA +dataset, allowing the examination of semantically hierarchical layout +structures in text-dominant documents. We also present new VRD-QA frameworks +designed to grasp textual contents and relations among document layouts +simultaneously, extending page-level understanding to the entire multi-page +document. Through this work, we aim to enhance the capabilities of existing +vision-and-language models in handling challenges posed by text-dominant +documents in VRD-QA. + +
+
+ comment: Accepted by IJCAI 2024 +
+
+
+
+
+ + ☆ Improving Prediction Accuracy of Semantic Segmentation Methods Using + Convolutional Autoencoder Based Pre-processing Layers + + +
+ In this paper, we propose a method to improve the prediction accuracy of
+semantic segmentation methods as follows: (1) construct a neural network that
+has pre-processing layers based on a convolutional autoencoder ahead of a
+semantic segmentation network, and (2) train the entire network initialized
+with the weights of the pre-trained autoencoder. We applied this method to the
+fully convolutional network (FCN) and experimentally compared its prediction
+accuracy on the Cityscapes dataset. The mean IoU of the proposed target model
+with He normal initialization is 18.7% higher than that of FCN with He normal
+initialization. In addition, the mean IoUs of the modified variants of the
+target model are significantly higher than that of FCN with He normal
+initialization. The accuracy and loss curves during training show that these
+gains result from an improvement in generalization ability. All of these
+results provide strong evidence that the proposed method is significantly
+effective in improving the prediction accuracy of FCN. The proposed method has
+the following features: it is comparatively simple, whereas its effect on
+improving the generalization ability and prediction accuracy of FCN is
+significant; the increase in the number of parameters it introduces is very
+small, although the increase in computation time is substantial. In principle,
+the proposed method can be applied to other semantic segmentation methods. At
+present, there is no effective way to improve the prediction accuracy of
+existing semantic segmentation methods, and to our knowledge no method the same
+as or similar to ours has been published or used in practice. We therefore
+believe that our method is useful in practice and worthy of being widely known
+and used.
+
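+ A minimal sketch of the described setup, assuming a PyTorch workflow: a small
+convolutional autoencoder is pre-trained on images alone, then prepended to a
+torchvision FCN and the whole network is fine-tuned on segmentation labels. The
+autoencoder architecture and hyperparameters below are illustrative, not the
+authors' exact configuration.
+
+ import torch
+ import torch.nn as nn
+ from torchvision.models.segmentation import fcn_resnet50
+
+ class ConvAutoencoder(nn.Module):
+     """Pre-processing layers: pre-trained as an autoencoder, then prepended to the FCN."""
+     def __init__(self, channels=3):
+         super().__init__()
+         self.encoder = nn.Sequential(
+             nn.Conv2d(channels, 32, 3, padding=1), nn.ReLU(),
+             nn.Conv2d(32, 32, 3, padding=1), nn.ReLU(),
+         )
+         self.decoder = nn.Sequential(
+             nn.Conv2d(32, 32, 3, padding=1), nn.ReLU(),
+             nn.Conv2d(32, channels, 3, padding=1),
+         )
+
+     def forward(self, x):
+         return self.decoder(self.encoder(x))
+
+ class PreprocessedFCN(nn.Module):
+     """FCN with autoencoder-based pre-processing layers in front."""
+     def __init__(self, autoencoder, num_classes=19):
+         super().__init__()
+         self.pre = autoencoder                      # initialised with pre-trained autoencoder weights
+         self.fcn = fcn_resnet50(num_classes=num_classes)
+
+     def forward(self, x):
+         x = self.pre(x)                             # reconstruction-like pre-processing, same shape as input
+         return self.fcn(x)["out"]
+
+ # Step 1: pre-train the autoencoder on images only, e.g. with nn.MSELoss() between ae(img) and img.
+ ae = ConvAutoencoder()
+ # Step 2: build the combined network and fine-tune it end-to-end on segmentation labels.
+ model = PreprocessedFCN(ae, num_classes=19)
+ out = model(torch.randn(1, 3, 256, 512))            # logits of shape (1, 19, 256, 512)
+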
+
+ comment: 13 pages, 8 figures, 7 tables +
+
+
+
+
+ + ☆ uTRAND: Unsupervised Anomaly Detection in Traffic Trajectories + + +
+ Deep learning-based approaches have achieved significant improvements on
+public video anomaly datasets, but often do not perform well in real-world
+applications. This paper addresses two issues: the lack of labeled data and the
+difficulty of explaining the predictions of a neural network. To this end, we
+present a framework called uTRAND that shifts the problem of anomalous
+trajectory prediction from the pixel space to a semantic-topological domain.
+The framework detects and tracks all types of traffic agents in bird's-eye-view
+videos from traffic cameras mounted at an intersection. By conceptualizing the
+intersection as a patch-based graph, we show that the framework learns and
+models the normal behaviour of traffic agents without costly manual labeling.
+Furthermore, uTRAND makes it possible to formulate simple rules that classify
+anomalous trajectories in a way suited to human interpretation. We show that
+uTRAND outperforms other state-of-the-art approaches on a dataset of anomalous
+trajectories collected in a real-world setting, while producing explainable
+detection results.
+
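+ A toy sketch of the patch-graph idea: trajectories are assumed to already be
+(x, y) point sequences in the bird's-eye-view plane, patch size, the support
+threshold and the synthetic data are all made up for illustration and are not
+uTRAND's actual parameters.
+
+ import numpy as np
+ from collections import Counter
+
+ PATCH = 32  # patch size in pixels of the bird's-eye-view frame (illustrative)
+
+ def to_patch_sequence(track):
+     """Map an (N, 2) pixel trajectory to a deduplicated sequence of patch-graph nodes."""
+     nodes = [(int(x) // PATCH, int(y) // PATCH) for x, y in track]
+     return [n for i, n in enumerate(nodes) if i == 0 or n != nodes[i - 1]]
+
+ def learn_normal_transitions(tracks):
+     """Count patch-to-patch transitions over unlabeled normal traffic."""
+     counts = Counter()
+     for t in tracks:
+         seq = to_patch_sequence(t)
+         counts.update(zip(seq[:-1], seq[1:]))
+     return counts
+
+ def is_anomalous(track, counts, min_support=5):
+     """Human-readable rule: flag a trajectory that uses any rarely seen transition."""
+     seq = to_patch_sequence(track)
+     rare = [(a, b) for a, b in zip(seq[:-1], seq[1:]) if counts[(a, b)] < min_support]
+     return len(rare) > 0, rare   # the rare edges explain *why* the track was flagged
+
+ # Toy usage: straight-line "normal" tracks and one diagonal off-road track.
+ rng = np.random.default_rng(0)
+ normal = [np.stack([np.linspace(0, 640, 50), 200 + rng.normal(0, 3, 50)], axis=1) for _ in range(100)]
+ counts = learn_normal_transitions(normal)
+ weird = np.stack([np.linspace(0, 640, 50), np.linspace(0, 480, 50)], axis=1)
+ print(is_anomalous(weird, counts))
+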
+
+
+
+
+ + ☆ Dynamic Temperature Knowledge Distillation + + +
+ Temperature plays a pivotal role in moderating label softness in the realm of
+knowledge distillation (KD). Traditional approaches often employ a static
+temperature throughout the KD process, which fails to address the nuanced
+complexities of samples with varying levels of difficulty and overlooks the
+distinct capabilities of different teacher-student pairings. This leads to a
+less-than-ideal transfer of knowledge. To improve the process of knowledge
+propagation, we propose Dynamic Temperature Knowledge Distillation (DTKD),
+which introduces dynamic, cooperative temperature control for both teacher
+and student models simultaneously within each training iteration. In
+particular, we propose "sharpness" as a metric to quantify the smoothness of a
+model's output distribution. By minimizing the sharpness difference between the
+teacher and the student, we can derive sample-specific temperatures for them
+respectively. Extensive experiments on CIFAR-100 and ImageNet-2012 demonstrate
+that DTKD performs comparably to leading KD techniques, with added robustness
+in target-class KD and non-target-class KD scenarios. The code is available at
+https://github.com/JinYu1998/DTKD.
+
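+ The abstract does not give the exact formulas, so the sketch below uses one
+plausible definition of "sharpness" (max logit minus mean logit) and splits a
+reference temperature between teacher and student in proportion to their
+sharpness; treat these specifics as assumptions rather than DTKD's actual
+derivation.
+
+ import torch
+ import torch.nn.functional as F
+
+ def sharpness(logits):
+     """One plausible per-sample sharpness proxy: gap between max and mean logit."""
+     return logits.max(dim=1).values - logits.mean(dim=1)
+
+ def dtkd_loss(student_logits, teacher_logits, base_T=4.0, eps=1e-6):
+     """Dynamic-temperature KD sketch: per-sample temperatures chosen so that the
+     scaled teacher and student distributions have comparable smoothness."""
+     with torch.no_grad():
+         s_sharp = sharpness(student_logits)
+         t_sharp = sharpness(teacher_logits)
+         total = s_sharp + t_sharp + eps
+         # Softer (higher T) for the sharper model, harder for the smoother one.
+         T_teacher = (2.0 * t_sharp / total) * base_T
+         T_student = (2.0 * s_sharp / total) * base_T
+     p_t = F.softmax(teacher_logits / T_teacher.unsqueeze(1), dim=1)
+     log_p_s = F.log_softmax(student_logits / T_student.unsqueeze(1), dim=1)
+     kld = F.kl_div(log_p_s, p_t, reduction="none").sum(dim=1)
+     return (kld * T_teacher * T_student).mean()     # temperature-scaled, as in standard KD
+
+ student_logits = torch.randn(8, 100)
+ teacher_logits = torch.randn(8, 100) * 3
+ print(dtkd_loss(student_logits, teacher_logits))
+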
+
+
+
+
+ + ☆ Modeling Multi-Granularity Context Information Flow for Pavement Crack + Detection + + +
+ Crack detection has become an indispensable, interesting yet challenging task
+in the computer vision community. In particular, pavement cracks have a highly
+complex spatial structure, a low-contrast background and weak spatial
+continuity, posing a significant challenge to effective crack detection
+methods. In this paper, we address these problems from a view that utilizes the
+context of cracks and propose an end-to-end deep learning method to model the
+context information flow. To precisely localize cracks in an image, it is
+critical to effectively extract and aggregate multi-granularity context,
+including the fine-grained local context around the cracks (at the spatial
+level) and the coarse-grained semantics (at the segment level). Concretely, in
+a Convolutional Neural Network (CNN), low-level features extracted by the
+shallow layers represent local information, while the deep layers extract
+semantic features. A second main insight of this work is that the semantic
+context should guide the extraction of local context features. Following these
+insights, we first apply dilated convolution as the backbone feature extractor
+to model local context, and then build a context guidance module that leverages
+semantic context to guide local feature extraction at multiple stages. To
+handle label alignment between stages, we apply a Multiple Instance Learning
+(MIL) strategy to align the high-level features to the low-level ones in the
+stage-wise context flow. In addition, to the best of our knowledge, we release
+the largest, most complex and most challenging Bitumen Pavement Crack (BPC)
+dataset among public crack datasets. The experimental results on three crack
+datasets demonstrate that the proposed method performs well and outperforms
+current state-of-the-art methods.
+
+
+
+
+
+ + ☆ ESC: Evolutionary Stitched Camera Calibration in the Wild CEC 2024 + + +
+ This work introduces a novel end-to-end approach for estimating extrinsic +parameters of cameras in multi-camera setups on real-life sports fields. We +identify the source of significant calibration errors in multi-camera +environments and address the limitations of existing calibration methods, +particularly the disparity between theoretical models and actual sports field +characteristics. We propose the Evolutionary Stitched Camera calibration (ESC) +algorithm to bridge this gap. It consists of image segmentation followed by +evolutionary optimization of a novel loss function, providing a unified and +accurate multi-camera calibration solution with high visual fidelity. The +outcome allows the creation of virtual stitched views from multiple video +sources, being as important for practical applications as numerical accuracy. +We demonstrate the superior performance of our approach compared to +state-of-the-art methods across diverse real-life football fields with varying +physical characteristics. + +
+
+ comment: Accepted for IEEE CEC 2024 +
+
+
+
+
+ + ☆ Improving Chinese Character Representation with Formation Tree + + +
+ Learning effective representations for Chinese characters presents unique
+challenges, primarily due to the vast number of characters and their continuous
+growth, which requires models to handle an expanding category space.
+Additionally, the inherent sparsity of character usage complicates the
+generalization of learned representations. Prior research has explored
+radical-based sequences to overcome these issues, achieving progress in
+recognizing unseen characters. However, these approaches fail to fully exploit
+the inherent tree structure of such sequences. To address these limitations and
+leverage established data properties, we propose Formation Tree-CLIP (FT-CLIP).
+This model utilizes formation trees to represent characters and incorporates a
+dedicated tree encoder, significantly improving performance in both seen and
+unseen character recognition tasks. We further introduce masking to both
+character images and tree nodes, enabling efficient and effective training.
+This approach accelerates training significantly (by a factor of 2 or more)
+while enhancing accuracy. Extensive experiments show that processing characters
+through formation trees aligns better with their inherent properties than
+direct sequential methods, significantly enhancing the generality and usability
+of the representations.
+
+
+
+
+
+ + ☆ VoxAtnNet: A 3D Point Clouds Convolutional Neural Network for + Generalizable Face Presentation Attack Detection + + +
+ Facial biometrics are an essential component of smartphones to ensure
+reliable and trustworthy authentication. However, face biometric systems are
+vulnerable to Presentation Attacks (PAs), and the availability of more
+sophisticated presentation attack instruments such as 3D silicone face masks
+will allow attackers to deceive face recognition systems easily. In this work,
+we propose a novel Presentation Attack Detection (PAD) algorithm based on 3D
+point clouds captured using the frontal camera of a smartphone to detect
+presentation attacks. The proposed PAD algorithm, VoxAtnNet, voxelizes the 3D
+point clouds to preserve their spatial structure. The voxelized 3D samples are
+then used to train a novel convolutional attention network to detect PAs on the
+smartphone. Extensive experiments were carried out on a newly constructed 3D
+face point cloud dataset comprising bona fide samples and two different 3D PAIs
+(3D silicone face masks and wrap photo masks), resulting in 3480 samples. The
+performance of the proposed method was compared with existing methods to
+benchmark detection performance using three different evaluation protocols. The
+experimental results demonstrate the improved performance of the proposed
+method in detecting both known and unknown face presentation attacks.
+
+
+ comment: Accepted in 2024 18th International Conference on Automatic Face and + Gesture Recognition (FG) +
+
+
+
+
+ + ☆ MLSD-GAN -- Generating Strong High Quality Face Morphing Attacks using + Latent Semantic Disentanglement + + +
+ Face-morphing attacks are a growing concern for biometric researchers, as
+they can be used to fool face recognition systems (FRS). These attacks can be
+generated at the image level (supervised) or representation level
+(unsupervised). Previous unsupervised morphing attacks have relied on
+generative adversarial networks (GANs). More recently, researchers have used
+linear interpolation of StyleGAN-encoded images to generate morphing attacks.
+In this paper, we propose a new method for generating high-quality morphing
+attacks using StyleGAN disentanglement. Our approach, called MLSD-GAN,
+spherically interpolates the disentangled latents to produce realistic and
+diverse morphing attacks. We evaluate the vulnerability of two
+deep-learning-based FRS techniques to MLSD-GAN attacks. The results show that
+MLSD-GAN poses a significant threat to FRS, as it can generate morphing attacks
+that are highly effective at fooling these systems.
+
+
+
+
+
+ + ☆ Exploring Interactive Semantic Alignment for Efficient HOI Detection + with Vision-language Model ICME2024 + + +
+ Human-Object Interaction (HOI) detection aims to localize human-object pairs
+and comprehend their interactions. Recently, two-stage transformer-based
+methods have demonstrated competitive performance. However, these methods
+frequently focus on object appearance features and ignore global contextual
+information. Besides, the vision-language model CLIP, which effectively aligns
+visual and text embeddings, has shown great potential in zero-shot HOI
+detection. Based on these observations, we introduce a novel HOI detector named
+ISA-HOI, which extensively leverages knowledge from CLIP, aligning interactive
+semantics between visual and textual features. We first extract the global
+context of the image and local features of objects to Improve the interaction
+Features in images (IF). On the other hand, we propose a Verb Semantic
+Improvement (VSI) module to enhance the textual features of verb labels via
+cross-modal fusion. Ultimately, our method achieves competitive results on the
+HICO-DET and V-COCO benchmarks with far fewer training epochs, and outperforms
+the state-of-the-art under zero-shot settings.
+
+
+ comment: Accepted by ICME2024 +
+
+
+
+
+ + ☆ Detecting Out-Of-Distribution Earth Observation Images with Diffusion + Models CVPR + + +
+ Earth Observation imagery can capture rare and unusual events, such as +disasters and major landscape changes, whose visual appearance contrasts with +the usual observations. Deep models trained on common remote sensing data will +output drastically different features for these out-of-distribution samples, +compared to those closer to their training dataset. Detecting them could +therefore help anticipate changes in the observations, either geographical or +environmental. In this work, we show that the reconstruction error of diffusion +models can effectively serve as unsupervised out-of-distribution detectors for +remote sensing images, using them as a plausibility score. Moreover, we +introduce ODEED, a novel reconstruction-based scorer using the probability-flow +ODE of diffusion models. We validate it experimentally on SpaceNet 8 with +various scenarios, such as classical OOD detection with geographical shift and +near-OOD setups: pre/post-flood and non-flooded/flooded image recognition. We +show that our ODEED scorer significantly outperforms other diffusion-based and +discriminative baselines on the more challenging near-OOD scenarios of flood +image detection, where OOD images are close to the distribution tail. We aim to +pave the way towards better use of generative models for anomaly detection in +remote sensing. + +
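+ A simplified illustration of reconstruction-based scoring with a diffusion
+model; this is the generic recipe the abstract contrasts with, not the paper's
+probability-flow-ODE scorer (ODEED). The add_noise/denoise callables are
+stand-ins for a trained DDPM's forward and reverse processes.
+
+ import torch
+
+ @torch.no_grad()
+ def reconstruction_ood_score(x, add_noise, denoise, t=250):
+     """Diffuse the image to step t, denoise it back, and use the per-image
+     reconstruction error as an implausibility (OOD) score."""
+     x_t = add_noise(x, t)          # forward diffusion to step t
+     x_rec = denoise(x_t, t)        # reverse process back to a clean estimate
+     return ((x - x_rec) ** 2).flatten(1).mean(dim=1)
+
+ # Smoke test with stand-in callables (a real setup would wrap a trained diffusion model).
+ x = torch.rand(4, 3, 64, 64)
+ noisy = lambda img, t: img + 0.1 * torch.randn_like(img)
+ ident = lambda img, t: img.clamp(0, 1)
+ print(reconstruction_ood_score(x, noisy, ident))
+ # Images scoring above a threshold chosen on in-distribution validation data are flagged as OOD.
+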
+
+ comment: EARTHVISION 2024 IEEE/CVF CVPR Workshop. Large Scale Computer Vision + for Remote Sensing Imagery, Jun 2024, Seattle, United States +
+
+
+
+
+ + ☆ Pre-trained Vision-Language Models Learn Discoverable Visual Concepts + + +
+ Do vision-language models (VLMs) pre-trained to caption an image of a +"durian" learn visual concepts such as "brown" (color) and "spiky" (texture) at +the same time? We aim to answer this question as visual concepts learned "for +free" would enable wide applications such as neuro-symbolic reasoning or +human-interpretable object classification. We assume that the visual concepts, +if captured by pre-trained VLMs, can be extracted by their vision-language +interface with text-based concept prompts. We observe that recent works +prompting VLMs with concepts often differ in their strategies to define and +evaluate the visual concepts, leading to conflicting conclusions. We propose a +new concept definition strategy based on two observations: First, certain +concept prompts include shortcuts that recognize correct concepts for wrong +reasons; Second, multimodal information (e.g. visual discriminativeness, and +textual knowledge) should be leveraged when selecting the concepts. Our +proposed concept discovery and learning (CDL) framework is thus designed to +identify a diverse list of generic visual concepts (e.g. "spiky" as opposed to +"spiky durian"), which are ranked and selected based on visual and language +mutual information. We carefully design quantitative and human evaluations of +the discovered concepts on six diverse visual recognition datasets, which +confirm that pre-trained VLMs do learn visual concepts that provide accurate +and thorough descriptions for the recognized objects. All code and models are +publicly released. + +
+
+
+
+
+ + ☆ F2FLDM: Latent Diffusion Models with Histopathology Pre-Trained + Embeddings for Unpaired Frozen Section to FFPE Translation + + +
+ The Frozen Section (FS) technique is a rapid and efficient method, taking +only 15-30 minutes to prepare slides for pathologists' evaluation during +surgery, enabling immediate decisions on further surgical interventions. +However, FS process often introduces artifacts and distortions like folds and +ice-crystal effects. In contrast, these artifacts and distortions are absent in +the higher-quality formalin-fixed paraffin-embedded (FFPE) slides, which +require 2-3 days to prepare. While Generative Adversarial Network (GAN)-based +methods have been used to translate FS to FFPE images (F2F), they may leave +morphological inaccuracies with remaining FS artifacts or introduce new +artifacts, reducing the quality of these translations for clinical assessments. +In this study, we benchmark recent generative models, focusing on GANs and +Latent Diffusion Models (LDMs), to overcome these limitations. We introduce a +novel approach that combines LDMs with Histopathology Pre-Trained Embeddings to +enhance restoration of FS images. Our framework leverages LDMs conditioned by +both text and pre-trained embeddings to learn meaningful features of FS and +FFPE histopathology images. Through diffusion and denoising techniques, our +approach not only preserves essential diagnostic attributes like color staining +and tissue morphology but also proposes an embedding translation mechanism to +better predict the targeted FFPE representation of input FS images. As a +result, this work achieves a significant improvement in classification +performance, with the Area Under the Curve rising from 81.99% to 94.64%, +accompanied by an advantageous CaseFD. This work establishes a new benchmark +for FS to FFPE image translation quality, promising enhanced reliability and +accuracy in histopathology FS image analysis. Our work is available at +https://minhmanho.github.io/f2f_ldm/. + +
+
+ comment: Preprint. Our work is available at + https://minhmanho.github.io/f2f_ldm/ +
+
+
+
+
+ + ☆ Cooperative Sentiment Agents for Multimodal Sentiment Analysis + + +
+ In this paper, we propose a new Multimodal Representation Learning (MRL)
+method for Multimodal Sentiment Analysis (MSA), which facilitates the adaptive
+interaction between modalities through Cooperative Sentiment Agents, named
+Co-SA. Co-SA comprises two critical components: the Sentiment Agents
+Establishment (SAE) phase and the Sentiment Agents Cooperation (SAC) phase.
+During the SAE phase, each sentiment agent deals with a unimodal signal and
+highlights explicit dynamic sentiment variations within the modality via the
+Modality-Sentiment Disentanglement (MSD) and Deep Phase Space Reconstruction
+(DPSR) modules. Subsequently, in the SAC phase, Co-SA meticulously designs
+task-specific interaction mechanisms for the sentiment agents so that the
+multimodal signals are coordinated to learn the joint representation.
+Specifically, Co-SA equips each sentiment agent with an independent policy
+model that captures significant properties within the modality. These policies
+are optimized mutually through a unified reward adapted to downstream tasks.
+Benefiting from the rewarding mechanism, Co-SA transcends the limitation of
+pre-defined fusion modes and adaptively captures unimodal properties for MRL in
+the multimodal interaction setting. To demonstrate the effectiveness of Co-SA,
+we apply it to the Multimodal Sentiment Analysis (MSA) and Multimodal Emotion
+Recognition (MER) tasks. Our comprehensive experimental results demonstrate
+that Co-SA excels at discovering diverse cross-modal features, encompassing
+both common and complementary aspects. The code is available at
+https://github.com/smwanghhh/Co-SA.
+
+
+
+
+
+ + ☆ AED-PADA:Improving Generalizability of Adversarial Example Detection via + Principal Adversarial Domain Adaptation + + +
+ Adversarial example detection, which can be conveniently applied in many +scenarios, is important in the area of adversarial defense. Unfortunately, +existing detection methods suffer from poor generalization performance, because +their training process usually relies on the examples generated from a single +known adversarial attack and there exists a large discrepancy between the +training and unseen testing adversarial examples. To address this issue, we +propose a novel method, named Adversarial Example Detection via Principal +Adversarial Domain Adaptation (AED-PADA). Specifically, our approach identifies +the Principal Adversarial Domains (PADs), i.e., a combination of features of +the adversarial examples from different attacks, which possesses large coverage +of the entire adversarial feature space. Then, we pioneer to exploit +multi-source domain adaptation in adversarial example detection with PADs as +source domains. Experiments demonstrate the superior generalization ability of +our proposed AED-PADA. Note that this superiority is particularly achieved in +challenging scenarios characterized by employing the minimal magnitude +constraint for the perturbations. + +
+
+
+
+
+ + ☆ Transformer-Based Classification Outcome Prediction for Multimodal + Stroke Treatment + + +
+ This study proposes a multi-modal fusion framework, Multitrans, based on the
+Transformer architecture and the self-attention mechanism. The framework
+combines non-contrast computed tomography (NCCT) images and discharge diagnosis
+reports of patients undergoing stroke treatment, using several
+Transformer-based approaches to predict the functional outcomes of stroke
+treatment. The results show that single-modal text classification performs
+significantly better than single-modal image classification, while the
+multi-modal combination outperforms either single modality. Although the
+Transformer model performs worse on imaging data alone, when combined with
+clinical meta-diagnostic information the two modalities learn complementary
+information and contribute to accurately predicting stroke treatment outcomes.
+
+
+
+
+
+ + ☆ MindTuner: Cross-Subject Visual Decoding with Visual Fingerprint and + Semantic Correction + + +
+ Decoding natural visual scenes from brain activity has flourished, with
+extensive research on single-subject tasks but much less on cross-subject
+tasks. Reconstructing high-quality images in cross-subject tasks is a
+challenging problem due to profound individual differences between subjects and
+the scarcity of data annotation. In this work, we propose MindTuner for
+cross-subject visual decoding, which achieves high-quality and rich-semantic
+reconstructions using only 1 hour of fMRI training data, benefiting from the
+phenomenon of visual fingerprints in the human visual system and a novel
+fMRI-to-text alignment paradigm. First, we pre-train a multi-subject model on 7
+subjects and fine-tune it with scarce data on new subjects, where LoRAs with
+Skip-LoRAs are utilized to learn the visual fingerprint. Then, we take the
+image modality as the intermediate pivot modality to achieve fMRI-to-text
+alignment, which yields impressive fMRI-to-text retrieval performance and
+corrects fMRI-to-image reconstruction with fine-tuned semantics. The results of
+both qualitative and quantitative analyses demonstrate that MindTuner surpasses
+state-of-the-art cross-subject visual decoding models on the Natural Scenes
+Dataset (NSD), whether using 1 hour or 40 hours of training data.
+
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ SkelFormer: Markerless 3D Pose and Shape Estimation using Skeletal + Transformers + + +
+ We introduce SkelFormer, a novel markerless motion capture pipeline for
+multi-view human pose and shape estimation. Our method first uses off-the-shelf
+2D keypoint estimators, pre-trained on large-scale in-the-wild data, to obtain
+3D joint positions. Next, we design a regression-based inverse-kinematic
+skeletal transformer that maps the joint positions to pose and shape
+representations from heavily noisy observations. This module integrates prior
+knowledge about pose space and infers the full pose state at runtime.
+Separating the 3D keypoint detection and inverse-kinematic problems, along with
+the expressive representations learned by our skeletal transformer, enhances
+the generalization of our method to unseen noisy data. We evaluate our method
+on three public datasets in both in-distribution and out-of-distribution
+settings, and observe strong performance with respect to prior works. Moreover,
+ablation experiments demonstrate the impact of each module of our architecture.
+Finally, we study the performance of our method in dealing with noise and heavy
+occlusions and find considerable robustness with respect to other solutions.
+
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ Dragtraffic: A Non-Expert Interactive and Point-Based Controllable + Traffic Scene Generation Framework + + +
+ The evaluation and training of autonomous driving systems require diverse and +scalable corner cases. However, most existing scene generation methods lack +controllability, accuracy, and versatility, resulting in unsatisfactory +generation results. To address this problem, we propose Dragtraffic, a +generalized, point-based, and controllable traffic scene generation framework +based on conditional diffusion. Dragtraffic enables non-experts to generate a +variety of realistic driving scenarios for different types of traffic agents +through an adaptive mixture expert architecture. We use a regression model to +provide a general initial solution and a refinement process based on the +conditional diffusion model to ensure diversity. User-customized context is +introduced through cross-attention to ensure high controllability. Experiments +on a real-world driving dataset show that Dragtraffic outperforms existing +methods in terms of authenticity, diversity, and freedom. + +
+
+
+
+
+ + ☆ SA-Attack: Speed-adaptive stealthy adversarial attack on trajectory + prediction + + +
+ Trajectory prediction is critical for the safe planning and navigation of +automated vehicles. The trajectory prediction models based on the neural +networks are vulnerable to adversarial attacks. Previous attack methods have +achieved high attack success rates but overlook the adaptability to realistic +scenarios and the concealment of the deceits. To address this problem, we +propose a speed-adaptive stealthy adversarial attack method named SA-Attack. +This method searches the sensitive region of trajectory prediction models and +generates the adversarial trajectories by using the vehicle-following method +and incorporating information about forthcoming trajectories. Our method has +the ability to adapt to different speed scenarios by reconstructing the +trajectory from scratch. Fusing future trajectory trends and curvature +constraints can guarantee the smoothness of adversarial trajectories, further +ensuring the stealthiness of attacks. The empirical study on the datasets of +nuScenes and Apolloscape demonstrates the attack performance of our proposed +method. Finally, we also demonstrate the adaptability and stealthiness of +SA-Attack for different speed scenarios. Our code is available at the +repository: https://github.com/eclipse-bot/SA-Attack. + +
+
+ comment: This work is published in IEEE IV Symposium +
+
+
+
+
+ + ☆ Rethinking Clothes Changing Person ReID: Conflicts, Synthesis, and + Optimization + + +
+ Clothes-changing person re-identification (CC-ReID) aims to retrieve images
+of the same person wearing different outfits. Mainstream research focuses on
+designing advanced model structures and strategies to capture identity
+information independent of clothing. However, same-clothes discrimination, the
+standard ReID learning objective, has been persistently ignored by previous
+CC-ReID research. In this study, we dive into the relationship between the
+standard and clothes-changing (CC) learning objectives, and bring the inner
+conflicts between these two objectives to the fore. We try to magnify the
+proportion of CC training pairs by supplementing high-fidelity clothes-varying
+synthesis, produced by our proposed Clothes-Changing Diffusion model. By
+incorporating the synthetic images into CC-ReID model training, we observe a
+significant improvement under the CC protocol. However, this improvement
+sacrifices performance under the standard protocol, caused by the inner
+conflict between the standard and CC objectives. To mitigate the conflict, we
+decouple these objectives and re-formulate CC-ReID learning as a
+multi-objective optimization (MOO) problem. By effectively regularizing the
+gradient curvature across multiple objectives and introducing preference
+restrictions, our MOO solution surpasses the single-task training paradigm. Our
+framework is model-agnostic, and demonstrates superior performance under both
+the CC and standard ReID protocols.
+
+
+
+
+
+ + ☆ ELEV-VISION-SAM: Integrated Vision Language and Foundation Model for + Automated Estimation of Building Lowest Floor Elevation + + +
+ Street view imagery, aided by advancements in image quality and +accessibility, has emerged as a valuable resource for urban analytics research. +Recent studies have explored its potential for estimating lowest floor +elevation (LFE), offering a scalable alternative to traditional on-site +measurements, crucial for assessing properties' flood risk and damage extent. +While existing methods rely on object detection, the introduction of image +segmentation has broadened street view images' utility for LFE estimation, +although challenges still remain in segmentation quality and capability to +distinguish front doors from other doors. To address these challenges in LFE +estimation, this study integrates the Segment Anything model, a segmentation +foundation model, with vision language models to conduct text-prompt image +segmentation on street view images for LFE estimation. By evaluating various +vision language models, integration methods, and text prompts, we identify the +most suitable model for street view image analytics and LFE estimation tasks, +thereby improving the availability of the current LFE estimation model based on +image segmentation from 33% to 56% of properties. Remarkably, our proposed +method significantly enhances the availability of LFE estimation to almost all +properties in which the front door is visible in the street view image. Also +the findings present the first baseline and comparison of various vision models +of street view image-based LFE estimation. The model and findings not only +contribute to advancing street view image segmentation for urban analytics but +also provide a novel approach for image segmentation tasks for other civil +engineering and infrastructure analytics tasks. + +
+
+
+
+
+ + ☆ A visualization method for data domain changes in CNN networks and the + optimization method for selecting thresholds in classification tasks + + +
+ In recent years, Face Anti-Spoofing (FAS) has played a crucial role in +preserving the security of face recognition technology. With the rise of +counterfeit face generation techniques, the challenge posed by digitally edited +faces to face anti-spoofing is escalating. Existing FAS technologies primarily +focus on intercepting physically forged faces and lack a robust solution for +cross-domain FAS challenges. Moreover, determining an appropriate threshold to +achieve optimal deployment results remains an issue for intra-domain FAS. To +address these issues, we propose a visualization method that intuitively +reflects the training outcomes of models by visualizing the prediction results +on datasets. Additionally, we demonstrate that employing data augmentation +techniques, such as downsampling and Gaussian blur, can effectively enhance +performance on cross-domain tasks. Building upon our data visualization +approach, we also introduce a methodology for setting threshold values based on +the distribution of the training dataset. Ultimately, our methods secured us +second place in both the Unified Physical-Digital Face Attack Detection +competition and the Snapshot Spectral Imaging Face Anti-spoofing contest. The +training code is available at https://github.com/SeaRecluse/CVPRW2024. + +
+
+
+
+
+ + ☆ QUTE: Quantifying Uncertainty in TinyML models with Early-exit-assisted + ensembles + + +
+ Existing methods for uncertainty quantification incur massive memory and +compute overhead, often requiring multiple models/inferences. Hence they are +impractical on ultra-low-power KB-sized TinyML devices. To reduce overhead, +prior works have proposed the use of early-exit networks as ensembles to +quantify uncertainty in a single forward-pass. However, they still have a +prohibitive cost for tinyML. To address these challenges, we propose QUTE, a +novel resource-efficient early-exit-assisted ensemble architecture optimized +for tinyML models. QUTE adds additional output blocks at the final exit of the +base network and distills the knowledge of early-exits into these blocks to +create a diverse and lightweight ensemble architecture. Our results show that +QUTE outperforms popular prior works, and improves the quality of uncertainty +estimates by 6% with 3.1x lower model size on average compared to the most +relevant prior work. Furthermore, we demonstrate that QUTE is also effective in +detecting co-variate shifted and out-of-distribution inputs, and shows +competitive performance relative to G-ODIN, a state-of-the-art generalized OOD +detector. + +
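+ A sketch of the general early-exit-ensemble idea the abstract builds on:
+several lightweight output blocks at the final exit of a tiny backbone, whose
+averaged prediction and predictive entropy give a single-forward-pass
+uncertainty estimate. The architecture and head count are assumptions for
+illustration, not QUTE's exact design or distillation scheme.
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class TinyEnsembleHead(nn.Module):
+     """Tiny backbone with several lightweight output blocks at the final exit."""
+     def __init__(self, num_classes=10, num_heads=4):
+         super().__init__()
+         self.backbone = nn.Sequential(
+             nn.Conv2d(3, 16, 3, stride=2, padding=1), nn.ReLU(),
+             nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.ReLU(),
+             nn.AdaptiveAvgPool2d(1), nn.Flatten(),
+         )
+         self.heads = nn.ModuleList([nn.Linear(32, num_classes) for _ in range(num_heads)])
+
+     def forward(self, x):
+         feat = self.backbone(x)
+         probs = torch.stack([F.softmax(h(feat), dim=1) for h in self.heads])  # (H, B, C)
+         mean_p = probs.mean(dim=0)
+         # Predictive entropy of the averaged distribution as the uncertainty estimate.
+         entropy = -(mean_p * mean_p.clamp_min(1e-12).log()).sum(dim=1)
+         return mean_p, entropy
+
+ model = TinyEnsembleHead()
+ mean_p, unc = model(torch.randn(2, 3, 32, 32))
+ print(mean_p.shape, unc)   # (2, 10) class probabilities, per-sample uncertainty
+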
+
+
+
+
+ + ☆ Cross-Modal Adapter: Parameter-Efficient Transfer Learning Approach for + Vision-Language Models ICME 2024 + + +
+ Adapter-based parameter-efficient transfer learning has achieved exciting +results in vision-language models. Traditional adapter methods often require +training or fine-tuning, facing challenges such as insufficient samples or +resource limitations. While some methods overcome the need for training by +leveraging image modality cache and retrieval, they overlook the text +modality's importance and cross-modal cues for the efficient adaptation of +parameters in visual-language models. This work introduces a cross-modal +parameter-efficient approach named XMAdapter. XMAdapter establishes cache +models for both text and image modalities. It then leverages retrieval through +visual-language bimodal information to gather clues for inference. By +dynamically adjusting the affinity ratio, it achieves cross-modal fusion, +decoupling different modal similarities to assess their respective +contributions. Additionally, it explores hard samples based on differences in +cross-modal affinity and enhances model performance through adaptive adjustment +of sample learning intensity. Extensive experimental results on benchmark +datasets demonstrate that XMAdapter outperforms previous adapter-based methods +significantly regarding accuracy, generalization, and efficiency. + +
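+ A training-free cache-adapter sketch in the spirit of the abstract: an image
+cache and a text cache whose retrieval evidence is blended by an affinity
+ratio. The features are random stand-ins for frozen CLIP embeddings, and the
+exact fusion rule and hyperparameters are assumptions, not XMAdapter's.
+
+ import torch
+ import torch.nn.functional as F
+
+ def cache_logits(query, keys, values, beta=5.5):
+     """Cache retrieval: exp(-beta * (1 - cosine)) affinities times one-hot labels."""
+     affinity = query @ keys.t()                      # features assumed L2-normalised
+     return torch.exp(-beta * (1.0 - affinity)) @ values
+
+ def cross_modal_adapter(img_feat, img_keys, txt_keys, labels, num_classes, alpha=0.6):
+     """Blend image-cache and text-cache evidence with an affinity ratio alpha."""
+     values = F.one_hot(labels, num_classes).float()
+     img_logits = cache_logits(img_feat, img_keys, values)
+     txt_logits = cache_logits(img_feat, txt_keys, values)
+     return alpha * img_logits + (1.0 - alpha) * txt_logits
+
+ # Stand-in features (a real setup would use frozen CLIP image/text encoders).
+ d, n, c = 512, 64, 8
+ norm = lambda t: F.normalize(t, dim=-1)
+ img_keys, txt_keys = norm(torch.randn(n, d)), norm(torch.randn(n, d))
+ labels = torch.randint(0, c, (n,))
+ query = norm(torch.randn(4, d))
+ print(cross_modal_adapter(query, img_keys, txt_keys, labels, c).shape)   # (4, 8)
+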
+
+ comment: This paper is accepted to ICME 2024 +
+
+
+
+
+ + ☆ Privacy-Preserving Debiasing using Data Augmentation and Machine + Unlearning + + +
+ Data augmentation is widely used to mitigate data bias in the training +dataset. However, data augmentation exposes machine learning models to privacy +attacks, such as membership inference attacks. In this paper, we propose an +effective combination of data augmentation and machine unlearning, which can +reduce data bias while providing a provable defense against known attacks. +Specifically, we maintain the fairness of the trained model with +diffusion-based data augmentation, and then utilize multi-shard unlearning to +remove identifying information of original data from the ML model for +protection against privacy attacks. Experimental evaluation across diverse +datasets demonstrates that our approach can achieve significant improvements in +bias reduction as well as robustness against state-of-the-art privacy attacks. + +
+
+
+
+
+ + ☆ Unlocking Robust Segmentation Across All Age Groups via Continual + Learning + + +
+ Most deep learning models in medical imaging are trained on adult data with +unclear performance on pediatric images. In this work, we aim to address this +challenge in the context of automated anatomy segmentation in whole-body +Computed Tomography (CT). We evaluate the performance of CT organ segmentation +algorithms trained on adult data when applied to pediatric CT volumes and +identify substantial age-dependent underperformance. We subsequently propose +and evaluate strategies, including data augmentation and continual learning +approaches, to achieve good segmentation accuracy across all age groups. Our +best-performing model, trained using continual learning, achieves high +segmentation accuracy on both adult and pediatric data (Dice scores of 0.90 and +0.84 respectively). + +
+
+
+
+
+ + ☆ Equivariant Imaging for Self-supervised Hyperspectral Image Inpainting + + +
+ Hyperspectral imaging (HSI) is a key technology for earth observation, +surveillance, medical imaging and diagnostics, astronomy and space exploration. +The conventional technology for HSI in remote sensing applications is based on +the push-broom scanning approach in which the camera records the spectral image +of a stripe of the scene at a time, while the image is generated by the +aggregation of measurements through time. In real-world airborne and spaceborne +HSI instruments, some empty stripes would appear at certain locations, because +platforms do not always maintain a constant programmed attitude, or have access +to accurate digital elevation maps (DEM), and the travelling track is not +necessarily aligned with the hyperspectral cameras at all times. This makes the +enhancement of the acquired HS images from incomplete or corrupted observations +an essential task. We introduce a novel HSI inpainting algorithm here, called +Hyperspectral Equivariant Imaging (Hyper-EI). Hyper-EI is a self-supervised +learning-based method which does not require training on extensive datasets or +access to a pre-trained model. Experimental results show that the proposed +method achieves state-of-the-art inpainting performance compared to the +existing methods. + +
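+ A generic equivariant-imaging loss for self-supervised inpainting, sketched
+under stated assumptions: the measurement operator is a binary stripe mask, the
+transformation group is 90-degree rotations, and the network is a toy stand-in.
+This illustrates the principle rather than Hyper-EI's exact recipe.
+
+ import torch
+ import torch.nn as nn
+
+ def equivariant_inpainting_loss(net, y, mask):
+     """y: observed hyperspectral cube with empty stripes; mask: 1 where measured, 0 where missing."""
+     x_hat = net(y * mask)                                # reconstruct from masked measurement
+     mc = ((x_hat * mask - y * mask) ** 2).mean()         # measurement consistency on observed pixels
+
+     k = int(torch.randint(1, 4, (1,)))                   # random group action: 90-degree rotation
+     x_rot = torch.rot90(x_hat, k, dims=(2, 3))
+     x_rot_hat = net(x_rot * mask)                        # re-measure and re-reconstruct the rotated estimate
+     eq = ((x_rot_hat - x_rot) ** 2).mean()               # equivariance term
+     return mc + eq
+
+ # Smoke test with a toy network and a simulated empty stripe.
+ net = nn.Sequential(nn.Conv2d(31, 64, 3, padding=1), nn.ReLU(), nn.Conv2d(64, 31, 3, padding=1))
+ y = torch.rand(1, 31, 64, 64)
+ mask = torch.ones_like(y)
+ mask[..., 20:24] = 0
+ print(equivariant_inpainting_loss(net, y, mask))
+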
+
+ comment: 5 Pages, 4 Figures, 2 Tables +
+
+
+
+
+ + ☆ Motion-adaptive Separable Collaborative Filters for Blind Motion + Deblurring CVPR 2024 + + +
+ Eliminating image blur produced by various kinds of motion has been a +challenging problem. Dominant approaches rely heavily on model capacity to +remove blurring by reconstructing residual from blurry observation in feature +space. These practices not only prevent the capture of spatially variable +motion in the real world but also ignore the tailored handling of various +motions in image space. In this paper, we propose a novel real-world deblurring +filtering model called the Motion-adaptive Separable Collaborative (MISC) +Filter. In particular, we use a motion estimation network to capture motion +information from neighborhoods, thereby adaptively estimating spatially-variant +motion flow, mask, kernels, weights, and offsets to obtain the MISC Filter. The +MISC Filter first aligns the motion-induced blurring patterns to the motion +middle along the predicted flow direction, and then collaboratively filters the +aligned image through the predicted kernels, weights, and offsets to generate +the output. This design can handle more generalized and complex motion in a +spatially differentiated manner. Furthermore, we analyze the relationships +between the motion estimation network and the residual reconstruction network. +Extensive experiments on four widely used benchmarks demonstrate that our +method provides an effective solution for real-world motion blur removal and +achieves state-of-the-art performance. Code is available at +https://github.com/ChengxuLiu/MISCFilter + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ BACS: Background Aware Continual Semantic Segmentation + + +
+ Semantic segmentation plays a crucial role in enabling comprehensive scene
+understanding for robotic systems. However, generating annotations is
+challenging, requiring labels for every pixel in an image. In scenarios like
+autonomous driving, there's a need to progressively incorporate new classes as
+the operating environment of the deployed agent becomes more complex. For
+enhanced annotation efficiency, ideally, only pixels belonging to new classes
+would be annotated. This approach is known as Continual Semantic Segmentation
+(CSS). Besides the common problem of classical catastrophic forgetting in the
+continual learning setting, CSS suffers from the inherent ambiguity of the
+background, a phenomenon we refer to as the "background shift", since pixels
+labeled as background could correspond to future classes (forward background
+shift) or previous classes (backward background shift). As a result, continual
+learning approaches tend to fail. This paper proposes a Backward Background
+Shift Detector (BACS) to detect previously observed classes based on their
+distance in the latent space from the foreground centroids of previous steps.
+Moreover, we propose a modified version of the cross-entropy loss function,
+incorporating the BACS detector to down-weight background pixels associated
+with formerly observed classes. To combat catastrophic forgetting, we employ
+masked feature distillation alongside dark experience replay. Additionally, our
+approach includes a transformer decoder capable of adjusting to new classes
+without necessitating an additional classification head. We validate BACS's
+superior performance over existing state-of-the-art methods on standard CSS
+benchmarks.
+
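+ A sketch of the detector-plus-down-weighting idea: background pixels whose
+features lie close to a foreground centroid of a previous step get a reduced
+weight in the cross-entropy loss. The threshold, the residual weight of 0.1 and
+the toy shapes are illustrative assumptions, not BACS's actual values.
+
+ import torch
+ import torch.nn.functional as F
+
+ def bacs_weights(features, labels, old_centroids, tau=1.0, bg_index=0):
+     """features: (B, D, H, W), labels: (B, H, W), old_centroids: (K, D)."""
+     B, D, H, W = features.shape
+     f = features.permute(0, 2, 3, 1).reshape(-1, D)                 # (BHW, D)
+     dist = torch.cdist(f, old_centroids).min(dim=1).values          # distance to nearest old-class centroid
+     looks_old = (dist < tau) & (labels.reshape(-1) == bg_index)     # background pixel near an old class
+     w = torch.ones_like(dist)
+     w[looks_old] = 0.1                                              # down-weight instead of full loss
+     return w.reshape(B, H, W)
+
+ def weighted_ce(logits, labels, weights):
+     loss = F.cross_entropy(logits, labels, reduction="none")        # (B, H, W)
+     return (loss * weights).mean()
+
+ # Toy shapes
+ feats, logits = torch.randn(2, 16, 8, 8), torch.randn(2, 5, 8, 8)
+ labels = torch.randint(0, 5, (2, 8, 8))
+ centroids = torch.randn(3, 16)
+ print(weighted_ce(logits, labels, bacs_weights(feats, labels, centroids)))
+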
+
+ comment: 8 pages, 4 figures, CRV 2024 +
+
+
+
+
+ + ☆ DeepFake-O-Meter v2.0: An Open Platform for DeepFake Detection + + +
+ Deepfakes, as AI-generated media, have increasingly threatened media +integrity and personal privacy with realistic yet fake digital content. In this +work, we introduce an open-source and user-friendly online platform, +DeepFake-O-Meter v2.0, that integrates state-of-the-art methods for detecting +Deepfake images, videos, and audio. Built upon DeepFake-O-Meter v1.0, we have +made significant upgrades and improvements in platform architecture design, +including user interaction, detector integration, job balancing, and security +management. The platform aims to offer everyday users a convenient service for +analyzing DeepFake media using multiple state-of-the-art detection algorithms. +It ensures secure and private delivery of the analysis results. Furthermore, it +serves as an evaluation and benchmarking platform for researchers in digital +media forensics to compare the performance of multiple algorithms on the same +input. We have also conducted detailed usage analysis based on the collected +data to gain deeper insights into our platform's statistics. This involves +analyzing two-month trends in user activity and evaluating the processing +efficiency of each detector. + +
+
+
+
+
+ + ☆ Deep Learning-based Text-in-Image Watermarking + + +
+ In this work, we introduce a novel deep learning-based approach to +text-in-image watermarking, a method that embeds and extracts textual +information within images to enhance data security and integrity. Leveraging +the capabilities of deep learning, specifically through the use of +Transformer-based architectures for text processing and Vision Transformers for +image feature extraction, our method sets new benchmarks in the domain. The +proposed method represents the first application of deep learning in +text-in-image watermarking that improves adaptivity, allowing the model to +intelligently adjust to specific image characteristics and emerging threats. +Through testing and evaluation, our method has demonstrated superior robustness +compared to traditional watermarking techniques, achieving enhanced +imperceptibility that ensures the watermark remains undetectable across various +image contents. + +
+
+
+
+
+ + ☆ On-board classification of underwater images using hybrid + classical-quantum CNN based method + + +
+ Underwater images taken from autonomous underwater vehicles (AUVs) often
+suffer from low light, high turbidity, poor contrast, motion blur and excessive
+light scattering, and hence require image enhancement techniques for object
+recognition. Machine learning methods are being increasingly used for object
+recognition under such adverse conditions. Such enhanced object recognition on
+images taken from AUVs has potential applications in underwater pipeline and
+optical fibre surveillance, ocean-bed resource extraction, ocean-floor mapping,
+underwater species exploration, etc. While classical machine learning methods
+are very efficient in terms of accuracy, they require large datasets and high
+computational time for image classification. In the current work, we use
+quantum-classical hybrid machine learning methods for real-time underwater
+object recognition on board an AUV for the first time. We use real-time
+motion-blurred and low-light images taken from the on-board camera of an AUV
+built in-house and apply existing hybrid machine learning methods for object
+recognition. Our hybrid methods consist of quantum encoding and flattening of
+classical images using quantum circuits, and sending them to classical neural
+networks for image classification. The results of the hybrid methods, carried
+out using PennyLane-based quantum simulators both on GPU and using pre-trained
+models on an on-board NVIDIA GPU chipset, are compared with results from the
+corresponding classical machine learning methods. We observe that the hybrid
+quantum machine learning methods show an efficiency greater than 65%, reduce
+run-time by one-third, and require 50% smaller dataset sizes for training the
+models compared to classical machine learning methods. We hope that our work
+opens up further possibilities in quantum-enhanced real-time computer vision in
+autonomous vehicles.
+
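+ A minimal hybrid classifier sketch with PennyLane and PyTorch: classical
+features are reduced to a few angles, encoded into a small quantum circuit, and
+the measured expectation values feed a classical head. The circuit and feature
+sizes are illustrative assumptions, not the authors' architecture.
+
+ import pennylane as qml
+ import torch
+ import torch.nn as nn
+
+ n_qubits = 4
+ dev = qml.device("default.qubit", wires=n_qubits)
+
+ @qml.qnode(dev, interface="torch")
+ def circuit(inputs, weights):
+     qml.AngleEmbedding(inputs, wires=range(n_qubits))            # quantum encoding of classical features
+     qml.StronglyEntanglingLayers(weights, wires=range(n_qubits)) # trainable entangling layers
+     return [qml.expval(qml.PauliZ(i)) for i in range(n_qubits)]
+
+ qlayer = qml.qnn.TorchLayer(circuit, {"weights": (2, n_qubits, 3)})
+
+ model = nn.Sequential(
+     nn.Flatten(),
+     nn.Linear(8 * 8, n_qubits),   # classical reduction of a small grayscale patch to rotation angles
+     nn.Tanh(),                    # keep the encoded angles bounded
+     qlayer,                       # quantum circuit evaluated on a simulator
+     nn.Linear(n_qubits, 2),       # classical classifier head (e.g. object vs. background)
+ )
+ logits = model(torch.rand(4, 1, 8, 8))
+ print(logits.shape)               # (4, 2)
+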
+
+
+
+
+ + ☆ FreSeg: Frenet-Frame-based Part Segmentation for 3D Curvilinear + Structures + + +
+ Part segmentation is a crucial task for 3D curvilinear structures like neuron
+dendrites and blood vessels, enabling the analysis of dendritic spines and
+aneurysms with scientific and clinical significance. However, their diverse,
+winding morphology poses a generalization challenge to existing deep learning
+methods, which leads to labor-intensive manual correction. In this work, we
+propose FreSeg, a framework for part segmentation of 3D curvilinear structures.
+With Frenet-Frame-based point cloud transformation, it enables models to learn
+more generalizable features and achieves significant performance improvements
+on tasks involving elongated and curvy geometries. We evaluate FreSeg on 2
+datasets: 1) DenSpineEM, an in-house dataset for dendritic spine segmentation,
+and 2) IntrA, a public 3D dataset for intracranial aneurysm segmentation.
+Further, we will release the DenSpineEM dataset, which includes roughly 6,000
+spines from 69 dendrites from 3 public electron microscopy (EM) datasets, to
+foster the development of effective dendritic spine instance extraction methods
+and, consequently, large-scale connectivity analysis to better understand
+mammalian brains.
+
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ RegWSI: Whole Slide Image Registration using Combined Deep Feature- and + Intensity-Based Methods: Winner of the ACROBAT 2023 Challenge + + +
+ The automatic registration of differently stained whole slide images (WSIs)
+is crucial for improving diagnosis and prognosis by fusing complementary
+information emerging from different visible structures. It is also useful to
+quickly transfer annotations between consecutive or restained slides, thus
+significantly reducing the annotation time and associated costs. Nevertheless,
+the slide preparation is different for each stain and the tissue undergoes
+complex and large deformations. Therefore, a robust, efficient, and accurate
+registration method is highly desired by the scientific community and hospitals
+specializing in digital pathology. We propose a two-step hybrid method
+consisting of (i) a deep learning- and feature-based initial alignment
+algorithm, and (ii) intensity-based nonrigid registration using instance
+optimization. The proposed method does not require any fine-tuning to a
+particular dataset and can be used directly for any desired tissue type and
+stain. The method scored 1st place in the ACROBAT 2023 challenge. We evaluated
+it on three open datasets: (i) ANHIR, (ii) ACROBAT, and (iii) HyReCo, and
+performed several ablation studies concerning the resolution used for
+registration and the robustness and stability of the initial alignment. The
+method achieves the most accurate results on the ACROBAT dataset and cell-level
+registration accuracy on the restained slides from the HyReCo dataset, and is
+among the best methods evaluated on the ANHIR dataset. The method does not
+require any fine-tuning to new datasets and can be used out-of-the-box for
+other types of microscopic images. The method is incorporated into the
+DeeperHistReg framework, allowing others to directly use it to register,
+transform, and save WSIs at any desired pyramid level. The proposed method is a
+significant contribution to WSI registration, thus advancing the field of
+digital pathology.
+
+
+
+
+
+ + ☆ DeeperHistReg: Robust Whole Slide Images Registration Framework + + +
+ DeeperHistReg is a software framework dedicated to registering whole slide +images (WSIs) acquired using multiple stains. It allows one to perform the +preprocessing, initial alignment, and nonrigid registration of WSIs acquired +using multiple stains (e.g. hematoxylin \& eosin, immunochemistry). The +framework implements several state-of-the-art registration algorithms and +provides an interface to operate on arbitrary resolution of the WSIs (up to +200k x 200k). The framework is extensible and new algorithms can be easily +integrated by other researchers. The framework is available both as a PyPI +package and as a Docker container. + +
+
+
+
+
+ + ☆ Automatic Cranial Defect Reconstruction with Self-Supervised Deep + Deformable Masked Autoencoders + + +
+ Thousands of people suffer from cranial injuries every year. They require
+personalized implants that need to be designed and manufactured before the
+reconstruction surgery. Manual design is expensive and time-consuming,
+motivating the search for algorithms that automate the process. The problem can
+be formulated as volumetric shape completion and solved by deep neural networks
+dedicated to supervised image segmentation. However, such an approach requires
+annotating ground-truth defects, which is costly and time-consuming. Usually,
+the process is replaced with synthetic defect generation. However, even
+synthetic ground-truth generation is time-consuming and limits the data
+heterogeneity, and thus the deep models' generalizability. In our work, we
+propose an alternative and simple approach that uses a self-supervised masked
+autoencoder to solve the problem. By design, this approach increases the
+heterogeneity of the training set and can be seen as a form of data
+augmentation. We compare the proposed method with several state-of-the-art deep
+neural networks and show both quantitative and qualitative improvements on the
+SkullBreak and SkullFix datasets. The proposed method can be used to
+efficiently reconstruct cranial defects in real time.
+
+
+
+
+
+ + ☆ On-Demand Earth System Data Cubes + + +
+ Advancements in Earth system science have seen a surge in diverse datasets. +Earth System Data Cubes (ESDCs) have been introduced to efficiently handle this +influx of high-dimensional data. ESDCs offer a structured, intuitive framework +for data analysis, organising information within spatio-temporal grids. The +structured nature of ESDCs unlocks significant opportunities for Artificial +Intelligence (AI) applications. By providing well-organised data, ESDCs are +ideally suited for a wide range of sophisticated AI-driven tasks. An automated +framework for creating AI-focused ESDCs with minimal user input could +significantly accelerate the generation of task-specific training data. Here we +introduce cubo, an open-source Python tool designed for easy generation of +AI-focused ESDCs. Utilising collections in SpatioTemporal Asset Catalogs (STAC) +that are stored as Cloud Optimised GeoTIFFs (COGs), cubo efficiently creates +ESDCs, requiring only central coordinates, spatial resolution, edge size, and +time range. + +
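+ A usage sketch of creating a small AI-focused data cube: the call below
+follows the cubo documentation as I recall it, so treat the exact function
+and keyword names as assumptions and check the project README before use.
+
+ import cubo
+
+ cube = cubo.create(
+     lat=51.079,                      # central coordinates
+     lon=10.452,
+     collection="sentinel-2-l2a",     # a STAC collection served as COGs
+     bands=["B02", "B03", "B04"],
+     start_date="2021-06-01",
+     end_date="2021-06-30",
+     edge_size=128,                   # pixels per side of the spatial chip
+     resolution=10,                   # metres per pixel
+ )
+ print(cube.dims)                     # an xarray DataArray over (time, band, x, y)
+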
+
+ comment: Accepted at IGARSS24 +
+
+
+
+
+ + ☆ ToNNO: Tomographic Reconstruction of a Neural Network's Output for + Weakly Supervised Segmentation of 3D Medical Images CVPR 2024 + + +
+ Annotating lots of 3D medical images for training segmentation models is +time-consuming. The goal of weakly supervised semantic segmentation is to train +segmentation models without using any ground truth segmentation masks. Our work +addresses the case where only image-level categorical labels, indicating the +presence or absence of a particular region of interest (such as tumours or +lesions), are available. Most existing methods rely on class activation mapping +(CAM). We propose a novel approach, ToNNO, which is based on the Tomographic +reconstruction of a Neural Network's Output. Our technique extracts stacks of +slices with different angles from the input 3D volume, feeds these slices to a +2D encoder, and applies the inverse Radon transform in order to reconstruct a +3D heatmap of the encoder's predictions. This generic method allows to perform +dense prediction tasks on 3D volumes using any 2D image encoder. We apply it to +weakly supervised medical image segmentation by training the 2D encoder to +output high values for slices containing the regions of interest. We test it on +four large scale medical image datasets and outperform 2D CAM methods. We then +extend ToNNO by combining tomographic reconstruction with CAM methods, +proposing Averaged CAM and Tomographic CAM, which obtain even better results. + +
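+ A 2D toy analogue of the tomographic idea, under clearly stated assumptions:
+a "slice encoder" maps each 1D slice of the rotated image to a single scalar
+(here just a sum, standing in for a trained classifier), the scalars stacked
+over angles form a sinogram, and the inverse Radon transform turns the
+per-slice scores into a dense heatmap. With a summing encoder, the heatmap
+approximates the image itself up to orientation.
+
+ import numpy as np
+ from skimage.data import shepp_logan_phantom
+ from skimage.transform import iradon, rescale, rotate
+
+ def toy_slice_encoder(slice_1d):
+     return slice_1d.sum()            # stand-in for a learned per-slice score
+
+ image = rescale(shepp_logan_phantom(), 0.25)                # (100, 100) toy "volume"
+ angles = np.linspace(0.0, 180.0, 60, endpoint=False)
+
+ # One per-slice score for every row of every rotated copy of the image.
+ sinogram = np.stack(
+     [np.array([toy_slice_encoder(row) for row in rotate(image, -a)]) for a in angles],
+     axis=1,
+ )                                                            # (num_slices, num_angles)
+
+ heatmap = iradon(sinogram, theta=angles)                     # dense reconstruction of the scores
+ print(image.shape, heatmap.shape)
+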
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Single-sample image-fusion upsampling of fluorescence lifetime images + + +
+ Fluorescence lifetime imaging microscopy (FLIM) provides detailed information +about molecular interactions and biological processes. A major bottleneck for +FLIM is image resolution at high acquisition speeds, due to the engineering and +signal-processing limitations of time-resolved imaging technology. Here we +present single-sample image-fusion upsampling (SiSIFUS), a data-fusion approach +to computational FLIM super-resolution that combines measurements from a +low-resolution time-resolved detector (that measures photon arrival time) and a +high-resolution camera (that measures intensity only). To solve this otherwise +ill-posed inverse retrieval problem, we introduce statistically informed priors +that encode local and global dependencies between the two single-sample +measurements. This bypasses the risk of out-of-distribution hallucination as in +traditional data-driven approaches and delivers enhanced images compared for +example to standard bilinear interpolation. The general approach laid out by +SiSIFUS can be applied to other image super-resolution problems where two +different datasets are available. + +
+
+ comment: 18 pages, 11 figures. To be published in Science Advances +
+
+
+
+
+ + ☆ DensePANet: An improved generative adversarial network for photoacoustic + tomography image reconstruction from sparse data + + +
+ Image reconstruction is an essential step of every medical imaging method, +including Photoacoustic Tomography (PAT), which is a promising modality of +imaging, that unites the benefits of both ultrasound and optical imaging +methods. Reconstruction of PAT images using conventional methods results in +rough artifacts, especially when applied directly to sparse PAT data. In recent +years, generative adversarial networks (GANs) have shown a powerful performance +in image generation as well as translation, rendering them a smart choice to be +applied to reconstruction tasks. In this study, we proposed an end-to-end +method called DensePANet to solve the problem of PAT image reconstruction from +sparse data. The proposed model employs a novel modification of UNet in its +generator, called FD-UNet++, which considerably improves the reconstruction +performance. We evaluated the method on various in-vivo and simulated datasets. +Quantitative and qualitative results show the better performance of our model +over other prevalent deep learning techniques. + +
+
+
+
+
+ + ☆ DISC: Latent Diffusion Models with Self-Distillation from Separated + Conditions for Prostate Cancer Grading CVPR 2024 + + +
+ Latent Diffusion Models (LDMs) can generate high-fidelity images from noise, +offering a promising approach for augmenting histopathology images for training +cancer grading models. While previous works successfully generated +high-fidelity histopathology images using LDMs, the generation of image tiles +to improve prostate cancer grading has not yet been explored. Additionally, +LDMs face challenges in accurately generating admixtures of multiple cancer +grades in a tile when conditioned by a tile mask. In this study, we train +specific LDMs to generate synthetic tiles that contain multiple Gleason Grades +(GGs) by leveraging pixel-wise annotations in input tiles. We introduce a novel +framework named Self-Distillation from Separated Conditions (DISC) that +generates GG patterns guided by GG masks. Finally, we deploy a training +framework for pixel-level and slide-level prostate cancer grading, where +synthetic tiles are effectively utilized to improve the cancer grading +performance of existing models. As a result, this work surpasses previous works +in two domains: 1) our LDMs enhanced with DISC produce more accurate tiles in +terms of GG patterns, and 2) our training scheme, incorporating synthetic data, +significantly improves the generalization of the baseline model for prostate +cancer grading, particularly in challenging cases of rare GG5, demonstrating +the potential of generative models to enhance cancer grading when data is +limited. + +
+
+ comment: Abstract accepted for ISBI 2024. Extended version to be presented at + SynData4CV @ CVPR 2024. See more at https://minhmanho.github.io/disc/ +
+
+
+
+
+ + ♻ ☆ QGen: On the Ability to Generalize in Quantization Aware Training + + +
+ Quantization lowers memory usage, computational requirements, and latency by utilizing fewer bits to represent model weights and activations. In this work, we investigate the generalization properties of quantized neural networks, a characteristic that has received little attention despite its implications for model performance. In particular, first, we develop a theoretical model for quantization in neural networks and demonstrate how quantization functions as a form of regularization. Second, motivated by recent work connecting the sharpness of the loss landscape and generalization, we derive an approximate bound for the generalization of quantized models conditioned on the amount of quantization noise. We then validate our hypothesis by experimenting with over 2000 convolutional and transformer-based models trained on the CIFAR-10, CIFAR-100, and ImageNet datasets.
+
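+ For readers unfamiliar with how quantization injects noise during training, here is a minimal, hedged PyTorch sketch of uniform fake quantization with a straight-through estimator. It illustrates the mechanism the abstract analyzes as a regularizer and is not the authors' implementation; the bit width and scaling rule are assumptions.

import torch

class QuantizedLinear(torch.nn.Linear):
    """Linear layer with fake-quantized weights (uniform, symmetric).
    The straight-through estimator passes gradients through the rounding."""
    def __init__(self, in_f, out_f, n_bits=4):
        super().__init__(in_f, out_f)
        self.n_bits = n_bits

    def forward(self, x):
        qmax = 2 ** (self.n_bits - 1) - 1
        scale = self.weight.detach().abs().max() / qmax
        w_q = torch.clamp(torch.round(self.weight / scale), -qmax, qmax) * scale
        # forward uses quantized weights, backward sees the identity mapping
        w = self.weight + (w_q - self.weight).detach()
        return torch.nn.functional.linear(x, w, self.bias)

layer = QuantizedLinear(16, 8, n_bits=4)
out = layer(torch.randn(2, 16))  # quantization noise acts like a weight perturbation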
+
+
+
+
+ + ♻ ☆ One-shot skill assessment in high-stakes domains with limited data via + meta learning + + +
+ Deep Learning (DL) has achieved robust competency assessment in various +high-stakes fields. However, the applicability of DL models is often hampered +by their substantial data requirements and confinement to specific training +domains. This prevents them from transitioning to new tasks where data is +scarce. Therefore, domain adaptation emerges as a critical element for the +practical implementation of DL in real-world scenarios. Herein, we introduce +A-VBANet, a novel meta-learning model capable of delivering domain-agnostic +skill assessment via one-shot learning. Our methodology has been tested by +assessing surgical skills on five laparoscopic and robotic simulators and +real-life laparoscopic cholecystectomy. Our model successfully adapted with +accuracies up to 99.5% in one-shot and 99.9% in few-shot settings for simulated +tasks and 89.7% for laparoscopic cholecystectomy. This study marks the first +instance of a domain-agnostic methodology for skill assessment in critical +fields setting a precedent for the broad application of DL across diverse +real-life domains with limited data. + +
+
+ comment: 23 pages (Main Manuscript + Supplementary Materials + Arxiv Logs), 4 + figures (+2 Supplementary Figures), 2 tables (+5 Supplementary Tables) +
+
+
+
+
+ + ♻ ☆ HiLo: Detailed and Robust 3D Clothed Human Reconstruction with High-and + Low-Frequency Information of Parametric Models CVPR 2024 + + +
+ Reconstructing 3D clothed human involves creating a detailed geometry of +individuals in clothing, with applications ranging from virtual try-on, movies, +to games. To enable practical and widespread applications, recent advances +propose to generate a clothed human from an RGB image. However, they struggle +to reconstruct detailed and robust avatars simultaneously. We empirically find +that the high-frequency (HF) and low-frequency (LF) information from a +parametric model has the potential to enhance geometry details and improve +robustness to noise, respectively. Based on this, we propose HiLo, namely +clothed human reconstruction with high- and low-frequency information, which +contains two components. 1) To recover detailed geometry using HF information, +we propose a progressive HF Signed Distance Function to enhance the detailed 3D +geometry of a clothed human. We analyze that our progressive learning manner +alleviates large gradients that hinder model convergence. 2) To achieve robust +reconstruction against inaccurate estimation of the parametric model by using +LF information, we propose a spatial interaction implicit function. This +function effectively exploits the complementary spatial information from a +low-resolution voxel grid of the parametric model. Experimental results +demonstrate that HiLo outperforms the state-of-the-art methods by 10.43% and +9.54% in terms of Chamfer distance on the Thuman2.0 and CAPE datasets, +respectively. Additionally, HiLo demonstrates robustness to noise from the +parametric model, challenging poses, and various clothing styles. + +
+
+ comment: CVPR 2024 Accepted Paper +
+
+
+
+
+ + ♻ ☆ QDFormer: Towards Robust Audiovisual Segmentation in Complex + Environments with Quantization-based Semantic Decomposition + + +
+ Audiovisual segmentation (AVS) is a challenging task that aims to segment +visual objects in videos according to their associated acoustic cues. With +multiple sound sources and background disturbances involved, establishing +robust correspondences between audio and visual contents poses unique +challenges due to (1) complex entanglement across sound sources and (2) +frequent changes in the occurrence of distinct sound events. Assuming sound +events occur independently, the multi-source semantic space can be represented +as the Cartesian product of single-source sub-spaces. We are motivated to +decompose the multi-source audio semantics into single-source semantics for +more effective interactions with visual content. We propose a semantic +decomposition method based on product quantization, where the multi-source +semantics can be decomposed and represented by several disentangled and +noise-suppressed single-source semantics. Furthermore, we introduce a +global-to-local quantization mechanism, which distills knowledge from stable +global (clip-level) features into local (frame-level) ones, to handle frequent +changes in audio semantics. Extensive experiments demonstrate that our +semantically decomposed audio representation significantly improves AVS +performance, e.g., +21.2% mIoU on the challenging AVS-Semantic benchmark with +ResNet50 backbone. https://github.com/lxa9867/QSD. + +
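+ The product-quantization step can be sketched in a few lines: split an embedding into sub-vectors and snap each to its nearest codeword, so a multi-source embedding factorizes into per-subspace codes. The toy codebooks below are random stand-ins; in the paper they are learned, and the global-to-local distillation is omitted.

import numpy as np

def product_quantize(z, codebooks):
    """Split z into equal sub-vectors and assign each to its nearest codeword.
    codebooks: list of arrays of shape (K, d_sub); len(z) must divide evenly."""
    subs = np.split(z, len(codebooks))
    codes, recon = [], []
    for sub, book in zip(subs, codebooks):
        idx = int(np.argmin(((book - sub) ** 2).sum(axis=1)))  # nearest codeword
        codes.append(idx)
        recon.append(book[idx])
    return codes, np.concatenate(recon)

rng = np.random.default_rng(0)
books = [rng.normal(size=(8, 4)) for _ in range(4)]  # 4 sub-spaces, 8 codewords each
codes, z_hat = product_quantize(rng.normal(size=16), books)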
+
+
+
+
+ + ♻ ☆ An Embodied Generalist Agent in 3D World + + +
+ Leveraging massive knowledge and learning schemes from large language models +(LLMs), recent machine learning models show notable successes in building +generalist agents that exhibit the capability of general-purpose task solving +in diverse domains, including natural language processing, computer vision, and +robotics. However, a significant challenge remains as these models exhibit +limited ability in understanding and interacting with the 3D world. We argue +this limitation significantly hinders the current models from performing +real-world tasks and further achieving general intelligence. To this end, we +introduce an embodied multi-modal and multi-task generalist agent that excels +in perceiving, grounding, reasoning, planning, and acting in the 3D world. Our +proposed agent, referred to as LEO, is trained with shared LLM-based model +architectures, objectives, and weights in two stages: (i) 3D vision-language +alignment and (ii) 3D vision-language-action instruction tuning. To facilitate +the training, we meticulously curate and generate an extensive dataset +comprising object-level and scene-level multi-modal tasks with exceeding scale +and complexity, necessitating a deep understanding of and interaction with the +3D world. Through rigorous experiments, we demonstrate LEO's remarkable +proficiency across a wide spectrum of tasks, including 3D captioning, question +answering, embodied reasoning, embodied navigation, and robotic manipulation. +Our ablation results further provide valuable insights for the development of +future embodied generalist agents. + +
+
+ comment: The first four authors contribute equally. Project page: + https://embodied-generalist.github.io +
+
+
+
+
+ + ♻ ☆ Mitigating Open-Vocabulary Caption Hallucinations + + +
+ While recent years have seen rapid progress in image-conditioned text +generation, image captioning still suffers from the fundamental issue of +hallucinations, namely, the generation of spurious details that cannot be +inferred from the given image. Existing methods largely use closed-vocabulary +object lists to mitigate or evaluate hallucinations in image captioning, +ignoring the long-tailed nature of hallucinations that occur in practice. To +this end, we propose a framework for addressing hallucinations in image +captioning in the open-vocabulary setting. Our framework includes a new +benchmark, OpenCHAIR, that leverages generative foundation models to evaluate +open-vocabulary object hallucinations for image captioning, surpassing the +popular and similarly-sized CHAIR benchmark in both diversity and accuracy. +Furthermore, to mitigate open-vocabulary hallucinations without using a closed +object list, we propose MOCHa, an approach harnessing advancements in +reinforcement learning. Our multi-objective reward function explicitly targets +the trade-off between fidelity and adequacy in generations without requiring +any strong supervision. MOCHa improves a large variety of image captioning +models, as captured by our OpenCHAIR benchmark and other existing metrics. We +will release our code and models. + +
+
+ comment: Website Link: https://assafbk.github.io/mocha/ +
+
+
+
+
+ + ♻ ☆ Feature Corrective Transfer Learning: End-to-End Solutions to Object + Detection in Non-Ideal Visual Conditions CVPR + + +
+ A significant challenge in the field of object detection lies in the system's +performance under non-ideal imaging conditions, such as rain, fog, low +illumination, or raw Bayer images that lack ISP processing. Our study +introduces "Feature Corrective Transfer Learning", a novel approach that +leverages transfer learning and a bespoke loss function to facilitate the +end-to-end detection of objects in these challenging scenarios without the need +to convert non-ideal images into their RGB counterparts. In our methodology, we +initially train a comprehensive model on a pristine RGB image dataset. +Subsequently, non-ideal images are processed by comparing their feature maps +against those from the initial ideal RGB model. This comparison employs the +Extended Area Novel Structural Discrepancy Loss (EANSDL), a novel loss function +designed to quantify similarities and integrate them into the detection loss. +This approach refines the model's ability to perform object detection across +varying conditions through direct feature map correction, encapsulating the +essence of Feature Corrective Transfer Learning. Experimental validation on +variants of the KITTI dataset demonstrates a significant improvement in mean +Average Precision (mAP), resulting in a 3.8-8.1% relative enhancement in +detection under non-ideal conditions compared to the baseline model, and a less +marginal performance difference within 1.3% of the mAP@[0.5:0.95] achieved +under ideal conditions by the standard Faster RCNN algorithm. + +
+
+ comment: 2024 CVPR UG2+ Workshop +
+
+
+
+
+ + ♻ ☆ RANRAC: Robust Neural Scene Representations via Random Ray Consensus + + +
+ Learning-based scene representations such as neural radiance fields or light +field networks, that rely on fitting a scene model to image observations, +commonly encounter challenges in the presence of inconsistencies within the +images caused by occlusions, inaccurately estimated camera parameters or +effects like lens flare. To address this challenge, we introduce RANdom RAy +Consensus (RANRAC), an efficient approach to eliminate the effect of +inconsistent data, thereby taking inspiration from classical RANSAC based +outlier detection for model fitting. In contrast to the down-weighting of the +effect of outliers based on robust loss formulations, our approach reliably +detects and excludes inconsistent perspectives, resulting in clean images +without floating artifacts. For this purpose, we formulate a fuzzy adaption of +the RANSAC paradigm, enabling its application to large scale models. We +interpret the minimal number of samples to determine the model parameters as a +tunable hyperparameter, investigate the generation of hypotheses with +data-driven models, and analyze the validation of hypotheses in noisy +environments. We demonstrate the compatibility and potential of our solution +for both photo-realistic robust multi-view reconstruction from real-world +images based on neural radiance fields and for single-shot reconstruction based +on light-field networks. In particular, the results indicate significant +improvements compared to state-of-the-art robust methods for novel-view +synthesis on both synthetic and captured scenes with various inconsistencies +including occlusions, noisy camera pose estimates, and unfocused perspectives. +The results further indicate significant improvements for single-shot +reconstruction from occluded images. Project Page: +https://bennobuschmann.com/ranrac/ + +
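+ For context, the classical RANSAC loop that inspires RANRAC looks like the hedged sketch below (a plain 2D line fit, not the paper's fuzzy, large-model adaptation): sample a minimal set, fit a hypothesis, and keep the hypothesis with the largest consensus of inliers.

import numpy as np

def ransac_line(points, n_iters=200, inlier_tol=0.05, seed=0):
    # points: (N, 2) array; returns the line hypothesis with the largest inlier set
    rng = np.random.default_rng(seed)
    best_model, best_inliers = None, None
    for _ in range(n_iters):
        i, j = rng.choice(len(points), size=2, replace=False)
        p, q = points[i], points[j]
        if np.linalg.norm(q - p) < 1e-9:
            continue
        dx, dy = (q - p) / np.linalg.norm(q - p)
        # perpendicular distance of every point to the candidate line
        dist = np.abs(dx * (points[:, 1] - p[1]) - dy * (points[:, 0] - p[0]))
        inliers = dist < inlier_tol
        if best_inliers is None or inliers.sum() > best_inliers.sum():
            best_model, best_inliers = (p, q), inliers
    return best_model, best_inliers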
+
+
+
+
+ + ♻ ☆ RefinedFields: Radiance Fields Refinement for Unconstrained Scenes + + +
+ Modeling large scenes from unconstrained images has proven to be a major +challenge in computer vision. Existing methods tackling in-the-wild scene +modeling operate in closed-world settings, where no conditioning on priors +acquired from real-world images is present. We propose RefinedFields, which is, +to the best of our knowledge, the first method leveraging pre-trained models to +improve in-the-wild scene modeling. We employ pre-trained networks to refine +K-Planes representations via optimization guidance using an alternating +training procedure. We carry out extensive experiments and verify the merit of +our method on synthetic data and real tourism photo collections. RefinedFields +enhances rendered scenes with richer details and improves upon its base +representation on the task of novel view synthesis in the wild. Our project +page can be found at https://refinedfields.github.io. + +
+
+ comment: Corrected Table 2, where some comparisons were done among models + trained at different resolutions +
+
+
+
+
+ + ♻ ☆ On the Pitfalls of Batch Normalization for End-to-End Video Learning: A + Study on Surgical Workflow Analysis + + +
+ Batch Normalization's (BN) unique property of depending on other samples in a batch is known to cause problems in several tasks, including sequence modeling. Yet, BN-related issues are hardly studied for long video understanding, despite the ubiquitous use of BN in CNNs (Convolutional Neural Networks) for feature extraction. Especially in surgical workflow analysis, where the lack of pretrained feature extractors has led to complex, multi-stage training pipelines, limited awareness of BN issues may have hidden the benefits of training CNNs and temporal models end to end. In this paper, we analyze pitfalls of BN in video learning, including issues specific to online tasks such as a 'cheating' effect in anticipation. We observe that BN's properties create major obstacles for end-to-end learning. However, using BN-free backbones, even simple CNN-LSTMs beat the state of the art on three surgical workflow benchmarks by utilizing adequate end-to-end training strategies which maximize temporal context. We conclude that awareness of BN's pitfalls is crucial for effective end-to-end learning in surgical tasks. By reproducing results on natural-video datasets, we hope our insights will benefit other areas of video learning as well. Code is available at: https://gitlab.com/nct_tso_public/pitfalls_bn
+
+
+ comment: Accepted at Medical Image Analysis (MedIA). Publication link: + https://www.sciencedirect.com/science/article/pii/S1361841524000513 +
+
+
+
+
+ + ♻ ☆ Leveraging Automatic Personalised Nutrition: Food Image Recognition + Benchmark and Dataset based on Nutrition Taxonomy + + +
+ Maintaining a healthy lifestyle has become increasingly challenging in +today's sedentary society marked by poor eating habits. To address this issue, +both national and international organisations have made numerous efforts to +promote healthier diets and increased physical activity. However, implementing +these recommendations in daily life can be difficult, as they are often generic +and not tailored to individuals. This study presents the AI4Food-NutritionDB +database, the first nutrition database that incorporates food images and a +nutrition taxonomy based on recommendations by national and international +health authorities. The database offers a multi-level categorisation, +comprising 6 nutritional levels, 19 main categories (e.g., "Meat"), 73 +subcategories (e.g., "White Meat"), and 893 specific food products (e.g., +"Chicken"). The AI4Food-NutritionDB opens the doors to new food computing +approaches in terms of food intake frequency, quality, and categorisation. +Also, we present a standardised experimental protocol and benchmark including +three tasks based on the nutrition taxonomy (i.e., category, subcategory, and +final product recognition). These resources are available to the research +community, including our deep learning models trained on AI4Food-NutritionDB, +which can serve as pre-trained models, achieving accurate recognition results +for challenging food image databases. + +
+
+ comment: 12 pages, 4 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Multi-modal vision-language model for generalizable annotation-free + pathological lesions localization and clinical diagnosis + + +
+ Defining pathologies automatically from medical images aids the understanding of the emergence and progression of diseases, and such an ability is crucial in clinical diagnostics. However, existing deep learning models heavily rely on expert annotations and lack generalization capabilities in open clinical environments. In this study, we present a generalizable vision-language pre-training model for Annotation-Free pathological lesions Localization (AFLoc). The core strength of AFLoc lies in its extensive multi-level semantic structure-based contrastive learning, which comprehensively aligns multi-granularity medical concepts from reports with abundant image features, to adapt to the diverse expressions of pathologies and unseen pathologies without the reliance on image annotations from experts. We demonstrate the proof of concept on CXR images, with extensive experimental validation across 4 distinct external datasets, encompassing 11 types of chest pathologies. The results demonstrate that AFLoc surpasses state-of-the-art methods in pathological lesion localization and disease classification, and even outperforms the human benchmark in locating 5 different pathologies. Additionally, we further verify its generalization ability by applying it to retinal fundus images. Our approach showcases AFLoc's versatility and underscores its suitability for clinical diagnosis in complex clinical environments.
+
+
+
+
+
+ + ♻ ☆ Conditional Diffusion Models for Semantic 3D Brain MRI Synthesis + + +
+ Artificial intelligence (AI) in healthcare, especially in medical imaging, +faces challenges due to data scarcity and privacy concerns. Addressing these, +we introduce Med-DDPM, a diffusion model designed for 3D semantic brain MRI +synthesis. This model effectively tackles data scarcity and privacy issues by +integrating semantic conditioning. This involves the channel-wise concatenation +of a conditioning image to the model input, enabling control in image +generation. Med-DDPM demonstrates superior stability and performance compared +to existing 3D brain imaging synthesis methods. It generates diverse, +anatomically coherent images with high visual fidelity. In terms of dice score +accuracy in the tumor segmentation task, Med-DDPM achieves 0.6207, close to the +0.6531 accuracy of real images, and outperforms baseline models. Combined with +real images, it further increases segmentation accuracy to 0.6675, showing the +potential of our proposed method for data augmentation. This model represents +the first use of a diffusion model in 3D semantic brain MRI synthesis, +producing high-quality images. Its semantic conditioning feature also shows +potential for image anonymization in biomedical imaging, addressing data and +privacy issues. We provide the code and model weights for Med-DDPM on our +GitHub repository (https://github.com/mobaidoctor/med-ddpm/) to support +reproducibility. + +
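+ The semantic conditioning described above reduces, at the network-input level, to a channel-wise concatenation. The hedged PyTorch sketch below shows only that step with a toy 3D denoiser; the real model also conditions on the diffusion timestep and uses a U-Net backbone, both of which are omitted here, and the layer sizes are illustrative.

import torch

class ConditionedDenoiser(torch.nn.Module):
    """Toy 3D denoiser whose input is the noisy volume concatenated
    channel-wise with a semantic conditioning volume (e.g. a label map)."""
    def __init__(self, image_ch=1, cond_ch=1, hidden=16):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Conv3d(image_ch + cond_ch, hidden, 3, padding=1),
            torch.nn.SiLU(),
            torch.nn.Conv3d(hidden, image_ch, 3, padding=1),
        )

    def forward(self, x_noisy, cond):
        return self.net(torch.cat([x_noisy, cond], dim=1))  # channel-wise concat

model = ConditionedDenoiser()
noise_pred = model(torch.randn(1, 1, 32, 32, 32), torch.randn(1, 1, 32, 32, 32))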
+
+ comment: This document is a preprint and has been accepted for publication in + the IEEE Journal of Biomedical and Health Informatics. The final, published + version can be accessed using the following DOI: 10.1109/JBHI.2024.3385504. + Copyright for this article has been transferred to IEEE +
+
+
+
+
+ + ♻ ☆ Modeling Hierarchical Structural Distance for Unsupervised Domain + Adaptation + + +
+ Unsupervised domain adaptation (UDA) aims to estimate a transferable model for unlabeled target domains by exploiting labeled source data. Optimal Transport (OT) based methods have recently been proven to be a promising solution for UDA with a solid theoretical foundation and competitive performance. However, most of these methods solely focus on domain-level OT alignment by leveraging the geometry of domains for domain-invariant features based on the global embeddings of images. Yet, global representations of images may destroy image structure, leading to the loss of local details that offer category-discriminative information. This study proposes an end-to-end Deep Hierarchical Optimal Transport method (DeepHOT), which aims to learn both domain-invariant and category-discriminative representations by mining hierarchical structural relations among domains. The main idea is to incorporate a domain-level OT and image-level OT into a unified OT framework, hierarchical optimal transport, to model the underlying geometry in both domain space and image space. In the DeepHOT framework, an image-level OT serves as the ground distance metric for the domain-level OT, leading to the hierarchical structural distance. Compared with the ground distance of the conventional domain-level OT, the image-level OT captures structural associations among local regions of images that are beneficial to classification. In this way, DeepHOT, a unified OT framework, not only aligns domains by domain-level OT, but also enhances the discriminative power through image-level OT. Moreover, to overcome the limitation of high computational complexity, we propose a robust and efficient implementation of DeepHOT by approximating the original OT with the sliced Wasserstein distance in the image-level OT and computing the domain-level OT in a mini-batch, unbalanced form.
+
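+ The sliced Wasserstein approximation mentioned at the end is standard and easy to sketch: project both sample sets onto random directions and compare the sorted 1D projections. The snippet below (assuming equally sized sample sets) is a generic implementation, not the authors' code.

import numpy as np

def sliced_wasserstein(X, Y, n_proj=64, seed=0):
    """Monte-Carlo sliced Wasserstein-2 distance between two sets of
    d-dimensional samples with the same number of rows."""
    rng = np.random.default_rng(seed)
    d = X.shape[1]
    total = 0.0
    for _ in range(n_proj):
        theta = rng.normal(size=d)
        theta /= np.linalg.norm(theta)
        px, py = np.sort(X @ theta), np.sort(Y @ theta)
        total += np.mean((px - py) ** 2)
    return np.sqrt(total / n_proj)

dist = sliced_wasserstein(np.random.randn(128, 64), np.random.randn(128, 64) + 1.0)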
+
+ comment: accepted by TCVST, code: https://github.com/Innse/DeepHOT +
+
+
+
+
+ + ♻ ☆ Monocular 3D lane detection for Autonomous Driving: Recent Achievements, + Challenges, and Outlooks + + +
+ 3D lane detection is essential in autonomous driving as it extracts +structural and traffic information from the road in three-dimensional space, +aiding self-driving cars in logical, safe, and comfortable path planning and +motion control. Given the cost of sensors and the advantages of visual data in +color information, 3D lane detection based on monocular vision is an important +research direction in the realm of autonomous driving, increasingly gaining +attention in both industry and academia. Regrettably, recent advancements in +visual perception seem inadequate for the development of fully reliable 3D lane +detection algorithms, which also hampers the progress of vision-based fully +autonomous vehicles. We believe that there is still considerable room for +improvement in 3D lane detection algorithms for autonomous vehicles using +visual sensors, and significant enhancements are needed. This review looks back +and analyzes the current state of achievements in the field of 3D lane +detection research. It covers all current monocular-based 3D lane detection +processes, discusses the performance of these cutting-edge algorithms, analyzes +the time complexity of various algorithms, and highlights the main achievements +and limitations of ongoing research efforts. The survey also includes a +comprehensive discussion of available 3D lane detection datasets and the +challenges that researchers face but have not yet resolved. Finally, our work +outlines future research directions and invites researchers and practitioners +to join this exciting field. + +
+
+
+
+
+ + ♻ ☆ Factorized Motion Fields for Fast Sparse Input Dynamic View Synthesis SIGGRAPH 2024 + + +
+ Designing a 3D representation of a dynamic scene for fast optimization and +rendering is a challenging task. While recent explicit representations enable +fast learning and rendering of dynamic radiance fields, they require a dense +set of input viewpoints. In this work, we focus on learning a fast +representation for dynamic radiance fields with sparse input viewpoints. +However, the optimization with sparse input is under-constrained and +necessitates the use of motion priors to constrain the learning. Existing fast +dynamic scene models do not explicitly model the motion, making them difficult +to be constrained with motion priors. We design an explicit motion model as a +factorized 4D representation that is fast and can exploit the spatio-temporal +correlation of the motion field. We then introduce reliable flow priors +including a combination of sparse flow priors across cameras and dense flow +priors within cameras to regularize our motion model. Our model is fast, +compact and achieves very good performance on popular multi-view dynamic scene +datasets with sparse input viewpoints. The source code for our model can be +found on our project page: +https://nagabhushansn95.github.io/publications/2024/RF-DeRF.html. + +
+
+ comment: Accepted at SIGGRAPH 2024 +
+
+
+
+
+ + ♻ ☆ Overcoming Generic Knowledge Loss with Selective Parameter Update + + +
+ Foundation models encompass an extensive knowledge base and offer remarkable transferability. However, this knowledge becomes outdated or insufficient over time. The challenge lies in continuously updating foundation models to accommodate novel information while retaining their original capabilities. Leveraging the fact that foundation models have initial knowledge on various tasks and domains, we propose a novel approach that, instead of updating all parameters equally, localizes the updates to a sparse set of parameters relevant to the task being learned. We strike a balance between efficiency and new task performance, while maintaining the transferability and generalizability of foundation models. We extensively evaluate our method on foundational vision-language models with a diverse spectrum of continual learning tasks. Our method improves accuracy on newly learned tasks by up to 7% while preserving the pretraining knowledge, with a negligible 0.9% decrease in accuracy on a representative control set.
+
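+ One generic way to realize localized, sparse updates is to mask gradients outside a small top-magnitude subset before each optimizer step, as in the hedged sketch below. The magnitude criterion and the keep_frac value are assumptions for illustration, not the paper's task-relevance selection.

import torch

def sparse_update_step(model, optimizer, loss, keep_frac=0.1):
    """Backpropagate, then zero gradients everywhere except the keep_frac
    fraction of parameters with the largest gradient magnitude, so only a
    small subset of the model is updated."""
    optimizer.zero_grad()
    loss.backward()
    grads = torch.cat([p.grad.abs().flatten()
                       for p in model.parameters() if p.grad is not None])
    k = max(1, int(keep_frac * grads.numel()))
    threshold = torch.topk(grads, k).values.min()
    for p in model.parameters():
        if p.grad is not None:
            p.grad.mul_((p.grad.abs() >= threshold).float())
    optimizer.step()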
+
+
+
+
+ + ♻ ☆ Koala: Key frame-conditioned long video-LLM CVPR 2024 + + +
+ Long video question answering is a challenging task that involves recognizing +short-term activities and reasoning about their fine-grained relationships. +State-of-the-art video Large Language Models (vLLMs) hold promise as a viable +solution due to their demonstrated emergent capabilities on new tasks. However, +despite being trained on millions of short seconds-long videos, vLLMs are +unable to understand minutes-long videos and accurately answer questions about +them. To address this limitation, we propose a lightweight and self-supervised +approach, Key frame-conditioned long video-LLM (Koala), that introduces +learnable spatiotemporal queries to adapt pretrained vLLMs for generalizing to +longer videos. Our approach introduces two new tokenizers that condition on +visual tokens computed from sparse video key frames for understanding short and +long video moments. We train our proposed approach on HowTo100M and demonstrate +its effectiveness on zero-shot long video understanding benchmarks, where it +outperforms state-of-the-art large models by 3 - 6% in absolute accuracy across +all tasks. Surprisingly, we also empirically show that our approach not only +helps a pretrained vLLM to understand long videos but also improves its +accuracy on short-term action recognition. + +
+
+ comment: Accepted at CVPR 2024 as a poster highlight +
+
+
+
+
+ + ♻ ☆ Efficient Backdoor Attacks for Deep Neural Networks in Real-world + Scenarios ICLR 2024 + + +
+ Recent deep neural networks (DNNs) have come to rely on vast amounts of training data, providing an opportunity for malicious attackers to exploit and contaminate the data to carry out backdoor attacks. However, existing backdoor attack methods make unrealistic assumptions, assuming that all training data comes from a single source and that attackers have full access to the training data. In this paper, we introduce a more realistic attack scenario where victims collect data from multiple sources, and attackers cannot access the complete training data. We refer to this scenario as data-constrained backdoor attacks. In such cases, previous attack methods suffer from severe efficiency degradation due to the entanglement between benign and poisoning features during the backdoor injection process. To tackle this problem, we introduce three CLIP-based technologies from two distinct streams, Clean Feature Suppression and Poisoning Feature Augmentation, which together offer an effective solution for data-constrained backdoor attacks. The results demonstrate remarkable improvements, with some settings achieving over 100% improvement compared to existing attacks in data-constrained scenarios. Code is available at https://github.com/sunh1113/Efficient-backdoor-attacks-for-deep-neural-networks-in-real-world-scenarios
+
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Tendency-driven Mutual Exclusivity for Weakly Supervised Incremental + Semantic Segmentation + + +
+ Weakly Incremental Learning for Semantic Segmentation (WILSS) leverages a +pre-trained segmentation model to segment new classes using cost-effective and +readily available image-level labels. A prevailing way to solve WILSS is the +generation of seed areas for each new class, serving as a form of pixel-level +supervision. However, a scenario usually arises where a pixel is concurrently +predicted as an old class by the pre-trained segmentation model and a new class +by the seed areas. Such a scenario becomes particularly problematic in WILSS, +as the lack of pixel-level annotations on new classes makes it intractable to +ascertain whether the pixel pertains to the new class or not. To surmount this +issue, we propose an innovative, tendency-driven relationship of mutual +exclusivity, meticulously tailored to govern the behavior of the seed areas and +the predictions generated by the pre-trained segmentation model. This +relationship stipulates that predictions for the new and old classes must not +conflict whilst prioritizing the preservation of predictions for the old +classes, which not only addresses the conflicting prediction issue but also +effectively mitigates the inherent challenge of incremental learning - +catastrophic forgetting. Furthermore, under the auspices of this +tendency-driven mutual exclusivity relationship, we generate pseudo masks for +the new classes, allowing for concurrent execution with model parameter +updating via the resolution of a bi-level optimization problem. Extensive +experiments substantiate the effectiveness of our framework, resulting in the +establishment of new benchmarks and paving the way for further research in this +field. + +
+
+
+
+
+ + ♻ ☆ EATFormer: Improving Vision Transformer Inspired by Evolutionary + Algorithm + + +
+ Motivated by biological evolution, this paper explains the rationality of Vision Transformer by analogy with the proven practical Evolutionary Algorithm (EA) and shows that both have a consistent mathematical formulation. Then, inspired by effective EA variants, we propose a novel pyramid EATFormer backbone that only contains the proposed EA-based Transformer (EAT) block, which consists of three residual parts, i.e., Multi-Scale Region Aggregation (MSRA), Global and Local Interaction (GLI), and Feed-Forward Network (FFN) modules, to model multi-scale, interactive, and individual information separately. Moreover, we design a Task-Related Head (TRH) docked with the transformer backbone to complete final information fusion more flexibly, and improve a Modulated Deformable MSA (MD-MSA) to dynamically model irregular locations. Extensive qualitative and quantitative experiments on image classification, downstream tasks, and explanatory experiments demonstrate the effectiveness and superiority of our approach over State-Of-The-Art (SOTA) methods. E.g., our Mobile (1.8M), Tiny (6.1M), Small (24.3M), and Base (49.0M) models achieve 69.4, 78.4, 83.1, and 83.9 Top-1 accuracy when trained only on ImageNet-1K with a naive training recipe; Mask R-CNN with EATFormer-Tiny/Small/Base backbones obtains 45.4/47.4/49.0 box AP and 41.4/42.9/44.2 mask AP on COCO detection, surpassing contemporary MPViT-T, Swin-T, and Swin-S by 0.6/1.4/0.5 box AP and 0.4/1.3/0.9 mask AP respectively with fewer FLOPs; our EATFormer-Small/Base achieve 47.3/49.3 mIoU on ADE20K with UperNet, exceeding Swin-T/S by 2.8/1.7. Code is available at https://github.com/zhangzjn/EATFormer.
+
+
+
+
+
+ + ♻ ☆ Heterogeneous Federated Learning with Splited Language Model + + +
+ Federated Split Learning (FSL) is a promising distributed learning paradigm +in practice, which gathers the strengths of both Federated Learning (FL) and +Split Learning (SL) paradigms, to ensure model privacy while diminishing the +resource overhead of each client, especially on large transformer models in a +resource-constrained environment, e.g., Internet of Things (IoT). However, +almost all works merely investigate the performance with simple neural network +models in FSL. Despite the minor efforts focusing on incorporating Vision +Transformers (ViT) as model architectures, they train ViT from scratch, thereby +leading to enormous training overhead in each device with limited resources. +Therefore, in this paper, we harness Pre-trained Image Transformers (PITs) as +the initial model, coined FedV, to accelerate the training process and improve +model robustness. Furthermore, we propose FedVZ to hinder the gradient +inversion attack, especially having the capability compatible with black-box +scenarios, where the gradient information is unavailable. Concretely, FedVZ +approximates the server gradient by utilizing a zeroth-order (ZO) optimization, +which replaces the backward propagation with just one forward process. +Empirically, we are the first to provide a systematic evaluation of FSL methods +with PITs in real-world datasets, different partial device participations, and +heterogeneous data splits. Our experiments verify the effectiveness of our +algorithms. + +
+
+
+
+
+ + ♻ ☆ Joint Coordinate Regression and Association For Multi-Person Pose + Estimation, A Pure Neural Network Approach + + +
+ We introduce a novel one-stage end-to-end multi-person 2D pose estimation algorithm, known as Joint Coordinate Regression and Association (JCRA), that produces human pose joints and associations without requiring any post-processing. The proposed algorithm is fast, accurate, effective, and simple. The one-stage end-to-end network architecture significantly improves the inference speed of JCRA. Meanwhile, we devise a symmetric network structure for both the encoder and decoder, which ensures high accuracy in identifying keypoints. It follows an architecture that directly outputs part positions via a transformer network, resulting in a significant improvement in performance. Extensive experiments on the MS COCO and CrowdPose benchmarks demonstrate that JCRA outperforms state-of-the-art approaches in both accuracy and efficiency. Moreover, JCRA achieves 69.2 mAP and 78% faster inference than previous state-of-the-art bottom-up algorithms. The code for this algorithm will be publicly available.
+
+
+ comment: This paper has been accepted by MMasia 2023 and is an oral + presentation +
+
+
+
+
+ + ♻ ☆ Exploring Radar Data Representations in Autonomous Driving: A + Comprehensive Review + + +
+ With the rapid advancements of sensor technology and deep learning, +autonomous driving systems are providing safe and efficient access to +intelligent vehicles as well as intelligent transportation. Among these +equipped sensors, the radar sensor plays a crucial role in providing robust +perception information in diverse environmental conditions. This review focuses +on exploring different radar data representations utilized in autonomous +driving systems. Firstly, we introduce the capabilities and limitations of the +radar sensor by examining the working principles of radar perception and signal +processing of radar measurements. Then, we delve into the generation process of +five radar representations, including the ADC signal, radar tensor, point +cloud, grid map, and micro-Doppler signature. For each radar representation, we +examine the related datasets, methods, advantages and limitations. Furthermore, +we discuss the challenges faced in these data representations and propose +potential research directions. Above all, this comprehensive review offers an +in-depth insight into how these representations enhance autonomous system +capabilities, providing guidance for radar perception researchers. To +facilitate retrieval and comparison of different data representations, datasets +and methods, we provide an interactive website at +https://radar-camera-fusion.github.io/radar. + +
+
+ comment: 24 pages, 10 figures, 5 tables. arXiv admin note: text overlap with + arXiv:2304.10410 +
+
+
+
+
+ + ♻ ☆ MARIS: Referring Image Segmentation via Mutual-Aware Attention Features + + +
+ Referring image segmentation (RIS) aims to segment a particular region based +on a language expression prompt. Existing methods incorporate linguistic +features into visual features and obtain multi-modal features for mask +decoding. However, these methods may segment the visually salient entity +instead of the correct referring region, as the multi-modal features are +dominated by the abundant visual context. In this paper, we propose MARIS, a +referring image segmentation method that leverages the Segment Anything Model +(SAM) and introduces a mutual-aware attention mechanism to enhance the +cross-modal fusion via two parallel branches. Specifically, our mutual-aware +attention mechanism consists of Vision-Guided Attention and Language-Guided +Attention, which bidirectionally model the relationship between visual and +linguistic features. Correspondingly, we design a Mask Decoder to enable +explicit linguistic guidance for more consistent segmentation with the language +expression. To this end, a multi-modal query token is proposed to integrate +linguistic information and interact with visual information simultaneously. +Extensive experiments on three benchmark datasets show that our method +outperforms the state-of-the-art RIS methods. Our code will be publicly +available. + +
+
+
+
+
+ + ♻ ☆ Task-conditioned adaptation of visual features in multi-task policy + learning + + +
+ Successfully addressing a wide variety of tasks is a core ability of +autonomous agents, requiring flexibly adapting the underlying decision-making +strategies and, as we argue in this work, also adapting the perception modules. +An analogical argument would be the human visual system, which uses top-down +signals to focus attention determined by the current task. Similarly, we adapt +pre-trained large vision models conditioned on specific downstream tasks in the +context of multi-task policy learning. We introduce task-conditioned adapters +that do not require finetuning any pre-trained weights, combined with a single +policy trained with behavior cloning and capable of addressing multiple tasks. +We condition the visual adapters on task embeddings, which can be selected at +inference if the task is known, or alternatively inferred from a set of example +demonstrations. To this end, we propose a new optimization-based estimator. We +evaluate the method on a wide variety of tasks from the CortexBench benchmark +and show that, compared to existing work, it can be addressed with a single +policy. In particular, we demonstrate that adapting visual features is a key +design choice and that the method generalizes to unseen tasks given a few +demonstrations. + +
+
+
+
+
+ + ♻ ☆ MovePose: A High-performance Human Pose Estimation Algorithm on Mobile + and Edge Devices + + +
+ We present MovePose, an optimized lightweight convolutional neural network designed specifically for real-time body pose estimation on CPU-based mobile devices. Current solutions do not provide satisfactory accuracy and speed for human posture estimation, and MovePose addresses this gap. It aims to maintain real-time performance while improving the accuracy of human posture estimation for mobile devices. Our MovePose algorithm attains a Mean Average Precision (mAP) score of 68.0 on the COCO validation dataset. It runs at 69+ frames per second (fps) on an Intel i9-10920x CPU and at 452+ fps on an NVIDIA RTX3090 GPU. On an Android phone equipped with a Snapdragon 8 + 4G processor, the fps reached above 11. To enhance accuracy, we incorporated three techniques: deconvolution, large kernel convolution, and coordinate classification methods. Compared to basic upsampling, deconvolution is trainable, improves model capacity, and enhances the receptive field. Large kernel convolution strengthens these properties at a decreased computational cost. In summary, MovePose provides high accuracy and real-time performance, making it a potential tool for a variety of applications, including those focused on mobile-side human posture estimation. The code and models for this algorithm will be made publicly accessible.
+
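+ The contrast the abstract draws between fixed upsampling and trainable deconvolution can be seen in a minimal head like the one below; the layer widths and the 17-keypoint output are illustrative assumptions, not the published architecture.

import torch

# Trainable upsampling head: unlike fixed bilinear upsampling, the transposed
# convolutions learn their interpolation kernels.
deconv_head = torch.nn.Sequential(
    torch.nn.ConvTranspose2d(256, 128, kernel_size=4, stride=2, padding=1),
    torch.nn.ReLU(inplace=True),
    torch.nn.ConvTranspose2d(128, 128, kernel_size=4, stride=2, padding=1),
    torch.nn.ReLU(inplace=True),
    torch.nn.Conv2d(128, 17, kernel_size=1),  # one heatmap per COCO keypoint
)

heatmaps = deconv_head(torch.randn(1, 256, 16, 16))  # -> shape (1, 17, 64, 64)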
+
+
+
+
+ + ♻ ☆ List-Mode PET Image Reconstruction Using Dykstra-Like Splitting + + +
+ Convergence of the block iterative method in image reconstruction for positron emission tomography (PET) requires careful control of relaxation parameters, which is a challenging task. The automatic determination of relaxation parameters for list-mode reconstructions also remains challenging. Therefore, a different approach would be desirable. In this study, we propose a list-mode maximum likelihood Dykstra-like splitting PET reconstruction (LM-MLDS). LM-MLDS makes the list-mode block iterative method converge by adding the distance from an initial image as a penalty term to the objective function. LM-MLDS takes a two-step approach because its performance depends on the quality of the initial image. The first step uses a uniform image as the initial image, and then the second step uses a reconstructed image after one main iteration as the initial image. In a simulation study, LM-MLDS provided a better tradeoff curve between noise and contrast than the other methods. In a clinical study, LM-MLDS removed the false hotspots at the edge of the axial field of view and improved the image quality of slices covering the top of the head to the cerebellum. List-mode proximal splitting reconstruction is useful not only for optimizing nondifferentiable functions but also for making block iterative methods converge without controlling relaxation parameters.
+
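+ In symbols (our notation, not the paper's), the penalized objective described above is roughly $\hat{x} = \arg\max_{x \ge 0}\, L_{\mathrm{LM}}(x) - \tfrac{\beta}{2}\lVert x - x_{\mathrm{init}}\rVert^2$, where $L_{\mathrm{LM}}$ is the list-mode Poisson log-likelihood, $x_{\mathrm{init}}$ is the initial image (uniform in the first step, a one-iteration reconstruction in the second), and $\beta$ is the penalty weight.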
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ A Multimodal Fusion Network For Student Emotion Recognition Based on + Transformer and Tensor Product + + +
+ This paper introduces a new multi-modal model based on the Transformer +architecture and tensor product fusion strategy, combining BERT's text vectors +and ViT's image vectors to classify students' psychological conditions, with an +accuracy of 93.65%. The purpose of the study is to accurately analyze the +mental health status of students from various data sources. This paper +discusses modal fusion methods, including early, late and intermediate fusion, +to overcome the challenges of integrating multi-modal information. Ablation +studies compare the performance of different models and fusion techniques, +showing that the proposed model outperforms existing methods such as CLIP and +ViLBERT in terms of accuracy and inference speed. Conclusions indicate that +while this model has significant advantages in emotion recognition, its +potential to incorporate other data modalities provides areas for future +research. + +
+
+
+
+
+ + ♻ ☆ RanLayNet: A Dataset for Document Layout Detection used for Domain + Adaptation and Generalization + + +
+ Large ground-truth datasets and recent advances in deep learning techniques have been useful for layout detection. However, because of the restricted layout diversity of these datasets, training on them requires a sizable number of annotated instances, which is both expensive and time-consuming. As a result, differences between the source and target domains may significantly impact how well these models function. To solve this problem, domain adaptation approaches have been developed that use a small quantity of labeled data to adjust the model to the target domain. In this research, we introduce a synthetic document dataset called RanLayNet, enriched with automatically assigned labels denoting spatial positions, ranges, and types of layout elements. The primary aim of this endeavor is to develop a versatile dataset capable of training models with robustness and adaptability to diverse document formats. Through empirical experimentation, we demonstrate that a deep layout identification model trained on our dataset exhibits enhanced performance compared to a model trained solely on actual documents. Moreover, we conduct a comparative analysis by fine-tuning inference models using both the PubLayNet and IIIT-AR-13K datasets on the DocLayNet dataset. Our findings emphasize that models trained with our dataset perform strongly on such tasks, achieving mAP95 scores of 0.398 and 0.588 for the TABLE class in the scientific document domain.
+
+
+ comment: 8 pages, 6 figures, MMAsia 2023 Proceedings of the 5th ACM + International Conference on Multimedia in Asia +
+
+
+
+
+ + ♻ ☆ TC-OCR: TableCraft OCR for Efficient Detection & Recognition of Table + Structure & Content + + +
+ The automatic recognition of tabular data in document images presents a +significant challenge due to the diverse range of table styles and complex +structures. Tables offer valuable content representation, enhancing the +predictive capabilities of various systems such as search engines and Knowledge +Graphs. Addressing the two main problems, namely table detection (TD) and table +structure recognition (TSR), has traditionally been approached independently. +In this research, we propose an end-to-end pipeline that integrates deep +learning models, including DETR, CascadeTabNet, and PP OCR v2, to achieve +comprehensive image-based table recognition. This integrated approach +effectively handles diverse table styles, complex structures, and image +distortions, resulting in improved accuracy and efficiency compared to existing +methods like Table Transformers. Our system achieves simultaneous table +detection (TD), table structure recognition (TSR), and table content +recognition (TCR), preserving table structures and accurately extracting +tabular data from document images. The integration of multiple models addresses +the intricacies of table recognition, making our approach a promising solution +for image-based table understanding, data extraction, and information retrieval +applications. Our proposed approach achieves an IOU of 0.96 and an OCR Accuracy +of 78%, showcasing a remarkable improvement of approximately 25% in the OCR +Accuracy compared to the previous Table Transformer approach. + +
+
+ comment: 8 pages, 2 figures, Workshop of 1st MMIR Deep Multimodal Learning for + Information Retrieval +
+
+
+
+
+ + ♻ ☆ Research on Detection of Floating Objects in River and Lake Based on AI + Intelligent Image Recognition + + +
+ With the rapid advancement of artificial intelligence technology, AI-enabled image recognition has emerged as a potent tool for addressing challenges in traditional environmental monitoring. This study focuses on the detection of floating objects in river and lake environments, exploring an innovative approach based on deep learning. By intricately analyzing the technical pathways for detecting static and dynamic features and considering the characteristics of river and lake debris, a comprehensive image acquisition and processing workflow has been developed. The study highlights the application and performance comparison of three mainstream deep learning models (SSD, Faster-RCNN, and YOLOv5) in debris identification. Additionally, a detection system for floating objects has been designed and implemented, encompassing both hardware platform construction and software framework development. Through rigorous experimental validation, the proposed system has demonstrated its ability to significantly enhance the accuracy and efficiency of debris detection, thus offering a new technological avenue for water quality monitoring in rivers and lakes.
+
+
+
+
+
+ + ♻ ☆ Unified Human-Scene Interaction via Prompted Chain-of-Contacts + + +
+ Human-Scene Interaction (HSI) is a vital component of fields like embodied AI +and virtual reality. Despite advancements in motion quality and physical +plausibility, two pivotal factors, versatile interaction control and the +development of a user-friendly interface, require further exploration before +the practical application of HSI. This paper presents a unified HSI framework, +UniHSI, which supports unified control of diverse interactions through language +commands. This framework is built upon the definition of interaction as Chain +of Contacts (CoC): steps of human joint-object part pairs, which is inspired by +the strong correlation between interaction types and human-object contact +regions. Based on the definition, UniHSI constitutes a Large Language Model +(LLM) Planner to translate language prompts into task plans in the form of CoC, +and a Unified Controller that turns CoC into uniform task execution. To +facilitate training and evaluation, we collect a new dataset named ScenePlan +that encompasses thousands of task plans generated by LLMs based on diverse +scenarios. Comprehensive experiments demonstrate the effectiveness of our +framework in versatile task execution and generalizability to real scanned +scenes. The project page is at https://github.com/OpenRobotLab/UniHSI . + +
+
+ comment: A unified Human-Scene Interaction framework that supports versatile + interactions through language commands.Project URL: + https://xizaoqu.github.io/unihsi/ . Code: + https://github.com/OpenRobotLab/UniHSI +
+
+
+
+
+ + ♻ ☆ Lite-Mind: Towards Efficient and Robust Brain Representation Network + + +
+ The limited data availability and the low signal-to-noise ratio of fMRI signals lead to the challenging task of fMRI-to-image retrieval. State-of-the-art MindEye remarkably improves fMRI-to-image retrieval performance by leveraging a large model, i.e., a 996M MLP Backbone per subject, to align fMRI embeddings to the final hidden layer of CLIP's Vision Transformer (ViT). However, significant individual variations exist among subjects, even under identical experimental setups, mandating the training of large subject-specific models. The substantial parameter counts pose significant challenges in deploying fMRI decoding on practical devices. To this end, we propose Lite-Mind, a lightweight, efficient, and robust brain representation learning paradigm based on the Discrete Fourier Transform (DFT), which efficiently aligns fMRI voxels to fine-grained information of CLIP. We elaborately design a DFT backbone with Spectrum Compression and Frequency Projector modules to learn informative and robust voxel embeddings. Our experiments demonstrate that Lite-Mind achieves an impressive 94.6% fMRI-to-image retrieval accuracy on the NSD dataset for Subject 1, with 98.7% fewer parameters than MindEye. Lite-Mind also transfers well to smaller fMRI datasets and establishes a new state-of-the-art for zero-shot classification on the GOD dataset.
+
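+ As a hedged illustration of the spectrum-compression idea, the snippet below keeps only the lowest-frequency rFFT coefficients of a voxel vector and reconstructs it. The actual Lite-Mind module is learned, so this fixed low-pass truncation is only a conceptual stand-in.

import numpy as np

def spectrum_compress(voxels, keep=128):
    """Keep only the lowest `keep` rFFT coefficients of a 1D voxel vector
    and reconstruct, discarding high-frequency (often noisy) content."""
    spec = np.fft.rfft(voxels)
    spec[keep:] = 0.0                       # crude low-pass "compression"
    return np.fft.irfft(spec, n=len(voxels))

compressed = spectrum_compress(np.random.randn(4096), keep=128)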
+
+ comment: 16 pages +
+
+
+
+
+ + ♻ ☆ Watermark-embedded Adversarial Examples for Copyright Protection against + Diffusion Models + + +
+ Diffusion Models (DMs) have shown remarkable capabilities in various +image-generation tasks. However, there are growing concerns that DMs could be +used to imitate unauthorized creations and thus raise copyright issues. To +address this issue, we propose a novel framework that embeds personal +watermarks in the generation of adversarial examples. Such examples can force +DMs to generate images with visible watermarks and prevent DMs from imitating +unauthorized images. We construct a generator based on conditional adversarial +networks and design three losses (adversarial loss, GAN loss, and perturbation +loss) to generate adversarial examples that have subtle perturbation but can +effectively attack DMs to prevent copyright violations. Training a generator +for a personal watermark by our method only requires 5-10 samples within 2-3 +minutes, and once the generator is trained, it can generate adversarial +examples with that watermark significantly fast (0.2s per image). We conduct +extensive experiments in various conditional image-generation scenarios. +Compared to existing methods that generate images with chaotic textures, our +method adds visible watermarks on the generated images, which is a more +straightforward way to indicate copyright violations. We also observe that our +adversarial examples exhibit good transferability across unknown generative +models. Therefore, this work provides a simple yet powerful way to protect +copyright from DM-based imitation. + +
+
+ comment: updated references +
+
+
+
+
+ + ♻ ☆ EVI-SAM: Robust, Real-time, Tightly-coupled Event-Visual-Inertial State + Estimation and 3D Dense Mapping + + +
+ Event cameras are bio-inspired, motion-activated sensors that demonstrate substantial potential in handling challenging situations, such as motion blur and high dynamic range. In this paper, we propose EVI-SAM to tackle the problem of 6 DoF pose tracking and 3D reconstruction using a monocular event camera. A novel event-based hybrid tracking framework is designed to estimate the pose, leveraging the robustness of feature matching and the precision of direct alignment. Specifically, we develop an event-based 2D-2D alignment to construct the photometric constraint, and tightly integrate it with the event-based reprojection constraint. The mapping module recovers the dense and colorful depth of the scene through the image-guided event-based mapping method. Subsequently, the appearance, texture, and surface mesh of the 3D scene can be reconstructed by fusing the dense depth map from multiple viewpoints using truncated signed distance function (TSDF) fusion. To the best of our knowledge, this is the first non-learning work to realize event-based dense mapping. Numerical evaluations are performed on both publicly available and self-collected datasets, which qualitatively and quantitatively demonstrate the superior performance of our method. Our EVI-SAM effectively balances accuracy and robustness while maintaining computational efficiency, showcasing superior pose tracking and dense mapping performance in challenging scenarios. Video Demo: https://youtu.be/Nn40U4e5Si8.
+
+
+
+
+
+ + ♻ ☆ Region-Based Representations Revisited CVPR 2024 + + +
+ We investigate whether region-based representations are effective for +recognition. Regions were once a mainstay in recognition approaches, but pixel +and patch-based features are now used almost exclusively. We show that recent +class-agnostic segmenters like SAM can be effectively combined with strong +unsupervised representations like DINOv2 and used for a wide variety of tasks, +including semantic segmentation, object-based image retrieval, and multi-image +analysis. Once the masks and features are extracted, these representations, +even with linear decoders, enable competitive performance, making them well +suited to applications that require custom queries. The compactness of the +representation also makes it well-suited to video analysis and other problems +requiring inference across many images. + +
+
+ comment: CVPR 2024 Camera Ready +
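+
+ A minimal sketch of the mask-pooling idea described above, assuming the SAM
+ masks and DINOv2 patch features have already been computed (neither library is
+ called here); a linear decoder then classifies each pooled region. Shapes and
+ the 21-class head are illustrative.
+
+ import torch
+ import torch.nn as nn
+
+ def region_features(patch_feats: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
+     """Average patch features inside each class-agnostic mask.
+
+     patch_feats: (H, W, C) features from a self-supervised backbone (e.g. DINOv2).
+     masks:       (N, H, W) boolean region masks (e.g. from SAM).
+     returns:     (N, C) one pooled feature vector per region.
+     """
+     m = masks.float().flatten(1)                            # (N, H*W)
+     f = patch_feats.flatten(0, 1)                           # (H*W, C)
+     return (m @ f) / m.sum(dim=1, keepdim=True).clamp(min=1.0)
+
+ feats = torch.randn(32, 32, 768)                            # toy patch-feature map
+ masks = torch.rand(10, 32, 32) > 0.5                        # toy region masks
+ linear_decoder = nn.Linear(768, 21)                         # e.g. 21 semantic classes
+ logits = linear_decoder(region_features(feats, masks))      # (10, 21)
+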
+
+
+
+
+ + ♻ ☆ High-Degrees-of-Freedom Dynamic Neural Fields for Robot Self-Modeling + and Motion Planning ICRA + + +
+ A robot self-model is a task-agnostic representation of the robot's physical +morphology that can be used for motion planning tasks in the absence of a +classical geometric kinematic model. In particular, when the latter is hard to +engineer or the robot's kinematics change unexpectedly, human-free +self-modeling is a necessary feature of truly autonomous agents. In this work, +we leverage neural fields to allow a robot to self-model its kinematics as a +neural-implicit query model learned only from 2D images annotated with camera +poses and configurations. This enables significantly greater applicability than +existing approaches which have been dependent on depth images or geometry +knowledge. To this end, alongside a curricular data sampling strategy, we +propose a new encoder-based neural density field architecture for dynamic +object-centric scenes conditioned on high numbers of degrees of freedom (DOFs). +In a 7-DOF robot test setup, the learned self-model achieves a Chamfer-L2 +distance of 2% of the robot's workspace dimension. We demonstrate the +capabilities of this model on motion planning tasks as an exemplary downstream +application. + +
+
+ comment: International Conference on Robotics and Automation (ICRA) 2024; ICCV + 2023 Workshop on Neural Fields for Autonomous Driving and Robotics (oral) +
+
+
+
+
+ + ♻ ☆ Flatten Long-Range Loss Landscapes for Cross-Domain Few-Shot Learning + + +
+ Cross-domain few-shot learning (CDFSL) aims to acquire knowledge from limited +training data in the target domain by leveraging prior knowledge transferred +from source domains with abundant training samples. CDFSL faces challenges in +transferring knowledge across dissimilar domains and fine-tuning models with +limited training data. To address these challenges, we initially extend the +analysis of loss landscapes from the parameter space to the representation +space, which allows us to simultaneously interpret the transferring and +fine-tuning difficulties of CDFSL models. We observe that sharp minima in the +loss landscapes of the representation space result in representations that are +hard to transfer and fine-tune. Moreover, existing flatness-based methods have +limited generalization ability due to their short-range flatness. To enhance +the transferability and facilitate fine-tuning, we introduce a simple yet +effective approach to achieve long-range flattening of the minima in the loss +landscape. This approach considers representations that are differently +normalized as minima in the loss landscape and flattens the high-loss region in +the middle by randomly sampling interpolated representations. We implement this +method as a new normalization layer that replaces the original one in both CNNs +and ViTs. This layer is simple and lightweight, introducing only a minimal +number of additional parameters. Experimental results on 8 datasets demonstrate +that our approach outperforms state-of-the-art methods in terms of average +accuracy. Moreover, our method achieves performance improvements of up to 9\% +compared to the current best approaches on individual datasets. Our code will +be released. + +
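+
+ One way to read the normalization-layer idea above is as randomly sampling a
+ representation between two differently normalized versions of the same
+ activations during training. The PyTorch sketch below interpolates between
+ batch-style and instance-style statistics; it is an assumption about the
+ mechanism, not the authors' actual layer.
+
+ import torch
+ import torch.nn as nn
+
+ class InterpolatedNorm(nn.Module):
+     """Hypothetical sketch: mix two differently normalised versions of the features."""
+     def __init__(self, num_features: int, eps: float = 1e-5):
+         super().__init__()
+         self.eps = eps
+         self.gamma = nn.Parameter(torch.ones(1, num_features, 1, 1))
+         self.beta = nn.Parameter(torch.zeros(1, num_features, 1, 1))
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         bn = (x - x.mean(dim=(0, 2, 3), keepdim=True)) / (x.var(dim=(0, 2, 3), keepdim=True) + self.eps).sqrt()
+         inn = (x - x.mean(dim=(2, 3), keepdim=True)) / (x.var(dim=(2, 3), keepdim=True) + self.eps).sqrt()
+         if self.training:
+             t = torch.rand(x.size(0), 1, 1, 1, device=x.device)  # sample along the interpolation path
+             out = t * bn + (1 - t) * inn
+         else:
+             out = 0.5 * (bn + inn)
+         return self.gamma * out + self.beta
+
+ y = InterpolatedNorm(64)(torch.randn(8, 64, 14, 14))
+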
+
+
+
+
+ + ♻ ☆ Single-temporal Supervised Remote Change Detection for Domain + Generalization + + +
+ Change detection is widely applied in remote sensing image analysis. Existing
+methods require training models separately for each dataset, which leads to
+poor domain generalization. Moreover, these methods rely heavily on large
+amounts of high-quality pair-labelled data for training, which is expensive and
+impractical. In this paper, we propose a multimodal contrastive learning
+framework (ChangeCLIP) based on visual-language pre-training for change
+detection domain generalization. Additionally, we propose dynamic context
+optimization for prompt learning. Meanwhile, to address the data dependency
+issue of existing methods, we introduce a single-temporal and controllable
+AI-generated training strategy (SAIN). This allows us to train the model using
+a large number of single-temporal images without image pairs in the real
+world, achieving excellent generalization. Extensive experiments on a series of
+real change detection datasets validate the superiority and strong
+generalization of ChangeCLIP, outperforming state-of-the-art change detection
+methods. Code will be available.
+
+
+
+
+
+ + ♻ ☆ LAPTOP-Diff: Layer Pruning and Normalized Distillation for Compressing + Diffusion Models + + +
+ In the era of AIGC, the demand for low-budget or even on-device applications
+of diffusion models has emerged. In terms of compressing the Stable Diffusion
+models (SDMs), several approaches have been proposed, and most of them leverage
+handcrafted layer removal methods to obtain smaller U-Nets, along with
+knowledge distillation to recover the network performance. However, such
+handcrafted layer removal is inefficient and lacks scalability and
+generalization, and the feature distillation employed in the retraining phase
+faces an imbalance issue in which a few numerically significant feature loss
+terms dominate the others throughout the retraining process. To this end, we
+propose layer pruning and normalized distillation for compressing diffusion
+models (LAPTOP-Diff). We 1) introduce a layer pruning method to compress SDM's
+U-Net automatically and propose an effective one-shot pruning criterion whose
+one-shot performance is guaranteed by its good additivity property, surpassing
+other layer pruning and handcrafted layer removal methods, and 2) propose
+normalized feature distillation for retraining, which alleviates the imbalance
+issue. Using the proposed LAPTOP-Diff, we compress the U-Nets of SDXL and
+SDM-v1.5 to achieve the most advanced performance, with a minimal 4.0% decline
+in PickScore at a pruning ratio of 50%, whereas the comparative methods'
+minimal PickScore decline is 8.2%. We will release our code.
+
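+
+ A hedged sketch of the normalization idea in the retraining objective: each
+ layer's feature-distillation term is rescaled by a detached estimate of its own
+ magnitude so that no single numerically large term dominates. The exact
+ normalizer and weighting used in the paper may differ.
+
+ import torch
+ import torch.nn.functional as F
+
+ def normalized_feature_distillation(student_feats, teacher_feats, eps=1e-6):
+     """Sum of per-layer feature losses, each rescaled to a comparable magnitude."""
+     total = 0.0
+     for s, t in zip(student_feats, teacher_feats):
+         layer_loss = F.mse_loss(s, t)
+         scale = t.detach().pow(2).mean() + eps        # illustrative per-layer normaliser
+         total = total + layer_loss / scale
+     return total
+
+ teacher = [torch.randn(2, 320, 32, 32), 10.0 * torch.randn(2, 640, 16, 16)]
+ student = [f + 0.1 * torch.randn_like(f) for f in teacher]
+ print(normalized_feature_distillation(student, teacher))
+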
+
+
+
+
+ + ♻ ☆ Beyond Average: Individualized Visual Scanpath Prediction CVPR2024 + + +
+ Understanding how attention varies across individuals has significant +scientific and societal impacts. However, existing visual scanpath models treat +attention uniformly, neglecting individual differences. To bridge this gap, +this paper focuses on individualized scanpath prediction (ISP), a new attention +modeling task that aims to accurately predict how different individuals shift +their attention in diverse visual tasks. It proposes an ISP method featuring +three novel technical components: (1) an observer encoder to characterize and +integrate an observer's unique attention traits, (2) an observer-centric +feature integration approach that holistically combines visual features, task +guidance, and observer-specific characteristics, and (3) an adaptive fixation +prioritization mechanism that refines scanpath predictions by dynamically +prioritizing semantic feature maps based on individual observers' attention +traits. These novel components allow scanpath models to effectively address the +attention variations across different observers. Our method is generally +applicable to different datasets, model architectures, and visual tasks, +offering a comprehensive tool for transforming general scanpath models into +individualized ones. Comprehensive evaluations using value-based and +ranking-based metrics verify the method's effectiveness and generalizability. + +
+
+ comment: To appear in CVPR2024 +
+
+
+
+
+ + ♻ ☆ The Devil is in the Few Shots: Iterative Visual Knowledge Completion for + Few-shot Learning + + +
+ Contrastive Language-Image Pre-training (CLIP) has shown powerful zero-shot
+learning performance. Few-shot learning aims to further enhance the transfer
+capability of CLIP by giving few images in each class, aka 'few shots'. Most
+existing methods either implicitly learn from the few shots by incorporating
+learnable prompts or adapters, or explicitly embed them in a cache model for
+inference. However, the narrow distribution of few shots often contains
+incomplete class information, leading to biased visual knowledge with a high
+risk of misclassification. To tackle this problem, recent methods propose to
+supplement visual knowledge by generative models or extra databases, which can
+be costly and time-consuming. In this paper, we propose an Iterative Visual
+Knowledge CompLetion (KCL) method to complement visual knowledge by properly
+taking advantage of unlabeled samples without access to any auxiliary or
+synthetic data. Specifically, KCL first measures the similarities between
+unlabeled samples and each category. Then, the samples with the highest
+confidence for each category are selected according to a designed confidence
+criterion. Finally, the collected samples are treated as labeled ones and added
+to the few shots to jointly re-estimate the remaining unlabeled ones. The above
+procedure is repeated for a number of iterations, with more and more samples
+being collected until convergence, ensuring a progressive and robust knowledge
+completion process. Extensive experiments on 11 benchmark datasets demonstrate
+the effectiveness and efficiency of KCL as a plug-and-play module under both
+few-shot and zero-shot learning settings. Code is available at
+https://github.com/Mark-Sky/KCL.
+
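+
+ A compact sketch of the iterative procedure described above, at pseudo-code
+ level: cosine similarity to class prototypes stands in for the similarity
+ measure, and a fixed per-class top-k stands in for the confidence criterion,
+ both of which are simplifying assumptions.
+
+ import torch
+ import torch.nn.functional as F
+
+ def kcl_style_completion(shot_feats, shot_labels, unlabeled_feats,
+                          num_classes, per_class=4, iterations=3):
+     """Iteratively add top-confidence unlabeled samples to the few-shot cache."""
+     feats, labels, pool = shot_feats.clone(), shot_labels.clone(), unlabeled_feats.clone()
+     for _ in range(iterations):
+         protos = torch.stack([feats[labels == c].mean(0) for c in range(num_classes)])
+         sims = F.normalize(pool, dim=-1) @ F.normalize(protos, dim=-1).T   # (U, C)
+         conf, pred = sims.max(dim=-1)
+         picked = []
+         for c in range(num_classes):                  # collect top-confidence samples per class
+             idx = torch.nonzero(pred == c).squeeze(-1)
+             if idx.numel() == 0:
+                 continue
+             top = idx[conf[idx].topk(min(per_class, idx.numel())).indices]
+             picked.append(top)
+             feats = torch.cat([feats, pool[top]])
+             labels = torch.cat([labels, torch.full((top.numel(),), c, dtype=labels.dtype)])
+         keep = torch.ones(pool.size(0), dtype=torch.bool)
+         if picked:
+             keep[torch.cat(picked)] = False
+         pool = pool[keep]                             # re-estimate the rest next iteration
+     return feats, labels
+
+ f, l = kcl_style_completion(torch.randn(16, 512), torch.arange(16) % 4,
+                             torch.randn(200, 512), num_classes=4)
+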
+
+
+
+
+ + ♻ ☆ Channel Vision Transformers: An Image Is Worth 1 x 16 x 16 Words + + +
+ Vision Transformer (ViT) has emerged as a powerful architecture in the realm +of modern computer vision. However, its application in certain imaging fields, +such as microscopy and satellite imaging, presents unique challenges. In these +domains, images often contain multiple channels, each carrying semantically +distinct and independent information. Furthermore, the model must demonstrate +robustness to sparsity in input channels, as they may not be densely available +during training or testing. In this paper, we propose a modification to the ViT +architecture that enhances reasoning across the input channels and introduce +Hierarchical Channel Sampling (HCS) as an additional regularization technique +to ensure robustness when only partial channels are presented during test time. +Our proposed model, ChannelViT, constructs patch tokens independently from each +input channel and utilizes a learnable channel embedding that is added to the +patch tokens, similar to positional embeddings. We evaluate the performance of +ChannelViT on ImageNet, JUMP-CP (microscopy cell imaging), and So2Sat +(satellite imaging). Our results show that ChannelViT outperforms ViT on +classification tasks and generalizes well, even when a subset of input channels +is used during testing. Across our experiments, HCS proves to be a powerful +regularizer, independent of the architecture employed, suggesting itself as a +straightforward technique for robust ViT training. Lastly, we find that +ChannelViT generalizes effectively even when there is limited access to all +channels during training, highlighting its potential for multi-channel imaging +under real-world conditions with sparse sensors. Our code is available at +https://github.com/insitro/ChannelViT. + +
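+
+ A minimal sketch of the per-channel tokenisation plus learnable channel
+ embedding described above, with a naive stand-in for Hierarchical Channel
+ Sampling (a uniformly random channel subset at training time). Patch size,
+ embedding width, and the sampling rule are illustrative assumptions.
+
+ import torch
+ import torch.nn as nn
+
+ class ChannelPatchEmbed(nn.Module):
+     """Patchify each input channel independently and tag it with a channel embedding."""
+     def __init__(self, num_channels, patch=16, dim=384):
+         super().__init__()
+         self.proj = nn.Conv2d(1, dim, kernel_size=patch, stride=patch)   # shared across channels
+         self.channel_embed = nn.Parameter(torch.zeros(num_channels, dim))
+
+     def forward(self, x, channel_ids):
+         # x: (B, C_used, H, W); channel_ids: which original channels are present
+         B, C, H, W = x.shape
+         tok = self.proj(x.reshape(B * C, 1, H, W))           # (B*C, dim, h, w)
+         tok = tok.flatten(2).transpose(1, 2)                 # (B*C, patches, dim)
+         n = tok.size(1)
+         tok = tok.reshape(B, C * n, -1)                      # sequence grows with #channels
+         embed = self.channel_embed[channel_ids].repeat_interleave(n, dim=0)
+         return tok + embed
+
+ # naive channel sampling: train on a random, non-empty subset of the 5 channels
+ x = torch.randn(2, 5, 64, 64)
+ keep = torch.randperm(5)[: torch.randint(1, 6, (1,)).item()]
+ tokens = ChannelPatchEmbed(num_channels=5)(x[:, keep], keep)
+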
+
+
+
+
+ + ♻ ☆ Computer-Aided Diagnosis of Thoracic Diseases in Chest X-rays using + hybrid CNN-Transformer Architecture + + +
+ Medical imaging has been used for diagnosis of various conditions, making it
+one of the most powerful resources for effective patient care. Due to its
+widespread availability, low cost, and low radiation, the chest X-ray is one of
+the most sought-after radiology examinations for the diagnosis of various
+thoracic diseases. Due to advancements in medical imaging technologies and
+increasing patient load, the current radiology workflow faces various
+challenges, including increasing backlogs, long working hours, and an increase
+in diagnostic errors. An automated computer-aided diagnosis system that can
+interpret chest X-rays and provide actionable insights has the potential to
+offer radiologists a second opinion and highlight relevant regions in the
+image, in turn expediting the clinical workflow, reducing diagnostic errors,
+and improving patient care. In this study, we apply a novel architecture,
+SA-DenseNet121, that augments the DenseNet121 Convolutional Neural Network
+(CNN) with a transformer-style multi-head self-attention mechanism and can
+identify multiple thoracic diseases in chest X-rays. We conducted experiments
+on four of the largest chest X-ray datasets, namely ChestX-ray14, CheXpert,
+MIMIC-CXR-JPG, and IU-CXR. Experimental results in terms of the area under the
+receiver operating characteristic curve (AUC-ROC) show that augmenting a CNN
+with self-attention has potential in diagnosing different thoracic diseases
+from chest X-rays. The proposed methodology has the potential to support the
+reading workflow, improve efficiency, and reduce diagnostic errors.
+
+
+ comment: 24 pages, 13 Figures, 13 Tables. This article heavily draws from + arXiv:1904.09925 where authors originally proposed attention-augmented + convolutional network. arXiv admin note: text overlap with arXiv:1904.09925 + by other authors +
+
+
+
+
+ + ♻ ☆ Revealing the structure-property relationships of copper alloys with + FAGC + + +
+ Understanding how the structure of materials affects their properties is a
+cornerstone of materials science and engineering. However, traditional methods
+have struggled to accurately describe the quantitative structure-property
+relationships for complex structures. In our study, we bridge this gap by
+leveraging machine learning to analyze images of materials' microstructures,
+thus offering a novel way to understand and predict the properties of materials
+based on their microstructures. We introduce a method known as FAGC (Feature
+Augmentation on Geodesic Curves), specifically demonstrated for Cu-Cr-Zr
+alloys. This approach utilizes machine learning to examine the shapes within
+images of the alloys' microstructures and predict their mechanical and
+electronic properties. This generative FAGC approach can effectively expand
+relatively small training datasets, which arise from the limited availability
+of materials images labeled with quantitative properties. The process begins
+with extracting features from the images using neural networks. These features
+are then mapped onto the Pre-shape space to construct the Geodesic curves.
+Along these curves, new features are generated, effectively increasing the
+dataset. Moreover, we design a pseudo-labeling mechanism for these newly
+generated features to further enhance the training dataset. Our FAGC method has
+shown remarkable results, significantly improving the accuracy of predicting
+the electronic conductivity and hardness of Cu-Cr-Zr alloys, with R-squared
+values of 0.978 and 0.998, respectively. These outcomes underscore the
+potential of FAGC to address the challenge of limited image data in materials
+science, providing a powerful tool for establishing detailed and quantitative
+relationships between complex microstructures and material properties.
+
+
+
+
+
+ + ♻ ☆ GazeHTA: End-to-end Gaze Target Detection with Head-Target Association + + +
+ We propose an end-to-end approach for gaze target detection: predicting a +head-target connection between individuals and the target image regions they +are looking at. Most of the existing methods use independent components such as +off-the-shelf head detectors or have problems in establishing associations +between heads and gaze targets. In contrast, we investigate an end-to-end +multi-person Gaze target detection framework with Heads and Targets Association +(GazeHTA), which predicts multiple head-target instances based solely on input +scene image. GazeHTA addresses challenges in gaze target detection by (1) +leveraging a pre-trained diffusion model to extract scene features for rich +semantic understanding, (2) re-injecting a head feature to enhance the head +priors for improved head understanding, and (3) learning a connection map as +the explicit visual associations between heads and gaze targets. Our extensive +experimental results demonstrate that GazeHTA outperforms state-of-the-art gaze +target detection methods and two adapted diffusion-based baselines on two +standard datasets. + +
+
+
+
+
+ + ♻ ☆ Unified Negative Pair Generation toward Well-discriminative Feature + Space for Face Recognition BMVC22 + + +
+ The goal of face recognition (FR) can be viewed as a pair similarity +optimization problem, maximizing a similarity set $\mathcal{S}^p$ over positive +pairs, while minimizing similarity set $\mathcal{S}^n$ over negative pairs. +Ideally, it is expected that FR models form a well-discriminative feature space +(WDFS) that satisfies $\inf{\mathcal{S}^p} > \sup{\mathcal{S}^n}$. With regard +to WDFS, the existing deep feature learning paradigms (i.e., metric and +classification losses) can be expressed as a unified perspective on different +pair generation (PG) strategies. Unfortunately, in the metric loss (ML), it is +infeasible to generate negative pairs taking all classes into account in each +iteration because of the limited mini-batch size. In contrast, in +classification loss (CL), it is difficult to generate extremely hard negative +pairs owing to the convergence of the class weight vectors to their center. +This leads to a mismatch between the two similarity distributions of the +sampled pairs and all negative pairs. Thus, this paper proposes a unified +negative pair generation (UNPG) by combining two PG strategies (i.e., MLPG and +CLPG) from a unified perspective to alleviate the mismatch. UNPG introduces +useful information about negative pairs using MLPG to overcome the CLPG +deficiency. Moreover, it includes filtering the similarities of noisy negative +pairs to guarantee reliable convergence and improved performance. Exhaustive +experiments show the superiority of UNPG by achieving state-of-the-art +performance across recent loss functions on public benchmark datasets. Our code +and pretrained models are publicly available. + +
+
+ comment: 9 pages, 6 figures, Published at BMVC22 +
+
+
+
+
+ + ♻ ☆ LLM4SGG: Large Language Models for Weakly Supervised Scene Graph + Generation CVPR 2024 + + +
+ Weakly-Supervised Scene Graph Generation (WSSGG) research has recently +emerged as an alternative to the fully-supervised approach that heavily relies +on costly annotations. In this regard, studies on WSSGG have utilized image +captions to obtain unlocalized triplets while primarily focusing on grounding +the unlocalized triplets over image regions. However, they have overlooked the +two issues involved in the triplet formation process from the captions: 1) +Semantic over-simplification issue arises when extracting triplets from +captions, where fine-grained predicates in captions are undesirably converted +into coarse-grained predicates, resulting in a long-tailed predicate +distribution, and 2) Low-density scene graph issue arises when aligning the +triplets in the caption with entity/predicate classes of interest, where many +triplets are discarded and not used in training, leading to insufficient +supervision. To tackle the two issues, we propose a new approach, i.e., Large +Language Model for weakly-supervised SGG (LLM4SGG), where we mitigate the two +issues by leveraging the LLM's in-depth understanding of language and reasoning +ability during the extraction of triplets from captions and alignment of +entity/predicate classes with target data. To further engage the LLM in these +processes, we adopt the idea of Chain-of-Thought and the in-context few-shot +learning strategy. To validate the effectiveness of LLM4SGG, we conduct +extensive experiments on Visual Genome and GQA datasets, showing significant +improvements in both Recall@K and mean Recall@K compared to the +state-of-the-art WSSGG methods. A further appeal is that LLM4SGG is +data-efficient, enabling effective model training with a small amount of +training images. + +
+
+ comment: 8 pages; CVPR 2024 +
+
+
+
+
+ + ♻ ☆ PEEKABOO: Interactive Video Generation via Masked-Diffusion + + +
+ Modern video generation models like Sora have achieved remarkable success in +producing high-quality videos. However, a significant limitation is their +inability to offer interactive control to users, a feature that promises to +open up unprecedented applications and creativity. In this work, we introduce +the first solution to equip diffusion-based video generation models with +spatio-temporal control. We present Peekaboo, a novel masked attention module, +which seamlessly integrates with current video generation models offering +control without the need for additional training or inference overhead. To +facilitate future research, we also introduce a comprehensive benchmark for +interactive video generation. This benchmark offers a standardized framework +for the community to assess the efficacy of emerging interactive video +generation models. Our extensive qualitative and quantitative assessments +reveal that Peekaboo achieves up to a 3.8x improvement in mIoU over baseline +models, all while maintaining the same latency. Code and benchmark are +available on the webpage. + +
+
+ comment: Project webpage - https://jinga-lala.github.io/projects/Peekaboo/ +
+
+
+
+
+ + ♻ ☆ Evaluating the Utility of Conformal Prediction Sets for AI-Advised Image + Labeling + + +
+ As deep neural networks are more commonly deployed in high-stakes domains, +their black-box nature makes uncertainty quantification challenging. We +investigate the effects of presenting conformal prediction sets--a +distribution-free class of methods for generating prediction sets with +specified coverage--to express uncertainty in AI-advised decision-making. +Through a large online experiment, we compare the utility of conformal +prediction sets to displays of Top-1 and Top-k predictions for AI-advised image +labeling. In a pre-registered analysis, we find that the utility of prediction +sets for accuracy varies with the difficulty of the task: while they result in +accuracy on par with or less than Top-1 and Top-k displays for easy images, +prediction sets excel at assisting humans in labeling out-of-distribution (OOD) +images, especially when the set size is small. Our results empirically pinpoint +practical challenges of conformal prediction sets and provide implications on +how to incorporate them for real-world decision-making. + +
+
+ comment: 19 pages, 11 figures, 10 tables. Accepted by ACM CHI 2024 +
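+
+ For readers unfamiliar with the construction, a standard split-conformal
+ prediction set (the general class of methods studied above, not the paper's
+ exact experimental setup) can be sketched as follows; the nonconformity score
+ and miscoverage level are the usual textbook choices.
+
+ import numpy as np
+
+ def conformal_prediction_sets(cal_probs, cal_labels, test_probs, alpha=0.1):
+     """Split conformal: label sets that cover the true class with prob. ~ 1 - alpha."""
+     n = len(cal_labels)
+     scores = 1.0 - cal_probs[np.arange(n), cal_labels]       # 1 - prob of the true class
+     q_level = min(np.ceil((n + 1) * (1 - alpha)) / n, 1.0)
+     qhat = np.quantile(scores, q_level, method="higher")
+     return [np.where(1.0 - p <= qhat)[0] for p in test_probs]  # one label set per test image
+
+ rng = np.random.default_rng(0)
+ cal_probs = rng.dirichlet(np.ones(10), size=500)
+ cal_labels = rng.integers(0, 10, size=500)
+ sets = conformal_prediction_sets(cal_probs, cal_labels, rng.dirichlet(np.ones(10), size=5))
+ print([len(s) for s in sets])
+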
+
+
+
+
+ + ♻ ☆ SEVD: Synthetic Event-based Vision Dataset for Ego and Fixed Traffic + Perception + + +
+ Recently, event-based vision sensors have gained attention for autonomous
+driving applications, as conventional RGB cameras face limitations in handling
+challenging dynamic conditions. However, the availability of real-world and
+synthetic event-based vision datasets remains limited. In response to this gap,
+we present SEVD, a first-of-its-kind multi-view ego and fixed perception
+synthetic event-based dataset, created using multiple dynamic vision sensors
+within the CARLA simulator. Data sequences are recorded across diverse lighting
+(noon, nighttime, twilight) and weather conditions (clear, cloudy, wet, rainy,
+foggy) with domain shifts (discrete and continuous). SEVD spans urban,
+suburban, rural, and highway scenes featuring various classes of objects (car,
+truck, van, bicycle, motorcycle, and pedestrian). Alongside event data, SEVD
+includes RGB imagery, depth maps, optical flow, and semantic and instance
+segmentation, facilitating a comprehensive understanding of the scene.
+Furthermore, we evaluate the dataset using state-of-the-art event-based (RED,
+RVT) and frame-based (YOLOv8) methods for traffic participant detection tasks
+and provide baseline benchmarks for assessment. Additionally, we conduct
+experiments to assess the synthetic event-based dataset's generalization
+capabilities. The dataset is available at
+https://eventbasedvision.github.io/SEVD
+
+
+
+
+
+ + ♻ ☆ PCNN: Probable-Class Nearest-Neighbor Explanations Improve Fine-Grained + Image Classification Accuracy for AIs and Humans + + +
+ Nearest neighbors (NN) are traditionally used to compute final decisions,
+e.g., in Support Vector Machines or k-NN classifiers, and to provide users with
+explanations for the model's decision. In this paper, we show a novel utility
+of nearest neighbors: to improve predictions of a frozen, pretrained classifier
+C. We leverage an image comparator S that (1) compares the input image with NN
+images from the top-K most probable classes; and (2) uses S's output scores to
+weight the confidence scores of C. Our method consistently improves
+fine-grained image classification accuracy on CUB-200, Cars-196, and Dogs-120.
+Also, a human study finds that showing lay users our probable-class nearest
+neighbors (PCNN) improves their decision accuracy over prior work, which shows
+only the top-1 class examples.
+
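+
+ A sketch of the reweighting step described above: the comparator's scores for
+ the top-K classes rescale the frozen classifier's confidence scores, and the
+ re-ranked maximum becomes the prediction. The comparator outputs here are
+ random placeholders; how S is built and trained is not shown.
+
+ import torch
+
+ def pcnn_rerank(probs, comparator_scores, k=10):
+     """Reweight a frozen classifier's top-K class probabilities with comparator scores.
+
+     probs:             (num_classes,) softmax output of the frozen classifier C.
+     comparator_scores: (k,) scores in [0, 1] from the image comparator S, one per
+                        top-K class (e.g. agreement with that class's NN images).
+     """
+     topk = probs.topk(k)
+     reweighted = probs.clone()
+     reweighted[topk.indices] = topk.values * comparator_scores
+     return reweighted.argmax(), reweighted
+
+ probs = torch.softmax(torch.randn(200), dim=0)       # e.g. a CUB-200 classifier output
+ pred, new_probs = pcnn_rerank(probs, torch.rand(10)) # placeholder comparator outputs
+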
+
+
+
+
+ + ♻ ☆ MatAtlas: Text-driven Consistent Geometry Texturing and Material + Assignment + + +
+ We present MatAtlas, a method for consistent text-guided 3D model texturing.
+Following recent progress, we leverage a large-scale text-to-image generation
+model (e.g., Stable Diffusion) as a prior to texture a 3D model. We carefully
+design an RGB texturing pipeline that leverages a grid pattern diffusion,
+driven by depth and edges. By proposing a multi-step texture refinement
+process, we significantly improve the quality and 3D consistency of the
+texturing output. To further address the problem of baked-in lighting, we move
+beyond RGB colors and pursue assigning parametric materials to the assets.
+Given the high-quality initial RGB texture, we propose a novel material
+retrieval method that capitalizes on Large Language Models (LLMs), enabling
+editability and relightability. We evaluate our method on a wide variety of
+geometries and show that it significantly outperforms prior art. We also
+analyze the role of each component through a detailed ablation study.
+
+
+
+
+
+ + ♻ ☆ Artwork Protection Against Neural Style Transfer Using Locally Adaptive + Adversarial Color Attack + + +
+ Neural style transfer (NST) generates new images by combining the style of
+one image with the content of another. However, unauthorized NST can exploit
+artwork, raising concerns about artists' rights and motivating the development
+of proactive protection methods. We propose the Locally Adaptive Adversarial
+Color Attack (LAACA), empowering artists to protect their artwork from
+unauthorized style transfer by processing it before public release. By delving
+into the intricacies of human visual perception and the role of different
+frequency components, our method strategically introduces frequency-adaptive
+perturbations in the image. These perturbations significantly degrade the
+generation quality of NST while maintaining an acceptable level of visual
+change in the original image, ensuring that potential infringers are
+discouraged from using the protected artworks because of the poor quality of
+the resulting NST output. Additionally, existing metrics often overlook the
+importance of color fidelity in evaluating color-mattered tasks, such as the
+quality of NST-generated images, which is crucial in the context of artistic
+works. To comprehensively assess such color-mattered tasks, we propose the
+Adversarial Color Distance Metric (ACDM), designed to quantify the color
+difference of images pre- and post-manipulation. Experimental results confirm
+that attacking NST using LAACA results in visually inferior style transfer and
+that ACDM can efficiently measure color-mattered tasks. By providing artists
+with a tool to safeguard their intellectual property, our work alleviates the
+socio-technical challenges posed by the misuse of NST in the art community.
+
+
+ comment: 9 pages, 5 figures, 4 tables +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 164 + +
+
+
+ + ☆ On the Content Bias in Fréchet Video Distance CVPR 2024 + + +
+ Fr\'echet Video Distance (FVD), a prominent metric for evaluating video +generation models, is known to conflict with human perception occasionally. In +this paper, we aim to explore the extent of FVD's bias toward per-frame quality +over temporal realism and identify its sources. We first quantify the FVD's +sensitivity to the temporal axis by decoupling the frame and motion quality and +find that the FVD increases only slightly with large temporal corruption. We +then analyze the generated videos and show that via careful sampling from a +large set of generated videos that do not contain motions, one can drastically +decrease FVD without improving the temporal quality. Both studies suggest FVD's +bias towards the quality of individual frames. We further observe that the bias +can be attributed to the features extracted from a supervised video classifier +trained on the content-biased dataset. We show that FVD with features extracted +from the recent large-scale self-supervised video models is less biased toward +image quality. Finally, we revisit a few real-world examples to validate our +hypothesis. + +
+
+ comment: CVPR 2024. Project webpage: https://content-debiased-fvd.github.io/ +
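+
+ For reference, FVD is the Fréchet distance between Gaussians fitted to real and
+ generated video features (classically from a supervised video classifier, which
+ is where the content bias discussed above enters). A generic sketch of the
+ distance itself, independent of the feature extractor:
+
+ import numpy as np
+ from scipy import linalg
+
+ def frechet_distance(feats_real, feats_gen):
+     """Fréchet distance between Gaussians fitted to two feature sets (rows = videos)."""
+     mu1, mu2 = feats_real.mean(0), feats_gen.mean(0)
+     s1 = np.cov(feats_real, rowvar=False)
+     s2 = np.cov(feats_gen, rowvar=False)
+     covmean = linalg.sqrtm(s1 @ s2)
+     if np.iscomplexobj(covmean):            # numerical noise can leave tiny imaginary parts
+         covmean = covmean.real
+     diff = mu1 - mu2
+     return float(diff @ diff + np.trace(s1 + s2 - 2.0 * covmean))
+
+ rng = np.random.default_rng(0)
+ print(frechet_distance(rng.normal(size=(256, 64)), rng.normal(0.1, 1.0, size=(256, 64))))
+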
+
+
+
+
+ + ☆ BLINK: Multimodal Large Language Models Can See but Not Perceive + + +
+ We introduce Blink, a new benchmark for multimodal language models (LLMs)
+that focuses on core visual perception abilities not found in other
+evaluations. Most of the Blink tasks can be solved by humans "within a blink"
+(e.g., relative depth estimation, visual correspondence, forensics detection,
+and multi-view reasoning). However, we find these perception-demanding tasks
+pose significant challenges for current multimodal LLMs because they resist
+mediation through natural language. Blink reformats 14 classic computer vision
+tasks into 3,807 multiple-choice questions, paired with single or multiple
+images and visual prompting. While humans get 95.70% accuracy on average, Blink
+is surprisingly challenging for existing multimodal LLMs: even the
+best-performing GPT-4V and Gemini achieve accuracies of 51.26% and 45.72%, only
+13.17% and 7.63% higher than random guessing, indicating that such perception
+abilities have not "emerged" yet in recent multimodal LLMs. Our analysis also
+highlights that specialist CV models could solve these problems much better,
+suggesting potential pathways for future improvements. We believe Blink will
+stimulate the community to help multimodal LLMs catch up with human-level
+visual perception.
+
+
+ comment: Multimodal Benchmark, Project Url: https://zeyofu.github.io/blink/ +
+
+
+
+
+ + ☆ VideoGigaGAN: Towards Detail-rich Video Super-Resolution + + +
+ Video super-resolution (VSR) approaches have shown impressive temporal +consistency in upsampled videos. However, these approaches tend to generate +blurrier results than their image counterparts as they are limited in their +generative capability. This raises a fundamental question: can we extend the +success of a generative image upsampler to the VSR task while preserving the +temporal consistency? We introduce VideoGigaGAN, a new generative VSR model +that can produce videos with high-frequency details and temporal consistency. +VideoGigaGAN builds upon a large-scale image upsampler -- GigaGAN. Simply +inflating GigaGAN to a video model by adding temporal modules produces severe +temporal flickering. We identify several key issues and propose techniques that +significantly improve the temporal consistency of upsampled videos. Our +experiments show that, unlike previous VSR methods, VideoGigaGAN generates +temporally consistent videos with more fine-grained appearance details. We +validate the effectiveness of VideoGigaGAN by comparing it with +state-of-the-art VSR models on public datasets and showcasing video results +with $8\times$ super-resolution. + +
+
+ comment: project page: https://videogigagan.github.io/ +
+
+
+
+
+ + ☆ Moving Object Segmentation: All You Need Is SAM (and Flow) + + +
+ The objective of this paper is motion segmentation -- discovering and
+segmenting the moving objects in a video. This is a much-studied area with
+numerous careful, and sometimes complex, approaches and training schemes,
+including: self-supervised learning, learning from synthetic datasets,
+object-centric representations, amodal representations, and many more. Our
+interest in this paper is to determine if the Segment Anything model (SAM) can
+contribute to this task. We investigate two models for combining SAM with
+optical flow that harness the segmentation power of SAM with the ability of
+flow to discover and group moving objects. In the first model, we adapt SAM to
+take optical flow, rather than RGB, as an input. In the second, SAM takes RGB
+as an input, and flow is used as a segmentation prompt. These surprisingly
+simple methods, without any further modifications, outperform all previous
+approaches by a considerable margin in both single and multi-object benchmarks.
+We also extend these frame-level segmentations to sequence-level segmentations
+that maintain object identity. Again, this simple model outperforms previous
+methods on multiple video object segmentation benchmarks.
+
+
+ comment: Project Page: https://www.robots.ox.ac.uk/~vgg/research/flowsam/ +
+
+
+
+
+ + ☆ Reka Core, Flash, and Edge: A Series of Powerful Multimodal Language + Models + + +
+ We introduce Reka Core, Flash, and Edge, a series of powerful multimodal
+language models trained from scratch by Reka. Reka models are able to process
+and reason with text, images, video, and audio inputs. This technical report
+discusses details of training some of these models and provides comprehensive
+evaluation results. We show that Reka Edge and Reka Flash are not only
+state-of-the-art but also outperform many much larger models, delivering
+outsized value for their respective compute class. Meanwhile, our most capable
+and largest model, Reka Core, approaches the best frontier models on both
+automatic evaluations and blind human evaluations. On image question answering
+benchmarks (e.g. MMMU, VQAv2), Core performs competitively with GPT4-V.
+Meanwhile, on multimodal chat, Core ranks as the second most preferred model
+under a blind third-party human evaluation setup, outperforming other models
+such as Claude 3 Opus. On text benchmarks, Core not only performs competitively
+with other frontier models on a set of well-established benchmarks (e.g. MMLU,
+GSM8K) but also outperforms GPT4-0613 on human evaluation. On video question
+answering (Perception-Test), Core outperforms Gemini Ultra. Models are shipped
+in production at http://chat.reka.ai . A showcase of non-cherry-picked
+qualitative examples can also be found at http://showcase.reka.ai .
+
+
+
+
+
+ + ☆ SOHES: Self-supervised Open-world Hierarchical Entity Segmentation ICLR 2024 + + +
+ Open-world entity segmentation, as an emerging computer vision task, aims at +segmenting entities in images without being restricted by pre-defined classes, +offering impressive generalization capabilities on unseen images and concepts. +Despite its promise, existing entity segmentation methods like Segment Anything +Model (SAM) rely heavily on costly expert annotators. This work presents +Self-supervised Open-world Hierarchical Entity Segmentation (SOHES), a novel +approach that eliminates the need for human annotations. SOHES operates in +three phases: self-exploration, self-instruction, and self-correction. Given a +pre-trained self-supervised representation, we produce abundant high-quality +pseudo-labels through visual feature clustering. Then, we train a segmentation +model on the pseudo-labels, and rectify the noises in pseudo-labels via a +teacher-student mutual-learning procedure. Beyond segmenting entities, SOHES +also captures their constituent parts, providing a hierarchical understanding +of visual entities. Using raw images as the sole training data, our method +achieves unprecedented performance in self-supervised open-world segmentation, +marking a significant milestone towards high-quality open-world entity +segmentation in the absence of human-annotated masks. Project page: +https://SOHES.github.io. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ MeshLRM: Large Reconstruction Model for High-Quality Mesh + + +
+ We propose MeshLRM, a novel LRM-based approach that can reconstruct a +high-quality mesh from merely four input images in less than one second. +Different from previous large reconstruction models (LRMs) that focus on +NeRF-based reconstruction, MeshLRM incorporates differentiable mesh extraction +and rendering within the LRM framework. This allows for end-to-end mesh +reconstruction by fine-tuning a pre-trained NeRF LRM with mesh rendering. +Moreover, we improve the LRM architecture by simplifying several complex +designs in previous LRMs. MeshLRM's NeRF initialization is sequentially trained +with low- and high-resolution images; this new LRM training strategy enables +significantly faster convergence and thereby leads to better quality with less +compute. Our approach achieves state-of-the-art mesh reconstruction from +sparse-view inputs and also allows for many downstream applications, including +text-to-3D and single-image-to-3D generation. Project page: +https://sarahweiii.github.io/meshlrm/ + +
+
+
+
+
+ + ☆ G-HOP: Generative Hand-Object Prior for Interaction Reconstruction and + Grasp Synthesis CVPR2024 + + +
+ We propose G-HOP, a denoising diffusion based generative prior for
+hand-object interactions that allows modeling both the 3D object and a human
+hand, conditioned on the object category. To learn a 3D spatial diffusion model
+that can capture this joint distribution, we represent the human hand via a
+skeletal distance field to obtain a representation aligned with the (latent)
+signed distance field for the object. We show that this hand-object prior can
+then serve as generic guidance to facilitate other tasks like reconstruction
+from interaction clips and human grasp synthesis. We believe that our model,
+trained by aggregating seven diverse real-world interaction datasets spanning
+155 categories, represents a first approach that allows jointly generating both
+hand and object. Our empirical evaluations demonstrate the benefit of this
+joint prior in video-based reconstruction and human grasp synthesis,
+outperforming current task-specific baselines.
+ Project website: https://judyye.github.io/ghop-www
+
+
+ comment: accepted to CVPR2024; project page at + https://judyye.github.io/ghop-www +
+
+
+
+
+ + ☆ Lazy Diffusion Transformer for Interactive Image Editing + + +
+ We introduce a novel diffusion transformer, LazyDiffusion, that generates +partial image updates efficiently. Our approach targets interactive image +editing applications in which, starting from a blank canvas or an image, a user +specifies a sequence of localized image modifications using binary masks and +text prompts. Our generator operates in two phases. First, a context encoder +processes the current canvas and user mask to produce a compact global context +tailored to the region to generate. Second, conditioned on this context, a +diffusion-based transformer decoder synthesizes the masked pixels in a "lazy" +fashion, i.e., it only generates the masked region. This contrasts with +previous works that either regenerate the full canvas, wasting time and +computation, or confine processing to a tight rectangular crop around the mask, +ignoring the global image context altogether. Our decoder's runtime scales with +the mask size, which is typically small, while our encoder introduces +negligible overhead. We demonstrate that our approach is competitive with +state-of-the-art inpainting methods in terms of quality and fidelity while +providing a 10x speedup for typical user interactions, where the editing mask +represents 10% of the image. + +
+
+
+
+
+ + ☆ 6Img-to-3D: Few-Image Large-Scale Outdoor Driving Scene Reconstruction + + +
+ Current 3D reconstruction techniques struggle to infer unbounded scenes from +a few images faithfully. Specifically, existing methods have high computational +demands, require detailed pose information, and cannot reconstruct occluded +regions reliably. We introduce 6Img-to-3D, an efficient, scalable +transformer-based encoder-renderer method for single-shot image to 3D +reconstruction. Our method outputs a 3D-consistent parameterized triplane from +only six outward-facing input images for large-scale, unbounded outdoor driving +scenarios. We take a step towards resolving existing shortcomings by combining +contracted custom cross- and self-attention mechanisms for triplane +parameterization, differentiable volume rendering, scene contraction, and image +feature projection. We showcase that six surround-view vehicle images from a +single timestamp without global pose information are enough to reconstruct +360$^{\circ}$ scenes during inference time, taking 395 ms. Our method allows, +for example, rendering third-person images and birds-eye views. Our code is +available at https://github.com/continental/6Img-to-3D, and more examples can +be found at our website here https://6Img-to-3D.GitHub.io/. + +
+
+ comment: Joint first authorship. Project page: https://6Img-to-3D.GitHub.io/ + Code https://github.com/continental/6Img-to-3D +
+
+
+
+
+ + ☆ Dynamic Gaussians Mesh: Consistent Mesh Reconstruction from Monocular + Videos + + +
+ Modern 3D engines and graphics pipelines require mesh as a memory-efficient
+representation, which allows efficient rendering, geometry processing, texture
+editing, and many other downstream operations. However, it is still highly
+difficult to obtain a high-quality mesh, in terms of structure and detail, from
+monocular visual observations. The problem becomes even more challenging for
+dynamic scenes and objects. To this end, we introduce Dynamic Gaussians Mesh
+(DG-Mesh), a framework to reconstruct a high-fidelity and time-consistent mesh
+given a single monocular video. Our work leverages the recent advancement in 3D
+Gaussian Splatting to construct the mesh sequence with temporal consistency
+from a video. Building on top of this representation, DG-Mesh recovers
+high-quality meshes from the Gaussian points and can track the mesh vertices
+over time, which enables applications such as texture editing on dynamic
+objects. We introduce Gaussian-Mesh Anchoring, which encourages evenly
+distributed Gaussians, resulting in better mesh reconstruction through
+mesh-guided densification and pruning of the deformed Gaussians. By applying
+cycle-consistent deformation between the canonical and the deformed space, we
+can project the anchored Gaussians back to the canonical space and optimize
+Gaussians across all time frames. In evaluations on different datasets, DG-Mesh
+provides significantly better mesh reconstruction and rendering than the
+baselines.
+
+
+ comment: Project page: https://www.liuisabella.com/DG-Mesh/ +
+
+
+
+
+ + ☆ MedThink: Explaining Medical Visual Question Answering via Multimodal + Decision-Making Rationale + + +
+ Medical Visual Question Answering (MedVQA), which offers language responses
+to image-based medical inquiries, represents a challenging task and significant
+advancement in healthcare. It assists medical experts in swiftly interpreting
+medical images, thereby enabling faster and more accurate diagnoses. However,
+the model interpretability and transparency of existing MedVQA solutions are
+often limited, posing challenges in understanding their decision-making
+processes. To address this issue, we devise a semi-automated annotation process
+to streamline data preparation and build the new benchmark MedVQA datasets
+R-RAD and R-SLAKE. The R-RAD and R-SLAKE datasets provide intermediate medical
+decision-making rationales generated by multimodal large language models and
+human annotations for question-answering pairs in existing MedVQA datasets,
+i.e., VQA-RAD and SLAKE. Moreover, we design a novel framework that finetunes
+lightweight pretrained generative models by incorporating medical
+decision-making rationales into the training process. The framework includes
+three distinct strategies to generate decision outcomes and corresponding
+rationales, thereby clearly showcasing the medical decision-making process
+during reasoning. Extensive experiments demonstrate that our method can achieve
+an accuracy of 83.5% on R-RAD and 86.3% on R-SLAKE, significantly outperforming
+existing state-of-the-art baselines. Dataset and code will be released.
+
+
+
+
+
+ + ☆ Gradient-Regularized Out-of-Distribution Detection ECCV + + +
+ One of the challenges for neural networks in real-life applications is the
+overconfident errors these models make when the data is not from the original
+training distribution. Addressing this issue is known as Out-of-Distribution
+(OOD) detection. Many state-of-the-art OOD methods employ an auxiliary dataset
+as a surrogate for OOD data during training to achieve improved performance.
+However, these methods fail to fully exploit the local information embedded in
+the auxiliary dataset. In this work, we propose the idea of leveraging the
+information embedded in the gradient of the loss function during training to
+enable the network to not only learn a desired OOD score for each sample but
+also to exhibit similar behavior in a local neighborhood around each sample. We
+also develop a novel energy-based sampling method to allow the network to be
+exposed to more informative OOD samples during the training phase. This is
+especially important when the auxiliary dataset is large. We demonstrate the
+effectiveness of our method through extensive experiments on several OOD
+benchmarks, improving the existing state-of-the-art FPR95 by 4% on our ImageNet
+experiment. We further provide a theoretical analysis through the lens of
+certified robustness and Lipschitz analysis to showcase the theoretical
+foundation of our work. We will publicly release our code after the review
+process.
+
+
+ comment: Under review for the 18th European Conference on Computer Vision + (ECCV) 2024 +
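+
+ A hedged sketch of the gradient-based regularisation idea: in addition to
+ fitting a per-sample OOD score, the loss penalises the input-gradient of that
+ score so the network behaves similarly in a neighbourhood of each sample. The
+ energy-style score, the targets, and the weighting are assumptions, not the
+ paper's exact objective.
+
+ import torch
+ import torch.nn.functional as F
+
+ def gradient_regularized_ood_loss(model, x, is_ood, lam=0.1):
+     """Fit an OOD score per sample and penalise its gradient w.r.t. the input."""
+     x = x.clone().requires_grad_(True)
+     score = torch.logsumexp(model(x), dim=1)          # energy-style OOD score (illustrative)
+     target = torch.where(is_ood, torch.full_like(score, -5.0), torch.full_like(score, 5.0))
+     score_loss = F.mse_loss(score, target)
+     grad = torch.autograd.grad(score.sum(), x, create_graph=True)[0]
+     smoothness = grad.flatten(1).norm(dim=1).mean()   # encourage locally similar behaviour
+     return score_loss + lam * smoothness
+
+ model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 10))
+ x = torch.randn(8, 3, 32, 32)
+ is_ood = torch.tensor([False] * 4 + [True] * 4)
+ gradient_regularized_ood_loss(model, x, is_ood).backward()
+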
+
+
+
+
+ + ☆ Inverse Neural Rendering for Explainable Multi-Object Tracking + + +
+ Today, most methods for image understanding tasks rely on feed-forward neural +networks. While this approach has allowed for empirical accuracy, efficiency, +and task adaptation via fine-tuning, it also comes with fundamental +disadvantages. Existing networks often struggle to generalize across different +datasets, even on the same task. By design, these networks ultimately reason +about high-dimensional scene features, which are challenging to analyze. This +is true especially when attempting to predict 3D information based on 2D +images. We propose to recast 3D multi-object tracking from RGB cameras as an +\emph{Inverse Rendering (IR)} problem, by optimizing via a differentiable +rendering pipeline over the latent space of pre-trained 3D object +representations and retrieve the latents that best represent object instances +in a given input image. To this end, we optimize an image loss over generative +latent spaces that inherently disentangle shape and appearance properties. We +investigate not only an alternate take on tracking but our method also enables +examining the generated objects, reasoning about failure situations, and +resolving ambiguous cases. We validate the generalization and scaling +capabilities of our method by learning the generative prior exclusively from +synthetic data and assessing camera-based 3D tracking on the nuScenes and Waymo +datasets. Both these datasets are completely unseen to our method and do not +require fine-tuning. Videos and code are available at +https://light.princeton.edu/inverse-rendering-tracking/. + +
+
+
+
+
+ + ☆ V2Xum-LLM: Cross-Modal Video Summarization with Temporal Prompt + Instruction Tuning + + +
+ Video summarization aims to create short, accurate, and cohesive summaries of +longer videos. Despite the existence of various video summarization datasets, a +notable limitation is their limited amount of source videos, which hampers the +effective fine-tuning of advanced large vision-language models (VLMs). +Additionally, most existing datasets are created for video-to-video +summarization, overlooking the contemporary need for multimodal video content +summarization. Recent efforts have been made to expand from unimodal to +multimodal video summarization, categorizing the task into three sub-tasks +based on the summary's modality: video-to-video (V2V), video-to-text (V2T), and +a combination of video and text summarization (V2VT). However, the textual +summaries in previous multimodal datasets are inadequate. To address these +issues, we introduce Instruct-V2Xum, a cross-modal video summarization dataset +featuring 30,000 diverse videos sourced from YouTube, with lengths ranging from +40 to 940 seconds and an average summarization ratio of 16.39\%. Each video +summary in Instruct-V2Xum is paired with a textual summary that references +specific frame indexes, facilitating the generation of aligned video and +textual summaries. In addition, we propose a new video summarization framework +named V2Xum-LLM. V2Xum-LLM, specifically V2Xum-LLaMA in this study, is the +first framework that unifies different video summarization tasks into one large +language model's (LLM) text decoder and achieves task-controllable video +summarization with temporal prompts and task instructions. Experiments show +that V2Xum-LLaMA outperforms strong baseline models on multiple video +summarization tasks. Furthermore, we propose an enhanced evaluation metric for +V2V and V2VT summarization tasks. + +
+
+
+
+
+ + ☆ Point-In-Context: Understanding Point Cloud via In-Context Learning + + +
+ With the emergence of large-scale models trained on diverse datasets,
+in-context learning has emerged as a promising paradigm for multitasking,
+notably in natural language processing and image processing. However, its
+application in 3D point cloud tasks remains largely unexplored. In this work,
+we introduce Point-In-Context (PIC), a novel framework for 3D point cloud
+understanding via in-context learning. We address the technical challenge of
+effectively extending masked point modeling to 3D point clouds by introducing a
+Joint Sampling module and proposing a vanilla version of PIC called
+Point-In-Context-Generalist (PIC-G). PIC-G is designed as a generalist model
+for various 3D point cloud tasks, with inputs and outputs modeled as
+coordinates. In this paradigm, the challenging segmentation task is achieved by
+assigning label points with XYZ coordinates for each category; the final
+prediction is then chosen based on the label point closest to the predictions.
+To break the limitation of the fixed label-coordinate assignment, which
+generalizes poorly to novel classes, we propose two novel training strategies,
+In-Context Labeling and In-Context Enhancing, forming an extended version of
+PIC named Point-In-Context-Segmenter (PIC-S), targeting improved dynamic
+context labeling and model training. By utilizing dynamic in-context labels and
+extra in-context pairs, PIC-S achieves enhanced performance and generalization
+capability in and across part segmentation datasets. PIC is a general
+framework, so other tasks or datasets can be seamlessly introduced through a
+unified data format. We conduct extensive experiments to validate the
+versatility and adaptability of our proposed methods in handling a wide range
+of tasks and segmenting multiple datasets. Our PIC-S is capable of generalizing
+to unseen datasets and performing novel part segmentation by customizing
+prompts.
+
+
+ comment: Project page: https://fanglaosi.github.io/Point-In-Context_Pages. + arXiv admin note: text overlap with arXiv:2306.08659 +
+
+
+
+
+ + ☆ AniClipart: Clipart Animation with Text-to-Video Priors + + +
+ Clipart, a pre-made graphic art form, offers a convenient and efficient way +of illustrating visual content. Traditional workflows to convert static clipart +images into motion sequences are laborious and time-consuming, involving +numerous intricate steps like rigging, key animation and in-betweening. Recent +advancements in text-to-video generation hold great potential in resolving this +problem. Nevertheless, direct application of text-to-video generation models +often struggles to retain the visual identity of clipart images or generate +cartoon-style motions, resulting in unsatisfactory animation outcomes. In this +paper, we introduce AniClipart, a system that transforms static clipart images +into high-quality motion sequences guided by text-to-video priors. To generate +cartoon-style and smooth motion, we first define B\'{e}zier curves over +keypoints of the clipart image as a form of motion regularization. We then +align the motion trajectories of the keypoints with the provided text prompt by +optimizing the Video Score Distillation Sampling (VSDS) loss, which encodes +adequate knowledge of natural motion within a pretrained text-to-video +diffusion model. With a differentiable As-Rigid-As-Possible shape deformation +algorithm, our method can be end-to-end optimized while maintaining deformation +rigidity. Experimental results show that the proposed AniClipart consistently +outperforms existing image-to-video generation models, in terms of text-video +alignment, visual identity preservation, and motion consistency. Furthermore, +we showcase the versatility of AniClipart by adapting it to generate a broader +array of animation formats, such as layered animation, which allows topological +changes. + +
+
+ comment: Project Page: https://aniclipart.github.io/ +
+
+
+
+
+ + ☆ Measuring Feature Dependency of Neural Networks by Collapsing Feature + Dimensions in the Data Manifold + + +
+ This paper introduces a new technique to measure the feature dependency of +neural network models. The motivation is to better understand a model by +querying whether it is using information from human-understandable features, +e.g., anatomical shape, volume, or image texture. Our method is based on the +principle that if a model is dependent on a feature, then removal of that +feature should significantly harm its performance. A targeted feature is +"removed" by collapsing the dimension in the data distribution that corresponds +to that feature. We perform this by moving data points along the feature +dimension to a baseline feature value while staying on the data manifold, as +estimated by a deep generative model. Then we observe how the model's +performance changes on the modified test data set, with the target feature +dimension removed. We test our method on deep neural network models trained on +synthetic image data with known ground truth, an Alzheimer's disease prediction +task using MRI and hippocampus segmentations from the OASIS-3 dataset, and a +cell nuclei classification task using the Lizard dataset. + +
+
+ comment: Accepted and will be published in the International Symposium on
+ Biomedical Imaging (ISBI) 2024
+
+
+
+
+ + ☆ SPOT: Point Cloud Based Stereo Visual Place Recognition for Similar and + Opposing Viewpoints ICRA 2024 + + +
+ Recognizing places from an opposing viewpoint during a return trip is a +common experience for human drivers. However, the analogous robotics +capability, visual place recognition (VPR) with limited field of view cameras +under 180 degree rotations, has proven to be challenging to achieve. To address +this problem, this paper presents Same Place Opposing Trajectory (SPOT), a +technique for opposing viewpoint VPR that relies exclusively on structure +estimated through stereo visual odometry (VO). The method extends recent +advances in lidar descriptors and utilizes a novel double (similar and +opposing) distance matrix sequence matching method. We evaluate SPOT on a +publicly available dataset with 6.7-7.6 km routes driven in similar and +opposing directions under various lighting conditions. The proposed algorithm +demonstrates remarkable improvement over the state-of-the-art, achieving up to +91.7% recall at 100% precision in opposing viewpoint cases, while requiring +less storage than all baselines tested and running faster than all but one. +Moreover, the proposed method assumes no a priori knowledge of whether the +viewpoint is similar or opposing, and also demonstrates competitive performance +in similar viewpoint cases. + +
+
+ comment: Accepted to ICRA 2024, project website: + https://umautobots.github.io/spot +
+
+
+
+
+ + ☆ Customizing Text-to-Image Diffusion with Camera Viewpoint Control + + +
+ Model customization introduces new concepts to existing text-to-image models, +enabling the generation of the new concept in novel contexts. However, such +methods lack accurate camera view control w.r.t the object, and users must +resort to prompt engineering (e.g., adding "top-view") to achieve coarse view +control. In this work, we introduce a new task -- enabling explicit control of +camera viewpoint for model customization. This allows us to modify object +properties amongst various background scenes via text prompts, all while +incorporating the target camera pose as additional control. This new task +presents significant challenges in merging a 3D representation from the +multi-view images of the new concept with a general, 2D text-to-image model. To +bridge this gap, we propose to condition the 2D diffusion process on rendered, +view-dependent features of the new object. During training, we jointly adapt +the 2D diffusion modules and 3D feature predictions to reconstruct the object's +appearance and geometry while reducing overfitting to the input multi-view +images. Our method outperforms existing image editing and model personalization +baselines in preserving the custom object's identity while following the input +text prompt and the object's camera pose. + +
+
+ comment: project page: https://customdiffusion360.github.io +
+
+
+
+
+ + ☆ A Perspective on Deep Vision Performance with Standard Image and Video + Codecs CVPR 2024 + + +
+ Resource-constrained hardware, such as edge devices or cell phones, often
+relies on cloud servers to provide the required computational resources for
+inference in deep vision models. However, transferring image and video data
+from an edge or mobile device to a cloud server requires coding to deal with
+network constraints. The use of standardized codecs, such as JPEG or H.264, is
+prevalent and required to ensure interoperability. This paper aims to examine
+the implications of employing standardized codecs within deep vision pipelines.
+We find that using JPEG and H.264 coding significantly deteriorates the
+accuracy across a broad range of vision tasks and models. For instance, strong
+compression rates reduce semantic segmentation accuracy by more than 80% in
+mIoU. In contrast to previous findings, our analysis extends beyond image and
+action classification to localization and dense prediction tasks, thus
+providing a more comprehensive perspective.
+
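+ The kind of measurement reported above can be approximated with a simple sweep that
+ re-encodes each input as JPEG at decreasing quality before feeding it to a model. The sketch
+ below is a hedged outline; the model, preprocessing transform, images, and labels are assumed
+ to be defined elsewhere.
+
+ ```python
+ import io
+
+ import torch
+ from PIL import Image
+
+ def jpeg_roundtrip(img: Image.Image, quality: int) -> Image.Image:
+     """Re-encode a PIL image as JPEG at the given quality, then decode it again."""
+     buf = io.BytesIO()
+     img.convert("RGB").save(buf, format="JPEG", quality=quality)
+     buf.seek(0)
+     return Image.open(buf).convert("RGB")
+
+ @torch.no_grad()
+ def top1_under_jpeg(model, pil_images, labels, quality, preprocess):
+     """Top-1 accuracy of `model` when every input first passes through JPEG coding."""
+     batch = torch.stack([preprocess(jpeg_roundtrip(im, quality)) for im in pil_images])
+     preds = model(batch).argmax(dim=1)
+     return (preds == torch.as_tensor(labels)).float().mean().item()
+
+ # Hypothetical sweep (model, preprocess, pil_images, labels defined elsewhere):
+ # for q in (90, 50, 10):
+ #     print(q, top1_under_jpeg(model, pil_images, labels, q, preprocess))
+ ```
+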
+
+ comment: Accepted at CVPR 2024 Workshop on AI for Streaming (AIS) +
+
+
+
+
+ + ☆ Generalizable Face Landmarking Guided by Conditional Face Warping CVPR 2024 + + +
+ As a significant step for human face modeling, editing, and generation, face
+landmarking aims at extracting facial keypoints from images. A generalizable
+face landmarker is required in practice because real-world facial images, e.g.,
+the avatars in animations and games, are often stylized in various ways.
+However, achieving generalizable face landmarking is challenging due to the
+diversity of facial styles and the scarcity of labeled stylized faces. In this
+study, we propose a simple but effective paradigm to learn a generalizable face
+landmarker based on labeled real human faces and unlabeled stylized faces. Our
+method learns the face landmarker as the key module of a conditional face
+warper. Given a pair of real and stylized facial images, the conditional face
+warper predicts a warping field from the real face to the stylized one, in
+which the face landmarker predicts the ending points of the warping field and
+provides us with high-quality pseudo landmarks for the corresponding stylized
+facial images. Applying an alternating optimization strategy, we learn the face
+landmarker to minimize $i)$ the discrepancy between the stylized faces and the
+warped real ones and $ii)$ the prediction errors of both real and pseudo
+landmarks. Experiments on various datasets show that our method outperforms
+existing state-of-the-art domain adaptation methods in face landmarking tasks,
+leading to a face landmarker with better generalizability. Code is available at
+https://plustwo0.github.io/project-face-landmarker.
+
+
+ comment: Accepted in CVPR 2024 +
+
+
+
+
+ + ☆ iRAG: An Incremental Retrieval Augmented Generation System for Videos + + +
+ Retrieval augmented generation (RAG) systems combine the strengths of
+language generation and information retrieval to power many real-world
+applications like chatbots. Use of RAG for combined understanding of multimodal
+data such as text, images and videos is appealing but two critical limitations
+exist: one-time, upfront capture of all content in large multimodal data as
+text descriptions entails high processing times, and not all information in the
+rich multimodal data is typically in the text descriptions. Since the user
+queries are not known a priori, developing a system for multimodal to text
+conversion and interactive querying of multimodal data is challenging.
+ To address these limitations, we propose iRAG, which augments RAG with a
+novel incremental workflow to enable interactive querying of a large corpus of
+multimodal data. Unlike traditional RAG, iRAG quickly indexes large
+repositories of multimodal data, and in the incremental workflow, it uses the
+index to opportunistically extract more details from select portions of the
+multimodal data to retrieve context relevant to an interactive user query. Such
+an incremental workflow avoids long multimodal to text conversion times,
+overcomes information loss issues by doing on-demand query-specific extraction
+of details in multimodal data, and ensures high quality of responses to
+interactive user queries that are often not known a priori. To the best of our
+knowledge, iRAG is the first system to augment RAG with an incremental workflow
+to support efficient interactive querying of large, real-world multimodal data.
+Experimental results on real-world long videos demonstrate 23x to 25x faster
+video to text ingestion, while ensuring that the quality of responses to
+interactive user queries is comparable to responses from a traditional RAG
+where all video data is converted to text upfront before any querying.
+
+
+
+
+
+ + ☆ When Medical Imaging Met Self-Attention: A Love Story That Didn't Quite + Work Out + + +
+ A substantial body of research has focused on developing systems that assist +medical professionals during labor-intensive early screening processes, many +based on convolutional deep-learning architectures. Recently, multiple studies +explored the application of so-called self-attention mechanisms in the vision +domain. These studies often report empirical improvements over fully +convolutional approaches on various datasets and tasks. To evaluate this trend +for medical imaging, we extend two widely adopted convolutional architectures +with different self-attention variants on two different medical datasets. With +this, we aim to specifically evaluate the possible advantages of additional +self-attention. We compare our models with similarly sized convolutional and +attention-based baselines and evaluate performance gains statistically. +Additionally, we investigate how including such layers changes the features +learned by these models during the training. Following a hyperparameter search, +and contrary to our expectations, we observe no significant improvement in +balanced accuracy over fully convolutional models. We also find that important +features, such as dermoscopic structures in skin lesion images, are still not +learned by employing self-attention. Finally, analyzing local explanations, we +confirm biased feature usage. We conclude that merely incorporating attention +is insufficient to surpass the performance of existing fully convolutional +methods. + +
+
+ comment: 10 pages, 2 figures, 5 tables, presented at VISAPP 2024 +
+
+
+
+
+ + ☆ Reducing Bias in Pre-trained Models by Tuning while Penalizing Change + + +
+ Deep models trained on large amounts of data often incorporate implicit
+biases present during training time. If such a bias is later discovered during
+inference or deployment, it is often necessary to acquire new data and retrain
+the model. This behavior is especially problematic in critical areas such as
+autonomous driving or medical decision-making. In these scenarios, new data is
+often expensive and hard to come by. In this work, we present a method based on
+change penalization that takes a pre-trained model and adapts the weights to
+mitigate a previously detected bias. We achieve this by tuning a
+zero-initialized copy of a frozen pre-trained network. Our method needs very
+few examples that contradict the bias, in extreme cases only a single one, to
+increase performance. Additionally, we propose an early stopping criterion to
+modify baselines and reduce overfitting. We evaluate our approach on a
+well-known bias in skin lesion classification and three other datasets from the
+domain shift literature. We find that our approach works especially well with
+very few images. Simple fine-tuning combined with our early stopping also leads
+to performance benefits for a larger number of tuning samples.
+
+
+ comment: 12 pages, 12 figures, presented at VISAPP 2024 +
+
+
+
+
+ + ☆ Performance Evaluation of Segment Anything Model with Variational + Prompting for Application to Non-Visible Spectrum Imagery + + +
+ The Segment Anything Model (SAM) is a deep neural network foundational model +designed to perform instance segmentation which has gained significant +popularity given its zero-shot segmentation ability. SAM operates by generating +masks based on various input prompts such as text, bounding boxes, points, or +masks, introducing a novel methodology to overcome the constraints posed by +dataset-specific scarcity. While SAM is trained on an extensive dataset, +comprising ~11M images, it mostly consists of natural photographic images with +only very limited images from other modalities. Whilst the rapid progress in +visual infrared surveillance and X-ray security screening imaging technologies, +driven forward by advances in deep learning, has significantly enhanced the +ability to detect, classify and segment objects with high accuracy, it is not +evident if the SAM zero-shot capabilities can be transferred to such +modalities. This work assesses SAM capabilities in segmenting objects of +interest in the X-ray/infrared modalities. Our approach reuses the pre-trained +SAM with three different prompts: bounding box, centroid and random points. We +present quantitative/qualitative results to showcase the performance on +selected datasets. Our results show that SAM can segment objects in the X-ray +modality when given a box prompt, but its performance varies for point prompts. +Specifically, SAM performs poorly in segmenting slender objects and organic +materials, such as plastic bottles. We find that infrared objects are also +challenging to segment with point prompts given the low-contrast nature of this +modality. This study shows that while SAM demonstrates outstanding zero-shot +capabilities with box prompts, its performance ranges from moderate to poor for +point prompts, indicating that special consideration on the cross-modal +generalisation of SAM is needed when considering use on X-ray/infrared imagery. + +
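+ A hedged sketch of the box-versus-point comparison described above, using the publicly
+ released segment_anything package; the checkpoint path, image array, and prompt coordinates
+ are placeholders rather than values from the paper.
+
+ ```python
+ import numpy as np
+ from segment_anything import SamPredictor, sam_model_registry
+
+ def compare_prompts(image_rgb: np.ndarray, checkpoint: str = "sam_vit_h.pth"):
+     """Segment with a box prompt and a centroid point prompt (placeholder values)."""
+     sam = sam_model_registry["vit_h"](checkpoint=checkpoint)
+     predictor = SamPredictor(sam)
+     predictor.set_image(image_rgb)          # HxWx3 uint8 RGB image, e.g. a rendered X-ray scan
+
+     box = np.array([40, 60, 220, 300])      # (x0, y0, x1, y1) around the object of interest
+     masks_box, scores_box, _ = predictor.predict(box=box, multimask_output=False)
+
+     centroid = np.array([[130, 180]])       # one foreground point (label 1)
+     masks_pt, scores_pt, _ = predictor.predict(
+         point_coords=centroid, point_labels=np.array([1]), multimask_output=False)
+     return masks_box, scores_box, masks_pt, scores_pt
+ ```
+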
+
+
+
+
+ + ☆ Alleviating Catastrophic Forgetting in Facial Expression Recognition + with Emotion-Centered Models + + +
+ Facial expression recognition is a pivotal component in machine learning, +facilitating various applications. However, convolutional neural networks +(CNNs) are often plagued by catastrophic forgetting, impeding their +adaptability. The proposed method, emotion-centered generative replay (ECgr), +tackles this challenge by integrating synthetic images from generative +adversarial networks. Moreover, ECgr incorporates a quality assurance algorithm +to ensure the fidelity of generated images. This dual approach enables CNNs to +retain past knowledge while learning new tasks, enhancing their performance in +emotion recognition. The experimental results on four diverse facial expression +datasets demonstrate that incorporating images generated by our +pseudo-rehearsal method enhances training on the targeted dataset and the +source dataset while making the CNN retain previously learned knowledge. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ DeepLocalization: Using change point detection for Temporal Action + Localization + + +
+ In this study, we introduce DeepLocalization, an innovative framework devised
+for the real-time localization of actions tailored explicitly for monitoring
+driver behavior. Utilizing the power of advanced deep learning methodologies,
+our objective is to tackle the critical issue of distracted driving, a
+significant factor contributing to road accidents. Our strategy employs a dual
+approach: leveraging Graph-Based Change-Point Detection for pinpointing actions
+in time alongside a Video Large Language Model (Video-LLM) for precisely
+categorizing activities. Through careful prompt engineering, we customize the
+Video-LLM to adeptly handle driving activities' nuances, ensuring its
+classification efficacy even with sparse data. Engineered to be lightweight,
+our framework is optimized for consumer-grade GPUs, making it vastly applicable
+in practical scenarios. We subjected our method to rigorous testing on the
+SynDD2 dataset, a complex benchmark for distracted driving behaviors, where it
+demonstrated commendable performance, achieving 57.5% accuracy in event
+classification and 51% in event detection. These outcomes underscore the
+substantial promise of DeepLocalization in accurately identifying diverse
+driver behaviors and their temporal occurrences, all within the bounds of
+limited computational resources.
+
+
+
+
+
+ + ☆ Food Portion Estimation via 3D Object Scaling + + +
+ Image-based methods to analyze food images have alleviated the user burden +and biases associated with traditional methods. However, accurate portion +estimation remains a major challenge due to the loss of 3D information in the +2D representation of foods captured by smartphone cameras or wearable devices. +In this paper, we propose a new framework to estimate both food volume and +energy from 2D images by leveraging the power of 3D food models and physical +reference in the eating scene. Our method estimates the pose of the camera and +the food object in the input image and recreates the eating occasion by +rendering an image of a 3D model of the food with the estimated poses. We also +introduce a new dataset, SimpleFood45, which contains 2D images of 45 food +items and associated annotations including food volume, weight, and energy. Our +method achieves an average error of 31.10 kCal (17.67%) on this dataset, +outperforming existing portion estimation methods. + +
+
+
+
+
+ + ☆ Deep Gaussian mixture model for unsupervised image segmentation + + +
+ The recent emergence of deep learning has led to a great deal of work on
+designing supervised deep semantic segmentation algorithms. As in many tasks
+sufficient pixel-level labels are very difficult to obtain, we propose a method
+which combines a Gaussian mixture model (GMM) with unsupervised deep learning
+techniques. In the standard GMM, the pixel values within each sub-region are
+modelled by a Gaussian distribution. In order to identify the different
+regions, the parameter vector that minimizes the negative log-likelihood (NLL)
+function of the GMM has to be approximated. For this task, usually iterative
+optimization methods such as the expectation-maximization (EM) algorithm are
+used. In this paper, we propose to estimate these parameters directly from the
+image using a convolutional neural network (CNN). We thus change the iterative
+procedure in the EM algorithm, replacing the expectation step with a gradient
+step with respect to the network's parameters. This means that the network is
+trained to minimize the NLL function of the GMM, which comes with at least two
+advantages. First, once trained, the network is able to predict label
+probabilities very quickly compared with time-consuming iterative optimization
+methods. Second, due to the deep image prior, our method is able to partially
+overcome one of the main disadvantages of the GMM, namely that it ignores
+correlations between neighboring pixels by assuming independence between them.
+We demonstrate the advantages of our method in various experiments on the
+example of myocardial infarct segmentation on multi-sequence MRI images.
+
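+ One plausible reading of the training signal described above is sketched below: a CNN
+ outputs per-pixel mixture responsibilities, closed-form GMM parameters are derived from them,
+ and the GMM negative log-likelihood is backpropagated into the network. The closed-form
+ parameter updates and the single-channel setup are illustrative assumptions, not the authors'
+ exact formulation.
+
+ ```python
+ import math
+
+ import torch
+ import torch.nn.functional as F
+
+ def gmm_nll_loss(pixel_values, logits, eps=1e-6):
+     """GMM negative log-likelihood for one image.
+
+     pixel_values: (N, 1) grayscale intensities of the image's N pixels.
+     logits:       (N, K) CNN outputs, one score per pixel and mixture component.
+     """
+     gamma = F.softmax(logits, dim=1)                    # soft assignments, (N, K)
+     pi = gamma.mean(dim=0)                              # mixture weights, (K,)
+     w = gamma / (gamma.sum(dim=0, keepdim=True) + eps)  # per-component pixel weights
+     mu = (w * pixel_values).sum(dim=0)                  # component means, (K,)
+     var = (w * (pixel_values - mu) ** 2).sum(dim=0) + eps
+
+     log_prob = (torch.log(pi + eps)
+                 - 0.5 * math.log(2 * math.pi) - 0.5 * torch.log(var)
+                 - 0.5 * (pixel_values - mu) ** 2 / var)   # (N, K) component log-densities
+     return -torch.logsumexp(log_prob, dim=1).mean()       # minimized w.r.t. the CNN
+ ```
+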
+
+
+
+
+ + ☆ Dynamic Modality and View Selection for Multimodal Emotion Recognition + with Missing Modalities + + +
+ The study of human emotions, traditionally a cornerstone in fields like +psychology and neuroscience, has been profoundly impacted by the advent of +artificial intelligence (AI). Multiple channels, such as speech (voice) and +facial expressions (image), are crucial in understanding human emotions. +However, AI's journey in multimodal emotion recognition (MER) is marked by +substantial technical challenges. One significant hurdle is how AI models +manage the absence of a particular modality - a frequent occurrence in +real-world situations. This study's central focus is assessing the performance +and resilience of two strategies when confronted with the lack of one modality: +a novel multimodal dynamic modality and view selection and a cross-attention +mechanism. Results on the RECOLA dataset show that dynamic selection-based +methods are a promising approach for MER. In the missing modalities scenarios, +all dynamic selection-based methods outperformed the baseline. The study +concludes by emphasizing the intricate interplay between audio and video +modalities in emotion prediction, showcasing the adaptability of dynamic +selection methods in handling missing modalities. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Blind Localization and Clustering of Anomalies in Textures + + +
+ Anomaly detection and localization in images is a growing field in computer +vision. In this area, a seemingly understudied problem is anomaly clustering, +i.e., identifying and grouping different types of anomalies in a fully +unsupervised manner. In this work, we propose a novel method for clustering +anomalies in largely stationary images (textures) in a blind setting. That is, +the input consists of normal and anomalous images without distinction and +without labels. What contributes to the difficulty of the task is that +anomalous regions are often small and may present only subtle changes in +appearance, which can be easily overshadowed by the genuine variance in the +texture. Moreover, each anomaly type may have a complex appearance +distribution. We introduce a novel scheme for solving this task using a +combination of blind anomaly localization and contrastive learning. By +identifying the anomalous regions with high fidelity, we can restrict our focus +to those regions of interest; then, contrastive learning is employed to +increase the separability of different anomaly types and reduce the intra-class +variation. Our experiments show that the proposed solution yields significantly +better results compared to prior work, setting a new state of the art. Project +page: https://reality.tf.fau.de/pub/ardelean2024blind.html. + +
+
+
+
+
+ + ☆ Beyond Average: Individualized Visual Scanpath Prediction CVPR2024 + + +
+ Understanding how attention varies across individuals has significant +scientific and societal impacts. However, existing visual scanpath models treat +attention uniformly, neglecting individual differences. To bridge this gap, +this paper focuses on individualized scanpath prediction (ISP), a new attention +modeling task that aims to accurately predict how different individuals shift +their attention in diverse visual tasks. It proposes an ISP method featuring +three novel technical components: (1) an observer encoder to characterize and +integrate an observer's unique attention traits, (2) an observer-centric +feature integration approach that holistically combines visual features, task +guidance, and observer-specific characteristics, and (3) an adaptive fixation +prioritization mechanism that refines scanpath predictions by dynamically +prioritizing semantic feature maps based on individual observers' attention +traits. These novel components allow scanpath models to effectively address the +attention variations across different observers. Our method is generally +applicable to different datasets, model architectures, and visual tasks, +offering a comprehensive tool for transforming general scanpath models into +individualized ones. Comprehensive evaluations using value-based and +ranking-based metrics verify the method's effectiveness and generalizability. + +
+
+ comment: To appear in CVPR2024 +
+
+
+
+
+ + ☆ ProTA: Probabilistic Token Aggregation for Text-Video Retrieval + + +
+ Text-video retrieval aims to find the most relevant cross-modal samples for a +given query. Recent methods focus on modeling the whole spatial-temporal +relations. However, since video clips contain more diverse content than +captions, the model aligning these asymmetric video-text pairs has a high risk +of retrieving many false positive results. In this paper, we propose +Probabilistic Token Aggregation (\textit{ProTA}) to handle cross-modal +interaction with content asymmetry. Specifically, we propose dual +partial-related aggregation to disentangle and re-aggregate token +representations in both low-dimension and high-dimension spaces. We propose +token-based probabilistic alignment to generate token-level probabilistic +representation and maintain the feature representation diversity. In addition, +an adaptive contrastive loss is proposed to learn compact cross-modal +distribution space. Based on extensive experiments, \textit{ProTA} achieves +significant improvements on MSR-VTT (50.9%), LSMDC (25.8%), and DiDeMo (47.2%). + +
+
+
+
+
+ + ☆ Observation, Analysis, and Solution: Exploring Strong Lightweight Vision + Transformers via Masked Image Modeling Pre-Training + + +
+ Masked image modeling (MIM) pre-training for large-scale vision transformers +(ViTs) in computer vision has enabled promising downstream performance on top +of the learned self-supervised ViT features. In this paper, we question if the +extremely simple ViTs' fine-tuning performance with a small-scale architecture +can also benefit from this pre-training paradigm, which is considerably less +studied yet in contrast to the well-established lightweight architecture design +methodology with sophisticated components introduced. By carefully adapting +various typical MIM pre-training methods to this lightweight regime and +comparing them with the contrastive learning (CL) pre-training on various +downstream image classification and dense prediction tasks, we systematically +observe different behaviors between MIM and CL with respect to the downstream +fine-tuning data scales. Furthermore, we analyze the frozen features under +linear probing evaluation and also the layer representation similarities and +attention maps across the obtained models, which clearly show the inferior +learning of MIM pre-training on higher layers, leading to unsatisfactory +fine-tuning performance on data-insufficient downstream tasks. This finding is +naturally a guide to choosing appropriate distillation strategies during +pre-training to solve the above deterioration problem. Extensive experiments on +various vision tasks demonstrate the effectiveness of our +observation-analysis-solution flow. In particular, our pre-training with +distillation on pure lightweight ViTs with vanilla/hierarchical design +(5.7M/6.5M) can achieve 79.4%/78.9% top-1 accuracy on ImageNet-1K. It also +enables SOTA performance on the ADE20K semantic segmentation task (42.8% mIoU) +and LaSOT visual tracking task (66.1% AUC) in the lightweight regime. The +latter even surpasses all the current SOTA lightweight CPU-realtime trackers. + +
+
+
+
+
+ + ☆ Partial-to-Partial Shape Matching with Geometric Consistency + + +
+ Finding correspondences between 3D shapes is an important and long-standing
+problem in computer vision, graphics and beyond. A prominent challenge is the
+partial-to-partial shape matching setting, which occurs when the shapes to
+match are only observed incompletely (e.g. from 3D scanning). Although
+partial-to-partial matching is a highly relevant setting in practice, it is
+rarely explored. Our work bridges the gap between existing (rather artificial)
+3D full shape matching and partial-to-partial real-world settings by exploiting
+geometric consistency as a strong constraint. We demonstrate that it is indeed
+possible to solve this challenging problem in a variety of settings. For the
+first time, we achieve geometric consistency for partial-to-partial matching,
+which is realized by a novel integer non-linear program formalism building on
+triangle product spaces, along with a new pruning algorithm based on linear
+integer programming. Further, we generate a new inter-class dataset for
+partial-to-partial shape matching. We show that our method outperforms current
+SOTA methods on both an established intra-class dataset and our novel
+inter-class dataset.
+
+
+
+
+
+ + ☆ GraFIQs: Face Image Quality Assessment Using Gradient Magnitudes CVPR + + +
+ Face Image Quality Assessment (FIQA) estimates the utility of face images for
+automated face recognition (FR) systems. We propose in this work a novel
+approach to assess the quality of face images based on inspecting the required
+changes in the pre-trained FR model weights to minimize differences between
+testing samples and the distribution of the FR training dataset. To achieve
+that, we propose quantifying the discrepancy in Batch Normalization statistics
+(BNS), including mean and variance, between those recorded during FR training
+and those obtained by processing testing samples through the pretrained FR
+model. We then generate gradient magnitudes of pretrained FR weights by
+backpropagating the BNS through the pretrained model. The cumulative absolute
+sum of these gradient magnitudes serves as the FIQ for our approach. Through
+comprehensive experimentation, we demonstrate the effectiveness of our
+training-free and quality labeling-free approach, achieving competitive
+performance to recent state-of-the-art FIQA approaches without relying on
+quality labeling, the need to train regression networks, specialized
+architectures, or designing and optimizing specific loss functions.
+
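+ The gradient-magnitude score described above can be sketched roughly as follows: forward
+ hooks compare test-time activation statistics with the stored BatchNorm running statistics,
+ the summed discrepancy is backpropagated, and the absolute weight gradients are accumulated
+ into a single quality value. The hook placement and the squared-error form of the discrepancy
+ are my assumptions, not the reference implementation.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ def grafiqs_style_score(fr_model: nn.Module, image: torch.Tensor) -> float:
+     """Quality score from gradients induced by the BN-statistics discrepancy.
+
+     Rough sketch: `image` is a (C, H, W) tensor already aligned and normalized
+     the way the FR model expects.
+     """
+     fr_model.eval()
+     gaps = []
+
+     def hook(module, inputs, output):
+         x = inputs[0]
+         dims = [d for d in range(x.dim()) if d != 1]          # all dims except channels
+         mean, var = x.mean(dim=dims), x.var(dim=dims, unbiased=False)
+         gaps.append(((mean - module.running_mean) ** 2).sum()
+                     + ((var - module.running_var) ** 2).sum())
+
+     handles = [m.register_forward_hook(hook) for m in fr_model.modules()
+                if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d))]
+     fr_model.zero_grad()
+     fr_model(image.unsqueeze(0))                  # forward pass fills `gaps`
+     torch.stack(gaps).sum().backward()            # backpropagate the BN discrepancy
+     for h in handles:
+         h.remove()
+     return float(sum(p.grad.abs().sum() for p in fr_model.parameters()
+                      if p.grad is not None))
+ ```
+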
+
+ comment: Accepted at CVPR Workshop 2024 +
+
+
+
+
+ + ☆ Aligning Actions and Walking to LLM-Generated Textual Descriptions + + +
+ Large Language Models (LLMs) have demonstrated remarkable capabilities in +various domains, including data augmentation and synthetic data generation. +This work explores the use of LLMs to generate rich textual descriptions for +motion sequences, encompassing both actions and walking patterns. We leverage +the expressive power of LLMs to align motion representations with high-level +linguistic cues, addressing two distinct tasks: action recognition and +retrieval of walking sequences based on appearance attributes. For action +recognition, we employ LLMs to generate textual descriptions of actions in the +BABEL-60 dataset, facilitating the alignment of motion sequences with +linguistic representations. In the domain of gait analysis, we investigate the +impact of appearance attributes on walking patterns by generating textual +descriptions of motion sequences from the DenseGait dataset using LLMs. These +descriptions capture subtle variations in walking styles influenced by factors +such as clothing choices and footwear. Our approach demonstrates the potential +of LLMs in augmenting structured motion attributes and aligning multi-modal +representations. The findings contribute to the advancement of comprehensive +motion understanding and open up new avenues for leveraging LLMs in multi-modal +alignment and data augmentation for motion analysis. We make the code publicly +available at https://github.com/Radu1999/WalkAndText + +
+
+ comment: Accepted at 2nd Workshop on Learning with Few or without Annotated + Face, Body and Gesture Data +
+
+
+
+
+ + ☆ Gait Recognition from Highly Compressed Videos + + +
+ Surveillance footage represents a valuable resource and opportunities for +conducting gait analysis. However, the typical low quality and high noise +levels in such footage can severely impact the accuracy of pose estimation +algorithms, which are foundational for reliable gait analysis. Existing +literature suggests a direct correlation between the efficacy of pose +estimation and the subsequent gait analysis results. A common mitigation +strategy involves fine-tuning pose estimation models on noisy data to improve +robustness. However, this approach may degrade the downstream model's +performance on the original high-quality data, leading to a trade-off that is +undesirable in practice. We propose a processing pipeline that incorporates a +task-targeted artifact correction model specifically designed to pre-process +and enhance surveillance footage before pose estimation. Our artifact +correction model is optimized to work alongside a state-of-the-art pose +estimation network, HRNet, without requiring repeated fine-tuning of the pose +estimation model. Furthermore, we propose a simple and robust method for +obtaining low quality videos that are annotated with poses in an automatic +manner with the purpose of training the artifact correction model. We +systematically evaluate the performance of our artifact correction model +against a range of noisy surveillance data and demonstrate that our approach +not only achieves improved pose estimation on low-quality surveillance footage, +but also preserves the integrity of the pose estimation on high resolution +footage. Our experiments show a clear enhancement in gait analysis performance, +supporting the viability of the proposed method as a superior alternative to +direct fine-tuning strategies. Our contributions pave the way for more reliable +gait analysis using surveillance data in real-world applications, regardless of +data quality. + +
+
+ comment: Accepted at 2nd Workshop on Learning with Few or without Annotated + Face, Body and Gesture Data +
+
+
+
+
+ + ☆ How to Benchmark Vision Foundation Models for Semantic Segmentation? CVPR 2024 + + +
+ Recent vision foundation models (VFMs) have demonstrated proficiency in +various tasks but require supervised fine-tuning to perform the task of +semantic segmentation effectively. Benchmarking their performance is essential +for selecting current models and guiding future model developments for this +task. The lack of a standardized benchmark complicates comparisons. Therefore, +the primary objective of this paper is to study how VFMs should be benchmarked +for semantic segmentation. To do so, various VFMs are fine-tuned under various +settings, and the impact of individual settings on the performance ranking and +training time is assessed. Based on the results, the recommendation is to +fine-tune the ViT-B variants of VFMs with a 16x16 patch size and a linear +decoder, as these settings are representative of using a larger model, more +advanced decoder and smaller patch size, while reducing training time by more +than 13 times. Using multiple datasets for training and evaluation is also +recommended, as the performance ranking across datasets and domain shifts +varies. Linear probing, a common practice for some VFMs, is not recommended, as +it is not representative of end-to-end fine-tuning. The benchmarking setup +recommended in this paper enables a performance analysis of VFMs for semantic +segmentation. The findings of such an analysis reveal that pretraining with +promptable segmentation is not beneficial, whereas masked image modeling (MIM) +with abstract representations is crucial, even more important than the type of +supervision used. The code for efficiently fine-tuning VFMs for semantic +segmentation can be accessed through the project page at: +https://tue-mps.github.io/benchmark-vfm-ss/. + +
+
+ comment: CVPR 2024 Workshop Proceedings for the Second Workshop on Foundation + Models +
+
+
+
+
+ + ☆ Real-World Efficient Blind Motion Deblurring via Blur Pixel + Discretization CVPR2024 + + +
+ As recent advances in mobile camera technology have enabled the capability to +capture high-resolution images, such as 4K images, the demand for an efficient +deblurring model handling large motion has increased. In this paper, we +discover that the image residual errors, i.e., blur-sharp pixel differences, +can be grouped into some categories according to their motion blur type and how +complex their neighboring pixels are. Inspired by this, we decompose the +deblurring (regression) task into blur pixel discretization (pixel-level blur +classification) and discrete-to-continuous conversion (regression with blur +class map) tasks. Specifically, we generate the discretized image residual +errors by identifying the blur pixels and then transform them to a continuous +form, which is computationally more efficient than naively solving the original +regression problem with continuous values. Here, we found that the +discretization result, i.e., blur segmentation map, remarkably exhibits visual +similarity with the image residual errors. As a result, our efficient model +shows comparable performance to state-of-the-art methods in realistic +benchmarks, while our method is up to 10 times computationally more efficient. + +
+
+ comment: CVPR2024 Camera-Ready +
+
+
+
+
+ + ☆ StyleBooth: Image Style Editing with Multimodal Instruction + + +
+ Given an original image, image editing aims to generate an image that aligns
+with the provided instruction. The challenges are accepting multimodal inputs
+as instructions and the scarcity of high-quality training data, including
+crucial triplets of source/target image pairs and multimodal (text and image)
+instructions. In this paper, we focus on image style editing and present
+StyleBooth, a method that proposes a comprehensive framework for image editing
+and a feasible strategy for building a high-quality style editing dataset. We
+integrate encoded textual instruction and image exemplar as a unified condition
+for the diffusion model, enabling the editing of the original image following
+multimodal instructions. Furthermore, by iterative style-destyle tuning and
+editing and usability filtering, the StyleBooth dataset provides
+content-consistent stylized/plain image pairs in various categories of styles.
+To show the flexibility of StyleBooth, we conduct experiments on diverse tasks,
+such as text-based style editing, exemplar-based style editing and
+compositional style editing. The results demonstrate that the quality and
+variety of training data significantly enhance the ability to preserve content
+and improve the overall quality of generated images in editing tasks. Project
+page can be found at https://ali-vilab.github.io/stylebooth-page/.
+
+
+
+
+
+ + ☆ Omniview-Tuning: Boosting Viewpoint Invariance of Vision-Language + Pre-training Models + + +
+ Vision-Language Pre-training (VLP) models like CLIP have achieved remarkable +success in computer vision and particularly demonstrated superior robustness to +distribution shifts of 2D images. However, their robustness under 3D viewpoint +variations is still limited, which can hinder the development for real-world +applications. This paper successfully addresses this concern while keeping +VLPs' original performance by breaking through two primary obstacles: 1) the +scarcity of training data and 2) the suboptimal fine-tuning paradigms. To +combat data scarcity, we build the Multi-View Caption (MVCap) dataset -- a +comprehensive collection of over four million multi-view image-text pairs +across more than 100K objects, providing more potential for VLP models to +develop generalizable viewpoint-invariant representations. To address the +limitations of existing paradigms in performance trade-offs and training +efficiency, we design a novel fine-tuning framework named Omniview-Tuning +(OVT). Specifically, OVT introduces a Cross-Viewpoint Alignment objective +through a minimax-like optimization strategy, which effectively aligns +representations of identical objects from diverse viewpoints without causing +overfitting. Additionally, OVT fine-tunes VLP models in a parameter-efficient +manner, leading to minimal computational cost. Extensive experiments on various +VLP models with different architectures validate that OVT significantly +improves the models' resilience to viewpoint shifts and keeps the original +performance, establishing a pioneering standard for boosting the viewpoint +invariance of VLP models. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ One-Shot Sequential Federated Learning for Non-IID Data by Enhancing + Local Model Diversity + + +
+ Traditional federated learning mainly focuses on parallel settings (PFL), +which can suffer significant communication and computation costs. In contrast, +one-shot and sequential federated learning (SFL) have emerged as innovative +paradigms to alleviate these costs. However, the issue of non-IID (Independent +and Identically Distributed) data persists as a significant challenge in +one-shot and SFL settings, exacerbated by the restricted communication between +clients. In this paper, we improve the one-shot sequential federated learning +for non-IID data by proposing a local model diversity-enhancing strategy. +Specifically, to leverage the potential of local model diversity for improving +model performance, we introduce a local model pool for each client that +comprises diverse models generated during local training, and propose two +distance measurements to further enhance the model diversity and mitigate the +effect of non-IID data. Consequently, our proposed framework can improve the +global model performance while maintaining low communication costs. Extensive +experiments demonstrate that our method exhibits superior performance to +existing one-shot PFL methods and achieves better accuracy compared with +state-of-the-art one-shot SFL methods on both label-skew and domain-shift tasks +(e.g., 6%+ accuracy improvement on the CIFAR-10 dataset). + +
+
+
+
+
+ + ☆ Fortify the Guardian, Not the Treasure: Resilient Adversarial Detectors + + +
+ This paper presents RADAR (Robust Adversarial Detection via Adversarial
+Retraining), an approach designed to enhance the robustness of adversarial
+detectors against adaptive attacks, while maintaining classifier performance.
+An adaptive attack is one where the attacker is aware of the defenses and
+adapts their strategy accordingly. Our proposed method leverages adversarial
+training to reinforce the ability to detect attacks, without compromising clean
+accuracy. During the training phase, we integrate into the dataset adversarial
+examples, which were optimized to fool both the classifier and the adversarial
+detector, enabling the adversarial detector to learn and adapt to potential
+attack scenarios. Experimental evaluations on the CIFAR-10 and SVHN datasets
+demonstrate that our proposed algorithm significantly improves a detector's
+ability to accurately identify adaptive adversarial attacks -- without
+sacrificing clean accuracy.
+
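+ Adaptive adversarial examples of the kind used for retraining above could be generated along
+ the lines of the PGD sketch below, which attacks a combined objective so that each example
+ both fools the classifier and keeps the detector's adversarial score low. The detector
+ interface (higher output meaning "flagged as adversarial"), the loss weighting, and the PGD
+ settings are assumptions, not the paper's recipe.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def joint_pgd(classifier, detector, x, y, eps=8 / 255, alpha=2 / 255, steps=10):
+     """Craft PGD examples that both fool the classifier and evade the detector."""
+     x_adv = (x.detach() + torch.empty_like(x).uniform_(-eps, eps)).clamp(0, 1)
+     for _ in range(steps):
+         x_adv.requires_grad_(True)
+         fool_loss = F.cross_entropy(classifier(x_adv), y)   # push toward misclassification
+         evade_score = detector(x_adv).squeeze(1).mean()     # assumed (N, 1) "adversarial" score
+         grad = torch.autograd.grad(fool_loss - evade_score, x_adv)[0]
+         x_adv = x_adv.detach() + alpha * grad.sign()
+         x_adv = torch.max(torch.min(x_adv, x + eps), x - eps).clamp(0, 1)
+     return x_adv.detach()   # added to training, labeled "adversarial" for the detector
+ ```
+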
+
+
+
+
+ + ☆ Ethical-Lens: Curbing Malicious Usages of Open-Source Text-to-Image + Models + + +
+ The burgeoning landscape of text-to-image models, exemplified by innovations +such as Midjourney and DALLE 3, has revolutionized content creation across +diverse sectors. However, these advancements bring forth critical ethical +concerns, particularly with the misuse of open-source models to generate +content that violates societal norms. Addressing this, we introduce +Ethical-Lens, a framework designed to facilitate the value-aligned usage of +text-to-image tools without necessitating internal model revision. Ethical-Lens +ensures value alignment in text-to-image models across toxicity and bias +dimensions by refining user commands and rectifying model outputs. Systematic +evaluation metrics, combining GPT4-V, HEIM, and FairFace scores, assess +alignment capability. Our experiments reveal that Ethical-Lens enhances +alignment capabilities to levels comparable with or superior to commercial +models like DALLE 3, ensuring user-generated content adheres to ethical +standards while maintaining image quality. This study indicates the potential +of Ethical-Lens to ensure the sustainable development of open-source +text-to-image tools and their beneficial integration into society. Our code is +available at https://github.com/yuzhu-cai/Ethical-Lens. + +
+
+ comment: 42 pages, 17 figures, 29 tables +
+
+
+
+
+ + ☆ S3R-Net: A Single-Stage Approach to Self-Supervised Shadow Removal CVPR 2024 + + +
+ In this paper we present S3R-Net, the Self-Supervised Shadow Removal Network.
+The two-branch WGAN model achieves self-supervision relying on the
+unify-and-adapt phenomenon: it unifies the style of the output data and infers
+its characteristics from a database of unaligned shadow-free reference images.
+This approach stands in contrast to the large body of supervised frameworks.
+S3R-Net also differentiates itself from the few existing self-supervised models
+operating in a cycle-consistent manner, as it is a non-cyclic, unidirectional
+solution. The proposed framework achieves comparable numerical scores to recent
+self-supervised shadow removal models while exhibiting superior qualitative
+performance and keeping the computational cost low.
+
+
+ comment: NTIRE workshop @ CVPR 2024. Code & models available at + https://github.com/n-kubiak/S3R-Net +
+
+
+
+
+ + ☆ Harnessing Joint Rain-/Detail-aware Representations to Eliminate + Intricate Rains + + +
+ Recent advances in image deraining have focused on training powerful models
+on mixed multiple datasets comprising diverse rain types and backgrounds.
+However, this approach tends to overlook the inherent differences among rainy
+images, leading to suboptimal results. To overcome this limitation, we focus on
+addressing various rainy images by delving into meaningful representations that
+encapsulate both the rain and background components. Leveraging these
+representations as instructive guidance, we put forth a Context-based
+Instance-level Modulation (CoI-M) mechanism adept at efficiently modulating
+CNN- or Transformer-based models. Furthermore, we devise a rain-/detail-aware
+contrastive learning strategy to help extract joint rain-/detail-aware
+representations. By integrating CoI-M with the rain-/detail-aware contrastive
+learning, we develop CoIC, an innovative and potent algorithm tailored for
+training models on mixed datasets. Moreover, CoIC offers insight into modeling
+relationships of datasets, quantitatively assessing the impact of rain and
+details on restoration, and unveiling distinct behaviors of models given
+diverse inputs. Extensive experiments validate the efficacy of CoIC in boosting
+the deraining ability of CNN and Transformer models. CoIC also enhances the
+deraining prowess remarkably when a real-world dataset is included.
+
+
+ comment: 21 pages, 14 figures +
+
+
+
+
+ + ☆ MambaPupil: Bidirectional Selective Recurrent model for Event-based Eye + tracking CVPR 2024 + + +
+ Event-based eye tracking has shown great promise with the high temporal
+resolution and low redundancy provided by the event camera. However, the
+diversity and abruptness of eye movement patterns, including blinking,
+fixating, saccades, and smooth pursuit, pose significant challenges for eye
+localization. To achieve a stable event-based eye-tracking system, this paper
+proposes a bidirectional long-term sequence modeling and time-varying state
+selection mechanism to fully utilize contextual temporal information in
+response to the variability of eye movements. Specifically, the MambaPupil
+network is proposed, which consists of a multi-layer convolutional encoder to
+extract features from the event representations, a bidirectional Gated
+Recurrent Unit (GRU), and a Linear Time-Varying State Space Module (LTV-SSM),
+to selectively capture contextual correlation from the forward and backward
+temporal relationship. Furthermore, the Bina-rep is utilized as a compact event
+representation, and a tailor-made data augmentation, called Event-Cutout, is
+proposed to enhance the model's robustness by applying spatial random masking
+to the event image. The evaluation on the ThreeET-plus benchmark shows the
+superior performance of MambaPupil, which secured 1st place in the CVPR 2024
+AIS Event-based Eye Tracking challenge.
+
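+ The Event-Cutout augmentation mentioned above amounts to spatial random masking of the event
+ representation; a generic Cutout-style sketch is given below, where the mask-size range and
+ the single rectangular mask are assumptions rather than the paper's exact recipe.
+
+ ```python
+ import torch
+
+ def event_cutout(event_frame: torch.Tensor, max_frac: float = 0.3) -> torch.Tensor:
+     """Randomly zero out one rectangular spatial region of an event frame (C, H, W)."""
+     c, h, w = event_frame.shape
+     mh = int(h * max_frac * torch.rand(1).item()) + 1   # mask height
+     mw = int(w * max_frac * torch.rand(1).item()) + 1   # mask width
+     top = torch.randint(0, h - mh + 1, (1,)).item()
+     left = torch.randint(0, w - mw + 1, (1,)).item()
+     out = event_frame.clone()
+     out[:, top:top + mh, left:left + mw] = 0.0          # events in the patch are dropped
+     return out
+ ```
+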
+
+ comment: Accepted by CVPR 2024 Workshop (AIS: Vision, Graphics and AI for + Streaming), top solution of challenge Event-based Eye Tracking, see + https://www.kaggle.com/competitions/event-based-eye-tracking-ais2024 +
+
+
+
+
+ + ☆ MaskCD: A Remote Sensing Change Detection Network Based on Mask + Classification + + +
+ Change detection (CD) from remote sensing (RS) images using deep learning has
+been widely investigated in the literature. It is typically regarded as a
+pixel-wise labeling task that aims to classify each pixel as changed or
+unchanged. Although per-pixel classification networks in encoder-decoder
+structures have shown dominance, they still suffer from imprecise boundaries
+and incomplete object delineation in various scenes. For high-resolution RS
+images, partly or totally changed objects are more worthy of attention rather
+than a single pixel. Therefore, we revisit the CD task from the mask prediction
+and classification perspective and propose MaskCD to detect changed areas by
+adaptively generating categorized masks from input image pairs. Specifically,
+it utilizes a cross-level change representation perceiver (CLCRP) to learn
+multiscale change-aware representations and capture spatiotemporal relations
+from encoded features by exploiting deformable multihead self-attention
+(DeformMHSA). Subsequently, a masked-attention-based detection transformer
+(MA-DETR) decoder is developed to accurately locate and identify changed
+objects based on masked attention and self-attention mechanisms. It
+reconstructs the desired changed objects by decoding the pixel-wise
+representations into learnable mask proposals and making final predictions from
+these candidates. Experimental results on five benchmark datasets demonstrate
+that the proposed approach outperforms other state-of-the-art models. Codes and
+pretrained models are available online (https://github.com/EricYu97/MaskCD).
+
+
+
+
+
+ + ☆ PureForest: A Large-scale Aerial Lidar and Aerial Imagery Dataset for + Tree Species Classification in Monospecific Forests + + +
+ Knowledge of tree species distribution is fundamental to managing forests. +New deep learning approaches promise significant accuracy gains for forest +mapping, and are becoming a critical tool for mapping multiple tree species at +scale. To advance the field, deep learning researchers need large benchmark +datasets with high-quality annotations. To this end, we present the PureForest +dataset: a large-scale, open, multimodal dataset designed for tree species +classification from both Aerial Lidar Scanning (ALS) point clouds and Very High +Resolution (VHR) aerial images. Most current public Lidar datasets for tree +species classification have low diversity as they only span a small area of a +few dozen annotated hectares at most. In contrast, PureForest has 18 tree +species grouped into 13 semantic classes, and spans 339 km$^2$ across 449 +distinct monospecific forests, and is to date the largest and most +comprehensive Lidar dataset for the identification of tree species. By making +PureForest publicly available, we hope to provide a challenging benchmark +dataset to support the development of deep learning approaches for tree species +identification from Lidar and/or aerial imagery. In this data paper, we +describe the annotation workflow, the dataset, the recommended evaluation +methodology, and establish a baseline performance from both 3D and 2D +modalities. + +
+
+ comment: 14 pages | 5 figures | Dataset is available at + http://huggingface.co/datasets/IGNF/PureForest +
+
+
+
+
+ + ☆ MIDGET: Music Conditioned 3D Dance Generation + + +
+ In this paper, we introduce MIDGET, a MusIc conditioned 3D Dance GEneraTion
+model based on a Dance motion Vector Quantised Variational AutoEncoder
+(VQ-VAE) model and a Motion Generative Pre-Training (GPT) model, which
+generates vibrant and high-quality dances that match the music rhythm. To
+tackle challenges in the field, we introduce three new components: 1) a
+pre-trained memory codebook based on the Motion VQ-VAE model to store different
+human pose codes, 2) employing the Motion GPT model to generate pose codes with
+music and motion encoders, 3) a simple framework for music feature extraction.
+We compare with existing state-of-the-art models and perform ablation
+experiments on AIST++, the largest publicly available music-dance dataset.
+Experiments demonstrate that our proposed framework achieves state-of-the-art
+performance on motion quality and its alignment with the music.
+
+
+ comment: 12 pages, 6 figures Published in AI 2023: Advances in Artificial + Intelligence +
+
+
+
+
+ + ☆ Improving the perception of visual fiducial markers in the field using + Adaptive Active Exposure Control + + +
+ Accurate localization is fundamental for autonomous underwater vehicles
+(AUVs) to carry out precise tasks, such as manipulation and construction.
+Vision-based solutions using fiducial markers are promising, but extremely
+challenging because of the harsh underwater lighting conditions. This paper
+introduces a gradient-based active camera exposure control method to tackle
+sharp lighting variations during image acquisition, which can establish a
+better foundation for subsequent image enhancement procedures. Considering a
+typical scenario for underwater operations where visual tags are used, we
+conducted several experiments comparing our method with other state-of-the-art
+exposure control methods, including Active Exposure Control (AEC) and
+Gradient-based Exposure Control (GEC). Results show a significant improvement
+in the accuracy of robot localization. This method is an important component
+that can be used in a vision-based state estimation pipeline to improve the
+overall localization accuracy.
+
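+ A gradient-based exposure controller of the general kind described above can be outlined as
+ hill-climbing on an image-gradient sharpness metric; the camera capture interface, step
+ sizes, and exposure bounds in the sketch below are hypothetical and are not taken from the
+ paper's formulation.
+
+ ```python
+ import numpy as np
+
+ def gradient_metric(gray: np.ndarray) -> float:
+     """Mean gradient magnitude of a grayscale image; higher = better-exposed edges."""
+     gy, gx = np.gradient(gray.astype(np.float64))
+     return float(np.mean(np.hypot(gx, gy)))
+
+ def update_exposure(capture, exposure_ms, delta_ms=0.5, gain=4.0, lo=0.1, hi=30.0):
+     """One hill-climbing step on the gradient metric.
+
+     `capture(exposure_ms)` is a placeholder for the camera driver call that
+     returns a grayscale frame taken with the given exposure time.
+     """
+     m_plus = gradient_metric(capture(exposure_ms + delta_ms))
+     m_minus = gradient_metric(capture(exposure_ms - delta_ms))
+     slope = (m_plus - m_minus) / (2.0 * delta_ms)     # finite-difference gradient estimate
+     return float(np.clip(exposure_ms + gain * slope, lo, hi))
+ ```
+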
+
+ comment: Paper accepted by ISER 2023 +
+
+
+
+
+ + ☆ Data-free Knowledge Distillation for Fine-grained Visual Categorization + + +
+ Data-free knowledge distillation (DFKD) is a promising approach for +addressing issues related to model compression, security privacy, and +transmission restrictions. Although the existing methods exploiting DFKD have +achieved inspiring achievements in coarse-grained classification, in practical +applications involving fine-grained classification tasks that require more +detailed distinctions between similar categories, sub-optimal results are +obtained. To address this issue, we propose an approach called DFKD-FGVC that +extends DFKD to fine-grained visual categorization~(FGVC) tasks. Our approach +utilizes an adversarial distillation framework with attention generator, mixed +high-order attention distillation, and semantic feature contrast learning. +Specifically, we introduce a spatial-wise attention mechanism to the generator +to synthesize fine-grained images with more details of discriminative parts. We +also utilize the mixed high-order attention mechanism to capture complex +interactions among parts and the subtle differences among discriminative +features of the fine-grained categories, paying attention to both local +features and semantic context relationships. Moreover, we leverage the teacher +and student models of the distillation framework to contrast high-level +semantic feature maps in the hyperspace, comparing variances of different +categories. We evaluate our approach on three widely-used FGVC benchmarks +(Aircraft, Cars196, and CUB200) and demonstrate its superior performance. + +
+
+
+
+
+ + ☆ MLS-Track: Multilevel Semantic Interaction in RMOT + + +
+ The new trend in the multi-object tracking task is to track objects of
+interest using natural language. However, the scarcity of paired
+prompt-instance data hinders its progress. To address this challenge, we
+propose a high-quality yet low-cost data generation method based on Unreal
+Engine 5 and construct a brand-new benchmark dataset, named Refer-UE-City,
+which primarily includes scenes from intersection surveillance videos,
+detailing the appearance and actions of people and vehicles. Specifically, it
+provides 14 videos with a total of 714 expressions, and is comparable in scale
+to the Refer-KITTI dataset. Additionally, we propose a multi-level
+semantic-guided multi-object framework called MLS-Track, where the interaction
+between the model and text is enhanced layer by layer through the introduction
+of a Semantic Guidance Module (SGM) and a Semantic Correlation Branch (SCB).
+Extensive experiments on the Refer-UE-City and Refer-KITTI datasets demonstrate
+the effectiveness of our proposed framework, and it achieves state-of-the-art
+performance. Code and datasets will be made available.
+
+
+ comment: 17 pages 8 figures +
+
+
+
+
+ + ☆ Meta-Auxiliary Learning for Micro-Expression Recognition + + +
+ Micro-expressions (MEs) are involuntary movements revealing people's hidden
+feelings, which have attracted considerable interest due to their objectivity
+in emotion detection. However, despite its wide applications in various
+scenarios, micro-expression recognition (MER) remains a challenging problem in
+real life due to three reasons: (i) data-level: lack of data and imbalanced
+classes, (ii) feature-level: subtle, rapidly changing, and complex features of
+MEs, and (iii) decision-making-level: impact of individual differences. To
+address these issues, we propose a dual-branch meta-auxiliary learning method,
+called LightmanNet, for fast and robust micro-expression recognition.
+Specifically, LightmanNet learns general MER knowledge from limited data
+through a dual-branch bi-level optimization process: (i) In the first level, it
+obtains task-specific MER knowledge by learning in two branches, where the
+first branch is for learning MER features via primary MER tasks, while the
+other branch is for guiding the model to obtain discriminative features via
+auxiliary tasks, i.e., image alignment between micro-expressions and
+macro-expressions, given their resemblance in both spatial and temporal
+behavioral patterns. The two learning branches jointly constrain the model to
+learn meaningful task-specific MER knowledge while avoiding learning noise or
+superficial connections between MEs and emotions that may damage its
+generalization ability. (ii) In the second level, LightmanNet further refines
+the learned task-specific knowledge, improving model generalization and
+efficiency. Extensive experiments on various benchmark datasets demonstrate the
+superior robustness and efficiency of LightmanNet.
+
+
+ comment: 10 pages, 7 figures, 3 tables +
+
+
+
+
+ + ☆ Look, Listen, and Answer: Overcoming Biases for Audio-Visual Question + Answering + + +
+ Audio-Visual Question Answering (AVQA) is a complex multi-modal reasoning +task, demanding intelligent systems to accurately respond to natural language +queries based on audio-video input pairs. Nevertheless, prevalent AVQA +approaches are prone to overlearning dataset biases, resulting in poor +robustness. Furthermore, current datasets may not provide a precise diagnostic +for these methods. To tackle these challenges, firstly, we propose a novel +dataset, \textit{MUSIC-AVQA-R}, crafted in two steps: rephrasing questions +within the test split of a public dataset (\textit{MUSIC-AVQA}) and +subsequently introducing distribution shifts to split questions. The former +leads to a large, diverse test space, while the latter results in a +comprehensive robustness evaluation on rare, frequent, and overall questions. +Secondly, we propose a robust architecture that utilizes a multifaceted cycle +collaborative debiasing strategy to overcome bias learning. Experimental +results show that this architecture achieves state-of-the-art performance on +both datasets, especially obtaining a significant improvement of 9.68\% on the +proposed dataset. Extensive ablation experiments are conducted on these two +datasets to validate the effectiveness of the debiasing strategy. Additionally, +we highlight the limited robustness of existing multi-modal QA methods through +the evaluation on our dataset. + +
+
+ comment: 16 pages, 9 figures, 5 tables
+
+
+
+
+ + ☆ What does CLIP know about peeling a banana? CVPR2024 + + +
+ Humans show an innate capability to identify tools to support specific
+actions. The association between object parts and the actions they facilitate
+is usually named affordance. Being able to segment object parts depending on
+the tasks they afford is crucial to enable intelligent robots to use objects of
+daily living. Traditional supervised learning methods for affordance
+segmentation require costly pixel-level annotations, while weakly supervised
+approaches, though less demanding, still rely on object-interaction examples
+and support a closed set of actions. These limitations hinder scalability, may
+introduce biases, and usually restrict models to a limited set of predefined
+actions. This paper proposes AffordanceCLIP to overcome these limitations by
+leveraging the implicit affordance knowledge embedded within large pre-trained
+Vision-Language models like CLIP. We experimentally demonstrate that CLIP,
+although not explicitly trained for affordance detection, retains valuable
+information for the task. Our AffordanceCLIP achieves competitive zero-shot
+performance compared to methods with specialized training, while offering
+several advantages: i) it works with any action prompt, not just a predefined
+set; ii) it requires training only a small number of additional parameters
+compared to existing solutions; and iii) it eliminates the need for direct
+supervision on action-object pairs, opening new perspectives for
+functionality-based reasoning of models.
+
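+ The abstract's core claim, that off-the-shelf CLIP already carries affordance
+cues, can be probed with a rough zero-shot sketch using Hugging Face CLIP:
+compare the text embedding of an action prompt against per-patch visual
+features to get a coarse heatmap. This is only an illustration of that prior
+under assumed defaults (openai/clip-vit-base-patch32, a placeholder image), not
+the AffordanceCLIP pipeline.
+
+import torch
+from PIL import Image
+from transformers import CLIPModel, CLIPProcessor
+
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").eval()
+proc = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+image = Image.new("RGB", (224, 224))            # replace with a real photo of an object
+prompt = "the part of the banana you hold while peeling it"
+inputs = proc(text=[prompt], images=image, return_tensors="pt", padding=True)
+
+with torch.no_grad():
+    txt = model.get_text_features(input_ids=inputs["input_ids"],
+                                  attention_mask=inputs["attention_mask"])
+    vis = model.vision_model(pixel_values=inputs["pixel_values"])
+    patches = vis.last_hidden_state[:, 1:, :]    # per-patch tokens, CLS dropped
+    patches = model.visual_projection(model.vision_model.post_layernorm(patches))
+
+# Cosine similarity between every patch and the action prompt -> coarse 7x7 heatmap.
+txt = torch.nn.functional.normalize(txt, dim=-1)
+patches = torch.nn.functional.normalize(patches, dim=-1)
+heatmap = (patches @ txt.T).squeeze(-1).reshape(1, 7, 7)
+print(heatmap.shape)   # upsample to the image size for a rough affordance map
+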
+
+ comment: Accepted to MAR Workshop at CVPR2024 +
+
+
+
+
+ + ☆ Curriculum Point Prompting for Weakly-Supervised Referring Image + Segmentation CVPR 2024 + + +
+ Referring image segmentation (RIS) aims to precisely segment referents in
+images through corresponding natural language expressions, yet it relies on
+cost-intensive mask annotations. Weakly supervised RIS thus learns pixel-level
+semantics from image-text pairs alone, which makes segmenting fine-grained
+masks challenging. A natural approach to enhancing segmentation precision is
+to empower weakly supervised RIS with the image segmentation foundation model
+SAM. Nevertheless, we observe that simply integrating SAM yields limited
+benefits and can even lead to performance regression due to inevitable noise
+issues and an excessive focus on object parts. In this paper, we present an
+innovative framework, Point PrompTing (PPT), incorporated with the proposed
+multi-source curriculum learning strategy to address these challenges.
+Specifically, the core of PPT is a point generator that not only harnesses
+CLIP's text-image alignment capability and SAM's powerful mask generation
+ability but also generates negative point prompts to address the noise and
+excessive-focus issues inherently and effectively. In addition, we introduce a
+curriculum learning strategy with object-centric images to help PPT gradually
+learn from simpler yet precise semantic alignment to more complex RIS.
+Experiments demonstrate that our PPT significantly and consistently outperforms
+prior weakly supervised techniques on mIoU by 11.34%, 14.14%, and 6.97% across
+RefCOCO, RefCOCO+, and G-Ref, respectively.
+
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ MultiPhys: Multi-Person Physics-aware 3D Motion Estimation + + +
+ We introduce MultiPhys, a method designed for recovering multi-person motion +from monocular videos. Our focus lies in capturing coherent spatial placement +between pairs of individuals across varying degrees of engagement. MultiPhys, +being physically aware, exhibits robustness to jittering and occlusions, and +effectively eliminates penetration issues between the two individuals. We +devise a pipeline in which the motion estimated by a kinematic-based method is +fed into a physics simulator in an autoregressive manner. We introduce distinct +components that enable our model to harness the simulator's properties without +compromising the accuracy of the kinematic estimates. This results in final +motion estimates that are both kinematically coherent and physically compliant. +Extensive evaluations on three challenging datasets characterized by +substantial inter-person interaction show that our method significantly reduces +errors associated with penetration and foot skating, while performing +competitively with the state-of-the-art on motion accuracy and smoothness. +Results and code can be found on our project page +(http://www.iri.upc.edu/people/nugrinovic/multiphys/). + +
+
+
+
+
+ + ☆ Tendency-driven Mutual Exclusivity for Weakly Supervised Incremental + Semantic Segmentation + + +
+ Weakly Incremental Learning for Semantic Segmentation (WILSS) leverages a +pre-trained segmentation model to segment new classes using cost-effective and +readily available image-level labels. A prevailing way to solve WILSS is the +generation of seed areas for each new class, serving as a form of pixel-level +supervision. However, a scenario usually arises where a pixel is concurrently +predicted as an old class by the pre-trained segmentation model and a new class +by the seed areas. Such a scenario becomes particularly problematic in WILSS, +as the lack of pixel-level annotations on new classes makes it intractable to +ascertain whether the pixel pertains to the new class or not. To surmount this +issue, we propose an innovative, tendency-driven relationship of mutual +exclusivity, meticulously tailored to govern the behavior of the seed areas and +the predictions generated by the pre-trained segmentation model. This +relationship stipulates that predictions for the new and old classes must not +conflict whilst prioritizing the preservation of predictions for the old +classes, which not only addresses the conflicting prediction issue but also +effectively mitigates the inherent challenge of incremental learning - +catastrophic forgetting. Furthermore, under the auspices of this +tendency-driven mutual exclusivity relationship, we generate pseudo masks for +the new classes, allowing for concurrent execution with model parameter +updating via the resolution of a bi-level optimization problem. Extensive +experiments substantiate the effectiveness of our framework, resulting in the +establishment of new benchmarks and paving the way for further research in this +field. + +
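+ The tendency-driven rule described above can be approximated by a simple
+conflict-resolution policy: where the frozen old model is confident about an
+old class, its prediction is preserved; seed areas claim only pixels that the
+old model leaves as background or predicts with low confidence. The threshold,
+class indices, and shapes below are illustrative assumptions, not the paper's
+exact formulation.
+
+import torch
+
+def tendency_pseudo_mask(old_logits, seed_mask, new_class_id, conf_thresh=0.7):
+    """Resolve old-vs-new conflicts with a preference for preserving old classes.
+    old_logits: (C_old, H, W) logits of the frozen pre-trained segmenter.
+    seed_mask:  (H, W) bool tensor marking the seed area of the new class."""
+    old_prob, old_pred = old_logits.softmax(dim=0).max(dim=0)   # (H, W)
+    pseudo = old_pred.clone()
+    background = 0                                              # assume index 0 is background
+    # the new class wins only where the old model is unsure or sees background
+    new_region = seed_mask & ((old_pred == background) | (old_prob < conf_thresh))
+    pseudo[new_region] = new_class_id
+    return pseudo
+
+old_logits = torch.randn(21, 64, 64)       # e.g. background + 20 old classes
+seed = torch.zeros(64, 64, dtype=torch.bool)
+seed[20:40, 20:40] = True                  # seed area of the incoming class
+print(tendency_pseudo_mask(old_logits, seed, new_class_id=21).unique())
+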
+
+
+
+
+ + ☆ MTGA: Multi-view Temporal Granularity aligned Aggregation for + Event-based Lip-reading + + +
+ Lip-reading utilizes the visual information of the speaker's lip movements to
+recognize words and sentences. Existing event-based lip-reading solutions
+integrate different frame rate branches to learn spatio-temporal features of
+varying granularities. However, aggregating events into event frames inevitably
+leads to the loss of fine-grained temporal information within frames. To remedy
+this drawback, we propose a novel framework termed Multi-view Temporal
+Granularity aligned Aggregation (MTGA). Specifically, we first present a novel
+event representation method, namely the time-segmented voxel graph list, where
+the most significant local voxels are temporally connected into a graph list.
+Then we design a spatio-temporal fusion module based on temporal granularity
+alignment, where the global spatial features extracted from event frames,
+together with the local relative spatial and temporal features contained in the
+voxel graph list, are effectively aligned and integrated. Finally, we design a
+temporal aggregation module that incorporates positional encoding, which
+enables the capture of local absolute spatial and global temporal information.
+Experiments demonstrate that our method outperforms both the event-based and
+video-based lip-reading counterparts. Our code will be publicly available.
+
+
+
+
+
+ + ☆ Device (In)Dependence of Deep Learning-based Image Age Approximation ICPR + + +
+ The goal of temporal image forensics is to approximate the age of a digital
+image relative to images from the same device. Usually, this is based on traces
+left during the image acquisition pipeline. For example, several methods exist
+that exploit the presence of in-field sensor defects for this purpose. In
+addition to these 'classical' methods, there is also an approach in which a
+Convolutional Neural Network (CNN) is trained to approximate the image age. One
+advantage of a CNN is that it independently learns the age features used. This
+would make it possible to exploit other (different) age traces in addition to
+the known ones (i.e., in-field sensor defects). In a previous work, we showed
+that the presence of strong in-field sensor defects is irrelevant for a CNN to
+predict the age class. Based on this observation, the question arises as to how
+device (in)dependent the learned features are. In this work, we empirically
+assess this by training a network on images from a single device and then
+applying the trained model to images from different devices. This evaluation is
+performed on 14 different devices, including 10 devices from the publicly
+available 'Northumbria Temporal Image Forensics' database. These 10 devices
+form five device pairs (i.e., pairs sharing the identical camera model).
+
+
+ comment: This work was accepted and presented in: 2022 ICPR-Workshop on + Artificial Intelligence for Multimedia Forensics and Disinformation + Detection. Montreal, Quebec, Canada. However, due to a technical issue on the + publishing companies' side, the work does not appear in the workshop + proceedings +
+
+
+
+
+ + ☆ ©Plug-in Authorization for Human Content Copyright Protection + in Text-to-Image Model + + +
+ This paper addresses the contentious issue of copyright infringement in
+images generated by text-to-image models, sparking debates among AI developers,
+content creators, and legal entities. State-of-the-art models create
+high-quality content without crediting original creators, causing concern in
+the artistic community. To mitigate this, we propose the \copyright Plug-in
+Authorization framework, introducing three operations: addition, extraction,
+and combination. Addition involves training a \copyright plug-in for a specific
+copyright, facilitating proper credit attribution. Extraction allows creators
+to reclaim copyright from infringing models, and combination enables users to
+merge different \copyright plug-ins. These operations act as permits,
+incentivizing fair use and providing flexibility in authorization. We present
+innovative approaches, "Reverse LoRA" for extraction and "EasyMerge" for
+seamless combination. Experiments in artist-style replication and cartoon IP
+recreation demonstrate \copyright plug-ins' effectiveness, offering a valuable
+solution for human copyright protection in the age of generative AIs.
+
+
+ comment: 20 pages, 6 figures +
+
+
+
+
+ + ☆ Not All Voxels Are Equal: Hardness-Aware Semantic Scene Completion with + Self-Distillation CVPR2024 + + +
+ Semantic scene completion, also known as semantic occupancy prediction, can
+provide dense geometric and semantic information for autonomous vehicles, which
+attracts increasing attention from both academia and industry. Unfortunately,
+existing methods usually formulate this task as a voxel-wise classification
+problem and treat each voxel equally in 3D space during training. As hard
+voxels have not received enough attention, performance in some challenging
+regions is limited. The dense 3D space typically contains a large number of
+empty voxels, which are easy to learn but require substantial computation
+because existing models handle all voxels uniformly. Furthermore, the voxels in
+the boundary region are more challenging to differentiate than those in the
+interior. In this paper, we propose the HASSC approach to train the semantic
+scene completion model with a hardness-aware design. The global hardness from
+the network optimization process is defined for dynamic hard voxel selection.
+Then, the local hardness with geometric anisotropy is adopted for voxel-wise
+refinement. Besides, a self-distillation strategy is introduced to make the
+training process stable and consistent. Extensive experiments show that our
+HASSC scheme can effectively promote the accuracy of the baseline model without
+incurring extra inference cost. Source code is available at:
+https://github.com/songw-zju/HASSC.
+
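+ A minimal sketch of hardness-aware training, assuming "global hardness" is
+approximated by the current per-voxel loss: the hardest fraction of voxels is
+dynamically selected and up-weighted. The ratio and weight are placeholder
+values, not the HASSC definition of global/local hardness.
+
+import torch
+import torch.nn.functional as F
+
+def hardness_aware_loss(logits, target, hard_ratio=0.2, hard_weight=2.0):
+    """Up-weight the currently hardest voxels. logits: (N, C), target: (N,)."""
+    per_voxel = F.cross_entropy(logits, target, reduction="none")   # (N,)
+    k = max(1, int(hard_ratio * per_voxel.numel()))
+    hard_idx = per_voxel.topk(k).indices                            # dynamically selected hard voxels
+    weights = torch.ones_like(per_voxel)
+    weights[hard_idx] = hard_weight
+    return (weights * per_voxel).mean()
+
+logits = torch.randn(1000, 20, requires_grad=True)   # toy voxel logits
+target = torch.randint(0, 20, (1000,))
+hardness_aware_loss(logits, target).backward()
+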
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ The devil is in the object boundary: towards annotation-free instance + segmentation using Foundation Models ICLR2024 + + +
+ Foundation models, pre-trained on a large amount of data, have demonstrated
+impressive zero-shot capabilities in various downstream tasks. However, in
+object detection and instance segmentation, two fundamental computer vision
+tasks heavily reliant on extensive human annotations, foundation models such as
+SAM and DINO struggle to achieve satisfactory performance. In this study, we
+reveal that the devil is in the object boundary, \textit{i.e.}, these
+foundation models fail to discern boundaries between individual objects. For
+the first time, we find that CLIP, which has never accessed any instance-level
+annotations, can provide a highly beneficial and strong instance-level boundary
+prior in the clustering results of its particular intermediate layer. Following
+this surprising observation, we propose $\textbf{Zip}$ which $\textbf{Z}$ips up
+CL$\textbf{ip}$ and SAM in a novel classification-first-then-discovery
+pipeline, enabling annotation-free, complex-scene-capable, open-vocabulary
+object detection and instance segmentation. Our Zip significantly boosts SAM's
+mask AP on the COCO dataset by 12.5% and establishes state-of-the-art
+performance in various settings, including training-free, self-training, and
+label-efficient finetuning. Furthermore, annotation-free Zip even achieves
+comparable performance to the best-performing open-vocabulary object detectors
+using base annotations. Code is released at
+https://github.com/ChengShiest/Zip-Your-CLIP
+
+
+ comment: ICLR2024, Code is released at + https://github.com/ChengShiest/Zip-Your-CLIP +
+
+
+
+
+ + ☆ Sketch-guided Image Inpainting with Partial Discrete Diffusion Process CVPR 2024 + + +
+ In this work, we study the task of sketch-guided image inpainting. Unlike the +well-explored natural language-guided image inpainting, which excels in +capturing semantic details, the relatively less-studied sketch-guided +inpainting offers greater user control in specifying the object's shape and +pose to be inpainted. As one of the early solutions to this task, we introduce +a novel partial discrete diffusion process (PDDP). The forward pass of the PDDP +corrupts the masked regions of the image and the backward pass reconstructs +these masked regions conditioned on hand-drawn sketches using our proposed +sketch-guided bi-directional transformer. The proposed novel transformer module +accepts two inputs -- the image containing the masked region to be inpainted +and the query sketch to model the reverse diffusion process. This strategy +effectively addresses the domain gap between sketches and natural images, +thereby, enhancing the quality of inpainting results. In the absence of a +large-scale dataset specific to this task, we synthesize a dataset from the +MS-COCO to train and extensively evaluate our proposed framework against +various competent approaches in the literature. The qualitative and +quantitative results and user studies establish that the proposed method +inpaints realistic objects that fit the context in terms of the visual +appearance of the provided sketch. To aid further research, we have made our +code publicly available at https://github.com/vl2g/Sketch-Inpainting . + +
+
+ comment: Accepted to NTIRE Workshop @ CVPR 2024 +
+
+
+
+
+ + ☆ VCC-INFUSE: Towards Accurate and Efficient Selection of Unlabeled + Examples in Semi-supervised Learning IJCAI 2024 + + +
+ Despite the progress of Semi-supervised Learning (SSL), existing methods fail
+to utilize unlabeled data effectively and efficiently. Many pseudo-label-based
+methods select unlabeled examples based on inaccurate confidence scores from
+the classifier. Most prior work also uses all available unlabeled data without
+pruning, making it difficult to handle large amounts of unlabeled data. To
+address these issues, we propose two methods: Variational Confidence
+Calibration (VCC) and Influence-Function-based Unlabeled Sample Elimination
+(INFUSE). VCC is a universal plugin for SSL confidence calibration, using a
+variational autoencoder to select more accurate pseudo labels based on three
+types of consistency scores. INFUSE is a data pruning method that constructs a
+core dataset of unlabeled examples under SSL. Our methods are effective in
+multiple datasets and settings, reducing classification error rates and saving
+training time. Together, VCC-INFUSE reduces the error rate of FlexMatch on the
+CIFAR-100 dataset by 1.08% while saving nearly half of the training time.
+
+
+ comment: Accepted paper of IJCAI 2024. Shijie Fang and Qianhan Feng + contributed equally to this paper +
+
+
+
+
+ + ☆ S4TP: Social-Suitable and Safety-Sensitive Trajectory Planning for + Autonomous Vehicles + + +
+ On public roads, autonomous vehicles (AVs) face the challenge of frequent
+interactions with human-driven vehicles (HDVs), which exhibit uncertain driving
+behavior due to varying social characteristics among humans. To effectively
+assess the risks prevailing in the vicinity of AVs in social interactive
+traffic scenarios and achieve safe autonomous driving, this article proposes a
+social-suitable and safety-sensitive trajectory planning (S4TP) framework.
+Specifically, S4TP integrates the Social-Aware Trajectory Prediction (SATP) and
+Social-Aware Driving Risk Field (SADRF) modules. SATP utilizes Transformers to
+effectively encode the driving scene and incorporates an AV's planned
+trajectory during the prediction decoding process. SADRF assesses the expected
+surrounding risk degrees during AVs-HDVs interactions, each with different
+social characteristics, visualized as two-dimensional heat maps centered on the
+AV. SADRF models the driving intentions of the surrounding HDVs and predicts
+trajectories based on the representation of vehicular interactions. S4TP
+employs an optimization-based approach for motion planning, utilizing the
+predicted HDVs' trajectories as input. With the integration of SADRF, S4TP
+executes real-time online optimization of the planned trajectory of the AV
+within low-risk regions, thus improving the safety and the interpretability of
+the planned trajectory. We have conducted comprehensive tests of the proposed
+method using the SMARTS simulator. Experimental results in complex social
+scenarios, such as unprotected left-turn intersections, merging, cruising, and
+overtaking, validate the superiority of our proposed S4TP in terms of safety
+and rationality. S4TP achieves a pass rate of 100% across all scenarios,
+surpassing the current state-of-the-art methods Fanta (98.25%) and
+Predictive-Decision (94.75%).
+
+
+ comment: 12 pages, 4 figures, published in IEEE Transactions on Intelligent
+ Vehicles
+
+
+
+
+ + ☆ LD-Pruner: Efficient Pruning of Latent Diffusion Models using + Task-Agnostic Insights CVPR24 + + +
+ Latent Diffusion Models (LDMs) have emerged as powerful generative models, +known for delivering remarkable results under constrained computational +resources. However, deploying LDMs on resource-limited devices remains a +complex issue, presenting challenges such as memory consumption and inference +speed. To address this issue, we introduce LD-Pruner, a novel +performance-preserving structured pruning method for compressing LDMs. +Traditional pruning methods for deep neural networks are not tailored to the +unique characteristics of LDMs, such as the high computational cost of training +and the absence of a fast, straightforward and task-agnostic method for +evaluating model performance. Our method tackles these challenges by leveraging +the latent space during the pruning process, enabling us to effectively +quantify the impact of pruning on model performance, independently of the task +at hand. This targeted pruning of components with minimal impact on the output +allows for faster convergence during training, as the model has less +information to re-learn, thereby addressing the high computational cost of +training. Consequently, our approach achieves a compressed model that offers +improved inference speed and reduced parameter count, while maintaining minimal +performance degradation. We demonstrate the effectiveness of our approach on +three different tasks: text-to-image (T2I) generation, Unconditional Image +Generation (UIG) and Unconditional Audio Generation (UAG). Notably, we reduce +the inference time of Stable Diffusion (SD) by 34.9% while simultaneously +improving its FID by 5.2% on MS-COCO T2I benchmark. This work paves the way for +more efficient pruning methods for LDMs, enhancing their applicability. + +
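+ The task-agnostic, latent-space scoring idea can be sketched on a toy network:
+ablate one residual block at a time and measure how much the latent output
+changes on a small calibration batch; blocks with the smallest impact are the
+cheapest to prune. The model, metric, and calibration data here are illustrative
+stand-ins, not the operators or score used by LD-Pruner.
+
+import torch
+import torch.nn as nn
+
+class TinyLatentNet(nn.Module):
+    def __init__(self, dim=64, depth=4):
+        super().__init__()
+        self.blocks = nn.ModuleList(
+            nn.Sequential(nn.Linear(dim, dim), nn.GELU()) for _ in range(depth))
+    def forward(self, z, skip=None):
+        for i, blk in enumerate(self.blocks):
+            if i != skip:
+                z = z + blk(z)        # residual blocks can be skipped cleanly
+        return z
+
+model = TinyLatentNet().eval()
+calib = torch.randn(32, 64)           # stand-in for latents of calibration prompts
+with torch.no_grad():
+    ref = model(calib)
+    scores = [(ref - model(calib, skip=i)).norm(dim=-1).mean().item()
+              for i in range(len(model.blocks))]
+# Prune the blocks whose removal perturbs the latent output the least.
+print(sorted(range(len(scores)), key=scores.__getitem__))
+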
+
+ comment: 8 pages, accepted to CVPR24 First Workshop on Efficient and On-Device + Generation (EDGE) +
+
+
+
+
+ + ☆ A Symmetric Regressor for MRI-Based Assessment of Striatal Dopamine + Transporter Uptake in Parkinson's Disease + + +
+ Dopamine transporter (DAT) imaging is commonly used for monitoring
+Parkinson's disease (PD), where the striatal DAT uptake amount is computed to
+assess PD severity. However, DAT imaging has a high cost, carries the risk of
+radiation exposure, and is not available in general clinics. Recently, an MRI
+patch of the nigral region has been proposed as a safer and easier alternative.
+This paper proposes a symmetric regressor for predicting the DAT uptake amount
+from the nigral MRI patch. Acknowledging the symmetry between the right and
+left nigrae, the proposed regressor incorporates a paired input-output model
+that simultaneously predicts the DAT uptake amounts for both the right and left
+striata. Moreover, it employs a symmetric loss that imposes a constraint on the
+difference between right and left predictions, reflecting the high correlation
+of DAT uptake amounts between the two lateral sides. Additionally, we propose a
+symmetric Monte-Carlo (MC) dropout method for providing a fruitful uncertainty
+estimate of the DAT uptake prediction, which utilizes the above symmetry. We
+evaluated the proposed approach on 734 nigral patches, which demonstrated
+significantly improved performance of the symmetric regressor compared with the
+standard regressors while giving better explainability and feature
+representation. The symmetric MC dropout also gave precise uncertainty ranges
+with a high probability of including the true DAT uptake amounts within the
+range.
+
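+ The paired-output idea with a constraint on the right-to-left difference might
+be written as the following loss; the weighting factor and tensor layout are
+assumptions for illustration, not the paper's exact formulation.
+
+import torch
+import torch.nn.functional as F
+
+def symmetric_loss(pred_r, pred_l, gt_r, gt_l, lam=0.5):
+    """MSE on both striata plus a penalty keeping the predicted right-left
+    difference close to the observed one."""
+    mse = F.mse_loss(pred_r, gt_r) + F.mse_loss(pred_l, gt_l)
+    diff = F.mse_loss(pred_r - pred_l, gt_r - gt_l)
+    return mse + lam * diff
+
+pred = torch.randn(8, 2, requires_grad=True)   # columns: right / left DAT uptake
+gt = torch.rand(8, 2)
+symmetric_loss(pred[:, 0], pred[:, 1], gt[:, 0], gt[:, 1]).backward()
+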
+
+
+
+
+ + ☆ EdgeFusion: On-Device Text-to-Image Generation CVPR24 + + +
+ The intensive computational burden of Stable Diffusion (SD) for text-to-image +generation poses a significant hurdle for its practical application. To tackle +this challenge, recent research focuses on methods to reduce sampling steps, +such as Latent Consistency Model (LCM), and on employing architectural +optimizations, including pruning and knowledge distillation. Diverging from +existing approaches, we uniquely start with a compact SD variant, BK-SDM. We +observe that directly applying LCM to BK-SDM with commonly used crawled +datasets yields unsatisfactory results. It leads us to develop two strategies: +(1) leveraging high-quality image-text pairs from leading generative models and +(2) designing an advanced distillation process tailored for LCM. Through our +thorough exploration of quantization, profiling, and on-device deployment, we +achieve rapid generation of photo-realistic, text-aligned images in just two +steps, with latency under one second on resource-limited edge devices. + +
+
+ comment: 4 pages, accepted to CVPR24 First Workshop on Efficient and On-Device + Generation (EDGE) +
+
+
+
+
+ + ☆ Simultaneous Detection and Interaction Reasoning for Object-Centric + Action Recognition + + +
+ The interactions between human and objects are important for recognizing +object-centric actions. Existing methods usually adopt a two-stage pipeline, +where object proposals are first detected using a pretrained detector, and then +are fed to an action recognition model for extracting video features and +learning the object relations for action recognition. However, since the action +prior is unknown in the object detection stage, important objects could be +easily overlooked, leading to inferior action recognition performance. In this +paper, we propose an end-to-end object-centric action recognition framework +that simultaneously performs Detection And Interaction Reasoning in one stage. +Particularly, after extracting video features with a base network, we create +three modules for concurrent object detection and interaction reasoning. First, +a Patch-based Object Decoder generates proposals from video patch tokens. Then, +an Interactive Object Refining and Aggregation identifies important objects for +action recognition, adjusts proposal scores based on position and appearance, +and aggregates object-level info into a global video representation. Lastly, an +Object Relation Modeling module encodes object relations. These three modules +together with the video feature extractor can be trained jointly in an +end-to-end fashion, thus avoiding the heavy reliance on an off-the-shelf object +detector, and reducing the multi-stage training burden. We conduct experiments +on two datasets, Something-Else and Ikea-Assembly, to evaluate the performance +of our proposed approach on conventional, compositional, and few-shot action +recognition tasks. Through in-depth experimental analysis, we show the crucial +role of interactive objects in learning for action recognition, and we can +outperform state-of-the-art methods on both datasets. + +
+
+ comment: 12 pages, 5 figures, submitted to IEEE Transactions on Multimedia +
+
+
+
+
+ + ☆ AG-NeRF: Attention-guided Neural Radiance Fields for Multi-height + Large-scale Outdoor Scene Rendering + + +
+ Existing neural radiance fields (NeRF)-based novel view synthesis methods for
+large-scale outdoor scenes are mainly built on a single altitude. Moreover,
+they often require the camera shooting height and scene scope a priori, leading
+to inefficient and impractical applications when the camera altitude changes.
+In this work, we propose an end-to-end framework, termed AG-NeRF, and seek to
+reduce the training cost of building good reconstructions by synthesizing
+free-viewpoint images based on varying altitudes of scenes. Specifically, to
+tackle the detail variation problem from low altitude (drone-level) to high
+altitude (satellite-level), a source image selection method and an
+attention-based feature fusion approach are developed to extract and fuse the
+most relevant features of the target view from multi-height images for
+high-fidelity rendering. Extensive experiments demonstrate that AG-NeRF
+achieves SOTA performance on the 56 Leonard and Transamerica benchmarks and
+only requires half an hour of training time to reach a PSNR competitive with
+the latest BungeeNeRF.
+
+
+
+
+
+ + ☆ FreeDiff: Progressive Frequency Truncation for Image Editing with + Diffusion Models + + +
+ Precise image editing with text-to-image models has attracted increasing +interest due to their remarkable generative capabilities and user-friendly +nature. However, such attempts face the pivotal challenge of misalignment +between the intended precise editing target regions and the broader area +impacted by the guidance in practice. Despite excellent methods leveraging +attention mechanisms that have been developed to refine the editing guidance, +these approaches necessitate modifications through complex network architecture +and are limited to specific editing tasks. In this work, we re-examine the +diffusion process and misalignment problem from a frequency perspective, +revealing that, due to the power law of natural images and the decaying noise +schedule, the denoising network primarily recovers low-frequency image +components during the earlier timesteps and thus brings excessive low-frequency +signals for editing. Leveraging this insight, we introduce a novel fine-tuning +free approach that employs progressive $\textbf{Fre}$qu$\textbf{e}$ncy +truncation to refine the guidance of $\textbf{Diff}$usion models for universal +editing tasks ($\textbf{FreeDiff}$). Our method achieves comparable results +with state-of-the-art methods across a variety of editing tasks and on a +diverse set of images, highlighting its potential as a versatile tool in image +editing applications. + +
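+ The frequency-truncation idea can be sketched with a plain FFT filter applied
+to a guidance tensor, with a cutoff that is progressively relaxed over the
+denoising timesteps. The schedule and the high-pass form below are assumptions
+for illustration, not the FreeDiff procedure.
+
+import torch
+
+def truncate_low_freq(guidance, keep_radius):
+    """Suppress frequencies below `keep_radius` (normalized units) in a
+    (B, C, H, W) guidance tensor."""
+    B, C, H, W = guidance.shape
+    spec = torch.fft.fftshift(torch.fft.fft2(guidance), dim=(-2, -1))
+    fy = torch.linspace(-0.5, 0.5, H).view(H, 1)
+    fx = torch.linspace(-0.5, 0.5, W).view(1, W)
+    radius = (fx ** 2 + fy ** 2).sqrt()
+    mask = (radius >= keep_radius).float()          # high-pass mask
+    spec = spec * mask
+    return torch.fft.ifft2(torch.fft.ifftshift(spec, dim=(-2, -1))).real
+
+guidance = torch.randn(1, 4, 64, 64)                # stand-in for a latent guidance term
+for t, cutoff in zip([800, 500, 200], [0.15, 0.08, 0.0]):
+    # progressively relax the truncation as denoising proceeds (assumed schedule)
+    filtered = truncate_low_freq(guidance, cutoff)
+    print(t, filtered.abs().mean().item())
+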
+
+
+
+
+ + ☆ Multi-view X-ray Image Synthesis with Multiple Domain Disentanglement + from CT Scans + + +
+ X-ray images play a vital role in intraoperative processes due to their high
+resolution and fast imaging speed, and they greatly facilitate subsequent
+segmentation, registration and reconstruction. However, excessive X-ray doses
+pose potential risks to human health. Data-driven algorithms from volume scans
+to X-ray images are restricted by the scarcity of paired X-ray and volume data.
+Existing methods are mainly realized by modelling the whole X-ray imaging
+procedure. In this study, we propose a learning-based approach termed CT2X-GAN
+to synthesize the X-ray images in an end-to-end manner using the content and
+style disentanglement from three different image domains. Our method decouples
+the anatomical structure information from CT scans and style information from
+unpaired real X-ray images/digital reconstructed radiography (DRR) images via a
+series of decoupling encoders. Additionally, we introduce a novel consistency
+regularization term to improve the stylistic resemblance between synthesized
+X-ray images and real X-ray images. Meanwhile, we also impose supervision by
+computing the similarity between computed real DRR and synthesized DRR images.
+We further develop a pose attention module to fully strengthen the
+comprehensive information in the decoupled content code from CT scans,
+facilitating high-quality multi-view image synthesis in the lower 2D space.
+Extensive experiments were conducted on the publicly available CTSpine1K
+dataset and achieved 97.8350, 0.0842 and 3.0938 in terms of FID, KID and the
+defined user-scored X-ray similarity, respectively. In comparison with 3D-aware
+methods ($\pi$-GAN, EG3D), CT2X-GAN is superior in synthesis quality and
+realism with respect to real X-ray images.
+
+
+ comment: 13 pages, 10 figures +
+
+
+
+
+ + ☆ Seeing Motion at Nighttime with an Event Camera CVPR 2024 + + +
+ We focus on a very challenging task: imaging at nighttime dynamic scenes.
+Most previous methods rely on the low-light enhancement of a conventional RGB
+camera. However, they would inevitably face a dilemma between the long exposure
+time of nighttime and the motion blur of dynamic scenes. Event cameras react to
+dynamic changes with higher temporal resolution (microsecond) and higher
+dynamic range (120dB), offering an alternative solution. In this work, we
+present a novel nighttime dynamic imaging method with an event camera.
+Specifically, we discover that events at nighttime exhibit temporal trailing
+characteristics and a spatially non-stationary distribution. Consequently, we
+propose a nighttime event reconstruction network (NER-Net) which mainly
+includes a learnable event timestamp calibration module (LETC) to align the
+temporal trailing events and a non-uniform illumination aware module (NIAM) to
+stabilize the spatiotemporal distribution of events. Moreover, we construct a
+paired real low-light event dataset (RLED) through a co-axial imaging system,
+including 64,200 spatially and temporally aligned image GTs and low-light
+events. Extensive experiments demonstrate that the proposed method outperforms
+state-of-the-art methods in terms of visual quality and generalization ability
+on real-world nighttime datasets. The project is available at:
+https://github.com/Liu-haoyue/NER-Net.
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ SNP: Structured Neuron-level Pruning to Preserve Attention Scores + + +
+ Multi-head self-attention (MSA) is a key component of Vision Transformers
+(ViTs), which have achieved great success in various vision tasks. However,
+their high computational cost and memory footprint hinder their deployment on
+resource-constrained devices. Conventional pruning approaches can only compress
+and accelerate the MSA module using head pruning, although the head is not an
+atomic unit. To address this issue, we propose a novel graph-aware neuron-level
+pruning method, Structured Neuron-level Pruning (SNP). SNP prunes neurons with
+less informative attention scores and eliminates redundancy among heads.
+Specifically, it prunes graphically connected query and key layers having the
+least informative attention scores while preserving the overall attention
+scores. Value layers, which can be pruned independently, are pruned to
+eliminate inter-head redundancy. Our proposed method effectively compresses and
+accelerates Transformer-based models for both edge devices and server
+processors. For instance, DeiT-Small with SNP runs 3.1$\times$ faster than the
+original model and is 21.94\% faster and 1.12\% more accurate than DeiT-Tiny.
+Additionally, SNP combines successfully with conventional head or block pruning
+approaches. SNP with head pruning can compress DeiT-Base by 80\% in parameters
+and computational cost and achieve 3.85$\times$ faster inference speed on an
+RTX3090 and 4.93$\times$ on a Jetson Nano.
+
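+ A rough proxy for the neuron-level criterion: within one head, rank each
+query/key channel by its average contribution to the attention logits and drop
+the lowest-ranked channels jointly from the query and key projections. The toy
+tensors and this scoring rule are assumptions, not the graph-aware SNP
+criterion.
+
+import torch
+
+torch.manual_seed(0)
+q = torch.randn(16, 50, 64)            # (batch, tokens, head_dim)
+k = torch.randn(16, 50, 64)
+# contribution of channel d to each attention logit is q[..., d] * k[..., d]
+contrib = torch.einsum("btd,bsd->btsd", q, k).abs()
+importance = contrib.mean(dim=(0, 1, 2))           # (head_dim,)
+
+keep = importance.topk(48).indices.sort().values   # keep the 48 most informative channels
+q_pruned, k_pruned = q[..., keep], k[..., keep]
+# Pruning the same channels from both projections keeps q.k close to the
+# original attention logits.
+orig = torch.einsum("btd,bsd->bts", q, k).flatten()
+pruned = torch.einsum("btd,bsd->bts", q_pruned, k_pruned).flatten()
+print(torch.nn.functional.cosine_similarity(orig, pruned, dim=0))
+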
+
+
+
+
+ + ☆ Group-On: Boosting One-Shot Segmentation with Supportive Query + + +
+ One-shot semantic segmentation aims to segment query images given only ONE +annotated support image of the same class. This task is challenging because +target objects in the support and query images can be largely different in +appearance and pose (i.e., intra-class variation). Prior works suggested that +incorporating more annotated support images in few-shot settings boosts +performances but increases costs due to additional manual labeling. In this +paper, we propose a novel approach for ONE-shot semantic segmentation, called +Group-On, which packs multiple query images in batches for the benefit of +mutual knowledge support within the same category. Specifically, after coarse +segmentation masks of the batch of queries are predicted, query-mask pairs act +as pseudo support data to enhance mask predictions mutually, under the guidance +of a simple Group-On Voting module. Comprehensive experiments on three standard +benchmarks show that, in the ONE-shot setting, our Group-On approach +significantly outperforms previous works by considerable margins. For example, +on the COCO-20i dataset, we increase mIoU scores by 8.21% and 7.46% on ASNet +and HSNet baselines, respectively. With only one support image, Group-On can be +even competitive with the counterparts using 5 annotated support images. + +
+
+
+
+
+ + ☆ OPTiML: Dense Semantic Invariance Using Optimal Transport for + Self-Supervised Medical Image Representation + + +
+ Self-supervised learning (SSL) has emerged as a promising technique for
+medical image analysis due to its ability to learn without annotations.
+However, despite the promising potential, conventional SSL methods encounter
+limitations, including challenges in achieving semantic alignment and capturing
+subtle details. This leads to suboptimal representations, which fail to
+accurately capture the underlying anatomical structures and pathological
+details. In response to these constraints, we introduce a novel SSL framework,
+OPTiML, employing optimal transport (OT) to capture dense semantic invariance
+and fine-grained details, thereby enhancing the overall effectiveness of SSL in
+medical image representation learning. The core idea is to integrate OT with a
+cross-viewpoint semantics infusion module (CV-SIM), which effectively captures
+complex, fine-grained details inherent in medical images across different
+viewpoints. In addition to the CV-SIM module, OPTiML imposes variance and
+covariance regularizations within the OT framework to force the model to focus
+on clinically relevant information while discarding less informative features.
+Through these, the proposed framework demonstrates its capacity to learn
+semantically rich representations that can be applied to various medical
+imaging tasks. To validate its effectiveness, we conduct experimental studies
+on three publicly available datasets from the chest X-ray modality. Our
+empirical results reveal OPTiML's superiority over state-of-the-art methods
+across all evaluated tasks.
+
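+ The optimal-transport component can be illustrated with a vanilla
+entropy-regularized Sinkhorn iteration that aligns patch features of two views;
+the feature tensors, cost, and regularization strength are placeholders, not
+the OPTiML objective.
+
+import torch
+
+def sinkhorn(cost, eps=0.1, n_iter=50):
+    """Entropy-regularized OT plan between two uniform marginals.
+    cost: (N, M) pairwise cost matrix."""
+    N, M = cost.shape
+    K = torch.exp(-cost / eps)
+    mu, nu = torch.full((N,), 1.0 / N), torch.full((M,), 1.0 / M)
+    b = torch.ones(M)
+    for _ in range(n_iter):
+        a = mu / (K @ b)
+        b = nu / (K.t() @ a)
+    return a.unsqueeze(1) * K * b.unsqueeze(0)      # transport plan
+
+# Align patch features of two augmented views of the same chest X-ray (toy tensors).
+f1 = torch.nn.functional.normalize(torch.randn(49, 128), dim=-1)
+f2 = torch.nn.functional.normalize(torch.randn(49, 128), dim=-1)
+cost = 1.0 - f1 @ f2.t()                            # cosine distance
+plan = sinkhorn(cost)
+print((plan * cost).sum())                          # dense semantic alignment term
+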
+
+
+
+
+ + ☆ From Image to Video, what do we need in multimodal LLMs? + + +
+ Multimodal Large Language Models (MLLMs) have demonstrated profound
+capabilities in understanding multimodal information, ranging from Image LLMs
+to the more complex Video LLMs. Numerous studies have illustrated their
+exceptional cross-modal comprehension. Recently, integrating video foundation
+models with large language models to build a comprehensive video understanding
+system has been proposed to overcome the limitations of specific pre-defined
+vision tasks. However, the current advancements in Video LLMs tend to overlook
+the foundational contributions of Image LLMs, often opting for more complicated
+structures and a wide variety of multimodal data for pre-training. This
+approach significantly increases the costs associated with these methods. In
+response to these challenges, this work introduces an efficient method that
+strategically leverages the priors of Image LLMs, facilitating a
+resource-efficient transition from Image to Video LLMs. We propose RED-VILLM, a
+Resource-Efficient Development pipeline for Video LLMs from Image LLMs, which
+utilizes a temporal adaptation plug-and-play structure within the image fusion
+module of Image LLMs. This adaptation extends their understanding capabilities
+to include temporal information, enabling the development of Video LLMs that
+not only surpass baseline performances but also do so with minimal
+instructional data and training resources. Our approach highlights the
+potential for a more cost-effective and scalable advancement in multimodal
+models, effectively building upon the foundational work of Image LLMs.
+
+
+
+
+
+ + ☆ Progressive Multi-modal Conditional Prompt Tuning + + +
+ Pre-trained vision-language models (VLMs) have shown remarkable
+generalization capabilities via prompting, which leverages VLMs as knowledge
+bases to extract information beneficial for downstream tasks. However, existing
+methods primarily employ uni-modal prompting, which only engages a uni-modal
+branch, failing to simultaneously adjust vision-language (V-L) features.
+Additionally, the one-pass forward pipeline in VLM encoding struggles to align
+V-L features that have a huge gap. Confronting these challenges, we propose a
+novel method, Progressive Multi-modal conditional Prompt Tuning (ProMPT).
+ProMPT exploits a recurrent structure, optimizing and aligning V-L features by
+iteratively utilizing image and current encoding information. It comprises an
+initialization and a multi-modal iterative evolution (MIE) module.
+Initialization is responsible for encoding the image and text using a VLM,
+followed by a feature filter that selects text features similar to the image.
+MIE then facilitates multi-modal prompting through class-conditional vision
+prompting, instance-conditional text prompting, and feature filtering. In each
+MIE iteration, vision prompts are obtained from the filtered text features via
+a vision generator, encouraging image features to focus more on the target
+object during vision prompting. The encoded image features are fed into a text
+generator to produce text prompts that are more robust to class shift. Thus,
+V-L features are progressively aligned, enabling progression from coarse to
+precise classification. Extensive experiments are conducted in three settings
+to evaluate the efficacy of ProMPT. The results indicate that ProMPT
+outperforms existing methods on average across all settings, demonstrating its
+superior generalization.
+
+
+
+
+
+ + ☆ Partial Large Kernel CNNs for Efficient Super-Resolution + + +
+ Recently, in the super-resolution (SR) domain, transformers have outperformed +CNNs with fewer FLOPs and fewer parameters since they can deal with long-range +dependency and adaptively adjust weights based on instance. In this paper, we +demonstrate that CNNs, although less focused on in the current SR domain, +surpass Transformers in direct efficiency measures. By incorporating the +advantages of Transformers into CNNs, we aim to achieve both computational +efficiency and enhanced performance. However, using a large kernel in the SR +domain, which mainly processes large images, incurs a large computational +overhead. To overcome this, we propose novel approaches to employing the large +kernel, which can reduce latency by 86\% compared to the naive large kernel, +and leverage an Element-wise Attention module to imitate instance-dependent +weights. As a result, we introduce Partial Large Kernel CNNs for Efficient +Super-Resolution (PLKSR), which achieves state-of-the-art performance on four +datasets at a scale of $\times$4, with reductions of 68.1\% in latency and +80.2\% in maximum GPU memory occupancy compared to SRFormer-light. + +
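+ The "partial" large-kernel idea can be sketched as a block that applies a
+large depthwise kernel to only a fraction of the channels, keeps the rest
+untouched, then mixes with a 1x1 convolution; a sigmoid gate stands in for the
+element-wise attention. Channel counts, kernel size, and split ratio are
+illustrative, not the exact PLKSR block.
+
+import torch
+import torch.nn as nn
+
+class PartialLargeKernelBlock(nn.Module):
+    def __init__(self, channels=64, kernel_size=17, part_ratio=0.25):
+        super().__init__()
+        self.part = int(channels * part_ratio)
+        self.lk = nn.Conv2d(self.part, self.part, kernel_size,
+                            padding=kernel_size // 2, groups=self.part)  # large depthwise kernel
+        self.mix = nn.Conv2d(channels, channels, 1)
+        self.gate = nn.Sequential(nn.Conv2d(channels, channels, 1), nn.Sigmoid())
+
+    def forward(self, x):
+        a, b = x[:, :self.part], x[:, self.part:]
+        y = torch.cat([self.lk(a), b], dim=1)      # large kernel on a channel subset only
+        y = self.mix(y)
+        return x + y * self.gate(y)                # element-wise attention + residual
+
+block = PartialLargeKernelBlock()
+print(block(torch.randn(1, 64, 48, 48)).shape)     # torch.Size([1, 64, 48, 48])
+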
+
+
+
+
+ + ☆ Computer-Aided Diagnosis of Thoracic Diseases in Chest X-rays using + hybrid CNN-Transformer Architecture + + +
+ Medical imaging has been used for diagnosis of various conditions, making it
+one of the most powerful resources for effective patient care. Due to
+widespread availability, low cost, and low radiation, chest X-ray is one of the
+most sought-after radiology examinations for the diagnosis of various thoracic
+diseases. Due to advancements in medical imaging technologies and increasing
+patient load, the current radiology workflow faces various challenges,
+including increasing backlogs, long working hours, and an increase in
+diagnostic errors. An automated computer-aided diagnosis system that can
+interpret chest X-rays and provide actionable insights has the potential to
+offer radiologists a second opinion and highlight relevant regions in the
+image, in turn expediting the clinical workflow, reducing diagnostic errors,
+and improving patient care. In this study, we applied a novel architecture
+augmenting the DenseNet121 Convolutional Neural Network (CNN) with a multi-head
+self-attention mechanism using a transformer, namely SA-DenseNet121, that can
+identify multiple thoracic diseases in chest X-rays. We conducted experiments
+on four of the largest chest X-ray datasets, namely, ChestX-ray14, CheXpert,
+MIMIC-CXR-JPG, and IU-CXR. Experimental results in terms of the area under the
+receiver operating characteristic curve (AUC-ROC) show that augmenting a CNN
+with self-attention has potential for diagnosing different thoracic diseases
+from chest X-rays. The proposed methodology has the potential to support the
+reading workflow, improve efficiency, and reduce diagnostic errors.
+
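+ A minimal hybrid of this kind can be assembled from torchvision's DenseNet121
+feature extractor and a single multi-head self-attention layer over the spatial
+tokens, followed by a multi-label head. Layer sizes and the pooling choice are
+assumptions for illustration, not the exact SA-DenseNet121 design.
+
+import torch
+import torch.nn as nn
+from torchvision.models import densenet121
+
+class HybridCNNTransformer(nn.Module):
+    def __init__(self, num_findings=14):
+        super().__init__()
+        self.backbone = densenet121(weights=None).features      # (B, 1024, 7, 7) for 224x224 input
+        self.attn = nn.MultiheadAttention(embed_dim=1024, num_heads=8, batch_first=True)
+        self.norm = nn.LayerNorm(1024)
+        self.head = nn.Linear(1024, num_findings)
+
+    def forward(self, x):
+        f = self.backbone(x)
+        tokens = f.flatten(2).transpose(1, 2)                    # (B, 49, 1024) spatial tokens
+        attn_out, _ = self.attn(tokens, tokens, tokens)
+        tokens = self.norm(tokens + attn_out)                    # residual self-attention
+        return self.head(tokens.mean(dim=1))                     # multi-label logits
+
+model = HybridCNNTransformer()
+print(model(torch.randn(2, 3, 224, 224)).shape)   # (2, 14): one score per thoracic finding
+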
+
+ comment: 24 pages, 13 Figures, 13 Tables. arXiv admin note: text overlap with + arXiv:1904.09925 by other authors +
+
+
+
+
+ + ☆ TextCenGen: Attention-Guided Text-Centric Background Adaptation for + Text-to-Image Generation + + +
+ Recent advancements in Text-to-image (T2I) generation have witnessed a shift +from adapting text to fixed backgrounds to creating images around text. +Traditional approaches are often limited to generate layouts within static +images for effective text placement. Our proposed approach, TextCenGen, +introduces a dynamic adaptation of the blank region for text-friendly image +generation, emphasizing text-centric design and visual harmony generation. Our +method employs force-directed attention guidance in T2I models to generate +images that strategically reserve whitespace for pre-defined text areas, even +for text or icons at the golden ratio. Observing how cross-attention maps +affect object placement, we detect and repel conflicting objects using a +force-directed graph approach, combined with a Spatial Excluding +Cross-Attention Constraint for smooth attention in whitespace areas. As a novel +task in graphic design, experiments indicate that TextCenGen outperforms +existing methods with more harmonious compositions. Furthermore, our method +significantly enhances T2I model outcomes on our specially collected prompt +datasets, catering to varied text positions. These results demonstrate the +efficacy of TextCenGen in creating more harmonious and integrated text-image +compositions. + +
+
+ comment: 7 pages, 7 figures +
+
+
+
+
+ + ☆ Utilizing Adversarial Examples for Bias Mitigation and Accuracy + Enhancement + + +
+ We propose a novel approach to mitigate biases in computer vision models by +utilizing counterfactual generation and fine-tuning. While counterfactuals have +been used to analyze and address biases in DNN models, the counterfactuals +themselves are often generated from biased generative models, which can +introduce additional biases or spurious correlations. To address this issue, we +propose using adversarial images, that is images that deceive a deep neural +network but not humans, as counterfactuals for fair model training. + Our approach leverages a curriculum learning framework combined with a +fine-grained adversarial loss to fine-tune the model using adversarial +examples. By incorporating adversarial images into the training data, we aim to +prevent biases from propagating through the pipeline. We validate our approach +through both qualitative and quantitative assessments, demonstrating improved +bias mitigation and accuracy compared to existing methods. Qualitatively, our +results indicate that post-training, the decisions made by the model are less +dependent on the sensitive attribute and our model better disentangles the +relationship between sensitive attributes and classification variables. + +
+
+
+
+
+ + ☆ Cross-model Mutual Learning for Exemplar-based Medical Image + Segmentation AISTATS 2024 + + +
+ Medical image segmentation typically demands extensive dense annotations for +model training, which is both time-consuming and skill-intensive. To mitigate +this burden, exemplar-based medical image segmentation methods have been +introduced to achieve effective training with only one annotated image. In this +paper, we introduce a novel Cross-model Mutual learning framework for +Exemplar-based Medical image Segmentation (CMEMS), which leverages two models +to mutually excavate implicit information from unlabeled data at multiple +granularities. CMEMS can eliminate confirmation bias and enable collaborative +training to learn complementary information by enforcing consistency at +different granularities across models. Concretely, cross-model image +perturbation based mutual learning is devised by using weakly perturbed images +to generate high-confidence pseudo-labels, supervising predictions of strongly +perturbed images across models. This approach enables joint pursuit of +prediction consistency at the image granularity. Moreover, cross-model +multi-level feature perturbation based mutual learning is designed by letting +pseudo-labels supervise predictions from perturbed multi-level features with +different resolutions, which can broaden the perturbation space and enhance the +robustness of our framework. CMEMS is jointly trained using exemplar data, +synthetic data, and unlabeled data in an end-to-end manner. Experimental +results on two medical image datasets indicate that the proposed CMEMS +outperforms the state-of-the-art segmentation methods with extremely limited +supervision. + +
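+ One image-granularity step of the cross-model idea can be sketched as follows:
+each model's prediction on a weakly perturbed view pseudo-labels the other
+model's prediction on a strongly perturbed view. The toy segmenters,
+perturbations, and confidence threshold are placeholders, not the CMEMS
+modules.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+def mutual_learning_step(model_a, model_b, unlabeled, conf_thresh=0.9):
+    weak = unlabeled + 0.01 * torch.randn_like(unlabeled)       # placeholder weak perturbation
+    strong = unlabeled + 0.10 * torch.randn_like(unlabeled)     # placeholder strong perturbation
+    with torch.no_grad():
+        conf_a, pseudo_a = model_a(weak).softmax(1).max(1)      # A's pseudo-labels
+        conf_b, pseudo_b = model_b(weak).softmax(1).max(1)      # B's pseudo-labels
+    loss_b = (F.cross_entropy(model_b(strong), pseudo_a, reduction="none")
+              * (conf_a > conf_thresh)).mean()                  # A teaches B
+    loss_a = (F.cross_entropy(model_a(strong), pseudo_b, reduction="none")
+              * (conf_b > conf_thresh)).mean()                  # B teaches A
+    return loss_a + loss_b
+
+net_a = nn.Sequential(nn.Conv2d(1, 4, 3, padding=1), nn.Conv2d(4, 3, 1))   # toy segmenters
+net_b = nn.Sequential(nn.Conv2d(1, 4, 3, padding=1), nn.Conv2d(4, 3, 1))
+mutual_learning_step(net_a, net_b, torch.randn(2, 1, 32, 32)).backward()
+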
+
+ comment: AISTATS 2024 +
+
+
+
+
+ + ☆ Does Gaussian Splatting need SFM Initialization? + + +
+ 3D Gaussian Splatting has recently been embraced as a versatile and effective +method for scene reconstruction and novel view synthesis, owing to its +high-quality results and compatibility with hardware rasterization. Despite its +advantages, Gaussian Splatting's reliance on high-quality point cloud +initialization by Structure-from-Motion (SFM) algorithms is a significant +limitation to be overcome. To this end, we investigate various initialization +strategies for Gaussian Splatting and delve into how volumetric reconstructions +from Neural Radiance Fields (NeRF) can be utilized to bypass the dependency on +SFM data. Our findings demonstrate that random initialization can perform much +better if carefully designed and that by employing a combination of improved +initialization strategies and structure distillation from low-cost NeRF models, +it is possible to achieve equivalent results, or at times even superior, to +those obtained from SFM initialization. + +
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ☆ GenVideo: One-shot Target-image and Shape Aware Video Editing using T2I + Diffusion Models CVPR + + +
+ Video editing methods based on diffusion models that rely solely on a text
+prompt for the edit are hindered by the limited expressive power of text
+prompts. Thus, incorporating a reference target image as a visual guide becomes
+desirable for precise control over the edit. Also, most existing methods
+struggle to accurately edit a video when the shape and size of the object in
+the target image differ from those of the source object. To address these
+challenges, we propose "GenVideo" for editing videos leveraging target-image
+aware T2I models. Our approach handles edits with target objects of varying
+shapes and sizes while maintaining the temporal consistency of the edit using
+our novel target- and shape-aware InvEdit masks. Further, we propose a novel
+target-image aware latent noise correction strategy during inference to improve
+the temporal consistency of the edits. Experimental analyses indicate that
+GenVideo can effectively handle edits with objects of varying shapes, where
+existing approaches fail.
+
+
+ comment: CVPRw 2024 +
+
+
+
+
+ + ☆ TrACT: A Training Dynamics Aware Contrastive Learning Framework for + Long-tail Trajectory Prediction + + +
+ As a safety critical task, autonomous driving requires accurate predictions +of road users' future trajectories for safe motion planning, particularly under +challenging conditions. Yet, many recent deep learning methods suffer from a +degraded performance on the challenging scenarios, mainly because these +scenarios appear less frequently in the training data. To address such a +long-tail issue, existing methods force challenging scenarios closer together +in the feature space during training to trigger information sharing among them +for more robust learning. These methods, however, primarily rely on the motion +patterns to characterize scenarios, omitting more informative contextual +information, such as interactions and scene layout. We argue that exploiting +such information not only improves prediction accuracy but also scene +compliance of the generated trajectories. In this paper, we propose to +incorporate richer training dynamics information into a prototypical +contrastive learning framework. More specifically, we propose a two-stage +process. First, we generate rich contextual features using a baseline +encoder-decoder framework. These features are split into clusters based on the +model's output errors, using the training dynamics information, and a prototype +is computed within each cluster. Second, we retrain the model using the +prototypes in a contrastive learning framework. We conduct empirical +evaluations of our approach using two large-scale naturalistic datasets and +show that our method achieves state-of-the-art performance by improving +accuracy and scene compliance on the long-tail samples. Furthermore, we perform +experiments on a subset of the clusters to highlight the additional benefit of +our approach in reducing training bias. + +
+
+ comment: 2024 IEEE Intelligent Vehicles Symposium (IV) +
+
+
+
+
+ + ☆ Adaptive Memory Replay for Continual Learning CVPR + + +
+ Foundation Models (FMs) have become the hallmark of modern AI; however, these
+models are trained on massive data, leading to financially expensive training.
+Updating FMs as new data becomes available is important; however, it can lead
+to `catastrophic forgetting', where models underperform on tasks related to
+data sub-populations observed too long ago. This continual learning (CL)
+phenomenon has been extensively studied, but primarily in a setting where only
+a small amount of past data can be stored. We advocate for the paradigm where
+memory is abundant, allowing us to keep all previous data, but computational
+resources are limited. In this setting, traditional replay-based CL approaches
+are outperformed by a simple baseline which replays past data selected
+uniformly at random, indicating that this setting necessitates a new approach.
+We address this by introducing a framework of adaptive memory replay for
+continual learning, where sampling of past data is phrased as a multi-armed
+bandit problem. We utilize Boltzmann sampling to derive a method which
+dynamically selects past data for training conditioned on the current task,
+assuming full data access and emphasizing training efficiency. Through
+extensive evaluations on both vision and language pre-training tasks, we
+demonstrate the effectiveness of our approach, which maintains high performance
+while reducing forgetting by up to 10% at no training efficiency cost.
+
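+ The bandit view can be sketched with a Boltzmann (softmax) sampler over
+running utility estimates for each past-task buffer, where the utility could be
+the measured benefit of the last replay batch from that buffer. Temperature,
+momentum, and the reward signal are assumptions, not the paper's exact
+algorithm.
+
+import torch
+
+class BoltzmannReplaySampler:
+    def __init__(self, n_tasks, temperature=0.5, momentum=0.9):
+        self.utility = torch.zeros(n_tasks)    # running estimate per past-task buffer
+        self.temperature = temperature
+        self.momentum = momentum
+
+    def sample_task(self):
+        probs = torch.softmax(self.utility / self.temperature, dim=0)
+        return torch.multinomial(probs, 1).item()
+
+    def update(self, task_id, reward):
+        # reward: e.g. observed current-task loss decrease after this replay batch
+        self.utility[task_id] = (self.momentum * self.utility[task_id]
+                                 + (1 - self.momentum) * reward)
+
+sampler = BoltzmannReplaySampler(n_tasks=5)
+for step in range(3):
+    t = sampler.sample_task()              # choose which past buffer to replay from
+    fake_reward = torch.rand(1).item()     # placeholder for the measured benefit
+    sampler.update(t, fake_reward)
+print(sampler.utility)
+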
+
+ comment: CVPR-W 2024 (Spotlight) +
+
+
+
+
+ + ☆ DoughNet: A Visual Predictive Model for Topological Manipulation of + Deformable Objects + + +
+ Manipulation of elastoplastic objects like dough often involves topological
+changes such as splitting and merging. The ability to accurately predict the
+topological changes that a specific action might incur is critical for planning
+interactions with elastoplastic objects. We present DoughNet, a
+Transformer-based architecture for handling these challenges, consisting of two
+components. First, a denoising autoencoder represents deformable objects of
+varying topology as sets of latent codes. Second, a visual predictive model
+performs autoregressive set prediction to determine long-horizon geometrical
+deformation and topological changes purely in latent space. Given a partial
+initial state and desired manipulation trajectories, it infers all resulting
+object geometries and topologies at each step. DoughNet thereby allows planning
+robotic manipulation: selecting a suitable tool, its pose, and its opening
+width to recreate robot- or human-made goals. Our experiments in simulated and
+real environments show that DoughNet is able to significantly outperform
+related approaches that consider deformation only as geometrical change.
+
+
+ comment: Under review. 17 pages, 14 figures +
+
+
+
+
+ + ☆ Compositional Neural Textures + + +
+ Texture plays a vital role in enhancing visual richness in both real +photographs and computer-generated imagery. However, the process of editing +textures often involves laborious and repetitive manual adjustments of textons, +which are the small, recurring local patterns that define textures. In this +work, we introduce a fully unsupervised approach for representing textures +using a compositional neural model that captures individual textons. We +represent each texton as a 2D Gaussian function whose spatial support +approximates its shape, and an associated feature that encodes its detailed +appearance. By modeling a texture as a discrete composition of Gaussian +textons, the representation offers both expressiveness and ease of editing. +Textures can be edited by modifying the compositional Gaussians within the +latent space, and new textures can be efficiently synthesized by feeding the +modified Gaussians through a generator network in a feed-forward manner. This +approach enables a wide range of applications, including transferring +appearance from an image texture to another image, diversifying textures, +texture interpolation, revealing/modifying texture variations, edit +propagation, texture animation, and direct texton manipulation. The proposed +approach contributes to advancing texture analysis, modeling, and editing +techniques, and opens up new possibilities for creating visually appealing +images with controllable textures. + +
+
+
+
+
+ + ☆ SPIdepth: Strengthened Pose Information for Self-supervised Monocular + Depth Estimation + + +
+ Self-supervised monocular depth estimation has garnered considerable
+attention for its applications in autonomous driving and robotics. While recent
+methods have made strides in leveraging techniques like the Self Query Layer
+(SQL) to infer depth from motion, they often overlook the potential of
+strengthening pose information. In this paper, we introduce SPIdepth, a novel
+approach that prioritizes enhancing the pose network for improved depth
+estimation. Building upon the foundation laid by SQL, SPIdepth emphasizes the
+importance of pose information in capturing fine-grained scene structures. By
+enhancing the pose network's capabilities, SPIdepth achieves remarkable
+advancements in scene understanding and depth estimation. Experimental results
+on benchmark datasets such as KITTI and Cityscapes showcase SPIdepth's
+state-of-the-art performance, surpassing previous methods by significant
+margins. Notably, SPIdepth's performance exceeds that of unsupervised models
+and, after finetuning on metric data, outperforms all existing methods.
+Remarkably, SPIdepth achieves these results using only a single image for
+inference, surpassing even methods that utilize video sequences, thus
+demonstrating its efficacy and efficiency in real-world applications. Our
+approach represents a significant leap forward in self-supervised monocular
+depth estimation, underscoring the importance of strengthening pose information
+for advancing scene understanding in real-world applications.
+
+
+
+
+
+
+ + ☆ Global Counterfactual Directions + + +
+ Despite increasing progress in the development of methods for generating
+visual counterfactual explanations, especially with the recent rise of
+Denoising Diffusion Probabilistic Models, previous works consider them as an
+entirely local technique. In this work, we take the first step toward
+globalizing them. Specifically, we discover that the latent space of Diffusion
+Autoencoders encodes the inference process of a given classifier in the form of
+global directions. We propose a novel proxy-based approach that discovers two
+types of these directions with the use of only a single image in an entirely
+black-box manner. Precisely, g-directions allow for flipping the decision of a
+given classifier on an entire dataset of images, while h-directions further
+increase the diversity of explanations. We refer to them in general as Global
+Counterfactual Directions (GCDs). Moreover, we show that GCDs can be naturally
+combined with Latent Integrated Gradients, resulting in a new black-box
+attribution method, while simultaneously enhancing the understanding of
+counterfactual explanations. We validate our approach on existing benchmarks
+and show that it generalizes to real-world use-cases.
+
+
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Advancing Applications of Satellite Photogrammetry: Novel Approaches for + Built-up Area Modeling and Natural Environment Monitoring using + Stereo/Multi-view Satellite Image-derived 3D Data + + +
+ With the development of remote sensing technology in recent decades,
+spaceborne sensors with sub-meter and meter spatial resolution (Worldview and
+PlanetScope) have achieved considerable image quality for generating 3D
+geospatial data via a stereo matching pipeline. These achievements have
+significantly increased the data accessibility in 3D, necessitating adapting
+these 3D geospatial data to analyze human and natural environments. This
+dissertation explores several novel approaches based on stereo and multi-view
+satellite image-derived 3D geospatial data, to deal with remote sensing
+application issues for built-up area modeling and natural environment
+monitoring, including building model 3D reconstruction, glacier dynamics
+tracking, and lake algae monitoring. Specifically, the dissertation introduces
+four novel approaches that deal with the spatial and temporal challenges of
+satellite-derived 3D data. The first study advances LoD-2 building modeling
+from satellite-derived orthophotos and DSMs with a novel approach employing a
+model-driven workflow that generates rectangular 3D building geometry models.
+Secondly, to further enhance our building reconstruction framework for dense
+urban areas and non-rectangular buildings, we implemented deep learning for
+unit-level segmentation and introduced a gradient-based circle reconstruction
+for circular buildings, developing a polygon composition technique for advanced
+building LoD2 reconstruction. Our third study utilizes high-spatiotemporal
+resolution PlanetScope satellite imagery for glacier tracking at the 3D level
+in mid-latitude regions. Finally, we proposed the term "Algal Behavior
+Function" to refine the quantification of chlorophyll-a concentrations from
+satellite imagery in water quality monitoring, addressing algae fluctuations
+and timing discrepancies between satellite observations and field measurements,
+thus enhancing the precision of underwater algae volume estimates. Overall,
+this dissertation demonstrates the extensive potential of satellite
+photogrammetry applications in addressing urban and environmental challenges.
+It further showcases innovative analytical methodologies that enhance the
+applicability of stereo and multi-view very high-resolution satellite-derived
+3D data. (See full abstract in the document)
+
+
+
+ comment: Ph.D. Dissertation, Geospatial Data Analytics Lab, The Ohio State
+ University, 2024, official version is available in OhioLINK
+
+
+
+
+
+ + ☆ Towards Multi-modal Transformers in Federated Learning + + +
+ Multi-modal transformers mark significant progress in different domains, but +siloed high-quality data hinders their further improvement. To remedy this, +federated learning (FL) has emerged as a promising privacy-preserving paradigm +for training models without direct access to the raw data held by different +clients. Despite its potential, a considerable research direction regarding the +unpaired uni-modal clients and the transformer architecture in FL remains +unexplored. To fill this gap, this paper explores a transfer multi-modal +federated learning (MFL) scenario within the vision-language domain, where +clients possess data of various modalities distributed across different +datasets. We systematically evaluate the performance of existing methods when a +transformer architecture is utilized and introduce a novel framework called +Federated modality complementary and collaboration (FedCola) by addressing the +in-modality and cross-modality gaps among clients. Through extensive +experiments across various FL settings, FedCola demonstrates superior +performance over previous approaches, offering new perspectives on future +federated training of multi-modal transformers. + +
+
+
+
+
+ + ☆ Enhancing AI Diagnostics: Autonomous Lesion Masking via Semi-Supervised + Deep Learning + + +
+ This study presents an unsupervised domain adaptation method aimed at +autonomously generating image masks outlining regions of interest (ROIs) for +differentiating breast lesions in breast ultrasound (US) imaging. Our +semi-supervised learning approach utilizes a primitive model trained on a small +public breast US dataset with true annotations. This model is then iteratively +refined for the domain adaptation task, generating pseudo-masks for our +private, unannotated breast US dataset. The dataset, twice the size of the +public one, exhibits considerable variability in image acquisition perspectives +and demographic representation, posing a domain-shift challenge. Unlike typical +domain adversarial training, we employ downstream classification outcomes as a +benchmark to guide the updating of pseudo-masks in subsequent iterations. We +found the classification precision to be highly correlated with the +completeness of the generated ROIs, which promotes the explainability of the +deep learning classification model. Preliminary findings demonstrate the +efficacy and reliability of this approach in streamlining the ROI annotation +process, thereby enhancing the classification and localization of breast +lesions for more precise and interpretable diagnoses. + +
+
+
+
+
+ + ☆ Spot-Compose: A Framework for Open-Vocabulary Object Retrieval and + Drawer Manipulation in Point Clouds ICRA 2024 + + +
+ In recent years, modern techniques in deep learning and large-scale datasets +have led to impressive progress in 3D instance segmentation, grasp pose +estimation, and robotics. This allows for accurate detection directly in 3D +scenes, object- and environment-aware grasp prediction, as well as robust and +repeatable robotic manipulation. This work aims to integrate these recent +methods into a comprehensive framework for robotic interaction and manipulation +in human-centric environments. Specifically, we leverage 3D reconstructions +from a commodity 3D scanner for open-vocabulary instance segmentation, +alongside grasp pose estimation, to demonstrate dynamic picking of objects, and +opening of drawers. We show the performance and robustness of our model in two +sets of real-world experiments including dynamic object retrieval and drawer +opening, reporting a 51% and 82% success rate respectively. Code of our +framework as well as videos are available on: https://spot-compose.github.io/. + +
+
+ comment: Accepted at ICRA 2024 Workshops. Code and videos available at + https://spot-compose.github.io/ +
+
+
+
+
+ + ☆ UIClip: A Data-driven Model for Assessing User Interface Design + + +
+ User interface (UI) design is a difficult yet important task for ensuring the +usability, accessibility, and aesthetic qualities of applications. In our +paper, we develop a machine-learned model, UIClip, for assessing the design +quality and visual relevance of a UI given its screenshot and natural language +description. To train UIClip, we used a combination of automated crawling, +synthetic augmentation, and human ratings to construct a large-scale dataset of +UIs, collated by description and ranked by design quality. Through training on +the dataset, UIClip implicitly learns properties of good and bad designs by i) +assigning a numerical score that represents a UI design's relevance and quality +and ii) providing design suggestions. In an evaluation that compared the +outputs of UIClip and other baselines to UIs rated by 12 human designers, we +found that UIClip achieved the highest agreement with ground-truth rankings. +Finally, we present three example applications that demonstrate how UIClip can +facilitate downstream applications that rely on instantaneous assessment of UI +design quality: i) UI code generation, ii) UI design tips generation, and iii) +quality-aware UI example search. + +
+
+
+
+
+ + ♻ ☆ NeRF-MAE: Masked AutoEncoders for Self-Supervised 3D Representation + Learning for Neural Radiance Fields + + +
+ Neural fields excel in computer vision and robotics due to their ability to
+understand the 3D visual world, such as by inferring semantics, geometry, and
+dynamics. Given the capabilities of neural fields in densely representing a 3D
+scene from 2D images, we ask the question: Can we scale their self-supervised
+pretraining, specifically using masked autoencoders, to generate effective 3D
+representations from posed RGB images? Owing to the astounding success of
+extending transformers to novel data modalities, we employ standard 3D Vision
+Transformers to suit the unique formulation of NeRFs. We leverage NeRF's
+volumetric grid as a dense input to the transformer, contrasting it with other
+3D representations such as pointclouds where the information density can be
+uneven, and the representation is irregular. Due to the difficulty of applying
+masked autoencoders to an implicit representation, such as NeRF, we opt for
+extracting an explicit representation that canonicalizes scenes across domains
+by employing the camera trajectory for sampling. Our goal is made possible by
+masking random patches from NeRF's radiance and density grid and employing a
+standard 3D Swin Transformer to reconstruct the masked patches. In doing so,
+the model can learn the semantic and spatial structure of complete scenes. We
+pretrain this representation at scale on our proposed curated posed-RGB data,
+totaling over 1.6 million images. Once pretrained, the encoder is used for
+effective 3D transfer learning. Our novel self-supervised pretraining for
+NeRFs, NeRF-MAE, scales remarkably well and improves performance on various
+challenging 3D tasks. Utilizing unlabeled posed 2D data for pretraining,
+NeRF-MAE significantly outperforms self-supervised 3D pretraining and NeRF
+scene understanding baselines on Front3D and ScanNet datasets with an absolute
+performance improvement of over 20% AP50 and 8% AP25 for 3D object detection.
+
+
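+ A small, hedged sketch of the masking step described above: split a dense
+radiance-and-density grid into 3D patches and keep only a random subset as
+encoder input. The grid resolution, patch size, channel count, and mask ratio
+below are assumptions for illustration, not the paper's configuration.
+import numpy as np
+
+rng = np.random.default_rng(0)
+G, P, C = 32, 8, 4                         # grid resolution, patch size, channels (RGB + density)
+grid = rng.standard_normal((G, G, G, C))
+
+# Patchify into (num_patches, P*P*P*C).
+n = G // P
+patches = grid.reshape(n, P, n, P, n, P, C).transpose(0, 2, 4, 1, 3, 5, 6)
+patches = patches.reshape(n * n * n, -1)
+
+mask_ratio = 0.75
+num_keep = int(round(patches.shape[0] * (1 - mask_ratio)))
+keep_ids = rng.permutation(patches.shape[0])[:num_keep]
+visible = patches[keep_ids]                # what a 3D transformer encoder would see
+print(patches.shape, visible.shape)        # (64, 2048) (16, 2048)
+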
+
+ comment: 29 pages, 13 figures. Project Page: https://nerf-mae.github.io/ +
+
+
+
+
+ + ♻ ☆ Beyond Known Clusters: Probe New Prototypes for Efficient Generalized + Class Discovery + + +
+ Generalized Class Discovery (GCD) aims to dynamically assign labels to +unlabelled data partially based on knowledge learned from labelled data, where +the unlabelled data may come from known or novel classes. The prevailing +approach generally involves clustering across all data and learning conceptions +by prototypical contrastive learning. However, existing methods largely hinge +on the performance of clustering algorithms and are thus subject to their +inherent limitations. Firstly, the estimated cluster number is often smaller +than the ground truth, making the existing methods suffer from the lack of +prototypes for comprehensive conception learning. To address this issue, we +propose an adaptive probing mechanism that introduces learnable potential +prototypes to expand cluster prototypes (centers). As there is no ground truth +for the potential prototype, we develop a self-supervised prototype learning +framework to optimize the potential prototype in an end-to-end fashion. +Secondly, clustering is computationally intensive, and the conventional +strategy of clustering both labelled and unlabelled instances exacerbates this +issue. To counteract this inefficiency, we opt to cluster only the unlabelled +instances and subsequently expand the cluster prototypes with our introduced +potential prototypes to fast explore novel classes. Despite the simplicity of +our proposed method, extensive empirical analysis on a wide range of datasets +confirms that our method consistently delivers state-of-the-art results. +Specifically, our method surpasses the nearest competitor by a significant +margin of \textbf{9.7}$\%$ within the Stanford Cars dataset and +\textbf{12$\times$} clustering efficiency within the Herbarium 19 dataset. We +will make the code and checkpoints publicly available at +\url{https://github.com/xjtuYW/PNP.git}. + +
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Low-rank tensor completion via tensor joint rank with logarithmic + composite norm + + +
+ Low-rank tensor completion (LRTC) aims to recover a complete low-rank tensor
+from an incompletely observed tensor, attracting extensive attention in various
+practical applications such as image processing and computer vision. However,
+current methods often perform well only when there is a sufficient amount of
+observed information, and they perform poorly or may fail when the observed
+information is less than 5\%. In order to improve the utilization of observed
+information, a new method called the tensor joint rank with logarithmic
+composite norm (TJLC) method is proposed. This method simultaneously exploits
+two types of tensor low-rank structures, namely tensor Tucker rank and tubal
+rank, thereby enhancing the inherent correlations between known and missing
+elements. To address the challenge of applying two tensor ranks with
+significantly different properties directly to LRTC, a new tensor logarithmic
+composite norm is further proposed. Subsequently, the TJLC model and algorithm
+for the LRTC problem are proposed. Additionally, theoretical convergence
+guarantees for the TJLC method are provided. Experiments on various real
+datasets demonstrate that the proposed method outperforms state-of-the-art
+methods significantly. Particularly, the proposed method achieves satisfactory
+recovery even when the observed information is as low as 1\%, and the recovery
+performance improves significantly as the observed information increases.
+
+
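+ For intuition, here is an illustrative sketch of a logarithmic low-rank
+surrogate: penalizing log(1 + sigma_i / eps) over the singular values of a
+matrix unfolding instead of the nuclear norm. The combination with the tubal
+rank and the full TJLC algorithm are not reproduced; eps and the unfolding
+choice are assumptions.
+import numpy as np
+
+def log_composite_surrogate(M, eps=1e-2):
+    """Nonconvex log surrogate of the rank of matrix M (e.g. a tensor unfolding)."""
+    sigma = np.linalg.svd(M, compute_uv=False)
+    return float(np.sum(np.log1p(sigma / eps)))
+
+rng = np.random.default_rng(0)
+low_rank = rng.standard_normal((50, 5)) @ rng.standard_normal((5, 40))
+full_rank = rng.standard_normal((50, 40))
+# The low-rank unfolding yields a much smaller penalty value.
+print(log_composite_surrogate(low_rank), log_composite_surrogate(full_rank))
+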
+
+
+
+
+ + ♻ ☆ Struggle with Adversarial Defense? Try Diffusion + + +
+ Adversarial attacks induce misclassification by introducing subtle
+perturbations. Recently, diffusion models have been applied to image
+classifiers to improve adversarial robustness through adversarial training or
+by purifying adversarial noise. However, diffusion-based adversarial training
+often encounters convergence challenges and high computational expenses.
+Additionally, diffusion-based purification inevitably causes data shift and is
+deemed susceptible to stronger adaptive attacks. To tackle these issues, we
+propose the Truth Maximization Diffusion Classifier (TMDC), a generative
+Bayesian classifier that builds upon pre-trained diffusion models and the
+Bayesian theorem. Unlike data-driven classifiers, TMDC, guided by Bayesian
+principles, utilizes the conditional likelihood from diffusion models to
+determine the class probabilities of input images, thereby insulating against
+the influences of data shift and the limitations of adversarial training.
+Moreover, to enhance TMDC's resilience against more potent adversarial attacks,
+we propose an optimization strategy for diffusion classifiers. This strategy
+involves post-training the diffusion model on perturbed datasets with
+ground-truth labels as conditions, guiding the diffusion model to learn the
+data distribution and maximizing the likelihood under the ground-truth labels.
+The proposed method achieves state-of-the-art performance on the CIFAR10
+dataset against heavy white-box attacks and strong adaptive attacks.
+Specifically, TMDC achieves robust accuracies of 82.81% against $l_{\infty}$
+norm-bounded perturbations and 86.05% against $l_{2}$ norm-bounded
+perturbations, respectively, with $\epsilon=0.05$.
+
+
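+ The Bayes step described above can be sketched as follows: per-class
+conditional log-likelihoods (which a class-conditional diffusion model would
+supply, e.g. via a negative denoising-loss estimate) are turned into class
+posteriors. The diffusion model itself is stubbed with random numbers here;
+only the Bayes-rule bookkeeping is shown.
+import numpy as np
+
+def bayes_posterior(class_log_likelihoods, class_log_priors=None):
+    """p(y|x) is proportional to p(x|y) p(y); computed in log space for stability."""
+    logp = np.asarray(class_log_likelihoods, dtype=float)
+    if class_log_priors is not None:
+        logp = logp + np.asarray(class_log_priors, dtype=float)
+    logp -= logp.max()
+    post = np.exp(logp)
+    return post / post.sum()
+
+# Stand-in for log p(x|y) estimates from a class-conditional diffusion model (10 classes).
+fake_log_likelihoods = np.random.randn(10)
+posterior = bayes_posterior(fake_log_likelihoods)
+print(posterior.argmax(), posterior.sum())   # predicted class, probabilities sum to 1
+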
+
+
+
+
+ + ♻ ☆ A new dataset for measuring the performance of blood vessel segmentation + methods under distribution shifts + + +
+ Creating a dataset for training supervised machine learning algorithms can be +a demanding task. This is especially true for medical image segmentation since +one or more specialists are usually required for image annotation, and creating +ground truth labels for just a single image can take up to several hours. In +addition, it is paramount that the annotated samples represent well the +different conditions that might affect the imaged tissues as well as possible +changes in the image acquisition process. This can only be achieved by +considering samples that are typical in the dataset as well as atypical, or +even outlier, samples. We introduce VessMAP, a heterogeneous blood vessel +segmentation dataset acquired by carefully sampling relevant images from a +larger non-annotated dataset. A methodology was developed to select both +prototypical and atypical samples from the base dataset, thus defining an +assorted set of images that can be used for measuring the performance of +segmentation algorithms on samples that are highly distinct from each other. To +demonstrate the potential of the new dataset, we show that the validation +performance of a neural network changes significantly depending on the splits +used for training the network. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Can We Edit Multimodal Large Language Models? EMNLP 2023 + + +
+ In this paper, we focus on editing Multimodal Large Language Models (MLLMs). +Compared to editing single-modal LLMs, multimodal model editing is more +challenging, which demands a higher level of scrutiny and careful consideration +in the editing process. To facilitate research in this area, we construct a new +benchmark, dubbed MMEdit, for editing multimodal LLMs and establishing a suite +of innovative metrics for evaluation. We conduct comprehensive experiments +involving various model editing baselines and analyze the impact of editing +different components for multimodal LLMs. Empirically, we notice that previous +baselines can implement editing multimodal LLMs to some extent, but the effect +is still barely satisfactory, indicating the potential difficulty of this task. +We hope that our work can provide the NLP community with insights. Code and +dataset are available in https://github.com/zjunlp/EasyEdit. + +
+
+ comment: EMNLP 2023. Add the Exact Match/Accuracy results of Reliability and + T-Generality +
+
+
+
+
+ + ♻ ☆ Exposing Image Splicing Traces in Scientific Publications via + Uncertainty-guided Refinement + + +
+ Recently, a surge in scientific publications suspected of image manipulation +has led to numerous retractions, bringing the issue of image integrity into +sharp focus. Although research on forensic detectors for image plagiarism and +image synthesis exists, the detection of image splicing traces in scientific +publications remains unexplored. Compared to image duplication and synthesis, +image splicing detection is more challenging due to the lack of reference +images and the typically small tampered areas. Furthermore, disruptive factors +in scientific images, such as artifacts from digital compression, abnormal +patterns, and noise from physical operations, present misleading features like +splicing traces, significantly increasing the difficulty of this task. +Moreover, the scarcity of high-quality datasets of spliced scientific images +limits potential advancements. In this work, we propose an Uncertainty-guided +Refinement Network (URN) to mitigate the impact of these disruptive factors. +Our URN can explicitly suppress the propagation of unreliable information flow +caused by disruptive factors between regions, thus obtaining robust splicing +features. Additionally, the URN is designed to concentrate improvements in +uncertain prediction areas during the decoding phase. We also construct a +dataset for image splicing detection (SciSp) containing 1,290 spliced images. +Compared to existing datasets, SciSp includes the largest number of spliced +images and the most diverse sources. Comprehensive experiments conducted on +three benchmark datasets demonstrate the superiority of our approach. We also +validate the URN's generalisability in resisting cross-dataset domain shifts +and its robustness against various post-processing techniques, including +advanced deep-learning-based inpainting. + +
+
+
+
+
+ + ♻ ☆ State Space Models for Event Cameras CVPR 2024 + + +
+ Today, state-of-the-art deep neural networks that process event-camera data +first convert a temporal window of events into dense, grid-like input +representations. As such, they exhibit poor generalizability when deployed at +higher inference frequencies (i.e., smaller temporal windows) than the ones +they were trained on. We address this challenge by introducing state-space +models (SSMs) with learnable timescale parameters to event-based vision. This +design adapts to varying frequencies without the need to retrain the network at +different frequencies. Additionally, we investigate two strategies to +counteract aliasing effects when deploying the model at higher frequencies. We +comprehensively evaluate our approach against existing methods based on RNN and +Transformer architectures across various benchmarks, including Gen1 and 1 Mpx +event camera datasets. Our results demonstrate that SSM-based models train 33% +faster and also exhibit minimal performance degradation when tested at higher +frequencies than the training input. Traditional RNN and Transformer models +exhibit performance drops of more than 20 mAP, with SSMs having a drop of 3.76 +mAP, highlighting the effectiveness of SSMs in event-based vision tasks. + +
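+ As a hedged sketch of the timescale idea, the snippet below runs a diagonal
+linear state-space recurrence with an explicit step size delta (S4/S5-style
+zero-order-hold discretization): re-discretizing the same continuous-time
+parameters with a smaller delta lets the model consume inputs sampled at a
+higher frequency. Parameter shapes and values are illustrative assumptions, not
+the paper's architecture.
+import numpy as np
+
+rng = np.random.default_rng(0)
+N = 16                                      # state size
+A = -np.abs(rng.standard_normal(N))         # stable diagonal continuous-time dynamics
+B = rng.standard_normal(N)
+C = rng.standard_normal(N)
+
+def run_ssm(u, delta):
+    """Zero-order-hold discretization, then a simple scan over the 1-D input u."""
+    Ad = np.exp(delta * A)                  # discrete state matrix (diagonal)
+    Bd = (Ad - 1.0) / A * B                 # ZOH input matrix
+    x = np.zeros(N)
+    ys = []
+    for u_t in u:
+        x = Ad * x + Bd * u_t
+        ys.append(C @ x)
+    return np.array(ys)
+
+u_slow = rng.standard_normal(100)           # e.g. features from coarse event windows
+y_slow = run_ssm(u_slow, delta=1.0)
+y_fast = run_ssm(np.repeat(u_slow, 2), delta=0.5)   # same signal sampled twice as densely
+print(y_slow.shape, y_fast.shape)
+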
+
+ comment: 18 pages, 5 figures, 6 tables, CVPR 2024 Camera Ready paper +
+
+
+
+
+ + ♻ ☆ Reciprocal Attention Mixing Transformer for Lightweight Image + Restoration CVPR 2024 + + +
+ Although many recent works have made advancements in the image restoration +(IR) field, they often suffer from an excessive number of parameters. Another +issue is that most Transformer-based IR methods focus only on either local or +global features, leading to limited receptive fields or deficient parameter +issues. To address these problems, we propose a lightweight IR network, +Reciprocal Attention Mixing Transformer (RAMiT). It employs our proposed +dimensional reciprocal attention mixing Transformer (D-RAMiT) blocks, which +compute bi-dimensional (spatial and channel) self-attentions in parallel with +different numbers of multi-heads. The bi-dimensional attentions help each other +to complement their counterpart's drawbacks and are then mixed. Additionally, +we introduce a hierarchical reciprocal attention mixing (H-RAMi) layer that +compensates for pixel-level information losses and utilizes semantic +information while maintaining an efficient hierarchical structure. Furthermore, +we revisit and modify MobileNet V1 and V2 to attach efficient convolutions to +our proposed components. The experimental results demonstrate that RAMiT +achieves state-of-the-art performance on multiple lightweight IR tasks, +including super-resolution, color denoising, grayscale denoising, low-light +enhancement, and deraining. Codes are available at +https://github.com/rami0205/RAMiT. + +
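+ A rough, hedged sketch of the bi-dimensional idea: compute spatial and channel
+self-attention on the same flattened feature map in parallel and mix the two
+branches. Shapes are assumptions, and a plain average stands in for the mixing
+step described above.
+import numpy as np
+
+rng = np.random.default_rng(0)
+N, C = 64, 32                               # flattened spatial positions, channels
+X = rng.standard_normal((N, C))
+
+def softmax(x, axis=-1):
+    e = np.exp(x - x.max(axis=axis, keepdims=True))
+    return e / e.sum(axis=axis, keepdims=True)
+
+# Spatial self-attention: tokens are positions, attention matrix is (N, N).
+spatial = softmax(X @ X.T / np.sqrt(C)) @ X          # (N, C)
+# Channel self-attention: tokens are channels, attention matrix is (C, C).
+channel = (softmax(X.T @ X / np.sqrt(N)) @ X.T).T    # (N, C)
+out = 0.5 * (spatial + channel)                      # simple stand-in for the mixing step
+print(out.shape)
+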
+
+ comment: CVPR 2024 Workshop - NTIRE. Codes are available at + https://github.com/rami0205/RAMiT +
+
+
+
+
+ + ♻ ☆ Post-Training Network Compression for 3D Medical Image Segmentation: + Reducing Computational Efforts via Tucker Decomposition + + +
+ We address the computational barrier of deploying advanced deep learning +segmentation models in clinical settings by studying the efficacy of network +compression through tensor decomposition. We propose a post-training Tucker +factorization that enables the decomposition of pre-existing models to reduce +computational requirements without impeding segmentation accuracy. We applied +Tucker decomposition to the convolutional kernels of the TotalSegmentator (TS) +model, an nnU-Net model trained on a comprehensive dataset for automatic +segmentation of 117 anatomical structures. Our approach reduced the +floating-point operations (FLOPs) and memory required during inference, +offering an adjustable trade-off between computational efficiency and +segmentation quality. This study utilized the publicly available TS dataset, +employing various downsampling factors to explore the relationship between +model size, inference speed, and segmentation performance. The application of +Tucker decomposition to the TS model substantially reduced the model parameters +and FLOPs across various compression rates, with limited loss in segmentation +accuracy. We removed up to 88% of the model's parameters with no significant +performance changes in the majority of classes after fine-tuning. Practical +benefits varied across different graphics processing unit (GPU) architectures, +with more distinct speed-ups on less powerful hardware. Post-hoc network +compression via Tucker decomposition presents a viable strategy for reducing +the computational demand of medical image segmentation models without +substantially sacrificing accuracy. This approach enables the broader adoption +of advanced deep learning technologies in clinical practice, offering a way to +navigate the constraints of hardware capabilities. + +
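+ To illustrate the kind of post-training factorization described above, here is
+a hedged, NumPy-only sketch of a truncated HOSVD-style Tucker decomposition of
+a 2-D convolution kernel over its output/input channel modes. The ranks, kernel
+shape, and error check are made-up assumptions; this is not the paper's
+nnU-Net/TotalSegmentator pipeline.
+import numpy as np
+
+rng = np.random.default_rng(0)
+W = rng.standard_normal((64, 32, 3, 3))     # (out_ch, in_ch, kH, kW)
+r_out, r_in = 16, 8                         # target channel ranks
+
+def mode_unfold(T, mode):
+    return np.moveaxis(T, mode, 0).reshape(T.shape[mode], -1)
+
+# Leading singular vectors of the channel-mode unfoldings.
+U_out = np.linalg.svd(mode_unfold(W, 0), full_matrices=False)[0][:, :r_out]
+U_in = np.linalg.svd(mode_unfold(W, 1), full_matrices=False)[0][:, :r_in]
+
+# Core tensor: project the kernel onto the truncated channel subspaces, then reconstruct.
+core = np.einsum('oikl,or,is->rskl', W, U_out, U_in)
+W_hat = np.einsum('rskl,or,is->oikl', core, U_out, U_in)
+
+compressed = core.size + U_out.size + U_in.size
+print(f"params {W.size} -> {compressed}, rel. error "
+      f"{np.linalg.norm(W - W_hat) / np.linalg.norm(W):.3f}")
+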
+
+
+
+
+ + ♻ ☆ Efficiently Adversarial Examples Generation for Visual-Language Models + under Targeted Transfer Scenarios using Diffusion Models + + +
+ Targeted transfer-based attacks involving adversarial examples pose a +significant threat to large visual-language models (VLMs). However, the +state-of-the-art (SOTA) transfer-based attacks incur high costs due to +excessive iteration counts. Furthermore, the generated adversarial examples +exhibit pronounced adversarial noise and demonstrate limited efficacy in +evading defense methods such as DiffPure. To address these issues, inspired by +score matching, we introduce AdvDiffVLM, which utilizes diffusion models to +generate natural, unrestricted adversarial examples. Specifically, AdvDiffVLM +employs Adaptive Ensemble Gradient Estimation to modify the score during the +diffusion model's reverse generation process, ensuring the adversarial examples +produced contain natural adversarial semantics and thus possess enhanced +transferability. Simultaneously, to enhance the quality of adversarial examples +further, we employ the GradCAM-guided Mask method to disperse adversarial +semantics throughout the image, rather than concentrating them in a specific +area. Experimental results demonstrate that our method achieves a speedup +ranging from 10X to 30X compared to existing transfer-based attack methods, +while maintaining superior quality of adversarial examples. Additionally, the +generated adversarial examples possess strong transferability and exhibit +increased robustness against adversarial defense methods. Notably, AdvDiffVLM +can successfully attack commercial VLMs, including GPT-4V, in a black-box +manner. + +
+
+
+
+
+ + ♻ ☆ Multi-Level Aggregation and Recursive Alignment Architecture for + Efficient Parallel Inference Segmentation Network + + +
+ Real-time semantic segmentation is a crucial research topic for real-world
+applications. However, many methods lay particular emphasis on reducing the
+computational complexity and model size, while largely sacrificing accuracy.
+To tackle this problem, we propose a parallel inference network customized for
+semantic segmentation tasks to achieve a good trade-off between speed and
+accuracy. We employ a shallow backbone to ensure real-time speed, and propose
+three core components to compensate for the reduced model capacity to improve
+accuracy. Specifically, we first design a dual-pyramidal path architecture
+(Multi-level Feature Aggregation Module, MFAM) to aggregate multi-level
+features from the encoder to each scale, providing hierarchical clues for
+subsequent spatial alignment and corresponding in-network inference. Then, we
+build a Recursive Alignment Module (RAM) by combining the flow-based alignment
+module with a recursive upsampling architecture for accurate spatial alignment
+between multi-scale feature maps with half the computational complexity of the
+straightforward alignment method. Finally, we perform independent parallel
+inference on the aligned features to obtain multi-scale scores, and adaptively
+fuse them through an attention-based Adaptive Scores Fusion Module (ASFM) so
+that the final prediction can favor objects of multiple scales. Our framework
+shows a better balance between speed and accuracy than state-of-the-art
+real-time methods on the Cityscapes and CamVid datasets. We also conducted
+systematic ablation studies to gain insight into our motivation and
+architectural design. Code is available at:
+https://github.com/Yanhua-Zhang/MFARANet.
+
+
+
+ comment: 15 pages, 9 figures and 12 Tables. Manuscript completed on April 30, + 2022 +
+
+
+
+
+ + ♻ ☆ REF$^2$-NeRF: Reflection and Refraction aware Neural Radiance Field + + +
+ Recently, significant progress has been made in the study of methods for 3D +reconstruction from multiple images using implicit neural representations, +exemplified by the neural radiance field (NeRF) method. Such methods, which are +based on volume rendering, can model various light phenomena, and various +extended methods have been proposed to accommodate different scenes and +situations. However, when handling scenes with multiple glass objects, e.g., +objects in a glass showcase, modeling the target scene accurately has been +challenging due to the presence of multiple reflection and refraction effects. +Thus, this paper proposes a NeRF-based modeling method for scenes containing a +glass case. In the proposed method, refraction and reflection are modeled using +elements that are dependent and independent of the viewer's perspective. This +approach allows us to estimate the surfaces where refraction occurs, i.e., +glass surfaces, and enables the separation and modeling of both direct and +reflected light components. The proposed method requires predetermined camera +poses, but accurately estimating these poses in scenes with glass objects is +difficult. Therefore, we used a robotic arm with an attached camera to acquire +images with known poses. Compared to existing methods, the proposed method +enables more accurate modeling of both glass refraction and the overall scene. + +
+
+ comment: 10 pages, 8 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ NeuRAD: Neural Rendering for Autonomous Driving + + +
+ Neural radiance fields (NeRFs) have gained popularity in the autonomous +driving (AD) community. Recent methods show NeRFs' potential for closed-loop +simulation, enabling testing of AD systems, and as an advanced training data +augmentation technique. However, existing methods often require long training +times, dense semantic supervision, or lack generalizability. This, in turn, +hinders the application of NeRFs for AD at scale. In this paper, we propose +NeuRAD, a robust novel view synthesis method tailored to dynamic AD data. Our +method features simple network design, extensive sensor modeling for both +camera and lidar -- including rolling shutter, beam divergence and ray dropping +-- and is applicable to multiple datasets out of the box. We verify its +performance on five popular AD datasets, achieving state-of-the-art performance +across the board. To encourage further development, we will openly release the +NeuRAD source code. See https://github.com/georghess/NeuRAD . + +
+
+
+
+
+ + ♻ ☆ Back to Basics: Fast Denoising Iterative Algorithm + + +
+ We introduce Back to Basics (BTB), a fast iterative algorithm for noise +reduction. Our method is computationally efficient, does not require training +or ground truth data, and can be applied in the presence of independent noise, +as well as correlated (coherent) noise, where the noise level is unknown. We +examine three study cases: natural image denoising in the presence of additive +white Gaussian noise, Poisson-distributed image denoising, and speckle +suppression in optical coherence tomography (OCT). Experimental results +demonstrate that the proposed approach can effectively improve image quality, +in challenging noise settings. Theoretical guarantees are provided for +convergence stability. + +
+
+
+
+
+ + ♻ ☆ XIMAGENET-12: An Explainable AI Benchmark Dataset for Model Robustness + Evaluation CVPR 2024 + + +
+ Despite the promising performance of existing visual models on public +benchmarks, the critical assessment of their robustness for real-world +applications remains an ongoing challenge. To bridge this gap, we propose an +explainable visual dataset, XIMAGENET-12, to evaluate the robustness of visual +models. XIMAGENET-12 consists of over 200K images with 15,410 manual semantic +annotations. Specifically, we deliberately selected 12 categories from +ImageNet, representing objects commonly encountered in practical life. To +simulate real-world situations, we incorporated six diverse scenarios, such as +overexposure, blurring, and color changes, etc. We further develop a +quantitative criterion for robustness assessment, allowing for a nuanced +understanding of how visual models perform under varying conditions, notably in +relation to the background. We make the XIMAGENET-12 dataset and its +corresponding code openly accessible at +\url{https://sites.google.com/view/ximagenet-12/home}. We expect the +introduction of the XIMAGENET-12 dataset will empower researchers to thoroughly +evaluate the robustness of their visual models under challenging conditions. + +
+
+ comment: Paper accepted by Synthetic Data for Computer Vision Workshop @ IEEE + CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Unified Physical-Digital Attack Detection Challenge + + +
+ Face Anti-Spoofing (FAS) is crucial to safeguard Face Recognition (FR) +Systems. In real-world scenarios, FRs are confronted with both physical and +digital attacks. However, existing algorithms often address only one type of +attack at a time, which poses significant limitations in real-world scenarios +where FR systems face hybrid physical-digital threats. To facilitate the +research of Unified Attack Detection (UAD) algorithms, a large-scale +UniAttackData dataset has been collected. UniAttackData is the largest public +dataset for Unified Attack Detection, with a total of 28,706 videos, where each +unique identity encompasses all advanced attack types. Based on this dataset, +we organized a Unified Physical-Digital Face Attack Detection Challenge to +boost the research in Unified Attack Detections. It attracted 136 teams for the +development phase, with 13 qualifying for the final round. The results +re-verified by the organizing team were used for the final ranking. This paper +comprehensively reviews the challenge, detailing the dataset introduction, +protocol definition, evaluation criteria, and a summary of published results. +Finally, we focus on the detailed analysis of the highest-performing algorithms +and offer potential directions for unified physical-digital attack detection +inspired by this competition. Challenge Website: +https://sites.google.com/view/face-anti-spoofing-challenge/welcome/challengecvpr2024. + +
+
+ comment: 11 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Bridging Stereo Geometry and BEV Representation with Reliable Mutual + Interaction for Semantic Scene Completion IJCAI2024 + + +
+ 3D semantic scene completion (SSC) is an ill-posed perception task that +requires inferring a dense 3D scene from limited observations. Previous +camera-based methods struggle to predict accurate semantic scenes due to +inherent geometric ambiguity and incomplete observations. In this paper, we +resort to stereo matching technique and bird's-eye-view (BEV) representation +learning to address such issues in SSC. Complementary to each other, stereo +matching mitigates geometric ambiguity with epipolar constraint while BEV +representation enhances the hallucination ability for invisible regions with +global semantic context. However, due to the inherent representation gap +between stereo geometry and BEV features, it is non-trivial to bridge them for +dense prediction task of SSC. Therefore, we further develop a unified +occupancy-based framework dubbed BRGScene, which effectively bridges these two +representations with dense 3D volumes for reliable semantic scene completion. +Specifically, we design a novel Mutual Interactive Ensemble (MIE) block for +pixel-level reliable aggregation of stereo geometry and BEV features. Within +the MIE block, a Bi-directional Reliable Interaction (BRI) module, enhanced +with confidence re-weighting, is employed to encourage fine-grained interaction +through mutual guidance. Besides, a Dual Volume Ensemble (DVE) module is +introduced to facilitate complementary aggregation through channel-wise +recalibration and multi-group voting. Our method outperforms all published +camera-based methods on SemanticKITTI for semantic scene completion. Our code +is available on \url{https://github.com/Arlo0o/StereoScene}. + +
+
+ comment: IJCAI2024 +
+
+
+
+
+ + ♻ ☆ Low-resolution Prior Equilibrium Network for CT Reconstruction + + +
+ The unrolling method has been investigated for learning variational models in
+X-ray computed tomography. However, it has been observed that directly
+unrolling the regularization model through gradient descent does not produce
+satisfactory results. In this paper, we present a novel deep learning-based CT
+reconstruction model, where the low-resolution image is introduced to obtain an
+effective regularization term for improving the network's robustness. Our
+approach involves constructing the backbone network architecture by algorithm
+unrolling that is realized using the deep equilibrium architecture. We
+theoretically discuss the convergence of the proposed low-resolution prior
+equilibrium model and provide the conditions to guarantee convergence.
+Experimental results on both sparse-view and limited-angle reconstruction
+problems are provided, demonstrating that our end-to-end low-resolution prior
+equilibrium model outperforms other state-of-the-art methods in terms of noise
+reduction, contrast-to-noise ratio, and preservation of edge details.
+
+
+
+
+
+
+ + ♻ ☆ Bootstrapping Autonomous Driving Radars with Self-Supervised Learning + + +
+ The perception of autonomous vehicles using radars has attracted increased
+research interest due to its ability to operate in fog and bad weather.
+However, training radar models is hindered by the cost and difficulty of
+annotating large-scale radar data. To overcome this bottleneck, we propose a
+self-supervised learning framework to leverage the large amount of unlabeled
+radar data to pre-train radar-only embeddings for self-driving perception
+tasks. The proposed method combines radar-to-radar and radar-to-vision
+contrastive losses to learn a general representation from unlabeled radar
+heatmaps paired with their corresponding camera images. When used for
+downstream object detection, we demonstrate that the proposed self-supervision
+framework can improve the accuracy of state-of-the-art supervised baselines by
+$5.8\%$ in mAP. Code is available at \url{https://github.com/yiduohao/Radical}.
+
+
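+ The radar-to-vision term can be sketched as a generic InfoNCE-style
+cross-modal contrastive loss pairing each radar-heatmap embedding with the
+embedding of its corresponding camera frame. Encoders are stubbed with random
+features; the temperature and batch size are illustrative assumptions rather
+than the paper's settings.
+import numpy as np
+
+rng = np.random.default_rng(0)
+B, D, tau = 8, 128, 0.07
+
+def l2_normalize(x):
+    return x / np.linalg.norm(x, axis=1, keepdims=True)
+
+radar_emb = l2_normalize(rng.standard_normal((B, D)))    # radar-heatmap encoder output
+vision_emb = l2_normalize(rng.standard_normal((B, D)))   # paired camera-image encoder output
+
+logits = radar_emb @ vision_emb.T / tau                  # (B, B) similarity matrix
+# Row i's positive is column i (the paired camera frame); all other columns are negatives.
+log_probs = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
+loss = -np.mean(np.diag(log_probs))
+print(f"radar-to-vision InfoNCE loss: {loss:.3f}")
+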
+
+ comment: 12 pages, 5 figures, to be published in Proceedings of the IEEE/CVF + Conference on Computer Vision and Pattern Recognition 2024 +
+
+
+
+
+ + ♻ ☆ DualFluidNet: an Attention-based Dual-pipeline Network for FLuid + Simulation + + +
+ Fluid motion can be considered as a point cloud transformation when using the +SPH method. Compared to traditional numerical analysis methods, using machine +learning techniques to learn physics simulations can achieve near-accurate +results, while significantly increasing efficiency. In this paper, we propose +an innovative approach for 3D fluid simulations utilizing an Attention-based +Dual-pipeline Network, which employs a dual-pipeline architecture, seamlessly +integrated with an Attention-based Feature Fusion Module. Unlike previous +methods, which often make difficult trade-offs between global fluid control and +physical law constraints, we find a way to achieve a better balance between +these two crucial aspects with a well-designed dual-pipeline approach. +Additionally, we design a Type-aware Input Module to adaptively recognize +particles of different types and perform feature fusion afterward, such that +fluid-solid coupling issues can be better dealt with. Furthermore, we propose a +new dataset, Tank3D, to further explore the network's ability to handle more +complicated scenes. The experiments demonstrate that our approach not only +attains a quantitative enhancement in various metrics, surpassing the +state-of-the-art methods but also signifies a qualitative leap in neural +network-based simulation by faithfully adhering to the physical laws. Code and +video demonstrations are available at +https://github.com/chenyu-xjtu/DualFluidNet. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ AesExpert: Towards Multi-modality Foundation Model for Image Aesthetics + Perception + + +
+ The highly abstract nature of image aesthetics perception (IAP) poses a
+significant challenge for current multimodal large language models (MLLMs). The
+lack of human-annotated multi-modality aesthetic data further exacerbates this
+dilemma, resulting in MLLMs falling short of aesthetics perception
+capabilities. To address the above challenge, we first introduce a
+comprehensively annotated Aesthetic Multi-Modality Instruction Tuning (AesMMIT)
+dataset, which serves as the cornerstone for building multi-modality aesthetics
+foundation models. Specifically, to align MLLMs with human aesthetics
+perception, we construct a corpus-rich aesthetic critique database with 21,904
+diverse-sourced images and 88K human natural language feedbacks, which are
+collected via progressive questions, ranging from coarse-grained aesthetic
+grades to fine-grained aesthetic descriptions. To ensure that MLLMs can handle
+diverse queries, we further prompt GPT to refine the aesthetic critiques and
+assemble the large-scale aesthetic instruction tuning dataset, i.e., AesMMIT,
+which consists of 409K multi-typed instructions to activate stronger aesthetic
+capabilities. Based on the AesMMIT database, we fine-tune the open-sourced
+general foundation models, achieving multi-modality Aesthetic Expert models,
+dubbed AesExpert. Extensive experiments demonstrate that the proposed AesExpert
+models deliver significantly better aesthetic perception performances than the
+state-of-the-art MLLMs, including the most advanced GPT-4V and
+Gemini-Pro-Vision. Source data will be available at
+https://github.com/yipoh/AesExpert.
+
+
+
+
+
+
+ + ♻ ☆ FaceFilterSense: A Filter-Resistant Face Recognition and Facial + Attribute Analysis Framework + + +
+ With the advent of social media, fun selfie filters have come into tremendous
+mainstream use, affecting the functioning of facial biometric systems as well
+as image recognition systems. These filters vary from beautification filters
+and Augmented Reality (AR)-based filters to filters that modify facial
+landmarks. Hence, there is a need to assess the impact of such filters on the
+performance of existing face recognition systems. The limitation of existing
+solutions is that they focus mainly on beautification filters. However, the
+current AR-based filters and filters which distort facial key points have
+recently come into vogue and make faces highly unrecognizable even to the naked
+eye. Also, the filters considered are mostly obsolete, with limited variations.
+To mitigate these limitations, we aim to perform a holistic impact analysis of
+the latest filters and propose a user recognition model with the filtered
+images. We have utilized a benchmark dataset for baseline images, and applied
+the latest filters over them to generate a beautified/filtered dataset. Next,
+we have introduced a model, FaceFilterNet, for beautified user recognition. In
+this framework, we also utilize our model to comment on various attributes of
+the person, including age, gender, and ethnicity. In addition, we have also
+presented a filter-wise impact analysis on face recognition, age estimation,
+gender, and ethnicity prediction. The proposed method affirms the efficacy of
+our dataset with an accuracy of 87.25% and an optimal accuracy for facial
+attribute analysis.
+
+
+
+
+
+
+ + ♻ ☆ Cross-view and Cross-pose Completion for 3D Human Understanding CVPR 2024 + + +
+ Human perception and understanding is a major domain of computer vision +which, like many other vision subdomains recently, stands to gain from the use +of large models pre-trained on large datasets. We hypothesize that the most +common pre-training strategy of relying on general purpose, object-centric +image datasets such as ImageNet, is limited by an important domain shift. On +the other hand, collecting domain-specific ground truth such as 2D or 3D labels +does not scale well. Therefore, we propose a pre-training approach based on +self-supervised learning that works on human-centric data using only images. +Our method uses pairs of images of humans: the first is partially masked and +the model is trained to reconstruct the masked parts given the visible ones and +a second image. It relies on both stereoscopic (cross-view) pairs, and temporal +(cross-pose) pairs taken from videos, in order to learn priors about 3D as well +as human motion. We pre-train a model for body-centric tasks and one for +hand-centric tasks. With a generic transformer architecture, these models +outperform existing self-supervised pre-training methods on a wide set of +human-centric downstream tasks, and obtain state-of-the-art performance for +instance when fine-tuning for model-based and model-free human mesh recovery. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Predicting and Enhancing the Fairness of DNNs with the Curvature of + Perceptual Manifolds CVPR 2023 + + +
+ To address the challenges of long-tailed classification, researchers have
+proposed several approaches to reduce model bias, most of which assume that
+classes with few samples are weak classes. However, recent studies have shown
+that tail classes are not always hard to learn, and model bias has been
+observed on sample-balanced datasets, suggesting the existence of other factors
+that affect model bias. In this work, we first establish a geometric
+perspective for analyzing model fairness and then systematically propose a
+series of geometric measurements for perceptual manifolds in deep neural
+networks. Subsequently, we comprehensively explore the effect of the geometric
+characteristics of perceptual manifolds on classification difficulty and how
+learning shapes the geometric characteristics of perceptual manifolds. An
+unanticipated finding is that the correlation between the class accuracy and
+the separation degree of perceptual manifolds gradually decreases during
+training, while the negative correlation with the curvature gradually
+increases, implying that curvature imbalance leads to model bias. Building upon
+these observations, we propose curvature regularization to facilitate the model
+to learn curvature-balanced and flatter perceptual manifolds. Evaluations on
+multiple long-tailed and non-long-tailed datasets show the excellent
+performance and exciting generality of our approach, especially in achieving
+significant performance improvements based on current state-of-the-art
+techniques. Our work opens up a geometric analysis perspective on model bias
+and reminds researchers to pay attention to model bias on non-long-tailed and
+even sample-balanced datasets.
+
+
+
+ comment: 17 pages, Accepted by CVPR 2023, Submitted to TPAMI
+
+
+
+
+
+ + ♻ ☆ MARformer: An Efficient Metal Artifact Reduction Transformer for Dental + CBCT Images + + +
+ Cone Beam Computed Tomography (CBCT) plays a key role in dental diagnosis and
+surgery. However, metal tooth implants can introduce annoying metal artifacts
+during the CBCT imaging process, interfering with diagnosis and downstream
+processing such as tooth segmentation. In this paper, we develop an efficient
+Transformer to perform metal artifact reduction (MAR) on dental CBCT images.
+The proposed MAR Transformer (MARformer) reduces the computational complexity
+of multihead self-attention with a new Dimension-Reduced Self-Attention (DRSA)
+module, exploiting the fact that CBCT images have a globally similar structure.
+A Patch-wise Perceptive Feed Forward Network (P2FFN) is also proposed to
+perceive local image information for fine-grained restoration. Experimental
+results on CBCT images with synthetic and real-world metal artifacts show that
+our MARformer is efficient and outperforms previous MAR methods and two
+restoration Transformers.
+
+
+
+ comment: under consideration of Computer Vision and Image Understanding + journal +
+
+
+
+
+ + ♻ ☆ PDE-CNNs: Axiomatic Derivations and Applications + + +
+ PDE-based Group Convolutional Neural Networks (PDE-G-CNNs) utilize solvers of +geometrically meaningful evolution PDEs as substitutes for the conventional +components in G-CNNs. PDE-G-CNNs offer several key benefits all at once: fewer +parameters, inherent equivariance, better performance, data efficiency, and +geometric interpretability. + In this article we focus on Euclidean equivariant PDE-G-CNNs where the +feature maps are two dimensional throughout. We call this variant of the +framework a PDE-CNN. + From a machine learning perspective, we list several practically desirable +axioms and derive from these which PDEs should be used in a PDE-CNN. Here our +approach to geometric learning via PDEs is inspired by the axioms of classical +linear and morphological scale-space theory, which we generalize by introducing +semifield-valued signals. + Furthermore, we experimentally confirm for small networks that PDE-CNNs offer +fewer parameters, increased performance, and better data efficiency when +compared to CNNs. We also investigate what effect the use of different +semifields has on the performance of the models. + +
+
+
+
+
+ + ♻ ☆ Stronger, Fewer, & Superior: Harnessing Vision Foundation Models for + Domain Generalized Semantic Segmentation + + +
+ In this paper, we first assess and harness various Vision Foundation Models
+(VFMs) in the context of Domain Generalized Semantic Segmentation (DGSS).
+Driven by the motivation of Leveraging Stronger pre-trained models and Fewer
+trainable parameters for Superior generalizability, we introduce a robust
+fine-tuning approach, namely Rein, to parameter-efficiently harness VFMs for
+DGSS. Built upon a set of trainable tokens, each linked to distinct instances,
+Rein precisely refines and forwards the feature maps from each layer to the
+next layer within the backbone. This process produces diverse refinements for
+different categories within a single image. With fewer trainable parameters,
+Rein efficiently fine-tunes VFMs for DGSS tasks, surprisingly surpassing full
+parameter fine-tuning. Extensive experiments across various settings
+demonstrate that Rein significantly outperforms state-of-the-art methods.
+Remarkably, with just an extra 1% of trainable parameters within the frozen
+backbone, Rein achieves a mIoU of 78.4% on Cityscapes, without accessing any
+real urban-scene datasets. Code is available at
+https://github.com/w1oves/Rein.git.
+
+
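+ One possible reading of the token-based refinement, sketched below: a small
+set of trainable tokens is cross-attended by a frozen layer's patch features
+and the result is added as a residual before the next layer. Dimensions and the
+update rule are assumptions; see https://github.com/w1oves/Rein.git for the
+actual implementation.
+import numpy as np
+
+rng = np.random.default_rng(0)
+L, D, T = 196, 768, 16                      # patch tokens, feature dim, trainable tokens
+
+feat = rng.standard_normal((L, D))          # frozen-backbone layer output (kept fixed)
+tokens = 0.01 * rng.standard_normal((T, D)) # the only new trainable parameters here
+
+def softmax(x, axis=-1):
+    e = np.exp(x - x.max(axis=axis, keepdims=True))
+    return e / e.sum(axis=axis, keepdims=True)
+
+# Cross-attention: each patch feature queries the learnable tokens.
+attn = softmax(feat @ tokens.T / np.sqrt(D))    # (L, T)
+refined_feat = feat + attn @ tokens             # residual refinement forwarded to the next layer
+print(refined_feat.shape)
+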
+
+
+
+
+ + ♻ ☆ WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A + Conceptual Architecture + + +
+ This work proposes a WebXR-based cross-platform conceptual architecture, +leveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate +the development of an open, accessible, and interoperable metaverse. By +introducing the concept of spatial web app, this research contributes to the +discourse on the metaverse, offering an architecture that democratizes access +to virtual environments and extended reality through the web, and aligns with +Tim Berners-Lee's original vision of the World Wide Web as an open platform in +the digital realm. + +
+
+ comment: minor fixes/rephrasing +
+
+
+
+
+ + ♻ ☆ JointViT: Modeling Oxygen Saturation Levels with Joint Supervision on + Long-Tailed OCTA + + +
+ The oxygen saturation level in the blood (SaO2) is crucial for health, +particularly in relation to sleep-related breathing disorders. However, +continuous monitoring of SaO2 is time-consuming and highly variable depending +on patients' conditions. Recently, optical coherence tomography angiography +(OCTA) has shown promising development in rapidly and effectively screening +eye-related lesions, offering the potential for diagnosing sleep-related +disorders. To bridge this gap, our paper presents three key contributions. +Firstly, we propose JointViT, a novel model based on the Vision Transformer +architecture, incorporating a joint loss function for supervision. Secondly, we +introduce a balancing augmentation technique during data preprocessing to +improve the model's performance, particularly on the long-tail distribution +within the OCTA dataset. Lastly, through comprehensive experiments on the OCTA +dataset, our proposed method significantly outperforms other state-of-the-art +methods, achieving improvements of up to 12.28% in overall accuracy. This +advancement lays the groundwork for the future utilization of OCTA in +diagnosing sleep-related disorders. See project website +https://steve-zeyu-zhang.github.io/JointViT + +
+
+
+
+
+ + ♻ ☆ Quantifying and Enhancing Multi-modal Robustness with Modality + Preference ICLR 2024 + + +
+ Multi-modal models have shown a promising capability to effectively integrate
+information from various sources, yet they are also found to be vulnerable to
+pervasive perturbations, such as uni-modal attacks and missing conditions. To
+counter these perturbations, robust multi-modal representations are highly
+expected, which are positioned well away from the discriminative multi-modal
+decision boundary. In this paper, different from conventional empirical
+studies, we focus on a commonly used joint multi-modal framework and
+theoretically discover that larger uni-modal representation margins and more
+reliable integration for modalities are essential components for achieving
+higher robustness. This discovery can further explain the limitation of
+multi-modal robustness and the phenomenon that multi-modal models are often
+vulnerable to attacks on a specific modality. Moreover, our analysis reveals
+how the widespread issue that the model has different preferences for
+modalities limits multi-modal robustness by influencing the essential
+components and can make attacks on a specific modality highly effective.
+Inspired by our theoretical finding, we introduce a training procedure called
+Certifiable Robust Multi-modal Training (CRMT), which can alleviate this
+influence from modality preference and explicitly regulate essential components
+to significantly improve robustness in a certifiable manner. Our method
+demonstrates substantial improvements in performance and robustness compared
+with existing methods. Furthermore, our training procedure can be easily
+extended to enhance other robust training strategies, highlighting its
+credibility and flexibility.
+
+
+
+ comment: Accepted to ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Relaxed forced choice improves performance of visual quality assessment + methods + + +
+ In image quality assessment, a collective visual quality score for an image +or video is obtained from the individual ratings of many subjects. One commonly +used format for these experiments is the two-alternative forced choice method. +Two stimuli with the same content but differing visual quality are presented +sequentially or side-by-side. Subjects are asked to select the one of better +quality, and when uncertain, they are required to guess. The relaxed +alternative forced choice format aims to reduce the cognitive load and the +noise in the responses due to the guessing by providing a third response +option, namely, ``not sure''. This work presents a large and comprehensive +crowdsourcing experiment to compare these two response formats: the one with +the ``not sure'' option and the one without it. To provide unambiguous ground +truth for quality evaluation, subjects were shown pairs of images with +differing numbers of dots and asked each time to choose the one with more dots. +Our crowdsourcing study involved 254 participants and was conducted using a +within-subject design. Each participant was asked to respond to 40 pair +comparisons with and without the ``not sure'' response option and completed a +questionnaire to evaluate their cognitive load for each testing condition. The +experimental results show that the inclusion of the ``not sure'' response +option in the forced choice method reduced mental load and led to models with +better data fit and correspondence to ground truth. We also tested for the +equivalence of the models and found that they were different. The dataset is +available at http://database.mmsp-kn.de/cogvqa-database.html. + +
+
+ comment: 6 pages, 3 figures, accepted at the 2023 15th International + Conference on Quality of Multimedia Experience (QoMEX). Database is publicly + accessible at http://database.mmsp-kn.de/cogvqa-database.html +
+
+
+
+
+ + ♻ ☆ Terrain-Informed Self-Supervised Learning: Enhancing Building Footprint + Extraction from LiDAR Data with Limited Annotations + + +
+ Estimating building footprint maps from geospatial data is of paramount +importance in urban planning, development, disaster management, and various +other applications. Deep learning methodologies have gained prominence in +building segmentation maps, offering the promise of precise footprint +extraction without extensive post-processing. However, these methods face +challenges in generalization and label efficiency, particularly in remote +sensing, where obtaining accurate labels can be both expensive and +time-consuming. To address these challenges, we propose terrain-aware +self-supervised learning, tailored to remote sensing, using digital elevation +models from LiDAR data. We propose to learn a model to differentiate between +bare Earth and superimposed structures enabling the network to implicitly learn +domain-relevant features without the need for extensive pixel-level +annotations. We test the effectiveness of our approach by evaluating building +segmentation performance on test datasets with varying label fractions. +Remarkably, with only 1% of the labels (equivalent to 25 labeled examples), our +method improves over ImageNet pre-training, showing the advantage of leveraging +unlabeled data for feature extraction in the domain of remote sensing. The +performance improvement is more pronounced in few-shot scenarios and gradually +closes the gap with ImageNet pre-training as the label fraction increases. We +test on a dataset characterized by substantial distribution shifts and labeling +errors to demonstrate the generalizability of our approach. When compared to +other baselines, including ImageNet pretraining and more complex architectures, +our approach consistently performs better, demonstrating the efficiency and +effectiveness of self-supervised terrain-aware feature learning. + +
+
+
+
+
+ + ♻ ☆ Octopus v3: Technical Report for On-device Sub-billion Multimodal AI + Agent + + +
+ A multimodal AI agent is characterized by its ability to process and learn from various types of data, including natural language, visual, and audio inputs, to inform its actions. Despite advancements in large language models that incorporate visual data, such as GPT-4V, effectively translating image-based data into actionable outcomes for AI agents continues to be challenging. In this paper, we introduce a multimodal model that incorporates the concept of a functional token, specifically designed for AI agent applications. To ensure compatibility with edge devices, our model is optimized to a compact size of less than 1B parameters. Like GPT-4, our model can process both English and Chinese. We demonstrate that this model is capable of operating efficiently on a wide range of edge devices, including devices as constrained as a Raspberry Pi.
+
+
+
+
+ + ♻ ☆ Mobile-Agent: Autonomous Multi-Modal Mobile Device Agent with Visual + Perception ICLR 2024 + + +
+ Mobile device agents based on Multimodal Large Language Models (MLLM) are becoming a popular application. In this paper, we introduce Mobile-Agent, an autonomous multi-modal mobile device agent. Mobile-Agent first leverages visual perception tools to accurately identify and locate both the visual and textual elements within the app's front-end interface. Based on the perceived vision context, it then autonomously plans and decomposes the complex operation task, and navigates mobile apps step by step through these operations. Unlike previous solutions that rely on apps' XML files or mobile system metadata, Mobile-Agent allows for greater adaptability across diverse mobile operating environments in a vision-centric way, thereby eliminating the necessity for system-specific customizations. To assess the performance of Mobile-Agent, we introduce Mobile-Eval, a benchmark for evaluating mobile device operations. Based on Mobile-Eval, we conducted a comprehensive evaluation of Mobile-Agent. The experimental results indicate that Mobile-Agent achieves remarkable accuracy and completion rates. Even with challenging instructions, such as multi-app operations, Mobile-Agent can still complete the requirements. Code and model will be open-sourced at https://github.com/X-PLUG/MobileAgent.
+
+ comment: Accepted by ICLR 2024 Workshop in Large Language Model (LLM) Agents +
+
+
+
+
+ + ♻ ☆ SCT: A Simple Baseline for Parameter-Efficient Fine-Tuning via Salient + Channels + + +
+ Pre-trained vision transformers provide strong representations that benefit various downstream tasks. Recently, many parameter-efficient fine-tuning (PEFT) methods have been proposed, and their experiments demonstrate that tuning only 1% extra parameters can surpass full fine-tuning in low-data resource scenarios. However, these methods overlook task-specific information when fine-tuning diverse downstream tasks. In this paper, we propose a simple yet effective method called "Salient Channel Tuning" (SCT) to leverage task-specific information by forwarding task images through the model to select a subset of channels in a feature map, enabling us to tune only 1/8 of the channels and leading to significantly lower parameter costs. SCT outperforms full fine-tuning on 18 out of 19 tasks in the VTAB-1K benchmark while adding only 0.11M parameters to ViT-B, which is 780x fewer than its full fine-tuning counterpart. Furthermore, on domain generalization and few-shot learning, SCT surpasses other PEFT methods with lower parameter costs, demonstrating the strong capability and effectiveness of our tuning technique in the low-data regime.
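+ The abstract does not give the exact saliency criterion, so the sketch below only illustrates the general recipe: forward a few task images, rank feature-map channels by a simple score, and mark only the selected slice as trainable. The magnitude-based score, the timm-style forward_features call, and the additive channel-offset parameterization are assumptions, not the paper's implementation.

```python
import torch

@torch.no_grad()
def select_salient_channels(model, task_images, ratio=1/8):
    """Rank channels of a feature map by mean absolute activation on a
    handful of task images and keep the top `ratio` fraction of them."""
    feats = model.forward_features(task_images)       # assumed (B, N, C) for a ViT
    saliency = feats.abs().mean(dim=(0, 1))           # one score per channel
    k = max(1, int(saliency.numel() * ratio))
    return torch.topk(saliency, k).indices            # channel indices to tune

def make_channel_delta(model, channel_idx):
    """Freeze the backbone and create a per-channel offset whose gradient
    is masked to the selected channels (apply as feats + mask * delta)."""
    for p in model.parameters():
        p.requires_grad_(False)
    delta = torch.nn.Parameter(torch.zeros(model.embed_dim))
    mask = torch.zeros(model.embed_dim)
    mask[channel_idx] = 1.0
    return delta, mask
```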
+
+ comment: This work has been accepted by IJCV2023 +
+
+
+
+
+ + ♻ ☆ Streaming Anchor Loss: Augmenting Supervision with Temporal Significance ICASSP 2024 + + +
+ Streaming neural network models for fast frame-wise responses to various +speech and sensory signals are widely adopted on resource-constrained +platforms. Hence, increasing the learning capacity of such streaming models +(i.e., by adding more parameters) to improve the predictive power may not be +viable for real-world tasks. In this work, we propose a new loss, Streaming +Anchor Loss (SAL), to better utilize the given learning capacity by encouraging +the model to learn more from essential frames. More specifically, our SAL and +its focal variations dynamically modulate the frame-wise cross entropy loss +based on the importance of the corresponding frames so that a higher loss +penalty is assigned for frames within the temporal proximity of semantically +critical events. Therefore, our loss ensures that the model training focuses on +predicting the relatively rare but task-relevant frames. Experimental results +with standard lightweight convolutional and recurrent streaming networks on +three different speech based detection tasks demonstrate that SAL enables the +model to learn the overall task more effectively with improved accuracy and +latency, without any additional data, model parameters, or architectural +changes. + +
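+ A minimal sketch of the idea behind SAL, assuming anchor frames are given as a binary mask: the frame-wise cross entropy is reweighted so frames close to a critical event receive a larger penalty. The exponential weighting schedule and its hyperparameters are illustrative assumptions, not the paper's exact formulation or its focal variants.

```python
import torch
import torch.nn.functional as F

def streaming_anchor_loss(frame_logits, frame_labels, anchor_mask,
                          max_weight=3.0, decay=0.5):
    """Frame-wise cross entropy whose per-frame weight grows near
    'anchor' (semantically critical) frames.
    frame_logits: (T, C), frame_labels: (T,), anchor_mask: (T,) in {0, 1}."""
    T = frame_labels.shape[0]
    anchor_idx = anchor_mask.nonzero(as_tuple=True)[0].float()
    t = torch.arange(T, dtype=torch.float32)
    if anchor_idx.numel() == 0:
        weights = torch.ones(T)
    else:
        # distance (in frames) from each frame to the nearest anchor frame
        dist = (t[:, None] - anchor_idx[None, :]).abs().min(dim=1).values
        weights = 1.0 + (max_weight - 1.0) * torch.exp(-decay * dist)
    ce = F.cross_entropy(frame_logits, frame_labels, reduction="none")
    return (weights * ce).mean()
```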
+
+ comment: Published at IEEE ICASSP 2024, please see + https://ieeexplore.ieee.org/abstract/document/10447222 +
+
+
+
+
+ + ♻ ☆ Dynamic Typography: Bringing Text to Life via Video Diffusion Prior + + +
+ Text animation serves as an expressive medium, transforming static +communication into dynamic experiences by infusing words with motion to evoke +emotions, emphasize meanings, and construct compelling narratives. Crafting +animations that are semantically aware poses significant challenges, demanding +expertise in graphic design and animation. We present an automated text +animation scheme, termed "Dynamic Typography", which combines two challenging +tasks. It deforms letters to convey semantic meaning and infuses them with +vibrant movements based on user prompts. Our technique harnesses vector +graphics representations and an end-to-end optimization-based framework. This +framework employs neural displacement fields to convert letters into base +shapes and applies per-frame motion, encouraging coherence with the intended +textual concept. Shape preservation techniques and perceptual loss +regularization are employed to maintain legibility and structural integrity +throughout the animation process. We demonstrate the generalizability of our +approach across various text-to-video models and highlight the superiority of +our end-to-end methodology over baseline methods, which might comprise separate +tasks. Through quantitative and qualitative evaluations, we demonstrate the +effectiveness of our framework in generating coherent text animations that +faithfully interpret user prompts while maintaining readability. Our code is +available at: https://animate-your-word.github.io/demo/. + +
+
+ comment: Our demo page is available at: + https://animate-your-word.github.io/demo/ +
+
+
+
+
+ + ♻ ☆ AID: Attention Interpolation of Text-to-Image Diffusion + + +
+ Conditional diffusion models can create unseen images in various settings, aiding image interpolation. Interpolation in latent spaces is well-studied, but interpolation with specific conditions like text or poses is less understood. Simple approaches, such as linear interpolation in the space of conditions, often result in images that lack consistency, smoothness, and fidelity. To that end, we introduce a novel training-free technique named Attention Interpolation via Diffusion (AID). Our key contributions include 1) proposing an inner/outer interpolated attention layer; 2) fusing the interpolated attention with self-attention to boost fidelity; and 3) applying a beta distribution to the selection of interpolation coefficients to increase smoothness. We also present a variant, Prompt-guided Attention Interpolation via Diffusion (PAID), that considers interpolation as a condition-dependent generative process. This method enables the creation of new images with greater consistency, smoothness, and efficiency, and offers control over the exact path of interpolation. Our approach demonstrates effectiveness for conceptual and spatial interpolation. Code and demo are available at https://github.com/QY-H00/attention-interpolation-diffusion.
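+ The abstract only names the components, so the toy sketch below gestures at one plausible reading of "outer" interpolated attention: blend the queries of the two endpoint conditions and attend over their concatenated keys and values, with interpolation steps drawn from a Beta distribution for smoother spacing. All of this is an assumption for illustration rather than the paper's exact AID/PAID construction.

```python
import torch

def interpolated_cross_attention(q_a, q_b, k_a, v_a, k_b, v_b, alpha):
    """Toy 'outer' interpolation: queries are blended between the two
    endpoint conditions, and attention runs over the concatenated
    keys/values of both conditions."""
    q = (1 - alpha) * q_a + alpha * q_b                 # (B, L, D)
    k = torch.cat([k_a, k_b], dim=1)                    # (B, 2S, D)
    v = torch.cat([v_a, v_b], dim=1)
    attn = torch.softmax(q @ k.transpose(1, 2) / q.shape[-1] ** 0.5, dim=-1)
    return attn @ v                                     # (B, L, D)

# interpolation steps sampled from a Beta distribution for smoother spacing
alphas = torch.distributions.Beta(2.0, 2.0).sample((8,)).sort().values
```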
+
+
+
+
+ + ♻ ☆ A Survey on 3D Egocentric Human Pose Estimation + + +
+ Egocentric human pose estimation aims to estimate human body poses and +develop body representations from a first-person camera perspective. It has +gained vast popularity in recent years because of its wide range of +applications in sectors like XR-technologies, human-computer interaction, and +fitness tracking. However, to the best of our knowledge, there is no systematic +literature review based on the proposed solutions regarding egocentric 3D human +pose estimation. To that end, the aim of this survey paper is to provide an +extensive overview of the current state of egocentric pose estimation research. +In this paper, we categorize and discuss the popular datasets and the different +pose estimation models, highlighting the strengths and weaknesses of different +methods by comparative analysis. This survey can be a valuable resource for +both researchers and practitioners in the field, offering insights into key +concepts and cutting-edge solutions in egocentric pose estimation, its +wide-ranging applications, as well as the open problems with future scope. + +
+
+
+
+
+ + ♻ ☆ ViGoR: Improving Visual Grounding of Large Vision Language Models with + Fine-Grained Reward Modeling + + +
+ By combining the natural language understanding, generation capabilities, and breadth of knowledge of large language models with image perception, recent large vision language models (LVLMs) have shown unprecedented visual reasoning capabilities. However, the generated text often suffers from inaccurate grounding in the visual input, resulting in errors such as hallucination of nonexistent scene elements, missing significant parts of the scene, and inferring incorrect attributes of and relationships between objects. To address these issues, we introduce a novel framework, ViGoR (Visual Grounding Through Fine-Grained Reward Modeling), that utilizes fine-grained reward modeling to significantly enhance the visual grounding of LVLMs over pre-trained baselines. This improvement is achieved efficiently, using much cheaper human evaluations instead of full supervision, as well as automated methods. We show the effectiveness of our approach through a variety of evaluation methods and benchmarks. Additionally, we plan to release our human annotations, comprising approximately 16,000 images and generated text pairs with fine-grained evaluations, to contribute to related research in the community.
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Single-temporal Supervised Remote Change Detection for Domain + Generalization + + +
+ Change detection is widely applied in remote sensing image analysis. Existing methods require training models separately for each dataset, which leads to poor domain generalization. Moreover, these methods rely heavily on large amounts of high-quality pair-labelled data for training, which is expensive and impractical. In this paper, we propose a multimodal contrastive learning framework (ChangeCLIP) based on vision-language pre-training for change detection domain generalization. Additionally, we propose a dynamic context optimization for prompt learning. Meanwhile, to address the data dependency issue of existing methods, we introduce a single-temporal and controllable AI-generated training strategy (SAIN). This allows us to train the model using a large number of single-temporal images without image pairs in the real world, achieving excellent generalization. Extensive experiments on a series of real change detection datasets validate the superiority and strong generalization of ChangeCLIP, outperforming state-of-the-art change detection methods. Code will be available.
+
+
+
+
+ + ♻ ☆ MVDream: Multi-view Diffusion for 3D Generation + + +
+ We introduce MVDream, a diffusion model that is able to generate consistent +multi-view images from a given text prompt. Learning from both 2D and 3D data, +a multi-view diffusion model can achieve the generalizability of 2D diffusion +models and the consistency of 3D renderings. We demonstrate that such a +multi-view diffusion model is implicitly a generalizable 3D prior agnostic to +3D representations. It can be applied to 3D generation via Score Distillation +Sampling, significantly enhancing the consistency and stability of existing +2D-lifting methods. It can also learn new concepts from a few 2D examples, akin +to DreamBooth, but for 3D generation. + +
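+ Score Distillation Sampling itself is a standard recipe; a compact sketch of its gradient, applied per rendered view with a diffusion model acting as the prior, is shown below. The diffusion.add_noise and diffusion.pred_noise calls are placeholder names for the usual noising and noise-prediction steps, not an actual MVDream API.

```python
import torch

def sds_grad(diffusion, rendered, text_emb, t, weight_fn=lambda t: 1.0):
    """Standard Score Distillation Sampling gradient for one rendered view."""
    noise = torch.randn_like(rendered)
    noisy = diffusion.add_noise(rendered, noise, t)          # x_t (placeholder call)
    with torch.no_grad():
        eps_pred = diffusion.pred_noise(noisy, t, text_emb)  # epsilon_theta (placeholder)
    # gradient w.r.t. the rendered image; the U-Net Jacobian is omitted, as in SDS
    return weight_fn(t) * (eps_pred - noise)
```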
+
+ comment: Reorganized for arXiv; Our project page is https://MV-Dream.github.io +
+
+
+
+
+ + ♻ ☆ Explaining latent representations of generative models with large + multimodal models ICLR 2024 + + +
+ Learning interpretable representations of the latent factors underlying data generation is an important topic for the development of artificial intelligence. Large multimodal models, which have recently risen to prominence, can align images with text to generate answers. In this work, we propose a framework to comprehensively explain each latent variable in generative models using a large multimodal model. We further measure the uncertainty of our generated explanations, quantitatively evaluate the performance of explanation generation among multiple large multimodal models, and qualitatively visualize the variations of each latent variable to examine how the disentanglement properties of different generative models affect the explanations. Finally, we discuss the explanatory capabilities and limitations of state-of-the-art large multimodal models.
+
+ comment: ICLR 2024 Workshop on Reliable and Responsible Foundation Models +
+
+
+
+
+ + ♻ ☆ CoReS: Orchestrating the Dance of Reasoning and Segmentation + + +
+ The reasoning segmentation task, which demands a nuanced comprehension of +intricate queries to accurately pinpoint object regions, is attracting +increasing attention. However, Multi-modal Large Language Models (MLLM) often +find it difficult to accurately localize the objects described in complex +reasoning contexts. We believe that the act of reasoning segmentation should +mirror the cognitive stages of human visual search, where each step is a +progressive refinement of thought toward the final object. Thus we introduce +the Chains of Reasoning and Segmenting (CoReS) and find this top-down visual +hierarchy indeed enhances the visual search process. Specifically, we propose a +dual-chain structure that generates multi-modal, chain-like outputs to aid the +segmentation process. Furthermore, to steer the MLLM's outputs into this +intended hierarchy, we incorporate in-context inputs as guidance. Extensive +experiments demonstrate the superior performance of our CoReS, which surpasses +the state-of-the-art method by 7.1\% on the ReasonSeg dataset. Project: +https://chain-of-reasoning-and-segmentation.github.io/. + +
+
+
+
+
+ + ♻ ☆ DeblurGS: Gaussian Splatting for Camera Motion Blur + + +
+ Although significant progress has been made in reconstructing sharp 3D scenes from motion-blurred images, a transition to real-world applications remains challenging. The primary obstacle stems from the severe blur, which leads to inaccuracies in the acquisition of initial camera poses through Structure-from-Motion, a critical aspect often overlooked by previous approaches. To address this challenge, we propose DeblurGS, a method to optimize sharp 3D Gaussian Splatting from motion-blurred images, even with noisy camera pose initialization. We restore a fine-grained sharp scene by leveraging the remarkable reconstruction capability of 3D Gaussian Splatting. Our approach estimates the 6-Degree-of-Freedom camera motion for each blurry observation and synthesizes corresponding blurry renderings for the optimization process. Furthermore, we propose a Gaussian Densification Annealing strategy to prevent the generation of inaccurate Gaussians at erroneous locations during the early training stages, when camera motion is still imprecise. Comprehensive experiments demonstrate that our DeblurGS achieves state-of-the-art performance in deblurring and novel view synthesis on real-world and synthetic benchmark datasets, as well as field-captured blurry smartphone videos.
+
+
+
+
+ + ♻ ☆ Supervised Contrastive Vision Transformer for Breast Histopathological + Image Classification + + +
+ Invasive ductal carcinoma (IDC) is the most prevalent form of breast cancer. +Breast tissue histopathological examination is critical in diagnosing and +classifying breast cancer. Although existing methods have shown promising +results, there is still room for improvement in the classification accuracy and +generalization of IDC using histopathology images. We present a novel approach, +Supervised Contrastive Vision Transformer (SupCon-ViT), for improving the +classification of invasive ductal carcinoma in terms of accuracy and +generalization by leveraging the inherent strengths and advantages of both +transfer learning, i.e., pre-trained vision transformer, and supervised +contrastive learning. Our results on a benchmark breast cancer dataset +demonstrate that SupCon-Vit achieves state-of-the-art performance in IDC +classification, with an F1-score of 0.8188, precision of 0.7692, and +specificity of 0.8971, outperforming existing methods. In addition, the +proposed model demonstrates resilience in scenarios with minimal labeled data, +making it highly efficient in real-world clinical settings where labelled data +is limited. Our findings suggest that supervised contrastive learning in +conjunction with pre-trained vision transformers appears to be a viable +strategy for an accurate classification of IDC, thus paving the way for a more +efficient and reliable diagnosis of breast cancer through histopathological +image analysis. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ LAPTOP-Diff: Layer Pruning and Normalized Distillation for Compressing + Diffusion Models + + +
+ In the era of AIGC, the demand for low-budget or even on-device applications of diffusion models has emerged. In terms of compressing the Stable Diffusion models (SDMs), several approaches have been proposed, and most of them leverage handcrafted layer removal to obtain smaller U-Nets, along with knowledge distillation to recover the network performance. However, such handcrafted layer removal is inefficient and lacks scalability and generalization, and the feature distillation employed in the retraining phase faces an imbalance issue in which a few numerically large feature loss terms dominate the others throughout retraining. To this end, we propose layer pruning and normalized distillation for compressing diffusion models (LAPTOP-Diff). We 1) introduce a layer pruning method to compress the SDM's U-Net automatically and propose an effective one-shot pruning criterion whose one-shot performance is guaranteed by its good additivity property, surpassing other layer pruning and handcrafted layer removal methods, and 2) propose normalized feature distillation for retraining, which alleviates the imbalance issue. Using the proposed LAPTOP-Diff, we compressed the U-Nets of SDXL and SDM-v1.5 for the most advanced performance, achieving a minimal 4.0% decline in PickScore at a pruning ratio of 50%, whereas the best comparative method declines by 8.2%. We will release our code.
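+ A minimal sketch of what "normalized feature distillation" could look like: each per-layer feature loss is divided by a detached estimate of its own scale before the terms are summed, so numerically large layers cannot dominate retraining. The specific normalizer is an assumption; the paper's exact scheme may differ.

```python
def normalized_feature_distillation(student_feats, teacher_feats, eps=1e-6):
    """Sum of per-layer MSE terms, each normalized by the (detached)
    magnitude of the corresponding teacher feature map."""
    total = 0.0
    for fs, ft in zip(student_feats, teacher_feats):
        term = (fs - ft).pow(2).mean()
        scale = ft.pow(2).mean().detach() + eps   # per-layer magnitude
        total = total + term / scale
    return total / len(student_feats)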
+
+
+
+
+ + ♻ ☆ Methods and strategies for improving the novel view synthesis quality of + neural radiation field + + +
+ Neural Radiance Field (NeRF) technology can learn a 3D implicit model of a scene from 2D images and synthesize realistic novel view images. This technology has received widespread attention from the industry and has good application prospects. In response to the need to improve the rendering quality of NeRF images, many researchers have proposed various quality-improvement methods over the past three years. The latest relevant papers are classified and reviewed, the technical principles behind quality improvement are analyzed, and the future evolution of quality-improvement methods is discussed. This study can help researchers quickly understand the current state and evolutionary context of the technology in this field, which is helpful in inspiring the development of more efficient algorithms and promoting the application of NeRF technology in related fields.
+
+
+
+
+ + ♻ ☆ TaCOS: Task-Specific Camera Optimization with Simulation + + +
+ The performance of robots in their applications heavily depends on the +quality of sensory input. However, designing sensor payloads and their +parameters for specific robotic tasks is an expensive process that requires +well-established sensor knowledge and extensive experiments with physical +hardware. With cameras playing a pivotal role in robotic perception, we +introduce a novel end-to-end optimization approach for co-designing a camera +with specific robotic tasks by combining derivative-free and gradient-based +optimizers. The proposed method leverages recent computer graphics techniques +and physical camera characteristics to prototype the camera in software, +simulate operational environments and tasks for robots, and optimize the camera +design based on the desired tasks in a cost-effective way. We validate the +accuracy of our camera simulation by comparing it with physical cameras, and +demonstrate the design of cameras with stronger performance than common +off-the-shelf alternatives. Our approach supports the optimization of both +continuous and discrete camera parameters, manufacturing constraints, and can +be generalized to a broad range of camera design scenarios including multiple +cameras and unconventional cameras. This work advances the fully automated +design of cameras for specific robotics tasks. + +
+
+
+
+
+ + ♻ ☆ CogME: A Cognition-Inspired Multi-Dimensional Evaluation Metric for + Story Understanding + + +
+ We introduce CogME, a cognition-inspired, multi-dimensional evaluation metric +designed for AI models focusing on story understanding. CogME is a framework +grounded in human thinking strategies and story elements that involve story +understanding. With a specific breakdown of the questions, this approach +provides a nuanced assessment revealing not only AI models' particular +strengths and weaknesses but also the characteristics of the benchmark dataset. +Our case study with the DramaQA dataset demonstrates a refined analysis of the +model and the benchmark dataset. We argue the need for metrics based on +understanding the nature of tasks and designed to align closely with human +cognitive processes. This approach provides insights beyond traditional overall +scores and paves the way for more sophisticated AI development targeting higher +cognitive functions. + +
+
+ comment: 9 pages with 4 figures and 3 tables. This work has been accepted for + presentation at CogSci 2024 and is currently under revision +
+
+
+
+
+ + ♻ ☆ Self-supervised Learning of Rotation-invariant 3D Point Set Features + using Transformer and its Self-distillation + + +
+ Invariance against rotations of 3D objects is an important property in +analyzing 3D point set data. Conventional 3D point set DNNs having rotation +invariance typically obtain accurate 3D shape features via supervised learning +by using labeled 3D point sets as training samples. However, due to the rapid +increase in 3D point set data and the high cost of labeling, a framework to +learn rotation-invariant 3D shape features from numerous unlabeled 3D point +sets is required. This paper proposes a novel self-supervised learning +framework for acquiring accurate and rotation-invariant 3D point set features +at object-level. Our proposed lightweight DNN architecture decomposes an input +3D point set into multiple global-scale regions, called tokens, that preserve +the spatial layout of partial shapes composing the 3D object. We employ a +self-attention mechanism to refine the tokens and aggregate them into an +expressive rotation-invariant feature per 3D point set. Our DNN is effectively +trained by using pseudo-labels generated by a self-distillation framework. To +facilitate the learning of accurate features, we propose to combine multi-crop +and cut-mix data augmentation techniques to diversify 3D point sets for +training. Through a comprehensive evaluation, we empirically demonstrate that, +(1) existing rotation-invariant DNN architectures designed for supervised +learning do not necessarily learn accurate 3D shape features under a +self-supervised learning scenario, and (2) our proposed algorithm learns +rotation-invariant 3D point set features that are more accurate than those +learned by existing algorithms. Code is available at +https://github.com/takahikof/RIPT_SDMM + +
+
+ comment: Accepted to the CVIU journal +
+
+
+
+
+ + ♻ ☆ HR-APR: APR-agnostic Framework with Uncertainty Estimation and + Hierarchical Refinement for Camera Relocalisation ICRA + + +
+ Absolute Pose Regressors (APRs) directly estimate camera poses from monocular images, but their accuracy is unstable across different queries. Uncertainty-aware APRs provide uncertainty information on the estimated pose, alleviating the impact of these unreliable predictions. However, existing uncertainty modelling techniques are often coupled with a specific APR architecture, resulting in suboptimal performance compared to state-of-the-art (SOTA) APR methods. This work introduces a novel APR-agnostic framework, HR-APR, that formulates uncertainty estimation as cosine similarity estimation between the query and database features. It neither relies on nor alters the APR network architecture, and is flexible and computationally efficient. In addition, we exploit the uncertainty estimates for pose refinement to enhance APR performance. Extensive experiments demonstrate the effectiveness of our framework, reducing computational overhead by 27.4% and 15.2% on the 7Scenes and Cambridge Landmarks datasets respectively, while maintaining SOTA accuracy among single-image APRs.
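+ A minimal sketch of the APR-agnostic uncertainty idea, assuming a feature extractor shared with the retrieval database: low cosine similarity between the query feature and its nearest database features is read as high uncertainty. The top-k averaging is an illustrative choice, not the paper's exact estimator.

```python
import torch
import torch.nn.functional as F

def retrieval_uncertainty(query_feat, db_feats, k=5):
    """Uncertainty proxy for a pose estimate: 1 minus the mean cosine
    similarity between the query feature (D,) and its k most similar
    database features (N, D)."""
    sims = F.cosine_similarity(query_feat[None, :], db_feats, dim=1)  # (N,)
    topk = sims.topk(min(k, sims.numel())).values
    return 1.0 - topk.mean()        # higher value = more uncertain
```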
+
+ comment: Accepted in in 2024 IEEE International Conference on Robotics and + Automation (ICRA). Code: https://github.com/lck666666/HR-APR +
+
+
+
+
+ + ♻ ☆ Non-negative Contrastive Learning ICLR 2024 + + +
+ Deep representations have shown promising performance when transferred to +downstream tasks in a black-box manner. Yet, their inherent lack of +interpretability remains a significant challenge, as these features are often +opaque to human understanding. In this paper, we propose Non-negative +Contrastive Learning (NCL), a renaissance of Non-negative Matrix Factorization +(NMF) aimed at deriving interpretable features. The power of NCL lies in its +enforcement of non-negativity constraints on features, reminiscent of NMF's +capability to extract features that align closely with sample clusters. NCL not +only aligns mathematically well with an NMF objective but also preserves NMF's +interpretability attributes, resulting in a more sparse and disentangled +representation compared to standard contrastive learning (CL). Theoretically, +we establish guarantees on the identifiability and downstream generalization of +NCL. Empirically, we show that these advantages enable NCL to outperform CL +significantly on feature disentanglement, feature selection, as well as +downstream classification tasks. At last, we show that NCL can be easily +extended to other learning scenarios and benefit supervised learning as well. +Code is available at https://github.com/PKU-ML/non_neg. + +
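+ A minimal sketch of the non-negativity idea on top of a standard InfoNCE objective: features are passed through a ReLU before normalization and contrast. This only illustrates the constraint; the paper's precise NCL objective and theory are richer than this.

```python
import torch
import torch.nn.functional as F

def info_nce(z1, z2, temperature=0.5, non_negative=True):
    """InfoNCE over two augmented views, with an optional non-negativity
    constraint on the features in the spirit of NCL."""
    if non_negative:
        z1, z2 = F.relu(z1), F.relu(z2)       # enforce non-negative features
    z1, z2 = F.normalize(z1, dim=1), F.normalize(z2, dim=1)
    logits = z1 @ z2.t() / temperature        # (B, B) similarity matrix
    labels = torch.arange(z1.shape[0], device=z1.device)
    return F.cross_entropy(logits, labels)
```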
+
+ comment: 22 pages. Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ PolyOculus: Simultaneous Multi-view Image-based Novel View Synthesis + + +
+ This paper considers the problem of generative novel view synthesis (GNVS), +generating novel, plausible views of a scene given a limited number of known +views. Here, we propose a set-based generative model that can simultaneously +generate multiple, self-consistent new views, conditioned on any number of +views. Our approach is not limited to generating a single image at a time and +can condition on a variable number of views. As a result, when generating a +large number of views, our method is not restricted to a low-order +autoregressive generation approach and is better able to maintain generated +image quality over large sets of images. We evaluate our model on standard NVS +datasets and show that it outperforms the state-of-the-art image-based GNVS +baselines. Further, we show that the model is capable of generating sets of +views that have no natural sequential ordering, like loops and binocular +trajectories, and significantly outperforms other methods on such tasks. + +
+
+
+
+
+ + ♻ ☆ WHAM: Reconstructing World-grounded Humans with Accurate 3D Motion + + +
+ The estimation of 3D human motion from video has progressed rapidly but +current methods still have several key limitations. First, most methods +estimate the human in camera coordinates. Second, prior work on estimating +humans in global coordinates often assumes a flat ground plane and produces +foot sliding. Third, the most accurate methods rely on computationally +expensive optimization pipelines, limiting their use to offline applications. +Finally, existing video-based methods are surprisingly less accurate than +single-frame methods. We address these limitations with WHAM (World-grounded +Humans with Accurate Motion), which accurately and efficiently reconstructs 3D +human motion in a global coordinate system from video. WHAM learns to lift 2D +keypoint sequences to 3D using motion capture data and fuses this with video +features, integrating motion context and visual information. WHAM exploits +camera angular velocity estimated from a SLAM method together with human motion +to estimate the body's global trajectory. We combine this with a contact-aware +trajectory refinement method that lets WHAM capture human motion in diverse +conditions, such as climbing stairs. WHAM outperforms all existing 3D human +motion recovery methods across multiple in-the-wild benchmarks. Code will be +available for research purposes at http://wham.is.tue.mpg.de/ + +
+
+
+
+
+ + ♻ ☆ Street TryOn: Learning In-the-Wild Virtual Try-On from Unpaired Person + Images + + +
+ Most existing methods for virtual try-on focus on studio person images with a +limited range of poses and clean backgrounds. They can achieve plausible +results for this studio try-on setting by learning to warp a garment image to +fit a person's body from paired training data, i.e., garment images paired with +images of people wearing the same garment. Such data is often collected from +commercial websites, where each garment is demonstrated both by itself and on +several models. By contrast, it is hard to collect paired data for in-the-wild +scenes, and therefore, virtual try-on for casual images of people with more +diverse poses against cluttered backgrounds is rarely studied. + In this work, we fill the gap by introducing a StreetTryOn benchmark to +evaluate in-the-wild virtual try-on performance and proposing a novel method +that can learn it without paired data, from a set of in-the-wild person images +directly. Our method achieves robust performance across shop and street domains +using a novel DensePose warping correction method combined with diffusion-based +conditional inpainting. Our experiments show competitive performance for +standard studio try-on tasks and SOTA performance for street try-on and +cross-domain try-on tasks. + +
+
+
+
+
+ + ♻ ☆ NIR-Assisted Image Denoising: A Selective Fusion Approach and A + Real-World Benchmark Dataset + + +
+ Despite the significant progress in image denoising, it is still challenging to restore fine-scale details while removing noise, especially in extremely low-light environments. Leveraging near-infrared (NIR) images to assist visible RGB image denoising shows potential to address this issue and is becoming a promising technology. Nonetheless, existing works still struggle to exploit NIR information effectively for real-world image denoising, due to the content inconsistency between NIR and RGB images and the scarcity of real-world paired datasets. To alleviate the problem, we propose an efficient Selective Fusion Module (SFM), which can be plugged into advanced denoising networks to merge deep NIR and RGB features. Specifically, we sequentially perform global and local modulation of the NIR and RGB features, and then integrate the two modulated features. Furthermore, we present a Real-world NIR-Assisted Image Denoising (Real-NAID) dataset, which covers diverse scenarios as well as various noise levels. Extensive experiments on both synthetic and our real-world datasets demonstrate that the proposed method achieves better results than state-of-the-art ones. The dataset, codes, and pre-trained models will be publicly available at https://github.com/ronjonxu/NAID.
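+ A rough sketch of a selective fusion block in the spirit described above: sequential global (channel-wise) and local (spatial) modulation of the NIR features conditioned on both inputs, followed by a merge with the RGB features. Layer choices and channel counts are assumptions; the paper's SFM is more elaborate.

```python
import torch
import torch.nn as nn

class SelectiveFusion(nn.Module):
    """Fuse NIR and RGB feature maps via global then local gating."""
    def __init__(self, channels):
        super().__init__()
        self.global_gate = nn.Sequential(          # channel-wise gate from pooled context
            nn.AdaptiveAvgPool2d(1), nn.Conv2d(2 * channels, channels, 1), nn.Sigmoid())
        self.local_gate = nn.Sequential(            # spatial gate
            nn.Conv2d(2 * channels, channels, 3, padding=1), nn.Sigmoid())
        self.merge = nn.Conv2d(2 * channels, channels, 1)

    def forward(self, rgb_feat, nir_feat):
        x = torch.cat([rgb_feat, nir_feat], dim=1)
        nir_mod = nir_feat * self.global_gate(x)    # global (channel-wise) modulation
        nir_mod = nir_mod * self.local_gate(x)      # local (spatial) modulation
        return self.merge(torch.cat([rgb_feat, nir_mod], dim=1))
```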
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ Towards Realistic Scene Generation with LiDAR Diffusion Models CVPR 2024 + + +
+ Diffusion models (DMs) excel in photo-realistic image synthesis, but their +adaptation to LiDAR scene generation poses a substantial hurdle. This is +primarily because DMs operating in the point space struggle to preserve the +curve-like patterns and 3D geometry of LiDAR scenes, which consumes much of +their representation power. In this paper, we propose LiDAR Diffusion Models +(LiDMs) to generate LiDAR-realistic scenes from a latent space tailored to +capture the realism of LiDAR scenes by incorporating geometric priors into the +learning pipeline. Our method targets three major desiderata: pattern realism, +geometry realism, and object realism. Specifically, we introduce curve-wise +compression to simulate real-world LiDAR patterns, point-wise coordinate +supervision to learn scene geometry, and patch-wise encoding for a full 3D +object context. With these three core designs, our method achieves competitive +performance on unconditional LiDAR generation in 64-beam scenario and state of +the art on conditional LiDAR generation, while maintaining high efficiency +compared to point-based DMs (up to 107$\times$ faster). Furthermore, by +compressing LiDAR scenes into a latent space, we enable the controllability of +DMs with various conditions such as semantic maps, camera views, and text +prompts. + +
+
+ comment: CVPR 2024. Project link: https://lidar-diffusion.github.io +
+
+
+
+
+ + ♻ ☆ MM1: Methods, Analysis & Insights from Multimodal LLM Pre-training + + +
+ In this work, we discuss building performant Multimodal Large Language Models +(MLLMs). In particular, we study the importance of various architecture +components and data choices. Through careful and comprehensive ablations of the +image encoder, the vision language connector, and various pre-training data +choices, we identified several crucial design lessons. For example, we +demonstrate that for large-scale multimodal pre-training using a careful mix of +image-caption, interleaved image-text, and text-only data is crucial for +achieving state-of-the-art (SOTA) few-shot results across multiple benchmarks, +compared to other published pre-training results. Further, we show that the +image encoder together with image resolution and the image token count has +substantial impact, while the vision-language connector design is of +comparatively negligible importance. By scaling up the presented recipe, we +build MM1, a family of multimodal models up to 30B parameters, including both +dense models and mixture-of-experts (MoE) variants, that are SOTA in +pre-training metrics and achieve competitive performance after supervised +fine-tuning on a range of established multimodal benchmarks. Thanks to +large-scale pre-training, MM1 enjoys appealing properties such as enhanced +in-context learning, and multi-image reasoning, enabling few-shot +chain-of-thought prompting. + +
+
+
+
+
+ + ♻ ☆ Cross Domain Early Crop Mapping using CropSTGAN + + +
+ Driven by abundant satellite imagery, machine learning-based approaches have recently been promoted to generate high-resolution crop cultivation maps to support many agricultural applications. One of the major challenges faced by these approaches is the limited availability of ground truth labels. In the absence of ground truth, existing work usually adopts the "direct transfer strategy" that trains a classifier using historical labels collected from other regions and then applies the trained model to the target region. Unfortunately, the spectral features of crops exhibit inter-region and inter-annual variability due to changes in soil composition, climate conditions, and crop progress, so the resulting models perform poorly on new and unseen regions or years. Despite recent efforts to tackle these cross-domain challenges, such as applying the deep adaptation neural network (DANN) model structure in the deep adaptation crop classification network (DACCN), their effectiveness diminishes significantly when there is a large dissimilarity between the source and target regions. This paper introduces the Crop Mapping Spectral-temporal Generative Adversarial Neural Network (CropSTGAN), a novel solution for cross-domain challenges that does not require target-domain labels. CropSTGAN learns to transform the target domain's spectral features to those of the source domain, effectively bridging large dissimilarities. Additionally, it employs an identity loss to maintain the intrinsic local structure of the data. Comprehensive experiments across various regions and years demonstrate the benefits and effectiveness of the proposed approach. In experiments, CropSTGAN is benchmarked against various state-of-the-art (SOTA) methods. Notably, CropSTGAN significantly outperforms these methods in scenarios with large data distribution dissimilarities between the target and source domains.
+
+
+
+
+ + ♻ ☆ Routers in Vision Mixture of Experts: An Empirical Study + + +
+ Mixture-of-Experts (MoE) models are a promising way to scale up model +capacity without significantly increasing computational cost. A key component +of MoEs is the router, which decides which subset of parameters (experts) +process which feature embeddings (tokens). In this paper, we present a +comprehensive study of routers in MoEs for computer vision tasks. We introduce +a unified MoE formulation that subsumes different MoEs with two parametric +routing tensors. This formulation covers both sparse MoE, which uses a binary +or hard assignment between experts and tokens, and soft MoE, which uses a soft +assignment between experts and weighted combinations of tokens. Routers for +sparse MoEs can be further grouped into two variants: Token Choice, which +matches experts to each token, and Expert Choice, which matches tokens to each +expert. We conduct head-to-head experiments with 6 different routers, including +existing routers from prior work and new ones we introduce. We show that (i) +many routers originally developed for language modeling can be adapted to +perform strongly in vision tasks, (ii) in sparse MoE, Expert Choice routers +generally outperform Token Choice routers, and (iii) soft MoEs generally +outperform sparse MoEs with a fixed compute budget. These results provide new +insights regarding the crucial role of routers in vision MoE models. + +
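+ As a concrete reference point for the routing terminology, here is a minimal Token Choice router: each token scores all experts with a learned linear map and keeps its top-k. Load balancing, capacity limits, and the Expert Choice and soft-MoE variants studied in the paper are omitted.

```python
import torch
import torch.nn.functional as F

def token_choice_route(tokens, router_weights, k=1):
    """Token Choice routing: each token picks its top-k experts.
    tokens: (N, D), router_weights: (D, E); returns gates and expert ids."""
    scores = tokens @ router_weights              # (N, E) routing logits
    probs = F.softmax(scores, dim=-1)
    gate, expert_idx = probs.topk(k, dim=-1)      # (N, k) gate values and expert indices
    return gate, expert_idx                       # tokens are then dispatched accordingly
```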
+
+
+
+
+ + ♻ ☆ D4C Glove-train: Solving the RPM and Bongard-logo Problem by + Circumscribing and Building Distribution for Concepts + + +
+ This paper achieves noteworthy progress in the realm of abstract reasoning, +particularly in addressing Raven's Progressive Matrices (RPM) and Bongard-Logo +challenges. Initially, we introduce Lico-Net, a novel baseline model that +resolves RPM problems with remarkable accuracy. Leveraging this foundation, we +advance with the D3C approach, which advocates representing the underlying +concepts in abstract reasoning problems through distributions. This perspective +enhances the performance of both Lico-Net and a baseline model excelling in +Bongard-Logo tasks. To bolster the computational efficiency of D3C, we present +the D3C-cos variant, offering a streamlined yet precise solution. Furthermore, +we propose the D2C method, redefining conceptual boundaries within these +domains and bridging the divide between high-level abstractions and their +lower-dimensional counterparts. Finally, we extend our methodology to D4C, +employing adversarial techniques to refine conceptual boundaries further and +demonstrate substantial improvements in both RPM and Bongard-Logo challenges. +Overall, our contributions present a fresh outlook and practical advancements +in the field of abstract reasoning. + +
+
+ comment: 18 pages, 19 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ You Only Need One Color Space: An Efficient Network for Low-light Image + Enhancement + + +
+ The Low-Light Image Enhancement (LLIE) task aims to restore the details and visual information of corrupted low-light images. Most existing methods learn the mapping function between low/normal-light images with Deep Neural Networks (DNNs) in the sRGB and HSV color spaces. Nevertheless, enhancement involves amplifying image signals, and applying these color spaces to low-light images with a low signal-to-noise ratio can introduce sensitivity and instability into the enhancement process, resulting in color and brightness artifacts in the enhanced images. To alleviate this problem, we propose a novel trainable color space, named Horizontal/Vertical-Intensity (HVI). It not only decouples brightness and color from the RGB channels to mitigate instability during enhancement but also adapts to low-light images in different illumination ranges thanks to its trainable parameters. Further, we design a novel Color and Intensity Decoupling Network (CIDNet) with two branches dedicated to processing the decoupled image brightness and color in the HVI space. Within CIDNet, we introduce the Lightweight Cross-Attention (LCA) module to facilitate interaction between image structure and content information in both branches, while also suppressing noise in low-light images. Finally, we conducted 22 quantitative and qualitative experiments showing that the proposed CIDNet outperforms the state-of-the-art methods on 11 datasets. The code is available at https://github.com/Fediory/HVI-CIDNet.
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 183 + +
+
+
+ + ☆ Factorized Diffusion: Perceptual Illusions by Noise Decomposition + + +
+ Given a factorization of an image into a sum of linear components, we present a zero-shot method to control each individual component through diffusion model sampling. For example, we can decompose an image into low and high spatial frequencies and condition these components on different text prompts. This produces hybrid images, which change appearance depending on viewing distance. By decomposing an image into three frequency subbands, we can generate hybrid images with three prompts. We also use a decomposition into grayscale and color components to produce images whose appearance changes when they are viewed in grayscale, a phenomenon that naturally occurs under dim lighting. And we explore a decomposition by a motion blur kernel, which produces images that change appearance under motion blurring. Our method works by denoising with a composite noise estimate, built from the components of noise estimates conditioned on different prompts. We also show that for certain decompositions, our method recovers prior approaches to compositional generation and spatial control. Finally, we show that we can extend our approach to generate hybrid images from real images. We do this by holding one component fixed and generating the remaining components, effectively solving an inverse problem.
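+ A minimal sketch of the composite noise estimate for the low/high-frequency factorization, assuming a normalized Gaussian blur kernel as the low-pass filter: take the low-frequency band of the noise estimate conditioned on one prompt and the high-frequency band of the estimate conditioned on the other. Kernel shape and normalization are assumptions for illustration.

```python
import torch
import torch.nn.functional as F

def composite_noise(eps_low_prompt, eps_high_prompt, blur_kernel):
    """Combine two prompt-conditioned noise estimates (B, C, H, W):
    low frequencies from one, high frequencies from the other.
    blur_kernel: (1, 1, ks, ks), normalized to sum to 1."""
    pad = blur_kernel.shape[-1] // 2

    def low_pass(x):
        k = blur_kernel.expand(x.shape[1], 1, -1, -1)       # depthwise blur
        return F.conv2d(x, k, padding=pad, groups=x.shape[1])

    low = low_pass(eps_low_prompt)
    high = eps_high_prompt - low_pass(eps_high_prompt)
    return low + high     # used in place of a single noise estimate when denoising
```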
+
+
+
+
+ + ☆ Dynamic Typography: Bringing Words to Life + + +
+ Text animation serves as an expressive medium, transforming static +communication into dynamic experiences by infusing words with motion to evoke +emotions, emphasize meanings, and construct compelling narratives. Crafting +animations that are semantically aware poses significant challenges, demanding +expertise in graphic design and animation. We present an automated text +animation scheme, termed "Dynamic Typography", which combines two challenging +tasks. It deforms letters to convey semantic meaning and infuses them with +vibrant movements based on user prompts. Our technique harnesses vector +graphics representations and an end-to-end optimization-based framework. This +framework employs neural displacement fields to convert letters into base +shapes and applies per-frame motion, encouraging coherence with the intended +textual concept. Shape preservation techniques and perceptual loss +regularization are employed to maintain legibility and structural integrity +throughout the animation process. We demonstrate the generalizability of our +approach across various text-to-video models and highlight the superiority of +our end-to-end methodology over baseline methods, which might comprise separate +tasks. Through quantitative and qualitative evaluations, we demonstrate the +effectiveness of our framework in generating coherent text animations that +faithfully interpret user prompts while maintaining readability. Our code is +available at: https://animate-your-word.github.io/demo/. + +
+
+ comment: Our demo page is available at: + https://animate-your-word.github.io/demo/ +
+
+
+
+
+ + ☆ InFusion: Inpainting 3D Gaussians via Learning Depth Completion from + Diffusion Prior + + +
+ 3D Gaussians have recently emerged as an efficient representation for novel view synthesis. This work studies their editability with a particular focus on the inpainting task, which aims to supplement an incomplete set of 3D Gaussians with additional points for visually harmonious rendering. Compared to 2D inpainting, the crux of inpainting 3D Gaussians is to figure out the rendering-relevant properties of the introduced points, whose optimization largely benefits from their initial 3D positions. To this end, we propose to guide the point initialization with an image-conditioned depth completion model, which learns to directly restore the depth map based on the observed image. Such a design allows our model to fill in depth values at a scale aligned with the original depth, and also to harness the strong generalizability of a large-scale diffusion prior. Thanks to the more accurate depth completion, our approach, dubbed InFusion, surpasses existing alternatives with notably better fidelity and efficiency under various complex scenarios. We further demonstrate the effectiveness of InFusion with several practical applications, such as inpainting with user-specific texture or with novel object insertion.
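+ A minimal sketch of the step that makes a completed depth map usable for initialization: unprojecting the predicted depth into world-space 3D points with the camera intrinsics and pose, which can then seed the new Gaussians. Standard pinhole geometry only; the paper's full pipeline is not reproduced here.

```python
import torch

def unproject_depth(depth, K, c2w):
    """Turn a depth map into world-space points.
    depth: (H, W), K: (3, 3) intrinsics, c2w: (4, 4) camera-to-world pose."""
    H, W = depth.shape
    v, u = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij")
    pix = torch.stack([u.float() + 0.5, v.float() + 0.5, torch.ones_like(depth)], -1)
    rays_cam = pix.reshape(-1, 3) @ torch.linalg.inv(K).T      # camera-frame rays
    pts_cam = rays_cam * depth.reshape(-1, 1)                  # scale by depth
    pts_world = pts_cam @ c2w[:3, :3].T + c2w[:3, 3]
    return pts_world                                           # (H*W, 3)
```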
+
+ comment: Project page: https://johanan528.github.io/Infusion +
+
+
+
+
+ + ☆ VG4D: Vision-Language Model Goes 4D Video Recognition ICRA 2024 + + +
+ Understanding the real world through point cloud video is a crucial aspect of robotics and autonomous driving systems. However, prevailing methods for 4D point cloud recognition have limitations due to sensor resolution, which leads to a lack of detailed information. Recent advances have shown that Vision-Language Models (VLM) pre-trained on web-scale text-image datasets can learn fine-grained visual concepts that can be transferred to various downstream tasks. However, effectively integrating VLM into the domain of 4D point clouds remains an unresolved problem. In this work, we propose the Vision-Language Models Goes 4D (VG4D) framework to transfer VLM knowledge from visual-text pre-trained models to a 4D point cloud network. Our approach involves aligning the 4D encoder's representation with a VLM to learn a shared visual and text space from training on large-scale image-text pairs. By transferring the knowledge of the VLM to the 4D encoder and combining the VLM, our VG4D achieves improved recognition performance. To enhance the 4D encoder, we modernize the classic dynamic point cloud backbone and propose an improved version of PSTNet, im-PSTNet, which can efficiently model point cloud videos. Experiments demonstrate that our method achieves state-of-the-art performance for action recognition on both the NTU RGB+D 60 dataset and the NTU RGB+D 120 dataset. Code is available at https://github.com/Shark0-0/VG4D.
+
+ comment: ICRA 2024 +
+
+
+
+
+ + ☆ Variational Bayesian Last Layers ICLR + + +
+ We introduce a deterministic variational formulation for training Bayesian +last layer neural networks. This yields a sampling-free, single-pass model and +loss that effectively improves uncertainty estimation. Our variational Bayesian +last layer (VBLL) can be trained and evaluated with only quadratic complexity +in last layer width, and is thus (nearly) computationally free to add to +standard architectures. We experimentally investigate VBLLs, and show that they +improve predictive accuracy, calibration, and out of distribution detection +over baselines across both regression and classification. Finally, we +investigate combining VBLL layers with variational Bayesian feature learning, +yielding a lower variance collapsed variational inference method for Bayesian +neural networks. + +
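+ To make the "sampling-free last layer" idea concrete, here is a generic closed-form predictive for a Gaussian posterior over last-layer weights in regression; it is not the exact VBLL parameterization or training objective, just the flavor of what a deterministic Bayesian last layer provides.

```python
import torch

def bayesian_last_layer_predict(features, w_mean, w_cov, noise_var):
    """Closed-form predictive for a Gaussian last-layer posterior.
    features: (N, D), w_mean: (D,), w_cov: (D, D), noise_var: scalar."""
    pred_mean = features @ w_mean                                  # (N,)
    # phi^T Sigma phi + sigma^2, computed row-wise without sampling
    pred_var = (features @ w_cov * features).sum(-1) + noise_var   # (N,)
    return pred_mean, pred_var
```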
+
+ comment: International Conference on Learning Representations (ICLR) 2024 +
+
+
+
+
+ + ☆ IntrinsicAnything: Learning Diffusion Priors for Inverse Rendering Under + Unknown Illumination + + +
+ This paper aims to recover object materials from posed images captured under +an unknown static lighting condition. Recent methods solve this task by +optimizing material parameters through differentiable physically based +rendering. However, due to the coupling between object geometry, materials, and +environment lighting, there is inherent ambiguity during the inverse rendering +process, preventing previous methods from obtaining accurate results. To +overcome this ill-posed problem, our key idea is to learn the material prior +with a generative model for regularizing the optimization process. We observe +that the general rendering equation can be split into diffuse and specular +shading terms, and thus formulate the material prior as diffusion models of +albedo and specular. Thanks to this design, our model can be trained using the +existing abundant 3D object data, and naturally acts as a versatile tool to +resolve the ambiguity when recovering material representations from RGB images. +In addition, we develop a coarse-to-fine training strategy that leverages +estimated materials to guide diffusion models to satisfy multi-view consistent +constraints, leading to more stable and accurate results. Extensive experiments +on real-world and synthetic datasets demonstrate that our approach achieves +state-of-the-art performance on material recovery. The code will be available +at https://zju3dv.github.io/IntrinsicAnything. + +
+
+ comment: Project page: https://zju3dv.github.io/IntrinsicAnything +
+
+
+
+
+ + ☆ A Subspace-Constrained Tyler's Estimator and its Applications to + Structure from Motion CVPR 24 + + +
+ We present the subspace-constrained Tyler's estimator (STE), designed for recovering a low-dimensional subspace within a dataset that may be highly corrupted with outliers. STE is a fusion of Tyler's M-estimator (TME) and a variant of the fast median subspace. Our theoretical analysis suggests that, under a common inlier-outlier model, STE can effectively recover the underlying subspace even when the fraction of inliers is smaller than what other methods in the field of robust subspace recovery require. We apply STE in the context of Structure from Motion (SfM) in two ways: for robust estimation of the fundamental matrix and for the removal of outlying cameras, enhancing the robustness of the SfM pipeline. Numerical experiments confirm the state-of-the-art performance of our method in these applications. This research makes significant contributions to the field of robust subspace recovery, particularly in the context of computer vision and 3D reconstruction.
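+ For reference, the classic fixed-point iteration for Tyler's M-estimator of scatter, the component that STE constrains to a subspace, looks as follows; the subspace constraint and the fast-median-subspace ingredient of STE are omitted.

```python
import numpy as np

def tyler_m_estimator(X, iters=50, eps=1e-8):
    """Fixed-point iteration for Tyler's M-estimator. X: (n, d) centered data."""
    n, d = X.shape
    sigma = np.eye(d)
    for _ in range(iters):
        inv = np.linalg.inv(sigma)
        # per-sample weight 1 / (x^T Sigma^{-1} x)
        w = 1.0 / np.maximum(np.einsum("ni,ij,nj->n", X, inv, X), eps)
        sigma = (d / n) * (X * w[:, None]).T @ X
        sigma = sigma / np.trace(sigma) * d          # fix the scale
    return sigma
```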
+
+ comment: 23 pages, accepted by CVPR 24 +
+
+
+
+
+ + ☆ Prompt Optimizer of Text-to-Image Diffusion Models for Abstract Concept + Understanding WWW 2024 + + +
+ The rapid evolution of text-to-image diffusion models has opened the door of generative AI, enabling the translation of textual descriptions into visually compelling images with remarkable quality. However, a persistent challenge within this domain is the optimization of prompts to effectively convey abstract concepts into concrete objects. For example, text encoders can hardly express "peace", while they can easily illustrate olive branches and white doves. This paper introduces a novel approach named Prompt Optimizer for Abstract Concepts (POAC), specifically designed to enhance the performance of text-to-image diffusion models in interpreting and generating images from abstract concepts. We propose a Prompt Language Model (PLM), which is initialized from a pre-trained language model and then fine-tuned with a curated dataset of abstract concept prompts. The dataset is created with GPT-4 by extending each abstract concept into a scene with concrete objects. Our framework employs a Reinforcement Learning (RL)-based optimization strategy, focusing on the alignment between the images generated by a Stable Diffusion model and the optimized prompts. Through extensive experiments, we demonstrate that our proposed POAC significantly improves the accuracy and aesthetic quality of generated images, particularly in the description of abstract concepts and alignment with optimized prompts. We also present a comprehensive analysis of our model's performance across diffusion models under different settings, showcasing its versatility and effectiveness in enhancing abstract concept representation.
+
+ comment: WWW 2024 Companion +
+
+
+
+
+ + ☆ State-space Decomposition Model for Video Prediction Considering + Long-term Motion Trend + + +
+ Stochastic video prediction enables the consideration of uncertainty in +future motion, thereby providing a better reflection of the dynamic nature of +the environment. Stochastic video prediction methods based on image +auto-regressive recurrent models need to feed their predictions back into the +latent space. In contrast, state-space models, which decouple frame +synthesis and temporal prediction, prove to be more efficient. However, +inferring long-term temporal information about motion and generalizing to +dynamic scenarios under non-stationary assumptions remain unresolved +challenges. In this paper, we propose a state-space decomposition stochastic +video prediction model that decomposes the overall video frame generation into +deterministic appearance prediction and stochastic motion prediction. Through +adaptive decomposition, the model's generalization capability to dynamic +scenarios is enhanced. In the context of motion prediction, obtaining a prior +on the long-term trend of future motion is crucial. Thus, in the stochastic +motion prediction branch, we infer the long-term motion trend from conditional +frames to guide the generation of future frames that exhibit high consistency +with the conditional frames. Experimental results demonstrate that our model +outperforms baselines on multiple datasets. + 
+
+
+
+
+ + ☆ Simple Image Signal Processing using Global Context Guidance + + +
+ In modern smartphone cameras, the Image Signal Processor (ISP) is the core +element that converts the RAW readings from the sensor into perceptually +pleasant RGB images for the end users. The ISP is typically proprietary and +handcrafted and consists of several blocks such as white balance, color +correction, and tone mapping. Deep learning-based ISPs aim to transform RAW +images into DSLR-like RGB images using deep neural networks. However, most +learned ISPs are trained using patches (small regions) due to computational +limitations. Such methods lack global context, which limits their efficacy on +full-resolution images and harms their ability to capture global properties +such as color constancy or illumination. First, we propose a novel module that +can be integrated into any neural ISP to capture the global context information +from the full RAW images. Second, we propose an efficient and simple neural ISP +that utilizes our proposed module. Our model achieves state-of-the-art results +on different benchmarks using diverse and real smartphone images. + +
+
+ comment: Preprint under review +
+
+
+
+
+ + ☆ MoA: Mixture-of-Attention for Subject-Context Disentanglement in + Personalized Image Generation + + +
+ We introduce a new architecture for personalization of text-to-image +diffusion models, coined Mixture-of-Attention (MoA). Inspired by the +Mixture-of-Experts mechanism utilized in large language models (LLMs), MoA +distributes the generation workload between two attention pathways: a +personalized branch and a non-personalized prior branch. MoA is designed to +retain the original model's prior by fixing its attention layers in the prior +branch, while minimally intervening in the generation process with the +personalized branch that learns to embed subjects in the layout and context +generated by the prior branch. A novel routing mechanism manages the +distribution of pixels in each layer across these branches to optimize the +blend of personalized and generic content creation. Once trained, MoA +facilitates the creation of high-quality, personalized images featuring +multiple subjects with compositions and interactions as diverse as those +generated by the original model. Crucially, MoA enhances the distinction +between the model's pre-existing capability and the newly augmented +personalized intervention, thereby offering a more disentangled subject-context +control that was previously unattainable. Project page: +https://snap-research.github.io/mixture-of-attention + +
+
+ comment: Project Website: https://snap-research.github.io/mixture-of-attention +
+
+
+
+
+ + ☆ Predicting Long-horizon Futures by Conditioning on Geometry and Time + + +
+ Our work explores the task of generating future sensor observations +conditioned on the past. We are motivated by `predictive coding' concepts from +neuroscience as well as robotic applications such as self-driving vehicles. +Predictive video modeling is challenging because the future may be multi-modal +and learning at scale remains computationally expensive for video processing. +To address both challenges, our key insight is to leverage the large-scale +pretraining of image diffusion models which can handle multi-modality. We +repurpose image models for video prediction by conditioning on new frame +timestamps. Such models can be trained with videos of both static and dynamic +scenes. To allow them to be trained with modestly-sized datasets, we introduce +invariances by factoring out illumination and texture by forcing the model to +predict (pseudo) depth, readily obtained for in-the-wild videos via +off-the-shelf monocular depth networks. In fact, we show that simply modifying +networks to predict grayscale pixels already improves the accuracy of video +prediction. Given the extra controllability with timestamp conditioning, we +propose sampling schedules that work better than the traditional autoregressive +and hierarchical sampling strategies. Motivated by probabilistic metrics from +the object forecasting literature, we create a benchmark for video prediction +on a diverse set of videos spanning indoor and outdoor scenes and a large +vocabulary of objects. Our experiments illustrate the effectiveness of learning +to condition on timestamps, and show the importance of predicting the future +with invariant modalities. + +
+
+ comment: Project page: http://www.cs.cmu.edu/~tkhurana/depthforecasting/ +
+
+
+
+
+ + ☆ SSDiff: Spatial-spectral Integrated Diffusion Model for Remote Sensing + Pansharpening + + +
+ Pansharpening is a significant image fusion technique that merges the spatial +content and spectral characteristics of remote sensing images to generate +high-resolution multispectral images. Recently, denoising diffusion +probabilistic models have been gradually applied to visual tasks, enhancing +controllable image generation through low-rank adaptation (LoRA). In this +paper, we introduce a spatial-spectral integrated diffusion model for the +remote sensing pansharpening task, called SSDiff, which considers the +pansharpening process as the fusion process of spatial and spectral components +from the perspective of subspace decomposition. Specifically, SSDiff utilizes +spatial and spectral branches to learn spatial details and spectral features +separately, then employs a designed alternating projection fusion module (APFM) +to accomplish the fusion. Furthermore, we propose a frequency modulation +inter-branch module (FMIM) to modulate the frequency distribution between +branches. The two components of SSDiff can perform favorably against the APFM +when utilizing a LoRA-like branch-wise alternative fine-tuning method. It +refines SSDiff to capture component-discriminating features more sufficiently. +Finally, extensive experiments on four commonly used datasets, i.e., +WorldView-3, WorldView-2, GaoFen-2, and QuickBird, demonstrate the superiority +of SSDiff both visually and quantitatively. The code will be made open source +after possible acceptance. + +
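+ As a concrete illustration of the LoRA-like branch-wise fine-tuning mentioned above, the sketch below shows a generic low-rank adapter on a linear layer; it is not the SSDiff implementation, and the class and attribute names are hypothetical.
+
+ ```python
+ # Generic LoRA-style adapter: the frozen base weight is augmented with a
+ # trainable low-rank update, which is the only part updated during fine-tuning.
+ import torch
+ import torch.nn as nn
+
+ class LoRALinear(nn.Module):
+     def __init__(self, in_features, out_features, rank=4, alpha=1.0):
+         super().__init__()
+         self.base = nn.Linear(in_features, out_features)
+         self.base.weight.requires_grad_(False)   # frozen pre-trained weight
+         self.base.bias.requires_grad_(False)
+         self.lora_a = nn.Parameter(torch.randn(rank, in_features) * 0.01)
+         self.lora_b = nn.Parameter(torch.zeros(out_features, rank))
+         self.scale = alpha / rank
+
+     def forward(self, x):
+         return self.base(x) + self.scale * (x @ self.lora_a.T @ self.lora_b.T)
+
+ layer = LoRALinear(64, 64)
+ trainable = [n for n, p in layer.named_parameters() if p.requires_grad]
+ print(trainable)   # only ['lora_a', 'lora_b'] receive gradients
+ ```
+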
+
+
+
+
+ + ☆ JointViT: Modeling Oxygen Saturation Levels with Joint Supervision on + Long-Tailed OCTA + + +
+ The oxygen saturation level in the blood (SaO2) is crucial for health, +particularly in relation to sleep-related breathing disorders. However, +continuous monitoring of SaO2 is time-consuming and highly variable depending +on patients' conditions. Recently, optical coherence tomography angiography +(OCTA) has shown promising development in rapidly and effectively screening +eye-related lesions, offering the potential for diagnosing sleep-related +disorders. To bridge this gap, our paper presents three key contributions. +Firstly, we propose JointViT, a novel model based on the Vision Transformer +architecture, incorporating a joint loss function for supervision. Secondly, we +introduce a balancing augmentation technique during data preprocessing to +improve the model's performance, particularly on the long-tail distribution +within the OCTA dataset. Lastly, through comprehensive experiments on the OCTA +dataset, our proposed method significantly outperforms other state-of-the-art +methods, achieving improvements of up to 12.28% in overall accuracy. This +advancement lays the groundwork for the future utilization of OCTA in +diagnosing sleep-related disorders. See project website +https://steve-zeyu-zhang.github.io/JointViT + +
+
+
+
+
+ + ☆ Event Cameras Meet SPADs for High-Speed, Low-Bandwidth Imaging + + +
+ Traditional cameras face a trade-off between low-light performance and +high-speed imaging: longer exposure times to capture sufficient light result in +motion blur, whereas shorter exposures result in Poisson-corrupted noisy +images. While burst photography techniques help mitigate this tradeoff, +conventional cameras are fundamentally limited in their sensor noise +characteristics. Event cameras and single-photon avalanche diode (SPAD) sensors +have emerged as promising alternatives to conventional cameras due to their +desirable properties. SPADs are capable of single-photon sensitivity with +microsecond temporal resolution, and event cameras can measure brightness +changes up to 1 MHz with low bandwidth requirements. We show that these +properties are complementary, and can help achieve low-light, high-speed image +reconstruction with low bandwidth requirements. We introduce a sensor fusion +framework that combines SPADs with event cameras to improve the reconstruction of +high-speed, low-light scenes while reducing the high bandwidth cost associated +with using every SPAD frame. Our evaluation, on both synthetic and real sensor +data, demonstrates significant enhancements (>5 dB PSNR) in reconstructing +low-light scenes at high temporal resolution (100 kHz) compared to conventional +cameras. Event-SPAD fusion shows great promise for real-world applications, +such as robotics or medical imaging. + 
+
+
+
+
+ + ☆ arcjetCV: an open-source software to analyze material ablation + + +
+ arcjetCV is an open-source Python software designed to automate time-resolved +measurements of heatshield material recession and recession rates from arcjet +test video footage. This new automated and accessible capability greatly +exceeds previous manual extraction methods, enabling rapid and detailed +characterization of material recession for any sample with a profile video. +arcjetCV automates the video segmentation process using machine learning +models, including a one-dimensional (1D) Convolutional Neural Network (CNN) to +infer the time-window of interest, a two-dimensional (2D) CNN for image and +edge segmentation, and a Local Outlier Factor (LOF) for outlier filtering. A +graphical user interface (GUI) simplifies the user experience and an +application programming interface (API) allows users to call the core functions +from scripts, enabling video batch processing. arcjetCV's capability to measure +time-resolved recession in turn enables characterization of non-linear +processes (shrinkage, swelling, melt flows, etc.), contributing to higher +fidelity validation and improved modeling of heatshield material performance. +The source code associated with this article can be found at +https://github.com/magnus-haw/arcjetCV. + +
+
+
+
+
+ + ☆ Multi-resolution Rescored ByteTrack for Video Object Detection on + Ultra-low-power Embedded Systems + + +
+ This paper introduces Multi-Resolution Rescored Byte-Track (MR2-ByteTrack), a +novel video object detection framework for ultra-low-power embedded processors. +This method reduces the average compute load of an off-the-shelf Deep Neural +Network (DNN) based object detector by up to 2.25$\times$ by alternating the +processing of high-resolution images (320$\times$320 pixels) with multiple +down-sized frames (192$\times$192 pixels). To tackle the accuracy degradation +due to the reduced image input size, MR2-ByteTrack correlates the output +detections over time using the ByteTrack tracker and corrects potential +misclassification using a novel probabilistic Rescore algorithm. By +interleaving two down-sized images for every high-resolution one as the input +of different state-of-the-art DNN object detectors with our MR2-ByteTrack, we +demonstrate an average accuracy increase of 2.16% and a latency reduction of +43% on the GAP9 microcontroller compared to a baseline frame-by-frame inference +scheme using exclusively full-resolution images. Code available at: +https://github.com/Bomps4/Multi_Resolution_Rescored_ByteTrack + +
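+ A schematic sketch of the resolution-interleaving idea described above (the actual MR2-ByteTrack rescoring algorithm and GAP9 deployment are not reproduced); `detector` and `tracker` are hypothetical callables standing in for the DNN detector and the ByteTrack tracker.
+
+ ```python
+ # Alternate one full-resolution frame with down-sized frames before detection,
+ # and let a tracker carry detections across frames.
+ import cv2
+
+ HIGH_RES, LOW_RES = (320, 320), (192, 192)
+
+ def run_interleaved(frames, detector, tracker, period=3):
+     tracks_per_frame = []
+     for i, frame in enumerate(frames):
+         size = HIGH_RES if i % period == 0 else LOW_RES   # 1 high-res per 2 low-res
+         resized = cv2.resize(frame, size)
+         detections = detector(resized)          # [(x1, y1, x2, y2, score, cls), ...]
+         tracks_per_frame.append(tracker.update(detections))
+     return tracks_per_frame
+ ```
+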
+
+ comment: 9 pages, 3 figures Accepted for publication at the Embedded Vision + Workshop of the Computer Vision and Pattern Recognition conference, Seattle, + 2024 +
+
+
+
+
+ + ☆ AdaIR: Exploiting Underlying Similarities of Image Restoration Tasks + with Adapters + + +
+ Existing image restoration approaches typically employ extensive networks +specifically trained for designated degradations. Despite being effective, such +methods inevitably entail considerable storage costs and computational +overheads due to the reliance on task-specific networks. In this work, we go +beyond this well-established framework and exploit the inherent commonalities +among image restoration tasks. The primary objective is to identify components +that are shareable across restoration tasks and augment the shared components +with modules specifically trained for individual tasks. Towards this goal, we +propose AdaIR, a novel framework that enables low storage cost and efficient +training without sacrificing performance. Specifically, a generic restoration +network is first constructed through self-supervised pre-training using +synthetic degradations. Subsequent to the pre-training phase, adapters are +trained to adapt the pre-trained network to specific degradations. AdaIR +requires solely the training of lightweight, task-specific modules, ensuring a +more efficient storage and training regimen. We have conducted extensive +experiments to validate the effectiveness of AdaIR and analyze the influence of +the pre-training strategy on discovering shareable components. Extensive +experimental results show that AdaIR achieves outstanding results on multi-task +restoration while utilizing significantly fewer parameters (1.9 MB) and less +training time (7 hours) for each restoration task. The source codes and trained +models will be released. + +
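+ The generic adapter pattern referred to above can be sketched as follows; this is not the AdaIR code, and the module names are hypothetical: a small bottleneck module is trained per degradation while the shared pre-trained block stays frozen.
+
+ ```python
+ # Frozen shared block + lightweight task-specific adapter.
+ import torch
+ import torch.nn as nn
+
+ class Adapter(nn.Module):
+     def __init__(self, channels, reduction=4):
+         super().__init__()
+         self.down = nn.Conv2d(channels, channels // reduction, 1)
+         self.up = nn.Conv2d(channels // reduction, channels, 1)
+         self.act = nn.GELU()
+
+     def forward(self, x):
+         return x + self.up(self.act(self.down(x)))   # residual keeps the shared features
+
+ class AdaptedBlock(nn.Module):
+     def __init__(self, shared_block, channels):
+         super().__init__()
+         self.shared = shared_block
+         for p in self.shared.parameters():
+             p.requires_grad_(False)                   # shared component is frozen
+         self.adapter = Adapter(channels)              # task-specific, lightweight
+
+     def forward(self, x):
+         return self.adapter(self.shared(x))
+
+ block = AdaptedBlock(nn.Conv2d(32, 32, 3, padding=1), channels=32)
+ out = block(torch.randn(1, 32, 64, 64))
+ ```
+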
+
+
+
+
+ + ☆ Towards Highly Realistic Artistic Style Transfer via Stable Diffusion + with Step-aware and Layer-aware Prompt IJCAI2024 + + +
+ Artistic style transfer aims to transfer the learned artistic style onto an +arbitrary content image, generating artistic stylized images. Existing +generative adversarial network-based methods fail to generate highly realistic +stylized images and always introduce obvious artifacts and disharmonious +patterns. Recently, large-scale pre-trained diffusion models opened up a new +way for generating highly realistic artistic stylized images. However, +diffusion model-based methods generally fail to preserve the content structure +of input content images well, introducing some undesired content structure and +style patterns. To address the above problems, we propose a novel pre-trained +diffusion-based artistic style transfer method, called LSAST, which can +generate highly realistic artistic stylized images while preserving the content +structure of input content images well, without bringing obvious artifacts and +disharmonious style patterns. Specifically, we introduce a Step-aware and +Layer-aware Prompt Space, a set of learnable prompts, which can learn the style +information from the collection of artworks and dynamically adjust the input +images' content structure and style pattern. To train our prompt space, we +propose a novel inversion method, called Step-aware and Layer-aware Prompt +Inversion, which allows the prompt space to learn the style information of the +artwork collection. In addition, we inject a pre-trained conditional branch of +ControlNet into our LSAST, which further improves our framework's ability to +maintain content structure. Extensive experiments demonstrate that our proposed +method can generate artistic stylized images that are more realistic than those +of the state-of-the-art artistic style transfer methods. + 
+
+ comment: Accepted by IJCAI2024 +
+
+
+
+
+ + ☆ Using Game Engines and Machine Learning to Create Synthetic Satellite + Imagery for a Tabletop Verification Exercise + + +
+ Satellite imagery is regarded as a great opportunity for citizen-based +monitoring of activities of interest. Relevant imagery may however not be +available at sufficiently high resolution, quality, or cadence -- let alone be +uniformly accessible to open-source analysts. This limits an assessment of the +true long-term potential of citizen-based monitoring of nuclear activities +using publicly available satellite imagery. In this article, we demonstrate how +modern game engines combined with advanced machine-learning techniques can be +used to generate synthetic imagery of sites of interest with the ability to +choose relevant parameters upon request; these include time of day, cloud +cover, season, or level of activity onsite. At the same time, resolution and +off-nadir angle can be adjusted to simulate different characteristics of the +satellite. While there are several possible use-cases for synthetic imagery, +here we focus on its usefulness to support tabletop exercises in which simple +monitoring scenarios can be examined to better understand verification +capabilities enabled by new satellite constellations and very short revisit +times. + +
+
+ comment: Annual Meeting of the Institute of Nuclear Materials Management + (INMM), Vienna +
+
+
+
+
+ + ☆ Octopus v3: Technical Report for On-device Sub-billion Multimodal AI + Agent + + +
+ A multimodal AI agent is characterized by its ability to process and learn +from various types of data, including natural language, visual, and audio +inputs, to inform its actions. Despite advancements in large language models +that incorporate visual data, such as GPT-4V, effectively translating +image-based data into actionable outcomes for AI agents continues to be +challenging. In this paper, we introduce a multimodal model that incorporates +the concept of a functional token, specifically designed for AI agent +applications. To ensure compatibility with edge devices, our model is optimized +to a compact size of less than 1B parameters. Like GPT-4, our model can process +both English and Chinese. We demonstrate that this model is capable of +operating efficiently on a wide range of edge devices, including ones as +constrained as a Raspberry Pi. + 
+
+
+
+
+ + ☆ CarcassFormer: An End-to-end Transformer-based Framework for + Simultaneous Localization, Segmentation and Classification of Poultry Carcass + Defect + + +
+ In the food industry, assessing the quality of poultry carcasses during +processing is a crucial step. This study proposes an effective approach for +automating the assessment of carcass quality without requiring skilled labor or +inspector involvement. The proposed system is based on machine learning (ML) +and computer vision (CV) techniques, enabling automated defect detection and +carcass quality assessment. To this end, an end-to-end framework called +CarcassFormer is introduced. It is built upon a Transformer-based architecture +designed to effectively extract visual representations while simultaneously +detecting, segmenting, and classifying poultry carcass defects. Our proposed +framework is capable of analyzing imperfections resulting from production and +transport welfare issues, as well as processing plant stunner, scalder, picker, +and other equipment malfunctions. To benchmark the framework, a dataset of +7,321 images was initially acquired, which contained both single and multiple +carcasses per image. In this study, the performance of the CarcassFormer system +is compared with other state-of-the-art (SOTA) approaches on classification, +detection, and segmentation tasks. Through extensive quantitative experiments, +our framework consistently outperforms existing methods, demonstrating +remarkable improvements across various evaluation metrics such as AP, AP@50, +and AP@75. Furthermore, the qualitative results highlight the strengths of +CarcassFormer in capturing fine details, including feathers, and accurately +localizing and segmenting carcasses with high precision. To facilitate further +research and collaboration, the pre-trained model and source code of +CarcassFormer are available for research purposes at: +https://github.com/UARK-AICV/CarcassFormer. + 
+
+ comment: Accepted to Poultry Science Journal +
+
+
+
+
+ + ☆ Explainable Lung Disease Classification from Chest X-Ray Images + Utilizing Deep Learning and XAI + + +
+ Lung diseases remain a critical global health concern, and accurate and rapid +diagnostic methods are crucial. This work focuses on classifying different lung +diseases into five groups: viral pneumonia, bacterial pneumonia, COVID, +tuberculosis, and normal lungs. Employing advanced deep learning techniques, we +explore a diverse range of models including CNN, hybrid models, ensembles, +transformers, and Big Transfer. The research encompasses comprehensive +methodologies such as hyperparameter tuning, stratified k-fold +cross-validation, and transfer learning with fine-tuning. Remarkably, our +findings reveal that the Xception model, fine-tuned through 5-fold +cross-validation, achieves the highest accuracy of 96.21%. These results show +that our methods are effective at accurately identifying different lung diseases. +The exploration of explainable artificial intelligence (XAI) methodologies +further enhances our understanding of the decision-making processes employed by +these models, contributing to increased trust in their clinical applications. + 
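+ A sketch of the stratified 5-fold evaluation protocol mentioned above, using a stand-in scikit-learn classifier; in the paper the model inside the loop is a fine-tuned Xception network and `X`, `y` would be chest X-ray inputs and labels, which are replaced here by synthetic data.
+
+ ```python
+ # Stratified 5-fold cross-validation over a 5-class problem.
+ import numpy as np
+ from sklearn.datasets import make_classification
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.metrics import accuracy_score
+ from sklearn.model_selection import StratifiedKFold
+
+ X, y = make_classification(n_samples=500, n_classes=5, n_informative=10, random_state=0)
+ skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
+
+ scores = []
+ for train_idx, test_idx in skf.split(X, y):
+     model = LogisticRegression(max_iter=1000)   # stand-in for the fine-tuned CNN
+     model.fit(X[train_idx], y[train_idx])
+     scores.append(accuracy_score(y[test_idx], model.predict(X[test_idx])))
+
+ print(f"mean 5-fold accuracy: {np.mean(scores):.4f}")
+ ```
+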
+
+
+
+
+ + ☆ SPAMming Labels: Efficient Annotations for the Trackers of Tomorrow + + +
+ Increasing the annotation efficiency of trajectory annotations from videos +has the potential to enable the next generation of data-hungry tracking +algorithms to thrive on large-scale datasets. Despite the importance of this +task, there are currently very few works exploring how to efficiently label +tracking datasets comprehensively. In this work, we introduce SPAM, a tracking +data engine that provides high-quality labels with minimal human intervention. +SPAM is built around two key insights: i) most tracking scenarios can be easily +resolved. To take advantage of this, we utilize a pre-trained model to generate +high-quality pseudo-labels, reserving human involvement for a smaller subset of +more difficult instances; ii) handling the spatiotemporal dependencies of track +annotations across time can be elegantly and efficiently formulated through +graphs. Therefore, we use a unified graph formulation to address the annotation +of both detections and identity association for tracks across time. Based on +these insights, SPAM produces high-quality annotations with a fraction of +ground truth labeling cost. We demonstrate that trackers trained on SPAM labels +achieve comparable performance to those trained on human annotations while +requiring only 3-20% of the human labeling effort. Hence, SPAM paves the way +towards highly efficient labeling of large-scale tracking datasets. Our code +and models will be available upon acceptance. + +
+
+
+
+
+ + ☆ SLAIM: Robust Dense Neural SLAM for Online Tracking and Mapping + + +
+ We present SLAIM - Simultaneous Localization and Implicit Mapping. We propose +a novel coarse-to-fine tracking model tailored for Neural Radiance Field SLAM +(NeRF-SLAM) to achieve state-of-the-art tracking performance. Notably, existing +NeRF-SLAM systems consistently exhibit inferior tracking performance compared +to traditional SLAM algorithms. NeRF-SLAM methods solve camera tracking via +image alignment and photometric bundle-adjustment. Such objectives are +difficult to optimize due to the narrow basin of attraction of the loss in +image space (local minima) and the lack of initial correspondences. We mitigate +these limitations by implementing a Gaussian pyramid filter on top of NeRF, +facilitating a coarse-to-fine tracking optimization strategy. Furthermore, NeRF +systems encounter challenges in converging to the right geometry with limited +input views. While prior approaches use a Signed-Distance Function (SDF)-based +NeRF and directly supervise SDF values by approximating ground truth SDF +through depth measurements, this often results in suboptimal geometry. In +contrast, our method employs a volume density representation and introduces a +novel KL regularizer on the ray termination distribution, constraining scene +geometry to consist of empty space and opaque surfaces. Our solution implements +both local and global bundle-adjustment to produce a robust (coarse-to-fine) +and accurate (KL regularizer) SLAM solution. We conduct experiments on multiple +datasets (ScanNet, TUM, Replica) showing state-of-the-art results in tracking +and in reconstruction accuracy. + 
+
+
+
+
+ + ☆ Neural Schrödinger Bridge Matching for Pansharpening + + +
+ Recent diffusion probabilistic models (DPM) in the field of pansharpening +have been gradually gaining attention and have achieved state-of-the-art (SOTA) +performance. In this paper, we identify shortcomings in directly applying DPMs +to the task of pansharpening as an inverse problem: 1) initiating sampling +directly from Gaussian noise neglects the low-resolution multispectral image +(LRMS) as a prior; 2) low sampling efficiency often necessitates a higher +number of sampling steps. We first reformulate pansharpening into the +stochastic differential equation (SDE) form of an inverse problem. Building +upon this, we propose a Schrödinger bridge matching method that addresses +both issues. + We design an efficient deep neural network architecture tailored for the +proposed SB matching. + In comparison to the well-established DL-regressive-based framework and the +recent DPM framework, our method demonstrates SOTA performance with fewer +sampling steps. Moreover, we discuss the relationship between SB matching and +other methods based on SDEs and ordinary differential equations (ODEs), as well +as its connection with optimal transport. + Code will be available. + 
+
+
+
+
+ + ☆ RainyScape: Unsupervised Rainy Scene Reconstruction using Decoupled + Neural Rendering + + +
+ We propose RainyScape, an unsupervised framework for reconstructing clean +scenes from a collection of multi-view rainy images. RainyScape consists of two +main modules: a neural rendering module and a rain-prediction module that +incorporates a predictor network and a learnable latent embedding that captures +the rain characteristics of the scene. Specifically, based on the spectral bias +property of neural networks, we first optimize the neural rendering pipeline to +obtain a low-frequency scene representation. Subsequently, we jointly optimize +the two modules, driven by the proposed adaptive direction-sensitive +gradient-based reconstruction loss, which encourages the network to distinguish +between scene details and rain streaks, facilitating the propagation of +gradients to the relevant components. Extensive experiments on both the classic +neural radiance field and the recently proposed 3D Gaussian splatting +demonstrate the superiority of our method in effectively eliminating rain +streaks and rendering clean images, achieving state-of-the-art performance. The +constructed high-quality dataset and source code will be publicly available. + +
+
+
+
+
+ + ☆ Text-controlled Motion Mamba: Text-Instructed Temporal Grounding of + Human Motion + + +
+ Human motion understanding is a fundamental task with diverse practical +applications, facilitated by the availability of large-scale motion capture +datasets. Recent studies focus on text-motion tasks, such as text-based motion +generation, editing and question answering. In this study, we introduce the +novel task of text-based human motion grounding (THMG), aimed at precisely +localizing temporal segments corresponding to given textual descriptions within +untrimmed motion sequences. Capturing global temporal information is crucial +for the THMG task. However, transformer-based models that rely on global +temporal self-attention face challenges when handling long untrimmed sequences +due to the quadratic computational cost. We address these challenges by +proposing Text-controlled Motion Mamba (TM-Mamba), a unified model that +integrates temporal global context, language query control, and spatial graph +topology with only linear memory cost. The core of the model is a +text-controlled selection mechanism which dynamically incorporates global +temporal information based on text query. The model is further enhanced to be +topology-aware through the integration of relational embeddings. For +evaluation, we introduce BABEL-Grounding, the first text-motion dataset that +provides detailed textual descriptions of human actions along with their +corresponding temporal segments. Extensive evaluations demonstrate the +effectiveness of TM-Mamba on BABEL-Grounding. + +
+
+
+
+
+ + ☆ Boosting Medical Image Segmentation Performance with Adaptive + Convolution Layer + + +
+ Medical image segmentation plays a vital role in various clinical +applications, enabling accurate delineation and analysis of anatomical +structures or pathological regions. Traditional CNNs have achieved remarkable +success in this field. However, they often rely on fixed kernel sizes, which +can limit their performance and adaptability in medical images where features +exhibit diverse scales and configurations due to variability in equipment, +target sizes, and expert interpretations. + In this paper, we propose an adaptive layer placed ahead of leading +deep-learning models such as UCTransNet, which dynamically adjusts the kernel +size based on the local context of the input image. + By adaptively capturing and fusing features at multiple scales, our approach +enhances the network's ability to handle diverse anatomical structures and +subtle image details, even for recent high-performing architectures that +internally implement intra-scale modules, such as UCTransNet. + Extensive experiments are conducted on + benchmark medical image datasets to evaluate the effectiveness of our +proposal. It consistently outperforms traditional CNNs with fixed kernel +sizes while using a similar number of parameters, achieving superior segmentation +accuracy, Dice, and IoU on popular datasets such as SegPC2021 and ISIC2018. The +model and data are published in the open-source repository, ensuring +transparency and reproducibility of our promising results. + 
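+ A minimal sketch of the adaptive-kernel idea (not the authors' layer, and the class name is hypothetical): several kernel sizes are applied in parallel and fused with per-pixel weights predicted from the local context, which softly selects a kernel size at each location.
+
+ ```python
+ # Soft, context-driven selection over multiple kernel sizes.
+ import torch
+ import torch.nn as nn
+
+ class AdaptiveKernelLayer(nn.Module):
+     def __init__(self, in_ch, out_ch, kernel_sizes=(3, 5, 7)):
+         super().__init__()
+         self.branches = nn.ModuleList(
+             nn.Conv2d(in_ch, out_ch, k, padding=k // 2) for k in kernel_sizes
+         )
+         self.gate = nn.Conv2d(in_ch, len(kernel_sizes), 3, padding=1)
+
+     def forward(self, x):
+         weights = torch.softmax(self.gate(x), dim=1)            # (B, K, H, W)
+         feats = torch.stack([b(x) for b in self.branches], 2)   # (B, C, K, H, W)
+         return (weights.unsqueeze(1) * feats).sum(2)            # weighted fusion
+
+ layer = AdaptiveKernelLayer(3, 16)
+ out = layer(torch.randn(1, 3, 64, 64))   # could be placed ahead of a model such as UCTransNet
+ ```
+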
+
+
+
+
+ + ☆ DeblurGS: Gaussian Splatting for Camera Motion Blur + + +
+ Although significant progress has been made in reconstructing sharp 3D scenes +from motion-blurred images, a transition to real-world applications remains +challenging. The primary obstacle stems from the severe blur which leads to +inaccuracies in the acquisition of initial camera poses through +Structure-from-Motion, a critical aspect often overlooked by previous +approaches. To address this challenge, we propose DeblurGS, a method to +optimize sharp 3D Gaussian Splatting from motion-blurred images, even with the +noisy camera pose initialization. We restore a fine-grained sharp scene by +leveraging the remarkable reconstruction capability of 3D Gaussian Splatting. +Our approach estimates the 6-Degree-of-Freedom camera motion for each blurry +observation and synthesizes corresponding blurry renderings for the +optimization process. Furthermore, we propose Gaussian Densification Annealing +strategy to prevent the generation of inaccurate Gaussians at erroneous +locations during the early training stages when camera motion is still +imprecise. Comprehensive experiments demonstrate that our DeblurGS achieves +state-of-the-art performance in deblurring and novel view synthesis for +real-world and synthetic benchmark datasets, as well as field-captured blurry +smartphone videos. + +
+
+
+
+
+ + ☆ Detector Collapse: Backdooring Object Detection to Catastrophic Overload + or Blindness IJCAI-24 + + +
+ Object detection tasks, crucial in safety-critical systems like autonomous +driving, focus on pinpointing object locations. These detectors are known to be +susceptible to backdoor attacks. However, existing backdoor techniques have +primarily been adapted from classification tasks, overlooking deeper +vulnerabilities specific to object detection. This paper is dedicated to +bridging this gap by introducing Detector Collapse (DC), a brand-new backdoor +attack paradigm tailored for object detection. DC is designed to instantly +incapacitate detectors (i.e., severely impairing the detector's performance and +culminating in a denial-of-service). To this end, we develop two innovative +attack schemes: Sponge for triggering widespread misidentifications and +Blinding for rendering objects invisible. Remarkably, we introduce a novel +poisoning strategy exploiting natural objects, enabling DC to act as a +practical backdoor in real-world environments. Our experiments on different +detectors across several benchmarks show a significant improvement +($\sim$10\%-60\% absolute and $\sim$2-7$\times$ relative) in attack efficacy +over state-of-the-art attacks. + 
+
+ comment: Accepted by IJCAI-24 +
+
+
+
+
+ + ☆ Consisaug: A Consistency-based Augmentation for Polyp Detection in + Endoscopy Image Analysis + + +
+ Colorectal cancer (CRC), which frequently originates from initially benign +polyps, remains a significant contributor to global cancer-related mortality. +Early and accurate detection of these polyps via colonoscopy is crucial for CRC +prevention. However, traditional colonoscopy methods depend heavily on the +operator's experience, leading to suboptimal polyp detection rates. Moreover, +public databases are limited in polyp size and shape diversity. To enhance the +available data for polyp detection, we introduce Consisaug, an innovative and +effective deep learning-based methodology for augmenting data. We utilize the +constraint that when an image is flipped, the class labels should remain the +same and the bounding boxes should stay consistent. We implement Consisaug on +five public polyp datasets and three backbones, and the results show the +effectiveness of our method. + 
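+ A toy sketch of the flip-consistency constraint described above (hypothetical helpers, not the Consisaug code): boxes predicted on the flipped image are mirrored back and should agree with the boxes predicted on the original image.
+
+ ```python
+ # Flip-consistency check/loss for horizontally mirrored detections.
+ import torch
+
+ def mirror_boxes(boxes, image_width):
+     """boxes: (N, 4) as (x1, y1, x2, y2); returns boxes mirrored horizontally."""
+     x1, y1, x2, y2 = boxes.unbind(-1)
+     return torch.stack([image_width - x2, y1, image_width - x1, y2], dim=-1)
+
+ def flip_consistency_loss(boxes_orig, boxes_flipped, image_width):
+     """L1 disagreement between original boxes and mirrored-back flipped boxes
+     (assumes a one-to-one ordering between the two sets)."""
+     return torch.abs(boxes_orig - mirror_boxes(boxes_flipped, image_width)).mean()
+
+ b = torch.tensor([[10.0, 20.0, 50.0, 80.0]])
+ print(flip_consistency_loss(b, mirror_boxes(b, 224), 224))   # 0.0 for perfect consistency
+ ```
+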
+
+ comment: MLMI 2023 +
+
+
+
+
+ + ☆ Best Practices for a Handwritten Text Recognition System + + +
+ Handwritten text recognition has developed rapidly in recent years, +following the rise of deep learning and its applications. Though deep learning +methods provide a notable boost in performance concerning text recognition, +non-trivial deviation in performance can be detected even when small +pre-processing or architectural/optimization elements are changed. This work +follows a "best practice" rationale: we highlight simple yet effective empirical +practices that can further help training and yield well-performing +handwritten text recognition systems. Specifically, we considered three basic +aspects of a deep HTR system and we proposed simple yet effective solutions: 1) +retain the aspect ratio of the images in the preprocessing step, 2) use +max-pooling for converting the 3D feature map of the CNN output into a sequence +of features and 3) assist the training procedure via an additional CTC loss +which acts as a shortcut on the max-pooled sequential features. Using these +proposed simple modifications, one can attain close to state-of-the-art +results, while considering a basic convolutional-recurrent (CNN+LSTM) +architecture, for both IAM and RIMES datasets. Code is available at +https://github.com/georgeretsi/HTR-best-practices/. + 
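+ Practices 2) and 3) above can be sketched as follows, under assumed shapes and a tiny toy network (not the authors' code; see their repository for the real implementation): the CNN feature map is max-pooled over height into a sequence, and an auxiliary CTC head on that sequence is added to the main CNN+LSTM CTC loss.
+
+ ```python
+ # Max-pooled column features plus a CTC "shortcut" head.
+ import torch
+ import torch.nn as nn
+
+ class TinyHTR(nn.Module):
+     def __init__(self, n_classes, channels=64, hidden=128):
+         super().__init__()
+         self.cnn = nn.Sequential(nn.Conv2d(1, channels, 3, padding=1), nn.ReLU())
+         self.lstm = nn.LSTM(channels, hidden, batch_first=True, bidirectional=True)
+         self.head = nn.Linear(2 * hidden, n_classes)      # main head after the LSTM
+         self.shortcut = nn.Linear(channels, n_classes)    # auxiliary CTC shortcut head
+
+     def forward(self, images):                 # images: (B, 1, H, W)
+         fmap = self.cnn(images)                # (B, C, H, W)
+         seq = fmap.max(dim=2).values.permute(0, 2, 1)     # max-pool over height -> (B, W, C)
+         main = self.head(self.lstm(seq)[0])    # (B, W, n_classes)
+         aux = self.shortcut(seq)               # (B, W, n_classes)
+         return main, aux
+
+ ctc = nn.CTCLoss(blank=0)
+ model = TinyHTR(n_classes=80)
+ main, aux = model(torch.randn(2, 1, 32, 128))
+ targets = torch.randint(1, 80, (2, 10))
+ in_len, tgt_len = torch.full((2,), 128), torch.full((2,), 10)
+ loss = ctc(main.log_softmax(-1).permute(1, 0, 2), targets, in_len, tgt_len)
+ loss = loss + 0.1 * ctc(aux.log_softmax(-1).permute(1, 0, 2), targets, in_len, tgt_len)
+ ```
+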
+
+
+
+
+ + ☆ Vision-based control for landing an aerial vehicle on a marine vessel + + +
+ This work addresses the landing problem of an aerial vehicle, exemplified by +a simple quadrotor, on a moving platform using image-based visual servo +control. First, the mathematical model of the quadrotor aircraft is introduced, +followed by the design of the inner-loop control. At the second stage, the +image features on the textured target plane are exploited to derive a +vision-based control law. The image of the spherical centroid of a set of +landmarks present in the landing target is used as a position measurement, +whereas the translational optical flow is used as velocity measurement. The +kinematics of the vision-based system is expressed in terms of the observable +features, and the proposed control law guarantees convergence without +estimating the unknown distance between the vision system and the target, which +is also guaranteed to remain strictly positive, avoiding undesired collisions. +The performance of the proposed control law is evaluated in MATLAB and 3-D +simulation software Gazebo. Simulation results for a quadrotor UAV are provided +for different velocity profiles of the moving target, showcasing the robustness +of the proposed controller. + +
+
+
+
+
+ + ☆ SoccerNet Game State Reconstruction: End-to-End Athlete Tracking and + Identification on a Minimap + + +
+ Tracking and identifying athletes on the pitch holds a central role in +collecting essential insights from the game, such as estimating the total +distance covered by players or understanding team tactics. This tracking and +identification process is crucial for reconstructing the game state, defined by +the athletes' positions and identities on a 2D top-view of the pitch, (i.e. a +minimap). However, reconstructing the game state from videos captured by a +single camera is challenging. It requires understanding the position of the +athletes and the viewpoint of the camera to localize and identify players +within the field. In this work, we formalize the task of Game State +Reconstruction and introduce SoccerNet-GSR, a novel Game State Reconstruction +dataset focusing on football videos. SoccerNet-GSR is composed of 200 video +sequences of 30 seconds, annotated with 9.37 million line points for pitch +localization and camera calibration, as well as over 2.36 million athlete +positions on the pitch with their respective role, team, and jersey number. +Furthermore, we introduce GS-HOTA, a novel metric to evaluate game state +reconstruction methods. Finally, we propose and release an end-to-end baseline +for game state reconstruction, bootstrapping the research on this task. Our +experiments show that GSR is a challenging novel task, which opens the field +for future research. Our dataset and codebase are publicly available at +https://github.com/SoccerNet/sn-gamestate. + +
+
+
+
+
+ + ☆ Following the Human Thread in Social Navigation + + +
+ The success of collaboration between humans and robots in shared environments +relies on the robot's real-time adaptation to human motion. Specifically, in +Social Navigation, the agent should be close enough to assist but ready to back +up to let the human move freely, avoiding collisions. Human trajectories emerge +as crucial cues in Social Navigation, but they are partially observable from +the robot's egocentric view and computationally complex to process. + We propose the first Social Dynamics Adaptation model (SDA) based on the +robot's state-action history to infer the social dynamics. We introduce a +two-stage Reinforcement Learning framework: the first stage learns to encode +the human trajectories into social dynamics and learns a motion policy +conditioned on this encoded information, the current status, and the previous +action. Here, the trajectories are fully visible, i.e., assumed as privileged +information. In the second stage, the trained policy operates without direct +access to trajectories. Instead, the model infers the social dynamics solely +from the history of previous actions and statuses in real-time. Tested on the +novel Habitat 3.0 platform, SDA sets a new state-of-the-art (SoA) performance +in finding and following humans. + 
+
+
+
+
+ + ☆ Single-temporal Supervised Remote Change Detection for Domain + Generalization + + +
+ Change detection is widely applied in remote sensing image analysis. Existing +methods require training models separately for each dataset, which leads to +poor domain generalization. Moreover, these methods rely heavily on large +amounts of high-quality pair-labelled data for training, which is expensive and +impractical. In this paper, we propose a multimodal contrastive learning +framework (ChangeCLIP) based on visual-language pre-training for change +detection domain generalization. Additionally, we propose a dynamic context +optimization for prompt learning. Meanwhile, to address the data dependency +issue of existing methods, we introduce a single-temporal and controllable +AI-generated training strategy (SAIN). This allows us to train the model using +a large number of single-temporal images without image pairs in the real world, +achieving excellent generalization. Extensive experiments on a series of real +change detection datasets validate the superiority and strong generalization of +ChangeCLIP, outperforming state-of-the-art change detection methods. Code will +be available. + 
+
+
+
+
+ + ☆ VBR: A Vision Benchmark in Rome ICRA 2024 + + +
+ This paper presents a vision and perception research dataset collected in +Rome, featuring RGB data, 3D point clouds, IMU, and GPS data. We introduce a +new benchmark targeting visual odometry and SLAM, to advance the research in +autonomous robotics and computer vision. This work complements existing +datasets by simultaneously addressing several issues, such as environment +diversity, motion patterns, and sensor frequency. It uses up-to-date devices +and presents effective procedures to accurately calibrate the intrinsics and +extrinsics of the sensors while addressing temporal synchronization. During +recording, we cover multi-floor buildings, gardens, urban and highway +scenarios. Combining handheld and car-based data collections, our setup can +simulate any robot (quadrupeds, quadrotors, autonomous vehicles). The dataset +includes an accurate 6-dof ground truth based on a novel methodology that +refines the RTK-GPS estimate with LiDAR point clouds through Bundle Adjustment. +All sequences, divided into training and testing sets, are accessible through +our website. + 
+
+ comment: Accepted at IEEE ICRA 2024 Website: + https://rvp-group.net/datasets/slam.html +
+
+
+
+
+ + ☆ Leveraging Fine-Grained Information and Noise Decoupling for Remote + Sensing Change Detection + + +
+ Change detection aims to identify remote sensing object changes by analyzing +data between bitemporal image pairs. Due to the large temporal and spatial span +of data collection in change detection image pairs, there is often a +significant amount of task-specific and task-agnostic noise. Previous efforts +have focused excessively on denoising, at the cost of a great deal of +fine-grained information. In this paper, we revisit the importance of +fine-grained features in change detection and propose a series of operations +for fine-grained information compensation and noise decoupling (FINO). First, +the context is utilized to compensate for the fine-grained information in the +feature space. Next, a shape-aware and a brightness-aware module are designed +to improve the capacity for representation learning. The shape-aware module +guides the backbone network toward more precise shape estimation, helping it +extract object shape features. The brightness-aware module learns an overall +brightness estimation to improve the model's robustness to task-agnostic noise. +Finally, a task-specific noise decoupling structure is designed to improve the +model's ability to separate noise interference from feature similarity. With +these training schemes, our proposed method achieves new state-of-the-art +(SOTA) results in multiple change detection benchmarks. The code will be made +available. + 
+
+
+
+
+ + ☆ Improving Composed Image Retrieval via Contrastive Learning with Scaling + Positives and Negatives + + +
+ The Composed Image Retrieval (CIR) task aims to retrieve target images using +a composed query consisting of a reference image and a modified text. Advanced +methods often utilize contrastive learning as the optimization objective, which +benefits from adequate positive and negative examples. However, the triplet for +CIR incurs high manual annotation costs, resulting in limited positive +examples. Furthermore, existing methods commonly use in-batch negative +sampling, which reduces the number of negatives available to the model. To +address the lack of positives, we propose a data generation method by +leveraging a multi-modal large language model to construct triplets for CIR. To +introduce more negatives during fine-tuning, we design a two-stage fine-tuning +framework for CIR, whose second stage introduces plenty of static +representations of negatives to optimize the representation space rapidly. The +above two improvements can be effectively stacked and designed to be +plug-and-play, easily applied to existing CIR models without changing their +original architectures. Extensive experiments and ablation analysis demonstrate +that our method effectively scales positives and negatives and achieves +state-of-the-art results on both FashionIQ and CIRR datasets. In addition, our +methods also perform well in zero-shot composed image retrieval, providing a +new CIR solution for low-resource scenarios. + 
+
+ comment: 12 pages, 11 figures +
+
+
+
+
+ + ☆ Achieving Rotation Invariance in Convolution Operations: Shifting from + Data-Driven to Mechanism-Assured + + +
+ Achieving rotation invariance in deep neural networks without relying on data +has always been a hot research topic. Intrinsic rotation invariance can enhance +the model's feature representation capability, enabling better performance in +tasks such as multi-orientation object recognition and detection. Based on +various types of non-learnable operators, including gradient, sort, local +binary pattern, maximum, etc., this paper designs a set of new convolution +operations that are naturally invariant to arbitrary rotations. Unlike most +previous studies, these rotation-invariant convolutions (RIConvs) have the same +number of learnable parameters and a similar computational process as +conventional convolution operations, allowing them to be interchangeable. Using +the MNIST-Rot dataset, we first verify the invariance of these RIConvs under +various rotation angles and compare their performance with previous +rotation-invariant convolutional neural networks (RI-CNNs). Two types of +RIConvs based on gradient operators achieve state-of-the-art results. +Subsequently, we combine RIConvs with different types and depths of classic CNN +backbones. Using the OuTex_00012, MTARSI, and NWPU-RESISC-45 datasets, we test +their performance on texture recognition, aircraft type recognition, and remote +sensing image classification tasks. The results show that RIConvs significantly +improve the accuracy of these CNN backbones, especially when the training data +is limited. Furthermore, we find that even with data augmentation, RIConvs can +further enhance model performance. + 
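+ A toy sort-based rotation-invariant convolution in the spirit described above (not the paper's RIConvs; the class is hypothetical): the 8 ring neighbours of each pixel are sorted before learnable weights are applied, so any rotation that only permutes the ring leaves the per-pixel response unchanged, which the final line checks for a 90-degree rotation.
+
+ ```python
+ # Sort-based rotation-invariant convolution over the 3x3 ring.
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class SortRIConv(nn.Module):
+     RING = [0, 1, 2, 5, 8, 7, 6, 3]   # indices of the 3x3 ring; centre is index 4
+
+     def __init__(self, in_ch, out_ch):
+         super().__init__()
+         self.weight = nn.Parameter(torch.randn(out_ch, in_ch * 9) * 0.05)
+
+     def forward(self, x):                         # x: (B, C, H, W)
+         b, c, h, w = x.shape
+         patches = F.unfold(x, 3, padding=1).view(b, c, 9, h * w)
+         ring = patches[:, :, self.RING].sort(dim=2).values   # order made rotation-invariant
+         centre = patches[:, :, 4:5]
+         feat = torch.cat([centre, ring], dim=2).reshape(b, c * 9, h * w)
+         return (self.weight @ feat).view(b, -1, h, w)
+
+ layer = SortRIConv(3, 16)
+ x = torch.randn(1, 3, 32, 32)
+ rotated = torch.rot90(layer(torch.rot90(x, 1, (2, 3))), -1, (2, 3))
+ print(torch.allclose(layer(x), rotated, atol=1e-5))   # True: feature map is rotation-equivariant
+ ```
+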
+
+
+
+
+ + ☆ A Semantic Segmentation-guided Approach for Ground-to-Aerial Image + Matching + + +
+ Nowadays the accurate geo-localization of ground-view images has an important +role across domains as diverse as journalism, forensics analysis, transports, +and Earth Observation. This work addresses the problem of matching a query +ground-view image with the corresponding satellite image without GPS data. This +is done by comparing the features from a ground-view image and a satellite one, +innovatively leveraging the latter's segmentation mask through a three-stream +Siamese-like network. The proposed method, Semantic Align Net (SAN), focuses on +limited Field-of-View (FoV) and ground panorama images (images with a FoV of +360°). The novelty lies in the fusion of satellite images in combination with +their semantic segmentation masks, aimed at ensuring that the model can extract +useful features and focus on the significant parts of the images. This work +shows how SAN, through semantic analysis of images, improves the performance on +the unlabelled CVUSA dataset for all the tested FoVs. + 
+
+ comment: 6 pages, 2 figures, 2 tables, Submitted to IGARSS 2024 +
+
+
+
+
+ + ☆ Learning from Unlabelled Data with Transformers: Domain Adaptation for + Semantic Segmentation of High Resolution Aerial Images + + +
+ Data from satellites or aerial vehicles are most of the times unlabelled. +Annotating such data accurately is difficult, requires expertise, and is costly +in terms of time. Even if Earth Observation (EO) data were correctly labelled, +labels might change over time. Learning from unlabelled data within a +semi-supervised learning framework for segmentation of aerial images is +challenging. In this paper, we develop a new model for semantic segmentation of +unlabelled images, the Non-annotated Earth Observation Semantic Segmentation +(NEOS) model. NEOS performs domain adaptation as the target domain does not +have ground truth semantic segmentation masks. The distribution inconsistencies +between the target and source domains are due to differences in acquisition +scenes, environment conditions, sensors, and times. Our model aligns the +learned representations of the different domains to make them coincide. The +evaluation results show that NEOS is successful and outperforms other models +for semantic segmentation of unlabelled data. + +
+
+ comment: 6 pages, 7 figures, Submitted to IGARSS 2024 +
+
+
+
+
+ + ☆ Closely Interactive Human Reconstruction with Proxemics and + Physics-Guided Adaption CVPR2024 + + +
+ Existing multi-person human reconstruction approaches mainly focus on +recovering accurate poses or avoiding penetration, but overlook the modeling of +close interactions. In this work, we tackle the task of reconstructing closely +interactive humans from a monocular video. The main challenge of this task +comes from insufficient visual information caused by depth ambiguity and severe +inter-person occlusion. In view of this, we propose to leverage knowledge from +proxemic behavior and physics to compensate for the lack of visual information. +This is based on the observation that human interaction has specific patterns +following social proxemics. Specifically, we first design a latent +representation based on Vector Quantised-Variational AutoEncoder (VQ-VAE) to +model human interaction. A proxemics and physics guided diffusion model is then +introduced to denoise the initial distribution. We design the diffusion model +as a dual branch with each branch representing one individual such that the +interaction can be modeled via cross attention. With the learned priors of +VQ-VAE and physical constraint as the additional information, our proposed +approach is capable of estimating accurate poses that are also proxemics and +physics plausible. Experimental results on Hi4D, 3DPW, and CHI3D demonstrate +that our method outperforms existing approaches. The code is available at +https://github.com/boycehbz/HumanInteraction. + 
+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ Training Transformer Models by Wavelet Losses Improves Quantitative and + Visual Performance in Single Image Super-Resolution + + +
+ Transformer-based models have achieved remarkable results in low-level vision +tasks including image super-resolution (SR). However, early Transformer-based +approaches that rely on self-attention within non-overlapping windows encounter +challenges in acquiring global information. To activate more input pixels +globally, hybrid attention models have been proposed. Moreover, training by +solely minimizing pixel-wise RGB losses, such as L1, has been found inadequate +for capturing essential high-frequency details. This paper presents two +contributions: i) We introduce convolutional non-local sparse attention (NLSA) +blocks to extend the hybrid transformer architecture in order to further +enhance its receptive field. ii) We employ wavelet losses to train Transformer +models to improve quantitative and subjective performance. While wavelet losses +have been explored previously, showing their power in training +Transformer-based SR models is novel. Our experimental results demonstrate that +the proposed model provides state-of-the-art PSNR results as well as superior +visual performance across various benchmark datasets. + 
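+ A minimal wavelet-loss sketch in the spirit of contribution ii) above (not the paper's exact loss; the filter scaling is one of several common conventions): one level of a Haar decomposition is computed for the SR output and the ground truth, and L1 is applied on each subband so high-frequency detail is penalised explicitly.
+
+ ```python
+ # One-level Haar subband decomposition and an L1 wavelet loss.
+ import torch
+ import torch.nn.functional as F
+
+ def haar_subbands(img):                  # img: (B, C, H, W) with even H, W
+     ll = torch.tensor([[0.5, 0.5], [0.5, 0.5]])
+     lh = torch.tensor([[0.5, 0.5], [-0.5, -0.5]])
+     hl = torch.tensor([[0.5, -0.5], [0.5, -0.5]])
+     hh = torch.tensor([[0.5, -0.5], [-0.5, 0.5]])
+     kernels = torch.stack([ll, lh, hl, hh]).unsqueeze(1)     # (4, 1, 2, 2)
+     b, c, h, w = img.shape
+     out = F.conv2d(img.reshape(b * c, 1, h, w), kernels, stride=2)
+     return out.reshape(b, c * 4, h // 2, w // 2)
+
+ def wavelet_l1_loss(sr, hr, subband_weight=1.0):
+     return F.l1_loss(sr, hr) + subband_weight * F.l1_loss(haar_subbands(sr), haar_subbands(hr))
+
+ sr, hr = torch.rand(2, 3, 64, 64), torch.rand(2, 3, 64, 64)
+ print(wavelet_l1_loss(sr, hr))
+ ```
+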
+
+ comment: total of 10 pages including references, 5 tables and 5 figures, + accepted for NTIRE 2024 Single Image Super Resolution (x4) challenge +
+
+
+
+
+ + ☆ Criteria for Uncertainty-based Corner Cases Detection in Instance + Segmentation + + +
+ The operating environment of a highly automated vehicle is subject to change, +e.g., weather, illumination, or the scenario containing different objects and +other participants in which the highly automated vehicle has to navigate its +passengers safely. These situations must be considered when developing and +validating highly automated driving functions. This already poses a problem for +training and evaluating deep learning models: without the costly labeling of +thousands of recordings, it is not known whether the data contains relevant, +interesting samples for further model training, and it remains a guess under +which conditions and situations the model performs poorly. For this purpose, we +present corner case criteria based on the predictive uncertainty. With our +corner case criteria, we are able to detect uncertainty-based corner cases of +an object instance segmentation model without relying on ground truth (GT) +data. We evaluated each corner case criterion using the COCO and NuImages +datasets to analyze the potential of our approach. We also provide a corner +case decision function that allows us to distinguish each object into True +Positive (TP), localization and/or classification corner case, or False +Positive (FP). We also present our first results of an iterative training cycle +that outperforms the baseline and where the data added to the training dataset +is selected based on the corner case decision function. + 
+
+
+
+
+ + ☆ The Victim and The Beneficiary: Exploiting a Poisoned Model to Train a + Clean Model on Poisoned Data ICCV + + +
+ Recently, backdoor attacks have posed a serious security threat to the +training process of deep neural networks (DNNs). The attacked model behaves +normally on benign samples but outputs a specific result when the trigger is +present. However, compared with the rocketing progress of backdoor attacks, +existing defenses are difficult to deal with these threats effectively or +require benign samples to work, which may be unavailable in real scenarios. In +this paper, we find that the poisoned samples and benign samples can be +distinguished with prediction entropy. This inspires us to propose a novel +dual-network training framework: The Victim and The Beneficiary (V&B), which +exploits a poisoned model to train a clean model without extra benign samples. +Firstly, we sacrifice the Victim network to be a powerful poisoned sample +detector by training on suspicious samples. Secondly, we train the Beneficiary +network on the credible samples selected by the Victim to inhibit backdoor +injection. Thirdly, a semi-supervised suppression strategy is adopted for +erasing potential backdoors and improving model performance. Furthermore, to +better inhibit missed poisoned samples, we propose a strong data augmentation +method, AttentionMix, which works well with our proposed V&B framework. +Extensive experiments on two widely used datasets against 6 state-of-the-art +attacks demonstrate that our framework is effective in preventing backdoor +injection and robust to various attacks while maintaining the performance on +benign samples. Our code is available at https://github.com/Zixuan-Zhu/VaB. + +
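+ A toy sketch of the entropy-based separation the abstract relies on (the full V&B training framework is much more than this, and the threshold direction and value are illustrative assumptions): per-sample prediction entropy is used to route samples either to the "Victim" or to the "Beneficiary" network.
+
+ ```python
+ # Split a batch into suspicious / credible samples by prediction entropy.
+ import torch
+
+ def prediction_entropy(logits):
+     p = torch.softmax(logits, dim=-1)
+     return -(p * torch.log(p.clamp_min(1e-12))).sum(-1)
+
+ def split_by_entropy(logits, threshold):
+     """Returns boolean masks (suspicious, credible) from per-sample entropy.
+     Here low entropy (overly confident predictions) is treated as suspicious;
+     the appropriate direction and threshold depend on the actual setup."""
+     h = prediction_entropy(logits)
+     suspicious = h < threshold
+     return suspicious, ~suspicious
+
+ logits = torch.randn(8, 10)
+ suspicious, credible = split_by_entropy(logits, threshold=1.5)
+ # Suspicious samples would train the Victim; credible ones train the Beneficiary.
+ ```
+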
+
+ comment: 13 pages, 6 figures, published to ICCV +
+
+
+
+
+ + ☆ MMCBE: Multi-modality Dataset for Crop Biomass Estimation and Beyond + + +
+ Crop biomass, a critical indicator of plant growth, health, and productivity, is invaluable for crop breeding programs and agronomic research. However, the accurate and scalable quantification of crop biomass remains unattainable due to limitations in existing measurement methods. One of the obstacles impeding the advancement of current crop biomass prediction methodologies is the scarcity of publicly available datasets. Addressing this gap, we introduce a new dataset in this domain, the Multi-modality dataset for Crop Biomass Estimation (MMCBE). Comprising 216 sets of multi-view drone images coupled with LiDAR point clouds and hand-labelled ground truth, MMCBE is the first multi-modality dataset in this field. This dataset aims to establish benchmark methods for crop biomass quantification and foster the development of vision-based approaches. We have rigorously evaluated state-of-the-art crop biomass estimation methods using MMCBE and ventured into additional potential applications, such as 3D crop reconstruction from drone imagery and novel-view rendering. With this publication, we are making our comprehensive dataset available to the broader community.
+
+ comment: 10 pages, 10 figures, 3 tables +
+
+
+
+
+ + ☆ A Progressive Framework of Vision-language Knowledge Distillation and + Alignment for Multilingual Scene + + +
+ Pre-trained vision-language (V-L) models such as CLIP have shown excellent +performance in many downstream cross-modal tasks. However, most of them are +only applicable to the English context. Subsequent research has focused on this +problem and proposed improved models, such as CN-CLIP and AltCLIP, to +facilitate their applicability to Chinese and even other languages. +Nevertheless, these models suffer from high latency and a large memory +footprint in inference, which limits their further deployment on +resource-constrained edge devices. In this work, we propose a conceptually +simple yet effective multilingual CLIP Compression framework and train a +lightweight multilingual vision-language model, called DC-CLIP, for both +Chinese and English context. In this framework, we collect high-quality Chinese +and English text-image pairs and design two training stages, including +multilingual vision-language feature distillation and alignment. During the +first stage, lightweight image/text student models are designed to learn robust +visual/multilingual textual feature representation ability from corresponding +teacher models, respectively. Subsequently, the multilingual vision-language +alignment stage enables effective alignment of visual and multilingual textual +features to further improve the model's multilingual performance. Comprehensive +experiments in zero-shot image classification, conducted based on the ELEVATER +benchmark, showcase that DC-CLIP achieves superior performance in the English +context and competitive performance in the Chinese context, even with less +training data, when compared to existing models of similar parameter magnitude. +The evaluation demonstrates the effectiveness of our designed training +mechanism. + +
+
+
+
+
+ + ☆ Optical Image-to-Image Translation Using Denoising Diffusion Models: + Heterogeneous Change Detection as a Use Case + + +
+ We introduce an innovative deep learning-based method that uses a denoising +diffusion-based model to translate low-resolution images to high-resolution +ones from different optical sensors while preserving the contents and avoiding +undesired artifacts. The proposed method is trained and tested on a large and +diverse data set of paired Sentinel-II and Planet Dove images. We show that it +can solve serious image generation issues observed when the popular +classifier-free guided Denoising Diffusion Implicit Model (DDIM) framework is +used in the task of Image-to-Image Translation of multi-sensor optical remote +sensing images and that it can generate large images with highly consistent +patches, both in colors and in features. Moreover, we demonstrate how our +method improves heterogeneous change detection results in two urban areas: +Beirut, Lebanon, and Austin, USA. Our contributions are: i) a new training and +testing algorithm based on denoising diffusion models for optical image +translation; ii) a comprehensive image quality evaluation and ablation study; +iii) a comparison with the classifier-free guided DDIM framework; and iv) +change detection experiments on heterogeneous data. + +
+
+
+
+
+ + ☆ ONOT: a High-Quality ICAO-compliant Synthetic Mugshot Dataset + + +
+ Nowadays, state-of-the-art AI-based generative models represent a viable solution to overcome privacy issues and biases in the collection of datasets containing personal information, such as faces. Following this intuition, in this paper we introduce ONOT, a synthetic dataset specifically focused on the generation of high-quality faces in adherence to the requirements of the ISO/IEC 39794-5 standard which, following the guidelines of the International Civil Aviation Organization (ICAO), defines the interchange formats of face images in electronic Machine-Readable Travel Documents (eMRTD). The strictly controlled and varied mugshot images included in ONOT are useful in research fields related to the analysis of face images in eMRTD, such as Morphing Attack Detection and Face Quality Assessment. The dataset is publicly released together with details of the generation procedure, in order to improve reproducibility and enable future extensions.
+
+ comment: Paper accepted in IEEE FG 2024 +
+
+
+
+
+ + ☆ Energy-Efficient Uncertainty-Aware Biomass Composition Prediction at the + Edge CVPR 2024 + + +
+ Clover fixes nitrogen from the atmosphere into the soil, making grass-clover mixtures highly desirable to reduce external nitrogen fertilization. Herbage containing clover additionally promotes higher food intake, resulting in higher milk production. Herbage probing, however, remains largely unused as it requires time-intensive manual laboratory analysis. Without this information, farmers are unable to perform localized clover sowing or take targeted fertilization decisions. Deep learning algorithms have been proposed with the goal of estimating the dry biomass composition from images of the grass directly in the fields. The energy-intensive nature of deep learning, however, limits deployment to practical edge devices such as smartphones. This paper proposes to fill this gap by applying filter pruning to reduce the energy requirement of existing deep learning solutions. We report that although pruned networks are accurate on controlled, high-quality images of the grass, they struggle to generalize to real-world smartphone images that are blurry or taken from challenging angles. We address this challenge by training filter-pruned models using a variance attenuation loss so they can predict the uncertainty of their predictions. When the uncertainty exceeds a threshold, we re-infer using a more accurate unpruned model. This hybrid approach allows us to reduce energy consumption while retaining a high accuracy. We evaluate our algorithm on two datasets, GrassClover and Irish clover, using an NVIDIA Jetson Nano edge device. We find that we reduce energy consumption with respect to state-of-the-art solutions by 50% on average, with only a 4% accuracy loss.
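+ A minimal sketch of the two ingredients described above, assuming a Gaussian heteroscedastic regression head and an illustrative variance threshold: a variance-attenuation (Gaussian NLL) loss so the pruned network predicts its own uncertainty, and a gate that re-infers with the unpruned model when that uncertainty is too high.
+
+     import torch
+
+     def variance_attenuation_loss(mean, log_var, target):
+         """Gaussian NLL; large predicted variance attenuates the squared-error term."""
+         return torch.mean(0.5 * torch.exp(-log_var) * (target - mean) ** 2 + 0.5 * log_var)
+
+     @torch.no_grad()
+     def hybrid_predict(x, pruned_model, full_model, var_threshold=0.05):
+         """Use the cheap pruned model unless its predicted variance exceeds the threshold.
+         Assumed interface: both models return (mean, log_var)."""
+         mean, log_var = pruned_model(x)
+         if torch.exp(log_var).mean() > var_threshold:
+             mean, _ = full_model(x)   # fall back to the accurate, unpruned network
+         return mean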
+
+ comment: The paper has been accepted to CVPR 2024 5th Workshop on Vision for + Agriculture +
+
+
+
+
+ + ☆ Simple In-place Data Augmentation for Surveillance Object Detection CVPR + + +
+ Motivated by the need to improve model performance in traffic monitoring +tasks with limited labeled samples, we propose a straightforward augmentation +technique tailored for object detection datasets, specifically designed for +stationary camera-based applications. Our approach focuses on placing objects +in the same positions as the originals to ensure its effectiveness. By applying +in-place augmentation on objects from the same camera input image, we address +the challenge of overlapping with original and previously selected objects. +Through extensive testing on two traffic monitoring datasets, we illustrate the +efficacy of our augmentation strategy in improving model performance, +particularly in scenarios with limited labeled samples and imbalanced class +distributions. Notably, our method achieves comparable performance to models +trained on the entire dataset while utilizing only 8.5 percent of the original +data. Moreover, we report significant improvements, with mAP@.5 increasing from +0.4798 to 0.5025, and the mAP@.5:.95 rising from 0.29 to 0.3138 on the +FishEye8K dataset. These results highlight the potential of our augmentation +approach in enhancing object detection models for traffic monitoring +applications. + +
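+ A rough sketch of the in-place augmentation idea: paste object crops back at their original coordinates taken from another frame of the same stationary camera, skipping any paste that would overlap existing or already-pasted boxes. The overlap threshold and array-based image handling are assumptions.
+
+     import numpy as np
+
+     def iou(a, b):
+         """IoU of two [x1, y1, x2, y2] boxes."""
+         x1, y1 = max(a[0], b[0]), max(a[1], b[1])
+         x2, y2 = min(a[2], b[2]), min(a[3], b[3])
+         inter = max(0, x2 - x1) * max(0, y2 - y1)
+         area = lambda t: (t[2] - t[0]) * (t[3] - t[1])
+         return inter / (area(a) + area(b) - inter + 1e-9)
+
+     def inplace_paste(image, boxes, donor_image, donor_boxes, max_iou=0.05):
+         """Copy donor objects into `image` at their original positions if they do not overlap."""
+         out, kept = image.copy(), [list(b) for b in boxes]
+         for db in donor_boxes:
+             if all(iou(db, b) < max_iou for b in kept):
+                 x1, y1, x2, y2 = map(int, db)
+                 out[y1:y2, x1:x2] = donor_image[y1:y2, x1:x2]   # same position as the original
+                 kept.append(list(db))
+         return out, kept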
+
+ comment: CVPR Workshop 2024 +
+
+
+
+
+ + ☆ Feature Corrective Transfer Learning: End-to-End Solutions to Object + Detection in Non-Ideal Visual Conditions CVPR + + +
+ A significant challenge in the field of object detection lies in the system's performance under non-ideal imaging conditions, such as rain, fog, low illumination, or raw Bayer images that lack ISP processing. Our study introduces "Feature Corrective Transfer Learning", a novel approach that leverages transfer learning and a bespoke loss function to facilitate the end-to-end detection of objects in these challenging scenarios without the need to convert non-ideal images into their RGB counterparts. In our methodology, we initially train a comprehensive model on a pristine RGB image dataset. Subsequently, non-ideal images are processed by comparing their feature maps against those from the initial ideal RGB model. This comparison employs the Extended Area Novel Structural Discrepancy Loss (EANSDL), a novel loss function designed to quantify similarities and integrate them into the detection loss. This approach refines the model's ability to perform object detection across varying conditions through direct feature map correction, encapsulating the essence of Feature Corrective Transfer Learning. Experimental validation on variants of the KITTI dataset demonstrates a significant improvement in mean Average Precision (mAP), resulting in a 3.8-8.1% relative enhancement in detection under non-ideal conditions compared to the baseline model, while remaining within 1.3% of the mAP@[0.5:0.95] achieved under ideal conditions by the standard Faster R-CNN algorithm.
+
+ comment: 10 pages, 3 figures, accepted by 2024 CVPR UG2 Workshop +
+
+
+
+
+ + ☆ Prompt-Guided Generation of Structured Chest X-Ray Report Using a + Pre-trained LLM + + +
+ Medical report generation automates radiology descriptions from images, +easing the burden on physicians and minimizing errors. However, current methods +lack structured outputs and physician interactivity for clear, clinically +relevant reports. Our method introduces a prompt-guided approach to generate +structured chest X-ray reports using a pre-trained large language model (LLM). +First, we identify anatomical regions in chest X-rays to generate focused +sentences that center on key visual elements, thereby establishing a structured +report foundation with anatomy-based sentences. We also convert the detected +anatomy into textual prompts conveying anatomical comprehension to the LLM. +Additionally, the clinical context prompts guide the LLM to emphasize +interactivity and clinical requirements. By integrating anatomy-focused +sentences and anatomy/clinical prompts, the pre-trained LLM can generate +structured chest X-ray reports tailored to prompted anatomical regions and +clinical contexts. We evaluate using language generation and clinical +effectiveness metrics, demonstrating strong performance. + +
+
+ comment: Accepted by IEEE Conference on Multimedia Expo 2024 +
+
+
+
+
+ + ☆ Exploring the Transferability of Visual Prompting for Multimodal Large + Language Models CVPR 2024 + + +
+ Although Multimodal Large Language Models (MLLMs) have demonstrated promising versatile capabilities, their performance is still inferior to specialized models on downstream tasks, which makes adaptation necessary to enhance their utility. However, fine-tuning methods require independent training for every model, leading to huge computation and memory overheads. In this paper, we propose a novel setting where we aim to improve the performance of diverse MLLMs with a group of shared parameters optimized for a downstream task. To achieve this, we propose Transferable Visual Prompting (TVP), a simple and effective approach to generate visual prompts that can transfer to different models and improve their performance on downstream tasks after being trained on only one model. We introduce two strategies to address the issue of cross-model feature corruption of existing visual prompting methods and enhance the transferability of the learned prompts: 1) Feature Consistency Alignment, which imposes constraints on the prompted feature changes to maintain task-agnostic knowledge, and 2) Task Semantics Enrichment, which encourages the prompted images to contain richer task-specific semantics with language guidance. We validate the effectiveness of TVP through extensive experiments with 6 modern MLLMs on a wide variety of tasks ranging from object recognition and counting to multimodal reasoning and hallucination correction.
+
+ comment: Accepted in CVPR 2024 as Poster (Highlight) +
+
+
+
+
+ + ☆ Kathakali Hand Gesture Recognition With Minimal Data + + +
+ The Indian classical dance-drama Kathakali has a set of hand gestures called Mudras, which form the fundamental units of all its dance moves and postures. Recognizing the depicted mudra becomes one of the first steps in its digital processing. The work treats the problem as a 24-class classification task and proposes a vector-similarity-based approach using pose estimation, eliminating the need for further training or fine-tuning. This approach overcomes the challenge of data scarcity that limits the application of AI in similar domains. The method attains 92% accuracy, which is similar to or better than other model-training-based works in the domain, with the added advantage that it can still work with as few as 1 or 5 samples, at slightly reduced performance. Working with images, videos, and even real-time streams is possible. The system can work with hand-cropped or full-body images alike. We have developed and made public a dataset for Kathakali Mudra Recognition as part of this work.
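+ A small sketch of the training-free, vector-similarity idea described above: represent each hand by its pose-estimation keypoints, normalise them, and assign the class of the most similar reference sample. The keypoint extractor, the wrist-at-index-0 convention, and the reference set are assumed inputs.
+
+     import numpy as np
+
+     def to_vector(keypoints: np.ndarray) -> np.ndarray:
+         """Flatten (K, 2) hand keypoints, centre on the first keypoint, scale to unit norm."""
+         v = (keypoints - keypoints[0]).astype(np.float64).ravel()
+         return v / (np.linalg.norm(v) + 1e-9)
+
+     def classify_mudra(query_kps, reference_kps, reference_labels):
+         """Nearest reference by cosine similarity; works with as few as 1-5 references per class."""
+         q = to_vector(query_kps)
+         sims = [float(q @ to_vector(r)) for r in reference_kps]
+         return reference_labels[int(np.argmax(sims))]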
+
+
+
+
+ + ☆ GhostNetV3: Exploring the Training Strategies for Compact Models + + +
+ Compact neural networks are specially designed for applications on edge devices with faster inference speed yet modest performance. However, the training strategies of compact models are currently borrowed from those of conventional models, which ignores the difference in model capacity and may thus impede the performance of compact models. In this paper, by systematically investigating the impact of different training ingredients, we introduce a strong training strategy for compact models. We find that the appropriate designs of re-parameterization and knowledge distillation are crucial for training high-performance compact models, while some commonly used data augmentations for training conventional models, such as Mixup and CutMix, lead to worse performance. Our experiments on the ImageNet-1K dataset demonstrate that our specialized training strategy for compact models is applicable to various architectures, including GhostNetV2, MobileNetV2 and ShuffleNetV2. Specifically, equipped with our strategy, GhostNetV3 1.3$\times$ achieves a top-1 accuracy of 79.1% with only 269M FLOPs and a latency of 14.46ms on mobile devices, surpassing its ordinarily trained counterpart by a large margin. Moreover, our observation can also be extended to object detection scenarios. PyTorch code and checkpoints can be found at https://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/ghostnetv3_pytorch.
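+ For context, a minimal sketch of one ingredient named above, logit-based knowledge distillation for training a compact student against a stronger teacher; the temperature and mixing weight are illustrative, and the paper's full recipe (re-parameterization, augmentation choices) is not shown.
+
+     import torch
+     import torch.nn.functional as F
+
+     def kd_loss(student_logits, teacher_logits, labels, T: float = 4.0, alpha: float = 0.5):
+         """Cross-entropy on labels plus temperature-scaled KL to the teacher's soft targets."""
+         ce = F.cross_entropy(student_logits, labels)
+         kl = F.kl_div(F.log_softmax(student_logits / T, dim=1),
+                       F.softmax(teacher_logits / T, dim=1),
+                       reduction="batchmean") * (T * T)
+         return alpha * ce + (1.0 - alpha) * kl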
+
+
+
+
+ + ☆ Pre-processing matters: A segment search method for WSI classification + + +
+ Pre-processing for whole slide images can affect classification performance both in the training and inference stages. Our study analyzes the impact of pre-processing parameters on inference and training across single- and multiple-domain datasets. However, searching for an optimal parameter set is time-consuming. To overcome this, we propose a novel Similarity-based Simulated Annealing approach for fast parameter tuning to enhance inference performance on single-domain data. Our method demonstrates significant performance improvements, raising accuracy from 0.512 to 0.847 on single-domain data. We further extend our insight into training performance on multi-domain data by employing a novel Bayesian optimization to search for optimal pre-processing parameters, resulting in a high AUC of 0.967. We highlight that better pre-processing for WSIs can contribute to further accuracy improvements in histology.
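+ A generic sketch of simulated-annealing parameter search of the kind described above (the paper's similarity-based variant adds a similarity term; here the score function, parameter ranges and cooling schedule are all placeholder assumptions).
+
+     import math, random
+
+     def simulated_annealing(score_fn, init_params, steps=200, t0=1.0, cooling=0.97):
+         """Maximise score_fn over a dict of numeric pre-processing parameters."""
+         current, best = dict(init_params), dict(init_params)
+         cur_score = best_score = score_fn(current)
+         t = t0
+         for _ in range(steps):
+             cand = {k: v * random.uniform(0.9, 1.1) for k, v in current.items()}  # small jitter
+             s = score_fn(cand)
+             if s > cur_score or random.random() < math.exp((s - cur_score) / max(t, 1e-9)):
+                 current, cur_score = cand, s          # accept better, or worse with some probability
+                 if s > best_score:
+                     best, best_score = dict(cand), s
+             t *= cooling                              # cool down
+         return best, best_score
+
+     # usage (validate() is a placeholder returning inference accuracy for a parameter set):
+     # best, acc = simulated_annealing(validate, {"blur_sigma": 1.0, "tissue_threshold": 0.5})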
+
+
+
+
+ + ☆ Deep Portrait Quality Assessment. A NTIRE 2024 Challenge Survey CVPR + + +
+ This paper reviews the NTIRE 2024 Portrait Quality Assessment Challenge, +highlighting the proposed solutions and results. This challenge aims to obtain +an efficient deep neural network capable of estimating the perceptual quality +of real portrait photos. The methods must generalize to diverse scenes and +diverse lighting conditions (indoor, outdoor, low-light), movement, blur, and +other challenging conditions. In the challenge, 140 participants registered, +and 35 submitted results during the challenge period. The performance of the +top 5 submissions is reviewed and provided here as a gauge for the current +state-of-the-art in Portrait Quality Assessment. + +
+
+ comment: CVPRW - NTIRE 2024 +
+
+
+
+
+ + ☆ Learning SO(3)-Invariant Semantic Correspondence via Local Shape + Transform CVPR 2024 + + +
+ Establishing accurate 3D correspondences between shapes stands as a pivotal +challenge with profound implications for computer vision and robotics. However, +existing self-supervised methods for this problem assume perfect input shape +alignment, restricting their real-world applicability. In this work, we +introduce a novel self-supervised Rotation-Invariant 3D correspondence learner +with Local Shape Transform, dubbed RIST, that learns to establish dense +correspondences between shapes even under challenging intra-class variations +and arbitrary orientations. Specifically, RIST learns to dynamically formulate +an SO(3)-invariant local shape transform for each point, which maps the +SO(3)-equivariant global shape descriptor of the input shape to a local shape +descriptor. These local shape descriptors are provided as inputs to our decoder +to facilitate point cloud self- and cross-reconstruction. Our proposed +self-supervised training pipeline encourages semantically corresponding points +from different shapes to be mapped to similar local shape descriptors, enabling +RIST to establish dense point-wise correspondences. RIST demonstrates +state-of-the-art performances on 3D part label transfer and semantic keypoint +transfer given arbitrarily rotated point cloud pairs, outperforming existing +methods by significant margins. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ HybriMap: Hybrid Clues Utilization for Effective Vectorized HD Map + Construction + + +
+ Constructing vectorized high-definition maps from surround-view cameras has +garnered significant attention in recent years. However, the commonly employed +multi-stage sequential workflow in prevailing approaches often leads to the +loss of early-stage information, particularly in perspective-view features. +Usually, such loss is observed as an instance missing or shape mismatching in +the final birds-eye-view predictions. To address this concern, we propose a +novel approach, namely \textbf{HybriMap}, which effectively exploits clues from +hybrid features to ensure the delivery of valuable information. Specifically, +we design the Dual Enhancement Module, to enable both explicit integration and +implicit modification under the guidance of hybrid features. Additionally, the +perspective keypoints are utilized as supervision, further directing the +feature enhancement process. Extensive experiments conducted on existing +benchmarks have demonstrated the state-of-the-art performance of our proposed +approach. + +
+
+
+
+
+ + ☆ Multi-target and multi-stage liver lesion segmentation and detection in + multi-phase computed tomography scans + + +
+ Multi-phase computed tomography (CT) scans use contrast agents to highlight different anatomical structures within the body to improve the probability of identifying and detecting anatomical structures of interest and abnormalities such as liver lesions. Yet, detecting these lesions remains a challenging task as these lesions vary significantly in their size, shape, texture, and contrast with respect to surrounding tissue. Therefore, radiologists need to have extensive experience to be able to identify and detect these lesions. Segmentation-based neural networks can assist radiologists with this task. Current state-of-the-art lesion segmentation networks use the encoder-decoder design paradigm based on the UNet architecture where the multi-phase CT scan volume is fed to the network as a multi-channel input. Although this approach utilizes information from all the phases and outperforms single-phase segmentation networks, we demonstrate that its performance is not optimal and can be further improved by incorporating the learning from models trained on each phase individually. Our approach comprises three stages. The first stage identifies the regions within the liver where there might be lesions at three different scales (4, 8, and 16 mm). The second stage includes the main segmentation model trained using all the phases as well as a segmentation model trained on each of the phases individually. The third stage uses the multi-phase CT volumes together with the predictions from each of the segmentation models to generate the final segmentation map. Overall, our approach improves relative liver lesion segmentation performance by 1.6% while reducing performance variability across subjects by 8% when compared to the current state-of-the-art models.
+
+
+
+
+ + ☆ REACTO: Reconstructing Articulated Objects from a Single Video + + +
+ In this paper, we address the challenge of reconstructing general articulated +3D objects from a single video. Existing works employing dynamic neural +radiance fields have advanced the modeling of articulated objects like humans +and animals from videos, but face challenges with piece-wise rigid general +articulated objects due to limitations in their deformation models. To tackle +this, we propose Quasi-Rigid Blend Skinning, a novel deformation model that +enhances the rigidity of each part while maintaining flexible deformation of +the joints. Our primary insight combines three distinct approaches: 1) an +enhanced bone rigging system for improved component modeling, 2) the use of +quasi-sparse skinning weights to boost part rigidity and reconstruction +fidelity, and 3) the application of geodesic point assignment for precise +motion and seamless deformation. Our method outperforms previous works in +producing higher-fidelity 3D reconstructions of general articulated objects, as +demonstrated on both real and synthetic datasets. Project page: +https://chaoyuesong.github.io/REACTO. + +
+
+
+
+
+ + ☆ GeoReF: Geometric Alignment Across Shape Variation for Category-level + Object Pose Refinement + + +
+ Object pose refinement is essential for robust object pose estimation. +Previous work has made significant progress towards instance-level object pose +refinement. Yet, category-level pose refinement is a more challenging problem +due to large shape variations within a category and the discrepancies between +the target object and the shape prior. To address these challenges, we +introduce a novel architecture for category-level object pose refinement. Our +approach integrates an HS-layer and learnable affine transformations, which +aims to enhance the extraction and alignment of geometric information. +Additionally, we introduce a cross-cloud transformation mechanism that +efficiently merges diverse data sources. Finally, we push the limits of our +model by incorporating the shape prior information for translation and size +error prediction. We conducted extensive experiments to demonstrate the +effectiveness of the proposed framework. Through extensive quantitative +experiments, we demonstrate significant improvement over the baseline method by +a large margin across all metrics. + +
+
+ comment: The IEEE/CVF Conference on Computer Vision and Pattern Recognition + 2024 +
+
+
+
+
+ ☆ Fact: Teaching MLLMs with Faithful, Concise and Transferable Rationales
+ The remarkable performance of Multimodal Large Language Models (MLLMs) has +unequivocally demonstrated their proficient understanding capabilities in +handling a wide array of visual tasks. Nevertheless, the opaque nature of their +black-box reasoning processes persists as an enigma, rendering them +uninterpretable and struggling with hallucination. Their ability to execute +intricate compositional reasoning tasks is also constrained, culminating in a +stagnation of learning progression for these models. In this work, we introduce +Fact, a novel paradigm designed to generate multimodal rationales that are +faithful, concise, and transferable for teaching MLLMs. This paradigm utilizes +verifiable visual programming to generate executable code guaranteeing +faithfulness and precision. Subsequently, through a series of operations +including pruning, merging, and bridging, the rationale enhances its +conciseness. Furthermore, we filter rationales that can be transferred to +end-to-end paradigms from programming paradigms to guarantee transferability. +Empirical evidence from experiments demonstrates the superiority of our method +across models of varying parameter sizes, significantly enhancing their +compositional reasoning and generalization ability. Our approach also reduces +hallucinations owing to its high correlation between images and text. + +
+
+
+
+
+ + ☆ D-Aug: Enhancing Data Augmentation for Dynamic LiDAR Scenes + + +
+ Creating large LiDAR datasets with pixel-level labeling poses significant +challenges. While numerous data augmentation methods have been developed to +reduce the reliance on manual labeling, these methods predominantly focus on +static scenes and they overlook the importance of data augmentation for dynamic +scenes, which is critical for autonomous driving. To address this issue, we +propose D-Aug, a LiDAR data augmentation method tailored for augmenting dynamic +scenes. D-Aug extracts objects and inserts them into dynamic scenes, +considering the continuity of these objects across consecutive frames. For +seamless insertion into dynamic scenes, we propose a reference-guided method +that involves dynamic collision detection and rotation alignment. Additionally, +we present a pixel-level road identification strategy to efficiently determine +suitable insertion positions. We validated our method using the nuScenes +dataset with various 3D detection and tracking methods. Comparative experiments +demonstrate the superiority of D-Aug. + +
+
+ comment: 4 pages, 4 figures
+
+
+
+
+ + ☆ TiNO-Edit: Timestep and Noise Optimization for Robust Diffusion-Based + Image Editing CVPR + + +
+ Despite many attempts to leverage pre-trained text-to-image models (T2I) like +Stable Diffusion (SD) for controllable image editing, producing good +predictable results remains a challenge. Previous approaches have focused on +either fine-tuning pre-trained T2I models on specific datasets to generate +certain kinds of images (e.g., with a specific object or person), or on +optimizing the weights, text prompts, and/or learning features for each input +image in an attempt to coax the image generator to produce the desired result. +However, these approaches all have shortcomings and fail to produce good +results in a predictable and controllable manner. To address this problem, we +present TiNO-Edit, an SD-based method that focuses on optimizing the noise +patterns and diffusion timesteps during editing, something previously +unexplored in the literature. With this simple change, we are able to generate +results that both better align with the original images and reflect the desired +result. Furthermore, we propose a set of new loss functions that operate in the +latent domain of SD, greatly speeding up the optimization when compared to +prior approaches, which operate in the pixel domain. Our method can be easily +applied to variations of SD including Textual Inversion and DreamBooth that +encode new concepts and incorporate them into the edited results. We present a +host of image-editing capabilities enabled by our approach. Our code is +publicly available at https://github.com/SherryXTChen/TiNO-Edit. + +
+
+ comment: Conference on Computer Vision and Pattern Recognition (CVPR) 2024 +
+
+
+
+
+ + ☆ MHLR: Moving Haar Learning Rate Scheduler for Large-scale Face + Recognition Training with One GPU + + +
+ Face recognition (FR) has seen significant advancements due to the +utilization of large-scale datasets. Training deep FR models on large-scale +datasets with multiple GPUs is now a common practice. In fact, computing power +has evolved into a foundational and indispensable resource in the area of deep +learning. It is nearly impossible to train a deep FR model without holding +adequate hardware resources. Recognizing this challenge, some FR approaches +have started exploring ways to reduce the time complexity of the +fully-connected layer in FR models. Unlike other approaches, this paper +introduces a simple yet highly effective approach, Moving Haar Learning Rate +(MHLR) scheduler, for scheduling the learning rate promptly and accurately in +the training process. MHLR supports large-scale FR training with only one GPU, +which is able to accelerate the model to 1/4 of its original training time +without sacrificing more than 1% accuracy. More specifically, MHLR only needs +$30$ hours to train the model ResNet100 on the dataset WebFace12M containing +more than 12M face images with 0.6M identities. Extensive experiments validate +the efficiency and effectiveness of MHLR. + +
+
+
+
+
+ + ☆ CorrNet+: Sign Language Recognition and Translation via Spatial-Temporal + Correlation + + +
+ In sign language, the conveyance of human body trajectories predominantly relies upon the coordinated movements of hands and facial expressions across successive frames. Despite recent advancements in sign language understanding methods, they often focus solely on individual frames, inevitably overlooking the inter-frame correlations that are essential for effectively modeling human body trajectories. To address this limitation, this paper introduces a spatial-temporal correlation network, denoted as CorrNet+, which explicitly identifies body trajectories across multiple frames. Specifically, CorrNet+ employs a correlation module and an identification module to build human body trajectories. Afterwards, a temporal attention module is applied to adaptively evaluate the contributions of different frames. The resultant features offer a holistic perspective on human body movements, facilitating a deeper understanding of sign language. As a unified model, CorrNet+ achieves new state-of-the-art performance on two extensive sign language understanding tasks, including continuous sign language recognition (CSLR) and sign language translation (SLT). In particular, CorrNet+ surpasses previous methods equipped with resource-intensive pose-estimation networks or pre-extracted heatmaps for hand and facial feature extraction. Compared with CorrNet, CorrNet+ achieves a significant performance boost across all benchmarks while halving the computational overhead. A comprehensive comparison with previous spatial-temporal reasoning methods verifies the superiority of CorrNet+. Code is available at https://github.com/hulianyuyy/CorrNet_Plus.
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2303.03202 +
+
+
+
+
+ + ☆ LADDER: An Efficient Framework for Video Frame Interpolation + + +
+ Video Frame Interpolation (VFI) is a crucial technique in various applications such as slow-motion generation, frame rate conversion, and video frame restoration. This paper introduces an efficient video frame interpolation framework that aims to strike a favorable balance between efficiency and quality. Our framework follows a general paradigm consisting of a flow estimator and a refinement module, while incorporating carefully designed components. First of all, we adopt depth-wise convolution with large kernels in the flow estimator, which simultaneously reduces the parameters and enhances the receptive field for encoding rich context and handling complex motion. Secondly, diverging from a common design for the refinement module with a UNet structure (encoder-decoder structure), which we find redundant, our decoder-only refinement module directly enhances the result from coarse to fine features, offering a more efficient process. In addition, to address the challenge of handling high-definition frames, we also introduce an innovative HD-aware augmentation strategy during training, leading to consistent enhancement on HD images. Extensive experiments are conducted on diverse datasets: Vimeo90K, UCF101, Xiph and SNU-FILM. The results demonstrate that our approach achieves state-of-the-art performance with clear improvements while requiring far fewer FLOPs and parameters, striking a better balance between efficiency and quality.
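+ A brief sketch of the flow-estimator building block described above: a depth-wise convolution with a large kernel (cheap in parameters, large receptive field) followed by a point-wise convolution. The kernel size, channel count and residual connection are assumptions, not the paper's exact layer.
+
+     import torch
+     import torch.nn as nn
+
+     class LargeKernelDWBlock(nn.Module):
+         def __init__(self, channels: int = 64, kernel_size: int = 7):
+             super().__init__()
+             self.dw = nn.Conv2d(channels, channels, kernel_size,
+                                 padding=kernel_size // 2, groups=channels)  # depth-wise
+             self.pw = nn.Conv2d(channels, channels, 1)                      # point-wise channel mix
+             self.act = nn.GELU()
+
+         def forward(self, x):
+             return x + self.pw(self.act(self.dw(x)))   # residual for stable training
+
+     print(LargeKernelDWBlock()(torch.randn(1, 64, 32, 32)).shape)  # torch.Size([1, 64, 32, 32])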
+
+
+
+
+ + ☆ Object Remover Performance Evaluation Methods using Class-wise Object + Removal Images + + +
+ Object removal refers to the process of erasing designated objects from an image while preserving the overall appearance, and it is one area where image inpainting is widely used in real-world applications. The performance of an object remover is quantitatively evaluated by measuring the quality of object removal results, similar to how the performance of an image inpainter is gauged. Current works reporting quantitative performance evaluations utilize original images as references. In this letter, to show that the current evaluation methods cannot properly evaluate the performance of an object remover, we create a dataset with object removal ground truth and compare evaluations made by the current methods using original images to those made using object removal ground truth images. The disparities between the two evaluation sets confirm that the current methods are not suitable for measuring the performance of an object remover. Additionally, we propose new evaluation methods tailored to gauge the performance of an object remover. The proposed methods evaluate the performance through class-wise object removal results and utilize images without the target class objects as a comparison set. We confirm that the proposed methods can make judgments consistent with human evaluators on the COCO dataset, and that they can produce measurements aligned with those using object removal ground truth on the self-acquired dataset.
+
+
+
+
+ + ☆ Synthesizing Realistic Data for Table Recognition ICDAR 2024 + + +
+ To overcome the limitations and challenges of current automatic table data +annotation methods and random table data synthesis approaches, we propose a +novel method for synthesizing annotation data specifically designed for table +recognition. This method utilizes the structure and content of existing complex +tables, facilitating the efficient creation of tables that closely replicate +the authentic styles found in the target domain. By leveraging the actual +structure and content of tables from Chinese financial announcements, we have +developed the first extensive table annotation dataset in this domain. We used +this dataset to train several recent deep learning-based end-to-end table +recognition models. Additionally, we have established the inaugural benchmark +for real-world complex tables in the Chinese financial announcement domain, +using it to assess the performance of models trained on our synthetic data, +thereby effectively validating our method's practicality and effectiveness. +Furthermore, we applied our synthesis method to augment the FinTabNet dataset, +extracted from English financial announcements, by increasing the proportion of +tables with multiple spanning cells to introduce greater complexity. Our +experiments show that models trained on this augmented dataset achieve +comprehensive improvements in performance, especially in the recognition of +tables with multiple spanning cells. + +
+
+ comment: ICDAR 2024 +
+
+
+
+
+ + ☆ LAPTOP-Diff: Layer Pruning and Normalized Distillation for Compressing + Diffusion Models + + +
+ In the era of AIGC, the demand for low-budget or even on-device applications of diffusion models has emerged. In terms of compressing the Stable Diffusion models (SDMs), several approaches have been proposed, and most of them leverage handcrafted layer removal methods to obtain smaller U-Nets, along with knowledge distillation to recover the network performance. However, such a handcrafted manner of layer removal is inefficient and lacks scalability and generalization, and the feature distillation employed in the retraining phase faces an imbalance issue in which a few numerically large feature loss terms dominate the others throughout the retraining process. To this end, we propose layer pruning and normalized distillation for compressing diffusion models (LAPTOP-Diff). We 1) introduce a layer pruning method to compress the SDM's U-Net automatically and propose an effective one-shot pruning criterion whose one-shot performance is guaranteed by its good additivity property, surpassing other layer pruning and handcrafted layer removal methods, and 2) propose normalized feature distillation for retraining, which alleviates the imbalance issue. Using the proposed LAPTOP-Diff, we compressed the U-Nets of SDXL and SDM-v1.5 for the most advanced performance, achieving a minimal 4.0% decline in PickScore at a pruning ratio of 50%, while the comparative methods' minimal PickScore decline is 8.2%. We will release our code.
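+ A hedged sketch of what a "normalized" feature distillation term could look like: each per-layer feature loss is divided by a running estimate of its own magnitude so that no single numerically large term dominates retraining. The specific normaliser (a detached copy of the term itself) is my assumption, not necessarily the paper's formulation.
+
+     import torch
+
+     def normalized_feature_distillation(student_feats, teacher_feats, eps: float = 1e-6):
+         """Sum of per-layer MSE terms, each rescaled to roughly unit magnitude."""
+         total = 0.0
+         for fs, ft in zip(student_feats, teacher_feats):
+             term = torch.mean((fs - ft) ** 2)
+             total = total + term / (term.detach() + eps)  # detached normaliser carries no gradient
+         return total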
+
+
+
+
+ + ☆ Sky-GVIO: an enhanced GNSS/INS/Vision navigation with FCN-based + sky-segmentation in urban canyon + + +
+ Accurate, continuous, and reliable positioning is a critical component of achieving autonomous driving. However, in complex urban canyon environments, the vulnerability of a stand-alone sensor and non-line-of-sight (NLOS) effects caused by high buildings, trees, and elevated structures seriously affect positioning results. To address these challenges, a sky-view image segmentation algorithm based on a Fully Convolutional Network (FCN) is proposed for GNSS NLOS detection. Building upon this, a novel NLOS detection and mitigation algorithm (named S-NDM) is extended to a tightly coupled Global Navigation Satellite System (GNSS), Inertial Measurement Unit (IMU), and visual feature system called Sky-GVIO, with the aim of achieving continuous and accurate positioning in urban canyon environments. Furthermore, the system harmonizes Single Point Positioning (SPP) with Real-Time Kinematic (RTK) methodologies to bolster its operational versatility and resilience. In urban canyon environments, the positioning performance of the S-NDM algorithm proposed in this paper is evaluated under different tightly coupled SPP-related and RTK-related models. The results show that the Sky-GVIO system achieves meter-level accuracy in SPP mode and sub-decimeter precision with RTK, surpassing the performance of GNSS/INS/Vision frameworks devoid of S-NDM. Additionally, the sky-view image dataset, inclusive of training and evaluation subsets, has been made publicly accessible for scholarly exploration at https://github.com/whuwangjr/sky-view-images.
+
+
+
+
+ + ☆ Rethinking 3D Dense Caption and Visual Grounding in A Unified Framework + through Prompt-based Localization + + +
+ 3D Visual Grounding (3DVG) and 3D Dense Captioning (3DDC) are two crucial +tasks in various 3D applications, which require both shared and complementary +information in localization and visual-language relationships. Therefore, +existing approaches adopt the two-stage "detect-then-describe/discriminate" +pipeline, which relies heavily on the performance of the detector, resulting in +suboptimal performance. Inspired by DETR, we propose a unified framework, +3DGCTR, to jointly solve these two distinct but closely related tasks in an +end-to-end fashion. The key idea is to reconsider the prompt-based localization +ability of the 3DVG model. In this way, the 3DVG model with a well-designed +prompt as input can assist the 3DDC task by extracting localization information +from the prompt. In terms of implementation, we integrate a Lightweight Caption +Head into the existing 3DVG network with a Caption Text Prompt as a connection, +effectively harnessing the existing 3DVG model's inherent localization +capacity, thereby boosting 3DDC capability. This integration facilitates +simultaneous multi-task training on both tasks, mutually enhancing their +performance. Extensive experimental results demonstrate the effectiveness of +this approach. Specifically, on the ScanRefer dataset, 3DGCTR surpasses the +state-of-the-art 3DDC method by 4.3% in CIDEr@0.5IoU in MLE training and +improves upon the SOTA 3DVG method by 3.16% in Acc@0.25IoU. + +
+
+
+
+
+ + ☆ Multilateral Temporal-view Pyramid Transformer for Video Inpainting + Detection + + +
+ The task of video inpainting detection is to expose the pixel-level inpainted regions within a video sequence. Existing methods usually focus on leveraging spatial and temporal inconsistencies. However, these methods typically employ fixed operations to combine spatial and temporal clues, limiting their applicability in different scenarios. In this paper, we introduce a novel Multilateral Temporal-view Pyramid Transformer (MumPy) that flexibly combines spatial-temporal clues. Our method utilizes a newly designed multilateral temporal-view encoder to extract various combinations of spatial-temporal clues and introduces a deformable window-based temporal-view interaction module to enhance the diversity of these combinations. Subsequently, we develop a multi-pyramid decoder to aggregate the various types of features and generate detection maps. By adjusting the contribution strength of spatial and temporal clues, our method can effectively identify inpainted regions. We validate our method on existing datasets and also introduce a new challenging and large-scale Video Inpainting dataset based on the YouTube-VOS dataset, which employs several more recent inpainting methods. The results demonstrate the superiority of our method in both in-domain and cross-domain evaluation scenarios.
+
+
+
+
+ + ☆ Supervised Contrastive Vision Transformer for Breast Histopathological + Image Classification + + +
+ Invasive ductal carcinoma (IDC) is the most prevalent form of breast cancer. +Breast tissue histopathological examination is critical in diagnosing and +classifying breast cancer. Although existing methods have shown promising +results, there is still room for improvement in the classification accuracy and +generalization of IDC using histopathology images. We present a novel approach, +Supervised Contrastive Vision Transformer (SupCon-ViT), for improving the +classification of invasive ductal carcinoma in terms of accuracy and +generalization by leveraging the inherent strengths and advantages of both +transfer learning, i.e., pre-trained vision transformer, and supervised +contrastive learning. Our results on a benchmark breast cancer dataset +demonstrate that SupCon-Vit achieves state-of-the-art performance in IDC +classification, with an F1-score of 0.8188, precision of 0.7692, and +specificity of 0.8971, outperforming existing methods. In addition, the +proposed model demonstrates resilience in scenarios with minimal labeled data, +making it highly efficient in real-world clinical settings where labelled data +is limited. Our findings suggest that supervised contrastive learning in +conjunction with pre-trained vision transformers appears to be a viable +strategy for an accurate classification of IDC, thus paving the way for a more +efficient and reliable diagnosis of breast cancer through histopathological +image analysis. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ WPS-Dataset: A benchmark for wood plate segmentation in bark removal + processing + + +
+ Using deep learning methods is a promising approach to improving bark removal +efficiency and enhancing the quality of wood products. However, the lack of +publicly available datasets for wood plate segmentation in bark removal +processing poses challenges for researchers in this field. To address this +issue, a benchmark for wood plate segmentation in bark removal processing named +WPS-dataset is proposed in this study, which consists of 4863 images. We +designed an image acquisition device and assembled it on a bark removal +equipment to capture images in real industrial settings. We evaluated the +WPS-dataset using six typical segmentation models. The models effectively learn +and understand the WPS-dataset characteristics during training, resulting in +high performance and accuracy in wood plate segmentation tasks. We believe that +our dataset can lay a solid foundation for future research in bark removal +processing and contribute to advancements in this field. + +
+
+
+
+
+ + ☆ Lightweight Unsupervised Federated Learning with Pretrained Vision + Language Model + + +
+ Federated learning aims to tackle the ``isolated data island" problem, where +it trains a collective model from physically isolated clients while +safeguarding the privacy of users' data. However, supervised federated learning +necessitates that each client labels their data for training, which can be both +time-consuming and resource-intensive, and may even be impractical for edge +devices. Moreover, the training and transmission of deep models present +challenges to the computation and communication capabilities of the clients. To +address these two inherent challenges in supervised federated learning, we +propose a novel lightweight unsupervised federated learning approach that +leverages unlabeled data on each client to perform lightweight model training +and communication by harnessing pretrained vision-language models, such as +CLIP. By capitalizing on the zero-shot prediction capability and the +well-trained image encoder of the pre-trained CLIP model, we have carefully +crafted an efficient and resilient self-training approach. This method refines +the initial zero-shot predicted pseudo-labels of unlabeled instances through +the sole training of a linear classifier on top of the fixed image encoder. +Additionally, to address data heterogeneity within each client, we propose a +class-balanced text feature sampling strategy for generating synthetic +instances in the feature space to support local training. Experiments are +conducted on multiple benchmark datasets. The experimental results demonstrate +that our proposed method greatly enhances model performance in comparison to +CLIP's zero-shot predictions and even outperforms supervised federated learning +benchmark methods given limited computational and communication overhead. + +
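+ A compact sketch of the local training step described above, assuming image features from a frozen pretrained image encoder and text features for the class prompts are already extracted and L2-normalised; only a linear classifier is trained on confident zero-shot pseudo-labels. The confidence threshold, learning rate and epoch count are illustrative.
+
+     import torch
+     import torch.nn.functional as F
+
+     def local_self_training(img_feats, txt_feats, epochs=5, conf_thresh=0.8, lr=1e-2):
+         zero_shot = (100.0 * img_feats @ txt_feats.t()).softmax(dim=1)  # zero-shot class scores
+         conf, pseudo = zero_shot.max(dim=1)
+         keep = conf > conf_thresh                                       # keep confident pseudo-labels only
+         clf = torch.nn.Linear(img_feats.shape[1], txt_feats.shape[0])
+         opt = torch.optim.SGD(clf.parameters(), lr=lr)
+         for _ in range(epochs):
+             opt.zero_grad()
+             loss = F.cross_entropy(clf(img_feats[keep]), pseudo[keep])
+             loss.backward()
+             opt.step()
+         return clf   # only these lightweight weights would need to be communicated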
+
+
+
+
+ + ☆ TaCOS: Task-Specific Camera Optimization with Simulation + + +
+ The performance of robots in their applications heavily depends on the +quality of sensory input. However, designing sensor payloads and their +parameters for specific robotic tasks is an expensive process that requires +well-established sensor knowledge and extensive experiments with physical +hardware. With cameras playing a pivotal role in robotic perception, we +introduce a novel end-to-end optimization approach for co-designing a camera +with specific robotic tasks by combining derivative-free and gradient-based +optimizers. The proposed method leverages recent computer graphics techniques +and physical camera characteristics to prototype the camera in software, +simulate operational environments and tasks for robots, and optimize the camera +design based on the desired tasks in a cost-effective way. We validate the +accuracy of our camera simulation by comparing it with physical cameras, and +demonstrate the design of cameras with stronger performance than common +off-the-shelf alternatives. Our approach supports the optimization of both +continuous and discrete camera parameters, manufacturing constraints, and can +be generalized to a broad range of camera design scenarios including multiple +cameras and unconventional cameras. This work advances the fully automated +design of cameras for specific robotics tasks. + +
+
+
+
+
+ + ☆ Spatial-Aware Image Retrieval: A Hyperdimensional Computing Approach for + Efficient Similarity Hashing + + +
+ In the face of burgeoning image data, efficiently retrieving similar images +poses a formidable challenge. Past research has focused on refining hash +functions to distill images into compact indicators of resemblance. Initial +attempts used shallow models, evolving to attention mechanism-based +architectures from Convolutional Neural Networks (CNNs) to advanced models. +Recognizing limitations in gradient-based models for spatial information +embedding, we propose an innovative image hashing method, NeuroHash leveraging +Hyperdimensional Computing (HDC). HDC symbolically encodes spatial information +into high-dimensional vectors, reshaping image representation. Our approach +combines pre-trained large vision models with HDC operations, enabling +spatially encoded feature representations. Hashing with locality-sensitive +hashing (LSH) ensures swift and efficient image retrieval. Notably, our +framework allows dynamic hash manipulation for conditional image retrieval. Our +work introduces a transformative image hashing framework enabling spatial-aware +conditional retrieval. By seamlessly combining DNN-based neural and HDC-based +symbolic models, our methodology breaks from traditional training, offering +flexible and conditional image retrieval. Performance evaluations signify a +paradigm shift in image-hashing methodologies, demonstrating enhanced retrieval +accuracy. + +
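+ A toy sketch of the two building blocks named above: binding local feature hypervectors with position hypervectors and bundling them into one image hypervector, then hashing it with random-hyperplane LSH. The dimensionality, random hypervectors and 32-bit hash length are illustrative.
+
+     import numpy as np
+
+     rng = np.random.default_rng(0)
+     D = 10_000                                       # hypervector dimensionality
+
+     def random_hv():
+         return rng.choice([-1.0, 1.0], size=D)       # bipolar hypervector
+
+     def encode_image(patch_feat_hvs, position_hvs):
+         """Bind (element-wise multiply) each patch with its position, then bundle (sum + sign)."""
+         bound = [f * p for f, p in zip(patch_feat_hvs, position_hvs)]
+         return np.sign(np.sum(bound, axis=0))
+
+     def lsh_hash(hv, hyperplanes):
+         """Random-hyperplane LSH: one bit per hyperplane."""
+         return tuple((hyperplanes @ hv > 0).astype(int))
+
+     planes = rng.normal(size=(32, D))                # 32-bit hash
+     img_hv = encode_image([random_hv() for _ in range(9)], [random_hv() for _ in range(9)])
+     print(lsh_hash(img_hv, planes))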
+
+
+
+
+ + ☆ MaeFuse: Transferring Omni Features with Pretrained Masked Autoencoders + for Infrared and Visible Image Fusion via Guided Training + + +
+ In this research, we introduce MaeFuse, a novel autoencoder model designed for infrared and visible image fusion (IVIF). The existing approaches for image fusion often rely on training combined with downstream tasks to obtain high-level visual information, which is effective in emphasizing target objects and delivering impressive results in visual quality and task-specific applications. MaeFuse, however, deviates from the norm. Instead of being driven by downstream tasks, our model utilizes a pretrained encoder from Masked Autoencoders (MAE), which facilitates omni feature extraction for low-level reconstruction and high-level vision tasks, to obtain perception-friendly features at a low cost. In order to eliminate the domain gap between different modal features and the block effect caused by the MAE encoder, we further develop a guided training strategy. This strategy is meticulously crafted to ensure that the fusion layer seamlessly adjusts to the feature space of the encoder, gradually enhancing the fusion effect. It facilitates the comprehensive integration of feature vectors from both infrared and visible modalities, preserving the rich details inherent in each. MaeFuse not only introduces a novel perspective in the realm of fusion techniques but also stands out with impressive performance across various public datasets.
+
+
+
+
+ + ☆ AKGNet: Attribute Knowledge-Guided Unsupervised Lung-Infected Area + Segmentation + + +
+ Lung-infected area segmentation is crucial for assessing the severity of lung +diseases. However, existing image-text multi-modal methods typically rely on +labour-intensive annotations for model training, posing challenges regarding +time and expertise. To address this issue, we propose a novel attribute +knowledge-guided framework for unsupervised lung-infected area segmentation +(AKGNet), which achieves segmentation solely based on image-text data without +any mask annotation. AKGNet facilitates text attribute knowledge learning, +attribute-image cross-attention fusion, and high-confidence-based pseudo-label +exploration simultaneously. It can learn statistical information and capture +spatial correlations between image and text attributes in the embedding space, +iteratively refining the mask to enhance segmentation. Specifically, we +introduce a text attribute knowledge learning module by extracting attribute +knowledge and incorporating it into feature representations, enabling the model +to learn statistical information and adapt to different attributes. Moreover, +we devise an attribute-image cross-attention module by calculating the +correlation between attributes and images in the embedding space to capture +spatial dependency information, thus selectively focusing on relevant regions +while filtering irrelevant areas. Finally, a self-training mask improvement +process is employed by generating pseudo-labels using high-confidence +predictions to iteratively enhance the mask and segmentation. Experimental +results on a benchmark medical image dataset demonstrate the superior +performance of our method compared to state-of-the-art segmentation techniques +in unsupervised scenarios. + +
+
+
+
+
+ + ☆ InfoMatch: Entropy Neural Estimation for Semi-Supervised Image + Classification IJCAI 2024 + + +
+ Semi-supervised image classification, leveraging pseudo supervision and +consistency regularization, has demonstrated remarkable success. However, the +ongoing challenge lies in fully exploiting the potential of unlabeled data. To +address this, we employ information entropy neural estimation to harness the +potential of unlabeled samples. Inspired by contrastive learning, the entropy +is estimated by maximizing a lower bound on mutual information across different +augmented views. Moreover, we theoretically analyze that the information +entropy of the posterior of an image classifier is approximated by maximizing +the likelihood function of the softmax predictions. Guided by these insights, +we optimize our model from both perspectives to ensure that the predicted +probability distribution closely aligns with the ground-truth distribution. +Given the theoretical connection to information entropy, we name our method +\textit{InfoMatch}. Through extensive experiments, we show its superior +performance. + +
+
+ comment: IJCAI 2024 +
+
+
+
+
+ + ☆ How to deal with glare for improved perception of Autonomous Vehicles + + +
+ Vision sensors are versatile and can capture a wide range of visual cues, +such as color, texture, shape, and depth. This versatility, along with the +relatively inexpensive availability of machine vision cameras, played an +important role in adopting vision-based environment perception systems in +autonomous vehicles (AVs). However, vision-based perception systems can be +easily affected by glare in the presence of a bright source of light, such as +the sun or the headlights of the oncoming vehicle at night or simply by light +reflecting off snow or ice-covered surfaces; scenarios encountered frequently +during driving. In this paper, we investigate various glare reduction +techniques, including the proposed saturated pixel-aware glare reduction +technique for improved performance of the computer vision (CV) tasks employed +by the perception layer of AVs. We evaluate these glare reduction methods based +on various performance metrics of the CV algorithms used by the perception +layer. Specifically, we considered object detection, object recognition, object +tracking, depth estimation, and lane detection which are crucial for autonomous +driving. The experimental findings validate the efficacy of the proposed glare +reduction approach, showcasing enhanced performance across diverse perception +tasks and remarkable resilience against varying levels of glare. + +
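+ A rough sketch of a saturated-pixel-aware glare reduction step (not the paper's exact pipeline): detect near-saturated pixels, dilate them into a glare mask, and inpaint the masked region before running downstream perception. The threshold, dilation size and inpainting choice are assumptions.
+
+     import cv2
+     import numpy as np
+
+     def reduce_glare(bgr: np.ndarray, sat_thresh: int = 250) -> np.ndarray:
+         gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
+         mask = (gray >= sat_thresh).astype(np.uint8) * 255      # saturated pixels
+         mask = cv2.dilate(mask, np.ones((7, 7), np.uint8))      # cover the glare halo
+         return cv2.inpaint(bgr, mask, 5, cv2.INPAINT_TELEA)     # fill glare regions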
+
+ comment: 14 pages, 9 figures, Accepted IEEE TIV +
+
+
+
+
+ + ☆ FairSSD: Understanding Bias in Synthetic Speech Detectors CVPR 2024 + + +
+ Methods that can generate synthetic speech which is perceptually +indistinguishable from speech recorded by a human speaker, are easily +available. Several incidents report misuse of synthetic speech generated from +these methods to commit fraud. To counter such misuse, many methods have been +proposed to detect synthetic speech. Some of these detectors are more +interpretable, can generalize to detect synthetic speech in the wild and are +robust to noise. However, limited work has been done on understanding bias in +these detectors. In this work, we examine bias in existing synthetic speech +detectors to determine if they will unfairly target a particular gender, age +and accent group. We also inspect whether these detectors will have a higher +misclassification rate for bona fide speech from speech-impaired speakers w.r.t +fluent speakers. Extensive experiments on 6 existing synthetic speech detectors +using more than 0.9 million speech signals demonstrate that most detectors are +gender, age and accent biased, and future work is needed to ensure fairness. To +support future research, we release our evaluation dataset, models used in our +study and source code at https://gitlab.com/viper-purdue/fairssd. + +
+
+ comment: Accepted at CVPR 2024 (WMF) +
+
+
+
+
+ + ☆ Pixel-Wise Symbol Spotting via Progressive Points Location for Parsing + CAD Images + + +
+ Parsing Computer-Aided Design (CAD) drawings is a fundamental step for CAD
+revision, semantic-based management, and the generation of 3D prototypes in
+both the architecture and engineering industries. Labeling symbols from a CAD
+drawing is a notoriously challenging task in practice. In this work, we
+propose to label and spot symbols from CAD images that are converted from CAD
+drawings. The advantage of spotting symbols from CAD images lies in the lower
+expertise required of labelers and the lower annotation cost. However,
+spotting symbols pixel-wise in CAD images remains difficult. We propose a
+pixel-wise point location method based on Progressive Gaussian Kernels (PGK)
+to balance training efficiency and location accuracy. Besides, we introduce a
+local offset to the heatmap-based point location method. Based on the detected
+keypoints, we propose a symbol grouping method to redraw the rectangle symbols
+in CAD images. We have released a dataset containing CAD images of equipment
+rooms from telecommunication industrial CAD drawings. Extensive experiments on
+this real-world dataset show that the proposed method has good generalization
+ability.
+

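A minimal sketch of the general idea behind progressively scheduled Gaussian keypoint heatmaps follows; the schedule, sizes, and function names are illustrative assumptions rather than the paper's PGK formulation.

```python
# Keypoint heatmaps with a Gaussian kernel whose width shrinks over training:
# coarse, easy targets early on, sharp and precise targets later.
import numpy as np

def gaussian_heatmap(height, width, center_xy, sigma):
    xs = np.arange(width)[None, :]
    ys = np.arange(height)[:, None]
    cx, cy = center_xy
    return np.exp(-((xs - cx) ** 2 + (ys - cy) ** 2) / (2.0 * sigma ** 2))

def sigma_schedule(epoch, sigma_start=8.0, sigma_end=2.0, total_epochs=100):
    # Linearly tighten the kernel as training progresses (assumed schedule).
    t = min(epoch / total_epochs, 1.0)
    return sigma_start + t * (sigma_end - sigma_start)

for epoch in (0, 50, 100):
    hm = gaussian_heatmap(64, 64, center_xy=(20, 30), sigma=sigma_schedule(epoch))
    print(epoch, round(float(hm.max()), 3), hm.shape)
```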
+
+ comment: 10 pages, 10 figures,6 tables +
+
+
+
+
+ + ☆ Hyper Evidential Deep Learning to Quantify Composite Classification + Uncertainty ICLR 2024 + + +
+ Deep neural networks (DNNs) have been shown to perform well on exclusive, +multi-class classification tasks. However, when different classes have similar +visual features, it becomes challenging for human annotators to differentiate +them. This scenario necessitates the use of composite class labels. In this +paper, we propose a novel framework called Hyper-Evidential Neural Network +(HENN) that explicitly models predictive uncertainty due to composite class +labels in training data in the context of the belief theory called Subjective +Logic (SL). By placing a grouped Dirichlet distribution on the class +probabilities, we treat predictions of a neural network as parameters of +hyper-subjective opinions and learn the network that collects both single and +composite evidence leading to these hyper-opinions by a deterministic DNN from +data. We introduce a new uncertainty type called vagueness originally designed +for hyper-opinions in SL to quantify composite classification uncertainty for +DNNs. Our results demonstrate that HENN outperforms its state-of-the-art +counterparts based on four image datasets. The code and datasets are available +at: https://github.com/Hugo101/HyperEvidentialNN. + +
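For orientation, the sketch below shows the standard subjective-logic bookkeeping that evidential networks build on, mapping non-negative evidence to Dirichlet parameters, beliefs, and an uncertainty mass; HENN's grouped Dirichlet and its vagueness measure go beyond this and are not reproduced here.

```python
# Standard evidential-learning quantities from per-class evidence (not HENN's
# grouped Dirichlet): belief masses and uncertainty sum to one.
import torch

def dirichlet_opinion(evidence):
    """evidence: (B, K) non-negative evidence per class."""
    alpha = evidence + 1.0                                 # Dirichlet parameters
    strength = alpha.sum(dim=1, keepdim=True)              # S = sum(alpha)
    belief = evidence / strength                           # per-class belief mass
    uncertainty = evidence.size(1) / strength.squeeze(1)   # u = K / S
    prob = alpha / strength                                # expected probabilities
    return belief, uncertainty, prob

logits = torch.randn(4, 10)
evidence = torch.relu(logits)                              # a common evidence function
b, u, p = dirichlet_opinion(evidence)
print(b.shape, u.shape, p.sum(dim=1))                      # probabilities sum to 1
```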
+
+ comment: In Proceedings of The Twelfth International Conference on Learning + Representations, ICLR 2024 +
+
+
+
+
+ + ☆ Leveraging 3D LiDAR Sensors to Enable Enhanced Urban Safety and Public + Health: Pedestrian Monitoring and Abnormal Activity Detection + + +
+ The integration of Light Detection and Ranging (LiDAR) and Internet of Things
+(IoT) technologies offers transformative opportunities for public health
+informatics in urban safety and pedestrian well-being. This paper proposes a
+novel framework utilizing these technologies for enhanced 3D object detection
+and activity classification in urban traffic scenarios. By employing elevated
+LiDAR, we obtain detailed 3D point cloud data, enabling precise pedestrian
+activity monitoring. To overcome urban data scarcity, we create a specialized
+dataset through simulated traffic environments in Blender, facilitating
+targeted model training. Our approach employs a modified Point
+Voxel-Region-based Convolutional Neural Network (PV-RCNN) for robust 3D
+detection and PointNet for classifying pedestrian activities. This dual-model
+approach not only enhances urban traffic management but also contributes
+significantly to public health by providing insights into pedestrian behavior
+and promoting safer urban environments.
+

+
+
+
+
+ + ☆ Domain-Specific Block Selection and Paired-View Pseudo-Labeling for + Online Test-Time Adaptation CVPR 2024 + + +
+ Test-time adaptation (TTA) aims to adapt a pre-trained model to a new test +domain without access to source data after deployment. Existing approaches +typically rely on self-training with pseudo-labels since ground-truth cannot be +obtained from test data. Although the quality of pseudo labels is important for +stable and accurate long-term adaptation, it has not been previously addressed. +In this work, we propose DPLOT, a simple yet effective TTA framework that +consists of two components: (1) domain-specific block selection and (2) +pseudo-label generation using paired-view images. Specifically, we select +blocks that involve domain-specific feature extraction and train these blocks +by entropy minimization. After blocks are adjusted for current test domain, we +generate pseudo-labels by averaging given test images and corresponding flipped +counterparts. By simply using flip augmentation, we prevent a decrease in the +quality of the pseudo-labels, which can be caused by the domain gap resulting +from strong augmentation. Our experimental results demonstrate that DPLOT +outperforms previous TTA methods in CIFAR10-C, CIFAR100-C, and ImageNet-C +benchmarks, reducing error by up to 5.4%, 9.1%, and 2.9%, respectively. Also, +we provide an extensive analysis to demonstrate effectiveness of our framework. +Code is available at +https://github.com/gist-ailab/domain-specific-block-selection-and-paired-view-pseudo-labeling-for-online-TTA. + +
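The paired-view pseudo-labeling step can be sketched as averaging predictions over a test image and its horizontally flipped counterpart and keeping only confident labels; the toy model, threshold, and tensor layout below are placeholders, not the DPLOT code.

```python
# Pseudo-labels from a test image and its horizontal flip (NCHW layout assumed).
import torch
import torch.nn.functional as F

@torch.no_grad()
def paired_view_pseudo_labels(model, images, confidence_threshold=0.9):
    logits = model(images)
    logits_flip = model(torch.flip(images, dims=[3]))   # horizontal flip
    probs = (F.softmax(logits, dim=1) + F.softmax(logits_flip, dim=1)) / 2.0
    conf, pseudo = probs.max(dim=1)
    mask = conf >= confidence_threshold                 # keep only confident labels
    return pseudo, mask

model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 10))
labels, mask = paired_view_pseudo_labels(model, torch.randn(8, 3, 32, 32))
print(labels, mask)
```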
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ TempBEV: Improving Learned BEV Encoders with Combined Image and BEV + Space Temporal Aggregation + + +
+ Autonomous driving requires an accurate representation of the environment. A +strategy toward high accuracy is to fuse data from several sensors. Learned +Bird's-Eye View (BEV) encoders can achieve this by mapping data from individual +sensors into one joint latent space. For cost-efficient camera-only systems, +this provides an effective mechanism to fuse data from multiple cameras with +different views. Accuracy can further be improved by aggregating sensor +information over time. This is especially important in monocular camera systems +to account for the lack of explicit depth and velocity measurements. Thereby, +the effectiveness of developed BEV encoders crucially depends on the operators +used to aggregate temporal information and on the used latent representation +spaces. We analyze BEV encoders proposed in the literature and compare their +effectiveness, quantifying the effects of aggregation operators and latent +representations. While most existing approaches aggregate temporal information +either in image or in BEV latent space, our analyses and performance +comparisons suggest that these latent representations exhibit complementary +strengths. Therefore, we develop a novel temporal BEV encoder, TempBEV, which +integrates aggregated temporal information from both latent spaces. We consider +subsequent image frames as stereo through time and leverage methods from +optical flow estimation for temporal stereo encoding. Empirical evaluation on +the NuScenes dataset shows a significant improvement by TempBEV over the +baseline for 3D object detection and BEV segmentation. The ablation uncovers a +strong synergy of joint temporal aggregation in the image and BEV latent space. +These results indicate the overall effectiveness of our approach and make a +strong case for aggregating temporal information in both image and BEV latent +spaces. + +
+
+
+
+
+ + ☆ Establishing a Baseline for Gaze-driven Authentication Performance in + VR: A Breadth-First Investigation on a Very Large Dataset + + +
+ This paper performs the crucial work of establishing a baseline for +gaze-driven authentication performance to begin answering fundamental research +questions using a very large dataset of gaze recordings from 9202 people with a +level of eye tracking (ET) signal quality equivalent to modern consumer-facing +virtual reality (VR) platforms. The size of the employed dataset is at least an +order-of-magnitude larger than any other dataset from previous related work. +Binocular estimates of the optical and visual axes of the eyes and a minimum +duration for enrollment and verification are required for our model to achieve +a false rejection rate (FRR) of below 3% at a false acceptance rate (FAR) of 1 +in 50,000. In terms of identification accuracy which decreases with gallery +size, we estimate that our model would fall below chance-level accuracy for +gallery sizes of 148,000 or more. Our major findings indicate that gaze +authentication can be as accurate as required by the FIDO standard when driven +by a state-of-the-art machine learning architecture and a sufficiently large +training dataset. + +
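The reported operating point (FRR at a FAR of 1 in 50,000) can be computed from genuine and impostor score distributions as in the sketch below, which uses synthetic scores and is unrelated to the authors' model.

```python
# FRR at a fixed FAR: pick the threshold from impostor scores, measure
# rejections on genuine scores. Scores here are synthetic placeholders.
import numpy as np

def frr_at_far(genuine_scores, impostor_scores, target_far=1 / 50_000):
    # Threshold such that only target_far of impostor scores exceed it.
    threshold = np.quantile(impostor_scores, 1.0 - target_far)
    frr = np.mean(genuine_scores < threshold)
    return frr, threshold

rng = np.random.default_rng(0)
genuine = rng.normal(2.0, 1.0, 10_000)        # higher similarity for same person
impostor = rng.normal(0.0, 1.0, 1_000_000)
print(frr_at_far(genuine, impostor))
```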
+
+ comment: 28 pages, 18 figures, 5 tables, includes supplementary material +
+
+
+
+
+ + ☆ When are Foundation Models Effective? Understanding the Suitability for + Pixel-Level Classification Using Multispectral Imagery + + +
+ Foundation models, i.e., very large deep learning models, have demonstrated
+impressive performance in various language and vision tasks that are otherwise
+difficult to reach using smaller-size models. The major success of GPT-type
+language models is particularly exciting and raises expectations on the
+potential of foundation models in other domains including satellite remote
+sensing. In this context, great efforts have been made to build foundation
+models to test their capabilities in broader applications, and examples include
+Prithvi by NASA-IBM, Segment-Anything-Model, ViT, etc. This leads to an
+important question: Are foundation models always a suitable choice for
+different remote sensing tasks, and when or when not? This work aims to enhance
+the understanding of the status and suitability of foundation models for
+pixel-level classification using multispectral imagery at moderate resolution,
+through comparisons with traditional machine learning (ML) and regular-size
+deep learning models. Interestingly, the results reveal that in many scenarios
+traditional ML models still have similar or better performance compared to
+foundation models, especially for tasks where texture is less useful for
+classification. On the other hand, deep learning models did show more promising
+results for tasks where labels partially depend on texture (e.g., burn scar),
+while the difference in performance between foundation models and deep learning
+models is not obvious. The results conform to our analysis: The suitability of
+foundation models depends on the alignment between the self-supervised learning
+tasks and the real downstream tasks, and the typical masked autoencoder
+paradigm is not necessarily suitable for many remote sensing problems.
+

+
+
+
+
+ + ☆ Prompt-Driven Feature Diffusion for Open-World Semi-Supervised Learning + + +
+ In this paper, we present a novel approach termed Prompt-Driven Feature +Diffusion (PDFD) within a semi-supervised learning framework for Open World +Semi-Supervised Learning (OW-SSL). At its core, PDFD deploys an efficient +feature-level diffusion model with the guidance of class-specific prompts to +support discriminative feature representation learning and feature generation, +tackling the challenge of the non-availability of labeled data for unseen +classes in OW-SSL. In particular, PDFD utilizes class prototypes as prompts in +the diffusion model, leveraging their class-discriminative and semantic +generalization ability to condition and guide the diffusion process across all +the seen and unseen classes. Furthermore, PDFD incorporates a class-conditional +adversarial loss for diffusion model training, ensuring that the features +generated via the diffusion process can be discriminatively aligned with the +class-conditional features of the real data. Additionally, the class prototypes +of the unseen classes are computed using only unlabeled instances with +confident predictions within a semi-supervised learning framework. We conduct +extensive experiments to evaluate the proposed PDFD. The empirical results show +PDFD exhibits remarkable performance enhancements over many state-of-the-art +existing methods. + +
+
+
+
+
+ + ☆ CU-Mamba: Selective State Space Models with Channel Learning for Image + Restoration + + +
+ Reconstructing degraded images is a critical task in image processing. +Although CNN and Transformer-based models are prevalent in this field, they +exhibit inherent limitations, such as inadequate long-range dependency modeling +and high computational costs. To overcome these issues, we introduce the +Channel-Aware U-Shaped Mamba (CU-Mamba) model, which incorporates a dual State +Space Model (SSM) framework into the U-Net architecture. CU-Mamba employs a +Spatial SSM module for global context encoding and a Channel SSM component to +preserve channel correlation features, both in linear computational complexity +relative to the feature map size. Extensive experimental results validate +CU-Mamba's superiority over existing state-of-the-art methods, underscoring the +importance of integrating both spatial and channel contexts in image +restoration. + +
+
+
+
+
+ + ☆ 3D object quality prediction for Metal Jet Printer with Multimodal + thermal encoder + + +
+ With the advancements in 3D printing technologies, it is extremely important
+that the quality and dimensional accuracy of 3D printed objects meet the
+customer's specifications. Various factors during metal printing affect the
+printed parts' quality, including the power quality, the printing stage
+parameters, the print part's location inside the print bed, the curing stage
+parameters, and the metal sintering process. With the large amount of data
+gathered from HP's MetJet printing process, AI techniques can be used to
+analyze, learn, and effectively infer the printed part quality metrics, as
+well as assist in improving the print yield. In-situ thermal sensing data
+captured by printer-installed thermal sensors contains the thermal signature
+of the fusing layers for each part. This thermal signature reflects the
+convoluted impact of various factors. In this paper, we use a multimodal
+thermal encoder network to fuse data of different natures, including the video
+data, vectorized printer control data, and exact part thermal signatures, with
+a trained encoder-decoder module. We explored data fusion techniques and the
+stages at which to fuse data; the optimized end-to-end model architecture
+yields improved part quality prediction accuracy.
+

+
+
+
+
+ + ☆ Event-Based Eye Tracking. AIS 2024 Challenge Survey + + +
+ This survey reviews the AIS 2024 Event-Based Eye Tracking (EET) Challenge. +The task of the challenge focuses on processing eye movement recorded with +event cameras and predicting the pupil center of the eye. The challenge +emphasizes efficient eye tracking with event cameras to achieve good task +accuracy and efficiency trade-off. During the challenge period, 38 participants +registered for the Kaggle competition, and 8 teams submitted a challenge +factsheet. The novel and diverse methods from the submitted factsheets are +reviewed and analyzed in this survey to advance future event-based eye tracking +research. + +
+
+ comment: Qinyu Chen is the corresponding author +
+
+
+
+
+ + ☆ QGen: On the Ability to Generalize in Quantization Aware Training + + +
+ Quantization lowers memory usage, computational requirements, and latency by +utilizing fewer bits to represent model weights and activations. In this work, +we investigate the generalization properties of quantized neural networks, a +characteristic that has received little attention despite its implications on +model performance. In particular, first, we develop a theoretical model for +quantization in neural networks and demonstrate how quantization functions as a +form of regularization. Second, motivated by recent work connecting the +sharpness of the loss landscape and generalization, we derive an approximate +bound for the generalization of quantized models conditioned on the amount of +quantization noise. We then validate our hypothesis by experimenting with over +2000 models trained on CIFAR-10, CIFAR-100, and ImageNet datasets on +convolutional and transformer-based models. + +
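A minimal sketch of the uniform "fake" quantization whose rounding noise such analyses treat as a form of regularization, assuming symmetric per-tensor scaling:

```python
# Simulated (fake) quantization of a weight tensor with a symmetric per-tensor scale.
import torch

def fake_quantize(weights, num_bits=8):
    qmin, qmax = -(2 ** (num_bits - 1)), 2 ** (num_bits - 1) - 1
    scale = weights.abs().max() / qmax          # symmetric per-tensor scale
    q = torch.clamp(torch.round(weights / scale), qmin, qmax)
    return q * scale                            # dequantized (simulated) weights

w = torch.randn(256, 256)
w_q = fake_quantize(w, num_bits=4)
print((w - w_q).abs().mean())                   # induced quantization noise
```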
+
+
+
+
+ + ☆ Multimodal 3D Object Detection on Unseen Domains + + +
+ LiDAR datasets for autonomous driving exhibit biases in properties such as +point cloud density, range, and object dimensions. As a result, object +detection networks trained and evaluated in different environments often +experience performance degradation. Domain adaptation approaches assume access +to unannotated samples from the test distribution to address this problem. +However, in the real world, the exact conditions of deployment and access to +samples representative of the test dataset may be unavailable while training. +We argue that the more realistic and challenging formulation is to require +robustness in performance to unseen target domains. We propose to address this +problem in a two-pronged manner. First, we leverage paired LiDAR-image data +present in most autonomous driving datasets to perform multimodal object +detection. We suggest that working with multimodal features by leveraging both +images and LiDAR point clouds for scene understanding tasks results in object +detectors more robust to unseen domain shifts. Second, we train a 3D object +detector to learn multimodal object features across different distributions and +promote feature invariance across these source domains to improve +generalizability to unseen target domains. To this end, we propose +CLIX$^\text{3D}$, a multimodal fusion and supervised contrastive learning +framework for 3D object detection that performs alignment of object features +from same-class samples of different domains while pushing the features from +different classes apart. We show that CLIX$^\text{3D}$ yields state-of-the-art +domain generalization performance under multiple dataset shifts. + +
+
+ comment: technical report +
+
+
+
+
+ + ☆ IrrNet: Advancing Irrigation Mapping with Incremental Patch Size + Training on Remote Sensing Imagery CVPR + + +
+ Irrigation mapping plays a crucial role in effective water management,
+essential for preserving both water quality and quantity, and is key to
+mitigating the global issue of water scarcity. The complexity of agricultural
+fields, adorned with diverse irrigation practices, especially when multiple
+systems coexist in close quarters, poses a unique challenge. This complexity is
+further compounded by the nature of Landsat's remote sensing data, where each
+pixel is rich with densely packed information, complicating the task of
+accurate irrigation mapping. In this study, we introduce an innovative approach
+that employs a progressive training method, which strategically increases patch
+sizes throughout the training process, utilizing imagery from Landsat 5 and 7
+labeled with the WRLU dataset. The initial focus on small patches allows the
+model to capture detailed features, progressively shifting to broader, more
+general features as the patch size enlarges. Remarkably, our method enhances
+the performance of existing state-of-the-art models by approximately 20%.
+Furthermore, our analysis delves into the significance of incorporating various
+spectral bands into the model, assessing their impact on performance. The
+findings reveal that additional bands are instrumental in enabling the model to
+discern finer details more effectively. This work sets a new standard for
+leveraging remote sensing imagery in irrigation mapping.
+

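A minimal sketch of such a progressive patch-size schedule is shown below; the patch sizes, epoch counts, toy model, and cropping are assumptions for illustration, not the IrrNet training recipe.

```python
# Progressive patch-size training: each stage crops larger patches so the
# network first learns fine detail, then broader context.
import torch
import torch.nn as nn

def random_crop_pair(images, masks, patch_size):
    # Crop image and label with the same random window so they stay aligned.
    _, _, h, w = images.shape
    top = int(torch.randint(0, h - patch_size + 1, (1,)))
    left = int(torch.randint(0, w - patch_size + 1, (1,)))
    window = (slice(None), slice(None),
              slice(top, top + patch_size), slice(left, left + patch_size))
    return images[window], masks[window]

model = nn.Conv2d(6, 4, kernel_size=3, padding=1)        # 6 bands -> 4 classes (toy)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()
images = torch.randn(2, 6, 256, 256)                     # toy Landsat-like batch
masks = torch.randint(0, 4, (2, 1, 256, 256))

for patch_size, epochs in ((64, 1), (128, 1), (256, 1)): # small -> large patches
    for _ in range(epochs):
        x, y = random_crop_pair(images, masks, patch_size)
        loss = loss_fn(model(x), y.squeeze(1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(patch_size, float(loss))
```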
+
+ comment: Full version of the paper will be appearing in Proceedings of the + IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) + Workshops, 2024 +
+
+
+
+
+ + ☆ Diffusion Schrödinger Bridge Models for High-Quality MR-to-CT + Synthesis for Head and Neck Proton Treatment Planning + + +
+ In recent advancements in proton therapy, MR-based treatment planning is +gaining momentum to minimize additional radiation exposure compared to +traditional CT-based methods. This transition highlights the critical need for +accurate MR-to-CT image synthesis, which is essential for precise proton dose +calculations. Our research introduces the Diffusion Schr\"odinger Bridge Models +(DSBM), an innovative approach for high-quality MR-to-CT synthesis. DSBM learns +the nonlinear diffusion processes between MR and CT data distributions. This +method improves upon traditional diffusion models by initiating synthesis from +the prior distribution rather than the Gaussian distribution, enhancing both +generation quality and efficiency. We validated the effectiveness of DSBM on a +head and neck cancer dataset, demonstrating its superiority over traditional +image synthesis methods through both image-level and dosimetric-level +evaluations. The effectiveness of DSBM in MR-based proton treatment planning +highlights its potential as a valuable tool in various clinical scenarios. + +
+
+ comment: International Conference on the use of Computers in Radiation therapy + (ICCR) +
+
+
+
+
+ + ☆ Equivariant Spatio-Temporal Self-Supervision for LiDAR Object Detection + + +
+ Popular representation learning methods encourage feature invariance under +transformations applied at the input. However, in 3D perception tasks like +object localization and segmentation, outputs are naturally equivariant to some +transformations, such as rotation. Using pre-training loss functions that +encourage equivariance of features under certain transformations provides a +strong self-supervision signal while also retaining information of geometric +relationships between transformed feature representations. This can enable +improved performance in downstream tasks that are equivariant to such +transformations. In this paper, we propose a spatio-temporal equivariant +learning framework by considering both spatial and temporal augmentations +jointly. Our experiments show that the best performance arises with a +pre-training approach that encourages equivariance to translation, scaling, and +flip, rotation and scene flow. For spatial augmentations, we find that +depending on the transformation, either a contrastive objective or an +equivariance-by-classification objective yields best results. To leverage +real-world object deformations and motion, we consider sequential LiDAR scene +pairs and develop a novel 3D scene flow-based equivariance objective that leads +to improved performance overall. We show our pre-training method for 3D object +detection which outperforms existing equivariant and invariant approaches in +many settings. + +
+
+ comment: technical report +
+
+
+
+
+ + ☆ Learning with 3D rotations, a hitchhiker's guide to SO(3) + + +
+ Many settings in machine learning require the selection of a rotation +representation. However, choosing a suitable representation from the many +available options is challenging. This paper acts as a survey and guide through +rotation representations. We walk through their properties that harm or benefit +deep learning with gradient-based optimization. By consolidating insights from +rotation-based learning, we provide a comprehensive overview of learning +functions with rotation representations. We provide guidance on selecting +representations based on whether rotations are in the model's input or output +and whether the data primarily comprises small angles. + +
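One representation commonly discussed in such guides is the continuous 6D parameterization mapped to a rotation matrix by Gram-Schmidt orthonormalization (Zhou et al.'s "6D" representation); a sketch follows, and its conventions may differ from the paper's.

```python
# Map a 6D vector to a rotation matrix via Gram-Schmidt orthonormalization.
import torch
import torch.nn.functional as F

def rotation_6d_to_matrix(x6):
    """x6: (..., 6) -> rotation matrices (..., 3, 3)."""
    a1, a2 = x6[..., :3], x6[..., 3:]
    b1 = F.normalize(a1, dim=-1)
    # Remove the component of a2 along b1, then normalize.
    b2 = F.normalize(a2 - (b1 * a2).sum(-1, keepdim=True) * b1, dim=-1)
    b3 = torch.cross(b1, b2, dim=-1)
    return torch.stack((b1, b2, b3), dim=-2)

R = rotation_6d_to_matrix(torch.randn(4, 6))
print(torch.matmul(R, R.transpose(-1, -2)))   # approximately identity
print(torch.linalg.det(R))                    # approximately +1
```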
+
+
+
+
+ + ☆ Visual Prompting for Generalized Few-shot Segmentation: A Multi-scale + Approach CVPR 2024 + + +
+ The emergence of attention-based transformer models has led to their +extensive use in various tasks, due to their superior generalization and +transfer properties. Recent research has demonstrated that such models, when +prompted appropriately, are excellent for few-shot inference. However, such +techniques are under-explored for dense prediction tasks like semantic +segmentation. In this work, we examine the effectiveness of prompting a +transformer-decoder with learned visual prompts for the generalized few-shot +segmentation (GFSS) task. Our goal is to achieve strong performance not only on +novel categories with limited examples, but also to retain performance on base +categories. We propose an approach to learn visual prompts with limited +examples. These learned visual prompts are used to prompt a multiscale +transformer decoder to facilitate accurate dense predictions. Additionally, we +introduce a unidirectional causal attention mechanism between the novel +prompts, learned with limited examples, and the base prompts, learned with +abundant data. This mechanism enriches the novel prompts without deteriorating +the base class performance. Overall, this form of prompting helps us achieve +state-of-the-art performance for GFSS on two different benchmark datasets: +COCO-$20^i$ and Pascal-$5^i$, without the need for test-time optimization (or +transduction). Furthermore, test-time optimization leveraging unlabelled test +data can be used to improve the prompts, which we refer to as transductive +prompt tuning. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Deep Learning for Video-Based Assessment of Endotracheal Intubation + Skills + + +
+ Endotracheal intubation (ETI) is an emergency procedure performed in civilian
+and combat casualty care settings to establish an airway. Objective and
+automated assessment of ETI skills is essential for the training and
+certification of healthcare providers. However, the current approach is based
+on manual feedback by an expert, which is subjective, time- and
+resource-intensive, and is prone to poor inter-rater reliability and halo
+effects. This work proposes a framework to evaluate ETI skills using single and
+multi-view videos. The framework consists of two stages. First, a 2D
+convolutional autoencoder (AE) and a pre-trained self-supervision network
+extract features from videos. Second, a 1D convolutional network enhanced with
+a cross-view attention module takes the features from the AE as input and
+outputs predictions for skill evaluation. The ETI datasets were collected in
+two phases. In the first phase, ETI is performed by two subject cohorts:
+Experts and Novices. In the second phase, novice subjects perform ETI under
+time pressure, and the outcome is either Successful or Unsuccessful. A third
+dataset of videos from a single head-mounted camera for Experts and Novices is
+also analyzed. The study achieved an accuracy of 100% in identifying
+Expert/Novice trials in the initial phase. In the second phase, the model
+showed 85% accuracy in classifying Successful/Unsuccessful procedures. Using
+head-mounted cameras alone, the model showed a 96% accuracy on Expert and
+Novice classification while maintaining an accuracy of 85% on classifying
+successful and unsuccessful procedures. In addition, GradCAMs are presented to
+explain the differences between Expert and Novice behavior and Successful and
+Unsuccessful trials. The approach offers a reliable and objective method for
+automated assessment of ETI skills.
+

+
+
+
+
+ + ☆ Postoperative glioblastoma segmentation: Development of a fully + automated pipeline using deep convolutional neural networks and comparison + with currently available models + + +
+ Accurately assessing tumor removal is paramount in the management of +glioblastoma. We developed a pipeline using MRI scans and neural networks to +segment tumor subregions and the surgical cavity in postoperative images. Our +model excels in accurately classifying the extent of resection, offering a +valuable tool for clinicians in assessing treatment effectiveness. + +
+
+
+
+
+ + ☆ Unifying Scene Representation and Hand-Eye Calibration with 3D + Foundation Models + + +
+ Representing the environment is a central challenge in robotics, and is +essential for effective decision-making. Traditionally, before capturing images +with a manipulator-mounted camera, users need to calibrate the camera using a +specific external marker, such as a checkerboard or AprilTag. However, recent +advances in computer vision have led to the development of \emph{3D foundation +models}. These are large, pre-trained neural networks that can establish fast +and accurate multi-view correspondences with very few images, even in the +absence of rich visual features. This paper advocates for the integration of 3D +foundation models into scene representation approaches for robotic systems +equipped with manipulator-mounted RGB cameras. Specifically, we propose the +Joint Calibration and Representation (JCR) method. JCR uses RGB images, +captured by a manipulator-mounted camera, to simultaneously construct an +environmental representation and calibrate the camera relative to the robot's +end-effector, in the absence of specific calibration markers. The resulting 3D +environment representation is aligned with the robot's coordinate frame and +maintains physically accurate scales. We demonstrate that JCR can build +effective scene representations using a low-cost RGB camera attached to a +manipulator, without prior calibration. + +
+
+
+
+
+ + ☆ Factorized Motion Fields for Fast Sparse Input Dynamic View Synthesis SIGGRAPH 2024 + + +
+ Designing a 3D representation of a dynamic scene for fast optimization and +rendering is a challenging task. While recent explicit representations enable +fast learning and rendering of dynamic radiance fields, they require a dense +set of input viewpoints. In this work, we focus on learning a fast +representation for dynamic radiance fields with sparse input viewpoints. +However, the optimization with sparse input is under-constrained and +necessitates the use of motion priors to constrain the learning. Existing fast +dynamic scene models do not explicitly model the motion, making them difficult +to be constrained with motion priors. We design an explicit motion model as a +factorized 4D representation that is fast and can exploit the spatio-temporal +correlation of the motion field. We then introduce reliable flow priors +including a combination of sparse flow priors across cameras and dense flow +priors within cameras to regularize our motion model. Our model is fast, +compact and achieves very good performance on popular multi-view dynamic scene +datasets with sparse input viewpoints. The source code for our model can be +found on our project page: +https://nagabhushansn95.github.io/publications/2024/RF-DeRF.html. + +
+
+ comment: Accepted at SIGGRAPH 2024 +
+
+
+
+
+ + ☆ Deep Dependency Networks and Advanced Inference Schemes for Multi-Label + Classification AISTATS 2024 + + +
+ We present a unified framework called deep dependency networks (DDNs) that +combines dependency networks and deep learning architectures for multi-label +classification, with a particular emphasis on image and video data. The primary +advantage of dependency networks is their ease of training, in contrast to +other probabilistic graphical models like Markov networks. In particular, when +combined with deep learning architectures, they provide an intuitive, +easy-to-use loss function for multi-label classification. A drawback of DDNs +compared to Markov networks is their lack of advanced inference schemes, +necessitating the use of Gibbs sampling. To address this challenge, we propose +novel inference schemes based on local search and integer linear programming +for computing the most likely assignment to the labels given observations. We +evaluate our novel methods on three video datasets (Charades, TACoS, Wetlab) +and three image datasets (MS-COCO, PASCAL VOC, NUS-WIDE), comparing their +performance with (a) basic neural architectures and (b) neural architectures +combined with Markov networks equipped with advanced inference and learning +techniques. Our results demonstrate the superiority of our new DDN methods over +the two competing approaches. + +
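The local-search inference idea can be illustrated with greedy coordinate ascent over binary label assignments on a toy unary-plus-pairwise score; this is a generic sketch, not the paper's learned dependency network or its ILP formulation.

```python
# Greedy local search (coordinate ascent) for a multi-label MAP assignment:
# start from a unary guess and flip labels while the joint score improves.
import numpy as np

def joint_score(y, unary, pairwise):
    return float(unary @ y + 0.5 * y @ pairwise @ y)

def local_search(unary, pairwise, max_passes=10):
    y = (unary > 0).astype(float)                 # initial guess from unaries
    for _ in range(max_passes):
        improved = False
        for i in range(len(y)):
            flipped = y.copy()
            flipped[i] = 1.0 - flipped[i]
            if joint_score(flipped, unary, pairwise) > joint_score(y, unary, pairwise):
                y, improved = flipped, True
        if not improved:
            break
    return y

rng = np.random.default_rng(0)
unary = rng.normal(size=8)
pairwise = rng.normal(scale=0.3, size=(8, 8))
pairwise = (pairwise + pairwise.T) / 2            # symmetric toy interactions
print(local_search(unary, pairwise))
```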
+
+ comment: Will appear in AISTATS 2024. arXiv admin note: substantial text + overlap with arXiv:2302.00633 +
+
+
+
+
+ + ☆ Unsupervised Microscopy Video Denoising CVPR + + +
+ In this paper, we introduce a novel unsupervised network to denoise
+microscopy videos consisting of image sequences captured by a fixed-location
+microscopy camera. Specifically, we propose a DeepTemporal Interpolation
+method, leveraging a temporal signal filter integrated into the bottom CNN
+layers, to restore microscopy videos corrupted by unknown noise types. Our
+unsupervised denoising architecture is distinguished by its ability to adapt to
+multiple noise conditions without the need for pre-existing noise distribution
+knowledge, addressing a significant challenge in real-world medical
+applications. Furthermore, we evaluate our denoising framework using both real
+microscopy recordings and simulated data, validating its video denoising
+performance across a broad spectrum of noise scenarios. Extensive experiments
+demonstrate that our unsupervised model consistently outperforms
+state-of-the-art supervised and unsupervised video denoising techniques,
+proving especially effective for microscopy videos.
+

+
+ comment: Accepted at CVPRW 2024 +
+
+
+
+
+ + ☆ SDIP: Self-Reinforcement Deep Image Prior Framework for Image Processing + + +
+ Deep image prior (DIP) proposed in recent research has revealed the inherent
+ability of convolutional neural networks (CNNs) to capture substantial
+low-level image statistics priors. This framework efficiently addresses
+inverse problems in image processing and has led to extensive applications in
+various domains. However, because the whole algorithm is initialized randomly,
+DIP often lacks stability and thus leaves room for further improvement. In
+this paper, we propose the self-reinforcement deep image prior (SDIP) as an
+improved version of the original DIP. We observed that the changes in the DIP
+network's input and output are highly correlated during each iteration. SDIP
+efficiently utilizes this trait in a reinforcement learning manner, where the
+current iteration's output is used by a steering algorithm to update the
+network input for the next iteration, guiding the algorithm toward improved
+results. Experimental results across multiple applications demonstrate that
+our proposed SDIP framework offers improvements over the original DIP method
+and other state-of-the-art methods.
+

+
+
+
+
+ + ☆ Mushroom Segmentation and 3D Pose Estimation from Point Clouds using + Fully Convolutional Geometric Features and Implicit Pose Encoding + + +
+ Modern agricultural applications rely more and more on deep learning +solutions. However, training well-performing deep networks requires a large +amount of annotated data that may not be available and in the case of 3D +annotation may not even be feasible for human annotators. In this work, we +develop a deep learning approach to segment mushrooms and estimate their pose +on 3D data, in the form of point clouds acquired by depth sensors. To +circumvent the annotation problem, we create a synthetic dataset of mushroom +scenes, where we are fully aware of 3D information, such as the pose of each +mushroom. The proposed network has a fully convolutional backbone, that parses +sparse 3D data, and predicts pose information that implicitly defines both +instance segmentation and pose estimation task. We have validated the +effectiveness of the proposed implicit-based approach for a synthetic test set, +as well as provided qualitative results for a small set of real acquired point +clouds with depth sensors. Code is publicly available at +https://github.com/georgeretsi/mushroom-pose. + +
+
+
+
+
+ + ☆ Soil Fertility Prediction Using Combined USB-microscope Based Soil + Image, Auxiliary Variables, and Portable X-Ray Fluorescence Spectrometry + + +
+ This study explored the application of portable X-ray fluorescence (PXRF) +spectrometry and soil image analysis to rapidly assess soil fertility, focusing +on critical parameters such as available B, organic carbon (OC), available Mn, +available S, and the sulfur availability index (SAI). Analyzing 1,133 soil +samples from various agro-climatic zones in Eastern India, the research +combined color and texture features from microscopic soil images, PXRF data, +and auxiliary soil variables (AVs) using a Random Forest model. Results +indicated that integrating image features (IFs) with auxiliary variables (AVs) +significantly enhanced prediction accuracy for available B (R^2 = 0.80) and OC +(R^2 = 0.88). A data fusion approach, incorporating IFs, AVs, and PXRF data, +further improved predictions for available Mn and SAI with R^2 values of 0.72 +and 0.70, respectively. The study demonstrated how these integrated +technologies have the potential to provide quick and affordable options for +soil testing, opening up access to more sophisticated prediction models and a +better comprehension of the fertility and health of the soil. Future research +should focus on the application of deep learning models on a larger dataset of +soil images, developed using soils from a broader range of agro-climatic zones +under field condition. + +
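The data-fusion setup (image features, auxiliary soil variables, and PXRF readings fed into a random forest) can be sketched with scikit-learn on synthetic placeholders; the feature dimensions and target below are illustrative assumptions, not the study's data.

```python
# Data fusion by feature concatenation followed by a random forest regressor.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

rng = np.random.default_rng(0)
n = 500
image_features = rng.normal(size=(n, 20))    # color/texture descriptors (toy)
aux_variables = rng.normal(size=(n, 5))      # auxiliary soil variables (toy)
pxrf = rng.normal(size=(n, 10))              # elemental concentrations (toy)
X = np.hstack([image_features, aux_variables, pxrf])
y = X[:, 0] * 0.7 + X[:, 25] * 0.3 + rng.normal(scale=0.1, size=n)  # toy target

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
model = RandomForestRegressor(n_estimators=200, random_state=0).fit(X_tr, y_tr)
print("R^2:", r2_score(y_te, model.predict(X_te)))
```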
+
+ comment: 37 pages, 10 figures; manuscript under peer-review for publication in + the jounral 'Computers and Electronics in Agriculture' +
+
+
+
+
+ + ☆ MoA: Mixture-of-Attention for Subject-Context Disentanglement in + Personalized Image Generation + + +
+ We introduce a new architecture for personalization of text-to-image +diffusion models, coined Mixture-of-Attention (MoA). Inspired by the +Mixture-of-Experts mechanism utilized in large language models (LLMs), MoA +distributes the generation workload between two attention pathways: a +personalized branch and a non-personalized prior branch. MoA is designed to +retain the original model's prior by fixing its attention layers in the prior +branch, while minimally intervening in the generation process with the +personalized branch that learns to embed subjects in the layout and context +generated by the prior branch. A novel routing mechanism manages the +distribution of pixels in each layer across these branches to optimize the +blend of personalized and generic content creation. Once trained, MoA +facilitates the creation of high-quality, personalized images featuring +multiple subjects with compositions and interactions as diverse as those +generated by the original model. Crucially, MoA enhances the distinction +between the model's pre-existing capability and the newly augmented +personalized intervention, thereby offering a more disentangled subject-context +control that was previously unattainable. Project page: +https://snap-research.github.io/mixture-of-attention + +
+
+ comment: Project Website: https://snap-research.github.io/mixture-of-attention +
+
+
+
+
+ + ♻ ☆ VehicleGAN: Pair-flexible Pose Guided Image Synthesis for Vehicle + Re-identification + + +
+ Vehicle Re-identification (Re-ID) has been broadly studied in the last
+decade; however, differing camera view angles, which confuse discrimination in
+the feature subspace for vehicles of various poses, remain challenging for
+Vehicle Re-ID models in the real world. To improve Vehicle Re-ID models, this
+paper proposes to synthesize a large number of vehicle images in a target pose,
+the idea being to project vehicles of diverse poses into a unified target pose
+so as to enhance feature discrimination. Considering that paired data of the
+same vehicles in different traffic surveillance cameras might not be available
+in the real world, we propose the first Pair-flexible Pose Guided Image
+Synthesis method for Vehicle Re-ID, named VehicleGAN, which works in both
+supervised and unsupervised settings without knowledge of geometric 3D models.
+Because of the feature distribution difference between real and synthetic data,
+simply training a traditional metric learning based Re-ID model with data-level
+fusion (i.e., data augmentation) is not satisfactory; therefore, we propose a
+new Joint Metric Learning (JML) via effective feature-level fusion of both real
+and synthetic data. Extensive experimental results on the public VeRi-776 and
+VehicleID datasets demonstrate the accuracy and effectiveness of our proposed
+VehicleGAN and JML.
+

+
+
+
+
+ + ♻ ☆ The Brain Tumor Sequence Registration (BraTS-Reg) Challenge: + Establishing Correspondence Between Pre-Operative and Follow-up MRI Scans of + Diffuse Glioma Patients + + +
+ Registration of longitudinal brain MRI scans containing pathologies is +challenging due to dramatic changes in tissue appearance. Although there has +been progress in developing general-purpose medical image registration +techniques, they have not yet attained the requisite precision and reliability +for this task, highlighting its inherent complexity. Here we describe the Brain +Tumor Sequence Registration (BraTS-Reg) challenge, as the first public +benchmark environment for deformable registration algorithms focusing on +estimating correspondences between pre-operative and follow-up scans of the +same patient diagnosed with a diffuse brain glioma. The BraTS-Reg data comprise +de-identified multi-institutional multi-parametric MRI (mpMRI) scans, curated +for size and resolution according to a canonical anatomical template, and +divided into training, validation, and testing sets. Clinical experts annotated +ground truth (GT) landmark points of anatomical locations distinct across the +temporal domain. Quantitative evaluation and ranking were based on the Median +Euclidean Error (MEE), Robustness, and the determinant of the Jacobian of the +displacement field. The top-ranked methodologies yielded similar performance +across all evaluation metrics and shared several methodological commonalities, +including pre-alignment, deep neural networks, inverse consistency analysis, +and test-time instance optimization per-case basis as a post-processing step. +The top-ranked method attained the MEE at or below that of the inter-rater +variability for approximately 60% of the evaluated landmarks, underscoring the +scope for further accuracy and robustness improvements, especially relative to +human experts. The aim of BraTS-Reg is to continue to serve as an active +resource for research, with the data and online evaluation tools accessible at +https://bratsreg.github.io/. + +
+
+
+
+
+ + ♻ ☆ Re-Nerfing: Improving Novel Views Synthesis through Novel Views + Synthesis + + +
+ Neural Radiance Fields (NeRFs) have shown remarkable novel view synthesis +capabilities even in large-scale, unbounded scenes, albeit requiring hundreds +of views or introducing artifacts in sparser settings. Their optimization +suffers from shape-radiance ambiguities wherever only a small visual overlap is +available. This leads to erroneous scene geometry and artifacts. In this paper, +we propose Re-Nerfing, a simple and general multi-stage data augmentation +approach that leverages NeRF's own view synthesis ability to address these +limitations. With Re-Nerfing, we enhance the geometric consistency of novel +views as follows: First, we train a NeRF with the available views. Then, we use +the optimized NeRF to synthesize pseudo-views around the original ones with a +view selection strategy to improve coverage and preserve view quality. Finally, +we train a second NeRF with both the original images and the pseudo views +masking out uncertain regions. Extensive experiments applying Re-Nerfing on +various pipelines on the mip-NeRF 360 dataset, including Gaussian Splatting, +provide valuable insights into the improvements achievable without external +data or supervision, on denser and sparser input scenarios. Project page: +https://renerfing.github.io + +
+
+ comment: Code will be released upon acceptance +
+
+
+
+
+ + ♻ ☆ Segmenting the motion components of a video: A long-term unsupervised + model + + +
+ Human beings have the ability to continuously analyze a video and immediately +extract the motion components. We want to adopt this paradigm to provide a +coherent and stable motion segmentation over the video sequence. In this +perspective, we propose a novel long-term spatio-temporal model operating in a +totally unsupervised way. It takes as input the volume of consecutive optical +flow (OF) fields, and delivers a volume of segments of coherent motion over the +video. More specifically, we have designed a transformer-based network, where +we leverage a mathematically well-founded framework, the Evidence Lower Bound +(ELBO), to derive the loss function. The loss function combines a flow +reconstruction term involving spatio-temporal parametric motion models +combining, in a novel way, polynomial (quadratic) motion models for the spatial +dimensions and B-splines for the time dimension of the video sequence, and a +regularization term enforcing temporal consistency on the segments. We report +experiments on four VOS benchmarks, demonstrating competitive quantitative +results, while performing motion segmentation on a whole sequence in one go. We +also highlight through visual results the key contributions on temporal +consistency brought by our method. + +
+
+
+
+
+ + ♻ ☆ TCJA-SNN: Temporal-Channel Joint Attention for Spiking Neural Networks + + +
+ Spiking Neural Networks (SNNs) are attracting widespread interest due to +their biological plausibility, energy efficiency, and powerful spatio-temporal +information representation ability. Given the critical role of attention +mechanisms in enhancing neural network performance, the integration of SNNs and +attention mechanisms exhibits potential to deliver energy-efficient and +high-performance computing paradigms. We present a novel Temporal-Channel Joint +Attention mechanism for SNNs, referred to as TCJA-SNN. The proposed TCJA-SNN +framework can effectively assess the significance of spike sequence from both +spatial and temporal dimensions. More specifically, our essential technical +contribution lies on: 1) We employ the squeeze operation to compress the spike +stream into an average matrix. Then, we leverage two local attention mechanisms +based on efficient 1D convolutions to facilitate comprehensive feature +extraction at the temporal and channel levels independently. 2) We introduce +the Cross Convolutional Fusion (CCF) layer as a novel approach to model the +inter-dependencies between the temporal and channel scopes. This layer breaks +the independence of these two dimensions and enables the interaction between +features. Experimental results demonstrate that the proposed TCJA-SNN +outperforms SOTA by up to 15.7% accuracy on standard static and neuromorphic +datasets, including Fashion-MNIST, CIFAR10-DVS, N-Caltech 101, and DVS128 +Gesture. Furthermore, we apply the TCJA-SNN framework to image generation tasks +by leveraging a variation autoencoder. To the best of our knowledge, this study +is the first instance where the SNN-attention mechanism has been employed for +image classification and generation tasks. Notably, our approach has achieved +SOTA performance in both domains, establishing a significant advancement in the +field. Codes are available at https://github.com/ridgerchu/TCJA. + +
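A simplified sketch of temporal-channel attention on a spike tensor, squeezing over space and applying 1D convolutions along the temporal and channel axes, is given below; the kernel sizes and the multiplicative fusion are stand-ins for the paper's CCF layer, not its exact design.

```python
# Temporal-channel attention sketch for a spike tensor of shape (B, T, C, H, W).
import torch
import torch.nn as nn

class TemporalChannelAttention(nn.Module):
    def __init__(self, time_steps, channels, kernel_size=3):
        super().__init__()
        pad = kernel_size // 2
        self.temporal_conv = nn.Conv1d(channels, channels, kernel_size, padding=pad)
        self.channel_conv = nn.Conv1d(time_steps, time_steps, kernel_size, padding=pad)

    def forward(self, spikes):                     # spikes: (B, T, C, H, W)
        squeezed = spikes.mean(dim=(3, 4))         # (B, T, C) average matrix
        t_att = self.temporal_conv(squeezed.transpose(1, 2)).transpose(1, 2)  # along T
        c_att = self.channel_conv(squeezed)        # along C
        att = torch.sigmoid(t_att * c_att)         # (B, T, C) joint attention
        return spikes * att[..., None, None]       # reweight the spike stream

x = (torch.rand(2, 8, 16, 32, 32) > 0.8).float()   # toy binary spike tensor
y = TemporalChannelAttention(time_steps=8, channels=16)(x)
print(y.shape)                                     # torch.Size([2, 8, 16, 32, 32])
```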
+
+ comment: Accepted by IEEE Transactions on Neural Networks and Learning Systems +
+
+
+
+
+ + ♻ ☆ ShapeFormer: Shape Prior Visible-to-Amodal Transformer-based Amodal + Instance Segmentation IJCNN2024 + + +
+ Amodal Instance Segmentation (AIS) presents a challenging task as it involves
+predicting both visible and occluded parts of objects within images. Existing
+AIS methods rely on a bidirectional approach, encompassing both the transition
+from amodal features to visible features (amodal-to-visible) and from visible
+features to amodal features (visible-to-amodal). Our observation shows that
+utilizing amodal features through the amodal-to-visible transition can confuse
+the visible features, because the extra information from occluded/hidden
+segments is not present in the visible display. Consequently, this compromises
+the quality of the visible features in the subsequent visible-to-amodal
+transition. To tackle this issue, we introduce ShapeFormer, a decoupled
+Transformer-based model with a visible-to-amodal transition. It facilitates the
+explicit relationship between output segmentations and avoids the need for
+amodal-to-visible transitions. ShapeFormer comprises three key modules: (i) a
+Visible-Occluding Mask Head for predicting visible segmentation with occlusion
+awareness, (ii) a Shape-Prior Amodal Mask Head for predicting amodal and
+occluded masks, and (iii) a Category-Specific Shape Prior Retriever that
+provides shape prior knowledge. Comprehensive experiments and extensive
+ablation studies across various AIS benchmarks demonstrate the effectiveness of
+our ShapeFormer. The code is available at:
+\url{https://github.com/UARK-AICV/ShapeFormer}
+

+
+ comment: Accepted to IJCNN2024 +
+
+
+
+
+ + ♻ ☆ Hybrid Functional Maps for Crease-Aware Non-Isometric Shape Matching CVPR 2024 + + +
+ Non-isometric shape correspondence remains a fundamental challenge in +computer vision. Traditional methods using Laplace-Beltrami operator (LBO) +eigenmodes face limitations in characterizing high-frequency extrinsic shape +changes like bending and creases. We propose a novel approach of combining the +non-orthogonal extrinsic basis of eigenfunctions of the elastic thin-shell +hessian with the intrinsic ones of the LBO, creating a hybrid spectral space in +which we construct functional maps. To this end, we present a theoretical +framework to effectively integrate non-orthogonal basis functions into +descriptor- and learning-based functional map methods. Our approach can be +incorporated easily into existing functional map pipelines across varying +applications and is able to handle complex deformations beyond isometries. We +show extensive evaluations across various supervised and unsupervised settings +and demonstrate significant improvements. Notably, our approach achieves up to +15% better mean geodesic error for non-isometric correspondence settings and up +to 45% improvement in scenarios with topological noise. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SuperPrimitive: Scene Reconstruction at a Primitive Level CVPR2024 + + +
+ Joint camera pose and dense geometry estimation from a set of images or a +monocular video remains a challenging problem due to its computational +complexity and inherent visual ambiguities. Most dense incremental +reconstruction systems operate directly on image pixels and solve for their 3D +positions using multi-view geometry cues. Such pixel-level approaches suffer +from ambiguities or violations of multi-view consistency (e.g. caused by +textureless or specular surfaces). + We address this issue with a new image representation which we call a +SuperPrimitive. SuperPrimitives are obtained by splitting images into +semantically correlated local regions and enhancing them with estimated surface +normal directions, both of which are predicted by state-of-the-art single image +neural networks. This provides a local geometry estimate per SuperPrimitive, +while their relative positions are adjusted based on multi-view observations. + We demonstrate the versatility of our new representation by addressing three +3D reconstruction tasks: depth completion, few-view structure from motion, and +monocular dense visual odometry. + +
+
+ comment: CVPR2024. Project Page: https://makezur.github.io/SuperPrimitive/ +
+
+
+
+
+ + ♻ ☆ Leveraging Foundation Models for Content-Based Medical Image Retrieval + in Radiology + + +
+ Content-based image retrieval (CBIR) has the potential to significantly +improve diagnostic aid and medical research in radiology. Current CBIR systems +face limitations due to their specialization to certain pathologies, limiting +their utility. In response, we propose using vision foundation models as +powerful and versatile off-the-shelf feature extractors for content-based +medical image retrieval. By benchmarking these models on a comprehensive +dataset of 1.6 million 2D radiological images spanning four modalities and 161 +pathologies, we identify weakly-supervised models as superior, achieving a P@1 +of up to 0.594. This performance not only competes with a specialized model but +does so without the need for fine-tuning. Our analysis further explores the +challenges in retrieving pathological versus anatomical structures, indicating +that accurate retrieval of pathological features presents greater difficulty. +Despite these challenges, our research underscores the vast potential of +foundation models for CBIR in radiology, proposing a shift towards versatile, +general-purpose medical image retrieval systems that do not require specific +tuning. + +
+
+
+
+
+ + ♻ ☆ Influencer Backdoor Attack on Semantic Segmentation + + +
+ When a small number of poisoned samples are injected into the training
+dataset of a deep neural network, the network can be induced to exhibit
+malicious behavior during inferences, which poses potential threats to
+real-world applications. While they have been intensively studied in
+classification, backdoor attacks on semantic segmentation have been largely
+overlooked. Unlike classification, semantic segmentation aims to classify every
+pixel within a given image. In this work, we explore backdoor attacks on
+segmentation models to misclassify all pixels of a victim class by injecting a
+specific trigger on non-victim pixels during inferences, which is dubbed
+Influencer Backdoor Attack (IBA). IBA is expected to maintain the
+classification accuracy of non-victim pixels and mislead classifications of all
+victim pixels in every single inference and could be easily applied to
+real-world scenes. Based on the context aggregation ability of segmentation
+models, we propose a simple yet effective Nearest-Neighbor trigger injection
+strategy. We also introduce an innovative Pixel Random Labeling strategy which
+maintains optimal performance even when the trigger is placed far from the
+victim pixels. Our extensive experiments reveal that current segmentation
+models do suffer from backdoor attacks, demonstrate IBA's real-world
+applicability, and show that our proposed techniques can further increase
+attack performance.
+

+
+
+
+
+ + ♻ ☆ The LuViRA Dataset: Measurement Description ICRA 2024 + + +
+ We present a dataset to evaluate localization algorithms, which utilizes +vision, audio, and radio sensors: the Lund University Vision, Radio, and Audio +(LuViRA) Dataset. The dataset includes RGB images, corresponding depth maps, +IMU readings, channel response between a massive MIMO channel sounder and a +user equipment, audio recorded by 12 microphones, and 0.5 mm accurate 6DoF pose +ground truth. We synchronize these sensors to make sure that all data are +recorded simultaneously. A camera, speaker, and transmit antenna are placed on +top of a slowly moving service robot and 88 trajectories are recorded. Each +trajectory includes 20 to 50 seconds of recorded sensor data and ground truth +labels. The data from different sensors can be used separately or jointly to +conduct localization tasks and a motion capture system is used to verify the +results obtained by the localization algorithms. The main aim of this dataset +is to enable research on fusing the most commonly used sensors for localization +tasks. However, the full dataset or some parts of it can also be used for other +research areas such as channel estimation, image classification, etc. Fusing +sensor data can lead to increased localization accuracy and reliability, as +well as decreased latency and power consumption. The created dataset will be +made public at a later date. + +
+
+ comment: 7 pages, 7 figures, Accepted to ICRA 2024 +
+
+
+
+
+ + ♻ ☆ ChatCAD+: Towards a Universal and Reliable Interactive CAD using LLMs + + +
+ The integration of Computer-Aided Diagnosis (CAD) with Large Language Models
+(LLMs) presents a promising frontier in clinical applications, notably in
+automating diagnostic processes akin to those performed by radiologists and
+providing consultations similar to a virtual family doctor. Despite the
+promising potential of this integration, current works face at least two
+limitations: (1) From the perspective of a radiologist, existing studies
+typically have a restricted scope of applicable imaging domains, failing to
+meet the diagnostic needs of different patients. Also, the insufficient
+diagnostic capability of LLMs further undermines the quality and reliability of
+the generated medical reports. (2) Current LLMs lack the requisite depth in
+medical expertise, rendering them less effective as virtual family doctors due
+to the potential unreliability of the advice provided during patient
+consultations. To address these limitations, we introduce ChatCAD+, designed to
+be universal and reliable. Specifically, it features two main modules: (1)
+Reliable Report Generation and (2) Reliable Interaction. The Reliable Report
+Generation module is capable of interpreting medical images from diverse
+domains and generating high-quality medical reports via our proposed
+hierarchical in-context learning. Concurrently, the interaction module
+leverages up-to-date information from reputable medical websites to provide
+reliable medical advice. Together, these designed modules synergize to closely
+align with the expertise of human medical professionals, offering enhanced
+consistency and reliability for interpretation and advice. The source code is
+available at https://github.com/zhaozh10/ChatCAD.
+
+
+ comment: Authors Zihao Zhao, Sheng Wang, Jinchen Gu, Yitao Zhu contributed + equally to this work and should be considered co-first authors +
+
+
+
+
+ + ♻ ☆ ECoDepth: Effective Conditioning of Diffusion Models for Monocular Depth + Estimation CVPR + + +
+ In the absence of parallax cues, a learning-based single image depth
+estimation (SIDE) model relies heavily on shading and contextual cues in the
+image. While this simplicity is attractive, it is necessary to train such
+models on large and varied datasets, which are difficult to capture. It has
+been shown that using embeddings from pre-trained foundational models, such as
+CLIP, improves zero-shot transfer in several applications. Taking inspiration
+from this, in our paper we explore the use of global image priors generated
+from a pre-trained ViT model to provide more detailed contextual information.
+We argue that the embedding vector from a ViT model, pre-trained on a large
+dataset, captures more relevant information for SIDE than the usual route of
+generating pseudo image captions, followed by CLIP-based text embeddings. Based
+on this idea, we propose a new SIDE model using a diffusion backbone which is
+conditioned on ViT embeddings. Our proposed design establishes a new
+state-of-the-art (SOTA) for SIDE on the NYUv2 dataset, achieving an Abs Rel
+error of 0.059 (a 14% improvement) compared to 0.069 by the current SOTA (VPD).
+On the KITTI dataset, it achieves an Sq Rel error of 0.139 (a 2% improvement)
+compared to 0.142 by the current SOTA (GEDepth). For zero-shot transfer with a
+model trained on NYUv2, we report mean relative improvements of (20%, 23%, 81%,
+25%) over NeWCRFs on the (Sun-RGBD, iBims1, DIODE, HyperSim) datasets, compared
+to (16%, 18%, 45%, 9%) by ZoeDepth. The project page is available at
+https://ecodepth-iitd.github.io
+
+
+ comment: IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) + 2024 +
+
+
+
+
+ + ♻ ☆ Distance and Collision Probability Estimation from Gaussian Surface + Models + + +
+ This paper describes continuous-space methodologies to estimate the collision
+probability, Euclidean distance and gradient between an ellipsoidal robot model
+and an environment surface modeled as a set of Gaussian distributions.
+Continuous-space collision probability estimation is critical for
+uncertainty-aware motion planning. Most collision detection and avoidance
+approaches assume the robot is modeled as a sphere, but ellipsoidal
+representations provide tighter approximations and enable navigation in
+cluttered and narrow spaces. State-of-the-art methods derive the Euclidean
+distance and gradient by processing raw point clouds, which is computationally
+expensive for large workspaces. Recent advances in Gaussian surface modeling
+(e.g. mixture models, splatting) enable compressed and high-fidelity surface
+representations. Few methods exist to estimate continuous-space occupancy from
+such models. They require Gaussians to model free space and are unable to
+estimate the collision probability, Euclidean distance and gradient for an
+ellipsoidal robot. The proposed methods bridge this gap by extending prior work
+in ellipsoid-to-ellipsoid Euclidean distance and collision probability
+estimation to Gaussian surface models. A geometric blending approach is also
+proposed to improve collision probability estimation. The approaches are
+evaluated with numerical 2D and 3D experiments using real-world point cloud
+data. Methods for efficient calculation of these quantities are demonstrated to
+execute within a few microseconds per ellipsoid pair using a single thread on
+the low-power CPUs of modern embedded computers.
+
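+ For intuition only, a crude approximation of the per-component collision
+ probability can be sketched by treating the robot ellipsoid as a Gaussian,
+ combining covariances, and scoring the squared Mahalanobis distance between
+ the centers (this is a generic heuristic, not the paper's estimator):
+
+    import numpy as np
+    from scipy.stats import chi2
+
+    def collision_probability(robot_center, robot_shape, gauss_mean, gauss_cov):
+        """Higher when the robot center is close to the Gaussian component."""
+        d = np.asarray(robot_center) - np.asarray(gauss_mean)
+        combined = np.asarray(robot_shape) + np.asarray(gauss_cov)
+        m2 = float(d @ np.linalg.solve(combined, d))  # squared Mahalanobis
+        return float(1.0 - chi2.cdf(m2, df=len(d)))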
+
+
+
+
+ + ♻ ☆ MISC: Ultra-low Bitrate Image Semantic Compression Driven by Large + Multimodal Model + + +
+ With the evolution of storage and communication protocols, ultra-low bitrate
+image compression has become a highly demanding topic. However, existing
+compression algorithms must sacrifice either consistency with the ground truth
+or perceptual quality at ultra-low bitrate. In recent years, the rapid
+development of the Large Multimodal Model (LMM) has made it possible to balance
+these two goals. To solve this problem, this paper proposes a method called
+Multimodal Image Semantic Compression (MISC), which consists of an LMM encoder
+for extracting the semantic information of the image, a map encoder that
+locates the regions corresponding to the semantics, an image encoder that
+generates an extremely compressed bitstream, and a decoder that reconstructs
+the image based on the above information. Experimental results show that our
+proposed MISC is suitable for compressing both traditional Natural Sense Images
+(NSIs) and emerging AI-Generated Image (AIGI) content. It can achieve optimal
+consistency and perception results while saving 50% of the bitrate, which gives
+it strong potential for applications in the next generation of storage and
+communication. The code will be released on https://github.com/lcysyzxdxc/MISC.
+
+
+ comment: 13 pages, 11 figures, 4 tables
+
+
+
+
+
+ + ♻ ☆ EgoPlan-Bench: Benchmarking Egocentric Embodied Planning with Multimodal + Large Language Models + + +
+ Multimodal Large Language Models, combining the remarkable reasoning and +generalization capabilities of Large Language Models (LLMs) with the ability to +comprehend visual inputs, have opened up new avenues for embodied task +planning. Given diverse environmental inputs, including real-time task +progress, visual observations, and open-form language instructions, a +proficient task planner is expected to predict feasible actions, which is a +feat inherently achievable by Multimodal Large Language Models (MLLMs). In this +paper, we aim to quantitatively investigate the potential of MLLMs as embodied +task planners in real-world scenarios by introducing a benchmark with human +annotations named EgoPlan-Bench. Our benchmark is distinguished by realistic +tasks derived from real-world videos, a diverse set of actions involving +interactions with hundreds of different objects, and complex visual +observations from varied scenes. We evaluate a wide range of MLLMs, revealing +that these models have not yet evolved into embodied planning generalists (even +GPT-4V). We further construct an instruction-tuning dataset EgoPlan-IT from +videos with human-object interactions, to facilitate the learning of high-level +task planning in intricate real-world situations. The experiment results +demonstrate that the model tuned on EgoPlan-IT not only significantly improves +performance on our benchmark, but can also be applied as a task planner for +guiding embodied agents in simulations. + +
+
+ comment: Project released at: https://github.com/ChenYi99/EgoPlan +
+
+
+
+
+ + ♻ ☆ Deepfake detection by exploiting surface anomalies: the SurFake approach + + +
+ The ever-increasing use of synthetically generated content in different
+sectors of our everyday life, above all in media information, poses a strong
+need for deepfake detection tools in order to avoid the proliferation of
+altered messages. The process to identify manipulated content, in particular
+images and videos, is basically performed by looking for the presence of some
+inconsistencies and/or anomalies specifically due to the fake generation
+process. Different techniques exist in the scientific literature that exploit
+diverse ad-hoc features in order to highlight possible modifications. In this
+paper, we propose to investigate how deepfake creation can impact the
+characteristics that the whole scene had at the time of the acquisition. In
+particular, when an image (video) is captured, the overall geometry of the
+scene (e.g. surfaces) and the acquisition process (e.g. illumination) determine
+a univocal environment that is directly represented by the image pixel values;
+all these intrinsic relations are possibly changed by the deepfake generation
+process. By resorting to the analysis of the characteristics of the surfaces
+depicted in the image, it is possible to obtain a descriptor usable to train a
+CNN for deepfake detection: we refer to such an approach as SurFake.
+Experimental results carried out on the FF++ dataset for different kinds of
+deepfake forgeries and diverse deep learning models confirm that such a feature
+can be adopted to discriminate between pristine and altered images;
+furthermore, experiments show that it can also be combined with visual data to
+provide a measurable improvement in terms of detection accuracy.
+
+
+
+
+
+ + ♻ ☆ High-throughput Visual Nano-drone to Nano-drone Relative Localization + using Onboard Fully Convolutional Networks ICRA 2024 + + +
+ Relative drone-to-drone localization is a fundamental building block for any
+swarm operation. We address this task in the context of miniaturized
+nano-drones, i.e., 10cm in diameter, which attract ever-growing interest due to
+novel use cases enabled by their reduced form factor. The price of their
+versatility is limited onboard resources, i.e., sensors, processing units, and
+memory, which limits the complexity of the onboard algorithms. A traditional
+solution to overcome these limitations is represented by lightweight deep
+learning models directly deployed aboard nano-drones. This work tackles the
+challenging relative pose estimation between nano-drones using only a
+gray-scale low-resolution camera and an ultra-low-power System-on-Chip (SoC)
+hosted onboard. We present a vertically integrated system based on a novel
+vision-based fully convolutional neural network (FCNN), which runs at 39Hz
+within 101mW onboard a Crazyflie nano-drone extended with the GWT GAP8 SoC. We
+compare our FCNN against three State-of-the-Art (SoA) systems. Considering the
+best-performing SoA approach, our model results in an R-squared improvement
+from 32 to 47% on the horizontal image coordinate and from 18 to 55% on the
+vertical image coordinate, on a real-world dataset of 30k images. Finally, our
+in-field tests show a reduction of the average tracking error of 37% compared
+to a previous SoA work and an endurance performance up to the entire battery
+lifetime of 4 minutes.
+
+
+ comment: ICRA 2024, IEEE Conference +
+
+
+
+
+ + ♻ ☆ Exploring Missing Modality in Multimodal Egocentric Datasets + + +
+ Multimodal video understanding is crucial for analyzing egocentric videos,
+where integrating multiple sensory signals significantly enhances action
+recognition and moment localization. However, practical applications often
+grapple with incomplete modalities due to factors like privacy concerns,
+efficiency demands, or hardware malfunctions. Addressing this, our study delves
+into the impact of missing modalities on egocentric action recognition,
+particularly within transformer-based models. We introduce a novel concept, the
+Missing Modality Token (MMT), to maintain performance even when modalities are
+absent, a strategy that proves effective in the Ego4D, Epic-Kitchens, and
+Epic-Sounds datasets. Our method mitigates the performance loss, reducing it
+from its original $\sim 30\%$ drop to only $\sim 10\%$ when half of the test
+set is modal-incomplete. Through extensive experimentation, we demonstrate the
+adaptability of MMT to different training scenarios and its superiority in
+handling missing modalities compared to current methods. Our research
+contributes a comprehensive analysis and an innovative approach, opening
+avenues for more resilient multimodal systems in real-world settings.
+
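+ A minimal sketch of the missing-modality-token idea, assuming a learnable
+ placeholder is substituted for the tokens of an absent modality before fusion
+ (class and parameter names are illustrative, not the paper's implementation):
+
+    from typing import Optional
+    import torch
+    import torch.nn as nn
+
+    class MissingModalityToken(nn.Module):
+        def __init__(self, embed_dim: int):
+            super().__init__()
+            self.token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+            nn.init.trunc_normal_(self.token, std=0.02)
+
+        def forward(self, feats: Optional[torch.Tensor], batch: int, seq: int):
+            if feats is not None:                      # modality present
+                return feats
+            return self.token.expand(batch, seq, -1)   # modality missing
+
+    mmt = MissingModalityToken(embed_dim=256)
+    video_tokens = torch.randn(2, 16, 256)
+    audio_tokens = mmt(None, batch=2, seq=16)          # audio stream unavailable
+    fused = torch.cat([video_tokens, audio_tokens], dim=1)  # fed to a transformer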
+
+
+
+
+ + ♻ ☆ Video shutter angle estimation using optical flow and linear blur + + +
+ We present a method for estimating the shutter angle, a.k.a. exposure
+fraction, i.e., the ratio of the exposure time to the reciprocal of the frame
+rate, of video clips containing motion. The approach exploits the relation
+between the exposure fraction, optical flow, and linear motion blur. Robustness
+is achieved by selecting image patches where both the optical flow and blur
+estimates are reliable and checking their consistency. The method was evaluated
+on the publicly available Beam-Splitter Dataset with a range of exposure
+fractions from 0.015 to 0.36. The best achieved mean absolute error of
+estimates was 0.039. We successfully test the suitability of the method for a
+forensic application: detecting video tampering by frame removal or insertion.
+
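+ The underlying relation is that the motion blur extent is roughly the
+ per-frame optical flow scaled by the exposure fraction, so the fraction can be
+ recovered as a ratio. A toy sketch, assuming per-patch blur lengths and flow
+ magnitudes (in pixels) are already estimated:
+
+    import numpy as np
+
+    def exposure_fraction(blur_len_px, flow_mag_px, min_flow=1.0):
+        """Median of blur/flow ratios over patches with enough motion."""
+        blur = np.asarray(blur_len_px, dtype=float)
+        flow = np.asarray(flow_mag_px, dtype=float)
+        ok = flow > min_flow                  # avoid dividing by tiny motions
+        ratios = blur[ok] / flow[ok]
+        return float(np.clip(np.median(ratios), 0.0, 1.0))
+
+    # patches moving ~12 px/frame with ~3 px blur -> exposure fraction ~0.25
+    print(exposure_fraction([3.1, 2.9, 3.0], [12.0, 11.5, 12.4]))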
+
+
+
+
+ + ♻ ☆ D$^2$ST-Adapter: Disentangled-and-Deformable Spatio-Temporal Adapter for + Few-shot Action Recognition + + +
+ Adapting large pre-trained image models to few-shot action recognition has +proven to be an effective and efficient strategy for learning robust feature +extractors, which is essential for few-shot learning. Typical fine-tuning based +adaptation paradigm is prone to overfitting in the few-shot learning scenarios +and offers little modeling flexibility for learning temporal features in video +data. In this work we present the Disentangled-and-Deformable Spatio-Temporal +Adapter (D$^2$ST-Adapter), which is a novel adapter tuning framework +well-suited for few-shot action recognition due to lightweight design and low +parameter-learning overhead. It is designed in a dual-pathway architecture to +encode spatial and temporal features in a disentangled manner. In particular, +we devise the anisotropic Deformable Spatio-Temporal Attention module as the +core component of D$^2$ST-Adapter, which can be tailored with anisotropic +sampling densities along spatial and temporal domains to learn spatial and +temporal features specifically in corresponding pathways, allowing our +D$^2$ST-Adapter to encode features in a global view in 3D spatio-temporal space +while maintaining a lightweight design. Extensive experiments with +instantiations of our method on both pre-trained ResNet and ViT demonstrate the +superiority of our method over state-of-the-art methods for few-shot action +recognition. Our method is particularly well-suited to challenging scenarios +where temporal dynamics are critical for action recognition. + +
+
+
+
+
+ + ♻ ☆ Bridging the Gap: Learning Pace Synchronization for Open-World + Semi-Supervised Learning + + +
+ In open-world semi-supervised learning, a machine learning model is tasked +with uncovering novel categories from unlabeled data while maintaining +performance on seen categories from labeled data. The central challenge is the +substantial learning gap between seen and novel categories, as the model learns +the former faster due to accurate supervisory information. Moreover, capturing +the semantics of unlabeled novel category samples is also challenging due to +the missing label information. To address the above issues, we introduce 1) the +adaptive synchronizing marginal loss which imposes class-specific negative +margins to alleviate the model bias towards seen classes, and 2) the +pseudo-label contrastive clustering which exploits pseudo-labels predicted by +the model to group unlabeled data from the same category together in the output +space. Extensive experiments on benchmark datasets demonstrate that previous +approaches may significantly hinder novel class learning, whereas our method +strikingly balances the learning pace between seen and novel classes, achieving +a remarkable 3% average accuracy increase on the ImageNet dataset. Importantly, +we find that fine-tuning the self-supervised pre-trained model significantly +boosts the performance, which is overlooked in prior literature. Our code is +available at https://github.com/yebo0216best/LPS-main. + +
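+ As a rough illustration of the class-specific margin idea (sign convention,
+ margin values, and scheduling are assumptions, not the paper's exact loss):
+
+    import torch
+    import torch.nn.functional as F
+
+    def margin_cross_entropy(logits, targets, class_margins):
+        """Cross-entropy with a per-class margin subtracted from the target
+        logit; seen classes get a negative margin, novel classes zero."""
+        adjusted = logits.clone()
+        rows = torch.arange(logits.size(0))
+        adjusted[rows, targets] -= class_margins[targets]
+        return F.cross_entropy(adjusted, targets)
+
+    # hypothetical: classes 0-4 are seen (margin -0.5), classes 5-9 are novel
+    margins = torch.tensor([-0.5] * 5 + [0.0] * 5)
+    logits, targets = torch.randn(8, 10), torch.randint(0, 10, (8,))
+    loss = margin_cross_entropy(logits, targets, margins)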
+
+
+
+
+ + ♻ ☆ Do Counterfactual Examples Complicate Adversarial Training? CVPR'24 + + +
+ We leverage diffusion models to study the robustness-performance tradeoff of +robust classifiers. Our approach introduces a simple, pretrained diffusion +method to generate low-norm counterfactual examples (CEs): semantically altered +data which results in different true class membership. We report that the +confidence and accuracy of robust models on their clean training data are +associated with the proximity of the data to their CEs. Moreover, robust models +perform very poorly when evaluated on the CEs directly, as they become +increasingly invariant to the low-norm, semantic changes brought by CEs. The +results indicate a significant overlap between non-robust and semantic +features, countering the common assumption that non-robust features are not +interpretable. + +
+
+ comment: Accepted as a short paper to the GCV Workshop at CVPR'24 +
+
+
+
+
+ + ♻ ☆ ODM: A Text-Image Further Alignment Pre-training Approach for Scene Text + Detection and Spotting CVPR2024 + + +
+ In recent years, text-image joint pre-training techniques have shown +promising results in various tasks. However, in Optical Character Recognition +(OCR) tasks, aligning text instances with their corresponding text regions in +images poses a challenge, as it requires effective alignment between text and +OCR-Text (referring to the text in images as OCR-Text to distinguish from the +text in natural language) rather than a holistic understanding of the overall +image content. In this paper, we propose a new pre-training method called +OCR-Text Destylization Modeling (ODM) that transfers diverse styles of text +found in images to a uniform style based on the text prompt. With ODM, we +achieve better alignment between text and OCR-Text and enable pre-trained +models to adapt to the complex and diverse styles of scene text detection and +spotting tasks. Additionally, we have designed a new labeling generation method +specifically for ODM and combined it with our proposed Text-Controller module +to address the challenge of annotation costs in OCR tasks, allowing a larger +amount of unlabeled data to participate in pre-training. Extensive experiments +on multiple public datasets demonstrate that our method significantly improves +performance and outperforms current pre-training methods in scene text +detection and spotting tasks. Code is available at +https://github.com/PriNing/ODM. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ ICSVR: Investigating Compositional and Syntactic Understanding in Video + Retrieval Models + + +
+ Video retrieval (VR) involves retrieving the ground truth video from the
+video database given a text caption or vice-versa. The two important components
+of compositionality, objects & attributes and actions, are joined using correct
+syntax to form a proper text query. These components (objects & attributes,
+actions and syntax) each play an important role in helping distinguish among
+videos and retrieve the correct ground truth video. However, it is unclear what
+effect these components have on video retrieval performance. We therefore
+conduct a systematic study to evaluate the compositional and syntactic
+understanding of video retrieval models on standard benchmarks such as MSRVTT,
+MSVD and DIDEMO. The study is performed on two categories of video retrieval
+models: (i) models that are pre-trained on video-text pairs and fine-tuned on
+downstream video retrieval datasets (e.g., Frozen-in-Time, Violet, MCQ) and
+(ii) models that adapt pre-trained image-text representations like CLIP for
+video retrieval (e.g., CLIP4Clip, XCLIP, CLIP2Video). Our experiments reveal
+that actions and syntax play a minor role compared to objects & attributes in
+video understanding. Moreover, video retrieval models that use pre-trained
+image-text representations (CLIP) have better syntactic and compositional
+understanding as compared to models pre-trained on video-text data. The code is
+available at
+https://github.com/IntelLabs/multimodal_cognitive_ai/tree/main/ICSVR
+
+
+
+
+
+ + ♻ ☆ PE-MVCNet: Multi-view and Cross-modal Fusion Network for Pulmonary + Embolism Prediction + + +
+ The early detection of a pulmonary embolism (PE) is critical for enhancing +patient survival rates. Both image-based and non-image-based features are of +utmost importance in medical classification tasks. In a clinical setting, +physicians tend to rely on the contextual information provided by Electronic +Medical Records (EMR) to interpret medical imaging. However, very few models +effectively integrate clinical information with imaging data. To address this +shortcoming, we suggest a multimodal fusion methodology, termed PE-MVCNet, +which capitalizes on Computed Tomography Pulmonary Angiography imaging and EMR +data. This method comprises the Image-only module with an integrated multi-view +block, the EMR-only module, and the Cross-modal Attention Fusion (CMAF) module. +These modules cooperate to extract comprehensive features that subsequently +generate predictions for PE. We conducted experiments using the publicly +accessible Stanford University Medical Center dataset, achieving an AUROC of +94.1%, an accuracy rate of 90.2%, and an F1 score of 90.6%. Our proposed model +outperforms existing methodologies, corroborating that our multimodal fusion +model excels compared to models that use a single data modality. Our source +code is available at https://github.com/LeavingStarW/PE-MVCNET. + +
+
+
+
+
+ + ♻ ☆ One-Prompt to Segment All Medical Images + + +
+ Large foundation models, known for their strong zero-shot generalization,
+have excelled in visual and language applications. However, applying them to
+medical image segmentation, a domain with diverse imaging types and target
+labels, remains an open challenge. Current approaches, such as adapting
+interactive segmentation models like Segment Anything Model (SAM), require user
+prompts for each sample during inference. Alternatively, transfer learning
+methods like few/one-shot models demand labeled samples, leading to high costs.
+This paper introduces a new paradigm toward universal medical image
+segmentation, termed 'One-Prompt Segmentation.' One-Prompt Segmentation
+combines the strengths of one-shot and interactive methods. In the inference
+stage, with just \textbf{one prompted sample}, it can adeptly handle the unseen
+task in a single forward pass. We train One-Prompt Model on 64 open-source
+medical datasets, accompanied by the collection of over 3,000 clinician-labeled
+prompts. Tested on 14 previously unseen datasets, the One-Prompt Model
+showcases superior zero-shot segmentation capabilities, outperforming a wide
+range of related methods. The code and data are released at
+https://github.com/KidsWithTokens/one-prompt.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2304.12620 +
+
+
+
+
+ + ♻ ☆ AsymFormer: Asymmetrical Cross-Modal Representation Learning for Mobile + Platform Real-Time RGB-D Semantic Segmentation + + +
+ Understanding indoor scenes is crucial for urban studies. Considering the
+dynamic nature of indoor environments, effective semantic segmentation requires
+both real-time operation and high accuracy. To address this, we propose
+AsymFormer, a novel network that improves real-time semantic segmentation
+accuracy using RGB-D multi-modal information without substantially increasing
+network complexity. AsymFormer uses an asymmetrical backbone for multimodal
+feature extraction, reducing redundant parameters by optimizing computational
+resource distribution. To fuse asymmetric multimodal features, a Local
+Attention-Guided Feature Selection (LAFS) module is used to selectively fuse
+features from different modalities by leveraging their dependencies.
+Subsequently, a Cross-Modal Attention-Guided Feature Correlation Embedding
+(CMA) module is introduced to further extract cross-modal representations. The
+AsymFormer demonstrates competitive results with 54.1% mIoU on NYUv2 and 49.1%
+mIoU on SUNRGBD. Notably, AsymFormer achieves an inference speed of 65 FPS (79
+FPS after implementing mixed precision quantization) on RTX3090, demonstrating
+that AsymFormer can strike a balance between high accuracy and efficiency.
+
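+ For intuition, attention-guided selective fusion of RGB and depth features can
+ be sketched as a learned per-channel gating; this toy module is only a
+ stand-in for LAFS, not its actual design:
+
+    import torch
+    import torch.nn as nn
+
+    class AttentionGuidedFusion(nn.Module):
+        """Predict per-channel weights from pooled RGB and depth features and
+        blend the two modalities with a soft selection."""
+        def __init__(self, channels: int):
+            super().__init__()
+            self.gate = nn.Sequential(
+                nn.Linear(2 * channels, channels),
+                nn.ReLU(inplace=True),
+                nn.Linear(channels, 2 * channels),
+            )
+
+        def forward(self, rgb, depth):
+            b, c, _, _ = rgb.shape
+            pooled = torch.cat([rgb.mean(dim=(2, 3)), depth.mean(dim=(2, 3))], dim=1)
+            w = self.gate(pooled).view(b, 2, c).softmax(dim=1)  # weights sum to 1
+            return w[:, 0, :, None, None] * rgb + w[:, 1, :, None, None] * depth
+
+    rgb, depth = torch.randn(1, 64, 30, 40), torch.randn(1, 64, 30, 40)
+    fused = AttentionGuidedFusion(64)(rgb, depth)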
+
+
+
+
+ + ♻ ☆ A2XP: Towards Private Domain Generalization CVPR 2024 + + +
+ Deep Neural Networks (DNNs) have become pivotal in various fields, especially +in computer vision, outperforming previous methodologies. A critical challenge +in their deployment is the bias inherent in data across different domains, such +as image style and environmental conditions, leading to domain gaps. This +necessitates techniques for learning general representations from biased +training data, known as domain generalization. This paper presents Attend to +eXpert Prompts (A2XP), a novel approach for domain generalization that +preserves the privacy and integrity of the network architecture. A2XP consists +of two phases: Expert Adaptation and Domain Generalization. In the first phase, +prompts for each source domain are optimized to guide the model towards the +optimal direction. In the second phase, two embedder networks are trained to +effectively amalgamate these expert prompts, aiming for an optimal output. Our +extensive experiments demonstrate that A2XP achieves state-of-the-art results +over existing non-private domain generalization methods. The experimental +results validate that the proposed approach not only tackles the domain +generalization challenge in DNNs but also offers a privacy-preserving, +efficient solution to the broader field of computer vision. + +
+
+ comment: Accepted to CVPR 2024. Our code is available at + https://github.com/AIRLABkhu/A2XP +
+
+
+
+
+ + ♻ ☆ T$^3$Bench: Benchmarking Current Progress in Text-to-3D Generation + + +
+ Recent methods in text-to-3D leverage powerful pretrained diffusion models to
+optimize NeRF. Notably, these methods are able to produce high-quality 3D
+scenes without training on 3D data. Due to the open-ended nature of the task,
+most studies evaluate their results with subjective case studies and user
+experiments, thereby presenting a challenge in quantitatively addressing the
+question: how far has current progress in text-to-3D come? In this paper, we
+introduce T$^3$Bench, the first comprehensive text-to-3D benchmark containing
+diverse text prompts of three increasing complexity levels that are specially
+designed for 3D generation. To assess both the subjective quality and the text
+alignment, we propose two automatic metrics based on multi-view images produced
+by the 3D contents. The quality metric combines multi-view text-image scores
+and regional convolution to detect quality and view inconsistency. The
+alignment metric uses multi-view captioning and GPT-4 evaluation to measure
+text-3D consistency. Both metrics closely correlate with different dimensions
+of human judgments, providing a paradigm for efficiently evaluating text-to-3D
+models. The benchmarking results, shown in Fig. 1, reveal performance
+differences among 10 prevalent text-to-3D methods. Our analysis further
+highlights the common struggles for current methods on generating surroundings
+and multi-object scenes, as well as the bottleneck of leveraging 2D guidance
+for 3D generation. Our project page is available at: https://t3bench.com.
+
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ MV-CLIP: Multi-View CLIP for Zero-shot 3D Shape Recognition + + +
+ Large-scale pre-trained models have demonstrated impressive performance in
+vision and language tasks within open-world scenarios. Due to the lack of
+comparable pre-trained models for 3D shapes, recent methods utilize
+language-image pre-training to realize zero-shot 3D shape recognition. However,
+due to the modality gap, pretrained language-image models do not generalize
+confidently to 3D shape recognition. Consequently, this paper aims to improve
+the confidence with view selection and hierarchical prompts. Leveraging the
+CLIP model as an example, we employ view selection on the vision side by
+identifying views with high prediction confidence from multiple rendered views
+of a 3D shape. On the textual side, the strategy of hierarchical prompts is
+proposed for the first time. The first layer proposes several classification
+candidates with traditional class-level descriptions, while the second layer
+refines the prediction based on function-level descriptions or further
+distinctions between the candidates. Remarkably, without the need for
+additional training, our proposed method achieves impressive zero-shot 3D
+classification accuracies of 84.44%, 91.51%, and 66.17% on ModelNet40,
+ModelNet10, and ShapeNet Core55, respectively. Furthermore, we will make the
+code publicly available to facilitate reproducibility and further research in
+this area.
+
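+ A small sketch of confidence-based view selection, assuming per-view zero-shot
+ class logits have already been computed (the CLIP scoring itself is omitted
+ and the fusion rule is a placeholder):
+
+    import torch
+
+    def select_confident_views(view_logits, k=3):
+        """Keep the k views with the highest max softmax probability and
+        average their logits into a single prediction."""
+        probs = view_logits.softmax(dim=-1)       # (num_views, num_classes)
+        confidence = probs.max(dim=-1).values     # per-view confidence
+        top = confidence.topk(k).indices
+        return view_logits[top].mean(dim=0)       # fused class logits
+
+    fused = select_confident_views(torch.randn(12, 40), k=3)
+    pred = fused.argmax().item()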
+
+
+
+
+ + ♻ ☆ Dual Modalities of Text: Visual and Textual Generative Pre-training + + +
+ Harnessing visual texts represents a burgeoning frontier in the evolution of +language modeling. In this paper, we introduce a novel pre-training framework +for a suite of pixel-based autoregressive language models, pre-training on a +corpus of over 400 million documents rendered as RGB images. Our approach is +characterized by a dual-modality training regimen, engaging both visual data +through next patch prediction with a regression head and textual data via next +token prediction with a classification head. This study is particularly focused +on investigating the synergistic interplay between visual and textual +modalities of language. Our comprehensive evaluation across a diverse array of +benchmarks reveals that the confluence of visual and textual data substantially +augments the efficacy of pixel-based language models. Notably, our findings +show that a unidirectional pixel-based model, devoid of textual data during +training, can match the performance levels of advanced bidirectional +pixel-based models on various language understanding benchmarks. This work +highlights the considerable untapped potential of integrating visual and +textual information for language modeling purposes. We will release our code, +data, and checkpoints to inspire further research advancement. + +
+
+
+
+
+ + ♻ ☆ 3D Face Reconstruction with the Geometric Guidance of Facial Part + Segmentation CVPR2024 + + +
+ 3D Morphable Models (3DMMs) provide promising 3D face reconstructions in +various applications. However, existing methods struggle to reconstruct faces +with extreme expressions due to deficiencies in supervisory signals, such as +sparse or inaccurate landmarks. Segmentation information contains effective +geometric contexts for face reconstruction. Certain attempts intuitively depend +on differentiable renderers to compare the rendered silhouettes of +reconstruction with segmentation, which is prone to issues like local optima +and gradient instability. In this paper, we fully utilize the facial part +segmentation geometry by introducing Part Re-projection Distance Loss (PRDL). +Specifically, PRDL transforms facial part segmentation into 2D points and +re-projects the reconstruction onto the image plane. Subsequently, by +introducing grid anchors and computing different statistical distances from +these anchors to the point sets, PRDL establishes geometry descriptors to +optimize the distribution of the point sets for face reconstruction. PRDL +exhibits a clear gradient compared to the renderer-based methods and presents +state-of-the-art reconstruction performance in extensive quantitative and +qualitative experiments. Our project is available at +https://github.com/wang-zidu/3DDFA-V3 . + +
+
+ comment: CVPR2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ KDAS: Knowledge Distillation via Attention Supervision Framework for + Polyp Segmentation + + +
+ Polyp segmentation, a challenging problem in medical imaging, has seen
+numerous proposed methods aimed at improving the quality of segmented masks.
+While current state-of-the-art techniques yield impressive results, the size
+and computational cost of these models create challenges for practical industry
+applications. To address this challenge, we present KDAS, a Knowledge
+Distillation framework that incorporates attention supervision, and our
+proposed Symmetrical Guiding Module. This framework is designed to facilitate a
+compact student model with fewer parameters, allowing it to learn the strengths
+of the teacher model and mitigate the inconsistency between teacher features
+and student features, a common challenge in Knowledge Distillation, via the
+Symmetrical Guiding Module. Through extensive experiments, our compact models
+demonstrate their strength by achieving competitive results with
+state-of-the-art methods, offering a promising approach to creating compact
+models with high accuracy for polyp segmentation and the broader medical
+imaging field. The implementation is available on
+https://github.com/huyquoctrinh/KDAS.
+
+
+
+
+
+ + ♻ ☆ ConsistencyDet: A Robust Object Detector with a Denoising Paradigm of + Consistency Model + + +
+ Object detection, a quintessential task in the realm of perceptual computing, +can be tackled using a generative methodology. In the present study, we +introduce a novel framework designed to articulate object detection as a +denoising diffusion process, which operates on the perturbed bounding boxes of +annotated entities. This framework, termed ConsistencyDet, leverages an +innovative denoising concept known as the Consistency Model. The hallmark of +this model is its self-consistency feature, which empowers the model to map +distorted information from any temporal stage back to its pristine state, +thereby realizing a "one-step denoising" mechanism. Such an attribute markedly +elevates the operational efficiency of the model, setting it apart from the +conventional Diffusion Model. Throughout the training phase, ConsistencyDet +initiates the diffusion sequence with noise-infused boxes derived from the +ground-truth annotations and conditions the model to perform the denoising +task. Subsequently, in the inference stage, the model employs a denoising +sampling strategy that commences with bounding boxes randomly sampled from a +normal distribution. Through iterative refinement, the model transforms an +assortment of arbitrarily generated boxes into definitive detections. +Comprehensive evaluations employing standard benchmarks, such as MS-COCO and +LVIS, corroborate that ConsistencyDet surpasses other leading-edge detectors in +performance metrics. Our code is available at +https://github.com/Tankowa/ConsistencyDet. + +
+
+
+
+
+ + ♻ ☆ Weight Copy and Low-Rank Adaptation for Few-Shot Distillation of Vision + Transformers + + +
+ Few-shot knowledge distillation recently emerged as a viable approach to +harness the knowledge of large-scale pre-trained models, using limited data and +computational resources. In this paper, we propose a novel few-shot feature +distillation approach for vision transformers. Our approach is based on two key +steps. Leveraging the fact that vision transformers have a consistent +depth-wise structure, we first copy the weights from intermittent layers of +existing pre-trained vision transformers (teachers) into shallower +architectures (students), where the intermittence factor controls the +complexity of the student transformer with respect to its teacher. Next, we +employ an enhanced version of Low-Rank Adaptation (LoRA) to distill knowledge +into the student in a few-shot scenario, aiming to recover the information +processing carried out by the skipped teacher layers. We present comprehensive +experiments with supervised and self-supervised transformers as teachers, on +five data sets from various domains, including natural, medical and satellite +images. The empirical results confirm the superiority of our approach over +competitive baselines. Moreover, the ablation results demonstrate the +usefulness of each component of the proposed pipeline. + +
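+ The layer-copying step can be pictured as cloning every k-th transformer block
+ of the teacher into a shallower student; the sketch below uses generic PyTorch
+ encoder layers as stand-ins for the pre-trained ViT blocks:
+
+    import copy
+    import torch.nn as nn
+
+    def copy_intermittent_blocks(teacher_blocks, k):
+        """Clone every k-th teacher block; k is the intermittence factor that
+        controls the student depth."""
+        kept = [copy.deepcopy(teacher_blocks[i])
+                for i in range(0, len(teacher_blocks), k)]
+        return nn.ModuleList(kept)
+
+    # hypothetical 12-block teacher -> 4-block student (k = 3)
+    teacher = nn.ModuleList(
+        [nn.TransformerEncoderLayer(d_model=384, nhead=6, batch_first=True)
+         for _ in range(12)]
+    )
+    student_blocks = copy_intermittent_blocks(teacher, k=3)
+    assert len(student_blocks) == 4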
+
+
+
+
+ + ♻ ☆ Low-light Image Enhancement via CLIP-Fourier Guided Wavelet Diffusion + + +
+ Low-light image enhancement techniques have significantly progressed, but +unstable image quality recovery and unsatisfactory visual perception are still +significant challenges. To solve these problems, we propose a novel and robust +low-light image enhancement method via CLIP-Fourier Guided Wavelet Diffusion, +abbreviated as CFWD. Specifically, CFWD leverages multimodal visual-language +information in the frequency domain space created by multiple wavelet +transforms to guide the enhancement process. Multi-scale supervision across +different modalities facilitates the alignment of image features with semantic +features during the wavelet diffusion process, effectively bridging the gap +between degraded and normal domains. Moreover, to further promote the effective +recovery of the image details, we combine the Fourier transform based on the +wavelet transform and construct a Hybrid High Frequency Perception Module +(HFPM) with a significant perception of the detailed features. This module +avoids the diversity confusion of the wavelet diffusion process by guiding the +fine-grained structure recovery of the enhancement results to achieve +favourable metric and perceptually oriented enhancement. Extensive quantitative +and qualitative experiments on publicly available real-world benchmarks show +that our approach outperforms existing state-of-the-art methods, achieving +significant progress in image quality and noise suppression. The project code +is available at https://github.com/hejh8/CFWD. + +
+
+
+
+
+ + ♻ ☆ Diffusion Models Meet Remote Sensing: Principles, Methods, and + Perspectives + + +
+ As a newly emerging advance in deep generative models, diffusion models have +achieved state-of-the-art results in many fields, including computer vision, +natural language processing, and molecule design. The remote sensing community +has also noticed the powerful ability of diffusion models and quickly applied +them to a variety of tasks for image processing. Given the rapid increase in +research on diffusion models in the field of remote sensing, it is necessary to +conduct a comprehensive review of existing diffusion model-based remote sensing +papers, to help researchers recognize the potential of diffusion models and +provide some directions for further exploration. Specifically, this paper first +introduces the theoretical background of diffusion models, and then +systematically reviews the applications of diffusion models in remote sensing, +including image generation, enhancement, and interpretation. Finally, the +limitations of existing remote sensing diffusion models and worthy research +directions for further exploration are discussed and summarized. + +
+
+
+
+
+ + ♻ ☆ Representation Alignment Contrastive Regularization for Multi-Object + Tracking + + +
+ Achieving high performance in multi-object tracking algorithms heavily relies
+on modeling spatio-temporal relationships during the data association stage.
+Mainstream approaches encompass rule-based and deep learning-based methods for
+spatio-temporal relationship modeling. While the former relies on physical
+motion laws, offering wider applicability but yielding suboptimal results for
+complex object movements, the latter, though achieving high performance, lacks
+interpretability and involves complex module designs. This work aims to
+simplify deep learning-based spatio-temporal relationship models and introduce
+interpretability into features for data association. Specifically, a
+lightweight single-layer transformer encoder is utilized to model
+spatio-temporal relationships. To make features more interpretable, two
+contrastive regularization losses based on representation alignment are
+proposed, derived from spatio-temporal consistency rules. By applying weighted
+summation to affinity matrices, the aligned features can seamlessly integrate
+into the data association stage of the original tracking workflow. Experimental
+results showcase that our model enhances the majority of existing tracking
+networks' performance without excessive complexity, with minimal increase in
+training overhead and nearly negligible computational and storage costs.
+
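+ As a toy illustration of plugging aligned appearance features into data
+ association via a weighted sum of affinity matrices (the weight and the IoU
+ term here are placeholders, not the paper's exact formulation):
+
+    import numpy as np
+    from scipy.optimize import linear_sum_assignment
+
+    def fused_affinity(track_emb, det_emb, iou_matrix, alpha=0.5):
+        """Weighted sum of cosine appearance affinity and an IoU affinity."""
+        t = track_emb / np.linalg.norm(track_emb, axis=1, keepdims=True)
+        d = det_emb / np.linalg.norm(det_emb, axis=1, keepdims=True)
+        cosine = t @ d.T                          # (num_tracks, num_dets)
+        return alpha * cosine + (1.0 - alpha) * iou_matrix
+
+    aff = fused_affinity(np.random.rand(3, 128), np.random.rand(4, 128),
+                         np.random.rand(3, 4))
+    rows, cols = linear_sum_assignment(-aff)      # Hungarian matching, maximize affinity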
+
+
+
+
+ + ♻ ☆ OmniSSR: Zero-shot Omnidirectional Image Super-Resolution using Stable + Diffusion Model + + +
+ Omnidirectional images (ODIs) are commonly used in real-world visual tasks, +and high-resolution ODIs help improve the performance of related visual tasks. +Most existing super-resolution methods for ODIs use end-to-end learning +strategies, resulting in inferior realness of generated images and a lack of +effective out-of-domain generalization capabilities in training methods. Image +generation methods represented by diffusion model provide strong priors for +visual tasks and have been proven to be effectively applied to image +restoration tasks. Leveraging the image priors of the Stable Diffusion (SD) +model, we achieve omnidirectional image super-resolution with both fidelity and +realness, dubbed as OmniSSR. Firstly, we transform the equirectangular +projection (ERP) images into tangent projection (TP) images, whose distribution +approximates the planar image domain. Then, we use SD to iteratively sample +initial high-resolution results. At each denoising iteration, we further +correct and update the initial results using the proposed Octadecaplex Tangent +Information Interaction (OTII) and Gradient Decomposition (GD) technique to +ensure better consistency. Finally, the TP images are transformed back to +obtain the final high-resolution results. Our method is zero-shot, requiring no +training or fine-tuning. Experiments of our method on two benchmark datasets +demonstrate the effectiveness of our proposed method. + +
+
+
+
+
+ + ♻ ☆ Retina : Low-Power Eye Tracking with Event Camera and Spiking Hardware + + +
+ This paper introduces a neuromorphic methodology for eye tracking, harnessing
+pure event data captured by a Dynamic Vision Sensor (DVS) camera. The framework
+integrates a directly trained Spiking Neuron Network (SNN) regression model and
+leverages a state-of-the-art low-power edge neuromorphic processor, Speck,
+collectively aiming to advance the precision and efficiency of eye-tracking
+systems. First, we introduce a representative event-based eye-tracking dataset,
+"Ini-30", which was collected with two glass-mounted DVS cameras from thirty
+volunteers. Then, an SNN model based on Integrate-and-Fire (IAF) neurons, named
+"Retina", is described, featuring only 64k parameters (6.63x fewer than the
+latest) and achieving a pupil tracking error of only 3.24 pixels in a 64x64 DVS
+input. The continuous regression output is obtained by convolving a non-spiking
+temporal 1D filter across the output spiking layer. Finally, we evaluate Retina
+on the neuromorphic processor, showing an end-to-end power between 2.89 and 4.8
+mW and a latency of 5.57 to 8.01 ms depending on the time window. We also
+benchmark our model against the latest event-based eye-tracking method, "3ET",
+which was built upon event frames. Results show that Retina achieves superior
+precision with 1.24px less pupil centroid error and reduced computational
+complexity with 35 times fewer MAC operations. We hope this work will open
+avenues for further investigation of closed-loop neuromorphic solutions and
+true event-based training pursuing edge performance.
+
+
+
+
+
+ + ♻ ☆ Digging into contrastive learning for robust depth estimation with + diffusion models + + +
+ Recently, diffusion-based depth estimation methods have drawn widespread
+attention due to their elegant denoising patterns and promising performance.
+However, they are typically unreliable under adverse conditions prevalent in
+real-world scenarios, such as rain and snow. In this paper, we propose a novel
+robust depth estimation method called D4RD, featuring a custom contrastive
+learning mode tailored for diffusion models to mitigate performance degradation
+in complex environments. Concretely, we integrate the strength of knowledge
+distillation into contrastive learning, building the `trinity' contrastive
+scheme. This scheme utilizes the sampled noise of the forward diffusion process
+as a natural reference, guiding the predicted noise in diverse scenes toward a
+more stable and precise optimum. Moreover, we extend noise-level trinity to
+encompass more generic feature and image levels, establishing a multi-level
+contrast to distribute the burden of robust perception across the overall
+network. Before addressing complex scenarios, we enhance the stability of the
+baseline diffusion model with three straightforward yet effective improvements,
+which facilitate convergence and remove depth outliers. Extensive experiments
+demonstrate that D4RD surpasses existing state-of-the-art solutions on
+synthetic corruption datasets and real-world weather conditions. The code for
+D4RD will be made available for further exploration and adoption.
+
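+ A sketch of how a noise-level 'trinity' contrast could look, with the sampled
+ forward-process noise anchoring both the clean and the corrupted-scene
+ predictions (loss weights and the detach choice are assumptions):
+
+    import torch.nn.functional as F
+
+    def trinity_noise_loss(eps_true, eps_pred_clean, eps_pred_adverse, w=1.0):
+        """Anchor both predictions to the sampled noise and tie them together."""
+        anchor_clean = F.mse_loss(eps_pred_clean, eps_true)
+        anchor_adverse = F.mse_loss(eps_pred_adverse, eps_true)
+        cross = F.mse_loss(eps_pred_adverse, eps_pred_clean.detach())
+        return anchor_clean + anchor_adverse + w * cross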
+
+ comment: 8 pages, 6 figures
+
+
+
+
+
+ + ♻ ☆ The All-Seeing Project V2: Towards General Relation Comprehension of the + Open World + + +
+ We present the All-Seeing Project V2: a new model and dataset designed for
+understanding object relations in images. Specifically, we propose the
+All-Seeing Model V2 (ASMv2) that integrates the formulation of text generation,
+object localization, and relation comprehension into a relation conversation
+(ReC) task. Leveraging this unified task, our model excels not only in
+perceiving and recognizing all objects within the image but also in grasping
+the intricate relation graph between them, diminishing the relation
+hallucination often encountered by Multi-modal Large Language Models (MLLMs).
+To facilitate training and evaluation of MLLMs in relation understanding, we
+created the first high-quality ReC dataset (AS-V2), which is aligned with the
+format of standard instruction tuning data. In addition, we design a new
+benchmark, termed Circular-based Relation Probing Evaluation (CRPE) for
+comprehensively evaluating the relation comprehension capabilities of MLLMs.
+Notably, our ASMv2 achieves an overall accuracy of 52.04 on this relation-aware
+benchmark, surpassing the 43.14 of LLaVA-1.5 by a large margin. We hope that
+our work can inspire more future research and contribute to the evolution
+towards artificial general intelligence. Our project is released at
+https://github.com/OpenGVLab/all-seeing.
+
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ MCPNet: An Interpretable Classifier via Multi-Level Concept Prototypes CVPR 2024 + + +
+ Recent advancements in post-hoc and inherently interpretable methods have
+markedly enhanced the explanations of black box classifier models. These
+methods operate either through post-analysis or by integrating concept learning
+during model training. Although being effective in bridging the semantic gap
+between a model's latent space and human interpretation, these explanation
+methods only partially reveal the model's decision-making process. The outcome
+is typically limited to high-level semantics derived from the last feature map.
+We argue that the explanations lacking insights into the decision processes at
+low and mid-level features are neither fully faithful nor useful. Addressing
+this gap, we introduce the Multi-Level Concept Prototypes Classifier (MCPNet),
+an inherently interpretable model. MCPNet autonomously learns meaningful
+concept prototypes across multiple feature map levels using Centered Kernel
+Alignment (CKA) loss and an energy-based weighted PCA mechanism, and it does so
+without reliance on predefined concept labels. Further, we propose a novel
+classifier paradigm that learns and aligns multi-level concept prototype
+distributions for classification purposes via Class-aware Concept Distribution
+(CCD) loss. Our experiments reveal that our proposed MCPNet, while being
+adaptable to various model architectures, offers comprehensive multi-level
+explanations while maintaining classification accuracy. Additionally, its
+concept distribution-based classification approach shows improved
+generalization capabilities in few-shot classification scenarios.
+
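+ For reference, the alignment signal behind a CKA-based loss can be computed
+ with the standard linear CKA between two feature matrices; this snippet shows
+ the measure itself, not MCPNet's full loss:
+
+    import torch
+
+    def linear_cka(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        """Linear CKA between (n, d_x) and (n, d_y) feature matrices."""
+        x = x - x.mean(dim=0, keepdim=True)
+        y = y - y.mean(dim=0, keepdim=True)
+        cross = (y.T @ x).norm(p="fro") ** 2
+        return cross / ((x.T @ x).norm(p="fro") * (y.T @ y).norm(p="fro"))
+
+    sim = linear_cka(torch.randn(64, 128), torch.randn(64, 256))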
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Efficient Anomaly Detection with Budget Annotation Using Semi-Supervised + Residual Transformer + + +
+ Anomaly Detection is challenging as usually only the normal samples are seen +during training and the detector needs to discover anomalies on-the-fly. The +recently proposed deep-learning-based approaches could somehow alleviate the +problem but there is still a long way to go in obtaining an industrial-class +anomaly detector for real-world applications. On the other hand, in some +particular AD tasks, a few anomalous samples are labeled manually for achieving +higher accuracy. However, this performance gain is at the cost of considerable +annotation efforts, which can be intractable in many practical scenarios. + In this work, the above two problems are addressed in a unified framework. +Firstly, inspired by the success of the patch-matching-based AD algorithms, we +train a sliding vision transformer over the residuals generated by a novel +position-constrained patch-matching. Secondly, the conventional pixel-wise +segmentation problem is cast into a block-wise classification problem. Thus the +sliding transformer can attain even higher accuracy with much less annotation +labor. Thirdly, to further reduce the labeling cost, we propose to label the +anomalous regions using only bounding boxes. The unlabeled regions caused by +the weak labels are effectively exploited using a highly-customized +semi-supervised learning scheme equipped with two novel data augmentation +methods. The proposed method outperforms all the state-of-the-art approaches +using all the evaluation metrics in both the unsupervised and supervised +scenarios. On the popular MVTec-AD dataset, our SemiREST algorithm obtains the +Average Precision (AP) of 81.2% in the unsupervised condition and 84.4% AP for +supervised anomaly detection. Surprisingly, with the bounding-box-based +semi-supervisions, SemiREST still outperforms the SOTA methods with full +supervision (83.8% AP) on MVTec-AD. + +
+
+ comment: 20 pages, 6 figures
+
+
+
+
+
+ + ♻ ☆ LaVy: Vietnamese Multimodal Large Language Model + + +
+ Large Language Models (LLMs) and Multimodal Large Language Models (MLLMs)
+have taken the world by storm with impressive abilities in complex reasoning
+and linguistic comprehension. While there is a plethora of works related to
+Vietnamese Large Language Models, the lack of high-quality multimodal resources
+limits the progress of Vietnamese MLLMs. In this paper, we pioneer in
+addressing this by introducing LaVy, a state-of-the-art Vietnamese MLLM, and we
+also introduce the LaVy-Bench benchmark, designed for evaluating MLLMs'
+understanding of Vietnamese visual language tasks. Our project is publicly
+available at https://github.com/baochi0212/LaVy
+
+
+ comment: 5 pages +
+
+
+
+
+ + ♻ ☆ GBSD: Generative Bokeh with Stage Diffusion ICASSP + + +
+ The bokeh effect is an artistic technique that blurs out-of-focus areas in a
+photograph and has gained interest due to recent developments in text-to-image
+synthesis and the ubiquity of smartphone cameras and photo-sharing apps. Prior
+work on rendering bokeh effects has focused on post hoc image manipulation to
+produce similar blurring effects in existing photographs using classical
+computer graphics or neural rendering techniques, but it either suffers from
+depth discontinuity artifacts or is restricted to reproducing bokeh effects
+that are present in the training data. More recent diffusion-based models can
+synthesize images with an artistic style, but either require the generation of
+high-dimensional masks or expensive fine-tuning, or affect global image
+characteristics. In this paper, we present GBSD, the first generative
+text-to-image model that synthesizes photorealistic images with a bokeh style.
+Motivated by how image synthesis occurs progressively in diffusion models, our
+approach combines latent diffusion models with a 2-stage conditioning algorithm
+to render bokeh effects on semantically defined objects. Since we can focus the
+effect on objects, this semantic bokeh effect is more versatile than classical
+rendering techniques. We evaluate GBSD both quantitatively and qualitatively
+and demonstrate its ability to be applied in both text-to-image and
+image-to-image settings.
+
+
+ comment: Short Version is accepted by International Conference on Acoustics, + Speech, and Signal Processing (ICASSP) 2024 +
+
+
+
+
+ + ♻ ☆ RoboFusion: Towards Robust Multi-Modal 3D Object Detection via SAM + + +
+ Multi-modal 3D object detectors are dedicated to exploring secure and +reliable perception systems for autonomous driving (AD). However, while +achieving state-of-the-art (SOTA) performance on clean benchmark datasets, they +tend to overlook the complexity and harsh conditions of real-world +environments. Meanwhile, with the emergence of visual foundation models (VFMs), +opportunities and challenges are presented for improving the robustness and +generalization of multi-modal 3D object detection in autonomous driving. +Therefore, we propose RoboFusion, a robust framework that leverages VFMs like +SAM to tackle out-of-distribution (OOD) noise scenarios. We first adapt the +original SAM for autonomous driving scenarios named SAM-AD. To align SAM or +SAM-AD with multi-modal methods, we then introduce AD-FPN for upsampling the +image features extracted by SAM. We employ wavelet decomposition to denoise the +depth-guided images for further noise reduction and weather interference. +Lastly, we employ self-attention mechanisms to adaptively reweight the fused +features, enhancing informative features while suppressing excess noise. In +summary, our RoboFusion gradually reduces noise by leveraging the +generalization and robustness of VFMs, thereby enhancing the resilience of +multi-modal 3D object detection. Consequently, our RoboFusion achieves +state-of-the-art performance in noisy scenarios, as demonstrated by the KITTI-C +and nuScenes-C benchmarks. + +
+
+
+
+
+ + ♻ ☆ Transformer-based Multimodal Change Detection with Multitask Consistency + Constraints + + +
+ Change detection plays a fundamental role in Earth observation for analyzing
+temporal iterations over time. However, recent studies have largely neglected
+the utilization of multimodal data that presents significant practical and
+technical advantages compared to single-modal approaches. This research focuses
+on leveraging pre-event digital surface model (DSM) data and post-event
+digital aerial images captured at different times for detecting change beyond
+2D. We observe that the current change detection methods struggle with the
+multitask conflicts between semantic and height change detection tasks. To
+address this challenge, we propose an efficient Transformer-based network that
+learns shared representation between cross-dimensional inputs through
+cross-attention. It adopts a consistency constraint to establish the
+multimodal relationship. Initially, pseudo-changes are derived by employing
+height change thresholding. Subsequently, the $L2$ distance between semantic
+and pseudo-changes within their overlapping regions is minimized. This
+explicitly endows the height change detection (regression task) and semantic
+change detection (classification task) with representation consistency. A
+DSM-to-image multimodal dataset encompassing three cities in the Netherlands
+was constructed. It lays a new foundation for beyond-2D change detection from
+cross-dimensional inputs. Compared to five state-of-the-art change detection
+methods, our model demonstrates consistent multitask superiority in terms of
+semantic and height change detection. Furthermore, the consistency strategy can
+be seamlessly adapted to the other methods, yielding promising improvements.
+
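+ A compact sketch of the consistency idea: threshold the predicted height
+ change into a pseudo-change map and penalize its L2 distance to the semantic
+ change probabilities on valid overlapping pixels (the threshold value and the
+ masking are illustrative assumptions):
+
+    import torch
+
+    def consistency_loss(sem_change_prob, height_change, valid_mask, tau=2.0):
+        """Pull semantic-change probabilities toward height-derived pseudo-changes."""
+        pseudo = (height_change.abs() > tau).float()       # pseudo-change map
+        diff = (sem_change_prob - pseudo) ** 2 * valid_mask
+        return diff.sum() / valid_mask.sum().clamp(min=1.0)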
+
+
+
+
+ + ♻ ☆ SDXS: Real-Time One-Step Latent Diffusion Models with Image Conditions + + +
+ Recent advancements in diffusion models have positioned them at the forefront +of image generation. Despite their superior performance, diffusion models are +not without drawbacks; they are characterized by complex architectures and +substantial computational demands, resulting in significant latency due to +their iterative sampling process. To mitigate these limitations, we introduce a +dual approach involving model miniaturization and a reduction in sampling +steps, aimed at significantly decreasing model latency. Our methodology +leverages knowledge distillation to streamline the U-Net and image decoder +architectures, and introduces an innovative one-step DM training technique that +utilizes feature matching and score distillation. We present two models, +SDXS-512 and SDXS-1024, achieving inference speeds of approximately 100 FPS +(30x faster than SD v1.5) and 30 FPS (60x faster than SDXL) on a single GPU, +respectively. Moreover, our training approach offers promising applications in +image-conditioned control, facilitating efficient image-to-image translation. + +
+
+
+
+
+ + ♻ ☆ Optimization of Prompt Learning via Multi-Knowledge Representation for + Vision-Language Models + + +
+ Vision-Language Models (VLMs), such as CLIP, play a foundational role in
+various cross-modal applications. To fully leverage VLMs' potential in adapting
+to downstream tasks, context optimization methods like Prompt Tuning are
+essential. However, one key limitation is the lack of diversity in prompt
+templates, whether they are hand-crafted or learned through additional modules.
+This limitation restricts the capabilities of pretrained VLMs and can result in
+incorrect predictions in downstream tasks. To address this challenge, we
+propose Context Optimization with Multi-Knowledge Representation (CoKnow), a
+framework that enhances Prompt Learning for VLMs with rich contextual
+knowledge. To facilitate CoKnow during inference, we trained lightweight
+semantic knowledge mappers, which are capable of generating Multi-Knowledge
+Representation for an input image without requiring additional priors. We
+conducted extensive experiments on 11 publicly available datasets,
+demonstrating that CoKnow outperforms a series of previous methods. We will
+make all resources open-source: https://github.com/EMZucas/CoKnow.
+
+
+
+
+
+ + ♻ ☆ Kinematics Modeling Network for Video-based Human Pose Estimation + + +
+ Estimating human poses from videos is critical in human-computer interaction.
+Joints cooperate rather than move independently during human movement. There
+are both spatial and temporal correlations between joints. Despite the positive
+results of previous approaches, most focus on modeling the spatial correlation
+between joints while only straightforwardly integrating features along the
+temporal dimension, ignoring the temporal correlation between joints. In this
+work, we propose a plug-and-play kinematics modeling module (KMM) to explicitly
+model temporal correlations between joints across different frames by
+calculating their temporal similarity. In this way, KMM can capture motion cues
+of the current joint relative to all joints at different times. Besides, we
+formulate video-based human pose estimation as a Markov Decision Process and
+design a novel kinematics modeling network (KIMNet) to simulate the Markov
+Chain, allowing KIMNet to locate joints recursively. Our approach achieves
+state-of-the-art results on two challenging benchmarks. In particular, KIMNet
+shows robustness to occlusion. The code will be released at
+https://github.com/YHDang/KIMNet.
+
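+
+A rough sketch of the temporal-similarity computation that such a module is
+described as performing, with assumed feature shapes and a softmax
+normalization chosen for illustration:
+
+import torch
+import torch.nn.functional as F
+
+def temporal_joint_similarity(curr: torch.Tensor, past: torch.Tensor) -> torch.Tensor:
+    """
+    curr: (B, J, C) features of J joints in the current frame
+    past: (B, T, J, C) features of the same joints in T previous frames
+    Returns an aggregated motion-cue feature of shape (B, J, C).
+    """
+    B, T, J, C = past.shape
+    past_flat = past.reshape(B, T * J, C)
+    # Similarity of every current joint to every joint at every past time step.
+    sim = torch.einsum("bjc,bkc->bjk", curr, past_flat) / C ** 0.5   # (B, J, T*J)
+    attn = F.softmax(sim, dim=-1)
+    # Weighted aggregation of past joint features as motion cues.
+    return torch.einsum("bjk,bkc->bjc", attn, past_flat)
+
+cues = temporal_joint_similarity(torch.randn(2, 17, 64), torch.randn(2, 4, 17, 64))
+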
+
+
+
+
+ + ♻ ☆ Full-dose Whole-body PET Synthesis from Low-dose PET Using + High-efficiency Denoising Diffusion Probabilistic Model: PET Consistency + Model + + +
+ Objective: Positron Emission Tomography (PET) has been a commonly used
+imaging modality in broad clinical applications. One of the most important
+tradeoffs in PET imaging is between image quality and radiation dose: high
+image quality comes with high radiation exposure. Improving image quality is
+desirable for all clinical applications while minimizing radiation exposure is
+needed to reduce risk to patients. Approach: We introduce PET Consistency Model
+(PET-CM), an efficient diffusion-based method for generating high-quality
+full-dose PET images from low-dose PET images. It employs a two-step process,
+adding Gaussian noise to full-dose PET images in the forward diffusion, and
+then denoising them using a PET Shifted-window Vision Transformer (PET-VIT)
+network in the reverse diffusion. The PET-VIT network learns a consistency
+function that enables direct denoising of Gaussian noise into clean full-dose
+PET images. PET-CM achieves state-of-the-art image quality while requiring
+significantly less computation time than other methods. Results: In experiments
+comparing eighth-dose to full-dose images, PET-CM demonstrated impressive
+performance with NMAE of 1.278+/-0.122%, PSNR of 33.783+/-0.824dB, SSIM of
+0.964+/-0.009, NCC of 0.968+/-0.011, HRS of 4.543, and SUV Error of
+0.255+/-0.318%, with an average generation time of 62 seconds per patient. This
+is a significant improvement over the state-of-the-art diffusion-based model,
+with PET-CM reaching this result 12x faster. Similarly, in the quarter-dose to
+full-dose image experiments, PET-CM delivered competitive outcomes, achieving
+an NMAE of 0.973+/-0.066%, PSNR of 36.172+/-0.801dB, SSIM of 0.984+/-0.004,
+NCC of 0.990+/-0.005, HRS of 4.428, and SUV Error of 0.151+/-0.192% using the
+same generation process, underlining its high quantitative and clinical
+precision in both denoising scenarios.
+
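+
+Schematically, a consistency function denoises Gaussian noise into a clean
+image in a single evaluation; the sketch below uses a toy placeholder network
+and noise level, not the PET-VIT architecture or the paper's noise schedule:
+
+import torch
+import torch.nn as nn
+
+class TinyConsistencyNet(nn.Module):
+    """Placeholder for a consistency network f_theta(x_t, sigma, low_dose)."""
+    def __init__(self, ch: int = 1):
+        super().__init__()
+        self.body = nn.Sequential(nn.Conv2d(2 * ch, 32, 3, padding=1), nn.SiLU(),
+                                  nn.Conv2d(32, ch, 3, padding=1))
+
+    def forward(self, x_t, sigma, low_dose):
+        # The placeholder ignores sigma; a real network would condition on it.
+        return self.body(torch.cat([x_t, low_dose], dim=1))
+
+@torch.no_grad()
+def one_step_full_dose(model, low_dose, sigma_max: float = 80.0):
+    # Forward diffusion endpoint: pure Gaussian noise at the maximum noise level.
+    x_t = sigma_max * torch.randn_like(low_dose)
+    # Consistency models denoise in a single evaluation (few-step variants iterate).
+    return model(x_t, torch.full((low_dose.shape[0],), sigma_max), low_dose)
+
+model = TinyConsistencyNet()
+full_dose_pred = one_step_full_dose(model, torch.randn(1, 1, 64, 64))
+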
+
+
+
+
+ + ♻ ☆ Vision Augmentation Prediction Autoencoder with Attention Design + (VAPAAD) + + +
+ Recent advancements in sequence prediction have significantly improved the +accuracy of video data interpretation; however, existing models often overlook +the potential of attention-based mechanisms for next-frame prediction. This +study introduces the Vision Augmentation Prediction Autoencoder with Attention +Design (VAPAAD), an innovative approach that integrates attention mechanisms +into sequence prediction, enabling nuanced analysis and understanding of +temporal dynamics in video sequences. Utilizing the Moving MNIST dataset, we +demonstrate VAPAAD's robust performance and superior handling of complex +temporal data compared to traditional methods. VAPAAD combines data +augmentation, ConvLSTM2D layers, and a custom-built self-attention mechanism to +effectively focus on salient features within a sequence, enhancing predictive +accuracy and context-aware analysis. This methodology not only adheres to human +cognitive processes during video interpretation but also addresses limitations +in conventional models, which often struggle with the variability inherent in +video sequences. The experimental results confirm that VAPAAD outperforms +existing models, especially in integrating attention mechanisms, which +significantly improve predictive performance. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Alpha Invariance: On Inverse Scaling Between Distance and Volume Density + in Neural Radiance Fields CVPR 2024 + + +
+ Scale-ambiguity in 3D scene dimensions leads to magnitude-ambiguity of +volumetric densities in neural radiance fields, i.e., the densities double when +scene size is halved, and vice versa. We call this property alpha invariance. +For NeRFs to better maintain alpha invariance, we recommend 1) parameterizing +both distance and volume densities in log space, and 2) a +discretization-agnostic initialization strategy to guarantee high ray +transmittance. We revisit a few popular radiance field models and find that +these systems use various heuristics to deal with issues arising from scene +scaling. We test their behaviors and show our recipe to be more robust. + +
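+
+The inverse scaling between distance and density can be checked numerically:
+rescaling interval lengths by s while dividing densities by s leaves the
+per-interval opacity alpha = 1 - exp(-sigma * delta) unchanged. A small sketch
+with toy values, plus the recommended log-space parameterization:
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+delta = rng.uniform(0.01, 0.1, size=16)   # ray-interval lengths
+sigma = rng.uniform(0.5, 5.0, size=16)    # volume densities
+
+for s in (0.5, 2.0, 10.0):                # rescale the scene by s
+    alpha = 1.0 - np.exp(-sigma * delta)
+    alpha_scaled = 1.0 - np.exp(-(sigma / s) * (delta * s))
+    assert np.allclose(alpha, alpha_scaled)   # opacities are invariant
+
+# Log-space parameterization: a network predicting log(sigma) against log(t)
+# only needs to shift its output by log(s) when the scene is rescaled by s.
+log_sigma = np.log(sigma)
+log_sigma_rescaled = log_sigma - np.log(2.0)  # scene scaled by s = 2
+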
+
+ comment: CVPR 2024. project page https://pals.ttic.edu/p/alpha-invariance +
+
+
+
+
+ + ♻ ☆ Iterated Learning Improves Compositionality in Large Vision-Language + Models CVPR 2024 + + +
+ A fundamental characteristic common to both human vision and natural language
+is their compositional nature. Yet, despite the performance gains contributed
+by large vision and language pretraining, recent investigations find that
+most, if not all, of our state-of-the-art vision-language models struggle at
+compositionality. They are unable to distinguish between images of "a girl in
+white facing a man in black" and "a girl in black facing a man in white".
+Moreover, prior work suggests that compositionality doesn't arise with scale:
+larger model sizes or training data don't help. This paper develops a new
+iterated training algorithm that incentivizes compositionality. We draw on
+decades of cognitive science research that identifies cultural transmission,
+the need to teach a new generation, as a necessary inductive prior that
+incentivizes humans to develop compositional languages. Specifically, we
+reframe vision-language contrastive learning as the Lewis Signaling Game
+between a vision agent and a language agent, and operationalize cultural
+transmission by iteratively resetting one of the agents' weights during
+training. After every iteration, this training paradigm induces
+representations that become "easier to learn", a property of compositional
+languages: e.g., our model trained on CC3M and CC12M improves standard CLIP by
+4.7% and 4.0%, respectively, on the SugarCrepe benchmark.
+
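+
+A schematic version of the iterated-reset training loop described above; the
+encoders, data loader, loss, and reset period are stand-ins rather than the
+paper's configuration:
+
+import torch
+
+def reinit(module: torch.nn.Module) -> None:
+    """Re-initialize all parameters of one agent (the 'new generation')."""
+    for m in module.modules():
+        if hasattr(m, "reset_parameters"):
+            m.reset_parameters()
+
+def iterated_contrastive_training(vision_agent, language_agent, loader,
+                                  contrastive_loss, n_iterations=5,
+                                  steps_per_iteration=1000):
+    params = list(vision_agent.parameters()) + list(language_agent.parameters())
+    opt = torch.optim.AdamW(params, lr=1e-4)
+    step = 0
+    for _ in range(n_iterations):
+        # A freshly reset "learner" must re-acquire the representation from its partner.
+        reinit(language_agent)
+        for images, texts in loader:
+            loss = contrastive_loss(vision_agent(images), language_agent(texts))
+            opt.zero_grad()
+            loss.backward()
+            opt.step()
+            step += 1
+            if step % steps_per_iteration == 0:
+                break
+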
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Learning to Score Sign Language with Two-stage Method + + +
+ Human action recognition and performance assessment have been hot research
+topics in recent years. Recognition problems have mature solutions in the
+field of sign language, but past research in performance analysis has focused
+on competitive sports and medical training, overlooking scoring assessment,
+which is an important part of digitalizing sign language teaching. In this
+paper, we analyze the existing technologies for performance assessment and
+adopt methods that perform well in human pose reconstruction tasks combined
+with motion rotation embeddings, proposing a two-stage sign language
+performance evaluation pipeline. Our analysis shows that choosing
+reconstruction tasks in the first stage can provide more expressive features,
+and using smoothing methods can provide an effective reference for assessment.
+Experiments show that our method provides good score feedback mechanisms and
+high consistency with professional assessments compared to end-to-end
+evaluations.
+
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Runner re-identification from single-view running video in the + open-world setting + + +
+ In many sports, player re-identification is crucial for automatic video
+processing and analysis. However, most of the current studies on player
+re-identification in multi- or single-view sports videos focus on
+re-identification in the closed-world setting using labeled image datasets,
+and player re-identification in the open-world setting for automatic video
+analysis is not well developed. In this paper, we propose a runner
+re-identification system that directly processes single-view video to address
+the open-world setting, in which no labeled dataset is available and the video
+must be processed directly. The proposed system automatically processes raw
+video as input to identify runners, and it can identify runners even when they
+are framed out multiple times. For the automatic processing, we first detect
+the runners in the video using the pre-trained YOLOv8 and the fine-tuned
+EfficientNet. We then track the runners using ByteTrack and detect their shoes
+with the fine-tuned YOLOv8. Finally, we extract the image features of the
+runners using an unsupervised method with the gated recurrent unit autoencoder
+and global and local features mixing. To improve the accuracy of runner
+re-identification, we use shoe images as local image features and dynamic
+features of running sequence images. We evaluated the system on a running
+practice video dataset and showed that the proposed method identified runners
+with higher accuracy than some state-of-the-art models in unsupervised
+re-identification. We also showed that our proposed local image feature and
+running dynamic feature were effective for runner re-identification. Our
+runner re-identification system can be useful for the automatic analysis of
+running videos.
+
+
+ comment: 20 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Robust Analysis of Multi-Task Learning Efficiency: New Benchmarks on + Light-Weighed Backbones and Effective Measurement of Multi-Task Learning + Challenges by Feature Disentanglement + + +
+ One of the main motivations of multi-task learning (MTL) is to develop neural
+networks capable of inferring multiple tasks simultaneously. While countless
+methods have been proposed in the past decade investigating robust model
+architectures and efficient training algorithms, there is still a lack of
+understanding of these methods when applied to smaller feature extraction
+backbones, of the generalizability of the commonly used fast approximation
+technique of replacing parameter-level gradients with feature-level gradients,
+and of MTL challenges and how one can efficiently and effectively identify
+them. In this paper, we focus on the aforementioned efficiency aspects of
+existing MTL methods. We first carry out large-scale experiments of the
+methods with smaller backbones, using the MetaGraspNet dataset as a new test
+ground. We also compare the existing methods with and without using the fast
+gradient surrogate and empirically study the generalizability of this
+technique. Lastly, we propose the Feature Disentanglement measure as a novel
+and efficient identifier of the challenges in MTL, and propose the Ranking
+Similarity score as an evaluation metric for different identifiers to prove
+the faithfulness of our method.
+
+
+
+
+
+ + ♻ ☆ Automated mapping of virtual environments with visual predictive coding + + +
+ Humans construct internal cognitive maps of their environment directly from
+sensory inputs without access to a system of explicit coordinates or distance
+measurements. While machine learning algorithms like SLAM utilize specialized
+visual inference procedures to identify visual features and construct spatial
+maps from visual and odometry data, the general nature of cognitive maps in the
+brain suggests a unified mapping algorithmic strategy that can generalize to
+auditory, tactile, and linguistic inputs. Here, we demonstrate that predictive
+coding provides a natural and versatile neural network algorithm for
+constructing spatial maps using sensory data. We introduce a framework in which
+an agent navigates a virtual environment while engaging in visual predictive
+coding using a self-attention-equipped convolutional neural network. While
+learning a next image prediction task, the agent automatically constructs an
+internal representation of the environment that quantitatively reflects
+distances. The internal map enables the agent to pinpoint its location relative
+to landmarks using only visual information. The predictive coding network
+generates a vectorized encoding of the environment that supports vector
+navigation where individual latent space units delineate localized, overlapping
+neighborhoods in the environment. Broadly, our work introduces predictive
+coding as a unified algorithmic framework for constructing cognitive maps that
+can naturally extend to the mapping of auditory, sensorimotor, and linguistic
+inputs.
+
+
+
+
+
+ + ♻ ☆ Framework-agnostic Semantically-aware Global Reasoning for Segmentation WACV 2024 + + +
+ Recent advances in pixel-level tasks (e.g. segmentation) illustrate the
+benefit of long-range interactions between aggregated region-based
+representations that can enhance local features. However, such aggregated
+representations, often in the form of attention, fail to model the underlying
+semantics of the scene (e.g. individual objects and, by extension, their
+interactions). In this work, we address the issue by proposing a component that
+learns to project image features into latent representations and reason between
+them using a transformer encoder to generate contextualized and
+scene-consistent representations which are fused with original image features.
+Our design encourages the latent regions to represent semantic concepts by
+ensuring that the activated regions are spatially disjoint and the union of
+such regions corresponds to a connected object segment. The proposed semantic
+global reasoning (SGR) component is end-to-end trainable and can be easily
+added to a wide variety of backbones (CNN or transformer-based) and
+segmentation heads (per-pixel or mask classification) to consistently improve
+the segmentation results on different datasets. In addition, our latent tokens
+are semantically interpretable and diverse and provide a rich set of features
+that can be transferred to downstream tasks like object detection and
+segmentation, with improved performance. Furthermore, we also propose metrics
+to quantify the semantics of latent tokens at both the class and instance
+levels.
+
+
+ comment: Published in WACV 2024 +
+
+
+
+
+ + ♻ ☆ Training point-based deep learning networks for forest segmentation with + synthetic data ICPR + + +
+ Remote sensing through unmanned aerial systems (UAS) has been increasing in +forestry in recent years, along with using machine learning for data +processing. Deep learning architectures, extensively applied in natural +language and image processing, have recently been extended to the point cloud +domain. However, the availability of point cloud datasets for training and +testing remains limited. Creating forested environment point cloud datasets is +expensive, requires high-precision sensors, and is time-consuming as manual +point classification is required. Moreover, forest areas could be inaccessible +or dangerous for humans, further complicating data collection. Then, a question +arises whether it is possible to use synthetic data to train deep learning +networks without the need to rely on large volumes of real forest data. To +answer this question, we developed a realistic simulator that procedurally +generates synthetic forest scenes. Thanks to this, we have conducted a +comparative study of different state-of-the-art point-based deep learning +networks for forest segmentation. Using created datasets, we determined the +feasibility of using synthetic data to train deep learning networks to classify +point clouds from real forest datasets. Both the simulator and the datasets are +released as part of this work. + +
+
+ comment: 15 pages, 4 figures. Submitted to the International Conference on + Pattern Recognition (ICPR) 2024 +
+
+
+
+
+ + ♻ ☆ Predicting Thrombectomy Recanalization from CT Imaging Using Deep + Learning Models + + +
+ For acute ischemic stroke (AIS) patients with large vessel occlusions, +clinicians must decide if the benefit of mechanical thrombectomy (MTB) +outweighs the risks and potential complications following an invasive +procedure. Pre-treatment computed tomography (CT) and angiography (CTA) are +widely used to characterize occlusions in the brain vasculature. If a patient +is deemed eligible, a modified treatment in cerebral ischemia (mTICI) score +will be used to grade how well blood flow is reestablished throughout and +following the MTB procedure. An estimation of the likelihood of successful +recanalization can support treatment decision-making. In this study, we +proposed a fully automated prediction of a patient's recanalization score using +pre-treatment CT and CTA imaging. We designed a spatial cross attention network +(SCANet) that utilizes vision transformers to localize to pertinent slices and +brain regions. Our top model achieved an average cross-validated ROC-AUC of +77.33 $\pm$ 3.9\%. This is a promising result that supports future applications +of deep learning on CT and CTA for the identification of eligible AIS patients +for MTB. + +
+
+ comment: Medical Imaging with Deep Learning 2022 accepted short paper Jun 2022 +
+
+
+
+
+ + ♻ ☆ Boomerang: Local sampling on image manifolds using diffusion models + + +
+ The inference stage of diffusion models can be seen as running a reverse-time
+diffusion stochastic differential equation, where samples from a Gaussian
+latent distribution are transformed into samples from a target distribution
+that usually reside on a low-dimensional manifold, e.g., an image manifold. The
+intermediate values between the initial latent space and the image manifold can
+be interpreted as noisy images, with the amount of noise determined by the
+forward diffusion process noise schedule. We utilize this interpretation to
+present Boomerang, an approach for local sampling of image manifolds. As
+implied by its name, Boomerang local sampling involves adding noise to an input
+image, moving it closer to the latent space, and then mapping it back to the
+image manifold through a partial reverse diffusion process. Thus, Boomerang
+generates images on the manifold that are "similar," but nonidentical, to the
+original input image. We can control the proximity of the generated images to
+the original by adjusting the amount of noise added. Furthermore, due to the
+stochastic nature of the reverse diffusion process in Boomerang, the generated
+images display a certain degree of stochasticity, allowing us to obtain local
+samples from the manifold without encountering any duplicates. Boomerang offers
+the flexibility to work seamlessly with any pretrained diffusion model, such as
+Stable Diffusion, without necessitating any adjustments to the reverse
+diffusion process. We present three applications for Boomerang. First, we
+provide a framework for constructing privacy-preserving datasets having
+controllable degrees of anonymity. Second, we show that using Boomerang for
+data augmentation increases generalization performance and outperforms
+state-of-the-art synthetic data augmentation. Lastly, we introduce a perceptual
+image enhancement framework, which enables resolution enhancement.
+
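+
+A generic DDPM-style sketch of the noise-then-partially-denoise procedure
+described above; the epsilon-prediction network is a placeholder (Boomerang
+itself plugs into pretrained models such as Stable Diffusion without
+modification), and the linear beta schedule is an illustrative assumption:
+
+import torch
+
+@torch.no_grad()
+def boomerang_sample(eps_model, x0, t_star: int, T: int = 1000):
+    """Locally resample x0 by noising to step t_star and denoising back to 0."""
+    betas = torch.linspace(1e-4, 0.02, T)
+    alphas = 1.0 - betas
+    alpha_bar = torch.cumprod(alphas, dim=0)
+
+    # Forward process: jump directly to the noise level of step t_star.
+    x = alpha_bar[t_star].sqrt() * x0 + (1 - alpha_bar[t_star]).sqrt() * torch.randn_like(x0)
+
+    # Partial reverse diffusion from t_star back to 0.
+    for t in range(t_star, -1, -1):
+        eps = eps_model(x, torch.tensor([t]))
+        mean = (x - betas[t] / (1 - alpha_bar[t]).sqrt() * eps) / alphas[t].sqrt()
+        noise = torch.randn_like(x) if t > 0 else torch.zeros_like(x)
+        x = mean + betas[t].sqrt() * noise
+    return x
+
+# Smaller t_star keeps samples closer to the original image.
+dummy_eps = lambda x, t: torch.zeros_like(x)  # stand-in for a trained predictor
+local_sample = boomerang_sample(dummy_eps, torch.randn(1, 3, 64, 64), t_star=300)
+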
+
+ comment: Published in Transactions on Machine Learning Research +
+
+
+
+
+ + ♻ ☆ AffordanceLLM: Grounding Affordance from Vision Language Models + + +
+ Affordance grounding refers to the task of finding the area of an object with
+which one can interact. It is a fundamental but challenging task, as a
+successful solution requires a comprehensive understanding of a scene in
+multiple aspects, including detection, localization, and recognition of objects
+with their parts, of the geo-spatial configuration/layout of the scene, of 3D
+shapes and physics, as well as of the functionality and potential interaction
+of the objects and humans. Much of this knowledge is hidden, lying beyond the
+image content and the supervised labels of a limited training set. In this
+paper, we make an attempt to improve the generalization capability of current
+affordance grounding by taking advantage of the rich world, abstract, and
+human-object-interaction knowledge from pretrained large-scale vision language
+models. Under the AGD20K benchmark, our proposed model demonstrates a
+significant performance gain over the competing methods for in-the-wild object
+affordance grounding. We further demonstrate it can ground affordance for
+objects from random Internet images, even if both objects and actions are
+unseen during training. Project site: https://jasonqsy.github.io/AffordanceLLM/
+
+
+
+
+
+ + ♻ ☆ A Hybrid ANN-SNN Architecture for Low-Power and Low-Latency Visual + Perception + + +
+ Spiking Neural Networks (SNN) are a class of bio-inspired neural networks +that promise to bring low-power and low-latency inference to edge devices +through asynchronous and sparse processing. However, being temporal models, +SNNs depend heavily on expressive states to generate predictions on par with +classical artificial neural networks (ANNs). These states converge only after +long transient periods, and quickly decay without input data, leading to higher +latency, power consumption, and lower accuracy. This work addresses this issue +by initializing the state with an auxiliary ANN running at a low rate. The SNN +then uses the state to generate predictions with high temporal resolution until +the next initialization phase. Our hybrid ANN-SNN model thus combines the best +of both worlds: It does not suffer from long state transients and state decay +thanks to the ANN, and can generate predictions with high temporal resolution, +low latency, and low power thanks to the SNN. We show for the task of +event-based 2D and 3D human pose estimation that our method consumes 88% less +power with only a 4% decrease in performance compared to its fully ANN +counterparts when run at the same inference rate. Moreover, when compared to +SNNs, our method achieves a 74% lower error. This research thus provides a new +understanding of how ANNs and SNNs can be used to maximize their respective +benefits. + +
+
+
+
+
+ + ♻ ☆ Read Between the Layers: Leveraging Intra-Layer Representations for + Rehearsal-Free Continual Learning with Pre-Trained Models + + +
+ We address the Continual Learning (CL) problem, wherein a model must learn a +sequence of tasks from non-stationary distributions while preserving prior +knowledge upon encountering new experiences. With the advancement of foundation +models, CL research has pivoted from the initial learning-from-scratch paradigm +towards utilizing generic features from large-scale pre-training. However, +existing approaches to CL with pre-trained models primarily focus on separating +class-specific features from the final representation layer and neglect the +potential of intermediate representations to capture low- and mid-level +features, which are more invariant to domain shifts. In this work, we propose +LayUP, a new prototype-based approach to continual learning that leverages +second-order feature statistics from multiple intermediate layers of a +pre-trained network. Our method is conceptually simple, does not require access +to prior data, and works out of the box with any foundation model. LayUP +surpasses the state of the art in four of the seven class-incremental learning +benchmarks, all three domain-incremental learning benchmarks and in six of the +seven online continual learning benchmarks, while significantly reducing memory +and computational requirements compared to existing baselines. Our results +demonstrate that fully exhausting the representational capacities of +pre-trained models in CL goes well beyond their final embeddings. + +
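+
+A compact sketch of building class statistics from second-order (Gram) feature
+statistics over multiple intermediate layers of a frozen backbone; the backbone
+choice, hooked layers, and ridge-style classifier below are illustrative
+assumptions, not the exact LayUP recipe:
+
+import torch
+import torchvision.models as models
+
+backbone = models.resnet18(weights=None).eval()
+feats = {}
+# Hook two intermediate layers instead of relying only on the final embedding.
+backbone.layer3.register_forward_hook(lambda m, i, o: feats.update(l3=o.mean(dim=(2, 3))))
+backbone.layer4.register_forward_hook(lambda m, i, o: feats.update(l4=o.mean(dim=(2, 3))))
+
+@torch.no_grad()
+def multi_layer_features(x):
+    backbone(x)
+    return torch.cat([feats["l3"], feats["l4"]], dim=1)  # (B, 256 + 512)
+
+@torch.no_grad()
+def update_statistics(G, class_sums, x, y):
+    """Accumulate a Gram matrix (second-order statistics) and per-class feature sums."""
+    f = multi_layer_features(x)
+    G += f.t() @ f
+    class_sums.index_add_(0, y, f)
+
+D, num_classes = 256 + 512, 10
+G = torch.zeros(D, D)
+class_sums = torch.zeros(num_classes, D)
+update_statistics(G, class_sums, torch.randn(4, 3, 224, 224), torch.tensor([0, 1, 2, 3]))
+
+# Ridge-regularized class weights, usable without storing any past data.
+W = torch.linalg.solve(G + 1e-2 * torch.eye(D), class_sums.t())   # (D, num_classes)
+logits = multi_layer_features(torch.randn(2, 3, 224, 224)) @ W
+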
+
+ comment: Preprint under review +
+
+
+
+
+ + ♻ ☆ Sora: A Review on Background, Technology, Limitations, and Opportunities + of Large Vision Models + + +
+ Sora is a text-to-video generative AI model, released by OpenAI in February +2024. The model is trained to generate videos of realistic or imaginative +scenes from text instructions and show potential in simulating the physical +world. Based on public technical reports and reverse engineering, this paper +presents a comprehensive review of the model's background, related +technologies, applications, remaining challenges, and future directions of +text-to-video AI models. We first trace Sora's development and investigate the +underlying technologies used to build this "world simulator". Then, we describe +in detail the applications and potential impact of Sora in multiple industries +ranging from film-making and education to marketing. We discuss the main +challenges and limitations that need to be addressed to widely deploy Sora, +such as ensuring safe and unbiased video generation. Lastly, we discuss the +future development of Sora and video generation models in general, and how +advancements in the field could enable new ways of human-AI interaction, +boosting productivity and creativity of video generation. + +
+
+ comment: 37 pages, 18 figures; GitHub: + https://github.com/lichao-sun/SoraReview +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 177 + +
+
+
+ + ☆ COMBO: Compositional World Models for Embodied Multi-Agent Cooperation + + +
+ In this paper, we investigate the problem of embodied multi-agent +cooperation, where decentralized agents must cooperate given only partial +egocentric views of the world. To effectively plan in this setting, in contrast +to learning world dynamics in a single-agent scenario, we must simulate world +dynamics conditioned on an arbitrary number of agents' actions given only +partial egocentric visual observations of the world. To address this issue of +partial observability, we first train generative models to estimate the overall +world state given partial egocentric observations. To enable accurate +simulation of multiple sets of actions on this world state, we then propose to +learn a compositional world model for multi-agent cooperation by factorizing +the naturally composable joint actions of multiple agents and compositionally +generating the video. By leveraging this compositional world model, in +combination with Vision Language Models to infer the actions of other agents, +we can use a tree search procedure to integrate these modules and facilitate +online cooperative planning. To evaluate the efficacy of our methods, we create +two challenging embodied multi-agent long-horizon cooperation tasks using the +ThreeDWorld simulator and conduct experiments with 2-4 agents. The results show +our compositional world model is effective and the framework enables the +embodied agents to cooperate efficiently with different agents across various +tasks and an arbitrary number of agents, showing the promising future of our +proposed framework. More videos can be found at +https://vis-www.cs.umass.edu/combo/. + +
+
+ comment: 23 pages. The first three authors contributed equally +
+
+
+
+
+ + ☆ Gaussian Opacity Fields: Efficient and Compact Surface Reconstruction in + Unbounded Scenes + + +
+ Recently, 3D Gaussian Splatting (3DGS) has demonstrated impressive novel view +synthesis results, while allowing the rendering of high-resolution images in +real-time. However, leveraging 3D Gaussians for surface reconstruction poses +significant challenges due to the explicit and disconnected nature of 3D +Gaussians. In this work, we present Gaussian Opacity Fields (GOF), a novel +approach for efficient, high-quality, and compact surface reconstruction in +unbounded scenes. Our GOF is derived from ray-tracing-based volume rendering of +3D Gaussians, enabling direct geometry extraction from 3D Gaussians by +identifying its levelset, without resorting to Poisson reconstruction or TSDF +fusion as in previous work. We approximate the surface normal of Gaussians as +the normal of the ray-Gaussian intersection plane, enabling the application of +regularization that significantly enhances geometry. Furthermore, we develop an +efficient geometry extraction method utilizing marching tetrahedra, where the +tetrahedral grids are induced from 3D Gaussians and thus adapt to the scene's +complexity. Our evaluations reveal that GOF surpasses existing 3DGS-based +methods in surface reconstruction and novel view synthesis. Further, it +compares favorably to, or even outperforms, neural implicit methods in both +quality and speed. + +
+
+ comment: Project page: + https://niujinshuchong.github.io/gaussian-opacity-fields +
+
+
+
+
+ + ☆ RapidVol: Rapid Reconstruction of 3D Ultrasound Volumes from Sensorless + 2D Scans + + +
+ Two-dimensional (2D) freehand ultrasonography is one of the most commonly
+used medical imaging modalities, particularly in obstetrics and gynaecology.
+However, it only captures 2D cross-sectional views of inherently 3D anatomies,
+losing valuable contextual information. As an alternative to requiring costly
+and complex 3D ultrasound scanners, 3D volumes can be constructed from 2D scans
+using machine learning. However, this usually requires long computation times.
+Here, we propose RapidVol: a neural representation framework to speed up
+slice-to-volume ultrasound reconstruction. We use tensor-rank decomposition to
+decompose the typical 3D volume into sets of tri-planes, and store those,
+together with a small neural network, instead. A set of 2D ultrasound scans,
+with their ground truth (or estimated) 3D position and orientation (pose), is
+all that is required to form a complete 3D reconstruction. Reconstructions are
+formed from real fetal brain scans, and then evaluated by requesting novel
+cross-sectional views. When compared to prior approaches based on fully
+implicit representation (e.g. neural radiance fields), our method is over 3x
+quicker, 46% more accurate, and more robust if given inaccurate poses. Further
+speed-up is also possible by reconstructing from a structural prior rather
+than from scratch.
+
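+
+A tri-plane decomposition replaces a dense 3D volume with three 2D feature
+planes plus a small network; a minimal PyTorch sketch (plane resolution,
+feature width, and decoder are illustrative assumptions):
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class TriPlaneVolume(nn.Module):
+    def __init__(self, res: int = 128, feat: int = 16):
+        super().__init__()
+        # Three learnable 2D feature planes: XY, XZ, YZ.
+        self.planes = nn.Parameter(torch.randn(3, feat, res, res) * 0.01)
+        self.decoder = nn.Sequential(nn.Linear(3 * feat, 64), nn.ReLU(), nn.Linear(64, 1))
+
+    def forward(self, xyz: torch.Tensor) -> torch.Tensor:
+        """xyz: (N, 3) coordinates in [-1, 1]; returns (N, 1) intensities."""
+        coords = [xyz[:, [0, 1]], xyz[:, [0, 2]], xyz[:, [1, 2]]]
+        samples = []
+        for plane, uv in zip(self.planes, coords):
+            grid = uv.view(1, -1, 1, 2)                       # (1, N, 1, 2)
+            f = F.grid_sample(plane.unsqueeze(0), grid, align_corners=True)
+            samples.append(f.squeeze(0).squeeze(-1).t())      # (N, feat)
+        return self.decoder(torch.cat(samples, dim=-1))
+
+vol = TriPlaneVolume()
+intensity = vol(torch.rand(1024, 3) * 2 - 1)   # query intensities at random 3D points
+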
+
+
+
+
+ + ☆ RefFusion: Reference Adapted Diffusion Models for 3D Scene Inpainting + + +
+ Neural reconstruction approaches are rapidly emerging as the preferred +representation for 3D scenes, but their limited editability is still posing a +challenge. In this work, we propose an approach for 3D scene inpainting -- the +task of coherently replacing parts of the reconstructed scene with desired +content. Scene inpainting is an inherently ill-posed task as there exist many +solutions that plausibly replace the missing content. A good inpainting method +should therefore not only enable high-quality synthesis but also a high degree +of control. Based on this observation, we focus on enabling explicit control +over the inpainted content and leverage a reference image as an efficient means +to achieve this goal. Specifically, we introduce RefFusion, a novel 3D +inpainting method based on a multi-scale personalization of an image inpainting +diffusion model to the given reference view. The personalization effectively +adapts the prior distribution to the target scene, resulting in a lower +variance of score distillation objective and hence significantly sharper +details. Our framework achieves state-of-the-art results for object removal +while maintaining high controllability. We further demonstrate the generality +of our formulation on other downstream tasks such as object insertion, scene +outpainting, and sparse view reconstruction. + +
+
+ comment: Project page: https://reffusion.github.io +
+
+
+
+
+ + ☆ LaDiC: Are Diffusion Models Really Inferior to Autoregressive + Counterparts for Image-to-Text Generation? + + +
+ Diffusion models have exhibited remarkable capabilities in text-to-image +generation. However, their performance in image-to-text generation, +specifically image captioning, has lagged behind Auto-Regressive (AR) models, +casting doubt on their applicability for such tasks. In this work, we revisit +diffusion models, highlighting their capacity for holistic context modeling and +parallel decoding. With these benefits, diffusion models can alleviate the +inherent limitations of AR methods, including their slow inference speed, error +propagation, and unidirectional constraints. Furthermore, we identify the prior +underperformance of diffusion models stemming from the absence of an effective +latent space for image-text alignment, and the discrepancy between continuous +diffusion processes and discrete textual data. In response, we introduce a +novel architecture, LaDiC, which utilizes a split BERT to create a dedicated +latent space for captions and integrates a regularization module to manage +varying text lengths. Our framework also includes a diffuser for semantic +image-to-text conversion and a Back&Refine technique to enhance token +interactivity during inference. LaDiC achieves state-of-the-art performance for +diffusion-based methods on the MS COCO dataset with 38.2 BLEU@4 and 126.2 +CIDEr, demonstrating exceptional performance without pre-training or ancillary +modules. This indicates strong competitiveness with AR models, revealing the +previously untapped potential of diffusion models in image-to-text generation. + +
+
+
+
+
+ + ☆ Learning Feature Inversion for Multi-class Anomaly Detection under + General-purpose COCO-AD Benchmark + + +
+ Anomaly detection (AD) is often focused on detecting anomaly areas for
+industrial quality inspection and medical lesion examination. However, due to
+the specific scenario targets, the data scale for AD is relatively small, and
+evaluation metrics are still deficient compared to classic vision tasks, such
+as object detection and semantic segmentation. To fill these gaps, this work
+first constructs a large-scale and general-purpose COCO-AD dataset by extending
+COCO to the AD field. This enables fair evaluation and sustainable development
+for different methods on this challenging benchmark. Moreover, current metrics
+such as AU-ROC have nearly reached saturation on simple datasets, which
+prevents a comprehensive evaluation of different methods. Inspired by the
+metrics in the segmentation field, we further propose several more practical
+threshold-dependent AD-specific metrics, i.e., mF1^{.2}_{.8},
+mAcc^{.2}_{.8}, mIoU^{.2}_{.8}, and mIoU-max. Motivated by GAN inversion's
+high-quality reconstruction capability, we propose a simple but more powerful
+InvAD framework to achieve high-quality feature reconstruction. Our method
+improves the effectiveness of reconstruction-based methods on the popular
+MVTec AD, VisA, and our newly proposed COCO-AD datasets under a multi-class
+unsupervised setting, where only a single detection model is trained to detect
+anomalies from different classes. Extensive ablation experiments have
+demonstrated the effectiveness of each component of our InvAD. Full code and
+models are available at https://github.com/zhangzjn/ader.
+
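+
+Reading the threshold-dependent metrics as scores averaged over a 0.2-0.8
+threshold band, a simple NumPy sketch of an mF1-style computation (the
+threshold grid and binarization rule are assumptions) is:
+
+import numpy as np
+
+def mF1_over_thresholds(anomaly_map: np.ndarray, gt_mask: np.ndarray,
+                        thresholds=np.arange(0.2, 0.8 + 1e-9, 0.05)) -> float:
+    """Average pixel-level F1 of a [0, 1] anomaly map over a threshold band."""
+    gt = gt_mask.astype(bool)
+    f1s = []
+    for t in thresholds:
+        pred = anomaly_map >= t
+        tp = np.logical_and(pred, gt).sum()
+        fp = np.logical_and(pred, ~gt).sum()
+        fn = np.logical_and(~pred, gt).sum()
+        f1s.append(2 * tp / max(2 * tp + fp + fn, 1))
+    return float(np.mean(f1s))
+
+score = mF1_over_thresholds(np.random.rand(256, 256), np.random.rand(256, 256) > 0.95)
+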
+
+
+
+
+ + ☆ Watch Your Step: Optimal Retrieval for Continual Learning at Scale + + +
+ One of the most widely used approaches in continual learning is referred to +as replay. Replay methods support interleaved learning by storing past +experiences in a replay buffer. Although there are methods for selectively +constructing the buffer and reprocessing its contents, there is limited +exploration of the problem of selectively retrieving samples from the buffer. +Current solutions have been tested in limited settings and, more importantly, +in isolation. Existing work has also not explored the impact of duplicate +replays on performance. In this work, we propose a framework for evaluating +selective retrieval strategies, categorized by simple, independent class- and +sample-selective primitives. We evaluated several combinations of existing +strategies for selective retrieval and present their performances. Furthermore, +we propose a set of strategies to prevent duplicate replays and explore whether +new samples with low loss values can be learned without replay. In an effort to +match our problem setting to a realistic continual learning pipeline, we +restrict our experiments to a setting involving a large, pre-trained, open +vocabulary object detection model, which is fully fine-tuned on a sequence of +15 datasets. + +
+
+
+
+
+ + ☆ GazeHTA: End-to-end Gaze Target Detection with Head-Target Association + + +
+ We propose an end-to-end approach for gaze target detection: predicting a +head-target connection between individuals and the target image regions they +are looking at. Most of the existing methods use independent components such as +off-the-shelf head detectors or have problems in establishing associations +between heads and gaze targets. In contrast, we investigate an end-to-end +multi-person Gaze target detection framework with Heads and Targets Association +(GazeHTA), which predicts multiple head-target instances based solely on input +scene image. GazeHTA addresses challenges in gaze target detection by (1) +leveraging a pre-trained diffusion model to extract scene features for rich +semantic understanding, (2) re-injecting a head feature to enhance the head +priors for improved head understanding, and (3) learning a connection map as +the explicit visual associations between heads and gaze targets. Our extensive +experimental results demonstrate that GazeHTA outperforms state-of-the-art gaze +target detection methods and two adapted diffusion-based baselines on two +standard datasets. + +
+
+
+
+
+ + ☆ Mixed Prototype Consistency Learning for Semi-supervised Medical Image + Segmentation + + +
+ Recently, prototype learning has emerged in semi-supervised medical image +segmentation and achieved remarkable performance. However, the scarcity of +labeled data limits the expressiveness of prototypes in previous methods, +potentially hindering the complete representation of prototypes for class +embedding. To address this problem, we propose the Mixed Prototype Consistency +Learning (MPCL) framework, which includes a Mean Teacher and an auxiliary +network. The Mean Teacher generates prototypes for labeled and unlabeled data, +while the auxiliary network produces additional prototypes for mixed data +processed by CutMix. Through prototype fusion, mixed prototypes provide extra +semantic information to both labeled and unlabeled prototypes. High-quality +global prototypes for each class are formed by fusing two enhanced prototypes, +optimizing the distribution of hidden embeddings used in consistency learning. +Extensive experiments on the left atrium and type B aortic dissection datasets +demonstrate MPCL's superiority over previous state-of-the-art approaches, +confirming the effectiveness of our framework. The code will be released soon. + +
+
+ comment: 15 pages, 2 figures +
+
+
+
+
+ + ☆ MOWA: Multiple-in-One Image Warping Model + + +
+ While recent image warping approaches achieved remarkable success on existing +benchmarks, they still require training separate models for each specific task +and cannot generalize well to different camera models or customized +manipulations. To address diverse types of warping in practice, we propose a +Multiple-in-One image WArping model (named MOWA) in this work. Specifically, we +mitigate the difficulty of multi-task learning by disentangling the motion +estimation at both the region level and pixel level. To further enable dynamic +task-aware image warping, we introduce a lightweight point-based classifier +that predicts the task type, serving as prompts to modulate the feature maps +for better estimation. To our knowledge, this is the first work that solves +multiple practical warping tasks in one single model. Extensive experiments +demonstrate that our MOWA, which is trained on six tasks for multiple-in-one +image warping, outperforms state-of-the-art task-specific models across most +tasks. Moreover, MOWA also exhibits promising potential to generalize into +unseen scenes, as evidenced by cross-domain and zero-shot evaluations. The code +will be made publicly available. + +
+
+ comment: Project page: https://kangliao929.github.io/projects/mowa/ +
+
+
+
+
+ + ☆ AV-GAN: Attention-Based Varifocal Generative Adversarial Network for + Uneven Medical Image Translation + + +
+ Different types of staining highlight different structures in organs, thereby +assisting in diagnosis. However, due to the impossibility of repeated staining, +we cannot obtain different types of stained slides of the same tissue area. +Translating the slide that is easy to obtain (e.g., H&E) to slides of staining +types difficult to obtain (e.g., MT, PAS) is a promising way to solve this +problem. However, some regions are closely connected to other regions, and to +maintain this connection, they often have complex structures and are difficult +to translate, which may lead to wrong translations. In this paper, we propose +the Attention-Based Varifocal Generative Adversarial Network (AV-GAN), which +solves multiple problems in pathologic image translation tasks, such as uneven +translation difficulty in different regions, mutual interference of multiple +resolution information, and nuclear deformation. Specifically, we develop an +Attention-Based Key Region Selection Module, which can attend to regions with +higher translation difficulty. We then develop a Varifocal Module to translate +these regions at multiple resolutions. Experimental results show that our +proposed AV-GAN outperforms existing image translation methods with two virtual +kidney tissue staining tasks and improves FID values by 15.9 and 4.16 +respectively in the H&E-MT and H&E-PAS tasks. + +
+
+
+
+
+ + ☆ A Plausibility Study of Using Augmented Reality in the + Ventriculoperitoneal Shunt Operations + + +
+ The field of augmented reality (AR) has undergone substantial growth, finding
+diverse applications in the medical industry. This paper delves into various
+techniques employed in medical surgeries, scrutinizing factors such as cost,
+implementation, and accessibility. The focus of this exploration is on AR-based
+solutions, with a particular emphasis on addressing challenges and proposing an
+innovative solution for ventriculoperitoneal (VP) shunt operations. The
+proposed solution introduces a novel flow in the pre-surgery phase, aiming to
+substantially reduce setup time and operation duration by creating 3D models of
+the skull and ventricles. Experiments are conducted where the models are
+visualized on a 3D-printed skull through an AR device, specifically the
+Microsoft HoloLens 2. The paper then conducts an in-depth analysis of this
+proposed solution, discussing its feasibility, advantages, limitations, and
+future implications.
+
+
+ comment: Accepted for the 2024 - 16th International Conference on Knowledge + and Smart Technology (KST). To be published in IEEEXplore Digital Library + (#61284), ISBN: 979-8-3503-7073-7 +
+
+
+
+
+ + ☆ Dual Modalities of Text: Visual and Textual Generative Pre-training + + +
+ Harnessing visual texts represents a burgeoning frontier in the evolution of +language modeling. In this paper, we introduce a novel pre-training framework +for a suite of pixel-based autoregressive language models, pre-training on a +corpus of over 400 million documents rendered as RGB images. Our approach is +characterized by a dual-modality training regimen, engaging both visual data +through next patch prediction with a regression head and textual data via next +token prediction with a classification head. This study is particularly focused +on investigating the synergistic interplay between visual and textual +modalities of language. Our comprehensive evaluation across a diverse array of +benchmarks reveals that the confluence of visual and textual data substantially +augments the efficacy of pixel-based language models. Notably, our findings +show that a unidirectional pixel-based model, devoid of textual data during +training, can match the performance levels of advanced bidirectional +pixel-based models on various language understanding benchmarks. This work +highlights the considerable untapped potential of integrating visual and +textual information for language modeling purposes. We will release our code, +data, and checkpoints to inspire further research advancement. + +
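+
+Schematically, the dual-modality regimen pairs a regression head for
+next-patch prediction with a classification head for next-token prediction on
+a shared causal trunk; the trunk size, patch dimension, and vocabulary below
+are placeholders, not the paper's configuration:
+
+import torch
+import torch.nn as nn
+
+class DualModalityLM(nn.Module):
+    def __init__(self, d_model=256, patch_dim=16 * 16 * 3, vocab=32000):
+        super().__init__()
+        layer = nn.TransformerEncoderLayer(d_model, nhead=8, batch_first=True)
+        self.trunk = nn.TransformerEncoder(layer, num_layers=4)  # shared trunk
+        self.patch_in = nn.Linear(patch_dim, d_model)
+        self.token_in = nn.Embedding(vocab, d_model)
+        self.patch_head = nn.Linear(d_model, patch_dim)   # regression: next patch
+        self.token_head = nn.Linear(d_model, vocab)       # classification: next token
+
+    def _encode(self, x):
+        L = x.size(1)
+        # Causal mask so each position only attends to previous positions.
+        mask = torch.triu(torch.full((L, L), float("-inf"), device=x.device), diagonal=1)
+        return self.trunk(x, mask=mask)
+
+    def patch_loss(self, patches):          # patches: (B, L, patch_dim)
+        h = self._encode(self.patch_in(patches[:, :-1]))
+        return nn.functional.mse_loss(self.patch_head(h), patches[:, 1:])
+
+    def token_loss(self, tokens):           # tokens: (B, L) int64
+        h = self._encode(self.token_in(tokens[:, :-1]))
+        return nn.functional.cross_entropy(
+            self.token_head(h).flatten(0, 1), tokens[:, 1:].flatten())
+
+model = DualModalityLM()
+loss = model.patch_loss(torch.randn(2, 32, 768)) + model.token_loss(torch.randint(0, 32000, (2, 32)))
+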
+
+
+
+
+ + ☆ Rawformer: Unpaired Raw-to-Raw Translation for Learnable Camera ISPs + + +
+ Modern smartphone camera quality heavily relies on the image signal processor +(ISP) to enhance captured raw images, utilizing carefully designed modules to +produce final output images encoded in a standard color space (e.g., sRGB). +Neural-based end-to-end learnable ISPs offer promising advancements, +potentially replacing traditional ISPs with their ability to adapt without +requiring extensive tuning for each new camera model, as is often the case for +nearly every module in traditional ISPs. However, the key challenge with the +recent learning-based ISPs is the urge to collect large paired datasets for +each distinct camera model due to the influence of intrinsic camera +characteristics on the formation of input raw images. This paper tackles this +challenge by introducing a novel method for unpaired learning of raw-to-raw +translation across diverse cameras. Specifically, we propose Rawformer, an +unsupervised Transformer-based encoder-decoder method for raw-to-raw +translation. It accurately maps raw images captured by a certain camera to the +target camera, facilitating the generalization of learnable ISPs to new unseen +cameras. Our method demonstrates superior performance on real camera datasets, +achieving higher accuracy compared to previous state-of-the-art techniques, and +preserving a more robust correlation between the original and translated raw +images. + +
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ☆ ECLAIR: A High-Fidelity Aerial LiDAR Dataset for Semantic Segmentation + + +
+ We introduce ECLAIR (Extended Classification of Lidar for AI Recognition), a
+new outdoor large-scale aerial LiDAR dataset designed specifically for
+advancing research in point cloud semantic segmentation. As the most extensive
+and diverse collection of its kind to date, the dataset covers a total area of
+10 $km^2$ with close to 600 million points and features eleven distinct object
+categories. To guarantee the dataset's quality and utility, we have thoroughly
+curated the point labels through an internal team of experts, ensuring accuracy
+and consistency in semantic labeling. The dataset is engineered to advance the
+fields of 3D urban modeling, scene understanding, and utility infrastructure
+management by presenting new challenges and potential applications. As a
+benchmark, we report a qualitative and quantitative analysis of a voxel-based
+point cloud segmentation approach based on the Minkowski Engine.
+
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ MathWriting: A Dataset For Handwritten Mathematical Expression + Recognition + + +
+ We introduce MathWriting, the largest online handwritten mathematical +expression dataset to date. It consists of 230k human-written samples and an +additional 400k synthetic ones. MathWriting can also be used for offline HME +recognition and is larger than all existing offline HME datasets like +IM2LATEX-100K. We introduce a benchmark based on MathWriting data in order to +advance research on both online and offline HME recognition. + +
+
+
+
+
+ + ☆ Efficient Conditional Diffusion Model with Probability Flow Sampling for + Image Super-resolution AAAI 2024 + + +
+ Image super-resolution is a fundamentally ill-posed problem because multiple +valid high-resolution images exist for one low-resolution image. +Super-resolution methods based on diffusion probabilistic models can deal with +the ill-posed nature by learning the distribution of high-resolution images +conditioned on low-resolution images, avoiding the problem of blurry images in +PSNR-oriented methods. However, existing diffusion-based super-resolution +methods have high time consumption with the use of iterative sampling, while +the quality and consistency of generated images are less than ideal due to +problems like color shifting. In this paper, we propose Efficient Conditional +Diffusion Model with Probability Flow Sampling (ECDP) for image +super-resolution. To reduce the time consumption, we design a continuous-time +conditional diffusion model for image super-resolution, which enables the use +of probability flow sampling for efficient generation. Additionally, to improve +the consistency of generated images, we propose a hybrid parametrization for +the denoiser network, which interpolates between the data-predicting +parametrization and the noise-predicting parametrization for different noise +scales. Moreover, we design an image quality loss as a complement to the score +matching loss of diffusion models, further improving the consistency and +quality of super-resolution. Extensive experiments on DIV2K, ImageNet, and +CelebA demonstrate that our method achieves higher super-resolution quality +than existing diffusion-based image super-resolution methods while having lower +time consumption. Our code is available at https://github.com/Yuan-Yutao/ECDP. + +
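+
+One way to sketch a hybrid parametrization that interpolates between data
+prediction and noise prediction as a function of noise scale; the blending
+weight and the VP-style relation x_t = sqrt(a)*x_0 + sqrt(1-a)*eps are
+assumptions used only for illustration, not the paper's exact formulation:
+
+import torch
+
+def hybrid_x0_estimate(net_out: torch.Tensor, x_t: torch.Tensor,
+                       alpha_bar: torch.Tensor) -> torch.Tensor:
+    """
+    Interpret the same network output as a data prediction at high noise and
+    as a noise prediction at low noise, interpolating smoothly in between.
+    net_out, x_t: (B, C, H, W); alpha_bar: (B,) cumulative signal level in (0, 1).
+    """
+    a = alpha_bar.view(-1, 1, 1, 1)
+    # Noise-predicting reading: recover x0 from x_t = sqrt(a)*x0 + sqrt(1-a)*eps.
+    x0_from_eps = (x_t - (1 - a).sqrt() * net_out) / a.sqrt()
+    # Data-predicting reading: the output is x0 directly.
+    x0_direct = net_out
+    w = 1.0 - a          # weight the data reading more at high noise (small alpha_bar)
+    return w * x0_direct + (1.0 - w) * x0_from_eps
+
+x0 = hybrid_x0_estimate(torch.randn(2, 3, 64, 64), torch.randn(2, 3, 64, 64),
+                        torch.tensor([0.9, 0.1]))
+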
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ☆ Generating Human Interaction Motions in Scenes with Text Control + + +
+ We present TeSMo, a method for text-controlled scene-aware motion generation +based on denoising diffusion models. Previous text-to-motion methods focus on +characters in isolation without considering scenes due to the limited +availability of datasets that include motion, text descriptions, and +interactive scenes. Our approach begins with pre-training a scene-agnostic +text-to-motion diffusion model, emphasizing goal-reaching constraints on +large-scale motion-capture datasets. We then enhance this model with a +scene-aware component, fine-tuned using data augmented with detailed scene +information, including ground plane and object shapes. To facilitate training, +we embed annotated navigation and interaction motions within scenes. The +proposed method produces realistic and diverse human-object interactions, such +as navigation and sitting, in different scenes with various object shapes, +orientations, initial body positions, and poses. Extensive experiments +demonstrate that our approach surpasses prior techniques in terms of the +plausibility of human-scene interactions, as well as the realism and variety of +the generated motions. Code will be released upon publication of this work at +https://research.nvidia.com/labs/toronto-ai/tesmo. + +
+
+ comment: Project Page: https://research.nvidia.com/labs/toronto-ai/tesmo/ +
+
+
+
+
+ + ☆ StyleCity: Large-Scale 3D Urban Scenes Stylization with Vision-and-Text + Reference via Progressive Optimization + + +
+ Creating large-scale virtual urban scenes with variant styles is inherently +challenging. To facilitate prototypes of virtual production and bypass the need +for complex materials and lighting setups, we introduce the first +vision-and-text-driven texture stylization system for large-scale urban scenes, +StyleCity. Taking an image and text as references, StyleCity stylizes a 3D +textured mesh of a large-scale urban scene in a semantics-aware fashion and +generates a harmonic omnidirectional sky background. To achieve that, we +propose to stylize a neural texture field by transferring 2D vision-and-text +priors to 3D globally and locally. During 3D stylization, we progressively +scale the planned training views of the input 3D scene at different levels in +order to preserve high-quality scene content. We then optimize the scene style +globally by adapting the scale of the style image with the scale of the +training views. Moreover, we enhance local semantics consistency by the +semantics-aware style loss which is crucial for photo-realistic stylization. +Besides texture stylization, we further adopt a generative diffusion model to +synthesize a style-consistent omnidirectional sky image, which offers a more +immersive atmosphere and assists the semantic stylization process. The stylized +neural texture field can be baked into an arbitrary-resolution texture, +enabling seamless integration into conventional rendering pipelines and +significantly easing the virtual production prototyping process. Extensive +experiments demonstrate our stylized scenes' superiority in qualitative and +quantitative performance and user preferences. + +
+
+ comment: project page: https://chenyingshu.github.io/stylecity3d/ +
+
+
+
+
+ + ☆ VASA-1: Lifelike Audio-Driven Talking Faces Generated in Real Time + + +
+ We introduce VASA, a framework for generating lifelike talking faces with +appealing visual affective skills (VAS) given a single static image and a +speech audio clip. Our premiere model, VASA-1, is capable of not only producing +lip movements that are exquisitely synchronized with the audio, but also +capturing a large spectrum of facial nuances and natural head motions that +contribute to the perception of authenticity and liveliness. The core +innovations include a holistic facial dynamics and head movement generation +model that works in a face latent space, and the development of such an +expressive and disentangled face latent space using videos. Through extensive +experiments including evaluation on a set of new metrics, we show that our +method significantly outperforms previous methods along various dimensions +comprehensively. Our method not only delivers high video quality with realistic +facial and head dynamics but also supports the online generation of 512x512 +videos at up to 40 FPS with negligible starting latency. It paves the way for +real-time engagements with lifelike avatars that emulate human conversational +behaviors. + +
+
+ comment: Tech Report. Project webpage: + https://www.microsoft.com/en-us/research/project/vasa-1/ +
+
+
+
+
+ + ☆ Assessing The Impact of CNN Auto Encoder-Based Image Denoising on Image + Classification Tasks + + +
+ Images captured from the real world are often affected by different types of +noise, which can significantly impact the performance of Computer Vision +systems and the quality of visual data. This study presents a novel approach +for defect detection in noisy images of casting products, specifically focusing +on submersible pump impellers. The methodology involves utilizing deep learning +models such as VGG16, InceptionV3, and other models in both the spatial and +frequency domains to identify noise types and defect status. The research +process begins with preprocessing images, followed by applying denoising +techniques tailored to specific noise categories. The goal is to enhance the +accuracy and robustness of defect detection by integrating noise detection and +denoising into the classification pipeline. The study achieved remarkable +results using VGG16 for noise type classification in the frequency domain, +achieving an accuracy of over 99%. Removal of salt and pepper noise resulted in +an average SSIM of 87.9, while Gaussian noise removal had an average SSIM of +64.0, and periodic noise removal yielded an average SSIM of 81.6. This +comprehensive approach showcases the effectiveness of the deep AutoEncoder +model and the median filter as denoising strategies in real-world industrial +applications. Finally, our study reports significant improvements in binary +classification accuracy for defect detection compared to previous methods. For +the VGG16 classifier, accuracy increased from 94.6% to 97.0%, demonstrating the +effectiveness of the proposed noise detection and denoising approach. +Similarly, for the InceptionV3 classifier, accuracy improved from 84.7% to +90.0%, further validating the benefits of integrating noise analysis into the +classification pipeline. + +&#13;
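The denoising step whose SSIM the abstract reports can be illustrated with a minimal sketch: add salt-and-pepper noise to an image, remove it with a median filter, and score the result with SSIM. This is a generic illustration under my own assumptions (test image, noise level, and filter size are placeholders), not the authors' pipeline.

```python
import numpy as np
from scipy.ndimage import median_filter
from skimage import data
from skimage.metrics import structural_similarity as ssim
from skimage.util import random_noise

def denoise_and_score(clean: np.ndarray, amount: float = 0.05, size: int = 3) -> float:
    """Corrupt an image with salt-and-pepper noise, denoise with a median filter,
    and return the SSIM of the denoised result against the clean image."""
    noisy = random_noise(clean, mode="s&p", amount=amount)  # float image in [0, 1]
    denoised = median_filter(noisy, size=size)              # classic remedy for impulse noise
    return ssim(clean, denoised, data_range=1.0)

clean = data.camera() / 255.0  # stand-in for a casting-product photograph
print(f"SSIM after median filtering: {denoise_and_score(clean):.3f}")
```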
+
+ comment: 13 pages, 13 figures, 13th International conference on innovative + technologies in the field of science, engineering and technology +
+
+
+
+
+ + ☆ Contextrast: Contextual Contrastive Learning for Semantic Segmentation + + +
+ Despite great improvements in semantic segmentation, challenges persist +because of the lack of local/global contexts and the relationship between them. +In this paper, we propose Contextrast, a contrastive learning-based semantic +segmentation method that allows to capture local/global contexts and comprehend +their relationships. Our proposed method comprises two parts: a) contextual +contrastive learning (CCL) and b) boundary-aware negative (BANE) sampling. +Contextual contrastive learning obtains local/global context from multi-scale +feature aggregation and inter/intra-relationship of features for better +discrimination capabilities. Meanwhile, BANE sampling selects embedding +features along the boundaries of incorrectly predicted regions to employ them +as harder negative samples on our contrastive learning, resolving segmentation +issues along the boundary region by exploiting fine-grained details. We +demonstrate that our Contextrast substantially enhances the performance of +semantic segmentation networks, outperforming state-of-the-art contrastive +learning approaches on diverse public datasets, e.g. Cityscapes, CamVid, +PASCAL-C, COCO-Stuff, and ADE20K, without an increase in computational cost +during inference. + +
+
+
+
+
+ + ☆ Exploring selective image matching methods for zero-shot and few-sample + unsupervised domain adaptation of urban canopy prediction ICLR 2024 + + +
+ We explore simple methods for adapting a trained multi-task UNet which +predicts canopy cover and height to a new geographic setting using remotely +sensed data without the need of training a domain-adaptive classifier and +extensive fine-tuning. Extending previous research, we followed a selective +alignment process to identify similar images in the two geographical domains +and then tested an array of data-based unsupervised domain adaptation +approaches in a zero-shot setting as well as with a small amount of +fine-tuning. We find that the selective aligned data-based image matching +methods produce promising results in a zero-shot setting, and even more so with +a small amount of fine-tuning. These methods outperform both an untransformed +baseline and a popular data-based image-to-image translation model. The best +performing methods were pixel distribution adaptation and fourier domain +adaptation on the canopy cover and height tasks respectively. + +
+
+ comment: ICLR 2024 Machine Learning for Remote Sensing (ML4RS) Workshop +
+
+
+
+
+ + ☆ Gaussian Splatting Decoder for 3D-aware Generative Adversarial Networks CVPR + + +
+ NeRF-based 3D-aware Generative Adversarial Networks (GANs) like EG3D or +GIRAFFE have shown very high rendering quality under large representational +variety. However, rendering with Neural Radiance Fields poses challenges for 3D +applications: First, the significant computational demands of NeRF rendering +preclude its use on low-power devices, such as mobiles and VR/AR headsets. +Second, implicit representations based on neural networks are difficult to +incorporate into explicit 3D scenes, such as VR environments or video games. 3D +Gaussian Splatting (3DGS) overcomes these limitations by providing an explicit +3D representation that can be rendered efficiently at high frame rates. In this +work, we present a novel approach that combines the high rendering quality of +NeRF-based 3D-aware GANs with the flexibility and computational advantages of +3DGS. By training a decoder that maps implicit NeRF representations to explicit +3D Gaussian Splatting attributes, we can integrate the representational +diversity and quality of 3D GANs into the ecosystem of 3D Gaussian Splatting +for the first time. Additionally, our approach allows for a high resolution GAN +inversion and real-time GAN editing with 3D Gaussian Splatting scenes. + +
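The abstract describes a decoder that maps implicit NeRF/GAN features to explicit 3D Gaussian Splatting attributes, but does not give its architecture. Below is a hedged PyTorch sketch of such a mapping head with made-up layer sizes; it only shows the kind of attributes (offsets, scales, rotation, opacity, colour) such a decoder would emit, not the paper's design.

```python
import torch
import torch.nn as nn

class GaussianDecoderSketch(nn.Module):
    """Toy decoder: per-point latent feature -> 3D Gaussian Splatting attributes.
    Layer widths and activations are illustrative guesses, not the paper's architecture."""
    def __init__(self, feat_dim: int = 256, hidden: int = 128):
        super().__init__()
        self.trunk = nn.Sequential(nn.Linear(feat_dim, hidden), nn.ReLU(),
                                   nn.Linear(hidden, hidden), nn.ReLU())
        self.pos_offset = nn.Linear(hidden, 3)   # offset from the sampled point
        self.log_scale  = nn.Linear(hidden, 3)   # per-axis scale in log-space
        self.rotation   = nn.Linear(hidden, 4)   # quaternion, normalised below
        self.opacity    = nn.Linear(hidden, 1)
        self.color      = nn.Linear(hidden, 3)

    def forward(self, feats: torch.Tensor) -> dict:
        h = self.trunk(feats)
        return {
            "xyz_offset": self.pos_offset(h),
            "scale": torch.exp(self.log_scale(h)),                    # keep scales positive
            "rotation": nn.functional.normalize(self.rotation(h), dim=-1),
            "opacity": torch.sigmoid(self.opacity(h)),
            "rgb": torch.sigmoid(self.color(h)),
        }

decoder = GaussianDecoderSketch()
attrs = decoder(torch.randn(4096, 256))   # 4096 candidate Gaussians
print({k: tuple(v.shape) for k, v in attrs.items()})
```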
+
+ comment: CVPRW +
+
+
+
+
+ + ☆ PyTorchGeoNodes: Enabling Differentiable Shape Programs for 3D Shape + Reconstruction + + +
+ We propose PyTorchGeoNodes, a differentiable module for reconstructing 3D +objects from images using interpretable shape programs. In comparison to +traditional CAD model retrieval methods, the use of shape programs for 3D +reconstruction allows for reasoning about the semantic properties of +reconstructed objects, editing, low memory footprint, etc. However, the +utilization of shape programs for 3D scene understanding has been largely +neglected in past works. As our main contribution, we enable gradient-based +optimization by introducing a module that translates shape programs designed in +Blender, for example, into efficient PyTorch code. We also provide a method +that relies on PyTorchGeoNodes and is inspired by Monte Carlo Tree Search +(MCTS) to jointly optimize discrete and continuous parameters of shape programs +and reconstruct 3D objects for input scenes. In our experiments, we apply our +algorithm to reconstruct 3D objects in the ScanNet dataset and evaluate our +results against CAD model retrieval-based reconstructions. Our experiments +indicate that our reconstructions match well the input scenes while enabling +semantic reasoning about reconstructed objects. + +
+
+ comment: In Submission +
+
+
+
+
+ + ☆ Private Attribute Inference from Images with Vision-Language Models + + +
+ As large language models (LLMs) become ubiquitous in our daily tasks and +digital interactions, associated privacy risks are increasingly in focus. While +LLM privacy research has primarily focused on the leakage of model training +data, it has recently been shown that the increase in models' capabilities has +enabled LLMs to make accurate privacy-infringing inferences from previously +unseen texts. With the rise of multimodal vision-language models (VLMs), +capable of understanding both images and text, a pertinent question is whether +such results transfer to the previously unexplored domain of benign images +posted online. To investigate the risks associated with the image reasoning +capabilities of newly emerging VLMs, we compile an image dataset with +human-annotated labels of the image owner's personal attributes. In order to +understand the additional privacy risk posed by VLMs beyond traditional human +attribute recognition, our dataset consists of images where the inferable +private attributes do not stem from direct depictions of humans. On this +dataset, we evaluate the inferential capabilities of 7 state-of-the-art VLMs, +finding that they can infer various personal attributes at up to 77.6% +accuracy. Concerningly, we observe that accuracy scales with the general +capabilities of the models, implying that future models can be misused as +stronger adversaries, establishing an imperative for the development of +adequate defenses. + +
+
+
+
+
+ + ☆ Enhancing 3D Fidelity of Text-to-3D using Cross-View Correspondences CVPR 2024 + + +
+ Leveraging multi-view diffusion models as priors for 3D optimization has +alleviated the problem of 3D consistency, e.g., the Janus face problem or the +content drift problem, in zero-shot text-to-3D models. However, the 3D +geometric fidelity of the output remains an unresolved issue; although the +rendered 2D views are realistic, the underlying geometry may contain errors +such as unreasonable concavities. In this work, we propose CorrespondentDream, +an effective method to leverage annotation-free, cross-view correspondences +yielded from the diffusion U-Net to provide additional 3D prior to the NeRF +optimization process. We find that these correspondences are strongly +consistent with human perception, and by adopting them in our loss design, we +are able to produce NeRF models with geometries that are more coherent with +common sense, e.g., smoother object surfaces, yielding higher 3D fidelity. We +demonstrate the efficacy of our approach through various comparative +qualitative results and a solid user study. + +&#13;
+
+ comment: 25 pages, 22 figures, accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Intra-operative tumour margin evaluation in breast-conserving surgery + with deep learning + + +
+ A positive margin may result in an increased risk of local recurrences after +breast retention surgery for any malignant tumour. Offering surgeons real-time +intra-operative information on the presence of positive resection margins would +help reduce the number of positive margins. This study aims to design an +intra-operative tumour margin evaluation scheme by using specimen mammography +in breast-conserving surgery. A total of 30 cases were evaluated and compared +with contours manually determined by experienced physicians and with the +pathology reports. The proposed method utilizes image thresholding to extract +regions of interest and then applies a deep learning model, i.e. SegNet, to +segment the tumour tissue. The width of the margin of normal tissue surrounding +the tumour is then evaluated. The desired size of the margin around the tumour +was set to 10 mm. The smallest average difference from the manually sketched +margin was 6.53 mm +- 5.84 mm. In all cases, the SegNet architecture was +utilized to obtain the tissue specimen boundary and the tumour contour, +respectively. The simulation results indicated that this technology is helpful +in discriminating positive from negative margins in the intra-operative +setting. The proposed scheme is intended as a potential procedure for the +intra-operative measurement system. The experimental results reveal that deep +learning techniques can produce results that are consistent with pathology +reports. + +&#13;
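The margin-width measurement can be sketched as a distance-transform computation: given a binary specimen mask and a tumour mask (the tumour mask here stands in for the SegNet output), the smallest distance from the tumour boundary to the specimen edge gives the margin in millimetres. The masks and pixel spacing below are synthetic placeholders, not study data.

```python
import numpy as np
from scipy.ndimage import binary_erosion, distance_transform_edt

def margin_width_mm(specimen: np.ndarray, tumour: np.ndarray, mm_per_px: float) -> float:
    """Minimum distance from the tumour boundary to the specimen edge, in mm."""
    # Distance of every specimen pixel to the nearest background pixel (the specimen edge).
    dist_to_edge = distance_transform_edt(specimen)
    # Tumour boundary = tumour pixels that disappear after one erosion step.
    boundary = tumour & ~binary_erosion(tumour)
    return float(dist_to_edge[boundary].min() * mm_per_px)

# Synthetic example: circular specimen with an off-centre tumour.
yy, xx = np.mgrid[:200, :200]
specimen = (yy - 100) ** 2 + (xx - 100) ** 2 < 90 ** 2
tumour   = (yy - 100) ** 2 + (xx - 130) ** 2 < 20 ** 2
print(f"margin = {margin_width_mm(specimen, tumour, mm_per_px=0.2):.1f} mm")
```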
+
+ comment: 1 pages, 6 figures and 2 tables +
+
+
+
+
+ + ☆ Automated Evaluation of Large Vision-Language Models on Self-driving + Corner Cases + + +
+ Large Vision-Language Models (LVLMs), due to the remarkable visual reasoning +ability to understand images and videos, have received widespread attention in +the autonomous driving domain, which significantly advances the development of +interpretable end-to-end autonomous driving. However, current evaluations of +LVLMs primarily focus on the multi-faceted capabilities in common scenarios, +lacking quantifiable and automated assessment in autonomous driving contexts, +let alone severe road corner cases that even the state-of-the-art autonomous +driving perception systems struggle to handle. In this paper, we propose +CODA-LM, a novel vision-language benchmark for self-driving, which provides the +first automatic and quantitative evaluation of LVLMs for interpretable +autonomous driving including general perception, regional perception, and +driving suggestions. CODA-LM utilizes the texts to describe the road images, +exploiting powerful text-only large language models (LLMs) without image inputs +to assess the capabilities of LVLMs in autonomous driving scenarios, which +reveals stronger alignment with human preferences than LVLM judges. Experiments +demonstrate that even the closed-sourced commercial LVLMs like GPT-4V cannot +deal with road corner cases well, suggesting that we are still far from a +strong LVLM-powered intelligent driving agent, and we hope our CODA-LM can +become the catalyst to promote future development. + +
+
+ comment: Project Page: https://coda-dataset.github.io/coda-lm/ +
+
+
+
+
+ + ☆ Do Counterfactual Examples Complicate Adversarial Training? CVPR'24 + + +
+ We leverage diffusion models to study the robustness-performance tradeoff of +robust classifiers. Our approach introduces a simple, pretrained diffusion +method to generate low-norm counterfactual examples (CEs): semantically altered +data which results in different true class membership. We report that the +confidence and accuracy of robust models on their clean training data are +associated with the proximity of the data to their CEs. Moreover, robust models +perform very poorly when evaluated on the CEs directly, as they become +increasingly invariant to the low-norm, semantic changes brought by CEs. The +results indicate a significant overlap between non-robust and semantic +features, countering the common assumption that non-robust features are not +interpretable. + +
+
+ comment: Accepted as a short paper to the GCV Workshop at CVPR'24 +
+
+
+
+
+ + ☆ ReWiTe: Realistic Wide-angle and Telephoto Dual Camera Fusion Dataset + via Beam Splitter Camera Rig + + +
+ The fusion of images from dual camera systems featuring a wide-angle and a +telephoto camera has become a hotspot problem recently. By integrating +simultaneously captured wide-angle and telephoto images from these systems, the +resulting fused image achieves a wide field of view (FOV) coupled with +high-definition quality. Existing approaches are mostly deep learning methods, +and predominantly rely on supervised learning, where the training dataset plays +a pivotal role. However, current datasets typically adopt a data synthesis +approach to generate input pairs of wide-angle and telephoto images alongside +ground-truth images. Notably, the wide-angle inputs are synthesized rather than +captured using real wide-angle cameras, and the ground-truth image is captured +by a wide-angle camera, whose quality is substantially lower than that of input +telephoto images captured by telephoto cameras. To address these limitations, +we introduce a novel hardware setup utilizing a beam splitter to simultaneously +capture three images, i.e. input pairs and ground-truth images, from two +authentic cellphones equipped with wide-angle and telephoto dual cameras. +Specifically, the wide-angle and telephoto images captured by cellphone 2 serve +as the input pair, while the telephoto image captured by cellphone 1, which is +calibrated to match the optical path of the wide-angle image from cellphone 2, +serves as the ground-truth image, maintaining quality on par with the input +telephoto image. Experiments validate that our newly introduced dataset, named +ReWiTe, significantly enhances the performance of various existing methods for +real-world wide-angle and telephoto dual image fusion tasks. + +&#13;
+
+
+
+
+ + ☆ EMC$^2$: Efficient MCMC Negative Sampling for Contrastive Learning with + Global Convergence + + +
+ A key challenge in contrastive learning is to generate negative samples from +a large sample set to contrast with positive samples, for learning better +encoding of the data. These negative samples often follow a softmax +distribution which are dynamically updated during the training process. +However, sampling from this distribution is non-trivial due to the high +computational costs in computing the partition function. In this paper, we +propose an Efficient Markov Chain Monte Carlo negative sampling method for +Contrastive learning (EMC$^2$). We follow the global contrastive learning loss +as introduced in SogCLR, and propose EMC$^2$ which utilizes an adaptive +Metropolis-Hastings subroutine to generate hardness-aware negative samples in +an online fashion during the optimization. We prove that EMC$^2$ finds an +$\mathcal{O}(1/\sqrt{T})$-stationary point of the global contrastive loss in +$T$ iterations. Compared to prior works, EMC$^2$ is the first algorithm that +exhibits global convergence (to stationarity) regardless of the choice of batch +size while exhibiting low computation and memory cost. Numerical experiments +validate that EMC$^2$ is effective with small batch training and achieves +comparable or better performance than baseline algorithms. We report the +results for pre-training image encoders on STL-10 and Imagenet-100. + +
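The core subroutine is a Metropolis-Hastings walk over candidate negatives whose unnormalised target is the softmax of similarity to the anchor, so the partition function is never computed. Below is a minimal NumPy sketch of that idea only, not the authors' implementation; the temperature, step count, and uniform proposal are placeholder choices.

```python
import numpy as np

def mh_negative_sample(anchor: np.ndarray, bank: np.ndarray, steps: int = 20,
                       tau: float = 0.1, seed: int = 0) -> int:
    """Metropolis-Hastings over indices of `bank`, targeting p(i) ~ exp(sim_i / tau).
    Only unnormalised weights are needed, so the softmax partition function never appears."""
    rng = np.random.default_rng(seed)
    sims = bank @ anchor                      # cosine similarities if rows are L2-normalised
    current = rng.integers(len(bank))
    for _ in range(steps):
        proposal = rng.integers(len(bank))    # simple uniform proposal over the sample bank
        accept_ratio = np.exp((sims[proposal] - sims[current]) / tau)
        if rng.random() < min(1.0, accept_ratio):
            current = proposal
    return int(current)                       # index of a hardness-aware negative

rng = np.random.default_rng(1)
bank = rng.normal(size=(1024, 64))
bank /= np.linalg.norm(bank, axis=1, keepdims=True)
print("sampled negative index:", mh_negative_sample(bank[0], bank))
```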
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ Uncertainty-guided Open-Set Source-Free Unsupervised Domain Adaptation + with Target-private Class Segregation + + +
+ Standard Unsupervised Domain Adaptation (UDA) aims to transfer knowledge from +a labeled source domain to an unlabeled target but usually requires +simultaneous access to both source and target data. Moreover, UDA approaches +commonly assume that source and target domains share the same label space. +Yet, these two assumptions are hardly satisfied in real-world scenarios. This +paper considers the more challenging Source-Free Open-set Domain Adaptation +(SF-OSDA) setting, where both assumptions are dropped. We propose a novel +approach for SF-OSDA that exploits the granularity of target-private categories +by segregating their samples into multiple unknown classes. Starting from an +initial clustering-based assignment, our method progressively improves the +segregation of target-private samples by refining their pseudo-labels with the +guidance of an uncertainty-based sample selection module. Additionally, we +propose a novel contrastive loss, named NL-InfoNCELoss, that integrates +negative learning into self-supervised contrastive learning, enhancing the +model's robustness to noisy pseudo-labels. Extensive experiments on benchmark +datasets demonstrate the superiority of the proposed method over existing +approaches, establishing new state-of-the-art performance. Notably, additional +analyses show that our method is able to learn the underlying semantics of +novel classes, opening up the possibility of performing novel class discovery. + +&#13;
+
+
+
+
+ + ☆ Label merge-and-split: A graph-colouring approach for memory-efficient + brain parcellation + + +
+ Whole brain parcellation requires inferring hundreds of segmentation labels +in large image volumes and thus presents significant practical challenges for +deep learning approaches. We introduce label merge-and-split, a method that +first greatly reduces the effective number of labels required for +learning-based whole brain parcellation and then recovers original labels. +Using a greedy graph colouring algorithm, our method automatically groups and +merges multiple spatially separate labels prior to model training and +inference. The merged labels may be semantically unrelated. A deep learning +model is trained to predict merged labels. At inference time, original labels +are restored using atlas-based influence regions. In our experiments, the +proposed approach reduces the number of labels by up to 68% while achieving +segmentation accuracy comparable to the baseline method without label merging +and splitting. Moreover, model training and inference times as well as GPU +memory requirements were reduced significantly. The proposed method can be +applied to all semantic segmentation tasks with a large number of spatially +separate classes within an atlas-based prior. + +
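A hedged sketch of the merge step: build a conflict graph in which two labels are connected if their influence regions overlap (so they must never share a merged label), greedily colour that graph, and map each colour to one merged label. The overlap test and the toy atlas regions below are placeholders for the atlas-based influence regions described in the abstract.

```python
import numpy as np

def greedy_colour(conflicts: dict[int, set[int]]) -> dict[int, int]:
    """Greedy graph colouring: labels sharing an edge never receive the same colour,
    so spatially conflicting labels are never merged together."""
    colour: dict[int, int] = {}
    for node in sorted(conflicts, key=lambda n: -len(conflicts[n])):  # largest degree first
        used = {colour[nb] for nb in conflicts[node] if nb in colour}
        colour[node] = next(c for c in range(len(conflicts)) if c not in used)
    return colour

def merge_map(regions: dict[int, np.ndarray]) -> dict[int, int]:
    """regions: original label -> boolean influence-region mask (toy stand-in)."""
    labels = list(regions)
    conflicts = {l: set() for l in labels}
    for i, a in enumerate(labels):
        for b in labels[i + 1:]:
            if (regions[a] & regions[b]).any():   # overlapping regions conflict
                conflicts[a].add(b)
                conflicts[b].add(a)
    return greedy_colour(conflicts)               # original label -> merged label

# Toy example: three labels, two of which overlap spatially.
m = np.zeros((10, 10), bool)
r1, r2, r3 = m.copy(), m.copy(), m.copy()
r1[:5] = True; r2[3:8] = True; r3[8:] = True
print(merge_map({1: r1, 2: r2, 3: r3}))   # labels 1 and 2 receive different merged labels
```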
+
+
+
+
+ + ☆ CMU-Flownet: Exploring Point Cloud Scene Flow Estimation in Occluded + Scenario + + +
+ Occlusions hinder point cloud frame alignment in LiDAR data, a challenge +inadequately addressed by scene flow models tested mainly on occlusion-free +datasets. Attempts to integrate occlusion handling within networks often suffer +accuracy issues due to two main limitations: a) the inadequate use of occlusion +information, often merging it with flow estimation without an effective +integration strategy, and b) reliance on distance-weighted upsampling that +falls short in correcting occlusion-related errors. To address these +challenges, we introduce the Correlation Matrix Upsampling Flownet +(CMU-Flownet), incorporating an occlusion estimation module within its cost +volume layer, alongside an Occlusion-aware Cost Volume (OCV) mechanism. +Specifically, we propose an enhanced upsampling approach that expands the +sensory field of the sampling process and integrates a Correlation Matrix +designed to evaluate point-level similarity. Meanwhile, our model robustly +integrates occlusion data within the context of scene flow, deploying this +information strategically during the refinement phase of the flow estimation. +The efficacy of this approach is demonstrated through subsequent experimental +validation. Empirical assessments reveal that CMU-Flownet establishes +state-of-the-art performance on the occluded FlyingThings3D and +KITTI datasets, surpassing previous methodologies across a majority of +evaluated metrics. + +&#13;
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Classification of Prostate Cancer in 3D Magnetic Resonance Imaging Data + based on Convolutional Neural Networks + + +
+ Prostate cancer is a commonly diagnosed cancerous disease among men +world-wide. Even with modern technology such as multi-parametric magnetic +resonance tomography and guided biopsies, the process for diagnosing prostate +cancer remains time consuming and requires highly trained professionals. In +this paper, different convolutional neural networks (CNN) are evaluated on +their abilities to reliably classify whether an MRI sequence contains malignant +lesions. Implementations of a ResNet, a ConvNet and a ConvNeXt for 3D image +data are trained and evaluated. The models are trained using different data +augmentation techniques, learning rates, and optimizers. The data is taken from +a private dataset, provided by Cantonal Hospital Aarau. The best result was +achieved by a ResNet3D, yielding an average precision score of 0.4583 and AUC +ROC score of 0.6214. + +
+
+ comment: Previous version published in Buzug T.M., Handels H., M\"uller S., + H\"ubner C., Mertins A., Rostalski P.: Student Conference Proceedings 2023, + Infinite Science Publishing, 2023 (ISBN/EAN 978-3-945954-72-0). 7 pages, 2 + figures +
+
+
+
+
+ + ☆ SPVLoc: Semantic Panoramic Viewport Matching for 6D Camera Localization + in Unseen Environments + + +
+ In this paper, we present SPVLoc, a global indoor localization method that +accurately determines the six-dimensional (6D) camera pose of a query image and +requires minimal scene-specific prior knowledge and no scene-specific training. +Our approach employs a novel matching procedure to localize the perspective +camera's viewport, given as an RGB image, within a set of panoramic semantic +layout representations of the indoor environment. The panoramas are rendered +from an untextured 3D reference model, which only comprises approximate +structural information about room shapes, along with door and window +annotations. We demonstrate that a straightforward convolutional network +structure can successfully achieve image-to-panorama and ultimately +image-to-model matching. Through a viewport classification score, we rank +reference panoramas and select the best match for the query image. Then, a 6D +relative pose is estimated between the chosen panorama and query image. Our +experiments demonstrate that this approach not only efficiently bridges the +domain gap but also generalizes well to previously unseen scenes that are not +part of the training data. Moreover, it achieves superior localization accuracy +compared to the state of the art methods and also estimates more degrees of +freedom of the camera pose. We will make our source code publicly available at +https://github.com/fraunhoferhhi/spvloc . + +
+
+ comment: This submission includes the paper and supplementary material. 24 + pages, 11 figures +
+
+
+
+
+ + ☆ MobileNetV4 - Universal Models for the Mobile Ecosystem + + +
+ We present the latest generation of MobileNets, known as MobileNetV4 (MNv4), +featuring universally efficient architecture designs for mobile devices. At its +core, we introduce the Universal Inverted Bottleneck (UIB) search block, a +unified and flexible structure that merges Inverted Bottleneck (IB), ConvNext, +Feed Forward Network (FFN), and a novel Extra Depthwise (ExtraDW) variant. +Alongside UIB, we present Mobile MQA, an attention block tailored for mobile +accelerators, delivering a significant 39% speedup. An optimized neural +architecture search (NAS) recipe is also introduced which improves MNv4 search +effectiveness. The integration of UIB, Mobile MQA and the refined NAS recipe +results in a new suite of MNv4 models that are mostly Pareto optimal across +mobile CPUs, DSPs, GPUs, as well as specialized accelerators like Apple Neural +Engine and Google Pixel EdgeTPU - a characteristic not found in any other +models tested. Finally, to further boost accuracy, we introduce a novel +distillation technique. Enhanced by this technique, our MNv4-Hybrid-Large model +delivers 87% ImageNet-1K accuracy, with a Pixel 8 EdgeTPU runtime of just +3.8ms. + +
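The UIB is described as unifying IB, ConvNext, FFN, and ExtraDW through optional depthwise convolutions; the released architecture has its own details, but a plausible PyTorch rendering of that configurable idea looks like the sketch below. The placements, defaults, and normalisation choices are my assumptions, not the official MobileNetV4 code.

```python
import torch
import torch.nn as nn

class UIBSketch(nn.Module):
    """Configurable inverted bottleneck: optional depthwise convs before and after the
    pointwise expansion. Roughly, (start_dw, mid_dw) = (False, True) ~ classic IB,
    (True, False) ~ ConvNext-like, (False, False) ~ FFN, (True, True) ~ ExtraDW.
    This is an interpretation of the abstract, not the released block."""
    def __init__(self, dim: int, expand: int = 4, start_dw: bool = True, mid_dw: bool = True):
        super().__init__()
        hidden = dim * expand
        self.start_dw = nn.Conv2d(dim, dim, 3, padding=1, groups=dim) if start_dw else nn.Identity()
        self.pw_expand = nn.Sequential(nn.Conv2d(dim, hidden, 1), nn.BatchNorm2d(hidden), nn.ReLU())
        self.mid_dw = nn.Conv2d(hidden, hidden, 3, padding=1, groups=hidden) if mid_dw else nn.Identity()
        self.pw_project = nn.Sequential(nn.Conv2d(hidden, dim, 1), nn.BatchNorm2d(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + self.pw_project(self.mid_dw(self.pw_expand(self.start_dw(x))))

block = UIBSketch(dim=64, start_dw=True, mid_dw=True)   # "ExtraDW"-style variant
print(block(torch.randn(1, 64, 32, 32)).shape)
```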
+
+
+
+
+ + ☆ Self-Supervised Visual Preference Alignment + + +
+ This paper makes the first attempt towards unsupervised preference alignment +in Vision-Language Models (VLMs). We generate chosen and rejected responses +with regard to the original and augmented image pairs, and conduct preference +alignment with direct preference optimization. It is based on a core idea: +properly designed augmentation to the image input will induce VLM to generate +false but hard negative responses, which helps the model to learn from and +produce more robust and powerful answers. The whole pipeline no longer hinges +on supervision from GPT4 or human involvement during alignment, and is highly +efficient with few lines of code. With only 8k randomly sampled unsupervised +data, it achieves 90\% relative score to GPT-4 on complex reasoning in +LLaVA-Bench, and improves LLaVA-7B/13B by 6.7\%/5.6\% score on complex +multi-modal benchmark MM-Vet. Visualizations shows its improved ability to +align with user-intentions. A series of ablations are firmly conducted to +reveal the latent mechanism of the approach, which also indicates its potential +towards further scaling. Code will be available. + +
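The alignment step uses direct preference optimization, treating the response to the original image as "chosen" and the response induced by the augmented image as "rejected". The standard DPO objective, given summed log-probabilities from the policy and a frozen reference model, can be written as below; the beta value and the toy inputs are placeholders.

```python
import torch
import torch.nn.functional as F

def dpo_loss(policy_chosen_logp: torch.Tensor, policy_rejected_logp: torch.Tensor,
             ref_chosen_logp: torch.Tensor, ref_rejected_logp: torch.Tensor,
             beta: float = 0.1) -> torch.Tensor:
    """Standard DPO objective: push the policy's margin between chosen and rejected
    responses above the frozen reference model's margin."""
    chosen_ratio = policy_chosen_logp - ref_chosen_logp
    rejected_ratio = policy_rejected_logp - ref_rejected_logp
    return -F.logsigmoid(beta * (chosen_ratio - rejected_ratio)).mean()

# Toy batch of summed token log-probs; in practice these come from VLM forward passes
# on (image, response) and (augmented image, response) pairs.
lp = lambda: torch.randn(8)
print(dpo_loss(lp(), lp(), lp(), lp()))
```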
+
+
+
+
+ + ☆ Robust Noisy Label Learning via Two-Stream Sample Distillation + + +
+ Noisy label learning aims to learn robust networks under the supervision of +noisy labels, which plays a critical role in deep learning. Existing work +either conducts sample selection or label correction to deal with noisy labels +during the model training process. In this paper, we design a simple yet +effective sample selection framework, termed Two-Stream Sample Distillation +(TSSD), for noisy label learning, which can extract more high-quality samples +with clean labels to improve the robustness of network training. Firstly, a +novel Parallel Sample Division (PSD) module is designed to generate a certain +training set with sufficient reliable positive and negative samples by jointly +considering the sample structure in feature space and the human prior in loss +space. Secondly, a novel Meta Sample Purification (MSP) module is further +designed to mine adequate semi-hard samples from the remaining uncertain +training set by learning a strong meta classifier with extra golden data. As a +result, more and more high-quality samples will be distilled from the noisy +training set to train networks robustly in every iteration. Extensive +experiments on four benchmark datasets, including CIFAR-10, CIFAR-100, +Tiny-ImageNet, and Clothing-1M, show that our method has achieved +state-of-the-art results over its competitors. + +
+
+
+
+
+ + ☆ LAECIPS: Large Vision Model Assisted Adaptive Edge-Cloud Collaboration + for IoT-based Perception System + + +
+ Recent large vision models (e.g., SAM) enjoy great potential to facilitate +intelligent perception with high accuracy. Yet, the resource constraints in the +IoT environment tend to limit such large vision models to be locally deployed, +incurring considerable inference latency thereby making it difficult to support +real-time applications, such as autonomous driving and robotics. Edge-cloud +collaboration with large-small model co-inference offers a promising approach +to achieving high inference accuracy and low latency. However, existing +edge-cloud collaboration methods are tightly coupled with the model +architecture and cannot adapt to the dynamic data drifts in heterogeneous IoT +environments. To address the issues, we propose LAECIPS, a new edge-cloud +collaboration framework. In LAECIPS, both the large vision model on the cloud +and the lightweight model on the edge are plug-and-play. We design an +edge-cloud collaboration strategy based on hard input mining, optimized for +both high accuracy and low latency. We propose to update the edge model and its +collaboration strategy with the cloud under the supervision of the large vision +model, so as to adapt to the dynamic IoT data streams. Theoretical analysis of +LAECIPS proves its feasibility. Experiments conducted in a robotic semantic +segmentation system using real-world datasets show that LAECIPS outperforms its +state-of-the-art competitors in accuracy, latency, and communication overhead +while having better adaptability to dynamic environments. + +
+
+
+
+
+ + ☆ Teaching Chinese Sign Language with Feedback in Mixed Reality + + +
+ Traditional sign language teaching methods face challenges such as limited +feedback and diverse learning scenarios. 2D resources lack real-time feedback, +while classroom teaching is constrained by a scarcity of teachers. Methods +based on VR and AR have relatively primitive interaction feedback mechanisms. +This study proposes an innovative teaching model that uses real-time monocular +vision and mixed reality technology. First, we introduce an improved +hand-posture reconstruction method to achieve sign language semantic retention +and real-time feedback. Second, a ternary system evaluation algorithm is +proposed for a comprehensive assessment, maintaining good consistency with +sign language experts. Furthermore, we use mixed reality technology to +construct a scenario-based 3D sign language classroom and explore the user +experience of scenario teaching. Overall, this paper presents a novel teaching +method that provides an immersive learning experience, advanced posture +reconstruction, and precise feedback, achieving positive feedback on user +experience and learning effectiveness. + +&#13;
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ AbsGS: Recovering Fine Details for 3D Gaussian Splatting + + +
+ 3D Gaussian Splatting (3D-GS) technique couples 3D Gaussian primitives with +differentiable rasterization to achieve high-quality novel view synthesis +results while providing advanced real-time rendering performance. However, due +to the flaw of its adaptive density control strategy in 3D-GS, it frequently +suffers from over-reconstruction issue in intricate scenes containing +high-frequency details, leading to blurry rendered images. The underlying +reason for the flaw has still been under-explored. In this work, we present a +comprehensive analysis of the cause of aforementioned artifacts, namely +gradient collision, which prevents large Gaussians in over-reconstructed +regions from splitting. To address this issue, we propose the novel +homodirectional view-space positional gradient as the criterion for +densification. Our strategy efficiently identifies large Gaussians in +over-reconstructed regions, and recovers fine details by splitting. We evaluate +our proposed method on various challenging datasets. The experimental results +indicate that our approach achieves the best rendering quality with reduced or +similar memory consumption. Our method is easy to implement and can be +incorporated into a wide variety of most recent Gaussian Splatting-based +methods. We will open source our codes upon formal publication. Our project +page is available at: https://ty424.github.io/AbsGS.github.io/ + +
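The proposed criterion replaces the signed sum of per-pixel view-space gradients, which can cancel out (the "gradient collision"), with an accumulation of absolute values. The tiny NumPy illustration below only shows why the two criteria disagree for a large Gaussian that receives opposing gradients; the numbers are made up and this is not the paper's code.

```python
import numpy as np

# Hypothetical per-pixel view-space positional gradients contributed by one large Gaussian.
pixel_grads = np.array([[0.9, 0.0], [-0.8, 0.0], [0.7, 0.0], [-0.9, 0.0]])

signed_criterion = np.linalg.norm(pixel_grads.sum(axis=0))           # opposing gradients cancel
homodirectional  = np.linalg.norm(np.abs(pixel_grads).sum(axis=0))   # magnitudes accumulate

print(f"signed-sum norm: {signed_criterion:.2f}, abs-sum norm: {homodirectional:.2f}")
# The signed criterion stays tiny, so a vanilla densification rule never splits this Gaussian;
# the absolute (homodirectional) criterion flags it for splitting.
```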
+
+
+
+
+ + ☆ Efficient optimal dispersed Haar-like filters for face detection + + +
+ This paper introduces a new dispersed Haar-like filter for efficient face +detection. The basic idea for finding the filter is to maximise the +between-class variance and minimise the within-class variance of the filter +responses. The proposed filters can be considered an optimal configuration of +dispersed Haar-like filters, i.e. filters with disjoint black and white parts. + +&#13;
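The stated selection criterion (maximise between-class variance, minimise within-class variance of the filter response) amounts to a Fisher-style score. A minimal sketch for scoring one candidate dispersed filter on face and non-face patches follows; the data and the random candidate filter are placeholders, and the optimisation over filter configurations is omitted.

```python
import numpy as np

def fisher_score(filt: np.ndarray, faces: np.ndarray, non_faces: np.ndarray) -> float:
    """Score a dispersed +1/-1 (white/black) filter by between-class over within-class
    variance of its responses; a higher score means a more discriminative feature."""
    r_face = faces.reshape(len(faces), -1) @ filt.ravel()
    r_bg = non_faces.reshape(len(non_faces), -1) @ filt.ravel()
    between = (r_face.mean() - r_bg.mean()) ** 2
    within = r_face.var() + r_bg.var()
    return float(between / (within + 1e-12))

rng = np.random.default_rng(0)
faces = rng.normal(0.6, 0.1, (200, 24, 24))        # toy stand-ins for face patches
non_faces = rng.normal(0.4, 0.2, (200, 24, 24))    # toy stand-ins for background patches
filt = rng.choice([-1.0, 0.0, 1.0], size=(24, 24), p=[0.1, 0.8, 0.1])  # disjoint black/white cells
print(f"Fisher score of candidate filter: {fisher_score(filt, faces, non_faces):.3f}")
```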
+
+
+
+
+ + ☆ Toward a Realistic Benchmark for Out-of-Distribution Detection + + +
+ Deep neural networks are increasingly used in a wide range of technologies +and services, but remain highly susceptible to out-of-distribution (OOD) +samples, that is, drawn from a different distribution than the original +training set. A common approach to address this issue is to endow deep neural +networks with the ability to detect OOD samples. Several benchmarks have been +proposed to design and validate OOD detection techniques. However, many of them +are based on far-OOD samples drawn from very different distributions, and thus +lack the complexity needed to capture the nuances of real-world scenarios. In +this work, we introduce a comprehensive benchmark for OOD detection, based on +ImageNet and Places365, that assigns individual classes as in-distribution or +out-of-distribution depending on the semantic similarity with the training set. +Several techniques can be used to determine which classes should be considered +in-distribution, yielding benchmarks with varying properties. Experimental +results on different OOD detection techniques show how their measured efficacy +depends on the selected benchmark and how confidence-based techniques may +outperform classifier-based ones on near-OOD samples. + +
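The benchmark assigns whole classes to in- or out-of-distribution according to their semantic similarity to the training classes. A hedged sketch of one such assignment rule using class-name embeddings follows; the embeddings, the cosine-similarity measure, and the threshold are placeholders, and the paper may use a different similarity criterion.

```python
import numpy as np

def split_classes(candidates: dict[str, np.ndarray], train: dict[str, np.ndarray],
                  threshold: float = 0.5) -> tuple[list[str], list[str]]:
    """Assign each candidate class to ID or OOD by its best cosine similarity to any
    training-class embedding. A hypothetical rule, not the paper's exact protocol."""
    train_mat = np.stack([v / np.linalg.norm(v) for v in train.values()])
    in_dist, out_dist = [], []
    for name, vec in candidates.items():
        sim = float((train_mat @ (vec / np.linalg.norm(vec))).max())
        (in_dist if sim >= threshold else out_dist).append(name)
    return in_dist, out_dist

# Toy embeddings standing in for class-name embeddings of ImageNet / Places365 classes.
rng = np.random.default_rng(0)
train = {c: rng.normal(size=64) for c in ["tabby_cat", "golden_retriever"]}
cand = {"siamese_cat": train["tabby_cat"] + 0.1 * rng.normal(size=64),
        "volcano": rng.normal(size=64)}
print(split_classes(cand, train))   # near-OOD classes land in-distribution, far-OOD ones do not
```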
+
+
+
+
+ + ☆ A Computer Vision-Based Quality Assessment Technique for the automatic + control of consumables for analytical laboratories + + +
+ The rapid growth of the Industry 4.0 paradigm is increasing the pressure to +develop effective automated monitoring systems. Artificial Intelligence (AI) is +a convenient tool to improve the efficiency of industrial processes while +reducing errors and waste. In fact, it allows the use of real-time data to +increase the effectiveness of monitoring systems, minimize errors, make the +production process more sustainable, and save costs. In this paper, a novel +automatic monitoring system is proposed in the context of production process of +plastic consumables used in analysis laboratories, with the aim to increase the +effectiveness of the control process currently performed by a human operator. +In particular, we considered the problem of classifying the presence or absence +of a transparent anticoagulant substance inside test tubes. Specifically, a +hand-designed deep network model is used and compared with some +state-of-the-art models for its ability to categorize different images of vials +that can be either filled with the anticoagulant or empty. Collected results +indicate that the proposed approach is competitive with state-of-the-art models +in terms of accuracy. Furthermore, we increased the complexity of the task by +training the models on the ability to discriminate not only the presence or +absence of the anticoagulant inside the vial, but also the size of the test +tube. The analysis performed in the latter scenario confirms the +competitiveness of our approach. Moreover, our model is remarkably superior in +terms of its generalization ability and requires significantly fewer resources. +These results suggest the possibility of successfully implementing such a model +in the production process of a plastic consumables company. + +
+
+ comment: 31 pages, 13 figures, 10 tables +
+
+
+
+
+ + ☆ 1st Place Solution for ICCV 2023 OmniObject3D Challenge: Sparse-View + Reconstruction + + +
+ In this report, we present the 1st place solution for ICCV 2023 OmniObject3D +Challenge: Sparse-View Reconstruction. The challenge aims to evaluate +approaches for novel view synthesis and surface reconstruction using only a few +posed images of each object. We utilize Pixel-NeRF as the basic model, and +apply depth supervision as well as coarse-to-fine positional encoding. The +experiments demonstrate the effectiveness of our approach in improving +sparse-view reconstruction quality. We ranked first in the final test with a +PSNR of 25.44614. + +
+
+
+
+
+ + ☆ The Unreasonable Effectiveness of Pre-Trained Features for Camera Pose + Refinement CVPR2024 + + +
+ Pose refinement is an interesting and practically relevant research +direction. Pose refinement can be used to (1) obtain a more accurate pose +estimate from an initial prior (e.g., from retrieval), (2) as pre-processing, +i.e., to provide a better starting point to a more expensive pose estimator, +(3) as post-processing of a more accurate localizer. Existing approaches focus +on learning features / scene representations for the pose refinement task. This +involves training an implicit scene representation or learning features while +optimizing a camera pose-based loss. A natural question is whether training +specific features / representations is truly necessary or whether similar +results can be already achieved with more generic features. In this work, we +present a simple approach that combines pre-trained features with a particle +filter and a renderable representation of the scene. Despite its simplicity, it +achieves state-of-the-art results, demonstrating that one can easily build a +pose refiner without the need for specific training. The code is at +https://github.com/ga1i13o/mcloc_poseref + +
+
+ comment: Accepted to CVPR2024 (Highlight) +
+
+
+
+
+ + ☆ Explainable concept mappings of MRI: Revealing the mechanisms underlying + deep learning-based brain disease classification + + +
+ Motivation. While recent studies show high accuracy in the classification of +Alzheimer's disease using deep neural networks, the underlying learned concepts +have not been investigated. + Goals. To systematically identify changes in brain regions through concepts +learned by the deep neural network for model validation. + Approach. Using quantitative R2* maps we separated Alzheimer's patients +(n=117) from normal controls (n=219) by using a convolutional neural network +and systematically investigated the learned concepts using Concept Relevance +Propagation and compared these results to a conventional region of +interest-based analysis. + Results. In line with established histological findings and the region of +interest-based analyses, highly relevant concepts were primarily found in and +adjacent to the basal ganglia. + Impact. The identification of concepts learned by deep neural networks for +disease classification enables validation of the models and could potentially +improve reliability. + +
+
+
+
+
+ + ☆ Camera clustering for scalable stream-based active distillation + + +
+ We present a scalable framework designed to craft efficient lightweight +models for video object detection utilizing self-training and knowledge +distillation techniques. We scrutinize methodologies for the ideal selection of +training images from video streams and the efficacy of model sharing across +numerous cameras. By advocating for a camera clustering methodology, we aim to +diminish the requisite number of models for training while augmenting the +distillation dataset. The findings affirm that proper camera clustering notably +amplifies the accuracy of distilled models, eclipsing the methodologies that +employ distinct models for each camera or a universal model trained on the +aggregate camera data. + +
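The clustering step can be sketched as: summarise each camera's stream with an averaged frame embedding, cluster the cameras, and train one distilled student per cluster on the pooled frames. The feature extraction is mocked below and the cluster count is a placeholder; this only illustrates the grouping idea, not the paper's pipeline.

```python
import numpy as np
from sklearn.cluster import KMeans

def cluster_cameras(per_camera_feats: dict[str, np.ndarray], n_clusters: int = 3) -> dict[str, int]:
    """per_camera_feats: camera id -> (num_frames, feat_dim) frame embeddings.
    Cameras assigned to the same cluster share one distilled student model."""
    cam_ids = list(per_camera_feats)
    summaries = np.stack([per_camera_feats[c].mean(axis=0) for c in cam_ids])  # one vector per camera
    labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit_predict(summaries)
    return dict(zip(cam_ids, labels.tolist()))

# Toy usage with random frame embeddings for 8 cameras drawn from 3 "scene types".
rng = np.random.default_rng(0)
feats = {f"cam{i}": rng.normal(loc=i % 3, size=(50, 128)) for i in range(8)}
print(cluster_cameras(feats))
```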
+
+ comment: This manuscript is currently under review at IEEE Transactions on + Circuits and Systems for Video Technology +
+
+
+
+
+ + ☆ Adversarial Identity Injection for Semantic Face Image Synthesis CVPR 2024 + + +
+ Nowadays, deep learning models have reached incredible performance in the +task of image generation. Plenty of literature works address the task of face +generation and editing, with human and automatic systems that struggle to +distinguish what's real from generated. Whereas most systems reached excellent +visual generation quality, they still face difficulties in preserving the +identity of the starting input subject. Among all the explored techniques, +Semantic Image Synthesis (SIS) methods, whose goal is to generate an image +conditioned on a semantic segmentation mask, are the most promising, even +though preserving the perceived identity of the input subject is not their main +concern. Therefore, in this paper, we investigate the problem of identity +preservation in face image generation and present an SIS architecture that +exploits a cross-attention mechanism to merge identity, style, and semantic +features to generate faces whose identities are as similar as possible to the +input ones. Experimental results reveal that the proposed method is not only +suitable for preserving the identity but is also effective in the face +recognition adversarial attack, i.e. hiding a second identity in the generated +faces. + +
+
+ comment: Paper accepted at CVPR 2024 Biometrics Workshop +
+
+
+
+
+ + ☆ Comprehensive Survey of Model Compression and Speed up for Vision + Transformers + + +
+ Vision Transformers (ViT) have marked a paradigm shift in computer vision, +outperforming state-of-the-art models across diverse tasks. However, their +practical deployment is hampered by high computational and memory demands. This +study addresses the challenge by evaluating four primary model compression +techniques: quantization, low-rank approximation, knowledge distillation, and +pruning. We methodically analyze and compare the efficacy of these techniques +and their combinations in optimizing ViTs for resource-constrained +environments. Our comprehensive experimental evaluation demonstrates that these +methods facilitate a balanced compromise between model accuracy and +computational efficiency, paving the way for wider application in edge +computing devices. + +
+
+
+
+
+ + ☆ Integration of Self-Supervised BYOL in Semi-Supervised Medical Image + Recognition CCS 2024 + + +
+ Image recognition techniques heavily rely on abundant labeled data, +particularly in medical contexts. Addressing the challenges associated with +obtaining labeled data has led to the prominence of self-supervised learning +and semi-supervised learning, especially in scenarios with limited annotated +data. In this paper, we proposed an innovative approach by integrating +self-supervised learning into semi-supervised models to enhance medical image +recognition. Our methodology commences with pre-training on unlabeled data +utilizing the BYOL method. Subsequently, we merge pseudo-labeled and labeled +datasets to construct a neural network classifier, refining it through +iterative fine-tuning. Experimental results on three different datasets +demonstrate that our approach optimally leverages unlabeled data, outperforming +existing methods in terms of accuracy for medical image recognition. + +
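The described pipeline is BYOL pre-training on unlabeled images, pseudo-labeling, then supervised fine-tuning on the merged set. The pseudo-label selection step is the part a snippet clarifies; the confidence threshold is a placeholder and the encoder/classifier below are random stand-ins for a BYOL-pretrained backbone, not the paper's models.

```python
import torch
import torch.nn as nn

@torch.no_grad()
def pseudo_label(classifier: nn.Module, unlabeled: torch.Tensor, threshold: float = 0.9):
    """Self-training style selection: keep only confidently predicted unlabeled images
    and return (selected images, their pseudo-labels) for the merged training set."""
    probs = classifier(unlabeled).softmax(dim=-1)
    conf, labels = probs.max(dim=-1)
    keep = conf >= threshold
    return unlabeled[keep], labels[keep]

# Stand-in classifier on top of a (hypothetically BYOL-pretrained) encoder.
encoder = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 64), nn.ReLU())
classifier = nn.Sequential(encoder, nn.Linear(64, 5))
images = torch.randn(16, 3, 32, 32)
x_pl, y_pl = pseudo_label(classifier, images, threshold=0.3)
print(f"kept {len(x_pl)} of {len(images)} unlabeled images")
```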
+
+ comment: Accepted by ICCS 2024 +
+
+
+
+
+ + ☆ Portrait3D: Text-Guided High-Quality 3D Portrait Generation Using + Pyramid Representation and GANs Prior + + +
+ Existing neural rendering-based text-to-3D-portrait generation methods +typically make use of human geometry prior and diffusion models to obtain +guidance. However, relying solely on geometry information introduces issues +such as the Janus problem, over-saturation, and over-smoothing. We present +Portrait3D, a novel neural rendering-based framework with a novel joint +geometry-appearance prior to achieve text-to-3D-portrait generation that +overcomes the aforementioned issues. To accomplish this, we train a 3D portrait +generator, 3DPortraitGAN-Pyramid, as a robust prior. This generator is capable +of producing 360{\deg} canonical 3D portraits, serving as a starting point for +the subsequent diffusion-based generation process. To mitigate the "grid-like" +artifact caused by the high-frequency information in the feature-map-based 3D +representation commonly used by most 3D-aware GANs, we integrate a novel +pyramid tri-grid 3D representation into 3DPortraitGAN-Pyramid. To generate 3D +portraits from text, we first project a randomly generated image aligned with +the given prompt into the pre-trained 3DPortraitGAN-Pyramid's latent space. The +resulting latent code is then used to synthesize a pyramid tri-grid. Beginning +with the obtained pyramid tri-grid, we use score distillation sampling to +distill the diffusion model's knowledge into the pyramid tri-grid. Following +that, we utilize the diffusion model to refine the rendered images of the 3D +portrait and then use these refined images as training data to further optimize +the pyramid tri-grid, effectively eliminating issues with unrealistic color and +unnatural artifacts. Our experimental results show that Portrait3D can produce +realistic, high-quality, and canonical 3D portraits that align with the prompt. + +
+
+
+
+
+ + ☆ CNN-based explanation ensembling for dataset, representation and + explanations evaluation + + +
+ Explainable Artificial Intelligence has gained significant attention due to +the widespread use of complex deep learning models in high-stakes domains such +as medicine, finance, and autonomous cars. However, different explanations +often present different aspects of the model's behavior. In this research +manuscript, we explore the potential of ensembling explanations generated by +deep classification models using a convolutional model. Through experimentation +and analysis, we aim to investigate the implications of combining explanations +to uncover more coherent and reliable patterns of the model's behavior, +leading to the possibility of evaluating the representation learned by the +model. With our method, we can uncover problems of under-representation of +images in a certain class. Moreover, we discuss other side benefits, such as +feature reduction by replacing the original image with its explanations, +resulting in the removal of some sensitive information. Through the use of +carefully selected evaluation metrics from the Quantus library, we demonstrated +the method's superior performance in terms of Localisation and Faithfulness, +compared to individual explanations. + +&#13;
+
+ comment: accepted at 2nd World Conference on eXplainable Artificial + Intelligence +
+
+
+
+
+ + ☆ Learning to Score Sign Language with Two-stage Method + + +
+ Human action recognition and performance assessment have been hot research +topics in recent years. Recognition problems have mature solutions in the field +of sign language, but past research in performance analysis has focused on +competitive sports and medical training, overlooking scoring assessment, +which is an important part of the digitalization of sign language teaching. In +this paper, we analyze the existing technologies for performance assessment and +adopt methods that perform well in human pose reconstruction tasks, combined +with motion rotation embedded expressions, to propose a two-stage sign language +performance evaluation pipeline. Our analysis shows that choosing +reconstruction tasks in the first stage can provide more expressive features, +and using smoothing methods can provide an effective reference for assessment. +Experiments show that our method provides good score feedback mechanisms and +high consistency with professional assessments compared to end-to-end +evaluations. + +&#13;
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ Second Edition FRCSyn Challenge at CVPR 2024: Face Recognition Challenge + in the Era of Synthetic Data + + +
+ Synthetic data is gaining increasing relevance for training machine learning +models. This is mainly motivated due to several factors such as the lack of +real data and intra-class variability, time and errors produced in manual +labeling, and in some cases privacy concerns, among others. This paper presents +an overview of the 2nd edition of the Face Recognition Challenge in the Era of +Synthetic Data (FRCSyn) organized at CVPR 2024. FRCSyn aims to investigate the +use of synthetic data in face recognition to address current technological +limitations, including data privacy concerns, demographic biases, +generalization to novel scenarios, and performance constraints in challenging +situations such as aging, pose variations, and occlusions. Unlike the 1st +edition, in which synthetic data from DCFace and GANDiffFace methods was only +allowed to train face recognition systems, in this 2nd edition we propose new +sub-tasks that allow participants to explore novel face generative methods. The +outcomes of the 2nd FRCSyn Challenge, along with the proposed experimental +protocol and benchmarking contribute significantly to the application of +synthetic data to face recognition. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2311.10476 +
+
+
+
+
+ + ☆ Know Yourself Better: Diverse Discriminative Feature Learning Improves + Open Set Recognition + + +
+ Open set recognition (OSR) is a critical aspect of machine learning, +addressing the challenge of detecting novel classes during inference. Within +the realm of deep learning, neural classifiers trained on a closed set of data +typically struggle to identify novel classes, leading to erroneous predictions. +To address this issue, various heuristic methods have been proposed, allowing +models to express uncertainty by stating "I don't know." However, a gap in the +literature remains, as there has been limited exploration of the underlying +mechanisms of these methods. In this paper, we conduct an analysis of open set +recognition methods, focusing on the aspect of feature diversity. Our research +reveals a significant correlation between learning diverse discriminative +features and enhancing OSR performance. Building on this insight, we propose a +novel OSR approach that leverages the advantages of feature diversity. The +efficacy of our method is substantiated through rigorous evaluation on a +standard OSR testbench, demonstrating a substantial improvement over +state-of-the-art methods. + +
+
+
+
+
+ + ☆ Improving Bracket Image Restoration and Enhancement with Flow-guided + Alignment and Enhanced Feature Aggregation + + +
+ In this paper, we address the Bracket Image Restoration and Enhancement +(BracketIRE) task using a novel framework, which requires restoring a +high-quality high dynamic range (HDR) image from a sequence of noisy, blurred, +and low dynamic range (LDR) multi-exposure RAW inputs. To overcome this +challenge, we present the IREANet, which improves the multiple exposure +alignment and aggregation with a Flow-guide Feature Alignment Module (FFAM) and +an Enhanced Feature Aggregation Module (EFAM). Specifically, the proposed FFAM +incorporates the inter-frame optical flow as guidance to facilitate the +deformable alignment and spatial attention modules for better feature +alignment. The EFAM further employs the proposed Enhanced Residual Block (ERB) +as a foundational component, wherein a unidirectional recurrent network +aggregates the aligned temporal features to better reconstruct the results. To +improve model generalization and performance, we additionally employ the Bayer +preserving augmentation (BayerAug) strategy to augment the multi-exposure RAW +inputs. Our experimental evaluations demonstrate that the proposed IREANet +shows state-of-the-art performance compared with previous methods. + +
+
+
+
+
+ + ☆ Optimization of Prompt Learning via Multi-Knowledge Representation for + Vision-Language Models + + +
+ Vision-Language Models (VLMs), such as CLIP, play a foundational role in +various cross-modal applications. To fully leverage VLMs' potential in adapting +to downstream tasks, context optimization methods like Prompt Tuning are +essential. However, one key limitation is the lack of diversity in prompt +templates, whether they are hand-crafted or learned through additional modules. +This limitation restricts the capabilities of pretrained VLMs and can result in +incorrect predictions in downstream tasks. To address this challenge, we +propose Context Optimization with Multi-Knowledge Representation (CoKnow), a +framework that enhances Prompt Learning for VLMs with rich contextual +knowledge. To facilitate CoKnow during inference, we trained lightweight +semantic knowledge mappers, which are capable of generating Multi-Knowledge +Representation for an input image without requiring additional priors. We +conducted extensive experiments on 11 publicly available datasets, +demonstrating that CoKnow outperforms a series of previous methods. We will +make all resources open-source: https://github.com/EMZucas/CoKnow. + +&#13;
+
+
+
+
+ + ☆ The Ninth NTIRE 2024 Efficient Super-Resolution Challenge Report CVPR + + +
+ This paper provides a comprehensive review of the NTIRE 2024 challenge, +focusing on efficient single-image super-resolution (ESR) solutions and their +outcomes. The task of this challenge is to super-resolve an input image with a +magnification factor of x4 based on pairs of low and corresponding +high-resolution images. The primary objective is to develop networks that +optimize various aspects such as runtime, parameters, and FLOPs, while still +maintaining a peak signal-to-noise ratio (PSNR) of approximately 26.90 dB on +the DIV2K_LSDIR_valid dataset and 26.99 dB on the DIV2K_LSDIR_test dataset. In +addition, this challenge has 4 tracks including the main track (overall +performance), sub-track 1 (runtime), sub-track 2 (FLOPs), and sub-track 3 +(parameters). In the main track, all three metrics (ie runtime, FLOPs, and +parameter count) were considered. The ranking of the main track is calculated +based on a weighted sum-up of the scores of all other sub-tracks. In sub-track +1, the practical runtime performance of the submissions was evaluated, and the +corresponding score was used to determine the ranking. In sub-track 2, the +number of FLOPs was considered. The score calculated based on the corresponding +FLOPs was used to determine the ranking. In sub-track 3, the number of +parameters was considered. The score calculated based on the corresponding +parameters was used to determine the ranking. RLFN is set as the baseline for +efficiency measurement. The challenge had 262 registered participants, and 34 +teams made valid submissions. They gauge the state-of-the-art in efficient +single-image super-resolution. To facilitate the reproducibility of the +challenge and enable other researchers to build upon these findings, the code +and the pre-trained model of validated solutions are made publicly available at +https://github.com/Amazingren/NTIRE2024_ESR/. + +
+
+ comment: The report paper of NTIRE2024 Efficient Super-resolution, accepted by + CVPRW2024 +
+
+
+
+
+ + ☆ Referring Flexible Image Restoration + + +
+ In reality, images often exhibit multiple degradations, such as rain and fog +at night (triple degradations). However, in many cases, individuals may not +want to remove all degradations, for instance, a blurry lens revealing a +beautiful snowy landscape (double degradations). In such scenarios, people may +only desire to deblur. These situations and requirements shed light on a new +challenge in image restoration, where a model must perceive and remove specific +degradation types specified by human commands in images with multiple +degradations. We term this task Referring Flexible Image Restoration (RFIR). To +address this, we first construct a large-scale synthetic dataset called RFIR, +comprising 153,423 samples with the degraded image, text prompt for specific +degradation removal and restored image. RFIR consists of five basic degradation +types: blur, rain, haze, low light and snow while six main sub-categories are +included for varying degrees of degradation removal. To tackle the challenge, +we propose a novel transformer-based multi-task model named TransRFIR, which +simultaneously perceives degradation types in the degraded image and removes +specific degradation upon text prompt. TransRFIR is based on two devised +attention modules, Multi-Head Agent Self-Attention (MHASA) and Multi-Head Agent +Cross Attention (MHACA), where MHASA and MHACA introduce the agent token and +reach the linear complexity, achieving lower computation cost than vanilla +self-attention and cross-attention and obtaining competitive performances. Our +TransRFIR achieves state-of-the-art performances compared with other +counterparts and is proven as an effective architecture for image restoration. +We release our project at https://github.com/GuanRunwei/FIR-CP. + +
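The MHASA/MHACA modules are described as agent attention with linear complexity: a small set of agent tokens first summarises the keys and values, then the queries attend to the agents. A single-head sketch of that two-step pattern follows; head splitting, the exact agent construction, and any text conditioning are omitted, and this follows the generic agent-attention formulation rather than the authors' code.

```python
import torch

def agent_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                    agents: torch.Tensor) -> torch.Tensor:
    """q, k, v: (B, N, d); agents: (B, M, d) with M << N.
    Cost is O(N*M) instead of O(N*N): agents pool the values, queries read the pool."""
    d = q.shape[-1]
    pooled = torch.softmax(agents @ k.transpose(1, 2) / d ** 0.5, dim=-1) @ v    # (B, M, d)
    out = torch.softmax(q @ agents.transpose(1, 2) / d ** 0.5, dim=-1) @ pooled  # (B, N, d)
    return out

B, N, M, d = 2, 4096, 49, 64
q = k = v = torch.randn(B, N, d)
# Agents built here by simply pooling the queries down to M tokens (one possible choice).
agents = torch.nn.functional.adaptive_avg_pool1d(q.transpose(1, 2), M).transpose(1, 2)
print(agent_attention(q, k, v, agents).shape)   # torch.Size([2, 4096, 64])
```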
+
+ comment: 15 pages, 19 figures +
+
+
+
+
+ + ☆ Efficiently Adversarial Examples Generation for Visual-Language Models + under Targeted Transfer Scenarios using Diffusion Models + + +
+ Targeted transfer-based attacks involving adversarial examples pose a +significant threat to large visual-language models (VLMs). However, the +state-of-the-art (SOTA) transfer-based attacks incur high costs due to +excessive iteration counts. Furthermore, the generated adversarial examples +exhibit pronounced adversarial noise and demonstrate limited efficacy in +evading defense methods such as DiffPure. To address these issues, inspired by +score matching, we introduce AdvDiffVLM, which utilizes diffusion models to +generate natural, unrestricted adversarial examples. Specifically, AdvDiffVLM +employs Adaptive Ensemble Gradient Estimation to modify the score during the +diffusion model's reverse generation process, ensuring the adversarial examples +produced contain natural adversarial semantics and thus possess enhanced +transferability. Simultaneously, to enhance the quality of adversarial examples +further, we employ the GradCAM-guided Mask method to disperse adversarial +semantics throughout the image, rather than concentrating them in a specific +area. Experimental results demonstrate that our method achieves a speedup +ranging from 10X to 30X compared to existing transfer-based attack methods, +while maintaining superior quality of adversarial examples. Additionally, the +generated adversarial examples possess strong transferability and exhibit +increased robustness against adversarial defense methods. Notably, AdvDiffVLM +can successfully attack commercial VLMs, including GPT-4V, in a black-box +manner. + +
+
+
+
+
+ + ☆ Prescribing the Right Remedy: Mitigating Hallucinations in Large + Vision-Language Models via Targeted Instruction Tuning + + +
+ Despite achieving outstanding performance on various cross-modal tasks, +current large vision-language models (LVLMs) still suffer from hallucination +issues, manifesting as inconsistencies between their generated responses and +the corresponding images. Prior research has indicated that the low quality of +instruction data, particularly the skewed balance between positive and negative +samples, is a significant contributor to model hallucinations. Recently, +researchers have proposed high-quality instruction datasets, such as +LRV-Instruction, to mitigate model hallucination. Nonetheless, our +investigation reveals that hallucinatory concepts from different LVLMs exhibit +specificity, i.e., the distribution of hallucinatory concepts varies +significantly across models. Existing datasets did not consider the +hallucination specificity of different models in their design processes, thereby +diminishing their efficacy in mitigating model hallucination. In this paper, we +propose a targeted instruction data generation framework named DFTG that is +tailored to the hallucination specificity of different models. Concretely, DFTG +consists of two stages: hallucination diagnosis, which extracts the necessary +information from the model's responses and images for hallucination diagnosis; +and targeted data generation, which generates targeted instruction data based +on diagnostic results. The experimental results on hallucination benchmarks +demonstrate that the targeted instruction data generated by our method are more +effective in mitigating hallucinations compared to previous datasets. + +
+
+
+
+
+ + ☆ Domain-Rectifying Adapter for Cross-Domain Few-Shot Segmentation CVPR 2024 + + +
+ Few-shot semantic segmentation (FSS) has achieved great success on segmenting +objects of novel classes, supported by only a few annotated samples. However, +existing FSS methods often underperform in the presence of domain shifts, +especially when encountering new domain styles that are unseen during training. +It is suboptimal to directly adapt or generalize the entire model to new +domains in the few-shot scenario. Instead, our key idea is to adapt a small +adapter for rectifying diverse target domain styles to the source domain. +Consequently, the rectified target domain features can fittingly benefit from +the well-optimized source domain segmentation model, which is intently trained +on sufficient source domain data. Training the domain-rectifying adapter requires +sufficiently diverse target domains. We thus propose a novel local-global style +perturbation method to simulate diverse potential target domains by +perturbing the feature channel statistics of the individual images and +collective statistics of the entire source domain, respectively. Additionally, +we propose a cyclic domain alignment module to help the adapter +rectify domains effectively using reverse domain rectification +supervision. The adapter is trained to rectify the image features from diverse +synthesized target domains to align with the source domain. During testing on +target domains, we start by rectifying the image features and then conduct +few-shot segmentation on the domain-rectified features. Extensive experiments +demonstrate the effectiveness of our method, achieving promising results on +cross-domain few-shot semantic segmentation tasks. Our code is available at +https://github.com/Matt-Su/DR-Adapter. + +
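+
+ A minimal sketch of the kind of local-global style perturbation described above, assuming AdaIN-style channel statistics and simple Gaussian jitter; the paper's exact perturbation scheme is not given in the abstract:
+
+ ```python
+ import torch
+
+ def local_global_style_perturb(feat, global_mu, global_sigma, noise=0.1, mix=0.5):
+     """feat: (B, C, H, W) source features; global_mu/global_sigma: (1, C, 1, 1) dataset stats."""
+     mu = feat.mean(dim=(2, 3), keepdim=True)
+     sigma = feat.std(dim=(2, 3), keepdim=True) + 1e-6
+     normalized = (feat - mu) / sigma
+     # Local perturbation: jitter each image's own channel statistics.
+     mu_l = mu * (1 + noise * torch.randn_like(mu))
+     sigma_l = sigma * (1 + noise * torch.randn_like(sigma))
+     # Global perturbation: jitter the collective statistics of the source domain.
+     mu_g = global_mu * (1 + noise * torch.randn_like(global_mu))
+     sigma_g = global_sigma * (1 + noise * torch.randn_like(global_sigma))
+     # Blend the two perturbed styles and re-stylize the normalized features.
+     new_mu = mix * mu_l + (1 - mix) * mu_g
+     new_sigma = mix * sigma_l + (1 - mix) * sigma_g
+     return normalized * new_sigma + new_mu
+ ```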
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Application of Deep Learning Methods to Processing of Noisy Medical + Video Data + + +
+ Cell counting becomes a challenging problem when the cells move in a continuous +stream and their boundaries are difficult to detect visually. To resolve +this problem, we modified the training and decision-making processes using +curriculum learning and multi-view prediction techniques, respectively. + +
+
+
+
+
+ + ☆ SRGS: Super-Resolution 3D Gaussian Splatting ACM MM 2024 + + +
+ Recently, 3D Gaussian Splatting (3DGS) has gained popularity as a novel +explicit 3D representation. This approach relies on the representation power of +Gaussian primitives to provide a high-quality rendering. However, primitives +optimized at low resolution inevitably exhibit sparsity and texture deficiency, +posing a challenge for achieving high-resolution novel view synthesis (HRNVS). +To address this problem, we propose Super-Resolution 3D Gaussian Splatting +(SRGS) to perform the optimization in a high-resolution (HR) space. The +sub-pixel constraint is introduced for the increased viewpoints in HR space, +exploiting the sub-pixel cross-view information of the multiple low-resolution +(LR) views. The gradient accumulated from more viewpoints will facilitate the +densification of primitives. Furthermore, a pre-trained 2D super-resolution +model is integrated with the sub-pixel constraint, enabling these dense +primitives to learn faithful texture features. In general, our method focuses +on densification and texture learning to effectively enhance the representation +ability of primitives. Experimentally, our method achieves high rendering +quality on HRNVS only with LR inputs, outperforming state-of-the-art methods on +challenging datasets such as Mip-NeRF 360 and Tanks & Temples. Related codes +will be released upon acceptance. + +
+
+ comment: submit ACM MM 2024 +
+
+
+
+
+ + ☆ Awareness of uncertainty in classification using a multivariate model + and multi-views + + +
+ One of the ways to make artificial intelligence more natural is to give it +some room for doubt. Two main questions then need to be resolved. First, +how can a model be trained to estimate the uncertainty of its own predictions? And +second, what should be done with uncertain predictions once they appear? To address the first question, we +proposed an uncertainty-aware negative log-likelihood loss for the case of an +N-dimensional multivariate normal distribution with a spherical covariance matrix, applied +to N-class classification tasks. The loss is similar to the +heteroscedastic regression loss. The proposed model regularizes uncertain +predictions, and trains to calculate both the predictions and their uncertainty +estimates. The model fits well with the label smoothing technique. To address the second question, we +expanded the limits of data augmentation at the training and test stages, and +made the trained model give multiple predictions for a given number of +augmented versions of each test sample. Given the multi-view predictions +together with their uncertainties and confidences, we proposed several methods +to calculate final predictions, including mode values and bin counts with soft +and hard weights. For the latter method, we formalized the model tuning task in +the form of multimodal optimization with non-differentiable criteria of maximum +accuracy, and applied particle swarm optimization to solve the tuning task. The +proposed methodology was tested using the CIFAR-10 dataset with clean and noisy +labels and demonstrated good results in comparison with other uncertainty +estimation methods related to sample selection, co-teaching, and label +smoothing. + +
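+
+ For an N-dimensional Gaussian with spherical covariance sigma^2 I, the negative log-likelihood of a target y is N log(sigma) + ||y - mu||^2 / (2 sigma^2) up to an additive constant. A minimal PyTorch sketch of such a loss, assuming the network outputs class scores mu and a per-sample log sigma with one-hot targets (the paper's exact parameterization may differ):
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def spherical_gaussian_nll(mu, log_sigma, targets, num_classes):
+     """mu: (B, N) predicted class scores; log_sigma: (B,) per-sample log std;
+     targets: (B,) integer labels."""
+     y = F.one_hot(targets, num_classes).float()
+     sq_err = (y - mu).pow(2).sum(dim=1)                     # ||y - mu||^2
+     sigma2 = torch.exp(2 * log_sigma)
+     nll = num_classes * log_sigma + 0.5 * sq_err / sigma2   # constants dropped
+     return nll.mean()
+ ```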
+
+
+
+
+ + ☆ OmniSSR: Zero-shot Omnidirectional Image Super-Resolution using Stable + Diffusion Model + + +
+ Omnidirectional images (ODIs) are commonly used in real-world visual tasks, +and high-resolution ODIs help improve the performance of related visual tasks. +Most existing super-resolution methods for ODIs use end-to-end learning +strategies, resulting in inferior realness of generated images and a lack of +effective out-of-domain generalization capability. Image +generation methods represented by diffusion models provide strong priors for +visual tasks and have proven effective when applied to image +restoration tasks. Leveraging the image priors of the Stable Diffusion (SD) +model, we achieve omnidirectional image super-resolution with both fidelity and +realness, dubbed OmniSSR. Firstly, we transform the equirectangular +projection (ERP) images into tangent projection (TP) images, whose distribution +approximates the planar image domain. Then, we use SD to iteratively sample +initial high-resolution results. At each denoising iteration, we further +correct and update the initial results using the proposed Octadecaplex Tangent +Information Interaction (OTII) and Gradient Decomposition (GD) technique to +ensure better consistency. Finally, the TP images are transformed back to +obtain the final high-resolution results. Our method is zero-shot, requiring no +training or fine-tuning. Experiments on two benchmark datasets +demonstrate the effectiveness of our proposed method. + +
+
+
+
+
+ + ☆ Learnable Prompt for Few-Shot Semantic Segmentation in Remote Sensing + Domain CVPR + + +
+ Few-shot segmentation is a task to segment objects or regions of novel +classes within an image given only a few annotated examples. In the generalized +setting, the task extends to segment both the base and the novel classes. The +main challenge is how to train the model such that the addition of novel +classes does not hurt the base classes' performance, also known as catastrophic +forgetting. To mitigate this issue, we use SegGPT as our base model and train +it on the base classes. Then, we use separate learnable prompts to handle +predictions for each novel class. To handle the various object sizes that are +typically present in the remote sensing domain, we perform patch-based prediction. +To address the discontinuities along patch boundaries, we propose a +patch-and-stitch technique by re-framing the problem as an image inpainting +task. During inference, we also utilize image similarity search over image +embeddings for prompt selection and novel class filtering to reduce false +positive predictions. Based on our experiments, our proposed method boosts the +weighted mIoU of a simple fine-tuned SegGPT from 15.96 to 35.08 on the +validation set of the few-shot OpenEarthMap dataset given in the challenge. + +
+
+ comment: Accepted to CVPRW 2024 +
+
+
+
+
+ + ☆ TC-OCR: TableCraft OCR for Efficient Detection & Recognition of Table + Structure & Content + + +
+ The automatic recognition of tabular data in document images presents a +significant challenge due to the diverse range of table styles and complex +structures. Tables offer valuable content representation, enhancing the +predictive capabilities of various systems such as search engines and Knowledge +Graphs. Addressing the two main problems, namely table detection (TD) and table +structure recognition (TSR), has traditionally been approached independently. +In this research, we propose an end-to-end pipeline that integrates deep +learning models, including DETR, CascadeTabNet, and PP OCR v2, to achieve +comprehensive image-based table recognition. This integrated approach +effectively handles diverse table styles, complex structures, and image +distortions, resulting in improved accuracy and efficiency compared to existing +methods like Table Transformers. Our system achieves simultaneous table +detection (TD), table structure recognition (TSR), and table content +recognition (TCR), preserving table structures and accurately extracting +tabular data from document images. The integration of multiple models addresses +the intricacies of table recognition, making our approach a promising solution +for image-based table understanding, data extraction, and information retrieval +applications. Our proposed approach achieves an IOU of 0.96 and an OCR Accuracy +of 78%, showcasing a remarkable improvement of approximately 25% in the OCR +Accuracy compared to the previous Table Transformer approach. + +
+
+ comment: 8 pages, 2 figures, Workshop of 1st MMIR Deep Multimodal Learning for + Information Retrieval +
+
+
+
+
+ + ☆ From Data Deluge to Data Curation: A Filtering-WoRA Paradigm for + Efficient Text-based Person Search + + +
+ In text-based person search endeavors, data generation has emerged as a +prevailing practice, addressing concerns over privacy preservation and the +arduous task of manual annotation. Although the amount of synthesized data can +be infinite in theory, the scientific question persists of how much +generated data optimally fuels subsequent model training. We observe that only +a subset of the data in these constructed datasets plays a decisive role. +Therefore, we introduce a new Filtering-WoRA paradigm, which contains a +filtering algorithm to identify this crucial data subset and a WoRA (Weighted +Low-Rank Adaptation) learning strategy for light fine-tuning. The filtering +algorithm is based on cross-modality relevance and removes the many coarsely +matched synthetic pairs. As the amount of data decreases, we do not need to +fine-tune the entire model. Therefore, we propose a WoRA learning strategy to +efficiently update a minimal portion of model parameters. WoRA streamlines the +learning process, enabling heightened efficiency in extracting knowledge from +fewer, yet potent, data instances. Extensive experimentation validates the +efficacy of pretraining, where our model achieves advanced and efficient +retrieval performance on challenging real-world benchmarks. Notably, on the +CUHK-PEDES dataset, we have achieved a competitive mAP of 67.02% while reducing +model training time by 19.82%. + +
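+
+ The abstract names WoRA (Weighted Low-Rank Adaptation) but does not define it; a plausible reading is a LoRA-style low-rank update scaled by a learnable weight. A hypothetical sketch under that assumption:
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class WoRALinear(nn.Module):
+     """Frozen base projection plus a low-rank update scaled by a learnable weight (assumed form)."""
+     def __init__(self, in_features, out_features, rank=8):
+         super().__init__()
+         self.base = nn.Linear(in_features, out_features)
+         self.base.weight.requires_grad_(False)            # frozen pretrained weight
+         self.A = nn.Parameter(torch.randn(rank, in_features) * 0.01)
+         self.B = nn.Parameter(torch.zeros(out_features, rank))
+         self.w = nn.Parameter(torch.tensor(1.0))          # learnable weighting of the update
+
+     def forward(self, x):
+         return self.base(x) + self.w * (x @ self.A.t() @ self.B.t())
+ ```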
+
+
+
+
+ + ☆ NeuroMorphix: A Novel Brain MRI Asymmetry-specific Feature Construction + Approach For Seizure Recurrence Prediction + + +
+ Seizure recurrence is an important concern after an initial unprovoked +seizure; without drug treatment, it occurs within 2 years in 40-50% of cases. +The decision to treat currently relies on predictors of seizure recurrence risk +that are inaccurate, resulting in unnecessary, possibly harmful, treatment in +some patients and potentially preventable seizures in others. Because of the +link between brain lesions and seizure recurrence, we developed a recurrence +prediction tool using machine learning and clinical 3T brain MRI. We developed +NeuroMorphix, a feature construction approach based on MRI brain anatomy. Each +of seven NeuroMorphix features measures the absolute or relative difference +between corresponding regions in each cerebral hemisphere. FreeSurfer was used +to segment brain regions and to generate values for morphometric parameters (8 +for each cortical region and 5 for each subcortical region). The parameters +were then mapped to whole brain NeuroMorphix features, yielding a total of 91 +features per subject. Features were generated for a first seizure patient +cohort (n = 169) categorised into seizure recurrence and non-recurrence +subgroups. State-of-the-art classification algorithms were trained and tested +using NeuroMorphix features to predict seizure recurrence. Classification +models using the top 5 features, ranked by sequential forward selection, +demonstrated excellent performance in predicting seizure recurrence, with area +under the ROC curve of 88-93%, accuracy of 83-89%, and F1 score of 83-90%. +Highly ranked features aligned with structural alterations known to be +associated with epilepsy. This study highlights the potential for targeted, +data-driven approaches to aid clinical decision-making in brain disorders. + +
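+
+ A minimal sketch of the interhemispheric asymmetry features described above, i.e. absolute and relative differences between corresponding left/right regional measurements; the exact normalization used by NeuroMorphix is not given in the abstract:
+
+ ```python
+ import numpy as np
+
+ def asymmetry_features(left, right, eps=1e-9):
+     """left, right: arrays of one morphometric parameter (e.g. thickness) for
+     corresponding regions of each hemisphere; returns absolute and relative differences."""
+     left, right = np.asarray(left, float), np.asarray(right, float)
+     abs_diff = np.abs(left - right)
+     rel_diff = abs_diff / (0.5 * (left + right) + eps)   # normalized asymmetry index
+     return abs_diff, rel_diff
+ ```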
+
+ comment: This work has been submitted to the IEEE TMI for possible publication +
+
+
+
+
+ + ☆ Tripod: Three Complementary Inductive Biases for Disentangled + Representation Learning + + +
+ Inductive biases are crucial in disentangled representation learning for +narrowing down an underspecified solution set. In this work, we consider +endowing a neural network autoencoder with three select inductive biases from +the literature: data compression into a grid-like latent space via +quantization, collective independence amongst latents, and minimal functional +influence of any latent on how other latents determine data generation. In +principle, these inductive biases are deeply complementary: they most directly +specify properties of the latent space, encoder, and decoder, respectively. In +practice, however, naively combining existing techniques instantiating these +inductive biases fails to yield significant benefits. To address this, we +propose adaptations to the three techniques that simplify the learning problem, +equip key regularization terms with stabilizing invariances, and quash +degenerate incentives. The resulting model, Tripod, achieves state-of-the-art +results on a suite of four image disentanglement benchmarks. We also verify +that Tripod significantly improves upon its naive incarnation and that all +three of its "legs" are necessary for best performance. + +
+
+ comment: 22 pages, 10 figures, code available at + https://github.com/kylehkhsu/tripod +
+
+
+
+
+ + ☆ EucliDreamer: Fast and High-Quality Texturing for 3D Models with + Depth-Conditioned Stable Diffusion + + +
+ We present EucliDreamer, a simple and effective method to generate textures +for 3D models given text prompts and meshes. The texture is parametrized as an +implicit function on the 3D surface, which is optimized with the Score +Distillation Sampling (SDS) process and differentiable rendering. To generate +high-quality textures, we leverage a depth-conditioned Stable Diffusion model +guided by the depth image rendered from the mesh. We test our approach on 3D +models in Objaverse and conduct a user study, which shows its superior +quality compared to existing texturing methods like Text2Tex. In addition, our +method converges 2 times faster than DreamFusion. Through text prompting, +textures of diverse art styles can be produced. We hope EucliDreamer provides a +viable solution to automate a labor-intensive stage in 3D content creation. + +
+
+ comment: Short version of arXiv:2311.15573 +
+
+
+
+
+ + ☆ Plug-and-Play Acceleration of Occupancy Grid-based NeRF Rendering using + VDB Grid and Hierarchical Ray Traversal CVPR + + +
+ Transmittance estimators such as Occupancy Grid (OG) can accelerate the +training and rendering of Neural Radiance Field (NeRF) by predicting important +samples that contribute most to the generated image. However, OG manages +occupied regions in the form of a dense binary grid, in which many +blocks share the same values, causing redundant examination of voxels' +emptiness during ray-tracing. In our work, we introduce two techniques to improve +the efficiency of ray-tracing in trained OG without fine-tuning. First, we +replace the dense grids with VDB grids to reduce the spatial redundancy. +Second, we use a hierarchical digital differential analyzer (HDDA) to efficiently +trace voxels in the VDB grids. Our experiments on NeRF-Synthetic and Mip-NeRF +360 datasets show that our proposed method successfully accelerates rendering of the +NeRF-Synthetic dataset by 12% on average and of the Mip-NeRF 360 dataset by 4% on +average, compared to a fast implementation of OG, NerfAcc, without losing the +quality of rendered images. + +
+
+ comment: Short paper for CVPR Neural Rendering Intelligence Workshop 2024. + Code: https://github.com/Yosshi999/faster-occgrid +
+
+
+
+
+ + ☆ OneActor: Consistent Character Generation via Cluster-Conditioned + Guidance + + +
+ Text-to-image diffusion models benefit artists with high-quality image +generation. Yet their stochastic nature prevents artists from creating consistent +images of the same character. Existing methods try to tackle this challenge and +generate consistent content in various ways. However, they either depend on +external data or require expensive tuning of the diffusion model. To address this +issue, we argue that lightweight yet intricate guidance is sufficient. Aiming at this, we formalize the objective of +consistent generation, derive a clustering-based score function, and propose a +novel paradigm, OneActor. We design a cluster-conditioned model which +incorporates posterior samples to guide the denoising trajectories towards the +target cluster. To overcome the overfitting challenge shared by one-shot tuning +pipelines, we devise auxiliary components to simultaneously augment the tuning +and regulate the inference. This technique is later verified to significantly +enhance the content diversity of generated images. Comprehensive experiments +show that our method outperforms a variety of baselines with satisfactory +character consistency, superior prompt conformity as well as high image +quality. Moreover, our method is at least 4 times faster than tuning-based baselines. +Furthermore, to the best of our knowledge, we are the first to prove that the semantic space has +the same interpolation property as the latent space does. This property can +serve as another promising tool for fine generation control. + +
+
+
+
+
+ + ☆ PreGSU-A Generalized Traffic Scene Understanding Model for Autonomous + Driving based on Pre-trained Graph Attention Network + + +
+ Scene understanding, defined as learning, extraction, and representation of +interactions among traffic elements, is one of the critical challenges toward +high-level autonomous driving (AD). Current scene understanding methods mainly +focus on a single concrete task, such as trajectory prediction and risk level +evaluation. Although they perform well on specific metrics, the generalization +ability is insufficient to adapt to the real traffic complexity and downstream +demand diversity. In this study, we propose PreGSU, a generalized pre-trained +scene understanding model based on a graph attention network to learn the +universal interaction and reasoning of traffic scenes to support various +downstream tasks. After the feature engineering and sub-graph module, all +elements are embedded as nodes to form a dynamic weighted graph. Then, four +graph attention layers are applied to learn the relationships among agents and +lanes. In the pre-training phase, the understanding model is trained on two +self-supervised tasks: Virtual Interaction Force (VIF) modeling and Masked Road +Modeling (MRM). Based on the artificial potential field theory, VIF modeling +enables PreGSU to capture the agent-to-agent interactions while MRM extracts +agent-to-road connections. In the fine-tuning process, the pre-trained +parameters are loaded to derive detailed understanding outputs. We conduct +validation experiments on two downstream tasks, i.e., trajectory prediction in +an urban scenario and intention recognition in a highway scenario, to verify the +model's generalization and understanding abilities. Results show that compared with +the baselines, PreGSU achieves better accuracy on both tasks, indicating the +potential to be generalized to various scenes and targets. An ablation study shows +the effectiveness of the pre-training task design. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Masked Autoencoders for Microscopy are Scalable Learners of Cellular + Biology CVPR 2024 + + +
+ Featurizing microscopy images for use in biological research remains a +significant challenge, especially for large-scale experiments spanning millions +of images. This work explores the scaling properties of weakly supervised +classifiers and self-supervised masked autoencoders (MAEs) when training with +increasingly larger model backbones and microscopy datasets. Our results show +that ViT-based MAEs outperform weakly supervised classifiers on a variety of +tasks, achieving as much as a 11.5% relative improvement when recalling known +biological relationships curated from public databases. Additionally, we +develop a new channel-agnostic MAE architecture (CA-MAE) that allows for +inputting images of different numbers and orders of channels at inference time. +We demonstrate that CA-MAEs effectively generalize by inferring and evaluating +on a microscopy image dataset (JUMP-CP) generated under different experimental +conditions with a different channel structure than our pretraining data +(RPI-93M). Our findings motivate continued research into scaling +self-supervised learning on microscopy data in order to create powerful +foundation models of cellular biology that have the potential to catalyze +advancements in drug discovery and beyond. + +
+
+ comment: CVPR 2024 Highlight. arXiv admin note: text overlap with + arXiv:2309.16064 +
+
+
+
+
+ + ☆ Vision-and-Language Navigation via Causal Learning + + +
+ In the pursuit of robust and generalizable environment perception and +language understanding, the ubiquitous challenge of dataset bias continues to +plague vision-and-language navigation (VLN) agents, hindering their performance +in unseen environments. This paper introduces the generalized cross-modal +causal transformer (GOAT), a pioneering solution rooted in the paradigm of +causal inference. By delving into both observable and unobservable confounders +within vision, language, and history, we propose the back-door and front-door +adjustment causal learning (BACL and FACL) modules to promote unbiased learning +by comprehensively mitigating potential spurious correlations. Additionally, to +capture global confounder features, we propose a cross-modal feature pooling +(CFP) module supervised by contrastive learning, which is also shown to be +effective in improving cross-modal representations during pre-training. +Extensive experiments across multiple VLN datasets (R2R, REVERIE, RxR, and +SOON) underscore the superiority of our proposed method over previous +state-of-the-art approaches. Code is available at +https://github.com/CrystalSixone/VLN-GOAT. + +
+
+
+
+
+ + ☆ MoE-TinyMed: Mixture of Experts for Tiny Medical Large Vision-Language + Models + + +
+ Mixture of Expert Tuning (MoE-Tuning) has effectively enhanced the +performance of general MLLMs with fewer parameters, yet its application in +resource-limited medical settings has not been fully explored. To address this +gap, we developed MoE-TinyMed, a model tailored for medical applications that +significantly lowers parameter demands. In evaluations on the VQA-RAD, SLAKE, +and Path-VQA datasets, MoE-TinyMed outperformed LLaVA-Med in all Med-VQA closed +settings with just 3.6B parameters. Additionally, a streamlined version with 2B +parameters surpassed LLaVA-Med's performance in PathVQA, showcasing its +effectiveness in resource-limited healthcare settings. + +
+
+
+
+
+ + ☆ Compressible and Searchable: AI-native Multi-Modal Retrieval System with + Learned Image Compression + + +
+ The burgeoning volume of digital content across diverse modalities +necessitates efficient storage and retrieval methods. Conventional approaches +struggle to cope with the escalating complexity and scale of multimedia data. +In this paper, we propose a framework that addresses this challenge by fusing +AI-native multi-modal search capabilities with neural image compression. First, +we analyze the intricate relationship between compressibility and +searchability, recognizing the pivotal role each plays in the efficiency of +storage and retrieval systems. A simple adapter is then used to bridge +the features of Learned Image Compression (LIC) and Contrastive Language-Image +Pretraining (CLIP) while retaining semantic fidelity and enabling retrieval of +multi-modal data. Experimental evaluations on the Kodak dataset demonstrate the +efficacy of our approach, showcasing significant enhancements in compression +efficiency and search accuracy compared to existing methodologies. Our work +marks a significant advancement towards scalable and efficient multi-modal +search systems in the era of big data. + +
+
+
+
+
+ + ☆ MS-MANO: Enabling Hand Pose Tracking with Biomechanical Constraints CVPR 2024 + + +
+ This work proposes a novel learning framework for visual hand dynamics +analysis that takes into account the physiological aspects of hand motion. The +existing models, which are simplified joint-actuated systems, often produce +unnatural motions. To address this, we integrate a musculoskeletal system with +a learnable parametric hand model, MANO, to create a new model, MS-MANO. This +model emulates the dynamics of muscles and tendons to drive the skeletal +system, imposing physiologically realistic constraints on the resulting torque +trajectories. We further propose a simulation-in-the-loop pose refinement +framework, BioPR, that refines the initial estimated pose through a multi-layer +perceptron (MLP) network. Our evaluation of the accuracy of MS-MANO and the +efficacy of the BioPR is conducted in two separate parts. The accuracy of +MS-MANO is compared with MyoSuite, while the efficacy of BioPR is benchmarked +against two large-scale public datasets and two recent state-of-the-art +methods. The results demonstrate that our approach consistently improves the +baseline methods both quantitatively and qualitatively. + +
+
+ comment: 11 pages, 5 figures; CVPR 2024 +
+
+
+
+
+ + ☆ Find The Gap: Knowledge Base Reasoning For Visual Question Answering + + +
+ We analyze knowledge-based visual question answering, in which, given a +question, models need to ground it in the visual modality and retrieve +the relevant knowledge from a given large knowledge base (KB) to be able to +answer. Our analysis is twofold: one part is based on designing neural architectures +and training them from scratch, and another on large pre-trained language +models (LLMs). Our research questions are: 1) Can we effectively augment models +by explicit supervised retrieval of the relevant KB information to solve the +KB-VQA problem? 2) How do task-specific and LLM-based models perform in the +integration of visual and external knowledge, and multi-hop reasoning over both +sources of information? 3) Is the implicit knowledge of LLMs sufficient for +KB-VQA, and to what extent can it replace the explicit KB? Our results +demonstrate the positive impact of empowering task-specific and LLM models with +supervised external and visual knowledge retrieval models. Our findings show +that though LLMs are stronger in 1-hop reasoning, they suffer in 2-hop +reasoning in comparison with our fine-tuned NN model even if the relevant +information from both modalities is available to the model. Moreover, we +observed that LLM models outperform the NN model on KB-related questions, which +confirms the effectiveness of implicit knowledge in LLMs; however, they do not +alleviate the need for an external KB. + +
+
+
+
+
+ + ☆ Closed-Loop Open-Vocabulary Mobile Manipulation with GPT-4V + + +
+ Autonomous robot navigation and manipulation in open environments require +reasoning and replanning with closed-loop feedback. We present COME-robot, the +first closed-loop framework utilizing the GPT-4V vision-language foundation +model for open-ended reasoning and adaptive planning in real-world scenarios. +We meticulously construct a library of action primitives for robot exploration, +navigation, and manipulation, serving as callable execution modules for GPT-4V +in task planning. On top of these modules, GPT-4V serves as the brain that can +accomplish multimodal reasoning, generate action policy with code, verify the +task progress, and provide feedback for replanning. Such design enables +COME-robot to (i) actively perceive the environments, (ii) perform situated +reasoning, and (iii) recover from failures. Through comprehensive experiments +involving 8 challenging real-world tabletop and manipulation tasks, COME-robot +demonstrates a significant improvement in task success rate (~25%) compared to +state-of-the-art baseline methods. We further conduct comprehensive analyses to +elucidate how COME-robot's design facilitates failure recovery, free-form +instruction following, and long-horizon task planning. + +
+
+
+
+
+ + ☆ GaitPoint+: A Gait Recognition Network Incorporating Point Cloud + Analysis and Recycling + + +
+ Gait is a behavioral biometric modality that can be used to recognize +individuals by the way they walk from a far distance. Most existing gait +recognition approaches rely on either silhouettes or skeletons, while their +joint use is underexplored. Features from silhouettes and skeletons can provide +complementary information for more robust recognition against appearance +changes or pose estimation errors. To exploit the benefits of both silhouette +and skeleton features, we propose a new gait recognition network, referred to +as the GaitPoint+. Our approach models skeleton key points as a 3D point cloud, +and employs a computational complexity-conscious 3D point processing approach +to extract skeleton features, which are then combined with silhouette features +for improved accuracy. Since silhouette- or CNN-based methods already require +considerable amount of computational resources, it is preferable that the key +point learning module is faster and more lightweight. We present a detailed +analysis of the utilization of every human key point after the use of +traditional max-pooling, and show that while elbow and ankle points are used +most commonly, many useful points are discarded by max-pooling. Thus, we +present a method to recycle some of the discarded points by a Recycling +Max-Pooling module, during processing of skeleton point clouds, and achieve +further performance improvement. We provide a comprehensive set of experimental +results showing that (i) incorporating skeleton features obtained by a +point-based 3D point cloud processing approach boosts the performance of three +different state-of-the-art silhouette- and CNN-based baselines; (ii) recycling +the discarded points increases the accuracy further. Ablation studies are also +provided to show the effectiveness and contribution of different components of +our approach. + +
+
+
+
+
+ + ☆ LWIRPOSE: A novel LWIR Thermal Image Dataset and Benchmark ICIP2024 + + +
+ Human pose estimation faces hurdles in real-world applications due to factors +like lighting changes, occlusions, and cluttered environments. We introduce a +unique RGB-Thermal Nearly Paired and Annotated 2D Pose Dataset, comprising over +2,400 high-quality LWIR (thermal) images. Each image is meticulously annotated +with 2D human poses, offering a valuable resource for researchers and +practitioners. This dataset, captured from seven actors performing diverse +everyday activities like sitting, eating, and walking, facilitates pose +estimation on occlusion and other challenging scenarios. We benchmark +state-of-the-art pose estimation methods on the dataset to showcase its +potential, establishing a strong baseline for future research. Our results +demonstrate the dataset's effectiveness in promoting advancements in pose +estimation for various applications, including surveillance, healthcare, and +sports analytics. The dataset and code are available at +https://github.com/avinres/LWIRPOSE + +
+
+ comment: Submitted in ICIP2024 +
+
+
+
+
+ + ☆ MK-SGN: A Spiking Graph Convolutional Network with Multimodal Fusion and + Knowledge Distillation for Skeleton-based Action Recognition + + +
+ In recent years, skeleton-based action recognition, leveraging multimodal +Graph Convolutional Networks (GCN), has achieved remarkable results. However, +due to their deep structure and reliance on continuous floating-point +operations, GCN-based methods are energy-intensive. To address this issue, we +propose an innovative Spiking Graph Convolutional Network with Multimodal +Fusion and Knowledge Distillation (MK-SGN). By merging the energy efficiency of +Spiking Neural Networks (SNNs) with the graph representation capability of GCNs, +the proposed MK-SGN reduces energy consumption while maintaining recognition +accuracy. Firstly, we convert the GCN into a Spiking Graph Convolutional Network +(SGN) and construct a foundational Base-SGN for skeleton-based action +recognition, establishing a new benchmark and paving the way for future +research exploration. Secondly, we further propose a Spiking Multimodal Fusion +module (SMF), leveraging mutual information to process multimodal data more +efficiently. Additionally, we introduce a spiking attention mechanism and +design a Spatio Graph Convolution module with a Spatial Global Spiking +Attention mechanism (SA-SGC), enhancing feature learning capability. +Furthermore, we delve into knowledge distillation methods from multimodal GCN +to SGN and propose a novel, integrated method that simultaneously focuses on +both intermediate layer distillation and soft label distillation to improve the +performance of SGN. On two challenging datasets for skeleton-based action +recognition, MK-SGN outperforms the state-of-the-art GCN-like frameworks in +reducing computational load and energy consumption. In contrast, typical GCN +methods consume more than 35 mJ per action sample, while MK-SGN +reduces energy consumption by more than 98%. + +
+
+
+
+
+ + ☆ Consistency and Uncertainty: Identifying Unreliable Responses From + Black-Box Vision-Language Models for Selective Visual Question Answering CVPR 2024 + + +
+ The goal of selective prediction is to allow a model to abstain when it +may not be able to deliver a reliable prediction, which is important in +safety-critical contexts. Existing approaches to selective prediction typically +require access to the internals of a model, require retraining a model, or study +only unimodal models. However, the most powerful models (e.g. GPT-4) are +typically only available as black boxes with inaccessible internals, are not +retrainable by end-users, and are frequently used for multimodal tasks. We +study the possibility of selective prediction for vision-language models in a +realistic, black-box setting. We propose using the principle of +neighborhood consistency to identify unreliable responses from a +black-box vision-language model in question answering tasks. We hypothesize +that given only a visual question and model response, the consistency of the +model's responses over the neighborhood of a visual question will indicate +reliability. It is impossible to directly sample neighbors in feature space in +a black-box setting. Instead, we show that it is possible to use a smaller +proxy model to approximately sample from the neighborhood. We find that +neighborhood consistency can be used to identify model responses to visual +questions that are likely unreliable, even in adversarial settings or settings +that are out-of-distribution to the proxy model. + +
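+
+ A minimal sketch of the neighborhood-consistency decision rule, assuming the black-box VLM query, the proxy-model neighbor generator, and the answer-agreement check are supplied as callables (hypothetical placeholders, not the paper's implementation):
+
+ ```python
+ def selective_answer(image, question, vlm_answer, make_neighbors, agree, threshold=0.7):
+     """vlm_answer(image, q) queries the black-box VLM; make_neighbors(q) returns
+     proxy-model rephrasings of the question; agree(a, b) compares two answers."""
+     original = vlm_answer(image, question)
+     neighbors = make_neighbors(question)
+     votes = sum(agree(original, vlm_answer(image, q)) for q in neighbors)
+     consistency = votes / max(len(neighbors), 1)
+     return original if consistency >= threshold else None   # None = abstain
+ ```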
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Residual Connections Harm Self-Supervised Abstract Feature Learning + + +
+ We demonstrate that adding a weighting factor to decay the strength of +identity shortcuts within residual networks substantially improves semantic +feature learning in the state-of-the-art self-supervised masked autoencoding +(MAE) paradigm. Our modification to the identity shortcuts within a ViT-B/16 +backbone of an MAE boosts linear probing accuracy on ImageNet from 67.3% to +72.3%. This significant gap suggests that, while residual connection structure +serves an essential role in facilitating gradient propagation, it may have a +harmful side effect of reducing capacity for abstract learning by virtue of +injecting an echo of shallower representations into deeper layers. We +ameliorate this downside via a fixed formula for monotonically decreasing the +contribution of identity connections as layer depth increases. Our design +promotes the gradual development of feature abstractions, without impacting +network trainability. Analyzing the representations learned by our modified +residual networks, we find a correlation between low effective feature rank and +downstream task performance. + +
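+
+ A minimal sketch of a depth-decayed identity shortcut; the abstract only states that the identity contribution decreases monotonically with depth via a fixed formula, so the linear schedule below is an assumption:
+
+ ```python
+ import torch.nn as nn
+
+ class DecayedResidualBlock(nn.Module):
+     """Residual block whose identity shortcut is scaled by a fixed, depth-dependent factor."""
+     def __init__(self, block, layer_idx, num_layers, min_scale=0.2):
+         super().__init__()
+         self.block = block
+         t = layer_idx / max(num_layers - 1, 1)
+         self.scale = 1.0 - (1.0 - min_scale) * t   # 1.0 at the first block, min_scale at the last
+
+     def forward(self, x):
+         return self.scale * x + self.block(x)
+ ```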
+
+
+
+
+ + ☆ Neuromorphic Vision-based Motion Segmentation with Graph Transformer + Neural Network + + +
+ Moving object segmentation is critical to interpret scene dynamics for +robotic navigation systems in challenging environments. Neuromorphic vision +sensors are tailored for motion perception due to their asynchronous nature, +high temporal resolution, and reduced power consumption. However, their +unconventional output requires novel perception paradigms to leverage their +spatially sparse and temporally dense nature. In this work, we propose a novel +event-based motion segmentation algorithm using a Graph Transformer Neural +Network, dubbed GTNN. Our proposed algorithm processes event streams as 3D +graphs by a series of nonlinear transformations to unveil local and global +spatiotemporal correlations between events. Based on these correlations, events +belonging to moving objects are segmented from the background without prior +knowledge of the dynamic scene geometry. The algorithm is trained on publicly +available datasets including MOD, EV-IMO, and EV-IMO2 using +the proposed training scheme to facilitate efficient training on extensive +datasets. Moreover, we introduce the Dynamic Object Mask-aware Event Labeling +(DOMEL) approach for generating approximate ground-truth labels for event-based +motion segmentation datasets. We use DOMEL to label our own recorded Event +dataset for Motion Segmentation (EMS-DOMEL), which we release to the public for +further research and benchmarking. Rigorous experiments are conducted on +several unseen publicly available datasets, where the results reveal that GTNN +outperforms state-of-the-art methods in the presence of dynamic background +variations, motion patterns, and multiple dynamic objects with varying sizes +and velocities. GTNN achieves significant performance gains with an average +increase of 9.4% and 4.5% in terms of motion segmentation accuracy (IoU%) and +detection rate (DR%), respectively. + +
+
+
+
+
+ + ☆ A Concise Tiling Strategy for Preserving Spatial Context in Earth + Observation Imagery ICLR 2024 + + +
+ We propose a new tiling strategy, Flip-n-Slide, which has been developed for +specific use with large Earth observation satellite images when the location of +objects-of-interest (OoI) is unknown and spatial context can be necessary for +class disambiguation. Flip-n-Slide is a concise and minimalistic approach that +allows OoI to be represented at multiple tile positions and orientations. This +strategy introduces multiple views of spatio-contextual information, without +introducing redundancies into the training set. By maintaining distinct +transformation permutations for each tile overlap, we enhance the +generalizability of the training set without misrepresenting the true data +distribution. Our experiments validate the effectiveness of Flip-n-Slide in the +task of semantic segmentation, a necessary data product in geophysical studies. +We find that Flip-n-Slide outperforms the previous state-of-the-art +augmentation routines for tiled data in all evaluation metrics. For +underrepresented classes, Flip-n-Slide increases precision by as much as 15.8%. + +
+
+ comment: Accepted to the Machine Learning for Remote Sensing (ML4RS) Workshop + at ICLR 2024 +
+
+
+
+
+ + ☆ Multi-Task Multi-Modal Self-Supervised Learning for Facial Expression + Recognition CVPR 2024 + + +
+ Human communication is multi-modal; e.g., face-to-face interaction involves +auditory signals (speech) and visual signals (face movements and hand +gestures). Hence, it is essential to exploit multiple modalities when designing +machine learning-based facial expression recognition systems. In addition, +given the ever-growing quantities of video data that capture human facial +expressions, such systems should utilize raw unlabeled videos without requiring +expensive annotations. Therefore, in this work, we employ a multitask +multi-modal self-supervised learning method for facial expression recognition +from in-the-wild video data. Our model combines three self-supervised objective +functions: First, a multi-modal contrastive loss, that pulls diverse data +modalities of the same video together in the representation space. Second, a +multi-modal clustering loss that preserves the semantic structure of input data +in the representation space. Finally, a multi-modal data reconstruction loss. +We conduct a comprehensive study on this multimodal multi-task self-supervised +learning method on three facial expression recognition benchmarks. To that end, +we examine the performance of learning through different combinations of +self-supervised tasks on the facial expression recognition downstream task. Our +model ConCluGen outperforms several multi-modal self-supervised and fully +supervised baselines on the CMU-MOSEI dataset. Our results generally show that +multi-modal self-supervision tasks offer large performance gains for +challenging tasks such as facial expression recognition, while also reducing +the amount of manual annotations required. We release our pre-trained models as +well as source code publicly + +
+
+ comment: The paper will appear in the CVPR 2024 workshops proceedings +
+
+
+
+
+ + ☆ From a Lossless (~1.5:1) Compression Algorithm for Llama2 7B Weights to + Variable Precision, Variable Range, Compressed Numeric Data Types for CNNs + and LLMs + + +
+ This paper starts with a simple lossless ~1.5:1 compression algorithm for the +weights of the Large Language Model (LLM) Llama2 7B [1] that can be implemented +in ~200 LUTs in AMD FPGAs, processing over 800 million bfloat16 numbers per +second. This framework is then extended to variable precision, variable range, +compressed numerical data types that are a user defined super set of both +floats and posits [2]. The paper then discusses a simple hardware +implementation of such format based on ANS (Asymmetrical Numeral Systems) [3] +that acts as a bridge between this flexible data format and a computational +engine while, at the same time, achieving bandwidth reduction. An example of a +token factory using weight compression and sharing is also given. + +
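+
+ A back-of-the-envelope check (not the paper's FPGA codec) of why roughly 1.5:1 lossless compression of bfloat16 weights is plausible: the sign/exponent byte of trained weights is highly skewed and entropy-codes well, while the mantissa byte is close to incompressible:
+
+ ```python
+ import numpy as np
+
+ def bfloat16_entropy_ratio(weights):
+     """Estimate compressibility from byte-level Shannon entropy (illustration only)."""
+     bits = np.asarray(weights, dtype=np.float32).ravel().view(np.uint32) >> 16  # bfloat16 pattern
+     hi = (bits >> 8).astype(np.uint8)    # sign + exponent byte: highly skewed
+     lo = (bits & 0xFF).astype(np.uint8)  # top mantissa byte: near uniform
+
+     def entropy(b):
+         p = np.bincount(b, minlength=256) / b.size
+         p = p[p > 0]
+         return float(-(p * np.log2(p)).sum())
+
+     bits_per_value = entropy(hi) + entropy(lo)       # ideal entropy-coded bits per weight
+     return 16.0 / max(bits_per_value, 1e-9)          # e.g. ~1.5 means 1.5:1
+
+ # ratio = bfloat16_entropy_ratio(np.random.randn(1_000_000) * 0.02)
+ ```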
+
+
+
+
+ + ☆ Semantics-Aware Attention Guidance for Diagnosing Whole Slide Images + + +
+ Accurate cancer diagnosis remains a critical challenge in digital pathology, +largely due to the gigapixel size and complex spatial relationships present in +whole slide images. Traditional multiple instance learning (MIL) methods often +struggle with these intricacies, especially in preserving the necessary context +for accurate diagnosis. In response, we introduce a novel framework named +Semantics-Aware Attention Guidance (SAG), which includes 1) a technique for +converting diagnostically relevant entities into attention signals, and 2) a +flexible attention loss that efficiently integrates various semantically +significant information, such as tissue anatomy and cancerous regions. Our +experiments on two distinct cancer datasets demonstrate consistent improvements +in accuracy, precision, and recall with two state-of-the-art baseline models. +Qualitative analysis further reveals that the incorporation of heuristic +guidance enables the model to focus on regions critical for diagnosis. SAG is +not only effective for the models discussed here, but its adaptability extends +to any attention-based diagnostic model. This opens up exciting possibilities +for further improving the accuracy and efficiency of cancer diagnostics. + +
+
+
+
+
+ + ☆ Automatic classification of prostate MR series type using image content + and metadata + + +
+ With the wealth of medical image data, efficient curation is essential. +Assigning the sequence type to magnetic resonance images is necessary for +scientific studies and artificial intelligence-based analysis. However, +incomplete or missing metadata prevents effective automation. We therefore +propose a deep-learning method for classification of prostate cancer scanning +sequences based on a combination of image data and DICOM metadata. We +demonstrate superior results compared to metadata or image data alone, and make +our code publicly available at +https://github.com/deepakri201/DICOMScanClassification. + +
+
+
+
+
+ + ☆ HumMUSS: Human Motion Understanding using State Space Models CVPR 24 + + +
+ Understanding human motion from video is essential for a range of +applications, including pose estimation, mesh recovery and action recognition. +While state-of-the-art methods predominantly rely on transformer-based +architectures, these approaches have limitations in practical scenarios. +Transformers are slower when sequentially predicting on a continuous stream of +frames in real-time, and do not generalize to new frame rates. In light of +these constraints, we propose a novel attention-free spatiotemporal model for +human motion understanding building upon recent advancements in state space +models. Our model not only matches the performance of transformer-based models +in various motion understanding tasks but also brings added benefits like +adaptability to different video frame rates and enhanced training speed when +working with longer sequence of keypoints. Moreover, the proposed model +supports both offline and real-time applications. For real-time sequential +prediction, our model is both memory efficient and several times faster than +transformer-based approaches while maintaining their high accuracy. + +
+
+ comment: CVPR 24 +
+
+
+
+
+ + ☆ OSR-ViT: A Simple and Modular Framework for Open-Set Object Detection + and Discovery + + +
+ An object detector's ability to detect and flag \textit{novel} objects during +open-world deployments is critical for many real-world applications. +Unfortunately, much of the work in open object detection today is disjointed +and fails to adequately address applications that prioritize unknown object +recall \textit{in addition to} known-class accuracy. To close this gap, we +present a new task called Open-Set Object Detection and Discovery (OSODD) and +as a solution propose the Open-Set Regions with ViT features (OSR-ViT) +detection framework. OSR-ViT combines a class-agnostic proposal network with a +powerful ViT-based classifier. Its modular design simplifies optimization and +allows users to easily swap proposal solutions and feature extractors to best +suit their application. Using our multifaceted evaluation protocol, we show +that OSR-ViT obtains performance levels that far exceed state-of-the-art +supervised methods. Our method also excels in low-data settings, outperforming +supervised baselines using a fraction of the training data. + +
+
+ comment: 28 pages, 8 figures, 7 tables +
+
+
+
+
+ + ☆ Vocabulary-free Image Classification and Semantic Segmentation + + +
+ Large vision-language models revolutionized image classification and semantic +segmentation paradigms. However, they typically assume a pre-defined set of +categories, or vocabulary, at test time for composing textual prompts. This +assumption is impractical in scenarios with unknown or evolving semantic +context. Here, we address this issue and introduce the Vocabulary-free Image +Classification (VIC) task, which aims to assign a class from an unconstrained +language-induced semantic space to an input image without needing a known +vocabulary. VIC is challenging due to the vastness of the semantic space, which +contains millions of concepts, including fine-grained categories. To address +VIC, we propose Category Search from External Databases (CaSED), a +training-free method that leverages a pre-trained vision-language model and an +external database. CaSED first extracts the set of candidate categories from +the most semantically similar captions in the database and then assigns the +image to the best-matching candidate category according to the same +vision-language model. Furthermore, we demonstrate that CaSED can be applied +locally to generate a coarse segmentation mask that classifies image regions, +introducing the task of Vocabulary-free Semantic Segmentation. CaSED and its +variants outperform other more complex vision-language models, on +classification and semantic segmentation benchmarks, while using much fewer +parameters. + +
+
+ comment: Under review, 22 pages, 10 figures, code is available at + https://github.com/altndrr/vicss. arXiv admin note: text overlap with + arXiv:2306.00917 +
+
+
+
+
+ + ☆ UruDendro, a public dataset of cross-section images of Pinus taeda + + +
+ The automatic detection of tree-ring boundaries and other anatomical features +using image analysis has progressed substantially over the past decade with +advances in machine learning and imagery technology, as well as increasing +demands from the dendrochronology community. This paper presents a publicly +available database of 64 scanned images of transverse sections of commercially +grown Pinus taeda trees from northern Uruguay, ranging from 17 to 24 years old. +The collection contains several challenging features for automatic ring +detection, including illumination and surface preparation variation, fungal +infection (blue stains), knot formation, missing cortex or interruptions in +outer rings, and radial cracking. This dataset can be used to develop and test +automatic tree ring detection algorithms. This paper presents to the +dendrochronology community one such method, Cross-Section Tree-Ring Detection +(CS-TRD), which identifies and marks complete annual rings in cross-sections +for tree species presenting a clear distinction between earlywood and latewood. We +compare the CS-TRD performance against the ground truth manual delineation of +all rings over the UruDendro dataset. The CS-TRD software identified rings with +an average F-score of 89% and an RMSE of 5.27 px for the entire database in +less than 20 seconds per image. Finally, we propose a robust measure of the +ring growth using the equivalent radius of a circle having the same area as that +enclosed by the detected tree ring. Overall, this study contributes to the +dendrochronologist's toolbox of fast and low-cost methods to automatically +detect rings in conifer species, particularly for measuring diameter growth +rates and stem transverse area using entire cross-sections. + +
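+
+ The equivalent-radius measure above is simply the radius of the circle whose area equals the area enclosed by a detected ring; a small helper, with per-ring radial growth taken as the difference of consecutive equivalent radii (the latter is an assumed usage):
+
+ ```python
+ import math
+
+ def equivalent_radius(ring_area):
+     """Radius of a circle with the same area as that enclosed by the detected ring."""
+     return math.sqrt(ring_area / math.pi)
+
+ def radial_growth(ring_areas):
+     """Year-to-year growth as differences of consecutive equivalent radii (assumed usage)."""
+     radii = [equivalent_radius(a) for a in ring_areas]
+     return [r2 - r1 for r1, r2 in zip(radii, radii[1:])]
+ ```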
+
+ comment: Submitted to Dendrochronologia. arXiv admin note: text overlap with + arXiv:2305.10809 +
+
+
+
+
+ + ☆ Gasformer: A Transformer-based Architecture for Segmenting Methane + Emissions from Livestock in Optical Gas Imaging CVPR + + +
+ Methane emissions from livestock, particularly cattle, significantly +contribute to climate change. Effective methane emission mitigation strategies +are crucial as the global population and demand for livestock products +increase. We introduce Gasformer, a novel semantic segmentation architecture +for detecting low-flow rate methane emissions from livestock, and controlled +release experiments using optical gas imaging. We present two unique datasets +captured with a FLIR GF77 OGI camera. Gasformer leverages a Mix Vision +Transformer encoder and a Light-Ham decoder to generate multi-scale features +and refine segmentation maps. Gasformer outperforms other state-of-the-art +models on both datasets, demonstrating its effectiveness in detecting and +segmenting methane plumes in controlled and real-world scenarios. On the +livestock dataset, Gasformer achieves mIoU of 88.56%, surpassing other +state-of-the-art models. Materials are available at: +github.com/toqitahamid/Gasformer. + +
+
+ comment: 9 pages, 5 figures, this paper has been submitted and accepted for + publication at CVPRW 2024 +
+
+
+
+
+ + ☆ Dynamic Self-adaptive Multiscale Distillation from Pre-trained + Multimodal Large Model for Efficient Cross-modal Representation Learning + + +
+ In recent years, pre-trained multimodal large models have attracted +widespread attention due to their outstanding performance in various multimodal +applications. Nonetheless, the extensive computational resources and vast +datasets required for their training present significant hurdles for deployment +in environments with limited computational resources. To address this +challenge, we propose, for the first time, a novel dynamic self-adaptive multiscale distillation +from a pre-trained multimodal large model for efficient cross-modal +representation learning. Unlike existing distillation +methods, our strategy employs a multiscale perspective, enabling the extraction of +structural knowledge from the pre-trained multimodal large model and +ensuring that the student model inherits a comprehensive and nuanced +understanding of the teacher's knowledge. To optimize each distillation loss in a +balanced and efficient manner, we propose a dynamic self-adaptive distillation +loss balancer, a novel component that eliminates the need for manual loss weight +adjustments and dynamically balances each loss item during the distillation +process. Our methodology streamlines pre-trained multimodal large models using +only their output features and original image-level information, requiring +minimal computational resources. This efficient approach is suited for various +applications and allows the deployment of advanced multimodal technologies even +in resource-limited settings. Extensive experiments have demonstrated that our +method maintains high performance while significantly reducing model complexity +and training costs. Moreover, our distilled student model utilizes only +image-level information to achieve state-of-the-art performance on cross-modal +retrieval tasks, surpassing previous methods that relied on region-level +information. + +
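The abstract does not spell out the weighting rule of the loss balancer, so the sketch below substitutes a generic choice: each distillation loss is rescaled by the inverse of its running magnitude so that no single term dominates. It is a hedged illustration of automatic loss balancing, not the paper's actual component.

```python
import torch

class AdaptiveLossBalancer:
    """Generic dynamic loss balancer: each loss term is reweighted by the
    inverse of an exponential moving average of its magnitude, so large and
    small terms contribute comparably without manual weight tuning.
    (Illustrative stand-in, not the weighting rule from the paper.)"""

    def __init__(self, names, momentum=0.9):
        self.ema = {n: None for n in names}
        self.momentum = momentum

    def __call__(self, losses: dict) -> torch.Tensor:
        total = 0.0
        for name, loss in losses.items():
            value = loss.detach().item()
            prev = self.ema[name]
            self.ema[name] = value if prev is None else self.momentum * prev + (1 - self.momentum) * value
            weight = 1.0 / (self.ema[name] + 1e-8)  # larger losses get smaller weights
            total = total + weight * loss
        return total

# Usage: balance two multiscale feature losses and an image-level logit loss.
balancer = AdaptiveLossBalancer(["feat_scale1", "feat_scale2", "logits"])
losses = {"feat_scale1": torch.tensor(2.0, requires_grad=True),
          "feat_scale2": torch.tensor(0.5, requires_grad=True),
          "logits": torch.tensor(0.05, requires_grad=True)}
balanced = balancer(losses)
balanced.backward()
```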
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Semantic-Based Active Perception for Humanoid Visual Tasks with Foveal + Sensors + + +
+ The aim of this work is to establish how accurately a recent semantic-based +foveal active perception model is able to complete visual tasks that are +regularly performed by humans, namely, scene exploration and visual search. +This model exploits the ability of current object detectors to localize and +classify a large number of object classes and to update a semantic description +of a scene across multiple fixations. It has been used previously in scene +exploration tasks. In this paper, we revisit the model and extend its +application to visual search tasks. To illustrate the benefits of using +semantic information in scene exploration and visual search tasks, we compare +its performance against traditional saliency-based models. In the task of scene +exploration, the semantic-based method demonstrates superior performance +compared to the traditional saliency-based model in accurately representing the +semantic information present in the visual scene. In visual search experiments, +where the model searches for instances of a target class in a visual field containing multiple +distractors, it likewise outperforms the saliency-driven model +and a random gaze selection algorithm. Our results demonstrate that top-down semantic +information significantly influences visual exploration and search tasks, +suggesting a potential area of research for integrating it with +traditional bottom-up cues. + +
+
+
+
+
+ + ☆ MobileNetV4 -- Universal Models for the Mobile Ecosystem + + +
+ We present the latest generation of MobileNets, known as MobileNetV4 (MNv4), +featuring universally efficient architecture designs for mobile devices. At its +core, we introduce the Universal Inverted Bottleneck (UIB) search block, a +unified and flexible structure that merges Inverted Bottleneck (IB), ConvNext, +Feed Forward Network (FFN), and a novel Extra Depthwise (ExtraDW) variant. +Alongside UIB, we present Mobile MQA, an attention block tailored for mobile +accelerators, delivering a significant 39% speedup. An optimized neural +architecture search (NAS) recipe is also introduced which improves MNv4 search +effectiveness. The integration of UIB, Mobile MQA and the refined NAS recipe +results in a new suite of MNv4 models that are mostly Pareto optimal across +mobile CPUs, DSPs, GPUs, as well as specialized accelerators like Apple Neural +Engine and Google Pixel EdgeTPU - a characteristic not found in any other +models tested. Finally, to further boost accuracy, we introduce a novel +distillation technique. Enhanced by this technique, our MNv4-Hybrid-Large model +delivers 87% ImageNet-1K accuracy, with a Pixel 8 EdgeTPU runtime of just +3.8ms. + +
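A rough sketch of the Universal Inverted Bottleneck idea follows: an inverted bottleneck with two optional depthwise convolutions, whose presence or absence recovers IB-, ConvNext-, FFN- and ExtraDW-style blocks. Layer details (kernel sizes, norms, activations, strides) are illustrative guesses, not the exact MNv4 configuration.

```python
import torch
import torch.nn as nn

class UIB(nn.Module):
    """Sketch of a Universal Inverted Bottleneck: an inverted bottleneck with
    two *optional* depthwise convs, one before the expansion and one after it.
    start_dw/mid_dw choose which are present (ExtraDW = both, FFN = neither)."""

    def __init__(self, c_in, c_out, expand=4, start_dw=False, mid_dw=True, k=3):
        super().__init__()
        hidden = c_in * expand
        layers = []
        if start_dw:
            layers += [nn.Conv2d(c_in, c_in, k, padding=k // 2, groups=c_in, bias=False),
                       nn.BatchNorm2d(c_in)]
        layers += [nn.Conv2d(c_in, hidden, 1, bias=False), nn.BatchNorm2d(hidden), nn.ReLU6()]
        if mid_dw:
            layers += [nn.Conv2d(hidden, hidden, k, padding=k // 2, groups=hidden, bias=False),
                       nn.BatchNorm2d(hidden), nn.ReLU6()]
        layers += [nn.Conv2d(hidden, c_out, 1, bias=False), nn.BatchNorm2d(c_out)]
        self.block = nn.Sequential(*layers)
        self.use_residual = c_in == c_out

    def forward(self, x):
        out = self.block(x)
        return x + out if self.use_residual else out

x = torch.randn(1, 32, 56, 56)
extra_dw = UIB(32, 32, start_dw=True, mid_dw=True)    # "ExtraDW"-style variant
ffn_like = UIB(32, 32, start_dw=False, mid_dw=False)  # FFN-style (1x1 convs only)
print(extra_dw(x).shape, ffn_like(x).shape)
```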
+
+
+
+
+ + ☆ TV100: A TV Series Dataset that Pre-Trained CLIP Has Not Seen + + +
+ The era of pre-trained models has ushered in a wealth of new insights for the +machine learning community. Among the myriad of questions that arise, one of +paramount importance is: 'Do pre-trained models possess comprehensive +knowledge?' This paper seeks to address this crucial inquiry. In line with our +objective, we have made publicly available a novel dataset comprised of images +from TV series released post-2021. This dataset holds significant potential for +use in various research areas, including the evaluation of incremental +learning, novel class discovery, and long-tailed learning, among others. +Project page: https://tv-100.github.io/ + +
+
+ comment: Project page: https://tv-100.github.io/ +
+
+
+
+
+ + ♻ ☆ GROUNDHOG: Grounding Large Language Models to Holistic Segmentation CVPR 2024 + + +
+ Most multimodal large language models (MLLMs) learn language-to-object +grounding through causal language modeling where grounded objects are captured +by bounding boxes as sequences of location tokens. This paradigm lacks +pixel-level representations that are important for fine-grained visual +understanding and diagnosis. In this work, we introduce GROUNDHOG, an MLLM +developed by grounding Large Language Models to holistic segmentation. +GROUNDHOG incorporates a masked feature extractor and converts extracted +features into visual entity tokens for the MLLM backbone, which then connects +groundable phrases to unified grounding masks by retrieving and merging the +entity masks. To train GROUNDHOG, we carefully curated M3G2, a grounded visual +instruction tuning dataset with Multi-Modal Multi-Grained Grounding, by +harvesting a collection of segmentation-grounded datasets with rich +annotations. Our experimental results show that GROUNDHOG achieves superior +performance on various language grounding tasks without task-specific +fine-tuning, and significantly reduces object hallucination. GROUNDHOG also +demonstrates better grounding towards complex forms of visual input and +provides easy-to-understand diagnosis in failure cases. + +
+
+ comment: Accepted to CVPR 2024. Website: https://groundhog-mllm.github.io/ +
+
+
+
+
+ + ♻ ☆ Splatter Image: Ultra-Fast Single-View 3D Reconstruction CVPR 2024 + + +
+ We introduce the Splatter Image, an ultra-efficient approach for monocular 3D object +reconstruction. Splatter Image is based on Gaussian Splatting, which allows +fast and high-quality reconstruction of 3D scenes from multiple images. We +apply Gaussian Splatting to monocular reconstruction by learning a neural +network that, at test time, performs reconstruction in a feed-forward manner, +at 38 FPS. Our main innovation is the surprisingly straightforward design of +this network, which, using 2D operators, maps the input image to one 3D +Gaussian per pixel. The resulting set of Gaussians thus has the form of an image, +the Splatter Image. We further extend the method to take several images as input +via cross-view attention. Owing to the speed of the renderer (588 FPS), we use +a single GPU for training while generating entire images at each iteration to +optimize perceptual metrics like LPIPS. On several synthetic, real, +multi-category and large-scale benchmark datasets, we achieve better results in +terms of PSNR, LPIPS, and other metrics while training and evaluating much +faster than prior works. Code, models, demo and more results are available at +https://szymanowiczs.github.io/splatter-image. + +
+
+ comment: CVPR 2024. Project page: + https://szymanowiczs.github.io/splatter-image.html . Code: + https://github.com/szymanowiczs/splatter-image , Demo: + https://huggingface.co/spaces/szymanowiczs/splatter_image +
+
+
+
+
+ + ♻ ☆ Hunting imaging biomarkers in pulmonary fibrosis: Benchmarks of the + AIIB23 challenge + + +
+ Airway-related quantitative imaging biomarkers are crucial for examination, +diagnosis, and prognosis in pulmonary diseases. However, the manual delineation +of airway trees remains prohibitively time-consuming. While significant efforts +have been made towards enhancing airway modelling, currently available public +datasets concentrate on lung diseases with moderate morphological variations. +The intricate honeycombing patterns present in the lung tissues of fibrotic +lung disease patients exacerbate the challenges, often leading to various +prediction errors. To address this issue, the 'Airway-Informed Quantitative CT +Imaging Biomarker for Fibrotic Lung Disease 2023' (AIIB23) competition was +organized in conjunction with the official 2023 International Conference on +Medical Image Computing and Computer Assisted Intervention (MICCAI). The airway +structures were meticulously annotated by three experienced radiologists. +Competitors were encouraged to develop automatic airway segmentation models +with high robustness and generalization abilities, followed by exploring the +quantitative imaging biomarker (QIB) most correlated with mortality prediction. A training set of 120 +high-resolution computerised tomography (HRCT) scans was publicly released +with expert annotations and mortality status. The online validation set +incorporated 52 HRCT scans from patients with fibrotic lung disease and the +offline test set included 140 cases from fibrosis and COVID-19 patients. The +results have shown that the capacity of extracting airway trees from patients +with fibrotic lung disease could be enhanced by introducing voxel-wise weighted +general union loss and continuity loss. In addition to the competitive image +biomarkers for prognosis, a strong airway-derived biomarker (Hazard ratio>1.5, +p<0.0001) was revealed for survival prognostication compared with existing +clinical measurements, clinician assessment and AI-based biomarkers. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ♻ ☆ Pixel to Elevation: Learning to Predict Elevation Maps at Long Range + using Images for Autonomous Offroad Navigation + + +
+ Understanding terrain topology at long range is crucial for the success of +off-road robotic missions, especially when navigating at high speeds. LiDAR +sensors, which are currently heavily relied upon for geometric mapping, provide +sparse measurements when mapping at greater distances. To address this +challenge, we present a novel learning-based approach capable of predicting +terrain elevation maps at long range using only onboard egocentric images in +real-time. Our proposed method comprises three main elements. First, a +transformer-based encoder is introduced that learns cross-view associations +between the egocentric views and prior bird's-eye-view elevation map predictions. +Second, an orientation-aware positional encoding is proposed to incorporate the +3D vehicle pose information over complex unstructured terrain with multi-view +visual image features. Lastly, a history-augmented learnable map embedding is +proposed to achieve better temporal consistency between elevation map +predictions to facilitate the downstream navigational tasks. We experimentally +validate the applicability of our proposed approach for autonomous offroad +robotic navigation in complex and unstructured terrain using real-world offroad +driving data. Furthermore, the method is qualitatively and quantitatively +compared against the current state-of-the-art methods. Extensive field +experiments demonstrate that our method surpasses baseline models in accurately +predicting terrain elevation while effectively capturing the overall terrain +topology at long range. Finally, ablation studies are conducted to highlight +and understand the effect of key components of the proposed approach and +validate their suitability to improve offroad robotic navigation capabilities. + +
+
+ comment: 8 pages, 6 figures, Accepted in IEEE Robotics and Automation Letters +
+
+
+
+
+ + ♻ ☆ A Survey and Benchmark of Automatic Surface Reconstruction from Point + Clouds + + +
+ We present a comprehensive survey and benchmark of both traditional and +learning-based methods for surface reconstruction from point clouds. This task +is particularly challenging for real-world acquisitions due to factors like +noise, outliers, non-uniform sampling, and missing data. Traditional approaches +often simplify the problem by imposing handcrafted priors on either the input +point clouds or the resulting surface, a process that can necessitate tedious +hyperparameter tuning. Conversely, deep learning models have the capability to +directly learn the properties of input point clouds and desired surfaces from +data. We study the influence of these handcrafted and learned priors on the +precision and robustness of surface reconstruction techniques. We evaluate +various time-tested and contemporary methods in a standardized manner. When +both trained and evaluated on point clouds with identical characteristics, the +learning-based models consistently produce superior surfaces compared to their +traditional counterparts$\unicode{x2013}$even in scenarios involving novel +shape categories. However, traditional methods demonstrate greater resilience +to the diverse array of point cloud anomalies commonly found in real-world 3D +acquisitions. For the benefit of the research community, we make our code and +datasets available, inviting further enhancements to learning-based surface +reconstruction. This can be accessed at +https://github.com/raphaelsulzer/dsr-benchmark . + +
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ Ghost-dil-NetVLAD: A Lightweight Neural Network for Visual Place + Recognition + + +
+ Visual place recognition (VPR) is a challenging task due to the imbalance +between enormous computational cost and high recognition performance. Thanks to +the practical feature extraction ability of lightweight convolutional neural +networks (CNNs) and the trainability of the vector of locally aggregated +descriptors (VLAD) layer, we propose a lightweight weakly supervised end-to-end +neural network consisting of a front-end perception model called GhostCNN and +a learnable VLAD layer as a back-end. GhostCNN is based on Ghost modules, which +are lightweight CNN-based architectures. They can generate redundant feature +maps using linear operations instead of the traditional convolution process, +making a good trade-off between computation resources and recognition accuracy. +To enhance our proposed lightweight model further, we add dilated convolutions +to the Ghost module to get features containing more spatial semantic +information, improving accuracy. Finally, extensive experiments conducted on a +commonly used public benchmark and our private dataset validate that the +proposed neural network reduces the FLOPs and parameters of VGG16-NetVLAD by +99.04% and 80.16%, respectively. Moreover, the two models achieve similar accuracy. + +
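For readers unfamiliar with Ghost modules, the sketch below shows the basic construction in PyTorch: a small primary pointwise convolution produces a few intrinsic feature maps, and cheap depthwise operations generate the remaining "ghost" maps. The dilated variant used by Ghost-dil-NetVLAD would presumably add a dilation factor to the cheap convolution; the hyperparameters here are illustrative.

```python
import torch
import torch.nn as nn

class GhostModule(nn.Module):
    """Minimal Ghost module in the spirit of GhostNet: a primary pointwise conv
    produces intrinsic feature maps, and a cheap depthwise conv generates the
    remaining "ghost" maps, reducing FLOPs versus a full convolution."""

    def __init__(self, c_in, c_out, ratio=2, dw_kernel=3):
        super().__init__()
        intrinsic = c_out // ratio
        ghost = c_out - intrinsic
        self.primary = nn.Sequential(
            nn.Conv2d(c_in, intrinsic, 1, bias=False),
            nn.BatchNorm2d(intrinsic), nn.ReLU(inplace=True))
        self.cheap = nn.Sequential(
            nn.Conv2d(intrinsic, ghost, dw_kernel, padding=dw_kernel // 2,
                      groups=intrinsic, bias=False),
            nn.BatchNorm2d(ghost), nn.ReLU(inplace=True))

    def forward(self, x):
        primary = self.primary(x)
        return torch.cat([primary, self.cheap(primary)], dim=1)

print(GhostModule(64, 128)(torch.randn(1, 64, 32, 32)).shape)  # (1, 128, 32, 32)
```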
+
+
+
+
+ + ♻ ☆ VehicleGAN: Pair-flexible Pose Guided Image Synthesis for Vehicle + Re-identification + + +
+ Vehicle Re-identification (Re-ID) has been broadly studied in the last +decade; however, differing camera view angles, which lead to confused +discrimination in the feature subspace for vehicles of various poses, remain +challenging for Vehicle Re-ID models in the real world. To promote +Vehicle Re-ID models, this paper proposes to synthesize a large number of +vehicle images in a target pose, the idea being to project vehicles of +diverse poses into a unified target pose so as to enhance feature +discrimination. Considering that paired data of the same vehicles in +different traffic surveillance cameras might not be available in the real +world, we propose the first Pair-flexible Pose Guided Image Synthesis method +for Vehicle Re-ID, named VehicleGAN in this paper, which works for both +supervised and unsupervised settings without the knowledge of geometric 3D +models. Because of the feature distribution difference between real and +synthetic data, simply training a traditional metric learning based Re-ID model +with data-level fusion (i.e., data augmentation) is not satisfactory; therefore, +we propose a new Joint Metric Learning (JML) via effective feature-level fusion +from both real and synthetic data. Intensive experimental results on the public +VeRi-776 and VehicleID datasets prove the accuracy and effectiveness of our +proposed VehicleGAN and JML. + +
+
+
+
+
+ + ♻ ☆ SplaTAM: Splat, Track & Map 3D Gaussians for Dense RGB-D SLAM CVPR 2024 + + +
+ Dense simultaneous localization and mapping (SLAM) is crucial for robotics +and augmented reality applications. However, current methods are often hampered +by the non-volumetric or implicit way they represent a scene. This work +introduces SplaTAM, an approach that, for the first time, leverages explicit +volumetric representations, i.e., 3D Gaussians, to enable high-fidelity +reconstruction from a single unposed RGB-D camera, surpassing the capabilities +of existing methods. SplaTAM employs a simple online tracking and mapping +system tailored to the underlying Gaussian representation. It utilizes a +silhouette mask to elegantly capture the presence of scene density. This +combination enables several benefits over prior representations, including fast +rendering and dense optimization, quickly determining if areas have been +previously mapped, and structured map expansion by adding more Gaussians. +Extensive experiments show that SplaTAM achieves up to 2x superior performance +in camera pose estimation, map construction, and novel-view synthesis over +existing methods, paving the way for more immersive high-fidelity SLAM +applications. + +
+
+ comment: CVPR 2024. Website: https://spla-tam.github.io/ +
+
+
+
+
+ + ♻ ☆ LaVy: Vietnamese Multimodal Large Language Model + + +
+ Large Language Models (LLMs) and Multimodal Large Language Models (MLLMs) +have taken the world by storm with impressive abilities in complex reasoning +and linguistic comprehension. While there is a plethora of work related to +Vietnamese Large Language Models, the lack of high-quality multimodal +resources limits the progress of Vietnamese MLLMs. In this paper, we +pioneer in addressing this by introducing LaVy, a state-of-the-art Vietnamese +MLLM, and we also introduce the LaVy-Bench benchmark, designed for evaluating +MLLMs' understanding of Vietnamese visual language tasks. Our project is +public at https://github.com/baochi0212/LaVy + +
+
+ comment: 5 pages +
+
+
+
+
+ + ♻ ☆ LoopAnimate: Loopable Salient Object Animation + + +
+ Research on diffusion model-based video generation has advanced rapidly. +However, limitations in object fidelity and generation length hinder its +practical applications. Additionally, specific domains like animated wallpapers +require seamless looping, where the first and last frames of the video match. +To address these challenges, this paper proposes LoopAnimate, a +novel method for generating videos with consistent start and end frames. To +enhance object fidelity, we introduce a framework that decouples multi-level +image appearance and textual semantic information. Building upon an +image-to-image diffusion model, our approach incorporates both pixel-level and +feature-level information from the input image, injecting image appearance and +textual semantic embeddings at different positions of the diffusion model. +Existing UNet-based video generation models require the entire video to be input +during training to encode temporal and positional information at once. However, +due to limitations in GPU memory, the number of frames is typically restricted +to 16. To address this, this paper proposes a three-stage training strategy +with progressively increasing frame numbers and reducing fine-tuning modules. +Additionally, we introduce the Temporal Enhanced Motion Module (TEMM) to extend +the capacity for encoding temporal and positional information up to 36 frames. +The proposed LoopAnimate thus, for the first time, extends the single-pass +generation length of UNet-based video generation models to 35 frames while +maintaining high-quality video generation. Experiments demonstrate that +LoopAnimate achieves state-of-the-art performance in both objective metrics, +such as fidelity and temporal consistency, and subjective evaluation results. + +
+
+
+
+
+ + ♻ ☆ CoBra: Complementary Branch Fusing Class and Semantic Knowledge for + Robust Weakly Supervised Semantic Segmentation + + +
+ Leveraging semantically precise pseudo masks derived from image-level class +knowledge for segmentation, namely image-level Weakly Supervised Semantic +Segmentation (WSSS), remains challenging. While Class Activation Maps +(CAMs) using CNNs have steadily been contributing to the success of WSSS, the +resulting activation maps often narrowly focus on class-specific parts (e.g., +only the face of a human). On the other hand, recent works based on vision +transformers (ViT) have shown promising results based on their self-attention +mechanism to capture the semantic parts but fail in capturing complete +class-specific details (e.g., the entire body of a human but also a dog +nearby). In this work, we propose Complementary Branch (CoBra), a novel dual +branch framework consisting of two distinct architectures which provide +valuable complementary knowledge of class (from CNN) and semantic (from ViT) to +each branch. In particular, we learn Class-Aware Projection (CAP) for the CNN +branch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly +fuse their complementary knowledge and facilitate a new type of extra +patch-level supervision. Our model, through CoBra, fuses CNN and ViT's +complementary outputs to create robust pseudo masks that integrate both class +and semantic information effectively. Extensive experiments qualitatively and +quantitatively investigate how CNN and ViT complement each other on the PASCAL +VOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not +only the masks generated by our model, but also the segmentation results +derived from utilizing these masks as pseudo labels. + +
+
+
+
+
+ + ♻ ☆ A Systematic Review of Low-Rank and Local Low-Rank Matrix Approximation + in Big Data Medical Imaging + + +
+ The large volume and complexity of medical imaging datasets are bottlenecks +for storage, transmission, and processing. To tackle these challenges, the +application of low-rank matrix approximation (LRMA) and its derivative, local +LRMA (LLRMA), has demonstrated potential. + A detailed analysis of the literature identifies LRMA and LLRMA methods +applied to various imaging modalities, and the challenges and limitations +associated with existing LRMA and LLRMA methods are addressed. + We note a significant shift towards a preference for LLRMA in the medical +imaging field since 2015, demonstrating its potential and effectiveness in +capturing complex structures in medical data compared to LRMA. Acknowledging +the limitations of shallow similarity methods used with LLRMA, we suggest +advanced semantic image segmentation as a similarity measure, explaining in +detail how it can identify similar patches and discussing its feasibility. + We note that LRMA and LLRMA are mainly applied to unstructured medical data, +and we propose extending their application to different medical data types, +including structured and semi-structured. This paper also discusses how LRMA +and LLRMA can be applied to regular data with missing entries and the impact of +inaccuracies in predicting missing values. We discuss the +impact of patch size and propose the use of random search (RS) to determine the +optimal patch size. To enhance feasibility, a hybrid approach using Bayesian +optimization and RS is proposed, which could improve the application of LRMA +and LLRMA in medical imaging. + +
+
+
+
+
+ + ♻ ☆ Slide-SAM: Medical SAM Meets Sliding Window + + +
+ The Segment Anything Model (SAM) has achieved notable success in +two-dimensional image segmentation in natural images. However, the substantial +gap between medical and natural images hinders its direct application to +medical image segmentation tasks. Particularly in 3D medical images, SAM +struggles to learn contextual relationships between slices, limiting its +practical applicability. Moreover, applying 2D SAM to 3D images requires +prompting the entire volume, which is time- and label-consuming. To address +these problems, we propose Slide-SAM, which treats a stack of three adjacent +slices as a prediction window. It first takes three slices from a 3D volume +and point- or bounding box prompts on the central slice as inputs to predict +segmentation masks for all three slices. Subsequently, the masks of the top and +bottom slices are used to generate new prompts for adjacent slices. +Finally, step-wise prediction can be achieved by sliding the prediction window +forward or backward through the entire volume. Our model is trained on multiple +public and private medical datasets and demonstrates its effectiveness through +extensive 3D segmentation experiments, with the help of minimal prompts. Code +is available at \url{https://github.com/Curli-quan/Slide-SAM}. + +
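The sliding-window inference described above can be sketched as a simple forward sweep. Here predict_window is a placeholder for the Slide-SAM model, the prompt-propagation rule (centroid of the newly covered slice's mask) is an assumption, and the real method also sweeps backward through the volume.

```python
import numpy as np

def centroid_prompt(mask: np.ndarray):
    """Turn a predicted 2D mask into a point prompt (x, y) for the next window."""
    ys, xs = np.nonzero(mask)
    return None if xs.size == 0 else (int(xs.mean()), int(ys.mean()))

def sweep_forward(volume: np.ndarray, start_slice: int, start_prompt, predict_window):
    """Simplified forward sweep of a 3-slice prediction window.
    predict_window(window, prompt) stands in for the model: given three adjacent
    slices (3, H, W) and a point prompt, it returns three binary masks."""
    depth = volume.shape[0]
    out = np.zeros(volume.shape, dtype=bool)
    z, prompt = start_slice, start_prompt
    while prompt is not None and 1 <= z <= depth - 2:
        m_lo, m_mid, m_hi = predict_window(volume[z - 1:z + 2], prompt)
        out[z - 1] |= m_lo
        out[z] |= m_mid
        out[z + 1] |= m_hi
        prompt = centroid_prompt(m_hi)  # re-prompt on the newly covered slice
        z += 1
    return out

# Toy usage with a dummy "model" that segments a box around the prompt.
def dummy_model(window, prompt):
    x, y = prompt
    m = np.zeros(window.shape[1:], dtype=bool)
    m[max(0, y - 2):y + 3, max(0, x - 2):x + 3] = True
    return m, m, m

vol = np.random.rand(8, 32, 32)
masks = sweep_forward(vol, start_slice=3, start_prompt=(16, 16), predict_window=dummy_model)
print(masks.sum())
```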
+
+
+
+
+ + ♻ ☆ E3: Ensemble of Expert Embedders for Adapting Synthetic Image Detectors + to New Generators Using Limited Data CVPR + + +
+ As generative AI progresses rapidly, new synthetic image generators continue +to emerge at a swift pace. Traditional detection methods face two main +challenges in adapting to these generators: the forensic traces of synthetic +images from new techniques can vastly differ from those learned during +training, and access to data for these new generators is often limited. To +address these issues, we introduce the Ensemble of Expert Embedders (E3), a +novel continual learning framework for updating synthetic image detectors. E3 +enables the accurate detection of images from newly emerged generators using +minimal training data. Our approach does this by first employing transfer +learning to develop a suite of expert embedders, each specializing in the +forensic traces of a specific generator. Then, all embeddings are jointly +analyzed by an Expert Knowledge Fusion Network to produce accurate and reliable +detection decisions. Our experiments demonstrate that E3 outperforms existing +continual learning methods, including those developed specifically for +synthetic image detection. + +
+
+ comment: 11 pages, 4 figures, To be published in CVPRWMF24 +
+
+
+
+
+ + ♻ ☆ DP-RDM: Adapting Diffusion Models to Private Domains Without Fine-Tuning + + +
+ Text-to-image diffusion models have been shown to suffer from sample-level +memorization, possibly reproducing near-perfect replica of images that they are +trained on, which may be undesirable. To remedy this issue, we develop the +first differentially private (DP) retrieval-augmented generation algorithm that +is capable of generating high-quality image samples while providing provable +privacy guarantees. Specifically, we assume access to a text-to-image diffusion +model trained on a small amount of public data, and design a DP retrieval +mechanism to augment the text prompt with samples retrieved from a private +retrieval dataset. Our \emph{differentially private retrieval-augmented +diffusion model} (DP-RDM) requires no fine-tuning on the retrieval dataset to +adapt to another domain, and can use state-of-the-art generative models to +generate high-quality image samples while satisfying rigorous DP guarantees. +For instance, when evaluated on MS-COCO, our DP-RDM can generate samples with a +privacy budget of $\epsilon=10$, while providing a $3.5$ point improvement in +FID compared to public-only retrieval for up to $10,000$ queries. + +
+
+
+
+
+ + ♻ ☆ LoopGaussian: Creating 3D Cinemagraph with Multi-view Images via + Eulerian Motion Field + + +
+ A cinemagraph is a unique form of visual media that combines elements of still +photography and subtle motion to create a captivating experience. However, the +majority of videos generated by recent works lack depth information and are +confined to the constraints of 2D image space. In this paper, inspired by +significant progress in the field of novel view synthesis (NVS) achieved by 3D +Gaussian Splatting (3D-GS), we propose LoopGaussian to elevate cinemagraphs from +2D image space to 3D space using 3D Gaussian modeling. To achieve this, we +first employ the 3D-GS method to reconstruct 3D Gaussian point clouds from +multi-view images of static scenes, incorporating shape regularization terms to +prevent blurring or artifacts caused by object deformation. We then adopt an +autoencoder tailored to 3D Gaussians to project them into feature space. To +maintain the local continuity of the scene, we devise SuperGaussian for +clustering based on the acquired features. By calculating the similarity +between clusters and employing a two-stage estimation method, we derive an +Eulerian motion field to describe velocities across the entire scene. The 3D +Gaussian points then move within the estimated Eulerian motion field. Through +bidirectional animation techniques, we ultimately generate a 3D Cinemagraph +that exhibits natural and seamlessly loopable dynamics. Experimental results +validate the effectiveness of our approach, demonstrating high-quality and +visually appealing scene generation. The project is available at +https://pokerlishao.github.io/LoopGaussian/. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ Using Multi-scale SwinTransformer-HTC with Data augmentation in CoNIC + Challenge + + +
+ Colorectal cancer is one of the most common cancers worldwide, so early +pathological examination is very important. However, it is time-consuming and +labor-intensive to identify the number and type of cells on H&E images in +clinical practice. Therefore, the CoNIC Challenge 2022 proposed the task of automatic +segmentation, classification, and counting of the cellular composition of H&E images +from pathological sections. We proposed a multi-scale Swin transformer +with HTC for this challenge, and also applied known normalization methods +to generate more augmented data. Finally, our results showed that the +multi-scale design played a crucial role in identifying features at different scales and that the +augmentation improved the recognition ability of the model. + +
+
+ comment: Errors have been identified in the analysis +
+
+
+
+
+ + ♻ ☆ 2S-UDF: A Novel Two-stage UDF Learning Method for Robust Non-watertight + Model Reconstruction from Multi-view Images CVPR 2024 + + +
+ Recently, building on the foundation of neural radiance field, various +techniques have emerged to learn unsigned distance fields (UDF) to reconstruct +3D non-watertight models from multi-view images. Yet, a central challenge in +UDF-based volume rendering is formulating a proper way to convert unsigned +distance values into volume density, ensuring that the resulting weight +function remains unbiased and sensitive to occlusions. Falling short on these +requirements often results in incorrect topology or large reconstruction errors +in resulting models. This paper addresses this challenge by presenting a novel +two-stage algorithm, 2S-UDF, for learning a high-quality UDF from multi-view +images. Initially, the method applies an easily trainable density function +that, while slightly biased and transparent, aids in coarse reconstruction. The +subsequent stage then refines the geometry and appearance of the object to +achieve a high-quality reconstruction by directly adjusting the weight function +used in volume rendering to ensure that it is unbiased and occlusion-aware. +Decoupling density and weight in two stages makes our training stable and +robust, distinguishing our technique from existing UDF learning approaches. +Evaluations on the DeepFashion3D, DTU, and BlendedMVS datasets validate the +robustness and effectiveness of our proposed approach. In both quantitative +metrics and visual quality, the results indicate our superior performance over +other UDF learning techniques in reconstructing 3D non-watertight models from +multi-view images. Our code is available at +https://bitbucket.org/jkdeng/2sudf/. + +
+
+ comment: accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Deep Video Codec Control for Vision Models CVPR 2024 + + +
+ Standardized lossy video coding is at the core of almost all real-world video +processing pipelines. Rate control is used to enable standard codecs to adapt +to different network bandwidth conditions or storage constraints. However, +standard video codecs (e.g., H.264) and their rate control modules aim to +minimize video distortion w.r.t. human quality assessment. We demonstrate +empirically that standard-coded videos vastly deteriorate the performance of +deep vision models. To overcome the deterioration of vision performance, this +paper presents the first end-to-end learnable deep video codec control that +considers both bandwidth constraints and downstream deep vision performance, +while adhering to existing standardization. We demonstrate that our approach +better preserves downstream deep vision performance than traditional standard +video coding. + +
+
+ comment: Accepted at CVPR 2024 Workshop on AI for Streaming (AIS) +
+
+
+
+
+ + ♻ ☆ Absolute-Unified Multi-Class Anomaly Detection via Class-Agnostic + Distribution Alignment + + +
+ Conventional unsupervised anomaly detection (UAD) methods build separate +models for each object category. Recent studies have proposed to train a +unified model for multiple classes, namely model-unified UAD. However, such +methods still implement the unified model separately on each class during +inference with respective anomaly decision thresholds, which hinders their +application when the image categories are entirely unavailable. In this work, +we present a simple yet powerful method to address multi-class anomaly +detection without any class information, namely \textit{absolute-unified} UAD. +We target the crux of prior works in this challenging setting: different +objects have mismatched anomaly score distributions. We propose Class-Agnostic +Distribution Alignment (CADA) to align the mismatched score distribution of +each implicit class without knowing class information, which enables unified +anomaly detection for all classes and samples. The essence of CADA is to +predict each class's score distribution of normal samples given any image, +normal or anomalous, of this class. As a general component, CADA can activate +the potential of nearly all UAD methods under absolute-unified setting. Our +approach is extensively evaluated under the proposed setting on two popular UAD +benchmark datasets, MVTec AD and VisA, where we exceed previous +state-of-the-art by a large margin. + +
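One way to picture the alignment step: if, for any input image, the model can predict the mean and spread of normal-sample scores for that image's implicit class, raw anomaly scores can be standardized onto one shared scale so a single threshold serves every class. The standardization form below is an assumption for illustration, not necessarily the exact alignment used by CADA.

```python
import numpy as np

def align_scores(raw_scores: np.ndarray, pred_mean: np.ndarray, pred_std: np.ndarray) -> np.ndarray:
    """Map per-image anomaly scores from different (unknown) classes onto a
    shared scale by standardizing each score with the predicted normal-sample
    statistics of its implicit class."""
    return (raw_scores - pred_mean) / np.maximum(pred_std, 1e-8)

# Two implicit classes whose raw score ranges do not match
# (the last score of each triple is anomalous):
raw = np.array([3.0, 3.4, 9.5,   0.30, 0.32, 0.95])
mu  = np.array([3.1, 3.1, 3.1,   0.31, 0.31, 0.31])   # predicted normal means
sd  = np.array([0.2, 0.2, 0.2,   0.02, 0.02, 0.02])   # predicted normal stds

aligned = align_scores(raw, mu, sd)
print(aligned > 3.0)   # one shared threshold now flags both anomalies
```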
+
+
+
+
+ + ♻ ☆ Pixel-Wise Contrastive Distillation ICCV 2023 + + +
+ We present a simple but effective pixel-level self-supervised distillation +framework friendly to dense prediction tasks. Our method, called Pixel-Wise +Contrastive Distillation (PCD), distills knowledge by attracting the +corresponding pixels from student's and teacher's output feature maps. PCD +includes a novel design called SpatialAdaptor which ``reshapes'' a part of the +teacher network while preserving the distribution of its output features. Our +ablation experiments suggest that this reshaping behavior enables more +informative pixel-to-pixel distillation. Moreover, we utilize a plug-in +multi-head self-attention module that explicitly relates the pixels of +student's feature maps to enhance the effective receptive field, leading to a +more competitive student. PCD \textbf{outperforms} previous self-supervised +distillation methods on various dense prediction tasks. A backbone of +\mbox{ResNet-18-FPN} distilled by PCD achieves $37.4$ AP$^\text{bbox}$ and +$34.0$ AP$^\text{mask}$ on COCO dataset using the detector of \mbox{Mask +R-CNN}. We hope our study will inspire future research on how to pre-train a +small model friendly to dense prediction tasks in a self-supervised fashion. + +
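The core "attract corresponding pixels" term can be sketched as a pixel-wise cosine objective between student and teacher feature maps. This omits the SpatialAdaptor and the self-attention module, and the exact loss form is an illustrative assumption rather than the paper's formulation.

```python
import torch
import torch.nn.functional as F

def pixelwise_attraction_loss(student_feat: torch.Tensor, teacher_feat: torch.Tensor) -> torch.Tensor:
    """Attract spatially corresponding pixels of student and teacher feature
    maps by maximizing their per-pixel cosine similarity."""
    s = F.normalize(student_feat, dim=1)            # (B, C, H, W), unit-norm per pixel
    t = F.normalize(teacher_feat.detach(), dim=1)   # teacher only provides targets
    return 1.0 - (s * t).sum(dim=1).mean()          # 1 - mean cosine similarity

student = torch.randn(2, 256, 14, 14, requires_grad=True)
teacher = torch.randn(2, 256, 14, 14)
loss = pixelwise_attraction_loss(student, teacher)
loss.backward()
print(float(loss))
```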
+
+ comment: ICCV 2023 camera-ready +
+
+
+
+
+ + ♻ ☆ About latent roles in forecasting players in team sports + + +
+ Forecasting players in sports has grown in popularity due to the potential +for a tactical advantage and the applicability of such research to multi-agent +interaction systems. Team sports contain a significant social component that +influences interactions between teammates and opponents. However, this component has yet +to be fully exploited. In this work, we hypothesize that each participant +has a specific function in each action and that role-based interaction is +critical for predicting players' future moves. We create RolFor, a novel +end-to-end model for Role-based Forecasting. RolFor uses a new module we +developed called Ordering Neural Networks (OrderNN) to permute the order of the +players such that each player is assigned to a latent role. The latent role is +then modeled with a RoleGCN. Thanks to its graph representation, it provides a +fully learnable adjacency matrix that captures the relationships between roles +and is subsequently used to forecast the players' future trajectories. +Extensive experiments on a challenging NBA basketball dataset back up the +importance of roles and justify our goal of modeling them using optimizable +models. When an oracle provides roles, the proposed RolFor compares favorably +to the current state-of-the-art (it ranks first in terms of ADE and second in +terms of FDE errors). However, training the end-to-end RolFor incurs the issues +of differentiability of permutation methods, which we experimentally review. +Finally, this work restates differentiable ranking as a difficult open problem +and highlights its great potential in conjunction with graph-based interaction models. +The project is available at: https://www.pinlab.org/aboutlatentroles + +
+
+
+
+
+ + ♻ ☆ Regularization by Texts for Latent Diffusion Inverse Solvers + + +
+ The recent advent of diffusion models has led to significant progress in +solving inverse problems, leveraging these models as effective generative +priors. Nonetheless, there remain challenges related to the ill-posed nature of +such problems, often due to inherent ambiguities in measurements or intrinsic +system symmetries. To address this, drawing inspiration from the human ability +to resolve visual ambiguities through perceptual biases, here we introduce a +novel latent diffusion inverse solver based on regularization by texts (TReg). +Specifically, TReg applies a textual description of the preconception of the +solution during reverse diffusion sampling, and this description is +dynamically reinforced through null-text optimization for adaptive negation. +Our comprehensive experimental results demonstrate that TReg successfully +mitigates ambiguity in inverse problems, enhancing effectiveness and +accuracy. + +
+
+
+
+
+ + ♻ ☆ GPS-Gaussian: Generalizable Pixel-wise 3D Gaussian Splatting for + Real-time Human Novel View Synthesis CVPR 2024 + + +
+ We present a new approach, termed GPS-Gaussian, for synthesizing novel views +of a character in a real-time manner. The proposed method enables 2K-resolution +rendering under a sparse-view camera setting. Unlike the original Gaussian +Splatting or neural implicit rendering methods that necessitate per-subject +optimizations, we introduce Gaussian parameter maps defined on the source views +and directly regress Gaussian Splatting properties for instant novel view +synthesis without any fine-tuning or optimization. To this end, we train our +Gaussian parameter regression module on a large amount of human scan data, +jointly with a depth estimation module to lift 2D parameter maps to 3D space. +The proposed framework is fully differentiable, and experiments on several +datasets demonstrate that our method outperforms state-of-the-art methods while +achieving a much higher rendering speed. + +
+
+ comment: Accepted by CVPR 2024 (Highlight). Project page: + https://shunyuanzheng.github.io/GPS-Gaussian +
+
+
+
+
+ + ♻ ☆ Leveraging Image Matching Toward End-to-End Relative Camera Pose + Regression + + +
+ This paper proposes a generalizable, end-to-end deep learning-based method +for relative pose regression between two images. Given two images of the same +scene captured from different viewpoints, our method predicts the relative +rotation and translation (including direction and scale) between the two +respective cameras. Inspired by the classical pipeline, our method leverages +Image Matching (IM) as a pre-training task for relative pose regression. +Specifically, we use LoFTR, an architecture that utilizes an attention-based +network pre-trained on Scannet, to extract semi-dense feature maps, which are +then warped and fed into a pose regression network. Notably, we use a loss +function that utilizes separate terms to account for the translation direction +and scale. We believe such a separation is important because translation +direction is determined by point correspondences while the scale is inferred +from a prior on shape sizes. Our ablations further support this choice. We +evaluate our method on several datasets and show that it outperforms previous +end-to-end methods. The method also generalizes well to unseen datasets. + +
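The separated loss design can be sketched as three terms: a rotation term, a translation-direction term, and a translation-scale term. The specific distances chosen below (quaternion L2, one minus cosine similarity, log-scale L1) are illustrative stand-ins, not necessarily the paper's exact formulation.

```python
import torch
import torch.nn.functional as F

def relative_pose_loss(r_pred, r_gt, t_pred, t_gt, w_rot=1.0, w_dir=1.0, w_scale=1.0):
    """Loss with separate terms for rotation, translation *direction* and
    translation *scale*, mirroring the idea that direction is fixed by point
    correspondences while scale comes from priors on scene size."""
    rot = F.mse_loss(F.normalize(r_pred, dim=-1), F.normalize(r_gt, dim=-1))
    dir_loss = 1.0 - F.cosine_similarity(t_pred, t_gt, dim=-1).mean()
    scale = F.l1_loss(t_pred.norm(dim=-1).clamp_min(1e-8).log(),
                      t_gt.norm(dim=-1).clamp_min(1e-8).log())
    return w_rot * rot + w_dir * dir_loss + w_scale * scale

r_pred = torch.randn(4, 4, requires_grad=True)   # predicted quaternions
t_pred = torch.randn(4, 3, requires_grad=True)   # predicted translations
loss = relative_pose_loss(r_pred, torch.randn(4, 4), t_pred, torch.randn(4, 3))
loss.backward()
```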
+
+ comment: Project webpage: https://fadikhatib.github.io/GRelPose +
+
+
+
+
+ + ♻ ☆ Deep Generative Data Assimilation in Multimodal Setting CVPR2024 + + +
+ Robust integration of physical knowledge and data is key to improving +computational simulations, such as Earth system models. Data assimilation is +crucial for achieving this goal because it provides a systematic framework to +calibrate model outputs with observations, which can include remote sensing +imagery and ground station measurements, with uncertainty quantification. +Conventional methods, including Kalman filters and variational approaches, +inherently rely on simplifying linear and Gaussian assumptions, and can be +computationally expensive. Nevertheless, with the rapid adoption of data-driven +methods in many areas of computational sciences, we see the potential of +emulating traditional data assimilation with deep learning, especially +generative models. In particular, the diffusion-based probabilistic framework +has large overlaps with data assimilation principles: both allow for +conditional generation of samples with a Bayesian inverse framework. These +models have shown remarkable success in text-conditioned image generation or +image-controlled video synthesis. Likewise, one can frame data assimilation as +observation-conditioned state calibration. In this work, we propose SLAMS: +Score-based Latent Assimilation in Multimodal Setting. Specifically, we +assimilate in-situ weather station data and ex-situ satellite imagery to +calibrate the vertical temperature profiles, globally. Through extensive +ablation, we demonstrate that SLAMS is robust even in low-resolution, noisy, +and sparse data settings. To our knowledge, our work is the first to apply a deep +generative framework to multimodal data assimilation using real-world +datasets, an important step for building robust computational simulators, +including the next-generation Earth system models. Our code is available at: +https://github.com/yongquan-qu/SLAMS + +
+
+ comment: CVPR2024 EarthVision +
+
+
+
+
+ + ♻ ☆ Open-Pose 3D Zero-Shot Learning: Benchmark and Challenges + + +
+ With the explosive growth of 3D data, the urgency of utilizing zero-shot +learning to facilitate data labeling becomes evident. Recently, methods +transferring language or language-image pre-training models like Contrastive +Language-Image Pre-training (CLIP) to 3D vision have made significant progress +in the 3D zero-shot classification task. These methods primarily focus on 3D +object classification with an aligned pose; such a setting is, however, rather +restrictive, as it overlooks the recognition of 3D objects with open poses +typically encountered in real-world scenarios, such as an overturned chair or a +lying teddy bear. To this end, we propose a more realistic and challenging +scenario named open-pose 3D zero-shot classification, focusing on the +recognition of 3D objects regardless of their orientation. First, we revisit +the current research on 3D zero-shot classification, and propose two benchmark +datasets specifically designed for the open-pose setting. We empirically +validate many of the most popular methods in the proposed open-pose benchmark. +Our investigations reveal that most current 3D zero-shot classification models +suffer from poor performance, indicating substantial room for exploration in +this new direction. Furthermore, we study a concise pipeline with an iterative +angle refinement mechanism that automatically optimizes one ideal angle to +classify these open-pose 3D objects. In particular, to make validation more +compelling and not just limited to existing CLIP-based methods, we also pioneer +the exploration of knowledge transfer based on Diffusion models. While the +proposed solutions can serve as a new benchmark for open-pose 3D zero-shot +classification, we discuss the complexities and challenges of this scenario +that remain for further research. The code is available publicly at +https://github.com/weiguangzhao/Diff-OP3D. + +
+
+
+
+
+ + ♻ ☆ PartDistill: 3D Shape Part Segmentation by Vision-Language Model + Distillation CVPR 2024 + + +
+ This paper proposes a cross-modal distillation framework, PartDistill, which +transfers 2D knowledge from vision-language models (VLMs) to facilitate 3D +shape part segmentation. PartDistill addresses three major challenges in this +task: the lack of 3D segmentation in invisible or undetected regions in the 2D +projections, inconsistent 2D predictions by VLMs, and the lack of knowledge +accumulation across different 3D shapes. PartDistill consists of a teacher +network that uses a VLM to make 2D predictions and a student network that +learns from the 2D predictions while extracting geometrical features from +multiple 3D shapes to carry out 3D part segmentation. A bi-directional +distillation, including forward and backward distillations, is carried out +within the framework, where the former forward distills the 2D predictions to +the student network, and the latter improves the quality of the 2D predictions, +which subsequently enhances the final 3D segmentation. Moreover, PartDistill +can exploit generative models that facilitate effortless 3D shape creation for +generating knowledge sources to be distilled. Through extensive experiments, +PartDistill boosts the existing methods with substantial margins on widely used +ShapeNetPart and PartNetE datasets, by more than 15% and 12% higher mIoU +scores, respectively. The code for this work is available at +https://github.com/ardianumam/PartDistill. + +
+
+ comment: CVPR 2024 Accepted +
+
+
+
+
+ + ♻ ☆ Rotate to Scan: UNet-like Mamba with Triplet SSM Module for Medical + Image Segmentation + + +
+ Image segmentation holds a vital position in the realms of diagnosis and +treatment within the medical domain. Traditional convolutional neural networks +(CNNs) and Transformer models have made significant advancements in this realm, +but they still encounter challenges because of limited receptive field or high +computing complexity. Recently, State Space Models (SSMs), particularly Mamba +and its variants, have demonstrated notable performance in the field of vision. +However, their feature extraction methods may not be sufficiently effective and +retain some redundant structures, leaving room for parameter reduction. +Motivated by previous spatial and channel attention methods, we propose Triplet +Mamba-UNet. The method leverages residual VSS Blocks to extract intensive +contextual features, while Triplet SSM is employed to fuse features across +spatial and channel dimensions. We conducted experiments on ISIC17, ISIC18, +CVC-300, CVC-ClinicDB, Kvasir-SEG, CVC-ColonDB, and Kvasir-Instrument datasets, +demonstrating the superior segmentation performance of our proposed TM-UNet. +Additionally, compared to the previous VM-UNet, our model achieves a one-third +reduction in parameters. + +
+
+
+
+
+ + ♻ ☆ GPT-4V-AD: Exploring Grounding Potential of VQA-oriented GPT-4V for + Zero-shot Anomaly Detection + + +
+ Large Multimodal Model (LMM) GPT-4V(ision) endows GPT-4 with visual grounding +capabilities, making it possible to handle certain tasks through the Visual +Question Answering (VQA) paradigm. This paper explores the potential of +VQA-oriented GPT-4V in the recently popular visual Anomaly Detection (AD) and +is the first to conduct qualitative and quantitative evaluations on the popular +MVTec AD and VisA datasets. Considering that this task requires both +image-/pixel-level evaluations, the proposed GPT-4V-AD framework contains three +components: \textbf{\textit{1)}} Granular Region Division, \textbf{\textit{2)}} +Prompt Designing, \textbf{\textit{3)}} Text2Segmentation for easy quantitative +evaluation, and we have made several different attempts for comparative analysis. The +results show that GPT-4V can achieve certain results in the zero-shot AD task +through a VQA paradigm, such as achieving image-level 77.1/88.0 and pixel-level +68.0/76.6 AU-ROCs on the MVTec AD and VisA datasets, respectively. However, its +performance still has a certain gap compared to state-of-the-art zero-shot +methods, e.g., WinCLIP and CLIP-AD, and further research is needed. This study +provides a baseline reference for the research of VQA-oriented LMMs in the +zero-shot AD task, and we also outline several possible directions for future work. Code is +available at \url{https://github.com/zhangzjn/GPT-4V-AD}. + +
+
+
+
+
+ + ♻ ☆ RemoteCLIP: A Vision Language Foundation Model for Remote Sensing + + +
+ General-purpose foundation models have led to recent breakthroughs in +artificial intelligence. In remote sensing, self-supervised learning (SSL) and +Masked Image Modeling (MIM) have been adopted to build foundation models. +However, these models primarily learn low-level features and require annotated +data for fine-tuning. Moreover, they are inapplicable for retrieval and +zero-shot applications due to the lack of language understanding. To address +these limitations, we propose RemoteCLIP, the first vision-language foundation +model for remote sensing that aims to learn robust visual features with rich +semantics and aligned text embeddings for seamless downstream application. To +address the scarcity of pre-training data, we leverage data scaling which +converts heterogeneous annotations into a unified image-caption data format +based on Box-to-Caption (B2C) and Mask-to-Box (M2B) conversion. By further +incorporating UAV imagery, we produce a 12 $\times$ larger pretraining dataset +than the combination of all available datasets. RemoteCLIP can be applied to a +variety of downstream tasks, including zero-shot image classification, linear +probing, $\textit{k}$-NN classification, few-shot classification, image-text +retrieval, and object counting in remote sensing images. Evaluation on 16 +datasets, including a newly introduced RemoteCount benchmark to test the object +counting ability, shows that RemoteCLIP consistently outperforms baseline +foundation models across different model scales. Impressively, RemoteCLIP beats +the state-of-the-art method by 9.14% mean recall on the RSITMD dataset and +8.92% on the RSICD dataset. For zero-shot classification, our RemoteCLIP +outperforms the CLIP baseline by up to 6.39% average accuracy on 12 downstream +datasets. Project website: https://github.com/ChenDelong1999/RemoteCLIP + +
+
+ comment: Accepted by IEEE Transactions on Geoscience and Remote Sensing (TGRS) +
+
+
+
+
+ + ♻ ☆ Face-voice Association in Multilingual Environments (FAME) Challenge + 2024 Evaluation Plan + + +
+ The advancements of technology have led to the use of multimodal systems in +various real-world applications. Among them, audio-visual systems are one +of the most widely used types of multimodal system. In recent years, associating the face +and voice of a person has gained attention due to the presence of a unique +correlation between them. The Face-voice Association in Multilingual +Environments (FAME) Challenge 2024 focuses on exploring face-voice association +under the unique condition of a multilingual scenario. This condition is inspired +by the fact that half of the world's population is bilingual and people most often +communicate in multilingual scenarios. The challenge uses a dataset, +namely Multilingual Audio-Visual (MAV-Celeb), for exploring face-voice +association in multilingual environments. This report provides the details of +the challenge, dataset, baselines and tasks for the FAME Challenge. + +
+
+ comment: ACM Multimedia Conference - Grand Challenge +
+
+
+
+
+ + ♻ ☆ 3D Human Scan With A Moving Event Camera + + +
+ Capturing a 3D human body is one of the important tasks in computer vision +with a wide range of applications such as virtual reality and sports analysis. +However, conventional frame cameras are limited by their temporal resolution +and dynamic range, which imposes constraints in real-world application setups. +Event cameras have the advantages of high temporal resolution and high dynamic +range (HDR), but the development of event-based methods is necessary to handle +data with different characteristics. This paper proposes a novel event-based +method for 3D pose estimation and human mesh recovery. Prior work on +event-based human mesh recovery requires frames (images) as well as event data. +The proposed method solely relies on events; it carves 3D voxels by moving the +event camera around a stationary body, reconstructs the human pose and mesh by +attenuated rays, and fits statistical body models, preserving high-frequency +details. The experimental results show that the proposed method outperforms +conventional frame-based methods in the estimation accuracy of both pose and +body mesh. We also demonstrate results in challenging situations where a +conventional camera has motion blur. This is the first work to demonstrate +event-only human mesh recovery, and we hope that it is the first step toward +achieving robust and accurate 3D human body scanning from vision sensors. +https://florpeng.github.io/event-based-human-scan/ + +
+
+
+
+
+ + ♻ ☆ Mind-to-Image: Projecting Visual Mental Imagination of the Brain from + fMRI + + +
+ The reconstruction of images observed by subjects from fMRI data collected +during visual stimuli has made significant strides in the past decade, thanks +to the availability of extensive fMRI datasets and advancements in generative +models for image generation. However, the application of visual reconstruction +has remained limited. Reconstructing visual imagination presents a greater +challenge, with potentially revolutionary applications ranging from aiding +individuals with disabilities to verifying witness accounts in court. The +primary hurdles in this field are the absence of data collection protocols for +visual imagery and the lack of datasets on the subject. Traditionally, +fMRI-to-image relies on data collected from subjects exposed to visual stimuli, +which poses issues for generating visual imagery, given the difference in +brain activity between visual stimulation and visual imagery. For the first +time, we have compiled a substantial dataset (around 6h of scans) on visual +imagery along with a proposed data collection protocol. We then train a +modified version of an fMRI-to-image model and demonstrate the feasibility of +reconstructing images from two modes of imagination: from memory and from pure +imagination. This marks an important step towards creating a technology that +allows the direct reconstruction of visual imagery. + +
+
+ comment: Pre-print to be updated. Work in progress +
+
+
+
+
+ + ♻ ☆ Theoretically Achieving Continuous Representation of Oriented Bounding + Boxes CVPR'24 + + +
+ Considerable efforts have been devoted to Oriented Object Detection (OOD). +However, one lasting issue regarding the discontinuity in Oriented Bounding Box +(OBB) representation remains unresolved, which is an inherent bottleneck for +extant OOD methods. This paper endeavors to completely solve this issue in a +theoretically guaranteed manner and puts an end to the ad-hoc efforts in this +direction. Prior studies typically can only address one of the two cases of +discontinuity: rotation and aspect ratio, and often inadvertently introduce +decoding discontinuity, e.g. Decoding Incompleteness (DI) and Decoding +Ambiguity (DA) as discussed in literature. Specifically, we propose a novel +representation method called Continuous OBB (COBB), which can be readily +integrated into existing detectors e.g. Faster-RCNN as a plugin. It can +theoretically ensure continuity in bounding box regression which to our best +knowledge, has not been achieved in literature for rectangle-based object +representation. For fairness and transparency of experiments, we have developed +a modularized benchmark based on the open-source deep learning framework +Jittor's detection toolbox JDet for OOD evaluation. On the popular DOTA +dataset, by integrating Faster-RCNN as the same baseline model, our new method +outperforms the peer method Gliding Vertex by 1.13% mAP50 (relative improvement +1.54%), and 2.46% mAP75 (relative improvement 5.91%), without any tricks. + +
+
+ comment: 17 pages, 12 tables, 8 figures. Accepted by CVPR'24. Code: + https://github.com/514flowey/JDet-COBB +
+
+
+
+
+ + ♻ ☆ Privacy Preserving Image Registration + + +
+ Image registration is a key task in medical imaging applications, allowing
+medical images to be represented in a common spatial reference frame. Current
+approaches to image registration are generally based on the assumption that the
+content of the images is usually accessible in clear form, from which the
+spatial transformation is subsequently estimated. This common assumption may
+not be met in practical applications, since the sensitive nature of medical
+images may ultimately require their analysis under privacy constraints,
+preventing the image content from being openly shared. In this work, we
+formulate the problem of image registration under a privacy preserving regime,
+where images are assumed to be confidential and cannot be disclosed in clear
+form. We derive our privacy preserving image registration framework by
+extending classical registration paradigms to account for advanced
+cryptographic tools, such as secure multi-party computation and homomorphic
+encryption, that enable the execution of operations without leaking the
+underlying data. To overcome the problem of performance and scalability of
+cryptographic tools in high dimensions, we propose several techniques to
+optimize the image registration operations by using gradient approximations,
+and by revisiting the use of homomorphic encryption through packing, to allow
+the efficient encryption and multiplication of large matrices. We demonstrate
+our privacy preserving framework in linear and non-linear registration
+problems, evaluating its accuracy and scalability with respect to standard,
+non-private counterparts. Our results show that privacy preserving image
+registration is feasible and can be adopted in sensitive medical imaging
+applications.
+
+
+ comment: v4 Accepted at Medical Image Computing and Computer Assisted + Intervention (2022) 130-140 +
+
+
+
+
+ + ♻ ☆ NIR-Assisted Image Denoising: A Selective Fusion Approach and A + Real-World Benchmark Datase + + +
+ Despite the significant progress in image denoising, it is still challenging
+to restore fine-scale details while removing noise, especially in extremely
+low-light environments. Leveraging near-infrared (NIR) images to assist visible
+RGB image denoising shows the potential to address this issue, becoming a
+promising technology. Nonetheless, existing works still struggle with taking
+advantage of NIR information effectively for real-world image denoising, due to
+the content inconsistency between NIR-RGB images and the scarcity of real-world
+paired datasets. To alleviate the problem, we propose an efficient Selective
+Fusion Module (SFM), which can be plugged into advanced denoising networks in a
+plug-and-play manner to merge the deep NIR-RGB features. Specifically, we
+sequentially perform the global and local modulation for NIR and RGB features,
+and then integrate the two modulated features. Furthermore, we present a
+Real-world NIR-Assisted Image Denoising (Real-NAID) dataset, which covers
+diverse scenarios as well as various noise levels. Extensive experiments on
+both synthetic and our real-world datasets demonstrate that the proposed method
+achieves better results than state-of-the-art ones.
+
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ Proposing an intelligent mesh smoothing method with graph neural + networks + + +
+ In CFD, mesh smoothing methods are commonly utilized to refine the mesh
+quality to achieve high-precision numerical simulations. Specifically,
+optimization-based smoothing is used for high-quality mesh smoothing, but it
+incurs significant computational overhead. Pioneering works improve its
+smoothing efficiency by adopting supervised learning to learn smoothing methods
+from high-quality meshes. However, they have difficulty smoothing mesh nodes
+with varying degrees and also need data augmentation to address the node input
+sequence problem. Additionally, the required labeled high-quality meshes
+further limit the applicability of the proposed method. In this paper, we
+present GMSNet, a lightweight neural network model for intelligent mesh
+smoothing. GMSNet adopts graph neural networks to extract features of the
+node's neighbors and output the optimal node position. During smoothing, we
+also introduce a fault-tolerance mechanism to prevent GMSNet from generating
+negative volume elements. With a lightweight model, GMSNet can effectively
+smooth mesh nodes with varying degrees and remains unaffected by the order of
+input data. A novel loss function, MetricLoss, is also developed to eliminate
+the need for high-quality meshes, which provides stable and rapid convergence
+during training. We compare GMSNet with commonly used mesh smoothing methods on
+two-dimensional triangle meshes. The experimental results show that GMSNet
+achieves outstanding mesh smoothing performance with only 5% of the parameters
+of the previous model, and runs 13.56 times faster than optimization-based
+smoothing.
+
+
+
+
+
+ + ♻ ☆ Positive Label Is All You Need for Multi-Label Classification ICME 2024 + + +
+ Multi-label classification (MLC) faces challenges from label noise in +training data due to annotating diverse semantic labels for each image. Current +methods mainly target identifying and correcting label mistakes using trained +MLC models, but still struggle with persistent noisy labels during training, +resulting in imprecise recognition and reduced performance. Our paper addresses +label noise in MLC by introducing a positive and unlabeled multi-label +classification (PU-MLC) method. To counteract noisy labels, we directly discard +negative labels, focusing on the abundance of negative labels and the origin of +most noisy labels. PU-MLC employs positive-unlabeled learning, training the +model with only positive labels and unlabeled data. The method incorporates +adaptive re-balance factors and temperature coefficients in the loss function +to address label distribution imbalance and prevent over-smoothing of +probabilities during training. Additionally, we introduce a local-global +convolution module to capture both local and global dependencies in the image +without requiring backbone retraining. PU-MLC proves effective on MLC and MLC +with partial labels (MLC-PL) tasks, demonstrating significant improvements on +MS-COCO and PASCAL VOC datasets with fewer annotations. Code is available at: +https://github.com/TAKELAMAG/PU-MLC. + +
+
+ comment: ICME 2024 +
+
+
+
+
+ + ♻ ☆ Leveraging edge detection and neural networks for better UAV + localization + + +
+ We propose a novel method for geolocalizing Unmanned Aerial Vehicles (UAVs) +in environments lacking Global Navigation Satellite Systems (GNSS). Current +state-of-the-art techniques employ an offline-trained encoder to generate a +vector representation (embedding) of the UAV's current view, which is then +compared with pre-computed embeddings of geo-referenced images to determine the +UAV's position. Here, we demonstrate that the performance of these methods can +be significantly enhanced by preprocessing the images to extract their edges, +which exhibit robustness to seasonal and illumination variations. Furthermore, +we establish that utilizing edges enhances resilience to orientation and +altitude inaccuracies. Additionally, we introduce a confidence criterion for +localization. Our findings are substantiated through synthetic experiments. + +
+
+ comment: Accepted for publication in IGARSS2024. 4 pages, 3 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Joining Forces for Pathology Diagnostics with AI Assistance: The EMPAIA + Initiative + + +
+ Over the past decade, artificial intelligence (AI) methods in pathology have +advanced substantially. However, integration into routine clinical practice has +been slow due to numerous challenges, including technical and regulatory +hurdles in translating research results into clinical diagnostic products and +the lack of standardized interfaces. The open and vendor-neutral EMPAIA +initiative addresses these challenges. Here, we provide an overview of EMPAIA's +achievements and lessons learned. EMPAIA integrates various stakeholders of the +pathology AI ecosystem, i.e., pathologists, computer scientists, and industry. +In close collaboration, we developed technical interoperability standards, +recommendations for AI testing and product development, and explainability +methods. We implemented the modular and open-source EMPAIA platform and +successfully integrated 14 AI-based image analysis apps from 8 different +vendors, demonstrating how different apps can use a single standardized +interface. We prioritized requirements and evaluated the use of AI in real +clinical settings with 14 different pathology laboratories in Europe and Asia. +In addition to technical developments, we created a forum for all stakeholders +to share information and experiences on digital pathology and AI. Commercial, +clinical, and academic stakeholders can now adopt EMPAIA's common open-source +interfaces, providing a unique opportunity for large-scale standardization and +streamlining of processes. Further efforts are needed to effectively and +broadly establish AI assistance in routine laboratory use. To this end, a +sustainable infrastructure, the non-profit association EMPAIA International, +has been established to continue standardization and support broad +implementation and advocacy for an AI-assisted digital pathology future. + +
+
+
+
+
+ + ♻ ☆ Objects as volumes: A stochastic geometry view of opaque solids + + +
+ We develop a theory for the representation of opaque solids as volumes. +Starting from a stochastic representation of opaque solids as random indicator +functions, we prove the conditions under which such solids can be modeled using +exponential volumetric transport. We also derive expressions for the volumetric +attenuation coefficient as a functional of the probability distributions of the +underlying indicator functions. We generalize our theory to account for +isotropic and anisotropic scattering at different parts of the solid, and for +representations of opaque solids as stochastic implicit surfaces. We derive our +volumetric representation from first principles, which ensures that it +satisfies physical constraints such as reciprocity and reversibility. We use +our theory to explain, compare, and correct previous volumetric +representations, as well as propose meaningful extensions that lead to improved +performance in 3D reconstruction tasks. + +
+
+ comment: project page: https://imaging.cs.cmu.edu/volumetric_opaque_solids +
+
+
+
+
+ + ♻ ☆ CuNeRF: Cube-Based Neural Radiance Field for Zero-Shot Medical Image + Arbitrary-Scale Super Resolution ICCV + + +
+ Medical image arbitrary-scale super-resolution (MIASSR) has recently gained
+widespread attention, aiming to super-sample medical volumes at arbitrary
+scales via a single model. However, existing MIASSR methods face two major
+limitations: (i) reliance on high-resolution (HR) volumes and (ii) limited
+generalization ability, which restricts their application in various scenarios.
+To overcome these limitations, we propose Cube-based Neural Radiance Field
+(CuNeRF), a zero-shot MIASSR framework that can yield medical images at
+arbitrary scales and viewpoints in a continuous domain. Unlike existing MIASSR
+methods that fit the mapping between low-resolution (LR) and HR volumes, CuNeRF
+focuses on building a coordinate-intensity continuous representation from LR
+volumes without the need for HR references. This is achieved by the proposed
+differentiable modules, including cube-based sampling, isotropic volume
+rendering, and cube-based hierarchical rendering. Through extensive experiments
+on magnetic resonance imaging (MRI) and computed tomography (CT) modalities, we
+demonstrate that CuNeRF outperforms state-of-the-art MIASSR methods. CuNeRF
+yields better visual verisimilitude and reduces aliasing artifacts at various
+upsampling factors. Moreover, our CuNeRF does not need any LR-HR training
+pairs, which makes it more flexible and easier to use than other methods. Our
+code is released at https://github.com/NarcissusEx/CuNeRF.
+
+
+ comment: This paper is accepted by the International Conference on Computer + Vision (ICCV) 2023 +
+
+
+
+
+ + ♻ ☆ Human-in-the-Loop Segmentation of Multi-species Coral Imagery CVPR2024 + + +
+ Broad-scale marine surveys performed by underwater vehicles significantly
+increase the availability of coral reef imagery; however, it is costly and
+time-consuming for domain experts to label images. Point label propagation is
+an approach used to leverage existing image data labeled with sparse point
+labels. The resulting augmented ground truth is then used to train a
+semantic segmentation model. Here, we first demonstrate that recent advances in
+foundation models enable generation of multi-species coral augmented ground
+truth masks using denoised DINOv2 features and K-Nearest Neighbors (KNN),
+without the need for any pre-training or custom-designed algorithms. For
+extremely sparsely labeled images, we propose a labeling regime based on
+human-in-the-loop principles, resulting in significant improvement in
+annotation efficiency: If only 5 point labels per image are available, our
+proposed human-in-the-loop approach improves on the state-of-the-art by 17.3%
+for pixel accuracy and 22.6% for mIoU; and by 10.6% and 19.1% when 10 point
+labels per image are available. Even if the human-in-the-loop labeling regime
+is not used, the denoised DINOv2 features with a KNN outperform the prior
+state-of-the-art by 3.5% for pixel accuracy and 5.7% for mIoU (5 grid points).
+We also provide a detailed analysis of how point labeling style and the
+quantity of points per image affect the point label propagation quality and
+provide general recommendations on maximizing point label efficiency.
+
+
+ comment: Accepted at the CVPR2024 3rd Workshop on Learning with Limited + Labelled Data for Image and Video Understanding (L3D-IVU), 10 pages, 6 + figures, an additional 4 pages of supplementary material +
+
+
+
+
+ + ♻ ☆ Generative Active Learning for Image Synthesis Personalization + + +
+ This paper presents a pilot study that explores the application of active +learning, traditionally studied in the context of discriminative models, to +generative models. We specifically focus on image synthesis personalization +tasks. The primary challenge in conducting active learning on generative models +lies in the open-ended nature of querying, which differs from the closed form +of querying in discriminative models that typically target a single concept. We +introduce the concept of anchor directions to transform the querying process +into a semi-open problem. We propose a direction-based uncertainty sampling +strategy to enable generative active learning and tackle the +exploitation-exploration dilemma. Extensive experiments are conducted to +validate the effectiveness of our approach, demonstrating that an open-source +model can achieve superior performance compared to closed-source models +developed by large companies, such as Google's StyleDrop. The source code is +available at https://github.com/zhangxulu1996/GAL4Personalization. + +
+
+
+
+
+ + ♻ ☆ Learning Self-Prior for Mesh Inpainting Using Self-Supervised Graph + Convolutional Networks + + +
+ In this paper, we present a self-prior-based mesh inpainting framework that +requires only an incomplete mesh as input, without the need for any training +datasets. Additionally, our method maintains the polygonal mesh format +throughout the inpainting process without converting the shape format to an +intermediate one, such as a voxel grid, a point cloud, or an implicit function, +which are typically considered easier for deep neural networks to process. To +achieve this goal, we introduce two graph convolutional networks (GCNs): +single-resolution GCN (SGCN) and multi-resolution GCN (MGCN), both trained in a +self-supervised manner. Our approach refines a watertight mesh obtained from +the initial hole filling to generate a complete output mesh. Specifically, we +train the GCNs to deform an oversmoothed version of the input mesh into the +expected complete shape. The deformation is described by vertex displacements, +and the GCNs are supervised to obtain accurate displacements at vertices in +real holes. To this end, we specify several connected regions of the mesh as +fake holes, thereby generating meshes with various sets of fake holes. The +correct displacements of vertices are known in these fake holes, thus enabling +training GCNs with loss functions that assess the accuracy of vertex +displacements. We demonstrate that our method outperforms traditional +dataset-independent approaches and exhibits greater robustness compared with +other deep-learning-based methods for shapes that infrequently appear in shape +datasets. Our code and test data are available at +https://github.com/astaka-pe/SeMIGCN. + +
+
+ comment: 18 pages, 18 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ MetaCloak: Preventing Unauthorized Subject-driven Text-to-image + Diffusion-based Synthesis via Meta-learning CVPR 2024 + + +
+ Text-to-image diffusion models allow seamless generation of personalized +images from scant reference photos. Yet, these tools, in the wrong hands, can +fabricate misleading or harmful content, endangering individuals. To address +this problem, existing poisoning-based approaches perturb user images in an +imperceptible way to render them "unlearnable" from malicious uses. We identify +two limitations of these defending approaches: i) sub-optimal due to the +hand-crafted heuristics for solving the intractable bilevel optimization and +ii) lack of robustness against simple data transformations like Gaussian +filtering. To solve these challenges, we propose MetaCloak, which solves the +bi-level poisoning problem with a meta-learning framework with an additional +transformation sampling process to craft transferable and robust perturbation. +Specifically, we employ a pool of surrogate diffusion models to craft +transferable and model-agnostic perturbation. Furthermore, by incorporating an +additional transformation process, we design a simple denoising-error +maximization loss that is sufficient for causing transformation-robust semantic +distortion and degradation in a personalized generation. Extensive experiments +on the VGGFace2 and CelebA-HQ datasets show that MetaCloak outperforms existing +approaches. Notably, MetaCloak can successfully fool online training services +like Replicate, in a black-box manner, demonstrating the effectiveness of +MetaCloak in real-world scenarios. Our code is available at +https://github.com/liuyixin-louis/MetaCloak. + +
+
+ comment: Accepted to CVPR 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ CREST: Cross-modal Resonance through Evidential Deep Learning for + Enhanced Zero-Shot Learning + + +
+ Zero-shot learning (ZSL) enables the recognition of novel classes by +leveraging semantic knowledge transfer from known to unknown categories. This +knowledge, typically encapsulated in attribute descriptions, aids in +identifying class-specific visual features, thus facilitating visual-semantic +alignment and improving ZSL performance. However, real-world challenges such as +distribution imbalances and attribute co-occurrence among instances often +hinder the discernment of local variances in images, a problem exacerbated by +the scarcity of fine-grained, region-specific attribute annotations. Moreover, +the variability in visual presentation within categories can also skew +attribute-category associations. In response, we propose a bidirectional +cross-modal ZSL approach CREST. It begins by extracting representations for +attribute and visual localization and employs Evidential Deep Learning (EDL) to +measure underlying epistemic uncertainty, thereby enhancing the model's +resilience against hard negatives. CREST incorporates dual learning pathways, +focusing on both visual-category and attribute-category alignments, to ensure +robust correlation between latent and observable spaces. Moreover, we introduce +an uncertainty-informed cross-modal fusion technique to refine visual-attribute +inference. Extensive experiments demonstrate our model's effectiveness and +unique explainability across multiple datasets. Our code and data are available +at: https://github.com/JethroJames/CREST. + +
+
+ comment: Ongoing work; 10 pages, 2 Tables, 9 Figures; Repo is available at: + https://github.com/JethroJames/CREST +
+
+
+
+
+ + ♻ ☆ Orientation-conditioned Facial Texture Mapping for Video-based Facial + Remote Photoplethysmography Estimation + + +
+ Camera-based remote photoplethysmography (rPPG) enables contactless +measurement of important physiological signals such as pulse rate (PR). +However, dynamic and unconstrained subject motion introduces significant +variability into the facial appearance in video, confounding the ability of +video-based methods to accurately extract the rPPG signal. In this study, we +leverage the 3D facial surface to construct a novel orientation-conditioned +facial texture video representation which improves the motion robustness of +existing video-based facial rPPG estimation methods. Our proposed method +achieves a significant 18.2% performance improvement in cross-dataset testing +on MMPD over our baseline using the PhysNet model trained on PURE, highlighting +the efficacy and generalization benefits of our designed video representation. +We demonstrate significant performance improvements of up to 29.6% in all +tested motion scenarios in cross-dataset testing on MMPD, even in the presence +of dynamic and unconstrained subject motion, emphasizing the benefits of +disentangling motion through modeling the 3D facial surface for motion robust +facial rPPG estimation. We validate the efficacy of our design decisions and +the impact of different video processing steps through an ablation study. Our +findings illustrate the potential strengths of exploiting the 3D facial surface +as a general strategy for addressing dynamic and unconstrained subject motion +in videos. The code is available at +https://samcantrill.github.io/orientation-uv-rppg/. + +
+
+ comment: 12 pages, 8 figures, 6 tables; corrected abstract typo +
+
+
+
+
+ + ♻ ☆ Attention-based Shape-Deformation Networks for Artifact-Free Geometry + Reconstruction of Lumbar Spine from MR Images + + +
+ Lumbar disc degeneration, a progressive structural wear and tear of the
+lumbar intervertebral disc, is regarded as playing an essential role in low
+back pain, a significant global health concern. Automated lumbar spine geometry
+reconstruction from MR images will enable fast measurement of medical
+parameters to evaluate the lumbar status, in order to determine a suitable
+treatment. Existing image segmentation-based techniques often generate
+erroneous segments or unstructured point clouds, unsuitable for medical
+parameter measurement. In this work, we present TransDeformer: a novel
+attention-based deep learning approach that reconstructs the geometry of the
+lumbar spine with high spatial accuracy and mesh correspondence across
+patients, and we also present a variant of TransDeformer for error estimation.
+Specifically, we devise new attention modules with a new attention formula,
+which integrate image features and tokenized contour features to predict the
+displacements of the points on a shape template without the need for image
+segmentation. The deformed template reveals the lumbar spine geometry in an
+image. Experiment results show that our TransDeformer generates artifact-free
+geometry outputs, and its variant predicts the error of a reconstructed
+geometry. Our code is available at
+https://github.com/linchenq/TransDeformer-Mesh.
+
+
+
+
+
+ + ♻ ☆ A Simple Strategy for Body Estimation from Partial-View Images CVPR + + +
+ Virtual try-on and product personalization have become increasingly important +in modern online shopping, highlighting the need for accurate body measurement +estimation. Although previous research has advanced in estimating 3D body +shapes from RGB images, the task is inherently ambiguous as the observed scale +of human subjects in the images depends on two unknown factors: capture +distance and body dimensions. This ambiguity is particularly pronounced in +partial-view scenarios. To address this challenge, we propose a modular and +simple height normalization solution. This solution relocates the subject +skeleton to the desired position, thereby normalizing the scale and +disentangling the relationship between the two variables. Our experimental +results demonstrate that integrating this technique into state-of-the-art human +mesh reconstruction models significantly enhances partial body measurement +estimation. Additionally, we illustrate the applicability of this approach to +multi-view settings, showcasing its versatility. + +
+
+ comment: Accepted to CVPRW 2024 Computer Vision for Fashion, Art, and Design +
+
+
+
+
+ + ♻ ☆ Overcoming the Pitfalls of Vision-Language Model Finetuning for OOD + Generalization ICLR 2024 + + +
+ Existing vision-language models exhibit strong generalization on a variety of +visual domains and tasks. However, such models mainly perform zero-shot +recognition in a closed-set manner, and thus struggle to handle open-domain +visual concepts by design. There are recent finetuning methods, such as prompt +learning, that not only study the discrimination between in-distribution (ID) +and out-of-distribution (OOD) samples, but also show some improvements in both +ID and OOD accuracies. In this paper, we first demonstrate that vision-language +models, after long enough finetuning but without proper regularization, tend to +overfit the known classes in the given dataset, with degraded performance on +unknown classes. Then we propose a novel approach OGEN to address this pitfall, +with the main focus on improving the OOD GENeralization of finetuned models. +Specifically, a class-conditional feature generator is introduced to synthesize +OOD features using just the class name of any unknown class. Such synthesized +features will provide useful knowledge about unknowns and help regularize the +decision boundary between ID and OOD data when optimized jointly. Equally +important is our adaptive self-distillation mechanism to regularize our feature +generation model during joint optimization, i.e., adaptively transferring +knowledge between model states to further prevent overfitting. Experiments +validate that our method yields convincing gains in OOD generalization +performance in different settings. Code: https://github.com/apple/ml-ogen. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Improving the Robustness of 3D Human Pose Estimation: A Benchmark and + Learning from Noisy Input + + +
+ Despite the promising performance of current 3D human pose estimation +techniques, understanding and enhancing their generalization on challenging +in-the-wild videos remain an open problem. In this work, we focus on the +robustness of 2D-to-3D pose lifters. To this end, we develop two benchmark +datasets, namely Human3.6M-C and HumanEva-I-C, to examine the robustness of +video-based 3D pose lifters to a wide range of common video corruptions +including temporary occlusion, motion blur, and pixel-level noise. We observe +the poor generalization of state-of-the-art 3D pose lifters in the presence of +corruption and establish two techniques to tackle this issue. First, we +introduce Temporal Additive Gaussian Noise (TAGN) as a simple yet effective 2D +input pose data augmentation. Additionally, to incorporate the confidence +scores output by the 2D pose detectors, we design a confidence-aware +convolution (CA-Conv) block. Extensively tested on corrupted videos, the +proposed strategies consistently boost the robustness of 3D pose lifters and +serve as new baselines for future research. + +
+
+
+
+
+ + ♻ ☆ AVS-Net: Point Sampling with Adaptive Voxel Size for 3D Scene + Understanding + + +
+ The recent advancements in point cloud learning have enabled intelligent
+vehicles and robots to comprehend 3D environments better. However, processing
+large-scale 3D scenes remains a challenging problem, so efficient
+downsampling methods play a crucial role in point cloud learning. Existing
+downsampling methods either require a huge computational burden or sacrifice
+fine-grained geometric information. To this end, this paper presents an
+advanced sampler that achieves both high accuracy and efficiency. The proposed
+method utilizes voxel centroid sampling as a foundation but effectively
+addresses the challenges regarding voxel size determination and the
+preservation of critical geometric cues. Specifically, we propose a Voxel
+Adaptation Module that adaptively adjusts voxel sizes with reference to the
+point-based downsampling ratio. This ensures that the sampling results exhibit
+a favorable distribution for comprehending various 3D objects or scenes.
+Meanwhile, we introduce a network compatible with arbitrary voxel sizes for
+sampling and feature extraction while maintaining high efficiency. The proposed
+approach is demonstrated with 3D object detection and 3D semantic segmentation.
+Compared to existing state-of-the-art methods, our approach achieves better
+accuracy on outdoor and indoor large-scale datasets, e.g. Waymo and ScanNet,
+with promising efficiency.
+
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Solving Inverse Problems with Latent Diffusion Models via Hard Data + Consistency + + +
+ Diffusion models have recently emerged as powerful generative priors for
+solving inverse problems. However, training diffusion models in the pixel space
+is both data-intensive and computationally demanding, which restricts their
+applicability as priors for high-dimensional real-world data such as medical
+images. Latent diffusion models, which operate in a much lower-dimensional
+space, offer a solution to these challenges. However, incorporating latent
+diffusion models to solve inverse problems remains a challenging problem due to
+the nonlinearity of the encoder and decoder. To address these issues, we
+propose \textit{ReSample}, an algorithm that can solve general inverse problems
+with pre-trained latent diffusion models. Our algorithm incorporates data
+consistency by solving an optimization problem during the reverse sampling
+process, a concept that we term hard data consistency. Upon solving this
+optimization problem, we propose a novel resampling scheme to map the
+measurement-consistent sample back onto the noisy data manifold and
+theoretically demonstrate its benefits. Lastly, we apply our algorithm to solve
+a wide range of linear and nonlinear inverse problems in both natural and
+medical images, demonstrating that our approach outperforms existing
+state-of-the-art approaches, including those based on pixel-space diffusion
+models.
+
+
+ comment: 27 pages, 20 figures +
+
+
+
+
+ + ♻ ☆ Achieving Reliable and Fair Skin Lesion Diagnosis via Unsupervised + Domain Adaptation + + +
+ The development of reliable and fair diagnostic systems is often constrained +by the scarcity of labeled data. To address this challenge, our work explores +the feasibility of unsupervised domain adaptation (UDA) to integrate large +external datasets for developing reliable classifiers. The adoption of UDA with +multiple sources can simultaneously enrich the training set and bridge the +domain gap between different skin lesion datasets, which vary due to distinct +acquisition protocols. Particularly, UDA shows practical promise for improving +diagnostic reliability when training with a custom skin lesion dataset, where +only limited labeled data are available from the target domain. In this study, +we investigate three UDA training schemes based on source data utilization: +single-source, combined-source, and multi-source UDA. Our findings demonstrate +the effectiveness of applying UDA on multiple sources for binary and +multi-class classification. A strong correlation between test error and label +shift in multi-class tasks has been observed in the experiment. Crucially, our +study shows that UDA can effectively mitigate bias against minority groups and +enhance fairness in diagnostic systems, while maintaining superior +classification performance. This is achieved even without directly implementing +fairness-focused techniques. This success is potentially attributed to the +increased and well-adapted demographic information obtained from multiple +sources. + +
+
+
+
+
+ + ♻ ☆ Self-Supervised MRI Reconstruction with Unrolled Diffusion Models + + +
+ Magnetic Resonance Imaging (MRI) produces excellent soft tissue contrast,
+albeit an inherently slow imaging modality. Promising deep learning
+methods have recently been proposed to reconstruct accelerated MRI scans.
+However, existing methods still suffer from various limitations regarding image
+fidelity, contextual sensitivity, and reliance on fully-sampled acquisitions
+for model training. To comprehensively address these limitations, we propose a
+novel self-supervised deep reconstruction model, named Self-Supervised
+Diffusion Reconstruction (SSDiffRecon). SSDiffRecon expresses a conditional
+diffusion process as an unrolled architecture that interleaves cross-attention
+transformers for reverse diffusion steps with data-consistency blocks for
+physics-driven processing. Unlike recent diffusion methods for MRI
+reconstruction, a self-supervision strategy is adopted to train SSDiffRecon
+using only undersampled k-space data. Comprehensive experiments on public brain
+MR datasets demonstrate the superiority of SSDiffRecon against
+state-of-the-art supervised and self-supervised baselines in terms of
+reconstruction speed and quality. Implementation will be available at
+https://github.com/yilmazkorkmaz1/SSDiffRecon.
+
+
+
+
+
+ + ♻ ☆ Segment Anything in 3D with Radiance Fields NeurIPS 2023 + + +
+ The Segment Anything Model (SAM) emerges as a powerful vision foundation +model to generate high-quality 2D segmentation results. This paper aims to +generalize SAM to segment 3D objects. Rather than replicating the data +acquisition and annotation procedure which is costly in 3D, we design an +efficient solution, leveraging the radiance field as a cheap and off-the-shelf +prior that connects multi-view 2D images to the 3D space. We refer to the +proposed solution as SA3D, short for Segment Anything in 3D. With SA3D, the +user is only required to provide a 2D segmentation prompt (e.g., rough points) +for the target object in a single view, which is used to generate its +corresponding 2D mask with SAM. Next, SA3D alternately performs mask inverse +rendering and cross-view self-prompting across various views to iteratively +refine the 3D mask of the target object. For one view, mask inverse rendering +projects the 2D mask obtained by SAM into the 3D space with guidance of the +density distribution learned by the radiance field for 3D mask refinement; +Then, cross-view self-prompting extracts reliable prompts automatically as the +input to SAM from the rendered 2D mask of the inaccurate 3D mask for a new +view. We show in experiments that SA3D adapts to various scenes and achieves 3D +segmentation within seconds. Our research reveals a potential methodology to +lift the ability of a 2D segmentation model to 3D. Our code is available at +https://github.com/Jumpat/SegmentAnythingin3D. + +
+
+ comment: Extension version of SA3D (NeurIPS 2023). Project page: + https://jumpat.github.io/SA3D/ +
+
+
+
+
+ + ♻ ☆ GenCorres: Consistent Shape Matching via Coupled Implicit-Explicit Shape + Generative Models ICLR 2024 + + +
+ This paper introduces GenCorres, a novel unsupervised joint shape matching
+(JSM) approach. Our key idea is to learn a mesh generator to fit an unorganized
+deformable shape collection while constraining deformations between adjacent
+synthetic shapes to preserve geometric structures such as local rigidity and
+local conformality. GenCorres presents three appealing advantages over existing
+JSM techniques. First, GenCorres performs JSM among a synthetic shape
+collection whose size is much larger than that of the input shapes and fully
+leverages the data-driven power of JSM. Second, GenCorres unifies consistent
+shape matching and pairwise matching (i.e., by enforcing deformation priors
+between adjacent synthetic shapes). Third, the generator provides a concise
+encoding of consistent shape correspondences. However, learning a mesh
+generator from an unorganized shape collection is challenging, requiring a good
+initialization. GenCorres addresses this issue by learning an implicit
+generator from the input shapes, which provides intermediate shapes between two
+arbitrary shapes. We introduce a novel approach for computing correspondences
+between adjacent implicit surfaces, which we use to regularize the implicit
+generator. Synthetic shapes of the implicit generator then guide initial
+fittings (i.e., via template-based deformation) for learning the mesh
+generator. Experimental results show that GenCorres considerably outperforms
+state-of-the-art JSM techniques. The synthetic shapes of GenCorres also achieve
+salient performance gains against state-of-the-art deformable shape generators.
+
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Scaling (Down) CLIP: A Comprehensive Analysis of Data, Architecture, and + Training Strategies + + +
+ This paper investigates the performance of the Contrastive Language-Image +Pre-training (CLIP) when scaled down to limited computation budgets. We explore +CLIP along three dimensions: data, architecture, and training strategies. With +regards to data, we demonstrate the significance of high-quality training data +and show that a smaller dataset of high-quality data can outperform a larger +dataset with lower quality. We also examine how model performance varies with +different dataset sizes, suggesting that smaller ViT models are better suited +for smaller datasets, while larger models perform better on larger datasets +with fixed compute. Additionally, we provide guidance on when to choose a +CNN-based architecture or a ViT-based architecture for CLIP training. We +compare four CLIP training strategies - SLIP, FLIP, CLIP, and CLIP+Data +Augmentation - and show that the choice of training strategy depends on the +available compute resource. Our analysis reveals that CLIP+Data Augmentation +can achieve comparable performance to CLIP using only half of the training +data. This work provides practical insights into how to effectively train and +deploy CLIP models, making them more accessible and affordable for practical +use in various applications. + +
+
+
+
+
+ + ♻ ☆ ScribbleGen: Generative Data Augmentation Improves Scribble-supervised + Semantic Segmentation + + +
+ Recent advances in generative models, such as diffusion models, have made +generating high-quality synthetic images widely accessible. Prior works have +shown that training on synthetic images improves many perception tasks, such as +image classification, object detection, and semantic segmentation. We are the +first to explore generative data augmentations for scribble-supervised semantic +segmentation. We propose ScribbleGen, a generative data augmentation method +that leverages a ControlNet diffusion model conditioned on semantic scribbles +to produce high-quality training data. However, naive implementations of +generative data augmentations may inadvertently harm the performance of the +downstream segmentor rather than improve it. We leverage classifier-free +diffusion guidance to enforce class consistency and introduce encode ratios to +trade off data diversity for data realism. Using the guidance scale and encode +ratio, we can generate a spectrum of high-quality training images. We propose +multiple augmentation schemes and find that these schemes significantly impact +model performance, especially in the low-data regime. Our framework further +reduces the gap between the performance of scribble-supervised segmentation and +that of fully-supervised segmentation. We also show that our framework +significantly improves segmentation performance on small datasets, even +surpassing fully-supervised segmentation. The code is available at +https://github.com/mengtang-lab/scribblegen. + +
+
+
+
+
+ + ♻ ☆ Improving Semi-Supervised Semantic Segmentation with Dual-Level Siamese + Structure Network ACM MM 2023 + + +
+ Semi-supervised semantic segmentation (SSS) is an important task that +utilizes both labeled and unlabeled data to reduce expenses on labeling +training examples. However, the effectiveness of SSS algorithms is limited by +the difficulty of fully exploiting the potential of unlabeled data. To address +this, we propose a dual-level Siamese structure network (DSSN) for pixel-wise +contrastive learning. By aligning positive pairs with a pixel-wise contrastive +loss using strong augmented views in both low-level image space and high-level +feature space, the proposed DSSN is designed to maximize the utilization of +available unlabeled data. Additionally, we introduce a novel class-aware +pseudo-label selection strategy for weak-to-strong supervision, which addresses +the limitations of most existing methods that do not perform selection or apply +a predefined threshold for all classes. Specifically, our strategy selects the +top high-confidence prediction of the weak view for each class to generate +pseudo labels that supervise the strong augmented views. This strategy is +capable of taking into account the class imbalance and improving the +performance of long-tailed classes. Our proposed method achieves +state-of-the-art results on two datasets, PASCAL VOC 2012 and Cityscapes, +outperforming other SSS algorithms by a significant margin. The source code is +available at https://github.com/kunzhan/DSSN. + +
+
+ comment: ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ NARUTO: Neural Active Reconstruction from Uncertain Target Observations CVPR2024 + + +
+ We present NARUTO, a neural active reconstruction system that combines a
+hybrid neural representation with uncertainty learning, enabling high-fidelity
+surface reconstruction. Our approach leverages a multi-resolution hash-grid as
+the mapping backbone, chosen for its exceptional convergence speed and capacity
+to capture high-frequency local features. The centerpiece of our work is the
+incorporation of an uncertainty learning module that dynamically quantifies
+reconstruction uncertainty while actively reconstructing the environment. By
+harnessing learned uncertainty, we propose a novel uncertainty aggregation
+strategy for goal searching and efficient path planning. Our system
+autonomously explores by targeting uncertain observations and reconstructs
+environments with remarkable completeness and fidelity. We also demonstrate the
+utility of this uncertainty-aware approach by enhancing SOTA neural SLAM
+systems through an active ray sampling strategy. Extensive evaluations of
+NARUTO in various environments, using an indoor scene simulator, confirm its
+superior performance and state-of-the-art status in active reconstruction, as
+evidenced by its impressive results on benchmark datasets like Replica and
+MP3D.
+
+
+ comment: Accepted to CVPR2024. Project page: + https://oppo-us-research.github.io/NARUTO-website/. Code: + https://github.com/oppo-us-research/NARUTO +
+
+
+
+
+ + ♻ ☆ BOP Challenge 2023 on Detection, Segmentation and Pose Estimation of + Seen and Unseen Rigid Objects + + +
+ We present the evaluation methodology, datasets and results of the BOP +Challenge 2023, the fifth in a series of public competitions organized to +capture the state of the art in model-based 6D object pose estimation from an +RGB/RGB-D image and related tasks. Besides the three tasks from 2022 +(model-based 2D detection, 2D segmentation, and 6D localization of objects seen +during training), the 2023 challenge introduced new variants of these tasks +focused on objects unseen during training. In the new tasks, methods were +required to learn new objects during a short onboarding stage (max 5 minutes, 1 +GPU) from provided 3D object models. The best 2023 method for 6D localization +of unseen objects (GenFlow) notably reached the accuracy of the best 2020 +method for seen objects (CosyPose), although being noticeably slower. The best +2023 method for seen objects (GPose) achieved a moderate accuracy improvement +but a significant 43% run-time improvement compared to the best 2022 +counterpart (GDRNPP). Since 2017, the accuracy of 6D localization of seen +objects has improved by more than 50% (from 56.9 to 85.6 AR_C). The online +evaluation system stays open and is available at: http://bop.felk.cvut.cz/. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2302.13075 +
+
+
+
+
+ + ♻ ☆ GenURL: A General Framework for Unsupervised Representation Learning + + +
+ Unsupervised representation learning (URL), which learns compact embeddings
+of high-dimensional data without supervision, has made remarkable progress
+recently. However, the development of URL methods for different requirements is
+independent, which limits the generalization of the algorithms and becomes
+especially prohibitive as the number of tasks grows. For example, dimension
+reduction methods, such as t-SNE and UMAP, optimize pair-wise data
+relationships by preserving the global geometric structure, while
+self-supervised learning methods, such as SimCLR and BYOL, focus on mining the
+local statistics of instances under specific augmentations. To address this
+dilemma, we summarize and propose a unified similarity-based URL framework,
+GenURL, which can smoothly adapt to various URL tasks. In this paper, we regard
+URL tasks as different implicit constraints on the data geometric structure
+that help to seek optimal low-dimensional representations; these tasks boil
+down to data structural modeling (DSM) and low-dimensional transformation
+(LDT). Specifically, DSM provides a structure-based submodule to describe the
+global structures, and LDT learns compact low-dimensional embeddings with given
+pretext tasks. Moreover, an objective function, General Kullback-Leibler
+divergence (GKL), is proposed to connect DSM and LDT naturally. Comprehensive
+experiments demonstrate that GenURL achieves consistent state-of-the-art
+performance in self-supervised visual learning, unsupervised knowledge
+distillation (KD), graph embeddings (GE), and dimension reduction.
+
+
+ comment: TNNLS 2024 version with 13 pages and 14 figures +
+
+
+
+
+ + ♻ ☆ Neural Language of Thought Models ICLR 2024 + + +
+ The Language of Thought Hypothesis suggests that human cognition operates on +a structured, language-like system of mental representations. While neural +language models can naturally benefit from the compositional structure +inherently and explicitly expressed in language data, learning such +representations from non-linguistic general observations, like images, remains +a challenge. In this work, we introduce the Neural Language of Thought Model +(NLoTM), a novel approach for unsupervised learning of LoTH-inspired +representation and generation. NLoTM comprises two key components: (1) the +Semantic Vector-Quantized Variational Autoencoder, which learns hierarchical, +composable discrete representations aligned with objects and their properties, +and (2) the Autoregressive LoT Prior, an autoregressive transformer that learns +to generate semantic concept tokens compositionally, capturing the underlying +data distribution. We evaluate NLoTM on several 2D and 3D image datasets, +demonstrating superior performance in downstream tasks, out-of-distribution +generalization, and image generation quality compared to patch-based VQ-VAE and +continuous object-centric representations. Our work presents a significant step +towards creating neural networks exhibiting more human-like understanding by +developing LoT-like representations and offers insights into the intersection +of cognitive science and machine learning. + +
+
+ comment: Accepted in ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Orbital Polarimetric Tomography of a Flare Near the Sagittarius A* + Supermassive Black Hole + + +
+ The interaction between the supermassive black hole at the center of the +Milky Way, Sagittarius A*, and its accretion disk occasionally produces +high-energy flares seen in X-ray, infrared, and radio. One proposed mechanism +that produces flares is the formation of compact, bright regions that appear +within the accretion disk and close to the event horizon. Understanding these +flares provides a window into accretion processes. Although sophisticated +simulations predict the formation of these flares, their structure has yet to +be recovered by observations. Here we show the first three-dimensional (3D) +reconstruction of an emission flare recovered from ALMA light curves observed +on April 11, 2017. Our recovery shows compact, bright regions at a distance of +roughly six times the event horizon. Moreover, it suggests a clockwise rotation +in a low-inclination orbital plane, consistent with prior studies by GRAVITY +and EHT. To recover this emission structure, we solve an ill-posed tomography +problem by integrating a neural 3D representation with a gravitational model +for black holes. Although the recovery is subject to, and sometimes sensitive +to, the model assumptions, under physically motivated choices, our results are +stable, and our approach is successful on simulated data. + +
+
+
+
+
+ + ♻ ☆ Social-Transmotion: Promptable Human Trajectory Prediction ICLR 2024 + + +
+ Accurate human trajectory prediction is crucial for applications such as
+autonomous vehicles, robotics, and surveillance systems. Yet, existing models
+often fail to fully leverage the non-verbal social cues humans subconsciously
+communicate when navigating the space. To address this, we introduce
+Social-Transmotion, a generic Transformer-based model that exploits diverse and
+numerous visual cues to predict human behavior. We translate the idea of a
+prompt from Natural Language Processing (NLP) to the task of human trajectory
+prediction, where a prompt can be a sequence of x-y coordinates on the ground,
+bounding boxes in the image plane, or body pose keypoints in either 2D or 3D.
+This, in turn, augments trajectory data, leading to enhanced human trajectory
+prediction. Using a masking technique, our model exhibits flexibility and
+adaptability by capturing spatiotemporal interactions between agents based on
+the available visual cues. We delve into the merits of using 2D versus 3D
+poses, and a limited set of poses. Additionally, we investigate the spatial and
+temporal attention map to identify which keypoints and time-steps in the
+sequence are vital for optimizing human trajectory prediction. Our approach is
+validated on multiple datasets, including JTA, JRDB, Pedestrians and Cyclists
+in Road Traffic, and ETH-UCY. The code is publicly available:
+https://github.com/vita-epfl/social-transmotion.
+
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Fooling Contrastive Language-Image Pre-trained Models with + CLIPMasterPrints + + +
+ Models leveraging both visual and textual data such as Contrastive +Language-Image Pre-training (CLIP), are the backbone of many recent advances in +artificial intelligence. In this work, we show that despite their versatility, +such models are vulnerable to what we refer to as fooling master images. +Fooling master images are capable of maximizing the confidence score of a CLIP +model for a significant number of widely varying prompts, while being either +unrecognizable or unrelated to the attacked prompts for humans. The existence +of such images is problematic as it could be used by bad actors to maliciously +interfere with CLIP-trained image retrieval models in production with +comparably small effort as a single image can attack many different prompts. We +demonstrate how fooling master images for CLIP (CLIPMasterPrints) can be mined +using stochastic gradient descent, projected gradient descent, or blackbox +optimization. Contrary to many common adversarial attacks, the blackbox +optimization approach allows us to mine CLIPMasterPrints even when the weights +of the model are not accessible. We investigate the properties of the mined +images, and find that images trained on a small number of image captions +generalize to a much larger number of semantically related captions. We +evaluate possible mitigation strategies, where we increase the robustness of +the model and introduce an approach to automatically detect CLIPMasterPrints to +sanitize the input of vulnerable models. Finally, we find that vulnerability to +CLIPMasterPrints is related to a modality gap in contrastive pre-trained +multi-modal networks. Code available at +https://github.com/matfrei/CLIPMasterPrints. + +
+
+ comment: This work was supported by a research grant (40575) from VILLUM + FONDEN +
+
+
+
+
+ + ♻ ☆ Tunable Hybrid Proposal Networks for the Open World WACV 2024 + + +
+ Current state-of-the-art object proposal networks are trained with a +closed-world assumption, meaning they learn to only detect objects of the +training classes. These models fail to provide high recall in open-world +environments where important novel objects may be encountered. While a handful +of recent works attempt to tackle this problem, they fail to consider that the +optimal behavior of a proposal network can vary significantly depending on the +data and application. Our goal is to provide a flexible proposal solution that +can be easily tuned to suit a variety of open-world settings. To this end, we +design a Tunable Hybrid Proposal Network (THPN) that leverages an adjustable +hybrid architecture, a novel self-training procedure, and dynamic loss +components to optimize the tradeoff between known and unknown object detection +performance. To thoroughly evaluate our method, we devise several new +challenges which invoke varying degrees of label bias by altering known class +diversity and label count. We find that in every task, THPN easily outperforms +existing baselines (e.g., RPN, OLN). Our method is also highly data efficient, +surpassing baseline recall with a fraction of the labeled data. + +
+
+ comment: Published in WACV 2024. 22 pages, 9 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ Multi-Level Feature Aggregation and Recursive Alignment Network for + Real-Time Semantic Segmentation + + +
+ Real-time semantic segmentation is a crucial research for real-world +applications. However, many methods lay particular emphasis on reducing the +computational complexity and model size, while largely sacrificing the +accuracy. To tackle this problem, we propose a parallel inference network +customized for semantic segmentation tasks to achieve a good trade-off between +speed and accuracy. We employ a shallow backbone to ensure real-time speed, and +propose three core components to compensate for the reduced model capacity to +improve accuracy. Specifically, we first design a dual-pyramidal path +architecture (Multi-level Feature Aggregation Module, MFAM) to aggregate +multi-level features from the encoder to each scale, providing hierarchical +clues for subsequent spatial alignment and corresponding in-network inference. +Then, we build Recursive Alignment Module (RAM) by combining the flow-based +alignment module with recursive upsampling architecture for accurate spatial +alignment between multi-scale feature maps with half the computational +complexity of the straightforward alignment method. Finally, we perform +independent parallel inference on the aligned features to obtain multi-scale +scores, and adaptively fuse them through an attention-based Adaptive Scores +Fusion Module (ASFM) so that the final prediction can favor objects of multiple +scales. Our framework shows a better balance between speed and accuracy than +state-of-the-art real-time methods on Cityscapes and CamVid datasets. We also +conducted systematic ablation studies to gain insight into our motivation and +architectural design. Code is available at: +https://github.com/Yanhua-Zhang/MFARANet. + +
+
+ comment: 15 pages, 9 figures and 12 Tables. Manuscript completed on April 30, + 2022 +
+
+
+
+
+ + ♻ ☆ DiffusionAvatars: Deferred Diffusion for High-fidelity 3D Head Avatars + + +
+ DiffusionAvatars synthesizes a high-fidelity 3D head avatar of a person, +offering intuitive control over both pose and expression. We propose a +diffusion-based neural renderer that leverages generic 2D priors to produce +compelling images of faces. For coarse guidance of the expression and head +pose, we render a neural parametric head model (NPHM) from the target +viewpoint, which acts as a proxy geometry of the person. Additionally, to +enhance the modeling of intricate facial expressions, we condition +DiffusionAvatars directly on the expression codes obtained from NPHM via +cross-attention. Finally, to synthesize consistent surface details across +different viewpoints and expressions, we rig learnable spatial features to the +head's surface via TriPlane lookup in NPHM's canonical space. We train +DiffusionAvatars on RGB videos and corresponding fitted NPHM meshes of a person +and test the obtained avatars in both self-reenactment and animation scenarios. +Our experiments demonstrate that DiffusionAvatars generates temporally +consistent and visually appealing videos for novel poses and expressions of a +person, outperforming existing approaches. + +
+
+ comment: Project Page: https://tobias-kirschstein.github.io/diffusion-avatars/ + , Video: https://youtu.be/nSjDiiTnp2E +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 212 + +
+
+
+ + ☆ Can We Break Free from Strong Data Augmentations in Self-Supervised + Learning? + + +
+ Self-supervised learning (SSL) has emerged as a promising solution for +addressing the challenge of limited labeled data in deep neural networks +(DNNs), offering scalability potential. However, the impact of design +dependencies within the SSL framework remains insufficiently investigated. In +this study, we comprehensively explore SSL behavior across a spectrum of +augmentations, revealing their crucial role in shaping SSL model performance +and learning mechanisms. Leveraging these insights, we propose a novel learning +approach that integrates prior knowledge, with the aim of curtailing the need +for extensive data augmentations and thereby amplifying the efficacy of learned +representations. Notably, our findings underscore that SSL models imbued with +prior knowledge exhibit reduced texture bias, diminished reliance on shortcuts +and augmentations, and improved robustness against both natural and adversarial +corruptions. These findings not only illuminate a new direction in SSL +research, but also pave the way for enhancing DNN performance while +concurrently alleviating the imperative for intensive data augmentation, +thereby enhancing scalability and real-world problem-solving capabilities. + +
+
+
+
+
+ + ☆ LetsGo: Large-Scale Garage Modeling and Rendering via LiDAR-Assisted + Gaussian Primitives + + +
+ Large garages are ubiquitous yet intricate scenes in our daily lives, posing +challenges characterized by monotonous colors, repetitive patterns, reflective +surfaces, and transparent vehicle glass. Conventional Structure from Motion +(SfM) methods for camera pose estimation and 3D reconstruction fail in these +environments due to poor correspondence construction. To address these +challenges, this paper introduces LetsGo, a LiDAR-assisted Gaussian splatting +approach for large-scale garage modeling and rendering. We develop a handheld +scanner, Polar, equipped with IMU, LiDAR, and a fisheye camera, to facilitate +accurate LiDAR and image data scanning. With this Polar device, we present a +GarageWorld dataset consisting of five expansive garage scenes with diverse +geometric structures and will release the dataset to the community for further +research. We demonstrate that the LiDAR point cloud collected by the Polar +device enhances a suite of 3D Gaussian splatting algorithms for garage scene +modeling and rendering. We also propose a novel depth regularizer for 3D +Gaussian splatting algorithm training, effectively eliminating floating +artifacts in rendered images, and a lightweight Level of Detail (LOD) Gaussian +renderer for real-time viewing on web-based devices. Additionally, we explore a +hybrid representation that combines the advantages of traditional mesh in +depicting simple geometry and colors (e.g., walls and the ground) with modern +3D Gaussian representations capturing complex details and high-frequency +textures. This strategy achieves an optimal balance between memory performance +and rendering quality. Experimental results on our dataset, along with +ScanNet++ and KITTI-360, demonstrate the superiority of our method in rendering +quality and resource efficiency. +
+
+ comment: Project Page: https://jdtsui.github.io/letsgo/ +
+
+
+
+
+ + ☆ FSRT: Facial Scene Representation Transformer for Face Reenactment from + Factorized Appearance, Head-pose, and Facial Expression Features CVPR 2024 + + +
+ The task of face reenactment is to transfer the head motion and facial +expressions from a driving video to the appearance of a source image, which may +be of a different person (cross-reenactment). Most existing methods are +CNN-based and estimate optical flow from the source image to the current +driving frame, which is then inpainted and refined to produce the output +animation. We propose a transformer-based encoder for computing a set-latent +representation of the source image(s). We then predict the output color of a +query pixel using a transformer-based decoder, which is conditioned with +keypoints and a facial expression vector extracted from the driving frame. +Latent representations of the source person are learned in a self-supervised +manner that factorizes their appearance, head pose, and facial expressions. +Thus, they are perfectly suited for cross-reenactment. In contrast to most +related work, our method naturally extends to multiple source images and can +thus adapt to person-specific facial dynamics. We also propose data +augmentation and regularization schemes that are necessary to prevent +overfitting and support generalizability of the learned representations. We +evaluated our approach in a randomized user study. The results indicate +superior performance compared to the state-of-the-art in terms of motion +transfer quality and temporal consistency. +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Equipping Diffusion Models with Differentiable Spatial Entropy for + Low-Light Image Enhancement CVPR + + +
+ Image restoration, which aims to recover high-quality images from their +corrupted counterparts, often faces the challenge of being an ill-posed problem +that allows multiple solutions for a single input. However, most deep learning +based works simply employ l1 loss to train their network in a deterministic +way, resulting in over-smoothed predictions with inferior perceptual quality. +In this work, we propose a novel method that shifts the focus from a +deterministic pixel-by-pixel comparison to a statistical perspective, +emphasizing the learning of distributions rather than individual pixel values. +The core idea is to introduce spatial entropy into the loss function to measure +the distribution difference between predictions and targets. To make this +spatial entropy differentiable, we employ kernel density estimation (KDE) to +approximate the probabilities for specific intensity values of each pixel with +their neighbor areas. Specifically, we equip the entropy with diffusion models +and aim for superior accuracy and enhanced perceptual quality over l1 based +noise matching loss. In the experiments, we evaluate the proposed method for +low light enhancement on two datasets and the NTIRE challenge 2024. All these +results illustrate the effectiveness of our statistic-based entropy loss. Code +is available at https://github.com/shermanlian/spatial-entropy-loss. + +
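+ A minimal sketch of the idea behind a differentiable spatial-entropy loss (assuming PyTorch; a simplified global variant written from the description above, not the authors' released code): intensities are softly binned with a Gaussian kernel density estimate so the resulting distributions, and a divergence between them, stay differentiable.
+ ```python
+ import torch
+
+ def kde_histogram(img, bins=32, bandwidth=0.05):
+     # img: (B, C, H, W) in [0, 1]; returns a soft intensity histogram per image, shape (B, bins)
+     centers = torch.linspace(0.0, 1.0, bins, device=img.device).view(1, bins, 1)
+     x = img.flatten(start_dim=1).unsqueeze(1)                      # (B, 1, N)
+     weights = torch.exp(-0.5 * ((x - centers) / bandwidth) ** 2)   # Gaussian kernel per bin
+     hist = weights.mean(dim=2)                                     # (B, bins) soft counts
+     return hist / hist.sum(dim=1, keepdim=True)                    # normalize to probabilities
+
+ def spatial_entropy_loss(pred, target, eps=1e-8):
+     p, q = kde_histogram(pred), kde_histogram(target)
+     # symmetric KL between the soft intensity distributions (one of several possible choices)
+     kl_pq = (p * (torch.log(p + eps) - torch.log(q + eps))).sum(dim=1)
+     kl_qp = (q * (torch.log(q + eps) - torch.log(p + eps))).sum(dim=1)
+     return (kl_pq + kl_qp).mean()
+
+ pred = torch.rand(2, 3, 64, 64, requires_grad=True)
+ target = torch.rand(2, 3, 64, 64)
+ spatial_entropy_loss(pred, target).backward()   # differentiable thanks to the soft (KDE) binning
+ ```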
+
+ comment: CVPRW 2024, best LPIPS in the NTIRE low light enhancement challenge + 2024 +
+
+
+
+
+ + ☆ Photo-Realistic Image Restoration in the Wild with Controlled + Vision-Language Models CVPR + + +
+ Though diffusion models have been successfully applied to various image +restoration (IR) tasks, their performance is sensitive to the choice of +training datasets. Typically, diffusion models trained on specific datasets +fail to recover images that have out-of-distribution degradations. To address +this problem, this work leverages a capable vision-language model and a +synthetic degradation pipeline to learn image restoration in the wild (wild +IR). More specifically, all low-quality images are simulated with a synthetic +degradation pipeline that contains multiple common degradations such as blur, +resize, noise, and JPEG compression. Then we introduce robust training for a +degradation-aware CLIP model to extract enriched image content features to +assist high-quality image restoration. Our base diffusion model is the image +restoration SDE (IR-SDE). Built upon it, we further present a posterior +sampling strategy for fast noise-free image generation. We evaluate our model +on both synthetic and real-world degradation datasets. Moreover, experiments on +the unified image restoration task illustrate that the proposed posterior +sampling improves image generation quality for various degradations. +
+
+ comment: CVPRW 2024; Code: https://github.com/Algolzw/daclip-uir +
+
+
+
+
+ + ☆ Adaptive Patching for High-resolution Image Segmentation with + Transformers + + +
+ Attention-based models are proliferating in the space of image analytics, +including segmentation. The standard method of feeding images to transformer +encoders is to divide the images into patches and then feed the patches to the +model as a linear sequence of tokens. For high-resolution images, e.g. +microscopic pathology images, the quadratic compute and memory cost prohibits +the use of an attention-based model, if we are to use smaller patch sizes that +are favorable in segmentation. The solution is to either use custom complex +multi-resolution models or approximate attention schemes. We take inspiration +from Adaptive Mesh Refinement (AMR) methods in HPC by adaptively patching the +images, as a pre-processing step, based on the image details to reduce the +number of patches being fed to the model, by orders of magnitude. This method +has a negligible overhead, and works seamlessly with any attention-based model, +i.e. it is a pre-processing step that can be adopted by any attention-based +model without friction. We demonstrate superior segmentation quality over SoTA +segmentation models for real-world pathology datasets while gaining a geomean +speedup of $6.9\times$ for resolutions up to $64K^2$, on up to $2,048$ GPUs. +
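+ A rough illustration of detail-driven adaptive patching in the AMR spirit (plain NumPy; the variance criterion and thresholds are illustrative assumptions, not the paper's exact rule): low-detail regions stay as large patches, high-detail regions are split down to the minimum patch size.
+ ```python
+ import numpy as np
+
+ def adaptive_patches(img, min_size=16, max_size=256, var_thresh=5e-3):
+     """Quadtree-style split: keep a block as one patch if it is small enough
+     or has low detail (here, low variance); otherwise split it into four."""
+     H, W = img.shape[:2]
+     patches = []
+
+     def split(y, x, size):
+         block = img[y:y + size, x:x + size]
+         if size <= min_size or block.var() < var_thresh:
+             patches.append((y, x, size))          # this block becomes a single token
+             return
+         half = size // 2
+         for dy in (0, half):
+             for dx in (0, half):
+                 split(y + dy, x + dx, half)
+
+     for y in range(0, H, max_size):
+         for x in range(0, W, max_size):
+             split(y, x, max_size)
+     return patches
+
+ img = np.zeros((512, 512), dtype=np.float32)
+ img[200:280, 200:280] = np.random.rand(80, 80)     # detail only in one region
+ tokens = adaptive_patches(img)
+ print(len(tokens), "patches vs", (512 // 16) ** 2, "uniform 16x16 patches")
+ ```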
+
+
+
+
+ + ☆ HSIDMamba: Exploring Bidirectional State-Space Models for Hyperspectral + Denoising + + +
+ Effectively discerning spatial-spectral dependencies in HSI denoising is +crucial, but prevailing methods using convolution or transformers still face +computational efficiency limitations. Recently, the emerging Selective State +Space Model (Mamba) has risen with its nearly linear computational complexity in +processing natural language sequences, which inspired us to explore its +potential in handling long spectral sequences. In this paper, we propose +HSIDMamba (HSDM), tailored to exploit the linear complexity for effectively +capturing spatial-spectral dependencies in HSI denoising. In particular, HSDM +comprises multiple Hyperspectral Continuous Scan Blocks, incorporating +BCSM (Bidirectional Continuous Scanning Mechanism), scale residual, and spectral +attention mechanisms to enhance the capture of long-range and local +spatial-spectral information. BCSM strengthens spatial-spectral interactions by +linking forward and backward scans and enhancing information from eight +directions through SSM, significantly enhancing the perceptual capability of +HSDM and improving denoising performance. Extensive +evaluations against HSI denoising benchmarks validate the superior performance +of HSDM, achieving state-of-the-art results in performance and surpassing the +efficiency of the latest transformer architectures by $30\%$. +
+
+
+
+
+ + ☆ XoFTR: Cross-modal Feature Matching Transformer CVPR + + +
+ We introduce XoFTR, a cross-modal cross-view method for local feature +matching between thermal infrared (TIR) and visible images. Unlike visible +images, TIR images are less susceptible to adverse lighting and weather +conditions but present difficulties in matching due to significant texture and +intensity differences. Current hand-crafted and learning-based methods for +visible-TIR matching fall short in handling viewpoint, scale, and texture +diversities. To address this, XoFTR incorporates masked image modeling +pre-training and fine-tuning with pseudo-thermal image augmentation to handle +the modality differences. Additionally, we introduce a refined matching +pipeline that adjusts for scale discrepancies and enhances match reliability +through sub-pixel level refinement. To validate our approach, we collect a +comprehensive visible-thermal dataset, and show that our method outperforms +existing methods on many benchmarks. +
+
+ comment: CVPR Image Matching Workshop, 2024. 12 pages, 7 figures, 5 tables. + Codes and dataset are available at https://github.com/OnderT/XoFTR +
+
+
+
+
+ + ☆ Harnessing GPT-4V(ision) for Insurance: A Preliminary Exploration + + +
+ The emergence of Large Multimodal Models (LMMs) marks a significant milestone +in the development of artificial intelligence. Insurance, as a vast and complex +discipline, involves a wide variety of data forms in its operational processes, +including text, images, and videos, thereby giving rise to diverse multimodal +tasks. Despite this, there has been limited systematic exploration of +multimodal tasks specific to insurance, nor a thorough investigation into how +LMMs can address these challenges. In this paper, we explore GPT-4V's +capabilities in the insurance domain. We categorize multimodal tasks by +focusing primarily on visual aspects based on types of insurance (e.g., auto, +household/commercial property, health, and agricultural insurance) and +insurance stages (e.g., risk assessment, risk monitoring, and claims +processing). Our experiment reveals that GPT-4V exhibits remarkable abilities +in insurance-related tasks, demonstrating not only a robust understanding of +multimodal content in the insurance domain but also a comprehensive knowledge +of insurance scenarios. However, there are notable shortcomings: GPT-4V +struggles with detailed risk rating and loss assessment, suffers from +hallucination in image understanding, and shows variable support for different +languages. Through this work, we aim to bridge the insurance domain with +cutting-edge LMM technology, facilitate interdisciplinary exchange and +development, and provide a foundation for the continued advancement and +evolution of future research endeavors. + +
+
+
+
+
+ + ☆ Post-Training Network Compression for 3D Medical Image Segmentation: + Reducing Computational Efforts via Tucker Decomposition + + +
+ We address the computational barrier of deploying advanced deep learning +segmentation models in clinical settings by studying the efficacy of network +compression through tensor decomposition. We propose a post-training Tucker +factorization that enables the decomposition of pre-existing models to reduce +computational requirements without impeding segmentation accuracy. We applied +Tucker decomposition to the convolutional kernels of the TotalSegmentator (TS) +model, an nnU-Net model trained on a comprehensive dataset for automatic +segmentation of 117 anatomical structures. Our approach reduced the +floating-point operations (FLOPs) and memory required during inference, +offering an adjustable trade-off between computational efficiency and +segmentation quality. This study utilized the publicly available TS dataset, +employing various downsampling factors to explore the relationship between +model size, inference speed, and segmentation performance. The application of +Tucker decomposition to the TS model substantially reduced the model parameters +and FLOPs across various compression rates, with limited loss in segmentation +accuracy. We removed up to 88% of the model's parameters with no significant +performance changes in the majority of classes after fine-tuning. Practical +benefits varied across different graphics processing unit (GPU) architectures, +with more distinct speed-ups on less powerful hardware. Post-hoc network +compression via Tucker decomposition presents a viable strategy for reducing +the computational demand of medical image segmentation models without +substantially sacrificing accuracy. This approach enables the broader adoption +of advanced deep learning technologies in clinical practice, offering a way to +navigate the constraints of hardware capabilities. + +
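+ A hedged sketch of Tucker-2 compression of a 3D convolution along its channel modes (PyTorch; an HOSVD-style construction written from the general idea above, not the TotalSegmentator pipeline): the original kernel is replaced by a 1x1x1 -> kxkxk -> 1x1x1 sequence with far fewer parameters and FLOPs.
+ ```python
+ import torch
+ import torch.nn as nn
+
+ def tucker2_compress_conv3d(conv, rank_out, rank_in):
+     """Approximate a Conv3d with three convolutions using a Tucker-2
+     decomposition of its weight along the output/input channel modes."""
+     W = conv.weight.data                                   # (Cout, Cin, k, k, k)
+     Cout, Cin = W.shape[:2]
+     U0 = torch.linalg.svd(W.reshape(Cout, -1), full_matrices=False)[0][:, :rank_out]
+     U1 = torch.linalg.svd(W.permute(1, 0, 2, 3, 4).reshape(Cin, -1),
+                           full_matrices=False)[0][:, :rank_in]
+     core = torch.einsum('oixyz,or,ic->rcxyz', W, U0, U1)   # (rank_out, rank_in, k, k, k)
+
+     first = nn.Conv3d(Cin, rank_in, 1, bias=False)                        # channel reduction
+     mid = nn.Conv3d(rank_in, rank_out, W.shape[2:], padding=conv.padding, bias=False)
+     last = nn.Conv3d(rank_out, Cout, 1, bias=(conv.bias is not None))     # channel restoration
+     first.weight.data = U1.t().reshape(rank_in, Cin, 1, 1, 1)
+     mid.weight.data = core
+     last.weight.data = U0.reshape(Cout, rank_out, 1, 1, 1)
+     if conv.bias is not None:
+         last.bias.data = conv.bias.data
+     return nn.Sequential(first, mid, last)
+
+ conv = nn.Conv3d(64, 64, 3, padding=1)
+ compressed = tucker2_compress_conv3d(conv, rank_out=16, rank_in=16)
+ x = torch.randn(1, 64, 24, 24, 24)
+ print(conv(x).shape, compressed(x).shape)   # same output shape, far fewer FLOPs
+ ```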
+
+
+
+
+ + ☆ Deformable MRI Sequence Registration for AI-based Prostate Cancer + Diagnosis + + +
+ The PI-CAI (Prostate Imaging: Cancer AI) challenge led to expert-level +diagnostic algorithms for clinically significant prostate cancer detection. The +algorithms receive biparametric MRI scans as input, which consist of +T2-weighted and diffusion-weighted scans. These scans can be misaligned due to +multiple factors in the scanning process. Image registration can alleviate this +issue by predicting the deformation between the sequences. We investigate the +effect of image registration on the diagnostic performance of AI-based prostate +cancer diagnosis. First, the image registration algorithm, developed in +MeVisLab, is analyzed using a dataset with paired lesion annotations. Second, +the effect on diagnosis is evaluated by comparing case-level cancer diagnosis +performance between using the original dataset, rigidly aligned +diffusion-weighted scans, or deformably aligned diffusion-weighted scans. Rigid +registration showed no improvement. Deformable registration demonstrated a +substantial improvement in lesion overlap (+10% median Dice score) and a +positive yet non-significant improvement in diagnostic performance (+0.3% +AUROC, p=0.18). Our investigation shows that a substantial improvement in +lesion alignment does not directly lead to a significant improvement in +diagnostic performance. Qualitative analysis indicated that jointly developing +image registration methods and diagnostic AI algorithms could enhance +diagnostic accuracy and patient outcomes. + +
+
+
+
+
+ + ☆ Do LLMs Understand Visual Anomalies? Uncovering LLM Capabilities in + Zero-shot Anomaly Detection + + +
+ Large vision-language models (LVLMs) are markedly proficient in deriving +visual representations guided by natural language. Recent explorations have +utilized LVLMs to tackle zero-shot visual anomaly detection (VAD) challenges by +pairing images with textual descriptions indicative of normal and abnormal +conditions, referred to as anomaly prompts. However, existing approaches depend +on static anomaly prompts that are prone to cross-semantic ambiguity, and +prioritize global image-level representations over crucial local pixel-level +image-to-text alignment that is necessary for accurate anomaly localization. In +this paper, we present ALFA, a training-free approach designed to address these +challenges via a unified model. We propose a run-time prompt adaptation +strategy, which first generates informative anomaly prompts to leverage the +capabilities of a large language model (LLM). This strategy is enhanced by a +contextual scoring mechanism for per-image anomaly prompt adaptation and +cross-semantic ambiguity mitigation. We further introduce a novel fine-grained +aligner to fuse local pixel-level semantics for precise anomaly localization, +by projecting the image-text alignment from global to local semantic spaces. +Extensive evaluations on the challenging MVTec and VisA datasets confirm ALFA's +effectiveness in harnessing the language potential for zero-shot VAD, achieving +significant PRO improvements of 12.1% on MVTec AD and 8.9% on VisA compared to +state-of-the-art zero-shot VAD approaches. + +
+
+
+
+
+ + ☆ Real-world Instance-specific Image Goal Navigation for Service Robots: + Bridging the Domain Gap with Contrastive Learning IROS2024 + + +
+ Improving instance-specific image goal navigation (InstanceImageNav), which +locates the identical object in a real-world environment from a query image, is +essential for robotic systems to assist users in finding desired objects. The +challenge lies in the domain gap between low-quality images observed by the +moving robot, characterized by motion blur and low resolution, and high-quality +query images provided by the user. Such domain gaps could significantly reduce +the task success rate but have not been the focus of previous work. To address +this, we propose a novel method called Few-shot Cross-quality Instance-aware +Adaptation (CrossIA), which employs contrastive learning with an instance +classifier to align features between a large set of low-quality images and a few high-quality images. +This approach effectively reduces the domain gap by bringing the latent +representations of cross-quality images closer on an instance basis. +Additionally, the system integrates an object image collection with a +pre-trained deblurring model to enhance the observed image quality. Our method +fine-tunes the SimSiam model, pre-trained on ImageNet, using CrossIA. We +evaluated our method's effectiveness through an InstanceImageNav task with 20 +different types of instances, where the robot identifies the same instance in a +real-world environment as a high-quality query image. Our experiments showed +that our method improves the task success rate by up to three times compared to +the baseline, a conventional approach based on SuperGlue. These findings +highlight the potential of leveraging contrastive learning and image +enhancement techniques to bridge the domain gap and improve object localization +in robotic applications. The project website is +https://emergentsystemlabstudent.github.io/DomainBridgingNav/. +
+
+ comment: See website at + https://emergentsystemlabstudent.github.io/DomainBridgingNav/. Submitted to + IROS2024 +
+
+
+
+
+ + ☆ CREST: Cross-modal Resonance through Evidential Deep Learning for + Enhanced Zero-Shot Learning + + +
+ Zero-shot learning (ZSL) enables the recognition of novel classes by +leveraging semantic knowledge transfer from known to unknown categories. This +knowledge, typically encapsulated in attribute descriptions, aids in +identifying class-specific visual features, thus facilitating visual-semantic +alignment and improving ZSL performance. However, real-world challenges such as +distribution imbalances and attribute co-occurrence among instances often +hinder the discernment of local variances in images, a problem exacerbated by +the scarcity of fine-grained, region-specific attribute annotations. Moreover, +the variability in visual presentation within categories can also skew +attribute-category associations. In response, we propose a bidirectional +cross-modal ZSL approach CREST. It begins by extracting representations for +attribute and visual localization and employs Evidential Deep Learning (EDL) to +measure underlying epistemic uncertainty, thereby enhancing the model's +resilience against hard negatives. CREST incorporates dual learning pathways, +focusing on both visual-category and attribute-category alignments, to ensure +robust correlation between latent and observable spaces. Moreover, we introduce +an uncertainty-informed cross-modal fusion technique to refine visual-attribute +inference. Extensive experiments demonstrate our model's effectiveness and +unique explainability across multiple datasets. Our code and data are available +at: https://github.com/JethroJames/CREST. +
+
+ comment: Ongoing work; 10 pages, 2 Tables, 9 Figures; Repo is available at + https://github.com/JethroJames/CREST +
+
+
+
+
+ + ☆ In-Context Translation: Towards Unifying Image Recognition, Processing, + and Generation + + +
+ We propose In-Context Translation (ICT), a general learning framework to +unify visual recognition (e.g., semantic segmentation), low-level image +processing (e.g., denoising), and conditional image generation (e.g., +edge-to-image synthesis). Thanks to unification, ICT significantly reduces the +inherent inductive bias that comes with designing models for specific tasks, +and it maximizes mutual enhancement across similar tasks. However, the +unification across a large number of tasks is non-trivial due to various data +formats and training pipelines. To this end, ICT introduces two designs. +Firstly, it standardizes input-output data of different tasks into RGB image +pairs, e.g., semantic segmentation data pairs an RGB image with its +segmentation mask in the same RGB format. This turns different tasks into a +general translation task between two RGB images. Secondly, it standardizes the +training of different tasks into a general in-context learning, where +"in-context" means the input comprises an example input-output pair of the +target task and a query image. The learning objective is to generate the +"missing" data paired with the query. The implicit translation process is thus +between the query and the generated image. In experiments, ICT unifies ten +vision tasks and showcases impressive performance on their respective +benchmarks. Notably, compared to its competitors, e.g., Painter and +PromptDiffusion, ICT trained on only 4 RTX 3090 GPUs is shown to be more +efficient and less costly in training. + +
+
+
+
+
+ + ☆ Bridging Vision and Language Spaces with Assignment Prediction ICLR 2024 + + +
+ This paper introduces VLAP, a novel approach that bridges pretrained vision +models and large language models (LLMs) to make frozen LLMs understand the +visual world. VLAP transforms the embedding space of pretrained vision models +into the LLMs' word embedding space using a single linear layer for efficient +and general-purpose visual and language understanding. Specifically, we harness +well-established word embeddings to bridge two modality embedding spaces. The +visual and text representations are simultaneously assigned to a set of word +embeddings within pretrained LLMs by formulating the assigning procedure as an +optimal transport problem. We predict the assignment of one modality from the +representation of another modality data, enforcing consistent assignments for +paired multimodal data. This allows vision and language representations to +contain the same information, grounding the frozen LLMs' word embedding space +in visual data. Moreover, a robust semantic taxonomy of LLMs can be preserved +with visual data since the LLMs interpret and reason linguistic information +from correlations between word embeddings. Experimental results show that VLAP +achieves substantial improvements over the previous linear transformation-based +approaches across a range of vision-language tasks, including image captioning, +visual question answering, and cross-modal retrieval. We also demonstrate the +learned visual representations hold a semantic taxonomy of LLMs, making visual +semantic arithmetic possible. + +
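+ A toy sketch of the optimal-transport-style assignment idea (PyTorch; a few Sinkhorn normalization steps, with all names, the stand-in vocabulary, and the consistency loss being illustrative assumptions rather than VLAP's actual formulation): each modality's features are softly assigned to a fixed word-embedding vocabulary, and paired samples are pushed toward consistent assignments.
+ ```python
+ import torch
+
+ def sinkhorn_assign(features, word_embs, n_iters=3, eps=0.05):
+     """Softly assign L2-normalized features (B, D) to word embeddings (K, D)
+     with a few Sinkhorn normalization steps (entropic optimal transport)."""
+     feats = torch.nn.functional.normalize(features, dim=-1)
+     words = torch.nn.functional.normalize(word_embs, dim=-1)
+     Q = torch.exp(feats @ words.t() / eps)          # (B, K) similarity scores
+     for _ in range(n_iters):
+         Q = Q / Q.sum(dim=1, keepdim=True)          # rows: each sample carries one unit of mass
+         Q = Q / Q.sum(dim=0, keepdim=True)          # columns: spread mass over the vocabulary
+     return Q / Q.sum(dim=1, keepdim=True)           # final row-normalized assignment
+
+ # Toy usage: predict the visual assignment from the text side and vice versa,
+ # encouraging consistent assignments for paired data via a cross-entropy term.
+ img_feats, txt_feats = torch.randn(8, 256), torch.randn(8, 256)
+ vocab = torch.randn(1000, 256)                      # stand-in for frozen LLM word embeddings
+ P_img, P_txt = sinkhorn_assign(img_feats, vocab), sinkhorn_assign(txt_feats, vocab)
+ loss = -(P_txt.detach() * torch.log(P_img + 1e-8)).sum(dim=1).mean() \
+        -(P_img.detach() * torch.log(P_txt + 1e-8)).sum(dim=1).mean()
+ ```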
+
+ comment: ICLR 2024 Camera-ready +
+
+
+
+
+ + ☆ AesExpert: Towards Multi-modality Foundation Model for Image Aesthetics + Perception + + +
+ The highly abstract nature of image aesthetics perception (IAP) poses a +significant challenge for current multimodal large language models (MLLMs). The +lack of human-annotated multi-modality aesthetic data further exacerbates this +dilemma, resulting in MLLMs falling short of aesthetics perception +capabilities. To address the above challenge, we first introduce a +comprehensively annotated Aesthetic Multi-Modality Instruction Tuning (AesMMIT) +dataset, which serves as the cornerstone for building multi-modality aesthetics +foundation models. Specifically, to align MLLMs with human aesthetics +perception, we construct a corpus-rich aesthetic critique database with 21,904 +diverse-sourced images and 88K human natural language feedbacks, which are +collected via progressive questions, ranging from coarse-grained aesthetic +grades to fine-grained aesthetic descriptions. To ensure that MLLMs can handle +diverse queries, we further prompt GPT to refine the aesthetic critiques and +assemble the large-scale aesthetic instruction tuning dataset, i.e. AesMMIT, +which consists of 409K multi-typed instructions to activate stronger aesthetic +capabilities. Based on the AesMMIT database, we fine-tune the open-sourced +general foundation models, achieving multi-modality Aesthetic Expert models, +dubbed AesExpert. Extensive experiments demonstrate that the proposed AesExpert +models deliver significantly better aesthetic perception performances than the +state-of-the-art MLLMs, including the most advanced GPT-4V and +Gemini-Pro-Vision. Source data will be available at +https://github.com/yipoh/AesExpert. +
+
+
+
+
+ + ☆ UNIAA: A Unified Multi-modal Image Aesthetic Assessment Baseline and + Benchmark + + +
+ As an alternative to expensive expert evaluation, Image Aesthetic Assessment +(IAA) stands out as a crucial task in computer vision. However, traditional IAA +methods are typically constrained to a single data source or task, restricting +the universality and broader application. In this work, to better align with +human aesthetics, we propose a Unified Multi-modal Image Aesthetic Assessment +(UNIAA) framework, including a Multi-modal Large Language Model (MLLM) named +UNIAA-LLaVA and a comprehensive benchmark named UNIAA-Bench. We choose MLLMs +with both visual perception and language ability for IAA and establish a +low-cost paradigm for transforming the existing datasets into unified and +high-quality visual instruction tuning data, from which the UNIAA-LLaVA is +trained. To further evaluate the IAA capability of MLLMs, we construct the +UNIAA-Bench, which consists of three aesthetic levels: Perception, Description, +and Assessment. Extensive experiments validate the effectiveness and +rationality of UNIAA. UNIAA-LLaVA achieves competitive performance on all +levels of UNIAA-Bench, compared with existing MLLMs. Specifically, our model +performs better than GPT-4V in aesthetic perception and even approaches the +junior-level human. We find MLLMs have great potential in IAA, yet there +remains plenty of room for further improvement. The UNIAA-LLaVA and UNIAA-Bench +will be released. + +
+
+
+
+
+ + ☆ A Review and Efficient Implementation of Scene Graph Generation Metrics + + +
+ Scene graph generation has emerged as a prominent research field in computer +vision, witnessing significant advancements in the recent years. However, +despite these strides, precise and thorough definitions for the metrics used to +evaluate scene graph generation models are lacking. In this paper, we address +this gap in the literature by providing a review and precise definition of +commonly used metrics in scene graph generation. Our comprehensive examination +clarifies the underlying principles of these metrics and can serve as a +reference or introduction to scene graph metrics. + Furthermore, to facilitate the usage of these metrics, we introduce a +standalone Python package called SGBench that efficiently implements all +defined metrics, ensuring their accessibility to the research community. +Additionally, we present a scene graph benchmarking web service, that enables +researchers to compare scene graph generation methods and increase visibility +of new methods in a central place. + All of our code can be found at https://lorjul.github.io/sgbench/. + +
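+ For reference, a minimal implementation of the classic Recall@K metric for scene graph triplets (plain Python; illustrative only, not SGBench's API): the fraction of ground-truth (subject, predicate, object) triplets found among the top-K scored predictions.
+ ```python
+ def recall_at_k(pred_triplets, gt_triplets, k=50):
+     """Recall@K for one image.
+     pred_triplets: list of (score, (subj, pred, obj)) tuples.
+     gt_triplets:   iterable of (subj, pred, obj) tuples."""
+     top_k = sorted(pred_triplets, key=lambda t: t[0], reverse=True)[:k]
+     hits = {trip for _, trip in top_k} & set(gt_triplets)
+     return len(hits) / max(len(gt_triplets), 1)
+
+ preds = [(0.9, ("person", "riding", "horse")),
+          (0.7, ("person", "wearing", "hat")),
+          (0.2, ("horse", "on", "grass"))]
+ gts = {("person", "riding", "horse"), ("horse", "eating", "grass")}
+ print(recall_at_k(preds, gts, k=2))   # 0.5: one of two GT triplets recovered
+ ```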
+
+
+
+
+ + ☆ Reactive Model Correction: Mitigating Harm to Task-Relevant Features via + Conditional Bias Suppression + + +
+ Deep Neural Networks are prone to learning and relying on spurious +correlations in the training data, which, for high-risk applications, can have +fatal consequences. Various approaches to suppress model reliance on harmful +features have been proposed that can be applied post-hoc without additional +training. Whereas those methods can be applied with efficiency, they also tend +to harm model performance by globally shifting the distribution of latent +features. To mitigate unintended overcorrection of model behavior, we propose a +reactive approach conditioned on model-derived knowledge and eXplainable +Artificial Intelligence (XAI) insights. While the reactive approach can be +applied to many post-hoc methods, we demonstrate the incorporation of +reactivity in particular for P-ClArC (Projective Class Artifact Compensation), +introducing a new method called R-ClArC (Reactive Class Artifact Compensation). +Through rigorous experiments in controlled settings (FunnyBirds) and with a +real-world dataset (ISIC2019), we show that introducing reactivity can minimize +the detrimental effect of the applied correction while simultaneously ensuring +low reliance on spurious features. + +
+
+
+
+
+ + ☆ 3D Gaussian Splatting as Markov Chain Monte Carlo + + +
+ While 3D Gaussian Splatting has recently become popular for neural rendering, +current methods rely on carefully engineered cloning and splitting strategies +for placing Gaussians, which does not always generalize and may lead to +poor-quality renderings. In addition, for real-world scenes, they rely on a +good initial point cloud to perform well. In this work, we rethink 3D Gaussians +as random samples drawn from an underlying probability distribution describing +the physical representation of the scene -- in other words, Markov Chain Monte +Carlo (MCMC) samples. Under this view, we show that the 3D Gaussian updates are +strikingly similar to a Stochastic Gradient Langevin Dynamics (SGLD) update. As +with MCMC, where samples are nothing but past visit locations, adding new Gaussians +under our framework can be realized, without heuristics, by placing new +Gaussians at existing Gaussian locations. To encourage using fewer Gaussians +for efficiency, we introduce an L1-regularizer on the Gaussians. On various +standard evaluation scenes, we show that our method provides improved rendering +quality, easy control over the number of Gaussians, and robustness to +initialization. +
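+ A minimal sketch of an SGLD-style update (PyTorch; a generic illustration of the update the abstract alludes to, not the paper's training loop): a gradient step plus injected Gaussian noise, which is what gives the parameter trajectory its MCMC flavor.
+ ```python
+ import torch
+
+ def sgld_step(params, loss_fn, lr=1e-3, noise_scale=1e-2):
+     """One Stochastic Gradient Langevin Dynamics step on a parameter tensor
+     (e.g. Gaussian positions): gradient descent plus scaled Gaussian noise."""
+     loss = loss_fn(params)
+     grad, = torch.autograd.grad(loss, params)
+     with torch.no_grad():
+         params -= lr * grad
+         params += noise_scale * (2 * lr) ** 0.5 * torch.randn_like(params)
+     return loss.item()
+
+ # Toy usage: "Gaussian centers" drifting toward a target point cloud under noise.
+ centers = torch.randn(1000, 3, requires_grad=True)
+ target = torch.rand(1000, 3)
+ for _ in range(100):
+     sgld_step(centers, lambda p: ((p - target) ** 2).mean())
+ ```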
+
+
+
+
+ + ☆ Mitigating the Curse of Dimensionality for Certified Robustness via Dual + Randomized Smoothing + + +
+ Randomized Smoothing (RS) has been proven a promising method for endowing an +arbitrary image classifier with certified robustness. However, the substantial +uncertainty inherent in the high-dimensional isotropic Gaussian noise imposes +the curse of dimensionality on RS. Specifically, the upper bound of ${\ell_2}$ +certified robustness radius provided by RS exhibits a diminishing trend with +the expansion of the input dimension $d$, proportionally decreasing at a rate +of $1/\sqrt{d}$. This paper explores the feasibility of providing ${\ell_2}$ +certified robustness for high-dimensional input through the utilization of dual +smoothing in the lower-dimensional space. The proposed Dual Randomized +Smoothing (DRS) down-samples the input image into two sub-images and smooths +the two sub-images in lower dimensions. Theoretically, we prove that DRS +guarantees a tight ${\ell_2}$ certified robustness radius for the original +input and reveal that DRS attains a superior upper bound on the ${\ell_2}$ +robustness radius, which decreases proportionally at a rate of $(1/\sqrt m + +1/\sqrt n )$ with $m+n=d$. Extensive experiments demonstrate the +generalizability and effectiveness of DRS, which exhibits a notable capability +to integrate with established methodologies, yielding substantial improvements +in both accuracy and ${\ell_2}$ certified robustness baselines of RS on the +CIFAR-10 and ImageNet datasets. Code is available at +https://github.com/xiasong0501/DRS. + +
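+ For context, a small sketch of the standard randomized-smoothing l2 certificate (Cohen et al., 2019) and the 1/sqrt(d) scaling issue that motivates smoothing in lower-dimensional sub-images (plain Python; DRS's own certificate is not reproduced here).
+ ```python
+ from math import sqrt
+ from statistics import NormalDist
+
+ def rs_certified_radius(p_lower, sigma):
+     """Standard RS l2 radius: R = sigma * Phi^{-1}(p_A), where p_A is a lower
+     confidence bound on the top-class probability under Gaussian noise."""
+     if p_lower <= 0.5:
+         return 0.0                     # abstain: no certificate
+     return sigma * NormalDist().inv_cdf(p_lower)
+
+ # Relative to the input's l2 norm (which grows like sqrt(d)), a fixed-sigma
+ # certificate shrinks at a 1/sqrt(d) rate -- the dimensionality issue DRS
+ # targets by smoothing two lower-dimensional down-sampled sub-images.
+ for d in (32 * 32, 224 * 224):
+     print(d, rs_certified_radius(0.9, sigma=0.5) / sqrt(d))
+ ```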
+
+
+
+
+ + ☆ Pseudo-label Learning with Calibrated Confidence Using an Energy-based + Model IJCNN 2024 + + +
+ In pseudo-labeling (PL), which is a type of semi-supervised learning, +pseudo-labels are assigned based on the confidence scores provided by the +classifier; therefore, accurate confidence is important for successful PL. In +this study, we propose a PL algorithm based on an energy-based model (EBM), +which is referred to as the energy-based PL (EBPL). In EBPL, a neural +network-based classifier and an EBM are jointly trained by sharing their +feature extraction parts. This approach enables the model to learn both the +class decision boundary and input data distribution, enhancing confidence +calibration during network training. The experimental results demonstrate that +EBPL outperforms the existing PL method in semi-supervised image classification +tasks, with superior confidence calibration error and recognition accuracy. + +
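+ A rough sketch of a shared backbone feeding both a classifier and an energy read-out, combined with confidence-thresholded pseudo-labeling (PyTorch; the JEM-style energy, the architecture, and the 0.95 threshold are assumptions for illustration, not EBPL's exact objective).
+ ```python
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class ClassifierWithEnergy(nn.Module):
+     """Shared features feed a softmax classifier; an energy score is read off
+     the same logits, JEM-style: E(x) = -logsumexp(logits)."""
+     def __init__(self, in_dim=784, hidden=256, n_classes=10):
+         super().__init__()
+         self.features = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU(),
+                                       nn.Linear(hidden, hidden), nn.ReLU())
+         self.head = nn.Linear(hidden, n_classes)
+
+     def forward(self, x):
+         logits = self.head(self.features(x))
+         energy = -torch.logsumexp(logits, dim=1)    # unnormalized -log p(x)
+         return logits, energy
+
+ model = ClassifierWithEnergy()
+ x_lab, y_lab = torch.randn(16, 784), torch.randint(0, 10, (16,))
+ x_unl = torch.randn(64, 784)
+
+ logits_lab, _ = model(x_lab)
+ logits_unl, _ = model(x_unl)
+ conf, pseudo = F.softmax(logits_unl, dim=1).max(dim=1)
+ mask = conf > 0.95                                   # keep only confident pseudo-labels
+ loss = F.cross_entropy(logits_lab, y_lab)
+ if mask.any():
+     loss = loss + F.cross_entropy(logits_unl[mask], pseudo[mask])
+ # (an EBM term would additionally fit log p(x), e.g. via SGLD sampling)
+ ```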
+
+ comment: 8 pages, 8 figures, Accepted at IJCNN 2024 +
+
+
+
+
+ + ☆ MTKD: Multi-Teacher Knowledge Distillation for Image Super-Resolution + + +
+ Knowledge distillation (KD) has emerged as a promising technique in deep +learning, typically employed to enhance a compact student network through +learning from its high-performance but more complex teacher variant. When +applied in the context of image super-resolution, most KD approaches are +modified versions of methods developed for other computer vision tasks, which +are based on training strategies with a single teacher and simple loss +functions. In this paper, we propose a novel Multi-Teacher Knowledge +Distillation (MTKD) framework specifically for image super-resolution. It +exploits the advantages of multiple teachers by combining and enhancing the +outputs of these teacher models, which then guides the learning process of the +compact student network. To achieve more effective learning performance, we +have also developed a new wavelet-based loss function for MTKD, which can +better optimize the training process by observing differences in both the +spatial and frequency domains. We fully evaluate the effectiveness of the +proposed method by comparing it to five commonly used KD methods for image +super-resolution based on three popular network architectures. The results show +that the proposed MTKD method achieves evident improvements in super-resolution +performance, up to 0.46dB (based on PSNR), over state-of-the-art KD approaches +across different network structures. The source code of MTKD will be made +available here for public evaluation. +
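+ A possible form of a wavelet-domain distillation loss (PyTorch; the single-level Haar transform and the band weighting are assumptions, not the paper's exact loss): compare student and fused-teacher outputs in both the spatial and the wavelet domains, up-weighting high-frequency bands.
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def haar_dwt(x):
+     """Single-level 2D Haar transform of (B, C, H, W) -> (LL, LH, HL, HH)."""
+     a = x[:, :, 0::2, 0::2]; b = x[:, :, 0::2, 1::2]
+     c = x[:, :, 1::2, 0::2]; d = x[:, :, 1::2, 1::2]
+     return ((a + b + c + d) / 2, (a - b + c - d) / 2,
+             (a + b - c - d) / 2, (a - b - c + d) / 2)
+
+ def wavelet_distill_loss(student_sr, teacher_sr, hf_weight=2.0):
+     """L1 in both spatial and wavelet domains, emphasizing high frequencies."""
+     loss = F.l1_loss(student_sr, teacher_sr)
+     for i, (s, t) in enumerate(zip(haar_dwt(student_sr), haar_dwt(teacher_sr))):
+         w = 1.0 if i == 0 else hf_weight            # LL vs. LH/HL/HH bands
+         loss = loss + w * F.l1_loss(s, t)
+     return loss
+
+ student = torch.rand(2, 3, 64, 64, requires_grad=True)
+ teacher = torch.rand(2, 3, 64, 64)        # e.g. a fused multi-teacher output
+ wavelet_distill_loss(student, teacher).backward()
+ ```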
+
+
+
+
+ + ☆ The revenge of BiSeNet: Efficient Multi-Task Image Segmentation CVPR2024 + + +
+ Recent advancements in image segmentation have focused on enhancing the +efficiency of the models to meet the demands of real-time applications, +especially on edge devices. However, existing research has primarily +concentrated on single-task settings, especially on semantic segmentation, +leading to redundant efforts and specialized architectures for different tasks. +To address this limitation, we propose a novel architecture for efficient +multi-task image segmentation, capable of handling various segmentation tasks +without sacrificing efficiency or accuracy. We introduce BiSeNetFormer, which +leverages the efficiency of two-stream semantic segmentation architectures and +extends them into a mask classification framework. Our approach maintains +the efficient spatial and context paths to capture detailed and semantic +information, respectively, while leveraging an efficient transformer-based +segmentation head that computes the binary masks and class probabilities. By +seamlessly supporting multiple tasks, namely semantic and panoptic +segmentation, BiSeNetFormer offers a versatile solution for multi-task +segmentation. We evaluate our approach on popular datasets, Cityscapes and +ADE20K, demonstrating impressive inference speeds while maintaining competitive +accuracy compared to state-of-the-art architectures. Our results indicate that +BiSeNetFormer represents a significant advancement towards fast, efficient, and +multi-task segmentation networks, bridging the gap between model efficiency and +task adaptability. +
+
+ comment: Accepted to ECV workshop at CVPR2024 +
+
+
+
+
+ + ☆ nnU-Net Revisited: A Call for Rigorous Validation in 3D Medical Image + Segmentation + + +
+ The release of nnU-Net marked a paradigm shift in 3D medical image +segmentation, demonstrating that a properly configured U-Net architecture could +still achieve state-of-the-art results. Despite this, the pursuit of novel +architectures, and the respective claims of superior performance over the U-Net +baseline, continued. In this study, we demonstrate that many of these recent +claims fail to hold up when scrutinized for common validation shortcomings, +such as the use of inadequate baselines, insufficient datasets, and neglected +computational resources. By meticulously avoiding these pitfalls, we conduct a +thorough and comprehensive benchmarking of current segmentation methods +including CNN-based, Transformer-based, and Mamba-based approaches. In contrast +to current beliefs, we find that the recipe for state-of-the-art performance is +1) employing CNN-based U-Net models, including ResNet and ConvNeXt variants, 2) +using the nnU-Net framework, and 3) scaling models to modern hardware +resources. These results indicate an ongoing innovation bias towards novel +architectures in the field and underscore the need for more stringent +validation standards in the quest for scientific progress. + +
+
+
+
+
+ + ☆ AI-KD: Towards Alignment Invariant Face Image Quality Assessment Using + Knowledge Distillation + + +
+ Face Image Quality Assessment (FIQA) techniques have seen steady improvements +over recent years, but their performance still deteriorates if the input face +samples are not properly aligned. This alignment sensitivity comes from the +fact that most FIQA techniques are trained or designed using a specific face +alignment procedure. If the alignment technique changes, the performance of +most existing FIQA techniques quickly becomes suboptimal. To address this +problem, we present in this paper a novel knowledge distillation approach, +termed AI-KD, which can be applied on top of any existing FIQA technique, improving its +robustness to alignment variations and, in turn, performance with different +alignment procedures. To validate the proposed distillation approach, we +conduct comprehensive experiments on 6 face datasets with 4 recent face +recognition models and in comparison to 7 state-of-the-art FIQA techniques. Our +results show that AI-KD consistently improves performance of the initial FIQA +techniques not only with misaligned samples, but also with properly aligned +facial images. Furthermore, it leads to a new state-of-the-art, when used with +a competitive initial FIQA approach. The code for AI-KD is made publicly +available from: https://github.com/LSIbabnikz/AI-KD. +
+
+ comment: IEEE International Workshop on Biometrics and Forensics (IWBF) 2024, + pp. 6 +
+
+
+
+
+ + ☆ Text-Driven Diverse Facial Texture Generation via Progressive + Latent-Space Refinement + + +
+ Automatic 3D facial texture generation has gained significant interest +recently. Existing approaches may not support the traditional physically based +rendering pipeline or rely on 3D data captured by Light Stage. Our key +contribution is a progressive latent space refinement approach that can +bootstrap from 3D Morphable Models (3DMMs)-based texture maps generated from +facial images to generate high-quality and diverse PBR textures, including +albedo, normal, and roughness. It starts with enhancing Generative Adversarial +Networks (GANs) for text-guided and diverse texture generation. To this end, we +design a self-supervised paradigm to overcome the reliance on ground truth 3D +textures and train the generative model with only entangled texture maps. +Besides, we foster mutual enhancement between GANs and Score Distillation +Sampling (SDS). SDS boosts GANs with more generative modes, while GANs promote +more efficient optimization of SDS. Furthermore, we introduce an edge-aware SDS +for multi-view consistent facial structure. Experiments demonstrate that our +method outperforms existing 3D texture generation methods regarding +photo-realistic quality, diversity, and efficiency. + +
+
+
+
+
+ + ☆ WiTUnet: A U-Shaped Architecture Integrating CNN and Transformer for + Improved Feature Alignment and Local Information Fusion + + +
+ Low-dose computed tomography (LDCT) has become the technology of choice for +diagnostic medical imaging, given its lower radiation dose compared to standard +CT, despite increasing image noise and potentially affecting diagnostic +accuracy. To address this, advanced deep learning-based LDCT denoising +algorithms have been developed, primarily using Convolutional Neural Networks +(CNNs) or Transformer Networks with the Unet architecture. This architecture +enhances image detail by integrating feature maps from the encoder and decoder +via skip connections. However, current methods often overlook enhancements to +the Unet architecture itself, focusing instead on optimizing encoder and +decoder structures. This approach can be problematic due to the significant +differences in feature map characteristics between the encoder and decoder, +where simple fusion strategies may not effectively reconstruct images. In this +paper, we introduce WiTUnet, a novel LDCT image denoising method that utilizes +nested, dense skip pathways instead of traditional skip connections to improve +feature integration. WiTUnet also incorporates a windowed Transformer structure +to process images in smaller, non-overlapping segments, reducing computational +load. Additionally, the integration of a Local Image Perception Enhancement +(LiPe) module in both the encoder and decoder replaces the standard multi-layer +perceptron (MLP) in Transformers, enhancing local feature capture and +representation. Through extensive experimental comparisons, WiTUnet has +demonstrated superior performance over existing methods in key metrics such as +Peak Signal-to-Noise Ratio (PSNR), Structural Similarity (SSIM), and Root Mean +Square Error (RMSE), significantly improving noise removal and image quality. +
+
+
+
+
+ + ☆ TMPQ-DM: Joint Timestep Reduction and Quantization Precision Selection + for Efficient Diffusion Models + + +
+ Diffusion models have emerged as preeminent contenders in the realm of +generative models. Distinguished by their distinctive sequential generative +processes, characterized by hundreds or even thousands of timesteps, diffusion +models progressively reconstruct images from pure Gaussian noise, with each +timestep necessitating full inference of the entire model. However, the +substantial computational demands inherent to these models present challenges +for deployment, quantization is thus widely used to lower the bit-width for +reducing the storage and computing overheads. Current quantization +methodologies primarily focus on model-side optimization, disregarding the +temporal dimension, such as the length of the timestep sequence, thereby +allowing redundant timesteps to continue consuming computational resources, +leaving substantial scope for accelerating the generative process. In this +paper, we introduce TMPQ-DM, which jointly optimizes timestep reduction and +quantization to achieve a superior performance-efficiency trade-off, addressing +both temporal and model optimization aspects. For timestep reduction, we devise +a non-uniform grouping scheme tailored to the non-uniform nature of the +denoising process, thereby mitigating the explosive combinations of timesteps. +In terms of quantization, we adopt a fine-grained layer-wise approach to +allocate varying bit-widths to different layers based on their respective +contributions to the final generative performance, thus rectifying performance +degradation observed in prior studies. To expedite the evaluation of +fine-grained quantization, we further devise a super-network to serve as a +precision solver by leveraging shared quantization results. These two design +components are seamlessly integrated within our framework, enabling rapid joint +exploration of the exponentially large decision space via a gradient-free +evolutionary search algorithm. + +
+
+
+
+
+ + ☆ Oblique-MERF: Revisiting and Improving MERF for Oblique Photography + + +
+ Neural implicit fields have established a new paradigm for scene +representation, with subsequent work achieving high-quality real-time +rendering. However, reconstructing 3D scenes from oblique aerial photography +presents unique challenges, such as varying spatial scale distributions and a +constrained range of tilt angles, often resulting in high memory consumption +and reduced rendering quality at extrapolated viewpoints. In this paper, we +enhance MERF to accommodate these data characteristics by introducing an +innovative adaptive occupancy plane optimized during the volume rendering +process and a smoothness regularization term for view-dependent color to +address these issues. Our approach, termed Oblique-MERF, surpasses +state-of-the-art real-time methods by approximately 0.7 dB, reduces VRAM usage +by about 40%, and achieves higher rendering frame rates with more realistic +rendering outcomes across most viewpoints. + +
+
+
+
+
+ + ☆ RanLayNet: A Dataset for Document Layout Detection used for Domain + Adaptation and Generalization + + +
+ Large ground-truth datasets and recent advances in deep learning techniques +have been useful for layout detection. However, because of the restricted +layout diversity of these datasets, training on them requires a sizable number +of annotated instances, which is both expensive and time-consuming. As a +result, differences between the source and target domains may significantly +impact how well these models function. To solve this problem, domain adaptation +approaches have been developed that use a small quantity of labeled data to +adjust the model to the target domain. In this research, we introduced a +synthetic document dataset called RanLayNet, enriched with automatically +assigned labels denoting spatial positions, ranges, and types of layout +elements. The primary aim of this endeavor is to develop a versatile dataset +capable of training models with robustness and adaptability to diverse document +formats. Through empirical experimentation, we demonstrate that a deep layout +identification model trained on our dataset exhibits enhanced performance +compared to a model trained solely on actual documents. Moreover, we conduct a +comparative analysis by fine-tuning inference models using both PubLayNet and +IIIT-AR-13K datasets on the Doclaynet dataset. Our findings emphasize that +models enriched with our dataset are optimal for tasks such as achieving 0.398 +and 0.588 mAP95 score in the scientific document domain for the TABLE class. + +
+
+
+
+
+ + ☆ State Space Model for New-Generation Network Alternative to + Transformers: A Survey + + +
+ In the post-deep learning era, the Transformer architecture has demonstrated +its powerful performance across pre-trained big models and various downstream +tasks. However, the enormous computational demands of this architecture have +deterred many researchers. To further reduce the complexity of attention +models, numerous efforts have been made to design more efficient methods. Among +them, the State Space Model (SSM), as a possible replacement for the +self-attention based Transformer model, has drawn more and more attention in +recent years. In this paper, we give the first comprehensive review of these +works and also provide experimental comparisons and analysis to better +demonstrate the features and advantages of SSM. Specifically, we first give a +detailed description of principles to help the readers quickly capture the key +ideas of SSM. After that, we dive into the reviews of existing SSMs and their +various applications, including natural language processing, computer vision, +graph, multi-modal and multi-media, point cloud/event stream, time series data, +and other domains. In addition, we give statistical comparisons and analysis of +these models and hope it helps the readers to understand the effectiveness of +different structures on various tasks. Then, we propose possible research +points in this direction to better promote the development of the theoretical +model and application of SSM. More related works will be continuously updated +on the following GitHub: +https://github.com/Event-AHU/Mamba_State_Space_Model_Paper_List. + +
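+ A minimal discretized linear state-space recurrence, the core primitive behind S4/Mamba-style layers (PyTorch; selection mechanisms, parallel scans, and learned discretization are deliberately omitted): the sequential scan is O(L) in sequence length with constant state memory, in contrast to quadratic self-attention.
+ ```python
+ import torch
+
+ def ssm_scan(u, A, B, C, dt=0.1):
+     """Apply the discretized SSM  x' = Ax + Bu,  y = Cx  step by step.
+     u: (L, d_in);  A: (N, N);  B: (N, d_in);  C: (d_out, N)."""
+     A_bar = torch.matrix_exp(dt * A)                 # state transition over one step
+     B_bar = dt * B                                   # crude first-order discretization
+     x = torch.zeros(A.shape[0])
+     ys = []
+     for u_t in u:                                    # O(L) recurrence, constant memory
+         x = A_bar @ x + B_bar @ u_t
+         ys.append(C @ x)
+     return torch.stack(ys)
+
+ L, d_in, d_out, N = 128, 4, 4, 16
+ y = ssm_scan(torch.randn(L, d_in), -torch.eye(N),
+              torch.randn(N, d_in), torch.randn(d_out, N))
+ print(y.shape)   # (128, 4)
+ ```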
+
+ comment: The First review of State Space Model (SSM)/Mamba and their + applications in artificial intelligence, 33 pages +
+
+
+
+
+ + ☆ Deep image learning of quantitative structure-property relationships of + copper alloys via feature augmentation on Geodesic curve in shape space + +
+ Understanding how the structure of materials affects their properties is a +cornerstone of materials science and engineering. However, traditional methods +have struggled to accurately describe the quantitative structure-property +relationships for complex structures. In our study, we bridge this gap by +leveraging machine learning to analyze images of materials' microstructures, +thus offering a novel way to understand and predict the properties of materials +based on their microstructures. We introduce a method known as FAGC (Feature +Augmentation on Geodesic Curves), specifically demonstrated for Cu-Cr-Zr +alloys. This approach utilizes machine learning to examine the shapes within +images of the alloys' microstructures and predict their mechanical and +electronic properties. This generative FAGC approach can effectively expand the +relatively small training datasets due to the limited availability of materials +images labeled with quantitative properties. The process begins with extracting +features from the images using neural networks. These features are then mapped +onto the Pre-shape space to construct the Geodesic curves. Along these curves, +new features are generated, effectively increasing the dataset. Moreover, we +design a pseudo-labeling mechanism for these newly generated features to +further enhance the training dataset. Our FAGC method has shown remarkable +results, significantly improving the accuracy of predicting the electronic +conductivity and hardness of Cu-Cr-Zr alloys, with R-squared values of 0.978 +and 0.998, respectively. These outcomes underscore the potential of FAGC to +address the challenge of limited image data in materials science, providing a +powerful tool for establishing detailed and quantitative relationships between +complex microstructures and material properties. + +
+
+
+
+
+ + ☆ Magic Clothing: Controllable Garment-Driven Image Synthesis + + +
+ We propose Magic Clothing, a latent diffusion model (LDM)-based network +architecture for an unexplored garment-driven image synthesis task. Aiming at +generating customized characters wearing the target garments with diverse text +prompts, the image controllability is the most critical issue, i.e., to +preserve the garment details and maintain faithfulness to the text prompts. To +this end, we introduce a garment extractor to capture the detailed garment +features, and employ self-attention fusion to incorporate them into the +pretrained LDMs, ensuring that the garment details remain unchanged on the +target character. Then, we leverage the joint classifier-free guidance to +balance the control of garment features and text prompts over the generated +results. Meanwhile, the proposed garment extractor is a plug-in module +applicable to various finetuned LDMs, and it can be combined with other +extensions like ControlNet and IP-Adapter to enhance the diversity and +controllability of the generated characters. Furthermore, we design +Matched-Points-LPIPS (MP-LPIPS), a robust metric for evaluating the consistency +of the target image to the source garment. Extensive experiments demonstrate +that our Magic Clothing achieves state-of-the-art results under various +conditional controls for garment-driven image synthesis. Our source code is +available at https://github.com/ShineChen1024/MagicClothing. + +
+
+
+
+
+ + ☆ Fuse after Align: Improving Face-Voice Association Learning via + Multimodal Encoder + + +
+ To date, there have been many achievements in learning the association between +voice and face. However, most previous models rely on cosine similarity or +L2 distance to evaluate the likeness of voices and faces following contrastive +learning, subsequently applied to retrieval and matching tasks. This method +only considers the embeddings as high-dimensional vectors, utilizing a minimal +scope of available information. This paper introduces a novel framework within +an unsupervised setting for learning voice-face associations. By employing a +multimodal encoder after contrastive learning and addressing the problem +through binary classification, we can learn the implicit information within the +embeddings in a more effective and varied manner. Furthermore, by introducing +an effective pair selection method, we enhance the learning outcomes of both +contrastive learning and the matching task. Empirical evidence demonstrates +that our framework achieves state-of-the-art results in voice-face matching, +verification, and retrieval tasks, improving verification by approximately 3%, +matching by about 2.5%, and retrieval by around 1.3%. +
+
+
+
+
+ + ☆ Clothes-Changing Person Re-Identification with Feasibility-Aware + Intermediary Matching + + +
+ Current clothes-changing person re-identification (re-id) approaches usually +perform retrieval based on clothes-irrelevant features, while neglecting the +potential of clothes-relevant features. However, we observe that relying solely +on clothes-irrelevant features for clothes-changing re-id is limited, since +they often lack adequate identity information and suffer from large intra-class +variations. On the contrary, clothes-relevant features can be used to discover +same-clothes intermediaries that possess informative identity clues. Based on +this observation, we propose a Feasibility-Aware Intermediary Matching (FAIM) +framework to additionally utilize clothes-relevant features for retrieval. +Firstly, an Intermediary Matching (IM) module is designed to perform an +intermediary-assisted matching process. This process involves using +clothes-relevant features to find informative intermediates, and then using +clothes-irrelevant features of these intermediates to complete the matching. +Secondly, in order to reduce the negative effect of low-quality intermediaries, +an Intermediary-Based Feasibility Weighting (IBFW) module is designed to +evaluate the feasibility of intermediary matching process by assessing the +quality of intermediaries. Extensive experiments demonstrate that our method +outperforms state-of-the-art methods on several widely-used clothes-changing +re-id benchmarks. + +
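+ A toy sketch of the two-stage intermediary idea (PyTorch; the feasibility weighting and score fusion here are illustrative guesses, not FAIM's actual modules): clothes-relevant features nominate same-clothes intermediaries in the gallery, whose clothes-irrelevant identity features then score the remaining gallery entries.
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def intermediary_match(q_cloth, q_id, g_cloth, g_id, top_m=5):
+     """q_*: (D,) query features; g_*: (N, D) gallery features.
+     Stage 1: clothes-relevant similarity picks candidate intermediaries.
+     Stage 2: their clothes-irrelevant (identity) features score the gallery."""
+     q_cloth, q_id = F.normalize(q_cloth, dim=0), F.normalize(q_id, dim=0)
+     g_cloth, g_id = F.normalize(g_cloth, dim=1), F.normalize(g_id, dim=1)
+     inter_idx = (g_cloth @ q_cloth).topk(top_m).indices          # same-clothes intermediaries
+     weights = torch.softmax(g_cloth[inter_idx] @ q_cloth, dim=0) # feasibility-style weights
+     # Identity-space scores relayed through the weighted intermediaries.
+     relayed = (weights.unsqueeze(1) * (g_id @ g_id[inter_idx].t()).t()).sum(dim=0)
+     direct = g_id @ q_id
+     return 0.5 * direct + 0.5 * relayed                          # fuse direct + intermediary
+
+ N, D = 100, 128
+ ranking = intermediary_match(torch.randn(D), torch.randn(D),
+                              torch.randn(N, D), torch.randn(N, D))
+ print(ranking.topk(5).indices)    # top-5 gallery candidates
+ ```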
+
+
+
+
+ + ☆ Learning Tracking Representations from Single Point Annotations CVPR2024 + + +
+ Existing deep trackers are typically trained with large-scale video frames +with annotated bounding boxes. However, these bounding boxes are expensive and +time-consuming to annotate, in particular for large-scale datasets. In this +paper, we propose to learn tracking representations from single point +annotations (i.e., 4.5x faster to annotate than the traditional bounding box) +in a weakly supervised manner. Specifically, we propose a soft contrastive +learning (SoCL) framework that incorporates a target objectness prior into +end-to-end contrastive learning. Our SoCL consists of adaptive positive and +negative sample generation, which is memory-efficient and effective for +learning tracking representations. We apply the learned representation of SoCL +to visual tracking and show that our method can 1) achieve better performance +than the fully supervised baseline trained with box annotations under the same +annotation time cost; 2) achieve performance comparable to the fully supervised +baseline using the same number of training frames while reducing +annotation time cost by 78% and total fees by 85%; 3) be robust to annotation +noise. + +
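+ One hedged way to read "incorporating a target objectness prior" from a single point annotation is to build a soft weight map around the annotated point and use it to weight positive and negative samples; the Gaussian form and bandwidth below are illustrative assumptions, not the paper's exact formulation.
+
+ ```python
+ import numpy as np
+
+ def soft_objectness_prior(h, w, point, sigma=32.0):
+     """Soft objectness map from one (x, y) point annotation: pixels near the
+     point are treated as likely foreground, far pixels as likely background."""
+     ys, xs = np.mgrid[0:h, 0:w]
+     d2 = (xs - point[0]) ** 2 + (ys - point[1]) ** 2
+     return np.exp(-d2 / (2.0 * sigma ** 2))  # values in (0, 1]
+
+ prior = soft_objectness_prior(256, 256, point=(120, 90))
+ pos_weight = prior            # soft weights for positive samples
+ neg_weight = 1.0 - prior      # soft weights for negative samples
+ ```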
+
+ comment: Accept to CVPR2024-L3DIVU +
+
+
+
+
+ + ☆ SparseOcc: Rethinking Sparse Latent Representation for Vision-Based + Semantic Occupancy Prediction CVPR 2024 + + +
+ Vision-based perception for autonomous driving requires an explicit modeling +of a 3D space, where 2D latent representations are mapped and subsequent 3D +operators are applied. However, operating on dense latent spaces introduces a +cubic time and space complexity, which limits scalability in terms of +perception range or spatial resolution. Existing approaches compress the dense +representation using projections like Bird's Eye View (BEV) or Tri-Perspective +View (TPV). Although efficient, these projections result in information loss, +especially for tasks like semantic occupancy prediction. To address this, we +propose SparseOcc, an efficient occupancy network inspired by sparse point +cloud processing. It utilizes a lossless sparse latent representation with +three key innovations. Firstly, a 3D sparse diffuser performs latent completion +using spatially decomposed 3D sparse convolutional kernels. Secondly, a feature +pyramid and sparse interpolation enhance each scale with information from the others. +Finally, the transformer head is redesigned as a sparse variant. SparseOcc +achieves a remarkable 74.9% reduction in FLOPs over the dense baseline. +Interestingly, it also improves accuracy, from 12.8% to 14.1% mIoU, which in +part can be attributed to the sparse representation's ability to avoid +hallucinations on empty voxels. + +
+
+ comment: 10 pages, 4 figures, accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Learning Human Motion from Monocular Videos via Cross-Modal Manifold + Alignment + + +
+ Learning 3D human motion from 2D inputs is a fundamental task in the realms +of computer vision and computer graphics. Many previous methods grapple with +this inherently ambiguous task by introducing motion priors into the learning +process. However, these approaches face difficulties in defining the complete +configurations of such priors or training a robust model. In this paper, we +present the Video-to-Motion Generator (VTM), which leverages motion priors +through cross-modal latent feature space alignment between 3D human motion and +2D inputs, namely videos and 2D keypoints. To reduce the complexity of modeling +motion priors, we model the motion data separately for the upper and lower body +parts. Additionally, we align the motion data with a scale-invariant virtual +skeleton to mitigate the interference of human skeleton variations to the +motion priors. Evaluated on AIST++, the VTM showcases state-of-the-art +performance in reconstructing 3D human motion from monocular videos. Notably, +our VTM exhibits the capabilities for generalization to unseen view angles and +in-the-wild videos. + +
+
+
+
+
+ + ☆ FusionMamba: Dynamic Feature Enhancement for Multimodal Image Fusion + with Mamba + + +
+ Multi-modal image fusion aims to combine information from different modalities to +create a single image with comprehensive information and detailed textures. +However, fusion models based on convolutional neural networks encounter +limitations in capturing global image features due to their focus on local +convolution operations. Transformer-based models, while excelling in global +feature modeling, confront computational challenges stemming from their +quadratic complexity. Recently, the Selective Structured State Space Model has +exhibited significant potential for long-range dependency modeling with linear +complexity, offering a promising avenue to address the aforementioned dilemma. +In this paper, we propose FusionMamba, a novel dynamic feature enhancement +method for multimodal image fusion with Mamba. Specifically, we devise an +improved efficient Mamba model for image fusion, integrating an efficient visual +state space model with dynamic convolution and channel attention. This refined +model not only retains Mamba's performance and global modeling capability +but also reduces channel redundancy while strengthening local feature +enhancement. Additionally, we devise a dynamic feature fusion module (DFFM) +comprising two dynamic feature enhancement modules (DFEM) and a cross-modality +fusion Mamba module (CMFM). The former serves for dynamic texture enhancement +and dynamic difference perception, whereas the latter enhances correlation +features between modalities and suppresses redundant intermodal information. +FusionMamba has yielded state-of-the-art (SOTA) performance across various +multimodal medical image fusion tasks (CT-MRI, PET-MRI, SPECT-MRI), the infrared +and visible image fusion task (IR-VIS), and a multimodal biomedical image fusion +dataset (GFP-PC), which demonstrates the generalization ability of our model. +The code for FusionMamba is available at +https://github.com/millieXie/FusionMamba. + +
+
+
+
+
+ + ☆ Towards Collaborative Autonomous Driving: Simulation Platform and + End-to-End System + + +
+ Vehicle-to-everything-aided autonomous driving (V2X-AD) has a huge potential +to provide a safer driving solution. Despite extensive research in +transportation and communication to support V2X-AD, the actual utilization of +these infrastructures and communication resources to enhance driving +performance remains largely unexplored. This highlights the necessity of +collaborative autonomous driving: a machine learning approach that optimizes +the information sharing strategy to improve the driving performance of each +vehicle. This effort necessitates two key foundations: a platform capable of +generating data to facilitate the training and testing of V2X-AD, and a +comprehensive system that integrates full driving-related functionalities with +mechanisms for information sharing. From the platform perspective, we present +V2Xverse, a comprehensive simulation platform for collaborative autonomous +driving. This platform provides a complete pipeline for collaborative driving. +From the system perspective, we introduce CoDriving, a novel end-to-end +collaborative driving system that properly integrates V2X communication over +the entire autonomous pipeline, promoting driving with shared perceptual +information. The core idea is a novel driving-oriented communication strategy. +Leveraging this strategy, CoDriving improves driving performance while +optimizing communication efficiency. We conduct comprehensive benchmarks with +V2Xverse, analyzing both modular performance and closed-loop driving +performance. Experimental results show that CoDriving: i) significantly +improves the driving score by 62.49% and drastically reduces the pedestrian +collision rate by 53.50% compared to the SOTA end-to-end driving method, and +ii) sustains its driving performance advantage under dynamically constrained +communication conditions. + +
+
+
+
+
+ + ☆ Leveraging Temporal Contextualization for Video Action Recognition + + +
+ Pretrained vision-language models have shown effectiveness in video +understanding. However, recent studies have not sufficiently leveraged +essential temporal information from videos, simply averaging frame-wise +representations or referencing consecutive frames. We introduce Temporally +Contextualized CLIP (TC-CLIP), a pioneering framework for video understanding +that effectively and efficiently leverages comprehensive video information. We +propose Temporal Contextualization (TC), a novel layer-wise temporal +information infusion mechanism for video that extracts core information from +each frame, interconnects relevant information across the video to summarize +into context tokens, and ultimately leverages the context tokens during the +feature encoding process. Furthermore, our Video-conditional Prompting (VP) +module manufactures context tokens to generate informative prompts in text +modality. We conduct extensive experiments in zero-shot, few-shot, +base-to-novel, and fully-supervised action recognition to validate the +superiority of our TC-CLIP. Ablation studies for TC and VP guarantee our design +choices. Code is available at https://github.com/naver-ai/tc-clip + +
+
+ comment: 24 pages, 10 figures, 12 tables +
+
+
+
+
+ + ☆ MMCode: Evaluating Multi-Modal Code Large Language Models with Visually + Rich Programming Problems + + +
+ Programming often involves converting detailed and complex specifications +into code, a process during which developers typically utilize visual aids to +more effectively convey concepts. While recent developments in Large Multimodal +Models have demonstrated remarkable abilities in visual reasoning and +mathematical tasks, there is little work on investigating whether these models +can effectively interpret visual elements for code generation. To this end, we +present MMCode, the first multi-modal coding dataset for evaluating algorithmic +problem-solving skills in visually rich contexts. MMCode contains 3,548 +questions and 6,620 images collected from real-world programming challenges +harvested from 10 code competition websites, presenting significant challenges +due to the extreme demand for reasoning abilities. Our experiment results show +that current state-of-the-art models struggle to solve these problems. The +results highlight the lack of powerful vision-code models, and we hope MMCode +can serve as an inspiration for future works in this domain. The data and code +are publicly available at https://github.com/happylkx/MMCode. + +
+
+ comment: 46 pages, 21 figures and 6 tables +
+
+
+
+
+ + ☆ FreqMamba: Viewing Mamba from a Frequency Perspective for Image + Deraining + + +
+ Images corrupted by rain streaks often lose vital frequency information for +perception, and image deraining aims to solve this issue, which relies on global +and local degradation modeling. Recent studies have witnessed the effectiveness +and efficiency of Mamba for perceiving global and local information by +exploiting local correlation among patches; however, few attempts have +been made to extend it with frequency analysis for image deraining, +limiting its ability to perceive global degradation that is relevant to +frequency modeling (e.g., the Fourier transform). In this paper, we propose +FreqMamba, an effective and efficient paradigm that leverages the complementarity +between Mamba and frequency analysis for image deraining. The core of our +method lies in extending Mamba with frequency analysis from two perspectives: +extending it with frequency bands to exploit frequency correlation, and +connecting it with the Fourier transform for global degradation modeling. +Specifically, FreqMamba introduces complementary triple interaction structures +including spatial Mamba, frequency band Mamba, and Fourier global modeling. +Frequency band Mamba decomposes the image into sub-bands of different +frequencies to allow 2D scanning from the frequency dimension. Furthermore, +leveraging Mamba's unique data-dependent properties, we use rainy images at +different scales to provide degradation priors to the network, thereby +facilitating efficient training. Extensive experiments show that our method +outperforms state-of-the-art methods both visually and quantitatively. + +
+
+
+
+
+ + ☆ Improving Weakly-Supervised Object Localization Using Adversarial + Erasing and Pseudo Label + + +
+ Weakly-supervised learning approaches have gained significant attention due +to their ability to reduce the effort required for human annotations in +training neural networks. This paper investigates a framework for +weakly-supervised object localization, which aims to train a neural network +capable of predicting both the object class and its location using only images +and their image-level class labels. The proposed framework consists of a shared +feature extractor, a classifier, and a localizer. The localizer predicts +pixel-level class probabilities, while the classifier predicts the object class +at the image level. Since image-level class labels are insufficient for +training the localizer, weakly-supervised object localization methods often +encounter challenges in accurately localizing the entire object region. To +address this issue, the proposed method incorporates adversarial erasing and +pseudo labels to improve localization accuracy. Specifically, novel losses are +designed to utilize adversarially erased foreground features and adversarially +erased feature maps, reducing dependence on the most discriminative region. +Additionally, the proposed method employs pseudo labels to suppress activation +values in the background while increasing them in the foreground. The proposed +method is applied to two backbone networks (MobileNetV1 and InceptionV3) and is +evaluated on three publicly available datasets (ILSVRC-2012, CUB-200-2011, and +PASCAL VOC 2012). The experimental results demonstrate that the proposed method +outperforms previous state-of-the-art methods across all evaluated metrics. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ TCCT-Net: Two-Stream Network Architecture for Fast and Efficient + Engagement Estimation via Behavioral Feature Signals CVPR 2024 + + +
+ Engagement analysis finds various applications in healthcare, education, +advertisement, and services. Deep Neural Networks used for such analysis possess +complex architectures and need large amounts of input data, computational power, +and inference time. These constraints make it challenging to embed such systems into devices for +real-time use. To address these limitations, we present a novel two-stream +feature fusion "Tensor-Convolution and Convolution-Transformer Network" +(TCCT-Net) architecture. To better learn the meaningful patterns in the +temporal-spatial domain, we design a "CT" stream that integrates a hybrid +convolutional-transformer. In parallel, to efficiently extract rich patterns +from the temporal-frequency domain and boost processing speed, we introduce a +"TC" stream that uses Continuous Wavelet Transform (CWT) to represent +information in a 2D tensor form. Evaluated on the EngageNet dataset, the +proposed method outperforms existing baselines, utilizing only two behavioral +features (head pose rotations) compared to the 98 used in baseline models. +Furthermore, comparative analysis shows TCCT-Net's architecture offers an +order-of-magnitude improvement in inference speed compared to state-of-the-art +image-based Recurrent Neural Network (RNN) methods. The code will be released +at https://github.com/vedernikovphoto/TCCT_Net. + +
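+ To illustrate the "TC" stream's use of the Continuous Wavelet Transform, the sketch below converts a 1D behavioral signal into a 2D time-frequency tensor with PyWavelets; the wavelet choice, scale range, sampling rate, and toy signal are assumptions, not the paper's configuration.
+
+ ```python
+ import numpy as np
+ import pywt
+
+ fs = 30.0                                  # assumed frames per second
+ t = np.arange(0, 10, 1.0 / fs)
+ head_yaw = np.sin(2 * np.pi * 0.5 * t)     # toy head-pose rotation signal
+
+ scales = np.arange(1, 64)
+ coeffs, freqs = pywt.cwt(head_yaw, scales, wavelet="morl",
+                          sampling_period=1.0 / fs)
+ tensor_2d = np.abs(coeffs)                 # (len(scales), len(t)) 2D input tensor
+ ```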
+
+ comment: Accepted for the CVPR 2024 workshop (ABAW) +
+
+
+
+
+ + ☆ Q2A: Querying Implicit Fully Continuous Feature Pyramid to Align + Features for Medical Image Segmentation + + +
+ Recent medical image segmentation methods apply implicit neural +representation (INR) to the decoder for achieving a continuous coordinate +decoding to tackle the drawback of conventional discrete grid-based data +representations. However, the INR-based decoder cannot well handle the feature +misalignment problem brought about by the naive latent code acquisition +strategy in INR. Although there exist many feature alignment works, they all +adopt a progressive multi-step aligning paradigm on a discrete feature pyramid, +which is incompatible with the continuous one-step characteristics of INR-based +decoder, and thus fails to be the solution. Therefore, we propose Q2A, a novel +one-step query-based aligning paradigm, to solve the feature misalignment +problem in the INR-based decoder. Specifically, for each target coordinate, Q2A +first generates several queries depicting the spatial offsets and the cell +resolutions of the contextual features aligned to the coordinate, then +calculates the corresponding aligned features by feeding the queries into a +novel implicit fully continuous feature pyramid (FCFP), finally fuses the +aligned features to predict the class distribution. In FCFP, we further propose +a novel universal partition-and-aggregate strategy (P&A) to replace the naive +interpolation strategy for latent code acquisition in INR, which mitigates the +information loss problem that occurs when the query cell resolution is +relatively large and achieves an effective feature decoding at arbitrary +continuous resolution. We conduct extensive experiments on two medical +datasets, i.e. Glas and Synapse, and a universal dataset, i.e. Cityscapes, and +they show the superiority of the proposed Q2A. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ Virtually Enriched NYU Depth V2 Dataset for Monocular Depth Estimation: + Do We Need Artificial Augmentation? + + +
+ We present ANYU, a new virtually augmented version of the NYU depth v2 +dataset, designed for monocular depth estimation. In contrast to the well-known +approach where full 3D scenes of a virtual world are utilized to generate +artificial datasets, ANYU was created by incorporating RGB-D representations of +virtual reality objects into the original NYU depth v2 images. We specifically +did not match each generated virtual object with an appropriate texture and a +suitable location within the real-world image. Instead, an assignment of +texture, location, lighting, and other rendering parameters was randomized to +maximize a diversity of the training data, and to show that it is randomness +that can improve the generalizing ability of a dataset. By conducting extensive +experiments with our virtually modified dataset and validating on the original +NYU depth v2 and iBims-1 benchmarks, we show that ANYU improves the monocular +depth estimation performance and generalization of deep neural networks with +considerably different architectures, especially for the current +state-of-the-art VPD model. To the best of our knowledge, this is the first +work that augments a real-world dataset with randomly generated virtual 3D +objects for monocular depth estimation. We make our ANYU dataset publicly +available in two training configurations with 10% and 100% additional +synthetically enriched RGB-D pairs of training images, respectively, for +efficient training and empirical exploration of virtual augmentation at +https://github.com/ABrain-One/ANYU + +
+
+
+
+
+ + ☆ PhyScene: Physically Interactable 3D Scene Synthesis for Embodied AI CVPR 2024 + + +
+ With recent developments in Embodied Artificial Intelligence (EAI) research, +there has been a growing demand for high-quality, large-scale interactive scene +generation. While prior methods in scene synthesis have prioritized the +naturalness and realism of the generated scenes, the physical plausibility and +interactivity of scenes have been largely left unexplored. To address this +disparity, we introduce PhyScene, a novel method dedicated to generating +interactive 3D scenes characterized by realistic layouts, articulated objects, +and rich physical interactivity tailored for embodied agents. Based on a +conditional diffusion model for capturing scene layouts, we devise novel +physics- and interactivity-based guidance mechanisms that integrate constraints +from object collision, room layout, and object reachability. Through extensive +experiments, we demonstrate that PhyScene effectively leverages these guidance +functions for physically interactable scene synthesis, outperforming existing +state-of-the-art scene synthesis methods by a large margin. Our findings +suggest that the scenes generated by PhyScene hold considerable potential for +facilitating diverse skill acquisition among agents within interactive +environments, thereby catalyzing further advancements in embodied AI research. +Project website: http://physcene.github.io. + +
+
+ comment: Accepted by CVPR 2024, 18 pages +
+
+
+
+
+ + ☆ Improved Object-Based Style Transfer with Single Deep Network + + +
+ This research paper proposes a novel methodology for image-to-image style +transfer on objects utilizing a single deep convolutional neural network. The +proposed approach leverages the You Only Look Once version 8 (YOLOv8) +segmentation model and the backbone neural network of YOLOv8 for style +transfer. The primary objective is to enhance the visual appeal of objects in +images by seamlessly transferring artistic styles while preserving the original +object characteristics. The proposed approach's novelty lies in combining +segmentation and style transfer in a single deep convolutional neural network. +This approach omits the need for multiple stages or models, thus resulting in +simpler training and deployment of the model for practical applications. The +results of this approach are shown on two content images by applying different +style images. The paper also demonstrates the ability to apply style transfer +on multiple objects in the same image. + +
+
+ comment: In Proceedings of the Fourth International Conference on Innovations + in Computational Intelligence and Computer Vision +
+
+
+
+
+ + ☆ CompGS: Efficient 3D Scene Representation via Compressed Gaussian + Splatting + + +
+ Gaussian splatting, renowned for its exceptional rendering quality and +efficiency, has emerged as a prominent technique in 3D scene representation. +However, the substantial data volume of Gaussian splatting impedes its +practical utility in real-world applications. Herein, we propose an efficient +3D scene representation, named Compressed Gaussian Splatting (CompGS), which +harnesses compact Gaussian primitives for faithful 3D scene modeling with a +remarkably reduced data size. To ensure the compactness of Gaussian primitives, +we devise a hybrid primitive structure that captures predictive relationships +between each other. Then, we exploit a small set of anchor primitives for +prediction, allowing the majority of primitives to be encapsulated into highly +compact residual forms. Moreover, we develop a rate-constrained optimization +scheme to eliminate redundancies within such hybrid primitives, steering our +CompGS towards an optimal trade-off between bitrate consumption and +representation efficacy. Experimental results show that the proposed CompGS +significantly outperforms existing methods, achieving superior compactness in +3D scene representation without compromising model accuracy and rendering +quality. Our code will be released on GitHub for further research. + +
+
+ comment: Submitted to a conference +
+
+
+
+
+ + ☆ Utility-Fairness Trade-Offs and How to Find Them + + +
+ When building classification systems with demographic fairness +considerations, there are two objectives to satisfy: 1) maximizing utility for +the specific task and 2) ensuring fairness w.r.t. a known demographic +attribute. These objectives often compete, so optimizing both can lead to a +trade-off between utility and fairness. While existing works acknowledge the +trade-offs and study their limits, two questions remain unanswered: 1) What are +the optimal trade-offs between utility and fairness? and 2) How can we +numerically quantify these trade-offs from data for a desired prediction task +and demographic attribute of interest? This paper addresses these questions. We +introduce two utility-fairness trade-offs: the Data-Space and Label-Space +Trade-off. The trade-offs reveal three regions within the utility-fairness +plane, delineating what is fully and partially possible and impossible. We +propose U-FaTE, a method to numerically quantify the trade-offs for a given +prediction task and group fairness definition from data samples. Based on the +trade-offs, we introduce a new scheme for evaluating representations. An +extensive evaluation of fair representation learning methods and +representations from over 1000 pre-trained models revealed that most current +approaches are far from the estimated and achievable fairness-utility +trade-offs across multiple datasets and prediction tasks. + +
+
+ comment: IEEE/CVF Conference on Computer Vision and Pattern Recognition, 2024 +
+
+
+
+
+ + ☆ Contrastive Mean-Shift Learning for Generalized Category Discovery CVPR 2024 + + +
+ We address the problem of generalized category discovery (GCD) that aims to +partition a partially labeled collection of images; only a small part of the +collection is labeled and the total number of target classes is unknown. To +address this generalized image clustering problem, we revisit the mean-shift +algorithm, i.e., a classic, powerful technique for mode seeking, and +incorporate it into a contrastive learning framework. The proposed method, +dubbed Contrastive Mean-Shift (CMS) learning, trains an image encoder to +produce representations with better clustering properties by an iterative +process of mean shift and contrastive update. Experiments demonstrate that our +method, both in settings with and without the total number of clusters being +known, achieves state-of-the-art performance on six public GCD benchmarks +without bells and whistles. + +
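+ A minimal numpy sketch of one mean-shift step on L2-normalized embeddings, the mode-seeking ingredient that the method combines with contrastive updates; the Gaussian kernel, bandwidth, and the way shifted embeddings enter the contrastive loss are assumptions rather than the paper's exact procedure.
+
+ ```python
+ import numpy as np
+
+ def mean_shift_step(z, bandwidth=0.5):
+     """One mean-shift update: move each embedding toward the kernel-weighted
+     mean of all embeddings (a simple mode-seeking step), then renormalize."""
+     z = z / np.linalg.norm(z, axis=1, keepdims=True)
+     d2 = ((z[:, None, :] - z[None, :, :]) ** 2).sum(-1)   # pairwise sq. distances
+     w = np.exp(-d2 / (2 * bandwidth ** 2))                # Gaussian kernel weights
+     shifted = w @ z / w.sum(axis=1, keepdims=True)
+     return shifted / np.linalg.norm(shifted, axis=1, keepdims=True)
+
+ z = np.random.randn(128, 64)
+ z_shifted = mean_shift_step(z)   # e.g., used as targets in a contrastive update
+ ```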
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ kNN-CLIP: Retrieval Enables Training-Free Segmentation on Continually + Expanding Large Vocabularies + + +
+ Rapid advancements in continual segmentation have yet to bridge the gap of +scaling to large continually expanding vocabularies under compute-constrained +scenarios. We discover that traditional continual training leads to +catastrophic forgetting under compute constraints, unable to outperform +zero-shot segmentation methods. We introduce a novel strategy for semantic and +panoptic segmentation with zero forgetting, capable of adapting to continually +growing vocabularies without the need for retraining or large memory costs. Our +training-free approach, kNN-CLIP, leverages a database of instance embeddings +to enable open-vocabulary segmentation approaches to continually expand their +vocabulary on any given domain with a single-pass through data, while only +storing embeddings minimizing both compute and memory costs. This method +achieves state-of-the-art mIoU performance across large-vocabulary semantic and +panoptic segmentation datasets. We hope kNN-CLIP represents a step forward in +enabling more efficient and adaptable continual segmentation, paving the way +for advances in real-world large-vocabulary continual segmentation methods. + +
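+ The retrieval idea can be sketched as a cosine-similarity kNN vote over a database of labeled instance embeddings, so new classes are added by appending rows instead of retraining; the embedding source and the value of k here are assumptions.
+
+ ```python
+ import numpy as np
+
+ def knn_classify(query, db_embs, db_labels, k=5):
+     """Label a query embedding by majority vote over its k nearest database
+     entries under cosine similarity."""
+     q = query / np.linalg.norm(query)
+     db = db_embs / np.linalg.norm(db_embs, axis=1, keepdims=True)
+     sims = db @ q
+     top = np.argsort(-sims)[:k]
+     labels, counts = np.unique(db_labels[top], return_counts=True)
+     return labels[np.argmax(counts)]
+
+ db_embs = np.random.randn(1000, 512)          # stored instance embeddings
+ db_labels = np.random.randint(0, 20, 1000)    # their class ids
+ pred = knn_classify(np.random.randn(512), db_embs, db_labels)
+ ```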
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ Exploring Text-to-Motion Generation with Human Preference CVPR 2024 + + +
+ This paper presents an exploration of preference learning in text-to-motion +generation. We find that current improvements in text-to-motion generation +still rely on datasets requiring expert labelers with motion capture systems. +Instead, learning from human preference data does not require motion capture +systems; a labeler with no expertise simply compares two generated motions. +This is particularly efficient because evaluating the model's output is easier +than gathering the motion that performs a desired task (e.g. backflip). To +pioneer the exploration of this paradigm, we annotate 3,528 preference pairs +generated by MotionGPT, marking the first effort to investigate various +algorithms for learning from preference data. In particular, our exploration +highlights important design choices when using preference data. Additionally, +our experimental results show that preference learning has the potential to +greatly improve current text-to-motion generative models. Our code and dataset +are publicly available at +https://github.com/THU-LYJ-Lab/InstructMotion +to further facilitate research in this area. + +
+
+ comment: Accepted to CVPR 2024 HuMoGen Workshop +
+
+
+
+
+ + ☆ The 8th AI City Challenge CVPR 2024 + + +
+ The eighth AI City Challenge highlighted the convergence of computer vision +and artificial intelligence in areas like retail, warehouse settings, and +Intelligent Traffic Systems (ITS), presenting significant research +opportunities. The 2024 edition featured five tracks, attracting unprecedented +interest from 726 teams in 47 countries and regions. Track 1 dealt with +multi-target multi-camera (MTMC) people tracking, highlighting significant +enhancements in camera count, character number, 3D annotation, and camera +matrices, alongside new rules for 3D tracking and online tracking algorithm +encouragement. Track 2 introduced dense video captioning for traffic safety, +focusing on pedestrian accidents using multi-camera feeds to improve insights +for insurance and prevention. Track 3 required teams to classify driver actions +in a naturalistic driving analysis. Track 4 explored fish-eye camera analytics +using the FishEye8K dataset. Track 5 focused on motorcycle helmet rule +violation detection. The challenge utilized two leaderboards to showcase +methods, with participants setting new benchmarks, some surpassing existing +state-of-the-art achievements. + +
+
+ comment: Summary of the 8th AI City Challenge Workshop in conjunction with + CVPR 2024 +
+
+
+
+
+ + ☆ VFMM3D: Releasing the Potential of Image by Vision Foundation Model for + Monocular 3D Object Detection + + +
+ Due to its cost-effectiveness and widespread availability, monocular 3D +object detection, which relies solely on a single camera during inference, +holds significant importance across various applications, including autonomous +driving and robotics. Nevertheless, directly predicting the coordinates of +objects in 3D space from monocular images poses challenges. Therefore, an +effective solution involves transforming monocular images into LiDAR-like +representations and employing a LiDAR-based 3D object detector to predict the +3D coordinates of objects. The key step in this method is accurately converting +the monocular image into a reliable point cloud form. In this paper, we present +VFMM3D, an innovative approach that leverages the capabilities of Vision +Foundation Models (VFMs) to accurately transform single-view images into LiDAR +point cloud representations. VFMM3D utilizes the Segment Anything Model (SAM) +and Depth Anything Model (DAM) to generate high-quality pseudo-LiDAR data +enriched with rich foreground information. Specifically, the Depth Anything +Model (DAM) is employed to generate dense depth maps. Subsequently, the Segment +Anything Model (SAM) is utilized to differentiate foreground and background +regions by predicting instance masks. These predicted instance masks and depth +maps are then combined and projected into 3D space to generate pseudo-LiDAR +points. Finally, any object detectors based on point clouds can be utilized to +predict the 3D coordinates of objects. Comprehensive experiments are conducted +on the challenging 3D object detection dataset KITTI. Our VFMM3D establishes a +new state-of-the-art performance. Additionally, experimental results +demonstrate the generality of VFMM3D, showcasing its seamless integration into +various LiDAR-based 3D object detectors. + +
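+ The pseudo-LiDAR step, turning a dense depth map plus foreground masks into 3D points, can be sketched with a standard pinhole back-projection; the intrinsics and mask below are toy placeholders rather than actual outputs of the Depth Anything or Segment Anything models.
+
+ ```python
+ import numpy as np
+
+ def depth_to_pseudo_lidar(depth, mask, fx, fy, cx, cy):
+     """Back-project masked depth pixels (u, v, z) into camera-frame 3D points."""
+     v, u = np.nonzero(mask)                 # foreground pixel coordinates
+     z = depth[v, u]
+     x = (u - cx) * z / fx
+     y = (v - cy) * z / fy
+     return np.stack([x, y, z], axis=1)      # (N, 3) pseudo-LiDAR points
+
+ depth = np.random.uniform(1.0, 50.0, (375, 1242))     # toy dense depth map
+ mask = np.zeros_like(depth, dtype=bool)
+ mask[100:200, 300:500] = True                          # toy instance mask
+ points = depth_to_pseudo_lidar(depth, mask,
+                                fx=721.5, fy=721.5, cx=609.6, cy=172.9)
+ ```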
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ ViFu: Multiple 360$^\circ$ Objects Reconstruction with Clean Background + via Visible Part Fusion + + +
+ In this paper, we propose a method to segment and recover a static, clean +background and multiple 360$^\circ$ objects from observations of scenes at +different timestamps. Recent works have used neural radiance fields to model 3D +scenes and improved the quality of novel view synthesis, while few studies have +focused on modeling the invisible or occluded parts of the training images. +These under-reconstructed parts constrain both scene editing and rendering +view selection, thereby limiting their utility for synthetic data generation +for downstream tasks. Our basic idea is that, by observing the same set of +objects in various arrangements, parts that are invisible in one scene +may become visible in others. By fusing the visible parts from each scene, +occlusion-free rendering of both background and foreground objects can be +achieved. + We decompose the multi-scene fusion task into two main components: (1) +objects/background segmentation and alignment, where we leverage point +cloud-based methods tailored to our novel problem formulation; (2) radiance +fields fusion, where we introduce a visibility field to quantify the visible +information of radiance fields, and propose visibility-aware rendering for the +fusion of a series of scenes, ultimately obtaining clean background and +360$^\circ$ object renderings. Comprehensive experiments were conducted on +synthetic and real datasets, and the results demonstrate the effectiveness of +our method. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Super-resolution of biomedical volumes with 2D supervision CVPR + + +
+ Volumetric biomedical microscopy has the potential to increase the diagnostic +information extracted from clinical tissue specimens and improve the diagnostic +accuracy of both human pathologists and computational pathology models. +Unfortunately, barriers to integrating 3-dimensional (3D) volumetric microscopy +into clinical medicine include long imaging times, poor depth / z-axis +resolution, and an insufficient amount of high-quality volumetric data. +Leveraging the abundance of high-resolution 2D microscopy data, we introduce +masked slice diffusion for super-resolution (MSDSR), which exploits the +inherent equivalence in the data-generating distribution across all spatial +dimensions of biological specimens. This intrinsic characteristic allows for +super-resolution models trained on high-resolution images from one plane (e.g., +XY) to effectively generalize to others (XZ, YZ), overcoming the traditional +dependency on orientation. We focus on the application of MSDSR to stimulated +Raman histology (SRH), an optical imaging modality for biological specimen +analysis and intraoperative diagnosis, characterized by its rapid acquisition +of high-resolution 2D images but slow and costly optical z-sectioning. To +evaluate MSDSR's efficacy, we introduce a new performance metric, SliceFID, and +demonstrate MSDSR's superior performance over baseline models through extensive +evaluations. Our findings reveal that MSDSR not only significantly enhances the +quality and resolution of 3D volumetric data, but also addresses major +obstacles hindering the broader application of 3D volumetric microscopy in +clinical diagnostics and biomedical research. + +
+
+ comment: CVPR Workshop on Computer Vision for Microscopy Image Analysis 2024 +
+
+
+
+
+ + ☆ A Review on Machine Learning Algorithms for Dust Aerosol Detection using + Satellite Data + + +
+ Dust storms are associated with certain respiratory illnesses across +different areas in the world. Researchers have devoted time and resources to +study the elements surrounding dust storm phenomena. This paper reviews the +efforts of those who have investigated dust aerosols using sensors onboard +satellites with machine learning-based approaches. We have reviewed the most +common issues surrounding dust aerosol modeling with different datasets and +different sensors from a historical perspective. Our findings suggest that +multi-spectral approaches based on linear and non-linear combinations of +spectral bands are some of the most successful for visualization and +quantitative analysis; however, when researchers have leveraged machine +learning, performance has improved and new opportunities to solve unique +problems have arisen. + +
+
+ comment: The 23rd International Conference on Artificial Intelligence (ICAI + 2021) +
+
+
+
+
+ + ☆ DeferredGS: Decoupled and Editable Gaussian Splatting with Deferred + Shading + + +
+ Reconstructing and editing 3D objects and scenes both play crucial roles in +computer graphics and computer vision. Neural radiance fields (NeRFs) can +achieve realistic reconstruction and editing results but suffer from +inefficiency in rendering. Gaussian splatting significantly accelerates +rendering by rasterizing Gaussian ellipsoids. However, Gaussian splatting +utilizes a single Spherical Harmonic (SH) function to model both texture and +lighting, limiting independent editing capabilities of these components. +Recently, attempts have been made to decouple texture and lighting with the +Gaussian splatting representation but may fail to produce plausible geometry +and decomposition results on reflective scenes. Additionally, the forward +shading technique they employ introduces noticeable blending artifacts during +relighting, as the geometry attributes of Gaussians are optimized under the +original illumination and may not be suitable for novel lighting conditions. To +address these issues, we introduce DeferredGS, a method for decoupling and +editing the Gaussian splatting representation using deferred shading. To +achieve successful decoupling, we model the illumination with a learnable +environment map and define additional attributes such as texture parameters and +normal direction on Gaussians, where the normal is distilled from a jointly +trained signed distance function. More importantly, we apply deferred shading, +resulting in more realistic relighting effects compared to previous methods. +Both qualitative and quantitative experiments demonstrate the superior +performance of DeferredGS in novel view synthesis and editing tasks. + +
+
+
+
+
+ + ☆ Human-in-the-Loop Segmentation of Multi-species Coral Imagery + + +
+ Broad-scale marine surveys performed by underwater vehicles significantly +increase the availability of coral reef imagery; however, it is costly and +time-consuming for domain experts to label images. Point label propagation is +an approach used to leverage existing image data labeled with sparse point +labels. The resulting augmented ground truth is then used to train a +semantic segmentation model. Here, we first demonstrate that recent advances in +foundation models enable generation of multi-species coral augmented ground +truth masks using denoised DINOv2 features and K-Nearest Neighbors (KNN), +without the need for any pre-training or custom-designed algorithms. For +extremely sparsely labeled images, we propose a labeling regime based on +human-in-the-loop principles, resulting in significant improvement in +annotation efficiency: if only 5 point labels per image are available, our +proposed human-in-the-loop approach improves on the state-of-the-art by 17.3% +for pixel accuracy and 22.6% for mIoU; and by 10.6% and 19.1% when 10 point +labels per image are available. Even if the human-in-the-loop labeling regime +is not used, the denoised DINOv2 features with a KNN outperform the prior +state-of-the-art by 3.5% for pixel accuracy and 5.7% for mIoU (5 grid points). +We also provide a detailed analysis of how point labeling style and the +quantity of points per image affect the point label propagation quality and +provide general recommendations on maximizing point label efficiency. + +
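+ The point label propagation step can be sketched as fitting a k-nearest-neighbor classifier on per-pixel features and predicting a dense mask; the random array below stands in for upsampled DINOv2 features, and the point labels and choice of k are illustrative assumptions.
+
+ ```python
+ import numpy as np
+ from sklearn.neighbors import KNeighborsClassifier
+
+ # Placeholder per-pixel features (H, W, C), e.g. upsampled DINOv2 patch features.
+ H, W, C = 128, 128, 384
+ feats = np.random.randn(H, W, C)
+
+ # Sparse point labels: (row, col, class_id) triples provided by an annotator.
+ points = [(10, 22, 0), (64, 70, 1), (100, 30, 1), (40, 90, 2), (90, 110, 0)]
+ X = np.array([feats[r, c] for r, c, _ in points])
+ y = np.array([cls for _, _, cls in points])
+
+ knn = KNeighborsClassifier(n_neighbors=1).fit(X, y)
+ augmented_mask = knn.predict(feats.reshape(-1, C)).reshape(H, W)  # dense labels
+ ```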
+
+ comment: 10 pages, 6 figures, an additional 4 pages of supplementary material +
+
+
+
+
+ + ☆ Watermark-embedded Adversarial Examples for Copyright Protection against + Diffusion Models + + +
+ Diffusion Models (DMs) have shown remarkable capabilities in various +image-generation tasks. However, there are growing concerns that DMs could be +used to imitate unauthorized creations and thus raise copyright issues. To +address this issue, we propose a novel framework that embeds personal +watermarks in the generation of adversarial examples. Such examples can force +DMs to generate images with visible watermarks and prevent DMs from imitating +unauthorized images. We construct a generator based on conditional adversarial +networks and design three losses (adversarial loss, GAN loss, and perturbation +loss) to generate adversarial examples that have subtle perturbation but can +effectively attack DMs to prevent copyright violations. Training a generator +for a personal watermark by our method only requires 5-10 samples within 2-3 +minutes, and once the generator is trained, it can generate adversarial +examples with that watermark significantly fast (0.2s per image). We conduct +extensive experiments in various conditional image-generation scenarios. +Compared to existing methods that generate images with chaotic textures, our +method adds visible watermarks on the generated images, which is a more +straightforward way to indicate copyright violations. We also observe that our +adversarial examples exhibit good transferability across unknown generative +models. Therefore, this work provides a simple yet powerful way to protect +copyright from DM-based imitation. + +
+
+
+
+
+ + ☆ Masked and Shuffled Blind Spot Denoising for Real-World Images + + +
+ We introduce a novel approach to single image denoising based on the Blind +Spot Denoising principle, which we call MAsked and SHuffled Blind Spot +Denoising (MASH). We focus on the case of correlated noise, which often plagues +real images. MASH is the result of a careful analysis to determine the +relationships between the level of blindness (masking) of the input and the +(unknown) noise correlation. Moreover, we introduce a shuffling technique to +weaken the local correlation of noise, which in turn yields an additional +denoising performance improvement. We evaluate MASH via extensive experiments +on real-world noisy image datasets. We demonstrate on par or better results +compared to existing self-supervised denoising methods. + +
+
+
+
+
+ + ☆ RankCLIP: Ranking-Consistent Language-Image Pretraining + + +
+ Among the ever-evolving development of vision-language models, contrastive +language-image pretraining (CLIP) has set new benchmarks in many downstream +tasks such as zero-shot classifications by leveraging self-supervised +contrastive learning on large amounts of text-image pairs. However, its +dependency on rigid one-to-one mappings overlooks the complex and often +multifaceted relationships between and within texts and images. To this end, we +introduce RankCLIP, a novel pretraining method that extends beyond the rigid +one-to-one matching framework of CLIP and its variants. By leveraging both +in-modal and cross-modal ranking consistency, RankCLIP improves the alignment +process, enabling it to capture the nuanced many-to-many relationships between +and within each modality. Through comprehensive experiments, we demonstrate the +enhanced capability of RankCLIP to effectively improve performance across +various downstream tasks, notably achieving significant gains in zero-shot +classifications over state-of-the-art methods, underscoring the potential of +RankCLIP in further advancing vision-language pretraining. + +
+
+ comment: 10 pages, 3 figures, 6 tables. Code and model checkpoints are + available at https://github.com/Jam1ezhang/RankCLIP +
+
+
+
+
+ + ☆ CryoMAE: Few-Shot Cryo-EM Particle Picking with Masked Autoencoders + + +
+ Cryo-electron microscopy (cryo-EM) emerges as a pivotal technology for +determining the architecture of cells, viruses, and protein assemblies at +near-atomic resolution. Traditional particle picking, a key step in cryo-EM, +struggles with manual effort and automated methods' sensitivity to low +signal-to-noise ratio (SNR) and varied particle orientations. Furthermore, +existing neural network (NN)-based approaches often require extensive labeled +datasets, limiting their practicality. To overcome these obstacles, we +introduce cryoMAE, a novel approach based on few-shot learning that harnesses +the capabilities of Masked Autoencoders (MAE) to enable efficient selection of +single particles in cryo-EM images. Contrary to conventional NN-based +techniques, cryoMAE requires only a minimal set of positive particle images for +training yet demonstrates high performance in particle detection. Furthermore, +the implementation of a self-cross similarity loss ensures distinct features +for particle and background regions, thereby enhancing the discrimination +capability of cryoMAE. Experiments on large-scale cryo-EM datasets show that +cryoMAE outperforms existing state-of-the-art (SOTA) methods, improving 3D +reconstruction resolution by up to 22.4%. + +
+
+
+
+
+ + ☆ PD-L1 Classification of Weakly-Labeled Whole Slide Images of Breast + Cancer + + +
+ Specific and effective breast cancer therapy relies on the accurate +quantification of PD-L1 positivity in tumors, which appears in the form of +brown stainings in high resolution whole slide images (WSIs). However, the +retrieval and extensive labeling of PD-L1 stained WSIs is a time-consuming and +challenging task for pathologists, resulting in low reproducibility, especially +for borderline images. This study aims to develop and compare models able to +classify PD-L1 positivity of breast cancer samples based on WSI analysis, +relying only on WSI-level labels. The task consists of two phases: identifying +regions of interest (ROI) and classifying tumors as PD-L1 positive or negative. +For the latter, two model categories were developed, with different feature +extraction methodologies. The first encodes images based on the colour distance +from a base colour. The second uses a convolutional autoencoder to obtain +embeddings of WSI tiles, and aggregates them into a WSI-level embedding. For +both model types, features are fed into downstream ML classifiers. Two datasets +from different clinical centers were used in two different training +configurations: (1) training on one dataset and testing on the other; (2) +combining the datasets. We also tested the performance with and without human +preprocessing to remove brown artefacts. Colour-distance-based models achieve +the best performance in testing configuration (1) with artefact removal, while +autoencoder-based models are superior in the remaining cases, which are prone +to greater data variability. + +
+
+
+
+
+ + ☆ Forensic Iris Image-Based Post-Mortem Interval Estimation + + +
+ Post-mortem iris recognition is an emerging application of iris-based human +identification in a forensic setup. One factor that may be useful in +conditioning iris recognition methods is the tissue decomposition level, which +is correlated with the post-mortem interval (PMI), i.e., the number of hours +that have elapsed since death. PMI, however, is not always available, and its +precise estimation remains one of the core challenges in forensic examination. +This paper presents the first method known to us for PMI estimation directly +from forensic iris images. To assess the feasibility of the iris-based PMI +estimation, convolutional neural networks-based models (VGG19, DenseNet121, +ResNet152, and Inception_v3) were trained to predict the PMI from (a) +near-infrared (NIR), (b) visible (RGB), and (c) multispectral forensic iris +images. Models were evaluated following a 10-fold cross-validation in (S1) +sample-disjoint, (S2) subject-disjoint, and (S3) cross-dataset scenarios. We +found that using the multispectral data offers a remarkably low mean +absolute error (MAE) of approximately 3.5 hours in scenario (S1), a somewhat worse +MAE of approximately 17.5 hours in scenario (S2), and an MAE of approximately +69.0 hours in scenario (S3). This suggests that if the environmental +conditions are favorable (e.g., bodies are kept in low temperatures), forensic +iris images provide features that are indicative of the PMI and can be +automatically estimated. The source code and model weights are made available +with the paper. + +
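+ A hedged sketch of one of the listed backbones (ResNet152 from torchvision) adapted to PMI regression with an L1 objective, which directly optimizes the reported MAE; the regression head, optimizer, input size, and toy batch are assumptions, not the paper's training setup.
+
+ ```python
+ import torch
+ import torch.nn as nn
+ from torchvision import models
+
+ model = models.resnet152(weights=None)          # or ImageNet weights for transfer
+ model.fc = nn.Linear(model.fc.in_features, 1)   # single output: PMI in hours
+
+ criterion = nn.L1Loss()                         # optimizes mean absolute error
+ optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
+
+ images = torch.randn(8, 3, 224, 224)            # toy batch of iris images
+ pmi_hours = torch.rand(8, 1) * 400              # toy regression targets
+ loss = criterion(model(images), pmi_hours)
+ loss.backward()
+ optimizer.step()
+ ```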
+
+
+
+
+ + ☆ High-Resolution Detection of Earth Structural Heterogeneities from + Seismic Amplitudes using Convolutional Neural Networks with Attention layers + + +
+ Earth structural heterogeneities have a remarkable role in the petroleum +economy for both exploration and production projects. Automatic detection of +detailed structural heterogeneities is challenging when considering modern +machine learning techniques like deep neural networks. Typically, these +techniques can be an excellent tool for assisted interpretation of such +heterogeneities, but it heavily depends on the amount of data to be trained. + We propose an efficient and cost-effective architecture for detecting seismic +structural heterogeneities using Convolutional Neural Networks (CNNs) combined +with Attention layers. The attention mechanism reduces costs and enhances +accuracy, even in cases with relatively noisy data. Our model has half the +parameters compared to the state-of-the-art, and it outperforms previous +methods in terms of Intersection over Union (IoU) by 0.6% and precision by +0.4%. By leveraging synthetic data, we apply transfer learning to train and +fine-tune the model, addressing the challenge of limited annotated data +availability. + +
+
+
+
+
+ + ☆ Self-Supervised Learning Featuring Small-Scale Image Dataset for + Treatable Retinal Diseases Classification + + +
+ Automated medical diagnosis through image-based neural networks has increased +in popularity and matured over the years. Nevertheless, it is constrained by the +scarcity of medical images and expensive annotation costs. +Self-Supervised Learning (SSL) is a good alternative to Transfer Learning (TL) +and is suitable for imbalanced image datasets. In this study, we assess four +pretrained SSL models and two TL models in treatable retinal disease +classification using small-scale Optical Coherence Tomography (OCT) training sets +ranging from 125 to 4,000 images with balanced or imbalanced distributions. +The proposed SSL model achieves state-of-the-art accuracy of 98.84% using only +4,000 training images. Our results suggest the SSL models provide superior +performance under both the balanced and imbalanced training scenarios. The SSL +model with the MoCo-v2 scheme has consistently good performance under the imbalanced +scenario and, in particular, surpasses the other models when the training set contains +fewer than 500 images. + +
+
+
+
+
+ + ☆ EyeFormer: Predicting Personalized Scanpaths with Transformer-Guided + Reinforcement Learning + + +
+ From a visual perception perspective, modern graphical user interfaces (GUIs) +comprise a complex graphics-rich two-dimensional visuospatial arrangement of +text, images, and interactive objects such as buttons and menus. While existing +models can accurately predict regions and objects that are likely to attract +attention ``on average'', so far there is no scanpath model capable of +predicting scanpaths for an individual. To close this gap, we introduce +EyeFormer, which leverages a Transformer architecture as a policy network to +guide a deep reinforcement learning algorithm that controls gaze locations. Our +model has the unique capability of producing personalized predictions when +given a few user scanpath samples. It can predict full scanpath information, +including fixation positions and duration, across individuals and various +stimulus types. Additionally, we demonstrate applications in GUI layout +optimization driven by our model. Our software and models will be publicly +available. + +
+
+
+
+
+ + ☆ Salient Object-Aware Background Generation using Text-Guided Diffusion + Models CVPR 2024 + + +
+ Generating background scenes for salient objects plays a crucial role across +various domains including creative design and e-commerce, as it enhances the +presentation and context of subjects by integrating them into tailored +environments. Background generation can be framed as a task of text-conditioned +outpainting, where the goal is to extend image content beyond a salient +object's boundaries on a blank background. Although popular diffusion models +for text-guided inpainting can also be used for outpainting by mask inversion, +they are trained to fill in missing parts of an image rather than to place an +object into a scene. Consequently, when used for background creation, +inpainting models frequently extend the salient object's boundaries and thereby +change the object's identity, which is a phenomenon we call "object expansion." +This paper introduces a model for adapting inpainting diffusion models to the +salient object outpainting task using Stable Diffusion and ControlNet +architectures. We present a series of qualitative and quantitative results +across models and datasets, including a newly proposed metric to measure object +expansion that does not require any human labeling. Compared to Stable +Diffusion 2.0 Inpainting, our proposed approach reduces object expansion by +3.6x on average with no degradation in standard visual metrics across multiple +datasets. + +
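+ The paper's object-expansion metric is not spelled out in the abstract; the sketch below shows one simple, label-free way to quantify expansion from salient-object masks before and after outpainting, offered only as an illustration of the concept rather than the proposed metric.
+
+ ```python
+ import numpy as np
+
+ def expansion_ratio(mask_before, mask_after):
+     """Fraction of newly added object area relative to the original object:
+     0 means no expansion; larger values mean the object grew past its bounds."""
+     before = mask_before.astype(bool)
+     after = mask_after.astype(bool)
+     added = np.logical_and(after, ~before).sum()
+     return added / max(before.sum(), 1)
+
+ m0 = np.zeros((64, 64), bool); m0[20:40, 20:40] = True   # original salient object
+ m1 = np.zeros((64, 64), bool); m1[18:42, 18:44] = True   # object after outpainting
+ print(expansion_ratio(m0, m1))
+ ```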
+
+ comment: Accepted for publication at CVPR 2024's Generative Models for + Computer Vision workshop +
+
+
+
+
+ + ☆ SegFormer3D: an Efficient Transformer for 3D Medical Image Segmentation CVPR + + +
+ The adoption of Vision Transformers (ViTs) based architectures represents a +significant advancement in 3D Medical Image (MI) segmentation, surpassing +traditional Convolutional Neural Network (CNN) models by enhancing global +contextual understanding. While this paradigm shift has significantly enhanced +3D segmentation performance, state-of-the-art architectures require extremely +large and complex architectures with large scale computing resources for +training and deployment. Furthermore, in the context of limited datasets, often +encountered in medical imaging, larger models can present hurdles in both model +generalization and convergence. In response to these challenges and to +demonstrate that lightweight models are a valuable area of research in 3D +medical imaging, we present SegFormer3D, a hierarchical Transformer that +calculates attention across multiscale volumetric features. Additionally, +SegFormer3D avoids complex decoders and uses an all-MLP decoder to aggregate +local and global attention features to produce highly accurate segmentation +masks. The proposed memory efficient Transformer preserves the performance +characteristics of a significantly larger model in a compact design. +SegFormer3D democratizes deep learning for 3D medical image segmentation by +offering a model with 33x less parameters and a 13x reduction in GFLOPS +compared to the current state-of-the-art (SOTA). We benchmark SegFormer3D +against the current SOTA models on three widely used datasets Synapse, BRaTs, +and ACDC, achieving competitive results. Code: +https://github.com/OSUPCVLab/SegFormer3D.git + +
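+ A rough sketch of the all-MLP decoder idea over multiscale volumetric features: each scale is linearly projected to a common width, trilinearly upsampled to the finest resolution, concatenated, and fused by further linear layers. The channel widths, scales, and class count below are assumptions, not the released SegFormer3D configuration.
+
+ ```python
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class AllMLPDecoder3D(nn.Module):
+     """Project multiscale 3D features, upsample, concatenate, fuse, predict."""
+     def __init__(self, in_chs=(32, 64, 160, 256), dim=128, num_classes=4):
+         super().__init__()
+         self.proj = nn.ModuleList(nn.Linear(c, dim) for c in in_chs)
+         self.fuse = nn.Linear(dim * len(in_chs), dim)
+         self.head = nn.Linear(dim, num_classes)
+
+     def forward(self, feats):                     # feats: list of (B, C_i, D, H, W)
+         target = feats[0].shape[2:]
+         ups = []
+         for f, proj in zip(feats, self.proj):
+             f = proj(f.permute(0, 2, 3, 4, 1))    # channel-last for the MLP
+             f = f.permute(0, 4, 1, 2, 3)
+             ups.append(F.interpolate(f, size=target, mode="trilinear",
+                                      align_corners=False))
+         x = torch.cat(ups, dim=1).permute(0, 2, 3, 4, 1)
+         return self.head(self.fuse(x)).permute(0, 4, 1, 2, 3)  # (B, classes, D, H, W)
+
+ feats = [torch.randn(1, c, 32 // s, 32 // s, 32 // s)
+          for c, s in zip((32, 64, 160, 256), (1, 2, 4, 8))]
+ logits = AllMLPDecoder3D()(feats)
+ ```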
+
+ comment: Accepted at CVPR Workshop 2024 +
+
+
+
+
+ + ☆ Eyes on the Streets: Leveraging Street-Level Imaging to Model Urban + Crime Dynamics + + +
+ This study addresses the challenge of urban safety in New York City by +examining the relationship between the built environment and crime rates using +machine learning and a comprehensive dataset of street view images. We aim to +identify how urban landscapes correlate with crime statistics, focusing on the +characteristics of street views and their association with crime rates. The +findings offer insights for urban planning and crime prevention, highlighting +the potential of environmental design in enhancing public safety. + +
+
+
+
+
+ + ☆ Cross-Modal Self-Training: Aligning Images and Pointclouds to Learn + Classification without Labels CVPR 2024 + + +
+ Large-scale 2D vision-language models, such as CLIP, can be aligned +with a 3D encoder to learn generalizable (open-vocabulary) 3D vision models. +However, current methods require supervised pre-training for such alignment, +and the performance of such 3D zero-shot models remains sub-optimal for +real-world adaptation. In this work, we propose an optimization framework: +Cross-MoST: Cross-Modal Self-Training, to improve the label-free classification +performance of a zero-shot 3D vision model by simply leveraging unlabeled 3D +data and their accompanying 2D views. We propose a student-teacher framework to +simultaneously process 2D views and 3D point clouds and generate joint pseudo +labels to train a classifier and guide cross-modal feature alignment. We thereby +demonstrate that 2D vision-language models such as CLIP can be used to +complement 3D representation learning to improve classification performance +without the need for expensive class annotations. Using synthetic and +real-world 3D datasets, we further demonstrate that Cross-MoST enables +efficient cross-modal knowledge exchange, resulting in both image and point +cloud modalities learning from each other's rich representations. + +
+
+ comment: To be published in Workshop for Learning 3D with Multi-View + Supervision (3DMV) at CVPR 2024 +
+
+
+
+
+ + ☆ ANCHOR: LLM-driven News Subject Conditioning for Text-to-Image Synthesis + + +
+ Text-to-Image (T2I) Synthesis has made tremendous strides in enhancing +synthesized image quality, but current datasets evaluate model performance only +on descriptive, instruction-based prompts. Real-world news image captions take +a more pragmatic approach, providing high-level situational and Named-Entity +(NE) information and limited physical object descriptions, making them +abstractive. To evaluate the ability of T2I models to capture intended subjects +from news captions, we introduce the Abstractive News Captions with High-level +cOntext Representation (ANCHOR) dataset, containing 70K+ samples sourced from 5 +different news media organizations. With Large Language Models (LLM) achieving +success in language and commonsense reasoning tasks, we explore the ability of +different LLMs to identify and understand key subjects from abstractive +captions. Our proposed method Subject-Aware Finetuning (SAFE), selects and +enhances the representation of key subjects in synthesized images by leveraging +LLM-generated subject weights. It also adapts to the domain distribution of +news images and captions through custom Domain Fine-tuning, outperforming +current T2I baselines on ANCHOR. By launching the ANCHOR dataset, we hope to +motivate research in furthering the Natural Language Understanding (NLU) +capabilities of T2I models. + +
+
+ comment: 23 pages, 9 figures +
+
+
+
+
+ + ☆ WB LUTs: Contrastive Learning for White Balancing Lookup Tables + + +
+ Automatic white balancing (AWB), one of the first steps in an integrated signal processing (ISP) pipeline, aims to correct the color cast induced by the scene illuminant. An incorrect white balance (WB) setting or AWB failure can lead to an undesired blue or red tint in the rendered sRGB image. To address this, recent methods pose the post-capture WB correction problem as an image-to-image translation task and train deep neural networks to learn the necessary color adjustments at a lower resolution. These low-resolution outputs are post-processed to generate high-resolution WB-corrected images, forming a bottleneck in the end-to-end run time. In this paper we present a 3D Lookup Table (LUT) based WB correction model called WB LUTs that can generate high-resolution outputs in real time. We introduce a contrastive learning framework with a novel hard sample mining strategy, which improves the WB correction quality of baseline 3D LUTs by 25.5%. Experimental results demonstrate that the proposed WB LUTs perform competitively against state-of-the-art models on two benchmark datasets while being 300 times faster using 12.7 times less memory. Our model and code are available at https://github.com/skrmanne/3DLUT_sRGB_WB.
+
+
+
+
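For readers unfamiliar with LUT-based color correction, the sketch below shows how a learned 3D LUT can be applied to an sRGB image with trilinear interpolation via grid_sample. The LUT size and the identity initialization are assumptions for illustration; this is not the WB LUTs release code, which additionally learns the LUT with contrastive training.

import torch
import torch.nn.functional as F

def apply_3d_lut(image, lut):
    """Apply a 3D color LUT to an image with trilinear interpolation.

    image: (B, 3, H, W) tensor with values in [0, 1].
    lut:   (3, S, S, S) tensor mapping an (r, g, b) grid to output RGB.
    """
    b, _, h, w = image.shape
    # grid_sample expects coordinates in [-1, 1], ordered (x, y, z) = (b, g, r)
    # for a LUT volume indexed as (depth=r, height=g, width=b).
    grid = image.permute(0, 2, 3, 1)[..., [2, 1, 0]] * 2.0 - 1.0   # (B, H, W, 3)
    grid = grid.view(b, 1, h, w, 3)                                 # (B, 1, H, W, 3)
    vol = lut.unsqueeze(0).expand(b, -1, -1, -1, -1)                # (B, 3, S, S, S)
    out = F.grid_sample(vol, grid, mode="bilinear", align_corners=True)
    return out.view(b, 3, h, w)

if __name__ == "__main__":
    S = 17
    r = torch.linspace(0, 1, S)
    # Identity LUT: each output channel equals the corresponding grid coordinate.
    rr, gg, bb = torch.meshgrid(r, r, r, indexing="ij")
    identity_lut = torch.stack([rr, gg, bb], dim=0)                 # (3, S, S, S)
    img = torch.rand(2, 3, 64, 64)
    print(torch.allclose(apply_3d_lut(img, identity_lut), img, atol=1e-4))  # ~no-op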
+ + ☆ NOISe: Nuclei-Aware Osteoclast Instance Segmentation for Mouse-to-Human + Domain Transfer + + +
+ Osteoclast cell image analysis plays a key role in osteoporosis research, but +it typically involves extensive manual image processing and hand annotations by +a trained expert. In the last few years, a handful of machine learning +approaches for osteoclast image analysis have been developed, but none have +addressed the full instance segmentation task required to produce the same +output as that of the human expert led process. Furthermore, none of the prior, +fully automated algorithms have publicly available code, pretrained models, or +annotated datasets, inhibiting reproduction and extension of their work. We +present a new dataset with ~2*10^5 expert annotated mouse osteoclast masks, +together with a deep learning instance segmentation method which works for both +in vitro mouse osteoclast cells on plastic tissue culture plates and human +osteoclast cells on bone chips. To our knowledge, this is the first work to +automate the full osteoclast instance segmentation task. Our method achieves a +performance of 0.82 mAP_0.5 (mean average precision at intersection-over-union +threshold of 0.5) in cross validation for mouse osteoclasts. We present a novel +nuclei-aware osteoclast instance segmentation training strategy (NOISe) based +on the unique biology of osteoclasts, to improve the model's generalizability +and boost the mAP_0.5 from 0.60 to 0.82 on human osteoclasts. We publish our +annotated mouse osteoclast image dataset, instance segmentation models, and +code at github.com/michaelwwan/noise to enable reproducibility and to provide a +public tool to accelerate osteoporosis research. + +
+
+
+
+
+ + ☆ Epistemic Uncertainty Quantification For Pre-trained Neural Network CVPR 2024 + + +
+ Epistemic uncertainty quantification (UQ) identifies where models lack +knowledge. Traditional UQ methods, often based on Bayesian neural networks, are +not suitable for pre-trained non-Bayesian models. Our study addresses +quantifying epistemic uncertainty for any pre-trained model, which does not +need the original training data or model modifications and can ensure broad +applicability regardless of network architectures or training techniques. +Specifically, we propose a gradient-based approach to assess epistemic +uncertainty, analyzing the gradients of outputs relative to model parameters, +and thereby indicating necessary model adjustments to accurately represent the +inputs. We first explore theoretical guarantees of gradient-based methods for +epistemic UQ, questioning the view that this uncertainty is only calculable +through differences between multiple models. We further improve gradient-driven +UQ by using class-specific weights for integrating gradients and emphasizing +distinct contributions from neural network layers. Additionally, we enhance UQ +accuracy by combining gradient and perturbation methods to refine the +gradients. We evaluate our approach on out-of-distribution detection, +uncertainty calibration, and active learning, demonstrating its superiority +over current state-of-the-art UQ methods for pre-trained models. + +
+
+ comment: Published at CVPR 2024 +
+
+
+
+
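A minimal sketch of the core gradient-based idea from the epistemic UQ abstract: score an input by the norm of the gradient of the model output with respect to the parameters, so that inputs requiring larger parameter adjustments are flagged as more uncertain. The specific score (gradient norm of the predicted-class log-probability) is an assumed simplification; the paper's class-specific weighting and perturbation refinement are omitted.

import torch
import torch.nn as nn

def gradient_uncertainty(model, x):
    """Epistemic-uncertainty proxy for one input: || d log p(y_hat | x) / d theta ||.

    Larger gradients suggest the parameters would need bigger adjustments to
    represent the input, i.e. the model is less certain about it.
    """
    logits = model(x.unsqueeze(0))                      # (1, num_classes)
    log_prob = torch.log_softmax(logits, dim=1)
    pred = logits.argmax(dim=1).item()
    score = log_prob[0, pred]                           # log-prob of predicted class
    grads = torch.autograd.grad(score, [p for p in model.parameters() if p.requires_grad])
    return torch.sqrt(sum(g.pow(2).sum() for g in grads)).item()

if __name__ == "__main__":
    torch.manual_seed(0)
    # Toy, untrained classifier: scores are only meant to show the mechanics.
    model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 5))
    in_dist = torch.randn(16)
    far_ood = 25.0 * torch.randn(16)
    print(gradient_uncertainty(model, in_dist), gradient_uncertainty(model, far_ood))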
+ + ☆ GeoAI Reproducibility and Replicability: a computational and spatial + perspective + + +
+ GeoAI has emerged as an exciting interdisciplinary research area that +combines spatial theories and data with cutting-edge AI models to address +geospatial problems in a novel, data-driven manner. While GeoAI research has +flourished in the GIScience literature, its reproducibility and replicability +(R&R), fundamental principles that determine the reusability, reliability, and +scientific rigor of research findings, have rarely been discussed. This paper +aims to provide an in-depth analysis of this topic from both computational and +spatial perspectives. We first categorize the major goals for reproducing GeoAI +research, namely, validation (repeatability), learning and adapting the method +for solving a similar or new problem (reproducibility), and examining the +generalizability of the research findings (replicability). Each of these goals +requires different levels of understanding of GeoAI, as well as different +methods to ensure its success. We then discuss the factors that may cause the +lack of R&R in GeoAI research, with an emphasis on (1) the selection and use of +training data; (2) the uncertainty that resides in the GeoAI model design, +training, deployment, and inference processes; and more importantly (3) the +inherent spatial heterogeneity of geospatial data and processes. We use a deep +learning-based image analysis task as an example to demonstrate the results' +uncertainty and spatial variance caused by different factors. The findings +reiterate the importance of knowledge sharing, as well as the generation of a +"replicability map" that incorporates spatial autocorrelation and spatial +heterogeneity into consideration in quantifying the spatial replicability of +GeoAI research. + +
+
+ comment: Accepted by Annals of the American Association of Geographers +
+
+
+
+
+ + ☆ Vision Augmentation Prediction Autoencoder with Attention Design + (VAPAAD) + + +
+ Despite significant advancements in sequence prediction, current methods lack +attention-based mechanisms for next-frame prediction. Our work introduces +VAPAAD or Vision Augmentation Prediction Autoencoder with Attention Design, an +innovative model that enhances predictive performance by integrating attention +designs, allowing for nuanced understanding and handling of temporal dynamics +in video sequences. We demonstrate using the famous Moving MNIST dataset the +robust performance of the proposed model and potential applicability of such +design in the literature. + +
+
+
+
+
+ + ☆ Low-Light Image Enhancement Framework for Improved Object Detection in + Fisheye Lens Datasets + + +
+ This study addresses the evolving challenges in urban traffic monitoring detection systems based on fisheye lens cameras by proposing a framework that improves the efficacy and accuracy of these systems. In the context of urban infrastructure and transportation management, advanced traffic monitoring systems have become critical for managing the complexities of urbanization and increasing vehicle density. Traditional monitoring methods, which rely on static cameras with narrow fields of view, are ineffective in dynamic urban environments, necessitating the installation of multiple cameras, which raises costs. Fisheye lenses, which were recently introduced, provide wide and omnidirectional coverage in a single frame, making them a transformative solution. However, issues such as distorted views and blurriness arise, preventing accurate object detection on these images. Motivated by these challenges, this study proposes a novel approach that combines a transformer-based image enhancement framework with an ensemble learning technique to address these challenges and improve traffic monitoring accuracy, making significant contributions to the future of intelligent traffic management systems. Our proposed methodological framework won 5th place in the 2024 AI City Challenge, Track 4, with an F1 score of 0.5965 on experimental validation data. The experimental results demonstrate the effectiveness, efficiency, and robustness of the proposed system. Our code is publicly available at https://github.com/daitranskku/AIC2024-TRACK4-TEAM15.
+
+
+
+
+ + ☆ Explainable Light-Weight Deep Learning Pipeline for Improved Drought Stress + +
+ Early identification of drought stress in crops is vital for implementing effective mitigation measures and reducing yield loss. Non-invasive imaging techniques hold immense potential by capturing subtle physiological changes in plants under water deficit. Sensor-based imaging data serves as a rich source of information for machine learning and deep learning algorithms, facilitating further analysis aimed at identifying drought stress. While these approaches yield favorable results, real-time field applications require algorithms specifically designed for the complexities of natural agricultural conditions. Our work proposes a novel deep learning framework for classifying drought stress in potato crops captured by UAVs in natural settings. The novelty lies in the synergistic combination of a pretrained network with carefully designed custom layers. This architecture leverages the feature extraction capabilities of the pre-trained network, while the custom layers enable targeted dimensionality reduction and enhanced regularization, ultimately leading to improved performance. A key innovation of our work involves the integration of Gradient-Class Activation Mapping (Grad-CAM), an explainability technique. Grad-CAM sheds light on the internal workings of the deep learning model, typically referred to as a black box. By visualizing the focus areas of the model within the images, Grad-CAM fosters interpretability and builds trust in the decision-making process of the model. Our proposed framework achieves superior performance, particularly with the DenseNet121 pre-trained network, reaching a precision of 98% in identifying the stressed class, with an overall accuracy of 90%. Comparative analysis against existing state-of-the-art object detection algorithms reveals the superiority of our approach, with significantly higher precision and accuracy.
+
+ comment: 21 pages, 5 figures +
+
+
+
+
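The "pretrained backbone plus custom layers" pattern described in the drought-stress abstract can be sketched as below with torchvision's DenseNet121. The head sizes and dropout rate are hypothetical choices, not the paper's configuration, and the Grad-CAM component is omitted.

import torch
import torch.nn as nn
from torchvision import models

def build_drought_classifier(num_classes: int = 2, pretrained: bool = False) -> nn.Module:
    """Pretrained DenseNet121 feature extractor with a small custom head.

    The custom layers perform dimensionality reduction and add regularization
    (batch norm + dropout) before the final classification layer.
    """
    weights = models.DenseNet121_Weights.DEFAULT if pretrained else None
    backbone = models.densenet121(weights=weights)
    in_features = backbone.classifier.in_features        # 1024 for DenseNet121
    backbone.classifier = nn.Sequential(                  # replace the stock 1000-way head
        nn.Linear(in_features, 256),
        nn.BatchNorm1d(256),
        nn.ReLU(inplace=True),
        nn.Dropout(p=0.4),
        nn.Linear(256, num_classes),
    )
    return backbone

if __name__ == "__main__":
    model = build_drought_classifier(num_classes=2, pretrained=False)  # set True to download ImageNet weights
    print(model(torch.randn(4, 3, 224, 224)).shape)                    # torch.Size([4, 2])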
+ + ☆ AIGeN: An Adversarial Approach for Instruction Generation in VLN + + +
+ In the last few years, the research interest in Vision-and-Language +Navigation (VLN) has grown significantly. VLN is a challenging task that +involves an agent following human instructions and navigating in a previously +unknown environment to reach a specified goal. Recent work in literature +focuses on different ways to augment the available datasets of instructions for +improving navigation performance by exploiting synthetic training data. In this +work, we propose AIGeN, a novel architecture inspired by Generative Adversarial +Networks (GANs) that produces meaningful and well-formed synthetic instructions +to improve navigation agents' performance. The model is composed of a +Transformer decoder (GPT-2) and a Transformer encoder (BERT). During the +training phase, the decoder generates sentences for a sequence of images +describing the agent's path to a particular point while the encoder +discriminates between real and fake instructions. Experimentally, we evaluate +the quality of the generated instructions and perform extensive ablation +studies. Additionally, we generate synthetic instructions for 217K trajectories +using AIGeN on Habitat-Matterport 3D Dataset (HM3D) and show an improvement in +the performance of an off-the-shelf VLN method. The validation analysis of our +proposal is conducted on REVERIE and R2R and highlights the promising aspects +of our proposal, achieving state-of-the-art performance. + +
+
+ comment: Accepted to 7th Multimodal Learning and Applications Workshop (MULA + 2024) at the IEEE/CVF Conference on Computer Vision and Pattern Recognition + 2024 +
+
+
+
+
+ + ☆ Taming Latent Diffusion Model for Neural Radiance Field Inpainting + + +
+ Neural Radiance Field (NeRF) is a representation for 3D reconstruction from multi-view images. Despite some recent work showing preliminary success in editing a reconstructed NeRF with a diffusion prior, these methods still struggle to synthesize reasonable geometry in completely uncovered regions. One major reason is the high diversity of synthetic contents from the diffusion model, which hinders the radiance field from converging to a crisp and deterministic geometry. Moreover, applying latent diffusion models to real data often yields a textural shift incoherent with the image condition due to auto-encoding errors. These two problems are further reinforced by the use of pixel-distance losses. To address these issues, we propose tempering the diffusion model's stochasticity with per-scene customization and mitigating the textural shift with masked adversarial training. During the analyses, we also found that the commonly used pixel and perceptual losses are harmful in the NeRF inpainting task. Through rigorous experiments, our framework yields state-of-the-art NeRF inpainting results on various real-world scenes. Project page: https://hubert0527.github.io/MALD-NeRF
+
+ comment: Project page: https://hubert0527.github.io/MALD-NeRF +
+
+
+
+
+ + ☆ No More Ambiguity in 360° Room Layout via Bi-Layout Estimation CVPR 2024 + + +
+ Inherent ambiguity in layout annotations poses significant challenges to developing accurate 360° room layout estimation models. To address this issue, we propose a novel Bi-Layout model capable of predicting two distinct layout types. One stops at ambiguous regions, while the other extends to encompass all visible areas. Our model employs two global context embeddings, where each embedding is designed to capture specific contextual information for each layout type. With our novel feature guidance module, the image feature retrieves relevant context from these embeddings, generating layout-aware features for precise bi-layout predictions. A unique property of our Bi-Layout model is its ability to inherently detect ambiguous regions by comparing the two predictions. To circumvent the need for manual correction of ambiguous annotations during testing, we also introduce a new metric for disambiguating ground truth layouts. Our method demonstrates superior performance on benchmark datasets, notably outperforming leading approaches. Specifically, on the MatterportLayout dataset, it improves 3DIoU from 81.70% to 82.57% across the full test set and notably from 54.80% to 59.97% in subsets with significant ambiguity. Project page: https://liagm.github.io/Bi_Layout/
+
+ comment: CVPR 2024, Project page: https://liagm.github.io/Bi_Layout/ +
+
+
+
+
+ + ☆ MMInA: Benchmarking Multihop Multimodal Internet Agents + + +
+ Autonomous embodied agents live on an Internet of multimedia websites. Can +they hop around multimodal websites to complete complex user tasks? Existing +benchmarks fail to assess them in a realistic, evolving environment for their +embodiment across websites. To answer this question, we present MMInA, a +multihop and multimodal benchmark to evaluate the embodied agents for +compositional Internet tasks, with several appealing properties: 1) Evolving +real-world multimodal websites. Our benchmark uniquely operates on evolving +real-world websites, ensuring a high degree of realism and applicability to +natural user tasks. Our data includes 1,050 human-written tasks covering +various domains such as shopping and travel, with each task requiring the agent +to autonomously extract multimodal information from web pages as observations; +2) Multihop web browsing. Our dataset features naturally compositional tasks +that require information from or actions on multiple websites to solve, to +assess long-range reasoning capabilities on web tasks; 3) Holistic evaluation. +We propose a novel protocol for evaluating an agent's progress in completing +multihop tasks. We experiment with both standalone (multimodal) language models +and heuristic-based web agents. Extensive experiments demonstrate that while +long-chain multihop web tasks are easy for humans, they remain challenging for +state-of-the-art web agents. We identify that agents are more likely to fail on +the early hops when solving tasks of more hops, which results in lower task +success rates. To address this issue, we propose a simple memory augmentation +approach replaying past action trajectories to reflect. Our method +significantly improved both the single-hop and multihop web browsing abilities +of agents. See our code and data at https://mmina.cliangyu.com + +
+
+
+
+
+ + EgoPet: Egomotion and Interaction Data from an Animal's Perspective + + +
+ Animals perceive the world to plan their actions and interact with other +agents to accomplish complex tasks, demonstrating capabilities that are still +unmatched by AI systems. To advance our understanding and reduce the gap +between the capabilities of animals and AI systems, we introduce a dataset of +pet egomotion imagery with diverse examples of simultaneous egomotion and +multi-agent interaction. Current video datasets separately contain egomotion +and interaction examples, but rarely both at the same time. In addition, EgoPet +offers a radically distinct perspective from existing egocentric datasets of +humans or vehicles. We define two in-domain benchmark tasks that capture animal +behavior, and a third benchmark to assess the utility of EgoPet as a +pretraining resource to robotic quadruped locomotion, showing that models +trained from EgoPet outperform those trained from prior datasets. + +
+
+ comment: https://www.amirbar.net/egopet +
+
+
+
+
+ + ☆ HQ-Edit: A High-Quality Dataset for Instruction-based Image Editing + + +
+ This study introduces HQ-Edit, a high-quality instruction-based image editing dataset with around 200,000 edits. Unlike prior approaches relying on attribute guidance or human feedback for building datasets, we devise a scalable data collection pipeline leveraging advanced foundation models, namely GPT-4V and DALL-E 3. To ensure its high quality, diverse examples are first collected online, expanded, and then used to create high-quality diptychs featuring input and output images with detailed text prompts, followed by precise alignment ensured through post-processing. In addition, we propose two evaluation metrics, Alignment and Coherence, to quantitatively assess the quality of image edit pairs using GPT-4V. HQ-Edit's high-resolution images, rich in detail and accompanied by comprehensive editing prompts, substantially enhance the capabilities of existing image editing models. For example, an HQ-Edit finetuned InstructPix2Pix can attain state-of-the-art image editing performance, even surpassing those models fine-tuned with human-annotated data. The project page is https://thefllood.github.io/HQEdit_web.
+
+ comment: Project Page: https://thefllood.github.io/HQEdit_web +
+
+
+
+
+ + ☆ in2IN: Leveraging individual Information to Generate Human INteractions + + +
+ Generating human-human motion interactions conditioned on textual +descriptions is a very useful application in many areas such as robotics, +gaming, animation, and the metaverse. Alongside this utility also comes a great +difficulty in modeling the highly dimensional inter-personal dynamics. In +addition, properly capturing the intra-personal diversity of interactions has a +lot of challenges. Current methods generate interactions with limited diversity +of intra-person dynamics due to the limitations of the available datasets and +conditioning strategies. For this, we introduce in2IN, a novel diffusion model +for human-human motion generation which is conditioned not only on the textual +description of the overall interaction but also on the individual descriptions +of the actions performed by each person involved in the interaction. To train +this model, we use a large language model to extend the InterHuman dataset with +individual descriptions. As a result, in2IN achieves state-of-the-art +performance in the InterHuman dataset. Furthermore, in order to increase the +intra-personal diversity on the existing interaction datasets, we propose +DualMDM, a model composition technique that combines the motions generated with +in2IN and the motions generated by a single-person motion prior pre-trained on +HumanML3D. As a result, DualMDM generates motions with higher individual +diversity and improves control over the intra-person dynamics while maintaining +inter-personal coherence. + +
+
+ comment: Project page: https://pabloruizponce.github.io/in2IN/ +
+
+
+
+
+ + ☆ OneChart: Purify the Chart Structural Extraction via One Auxiliary Token + + +
+ Chart parsing poses a significant challenge due to the diversity of styles, +values, texts, and so forth. Even advanced large vision-language models (LVLMs) +with billions of parameters struggle to handle such tasks satisfactorily. To +address this, we propose OneChart: a reliable agent specifically devised for +the structural extraction of chart information. Similar to popular LVLMs, +OneChart incorporates an autoregressive main body. Uniquely, to enhance the +reliability of the numerical parts of the output, we introduce an auxiliary +token placed at the beginning of the total tokens along with an additional +decoder. The numerically optimized (auxiliary) token allows subsequent tokens +for chart parsing to capture enhanced numerical features through causal +attention. Furthermore, with the aid of the auxiliary token, we have devised a +self-evaluation mechanism that enables the model to gauge the reliability of +its chart parsing results by providing confidence scores for the generated +content. Compared to current state-of-the-art (SOTA) chart parsing models, +e.g., DePlot, ChartVLM, ChartAst, OneChart significantly outperforms in Average +Precision (AP) for chart structural extraction across multiple public +benchmarks, despite enjoying only 0.2 billion parameters. Moreover, as a chart +parsing agent, it also brings 10%+ accuracy gains for the popular LVLM +(LLaVA-1.6) in the downstream ChartQA benchmark. + +
+
+ comment: 14 pages, 9 figures and 6 tables +
+
+
+
+
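The auxiliary-token idea in the OneChart abstract can be pictured with a highly simplified sketch: a learnable token is prepended to the sequence, a causal transformer lets later tokens attend to it, and a small extra head regresses numeric values from that token's hidden state. All dimensions, the number of regressed values, and the overall layout below are assumptions for illustration only, not OneChart's actual architecture.

import torch
import torch.nn as nn

class AuxTokenChartParser(nn.Module):
    """Toy causal model with one auxiliary token feeding a numeric-regression head."""

    def __init__(self, vocab=1000, d=128, heads=4, layers=2, max_nums=16, max_len=64):
        super().__init__()
        self.tok = nn.Embedding(vocab, d)
        self.pos = nn.Embedding(max_len + 1, d)
        self.aux = nn.Parameter(torch.zeros(1, 1, d))        # auxiliary token at position 0
        block = nn.TransformerEncoderLayer(d, heads, 4 * d, batch_first=True)
        self.blocks = nn.TransformerEncoder(block, layers)
        self.lm_head = nn.Linear(d, vocab)                    # autoregressive text head
        self.num_head = nn.Sequential(nn.Linear(d, d), nn.GELU(), nn.Linear(d, max_nums))

    def forward(self, ids):
        b, t = ids.shape
        x = torch.cat([self.aux.expand(b, -1, -1), self.tok(ids)], dim=1)
        x = x + self.pos(torch.arange(t + 1, device=ids.device))
        mask = nn.Transformer.generate_square_subsequent_mask(t + 1).to(ids.device)
        h = self.blocks(x, mask=mask)                         # causal: later tokens see the aux token
        return self.lm_head(h[:, 1:]), self.num_head(h[:, 0]) # token logits, regressed numbers

if __name__ == "__main__":
    logits, numbers = AuxTokenChartParser()(torch.randint(0, 1000, (2, 10)))
    print(logits.shape, numbers.shape)    # (2, 10, 1000) (2, 16)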
+ + ☆ One-Click Upgrade from 2D to 3D: Sandwiched RGB-D Video Compression for + Stereoscopic Teleconferencing CVPR 2024 + + +
+ Stereoscopic video conferencing is still challenging due to the need to +compress stereo RGB-D video in real-time. Though hardware implementations of +standard video codecs such as H.264 / AVC and HEVC are widely available, they +are not designed for stereoscopic videos and suffer from reduced quality and +performance. Specific multiview or 3D extensions of these codecs are complex +and lack efficient implementations. In this paper, we propose a new approach to +upgrade a 2D video codec to support stereo RGB-D video compression, by wrapping +it with a neural pre- and post-processor pair. The neural networks are +end-to-end trained with an image codec proxy, and shown to work with a more +sophisticated video codec. We also propose a geometry-aware loss function to +improve rendering quality. We train the neural pre- and post-processors on a +synthetic 4D people dataset, and evaluate it on both synthetic and +real-captured stereo RGB-D videos. Experimental results show that the neural +networks generalize well to unseen data and work out-of-box with various video +codecs. Our approach saves about 30% bit-rate compared to a conventional video +coding scheme and MV-HEVC at the same level of rendering quality from a novel +view, without the need of a task-specific hardware upgrade. + +
+
+ comment: Accepted by CVPR 2024 Workshop (AIS: Vision, Graphics and AI for + Streaming https://ai4streaming-workshop.github.io ) +
+
+
+
+
+ + ☆ MaxFusion: Plug&Play Multi-Modal Generation in Text-to-Image Diffusion + Models + + +
+ Large diffusion-based Text-to-Image (T2I) models have shown impressive generative powers for text-to-image generation as well as spatially conditioned image generation. For most applications, we can train the model end-to-end with paired data to obtain photorealistic generation quality. However, to add an additional task, one often needs to retrain the model from scratch using paired data across all modalities to retain good generation performance. In this paper, we tackle this issue and propose a novel strategy to scale a generative model across new tasks with minimal compute. During our experiments, we discovered that the variance maps of intermediate feature maps of diffusion models capture the intensity of conditioning. Utilizing this prior information, we propose MaxFusion, an efficient strategy to scale up text-to-image generation models to accommodate new modality conditions. Specifically, we combine aligned features of multiple models, hence bringing a compositional effect. Our fusion strategy can be integrated into off-the-shelf models to enhance their generative prowess.
+
+
+
+
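The variance-map observation in the MaxFusion abstract suggests a simple fusion rule: at each spatial location, weight each condition branch by the channel variance of its feature map, so the branch with the stronger conditioning signal dominates. The soft mixing rule and tensor shapes below are assumptions for illustration, not the paper's exact fusion operator.

import torch

def variance_weighted_fusion(feat_a, feat_b, eps=1e-6):
    """Fuse two spatially aligned feature maps using per-location channel variance
    as a proxy for conditioning strength.

    feat_a, feat_b: (B, C, H, W) features from two condition branches.
    """
    var_a = feat_a.var(dim=1, keepdim=True)           # (B, 1, H, W)
    var_b = feat_b.var(dim=1, keepdim=True)
    w_a = var_a / (var_a + var_b + eps)               # soft, per-pixel mixing weight
    return w_a * feat_a + (1.0 - w_a) * feat_b

if __name__ == "__main__":
    torch.manual_seed(0)
    depth_feat = torch.randn(1, 64, 32, 32)           # e.g. strongly activated depth branch
    edge_feat = 0.1 * torch.randn(1, 64, 32, 32)      # e.g. weakly activated edge branch
    print(variance_weighted_fusion(depth_feat, edge_feat).shape)  # torch.Size([1, 64, 32, 32])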
+ + ☆ Diffscaler: Enhancing the Generative Prowess of Diffusion Transformers + + +
+ Recently, diffusion transformers have gained wide attention with its +excellent performance in text-to-image and text-to-vidoe models, emphasizing +the need for transformers as backbone for diffusion models. Transformer-based +models have shown better generalization capability compared to CNN-based models +for general vision tasks. However, much less has been explored in the existing +literature regarding the capabilities of transformer-based diffusion backbones +and expanding their generative prowess to other datasets. This paper focuses on +enabling a single pre-trained diffusion transformer model to scale across +multiple datasets swiftly, allowing for the completion of diverse generative +tasks using just one model. To this end, we propose DiffScaler, an efficient +scaling strategy for diffusion models where we train a minimal amount of +parameters to adapt to different tasks. In particular, we learn task-specific +transformations at each layer by incorporating the ability to utilize the +learned subspaces of the pre-trained model, as well as the ability to learn +additional task-specific subspaces, which may be absent in the pre-training +dataset. As these parameters are independent, a single diffusion model with +these task-specific parameters can be used to perform multiple tasks +simultaneously. Moreover, we find that transformer-based diffusion models +significantly outperform CNN-based diffusion models methods while performing +fine-tuning over smaller datasets. We perform experiments on four unconditional +image generation datasets. We show that using our proposed method, a single +pre-trained model can scale up to perform these conditional and unconditional +tasks, respectively, with minimal parameter tuning while performing as close as +fine-tuning an entire diffusion model for that particular task. + +
+
+
+
+
+ + ☆ Ctrl-Adapter: An Efficient and Versatile Framework for Adapting Diverse + Controls to Any Diffusion Model + + +
+ ControlNets are widely used for adding spatial control in image generation +with different conditions, such as depth maps, canny edges, and human poses. +However, there are several challenges when leveraging the pretrained image +ControlNets for controlled video generation. First, pretrained ControlNet +cannot be directly plugged into new backbone models due to the mismatch of +feature spaces, and the cost of training ControlNets for new backbones is a big +burden. Second, ControlNet features for different frames might not effectively +handle the temporal consistency. To address these challenges, we introduce +Ctrl-Adapter, an efficient and versatile framework that adds diverse controls +to any image/video diffusion models, by adapting pretrained ControlNets (and +improving temporal alignment for videos). Ctrl-Adapter provides diverse +capabilities including image control, video control, video control with sparse +frames, multi-condition control, compatibility with different backbones, +adaptation to unseen control conditions, and video editing. In Ctrl-Adapter, we +train adapter layers that fuse pretrained ControlNet features to different +image/video diffusion models, while keeping the parameters of the ControlNets +and the diffusion models frozen. Ctrl-Adapter consists of temporal and spatial +modules so that it can effectively handle the temporal consistency of videos. +We also propose latent skipping and inverse timestep sampling for robust +adaptation and sparse control. Moreover, Ctrl-Adapter enables control from +multiple conditions by simply taking the (weighted) average of ControlNet +outputs. With diverse image/video diffusion backbones (SDXL, Hotshot-XL, +I2VGen-XL, and SVD), Ctrl-Adapter matches ControlNet for image control and +outperforms all baselines for video control (achieving the SOTA accuracy on the +DAVIS 2017 dataset) with significantly lower computational costs (less than 10 +GPU hours). + +
+
+ comment: First two authors contributed equally; Project page: + https://ctrl-adapter.github.io/ +
+
+
+
+
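The multi-condition step the Ctrl-Adapter abstract mentions, taking a (weighted) average of ControlNet outputs, reduces to a few lines. Generic tensors stand in for the actual per-block ControlNet feature lists, and the weights are user-chosen; this is only a sketch of that single step, not the adapter training itself.

import torch

def average_control_features(control_feats, weights=None):
    """Combine per-condition control features by a (weighted) average.

    control_feats: list of (B, C, H, W) tensors, one per condition
                   (e.g. depth, canny edges, pose), already spatially aligned.
    weights:       optional list of floats, one per condition.
    """
    if weights is None:
        weights = [1.0] * len(control_feats)
    stacked = torch.stack(control_feats, dim=0)                        # (N, B, C, H, W)
    w = torch.tensor(weights, dtype=stacked.dtype).view(-1, 1, 1, 1, 1)
    return (w * stacked).sum(dim=0) / sum(weights)

if __name__ == "__main__":
    depth = torch.randn(2, 320, 64, 64)
    pose = torch.randn(2, 320, 64, 64)
    print(average_control_features([depth, pose], weights=[0.7, 0.3]).shape)  # (2, 320, 64, 64)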
+ + ☆ Design and Analysis of Efficient Attention in Transformers for Social + Group Activity Recognition + + +
+ Social group activity recognition is a challenging task extended from group +activity recognition, where social groups must be recognized with their +activities and group members. Existing methods tackle this task by leveraging +region features of individuals following existing group activity recognition +methods. However, the effectiveness of region features is susceptible to person +localization and variable semantics of individual actions. To overcome these +issues, we propose leveraging attention modules in transformers to generate +social group features. In this method, multiple embeddings are used to +aggregate features for a social group, each of which is assigned to a group +member without duplication. Due to this non-duplicated assignment, the number +of embeddings must be significant to avoid missing group members and thus +renders attention in transformers ineffective. To find optimal attention +designs with a large number of embeddings, we explore several design choices of +queries for feature aggregation and self-attention modules in transformer +decoders. Extensive experimental results show that the proposed method achieves +state-of-the-art performance and verify that the proposed attention designs are +highly effective on social group activity recognition. + +
+
+ comment: Accepted to IJCV, preprint version +
+
+
+
+
+ + ☆ Ti-Patch: Tiled Physical Adversarial Patch for no-reference video + quality metrics + + +
+ Objective no-reference image- and video-quality metrics are crucial in many +computer vision tasks. However, state-of-the-art no-reference metrics have +become learning-based and are vulnerable to adversarial attacks. The +vulnerability of quality metrics imposes restrictions on using such metrics in +quality control systems and comparing objective algorithms. Also, using +vulnerable metrics as a loss for deep learning model training can mislead +training to worsen visual quality. Because of that, quality metrics testing for +vulnerability is a task of current interest. This paper proposes a new method +for testing quality metrics vulnerability in the physical space. To our +knowledge, quality metrics were not previously tested for vulnerability to this +attack; they were only tested in the pixel space. We applied a physical +adversarial Ti-Patch (Tiled Patch) attack to quality metrics and did +experiments both in pixel and physical space. We also performed experiments on +the implementation of physical adversarial wallpaper. The proposed method can +be used as additional quality metrics in vulnerability evaluation, +complementing traditional subjective comparison and vulnerability tests in the +pixel space. We made our code and adversarial videos available on GitHub: +https://github.com/leonenkova/Ti-Patch. + +
+
+ comment: Accepted to WAIT AINL 2024 +
+
+
+
+
+ + ☆ How to build the best medical image segmentation algorithm using + foundation models: a comprehensive empirical study with Segment Anything + Model + + +
+ Automated segmentation is a fundamental medical image analysis task, which +enjoys significant advances due to the advent of deep learning. While +foundation models have been useful in natural language processing and some +vision tasks for some time, the foundation model developed with image +segmentation in mind - Segment Anything Model (SAM) - has been developed only +recently and has shown similar promise. However, there are still no systematic +analyses or ``best-practice'' guidelines for optimal fine-tuning of SAM for +medical image segmentation. This work summarizes existing fine-tuning +strategies with various backbone architectures, model components, and +fine-tuning algorithms across 18 combinations, and evaluates them on 17 +datasets covering all common radiology modalities. Our study reveals that (1) +fine-tuning SAM leads to slightly better performance than previous segmentation +methods, (2) fine-tuning strategies that use parameter-efficient learning in +both the encoder and decoder are superior to other strategies, (3) network +architecture has a small impact on final performance, (4) further training SAM +with self-supervised learning can improve final model performance. We also +demonstrate the ineffectiveness of some methods popular in the literature and +further expand our experiments into few-shot and prompt-based settings. Lastly, +we released our code and MRI-specific fine-tuned weights, which consistently +obtained superior performance over the original SAM, at +https://github.com/mazurowski-lab/finetune-SAM. + +
+
+ comment: Code available at https://github.com/mazurowski-lab/finetune-SAM +
+
+
+
+
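The parameter-efficient strategies this empirical study compares share one mechanical pattern: freeze most weights and train only small inserted or selected modules. The sketch below shows that pattern on a generic stand-in model; it does not use the actual SAM classes or the authors' released code, and the keyword-based selection is an assumption for illustration.

import torch
import torch.nn as nn

def mark_trainable(model: nn.Module, trainable_keywords=("adapter", "decoder")):
    """Freeze every parameter except those whose name contains a keyword.

    A common parameter-efficient fine-tuning recipe: keep the big image encoder
    frozen and only train small adapter layers and/or the lightweight decoder.
    """
    n_train = 0
    for name, p in model.named_parameters():
        p.requires_grad = any(k in name for k in trainable_keywords)
        n_train += p.numel() if p.requires_grad else 0
    total = sum(p.numel() for p in model.parameters())
    print(f"trainable: {n_train:,} / {total:,} parameters")
    return [p for p in model.parameters() if p.requires_grad]

if __name__ == "__main__":
    # Stand-in model; in practice this would be a promptable segmentation model.
    model = nn.ModuleDict({
        "image_encoder": nn.Sequential(nn.Conv2d(3, 64, 3, padding=1), nn.Conv2d(64, 64, 3, padding=1)),
        "encoder_adapter": nn.Linear(64, 64),
        "mask_decoder": nn.Sequential(nn.Linear(64, 64), nn.Linear(64, 1)),
    })
    trainable = mark_trainable(model, trainable_keywords=("adapter", "mask_decoder"))
    optimizer = torch.optim.AdamW(trainable, lr=1e-4)   # optimizer only sees the small subset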
+ + ☆ Realistic Model Selection for Weakly Supervised Object Localization + + +
+ Weakly Supervised Object Localization (WSOL) allows for training deep +learning models for classification and localization, using only global +class-level labels. The lack of bounding box (bbox) supervision during training +represents a considerable challenge for hyper-parameter search and model +selection. Earlier WSOL works implicitly observed localization performance over +a test set which leads to biased performance evaluation. More recently, a +better WSOL protocol has been proposed, where a validation set with bbox +annotations is held out for model selection. Although it does not rely on the +test set, this protocol is unrealistic since bboxes are not available in +real-world applications, and when available, it is better to use them directly +to fit model weights. Our initial empirical analysis shows that the +localization performance of a model declines significantly when using only +image-class labels for model selection (compared to using bounding-box +annotations). This suggests that adding bounding-box labels is preferable for +selecting the best model for localization. In this paper, we introduce a new +WSOL validation protocol that provides a localization signal without the need +for manual bbox annotations. In particular, we leverage noisy pseudo boxes from +an off-the-shelf ROI proposal generator such as Selective-Search, CLIP, and RPN +pretrained models for model selection. Our experimental results with several +WSOL methods on ILSVRC and CUB-200-2011 datasets show that our noisy boxes +allow selecting models with performance close to those selected using ground +truth boxes, and better than models selected using only image-class labels. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ Unifying Global and Local Scene Entities Modelling for Precise Action + Spotting IJCNN 2024 + + +
+ Sports videos pose complex challenges, including cluttered backgrounds, +camera angle changes, small action-representing objects, and imbalanced action +class distribution. Existing methods for detecting actions in sports videos +heavily rely on global features, utilizing a backbone network as a black box +that encompasses the entire spatial frame. However, these approaches tend to +overlook the nuances of the scene and struggle with detecting actions that +occupy a small portion of the frame. In particular, they face difficulties when +dealing with action classes involving small objects, such as balls or +yellow/red cards in soccer, which only occupy a fraction of the screen space. +To address these challenges, we introduce a novel approach that analyzes and +models scene entities using an adaptive attention mechanism. Particularly, our +model disentangles the scene content into the global environment feature and +local relevant scene entities feature. To efficiently extract environmental +features while considering temporal information with less computational cost, +we propose the use of a 2D backbone network with a time-shift mechanism. To +accurately capture relevant scene entities, we employ a Vision-Language model +in conjunction with the adaptive attention mechanism. Our model has +demonstrated outstanding performance, securing the 1st place in the +SoccerNet-v2 Action Spotting, FineDiving, and FineGym challenge with a +substantial performance improvement of 1.6, 2.0, and 1.3 points in avg-mAP +compared to the runner-up methods. Furthermore, our approach offers +interpretability capabilities in contrast to other deep learning models, which +are often designed as black boxes. Our code and models are released at: +https://github.com/Fsoft-AIC/unifying-global-local-feature. + +
+
+ comment: Accepted to IJCNN 2024 +
+
+
+
+
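The time-shift mechanism mentioned in the action-spotting abstract, which gives a 2D backbone temporal context at negligible cost, can be sketched as follows. The shift fraction is a common choice in temporal-shift modules, not necessarily the authors' setting.

import torch

def temporal_shift(x, shift_div=8):
    """Shift a fraction of channels along the time dimension.

    x: (B, T, C, H, W) clip features. One 1/shift_div slice of channels carries
    information from the previous frame, another from the next frame, and the
    remaining channels stay in place, so a 2D backbone sees neighbouring frames.
    """
    b, t, c, h, w = x.shape
    fold = c // shift_div
    out = torch.zeros_like(x)
    out[:, 1:, :fold] = x[:, :-1, :fold]                   # previous frame -> current
    out[:, :-1, fold:2 * fold] = x[:, 1:, fold:2 * fold]   # next frame -> current
    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]              # untouched channels
    return out

if __name__ == "__main__":
    clip = torch.randn(2, 8, 64, 14, 14)   # batch of 8-frame feature clips
    print(temporal_shift(clip).shape)      # torch.Size([2, 8, 64, 14, 14])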
+ + ☆ Knowledge-enhanced Visual-Language Pretraining for Computational + Pathology + + +
+ In this paper, we consider the problem of visual representation learning for +computational pathology, by exploiting large-scale image-text pairs gathered +from public resources, along with the domain specific knowledge in pathology. +Specifically, we make the following contributions: (i) We curate a pathology +knowledge tree that consists of 50,470 informative attributes for 4,718 +diseases requiring pathology diagnosis from 32 human tissues. To our knowledge, +this is the first comprehensive structured pathology knowledge base; (ii) We +develop a knowledge-enhanced visual-language pretraining approach, where we +first project pathology-specific knowledge into latent embedding space via +language model, and use it to guide the visual representation learning; (iii) +We conduct thorough experiments to validate the effectiveness of our proposed +components, demonstrating significant performance improvement on various +downstream tasks, including cross-modal retrieval, zero-shot classification on +pathology patches, and zero-shot tumor subtyping on whole slide images (WSIs). +All codes, models and the pathology knowledge tree will be released to the +research community + +
+
+
+
+
+ + ☆ Evolving Interpretable Visual Classifiers with Large Language Models + + +
+ Multimodal pre-trained models, such as CLIP, are popular for zero-shot +classification due to their open-vocabulary flexibility and high performance. +However, vision-language models, which compute similarity scores between images +and class labels, are largely black-box, with limited interpretability, risk +for bias, and inability to discover new visual concepts not written down. +Moreover, in practical settings, the vocabulary for class names and attributes +of specialized concepts will not be known, preventing these methods from +performing well on images uncommon in large-scale vision-language datasets. To +address these limitations, we present a novel method that discovers +interpretable yet discriminative sets of attributes for visual recognition. We +introduce an evolutionary search algorithm that uses a large language model and +its in-context learning abilities to iteratively mutate a concept bottleneck of +attributes for classification. Our method produces state-of-the-art, +interpretable fine-grained classifiers. We outperform the latest baselines by +18.4% on five fine-grained iNaturalist datasets and by 22.2% on two KikiBouba +datasets, despite the baselines having access to privileged information about +class names. + +
+
+
+
+
+ + ☆ eMotion-GAN: A Motion-based GAN for Photorealistic and Facial Expression + Preserving Frontal View Synthesis + + +
+ Many existing facial expression recognition (FER) systems encounter +substantial performance degradation when faced with variations in head pose. +Numerous frontalization methods have been proposed to enhance these systems' +performance under such conditions. However, they often introduce undesirable +deformations, rendering them less suitable for precise facial expression +analysis. In this paper, we present eMotion-GAN, a novel deep learning approach +designed for frontal view synthesis while preserving facial expressions within +the motion domain. Considering the motion induced by head variation as noise +and the motion induced by facial expression as the relevant information, our +model is trained to filter out the noisy motion in order to retain only the +motion related to facial expression. The filtered motion is then mapped onto a +neutral frontal face to generate the corresponding expressive frontal face. We +conducted extensive evaluations using several widely recognized dynamic FER +datasets, which encompass sequences exhibiting various degrees of head pose +variations in both intensity and orientation. Our results demonstrate the +effectiveness of our approach in significantly reducing the FER performance gap +between frontal and non-frontal faces. Specifically, we achieved a FER +improvement of up to +5\% for small pose variations and up to +20\% improvement +for larger pose variations. Code available at +\url{https://github.com/o-ikne/eMotion-GAN.git}. + +
+
+
+
+
+ + ☆ HOI-Ref: Hand-Object Interaction Referral in Egocentric Vision + + +
+ Large Vision Language Models (VLMs) are now the de facto state-of-the-art for +a number of tasks including visual question answering, recognising objects, and +spatial referral. In this work, we propose the HOI-Ref task for egocentric +images that aims to understand interactions between hands and objects using +VLMs. To enable HOI-Ref, we curate the HOI-QA dataset that consists of 3.9M +question-answer pairs for training and evaluating VLMs. HOI-QA includes +questions relating to locating hands, objects, and critically their +interactions (e.g. referring to the object being manipulated by the hand). We +train the first VLM for HOI-Ref on this dataset and call it VLM4HOI. Our +results demonstrate that VLMs trained for referral on third person images fail +to recognise and refer hands and objects in egocentric images. When fine-tuned +on our egocentric HOI-QA dataset, performance improves by 27.9% for referring +hands and objects, and by 26.7% for referring interactions. + +
+
+ comment: Project Page: https://sid2697.github.io/hoi-ref/ +
+
+
+
+
+ + ☆ Zero-shot detection of buildings in mobile LiDAR using Language Vision + Model + + +
+ Recent advances have demonstrated that Language Vision Models (LVMs) surpass +the existing State-of-the-Art (SOTA) in two-dimensional (2D) computer vision +tasks, motivating attempts to apply LVMs to three-dimensional (3D) data. While +LVMs are efficient and effective in addressing various downstream 2D vision +tasks without training, they face significant challenges when it comes to point +clouds, a representative format for representing 3D data. It is more difficult +to extract features from 3D data and there are challenges due to large data +sizes and the cost of the collection and labelling, resulting in a notably +limited availability of datasets. Moreover, constructing LVMs for point clouds +is even more challenging due to the requirements for large amounts of data and +training time. To address these issues, our research aims to 1) apply the +Grounded SAM through Spherical Projection to transfer 3D to 2D, and 2) +experiment with synthetic data to evaluate its effectiveness in bridging the +gap between synthetic and real-world data domains. Our approach exhibited high +performance with an accuracy of 0.96, an IoU of 0.85, precision of 0.92, recall +of 0.91, and an F1 score of 0.92, confirming its potential. However, challenges +such as occlusion problems and pixel-level overlaps of multi-label points +during spherical image generation remain to be addressed in future studies. + +
+
+ comment: 7 pages, 6 figures, conference +
+
+
+
+
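The spherical projection step referenced in the zero-shot LiDAR abstract, converting a point cloud into a 2D range image so that a 2D model can be applied, can be sketched as below. The image size and vertical field-of-view bounds are assumptions, and the handling of out-of-range elevations is simplified.

import numpy as np

def spherical_projection(points, h=64, w=1024, fov_up_deg=15.0, fov_down_deg=-25.0):
    """Project an (N, 3) LiDAR point cloud onto an (h, w) range image.

    Each point's azimuth maps to a column and its elevation to a row; the pixel
    stores the range (depth). Pixel collisions keep the closer point.
    """
    x, y, z = points[:, 0], points[:, 1], points[:, 2]
    r = np.linalg.norm(points, axis=1) + 1e-9
    yaw = np.arctan2(y, x)                                   # [-pi, pi]
    pitch = np.arcsin(np.clip(z / r, -1.0, 1.0))
    fov_up, fov_down = np.radians(fov_up_deg), np.radians(fov_down_deg)

    u = 0.5 * (1.0 - yaw / np.pi) * w                        # azimuth -> column
    v = (fov_up - pitch) / (fov_up - fov_down) * h           # elevation -> row
    u = np.clip(np.floor(u), 0, w - 1).astype(np.int32)
    v = np.clip(np.floor(v), 0, h - 1).astype(np.int32)

    image = np.full((h, w), np.inf, dtype=np.float32)
    order = np.argsort(-r)                                   # far points first, near points overwrite
    image[v[order], u[order]] = r[order]
    image[np.isinf(image)] = 0.0                             # empty pixels
    return image

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    pts = rng.uniform(-20, 20, size=(10000, 3))
    print(spherical_projection(pts).shape)                   # (64, 1024)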
+ + ☆ Zero-shot Building Age Classification from Facade Image Using GPT-4 + + +
+ A building's age of construction is crucial for supporting many geospatial +applications. Much current research focuses on estimating building age from +facade images using deep learning. However, building an accurate deep learning +model requires a considerable amount of labelled training data, and the trained +models often have geographical constraints. Recently, large pre-trained vision +language models (VLMs) such as GPT-4 Vision, which demonstrate significant +generalisation capabilities, have emerged as potential training-free tools for +dealing with specific vision tasks, but their applicability and reliability for +building information remain unexplored. In this study, a zero-shot building age +classifier for facade images is developed using prompts that include logical +instructions. Taking London as a test case, we introduce a new dataset, +FI-London, comprising facade images and building age epochs. Although the +training-free classifier achieved a modest accuracy of 39.69%, the mean +absolute error of 0.85 decades indicates that the model can predict building +age epochs successfully albeit with a small bias. The ensuing discussion +reveals that the classifier struggles to predict the age of very old buildings +and is challenged by fine-grained predictions within 2 decades. Overall, the +classifier utilising GPT-4 Vision is capable of predicting the rough age epoch +of a building from a single facade image without any training. + +
+
+
+
+
+ + ☆ EdgeRelight360: Text-Conditioned 360-Degree HDR Image Generation for + Real-Time On-Device Video Portrait Relighting CVPR + + +
+ In this paper, we present EdgeRelight360, an approach for real-time video +portrait relighting on mobile devices, utilizing text-conditioned generation of +360-degree high dynamic range image (HDRI) maps. Our method proposes a +diffusion-based text-to-360-degree image generation in the HDR domain, taking +advantage of the HDR10 standard. This technique facilitates the generation of +high-quality, realistic lighting conditions from textual descriptions, offering +flexibility and control in portrait video relighting task. Unlike the previous +relighting frameworks, our proposed system performs video relighting directly +on-device, enabling real-time inference with real 360-degree HDRI maps. This +on-device processing ensures both privacy and guarantees low runtime, providing +an immediate response to changes in lighting conditions or user inputs. Our +approach paves the way for new possibilities in real-time video applications, +including video conferencing, gaming, and augmented reality, by allowing +dynamic, text-based control of lighting conditions. + +
+
+ comment: Camera-ready version (CVPR workshop - EDGE'24) +
+
+
+
+
+ + ☆ Evaluating the Explainability of Attributes and Prototypes for a Medical + Classification Model + + +
+ Due to the sensitive nature of medicine, it is particularly important and highly demanded that AI methods are explainable. This need has been recognised and there is great research interest in xAI solutions with medical applications. However, there is a lack of user-centred evaluation regarding the actual impact of the explanations. We evaluate attribute- and prototype-based explanations with the Proto-Caps model. This xAI model reasons the target classification with human-defined visual features of the target object in the form of scores and attribute-specific prototypes. The model thus provides a multimodal explanation that is intuitively understandable to humans thanks to predefined attributes. A user study involving six radiologists shows that the explanations are subjectively perceived as helpful, as they reflect their decision-making process. The results of the model are considered a second opinion that radiologists can discuss using the model's explanations. However, it was shown that the inclusion and increased magnitude of model explanations can objectively increase confidence in the model's predictions even when the model is incorrect. We can conclude that attribute scores and visual prototypes enhance confidence in the model. However, additional development and repeated user studies are needed to tailor the explanation to the respective use case.
+
+ comment: Accepted at The 2nd World Conference on eXplainable Artificial + Intelligence +
+
+
+
+
+ + ☆ ReffAKD: Resource-efficient Autoencoder-based Knowledge Distillation + + +
+ In this research, we propose an innovative method to boost Knowledge Distillation efficiency without the need for resource-heavy teacher models. Knowledge Distillation trains a smaller ``student'' model with guidance from a larger ``teacher'' model, which is computationally costly. However, the main benefit comes from the soft labels provided by the teacher, helping the student grasp nuanced class similarities. In our work, we propose an efficient method for generating these soft labels, thereby eliminating the need for a large teacher model. We employ a compact autoencoder to extract essential features and calculate similarity scores between different classes. Afterward, we apply the softmax function to these similarity scores to obtain a soft probability vector. This vector serves as valuable guidance during the training of the student model. Our extensive experiments on various datasets, including CIFAR-100, Tiny ImageNet, and Fashion MNIST, demonstrate the superior resource efficiency of our approach compared to traditional knowledge distillation methods that rely on large teacher models. Importantly, our approach consistently achieves similar or even superior performance in terms of model accuracy. We also perform a comparative study with various techniques recently developed for knowledge distillation, showing that our approach achieves competitive performance while using significantly fewer resources. We also show that our approach can be easily added to any logit-based knowledge distillation method. This research contributes to making knowledge distillation more accessible and cost-effective for practical applications, making it a promising avenue for improving the efficiency of model training. The code for this work is available at https://github.com/JEKimLab/ReffAKD.
+
+
+
+
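The soft-label construction the ReffAKD abstract describes, class similarities from compact autoencoder features passed through a softmax, can be sketched as follows. Random features stand in for the autoencoder codes, and the centroid-plus-cosine-similarity recipe and temperature are assumptions for illustration rather than the paper's exact procedure.

import torch
import torch.nn.functional as F

def soft_labels_from_features(features, labels, num_classes, temperature=0.5):
    """Build per-class soft label vectors from compact (e.g. autoencoder) features.

    features: (N, D) latent codes; labels: (N,) integer class ids.
    Returns a (num_classes, num_classes) matrix whose row c is the soft target
    used for every training sample of class c.
    """
    # 1. Class centroids in the latent space.
    centroids = torch.stack([features[labels == c].mean(dim=0) for c in range(num_classes)])
    # 2. Pairwise cosine similarity between class centroids.
    sim = F.cosine_similarity(centroids.unsqueeze(1), centroids.unsqueeze(0), dim=-1)
    # 3. Softmax over similarities; a lower temperature gives sharper targets.
    return F.softmax(sim / temperature, dim=1)

if __name__ == "__main__":
    torch.manual_seed(0)
    feats = torch.randn(1000, 32)                      # stand-in for autoencoder codes
    labels = torch.randint(0, 10, (1000,))
    soft = soft_labels_from_features(feats, labels, num_classes=10)
    print(soft.shape, soft.sum(dim=1))                 # rows sum to 1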
+ + ☆ Map-Relative Pose Regression for Visual Re-Localization CVPR + + +
+ Pose regression networks predict the camera pose of a query image relative to +a known environment. Within this family of methods, absolute pose regression +(APR) has recently shown promising accuracy in the range of a few centimeters +in position error. APR networks encode the scene geometry implicitly in their +weights. To achieve high accuracy, they require vast amounts of training data +that, realistically, can only be created using novel view synthesis in a +days-long process. This process has to be repeated for each new scene again and +again. We present a new approach to pose regression, map-relative pose +regression (marepo), that satisfies the data hunger of the pose regression +network in a scene-agnostic fashion. We condition the pose regressor on a +scene-specific map representation such that its pose predictions are relative +to the scene map. This allows us to train the pose regressor across hundreds of +scenes to learn the generic relation between a scene-specific map +representation and the camera pose. Our map-relative pose regressor can be +applied to new map representations immediately or after mere minutes of +fine-tuning for the highest accuracy. Our approach outperforms previous pose +regression methods by far on two public datasets, indoor and outdoor. Code is +available: https://nianticlabs.github.io/marepo + +
+
+ comment: IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR) + 2024, Highlight Paper +
+
+
+
+
+ + ☆ Conditional Prototype Rectification Prompt Learning + + +
+ Pre-trained large-scale vision-language models (VLMs) have acquired a profound understanding of general visual concepts. Recent advancements in efficient transfer learning (ETL) have shown remarkable success in fine-tuning VLMs within the scenario of limited data, introducing only a few parameters to harness task-specific insights from VLMs. Despite significant progress, current leading ETL methods tend to overfit the narrow distributions of base classes seen during training and encounter two primary challenges: (i) only utilizing uni-modal information to model task-specific knowledge; and (ii) using costly and time-consuming methods to supplement knowledge. To address these issues, we propose a Conditional Prototype Rectification Prompt Learning (CPR) method to correct the bias of base examples and augment limited data in an effective way. Specifically, we alleviate overfitting on base classes from two aspects. First, each input image acquires knowledge from both textual and visual prototypes, and then generates sample-conditional text tokens. Second, we extract utilizable knowledge from unlabeled data to further refine the prototypes. These two strategies mitigate biases stemming from base classes, yielding a more effective classifier. Extensive experiments on 11 benchmark datasets show that our CPR achieves state-of-the-art performance on both few-shot classification and base-to-new generalization tasks. Our code is available at \url{https://github.com/chenhaoxing/CPR}.
+
+
+
+
+ + ☆ Table tennis ball spin estimation with an event camera CVPR + + +
+ Spin plays a pivotal role in ball-based sports. Estimating spin becomes a key +skill due to its impact on the ball's trajectory and bouncing behavior. Spin +cannot be observed directly, making it inherently challenging to estimate. In +table tennis, the combination of high velocity and spin renders traditional low +frame rate cameras inadequate for quickly and accurately observing the ball's +logo to estimate the spin due to the motion blur. Event cameras do not suffer +as much from motion blur, thanks to their high temporal resolution. Moreover, +the sparse nature of the event stream solves communication bandwidth +limitations many frame cameras face. To the best of our knowledge, we present +the first method for table tennis spin estimation using an event camera. We use +ordinal time surfaces to track the ball and then isolate the events generated +by the logo on the ball. Optical flow is then estimated from the extracted +events to infer the ball's spin. We achieved a spin magnitude mean error of +$10.7 \pm 17.3$ rps and a spin axis mean error of $32.9 \pm 38.2\deg$ in real +time for a flying ball. + +
+
+ comment: Accepted to CVsport (CVPRW 2024) +
+
+
+
+
+ + ☆ Empowering Embodied Visual Tracking with Visual Foundation Models and + Offline RL + + +
+ Embodied visual tracking is the task of following a target object in dynamic 3D environments using an agent's egocentric vision. This is a vital and challenging skill for embodied agents. However, existing methods suffer from inefficient training and poor generalization. In this paper, we propose a novel framework that combines visual foundation models (VFM) and offline reinforcement learning (offline RL) to empower embodied visual tracking. We use a pre-trained VFM, such as ``Tracking Anything'', to extract semantic segmentation masks with text prompts. We then train a recurrent policy network with offline RL, e.g., Conservative Q-Learning, to learn from the collected demonstrations without online agent-environment interactions. To further improve the robustness and generalization of the policy network, we also introduce a mask re-targeting mechanism and a multi-level data collection strategy. In this way, we can train a robust tracker within an hour on a consumer-level GPU, e.g., an Nvidia RTX 3090. Such efficiency is unprecedented for RL-based visual tracking methods. We evaluate our tracker on several high-fidelity environments with challenging situations, such as distraction and occlusion. The results show that our agent outperforms state-of-the-art methods in terms of sample efficiency, robustness to distractors, and generalization to unseen scenarios and targets. We also demonstrate the transferability of the learned tracker from the virtual world to real-world scenarios.
+
+
+
+
+ + ☆ A Diffusion-based Data Generator for Training Object Recognition Models + in Ultra-Range Distance + + +
+ Object recognition, commonly performed by a camera, is a fundamental requirement for robots to complete complex tasks. Some tasks require recognizing objects far from the robot's camera. A challenging example is Ultra-Range Gesture Recognition (URGR) in human-robot interaction where the user exhibits directive gestures at a distance of up to 25~m from the robot. However, training a model to recognize hardly visible objects located in ultra-range requires an exhaustive collection of a significant amount of labeled samples. The generation of synthetic training datasets is a recent solution to the lack of real-world data, but it is unable to properly replicate the realistic visual characteristics of distant objects in images. In this letter, we propose the Diffusion in Ultra-Range (DUR) framework based on a Diffusion model to generate labeled images of distant objects in various scenes. The DUR generator receives a desired distance and class (e.g., gesture) and outputs a corresponding synthetic image. We apply DUR to train a URGR model with directive gestures in which fine details of the gesturing hand are challenging to distinguish. DUR is compared to other types of generative models, showcasing superiority both in fidelity and in recognition success rate when training a URGR model. More importantly, training a DUR model on a limited amount of real data and then using it to generate synthetic data for training a URGR model outperforms directly training the URGR model on real data. The synthetic-based URGR model is also demonstrated in gesture-based direction of a ground robot.
+
+
+
+
+ + ☆ STMixer: A One-Stage Sparse Action Detector CVPR + 2023 + + +
+ Traditional video action detectors typically adopt the two-stage pipeline, +where a person detector is first employed to generate actor boxes and then 3D +RoIAlign is used to extract actor-specific features for classification. This +detection paradigm requires multi-stage training and inference, and the feature +sampling is constrained inside the box, failing to effectively leverage richer +context information outside. Recently, a few query-based action detectors have +been proposed to predict action instances in an end-to-end manner. However, +they still lack adaptability in feature sampling and decoding, thus suffering +from the issues of inferior performance or slower convergence. In this paper, +we propose two core designs for a more flexible one-stage sparse action +detector. First, we present a query-based adaptive feature sampling module, +which endows the detector with the flexibility of mining a group of +discriminative features from the entire spatio-temporal domain. Second, we +devise a decoupled feature mixing module, which dynamically attends to and +mixes video features along the spatial and temporal dimensions respectively for +better feature decoding. Based on these designs, we instantiate two detection +pipelines, that is, STMixer-K for keyframe action detection and STMixer-T for +action tubelet detection. Without bells and whistles, our STMixer detectors +obtain state-of-the-art results on five challenging spatio-temporal action +detection benchmarks for keyframe action detection or action tube detection. + +
+
+ comment: Extended version of the paper arXiv:2303.15879 presented at CVPR + 2023. Accepted by TPAMI 2024 +
+
+
+
+
+ + ☆ Video2Game: Real-time, Interactive, Realistic and Browser-Compatible + Environment from a Single Video CVPR 2024 + + +
+ Creating high-quality and interactive virtual environments, such as games and simulators, often involves complex and costly manual modeling processes. In this paper, we present Video2Game, a novel approach that automatically converts videos of real-world scenes into realistic and interactive game environments. At the heart of our system are three core components: (i) a neural radiance fields (NeRF) module that effectively captures the geometry and visual appearance of the scene; (ii) a mesh module that distills the knowledge from NeRF for faster rendering; and (iii) a physics module that models the interactions and physical dynamics among the objects. By following the carefully designed pipeline, one can construct an interactable and actionable digital replica of the real world. We benchmark our system on both indoor and large-scale outdoor scenes. We show that we can not only produce highly-realistic renderings in real-time, but also build interactive games on top.
+
+ comment: CVPR 2024. Project page (with code): https://video2game.github.io/ +
+
+
+
+
+ + ☆ Digging into contrastive learning for robust depth estimation with + diffusion models + + +
+ Recently, diffusion-based depth estimation methods have drawn widespread attention due to their elegant denoising patterns and promising performance. However, they are typically unreliable under adverse conditions prevalent in real-world scenarios, such as rain, snow, etc. In this paper, we propose a novel robust depth estimation method called D4RD, featuring a custom contrastive learning mode tailored for diffusion models to mitigate performance degradation in complex environments. Concretely, we integrate the strength of knowledge distillation into contrastive learning, building the `trinity' contrastive scheme. This scheme utilizes the sampled noise of the forward diffusion process as a natural reference, guiding the predicted noise in diverse scenes toward a more stable and precise optimum. Moreover, we extend noise-level trinity to encompass more generic feature and image levels, establishing a multi-level contrast to distribute the burden of robust perception across the overall network. Before addressing complex scenarios, we enhance the stability of the baseline diffusion model with three straightforward yet effective improvements, which facilitate convergence and remove depth outliers. Extensive experiments demonstrate that D4RD surpasses existing state-of-the-art solutions on synthetic corruption datasets and real-world weather conditions. The code for D4RD will be made available for further exploration and adoption.
+
+ comment: 8 pages,6 figures +
+
+
+
+
+ + ☆ Interaction as Explanation: A User Interaction-based Method for + Explaining Image Classification Models + + +
+ In computer vision, explainable AI (xAI) methods seek to mitigate the +'black-box' problem by making the decision-making process of deep learning +models more interpretable and transparent. Traditional xAI methods concentrate +on visualizing input features that influence model predictions, providing +insights primarily suited for experts. In this work, we present an +interaction-based xAI method that enhances user comprehension of image +classification models through their interaction. Thus, we developed a web-based +prototype allowing users to modify images via painting and erasing, thereby +observing changes in classification results. Our approach enables users to +discern critical features influencing the model's decision-making process, +aligning their mental models with the model's logic. Experiments conducted with +five images demonstrate the potential of the method to reveal feature +importance through user interaction. Our work contributes a novel perspective +to xAI by centering on end-user engagement and understanding, paving the way +for more intuitive and accessible explainability in AI systems. + +
+
+ comment: 5 pages, 2 figures, 1 table +
+
+
+
+
+ + ☆ A Recipe for CAC: Mosaic-based Generalized Loss for Improved + Class-Agnostic Counting + + +
+ Class agnostic counting (CAC) is a vision task that can be used to count the total occurrence number of any given reference objects in the query image. The task is usually formulated as a density map estimation problem through similarity computation among a few image samples of the reference object and the query image. In this paper, we point out a severe issue of the existing CAC framework: Given a multi-class setting, models don't consider reference images and instead blindly match all dominant objects in the query image. Moreover, the current evaluation metrics and dataset cannot be used to faithfully assess the model's generalization performance and robustness. To this end, we discover that the combination of mosaic augmentation with generalized loss is essential for addressing the aforementioned tendency of CAC models to count the majority (i.e., dominant) objects regardless of the references. Furthermore, we introduce a new evaluation protocol and metrics for resolving the problem behind the existing CAC evaluation scheme and better benchmarking CAC models in a fairer manner. In addition, extensive evaluation results demonstrate that our proposed recipe can consistently improve the performance of different CAC models. The code will be released upon acceptance.
+
+
+
+
+ + ☆ 3D Face Tracking from 2D Video through Iterative Dense UV to Image Flow CVPR 2024 + + +
+ When working with 3D facial data, improving fidelity and avoiding the uncanny +valley effect is critically dependent on accurate 3D facial performance +capture. Because such methods are expensive and due to the widespread +availability of 2D videos, recent methods have focused on how to perform +monocular 3D face tracking. However, these methods often fall short in +capturing precise facial movements due to limitations in their network +architecture, training, and evaluation processes. Addressing these challenges, +we propose a novel face tracker, FlowFace, that introduces an innovative 2D +alignment network for dense per-vertex alignment. Unlike prior work, FlowFace +is trained on high-quality 3D scan annotations rather than weak supervision or +synthetic data. Our 3D model fitting module jointly fits a 3D face model from +one or many observations, integrating existing neutral shape priors for +enhanced identity and expression disentanglement and per-vertex deformations +for detailed facial feature reconstruction. Additionally, we propose a novel +metric and benchmark for assessing tracking accuracy. Our method exhibits +superior performance on both custom and publicly available benchmarks. We +further validate the effectiveness of our tracker by generating high-quality 3D +data from 2D videos, which leads to performance gains on downstream tasks. + +
+
+ comment: 22 pages, 25 figures, to be published in CVPR 2024 +
+
+
+
+
+ + ☆ Neighbour-level Message Interaction Encoding for Improved Representation + Learning on Graphs + + +
+ Message passing has become the dominant framework in graph representation learning. The essential idea of the message-passing framework is to update node embeddings based on the information aggregated from local neighbours. However, most existing aggregation methods have not encoded neighbour-level message interactions into the aggregated message, resulting in information loss during embedding generation. This loss can accumulate and become more severe as more layers are added to the graph network model. To address this issue, we propose a neighbour-level message interaction information encoding method for improving graph representation learning. For the messages aggregated at a node, we explicitly generate an encoding between each message and the remaining messages using an encoding function. We then aggregate these learned encodings and take the sum of the aggregated encoding and the aggregated message to update the embedding for the node. In this way, neighbour-level message interaction information is integrated into the generated node embeddings. The proposed encoding method is generic and can be integrated into message-passing graph convolutional networks. Extensive experiments are conducted on six popular benchmark datasets across four highly-demanded tasks. The results show that integrating neighbour-level message interactions improves the performance of the base models, advancing the state-of-the-art results for representation learning over graphs.
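+ A minimal sketch of the described update, assuming a dense adjacency matrix, sum aggregation and a linear layer as the interaction encoding function (these choices are illustrative, not the paper's exact architecture): each incoming message is paired with the sum of the remaining messages, the resulting encodings are aggregated, and their sum with the aggregated message yields the new node embedding.
```python
import torch
import torch.nn as nn

class NeighbourInteractionConv(nn.Module):
    """Sketch: neighbour-level message interaction encoding (dense adjacency)."""

    def __init__(self, dim: int):
        super().__init__()
        self.msg = nn.Linear(dim, dim)            # per-neighbour message
        self.interact = nn.Linear(2 * dim, dim)   # encoding of (message, remaining messages)

    def forward(self, x: torch.Tensor, adj: torch.Tensor) -> torch.Tensor:
        # x: (N, dim) node features, adj: (N, N) 0/1 adjacency (adj[i, j] = 1 if j sends to i)
        m = self.msg(x)                                   # (N, dim) message from each node
        agg_msg = adj @ m                                 # (N, dim) summed incoming messages
        rest = agg_msg.unsqueeze(1) - m.unsqueeze(0)      # rest[i, j] = incoming sum at i minus m_j
        pair = torch.cat([m.unsqueeze(0).expand(x.size(0), -1, -1), rest], dim=-1)
        enc = self.interact(pair) * adj.unsqueeze(-1)     # encode interactions, mask non-edges
        agg_enc = enc.sum(dim=1)                          # aggregate the learned encodings
        return agg_msg + agg_enc                          # sum with the aggregated message

x = torch.randn(5, 16)
adj = (torch.rand(5, 5) > 0.5).float()
print(NeighbourInteractionConv(16)(x, adj).shape)         # torch.Size([5, 16])
```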
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ A Universal Protocol to Benchmark Camera Calibration for Sports + + +
+ Camera calibration is a crucial component in the realm of sports analytics, +as it serves as the foundation to extract 3D information out of the broadcast +images. Despite the significance of camera calibration research in sports +analytics, progress is impeded by outdated benchmarking criteria. Indeed, the +annotation data and evaluation metrics provided by most currently available +benchmarks strongly favor and incite the development of sports field +registration methods, i.e. methods estimating homographies that map the sports +field plane to the image plane. However, such homography-based methods are +doomed to overlook the broader capabilities of camera calibration in bridging +the 3D world to the image. In particular, real-world non-planar sports field +elements (such as goals, corner flags, baskets, ...) and image distortion +caused by broadcast camera lenses are out of the scope of sports field +registration methods. To overcome these limitations, we designed a new +benchmarking protocol, named ProCC, based on two principles: (1) the protocol +should be agnostic to the camera model chosen for a camera calibration method, +and (2) the protocol should fairly evaluate camera calibration methods using +the reprojection of arbitrary yet accurately known 3D objects. Indirectly, we +also provide insights into the metric used in SoccerNet-calibration, which +solely relies on image annotation data of viewed 3D objects as ground truth, +thus implementing our protocol. With experiments on the World Cup 2014, CARWC, +and SoccerNet datasets, we show that our benchmarking protocol provides fairer +evaluations of camera calibration methods. By defining our requirements for +proper benchmarking, we hope to pave the way for a new stage in camera +calibration for sports applications with high accuracy standards. + +
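+ The core quantity behind such a protocol is the reprojection error of accurately known 3D objects under the estimated camera. A minimal pinhole sketch follows; it ignores the lens distortion that the benchmark explicitly accounts for, and all values are toy assumptions.
```python
import numpy as np

def reprojection_error_px(K, R, t, pts3d, pts2d):
    """Mean reprojection error (pixels) of accurately known 3D points under a pinhole model.

    K: (3, 3) intrinsics, R: (3, 3) rotation, t: (3,) translation,
    pts3d: (N, 3) world points, pts2d: (N, 2) annotated image points.
    """
    cam = R @ pts3d.T + t.reshape(3, 1)        # world -> camera coordinates
    proj = K @ cam                             # camera -> homogeneous pixel coordinates
    uv = (proj[:2] / proj[2]).T                # (N, 2) projected pixel locations
    return float(np.linalg.norm(uv - pts2d, axis=1).mean())

K = np.array([[1000.0, 0.0, 960.0], [0.0, 1000.0, 540.0], [0.0, 0.0, 1.0]])
R, t = np.eye(3), np.array([0.0, 0.0, 10.0])
pts3d = np.array([[0.0, 0.0, 0.0], [1.0, 0.5, 0.0]])     # e.g. non-planar field elements (toy values)
pts2d = np.array([[960.0, 540.0], [1060.0, 590.0]])
print(reprojection_error_px(K, R, t, pts3d, pts2d))      # 0.0 for this consistent toy setup
```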
+
+ comment: 12 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ TextCoT: Zoom In for Enhanced Multimodal Text-Rich Image Understanding + + +
+ The advent of Large Multimodal Models (LMMs) has sparked a surge in research +aimed at harnessing their remarkable reasoning abilities. However, for +understanding text-rich images, challenges persist in fully leveraging the +potential of LMMs, and existing methods struggle with effectively processing +high-resolution images. In this work, we propose TextCoT, a novel +Chain-of-Thought framework for text-rich image understanding. TextCoT utilizes +the captioning ability of LMMs to grasp the global context of the image and the +grounding capability to examine local textual regions. This allows for the +extraction of both global and local visual information, facilitating more +accurate question-answering. Technically, TextCoT consists of three stages, +including image overview, coarse localization, and fine-grained observation. +The image overview stage provides a comprehensive understanding of the global +scene information, and the coarse localization stage approximates the image +area containing the answer based on the question asked. Then, integrating the +obtained global image descriptions, the final stage further examines specific +regions to provide accurate answers. Our method is free of extra training, +offering immediate plug-and-play functionality. Extensive experiments are +conducted on a series of text-rich image question-answering benchmark datasets +based on several advanced LMMs, and the results demonstrate the effectiveness +and strong generalization ability of our method. Code is available at +https://github.com/bzluan/TextCoT. + +
+
+
+
+
+ + ☆ NTIRE 2024 Challenge on Image Super-Resolution ($\times$4): Methods and + Results + + +
+ This paper reviews the NTIRE 2024 challenge on image super-resolution +($\times$4), highlighting the solutions proposed and the outcomes obtained. The +challenge involves generating corresponding high-resolution (HR) images, +magnified by a factor of four, from low-resolution (LR) inputs using prior +information. The LR images originate from bicubic downsampling degradation. The +aim of the challenge is to obtain designs/solutions with the most advanced SR +performance, with no constraints on computational resources (e.g., model size +and FLOPs) or training data. The track of this challenge assesses performance +with the PSNR metric on the DIV2K testing dataset. The competition attracted +199 registrants, with 20 teams submitting valid entries. This collective +endeavour not only pushes the boundaries of performance in single-image SR but +also offers a comprehensive overview of current trends in this field. + +
+
+ comment: NTIRE 2024 webpage: https://cvlai.net/ntire/2024. Code: + https://github.com/zhengchen1999/NTIRE2024_ImageSR_x4 +
+
+
+
+
+ + ☆ The Devil is in the Few Shots: Iterative Visual Knowledge Completion for + Few-shot Learning ECCV 2024 + + +
+ Contrastive Language-Image Pre-training (CLIP) has shown powerful zero-shot learning performance. Few-shot learning aims to further enhance the transfer capability of CLIP by giving few images in each class, aka 'few shots'. Most existing methods either implicitly learn from the few shots by incorporating learnable prompts or adapters, or explicitly embed them in a cache model for inference. However, the narrow distribution of few shots often contains incomplete class information, leading to biased visual knowledge with high risk of misclassification. To tackle this problem, recent methods propose to supplement visual knowledge by generative models or extra databases, which can be costly and time-consuming. In this paper, we propose an Iterative Visual Knowledge CompLetion (KCL) method to complement visual knowledge by properly taking advantage of unlabeled samples without access to any auxiliary or synthetic data. Specifically, KCL first measures the similarities between unlabeled samples and each category. Then, the samples with the top confidence for each category are selected and collected by a designed confidence criterion. Finally, the collected samples are treated as labeled ones and added to the few shots to jointly re-estimate the remaining unlabeled ones. The above procedure is repeated for a certain number of iterations, with more and more samples being collected until convergence, ensuring a progressive and robust knowledge completion process. Extensive experiments on 11 benchmark datasets demonstrate the effectiveness and efficiency of KCL as a plug-and-play module under both few-shot and zero-shot learning settings. Code is available at https://github.com/Mark-Sky/KCL.
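+ A toy sketch of the iterative completion loop described above; cosine similarity to the class mean stands in for the paper's designed confidence criterion, and the top-1-per-class selection and function name are assumptions for illustration.
```python
import numpy as np

def kcl_style_completion(few_feats, few_labels, unlabeled, n_classes, iters=5):
    """Toy iterative visual-knowledge completion in the spirit of KCL.

    few_feats: (M, D) L2-normalized few-shot features with labels few_labels (class ids),
    unlabeled: (U, D) L2-normalized unlabeled features. Each iteration promotes the most
    confident unlabeled sample per class to a labeled shot.
    """
    feats, labels = few_feats.copy(), few_labels.copy()
    pool = list(range(len(unlabeled)))
    for _ in range(iters):
        protos = np.stack([feats[labels == c].mean(0) for c in range(n_classes)])
        protos /= np.linalg.norm(protos, axis=1, keepdims=True)
        for c in range(n_classes):
            if not pool:
                break
            sims = unlabeled[pool] @ protos[c]              # confidence of pool samples for class c
            best = pool[int(np.argmax(sims))]
            feats = np.vstack([feats, unlabeled[best:best + 1]])
            labels = np.append(labels, c)
            pool.remove(best)
    return feats, labels

rng = np.random.default_rng(0)
f = rng.normal(size=(4, 8)); f /= np.linalg.norm(f, axis=1, keepdims=True)
u = rng.normal(size=(20, 8)); u /= np.linalg.norm(u, axis=1, keepdims=True)
feats, labels = kcl_style_completion(f, np.array([0, 0, 1, 1]), u, n_classes=2)
print(feats.shape, labels)                                  # (14, 8) and the augmented labels
```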
+
+ comment: 26 pages, submitted to ECCV 2024 +
+
+
+
+
+ + ☆ RandAlign: A Parameter-Free Method for Regularizing Graph Convolutional + Networks + + +
+ Studies continually find that message-passing graph convolutional networks suffer from the over-smoothing issue. Basically, over-smoothing refers to the phenomenon that the learned embeddings for all nodes can become very similar to one another, and therefore uninformative, after repeatedly applying message passing iterations. Intuitively, we can expect the generated embeddings to become smoother layer by layer, that is, each layer of graph convolution generates a smoothed version of the embeddings compared to those generated by the previous layer. Based on this intuition, we propose RandAlign, a stochastic regularization method for graph convolutional networks. The idea of RandAlign is to randomly align the learned embedding for each node with that of the previous layer using random interpolation in each graph convolution layer. Through alignment, the smoothness of the generated embeddings is explicitly reduced. To better maintain the benefit yielded by the graph convolution, in the alignment step we first scale the embedding of the previous layer to the same norm as the generated embedding and then perform random interpolation for aligning the generated embedding. RandAlign is a parameter-free method and can be directly applied without introducing additional trainable weights or hyper-parameters. We experimentally evaluate RandAlign on different graph domain tasks on seven benchmark datasets. The experimental results show that RandAlign is a general method that improves the generalization performance of various graph convolutional network models and also improves the numerical stability of optimization, advancing the state-of-the-art performance for graph representation learning.
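+ A minimal sketch of the per-layer alignment step, assuming a per-node interpolation weight drawn uniformly from [0, 1) (the exact sampling scheme used by RandAlign may differ):
```python
import torch

def rand_align(h_new: torch.Tensor, h_prev: torch.Tensor) -> torch.Tensor:
    """RandAlign-style step: rescale the previous-layer embedding to the new embedding's
    norm, then randomly interpolate between the two (no trainable parameters)."""
    eps = 1e-12
    scale = h_new.norm(dim=-1, keepdim=True) / (h_prev.norm(dim=-1, keepdim=True) + eps)
    h_prev_scaled = h_prev * scale                                 # norm-matched previous embedding
    alpha = torch.rand(h_new.size(0), 1, device=h_new.device)      # per-node weight in [0, 1)
    return alpha * h_new + (1.0 - alpha) * h_prev_scaled

# usage inside a GCN forward pass would look like: h = rand_align(conv(h, adj), h)
h_prev, h_new = torch.randn(6, 32), torch.randn(6, 32)
print(rand_align(h_new, h_prev).shape)                             # torch.Size([6, 32])
```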
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Contrastive Pretraining for Visual Concept Explanations of Socioeconomic + Outcomes + + +
+ Predicting socioeconomic indicators from satellite imagery with deep learning +has become an increasingly popular research direction. Post-hoc concept-based +explanations can be an important step towards broader adoption of these models +in policy-making as they enable the interpretation of socioeconomic outcomes +based on visual concepts that are intuitive to humans. In this paper, we study +the interplay between representation learning using an additional task-specific +contrastive loss and post-hoc concept explainability for socioeconomic studies. +Our results on two different geographical locations and tasks indicate that the +task-specific pretraining imposes a continuous ordering of the latent space +embeddings according to the socioeconomic outcomes. This improves the model's +interpretability as it enables the latent space of the model to associate urban +concepts with continuous intervals of socioeconomic outcomes. Further, we +illustrate how analyzing the model's conceptual sensitivity for the intervals +of socioeconomic outcomes can shed light on new insights for urban studies. + +
+
+
+
+
+ + ☆ Deep Learning-Based Segmentation of Tumors in PET/CT Volumes: Benchmark + of Different Architectures and Training Strategies + + +
+ Cancer is one of the leading causes of death globally, and early diagnosis is crucial for patient survival. Deep learning algorithms have great potential for automatic cancer analysis. Artificial intelligence has achieved high performance in recognizing and segmenting single lesions. However, diagnosing multiple lesions remains a challenge. This study examines and compares various neural network architectures and training strategies for the automatic segmentation of cancer lesions using PET/CT images from the head, neck, and whole body. The authors analyzed datasets from the AutoPET and HECKTOR challenges, exploring popular single-step segmentation architectures and presenting a two-step approach. The results indicate that the V-Net and nnU-Net models were the most effective for their respective datasets. The results for the HECKTOR dataset ranged from 0.75 to 0.76 for the aggregated Dice coefficient. Eliminating cancer-free cases from the AutoPET dataset was found to improve the performance of most models. In the case of AutoPET data, the average segmentation efficiency after training only on images containing cancer lesions increased from 0.55 to 0.66 for the classic Dice coefficient and from 0.65 to 0.73 for the aggregated Dice coefficient. The research demonstrates the potential of artificial intelligence in precise oncological diagnostics and may contribute to the development of more targeted and effective cancer assessment techniques.
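+ For reference, the two reported metrics can be computed as sketched below; the aggregated variant shown here pools intersections and foreground volumes over all cases before dividing, which is one common formulation and may differ in detail from the challenge definitions.
```python
import numpy as np

def dice(pred: np.ndarray, gt: np.ndarray, eps: float = 1e-8) -> float:
    """Classic Dice coefficient for one binary segmentation."""
    inter = np.logical_and(pred, gt).sum()
    return float((2.0 * inter + eps) / (pred.sum() + gt.sum() + eps))

def aggregated_dice(preds, gts, eps: float = 1e-8) -> float:
    """Aggregated Dice: pool intersections and foreground volumes over all cases before
    dividing, so cases with little or no tumour do not dominate the average."""
    inter = sum(np.logical_and(p, g).sum() for p, g in zip(preds, gts))
    total = sum(p.sum() + g.sum() for p, g in zip(preds, gts))
    return float((2.0 * inter + eps) / (total + eps))

a = np.zeros((4, 4), bool); a[:2, :2] = True    # toy prediction
b = np.zeros((4, 4), bool); b[:2, :3] = True    # toy ground truth
print(dice(a, b), aggregated_dice([a, a], [b, b]))
```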
+
+
+
+
+ + ☆ Eyes on the Streets: Leveraging Street-Level Imaging to Model Urban + Crime Dynamics + + +
+ This study addresses the challenge of urban safety in New York City by +examining the relationship between the built environment and crime rates using +machine learning and a comprehensive dataset of street view images. We aim to +identify how urban landscapes correlate with crime statistics, focusing on the +characteristics of street views and their association with crime rates. The +findings offer insights for urban planning and crime prevention, highlighting +the potential of environmental design in enhancing public safety. + +
+
+
+
+
+ + ♻ ☆ Sparse Global Matching for Video Frame Interpolation with Large Motion CVPR 2024 + + +
+ Large motion poses a critical challenge in Video Frame Interpolation (VFI) +task. Existing methods are often constrained by limited receptive fields, +resulting in sub-optimal performance when handling scenarios with large motion. +In this paper, we introduce a new pipeline for VFI, which can effectively +integrate global-level information to alleviate issues associated with large +motion. Specifically, we first estimate a pair of initial intermediate flows +using a high-resolution feature map for extracting local details. Then, we +incorporate a sparse global matching branch to compensate for flow estimation, +which consists of identifying flaws in initial flows and generating sparse flow +compensation with a global receptive field. Finally, we adaptively merge the +initial flow estimation with global flow compensation, yielding a more accurate +intermediate flow. To evaluate the effectiveness of our method in handling +large motion, we carefully curate a more challenging subset from commonly used +benchmarks. Our method demonstrates the state-of-the-art performance on these +VFI subsets with large motion. + +
+
+ comment: Accepted by CVPR 2024. Project page: https://sgm-vfi.github.io/. + Fixed some typos in the supplementary material +
+
+
+
+
+ + ♻ ☆ Image-based Deep Learning for the time-dependent prediction of fresh + concrete properties + + +
+ Increasing the degree of digitisation and automation in the concrete production process can play a crucial role in reducing the CO$_2$ emissions that are associated with the production of concrete. In this paper, a method is presented that makes it possible to predict the properties of fresh concrete during the mixing process based on stereoscopic image sequences of the concrete's flow behaviour. A Convolutional Neural Network (CNN) is used for the prediction, which receives the images supported by information on the mix design as input. In addition, the network receives temporal information in the form of the time difference between the time at which the images are taken and the time at which the reference measurements of the concrete are carried out. With this temporal information, the network implicitly learns the time-dependent behaviour of the concrete's properties. The network predicts the slump flow diameter, the yield stress and the plastic viscosity. The time-dependent prediction potentially opens up the pathway to determining the temporal development of the fresh concrete properties already during mixing. This provides a huge advantage for the concrete industry. As a result, countermeasures can be taken in a timely manner. It is shown that an approach based on depth and optical flow images, supported by information on the mix design, achieves the best results.
+
+
+
+
+ + ♻ ☆ Human vs. LMMs: Exploring the Discrepancy in Emoji Interpretation and + Usage in Digital Communication + + +
+ Leveraging Large Multimodal Models (LMMs) to simulate human behaviors when +processing multimodal information, especially in the context of social media, +has garnered immense interest due to its broad potential and far-reaching +implications. Emojis, as one of the most unique aspects of digital +communication, are pivotal in enriching and often clarifying the emotional and +tonal dimensions. Yet, there is a notable gap in understanding how these +advanced models, such as GPT-4V, interpret and employ emojis in the nuanced +context of online interaction. This study intends to bridge this gap by +examining the behavior of GPT-4V in replicating human-like use of emojis. The +findings reveal a discernible discrepancy between human and GPT-4V behaviors, +likely due to the subjective nature of human interpretation and the limitations +of GPT-4V's English-centric training, suggesting cultural biases and inadequate +representation of non-English cultures. + +
+
+ comment: Accepted for publication in ICWSM 2024 +
+
+
+
+
+ + ♻ ☆ Physics-guided Shape-from-Template: Monocular Video Perception through + Neural Surrogate Models + + +
+ 3D reconstruction of dynamic scenes is a long-standing problem in computer graphics and becomes increasingly difficult the less information is available. Shape-from-Template (SfT) methods aim to reconstruct a template-based geometry from RGB images or video sequences, often leveraging just a single monocular camera without depth information, such as regular smartphone recordings. Unfortunately, existing reconstruction methods are either unphysical and noisy or slow in optimization. To solve this problem, we propose a novel SfT reconstruction algorithm for cloth using a pre-trained neural surrogate model that is fast to evaluate, stable, and produces smooth reconstructions due to a regularizing physics simulation. Differentiable rendering of the simulated mesh enables pixel-wise comparisons between the reconstruction and a target video sequence that can be used for a gradient-based optimization procedure to extract not only shape information but also physical parameters such as stretching, shearing, or bending stiffness of the cloth. This allows retaining a precise, stable, and smooth reconstructed geometry while reducing the runtime by a factor of 400-500 compared to $\phi$-SfT, a state-of-the-art physics-based SfT approach.
+
+
+
+
+ + ♻ ☆ Towards Variable and Coordinated Holistic Co-Speech Motion Generation CVPR 2024 + + +
+ This paper addresses the problem of generating lifelike holistic co-speech +motions for 3D avatars, focusing on two key aspects: variability and +coordination. Variability allows the avatar to exhibit a wide range of motions +even with similar speech content, while coordination ensures a harmonious +alignment among facial expressions, hand gestures, and body poses. We aim to +achieve both with ProbTalk, a unified probabilistic framework designed to +jointly model facial, hand, and body movements in speech. ProbTalk builds on +the variational autoencoder (VAE) architecture and incorporates three core +designs. First, we introduce product quantization (PQ) to the VAE, which +enriches the representation of complex holistic motion. Second, we devise a +novel non-autoregressive model that embeds 2D positional encoding into the +product-quantized representation, thereby preserving essential structure +information of the PQ codes. Last, we employ a secondary stage to refine the +preliminary prediction, further sharpening the high-frequency details. Coupling +these three designs enables ProbTalk to generate natural and diverse holistic +co-speech motions, outperforming several state-of-the-art methods in +qualitative and quantitative evaluations, particularly in terms of realism. Our +code and model will be released for research purposes at +https://feifeifeiliu.github.io/probtalk/. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Dancing with Still Images: Video Distillation via Static-Dynamic + Disentanglement CVPR 2024 + + +
+ Recently, dataset distillation has paved the way towards efficient machine +learning, especially for image datasets. However, the distillation for videos, +characterized by an exclusive temporal dimension, remains an underexplored +domain. In this work, we provide the first systematic study of video +distillation and introduce a taxonomy to categorize temporal compression. Our +investigation reveals that the temporal information is usually not well learned +during distillation, and the temporal dimension of synthetic data contributes +little. The observations motivate our unified framework of disentangling the +dynamic and static information in the videos. It first distills the videos into +still images as static memory and then compensates the dynamic and motion +information with a learnable dynamic memory block. Our method achieves +state-of-the-art on video datasets at different scales, with a notably smaller +memory storage budget. Our code is available at +https://github.com/yuz1wan/video_distillation. + +
+
+ comment: CVPR 2024, project page: https://mvig-rhos.com/video-distill +
+
+
+
+
+ + ♻ ☆ A Survey of Neural Network Robustness Assessment in Image Recognition + + +
+ In recent years, there has been significant attention given to the robustness +assessment of neural networks. Robustness plays a critical role in ensuring +reliable operation of artificial intelligence (AI) systems in complex and +uncertain environments. Deep learning's robustness problem is particularly +significant, highlighted by the discovery of adversarial attacks on image +classification models. Researchers have dedicated efforts to evaluate +robustness in diverse perturbation conditions for image recognition tasks. +Robustness assessment encompasses two main techniques: robustness verification/ +certification for deliberate adversarial attacks and robustness testing for +random data corruptions. In this survey, we present a detailed examination of +both adversarial robustness (AR) and corruption robustness (CR) in neural +network assessment. Analyzing current research papers and standards, we provide +an extensive overview of robustness assessment in image recognition. Three +essential aspects are analyzed: concepts, metrics, and assessment methods. We +investigate the perturbation metrics and range representations used to measure +the degree of perturbations on images, as well as the robustness metrics +specifically for the robustness conditions of classification models. The +strengths and limitations of the existing methods are also discussed, and some +potential directions for future research are provided. + +
+
+ comment: Corrected typos and grammatical errors in Section 5 +
+
+
+
+
+ + ♻ ☆ SyncDreamer: Generating Multiview-consistent Images from a Single-view + Image ICLR 2024 + + +
+ In this paper, we present a novel diffusion model called SyncDreamer that generates multiview-consistent images from a single-view image. Using pretrained large-scale 2D diffusion models, recent work Zero123 demonstrates the ability to generate plausible novel views from a single-view image of an object. However, maintaining consistency in geometry and colors for the generated images remains a challenge. To address this issue, we propose a synchronized multiview diffusion model that models the joint probability distribution of multiview images, enabling the generation of multiview-consistent images in a single reverse process. SyncDreamer synchronizes the intermediate states of all the generated images at every step of the reverse process through a 3D-aware feature attention mechanism that correlates the corresponding features across different views. Experiments show that SyncDreamer generates images with high consistency across different views, thus making it well-suited for various 3D generation tasks such as novel-view-synthesis, text-to-3D, and image-to-3D.
+
+ comment: ICLR 2024 Spotlight. Project page: + https://liuyuan-pal.github.io/SyncDreamer/ Code: + https://github.com/liuyuan-pal/SyncDreamer +
+
+
+
+
+ + ♻ ☆ Mind-to-Image: Projecting Visual Mental Imagination of the Brain from + fMRI + + +
+ The reconstruction of images observed by subjects from fMRI data collected during visual stimuli has made significant strides in the past decade, thanks to the availability of extensive fMRI datasets and advancements in generative models for image generation. However, the application of visual reconstruction has remained limited. Reconstructing visual imagination presents a greater challenge, with potentially revolutionary applications ranging from aiding individuals with disabilities to verifying witness accounts in court. The primary hurdles in this field are the absence of data collection protocols for visual imagery and the lack of datasets on the subject. Traditionally, fMRI-to-image relies on data collected from subjects exposed to visual stimuli, which poses issues for generating visual imagery, given the difference in brain activity between visual stimulation and visual imagery. For the first time, we have compiled a substantial dataset (around 6h of scans) on visual imagery along with a proposed data collection protocol. We then train a modified version of an fMRI-to-image model and demonstrate the feasibility of reconstructing images from two modes of imagination: from memory and from pure imagination. This marks an important step towards creating a technology that allows direct reconstruction of visual imagery.
+
+ comment: Pre-print to be updated +
+
+
+
+
+ + ♻ ☆ Are NeRFs ready for autonomous driving? Towards closing the + real-to-simulation gap CVPR 2024 + + +
+ Neural Radiance Fields (NeRFs) have emerged as promising tools for advancing autonomous driving (AD) research, offering scalable closed-loop simulation and data augmentation capabilities. However, to trust the results achieved in simulation, one needs to ensure that AD systems perceive real and rendered data in the same way. Although the performance of rendering methods is increasing, many scenarios will remain inherently challenging to reconstruct faithfully. To this end, we propose a novel perspective for addressing the real-to-simulated data gap. Rather than solely focusing on improving rendering fidelity, we explore simple yet effective methods to enhance perception model robustness to NeRF artifacts without compromising performance on real data. Moreover, we conduct the first large-scale investigation into the real-to-simulated data gap in an AD setting using a state-of-the-art neural rendering technique. Specifically, we evaluate object detectors and an online mapping model on real and simulated data, and study the effects of different fine-tuning strategies. Our results show notable improvements in model robustness to simulated data, even improving real-world performance in some cases. Last, we delve into the correlation between the real-to-simulated gap and image reconstruction metrics, identifying FID and LPIPS as strong indicators. See https://research.zenseact.com/publications/closing-real2sim-gap for our project page.
+
+ comment: Accepted at Workshop on Autonomous Driving, CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Geometrically-driven Aggregation for Zero-shot 3D Point Cloud + Understanding CVPR 2024 + + +
+ Zero-shot 3D point cloud understanding can be achieved via 2D Vision-Language Models (VLMs). Existing strategies directly map Vision-Language Models from 2D pixels of rendered or captured views to 3D points, overlooking the inherent and expressible point cloud geometric structure. Geometrically similar or close regions can be exploited for bolstering point cloud understanding as they are likely to share semantic information. To this end, we introduce the first training-free aggregation technique that leverages the point cloud's 3D geometric structure to improve the quality of the transferred Vision-Language Models. Our approach operates iteratively, performing local-to-global aggregation based on geometric and semantic point-level reasoning. We benchmark our approach on three downstream tasks, including classification, part segmentation, and semantic segmentation, with a variety of datasets representing both synthetic/real-world, and indoor/outdoor scenarios. Our approach achieves new state-of-the-art results in all benchmarks. Code and dataset are available at https://luigiriz.github.io/geoze-website/
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Flattening the Parent Bias: Hierarchical Semantic Segmentation in the + Poincaré Ball + + +
+ Hierarchy is a natural representation of semantic taxonomies, including the +ones routinely used in image segmentation. Indeed, recent work on semantic +segmentation reports improved accuracy from supervised training leveraging +hierarchical label structures. Encouraged by these results, we revisit the +fundamental assumptions behind that work. We postulate and then empirically +verify that the reasons for the observed improvement in segmentation accuracy +may be entirely unrelated to the use of the semantic hierarchy. To demonstrate +this, we design a range of cross-domain experiments with a representative +hierarchical approach. We find that on the new testing domains, a flat +(non-hierarchical) segmentation network, in which the parents are inferred from +the children, has superior segmentation accuracy to the hierarchical approach +across the board. Complementing these findings and inspired by the intrinsic +properties of hyperbolic spaces, we study a more principled approach to +hierarchical segmentation using the Poincar\'e ball model. The hyperbolic +representation largely outperforms the previous (Euclidean) hierarchical +approach as well and is on par with our flat Euclidean baseline in terms of +segmentation accuracy. However, it additionally exhibits surprisingly strong +calibration quality of the parent nodes in the semantic hierarchy, especially +on the more challenging domains. Our combined analysis suggests that the +established practice of hierarchical segmentation may be limited to in-domain +settings, whereas flat classifiers generalize substantially better, especially +if they are modeled in the hyperbolic space. + +
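+ For context, the geodesic distance in the Poincaré ball model mentioned above is d(x, y) = arccosh(1 + 2||x - y||^2 / ((1 - ||x||^2)(1 - ||y||^2))); a small numerical illustration (not part of the paper's segmentation model):
```python
import numpy as np

def poincare_distance(x: np.ndarray, y: np.ndarray) -> float:
    """Geodesic distance between two points strictly inside the unit Poincare ball."""
    sq = np.sum((x - y) ** 2)
    denom = (1.0 - np.sum(x ** 2)) * (1.0 - np.sum(y ** 2))
    return float(np.arccosh(1.0 + 2.0 * sq / denom))

# Points near the boundary are exponentially far apart, which is what makes the ball a
# natural embedding space for trees and label hierarchies.
print(poincare_distance(np.array([0.0, 0.0]), np.array([0.5, 0.0])))   # ln(3) ~= 1.0986
print(poincare_distance(np.array([0.9, 0.0]), np.array([0.0, 0.9])))   # much larger
```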
+
+
+
+
+ + ♻ ☆ TTK is Getting MPI-Ready + + +
+ This system paper documents the technical foundations for the extension of +the Topology ToolKit (TTK) to distributed-memory parallelism with the Message +Passing Interface (MPI). While several recent papers introduced topology-based +approaches for distributed-memory environments, these were reporting +experiments obtained with tailored, mono-algorithm implementations. In +contrast, we describe in this paper a versatile approach (supporting both +triangulated domains and regular grids) for the support of topological analysis +pipelines, i.e. a sequence of topological algorithms interacting together. +While developing this extension, we faced several algorithmic and software +engineering challenges, which we document in this paper. We describe an MPI +extension of TTK's data structure for triangulation representation and +traversal, a central component to the global performance and generality of +TTK's topological implementations. We also introduce an intermediate interface +between TTK and MPI, both at the global pipeline level, and at the fine-grain +algorithmic level. We provide a taxonomy for the distributed-memory topological +algorithms supported by TTK, depending on their communication needs and provide +examples of hybrid MPI+thread parallelizations. Performance analyses show that +parallel efficiencies range from 20% to 80% (depending on the algorithms), and +that the MPI-specific preconditioning introduced by our framework induces a +negligible computation time overhead. We illustrate the new distributed-memory +capabilities of TTK with an example of advanced analysis pipeline, combining +multiple algorithms, run on the largest publicly available dataset we have +found (120 billion vertices) on a cluster with 64 nodes (for a total of 1536 +cores). Finally, we provide a roadmap for the completion of TTK's MPI +extension, along with generic recommendations for each algorithm communication +category. + +
+
+ comment: 18 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ CrossKD: Cross-Head Knowledge Distillation for Object Detection + + +
+ Knowledge Distillation (KD) has been validated as an effective model +compression technique for learning compact object detectors. Existing +state-of-the-art KD methods for object detection are mostly based on feature +imitation. In this paper, we present a general and effective prediction +mimicking distillation scheme, called CrossKD, which delivers the intermediate +features of the student's detection head to the teacher's detection head. The +resulting cross-head predictions are then forced to mimic the teacher's +predictions. This manner relieves the student's head from receiving +contradictory supervision signals from the annotations and the teacher's +predictions, greatly improving the student's detection performance. Moreover, +as mimicking the teacher's predictions is the target of KD, CrossKD offers more +task-oriented information in contrast with feature imitation. On MS COCO, with +only prediction mimicking losses applied, our CrossKD boosts the average +precision of GFL ResNet-50 with 1x training schedule from 40.2 to 43.7, +outperforming all existing KD methods. In addition, our method also works well +when distilling detectors with heterogeneous backbones. Code is available at +https://github.com/jbwang1997/CrossKD. + +
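+ A minimal sketch of the cross-head idea under toy assumptions (a single-branch head, a shared neck feature and an MSE mimicking loss, none of which match CrossKD's actual heads or losses): the student's intermediate head features are passed through the frozen teacher head, and the resulting cross-head predictions are trained to match the teacher's own predictions, so the gradients reach only the student.
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class Head(nn.Module):
    """Toy detection head: an intermediate conv stem followed by a prediction conv."""
    def __init__(self, c, n_out):
        super().__init__()
        self.stem = nn.Conv2d(c, c, 3, padding=1)
        self.pred = nn.Conv2d(c, n_out, 3, padding=1)

    def forward(self, x, return_mid=False):
        mid = F.relu(self.stem(x))
        return (mid, self.pred(mid)) if return_mid else self.pred(mid)

teacher, student = Head(64, 20), Head(64, 20)
for p in teacher.parameters():
    p.requires_grad_(False)                        # the teacher stays frozen

feat = torch.randn(2, 64, 32, 32)                  # shared neck feature (toy)
mid_s, pred_s = student(feat, return_mid=True)     # pred_s would get the usual detection losses
with torch.no_grad():
    pred_t = teacher(feat)                         # the teacher's own predictions
cross_pred = teacher.pred(mid_s)                   # student's intermediate features -> teacher head
kd_loss = F.mse_loss(cross_pred, pred_t)           # force the cross-head prediction to mimic them
kd_loss.backward()                                 # gradients flow into the student only
```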
+
+
+
+
+ + ♻ ☆ Z-GMOT: Zero-shot Generic Multiple Object Tracking NAACL 2024 + + +
+ Despite recent significant progress, Multi-Object Tracking (MOT) faces limitations such as reliance on prior knowledge and predefined categories and struggles with unseen objects. To address these issues, Generic Multiple Object Tracking (GMOT) has emerged as an alternative approach, requiring less prior information. However, current GMOT methods often rely on initial bounding boxes and struggle to handle variations in factors such as viewpoint, lighting, occlusion, and scale, among others. Our contributions commence with the introduction of the \textit{Referring GMOT dataset}, a collection of videos, each accompanied by detailed textual descriptions of their attributes. Subsequently, we propose $\mathtt{Z-GMOT}$, a cutting-edge tracking solution capable of tracking objects from \textit{never-seen categories} without the need of initial bounding boxes or predefined categories. Within our $\mathtt{Z-GMOT}$ framework, we introduce two novel components: (i) $\mathtt{iGLIP}$, an improved Grounded language-image pretraining, for accurately detecting unseen objects with specific characteristics. (ii) $\mathtt{MA-SORT}$, a novel object association approach that adeptly integrates motion and appearance-based matching strategies to tackle the complex task of tracking objects with high similarity. Our contributions are benchmarked through extensive experiments conducted on the Referring GMOT dataset for the GMOT task. Additionally, to assess the generalizability of the proposed $\mathtt{Z-GMOT}$, we conduct ablation studies on the DanceTrack and MOT20 datasets for the MOT task. Our dataset, code, and models are released at: https://fsoft-aic.github.io/Z-GMOT.
+
+ comment: Accepted to NAACL 2024 +
+
+
+
+
+ + ♻ ☆ Stimulating the Diffusion Model for Image Denoising via Adaptive + Embedding and Ensembling + + +
+ Image denoising is a fundamental problem in computational photography, where achieving high perception with low distortion is highly demanding. Current methods either struggle with perceptual quality or suffer from significant distortion. Recently, the emerging diffusion model has achieved state-of-the-art performance in various tasks and demonstrates great potential for image denoising. However, stimulating diffusion models for image denoising is not straightforward and requires solving several critical problems. For one thing, the input inconsistency hinders the connection between diffusion models and image denoising. For another, the content inconsistency between the generated image and the desired denoised image introduces distortion. To tackle these problems, we present a novel strategy called the Diffusion Model for Image Denoising (DMID) by understanding and rethinking the diffusion model from a denoising perspective. Our DMID strategy includes an adaptive embedding method that embeds the noisy image into a pre-trained unconditional diffusion model and an adaptive ensembling method that reduces distortion in the denoised image. Our DMID strategy achieves state-of-the-art performance on both distortion-based and perception-based metrics, for both Gaussian and real-world image denoising. The code is available at https://github.com/Li-Tong-621/DMID.
+
+ comment: 18 pages,15 figures +
+
+
+
+
+ + ♻ ☆ Evaluating Text-to-Image Synthesis: Survey and Taxonomy of Image Quality + Metrics + + +
+ Recent advances in text-to-image synthesis enabled through a combination of +language and vision foundation models have led to a proliferation of the tools +available and an increased attention to the field. When conducting +text-to-image synthesis, a central goal is to ensure that the content between +text and image is aligned. As such, there exist numerous evaluation metrics +that aim to mimic human judgement. However, it is often unclear which metric to +use for evaluating text-to-image synthesis systems as their evaluation is +highly nuanced. In this work, we provide a comprehensive overview of existing +text-to-image evaluation metrics. Based on our findings, we propose a new +taxonomy for categorizing these metrics. Our taxonomy is grounded in the +assumption that there are two main quality criteria, namely compositionality +and generality, which ideally map to human preferences. Ultimately, we derive +guidelines for practitioners conducting text-to-image evaluation, discuss open +challenges of evaluation mechanisms, and surface limitations of current +metrics. + +
+
+ comment: preprint, 20 pages, 2 figures, 1 table +
+
+
+
+
+ + ♻ ☆ PEAN: A Diffusion-Based Prior-Enhanced Attention Network for Scene Text + Image Super-Resolution + + +
+ Scene text image super-resolution (STISR) aims at simultaneously increasing the resolution and readability of low-resolution scene text images, thus boosting the performance of the downstream recognition task. Two factors in scene text images, visual structure and semantic information, affect the recognition performance significantly. To mitigate the effects from these factors, this paper proposes a Prior-Enhanced Attention Network (PEAN). Specifically, an attention-based modulation module is leveraged to understand scene text images by neatly perceiving the local and global dependence of images, regardless of the shape of the text. Meanwhile, a diffusion-based module is developed to enhance the text prior, hence offering better guidance for the SR network to generate SR images with higher semantic accuracy. Additionally, a multi-task learning paradigm is employed to optimize the network, enabling the model to generate legible SR images. As a result, PEAN establishes new SOTA results on the TextZoom benchmark. Experiments are also conducted to analyze the importance of the enhanced text prior as a means of improving the performance of the SR network. Code will be made available at https://github.com/jdfxzzy/PEAN.
+
+
+
+
+ + ♻ ☆ Do More With What You Have: Transferring Depth-Scale from Labeled to + Unlabeled Domains + + +
+ Transferring the absolute depth prediction capabilities of an estimator to a new domain is a task with significant real-world applications. This task is specifically challenging when images from the new domain are collected without ground-truth depth measurements, and possibly with sensors of different intrinsics. To overcome such limitations, a recent zero-shot solution was trained on an extensive training dataset and encoded the various camera intrinsics. Other solutions generated synthetic data with depth labels that matched the intrinsics of the new target data to enable depth-scale transfer between the domains.
+ In this work we present an alternative solution that can utilize any existing synthetic or real dataset that has a small number of images annotated with ground-truth depth labels. Specifically, we show that self-supervised depth estimators result in up-to-scale predictions that are linearly correlated to their absolute depth values across the domain, a property that we model in this work using a single scalar. In addition, aligning the field-of-view of two datasets prior to training results in a common linear relationship for both domains. We use this observed property to transfer the depth-scale from source datasets that have absolute depth labels to new target datasets that lack these measurements, enabling absolute depth predictions in the target domain.
+ The suggested method was successfully demonstrated on the KITTI, DDAD and nuScenes datasets, while using other existing real or synthetic source datasets that have a different field-of-view, other image style or structural content, achieving comparable or better accuracy than other existing methods that do not use target ground-truth depths.
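+ The single scalar mentioned above can be obtained, for example, with a least-squares fit on the few annotated images and then reused on the target domain; the sketch below is illustrative and omits the field-of-view alignment step.
```python
import numpy as np

def fit_depth_scale(pred_up_to_scale: np.ndarray, gt_metric: np.ndarray) -> float:
    """Least-squares fit of the single scalar s minimizing ||s * pred - gt||^2 over the
    pixels/images that carry valid ground-truth depth."""
    mask = gt_metric > 0
    p, g = pred_up_to_scale[mask], gt_metric[mask]
    return float((p * g).sum() / (p * p).sum())

pred = np.array([1.0, 2.0, 4.0])    # up-to-scale predictions on the labeled source images
gt = np.array([2.1, 3.9, 8.2])      # metric depths (metres)
s = fit_depth_scale(pred, gt)
print(s, s * pred)                  # the same s is then reused on the unlabeled target domain
```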
+
+
+
+
+ + ♻ ☆ Disentangled Explanations of Neural Network Predictions by Finding + Relevant Subspaces + + +
+ Explainable AI aims to overcome the black-box nature of complex ML models like neural networks by generating explanations for their predictions. Explanations often take the form of a heatmap identifying input features (e.g. pixels) that are relevant to the model's decision. These explanations, however, entangle the potentially multiple factors that enter into the overall complex decision strategy. We propose to disentangle explanations by extracting, at some intermediate layer of a neural network, subspaces that capture the multiple and distinct activation patterns (e.g. visual concepts) that are relevant to the prediction. To automatically extract these subspaces, we propose two new analyses, extending principles found in PCA or ICA to explanations. These novel analyses, which we call principal relevant component analysis (PRCA) and disentangled relevant subspace analysis (DRSA), maximize relevance instead of e.g. variance or kurtosis. This allows for a much stronger focus of the analysis on what the ML model actually uses for predicting, ignoring activations or concepts to which the model is invariant. Our approach is general enough to work alongside common attribution techniques such as Shapley Value, Integrated Gradients, or LRP. Our proposed methods prove to be practically useful and compare favorably to the state of the art, as demonstrated on benchmarks and three use cases.
+
+ comment: 17 pages + supplement +
+
+
+
+
+ + ♻ ☆ CF-Font: Content Fusion for Few-shot Font Generation CVPR 2023 + + +
+ Content and style disentanglement is an effective way to achieve few-shot font generation. It allows transferring the style of the font image in a source domain to the style defined with a few reference images in a target domain. However, the content feature extracted using a representative font might not be optimal. In light of this, we propose a content fusion module (CFM) to project the content feature into a linear space defined by the content features of basis fonts, which can take the variation of content features caused by different fonts into consideration. Our method also allows optimizing the style representation vector of reference images through a lightweight iterative style-vector refinement (ISR) strategy. Moreover, we treat the 1D projection of a character image as a probability distribution and leverage the distance between two distributions as the reconstruction loss (namely projected character loss, PCL). Compared to L2 or L1 reconstruction loss, the distribution distance pays more attention to the global shape of characters. We have evaluated our method on a dataset of 300 fonts with 6.5k characters each. Experimental results verify that our method outperforms existing state-of-the-art few-shot font generation methods by a large margin. The source code can be found at https://github.com/wangchi95/CF-Font.
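+ A sketch of a projected character loss in the spirit described above; the 1D Wasserstein-style distance via CDF differences is an assumed choice and the exact distributional distance used by CF-Font may differ.
```python
import torch

def projected_character_loss(img_a: torch.Tensor, img_b: torch.Tensor) -> torch.Tensor:
    """Sketch of a projected character loss: normalize the 1D projections (row/column sums)
    of two glyph images into probability distributions and compare them with a 1D
    Wasserstein-style distance computed from CDF differences."""
    loss = img_a.new_zeros(())
    for dim in (-1, -2):                                 # project onto rows, then columns
        pa = img_a.sum(dim=dim)
        pb = img_b.sum(dim=dim)
        pa = pa / (pa.sum(dim=-1, keepdim=True) + 1e-8)  # treat projections as distributions
        pb = pb / (pb.sum(dim=-1, keepdim=True) + 1e-8)
        cdf_a, cdf_b = pa.cumsum(-1), pb.cumsum(-1)
        loss = loss + (cdf_a - cdf_b).abs().mean()       # 1D Wasserstein-1 between projections
    return loss

a, b = torch.rand(2, 1, 64, 64), torch.rand(2, 1, 64, 64)   # batch of glyph images in [0, 1]
print(projected_character_loss(a, b))
```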
+
+ comment: Accepted by CVPR 2023 +
+
+
+
+
+ + ♻ ☆ ASH: Animatable Gaussian Splats for Efficient and Photoreal Human + Rendering + + +
+ Real-time rendering of photorealistic and controllable human avatars stands +as a cornerstone in Computer Vision and Graphics. While recent advances in +neural implicit rendering have unlocked unprecedented photorealism for digital +avatars, real-time performance has mostly been demonstrated for static scenes +only. To address this, we propose ASH, an animatable Gaussian splatting +approach for photorealistic rendering of dynamic humans in real-time. We +parameterize the clothed human as animatable 3D Gaussians, which can be +efficiently splatted into image space to generate the final rendering. However, +naively learning the Gaussian parameters in 3D space poses a severe challenge +in terms of compute. Instead, we attach the Gaussians onto a deformable +character model, and learn their parameters in 2D texture space, which allows +leveraging efficient 2D convolutional architectures that easily scale with the +required number of Gaussians. We benchmark ASH with competing methods on +pose-controllable avatars, demonstrating that our method outperforms existing +real-time methods by a large margin and shows comparable or even better results +than offline methods. + +
+
+ comment: For project page, see https://vcai.mpi-inf.mpg.de/projects/ash/ +
+
+
+
+
+ + ♻ ☆ Text-Driven Traffic Anomaly Detection with Temporal High-Frequency + Modeling in Driving Videos + + +
+ Traffic anomaly detection (TAD) in driving videos is critical for ensuring +the safety of autonomous driving and advanced driver assistance systems. +Previous single-stage TAD methods primarily rely on frame prediction, making +them vulnerable to interference from dynamic backgrounds induced by the rapid +movement of the dashboard camera. While two-stage TAD methods appear to be a +natural solution to mitigate such interference by pre-extracting +background-independent features (such as bounding boxes and optical flow) using +perceptual algorithms, they are susceptible to the performance of first-stage +perceptual algorithms and may result in error propagation. In this paper, we +introduce TTHF, a novel single-stage method aligning video clips with text +prompts, offering a new perspective on traffic anomaly detection. Unlike +previous approaches, the supervised signal of our method is derived from +languages rather than orthogonal one-hot vectors, providing a more +comprehensive representation. Further, concerning visual representation, we +propose to model the high frequency of driving videos in the temporal domain. +This modeling captures the dynamic changes of driving scenes, enhances the +perception of driving behavior, and significantly improves the detection of +traffic anomalies. In addition, to better perceive various types of traffic +anomalies, we carefully design an attentive anomaly focusing mechanism that +visually and linguistically guides the model to adaptively focus on the visual +context of interest, thereby facilitating the detection of traffic anomalies. +It is shown that our proposed TTHF achieves promising performance, +outperforming state-of-the-art competitors by +5.4% AUC on the DoTA dataset and +achieving high generalization on the DADA dataset. + +
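+ To illustrate what modelling the temporal high frequency of a driving clip can mean in the simplest case (this is only a toy temporal high-pass filter, not the TTHF module), one can subtract a temporal moving average from each frame and keep the residual:
+
+ import numpy as np
+
+ def temporal_high_frequency(clip, window=3):
+     """Extract high-frequency content along the time axis of a video clip.
+
+     clip: (T, H, W) grayscale frames. The residual after removing a temporal
+     moving average keeps fast scene and behaviour changes.
+     """
+     T = clip.shape[0]
+     pad = window // 2
+     padded = np.pad(clip, ((pad, pad), (0, 0), (0, 0)), mode="edge")
+     smoothed = np.stack([padded[t:t + window].mean(axis=0) for t in range(T)])
+     return clip - smoothed
+
+ # An abrupt change mid-clip produces a much larger high-frequency response.
+ static = np.ones((8, 16, 16)) * 0.5
+ moving = static.copy(); moving[4:] += 0.4
+ print(np.abs(temporal_high_frequency(static)).max() < np.abs(temporal_high_frequency(moving)).max())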
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Few Shot Part Segmentation Reveals Compositional Logic for Industrial + Anomaly Detection AAAI2024 + + +
+ Logical anomalies (LA) refer to data violating underlying logical constraints, e.g., the quantity, arrangement, or composition of components within an image. Accurately detecting such anomalies requires models to reason about various component types through segmentation. However, curation of pixel-level annotations for semantic segmentation is both time-consuming and expensive. Although there are some prior few-shot or unsupervised co-part segmentation algorithms, they often fail on images of industrial objects, whose components have similar textures and shapes and are therefore difficult to differentiate precisely. In this study, we introduce a novel component segmentation model for LA detection that leverages a few labeled samples and unlabeled images sharing logical constraints. To ensure consistent segmentation across unlabeled images, we employ a histogram matching loss in conjunction with an entropy loss. As segmentation predictions play a crucial role, we propose to enhance both local and global sample validity detection by capturing key aspects from visual semantics via three memory banks: class histograms, component composition embeddings and patch-level representations. For effective LA detection, we propose an adaptive scaling strategy to standardize anomaly scores from different memory banks at inference. Extensive experiments on the public benchmark MVTec LOCO AD reveal that our method achieves 98.1% AUROC in LA detection vs. 89.6% for competing methods. + +
+
+ comment: Accepted in AAAI2024 +
+
+
+
+
+ + ♻ ☆ Adapting Short-Term Transformers for Action Detection in Untrimmed + Videos CVPR2024 + + +
+ Vision Transformer (ViT) has shown high potential in video recognition, owing to its flexible design, adaptable self-attention mechanisms, and the efficacy of masked pre-training. Yet, it remains unclear how to adapt these pre-trained short-term ViTs for temporal action detection (TAD) in untrimmed videos. Existing works treat them as off-the-shelf feature extractors for each short-trimmed snippet without capturing the fine-grained relation among different snippets in a broader temporal context. To mitigate this issue, this paper focuses on designing a new mechanism for adapting these pre-trained ViT models as a unified long-form video transformer to fully unleash their modeling power in capturing inter-snippet relations, while still keeping low computation overhead and memory consumption for efficient TAD. To this end, we design effective cross-snippet propagation modules to gradually exchange short-term video information among different snippets at two levels. For inner-backbone information propagation, we introduce a cross-snippet propagation strategy to enable multi-snippet temporal feature interaction inside the backbone. For post-backbone information propagation, we propose temporal transformer layers for further clip-level modeling. With the plain ViT-B pre-trained with VideoMAE, our end-to-end temporal action detector (ViT-TAD) yields very competitive performance compared to previous temporal action detectors, reaching up to 69.5 average mAP on THUMOS14, 37.40 average mAP on ActivityNet-1.3 and 17.20 average mAP on FineAction. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ On the Road to Portability: Compressing End-to-End Motion Planner for + Autonomous Driving CVPR 2024 + + +
+ End-to-end motion planning models equipped with deep neural networks have shown great potential for enabling full autonomous driving. However, the oversized neural networks render them impractical for deployment on resource-constrained systems, as they unavoidably require more computational time and resources during inference. To handle this, knowledge distillation offers a promising approach that compresses models by enabling a smaller student model to learn from a larger teacher model. Nevertheless, how to apply knowledge distillation to compress motion planners has not been explored so far. In this paper, we propose PlanKD, the first knowledge distillation framework tailored for compressing end-to-end motion planners. First, considering that driving scenes are inherently complex, often containing planning-irrelevant or even noisy information, transferring such information is not beneficial for the student planner. Thus, we design an information bottleneck based strategy to only distill planning-relevant information, rather than transfer all information indiscriminately. Second, different waypoints in an output planned trajectory may hold varying degrees of importance for motion planning, where a slight deviation in certain crucial waypoints might lead to a collision. Therefore, we devise a safety-aware waypoint-attentive distillation module that assigns adaptive weights to different waypoints based on their importance, to encourage the student to accurately mimic more crucial waypoints, thereby improving overall safety. Experiments demonstrate that our PlanKD can boost the performance of smaller planners by a large margin, and significantly reduce their inference time. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Maintaining User Trust Through Multistage Uncertainty Aware Inference + + +
+ This paper describes and evaluates a multistage approach to AI deployment. +Each stage involves a more accurate method of inference, yet engaging each +comes with an increasing cost. In outlining the architecture, we present a +method for quantifying model uncertainty that facilitates confident deferral +decisions. The architecture is currently under active deployment to thousands +of cotton farmers across India. The broader idea however is applicable to a +growing sector of AI deployments in challenging low resources settings. + +
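+ The deferral idea can be sketched as a cascade in which a cheap stage answers only when its predictive entropy is below a threshold, and otherwise hands the input to the next, more expensive stage. The threshold values, entropy measure and stage interfaces below are illustrative assumptions, not the deployed system.
+
+ import numpy as np
+
+ def entropy(probs, eps=1e-12):
+     return -np.sum(probs * np.log(probs + eps))
+
+ def cascaded_predict(x, stages, thresholds):
+     """Run increasingly expensive stages until one is confident enough.
+
+     stages:     list of callables x -> class-probability vector, cheapest first.
+     thresholds: entropy thresholds per non-final stage; the final stage always answers.
+     """
+     for stage, tau in zip(stages[:-1], thresholds):
+         probs = stage(x)
+         if entropy(probs) < tau:          # confident -> stop early, save cost
+             return probs, stage.__name__
+     return stages[-1](x), stages[-1].__name__
+
+ # Toy stages: a cheap, uncertain model and an expensive, confident one.
+ def cheap_model(x):     return np.array([0.55, 0.45])
+ def expensive_model(x): return np.array([0.95, 0.05])
+
+ probs, used = cascaded_predict(None, [cheap_model, expensive_model], thresholds=[0.3])
+ print(used)   # expensive_model: the cheap stage's entropy (~0.69) exceeds 0.3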
+
+
+
+
+ + ♻ ☆ Robust image segmentation model based on binary level set SC + + +
+ In order to improve the robustness of traditional image segmentation models +to noise, this paper models the illumination term in intensity inhomogeneity +images. Additionally, to enhance the model's robustness to noisy images, we +incorporate the binary level set model into the proposed model. Compared to the +traditional level set, the binary level set eliminates the need for continuous +reinitialization. Moreover, by introducing the variational operator GL, our +model demonstrates better capability in segmenting noisy images. Finally, we +employ the three-step splitting operator method for solving, and the +effectiveness of the proposed model is demonstrated on various images. + +
+
+ comment: SCI +
+
+
+
+
+ + ♻ ☆ LiDAR-Guided Cross-Attention Fusion for Hyperspectral Band Selection and + Image Classification + + +
+ The fusion of hyperspectral and LiDAR data has been an active research topic. +Existing fusion methods have ignored the high-dimensionality and redundancy +challenges in hyperspectral images, despite that band selection methods have +been intensively studied for hyperspectral image (HSI) processing. This paper +addresses this significant gap by introducing a cross-attention mechanism from +the transformer architecture for the selection of HSI bands guided by LiDAR +data. LiDAR provides high-resolution vertical structural information, which can +be useful in distinguishing different types of land cover that may have similar +spectral signatures but different structural profiles. In our approach, the +LiDAR data are used as the "query" to search and identify the "key" from the +HSI to choose the most pertinent bands for LiDAR. This method ensures that the +selected HSI bands drastically reduce redundancy and computational requirements +while working optimally with the LiDAR data. Extensive experiments have been +undertaken on three paired HSI and LiDAR data sets: Houston 2013, Trento and +MUUFL. The results highlight the superiority of the cross-attention mechanism, +underlining the enhanced classification accuracy of the identified HSI bands +when fused with the LiDAR features. The results also show that the use of fewer +bands combined with LiDAR surpasses the performance of state-of-the-art fusion +models. + +
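+ A deliberately simplified sketch of the cross-attention scoring described above (the feature extractors, dimensions and top-k selection rule are assumptions for illustration): LiDAR-derived tokens act as queries, one key per hyperspectral band, and the averaged attention mass scores each band.
+
+ import numpy as np
+
+ def lidar_guided_band_scores(lidar_feats, band_feats):
+     """Score hyperspectral bands by cross-attention with LiDAR as the query.
+
+     lidar_feats: (Nq, D) query tokens derived from LiDAR.
+     band_feats:  (B, D)  one key token per hyperspectral band.
+     Returns a (B,) relevance score per band (attention mass averaged over queries).
+     """
+     d = lidar_feats.shape[1]
+     logits = lidar_feats @ band_feats.T / np.sqrt(d)           # (Nq, B)
+     attn = np.exp(logits - logits.max(axis=1, keepdims=True))
+     attn /= attn.sum(axis=1, keepdims=True)                     # softmax over bands
+     return attn.mean(axis=0)
+
+ def select_bands(scores, k):
+     return np.argsort(scores)[::-1][:k]                         # indices of top-k bands
+
+ rng = np.random.default_rng(3)
+ lidar = rng.normal(size=(10, 16))
+ bands = rng.normal(size=(100, 16))
+ bands[7] = 2.0 * lidar[0]          # make band 7 strongly aligned with one LiDAR query
+ print(select_bands(lidar_guided_band_scores(lidar, bands), k=5))   # band 7 ranks first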
+
+ comment: 15 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ PhysGaussian: Physics-Integrated 3D Gaussians for Generative Dynamics CVPR 2024 + + +
+ We introduce PhysGaussian, a new method that seamlessly integrates physically +grounded Newtonian dynamics within 3D Gaussians to achieve high-quality novel +motion synthesis. Employing a custom Material Point Method (MPM), our approach +enriches 3D Gaussian kernels with physically meaningful kinematic deformation +and mechanical stress attributes, all evolved in line with continuum mechanics +principles. A defining characteristic of our method is the seamless integration +between physical simulation and visual rendering: both components utilize the +same 3D Gaussian kernels as their discrete representations. This negates the +necessity for triangle/tetrahedron meshing, marching cubes, "cage meshes," or +any other geometry embedding, highlighting the principle of "what you see is +what you simulate (WS$^2$)." Our method demonstrates exceptional versatility +across a wide variety of materials--including elastic entities, metals, +non-Newtonian fluids, and granular materials--showcasing its strong +capabilities in creating diverse visual content with novel viewpoints and +movements. Our project page is at: https://xpandora.github.io/PhysGaussian/ + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Towards Eliminating Hard Label Constraints in Gradient Inversion Attacks ICLR2024 + + +
+ Gradient inversion attacks aim to reconstruct local training data from +intermediate gradients exposed in the federated learning framework. Despite +successful attacks, all previous methods, starting from reconstructing a single +data point and then relaxing the single-image limit to batch level, are only +tested under hard label constraints. Even for single-image reconstruction, we +still lack an analysis-based algorithm to recover augmented soft labels. In +this work, we change the focus from enlarging batchsize to investigating the +hard label constraints, considering a more realistic circumstance where label +smoothing and mixup techniques are used in the training process. In particular, +we are the first to initiate a novel algorithm to simultaneously recover the +ground-truth augmented label and the input feature of the last fully-connected +layer from single-input gradients, and provide a necessary condition for any +analytical-based label recovery methods. Extensive experiments testify to the +label recovery accuracy, as well as the benefits to the following image +reconstruction. We believe soft labels in classification tasks are worth +further attention in gradient inversion attacks. + +
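+ The single-input recovery the paper builds on can be sketched with the standard closed-form relation for a softmax/cross-entropy head (a minimal sketch of this known identity, not the paper's full algorithm or its necessary condition):
+
+ import numpy as np
+
+ def softmax(z):
+     e = np.exp(z - z.max())
+     return e / e.sum()
+
+ def recover_feature_and_label(grad_W, grad_b, W, b):
+     """Closed-form recovery from single-input gradients of the last FC layer.
+
+     For cross-entropy on logits z = W @ x + b with a (possibly soft) target y:
+         dL/db = p - y            where p = softmax(z)
+         dL/dW = (p - y) x^T
+     so x_j = grad_W[i, j] / grad_b[i] for any class i with grad_b[i] != 0,
+     and y = p - grad_b once x (hence p) is known.
+     """
+     i = int(np.argmax(np.abs(grad_b)))            # pick a numerically safe row
+     x = grad_W[i] / grad_b[i]
+     y = softmax(W @ x + b) - grad_b
+     return x, y
+
+ # Verify on synthetic data with a label-smoothed (soft) target.
+ rng = np.random.default_rng(4)
+ C, D = 5, 8
+ W, b = rng.normal(size=(C, D)), rng.normal(size=C)
+ x_true = rng.normal(size=D)
+ y_true = np.full(C, 0.1 / (C - 1)); y_true[2] = 0.9           # smoothed one-hot
+ p = softmax(W @ x_true + b)
+ grad_b = p - y_true
+ grad_W = np.outer(grad_b, x_true)
+ x_rec, y_rec = recover_feature_and_label(grad_W, grad_b, W, b)
+ print(np.allclose(x_rec, x_true), np.allclose(y_rec, y_true))   # True True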
+
+ comment: ICLR2024 poster +
+
+
+
+
+ + ♻ ☆ Exploring Sparse Visual Prompt for Domain Adaptive Dense Prediction AAAI 2024 + + +
+ Visual prompts provide an efficient means of addressing visual cross-domain problems. In previous works, Visual Domain Prompt (VDP) first introduces domain prompts to tackle the classification Test-Time Adaptation (TTA) problem by warping image-level prompts on the input and fine-tuning prompts for each target domain. However, since the image-level prompts mask out continuous spatial details in the prompt-allocated region, this leads to inaccurate contextual information and limited domain knowledge extraction, particularly when dealing with dense prediction TTA problems. To overcome these challenges, we propose a novel Sparse Visual Domain Prompts (SVDP) approach, which holds minimal trainable parameters (e.g., 0.1\%) in the image-level prompt and preserves more spatial information of the input. To better apply SVDP in extracting domain-specific knowledge, we introduce the Domain Prompt Placement (DPP) method to adaptively allocate the trainable parameters of SVDP on the pixels with large distribution shifts. Furthermore, recognizing that each target domain sample exhibits a unique domain shift, we design a Domain Prompt Updating (DPU) strategy to optimize prompt parameters differently for each sample, facilitating efficient adaptation to the target domain. Extensive experiments were conducted on widely-used TTA and continual TTA benchmarks, and our proposed method achieves state-of-the-art performance in both semantic segmentation and depth estimation tasks. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A + Conceptual Architecture + + +
+ This work proposes a WebXR-based cross-platform conceptual architecture, +leveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate +the development of an open, accessible, and interoperable metaverse. By +introducing the concept of spatial web app, this research contributes to the +discourse on the metaverse, offering an architecture that democratizes access +to virtual environments and extended reality through the web, and aligns with +Tim Berners-Lee's original vision of the World Wide Web as an open platform in +the digital realm. + +
+
+ comment: updated section II-C ("A-Frame"), updated references +
+
+
+
+
+ + ♻ ☆ Comment-aided Video-Language Alignment via Contrastive Pre-training for + Short-form Video Humor Detection ICMR 2024 + + +
+ The growing importance of multi-modal humor detection within affective +computing correlates with the expanding influence of short-form video sharing +on social media platforms. In this paper, we propose a novel two-branch +hierarchical model for short-form video humor detection (SVHD), named +Comment-aided Video-Language Alignment (CVLA) via data-augmented multi-modal +contrastive pre-training. Notably, our CVLA not only operates on raw signals +across various modal channels but also yields an appropriate multi-modal +representation by aligning the video and language components within a +consistent semantic space. The experimental results on two humor detection +datasets, including DY11k and UR-FUNNY, demonstrate that CVLA dramatically +outperforms state-of-the-art and several competitive baseline approaches. Our +dataset, code and model release at https://github.com/yliu-cs/CVLA. + +
+
+ comment: Accepted by ICMR 2024 +
+
+
+
+
+ + ♻ ☆ LadleNet: A Two-Stage UNet for Infrared Image to Visible Image + Translation Guided by Semantic Segmentation + + +
+ The translation of thermal infrared (TIR) images into visible light (VI) +images plays a critical role in enhancing model performance and generalization +capability, particularly in various fields such as registration and fusion of +TIR and VI images. However, current research in this field faces challenges of +insufficiently realistic image quality after translation and the difficulty of +existing models in adapting to unseen scenarios. In order to develop a more +generalizable image translation architecture, we conducted an analysis of +existing translation architectures. By exploring the interpretability of +intermediate modalities in existing translation architectures, we found that +the intermediate modality in the image translation process for street scene +images essentially performs semantic segmentation, distinguishing street images +based on background and foreground patterns before assigning color information. +Based on these principles, we propose an improved algorithm based on U-net +called LadleNet. This network utilizes a two-stage U-net concatenation +structure, consisting of Handle and Bowl modules. The Handle module is +responsible for constructing an abstract semantic space, while the Bowl module +decodes the semantic space to obtain the mapped VI image. Due to the +characteristic of semantic segmentation, the Handle module has strong +extensibility. Therefore, we also propose LadleNet+, which replaces the Handle +module in LadleNet with a pre-trained DeepLabv3+ network, enabling the model to +have a more powerful capability in constructing semantic space. The proposed +methods were trained and tested on the KAIST dataset, followed by quantitative +and qualitative analysis. Compared to existing methods, LadleNet and LadleNet+ +achieved an average improvement of 12.4% and 15.2% in SSIM metrics, and 37.9% +and 50.6% in MS-SSIM metrics, respectively. + +
+
+
+
+
+ + ♻ ☆ A Survey on Open-Vocabulary Detection and Segmentation: Past, Present, + and Future + + +
+ As the most fundamental scene understanding tasks, object detection and +segmentation have made tremendous progress in deep learning era. Due to the +expensive manual labeling cost, the annotated categories in existing datasets +are often small-scale and pre-defined, i.e., state-of-the-art fully-supervised +detectors and segmentors fail to generalize beyond the closed vocabulary. To +resolve this limitation, in the last few years, the community has witnessed an +increasing attention toward Open-Vocabulary Detection (OVD) and Segmentation +(OVS). By ``open-vocabulary'', we mean that the models can classify objects +beyond pre-defined categories. In this survey, we provide a comprehensive +review on recent developments of OVD and OVS. A taxonomy is first developed to +organize different tasks and methodologies. We find that the permission and +usage of weak supervision signals can well discriminate different +methodologies, including: visual-semantic space mapping, novel visual feature +synthesis, region-aware training, pseudo-labeling, knowledge distillation, and +transfer learning. The proposed taxonomy is universal across different tasks, +covering object detection, semantic/instance/panoptic segmentation, 3D and +video understanding. The main design principles, key challenges, development +routes, methodology strengths, and weaknesses are thoroughly analyzed. In +addition, we benchmark each task along with the vital components of each method +in appendix and updated online at +https://github.com/seanzhuh/awesome-open-vocabulary-detection-and-segmentation. +Finally, several promising directions are provided and discussed to stimulate +future research. + +
+
+
+
+
+ + ♻ ☆ Fine-Grained Side Information Guided Dual-Prompts for Zero-Shot Skeleton + Action Recognition + + +
+ Skeleton-based zero-shot action recognition aims to recognize unknown human +actions based on the learned priors of the known skeleton-based actions and a +semantic descriptor space shared by both known and unknown categories. However, +previous works focus on establishing the bridges between the known skeleton +representation space and semantic descriptions space at the coarse-grained +level for recognizing unknown action categories, ignoring the fine-grained +alignment of these two spaces, resulting in suboptimal performance in +distinguishing high-similarity action categories. To address these challenges, +we propose a novel method via Side information and dual-prompts learning for +skeleton-based zero-shot action recognition (STAR) at the fine-grained level. +Specifically, 1) we decompose the skeleton into several parts based on its +topology structure and introduce the side information concerning multi-part +descriptions of human body movements for alignment between the skeleton and the +semantic space at the fine-grained level; 2) we design the visual-attribute and +semantic-part prompts to improve the intra-class compactness within the +skeleton space and inter-class separability within the semantic space, +respectively, to distinguish the high-similarity actions. Extensive experiments +show that our method achieves state-of-the-art performance in ZSL and GZSL +settings on NTU RGB+D, NTU RGB+D 120, and PKU-MMD datasets. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ OccGaussian: 3D Gaussian Splatting for Occluded Human Rendering + + +
+ Rendering dynamic 3D humans from monocular videos is crucial for various applications such as virtual reality and digital entertainment. Most methods assume the person is in an unobstructed scene, whereas in real-life scenarios various objects may occlude body parts. A previous method utilizes NeRF for surface rendering to recover the occluded areas, but it requires more than one day to train and several seconds to render, failing to meet the requirements of real-time interactive applications. To address these issues, we propose OccGaussian, based on 3D Gaussian Splatting, which can be trained within 6 minutes and produces high-quality human renderings up to 160 FPS with occluded input. OccGaussian initializes 3D Gaussian distributions in the canonical space, and we perform occlusion feature queries at occluded regions; the aggregated pixel-aligned feature is extracted to compensate for the missing information. We then use a Gaussian Feature MLP to further process the feature, along with occlusion-aware loss functions, to better perceive the occluded area. Extensive experiments on both simulated and real-world occlusions demonstrate that our method achieves comparable or even superior performance compared to the state-of-the-art method, while improving training and inference speeds by 250x and 800x, respectively. Our code will be available for research purposes. + +
+
+
+
+
+ + ♻ ☆ ParamISP: Learned Forward and Inverse ISPs using Camera Parameters + + +
+ RAW images are rarely shared, mainly due to their excessive data size compared to their sRGB counterparts obtained by camera ISPs. Learning the forward and inverse processes of camera ISPs has been recently demonstrated, enabling physically-meaningful RAW-level image processing on input sRGB images. However, existing learning-based ISP methods fail to handle the large variations in the ISP processes with respect to camera parameters such as ISO and exposure time, and have limitations when used for various applications. In this paper, we propose ParamISP, a learning-based method for forward and inverse conversion between sRGB and RAW images that adopts a novel neural-network module, dubbed ParamNet, to utilize camera parameters. Given the camera parameters provided in the EXIF data, ParamNet converts them into a feature vector to control the ISP networks. Extensive experiments demonstrate that ParamISP achieves superior RAW and sRGB reconstruction results compared to previous methods and can be effectively used for a variety of applications such as deblurring dataset synthesis, raw deblurring, HDR reconstruction, and camera-to-camera transfer. + +
+
+
+
+
+ + ♻ ☆ Investigating Low Data, Confidence Aware Image Prediction on Smooth + Repetitive Videos using Gaussian Processes + + +
+ The ability to predict future states is crucial to informed decision-making while interacting with dynamic environments. With cameras providing a prevalent and information-rich sensing modality, the problem of predicting future states from image sequences has garnered a lot of attention. Current state-of-the-art methods typically train large parametric models for their predictions. Though often accurate, these models frequently fail to provide interpretable confidence metrics around their predictions. Additionally, these methods are reliant on the availability of large training datasets to converge to useful solutions. In this paper, we focus on the problem of predicting future images of an image sequence with interpretable confidence bounds from very little training data. To approach this problem, we use non-parametric models to take a probabilistic approach to image prediction. We generate probability distributions over sequentially predicted images, and propagate uncertainty through time to generate a confidence metric for our predictions. Gaussian Processes are used for their data efficiency and ability to readily incorporate new training data online. Our method's predictions are evaluated on a smooth fluid simulation environment. We showcase the capabilities of our approach on real-world data by predicting pedestrian flows and weather patterns from satellite imagery. + +
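+ A toy version of per-pixel Gaussian-process prediction with interpretable uncertainty might look as follows (scikit-learn is used here for brevity; the kernel choice and the independent per-pixel modelling are simplifying assumptions, not the authors' exact setup):
+
+ import numpy as np
+ from sklearn.gaussian_process import GaussianProcessRegressor
+ from sklearn.gaussian_process.kernels import RBF, WhiteKernel
+
+ def predict_next_frame(frames, t_next):
+     """Predict a future frame and a per-pixel confidence from few observations.
+
+     frames: (T, H, W) array of observed frames of a smooth, repetitive scene.
+     Each pixel is modelled independently as a GP over time; the predictive
+     standard deviation gives an interpretable confidence.
+     """
+     T, H, W = frames.shape
+     t = np.arange(T, dtype=float).reshape(-1, 1)
+     mean = np.zeros((H, W)); std = np.zeros((H, W))
+     kernel = RBF(length_scale=3.0) + WhiteKernel(noise_level=1e-3)
+     for i in range(H):
+         for j in range(W):
+             gp = GaussianProcessRegressor(kernel=kernel, normalize_y=True)
+             gp.fit(t, frames[:, i, j])
+             m, s = gp.predict(np.array([[float(t_next)]]), return_std=True)
+             mean[i, j], std[i, j] = m[0], s[0]
+     return mean, std
+
+ # A tiny oscillating "fluid" pattern: 12 observed frames, predict frame 12.
+ tt = np.arange(12)
+ frames = np.stack([0.5 + 0.5 * np.sin(0.5 * k + np.arange(16).reshape(4, 4)) for k in tt])
+ mean, std = predict_next_frame(frames, t_next=12)
+ print(mean.shape, float(std.mean()) < 0.5)   # (4, 4) True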
+
+
+
+
+ + ♻ ☆ Transformer based Pluralistic Image Completion with Reduced Information + Loss + + +
+ Transformer based methods have achieved great success in image inpainting +recently. However, we find that these solutions regard each pixel as a token, +thus suffering from an information loss issue from two aspects: 1) They +downsample the input image into much lower resolutions for efficiency +consideration. 2) They quantize $256^3$ RGB values to a small number (such as +512) of quantized color values. The indices of quantized pixels are used as +tokens for the inputs and prediction targets of the transformer. To mitigate +these issues, we propose a new transformer based framework called "PUT". +Specifically, to avoid input downsampling while maintaining computation +efficiency, we design a patch-based auto-encoder P-VQVAE. The encoder converts +the masked image into non-overlapped patch tokens and the decoder recovers the +masked regions from the inpainted tokens while keeping the unmasked regions +unchanged. To eliminate the information loss caused by input quantization, an +Un-quantized Transformer is applied. It directly takes features from the +P-VQVAE encoder as input without any quantization and only regards the +quantized tokens as prediction targets. Furthermore, to make the inpainting +process more controllable, we introduce semantic and structural conditions as +extra guidance. Extensive experiments show that our method greatly outperforms +existing transformer based methods on image fidelity and achieves much higher +diversity and better fidelity than state-of-the-art pluralistic inpainting +methods on complex large-scale datasets (e.g., ImageNet). Codes are available +at https://github.com/liuqk3/PUT. + +
+
+ comment: Accepted by TPAMI (2024). arXiv admin note: text overlap with + arXiv:2205.05076 +
+
+
+
+
+ + ♻ ☆ Are Bias Mitigation Techniques for Deep Learning Effective? WACV 2022 + + +
+ A critical problem in deep learning is that systems learn inappropriate +biases, resulting in their inability to perform well on minority groups. This +has led to the creation of multiple algorithms that endeavor to mitigate bias. +However, it is not clear how effective these methods are. This is because study +protocols differ among papers, systems are tested on datasets that fail to test +many forms of bias, and systems have access to hidden knowledge or are tuned +specifically to the test set. To address this, we introduce an improved +evaluation protocol, sensible metrics, and a new dataset, which enables us to +ask and answer critical questions about bias mitigation algorithms. We evaluate +seven state-of-the-art algorithms using the same network architecture and +hyperparameter selection policy across three benchmark datasets. We introduce a +new dataset called Biased MNIST that enables assessment of robustness to +multiple bias sources. We use Biased MNIST and a visual question answering +(VQA) benchmark to assess robustness to hidden biases. Rather than only tuning +to the test set distribution, we study robustness across different tuning +distributions, which is critical because for many applications the test +distribution may not be known during development. We find that algorithms +exploit hidden biases, are unable to scale to multiple forms of bias, and are +highly sensitive to the choice of tuning set. Based on our findings, we implore +the community to adopt more rigorous assessment of future bias mitigation +methods. All data, code, and results are publicly available at: +https://github.com/erobic/bias-mitigators. + +
+
+ comment: WACV 2022 +
+
+
+
+
+ + ♻ ☆ Direct May Not Be the Best: An Incremental Evolution View of Pose + Generation AAAI2024 + + +
+ Pose diversity is an inherent representative characteristic of 2D images. Due to the 3D-to-2D projection mechanism, there is evident content discrepancy among distinct pose images. This is the main obstacle hindering pose-transformation research. To deal with this challenge, we propose a fine-grained, incremental-evolution-centered pose generation framework, rather than the traditional direct one-to-one mapping. Since the proposed approach bypasses the theoretical difficulty of directly modeling dramatic non-linear variation, the incurred content distortion and blurring can be effectively constrained, while individual pose details, especially clothing texture, are precisely maintained. In order to systematically guide the evolution course, both global and incremental evolution constraints are elaborately designed and merged into the overall framework. A novel triple-path knowledge fusion structure is further designed to take full advantage of all available knowledge for high-quality pose synthesis. In addition, our framework generates a series of valuable byproducts, namely the various intermediate poses. Extensive experiments have been conducted to verify the effectiveness of the proposed approach. Code is available at https://github.com/Xiaofei-CN/Incremental-Evolution-Pose-Generation. + +
+
+ comment: Accepted at AAAI2024 +
+
+
+
+
+ + ♻ ☆ DRCT: Saving Image Super-resolution away from Information Bottleneck + + +
+ In recent years, Vision Transformer-based approaches for low-level vision +tasks have achieved widespread success. Unlike CNN-based models, Transformers +are more adept at capturing long-range dependencies, enabling the +reconstruction of images utilizing non-local information. In the domain of +super-resolution, Swin-transformer-based models have become mainstream due to +their capability of global spatial information modeling and their +shifting-window attention mechanism that facilitates the interchange of +information between different windows. Many researchers have enhanced model +performance by expanding the receptive fields or designing meticulous networks, +yielding commendable results. However, we observed that it is a general +phenomenon for the feature map intensity to be abruptly suppressed to small +values towards the network's end. This implies an information bottleneck and a +diminishment of spatial information, implicitly limiting the model's potential. +To address this, we propose the Dense-residual-connected Transformer (DRCT), +aimed at mitigating the loss of spatial information and stabilizing the +information flow through dense-residual connections between layers, thereby +unleashing the model's potential and saving the model away from information +bottleneck. Experiment results indicate that our approach surpasses +state-of-the-art methods on benchmark datasets and performs commendably at the +NTIRE-2024 Image Super-Resolution (x4) Challenge. Our source code is available +at https://github.com/ming053l/DRCT + +
+
+ comment: Camera-ready version, NTIRE 2024 Image Super-resolution (x4) +
+
+
+
+
+ + ♻ ☆ CADS: Unleashing the Diversity of Diffusion Models through + Condition-Annealed Sampling ICLR 2024 + + +
+ While conditional diffusion models are known to have good coverage of the +data distribution, they still face limitations in output diversity, +particularly when sampled with a high classifier-free guidance scale for +optimal image quality or when trained on small datasets. We attribute this +problem to the role of the conditioning signal in inference and offer an +improved sampling strategy for diffusion models that can increase generation +diversity, especially at high guidance scales, with minimal loss of sample +quality. Our sampling strategy anneals the conditioning signal by adding +scheduled, monotonically decreasing Gaussian noise to the conditioning vector +during inference to balance diversity and condition alignment. Our +Condition-Annealed Diffusion Sampler (CADS) can be used with any pretrained +model and sampling algorithm, and we show that it boosts the diversity of +diffusion models in various conditional generation tasks. Further, using an +existing pretrained diffusion model, CADS achieves a new state-of-the-art FID +of 1.70 and 2.31 for class-conditional ImageNet generation at 256$\times$256 +and 512$\times$512 respectively. + +
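+ The condition-annealing idea can be sketched as follows: early in the reverse diffusion loop the conditioning vector is mixed with Gaussian noise, and the noise is scheduled away as sampling progresses. The piecewise-linear schedule and parameter names below are simplifications; CADS additionally rescales the noised condition.
+
+ import numpy as np
+
+ def annealed_condition(cond, t, T, tau1=0.6, tau2=0.9, noise_scale=0.25, rng=None):
+     """Corrupt the conditioning vector early in sampling, keep it clean later.
+
+     t runs from T (start of reverse diffusion, most noise on the condition)
+     down to 0 (end, clean condition). gamma in [0, 1] follows a simple
+     piecewise-linear schedule; cond is mixed with Gaussian noise accordingly.
+     """
+     if rng is None:
+         rng = np.random.default_rng()
+     u = 1.0 - t / T                                            # sampling progress in [0, 1]
+     gamma = np.clip((u - tau1) / (tau2 - tau1), 0.0, 1.0)      # 0 early, 1 late
+     noise = rng.normal(size=cond.shape)
+     return np.sqrt(gamma) * cond + noise_scale * np.sqrt(1.0 - gamma) * noise
+
+ # Inside a (pseudo) sampling loop, the condition fed to the denoiser changes per step:
+ cond = np.ones(4)
+ T = 50
+ for t in (50, 25, 5):
+     # noisy condition early in sampling, clean condition near the end
+     print(t, np.round(annealed_condition(cond, t, T, rng=np.random.default_rng(0)), 2))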
+
+ comment: Published as a conference paper at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ DistriFusion: Distributed Parallel Inference for High-Resolution + Diffusion Models CVPR 2024 + + +
+ Diffusion models have achieved great success in synthesizing high-quality +images. However, generating high-resolution images with diffusion models is +still challenging due to the enormous computational costs, resulting in a +prohibitive latency for interactive applications. In this paper, we propose +DistriFusion to tackle this problem by leveraging parallelism across multiple +GPUs. Our method splits the model input into multiple patches and assigns each +patch to a GPU. However, naively implementing such an algorithm breaks the +interaction between patches and loses fidelity, while incorporating such an +interaction will incur tremendous communication overhead. To overcome this +dilemma, we observe the high similarity between the input from adjacent +diffusion steps and propose displaced patch parallelism, which takes advantage +of the sequential nature of the diffusion process by reusing the pre-computed +feature maps from the previous timestep to provide context for the current +step. Therefore, our method supports asynchronous communication, which can be +pipelined by computation. Extensive experiments show that our method can be +applied to recent Stable Diffusion XL with no quality degradation and achieve +up to a 6.1$\times$ speedup on eight NVIDIA A100s compared to one. Our code is +publicly available at https://github.com/mit-han-lab/distrifuser. + +
+
+ comment: CVPR 2024 Highlight Code: https://github.com/mit-han-lab/distrifuser + Website: https://hanlab.mit.edu/projects/distrifusion Blog: + https://hanlab.mit.edu/blog/distrifusion +
+
+
+
+
+ + ♻ ☆ Retrieval-Augmented Layout Transformer for Content-Aware Layout + Generation CVPR 2024 + + +
+ Content-aware graphic layout generation aims to automatically arrange visual +elements along with a given content, such as an e-commerce product image. In +this paper, we argue that the current layout generation approaches suffer from +the limited training data for the high-dimensional layout structure. We show +that a simple retrieval augmentation can significantly improve the generation +quality. Our model, which is named Retrieval-Augmented Layout Transformer +(RALF), retrieves nearest neighbor layout examples based on an input image and +feeds these results into an autoregressive generator. Our model can apply +retrieval augmentation to various controllable generation tasks and yield +high-quality layouts within a unified architecture. Our extensive experiments +show that RALF successfully generates content-aware layouts in both constrained +and unconstrained settings and significantly outperforms the baselines. + +
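+ The retrieval step described above can be sketched with a simple cosine nearest-neighbour lookup (the feature space, database format and k are assumptions for illustration; feeding the retrieved layouts to the autoregressive generator is omitted):
+
+ import numpy as np
+
+ def retrieve_layout_examples(query_feat, db_feats, db_layouts, k=3):
+     """Fetch the k nearest-neighbour layouts for an input image feature.
+
+     query_feat: (D,) feature of the input canvas/image.
+     db_feats:   (N, D) features of images in the retrieval database.
+     db_layouts: list of N layouts (e.g. lists of (label, x, y, w, h) boxes).
+     """
+     q = query_feat / (np.linalg.norm(query_feat) + 1e-12)
+     d = db_feats / (np.linalg.norm(db_feats, axis=1, keepdims=True) + 1e-12)
+     sims = d @ q                                      # cosine similarity to every entry
+     idx = np.argsort(sims)[::-1][:k]
+     return [db_layouts[i] for i in idx], idx
+
+ rng = np.random.default_rng(5)
+ db_feats = rng.normal(size=(100, 32))
+ db_layouts = [[("text", 0.1, 0.1, 0.8, 0.2)] for _ in range(100)]   # placeholder layouts
+ query = db_feats[42] + 0.05 * rng.normal(size=32)                   # near entry 42
+ _, idx = retrieve_layout_examples(query, db_feats, db_layouts, k=3)
+ print(idx[0])   # 42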
+
+ comment: Accepted to CVPR 2024 (Oral), Project website: + https://udonda.github.io/RALF/ , GitHub: + https://github.com/CyberAgentAILab/RALF +
+
+
+
+
+ + ♻ ☆ A design of Convolutional Neural Network model for the Diagnosis of the + COVID-19 + + +
+ With the spread of COVID-19 around the globe over the past year, the use of artificial intelligence (AI) algorithms and image processing methods to analyze chest X-ray images of patients with COVID-19 has become essential. Recognizing COVID-19 in the lung area of a patient is one of the basic and essential needs of clinical centers and hospitals. Most research in this field has been devoted to deep learning methods utilizing CNNs (Convolutional Neural Networks), which mainly deal with screening sick and healthy people. In this study, a new 19-layer CNN structure is proposed for accurate recognition of COVID-19 from chest X-ray images. The proposed CNN is developed to serve as a precise diagnosis system for a three-class (viral pneumonia, Normal, COVID) and a four-class classification (Lung opacity, Normal, COVID-19, and pneumonia). A comparison is conducted between the outcomes of the proposed procedure and several popular pretrained networks, including Inception, AlexNet, ResNet50, SqueezeNet, and VGG19, based on Specificity, Accuracy, Precision, Sensitivity, Confusion Matrix, and F1-score. The experimental results show the superiority of the proposed CNN over existing published procedures. This method can be a useful tool for clinicians in making decisions about COVID-19. + +
+
+ comment: Important mistakes. Also, another author has contributed some to the + revised version. So it is not appropriate for it to be with only my name +
+
+
+
+
+ + ♻ ☆ Vision Transformer Computation and Resilience for Dynamic Inference + + +
+ State-of-the-art deep learning models for computer vision tasks are based on +the transformer architecture and often deployed in real-time applications. In +this scenario, the resources available for every inference can vary, so it is +useful to be able to dynamically adapt execution to trade accuracy for +efficiency. To create dynamic models, we leverage the resilience of vision +transformers to pruning and switch between different scaled versions of a +model. Surprisingly, we find that most FLOPs are generated by convolutions, not +attention. These relative FLOP counts are not a good predictor of GPU +performance since GPUs have special optimizations for convolutions. Some models +are fairly resilient and their model execution can be adapted without +retraining, while all models achieve better accuracy with retraining +alternative execution paths. These insights mean that we can leverage CNN +accelerators and these alternative execution paths to enable efficient and +dynamic vision transformer inference. Our analysis shows that leveraging this +type of dynamic execution can lead to saving 28\% of energy with a 1.4\% +accuracy drop for SegFormer (63 GFLOPs), with no additional training, and 53\% +of energy for ResNet-50 (4 GFLOPs) with a 3.3\% accuracy drop by switching +between pretrained Once-For-All models. + +
+
+
+
+
+ + ♻ ☆ Distilling Vision-Language Models on Millions of Videos CVPR 2024 + + +
+ The recent advance in vision-language models is largely attributed to the +abundance of image-text data. We aim to replicate this success for +video-language models, but there simply is not enough human-curated video-text +data available. We thus resort to fine-tuning a video-language model from a +strong image-language baseline with synthesized instructional data. The +resulting video model by video-instruction-tuning (VIIT) is then used to +auto-label millions of videos to generate high-quality captions. We show the +adapted video-language model performs well on a wide range of video-language +benchmarks. For instance, it surpasses the best prior result on open-ended +NExT-QA by 2.8%. Besides, our model generates detailed descriptions for +previously unseen videos, which provide better textual supervision than +existing methods. Experiments show that a video-language dual-encoder model +contrastively trained on these auto-generated captions is 3.8% better than the +strongest baseline that also leverages vision-language models. Our best model +outperforms state-of-the-art methods on MSR-VTT zero-shot text-to-video +retrieval by 6%. As a side product, we generate the largest video caption +dataset to date. + +
+
+ comment: CVPR 2024. Project page: + https://zhaoyue-zephyrus.github.io/video-instruction-tuning +
+
+
+
+
+ + ♻ ☆ Towards Two-Stream Foveation-based Active Vision Learning + + +
+ Deep neural network (DNN) based machine perception frameworks process the +entire input in a one-shot manner to provide answers to both "what object is +being observed" and "where it is located". In contrast, the "two-stream +hypothesis" from neuroscience explains the neural processing in the human +visual cortex as an active vision system that utilizes two separate regions of +the brain to answer the what and the where questions. In this work, we propose +a machine learning framework inspired by the "two-stream hypothesis" and +explore the potential benefits that it offers. Specifically, the proposed +framework models the following mechanisms: 1) ventral (what) stream focusing on +the input regions perceived by the fovea part of an eye (foveation), 2) dorsal +(where) stream providing visual guidance, and 3) iterative processing of the +two streams to calibrate visual focus and process the sequence of focused image +patches. The training of the proposed framework is accomplished by label-based +DNN training for the ventral stream model and reinforcement learning for the +dorsal stream model. We show that the two-stream foveation-based learning is +applicable to the challenging task of weakly-supervised object localization +(WSOL), where the training data is limited to the object class or its +attributes. The framework is capable of both predicting the properties of an +object and successfully localizing it by predicting its bounding box. We also +show that, due to the independent nature of the two streams, the dorsal model +can be applied on its own to unseen images to localize objects from different +datasets. + +
+
+ comment: Accepted for publication at IEEE Transactions on Cognitive and + Developmental Systems (IEEE TCDS), 18 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ nnMobileNet: Rethinking CNN for Retinopathy Research CVPR + + +
+ Over the past few decades, convolutional neural networks (CNNs) have been at +the forefront of the detection and tracking of various retinal diseases (RD). +Despite their success, the emergence of vision transformers (ViT) in the 2020s +has shifted the trajectory of RD model development. The leading-edge +performance of ViT-based models in RD can be largely credited to their +scalability-their ability to improve as more parameters are added. As a result, +ViT-based models tend to outshine traditional CNNs in RD applications, albeit +at the cost of increased data and computational demands. ViTs also differ from +CNNs in their approach to processing images, working with patches rather than +local regions, which can complicate the precise localization of small, variably +presented lesions in RD. In our study, we revisited and updated the +architecture of a CNN model, specifically MobileNet, to enhance its utility in +RD diagnostics. We found that an optimized MobileNet, through selective +modifications, can surpass ViT-based models in various RD benchmarks, including +diabetic retinopathy grading, detection of multiple fundus diseases, and +classification of diabetic macular edema. The code is available at +https://github.com/Retinal-Research/NN-MOBILENET + +
+
+ comment: Accepted as a conference paper to 2024 CVPRW +
+
+
+
+
+ + ♻ ☆ Visual Grounding Methods for VQA are Working for the Wrong Reasons! ACL 2020 + + +
+ Existing Visual Question Answering (VQA) methods tend to exploit dataset +biases and spurious statistical correlations, instead of producing right +answers for the right reasons. To address this issue, recent bias mitigation +methods for VQA propose to incorporate visual cues (e.g., human attention maps) +to better ground the VQA models, showcasing impressive gains. However, we show +that the performance improvements are not a result of improved visual +grounding, but a regularization effect which prevents over-fitting to +linguistic priors. For instance, we find that it is not actually necessary to +provide proper, human-based cues; random, insensible cues also result in +similar improvements. Based on this observation, we propose a simpler +regularization scheme that does not require any external annotations and yet +achieves near state-of-the-art performance on VQA-CPv2. + +
+
+ comment: ACL 2020 +
+
+
+
+
+ + ♻ ☆ SAWEC: Sensing-Assisted Wireless Edge Computing + + +
+ Emerging mobile virtual reality (VR) systems will require to continuously +perform complex computer vision tasks on ultra-high-resolution video frames +through the execution of deep neural networks (DNNs)-based algorithms. Since +state-of-the-art DNNs require computational power that is excessive for mobile +devices, techniques based on wireless edge computing (WEC) have been recently +proposed. However, existing WEC methods require the transmission and processing +of a high amount of video data which may ultimately saturate the wireless link. +In this paper, we propose a novel Sensing-Assisted Wireless Edge Computing +(SAWEC) paradigm to address this issue. SAWEC leverages knowledge about the +physical environment to reduce the end-to-end latency and overall computational +burden by transmitting to the edge server only the relevant data for the +delivery of the service. Our intuition is that the transmission of the portion +of the video frames where there are no changes with respect to previous frames +can be avoided. Specifically, we leverage wireless sensing techniques to +estimate the location of objects in the environment and obtain insights about +the environment dynamics. Hence, only the part of the frames where any +environmental change is detected is transmitted and processed. We evaluated +SAWEC by using a 10K 360$^{\circ}$ with a Wi-Fi 6 sensing system operating at +160 MHz and performing localization and tracking. We considered instance +segmentation and object detection as benchmarking tasks for performance +evaluation. We carried out experiments in an anechoic chamber and an entrance +hall with two human subjects in six different setups. Experimental results show +that SAWEC reduces both the channel occupation and end-to-end latency by more +than 90% while improving the instance segmentation and object detection +performance with respect to state-of-the-art WEC approaches. + +
+
+ comment: Submitted to ACM for possible publication +
+
+
+
+
+ + ♻ ☆ The Bias of Harmful Label Associations in Vision-Language Models + + +
+ Despite the remarkable performance of foundation vision-language models, the +shared representation space for text and vision can also encode harmful label +associations detrimental to fairness. While prior work has uncovered bias in +vision-language models' (VLMs) classification performance across geography, +work has been limited along the important axis of harmful label associations +due to a lack of rich, labeled data. In this work, we investigate harmful label +associations in the recently released Casual Conversations datasets containing +more than 70,000 videos. We study bias in the frequency of harmful label +associations across self-provided labels for age, gender, apparent skin tone, +and physical adornments across several leading VLMs. We find that VLMs are +$4-7$x more likely to harmfully classify individuals with darker skin tones. We +also find scaling transformer encoder model size leads to higher confidence in +harmful predictions. Finally, we find improvements on standard vision tasks +across VLMs does not address disparities in harmful label associations. + +
+
+
+
+
+ + ♻ ☆ Equivariant Multi-Modality Image Fusion CVPR 2024 + + +
+ Multi-modality image fusion is a technique that combines information from +different sensors or modalities, enabling the fused image to retain +complementary features from each modality, such as functional highlights and +texture details. However, effective training of such fusion models is +challenging due to the scarcity of ground truth fusion data. To tackle this +issue, we propose the Equivariant Multi-Modality imAge fusion (EMMA) paradigm +for end-to-end self-supervised learning. Our approach is rooted in the prior +knowledge that natural imaging responses are equivariant to certain +transformations. Consequently, we introduce a novel training paradigm that +encompasses a fusion module, a pseudo-sensing module, and an equivariant fusion +module. These components enable the net training to follow the principles of +the natural sensing-imaging process while satisfying the equivariant imaging +prior. Extensive experiments confirm that EMMA yields high-quality fusion +results for infrared-visible and medical images, concurrently facilitating +downstream multi-modal segmentation and detection tasks. The code is available +at https://github.com/Zhaozixiang1228/MMIF-EMMA. + +
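+ The equivariance prior can be written as a small self-supervised consistency check: fusing transformed inputs should match transforming the fused output. The sketch below uses a toy pixel-wise fusion rule and a flip; EMMA's full training additionally involves its pseudo-sensing module.
+
+ import numpy as np
+
+ def equivariance_loss(fuse, img_a, img_b, transform):
+     """Self-supervised consistency: fuse(T(a), T(b)) should equal T(fuse(a, b)).
+
+     fuse(a, b) -> fused image; transform(x) -> transformed image (e.g. a flip).
+     """
+     fused_then_t = transform(fuse(img_a, img_b))
+     t_then_fused = fuse(transform(img_a), transform(img_b))
+     return float(np.mean((fused_then_t - t_then_fused) ** 2))
+
+ # With a pixel-wise fusion rule (max), any flip commutes, so the loss is zero.
+ rng = np.random.default_rng(6)
+ a, b = rng.random((32, 32)), rng.random((32, 32))
+ flip = lambda x: x[:, ::-1]
+ print(equivariance_loss(np.maximum, a, b, flip))   # 0.0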
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ What If the TV Was Off? Examining Counterfactual Reasoning Abilities of + Multi-modal Language Models + + +
+ Counterfactual reasoning, a fundamental aspect of human cognition, involves +contemplating alternatives to established facts or past events, significantly +enhancing our abilities in planning and decision-making. In light of the +advancements in current multi-modal large language models, we explore their +effectiveness in counterfactual reasoning. To facilitate this investigation, we +introduce a novel dataset, C-VQA, specifically designed to test the +counterfactual reasoning capabilities of modern multi-modal large language +models. This dataset is constructed by infusing original questions with +counterfactual presuppositions, spanning various types such as numerical and +boolean queries. It encompasses a mix of real and synthetic data, representing +a wide range of difficulty levels. Our thorough evaluations of contemporary +vision-language models using this dataset have revealed substantial performance +drops, with some models showing up to a 40% decrease, highlighting a +significant gap between current models and human-like vision reasoning +capabilities. We hope our dataset will serve as a vital benchmark for +evaluating the counterfactual reasoning capabilities of models. Code and +dataset are publicly available at https://bzhao.me/C-VQA/. + +
+
+
+
+
+ + ♻ ☆ Clustering-based Image-Text Graph Matching for Domain Generalization + + +
+ Learning domain-invariant visual representations is important to train a +model that can generalize well to unseen target task domains. Recent works +demonstrate that text descriptions contain high-level class-discriminative +information and such auxiliary semantic cues can be used as effective pivot +embedding for domain generalization problem. However, they use pivot embedding +in global manner (i.e., aligning an image embedding with sentence-level text +embedding), not fully utilizing the semantic cues of given text description. In +this work, we advocate for the use of local alignment between image regions and +corresponding textual descriptions. To this end, we first represent image and +text inputs with graphs. We subsequently cluster nodes in those graphs and +match the graph-based image node features into textual graphs. This matching +process is conducted globally and locally, tightly aligning visual and textual +semantic sub-structures. We experiment with large-scale public datasets, such +as CUB-DG and DomainBed, and our model achieves matched or better +state-of-the-art performance on these datasets. Our code will be publicly +available upon publication. + +
+
+
+
+
+ + ♻ ☆ Strategies to Improve Real-World Applicability of Laparoscopic Anatomy + Segmentation Models CVPR 2024 + + +
+ Accurate identification and localization of anatomical structures of varying +size and appearance in laparoscopic imaging are necessary to leverage the +potential of computer vision techniques for surgical decision support. +Segmentation performance of such models is traditionally reported using metrics +of overlap such as IoU. However, imbalanced and unrealistic representation of +classes in the training data and suboptimal selection of reported metrics have +the potential to skew nominal segmentation performance and thereby ultimately +limit clinical translation. In this work, we systematically analyze the impact +of class characteristics (i.e., organ size differences), training and test data +composition (i.e., representation of positive and negative examples), and +modeling parameters (i.e., foreground-to-background class weight) on eight +segmentation metrics: accuracy, precision, recall, IoU, F1 score (Dice +Similarity Coefficient), specificity, Hausdorff Distance, and Average Symmetric +Surface Distance. Our findings support two adjustments to account for data +biases in surgical data science: First, training on datasets that are similar +to the clinical real-world scenarios in terms of class distribution, and +second, class weight adjustments to optimize segmentation model performance +with regard to metrics of particular relevance in the respective clinical +setting. + +
+
+ comment: 14 pages, 5 figures, 4 tables; accepted for the workshop "Data + Curation and Augmentation in Medical Imaging" at CVPR 2024 (archival track) +
+
+
+
+
+ + ♻ ☆ LLM-driven Multimodal Target Volume Contouring in Radiation Oncology + + +
+ Target volume contouring for radiation therapy is considered significantly more challenging than normal organ segmentation tasks, as it necessitates the utilization of both image and text-based clinical information. Inspired by the recent advancement of large language models (LLMs) that can facilitate the integration of textual information and images, here we present a novel LLM-driven multimodal AI, namely LLMSeg, that utilizes clinical text information and is applicable to the challenging task of target volume contouring for radiation therapy, and validate it within the context of breast cancer radiation therapy target volume contouring. Using external validation and data-insufficient environments, attributes that are highly conducive to real-world applications, we demonstrate that the proposed model exhibits markedly improved performance compared to conventional unimodal AI models, particularly exhibiting robust generalization performance and data efficiency. To the best of our knowledge, this is the first LLM-driven multimodal AI model that integrates clinical text information into target volume delineation for radiation oncology. + +
+
+
+
+
+ + ♻ ☆ SCott: Accelerating Diffusion Models with Stochastic Consistency + Distillation + + +
+ The iterative sampling procedure employed by diffusion models (DMs) often +leads to significant inference latency. To address this, we propose Stochastic +Consistency Distillation (SCott) to enable accelerated text-to-image +generation, where high-quality generations can be achieved with just 1-2 +sampling steps, and further improvements can be obtained by adding additional +steps. In contrast to vanilla consistency distillation (CD), which distills the +ordinary differential equation (ODE) solver-based sampling process of a pretrained +teacher model into a student, SCott explores the possibility and validates the +efficacy of integrating stochastic differential equation (SDE) solvers into CD +to fully unleash the potential of the teacher. SCott is augmented with +elaborate strategies to control the noise strength and sampling process of the +SDE solver. An adversarial loss is further incorporated to strengthen +sample quality at very few sampling steps. Empirically, on the MSCOCO-2017 5K +dataset with a Stable Diffusion-V1.5 teacher, SCott achieves an FID (Frechet +Inception Distance) of 22.1, surpassing that (23.4) of the 1-step InstaFlow (Liu +et al., 2023) and matching that of 4-step UFOGen (Xue et al., 2023b). Moreover, +SCott can yield more diverse samples than other consistency models for +high-resolution image generation (Luo et al., 2023a), with up to 16% +improvement in a qualified metric. The code and checkpoints are coming soon. + 
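+ For readers unfamiliar with consistency distillation, the toy sketch below shows a generic training step on 2-D data: a student consistency function is regressed against an EMA target evaluated at an adjacent noise level produced by one teacher solver step. It is only a minimal illustration under stated assumptions (random-weight stand-in networks, VE-style noising, a deterministic Euler ODE step); SCott's SDE-solver teacher step, noise-strength control, and adversarial loss are not reproduced here.
+ import torch, torch.nn as nn
+ 
+ def mlp():
+     return nn.Sequential(nn.Linear(3, 64), nn.SiLU(), nn.Linear(64, 2))
+ 
+ teacher = mlp()           # stand-in for a pretrained denoiser (random weights here)
+ student = mlp()           # consistency function f_theta(x_t, t) -> x_0 estimate
+ ema = mlp(); ema.load_state_dict(student.state_dict())  # EMA target network
+ opt = torch.optim.Adam(student.parameters(), 1e-3)
+ 
+ def denoise(net, x, t):
+     return net(torch.cat([x, t], dim=-1))
+ 
+ T, steps = 1.0, 18
+ for it in range(100):
+     x0 = torch.randn(256, 2)                      # stand-in for training data
+     i = torch.randint(1, steps, (256, 1)).float()
+     t, t_prev = i / steps * T, (i - 1) / steps * T
+     xt = x0 + t * torch.randn_like(x0)            # simple VE-style forward noising
+     with torch.no_grad():
+         # one deterministic Euler step of the teacher's probability-flow ODE
+         # (SCott would instead take a stochastic SDE-solver step here)
+         d = (xt - denoise(teacher, xt, t)) / t.clamp(min=1e-3)
+         xt_prev = xt + (t_prev - t) * d
+         target = denoise(ema, xt_prev, t_prev)
+     loss = (denoise(student, xt, t) - target).pow(2).mean()
+     opt.zero_grad(); loss.backward(); opt.step()
+     with torch.no_grad():                         # EMA update of the target network
+         for p_ema, p in zip(ema.parameters(), student.parameters()):
+             p_ema.mul_(0.95).add_(p, alpha=0.05)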
+
+ comment: 22 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ A Medical Data-Effective Learning Benchmark for Highly Efficient + Pre-training of Foundation Models + + +
+ Foundation models, pre-trained on massive datasets, have achieved +unprecedented generalizability. However, is it truly necessary to involve such +vast amounts of data in pre-training, consuming extensive computational +resources? This paper introduces data-effective learning, aiming to use data in +the most impactful way to pre-train foundation models. This involves strategies +that focus on data quality rather than quantity, ensuring the data used for +training has high informational value. Data-effective learning plays a profound +role in accelerating foundation model training, reducing computational costs, +and saving data storage, which is very important as the volume of medical data +in recent years has grown beyond many people's expectations. However, due to +the lack of standards and comprehensive benchmarks, medical +data-effective learning remains understudied. To address this gap, our paper +introduces a comprehensive benchmark specifically for evaluating data-effective +learning in the medical field. This benchmark includes a dataset with millions +of data samples from 31 medical centers (DataDEL), a baseline method for +comparison (MedDEL), and a new evaluation metric (NormDEL) to objectively +measure data-effective learning performance. Our extensive experimental results +show the baseline MedDEL can achieve performance comparable to the original +large dataset with only 5% of the data. Establishing such an open +data-effective learning benchmark is crucial for the medical foundation model +research community because it facilitates efficient data use, promotes +collaborative breakthroughs, and fosters the development of cost-effective, +scalable, and impactful healthcare solutions. + 
+
+
+
+
+ + ♻ ☆ BAA-NGP: Bundle-Adjusting Accelerated Neural Graphics Primitives + + +
+ Implicit neural representations have become pivotal in robotic perception, +enabling robots to comprehend 3D environments from 2D images. Given a set of +camera poses and associated images, the models can be trained to synthesize +novel, unseen views. To successfully navigate and interact in dynamic settings, +robots require the understanding of their spatial surroundings driven by +unassisted reconstruction of 3D scenes and camera poses from real-time video +footage. Existing approaches like COLMAP and bundle-adjusting neural radiance +field methods take hours to days to process due to the high computational +demands of feature matching, dense point sampling, and training of a +multi-layer perceptron structure with a large number of parameters. To address +these challenges, we propose a framework called bundle-adjusting accelerated +neural graphics primitives (BAA-NGP) which leverages accelerated sampling and +hash encoding to expedite automatic pose refinement/estimation and 3D scene +reconstruction. Experimental results demonstrate 10 to 20 x speed improvement +compared to other bundle-adjusting neural radiance field methods without +sacrificing the quality of pose estimation. The github repository can be found +here https://github.com/IntelLabs/baa-ngp. + +
+
+
+
+
+ + ♻ ☆ Backdoor Federated Learning by Poisoning Backdoor-Critical Layers ICLR'24 + + +
+ Federated learning (FL) has been widely deployed to enable machine learning +training on sensitive data across distributed devices. However, the +decentralized learning paradigm and heterogeneity of FL further extend the +attack surface for backdoor attacks. Existing FL attack and defense +methodologies typically focus on the whole model. None of them recognizes the +existence of backdoor-critical (BC) layers, a small subset of layers that +dominates the model's vulnerabilities. Attacking the BC layers achieves equivalent +effects as attacking the whole model but at a far smaller chance of being +detected by state-of-the-art (SOTA) defenses. This paper proposes a general +in-situ approach that identifies and verifies BC layers from the perspective of +attackers. Based on the identified BC layers, we carefully craft a new backdoor +attack methodology that adaptively seeks a fundamental balance between +attacking effects and stealthiness under various defense strategies. Extensive +experiments show that our BC layer-aware backdoor attacks can successfully +backdoor FL under seven SOTA defenses with only 10% malicious clients and +outperform the latest backdoor attack methods. + 
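+ As a hedged illustration of why a small set of layers can carry most of a backdoor, the toy probe below swaps one layer at a time from a hypothetical backdoored model into a benign one and ranks layers by the resulting gain in attack success rate. The models, trigger pattern, and data are placeholders, and this is a generic layer-substitution diagnostic, not the paper's identification algorithm.
+ import copy, torch, torch.nn as nn
+ 
+ def make_model():
+     return nn.Sequential(nn.Linear(32, 64), nn.ReLU(),
+                          nn.Linear(64, 64), nn.ReLU(),
+                          nn.Linear(64, 10))
+ 
+ benign, backdoored = make_model(), make_model()   # stand-ins for trained FL models
+ x = torch.randn(512, 32)
+ x[:, :4] = 3.0                                    # hypothetical trigger pattern
+ target_label = 0                                  # attacker's target class
+ 
+ def asr(model):                                   # attack success rate on triggered inputs
+     with torch.no_grad():
+         pred = model(x).argmax(dim=1)
+     return (pred == target_label).float().mean().item()
+ 
+ base = asr(benign)
+ scores = {}
+ for idx, layer in enumerate(backdoored):
+     if not any(True for _ in layer.parameters()):
+         continue                                  # skip parameter-free layers (ReLU)
+     probe = copy.deepcopy(benign)
+     probe[idx] = copy.deepcopy(layer)             # swap in a single backdoored layer
+     scores[idx] = asr(probe) - base               # ASR gain attributable to this layer
+ ranking = sorted(scores, key=scores.get, reverse=True)
+ print('baseline ASR:', base, 'per-layer ASR gains:', scores, 'most critical first:', ranking)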
+
+ comment: Accepted to ICLR'24 +
+
+
+
+
+ + ♻ ☆ Post-processing of coronary and myocardial spatial data + + +
+ Numerical simulations of real-world phenomena are implemented with at least +two parts: the computational scheme and the computational domain. In the +context of hemodynamics, the computational domain of a simulation represents +the blood vessel network through which blood flows. Such blood vessel networks +can contain millions of individual vessels that are joined together +in series and parallel to form the network. It is computationally unfeasible to +explicitly simulate blood flow in all blood vessels. Here, from imaged data of +a single porcine left coronary arterial tree, we develop a data pipeline to +obtain computational domains for hemodynamic simulations from a graph +representing the coronary vascular tree. Further, we develop a method to +ascertain which subregions of the left ventricle are most likely to be perfused +via a given artery, using a comparison with the American Heart Association +division of the left ventricle as a sense check. + 
+
+ comment: 21 pages, 22 figures +
+
+
+
+
+ + ♻ ☆ Transfer Learning for Cross-dataset Isolated Sign Language Recognition + in Under-Resourced Datasets + + +
+ Sign language recognition (SLR) has recently achieved a breakthrough in +performance thanks to deep neural networks trained on large annotated sign +datasets. Of the many different sign languages, these annotated datasets are +only available for a select few. Since acquiring gloss-level labels on sign +language videos is difficult, learning by transferring knowledge from existing +annotated sources is useful for recognition in under-resourced sign languages. +This study provides a publicly available cross-dataset transfer learning +benchmark from two existing public Turkish SLR datasets. We use a temporal +graph convolution-based sign language recognition approach to evaluate five +supervised transfer learning approaches and experiment with closed-set and +partial-set cross-dataset transfer learning. Experiments demonstrate that +improvement over finetuning based transfer learning is possible with +specialized supervised transfer learning methods. + +
+
+ comment: Accepted to The 18th IEEE International Conference on Automatic Face + and Gesture Recognition 2024, Code available in + https://github.com/alpk/tid-supervised-transfer-learning-dataset +
+
+
+
+
+ + ♻ ☆ MoDA: Leveraging Motion Priors from Videos for Advancing Unsupervised + Domain Adaptation in Semantic Segmentation CVPR 2024 + + +
+ Unsupervised domain adaptation (UDA) has been a potent technique to handle +the lack of annotations in the target domain, particularly in the semantic +segmentation task. This study introduces a different UDA scenario where the +target domain contains unlabeled video frames. Drawing upon recent advancements +in self-supervised learning of object motion from unlabeled videos with +geometric constraints, we design a \textbf{Mo}tion-guided \textbf{D}omain +\textbf{A}daptive semantic segmentation framework (MoDA). MoDA harnesses the +self-supervised object motion cues to facilitate cross-domain alignment for the +segmentation task. First, we present an object discovery module to localize and +segment target moving objects using object motion information. Then, we propose +a semantic mining module that takes the object masks to refine the pseudo +labels in the target domain. Subsequently, these high-quality pseudo labels are +used in the self-training loop to bridge the cross-domain gap. On domain +adaptive video and image segmentation experiments, MoDA shows the effectiveness of +utilizing object motion as guidance for domain alignment compared with optical +flow information. Moreover, MoDA exhibits versatility as it can complement +existing state-of-the-art UDA approaches. Code at +https://github.com/feipanir/MoDA. + 
+
+ comment: CVPR 2024 Workshop on Learning with Limited Labelled Data for Image + and Video Understanding. Best Paper Award +
+
+
+
+
+ + ♻ ☆ Semantics-aware Motion Retargeting with Vision-Language Models CVPR2024 + + +
+ Capturing and preserving motion semantics is essential to motion retargeting +between animation characters. However, most of the previous works neglect the +semantic information or rely on human-designed joint-level representations. +Here, we present a novel Semantics-aware Motion reTargeting (SMT) method with +the advantage of vision-language models to extract and maintain meaningful +motion semantics. We utilize a differentiable module to render 3D motions. Then +the high-level motion semantics are incorporated into the motion retargeting +process by feeding the vision-language model with the rendered images and +aligning the extracted semantic embeddings. To ensure the preservation of +fine-grained motion details and high-level semantics, we adopt a two-stage +pipeline consisting of skeleton-aware pre-training and fine-tuning with +semantics and geometry constraints. Experimental results show the effectiveness +of the proposed method in producing high-quality motion retargeting results +while accurately preserving motion semantics. + +
+
+ comment: Accepted in CVPR2024 +
+
+
+
+
+ + ♻ ☆ Adversarial Nibbler: An Open Red-Teaming Method for Identifying Diverse + Harms in Text-to-Image Generation + + +
+ With the rise of text-to-image (T2I) generative AI models reaching wide +audiences, it is critical to evaluate model robustness against non-obvious +attacks to mitigate the generation of offensive images. By focusing on +``implicitly adversarial'' prompts (those that trigger T2I models to generate +unsafe images for non-obvious reasons), we isolate a set of difficult safety +issues that human creativity is well-suited to uncover. To this end, we built +the Adversarial Nibbler Challenge, a red-teaming methodology for crowdsourcing +a diverse set of implicitly adversarial prompts. We have assembled a suite of +state-of-the-art T2I models, employed a simple user interface to identify and +annotate harms, and engaged diverse populations to capture long-tail safety +issues that may be overlooked in standard testing. The challenge is run in +consecutive rounds to enable a sustained discovery and analysis of safety +pitfalls in T2I models. + In this paper, we present an in-depth account of our methodology, a +systematic study of novel attack strategies and discussion of safety failures +revealed by challenge participants. We also release a companion visualization +tool for easy exploration and derivation of insights from the dataset. The +first challenge round resulted in over 10k prompt-image pairs with machine +annotations for safety. A subset of 1.5k samples contains rich human +annotations of harm types and attack styles. We find that 14% of images that +humans consider harmful are mislabeled as ``safe'' by machines. We have +identified new attack strategies that highlight the complexity of ensuring T2I +model robustness. Our findings emphasize the necessity of continual auditing +and adaptation as new vulnerabilities emerge. We are confident that this work +will enable proactive, iterative safety assessments and promote responsible +development of T2I models. + +
+
+ comment: 15 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ CONDA: Continual Unsupervised Domain Adaptation Learning in Visual + Perception for Self-Driving Cars CVPR + + +
+ Although unsupervised domain adaptation methods have achieved remarkable +performance in semantic scene segmentation in visual perception for +self-driving cars, these approaches remain impractical in real-world use cases. +In practice, the segmentation models may encounter new data that have not been +seen yet. Also, the previous data training of segmentation models may be +inaccessible due to privacy problems. Therefore, to address these problems, in +this work, we propose a Continual Unsupervised Domain Adaptation (CONDA) +approach that allows the model to continuously learn and adapt with respect to +the presence of the new data. Moreover, our proposed approach is designed +without the requirement of accessing previous training data. To avoid the +catastrophic forgetting problem and maintain the performance of the +segmentation models, we present a novel Bijective Maximum Likelihood loss to +impose the constraint of predicted segmentation distribution shifts. The +experimental results on the benchmark of continual unsupervised domain +adaptation have shown the advanced performance of the proposed CONDA method. + +
+
+ comment: Accepted to CVPRW 2024 +
+
+
+
+
+ + ♻ ☆ Language-guided Image Reflection Separation + + +
+ This paper studies the problem of language-guided reflection separation, +which aims at addressing the ill-posed reflection separation problem by +introducing language descriptions to provide layer content. We propose a +unified framework to solve this problem, which leverages the cross-attention +mechanism with contrastive learning strategies to construct the correspondence +between language descriptions and image layers. A gated network design and a +randomized training strategy are employed to tackle the recognizable layer +ambiguity. The effectiveness of the proposed method is validated by the +significant performance advantage over existing reflection separation methods +on both quantitative and qualitative comparisons. + +
+
+
+
+
+ + ♻ ☆ Vision-Language Models for Medical Report Generation and Visual Question + Answering: A Review + + +
+ Medical vision-language models (VLMs) combine computer vision (CV) and +natural language processing (NLP) to analyze visual and textual medical data. +Our paper reviews recent advancements in developing VLMs specialized for +healthcare, focusing on models designed for medical report generation and +visual question answering (VQA). We provide background on NLP and CV, +explaining how techniques from both fields are integrated into VLMs to enable +learning from multimodal data. Key areas we address include the exploration of +medical vision-language datasets, in-depth analyses of architectures and +pre-training strategies employed in recent noteworthy medical VLMs, and +comprehensive discussion on evaluation metrics for assessing VLMs' performance +in medical report generation and VQA. We also highlight current challenges and +propose future directions, including enhancing clinical validity and addressing +patient privacy concerns. Overall, our review summarizes recent progress in +developing VLMs to harness multimodal medical data for improved healthcare +applications. + +
+
+ comment: 43 pages; paper edited and restructured +
+
+
+
+
+ + ♻ ☆ Trajectory Consistency Distillation: Improved Latent Consistency + Distillation by Semi-Linear Consistency Function with Trajectory Mapping + + +
+ Latent Consistency Model (LCM) extends the Consistency Model to the latent +space and leverages the guided consistency distillation technique to achieve +impressive performance in accelerating text-to-image synthesis. However, we +observed that LCM struggles to generate images with both clarity and detailed +intricacy. Consequently, we introduce Trajectory Consistency Distillation +(TCD), which encompasses a trajectory consistency function and strategic +stochastic sampling. The trajectory consistency function diminishes the +parameterisation and distillation errors by broadening the scope of the +self-consistency boundary condition with trajectory mapping and endowing +TCD with the ability to accurately trace the entire trajectory of the +Probability Flow ODE in semi-linear form with an Exponential Integrator. +Additionally, strategic stochastic sampling provides explicit control of +stochasticity and circumvents the accumulated errors inherent in multi-step +consistency sampling. Experiments demonstrate that TCD not only significantly +enhances image quality at low NFEs but also yields more detailed results +compared to the teacher model at high NFEs. + 
+
+ comment: Project Page: https://mhh0318.github.io/tcd +
+
+
+
+
+ + ♻ ☆ EAMA : Entity-Aware Multimodal Alignment Based Approach for News Image + Captioning + + +
+ News image captioning requires a model to generate an informative caption rich +in entities, given the news image and the associated news article. Though +Multimodal Large Language Models (MLLMs) have demonstrated remarkable +capabilities in addressing various vision-language tasks, our research finds +that current MLLMs still have limitations in handling entity information on the +news image captioning task. Besides, while MLLMs have the ability to process +long inputs, generating high-quality news image captions still requires a +trade-off between sufficiency and conciseness of textual input information. To +explore the potential of MLLMs and address the problems we discovered, we propose EAMA: +an Entity-Aware Multimodal Alignment based approach for news image captioning. +Our approach first aligns the MLLM through a Balance Training Strategy with two +extra alignment tasks: the Entity-Aware Sentence Selection task and the Entity +Selection task, together with the News Image Captioning task, to enhance its +capability in handling multimodal entity information. The aligned MLLM then +utilizes the additional entity-related information it explicitly extracts to +supplement its textual input while generating news image captions. Our approach +achieves better results than all previous models in CIDEr score on the GoodNews +dataset (72.33 -> 88.39) and the NYTimes800k dataset (70.83 -> 85.61). + 
+
+
+
+
+ + ♻ ☆ FlowIBR: Leveraging Pre-Training for Efficient Neural Image-Based + Rendering of Dynamic Scenes CVPR 2024 + + +
+ We introduce FlowIBR, a novel approach for efficient monocular novel view +synthesis of dynamic scenes. Existing techniques already show impressive +rendering quality but tend to focus on optimization within a single scene +without leveraging prior knowledge, resulting in long optimization times per +scene. FlowIBR circumvents this limitation by integrating a neural image-based +rendering method, pre-trained on a large corpus of widely available static +scenes, with a per-scene optimized scene flow field. Utilizing this flow field, +we bend the camera rays to counteract the scene dynamics, thereby presenting +the dynamic scene as if it were static to the rendering network. The proposed +method reduces per-scene optimization time by an order of magnitude, achieving +comparable rendering quality to existing methods -- all on a single +consumer-grade GPU. + +
+
+ comment: Accepted to CVPR 2024 Workshop on Efficient Deep Learning for + Computer Vision. Project page: https://flowibr.github.io +
+
+
+
+
+ + ♻ ☆ 4D Facial Expression Diffusion Model + + +
+ Facial expression generation is one of the most challenging and long-sought +aspects of character animation, with many interesting applications. This +challenging task has traditionally relied heavily on digital craftspersons and +remains largely unexplored. In this paper, we introduce a generative framework +for generating 3D facial expression sequences (i.e. 4D faces) that can be +conditioned on different inputs to animate an arbitrary 3D face mesh. It is +composed of two tasks: (1) Learning the generative model that is trained over a +set of 3D landmark sequences, and (2) Generating 3D mesh sequences of an input +facial mesh driven by the generated landmark sequences. The generative model is +based on a Denoising Diffusion Probabilistic Model (DDPM), which has achieved +remarkable success in generative tasks of other domains. While it can be +trained unconditionally, its reverse process can still be conditioned by +various condition signals. This allows us to efficiently develop several +downstream tasks involving various conditional generation, by using expression +labels, text, partial sequences, or simply a facial geometry. To obtain the +full mesh deformation, we then develop a landmark-guided encoder-decoder to +apply the geometrical deformation embedded in landmarks on a given facial mesh. +Experiments show that our model has learned to generate realistic, high-quality +expressions solely from a dataset of relatively small size, improving over +the state-of-the-art methods. Videos and qualitative comparisons with other +methods can be found at \url{https://github.com/ZOUKaifeng/4DFM}. + 
+
+
+
+
+ + ♻ ☆ Exploring Limits of Diffusion-Synthetic Training with Weakly Supervised + Semantic Segmentation + + +
+ The advance of generative models for images has inspired various training +techniques for image recognition utilizing synthetic images. In semantic +segmentation, one promising approach is extracting pseudo-masks from attention +maps in text-to-image diffusion models, which enables +real-image-and-annotation-free training. However, the pioneering training +method using diffusion-synthetic images and pseudo-masks, i.e., DiffuMask, +has limitations in terms of mask quality, scalability, and the range of applicable +domains. To overcome these limitations, this work introduces three techniques +for diffusion-synthetic semantic segmentation training. First, +reliability-aware robust training, originally used in weakly supervised +learning, helps segmentation with insufficient synthetic mask quality. +Second, we introduce prompt augmentation, data augmentation +to the prompt text set to scale up and diversify training images with limited +text resources. Finally, LoRA-based adaptation of Stable Diffusion enables the +transfer to a distant domain, e.g., auto-driving images. Experiments on PASCAL +VOC, ImageNet-S, and Cityscapes show that our method effectively closes the gap +between real and synthetic training in semantic segmentation. + 
+
+
+
+
+ + ♻ ☆ Unbiased Image Synthesis via Manifold Guidance in Diffusion Models + + +
+ Diffusion Models are a potent class of generative models capable of producing +high-quality images. However, they often inadvertently favor certain data +attributes, undermining the diversity of generated images. This issue is +starkly apparent in skewed datasets like CelebA, where the initial dataset +disproportionately favors females over males by 57.9%; this bias is amplified in +generated data, where female representation outstrips that of males by 148%. In +response, we propose a plug-and-play method named Manifold Guidance Sampling, +which is also the first unsupervised method to mitigate the bias issue in DDPMs. +Leveraging the inherent structure of the data manifold, this method steers the +sampling process towards a more uniform distribution, effectively dispersing +the clustering of biased data. Without the need to modify the existing +model or perform additional training, it significantly mitigates data bias and enhances +the quality and unbiasedness of the generated images. + 
+
+
+
+
+ + ♻ ☆ Modeling Dense Multimodal Interactions Between Biological Pathways and + Histology for Survival Prediction CVPR 2024 + + +
+ Integrating whole-slide images (WSIs) and bulk transcriptomics for predicting +patient survival can improve our understanding of patient prognosis. However, +this multimodal task is particularly challenging due to the different nature of +these data: WSIs represent a very high-dimensional spatial description of a +tumor, while bulk transcriptomics represent a global description of gene +expression levels within that tumor. In this context, our work aims to address +two key challenges: (1) how can we tokenize transcriptomics in a semantically +meaningful and interpretable way?, and (2) how can we capture dense multimodal +interactions between these two modalities? Specifically, we propose to learn +biological pathway tokens from transcriptomics that can encode specific +cellular functions. Together with histology patch tokens that encode the +different morphological patterns in the WSI, we argue that they form +appropriate reasoning units for downstream interpretability analyses. We +propose fusing both modalities using a memory-efficient multimodal Transformer +that can model interactions between pathway and histology patch tokens. Our +proposed model, SURVPATH, achieves state-of-the-art performance when evaluated +against both unimodal and multimodal baselines on five datasets from The Cancer +Genome Atlas. Our interpretability framework identifies key multimodal +prognostic factors, and, as such, can provide valuable insights into the +interaction between genotype and phenotype, enabling a deeper understanding of +the underlying biological mechanisms at play. We make our code public at: +https://github.com/ajv012/SurvPath. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Latent Noise Segmentation: How Neural Noise Leads to the Emergence of + Segmentation and Grouping + + +
+ Humans are able to segment images effortlessly without supervision using +perceptual grouping. In this work, we propose a counter-intuitive computational +approach to solving unsupervised perceptual grouping and segmentation: that +they arise \textit{because} of neural noise, rather than in spite of it. We (1) +mathematically demonstrate that under realistic assumptions, neural noise can +be used to separate objects from each other; (2) that adding noise in a DNN +enables the network to segment images even though it was never trained on any +segmentation labels; and (3) that segmenting objects using noise results in +segmentation performance that aligns with the perceptual grouping phenomena +observed in humans, and is sample-efficient. We introduce the Good Gestalt (GG) +datasets -- six datasets designed to specifically test perceptual grouping, and +show that our DNN models reproduce many important phenomena in human +perception, such as illusory contours, closure, continuity, proximity, and +occlusion. Finally, we (4) show that our model improves performance on our GG +datasets compared to other tested unsupervised models by $24.9\%$. Together, +our results suggest a novel unsupervised segmentation method requiring few +assumptions, a new explanation for the formation of perceptual grouping, and a +novel potential benefit of neural noise. + +
+
+
+
+
+ + ♻ ☆ RoHM: Robust Human Motion Reconstruction via Diffusion + + +
+ We propose RoHM, an approach for robust 3D human motion reconstruction from +monocular RGB(-D) videos in the presence of noise and occlusions. Most previous +approaches either train neural networks to directly regress motion in 3D or +learn data-driven motion priors and combine them with optimization at test +time. The former do not recover globally coherent motion and fail under +occlusions; the latter are time-consuming, prone to local minima, and require +manual tuning. To overcome these shortcomings, we exploit the iterative, +denoising nature of diffusion models. RoHM is a novel diffusion-based motion +model that, conditioned on noisy and occluded input data, reconstructs +complete, plausible motions in consistent global coordinates. Given the +complexity of the problem -- requiring one to address different tasks +(denoising and infilling) in different solution spaces (local and global +motion) -- we decompose it into two sub-tasks and learn two models, one for +global trajectory and one for local motion. To capture the correlations between +the two, we then introduce a novel conditioning module, combining it with an +iterative inference scheme. We apply RoHM to a variety of tasks -- from motion +reconstruction and denoising to spatial and temporal infilling. Extensive +experiments on three popular datasets show that our method outperforms +state-of-the-art approaches qualitatively and quantitatively, while being +faster at test time. The code is available at +https://sanweiliti.github.io/ROHM/ROHM.html. + +
+
+ comment: With the appendix included +
+
+
+
+
+ + ♻ ☆ Neural Knitworks: Patched Neural Implicit Representation Networks + + +
+ Coordinate-based Multilayer Perceptron (MLP) networks, despite being capable +of learning neural implicit representations, are not performant for internal +image synthesis applications. Convolutional Neural Networks (CNNs) are +typically used instead for a variety of internal generative tasks, at the cost +of a larger model. We propose Neural Knitwork, an architecture for neural +implicit representation learning of natural images that achieves image +synthesis by optimizing the distribution of image patches in an adversarial +manner and by enforcing consistency between the patch predictions. To the best +of our knowledge, this is the first implementation of a coordinate-based MLP +tailored for synthesis tasks such as image inpainting, super-resolution, and +denoising. We demonstrate the utility of the proposed technique by training on +these three tasks. The results show that modeling natural images using patches, +rather than pixels, produces results of higher fidelity. The resulting model +requires 80% fewer parameters than alternative CNN-based solutions while +achieving comparable performance and training time. + +
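+ For context, the sketch below fits a plain coordinate-based MLP with random Fourier features to a single image, i.e., the baseline implicit-image idea this line of work builds on; it does not implement the patch-level adversarial consistency of Neural Knitworks, and the image, feature scale, and network sizes are arbitrary assumptions.
+ import torch, torch.nn as nn
+ 
+ H, W = 64, 64
+ image = torch.rand(H, W, 3)                            # stand-in for a natural image
+ ys, xs = torch.meshgrid(torch.linspace(0, 1, H),
+                         torch.linspace(0, 1, W), indexing='ij')
+ coords = torch.stack([xs, ys], dim=-1).reshape(-1, 2)  # (H*W, 2) pixel coordinates in [0, 1]
+ 
+ B = torch.randn(2, 64) * 10.0                          # random Fourier feature matrix
+ def encode(c):                                         # gamma(c) = [sin(2*pi*cB), cos(2*pi*cB)]
+     proj = 2 * torch.pi * c @ B
+     return torch.cat([proj.sin(), proj.cos()], dim=-1)
+ 
+ mlp = nn.Sequential(nn.Linear(128, 256), nn.ReLU(),
+                     nn.Linear(256, 256), nn.ReLU(),
+                     nn.Linear(256, 3), nn.Sigmoid())
+ opt = torch.optim.Adam(mlp.parameters(), 1e-3)
+ target = image.reshape(-1, 3)
+ for step in range(200):                                # fit the implicit representation
+     loss = (mlp(encode(coords)) - target).pow(2).mean()
+     opt.zero_grad(); loss.backward(); opt.step()
+ recon = mlp(encode(coords)).reshape(H, W, 3)           # query any coordinate afterwards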
+
+ comment: Published in Pattern Recognition +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 66 + +
+
+
+ + ☆ Orientation-conditioned Facial Texture Mapping for Video-based Facial + Remote Photoplethysmography Estimation + + +
+ Camera-based remote photoplethysmography (rPPG) enables contactless +measurement of important physiological signals such as pulse rate (PR). +However, dynamic and unconstrained subject motion introduces significant +variability into the facial appearance in video, confounding the ability of +video-based methods to accurately extract the rPPG signal. In this study, we +leverage the 3D facial surface to construct a novel orientation-conditioned +facial texture video representation which improves the motion robustness of +existing video-based facial rPPG estimation methods. Our proposed method +achieves a significant 18.2% performance improvement in cross-dataset testing +on MMPD over our baseline using the PhysNet model trained on PURE, highlighting +the efficacy and generalization benefits of our designed video representation. +We demonstrate significant performance improvements of up to 29.6% in all +tested motion scenarios in cross-dataset testing on MMPD, even in the presence +of dynamic and unconstrained subject motion. These results emphasize the +benefits of disentangling motion through modeling the 3D facial surface for +motion-robust facial rPPG estimation. We validate the efficacy of our design +decisions and the impact of different video processing steps through an +ablation study. Our findings illustrate the potential strengths of exploiting +the 3D facial surface as a general strategy for addressing dynamic and +unconstrained subject motion in videos. The code is available at +https://samcantrill.github.io/orientation-uv-rppg/. + 
+
+ comment: 12 pages, 8 figures, 6 tables +
+
+
+
+
+ + ☆ \textit{sweet} -- An Open Source Modular Platform for Contactless Hand + Vascular Biometric Experiments + + +
+ Current finger-vein or palm-vein recognition systems usually require direct +contact of the subject with the apparatus. This can be problematic in +environments where hygiene is of primary importance. In this work we present a +contactless vascular biometrics sensor platform named sweet, which can be used +for hand vascular biometrics studies (wrist-, palm- and finger-vein) and +surface features such as palmprint. It supports several acquisition modalities +such as multi-spectral Near-Infrared (NIR), RGB-color, Stereo Vision (SV) and +Photometric Stereo (PS). Using this platform we collect a dataset consisting of +the fingers, palm and wrist vascular data of 120 subjects and develop a +powerful 3D pipeline for the pre-processing of this data. We then present +biometric experimental results, focusing on Finger-Vein Recognition (FVR). +Finally, we discuss fusion of multiple modalities, such as palm-vein combined with +palm-print biometrics. The acquisition software, parts of the hardware design, +the new FV dataset, as well as source-code for our experiments are publicly +available for research purposes. + 
+
+
+
+
+ + ☆ Exploring Feedback Generation in Automated Skeletal Movement Assessment: + A Comprehensive Overview + + +
+ The application of machine-learning solutions to movement assessment from +skeleton videos has attracted significant research attention in recent years. +This advancement has made rehabilitation at home more accessible, utilizing +movement assessment algorithms that can operate on affordable equipment for +human pose detection from 2D or 3D videos. While the primary objective of +automatic assessment tasks is to score movements, the automatic generation of +feedback highlighting key movement issues has the potential to significantly +enhance and accelerate the rehabilitation process. In this study, we explain +the types of feedback that can be generated, review existing solutions for +automatic feedback generation, and discuss future research directions. To our +knowledge, this is the first comprehensive review of feedback generation in +skeletal movement assessment. + +
+
+
+
+
+ + ☆ Adversarial Robustness Limits via Scaling-Law and Human-Alignment + Studies + + +
+ This paper revisits the simple, long-studied, yet still unsolved problem of +making image classifiers robust to imperceptible perturbations. Taking CIFAR10 +as an example, SOTA clean accuracy is about $100$%, but SOTA robustness to +$\ell_{\infty}$-norm bounded perturbations barely exceeds $70$%. To understand +this gap, we analyze how model size, dataset size, and synthetic data quality +affect robustness by developing the first scaling laws for adversarial +training. Our scaling laws reveal inefficiencies in prior art and provide +actionable feedback to advance the field. For instance, we discovered that SOTA +methods diverge notably from compute-optimal setups, using excess compute for +their level of robustness. Leveraging a compute-efficient setup, we surpass the +prior SOTA with $20$% ($70$%) fewer training (inference) FLOPs. We trained +various compute-efficient models, with our best achieving $74$% AutoAttack +accuracy ($+3$% gain). However, our scaling laws also predict robustness slowly +grows then plateaus at $90$%: dwarfing our new SOTA by scaling is impractical, +and perfect robustness is impossible. To better understand this predicted +limit, we carry out a small-scale human evaluation on the AutoAttack data that +fools our top-performing model. Concerningly, we estimate that human +performance also plateaus near $90$%, which we show to be attributable to +$\ell_{\infty}$-constrained attacks' generation of invalid images not +consistent with their original labels. Having characterized limiting +roadblocks, we outline promising paths for future research. + +
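+ To make the notion of a robustness plateau concrete, the sketch below fits a saturating power law of the assumed form robustness(C) = R_inf - a * C^(-b) to made-up (compute, robust accuracy) points and reads off the fitted asymptote. The functional form, data points, and fitted constants are illustrative assumptions, not the paper's actual scaling law or its reported ~90% ceiling.
+ import numpy as np
+ from scipy.optimize import curve_fit
+ 
+ compute = np.array([1e18, 3e18, 1e19, 3e19, 1e20, 3e20])      # hypothetical training FLOPs
+ robust_acc = np.array([0.55, 0.62, 0.67, 0.70, 0.72, 0.735])  # hypothetical AutoAttack accuracy
+ 
+ c = compute / compute.min()                 # normalize compute for a well-conditioned fit
+ 
+ def law(c, r_inf, a, b):
+     return r_inf - a * c ** (-b)            # saturating power law with asymptote r_inf
+ 
+ (r_inf, a, b), _ = curve_fit(law, c, robust_acc, p0=[0.9, 0.3, 0.3], maxfev=20000)
+ print(f'fitted robustness ceiling: {r_inf:.3f}')
+ print('extrapolated robustness at 100x more compute:', law(c[-1] * 100, r_inf, a, b))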
+
+
+
+
+ + ☆ Face-voice Association in Multilingual Environments (FAME) Challenge + 2024 Evaluation Plan + + +
+ The advancements of technology have led to the use of multimodal systems in +various real-world applications. Among them, the audio-visual systems are one +of the widely used multimodal systems. In the recent years, associating face +and voice of a person has gained attention due to presence of unique +correlation between them. The Face-voice Association in Multilingual +Environments (FAME) Challenge 2024 focuses on exploring face-voice association +under a unique condition of multilingual scenario. This condition is inspired +from the fact that half of the world's population is bilingual and most often +people communicate under multilingual scenario. The challenge uses a dataset +namely, Multilingual Audio-Visual (MAV-Celeb) for exploring face-voice +association in multilingual environments. This report provides the details of +the challenge, dataset, baselines and task details for the FAME Challenge. + +
+
+ comment: ACM Multimedia Conference - Grand Challenge +
+
+
+
+
+ + ☆ Weight Copy and Low-Rank Adaptation for Few-Shot Distillation of Vision + Transformers + + +
+ Few-shot knowledge distillation recently emerged as a viable approach to +harness the knowledge of large-scale pre-trained models, using limited data and +computational resources. In this paper, we propose a novel few-shot feature +distillation approach for vision transformers. Our approach is based on two key +steps. Leveraging the fact that vision transformers have a consistent +depth-wise structure, we first copy the weights from intermittent layers of +existing pre-trained vision transformers (teachers) into shallower +architectures (students), where the intermittence factor controls the +complexity of the student transformer with respect to its teacher. Next, we +employ an enhanced version of Low-Rank Adaptation (LoRA) to distill knowledge +into the student in a few-shot scenario, aiming to recover the information +processing carried out by the skipped teacher layers. We present comprehensive +experiments with supervised and self-supervised transformers as teachers, on +five data sets from various domains, including natural, medical and satellite +images. The empirical results confirm the superiority of our approach over +competitive baselines. Moreover, the ablation results demonstrate the +usefulness of each component of the proposed pipeline. + +
+
+
+
+
+ + ☆ In My Perspective, In My Hands: Accurate Egocentric 2D Hand Pose and + Action Recognition + + +
+ Action recognition is essential for egocentric video understanding, allowing +automatic and continuous monitoring of Activities of Daily Living (ADLs) +without user effort. Existing literature focuses on 3D hand pose input, which +requires computationally intensive depth estimation networks or wearing an +uncomfortable depth sensor. In contrast, there has been insufficient research +in understanding 2D hand pose for egocentric action recognition, despite the +availability of user-friendly smart glasses in the market capable of capturing +a single RGB image. Our study aims to fill this research gap by exploring the +field of 2D hand pose estimation for egocentric action recognition, making two +contributions. Firstly, we introduce two novel approaches for 2D hand pose +estimation, namely EffHandNet for single-hand estimation and EffHandEgoNet, +tailored for an egocentric perspective, capturing interactions between hands +and objects. Both methods outperform state-of-the-art models on H2O and FPHA +public benchmarks. Secondly, we present a robust action recognition +architecture from 2D hand and object poses. This method incorporates +EffHandEgoNet, and a transformer-based action recognition method. Evaluated on +H2O and FPHA datasets, our architecture has a faster inference time and +achieves an accuracy of 91.32% and 94.43%, respectively, surpassing state of +the art, including 3D-based methods. Our work demonstrates that using 2D +skeletal data is a robust approach for egocentric action understanding. +Extensive evaluation and ablation studies show the impact of the hand pose +estimation approach, and how each input affects the overall performance. + +
+
+ comment: Accepted at: The 18th IEEE International Conference on Automatic Face + and Gesture Recognition +
+
+
+
+
+ + ☆ A Simple Strategy for Body Estimation from Partial-View Images CVPR + + +
+ Virtual try-on and product personalization have become increasingly important +in modern online shopping, highlighting the need for accurate body measurement +estimation. Although previous research has advanced in estimating 3D body +shapes from RGB images, the task is inherently ambiguous as the observed scale +of human subjects in the images depends on two unknown factors: capture +distance and body dimensions. This ambiguity is particularly pronounced in +partial-view scenarios. To address this challenge, we propose a modular and +simple height normalization solution. This solution relocates the subject +skeleton to the desired position, thereby normalizing the scale and +disentangling the relationship between the two variables. Our experimental +results demonstrate that integrating this technique into state-of-the-art human +mesh reconstruction models significantly enhances partial body measurement +estimation. Additionally, we illustrate the applicability of this approach to +multi-view settings, showcasing its versatility. + +
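+ The sketch below illustrates the general idea of such a normalization on 2-D keypoints: translate the skeleton so a root joint sits at a canonical location and rescale so a reference bone has a fixed length, removing the capture-distance ambiguity. The choice of root joint, reference bone, and target scale here are illustrative assumptions, not the paper's exact procedure.
+ import numpy as np
+ 
+ def normalize_skeleton(kpts, root_idx=0, ref_pair=(0, 1),
+                        target_root=(0.5, 0.5), target_len=0.25):
+     """kpts: (J, 2) keypoints in normalized image coordinates."""
+     kpts = np.asarray(kpts, dtype=float)
+     root = kpts[root_idx]
+     ref_len = np.linalg.norm(kpts[ref_pair[0]] - kpts[ref_pair[1]])
+     scale = target_len / max(ref_len, 1e-6)         # remove the capture-distance scale
+     return (kpts - root) * scale + np.asarray(target_root)
+ 
+ # e.g. pelvis (joint 0) and neck (joint 1) define a reference "torso" length
+ kpts = np.array([[0.62, 0.71], [0.60, 0.40], [0.55, 0.38], [0.70, 0.39]])
+ print(normalize_skeleton(kpts))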
+
+ comment: Accepted to CVPRW 2024 Computer Vision for Fashion, Art, and Design +
+
+
+
+
+ + ☆ A Novel State Space Model with Local Enhancement and State Sharing for + Image Fusion + + +
+ In image fusion tasks, images from different sources possess distinct +characteristics. This has driven the development of numerous methods to explore +better ways of fusing them while preserving their respective characteristics. +Mamba, as a state space model, has emerged in the field of natural language +processing. Recently, many studies have attempted to extend Mamba to vision +tasks. However, because images differ in nature from causal language +sequences, the limited state capacity of Mamba weakens its ability to model +image information. Additionally, the sequence modeling ability of Mamba only +captures spatial information and cannot effectively capture the rich spectral +information in images. Motivated by these challenges, we customize and improve +the vision Mamba network designed for the image fusion task. Specifically, we +propose the local-enhanced vision Mamba block, dubbed as LEVM. The LEVM block +can improve local information perception of the network and simultaneously +learn local and global spatial information. Furthermore, we propose the state +sharing technique to enhance spatial details and integrate spatial and spectral +information. Finally, the overall network is a multi-scale structure based on +vision Mamba, called LE-Mamba. Extensive experiments show the proposed methods +achieve state-of-the-art results on multispectral pansharpening and +multispectral and hyperspectral image fusion datasets, and demonstrate the +effectiveness of the proposed approach. Code will be made available. + 
+
+
+
+
+ + ☆ Bridging Data Islands: Geographic Heterogeneity-Aware Federated Learning + for Collaborative Remote Sensing Semantic Segmentation + + +
+ Remote sensing semantic segmentation (RSS) is an essential task in Earth +Observation missions. Due to data privacy concerns, high-quality remote sensing +images with annotations cannot be well shared among institutions, making it +difficult to fully utilize RSS data to train a generalized model. Federated +Learning (FL), a privacy-preserving collaborative learning technology, is a +potential solution. However, the current research on how to effectively apply +FL in RSS is still scarce and requires further investigation. Remote sensing +images in various institutions often exhibit strong geographical heterogeneity. +More specifically, it is reflected in terms of class-distribution heterogeneity +and object-appearance heterogeneity. Unfortunately, most existing FL studies +show inadequate focus on geographical heterogeneity, thus leading to +performance degradation in the global model. Considering the aforementioned +issues, we propose a novel Geographic Heterogeneity-Aware Federated Learning +(GeoFed) framework to address privacy-preserving RSS. Through Global Feature +Extension and Tail Regeneration modules, class-distribution heterogeneity is +alleviated. Additionally, we design an Essential Feature Mining strategy to +alleviate object-appearance heterogeneity by constructing essential features. +Extensive experiments on three datasets (i.e., FBP, CASID, Inria) show that our +GeoFed consistently outperforms the current state-of-the-art methods. The code +will be available publicly. + +
+
+ comment: 13 pages,9 figures, 4 tables +
+
+
+
+
+ + ☆ RoofDiffusion: Constructing Roofs from Severely Corrupted Point Data via + Diffusion + + +
+ Accurate completion and denoising of roof height maps are crucial to +reconstructing high-quality 3D buildings. Repairing sparse points can enhance +low-cost sensor use and reduce UAV flight overlap. RoofDiffusion is a new +end-to-end self-supervised diffusion technique for robustly completing roof +height maps, in particular difficult ones. RoofDiffusion leverages +widely-available curated footprints and can thus handle up to 99% point sparsity +and 80% roof area occlusion (regional incompleteness). A variant, No-FP +RoofDiffusion, simultaneously predicts building footprints and heights. Both +quantitatively outperform state-of-the-art unguided depth completion and +representative inpainting methods for Digital Elevation Models (DEM), on both a +roof-specific benchmark and the BuildingNet dataset. Qualitative assessments +show the effectiveness of RoofDiffusion for datasets with real-world scans +including AHN3, Dales3D, and USGS 3DEP LiDAR. Tested with the leading City3D +algorithm, preprocessing height maps with RoofDiffusion noticeably improves 3D +building reconstruction. RoofDiffusion is complemented by a new dataset of 13k +complex roof geometries, focusing on long-tail issues in remote sensing; a +novel simulation of tree occlusion; and a wide variety of large-area roof +cut-outs for data augmentation and benchmarking. + 
+
+
+
+
+ + ☆ SyntStereo2Real: Edge-Aware GAN for Remote Sensing Image-to-Image + Translation while Maintaining Stereo Constraint CVPR + + +
+ In the field of remote sensing, the scarcity of stereo-matched and +particularly lack of accurate ground truth data often hinders the training of +deep neural networks. The use of synthetically generated images as an +alternative, alleviates this problem but suffers from the problem of domain +generalization. Unifying the capabilities of image-to-image translation and +stereo-matching presents an effective solution to address the issue of domain +generalization. Current methods involve combining two networks, an unpaired +image-to-image translation network and a stereo-matching network, while jointly +optimizing them. We propose an edge-aware GAN-based network that effectively +tackles both tasks simultaneously. We obtain edge maps of input images from the +Sobel operator and use it as an additional input to the encoder in the +generator to enforce geometric consistency during translation. We additionally +include a warping loss calculated from the translated images to maintain the +stereo consistency. We demonstrate that our model produces qualitatively and +quantitatively superior results than existing models, and its applicability +extends to diverse domains, including autonomous driving. + +
+
+ comment: Accepted to IEEE Conference on Computer Vision and Pattern + Recognition Workshop (CVPRW) EarthVision +
+
+
+
+
+ + ☆ TrafficVLM: A Controllable Visual Language Model for Traffic Video + Captioning + + +
+ Traffic video description and analysis have received much attention recently +due to the growing demand for efficient and reliable urban surveillance +systems. Most existing methods only focus on locating traffic event segments, +which severely lack descriptive details related to the behaviour and context of +all the subjects of interest in the events. In this paper, we present +TrafficVLM, a novel multi-modal dense video captioning model for vehicle ego +camera view. TrafficVLM models traffic video events at different levels of +analysis, both spatially and temporally, and generates long fine-grained +descriptions for the vehicle and pedestrian at different phases of the event. +We also propose a conditional component for TrafficVLM to control the +generation outputs and a multi-task fine-tuning paradigm to enhance +TrafficVLM's learning capability. Experiments show that TrafficVLM performs +well on both vehicle and overhead camera views. Our solution achieved +outstanding results in Track 2 of the AI City Challenge 2024, ranking us third +in the challenge standings. Our code is publicly available at +https://github.com/quangminhdinh/TrafficVLM. + +
+
+
+
+
+ + ☆ VRS-NeRF: Visual Relocalization with Sparse Neural Radiance Field + + +
+ Visual relocalization is a key technique to autonomous driving, robotics, and +virtual/augmented reality. After decades of explorations, absolute pose +regression (APR), scene coordinate regression (SCR), and hierarchical methods +(HMs) have become the most popular frameworks. However, in spite of high +efficiency, APRs and SCRs have limited accuracy especially in large-scale +outdoor scenes; HMs are accurate but need to store a large number of 2D +descriptors for matching, resulting in poor efficiency. In this paper, we +propose an efficient and accurate framework, called VRS-NeRF, for visual +relocalization with sparse neural radiance field. Precisely, we introduce an +explicit geometric map (EGM) for 3D map representation and an implicit learning +map (ILM) for sparse patches rendering. In the localization process, the EGM +provides priors of sparse 2D points and the ILM utilizes these sparse points to +render patches with sparse NeRFs for matching. This allows us to discard a +large number of 2D descriptors so as to reduce the map size. Moreover, +rendering patches only for useful points rather than all pixels in the whole +image reduces the rendering time significantly. This framework inherits the +accuracy of HMs and discards their low efficiency. Experiments on 7Scenes, +CambridgeLandmarks, and Aachen datasets show that our method gives much better +accuracy than APRs and SCRs, and close performance to HMs but is much more +efficient. + 
+
+ comment: source code https://github.com/feixue94/vrs-nerf +
+
+
+
+
+ + ☆ PANet: A Physics-guided Parametric Augmentation Net for Image Dehazing + by Hazing + + +
+ Image dehazing faces challenges when dealing with hazy images in real-world +scenarios. A huge domain gap between synthetic and real-world haze images +degrades dehazing performance in practical settings. However, collecting +real-world image datasets for training dehazing models is challenging since +both hazy and clean pairs must be captured under the same conditions. In this +paper, we propose a Physics-guided Parametric Augmentation Network (PANet) that +generates photo-realistic hazy and clean training pairs to effectively enhance +real-world dehazing performance. PANet comprises a Haze-to-Parameter Mapper +(HPM) to project hazy images into a parameter space and a Parameter-to-Haze +Mapper (PHM) to map the resampled haze parameters back to hazy images. In the +parameter space, we can pixel-wisely resample individual haze parameter maps to +generate diverse hazy images with physically-explainable haze conditions unseen +in the training set. Our experimental results demonstrate that PANet can +augment diverse realistic hazy images to enrich existing hazy image benchmarks +so as to effectively boost the performances of state-of-the-art image dehazing +models. + +
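+ For reference, physics-guided haze synthesis typically builds on the standard atmospheric scattering model I(x) = J(x) t(x) + A (1 - t(x)) with transmission t(x) = exp(-beta d(x)). The sketch below synthesizes a hazy image from a clean image and a depth map under that model; PANet's learned haze-to-parameter mapping and per-pixel parameter resampling are not reproduced here, and the depth map and constants are placeholders.
+ import numpy as np
+ 
+ def add_haze(clean, depth, beta=1.2, airlight=(0.9, 0.9, 0.92)):
+     """clean: (H, W, 3) in [0, 1]; depth: (H, W) relative scene depth."""
+     t = np.exp(-beta * depth)[..., None]            # per-pixel transmission map
+     A = np.asarray(airlight).reshape(1, 1, 3)       # global atmospheric light
+     return clean * t + A * (1.0 - t)                # scattering model I = J*t + A*(1 - t)
+ 
+ H, W = 120, 160
+ clean = np.random.rand(H, W, 3)                     # stand-in for a clean image
+ depth = np.tile(np.linspace(0.2, 2.5, W), (H, 1))   # toy depth increasing to the right
+ hazy = add_haze(clean, depth)
+ print(hazy.shape, float(hazy.min()), float(hazy.max()))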
+
+
+
+
+ + ☆ Task-Driven Exploration: Decoupling and Inter-Task Feedback for Joint + Moment Retrieval and Highlight Detection + + +
+ Video moment retrieval and highlight detection are two highly valuable tasks +in video understanding, but only recently have they been studied jointly. +Although existing studies have made impressive advances recently, they +predominantly follow the data-driven bottom-up paradigm. Such a paradigm +overlooks task-specific and inter-task effects, resulting in poor model +performance. In this paper, we propose a novel task-driven top-down framework, +TaskWeave, for joint moment retrieval and highlight detection. The framework +introduces a task-decoupled unit to capture task-specific and common +representations. To investigate the interplay between the two tasks, we propose +an inter-task feedback mechanism, which transforms the results of one task into +guiding masks to assist the other task. Different from existing methods, we +present a task-dependent joint loss function to optimize the model. +Comprehensive experiments and in-depth ablation studies on the QVHighlights, TVSum, +and Charades-STA datasets corroborate the effectiveness and flexibility of the +proposed framework. Codes are available at +https://github.com/EdenGabriel/TaskWeave. + 
+
+
+
+
+ + ☆ FedCCL: Federated Dual-Clustered Feature Contrast Under Domain + Heterogeneity + + +
+ Federated learning (FL) facilitates a privacy-preserving neural network +training paradigm through collaboration between edge clients and a central +server. One significant challenge is that the distributed data is not +independently and identically distributed (non-IID), typically including both +intra-domain and inter-domain heterogeneity. However, recent research is +limited to simply using averaged signals as a form of regularization and only +focusing on one aspect of these non-IID challenges. Given these limitations, +this paper clarifies these two non-IID challenges and attempts to introduce +cluster representation to address them from both local and global perspectives. +Specifically, we propose a dual-clustered feature contrast-based FL framework +with dual focuses. First, we employ clustering on the local representations of +each client, aiming to capture intra-class information based on these local +clusters at a high level of granularity. Then, we facilitate cross-client +knowledge sharing by pulling the local representation closer to clusters shared +by clients with similar semantics while pushing them away from clusters with +dissimilar semantics. Second, since the sizes of local clusters belonging to +the same class may differ for each client, we further utilize clustering on the +global side and conduct averaging to create a consistent global signal for +guiding each local training in a contrastive manner. Experimental results on +multiple datasets demonstrate that our proposal achieves comparable or superior +performance gain under intra-domain and inter-domain heterogeneity. + +
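+ As a rough, hedged sketch of cluster-prototype contrast in general (not the paper's exact dual local/global procedure), the code below clusters features per class with k-means, treats the centroids as shared prototypes, and applies an InfoNCE-style loss that pulls a sample toward same-class prototypes and pushes it away from the rest. Feature dimensions, cluster counts, and the temperature are illustrative choices.
+ import torch
+ import torch.nn.functional as F
+ from sklearn.cluster import KMeans
+ 
+ def build_prototypes(feats, labels, clusters_per_class=2):
+     protos, proto_labels = [], []
+     for c in labels.unique():
+         fc = feats[labels == c].numpy()
+         km = KMeans(n_clusters=min(clusters_per_class, len(fc)), n_init=10).fit(fc)
+         protos.append(torch.tensor(km.cluster_centers_, dtype=torch.float32))
+         proto_labels += [int(c)] * km.n_clusters
+     return torch.cat(protos), torch.tensor(proto_labels)
+ 
+ def cluster_contrast_loss(z, y, protos, proto_labels, tau=0.1):
+     sim = F.normalize(z, dim=1) @ F.normalize(protos, dim=1).T / tau   # (B, K) similarities
+     pos_mask = (proto_labels[None] == y[:, None]).float()              # same-class prototypes
+     log_prob = sim - torch.logsumexp(sim, dim=1, keepdim=True)
+     return -(log_prob * pos_mask).sum(1).div(pos_mask.sum(1).clamp(min=1)).mean()
+ 
+ feats = torch.randn(200, 32); labels = torch.randint(0, 5, (200,))     # toy local features
+ protos, proto_labels = build_prototypes(feats, labels)
+ print(cluster_contrast_loss(feats, labels, protos, proto_labels))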
+
+
+
+
+ + ☆ TEXT2TASTE: A Versatile Egocentric Vision System for Intelligent Reading + Assistance Using Large Language Model + + +
+ The ability to read, understand and find important information from written +text is a critical skill in our daily lives for our independence, comfort and +safety. However, a significant part of our society is affected by partial +vision impairment, which leads to discomfort and dependency in daily +activities. To address the limitations of this part of society, we propose an +intelligent reading assistant based on smart glasses with embedded RGB cameras +and a Large Language Model (LLM), whose functionality goes beyond corrective +lenses. The video recorded from the egocentric perspective of a person wearing +the glasses is processed to localise text information using object detection +and optical character recognition methods. The LLM processes the data and +allows the user to interact with the text and responds to a given query, thus +extending the functionality of corrective lenses with the ability to find and +summarize knowledge from the text. To evaluate our method, we create a +chat-based application that allows the user to interact with the system. The +evaluation is conducted in a real-world setting, such as reading menus in a +restaurant, and involves four participants. The results show robust accuracy in +text retrieval. The system not only provides accurate meal suggestions but also +achieves high user satisfaction, highlighting the potential of smart glasses +and LLMs in assisting people with special needs. + +
+
+ comment: Accepted at ICCHP 2024 +
+
+
+
+
+ + ☆ Arena: A Patch-of-Interest ViT Inference Acceleration System for + Edge-Assisted Video Analytics + + +
+ The advent of edge computing has made real-time intelligent video analytics feasible. Previous works, based on traditional model architectures (e.g., CNNs, RNNs), employ various strategies to filter out non-region-of-interest content to minimize bandwidth and computation consumption, but show inferior performance in adverse environments. Recently, visual foundation models based on transformers have shown great performance in adverse environments due to their remarkable generalization capability. However, they require a large amount of computation power, which limits their application in real-time intelligent video analytics. In this paper, we find that visual foundation models like the Vision Transformer (ViT) also admit a dedicated acceleration mechanism for video analytics. To this end, we introduce Arena, an end-to-end edge-assisted video inference acceleration system based on ViT. We leverage ViT's ability to be accelerated through token pruning by offloading and feeding only Patches-of-Interest (PoIs) to the downstream models. Additionally, we employ probability-based patch sampling, which provides a simple yet efficient mechanism for determining the PoIs, i.e., the probable locations of objects in subsequent frames. Through extensive evaluations on public datasets, our findings reveal that Arena can boost inference speeds by up to $1.58\times$ and $1.82\times$ on average while consuming only 54% and 34% of the bandwidth, respectively, all with high inference accuracy.
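A minimal sketch of what probability-based patch sampling could look like: given a probability map over patch locations (e.g., built from the previous frame's detections), sample a budget of patches to offload while the rest are pruned. The probability-map construction and the budget are assumptions for illustration.

```python
import numpy as np

def sample_pois(prob_map: np.ndarray, budget: int, rng=None) -> np.ndarray:
    """Pick `budget` patch indices from an (H, W) probability map over patch locations.

    High-probability patches (near likely object locations in the next frame) are more
    likely to be offloaded as ViT tokens; the remaining patches are dropped.
    """
    rng = rng or np.random.default_rng(0)
    p = prob_map.flatten()
    p = p / p.sum()
    return rng.choice(p.size, size=min(budget, p.size), replace=False, p=p)

# toy usage: a 14x14 patch grid, keep 50 patch tokens
prob_map = np.ones((14, 14))
prob_map[4:9, 4:9] = 10.0          # objects were roughly here in the previous frame
kept = sample_pois(prob_map, budget=50)
print(len(kept), "patch tokens offloaded out of", prob_map.size)
```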
+
+
+
+
+ + ☆ Tri-modal Confluence with Temporal Dynamics for Scene Graph Generation + in Operating Rooms + + +
+ A comprehensive understanding of surgical scenes allows for monitoring of the +surgical process, reducing the occurrence of accidents and enhancing efficiency +for medical professionals. Semantic modeling within operating rooms, as a scene +graph generation (SGG) task, is challenging since it involves consecutive +recognition of subtle surgical actions over prolonged periods. To address this +challenge, we propose a Tri-modal (i.e., images, point clouds, and language) +confluence with Temporal dynamics framework, termed TriTemp-OR. Diverging from +previous approaches that integrated temporal information via memory graphs, our +method embraces two advantages: 1) we directly exploit bi-modal temporal +information from the video streaming for hierarchical feature interaction, and +2) the prior knowledge from Large Language Models (LLMs) is embedded to +alleviate the class-imbalance problem in the operating theatre. Specifically, +our model performs temporal interactions across 2D frames and 3D point clouds, +including a scale-adaptive multi-view temporal interaction (ViewTemp) and a +geometric-temporal point aggregation (PointTemp). Furthermore, we transfer +knowledge from the biomedical LLM, LLaVA-Med, to deepen the comprehension of +intraoperative relations. The proposed TriTemp-OR enables the aggregation of +tri-modal features through relation-aware unification to predict relations so +as to generate scene graphs. Experimental results on the 4D-OR benchmark +demonstrate the superior performance of our model for long-term OR streaming. + +
+
+ comment: 10 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ DreamScape: 3D Scene Creation via Gaussian Splatting joint Correlation + Modeling + + +
+ Recent progress in text-to-3D creation has been propelled by integrating the +potent prior of Diffusion Models from text-to-image generation into the 3D +domain. Nevertheless, generating 3D scenes characterized by multiple instances +and intricate arrangements remains challenging. In this study, we present +DreamScape, a method for creating highly consistent 3D scenes solely from +textual descriptions, leveraging the strong 3D representation capabilities of +Gaussian Splatting and the complex arrangement abilities of large language +models (LLMs). Our approach involves a 3D Gaussian Guide ($3{DG^2}$) for scene +representation, consisting of semantic primitives (objects) and their spatial +transformations and relationships derived directly from text prompts using +LLMs. This compositional representation allows for local-to-global optimization +of the entire scene. A progressive scale control is tailored during local +object generation, ensuring that objects of different sizes and densities adapt +to the scene, which addresses training instability issue arising from simple +blending in the subsequent global optimization stage. To mitigate potential +biases of LLM priors, we model collision relationships between objects at the +global level, enhancing physical correctness and overall realism. Additionally, +to generate pervasive objects like rain and snow distributed extensively across +the scene, we introduce a sparse initialization and densification strategy. +Experiments demonstrate that DreamScape offers high usability and +controllability, enabling the generation of high-fidelity 3D scenes from only +text prompts and achieving state-of-the-art performance compared to other +methods. + +
+
+
+
+
+ + ☆ Breast Cancer Image Classification Method Based on Deep Transfer + Learning + + +
+ To address the issues of limited samples, time-consuming feature design, and low accuracy in the detection and classification of breast cancer pathological images, a breast cancer image classification algorithm combining deep learning and transfer learning is proposed. The algorithm is based on the DenseNet structure of deep neural networks, constructs a network model by introducing attention mechanisms, and trains on the enhanced dataset using multi-level transfer learning. Experimental results demonstrate that the algorithm achieves an efficiency of over 84.0\% on the test set, with significantly improved classification accuracy compared to previous models, making it applicable to medical breast cancer detection tasks.
+
+
+
+
+ + ☆ DetCLIPv3: Towards Versatile Generative Open-vocabulary Object Detection CVPR2024 + + +
+ Existing open-vocabulary object detectors typically require a predefined set of categories from users, significantly confining their application scenarios. In this paper, we introduce DetCLIPv3, a high-performing detector that excels not only at open-vocabulary object detection but also at generating hierarchical labels for detected objects. DetCLIPv3 is characterized by three core designs: 1. Versatile model architecture: we derive a robust open-set detection framework which is further empowered with generation ability via the integration of a caption head. 2. High information density data: we develop an auto-annotation pipeline leveraging a visual large language model to refine captions for large-scale image-text pairs, providing rich, multi-granular object labels to enhance the training. 3. Efficient training strategy: we employ a pre-training stage with low-resolution inputs that enables the object captioner to efficiently learn a broad spectrum of visual concepts from extensive image-text paired data. This is followed by a fine-tuning stage that leverages a small number of high-resolution samples to further enhance detection performance. With these effective designs, DetCLIPv3 demonstrates superior open-vocabulary detection performance, e.g., our Swin-T backbone model achieves a notable 47.0 zero-shot fixed AP on the LVIS minival benchmark, outperforming GLIPv2, GroundingDINO, and DetCLIPv2 by 18.0/19.6/6.6 AP, respectively. DetCLIPv3 also achieves a state-of-the-art 19.7 AP on the dense captioning task on the VG dataset, showcasing its strong generative capability.
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ FedDistill: Global Model Distillation for Local Model De-Biasing in + Non-IID Federated Learning + + +
+ Federated Learning (FL) is a novel approach that allows for collaborative +machine learning while preserving data privacy by leveraging models trained on +decentralized devices. However, FL faces challenges due to non-uniformly +distributed (non-iid) data across clients, which impacts model performance and +its generalization capabilities. To tackle the non-iid issue, recent efforts +have utilized the global model as a teaching mechanism for local models. +However, our pilot study shows that their effectiveness is constrained by +imbalanced data distribution, which induces biases in local models and leads to +a 'local forgetting' phenomenon, where the ability of models to generalize +degrades over time, particularly for underrepresented classes. This paper +introduces FedDistill, a framework enhancing the knowledge transfer from the +global model to local models, focusing on the issue of imbalanced class +distribution. Specifically, FedDistill employs group distillation, segmenting +classes based on their frequency in local datasets to facilitate a focused +distillation process to classes with fewer samples. Additionally, FedDistill +dissects the global model into a feature extractor and a classifier. This +separation empowers local models with more generalized data representation +capabilities and ensures more accurate classification across all classes. +FedDistill mitigates the adverse effects of data imbalance, ensuring that local +models do not forget underrepresented classes but instead become more adept at +recognizing and classifying them accurately. Our comprehensive experiments +demonstrate FedDistill's effectiveness, surpassing existing baselines in +accuracy and convergence speed across several benchmark datasets. + +
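A minimal sketch of the group-distillation idea under stated assumptions: classes are split into frequency groups from the local label histogram, and a temperature-scaled KL distillation from the global (teacher) model to the local (student) model is averaged over groups so that rare classes are not drowned out. Group sizes, temperature, and weighting are illustrative choices, not the paper's exact recipe.

```python
import torch
import torch.nn.functional as F

def class_groups_by_frequency(labels: torch.Tensor, num_classes: int, num_groups: int = 3):
    """Split class ids into groups ordered by local frequency (rare classes first)."""
    counts = torch.bincount(labels, minlength=num_classes)
    order = torch.argsort(counts)
    return torch.chunk(order, num_groups)

def group_distill_loss(student_logits, teacher_logits, groups, T: float = 2.0):
    """Average per-group KL between global (teacher) and local (student) predictions."""
    loss = 0.0
    for g in groups:
        s = F.log_softmax(student_logits[:, g] / T, dim=-1)
        t = F.softmax(teacher_logits[:, g] / T, dim=-1)
        loss = loss + F.kl_div(s, t, reduction="batchmean") * (T * T)
    return loss / len(groups)

# toy usage
labels = torch.randint(0, 10, (256,))                 # local label histogram source
groups = class_groups_by_frequency(labels, num_classes=10)
s_logits, t_logits = torch.randn(32, 10), torch.randn(32, 10)
print(group_distill_loss(s_logits, t_logits, groups).item())
```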
+
+ comment: 13 pages, 9 figures, 5 tables +
+
+
+
+
+ + ☆ TextHawk: Exploring Efficient Fine-Grained Perception of Multimodal + Large Language Models + + +
+ Multimodal Large Language Models (MLLMs) have shown impressive results on various multimodal tasks. However, most existing MLLMs are not well suited for document-oriented tasks, which require fine-grained image perception and information compression. In this paper, we present TextHawk, an MLLM that is specifically designed for document-oriented tasks while preserving the general capabilities of MLLMs. TextHawk aims to achieve efficient fine-grained perception through four dedicated components. Firstly, a ReSampling and ReArrangement (ReSA) module is proposed to reduce the redundancy in the document texts and lower the computational cost of the MLLM. Secondly, we encode the position of each local feature with Scalable Positional Embeddings (SPEs), which preserve scalability across various image sizes. A Query Proposal Network (QPN) is then adopted to initialize the queries dynamically among different sub-images. To further enhance the fine-grained visual perceptual ability of the MLLM, we design a Multi-Level Cross-Attention (MLCA) mechanism that captures the hierarchical structure and semantic relations of document images. Furthermore, we create a new instruction-tuning dataset for document-oriented tasks by enriching the multimodal document data with Gemini Pro. We conduct extensive experiments on both general and document-oriented MLLM benchmarks, and show that TextHawk outperforms the state-of-the-art methods, demonstrating its effectiveness and superiority in fine-grained document perception and general abilities.
+
+
+
+
+ + ☆ FaceCat: Enhancing Face Recognition Security with a Unified Generative + Model Framework + + +
+ Face anti-spoofing (FAS) and adversarial detection (FAD) have been regarded as critical technologies to ensure the safety of face recognition systems. Because handling them separately limits practicality and generalization, some existing methods aim to devise a framework capable of detecting both threats concurrently. Nevertheless, these methods still encounter challenges of insufficient generalization and suboptimal robustness, potentially owing to the inherent drawbacks of discriminative models. Motivated by the rich structural and detailed features of face generative models, we propose FaceCat, which utilizes a face generative model as a pre-trained model to improve the performance of FAS and FAD. Specifically, FaceCat elaborately designs a hierarchical fusion mechanism to capture the rich face semantic features of the generative model. These features then serve as a robust foundation for a lightweight head designed to execute the FAS and FAD tasks simultaneously. As relying solely on single-modality data often leads to suboptimal performance, we further propose a novel text-guided multi-modal alignment strategy that utilizes text prompts to enrich feature representation, thereby enhancing performance. For fair evaluation, we build a comprehensive protocol with a wide range of 28 attack types to benchmark the performance. Extensive experiments validate that FaceCat generalizes significantly better and obtains excellent robustness against input transformations.
+
+ comment: Under review +
+
+
+
+
+ + ☆ Change Guiding Network: Incorporating Change Prior to Guide Change + Detection in Remote Sensing Imagery + + +
+ The rapid advancement of automated artificial intelligence algorithms and remote sensing instruments has benefited change detection (CD) tasks. However, there is still much room for improvement in precise detection, especially regarding the edge integrity and internal holes of change features. To solve these problems, we design the Change Guiding Network (CGNet) to tackle the insufficient expression of change features in the conventional U-Net structure adopted by previous methods, which causes inaccurate edge detection and internal holes. Change maps generated from deep features with rich semantic information are used as prior information to guide multi-scale feature fusion, which improves the expression ability of change features. Meanwhile, we propose a self-attention module named the Change Guide Module (CGM), which can effectively capture long-distance dependencies among pixels and overcome the insufficient receptive field of traditional convolutional neural networks. Extensive experiments and ablation studies on four major CD datasets verify the usefulness, efficiency, and effectiveness of CGNet. Our code is open-sourced at https://github.com/ChengxiHAN/CGNet-CD.
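A minimal sketch of using a coarse change map as a prior to guide multi-scale fusion, assuming the prior is predicted from deep features and used to reweight shallower features before they are fused. The module layout and names are illustrative, not CGNet's actual implementation.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ChangeGuidedFusion(nn.Module):
    """Predict a coarse change map from deep features and use it to reweight shallow features."""

    def __init__(self, deep_ch: int, shallow_ch: int):
        super().__init__()
        self.coarse_head = nn.Conv2d(deep_ch, 1, kernel_size=1)    # coarse change logits
        self.fuse = nn.Conv2d(shallow_ch + deep_ch, shallow_ch, kernel_size=3, padding=1)

    def forward(self, deep_feat, shallow_feat):
        prior = torch.sigmoid(self.coarse_head(deep_feat))          # (B, 1, h, w) change prior
        prior = F.interpolate(prior, size=shallow_feat.shape[-2:], mode="bilinear",
                              align_corners=False)
        deep_up = F.interpolate(deep_feat, size=shallow_feat.shape[-2:], mode="bilinear",
                                align_corners=False)
        guided = shallow_feat * (1.0 + prior)                       # emphasise likely-change pixels
        return self.fuse(torch.cat([guided, deep_up], dim=1)), prior

# toy usage
m = ChangeGuidedFusion(deep_ch=256, shallow_ch=64)
out, prior = m(torch.randn(1, 256, 16, 16), torch.randn(1, 64, 64, 64))
print(out.shape, prior.shape)
```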
+
+
+
+
+ + ☆ HANet: A Hierarchical Attention Network for Change Detection With + Bitemporal Very-High-Resolution Remote Sensing Images + + +
+ Benefiting from developments in deep learning technology, deep-learning-based algorithms employing automatic feature extraction have achieved remarkable performance on the change detection (CD) task. However, the performance of existing deep-learning-based CD methods is hindered by the imbalance between changed and unchanged pixels. To tackle this problem, this article proposes a progressive foreground-balanced sampling strategy that does not add extra change information, helping the model accurately learn the features of changed pixels during early training and thereby improving detection performance. Furthermore, we design a discriminative Siamese network, the hierarchical attention network (HANet), which can integrate multiscale features and refine detailed features. The main part of HANet is the HAN module, a lightweight and effective self-attention mechanism. Extensive experiments and ablation studies on two CD datasets with extremely unbalanced labels validate the effectiveness and efficiency of the proposed method.
+
+
+
+
+ + ☆ LoopAnimate: Loopable Salient Object Animation + + +
+ Research on diffusion model-based video generation has advanced rapidly. However, limitations in object fidelity and generation length hinder its practical applications. Additionally, specific domains like animated wallpapers require seamless looping, where the first and last frames of the video match seamlessly. To address these challenges, this paper proposes LoopAnimate, a novel method for generating videos with consistent start and end frames. To enhance object fidelity, we introduce a framework that decouples multi-level image appearance and textual semantic information. Building upon an image-to-image diffusion model, our approach incorporates both pixel-level and feature-level information from the input image, injecting image appearance and textual semantic embeddings at different positions of the diffusion model. Existing UNet-based video generation models require the entire video to be input during training so that temporal and positional information can be encoded at once. However, due to limitations in GPU memory, the number of frames is typically restricted to 16. To address this, we propose a three-stage training strategy with progressively increasing frame counts and a decreasing number of fine-tuned modules. Additionally, we introduce the Temporal Enhanced Motion Module (TEMM) to extend the capacity for encoding temporal and positional information up to 36 frames. The proposed LoopAnimate, for the first time, extends the single-pass generation length of UNet-based video generation models to 35 frames while maintaining high-quality video generation. Experiments demonstrate that LoopAnimate achieves state-of-the-art performance in both objective metrics, such as fidelity and temporal consistency, and subjective evaluation results.
+
+
+
+
+ + ☆ Coreset Selection for Object Detection CVPR 2024 + + +
+ Coreset selection is a method for selecting a small, representative subset of +an entire dataset. It has been primarily researched in image classification, +assuming there is only one object per image. However, coreset selection for +object detection is more challenging as an image can contain multiple objects. +As a result, much research has yet to be done on this topic. Therefore, we +introduce a new approach, Coreset Selection for Object Detection (CSOD). CSOD +generates imagewise and classwise representative feature vectors for multiple +objects of the same class within each image. Subsequently, we adopt submodular +optimization for considering both representativeness and diversity and utilize +the representative vectors in the submodular optimization process to select a +subset. When we evaluated CSOD on the Pascal VOC dataset, CSOD outperformed +random selection by +6.4%p in AP$_{50}$ when selecting 200 images. + +
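A minimal sketch of how representativeness-driven selection over per-image feature vectors can be done greedily with a submodular (facility-location-style) objective. This is a generic illustration under stated assumptions, not CSOD's exact objective, which also incorporates classwise vectors and diversity terms.

```python
import numpy as np

def greedy_coreset(feats: np.ndarray, budget: int) -> list:
    """Greedy facility-location selection over (N, D) L2-normalised per-image vectors.

    Each step adds the image that most improves how well the selected subset
    covers (is similar to) the full dataset.
    """
    sim = feats @ feats.T                       # (N, N) cosine similarities
    best_cover = np.zeros(len(feats))           # current coverage of each image
    selected = []
    for _ in range(budget):
        gains = np.maximum(sim, best_cover[None, :]).sum(axis=1) - best_cover.sum()
        gains[selected] = -np.inf               # never re-select an image
        j = int(np.argmax(gains))
        selected.append(j)
        best_cover = np.maximum(best_cover, sim[j])
    return selected

# toy usage
rng = np.random.default_rng(0)
f = rng.normal(size=(500, 64))
f /= np.linalg.norm(f, axis=1, keepdims=True)
print(greedy_coreset(f, budget=20)[:5])
```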
+
+ comment: Accepted by CVPR 2024: 1st Workshop on Dataset Distillation for + Computer Vision +
+
+
+
+
+ + ☆ StreakNet-Arch: An Anti-scattering Network-based Architecture for + Underwater Carrier LiDAR-Radar Imaging + + +
+ In this paper, we introduce StreakNet-Arch, a novel signal processing architecture designed for Underwater Carrier LiDAR-Radar (UCLR) imaging systems, to address their limitations in scatter suppression and real-time imaging. StreakNet-Arch formulates the signal processing as a real-time, end-to-end binary classification task, enabling real-time image acquisition. To achieve this, we leverage Self-Attention networks and propose a novel Double Branch Cross Attention (DBC-Attention) mechanism that surpasses the performance of traditional methods. Furthermore, we present a method for embedding streak-tube camera images into attention networks, effectively acting as a learned bandpass filter. To facilitate further research, we contribute a publicly available streak-tube camera image dataset containing 2,695,168 real-world underwater 3D point cloud data points. These advancements significantly improve UCLR capabilities, enhancing its performance and applicability in underwater imaging tasks. The source code and dataset can be found at https://github.com/BestAnHongjun/StreakNet.
+
+
+
+
+ + ☆ Fusion-Mamba for Cross-modality Object Detection + + +
+ Fusing complementary information across modalities effectively improves object detection performance, making detectors more useful and robust for a wider range of applications. Existing fusion strategies combine different types of images or merge different backbone features through elaborate neural network modules. However, these methods neglect that modality disparities, such as differing camera focal lengths, placements, and angles, make cross-modal features hard to fuse, which degrades fusion performance. In this paper, we investigate cross-modality fusion by associating cross-modal features in a hidden state space based on an improved Mamba with a gating mechanism. We design a Fusion-Mamba block (FMB) to map cross-modal features into a hidden state space for interaction, thereby reducing disparities between cross-modal features and enhancing the representation consistency of fused features. FMB contains two modules: the State Space Channel Swapping (SSCS) module facilitates shallow feature fusion, and the Dual State Space Fusion (DSSF) module enables deep fusion in a hidden state space. Through extensive experiments on public datasets, our proposed approach outperforms state-of-the-art methods in $m$AP by 5.9% on $M^3FD$ and 4.9% on the FLIR-Aligned dataset, demonstrating superior object detection performance. To the best of our knowledge, this is the first work to explore the potential of Mamba for cross-modal fusion and establish a new baseline for cross-modality object detection.
+
+
+
+
+ + ☆ GCC: Generative Calibration Clustering + + +
+ Deep clustering as an important branch of unsupervised representation +learning focuses on embedding semantically similar samples into the identical +feature space. This core demand inspires the exploration of contrastive +learning and subspace clustering. However, these solutions always rely on the +basic assumption that there are sufficient and category-balanced samples for +generating valid high-level representation. This hypothesis actually is too +strict to be satisfied for real-world applications. To overcome such a +challenge, the natural strategy is utilizing generative models to augment +considerable instances. How to use these novel samples to effectively fulfill +clustering performance improvement is still difficult and under-explored. In +this paper, we propose a novel Generative Calibration Clustering (GCC) method +to delicately incorporate feature learning and augmentation into clustering +procedure. First, we develop a discriminative feature alignment mechanism to +discover intrinsic relationship across real and generated samples. Second, we +design a self-supervised metric learning to generate more reliable cluster +assignment to boost the conditional diffusion generation. Extensive +experimental results on three benchmarks validate the effectiveness and +advantage of our proposed method over the state-of-the-art methods. + +
+
+
+
+
+ + ☆ Exploring Generative AI for Sim2Real in Driving Data Synthesis + + +
+ Datasets are essential for training and testing vehicle perception algorithms. However, the collection and annotation of real-world images is time-consuming and expensive. Driving simulators offer a solution by automatically generating various driving scenarios with corresponding annotations, but the simulation-to-reality (Sim2Real) domain gap remains a challenge. While most generative artificial intelligence (AI) approaches follow the de facto Generative Adversarial Network (GAN)-based methods, recently emerging diffusion probabilistic models have not been fully explored for mitigating Sim2Real challenges in driving data synthesis. To explore their performance, this paper applies three different generative AI methods that leverage semantic label maps from a driving simulator as a bridge for creating realistic datasets. A comparative analysis of these methods is presented from the perspectives of image quality and perception. New synthetic datasets, which include driving images and auto-generated high-quality annotations, are produced with low cost and high scene variability. The experimental results show that although GAN-based methods are adept at generating high-quality images when provided with manually annotated labels, ControlNet produces synthetic datasets with fewer artefacts and more structural fidelity when using simulator-generated labels. This suggests that the diffusion-based approach may provide improved stability and an alternative method for addressing Sim2Real challenges.
+
+
+
+
+ + ☆ EGGS: Edge Guided Gaussian Splatting for Radiance Fields + + +
+ Gaussian splatting methods are becoming popular. However, their loss function only contains the $\ell_1$ norm and the structural similarity between the rendered and input images, without considering the edges in these images. It is well known that the edges in an image provide important information. Therefore, in this paper, we propose an Edge Guided Gaussian Splatting (EGGS) method that leverages the edges in the input images. More specifically, we give edge regions a higher weight than flat regions. With such edge guidance, the resulting Gaussian particles focus more on the edges instead of the flat regions. Moreover, such edge guidance does not increase the computation cost during the training and rendering stages. The experiments confirm that such a simple edge-weighted loss function indeed improves results by about $1\sim2$ dB on several different datasets. By simply plugging in the edge guidance, the proposed method can improve all Gaussian splatting methods in different scenarios, such as human head modeling, building 3D reconstruction, etc.
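A minimal sketch of an edge-weighted rendering loss, assuming the per-pixel weight comes from a Sobel edge magnitude of the target image; the exact edge detector and weighting in EGGS may differ.

```python
import torch
import torch.nn.functional as F

def sobel_edges(img: torch.Tensor) -> torch.Tensor:
    """Per-pixel edge magnitude of a (B, 3, H, W) image in [0, 1]."""
    gray = img.mean(dim=1, keepdim=True)
    kx = torch.tensor([[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]]).view(1, 1, 3, 3)
    ky = kx.transpose(-1, -2)
    gx = F.conv2d(gray, kx, padding=1)
    gy = F.conv2d(gray, ky, padding=1)
    return torch.sqrt(gx ** 2 + gy ** 2 + 1e-8)

def edge_weighted_l1(rendered: torch.Tensor, target: torch.Tensor, edge_gain: float = 2.0):
    """L1 loss where pixels on the target image's edges receive a higher weight."""
    w = 1.0 + edge_gain * sobel_edges(target)     # flat regions -> ~1, edges -> higher
    return (w * (rendered - target).abs()).mean()

# toy usage
target = torch.rand(1, 3, 64, 64)
rendered = torch.rand(1, 3, 64, 64)
print(edge_weighted_l1(rendered, target).item())
```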
+
+
+
+
+ + ☆ VideoSAGE: Video Summarization with Graph Representation Learning + + +
+ We propose a graph-based representation learning framework for video summarization. First, we convert an input video to a graph where nodes correspond to each of the video frames. Then, we impose sparsity on the graph by connecting only those pairs of nodes that are within a specified temporal distance. We then formulate video summarization as a binary node classification problem, deciding for each video frame whether it should belong to the output summary video. A graph constructed this way aims to capture long-range interactions among video frames, and the sparsity ensures the model trains without hitting the memory and compute bottleneck. Experiments on two datasets (SumMe and TVSum) demonstrate the effectiveness of the proposed nimble model compared to existing state-of-the-art summarization approaches, while being one order of magnitude more efficient in compute time and memory.
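A minimal sketch of the sparse frame graph described above: edges connect only frames within a temporal distance, yielding an edge index a GNN can consume before a per-node keep/drop classifier. The distance threshold and downstream model are assumptions.

```python
import torch

def temporal_edge_index(num_frames: int, max_dist: int = 10) -> torch.Tensor:
    """Edges (2, E) connecting every pair of frames at most `max_dist` apart, both directions."""
    src, dst = [], []
    for i in range(num_frames):
        for j in range(max(0, i - max_dist), min(num_frames, i + max_dist + 1)):
            if i != j:
                src.append(i)
                dst.append(j)
    return torch.tensor([src, dst], dtype=torch.long)

# toy usage: 300 frames, sparse graph instead of a fully connected one
edge_index = temporal_edge_index(300, max_dist=10)
print(edge_index.shape)   # roughly 300 * 20 edges rather than 300 * 299
# a GNN over (frame_features, edge_index) then predicts a binary keep/drop label per frame
```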
+
+ comment: arXiv admin note: text overlap with arXiv:2207.07783 +
+
+
+
+
+ + ♻ ☆ In-N-Out: Faithful 3D GAN Inversion with Volumetric Decomposition for + Face Editing + + +
+ 3D-aware GANs offer new capabilities for view synthesis while preserving the +editing functionalities of their 2D counterparts. GAN inversion is a crucial +step that seeks the latent code to reconstruct input images or videos, +subsequently enabling diverse editing tasks through manipulation of this latent +code. However, a model pre-trained on a particular dataset (e.g., FFHQ) often +has difficulty reconstructing images with out-of-distribution (OOD) objects +such as faces with heavy make-up or occluding objects. We address this issue by +explicitly modeling OOD objects from the input in 3D-aware GANs. Our core idea +is to represent the image using two individual neural radiance fields: one for +the in-distribution content and the other for the out-of-distribution object. +The final reconstruction is achieved by optimizing the composition of these two +radiance fields with carefully designed regularization. We demonstrate that our +explicit decomposition alleviates the inherent trade-off between reconstruction +fidelity and editability. We evaluate reconstruction accuracy and editability +of our method on challenging real face images and videos and showcase favorable +results against other baselines. + +
+
+ comment: Project page: https://in-n-out-3d.github.io/ +
+
+
+
+
+ + ♻ ☆ Gaussian Splatting SLAM CVPR2024 + + +
+ We present the first application of 3D Gaussian Splatting in monocular SLAM, +the most fundamental but the hardest setup for Visual SLAM. Our method, which +runs live at 3fps, utilises Gaussians as the only 3D representation, unifying +the required representation for accurate, efficient tracking, mapping, and +high-quality rendering. Designed for challenging monocular settings, our +approach is seamlessly extendable to RGB-D SLAM when an external depth sensor +is available. Several innovations are required to continuously reconstruct 3D +scenes with high fidelity from a live camera. First, to move beyond the +original 3DGS algorithm, which requires accurate poses from an offline +Structure from Motion (SfM) system, we formulate camera tracking for 3DGS using +direct optimisation against the 3D Gaussians, and show that this enables fast +and robust tracking with a wide basin of convergence. Second, by utilising the +explicit nature of the Gaussians, we introduce geometric verification and +regularisation to handle the ambiguities occurring in incremental 3D dense +reconstruction. Finally, we introduce a full SLAM system which not only +achieves state-of-the-art results in novel view synthesis and trajectory +estimation but also reconstruction of tiny and even transparent objects. + +
+
+ comment: CVPR2024 Highlight. First two authors contributed equally to this + work. Project Page: https://rmurai.co.uk/projects/GaussianSplattingSLAM/ +
+
+
+
+
+ + ♻ ☆ OmniControl: Control Any Joint at Any Time for Human Motion Generation ICLR 2024 + + +
+ We present a novel approach named OmniControl for incorporating flexible +spatial control signals into a text-conditioned human motion generation model +based on the diffusion process. Unlike previous methods that can only control +the pelvis trajectory, OmniControl can incorporate flexible spatial control +signals over different joints at different times with only one model. +Specifically, we propose analytic spatial guidance that ensures the generated +motion can tightly conform to the input control signals. At the same time, +realism guidance is introduced to refine all the joints to generate more +coherent motion. Both the spatial and realism guidance are essential and they +are highly complementary for balancing control accuracy and motion realism. By +combining them, OmniControl generates motions that are realistic, coherent, and +consistent with the spatial constraints. Experiments on HumanML3D and KIT-ML +datasets show that OmniControl not only achieves significant improvement over +state-of-the-art methods on pelvis control but also shows promising results +when incorporating the constraints over other joints. + +
+
+ comment: ICLR 2024. Project page: https://neu-vi.github.io/omnicontrol/ +
+
+
+
+
+ + ♻ ☆ Analysis of the Two-Step Heterogeneous Transfer Learning for Laryngeal + Blood Vessel Classification: Issue and Improvement + + +
+ Accurate classification of laryngeal vasculature as benign or malignant is crucial for early detection of laryngeal cancer. However, organizations with limited access to laryngeal vascular images face challenges due to the lack of large and homogeneous public datasets for effective learning. In contrast to most familiar works, which directly transfer ImageNet pre-trained models to the target domain for fine-tuning, this work pioneers two-step heterogeneous transfer learning (THTL) for laryngeal lesion classification with nine deep-learning models, utilizing diabetic retinopathy color fundus images, which are semantically non-identical yet still vascular, as the intermediate domain. The attention visualization technique Layer Class Activation Map (LayerCAM) reveals a novel finding: although both the intermediate and the target domain reflect vascular structure to a certain extent, the prevalent radial vascular pattern in the intermediate domain prevents the model from learning the twisted and tangled vessels that distinguish the malignant class in the target domain. This summarizes a vital rule for laryngeal lesion classification using THTL. To address this, we introduce an enhanced fine-tuning strategy in THTL called Step-Wise Fine-Tuning (SWFT) and apply it to ResNet models. SWFT progressively refines model performance by accumulating fine-tuning layers from back to front, guided by the visualization results of LayerCAM. Compared with the original THTL approach, the accuracy and malignant recall increase by 26.1% and 79.8% for ResNet18, respectively, while for ResNet50 these indicators improve by 20.4% and 62.2%, respectively.
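A minimal sketch of step-wise fine-tuning on a ResNet: start with only the classifier trainable, then unfreeze residual stages back to front between rounds. The exact unfreezing schedule and which checkpoint is used as the starting point are assumptions for illustration.

```python
import torch
import torchvision

def unfreeze_from_back(model: torchvision.models.ResNet, num_blocks: int):
    """Freeze everything, then unfreeze the classifier plus the last `num_blocks` stages."""
    for p in model.parameters():
        p.requires_grad = False
    stages = [model.fc, model.layer4, model.layer3, model.layer2, model.layer1]
    for stage in stages[: num_blocks + 1]:           # the classifier (fc) is always trainable
        for p in stage.parameters():
            p.requires_grad = True

# in THTL this would be the model already fine-tuned on the intermediate domain
model = torchvision.models.resnet18(weights=None)
model.fc = torch.nn.Linear(model.fc.in_features, 2)  # benign vs. malignant head

for step, blocks in enumerate([0, 1, 2]):             # accumulate trainable layers back to front
    unfreeze_from_back(model, blocks)
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"step {step}: {trainable} trainable parameters")
    # ... run one fine-tuning round on the target domain here ...
```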
+
+
+
+
+ + ♻ ☆ VMambaMorph: a Multi-Modality Deformable Image Registration Framework + based on Visual State Space Model with Cross-Scan Module + + +
+ Image registration, a critical process in medical imaging, involves aligning different sets of medical imaging data into a single unified coordinate system. Deep learning networks, such as the Convolutional Neural Network (CNN)-based VoxelMorph, Vision Transformer (ViT)-based TransMorph, and State Space Model (SSM)-based MambaMorph, have demonstrated effective performance in this domain. The recent Visual State Space Model (VMamba), which incorporates a cross-scan module with SSM, has exhibited promising improvements in modeling global-range dependencies with efficient computational cost in computer vision tasks. This paper explores VMamba for image registration and introduces VMambaMorph. This novel hybrid VMamba-CNN network is designed specifically for 3D image registration. Utilizing a U-shaped network architecture, VMambaMorph computes the deformation field based on target and source volumes. The VMamba-based block with a 2D cross-scan module is redesigned for 3D volumetric feature processing. To overcome the complex motion and structure of multi-modality images, we further propose a recursive registration framework with fine-tuning. We validate VMambaMorph using a public benchmark brain MR-CT registration dataset, comparing its performance against current state-of-the-art methods. The results indicate that VMambaMorph achieves competitive registration quality. The code for VMambaMorph with all baseline methods is available on GitHub.
+
+
+
+
+ + ♻ ☆ Image Restoration by Denoising Diffusion Models with Iteratively + Preconditioned Guidance CVPR 2024 + + +
+ Training deep neural networks has become a common approach for addressing +image restoration problems. An alternative for training a "task-specific" +network for each observation model is to use pretrained deep denoisers for +imposing only the signal's prior within iterative algorithms, without +additional training. Recently, a sampling-based variant of this approach has +become popular with the rise of diffusion/score-based generative models. Using +denoisers for general purpose restoration requires guiding the iterations to +ensure agreement of the signal with the observations. In low-noise settings, +guidance that is based on back-projection (BP) has been shown to be a promising +strategy (used recently also under the names "pseudoinverse" or +"range/null-space" guidance). However, the presence of noise in the +observations hinders the gains from this approach. In this paper, we propose a +novel guidance technique, based on preconditioning that allows traversing from +BP-based guidance to least squares based guidance along the restoration scheme. +The proposed approach is robust to noise while still having much simpler +implementation than alternative methods (e.g., it does not require SVD or a +large number of iterations). We use it within both an optimization scheme and a +sampling-based scheme, and demonstrate its advantages over existing methods for +image deblurring and super-resolution. + +
+
+ comment: CVPR 2024 (camera-ready). Code can be found at: + https://github.com/tirer-lab/DDPG +
+
+
+
+
+ + ♻ ☆ SwiftBrush: One-Step Text-to-Image Diffusion Model with Variational + Score Distillation CVPR 2024 + + +
+ Despite their ability to generate high-resolution and diverse images from +text prompts, text-to-image diffusion models often suffer from slow iterative +sampling processes. Model distillation is one of the most effective directions +to accelerate these models. However, previous distillation methods fail to +retain the generation quality while requiring a significant amount of images +for training, either from real data or synthetically generated by the teacher +model. In response to this limitation, we present a novel image-free +distillation scheme named $\textbf{SwiftBrush}$. Drawing inspiration from +text-to-3D synthesis, in which a 3D neural radiance field that aligns with the +input prompt can be obtained from a 2D text-to-image diffusion prior via a +specialized loss without the use of any 3D data ground-truth, our approach +re-purposes that same loss for distilling a pretrained multi-step text-to-image +model to a student network that can generate high-fidelity images with just a +single inference step. In spite of its simplicity, our model stands as one of +the first one-step text-to-image generators that can produce images of +comparable quality to Stable Diffusion without reliance on any training image +data. Remarkably, SwiftBrush achieves an FID score of $\textbf{16.67}$ and a +CLIP score of $\textbf{0.29}$ on the COCO-30K benchmark, achieving competitive +results or even substantially surpassing existing state-of-the-art distillation +techniques. + +
+
+ comment: Accepted to CVPR 2024; Project Page: + https://thuanz123.github.io/swiftbrush/ +
+
+
+
+
+ + ♻ ☆ TFNet: Exploiting Temporal Cues for Fast and Accurate LiDAR Semantic + Segmentation CVPR2024 + + +
+ LiDAR semantic segmentation plays a crucial role in enabling autonomous +driving and robots to understand their surroundings accurately and robustly. A +multitude of methods exist within this domain, including point-based, +range-image-based, polar-coordinate-based, and hybrid strategies. Among these, +range-image-based techniques have gained widespread adoption in practical +applications due to their efficiency. However, they face a significant +challenge known as the ``many-to-one'' problem caused by the range image's +limited horizontal and vertical angular resolution. As a result, around 20% of +the 3D points can be occluded. In this paper, we present TFNet, a +range-image-based LiDAR semantic segmentation method that utilizes temporal +information to address this issue. Specifically, we incorporate a temporal +fusion layer to extract useful information from previous scans and integrate it +with the current scan. We then design a max-voting-based post-processing +technique to correct false predictions, particularly those caused by the +``many-to-one'' issue. We evaluated the approach on two benchmarks and +demonstrated that the plug-in post-processing technique is generic and can be +applied to various networks. + +
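A minimal sketch of max-voting post-processing under stated assumptions: given per-point labels predicted in several consecutive scans for the same (already associated) points, keep the majority label. How points are associated across scans is outside this sketch, and the names are hypothetical.

```python
import numpy as np

def max_vote(labels_per_scan: np.ndarray, num_classes: int) -> np.ndarray:
    """Majority vote over per-point predictions from several scans.

    labels_per_scan: (S, N) integer labels for the same N associated points
                     predicted in S consecutive scans.
    """
    n_points = labels_per_scan.shape[1]
    votes = np.zeros((n_points, num_classes), dtype=np.int64)
    for scan_labels in labels_per_scan:
        np.add.at(votes, (np.arange(n_points), scan_labels), 1)
    return votes.argmax(axis=1)

# toy usage: 3 scans, 5 points, 4 classes; the third point's outlier prediction is corrected
preds = np.array([[0, 1, 2, 3, 1],
                  [0, 1, 1, 3, 1],
                  [0, 1, 2, 3, 1]])
print(max_vote(preds, num_classes=4))   # [0 1 2 3 1]
```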
+
+ comment: accepted by CVPR2024 Workshop on Autonomous Driving +
+
+
+
+
+ + ♻ ☆ InstantMesh: Efficient 3D Mesh Generation from a Single Image with + Sparse-view Large Reconstruction Models + + +
+ We present InstantMesh, a feed-forward framework for instant 3D mesh generation from a single image, featuring state-of-the-art generation quality and significant training scalability. By synergizing the strengths of an off-the-shelf multiview diffusion model and a sparse-view reconstruction model based on the LRM architecture, InstantMesh is able to create diverse 3D assets within 10 seconds. To enhance the training efficiency and exploit more geometric supervision, e.g., depths and normals, we integrate a differentiable iso-surface extraction module into our framework and directly optimize on the mesh representation. Experimental results on public datasets demonstrate that InstantMesh significantly outperforms other latest image-to-3D baselines, both qualitatively and quantitatively. We release all the code, weights, and demo of InstantMesh, with the intention that it can make substantial contributions to the community of 3D generative AI and empower both researchers and content creators.
+
+ comment: Technical report. Project: https://github.com/TencentARC/InstantMesh +
+
+
+
+
+ + ♻ ☆ Specialty-Oriented Generalist Medical AI for Chest CT Screening + + +
+ Modern medical records include a vast amount of multimodal free text clinical +data and imaging data from radiology, cardiology, and digital pathology. Fully +mining such big data requires multitasking; otherwise, occult but important +aspects may be overlooked, adversely affecting clinical management and +population healthcare. Despite remarkable successes of AI in individual tasks +with single-modal data, the progress in developing generalist medical AI +remains relatively slow to combine multimodal data for multitasks because of +the dual challenges of data curation and model architecture. The data challenge +involves querying and curating multimodal structured and unstructured text, +alphanumeric, and especially 3D tomographic scans on an individual patient +level for real-time decisions and on a scale to estimate population health +statistics. The model challenge demands a scalable and adaptable network +architecture to integrate multimodal datasets for diverse clinical tasks. Here +we propose the first-of-its-kind medical multimodal-multitask foundation model +(M3FM) with application in lung cancer screening and related tasks. After we +curated a comprehensive multimodal multitask dataset consisting of 49 clinical +data types including 163,725 chest CT series and 17 medical tasks involved in +LCS, we develop a multimodal question-answering framework as a unified training +and inference strategy to synergize multimodal information and perform multiple +tasks via free-text prompting. M3FM consistently outperforms the +state-of-the-art single-modal task-specific models, identifies multimodal data +elements informative for clinical tasks and flexibly adapts to new tasks with a +small out-of-distribution dataset. As a specialty-oriented generalist medical +AI model, M3FM paves the way for similar breakthroughs in other areas of +medicine, closing the gap between specialists and the generalist. + +
+
+
+
+
+ + ♻ ☆ Domain Generalization for Crop Segmentation with Standardized Ensemble + Knowledge Distillation + + +
+ In recent years, precision agriculture has gradually oriented farming closer +to automation processes to support all the activities related to field +management. Service robotics plays a predominant role in this evolution by +deploying autonomous agents that can navigate fields while performing tasks +such as monitoring, spraying, and harvesting without human intervention. To +execute these precise actions, mobile robots need a real-time perception system +that understands their surroundings and identifies their targets in the wild. +Existing methods, however, often fall short in generalizing to new crops and +environmental conditions. This limit is critical for practical applications +where labeled samples are rarely available. In this paper, we investigate the +problem of crop segmentation and propose a novel approach to enhance domain +generalization using knowledge distillation. In the proposed framework, we +transfer knowledge from a standardized ensemble of models individually trained +on source domains to a student model that can adapt to unseen realistic +scenarios. To support the proposed method, we present a synthetic multi-domain +dataset for crop segmentation containing plants of variegate species and +covering different terrain styles, weather conditions, and light scenarios for +more than 70,000 samples. We demonstrate significant improvements in +performance over state-of-the-art methods and superior sim-to-real +generalization. Our approach provides a promising solution for domain +generalization in crop segmentation and has the potential to enhance a wide +variety of agriculture applications. + +
+
+
+
+
+ + ♻ ☆ RSBuilding: Towards General Remote Sensing Image Building Extraction and + Change Detection with Foundation Model + + +
+ The intelligent interpretation of buildings plays a significant role in urban +planning and management, macroeconomic analysis, population dynamics, etc. +Remote sensing image building interpretation primarily encompasses building +extraction and change detection. However, current methodologies often treat +these two tasks as separate entities, thereby failing to leverage shared +knowledge. Moreover, the complexity and diversity of remote sensing image +scenes pose additional challenges, as most algorithms are designed to model +individual small datasets, thus lacking cross-scene generalization. In this +paper, we propose a comprehensive remote sensing image building understanding +model, termed RSBuilding, developed from the perspective of the foundation +model. RSBuilding is designed to enhance cross-scene generalization and task +universality. Specifically, we extract image features based on the prior +knowledge of the foundation model and devise a multi-level feature sampler to +augment scale information. To unify task representation and integrate image +spatiotemporal clues, we introduce a cross-attention decoder with task prompts. +Addressing the current shortage of datasets that incorporate annotations for +both tasks, we have developed a federated training strategy to facilitate +smooth model convergence even when supervision for some tasks is missing, +thereby bolstering the complementarity of different tasks. Our model was +trained on a dataset comprising up to 245,000 images and validated on multiple +building extraction and change detection datasets. The experimental results +substantiate that RSBuilding can concurrently handle two structurally distinct +tasks and exhibits robust zero-shot generalization capabilities. + +
+
+
+
+
+ + ♻ ☆ AM-RADIO: Agglomerative Vision Foundation Model -- Reduce All Domains + Into One CVPR 2024 + + +
+ A handful of visual foundation models (VFMs) have recently emerged as the +backbones for numerous downstream tasks. VFMs like CLIP, DINOv2, SAM are +trained with distinct objectives, exhibiting unique characteristics for various +downstream tasks. We find that despite their conceptual differences, these +models can be effectively merged into a unified model through multi-teacher +distillation. We name this approach AM-RADIO (Agglomerative Model -- Reduce All +Domains Into One). This integrative approach not only surpasses the performance +of individual teacher models but also amalgamates their distinctive features, +such as zero-shot vision-language comprehension, detailed pixel-level +understanding, and open vocabulary segmentation capabilities. In pursuit of the +most hardware-efficient backbone, we evaluated numerous architectures in our +multi-teacher distillation pipeline using the same training recipe. This led to +the development of a novel architecture (E-RADIO) that exceeds the performance +of its predecessors and is at least 7x faster than the teacher models. Our +comprehensive benchmarking process covers downstream tasks including ImageNet +classification, ADE20k semantic segmentation, COCO object detection and +LLaVa-1.5 framework. + Code: https://github.com/NVlabs/RADIO + +
+
+ comment: CVPR 2024 Version 3: CVPR Camera Ready, reconfigured full paper, + table 1 is now more comprehensive Version 2: Added more acknowledgements and + updated table 7 with more recent results. Ensured that the link in the + abstract to our code is working properly Version 3: Fix broken hyperlinks +
+
+
+
+
+ + ♻ ☆ RMAFF-PSN: A Residual Multi-Scale Attention Feature Fusion Photometric + Stereo Network + + +
+ Predicting accurate normal maps of objects from two-dimensional images in +regions of complex structure and spatial material variations is challenging +using photometric stereo methods due to the influence of surface reflection +properties caused by variations in object geometry and surface materials. To +address this issue, we propose a photometric stereo network called a RMAFF-PSN +that uses residual multiscale attentional feature fusion to handle the +``difficult'' regions of the object. Unlike previous approaches that only use +stacked convolutional layers to extract deep features from the input image, our +method integrates feature information from different resolution stages and +scales of the image. This approach preserves more physical information, such as +texture and geometry of the object in complex regions, through shallow-deep +stage feature extraction, double branching enhancement, and attention +optimization. To test the network structure under real-world conditions, we +propose a new real dataset called Simple PS data, which contains multiple +objects with varying structures and materials. Experimental results on a +publicly available benchmark dataset demonstrate that our method outperforms +most existing calibrated photometric stereo methods for the same number of +input images, especially in the case of highly non-convex object structures. +Our method also obtains good results under sparse lighting conditions. + +
+
+ comment: 17 pages,12 figures +
+
+
+
+
+ + ♻ ☆ Adaptive Negative Evidential Deep Learning for Open-set Semi-supervised + Learning AAAI2024 + + +
+ Semi-supervised learning (SSL) methods assume that labeled data, unlabeled +data and test data are from the same distribution. Open-set semi-supervised +learning (Open-set SSL) considers a more practical scenario, where unlabeled +data and test data contain new categories (outliers) not observed in labeled +data (inliers). Most previous works focused on outlier detection via binary +classifiers, which suffer from insufficient scalability and inability to +distinguish different types of uncertainty. In this paper, we propose a novel +framework, Adaptive Negative Evidential Deep Learning (ANEDL) to tackle these +limitations. Concretely, we first introduce evidential deep learning (EDL) as +an outlier detector to quantify different types of uncertainty, and design +different uncertainty metrics for self-training and inference. Furthermore, we +propose a novel adaptive negative optimization strategy, making EDL more +tailored to the unlabeled dataset containing both inliers and outliers. As +demonstrated empirically, our proposed method outperforms existing +state-of-the-art methods across four datasets. + +
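A minimal sketch of the evidential-deep-learning building block referenced above: logits are mapped to non-negative evidence, Dirichlet parameters are formed, and the vacuity uncertainty (high for unfamiliar inputs) can be used to flag outliers. The adaptive negative optimization and the specific uncertainty metrics of ANEDL are not shown; thresholds and names are assumptions.

```python
import torch
import torch.nn.functional as F

def dirichlet_from_logits(logits: torch.Tensor):
    """Evidence = softplus(logits) >= 0; Dirichlet concentration alpha = evidence + 1."""
    evidence = F.softplus(logits)
    alpha = evidence + 1.0
    strength = alpha.sum(dim=-1, keepdim=True)           # Dirichlet strength S
    prob = alpha / strength                               # expected class probabilities
    vacuity = logits.shape[-1] / strength.squeeze(-1)     # u = K / S, high when evidence is low
    return prob, vacuity

# toy usage: a confident in-distribution prediction vs. an evidence-poor (outlier-like) one
inlier = torch.tensor([[8.0, 0.1, 0.1, 0.1]])
outlier = torch.tensor([[0.1, 0.1, 0.1, 0.1]])
for name, x in [("inlier", inlier), ("outlier", outlier)]:
    prob, u = dirichlet_from_logits(x)
    print(name, "vacuity =", round(u.item(), 3))
```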
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ DeS3: Adaptive Attention-driven Self and Soft Shadow Removal using ViT + Similarity AAAI2024 + + +
+ Removing soft and self shadows that lack clear boundaries from a single image is still challenging. Self shadows are shadows that are cast on the object itself. Most existing methods rely on binary shadow masks, without considering the ambiguous boundaries of soft and self shadows. In this paper, we present DeS3, a method that removes hard, soft and self shadows based on adaptive attention and ViT similarity. Our novel ViT similarity loss utilizes features extracted from a pre-trained Vision Transformer. This loss helps guide the reverse sampling towards recovering scene structures. Our adaptive attention is able to differentiate shadow regions from the underlying objects, as well as shadow regions from the object casting the shadow. This capability enables DeS3 to better recover the structures of objects even when they are partially occluded by shadows. Different from existing methods that rely on constraints during the training phase, we incorporate the ViT similarity during the sampling stage. Our method outperforms state-of-the-art methods on the SRD, AISTD, LRSS, USR and UIUC datasets, removing hard, soft, and self shadows robustly. Specifically, our method outperforms the SOTA method by 16\% in whole-image RMSE on the LRSS dataset. Our data and code are available at: \url{https://github.com/jinyeying/DeS3_Deshadow}
+
+ comment: Accepted to AAAI2024, diffusion shadow removal, + \url{https://github.com/jinyeying/DeS3_Deshadow} +
+
+
+
+
+ + ♻ ☆ Allowing humans to interactively guide machines where to look does not + always improve human-AI team's classification accuracy CVPR + 2024 + + +
+ Across thousands of papers in Explainable AI (XAI), attention maps \cite{vaswani2017attention} and feature attribution maps \cite{bansal2020sam} have been established as a common means of determining how important each input feature is to an AI's decisions. It is an interesting, unexplored question whether allowing users to edit the feature importance at test time would improve a human-AI team's accuracy on downstream tasks. In this paper, we address this question by leveraging CHM-Corr, a state-of-the-art, ante-hoc explainable classifier \cite{taesiri2022visual} that first predicts patch-wise correspondences between the input and training-set images, and then bases its classification decisions on them. We build CHM-Corr++, an interactive interface for CHM-Corr, enabling users to edit the feature attribution map provided by CHM-Corr and observe updated model decisions. Via CHM-Corr++, users can gain insights into if, when, and how the model changes its outputs, improving their understanding beyond static explanations. However, our user study with 18 users who performed 1,400 decisions finds no statistically significant evidence that our interactive approach improves user accuracy on CUB-200 bird image classification over static explanations. This challenges the hypothesis that interactivity can boost human-AI team accuracy~\cite{sokol2020one,sun2022exploring,shen2024towards,singh2024rethinking,mindlin2024beyond,lakkaraju2022rethinking,cheng2019explaining,liu2021understanding} and raises the need for future research. We open-source CHM-Corr++, an interactive tool for editing image classifier attention (see an interactive demo \href{http://137.184.82.109:7080/}{here}). We release code and data on \href{https://github.com/anguyen8/chm-corr-interactive}{github}.
+
+ comment: Accepted for presentation at the XAI4CV Workshop, part of the CVPR + 2024 proceedings +
+
+
+
+
+ + ♻ ☆ High-quality Image Dehazing with Diffusion Model + + +
+ Image dehazing is quite challenging in dense-haze scenarios, where quite less +original information remains in the hazy image. Though previous methods have +made marvelous progress, they still suffer from information loss in content and +color in dense-haze scenarios. The recently emerged Denoising Diffusion +Probabilistic Model (DDPM) exhibits strong generation ability, showing +potential for solving this problem. However, DDPM fails to consider the physics +property of dehazing task, limiting its information completion capacity. In +this work, we propose DehazeDDPM: A DDPM-based and physics-aware image dehazing +framework that applies to complex hazy scenarios. Specifically, DehazeDDPM +works in two stages. The former stage physically models the dehazing task with +the Atmospheric Scattering Model (ASM), pulling the distribution closer to the +clear data and endowing DehazeDDPM with fog-aware ability. The latter stage +exploits the strong generation ability of DDPM to compensate for the +haze-induced huge information loss, by working in conjunction with the physical +modelling. Extensive experiments demonstrate that our method attains +state-of-the-art performance on both synthetic and real-world hazy datasets. + +
+
+
+
+
+ + ♻ ☆ Images are Achilles' Heel of Alignment: Exploiting Visual + Vulnerabilities for Jailbreaking Multimodal Large Language Models + + +
+ In this paper, we study the harmlessness alignment problem of multimodal +large language models (MLLMs). We conduct a systematic empirical analysis of +the harmlessness performance of representative MLLMs and reveal that the image +input poses the alignment vulnerability of MLLMs. Inspired by this, we propose +a novel jailbreak method named HADES, which hides and amplifies the harmfulness +of the malicious intent within the text input, using meticulously crafted +images. Experimental results show that HADES can effectively jailbreak existing +MLLMs, which achieves an average Attack Success Rate (ASR) of 90.26% for +LLaVA-1.5 and 71.60% for Gemini Pro Vision. Our code and data will be publicly +released. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ ChangeMamba: Remote Sensing Change Detection with Spatio-Temporal State + Space Model + + +
+ Convolutional neural networks (CNN) and Transformers have made impressive progress in the field of remote sensing change detection (CD). However, both architectures have inherent shortcomings. Recently, the Mamba architecture, based on state space models, has shown remarkable performance in a series of natural language processing tasks and can effectively compensate for the shortcomings of the above two architectures. In this paper, we explore for the first time the potential of the Mamba architecture for remote sensing CD tasks. We tailor the corresponding frameworks, called MambaBCD, MambaSCD, and MambaBDA, for binary change detection (BCD), semantic change detection (SCD), and building damage assessment (BDA), respectively. All three frameworks adopt the cutting-edge Visual Mamba architecture as the encoder, which allows full learning of global spatial contextual information from the input images. For the change decoder, which is present in all three architectures, we propose three spatio-temporal relationship modeling mechanisms that can be naturally combined with the Mamba architecture and fully utilize its attributes to achieve spatio-temporal interaction of multi-temporal features, thereby obtaining accurate change information. On five benchmark datasets, our proposed frameworks outperform current CNN- and Transformer-based approaches without using any complex training strategies or tricks, fully demonstrating the potential of the Mamba architecture in CD tasks. Specifically, we obtained 83.11%, 88.39% and 94.19% F1 scores on the three BCD datasets SYSU, LEVIR-CD+, and WHU-CD; on the SCD dataset SECOND, we obtained 24.11% SeK; and on the BDA dataset xBD, we obtained 81.41% overall F1 score. Further experiments show that our architecture is quite robust to degraded data. The source code will be available at https://github.com/ChenHongruixuan/MambaCD
+
+
+
+
+
+ + ♻ ☆ UFineBench: Towards Text-based Person Retrieval with Ultra-fine + Granularity + + +
+ Existing text-based person retrieval datasets often have relatively coarse-grained text annotations. This hinders the model from comprehending the fine-grained semantics of query texts in real scenarios. To address this problem, we contribute a new benchmark named \textbf{UFineBench} for text-based person retrieval with ultra-fine granularity. Firstly, we construct a new \textbf{dataset} named UFine6926. We collect a large number of person images and manually annotate each image with two detailed textual descriptions, averaging 80.8 words each. The average word count is three to four times that of previous datasets. In addition to standard in-domain evaluation, we also propose a special \textbf{evaluation paradigm} that is more representative of real scenarios. It contains a new evaluation set with cross domains, cross textual granularity and cross textual styles, named UFine3C, and a new evaluation metric for accurately measuring retrieval ability, named mean Similarity Distribution (mSD). Moreover, we propose CFAM, a more efficient \textbf{algorithm} especially designed for text-based person retrieval with ultra fine-grained texts. It achieves fine granularity mining by adopting a shared cross-modal granularity decoder and a hard negative match mechanism. With standard in-domain evaluation, CFAM establishes competitive performance across various datasets, especially on our ultra fine-grained UFine6926. Furthermore, by evaluating on UFine3C, we demonstrate that training on our UFine6926 significantly improves generalization to real scenarios compared with other coarse-grained datasets. The dataset and code will be made publicly available at \url{https://github.com/Zplusdragon/UFineBench}.
+
+
+
+
+
+ + ♻ ☆ Multisize Dataset Condensation ICLR 2024 + + +
+ While dataset condensation effectively enhances training efficiency, its +application in on-device scenarios brings unique challenges. 1) Due to the +fluctuating computational resources of these devices, there's a demand for a +flexible dataset size that diverges from a predefined size. 2) The limited +computational power on devices often prevents additional condensation +operations. These two challenges connect to the "subset degradation problem" in +traditional dataset condensation: a subset from a larger condensed dataset is +often unrepresentative compared to directly condensing the whole dataset to +that smaller size. In this paper, we propose Multisize Dataset Condensation +(MDC) by compressing N condensation processes into a single condensation +process to obtain datasets with multiple sizes. Specifically, we introduce an +"adaptive subset loss" on top of the basic condensation loss to mitigate the +"subset degradation problem". Our MDC method offers several benefits: 1) No +additional condensation process is required; 2) reduced storage requirement by +reusing condensed images. Experiments validate our findings on networks +including ConvNet, ResNet and DenseNet, and datasets including SVHN, CIFAR-10, +CIFAR-100 and ImageNet. For example, we achieved 5.22%-6.40% average accuracy +gains on condensing CIFAR-10 to ten images per class. Code is available at: +https://github.com/he-y/Multisize-Dataset-Condensation. + +
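A much-simplified sketch of the nested-subset idea follows; the paper's adaptive subset loss selects which subset to supervise adaptively, whereas this toy version simply sums the matching loss over all prefixes, and the tensor layout and the match_loss callable are assumptions of ours.

import torch

def multisize_condensation_loss(match_loss, synthetic, real_batch, subset_sizes):
    # synthetic: (num_classes, ipc, C, H, W) learnable condensed images.
    # match_loss(images, real_batch) -> scalar condensation loss (e.g. gradient matching).
    # subset_sizes: nested prefix sizes in images-per-class, e.g. [1, 2, 5, 10].
    total = torch.zeros((), device=synthetic.device)
    for k in subset_sizes:
        subset = synthetic[:, :k].reshape(-1, *synthetic.shape[2:])  # first k images of every class
        total = total + match_loss(subset, real_batch)               # keep every prefix representative
    return total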
+
+ comment: Accepted by ICLR 2024 Oral +
+
+
+
+
+ + ♻ ☆ MambaAD: Exploring State Space Models for Multi-class Unsupervised + Anomaly Detection + + +
+ Recent advancements in anomaly detection have seen the efficacy of CNN- and transformer-based approaches. However, CNNs struggle with long-range dependencies, while transformers are burdened by quadratic computational complexity. Mamba-based models, with their superior long-range modeling and linear efficiency, have garnered substantial attention. This study pioneers the application of Mamba to multi-class unsupervised anomaly detection, presenting MambaAD, which consists of a pre-trained encoder and a Mamba decoder featuring Locality-Enhanced State Space (LSS) modules at multiple scales. The proposed LSS module, integrating parallel cascaded Hybrid State Space (HSS) blocks and multi-kernel convolution operations, effectively captures both long-range and local information. The HSS block, utilizing Hybrid Scanning (HS) encoders, encodes feature maps into five scanning methods and eight directions, thereby strengthening global connections through the State Space Model (SSM). The use of Hilbert scanning and eight directions significantly improves feature sequence modeling. Comprehensive experiments on six diverse anomaly detection datasets and seven metrics demonstrate state-of-the-art performance, substantiating the method's effectiveness.
+
+
+
+
+
+ + ♻ ☆ 3D Geometry-aware Deformable Gaussian Splatting for Dynamic View + Synthesis CVPR 2024 + + +
+ In this paper, we propose a 3D geometry-aware deformable Gaussian Splatting method for dynamic view synthesis. Existing neural radiance fields (NeRF) based solutions learn the deformation in an implicit manner, which cannot incorporate 3D scene geometry. Therefore, the learned deformation is not necessarily geometrically coherent, which results in unsatisfactory dynamic view synthesis and 3D dynamic reconstruction. Recently, 3D Gaussian Splatting provides a new representation of the 3D scene, building upon which the 3D geometry could be exploited in learning the complex 3D deformation. Specifically, the scenes are represented as a collection of 3D Gaussians, where each 3D Gaussian is optimized to move and rotate over time to model the deformation. To enforce the 3D scene geometry constraint during deformation, we explicitly extract 3D geometry features and integrate them in learning the 3D deformation. In this way, our solution achieves 3D geometry-aware deformation modeling, which enables improved dynamic view synthesis and 3D dynamic reconstruction. Extensive experimental results on both synthetic and real datasets prove the superiority of our solution, which achieves new state-of-the-art performance. The project is available at https://npucvr.github.io/GaGS/
+
+
+ comment: Accepted by CVPR 2024. Project page: https://npucvr.github.io/GaGS/ +
+
+
+
+
+ + ♻ ☆ Learning Spatial Features from Audio-Visual Correspondence in Egocentric + Videos CVPR 2024 + + +
+ We propose a self-supervised method for learning representations based on +spatial audio-visual correspondences in egocentric videos. Our method uses a +masked auto-encoding framework to synthesize masked binaural (multi-channel) +audio through the synergy of audio and vision, thereby learning useful spatial +relationships between the two modalities. We use our pretrained features to +tackle two downstream video tasks requiring spatial understanding in social +scenarios: active speaker detection and spatial audio denoising. Through +extensive experiments, we show that our features are generic enough to improve +over multiple state-of-the-art baselines on both tasks on two challenging +egocentric video datasets that offer binaural audio, EgoCom and EasyCom. +Project: http://vision.cs.utexas.edu/projects/ego_av_corr. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Retrieval-Augmented Generation for AI-Generated Content: A Survey + + +
+ Advancements in model algorithms, the growth of foundational models, and access to high-quality datasets have propelled the evolution of Artificial Intelligence Generated Content (AIGC). Despite its notable successes, AIGC still faces hurdles such as updating knowledge, handling long-tail data, mitigating data leakage, and managing high training and inference costs. Retrieval-Augmented Generation (RAG) has recently emerged as a paradigm to address such challenges. In particular, RAG introduces an information retrieval process, which enhances the generation process by retrieving relevant objects from available data stores, leading to higher accuracy and better robustness. In this paper, we comprehensively review existing efforts that integrate RAG techniques into AIGC scenarios. We first classify RAG foundations according to how the retriever augments the generator, distilling the fundamental abstractions of the augmentation methodologies for various retrievers and generators. This unified perspective encompasses all RAG scenarios, illuminating advancements and pivotal technologies that help with potential future progress. We also summarize additional enhancement methods for RAG, facilitating effective engineering and implementation of RAG systems. Then, from another view, we survey practical applications of RAG across different modalities and tasks, offering valuable references for researchers and practitioners. Furthermore, we introduce benchmarks for RAG, discuss the limitations of current RAG systems, and suggest potential directions for future research. Github: https://github.com/PKU-DAIR/RAG-Survey.
+
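The retriever-augments-generator pattern that the survey organizes can be sketched in a few lines; embed and generate below stand in for whatever encoder and generative model a concrete RAG system uses, so this is a generic illustration rather than any particular system described in the survey.

import numpy as np

def retrieve(query_vec, doc_vecs, docs, k=3):
    # Cosine similarity between the query embedding and every stored object embedding.
    sims = doc_vecs @ query_vec / (
        np.linalg.norm(doc_vecs, axis=1) * np.linalg.norm(query_vec) + 1e-8)
    return [docs[i] for i in np.argsort(-sims)[:k]]

def rag_answer(query, embed, generate, docs, doc_vecs):
    # Retrieve-then-generate: fetch the most relevant objects from the data store
    # and condition the generator on both the query and the retrieved context.
    context = retrieve(embed(query), doc_vecs, docs)
    prompt = "Context:\n" + "\n".join(context) + "\n\nQuestion: " + query
    return generate(prompt)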
+
+ comment: Citing 377 papers, 28 pages, 1 table, 12 figures. Project: + https://github.com/PKU-DAIR/RAG-Survey +
+
+
+
+
+ + ♻ ☆ A Survey on 3D Gaussian Splatting + + +
+ 3D Gaussian splatting (GS) has recently emerged as a transformative technique +in the realm of explicit radiance field and computer graphics. This innovative +approach, characterized by the utilization of millions of learnable 3D +Gaussians, represents a significant departure from mainstream neural radiance +field approaches, which predominantly use implicit, coordinate-based models to +map spatial coordinates to pixel values. 3D GS, with its explicit scene +representation and differentiable rendering algorithm, not only promises +real-time rendering capability but also introduces unprecedented levels of +editability. This positions 3D GS as a potential game-changer for the next +generation of 3D reconstruction and representation. In the present paper, we +provide the first systematic overview of the recent developments and critical +contributions in the domain of 3D GS. We begin with a detailed exploration of +the underlying principles and the driving forces behind the emergence of 3D GS, +laying the groundwork for understanding its significance. A focal point of our +discussion is the practical applicability of 3D GS. By enabling unprecedented +rendering speed, 3D GS opens up a plethora of applications, ranging from +virtual reality to interactive media and beyond. This is complemented by a +comparative analysis of leading 3D GS models, evaluated across various +benchmark tasks to highlight their performance and practical utility. The +survey concludes by identifying current challenges and suggesting potential +avenues for future research in this domain. Through this survey, we aim to +provide a valuable resource for both newcomers and seasoned researchers, +fostering further exploration and advancement in applicable and explicit +radiance field representation. + +
+
+ comment: Ongoing project +
+
+
+
+
+ + ♻ ☆ The Curse of Recursion: Training on Generated Data Makes Models Forget + + +
+ Stable Diffusion revolutionised image creation from descriptive text. GPT-2, GPT-3(.5) and GPT-4 demonstrated astonishing performance across a variety of language tasks. ChatGPT introduced such language models to the general public. It is now clear that large language models (LLMs) are here to stay, and will bring about drastic change in the whole ecosystem of online text and images. In this paper we consider what the future might hold. What will happen to GPT-{n} once LLMs contribute much of the language found online? We find that use of model-generated content in training causes irreversible defects in the resulting models, where tails of the original content distribution disappear. We refer to this effect as Model Collapse and show that it can occur in Variational Autoencoders, Gaussian Mixture Models and LLMs. We build theoretical intuition behind the phenomenon and portray its ubiquity amongst all learned generative models. We demonstrate that it has to be taken seriously if we are to sustain the benefits of training from large-scale data scraped from the web. Indeed, data collected from genuine human interactions with systems will become increasingly valuable in the presence of content generated by LLMs in data crawled from the Internet.
+
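A toy, single-Gaussian analogue of the fit-and-resample loop behind this effect (not the paper's experiments) can be reproduced in a few lines: each generation fits a trivial generative model to data produced by the previous generation, and the fitted spread contracts over time.

import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(0.0, 1.0, size=100)        # "human" data from the original distribution

for generation in range(1, 201):
    mu, sigma = data.mean(), data.std()      # fit a trivial generative model to the current data
    data = rng.normal(mu, sigma, size=100)   # the next generation trains only on generated samples
    if generation % 50 == 0:
        print(f"generation {generation:3d}: mu={mu:+.3f}, sigma={sigma:.3f}")

# The fitted sigma contracts towards zero over generations, so the tails of the
# original distribution vanish: a toy, single-Gaussian analogue of model collapse.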
+
+ comment: Fixed typos in eqn 4,5 +
+
+
+
+
+ + ♻ ☆ Curvature-Balanced Feature Manifold Learning for Long-Tailed + Classification CVPR 2023 + + +
+ To address the challenges of long-tailed classification, researchers have +proposed several approaches to reduce model bias, most of which assume that +classes with few samples are weak classes. However, recent studies have shown +that tail classes are not always hard to learn, and model bias has been +observed on sample-balanced datasets, suggesting the existence of other factors +that affect model bias. In this work, we systematically propose a series of +geometric measurements for perceptual manifolds in deep neural networks, and +then explore the effect of the geometric characteristics of perceptual +manifolds on classification difficulty and how learning shapes the geometric +characteristics of perceptual manifolds. An unanticipated finding is that the +correlation between the class accuracy and the separation degree of perceptual +manifolds gradually decreases during training, while the negative correlation +with the curvature gradually increases, implying that curvature imbalance leads +to model bias. Therefore, we propose curvature regularization to facilitate the +model to learn curvature-balanced and flatter perceptual manifolds. Evaluations +on multiple long-tailed and non-long-tailed datasets show the excellent +performance and exciting generality of our approach, especially in achieving +significant performance improvements based on current state-of-the-art +techniques. Our work opens up a geometric analysis perspective on model bias +and reminds researchers to pay attention to model bias on non-long-tailed and +even sample-balanced datasets. The code and model will be made public. + +
+
+ comment: 20pages, Accepted by CVPR 2023 +
+
+
+
+
+ + ♻ ☆ Towards Reliable Medical Image Segmentation by utilizing Evidential + Calibrated Uncertainty + + +
+ Medical image segmentation is critical for disease diagnosis and treatment assessment. However, concerns regarding the reliability of segmentation regions persist among clinicians, mainly attributed to the absence of confidence assessment, robustness, and calibration to accuracy. To address this, we introduce DEviS, an easily implementable foundational model that seamlessly integrates into various medical image segmentation networks. DEviS not only enhances the calibration and robustness of baseline segmentation accuracy but also provides high-efficiency uncertainty estimation for reliable predictions. By leveraging subjective logic theory, we explicitly model probability and uncertainty for the problem of medical image segmentation. Here, the Dirichlet distribution parameterizes the distribution of probabilities for different classes of the segmentation results. To generate calibrated predictions and uncertainty, we develop a trainable calibrated uncertainty penalty. Furthermore, DEviS incorporates an uncertainty-aware filtering module, which utilizes the metric of uncertainty-calibrated error to filter reliable data within the dataset. We conducted validation studies to assess both the accuracy and robustness of DEviS segmentation, along with evaluating the efficiency and reliability of uncertainty estimation. These evaluations were performed using publicly available datasets including ISIC2018, LiTS2017, and BraTS2019. Additionally, two potential clinical tasks are conducted on the Johns Hopkins OCT, Duke-OCT-DME, and FIVES datasets to demonstrate DEviS's efficacy in filtering high-quality or out-of-distribution data. Our code has been released at https://github.com/Cocofeat/DEviS.
+
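The subjective-logic modelling referenced here follows the usual evidential pattern (non-negative evidence, Dirichlet parameters, then probabilities plus an explicit uncertainty map). The sketch below shows only that generic recipe, not DEviS's calibrated uncertainty penalty or its filtering module.

import torch
import torch.nn.functional as F

def evidential_probs_and_uncertainty(logits):
    # logits: (B, K, H, W) raw segmentation outputs for K classes.
    evidence = F.softplus(logits)              # non-negative evidence per class
    alpha = evidence + 1.0                     # Dirichlet concentration parameters
    strength = alpha.sum(dim=1, keepdim=True)  # Dirichlet strength S
    probs = alpha / strength                   # expected class probabilities
    uncertainty = logits.shape[1] / strength   # u = K / S: high wherever evidence is scarce
    return probs, uncertainty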
+
+ comment: 34 pages, 11 figures +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 72 + +
+
+
+ + ☆ Probabilistic Directed Distance Fields for Ray-Based Shape + Representations + + +
+ In modern computer vision, the optimal representation of 3D shape continues +to be task-dependent. One fundamental operation applied to such representations +is differentiable rendering, as it enables inverse graphics approaches in +learning frameworks. Standard explicit shape representations (voxels, point +clouds, or meshes) are often easily rendered, but can suffer from limited +geometric fidelity, among other issues. On the other hand, implicit +representations (occupancy, distance, or radiance fields) preserve greater +fidelity, but suffer from complex or inefficient rendering processes, limiting +scalability. In this work, we devise Directed Distance Fields (DDFs), a novel +neural shape representation that builds upon classical distance fields. The +fundamental operation in a DDF maps an oriented point (position and direction) +to surface visibility and depth. This enables efficient differentiable +rendering, obtaining depth with a single forward pass per pixel, as well as +differential geometric quantity extraction (e.g., surface normals), with only +additional backward passes. Using probabilistic DDFs (PDDFs), we show how to +model inherent discontinuities in the underlying field. We then apply DDFs to +several applications, including single-shape fitting, generative modelling, and +single-image 3D reconstruction, showcasing strong performance with simple +architectural components via the versatility of our representation. Finally, +since the dimensionality of DDFs permits view-dependent geometric artifacts, we +conduct a theoretical investigation of the constraints necessary for view +consistency. We find a small set of field properties that are sufficient to +guarantee a DDF is consistent, without knowing, for instance, which shape the +field is expressing. + +
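A toy version of the fundamental DDF operation, mapping an oriented point to visibility and depth, might look as follows; the architecture and output parameterization are placeholders of ours, not the paper's.

import torch
import torch.nn as nn

class ToyDDF(nn.Module):
    # Maps an oriented point (3D position, 3D unit direction) to a visibility
    # probability and a depth value along the ray.
    def __init__(self, hidden=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(6, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 2),              # [visibility_logit, raw_depth]
        )

    def forward(self, positions, directions):
        out = self.net(torch.cat([positions, directions], dim=-1))
        visibility = torch.sigmoid(out[..., 0])
        depth = torch.relu(out[..., 1])        # non-negative distance along the ray
        return visibility, depth

# Rendering a depth map then costs one forward pass per pixel ray:
# query (camera_origin, pixel_direction) and keep depth where visibility > 0.5.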
+
+ comment: Extension of arXiv:2112.05300 +
+
+
+
+
+ + ☆ Exploring Explainability in Video Action Recognition CVPR 2024 + + +
+ Image Classification and Video Action Recognition are perhaps the two most foundational tasks in computer vision. Consequently, explaining the inner workings of trained deep neural networks is of prime importance. While numerous efforts focus on explaining the decisions of trained deep neural networks in image classification, exploration in the domain of its temporal version, video action recognition, has been scant. In this work, we take a deeper look at this problem. We begin by revisiting Grad-CAM, one of the popular feature attribution methods for Image Classification, and its extension to Video Action Recognition tasks, and examine the method's limitations. To address these, we introduce Video-TCAV, which builds on TCAV for Image Classification tasks and aims to quantify the importance of specific concepts in the decision-making process of Video Action Recognition models. As the scalable generation of concepts is still an open problem, we propose a machine-assisted approach to generate spatial and spatiotemporal concepts relevant to Video Action Recognition for testing Video-TCAV. We then establish the importance of temporally-varying concepts by demonstrating the superiority of dynamic spatiotemporal concepts over trivial spatial concepts. In conclusion, we introduce a framework for investigating hypotheses in action recognition and quantitatively testing them, thus advancing research in the explainability of deep neural networks used in video action recognition.
+
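Video-TCAV builds on the standard TCAV recipe for image classification, which fits a linear concept activation vector and measures directional derivatives along it. The snippet below shows that underlying recipe only, not the Video-TCAV implementation itself.

import numpy as np
from sklearn.linear_model import LogisticRegression

def tcav_score(concept_acts, random_acts, class_grads):
    # concept_acts, random_acts: (N, D) activations at a chosen layer.
    # class_grads: (M, D) gradients of the target-class logit w.r.t. that layer.
    X = np.vstack([concept_acts, random_acts])
    y = np.r_[np.ones(len(concept_acts)), np.zeros(len(random_acts))]
    cav = LogisticRegression(max_iter=1000).fit(X, y).coef_[0]   # Concept Activation Vector
    directional_derivs = class_grads @ cav                       # sensitivity along the CAV
    return float((directional_derivs > 0).mean())                # fraction of positively influenced examples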
+
+ comment: 6 pages, 10 figures, Accepted to the 3rd Explainable AI for Computer + Vision (XAI4CV) Workshop at CVPR 2024 +
+
+
+
+
+ + ☆ Rethinking Iterative Stereo Matching from Diffusion Bridge Model + Perspective + + +
+ Recently, iteration-based stereo matching has shown great potential. However, +these models optimize the disparity map using RNN variants. The discrete +optimization process poses a challenge of information loss, which restricts the +level of detail that can be expressed in the generated disparity map. In order +to address these issues, we propose a novel training approach that incorporates +diffusion models into the iterative optimization process. We designed a +Time-based Gated Recurrent Unit (T-GRU) to correlate temporal and disparity +outputs. Unlike standard recurrent units, we employ Agent Attention to generate +more expressive features. We also designed an attention-based context network +to capture a large amount of contextual information. Experiments on several +public benchmarks show that we have achieved competitive stereo matching +performance. Our model ranks first in the Scene Flow dataset, achieving over a +7% improvement compared to competing methods, and requires only 8 iterations to +achieve state-of-the-art results. + +
+
+ comment: tip. arXiv admin note: text overlap with arXiv:2303.06615 by other + authors +
+
+
+
+
+ + ☆ Improving Personalisation in Valence and Arousal Prediction using Data + Augmentation + + +
+ In the field of emotion recognition and Human-Machine Interaction (HMI), personalised approaches have exhibited their efficacy in capturing individual-specific characteristics and enhancing affective prediction accuracy. However, personalisation techniques often face the challenge of limited data for target individuals. This paper presents our work on an enhanced personalisation strategy that leverages data augmentation to develop tailored models for continuous valence and arousal prediction. Our proposed approach, Distance Weighting Augmentation (DWA), employs a weighting-based augmentation method that expands a target individual's dataset, leveraging distance metrics to identify similar samples at the segment level. Experimental results on the MuSe-Personalisation 2023 Challenge dataset demonstrate that our method significantly improves the performance of feature sets that have low baseline performance on the test set. This improvement in poor-performing features comes without sacrificing performance on high-performing features. In particular, our method achieves a maximum combined testing CCC of 0.78, compared to the reported baseline score of 0.76 (reproduced at 0.72). It also achieves peak arousal and valence scores of 0.81 and 0.76, compared to reproduced baseline scores of 0.76 and 0.67 respectively. Through this work, we make significant contributions to the advancement of personalised affective computing models, enhancing the practicality and adaptability of data-level personalisation in real-world contexts.
+
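A hedged sketch of distance-based personalisation in the spirit of DWA follows; the exact distance metric and weighting scheme used in the paper may differ, and the function name and shapes below are our own.

import numpy as np

def distance_weighted_augmentation(target_feats, pool_feats, pool_labels, top_k=200):
    # target_feats: (Nt, D) segment features of the target individual.
    # pool_feats, pool_labels: (Np, D), (Np, ...) segments from other individuals.
    centroid = target_feats.mean(axis=0)
    dists = np.linalg.norm(pool_feats - centroid, axis=1)
    idx = np.argsort(dists)[:top_k]            # the segments most similar to the target
    weights = 1.0 / (1.0 + dists[idx])         # closer segments contribute more
    return pool_feats[idx], pool_labels[idx], weights / weights.sum()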
+
+
+
+
+ + ☆ Theoretical research on generative diffusion models: an overview + + +
+ Generative diffusion models have achieved great success in many fields, supported by a powerful theoretical background. They convert the data distribution to noise and then remove the noise to recover a similar distribution. Many existing reviews focus on specific application areas without concentrating on research into the algorithms themselves. Unlike them, we investigate the theoretical developments of generative diffusion models. These approaches mainly divide into two categories: training-based and sampling-based. Recognizing this division allows a clear and understandable categorization for researchers who will make new developments in the future.
+
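For reference, the forward (noising) process that training-based approaches share is the standard q(x_t | x_0) = N(sqrt(abar_t) x_0, (1 - abar_t) I); a minimal sketch, with the noise schedule assumed to be precomputed, is shown below.

import torch

def forward_diffuse(x0, t, alphas_cumprod):
    # q(x_t | x_0) = N(sqrt(abar_t) * x_0, (1 - abar_t) * I)
    # x0: clean batch; t: (B,) integer timesteps; alphas_cumprod: (T,) precomputed abar values.
    abar = alphas_cumprod[t].view(-1, *([1] * (x0.dim() - 1)))
    noise = torch.randn_like(x0)
    xt = abar.sqrt() * x0 + (1.0 - abar).sqrt() * noise
    return xt, noise   # training-based methods regress `noise` (or x0 / the score) from (xt, t)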
+
+
+
+
+ + ☆ PracticalDG: Perturbation Distillation on Vision-Language Models for + Hybrid Domain Generalization CVPR2024 + + +
+ Domain Generalization (DG) aims to resolve distribution shifts between source and target domains, and current DG methods default to the setting in which data from source and target domains share identical categories. Nevertheless, unseen classes from target domains exist in practical scenarios. To address this issue, Open Set Domain Generalization (OSDG) has emerged and several methods have been exclusively proposed. However, most existing methods adopt complex architectures with only slight improvement over DG methods. Recently, vision-language models (VLMs) have been introduced in DG following the fine-tuning paradigm, but they consume huge training overhead with large vision models. Therefore, in this paper, we propose to transfer knowledge from VLMs to lightweight vision models and improve the robustness by introducing Perturbation Distillation (PD) from three perspectives, including Score, Class and Instance (SCI), named SCI-PD. Moreover, previous methods are oriented towards benchmarks with identical and fixed splits, ignoring the divergence between source domains. These methods are revealed to suffer from sharp performance decay with our proposed new benchmark Hybrid Domain Generalization (HDG) and a novel metric $H^{2}$-CV, which construct various splits to comprehensively assess the robustness of algorithms. Extensive experiments demonstrate that our method outperforms state-of-the-art algorithms on multiple datasets, especially improving the robustness when confronting data scarcity.
+
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ MMA-DFER: MultiModal Adaptation of unimodal models for Dynamic Facial + Expression Recognition in-the-wild CVPR 2024 + + +
+ Dynamic Facial Expression Recognition (DFER) has received significant +interest in the recent years dictated by its pivotal role in enabling empathic +and human-compatible technologies. Achieving robustness towards in-the-wild +data in DFER is particularly important for real-world applications. One of the +directions aimed at improving such models is multimodal emotion recognition +based on audio and video data. Multimodal learning in DFER increases the model +capabilities by leveraging richer, complementary data representations. Within +the field of multimodal DFER, recent methods have focused on exploiting +advances of self-supervised learning (SSL) for pre-training of strong +multimodal encoders. Another line of research has focused on adapting +pre-trained static models for DFER. In this work, we propose a different +perspective on the problem and investigate the advancement of multimodal DFER +performance by adapting SSL-pre-trained disjoint unimodal encoders. We identify +main challenges associated with this task, namely, intra-modality adaptation, +cross-modal alignment, and temporal adaptation, and propose solutions to each +of them. As a result, we demonstrate improvement over current state-of-the-art +on two popular DFER benchmarks, namely DFEW and MFAW. + +
+
+ comment: accepted to CVPR 2024 ABAW Workshop +
+
+
+
+
+ + ☆ THQA: A Perceptual Quality Assessment Database for Talking Heads + + +
+ In the realm of media technology, digital humans have gained prominence due +to rapid advancements in computer technology. However, the manual modeling and +control required for the majority of digital humans pose significant obstacles +to efficient development. The speech-driven methods offer a novel avenue for +manipulating the mouth shape and expressions of digital humans. Despite the +proliferation of driving methods, the quality of many generated talking head +(TH) videos remains a concern, impacting user visual experiences. To tackle +this issue, this paper introduces the Talking Head Quality Assessment (THQA) +database, featuring 800 TH videos generated through 8 diverse speech-driven +methods. Extensive experiments affirm the THQA database's richness in character +and speech features. Subsequent subjective quality assessment experiments +analyze correlations between scoring results and speech-driven methods, ages, +and genders. In addition, experimental results show that mainstream image and +video quality assessment methods have limitations for the THQA database, +underscoring the imperative for further research to enhance TH video quality +assessment. The THQA database is publicly accessible at +https://github.com/zyj-2000/THQA. + +
+
+
+
+
+ + ☆ Smart Help: Strategic Opponent Modeling for Proactive and Adaptive Robot + Assistance in Households + + +
+ Despite the significant demand for assistive technology among vulnerable +groups (e.g., the elderly, children, and the disabled) in daily tasks, research +into advanced AI-driven assistive solutions that genuinely accommodate their +diverse needs remains sparse. Traditional human-machine interaction tasks often +require machines to simply help without nuanced consideration of human +abilities and feelings, such as their opportunity for practice and learning, +sense of self-improvement, and self-esteem. Addressing this gap, we define a +pivotal and novel challenge Smart Help, which aims to provide proactive yet +adaptive support to human agents with diverse disabilities and dynamic goals in +various tasks and environments. To establish this challenge, we leverage +AI2-THOR to build a new interactive 3D realistic household environment for the +Smart Help task. We introduce an innovative opponent modeling module that +provides a nuanced understanding of the main agent's capabilities and goals, in +order to optimize the assisting agent's helping policy. Rigorous experiments +validate the efficacy of our model components and show the superiority of our +holistic approach against established baselines. Our findings illustrate the +potential of AI-imbued assistive robots in improving the well-being of +vulnerable groups. + +
+
+
+
+
+ + ☆ MaSkel: A Model for Human Whole-body X-rays Generation from Human + Masking Images + + +
+ Human whole-body X-rays could offer a valuable reference for various applications, including medical diagnostics, digital animation modeling, and ergonomic design. The traditional way of obtaining X-ray information requires CT (Computed Tomography) scan machines, which emit potentially harmful radiation. It therefore faces significant limitations in realistic applications because it lacks adaptability and safety. In this work, we propose a new method to directly generate 2D human whole-body X-rays from human masking images. The predicted images are similar to real ones in both image style and anatomic structure. We employ a data-driven strategy: by leveraging advanced generative techniques, our model MaSkel (Masking image to Skeleton X-rays) can generate a high-quality X-ray image from a human masking image without the need for invasive and harmful radiation exposure, which not only provides a new path to generate highly anatomic and customized data but also reduces health risks. To our knowledge, MaSkel is the first work to predict whole-body X-rays. This paper makes two contributions. First, to address the data limitation problem, diffusion-based techniques are utilized for data augmentation, providing two synthetic datasets for preliminary pretraining. Second, we design a two-stage training strategy to train MaSkel. Finally, we perform qualitative and quantitative evaluations of the generated X-rays and invite professional doctors to assess the predicted data. These evaluations demonstrate MaSkel's superior ability to generate anatomic X-rays from human masking images. The related code and dataset links are available at https://github.com/2022yingjie/MaSkel.
+
+
+
+
+
+ + ☆ Beyond Known Clusters: Probe New Prototypes for Efficient Generalized + Class Discovery + + +
+ Generalized Class Discovery (GCD) aims to dynamically assign labels to +unlabelled data partially based on knowledge learned from labelled data, where +the unlabelled data may come from known or novel classes. The prevailing +approach generally involves clustering across all data and learning conceptions +by prototypical contrastive learning. However, existing methods largely hinge +on the performance of clustering algorithms and are thus subject to their +inherent limitations. Firstly, the estimated cluster number is often smaller +than the ground truth, making the existing methods suffer from the lack of +prototypes for comprehensive conception learning. To address this issue, we +propose an adaptive probing mechanism that introduces learnable potential +prototypes to expand cluster prototypes (centers). As there is no ground truth +for the potential prototype, we develop a self-supervised prototype learning +framework to optimize the potential prototype in an end-to-end fashion. +Secondly, clustering is computationally intensive, and the conventional +strategy of clustering both labelled and unlabelled instances exacerbates this +issue. To counteract this inefficiency, we opt to cluster only the unlabelled +instances and subsequently expand the cluster prototypes with our introduced +potential prototypes to fast explore novel classes. Despite the simplicity of +our proposed method, extensive empirical analysis on a wide range of datasets +confirms that our method consistently delivers state-of-the-art results. +Specifically, our method surpasses the nearest competitor by a significant +margin of \textbf{9.7}$\%$ within the Stanford Cars dataset and +\textbf{12$\times$} clustering efficiency within the Herbarium 19 dataset. We +will make the code and checkpoints publicly available at +\url{https://github.com/xjtuYW/PNP.git}. + +
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ A Fourier-enhanced multi-modal 3D small object optical mark recognition + and positioning method for percutaneous abdominal puncture surgical + navigation + + +
+ Navigation for thoracoabdominal puncture surgery is used to locate the needle entry point on the patient's body surface. The traditional reflective-ball navigation method struggles to position the needle entry point on the soft, irregular, smooth chest and abdomen. With structured light technology, the lack of clear characteristic points on the body surface makes it difficult to identify and locate arbitrary needle insertion points. Motivated by the high stability and high accuracy requirements of surgical navigation, this paper proposes a novel multi-modal 3D small-object medical marker detection method, which identifies the center of a small single ring as the needle insertion point. Moreover, the method leverages Fourier transform enhancement technology to augment the dataset, enrich image details, and enhance the network's capability. The method extracts the Region of Interest (ROI) of the feature image from both enhanced and original images, followed by generating a mask map. Subsequently, the point cloud of the ROI from the depth map is obtained through the registration of ROI point cloud contour fitting. In addition, the method employs the Tukey loss for optimal precision. The experimental results show that the proposed method not only achieves high-precision and high-stability positioning, but also enables the positioning of arbitrary needle insertion points.
+
+
+ comment: 19 pages, 6 figures, +
+
+
+
+
+ + ☆ Fast Fishing: Approximating BAIT for Efficient and Scalable Deep Active + Image Classification + + +
+ Deep active learning (AL) seeks to minimize the annotation costs for training +deep neural networks. BAIT, a recently proposed AL strategy based on the Fisher +Information, has demonstrated impressive performance across various datasets. +However, BAIT's high computational and memory requirements hinder its +applicability on large-scale classification tasks, resulting in current +research neglecting BAIT in their evaluation. This paper introduces two methods +to enhance BAIT's computational efficiency and scalability. Notably, we +significantly reduce its time complexity by approximating the Fisher +Information. In particular, we adapt the original formulation by i) taking the +expectation over the most probable classes, and ii) constructing a binary +classification task, leading to an alternative likelihood for gradient +computations. Consequently, this allows the efficient use of BAIT on +large-scale datasets, including ImageNet. Our unified and comprehensive +evaluation across a variety of datasets demonstrates that our approximations +achieve strong performance with considerably reduced time complexity. +Furthermore, we provide an extensive open-source toolbox that implements recent +state-of-the-art AL strategies, available at +https://github.com/dhuseljic/dal-toolbox. + +
+
+
+
+
+ + ☆ BG-YOLO: A Bidirectional-Guided Method for Underwater Object Detection + + +
+ Degraded underwater images decrease the accuracy of underwater object detection. However, existing methods for underwater image enhancement mainly focus on improving visual indicators, which may not benefit underwater object detection and may even lead to serious degradation in performance. To alleviate this problem, we propose a bidirectional-guided method for underwater object detection, referred to as BG-YOLO. In the proposed method, the network is organized into an enhancement branch and a detection branch in parallel. The enhancement branch consists of a cascade of an image enhancement subnet and an object detection subnet, while the detection branch consists only of a detection subnet. A feature guided module connects the shallow convolution layers of the two branches. When training the enhancement branch, the object detection subnet in the enhancement branch guides the image enhancement subnet to be optimized towards the direction that is most conducive to the detection task. The shallow feature map of the trained enhancement branch is then fed to the feature guided module, constraining the optimization of the detection branch through a consistency loss and prompting the detection branch to learn more detailed information about the objects, so that detection performance is refined. During detection, only the detection branch is retained, so no additional computational cost is introduced. Extensive experiments demonstrate that the proposed method significantly improves the performance of the detector in severely degraded underwater scenes while maintaining a remarkable detection speed.
+
+
+ comment: 15 pages, 8 figures, 4 tables +
+
+
+
+
+ + ☆ MCPNet: An Interpretable Classifier via Multi-Level Concept Prototypes CVPR 2024 + + +
+ Recent advancements in post-hoc and inherently interpretable methods have +markedly enhanced the explanations of black box classifier models. These +methods operate either through post-analysis or by integrating concept learning +during model training. Although being effective in bridging the semantic gap +between a model's latent space and human interpretation, these explanation +methods only partially reveal the model's decision-making process. The outcome +is typically limited to high-level semantics derived from the last feature map. +We argue that the explanations lacking insights into the decision processes at +low and mid-level features are neither fully faithful nor useful. Addressing +this gap, we introduce the Multi-Level Concept Prototypes Classifier (MCPNet), +an inherently interpretable model. MCPNet autonomously learns meaningful +concept prototypes across multiple feature map levels using Centered Kernel +Alignment (CKA) loss and an energy-based weighted PCA mechanism, and it does so +without reliance on predefined concept labels. Further, we propose a novel +classifier paradigm that learns and aligns multi-level concept prototype +distributions for classification purposes via Class-aware Concept Distribution +(CCD) loss. Our experiments reveal that our proposed MCPNet while being +adaptable to various model architectures, offers comprehensive multi-level +explanations while maintaining classification accuracy. Additionally, its +concept distribution-based classification approach shows improved +generalization capabilities in few-shot classification scenarios. + +
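The CKA loss mentioned above presumably builds on linear Centered Kernel Alignment; the generic similarity is shown below for reference, while the prototype-specific objective in MCPNet is not reproduced here.

import torch

def linear_cka(x, y):
    # x: (n, d1), y: (n, d2) -- two sets of features for the same n samples.
    x = x - x.mean(dim=0, keepdim=True)
    y = y - y.mean(dim=0, keepdim=True)
    cross = torch.norm(y.t() @ x, p="fro") ** 2
    return cross / (torch.norm(x.t() @ x, p="fro") * torch.norm(y.t() @ y, p="fro") + 1e-8)

# An alignment objective could then be written as: loss = 1.0 - linear_cka(features, prototype_responses)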
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ LoopGaussian: Creating 3D Cinemagraph with Multi-view Images via + Eulerian Motion Field + + +
+ Cinemagraph is a unique form of visual media that combines elements of still photography and subtle motion to create a captivating experience. However, the majority of videos generated by recent works lack depth information and are confined to the constraints of 2D image space. In this paper, inspired by significant progress in the field of novel view synthesis (NVS) achieved by 3D Gaussian Splatting (3D-GS), we propose LoopGaussian to elevate cinemagraph from 2D image space to 3D space using 3D Gaussian modeling. To achieve this, we first employ the 3D-GS method to reconstruct 3D Gaussian point clouds from multi-view images of static scenes, incorporating shape regularization terms to prevent blurring or artifacts caused by object deformation. We then adopt an autoencoder tailored for 3D Gaussian to project it into feature space. To maintain the local continuity of the scene, we devise SuperGaussian for clustering based on the acquired features. By calculating the similarity between clusters and employing a two-stage estimation method, we derive an Eulerian motion field to describe velocities across the entire scene. The 3D Gaussian points then move within the estimated Eulerian motion field. Through bidirectional animation techniques, we ultimately generate a 3D Cinemagraph that exhibits natural and seamlessly loopable dynamics. Experiment results validate the effectiveness of our approach, demonstrating high-quality and visually appealing scene generation.
+
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Seeing Text in the Dark: Algorithm and Benchmark + + +
+ Localizing text in low-light environments is challenging due to visual degradations. Although a straightforward solution involves a two-stage pipeline with low-light image enhancement (LLE) as the initial step followed by a detector, LLE is primarily designed for human vision rather than machine vision and can accumulate errors. In this work, we propose an efficient and effective single-stage approach for localizing text in the dark that circumvents the need for LLE. We introduce a constrained learning module as an auxiliary mechanism during the training stage of the text detector. This module is designed to guide the text detector in preserving textual spatial features amidst feature map resizing, thus minimizing the loss of spatial information in texts under low-light visual degradations. Specifically, we incorporate spatial reconstruction and spatial semantic constraints within this module to ensure the text detector acquires essential positional and contextual range knowledge. Our approach enhances the original text detector's ability to identify text's local topological features using a dynamic snake feature pyramid network and adopts a bottom-up contour shaping strategy with a novel rectangular accumulation technique for accurate delineation of streamlined text features. In addition, we present a comprehensive low-light dataset for arbitrary-shaped text, encompassing diverse scenes and languages. Notably, our method achieves state-of-the-art results on this low-light dataset and exhibits comparable performance on standard normal-light datasets. The code and dataset will be released.
+
+
+
+
+
+ + ☆ Understanding Multimodal Deep Neural Networks: A Concept Selection View + + +
+ The multimodal deep neural networks, represented by CLIP, have generated rich +downstream applications owing to their excellent performance, thus making +understanding the decision-making process of CLIP an essential research topic. +Due to the complex structure and the massive pre-training data, it is often +regarded as a black-box model that is too difficult to understand and +interpret. Concept-based models map the black-box visual representations +extracted by deep neural networks onto a set of human-understandable concepts +and use the concepts to make predictions, enhancing the transparency of the +decision-making process. However, these methods involve the datasets labeled +with fine-grained attributes by expert knowledge, which incur high costs and +introduce excessive human prior knowledge and bias. In this paper, we observe +the long-tail distribution of concepts, based on which we propose a two-stage +Concept Selection Model (CSM) to mine core concepts without introducing any +human priors. The concept greedy rough selection algorithm is applied to +extract head concepts, and then the concept mask fine selection method performs +the extraction of core concepts. Experiments show that our approach achieves +comparable performance to end-to-end black-box models, and human evaluation +demonstrates that the concepts discovered by our method are interpretable and +comprehensible for humans. + +
+
+
+
+
+ + ☆ AMU-Tuning: Effective Logit Bias for CLIP-based Few-shot Learning CVPR 2024 + + +
+ Recently, pre-trained vision-language models (e.g., CLIP) have shown great +potential in few-shot learning and attracted a lot of research interest. +Although efforts have been made to improve few-shot ability of CLIP, key +factors on the effectiveness of existing methods have not been well studied, +limiting further exploration of CLIP's potential in few-shot learning. In this +paper, we first introduce a unified formulation to analyze CLIP-based few-shot +learning methods from a perspective of logit bias, which encourages us to learn +an effective logit bias for further improving performance of CLIP-based +few-shot learning methods. To this end, we disassemble three key components +involved in computation of logit bias (i.e., logit features, logit predictor, +and logit fusion) and empirically analyze the effect on performance of few-shot +classification. Based on analysis of key components, this paper proposes a +novel AMU-Tuning method to learn effective logit bias for CLIP-based few-shot +classification. Specifically, our AMU-Tuning predicts logit bias by exploiting +the appropriate $\underline{\textbf{A}}$uxiliary features, which are fed into +an efficient feature-initialized linear classifier with +$\underline{\textbf{M}}$ulti-branch training. Finally, an +$\underline{\textbf{U}}$ncertainty-based fusion is developed to incorporate +logit bias into CLIP for few-shot classification. The experiments are conducted +on several widely used benchmarks, and the results show AMU-Tuning clearly +outperforms its counterparts while achieving state-of-the-art performance of +CLIP-based few-shot learning without bells and whistles. + +
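One way to picture the "logit bias" view is sketched below, assuming auxiliary-feature logits are fused into frozen CLIP zero-shot logits with an uncertainty-dependent weight; this is our hedged reading of the formulation, not AMU-Tuning's released code, and the entropy-based fusion weight in particular is an illustrative choice.

import torch

def logit_bias_fusion(clip_logits, aux_feats, linear_head, kappa=1.0):
    # clip_logits: (B, C) zero-shot logits from the frozen CLIP model.
    # aux_feats:   (B, D) features from an auxiliary pre-trained model.
    # linear_head: a feature-initialized nn.Linear(D, C) trained on the few-shot data.
    bias = linear_head(aux_feats)
    probs = clip_logits.softmax(dim=-1)
    entropy = -(probs * probs.clamp_min(1e-8).log()).sum(dim=-1, keepdim=True)
    weight = kappa * entropy / torch.log(torch.tensor(float(clip_logits.shape[-1])))
    return clip_logits + weight * bias   # rely more on the bias where CLIP itself is uncertain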
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Constructing and Exploring Intermediate Domains in Mixed Domain + Semi-supervised Medical Image Segmentation + + +
+ Both limited annotation and domain shift are prevalent challenges in medical +image segmentation. Traditional semi-supervised segmentation and unsupervised +domain adaptation methods address one of these issues separately. However, the +coexistence of limited annotation and domain shift is quite common, which +motivates us to introduce a novel and challenging scenario: Mixed Domain +Semi-supervised medical image Segmentation (MiDSS). In this scenario, we handle +data from multiple medical centers, with limited annotations available for a +single domain and a large amount of unlabeled data from multiple domains. We +found that the key to solving the problem lies in how to generate reliable +pseudo labels for the unlabeled data in the presence of domain shift with +labeled data. To tackle this issue, we employ Unified Copy-Paste (UCP) between +images to construct intermediate domains, facilitating the knowledge transfer +from the domain of labeled data to the domains of unlabeled data. To fully +utilize the information within the intermediate domain, we propose a symmetric +Guidance training strategy (SymGD), which additionally offers direct guidance +to unlabeled data by merging pseudo labels from intermediate samples. +Subsequently, we introduce a Training Process aware Random Amplitude MixUp +(TP-RAM) to progressively incorporate style-transition components into +intermediate samples. Compared with existing state-of-the-art approaches, our +method achieves a notable 13.57% improvement in Dice score on Prostate dataset, +as demonstrated on three public datasets. Our code is available at +https://github.com/MQinghe/MiDSS . + +
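Copy-paste mixing of this kind can be sketched as pasting a random region from one domain's image into another's and carrying the same mask over to the labels or pseudo-labels; details of the paper's Unified Copy-Paste (e.g., how regions and pasting directions are chosen) are not reproduced here.

import torch

def copy_paste_mix(img_a, img_b, ratio=0.5):
    # img_a, img_b: (C, H, W) images from different domains; the same mask is
    # applied to the corresponding labels or pseudo-labels.
    _, h, w = img_a.shape
    ph, pw = int(h * ratio), int(w * ratio)
    top = torch.randint(0, h - ph + 1, (1,)).item()
    left = torch.randint(0, w - pw + 1, (1,)).item()
    mixed = img_b.clone()
    mixed[:, top:top + ph, left:left + pw] = img_a[:, top:top + ph, left:left + pw]
    mask = torch.zeros(h, w, dtype=torch.bool)
    mask[top:top + ph, left:left + pw] = True     # pixels (and labels) that come from image A
    return mixed, mask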
+
+
+
+
+ + ☆ ChimpVLM: Ethogram-Enhanced Chimpanzee Behaviour Recognition + + +
+ We show that chimpanzee behaviour understanding from camera traps can be +enhanced by providing visual architectures with access to an embedding of text +descriptions that detail species behaviours. In particular, we present a +vision-language model which employs multi-modal decoding of visual features +extracted directly from camera trap videos to process query tokens representing +behaviours and output class predictions. Query tokens are initialised using a +standardised ethogram of chimpanzee behaviour, rather than using random or +name-based initialisations. In addition, the effect of initialising query +tokens using a masked language model fine-tuned on a text corpus of known +behavioural patterns is explored. We evaluate our system on the PanAf500 and +PanAf20K datasets and demonstrate the performance benefits of our multi-modal +decoding approach and query initialisation strategy on multi-class and +multi-label recognition tasks, respectively. Results and ablations corroborate +performance improvements. We achieve state-of-the-art performance over vision +and vision-language models in top-1 accuracy (+6.34%) on PanAf500 and overall +(+1.1%) and tail-class (+2.26%) mean average precision on PanAf20K. We share +complete source code and network weights for full reproducibility of results +and easy utilisation. + +
+
+
+
+
+ + ☆ Shifting Spotlight for Co-supervision: A Simple yet Efficient + Single-branch Network to See Through Camouflage + + +
+ Efficient and accurate camouflaged object detection (COD) poses a challenge in the field of computer vision. Recent approaches explored the utility of edge information for network co-supervision, achieving notable advancements. However, these approaches introduce an extra branch for complex edge extraction, complicating the model architecture and increasing computational demands. Addressing this issue, our work replicates the effect that an animal's camouflage can be easily revealed under a shifting spotlight, and leverages it for network co-supervision to form a compact yet efficient single-branch network, the Co-Supervised Spotlight Shifting Network (CS$^3$Net). The spotlight shifting strategy allows CS$^3$Net to learn an additional prior within a single-branch framework, obviating the need for a resource-demanding multi-branch design. To leverage the prior of spotlight shifting co-supervision, we propose a Shadow Refinement Module (SRM) and Projection Aware Attention (PAA) for feature refinement and enhancement. To ensure the continuity of multi-scale feature aggregation, we utilize the Extended Neighbor Connection Decoder (ENCD) for generating the final predictions. Empirical evaluations on public datasets confirm that our CS$^3$Net offers an optimal balance between efficiency and performance: it accomplishes a 32.13% reduction in Multiply-Accumulate (MACs) operations compared to leading efficient COD models, while also delivering superior performance.
+
+
+
+
+
+ + ☆ Label-free Anomaly Detection in Aerial Agricultural Images with Masked + Image Modeling CVPR 2024 + + +
+ Detecting various types of stresses (nutritional, water, nitrogen, etc.) in agricultural fields is critical for farmers to ensure maximum productivity. However, stresses show up in different shapes and sizes across different crop types and varieties. Hence, this is posed as an anomaly detection task in agricultural images. Accurate anomaly detection in agricultural UAV images is vital for early identification of field irregularities. Traditional supervised learning faces challenges in adapting to diverse anomalies, necessitating extensive annotated data. In this work, we overcome this limitation with self-supervised learning using a masked image modeling approach. Masked Autoencoders (MAE) extract meaningful normal features from unlabeled image samples, which produces high reconstruction error for the abnormal pixels during reconstruction. To remove the need to use only "normal" data during training, we use an anomaly suppression loss mechanism that effectively minimizes the reconstruction of anomalous pixels and allows the model to learn anomalous areas without explicitly separating "normal" images for training. Evaluation on the Agriculture-Vision data challenge shows a mIOU score improvement over the prior state of the art in unsupervised and self-supervised methods. A single model generalizes across all the anomaly categories in the Agri-Vision Challenge Dataset.
+
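The core mechanism, reconstruction error as an anomaly score, can be sketched as follows; the anomaly suppression loss used during training is not shown, and the function and parameter names below are our own.

import torch
import torch.nn.functional as F

def reconstruction_anomaly_map(image, autoencoder, patch=16):
    # image: (1, C, H, W); autoencoder: callable returning a (1, C, H, W) reconstruction.
    with torch.no_grad():
        recon = autoencoder(image)
    error = (image - recon).abs().mean(dim=1, keepdim=True)       # per-pixel reconstruction error
    return F.avg_pool2d(error, kernel_size=patch, stride=patch)   # patch-level anomaly grid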
+
+ comment: The paper has been accepted to CVPR 2024 5th Workshop on Vision for + Agriculture as an Oral Paper +
+
+
+
+
+ + ☆ DeDoDe v2: Analyzing and Improving the DeDoDe Keypoint Detector CVPR + + +
+ In this paper, we analyze and improve upon the recently proposed DeDoDe keypoint detector. We focus our analysis on some key issues. First, we find that DeDoDe keypoints tend to cluster together, which we fix by performing non-max suppression on the target distribution of the detector during training. Second, we address issues related to data augmentation. In particular, the DeDoDe detector is sensitive to large rotations. We fix this by including 90-degree rotations as well as horizontal flips. Finally, the decoupled nature of the DeDoDe detector makes evaluation of downstream usefulness problematic. We fix this by matching the keypoints with a pretrained dense matcher (RoMa) and evaluating two-view pose estimates. We find that the original long training is detrimental to performance, and therefore propose a much shorter training schedule. We integrate all these improvements into our proposed detector DeDoDe v2 and evaluate it with the original DeDoDe descriptor on the MegaDepth-1500 and IMC2022 benchmarks. Our proposed detector significantly increases pose estimation results, notably from 75.9 to 78.3 mAA on the IMC2022 challenge. Code and weights are available at https://github.com/Parskatt/DeDoDe
+
+
+ comment: Accepted to Sixth Workshop on Image Matching - CVPRW 2024 +
+
+
+
+
+ + ☆ Diffusion Models Meet Remote Sensing: Principles, Methods, and + Perspectives + + +
+ As a newly emerging advance in deep generative models, diffusion models have +achieved state-of-the-art results in many fields, including computer vision, +natural language processing, and molecule design. The remote sensing community +has also noticed the powerful ability of diffusion models and quickly applied +them to a variety of tasks for image processing. Given the rapid increase in +research on diffusion models in the field of remote sensing, it is necessary to +conduct a comprehensive review of existing diffusion model-based remote sensing +papers, to help researchers recognize the potential of diffusion models and +provide some directions for further exploration. Specifically, this paper first +introduces the theoretical background of diffusion models, and then +systematically reviews the applications of diffusion models in remote sensing, +including image generation, enhancement, and interpretation. Finally, the +limitations of existing remote sensing diffusion models and worthy research +directions for further exploration are discussed and summarized. + +
+
+
+
+
+ + ☆ Trustworthy Multimodal Fusion for Sentiment Analysis in Ordinal + Sentiment Space + + +
+ Multimodal video sentiment analysis aims to integrate multiple modal +information to analyze the opinions and attitudes of speakers. Most previous +work focuses on exploring the semantic interactions of intra- and +inter-modality. However, these works ignore the reliability of multimodality, +i.e., modalities tend to contain noise, semantic ambiguity, missing modalities, +etc. In addition, previous multimodal approaches treat different modalities +equally, largely ignoring their different contributions. Furthermore, existing +multimodal sentiment analysis methods directly regress sentiment scores without +considering ordinal relationships within sentiment categories, with limited +performance. To address the aforementioned problems, we propose a trustworthy +multimodal sentiment ordinal network (TMSON) to improve performance in +sentiment analysis. Specifically, we first devise a unimodal feature extractor +for each modality to obtain modality-specific features. Then, an uncertainty +distribution estimation network is customized, which estimates the unimodal +uncertainty distributions. Next, Bayesian fusion is performed on the learned +unimodal distributions to obtain multimodal distributions for sentiment +prediction. Finally, an ordinal-aware sentiment space is constructed, where +ordinal regression is used to constrain the multimodal distributions. Our +proposed TMSON outperforms baselines on multimodal sentiment analysis tasks, +and empirical results demonstrate that TMSON is capable of reducing uncertainty +to obtain more robust predictions. + +
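+ A minimal sketch of precision-weighted (product-of-Gaussians) fusion of
+per-modality predictions, one simple way to realize the Bayesian fusion of
+unimodal distributions described above; TMSON's exact formulation may differ.
+
+def fuse_gaussians(means, variances):
+    # Each modality contributes a mean and a variance (its estimated uncertainty);
+    # low-variance (confident) modalities dominate the fused estimate.
+    precisions = [1.0 / v for v in variances]
+    fused_var = 1.0 / sum(precisions)
+    fused_mean = fused_var * sum(m * p for m, p in zip(means, precisions))
+    return fused_mean, fused_var
+
+# Example: audio, visual, and text sentiment estimates with their uncertainties.
+print(fuse_gaussians([0.2, 0.6, 0.4], [0.5, 0.1, 0.2]))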
+
+ comment: 14 pages, 9 figures, Accepted by IEEE Transactions on Circuits and + Systems for Video Technology +
+
+
+
+
+ + ☆ PNeRV: Enhancing Spatial Consistency via Pyramidal Neural Representation + for Videos + + +
+ The primary focus of Neural Representation for Videos (NeRV) is to
+effectively model its spatiotemporal consistency. However, current NeRV systems
+often face a significant issue of spatial inconsistency, leading to decreased
+perceptual quality. To address this issue, we introduce the Pyramidal Neural
+Representation for Videos (PNeRV), which is built on a multi-scale information
+connection and comprises a lightweight rescaling operator, a Kronecker
+Fully-connected layer (KFc), and a Benign Selective Memory (BSM) mechanism. The
+KFc, inspired by the tensor decomposition of the vanilla Fully-connected layer,
+facilitates low-cost rescaling and global correlation modeling. BSM merges
+high-level features with granular ones adaptively. Furthermore, we provide an
+analysis based on the Universal Approximation Theory of the NeRV system and
+validate the effectiveness of the proposed PNeRV. We conducted comprehensive
+experiments to demonstrate that PNeRV surpasses the performance of contemporary
+NeRV models, achieving the best results in video regression on UVG and DAVIS
+under various metrics (PSNR, SSIM, LPIPS, and FVD). Compared to vanilla NeRV,
+PNeRV achieves a +4.49 dB gain in PSNR and a 231% increase in FVD on UVG, along
+with a +3.28 dB PSNR and 634% FVD increase on DAVIS.
+
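+ A minimal sketch of a Kronecker-factorized fully-connected layer, the general
+idea behind a KFc-style operator: the large weight matrix is represented as a
+Kronecker product of two small factors. This is an illustration, not the
+paper's exact layer; shapes and initialization are assumptions.
+
+import torch
+import torch.nn as nn
+
+class KroneckerLinear(nn.Module):
+    # Computes y = (A kron B) x without materializing the full matrix,
+    # using the identity (A kron B) vec(X) = vec(A X B^T) with row-major vec.
+    def __init__(self, a_out, a_in, b_out, b_in):
+        super().__init__()
+        self.A = nn.Parameter(torch.randn(a_out, a_in) * 0.02)
+        self.B = nn.Parameter(torch.randn(b_out, b_in) * 0.02)
+
+    def forward(self, x):                        # x: (batch, a_in * b_in)
+        X = x.view(x.shape[0], self.A.shape[1], self.B.shape[1])
+        Y = self.A @ X @ self.B.transpose(0, 1)
+        return Y.reshape(x.shape[0], -1)         # (batch, a_out * b_out)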
+
+
+
+
+ + ☆ MAProtoNet: A Multi-scale Attentive Interpretable Prototypical Part + Network for 3D Magnetic Resonance Imaging Brain Tumor Classification + + +
+ Automated diagnosis with artificial intelligence has emerged as a promising
+area in the realm of medical imaging, while the interpretability of the
+introduced deep neural networks still remains an urgent concern. Although
+contemporary works, such as XProtoNet and MProtoNet, have sought to design
+interpretable prediction models for the issue, the localization precision of
+their resulting attribution maps can be further improved. To this end, we
+propose a Multi-scale Attentive Prototypical part Network, termed MAProtoNet,
+to provide more precise attribution maps. Specifically, we introduce a
+concise multi-scale module that merges attentive features from quadruplet
+attention layers and produces attribution maps. The proposed quadruplet
+attention layers can enhance the existing online class activation mapping loss
+by capturing interactions between the spatial and channel dimensions, while the
+multi-scale module then fuses both fine-grained and coarse-grained information
+for precise map generation. We also apply a novel multi-scale mapping loss to
+supervise the proposed multi-scale module. Compared to existing
+interpretable prototypical part networks in medical imaging, MAProtoNet
+achieves state-of-the-art localization performance on the brain tumor
+segmentation (BraTS) datasets, resulting in approximately 4% overall
+improvement in the activation precision score (with a best score of 85.8%),
+without using additional annotated segmentation labels. Our code will be
+released at https://github.com/TUAT-Novice/maprotonet.
+
+
+
+
+
+ + ☆ Meply: A Large-scale Dataset and Baseline Evaluations for Metastatic + Perirectal Lymph Node Detection and Segmentation + + +
+ Accurate segmentation of metastatic lymph nodes in rectal cancer is crucial +for the staging and treatment of rectal cancer. However, existing segmentation +approaches face challenges due to the absence of pixel-level annotated datasets +tailored for lymph nodes around the rectum. Additionally, metastatic lymph +nodes are characterized by their relatively small size, irregular shapes, and +lower contrast compared to the background, further complicating the +segmentation task. To address these challenges, we present the first +large-scale perirectal metastatic lymph node CT image dataset called Meply, +which encompasses pixel-level annotations of 269 patients diagnosed with rectal +cancer. Furthermore, we introduce a novel lymph-node segmentation model named +CoSAM. The CoSAM utilizes sequence-based detection to guide the segmentation of +metastatic lymph nodes in rectal cancer, contributing to improved localization +performance for the segmentation model. It comprises three key components: +sequence-based detection module, segmentation module, and collaborative +convergence unit. To evaluate the effectiveness of CoSAM, we systematically +compare its performance with several popular segmentation methods using the +Meply dataset. Our code and dataset will be publicly available at: +https://github.com/kanydao/CoSAM. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ PM2: A New Prompting Multi-modal Model Paradigm for Few-shot Medical + Image Classification + + +
+ Few-shot learning has been successfully applied to medical image
+classification as only very few medical examples are available for training.
+Due to the challenging problem of the limited number of annotated medical
+images, image representations should not be solely derived from a single image
+modality, which is insufficient for characterizing concept classes. In this
+paper, we propose a new prompting multi-modal model paradigm for medical image
+classification based on multi-modal foundation models, called PM2. Besides the
+image modality, PM2 introduces another supplementary text input, known as a
+prompt, to further describe the corresponding image or concept classes and
+facilitate few-shot learning across diverse modalities. To better explore the
+potential of prompt engineering, we empirically investigate five distinct
+prompt schemes under the new paradigm. Furthermore, linear probing in
+multi-modal models acts as a linear classification head that takes only the
+class token as input, which completely ignores the merits of the rich
+statistics inherent in high-level visual tokens. Thus, we instead perform
+linear classification on the feature distribution of visual tokens and the
+class token simultaneously. To effectively mine such rich statistics, global
+covariance pooling with efficient matrix power normalization is used to
+aggregate the visual tokens. We then study and combine two classification
+heads. One is shared between the class token of the image from the vision
+encoder and the prompt representation encoded by the text encoder. The other
+classifies the feature distribution of visual tokens from the vision encoder.
+Extensive experiments on three medical datasets show that our PM2
+significantly outperforms counterparts regardless of prompt schemes and
+achieves state-of-the-art performance.
+
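+ A minimal sketch of global covariance pooling with matrix power
+normalization over visual tokens. The eigen-decomposition route below is a
+simple stand-in; efficient implementations typically use iterative
+Newton-Schulz square roots, and PM2's exact variant is not reproduced here.
+
+import torch
+
+def covariance_pool(tokens, power=0.5, eps=1e-5):
+    # tokens: (B, N, D) visual tokens -> (B, D, D) normalized second-order statistic.
+    centered = tokens - tokens.mean(dim=1, keepdim=True)
+    cov = centered.transpose(1, 2) @ centered / (tokens.shape[1] - 1)
+    cov = cov + eps * torch.eye(cov.shape[-1], device=tokens.device)
+    eigvals, eigvecs = torch.linalg.eigh(cov)     # symmetric positive semi-definite
+    powered = eigvecs @ torch.diag_embed(eigvals.clamp_min(0) ** power) \
+              @ eigvecs.transpose(1, 2)
+    return powered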
+
+
+
+
+ + ☆ HEAT: Head-level Parameter Efficient Adaptation of Vision Transformers + with Taylor-expansion Importance Scores + + +
+ Prior computer vision research extensively explores adapting pre-trained
+vision transformers (ViT) to downstream tasks. However, the substantial number
+of parameters requiring adaptation has led to a focus on Parameter Efficient
+Transfer Learning (PETL) as an approach to efficiently adapt large pre-trained
+models by training only a subset of parameters, achieving both parameter and
+storage efficiency. Although the significantly reduced parameters have shown
+promising performance under transfer learning scenarios, the structural
+redundancy inherent in the model still leaves room for improvement, which
+warrants further investigation. In this paper, we propose Head-level Efficient
+Adaptation with Taylor-expansion importance score (HEAT): a simple method that
+efficiently fine-tunes ViTs at the head level. In particular, a first-order
+Taylor expansion is employed to calculate each head's importance score, termed
+the Taylor-expansion Importance Score (TIS), indicating its contribution to
+specific tasks. Additionally, three strategies for calculating TIS are
+employed to maximize its effectiveness. These strategies calculate TIS
+from different perspectives, reflecting varying contributions of parameters.
+Besides ViT, HEAT has also been applied to hierarchical transformers such as
+the Swin Transformer, demonstrating its versatility across different
+transformer architectures. Through extensive experiments, HEAT demonstrates
+superior performance over state-of-the-art PETL methods on the VTAB-1K
+benchmark.
+
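+ A minimal sketch of a first-order Taylor importance score for one attention
+head: the magnitude of the sum of weight-times-gradient over the head's
+parameters, evaluated after a backward pass. This is one common way to compute
+such scores and is not claimed to match HEAT's exact TIS variants.
+
+import torch
+
+def taylor_head_importance(head_params):
+    # head_params: iterable of parameter tensors belonging to a single head,
+    # with .grad populated by loss.backward() on a task batch.
+    score = torch.zeros(())
+    for p in head_params:
+        if p.grad is not None:
+            score = score + (p.detach() * p.grad.detach()).sum()
+    return score.abs().item()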
+
+
+
+
+ + ☆ ChangeAnywhere: Sample Generation for Remote Sensing Change Detection + via Semantic Latent Diffusion Model + + +
+ Remote sensing change detection (CD) is a pivotal technique that pinpoints
+changes on a global scale based on multi-temporal images. With the recent
+expansion of deep learning, supervised deep learning-based CD models have shown
+satisfactory performance. However, CD sample labeling is very time-consuming as
+it is densely labeled and requires expert knowledge. To alleviate this problem,
+we introduce ChangeAnywhere, a novel CD sample generation method using a
+semantic latent diffusion model and single-temporal images. Specifically,
+ChangeAnywhere leverages the relative ease of acquiring large single-temporal
+semantic datasets to generate large-scale, diverse, and semantically annotated
+bi-temporal CD datasets. ChangeAnywhere captures the two essentials of CD
+samples, i.e., change implies semantic difference, and non-change implies
+reasonable variation under the same semantic constraints. We generated
+ChangeAnywhere-100K, the largest synthetic CD dataset with 100,000 pairs of CD
+samples based on the proposed method. ChangeAnywhere-100K significantly
+improved both zero-shot and few-shot performance on two CD benchmark datasets
+for various deep learning-based CD models, as demonstrated by transfer
+experiments. This paper delineates the enormous potential of ChangeAnywhere for
+CD sample generation and demonstrates the subsequent enhancement of model
+performance. Therefore, ChangeAnywhere offers a potent tool for remote sensing
+CD. All code and pre-trained models will be available at
+https://github.com/tangkai-RS/ChangeAnywhere.
+
+
+ comment: Concise manuscript version of ChangeAnywhere +
+
+
+
+
+ + ☆ EIVEN: Efficient Implicit Attribute Value Extraction using Multimodal + LLM NAACL 2024 + + +
+ In e-commerce, accurately extracting product attribute values from multimodal +data is crucial for improving user experience and operational efficiency of +retailers. However, previous approaches to multimodal attribute value +extraction often struggle with implicit attribute values embedded in images or +text, rely heavily on extensive labeled data, and can easily confuse similar +attribute values. To address these issues, we introduce EIVEN, a data- and +parameter-efficient generative framework that pioneers the use of multimodal +LLM for implicit attribute value extraction. EIVEN leverages the rich inherent +knowledge of a pre-trained LLM and vision encoder to reduce reliance on labeled +data. We also introduce a novel Learning-by-Comparison technique to reduce +model confusion by enforcing attribute value comparison and difference +identification. Additionally, we construct initial open-source datasets for +multimodal implicit attribute value extraction. Our extensive experiments +reveal that EIVEN significantly outperforms existing methods in extracting +implicit attribute values while requiring less labeled data. + +
+
+ comment: Accepted by NAACL 2024 Industry Track +
+
+
+
+
+ + ☆ A Lightweight Spatiotemporal Network for Online Eye Tracking with Event + Camera + + +
+ Event-based data are commonly encountered in edge computing environments
+where efficiency and low latency are critical. To interface with such data and
+leverage their rich temporal features, we propose a causal spatiotemporal
+convolutional network. This solution targets efficient implementation on
+edge-appropriate hardware with limited resources in three ways: 1) it
+deliberately targets a simple architecture and set of operations (convolutions,
+ReLU activations); 2) it can be configured to perform online inference
+efficiently via buffering of layer outputs; and 3) it can achieve more than 90%
+activation sparsity through regularization during training, enabling very
+significant efficiency gains on event-based processors. In addition, we propose
+a general affine augmentation strategy acting directly on the events, which
+alleviates the problem of dataset scarcity for event-based systems. We apply
+our model to the AIS 2024 event-based eye tracking challenge, reaching a score
+of 0.9916 p10 accuracy on the Kaggle private test set.
+
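+ A minimal sketch of a causal temporal convolution with a rolling input
+buffer, illustrating how buffering layer state enables one-timestep-at-a-time
+online inference with no look-ahead; layer sizes and the ReLU are assumptions,
+not the paper's architecture.
+
+import torch
+import torch.nn as nn
+
+class BufferedCausalConv1d(nn.Module):
+    def __init__(self, channels, kernel_size=3):
+        super().__init__()
+        self.conv = nn.Conv1d(channels, channels, kernel_size)
+        # Holds the last (kernel_size - 1) timesteps seen so far.
+        self.register_buffer("history", torch.zeros(1, channels, kernel_size - 1))
+
+    def forward_online(self, x_t):               # x_t: (1, channels, 1), one new timestep
+        window = torch.cat([self.history, x_t], dim=-1)
+        self.history = window[..., 1:].detach()  # slide the buffer forward
+        return torch.relu(self.conv(window))     # (1, channels, 1)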
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ Multimodal Attack Detection for Action Recognition Models + + +
+ Adversarial machine learning attacks on video action recognition models are a
+growing research area, and many effective attacks have been introduced in
+recent years. These attacks show that action recognition models can be breached
+in many ways. Hence, using these models in practice raises significant security
+concerns. However, there are very few works which focus on defending against or
+detecting attacks. In this work, we propose a novel universal detection method
+which is compatible with any action recognition model. In our extensive
+experiments, we show that our method consistently detects various attacks
+against different target models with high true positive rates while satisfying
+very low false positive rates. Tested against four state-of-the-art attacks
+targeting four action recognition models, the proposed detector achieves an
+average AUC of 0.911 over 16 test cases, while the best performance achieved by
+existing detectors is an average AUC of 0.645. This 41.2% improvement is
+enabled by the robustness of the proposed detector to varying attack methods
+and target models. The lowest AUC achieved by our detector across the 16 test
+cases is 0.837, while the competing detector's performance drops as low as
+0.211. We also show that the proposed detector is robust to varying attack
+strengths. In addition, we analyze our method's real-time performance with
+different hardware setups to demonstrate its potential as a practical defense
+mechanism.
+
+
+
+
+
+ + ♻ ☆ Recursive Joint Cross-Modal Attention for Multimodal Fusion in + Dimensional Emotion Recognition + + +
+ Though multimodal emotion recognition has achieved significant progress over
+recent years, the potential of rich synergic relationships across the
+modalities is not fully exploited. In this paper, we introduce Recursive Joint
+Cross-Modal Attention (RJCMA) to effectively capture both intra- and
+inter-modal relationships across audio, visual, and text modalities for
+dimensional emotion recognition. In particular, we compute the attention
+weights based on cross-correlation between the joint audio-visual-text feature
+representations and the feature representations of individual modalities to
+simultaneously capture intra- and inter-modal relationships across the
+modalities. The attended features of the individual modalities are again fed as
+input to the fusion model in a recursive mechanism to obtain more refined
+feature representations. We have also explored Temporal Convolutional Networks
+(TCNs) to improve the temporal modeling of the feature representations of
+individual modalities. Extensive experiments are conducted to evaluate the
+performance of the proposed fusion model on the challenging Affwild2 dataset.
+By effectively capturing the synergic intra- and inter-modal relationships
+across audio, visual, and text modalities, the proposed fusion model achieves a
+Concordance Correlation Coefficient (CCC) of 0.585 (0.542) and 0.674 (0.619)
+for valence and arousal respectively on the validation set (test set). This
+shows a significant improvement over the baseline of 0.240 (0.211) and 0.200
+(0.191) for valence and arousal, respectively, on the validation set (test
+set), achieving second place in the valence-arousal challenge of the 6th
+Affective Behavior Analysis in-the-Wild (ABAW) competition.
+
+
+
+
+
+ + ♻ ☆ ShapeFormer: Shape Prior Visible-to-Amodal Transformer-based Amodal + Instance Segmentation IJCNN 2024 + + +
+ Amodal Instance Segmentation (AIS) presents a challenging task as it involves
+predicting both visible and occluded parts of objects within images. Existing
+AIS methods rely on a bidirectional approach, encompassing both the transition
+from amodal features to visible features (amodal-to-visible) and from visible
+features to amodal features (visible-to-amodal). Our observation shows that
+utilizing amodal features through the amodal-to-visible transition can confuse
+the visible features with extra information from occluded/hidden segments that
+is not present in the visible display. Consequently, this compromises the
+quality of the visible features during the subsequent visible-to-amodal
+transition. To tackle this issue, we introduce ShapeFormer, a decoupled
+Transformer-based model with a visible-to-amodal transition. It facilitates the
+explicit relationship between output segmentations and avoids the need for
+amodal-to-visible transitions. ShapeFormer comprises three key modules: (i) a
+Visible-Occluding Mask Head for predicting visible segmentation with occlusion
+awareness, (ii) a Shape-Prior Amodal Mask Head for predicting amodal and
+occluded masks, and (iii) a Category-Specific Shape Prior Retriever that
+provides shape prior knowledge. Comprehensive experiments and extensive
+ablation studies across various AIS benchmarks demonstrate the effectiveness of
+our ShapeFormer. The code is available at:
+https://github.com/UARK-AICV/ShapeFormer
+
+
+ comment: Accepted to IJCNN 2024 +
+
+
+
+
+ + ♻ ☆ Tackling Structural Hallucination in Image Translation with Local + Diffusion + + +
+ Recent developments in diffusion models have advanced conditioned image +generation, yet they struggle with reconstructing out-of-distribution (OOD) +images, such as unseen tumors in medical images, causing ``image +hallucination'' and risking misdiagnosis. We hypothesize such hallucinations +result from local OOD regions in the conditional images. We verify that +partitioning the OOD region and conducting separate image generations +alleviates hallucinations in several applications. From this, we propose a +training-free diffusion framework that reduces hallucination with multiple +Local Diffusion processes. Our approach involves OOD estimation followed by two +modules: a ``branching'' module generates locally both within and outside OOD +regions, and a ``fusion'' module integrates these predictions into one. Our +evaluation shows our method mitigates hallucination over baseline models +quantitatively and qualitatively, reducing misdiagnosis by 40% and 25% in the +real-world medical and natural image datasets, respectively. It also +demonstrates compatibility with various pre-trained diffusion models. + +
+
+
+
+
+ + ♻ ☆ Dynamic Clue Bottlenecks: Towards Interpretable-by-Design Visual + Question Answering + + +
+ Recent advances in multimodal large language models (LLMs) have proven highly
+effective in visual question answering (VQA). However, the design nature of
+these end-to-end models prevents them from being interpretable to humans,
+undermining trust and applicability in critical domains. While post-hoc
+rationales offer certain insight into understanding model behavior, these
+explanations are not guaranteed to be faithful to the model. In this paper, we
+address these shortcomings by introducing an interpretable-by-design model that
+factors model decisions into intermediate human-legible explanations, and
+allows people to easily understand why a model fails or succeeds. We propose
+the Dynamic Clue Bottleneck Model (DCLUB), a method that is designed towards
+an inherently interpretable VQA system. DCLUB provides an explainable
+intermediate space before the VQA decision and is faithful from the beginning,
+while maintaining comparable performance to black-box systems. Given a
+question, DCLUB first returns a set of visual clues: natural language
+statements of visually salient evidence from the image, and then generates the
+output based solely on the visual clues. To supervise and evaluate the
+generation of VQA explanations within DCLUB, we collect a dataset of 1.7k
+reasoning-focused questions with visual clues. Evaluations show that our
+inherently interpretable system can improve by 4.64% over a comparable
+black-box system on reasoning-focused questions while preserving 99.43% of its
+performance on VQA-v2.
+
+
+ comment: Multimodal, Visual Question Answering, Vision and Language +
+
+
+
+
+ + ♻ ☆ When are Lemons Purple? The Concept Association Bias of Vision-Language + Models EMNLP 2023 + + +
+ Large-scale vision-language models such as CLIP have shown impressive
+performance on zero-shot image classification and image-to-text retrieval.
+However, such performance does not carry over to tasks that require a
+finer-grained correspondence between vision and language, such as Visual
+Question Answering (VQA). As a potential cause of the difficulty of applying
+these models to VQA and similar tasks, we report an interesting phenomenon of
+vision-language models, which we call the Concept Association Bias (CAB). We
+find that models with CAB tend to treat the input as a bag of concepts and
+attempt to fill in the other missing concept cross-modally, leading to an
+unexpected zero-shot prediction. We demonstrate CAB by showing that CLIP's
+zero-shot classification performance greatly suffers when there is a strong
+concept association between an object (e.g. eggplant) and an attribute (e.g.
+the color purple). We also show that the strength of CAB predicts performance
+on VQA. We observe that CAB is prevalent in vision-language models trained with
+contrastive losses, even when autoregressive losses are jointly employed.
+However, a model that solely relies on autoregressive loss seems to exhibit
+minimal or no signs of CAB.
+
+
+ comment: EMNLP 2023 main +
+
+
+
+
+ + ♻ ☆ Objects With Lighting: A Real-World Dataset for Evaluating + Reconstruction and Rendering for Object Relighting 3DV 2024 + + +
+ Reconstructing an object from photos and placing it virtually in a new
+environment goes beyond the standard novel view synthesis task, as the
+appearance of the object has to adapt not only to the novel viewpoint but also
+to the new lighting conditions. Yet evaluations of inverse rendering methods
+rely on novel view synthesis data or simplistic synthetic datasets for
+quantitative analysis. This work presents a real-world dataset for measuring
+the reconstruction and rendering of objects for relighting. To this end, we
+capture the environment lighting and ground truth images of the same objects in
+multiple environments, allowing us to reconstruct the objects from images taken
+in one environment and quantify the quality of the rendered views for the
+unseen lighting environments. Further, we introduce a simple baseline composed
+of off-the-shelf methods, test several state-of-the-art methods on the
+relighting task, and show that novel view synthesis is not a reliable proxy for
+measuring performance. Code and dataset are available at
+https://github.com/isl-org/objects-with-lighting .
+
+
+ comment: Accepted at 3DV 2024, Oral presentation. For the project page see + https://github.com/isl-org/objects-with-lighting +
+
+
+
+
+ + ♻ ☆ IRAD: Implicit Representation-driven Image Resampling against + Adversarial Attacks + + +
+ We introduce a novel approach to counter adversarial attacks, namely, image +resampling. Image resampling transforms a discrete image into a new one, +simulating the process of scene recapturing or rerendering as specified by a +geometrical transformation. The underlying rationale behind our idea is that +image resampling can alleviate the influence of adversarial perturbations while +preserving essential semantic information, thereby conferring an inherent +advantage in defending against adversarial attacks. To validate this concept, +we present a comprehensive study on leveraging image resampling to defend +against adversarial attacks. We have developed basic resampling methods that +employ interpolation strategies and coordinate shifting magnitudes. Our +analysis reveals that these basic methods can partially mitigate adversarial +attacks. However, they come with apparent limitations: the accuracy of clean +images noticeably decreases, while the improvement in accuracy on adversarial +examples is not substantial. We propose implicit representation-driven image +resampling (IRAD) to overcome these limitations. First, we construct an +implicit continuous representation that enables us to represent any input image +within a continuous coordinate space. Second, we introduce SampleNet, which +automatically generates pixel-wise shifts for resampling in response to +different inputs. Furthermore, we can extend our approach to the +state-of-the-art diffusion-based method, accelerating it with fewer time steps +while preserving its defense capability. Extensive experiments demonstrate that +our method significantly enhances the adversarial robustness of diverse deep +models against various attacks while maintaining high accuracy on clean images. + +
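+ A minimal sketch of the basic resampling idea the paper builds on:
+re-sampling the image on a slightly perturbed coordinate grid with bilinear
+interpolation. The shift magnitude is an assumed hyperparameter, and this does
+not include the implicit representation or SampleNet components.
+
+import torch
+import torch.nn.functional as F
+
+def resample_image(images, max_shift=0.01):
+    # images: (B, C, H, W); coordinates and shifts are in normalized [-1, 1] space.
+    b, _, h, w = images.shape
+    ys = torch.linspace(-1, 1, h, device=images.device)
+    xs = torch.linspace(-1, 1, w, device=images.device)
+    gy, gx = torch.meshgrid(ys, xs, indexing="ij")
+    grid = torch.stack([gx, gy], dim=-1).expand(b, h, w, 2)
+    grid = grid + (torch.rand_like(grid) * 2 - 1) * max_shift
+    return F.grid_sample(images, grid, mode="bilinear", align_corners=True)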
+
+
+
+
+ + ♻ ☆ PV-SSD: A Multi-Modal Point Cloud Feature Fusion Method for Projection + Features and Variable Receptive Field Voxel Features + + +
+ LiDAR-based 3D object detection and classification is crucial for autonomous
+driving. However, real-time inference from extremely sparse 3D data is a
+formidable challenge. To address this problem, a typical class of approaches
+transforms the point cloud into a regular data representation (voxels or
+projection maps) and then performs feature extraction with convolutional
+neural networks. However, such methods often result in a certain degree of
+information loss due to down-sampling or over-compression of feature
+information. This paper proposes a multi-modal point cloud feature fusion
+method for projection features and variable receptive field voxel features
+(PV-SSD) based on projection and variable voxelization to solve the information
+loss problem. We design a two-branch feature extraction structure with a 2D
+convolutional neural network to extract the point cloud's projection features
+in bird's-eye view and focus on the correlation between local features. A voxel
+feature extraction branch is used to extract local fine-grained features.
+Meanwhile, we propose a voxel feature extraction method with variable receptive
+fields to reduce the information loss of the voxel branch due to downsampling.
+It avoids missing critical point information by selecting more useful feature
+points based on feature point weights for the detection task. In addition, we
+propose a multi-modal feature fusion module for point clouds. To validate the
+effectiveness of our method, we tested it on the KITTI and ONCE datasets.
+
+
+
+
+
+ + ♻ ☆ Adapting LLaMA Decoder to Vision Transformer + + +
+ This work examines whether decoder-only Transformers such as LLaMA, which
+were originally designed for large language models (LLMs), can be adapted to
+the computer vision field. We first "LLaMAfy" a standard ViT step-by-step to
+align with LLaMA's architecture, and find that directly applying a causal mask
+to the self-attention brings an attention collapse issue, resulting in failure
+of network training. We suggest repositioning the class token behind the image
+tokens with a post-sequence class token technique to overcome this challenge,
+enabling causal self-attention to efficiently capture the entire image's
+information. Additionally, we develop a soft mask strategy that gradually
+introduces a causal mask to the self-attention at the onset of training to
+facilitate the optimization behavior. The tailored model, dubbed image LLaMA
+(iLLaMA), is akin to LLaMA in architecture and enables direct supervised
+learning. Its causal self-attention boosts computational efficiency and learns
+complex representations by elevating attention map ranks. iLLaMA rivals the
+performance of its encoder-only counterparts, achieving 75.1% ImageNet top-1
+accuracy with only 5.7M parameters. Scaling the model to ~310M parameters and
+pre-training on ImageNet-21K further enhances the accuracy to 86.0%. Extensive
+experiments demonstrate iLLaMA's reliable properties: calibration,
+shape-texture bias, quantization compatibility, ADE20K segmentation and CIFAR
+transfer learning. We hope our study can kindle fresh views on visual model
+design in the wave of LLMs. Pre-trained models and code are available here.
+
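+ A minimal sketch of one way to realize a soft causal mask that is gradually
+annealed during training, with the class token assumed to sit at the last
+position so that even the fully causal mask lets it attend to every image
+token. The interpolation scheme here is an illustration, not necessarily the
+paper's exact schedule.
+
+import torch
+
+def soft_causal_bias(num_tokens, alpha):
+    # alpha = 0: full bidirectional attention; alpha = 1: strict causal mask.
+    causal = torch.tril(torch.ones(num_tokens, num_tokens))
+    allowed = (1 - alpha) + alpha * causal      # fades the upper triangle toward 0
+    return torch.log(allowed.clamp_min(1e-9))   # additive bias for attention logits
+
+# Example: anneal alpha from 0 to 1 over the first training epochs, then keep it at 1.
+bias = soft_causal_bias(num_tokens=197, alpha=0.5)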
+
+ comment: 22 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ LaVy: Vietnamese Multimodal Large Language Model + + +
+ Large Language Models (LLMs) and Multimodal Large Language Models (MLLMs)
+have taken the world by storm with impressive abilities in complex reasoning
+and linguistic comprehension. While there is a plethora of work on Vietnamese
+Large Language Models, the lack of high-quality multimodal resources limits the
+progress of Vietnamese MLLMs. In this paper, we pioneer in addressing this by
+introducing LaVy, a state-of-the-art Vietnamese MLLM, and we also introduce the
+LaVy-Bench benchmark, designed for evaluating MLLMs' understanding of
+Vietnamese visual language tasks. All code and model weights are public at
+https://github.com/baochi0212/LaVy
+
+
+ comment: 4 pages +
+
+
+
+
+ + ♻ ☆ Detoxifying Large Language Models via Knowledge Editing + + +
+ This paper investigates using knowledge editing techniques to detoxify Large +Language Models (LLMs). We construct a benchmark, SafeEdit, which covers nine +unsafe categories with various powerful attack prompts and equips comprehensive +metrics for systematic evaluation. We conduct experiments with several +knowledge editing approaches, indicating that knowledge editing has the +potential to efficiently detoxify LLMs with limited impact on general +performance. Then, we propose a simple yet effective baseline, dubbed +Detoxifying with Intraoperative Neural Monitoring (DINM), to diminish the +toxicity of LLMs within a few tuning steps via only one instance. We further +provide an in-depth analysis of the internal mechanism for various detoxifying +approaches, demonstrating that previous methods like SFT and DPO may merely +suppress the activations of toxic parameters, while DINM mitigates the toxicity +of the toxic parameters to a certain extent, making permanent adjustments. We +hope that these insights could shed light on future work of developing +detoxifying approaches and the underlying knowledge mechanisms of LLMs. Code +and benchmark are available at https://github.com/zjunlp/EasyEdit. + +
+
+ comment: Ongoing work. Project website: + https://zjunlp.github.io/project/SafeEdit Add and update experimental results + in Tables 1 and 3 +
+
+
+
+
+ + ♻ ☆ Inconsistency Masks: Removing the Uncertainty from Input-Pseudo-Label + Pairs + + +
+ Efficiently generating sufficient labeled data remains a major bottleneck in +deep learning, particularly for image segmentation tasks where labeling +requires significant time and effort. This study tackles this issue in a +resource-constrained environment, devoid of extensive datasets or pre-existing +models. We introduce Inconsistency Masks (IM), a novel approach that filters +uncertainty in image-pseudo-label pairs to substantially enhance segmentation +quality, surpassing traditional semi-supervised learning techniques. Employing +IM, we achieve strong segmentation results with as little as 10% labeled data, +across four diverse datasets and it further benefits from integration with +other techniques, indicating broad applicability. Notably on the ISIC 2018 +dataset, three of our hybrid approaches even outperform models trained on the +fully labeled dataset. We also present a detailed comparative analysis of +prevalent semi-supervised learning strategies, all under uniform starting +conditions, to underline our approach's effectiveness and robustness. The full +code is available at: https://github.com/MichaelVorndran/InconsistencyMasks + +
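+ A minimal sketch of the general idea of masking out disagreeing pseudo-label
+pixels so they do not contribute to the loss; the two prediction sources and
+the ignore-index convention are assumptions, not the paper's exact procedure.
+
+import numpy as np
+
+def inconsistency_mask(pred_a, pred_b):
+    # pred_a, pred_b: (H, W) class predictions from two models or augmented views.
+    return pred_a != pred_b
+
+def masked_pseudo_labels(pred_a, pred_b, ignore_index=255):
+    labels = pred_a.copy()
+    labels[inconsistency_mask(pred_a, pred_b)] = ignore_index  # skipped by the loss
+    return labels
+
+# Example with two toy 2x2 predictions.
+print(masked_pseudo_labels(np.array([[0, 1], [2, 2]]), np.array([[0, 1], [1, 2]])))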
+
+
+
+
+ + ♻ ☆ G-ACIL: Analytic Learning for Exemplar-Free Generalized Class + Incremental Learning + + +
+ Class incremental learning (CIL) trains a network on sequential tasks with +separated categories but suffers from catastrophic forgetting, where models +quickly lose previously learned knowledge when acquiring new tasks. The +generalized CIL (GCIL) aims to address the CIL problem in a more real-world +scenario, where incoming data have mixed data categories and unknown sample +size distribution, leading to intensified forgetting. Existing attempts for the +GCIL either have poor performance, or invade data privacy by saving historical +exemplars. To address this, in this paper, we propose an exemplar-free +generalized analytic class incremental learning (G-ACIL). The G-ACIL adopts +analytic learning (a gradient-free training technique), and delivers an +analytical solution (i.e., closed-form) to the GCIL scenario. This solution is +derived via decomposing the incoming data into exposed and unexposed classes, +allowing an equivalence between the incremental learning and its joint +training, i.e., the weight-invariant property. Such an equivalence is +theoretically validated through matrix analysis tools, and hence contributes +interpretability in GCIL. It is also empirically evidenced by experiments on +various datasets and settings of GCIL. The results show that the G-ACIL +exhibits leading performance with high robustness compared with existing +competitive GCIL methods. Codes will be ready at +\url{https://github.com/ZHUANGHP/Analytic-continual-learning}. + +
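+ A minimal sketch of the kind of gradient-free, closed-form (ridge-regression)
+classifier update that analytic learning builds on; the recursive,
+exemplar-free formulation of G-ACIL is not reproduced here, and the
+regularization value is an assumption.
+
+import torch
+
+def analytic_classifier(features, one_hot_labels, reg=1e-3):
+    # features: (N, D) frozen-backbone embeddings; one_hot_labels: (N, C).
+    # Closed-form weights W = (X^T X + reg * I)^(-1) X^T Y.
+    d = features.shape[1]
+    gram = features.T @ features + reg * torch.eye(d)
+    return torch.linalg.solve(gram, features.T @ one_hot_labels)   # (D, C)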
+
+
+
+
+ + ♻ ☆ CoLLaVO: Crayon Large Language and Vision mOdel + + +
+ The remarkable success of Large Language Models (LLMs) and instruction tuning +drives the evolution of Vision Language Models (VLMs) towards a versatile +general-purpose model. Yet, it remains unexplored whether current VLMs +genuinely possess quality object-level image understanding capabilities +determined from `what objects are in the image?' or `which object corresponds +to a specified bounding box?'. Our findings reveal that the image understanding +capabilities of current VLMs are strongly correlated with their zero-shot +performance on vision language (VL) tasks. This suggests that prioritizing +basic image understanding is crucial for VLMs to excel at VL tasks. To enhance +object-level image understanding, we propose Crayon Large Language and Vision +mOdel (CoLLaVO), which incorporates instruction tuning with Crayon Prompt as a +new visual prompt tuning scheme based on panoptic color maps. Furthermore, we +present a learning strategy of Dual QLoRA to preserve object-level image +understanding without forgetting it during visual instruction tuning, thereby +achieving a significant leap in numerous VL benchmarks in a zero-shot setting. + +
+
+ comment: Code available: https://github.com/ByungKwanLee/CoLLaVO +
+
+
+
+
+ + ♻ ☆ VeCAF: Vision-language Collaborative Active Finetuning with Training + Objective Awareness + + +
+ Finetuning a pretrained vision model (PVM) is a common technique for learning
+downstream vision tasks. However, the conventional finetuning process with
+randomly sampled data points results in diminished training efficiency. To
+address this drawback, we propose a novel approach, Vision-language
+Collaborative Active Finetuning (VeCAF). With the emerging availability of
+labels and natural language annotations of images through web-scale crawling or
+controlled generation, VeCAF makes use of this information to perform
+parametric data selection for PVM finetuning. VeCAF incorporates the finetuning
+objective to select significant data points that effectively guide the PVM
+towards faster convergence to meet the performance goal. This process is
+assisted by the inherent semantic richness of the text embedding space, which
+we use to augment image features. Furthermore, the flexibility of text-domain
+augmentation allows VeCAF to handle out-of-distribution scenarios without
+external data. Extensive experiments show the leading performance and high
+computational efficiency of VeCAF, which is superior to baselines in both
+in-distribution and out-of-distribution image classification tasks. On
+ImageNet, VeCAF uses up to 3.3x fewer training batches to reach the target
+performance compared to full finetuning, and achieves an accuracy improvement
+of 2.7% over the state-of-the-art active finetuning method with the same number
+of batches.
+
+
+ comment: 13 pages +
+
+
+
+
+ + ♻ ☆ FM-G-CAM: A Holistic Approach for Explainable AI in Computer Vision + + +
+ Explainability is an aspect of modern AI that is vital for impact and +usability in the real world. The main objective of this paper is to emphasise +the need to understand the predictions of Computer Vision models, specifically +Convolutional Neural Network (CNN) based models. Existing methods of explaining +CNN predictions are mostly based on Gradient-weighted Class Activation Maps +(Grad-CAM) and solely focus on a single target class. We show that from the +point of the target class selection, we make an assumption on the prediction +process, hence neglecting a large portion of the predictor CNN model's thinking +process. In this paper, we present an exhaustive methodology called Fused +Multi-class Gradient-weighted Class Activation Map (FM-G-CAM) that considers +multiple top predicted classes, which provides a holistic explanation of the +predictor CNN's thinking rationale. We also provide a detailed and +comprehensive mathematical and algorithmic description of our method. +Furthermore, along with a concise comparison of existing methods, we compare +FM-G-CAM with Grad-CAM, highlighting its benefits through real-world practical +use cases. Finally, we present an open-source Python library with FM-G-CAM +implementation to conveniently generate saliency maps for CNN-based model +predictions. + +
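+ A minimal sketch of fusing Grad-CAM-style maps from several top predicted
+classes into one saliency map. The per-class maps follow the standard Grad-CAM
+recipe, and the max-based fusion here is a simplification rather than
+FM-G-CAM's exact formulation.
+
+import torch
+import torch.nn.functional as F
+
+def fused_multiclass_cam(feature_maps, logits, top_k=3):
+    # feature_maps: (1, C, H, W) last-conv activations that require grad;
+    # logits: (1, num_classes) computed from those activations.
+    cams = []
+    for cls in logits[0].topk(top_k).indices:
+        grads = torch.autograd.grad(logits[0, cls], feature_maps, retain_graph=True)[0]
+        weights = grads.mean(dim=(2, 3), keepdim=True)            # GAP of gradients per channel
+        cams.append(F.relu((weights * feature_maps).sum(dim=1)))  # (1, H, W)
+    fused = torch.stack(cams, dim=0).max(dim=0).values
+    return fused / fused.max().clamp_min(1e-8)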
+
+
+
+
+ + ♻ ☆ Part-Attention Based Model Make Occluded Person Re-Identification + Stronger + + +
+ The goal of occluded person re-identification (ReID) is to retrieve specific
+pedestrians in occluded situations. However, occluded person ReID still suffers
+from background clutter and low-quality local feature representations, which
+limit model performance. In our research, we introduce a new framework called
+PAB-ReID, which is a novel ReID model incorporating part-attention mechanisms
+to tackle the aforementioned issues effectively. Firstly, we introduce the
+human parsing label to guide the generation of more accurate human part
+attention maps. In addition, we propose a fine-grained feature focuser for
+generating fine-grained human local feature representations while suppressing
+background interference. Moreover, we also design a part triplet loss to
+supervise the learning of human local features, which optimizes
+intra-/inter-class distances. We conducted extensive experiments on specialized
+occlusion and regular ReID datasets, showcasing that our approach outperforms
+the existing state-of-the-art methods.
+
+
+ comment: Accepted By International Joint Conference on Neural Networks 2024 +
+
+
+
+
+ + ♻ ☆ Motion2VecSets: 4D Latent Vector Set Diffusion for Non-rigid Shape + Reconstruction and Tracking + + +
+ We introduce Motion2VecSets, a 4D diffusion model for dynamic surface +reconstruction from point cloud sequences. While existing state-of-the-art +methods have demonstrated success in reconstructing non-rigid objects using +neural field representations, conventional feed-forward networks encounter +challenges with ambiguous observations from noisy, partial, or sparse point +clouds. To address these challenges, we introduce a diffusion model that +explicitly learns the shape and motion distribution of non-rigid objects +through an iterative denoising process of compressed latent representations. +The diffusion-based priors enable more plausible and probabilistic +reconstructions when handling ambiguous inputs. We parameterize 4D dynamics +with latent sets instead of using global latent codes. This novel 4D +representation allows us to learn local shape and deformation patterns, leading +to more accurate non-linear motion capture and significantly improving +generalizability to unseen motions and identities. For more temporally-coherent +object tracking, we synchronously denoise deformation latent sets and exchange +information across multiple frames. To avoid computational overhead, we +designed an interleaved space and time attention block to alternately aggregate +deformation latents along spatial and temporal domains. Extensive comparisons +against state-of-the-art methods demonstrate the superiority of our +Motion2VecSets in 4D reconstruction from various imperfect observations. More +detailed information can be found at +https://vveicao.github.io/projects/Motion2VecSets/. + +
+
+
+
+
+ + ♻ ☆ Unraveling Batch Normalization for Realistic Test-Time Adaptation AAAI 2024 + + +
+ While recent test-time adaptations exhibit efficacy by adjusting batch +normalization to narrow domain disparities, their effectiveness diminishes with +realistic mini-batches due to inaccurate target estimation. As previous +attempts merely introduce source statistics to mitigate this issue, the +fundamental problem of inaccurate target estimation still persists, leaving the +intrinsic test-time domain shifts unresolved. This paper delves into the +problem of mini-batch degradation. By unraveling batch normalization, we +discover that the inexact target statistics largely stem from the substantially +reduced class diversity in batch. Drawing upon this insight, we introduce a +straightforward tool, Test-time Exponential Moving Average (TEMA), to bridge +the class diversity gap between training and testing batches. Importantly, our +TEMA adaptively extends the scope of typical methods beyond the current batch +to incorporate a diverse set of class information, which in turn boosts an +accurate target estimation. Built upon this foundation, we further design a +novel layer-wise rectification strategy to consistently promote test-time +performance. Our proposed method enjoys a unique advantage as it requires +neither training nor tuning parameters, offering a truly hassle-free solution. +It significantly enhances model robustness against shifted domains and +maintains resilience in diverse real-world scenarios with various batch sizes, +achieving state-of-the-art performance on several major benchmarks. Code is +available at \url{https://github.com/kiwi12138/RealisticTTA}. + +
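+ A minimal sketch of test-time exponential-moving-average updates of batch
+normalization statistics, which accumulate class information beyond a single
+(possibly class-poor) test batch; the momentum value and update placement are
+assumptions, and TEMA's layer-wise rectification is not included.
+
+import torch
+
+@torch.no_grad()
+def tema_update(bn_layer, batch_activations, momentum=0.05):
+    # bn_layer: torch.nn.BatchNorm2d; batch_activations: (B, C, H, W) inputs to that layer.
+    mean = batch_activations.mean(dim=(0, 2, 3))
+    var = batch_activations.var(dim=(0, 2, 3), unbiased=False)
+    bn_layer.running_mean.mul_(1 - momentum).add_(momentum * mean)
+    bn_layer.running_var.mul_(1 - momentum).add_(momentum * var)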
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ GauU-Scene V2: Assessing the Reliability of Image-Based Metrics with + Expansive Lidar Image Dataset Using 3DGS and NeRF + + +
+ We introduce a novel, multimodal large-scale scene reconstruction benchmark +that utilizes newly developed 3D representation approaches: Gaussian Splatting +and Neural Radiance Fields (NeRF). Our expansive U-Scene dataset surpasses any +previously existing real large-scale outdoor LiDAR and image dataset in both +area and point count. GauU-Scene encompasses over 6.5 square kilometers and +features a comprehensive RGB dataset coupled with LiDAR ground truth. +Additionally, we are the first to propose a LiDAR and image alignment method +for a drone-based dataset. Our assessment of GauU-Scene includes a detailed +analysis across various novel viewpoints, employing image-based metrics such as +SSIM, LPIPS, and PSNR on NeRF and Gaussian Splatting based methods. This +analysis reveals contradictory results when applying geometric-based metrics +like Chamfer distance. The experimental results on our multimodal dataset +highlight the unreliability of current image-based metrics and reveal +significant drawbacks in geometric reconstruction using the current Gaussian +Splatting-based method, further illustrating the necessity of our dataset for +assessing geometry reconstruction tasks. We also provide detailed supplementary +information on data collection protocols and make the dataset available on the +following anonymous project page + +
+
+
+
+
+ + ♻ ☆ Weakly-Supervised 3D Visual Grounding based on Visual Linguistic + Alignment + + +
+ Learning to ground natural language queries to target objects or regions in +3D point clouds is quite essential for 3D scene understanding. Nevertheless, +existing 3D visual grounding approaches require a substantial number of +bounding box annotations for text queries, which is time-consuming and +labor-intensive to obtain. In this paper, we propose \textbf{3D-VLA}, a weakly +supervised approach for \textbf{3D} visual grounding based on \textbf{V}isual +\textbf{L}inguistic \textbf{A}lignment. Our 3D-VLA exploits the superior +ability of current large-scale vision-language models (VLMs) on aligning the +semantics between texts and 2D images, as well as the naturally existing +correspondences between 2D images and 3D point clouds, and thus implicitly +constructs correspondences between texts and 3D point clouds with no need for +fine-grained box annotations in the training procedure. During the inference +stage, the learned text-3D correspondence will help us ground the text queries +to the 3D target objects even without 2D images. To the best of our knowledge, +this is the first work to investigate 3D visual grounding in a weakly +supervised manner by involving large scale vision-language models, and +extensive experiments on ReferIt3D and ScanRefer datasets demonstrate that our +3D-VLA achieves comparable and even superior results over the fully supervised +methods. + +
+
+
+
+
+ + ♻ ☆ Recent Advances in 3D Gaussian Splatting + + +
+ The emergence of 3D Gaussian Splatting (3DGS) has greatly accelerated the +rendering speed of novel view synthesis. Unlike neural implicit representations +like Neural Radiance Fields (NeRF) that represent a 3D scene with position and +viewpoint-conditioned neural networks, 3D Gaussian Splatting utilizes a set of +Gaussian ellipsoids to model the scene so that efficient rendering can be +accomplished by rasterizing Gaussian ellipsoids into images. Apart from the +fast rendering speed, the explicit representation of 3D Gaussian Splatting +facilitates editing tasks like dynamic reconstruction, geometry editing, and +physical simulation. Considering the rapid change and growing number of works +in this field, we present a literature review of recent 3D Gaussian Splatting +methods, which can be roughly classified into 3D reconstruction, 3D editing, +and other downstream applications by functionality. Traditional point-based +rendering methods and the rendering formulation of 3D Gaussian Splatting are +also illustrated for a better understanding of this technique. This survey aims +to help beginners get into this field quickly and provide experienced +researchers with a comprehensive overview, which can stimulate the future +development of the 3D Gaussian Splatting representation. + +
+
+
+
+
+ + ♻ ☆ FreeReg: Image-to-Point Cloud Registration Leveraging Pretrained + Diffusion Models and Monocular Depth Estimators ICLR 2024 + + +
+ Matching cross-modality features between images and point clouds is a
+fundamental problem for image-to-point cloud registration. However, due to the
+modality difference between images and points, it is difficult to learn robust
+and discriminative cross-modality features with existing metric learning
+methods for feature matching. Instead of applying metric learning on
+cross-modality data, we propose to first unify the modality between images and
+point clouds by pretrained large-scale models, and then establish robust
+correspondence within the same modality. We show that the intermediate
+features, called diffusion features, extracted by depth-to-image diffusion
+models are semantically consistent between images and point clouds, which
+enables the building of coarse but robust cross-modality correspondences. We
+further extract geometric features on depth maps produced by the monocular
+depth estimator. By matching such geometric features, we significantly improve
+the accuracy of the coarse correspondences produced by diffusion features.
+Extensive experiments demonstrate that without any task-specific training,
+direct utilization of both features produces accurate image-to-point cloud
+registration. On three public indoor and outdoor benchmarks, the proposed
+method achieves, on average, a 20.6 percent improvement in Inlier Ratio, a
+three-fold higher Inlier Number, and a 48.6 percent improvement in Registration
+Recall over existing state-of-the-art methods.
+
+
+ comment: CameraReady version for ICLR 2024. Project Page: + https://whu-usi3dv.github.io/FreeReg/ +
+
+
+
+
+ + ♻ ☆ EfficientDM: Efficient Quantization-Aware Fine-Tuning of Low-Bit + Diffusion Models ICLR 2024 + + +
+ Diffusion models have demonstrated remarkable capabilities in image synthesis
+and related generative tasks. Nevertheless, their practicality for real-world
+applications is constrained by substantial computational costs and latency
+issues. Quantization is a dominant way to compress and accelerate diffusion
+models, where post-training quantization (PTQ) and quantization-aware training
+(QAT) are two main approaches, each bearing its own properties. While PTQ
+exhibits efficiency in terms of both time and data usage, it may lead to
+diminished performance at low bit-widths. On the other hand, QAT can alleviate
+performance degradation but comes with substantial demands on computational and
+data resources. In this paper, we introduce a data-free and parameter-efficient
+fine-tuning framework for low-bit diffusion models, dubbed EfficientDM, to
+achieve QAT-level performance with PTQ-like efficiency. Specifically, we
+propose a quantization-aware variant of the low-rank adapter (QALoRA) that can
+be merged with model weights and jointly quantized to low bit-width. The
+fine-tuning process distills the denoising capabilities of the full-precision
+model into its quantized counterpart, eliminating the requirement for training
+data. We also introduce scale-aware optimization and temporal learned step-size
+quantization to further enhance performance. Extensive experimental results
+demonstrate that our method significantly outperforms previous PTQ-based
+diffusion models while maintaining similar time and data efficiency.
+Specifically, there is only a 0.05 sFID increase when quantizing both weights
+and activations of LDM-4 to 4-bit on ImageNet 256x256. Compared to QAT-based
+methods, our EfficientDM also boasts a 16.2x faster quantization speed with
+comparable generation quality. Code is available at
+\href{https://github.com/ThisisBillhe/EfficientDM}{this url}.
+
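+ A minimal sketch of folding a low-rank adapter into a weight matrix and then
+applying symmetric uniform fake-quantization, illustrating the "merge, then
+quantize jointly" idea; the quantizer and scaling here are simplifications and
+not EfficientDM's exact QALoRA procedure.
+
+import torch
+
+def merge_and_quantize(weight, lora_a, lora_b, scale, num_bits=4):
+    # weight: (out, in); lora_a: (r, in); lora_b: (out, r). Merged W' = W + scale * B A.
+    merged = weight + scale * (lora_b @ lora_a)
+    qmax = 2 ** (num_bits - 1) - 1
+    step = merged.abs().max() / qmax
+    return torch.clamp(torch.round(merged / step), -qmax - 1, qmax) * step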
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ A Specific Task-oriented Semantic Image Communication System for + substation patrol inspection + + +
+ Intelligent inspection robots are widely used in substation patrol
+inspection, helping to check potential safety hazards by patrolling the
+substation and sending back scene images. However, when patrolling marginal
+areas with weak signal, the scene images cannot be successfully transmitted for
+hidden danger elimination, which greatly reduces the quality of the robots'
+daily work. To solve this problem, a Specific Task-oriented Semantic
+Communication System for Images (STSCI) is designed, which involves semantic
+feature extraction, transmission, restoration, and enhancement to obtain
+clearer images sent by intelligent robots under weak signals. Inspired by the
+fact that only some specific details of the image are needed in such a
+substation patrol inspection task, we propose a new paradigm of semantic
+enhancement for this specific task to ensure the clarity of key semantic
+information when facing a lower bit rate or a low signal-to-noise ratio
+situation. In reality-based simulations, experiments show that our STSCI
+generally surpasses traditional image-compression-based, channel-coding-based,
+and other semantic communication systems in the substation patrol inspection
+task at a lower bit rate, even under low signal-to-noise ratio conditions.
+
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Visual Tuning + + +
+ Fine-tuning visual models has been widely shown promising performance on many +downstream visual tasks. With the surprising development of pre-trained visual +foundation models, visual tuning jumped out of the standard modus operandi that +fine-tunes the whole pre-trained model or just the fully connected layer. +Instead, recent advances can achieve superior performance than full-tuning the +whole pre-trained parameters by updating far fewer parameters, enabling edge +devices and downstream applications to reuse the increasingly large foundation +models deployed on the cloud. With the aim of helping researchers get the full +picture and future directions of visual tuning, this survey characterizes a +large and thoughtful selection of recent works, providing a systematic and +comprehensive overview of existing work and models. Specifically, it provides a +detailed background of visual tuning and categorizes recent visual tuning +techniques into five groups: prompt tuning, adapter tuning, parameter tuning, +and remapping tuning. Meanwhile, it offers some exciting research directions +for prospective pre-training and various interactions in visual tuning. + +
+
+ comment: 37 pages. Accepted to ACM CSUR +
+
+
+
+
+ + ♻ ☆ The Method of Detecting Flying Birds in Surveillance Video Based on + Their Characteristics + + +
+ Aiming at the characteristics of flying bird objects in surveillance video,
+such as indistinct single-frame image features, typically small size, and
+asymmetric shapes, this paper proposes a Flying Bird Object Detection method
+for Surveillance Video (FBOD-SV). Firstly, a new feature aggregation module,
+the Correlation Attention Feature Aggregation (Co-Attention-FA) module, is
+designed to aggregate the features of the flying bird object according to the
+bird object's correlation across multiple consecutive frames. Secondly, a
+Flying Bird Object Detection Network (FBOD-Net) with down-sampling followed by
+up-sampling is designed, which uses a large feature layer that fuses fine
+spatial information and large receptive field information to detect special
+multi-scale (mostly small-scale) bird objects. Finally, the SimOTA dynamic
+label allocation method is applied to One-Category object detection, and the
+SimOTA-OC dynamic label strategy is proposed to solve the difficult problem of
+label allocation caused by irregular flying bird objects. The algorithm's
+performance is verified on an experimental dataset of surveillance video of
+flying bird objects at a traction substation. The experimental results show
+that the proposed surveillance video flying bird object detection method
+effectively improves the detection performance of flying bird objects.
+
+
+
+
+
+ + ♻ ☆ UNK-VQA: A Dataset and a Probe into the Abstention Ability of + Multi-modal Large Models + + +
+ Teaching Visual Question Answering (VQA) models to refrain from answering unanswerable questions is necessary for building a trustworthy AI system. Existing studies, though they have explored various aspects of VQA, have somewhat ignored this particular attribute. This paper aims to bridge the research gap by contributing a comprehensive dataset, called UNK-VQA. The dataset is specifically designed to address the challenge of questions that models do not know. To this end, we first augment the existing data via deliberate perturbations on either the image or the question. Specifically, we carefully ensure that the question-image semantics remain close to the original unperturbed distribution. By this means, the identification of unanswerable questions becomes challenging, setting our dataset apart from others that involve mere image replacement. We then extensively evaluate the zero- and few-shot performance of several emerging multi-modal large models and discover their significant limitations when applied to our dataset. Additionally, we propose a straightforward method to tackle these unanswerable questions. This dataset, we believe, will serve as a valuable benchmark for enhancing the abstention capability of VQA models, thereby leading to increased trustworthiness of AI systems. We have made the dataset (https://github.com/guoyang9/UNK-VQA) available to facilitate further exploration in this area.
+
+
+
+
+ + ♻ ☆ M$^{2}$Chat: Empowering VLM for Multimodal LLM Interleaved Text-Image + Generation + + +
+ While current LLM chatbots like GPT-4V bridge the gap between human instructions and visual representations to enable text-image generations, they still lack efficient alignment methods for high-fidelity performance on multiple downstream tasks. In this paper, we propose \textbf{$M^{2}Chat$}, a novel unified multimodal LLM framework for generating interleaved text-image conversation across various scenarios. Specifically, we propose an $M^{3}Adapter$ that efficiently integrates granular low-level visual information and high-level semantic features from multi-modality prompts. Upon the well-aligned fused feature, $M^{3}Adapter$ tailors a learnable gating strategy to balance the model creativity and consistency across various tasks adaptively. Moreover, to further enhance the effectiveness of $M^{3}Adapter$ while preserving the coherence of semantic context comprehension, we introduce a two-stage $M^{3}FT$ fine-tuning strategy. This strategy optimizes disjoint groups of parameters for image-text alignment and visual-instruction respectively. Extensive experiments demonstrate our $M^{2}Chat$ surpasses state-of-the-art counterparts across diverse benchmarks, showcasing its prowess in interleaving generation, storytelling, and multimodal dialogue systems. The demo and code are available at https://mattie-e.github.io/M2Chat.github.io.
+
+
+
+
+ + ♻ ☆ UAV-Rain1k: A Benchmark for Raindrop Removal from UAV Aerial Imagery CVPR + + +
+ Raindrops adhering to the lens of UAVs can obstruct visibility of the background scene and degrade image quality. Despite recent progress in image deraining methods and datasets, there is a lack of focus on raindrop removal from UAV aerial imagery due to the unique challenges posed by varying angles and rapid movement during drone flight. To fill this research gap, we first construct a new benchmark dataset for removing raindrops from UAV images, called UAV-Rain1k. In this letter, we provide a dataset generation pipeline, which includes modeling raindrop shapes using Blender, collecting background images from various UAV angles, random sampling of rain masks, and so on. Based on the proposed benchmark, we further present a comprehensive evaluation of existing representative image deraining algorithms, and reveal future research opportunities worth exploring. The proposed dataset is publicly available at https://github.com/cschenxiang/UAV-Rain1k.
+
+ comment: Accepted by IEEE/CVF Conference on Computer Vision and Pattern + Recognition Workshops (CVPRW) 2024 +
+
+
+
+
+ + ♻ ☆ Multi-scale Attention Network for Single Image Super-Resolution + + +
+ ConvNets can compete with transformers in high-level tasks by exploiting larger receptive fields. To unleash the potential of ConvNets in super-resolution, we propose a multi-scale attention network (MAN), which couples the classical multi-scale mechanism with emerging large kernel attention. In particular, we propose a multi-scale large kernel attention (MLKA) module and a gated spatial attention unit (GSAU). Through our MLKA, we modify large kernel attention with multi-scale and gate schemes to obtain abundant attention maps at various granularity levels, thereby aggregating global and local information and avoiding potential blocking artifacts. In GSAU, we integrate a gate mechanism and spatial attention to remove the unnecessary linear layer and aggregate informative spatial context. To confirm the effectiveness of our designs, we evaluate MAN with multiple complexities by simply stacking different numbers of MLKA and GSAU blocks. Experimental results illustrate that our MAN can perform on par with SwinIR and achieve varied trade-offs between state-of-the-art performance and computation.
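+ A minimal PyTorch sketch of the multi-scale large-kernel attention idea is given below; the kernel sizes, fusion layer, and sigmoid gating are illustrative assumptions, not the authors' exact MLKA/GSAU design.

```python
import torch
import torch.nn as nn

class MultiScaleLargeKernelAttention(nn.Module):
    """Toy multi-scale large-kernel attention: depthwise convs of several
    kernel sizes produce attention maps that gate the input features.
    Kernel sizes and fusion are illustrative choices only."""
    def __init__(self, channels, kernel_sizes=(7, 13, 21)):
        super().__init__()
        self.branches = nn.ModuleList([
            nn.Conv2d(channels, channels, k, padding=k // 2, groups=channels)
            for k in kernel_sizes
        ])
        self.fuse = nn.Conv2d(channels * len(kernel_sizes), channels, 1)

    def forward(self, x):
        attn = self.fuse(torch.cat([b(x) for b in self.branches], dim=1))
        return x * torch.sigmoid(attn)   # gated attention map applied to input

if __name__ == "__main__":
    feats = torch.randn(1, 32, 48, 48)
    print(MultiScaleLargeKernelAttention(32)(feats).shape)  # (1, 32, 48, 48)
```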
+
+
+
+
+ + ♻ ☆ Generating Enhanced Negatives for Training Language-Based Object + Detectors CVPR 2024 + + +
+ The recent progress in language-based open-vocabulary object detection can be +largely attributed to finding better ways of leveraging large-scale data with +free-form text annotations. Training such models with a discriminative +objective function has proven successful, but requires good positive and +negative samples. However, the free-form nature and the open vocabulary of +object descriptions make the space of negatives extremely large. Prior works +randomly sample negatives or use rule-based techniques to build them. In +contrast, we propose to leverage the vast knowledge built into modern +generative models to automatically build negatives that are more relevant to +the original data. Specifically, we use large-language-models to generate +negative text descriptions, and text-to-image diffusion models to also generate +corresponding negative images. Our experimental analysis confirms the relevance +of the generated negative data, and its use in language-based detectors +improves performance on two complex benchmarks. Code is available at +\url{https://github.com/xiaofeng94/Gen-Enhanced-Negs}. + +
+
+ comment: Accepted to CVPR 2024. The supplementary document included +
+
+
+
+
+ + ♻ ☆ LAKE-RED: Camouflaged Images Generation by Latent Background Knowledge + Retrieval-Augmented Diffusion CVPR 2024 + + +
+ Camouflaged vision perception is an important vision task with numerous practical applications. Due to expensive collection and labeling costs, this community faces a major bottleneck: the species categories of its datasets are limited to a small number of object species. However, existing camouflaged generation methods require specifying the background manually, and thus fail to extend camouflaged sample diversity in a low-cost manner. In this paper, we propose Latent Background Knowledge Retrieval-Augmented Diffusion (LAKE-RED) for camouflaged image generation. To our knowledge, our contributions mainly include: (1) For the first time, we propose a camouflaged generation paradigm that does not need to receive any background inputs. (2) Our LAKE-RED is the first knowledge retrieval-augmented method with interpretability for camouflaged generation, in which knowledge retrieval and reasoning enhancement are explicitly separated to alleviate the task-specific challenges. Moreover, our method is not restricted to specific foreground targets or backgrounds, offering the potential for extending camouflaged vision perception to more diverse domains. (3) Experimental results demonstrate that our method outperforms the existing approaches, generating more realistic camouflage images.
+
+ comment: Accepted by CVPR 2024, Fig.3 revised +
+
+
+
+
+ + ♻ ☆ Taming Self-Training for Open-Vocabulary Object Detection CVPR 2024 + + +
+ Recent studies have shown promising performance in open-vocabulary object +detection (OVD) by utilizing pseudo labels (PLs) from pretrained vision and +language models (VLMs). However, teacher-student self-training, a powerful and +widely used paradigm to leverage PLs, is rarely explored for OVD. This work +identifies two challenges of using self-training in OVD: noisy PLs from VLMs +and frequent distribution changes of PLs. To address these challenges, we +propose SAS-Det that tames self-training for OVD from two key perspectives. +First, we present a split-and-fusion (SAF) head that splits a standard +detection into an open-branch and a closed-branch. This design can reduce noisy +supervision from pseudo boxes. Moreover, the two branches learn complementary +knowledge from different training data, significantly enhancing performance +when fused together. Second, in our view, unlike in closed-set tasks, the PL +distributions in OVD are solely determined by the teacher model. We introduce a +periodic update strategy to decrease the number of updates to the teacher, +thereby decreasing the frequency of changes in PL distributions, which +stabilizes the training process. Extensive experiments demonstrate SAS-Det is +both efficient and effective. SAS-Det outperforms recent models of the same +scale by a clear margin and achieves 37.4 AP50 and 29.1 APr on novel categories +of the COCO and LVIS benchmarks, respectively. Code is available at +\url{https://github.com/xiaofeng94/SAS-Det}. + +
+
+ comment: Accepted to CVPR 2024. The supplementary document included +
+
+
+
+
+ + ♻ ☆ Segment Anything Model for Road Network Graph Extraction CVPR + + +
+ We propose SAM-Road, an adaptation of the Segment Anything Model (SAM) for +extracting large-scale, vectorized road network graphs from satellite imagery. +To predict graph geometry, we formulate it as a dense semantic segmentation +task, leveraging the inherent strengths of SAM. The image encoder of SAM is +fine-tuned to produce probability masks for roads and intersections, from which +the graph vertices are extracted via simple non-maximum suppression. To predict +graph topology, we designed a lightweight transformer-based graph neural +network, which leverages the SAM image embeddings to estimate the edge +existence probabilities between vertices. Our approach directly predicts the +graph vertices and edges for large regions without expensive and complex +post-processing heuristics, and is capable of building complete road network +graphs spanning multiple square kilometers in a matter of seconds. With its +simple, straightforward, and minimalist design, SAM-Road achieves comparable +accuracy with the state-of-the-art method RNGDet++, while being 40 times faster +on the City-scale dataset. We thus demonstrate the power of a foundational +vision model when applied to a graph learning task. The code is available at +https://github.com/htcr/sam_road. + +
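+ The vertex-extraction step described above (non-maximum suppression over road/intersection probability masks) could look roughly like the sketch below; the window size and threshold are assumptions for illustration, not values from the paper.

```python
import numpy as np
from scipy.ndimage import maximum_filter

def extract_vertices(prob_mask, threshold=0.5, window=9):
    """Keep pixels that are local maxima of the probability mask and exceed a
    confidence threshold; return their (row, col) coordinates as graph vertices."""
    local_max = maximum_filter(prob_mask, size=window) == prob_mask
    keep = local_max & (prob_mask >= threshold)
    return np.argwhere(keep)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    mask = rng.random((64, 64)) * 0.4          # low-confidence background
    mask[10, 20] = mask[40, 50] = 0.95         # two synthetic intersection peaks
    print(extract_vertices(mask))              # -> [[10 20], [40 50]]
```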
+
+ comment: Accepted by IEEE/CVF Computer Vision and Pattern Recognition + Conference (CVPR) 2024, 2nd Workshop on Scene Graphs and Graph Representation + Learning +
+
+
+
+
+ + ♻ ☆ ComCLIP: Training-Free Compositional Image and Text Matching + + +
+ Contrastive Language-Image Pretraining (CLIP) has demonstrated great zero-shot performance for matching images and text. However, it is still challenging to adapt vision-language pretrained models like CLIP to compositional image and text matching -- a more challenging image and text matching task requiring the model's understanding of compositional word concepts and visual components. Towards better compositional generalization in zero-shot image and text matching, in this paper we study the problem from a causal perspective: the erroneous semantics of individual entities are essentially confounders that cause the matching failure. Therefore, we propose a novel \textbf{\textit{training-free}} compositional CLIP model (ComCLIP). ComCLIP disentangles input images into subjects, objects, and action sub-images and composes CLIP's vision encoder and text encoder to perform evolving matching over compositional text embedding and sub-image embeddings. In this way, ComCLIP can mitigate spurious correlations introduced by the pretrained CLIP models and dynamically evaluate the importance of each component. Experiments on four compositional image-text matching datasets: SVO, ComVG, Winoground, and VL-checklist, and two general image-text retrieval datasets: Flickr30K and MSCOCO, demonstrate the effectiveness of our plug-and-play method, which boosts the \textbf{\textit{zero-shot}} inference ability of CLIP, SLIP, and BLIP2 even without further training or fine-tuning. Our code can be found at https://github.com/eric-ai-lab/ComCLIP.
+
+
+
+
+ + ♻ ☆ NICEST: Noisy Label Correction and Training for Robust Scene Graph + Generation CVPR'22 + + +
+ Nearly all existing scene graph generation (SGG) models have overlooked the ground-truth annotation quality of mainstream SGG datasets, i.e., they assume: 1) all the manually annotated positive samples are equally correct; 2) all the un-annotated negative samples are absolutely background. In this paper, we argue that neither assumption applies to SGG: there are numerous noisy ground-truth predicate labels that break these two assumptions and harm the training of unbiased SGG models. To this end, we propose a novel NoIsy label CorrEction and Sample Training strategy for SGG: NICEST. Specifically, it consists of two parts, NICE and NIST, which address these noisy-label issues by generating high-quality samples and by an effective training strategy, respectively. NICE first detects noisy samples and then reassigns them higher-quality soft predicate labels. NIST is a multi-teacher knowledge-distillation-based training strategy that enables the model to learn unbiased fusion knowledge, and a dynamic trade-off weighting strategy in NIST is designed to penalize the bias of different teachers. Due to the model-agnostic nature of both NICE and NIST, our NICEST can be seamlessly incorporated into any SGG architecture to boost its performance on different predicate categories. In addition, to better evaluate the generalization of SGG models, we further propose a new benchmark, VG-OOD, by re-organizing the prevalent VG dataset and deliberately making the predicate distributions of the training and test sets as different as possible for each subject-object category pair. This new benchmark helps disentangle the influence of subject-object category based frequency biases. Extensive ablations and results on different backbones and tasks have attested to the effectiveness and generalization ability of each component of NICEST.
+
+ comment: Extension of CVPR'22 work (The Devil is in the Labels: Noisy Label + Correction for Robust Scene Graph Generation). arXiv admin note: substantial + text overlap with arXiv:2206.03014 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 148 + +
+
+
+ + ☆ EventEgo3D: 3D Human Motion Capture from Egocentric Event Streams CVPR + + +
+ Monocular egocentric 3D human motion capture is a challenging and actively +researched problem. Existing methods use synchronously operating visual sensors +(e.g. RGB cameras) and often fail under low lighting and fast motions, which +can be restricting in many applications involving head-mounted devices. In +response to the existing limitations, this paper 1) introduces a new problem, +i.e., 3D human motion capture from an egocentric monocular event camera with a +fisheye lens, and 2) proposes the first approach to it called EventEgo3D +(EE3D). Event streams have high temporal resolution and provide reliable cues +for 3D human motion capture under high-speed human motions and rapidly changing +illumination. The proposed EE3D framework is specifically tailored for learning +with event streams in the LNES representation, enabling high 3D reconstruction +accuracy. We also design a prototype of a mobile head-mounted device with an +event camera and record a real dataset with event observations and the +ground-truth 3D human poses (in addition to the synthetic dataset). Our EE3D +demonstrates robustness and superior 3D accuracy compared to existing solutions +across various challenging experiments while supporting real-time 3D pose +update rates of 140Hz. + +
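+ The LNES representation mentioned above rasterizes an event stream into a frame-like tensor. The snippet below is only a rough, assumed approximation of such an event surface (a two-channel frame holding window-normalized timestamps per polarity); the paper's exact LNES formulation may differ.

```python
import numpy as np

def events_to_frame(x, y, t, p, height, width, t_start, t_end):
    """Rasterize events (x, y, timestamp, polarity in {0, 1}) into a 2-channel
    frame storing the window-normalized timestamp of the most recent event per
    pixel and polarity. Illustrative LNES-style encoding, not the paper's exact one."""
    frame = np.zeros((2, height, width), dtype=np.float32)
    t_norm = (t - t_start) / max(t_end - t_start, 1e-9)
    for xi, yi, ti, pi in zip(x, y, t_norm, p):
        frame[pi, yi, xi] = max(frame[pi, yi, xi], ti)
    return frame

if __name__ == "__main__":
    xs = np.array([3, 3, 10]); ys = np.array([5, 5, 2])
    ts = np.array([0.01, 0.02, 0.015]); ps = np.array([0, 1, 0])
    print(events_to_frame(xs, ys, ts, ps, 8, 16, 0.0, 0.02)[0, 5, 3])  # 0.5
```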
+
+ comment: 14 pages, 11 figures and 6 tables; project page: + https://4dqv.mpi-inf.mpg.de/EventEgo3D/; Computer Vision and Pattern + Recognition (CVPR) 2024 +
+
+
+
+
+ + ☆ COCONut: Modernizing COCO Segmentation CVPR2024 + + +
+ In recent decades, the vision community has witnessed remarkable progress in +visual recognition, partially owing to advancements in dataset benchmarks. +Notably, the established COCO benchmark has propelled the development of modern +detection and segmentation systems. However, the COCO segmentation benchmark +has seen comparatively slow improvement over the last decade. Originally +equipped with coarse polygon annotations for thing instances, it gradually +incorporated coarse superpixel annotations for stuff regions, which were +subsequently heuristically amalgamated to yield panoptic segmentation +annotations. These annotations, executed by different groups of raters, have +resulted not only in coarse segmentation masks but also in inconsistencies +between segmentation types. In this study, we undertake a comprehensive +reevaluation of the COCO segmentation annotations. By enhancing the annotation +quality and expanding the dataset to encompass 383K images with more than 5.18M +panoptic masks, we introduce COCONut, the COCO Next Universal segmenTation +dataset. COCONut harmonizes segmentation annotations across semantic, instance, +and panoptic segmentation with meticulously crafted high-quality masks, and +establishes a robust benchmark for all segmentation tasks. To our knowledge, +COCONut stands as the inaugural large-scale universal segmentation dataset, +verified by human raters. We anticipate that the release of COCONut will +significantly contribute to the community's ability to assess the progress of +novel neural networks. + +
+
+ comment: Accepted at CVPR2024, data available at + https://xdeng7.github.io/coconut.github.io/ +
+
+
+
+
+ + ☆ Probing the 3D Awareness of Visual Foundation Models CVPR 2024 + + +
+ Recent advances in large-scale pretraining have yielded visual foundation +models with strong capabilities. Not only can recent models generalize to +arbitrary images for their training task, their intermediate representations +are useful for other visual tasks such as detection and segmentation. Given +that such models can classify, delineate, and localize objects in 2D, we ask +whether they also represent their 3D structure? In this work, we analyze the 3D +awareness of visual foundation models. We posit that 3D awareness implies that +representations (1) encode the 3D structure of the scene and (2) consistently +represent the surface across views. We conduct a series of experiments using +task-specific probes and zero-shot inference procedures on frozen features. Our +experiments reveal several limitations of the current models. Our code and +analysis can be found at https://github.com/mbanani/probe3d. + +
+
+ comment: Accepted to CVPR 2024. Project page: + https://github.com/mbanani/probe3d +
+
+
+
+
+ + ☆ Automatic Quantification of Serial PET/CT Images for Pediatric Hodgkin + Lymphoma Patients Using a Longitudinally-Aware Segmentation Network + + +
+ $\textbf{Purpose}$: Automatic quantification of longitudinal changes in PET +scans for lymphoma patients has proven challenging, as residual disease in +interim-therapy scans is often subtle and difficult to detect. Our goal was to +develop a longitudinally-aware segmentation network (LAS-Net) that can quantify +serial PET/CT images for pediatric Hodgkin lymphoma patients. +$\textbf{Materials and Methods}$: This retrospective study included baseline +(PET1) and interim (PET2) PET/CT images from 297 patients enrolled in two +Children's Oncology Group clinical trials (AHOD1331 and AHOD0831). LAS-Net +incorporates longitudinal cross-attention, allowing relevant features from PET1 +to inform the analysis of PET2. Model performance was evaluated using Dice +coefficients for PET1 and detection F1 scores for PET2. Additionally, we +extracted and compared quantitative PET metrics, including metabolic tumor +volume (MTV) and total lesion glycolysis (TLG) in PET1, as well as qPET and +$\Delta$SUVmax in PET2, against physician measurements. We quantified their +agreement using Spearman's $\rho$ correlations and employed bootstrap +resampling for statistical analysis. $\textbf{Results}$: LAS-Net detected +residual lymphoma in PET2 with an F1 score of 0.606 (precision/recall: +0.615/0.600), outperforming all comparator methods (P<0.01). For baseline +segmentation, LAS-Net achieved a mean Dice score of 0.772. In PET +quantification, LAS-Net's measurements of qPET, $\Delta$SUVmax, MTV and TLG +were strongly correlated with physician measurements, with Spearman's $\rho$ of +0.78, 0.80, 0.93 and 0.96, respectively. The performance remained high, with a +slight decrease, in an external testing cohort. $\textbf{Conclusion}$: LAS-Net +achieved high performance in quantifying PET metrics across serial scans, +highlighting the value of longitudinal awareness in evaluating multi-time-point +imaging datasets. + +
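+ The longitudinal cross-attention described above lets baseline-scan (PET1) features inform the analysis of the interim scan (PET2). A minimal PyTorch sketch of that pattern is shown below, with PET2 tokens as queries and PET1 tokens as keys/values; the token layout and dimensions are assumptions, not LAS-Net's actual architecture.

```python
import torch
import torch.nn as nn

class LongitudinalCrossAttention(nn.Module):
    """Toy cross-attention block: interim-scan (PET2) tokens attend to
    baseline-scan (PET1) tokens. Dimensions are illustrative only."""
    def __init__(self, dim=64, heads=4):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, pet2_tokens, pet1_tokens):
        attended, _ = self.attn(query=pet2_tokens, key=pet1_tokens,
                                value=pet1_tokens)
        return self.norm(pet2_tokens + attended)  # residual connection + norm

if __name__ == "__main__":
    pet1 = torch.randn(2, 128, 64)   # (batch, tokens, feature dim)
    pet2 = torch.randn(2, 128, 64)
    print(LongitudinalCrossAttention()(pet2, pet1).shape)  # (2, 128, 64)
```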
+
+ comment: 6 figures, 4 tables in the main text +
+
+
+
+
+ + ☆ Training-free Boost for Open-Vocabulary Object Detection with Confidence + Aggregation + + +
+ Open-vocabulary object detection (OVOD) aims at localizing and recognizing visual objects from novel classes unseen at training time. However, empirical studies reveal that advanced detectors generally assign lower scores to those novel instances, which are inadvertently suppressed during inference by commonly adopted greedy strategies like Non-Maximum Suppression (NMS), leading to sub-optimal detection performance for novel classes. This paper systematically investigates this problem within the commonly adopted two-stage OVOD paradigm. Specifically, in the region-proposal stage, proposals that contain novel instances showcase lower objectness scores, since they are treated as background proposals during the training phase. Meanwhile, in the object-classification stage, novel objects share lower region-text similarities (i.e., classification scores) due to the visual-language alignment being biased toward seen training samples. To alleviate this problem, this paper introduces two advanced measures to adjust confidence scores and conserve erroneously dismissed objects: (1) a class-agnostic localization quality estimate via the overlap degree of region/object proposals, and (2) a text-guided visual similarity estimate with proxy prototypes for novel classes. Integrated with adjusting techniques specifically designed for the region-proposal and object-classification stages, this paper derives the aggregated confidence estimate for the open-vocabulary object detection paradigm (AggDet). Our AggDet is a generic and training-free post-processing scheme, which consistently bolsters open-vocabulary detectors across model scales and architecture designs. For instance, AggDet achieves 3.3% and 1.5% gains on the OV-COCO and OV-LVIS benchmarks respectively, without any training cost.
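+ Schematically, such post-hoc confidence aggregation combines the classification score with a localization-quality estimate and a proxy-prototype similarity. The sketch below uses a weighted geometric mean purely for illustration; the weighting scheme is an assumption, not AggDet's actual formula.

```python
import numpy as np

def aggregate_confidence(cls_score, loc_quality, proxy_sim, alpha=0.5, beta=0.25):
    """Combine the classification score with a localization-quality estimate
    and a text-guided proxy similarity via a weighted geometric mean.
    Weights and functional form are illustrative assumptions."""
    return (cls_score ** (1 - alpha - beta)) * (loc_quality ** alpha) * (proxy_sim ** beta)

if __name__ == "__main__":
    cls_score = np.array([0.30, 0.25])     # low scores typical for novel classes
    loc_quality = np.array([0.85, 0.40])   # overlap-based quality estimate
    proxy_sim = np.array([0.90, 0.30])     # text-guided visual similarity
    print(aggregate_confidence(cls_score, loc_quality, proxy_sim))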
+
+
+
+
+ + ☆ Improving Referring Image Segmentation using Vision-Aware Text Features + + +
+ Referring image segmentation is a challenging task that involves generating +pixel-wise segmentation masks based on natural language descriptions. Existing +methods have relied mostly on visual features to generate the segmentation +masks while treating text features as supporting components. This over-reliance +on visual features can lead to suboptimal results, especially in complex +scenarios where text prompts are ambiguous or context-dependent. To overcome +these challenges, we present a novel framework VATEX to improve referring image +segmentation by enhancing object and context understanding with Vision-Aware +Text Feature. Our method involves using CLIP to derive a CLIP Prior that +integrates an object-centric visual heatmap with text description, which can be +used as the initial query in DETR-based architecture for the segmentation task. +Furthermore, by observing that there are multiple ways to describe an instance +in an image, we enforce feature similarity between text variations referring to +the same visual input by two components: a novel Contextual Multimodal Decoder +that turns text embeddings into vision-aware text features, and a Meaning +Consistency Constraint to ensure further the coherent and consistent +interpretation of language expressions with the context understanding obtained +from the image. Our method achieves a significant performance improvement on +three benchmark datasets RefCOCO, RefCOCO+ and G-Ref. Code is available at: +https://nero1342.github.io/VATEX\_RIS. + +
+
+ comment: 30 pages including supplementary +
+
+
+
+
+ + ☆ Enhancing Visual Question Answering through Question-Driven Image + Captions as Prompts CVPR 2024 + + +
+ Visual question answering (VQA) is known as an AI-complete task as it +requires understanding, reasoning, and inferring about the vision and the +language content. Over the past few years, numerous neural architectures have +been suggested for the VQA problem. However, achieving success in zero-shot VQA +remains a challenge due to its requirement for advanced generalization and +reasoning skills. This study explores the impact of incorporating image +captioning as an intermediary process within the VQA pipeline. Specifically, we +explore the efficacy of utilizing image captions instead of images and +leveraging large language models (LLMs) to establish a zero-shot setting. Since +image captioning is the most crucial step in this process, we compare the +impact of state-of-the-art image captioning models on VQA performance across +various question types in terms of structure and semantics. We propose a +straightforward and efficient question-driven image captioning approach within +this pipeline to transfer contextual information into the question-answering +(QA) model. This method involves extracting keywords from the question, +generating a caption for each image-question pair using the keywords, and +incorporating the question-driven caption into the LLM prompt. We evaluate the +efficacy of using general-purpose and question-driven image captions in the VQA +pipeline. Our study highlights the potential of employing image captions and +harnessing the capabilities of LLMs to achieve competitive performance on GQA +under the zero-shot setting. Our code is available at +\url{https://github.com/ovguyo/captions-in-VQA}. + +
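+ The pipeline above extracts keywords from the question, generates a question-driven caption, and folds that caption into the LLM prompt. The toy sketch below mirrors that flow; the stop-word filter, the `generate_caption` stub, and the prompt wording are all hypothetical stand-ins, not the paper's implementation.

```python
STOPWORDS = {"what", "is", "the", "a", "an", "of", "in", "on", "are", "there",
             "how", "many", "which", "does", "do", "this", "that"}

def extract_keywords(question):
    """Very naive keyword extraction: drop stop words and punctuation."""
    return [w.strip("?.,") for w in question.lower().split()
            if w.strip("?.,") not in STOPWORDS]

def generate_caption(image_path, keywords):
    """Placeholder for a keyword-conditioned captioning model."""
    return f"A photo showing {', '.join(keywords)}."

def build_vqa_prompt(image_path, question):
    keywords = extract_keywords(question)
    caption = generate_caption(image_path, keywords)
    return (f"Image description: {caption}\n"
            f"Question: {question}\n"
            f"Answer with a single word or short phrase.")

if __name__ == "__main__":
    print(build_vqa_prompt("img_001.jpg", "What color is the umbrella on the beach?"))
```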
+
+ comment: The paper has been accepted for presentation at CVPR 2024 Workshop on + Prompting in Vision +
+
+
+
+
+ + ☆ Advanced wood species identification based on multiple anatomical + sections and using deep feature transfer and fusion + + +
+ In recent years, we have seen many advancements in wood species +identification. Methods like DNA analysis, Near Infrared (NIR) spectroscopy, +and Direct Analysis in Real Time (DART) mass spectrometry complement the +long-established wood anatomical assessment of cell and tissue morphology. +However, most of these methods have some limitations such as high costs, the +need for skilled experts for data interpretation, and the lack of good datasets +for professional reference. Therefore, most of these methods, and certainly the +wood anatomical assessment, may benefit from tools based on Artificial +Intelligence. In this paper, we apply two transfer learning techniques with +Convolutional Neural Networks (CNNs) to a multi-view Congolese wood species +dataset including sections from different orientations and viewed at different +microscopic magnifications. We explore two feature extraction methods in +detail, namely Global Average Pooling (GAP) and Random Encoding of Aggregated +Deep Activation Maps (RADAM), for efficient and accurate wood species +identification. Our results indicate superior accuracy on diverse datasets and +anatomical sections, surpassing the results of other methods. Our proposal +represents a significant advancement in wood species identification, offering a +robust tool to support the conservation of forest ecosystems and promote +sustainable forestry practices. + +
+
+ comment: 33 pages, 7 tables, 9 figures +
+
+
+
+
+ + ☆ Pathological Primitive Segmentation Based on Visual Foundation Model + with Zero-Shot Mask Generation + + +
+ Medical image processing usually requires a model trained with carefully +crafted datasets due to unique image characteristics and domain-specific +challenges, especially in pathology. Primitive detection and segmentation in +digitized tissue samples are essential for objective and automated diagnosis +and prognosis of cancer. SAM (Segment Anything Model) has recently been +developed to segment general objects from natural images with high accuracy, +but it requires human prompts to generate masks. In this work, we present a +novel approach that adapts pre-trained natural image encoders of SAM for +detection-based region proposals. Regions proposed by a pre-trained encoder are +sent to cascaded feature propagation layers for projection. Then, local +semantic and global context is aggregated from multi-scale for bounding box +localization and classification. Finally, the SAM decoder uses the identified +bounding boxes as essential prompts to generate a comprehensive primitive +segmentation map. The entire base framework, SAM, requires no additional +training or fine-tuning but could produce an end-to-end result for two +fundamental segmentation tasks in pathology. Our method compares with +state-of-the-art models in F1 score for nuclei detection and binary/multiclass +panoptic(bPQ/mPQ) and mask quality(dice) for segmentation quality on the +PanNuke dataset while offering end-to-end efficiency. Our model also achieves +remarkable Average Precision (+4.5%) on the secondary dataset (HuBMAP Kidney) +compared to Faster RCNN. The code is publicly available at +https://github.com/learner-codec/autoprom_sam. + +
+
+ comment: 2024 IEEE International Symposium on Biomedical Imaging +
+
+
+
+
+ + ☆ FashionFail: Addressing Failure Cases in Fashion Object Detection and + Segmentation IJCNN + + +
+ In the realm of fashion object detection and segmentation for online shopping +images, existing state-of-the-art fashion parsing models encounter limitations, +particularly when exposed to non-model-worn apparel and close-up shots. To +address these failures, we introduce FashionFail; a new fashion dataset with +e-commerce images for object detection and segmentation. The dataset is +efficiently curated using our novel annotation tool that leverages recent +foundation models. The primary objective of FashionFail is to serve as a test +bed for evaluating the robustness of models. Our analysis reveals the +shortcomings of leading models, such as Attribute-Mask R-CNN and Fashionformer. +Additionally, we propose a baseline approach using naive data augmentation to +mitigate common failure cases and improve model robustness. Through this work, +we aim to inspire and support further research in fashion item detection and +segmentation for industrial applications. The dataset, annotation tool, code, +and models are available at \url{https://rizavelioglu.github.io/fashionfail/}. + +
+
+ comment: to be published in 2024 International Joint Conference on Neural + Networks (IJCNN) +
+
+
+
+
+ + ☆ Lossy Image Compression with Foundation Diffusion Models + + +
+ Incorporating diffusion models in the image compression domain has the +potential to produce realistic and detailed reconstructions, especially at +extremely low bitrates. Previous methods focus on using diffusion models as +expressive decoders robust to quantization errors in the conditioning signals, +yet achieving competitive results in this manner requires costly training of +the diffusion model and long inference times due to the iterative generative +process. In this work we formulate the removal of quantization error as a +denoising task, using diffusion to recover lost information in the transmitted +image latent. Our approach allows us to perform less than 10\% of the full +diffusion generative process and requires no architectural changes to the +diffusion model, enabling the use of foundation models as a strong prior +without additional fine tuning of the backbone. Our proposed codec outperforms +previous methods in quantitative realism metrics, and we verify that our +reconstructions are qualitatively preferred by end users, even when other +methods use twice the bitrate. + +
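+ The core idea above is to treat quantization error in the transmitted latent as noise and run only a small fraction of the reverse diffusion process. The toy loop below illustrates that idea with a placeholder `denoiser`; the noise schedule, step count, and update rule are assumptions and do not reproduce the paper's codec or real DDPM math.

```python
import numpy as np

def quantize(latent, step=0.25):
    """Uniform quantization as a stand-in for the transmitted latent."""
    return np.round(latent / step) * step

def denoiser(latent, noise_level):
    """Placeholder for a pretrained diffusion denoiser (assumed interface:
    noisy latent + noise level in, cleaner estimate out). Here it just
    smooths the signal a little for illustration."""
    kernel = np.array([0.25, 0.5, 0.25])
    return np.convolve(latent, kernel, mode="same")

def partial_diffusion_decode(received_latent, num_steps=5, start_noise=0.2):
    """Run only a few late reverse-diffusion steps, treating quantization
    error as the 'noise' to remove. Schematic update rule only."""
    x = received_latent.copy()
    for i in range(num_steps):
        noise_level = start_noise * (1 - i / num_steps)
        x_hat = denoiser(x, noise_level)
        x = x + 0.5 * (x_hat - x)     # move part-way toward the denoised estimate
    return x

if __name__ == "__main__":
    clean = np.sin(np.linspace(0, 3 * np.pi, 64))
    received = quantize(clean)
    restored = partial_diffusion_decode(received)
    print(np.abs(received - clean).mean(), np.abs(restored - clean).mean())
```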
+
+
+
+
+ + ☆ IDD-X: A Multi-View Dataset for Ego-relative Important Object + Localization and Explanation in Dense and Unstructured Traffic ICRA 2024 + + +
+ Intelligent vehicle systems require a deep understanding of the interplay +between road conditions, surrounding entities, and the ego vehicle's driving +behavior for safe and efficient navigation. This is particularly critical in +developing countries where traffic situations are often dense and unstructured +with heterogeneous road occupants. Existing datasets, predominantly geared +towards structured and sparse traffic scenarios, fall short of capturing the +complexity of driving in such environments. To fill this gap, we present IDD-X, +a large-scale dual-view driving video dataset. With 697K bounding boxes, 9K +important object tracks, and 1-12 objects per video, IDD-X offers comprehensive +ego-relative annotations for multiple important road objects covering 10 +categories and 19 explanation label categories. The dataset also incorporates +rearview information to provide a more complete representation of the driving +environment. We also introduce custom-designed deep networks aimed at multiple +important object localization and per-object explanation prediction. Overall, +our dataset and introduced prediction models form the foundation for studying +how road conditions and surrounding entities affect driving behavior in complex +traffic situations. + +
+
+ comment: Accepted at ICRA 2024 +
+
+
+
+
+ + ☆ Scalability in Building Component Data Annotation: Enhancing Facade + Material Classification with Synthetic Data + + +
+ Computer vision models trained on Google Street View images can create +material cadastres. However, current approaches need manually annotated +datasets that are difficult to obtain and often have class imbalance. To +address these challenges, this paper fine-tuned a Swin Transformer model on a +synthetic dataset generated with DALL-E and compared the performance to a +similar manually annotated dataset. Although manual annotation remains the gold +standard, the synthetic dataset performance demonstrates a reasonable +alternative. The findings will ease annotation needed to develop material +cadastres, offering architects insights into opportunities for material reuse, +thus contributing to the reduction of demolition waste. + +
+
+ comment: 10 pages, 6 figures, submitted to 2024 European Conference of + Computing in Construction +
+
+
+
+
+ + ☆ Benchmarking the Cell Image Segmentation Models Robustness under the + Microscope Optical Aberrations + + +
+ Cell segmentation is essential in biomedical research for analyzing cellular +morphology and behavior. Deep learning methods, particularly convolutional +neural networks (CNNs), have revolutionized cell segmentation by extracting +intricate features from images. However, the robustness of these methods under +microscope optical aberrations remains a critical challenge. This study +comprehensively evaluates the performance of cell instance segmentation models +under simulated aberration conditions using the DynamicNuclearNet (DNN) and +LIVECell datasets. Aberrations, including Astigmatism, Coma, Spherical, and +Trefoil, were simulated using Zernike polynomial equations. Various +segmentation models, such as Mask R-CNN with different network heads (FPN, C3) +and backbones (ResNet, VGG19, SwinS), were trained and tested under aberrated +conditions. Results indicate that FPN combined with SwinS demonstrates superior +robustness in handling simple cell images affected by minor aberrations. +Conversely, Cellpose2.0 proves effective for complex cell images under similar +conditions. Our findings provide insights into selecting appropriate +segmentation models based on cell morphology and aberration severity, enhancing +the reliability of cell segmentation in biomedical applications. Further +research is warranted to validate these methods with diverse aberration types +and emerging segmentation models. Overall, this research aims to guide +researchers in effectively utilizing cell segmentation models in the presence +of minor optical aberrations. + +
+
+
+
+
+ + ☆ Analyzing Decades-Long Environmental Changes in Namibia Using Archival + Aerial Photography and Deep Learning + + +
+ This study explores object detection in historical aerial photographs of +Namibia to identify long-term environmental changes. Specifically, we aim to +identify key objects -- \textit{Waterholes}, \textit{Omuti homesteads}, and +\textit{Big trees} -- around Oshikango in Namibia using sub-meter gray-scale +aerial imagery from 1943 and 1972. In this work, we propose a workflow for +analyzing historical aerial imagery using a deep semantic segmentation model on +sparse hand-labels. To this end, we employ a number of strategies including +class-weighting, pseudo-labeling and empirical p-value-based filtering to +balance skewed and sparse representations of objects in the ground truth data. +Results demonstrate the benefits of these different training strategies +resulting in an average $F_1=0.661$ and $F_1=0.755$ over the three objects of +interest for the 1943 and 1972 imagery, respectively. We also identified that +the average size of Waterhole and Big trees increased while the average size of +Omutis decreased between 1943 and 1972 reflecting some of the local effects of +the massive post-Second World War economic, agricultural, demographic, and +environmental changes. This work also highlights the untapped potential of +historical aerial photographs in understanding long-term environmental changes +beyond Namibia (and Africa). With the lack of adequate satellite technology in +the past, archival aerial photography offers a great alternative to uncover +decades-long environmental changes. + +
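+ One of the balancing strategies mentioned above is class weighting for the sparse hand-labels. A small sketch of inverse-frequency class weights fed to a weighted cross-entropy loss is shown below; this is a common convention, not necessarily the exact weighting used in the paper.

```python
import torch
import torch.nn as nn

def inverse_frequency_weights(labels, num_classes):
    """Per-class weights inversely proportional to pixel frequency,
    normalized so the weights average to 1."""
    counts = torch.bincount(labels.flatten(), minlength=num_classes).float()
    return counts.sum() / (num_classes * counts.clamp(min=1))

if __name__ == "__main__":
    # 4 hypothetical classes: background dominates, the objects of interest are rare
    labels = torch.randint(0, 2, (1, 64, 64))          # mostly classes 0/1
    labels[0, :2, :2] = 2; labels[0, 10, 10] = 3       # a few rare-class pixels
    w = inverse_frequency_weights(labels, num_classes=4)
    criterion = nn.CrossEntropyLoss(weight=w)
    logits = torch.randn(1, 4, 64, 64)
    print(w, criterion(logits, labels))
```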
+
+
+
+
+ + ☆ On the Robustness of Language Guidance for Low-Level Vision Tasks: + Findings from Depth Estimation CVPR 2024 + + +
+ Recent advances in monocular depth estimation have been made by incorporating +natural language as additional guidance. Although yielding impressive results, +the impact of the language prior, particularly in terms of generalization and +robustness, remains unexplored. In this paper, we address this gap by +quantifying the impact of this prior and introduce methods to benchmark its +effectiveness across various settings. We generate "low-level" sentences that +convey object-centric, three-dimensional spatial relationships, incorporate +them as additional language priors and evaluate their downstream impact on +depth estimation. Our key finding is that current language-guided depth +estimators perform optimally only with scene-level descriptions and +counter-intuitively fare worse with low level descriptions. Despite leveraging +additional data, these methods are not robust to directed adversarial attacks +and decline in performance with an increase in distribution shift. Finally, to +provide a foundation for future research, we identify points of failures and +offer insights to better understand these shortcomings. With an increasing +number of methods using language for depth estimation, our findings highlight +the opportunities and pitfalls that require careful consideration for effective +deployment in real-world settings + +
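+ The "low-level" sentences described above convey object-centric spatial relationships. A tiny sketch of generating such sentences from 2D box centers and relative depths is given below; the templates and the input format are assumptions for illustration.

```python
def spatial_sentences(objects):
    """objects: list of (name, cx, cy, depth) with image-plane centers and a
    relative depth value. Emits simple pairwise low-level spatial descriptions."""
    sentences = []
    for i, (name_a, xa, ya, da) in enumerate(objects):
        for name_b, xb, yb, db in objects[i + 1:]:
            horiz = "to the left of" if xa < xb else "to the right of"
            depth = "in front of" if da < db else "behind"
            sentences.append(f"The {name_a} is {horiz} the {name_b}.")
            sentences.append(f"The {name_a} is {depth} the {name_b}.")
    return sentences

if __name__ == "__main__":
    scene = [("chair", 100, 220, 2.1), ("table", 300, 210, 3.4)]
    for s in spatial_sentences(scene):
        print(s)
```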
+
+ comment: Accepted to CVPR 2024. Project webpage: + https://agneetchatterjee.com/robustness_depth_lang/ +
+
+
+
+
+ + ☆ Generalized Contrastive Learning for Multi-Modal Retrieval and Ranking + + +
+ Contrastive learning has gained widespread adoption for retrieval tasks due +to its minimal requirement for manual annotations. However, popular contrastive +frameworks typically learn from binary relevance, making them ineffective at +incorporating direct fine-grained rankings. In this paper, we curate a +large-scale dataset featuring detailed relevance scores for each query-document +pair to facilitate future research and evaluation. Subsequently, we propose +Generalized Contrastive Learning for Multi-Modal Retrieval and Ranking (GCL), +which is designed to learn from fine-grained rankings beyond binary relevance +scores. Our results show that GCL achieves a 94.5% increase in NDCG@10 for +in-domain and 26.3 to 48.8% increases for cold-start evaluations, all relative +to the CLIP baseline and involving ground truth rankings. + +
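+ The results above are reported as NDCG@10 over graded relevance scores. For reference, a small NDCG@k implementation following one standard definition (linear gain, log2 discount) is sketched below; it is generic, not specific to GCL.

```python
import numpy as np

def dcg_at_k(relevances, k):
    rel = np.asarray(relevances, dtype=float)[:k]
    discounts = np.log2(np.arange(2, rel.size + 2))
    return float(np.sum(rel / discounts))

def ndcg_at_k(ranked_relevances, k=10):
    """NDCG@k for one query: DCG of the ranking divided by the ideal DCG."""
    idcg = dcg_at_k(sorted(ranked_relevances, reverse=True), k)
    return dcg_at_k(ranked_relevances, k) / idcg if idcg > 0 else 0.0

if __name__ == "__main__":
    # graded relevance of retrieved documents, top-ranked first
    print(round(ndcg_at_k([3, 2, 0, 1, 2], k=10), 4))
```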
+
+
+
+
+ + ☆ Text Prompt with Normality Guidance for Weakly Supervised Video Anomaly + Detection CVPR2024 + + +
+ Weakly supervised video anomaly detection (WSVAD) is a challenging task. Generating fine-grained pseudo-labels based on weak labels and then self-training a classifier is currently a promising solution. However, existing methods use only the RGB visual modality and neglect category text information, which limits the generation of more accurate pseudo-labels and affects the performance of self-training. Inspired by the manual labeling process based on event descriptions, in this paper we propose a novel pseudo-label generation and self-training framework based on Text Prompt with Normality Guidance (TPWNG) for WSVAD. Our idea is to transfer the rich language-visual knowledge of the contrastive language-image pre-training (CLIP) model to align video event description text with the corresponding video frames and thereby generate pseudo-labels. Specifically, we first fine-tune CLIP for domain adaptation by designing two ranking losses and a distributional inconsistency loss. Further, we propose a learnable text prompt mechanism with the assistance of a normality visual prompt to further improve the matching accuracy between video event description text and video frames. Then, we design a pseudo-label generation module based on normality guidance to infer reliable frame-level pseudo-labels. Finally, we introduce a temporal context self-adaptive learning module to learn the temporal dependencies of different video events more flexibly and accurately. Extensive experiments show that our method achieves state-of-the-art performance on two benchmark datasets, UCF-Crime and XD-Violence.
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ Masked Image Modeling as a Framework for Self-Supervised Learning across + Eye Movements + + +
+ To make sense of their surroundings, intelligent systems must transform complex sensory inputs to structured codes that are reduced to task-relevant information such as object category. Biological agents achieve this in a largely autonomous manner, presumably via self-supervised learning. Whereas previous attempts to model the underlying mechanisms were largely discriminative in nature, there is ample evidence that the brain employs a generative model of the world. Here, we propose that eye movements, in combination with the focused nature of primate vision, constitute a generative, self-supervised task of predicting and revealing visual information. We construct a proof-of-principle model starting from the framework of masked image modeling (MIM), a common approach in deep representation learning. To do so, we analyze how core components of MIM such as masking technique and data augmentation influence the formation of category-specific representations. This allows us not only to better understand the principles behind MIM, but to then reassemble a MIM more in line with the focused nature of biological perception. From a theoretical angle, we find that MIM disentangles neurons in latent space, a property that has been suggested to structure visual representations in primates, without explicit regulation. Together with previous findings of invariance learning, this highlights an interesting connection of MIM to latent regularization approaches for self-supervised learning. The source code is available under https://github.com/RobinWeiler/FocusMIM
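+ As a concrete reference for the masking component discussed above, the snippet below sketches plain random patch masking of an image batch; the patch size and mask ratio are arbitrary illustrative choices, not the values studied in the paper.

```python
import torch

def random_patch_mask(images, patch_size=16, mask_ratio=0.75):
    """Zero out a random subset of non-overlapping patches.
    Returns the masked images and the boolean per-patch mask."""
    b, c, h, w = images.shape
    ph, pw = h // patch_size, w // patch_size
    num_patches = ph * pw
    num_masked = int(mask_ratio * num_patches)
    masked_idx = torch.rand(b, num_patches).argsort(dim=1)[:, :num_masked]
    mask = torch.zeros(b, num_patches, dtype=torch.bool)
    mask[torch.arange(b).unsqueeze(1), masked_idx] = True
    mask_2d = mask.view(b, 1, ph, pw).repeat_interleave(patch_size, 2) \
                                     .repeat_interleave(patch_size, 3)
    return images.masked_fill(mask_2d, 0.0), mask

if __name__ == "__main__":
    imgs = torch.randn(2, 3, 224, 224)
    masked, mask = random_patch_mask(imgs)
    print(masked.shape, mask.float().mean().item())  # ~0.75 of patches masked
```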
+
+
+
+
+ + ☆ ChatGPT and general-purpose AI count fruits in pictures surprisingly + well + + +
+ Object counting is a popular task in deep learning applications in various domains, including agriculture. A conventional deep learning approach requires a large amount of training data, which is often a logistical problem in real-world applications. To address this issue, we examined how well ChatGPT (GPT4V) and a general-purpose AI (a foundation model for object counting, T-Rex) can count the number of fruit bodies (coffee cherries) in 100 images. The foundation model with few-shot learning outperformed the trained YOLOv8 model (R2 = 0.923 and 0.900, respectively). ChatGPT also showed some interesting potential, especially when few-shot learning with human feedback was applied (R2 = 0.360 and 0.460, respectively). Moreover, we examined the time required for implementation as a practical question. Obtaining results with the foundation model and ChatGPT took much less time than with the YOLOv8 model (0.83 hrs, 1.75 hrs, and 161 hrs, respectively). We interpret these results as two surprises for deep learning users in applied domains: a foundation model with few-shot domain-specific learning can drastically save time and effort compared to the conventional approach, and ChatGPT can deliver relatively good performance. Both approaches do not require coding skills, which can foster AI education and dissemination.
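+ The comparisons above are reported as R2 between predicted and true counts. For clarity, a coefficient-of-determination helper using the standard formula is sketched below; the example counts are made up.

```python
import numpy as np

def r_squared(y_true, y_pred):
    """Coefficient of determination: 1 - SS_res / SS_tot."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - y_true.mean()) ** 2)
    return 1.0 - ss_res / ss_tot

if __name__ == "__main__":
    true_counts = [12, 30, 7, 55, 21]   # hypothetical cherry counts per image
    predicted = [14, 28, 9, 50, 25]
    print(round(r_squared(true_counts, predicted), 3))
```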
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ ☆ NIR-Assisted Image Denoising: A Selective Fusion Approach and A Real-World Benchmark Dataset
+ Despite the significant progress in image denoising, it is still challenging +to restore fine-scale details while removing noise, especially in extremely +low-light environments. Leveraging near-infrared (NIR) images to assist visible +RGB image denoising shows the potential to address this issue, becoming a +promising technology. Nonetheless, existing works still struggle with taking +advantage of NIR information effectively for real-world image denoising, due to +the content inconsistency between NIR-RGB images and the scarcity of real-world +paired datasets. To alleviate the problem, we propose an efficient Selective +Fusion Module (SFM), which can be plug-and-played into the advanced denoising +networks to merge the deep NIR-RGB features. Specifically, we sequentially +perform the global and local modulation for NIR and RGB features, and then +integrate the two modulated features. Furthermore, we present a Real-world +NIR-Assisted Image Denoising (Real-NAID) dataset, which covers diverse +scenarios as well as various noise levels. Extensive experiments on both +synthetic and our real-world datasets demonstrate that the proposed method +achieves better results than state-of-the-art ones. The dataset, codes, and +pre-trained models will be publicly available at +https://github.com/ronjonxu/NAID. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ LaSagnA: Language-based Segmentation Assistant for Complex Queries + + +
+ Recent advancements have empowered Large Language Models for Vision (vLLMs) +to generate detailed perceptual outcomes, including bounding boxes and masks. +Nonetheless, there are two constraints that restrict the further application of +these vLLMs: the incapability of handling multiple targets per query and the +failure to identify the absence of query objects in the image. In this study, +we acknowledge that the main cause of these problems is the insufficient +complexity of training queries. Consequently, we define the general sequence +format for complex queries. Then we incorporate a semantic segmentation task in +the current pipeline to fulfill the requirements of training data. Furthermore, +we present three novel strategies to effectively handle the challenges arising +from the direct integration of the proposed format. The effectiveness of our +model in processing complex queries is validated by the comparable results with +conventional methods on both close-set and open-set semantic segmentation +datasets. Additionally, we outperform a series of vLLMs in reasoning and +referring segmentation, showcasing our model's remarkable capabilities. We +release the code at https://github.com/congvvc/LaSagnA. + +
+
+
+
+
+ + ☆ 3D Human Scan With A Moving Event Camera + + +
+ Capturing the 3D human body is one of the important tasks in computer vision, with a wide range of applications such as virtual reality and sports analysis. However, conventional frame cameras are limited by their temporal resolution and dynamic range, which imposes constraints in real-world application setups. Event cameras have the advantages of high temporal resolution and high dynamic range (HDR), but the development of event-based methods is necessary to handle data with different characteristics. This paper proposes a novel event-based method for 3D pose estimation and human mesh recovery. Prior work on event-based human mesh recovery requires frames (images) as well as event data. The proposed method solely relies on events; it carves 3D voxels by moving the event camera around a stationary body, reconstructs the human pose and mesh via attenuated rays, and fits statistical body models, preserving high-frequency details. The experimental results show that the proposed method outperforms conventional frame-based methods in the estimation accuracy of both pose and body mesh. We also demonstrate results in challenging situations where a conventional camera has motion blur. This is the first work to demonstrate event-only human mesh recovery, and we hope that it is a first step toward achieving robust and accurate 3D human body scanning from vision sensors.
+
+
+
+
+ + ☆ SpectralMamba: Efficient Mamba for Hyperspectral Image Classification + + +
+ Recurrent neural networks and Transformers have recently dominated most applications in hyperspectral (HS) imaging, owing to their capability to capture long-range dependencies from spectrum sequences. However, despite the success of these sequential architectures, their non-negligible inefficiency, caused by either difficulty in parallelization or computationally prohibitive attention, still hinders their practicality, especially for large-scale observation in remote sensing scenarios. To address this issue, we herein propose SpectralMamba -- a novel, efficient deep learning framework for HS image classification that incorporates a state space model. SpectralMamba features simplified but adequate modeling of HS data dynamics at two levels. First, in spatial-spectral space, a dynamical mask is learned by efficient convolutions to simultaneously encode spatial regularity and spectral peculiarity, thus attenuating spectral variability and confusion in discriminative representation learning. Second, the merged spectrum can then be efficiently operated on in the hidden state space, with all parameters learned input-dependently, yielding selectively focused responses without reliance on redundant attention or non-parallelizable recurrence. To explore the room for further computational downsizing, a piece-wise scanning mechanism is employed in between, transferring the approximately continuous spectrum into sequences of squeezed length while maintaining short- and long-term contextual profiles among hundreds of bands. Through extensive experiments on four benchmark HS datasets acquired by satellite-, aircraft-, and UAV-borne imagers, SpectralMamba surprisingly creates promising win-wins from both the performance and efficiency perspectives.
+
+
+
+
+ + ☆ New Efficient Visual OILU Markers + + +
+ Basic patterns are the source of a wide range of more or less complex geometric structures. We exploit such patterns to develop new efficient visual markers. Besides being projective invariants, the proposed markers allow producing a rich panel of unique identifiers, which is highly desirable for resource-intensive navigation and augmented reality applications. The spiral topology of our markers permits the validation of an accurate identification scheme, which is based on level-set methods. The robustness of the markers against acquisition and geometric distortions is validated by extensive experimental tests.
+
+
+
+
+ + ☆ MoE-FFD: Mixture of Experts for Generalized and Parameter-Efficient Face + Forgery Detection + + +
+ Deepfakes have recently raised significant trust issues and security concerns +among the public. Compared to CNN face forgery detectors, ViT-based methods +take advantage of the expressivity of transformers, achieving superior +detection performance. However, these approaches still exhibit the following +limitations: (1). Fully fine-tuning ViT-based models from ImageNet weights +demands substantial computational and storage resources; (2). ViT-based methods +struggle to capture local forgery clues, leading to model bias and limited +generalizability. To tackle these challenges, this work introduces +Mixture-of-Experts modules for Face Forgery Detection (MoE-FFD), a generalized +yet parameter-efficient ViT-based approach. MoE-FFD only updates lightweight +Low-Rank Adaptation (LoRA) and Adapter layers while keeping the ViT backbone +frozen, thereby achieving parameter-efficient training. Moreover, MoE-FFD +leverages the expressivity of transformers and local priors of CNNs to +simultaneously extract global and local forgery clues. Additionally, novel MoE +modules are designed to scale the model's capacity and select optimal forgery +experts, further enhancing forgery detection performance. The proposed MoE +learning scheme can be seamlessly adapted to various transformer backbones in a +plug-and-play manner. Extensive experimental results demonstrate that the +proposed method achieves state-of-the-art face forgery detection performance +with reduced parameter overhead. The code will be released upon acceptance. + +
+
+
+
+
+ + ☆ Joint Physical-Digital Facial Attack Detection Via Simulating Spoofing + Clues CVPR + + +
+ Face recognition systems are frequently subjected to a variety of physical +and digital attacks of different types. Previous methods have achieved +satisfactory performance in scenarios that address physical attacks and digital +attacks, respectively. However, few methods are considered to integrate a model +that simultaneously addresses both physical and digital attacks, implying the +necessity to develop and maintain multiple models. To jointly detect physical +and digital attacks within a single model, we propose an innovative approach +that can adapt to any network architecture. Our approach mainly contains two +types of data augmentation, which we call Simulated Physical Spoofing Clues +augmentation (SPSC) and Simulated Digital Spoofing Clues augmentation (SDSC). +SPSC and SDSC augment live samples into simulated attack samples by simulating +spoofing clues of physical and digital attacks, respectively, which +significantly improve the capability of the model to detect "unseen" attack +types. Extensive experiments show that SPSC and SDSC can achieve +state-of-the-art generalization in Protocols 2.1 and 2.2 of the UniAttackData +dataset, respectively. Our method won first place in "Unified Physical-Digital +Face Attack Detection" of the 5th Face Anti-spoofing Challenge@CVPR2024. Our +final submission obtains 3.75% APCER, 0.93% BPCER, and 2.34% ACER, +respectively. Our code is available at +https://github.com/Xianhua-He/cvpr2024-face-anti-spoofing-challenge. + +
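+ SPSC/SDSC turn live samples into simulated attack samples by adding spoofing clues. As a loose illustration only (not the paper's augmentations), the snippet below fakes two common clue types: a halftone-like pattern standing in for a physical print/replay clue, and block averaging standing in for a digital re-compression clue.

```python
import numpy as np

def add_print_clue(image, strength=0.08, period=4):
    """Overlay a halftone-like periodic pattern: a crude stand-in for a
    simulated *physical* spoofing clue (print/replay artifacts)."""
    h, w = image.shape[:2]
    yy, xx = np.mgrid[0:h, 0:w]
    pattern = np.sin(2 * np.pi * xx / period) * np.sin(2 * np.pi * yy / period)
    return np.clip(image + strength * pattern[..., None], 0.0, 1.0)

def add_digital_clue(image, block=8):
    """Average over blocks to mimic heavy re-compression blockiness: a crude
    stand-in for a simulated *digital* spoofing clue."""
    h, w, c = image.shape
    out = image[: h - h % block, : w - w % block]
    out = out.reshape(h // block, block, w // block, block, c).mean(axis=(1, 3))
    return np.repeat(np.repeat(out, block, axis=0), block, axis=1)

if __name__ == "__main__":
    live = np.random.default_rng(0).random((64, 64, 3))   # fake "live" face crop
    print(add_print_clue(live).shape, add_digital_clue(live).shape)
```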
+
+ comment: 10 pages with 6 figures, Accepted by CVPRW 2024 +
+
+
+
+
+ + ☆ OccGaussian: 3D Gaussian Splatting for Occluded Human Rendering + + +
+ Rendering dynamic 3D humans from monocular videos is crucial for various applications such as virtual reality and digital entertainment. Most methods assume that the person is in an unobstructed scene, while various objects may cause the occlusion of body parts in real-life scenarios. Previous methods utilize NeRF for surface rendering to recover the occluded areas, but they require more than one day to train and several seconds to render, failing to meet the requirements of real-time interactive applications. To address these issues, we propose OccGaussian, based on 3D Gaussian Splatting, which can be trained within 6 minutes and produces high-quality human renderings up to 160 FPS with occluded input. OccGaussian initializes 3D Gaussian distributions in the canonical space, and we perform occlusion feature queries at occluded regions; the aggregated pixel-aligned feature is extracted to compensate for the missing information. Then we use a Gaussian Feature MLP to further process the feature, along with occlusion-aware loss functions, to better perceive the occluded area. Extensive experiments, in both simulated and real-world occlusions, demonstrate that our method achieves comparable or even superior performance compared to the state-of-the-art method, while improving training and inference speeds by 250x and 800x, respectively. Our code will be available for research purposes.
+
+ comment: 12 April, 2024; originally announced April 2024 +
+
+
+
+
+ + ☆ MSSTNet: A Multi-Scale Spatio-Temporal CNN-Transformer Network for + Dynamic Facial Expression Recognition ICASSP 2024 + + +
+ Unlike typical video action recognition, Dynamic Facial Expression +Recognition (DFER) does not involve distinct moving targets but relies on +localized changes in facial muscles. Addressing this distinctive attribute, we +propose a Multi-Scale Spatio-temporal CNN-Transformer network (MSSTNet). Our +approach takes spatial features of different scales extracted by CNN and feeds +them into a Multi-scale Embedding Layer (MELayer). The MELayer extracts +multi-scale spatial information and encodes these features before sending them +into a Temporal Transformer (T-Former). The T-Former simultaneously extracts +temporal information while continually integrating multi-scale spatial +information. This process culminates in the generation of multi-scale +spatio-temporal features that are utilized for the final classification. Our +method achieves state-of-the-art results on two in-the-wild datasets. +Furthermore, a series of ablation experiments and visualizations provide +further validation of our approach's proficiency in leveraging spatio-temporal +information within DFER. + +
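A toy sketch of the general multi-scale CNN plus temporal Transformer pattern described above; the layer sizes, global pooling, and simple sum fusion are assumptions for illustration, not the actual MELayer and T-Former.

```python
import torch
import torch.nn as nn

class MultiScaleSpatioTemporal(nn.Module):
    """Per-frame multi-scale CNN features -> shared embedding -> temporal Transformer."""
    def __init__(self, dim=128, num_classes=7):
        super().__init__()
        self.stem = nn.Sequential(nn.Conv2d(3, 32, 3, stride=2, padding=1), nn.ReLU())
        self.stage2 = nn.Sequential(nn.Conv2d(32, 64, 3, stride=2, padding=1), nn.ReLU())
        self.stage3 = nn.Sequential(nn.Conv2d(64, 128, 3, stride=2, padding=1), nn.ReLU())
        # one projection per spatial scale (a stand-in for the multi-scale embedding layer)
        self.proj = nn.ModuleList([nn.Linear(c, dim) for c in (32, 64, 128)])
        enc = nn.TransformerEncoderLayer(d_model=dim, nhead=4, batch_first=True)
        self.temporal = nn.TransformerEncoder(enc, num_layers=2)
        self.head = nn.Linear(dim, num_classes)

    def forward(self, video):                       # video: (B, T, 3, H, W)
        B, T = video.shape[:2]
        x = video.flatten(0, 1)                     # fold time into the batch for the CNN
        f1 = self.stem(x)
        f2 = self.stage2(f1)
        f3 = self.stage3(f2)
        feats = [proj(f.mean(dim=(2, 3))) for f, proj in zip((f1, f2, f3), self.proj)]
        tokens = torch.stack(feats, 0).sum(0).view(B, T, -1)   # one fused token per frame
        out = self.temporal(tokens)                 # model temporal dynamics across frames
        return self.head(out.mean(dim=1))

logits = MultiScaleSpatioTemporal()(torch.randn(2, 8, 3, 112, 112))
```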
+
+ comment: Accepted to 2024 IEEE International Conference on Acoustics, Speech, + and Signal Processing (ICASSP 2024) +
+
+
+
+
+ + ☆ Adapting the Segment Anything Model During Usage in Novel Situations + + +
+ The interactive segmentation task consists in the creation of object segmentation masks based on user interactions. The most common way to guide a model towards producing a correct segmentation is through clicks on the object and the background. The recently published Segment Anything Model (SAM) supports a generalized version of the interactive segmentation problem and has been trained on an object segmentation dataset which contains 1.1B masks. Though trained extensively and with the explicit purpose of serving as a foundation model, we show significant limitations of SAM when it is applied to interactive segmentation on novel domains or object types. On the used datasets, SAM displays a failure rate $\text{FR}_{30}@90$ of up to $72.6 \%$. Since we still want such foundation models to be immediately applicable, we present a framework that can adapt SAM during immediate usage. For this, we leverage the user interactions and masks which are constructed during the interactive segmentation process. We use this information to generate pseudo-labels, which we use to compute a loss function and optimize a part of the SAM model. The presented method causes a relative reduction of up to $48.1 \%$ in the $\text{FR}_{20}@85$ and $46.6 \%$ in the $\text{FR}_{30}@90$ metrics. + +
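The core idea of adapting during usage can be sketched as a small test-time loop in which a user-accepted mask serves as a pseudo-label; `model`, `click_prompts`, and `adaptable_params` below are generic placeholders and not SAM's real interface.

```python
import torch
import torch.nn.functional as F

def adapt_during_usage(model, image_embedding, click_prompts, accepted_mask,
                       adaptable_params, steps=5, lr=1e-4):
    """Hypothetical test-time adaptation loop: the mask accepted by the user becomes a
    pseudo-label, and only a small subset of parameters is optimized on it."""
    opt = torch.optim.Adam(adaptable_params, lr=lr)
    target = accepted_mask.float()
    for _ in range(steps):
        pred = model(image_embedding, click_prompts)   # placeholder forward pass, not SAM's API
        loss = F.binary_cross_entropy_with_logits(pred, target)
        opt.zero_grad()
        loss.backward()
        opt.step()
    return model
```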
+
+ comment: 11 pages, 2 figures, 4 tables +
+
+
+
+
+ + ☆ Direct May Not Be the Best: An Incremental Evolution View of Pose + Generation + + +
+ Pose diversity is an inherent representative characteristic of 2D images. Due to the 3D-to-2D projection mechanism, there is evident content discrepancy among distinct pose images. This is the main obstacle hindering pose-transformation-related research. To deal with this challenge, we propose a fine-grained, incremental-evolution-centered pose generation framework, rather than the traditional direct one-to-one generation in a single step. Since the proposed approach bypasses the theoretical difficulty of directly modeling dramatic non-linear variation, the incurred content distortion and blurring can be effectively constrained, while the various individual pose details, especially clothes texture, can be precisely maintained. In order to systematically guide the evolution course, both global and incremental evolution constraints are elaborately designed and merged into the overall framework. A novel triple-path knowledge fusion structure is designed to take full advantage of all available valuable knowledge to conduct high-quality pose synthesis. In addition, our framework can generate a series of valuable byproducts, namely the various intermediate poses. Extensive experiments have been conducted to verify the effectiveness of the proposed approach. Code is available at https://github.com/Xiaofei-CN/Incremental-Evolution-Pose-Generation. + +
+
+
+
+
+ + ☆ MambaDFuse: A Mamba-based Dual-phase Model for Multi-modality Image + Fusion + + +
+ Multi-modality image fusion (MMIF) aims to integrate complementary information from different modalities into a single fused image that comprehensively represents the imaging scene and facilitates downstream visual tasks. In recent years, significant progress has been made in MMIF tasks due to advances in deep neural networks. However, existing methods cannot effectively and efficiently extract modality-specific and modality-fused features, constrained by the inherent local inductive bias of CNNs or the quadratic computational complexity of Transformers. To overcome this issue, we propose a Mamba-based Dual-phase Fusion (MambaDFuse) model. First, a dual-level feature extractor is designed to capture long-range features from single-modality images by extracting low- and high-level features from CNN and Mamba blocks. Then, a dual-phase feature fusion module is proposed to obtain fusion features that combine complementary information from different modalities. It uses the channel exchange method for shallow fusion and the enhanced Multi-modal Mamba (M3) blocks for deep fusion. Finally, the fused image reconstruction module utilizes the inverse transformation of the feature extraction to generate the fused result. Through extensive experiments, our approach achieves promising fusion results in infrared-visible image fusion and medical image fusion. Additionally, in a unified benchmark, MambaDFuse has also demonstrated improved performance in downstream tasks such as object detection. Code with checkpoints will be available after the peer-review process. + +
+
+
+
+
+ + ☆ No Bells, Just Whistles: Sports Field Registration by Leveraging + Geometric Properties CVPR + + +
+ Broadcast sports field registration is traditionally addressed as a homography estimation task, mapping the visible image area to a planar field model, predominantly focusing on the main camera shot. Addressing the shortcomings of previous approaches, we propose a novel calibration pipeline enabling camera calibration using a 3D soccer field model and extending the process to assess the multiple-view nature of broadcast videos. Our approach begins with a keypoint generation pipeline derived from SoccerNet dataset annotations, leveraging the geometric properties of the court. Subsequently, we execute classical camera calibration through the DLT algorithm in a minimalist fashion, without further refinement. Through extensive experimentation on real-world soccer broadcast datasets such as SoccerNet-Calibration, WorldCup 2014 and TS-WorldCup, our method demonstrates superior performance in both multiple- and single-view 3D camera calibration while maintaining competitive results in homography estimation compared to state-of-the-art techniques. + +
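For readers unfamiliar with the estimation step, below is a minimal NumPy implementation of the classical DLT for homographies, without Hartley normalization or refinement, in the "minimalist fashion" mentioned above; the keypoint coordinates are made up.

```python
import numpy as np

def dlt_homography(src, dst):
    """Direct Linear Transform: estimate the 3x3 homography H mapping src -> dst
    from at least 4 point correspondences, via the SVD null space of the design matrix."""
    assert len(src) >= 4 and len(src) == len(dst)
    A = []
    for (x, y), (u, v) in zip(src, dst):
        A.append([-x, -y, -1, 0, 0, 0, u * x, u * y, u])
        A.append([0, 0, 0, -x, -y, -1, v * x, v * y, v])
    _, _, vt = np.linalg.svd(np.asarray(A, dtype=float))
    H = vt[-1].reshape(3, 3)
    return H / H[2, 2]

# map four synthetic image keypoints to template coordinates (e.g. pitch corners in metres)
src = [(10, 20), (200, 30), (220, 180), (15, 160)]
dst = [(0, 0), (105, 0), (105, 68), (0, 68)]
H = dlt_homography(src, dst)
```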
+
+ comment: Accepted in CVPRW 2024 +
+
+
+
+
+ + ☆ Mitigating Challenges of the Space Environment for Onboard Artificial + Intelligence: Design Overview of the Imaging Payload on SpIRIT CVPR 2024 + + +
+ Artificial intelligence (AI) and autonomous edge computing in space are +emerging areas of interest to augment capabilities of nanosatellites, where +modern sensors generate orders of magnitude more data than can typically be +transmitted to mission control. Here, we present the hardware and software +design of an onboard AI subsystem hosted on SpIRIT. The system is optimised for +on-board computer vision experiments based on visible light and long wave +infrared cameras. This paper highlights the key design choices made to maximise +the robustness of the system in harsh space conditions, and their motivation +relative to key mission requirements, such as limited compute resources, +resilience to cosmic radiation, extreme temperature variations, distribution +shifts, and very low transmission bandwidths. The payload, called Loris, +consists of six visible light cameras, three infrared cameras, a camera control +board and a Graphics Processing Unit (GPU) system-on-module. Loris enables the +execution of AI models with on-orbit fine-tuning as well as a next-generation +image compression algorithm, including progressive coding. This innovative +approach not only enhances the data processing capabilities of nanosatellites +but also lays the groundwork for broader applications to remote sensing from +space. + +
+
+ comment: AI4Space 2024, 3rd Workshop on AI for Space, CVPR 2024 +
+
+
+
+
+ + ☆ NC-TTT: A Noise Contrastive Approach for Test-Time Training + + +
+ Despite their exceptional performance in vision tasks, deep learning models often struggle when faced with domain shifts during testing. Test-Time Training (TTT) methods have recently gained popularity due to their ability to enhance the robustness of models through the addition of an auxiliary objective that is jointly optimized with the main task. Being strictly unsupervised, this auxiliary objective is used at test time to adapt the model without any access to labels. In this work, we propose Noise-Contrastive Test-Time Training (NC-TTT), a novel unsupervised TTT technique based on the discrimination of noisy feature maps. By learning to classify noisy views of projected feature maps, and then adapting the model accordingly on new domains, classification performance can be recovered by a significant margin. Experiments on several popular test-time adaptation baselines demonstrate the advantages of our method compared to recent approaches for this task. The code can be found at: https://github.com/GustavoVargasHakim/NCTTT.git + +
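A rough sketch of a noise-contrastive auxiliary objective and its use for test-time adaptation; the noise level, the small discriminator head, and the choice of adapting all encoder parameters are assumptions rather than the paper's exact configuration.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class NoiseDiscriminator(nn.Module):
    """Auxiliary head: predicts whether a feature map was corrupted with Gaussian noise."""
    def __init__(self, channels):
        super().__init__()
        self.net = nn.Sequential(nn.Conv2d(channels, 64, 3, padding=1), nn.ReLU(),
                                 nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(64, 1))

    def forward(self, feat):
        return self.net(feat)

def auxiliary_loss(encoder, discriminator, images, sigma=0.5):
    feats = encoder(images)                             # (B, C, H, W) feature maps
    noisy = feats + sigma * torch.randn_like(feats)     # "noisy view" of the same features
    logits = discriminator(torch.cat([feats, noisy]))
    labels = torch.cat([torch.zeros(len(feats), 1), torch.ones(len(feats), 1)]).to(logits.device)
    return F.binary_cross_entropy_with_logits(logits, labels)

def test_time_adapt(encoder, discriminator, test_batch, steps=3, lr=1e-4):
    """At test time only the unsupervised auxiliary objective is available."""
    opt = torch.optim.SGD(encoder.parameters(), lr=lr)
    for _ in range(steps):
        loss = auxiliary_loss(encoder, discriminator, test_batch)
        opt.zero_grad()
        loss.backward()
        opt.step()
    return encoder
```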
+
+
+
+
+ + ☆ Let It Flow: Simultaneous Optimization of 3D Flow and Object Clustering ECCV + + +
+ We study the problem of self-supervised 3D scene flow estimation from real large-scale raw point cloud sequences, which is crucial to various tasks like trajectory prediction or instance segmentation. In the absence of ground truth scene flow labels, contemporary approaches concentrate on optimizing flow across sequential pairs of point clouds by incorporating structure-based regularization on flow and object rigidity. The rigid objects are estimated by a variety of 3D spatial clustering methods. While state-of-the-art methods successfully capture overall scene motion using the Neural Prior structure, they encounter challenges in discerning multi-object motions. We identified the structural constraints and the use of large and strict rigid clusters as the main pitfalls of current approaches, and we propose a novel clustering approach that allows for a combination of overlapping soft clusters as well as non-overlapping rigid clusters. Flow is then jointly estimated with progressively growing non-overlapping rigid clusters together with fixed-size overlapping soft clusters. We evaluate our method on multiple datasets with LiDAR point clouds, demonstrating superior performance over the self-supervised baselines and reaching new state-of-the-art results. Our method especially excels in resolving flow in complicated dynamic scenes with multiple independently moving objects close to each other, including pedestrians, cyclists and other vulnerable road users. Our codes will be publicly available. + +
+
+ comment: ECCV submission +
+
+
+
+
+ + ☆ TDANet: Target-Directed Attention Network For Object-Goal Visual + Navigation With Zero-Shot Ability + + +
+ The generalization of the end-to-end deep reinforcement learning (DRL) for +object-goal visual navigation is a long-standing challenge since object classes +and placements vary in new test environments. Learning domain-independent +visual representation is critical for enabling the trained DRL agent with the +ability to generalize to unseen scenes and objects. In this letter, a +target-directed attention network (TDANet) is proposed to learn the end-to-end +object-goal visual navigation policy with zero-shot ability. TDANet features a +novel target attention (TA) module that learns both the spatial and semantic +relationships among objects to help TDANet focus on the most relevant observed +objects to the target. With the Siamese architecture (SA) design, TDANet +distinguishes the difference between the current and target states and +generates the domain-independent visual representation. To evaluate the +navigation performance of TDANet, extensive experiments are conducted in the +AI2-THOR embodied AI environment. The simulation results demonstrate a strong +generalization ability of TDANet to unseen scenes and target objects, with +higher navigation success rate (SR) and success weighted by length (SPL) than +other state-of-the-art models. + +
+
+
+
+
+ + ☆ OmniSat: Self-Supervised Modality Fusion for Earth Observation + + +
+ The field of Earth Observation (EO) offers a wealth of data from diverse sensors, presenting a great opportunity for advancing self-supervised multimodal learning. However, current multimodal EO datasets and models focus on a single data type, either mono-date images or time series, which limits their expressivity. We introduce OmniSat, a novel architecture that exploits the spatial alignment between multiple EO modalities to learn expressive multimodal representations without labels. To demonstrate the advantages of combining modalities of different natures, we augment two existing datasets with new modalities and evaluate on three downstream tasks: forestry, land cover classification, and crop mapping. OmniSat can learn rich representations in an unsupervised manner, leading to improved performance in the semi- and fully-supervised settings, even when only one modality is available for inference. The code and dataset are available at github.com/gastruc/OmniSat. + +
+
+
+
+
+ + ☆ Self-Supervised k-Space Regularization for Motion-Resolved Abdominal MRI + Using Neural Implicit k-Space Representation + + +
+ Neural implicit k-space representations have shown promising results for +dynamic MRI at high temporal resolutions. Yet, their exclusive training in +k-space limits the application of common image regularization methods to +improve the final reconstruction. In this work, we introduce the concept of +parallel imaging-inspired self-consistency (PISCO), which we incorporate as +novel self-supervised k-space regularization enforcing a consistent +neighborhood relationship. At no additional data cost, the proposed +regularization significantly improves neural implicit k-space reconstructions +on simulated data. Abdominal in-vivo reconstructions using PISCO result in +enhanced spatio-temporal image quality compared to state-of-the-art methods. +Code is available at https://github.com/vjspi/PISCO-NIK. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Learning to Rebalance Multi-Modal Optimization by Adaptively Masking + Subnetworks + + +
+ Multi-modal learning aims to enhance performance by unifying models from various modalities but often faces the "modality imbalance" problem in real data, leading to a bias towards dominant modalities and neglecting others, thereby limiting its overall effectiveness. To address this challenge, the core idea is to balance the optimization of each modality to achieve a joint optimum. Existing approaches often employ a modal-level control mechanism for adjusting the update of each modal parameter. However, such a global updating mechanism ignores the differing importance of each parameter. Inspired by subnetwork optimization, we explore a uniform sampling-based optimization strategy and find it more effective than global updating. Based on these findings, we further propose a novel importance sampling-based, element-wise joint optimization method, called Adaptively Mask Subnetworks Considering Modal Significance (AMSS). Specifically, we incorporate mutual information rates to determine the modal significance and employ non-uniform adaptive sampling to select foreground subnetworks from each modality for parameter updates, thereby rebalancing multi-modal learning. Additionally, we demonstrate the reliability of the AMSS strategy through convergence analysis. Building upon theoretical insights, we further enhance the multi-modal mask subnetwork strategy using unbiased estimation, referred to as AMSS+. Extensive experiments reveal the superiority of our approach over comparison methods. + +
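The subnetwork-masking idea can be sketched as a gradient mask applied after backpropagation; the per-modality keep probabilities stand in for the mutual-information-based importance scores of AMSS, and the parameter-name convention is hypothetical.

```python
import torch

def mask_subnetwork_gradients(named_params, keep_prob_per_modality):
    """After loss.backward(): keep gradients only for a randomly sampled subnetwork,
    with a per-modality sampling rate (a simplified stand-in for AMSS's importance sampling)."""
    for name, p in named_params:
        if p.grad is None:
            continue
        modality = "audio" if "audio" in name else "video"   # hypothetical naming convention
        keep = keep_prob_per_modality.get(modality, 1.0)
        mask = (torch.rand_like(p.grad) < keep).float()
        p.grad.mul_(mask)                                    # zero out gradients outside the subnetwork

# usage inside a training step (model assumed to name its parameters audio_* / video_*):
# loss.backward()
# mask_subnetwork_gradients(model.named_parameters(), {"audio": 0.9, "video": 0.4})
# optimizer.step()
```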
+
+ comment: 17 pages, 6 figures +
+
+
+
+
+ + ☆ Counterfactual Explanations for Face Forgery Detection via Adversarial + Removal of Artifacts ICME2024 + + +
+ Highly realistic AI-generated face forgeries known as deepfakes have raised serious social concerns. Although DNN-based face forgery detection models have achieved good performance, they are vulnerable to the latest generative methods, which leave fewer forgery traces, and to adversarial attacks. This limitation in generalization and robustness hinders the credibility of detection results and calls for more explanations. In this work, we provide counterfactual explanations for face forgery detection from an artifact removal perspective. Specifically, we first invert the forgery images into the StyleGAN latent space, and then adversarially optimize their latent representations with the discrimination supervision from the target detection model. We verify the effectiveness of the proposed explanations from two aspects: (1) Counterfactual Trace Visualization: the enhanced forgery images are useful to reveal artifacts by visually contrasting the original images and two different visualization methods; (2) Transferable Adversarial Attacks: the adversarial forgery images generated by attacking the detection model are able to mislead other detection models, implying the removed artifacts are general. Extensive experiments demonstrate that our method achieves over 90% attack success rate and superior attack transferability. Compared with naive adversarial noise methods, our method adopts both generative and discriminative model priors, and optimizes the latent representations in a synthesis-by-analysis way, which forces the search for counterfactual explanations onto the natural face manifold. Thus, more general counterfactual traces can be found and better adversarial attack transferability can be achieved. + +
+
+ comment: Accepted to ICME2024 +
+
+
+
+
+ + ☆ Emerging Property of Masked Token for Effective Pre-training + + +
+ Driven by the success of Masked Language Modeling (MLM), the realm of self-supervised learning for computer vision has been invigorated by the central role of Masked Image Modeling (MIM) in driving recent breakthroughs. Notwithstanding the achievements of MIM across various downstream tasks, its overall efficiency is occasionally hampered by the lengthy duration of the pre-training phase. This paper presents the perspective that the optimization of masked tokens is a means of addressing this prevailing issue. Initially, we delve into an exploration of the inherent properties that a masked token ought to possess. Among these properties, we principally dedicate ourselves to articulating and emphasizing the `data singularity' attribute inherent in masked tokens. Through a comprehensive analysis of the heterogeneity between masked tokens and visible tokens within pre-trained models, we propose a novel approach termed masked token optimization (MTO), specifically designed to improve model efficiency through weight recalibration and the enhancement of the key property of masked tokens. The proposed method serves as an adaptable solution that seamlessly integrates into any MIM approach that leverages masked tokens. As a result, MTO achieves a considerable improvement in pre-training efficiency, resulting in an approximately 50% reduction in the pre-training epochs required to attain the converged performance of recent approaches. + +
+
+
+
+
+ + ☆ Salience-Based Adaptive Masking: Revisiting Token Dynamics for Enhanced + Pre-training + + +
+ In this paper, we introduce Saliency-Based Adaptive Masking (SBAM), a novel +and cost-effective approach that significantly enhances the pre-training +performance of Masked Image Modeling (MIM) approaches by prioritizing token +salience. Our method provides robustness against variations in masking ratios, +effectively mitigating the performance instability issues common in existing +methods. This relaxes the sensitivity of MIM-based pre-training to masking +ratios, which in turn allows us to propose an adaptive strategy for `tailored' +masking ratios for each data sample, which no existing method can provide. +Toward this goal, we propose an Adaptive Masking Ratio (AMR) strategy that +dynamically adjusts the proportion of masking for the unique content of each +image based on token salience. We show that our method significantly improves +over the state-of-the-art in mask-based pre-training on the ImageNet-1K +dataset. + +
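A toy sketch of salience-driven masking with a per-image ratio; the token-norm salience and the concentration heuristic below are stand-ins for the paper's actual salience measure and AMR rule.

```python
import torch

def salience_adaptive_mask(tokens, min_ratio=0.4, max_ratio=0.8):
    """tokens: (B, N, D) patch embeddings. Salience here is the L2 norm of each token
    (a crude proxy); the masking ratio is adapted per image and salient tokens are
    masked preferentially. Returns a boolean mask where True means 'masked'."""
    B, N, _ = tokens.shape
    salience = tokens.norm(dim=-1)                                  # (B, N)
    salience = salience / salience.sum(dim=1, keepdim=True)
    # more concentrated salience -> push the ratio towards max_ratio (illustrative heuristic)
    concentration = (salience.max(dim=1).values * N).clamp(1.0, 2.0) - 1.0
    ratio = min_ratio + (max_ratio - min_ratio) * concentration     # (B,)
    num_mask = (ratio * N).long()
    order = salience.argsort(dim=1, descending=True)                # most salient first
    mask = torch.zeros(B, N, dtype=torch.bool)
    for b in range(B):
        mask[b, order[b, :int(num_mask[b])]] = True
    return mask

mask = salience_adaptive_mask(torch.randn(2, 196, 768))
```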
+
+
+
+
+ + ☆ GPN: Generative Point-based NeRF + + +
+ Scanning real-life scenes with modern registration devices typically gives incomplete point cloud representations, primarily due to the limitations of partial scanning, 3D occlusions, and dynamic light conditions. Recent works on processing incomplete point clouds have always focused on point cloud completion. However, these approaches do not ensure consistency between the completed point cloud and the captured images regarding color and geometry. We propose using Generative Point-based NeRF (GPN) to reconstruct and repair a partial cloud by fully utilizing the scanning images and the corresponding reconstructed cloud. The repaired point cloud can achieve multi-view consistency with the captured images at high spatial resolution. For fine-tuning on a single scene, we optimize the global latent condition by incorporating an Auto-Decoder architecture while retaining multi-view consistency. As a result, the generated point clouds are smooth, plausible, and geometrically consistent with the partial scanning images. Extensive experiments on ShapeNet demonstrate that our work achieves performance competitive with other state-of-the-art point-cloud-based neural scene rendering and editing methods. + +
+
+
+
+
+ + ☆ Interference Motion Removal for Doppler Radar Vital Sign Detection Using + Variational Encoder-Decoder Neural Network + + +
+ The treatment of interfering motion contributions remains one of the key challenges in the domain of radar-based vital sign monitoring. Removal of the interference to extract the vital sign contributions is demanding due to overlapping Doppler bands, the complex structure of the interference motions and significant variations in the power levels of their contributions. A novel approach to the removal of interference through the use of a probabilistic deep learning model is presented. Results show that a convolutional encoder-decoder neural network with a variational objective is capable of learning a meaningful representation space of the vital sign Doppler-time distribution, facilitating its extraction from a mixture signal. The approach is tested on semi-experimental data containing real vital sign signatures and simulated returns from interfering body motions. It is demonstrated that applying the proposed network enhances the extraction of the micro-Doppler frequency corresponding to the respiration rate. + +
+
+ comment: Presented at 2021 IEEE Radar Conference (RadarConf21) +
+
+
+
+
+ + ☆ Overcoming Scene Context Constraints for Object Detection in wild using + Defilters + + +
+ This paper focuses on improving object detection performance by addressing the issue of image distortions, commonly encountered in uncontrolled acquisition environments. High-level computer vision tasks such as object detection, recognition, and segmentation are particularly sensitive to image distortion. To address this issue, we propose a novel approach employing an image defilter to rectify image distortion prior to object detection. This method enhances object detection accuracy, as models perform optimally when trained on non-distorted images. Our experiments demonstrate that utilizing defiltered images significantly improves mean average precision compared to training object detection models on distorted images. Consequently, our proposed method offers considerable benefits for real-world applications plagued by image distortion. To our knowledge, the contribution lies in employing a distortion-removal paradigm for object detection on images captured in natural settings. We achieved improvements of 0.562 and 0.564 in mean average precision on the validation and test data, respectively. + +
+
+
+
+
+ + ☆ AdaContour: Adaptive Contour Descriptor with Hierarchical Representation + + +
+ Existing angle-based contour descriptors suffer from lossy representation for non-star-convex shapes. By and large, this is the result of the shape being registered with a single global inner center and a set of radii corresponding to a polar coordinate parameterization. In this paper, we propose AdaContour, an adaptive contour descriptor that uses multiple local representations to desirably characterize complex shapes. After hierarchically encoding object shapes in a training set and constructing a contour matrix of all subdivided regions, we compute a robust low-rank subspace and approximate each local contour by linearly combining the shared basis vectors to represent an object. Experiments show that AdaContour is able to represent shapes more accurately and robustly than other descriptors while retaining effectiveness. We validate AdaContour by integrating it into off-the-shelf detectors to enable instance segmentation, which demonstrates faithful performance. The code is available at https://github.com/tding1/AdaContour. + +
+
+
+
+
+ + ☆ On Input Formats for Radar Micro-Doppler Signature Processing by + Convolutional Neural Networks + + +
+ Convolutional neural networks have often been proposed for processing radar Micro-Doppler signatures, most commonly with the goal of classifying the signals. The majority of works tend to disregard phase information from the complex time-frequency representation. Here, the utility of the phase information, as well as the optimal format of the Doppler-time input for a convolutional neural network, is analysed. It is found that the performance achieved by convolutional neural network classifiers is heavily influenced by the type of input representation, even across formats with equivalent information. Furthermore, it is demonstrated that the phase component of the Doppler-time representation contains rich information useful for classification and that unwrapping the phase in the temporal dimension can improve the results compared to a magnitude-only solution, improving accuracy from 0.920 to 0.938 on the tested human activity dataset. A further improvement to 0.947 is achieved by training a linear classifier on embeddings from multiple formats. + +
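Building a magnitude-plus-temporally-unwrapped-phase input of the kind described above is straightforward with SciPy; the STFT parameters and the synthetic signal are illustrative, not those of the paper.

```python
import numpy as np
from scipy.signal import stft

def doppler_time_input(iq_signal, fs, nperseg=128):
    """Build a two-channel Doppler-time input: log-magnitude plus phase unwrapped
    along the time axis of the complex spectrogram."""
    _, _, Z = stft(iq_signal, fs=fs, nperseg=nperseg, return_onesided=False)
    magnitude = 20 * np.log10(np.abs(Z) + 1e-8)
    phase = np.unwrap(np.angle(Z), axis=-1)          # unwrap along the slow-time dimension
    return np.stack([magnitude, phase], axis=0)      # shape: (2, freq_bins, time_frames)

x = np.exp(1j * 2 * np.pi * 50 * np.arange(0, 1, 1 / 4000))   # synthetic complex radar return
inp = doppler_time_input(x, fs=4000)
```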
+
+ comment: Presented at International Conference on Radar Systems (RADAR 2022) +
+
+
+
+
+ + ☆ A Survey of Neural Network Robustness Assessment in Image Recognition + + +
+ In recent years, there has been significant attention given to the robustness +assessment of neural networks. Robustness plays a critical role in ensuring +reliable operation of artificial intelligence (AI) systems in complex and +uncertain environments. Deep learning's robustness problem is particularly +significant, highlighted by the discovery of adversarial attacks on image +classification models. Researchers have dedicated efforts to evaluate +robustness in diverse perturbation conditions for image recognition tasks. +Robustness assessment encompasses two main techniques: robustness verification/ +certification for deliberate adversarial attacks and robustness testing for +random data corruptions. In this survey, we present a detailed examination of +both adversarial robustness (AR) and corruption robustness (CR) in neural +network assessment. Analyzing current research papers and standards, we provide +an extensive overview of robustness assessment in image recognition. Three +essential aspects are analyzed: concepts, metrics, and assessment methods. We +investigate the perturbation metrics and range representations used to measure +the degree of perturbations on images, as well as the robustness metrics +specifically for the robustness conditions of classification models. The +strengths and limitations of the existing methods are also discussed, and some +potential directions for future research are provided. + +
+
+
+
+
+ + ☆ Calibration & Reconstruction: Deep Integrated Language for Referring + Image Segmentation ICMR2024 + + +
+ Referring image segmentation aims to segment an object referred to by a natural language expression from an image. The primary challenge lies in the efficient propagation of fine-grained semantic information from textual features to visual features. Many recent works utilize a Transformer to address this challenge. However, conventional transformer decoders can distort linguistic information with deeper layers, leading to suboptimal results. In this paper, we introduce CRFormer, a model that iteratively calibrates multi-modal features in the transformer decoder. We start by generating language queries using vision features, emphasizing different aspects of the input language. Then, we propose a novel Calibration Decoder (CDec) wherein the multi-modal features can be iteratively calibrated by the input language features. In the Calibration Decoder, we use the output of each decoder layer and the original language features to generate new queries for continuous calibration, which gradually updates the language features. Based on CDec, we introduce a Language Reconstruction Module and a reconstruction loss. This module leverages queries from the final layer of the decoder to reconstruct the input language and compute the reconstruction loss. This can further prevent the language information from being lost or distorted. Our experiments consistently show the superior performance of our approach across the RefCOCO, RefCOCO+, and G-Ref datasets compared to state-of-the-art methods. + +
+
+ comment: 9 pages, 8 figures ICMR2024. arXiv admin note: text overlap with + arXiv:2305.14969 +
+
+
+
+
+ + ☆ Convolutional neural network classification of cancer cytopathology + images: taking breast cancer as an example + + +
+ Breast cancer is a relatively common cancer among gynecological cancers. Its diagnosis often relies on the pathology of cells in the lesion. The pathological diagnosis of breast cancer not only requires professionals and time, but also sometimes involves subjective judgment. To address the challenges of dependence on pathologists' expertise and the time-consuming nature of achieving accurate breast pathological image classification, this paper introduces an approach utilizing convolutional neural networks (CNNs) for the rapid categorization of pathological images, aiming to enhance the efficiency of breast pathological image detection. The approach enables the rapid and automatic classification of pathological images into benign and malignant groups. The methodology involves a convolutional neural network (CNN) model leveraging the InceptionV3 architecture and a transfer learning algorithm to extract features from pathological images, followed by a neural network with fully connected layers and a SoftMax function for image classification. Additionally, the concept of image partitioning is introduced to handle high-resolution images. To achieve the final classification outcome, the classification probabilities of each image block are aggregated using three algorithms: summation, product, and maximum. Experimental validation was conducted on the BreaKHis public dataset, resulting in accuracy rates surpassing 0.92 across all four magnification coefficients (40X, 100X, 200X, and 400X). This demonstrates that the proposed method effectively enhances the accuracy of classifying pathological images of breast cancer. + +
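The block-level aggregation step described above (sum, product, maximum over per-block class probabilities) can be written in a few lines; the toy probabilities are illustrative.

```python
import numpy as np

def aggregate_block_probabilities(block_probs, rule="sum"):
    """Combine per-block class probabilities (shape: n_blocks x n_classes) into one
    image-level prediction using one of the three rules mentioned in the abstract."""
    block_probs = np.asarray(block_probs, dtype=float)
    if rule == "sum":
        scores = block_probs.sum(axis=0)
    elif rule == "product":
        scores = np.exp(np.log(block_probs + 1e-12).sum(axis=0))   # product, computed in log space
    elif rule == "max":
        scores = block_probs.max(axis=0)
    else:
        raise ValueError(f"unknown rule: {rule}")
    return scores / scores.sum(), int(np.argmax(scores))

probs = [[0.8, 0.2], [0.6, 0.4], [0.9, 0.1]]          # e.g. [benign, malignant] per image block
image_probs, label = aggregate_block_probabilities(probs, rule="sum")
```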
+
+
+
+
+ + ☆ FaceFilterSense: A Filter-Resistant Face Recognition and Facial + Attribute Analysis Framework + + +
+ With the advent of social media, fun selfie filters have come into tremendous mainstream use, affecting the functioning of facial biometric systems as well as image recognition systems. These filters vary from beautification filters and Augmented Reality (AR)-based filters to filters that modify facial landmarks. Hence, there is a need to assess the impact of such filters on the performance of existing face recognition systems. The limitation associated with existing solutions is that they focus mainly on beautification filters. However, the current AR-based filters and filters which distort facial key points are in vogue recently and make the faces highly unrecognizable even to the naked eye. Also, the filters considered are mostly obsolete, with limited variations. To mitigate these limitations, we aim to perform a holistic impact analysis of the latest filters and propose a user recognition model for the filtered images. We have utilized a benchmark dataset for baseline images and applied the latest filters over them to generate a beautified/filtered dataset. Next, we introduce FaceFilterNet, a model for beautified user recognition. In this framework, we also utilize our model to comment on various attributes of the person, including age, gender, and ethnicity. In addition, we present a filter-wise impact analysis on face recognition, age estimation, gender, and ethnicity prediction. The proposed method affirms the efficacy of our dataset with an accuracy of 87.25% and an optimal accuracy for facial attribute analysis. + +
+
+
+
+
+ + ☆ Struggle with Adversarial Defense? Try Diffusion + + +
+ Adversarial attacks induce misclassification by introducing subtle perturbations. Recently, diffusion models have been applied to image classifiers to improve adversarial robustness through adversarial training or by purifying adversarial noise. However, diffusion-based adversarial training often encounters convergence challenges and high computational expenses. Additionally, diffusion-based purification inevitably causes data shift and is deemed susceptible to stronger adaptive attacks. To tackle these issues, we propose the Truth Maximization Diffusion Classifier (TMDC), a generative Bayesian classifier that builds upon pre-trained diffusion models and Bayes' theorem. Unlike data-driven classifiers, TMDC, guided by Bayesian principles, utilizes the conditional likelihood from diffusion models to determine the class probabilities of input images, thereby insulating against the influences of data shift and the limitations of adversarial training. Moreover, to enhance TMDC's resilience against more potent adversarial attacks, we propose an optimization strategy for diffusion classifiers. This strategy involves post-training the diffusion model on perturbed datasets with ground-truth labels as conditions, guiding the diffusion model to learn the data distribution and maximizing the likelihood under the ground-truth labels. The proposed method achieves state-of-the-art performance on the CIFAR10 dataset against heavy white-box attacks and strong adaptive attacks. Specifically, TMDC achieves robust accuracies of 82.81% against $l_{\infty}$ norm-bounded perturbations and 86.05% against $l_{2}$ norm-bounded perturbations, respectively, with $\epsilon=0.05$. + +
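A heavily simplified sketch of a generative Bayesian (diffusion) classifier in this spirit; `diffusion_loss` is a placeholder for a pretrained class-conditional diffusion model's denoising loss, and treating that loss as a negative log-likelihood under a uniform prior is a simplifying assumption, not the paper's exact formulation.

```python
import torch

@torch.no_grad()
def diffusion_bayes_classify(diffusion_loss, x, num_classes, n_samples=16):
    """Approximate log p(x | y) by the negative conditional denoising loss averaged over
    noise draws, then apply Bayes' rule with a uniform class prior.
    diffusion_loss(x, y) -> per-sample loss tensor of shape (B,); placeholder interface."""
    log_likelihoods = []
    for y in range(num_classes):
        losses = torch.stack([diffusion_loss(x, y) for _ in range(n_samples)])
        log_likelihoods.append(-losses.mean(dim=0))        # lower loss => higher likelihood
    log_post = torch.stack(log_likelihoods, dim=-1)        # (B, num_classes); uniform prior cancels
    return log_post.softmax(dim=-1)                        # approximate posterior p(y | x)
```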
+
+
+
+
+ + ☆ Guided Masked Self-Distillation Modeling for Distributed Multimedia + Sensor Event Analysis + + +
+ Observations with distributed sensors are essential in analyzing a series of +human and machine activities (referred to as 'events' in this paper) in complex +and extensive real-world environments. This is because the information obtained +from a single sensor is often missing or fragmented in such an environment; +observations from multiple locations and modalities should be integrated to +analyze events comprehensively. However, a learning method has yet to be +established to extract joint representations that effectively combine such +distributed observations. Therefore, we propose Guided Masked sELf-Distillation +modeling (Guided-MELD) for inter-sensor relationship modeling. The basic idea +of Guided-MELD is to learn to supplement the information from the masked sensor +with information from other sensors needed to detect the event. Guided-MELD is +expected to enable the system to effectively distill the fragmented or +redundant target event information obtained by the sensors without being overly +dependent on any specific sensors. To validate the effectiveness of the +proposed method in novel tasks of distributed multimedia sensor event analysis, +we recorded two new datasets that fit the problem setting: MM-Store and +MM-Office. These datasets consist of human activities in a convenience store +and an office, recorded using distributed cameras and microphones. Experimental +results on these datasets show that the proposed Guided-MELD improves event +tagging and detection performance and outperforms conventional inter-sensor +relationship modeling methods. Furthermore, the proposed method performed +robustly even when sensors were reduced. + +
+
+ comment: 13 pages, 7 figures, under review +
+
+
+
+
+ + ☆ Practical Region-level Attack against Segment Anything Models + + +
+ Segment Anything Models (SAM) have made significant advancements in image +segmentation, allowing users to segment target portions of an image with a +single click (i.e., user prompt). Given its broad applications, the robustness +of SAM against adversarial attacks is a critical concern. While recent works +have explored adversarial attacks against a pre-defined prompt/click, their +threat model is not yet realistic: (1) they often assume the user-click +position is known to the attacker (point-based attack), and (2) they often +operate under a white-box setting with limited transferability. In this paper, +we propose a more practical region-level attack where attackers do not need to +know the precise user prompt. The attack remains effective as the user clicks +on any point on the target object in the image, hiding the object from SAM. +Also, by adapting a spectrum transformation method, we make the attack more +transferable under a black-box setting. Both control experiments and testing +against real-world SAM services confirm its effectiveness. + +
+
+
+
+
+ + ☆ MonoPatchNeRF: Improving Neural Radiance Fields with Patch-based + Monocular Guidance + + +
+ The latest regularized Neural Radiance Field (NeRF) approaches produce poor +geometry and view extrapolation for multiview stereo (MVS) benchmarks such as +ETH3D. In this paper, we aim to create 3D models that provide accurate geometry +and view synthesis, partially closing the large geometric performance gap +between NeRF and traditional MVS methods. We propose a patch-based approach +that effectively leverages monocular surface normal and relative depth +predictions. The patch-based ray sampling also enables the appearance +regularization of normalized cross-correlation (NCC) and structural similarity +(SSIM) between randomly sampled virtual and training views. We further show +that "density restrictions" based on sparse structure-from-motion points can +help greatly improve geometric accuracy with a slight drop in novel view +synthesis metrics. Our experiments show 4x the performance of RegNeRF and 8x +that of FreeNeRF on average F1@2cm for ETH3D MVS benchmark, suggesting a +fruitful research direction to improve the geometric accuracy of NeRF-based +models, and sheds light on a potential future approach to enable NeRF-based +optimization to eventually outperform traditional MVS. + +
+
+ comment: 26 pages, 15 figures +
+
+
+
+
+ + ☆ Simulation of a Vision Correction Display System + + +
+ Eyes serve as our primary sensory organs, responsible for processing up to +80\% of our sensory input. However, common visual aberrations like myopia and +hyperopia affect a significant portion of the global population. This paper +focuses on simulating a Vision Correction Display (VCD) to enhance the visual +experience of individuals with various visual impairments. Utilising Blender, +we digitally model the functionality of a VCD in correcting refractive errors +such as myopia and hyperopia. With these simulations we can see potential +improvements in visual acuity and comfort. These simulations provide valuable +insights for the design and development of future VCD technologies, ultimately +advancing accessibility and usability for individuals with visual challenges. + +
+
+
+
+
+ + ☆ IFViT: Interpretable Fixed-Length Representation for Fingerprint + Matching via Vision Transformer + + +
+ Determining dense feature points on fingerprints used in constructing deep +fixed-length representations for accurate matching, particularly at the pixel +level, is of significant interest. To explore the interpretability of +fingerprint matching, we propose a multi-stage interpretable fingerprint +matching network, namely Interpretable Fixed-length Representation for +Fingerprint Matching via Vision Transformer (IFViT), which consists of two +primary modules. The first module, an interpretable dense registration module, +establishes a Vision Transformer (ViT)-based Siamese Network to capture +long-range dependencies and the global context in fingerprint pairs. It +provides interpretable dense pixel-wise correspondences of feature points for +fingerprint alignment and enhances the interpretability in the subsequent +matching stage. The second module takes into account both local and global +representations of the aligned fingerprint pair to achieve an interpretable +fixed-length representation extraction and matching. It employs the ViTs +trained in the first module with the additional fully connected layer and +retrains them to simultaneously produce the discriminative fixed-length +representation and interpretable dense pixel-wise correspondences of feature +points. Extensive experimental results on diverse publicly available +fingerprint databases demonstrate that the proposed framework not only exhibits +superior performance on dense registration and matching but also significantly +promotes the interpretability in deep fixed-length representations-based +fingerprint matching. + +
+
+ comment: ready to submit to IEEE Transactions on Information Forensics and + Security (TIFS) +
+
+
+
+
+ + ☆ Enhancing Traffic Safety with Parallel Dense Video Captioning for + End-to-End Event Analysis + + +
+ This paper introduces our solution for Track 2 of the AI City Challenge 2024. The task aims to solve traffic safety description and analysis with the Woven Traffic Safety (WTS) dataset, a real-world Pedestrian-Centric Traffic Video Dataset for Fine-grained Spatial-Temporal Understanding. Our solution mainly focuses on the following points: 1) To solve dense video captioning, we leverage the framework of dense video captioning with parallel decoding (PDVC) to model visual-language sequences and generate dense captions by chapters for the video. 2) Our work leverages CLIP to extract visual features to more efficiently perform cross-modality training between visual and textual representations. 3) We conduct domain-specific model adaptation to mitigate the domain shift problem that poses recognition challenges in video understanding. 4) Moreover, we leverage BDD-5K captioned videos to conduct knowledge transfer for a better understanding of WTS videos and more accurate captioning. Our solution achieved 6th place on the test set of the competition. The open source code will be available at https://github.com/UCF-SST-Lab/AICity2024CVPRW + +
+
+
+
+
+ + ☆ Improving Continuous Sign Language Recognition with Adapted Image Models + + +
+ The increase in web-scale weakly labelled image-text pairs has greatly facilitated the development of large-scale vision-language models (e.g., CLIP), which have shown impressive generalization performance over a series of downstream tasks. However, the massive model size and the scarcity of available data limit the ability to fine-tune the whole model in downstream tasks. Besides, fully fine-tuning the model easily forgets the generic essential knowledge acquired in the pretraining stage and overfits the downstream data. To enable high efficiency when adapting these large vision-language models (e.g., CLIP) to continuous sign language recognition (CSLR) while preserving their generalizability, we propose a novel strategy (AdaptSign). Specifically, CLIP is adopted as the visual backbone to extract frame-wise features with its parameters fixed, and a set of learnable modules is introduced to model spatial sign variations or capture temporal sign movements. The introduced additional modules are quite lightweight, adding only 3.2% extra computation with high efficiency. The generic knowledge acquired in the pretraining stage is well preserved in the frozen CLIP backbone in this process. Extensive experiments show that despite being efficient, AdaptSign is able to demonstrate superior performance across a series of CSLR benchmarks including PHOENIX14, PHOENIX14-T, CSL-Daily and CSL compared to existing methods. Visualizations show that AdaptSign learns to dynamically pay major attention to the informative spatial regions and cross-frame trajectories in sign videos. + +
+
+
+
+
+ + ☆ A Mutual Inclusion Mechanism for Precise Boundary Segmentation in + Medical Images + + +
+ In medical imaging, accurate image segmentation is crucial for quantifying +diseases, assessing prognosis, and evaluating treatment outcomes. However, +existing methods lack an in-depth integration of global and local features, +failing to pay special attention to abnormal regions and boundary details in +medical images. To this end, we present a novel deep learning-based approach, +MIPC-Net, for precise boundary segmentation in medical images. Our approach, +inspired by radiologists' working patterns, features two distinct modules: (i) +\textbf{Mutual Inclusion of Position and Channel Attention (MIPC) module}: To +enhance the precision of boundary segmentation in medical images, we introduce +the MIPC module, which enhances the focus on channel information when +extracting position features and vice versa; (ii) \textbf{GL-MIPC-Residue}: To +improve the restoration of medical images, we propose the GL-MIPC-Residue, a +global residual connection that enhances the integration of the encoder and +decoder by filtering out invalid information and restoring the most effective +information lost during the feature extraction process. We evaluate the +performance of the proposed model using metrics such as Dice coefficient (DSC) +and Hausdorff Distance (HD) on three publicly accessible datasets: Synapse, +ISIC2018-Task, and Segpc. Our ablation study shows that each module contributes +to improving the quality of segmentation results. Furthermore, with the +assistance of both modules, our approach outperforms state-of-the-art methods +across all metrics on the benchmark datasets, notably achieving a 2.23mm +reduction in HD on the Synapse dataset, strongly evidencing our model's +enhanced capability for precise image boundary segmentation. Codes will be +available at https://github.com/SUN-1024/MIPC-Net. + +
+
+
+
+
+ + ☆ Scaling (Down) CLIP: A Comprehensive Analysis of Data, Architecture, and + Training Strategies + + +
+ This paper investigates the performance of the Contrastive Language-Image +Pre-training (CLIP) when scaled down to limited computation budgets. We explore +CLIP along three dimensions: data, architecture, and training strategies. With +regards to data, we demonstrate the significance of high-quality training data +and show that a smaller dataset of high-quality data can outperform a larger +dataset with lower quality. We also examine how model performance varies with +different dataset sizes, suggesting that smaller ViT models are better suited +for smaller datasets, while larger models perform better on larger datasets +with fixed compute. Additionally, we provide guidance on when to choose a +CNN-based architecture or a ViT-based architecture for CLIP training. We +compare four CLIP training strategies - SLIP, FLIP, CLIP, and CLIP+Data +Augmentation - and show that the choice of training strategy depends on the +available compute resource. Our analysis reveals that CLIP+Data Augmentation +can achieve comparable performance to CLIP using only half of the training +data. This work provides practical insights into how to effectively train and +deploy CLIP models, making them more accessible and affordable for practical +use in various applications. + +
+
+
+
+
+ + ☆ Tackling Ambiguity from Perspective of Uncertainty Inference and + Affinity Diversification for Weakly Supervised Semantic Segmentation + + +
+ Weakly supervised semantic segmentation (WSSS) with image-level labels +intends to achieve dense tasks without laborious annotations. However, due to +the ambiguous contexts and fuzzy regions, the performance of WSSS, especially +the stages of generating Class Activation Maps (CAMs) and refining pseudo +masks, widely suffers from ambiguity while being barely noticed by previous +literature. In this work, we propose UniA, a unified single-staged WSSS +framework, to efficiently tackle this issue from the perspective of uncertainty +inference and affinity diversification, respectively. When activating class +objects, we argue that the false activation stems from the bias to the +ambiguous regions during the feature extraction. Therefore, we design a more +robust feature representation with a probabilistic Gaussian distribution and +introduce the uncertainty estimation to avoid the bias. A distribution loss is +particularly proposed to supervise the process, which effectively captures the +ambiguity and models the complex dependencies among features. When refining +pseudo labels, we observe that the affinity from the prevailing refinement +methods intends to be similar among ambiguities. To this end, an affinity +diversification module is proposed to promote diversity among semantics. A +mutual complementing refinement is proposed to initially rectify the ambiguous +affinity with multiple inferred pseudo labels. More importantly, a contrastive +affinity loss is further designed to diversify the relations among unrelated +semantics, which reliably propagates the diversity into the whole feature +representations and helps generate better pseudo masks. Extensive experiments +are conducted on PASCAL VOC, MS COCO, and medical ACDC datasets, which validate +the efficiency of UniA tackling ambiguity and the superiority over recent +single-staged or even most multi-staged competitors. + +
+
+
+
+
+ + ☆ Adapting CNNs for Fisheye Cameras without Retraining + + +
+ The majority of image processing approaches assume images are in or can be +rectified to a perspective projection. However, in many applications it is +beneficial to use non conventional cameras, such as fisheye cameras, that have +a larger field of view (FOV). The issue arises that these large-FOV images +can't be rectified to a perspective projection without significant cropping of +the original image. To address this issue we propose Rectified Convolutions +(RectConv); a new approach for adapting pre-trained convolutional networks to +operate with new non-perspective images, without any retraining. Replacing the +convolutional layers of the network with RectConv layers allows the network to +see both rectified patches and the entire FOV. We demonstrate RectConv adapting +multiple pre-trained networks to perform segmentation and detection on fisheye +imagery from two publicly available datasets. Our approach requires no +additional data or training, and operates directly on the native image as +captured from the camera. We believe this work is a step toward adapting the +vast resources available for perspective images to operate across a broad range +of camera geometries. + +
+
+ comment: Project page: https://roboticimaging.org/Projects/RectConv/ +
+
+
+
+
+ + ☆ Measuring Domain Shifts using Deep Learning Remote Photoplethysmography + Model Similarity + + +
+ Domain shift differences between training data for deep learning models and +the deployment context can result in severe performance issues for models which +fail to generalize. We study the domain shift problem under the context of +remote photoplethysmography (rPPG), a technique for video-based heart rate +inference. We propose metrics based on model similarity which may be used as a +measure of domain shift, and we demonstrate high correlation between these +metrics and empirical performance. One of the proposed metrics with viable +correlations, DS-diff, does not assume access to the ground truth of the target +domain, i.e. it may be applied to in-the-wild data. To that end, we investigate +a model selection problem in which ground truth results for the evaluation +domain is not known, demonstrating a 13.9% performance improvement over the +average case baseline. + +
+
+
+
+
+ + ☆ Pay Attention to Your Neighbours: Training-Free Open-Vocabulary Semantic + Segmentation + + +
+ Despite the significant progress in deep learning for dense visual +recognition problems, such as semantic segmentation, traditional methods are +constrained by fixed class sets. Meanwhile, vision-language foundation models, +such as CLIP, have showcased remarkable effectiveness in numerous zero-shot +image-level tasks, owing to their robust generalizability. Recently, a body of +work has investigated utilizing these models in open-vocabulary semantic +segmentation (OVSS). However, existing approaches often rely on impractical +supervised pre-training or access to additional pre-trained networks. In this +work, we propose a strong baseline for training-free OVSS, termed +Neighbour-Aware CLIP (NACLIP), representing a straightforward adaptation of +CLIP tailored for this scenario. Our method enforces localization of patches in +the self-attention of CLIP's vision transformer which, despite being crucial +for dense prediction tasks, has been overlooked in the OVSS literature. By +incorporating design choices favouring segmentation, our approach significantly +improves performance without requiring additional data, auxiliary pre-trained +networks, or extensive hyperparameter tuning, making it highly practical for +real-world applications. Experiments are performed on 8 popular semantic +segmentation benchmarks, yielding state-of-the-art performance on most +scenarios. Our code is publicly available at https://github.com/sinahmr/NACLIP . + +
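One way to picture "neighbour-aware" attention is to add a Gaussian spatial bias over patch positions to the attention logits, so each patch attends mostly to nearby patches; this standalone sketch is not CLIP's actual implementation, and the bandwidth sigma is arbitrary.

```python
import torch

def neighbour_aware_attention(q, k, v, grid_hw, sigma=2.0):
    """Self-attention over patch tokens with an added Gaussian neighbourhood bias.
    q, k, v: (B, N, D) where N = H * W patches laid out on a grid_hw = (H, W) grid."""
    H, W = grid_hw
    ys, xs = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij")
    coords = torch.stack([ys.flatten(), xs.flatten()], dim=-1).float()    # (N, 2)
    dist2 = ((coords[:, None, :] - coords[None, :, :]) ** 2).sum(-1)      # squared grid distances
    bias = -dist2 / (2 * sigma ** 2)                     # strong penalty for far-away patches
    attn = (q @ k.transpose(-2, -1)) / q.shape[-1] ** 0.5 + bias
    return attn.softmax(dim=-1) @ v

N, D = 14 * 14, 64
q = k = v = torch.randn(1, N, D)
out = neighbour_aware_attention(q, k, v, grid_hw=(14, 14))
```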
+
+
+
+
+ + ☆ Uncertainty Quantification in Detecting Choroidal Metastases on MRI via + Evolutionary Strategies + + +
+ Uncertainty quantification plays a vital role in facilitating the practical implementation of AI in radiology by addressing growing concerns around trustworthiness. Given the challenges associated with acquiring large, annotated datasets in this field, there is a need for methods that enable uncertainty quantification in small-data AI approaches tailored to radiology images. In this study, we focused on uncertainty quantification within the context of the small-data evolutionary-strategies-based technique of deep neuroevolution (DNE). Specifically, we employed DNE to train a simple Convolutional Neural Network (CNN) with MRI images of the eyes for binary classification. The goal was to distinguish between normal eyes and those with metastatic tumors called choroidal metastases. The training set comprised 18 images with choroidal metastases and 18 without tumors, while the testing set contained a tumor-to-normal ratio of 15:15. We trained CNN model weights via DNE for approximately 40,000 episodes, ultimately reaching a convergence of 100% accuracy on the training set. We saved all models that achieved maximal training set accuracy. Then, by applying these models to the testing set, we established an ensemble method for uncertainty quantification. The saved set of models produced distributions for each testing set image between the two classes of normal and tumor-containing. The relative frequencies permitted uncertainty quantification of model predictions. Intriguingly, we found that subjective features appreciated by human radiologists explained images for which uncertainty was high, highlighting the significance of uncertainty quantification in AI-driven radiological analyses. + +
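The ensemble-frequency uncertainty described above reduces to counting votes across the saved models; the entropy score below is one possible summary (an assumption, not necessarily the authors' exact metric).

```python
import numpy as np

def ensemble_uncertainty(predictions):
    """predictions: (n_models, n_images) array of 0/1 class votes from the saved models.
    Returns the per-image tumor-class frequency and a simple entropy-based uncertainty score."""
    votes = np.asarray(predictions, dtype=float)
    p_tumor = votes.mean(axis=0)                        # relative frequency of the 'tumor' class
    p = np.clip(np.stack([1 - p_tumor, p_tumor]), 1e-12, 1.0)
    entropy = -(p * np.log2(p)).sum(axis=0)             # 0 = full agreement, 1 = maximal disagreement
    return p_tumor, entropy

p_tumor, uncertainty = ensemble_uncertainty([[1, 0, 1], [1, 1, 0], [1, 0, 1]])
```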
+
+
+
+
+ + ☆ Structured Model Pruning for Efficient Inference in Computational + Pathology + + +
+ Recent years have seen significant efforts to adopt Artificial Intelligence
+(AI) in healthcare for various use cases, from computer-aided diagnosis to ICU
+triage. However, the size of AI models has been rapidly growing due to scaling
+laws and the success of foundational models, which makes it increasingly
+challenging to leverage advanced models in practical applications. It is thus
+imperative to develop efficient models, especially for deploying AI solutions
+under resource constraints or with time sensitivity. One potential solution is
+to perform model compression, a set of techniques that remove less important
+model components or reduce parameter precision, to reduce model computation
+demand. In this work, we demonstrate that model pruning, as a model compression
+technique, can effectively reduce inference cost for computational and digital
+pathology-based analysis with a negligible loss of analysis performance. To
+this end, we develop a methodology for pruning the widely used U-Net-style
+architectures in biomedical imaging, with which we evaluate multiple pruning
+heuristics on nuclei instance segmentation and classification, and empirically
+demonstrate that pruning can compress models by at least 70% with a negligible
+drop in performance.
+
+</p>
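For a sense of what structured pruning looks like in practice, here is a generic L1 channel-pruning sketch using PyTorch's built-in pruning utilities; it is not the paper's U-Net methodology or its specific heuristics. Note that zeroed filters only yield real speedups once the corresponding channels are physically removed from the network.

```python
# Illustrative structured pruning of a convolutional layer with PyTorch's
# pruning utilities (generic L1 channel pruning, not the paper's exact method).
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune

conv = nn.Conv2d(64, 128, kernel_size=3, padding=1)

# Zero out 70% of the output filters with the smallest L1 norm (dim=0 = filters).
prune.ln_structured(conv, name="weight", amount=0.7, n=1, dim=0)
prune.remove(conv, "weight")   # make the pruning permanent

# Fraction of weights now zero (the pruned filters).
sparsity = (conv.weight == 0).float().mean().item()
print(f"weight sparsity: {sparsity:.2f}")
```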
+
+
+
+
+ + ☆ "Don't forget to put the milk back!" Dataset for Enabling Embodied + Agents to Detect Anomalous Situations + + +
+ Home robots are intended to make their users' lives easier. Our work assists
+in this goal by enabling robots to inform their users of dangerous or
+unsanitary anomalies in their home. Some examples of these anomalies include
+the user leaving their milk out, forgetting to turn off the stove, or leaving
+poison accessible to children. To move towards enabling home robots with these
+abilities, we have created a new dataset, which we call SafetyDetect. The
+SafetyDetect dataset consists of 1000 anomalous home scenes, each of which
+contains unsafe or unsanitary situations for an agent to detect. Our approach
+utilizes large language models (LLMs) alongside both a graph representation of
+the scene and the relationships between the objects in the scene. Our key
+insight is that this connected scene graph and the object relationships it
+encodes enable the LLM to better reason about the scene -- especially as it
+relates to detecting dangerous or unsanitary situations. Our most promising
+approach utilizes GPT-4 and pursues a categorization technique where object
+relations from the scene graph are classified as normal, dangerous, unsanitary,
+or dangerous for children. This method is able to correctly identify over 90%
+of anomalous scenarios in the SafetyDetect dataset. Additionally, we conduct
+real-world experiments on a ClearPath TurtleBot where we generate a scene graph
+from visuals of the real-world scene and run our approach with no modification.
+This setup resulted in little performance loss. The SafetyDetect dataset and
+code will be released to the public upon this paper's publication.
+
+</p>
+
+
+
+
+ + ☆ Single-image driven 3d viewpoint training data augmentation for + effective wine label recognition + + +
+ Confronting the critical challenge of insufficient training data in the field
+of complex image recognition, this paper introduces a novel 3D viewpoint
+augmentation technique specifically tailored for wine label recognition. This
+method enhances deep learning model performance by generating visually
+realistic training samples from a single real-world wine label image,
+overcoming the challenges posed by the intricate combinations of text and
+logos. Classical Generative Adversarial Network (GAN) methods fall short in
+synthesizing such intricate content combinations. Our proposed solution
+leverages time-tested computer vision and image processing strategies to expand
+our training dataset, thereby broadening the range of training samples for deep
+learning applications. This innovative approach to data augmentation
+circumvents the constraints of limited training resources. Using the augmented
+training images through batch-all triplet metric learning on a Vision
+Transformer (ViT) architecture, we obtain discriminative embedding features for
+every wine label, enabling us to perform one-shot recognition of existing wine
+labels in the training classes or of future, newly collected wine labels
+unavailable in the training set. Experimental results show a significant
+increase in recognition accuracy over conventional 2D data augmentation
+techniques.
+
+</p>
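A classical way to approximate 3D viewpoint changes from a single flat label image is a random perspective (homography) warp. The OpenCV sketch below illustrates that generic idea under stated assumptions (corner-jitter strength, border handling); it is not the paper's exact augmentation pipeline.

```python
# Sketch of single-image viewpoint augmentation via random homography warps
# (a generic classical-CV approach, not necessarily the paper's exact pipeline).
import cv2
import numpy as np

def random_viewpoint(label_img: np.ndarray, max_shift: float = 0.15) -> np.ndarray:
    h, w = label_img.shape[:2]
    src = np.float32([[0, 0], [w, 0], [w, h], [0, h]])
    # Jitter each corner to mimic viewing the flat label from a different angle.
    jitter = (np.random.uniform(-max_shift, max_shift, (4, 2)) * [w, h]).astype(np.float32)
    dst = src + jitter
    H = cv2.getPerspectiveTransform(src, dst)
    return cv2.warpPerspective(label_img, H, (w, h), borderMode=cv2.BORDER_REPLICATE)

# img = cv2.imread("wine_label.jpg"); aug = random_viewpoint(img)
```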
+
+
+
+
+ + ☆ E3: Ensemble of Expert Embedders for Adapting Synthetic Image Detectors + to New Generators Using Limited Data + + +
+ As generative AI progresses rapidly, new synthetic image generators continue +to emerge at a swift pace. Traditional detection methods face two main +challenges in adapting to these generators: the forensic traces of synthetic +images from new techniques can vastly differ from those learned during +training, and access to data for these new generators is often limited. To +address these issues, we introduce the Ensemble of Expert Embedders (E3), a +novel continual learning framework for updating synthetic image detectors. E3 +enables the accurate detection of images from newly emerged generators using +minimal training data. Our approach does this by first employing transfer +learning to develop a suite of expert embedders, each specializing in the +forensic traces of a specific generator. Then, all embeddings are jointly +analyzed by an Expert Knowledge Fusion Network to produce accurate and reliable +detection decisions. Our experiments demonstrate that E3 outperforms existing +continual learning methods, including those developed specifically for +synthetic image detection. + +
+
+
+
+
+ + ☆ Real-time guidewire tracking and segmentation in intraoperative x-ray + + +
+ During endovascular interventions, physicians have to perform accurate and +immediate operations based on the available real-time information, such as the +shape and position of guidewires observed on the fluoroscopic images, haptic +information and the patients' physiological signals. For this purpose, +real-time and accurate guidewire segmentation and tracking can enhance the +visualization of guidewires and provide visual feedback for physicians during +the intervention as well as for robot-assisted interventions. Nevertheless, +this task often comes with the challenge of elongated deformable structures +that present themselves with low contrast in the noisy fluoroscopic image +sequences. To address these issues, a two-stage deep learning framework for +real-time guidewire segmentation and tracking is proposed. In the first stage, +a Yolov5s detector is trained, using the original X-ray images as well as +synthetic ones, which is employed to output the bounding boxes of possible +target guidewires. More importantly, a refinement module based on +spatiotemporal constraints is incorporated to robustly localize the guidewire +and remove false detections. In the second stage, a novel and efficient network +is proposed to segment the guidewire in each detected bounding box. The network +contains two major modules, namely a hessian-based enhancement embedding module +and a dual self-attention module. Quantitative and qualitative evaluations on +clinical intra-operative images demonstrate that the proposed approach +significantly outperforms our baselines as well as the current state of the art +and, in comparison, shows higher robustness to low quality images. + +
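Hessian-based enhancement of thin, elongated structures has a classical analogue in vesselness filters. The sketch below uses scikit-image's Frangi filter as a simple stand-in for the learned Hessian-based enhancement module described above; parameters are illustrative.

```python
# Classical Hessian-based enhancement of thin, curvilinear structures (Frangi
# vesselness), shown as a stand-in for the learned Hessian-based module.
import numpy as np
from skimage.filters import frangi

def enhance_guidewire(xray: np.ndarray) -> np.ndarray:
    """xray: 2D grayscale fluoroscopy frame scaled to [0, 1]."""
    # black_ridges=True highlights dark, elongated structures such as guidewires.
    response = frangi(xray, sigmas=range(1, 4), black_ridges=True)
    return (response - response.min()) / (response.max() - response.min() + 1e-8)
```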
+
+
+
+
+ + ☆ Semantic Approach to Quantifying the Consistency of Diffusion Model + Image Generation CVPR 3 + + +
+ In this study, we identify the need for an interpretable, quantitative score
+of the repeatability, or consistency, of image generation in diffusion models.
+We propose a semantic approach, using a pairwise mean CLIP (Contrastive
+Language-Image Pretraining) score as our semantic consistency score. We applied
+this metric to compare two state-of-the-art open-source image generation
+diffusion models, Stable Diffusion XL and PixArt-α, and we found statistically
+significant differences between the semantic consistency scores for the models.
+Agreement between the Semantic Consistency Score-selected model and aggregated
+human annotations was 94%. We also explored the consistency of SDXL and a
+LoRA-fine-tuned version of SDXL and found that the fine-tuned model had
+significantly higher semantic consistency in generated images. The Semantic
+Consistency Score proposed here offers a measure of image generation alignment,
+facilitating the evaluation of model architectures for specific tasks and
+aiding in informed decision-making regarding model selection.
+
+</p>
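The pairwise mean CLIP score is straightforward to compute: embed the generated images with a CLIP image encoder and average the cosine similarity over all image pairs. A minimal sketch follows, assuming the Hugging Face ViT-B/32 CLIP checkpoint; the paper's exact CLIP variant and preprocessing may differ.

```python
# Sketch of a pairwise mean CLIP semantic-consistency score, assuming the
# openai/clip-vit-base-patch32 checkpoint from Hugging Face transformers.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def semantic_consistency(images: list[Image.Image]) -> float:
    inputs = processor(images=images, return_tensors="pt")
    with torch.no_grad():
        emb = model.get_image_features(**inputs)
    emb = emb / emb.norm(dim=-1, keepdim=True)
    sim = emb @ emb.T                                   # (N, N) cosine similarities
    n = sim.shape[0]
    off_diag = sim[~torch.eye(n, dtype=torch.bool)]     # exclude self-similarity
    return off_diag.mean().item()                       # pairwise mean CLIP score
```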
+
+ comment: Accepted to 2024 CVPR 3rd Explainable AI for Computer Vision (XAI4CV) + Workshop +
+
+
+
+
+ + ☆ Detecting AI-Generated Images via CLIP + + +
+ As AI-generated image (AIGI) methods become more powerful and accessible, it +has become a critical task to determine if an image is real or AI-generated. +Because AIGI lack the signatures of photographs and have their own unique +patterns, new models are needed to determine if an image is AI-generated. In +this paper, we investigate the ability of the Contrastive Language-Image +Pre-training (CLIP) architecture, pre-trained on massive internet-scale data +sets, to perform this differentiation. We fine-tune CLIP on real images and +AIGI from several generative models, enabling CLIP to determine if an image is +AI-generated and, if so, determine what generation method was used to create +it. We show that the fine-tuned CLIP architecture is able to differentiate AIGI +as well or better than models whose architecture is specifically designed to +detect AIGI. Our method will significantly increase access to AIGI-detecting +tools and reduce the negative effects of AIGI on society, as our CLIP +fine-tuning procedures require no architecture changes from publicly available +model repositories and consume significantly less GPU resources than other AIGI +detection models. + +
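As a lightweight illustration of the general recipe, the sketch below trains a linear probe on frozen CLIP image features (extracted, for example, as in the previous sketch) for real-vs-AI-generated classification. The paper fine-tunes CLIP itself; this is only a simplified analogue, and the array names are assumptions.

```python
# Linear probe on frozen CLIP image features for real vs. AI-generated
# classification (a simplified analogue of the paper's CLIP fine-tuning).
import numpy as np
from sklearn.linear_model import LogisticRegression

def train_aigi_probe(real_feats: np.ndarray, fake_feats: np.ndarray):
    """real_feats, fake_feats: (N, D) CLIP image embeddings, L2-normalized."""
    X = np.concatenate([real_feats, fake_feats])
    y = np.concatenate([np.zeros(len(real_feats)), np.ones(len(fake_feats))])
    clf = LogisticRegression(max_iter=1000).fit(X, y)
    return clf   # clf.predict_proba(feats)[:, 1] gives the "AI-generated" score
```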
+
+ comment: submitted for publication in Machine Vision and Applications +
+
+
+
+
+ + ☆ Under pressure: learning-based analog gauge reading in the wild ICRA + + +
+ We propose an interpretable framework for reading analog gauges that is +deployable on real world robotic systems. Our framework splits the reading task +into distinct steps, such that we can detect potential failures at each step. +Our system needs no prior knowledge of the type of gauge or the range of the +scale and is able to extract the units used. We show that our gauge reading +algorithm is able to extract readings with a relative reading error of less +than 2%. + +
+
+ comment: 7 pages, 8 figures, accepted for presentation at the 2024 IEEE + International Conference on Robotics and Automation (ICRA) and for inclusion + in the conference proceedings, finalist for the IEEE ICRA 2024 Best Paper + Award in Automation, source code + https://github.com/ethz-asl/analog_gauge_reader, Autonomous Systems Lab, ETH + Zurich +
+
+
+
+
+ + ☆ Towards Sim-to-Real Industrial Parts Classification with Synthetic + Dataset CVPR + + +
+ This paper is about effectively utilizing synthetic data for training deep +neural networks for industrial parts classification, in particular, by taking +into account the domain gap against real-world images. To this end, we +introduce a synthetic dataset that may serve as a preliminary testbed for the +Sim-to-Real challenge; it contains 17 objects of six industrial use cases, +including isolated and assembled parts. A few subsets of objects exhibit large +similarities in shape and albedo for reflecting challenging cases of industrial +parts. All the sample images come with and without random backgrounds and +post-processing for evaluating the importance of domain randomization. We call +it Synthetic Industrial Parts dataset (SIP-17). We study the usefulness of +SIP-17 through benchmarking the performance of five state-of-the-art deep +network models, supervised and self-supervised, trained only on the synthetic +data while testing them on real data. By analyzing the results, we deduce some +insights on the feasibility and challenges of using synthetic data for +industrial parts classification and for further developing larger-scale +synthetic datasets. Our dataset and code are publicly available. + +
+
+ comment: Published in 2023 IEEE/CVF Conference on Computer Vision and Pattern + Recognition Workshops (CVPRW) +
+
+
+
+
+ + ☆ LLM-Seg: Bridging Image Segmentation and Large Language Model Reasoning + + +
+ Understanding human instructions to identify the target objects is vital for
+perception systems. In recent years, advances in Large Language Models (LLMs)
+have introduced new possibilities for image segmentation. In this work, we
+delve into reasoning segmentation, a novel task that enables a segmentation
+system to reason about and interpret implicit user intentions via large
+language model reasoning and then segment the corresponding target. Our work on
+reasoning segmentation contributes to both the methodological design and the
+dataset labeling. For the model, we propose a new framework named LLM-Seg.
+LLM-Seg effectively connects the foundational Segment Anything Model and the
+LLM via mask proposal selection. For the dataset, we propose an automatic data
+generation pipeline and construct a new reasoning segmentation dataset named
+LLM-Seg40K. Experiments demonstrate that our LLM-Seg exhibits competitive
+performance compared with existing methods. Furthermore, our proposed pipeline
+can efficiently produce high-quality reasoning segmentation datasets. The
+LLM-Seg40K dataset, developed through this pipeline, serves as a new benchmark
+for training and evaluating various reasoning segmentation approaches. Our
+code, models and dataset are at https://github.com/wangjunchi/LLMSeg.
+
+</p>
+
+ comment: Github: https://github.com/wangjunchi/LLMSeg +
+
+
+
+
+ + ☆ `Eyes of a Hawk and Ears of a Fox': Part Prototype Network for + Generalized Zero-Shot Learning CVPR 2024 + + +
+ Current approaches in Generalized Zero-Shot Learning (GZSL) are built upon +base models which consider only a single class attribute vector representation +over the entire image. This is an oversimplification of the process of novel +category recognition, where different regions of the image may have properties +from different seen classes and thus have different predominant attributes. +With this in mind, we take a fundamentally different approach: a pre-trained +Vision-Language detector (VINVL) sensitive to attribute information is employed +to efficiently obtain region features. A learned function maps the region +features to region-specific attribute attention used to construct class part +prototypes. We conduct experiments on a popular GZSL benchmark consisting of +the CUB, SUN, and AWA2 datasets where our proposed Part Prototype Network (PPN) +achieves promising results when compared with other popular base models. +Corresponding ablation studies and analysis show that our approach is highly +practical and has a distinct advantage over global attribute attention when +localized proposals are available. + +
+
+ comment: Accepted to the CVPR 2024 LIMIT Workshop +
+
+
+
+
+ + ☆ SCOUT+: Towards Practical Task-Driven Drivers' Gaze Prediction + + +
+ Accurate prediction of drivers' gaze is an important component of +vision-based driver monitoring and assistive systems. Of particular interest +are safety-critical episodes, such as performing maneuvers or crossing +intersections. In such scenarios, drivers' gaze distribution changes +significantly and becomes difficult to predict, especially if the task and +context information is represented implicitly, as is common in many +state-of-the-art models. However, explicit modeling of top-down factors +affecting drivers' attention often requires additional information and +annotations that may not be readily available. + In this paper, we address the challenge of effective modeling of task and +context with common sources of data for use in practical systems. To this end, +we introduce SCOUT+, a task- and context-aware model for drivers' gaze +prediction, which leverages route and map information inferred from commonly +available GPS data. We evaluate our model on two datasets, DR(eye)VE and BDD-A, +and demonstrate that using maps improves results compared to bottom-up models +and reaches performance comparable to the top-down model SCOUT which relies on +privileged ground truth information. Code is available at +https://github.com/ykotseruba/SCOUT. + +
+
+ comment: Accepted at IEEE Intelligent Vehicles Symposium (IV), 2024 +
+
+
+
+
+ + ☆ Training a Vision Language Model as Smartphone Assistant ICLR 2024 + + +
+ Addressing the challenge of a digital assistant capable of executing a wide +array of user tasks, our research focuses on the realm of instruction-based +mobile device control. We leverage recent advancements in large language models +(LLMs) and present a visual language model (VLM) that can fulfill diverse tasks +on mobile devices. Our model functions by interacting solely with the user +interface (UI). It uses the visual input from the device screen and mimics +human-like interactions, encompassing gestures such as tapping and swiping. +This generality in the input and output space allows our agent to interact with +any application on the device. Unlike previous methods, our model operates not +only on a single screen image but on vision-language sentences created from +sequences of past screenshots along with corresponding actions. Evaluating our +method on the challenging Android in the Wild benchmark demonstrates its +promising efficacy and potential. + +
+
+ comment: ICLR 2024 workshop on Generative Models for Decision Making +
+
+
+
+
+ + ☆ Data Limitations for Modeling Top-Down Effects on Drivers' Attention + + +
+ Driving is a visuomotor task, i.e., there is a connection between what
+drivers see and what they do. While some models of drivers' gaze account for
+top-down effects of drivers' actions, the majority learn only bottom-up
+correlations between human gaze and driving footage. The crux of the problem is
+the lack of public data with annotations that could be used to train top-down
+models and evaluate how well models of any kind capture effects of task on
+attention. As a result, top-down models are trained and evaluated on private
+data, and public benchmarks measure only the overall fit to human data.
+ In this paper, we focus on data limitations by examining four large-scale
+public datasets, DR(eye)VE, BDD-A, MAAD, and LBW, used to train and evaluate
+algorithms for drivers' gaze prediction. We define a set of driving tasks
+(lateral and longitudinal maneuvers) and context elements (intersections and
+right-of-way) known to affect drivers' attention, augment the datasets with
+annotations based on these definitions, and analyze the characteristics of data
+recording and processing pipelines w.r.t. capturing what the drivers see and
+do. In sum, the contributions of this work are: 1) quantifying biases of the
+public datasets, 2) examining performance of the SOTA bottom-up models on
+subsets of the data involving non-trivial drivers' actions, 3) linking
+shortcomings of the bottom-up models to data limitations, and 4)
+recommendations for future data collection and processing. The new annotations
+and code for reproducing the results are available at
+https://github.com/ykotseruba/SCOUT.
+
+</p>
+
+ comment: Accepted at IEEE Intelligent Vehicles Symposium (IV), 2024 +
+
+
+
+
+ + ☆ Multi-Branch Generative Models for Multichannel Imaging with an + Application to PET/CT Joint Reconstruction + + +
+ This paper presents a proof-of-concept approach for learned synergistic +reconstruction of medical images using multi-branch generative models. +Leveraging variational autoencoders (VAEs) and generative adversarial networks +(GANs), our models learn from pairs of images simultaneously, enabling +effective denoising and reconstruction. Synergistic image reconstruction is +achieved by incorporating the trained models in a regularizer that evaluates +the distance between the images and the model, in a similar fashion to +multichannel dictionary learning (DiL). We demonstrate the efficacy of our +approach on both Modified National Institute of Standards and Technology +(MNIST) and positron emission tomography (PET)/computed tomography (CT) +datasets, showcasing improved image quality and information sharing between +modalities. Despite challenges such as patch decomposition and model +limitations, our results underscore the potential of generative models for +enhancing medical imaging reconstruction. + +
+
+ comment: 12 pages, 16 figures, submitted to IEEE TRPMS +
+
+
+
+
+ + ☆ Into the Fog: Evaluating Multiple Object Tracking Robustness + + +
+ State-of-the-art (SOTA) trackers have shown remarkable Multiple Object
+Tracking (MOT) performance when trained and evaluated on current benchmarks.
+However, these benchmarks primarily consist of clear scenarios, overlooking
+adverse atmospheric conditions such as fog, haze, smoke and dust. As a result,
+the robustness of SOTA trackers remains underexplored. To address these
+limitations, we propose a pipeline for physics-based volumetric fog simulation
+in arbitrary real-world MOT datasets utilizing frame-by-frame monocular depth
+estimation and a fog formation optical model. Moreover, we enhance our
+simulation by rendering both homogeneous and heterogeneous fog effects. We
+propose to use the dark channel prior method to estimate fog (smoke) color,
+which shows promising results even in nighttime and indoor scenes. We present
+the leading tracking benchmark MOTChallenge (MOT17 dataset) overlaid by fog
+(smoke for indoor scenes) of various intensity levels and conduct a
+comprehensive evaluation of SOTA MOT methods, revealing their limitations under
+fog and fog-similar challenges.
+
+</p>
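The fog formation optical model referred to here is commonly written as I(x) = J(x)·t(x) + A·(1 − t(x)) with transmission t(x) = exp(−β·d(x)). The sketch below is a simplified, homogeneous-fog version using a monocular depth map; the paper's full pipeline (heterogeneous fog, dark-channel-based fog color) is richer, and the `beta`/`airlight` values are illustrative.

```python
# Simplified sketch of the atmospheric scattering model I = J*t + A*(1 - t),
# with per-pixel transmission t = exp(-beta * depth) from a monocular depth map.
import numpy as np

def add_fog(image: np.ndarray, depth: np.ndarray, beta: float = 0.08,
            airlight: float = 0.8) -> np.ndarray:
    """image: (H, W, 3) in [0, 1]; depth: (H, W) metric depth estimate."""
    t = np.exp(-beta * depth)[..., None]          # transmission: 1 near, -> 0 far
    fogged = image * t + airlight * (1.0 - t)     # blend scene radiance with airlight
    return np.clip(fogged, 0.0, 1.0)
```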
+
+
+
+
+ + ☆ SEVD: Synthetic Event-based Vision Dataset for Ego and Fixed Traffic + Perception + + +
+ Recently, event-based vision sensors have gained attention for autonomous +driving applications, as conventional RGB cameras face limitations in handling +challenging dynamic conditions. However, the availability of real-world and +synthetic event-based vision datasets remains limited. In response to this gap, +we present SEVD, a first-of-its-kind multi-view ego, and fixed perception +synthetic event-based dataset using multiple dynamic vision sensors within the +CARLA simulator. Data sequences are recorded across diverse lighting (noon, +nighttime, twilight) and weather conditions (clear, cloudy, wet, rainy, foggy) +with domain shifts (discrete and continuous). SEVD spans urban, suburban, +rural, and highway scenes featuring various classes of objects (car, truck, +van, bicycle, motorcycle, and pedestrian). Alongside event data, SEVD includes +RGB imagery, depth maps, optical flow, semantic, and instance segmentation, +facilitating a comprehensive understanding of the scene. Furthermore, we +evaluate the dataset using state-of-the-art event-based (RED, RVT) and +frame-based (YOLOv8) methods for traffic participant detection tasks and +provide baseline benchmarks for assessment. Additionally, we conduct +experiments to assess the synthetic event-based dataset's generalization +capabilities. The dataset is available at +https://eventbasedvision.github.io/SEVD + +
+
+
+
+
+ + ♻ ☆ LLaVA-PruMerge: Adaptive Token Reduction for Efficient Large Multimodal + Models + + +
+ Large Multimodal Models (LMMs) have shown significant reasoning capabilities +by connecting a visual encoder and a large language model. LMMs typically use a +fixed amount of visual tokens, such as the penultimate layer features in the +CLIP visual encoder, as the prefix content. Recent LMMs incorporate more +complex visual inputs, such as high-resolution images and videos, which +increase the number of visual tokens significantly. However, due to the design +of the Transformer architecture, computational costs associated with these +models tend to increase quadratically with the number of input tokens. To +tackle this problem, we explore a token reduction mechanism and find, similar +to prior work, that many visual tokens are spatially redundant. Based on this, +we propose PruMerge, a novel adaptive visual token reduction approach, which +largely reduces the number of visual tokens while maintaining comparable model +performance. We first select the unpruned visual tokens based on their +similarity to class tokens and spatial tokens. We then cluster the pruned +tokens based on key similarity and merge the clustered tokens with the unpruned +tokens to supplement their information. Empirically, when applied to LLaVA-1.5, +our approach can compress the visual tokens by 18 times on average, and achieve +comparable performance across diverse visual question-answering and reasoning +tasks. Code and checkpoints are at https://llava-prumerge.github.io/. + +
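To make the token-reduction idea concrete, here is a much-simplified sketch in the spirit of the description above: keep the visual tokens most attended by the class token and merge each pruned token into its most similar kept token. The paper's actual selection and clustering rules differ; shapes and the averaging scheme here are assumptions.

```python
# Much-simplified token reduction: keep the top-k tokens by [CLS] attention and
# merge each pruned token into its most similar kept token (illustrative only).
import torch
import torch.nn.functional as F

def reduce_tokens(tokens: torch.Tensor, cls_attn: torch.Tensor, keep: int) -> torch.Tensor:
    """tokens: (N, D) visual tokens; cls_attn: (N,) class-token attention to each."""
    keep_idx = cls_attn.topk(keep).indices
    prune_mask = torch.ones(tokens.shape[0], dtype=torch.bool)
    prune_mask[keep_idx] = False

    kept, pruned = tokens[keep_idx].clone(), tokens[prune_mask]
    if pruned.numel():
        # Assign each pruned token to its nearest kept token and average it in.
        sim = F.normalize(pruned, dim=-1) @ F.normalize(kept, dim=-1).T   # (N_pruned, keep)
        assign = sim.argmax(dim=-1)
        for j in range(keep):
            members = pruned[assign == j]
            if members.numel():
                kept[j] = (kept[j] + members.mean(dim=0)) / 2
    return kept   # (keep, D) reduced visual tokens
```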
+
+ comment: Project page: https://llava-prumerge.github.io/ +
+
+
+
+
+ + ♻ ☆ FloCoDe: Unbiased Dynamic Scene Graph Generation with Temporal + Consistency and Correlation Debiasing CVPR 2024 + + +
+ Dynamic scene graph generation (SGG) from videos requires not only a +comprehensive understanding of objects across scenes but also a method to +capture the temporal motions and interactions with different objects. Moreover, +the long-tailed distribution of visual relationships is a crucial bottleneck +for most dynamic SGG methods. This is because many of them focus on capturing +spatio-temporal context using complex architectures, leading to the generation +of biased scene graphs. To address these challenges, we propose FloCoDe: +Flow-aware Temporal Consistency and Correlation Debiasing with uncertainty +attenuation for unbiased dynamic scene graphs. FloCoDe employs feature warping +using flow to detect temporally consistent objects across frames. To address +the long-tail issue of visual relationships, we propose correlation debiasing +and a label correlation-based loss to learn unbiased relation representations +for long-tailed classes. Specifically, we propose to incorporate label +correlations using contrastive loss to capture commonly co-occurring relations, +which aids in learning robust representations for long-tailed classes. Further, +we adopt the uncertainty attenuation-based classifier framework to handle noisy +annotations in the SGG data. Extensive experimental evaluation shows a +performance gain as high as 4.1%, demonstrating the superiority of generating +more unbiased scene graphs. + +
+
+ comment: Accepted at CVPR 2024 SG2RL, 11 pages, 5 tables, 4 figures +
+
+
+
+
+ + ♻ ☆ PromptSync: Bridging Domain Gaps in Vision-Language Models through + Class-Aware Prototype Alignment and Discrimination CVPR 2024 + + +
+ The potential for zero-shot generalization in vision-language (V-L) models +such as CLIP has spurred their widespread adoption in addressing numerous +downstream tasks. Previous methods have employed test-time prompt tuning to +adapt the model to unseen domains, but they overlooked the issue of imbalanced +class distributions. In this study, we explicitly address this problem by +employing class-aware prototype alignment weighted by mean class probabilities +obtained for the test sample and filtered augmented views. Additionally, we +ensure that the class probabilities are as accurate as possible by performing +prototype discrimination using contrastive learning. The combination of +alignment and discriminative loss serves as a geometric regularizer, preventing +the prompt representation from collapsing onto a single class and effectively +bridging the distribution gap between the source and test domains. Our method, +named PromptSync, synchronizes the prompts for each test sample on both the +text and vision branches of the V-L model. In empirical evaluations on the +domain generalization benchmark, our method outperforms previous best methods +by 2.33% in overall performance, by 1% in base-to-novel generalization, and by +2.84% in cross-dataset transfer tasks. + +
+
+ comment: Accepted at CVPR 2024 LIMIT, 12 pages, 8 Tables, 2 Figures +
+
+
+
+
+ + ♻ ☆ WonderJourney: Going from Anywhere to Everywhere + + +
+ We introduce WonderJourney, a modularized framework for perpetual 3D scene +generation. Unlike prior work on view generation that focuses on a single type +of scenes, we start at any user-provided location (by a text description or an +image) and generate a journey through a long sequence of diverse yet coherently +connected 3D scenes. We leverage an LLM to generate textual descriptions of the +scenes in this journey, a text-driven point cloud generation pipeline to make a +compelling and coherent sequence of 3D scenes, and a large VLM to verify the +generated scenes. We show compelling, diverse visual results across various +scene types and styles, forming imaginary "wonderjourneys". Project website: +https://kovenyu.com/WonderJourney/ + +
+
+ comment: Project website with video results: + https://kovenyu.com/WonderJourney/ +
+
+
+
+
+ + ♻ ☆ ProbMCL: Simple Probabilistic Contrastive Learning for Multi-label + Visual Classification ICASSP 2024 + + +
+ Multi-label image classification presents a challenging task in many domains, +including computer vision and medical imaging. Recent advancements have +introduced graph-based and transformer-based methods to improve performance and +capture label dependencies. However, these methods often include complex +modules that entail heavy computation and lack interpretability. In this paper, +we propose Probabilistic Multi-label Contrastive Learning (ProbMCL), a novel +framework to address these challenges in multi-label image classification +tasks. Our simple yet effective approach employs supervised contrastive +learning, in which samples that share enough labels with an anchor image based +on a decision threshold are introduced as a positive set. This structure +captures label dependencies by pulling positive pair embeddings together and +pushing away negative samples that fall below the threshold. We enhance +representation learning by incorporating a mixture density network into +contrastive learning and generating Gaussian mixture distributions to explore +the epistemic uncertainty of the feature encoder. We validate the effectiveness +of our framework through experimentation with datasets from the computer vision +and medical imaging domains. Our method outperforms the existing +state-of-the-art methods while achieving a low computational footprint on both +datasets. Visualization analyses also demonstrate that ProbMCL-learned +classifiers maintain a meaningful semantic topology. + +
+
+ comment: This paper has been accepted for the ICASSP 2024 - 2024 IEEE + International Conference on Acoustics, Speech and Signal Processing (ICASSP) +
+
+
+
+
+ + ♻ ☆ A Change Detection Reality Check + + +
+ In recent years, there has been an explosion of deep learning architectures
+proposed for change detection in the remote sensing literature. These
+approaches claim to offer state-of-the-art performance on different standard
+benchmark datasets. However, has the field truly made significant progress? In
+this paper we perform experiments which show that a simple U-Net segmentation
+baseline, without training tricks or complicated architectural changes, is
+still a top performer for the task of change detection.
+
+</p>
+
+
+
+
+ + ♻ ☆ Generalization in diffusion models arises from geometry-adaptive + harmonic representations ICLR + + +
+ Deep neural networks (DNNs) trained for image denoising are able to generate +high-quality samples with score-based reverse diffusion algorithms. These +impressive capabilities seem to imply an escape from the curse of +dimensionality, but recent reports of memorization of the training set raise +the question of whether these networks are learning the "true" continuous +density of the data. Here, we show that two DNNs trained on non-overlapping +subsets of a dataset learn nearly the same score function, and thus the same +density, when the number of training images is large enough. In this regime of +strong generalization, diffusion-generated images are distinct from the +training set, and are of high visual quality, suggesting that the inductive +biases of the DNNs are well-aligned with the data density. We analyze the +learned denoising functions and show that the inductive biases give rise to a +shrinkage operation in a basis adapted to the underlying image. Examination of +these bases reveals oscillating harmonic structures along contours and in +homogeneous regions. We demonstrate that trained denoisers are inductively +biased towards these geometry-adaptive harmonic bases since they arise not only +when the network is trained on photographic images, but also when it is trained +on image classes supported on low-dimensional manifolds for which the harmonic +basis is suboptimal. Finally, we show that when trained on regular image +classes for which the optimal basis is known to be geometry-adaptive and +harmonic, the denoising performance of the networks is near-optimal. + +
+
+ comment: Accepted for oral presentation at ICLR, Vienna, May 2024 +
+
+
+
+
+ + ♻ ☆ A novel Fourier neural operator framework for classification of + multi-sized images: Application to three dimensional digital porous media + + +
+ Fourier neural operators (FNOs) are invariant with respect to the size of +input images, and thus images with any size can be fed into FNO-based +frameworks without any modification of network architectures, in contrast to +traditional convolutional neural networks (CNNs). Leveraging the advantage of +FNOs, we propose a novel deep-learning framework for classifying images with +varying sizes. Particularly, we simultaneously train the proposed network on +multi-sized images. As a practical application, we consider the problem of +predicting the label (e.g., permeability) of three-dimensional digital porous +media. To construct the framework, an intuitive approach is to connect FNO +layers to a classifier using adaptive max pooling. First, we show that this +approach is only effective for porous media with fixed sizes, whereas it fails +for porous media of varying sizes. To overcome this limitation, we introduce +our approach: instead of using adaptive max pooling, we use static max pooling +with the size of channel width of FNO layers. Since the channel width of the +FNO layers is independent of input image size, the introduced framework can +handle multi-sized images during training. We show the effectiveness of the +introduced framework and compare its performance with the intuitive approach +through the example of the classification of three-dimensional digital porous +media of varying sizes. + +
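One way to realize a size-independent classifier head of the kind described above is to max-pool each channel over all spatial locations, so the feature vector length equals the channel width of the last FNO layer regardless of input size. The sketch below is a hedged reading of that design, not necessarily the paper's exact construction.

```python
# Size-independent classification head: max-pool each channel over all spatial
# positions so the classifier input length equals the FNO channel width
# (one possible reading of the approach described above).
import torch
import torch.nn as nn

class ChannelPoolClassifier(nn.Module):
    def __init__(self, channels: int, num_classes: int):
        super().__init__()
        self.fc = nn.Linear(channels, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, channels, *spatial) feature maps from the FNO layers;
        # the spatial dimensions may vary between inputs.
        pooled = x.amax(dim=tuple(range(2, x.dim())))   # (batch, channels)
        return self.fc(pooled)
```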
+
+
+
+
+ + ♻ ☆ View-Consistent 3D Editing with Gaussian Splatting + + +
+ The advent of 3D Gaussian Splatting (3DGS) has revolutionized 3D editing, +offering efficient, high-fidelity rendering and enabling precise local +manipulations. Currently, diffusion-based 2D editing models are harnessed to +modify multi-view rendered images, which then guide the editing of 3DGS models. +However, this approach faces a critical issue of multi-view inconsistency, +where the guidance images exhibit significant discrepancies across views, +leading to mode collapse and visual artifacts of 3DGS. To this end, we +introduce View-consistent Editing (VcEdit), a novel framework that seamlessly +incorporates 3DGS into image editing processes, ensuring multi-view consistency +in edited guidance images and effectively mitigating mode collapse issues. +VcEdit employs two innovative consistency modules: the Cross-attention +Consistency Module and the Editing Consistency Module, both designed to reduce +inconsistencies in edited images. By incorporating these consistency modules +into an iterative pattern, VcEdit proficiently resolves the issue of multi-view +inconsistency, facilitating high-quality 3DGS editing across a diverse range of +scenes. + +
+
+ comment: 25 pages +
+
+
+
+
+ + ♻ ☆ SpikeNVS: Enhancing Novel View Synthesis from Blurry Images via Spike + Camera + + +
+ One of the most critical factors in achieving sharp Novel View Synthesis
+(NVS) using neural field methods like Neural Radiance Fields (NeRF) and 3D
+Gaussian Splatting (3DGS) is the quality of the training images. However,
+conventional RGB cameras are susceptible to motion blur. In contrast,
+neuromorphic cameras like event and spike cameras inherently capture more
+comprehensive temporal information, which can provide a sharp representation of
+the scene as additional training data. Recent methods have explored the
+integration of event cameras to improve the quality of NVS. The event-RGB
+approaches have some limitations, such as high training costs and the inability
+to work effectively in the background. Instead, our study introduces a new
+method that uses the spike camera to overcome these limitations. By considering
+texture reconstruction from spike streams as ground truth, we design the
+Texture from Spike (TfS) loss. Since the spike camera relies on temporal
+integration instead of the temporal differentiation used by event cameras, our
+proposed TfS loss maintains manageable training costs. It handles foreground
+objects and backgrounds simultaneously. We also provide a real-world dataset
+captured with our spike-RGB camera system to facilitate future research
+endeavors. We conduct extensive experiments using synthetic and real-world
+datasets to demonstrate that our design can enhance novel view synthesis across
+NeRF and 3DGS. The code and dataset will be made available for public access.
+
+</p>
+
+
+
+
+ + ♻ ☆ Identifying Important Group of Pixels using Interactions CVPR 2024 + + +
+ To better understand the behavior of image classifiers, it is useful to +visualize the contribution of individual pixels to the model prediction. In +this study, we propose a method, MoXI ($\textbf{Mo}$del e$\textbf{X}$planation +by $\textbf{I}$nteractions), that efficiently and accurately identifies a group +of pixels with high prediction confidence. The proposed method employs +game-theoretic concepts, Shapley values and interactions, taking into account +the effects of individual pixels and the cooperative influence of pixels on +model confidence. Theoretical analysis and experiments demonstrate that our +method better identifies the pixels that are highly contributing to the model +outputs than widely-used visualization by Grad-CAM, Attention rollout, and +Shapley value. While prior studies have suffered from the exponential +computational cost in the computation of Shapley value and interactions, we +show that this can be reduced to quadratic cost for our task. The code is +available at https://github.com/KosukeSumiyasu/MoXI. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ FoodLMM: A Versatile Food Assistant using Large Multi-modal Model + + +
+ Large Multi-modal Models (LMMs) have made impressive progress in many +vision-language tasks. Nevertheless, the performance of general LMMs in +specific domains is still far from satisfactory. This paper proposes FoodLMM, a +versatile food assistant based on LMMs with various capabilities, including +food recognition, ingredient recognition, recipe generation, nutrition +estimation, food segmentation and multi-round conversation. To facilitate +FoodLMM to deal with tasks beyond pure text output, we introduce a series of +novel task-specific tokens and heads, enabling the model to predict food +nutritional values and multiple segmentation masks. We adopt a two-stage +training strategy. In the first stage, we utilize multiple public food +benchmarks for multi-task learning by leveraging the instruct-following +paradigm. In the second stage, we construct a multi-round conversation dataset +and a reasoning segmentation dataset to fine-tune the model, enabling it to +conduct professional dialogues and generate segmentation masks based on complex +reasoning in the food domain. Our fine-tuned FoodLMM achieves state-of-the-art +results across several food benchmarks. We will make our code, models and +datasets publicly available. + +
+
+
+
+
+ + ♻ ☆ Transformer based Pluralistic Image Completion with Reduced Information + Loss + + +
+ Transformer based methods have achieved great success in image inpainting +recently. However, we find that these solutions regard each pixel as a token, +thus suffering from an information loss issue from two aspects: 1) They +downsample the input image into much lower resolutions for efficiency +consideration. 2) They quantize $256^3$ RGB values to a small number (such as +512) of quantized color values. The indices of quantized pixels are used as +tokens for the inputs and prediction targets of the transformer. To mitigate +these issues, we propose a new transformer based framework called "PUT". +Specifically, to avoid input downsampling while maintaining computation +efficiency, we design a patch-based auto-encoder P-VQVAE. The encoder converts +the masked image into non-overlapped patch tokens and the decoder recovers the +masked regions from the inpainted tokens while keeping the unmasked regions +unchanged. To eliminate the information loss caused by input quantization, an +Un-quantized Transformer is applied. It directly takes features from the +P-VQVAE encoder as input without any quantization and only regards the +quantized tokens as prediction targets. Furthermore, to make the inpainting +process more controllable, we introduce semantic and structural conditions as +extra guidance. Extensive experiments show that our method greatly outperforms +existing transformer based methods on image fidelity and achieves much higher +diversity and better fidelity than state-of-the-art pluralistic inpainting +methods on complex large-scale datasets (e.g., ImageNet). Codes are available +at https://github.com/liuqk3/PUT. + +
+
+ comment: Accepted by TPAMI (2024). arXiv admin note: text overlap with + arXiv:2205.05076 +
+
+
+
+
+ + ♻ ☆ WildFusion: Learning 3D-Aware Latent Diffusion Models in View Space + + +
+ Modern learning-based approaches to 3D-aware image synthesis achieve high +photorealism and 3D-consistent viewpoint changes for the generated images. +Existing approaches represent instances in a shared canonical space. However, +for in-the-wild datasets a shared canonical system can be difficult to define +or might not even exist. In this work, we instead model instances in view +space, alleviating the need for posed images and learned camera distributions. +We find that in this setting, existing GAN-based methods are prone to +generating flat geometry and struggle with distribution coverage. We hence +propose WildFusion, a new approach to 3D-aware image synthesis based on latent +diffusion models (LDMs). We first train an autoencoder that infers a compressed +latent representation, which additionally captures the images' underlying 3D +structure and enables not only reconstruction but also novel view synthesis. To +learn a faithful 3D representation, we leverage cues from monocular depth +prediction. Then, we train a diffusion model in the 3D-aware latent space, +thereby enabling synthesis of high-quality 3D-consistent image samples, +outperforming recent state-of-the-art GAN-based methods. Importantly, our +3D-aware LDM is trained without any direct supervision from multiview images or +3D geometry and does not require posed images or learned pose or camera +distributions. It directly learns a 3D representation without relying on +canonical camera coordinates. This opens up promising research avenues for +scalable 3D-aware image synthesis and 3D content creation from in-the-wild +image data. See https://katjaschwarz.github.io/wildfusion for videos of our 3D +results. + +
+
+
+
+
+ + ♻ ☆ Toward Reliable Human Pose Forecasting with Uncertainty + + +
+ Recently, there has been an arms race of pose forecasting methods aimed at +solving the spatio-temporal task of predicting a sequence of future 3D poses of +a person given a sequence of past observed ones. However, the lack of unified +benchmarks and limited uncertainty analysis have hindered progress in the +field. To address this, we first develop an open-source library for human pose +forecasting, including multiple models, supporting several datasets, and +employing standardized evaluation metrics, with the aim of promoting research +and moving toward a unified and consistent evaluation. Second, we devise two +types of uncertainty in the problem to increase performance and convey better +trust: 1) we propose a method for modeling aleatoric uncertainty by using +uncertainty priors to inject knowledge about the pattern of uncertainty. This +focuses the capacity of the model in the direction of more meaningful +supervision while reducing the number of learned parameters and improving +stability; 2) we introduce a novel approach for quantifying the epistemic +uncertainty of any model through clustering and measuring the entropy of its +assignments. Our experiments demonstrate up to $25\%$ improvements in +forecasting at short horizons, with no loss on longer horizons on Human3.6M, +AMSS, and 3DPW datasets, and better performance in uncertainty estimation. The +code is available online at https://github.com/vita-epfl/UnPOSed. + +
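The clustering-based epistemic uncertainty idea can be sketched compactly: cluster a set of sampled forecasts and take the entropy of the cluster-assignment frequencies as the uncertainty score. The number of clusters and the flattening of poses below are illustrative assumptions, not the paper's exact settings.

```python
# Sketch of clustering-based epistemic uncertainty: entropy of the cluster
# assignments over a set of sampled forecasts (hyperparameters illustrative).
import numpy as np
from sklearn.cluster import KMeans

def epistemic_uncertainty(forecasts: np.ndarray, n_clusters: int = 5) -> float:
    """forecasts: (num_samples, horizon * num_joints * 3) flattened predicted poses."""
    labels = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(forecasts)
    freq = np.bincount(labels, minlength=n_clusters) / len(labels)
    freq = freq[freq > 0]
    return float(-(freq * np.log(freq)).sum())   # higher entropy = higher uncertainty
```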
+
+ comment: Published in RA-L 2024 +
+
+
+
+
+ + ♻ ☆ Efficient Masked Face Recognition Method during the COVID-19 Pandemic + + +
+ The coronavirus disease (COVID-19) is an unparalleled crisis leading to a
+huge number of casualties and security problems. In order to reduce the spread
+of coronavirus, people often wear masks to protect themselves. This makes face
+recognition a very difficult task since certain parts of the face are hidden. A
+primary focus of researchers during the ongoing coronavirus pandemic is to come
+up with suggestions to handle this problem through rapid and efficient
+solutions. In this paper, we propose a reliable method based on occlusion
+removal and deep learning-based features in order to address the problem of the
+masked face recognition process. The first step is to remove the masked face
+region. Next, we apply three pre-trained deep Convolutional Neural Networks
+(CNNs), namely VGG-16, AlexNet, and ResNet-50, and use them to extract deep
+features from the obtained regions (mostly eyes and forehead regions). The
+Bag-of-features paradigm is then applied to the feature maps of the last
+convolutional layer in order to quantize them and obtain a more compact
+representation compared to the fully connected layer of a classical CNN.
+Finally, a Multilayer Perceptron (MLP) is applied for the classification
+process. Experimental results on the Real-World-Masked-Face-Dataset show high
+recognition performance compared to other state-of-the-art methods.
+
+</p>
+
+
+
+
+ + ♻ ☆ Impacts of Color and Texture Distortions on Earth Observation Data in + Deep Learning + + +
+ Land cover classification and change detection are two important applications +of remote sensing and Earth observation (EO) that have benefited greatly from +the advances of deep learning. Convolutional and transformer-based U-net models +are the state-of-the-art architectures for these tasks, and their performances +have been boosted by an increased availability of large-scale annotated EO +datasets. However, the influence of different visual characteristics of the +input EO data on a model's predictions is not well understood. In this work we +systematically examine model sensitivities with respect to several color- and +texture-based distortions on the input EO data during inference, given models +that have been trained without such distortions. We conduct experiments with +multiple state-of-the-art segmentation networks for land cover classification +and show that they are in general more sensitive to texture than to color +distortions. Beyond revealing intriguing characteristics of widely used land +cover classification models, our results can also be used to guide the +development of more robust models within the EO domain. + +
+
+
+
+
+ + ♻ ☆ Vision Transformers Need Registers + + +
+ Transformers have recently emerged as a powerful tool for learning visual +representations. In this paper, we identify and characterize artifacts in +feature maps of both supervised and self-supervised ViT networks. The artifacts +correspond to high-norm tokens appearing during inference primarily in +low-informative background areas of images, that are repurposed for internal +computations. We propose a simple yet effective solution based on providing +additional tokens to the input sequence of the Vision Transformer to fill that +role. We show that this solution fixes that problem entirely for both +supervised and self-supervised models, sets a new state of the art for +self-supervised visual models on dense visual prediction tasks, enables object +discovery methods with larger models, and most importantly leads to smoother +feature maps and attention maps for downstream visual processing. + +
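The proposed fix is architecturally tiny: append a few extra learnable tokens to the ViT input sequence and discard them at the output. A minimal sketch of how such register tokens could be wired in follows; the initialization and default count of four are assumptions for illustration.

```python
# Minimal sketch of register tokens: extra learnable tokens appended to the ViT
# input sequence and simply discarded at the output.
import torch
import torch.nn as nn

class WithRegisters(nn.Module):
    def __init__(self, dim: int, num_registers: int = 4):
        super().__init__()
        self.registers = nn.Parameter(torch.zeros(1, num_registers, dim))
        nn.init.trunc_normal_(self.registers, std=0.02)

    def add(self, tokens: torch.Tensor) -> torch.Tensor:
        # tokens: (batch, 1 + num_patches, dim) = [CLS] + patch tokens
        return torch.cat([tokens, self.registers.expand(tokens.shape[0], -1, -1)], dim=1)

    def strip(self, tokens: torch.Tensor) -> torch.Tensor:
        # Drop the register tokens before any downstream use of the features.
        return tokens[:, : -self.registers.shape[1]]
```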
+
+
+
+
+ + ♻ ☆ Safe-CLIP: Removing NSFW Concepts from Vision-and-Language Models + + +
+ Large-scale vision-and-language models, such as CLIP, are typically trained +on web-scale data, which can introduce inappropriate content and lead to the +development of unsafe and biased behavior. This, in turn, hampers their +applicability in sensitive and trustworthy contexts and could raise significant +concerns in their adoption. Our research introduces a novel approach to +enhancing the safety of vision-and-language models by diminishing their +sensitivity to NSFW (not safe for work) inputs. In particular, our methodology +seeks to sever "toxic" linguistic and visual concepts, unlearning the linkage +between unsafe linguistic or visual items and unsafe regions of the embedding +space. We show how this can be done by fine-tuning a CLIP model on synthetic +data obtained from a large language model trained to convert between safe and +unsafe sentences, and a text-to-image generator. We conduct extensive +experiments on the resulting embedding space for cross-modal retrieval, +text-to-image, and image-to-text generation, where we show that our model can +be remarkably employed with pre-trained generative models. Our source code and +trained models are available at: https://github.com/aimagelab/safe-clip. + +
+
+
+
+
+ + ♻ ☆ Lightweight Deep Learning for Resource-Constrained Environments: A + Survey + + +
+ Over the past decade, the dominance of deep learning has prevailed across +various domains of artificial intelligence, including natural language +processing, computer vision, and biomedical signal processing. While there have +been remarkable improvements in model accuracy, deploying these models on +lightweight devices, such as mobile phones and microcontrollers, is constrained +by limited resources. In this survey, we provide comprehensive design guidance +tailored for these devices, detailing the meticulous design of lightweight +models, compression methods, and hardware acceleration strategies. The +principal goal of this work is to explore methods and concepts for getting +around hardware constraints without compromising the model's accuracy. +Additionally, we explore two notable paths for lightweight deep learning in the +future: deployment techniques for TinyML and Large Language Models. Although +these paths undoubtedly have potential, they also present significant +challenges, encouraging research into unexplored areas. + +
+
+ comment: 40 pages +
+
+
+
+
+ + ♻ ☆ Which Transformer to Favor: A Comparative Analysis of Efficiency in + Vision Transformers + + +
+ Transformers come with a high computational cost, yet their effectiveness in +addressing problems in language and vision has sparked extensive research aimed +at enhancing their efficiency. However, diverse experimental conditions, +spanning multiple input domains, prevent a fair comparison based solely on +reported results, posing challenges for model selection. To address this gap in +comparability, we design a comprehensive benchmark of more than 30 models for +image classification, evaluating key efficiency aspects, including accuracy, +speed, and memory usage. This benchmark provides a standardized baseline across +the landscape of efficiency-oriented transformers and our framework of +analysis, based on Pareto optimality, reveals surprising insights. Despite +claims of other models being more efficient, ViT remains Pareto optimal across +multiple metrics. We observe that hybrid attention-CNN models exhibit +remarkable inference memory- and parameter-efficiency. Moreover, our benchmark +shows that using a larger model in general is more efficient than using higher +resolution images. Thanks to our holistic evaluation, we provide a centralized +resource for practitioners and researchers, facilitating informed decisions +when selecting transformers or measuring progress of the development of +efficient transformers. + +
+
+
+
+
+ + ♻ ☆ NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous + Driving + + +
+ We present a versatile NeRF-based simulator for testing autonomous driving +(AD) software systems, designed with a focus on sensor-realistic closed-loop +evaluation and the creation of safety-critical scenarios. The simulator learns +from sequences of real-world driving sensor data and enables reconfigurations +and renderings of new, unseen scenarios. In this work, we use our simulator to +test the responses of AD models to safety-critical scenarios inspired by the +European New Car Assessment Programme (Euro NCAP). Our evaluation reveals that, +while state-of-the-art end-to-end planners excel in nominal driving scenarios +in an open-loop setting, they exhibit critical flaws when navigating our +safety-critical scenarios in a closed-loop setting. This highlights the need +for advancements in the safety and real-world usability of end-to-end planners. +By publicly releasing our simulator and scenarios as an easy-to-run evaluation +suite, we invite the research community to explore, refine, and validate their +AD models in controlled, yet highly configurable and challenging +sensor-realistic environments. Code and instructions can be found at +https://github.com/wljungbergh/NeuroNCAP + +
+
+
+
+
+ + ♻ ☆ ZONE: Zero-Shot Instruction-Guided Local Editing CVPR 2024 + + +
+ Recent advances in vision-language models like Stable Diffusion have shown
+remarkable power in creative image synthesis and editing. However, most
+existing text-to-image editing methods encounter two obstacles: First, the text
+prompt needs to be carefully crafted to achieve good results, which is not
+intuitive or user-friendly. Second, they are insensitive to local edits and can
+irreversibly affect non-edited regions, leaving obvious editing traces. To
+tackle these problems, we propose a Zero-shot instructiON-guided local image
+Editing approach, termed ZONE. We first convert the editing intent from the
+user-provided instruction (e.g., "make his tie blue") into specific image
+editing regions through InstructPix2Pix. We then propose a Region-IoU scheme
+for precise image layer extraction from an off-the-shelf segmentation model. We
+further develop an edge smoother based on FFT for seamless blending between the
+layer and the image. Our method allows for arbitrary manipulation of a specific
+region with a single instruction while preserving the rest. Extensive
+experiments demonstrate that our ZONE achieves remarkable local editing results
+and user-friendliness, outperforming state-of-the-art methods. Code is
+available at https://github.com/lsl001006/ZONE.
+
+</p>
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Leveraging Foundation Models for Content-Based Medical Image Retrieval + in Radiology + + +
+ Content-based image retrieval (CBIR) has the potential to significantly +improve diagnostic aid and medical research in radiology. Current CBIR systems +face limitations due to their specialization to certain pathologies, limiting +their utility. In response, we propose using vision foundation models as +powerful and versatile off-the-shelf feature extractors for content-based +medical image retrieval. By benchmarking these models on a comprehensive +dataset of 1.6 million 2D radiological images spanning four modalities and 161 +pathologies, we identify weakly-supervised models as superior, achieving a P@1 +of up to 0.594. This performance not only competes with a specialized model but +does so without the need for fine-tuning. Our analysis further explores the +challenges in retrieving pathological versus anatomical structures, indicating +that accurate retrieval of pathological features presents greater difficulty. +Despite these challenges, our research underscores the vast potential of +foundation models for CBIR in radiology, proposing a shift towards versatile, +general-purpose medical image retrieval systems that do not require specific +tuning. + +
+
+
+
+
+ + ♻ ☆ DUFOMap: Efficient Dynamic Awareness Mapping + + +
+ The dynamic nature of the real world is one of the main challenges in +robotics. The first step in dealing with it is to detect which parts of the +world are dynamic. A typical benchmark task is to create a map that contains +only the static part of the world to support, for example, localization and +planning. Current solutions are often applied in post-processing, where +parameter tuning allows the user to adjust the setting for a specific dataset. +In this paper, we propose DUFOMap, a novel dynamic awareness mapping framework +designed for efficient online processing. Despite having the same parameter +settings for all scenarios, it performs better or is on par with +state-of-the-art methods. Ray casting is utilized to identify and classify +fully observed empty regions. Since these regions have been observed empty, it +follows that anything inside them at another time must be dynamic. Evaluation +is carried out in various scenarios, including outdoor environments in KITTI +and Argoverse 2, open areas on the KTH campus, and with different sensor types. +DUFOMap outperforms the state of the art in terms of accuracy and computational +efficiency. The source code, benchmarks, and links to the datasets utilized are +provided. See https://kth-rpl.github.io/dufomap for more details. + +
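+ The core "observed empty" idea can be sketched with a toy voxel grid: voxels
+traversed by a ray before its endpoint are marked as seen-empty, and any point
+that later falls inside such a voxel is labelled dynamic. This is a simplified
+illustration, not the actual DUFOMap implementation.
+
+# Toy illustration: voxels crossed by a ray before its endpoint are free
+# space; any later point inside such a voxel must belong to a dynamic object.
+import numpy as np
+
+VOXEL = 0.5
+def voxel_key(p):
+    return tuple(np.floor(np.asarray(p) / VOXEL).astype(int))
+
+def mark_empty(sensor, endpoint, empty, n_samples=100):
+    # Sample along the ray, excluding the endpoint voxel itself.
+    for t in np.linspace(0.0, 0.95, n_samples):
+        empty.add(voxel_key(sensor + t * (endpoint - sensor)))
+
+empty_voxels = set()
+sensor = np.array([0.0, 0.0, 0.0])
+scan_t0 = [np.array([10.0, 0.0, 0.0]), np.array([0.0, 8.0, 0.0])]
+for hit in scan_t0:
+    mark_empty(sensor, hit, empty_voxels)
+
+# A later scan: a point sitting where free space was observed is dynamic.
+scan_t1 = [np.array([5.0, 0.0, 0.0]), np.array([20.0, 20.0, 0.0])]
+for p in scan_t1:
+    label = "dynamic" if voxel_key(p) in empty_voxels else "static/unknown"
+    print(p, label)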
+
+ comment: The first two authors contributed equally. 8 pages, 7 figures,
+ project page https://kth-rpl.github.io/dufomap
+
+
+
+
+
+ + ♻ ☆ A Systematic Survey of Deep Learning-based Single-Image Super-Resolution + + +
+ Single-image super-resolution (SISR) is an important task in image
+processing, which aims to enhance the resolution of imaging systems. Recently,
+SISR has made a huge leap and has achieved promising results with the help of
+deep learning (DL). In this survey, we give an overview of DL-based SISR
+methods and group them according to their design targets. Specifically, we
+first introduce the problem definition, research background, and the
+significance of SISR. Secondly, we introduce some related works, including
+benchmark datasets, upsampling methods, optimization objectives, and image
+quality assessment methods. Thirdly, we provide a detailed investigation of
+SISR and give some domain-specific applications of it. Fourthly, we present
+the reconstruction results of some classic SISR methods to give an intuitive
+sense of their performance. Finally, we discuss some issues that still exist
+in SISR and summarize some new trends and future directions. This is an
+exhaustive survey of SISR, which can help researchers better understand SISR
+and inspire more exciting research in this field. An investigation project for
+SISR is provided at https://github.com/CV-JunchengLi/SISR-Survey.
+
+
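+ Among the image quality assessment methods covered by such surveys, PSNR is
+the most common full-reference metric; a minimal implementation is sketched
+below.
+
+# PSNR, a standard metric for comparing SISR reconstructions against
+# ground-truth high-resolution images.
+import numpy as np
+
+def psnr(reference, estimate, max_val=1.0):
+    mse = np.mean((reference.astype(float) - estimate.astype(float)) ** 2)
+    if mse == 0:
+        return float("inf")
+    return 10.0 * np.log10(max_val ** 2 / mse)
+
+hr = np.random.rand(128, 128, 3)                              # ground truth
+sr = np.clip(hr + 0.01 * np.random.randn(*hr.shape), 0, 1)    # reconstruction
+print(f"PSNR: {psnr(hr, sr):.2f} dB")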
+
+ comment: 40 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ How is Visual Attention Influenced by Text Guidance? Database and Model + + +
+ The analysis and prediction of visual attention have long been crucial tasks
+in the fields of computer vision and image processing. In practical
+applications, images are generally accompanied by various text descriptions;
+however, few studies have explored the influence of text descriptions on
+visual attention, let alone developed visual saliency prediction models
+considering text guidance. In this paper, we conduct a comprehensive study on
+text-guided image saliency (TIS) from both subjective and objective
+perspectives. Specifically, we construct a TIS database named SJTU-TIS, which
+includes 1200 text-image pairs and the corresponding collected eye-tracking
+data. Based on the established SJTU-TIS database, we analyze the influence of
+various text descriptions on visual attention. Then, to facilitate the
+development of saliency prediction models considering text influence, we
+construct a benchmark for the established SJTU-TIS database using
+state-of-the-art saliency models. Finally, considering the effect of text
+descriptions on visual attention, which most existing saliency models ignore,
+we further propose a text-guided saliency (TGSal) prediction model, which
+extracts and integrates both image features and text features to predict the
+image saliency under various text-description conditions. Our proposed model
+significantly outperforms the state-of-the-art saliency models on both the
+SJTU-TIS database and the pure image saliency databases in terms of various
+evaluation metrics. The SJTU-TIS database and the code of the proposed TGSal
+model will be released at: https://github.com/IntMeGroup/TGSal.
+
+
+
+
+
+
+ + ♻ ☆ CoBra: Complementary Branch Fusing Class and Semantic Knowledge for + Robust Weakly Supervised Semantic Segmentation + + +
+ Leveraging semantically precise pseudo masks derived from image-level class
+knowledge for segmentation, namely image-level Weakly Supervised Semantic
+Segmentation (WSSS), remains challenging. While Class Activation Maps (CAMs)
+using CNNs have steadily been contributing to the success of WSSS, the
+resulting activation maps often narrowly focus on class-specific parts (e.g.,
+only the face of a person). On the other hand, recent works based on vision
+transformers (ViT) have shown promising results based on their self-attention
+mechanism to capture the semantic parts but fail to capture complete
+class-specific details (e.g., the entire human body, but together with a
+nearby dog). In this work, we propose Complementary Branch (CoBra), a novel
+dual branch framework consisting of two distinct architectures which provide
+valuable complementary knowledge of class (from the CNN) and semantics (from
+the ViT) to each branch. In particular, we learn Class-Aware Projection (CAP)
+for the CNN branch and Semantic-Aware Projection (SAP) for the ViT branch to
+explicitly fuse their complementary knowledge and facilitate a new type of
+extra patch-level supervision. Our model, through CoBra, fuses the CNN's and
+ViT's complementary outputs to create robust pseudo masks that integrate both
+class and semantic information effectively. Extensive experiments
+qualitatively and quantitatively investigate how the CNN and ViT complement
+each other on the PASCAL VOC 2012 dataset, showing a state-of-the-art WSSS
+result. This includes not only the masks generated by our model, but also the
+segmentation results derived from utilizing these masks as pseudo labels.
+
+
+
+
+
+
+ + ♻ ☆ Rapid post-disaster infrastructure damage characterisation enabled by + remote sensing and deep learning technologies -- a tiered approach + + +
+ Critical infrastructure, such as transport networks and bridges, is
+systematically targeted during wars and suffers damage during extensive
+natural disasters because it is vital for enabling connectivity and the
+transportation of people and goods, and hence underpins national and
+international economic growth. Mass destruction of transport assets, in
+conjunction with minimal or no accessibility in the wake of natural and
+anthropogenic disasters, prevents us from delivering rapid recovery and
+adaptation. As a result, systemic operability is drastically reduced, leading
+to low levels of resilience. Thus, there is a need for rapid assessment of its
+condition to allow for informed decision-making for restoration
+prioritisation. A solution to this challenge is to use technology that enables
+stand-off observations. Nevertheless, no methods exist for automated
+characterisation of damage at multiple scales, i.e. regional (e.g., network),
+asset (e.g., bridges), and structural (e.g., road pavement) scales. We propose
+a methodology based on an integrated, multi-scale tiered approach to fill this
+capability gap. In doing so, we demonstrate how automated damage
+characterisation can be enabled by fit-for-purpose digital technologies. Next,
+the methodology is applied to and validated on a case study in Ukraine that
+includes 17 bridges damaged by human-targeted interventions. From regional to
+component scale, we deploy technology to integrate assessments using
+Sentinel-1 SAR images, crowdsourced information, and high-resolution images
+for deep learning to facilitate automatic damage detection and
+characterisation. For the first time, the interferometric coherence difference
+and semantic segmentation of images were deployed in a tiered multi-scale
+approach to improve the reliability of damage characterisations at different
+scales.
+
+
+
+ comment: 43 pages; 20 figures +
+
+
+
+
+ + ♻ ☆ Perceptual Assessment and Optimization of High Dynamic Range Image + Rendering + + +
+ High dynamic range (HDR) rendering has the ability to faithfully reproduce +the wide luminance ranges in natural scenes, but how to accurately assess the +rendering quality is relatively underexplored. Existing quality models are +mostly designed for low dynamic range (LDR) images, and do not align well with +human perception of HDR image quality. To fill this gap, we propose a family of +HDR quality metrics, in which the key step is employing a simple inverse +display model to decompose an HDR image into a stack of LDR images with varying +exposures. Subsequently, these decomposed images are assessed through +well-established LDR quality metrics. Our HDR quality models present three +distinct benefits. First, they directly inherit the recent advancements of LDR +quality metrics. Second, they do not rely on human perceptual data of HDR image +quality for re-calibration. Third, they facilitate the alignment and +prioritization of specific luminance ranges for more accurate and detailed +quality assessment. Experimental results show that our HDR quality metrics +consistently outperform existing models in terms of quality assessment on four +HDR image quality datasets and perceptual optimization of HDR novel view +synthesis. + +
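+ The decomposition step can be illustrated with a simple inverse display
+model: expose the HDR image at several levels, clip and gamma-encode each to
+obtain an LDR stack, score every exposure with an LDR metric, and aggregate.
+PSNR is used as a stand-in metric below; the paper's display model and metric
+choices will differ.
+
+# Sketch of the HDR-to-LDR decomposition idea with PSNR as a stand-in metric.
+import numpy as np
+
+def hdr_to_ldr_stack(hdr, exposures=(0.25, 1.0, 4.0), gamma=2.2):
+    return [np.clip(hdr * e, 0.0, 1.0) ** (1.0 / gamma) for e in exposures]
+
+def psnr(a, b):
+    mse = np.mean((a - b) ** 2) + 1e-12
+    return 10.0 * np.log10(1.0 / mse)
+
+def hdr_quality(hdr_ref, hdr_test):
+    scores = [psnr(r, t) for r, t in zip(hdr_to_ldr_stack(hdr_ref),
+                                         hdr_to_ldr_stack(hdr_test))]
+    return float(np.mean(scores))            # aggregate over the exposure stack
+
+ref = np.random.rand(64, 64, 3) * 10.0       # linear HDR radiance
+test = np.clip(ref + 0.05 * np.random.randn(*ref.shape), 0, None)
+print(f"stack-averaged PSNR: {hdr_quality(ref, test):.2f} dB")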
+
+
+
+
+ + ♻ ☆ FairVision: Equitable Deep Learning for Eye Disease Screening via Fair + Identity Scaling + + +
+ Equity in AI for healthcare is crucial due to its direct impact on human +well-being. Despite advancements in 2D medical imaging fairness, the fairness +of 3D models remains underexplored, hindered by the small sizes of 3D fairness +datasets. Since 3D imaging surpasses 2D imaging in SOTA clinical care, it is +critical to understand the fairness of these 3D models. To address this +research gap, we conduct the first comprehensive study on the fairness of 3D +medical imaging models across multiple protected attributes. Our investigation +spans both 2D and 3D models and evaluates fairness across five architectures on +three common eye diseases, revealing significant biases across race, gender, +and ethnicity. To alleviate these biases, we propose a novel fair identity +scaling (FIS) method that improves both overall performance and fairness, +outperforming various SOTA fairness methods. Moreover, we release +Harvard-FairVision, the first large-scale medical fairness dataset with 30,000 +subjects featuring both 2D and 3D imaging data and six demographic identity +attributes. Harvard-FairVision provides labels for three major eye disorders +affecting about 380 million people worldwide, serving as a valuable resource +for both 2D and 3D fairness learning. Our code and dataset are publicly +accessible at +\url{https://ophai.hms.harvard.edu/datasets/harvard-fairvision30k}. + +
+
+
+
+
+ + ♻ ☆ Deep Learning-Based MR Image Re-parameterization SC + + +
+ Magnetic resonance (MR) image re-parameterization refers to the process of
+generating, via simulation, an MR image with a new set of MRI scanning
+parameters. Different parameter values generate distinct contrast between
+different tissues, helping identify pathologic tissue. Typically, more than
+one scan is required for diagnosis; however, acquiring repeated scans can be
+costly, time-consuming, and difficult for patients. Thus, using MR image
+re-parameterization to predict and estimate the contrast in these imaging
+scans can be an effective alternative. In this work, we propose a novel deep
+learning (DL) based convolutional model for MRI re-parameterization. Based on
+our preliminary results, DL-based techniques hold the potential to learn the
+non-linearities that govern the re-parameterization.
+
+
+
+ comment: A. Narang, A. Raj, M. Pop and M. Ebrahimi, "Deep Learning-Based MR + Image Re-parameterization," 2023 Congress in Computer Science, Computer + Engineering, & Applied Computing (CSCE), Las Vegas, NV, USA, 2023, pp. + 536-541, doi: 10.1109/CSCE60160.2023.00094 +
+
+
+
+
+ + ♻ ☆ Graph Neural Networks in Vision-Language Image Understanding: A Survey + + +
+ 2D image understanding is a complex problem within computer vision, but it +holds the key to providing human-level scene comprehension. It goes further +than identifying the objects in an image, and instead, it attempts to +understand the scene. Solutions to this problem form the underpinning of a +range of tasks, including image captioning, visual question answering (VQA), +and image retrieval. Graphs provide a natural way to represent the relational +arrangement between objects in an image, and thus, in recent years graph neural +networks (GNNs) have become a standard component of many 2D image understanding +pipelines, becoming a core architectural component, especially in the VQA group +of tasks. In this survey, we review this rapidly evolving field and we provide +a taxonomy of graph types used in 2D image understanding approaches, a +comprehensive list of the GNN models used in this domain, and a roadmap of +future potential developments. To the best of our knowledge, this is the first +comprehensive survey that covers image captioning, visual question answering, +and image retrieval techniques that focus on using GNNs as the main part of +their architecture. + +
+
+ comment: 20 pages, 5 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ DiffusionGAN3D: Boosting Text-guided 3D Generation and Domain Adaptation + by Combining 3D GANs and Diffusion Priors CVPR2024 + + +
+ Text-guided domain adaptation and generation of 3D-aware portraits find many +applications in various fields. However, due to the lack of training data and +the challenges in handling the high variety of geometry and appearance, the +existing methods for these tasks suffer from issues like inflexibility, +instability, and low fidelity. In this paper, we propose a novel framework +DiffusionGAN3D, which boosts text-guided 3D domain adaptation and generation by +combining 3D GANs and diffusion priors. Specifically, we integrate the +pre-trained 3D generative models (e.g., EG3D) and text-to-image diffusion +models. The former provides a strong foundation for stable and high-quality +avatar generation from text. And the diffusion models in turn offer powerful +priors and guide the 3D generator finetuning with informative direction to +achieve flexible and efficient text-guided domain adaptation. To enhance the +diversity in domain adaptation and the generation capability in text-to-avatar, +we introduce the relative distance loss and case-specific learnable triplane +respectively. Besides, we design a progressive texture refinement module to +improve the texture quality for both tasks above. Extensive experiments +demonstrate that the proposed framework achieves excellent results in both +domain adaptation and text-to-avatar tasks, outperforming existing methods in +terms of generation quality and efficiency. The project homepage is at +https://younglbw.github.io/DiffusionGAN3D-homepage/. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ MC$^2$: Multi-concept Guidance for Customized Multi-concept Generation + + +
+ Customized text-to-image generation aims to synthesize instantiations of
+user-specified concepts and has achieved unprecedented progress in handling
+individual concepts. However, when extending to multiple customized concepts,
+existing methods exhibit limitations in terms of flexibility and fidelity,
+only accommodating the combination of limited types of models and potentially
+resulting in a mix of characteristics from different concepts. In this paper,
+we introduce the Multi-concept guidance for Multi-concept customization,
+termed MC$^2$, for improved flexibility and fidelity. MC$^2$ decouples the
+requirements for model architecture via inference time optimization, allowing
+the integration of various heterogeneous single-concept customized models. It
+adaptively refines the attention weights between visual and textual tokens,
+directing image regions to focus on their associated words while diminishing
+the impact of irrelevant ones. Extensive experiments demonstrate that MC$^2$
+even surpasses previous methods that require additional training in terms of
+consistency with input prompt and reference images. Moreover, MC$^2$ can be
+extended to elevate the compositional capabilities of text-to-image
+generation, yielding appealing results. Code will be publicly available at
+https://github.com/JIANGJiaXiu/MC-2.
+
+
+
+
+
+
+ + ♻ ☆ FF-LOGO: Cross-Modality Point Cloud Registration with Feature Filtering + and Local to Global Optimization ICRA + + +
+ Cross-modality point cloud registration is confronted with significant +challenges due to inherent differences in modalities between different sensors. +We propose a cross-modality point cloud registration framework FF-LOGO: a +cross-modality point cloud registration method with feature filtering and +local-global optimization. The cross-modality feature correlation filtering +module extracts geometric transformation-invariant features from cross-modality +point clouds and achieves point selection by feature matching. We also +introduce a cross-modality optimization process, including a local adaptive key +region aggregation module and a global modality consistency fusion optimization +module. Experimental results demonstrate that our two-stage optimization +significantly improves the registration accuracy of the feature association and +selection module. Our method achieves a substantial increase in recall rate +compared to the current state-of-the-art methods on the 3DCSR dataset, +improving from 40.59% to 75.74%. Our code will be available at +https://github.com/wangmohan17/FFLOGO. + +
+
+ comment: Accepted by 2024 IEEE International Conference on Robotics and
+ Automation (ICRA), 7 pages, 2 figures
+
+
+
+
+
+ + ♻ ☆ DiffBIR: Towards Blind Image Restoration with Generative Diffusion Prior + + +
+ We present DiffBIR, a general restoration pipeline that can handle different
+blind image restoration tasks in a unified framework. DiffBIR decouples the
+blind image restoration problem into two stages: 1) degradation removal:
+removing image-independent content; 2) information regeneration: generating
+the lost image content. Each stage is developed independently, but they work
+seamlessly in a cascaded manner. In the first stage, we use restoration
+modules to remove degradations and obtain high-fidelity restored results. For
+the second stage, we propose IRControlNet that leverages the generative
+ability of latent diffusion models to generate realistic details.
+Specifically, IRControlNet is trained based on specially produced condition
+images without distracting noisy content for stable generation performance.
+Moreover, we design a region-adaptive restoration guidance that can modify the
+denoising process during inference without model re-training, allowing users
+to balance realness and fidelity through a tunable guidance scale. Extensive
+experiments have demonstrated DiffBIR's superiority over state-of-the-art
+approaches for blind image super-resolution, blind face restoration and blind
+image denoising tasks on both synthetic and real-world datasets. The code is
+available at https://github.com/XPixelGroup/DiffBIR.
+
+
+
+
+
+
+ + ♻ ☆ RoadFormer: Duplex Transformer for RGB-Normal Semantic Road Scene + Parsing + + +
+ The recent advancements in deep convolutional neural networks have shown +significant promise in the domain of road scene parsing. Nevertheless, the +existing works focus primarily on freespace detection, with little attention +given to hazardous road defects that could compromise both driving safety and +comfort. In this paper, we introduce RoadFormer, a novel Transformer-based +data-fusion network developed for road scene parsing. RoadFormer utilizes a +duplex encoder architecture to extract heterogeneous features from both RGB +images and surface normal information. The encoded features are subsequently +fed into a novel heterogeneous feature synergy block for effective feature +fusion and recalibration. The pixel decoder then learns multi-scale long-range +dependencies from the fused and recalibrated heterogeneous features, which are +subsequently processed by a Transformer decoder to produce the final semantic +prediction. Additionally, we release SYN-UDTIRI, the first large-scale road +scene parsing dataset that contains over 10,407 RGB images, dense depth images, +and the corresponding pixel-level annotations for both freespace and road +defects of different shapes and sizes. Extensive experimental evaluations +conducted on our SYN-UDTIRI dataset, as well as on three public datasets, +including KITTI road, CityScapes, and ORFD, demonstrate that RoadFormer +outperforms all other state-of-the-art networks for road scene parsing. +Specifically, RoadFormer ranks first on the KITTI road benchmark. Our source +code, created dataset, and demo video are publicly available at +mias.group/RoadFormer. + +
+
+ comment: 9 pages, 7 figures. Accepted by Transactions on Intelligent Vehicles
+
+
+
+
+
+ + ♻ ☆ Accelerating ViT Inference on FPGA through Static and Dynamic Pruning + + +
+ Vision Transformers (ViTs) have achieved state-of-the-art accuracy on
+various computer vision tasks. However, their high computational complexity
+prevents them from being applied to many real-world applications. Weight and
+token pruning are two well-known methods for reducing complexity: weight
+pruning reduces the model size and associated computational demands, while
+token pruning further dynamically reduces the computation based on the input.
+Combining these two techniques should significantly reduce computation
+complexity and model size; however, naively integrating them results in
+irregular computation patterns, leading to significant accuracy drops and
+difficulties in hardware acceleration.
+ Addressing the above challenges, we propose a comprehensive
+algorithm-hardware codesign for accelerating ViT on FPGA through simultaneous
+pruning, combining static weight pruning and dynamic token pruning. For
+algorithm design, we systematically combine a hardware-aware structured
+block-pruning method for pruning model parameters and a dynamic token pruning
+method for removing unimportant token vectors. Moreover, we design a novel
+training algorithm to recover the model's accuracy. For hardware design, we
+develop a novel hardware accelerator for executing the pruned model. The
+proposed hardware design employs multi-level parallelism with a load-balancing
+strategy to efficiently deal with the irregular computation patterns caused by
+the two pruning approaches. Moreover, we develop an efficient hardware
+mechanism for executing on-the-fly token pruning.
+
+
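+ The dynamic token pruning component can be illustrated as keeping the top-k
+patch tokens ranked by the attention they receive from the [CLS] token, as in
+the sketch below. The exact scoring rule and the FPGA datapath are not
+reproduced here.
+
+# Illustrative dynamic token pruning: keep the most-attended patch tokens.
+import numpy as np
+
+def prune_tokens(tokens, cls_attention, keep_ratio=0.5):
+    """tokens: (N, D) patch tokens; cls_attention: (N,) attention from [CLS]."""
+    k = max(1, int(keep_ratio * tokens.shape[0]))
+    keep = np.sort(np.argsort(cls_attention)[-k:])  # top-k tokens, in order
+    return tokens[keep], keep
+
+rng = np.random.default_rng(0)
+tokens = rng.normal(size=(196, 384))                # 14x14 patch tokens
+cls_attn = rng.random(196)                          # attention from CLS to patches
+kept_tokens, kept_idx = prune_tokens(tokens, cls_attn, keep_ratio=0.5)
+print(kept_tokens.shape, kept_idx[:5])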
+
+ comment: FCCM 2024 +
+
+
+
+
+ + ♻ ☆ Conv-Adapter: Exploring Parameter Efficient Transfer Learning for + ConvNets + + +
+ While parameter efficient tuning (PET) methods have shown great potential
+with the transformer architecture on Natural Language Processing (NLP) tasks,
+their effectiveness with large-scale ConvNets is still under-studied on
+Computer Vision (CV) tasks. This paper proposes Conv-Adapter, a PET module
+designed for ConvNets. Conv-Adapter is light-weight, domain-transferable, and
+architecture-agnostic with generalized performance on different tasks. When
+transferring to downstream tasks, Conv-Adapter learns task-specific feature
+modulation to the intermediate representations of backbones while keeping the
+pre-trained parameters frozen. It introduces only a tiny number of learnable
+parameters, e.g., only 3.5% of the full fine-tuning parameters of ResNet50,
+and can also be applied to transformer-based backbones. Conv-Adapter
+outperforms previous PET baseline methods and achieves performance comparable
+to or surpassing that of full fine-tuning on 23 classification tasks of
+various domains. It also presents superior performance on few-shot
+classification with an average margin of 3.39%. Beyond classification,
+Conv-Adapter can generalize to detection and segmentation tasks with more than
+50% reduction of parameters but comparable performance to the traditional full
+fine-tuning.
+
+
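+ A minimal PyTorch sketch of the adapter idea is shown below: a light
+bottleneck branch is trained alongside a frozen backbone block and its output
+is added to the block's features. The exact Conv-Adapter layout (convolution
+types, scaling, insertion points) may differ; this only illustrates the
+parameter-efficient mechanism.
+
+# Bottleneck adapter branch added to a frozen conv block (illustrative only).
+import torch
+import torch.nn as nn
+
+class ConvAdapter(nn.Module):
+    def __init__(self, channels, reduction=4):
+        super().__init__()
+        hidden = max(1, channels // reduction)
+        self.down = nn.Conv2d(channels, hidden, kernel_size=1)
+        self.dw = nn.Conv2d(hidden, hidden, kernel_size=3, padding=1,
+                            groups=hidden)           # depthwise conv
+        self.up = nn.Conv2d(hidden, channels, kernel_size=1)
+        self.act = nn.GELU()
+
+    def forward(self, x):
+        return self.up(self.act(self.dw(self.act(self.down(x)))))
+
+class AdaptedBlock(nn.Module):
+    """Wraps a frozen backbone block with a trainable adapter branch."""
+    def __init__(self, block, channels):
+        super().__init__()
+        self.block, self.adapter = block, ConvAdapter(channels)
+        for p in self.block.parameters():
+            p.requires_grad = False                  # backbone stays frozen
+
+    def forward(self, x):
+        return self.block(x) + self.adapter(x)       # residual modulation
+
+frozen = nn.Sequential(nn.Conv2d(64, 64, 3, padding=1), nn.ReLU())
+layer = AdaptedBlock(frozen, channels=64)
+out = layer(torch.randn(2, 64, 32, 32))
+trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
+print(out.shape, trainable)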
+
+
+
+
+ + ♻ ☆ Robust Representation Learning with Self-Distillation for Domain + Generalization + + +
+ Despite the recent success of deep neural networks, there remains a need for +effective methods to enhance domain generalization using vision transformers. +In this paper, we propose a novel domain generalization technique called Robust +Representation Learning with Self-Distillation (RRLD) comprising i) +intermediate-block self-distillation and ii) augmentation-guided +self-distillation to improve the generalization capabilities of +transformer-based models on unseen domains. This approach enables the network +to learn robust and general features that are invariant to different +augmentations and domain shifts while effectively mitigating overfitting to +source domains. To evaluate the effectiveness of our proposed method, we +perform extensive experiments on PACS and OfficeHome benchmark datasets, as +well as an industrial wafer semiconductor defect dataset. The results +demonstrate that RRLD achieves robust and accurate generalization performance. +We observe an average accuracy improvement in the range of 1.2% to 2.3% over +the state-of-the-art on the three datasets. + +
+
+ comment: 6 pages +
+
+
+
+
+ + ♻ ☆ EasyTrack: Efficient and Compact One-stream 3D Point Clouds Tracker + + +
+ Most 3D single object trackers (SOT) in point clouds follow two-stream,
+multi-stage 3D Siamese or motion tracking paradigms, which process the
+template and search area point clouds with two parallel branches, built on
+supervised point cloud backbones. In this work, beyond typical 3D Siamese or
+motion tracking, we propose a neat and compact one-stream transformer 3D SOT
+paradigm from a novel perspective, termed \textbf{EasyTrack}, which consists
+of three special designs: 1) A 3D point clouds tracking feature pre-training
+module is developed to exploit masked autoencoding for learning 3D point
+clouds tracking representations. 2) A unified 3D tracking feature learning and
+fusion network is proposed to simultaneously learn target-aware 3D features
+and extensively capture mutual correlations through a flexible self-attention
+mechanism. 3) A target location network in the dense bird's eye view (BEV)
+feature space is constructed for target classification and regression.
+Moreover, we develop an enhanced version named EasyTrack++, which designs a
+center points interaction (CPI) strategy to reduce ambiguous targets caused by
+noisy point cloud background information. The proposed EasyTrack and
+EasyTrack++ set a new state-of-the-art performance ($\textbf{18\%}$,
+$\textbf{40\%}$ and $\textbf{3\%}$ success gains) in KITTI, NuScenes, and
+Waymo while running at \textbf{52.6fps} with few parameters (\textbf{1.3M}).
+The code will be available at https://github.com/KnightApple427/Easytrack.
+
+
+
+
+
+
+ + ♻ ☆ Universal Humanoid Motion Representations for Physics-Based Control ICLR 2024 + + +
+ We present a universal motion representation that encompasses a comprehensive +range of motor skills for physics-based humanoid control. Due to the high +dimensionality of humanoids and the inherent difficulties in reinforcement +learning, prior methods have focused on learning skill embeddings for a narrow +range of movement styles (e.g. locomotion, game characters) from specialized +motion datasets. This limited scope hampers their applicability in complex +tasks. We close this gap by significantly increasing the coverage of our motion +representation space. To achieve this, we first learn a motion imitator that +can imitate all of human motion from a large, unstructured motion dataset. We +then create our motion representation by distilling skills directly from the +imitator. This is achieved by using an encoder-decoder structure with a +variational information bottleneck. Additionally, we jointly learn a prior +conditioned on proprioception (humanoid's own pose and velocities) to improve +model expressiveness and sampling efficiency for downstream tasks. By sampling +from the prior, we can generate long, stable, and diverse human motions. Using +this latent space for hierarchical RL, we show that our policies solve tasks +using human-like behavior. We demonstrate the effectiveness of our motion +representation by solving generative tasks (e.g. strike, terrain traversal) and +motion tracking using VR controllers. + +
+
+ comment: ICLR 2024 Spotlight. Project page: + https://zhengyiluo.github.io/PULSE/ +
+
+
+
+
+ + ♻ ☆ Eye-gaze Guided Multi-modal Alignment Framework for Radiology + + +
+ In multi-modal frameworks, the alignment of cross-modal features presents a +significant challenge. The predominant approach in multi-modal pre-training +emphasizes either global or local alignment between modalities, utilizing +extensive datasets. This bottom-up driven method often suffers from a lack of +interpretability, a critical concern in radiology. Previous studies have +integrated high-level labels in medical images or text, but these still rely on +manual annotation, a costly and labor-intensive process. Our work introduces a +novel approach by using eye-gaze data, collected synchronously by radiologists +during diagnostic evaluations. This data, indicating radiologists' focus areas, +naturally links chest X-rays to diagnostic texts. We propose the Eye-gaze +Guided Multi-modal Alignment (EGMA) framework to harness eye-gaze data for +better alignment of image and text features, aiming to reduce reliance on +manual annotations and thus cut training costs. Our model demonstrates robust +performance, outperforming other state-of-the-art methods in zero-shot +classification and retrieval tasks. The incorporation of easily-obtained +eye-gaze data during routine radiological diagnoses signifies a step towards +minimizing manual annotation dependency. Additionally, we explore the impact of +varying amounts of eye-gaze data on model performance, highlighting the +feasibility and utility of integrating this auxiliary data into multi-modal +pre-training. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ A Technique for Classifying Static Gestures Using UWB Radar + + +
+ Our paper presents a robust framework for UWB-based static gesture +recognition, leveraging proprietary UWB radar sensor technology. Extensive data +collection efforts were undertaken to compile datasets containing five commonly +used gestures. Our approach involves a comprehensive data pre-processing +pipeline that encompasses outlier handling, aspect ratio-preserving resizing, +and false-color image transformation. Both CNN and MobileNet models were +trained on the processed images. Remarkably, our best-performing model achieved +an accuracy of 96.78%. Additionally, we developed a user-friendly GUI framework +to assess the model's system resource usage and processing times, which +revealed low memory utilization and real-time task completion in under one +second. This research marks a significant step towards enhancing static gesture +recognition using UWB technology, promising practical applications in various +domains. + +
+
+ comment: This is not a technical research paper, but an excerpt of what was + applied during a funded project for the promotion of Open Science +
+
+
+
+
+ + ♻ ☆ ChangeNet: Multi-Temporal Asymmetric Change Detection Dataset ICASSP 2024 + + +
+ Change Detection (CD) has been attracting extensive interest with the
+availability of bi-temporal datasets. However, due to the huge cost of
+multi-temporal image acquisition and labeling, existing change detection
+datasets are small in quantity, short in temporal coverage, and low in
+practicality. Therefore, a large-scale practical-oriented dataset covering
+wide temporal phases is urgently needed to facilitate the community. To this
+end, the ChangeNet dataset is presented especially for multi-temporal change
+detection, along with the new task of "Asymmetric Change Detection".
+Specifically, ChangeNet consists of 31,000 multi-temporal image pairs, a wide
+range of complex scenes from 100 cities, and 6 pixel-level annotated
+categories, which is far superior to all existing change detection datasets,
+including LEVIR-CD, WHU Building CD, etc. In addition, ChangeNet contains a
+large amount of real-world perspective distortion in different temporal phases
+on the same areas, which helps promote the practical application of change
+detection algorithms. The ChangeNet dataset is suitable for both binary change
+detection (BCD) and semantic change detection (SCD) tasks. Accordingly, we
+benchmark the ChangeNet dataset on six BCD methods and two SCD methods, and
+extensive experiments demonstrate its challenges and great significance. The
+dataset is available at https://github.com/jankyee/ChangeNet.
+
+
+
+ comment: Accepted to ICASSP 2024 Oral/Lecture +
+
+
+
+
+ + ♻ ☆ Comment-aided Video-Language Alignment via Contrastive Pre-training for + Short-form Video Humor Detection ICMR 2024 + + +
+ The growing importance of multi-modal humor detection within affective
+computing correlates with the expanding influence of short-form video sharing
+on social media platforms. In this paper, we propose a novel two-branch
+hierarchical model for short-form video humor detection (SVHD), named
+Comment-aided Video-Language Alignment (CVLA) via data-augmented multi-modal
+contrastive pre-training. Notably, our CVLA not only operates on raw signals
+across various modal channels but also yields an appropriate multi-modal
+representation by aligning the video and language components within a
+consistent semantic space. The experimental results on two humor detection
+datasets, including DY11k and UR-FUNNY, demonstrate that CVLA dramatically
+outperforms state-of-the-art and several competitive baseline approaches. Our
+dataset, code, and model are released at https://github.com/yliu-cs/CVLA.
+
+
+
+ comment: Accepted by ICMR 2024 +
+
+
+
+
+ + ♻ ☆ CosalPure: Learning Concept from Group Images for Robust Co-Saliency + Detection CVPR 2024 + + +
+ Co-salient object detection (CoSOD) aims to identify the common and salient +(usually in the foreground) regions across a given group of images. Although +achieving significant progress, state-of-the-art CoSODs could be easily +affected by some adversarial perturbations, leading to substantial accuracy +reduction. The adversarial perturbations can mislead CoSODs but do not change +the high-level semantic information (e.g., concept) of the co-salient objects. +In this paper, we propose a novel robustness enhancement framework by first +learning the concept of the co-salient objects based on the input group images +and then leveraging this concept to purify adversarial perturbations, which are +subsequently fed to CoSODs for robustness enhancement. Specifically, we propose +CosalPure containing two modules, i.e., group-image concept learning and +concept-guided diffusion purification. For the first module, we adopt a +pre-trained text-to-image diffusion model to learn the concept of co-salient +objects within group images where the learned concept is robust to adversarial +examples. For the second module, we map the adversarial image to the latent +space and then perform diffusion generation by embedding the learned concept +into the noise prediction function as an extra condition. Our method can +effectively alleviate the influence of the SOTA adversarial attack containing +different adversarial patterns, including exposure and noise. The extensive +results demonstrate that our method could enhance the robustness of CoSODs +significantly. + +
+
+ comment: This paper is accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ RDFC-GAN: RGB-Depth Fusion CycleGAN for Indoor Depth Completion CVPR 2022 + + +
+ Raw depth images captured in indoor scenarios frequently exhibit extensive +missing values due to the inherent limitations of the sensors and environments. +For example, transparent materials frequently elude detection by depth sensors; +surfaces may introduce measurement inaccuracies due to their polished textures, +extended distances, and oblique incidence angles from the sensor. The presence +of incomplete depth maps imposes significant challenges for subsequent vision +applications, prompting the development of numerous depth completion techniques +to mitigate this problem. Numerous methods excel at reconstructing dense depth +maps from sparse samples, but they often falter when faced with extensive +contiguous regions of missing depth values, a prevalent and critical challenge +in indoor environments. To overcome these challenges, we design a novel +two-branch end-to-end fusion network named RDFC-GAN, which takes a pair of RGB +and incomplete depth images as input to predict a dense and completed depth +map. The first branch employs an encoder-decoder structure, by adhering to the +Manhattan world assumption and utilizing normal maps from RGB-D information as +guidance, to regress the local dense depth values from the raw depth map. The +other branch applies an RGB-depth fusion CycleGAN, adept at translating RGB +imagery into detailed, textured depth maps while ensuring high fidelity through +cycle consistency. We fuse the two branches via adaptive fusion modules named +W-AdaIN and train the model with the help of pseudo depth maps. Comprehensive +evaluations on NYU-Depth V2 and SUN RGB-D datasets show that our method +significantly enhances depth completion performance particularly in realistic +indoor settings. + +
+
+ comment: Haowen Wang and Zhengping Che are with equal contributions. Paper + accepted by IEEE Transactions on Pattern Analysis and Machine Intelligence + (TPAMI). An earlier version has been accepted by CVPR 2022 + (arXiv:2203.10856). arXiv admin note: text overlap with arXiv:2203.10856 +
+
+
+
+
+ + ♻ ☆ HICO-DET-SG and V-COCO-SG: New Data Splits for Evaluating the Systematic + Generalization Performance of Human-Object Interaction Detection Models + + +
+ Human-Object Interaction (HOI) detection is a task to localize humans and +objects in an image and predict the interactions in human-object pairs. In +real-world scenarios, HOI detection models need systematic generalization, +i.e., generalization to novel combinations of objects and interactions, because +the train data are expected to cover a limited portion of all possible +combinations. To evaluate the systematic generalization performance of HOI +detection models, we created two new sets of HOI detection data splits named +HICO-DET-SG and V-COCO-SG based on the HICO-DET and V-COCO datasets, +respectively. When evaluated on the new data splits, HOI detection models with +various characteristics performed much more poorly than when evaluated on the +original splits. This shows that systematic generalization is a challenging +goal in HOI detection. By analyzing the evaluation results, we also gain +insights for improving the systematic generalization performance and identify +four possible future research directions. We hope that our new data splits and +presented analysis will encourage further research on systematic generalization +in HOI detection. + +
+
+ comment: 19 pages, 3 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Enhanced Muscle and Fat Segmentation for CT-Based Body Composition + Analysis: A Comparative Study + + +
+ Purpose: Body composition measurements from routine abdominal CT can yield
+personalized risk assessments for asymptomatic and diseased patients. In
+particular, attenuation and volume measures of muscle and fat are associated
+with important clinical outcomes, such as cardiovascular events, fractures,
+and death. This study evaluates the reliability of an internal tool for the
+segmentation of muscle and fat (subcutaneous and visceral) as compared to the
+well-established public TotalSegmentator tool.
+ Methods: We assessed the tools across 900 CT series from the publicly
+available SAROS dataset, focusing on muscle, subcutaneous fat, and visceral
+fat. The Dice score was employed to assess accuracy in subcutaneous fat and
+muscle segmentation. Due to the lack of ground truth segmentations for
+visceral fat, Cohen's Kappa was utilized to assess segmentation agreement
+between the tools.
+ Results: Our internal tool achieved a 3% higher Dice (83.8 vs. 80.8) for
+subcutaneous fat and a 5% improvement (87.6 vs. 83.2) for muscle segmentation.
+A Wilcoxon signed-rank test revealed that the differences were statistically
+significant with p<0.01. For visceral fat, the Cohen's kappa score of 0.856
+indicated near-perfect agreement between the two tools. Our internal tool also
+showed very strong correlations for muscle volume (R^2=0.99), muscle
+attenuation (R^2=0.93), and subcutaneous fat volume (R^2=0.99) with a moderate
+correlation for subcutaneous fat attenuation (R^2=0.45).
+ Conclusion: Our findings indicated that our internal tool outperformed
+TotalSegmentator in measuring subcutaneous fat and muscle. The high Cohen's
+Kappa score for visceral fat suggests a reliable level of agreement between
+the two tools. These results demonstrate the potential of our tool in
+advancing the accuracy of body composition analysis.
+
+
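+ For reference, the two agreement measures used above can be computed as in
+the sketch below: Dice overlap against a ground-truth mask and Cohen's kappa
+between two tools' binary masks.
+
+# Dice score and Cohen's kappa for binary segmentation masks.
+import numpy as np
+
+def dice(a, b):
+    a, b = a.astype(bool), b.astype(bool)
+    denom = a.sum() + b.sum()
+    return 1.0 if denom == 0 else 2.0 * np.logical_and(a, b).sum() / denom
+
+def cohens_kappa(a, b):
+    a, b = a.astype(bool).ravel(), b.astype(bool).ravel()
+    po = np.mean(a == b)                          # observed agreement
+    p_yes = a.mean() * b.mean()                   # chance agreement (positive)
+    p_no = (1 - a.mean()) * (1 - b.mean())        # chance agreement (negative)
+    pe = p_yes + p_no
+    return (po - pe) / (1 - pe + 1e-12)
+
+gt = np.zeros((64, 64), dtype=bool); gt[10:40, 10:40] = True
+pred_a = np.zeros_like(gt); pred_a[12:40, 10:42] = True
+pred_b = np.zeros_like(gt); pred_b[10:38, 12:40] = True
+print(f"Dice(A, GT) = {dice(pred_a, gt):.3f}")
+print(f"Kappa(A, B) = {cohens_kappa(pred_a, pred_b):.3f}")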
+
+
+
+
+ + ♻ ☆ General surgery vision transformer: A video pre-trained foundation model + for general surgery + + +
+ The absence of openly accessible data and specialized foundation models is a +major barrier for computational research in surgery. Toward this, (i) we +open-source the largest dataset of general surgery videos to-date, consisting +of 680 hours of surgical videos, including data from robotic and laparoscopic +techniques across 28 procedures; (ii) we propose a technique for video +pre-training a general surgery vision transformer (GSViT) on surgical videos +based on forward video prediction that can run in real-time for surgical +applications, toward which we open-source the code and weights of GSViT; (iii) +we also release code and weights for procedure-specific fine-tuned versions of +GSViT across 10 procedures; (iv) we demonstrate the performance of GSViT on the +Cholec80 phase annotation task, displaying improved performance over +state-of-the-art single frame predictors. + +
+
+
+
+
+ + ♻ ☆ SatCLIP: Global, General-Purpose Location Embeddings with Satellite + Imagery + + +
+ Geographic information is essential for modeling tasks in fields ranging from +ecology to epidemiology. However, extracting relevant location characteristics +for a given task can be challenging, often requiring expensive data fusion or +distillation from massive global imagery datasets. To address this challenge, +we introduce Satellite Contrastive Location-Image Pretraining (SatCLIP). This +global, general-purpose geographic location encoder learns an implicit +representation of locations by matching CNN and ViT inferred visual patterns of +openly available satellite imagery with their geographic coordinates. The +resulting SatCLIP location encoder efficiently summarizes the characteristics +of any given location for convenient use in downstream tasks. In our +experiments, we use SatCLIP embeddings to improve prediction performance on +nine diverse location-dependent tasks including temperature prediction, animal +recognition, and population density estimation. Across tasks, SatCLIP +consistently outperforms alternative location encoders and improves geographic +generalization by encoding visual similarities of spatially distant +environments. These results demonstrate the potential of vision-location models +to learn meaningful representations of our planet from the vast, varied, and +largely untapped modalities of geospatial data. + +
+
+
+
+
+ + ♻ ☆ PrivImage: Differentially Private Synthetic Image Generation using + Diffusion Models with Semantic-Aware Pretraining USENIX Security 2024 + + +
+ Differential Privacy (DP) image data synthesis leverages the DP technique to
+generate synthetic data in place of sensitive data, allowing organizations to
+share and utilize synthetic images without privacy concerns. Previous methods
+incorporate the advanced techniques of generative models and pre-training on a
+public dataset to produce exceptional DP image data, but suffer from problems
+of unstable training and massive computational resource demands. This paper
+proposes a novel DP image synthesis method, termed PRIVIMAGE, which
+meticulously selects pre-training data, promoting the efficient creation of DP
+datasets with high fidelity and utility. PRIVIMAGE first establishes a
+semantic query function using a public dataset. Then, this function assists in
+querying the semantic distribution of the sensitive dataset, facilitating the
+selection of data from the public dataset with analogous semantics for
+pre-training. Finally, we pre-train an image generative model using the
+selected data and then fine-tune this model on the sensitive dataset using
+Differentially Private Stochastic Gradient Descent (DP-SGD). PRIVIMAGE allows
+us to train a lightly parameterized generative model, reducing the noise in
+the gradient during DP-SGD training and enhancing training stability.
+Extensive experiments demonstrate that PRIVIMAGE uses only 1% of the public
+dataset for pre-training and 7.6% of the parameters in the generative model
+compared to the state-of-the-art method, while achieving superior synthesis
+performance and conserving more computational resources. On average, PRIVIMAGE
+achieves 30.1% lower FID and 12.6% higher Classification Accuracy than the
+state-of-the-art method. The replication package and datasets can be accessed
+online.
+
+
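+ The DP-SGD fine-tuning step relies on per-example gradient clipping plus
+Gaussian noise. The sketch below shows that mechanism with a microbatch-of-one
+loop for clarity; it is not PrivImage's training code, and production use
+should rely on a vetted DP library with proper privacy accounting.
+
+# Minimal DP-SGD step: clip each example's gradient, add Gaussian noise.
+import torch
+import torch.nn as nn
+
+torch.manual_seed(0)
+model = nn.Linear(10, 2)
+opt = torch.optim.SGD(model.parameters(), lr=0.1)
+loss_fn = nn.CrossEntropyLoss()
+clip_norm, noise_mult = 1.0, 1.1
+
+x = torch.randn(8, 10)
+y = torch.randint(0, 2, (8,))
+
+accum = [torch.zeros_like(p) for p in model.parameters()]
+for i in range(x.shape[0]):                        # per-example gradients
+    model.zero_grad()
+    loss = loss_fn(model(x[i:i + 1]), y[i:i + 1])
+    loss.backward()
+    grads = [p.grad.detach().clone() for p in model.parameters()]
+    total_norm = torch.sqrt(sum(g.pow(2).sum() for g in grads))
+    scale = torch.clamp(clip_norm / (total_norm + 1e-6), max=1.0)
+    for a, g in zip(accum, grads):
+        a += g * scale                             # clip each example's gradient
+
+opt.zero_grad()
+for p, a in zip(model.parameters(), accum):
+    noise = torch.randn_like(a) * noise_mult * clip_norm
+    p.grad = (a + noise) / x.shape[0]              # noisy averaged gradient
+opt.step()
+print("one DP-SGD step done")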
+
+ comment: Accepted at USENIX Security 2024. The first two authors contributed + equally +
+
+
+
+
+ + ♻ ☆ MixedNUTS: Training-Free Accuracy-Robustness Balance via Nonlinearly + Mixed Classifiers + + +
+ Adversarial robustness often comes at the cost of degraded accuracy, impeding +the real-life application of robust classification models. Training-based +solutions for better trade-offs are limited by incompatibilities with +already-trained high-performance large models, necessitating the exploration of +training-free ensemble approaches. Observing that robust models are more +confident in correct predictions than in incorrect ones on clean and +adversarial data alike, we speculate amplifying this "benign confidence +property" can reconcile accuracy and robustness in an ensemble setting. To +achieve so, we propose "MixedNUTS", a training-free method where the output +logits of a robust classifier and a standard non-robust classifier are +processed by nonlinear transformations with only three parameters, which are +optimized through an efficient algorithm. MixedNUTS then converts the +transformed logits into probabilities and mixes them as the overall output. On +CIFAR-10, CIFAR-100, and ImageNet datasets, experimental results with custom +strong adaptive attacks demonstrate MixedNUTS's vastly improved accuracy and +near-SOTA robustness -- it boosts CIFAR-100 clean accuracy by 7.86 points, +sacrificing merely 0.87 points in robust accuracy. + +
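+ The mixing mechanism can be sketched as follows: the robust model's logits
+pass through a simple three-parameter nonlinearity, both models' outputs are
+converted to probabilities, and the probabilities are averaged. The transform
+used here is an illustrative stand-in, not the parameterization optimized in
+MixedNUTS.
+
+# Training-free mixing of a standard and a robust classifier's outputs.
+import numpy as np
+
+def softmax(z):
+    z = z - z.max(axis=-1, keepdims=True)
+    e = np.exp(z)
+    return e / e.sum(axis=-1, keepdims=True)
+
+def transform(logits, s=1.0, p=2.0, c=0.0):
+    """Illustrative 3-parameter monotone map applied to the robust logits."""
+    centered = logits - logits.mean(axis=-1, keepdims=True)
+    return s * np.sign(centered) * np.abs(centered) ** p + c
+
+def mixed_predict(logits_std, logits_rob, alpha=0.5, params=(1.0, 2.0, 0.0)):
+    probs_std = softmax(logits_std)
+    probs_rob = softmax(transform(logits_rob, *params))
+    return (1 - alpha) * probs_std + alpha * probs_rob
+
+rng = np.random.default_rng(0)
+logits_std = rng.normal(size=(4, 10))     # standard (accurate) classifier
+logits_rob = rng.normal(size=(4, 10))     # adversarially robust classifier
+print(mixed_predict(logits_std, logits_rob).argmax(axis=1))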
+
+
+
+
+ + ♻ ☆ Exploring the Frontier of Vision-Language Models: A Survey of Current + Methodologies and Future Directions + + +
+ The advent of Large Language Models (LLMs) has significantly reshaped the
+trajectory of the AI revolution. Nevertheless, these LLMs exhibit a notable
+limitation, as they are primarily adept at processing textual information. To
+address this constraint, researchers have endeavored to integrate visual
+capabilities with LLMs, resulting in the emergence of Vision-Language Models
+(VLMs). These advanced models are instrumental in tackling more intricate
+tasks such as image captioning and visual question answering. In our
+comprehensive survey paper, we delve into the key advancements within the
+realm of VLMs. Our classification organizes VLMs into three distinct
+categories: models dedicated to vision-language understanding, models that
+process multimodal inputs to generate unimodal (textual) outputs, and models
+that both accept and produce multimodal inputs and outputs. This
+classification is based on their respective capabilities and functionalities
+in processing and generating various modalities of data. We meticulously
+dissect each model, offering an extensive analysis of its foundational
+architecture, training data sources, as well as its strengths and limitations
+wherever possible, providing readers with a comprehensive understanding of its
+essential components. We also analyze the performance of VLMs on various
+benchmark datasets. By doing so, we aim to offer a nuanced understanding of
+the diverse landscape of VLMs. Additionally, we underscore potential avenues
+for future research in this dynamic domain, anticipating further breakthroughs
+and advancements.
+
+
+
+ comment: The most extensive and up-to-date survey on Visual Language Models,
+ covering 76 Visual Language Models
+
+
+
+
+
+ + ♻ ☆ Paved2Paradise: Cost-Effective and Scalable LiDAR Simulation by + Factoring the Real World CVPR + 2024 + + +
+ To achieve strong real world performance, neural networks must be trained on +large, diverse datasets; however, obtaining and annotating such datasets is +costly and time-consuming, particularly for 3D point clouds. In this paper, we +describe Paved2Paradise, a simple, cost-effective approach for generating fully +labeled, diverse, and realistic lidar datasets from scratch, all while +requiring minimal human annotation. Our key insight is that, by deliberately +collecting separate "background" and "object" datasets (i.e., "factoring the +real world"), we can intelligently combine them to produce a combinatorially +large and diverse training set. The Paved2Paradise pipeline thus consists of +four steps: (1) collecting copious background data, (2) recording individuals +from the desired object class(es) performing different behaviors in an isolated +environment (like a parking lot), (3) bootstrapping labels for the object +dataset, and (4) generating samples by placing objects at arbitrary locations +in backgrounds. To demonstrate the utility of Paved2Paradise, we generated +synthetic datasets for two tasks: (1) human detection in orchards (a task for +which no public data exists) and (2) pedestrian detection in urban +environments. Qualitatively, we find that a model trained exclusively on +Paved2Paradise synthetic data is highly effective at detecting humans in +orchards, including when individuals are heavily occluded by tree branches. +Quantitatively, a model trained on Paved2Paradise data that sources backgrounds +from KITTI performs comparably to a model trained on the actual dataset. These +results suggest the Paved2Paradise synthetic data pipeline can help accelerate +point cloud model development in sectors where acquiring lidar datasets has +previously been cost-prohibitive. + +
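+ Step (4) of the pipeline amounts to translating a labelled object point
+cloud to an arbitrary location in a background scan and emitting the matching
+box label, as in the simplified sketch below (ground alignment, occlusion, and
+sensor geometry are omitted).
+
+# Drop an "object" point cloud into a "background" scan and emit a box label.
+import numpy as np
+
+rng = np.random.default_rng(0)
+background = rng.uniform(-40, 40, size=(5000, 3)); background[:, 2] = 0.0
+person = rng.normal(scale=[0.3, 0.3, 0.5], size=(300, 3)) + [0, 0, 0.9]
+
+def place_object(background, obj, xy):
+    obj = obj.copy()
+    obj[:, :2] += xy                              # translate to target location
+    scene = np.vstack([background, obj])
+    center = obj.mean(axis=0)
+    size = obj.max(axis=0) - obj.min(axis=0)
+    label = {"center": center, "size": size, "class": "pedestrian"}
+    return scene, label
+
+scene, label = place_object(background, person, xy=np.array([12.0, -5.0]))
+print(scene.shape, label["center"].round(2), label["size"].round(2))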
+
+ comment: Accepted to the Synthetic Data for Computer Vision workshop at CVPR + 2024 +
+
+
+
+
+ + ♻ ☆ Masked Diffusion as Self-supervised Representation Learner + + +
+ Denoising diffusion probabilistic models have recently demonstrated +state-of-the-art generative performance and have been used as strong +pixel-level representation learners. This paper decomposes the interrelation +between the generative capability and representation learning ability inherent +in diffusion models. We present the masked diffusion model (MDM), a scalable +self-supervised representation learner for semantic segmentation, substituting +the conventional additive Gaussian noise of traditional diffusion with a +masking mechanism. Our proposed approach convincingly surpasses prior +benchmarks, demonstrating remarkable advancements in both medical and natural +image semantic segmentation tasks, particularly in few-shot scenarios. + +
+
+
+
+
+ + ♻ ☆ ScribblePrompt: Fast and Flexible Interactive Segmentation for Any + Biomedical Image + + +
+ Biomedical image segmentation is a crucial part of both scientific research +and clinical care. With enough labelled data, deep learning models can be +trained to accurately automate specific biomedical image segmentation tasks. +However, manually segmenting images to create training data is highly labor +intensive and requires domain expertise. We present ScribblePrompt, a flexible +neural network based interactive segmentation tool for biomedical imaging that +enables human annotators to segment previously unseen structures using +scribbles, clicks, and bounding boxes. Through rigorous quantitative +experiments, we demonstrate that given comparable amounts of interaction, +ScribblePrompt produces more accurate segmentations than previous methods on +datasets unseen during training. In a user study with domain experts, +ScribblePrompt reduced annotation time by 28% while improving Dice by 15% +compared to the next best method. ScribblePrompt's success rests on a set of +careful design decisions. These include a training strategy that incorporates +both a highly diverse set of images and tasks, novel algorithms for simulated +user interactions and labels, and a network that enables fast inference. We +showcase ScribblePrompt in an online demo and provide code at +https://scribbleprompt.csail.mit.edu + +
+
+ comment: Project Website: https://scribbleprompt.csail.mit.edu Keywords: + Interactive Segmentation, Medical Imaging, Segment Anything Model, SAM, + Scribble Annotations, Prompt +
+
+
+
+
+ + ♻ ☆ Generative AI-Based Effective Malware Detection for Embedded Computing + Systems + + +
+ One of the pivotal security threats to embedded computing systems is
+malicious software, a.k.a. malware. Owing to its efficiency and efficacy,
+Machine Learning (ML) has been widely adopted for malware detection in recent
+times. Despite being efficient, the existing techniques require a tremendous
+number of benign and malware samples for training and modeling an efficient
+malware detector. Furthermore, such constraints limit the detection of
+emerging malware samples due to the lack of sufficient malware samples
+required for efficient training. To address such concerns, we introduce a
+code-aware data generation technique that generates multiple mutated samples
+of the malware seen in limited quantities by the devices. Loss minimization
+ensures that the generated samples closely mimic the limitedly seen malware
+and mitigates impractical samples. The generated malware is further
+incorporated into the training set to formulate a model that can efficiently
+detect emerging malware despite having limited exposure. The experimental
+results demonstrate that the proposed technique achieves an accuracy of 90% in
+detecting limitedly seen malware, which is approximately 3x more than the
+accuracy attained by state-of-the-art techniques.
+
+
+
+
+
+
+ + ♻ ☆ PEEB: Part-based Image Classifiers with an Explainable and Editable + Language Bottleneck NAACL 2024 + + +
+ CLIP-based classifiers rely on the prompt containing a {class name} that is +known to the text encoder. Therefore, they perform poorly on new classes or the +classes whose names rarely appear on the Internet (e.g., scientific names of +birds). For fine-grained classification, we propose PEEB - an explainable and +editable classifier to (1) express the class name into a set of text +descriptors that describe the visual parts of that class; and (2) match the +embeddings of the detected parts to their textual descriptors in each class to +compute a logit score for classification. In a zero-shot setting where the +class names are unknown, PEEB outperforms CLIP by a huge margin (~10x in top-1 +accuracy). Compared to part-based classifiers, PEEB is not only the +state-of-the-art (SOTA) on the supervised-learning setting (88.80% and 92.20% +accuracy on CUB-200 and Dogs-120, respectively) but also the first to enable +users to edit the text descriptors to form a new classifier without any +re-training. Compared to concept bottleneck models, PEEB is also the SOTA in +both zero-shot and supervised-learning settings. + +
+
+ comment: Findings of NAACL 2024 (long paper) +
+
+
+
+
+ + ♻ ☆ Coverage Axis++: Efficient Inner Point Selection for 3D Shape + Skeletonization + + +
+ We introduce Coverage Axis++, a novel and efficient approach to 3D shape +skeletonization. The current state-of-the-art approaches for this task often +rely on the watertightness of the input or suffer from substantial +computational costs, thereby limiting their practicality. To address this +challenge, Coverage Axis++ proposes a heuristic algorithm to select skeletal +points, offering a high-accuracy approximation of the Medial Axis Transform +(MAT) while significantly mitigating computational intensity for various shape +representations. We introduce a simple yet effective strategy that considers +shape coverage, uniformity, and centrality to derive skeletal points. The +selection procedure enforces consistency with the shape structure while +favoring the dominant medial balls, which thus introduces a compact underlying +shape representation in terms of MAT. As a result, Coverage Axis++ allows for +skeletonization for various shape representations (e.g., water-tight meshes, +triangle soups, point clouds), specification of the number of skeletal points, +few hyperparameters, and highly efficient computation with improved +reconstruction accuracy. Extensive experiments across a wide range of 3D shapes +validate the efficiency and effectiveness of Coverage Axis++. The code will be +publicly available once the paper is published. + +
+
+
+
+
+ + ♻ ☆ Generating Illustrated Instructions CVPR 2024 + + +
+ We introduce the new task of generating Illustrated Instructions, i.e., +visual instructions customized to a user's needs. We identify desiderata unique +to this task, and formalize it through a suite of automatic and human +evaluation metrics, designed to measure the validity, consistency, and efficacy +of the generations. We combine the power of large language models (LLMs) +together with strong text-to-image generation diffusion models to propose a +simple approach called StackedDiffusion, which generates such illustrated +instructions given text as input. The resulting model strongly outperforms +baseline approaches and state-of-the-art multimodal LLMs; and in 30% of cases, +users even prefer it to human-generated articles. Most notably, it enables +various new and exciting applications far beyond what static articles on the +web can provide, such as personalized instructions complete with intermediate +steps and pictures in response to a user's individual situation. + +
+
+ comment: Accepted to CVPR 2024. Project website: + http://facebookresearch.github.io/IllustratedInstructions. Code reproduction: + https://github.com/sachit-menon/generating-illustrated-instructions-reproduction +
+
+
+
+
+ + ♻ ☆ Understanding and Modeling the Effects of Task and Context on Drivers' + Gaze Allocation + + +
+ To further advance driver monitoring and assistance systems, it is important
+to understand how drivers allocate their attention, in other words, where they
+tend to look and why. Traditionally, factors affecting human visual attention
+have been divided into bottom-up (involuntary attraction to salient regions)
+and top-down (driven by the demands of the task being performed). Although both
+play a role in directing drivers' gaze, most of the existing models for
+drivers' gaze prediction apply techniques developed for bottom-up saliency and
+do not consider influences of the drivers' actions explicitly. Likewise, common
+driving attention benchmarks lack relevant annotations for drivers' actions and
+the context in which they are performed. Therefore, to enable analysis and
+modeling of these factors for drivers' gaze prediction, we propose the
+following: 1) we correct the data processing pipeline used in DR(eye)VE to
+reduce noise in the recorded gaze data; 2) we then add per-frame labels for
+driving task and context; 3) we benchmark a number of baseline and SOTA models
+for saliency and driver gaze prediction and use the new annotations to analyze
+how their performance changes in scenarios involving different tasks; and,
+lastly, 4) we develop a novel model that modulates drivers' gaze prediction
+with explicit action and context information. While reducing noise in the
+DR(eye)VE gaze data improves the results of all models, we show that using task
+information in our proposed model boosts performance even further compared to
+bottom-up models on the cleaned-up data, both overall (by 24% KLD and 89% NSS)
+and on scenarios that involve performing safety-critical maneuvers and crossing
+intersections (by up to 10--30% KLD). Extended annotations and code are
+available at https://github.com/ykotseruba/SCOUT.
+
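+ The reported gains are in KLD and NSS, two standard saliency metrics; a small,
+generic sketch of how they are typically computed (not the repository's exact
+evaluation code) is:
+
+import numpy as np
+
+def kld(pred, gt, eps=1e-7):
+    """KL divergence from the predicted to the ground-truth saliency map (lower is better)."""
+    p = pred / (pred.sum() + eps)
+    q = gt / (gt.sum() + eps)
+    return float(np.sum(q * np.log(eps + q / (p + eps))))
+
+def nss(pred, fixation_map):
+    """Normalized Scanpath Saliency: mean z-scored prediction at fixated pixels (higher is better)."""
+    s = (pred - pred.mean()) / (pred.std() + 1e-7)
+    return float(s[fixation_map.astype(bool)].mean())
+
+sal, gt = np.random.rand(120, 160), np.random.rand(120, 160)
+fix = np.zeros((120, 160)); fix[60, 80] = 1
+print(kld(sal, gt), nss(sal, fix))
+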
+
+ comment: Accepted at IEEE Intelligent Vehicles Symposium (IV), 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 168 + +
+
+
+ + ☆ GoMVS: Geometrically Consistent Cost Aggregation for Multi-View Stereo CVPR 2024 + + +
+ Matching cost aggregation plays a fundamental role in learning-based
+multi-view stereo networks. However, directly aggregating adjacent costs can
+lead to suboptimal results due to local geometric inconsistency. Related
+methods either seek selective aggregation or improve aggregated depth in the 2D
+space, both of which are unable to handle geometric inconsistency in the cost
+volume effectively. In this paper, we propose GoMVS to aggregate geometrically
+consistent costs, yielding better utilization of adjacent geometries. More
+specifically, we correspond and propagate adjacent costs to the reference pixel
+by leveraging the local geometric smoothness in conjunction with surface
+normals. We achieve this with the geometric consistent propagation (GCP)
+module. It computes the correspondence from the adjacent depth hypothesis space
+to the reference depth space using surface normals, then uses the
+correspondence to propagate adjacent costs to the reference geometry, followed
+by a convolution for aggregation. Our method achieves new state-of-the-art
+performance on the DTU, Tanks & Temples, and ETH3D datasets. Notably, our
+method ranks 1st on the Tanks & Temples Advanced benchmark.
+
+
+ comment: CVPR 2024. Project page: https://wuuu3511.github.io/gomvs/ Code: + https://github.com/Wuuu3511/GoMVS +
+
+
+
+
+ + ☆ Connecting NeRFs, Images, and Text CVPR + + +
+ Neural Radiance Fields (NeRFs) have emerged as a standard framework for +representing 3D scenes and objects, introducing a novel data type for +information exchange and storage. Concurrently, significant progress has been +made in multimodal representation learning for text and image data. This paper +explores a novel research direction that aims to connect the NeRF modality with +other modalities, similar to established methodologies for images and text. To +this end, we propose a simple framework that exploits pre-trained models for +NeRF representations alongside multimodal models for text and image processing. +Our framework learns a bidirectional mapping between NeRF embeddings and those +obtained from corresponding images and text. This mapping unlocks several novel +and useful applications, including NeRF zero-shot classification and NeRF +retrieval from images or text. + +
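+ A minimal sketch of such a bidirectional mapping, assuming placeholder
+embedding sizes and a simple cosine alignment loss (the paper's actual
+architecture and objective may differ):
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class BidirectionalMapper(nn.Module):
+    """Two small MLPs: NeRF embedding -> image/text space and back."""
+    def __init__(self, nerf_dim=1024, clip_dim=512, hidden=1024):
+        super().__init__()
+        self.to_clip = nn.Sequential(nn.Linear(nerf_dim, hidden), nn.GELU(), nn.Linear(hidden, clip_dim))
+        self.to_nerf = nn.Sequential(nn.Linear(clip_dim, hidden), nn.GELU(), nn.Linear(hidden, nerf_dim))
+
+    def forward(self, nerf_emb, clip_emb):
+        return self.to_clip(nerf_emb), self.to_nerf(clip_emb)
+
+def alignment_loss(pred, target):
+    return (1 - F.cosine_similarity(pred, target, dim=-1)).mean()
+
+mapper = BidirectionalMapper()
+nerf_emb, clip_emb = torch.randn(8, 1024), torch.randn(8, 512)
+p_clip, p_nerf = mapper(nerf_emb, clip_emb)
+(alignment_loss(p_clip, clip_emb) + alignment_loss(p_nerf, nerf_emb)).backward()
+
+ Once such a mapping is trained, NeRF zero-shot classification and retrieval
+reduce to a nearest-neighbour search in the shared image/text embedding space.
+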
+
+ comment: Accepted at CVPRW-INRV 2024 +
+
+
+
+
+ + ☆ GoMAvatar: Efficient Animatable Human Modeling from Monocular Video + Using Gaussians-on-Mesh CVPR 2024 + + +
+ We introduce GoMAvatar, a novel approach for real-time, memory-efficient, +high-quality animatable human modeling. GoMAvatar takes as input a single +monocular video to create a digital avatar capable of re-articulation in new +poses and real-time rendering from novel viewpoints, while seamlessly +integrating with rasterization-based graphics pipelines. Central to our method +is the Gaussians-on-Mesh representation, a hybrid 3D model combining rendering +quality and speed of Gaussian splatting with geometry modeling and +compatibility of deformable meshes. We assess GoMAvatar on ZJU-MoCap data and +various YouTube videos. GoMAvatar matches or surpasses current monocular human +modeling algorithms in rendering quality and significantly outperforms them in +computational efficiency (43 FPS) while being memory-efficient (3.63 MB per +subject). + +
+
+ comment: CVPR 2024; project page: https://wenj.github.io/GoMAvatar/ +
+
+
+
+
+ + ☆ OpenBias: Open-set Bias Detection in Text-to-Image Generative Models CVPR 2024 + + +
+ Text-to-image generative models are becoming increasingly popular and +accessible to the general public. As these models see large-scale deployments, +it is necessary to deeply investigate their safety and fairness to not +disseminate and perpetuate any kind of biases. However, existing works focus on +detecting closed sets of biases defined a priori, limiting the studies to +well-known concepts. In this paper, we tackle the challenge of open-set bias +detection in text-to-image generative models presenting OpenBias, a new +pipeline that identifies and quantifies the severity of biases agnostically, +without access to any precompiled set. OpenBias has three stages. In the first +phase, we leverage a Large Language Model (LLM) to propose biases given a set +of captions. Secondly, the target generative model produces images using the +same set of captions. Lastly, a Vision Question Answering model recognizes the +presence and extent of the previously proposed biases. We study the behavior of +Stable Diffusion 1.5, 2, and XL emphasizing new biases, never investigated +before. Via quantitative experiments, we demonstrate that OpenBias agrees with +current closed-set bias detection methods and human judgement. + +
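+ The three-stage pipeline can be summarized in pseudocode; every callable below
+is a hypothetical placeholder for the LLM, the audited generator, and the VQA
+model, respectively:
+
+def open_set_bias_audit(captions, propose_biases, generate_image, vqa):
+    """Sketch of an open-set bias audit in three stages.
+
+    propose_biases(captions) -> list of {"name", "question", "classes"} dicts (LLM stage)
+    generate_image(caption)  -> image from the audited text-to-image model
+    vqa(image, question)     -> answer string
+    """
+    proposed = propose_biases(captions)                   # stage 1: candidate biases
+    counts = {b["name"]: {c: 0 for c in b["classes"]} for b in proposed}
+    for caption in captions:
+        image = generate_image(caption)                   # stage 2: audited generations
+        for b in proposed:
+            answer = vqa(image, b["question"])            # stage 3: assess each bias
+            if answer in counts[b["name"]]:
+                counts[b["name"]][answer] += 1
+    return counts                                         # class frequencies per proposed bias
+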
+
+ comment: CVPR 2024 Highlight - Code: + https://github.com/Picsart-AI-Research/OpenBias +
+
+
+
+
+ + ☆ Any2Point: Empowering Any-modality Large Models for Efficient 3D + Understanding + + +
+ Large foundation models have recently emerged as a prominent focus of +interest, attaining superior performance in widespread scenarios. Due to the +scarcity of 3D data, many efforts have been made to adapt pre-trained +transformers from vision to 3D domains. However, such 2D-to-3D approaches are +still limited, due to the potential loss of spatial geometries and high +computation cost. More importantly, their frameworks are mainly designed for 2D +models, lacking a general any-to-3D paradigm. In this paper, we introduce +Any2Point, a parameter-efficient method to empower any-modality large models +(vision, language, audio) for 3D understanding. Given a frozen transformer from +any source modality, we propose a 3D-to-any (1D or 2D) virtual projection +strategy that correlates the input 3D points to the original 1D or 2D positions +within the source modality. This mechanism enables us to assign each 3D token +with a positional encoding paired with the pre-trained model, which avoids 3D +geometry loss caused by the true projection and better motivates the +transformer for 3D learning with 1D/2D positional priors. Then, within each +transformer block, we insert an any-to-3D guided adapter module for +parameter-efficient fine-tuning. The adapter incorporates prior spatial +knowledge from the source modality to guide the local feature aggregation of 3D +tokens, compelling the semantic adaption of any-modality transformers. We +conduct extensive experiments to showcase the effectiveness and efficiency of +our method. Code and models are released at +https://github.com/Ivan-Tang-3D/Any2Point. + +
+
+ comment: Code and models are released at + https://github.com/Ivan-Tang-3D/Any2Point +
+
+
+
+
+ + ☆ QuasiSim: Parameterized Quasi-Physical Simulators for Dexterous + Manipulations Transfer + + +
+ We explore the dexterous manipulation transfer problem by designing
+simulators. The task aims to transfer human manipulations to dexterous robot
+hand simulations and is inherently difficult due to its intricate,
+highly-constrained, and discontinuous dynamics and the need to control a
+dexterous hand with many degrees of freedom (DoFs) to accurately replicate
+human manipulations. Previous approaches that optimize in high-fidelity
+black-box simulators or a modified one with relaxed constraints only
+demonstrate limited capabilities or are restricted by insufficient simulation
+fidelity. We introduce parameterized quasi-physical simulators and a physics
+curriculum to overcome these limitations. The key ideas are 1) balancing
+between fidelity and optimizability of the simulation via a curriculum of
+parameterized simulators, and 2) solving the problem in each of the simulators
+from the curriculum, with properties ranging from high task optimizability to
+high fidelity. We successfully enable a dexterous hand to track complex and
+diverse manipulations in high-fidelity simulated environments, boosting the
+success rate by 11%+ over the best-performing baseline. The project website is
+available at https://meowuu7.github.io/QuasiSim/.
+
+
+ comment: Project website: https://meowuu7.github.io/QuasiSim/ Code: + https://github.com/Meowuu7/QuasiSim Hugging Face Demo: + https://huggingface.co/spaces/xymeow7/quasi-physical-sims +
+
+
+
+
+ + ☆ ControlNet++: Improving Conditional Controls with Efficient Consistency + Feedback + + +
+ To enhance the controllability of text-to-image diffusion models, existing +efforts like ControlNet incorporated image-based conditional controls. In this +paper, we reveal that existing methods still face significant challenges in +generating images that align with the image conditional controls. To this end, +we propose ControlNet++, a novel approach that improves controllable generation +by explicitly optimizing pixel-level cycle consistency between generated images +and conditional controls. Specifically, for an input conditional control, we +use a pre-trained discriminative reward model to extract the corresponding +condition of the generated images, and then optimize the consistency loss +between the input conditional control and extracted condition. A +straightforward implementation would be generating images from random noises +and then calculating the consistency loss, but such an approach requires +storing gradients for multiple sampling timesteps, leading to considerable time +and memory costs. To address this, we introduce an efficient reward strategy +that deliberately disturbs the input images by adding noise, and then uses the +single-step denoised images for reward fine-tuning. This avoids the extensive +costs associated with image sampling, allowing for more efficient reward +fine-tuning. Extensive experiments show that ControlNet++ significantly +improves controllability under various conditional controls. For example, it +achieves improvements over ControlNet by 7.9% mIoU, 13.4% SSIM, and 7.6% RMSE, +respectively, for segmentation mask, line-art edge, and depth conditions. + +
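+ Schematically, the efficient reward strategy can be written as the following
+single-step consistency loss; the diffusion and reward modules are placeholders
+rather than the released implementation:
+
+import torch.nn.functional as F
+
+def consistency_reward_loss(image, condition, diffusion, reward_model, t):
+    """Single-step reward fine-tuning loss (schematic).
+
+    diffusion.add_noise(x, t)                    -> noisy image at timestep t (placeholder)
+    diffusion.denoise_one_step(x, t, condition)  -> one-step estimate of the clean image (placeholder)
+    reward_model(x)                              -> condition extracted from x, e.g. a segmentation map
+    """
+    noisy = diffusion.add_noise(image, t)                       # deliberately disturb the input
+    denoised = diffusion.denoise_one_step(noisy, t, condition)  # single-step denoised image
+    extracted = reward_model(denoised)                          # condition of the generation
+    return F.mse_loss(extracted, condition)                     # pixel-level cycle consistency
+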
+
+ comment: Project Page: https://liming-ai.github.io/ControlNet_Plus_Plus +
+
+
+
+
+ + ☆ WaveMo: Learning Wavefront Modulations to See Through Scattering + + +
+ Imaging through scattering media is a fundamental and pervasive challenge in +fields ranging from medical diagnostics to astronomy. A promising strategy to +overcome this challenge is wavefront modulation, which induces measurement +diversity during image acquisition. Despite its importance, designing optimal +wavefront modulations to image through scattering remains under-explored. This +paper introduces a novel learning-based framework to address the gap. Our +approach jointly optimizes wavefront modulations and a computationally +lightweight feedforward "proxy" reconstruction network. This network is trained +to recover scenes obscured by scattering, using measurements that are modified +by these modulations. The learned modulations produced by our framework +generalize effectively to unseen scattering scenarios and exhibit remarkable +versatility. During deployment, the learned modulations can be decoupled from +the proxy network to augment other more computationally expensive restoration +algorithms. Through extensive experiments, we demonstrate our approach +significantly advances the state of the art in imaging through scattering +media. Our project webpage is at https://wavemo-2024.github.io/. + +
+
+
+
+
+ + ☆ View Selection for 3D Captioning via Diffusion Ranking + + +
+ Scalable annotation approaches are crucial for constructing extensive 3D-text
+datasets, facilitating a broader range of applications. However, existing
+methods sometimes lead to the generation of hallucinated captions, compromising
+caption quality. This paper explores the issue of hallucination in 3D object
+captioning, with a focus on the Cap3D method, which renders 3D objects into 2D
+views for captioning using pre-trained models. We pinpoint a major challenge:
+certain rendered views of 3D objects are atypical, deviating from the training
+data of standard image captioning models and causing hallucinations. To tackle
+this, we present DiffuRank, a method that leverages a pre-trained text-to-3D
+model to assess the alignment between 3D objects and their 2D rendered views,
+where views with high alignment closely represent the object's characteristics.
+By ranking all rendered views and feeding the top-ranked ones into GPT4-Vision,
+we enhance the accuracy and detail of captions, enabling the correction of 200k
+captions in the Cap3D dataset and extending it to 1 million captions across the
+Objaverse and Objaverse-XL datasets. Additionally, we showcase the adaptability
+of DiffuRank by applying it to pre-trained text-to-image models for a Visual
+Question Answering task, where it outperforms the CLIP model.
+
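+ The ranking step itself reduces to sorting views by an alignment score and
+keeping the top ones; the scorer below is a placeholder for the
+text-to-3D-based alignment estimate:
+
+def rank_views_for_captioning(rendered_views, alignment_score, top_k=6):
+    """Keep the rendered views that best represent the underlying 3D object.
+
+    rendered_views:  list of 2D renders of one object
+    alignment_score: callable(view) -> float, placeholder for the alignment estimate
+    """
+    ranked = sorted(rendered_views, key=alignment_score, reverse=True)
+    return ranked[:top_k]   # only these views are passed to the captioning model
+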
+
+ comment: Dataset link: https://huggingface.co/datasets/tiange/Cap3D +
+
+
+
+
+ + ☆ Two Effects, One Trigger: On the Modality Gap, Object Bias, and + Information Imbalance in Contrastive Vision-Language Representation Learning + + +
+ Contrastive vision-language models like CLIP have gained popularity for their
+versatile learned representations, which are applicable to various downstream
+tasks. Despite their successes in some tasks, like zero-shot image recognition,
+they also perform surprisingly poorly on other tasks, like attribute detection.
+Previous work has attributed these challenges to the modality gap, a separation
+of image and text in the shared representation space, and a bias towards
+objects over other factors, such as attributes. In this work, we investigate
+both phenomena. We find that only a few embedding dimensions drive the modality
+gap. Further, we propose a measure for object bias and find that object bias
+does not lead to worse performance on other concepts, such as attributes. But
+what leads to the emergence of the modality gap and object bias? To answer this
+question, we carefully designed an experimental setting which allows us to
+control the amount of shared information between the modalities. This revealed
+that the driving factor behind both the modality gap and the object bias is the
+information imbalance between images and captions.
+
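+ The modality gap is commonly quantified as the distance between the centroids
+of L2-normalized image and text embeddings; a small sketch of that measure (a
+common formulation, not necessarily the paper's exact definition):
+
+import numpy as np
+
+def modality_gap(image_emb, text_emb):
+    """Euclidean distance between the centroids of L2-normalized embeddings."""
+    img = image_emb / np.linalg.norm(image_emb, axis=1, keepdims=True)
+    txt = text_emb / np.linalg.norm(text_emb, axis=1, keepdims=True)
+    return float(np.linalg.norm(img.mean(axis=0) - txt.mean(axis=0)))
+
+print(modality_gap(np.random.randn(1000, 512), np.random.randn(1000, 512)))
+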
+
+
+
+
+ + ☆ Gaga: Group Any Gaussians via 3D-aware Memory Bank + + +
+ We introduce Gaga, a framework that reconstructs and segments open-world 3D
+scenes by leveraging inconsistent 2D masks predicted by zero-shot segmentation
+models. In contrast to prior 3D scene segmentation approaches that heavily rely
+on video object tracking, Gaga utilizes spatial information and effectively
+associates object masks across diverse camera poses. By eliminating the
+assumption of continuous view changes in training images, Gaga demonstrates
+robustness to variations in camera poses, which is particularly beneficial for
+sparsely sampled images, ensuring precise mask label consistency. Furthermore,
+Gaga accommodates 2D segmentation masks from diverse sources and demonstrates
+robust performance with different open-world zero-shot segmentation models,
+enhancing its versatility. Extensive qualitative and quantitative evaluations
+demonstrate that Gaga performs favorably against state-of-the-art methods,
+emphasizing its potential for real-world applications such as scene
+understanding and manipulation.
+
+
+ comment: Project Page: https://www.gaga.gallery +
+
+
+
+
+ + ☆ Self-supervised Dataset Distillation: A Good Compression Is All You Need + + +
+ Dataset distillation aims to compress information from a large-scale original
+dataset into a new compact dataset while striving to preserve the utmost degree
+of the original data's informational essence. Previous studies have
+predominantly concentrated on aligning the intermediate statistics between the
+original and distilled data, such as weight trajectory, features, gradient,
+BatchNorm, etc. In this work, we consider addressing this task through the new
+lens of model informativeness in the compression stage of the original dataset
+pretraining. We observe that with the prior state-of-the-art SRe$^2$L, as model
+sizes increase, it becomes increasingly challenging for supervised pretrained
+models to recover learned information during data synthesis, as the
+channel-wise mean and variance inside the model become flatter and less
+informative. We further notice that larger variances in BN statistics from
+self-supervised models enable larger loss signals to update the recovered data
+by gradients, enjoying more informativeness during synthesis. Building on this
+observation, we introduce SC-DD, a simple yet effective Self-supervised
+Compression framework for Dataset Distillation that facilitates diverse
+information compression and recovery compared to traditional supervised
+learning schemes, and further reaps the potential of large pretrained models
+with enhanced capabilities. Extensive experiments are conducted on the
+CIFAR-100, Tiny-ImageNet and ImageNet-1K datasets to demonstrate the
+superiority of our proposed approach. The proposed SC-DD outperforms all
+previous state-of-the-art supervised dataset distillation methods when
+employing larger models, such as SRe$^2$L, MTT, TESLA, DC, CAFE, etc., by large
+margins under the same recovery and post-training budgets. Code is available at
+https://github.com/VILA-Lab/SRe2L/tree/main/SCDD/.
+
+
+
+
+
+ + ☆ Ferret-v2: An Improved Baseline for Referring and Grounding with Large + Language Models + + +
+ While Ferret seamlessly integrates regional understanding into the Large
+Language Model (LLM) to facilitate its referring and grounding capability, it
+has certain limitations: it is constrained by the pre-trained fixed visual
+encoder and fails to perform well on broader tasks. In this work, we unveil
+Ferret-v2, a significant upgrade to Ferret, with three key designs. (1) Any
+resolution grounding and referring: A flexible approach that effortlessly
+handles higher image resolution, improving the model's ability to process and
+understand images in greater detail. (2) Multi-granularity visual encoding: By
+integrating the additional DINOv2 encoder, the model learns better and diverse
+underlying contexts for global and fine-grained visual information. (3) A
+three-stage training paradigm: Besides image-caption alignment, an additional
+stage is proposed for high-resolution dense alignment before the final
+instruction tuning. Experiments show that Ferret-v2 provides substantial
+improvements over Ferret and other state-of-the-art methods, thanks to its
+high-resolution scaling and fine-grained visual processing.
+
+
+ comment: Preprint. 14 pages, 4 figures +
+
+
+
+
+ + ☆ Taming Stable Diffusion for Text to 360° Panorama Image Generation CVPR 2024 + + +
+ Generative models, e.g., Stable Diffusion, have enabled the creation of +photorealistic images from text prompts. Yet, the generation of 360-degree +panorama images from text remains a challenge, particularly due to the dearth +of paired text-panorama data and the domain gap between panorama and +perspective images. In this paper, we introduce a novel dual-branch diffusion +model named PanFusion to generate a 360-degree image from a text prompt. We +leverage the stable diffusion model as one branch to provide prior knowledge in +natural image generation and register it to another panorama branch for +holistic image generation. We propose a unique cross-attention mechanism with +projection awareness to minimize distortion during the collaborative denoising +process. Our experiments validate that PanFusion surpasses existing methods +and, thanks to its dual-branch structure, can integrate additional constraints +like room layout for customized panorama outputs. Code is available at +https://chengzhag.github.io/publication/panfusion. + +
+
+ comment: CVPR 2024. Project Page: + https://chengzhag.github.io/publication/panfusion Code: + https://github.com/chengzhag/PanFusion +
+
+
+
+
+ + ☆ Boosting Self-Supervision for Single-View Scene Completion via Knowledge + Distillation + + +
+ Inferring scene geometry from images via Structure from Motion is a
+long-standing and fundamental problem in computer vision. While classical
+approaches and, more recently, depth map predictions only focus on the visible
+parts of a scene, the task of scene completion aims to reason about geometry
+even in occluded regions. With the popularity of neural radiance fields
+(NeRFs), implicit representations also became popular for scene completion by
+predicting so-called density fields. Unlike explicit approaches, e.g.,
+voxel-based methods, density fields also allow for accurate depth prediction
+and novel-view synthesis via image-based rendering. In this work, we propose to
+fuse the scene reconstruction from multiple images and distill this knowledge
+into a more accurate single-view scene reconstruction. To this end, we propose
+Multi-View Behind the Scenes (MVBTS) to fuse density fields from multiple posed
+images, trained fully self-supervised only from image data. Using knowledge
+distillation, we use MVBTS to train a single-view scene completion network via
+direct supervision called KDBTS. It achieves state-of-the-art performance on
+occupancy prediction, especially in occluded regions.
+
+
+
+
+
+ + ☆ FusionMamba: Efficient Image Fusion with State Space Model + + +
+ Image fusion aims to generate a high-resolution multi/hyper-spectral image by +combining a high-resolution image with limited spectral information and a +low-resolution image with abundant spectral data. Current deep learning +(DL)-based methods for image fusion primarily rely on CNNs or Transformers to +extract features and merge different types of data. While CNNs are efficient, +their receptive fields are limited, restricting their capacity to capture +global context. Conversely, Transformers excel at learning global information +but are hindered by their quadratic complexity. Fortunately, recent +advancements in the State Space Model (SSM), particularly Mamba, offer a +promising solution to this issue by enabling global awareness with linear +complexity. However, there have been few attempts to explore the potential of +SSM in information fusion, which is a crucial ability in domains like image +fusion. Therefore, we propose FusionMamba, an innovative method for efficient +image fusion. Our contributions mainly focus on two aspects. Firstly, +recognizing that images from different sources possess distinct properties, we +incorporate Mamba blocks into two U-shaped networks, presenting a novel +architecture that extracts spatial and spectral features in an efficient, +independent, and hierarchical manner. Secondly, to effectively combine spatial +and spectral information, we extend the Mamba block to accommodate dual inputs. +This expansion leads to the creation of a new module called the FusionMamba +block, which outperforms existing fusion techniques such as concatenation and +cross-attention. To validate FusionMamba's effectiveness, we conduct a series +of experiments on five datasets related to three image fusion tasks. The +quantitative and qualitative evaluation results demonstrate that our method +achieves state-of-the-art (SOTA) performance, underscoring the superiority of +FusionMamba. + +
+
+
+
+
+ + ☆ Parameter Hierarchical Optimization for Visible-Infrared Person + Re-Identification + + +
+ Visible-infrared person re-identification (VI-ReID) aims at matching
+cross-modality pedestrian images captured by disjoint visible or infrared
+cameras. Existing methods alleviate the cross-modality discrepancies via
+designing different kinds of network architectures. Different from available
+methods, in this paper, we propose a novel parameter optimization paradigm, the
+parameter hierarchical optimization (PHO) method, for the task of VI-ReID. It
+allows part of the parameters to be directly optimized without any training,
+which narrows the parameter search space and makes the whole network easier to
+train. Specifically, we first divide the parameters into different types, and
+then introduce a self-adaptive alignment strategy (SAS) to automatically align
+the visible and infrared images through transformation. Considering that
+features in different dimensions have varying importance, we develop an
+auto-weighted alignment learning (AAL) module that can automatically weight
+features according to their importance. Importantly, in the alignment process
+of SAS and AAL, all the parameters are immediately optimized with optimization
+principles rather than by training the whole network, which yields a better way
+of training the parameters. Furthermore, we establish the cross-modality
+consistent learning (CCL) loss to extract discriminative person representations
+with translation consistency. We provide both theoretical justification and
+empirical evidence that our proposed PHO method outperforms existing VI-ReID
+approaches.
+
+
+
+
+
+ + ☆ LaVy: Vietnamese Multimodal Large Language Model + + +
+ Large Language Models (LLMs) and Multimodal Large Language Models (MLLMs)
+have taken the world by storm with impressive abilities in complex reasoning
+and linguistic comprehension. While there is a plethora of work on Vietnamese
+Large Language Models, the lack of high-quality multimodal resources limits the
+progress of Vietnamese MLLMs. In this paper, we pioneer in addressing this by
+introducing LaVy, a state-of-the-art Vietnamese MLLM, and we also introduce
+LaVy-Bench, a benchmark designed for evaluating MLLMs' understanding of
+Vietnamese visual language tasks. All code and model weights are public at
+https://github.com/baochi0212/LaVy
+
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Context-aware Video Anomaly Detection in Long-Term Datasets + + +
+ Video anomaly detection research is generally evaluated on short, isolated +benchmark videos only a few minutes long. However, in real-world environments, +security cameras observe the same scene for months or years at a time, and the +notion of anomalous behavior critically depends on context, such as the time of +day, day of week, or schedule of events. Here, we propose a context-aware video +anomaly detection algorithm, Trinity, specifically targeted to these scenarios. +Trinity is especially well-suited to crowded scenes in which individuals cannot +be easily tracked, and anomalies are due to speed, direction, or absence of +group motion. Trinity is a contrastive learning framework that aims to learn +alignments between context, appearance, and motion, and uses alignment quality +to classify videos as normal or anomalous. We evaluate our algorithm on both +conventional benchmarks and a public webcam-based dataset we collected that +spans more than three months of activity. + +
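+ Aligning two of the three streams (e.g. context and motion) with a symmetric
+InfoNCE objective could look like the sketch below; embedding sizes and the
+temperature are placeholder choices, not the paper's settings:
+
+import torch
+import torch.nn.functional as F
+
+def info_nce(z_a, z_b, temperature=0.07):
+    """Symmetric InfoNCE loss aligning two batches of paired embeddings."""
+    z_a, z_b = F.normalize(z_a, dim=-1), F.normalize(z_b, dim=-1)
+    logits = z_a @ z_b.t() / temperature           # (B, B) pairwise similarities
+    labels = torch.arange(z_a.size(0))             # matching pairs lie on the diagonal
+    return 0.5 * (F.cross_entropy(logits, labels) + F.cross_entropy(logits.t(), labels))
+
+loss = info_nce(torch.randn(32, 128), torch.randn(32, 128))
+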
+
+
+
+
+ + ☆ The Power of Properties: Uncovering the Influential Factors in Emotion + Classification ICPR + + +
+ Facial expression-based human emotion recognition is a critical research area +in psychology and medicine. State-of-the-art classification performance is only +reached by end-to-end trained neural networks. Nevertheless, such black-box +models lack transparency in their decision-making processes, prompting efforts +to ascertain the rules that underlie classifiers' decisions. Analyzing single +inputs alone fails to expose systematic learned biases. These biases can be +characterized as facial properties summarizing abstract information like age or +medical conditions. Therefore, understanding a model's prediction behavior +requires an analysis rooted in causality along such selected properties. We +demonstrate that up to 91.25% of classifier output behavior changes are +statistically significant concerning basic properties. Among those are age, +gender, and facial symmetry. Furthermore, the medical usage of surface +electromyography significantly influences emotion prediction. We introduce a +workflow to evaluate explicit properties and their impact. These insights might +help medical professionals select and apply classifiers regarding their +specialized data and properties. + +
+
+ comment: 8 pages, 3 tables, 1 figure, accepted at ICPRAI 2024 +
+
+
+
+
+ + ☆ Resolve Domain Conflicts for Generalizable Remote Physiological + Measurement ACM MM 2023 + + +
+ Remote photoplethysmography (rPPG) technology has become increasingly popular +due to its non-invasive monitoring of various physiological indicators, making +it widely applicable in multimedia interaction, healthcare, and emotion +analysis. Existing rPPG methods utilize multiple datasets for training to +enhance the generalizability of models. However, they often overlook the +underlying conflict issues across different datasets, such as (1) label +conflict resulting from different phase delays between physiological signal +labels and face videos at the instance level, and (2) attribute conflict +stemming from distribution shifts caused by head movements, illumination +changes, skin types, etc. To address this, we introduce the DOmain-HArmonious +framework (DOHA). Specifically, we first propose a harmonious phase strategy to +eliminate uncertain phase delays and preserve the temporal variation of +physiological signals. Next, we design a harmonious hyperplane optimization +that reduces irrelevant attribute shifts and encourages the model's +optimization towards a global solution that fits more valid scenarios. Our +experiments demonstrate that DOHA significantly improves the performance of +existing methods under multiple protocols. Our code is available at +https://github.com/SWY666/rPPG-DOHA. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ MindBridge: A Cross-Subject Brain Decoding Framework CVPR 2024 + + +
+ Brain decoding, a pivotal field in neuroscience, aims to reconstruct stimuli
+from acquired brain signals, primarily utilizing functional magnetic resonance
+imaging (fMRI). Currently, brain decoding is confined to a
+per-subject-per-model paradigm, limiting its applicability to the same
+individual for whom the decoding model is trained. This constraint stems from
+three key challenges: 1) the inherent variability in input dimensions across
+subjects due to differences in brain size; 2) the unique intrinsic neural
+patterns, influencing how different individuals perceive and process sensory
+information; and 3) the limited data availability for new subjects in
+real-world scenarios, which hampers the performance of decoding models. In this
+paper, we present a novel approach, MindBridge, that achieves cross-subject
+brain decoding by employing only one model. Our proposed framework establishes
+a generic paradigm capable of addressing these challenges by introducing a
+biologically-inspired aggregation function and a novel cyclic fMRI
+reconstruction mechanism for subject-invariant representation learning.
+Notably, by cycle reconstruction of fMRI, MindBridge can enable novel fMRI
+synthesis, which can also serve as pseudo data augmentation. Within the
+framework, we also devise a novel reset-tuning method for adapting a pretrained
+model to a new subject. Experimental results demonstrate MindBridge's ability
+to reconstruct images for multiple subjects, which is competitive with
+dedicated subject-specific models. Furthermore, with limited data for a new
+subject, we achieve a high level of decoding accuracy, surpassing that of
+subject-specific models. This advancement in cross-subject brain decoding
+suggests promising directions for wider applications in neuroscience and
+indicates potential for more efficient utilization of limited fMRI data in
+real-world scenarios. Project page:
+https://littlepure2333.github.io/MindBridge
+
+
+ comment: CVPR 2024 highlight. Code is available at + https://github.com/littlepure2333/MindBridge +
+
+
+
+
+ + ☆ Fuss-Free Network: A Simplified and Efficient Neural Network for Crowd + Counting + + +
+ In the field of crowd-counting research, many recent deep learning-based
+methods have demonstrated robust capabilities for accurately estimating crowd
+sizes. However, the enhancement in their performance often arises from an
+increase in the complexity of the model structure. This paper introduces the
+Fuss-Free Network (FFNet), a crowd counting deep learning model characterized
+by the simplicity and efficiency of its structure. The model comprises only a
+neural network backbone and a multi-scale feature fusion structure. The
+multi-scale feature fusion structure is a simple architecture consisting of
+three branches, each equipped only with a focus transition module, and combines
+the features from these branches through a concatenation operation. Our
+proposed crowd counting model is trained and evaluated on four widely used
+public datasets, and it achieves accuracy comparable to that of existing
+complex models. The experimental results further indicate that excellent
+performance in crowd counting tasks can also be achieved by utilizing a simple,
+low-parameter, and computationally efficient neural network structure.
+
+
+
+
+
+ + ☆ TBSN: Transformer-Based Blind-Spot Network for Self-Supervised Image + Denoising + + +
+ Blind-spot networks (BSN) have been prevalent network architectures in +self-supervised image denoising (SSID). Existing BSNs are mostly conducted with +convolution layers. Although transformers offer potential solutions to the +limitations of convolutions and have demonstrated success in various image +restoration tasks, their attention mechanisms may violate the blind-spot +requirement, thus restricting their applicability in SSID. In this paper, we +present a transformer-based blind-spot network (TBSN) by analyzing and +redesigning the transformer operators that meet the blind-spot requirement. +Specifically, TBSN follows the architectural principles of dilated BSNs, and +incorporates spatial as well as channel self-attention layers to enhance the +network capability. For spatial self-attention, an elaborate mask is applied to +the attention matrix to restrict its receptive field, thus mimicking the +dilated convolution. For channel self-attention, we observe that it may leak +the blind-spot information when the channel number is greater than spatial size +in the deep layers of multi-scale architectures. To eliminate this effect, we +divide the channel into several groups and perform channel attention +separately. Furthermore, we introduce a knowledge distillation strategy that +distills TBSN into smaller denoisers to improve computational efficiency while +maintaining performance. Extensive experiments on real-world image denoising +datasets show that TBSN largely extends the receptive field and exhibits +favorable performance against state-of-the-art SSID methods. The code and +pre-trained models will be publicly available at +https://github.com/nagejacob/TBSN. + +
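+ A simplified sketch of spatial self-attention with a mask on the attention
+matrix (the actual mask design in TBSN, which mimics a dilated convolution, is
+more involved than the toy diagonal mask used here):
+
+import torch
+import torch.nn.functional as F
+
+def masked_self_attention(q, k, v, mask):
+    """Self-attention where positions with mask == 0 are excluded before softmax.
+
+    q, k, v: (B, N, D) token embeddings; mask: (N, N) binary matrix.
+    """
+    attn = (q @ k.transpose(-2, -1)) * q.size(-1) ** -0.5
+    attn = attn.masked_fill(mask == 0, float("-inf"))
+    return F.softmax(attn, dim=-1) @ v
+
+B, N, D = 2, 64, 32
+mask = torch.ones(N, N)
+mask.fill_diagonal_(0)   # toy blind-spot: a token may not attend to itself
+out = masked_self_attention(torch.randn(B, N, D), torch.randn(B, N, D), torch.randn(B, N, D), mask)
+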
+
+
+
+
+ + ☆ Streamlined Photoacoustic Image Processing with Foundation Models: A + Training-Free Solution + + +
+ Foundation models have rapidly evolved and have achieved significant +accomplishments in computer vision tasks. Specifically, the prompt mechanism +conveniently allows users to integrate image prior information into the model, +making it possible to apply models without any training. Therefore, we propose +a method based on foundation models and zero training to solve the tasks of +photoacoustic (PA) image segmentation. We employed the segment anything model +(SAM) by setting simple prompts and integrating the model's outputs with prior +knowledge of the imaged objects to accomplish various tasks, including: (1) +removing the skin signal in three-dimensional PA image rendering; (2) dual +speed-of-sound reconstruction, and (3) segmentation of finger blood vessels. +Through these demonstrations, we have concluded that deep learning can be +directly applied in PA imaging without the requirement for network design and +training. This potentially allows for a hands-on, convenient approach to +achieving efficient and accurate segmentation of PA images. This letter serves +as a comprehensive tutorial, facilitating the mastery of the technique through +the provision of code and sample datasets. + +
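+ With the public segment_anything package, a prompted prediction of the kind
+described above looks roughly as follows; the checkpoint path and the prompt
+coordinates are placeholders, and a 2D slice or projection of the PA volume is
+assumed as input:
+
+import numpy as np
+from segment_anything import sam_model_registry, SamPredictor
+
+sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b.pth")   # placeholder checkpoint path
+predictor = SamPredictor(sam)
+
+image = np.zeros((512, 512, 3), dtype=np.uint8)   # stand-in for a 2D PA projection/slice
+predictor.set_image(image)
+
+masks, scores, _ = predictor.predict(
+    point_coords=np.array([[256, 256]]),          # one positive point on the structure of interest
+    point_labels=np.array([1]),
+    multimask_output=True,
+)
+best_mask = masks[scores.argmax()]                # keep the highest-scoring mask
+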
+
+
+
+
+ + ☆ Heron-Bench: A Benchmark for Evaluating Vision Language Models in + Japanese + + +
+ Vision Language Models (VLMs) have undergone a rapid evolution, giving rise
+to significant advancements in the realm of multimodal understanding tasks.
+However, the majority of these models are trained and evaluated on
+English-centric datasets, leaving a gap in the development and evaluation of
+VLMs for other languages, such as Japanese. This gap can be attributed to the
+lack of methodologies for constructing VLMs and the absence of benchmarks to
+accurately measure their performance. To address this issue, we introduce a
+novel benchmark, Japanese Heron-Bench, for evaluating Japanese capabilities of
+VLMs. The Japanese Heron-Bench consists of a variety of image-question-answer
+pairs tailored to the Japanese context. Additionally, we present a baseline
+Japanese VLM that has been trained with Japanese visual instruction tuning
+datasets. Our Heron-Bench reveals the strengths and limitations of the proposed
+VLM across various ability dimensions. Furthermore, we clarify the capability
+gap between strong closed models like GPT-4V and the baseline model, providing
+valuable insights for future research in this domain. We release the benchmark
+dataset and training code to facilitate further developments in Japanese VLM
+research.
+
+
+
+
+
+ + ☆ Sparse Laneformer + + +
+ Lane detection is a fundamental task in autonomous driving, and has achieved +great progress as deep learning emerges. Previous anchor-based methods often +design dense anchors, which highly depend on the training dataset and remain +fixed during inference. We analyze that dense anchors are not necessary for +lane detection, and propose a transformer-based lane detection framework based +on a sparse anchor mechanism. To this end, we generate sparse anchors with +position-aware lane queries and angle queries instead of traditional explicit +anchors. We adopt Horizontal Perceptual Attention (HPA) to aggregate the lane +features along the horizontal direction, and adopt Lane-Angle Cross Attention +(LACA) to perform interactions between lane queries and angle queries. We also +propose Lane Perceptual Attention (LPA) based on deformable cross attention to +further refine the lane predictions. Our method, named Sparse Laneformer, is +easy-to-implement and end-to-end trainable. Extensive experiments demonstrate +that Sparse Laneformer performs favorably against the state-of-the-art methods, +e.g., surpassing Laneformer by 3.0% F1 score and O2SFormer by 0.7% F1 score +with fewer MACs on CULane with the same ResNet-34 backbone. + +
+
+
+
+
+ + ☆ Voice-Assisted Real-Time Traffic Sign Recognition System Using + Convolutional Neural Network + + +
+ Traffic signs are important in communicating information to drivers. Thus,
+comprehension of traffic signs is essential for road safety, and ignoring them
+may result in road accidents. Traffic sign detection has been a research
+spotlight over the past few decades. Real-time and accurate detection is a
+prerequisite of a robust traffic sign detection system, which is yet to be
+achieved. This study presents a voice-assisted real-time traffic sign
+recognition system capable of assisting drivers. The system consists of two
+subsystems. First, the detection and recognition of traffic signs are carried
+out using a trained Convolutional Neural Network (CNN). After recognizing the
+specific traffic sign, it is narrated to the driver as a voice message using a
+text-to-speech engine. An efficient CNN model for a benchmark dataset is
+developed for real-time detection and recognition using Deep Learning
+techniques. The advantage of this system is that even if the driver misses a
+traffic sign, does not look at it, or is unable to comprehend it, the system
+detects it and narrates it to the driver. A system of this type is also
+important in the development of autonomous vehicles.
+
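+ The two-subsystem design (CNN recognition followed by text-to-speech
+narration) can be sketched as below; the model file, class names, and the use
+of pyttsx3 are illustrative assumptions rather than the authors' exact setup:
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import pyttsx3
+
+model = torch.jit.load("traffic_sign_cnn.pt").eval()           # hypothetical trained CNN
+class_names = ["speed limit 50", "stop", "yield", "no entry"]  # hypothetical label set
+
+def recognize_and_narrate(frame_bgr):
+    """Classify one camera frame and read the predicted sign aloud."""
+    rgb = np.ascontiguousarray(frame_bgr[..., ::-1], dtype=np.float32) / 255.0
+    x = torch.from_numpy(rgb).permute(2, 0, 1).unsqueeze(0)     # (1, 3, H, W)
+    x = F.interpolate(x, size=(32, 32), mode="bilinear", align_corners=False)
+    with torch.no_grad():
+        label = class_names[int(model(x).argmax(dim=1))]
+    engine = pyttsx3.init()
+    engine.say(f"Traffic sign ahead: {label}")
+    engine.runAndWait()
+    return label
+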
+
+
+
+
+ + ☆ DGMamba: Domain Generalization via Generalized State Space Model + + +
+ Domain generalization (DG) aims at solving distribution shift problems in
+various scenes. Existing approaches are based on Convolutional Neural Networks
+(CNNs) or Vision Transformers (ViTs), which suffer from limited receptive
+fields or quadratic complexity issues. Mamba, as an emerging state space model
+(SSM), possesses superior linear complexity and global receptive fields.
+Despite this, it can hardly be applied to DG to address distribution shifts,
+due to the hidden state issues and inappropriate scan mechanisms. In this
+paper, we propose a novel framework for DG, named DGMamba, that excels in
+strong generalizability toward unseen domains while retaining the advantages of
+global receptive fields and efficient linear complexity. Our DGMamba comprises
+two core components: Hidden State Suppressing (HSS) and Semantic-aware Patch
+refining (SPR). In particular, HSS is introduced to mitigate the influence of
+hidden states associated with domain-specific features during output
+prediction. SPR strives to encourage the model to concentrate more on objects
+rather than context, and consists of two designs: Prior-Free Scanning (PFS) and
+Domain Context Interchange (DCI). Concretely, PFS aims to shuffle the
+non-semantic patches within images, creating more flexible and effective
+sequences from images, and DCI is designed to regularize Mamba with a
+combination of mismatched non-semantic and semantic information by fusing
+patches among domains. Extensive experiments on four commonly used DG
+benchmarks demonstrate that the proposed DGMamba achieves remarkably superior
+results to state-of-the-art models. The code will be made publicly available.
+
+
+
+
+
+ + ☆ VIFNet: An End-to-end Visible-Infrared Fusion Network for Image Dehazing + + +
+ Image dehazing poses significant challenges in environmental perception.
+Recent research mainly focuses on deep learning-based methods with a single
+modality, which may result in severe information loss, especially in dense-haze
+scenarios. Infrared images are robust to haze; however, existing methods have
+primarily treated the infrared modality as auxiliary information, failing to
+fully explore its rich information for dehazing. To address this challenge, the
+key insight of this study is to design a visible-infrared fusion network for
+image dehazing. In particular, we propose a multi-scale Deep Structure Feature
+Extraction (DSFE) module, which incorporates the Channel-Pixel Attention Block
+(CPAB) to restore more spatial and marginal information within the deep
+structural features. Additionally, we introduce an inconsistency weighted
+fusion strategy to merge the two modalities by leveraging the more reliable
+information. To validate this, we construct a visible-infrared multimodal
+dataset called AirSim-VID based on the AirSim simulation platform. Extensive
+experiments performed on challenging real and simulated image datasets
+demonstrate that VIFNet can outperform many state-of-the-art competing methods.
+The code and dataset are available at
+https://github.com/mengyu212/VIFNet_dehazing.
+
+
+
+
+
+ + ☆ AUG: A New Dataset and An Efficient Model for Aerial Image Urban Scene + Graph Generation + + +
+ Scene graph generation (SGG) aims to understand the visual objects and their
+semantic relationships from one given image. To date, many SGG datasets with an
+eye-level view have been released, but SGG datasets with an overhead view are
+scarcely studied. In contrast to the eye-level view, where object occlusion
+impedes SGG, the overhead view provides a new perspective that helps to promote
+SGG by providing a clear perception of the spatial relationships of objects in
+the ground scene. To fill the gap in overhead-view datasets, this paper
+constructs and releases an aerial image urban scene graph generation (AUG)
+dataset. Images from the AUG dataset are captured with a low-altitude overhead
+view. In the AUG dataset, 25,594 objects, 16,970 relationships, and 27,175
+attributes are manually annotated. To avoid the local context being overwhelmed
+in the complex aerial urban scene, this paper proposes a new
+locality-preserving graph convolutional network (LPG). Different from the
+traditional graph convolutional network, which has the natural advantage of
+capturing the global context for SGG, the convolutional layer in the LPG
+integrates the non-destructive initial features of the objects with dynamically
+updated neighborhood information to preserve the local context under the
+premise of mining the global context. To address the problem that there exists
+an extra-large number of potential object relationship pairs but only a small
+fraction of them is meaningful in AUG, we propose the adaptive bounding box
+scaling factor for potential relationship detection (ABS-PRD) to intelligently
+prune the meaningless relationship pairs. Extensive experiments on the AUG
+dataset show that our LPG can significantly outperform state-of-the-art methods
+and demonstrate the effectiveness of the proposed locality-preserving strategy.
+
+
+
+
+
+ + ☆ PRAM: Place Recognition Anywhere Model for Efficient Visual Localization + + +
+ Humans localize themselves efficiently in known environments by first +recognizing landmarks defined on certain objects and their spatial +relationships, and then verifying the location by aligning detailed structures +of recognized objects with those in the memory. Inspired by this, we propose +the place recognition anywhere model (PRAM) to perform visual localization as +efficiently as humans do. PRAM consists of two main components - recognition +and registration. In detail, first of all, a self-supervised map-centric +landmark definition strategy is adopted, making places in either indoor or +outdoor scenes act as unique landmarks. Then, sparse keypoints extracted from +images, are utilized as the input to a transformer-based deep neural network +for landmark recognition; these keypoints enable PRAM to recognize hundreds of +landmarks with high time and memory efficiency. Keypoints along with recognized +landmark labels are further used for registration between query images and the +3D landmark map. Different from previous hierarchical methods, PRAM discards +global and local descriptors, and reduces over 90% storage. Since PRAM utilizes +recognition and landmark-wise verification to replace global reference search +and exhaustive matching respectively, it runs 2.4 times faster than prior +state-of-the-art approaches. Moreover, PRAM opens new directions for visual +localization including multi-modality localization, map-centric feature +learning, and hierarchical scene coordinate regression. + +
+
+ comment: project page: https://feixue94.github.io/pram-project/ +
+
+
+
+
+ + ☆ ConsistencyDet: Robust Object Detector with Denoising Paradigm of + Consistency Model + + +
+ Object detection, a quintessential task in the realm of perceptual computing, +can be tackled using a generative methodology. In the present study, we +introduce a novel framework designed to articulate object detection as a +denoising diffusion process, which operates on perturbed bounding boxes of +annotated entities. This framework, termed ConsistencyDet, leverages an +innovative denoising concept known as the Consistency Model. The hallmark of +this model is its self-consistency feature, which empowers the model to map +distorted information from any temporal stage back to its pristine state, +thereby realizing a ``one-step denoising'' mechanism. Such an attribute +markedly elevates the operational efficiency of the model, setting it apart +from the conventional Diffusion Model. Throughout the training phase, +ConsistencyDet initiates the diffusion sequence with noise-infused boxes +derived from the ground-truth annotations and conditions the model to perform +the denoising task. Subsequently, in the inference stage, the model employs a +denoising sampling strategy that commences with bounding boxes randomly sampled +from a normal distribution. Through iterative refinement, the model transforms +an assortment of arbitrarily generated boxes into the definitive detections. +Comprehensive evaluations employing standard benchmarks, such as MS-COCO and +LVIS, corroborate that ConsistencyDet surpasses other leading-edge detectors in +performance metrics. + +
+
+
+
+
+ + ☆ Joint Conditional Diffusion Model for Image Restoration with Mixed + Degradations + + +
+ Image restoration is rather challenging in adverse weather conditions, +especially when multiple degradations occur simultaneously. Blind image +decomposition was proposed to tackle this issue, however, its effectiveness +heavily relies on the accurate estimation of each component. Although +diffusion-based models exhibit strong generative abilities in image restoration +tasks, they may generate irrelevant contents when the degraded images are +severely corrupted. To address these issues, we leverage physical constraints +to guide the whole restoration process, where a mixed degradation model based +on atmosphere scattering model is constructed. Then we formulate our Joint +Conditional Diffusion Model (JCDM) by incorporating the degraded image and +degradation mask to provide precise guidance. To achieve better color and +detail recovery results, we further integrate a refinement network to +reconstruct the restored image, where Uncertainty Estimation Block (UEB) is +employed to enhance the features. Extensive experiments performed on both +multi-weather and weather-specific datasets demonstrate the superiority of our +method over state-of-the-art competing methods. + +
+
+
+
+
+ + ☆ RMAFF-PSN: A Residual Multi-Scale Attention Feature Fusion Photometric + Stereo Network + + +
+ Predicting accurate normal maps of objects from two-dimensional images in +regions of complex structure and spatial material variations is challenging +using photometric stereo methods due to the influence of surface reflection +properties caused by variations in object geometry and surface materials. To +address this issue, we propose a photometric stereo network called a RMAFF-PSN +that uses residual multiscale attentional feature fusion to handle the +``difficult'' regions of the object. Unlike previous approaches that only use +stacked convolutional layers to extract deep features from the input image, our +method integrates feature information from different resolution stages and +scales of the image. This approach preserves more physical information, such as +texture and geometry of the object in complex regions, through shallow-deep +stage feature extraction, double branching enhancement, and attention +optimization. To test the network structure under real-world conditions, we +propose a new real dataset called Simple PS data, which contains multiple +objects with varying structures and materials. Experimental results on a +publicly available benchmark dataset demonstrate that our method outperforms +most existing calibrated photometric stereo methods for the same number of +input images, especially in the case of highly non-convex object structures. +Our method also obtains good results under sparse lighting conditions. + +
+
+ comment: 17 pages,12 figures +
+
+
+
+
+ + ☆ NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous + Driving + + +
+ We present a versatile NeRF-based simulator for testing autonomous driving +(AD) software systems, designed with a focus on sensor-realistic closed-loop +evaluation and the creation of safety-critical scenarios. The simulator learns +from sequences of real-world driving sensor data and enables reconfigurations +and renderings of new, unseen scenarios. In this work, we use our simulator to +test the responses of AD models to safety-critical scenarios inspired by the +European New Car Assessment Programme (Euro NCAP). Our evaluation reveals that, +while state-of-the-art end-to-end planners excel in nominal driving scenarios +in an open-loop setting, they exhibit critical flaws when navigating our +safety-critical scenarios in a closed-loop setting. This highlights the need +for advancements in the safety and real-world usability of end-to-end planners. +By publicly releasing our simulator and scenarios as an easy-to-run evaluation +suite, we invite the research community to explore, refine, and validate their +AD models in controlled, yet highly configurable and challenging +sensor-realistic environments. Code and instructions can be found at +https://github.com/wljungbergh/NeuroNCAP + +
+
+
+
+
+ + ☆ Generating Synthetic Satellite Imagery With Deep-Learning Text-to-Image + Models -- Technical Challenges and Implications for Monitoring and + Verification + + +
+ Novel deep-learning (DL) architectures have reached a level where they can +generate digital media, including photorealistic images, that are difficult to +distinguish from real data. These technologies have already been used to +generate training data for Machine Learning (ML) models, and large +text-to-image models like DALL-E 2, Imagen, and Stable Diffusion are achieving +remarkable results in realistic high-resolution image generation. Given these +developments, issues of data authentication in monitoring and verification +deserve a careful and systematic analysis: How realistic are synthetic images? +How easily can they be generated? How useful are they for ML researchers, and +what is their potential for Open Science? In this work, we use novel DL models +to explore how synthetic satellite images can be created using conditioning +mechanisms. We investigate the challenges of synthetic satellite image +generation and evaluate the results based on authenticity and state-of-the-art +metrics. Furthermore, we investigate how synthetic data can alleviate the lack +of data in the context of ML methods for remote-sensing. Finally we discuss +implications of synthetic satellite imagery in the context of monitoring and +verification. + +
+
+ comment: https://resources.inmm.org/annual-meeting-proceedings/generating-synthetic-satellite-imagery-deep-learning-text-image-models +
+
+
+
+
+ + ☆ 3D-CSAD: Untrained 3D Anomaly Detection for Complex Manufacturing + Surfaces + + +
+ The surface quality inspection of manufacturing parts based on 3D point cloud +data has attracted increasing attention in recent years. The reason is that the +3D point cloud can capture the entire surface of manufacturing parts, unlike +the previous practices that focus on some key product characteristics. However, +achieving accurate 3D anomaly detection is challenging, due to the complex +surfaces of manufacturing parts and the difficulty of collecting sufficient +anomaly samples. To address these challenges, we propose a novel untrained +anomaly detection method based on 3D point cloud data for complex manufacturing +parts, which can achieve accurate anomaly detection in a single sample without +training data. In the proposed framework, we transform an input sample into two +sets of profiles along different directions. Based on one set of the profiles, +a novel segmentation module is devised to segment the complex surface into +multiple basic and simple components. In each component, another set of +profiles, which have the nature of similar shapes, can be modeled as a low-rank +matrix. Thus, accurate 3D anomaly detection can be achieved by using Robust +Principal Component Analysis (RPCA) on these low-rank matrices. Extensive +numerical experiments on different types of parts show that our method achieves +promising results compared with the benchmark methods. + +
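+ For readers unfamiliar with the RPCA step mentioned above, the sketch below decomposes a matrix of stacked, similarly shaped profiles into a low-rank part plus a sparse residual using the standard inexact-ALM algorithm, and flags large sparse entries as candidate anomalies. This is generic RPCA under illustrative settings, not the paper's full segmentation-and-profile pipeline.
+
+```python
+import numpy as np
+
+def rpca_ialm(M, lam=None, tol=1e-7, max_iter=500):
+    """Robust PCA via inexact ALM: M is approximated by L (low rank) + S (sparse)."""
+    m, n = M.shape
+    if lam is None:
+        lam = 1.0 / np.sqrt(max(m, n))
+    norm_M = np.linalg.norm(M, "fro")
+    Y = M / max(np.linalg.norm(M, 2), np.abs(M).max() / lam)
+    mu, rho = 1.25 / np.linalg.norm(M, 2), 1.5
+    L, S = np.zeros_like(M), np.zeros_like(M)
+    for _ in range(max_iter):
+        # Singular-value thresholding updates the low-rank component.
+        U, sig, Vt = np.linalg.svd(M - S + Y / mu, full_matrices=False)
+        L = (U * np.maximum(sig - 1.0 / mu, 0.0)) @ Vt
+        # Entrywise soft thresholding updates the sparse (anomaly) component.
+        T = M - L + Y / mu
+        S = np.sign(T) * np.maximum(np.abs(T) - lam / mu, 0.0)
+        Z = M - L - S
+        Y = Y + mu * Z
+        mu = mu * rho
+        if np.linalg.norm(Z, "fro") / norm_M < tol:
+            break
+    return L, S
+
+# profiles: one similarly shaped surface profile per row (illustrative data).
+profiles = np.random.rand(200, 512)
+L, S = rpca_ialm(profiles)
+anomaly_mask = np.abs(S) > 3 * S.std()          # illustrative threshold
+```
+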
+
+
+
+
+ + ☆ Exploiting Object-based and Segmentation-based Semantic Features for + Deep Learning-based Indoor Scene Classification + + +
+ Indoor scenes are usually characterized by scattered objects and their +relationships, which turns the indoor scene classification task into a +challenging computer vision task. Despite the significant performance boost in +classification tasks achieved in recent years by the use of +deep-learning-based methods, limitations such as inter-category ambiguity and +intra-category variation have been holding back their performance. To overcome +such issues, gathering semantic information has been shown to be a promising +source of information towards a more complete and discriminative feature +representation of indoor scenes. Therefore, the work described in this paper +uses semantic information obtained from both object detection and semantic +segmentation techniques. While object detection techniques provide the 2D +location of objects, from which spatial distributions between objects can be obtained, +semantic segmentation techniques provide pixel-level information that yields, +at the pixel level, the spatial distribution and shape-related features of +the segmentation categories. Hence, a novel approach that uses a semantic +segmentation mask to provide Hu-moments-based segmentation categories' shape +characterization, designated by Segmentation-based Hu-Moments Features (SHMFs), +is proposed. Moreover, a three-main-branch network, designated by +GOS$^2$F$^2$App, that exploits deep-learning-based global features, +object-based features, and semantic segmentation-based features is also +proposed. GOS$^2$F$^2$App was evaluated in two indoor scene benchmark datasets: +SUN RGB-D and NYU Depth V2, where, to the best of our knowledge, +state-of-the-art results were achieved on both datasets, which presents +evidence of the effectiveness of the proposed approach. + +
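+ As a rough illustration of the Hu-moments-based shape characterization described above, the snippet below computes the seven Hu moments from the binary mask of one segmentation category with OpenCV. It is a simplified sketch of the general idea, not the paper's SHMF construction or the GOS$^2$F$^2$App network.
+
+```python
+import cv2
+import numpy as np
+
+def hu_shape_features(seg_map, category_id):
+    """Seven log-scaled Hu moments for one segmentation category's binary mask."""
+    mask = (seg_map == category_id).astype(np.uint8)
+    moments = cv2.moments(mask, binaryImage=True)
+    hu = cv2.HuMoments(moments).flatten()
+    # Log-scale for numerical stability while keeping the sign.
+    return -np.sign(hu) * np.log10(np.abs(hu) + 1e-30)
+
+# Illustrative usage: one feature vector per category present in the mask.
+seg_map = np.random.randint(0, 5, size=(480, 640)).astype(np.int32)
+features = {int(c): hu_shape_features(seg_map, c) for c in np.unique(seg_map)}
+```
+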
+
+ comment: This preprint was submitted at IEEE Transactions on Image Processing +
+
+
+
+
+ + ☆ Realistic Continual Learning Approach using Pre-trained Models + + +
+ Continual learning (CL) is crucial for evaluating adaptability in learning +solutions to retain knowledge. Our research addresses the challenge of +catastrophic forgetting, where models lose proficiency in previously learned +tasks as they acquire new ones. While numerous solutions have been proposed, +existing experimental setups often rely on idealized class-incremental learning +scenarios. We introduce Realistic Continual Learning (RealCL), a novel CL +paradigm where class distributions across tasks are random, departing from +structured setups. + We also present CLARE (Continual Learning Approach with pRE-trained models +for RealCL scenarios), a pre-trained model-based solution designed to integrate +new knowledge while preserving past learning. Our contributions include +pioneering RealCL as a generalization of traditional CL setups, proposing CLARE +as an adaptable approach for RealCL tasks, and conducting extensive experiments +demonstrating its effectiveness across various RealCL scenarios. Notably, CLARE +outperforms existing models on RealCL benchmarks, highlighting its versatility +and robustness in unpredictable learning environments. + +
+
+
+
+
+ + ☆ Applying Guidance in a Limited Interval Improves Sample and Distribution + Quality in Diffusion Models + + +
+ Guidance is a crucial technique for extracting the best performance out of +image-generating diffusion models. Traditionally, a constant guidance weight +has been applied throughout the sampling chain of an image. We show that +guidance is clearly harmful toward the beginning of the chain (high noise +levels), largely unnecessary toward the end (low noise levels), and only +beneficial in the middle. We thus restrict it to a specific range of noise +levels, improving both the inference speed and result quality. This limited +guidance interval improves the record FID in ImageNet-512 significantly, from +1.81 to 1.40. We show that it is quantitatively and qualitatively beneficial +across different sampler parameters, network architectures, and datasets, +including the large-scale setting of Stable Diffusion XL. We thus suggest +exposing the guidance interval as a hyperparameter in all diffusion models that +use guidance. + +
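+ A minimal sketch of restricting classifier-free guidance to an interval of noise levels is shown below; the sampler loop, the assumed denoiser signature, and the interval bounds are illustrative placeholders rather than the paper's exact implementation or tuned values.
+
+```python
+import torch
+
+@torch.no_grad()
+def sample_with_limited_guidance(denoiser, x, sigmas, cond, uncond,
+                                 guidance=3.0, sigma_lo=0.3, sigma_hi=5.0):
+    """Euler-style sampler applying classifier-free guidance only inside an interval.
+
+    sigmas is a decreasing list of noise levels; denoiser(x, sigma, c) is assumed
+    to return the denoised (x0) prediction.
+    """
+    for i in range(len(sigmas) - 1):
+        sigma, sigma_next = sigmas[i], sigmas[i + 1]
+        cond_pred = denoiser(x, sigma, cond)
+        if sigma_lo < sigma < sigma_hi:
+            # Guidance only at the middle noise levels, as argued above.
+            uncond_pred = denoiser(x, sigma, uncond)
+            denoised = uncond_pred + guidance * (cond_pred - uncond_pred)
+        else:
+            denoised = cond_pred        # plain conditional prediction elsewhere
+        # Standard Euler step toward the denoised estimate.
+        x = x + (sigma_next - sigma) * (x - denoised) / sigma
+    return x
+```
+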
+
+
+
+
+ + ☆ Progressive Semantic-Guided Vision Transformer for Zero-Shot Learning CVPR'24 + + +
+ Zero-shot learning (ZSL) recognizes the unseen classes by conducting +visual-semantic interactions to transfer semantic knowledge from seen classes +to unseen ones, supported by semantic information (e.g., attributes). However, +existing ZSL methods simply extract visual features using a pre-trained network +backbone (i.e., CNN or ViT), which fails to learn matched visual-semantic +correspondences for representing semantic-related visual features due to the lack of +guidance from semantic information, resulting in undesirable visual-semantic +interactions. To tackle this issue, we propose a progressive semantic-guided +vision transformer for zero-shot learning (dubbed ZSLViT). ZSLViT mainly +considers two properties in the whole network: i) discovering the semantic-related +visual representations explicitly, and ii) discarding the semantic-unrelated +visual information. Specifically, we first introduce semantic-embedded token +learning to improve the visual-semantic correspondences via semantic +enhancement and discover the semantic-related visual tokens explicitly with +semantic-guided token attention. Then, we fuse visual tokens with low semantic-visual +correspondence to discard the semantic-unrelated visual +information for visual enhancement. These two operations are integrated into +various encoders to progressively learn semantic-related visual representations +for accurate visual-semantic interactions in ZSL. Extensive experiments +show that our ZSLViT achieves significant performance gains on three popular +benchmark datasets, i.e., CUB, SUN, and AWA2. + +
+
+ comment: Accepted to CVPR'24 +
+
+
+
+
+ + ☆ OpenTrench3D: A Photogrammetric 3D Point Cloud Dataset for Semantic + Segmentation of Underground Utilities + + +
+ Identifying and classifying underground utilities is an important task for +efficient and effective urban planning and infrastructure maintenance. We +present OpenTrench3D, a novel and comprehensive 3D Semantic Segmentation point +cloud dataset, designed to advance research and development in underground +utility surveying and mapping. OpenTrench3D covers a completely novel domain +for public 3D point cloud datasets and is unique in its focus, scope, and +cost-effective capturing method. The dataset consists of 310 point clouds +collected across 7 distinct areas. These include 5 water utility areas and 2 +district heating utility areas. The inclusion of different geographical areas +and main utilities (water and district heating utilities) makes OpenTrench3D +particularly valuable for inter-domain transfer learning experiments. We +provide benchmark results for the dataset using three state-of-the-art semantic +segmentation models, PointNeXt, PointVector and PointMetaBase. Benchmarks are +conducted by training on data from water areas, fine-tuning on district heating +area 1 and evaluating on district heating area 2. The dataset is publicly +available. With OpenTrench3D, we seek to foster innovation and progress in the +field of 3D semantic segmentation in applications related to detection and +documentation of underground utilities as well as in transfer learning methods +in general. + +
+
+
+
+
+ + ☆ ViM-UNet: Vision Mamba for Biomedical Segmentation + + +
+ CNNs, most notably the UNet, are the default architecture for biomedical +segmentation. Transformer-based approaches, such as UNETR, have been proposed +to replace them, benefiting from a global field of view, but suffering from +larger runtimes and higher parameter counts. The recent Vision Mamba +architecture offers a compelling alternative to transformers, also providing a +global field of view, but at higher efficiency. Here, we introduce ViM-UNet, a +novel segmentation architecture based on it and compare it to UNet and UNETR +for two challenging microscopy instance segmentation tasks. We find that it +performs similarly or better than UNet, depending on the task, and outperforms +UNETR while being more efficient. Our code is open source and documented at +https://github.com/constantinpape/torch-em/blob/main/vimunet.md. + +
+
+
+
+
+ + ☆ Point Cloud Geometry Scalable Coding with a Quality-Conditioned Latents + Probability Estimator ICIP 2024 + + +
+ The widespread usage of point clouds (PC) for immersive visual applications +has resulted in the use of very heterogeneous receiving conditions and devices, +notably in terms of network, hardware, and display capabilities. In this +scenario, quality scalability, i.e., the ability to reconstruct a signal at +different qualities by progressively decoding a single bitstream, is a major +requirement that has yet to be conveniently addressed, notably in most +learning-based PC coding solutions. This paper proposes a quality scalability +scheme, named Scalable Quality Hyperprior (SQH), adaptable to learning-based +static point cloud geometry codecs, which uses a Quality-conditioned Latents +Probability Estimator (QuLPE) to decode a high-quality version of a PC +learning-based representation, based on an available lower quality base layer. +SQH is integrated in the future JPEG PC coding standard, allowing to create a +layered bitstream that can be used to progressively decode the PC geometry with +increasing quality and fidelity. Experimental results show that SQH offers the +quality scalability feature with very limited or no compression performance +penalty at all when compared with the corresponding non-scalable solution, thus +preserving the significant compression gains over other state-of-the-art PC +codecs. + +
+
+ comment: Submitted at ICIP 2024 +
+
+
+
+
+ + ☆ Flatness Improves Backbone Generalisation in Few-shot Classification + + +
+ Deployment of deep neural networks in real-world settings typically requires +adaptation to new tasks with few examples. Few-shot classification (FSC) +provides a solution to this problem by leveraging pre-trained backbones for +fast adaptation to new classes. Surprisingly, most efforts have only focused on +developing architectures for easing the adaptation to the target domain without +considering the importance of backbone training for good generalisation. We +show that flatness-aware backbone training with vanilla fine-tuning results in +a simpler yet competitive baseline compared to the state-of-the-art. Our +results indicate that for in- and cross-domain FSC, backbone training is +crucial to achieving good generalisation across different adaptation methods. +We advocate more care should be taken when training these models. + +
+
+
+
+
+ + ☆ Chaos in Motion: Unveiling Robustness in Remote Heart Rate Measurement + through Brain-Inspired Skin Tracking + + +
+ Heart rate is an important physiological indicator of human health status. +Existing remote heart rate measurement methods typically involve facial +detection followed by signal extraction from the region of interest (ROI). +These SOTA methods have three serious problems: (a) inaccuracies or even failures +in detection caused by environmental influences or subject movement; (b) +failures for special patients such as infants and burn victims; (c) privacy +leakage issues resulting from collecting face video. To address these issues, +we regard the remote heart rate measurement as the process of analyzing the +spatiotemporal characteristics of the optical flow signal in the video. We +apply chaos theory to computer vision tasks for the first time, thus designing +a brain-inspired framework. First, an artificial primary visual cortex +model is used to extract the skin regions in the videos, and the heart rate is then calculated by +time-frequency analysis on all skin pixels. Our method achieves Robust Skin Tracking +for Heart Rate measurement, called HR-RST. The experimental results show that +HR-RST overcomes the difficulty of environmental influences and effectively +tracks the subject movement. Moreover, the method can be extended to other body +parts. Consequently, the method can be applied to special patients and +effectively protect individual privacy, offering an innovative solution. + +
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ☆ Depth Estimation using Weighted-loss and Transfer Learning + + +
+ Depth estimation from 2D images is a common computer vision task that has +applications in many fields including autonomous vehicles, scene understanding +and robotics. The accuracy of a supervised depth estimation method mainly +relies on the chosen loss function, the model architecture, quality of data and +performance metrics. In this study, we propose a simplified and adaptable +approach to improve depth estimation accuracy using transfer learning and an +optimized loss function. The optimized loss function is a combination of +weighted losses that enhance robustness and generalization: Mean Absolute +Error (MAE), Edge Loss and Structural Similarity Index (SSIM). We use a grid +search and a random search method to find optimized weights for the losses, +which leads to an improved model. We explore multiple encoder-decoder-based +models including DenseNet121, DenseNet169, DenseNet201, and EfficientNet for +the supervised depth estimation model on NYU Depth Dataset v2. We observe that +the EfficientNet model pre-trained on ImageNet for classification, when used as +an encoder with a simple upsampling decoder, gives the best results in terms +of RMSE, REL and log10: 0.386, 0.113 and 0.049, respectively. We also perform a +qualitative analysis which illustrates that our model produces depth maps that +closely resemble ground truth, even in cases where the ground truth is flawed. +The results indicate significant improvements in accuracy and robustness, with +EfficientNet being the most successful architecture. + +
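+ A minimal sketch of such a weighted loss is shown below, assuming predictions and targets of shape (N, 1, H, W) and an externally provided SSIM function; the finite-difference edge term and the default weights are illustrative stand-ins for the paper's exact formulation and searched weights.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def image_gradients(depth):
+    # Finite-difference gradients used as a simple edge proxy (N, 1, H, W input).
+    dy = depth[:, :, 1:, :] - depth[:, :, :-1, :]
+    dx = depth[:, :, :, 1:] - depth[:, :, :, :-1]
+    return dx, dy
+
+def weighted_depth_loss(pred, target, ssim_fn, w_mae=1.0, w_edge=1.0, w_ssim=1.0):
+    """Weighted sum of MAE, edge, and SSIM terms; the weights are placeholders."""
+    mae = F.l1_loss(pred, target)
+    pdx, pdy = image_gradients(pred)
+    tdx, tdy = image_gradients(target)
+    edge = F.l1_loss(pdx, tdx) + F.l1_loss(pdy, tdy)
+    ssim_term = 1.0 - ssim_fn(pred, target)     # ssim_fn returns similarity in [0, 1]
+    return w_mae * mae + w_edge * edge + w_ssim * ssim_term
+```
+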
+
+
+
+
+ + ☆ Run-time Monitoring of 3D Object Detection in Automated Driving Systems + Using Early Layer Neural Activation Patterns CVPR 2024 + + +
+ Monitoring the integrity of object detection for errors within the perception +module of automated driving systems (ADS) is paramount for ensuring safety. +Despite recent advancements in deep neural network (DNN)-based object +detectors, their susceptibility to detection errors, particularly in the +less-explored realm of 3D object detection, remains a significant concern. +State-of-the-art integrity monitoring (also known as introspection) mechanisms +in 2D object detection mainly utilise the activation patterns in the final +layer of the DNN-based detector's backbone. However, that may not sufficiently +address the complexities and sparsity of data in 3D object detection. To this +end, we conduct, in this article, an extensive investigation into the effects +of activation patterns extracted from various layers of the backbone network +for introspecting the operation of 3D object detectors. Through a comparative +analysis using Kitti and NuScenes datasets with PointPillars and CenterPoint +detectors, we demonstrate that using earlier layers' activation patterns +enhances the error detection performance of the integrity monitoring system, +yet increases computational complexity. To address the real-time operation +requirements in ADS, we also introduce a novel introspection method that +combines activation patterns from multiple layers of the detector's backbone +and report its performance. + +
+
+ comment: Accepted by CVPR 2024 Workshop on Safe Autonomy for All Domains + (SAIAD) +
+
+
+
+
+ + ☆ Model-based Cleaning of the QUILT-1M Pathology Dataset for + Text-Conditional Image Synthesis + + +
+ The QUILT-1M dataset is the first openly available dataset containing images +harvested from various online sources. While it provides a huge data variety, +the image quality and composition is highly heterogeneous, impacting its +utility for text-conditional image synthesis. We propose an automatic pipeline +that provides predictions of the most common impurities within the images, +e.g., visibility of narrators, desktop environment and pathology software, or +text within the image. Additionally, we propose to use semantic alignment +filtering of the image-text pairs. Our findings demonstrate that by rigorously +filtering the dataset, there is a substantial enhancement of image fidelity in +text-to-image tasks. + +
+
+ comment: 4 pages (short paper) +
+
+
+
+
+ + ☆ Deep learning-driven pulmonary arteries and veins segmentation reveals + demography-associated pulmonary vasculature anatomy + + +
+ Pulmonary artery-vein segmentation is crucial for diagnosing pulmonary +diseases and surgical planning, and is traditionally achieved by Computed +Tomography Pulmonary Angiography (CTPA). However, concerns regarding adverse +health effects from contrast agents used in CTPA have constrained its clinical +utility. In contrast, identifying arteries and veins using non-contrast CT, a +conventional and low-cost clinical examination routine, has long been +considered impossible. Here we propose a High-abundant Pulmonary Artery-vein +Segmentation (HiPaS) framework achieving accurate artery-vein segmentation on +both non-contrast CT and CTPA across various spatial resolutions. HiPaS first +performs spatial normalization on raw CT scans via a super-resolution module, +and then iteratively achieves segmentation results at different branch levels +by utilizing the low-level vessel segmentation as a prior for high-level vessel +segmentation. We trained and validated HiPaS on our established multi-centric +dataset comprising 1,073 CT volumes with meticulous manual annotation. Both +quantitative experiments and clinical evaluation demonstrated the superior +performance of HiPaS, achieving a Dice score of 91.8% and a sensitivity of +98.0%. Further experiments demonstrated the non-inferiority of HiPaS +segmentation on non-contrast CT compared to segmentation on CTPA. Employing +HiPaS, we have conducted an anatomical study of pulmonary vasculature on 10,613 +participants in China (five sites), discovering a new association between +pulmonary vessel abundance and sex and age: vessel abundance is significantly +higher in females than in males, and slightly decreases with age, after +controlling for lung volume (p < 0.0001). By realizing accurate artery-vein +segmentation, HiPaS delineates a promising avenue for clinical diagnosis and +for understanding pulmonary physiology in a non-invasive manner. + +
+
+
+
+
+ + ☆ Shape Completion in the Dark: Completing Vertebrae Morphology from 3D + Ultrasound + + +
+ Purpose: Ultrasound (US) imaging, while advantageous for its radiation-free +nature, is challenging to interpret due to only partially visible organs and a +lack of complete 3D information. While performing US-based diagnosis or +investigation, medical professionals therefore create a mental map of the 3D +anatomy. In this work, we aim to replicate this process and enhance the visual +representation of anatomical structures. + Methods: We introduce a point-cloud-based probabilistic DL method to complete +occluded anatomical structures through 3D shape completion and choose US-based +spine examinations as our application. To enable training, we generate +synthetic 3D representations of partially occluded spinal views by mimicking US +physics and accounting for inherent artifacts. + Results: The proposed model performs consistently on synthetic and patient +data, with mean and median differences of 2.02 and 0.03 in CD, respectively. +Our ablation study demonstrates the importance of US physics-based data +generation, reflected in the large mean and median difference of 11.8 CD and +9.55 CD, respectively. Additionally, we demonstrate that anatomic landmarks, +such as the spinous process (with reconstruction CD of 4.73) and the facet +joints (mean distance to GT of 4.96 mm) are preserved in the 3D completion. + Conclusion: Our work establishes the feasibility of 3D shape completion for +lumbar vertebrae, ensuring the preservation of level-wise characteristics and +successful generalization from synthetic to real data. The incorporation of US +physics contributes to more accurate patient data completions. Notably, our +method preserves essential anatomic landmarks and reconstructs crucial +injection sites at their correct locations. The generated data and source code +will be made publicly available +(https://github.com/miruna20/Shape-Completion-in-the-Dark). + +
+
+
+
+
+ + ☆ Dealing with Subject Similarity in Differential Morphing Attack + Detection + + +
+ The advent of morphing attacks has posed significant security concerns for +automated Face Recognition systems, raising the pressing need for robust and +effective Morphing Attack Detection (MAD) methods able to effectively address +this issue. In this paper, we focus on Differential MAD (D-MAD), where a +trusted live capture, usually representing the criminal, is compared with the +document image to classify it as morphed or bona fide. We show these approaches +based on identity features are effective when the morphed image and the live +one are sufficiently diverse; unfortunately, the effectiveness is significantly +reduced when the same approaches are applied to look-alike subjects or in all +those cases when the similarity between the two compared images is high (e.g. +comparison between the morphed image and the accomplice). Therefore, in this +paper, we propose ACIdA, a modular D-MAD system, consisting of a module for the +attempt type classification, and two modules for the identity and artifacts +analysis on input images. Successfully addressing this task would allow +broadening the D-MAD applications including, for instance, the document +enrollment stage, which currently relies entirely on human evaluation, thus +limiting the possibility of releasing ID documents with manipulated images, as +well as the automated gates to detect both accomplices and criminals. An +extensive cross-dataset experimental evaluation conducted on the introduced +scenario shows that ACIdA achieves state-of-the-art results, outperforming +literature competitors, while maintaining good performance in traditional D-MAD +benchmarks. + +
+
+
+
+
+ + ☆ Finding Dino: A plug-and-play framework for unsupervised detection of + out-of-distribution objects using prototypes + + +
+ Detecting and localising unknown or Out-of-distribution (OOD) objects in any +scene can be a challenging task in vision, particularly in safety-critical +cases involving autonomous systems like automated vehicles or trains. +Supervised anomaly segmentation or open-world object detection models depend on +training on exhaustively annotated datasets for every domain and still struggle +in distinguishing between background and OOD objects. In this work, we present +a plug-and-play generalised framework - PRototype-based zero-shot OOD detection +Without Labels (PROWL). It is an inference-based method that does not require +training on the domain dataset and relies on extracting relevant features from +self-supervised pre-trained models. PROWL can be easily adapted to detect OOD +objects in any operational design domain by specifying a list of known classes +from this domain. PROWL, as an unsupervised method, outperforms other +supervised methods trained without auxiliary OOD data on the RoadAnomaly and +RoadObstacle datasets provided in the SegmentMeIfYouCan (SMIYC) benchmark. We also +demonstrate its suitability for other domains such as rail and maritime scenes. + +
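+ The prototype matching at the core of such an approach can be sketched as follows: per-class prototypes are mean features of the known classes, and locations whose best cosine similarity falls below a threshold are marked as OOD. The feature source, prototype construction, and threshold here are illustrative assumptions, not PROWL's exact pipeline.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def build_prototypes(features, labels, num_classes):
+    """Mean embedding per known class; features: (N, C), labels: (N,)."""
+    protos = torch.stack([features[labels == c].mean(dim=0)
+                          for c in range(num_classes)])
+    return F.normalize(protos, dim=1)
+
+def ood_mask(feat_map, prototypes, threshold=0.5):
+    """Flag locations whose best prototype cosine similarity falls below a threshold.
+
+    feat_map: (C, H, W) embeddings from a frozen self-supervised backbone.
+    """
+    c, h, w = feat_map.shape
+    feats = F.normalize(feat_map.flatten(1).t(), dim=1)   # (H*W, C)
+    best_sim, _ = (feats @ prototypes.t()).max(dim=1)     # best known-class match
+    return (best_sim < threshold).view(h, w)
+```
+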
+
+
+
+
+ + ☆ Separated Attention: An Improved Cycle GAN Based Under Water Image + Enhancement Method + + +
+ In this paper we present an improved Cycle GAN based model for underwater image enhancement. We have utilized the cycle consistent learning +technique of the state-of-the-art Cycle GAN model with modification in the loss +function in terms of depth-oriented attention, which enhances the contrast of the +overall image, keeping global content, color, local texture, and style +information intact. We trained the Cycle GAN model with the modified loss +functions on the benchmarked Enhancing Underwater Visual Perception (EUVP) +dataset, a large dataset including paired and unpaired sets of underwater images +(poor and good quality) taken with seven distinct cameras in a range of +visibility situations during research on ocean exploration and human-robot +cooperation. In addition, we perform qualitative and quantitative evaluations +which support the proposed technique and demonstrate improved contrast +enhancement of underwater imagery. More significantly, the enhanced +images provide better results than conventional models and further benefit underwater navigation, pose estimation, saliency prediction, object detection and +tracking. The results validate the appropriateness of the model for autonomous +underwater vehicles (AUV) in visual navigation. + +
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ☆ Simba: Mamba augmented U-ShiftGCN for Skeletal Action Recognition in + Videos + + +
+ Skeleton Action Recognition (SAR) involves identifying human actions using +skeletal joint coordinates and their interconnections. While plain Transformers +have been attempted for this task, they still fall short compared to the +current leading methods, which are rooted in Graph Convolutional Networks +(GCNs) due to the absence of structural priors. Recently, a novel selective +state space model, Mamba, has surfaced as a compelling alternative to the +attention mechanism in Transformers, offering efficient modeling of long +sequences. In this work, to the best of our knowledge, we present the +first SAR framework incorporating Mamba. Each fundamental block of our model +adopts a novel U-ShiftGCN architecture with Mamba as its core component. The +encoder segment of the U-ShiftGCN is devised to extract spatial features from +the skeletal data using downsampling vanilla Shift S-GCN blocks. These spatial +features then undergo intermediate temporal modeling facilitated by the Mamba +block before progressing to the decoder section, which comprises vanilla +upsampling Shift S-GCN blocks. Additionally, a Shift T-GCN (ShiftTCN) temporal +modeling unit is employed before the exit of each fundamental block to refine +temporal representations. This particular integration of downsampling spatial, +intermediate temporal, upsampling spatial, and ultimate temporal subunits +yields promising results for skeleton action recognition. We dub the resulting +model Simba, which attains state-of-the-art performance across three +well-known benchmark skeleton action recognition datasets: NTU RGB+D, NTU RGB+D +120, and Northwestern-UCLA. Interestingly, U-ShiftGCN (Simba without +Intermediate Mamba Block) by itself is capable of performing reasonably well +and surpasses our baseline. + +
+
+ comment: 20 pages, 6 tables, 1 figure +
+
+
+
+
+ + ☆ Homography Guided Temporal Fusion for Road Line and Marking Segmentation ICCV 2023 + + +
+ Reliable segmentation of road lines and markings is critical to autonomous +driving. Our work is motivated by the observations that road lines and markings +are (1) frequently occluded in the presence of moving vehicles, shadow, and +glare and (2) highly structured with low intra-class shape variance and overall +high appearance consistency. To solve these issues, we propose a Homography +Guided Fusion (HomoFusion) module to exploit temporally-adjacent video frames +for complementary cues facilitating the correct classification of the partially +occluded road lines or markings. To reduce computational complexity, a novel +surface normal estimator is proposed to establish spatial correspondences +between the sampled frames, allowing the HomoFusion module to perform a +pixel-to-pixel attention mechanism in updating the representation of the +occluded road lines or markings. Experiments on ApolloScape, a large-scale lane +mark segmentation dataset, and ApolloScape Night with artificial simulated +night-time road conditions, demonstrate that our method outperforms other +existing SOTA lane mark segmentation models with less than 9\% of their +parameters and computational complexity. We show that exploiting available +camera intrinsic data and ground plane assumption for cross-frame +correspondence can lead to a light-weight network with significantly improved +performances in speed and accuracy. We also prove the versatility of our +HomoFusion approach by applying it to the problem of water puddle segmentation +and achieving SOTA performance. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Multi-Image Visual Question Answering for Unsupervised Anomaly Detection + + +
+ Unsupervised anomaly detection enables the identification of potential +pathological areas by juxtaposing original images with their pseudo-healthy +reconstructions generated by models trained exclusively on normal images. +However, the clinical interpretation of resultant anomaly maps presents a +challenge due to a lack of detailed, understandable explanations. Recent +advancements in language models have shown the capability of mimicking +human-like understanding and providing detailed descriptions. This raises an +interesting question: \textit{How can language models be employed to make the +anomaly maps more explainable?} To the best of our knowledge, we are the first +to leverage a language model for unsupervised anomaly detection, for which we +construct a dataset with different questions and answers. Additionally, we +present a novel multi-image visual question answering framework tailored for +anomaly detection, incorporating diverse feature fusion strategies to enhance +visual knowledge extraction. Our experiments reveal that the framework, +augmented by our new Knowledge Q-Former module, adeptly answers questions on +the anomaly detection dataset. Besides, integrating anomaly maps as inputs +distinctly aids in improving the detection of unseen pathologies. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ Diffusion Probabilistic Multi-cue Level Set for Reducing Edge + Uncertainty in Pancreas Segmentation + + +
+ Accurately segmenting the pancreas remains a huge challenge. Traditional +methods encounter difficulties in semantic localization due to the small volume +and distorted structure of the pancreas, while deep learning methods encounter +challenges in obtaining accurate edges because of low contrast and organ +overlapping. To overcome these issues, we propose a multi-cue level set method +based on the diffusion probabilistic model, namely Diff-mcs. Our method adopts +a coarse-to-fine segmentation strategy. We use the diffusion probabilistic +model in the coarse segmentation stage, with the obtained probability +distribution serving as both the initial localization and prior cues for the +level set method. In the fine segmentation stage, we combine the prior cues +with grayscale cues and texture cues to refine the edge by maximizing the +difference between probability distributions of the cues inside and outside the +level set curve. The method is validated on three public datasets and achieves +state-of-the-art performance, which can obtain more accurate segmentation +results with lower uncertainty segmentation edges. In addition, we conduct +ablation studies and uncertainty analysis to verify that the diffusion +probability model provides a more appropriate initialization for the level set +method. Furthermore, when combined with multiple cues, the level set method can +better obtain edges and improve the overall accuracy. Our code is available at +https://github.com/GOUYUEE/Diff-mcs. + +
+
+
+
+
+ + ☆ Do You Remember? Dense Video Captioning with Cross-Modal Memory + Retrieval CVPR 2024 + + +
+ There has been significant attention to the research on dense video +captioning, which aims to automatically localize and caption all events within +untrimmed video. Several studies introduce methods by designing dense video +captioning as a multitasking problem of event localization and event captioning +to consider inter-task relations. However, addressing both tasks using only +visual input is challenging due to the lack of semantic content. In this study, +we address this by proposing a novel framework inspired by the cognitive +information processing of humans. Our model utilizes external memory to +incorporate prior knowledge. The memory retrieval method is proposed with +cross-modal video-to-text matching. To effectively incorporate retrieved text +features, the versatile encoder and the decoder with visual and textual +cross-attention modules are designed. Comparative experiments have been +conducted to show the effectiveness of the proposed method on ActivityNet +Captions and YouCook2 datasets. Experimental results show promising performance +of our model without extensive pretraining from a large video dataset. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Automatic Detection of Dark Ship-to-Ship Transfers using Deep Learning + and Satellite Imagery + + +
+ Despite extensive research into ship detection via remote sensing, no studies +identify ship-to-ship transfers in satellite imagery. Given the importance of +transshipment in illicit shipping practices, this is a significant gap. In what +follows, I train a convolutional neural network to accurately detect four +different types of cargo vessel and two different types of ship-to-ship +transfer in PlanetScope satellite imagery. I then elaborate a pipeline for the +automatic detection of suspected illicit ship-to-ship transfers by +cross-referencing satellite detections with vessel-borne GPS data. Finally, I +apply this method to the Kerch Strait between Ukraine and Russia to identify +over 400 dark transshipment events since 2022. + +
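+ The cross-referencing step can be approximated as below: a satellite detection with no AIS/GPS position reported nearby within a time window is flagged as a "dark" event. The haversine matching, the 1 km radius, the one-hour window, and the record fields are illustrative assumptions rather than the author's exact pipeline.
+
+```python
+from math import asin, cos, radians, sin, sqrt
+
+def haversine_km(lat1, lon1, lat2, lon2):
+    """Great-circle distance in kilometres."""
+    p1, p2 = radians(lat1), radians(lat2)
+    dphi, dlmb = radians(lat2 - lat1), radians(lon2 - lon1)
+    a = sin(dphi / 2) ** 2 + cos(p1) * cos(p2) * sin(dlmb / 2) ** 2
+    return 2 * 6371.0 * asin(sqrt(a))
+
+def flag_dark_transfers(detections, ais_records, radius_km=1.0, window_s=3600):
+    """Detections with no GPS/AIS position nearby in time and space are 'dark'."""
+    dark = []
+    for det in detections:          # det: {"lat", "lon", "time"} from imagery
+        matched = any(
+            abs(rec["time"] - det["time"]) <= window_s
+            and haversine_km(det["lat"], det["lon"], rec["lat"], rec["lon"]) <= radius_km
+            for rec in ais_records  # rec: {"lat", "lon", "time"} from the GPS feed
+        )
+        if not matched:
+            dark.append(det)
+    return dark
+```
+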
+
+
+
+
+ + ☆ Contrastive-Based Deep Embeddings for Label Noise-Resilient + Histopathology Image Classification + + +
+ Recent advancements in deep learning have proven highly effective in medical +image classification, notably within histopathology. However, noisy labels +represent a critical challenge in histopathology image classification, where +accurate annotations are vital for training robust deep learning models. +Indeed, deep neural networks can easily overfit label noise, leading to severe +degradations in model performance. While numerous public pathology foundation +models have emerged recently, none have evaluated their resilience to label +noise. Through thorough empirical analyses across multiple datasets, we exhibit +the label noise resilience property of embeddings extracted from foundation +models trained in a self-supervised contrastive manner. We demonstrate that +training with such embeddings substantially enhances label noise robustness +when compared to non-contrastive-based ones as well as commonly used +noise-resilient methods. Our results unequivocally underline the superiority of +contrastive learning in effectively mitigating the label noise challenge. Code +is publicly available at +https://github.com/LucasDedieu/NoiseResilientHistopathology. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ GLID: Pre-training a Generalist Encoder-Decoder Vision Model CVPR 2024 + + +
+ This paper proposes a GeneraLIst encoder-Decoder (GLID) pre-training method +for better handling various downstream computer vision tasks. While +self-supervised pre-training approaches, e.g., Masked Autoencoder, have shown +success in transfer learning, task-specific sub-architectures are still +required to be appended for different downstream tasks, which cannot enjoy the +benefits of large-scale pre-training. GLID overcomes this challenge by allowing +the pre-trained generalist encoder-decoder to be fine-tuned on various vision +tasks with minimal task-specific architecture modifications. In the GLID +training scheme, the pre-training pretext task and the downstream tasks are all +modeled as "query-to-answer" problems. We pre-train a task-agnostic encoder-decoder with +query-mask pairs. During fine-tuning, GLID maintains the pre-trained +encoder-decoder and queries, only replacing the topmost linear transformation +layer with task-specific linear heads. This minimizes the pretrain-finetune +architecture inconsistency and enables the pre-trained model to better adapt to +downstream tasks. GLID achieves competitive performance on various vision +tasks, including object detection, image segmentation, pose estimation, and +depth estimation, outperforming or matching specialist models such as +Mask2Former, DETR, ViTPose, and BinsFormer. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Attention based End to end network for Offline Writer Identification on + Word level data + + +
+ Writer identification due to its widespread application in various fields has +gained popularity over the years. In scenarios where optimum handwriting +samples are available, whether they be in the form of a single line, a +sentence, or an entire page, writer identification algorithms have demonstrated +noteworthy levels of accuracy. However, in scenarios where only a limited +number of handwritten samples are available, particularly in the form of word +images, there is a significant scope for improvement. + In this paper, we propose a writer identification system based on an +attention-driven Convolutional Neural Network (CNN). The system is trained +utilizing image segments, known as fragments, extracted from word images, +employing a pyramid-based strategy. This methodology enables the system to +capture a comprehensive representation of the data, encompassing both +fine-grained details and coarse features across various levels of abstraction. +These extracted fragments serve as the training data for the convolutional +network, enabling it to learn a more robust representation compared to +traditional convolution-based networks trained on word images. Additionally, +the paper explores the integration of an attention mechanism to enhance the +representational power of the learned features. The efficacy of the proposed +algorithm is evaluated on three benchmark databases, demonstrating its +proficiency in writer identification tasks, particularly in scenarios with +limited access to handwriting data. + +
+
+
+
+
+ + ☆ Implicit and Explicit Language Guidance for Diffusion-based Visual + Perception + + +
+ Text-to-image diffusion models have shown a powerful ability for conditional +image synthesis. With large-scale vision-language pre-training, diffusion +models are able to generate high-quality images with rich texture and +reasonable structure under different text prompts. However, it is an open +problem to adapt the pre-trained diffusion model for visual perception. In this +paper, we propose an implicit and explicit language guidance framework for +diffusion-based perception, named IEDP. Our IEDP comprises an implicit +language guidance branch and an explicit language guidance branch. The implicit +branch employs a frozen CLIP image encoder to directly generate implicit text +embeddings that are fed to the diffusion model, without using explicit text +prompts. The explicit branch utilizes the ground-truth labels of corresponding +images as text prompts to condition feature extraction of the diffusion model. +During training, we jointly train the diffusion model by sharing the model weights +of these two branches. As a result, the implicit and explicit branches can jointly +guide feature learning. During inference, we only employ the implicit branch for +final prediction, which does not require any ground-truth labels. Experiments +are performed on two typical perception tasks, namely semantic segmentation +and depth estimation. Our IEDP achieves promising performance on both tasks. +For semantic segmentation, our IEDP achieves an mIoU score of 55.9% on the ADE20K +validation set, which outperforms the baseline method VPD by 2.2%. For depth +estimation, our IEDP outperforms the baseline method VPD with a relative gain +of 10.2%. + +
+
+
+
+
+ + ☆ Weakly-Supervised Learning via Multi-Lateral Decoder Branching for + Guidewire Segmentation in Robot-Assisted Cardiovascular Catheterization + + +
+ Although robot-assisted cardiovascular catheterization is commonly performed +for intervention of cardiovascular diseases, more studies are needed to support +the procedure with automated tool segmentation. This can aid surgeons in tool +tracking and visualization during intervention. Learning-based segmentation has +recently offered state-of-the-art segmentation performance; however, generating +ground-truth signals for fully-supervised methods is labor-intensive and time-consuming for the interventionists. In this study, a weakly-supervised learning +method with multi-lateral pseudo labeling is proposed for tool segmentation in +cardiac angiograms. The method includes a modified U-Net model with one encoder +and multiple lateral-branched decoders that produce pseudo labels as +supervision signals under different perturbations. The pseudo labels are +self-generated through a mixed loss function and shared consistency in the +decoders. We trained the model end-to-end with weakly-annotated data obtained +during robotic cardiac catheterization. Experiments with the proposed model +show that weakly annotated data achieves performance close to that obtained when fully annotated data +is used. Compared to three existing weakly-supervised methods, our approach +yielded higher segmentation performance across three different cardiac +angiogram datasets. An ablation study showed consistent performance under +different parameters. Thus, we offer a less expensive method for real-time tool +segmentation and tracking during robot-assisted cardiac catheterization. + +
+
+
+
+
+ + ☆ Multi-rater Prompting for Ambiguous Medical Image Segmentation + + +
+ Multi-rater annotations commonly occur when medical images are independently +annotated by multiple experts (raters). In this paper, we tackle two challenges +arising in multi-rater annotations for medical image segmentation (called +ambiguous medical image segmentation): (1) How to train a deep learning model +when a group of raters produces a set of diverse but plausible annotations, and +(2) how to fine-tune the model efficiently when computation resources are not +available for re-training the entire model on a different dataset domain. We +propose a multi-rater prompt-based approach to address these two challenges +altogether. Specifically, we introduce a series of rater-aware prompts that can +be plugged into the U-Net model for uncertainty estimation to handle +multi-annotation cases. During the prompt-based fine-tuning process, only 0.3% +of learnable parameters are required to be updated compared to training the +entire model. Further, in order to integrate expert consensus and disagreement, +we explore different multi-rater incorporation strategies and design a +mix-training strategy for comprehensive insight learning. Extensive experiments +verify the effectiveness of our new approach for ambiguous medical image +segmentation on two public datasets while alleviating the heavy burden of model +re-training. + +
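+ The 0.3% figure above reflects updating only small prompt tensors while the backbone stays frozen; a generic sketch of that parameter-efficient setup is given below. The prompt shapes, the way the prompt is injected, and the hypothetical `prompt` keyword are illustrative assumptions, not the paper's rater-aware design.
+
+```python
+import torch
+import torch.nn as nn
+
+class PromptedSegmenter(nn.Module):
+    """Frozen backbone plus small learnable rater prompts as the only trained weights."""
+
+    def __init__(self, backbone, num_raters=4, prompt_len=8, dim=256):
+        super().__init__()
+        self.backbone = backbone
+        for p in self.backbone.parameters():
+            p.requires_grad = False                        # backbone stays frozen
+        self.prompts = nn.Parameter(torch.randn(num_raters, prompt_len, dim) * 0.02)
+
+    def forward(self, x, rater_id):
+        # How the prompt conditions the backbone is model-specific; here the
+        # backbone is simply assumed to accept an extra conditioning tensor.
+        return self.backbone(x, prompt=self.prompts[rater_id])
+
+def trainable_fraction(model):
+    total = sum(p.numel() for p in model.parameters())
+    tuned = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    return tuned / total
+```
+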
+
+
+
+
+ + ☆ ObjBlur: A Curriculum Learning Approach With Progressive Object-Level + Blurring for Improved Layout-to-Image Generation + + +
+ We present ObjBlur, a novel curriculum learning approach to improve +layout-to-image generation models, where the task is to produce realistic +images from layouts composed of boxes and labels. Our method is based on +progressive object-level blurring, which effectively stabilizes training and +enhances the quality of generated images. This curriculum learning strategy +systematically applies varying degrees of blurring to individual objects or the +background during training, starting from strong blurring to progressively +cleaner images. Our findings reveal that this approach yields significant +performance improvements, stabilized training, smoother convergence, and +reduced variance between multiple runs. Moreover, our technique demonstrates +its versatility by being compatible with generative adversarial networks and +diffusion models, underlining its applicability across various generative +modeling paradigms. With ObjBlur, we reach new state-of-the-art results on the +complex COCO and Visual Genome datasets. + +
+
+
+
+
+ + ☆ Attention-Aware Laparoscopic Image Desmoking Network with Lightness + Embedding and Hybrid Guided Embedding + + +
+ This paper presents a novel method of smoke removal from laparoscopic +images. Due to the heterogeneous nature of surgical smoke, a two-stage network +is proposed to estimate the smoke distribution and reconstruct a clear, +smoke-free surgical scene. The utilization of the lightness channel plays a +pivotal role in providing vital information pertaining to smoke density. The +reconstruction of the smoke-free image is guided by a hybrid embedding, which +combines the estimated smoke mask with the initial image. Experimental results +demonstrate that the proposed method boasts a Peak Signal to Noise Ratio that +is $2.79\%$ higher than the state-of-the-art methods, while also exhibiting a +remarkable $38.2\%$ reduction in run-time. Overall, the proposed method offers +comparable or even superior performance in terms of both smoke removal quality +and computational efficiency when compared to existing state-of-the-art +methods. This work will be publicly available on +http://homepage.hit.edu.cn/wpgao + +
+
+ comment: ISBI2024 +
+
+
+
+
+ + ☆ CAT: Contrastive Adapter Training for Personalized Image Generation CVPR + + +
+ The emergence of various adapters, including Low-Rank Adaptation (LoRA) +applied from the field of natural language processing, has allowed diffusion +models to personalize image generation at a low cost. However, due to the +various challenges including limited datasets and shortage of regularization +and computation resources, adapter training often results in unsatisfactory +outcomes, leading to the corruption of the backbone model's prior knowledge. +One of the well known phenomena is the loss of diversity in object generation, +especially within the same class which leads to generating almost identical +objects with minor variations. This poses challenges in generation +capabilities. To solve this issue, we present Contrastive Adapter Training +(CAT), a simple yet effective strategy to enhance adapter training through the +application of CAT loss. Our approach facilitates the preservation of the base +model's original knowledge when the model initiates adapters. Furthermore, we +introduce the Knowledge Preservation Score (KPS) to evaluate CAT's ability to +keep the former information. We qualitatively and quantitatively compare CAT's +improvement. Finally, we mention the possibility of CAT in the aspects of +multi-concept adapter and optimization. + +
+
+ comment: CVPRW 2024 +
+
+
+
+
+ + ☆ SFSORT: Scene Features-based Simple Online Real-Time Tracker + + +
+ This paper introduces SFSORT, the world's fastest multi-object tracking +system based on experiments conducted on MOT Challenge datasets. To achieve an +accurate and computationally efficient tracker, this paper employs a +tracking-by-detection method, following the online real-time tracking approach +established in prior literature. By introducing a novel cost function called +the Bounding Box Similarity Index, this work eliminates the Kalman Filter, +leading to reduced computational requirements. Additionally, this paper +demonstrates the impact of scene features on enhancing object-track association +and improving track post-processing. Using a 2.2 GHz Intel Xeon CPU, the +proposed method achieves an HOTA of 61.7\% with a processing speed of 2242 Hz +on the MOT17 dataset and an HOTA of 60.9\% with a processing speed of 304 Hz on +the MOT20 dataset. The tracker's source code, fine-tuned object detection +model, and tutorials are available at +\url{https://github.com/gitmehrdad/SFSORT}. + +
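+ For orientation, a bare-bones tracking-by-detection association loop without a Kalman filter is sketched below using plain IoU as the similarity; SFSORT's Bounding Box Similarity Index and its scene-feature post-processing are specific to the paper and are not reproduced here.
+
+```python
+def box_area(b):
+    return max(0.0, b[2] - b[0]) * max(0.0, b[3] - b[1])
+
+def iou(a, b):
+    """IoU of two boxes given as (x1, y1, x2, y2)."""
+    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
+    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
+    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
+    union = box_area(a) + box_area(b) - inter
+    return inter / union if union > 0 else 0.0
+
+def associate(tracks, detections, min_sim=0.3):
+    """Greedy association of last track boxes to detections; no motion model."""
+    pairs = sorted(((iou(t["box"], d), ti, di)
+                    for ti, t in enumerate(tracks)
+                    for di, d in enumerate(detections)), reverse=True)
+    used_t, used_d, matches = set(), set(), []
+    for sim, ti, di in pairs:
+        if sim < min_sim:
+            break
+        if ti not in used_t and di not in used_d:
+            matches.append((ti, di))
+            used_t.add(ti)
+            used_d.add(di)
+    return matches
+```
+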
+
+
+
+
+ + ☆ Event-Enhanced Snapshot Compressive Videography at 10K FPS + + +
+ Video snapshot compressive imaging (SCI) encodes the target dynamic scene +compactly into a snapshot and reconstructs its high-speed frame sequence +afterward, greatly reducing the required data footprint and transmission +bandwidth as well as enabling high-speed imaging with a low frame rate +intensity camera. In implementation, high-speed dynamics are encoded via +temporally varying patterns, and only frames at corresponding temporal +intervals can be reconstructed, while the dynamics occurring between +consecutive frames are lost. To unlock the potential of conventional snapshot +compressive videography, we propose a novel hybrid "intensity+event" imaging +scheme by incorporating an event camera into a video SCI setup. Our proposed +system consists of a dual-path optical setup to record the coded intensity +measurement and intermediate event signals simultaneously, which is compact and +photon-efficient by collecting the half photons discarded in conventional video +SCI. Correspondingly, we developed a dual-branch Transformer utilizing the +reciprocal relationship between two data modes to decode dense video frames. +Extensive experiments on both simulated and real-captured data demonstrate our +superiority to state-of-the-art video SCI and video frame interpolation (VFI) +methods. Benefiting from the new hybrid design leveraging both intrinsic +redundancy in videos and the unique feature of event cameras, we achieve +high-quality videography at 0.1ms time intervals with a low-cost CMOS image +sensor working at 24 FPS. + +
+
+
+
+
+ + ☆ Stereo-LiDAR Depth Estimation with Deformable Propagation and Learned + Disparity-Depth Conversion ICRA 2024 + + +
+ Accurate and dense depth estimation with stereo cameras and LiDAR is an +important task for automatic driving and robotic perception. While sparse hints +from LiDAR points have improved cost aggregation in stereo matching, their +effectiveness is limited by the low density and non-uniform distribution. To +address this issue, we propose a novel stereo-LiDAR depth estimation network +with Semi-Dense hint Guidance, named SDG-Depth. Our network includes a +deformable propagation module for generating a semi-dense hint map and a +confidence map by propagating sparse hints using a learned deformable window. +These maps then guide cost aggregation in stereo matching. To reduce the +triangulation error in depth recovery from disparity, especially in distant +regions, we introduce a disparity-depth conversion module. Our method is both +accurate and efficient. The experimental results on benchmark tests show its +superior performance. Our code is available at +https://github.com/SJTU-ViSYS/SDG-Depth. + +
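+ For context, the fixed disparity-to-depth conversion that a learned module of this kind replaces is the standard pinhole relation depth = focal_length x baseline / disparity, whose error grows rapidly at small disparities; the sketch below states only that baseline relation with illustrative, KITTI-like numbers and is not the paper's learned conversion.
+
+```python
+import numpy as np
+
+def disparity_to_depth(disparity, focal_px, baseline_m, eps=1e-6):
+    """Standard stereo triangulation: depth = f * B / d (metres).
+
+    Small disparities (distant points) amplify disparity noise, which is the
+    triangulation error that a learned disparity-depth conversion aims to reduce.
+    """
+    return focal_px * baseline_m / np.maximum(disparity, eps)
+
+# Illustrative numbers only (KITTI-like focal length and baseline).
+disp = np.array([50.0, 5.0, 0.5])            # pixels
+depth = disparity_to_depth(disp, focal_px=721.0, baseline_m=0.54)
+# -> roughly [7.8, 77.9, 778.7] metres: error explodes as disparity shrinks
+```
+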
+
+ comment: Accepted in ICRA 2024. 8 pages, 6 figures +
+
+
+
+
+ + ☆ Content-Adaptive Non-Local Convolution for Remote Sensing Pansharpening CVPR 2024 + + +
+ Currently, machine learning-based methods for remote sensing pansharpening +have progressed rapidly. However, existing pansharpening methods often do not +fully exploit differentiating regional information in non-local spaces, thereby +limiting the effectiveness of the methods and resulting in redundant learning +parameters. In this paper, we introduce a so-called content-adaptive non-local +convolution (CANConv), a novel method tailored for remote sensing image +pansharpening. Specifically, CANConv employs adaptive convolution, ensuring +spatial adaptability, and incorporates non-local self-similarity through the +similarity relationship partition (SRP) and the partition-wise adaptive +convolution (PWAC) sub-modules. Furthermore, we also propose a corresponding +network architecture, called CANNet, which mainly utilizes the multi-scale +self-similarity. Extensive experiments demonstrate the superior performance of +CANConv, compared with recent promising fusion methods. Besides, we +substantiate the method's effectiveness through visualization, ablation +experiments, and comparison with existing methods on multiple test sets. The +source code is publicly available at https://github.com/duanyll/CANConv. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ How is Visual Attention Influenced by Text Guidance? Database and Model + + +
+ The analysis and prediction of visual attention have long been crucial tasks +in the fields of computer vision and image processing. In practical +applications, images are generally accompanied by various text descriptions; +however, few studies have explored the influence of text descriptions on visual +attention, let alone developed visual saliency prediction models considering +text guidance. In this paper, we conduct a comprehensive study on text-guided +image saliency (TIS) from both subjective and objective perspectives. +Specifically, we construct a TIS database named SJTU-TIS, which includes 1200 +text-image pairs and the corresponding collected eye-tracking data. Based on +the established SJTU-TIS database, we analyze the influence of various text +descriptions on visual attention. Then, to facilitate the development of +saliency prediction models considering text influence, we construct a benchmark +for the established SJTU-TIS database using state-of-the-art saliency models. +Finally, since most existing saliency models ignore the effect of text descriptions on visual attention, we further propose a +text-guided saliency (TGSal) prediction model, which extracts and integrates +both image features and text features to predict the image saliency under +various text-description conditions. Our proposed model significantly +outperforms the state-of-the-art saliency models on both the SJTU-TIS database +and the pure image saliency databases in terms of various evaluation metrics. +The SJTU-TIS database and the code of the proposed TGSal model will be released +at: https://github.com/IntMeGroup/TGSal. + +
+
+
+
+
+ + ☆ PromptSync: Bridging Domain Gaps in Vision-Language Models through + Class-Aware Prototype Alignment and Discrimination CVPR 2024 + + +
+ The potential for zero-shot generalization in vision-language (V-L) models such as CLIP has spurred their widespread adoption in addressing numerous downstream tasks. Previous methods have employed test-time prompt tuning to adapt the model to unseen domains, but they overlooked the issue of imbalanced class distributions. In this study, we explicitly address this problem by employing class-aware prototype alignment weighted by mean class probabilities obtained for the test sample and filtered augmented views. Additionally, we ensure that the class probabilities are as accurate as possible by performing prototype discrimination using contrastive learning. The combination of alignment and discriminative loss serves as a geometric regularizer, preventing the prompt representation from collapsing onto a single class and effectively bridging the distribution gap between the source and test domains. Our method, named PromptSync, synchronizes the prompts for each test sample on both the text and vision branches of the V-L model. In empirical evaluations on the domain generalization benchmark, our method outperforms previous best methods by 2.33% in overall performance, by 1% in base-to-novel generalization, and by 2.84% in cross-dataset transfer tasks.
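+ A minimal sketch of a class-aware prototype alignment loss weighted by mean class probabilities over augmented views, as described in the abstract; the cosine-distance choice, shapes, and lack of the contrastive discrimination term are assumptions, so this is not the full PromptSync objective.
+import torch
+import torch.nn.functional as F
+
+def weighted_prototype_alignment(view_feats, class_probs, prototypes):
+    """Pull the mean test-view feature toward each class prototype, weighted by
+    the mean class probability of the filtered views (illustrative only).
+    view_feats: (V, D) features of augmented views
+    class_probs: (V, K) per-view class probabilities
+    prototypes: (K, D) source-domain class prototypes"""
+    mean_feat = F.normalize(view_feats.mean(dim=0), dim=-1)   # (D,)
+    weights = class_probs.mean(dim=0)                         # (K,)
+    protos = F.normalize(prototypes, dim=-1)                  # (K, D)
+    cos_dist = 1.0 - protos @ mean_feat                       # distance to each prototype
+    return (weights * cos_dist).sum()
+
+loss = weighted_prototype_alignment(torch.randn(8, 256),
+                                    torch.softmax(torch.randn(8, 10), dim=-1),
+                                    torch.randn(10, 256))
+print(loss.item())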
+
+ comment: Accepted at CVPR 2024 LIMIT, 12 pages, 8 Tables, 2 Figures +
+
+
+
+
+ + ☆ Remembering Transformer for Continual Learning + + +
+ Neural networks encounter the challenge of Catastrophic Forgetting (CF) in continual learning, where new task knowledge interferes with previously learned knowledge. We propose Remembering Transformer, inspired by the brain's Complementary Learning Systems (CLS), to tackle this issue. Remembering Transformer employs a mixture-of-adapters and a generative model-based routing mechanism to alleviate CF by dynamically routing task data to relevant adapters. Our approach demonstrates new state-of-the-art performance on various vision continual learning tasks together with great parameter efficiency.
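+ A minimal sketch of generative routing for a mixture of adapters: each task keeps a small autoencoder, and a sample is routed to the adapter whose autoencoder reconstructs it best. The autoencoder design and routing rule are assumptions for illustration, not the paper's exact mechanism.
+import torch
+import torch.nn as nn
+
+class TinyAE(nn.Module):
+    """One lightweight autoencoder per task, used only for routing (illustrative)."""
+    def __init__(self, dim=128, bottleneck=16):
+        super().__init__()
+        self.enc = nn.Linear(dim, bottleneck)
+        self.dec = nn.Linear(bottleneck, dim)
+
+    def recon_error(self, x):
+        return ((self.dec(torch.relu(self.enc(x))) - x) ** 2).mean(dim=-1)
+
+def route_to_adapter(x, task_autoencoders):
+    """Pick, per sample, the adapter whose autoencoder reconstructs the input best."""
+    errors = torch.stack([ae.recon_error(x) for ae in task_autoencoders], dim=-1)  # (B, T)
+    return errors.argmin(dim=-1)  # adapter index per sample
+
+aes = [TinyAE() for _ in range(3)]
+print(route_to_adapter(torch.randn(4, 128), aes))  # e.g. tensor([2, 0, 1, 1])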
+
+
+
+
+ + ☆ Generalization Gap in Data Augmentation: Insights from Illumination + + +
+ In the field of computer vision, data augmentation is widely used to enrich the feature complexity of training datasets with deep learning techniques. However, regarding the generalization capabilities of models, the difference between artificial features generated by data augmentation and natural visual features has not been fully revealed. This study focuses on the visual representation variable 'illumination': we simulate its distribution degradation and examine how data augmentation techniques enhance model performance on a classification task. Our goal is to investigate the differences in generalization between models trained with augmented data and those trained under real-world illumination conditions. Results indicate that various data augmentation methods significantly improve model performance; yet a noticeable generalization gap still exists, emphasizing the critical role of feature diversity in the training set for enhancing model generalization.
+
+
+
+
+ + ☆ Learning to Classify New Foods Incrementally Via Compressed Exemplars + + +
+ Food image classification systems play a crucial role in health monitoring and diet tracking through image-based dietary assessment techniques. However, existing food recognition systems rely on static datasets characterized by a pre-defined, fixed number of food classes. This contrasts drastically with the reality of food consumption, which features constantly changing data. Therefore, food image classification systems should adapt to and manage data that continuously evolves. This is where continual learning plays an important role. A challenge in continual learning is catastrophic forgetting, where ML models tend to discard old knowledge upon learning new information. While memory-replay algorithms have shown promise in mitigating this problem by storing old data as exemplars, they are hampered by the limited capacity of memory buffers, leading to an imbalance between new and previously learned data. To address this, our work explores the use of neural image compression to extend buffer size and enhance data diversity. We introduce the concept of continuously learning a neural compression model to adaptively improve the quality of compressed data and optimize the bitrate per pixel (bpp) to store more exemplars. Our extensive experiments, including evaluations on food-specific datasets such as Food-101 and VFN-74, as well as the general dataset ImageNet-100, demonstrate improvements in classification accuracy. This progress is pivotal in advancing more realistic food recognition systems that are capable of adapting to continually evolving data. Moreover, the principles and methodologies we've developed hold promise for broader applications, extending their benefits to other domains of continual machine learning systems.
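+ A minimal sketch of the compressed-exemplar idea: store exemplars in compressed form so more of them fit in a fixed byte budget. The paper learns a neural codec with adaptive bpp; the JPEG codec, byte budget, and quality setting below are only stand-ins to show the bookkeeping.
+import io
+from PIL import Image
+
+class CompressedExemplarBuffer:
+    """Fixed-byte-budget buffer of compressed exemplars. JPEG is a stand-in for
+    the paper's learned neural compression model."""
+    def __init__(self, budget_bytes=1_000_000, quality=40):
+        self.budget, self.quality, self.items = budget_bytes, quality, []
+
+    def used(self):
+        return sum(len(b) for b, _ in self.items)
+
+    def add(self, img: Image.Image, label: int) -> bool:
+        buf = io.BytesIO()
+        img.save(buf, format="JPEG", quality=self.quality)
+        data = buf.getvalue()
+        if self.used() + len(data) > self.budget:
+            return False  # budget exhausted; a real method would evict or lower bpp
+        self.items.append((data, label))
+        return True
+
+    def get(self, idx):
+        data, label = self.items[idx]
+        return Image.open(io.BytesIO(data)).convert("RGB"), label
+
+buffer = CompressedExemplarBuffer()
+buffer.add(Image.new("RGB", (224, 224), color=(120, 80, 40)), label=3)
+print(buffer.used(), "bytes used for", len(buffer.items), "exemplar(s)")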
+
+
+
+
+ + ☆ Mitigating Object Dependencies: Improving Point Cloud Self-Supervised + Learning through Object Exchange + + +
+ In the realm of point cloud scene understanding, particularly in indoor +scenes, objects are arranged following human habits, resulting in objects of +certain semantics being closely positioned and displaying notable inter-object +correlations. This can create a tendency for neural networks to exploit these +strong dependencies, bypassing the individual object patterns. To address this +challenge, we introduce a novel self-supervised learning (SSL) strategy. Our +approach leverages both object patterns and contextual cues to produce robust +features. It begins with the formulation of an object-exchanging strategy, +where pairs of objects with comparable sizes are exchanged across different +scenes, effectively disentangling the strong contextual dependencies. +Subsequently, we introduce a context-aware feature learning strategy, which +encodes object patterns without relying on their specific context by +aggregating object features across various scenes. Our extensive experiments +demonstrate the superiority of our method over existing SSL techniques, further +showing its better robustness to environmental changes. Moreover, we showcase +the applicability of our approach by transferring pre-trained models to diverse +point cloud datasets. + +
+
+
+
+
+ + ☆ PillarTrack: Redesigning Pillar-based Transformer Network for Single + Object Tracking on Point Clouds + + +
+ LiDAR-based 3D single object tracking (3D SOT) is a critical issue in robotics and autonomous driving. It aims to obtain an accurate 3D BBox from the search area based on similarity or motion. However, existing 3D SOT methods usually follow the point-based pipeline, where the sampling operation inevitably leads to redundant or lost information, resulting in unexpected performance. To address these issues, we propose PillarTrack, a pillar-based 3D single object tracking framework. Firstly, we transform sparse point clouds into dense pillars to preserve local and global geometry. Secondly, we introduce a Pyramid-type Encoding Pillar Feature Encoder (PE-PFE) design to improve the feature representation of each pillar. Thirdly, we present an efficient Transformer-based backbone from the perspective of modality differences. Finally, we construct our PillarTrack tracker based on the above designs. Extensive experiments on the KITTI and nuScenes datasets demonstrate the superiority of our proposed method. Notably, our method achieves state-of-the-art performance on the KITTI and nuScenes datasets and enables real-time tracking speed. We hope our work could encourage the community to rethink existing 3D SOT tracker designs. We will open-source our code to the research community at https://github.com/StiphyJay/PillarTrack.
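+ A minimal sketch of the pillarization step the abstract refers to: points are grouped into vertical x-y grid cells (pillars). The grid ranges and pillar size are hypothetical, and the paper's PE-PFE encoder that turns each pillar into a feature vector is not shown.
+import numpy as np
+
+def pillarize(points, x_range=(0.0, 51.2), y_range=(-25.6, 25.6), pillar=0.4):
+    """Group points into vertical x-y pillars; returns {grid cell: point indices}."""
+    mask = ((points[:, 0] >= x_range[0]) & (points[:, 0] < x_range[1]) &
+            (points[:, 1] >= y_range[0]) & (points[:, 1] < y_range[1]))
+    idx = np.flatnonzero(mask)
+    gx = ((points[idx, 0] - x_range[0]) / pillar).astype(np.int64)
+    gy = ((points[idx, 1] - y_range[0]) / pillar).astype(np.int64)
+    pillars = {}
+    for i, key in zip(idx, zip(gx.tolist(), gy.tolist())):
+        pillars.setdefault(key, []).append(int(i))
+    return pillars
+
+pts = np.random.uniform(low=[0, -25, -2, 0], high=[51, 25, 2, 1], size=(1000, 4))
+pillars = pillarize(pts)
+print(len(pillars), "non-empty pillars from", len(pts), "points")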
+
+
+
+
+ + ☆ Fine-Grained Side Information Guided Dual-Prompts for Zero-Shot Skeleton + Action Recognition + + +
+ Skeleton-based zero-shot action recognition aims to recognize unknown human actions based on the learned priors of the known skeleton-based actions and a semantic descriptor space shared by both known and unknown categories. However, previous works focus on establishing bridges between the known skeleton representation space and the semantic description space at the coarse-grained level for recognizing unknown action categories, ignoring the fine-grained alignment of these two spaces, which results in suboptimal performance in distinguishing high-similarity action categories. To address these challenges, we propose a novel method via Side information and dual-prompts learning for skeleton-based zero-shot action recognition (STAR) at the fine-grained level. Specifically, 1) we decompose the skeleton into several parts based on its topology structure and introduce the side information concerning multi-part descriptions of human body movements for alignment between the skeleton and the semantic space at the fine-grained level; 2) we design the visual-attribute and semantic-part prompts to improve the intra-class compactness within the skeleton space and inter-class separability within the semantic space, respectively, to distinguish the high-similarity actions. Extensive experiments show that our method achieves state-of-the-art performance in ZSL and GZSL settings on the NTU RGB+D, NTU RGB+D 120, and PKU-MMD datasets.
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ G-NeRF: Geometry-enhanced Novel View Synthesis from Single-View Images CVPR 2024 + + +
+ Novel view synthesis aims to generate new view images of a given view image collection. Recent attempts address this problem by relying on 3D geometry priors (e.g., shapes, sizes, and positions) learned from multi-view images. However, such methods encounter the following limitations: 1) they require a set of multi-view images as training data for a specific scene (e.g., face, car or chair), which is often unavailable in many real-world scenarios; 2) they fail to extract the geometry priors from single-view images due to the lack of multi-view supervision. In this paper, we propose a Geometry-enhanced NeRF (G-NeRF), which seeks to enhance the geometry priors by a geometry-guided multi-view synthesis approach, followed by depth-aware training. In the synthesis process, inspired by the observation that existing 3D GAN models can unconditionally synthesize high-fidelity multi-view images, we adopt off-the-shelf 3D GAN models, such as EG3D, as a free source of geometry priors by synthesizing multi-view data. Simultaneously, to further improve the geometry quality of the synthetic data, we introduce a truncation method to effectively sample latent codes within 3D GAN models. To tackle the absence of multi-view supervision for single-view images, we design the depth-aware training approach, incorporating a depth-aware discriminator to guide geometry priors through depth maps. Experiments demonstrate the effectiveness of our method in terms of both qualitative and quantitative results.
+
+ comment: CVPR 2024 Accepted Paper +
+
+
+
+
+ + ☆ LUCF-Net: Lightweight U-shaped Cascade Fusion Network for Medical Image + Segmentation + + +
+ In this study, the performance of existing U-shaped neural network architectures was enhanced for medical image segmentation by adding Transformer modules. Although Transformer architectures are powerful at extracting global information, their ability to capture local information is limited due to their high complexity. To address this challenge, we proposed a new lightweight U-shaped cascade fusion network (LUCF-Net) for medical image segmentation. It utilized an asymmetrical structural design and incorporated both local and global modules to enhance its capacity for local and global modeling. Additionally, a multi-layer cascade fusion decoding network was designed to further bolster the network's information fusion capabilities. Validation results achieved on multi-organ datasets in CT format, cardiac segmentation datasets in MRI format, and dermatology datasets in image format demonstrated that the proposed model outperformed other state-of-the-art methods in handling local-global information, achieving an improvement of 1.54% in Dice coefficient and 2.6 mm in Hausdorff distance on multi-organ segmentation. Furthermore, as a network that combines Convolutional Neural Network and Transformer architectures, it achieves competitive segmentation performance with only 6.93 million parameters and 6.6 giga floating-point operations (GFLOPs), without the need for pre-training. In summary, the proposed method demonstrated enhanced performance while retaining a simpler model design compared to other Transformer-based segmentation networks.
+
+
+
+
+ + ☆ Trashbusters: Deep Learning Approach for Litter Detection and Tracking + + +
+ The illegal disposal of trash is a major public health and environmental +concern. Disposing of trash in unplanned places poses serious health and +environmental risks. We should try to restrict public trash cans as much as +possible. This research focuses on automating the penalization of litterbugs, +addressing the persistent problem of littering in public places. Traditional +approaches relying on manual intervention and witness reporting suffer from +delays, inaccuracies, and anonymity issues. To overcome these challenges, this +paper proposes a fully automated system that utilizes surveillance cameras and +advanced computer vision algorithms for litter detection, object tracking, and +face recognition. The system accurately identifies and tracks individuals +engaged in littering activities, attaches their identities through face +recognition, and enables efficient enforcement of anti-littering policies. By +reducing reliance on manual intervention, minimizing human error, and providing +prompt identification, the proposed system offers significant advantages in +addressing littering incidents. The primary contribution of this research lies +in the implementation of the proposed system, leveraging advanced technologies +to enhance surveillance operations and automate the penalization of litterbugs. + +
+
+
+
+
+ + ☆ Learning to Localize Objects Improves Spatial Reasoning in Visual-LLMs + + +
+ Integration of Large Language Models (LLMs) into visual domain tasks, +resulting in visual-LLMs (V-LLMs), has enabled exceptional performance in +vision-language tasks, particularly for visual question answering (VQA). +However, existing V-LLMs (e.g. BLIP-2, LLaVA) demonstrate weak spatial +reasoning and localization awareness. Despite generating highly descriptive and +elaborate textual answers, these models fail at simple tasks like +distinguishing a left vs right location. In this work, we explore how +image-space coordinate based instruction fine-tuning objectives could inject +spatial awareness into V-LLMs. We discover optimal coordinate representations, +data-efficient instruction fine-tuning objectives, and pseudo-data generation +strategies that lead to improved spatial awareness in V-LLMs. Additionally, our +resulting model improves VQA across image and video domains, reduces undesired +hallucination, and generates better contextual object descriptions. Experiments +across 5 vision-language tasks involving 14 different datasets establish the +clear performance improvements achieved by our proposed framework. + +
+
+
+
+
+ + ☆ Transferable and Principled Efficiency for Open-Vocabulary Segmentation + + +
+ Recent success of pre-trained foundation vision-language models makes +Open-Vocabulary Segmentation (OVS) possible. Despite the promising performance, +this approach introduces heavy computational overheads for two challenges: 1) +large model sizes of the backbone; 2) expensive costs during the fine-tuning. +These challenges hinder this OVS strategy from being widely applicable and +affordable in real-world scenarios. Although traditional methods such as model +compression and efficient fine-tuning can address these challenges, they often +rely on heuristics. This means that their solutions cannot be easily +transferred and necessitate re-training on different models, which comes at a +cost. In the context of efficient OVS, we target achieving performance that is +comparable to or even better than prior OVS works based on large +vision-language foundation models, by utilizing smaller models that incur lower +training costs. The core strategy is to make our efficiency principled and thus +seamlessly transferable from one OVS framework to others without further +customization. Comprehensive experiments on diverse OVS benchmarks demonstrate +our superior trade-off between segmentation accuracy and computation costs over +previous works. Our code is available on https://github.com/Xujxyang/OpenTrans + +
+
+
+
+
+ + ☆ Multi-view Aggregation Network for Dichotomous Image Segmentation CVPR2024 + + +
+ Dichotomous Image Segmentation (DIS) has recently emerged for high-precision object segmentation from high-resolution natural images. When designing an effective DIS model, the main challenge is how to balance the semantic dispersion of high-resolution targets in the small receptive field and the loss of high-precision details in the large receptive field. Existing methods rely on tedious multiple encoder-decoder streams and stages to gradually complete the global localization and local refinement. The human visual system captures regions of interest by observing them from multiple views. Inspired by this, we model DIS as a multi-view object perception problem and provide a parsimonious multi-view aggregation network (MVANet), which unifies the feature fusion of the distant view and close-up view into a single stream with one encoder-decoder structure. With the help of the proposed multi-view complementary localization and refinement modules, our approach establishes long-range, profound visual interactions across multiple views, allowing the features of the detailed close-up view to focus on highly slender structures. Experiments on the popular DIS-5K dataset show that our MVANet significantly outperforms state-of-the-art methods in both accuracy and speed. The source code and datasets will be publicly available at https://github.com/qianyu-dlut/MVANet.
+
+ comment: Accepted by CVPR2024 as Highlight +
+
+
+
+
+ + ☆ Encoding Urban Ecologies: Automated Building Archetype Generation + through Self-Supervised Learning for Energy Modeling + + +
+ As the global population and urbanization expand, the building sector has +emerged as the predominant energy consumer and carbon emission contributor. The +need for innovative Urban Building Energy Modeling grows, yet existing building +archetypes often fail to capture the unique attributes of local buildings and +the nuanced distinctions between different cities, jeopardizing the precision +of energy modeling. This paper presents an alternative tool employing +self-supervised learning to distill complex geometric data into representative, +locale-specific archetypes. This study attempts to foster a new paradigm of +interaction with built environments, incorporating local parameters to conduct +bespoke energy simulations at the community level. The catered archetypes can +augment the precision and applicability of energy consumption modeling at +different scales across diverse building inventories. This tool provides a +potential solution that encourages the exploration of emerging local ecologies. +By integrating building envelope characteristics and cultural granularity into +the building archetype generation process, we seek a future where architecture +and urban design are intricately interwoven with the energy sector in shaping +our built environments. + +
+
+
+
+
+ + ☆ CopilotCAD: Empowering Radiologists with Report Completion Models and + Quantitative Evidence from Medical Image Foundation Models + + +
+ Computer-aided diagnosis systems hold great promise to aid radiologists and +clinicians in radiological clinical practice and enhance diagnostic accuracy +and efficiency. However, the conventional systems primarily focus on delivering +diagnostic results through text report generation or medical image +classification, positioning them as standalone decision-makers rather than +helpers and ignoring radiologists' expertise. This study introduces an +innovative paradigm to create an assistive co-pilot system for empowering +radiologists by leveraging Large Language Models (LLMs) and medical image +analysis tools. Specifically, we develop a collaborative framework to integrate +LLMs and quantitative medical image analysis results generated by foundation +models with radiologists in the loop, achieving efficient and safe generation +of radiology reports and effective utilization of computational power of AI and +the expertise of medical professionals. This approach empowers radiologists to +generate more precise and detailed diagnostic reports, enhancing patient +outcomes while reducing the burnout of clinicians. Our methodology underscores +the potential of AI as a supportive tool in medical diagnostics, promoting a +harmonious integration of technology and human expertise to advance the field +of radiology. + +
+
+
+
+
+ + ☆ Improving Shift Invariance in Convolutional Neural Networks with + Translation Invariant Polyphase Sampling + + +
+ Downsampling operators break the shift invariance of convolutional neural +networks (CNNs) and this affects the robustness of features learned by CNNs +when dealing with even small pixel-level shift. Through a large-scale +correlation analysis framework, we study shift invariance of CNNs by inspecting +existing downsampling operators in terms of their maximum-sampling bias (MSB), +and find that MSB is negatively correlated with shift invariance. Based on this +crucial insight, we propose a learnable pooling operator called Translation +Invariant Polyphase Sampling (TIPS) and two regularizations on the intermediate +feature maps of TIPS to reduce MSB and learn translation-invariant +representations. TIPS can be integrated into any CNN and can be trained +end-to-end with marginal computational overhead. Our experiments demonstrate +that TIPS results in consistent performance gains in terms of accuracy, shift +consistency, and shift fidelity on multiple benchmarks for image classification +and semantic segmentation compared to previous methods and also leads to +improvements in adversarial and distributional robustness. TIPS results in the +lowest MSB compared to all previous methods, thus explaining our strong +empirical results. + +
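+ A minimal sketch of polyphase downsampling, the building block behind TIPS: the feature map is split into stride-by-stride phase components and one component is kept. The norm-based selection shown here mirrors earlier adaptive polyphase sampling; TIPS instead learns the selection, so this is not the paper's pooling operator.
+import torch
+
+def polyphase_downsample(x, stride=2):
+    """Stride-s downsampling via polyphase decomposition: split the feature map
+    into s*s phase components and keep the one with the largest L2 norm."""
+    b, c, h, w = x.shape
+    phases = [x[:, :, i::stride, j::stride] for i in range(stride) for j in range(stride)]
+    phases = torch.stack(phases, dim=1)              # (B, s*s, C, H/s, W/s)
+    norms = phases.flatten(2).norm(dim=-1)           # (B, s*s)
+    best = norms.argmax(dim=1)                       # (B,)
+    return phases[torch.arange(b), best]             # (B, C, H/s, W/s)
+
+x = torch.randn(2, 16, 32, 32)
+print(polyphase_downsample(x).shape)                 # torch.Size([2, 16, 16, 16])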
+
+
+
+
+ + ☆ Simplifying Two-Stage Detectors for On-Device Inference in Remote + Sensing + + +
+ Deep learning has been successfully applied to object detection from remotely sensed images. Images are typically processed on the ground rather than on-board due to the computation power of the ground system. Such offloaded processing causes delays in acquiring target mission information, which hinders its application to real-time use cases. For on-device object detection, research has been conducted on designing efficient detectors or applying model compression to reduce inference latency. However, highly accurate two-stage detectors still require further work on acceleration. In this paper, we propose a model simplification method for two-stage object detectors. Instead of constructing a general feature pyramid, we utilize only one feature extraction in the two-stage detector. To compensate for the accuracy drop, we apply a high pass filter to the RPN's score map. Our approach is applicable to any two-stage detector using a feature pyramid network. In the experiments with state-of-the-art two-stage detectors such as ReDet, Oriented-RCNN, and LSKNet, our method reduced computation costs by up to 61.2% with an accuracy loss within 2.1% on the DOTAv1.5 dataset. Source code will be released.
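+ A minimal sketch of applying a high-pass filter to an RPN objectness map, as the abstract describes; the Laplacian kernel and the residual-style combination are assumptions, since the paper's exact filter is not given here.
+import torch
+import torch.nn.functional as F
+
+def highpass_score_map(score_map):
+    """Emphasise local peaks of an RPN objectness map with a Laplacian high-pass
+    filter (the specific kernel is an assumption, not the paper's exact filter).
+    score_map: (B, 1, H, W) objectness scores."""
+    kernel = torch.tensor([[0., -1., 0.],
+                           [-1., 4., -1.],
+                           [0., -1., 0.]], device=score_map.device).view(1, 1, 3, 3)
+    highpass = F.conv2d(score_map, kernel, padding=1)
+    return score_map + highpass  # boost high-frequency responses
+
+scores = torch.rand(1, 1, 64, 64)
+print(highpass_score_map(scores).shape)  # torch.Size([1, 1, 64, 64])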
+
+
+
+
+ + ☆ Post-hurricane building damage assessment using street-view imagery and + structured data: A multi-modal deep learning approach + + +
+ Accurately assessing building damage is critical for disaster response and recovery. However, many existing models for detecting building damage have poor prediction accuracy due to their limited capabilities of identifying detailed, comprehensive structural and/or non-structural damage from street-view images. Additionally, these models mainly rely on the imagery data for damage classification, failing to account for other critical information, such as wind speed, building characteristics, evacuation zones, and the distance of the building to the hurricane track. To address these limitations, in this study, we propose a novel multi-modal (i.e., imagery and structured data) approach for post-hurricane building damage classification, named the Multi-Modal Swin Transformer (MMST). We empirically train and evaluate the proposed MMST using data collected from the 2022 Hurricane Ian in Florida, USA. Results show that MMST outperforms all selected state-of-the-art benchmark models and can achieve an accuracy of 92.67%, a 7.71% improvement in accuracy over Visual Geometry Group 16 (VGG-16). In addition to the street-view imagery data, building value, building age, and wind speed are the most important predictors for damage level classification. The proposed MMST can be deployed to assist in rapid damage assessment and guide reconnaissance efforts in future hurricanes.
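+ A minimal sketch of late fusion between image features and structured predictors such as wind speed, building value, and age; the ResNet-18 backbone (instead of the paper's Swin Transformer), the head dimensions, and the number of tabular features are illustrative assumptions.
+import torch
+import torch.nn as nn
+import torchvision.models as models
+
+class MultiModalDamageClassifier(nn.Module):
+    """Illustrative late-fusion model: image backbone features concatenated with
+    structured predictors, then an MLP head (not the paper's MMST architecture)."""
+    def __init__(self, num_tabular=5, num_classes=4):
+        super().__init__()
+        backbone = models.resnet18(weights=None)
+        backbone.fc = nn.Identity()              # expose 512-d image features
+        self.backbone = backbone
+        self.head = nn.Sequential(
+            nn.Linear(512 + num_tabular, 128), nn.ReLU(inplace=True),
+            nn.Linear(128, num_classes),
+        )
+
+    def forward(self, image, tabular):
+        feats = self.backbone(image)             # (B, 512)
+        return self.head(torch.cat([feats, tabular], dim=1))
+
+model = MultiModalDamageClassifier()
+logits = model(torch.randn(2, 3, 224, 224), torch.randn(2, 5))
+print(logits.shape)  # torch.Size([2, 4])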
+
+
+
+
+ + ☆ Global versus Local: Evaluating AlexNet Architectures for Tropical + Cyclone Intensity Estimation + + +
+ Given the destructive impacts of tropical cyclones, it is critical to have a reliable system for cyclone intensity detection. Various techniques are available for this purpose, each with differing levels of accuracy. In this paper, we introduce two ensemble-based models based on the AlexNet architecture to estimate tropical cyclone intensity using visible satellite images. The first model, trained on the entire dataset, is called the global AlexNet model. The second model is a distributed version of AlexNet in which multiple AlexNets are trained separately on subsets of the training data categorized according to the Saffir-Simpson wind speed scale prescribed by meteorologists. We evaluated the performance of both models against a deep learning benchmark model called Deepti using a publicly available cyclone image dataset. Results indicate that both the global model (with a root mean square error (RMSE) of 9.03 knots) and the distributed model (with an RMSE of 9.3 knots) outperform the benchmark model (with an RMSE of 13.62 knots). We provide a thorough discussion of our solution approach, including an explanation of the AlexNet models' performance using gradient class activation maps (Grad-CAM). Our proposed solution strategy allows future experimentation with various deep learning models in both single and multi-channel settings.
+
+
+
+
+ + ☆ SciFlow: Empowering Lightweight Optical Flow Models with Self-Cleaning + Iterations CVPR + + +
+ Optical flow estimation is crucial to a variety of vision tasks. Despite +substantial recent advancements, achieving real-time on-device optical flow +estimation remains a complex challenge. First, an optical flow model must be +sufficiently lightweight to meet computation and memory constraints to ensure +real-time performance on devices. Second, the necessity for real-time on-device +operation imposes constraints that weaken the model's capacity to adequately +handle ambiguities in flow estimation, thereby intensifying the difficulty of +preserving flow accuracy. This paper introduces two synergistic techniques, +Self-Cleaning Iteration (SCI) and Regression Focal Loss (RFL), designed to +enhance the capabilities of optical flow models, with a focus on addressing +optical flow regression ambiguities. These techniques prove particularly +effective in mitigating error propagation, a prevalent issue in optical flow +models that employ iterative refinement. Notably, these techniques add +negligible to zero overhead in model parameters and inference latency, thereby +preserving real-time on-device efficiency. The effectiveness of our proposed +SCI and RFL techniques, collectively referred to as SciFlow for brevity, is +demonstrated across two distinct lightweight optical flow model architectures +in our experiments. Remarkably, SciFlow enables substantial reduction in error +metrics (EPE and Fl-all) over the baseline models by up to 6.3% and 10.5% for +in-domain scenarios and by up to 6.2% and 13.5% for cross-domain scenarios on +the Sintel and KITTI 2015 datasets, respectively. + +
+
+ comment: CVPRW 2024 +
+
+
+
+
+ + ☆ Self-Supervised Learning of Color Constancy + + +
+ Color constancy (CC) describes the ability of the visual system to perceive +an object as having a relatively constant color despite changes in lighting +conditions. While CC and its limitations have been carefully characterized in +humans, it is still unclear how the visual system acquires this ability during +development. Here, we present a first study showing that CC develops in a +neural network trained in a self-supervised manner through an invariance +learning objective. During learning, objects are presented under changing +illuminations, while the network aims to map subsequent views of the same +object onto close-by latent representations. This gives rise to representations +that are largely invariant to the illumination conditions, offering a plausible +example of how CC could emerge during human cognitive development via a form of +self-supervised learning. + +
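+ A minimal sketch of the kind of invariance objective the abstract describes: embeddings of the same object under two illuminations are pulled together via negative cosine similarity. The exact loss and augmentations used in the paper are not specified here, so this is only a stand-in.
+import torch
+import torch.nn.functional as F
+
+def illumination_invariance_loss(z_view_a, z_view_b):
+    """Pull embeddings of the same object under two illuminations together
+    (one minus the mean cosine similarity); a stand-in for the paper's
+    self-supervised invariance objective."""
+    z_a = F.normalize(z_view_a, dim=-1)
+    z_b = F.normalize(z_view_b, dim=-1)
+    return 1.0 - (z_a * z_b).sum(dim=-1).mean()
+
+# Two batches of embeddings of the same objects under different lighting.
+loss = illumination_invariance_loss(torch.randn(32, 128), torch.randn(32, 128))
+print(loss.item())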
+
+ comment: 7 pages, 5 figures, submitted to the IEEE International Conference on + Development and Learning (ICDL 2024) +
+
+
+
+
+ + ☆ S3Editor: A Sparse Semantic-Disentangled Self-Training Framework for + Face Video Editing + + +
+ Face attribute editing plays a pivotal role in various applications. However, +existing methods encounter challenges in achieving high-quality results while +preserving identity, editing faithfulness, and temporal consistency. These +challenges are rooted in issues related to the training pipeline, including +limited supervision, architecture design, and optimization strategy. In this +work, we introduce S3Editor, a Sparse Semantic-disentangled Self-training +framework for face video editing. S3Editor is a generic solution that +comprehensively addresses these challenges with three key contributions. +Firstly, S3Editor adopts a self-training paradigm to enhance the training +process through semi-supervision. Secondly, we propose a semantic disentangled +architecture with a dynamic routing mechanism that accommodates diverse editing +requirements. Thirdly, we present a structured sparse optimization schema that +identifies and deactivates malicious neurons to further disentangle impacts +from untarget attributes. S3Editor is model-agnostic and compatible with +various editing approaches. Our extensive qualitative and quantitative results +affirm that our approach significantly enhances identity preservation, editing +fidelity, as well as temporal consistency. + +
+
+
+
+
+ + ☆ Visual Context-Aware Person Fall Detection + + +
+ As the global population ages, the number of fall-related incidents is on the rise. Effective fall detection systems, specifically in the healthcare sector, are crucial to mitigate the risks associated with such events. This study evaluates the role of visual context, including background objects, on the accuracy of fall detection classifiers. We present a segmentation pipeline to semi-automatically separate individuals and objects in images. Well-established models like ResNet-18, EfficientNetV2-S, and Swin-Small are trained and evaluated. During training, pixel-based transformations are applied to segmented objects, and the models are then evaluated on raw images without segmentation. Our findings highlight the significant influence of visual context on fall detection. The application of Gaussian blur to the image background notably improves the performance and generalization capabilities of all models. Background objects such as beds, chairs, or wheelchairs can challenge fall detection systems, leading to false positive alarms. However, we demonstrate that object-specific contextual transformations during training effectively mitigate this challenge. Further analysis using saliency maps supports our observation that visual context is crucial in classification tasks. We release both a dataset processing API and a segmentation pipeline, available at https://github.com/A-NGJ/image-segmentation-cli.
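+ A minimal sketch of the background-blurring transformation the study reports as beneficial: everything outside a person mask is Gaussian-blurred while the segmented person stays sharp. The blur radius and the synthetic mask below are illustrative values, not the paper's settings.
+import numpy as np
+from PIL import Image, ImageFilter
+
+def blur_background(image: Image.Image, person_mask: np.ndarray, radius=15):
+    """Blur everything outside the person mask (boolean HxW array, True = person)."""
+    blurred = np.asarray(image.filter(ImageFilter.GaussianBlur(radius)), dtype=np.uint8)
+    sharp = np.asarray(image, dtype=np.uint8)
+    mask3 = np.repeat(person_mask[:, :, None], 3, axis=2)
+    return Image.fromarray(np.where(mask3, sharp, blurred))
+
+img = Image.new("RGB", (128, 96), color=(200, 180, 160))
+mask = np.zeros((96, 128), dtype=bool)
+mask[20:80, 40:90] = True  # hypothetical person region
+blur_background(img, mask).save("context_blurred.png")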
+
+ comment: 10 pages, 6 figures, KES IDT-24 conference +
+
+
+
+
+ + ☆ Real-Time Detection and Analysis of Vehicles and Pedestrians using Deep + Learning + + +
+ Computer vision, particularly vehicle and pedestrian identification, is critical to the evolution of autonomous driving, artificial intelligence, and video surveillance. Current traffic monitoring systems confront major difficulties in recognizing small objects and pedestrians effectively in real time, posing a serious risk to public safety and contributing to traffic inefficiency. Recognizing these difficulties, our project focuses on the creation and validation of an advanced deep-learning framework capable of processing complex visual input for precise, real-time recognition of cars and people in a variety of environmental situations. On a dataset representing complicated urban settings, we trained and evaluated different versions of the YOLOv8 and RT-DETR models. The YOLOv8 Large version proved to be the most effective, especially in pedestrian recognition, with great precision and robustness. The results, which include Mean Average Precision and recall rates, demonstrate the model's ability to dramatically improve traffic monitoring and safety. This study makes an important addition to real-time, reliable detection in computer vision, establishing new benchmarks for traffic management systems.
+
+ comment: 5 pages, 2 figures +
+
+
+
+
+ + ☆ DIMAT: Decentralized Iterative Merging-And-Training for Deep Learning + Models CVPR 2024 + + +
+ Recent advances in decentralized deep learning algorithms have demonstrated cutting-edge performance on various tasks with large pre-trained models. However, achieving this level of competitiveness incurs significant communication and computation overheads when updating these models, which prohibits their application to real-world scenarios. To address this issue, drawing inspiration from advanced model merging techniques without requiring additional training, we introduce the Decentralized Iterative Merging-And-Training (DIMAT) paradigm, a novel decentralized deep learning framework. Within DIMAT, each agent is trained on its local data and periodically merged with its neighboring agents using advanced model merging techniques like activation matching until convergence is achieved. DIMAT provably converges with the best available rate for nonconvex functions with various first-order methods, while yielding tighter error bounds compared to popular existing approaches. We conduct a comprehensive empirical analysis to validate DIMAT's superiority over baselines across diverse computer vision tasks sourced from multiple datasets. Empirical results validate our theoretical claims by showing that DIMAT attains a faster and higher initial gain in accuracy with independent and identically distributed (IID) and non-IID data, incurring lower communication overhead. This DIMAT paradigm presents a new opportunity for future decentralized learning, enhancing its adaptability to real-world settings with sparse and lightweight communication and computation.
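+ A minimal sketch of one decentralized merge round, simplified to plain parameter averaging over each agent's neighbourhood; the paper uses stronger merging techniques such as activation matching, and the adjacency structure below is hypothetical.
+import copy
+import torch
+import torch.nn as nn
+
+def merge_with_neighbors(models, adjacency):
+    """One merge round: every agent averages its parameters with its neighbours'
+    (adjacency[i] lists the neighbours of agent i, including i itself)."""
+    snapshots = [copy.deepcopy(m.state_dict()) for m in models]
+    for i, model in enumerate(models):
+        merged = {}
+        for name in snapshots[i]:
+            merged[name] = torch.stack([snapshots[j][name].float()
+                                        for j in adjacency[i]]).mean(dim=0)
+        model.load_state_dict(merged)
+
+agents = [nn.Linear(8, 2) for _ in range(3)]
+merge_with_neighbors(agents, adjacency={0: [0, 1], 1: [0, 1, 2], 2: [1, 2]})
+print("post-merge weight norm per agent:",
+      [round(a.weight.norm().item(), 3) for a in agents])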
+
+ comment: CVPR 2024 accepted paper, 22 pages, 12 figures +
+
+
+
+
+ + ☆ Latent Guard: a Safety Framework for Text-to-image Generation + + +
+ With the ability to generate high-quality images, text-to-image (T2I) models +can be exploited for creating inappropriate content. To prevent misuse, +existing safety measures are either based on text blacklists, which can be +easily circumvented, or harmful content classification, requiring large +datasets for training and offering low flexibility. Hence, we propose Latent +Guard, a framework designed to improve safety measures in text-to-image +generation. Inspired by blacklist-based approaches, Latent Guard learns a +latent space on top of the T2I model's text encoder, where it is possible to +check the presence of harmful concepts in the input text embeddings. Our +proposed framework is composed of a data generation pipeline specific to the +task using large language models, ad-hoc architectural components, and a +contrastive learning strategy to benefit from the generated data. The +effectiveness of our method is verified on three datasets and against four +baselines. Code and data will be shared at +https://github.com/rt219/LatentGuard. + +
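+ A minimal sketch of the check the abstract describes: the prompt embedding is compared against embeddings of blacklisted concepts and flagged when any similarity exceeds a threshold. Plain cosine similarity and the 0.75 threshold are placeholders; Latent Guard performs this check in a latent space learned contrastively on top of the text encoder.
+import torch
+import torch.nn.functional as F
+
+def is_prompt_flagged(prompt_embedding, concept_embeddings, threshold=0.75):
+    """Flag a prompt if its embedding is close to any blacklisted-concept embedding."""
+    sims = F.cosine_similarity(concept_embeddings,
+                               prompt_embedding.unsqueeze(0), dim=-1)  # (num_concepts,)
+    return bool((sims > threshold).any()), sims.max().item()
+
+flagged, score = is_prompt_flagged(torch.randn(512), torch.randn(20, 512))
+print(flagged, round(score, 3))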
+
+ comment: under review +
+
+
+
+
+ + ☆ Rethinking Artistic Copyright Infringements in the Era of Text-to-Image + Generative Models + + +
+ Recent text-to-image generative models such as Stable Diffusion are extremely adept at mimicking and generating copyrighted content, raising concerns amongst artists that their unique styles may be improperly copied. Understanding how generative models copy "artistic style" is more complex than duplicating a single image, as style is composed of a set of elements (or a signature) that frequently co-occur across a body of work, where each individual work may vary significantly. In our paper, we first reformulate the problem of "artistic copyright infringement" as a classification problem over image sets, instead of probing image-wise similarities. We then introduce ArtSavant, a practical (i.e., efficient and easy to understand) tool to (i) determine the unique style of an artist by comparing it to a reference dataset of works from 372 artists curated from WikiArt, and (ii) recognize if the identified style reappears in generated images. We leverage two complementary methods to perform artistic style classification over image sets, including TagMatch, which is a novel, inherently interpretable and attributable method, making it more suitable for broader use by non-technical stakeholders (artists, lawyers, judges, etc.). Leveraging ArtSavant, we then perform a large-scale empirical study to provide quantitative insight on the prevalence of artistic style copying across 3 popular text-to-image generative models. Namely, amongst a dataset of prolific artists (including many famous ones), only 20% of them appear to have their styles at risk of copying via simple prompting of today's popular text-to-image generative models.
+
+
+
+
+ + ☆ SurvMamba: State Space Model with Multi-grained Multi-modal Interaction + for Survival Prediction + + +
+ Multi-modal learning that combines pathological images with genomic data has +significantly enhanced the accuracy of survival prediction. Nevertheless, +existing methods have not fully utilized the inherent hierarchical structure +within both whole slide images (WSIs) and transcriptomic data, from which +better intra-modal representations and inter-modal integration could be +derived. Moreover, many existing studies attempt to improve multi-modal +representations through attention mechanisms, which inevitably lead to high +complexity when processing high-dimensional WSIs and transcriptomic data. +Recently, a structured state space model named Mamba emerged as a promising +approach for its superior performance in modeling long sequences with low +complexity. In this study, we propose Mamba with multi-grained multi-modal +interaction (SurvMamba) for survival prediction. SurvMamba is implemented with +a Hierarchical Interaction Mamba (HIM) module that facilitates efficient +intra-modal interactions at different granularities, thereby capturing more +detailed local features as well as rich global representations. In addition, an +Interaction Fusion Mamba (IFM) module is used for cascaded inter-modal +interactive fusion, yielding more comprehensive features for survival +prediction. Comprehensive evaluations on five TCGA datasets demonstrate that +SurvMamba outperforms other existing methods in terms of performance and +computational cost. + +
+
+
+
+
+ + ☆ Synthetic Brain Images: Bridging the Gap in Brain Mapping With + Generative Adversarial Model + + +
+ Magnetic Resonance Imaging (MRI) is a vital modality for gaining precise anatomical information, and it plays a significant role in medical imaging for diagnosis and therapy planning. Image synthesis problems have seen a revolution in recent years due to the introduction of deep learning techniques, specifically Generative Adversarial Networks (GANs). This work investigates the use of Deep Convolutional Generative Adversarial Networks (DCGAN) for producing high-fidelity and realistic MRI image slices. The suggested approach uses a dataset with a variety of brain MRI scans to train a DCGAN architecture. While the discriminator network discerns between created and real slices, the generator network learns to synthesise realistic MRI image slices. The generator refines its capacity to generate slices that closely mimic real MRI data through an adversarial training approach. The outcomes demonstrate that the DCGAN holds promise for a range of uses in medical imaging research, since it can effectively produce realistic MRI image slices when trained for a sufficient number of epochs. This work adds to the expanding corpus of research on the application of deep learning techniques for medical image synthesis. The generated slices can enhance datasets and provide data augmentation for training deep learning models; in addition, a number of utilities are made available to ease MRI data cleaning, along with three ready-to-use, cleaned datasets covering the major anatomical planes.
+
+
+
+
+ + ♻ ☆ Supervised Fine-tuning in turn Improves Visual Foundation Models + + +
+ Image-text training like CLIP has dominated the pretraining of vision +foundation models in recent years. Subsequent efforts have been made to +introduce region-level visual learning into CLIP's pretraining but face +scalability challenges due to the lack of large-scale region-level datasets. +Drawing inspiration from supervised fine-tuning (SFT) in natural language +processing such as instruction tuning, we explore the potential of fine-grained +SFT in enhancing the generation of vision foundation models after their +pretraining. Thus a two-stage method ViSFT (Vision SFT) is proposed to unleash +the fine-grained knowledge of vision foundation models. In ViSFT, the vision +foundation model is enhanced by performing visual joint learning on some +in-domain tasks and then tested on out-of-domain benchmarks. With updating +using ViSFT on 8 V100 GPUs in less than 2 days, a vision transformer with over +4.4B parameters shows improvements across various out-of-domain benchmarks +including vision and vision-linguistic scenarios. + +
+
+ comment: 23 pages, 3 figures, Project page: + https://github.com/TencentARC/ViSFT/tree/main +
+
+
+
+
+ + ♻ ☆ Low-Resource Vision Challenges for Foundation Models CVPR2024 + + +
+ Low-resource settings are well-established in natural language processing, +where many languages lack sufficient data for deep learning at scale. However, +low-resource problems are under-explored in computer vision. In this paper, we +address this gap and explore the challenges of low-resource image tasks with +vision foundation models. We first collect a benchmark of genuinely +low-resource image data, covering historic maps, circuit diagrams, and +mechanical drawings. These low-resource settings all share three challenges: +data scarcity, fine-grained differences, and the distribution shift from +natural images to the specialized domain of interest. While existing foundation +models have shown impressive generalizability, we find they cannot transfer +well to our low-resource tasks. To begin to tackle the challenges of +low-resource vision, we introduce one simple baseline per challenge. +Specifically, we i) enlarge the data space by generative models, ii) adopt the +best sub-kernels to encode local regions for fine-grained difference discovery +and iii) learn attention for specialized domains. Experiments on our three +low-resource tasks demonstrate our proposals already provide a better baseline +than transfer learning, data augmentation, and fine-grained methods. This +highlights the unique characteristics and challenges of low-resource vision for +foundation models that warrant further investigation. Project page: +https://xiaobai1217.github.io/Low-Resource-Vision/. + +
+
+ comment: Accepted at CVPR2024 +
+
+
+
+
+ + ♻ ☆ EgoGen: An Egocentric Synthetic Data Generator CVPR 2024 + + +
+ Understanding the world in first-person view is fundamental in Augmented +Reality (AR). This immersive perspective brings dramatic visual changes and +unique challenges compared to third-person views. Synthetic data has empowered +third-person-view vision models, but its application to embodied egocentric +perception tasks remains largely unexplored. A critical challenge lies in +simulating natural human movements and behaviors that effectively steer the +embodied cameras to capture a faithful egocentric representation of the 3D +world. To address this challenge, we introduce EgoGen, a new synthetic data +generator that can produce accurate and rich ground-truth training data for +egocentric perception tasks. At the heart of EgoGen is a novel human motion +synthesis model that directly leverages egocentric visual inputs of a virtual +human to sense the 3D environment. Combined with collision-avoiding motion +primitives and a two-stage reinforcement learning approach, our motion +synthesis model offers a closed-loop solution where the embodied perception and +movement of the virtual human are seamlessly coupled. Compared to previous +works, our model eliminates the need for a pre-defined global path, and is +directly applicable to dynamic environments. Combined with our easy-to-use and +scalable data generation pipeline, we demonstrate EgoGen's efficacy in three +tasks: mapping and localization for head-mounted cameras, egocentric camera +tracking, and human mesh recovery from egocentric views. EgoGen will be fully +open-sourced, offering a practical solution for creating realistic egocentric +training data and aiming to serve as a useful tool for egocentric computer +vision research. Refer to our project page: https://ego-gen.github.io/. + +
+
+ comment: Accepted by CVPR 2024 (Oral). 23 pages, 17 figures. Project page: + https://ego-gen.github.io/ +
+
+
+
+
+ + ♻ ☆ MambaAD: Exploring State Space Models for Multi-class Unsupervised + Anomaly Detection + + +
+ Recent advancements in anomaly detection have seen the efficacy of CNN- and transformer-based approaches. However, CNNs struggle with long-range dependencies, while transformers are burdened by quadratic computational complexity. Mamba-based models, with their superior long-range modeling and linear efficiency, have garnered substantial attention. This study pioneers the application of Mamba to multi-class unsupervised anomaly detection, presenting MambaAD, which consists of a pre-trained encoder and a Mamba decoder featuring Locality-Enhanced State Space (LSS) modules at multiple scales. The proposed LSS module, integrating parallel cascaded Hybrid State Space (HSS) blocks and multi-kernel convolution operations, effectively captures both long-range and local information. The HSS block, utilizing Hybrid Scanning (HS) encoders, encodes feature maps with five scanning methods and eight directions, thereby strengthening global connections through the State Space Model (SSM). The use of Hilbert scanning and eight directions significantly improves feature sequence modeling. Comprehensive experiments on six diverse anomaly detection datasets and seven metrics demonstrate state-of-the-art performance, substantiating the method's effectiveness.
+
+
+
+
+ + ♻ ☆ Uncertainty-aware Evidential Fusion-based Learning for Semi-supervised + Medical Image Segmentation + + +
+ Although the existing uncertainty-based semi-supervised medical segmentation methods have achieved excellent performance, they usually only consider a single uncertainty evaluation, which often fails to solve the problem related to credibility completely. Therefore, based on the framework of evidential deep learning, this paper integrates the evidential predictive results in the cross-region of mixed and original samples to reallocate the confidence degree and uncertainty measure of each voxel, which is realized by emphasizing uncertain information in the probability assignment fusion rule of traditional evidence theory. Furthermore, we design a voxel-level asymptotic learning strategy by introducing information entropy to combine with the fused uncertainty measure to estimate voxel prediction more precisely. The model gradually pays attention to the prediction results with high uncertainty in the learning process, to learn the features that are difficult to master. The experimental results on the LA, Pancreas-CT, ACDC and TBAD datasets demonstrate the superior performance of our proposed method in comparison with existing state-of-the-art methods.
+
+
+
+
+ + ♻ ☆ Deep Learning for Event-based Vision: A Comprehensive Survey and + Benchmarks + + +
+ Event cameras are bio-inspired sensors that capture the per-pixel intensity +changes asynchronously and produce event streams encoding the time, pixel +position, and polarity (sign) of the intensity changes. Event cameras possess a +myriad of advantages over canonical frame-based cameras, such as high temporal +resolution, high dynamic range, low latency, etc. Being capable of capturing +information in challenging visual conditions, event cameras have the potential +to overcome the limitations of frame-based cameras in the computer vision and +robotics community. In very recent years, deep learning (DL) has been brought +to this emerging field and inspired active research endeavors in mining its +potential. However, there is still a lack of taxonomies in DL techniques for +event-based vision. We first scrutinize the typical event representations with +quality enhancement methods as they play a pivotal role as inputs to the DL +models. We then provide a comprehensive survey of existing DL-based methods by +structurally grouping them into two major categories: 1) image/video +reconstruction and restoration; 2) event-based scene understanding and 3D +vision. We conduct benchmark experiments for the existing methods in some +representative research directions, i.e., image reconstruction, deblurring, and +object recognition, to identify some critical insights and problems. Finally, +we have discussions regarding the challenges and provide new perspectives for +inspiring more research studies. + +
+
+
+
+
+ + ♻ ☆ MoCha-Stereo: Motif Channel Attention Network for Stereo Matching CVPR 2024 + + +
+ Learning-based stereo matching techniques have made significant progress. However, existing methods inevitably lose geometrical structure information during the feature channel generation process, resulting in edge detail mismatches. In this paper, the Motif Channel Attention Stereo Matching Network (MoCha-Stereo) is designed to address this problem. We provide the Motif Channel Correlation Volume (MCCV) to determine more accurate edge matching costs. MCCV is achieved by projecting motif channels, which capture common geometric structures in feature channels, onto feature maps and cost volumes. In addition, since edge variations in potential feature channels of the reconstruction error map also affect detail matching, we propose the Reconstruction Error Motif Penalty (REMP) module to further refine the full-resolution disparity estimation. REMP integrates the frequency information of typical channel features from the reconstruction error. MoCha-Stereo ranks 1st on the KITTI-2015 and KITTI-2012 Reflective leaderboards. Our structure also shows excellent performance in Multi-View Stereo. Code is available at https://github.com/ZYangChen/MoCha-Stereo.
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Diffusion Time-step Curriculum for One Image to 3D Generation CVPR 2024 + + +
+ Score distillation sampling (SDS) has been widely adopted to overcome the absence of unseen views in reconstructing 3D objects from a single image. It leverages pre-trained 2D diffusion models as teachers to guide the reconstruction of student 3D models. Despite their remarkable success, SDS-based methods often encounter geometric artifacts and texture saturation. We find that the crux is the overlooked indiscriminate treatment of diffusion time-steps during optimization: it unreasonably treats the student-teacher knowledge distillation as equal at all time-steps and thus entangles coarse-grained and fine-grained modeling. Therefore, we propose the Diffusion Time-step Curriculum one-image-to-3D pipeline (DTC123), which involves both the teacher and student models collaborating with the time-step curriculum in a coarse-to-fine manner. Extensive experiments on the NeRF4, RealFusion15, GSO and Level50 benchmarks demonstrate that DTC123 can produce multi-view consistent, high-quality, and diverse 3D assets. Code and more generation demos will be released at https://github.com/yxymessi/DTC123.
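+ A minimal sketch of a coarse-to-fine diffusion time-step curriculum: early optimization steps draw large (noisy, coarse) time-steps and later steps draw small (fine) ones. The linear annealing schedule and the bounds below are illustrative assumptions, not the DTC123 schedule.
+import random
+
+def curriculum_timestep(step, total_steps, t_min=20, t_max=980):
+    """Coarse-to-fine time-step sampling: the upper bound shrinks linearly over
+    training, while a window (rather than a single value) is kept for sampling."""
+    progress = step / max(total_steps - 1, 1)
+    upper = int(t_max - progress * (t_max - t_min))   # shrinks over training
+    lower = max(t_min, int(upper * 0.5))              # keep a window, not a point
+    return random.randint(lower, upper)
+
+for step in (0, 2500, 4999):
+    print(step, "->", curriculum_timestep(step, total_steps=5000))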
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Exploring Masked Autoencoders for Sensor-Agnostic Image Retrieval in + Remote Sensing + + +
+ Self-supervised learning through masked autoencoders (MAEs) has recently +attracted great attention for remote sensing (RS) image representation +learning, and thus embodies a significant potential for content-based image +retrieval (CBIR) from ever-growing RS image archives. However, the existing +studies on MAEs in RS assume that the considered RS images are acquired by a +single image sensor, and thus are only suitable for uni-modal CBIR problems. +The effectiveness of MAEs for cross-sensor CBIR, which aims to search +semantically similar images across different image modalities, has not been +explored yet. In this paper, we take the first step to explore the +effectiveness of MAEs for sensor-agnostic CBIR in RS. To this end, we present a +systematic overview on the possible adaptations of the vanilla MAE to exploit +masked image modeling on multi-sensor RS image archives (denoted as +cross-sensor masked autoencoders [CSMAEs]). Based on different adjustments +applied to the vanilla MAE, we introduce different CSMAE models. We also +provide an extensive experimental analysis of these CSMAE models. We finally +derive a guideline to exploit masked image modeling for uni-modal and +cross-modal CBIR problems in RS. The code of this work is publicly available at +https://github.com/jakhac/CSMAE. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Our code is available at https://github.com/jakhac/CSMAE +
+
+
+
+
+ + ♻ ☆ An Autonomous Vision-Based Algorithm for Interplanetary Navigation + + +
+ The surge of deep-space probes makes it unsustainable to navigate them with +standard radiometric tracking. Self-driving interplanetary satellites represent +a solution to this problem. In this work, a full vision-based navigation +algorithm is built by combining an orbit determination method with an image +processing pipeline suitable for interplanetary transfers of autonomous +platforms. To increase the computational efficiency of the algorithm, a +non-dimensional extended Kalman filter is selected as state estimator, fed by +the positions of the planets extracted from deep-space images. An enhancement +of the estimation accuracy is performed by applying an optimal strategy to +select the best pair of planets to track. Moreover, a novel analytical +measurement model for deep-space navigation is developed providing a +first-order approximation of the light-aberration and light-time effects. +Algorithm performance is tested on a high-fidelity, Earth--Mars interplanetary +transfer, showing the algorithm applicability for deep-space navigation. + +
+
+
+
+
+ + ♻ ☆ Attention Calibration for Disentangled Text-to-Image Personalization CVPR 2024 + + +
+ Recent thrilling progress in large-scale text-to-image (T2I) models has +unlocked unprecedented synthesis quality of AI-generated content (AIGC) +including image generation, 3D and video composition. Further, personalized +techniques enable appealing customized production of a novel concept given only +several images as reference. However, an intriguing problem persists: Is it +possible to capture multiple, novel concepts from one single reference image? +In this paper, we identify that existing approaches fail to preserve visual +consistency with the reference image and eliminate cross-influence from +concepts. To alleviate this, we propose an attention calibration mechanism to +improve the concept-level understanding of the T2I model. Specifically, we +first introduce new learnable modifiers bound with classes to capture +attributes of multiple concepts. Then, the classes are separated and +strengthened following the activation of the cross-attention operation, +ensuring comprehensive and self-contained concepts. Additionally, we suppress +the attention activation of different classes to mitigate mutual influence +among concepts. Together, our proposed method, dubbed DisenDiff, can learn +disentangled multiple concepts from one single image and produce novel +customized images with learned concepts. We demonstrate that our method +outperforms the current state of the art in both qualitative and quantitative +evaluations. More importantly, our proposed techniques are compatible with LoRA +and inpainting pipelines, enabling more interactive experiences. + +
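One way to picture suppressing attention between classes is a winner-take-most masking of per-concept cross-attention maps, as sketched below. This is an illustrative approximation only, not DisenDiff's actual calibration objective, and the `strength` parameter is an invented knob.

```python
import numpy as np

def suppress_cross_concept(attn, strength=0.8):
    """attn: (K, H, W) cross-attention maps, one per concept token.
    Down-weight each map wherever another concept dominates, then renormalize,
    so the concepts stay spatially disentangled."""
    winner = attn.argmax(axis=0)                 # dominant concept per location
    out = attn.copy()
    for k in range(attn.shape[0]):
        out[k] = np.where(winner == k, out[k], out[k] * (1.0 - strength))
    return out / (out.sum(axis=(1, 2), keepdims=True) + 1e-8)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    maps = rng.random((2, 16, 16))               # two concept tokens, 16x16 latent grid
    maps /= maps.sum(axis=(1, 2), keepdims=True)
    calibrated = suppress_cross_concept(maps)
    print("overlap before:", float((maps[0] * maps[1]).sum()))
    print("overlap after: ", float((calibrated[0] * calibrated[1]).sum()))
```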
+
+ comment: CVPR 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ A Deep Learning Method for Simultaneous Denoising and Missing Wedge + Reconstruction in Cryogenic Electron Tomography + + +
+ Cryogenic electron tomography is a technique for imaging biological samples +in 3D. A microscope collects a series of 2D projections of the sample, and the +goal is to reconstruct the 3D density of the sample, called the tomogram. +Reconstruction is difficult as the 2D projections are noisy and cannot be +recorded from all directions, resulting in a missing wedge of information. +Tomograms conventionally reconstructed with filtered back-projection suffer +from noise and strong artifacts due to the missing wedge. Here, we propose a +deep-learning approach for simultaneous denoising and missing wedge +reconstruction called DeepDeWedge. The algorithm requires no ground truth data +and is based on fitting a neural network to the 2D projections using a +self-supervised loss. DeepDeWedge performs better than CryoCARE and IsoNet, +which are state-of-the-art methods for denoising and missing wedge +reconstruction, and performs similarly to, and in some cases better than, the +combination of the two methods. At the same time, DeepDeWedge is simpler than +this two-step approach, as it performs denoising and missing wedge +reconstruction simultaneously rather than sequentially. + 
+
+
+
+
+ + ♻ ☆ T-DEED: Temporal-Discriminability Enhancer Encoder-Decoder for Precise + Event Spotting in Sports Videos + + +
+ In this paper, we introduce T-DEED, a Temporal-Discriminability Enhancer +Encoder-Decoder for Precise Event Spotting in sports videos. T-DEED addresses +multiple challenges in the task, including the need for discriminability among +frame representations, high output temporal resolution to maintain prediction +precision, and the necessity to capture information at different temporal +scales to handle events with varying dynamics. It tackles these challenges +through its specifically designed architecture, featuring an encoder-decoder +for leveraging multiple temporal scales and achieving high output temporal +resolution, along with temporal modules designed to increase token +discriminability. Leveraging these characteristics, T-DEED achieves SOTA +performance on the FigureSkating and FineDiving datasets. Code is available at +https://github.com/arturxe2/T-DEED. + +
+
+
+
+
+ + ♻ ☆ Flattening the Parent Bias: Hierarchical Semantic Segmentation in the + Poincaré Ball + + +
+ Hierarchy is a natural representation of semantic taxonomies, including the +ones routinely used in image segmentation. Indeed, recent work on semantic +segmentation reports improved accuracy from supervised training leveraging +hierarchical label structures. Encouraged by these results, we revisit the +fundamental assumptions behind that work. We postulate and then empirically +verify that the reasons for the observed improvement in segmentation accuracy +may be entirely unrelated to the use of the semantic hierarchy. To demonstrate +this, we design a range of cross-domain experiments with a representative +hierarchical approach. We find that on the new testing domains, a flat +(non-hierarchical) segmentation network, in which the parents are inferred from +the children, has superior segmentation accuracy to the hierarchical approach +across the board. Complementing these findings and inspired by the intrinsic +properties of hyperbolic spaces, we study a more principled approach to +hierarchical segmentation using the Poincaré ball model. The hyperbolic +representation largely outperforms the previous (Euclidean) hierarchical +approach as well and is on par with our flat Euclidean baseline in terms of +segmentation accuracy. However, it additionally exhibits surprisingly strong +calibration quality of the parent nodes in the semantic hierarchy, especially +on the more challenging domains. Our combined analysis suggests that the +established practice of hierarchical segmentation may be limited to in-domain +settings, whereas flat classifiers generalize substantially better, especially +if they are modeled in the hyperbolic space. + 
+
+
+
+
+ + ♻ ☆ Exploring Efficient Asymmetric Blind-Spots for Self-Supervised Denoising + in Real-World Scenarios CVPR 2024 + + +
+ Self-supervised denoising has attracted widespread attention due to its +ability to train without clean images. However, noise in real-world scenarios +is often spatially correlated, which causes many self-supervised algorithms +that assume pixel-wise independent noise to perform poorly. Recent works have +attempted to break noise correlation with downsampling or neighborhood masking. +However, denoising on downsampled subgraphs can lead to aliasing effects and +loss of details due to a lower sampling rate. Furthermore, the neighborhood +masking methods either come with high computational complexity or do not +consider local spatial preservation during inference. Through the analysis of +existing methods, we point out that the key to obtaining high-quality and +texture-rich results in real-world self-supervised denoising tasks is to train +at the original input resolution structure and use asymmetric operations during +training and inference. Based on this, we propose Asymmetric Tunable Blind-Spot +Network (AT-BSN), where the blind-spot size can be freely adjusted, thus better +balancing noise correlation suppression and image local spatial destruction +during training and inference. In addition, we regard the pre-trained AT-BSN as +a meta-teacher network capable of generating various teacher networks by +sampling different blind-spots. We propose a blind-spot based multi-teacher +distillation strategy to distill a lightweight network, significantly improving +performance. Experimental results on multiple datasets prove that our method +achieves state-of-the-art, and is superior to other self-supervised algorithms +in terms of computational overhead and visual effects. + +
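The tunable blind-spot idea can be illustrated with Noise2Void-style masking in which the radius of the region hidden from the network is adjustable (larger during training to suppress correlated noise, smaller or zero at inference). The replacement scheme below is a generic sketch under that assumption, not the AT-BSN architecture itself, and `n_targets` is an invented parameter.

```python
import numpy as np

def blindspot_inputs(noisy, n_targets=64, radius=1, rng=None):
    """Noise2Void-style masking with an adjustable blind-spot radius.
    Every pixel within `radius` of each chosen target is replaced by a random
    pixel from elsewhere, so the network cannot copy spatially correlated noise."""
    if rng is None:
        rng = np.random.default_rng()
    H, W = noisy.shape
    masked = noisy.copy()
    ys = rng.integers(radius, H - radius, n_targets)
    xs = rng.integers(radius, W - radius, n_targets)
    for y, x in zip(ys, xs):
        ry, rx = rng.integers(0, H), rng.integers(0, W)
        masked[y - radius:y + radius + 1, x - radius:x + radius + 1] = noisy[ry, rx]
    return masked, (ys, xs)   # train to predict noisy[ys, xs] from `masked`

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    img = rng.normal(size=(64, 64))
    train_in, targets = blindspot_inputs(img, radius=3, rng=rng)  # larger blind-spot for training
    infer_in, _ = blindspot_inputs(img, radius=1, rng=rng)        # smaller blind-spot at inference
    print(train_in.shape, infer_in.shape, len(targets[0]))
```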
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Deep Learning for Satellite Image Time Series Analysis: A Review + + +
+ Earth observation (EO) satellite missions have been providing detailed images +about the state of the Earth and its land cover for over 50 years. Long term +missions, such as NASA's Landsat, Terra, and Aqua satellites, and more +recently, the ESA's Sentinel missions, record images of the entire world every +few days. Although single images provide point-in-time data, repeated images of +the same area, or satellite image time series (SITS) provide information about +the changing state of vegetation and land use. These SITS are useful for +modeling dynamic processes and seasonal changes such as plant phenology. They +have potential benefits for many aspects of land and natural resource +management, including applications in agricultural, forest, water, and disaster +management, urban planning, and mining. However, the resulting satellite image +time series (SITS) are complex, incorporating information from the temporal, +spatial, and spectral dimensions. Therefore, deep learning methods are often +deployed as they can analyze these complex relationships. This review presents +a summary of the state-of-the-art methods of modelling environmental, +agricultural, and other Earth observation variables from SITS data using deep +learning methods. We aim to provide a resource for remote sensing experts +interested in using deep learning techniques to enhance Earth observation +models with temporal information. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Is Medieval Distant Viewing Possible? : Extending and Enriching + Annotation of Legacy Image Collections using Visual Analytics + + +
+ Distant viewing approaches have typically used image datasets close to the +contemporary image data used to train machine learning models. Working with +images from other historical periods requires expert-annotated data, and the +quality of labels is crucial for the quality of results. Especially when +working with cultural heritage collections that contain myriad uncertainties, +annotating, or re-annotating, legacy data is an arduous task. In this paper, we +describe working with two pre-annotated sets of medieval manuscript images that +exhibit conflicting and overlapping metadata. Since a manual reconciliation of +the two legacy ontologies would be very expensive, we aim (1) to create a more +uniform set of descriptive labels to serve as a "bridge" in the combined +dataset, and (2) to establish a high-quality hierarchical classification that +can be used as a valuable input for subsequent supervised machine learning. To +achieve these goals, we developed visualization and interaction mechanisms, +enabling medievalists to combine, regularize and extend the vocabulary used to +describe these, and other cognate, image datasets. The visual interfaces +provide experts with an overview of relationships in the data going beyond the +sum total of the metadata. Word and image embeddings, as well as co-occurrences +of labels across the datasets, enable batch re-annotation of images and +recommendation of label candidates, and support composing a hierarchical +classification of labels. + 
+
+ comment: Revision after DSH Peer Review. Paper is now accepted at DSH +
+
+
+
+
+ + ♻ ☆ How NeRFs and 3D Gaussian Splatting are Reshaping SLAM: a Survey + + +
+ Over the past two decades, research in the field of Simultaneous Localization +and Mapping (SLAM) has undergone a significant evolution, highlighting its +critical role in enabling autonomous exploration of unknown environments. This +evolution ranges from hand-crafted methods, through the era of deep learning, +to more recent developments focused on Neural Radiance Fields (NeRFs) and 3D +Gaussian Splatting (3DGS) representations. Recognizing the growing body of +research and the absence of a comprehensive survey on the topic, this paper +aims to provide the first comprehensive overview of SLAM progress through the +lens of the latest advancements in radiance fields. It sheds light on the +background, evolutionary path, inherent strengths and limitations, and serves +as a fundamental reference to highlight the dynamic progress and specific +challenges. + +
+
+
+
+
+ + ♻ ☆ 3D Human Reconstruction in the Wild with Synthetic Data Using Generative + Models + + +
+ In this work, we show that synthetic data created by generative models is +complementary to computer graphics (CG) rendered data for achieving remarkable +generalization performance on diverse real-world scenes for 3D human pose and +shape estimation (HPS). Specifically, we propose an effective approach based on +recent diffusion models, termed HumanWild, which can effortlessly generate +human images and corresponding 3D mesh annotations. We first collect a +large-scale human-centric dataset with comprehensive annotations, e.g., text +captions and surface normal images. Then, we train a customized ControlNet +model upon this dataset to generate diverse human images and initial +ground-truth labels. At the core of this step is that we can easily obtain +numerous surface normal images from a 3D human parametric model, e.g., SMPL-X, +by rendering the 3D mesh onto the image plane. As there exists inevitable noise +in the initial labels, we then apply an off-the-shelf foundation segmentation +model, i.e., SAM, to filter negative data samples. Our data generation pipeline +is flexible and customizable to facilitate different real-world tasks, e.g., +ego-centric scenes and perspective-distortion scenes. The generated dataset +comprises 0.79M images with corresponding 3D annotations, covering versatile +viewpoints, scenes, and human identities. We train various HPS regressors on +top of the generated data and evaluate them on a wide range of benchmarks +(3DPW, RICH, EgoBody, AGORA, SSP-3D) to verify the effectiveness of the +generated data. By exclusively employing generative models, we generate +large-scale in-the-wild human images and high-quality annotations, eliminating +the need for real-world data collection. + +
+
+ comment: project page: https://yongtaoge.github.io/projects/humanwild +
+
+
+
+
+ + ♻ ☆ NRDF: Neural Riemannian Distance Fields for Learning Articulated Pose + Priors CVPR 2024 + + +
+ Faithfully modeling the space of articulations is a crucial task that allows +recovery and generation of realistic poses, and remains a notorious challenge. +To this end, we introduce Neural Riemannian Distance Fields (NRDFs), +data-driven priors modeling the space of plausible articulations, represented +as the zero-level-set of a neural field in a high-dimensional +product-quaternion space. To train NRDFs only on positive examples, we +introduce a new sampling algorithm, ensuring that the geodesic distances follow +a desired distribution, yielding a principled distance field learning paradigm. +We then devise a projection algorithm to map any random pose onto the level-set +by an adaptive-step Riemannian optimizer, adhering to the product manifold of +joint rotations at all times. NRDFs can compute the Riemannian gradient via +backpropagation and by mathematical analogy, are related to Riemannian flow +matching, a recent generative model. We conduct a comprehensive evaluation of +NRDF against other pose priors in various downstream tasks, i.e., pose +generation, image-based pose estimation, and solving inverse kinematics, +highlighting NRDF's superior performance. Besides humans, NRDF's versatility +extends to hand and animal poses, as it can effectively represent any +articulation. + +
+
+ comment: Accepted by CVPR 2024. Project page: + https://virtualhumans.mpi-inf.mpg.de/nrdf +
+
+
+
+
+ + ♻ ☆ ChangeMamba: Remote Sensing Change Detection with Spatio-Temporal State + Space Model + + +
+ Convolutional neural networks (CNN) and Transformers have made impressive +progress in the field of remote sensing change detection (CD). However, both +architectures have inherent shortcomings. Recently, the Mamba architecture, +based on state space models, has shown remarkable performance in a series of +natural language processing tasks, which can effectively compensate for the +shortcomings of the above two architectures. In this paper, we explore for the +first time the potential of the Mamba architecture for remote sensing CD tasks. +We tailor the corresponding frameworks, called MambaBCD, MambaSCD, and +MambaBDA, for binary change detection (BCD), semantic change detection (SCD), +and building damage assessment (BDA), respectively. All three frameworks adopt +the cutting-edge Visual Mamba architecture as the encoder, which allows full +learning of global spatial contextual information from the input images. For +the change decoder, which is available in all three architectures, we propose +three spatio-temporal relationship modeling mechanisms, which can be naturally +combined with the Mamba architecture and fully utilize its attribute to achieve +spatio-temporal interaction of multi-temporal features, thereby obtaining +accurate change information. On five benchmark datasets, our proposed +frameworks outperform current CNN- and Transformer-based approaches without +using any complex training strategies or tricks, fully demonstrating the +potential of the Mamba architecture in CD tasks. Specifically, we obtained +83.11%, 88.39% and 94.19% F1 scores on the three BCD datasets SYSU, LEVIR-CD+, +and WHU-CD; on the SCD dataset SECOND, we obtained 24.11% SeK; and on the BDA +dataset xBD, we obtained 81.41% overall F1 score. Further experiments show that +our architecture is quite robust to degraded data. The source code will be +available in https://github.com/ChenHongruixuan/MambaCD + +
+
+
+
+
+ + ♻ ☆ RePoseDM: Recurrent Pose Alignment and Gradient Guidance for Pose Guided + Image Synthesis CVPR 2024 + + +
+ Pose-guided person image synthesis task requires re-rendering a reference +image, which should have a photorealistic appearance and flawless pose +transfer. Since person images are highly structured, existing approaches +require dense connections for complex deformations and occlusions because these +are generally handled through multi-level warping and masking in latent space. +The feature maps generated by convolutional neural networks do not have +equivariance, and hence multi-level warping is required to perform pose +alignment. Inspired by the ability of the diffusion model to generate +photorealistic images from the given conditional guidance, we propose recurrent +pose alignment to provide pose-aligned texture features as conditional +guidance. Due to the leakage of the source pose in conditional guidance, we +propose gradient guidance from pose interaction fields, which output the +distance from the valid pose manifold given a predicted pose as input. This +helps in learning plausible pose transfer trajectories that result in +photorealism and undistorted texture details. Extensive results on two +large-scale benchmarks and a user study demonstrate the ability of our proposed +approach to generate photorealistic pose transfer under challenging scenarios. +Additionally, we demonstrate the efficiency of gradient guidance in pose-guided +image generation on the HumanArt dataset with fine-tuned stable diffusion. + +
+
+ comment: Accepted at CVPR 2024 SyntaGen Workshop, 13 pages, 4 tables, 7 + figures +
+
+
+
+
+ + ♻ ☆ COTR: Compact Occupancy TRansformer for Vision-based 3D Occupancy + Prediction CVPR2024 + + +
+ The autonomous driving community has shown significant interest in 3D +occupancy prediction, driven by its exceptional geometric perception and +general object recognition capabilities. To achieve this, current works try to +construct a Tri-Perspective View (TPV) or Occupancy (OCC) representation +extending from the Bird-Eye-View perception. However, compressed views like TPV +representation lose 3D geometry information while raw and sparse OCC +representation requires heavy but redundant computational costs. To address the +above limitations, we propose Compact Occupancy TRansformer (COTR), with a +geometry-aware occupancy encoder and a semantic-aware group decoder to +reconstruct a compact 3D OCC representation. The occupancy encoder first +generates a compact geometrical OCC feature through efficient explicit-implicit +view transformation. Then, the occupancy decoder further enhances the semantic +discriminability of the compact OCC representation by a coarse-to-fine semantic +grouping strategy. Empirical experiments show that there are evident +performance gains across multiple baselines, e.g., COTR outperforms baselines +with a relative improvement of 8%-15%, demonstrating the superiority of our +method. + +
+
+ comment: CVPR2024. Code is available at https://github.com/NotACracker/COTR +
+
+
+
+
+ + ♻ ☆ IIDM: Inter and Intra-domain Mixing for Semi-supervised Domain + Adaptation in Semantic Segmentation + + +
+ Despite recent advances in semantic segmentation, an inevitable challenge is +the performance degradation caused by the domain shift in real applications. +The current dominant approach to solving this problem is unsupervised domain +adaptation (UDA). However, the absence of labeled target data in UDA is overly +restrictive and limits performance. To overcome this limitation, a more +practical scenario called semi-supervised domain adaptation (SSDA) has been +proposed. Existing SSDA methods are derived from the UDA paradigm and primarily +focus on leveraging the unlabeled target data and source data. In this paper, +we highlight the significance of exploiting the intra-domain information +between the labeled target data and unlabeled target data. Instead of solely +using the scarce labeled target data for supervision, we propose a novel SSDA +framework that incorporates both Inter and Intra Domain Mixing (IIDM), where +inter-domain mixing mitigates the source-target domain gap and intra-domain +mixing enriches the available target domain information, allowing the network +to capture more domain-invariant features. We also explore different domain +mixing strategies to better exploit the target domain information. +Comprehensive experiments conducted on the GTA5 to Cityscapes and SYNTHIA to +Cityscapes benchmarks demonstrate the effectiveness of IIDM, surpassing +previous methods by a large margin. + 
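A common way to realize such mixing is CutMix-style region pasting; the sketch below builds one inter-domain pair (source image with pseudo-labeled target content) and one intra-domain pair (labeled target with pseudo-labeled target content). The specific mixing operator, the fixed region ratio, and the pseudo-label source are assumptions for illustration; the paper compares several mixing strategies.

```python
import numpy as np

def cut_mix(img_a, lbl_a, img_b, lbl_b, ratio=0.5, rng=None):
    """Paste a rectangular region of (img_b, lbl_b) into a copy of (img_a, lbl_a)."""
    if rng is None:
        rng = np.random.default_rng()
    H, W = lbl_a.shape
    ch, cw = int(H * ratio), int(W * ratio)
    y, x = rng.integers(0, H - ch + 1), rng.integers(0, W - cw + 1)
    img, lbl = img_a.copy(), lbl_a.copy()
    img[y:y + ch, x:x + cw] = img_b[y:y + ch, x:x + cw]
    lbl[y:y + ch, x:x + cw] = lbl_b[y:y + ch, x:x + cw]
    return img, lbl

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    src_img, src_lbl = rng.random((128, 128, 3)), rng.integers(0, 19, (128, 128))
    tgt_lab_img, tgt_lab_lbl = rng.random((128, 128, 3)), rng.integers(0, 19, (128, 128))
    tgt_unl_img = rng.random((128, 128, 3))
    tgt_pseudo = rng.integers(0, 19, (128, 128))   # pseudo-labels from a teacher model
    inter_img, inter_lbl = cut_mix(src_img, src_lbl, tgt_unl_img, tgt_pseudo, rng=rng)
    intra_img, intra_lbl = cut_mix(tgt_lab_img, tgt_lab_lbl, tgt_unl_img, tgt_pseudo, rng=rng)
    print(inter_img.shape, inter_lbl.shape, intra_img.shape, intra_lbl.shape)
```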
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ WWW: A Unified Framework for Explaining What, Where and Why of Neural + Networks by Interpretation of Neuron Concepts CVPR 2024 + + +
+ Recent advancements in neural networks have showcased their remarkable +capabilities across various domains. Despite these successes, the "black box" +problem still remains. Addressing this, we propose a novel framework, WWW, that +offers the 'what', 'where', and 'why' of the neural network decisions in +human-understandable terms. Specifically, WWW utilizes adaptive selection for +concept discovery, employing adaptive cosine similarity and thresholding +techniques to effectively explain 'what'. To address the 'where' and 'why', we +propose a novel combination of neuron activation maps (NAMs) with Shapley +values, generating localized concept maps and heatmaps for individual inputs. +Furthermore, WWW introduces a method for predicting uncertainty, leveraging +heatmap similarities to estimate 'how' reliable the prediction is. Experimental +evaluations of WWW demonstrate superior performance in both quantitative and +qualitative metrics, outperforming existing methods in interpretability. WWW +provides a unified solution for explaining 'what', 'where', and 'why', +introducing a method for localized explanations from global interpretations and +offering a plug-and-play solution adaptable to various architectures. + 
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Samba: Semantic Segmentation of Remotely Sensed Images with State Space + Model + + +
+ High-resolution remotely sensed images pose a challenge for commonly used +semantic segmentation methods such as Convolutional Neural Network (CNN) and +Vision Transformer (ViT). CNN-based methods struggle with handling such +high-resolution images due to their limited receptive field, while ViT faces +challenges in handling long sequences. Inspired by Mamba, which adopts a State +Space Model (SSM) to efficiently capture global semantic information, we +propose a semantic segmentation framework for high-resolution remotely sensed +images, named Samba. Samba utilizes an encoder-decoder architecture, with Samba +blocks serving as the encoder for efficient multi-level semantic information +extraction, and UperNet functioning as the decoder. We evaluate Samba on the +LoveDA, ISPRS Vaihingen, and ISPRS Potsdam datasets, comparing its performance +against top-performing CNN and ViT methods. The results reveal that Samba +achieved unparalleled performance on commonly used remote sensing datasets for +semantic segmentation. Our proposed Samba demonstrates for the first time the +effectiveness of SSM in semantic segmentation of remotely sensed images, +setting a new benchmark in performance for Mamba-based techniques in this +specific application. The source code and baseline implementations are +available at https://github.com/zhuqinfeng1999/Samba. + +
+
+
+
+
+ + ♻ ☆ Driver Attention Tracking and Analysis + + +
+ We propose a novel method to estimate a driver's points-of-gaze using a pair +of ordinary cameras mounted on the windshield and dashboard of a car. This is a +challenging problem due to the dynamics of traffic environments with 3D scenes +of unknown depths. This problem is further complicated by the volatile distance +between the driver and the camera system. To tackle these challenges, we +develop a novel convolutional network that simultaneously analyzes the image of +the scene and the image of the driver's face. This network has a camera +calibration module that can compute an embedding vector that represents the +spatial configuration between the driver and the camera system. This +calibration module improves the performance of the overall network, which can +be jointly trained end to end. + We also address the lack of annotated data for training and evaluation by +introducing a large-scale driving dataset with point-of-gaze annotations. This +is an in situ dataset of real driving sessions in an urban city, containing +synchronized images of the driving scene as well as the face and gaze of the +driver. Experiments on this dataset show that the proposed method outperforms +various baseline methods, achieving a mean prediction error of 29.69 pixels, +which is relatively small compared to the 1280×720 resolution of the scene +camera. + 
+
+
+
+
+ + ♻ ☆ SpikeNVS: Enhancing Novel View Synthesis from Blurry Images via Spike + Camera + + +
+ One of the most critical factors in achieving sharp Novel View Synthesis +(NVS) using neural field methods like Neural Radiance Fields (NeRF) and 3D +Gaussian Splatting (3DGS) is the quality of the training images. However, +conventional RGB cameras are susceptible to motion blur. In contrast, +neuromorphic cameras like event and spike cameras inherently capture more +comprehensive temporal information, which can provide a sharp representation of +the scene as additional training data. Recent methods have explored the +integration of event cameras to improve the quality of NVS. These event-RGB +approaches have some limitations, such as high training costs and the inability +to work effectively in the background. Instead, our study introduces a new +method that uses the spike camera to overcome these limitations. By considering +texture reconstruction from spike streams as ground truth, we design the +Texture from Spike (TfS) loss. Since the spike camera relies on temporal +integration instead of the temporal differentiation used by event cameras, our +proposed TfS loss maintains manageable training costs. It handles foreground +objects and backgrounds simultaneously. We also provide a real-world dataset +captured with our spike-RGB camera system to facilitate future research +endeavors. We conduct extensive experiments using synthetic and real-world +datasets to demonstrate that our design can enhance novel view synthesis across +NeRF and 3DGS. The code and dataset will be made available for public access. + 
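The premise that temporal integration of a spike stream recovers texture can be illustrated with a toy model in which each pixel fires with probability proportional to radiance, so the firing rate over a window approximates the image. The Bernoulli spike model below is a stand-in for the camera's integrate-and-fire process, not the paper's reconstruction; a TfS-style loss would compare such a reconstruction against rendered views.

```python
import numpy as np

def texture_from_spikes(spikes):
    """spikes: (T, H, W) binary spike stream. Because a spike camera fires once
    its per-pixel integrator crosses a threshold, the firing rate over a window
    is roughly proportional to radiance, so a temporal mean recovers texture."""
    return spikes.mean(axis=0)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    radiance = np.clip(rng.random((32, 32)), 0.05, 1.0)               # toy static scene
    spikes = (rng.random((200, 32, 32)) < radiance).astype(np.uint8)  # Bernoulli stand-in
    recon = texture_from_spikes(spikes)
    print("mean abs error:", round(float(np.abs(recon - radiance).mean()), 3))
```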
+
+
+
+
+ + ♻ ☆ Learning Object Permanence from Videos via Latent Imaginations + + +
+ While human infants exhibit knowledge about object permanence from two months +of age onwards, deep-learning approaches still largely fail to recognize +objects' continued existence. We introduce a slot-based autoregressive deep +learning system, the looped location and identity tracking model Loci-Looped, +which learns to adaptively fuse latent imaginations with pixel-space +observations into consistent latent object-specific what and where encodings +over time. The novel loop empowers Loci-Looped to learn the physical concepts +of object permanence, directional inertia, and object solidity through +observation alone. As a result, Loci-Looped tracks objects through occlusions, +anticipates their reappearance, and shows signs of surprise and internal +revisions when observing implausible object behavior. Notably, Loci-Looped +outperforms state-of-the-art baseline models in handling object occlusions and +temporary sensory interruptions while exhibiting more compositional, +interpretable internal activity patterns. Our work thus introduces the first +self-supervised interpretable learning model that learns about object +permanence directly from video data without supervision. + +
+
+
+
+
+ + ♻ ☆ VSCode: General Visual Salient and Camouflaged Object Detection with 2D + Prompt Learning CVPR2024 + + +
+ Salient object detection (SOD) and camouflaged object detection (COD) are +related yet distinct binary mapping tasks. These tasks involve multiple +modalities, sharing commonalities and unique cues. Existing research often +employs intricate task-specific specialist models, potentially leading to +redundancy and suboptimal results. We introduce VSCode, a generalist model with +novel 2D prompt learning, to jointly address four SOD tasks and three COD +tasks. We utilize VST as the foundation model and introduce 2D prompts within +the encoder-decoder architecture to learn domain and task-specific knowledge on +two separate dimensions. A prompt discrimination loss helps disentangle +peculiarities to benefit model optimization. VSCode outperforms +state-of-the-art methods across six tasks on 26 datasets and exhibits zero-shot +generalization to unseen tasks by combining 2D prompts, such as RGB-D COD. +Source code has been available at https://github.com/Sssssuperior/VSCode. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ Extended Reality for Mental Health Evaluation -A Scoping Review + + +
+ Mental health disorders are the leading cause of health-related problems +globally. It is projected that mental health disorders will be the leading +cause of morbidity among adults as the incidence rates of anxiety and +depression grow globally. Recently, extended reality (XR), a general term +covering virtual reality (VR), augmented reality (AR) and mixed reality (MR), +is paving a new way to deliver mental health care. In this paper, we conduct a +scoping review on the development and application of XR in the area of mental +disorders. We performed a scoping database search to identify the relevant +studies indexed in Google Scholar, PubMed, and the ACM Digital Library. A +search period between August 2016 and December 2023 was defined to select +articles related to the usage of VR, AR, and MR in a mental health context. We +identified a total of 85 studies from 27 countries across the globe. By +performing data analysis, we found that most of the studies focused on +developed countries such as the US (16.47%) and Germany (12.94%). None of the +studies focused on African countries. The majority of the articles reported +that XR techniques led to a significant reduction in symptoms of anxiety or +depression. The largest number of studies was published in 2021, i.e., 31.76% +(n = 31). This could indicate that mental disorder interventions received +greater attention when COVID-19 emerged. Most studies (n = 65) focused on a +population between 18 and 65 years old, while only a few studies focused on +teenagers (n = 2). Also, more studies were conducted experimentally (n = 67, +78.82%) than with analytical and modeling approaches (n = 8, 9.41%). This shows +that XR technology for mental health care is developing rapidly. Furthermore, +these studies showed that XR technology can effectively be used for evaluating +mental disorders in a similar or better way than conventional approaches. + 
+
+
+
+
+ + ♻ ☆ VST++: Efficient and Stronger Visual Saliency Transformer + + +
+ While previous CNN-based models have exhibited promising results for salient +object detection (SOD), their ability to explore global long-range dependencies +is restricted. Our previous work, the Visual Saliency Transformer (VST), +addressed this constraint from a transformer-based sequence-to-sequence +perspective, to unify RGB and RGB-D SOD. In VST, we developed a multi-task +transformer decoder that concurrently predicts saliency and boundary outcomes +in a pure transformer architecture. Moreover, we introduced a novel token +upsampling method called reverse T2T for predicting a high-resolution saliency +map effortlessly within transformer-based structures. Building upon the VST +model, we further propose an efficient and stronger VST version in this work, +i.e. VST++. To mitigate the computational costs of the VST model, we propose a +Select-Integrate Attention (SIA) module, partitioning foreground into +fine-grained segments and aggregating background information into a single +coarse-grained token. To incorporate 3D depth information with low cost, we +design a novel depth position encoding method tailored for depth maps. +Furthermore, we introduce a token-supervised prediction loss to provide +straightforward guidance for the task-related tokens. We evaluate our VST++ +model across various transformer-based backbones on RGB, RGB-D, and RGB-T SOD +benchmark datasets. Experimental results show that our model outperforms +existing methods while achieving a 25% reduction in computational costs without +significant performance compromise. The demonstrated strong ability for +generalization, enhanced performance, and heightened efficiency of our VST++ +model highlight its potential. + +
+
+
+
+
+ + ♻ ☆ Towards Reliable Medical Image Segmentation by utilizing Evidential + Calibrated Uncertainty + + +
+ Medical image segmentation is critical for disease diagnosis and treatment +assessment. However, concerns regarding the reliability of segmentation regions +persist among clinicians, mainly attributed to the absence of confidence +assessment, robustness, and calibration to accuracy. To address this, we +introduce DEviS, an easily implementable foundational model that seamlessly +integrates into various medical image segmentation networks. DEviS not only +enhances the calibration and robustness of baseline segmentation accuracy but +also provides high-efficiency uncertainty estimation for reliable predictions. +By leveraging subjective logic theory, we explicitly model probability and +uncertainty for the problem of medical image segmentation. Here, the Dirichlet +distribution parameterizes the distribution of probabilities for different +classes of the segmentation results. To generate calibrated predictions and +uncertainty, we develop a trainable calibrated uncertainty penalty. +Furthermore, DEviS incorporates an uncertainty-aware filtering module, which +utilizes the metric of uncertainty-calibrated error to filter reliable data +within the dataset. We conducted validation studies to assess both the accuracy +and robustness of DEviS segmentation, along with evaluating the efficiency and +reliability of uncertainty estimation. These evaluations were performed using +publicly available datasets including ISIC2018, LiTS2017, and BraTS2019. +Additionally, two potential clinical trials are being conducted on the Johns +Hopkins OCT, Duke-OCT-DME, and FIVES datasets to demonstrate its efficacy in +filtering high-quality or out-of-distribution data. Our code has been released +at https://github.com/Cocofeat/DEviS. + 
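The subjective-logic machinery referenced here has a compact standard form: per-pixel evidence defines a Dirichlet distribution whose mean gives class probabilities and whose total strength gives an uncertainty mass. The sketch below shows only that standard mapping; the trainable calibration penalty and filtering module of DEviS are not reproduced, and the toy evidence values are assumptions.

```python
import numpy as np

def evidential_output(evidence):
    """Standard subjective-logic mapping: non-negative evidence e_k per class
    -> Dirichlet parameters alpha_k = e_k + 1, expected probability alpha_k / S,
    and uncertainty mass u = K / S (high wherever total evidence is low)."""
    alpha = evidence + 1.0                     # (K, H, W) Dirichlet parameters
    S = alpha.sum(axis=0, keepdims=True)       # Dirichlet strength per pixel
    probs = alpha / S
    uncertainty = evidence.shape[0] / S[0]
    return probs, uncertainty

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    # Toy 3-class evidence map; in practice this would be e.g. softplus(logits).
    evidence = rng.gamma(shape=1.0, scale=2.0, size=(3, 8, 8))
    probs, u = evidential_output(evidence)
    print(float(probs.sum(axis=0)[0, 0]), float(u.min()), float(u.max()))
```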
+
+ comment: 34 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Analyzing the Internals of Neural Radiance Fields CVPR + + +
+ Modern Neural Radiance Fields (NeRFs) learn a mapping from position to +volumetric density leveraging proposal network samplers. In contrast to the +coarse-to-fine sampling approach with two NeRFs, this offers significant +potential for acceleration using lower network capacity. Given that NeRFs +utilize most of their network capacity to estimate radiance, they could store +valuable density information in their parameters or their deep features. To +investigate this proposition, we take one step back and analyze large, trained +ReLU-MLPs used in coarse-to-fine sampling. Building on our novel activation +visualization method, we find that trained NeRFs, Mip-NeRFs and proposal +network samplers map samples with high density to local minima along a ray in +activation feature space. We show how these large MLPs can be accelerated by +transforming intermediate activations to a weight estimate, without any +modifications to the training protocol or the network architecture. With our +approach, we can reduce the computational requirements of trained NeRFs by up +to 50% with only a slight hit in rendering quality. Extensive experimental +evaluation on a variety of datasets and architectures demonstrates the +effectiveness of our approach. Consequently, our methodology provides valuable +insight into the inner workings of NeRFs. + +
+
+ comment: Accepted to CVPRW'24! Project Page: + https://r4dl.github.io/nerfinternals/ +
+
+
+
+
+ + ♻ ☆ S^2MVTC: a Simple yet Efficient Scalable Multi-View Tensor Clustering CVPR2024 + + +
+ Anchor-based large-scale multi-view clustering has attracted considerable +attention for its effectiveness in handling massive datasets. However, current +methods mainly seek the consensus embedding feature for clustering by exploring +global correlations between anchor graphs or projection matrices. In this +paper, we propose a simple yet efficient scalable multi-view tensor clustering +(S^2MVTC) approach, where our focus is on learning correlations of embedding +features within and across views. Specifically, we first construct the +embedding feature tensor by stacking the embedding features of different views +into a tensor and rotating it. Additionally, we build a novel tensor +low-frequency approximation (TLFA) operator, which incorporates graph +similarity into embedding feature learning, efficiently achieving smooth +representation of embedding features within different views. Furthermore, +consensus constraints are applied to embedding features to ensure inter-view +semantic consistency. Experimental results on six large-scale multi-view +datasets demonstrate that S^2MVTC significantly outperforms state-of-the-art +algorithms in terms of clustering performance and CPU execution time, +especially when handling massive data. The code of S^2MVTC is publicly +available at https://github.com/longzhen520/S2MVTC. + 
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ Multi-Label Continual Learning for the Medical Domain: A Novel Benchmark + + +
+ Multi-label image classification in dynamic environments is a problem that +poses significant challenges. Previous studies have primarily focused on +scenarios such as Domain Incremental Learning and Class Incremental Learning, +which do not fully capture the complexity of real-world applications. In this +paper, we study the problem of classification of medical imaging in the +scenario termed New Instances and New Classes, which combines the challenges of +both new class arrivals and domain shifts in a single framework. Unlike +traditional scenarios, it reflects the realistic nature of CL in domains such +as medical imaging, where updates may introduce both new classes and changes in +domain characteristics. To address the unique challenges posed by this complex +scenario, we introduce a novel approach called Pseudo-Label Replay. This method +aims to mitigate forgetting while adapting to new classes and domain shifts by +combining the advantages of the Replay and Pseudo-Label methods and solving +their limitations in the proposed scenario. We evaluate our proposed approach +on a challenging benchmark consisting of two datasets, seven tasks, and +nineteen classes, modeling a realistic Continual Learning scenario. Our +experimental findings demonstrate the effectiveness of Pseudo-Label Replay in +addressing the challenges posed by the complex scenario proposed. Our method +surpasses existing approaches, exhibiting superior performance while showing +minimal forgetting. + +
+
+
+
+
+ + ♻ ☆ FloCoDe: Unbiased Dynamic Scene Graph Generation with Temporal + Consistency and Correlation Debiasing CVPR 2024 + + +
+ Dynamic scene graph generation (SGG) from videos requires not only a +comprehensive understanding of objects across scenes but also a method to +capture the temporal motions and interactions with different objects. Moreover, +the long-tailed distribution of visual relationships is a crucial bottleneck +for most dynamic SGG methods. This is because many of them focus on capturing +spatio-temporal context using complex architectures, leading to the generation +of biased scene graphs. To address these challenges, we propose FloCoDe: +Flow-aware Temporal Consistency and Correlation Debiasing with uncertainty +attenuation for unbiased dynamic scene graphs. FloCoDe employs feature warping +using flow to detect temporally consistent objects across frames. To address +the long-tail issue of visual relationships, we propose correlation debiasing +and a label correlation-based loss to learn unbiased relation representations +for long-tailed classes. Specifically, we propose to incorporate label +correlations using contrastive loss to capture commonly co-occurring relations, +which aids in learning robust representations for long-tailed classes. Further, +we adopt the uncertainty attenuation-based classifier framework to handle noisy +annotations in the SGG data. Extensive experimental evaluation shows a +performance gain as high as 4.1%, demonstrating the superiority of generating +more unbiased scene graphs. + 
+
+ comment: Accepted at CVPR 2024 SG2RL, 11 pages, 5 tables, 4 figures +
+
+
+
+
+ + ♻ ☆ Test-Time Zero-Shot Temporal Action Localization CVPR 2024 + + +
+ Zero-Shot Temporal Action Localization (ZS-TAL) seeks to identify and locate +actions in untrimmed videos unseen during training. Existing ZS-TAL methods +involve fine-tuning a model on a large amount of annotated training data. While +effective, training-based ZS-TAL approaches assume the availability of labeled +data for supervised learning, which can be impractical in some applications. +Furthermore, the training process naturally induces a domain bias into the +learned model, which may adversely affect the model's generalization ability to +arbitrary videos. These considerations prompt us to approach the ZS-TAL problem +from a radically novel perspective, relaxing the requirement for training data. +To this aim, we introduce a novel method that performs Test-Time adaptation for +Temporal Action Localization (T3AL). In a nutshell, T3AL adapts a pre-trained +Vision and Language Model (VLM). T3AL operates in three steps. First, a +video-level pseudo-label of the action category is computed by aggregating +information from the entire video. Then, action localization is performed +adopting a novel procedure inspired by self-supervised learning. Finally, +frame-level textual descriptions extracted with a state-of-the-art captioning +model are employed for refining the action region proposals. We validate the +effectiveness of T3AL by conducting experiments on the THUMOS14 and the +ActivityNet-v1.3 datasets. Our results demonstrate that T3AL significantly +outperforms zero-shot baselines based on state-of-the-art VLMs, confirming the +benefit of a test-time adaptation approach. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Finding Regions of Interest in Whole Slide Images Using Multiple + Instance Learning + + +
+ Whole Slide Images (WSI), obtained by high-resolution digital scanning of +microscope slides at multiple scales, are the cornerstone of modern Digital +Pathology. However, they represent a particular challenge to +AI-based/AI-mediated analysis because pathology labeling is typically done at +slide-level, instead of tile-level. It is not just that medical diagnostics are +recorded at the specimen level; the detection of oncogene mutations is also +obtained experimentally, and recorded by initiatives like The Cancer Genome +Atlas (TCGA), at the slide level. This poses a dual challenge: a) accurately +predicting the overall cancer phenotype and b) finding out what cellular +morphologies are associated with it at the tile level. To address these +challenges, a weakly supervised Multiple Instance Learning (MIL) approach was +explored for two prevalent cancer types, Invasive Breast Carcinoma (TCGA-BRCA) +and Lung Squamous Cell Carcinoma (TCGA-LUSC). This approach was explored for +tumor detection at low magnification levels and TP53 mutations at various +levels. Our results show that a novel additive implementation of MIL matched +the performance of the reference implementation (AUC 0.96), and was only +slightly outperformed by Attention MIL (AUC 0.97). More interestingly from the +perspective of the molecular pathologist, these different AI architectures +identify distinct sensitivities to morphological features (through the +detection of Regions of Interest, RoI) at different amplification levels. +Tellingly, TP53 mutation was most sensitive to features at the higher +amplification levels, where cellular morphology is resolved. + 
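For readers unfamiliar with attention-based MIL, the sketch below shows the classic attention pooling step (in the style of Ilse et al.) that turns tile embeddings into a slide-level representation while exposing per-tile attention weights that can be read as candidate regions of interest. The feature dimensions and parameters are illustrative, not those of the additive or Attention MIL variants evaluated in the paper.

```python
import numpy as np

def attention_mil_pool(features, V, w):
    """Attention-based MIL pooling over a bag of tile features.
    features: (N, D) tile embeddings; V: (D, L); w: (L,).
    Returns the bag embedding and per-tile attention weights."""
    scores = np.tanh(features @ V) @ w          # (N,) unnormalized attention
    attn = np.exp(scores - scores.max())
    attn /= attn.sum()
    return attn @ features, attn                # weighted bag embedding, weights

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    tiles = rng.standard_normal((500, 256))     # e.g. 500 tile embeddings from one slide
    V = 0.05 * rng.standard_normal((256, 64))
    w = 0.05 * rng.standard_normal(64)
    bag, attention = attention_mil_pool(tiles, V, w)
    roi_tiles = attention.argsort()[-5:][::-1]  # most-attended tiles = candidate RoIs
    print(bag.shape, roi_tiles)
```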
+
+
+
+
+ + ♻ ☆ Hierarchical Invariance for Robust and Interpretable Vision Tasks at + Larger Scales + + +
+ Developing robust and interpretable vision systems is a crucial step towards +trustworthy artificial intelligence. In this regard, a promising paradigm +considers embedding task-required invariant structures, e.g., geometric +invariance, in the fundamental image representation. However, such invariant +representations typically exhibit limited discriminability, limiting their +applications in larger-scale trustworthy vision tasks. For this open problem, +we conduct a systematic investigation of hierarchical invariance, exploring +this topic from theoretical, practical, and application perspectives. At the +theoretical level, we show how to construct over-complete invariants with a +Convolutional Neural Networks (CNN)-like hierarchical architecture yet in a +fully interpretable manner. The general blueprint, specific definitions, +invariant properties, and numerical implementations are provided. At the +practical level, we discuss how to customize this theoretical framework into a +given task. With the over-completeness, discriminative features w.r.t. the task +can be adaptively formed in a Neural Architecture Search (NAS)-like manner. We +demonstrate the above arguments with accuracy, invariance, and efficiency +results on texture, digit, and parasite classification experiments. +Furthermore, at the application level, our representations are explored in +real-world forensics tasks on adversarial perturbations and Artificial +Intelligence Generated Content (AIGC). Such applications reveal that the +proposed strategy not only realizes the theoretically promised invariance, but +also exhibits competitive discriminability even in the era of deep learning. +For robust and interpretable vision tasks at larger scales, hierarchical +invariant representation can be considered as an effective alternative to +traditional CNN and invariants. + +
+
+
+
+
+ + ♻ ☆ CoBra: Complementary Branch Fusing Class and Semantic Knowledge for + Robust Weakly Supervised Semantic Segmentation + + +
+ Leveraging semantically precise pseudo masks derived from image-level class +knowledge for segmentation, namely image-level Weakly Supervised Semantic +Segmentation (WSSS), still remains challenging. While Class Activation Maps +(CAMs) using CNNs have steadily been contributing to the success of WSSS, the +resulting activation maps often narrowly focus on class-specific parts (e.g., +only face of human). On the other hand, recent works based on vision +transformers (ViT) have shown promising results based on their self-attention +mechanism to capture the semantic parts but fail in capturing complete +class-specific details (e.g., entire body parts of human but also with a dog +nearby). In this work, we propose Complementary Branch (CoBra), a novel dual +branch framework consisting of two distinct architectures which provide +valuable complementary knowledge of class (from CNN) and semantic (from ViT) to +each branch. In particular, we learn Class-Aware Projection (CAP) for the CNN +branch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly +fuse their complementary knowledge and facilitate a new type of extra +patch-level supervision. Our model, through CoBra, fuses CNN and ViT's +complementary outputs to create robust pseudo masks that integrate both class +and semantic information effectively. Extensive experiments qualitatively and +quantitatively investigate how CNN and ViT complement each other on the PASCAL +VOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not +only the masks generated by our model, but also the segmentation results +derived from utilizing these masks as pseudo labels. + +
+
+
+
+
+ + ♻ ☆ MV-Adapter: Multimodal Video Transfer Learning for Video Text Retrieval + + +
+ State-of-the-art video-text retrieval (VTR) methods typically involve fully +fine-tuning a pre-trained model (e.g. CLIP) on specific datasets. However, this +can result in significant storage costs in practical applications as a separate +model per task must be stored. To address this issue, we present our pioneering +work that enables parameter-efficient VTR using a pre-trained model, with only +a small number of tunable parameters during training. Towards this goal, we +propose a new method dubbed Multimodal Video Adapter (MV-Adapter) for +efficiently transferring the knowledge in the pre-trained CLIP from image-text +to video-text. Specifically, MV-Adapter utilizes bottleneck structures in both +video and text branches, along with two novel components. The first is a +Temporal Adaptation Module that is incorporated in the video branch to +introduce global and local temporal contexts. We also train weights +calibrations to adjust to dynamic variations across frames. The second is Cross +Modality Tying that generates weights for video/text branches through sharing +cross modality factors, for better aligning between modalities. Thanks to above +innovations, MV-Adapter can achieve comparable or better performance than +standard full fine-tuning with negligible parameters overhead. Notably, +MV-Adapter consistently outperforms various competing methods in V2T/T2V tasks +with large margins on five widely used VTR benchmarks (MSR-VTT, MSVD, LSMDC, +DiDemo, and ActivityNet). + +
+
+
+
+
+ + ♻ ☆ Diff-Plugin: Revitalizing Details for Diffusion-based Low-level Tasks CVPR2024 + + +
+ Diffusion models trained on large-scale datasets have achieved remarkable +progress in image synthesis. However, due to the randomness in the diffusion +process, they often struggle with handling diverse low-level tasks that require +details preservation. To overcome this limitation, we present a new Diff-Plugin +framework to enable a single pre-trained diffusion model to generate +high-fidelity results across a variety of low-level tasks. Specifically, we +first propose a lightweight Task-Plugin module with a dual branch design to +provide task-specific priors, guiding the diffusion process in preserving image +content. We then propose a Plugin-Selector that can automatically select +different Task-Plugins based on the text instruction, allowing users to edit +images by indicating multiple low-level tasks with natural language. We conduct +extensive experiments on 8 low-level vision tasks. The results demonstrate the +superiority of Diff-Plugin over existing methods, particularly in real-world +scenarios. Our ablations further validate that Diff-Plugin is stable, +schedulable, and supports robust training across different dataset sizes. + +
+
+ comment: Accepted to CVPR2024. Replaced some celebrity images to avoid + copyright disputes +
+
+
+
+
+ + ♻ ☆ HPNet: Dynamic Trajectory Forecasting with Historical Prediction + Attention CVPR2024 + + +
+ Predicting the trajectories of road agents is essential for autonomous +driving systems. The recent mainstream methods follow a static paradigm, which +predicts the future trajectory by using a fixed duration of historical frames. +These methods make the predictions independently even at adjacent time steps, +which leads to potential instability and temporal inconsistency. As successive +time steps have largely overlapping historical frames, their forecasts should +be intrinsically correlated: overlapping predicted trajectories should be +consistent, or differ yet share the same motion goal, depending on the road +situation. Motivated by this, in this work, we introduce HPNet, a novel dynamic +trajectory forecasting method. Aiming for stable and accurate trajectory +forecasting, our method leverages not only historical frames including maps and +agent states, but also historical predictions. Specifically, we newly design a +Historical Prediction Attention module to automatically encode the dynamic +relationship between successive predictions. It also extends the attention +range beyond the currently visible window, benefitting from the use of +historical predictions. The proposed Historical Prediction Attention together +with the Agent Attention and Mode Attention is further formulated as the Triple +Factorized Attention module, serving as the core design of HPNet. Experiments +on the Argoverse and INTERACTION datasets show that HPNet achieves +state-of-the-art performance, and generates accurate and stable future +trajectories. Our code is available at +https://github.com/XiaolongTang23/HPNet. + 
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ MIPS at SemEval-2024 Task 3: Multimodal Emotion-Cause Pair Extraction in + Conversations with Multimodal Language Models SemEval '24 + + +
+ This paper presents our winning submission to Subtask 2 of SemEval 2024 Task +3 on multimodal emotion cause analysis in conversations. We propose a novel +Multimodal Emotion Recognition and Multimodal Emotion Cause Extraction +(MER-MCE) framework that integrates text, audio, and visual modalities using +specialized emotion encoders. Our approach sets itself apart from +top-performing teams by leveraging modality-specific features for enhanced +emotion understanding and causality inference. Experimental evaluation +demonstrates the advantages of our multimodal approach, with our submission +achieving a competitive weighted F1 score of 0.3435, ranking third with a +margin of only 0.0339 behind the 1st team and 0.0025 behind the 2nd team. +Project: https://github.com/MIPS-COLT/MER-MCE.git + +
+
+ comment: Ranked 3rd in SemEval '24 Task 3 with F1 of 0.3435, close to 1st & + 2nd by 0.0339 & 0.0025 +
+
+
+
+
+ + ♻ ☆ DriveDreamer-2: LLM-Enhanced World Models for Diverse Driving Video + Generation + + +
+ World models have demonstrated superiority in autonomous driving, +particularly in the generation of multi-view driving videos. However, +significant challenges still exist in generating customized driving videos. In +this paper, we propose DriveDreamer-2, which builds upon the framework of +DriveDreamer and incorporates a Large Language Model (LLM) to generate +user-defined driving videos. Specifically, an LLM interface is initially +incorporated to convert a user's query into agent trajectories. Subsequently, +an HDMap, adhering to traffic regulations, is generated based on the +trajectories. Ultimately, we propose the Unified Multi-View Model to enhance +temporal and spatial coherence in the generated driving videos. DriveDreamer-2 +is the first world model to generate customized driving videos; it can generate +uncommon driving videos (e.g., vehicles abruptly cutting in) in a user-friendly +manner. Moreover, experimental results demonstrate that the generated videos +enhance the training of driving perception methods (e.g., 3D detection and +tracking). Furthermore, the video generation quality of DriveDreamer-2 +surpasses that of other state-of-the-art methods, with FID and FVD scores of +11.2 and 55.7, representing relative improvements of 30% and 50%. + 
+
+ comment: Project Page: https://drivedreamer2.github.io +
+
+
+
+
+ + ♻ ☆ Deep Multi-Threshold Spiking-UNet for Image Processing + + +
+ U-Net, known for its simple yet efficient architecture, is widely utilized +for image processing tasks and is particularly suitable for deployment on +neuromorphic chips. This paper introduces the novel concept of Spiking-UNet for +image processing, which combines the power of Spiking Neural Networks (SNNs) +with the U-Net architecture. To achieve an efficient Spiking-UNet, we face two +primary challenges: ensuring high-fidelity information propagation through the +network via spikes and formulating an effective training strategy. To address +the issue of information loss, we introduce multi-threshold spiking neurons, +which improve the efficiency of information transmission within the +Spiking-UNet. For the training strategy, we adopt a conversion and fine-tuning +pipeline that leverages pre-trained U-Net models. During the conversion process, +significant variability in data distribution across different parts of the network is observed +when utilizing skip connections. Therefore, we propose a connection-wise +normalization method to prevent inaccurate firing rates. Furthermore, we adopt +a flow-based training method to fine-tune the converted models, reducing time +steps while preserving performance. Experimental results show that, on image +segmentation and denoising, our Spiking-UNet achieves comparable performance to +its non-spiking counterpart, surpassing existing SNN methods. Compared with the +converted Spiking-UNet without fine-tuning, our Spiking-UNet reduces inference +time by approximately 90%. This research broadens the application scope of +SNNs in image processing and is expected to inspire further exploration in the +field of neuromorphic engineering. The code for our Spiking-UNet implementation +is available at https://github.com/SNNresearch/Spiking-UNet. + +
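+ To make the multi-threshold idea concrete, the sketch below implements a toy integrate-and-fire neuron that emits graded spike amplitudes according to how many of its thresholds the membrane potential crosses. The threshold values, the soft-reset rule, and the tensor layout are assumptions for illustration and are not taken from the Spiking-UNet code.
+
+ import torch
+ import torch.nn as nn
+
+ class MultiThresholdIF(nn.Module):
+     def __init__(self, thresholds=(1.0, 2.0, 3.0)):
+         super().__init__()
+         # Ascending firing thresholds; the neuron emits the number of levels it crosses.
+         self.register_buffer("thresholds", torch.tensor(thresholds))
+
+     def forward(self, x_seq: torch.Tensor) -> torch.Tensor:
+         # x_seq: (T, B, C, H, W) input currents over T time steps.
+         mem = torch.zeros_like(x_seq[0])
+         out = []
+         for x in x_seq:
+             mem = mem + x
+             # Count of thresholds crossed -> graded spike amplitude (0, 1, 2, ...).
+             spikes = (mem.unsqueeze(-1) >= self.thresholds).sum(dim=-1).float()
+             mem = mem - spikes  # soft reset by the emitted amplitude
+             out.append(spikes)
+         return torch.stack(out)
+
+ if __name__ == "__main__":
+     neuron = MultiThresholdIF()
+     x = torch.rand(4, 2, 3, 8, 8) * 1.5   # 4 time steps of random input current
+     print(neuron(x).shape)                # torch.Size([4, 2, 3, 8, 8])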
+
+ comment: Accepted in NeuroComputing +
+
+
+
+
+ + ♻ ☆ GEM3D: GEnerative Medial Abstractions for 3D Shape Synthesis SIGGRAPH 2024 + + +
+ We introduce GEM3D -- a new deep, topology-aware generative model of 3D +shapes. The key ingredient of our method is a neural skeleton-based +representation encoding information on both shape topology and geometry. +Through a denoising diffusion probabilistic model, our method first generates +skeleton-based representations following the Medial Axis Transform (MAT), then +generates surfaces through a skeleton-driven neural implicit formulation. The +neural implicit takes into account the topological and geometric information +stored in the generated skeleton representations to yield surfaces that are +more topologically and geometrically accurate compared to previous neural field +formulations. We discuss applications of our method in shape synthesis and +point cloud reconstruction tasks, and evaluate our method both qualitatively +and quantitatively. We demonstrate significantly more faithful surface +reconstruction and diverse shape generation results compared to the +state-of-the-art, also involving challenging scenarios of reconstructing and +synthesizing structurally complex, high-genus shape surfaces from Thingi10K and +ShapeNet. + +
+
+ comment: Webpage: https://lodurality.github.io/GEM3D/ -- Cond. accept. to + SIGGRAPH 2024 (conf. track) -- Changes (based on reviews): changed style to + sigconf; rearranged figures for readability; added missing citations; fixed + misaligned centers in Fig. 3; added failure cases (Fig. 10); rewrote + discussion; added categories averages to Tab. 8; added Tab. 10 with model + capacities +
+
+
+
+
+ + ♻ ☆ Fourier Prompt Tuning for Modality-Incomplete Scene Segmentation + + +
+ Integrating information from multiple modalities enhances the robustness of +scene perception systems in autonomous vehicles, providing a more comprehensive +and reliable sensory framework. However, modality incompleteness in +multi-modal segmentation remains under-explored. In this work, we establish a +task called Modality-Incomplete Scene Segmentation (MISS), which encompasses +both system-level modality absence and sensor-level modality errors. To avoid +the predominant modality reliance in multi-modal fusion, we introduce a +Missing-aware Modal Switch (MMS) strategy to proactively manage missing +modalities during training. Utilizing bit-level batch-wise sampling enhances +the model's performance in both complete and incomplete testing scenarios. +Furthermore, we introduce the Fourier Prompt Tuning (FPT) method to incorporate +representative spectral information into a limited number of learnable prompts +that maintain robustness against all MISS scenarios, achieving an effect akin to fine-tuning +but with far fewer tunable parameters (1.1%). Extensive experiments prove +the efficacy of our proposed approach, showcasing an improvement of 5.84% mIoU +over the prior state-of-the-art parameter-efficient methods under missing +modalities. The source code is publicly available at +https://github.com/RuipingL/MISS. + +
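+ As a rough illustration of how spectral information could be folded into learnable prompts, the hypothetical sketch below takes the 2D FFT of a feature map, keeps a low-frequency block of amplitudes, and uses the result to modulate a small set of prompt tokens. The frequency crop, the projection, and the way the prompts would be consumed by the segmentation backbone are all assumptions, not the FPT design itself.
+
+ import torch
+ import torch.nn as nn
+
+ class FourierPrompt(nn.Module):
+     def __init__(self, in_ch: int = 64, embed_dim: int = 256, num_prompts: int = 8, k: int = 4):
+         super().__init__()
+         self.k = k  # keep a (k x k) low-frequency block per channel
+         self.proj = nn.Linear(in_ch * k * k, embed_dim)
+         self.prompts = nn.Parameter(torch.zeros(num_prompts, embed_dim))
+
+     def forward(self, feat: torch.Tensor) -> torch.Tensor:
+         # feat: (B, C, H, W) features from any (possibly incomplete) modality.
+         spec = torch.fft.fft2(feat, norm="ortho")
+         amp = spec.abs()[..., : self.k, : self.k]                 # low-frequency amplitudes
+         spectral = self.proj(amp.flatten(1))                      # (B, embed_dim) summary
+         # Modulate each learnable prompt with the spectral summary.
+         return self.prompts.unsqueeze(0) + spectral.unsqueeze(1)  # (B, num_prompts, embed_dim)
+
+ if __name__ == "__main__":
+     fp = FourierPrompt()
+     tokens = fp(torch.randn(2, 64, 32, 32))
+     print(tokens.shape)  # torch.Size([2, 8, 256]); prepend these to the segmenter's tokens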
+
+ comment: Accepted to IEEE IV 2024. The source code is publicly available at + https://github.com/RuipingL/MISS +
+
+
+
+
+ + ♻ ☆ Tensor Decomposition Based Attention Module for Spiking Neural Networks + + +
+ The attention mechanism has been proven to be an effective way to improve +spiking neural networks (SNNs). However, given that the current SNN +input data flow is split into tensors for processing on GPUs, none of the previous +works consider the properties of tensors to implement an attention module. This +inspires us to rethink current SNNs from the perspective of tensor-relevant +theories. Using tensor decomposition, we design the \textit{projected full +attention} (PFA) module, which demonstrates excellent results with linearly +growing parameters. Specifically, PFA is composed of the \textit{linear +projection of spike tensor} (LPST) module and the \textit{attention map composing} +(AMC) module. In LPST, we start by compressing the original spike tensor into +three projected tensors using a single property-preserving strategy with +learnable parameters for each dimension. Then, in AMC, we exploit the inverse +procedure of the tensor decomposition process to combine the three tensors into +the attention map using a so-called connecting factor. To validate the +effectiveness of the proposed PFA module, we integrate it into the widely used +VGG and ResNet architectures for classification tasks. Our method achieves +state-of-the-art performance on both static and dynamic benchmark datasets, +surpassing the existing SNN models with Transformer-based and CNN-based +backbones. + +
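+ The decompose-then-recombine recipe above is reminiscent of a CP (rank-R) factorization: one factor per tensor mode, recombined through a rank-wise weighting. The sketch below is only a generic illustration of that pattern on a (C, H, W) feature tensor; the rank, the mean-pooling projections, and the sigmoid gating are assumptions and do not reproduce the paper's LPST/AMC modules.
+
+ import torch
+ import torch.nn as nn
+
+ class CPAttention(nn.Module):
+     def __init__(self, C: int, H: int, W: int, rank: int = 4):
+         super().__init__()
+         self.pc = nn.Linear(C, C * rank)   # mode-1 (channel) projection
+         self.ph = nn.Linear(H, H * rank)   # mode-2 (height) projection
+         self.pw = nn.Linear(W, W * rank)   # mode-3 (width) projection
+         self.lam = nn.Parameter(torch.ones(rank))  # rank-wise "connecting factor"
+         self.rank = rank
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         # x: (B, C, H, W)
+         B, C, H, W = x.shape
+         fc = self.pc(x.mean(dim=(2, 3))).view(B, C, self.rank)
+         fh = self.ph(x.mean(dim=(1, 3))).view(B, H, self.rank)
+         fw = self.pw(x.mean(dim=(1, 2))).view(B, W, self.rank)
+         # Inverse of the decomposition: recombine rank-1 factors into a full attention map.
+         attn = torch.einsum("r,bcr,bhr,bwr->bchw", self.lam, fc, fh, fw)
+         return x * torch.sigmoid(attn)
+
+ if __name__ == "__main__":
+     m = CPAttention(C=16, H=12, W=12)
+     print(m(torch.randn(2, 16, 12, 12)).shape)  # torch.Size([2, 16, 12, 12])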
+
+ comment: Accepted by Knowledge-Based Systems +
+
+
+
+
+ + ♻ ☆ TC4D: Trajectory-Conditioned Text-to-4D Generation + + +
+ Recent techniques for text-to-4D generation synthesize dynamic 3D scenes +using supervision from pre-trained text-to-video models. However, existing +representations for motion, such as deformation models or time-dependent neural +representations, are limited in the amount of motion they can generate: they +cannot synthesize motion extending far beyond the bounding box used for volume +rendering. The lack of a more flexible motion model contributes to the gap in +realism between 4D generation methods and recent, near-photorealistic video +generation models. Here, we propose TC4D: trajectory-conditioned text-to-4D +generation, which factors motion into global and local components. We represent +the global motion of a scene's bounding box using rigid transformation along a +trajectory parameterized by a spline. We learn local deformations that conform +to the global trajectory using supervision from a text-to-video model. Our +approach enables the synthesis of scenes animated along arbitrary trajectories, +compositional scene generation, and significant improvements to the realism and +amount of generated motion, which we evaluate qualitatively and through a user +study. Video results can be viewed on our website: +https://sherwinbahmani.github.io/tc4d. + +
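+ The global-motion component described above amounts to a rigid transform whose translation follows a spline and whose orientation can be taken from the spline tangent. Below is a small numerical sketch of that idea only, using Catmull-Rom interpolation and a yaw-only rotation; both choices, and all variable names, are illustrative assumptions rather than TC4D's implementation.
+
+ import numpy as np
+
+ def catmull_rom(p0, p1, p2, p3, t):
+     """Evaluate a Catmull-Rom spline segment at t in [0, 1]."""
+     t2, t3 = t * t, t * t * t
+     return 0.5 * ((2 * p1) + (-p0 + p2) * t +
+                   (2 * p0 - 5 * p1 + 4 * p2 - p3) * t2 +
+                   (-p0 + 3 * p1 - 3 * p2 + p3) * t3)
+
+ def rigid_along_trajectory(points, controls, t, eps=1e-3):
+     """Rigidly move `points` (N, 3) along the spline at normalized time t."""
+     p0, p1, p2, p3 = [np.asarray(c, dtype=float) for c in controls]
+     pos = catmull_rom(p0, p1, p2, p3, t)
+     # Finite-difference tangent used to orient the scene bounding box along the path.
+     ahead = catmull_rom(p0, p1, p2, p3, min(t + eps, 1.0))
+     behind = catmull_rom(p0, p1, p2, p3, max(t - eps, 0.0))
+     tangent = ahead - behind
+     yaw = np.arctan2(tangent[1], tangent[0])
+     c, s = np.cos(yaw), np.sin(yaw)
+     R = np.array([[c, -s, 0.0], [s, c, 0.0], [0.0, 0.0, 1.0]])
+     return points @ R.T + pos
+
+ if __name__ == "__main__":
+     box = np.array([[0.5, 0.2, 0.0], [-0.5, -0.2, 0.0]])   # two corners of a local box
+     ctrl = [[0, 0, 0], [1, 0, 0], [2, 1, 0], [3, 3, 0]]     # spline control points
+     for t in (0.0, 0.5, 1.0):
+         print(t, rigid_along_trajectory(box, ctrl, t).round(2))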
+
+ comment: Project Page: https://sherwinbahmani.github.io/tc4d +
+
+
+
+
+ + ♻ ☆ Exploring Effective Priors and Efficient Models for Weakly-Supervised + Change Detection + + +
+ Weakly-supervised change detection (WSCD) aims to detect pixel-level changes +with only image-level annotations. Owing to its label efficiency, WSCD has been +drawing increasing attention recently. However, current WSCD methods often +encounter the challenges of change missing and change fabricating, i.e., the +inconsistency between image-level annotations and pixel-level predictions. +Specifically, change missing refers to the situation in which the WSCD model fails +to predict any changed pixels even though the image-level label indicates a +change, and vice versa for change fabricating. To address this challenge, in +this work, we leverage global-scale and local-scale priors in WSCD and propose +two components: a Dilated Prior (DP) decoder and a Label Gated (LG) constraint. +The DP decoder decodes samples with the changed image-level label, skips +samples with the unchanged label, and replaces them with an all-unchanged +pixel-level label. The LG constraint is derived from the correspondence between +changed representations and image-level labels, penalizing the model when it +mispredicts the change status. Additionally, we develop TransWCD, a simple yet +powerful transformer-based model, showcasing the potential of weakly-supervised +learning in change detection. By integrating the DP decoder and LG constraint +into TransWCD, we form TransWCD-DL. Our proposed TransWCD and TransWCD-DL +achieve significant +6.33% and +9.55% F1 score improvements over the +state-of-the-art methods on the WHU-CD dataset, respectively. Some performance +metrics even exceed several fully-supervised change detection (FSCD) +competitors. Code will be available at +https://github.com/zhenghuizhao/TransWCD. + +
+
+
+
+
+ + ♻ ☆ One-Prompt to Segment All Medical Images + + +
+ Large foundation models, known for their strong zero-shot generalization, +have excelled in visual and language applications. However, applying them to +medical image segmentation, a domain with diverse imaging types and target +labels, remains an open challenge. Current approaches, such as adapting +interactive segmentation models like the Segment Anything Model (SAM), require user +prompts for each sample during inference. Alternatively, transfer learning +methods like few/one-shot models demand labeled samples, leading to high costs. +This paper introduces a new paradigm toward universal medical image +segmentation, termed 'One-Prompt Segmentation.' One-Prompt Segmentation +combines the strengths of one-shot and interactive methods. In the inference +stage, with just \textbf{one prompted sample}, it can adeptly handle an unseen +task in a single forward pass. We train the One-Prompt Model on 64 open-source +medical datasets, accompanied by the collection of over 3,000 clinician-labeled +prompts. Tested on 14 previously unseen datasets, the One-Prompt Model +showcases superior zero-shot segmentation capabilities, outperforming a wide +range of related methods. The code and data are released at +\url{https://github.com/KidsWithTokens/one-prompt}. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2304.12620 +
+
+
+
+
+ + ♻ ☆ Large-Scale Multi-Hypotheses Cell Tracking Using Ultrametric Contours + Maps + + +
+ In this work, we describe a method for large-scale 3D cell-tracking through a +segmentation selection approach. The proposed method is effective at tracking +cells across large microscopy datasets on two fronts: (i) It can solve problems +containing millions of segmentation instances in terabyte-scale 3D+t datasets; +(ii) It achieves competitive results with or without deep learning, which +requires 3D annotated data that is scarce in the fluorescence microscopy +field. The proposed method computes cell tracks and segments using a hierarchy +of segmentation hypotheses and selects disjoint segments by maximizing the +overlap between adjacent frames. We show that this method achieves +state-of-the-art results on 3D images from the cell tracking challenge and has +a faster integer linear programming formulation. Moreover, our framework is +flexible and supports segmentations from off-the-shelf cell segmentation models +and can combine them into an ensemble that improves tracking. The code is +available at https://github.com/royerlab/ultrack. + +
+
+ comment: 13 pages, 7 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ ASDF: Assembly State Detection Utilizing Late Fusion by Integrating 6D + Pose Estimation + + +
+ In medical and industrial domains, providing guidance for assembly processes +is critical to ensure efficiency and safety. Errors in assembly can lead to +significant consequences such as extended surgery times and prolonged +manufacturing or maintenance times in industry. Assembly scenarios can benefit +from in-situ AR visualization to provide guidance, reduce assembly times and +minimize errors. To enable in-situ visualization, 6D pose estimation can be +leveraged. Existing 6D pose estimation techniques primarily focus on individual +objects and static captures. However, assembly scenarios have various dynamics, +including occlusion during assembly and changes in the assembly objects' +appearance. Existing work combining object detection/6D pose estimation and +assembly state detection focuses either on pure deep learning-based approaches +or limits the assembly state detection to building blocks. To address the +challenges of 6D pose estimation in combination with assembly state detection, +our approach ASDF builds upon the strengths of YOLOv8, a real-time capable +object detection framework. We extend this framework, refine the object pose, +and fuse pose knowledge with network-detected pose information. Utilizing +late fusion in our Pose2State module results in refined 6D pose estimation and +assembly state detection. By combining both pose and state information, our +Pose2State module predicts the final assembly state with precision. Our +evaluation on the ASDF dataset shows that our Pose2State module leads to +improved assembly state detection and that the improvement of the assembly +state further leads to a more robust 6D pose estimation. Moreover, on the GBOT +dataset, we outperform the pure deep learning-based network and even +outperform the hybrid and pure tracking-based approaches. + +
+
+
+
+
+ + ♻ ☆ Modality Translation for Object Detection Adaptation Without Forgetting + Prior Knowledge + + +
+ A common practice in deep learning consists of training large neural networks +on massive datasets to perform accurately for different domains and tasks. +While this methodology may work well in numerous application areas, it does not +transfer readily across modalities due to the larger distribution shift in data captured +using different sensors. This paper focuses on the problem of adapting a large +object detection model to one or multiple modalities while being efficient. To +do so, we propose ModTr as an alternative to the common approach of fine-tuning +large models. ModTr consists of adapting the input with a small transformation +network trained to minimize the detection loss directly. The original model can +therefore work on the translated inputs without any further change or +fine-tuning to its parameters. Experimental results on translating from IR to +RGB images on two well-known datasets show that this simple ModTr approach +provides detectors that can perform comparably to or better than standard +fine-tuning, without forgetting the original knowledge. This opens the door to +a more flexible and efficient service-based detection pipeline in which, +instead of using a different detector for each modality, a single, unaltered +server runs continuously and can be queried by multiple modalities through their +corresponding translations. Code: https://github.com/heitorrapela/ModTr. + +
+
+
+
+
+ + ♻ ☆ Putting the Object Back into Video Object Segmentation CVPR 2024 + + +
+ We present Cutie, a video object segmentation (VOS) network with object-level +memory reading, which puts the object representation from memory back into the +video object segmentation result. Recent works on VOS employ bottom-up +pixel-level memory reading which struggles due to matching noise, especially in +the presence of distractors, resulting in lower performance in more challenging +data. In contrast, Cutie performs top-down object-level memory reading by +adapting a small set of object queries. Via those, it interacts with the +bottom-up pixel features iteratively with a query-based object transformer (qt, +hence Cutie). The object queries act as a high-level summary of the target +object, while high-resolution feature maps are retained for accurate +segmentation. Together with foreground-background masked attention, Cutie +cleanly separates the semantics of the foreground object from the background. +On the challenging MOSE dataset, Cutie improves by 8.7 J&F over XMem with a +similar running time and improves by 4.2 J&F over DeAOT while being three times +faster. Code is available at: https://hkchengrex.github.io/Cutie + +
+
+ comment: CVPR 2024 Highlight. Project page: https://hkchengrex.github.io/Cutie +
+
+
+
+
+ + ♻ ☆ Sat2Cap: Mapping Fine-Grained Textual Descriptions from Satellite Images + + +
+ We propose a weakly supervised approach for creating maps using free-form +textual descriptions. We refer to this work of creating textual maps as +zero-shot mapping. Prior works have approached mapping tasks by developing +models that predict a fixed set of attributes using overhead imagery. However, +these models are very restrictive as they can only solve highly specific tasks +for which they were trained. Mapping text, on the other hand, allows us to +solve a large variety of mapping problems with minimal restrictions. To achieve +this, we train a contrastive learning framework called Sat2Cap on a new +large-scale dataset with 6.1M pairs of overhead and ground-level images. For a +given location and overhead image, our model predicts the expected CLIP +embeddings of the ground-level scenery. The predicted CLIP embeddings are then +used to learn about the textual space associated with that location. Sat2Cap is +also conditioned on date-time information, allowing it to model temporally +varying concepts over a location. Our experimental results demonstrate that our +models successfully capture ground-level concepts and allow large-scale mapping +of fine-grained textual queries. Our approach does not require any text-labeled +data, making the training easily scalable. The code, dataset, and models will +be made publicly available. + +
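+ A minimal sketch of the kind of training signal described above: an overhead-image encoder is pulled toward the CLIP embedding of the co-located ground-level image with a symmetric InfoNCE loss. The stand-in encoder, the temperature, and the use of precomputed CLIP embeddings (`clip_ground_embeds`) are assumptions for illustration, not the Sat2Cap training code.
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ def clip_style_contrastive_loss(pred: torch.Tensor, target: torch.Tensor, tau: float = 0.07):
+     """Symmetric InfoNCE between predicted overhead embeddings and CLIP ground embeddings."""
+     pred = F.normalize(pred, dim=-1)
+     target = F.normalize(target, dim=-1)
+     logits = pred @ target.t() / tau                 # (B, B) similarity matrix
+     labels = torch.arange(pred.size(0), device=pred.device)
+     return 0.5 * (F.cross_entropy(logits, labels) + F.cross_entropy(logits.t(), labels))
+
+ if __name__ == "__main__":
+     overhead_encoder = nn.Sequential(                 # stand-in for the Sat2Cap encoder
+         nn.Flatten(), nn.Linear(3 * 64 * 64, 512))
+     overhead = torch.randn(8, 3, 64, 64)              # batch of overhead image patches
+     clip_ground_embeds = torch.randn(8, 512)          # assumed precomputed CLIP embeddings
+     loss = clip_style_contrastive_loss(overhead_encoder(overhead), clip_ground_embeds)
+     print(float(loss))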
+
+ comment: 16 pages +
+
+
+
+
+ + ♻ ☆ Learning county from pixels: Corn yield prediction with + attention-weighted multiple instance learning + + +
+ Remote sensing technology has become a promising tool in yield prediction. +Most prior work employs satellite imagery for county-level corn yield +prediction by spatially aggregating all pixels within a county into a single +value, potentially overlooking the detailed information and valuable insights +offered by more granular data. To this end, this research examines each county +at the pixel level and applies multiple instance learning to leverage detailed +information within a county. In addition, our method addresses the "mixed +pixel" issue caused by the inconsistent resolution between feature datasets and +crop mask, which may introduce noise into the model and therefore hinder +accurate yield prediction. Specifically, the attention mechanism is employed to +automatically assign weights to different pixels, which can mitigate the +influence of mixed pixels. The experimental results show that the developed +model outperforms four other machine learning models over the past five years +in the U.S. corn belt and demonstrates its best performance in 2022, achieving +a coefficient of determination (R2) value of 0.84 and a root mean square error +(RMSE) of 0.83. This paper demonstrates the advantages of our approach from +both spatial and temporal perspectives. Furthermore, through an in-depth study +of the relationship between mixed pixels and attention, it is verified that our +approach can capture critical feature information while filtering out noise +from mixed pixels. + +
+
+ comment: I am writing to request the withdrawal of my paper submitted to + arXiv. Upon further review, I have identified an error in the paper that + significantly affects the results and conclusions. To maintain the integrity + of the scientific record and prevent the dissemination of incorrect + information, I believe it is necessary to withdraw the paper from the archive +
+
+
+
+
+ + ♻ ☆ Fooling Contrastive Language-Image Pre-trained Models with + CLIPMasterPrints + + +
+ Models leveraging both visual and textual data, such as Contrastive +Language-Image Pre-training (CLIP), are the backbone of many recent advances in +artificial intelligence. In this work, we show that despite their versatility, +such models are vulnerable to what we refer to as fooling master images. +Fooling master images are capable of maximizing the confidence score of a CLIP +model for a significant number of widely varying prompts, while being either +unrecognizable or unrelated to the attacked prompts for humans. The existence +of such images is problematic as they could be used by bad actors to maliciously +interfere with CLIP-trained image retrieval models in production with +comparably little effort, since a single image can attack many different prompts. We +demonstrate how fooling master images for CLIP (CLIPMasterPrints) can be mined +using stochastic gradient descent, projected gradient descent, or blackbox +optimization. Contrary to many common adversarial attacks, the blackbox +optimization approach allows us to mine CLIPMasterPrints even when the weights +of the model are not accessible. We investigate the properties of the mined +images and find that images trained on a small number of image captions +generalize to a much larger number of semantically related captions. We +evaluate possible mitigation strategies, where we increase the robustness of +the model and introduce an approach to automatically detect CLIPMasterPrints to +sanitize the input of vulnerable models. Finally, we find that vulnerability to +CLIPMasterPrints is related to a modality gap in contrastive pre-trained +multi-modal networks. Code available at +https://github.com/matfrei/CLIPMasterPrints. + +
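+ To make the mining procedure concrete, the sketch below runs a few steps of gradient ascent on the average image-text similarity of a frozen Hugging Face CLIP model over a handful of prompts. The model choice, prompt list, learning rate, step count, and the crude pixel clamping (no CLIP normalization) are all assumptions for illustration; the paper's PGD and blackbox variants are not shown.
+
+ import torch
+ from transformers import CLIPModel, CLIPProcessor
+
+ model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").eval()
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ prompts = ["a photo of a dog", "a photo of a car", "a photo of a mountain"]
+ tok = processor(text=prompts, return_tensors="pt", padding=True)
+ with torch.no_grad():
+     text_emb = torch.nn.functional.normalize(model.get_text_features(**tok), dim=-1)
+
+ # Optimize the raw pixel tensor fed to the (frozen) image encoder.
+ image = torch.rand(1, 3, 224, 224, requires_grad=True)
+ opt = torch.optim.Adam([image], lr=0.05)
+ for step in range(50):  # a few steps for illustration; real mining runs much longer
+     img_emb = torch.nn.functional.normalize(
+         model.get_image_features(pixel_values=image), dim=-1)
+     loss = -(img_emb @ text_emb.t()).mean()   # maximize mean similarity to all prompts
+     opt.zero_grad()
+     loss.backward()
+     opt.step()
+     with torch.no_grad():
+         image.clamp_(0, 1)                    # keep pixels in a valid range
+ print("mean similarity:", float(-loss))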
+
+
+
+
+ + ♻ ☆ Efficient Representation of Natural Image Patches + + +
+ Utilizing an abstract information processing model based on minimal yet +realistic assumptions inspired by biological systems, we study how to achieve +the early visual system's two ultimate objectives: efficient information +transmission and accurate sensor probability distribution modeling. We prove +that optimizing for information transmission does not guarantee optimal +probability distribution modeling in general. We illustrate, using a two-pixel +(2D) system and image patches, that an efficient representation can be realized +through a nonlinear population code driven by two types of biologically +plausible loss functions that depend solely on output. After unsupervised +learning, our abstract information processing model bears remarkable +resemblances to biological systems, despite not mimicking many features of real +neurons, such as spiking activity. A preliminary comparison with a contemporary +deep learning model suggests that our model offers a significant efficiency +advantage. Our model provides novel insights into the computational theory of +early visual systems as well as a potential new approach to enhance the +efficiency of deep learning models. + +
+
+
+
+
+ + ♻ ☆ DQ-DETR: DETR with Dynamic Query for Tiny Object Detection + + +
+ Despite previous DETR-like methods having performed successfully in generic +object detection, tiny object detection is still a challenging task for them, +since the positional information of object queries is not customized for +detecting tiny objects, whose scale is extraordinarily smaller than that of general +objects. Moreover, the fixed number of queries used by DETR-like methods makes them +unsuitable for aerial datasets, which contain only tiny objects and whose +numbers of instances are imbalanced across images. Thus, we present +a simple yet effective model, named DQ-DETR, which consists of three +components: a categorical counting module, counting-guided feature enhancement, +and dynamic query selection, to solve the above-mentioned problems. DQ-DETR uses +the prediction and density maps from the categorical counting module to +dynamically adjust the number of object queries and improve the positional +information of queries. Our model DQ-DETR outperforms previous CNN-based and +DETR-like methods, achieving a state-of-the-art mAP of 30.2% on the AI-TOD-V2 +dataset, which mostly consists of tiny objects. + +
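+ The dynamic-query idea can be illustrated in a few lines: derive an instance-count estimate from a predicted density map and turn it into a per-image query budget instead of a fixed one. The bounds and the queries-per-object factor below are illustrative assumptions, not DQ-DETR's exact selection scheme.
+
+ import torch
+
+ def dynamic_num_queries(density_map: torch.Tensor, min_q: int = 100, max_q: int = 1500,
+                         per_object: float = 3.0) -> int:
+     """density_map: (1, H, W) predicted density; returns a query budget for this image."""
+     est_count = float(density_map.sum())               # estimated number of instances
+     budget = int(round(est_count * per_object))        # a few queries per estimated object
+     return max(min_q, min(max_q, budget))
+
+ if __name__ == "__main__":
+     sparse = torch.zeros(1, 64, 64)
+     sparse[0, 10, 10] = 5.0                             # roughly 5 objects
+     dense = torch.full((1, 64, 64), 0.1)                # roughly 410 objects
+     print(dynamic_num_queries(sparse), dynamic_num_queries(dense))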
+
+
+
+
+ + ♻ ☆ EFHQ: Multi-purpose ExtremePose-Face-HQ dataset + + +
+ The existing facial datasets, while having plentiful images at near frontal +views, lack images with extreme head poses, leading to the downgraded +performance of deep learning models when dealing with profile or pitched faces. +This work aims to address this gap by introducing a novel dataset named Extreme +Pose Face High-Quality Dataset (EFHQ), which includes a maximum of 450k +high-quality images of faces at extreme poses. To produce such a massive +dataset, we utilize a novel and meticulous dataset processing pipeline to +curate two publicly available datasets, VFHQ and CelebV-HQ, which contain many +high-resolution face videos captured in various settings. Our dataset can +complement existing datasets on various facial-related tasks, such as facial +synthesis with 2D/3D-aware GAN, diffusion-based text-to-image face generation, +and face reenactment. Specifically, training with EFHQ helps models generalize +well across diverse poses, significantly improving performance in scenarios +involving extreme views, confirmed by extensive experiments. Additionally, we +utilize EFHQ to define a challenging cross-view face verification benchmark, in +which the performance of SOTA face recognition models drops 5-37% compared to +frontal-to-frontal scenarios, aiming to stimulate studies on face recognition +under severe pose conditions in the wild. + +
+
+ comment: Project Page: https://bomcon123456.github.io/efhq/ +
+
+
+
+
+ + ♻ ☆ IISAN: Efficiently Adapting Multimodal Representation for Sequential + Recommendation with Decoupled PEFT SIGIR2024 + + +
+ Multimodal foundation models are transformative in sequential recommender +systems, leveraging powerful representation learning capabilities. While +Parameter-efficient Fine-tuning (PEFT) is commonly used to adapt foundation +models for recommendation tasks, most research prioritizes parameter +efficiency, often overlooking critical factors like GPU memory efficiency and +training speed. Addressing this gap, our paper introduces IISAN (Intra- and +Inter-modal Side Adapted Network for Multimodal Representation), a simple +plug-and-play architecture using a Decoupled PEFT structure and exploiting both +intra- and inter-modal adaptation. + IISAN matches the performance of full fine-tuning (FFT) and state-of-the-art +PEFT. More importantly, it significantly reduces GPU memory usage, from 47GB +to just 3GB, for multimodal sequential recommendation tasks. Additionally, it +accelerates training time per epoch from 443s to 22s compared to FFT. This is +also a notable improvement over the Adapter and LoRA, which require 37-39 GB +GPU memory and 350-380 seconds per epoch for training. + Furthermore, we propose a new composite efficiency metric, TPME +(Training-time, Parameter, and GPU Memory Efficiency) to alleviate the +prevalent misconception that "parameter efficiency represents overall +efficiency". TPME provides more comprehensive insights into practical +efficiency comparisons between different methods. Besides, we give an +accessible efficiency analysis of all PEFT and FFT approaches, which +demonstrates the superiority of IISAN. We release our codes and other materials +at https://github.com/GAIR-Lab/IISAN. + +
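+ The abstract names the three ingredients of TPME (training time, trainable parameters, GPU memory) but not its formula, so the snippet below is only a hypothetical way to fold such measurements into a single score normalized against the FFT baseline; the parameter counts in the example are placeholders, and the paper or repository should be consulted for the actual definition.
+
+ def composite_efficiency(time_s: float, params_m: float, mem_gb: float,
+                          fft=(443.0, 100.0, 47.0), weights=(1 / 3, 1 / 3, 1 / 3)) -> float:
+     """Lower is better: weighted average of ratios to the full fine-tuning baseline."""
+     ratios = (time_s / fft[0], params_m / fft[1], mem_gb / fft[2])
+     return sum(w * r for w, r in zip(weights, ratios))
+
+ # Example with the numbers quoted above (the parameter counts are placeholders).
+ print("IISAN :", round(composite_efficiency(22.0, 5.0, 3.0), 3))
+ print("LoRA  :", round(composite_efficiency(365.0, 5.0, 38.0), 3))
+ print("FFT   :", round(composite_efficiency(443.0, 100.0, 47.0), 3))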
+
+ comment: Accepted by SIGIR2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 140 + +
+
+
+ + ☆ GoodDrag: Towards Good Practices for Drag Editing with Diffusion Models + + +
+ In this paper, we introduce GoodDrag, a novel approach to improve the +stability and image quality of drag editing. Unlike existing methods that +struggle with accumulated perturbations and often result in distortions, +GoodDrag introduces an AlDD framework that alternates between drag and +denoising operations within the diffusion process, effectively improving the +fidelity of the result. We also propose an information-preserving motion +supervision operation that maintains the original features of the starting +point for precise manipulation and artifact reduction. In addition, we +contribute to the benchmarking of drag editing by introducing a new dataset, +Drag100, and developing dedicated quality assessment metrics, Dragging Accuracy +Index and Gemini Score, utilizing Large Multimodal Models. Extensive +experiments demonstrate that the proposed GoodDrag compares favorably against +the state-of-the-art approaches both qualitatively and quantitatively. The +project page is https://gooddrag.github.io. + +
+
+
+
+
+ + ☆ BRAVE: Broadening the visual encoding of vision-language models + + +
+ Vision-language models (VLMs) are typically composed of a vision encoder, +e.g. CLIP, and a language model (LM) that interprets the encoded features to +solve downstream tasks. Despite remarkable progress, VLMs are subject to +several shortcomings due to the limited capabilities of vision encoders, e.g. +"blindness" to certain image features, visual hallucination, etc. To address +these issues, we study broadening the visual encoding capabilities of VLMs. We +first comprehensively benchmark several vision encoders with different +inductive biases for solving VLM tasks. We observe that there is no single +encoding configuration that consistently achieves top performance across +different tasks, and encoders with different biases can perform surprisingly +similarly. Motivated by this, we introduce a method, named BRAVE, that +consolidates features from multiple frozen encoders into a more versatile +representation that can be directly fed as the input to a frozen LM. BRAVE +achieves state-of-the-art performance on a broad range of captioning and VQA +benchmarks and significantly reduces the aforementioned issues of VLMs, while +requiring a smaller number of trainable parameters than existing methods and +having a more compressed representation. Our results highlight the potential of +incorporating different visual biases for a broader and more contextualized +visual understanding of VLMs. + +
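+ A minimal sketch of the consolidation idea above: patch features from several frozen vision encoders are concatenated, projected, and resampled into a fixed number of tokens that a frozen LM could consume. The encoder dimensions, token count, and the query-based resampling are assumptions for illustration; BRAVE's actual bridge module is specified in the paper.
+
+ import torch
+ import torch.nn as nn
+
+ class MultiEncoderBridge(nn.Module):
+     def __init__(self, encoder_dims=(768, 1024, 1152), lm_dim=2048, num_tokens=32):
+         super().__init__()
+         self.proj = nn.Linear(sum(encoder_dims), lm_dim)
+         self.queries = nn.Parameter(torch.randn(num_tokens, lm_dim) * 0.02)
+         self.attn = nn.MultiheadAttention(lm_dim, num_heads=8, batch_first=True)
+
+     def forward(self, feats):
+         # feats: list of (B, N, D_i) patch features from frozen encoders (same N assumed).
+         x = self.proj(torch.cat(feats, dim=-1))                   # (B, N, lm_dim)
+         q = self.queries.unsqueeze(0).expand(x.size(0), -1, -1)   # (B, num_tokens, lm_dim)
+         out, _ = self.attn(q, x, x)                               # resample to a fixed length
+         return out                                                # prepend to the frozen LM input
+
+ if __name__ == "__main__":
+     bridge = MultiEncoderBridge()
+     feats = [torch.randn(2, 196, d) for d in (768, 1024, 1152)]
+     print(bridge(feats).shape)  # torch.Size([2, 32, 2048])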
+
+ comment: Project page at https://brave-vlms.epfl.ch/ +
+
+
+
+
+ + ☆ UMBRAE: Unified Multimodal Decoding of Brain Signals + + +
+ We address prevailing challenges in brain-powered research, departing +from the observation that existing methods hardly recover accurate spatial +information and typically require subject-specific models. To address these challenges, +we propose UMBRAE, a unified multimodal decoding of brain signals. First, to +extract instance-level conceptual and spatial details from neural signals, we +introduce an efficient universal brain encoder for multimodal-brain alignment +and recover object descriptions at multiple levels of granularity from a +subsequent multimodal large language model (MLLM). Second, we introduce a +cross-subject training strategy mapping subject-specific features to a common +feature space. This allows a model to be trained on multiple subjects without +extra resources, even yielding superior results compared to subject-specific +models. Further, we demonstrate that this supports weakly-supervised adaptation to +new subjects, with only a fraction of the total training data. Experiments +demonstrate that UMBRAE not only achieves superior results in the newly +introduced tasks but also outperforms methods in well-established tasks. To +assess our method, we construct and share with the community a comprehensive +brain understanding benchmark BrainHub. Our code and benchmark are available at +https://weihaox.github.io/UMBRAE. + +
+
+ comment: Project Page: https://weihaox.github.io/UMBRAE +
+
+
+
+
+ + ☆ RealmDreamer: Text-Driven 3D Scene Generation with Inpainting and Depth + Diffusion + + +
+ We introduce RealmDreamer, a technique for generation of general +forward-facing 3D scenes from text descriptions. Our technique optimizes a 3D +Gaussian Splatting representation to match complex text prompts. We initialize +these splats by utilizing the state-of-the-art text-to-image generators, +lifting their samples into 3D, and computing the occlusion volume. We then +optimize this representation across multiple views as a 3D inpainting task with +image-conditional diffusion models. To learn correct geometric structure, we +incorporate a depth diffusion model by conditioning on the samples from the +inpainting model, giving rich geometric structure. Finally, we finetune the +model using sharpened samples from image generators. Notably, our technique +does not require video or multi-view data and can synthesize a variety of +high-quality 3D scenes in different styles, consisting of multiple objects. Its +generality additionally allows 3D synthesis from a single image. + +
+
+ comment: Project Page: https://realmdreamer.github.io/ +
+
+
+
+
+ + ☆ InstantMesh: Efficient 3D Mesh Generation from a Single Image with + Sparse-view Large Reconstruction Models + + +
+ We present InstantMesh, a feed-forward framework for instant 3D mesh +generation from a single image, featuring state-of-the-art generation quality +and significant training scalability. By synergizing the strengths of an +off-the-shelf multiview diffusion model and a sparse-view reconstruction model +based on the LRM architecture, InstantMesh is able to create diverse 3D assets +within 10 seconds. To enhance the training efficiency and exploit more +geometric supervisions, e.g., depths and normals, we integrate a differentiable +iso-surface extraction module into our framework and directly optimize on the +mesh representation. Experimental results on public datasets demonstrate that +InstantMesh significantly outperforms other recent image-to-3D baselines, both +qualitatively and quantitatively. We release all the code, weights, and demo of +InstantMesh, with the intention that it can make substantial contributions to +the community of 3D generative AI and empower both researchers and content +creators. + +
+
+ comment: Technical report. Project: https://github.com/TencentARC/InstantMesh +
+
+
+
+
+ + ☆ GCV-Turbo: End-to-end Acceleration of GNN-based Computer Vision Tasks on + FPGA + + +
+ Graph neural networks (GNNs) have recently empowered various novel computer +vision (CV) tasks. In GNN-based CV tasks, either a combination of CNN layers and GNN +layers or only GNN layers is employed. This paper introduces GCV-Turbo, a +domain-specific accelerator on FPGA for end-to-end acceleration of GNN-based CV +tasks. GCV-Turbo consists of two key components: (1) a \emph{novel} hardware +architecture optimized for the computation kernels in both CNNs and GNNs using +the same set of computation resources; (2) a PyTorch-compatible compiler that +takes a user-defined model as input, performs end-to-end optimization for the +computation graph of a given GNN-based CV task, and produces optimized code for +hardware execution. The hardware architecture and the compiler work +synergistically to support a variety of GNN-based CV tasks. We implement +GCV-Turbo on a state-of-the-art FPGA and evaluate its performance across six +representative GNN-based CV tasks with diverse input data modalities (e.g., +image, human skeleton, point cloud). Compared with state-of-the-art CPU (GPU) +implementations, GCV-Turbo achieves an average latency reduction of +$68.4\times$ ($4.1\times$) on these six GNN-based CV tasks. Moreover, GCV-Turbo +supports the execution of standalone CNNs or GNNs, achieving performance +comparable to that of state-of-the-art CNN (GNN) accelerators for widely used +CNN-only (GNN-only) models. + +
+
+
+
+
+ + ☆ Move Anything with Layered Scene Diffusion CVPR 2024 + + +
+ Diffusion models generate images with an unprecedented level of quality, but +how can we freely rearrange image layouts? Recent works generate controllable +scenes via learning spatially disentangled latent codes, but these methods do +not apply to diffusion models due to their fixed forward process. In this work, +we propose SceneDiffusion to optimize a layered scene representation during the +diffusion sampling process. Our key insight is that spatial disentanglement can +be obtained by jointly denoising scene renderings at different spatial layouts. +Our generated scenes support a wide range of spatial editing operations, +including moving, resizing, and cloning, as well as layer-wise appearance editing +operations such as object restyling and replacement. Moreover, a scene can be +generated conditioned on a reference image, thus enabling object moving for +in-the-wild images. Notably, this approach is training-free, compatible with +general text-to-image diffusion models, and responsive in less than a second. + +
+
+ comment: CVPR 2024 camera-ready +
+
+
+
+
+ + ☆ Self-supervised Monocular Depth Estimation on Water Scenes via Specular + Reflection Prior + + +
+ Monocular depth estimation from a single image is an ill-posed problem for +computer vision due to insufficient reliable cues as the prior knowledge. +Besides the inter-frame supervision, namely stereo and adjacent frames, +extensive prior information is available in the same frame. Reflections from +specular surfaces, informative intra-frame priors, enable us to reformulate the +ill-posed depth estimation task as a multi-view synthesis. This paper proposes +the first self-supervision for deep-learning depth estimation on water scenes +via intra-frame priors, known as reflection supervision and geometrical +constraints. In the first stage, a water segmentation network is performed to +separate the reflection components from the entire image. Next, we construct a +self-supervised framework to predict the target appearance from reflections, +perceived as other perspectives. The photometric re-projection error, +incorporating SmoothL1 and a novel photometric adaptive SSIM, is formulated to +optimize pose and depth estimation by aligning the transformed virtual depths +and source ones. As a supplement, the water surface is determined from real and +virtual camera positions, which complement the depth of the water area. +Furthermore, to alleviate these laborious ground truth annotations, we +introduce a large-scale water reflection scene (WRS) dataset rendered from +Unreal Engine 4. Extensive experiments on the WRS dataset prove the feasibility +of the proposed method compared to state-of-the-art depth estimation +techniques. + +
+
+ comment: 16 pages, 8 figures +
+
+
+
+
+ + ☆ Unified Language-driven Zero-shot Domain Adaptation CVPR 2024 + + +
+ This paper introduces Unified Language-driven Zero-shot Domain Adaptation +(ULDA), a novel task setting that enables a single model to adapt to diverse +target domains without explicit domain-ID knowledge. We identify the +constraints in the existing language-driven zero-shot domain adaptation task, +particularly the requirement for domain IDs and domain-specific models, which +may restrict flexibility and scalability. To overcome these issues, we propose +a new framework for ULDA, consisting of Hierarchical Context Alignment (HCA), +Domain Consistent Representation Learning (DCRL), and Text-Driven Rectifier +(TDR). These components work synergistically to align simulated features with +target text across multiple visual levels, retain semantic correlations between +different regional representations, and rectify biases between simulated and +real target visual features, respectively. Our extensive empirical evaluations +demonstrate that this framework achieves competitive performance in both +settings, surpassing even the model that requires domain-ID, showcasing its +superiority and generalization ability. The proposed method is not only +effective but also maintains practicality and efficiency, as it does not +introduce additional computational costs during inference. Our project page is +https://senqiaoyang.com/project/ULDA . + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Lost in Translation: Modern Neural Networks Still Struggle With Small + Realistic Image Transformations + + +
+ Deep neural networks that achieve remarkable performance in image +classification have previously been shown to be easily fooled by tiny +transformations such as a one-pixel translation of the input image. In order to +address this problem, two approaches have been proposed in recent years. The +first approach suggests using huge datasets together with data augmentation in +the hope that a highly varied training set will teach the network to learn to +be invariant. The second approach suggests using architectural modifications +based on sampling theory to deal explicitly with image translations. In this +paper, we show that these approaches still fall short in robustly handling +'natural' image translations that simulate a subtle change in camera +orientation. Our findings reveal that a mere one-pixel translation can result +in a significant change in the predicted image representation for approximately +40% of the test images in state-of-the-art models (e.g., open-CLIP trained on +LAION-2B or DINO-v2), while models that are explicitly constructed to be +robust to cyclic translations can still be fooled by realistic +(non-cyclic) 1-pixel translations 11% of the time. We present Robust Inference by Crop +Selection: a simple method that can be proven to achieve any desired level of +consistency, although with a modest tradeoff with the model's accuracy. +Importantly, we demonstrate how employing this method reduces the ability to +fool state-of-the-art models with a 1-pixel translation to less than 5% while +suffering from only a 1% drop in classification accuracy. Additionally, we show +that our method can be easily adjusted to deal with circular shifts as well. In +that case, we achieve 100% robustness to integer shifts with state-of-the-art +accuracy, and with no need for any further training. + +
+
+ comment: 14 pages, 6 appendices, 17 figures +
+
+
+
+
+ + ☆ Measuring proximity to standard planes during fetal brain ultrasound + scanning + + +
+ This paper introduces a novel pipeline designed to bring ultrasound (US) +plane pose estimation closer to clinical use for more effective navigation to +the standard planes (SPs) in the fetal brain. We propose a semi-supervised +segmentation model utilizing both labeled SPs and unlabeled 3D US volume +slices. Our model enables reliable segmentation across a diverse set of fetal +brain images. Furthermore, the model incorporates a classification mechanism to +identify the fetal brain precisely. Our model not only filters out frames +lacking the brain but also generates masks for those containing it, enhancing +the relevance of plane pose regression in clinical settings. We focus on fetal +brain navigation from 2D ultrasound (US) video analysis and combine this model +with a US plane pose regression network to provide sensorless proximity +detection to SPs and non-SPs planes; we emphasize the importance of proximity +detection to SPs for guiding sonographers, offering a substantial advantage +over traditional methods by allowing earlier and more precise adjustments +during scanning. We demonstrate the practical applicability of our approach +through validation on real fetal scan videos obtained from sonographers of +varying expertise levels. Our findings demonstrate the potential of our +approach to complement existing fetal US technologies and advance prenatal +diagnostic practices. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Driver Attention Tracking and Analysis + + +
+ We propose a novel method to estimate a driver's points-of-gaze using a pair +of ordinary cameras mounted on the windshield and dashboard of a car. This is a +challenging problem due to the dynamics of traffic environments with 3D scenes +of unknown depths. This problem is further complicated by the volatile distance +between the driver and the camera system. To tackle these challenges, we +develop a novel convolutional network that simultaneously analyzes the image of +the scene and the image of the driver's face. This network has a camera +calibration module that can compute an embedding vector that represents the +spatial configuration between the driver and the camera system. This +calibration module improves the overall network's performance, which can be +jointly trained end to end. + We also address the lack of annotated data for training and evaluation by +introducing a large-scale driving dataset with point-of-gaze annotations. This +is an in situ dataset of real driving sessions in an urban city, containing +synchronized images of the driving scene as well as the face and gaze of the +driver. Experiments on this dataset show that the proposed method outperforms +various baseline methods, having the mean prediction error of 29.69 pixels, +which is relatively small compared to the $1280{\times}720$ resolution of the +scene camera. + +
+
+
+
+
+ + ☆ Unfolding ADMM for Enhanced Subspace Clustering of Hyperspectral Images + + +
+ Deep subspace clustering methods are now prominent in clustering, typically +using fully connected networks and a self-representation loss function. +However, these methods often struggle with overfitting and lack +interpretability. In this paper, we explore an alternative clustering approach +based on deep unfolding. By unfolding iterative optimization methods into +neural networks, this approach offers enhanced interpretability and reliability +compared to data-driven deep learning methods, and greater adaptability and +generalization than model-based approaches. Hence, unfolding has become widely +used in inverse imaging problems, such as image restoration, reconstruction, +and super-resolution, but has not yet been sufficiently explored in the context +of clustering. In this work, we introduce an innovative clustering architecture +for hyperspectral images (HSI) by unfolding an iterative solver based on the +Alternating Direction Method of Multipliers (ADMM) for sparse subspace +clustering. To our knowledge, this is the first attempt to apply unfolded ADMM +for computing the self-representation matrix in subspace clustering. Moreover, +our approach captures well the structural characteristics of HSI data by +employing the K nearest neighbors algorithm as part of a structure preservation +module. Experimental evaluation on three established HSI datasets clearly shows +the potential of the unfolding approach in HSI clustering and even demonstrates +superior performance compared to state-of-the-art techniques. + +
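+ For readers unfamiliar with unfolding, the sketch below turns one ADMM iteration for sparse subspace clustering (min_C ||C||_1 + lam/2 ||X - XC||_F^2 with diag(C) = 0) into a layer whose penalty and shrinkage parameters are learnable, and stacks a few such layers. The initial values, the depth, and the omission of the structure-preservation module are assumptions for illustration, not the paper's architecture.
+
+ import torch
+ import torch.nn as nn
+
+ class UnfoldedADMMLayer(nn.Module):
+     def __init__(self, lam: float = 10.0):
+         super().__init__()
+         self.log_rho = nn.Parameter(torch.tensor(0.0))     # learnable ADMM penalty
+         self.log_theta = nn.Parameter(torch.tensor(-2.0))  # learnable shrinkage threshold
+         self.lam = lam
+
+     def forward(self, X, C, U):
+         # X: (D, N) data (columns = pixels/spectra); C, U: (N, N) ADMM variables.
+         rho, theta = self.log_rho.exp(), self.log_theta.exp()
+         G = self.lam * (X.t() @ X)
+         N = X.shape[1]
+         A = torch.linalg.solve(G + rho * torch.eye(N, device=X.device),
+                                G + rho * (C - U))                     # data-fit update
+         Z = A + U
+         C_new = torch.sign(Z) * torch.clamp(Z.abs() - theta, min=0.0)  # soft threshold
+         C_new = C_new - torch.diag(torch.diagonal(C_new))              # enforce diag(C) = 0
+         U_new = U + A - C_new                                          # dual update
+         return C_new, U_new
+
+ if __name__ == "__main__":
+     X = torch.randn(30, 100)                 # 100 pixels with 30 spectral bands
+     C = torch.zeros(100, 100)
+     U = torch.zeros(100, 100)
+     layers = nn.ModuleList(UnfoldedADMMLayer() for _ in range(5))
+     for layer in layers:
+         C, U = layer(X, C, U)
+     print(C.shape)                           # self-representation matrix for clustering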
+
+
+
+
+ + ☆ Wild Visual Navigation: Fast Traversability Learning via Pre-Trained + Models and Online Self-Supervision + + +
+ Natural environments such as forests and grasslands are challenging for +robotic navigation because of the false perception of rigid obstacles from high +grass, twigs, or bushes. In this work, we present Wild Visual Navigation (WVN), +an online self-supervised learning system for visual traversability estimation. +The system is able to continuously adapt from a short human demonstration in +the field, using only onboard sensing and computing. One of the key ideas to +achieve this is the use of high-dimensional features from pre-trained +self-supervised models, which implicitly encode semantic information that +massively simplifies the learning task. Further, an online supervision-generation +scheme enables concurrent training and inference of +the learned model in the wild. We demonstrate our approach through diverse +real-world deployments in forests, parks, and grasslands. Our system is able to +bootstrap the traversable terrain segmentation in less than 5 min of in-field +training time, enabling the robot to navigate in complex, previously unseen +outdoor terrains. Code: https://bit.ly/498b0CV - Project +page: https://bit.ly/3M6nMHH + +
+
+ comment: Extended version of arXiv:2305.08510 +
+
+
+
+
+ + ☆ 3DMambaComplete: Exploring Structured State Space Model for Point Cloud + Completion + + +
+ Point cloud completion aims to generate a complete and high-fidelity point +cloud from an initially incomplete and low-quality input. A prevalent strategy +involves leveraging Transformer-based models to encode global features and +facilitate the reconstruction process. However, the adoption of pooling +operations to obtain global feature representations often results in the loss +of local details within the point cloud. Moreover, the attention mechanism +inherent in Transformers introduces additional computational complexity, +rendering it challenging to handle long sequences effectively. To address these +issues, we propose 3DMambaComplete, a point cloud completion network built on +the novel Mamba framework. It comprises three modules. The HyperPoint Generation +module encodes point cloud features using Mamba's selection mechanism and predicts a +set of HyperPoints. A specific offset is estimated, and the down-sampled points +become HyperPoints. The HyperPoint Spread module disperses these HyperPoints +across different spatial locations to avoid concentration. Finally, a +deformation method transforms the 2D mesh representation of HyperPoints into a +fine-grained 3D structure for point cloud reconstruction. Extensive experiments +conducted on various established benchmarks demonstrate that 3DMambaComplete +surpasses state-of-the-art point cloud completion methods, as confirmed by +qualitative and quantitative analyses. + +
+
+ comment: 10 pages, 8 figures, 7 tables +
+
+
+
+
+ + ☆ Learning Priors for Non Rigid SfM from Casual Videos + + +
+ We tackle the long-standing challenge of reconstructing 3D structures and +camera positions from videos. The problem is particularly hard when objects are +transformed in a non-rigid way. Current approaches to this problem make +unrealistic assumptions or require a long optimization time. + We present TracksTo4D, a novel deep learning-based approach that enables +inferring 3D structure and camera positions from dynamic content originating +from in-the-wild videos using a single feed-forward pass on a sparse point +track matrix. To achieve this, we leverage recent advances in 2D point tracking +and design an equivariant neural architecture tailored for directly processing +2D point tracks by leveraging their symmetries. TracksTo4D is trained on a +dataset of in-the-wild videos utilizing only the 2D point tracks extracted from +the videos, without any 3D supervision. Our experiments demonstrate that +TracksTo4D generalizes well to unseen videos of unseen semantic categories at +inference time, producing equivalent results to state-of-the-art methods while +significantly reducing the runtime compared to other baselines. + +
+
+
+
+
+ + ☆ MoCap-to-Visual Domain Adaptation for Efficient Human Mesh Estimation + from 2D Keypoints CVPR + + +
+ This paper presents Key2Mesh, a model that takes a set of 2D human pose +keypoints as input and estimates the corresponding body mesh. Since this +process does not involve any visual (i.e. RGB image) data, the model can be +trained on large-scale motion capture (MoCap) datasets, thereby overcoming the +scarcity of image datasets with 3D labels. To enable the model's application on +RGB images, we first run an off-the-shelf 2D pose estimator to obtain the 2D +keypoints, and then feed these 2D keypoints to Key2Mesh. To improve the +performance of our model on RGB images, we apply an adversarial domain +adaptation (DA) method to bridge the gap between the MoCap and visual domains. +Crucially, our DA method does not require 3D labels for visual data, which +enables adaptation to target sets without the need for costly labels. We +evaluate Key2Mesh for the task of estimating 3D human meshes from 2D keypoints, +in the absence of RGB and mesh label pairs. Our results on widely used H3.6M +and 3DPW datasets show that Key2Mesh sets the new state-of-the-art by +outperforming other models in PA-MPJPE for both datasets, and in MPJPE and PVE +for the 3DPW dataset. Thanks to our model's simple architecture, it operates at +least 12x faster than the prior state-of-the-art model, LGD. Additional +qualitative samples and code are available on the project website: +https://key2mesh.github.io/. + +
+
+ comment: accepted to CVPRW 2024 +
+
+
+
+
+ + ☆ VLLMs Provide Better Context for Emotion Understanding Through Common + Sense Reasoning + + +
+ Recognising emotions in context involves identifying the apparent emotions of +an individual, taking into account contextual cues from the surrounding scene. +Previous approaches to this task have involved the design of explicit +scene-encoding architectures or the incorporation of external scene-related +information, such as captions. However, these methods often utilise limited +contextual information or rely on intricate training pipelines. In this work, +we leverage the groundbreaking capabilities of Vision-and-Large-Language Models +(VLLMs) to enhance in-context emotion classification without introducing +complexity to the training process in a two-stage approach. In the first stage, +we propose prompting VLLMs to generate descriptions in natural language of the +subject's apparent emotion relative to the visual context. In the second stage, +the descriptions are used as contextual information and, along with the image +input, are used to train a transformer-based architecture that fuses text and +visual features before the final classification task. Our experimental results +show that the text and image features have complementary information, and our +fused architecture significantly outperforms the individual modalities without +any complex training methods. We evaluate our approach on three different +datasets, namely, EMOTIC, CAER-S, and BoLD, and achieve state-of-the-art or +comparable accuracy across all datasets and metrics compared to much more +complex approaches. The code will be made publicly available on github: +https://github.com/NickyFot/EmoCommonSense.git + +
+
+ comment: A. Xenos, N. Foteinopoulou and I. Ntinou contributed equally to this + work; 14 pages, 5 figures +
+
+
+
+
+ + ☆ Implicit Multi-Spectral Transformer: An Lightweight and Effective + Visible to Infrared Image Translation Model IJCNN 2024 + + +
+ In the field of computer vision, visible light images often exhibit low +contrast in low-light conditions, presenting a significant challenge. While +infrared imagery provides a potential solution, its utilization entails high +costs and practical limitations. Recent advancements in deep learning, +particularly the deployment of Generative Adversarial Networks (GANs), have +facilitated the transformation of visible light images to infrared images. +However, these methods often experience unstable training phases and may +produce suboptimal outputs. To address these issues, we propose a novel +end-to-end Transformer-based model that efficiently converts visible light +images into high-fidelity infrared images. Initially, the Texture Mapping +Module and Color Perception Adapter collaborate to extract texture and color +features from the visible light image. The Dynamic Fusion Aggregation Module +subsequently integrates these features. Finally, the transformation into an +infrared image is refined through the synergistic action of the Color +Perception Adapter and the Enhanced Perception Attention mechanism. +Comprehensive benchmarking experiments confirm that our model outperforms +existing methods, producing infrared images of markedly superior quality, both +qualitatively and quantitatively. Furthermore, the proposed model enables more +effective downstream applications for infrared images than other methods. + +
+
+ comment: Accepted by IJCNN 2024 +
+
+
+
+
+ + ☆ Identification of Fine-grained Systematic Errors via Controlled Scene + Generation + + +
+ Many safety-critical applications, especially in autonomous driving, require +reliable object detectors. They can be very effectively assisted by a method to +search for and identify potential failures and systematic errors before these +detectors are deployed. Systematic errors are characterized by combinations of +attributes such as object location, scale, orientation, and color, as well as +the composition of their respective backgrounds. To identify them, one must +rely on something other than real images from a test set because they do not +account for very rare but possible combinations of attributes. To overcome this +limitation, we propose a pipeline for generating realistic synthetic scenes +with fine-grained control, allowing the creation of complex scenes with +multiple objects. Our approach, BEV2EGO, allows for a realistic generation of +the complete scene with road-contingent control that maps 2D bird's-eye view +(BEV) scene configurations to a first-person view (EGO). In addition, we +propose a benchmark for controlled scene generation to select the most +appropriate generative outpainting model for BEV2EGO. We further use it to +perform a systematic analysis of multiple state-of-the-art object detection +models and discover differences between them. + +
+
+
+
+
+ + ☆ An Evidential-enhanced Tri-Branch Consistency Learning Method for + Semi-supervised Medical Image Segmentation + + +
+ Semi-supervised segmentation presents a promising approach for large-scale
+medical image analysis, effectively reducing annotation burdens while achieving
+comparable performance. This methodology holds substantial potential for
+streamlining the segmentation process and enhancing its feasibility within
+clinical settings for translational investigations. While cross-supervised
+training, based on distinct co-training sub-networks, has become a prevalent
+paradigm for this task, addressing critical issues such as prediction
+disagreement and label-noise suppression still requires further attention and
+progress. In this paper, we introduce an Evidential Tri-Branch Consistency
+learning framework (ETC-Net) for semi-supervised medical image segmentation.
+ETC-Net employs three branches: an evidential conservative branch, an
+evidential progressive branch, and an evidential fusion branch. The first two
+branches exhibit complementary characteristics, allowing them to address
+prediction diversity and enhance training stability. We also integrate
+uncertainty estimation from evidential learning into cross-supervised training,
+mitigating the negative impact of erroneous supervision signals. Additionally,
+the evidential fusion branch capitalizes on the complementary attributes of the
+first two branches and leverages an evidence-based Dempster-Shafer fusion
+strategy, supervised by more reliable and accurate pseudo-labels of unlabeled
+data. Extensive experiments conducted on the LA, Pancreas-CT, and ACDC datasets
+demonstrate that ETC-Net surpasses other state-of-the-art methods for
+semi-supervised segmentation. The code will be made available in the near
+future at https://github.com/Medsemiseg.
+
+
+
+
+
+ + ☆ ORacle: Large Vision-Language Models for Knowledge-Guided Holistic OR + Domain Modeling + + +
+ Every day, countless surgeries are performed worldwide, each within the +distinct settings of operating rooms (ORs) that vary not only in their setups +but also in the personnel, tools, and equipment used. This inherent diversity +poses a substantial challenge for achieving a holistic understanding of the OR, +as it requires models to generalize beyond their initial training datasets. To +reduce this gap, we introduce ORacle, an advanced vision-language model +designed for holistic OR domain modeling, which incorporates multi-view and +temporal capabilities and can leverage external knowledge during inference, +enabling it to adapt to previously unseen surgical scenarios. This capability +is further enhanced by our novel data augmentation framework, which +significantly diversifies the training dataset, ensuring ORacle's proficiency +in applying the provided knowledge effectively. In rigorous testing, in scene +graph generation, and downstream tasks on the 4D-OR dataset, ORacle not only +demonstrates state-of-the-art performance but does so requiring less data than +existing models. Furthermore, its adaptability is displayed through its ability +to interpret unseen views, actions, and appearances of tools and equipment. +This demonstrates ORacle's potential to significantly enhance the scalability +and affordability of OR domain modeling and opens a pathway for future +advancements in surgical data science. We will release our code and data upon +acceptance. + +
+
+ comment: 11 pages, 3 figures, 7 tables +
+
+
+
+
+ + ☆ Diffusion-based inpainting of incomplete Euclidean distance matrices of + trajectories generated by a fractional Brownian motion + + +
+ Fractional Brownian trajectories (fBm) feature both randomness and strong
+scale-free correlations, challenging generative models to reproduce the
+intrinsic memory characterizing the underlying process. Here we test a
+diffusion probabilistic model on a specific dataset of corrupted images
+corresponding to incomplete Euclidean distance matrices of fBm at various
+memory exponents $H$. Our dataset implies uniqueness of the data imputation in
+the regime of low missing ratio, where the remaining partial graph is rigid,
+providing the ground truth for the inpainting. We find that the conditional
+diffusion generation stably reproduces the statistics of missing
+fBm-distributed distances for different values of the exponent $H$.
+Furthermore, while diffusion models have been recently shown to remember
+samples from the training database, we show that diffusion-based inpainting
+behaves qualitatively differently from database search as the database size
+increases. Finally, we apply our fBm-trained diffusion model with $H=1/3$ for
+completion of chromosome distance matrices obtained in single-cell microscopy
+experiments, showing its superiority over standard bioinformatics algorithms.
+Our source code is available on GitHub at
+https://github.com/alobashev/diffusion_fbm.
+
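To make the corrupted inputs concrete, the snippet below (independent of the paper's model) samples a 1-D fBm path from its exact covariance, builds the corresponding Euclidean distance matrix, and masks a fraction of entries; the Hurst exponent and missing ratio are arbitrary choices.

```python
import numpy as np

def fbm_path(n=128, H=0.3, seed=0):
    """Sample fractional Brownian motion on t = 1..n from its covariance
    C(s, t) = 0.5 * (s**2H + t**2H - |s - t|**2H) via a Cholesky factor."""
    t = np.arange(1, n + 1)
    cov = 0.5 * (t[:, None] ** (2 * H) + t[None, :] ** (2 * H)
                 - np.abs(t[:, None] - t[None, :]) ** (2 * H))
    L = np.linalg.cholesky(cov + 1e-10 * np.eye(n))
    return L @ np.random.default_rng(seed).standard_normal(n)

x = fbm_path()                                   # 1-D path for brevity
edm = np.abs(x[:, None] - x[None, :])            # Euclidean distance matrix
miss = np.random.default_rng(1).random(edm.shape) < 0.1   # 10% missing ratio
miss = np.triu(miss, 1) | np.triu(miss, 1).T               # keep it symmetric
corrupted = np.where(miss, np.nan, edm)          # "image" to be inpainted
```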
+
+
+
+
+ + ☆ Ray-driven Spectral CT Reconstruction Based on Neural Base-Material + Fields + + +
+ In spectral CT reconstruction, the basis materials decomposition involves +solving a large-scale nonlinear system of integral equations, which is highly +ill-posed mathematically. This paper proposes a model that parameterizes the +attenuation coefficients of the object using a neural field representation, +thereby avoiding the complex calculations of pixel-driven projection +coefficient matrices during the discretization process of line integrals. It +introduces a lightweight discretization method for line integrals based on a +ray-driven neural field, enhancing the accuracy of the integral approximation +during the discretization process. The basis materials are represented as +continuous vector-valued implicit functions to establish a neural field +parameterization model for the basis materials. The auto-differentiation +framework of deep learning is then used to solve the implicit continuous +function of the neural base-material fields. This method is not limited by the +spatial resolution of reconstructed images, and the network has compact and +regular properties. Experimental validation shows that our method performs +exceptionally well in addressing the spectral CT reconstruction. Additionally, +it fulfils the requirements for the generation of high-resolution +reconstruction images. + +
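A generic sketch of the ray-driven discretization described above (not the authors' implementation): the attenuation field is an MLP queried at samples along each ray, and the line integral is a Riemann sum, which keeps the projection differentiable with respect to the field parameters. Network sizes and sampling settings are placeholders.

```python
import torch
import torch.nn as nn

field = nn.Sequential(            # neural field: (x, y) -> basis-material attenuation
    nn.Linear(2, 128), nn.ReLU(),
    nn.Linear(128, 128), nn.ReLU(),
    nn.Linear(128, 2),            # e.g. two basis materials
)

def ray_integral(origin, direction, length=2.0, n_samples=256):
    """Approximate the line integral of the field along origin + t * direction."""
    t = torch.linspace(0.0, length, n_samples)
    pts = origin + t[:, None] * direction        # (n_samples, 2) sample points
    dt = length / (n_samples - 1)
    return field(pts).sum(dim=0) * dt            # one integral per basis material

proj = ray_integral(torch.tensor([-1.0, 0.0]), torch.tensor([1.0, 0.0]))
print(proj)  # differentiable w.r.t. the field parameters
```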
+
+ comment: 14 pages, 16 figures
+
+
+
+
+
+ + ☆ Accurate Tennis Court Line Detection on Amateur Recorded Matches + + +
+ Typically, tennis court line detection is done by running +Hough-Line-Detection to find straight lines in the image, and then computing a +transformation matrix from the detected lines to create the final court +structure. We propose numerous improvements and enhancements to this algorithm, +including using pretrained State-of-the-Art shadow-removal and object-detection +ML models to make our line-detection more robust. Compared to the original +algorithm, our method can accurately detect lines on amateur, dirty courts. +When combined with a robust ball-tracking system, our method will enable +accurate, automatic refereeing for amateur and professional tennis matches +alike. + +
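For reference, the classical baseline referred to above can be sketched with OpenCV as follows; the input frame, thresholds, and point correspondences are illustrative, and the shadow-removal and object-detection models the paper adds are omitted.

```python
import cv2
import numpy as np

img = cv2.imread("court_frame.jpg")                       # hypothetical input frame
edges = cv2.Canny(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY), 50, 150)

# Candidate court line segments, even on a dirty amateur court.
lines = cv2.HoughLinesP(edges, rho=1, theta=np.pi / 180, threshold=80,
                        minLineLength=100, maxLineGap=20)

# Four line intersections matched to the reference court model give the
# image-to-court homography used to reproject the full court structure.
img_pts = np.float32([[210, 650], [1070, 650], [820, 300], [460, 300]])   # illustrative
court_pts = np.float32([[0, 0], [10.97, 0], [10.97, 23.77], [0, 23.77]])  # metres
H, _ = cv2.findHomography(img_pts, court_pts)
```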
+
+ comment: Accepted to 5th International conference on Image, Video Processing + and Artificial Intelligence +
+
+
+
+
+ + ☆ TrajPRed: Trajectory Prediction with Region-based Relation Learning + + +
+ Forecasting human trajectories in traffic scenes is critical for safety +within mixed or fully autonomous systems. Human future trajectories are driven +by two major stimuli, social interactions, and stochastic goals. Thus, reliable +forecasting needs to capture these two stimuli. Edge-based relation modeling +represents social interactions using pairwise correlations from precise +individual states. Nevertheless, edge-based relations can be vulnerable under +perturbations. To alleviate these issues, we propose a region-based relation +learning paradigm that models social interactions via region-wise dynamics of +joint states, i.e., the changes in the density of crowds. In particular, +region-wise agent joint information is encoded within convolutional feature +grids. Social relations are modeled by relating the temporal changes of local +joint information from a global perspective. We show that region-based +relations are less susceptible to perturbations. In order to account for the +stochastic individual goals, we exploit a conditional variational autoencoder +to realize multi-goal estimation and diverse future prediction. Specifically, +we perform variational inference via the latent distribution, which is +conditioned on the correlation between input states and associated target +goals. Sampling from the latent distribution enables the framework to reliably +capture the stochastic behavior in test data. We integrate multi-goal +estimation and region-based relation learning to model the two stimuli, social +interactions, and stochastic goals, in a prediction framework. We evaluate our +framework on the ETH-UCY dataset and Stanford Drone Dataset (SDD). We show that +the diverse prediction better fits the ground truth when incorporating the +relation module. Our framework outperforms the state-of-the-art models on SDD +by $27.61\%$/$18.20\%$ of ADE/FDE metrics. + +
+
+
+
+
+ + ☆ V-MAD: Video-based Morphing Attack Detection in Operational Scenarios + + +
+ In response to the rising threat of the face morphing attack, this paper +introduces and explores the potential of Video-based Morphing Attack Detection +(V-MAD) systems in real-world operational scenarios. While current morphing +attack detection methods primarily focus on a single or a pair of images, V-MAD +is based on video sequences, exploiting the video streams often acquired by +face verification tools available, for instance, at airport gates. Through this +study, we show for the first time the advantages that the availability of +multiple probe frames can bring to the morphing attack detection task, +especially in scenarios where the quality of probe images is varied and might +be affected, for instance, by pose or illumination variations. Experimental +results on a real operational database demonstrate that video sequences +represent valuable information for increasing the robustness and performance of +morphing attack detection systems. + +
+
+
+
+
+ + ☆ Adversarial purification for no-reference image-quality metrics: + applicability study and new methods + + +
+ Recently, the area of adversarial attacks on image quality metrics has begun
+to be explored, whereas the area of defences remains under-researched. In this
+study, we aim to address this gap and examine the transferability of
+adversarial purification defences from image classifiers to IQA methods. We
+apply several widespread attacks on IQA models and examine the success of the
+defences against them. The purification methodologies covered different
+preprocessing techniques, including geometrical transformations, compression,
+denoising, and modern neural network-based methods. Also, we address the
+challenge of assessing the efficacy of a defensive methodology by proposing
+ways to estimate output visual quality and the success of neutralizing attacks.
+Defences were tested against attacks on three IQA metrics -- Linearity, MetaIQA
+and SPAQ. The code for attacks and defences is available at: (link is hidden
+for a blind review).
+
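As a minimal example of the kind of purification preprocessing surveyed here (assuming some no-reference IQA model `iqa_model` is available; the quality and blur settings are arbitrary), a JPEG round-trip plus a mild blur is among the simplest defences:

```python
import io
from PIL import Image, ImageFilter

def purify(img: Image.Image, jpeg_quality=75, blur_radius=1.0) -> Image.Image:
    """Suppress high-frequency adversarial perturbations with a JPEG
    round-trip followed by a mild Gaussian blur."""
    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=jpeg_quality)
    buf.seek(0)
    return Image.open(buf).filter(ImageFilter.GaussianBlur(blur_radius))

# score_attacked = iqa_model(attacked_img)           # hypothetical IQA call
# score_purified = iqa_model(purify(attacked_img))   # ideally closer to the clean score
```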
+
+
+
+
+ + ☆ Accelerating Cardiac MRI Reconstruction with CMRatt: An Attention-Driven + Approach + + +
+ Cine cardiac magnetic resonance (CMR) imaging is recognised as the benchmark +modality for the comprehensive assessment of cardiac function. Nevertheless, +the acquisition process of cine CMR is considered as an impediment due to its +prolonged scanning time. One commonly used strategy to expedite the acquisition +process is through k-space undersampling, though it comes with a drawback of +introducing aliasing effects in the reconstructed image. Lately, deep +learning-based methods have shown remarkable results over traditional +approaches in rapidly achieving precise CMR reconstructed images. This study +aims to explore the untapped potential of attention mechanisms incorporated +with a deep learning model within the context of the CMR reconstruction +problem. We are motivated by the fact that attention has proven beneficial in +downstream tasks such as image classification and segmentation, but has not +been systematically analysed in the context of CMR reconstruction. Our primary +goal is to identify the strengths and potential limitations of attention +algorithms when integrated with a convolutional backbone model such as a U-Net. +To achieve this, we benchmark different state-of-the-art spatial and channel +attention mechanisms on the CMRxRecon dataset and quantitatively evaluate the +quality of reconstruction using objective metrics. Furthermore, inspired by the +best performing attention mechanism, we propose a new, simple yet effective, +attention pipeline specifically optimised for the task of cardiac image +reconstruction that outperforms other state-of-the-art attention methods. The +layer and model code will be made publicly available. + +
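For readers unfamiliar with the attention variants being benchmarked, a squeeze-and-excitation style channel attention block, one standard option rather than the paper's proposed pipeline, can be dropped into a U-Net stage like this:

```python
import torch
import torch.nn as nn

class ChannelAttention(nn.Module):
    """Squeeze-and-excitation style channel attention for a conv feature map."""
    def __init__(self, channels: int, reduction: int = 8):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.mlp = nn.Sequential(
            nn.Linear(channels, channels // reduction), nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels), nn.Sigmoid(),
        )

    def forward(self, x):
        b, c, _, _ = x.shape
        w = self.mlp(self.pool(x).view(b, c)).view(b, c, 1, 1)
        return x * w   # re-weight channels, e.g. after each U-Net conv block

feat = torch.randn(2, 64, 96, 96)
print(ChannelAttention(64)(feat).shape)  # torch.Size([2, 64, 96, 96])
```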
+
+ comment: This paper has been submitted for the 32nd European Signal Processing + Conference EUSIPCO 2024 in Lyon +
+
+
+
+
+ + ☆ Efficient and Generic Point Model for Lossless Point Cloud Attribute + Compression + + +
+ The past several years have witnessed the emergence of learned point cloud
+compression (PCC) techniques. However, current learning-based lossless point
+cloud attribute compression (PCAC) methods suffer from either high
+computational complexity or deteriorated compression performance. Moreover, the
+significant variations in point cloud scale and sparsity encountered in
+real-world applications make developing an all-in-one neural model a
+challenging task. In this paper, we propose PoLoPCAC, an efficient and generic
+lossless PCAC method that achieves high compression efficiency and strong
+generalizability simultaneously. We formulate lossless PCAC as the task of
+inferring explicit distributions of attributes from group-wise autoregressive
+priors. A progressive random grouping strategy is first devised to efficiently
+resolve the point cloud into groups, and then the attributes of each group are
+modeled sequentially from accumulated antecedents. A locality-aware attention
+mechanism is utilized to exploit prior knowledge from context windows in
+parallel. Since our method directly operates on points, it naturally avoids
+distortion caused by voxelization, and can be executed on point clouds with
+arbitrary scale and density. Experiments show that our method can be instantly
+deployed once trained on a Synthetic 2k-ShapeNet dataset while enjoying
+continuous bit-rate reduction over the latest G-PCCv23 on various datasets
+(ShapeNet, ScanNet, MVUB, 8iVFB). Meanwhile, our method reports shorter coding
+time than G-PCCv23 on the majority of sequences with a lightweight model size
+(2.6MB), which is highly attractive for practical applications. Dataset, code
+and trained model are available at
+https://github.com/I2-Multimedia-Lab/PoLoPCAC.
+
+
+
+
+
+ + ☆ HRVDA: High-Resolution Visual Document Assistant CVPR 2024 + + +
+ Leveraging vast training data, multimodal large language models (MLLMs) have +demonstrated formidable general visual comprehension capabilities and achieved +remarkable performance across various tasks. However, their performance in +visual document understanding still leaves much room for improvement. This +discrepancy is primarily attributed to the fact that visual document +understanding is a fine-grained prediction task. In natural scenes, MLLMs +typically use low-resolution images, leading to a substantial loss of visual +information. Furthermore, general-purpose MLLMs do not excel in handling +document-oriented instructions. In this paper, we propose a High-Resolution +Visual Document Assistant (HRVDA), which bridges the gap between MLLMs and +visual document understanding. This model employs a content filtering mechanism +and an instruction filtering module to separately filter out the +content-agnostic visual tokens and instruction-agnostic visual tokens, thereby +achieving efficient model training and inference for high-resolution images. In +addition, we construct a document-oriented visual instruction tuning dataset +and apply a multi-stage training strategy to enhance the model's document +modeling capabilities. Extensive experiments demonstrate that our model +achieves state-of-the-art performance across multiple document understanding +datasets, while maintaining training efficiency and inference speed comparable +to low-resolution models. + +
+
+ comment: Accepted to CVPR 2024 main conference +
+
+
+
+
+ + ☆ Sparse Global Matching for Video Frame Interpolation with Large Motion CVPR 2024 + + +
+ Large motion poses a critical challenge in Video Frame Interpolation (VFI) +task. Existing methods are often constrained by limited receptive fields, +resulting in sub-optimal performance when handling scenarios with large motion. +In this paper, we introduce a new pipeline for VFI, which can effectively +integrate global-level information to alleviate issues associated with large +motion. Specifically, we first estimate a pair of initial intermediate flows +using a high-resolution feature map for extracting local details. Then, we +incorporate a sparse global matching branch to compensate for flow estimation, +which consists of identifying flaws in initial flows and generating sparse flow +compensation with a global receptive field. Finally, we adaptively merge the +initial flow estimation with global flow compensation, yielding a more accurate +intermediate flow. To evaluate the effectiveness of our method in handling +large motion, we carefully curate a more challenging subset from commonly used +benchmarks. Our method demonstrates the state-of-the-art performance on these +VFI subsets with large motion. + +
+
+ comment: Accepted by CVPR 2024. Project page: https://sgm-vfi.github.io/ +
+
+
+
+
+ + ☆ DreamScene360: Unconstrained Text-to-3D Scene Generation with Panoramic + Gaussian Splatting + + +
+ The increasing demand for virtual reality applications has highlighted the +significance of crafting immersive 3D assets. We present a text-to-3D +360$^{\circ}$ scene generation pipeline that facilitates the creation of +comprehensive 360$^{\circ}$ scenes for in-the-wild environments in a matter of +minutes. Our approach utilizes the generative power of a 2D diffusion model and +prompt self-refinement to create a high-quality and globally coherent panoramic +image. This image acts as a preliminary "flat" (2D) scene representation. +Subsequently, it is lifted into 3D Gaussians, employing splatting techniques to +enable real-time exploration. To produce consistent 3D geometry, our pipeline +constructs a spatially coherent structure by aligning the 2D monocular depth +into a globally optimized point cloud. This point cloud serves as the initial +state for the centroids of 3D Gaussians. In order to address invisible issues +inherent in single-view inputs, we impose semantic and geometric constraints on +both synthesized and input camera views as regularizations. These guide the +optimization of Gaussians, aiding in the reconstruction of unseen regions. In +summary, our method offers a globally consistent 3D scene within a +360$^{\circ}$ perspective, providing an enhanced immersive experience over +existing techniques. Project website at: http://dreamscene360.github.io/ + +
+
+
+
+
+ + ☆ O-TALC: Steps Towards Combating Oversegmentation within Online Action + Segmentation + + +
+ Online temporal action segmentation shows a strong potential to facilitate
+many HRI tasks where extended human action sequences must be tracked and
+understood in real time. Traditional action segmentation approaches, however,
+operate in an offline, two-stage manner, relying on computationally expensive
+video-wide features for segmentation, rendering them unsuitable for online HRI
+applications. In order to facilitate online action segmentation on a stream of
+incoming video data, we introduce two methods for improved training and
+inference of backbone action recognition models, allowing them to be deployed
+directly for online frame-level classification. Firstly, we introduce surround
+dense sampling whilst training to facilitate training vs. inference clip
+matching and improve segment boundary predictions. Secondly, we introduce an
+Online Temporally Aware Label Cleaning (O-TALC) strategy to explicitly reduce
+oversegmentation during online inference. As our methods are
+backbone-invariant, they can be deployed with computationally efficient
+spatio-temporal action recognition models capable of operating in real time
+with a small segmentation latency. We show that our method outperforms similar
+online action segmentation work and matches the performance of many offline
+models with access to full temporal resolution when operating on challenging
+fine-grained datasets.
+
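The oversegmentation problem can be illustrated with a tiny, generic post-processing routine (not the O-TALC algorithm itself) that merges predicted segments shorter than a minimum duration into the preceding segment of a streamed label sequence:

```python
def suppress_short_segments(frame_labels, min_len=8):
    """Merge predicted segments shorter than `min_len` frames into the
    preceding segment, a crude remedy for oversegmentation."""
    cleaned, run_start = list(frame_labels), 0
    for i in range(1, len(cleaned) + 1):
        if i == len(cleaned) or cleaned[i] != cleaned[run_start]:
            if i - run_start < min_len and run_start > 0:
                cleaned[run_start:i] = [cleaned[run_start - 1]] * (i - run_start)
            run_start = i
    return cleaned

# A 3-frame "stir" blip inside a long "pour" segment gets absorbed.
print(suppress_short_segments(["pour"] * 20 + ["stir"] * 3 + ["pour"] * 20))
```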
+
+ comment: 5 pages, 3 figures. Accepted as a short (unindexed) paper at the + TAHRI conference +
+
+
+
+
+ + ☆ SparseAD: Sparse Query-Centric Paradigm for Efficient End-to-End + Autonomous Driving + + +
+ End-to-End paradigms use a unified framework to implement multi-tasks in an +autonomous driving system. Despite simplicity and clarity, the performance of +end-to-end autonomous driving methods on sub-tasks is still far behind the +single-task methods. Meanwhile, the widely used dense BEV features in previous +end-to-end methods make it costly to extend to more modalities or tasks. In +this paper, we propose a Sparse query-centric paradigm for end-to-end +Autonomous Driving (SparseAD), where the sparse queries completely represent +the whole driving scenario across space, time and tasks without any dense BEV +representation. Concretely, we design a unified sparse architecture for +perception tasks including detection, tracking, and online mapping. Moreover, +we revisit motion prediction and planning, and devise a more justifiable motion +planner framework. On the challenging nuScenes dataset, SparseAD achieves SOTA +full-task performance among end-to-end methods and significantly narrows the +performance gap between end-to-end paradigms and single-task methods. Codes +will be released soon. + +
+
+
+
+
+ + ☆ Research on Detection of Floating Objects in River and Lake Based on AI + Intelligent Image Recognition + + +
+ With the rapid advancement of artificial intelligence technology, AI-enabled
+image recognition has emerged as a potent tool for addressing challenges in
+traditional environmental monitoring. This study focuses on the detection of
+floating objects in river and lake environments, exploring an innovative
+approach based on deep learning. By intricately analyzing the technical
+pathways for detecting static and dynamic features and considering the
+characteristics of river and lake debris, a comprehensive image acquisition and
+processing workflow has been developed. The study highlights the application
+and performance comparison of three mainstream deep learning models (SSD,
+Faster-RCNN, and YOLOv5) in debris identification. Additionally, a detection
+system for floating objects has been designed and implemented, encompassing
+both hardware platform construction and software framework development. Through
+rigorous experimental validation, the proposed system has demonstrated its
+ability to significantly enhance the accuracy and efficiency of debris
+detection, thus offering a new technological avenue for water quality
+monitoring in rivers and lakes.
+
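For context on the YOLOv5 branch of the comparison, off-the-shelf inference looks roughly as follows; the `floating_debris.pt` checkpoint is hypothetical and would come from fine-tuning on the collected river and lake imagery.

```python
import torch

# Official Ultralytics hub entry point; "floating_debris.pt" is a hypothetical
# checkpoint fine-tuned on river/lake debris images.
model = torch.hub.load("ultralytics/yolov5", "custom", path="floating_debris.pt")

results = model("river_frame.jpg")       # accepts paths, arrays or PIL images
detections = results.pandas().xyxy[0]    # xmin, ymin, xmax, ymax, confidence, class, name
print(detections[detections.confidence > 0.5])
```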
+
+
+
+
+ + ☆ Fine color guidance in diffusion models and its application to image + compression at extremely low bitrates + + +
+ This study addresses the challenge of controlling the global color aspect of
+images generated with a diffusion model, without any training or fine-tuning.
+We rewrite the guidance equations to ensure that the outputs are closer to a
+known color map without hindering the quality of the generation, which leads to
+new guidance equations. In the color guidance context, we show that the
+guidance scale should not decrease but should remain high throughout the
+diffusion process. As a second contribution, we apply our guidance in a
+compression framework, combining both semantic and general color information
+about the image to decode images at low cost. We show that our method is
+effective at improving the fidelity and realism of compressed images at
+extremely low bit rates compared to other classical or more semantics-oriented
+approaches.
+
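A heavily simplified, schematic version of constant-scale color guidance follows; the `denoiser` and `scheduler_step` callables are stand-ins rather than a real diffusion API, and the coarse color loss is only one possible choice.

```python
import torch
import torch.nn.functional as F

def color_guided_step(x_t, t, denoiser, scheduler_step, target_colors, scale=2.0):
    """One reverse-diffusion step nudged toward a coarse target color map."""
    x_t = x_t.detach().requires_grad_(True)
    coarse = F.adaptive_avg_pool2d(x_t, target_colors.shape[-2:])
    loss = F.mse_loss(coarse, target_colors)           # distance to the known color map
    grad = torch.autograd.grad(loss, x_t)[0]
    x_prev = scheduler_step(x_t, denoiser(x_t, t), t)  # ordinary reverse step
    # The guidance scale stays constant across timesteps rather than being annealed.
    return x_prev - scale * grad
```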
+
+ comment: Submitted to IEEE Transactions on Image Processing (TIP) +
+
+
+
+
+ + ☆ RESSCAL3D: Resolution Scalable 3D Semantic Segmentation of Point Clouds ICIP + + +
+ While deep learning-based methods have demonstrated outstanding results in +numerous domains, some important functionalities are missing. Resolution +scalability is one of them. In this work, we introduce a novel architecture, +dubbed RESSCAL3D, providing resolution-scalable 3D semantic segmentation of +point clouds. In contrast to existing works, the proposed method does not +require the whole point cloud to be available to start inference. Once a +low-resolution version of the input point cloud is available, first semantic +predictions can be generated in an extremely fast manner. This enables early +decision-making in subsequent processing steps. As additional points become +available, these are processed in parallel. To improve performance, features +from previously computed scales are employed as prior knowledge at the current +scale. Our experiments show that RESSCAL3D is 31-62% faster than the +non-scalable baseline while keeping a limited impact on performance. To the +best of our knowledge, the proposed method is the first to propose a +resolution-scalable approach for 3D semantic segmentation of point clouds based +on deep learning. + +
+
+ comment: Published at 2023 IEEE International Conference on Image Processing + (ICIP) +
+
+
+
+
+ + ☆ Monocular 3D lane detection for Autonomous Driving: Recent Achievements, + Challenges, and Outlooks + + +
+ 3D lane detection plays a crucial role in autonomous driving by extracting +structural and traffic information from the road in 3D space to assist the +self-driving car in rational, safe, and comfortable path planning and motion +control. Due to the consideration of sensor costs and the advantages of visual +data in color information, in practical applications, 3D lane detection based +on monocular vision is one of the important research directions in the field of +autonomous driving, which has attracted more and more attention in both +industry and academia. Unfortunately, recent progress in visual perception +seems insufficient to develop completely reliable 3D lane detection algorithms, +which also hinders the development of vision-based fully autonomous +self-driving cars, i.e., achieving level 5 autonomous driving, driving like +human-controlled cars. This is one of the conclusions drawn from this review +paper: there is still a lot of room for improvement and significant +improvements are still needed in the 3D lane detection algorithm for autonomous +driving cars using visual sensors. Motivated by this, this review defines, +analyzes, and reviews the current achievements in the field of 3D lane +detection research, and the vast majority of the current progress relies +heavily on computationally complex deep learning models. In addition, this +review covers the 3D lane detection pipeline, investigates the performance of +state-of-the-art algorithms, analyzes the time complexity of cutting-edge +modeling choices, and highlights the main achievements and limitations of +current research efforts. The survey also includes a comprehensive discussion +of available 3D lane detection datasets and the challenges that researchers +have faced but have not yet resolved. Finally, our work outlines future +research directions and welcomes researchers and practitioners to enter this +exciting field. + +
+
+
+
+
+ + ☆ Multi-Label Continual Learning for the Medical Domain: A Novel Benchmark + + +
+ Multi-label image classification in dynamic environments is a problem that
+poses significant challenges. Previous studies have primarily focused on
+scenarios such as Domain Incremental Learning and Class Incremental Learning,
+which do not fully capture the complexity of real-world applications. In this
+paper, we study the classification of medical images in the scenario termed New
+Instances & New Classes, which combines the challenges of both new class
+arrivals and domain shifts in a single framework. Unlike traditional scenarios,
+it reflects the realistic nature of CL in domains such as medical imaging,
+where updates may introduce both new classes and changes in domain
+characteristics. To address the unique challenges posed by this complex
+scenario, we introduce a novel approach called Pseudo-Label Replay. This method
+aims to mitigate forgetting while adapting to new classes and domain shifts by
+combining the advantages of the Replay and Pseudo-Label methods and solving
+their limitations in the proposed scenario. We evaluate our proposed approach
+on a challenging benchmark consisting of two datasets, seven tasks, and
+nineteen classes, modeling a realistic Continual Learning scenario. Our
+experimental findings demonstrate the effectiveness of Pseudo-Label Replay in
+addressing the challenges posed by the proposed complex scenario. Our method
+surpasses existing approaches, exhibiting superior performance while showing
+minimal forgetting.
+
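The central mechanism can be paraphrased in a few lines (a conceptual sketch, not the authors' implementation): a frozen copy of the previous-task model pseudo-labels the classes it knows, so every training example, replayed or new, carries a complete multi-label target as classes accumulate.

```python
import torch

def complete_targets(batch_x, batch_y_new, prev_model, n_old, threshold=0.5):
    """Concatenate pseudo-labels for the old classes (from the frozen
    previous-task model) with ground-truth labels for the new classes."""
    with torch.no_grad():
        pseudo_old = (torch.sigmoid(prev_model(batch_x)) > threshold).float()
    return torch.cat([pseudo_old[:, :n_old], batch_y_new], dim=1)

# During task t, both replay-buffer samples and current samples pass through
# complete_targets, and the model is trained with a standard multi-label loss.
```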
+
+
+
+
+ + ☆ UDiFF: Generating Conditional Unsigned Distance Fields with Optimal + Wavelet Diffusion CVPR2024 + + +
+ Diffusion models have shown remarkable results for image generation, editing
+and inpainting. Recent works explore diffusion models for 3D shape generation
+with neural implicit functions, i.e., signed distance functions and occupancy
+functions. However, they are limited to shapes with closed surfaces, which
+prevents them from generating diverse real-world 3D content containing open
+surfaces. In this work, we present UDiFF, a 3D diffusion model for unsigned
+distance fields (UDFs) which is capable of generating textured 3D shapes with
+open surfaces from text conditions or unconditionally. Our key idea is to
+generate UDFs in the spatial-frequency domain with an optimal wavelet
+transformation, which produces a compact representation space for UDF
+generation. Specifically, instead of selecting an appropriate wavelet
+transformation, which requires expensive manual effort and still leads to large
+information loss, we propose a data-driven approach to learn the optimal
+wavelet transformation for UDFs. We evaluate UDiFF to show our advantages by
+numerical and visual comparisons with the latest methods on widely used
+benchmarks. Page: https://weiqi-zhang.github.io/UDiFF.
+
+
+ comment: To appear at CVPR2024. Project page: + https://weiqi-zhang.github.io/UDiFF +
+
+
+
+
+ + ☆ MoCha-Stereo: Motif Channel Attention Network for Stereo Matching CVPR 2024 + + +
+ Learning-based stereo matching techniques have made significant progress.
+However, existing methods inevitably lose geometrical structure information
+during the feature channel generation process, resulting in edge detail
+mismatches. In this paper, the Motif Channel Attention Stereo Matching Network
+(MoCha-Stereo) is designed to address this problem. We provide the Motif
+Channel Correlation Volume (MCCV) to determine more accurate edge matching
+costs. MCCV is achieved by projecting motif channels, which capture common
+geometric structures in feature channels, onto feature maps and cost volumes.
+In addition, since edge variations in feature channels of the reconstruction
+error map also affect detail matching, we propose the Reconstruction Error
+Motif Penalty (REMP) module to further refine the full-resolution disparity
+estimation. REMP integrates the frequency information of typical channel
+features from the reconstruction error. MoCha-Stereo ranks 1st on the
+KITTI-2015 and KITTI-2012 Reflective leaderboards. Our structure also shows
+excellent performance in Multi-View Stereo. Code is available at
+https://github.com/ZYangChen/MoCha-Stereo.
+
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ O2V-Mapping: Online Open-Vocabulary Mapping with Neural Implicit + Representation + + +
+ Online construction of open-ended language scenes is crucial for robotic
+applications, where open-vocabulary interactive scene understanding is
+required. Recently, neural implicit representation has provided a promising
+direction for online interactive mapping. However, implementing open-vocabulary
+scene understanding capability into online neural implicit mapping still faces
+three challenges: lack of local scene updating ability, blurry spatial
+hierarchical semantic segmentation, and difficulty in maintaining multi-view
+consistency. To this end, we propose O2V-mapping, which utilizes voxel-based
+language and geometric features to create an open-vocabulary field, thus
+allowing for local updates during the online training process. Additionally, we
+leverage a foundational model for image segmentation to extract language
+features on object-level entities, achieving clear segmentation boundaries and
+hierarchical semantic features. For the purpose of preserving consistency in 3D
+object properties across different viewpoints, we propose a spatial adaptive
+voxel adjustment mechanism and a multi-view weight selection method. Extensive
+experiments on open-vocabulary object localization and semantic segmentation
+demonstrate that O2V-mapping achieves online construction of language scenes
+while enhancing accuracy, outperforming the previous SOTA method.
+
+
+
+
+
+ + ☆ Tuning-Free Adaptive Style Incorporation for Structure-Consistent + Text-Driven Style Transfer + + +
+ In this work, we target the task of text-driven style transfer in the context
+of text-to-image (T2I) diffusion models. The main challenge is consistent
+structure preservation while enabling effective style transfer effects. Past
+approaches in this field directly concatenate the content and style prompts for
+a prompt-level style injection, leading to unavoidable structure distortions.
+In this work, we propose a novel solution to the text-driven style transfer
+task, namely Adaptive Style Incorporation (ASI), to achieve fine-grained
+feature-level style incorporation. It consists of the Siamese Cross-Attention
+(SiCA), which decouples the single-track cross-attention into a dual-track
+structure to obtain separate content and style features, and the Adaptive
+Content-Style Blending (AdaBlending) module, which couples the content and
+style information in a structure-consistent manner. Experimentally, our method
+exhibits much better performance in both structure preservation and stylized
+effects.
+
+
+
+
+
+ + ☆ SplatPose & Detect: Pose-Agnostic 3D Anomaly Detection CVPR 2024 + + +
+ Detecting anomalies in images has become a well-explored problem in both +academia and industry. State-of-the-art algorithms are able to detect defects +in increasingly difficult settings and data modalities. However, most current +methods are not suited to address 3D objects captured from differing poses. +While solutions using Neural Radiance Fields (NeRFs) have been proposed, they +suffer from excessive computation requirements, which hinder real-world +usability. For this reason, we propose the novel 3D Gaussian splatting-based +framework SplatPose which, given multi-view images of a 3D object, accurately +estimates the pose of unseen views in a differentiable manner, and detects +anomalies in them. We achieve state-of-the-art results in both training and +inference speed, and detection performance, even when using less training data +than competing methods. We thoroughly evaluate our framework using the recently +proposed Pose-agnostic Anomaly Detection benchmark and its multi-pose anomaly +detection (MAD) data set. + +
+
+ comment: Visual Anomaly and Novelty Detection 2.0 Workshop at CVPR 2024 +
+
+
+
+
+ + ☆ Zero-shot Point Cloud Completion Via 2D Priors + + +
+ 3D point cloud completion is designed to recover complete shapes from
+partially observed point clouds. Conventional completion methods typically
+depend on extensive point cloud data for training, with their effectiveness
+often constrained to object categories similar to those seen during training.
+In contrast, we propose a zero-shot framework aimed at completing partially
+observed point clouds across any unseen categories. Leveraging point rendering
+via Gaussian Splatting, we develop techniques of Point Cloud Colorization and
+Zero-shot Fractal Completion that utilize 2D priors from pre-trained diffusion
+models to infer missing regions. Experimental results on both synthetic and
+real-world scanned point clouds demonstrate that our approach outperforms
+existing methods in completing a variety of objects without any requirement for
+specific training data.
+
+
+
+
+
+ + ☆ MedRG: Medical Report Grounding with Multi-modal Large Language Model + + +
+ Medical Report Grounding is pivotal in identifying the most relevant regions
+in medical images based on a given phrase query, a critical aspect in medical
+image analysis and radiological diagnosis. However, prevailing visual grounding
+approaches necessitate the manual extraction of key phrases from medical
+reports, imposing substantial burdens on both system efficiency and physicians.
+In this paper, we introduce a novel framework, Medical Report Grounding
+(MedRG), an end-to-end solution that utilizes a multi-modal Large Language
+Model to predict the key phrase by incorporating a unique token, BOX, into the
+vocabulary to serve as an embedding for unlocking detection capabilities.
+Subsequently, the vision encoder-decoder jointly decodes the hidden embedding
+and the input medical image, generating the corresponding grounding box. The
+experimental results validate the effectiveness of MedRG, surpassing existing
+state-of-the-art medical phrase grounding methods. This study represents a
+pioneering exploration of the medical report grounding task, marking the
+first-ever endeavor in this domain.
+
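Mechanically, registering an extra token such as BOX in an LLM vocabulary is a one-liner with Hugging Face tooling; the snippet below shows only that step, with an arbitrary text-only backbone standing in for the multi-modal model.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"   # placeholder backbone, not the MedRG model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Register the grounding token and grow the embedding matrix so its hidden
# state can later be decoded into a bounding box.
tokenizer.add_special_tokens({"additional_special_tokens": ["<BOX>"]})
model.resize_token_embeddings(len(tokenizer))

ids = tokenizer("Locate the opacity in the left lower lobe <BOX>", return_tensors="pt")
hidden = model(**ids, output_hidden_states=True).hidden_states[-1][:, -1]  # <BOX> embedding
```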
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ Urban Architect: Steerable 3D Urban Scene Generation with Layout Prior + + +
+ Text-to-3D generation has achieved remarkable success via large-scale +text-to-image diffusion models. Nevertheless, there is no paradigm for scaling +up the methodology to urban scale. Urban scenes, characterized by numerous +elements, intricate arrangement relationships, and vast scale, present a +formidable barrier to the interpretability of ambiguous textual descriptions +for effective model optimization. In this work, we surmount the limitations by +introducing a compositional 3D layout representation into text-to-3D paradigm, +serving as an additional prior. It comprises a set of semantic primitives with +simple geometric structures and explicit arrangement relationships, +complementing textual descriptions and enabling steerable generation. Upon +this, we propose two modifications -- (1) We introduce Layout-Guided +Variational Score Distillation to address model optimization inadequacies. It +conditions the score distillation sampling process with geometric and semantic +constraints of 3D layouts. (2) To handle the unbounded nature of urban scenes, +we represent 3D scene with a Scalable Hash Grid structure, incrementally +adapting to the growing scale of urban scenes. Extensive experiments +substantiate the capability of our framework to scale text-to-3D generation to +large-scale urban scenes that cover over 1000m driving distance for the first +time. We also present various scene editing demonstrations, showing the powers +of steerable urban scene generation. Website: https://urbanarchitect.github.io. + +
+
+ comment: Project page: https://urbanarchitect.github.io/ +
+
+
+
+
+ + ☆ Efficient and Scalable Chinese Vector Font Generation via Component + Composition + + +
+ Chinese vector font generation is challenging due to the complex structure
+and huge number of Chinese characters. Recent advances remain limited to
+generating a small set of characters with simple structure. In this work, we
+first observe that most Chinese characters can be disassembled into
+frequently-reused components. Therefore, we introduce the first efficient and
+scalable Chinese vector font generation approach via component composition,
+allowing numerous vector characters to be generated from a small set of
+components. To achieve this, we collect a large-scale dataset that contains
+over 90K Chinese characters with their components and layout information.
+Building on this dataset, we propose a simple yet effective framework based on
+spatial transformer networks (STN) and multiple losses tailored to font
+characteristics to learn the affine transformation of the components, which can
+be directly applied to the Bézier curves, resulting in Chinese characters in
+vector format. Our qualitative and quantitative experiments have demonstrated
+that our method significantly surpasses the state-of-the-art vector font
+generation methods in generating large-scale complex Chinese characters in both
+font generation and zero-shot font extension.
+
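The property this framework relies on is that an affine transform predicted for a component can be applied directly to its Bézier control points, since Bézier curves are invariant under affine maps of their control polygon; a toy NumPy illustration (independent of the paper's STN) follows.

```python
import numpy as np

def transform_component(control_points, theta):
    """Apply a 2x3 affine matrix (as an STN would regress) to the Bezier
    control points of one font component."""
    pts = np.hstack([control_points, np.ones((len(control_points), 1))])  # homogeneous coords
    return pts @ theta.T

component = np.array([[0.1, 0.2], [0.3, 0.9], [0.6, 0.9], [0.8, 0.2]])  # one cubic segment
theta = np.array([[0.5, 0.0, 0.4],    # halve the width, shift right
                  [0.0, 0.5, 0.1]])   # halve the height, shift up
print(transform_component(component, theta))
```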
+
+ comment: 15 pages, 23 figures +
+
+
+
+
+ + ☆ Object-Conditioned Energy-Based Attention Map Alignment in Text-to-Image + Diffusion Models + + +
+ Text-to-image diffusion models have shown great success in generating
+high-quality text-guided images. Yet, these models may still fail to
+semantically align generated images with the provided text prompts, leading to
+problems like incorrect attribute binding and/or catastrophic object neglect.
+Given the pervasive object-oriented structure underlying text prompts, we
+introduce a novel object-conditioned Energy-Based Attention Map Alignment
+(EBAMA) method to address the aforementioned problems. We show that an
+object-centric attribute binding loss naturally emerges by approximately
+maximizing the log-likelihood of a $z$-parameterized energy-based model with
+the help of the negative sampling technique. We further propose an
+object-centric intensity regularizer to prevent excessive shifts of objects'
+attention towards their attributes. Extensive qualitative and quantitative
+experiments, including human evaluation, on several challenging benchmarks
+demonstrate the superior performance of our method over previous strong
+counterparts. With better-aligned attention maps, our approach shows great
+promise in further enhancing the text-controlled image editing ability of
+diffusion models.
+
+
+
+
+
+ + ☆ Deep Generative Sampling in the Dual Divergence Space: A Data-efficient + & Interpretative Approach for Generative AI + + +
+ Building on the remarkable achievements in generative sampling of natural +images, we propose an innovative challenge, potentially overly ambitious, which +involves generating samples of entire multivariate time series that resemble +images. However, the statistical challenge lies in the small sample size, +sometimes consisting of a few hundred subjects. This issue is especially +problematic for deep generative models that follow the conventional approach of +generating samples from a canonical distribution and then decoding or denoising +them to match the true data distribution. In contrast, our method is grounded +in information theory and aims to implicitly characterize the distribution of +images, particularly the (global and local) dependency structure between +pixels. We achieve this by empirically estimating its KL-divergence in the dual +form with respect to the respective marginal distribution. This enables us to +perform generative sampling directly in the optimized 1-D dual divergence +space. Specifically, in the dual space, training samples representing the data +distribution are embedded in the form of various clusters between two end +points. In theory, any sample embedded between those two end points is +in-distribution w.r.t. the data distribution. Our key idea for generating novel +samples of images is to interpolate between the clusters via a walk as per +gradients of the dual function w.r.t. the data dimensions. In addition to the +data efficiency gained from direct sampling, we propose an algorithm that +offers a significant reduction in sample complexity for estimating the +divergence of the data distribution with respect to the marginal distribution. +We provide strong theoretical guarantees along with an extensive empirical +evaluation using many real-world datasets from diverse domains, establishing +the superiority of our approach w.r.t. state-of-the-art deep learning methods. + +
+
+
+
+
+ + ☆ Improving Multi-Center Generalizability of GAN-Based Fat Suppression + using Federated Learning + + +
+ Generative Adversarial Network (GAN)-based synthesis of fat suppressed (FS) +MRIs from non-FS proton density sequences has the potential to accelerate +acquisition of knee MRIs. However, GANs trained on single-site data have poor +generalizability to external data. We show that federated learning can improve +multi-center generalizability of GANs for synthesizing FS MRIs, while +facilitating privacy-preserving multi-institutional collaborations. + +
+
+ comment: 5 pages, 2 figures +
+
+
+
+
+ + ☆ GANsemble for Small and Imbalanced Data Sets: A Baseline for Synthetic + Microplastics Data + + +
+ Microplastic particle ingestion or inhalation by humans is a problem of +growing concern. Unfortunately, current research methods that use machine +learning to understand their potential harms are obstructed by a lack of +available data. Deep learning techniques in particular are challenged by such +domains where only small or imbalanced data sets are available. Overcoming this +challenge often involves oversampling underrepresented classes or augmenting +the existing data to improve model performance. This paper proposes GANsemble: +a two-module framework connecting data augmentation with conditional generative +adversarial networks (cGANs) to generate class-conditioned synthetic data. +First, the data chooser module automates augmentation strategy selection by +searching for the best data augmentation strategy. Next, the cGAN module uses +this strategy to train a cGAN for generating enhanced synthetic data. We +experiment with the GANsemble framework on a small and imbalanced microplastics +data set. A Microplastic-cGAN (MPcGAN) algorithm is introduced, and baselines +for synthetic microplastics (SYMP) data are established in terms of Frechet +Inception Distance (FID) and Inception Scores (IS). We also provide a synthetic +microplastics filter (SYMP-Filter) algorithm to increase the quality of +generated SYMP. Additionally, we show the best amount of oversampling with +augmentation to fix class imbalance in small microplastics data sets. To our +knowledge, this study is the first application of generative AI to +synthetically create microplastics data. + +
+
+ comment: Accepted to the 37th Canadian Artificial Intelligence Conference + (2024), 12 pages, 4 figures +
+
+
+
+
+ + ☆ A Transformer-Based Model for the Prediction of Human Gaze Behavior on + Videos + + +
+ Eye-tracking applications that utilize the human gaze in video understanding +tasks have become increasingly important. To effectively automate the process +of video analysis based on eye-tracking data, it is important to accurately +replicate human gaze behavior. However, this task presents significant +challenges due to the inherent complexity and ambiguity of human gaze patterns. +In this work, we introduce a novel method for simulating human gaze behavior. +Our approach uses a transformer-based reinforcement learning algorithm to train +an agent that acts as a human observer, with the primary role of watching +videos and simulating human gaze behavior. We employed an eye-tracking dataset +gathered from videos generated by the VirtualHome simulator, with a primary +focus on activity recognition. Our experimental results demonstrate the +effectiveness of our gaze prediction method by highlighting its capability to +replicate human gaze behavior and its applicability for downstream tasks where +real human-gaze is used as input. + +
+
+ comment: 2024 Symposium on Eye Tracking Research and Applications (ETRA24), + Glasgow, United Kingdom +
+
+
+
+
+ + ☆ Gaze-Guided Graph Neural Network for Action Anticipation Conditioned on + Intention + + +
+ Humans utilize their gaze to concentrate on essential information while +perceiving and interpreting intentions in videos. Incorporating human gaze into +computational algorithms can significantly enhance model performance in video +understanding tasks. In this work, we address a challenging and innovative task +in video understanding: predicting the actions of an agent in a video based on +a partial video. We introduce the Gaze-guided Action Anticipation algorithm, +which establishes a visual-semantic graph from the video input. Our method +utilizes a Graph Neural Network to recognize the agent's intention and predict +the action sequence to fulfill this intention. To assess the efficiency of our +approach, we collect a dataset containing household activities generated in the +VirtualHome environment, accompanied by human gaze data of viewing videos. Our +method outperforms state-of-the-art techniques, achieving a 7\% improvement in +accuracy for 18-class intention recognition. This highlights the efficiency of +our method in learning important features from human gaze data. + +
+
+ comment: 2024 Symposium on Eye Tracking Research and Applications (ETRA24), + Glasgow, United Kingdom +
+
+
+
+
+ + ☆ PEAVS: Perceptual Evaluation of Audio-Visual Synchrony Grounded in + Viewers' Opinion Scores + + +
+ Recent advancements in audio-visual generative modeling have been propelled
+by progress in deep learning and the availability of data-rich benchmarks.
+However, the growth is not attributed solely to models and benchmarks.
+Universally accepted evaluation metrics also play an important role in
+advancing the field. While there are many metrics available to evaluate audio
+and visual content separately, there is a lack of metrics that offer a
+quantitative and interpretable measure of audio-visual synchronization for
+videos "in the wild". To address this gap, we first created a large-scale,
+human-annotated dataset (100+ hrs) representing nine types of synchronization
+errors in audio-visual content and how humans perceive them. We then developed
+the PEAVS (Perceptual Evaluation of Audio-Visual Synchrony) score, a novel
+automatic metric with a 5-point scale that evaluates the quality of
+audio-visual synchronization. We validate PEAVS using a newly generated
+dataset, achieving a Pearson correlation of 0.79 at the set level and 0.54 at
+the clip level when compared to human labels. In our experiments, we observe a
+relative gain of 50% over a natural extension of Fréchet-based metrics for
+audio-visual synchrony, confirming the efficacy of PEAVS in objectively
+modeling subjective perceptions of audio-visual synchronization for videos "in
+the wild".
+
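The set-level validation quoted above reduces to a Pearson correlation between metric outputs and mean human opinion scores; the numbers below are made up purely for illustration.

```python
from scipy.stats import pearsonr

peavs_scores = [4.1, 2.3, 3.8, 1.2, 4.6]   # illustrative metric outputs
human_mos    = [4.3, 2.0, 3.5, 1.5, 4.8]   # corresponding mean opinion scores
r, p = pearsonr(peavs_scores, human_mos)
print(f"Pearson r = {r:.2f} (p = {p:.3f})")
```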
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ Rethinking Perceptual Metrics for Medical Image Translation + + +
+ Modern medical image translation methods use generative models for tasks such +as the conversion of CT images to MRI. Evaluating these methods typically +relies on some chosen downstream task in the target domain, such as +segmentation. On the other hand, task-agnostic metrics are attractive, such as +the network feature-based perceptual metrics (e.g., FID) that are common to +image translation in general computer vision. In this paper, we investigate +evaluation metrics for medical image translation on two medical image +translation tasks (GE breast MRI to Siemens breast MRI and lumbar spine MRI to +CT), tested on various state-of-the-art translation methods. We show that +perceptual metrics do not generally correlate with segmentation metrics due to +them extending poorly to the anatomical constraints of this sub-field, with FID +being especially inconsistent. However, we find that the lesser-used +pixel-level SWD metric may be useful for subtle intra-modality translation. Our +results demonstrate the need for further research into helpful metrics for +medical image translation. + +
+
+
+
+
+ + ☆ AI-Guided Defect Detection Techniques to Model Single Crystal Diamond + Growth + + +
+ From a process development perspective, diamond growth via chemical vapor +deposition has made significant strides. However, challenges persist in +achieving high quality and large-area material production. These difficulties +include controlling conditions to maintain uniform growth rates for the entire +growth surface. As growth progresses, various factors or defect states emerge, +altering the uniform conditions. These changes affect the growth rate and +result in the formation of crystalline defects at the microscale. However, +there is a distinct lack of methods to identify these defect states and their +geometry using images taken during the growth process. This paper details +seminal work on defect segmentation pipeline using in-situ optical images to +identify features that indicate defective states that are visible at the +macroscale. Using a semantic segmentation approach as applied in our previous +work, these defect states and corresponding derivative features are isolated +and classified by their pixel masks. Using an annotation focused +human-in-the-loop software architecture to produce training datasets, with +modules for selective data labeling using active learning, data augmentations, +and model-assisted labeling, our approach achieves effective annotation +accuracy and drastically reduces the time and cost of labeling by orders of +magnitude. On the model development front, we found that deep learning-based +algorithms are the most efficient. They can accurately learn complex +representations from feature-rich datasets. Our best-performing model, based on +the YOLOV3 and DeeplabV3plus architectures, achieved excellent accuracy for +specific features of interest. Specifically, it reached 93.35% accuracy for +center defects, 92.83% for polycrystalline defects, and 91.98% for edge +defects. + +
+
+ comment: 12 pages,4 figures,ACMME 2024 +
+
+
+
+
+ + ☆ Solving Masked Jigsaw Puzzles with Diffusion Vision Transformers + + +
+ Solving image and video jigsaw puzzles poses the challenging task of +rearranging image fragments or video frames from unordered sequences to restore +meaningful images and video sequences. Existing approaches often hinge on +discriminative models tasked with predicting either the absolute positions of +puzzle elements or the permutation actions applied to the original data. +Unfortunately, these methods face limitations in effectively solving puzzles +with a large number of elements. In this paper, we propose JPDVT, an innovative +approach that harnesses diffusion transformers to address this challenge. +Specifically, we generate positional information for image patches or video +frames, conditioned on their underlying visual content. This information is +then employed to accurately assemble the puzzle pieces in their correct +positions, even in scenarios involving missing pieces. Our method achieves +state-of-the-art performance on several datasets. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ Logit Calibration and Feature Contrast for Robust Federated Learning on + Non-IID Data + + +
+ Federated learning (FL) is a privacy-preserving distributed framework for
+collaborative model training on devices in edge networks. However, challenges
+arise due to vulnerability to adversarial examples (AEs) and the
+non-independent and identically distributed (non-IID) nature of data
+distribution among devices, hindering the deployment of adversarially robust
+and accurate learning models at the edge. While adversarial training (AT) is
+commonly acknowledged as an effective defense strategy against adversarial
+attacks in centralized training, we shed light on the adverse effects of
+directly applying AT in FL, which can severely compromise accuracy, especially
+under non-IID data. Given this limitation, this paper proposes FatCC, which
+incorporates local logit Calibration and global feature Contrast into the
+vanilla federated adversarial training (FAT) process from both logit and
+feature perspectives. This approach can effectively enhance the federated
+system's robust accuracy (RA) and clean accuracy (CA). First, we propose logit
+calibration, where the logits are calibrated during local adversarial updates,
+thereby improving adversarial robustness. Second, FatCC introduces feature
+contrast, which involves a global alignment term that aligns each local
+representation with unbiased global features, thus further enhancing robustness
+and accuracy in federated adversarial environments. Extensive experiments
+across multiple datasets demonstrate that FatCC achieves comparable or superior
+performance gains in both CA and RA compared to other baselines.
+
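A compressed view of how the two ingredients might enter a local adversarial update (a sketch under stated assumptions, not FatCC itself): the per-client class frequencies and the server-aggregated per-class features are both assumed to be available.

```python
import torch
import torch.nn.functional as F

def local_fat_loss(logits_adv, feats, labels, class_counts, global_feats, lam=1.0):
    """Calibrated adversarial cross-entropy plus a feature-alignment term."""
    # Logit calibration: subtract a log-frequency prior to counteract the
    # locally skewed (non-IID) label distribution.
    calibrated = logits_adv - torch.log(class_counts.float() + 1e-8)
    ce = F.cross_entropy(calibrated, labels)
    # Feature contrast/alignment: pull each local representation toward the
    # unbiased global feature of its class.
    align = F.mse_loss(feats, global_feats[labels])
    return ce + lam * align
```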
+
+
+
+
+ + ☆ Adapting LLaMA Decoder to Vision Transformer + + +
+ This work examines whether decoder-only Transformers such as LLaMA, which were originally designed for large language models (LLMs), can be adapted
+to the computer vision field. We first "LLaMAfy" a standard ViT step-by-step to align with LLaMA's architecture, and find that directly applying a
+causal mask to the self-attention brings an attention collapse issue, resulting in the failure of network training. We suggest repositioning the class
+token behind the image tokens with a post-sequence class token technique to overcome this challenge, enabling causal self-attention to efficiently
+capture the entire image's information. Additionally, we develop a soft mask strategy that gradually introduces a causal mask to the self-attention at
+the onset of training to facilitate optimization. The tailored model, dubbed image LLaMA (iLLaMA), is akin to LLaMA in architecture and enables direct
+supervised learning. Its causal self-attention boosts computational efficiency and learns complex representations by elevating attention map ranks.
+iLLaMA rivals the performance of its encoder-only counterparts, achieving 75.1% ImageNet top-1 accuracy with only 5.7M parameters. Scaling the model to
+~310M and pre-training on ImageNet-21K further enhances the accuracy to 86.0%. Extensive experiments demonstrate iLLaMA's reliable properties:
+calibration, shape-texture bias, quantization compatibility, ADE20K segmentation and CIFAR transfer learning. We hope our study can kindle fresh views
+to visual model design in the wave of LLMs. Pre-trained models and codes are available here.
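A minimal sketch of the post-sequence class token and the soft causal mask described above (illustrative only; tensor shapes and the linear mask schedule are assumptions, not the released iLLaMA code):

```python
import torch

def post_sequence_causal_mask(num_patches: int, num_cls: int = 1) -> torch.Tensor:
    """Causal (lower-triangular) mask for a sequence laid out as
    [patch_1, ..., patch_N, cls]: placing the class token last lets it
    attend to every image token under causal attention."""
    n = num_patches + num_cls
    return torch.tril(torch.ones(n, n, dtype=torch.bool))  # True = allowed

def soft_causal_mask(num_patches: int, progress: float) -> torch.Tensor:
    """Soft mask interpolating from fully bidirectional attention
    (progress=0) to a hard causal mask (progress=1); the linear schedule
    is an assumption for illustration."""
    hard = post_sequence_causal_mask(num_patches).float()
    soft = torch.ones_like(hard)
    return (1.0 - progress) * soft + progress * hard

# Usage: scale attention weights by the soft mask early in training.
attn = torch.softmax(torch.randn(197, 197), dim=-1)        # dummy attention map
masked = attn * soft_causal_mask(196, progress=0.3)
masked = masked / masked.sum(dim=-1, keepdim=True)         # renormalize rows
print(masked.shape)
```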
+
+ comment: 22 pages, 10 figures +
+
+
+
+
+ + ☆ MonoSelfRecon: Purely Self-Supervised Explicit Generalizable 3D + Reconstruction of Indoor Scenes from Monocular RGB Views + + +
+ Current monocular 3D scene reconstruction (3DR) works are either fully-supervised, not generalizable, or implicit in their 3D representation. We
+propose a novel framework - MonoSelfRecon - that for the first time achieves explicit 3D mesh reconstruction for generalizable indoor scenes with
+monocular RGB views by pure self-supervision on voxel-SDF (signed distance function). MonoSelfRecon follows an autoencoder-based architecture and
+decodes voxel-SDF together with a generalizable Neural Radiance Field (NeRF), which is used to guide the voxel-SDF in self-supervision. We propose
+novel self-supervised losses, which not only support pure self-supervision but can also be used together with supervised signals to further boost
+supervised training. Our experiments show that MonoSelfRecon trained with pure self-supervision outperforms the current best self-supervised indoor
+depth estimation models and is comparable to 3DR models trained with full supervision using depth annotations. MonoSelfRecon is not restricted to a
+specific model design and can be applied to any model with voxel-SDF in a purely self-supervised manner.
+
+
+
+
+ + ☆ YOLO based Ocean Eddy Localization with AWS SageMaker + + +
+ Ocean eddies play a significant role both on the sea surface and beneath it, contributing to the sustainability of marine life dependent on oceanic
+behaviors. Therefore, it is crucial to investigate ocean eddies to monitor changes in the Earth, particularly in the oceans, and their impact on
+climate. This study aims to pinpoint ocean eddies using AWS cloud services, specifically SageMaker. The primary objective is to detect small-scale
+(<20km) ocean eddies from satellite remote sensing images and assess the feasibility of utilizing SageMaker, which offers tools for deploying AI
+applications. Moreover, this research not only explores the deployment of cloud-based services for remote sensing of Earth data but also evaluates
+several YOLO (You Only Look Once) models using single and multi-GPU-based services in the cloud. Furthermore, this study underscores the potential of
+these services, their limitations, challenges related to deployment and resource management, and their user-friendliness for Earth science projects.
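For readers unfamiliar with SageMaker, a hedged sketch of how such a training job could be launched with the SageMaker Python SDK's PyTorch estimator is shown below; the bucket names, entry script, instance types, and framework versions are placeholders, not values from the paper:

```python
# Hypothetical launcher; all names and paths below are placeholders.
import sagemaker
from sagemaker.pytorch import PyTorch

session = sagemaker.Session()
role = "arn:aws:iam::123456789012:role/SageMakerExecutionRole"   # placeholder

estimator = PyTorch(
    entry_point="train_yolo.py",      # your YOLO training script
    source_dir="src",                 # local dir with code + requirements.txt
    role=role,
    framework_version="1.13",
    py_version="py39",
    instance_count=1,                 # >1 for multi-node runs
    instance_type="ml.p3.2xlarge",    # single GPU; larger types give multi-GPU
    hyperparameters={"epochs": 50, "img-size": 640},
    sagemaker_session=session,
)

# Satellite image chips and labels staged in S3 beforehand (placeholder paths).
estimator.fit({"train": "s3://my-bucket/eddies/train",
               "val": "s3://my-bucket/eddies/val"})
```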
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ An Animation-based Augmentation Approach for Action Recognition from + Discontinuous Video + + +
+ The study of action recognition has attracted considerable attention recently due to its broad applications in multiple areas. However, the issue of
+discontinuous training video, which not only decreases the performance of action recognition models but also complicates the data augmentation
+process, remains under-explored. In this study, we introduce 4A (Action Animation-based Augmentation Approach), an innovative data augmentation
+pipeline to address this problem. The main contributions of our work include: (1) we investigate the severe performance decrease caused by training
+action recognition models on discontinuous video, and the limitations of existing augmentation methods in solving this problem; (2) we propose a novel
+augmentation pipeline, 4A, to address the problem of discontinuous training video while achieving a smoother and more natural-looking action
+representation than the latest data augmentation methodology; (3) by employing our data augmentation techniques, we achieve the same performance with
+only 10% of the original real-world training data as with all of it, and better performance on in-the-wild videos.
+
+
+
+
+ + ☆ Bayesian NeRF: Quantifying Uncertainty with Volume Density in Neural + Radiance Fields + + +
+ We present the Bayesian Neural Radiance Field (NeRF), which explicitly +quantifies uncertainty in geometric volume structures without the need for +additional networks, making it adept for challenging observations and +uncontrolled images. NeRF diverges from traditional geometric methods by +offering an enriched scene representation, rendering color and density in 3D +space from various viewpoints. However, NeRF encounters limitations in relaxing +uncertainties by using geometric structure information, leading to inaccuracies +in interpretation under insufficient real-world observations. Recent research +efforts aimed at addressing this issue have primarily relied on empirical +methods or auxiliary networks. To fundamentally address this issue, we propose +a series of formulational extensions to NeRF. By introducing generalized +approximations and defining density-related uncertainty, our method seamlessly +extends to manage uncertainty not only for RGB but also for depth, without the +need for additional networks or empirical assumptions. In experiments we show +that our method significantly enhances performance on RGB and depth images in +the comprehensive dataset, demonstrating the reliability of the Bayesian NeRF +approach to quantifying uncertainty based on the geometric structure. + +
+
+
+
+
+ + ☆ Sparse Points to Dense Clouds: Enhancing 3D Detection with Limited LiDAR + Data + + +
+ 3D detection is a critical task that enables machines to identify and locate objects in three-dimensional space. It has a broad range of applications
+in several fields, including autonomous driving, robotics and augmented reality. Monocular 3D detection is attractive as it requires only a single
+camera; however, it lacks the accuracy and robustness required for real-world applications. High-resolution LiDAR, on the other hand, can be expensive
+and lead to interference problems in heavy traffic given its active transmissions. We propose a balanced approach that combines the advantages of
+monocular and point cloud-based 3D detection. Our method requires only a small number of 3D points that can be obtained from a low-cost,
+low-resolution sensor. Specifically, we use only 512 points, which is just 1% of a full LiDAR frame in the KITTI dataset. Our method reconstructs a
+complete 3D point cloud from this limited 3D information combined with a single image. The reconstructed 3D point cloud and corresponding image can be
+used by any multi-modal off-the-shelf detector for 3D object detection. By using the proposed network architecture with an off-the-shelf multi-modal
+3D detector, the accuracy of 3D detection improves by 20% compared to the state-of-the-art monocular detection methods and 6% to 9% compared to the
+baseline multi-modal methods on KITTI and JackRabbot datasets.
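As a rough illustration of the input regime described above, the snippet below subsamples a KITTI-style LiDAR frame to 512 points (about 1% of a typical scan); random sampling is an assumption standing in for a real low-resolution sensor:

```python
import numpy as np

def subsample_lidar(points: np.ndarray, num_keep: int = 512, seed: int = 0) -> np.ndarray:
    """Randomly keep `num_keep` LiDAR points (x, y, z, reflectance).
    For a typical ~120k-point KITTI frame, 512 points is roughly 1%."""
    rng = np.random.default_rng(seed)
    idx = rng.choice(len(points), size=min(num_keep, len(points)), replace=False)
    return points[idx]

# Dummy frame standing in for a KITTI .bin scan (N x 4 array).
frame = np.random.randn(120_000, 4).astype(np.float32)
sparse = subsample_lidar(frame)
print(sparse.shape)  # (512, 4) -- input to the reconstruction network, with the image
```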
+
+
+
+
+ + ☆ Convolution-based Probability Gradient Loss for Semantic Segmentation + + +
+ In this paper, we introduce a novel Convolution-based Probability Gradient +(CPG) loss for semantic segmentation. It employs convolution kernels similar to +the Sobel operator, capable of computing the gradient of pixel intensity in an +image. This enables the computation of gradients for both ground-truth and +predicted category-wise probabilities. It enhances network performance by +maximizing the similarity between these two probability gradients. Moreover, to +specifically enhance accuracy near the object's boundary, we extract the object +boundary based on the ground-truth probability gradient and exclusively apply +the CPG loss to pixels belonging to boundaries. CPG loss proves to be highly +convenient and effective. It establishes pixel relationships through +convolution, calculating errors from a distinct dimension compared to +pixel-wise loss functions such as cross-entropy loss. We conduct qualitative +and quantitative analyses to evaluate the impact of the CPG loss on three +well-established networks (DeepLabv3-Resnet50, HRNetV2-OCR, and +LRASPP_MobileNet_V3_Large) across three standard segmentation datasets +(Cityscapes, COCO-Stuff, ADE20K). Our extensive experimental results +consistently and significantly demonstrate that the CPG loss enhances the mean +Intersection over Union. + +
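A minimal PyTorch sketch of a Sobel-based probability-gradient loss in the spirit of the description above (an illustrative re-derivation from the abstract, not the authors' implementation):

```python
import torch
import torch.nn.functional as F

# Sobel kernels (x and y), applied per class channel via grouped convolution.
_SOBEL = torch.tensor([[[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]],
                       [[-1., -2., -1.], [0., 0., 0.], [1., 2., 1.]]])

def prob_gradient(prob: torch.Tensor) -> torch.Tensor:
    """prob: (B, C, H, W) per-class probabilities -> (B, 2C, H, W) gradients."""
    c = prob.shape[1]
    kernel = _SOBEL.repeat(c, 1, 1).unsqueeze(1)          # (2C, 1, 3, 3)
    return F.conv2d(prob, kernel.to(prob), padding=1, groups=c)

def cpg_loss(pred_logits, gt_onehot, boundary_only=True):
    """Match predicted and ground-truth probability gradients, optionally
    restricted to pixels where the ground-truth gradient is non-zero
    (a proxy for object boundaries)."""
    g_pred = prob_gradient(pred_logits.softmax(dim=1))
    g_gt = prob_gradient(gt_onehot)
    err = (g_pred - g_gt).abs()
    if boundary_only:
        mask = (g_gt.abs().sum(dim=1, keepdim=True) > 1e-6).float()
        return (err * mask).sum() / (mask.sum() * err.shape[1]).clamp(min=1.0)
    return err.mean()

# Toy usage
logits = torch.randn(2, 4, 64, 64, requires_grad=True)
gt = F.one_hot(torch.randint(0, 4, (2, 64, 64)), 4).permute(0, 3, 1, 2).float()
print(cpg_loss(logits, gt).item())
```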
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ☆ Scaling Multi-Camera 3D Object Detection through Weak-to-Strong + Eliciting + + +
+ The emergence of Multi-Camera 3D Object Detection (MC3D-Det), facilitated by +bird's-eye view (BEV) representation, signifies a notable progression in 3D +object detection. Scaling MC3D-Det training effectively accommodates varied +camera parameters and urban landscapes, paving the way for the MC3D-Det +foundation model. However, the multi-view fusion stage of the MC3D-Det method +relies on the ill-posed monocular perception during training rather than +surround refinement ability, leading to what we term "surround refinement +degradation". To this end, our study presents a weak-to-strong eliciting +framework aimed at enhancing surround refinement while maintaining robust +monocular perception. Specifically, our framework employs weakly tuned experts +trained on distinct subsets, and each is inherently biased toward specific +camera configurations and scenarios. These biased experts can learn the +perception of monocular degeneration, which can help the multi-view fusion +stage to enhance surround refinement abilities. Moreover, a composite +distillation strategy is proposed to integrate the universal knowledge of 2D +foundation models and task-specific information. Finally, for MC3D-Det joint +training, the elaborate dataset merge strategy is designed to solve the problem +of inconsistent camera numbers and camera parameters. We set up a multiple +dataset joint training benchmark for MC3D-Det and adequately evaluated existing +methods. Further, we demonstrate the proposed framework brings a generalized +and significant boost over multiple baselines. Our code is at +\url{https://github.com/EnVision-Research/Scale-BEV}. + +
+
+
+
+
+ + ☆ Binomial Self-compensation for Motion Error in Dynamic 3D Scanning + + +
+ Phase shifting profilometry (PSP) is favored in high-precision 3D scanning +due to its high accuracy, robustness, and pixel-wise property. However, a +fundamental assumption of PSP that the object should remain static is violated +in dynamic measurement, making PSP susceptible to object moving, resulting in +ripple-like errors in the point clouds. We propose a pixel-wise and frame-wise +loopable binomial self-compensation (BSC) algorithm to effectively and flexibly +eliminate motion error in the four-step PSP. Our mathematical model +demonstrates that by summing successive motion-affected phase frames weighted +by binomial coefficients, motion error exponentially diminishes as the binomial +order increases, accomplishing automatic error compensation through the +motion-affected phase sequence, without the assistance of any intermediate +variable. Extensive experiments show that our BSC outperforms the existing +methods in reducing motion error, while achieving a depth map frame rate equal +to the camera's acquisition rate (90 fps), enabling high-accuracy 3D +reconstruction with a quasi-single-shot frame rate. + +
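The binomially weighted summation described above can be sketched as follows; boundary handling and phase unwrapping are simplified assumptions made for illustration:

```python
import numpy as np
from math import comb

def binomial_self_compensation(phase_frames: np.ndarray, order: int) -> np.ndarray:
    """Illustrative binomial weighting of successive phase maps.

    phase_frames: (T, H, W) per-pixel phase maps recovered from four-step
    phase shifting; `order` is the binomial order n. The output at time t is
    sum_k C(n, k)/2^n * phase[t - k]; per the abstract, motion error shrinks
    exponentially as n grows."""
    n = order
    weights = np.array([comb(n, k) for k in range(n + 1)], dtype=np.float64)
    weights /= weights.sum()                 # normalizer equals 2**n
    T = phase_frames.shape[0]
    out = np.zeros_like(phase_frames[n:])
    for k, w in enumerate(weights):
        out += w * phase_frames[n - k:T - k]
    return out                               # (T - n, H, W) compensated phase maps

# Toy example: 10 frames of 64x64 phase maps
frames = np.random.uniform(0, 2 * np.pi, size=(10, 64, 64))
print(binomial_self_compensation(frames, order=3).shape)   # (7, 64, 64)
```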
+
+
+
+
+ + ☆ Perception-Oriented Video Frame Interpolation via Asymmetric Blending CVPR 2024 + + +
+ Previous methods for Video Frame Interpolation (VFI) have encountered +challenges, notably the manifestation of blur and ghosting effects. These +issues can be traced back to two pivotal factors: unavoidable motion errors and +misalignment in supervision. In practice, motion estimates often prove to be +error-prone, resulting in misaligned features. Furthermore, the reconstruction +loss tends to bring blurry results, particularly in misaligned regions. To +mitigate these challenges, we propose a new paradigm called PerVFI +(Perception-oriented Video Frame Interpolation). Our approach incorporates an +Asymmetric Synergistic Blending module (ASB) that utilizes features from both +sides to synergistically blend intermediate features. One reference frame +emphasizes primary content, while the other contributes complementary +information. To impose a stringent constraint on the blending process, we +introduce a self-learned sparse quasi-binary mask which effectively mitigates +ghosting and blur artifacts in the output. Additionally, we employ a +normalizing flow-based generator and utilize the negative log-likelihood loss +to learn the conditional distribution of the output, which further facilitates +the generation of clear and fine details. Experimental results validate the +superiority of PerVFI, demonstrating significant improvements in perceptual +quality compared to existing methods. Codes are available at +\url{https://github.com/mulns/PerVFI} + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Unsupervised Visible-Infrared ReID via Pseudo-label Correction and + Modality-level Alignment + + +
+ Unsupervised visible-infrared person re-identification (UVI-ReID) has recently gained great attention due to its potential for enhancing human
+detection in diverse environments without labeling. Previous methods utilize intra-modality clustering and cross-modality feature matching to achieve
+UVI-ReID. However, there exist two challenges: 1) noisy pseudo labels might be generated in the clustering process, and 2) the cross-modality feature
+alignment via matching the marginal distribution of visible and infrared modalities may misalign the different identities from two modalities. In this
+paper, we first conduct a theoretical analysis where an interpretable generalization upper bound is introduced. Based on the analysis, we then propose
+a novel unsupervised cross-modality person re-identification framework (PRAISE). Specifically, to address the first challenge, we propose a
+pseudo-label correction strategy that utilizes a Beta Mixture Model to predict the probability of mis-clustering based on the network's memory effect
+and rectifies the correspondence by adding a perceptual term to contrastive learning. Next, we introduce a modality-level alignment strategy that
+generates paired visible-infrared latent features and reduces the modality gap by aligning the labeling function of visible and infrared features to
+learn identity-discriminative and modality-invariant features. Experimental results on two benchmark datasets demonstrate that our method achieves
+state-of-the-art performance compared to existing unsupervised visible-infrared ReID methods.
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ SafeGen: Mitigating Unsafe Content Generation in Text-to-Image Models + + +
+ Text-to-image (T2I) models, such as Stable Diffusion, have exhibited remarkable performance in generating high-quality images from text descriptions
+in recent years. However, text-to-image models may be tricked into generating not-safe-for-work (NSFW) content, particularly in sexual scenarios.
+Existing countermeasures mostly focus on filtering inappropriate inputs and outputs, or suppressing improper text embeddings, which can block explicit
+NSFW-related content (e.g., naked or sexy) but may still be vulnerable to adversarial prompt inputs that appear innocent but are ill-intended. In this
+paper, we present SafeGen, a framework to mitigate unsafe content generation by text-to-image models in a text-agnostic manner. The key idea is to
+eliminate unsafe visual representations from the model regardless of the text input. In this way, the text-to-image model is resistant to adversarial
+prompts since unsafe visual representations are obstructed from within. Extensive experiments conducted on four datasets demonstrate SafeGen's
+effectiveness in mitigating unsafe content generation while preserving the high fidelity of benign images. SafeGen outperforms eight state-of-the-art
+baseline methods and achieves 99.1% sexual content removal performance. Furthermore, our constructed benchmark of adversarial prompts provides a basis
+for future development and evaluation of anti-NSFW-generation methods.
+
+
+
+
+ + ☆ Deep Generative Data Assimilation in Multimodal Setting CVPR2024 + + +
+ Robust integration of physical knowledge and data is key to improving computational simulations, such as Earth system models. Data assimilation is
+crucial for achieving this goal because it provides a systematic framework to calibrate model outputs with observations, which can include remote
+sensing imagery and ground station measurements, with uncertainty quantification. Conventional methods, including Kalman filters and variational
+approaches, inherently rely on simplifying linear and Gaussian assumptions, and can be computationally expensive. Nevertheless, with the rapid
+adoption of data-driven methods in many areas of computational sciences, we see the potential of emulating traditional data assimilation with deep
+learning, especially generative models. In particular, the diffusion-based probabilistic framework has large overlaps with data assimilation
+principles: both allow for conditional generation of samples with a Bayesian inverse framework. These models have shown remarkable success in
+text-conditioned image generation or image-controlled video synthesis. Likewise, one can frame data assimilation as observation-conditioned state
+calibration. In this work, we propose SLAMS: Score-based Latent Assimilation in Multimodal Setting. Specifically, we assimilate in-situ weather
+station data and ex-situ satellite imagery to calibrate the vertical temperature profiles, globally. Through extensive ablation, we demonstrate that
+SLAMS is robust even in low-resolution, noisy, and sparse data settings. To our knowledge, our work is the first to apply a deep generative framework
+for multimodal data assimilation using real-world datasets; an important step for building robust computational simulators, including the
+next-generation Earth system models. Our code is available at: https://github.com/yongquan-qu/SLAMS
+
+ comment: Accepted to CVPR2024 EarthVision +
+
+
+
+
+ + ☆ Multi-modal Document Presentation Attack Detection With Forensics Trace + Disentanglement ICME 2024 + + +
+ Document Presentation Attack Detection (DPAD) is an important measure in +protecting the authenticity of a document image. However, recent DPAD methods +demand additional resources, such as manual effort in collecting additional +data or knowing the parameters of acquisition devices. This work proposes a +DPAD method based on multi-modal disentangled traces (MMDT) without the above +drawbacks. We first disentangle the recaptured traces by a self-supervised +disentanglement and synthesis network to enhance the generalization capacity in +document images with different contents and layouts. Then, unlike the existing +DPAD approaches that rely only on data in the RGB domain, we propose to +explicitly employ the disentangled recaptured traces as new modalities in the +transformer backbone through adaptive multi-modal adapters to fuse RGB/trace +features efficiently. Visualization of the disentangled traces confirms the +effectiveness of the proposed method in different document contents. Extensive +experiments on three benchmark datasets demonstrate the superiority of our MMDT +method on representing forensic traces of recapturing distortion. + +
+
+ comment: Accepted to ICME 2024 +
+
+
+
+
+ + ☆ Efficient Denoising using Score Embedding in Score-based Diffusion + Models + + +
+ It is well known that training a denoising score-based diffusion model requires tens of thousands of epochs and a substantial amount of image data.
+In this paper, we propose to increase the efficiency of training score-based diffusion models. Our method allows us to decrease the number of epochs
+needed to train the diffusion model. We accomplish this by solving the log-density Fokker-Planck (FP) Equation numerically to compute the score
+\textit{before} training. The pre-computed score is embedded into the image to encourage faster training under the sliced Wasserstein distance.
+Consequently, it also allows us to decrease the number of images needed to train the neural network to learn an accurate score. We demonstrate through
+numerical experiments the improved performance of our proposed method compared to standard score-based diffusion models. Our proposed method achieves
+similar quality to the standard method while training meaningfully faster.
+
+
+
+
+ + ☆ AI-Guided Feature Segmentation Techniques to Model Features from Single + Crystal Diamond Growth + + +
+ Process refinement to consistently produce high-quality material over a large +area of the grown crystal, enabling various applications from optics crystals +to quantum detectors, has long been a goal for diamond growth. Machine learning +offers a promising path toward this goal, but faces challenges such as the +complexity of features within datasets, their time-dependency, and the volume +of data produced per growth run. Accurate spatial feature extraction from image +to image for real-time monitoring of diamond growth is crucial yet complicated +due to the low-volume and high feature complexity nature of the datasets. This +paper compares various traditional and machine learning-driven approaches for +feature extraction in the diamond growth domain, proposing a novel deep +learning-driven semantic segmentation approach to isolate and classify accurate +pixel masks of geometric features like diamond, pocket holder, and background, +along with their derivative features based on shape and size. Using an +annotation-focused human-in-the-loop software architecture for training +datasets, with modules for selective data labeling using active learning, data +augmentations, and model-assisted labeling, our approach achieves effective +annotation accuracy and drastically reduces labeling time and cost. Deep +learning algorithms prove highly efficient in accurately learning complex +representations from datasets with many features. Our top-performing model, +based on the DeeplabV3plus architecture, achieves outstanding accuracy in +classifying features of interest, with accuracies of 96.31% for pocket holder, +98.60% for diamond top, and 91.64% for diamond side features. + +
+
+ comment: 12 pages,4 figures,ACMME 2024. arXiv admin note: substantial text + overlap with arXiv:2404.07306 +
+
+
+
+
+ + ☆ Enhanced Cooperative Perception for Autonomous Vehicles Using Imperfect + Communication + + +
+ Sharing and joint processing of camera feeds and sensor measurements, known as Cooperative Perception (CP), has emerged as a new technique to achieve
+higher perception quality. CP can enhance the safety of Autonomous Vehicles (AVs) where their individual visual perception quality is compromised by
+adverse weather conditions (e.g., haze and fog), low illumination, winding roads, and crowded traffic. To address the limitations of former methods,
+in this paper we propose a novel approach to realize an optimized CP under constrained communications. At the core of our approach is recruiting the
+best helper from the available list of front vehicles to augment the visual range and enhance the Object Detection (OD) accuracy of the ego vehicle.
+In this two-step process, we first select the helper vehicles that contribute the most to CP based on their visual range and lowest motion blur. Next,
+we implement a radio block optimization among the candidate vehicles to further improve communication efficiency. We specifically focus on pedestrian
+detection as an exemplary scenario. To validate our approach, we used the CARLA simulator to create a dataset of annotated videos for different
+driving scenarios where pedestrian detection is challenging for an AV with compromised vision. Our results demonstrate the efficacy of our two-step
+optimization process in improving the overall performance of cooperative perception in challenging scenarios, substantially improving driving safety
+under adverse conditions. Finally, we note that the networking assumptions are adopted from LTE Release 14 Mode 4 side-link communication, commonly
+used for Vehicle-to-Vehicle (V2V) communication. Nonetheless, our method is flexible and applicable to arbitrary V2V communications.
+
+
+
+
+ + ☆ An inclusive review on deep learning techniques and their scope in + handwriting recognition + + +
+ Deep learning describes a category of machine learning algorithms that can combine raw inputs into intermediate feature layers. These algorithms have
+demonstrated strong results in different fields, and deep learning in particular has achieved human-level performance across a number of domains in
+computer vision and pattern recognition. To achieve state-of-the-art performance in diverse domains, deep learning employs different architectures,
+which use activation functions to perform computations between the hidden and output layers. This paper presents a survey of existing studies of deep
+learning in the handwriting recognition field. Although recent progress indicates that deep learning methods provide valuable means for speeding up
+handwriting recognition or producing more accurate results, the extensive literature survey conducted here finds that deep learning has yet to resolve
+many of the most pressing challenges in this field, even though promising advances have been made over the prior state of the art. Additionally, the
+inadequate availability of labelled training data presents problems in this domain. Nevertheless, this survey foresees deep learning enabling changes
+with the potential to transform several domains such as image processing, speech recognition, computer vision, machine translation, robotics and
+control, medical imaging, medical information processing, bio-informatics, natural language processing, cyber security, and many others.
+
+
+
+
+ + ☆ A Transformer-Based Model for the Prediction of Human Gaze Behavior on + Videos + + +
+ Eye-tracking applications that utilize the human gaze in video understanding +tasks have become increasingly important. To effectively automate the process +of video analysis based on eye-tracking data, it is important to accurately +replicate human gaze behavior. However, this task presents significant +challenges due to the inherent complexity and ambiguity of human gaze patterns. +In this work, we introduce a novel method for simulating human gaze behavior. +Our approach uses a transformer-based reinforcement learning algorithm to train +an agent that acts as a human observer, with the primary role of watching +videos and simulating human gaze behavior. We employed an eye-tracking dataset +gathered from videos generated by the VirtualHome simulator, with a primary +focus on activity recognition. Our experimental results demonstrate the +effectiveness of our gaze prediction method by highlighting its capability to +replicate human gaze behavior and its applicability for downstream tasks where +real human-gaze is used as input. + +
+
+ comment: 2024 Symposium on Eye Tracking Research and Applications (ETRA24), + Glasgow, United Kingdom +
+
+
+
+
+ + ☆ Gaze-Guided Graph Neural Network for Action Anticipation Conditioned on + Intention + + +
+ Humans utilize their gaze to concentrate on essential information while +perceiving and interpreting intentions in videos. Incorporating human gaze into +computational algorithms can significantly enhance model performance in video +understanding tasks. In this work, we address a challenging and innovative task +in video understanding: predicting the actions of an agent in a video based on +a partial video. We introduce the Gaze-guided Action Anticipation algorithm, +which establishes a visual-semantic graph from the video input. Our method +utilizes a Graph Neural Network to recognize the agent's intention and predict +the action sequence to fulfill this intention. To assess the efficiency of our +approach, we collect a dataset containing household activities generated in the +VirtualHome environment, accompanied by human gaze data of viewing videos. Our +method outperforms state-of-the-art techniques, achieving a 7\% improvement in +accuracy for 18-class intention recognition. This highlights the efficiency of +our method in learning important features from human gaze data. + +
+
+ comment: 2024 Symposium on Eye Tracking Research and Applications (ETRA24), + Glasgow, United Kingdom +
+
+
+
+
+ + ♻ ☆ Disentangled Explanations of Neural Network Predictions by Finding + Relevant Subspaces + + +
+ Explainable AI aims to overcome the black-box nature of complex ML models +like neural networks by generating explanations for their predictions. +Explanations often take the form of a heatmap identifying input features (e.g. +pixels) that are relevant to the model's decision. These explanations, however, +entangle the potentially multiple factors that enter into the overall complex +decision strategy. We propose to disentangle explanations by extracting at some +intermediate layer of a neural network, subspaces that capture the multiple and +distinct activation patterns (e.g. visual concepts) that are relevant to the +prediction. To automatically extract these subspaces, we propose two new +analyses, extending principles found in PCA or ICA to explanations. These novel +analyses, which we call principal relevant component analysis (PRCA) and +disentangled relevant subspace analysis (DRSA), maximize relevance instead of +e.g. variance or kurtosis. This allows for a much stronger focus of the +analysis on what the ML model actually uses for predicting, ignoring +activations or concepts to which the model is invariant. Our approach is +general enough to work alongside common attribution techniques such as Shapley +Value, Integrated Gradients, or LRP. Our proposed methods show to be +practically useful and compare favorably to the state of the art as +demonstrated on benchmarks and three use cases. + +
+
+ comment: 17 pages + supplement +
+
+
+
+
+ + ♻ ☆ Deep Learning for Inertial Sensor Alignment + + +
+ Accurate alignment of a fixed mobile device equipped with inertial sensors +inside a moving vehicle is important for navigation, activity recognition, and +other applications. Accurate estimation of the device mounting angle is +required to rotate the inertial measurement from the sensor frame to the moving +platform frame to standardize measurements and improve the performance of the +target task. In this work, a data-driven approach using deep neural networks +(DNNs) is proposed to learn the yaw mounting angle of a smartphone equipped +with an inertial measurement unit (IMU) and strapped to a car. The proposed +model uses only the accelerometer and gyroscope readings from an IMU as input +and, in contrast to existing solutions, does not require global position inputs +from global navigation satellite systems (GNSS). To train the model in a +supervised manner, IMU data is collected for training and validation with the +sensor mounted at a known yaw mounting angle, and a range of ground truth +labels is generated by applying a random rotation in a bounded range to the +measurements. The trained model is tested on data with real rotations showing +similar performance as with synthetic rotations. The trained model is deployed +on an Android device and evaluated in real-time to test the accuracy of the +estimated yaw mounting angle. The model is shown to find the mounting angle at +an accuracy of 8 degrees within 5 seconds, and 4 degrees within 27 seconds. An +experiment is conducted to compare the proposed model with an existing +off-the-shelf solution. + +
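The label-generation strategy described above (applying a bounded random yaw rotation to the measurements and regressing the applied angle) might look like this minimal sketch; the channel layout and rotation bound are assumptions:

```python
import numpy as np

def random_yaw_augment(imu: np.ndarray, max_deg: float = 180.0, rng=None):
    """Create a (rotated_imu, label) training pair from IMU data recorded at a
    known mounting angle. imu: (T, 6) array of [ax, ay, az, gx, gy, gz].
    A random yaw rotation within +/- max_deg is applied to the horizontal
    components; the applied angle becomes the regression target."""
    rng = rng or np.random.default_rng()
    yaw = rng.uniform(-np.deg2rad(max_deg), np.deg2rad(max_deg))
    c, s = np.cos(yaw), np.sin(yaw)
    R = np.array([[c, -s], [s, c]])
    out = imu.copy()
    out[:, 0:2] = imu[:, 0:2] @ R.T    # rotate accelerometer x/y
    out[:, 3:5] = imu[:, 3:5] @ R.T    # rotate gyroscope x/y
    return out, yaw

window = np.random.randn(200, 6).astype(np.float32)   # e.g. 2 s at 100 Hz (dummy)
x, y = random_yaw_augment(window, max_deg=90.0)
print(x.shape, float(np.rad2deg(y)))
```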
+
+ comment: 9 Pages, Preprint. Accepted IEEE +
+
+
+
+
+ + ♻ ☆ GLiDR: Topologically Regularized Graph Generative Network for Sparse + LiDAR Point Clouds CVPR + + +
+ Sparse LiDAR point clouds cause severe loss of detail of static structures +and reduce the density of static points available for navigation. Reduced +density can be detrimental to navigation under several scenarios. We observe +that despite high sparsity, in most cases, the global topology of LiDAR +outlining the static structures can be inferred. We utilize this property to +obtain a backbone skeleton of a LiDAR scan in the form of a single connected +component that is a proxy to its global topology. We utilize the backbone to +augment new points along static structures to overcome sparsity. Newly +introduced points could correspond to existing static structures or to static +points that were earlier obstructed by dynamic objects. To the best of our +knowledge, we are the first to use such a strategy for sparse LiDAR point +clouds. Existing solutions close to our approach fail to identify and preserve +the global static LiDAR topology and generate sub-optimal points. We propose +GLiDR, a Graph Generative network that is topologically regularized using +0-dimensional Persistent Homology ($\mathcal{PH}$) constraints. This enables +GLiDR to introduce newer static points along a topologically consistent global +static LiDAR backbone. GLiDR generates precise static points using $32\times$ +sparser dynamic scans and performs better than the baselines across three +datasets. GLiDR generates a valuable byproduct - an accurate binary +segmentation mask of static and dynamic objects that are helpful for navigation +planning and safety in constrained environments. The newly introduced static +points allow GLiDR to outperform LiDAR-based navigation using SLAM in several +settings. Source code is available at +$\texttt{https://github.com/GLiDR-CVPR2024/GLiDR}$. + +
+
+ comment: IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR) +
+
+
+
+
+ + ♻ ☆ CLOVA: A Closed-Loop Visual Assistant with Tool Usage and Update CVPR 2024 + + +
+ Utilizing large language models (LLMs) to compose off-the-shelf visual tools +represents a promising avenue of research for developing robust visual +assistants capable of addressing diverse visual tasks. However, these methods +often overlook the potential for continual learning, typically by freezing the +utilized tools, thus limiting their adaptation to environments requiring new +knowledge. To tackle this challenge, we propose CLOVA, a Closed-Loop Visual +Assistant, which operates within a framework encompassing inference, +reflection, and learning phases. During the inference phase, LLMs generate +programs and execute corresponding tools to complete assigned tasks. In the +reflection phase, a multimodal global-local reflection scheme analyzes human +feedback to determine which tools require updating. Lastly, the learning phase +employs three flexible approaches to automatically gather training data and +introduces a novel prompt tuning scheme to update the tools, allowing CLOVA to +efficiently acquire new knowledge. Experimental findings demonstrate that CLOVA +surpasses existing tool-usage methods by 5% in visual question answering and +multiple-image reasoning, by 10% in knowledge tagging, and by 20% in image +editing. These results underscore the significance of the continual learning +capability in general visual assistants. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Bias-Reduced Neural Networks for Parameter Estimation in Quantitative + MRI + + +
+ Purpose: To develop neural network (NN)-based quantitative MRI parameter +estimators with minimal bias and a variance close to the Cram\'er-Rao bound. + Theory and Methods: We generalize the mean squared error loss to control the +bias and variance of the NN's estimates, which involves averaging over multiple +noise realizations of the same measurements during training. Bias and variance +properties of the resulting NNs are studied for two neuroimaging applications. + Results: In simulations, the proposed strategy reduces the estimates' bias +throughout parameter space and achieves a variance close to the Cram\'er-Rao +bound. In vivo, we observe good concordance between parameter maps estimated +with the proposed NNs and traditional estimators, such as non-linear +least-squares fitting, while state-of-the-art NNs show larger deviations. + Conclusion: The proposed NNs have greatly reduced bias compared to those +trained using the mean squared error and offer significantly improved +computational efficiency over traditional estimators with comparable or better +accuracy. + +
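One illustrative reading of the generalized loss is a squared-bias-plus-weighted-variance objective computed over several noise realizations of the same measurement; the exact weighting used by the authors is not reproduced here:

```python
import torch

def bias_variance_loss(estimates: torch.Tensor, theta_true: torch.Tensor,
                       var_weight: float = 1.0) -> torch.Tensor:
    """Generalized MSE over R noise realizations of the same measurement.

    estimates: (R, B, P) parameter estimates for R noise realizations of B
    measurements with P parameters; theta_true: (B, P). With var_weight=1 this
    recovers the realization-averaged MSE (bias^2 + variance); smaller weights
    emphasize bias reduction."""
    mean_est = estimates.mean(dim=0)                       # (B, P)
    bias_sq = (mean_est - theta_true).pow(2).mean()
    variance = estimates.var(dim=0, unbiased=False).mean()
    return bias_sq + var_weight * variance

# Toy usage: 8 noise realizations, batch of 4, 2 parameters (e.g. T1, M0)
est = torch.randn(8, 4, 2, requires_grad=True)
truth = torch.randn(4, 2)
loss = bias_variance_loss(est, truth, var_weight=0.5)
loss.backward()
print(loss.item())
```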
+
+
+
+
+ + ♻ ☆ MaskClustering: View Consensus based Mask Graph Clustering for + Open-Vocabulary 3D Instance Segmentation + + +
+ Open-vocabulary 3D instance segmentation is cutting-edge for its ability to +segment 3D instances without predefined categories. However, progress in 3D +lags behind its 2D counterpart due to limited annotated 3D data. To address +this, recent works first generate 2D open-vocabulary masks through 2D models +and then merge them into 3D instances based on metrics calculated between two +neighboring frames. In contrast to these local metrics, we propose a novel +metric, view consensus rate, to enhance the utilization of multi-view +observations. The key insight is that two 2D masks should be deemed part of the +same 3D instance if a significant number of other 2D masks from different views +contain both these two masks. Using this metric as edge weight, we construct a +global mask graph where each mask is a node. Through iterative clustering of +masks showing high view consensus, we generate a series of clusters, each +representing a distinct 3D instance. Notably, our model is training-free. +Through extensive experiments on publicly available datasets, including +ScanNet++, ScanNet200 and MatterPort3D, we demonstrate that our method achieves +state-of-the-art performance in open-vocabulary 3D instance segmentation. Our +project page is at https://pku-epic.github.io/MaskClustering. + +
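A toy sketch of a view-consensus-style metric, with masks represented as sets of lifted 3D point indices; the containment threshold and the normalization are assumptions for illustration:

```python
from typing import List, Set

def contains(big: Set[int], small: Set[int], thresh: float = 0.8) -> bool:
    """A mask `big` is said to contain `small` if most of small's points fall
    inside big; masks are sets of 3D point indices obtained by lifting 2D
    masks with depth. The 0.8 threshold is an assumption."""
    return len(big & small) >= thresh * max(len(small), 1)

def view_consensus_rate(mi: Set[int], mj: Set[int], others: List[Set[int]]) -> float:
    """Fraction of other-view masks containing both mi and mj, among those
    containing at least one of them (normalization chosen for illustration)."""
    both = sum(contains(m, mi) and contains(m, mj) for m in others)
    either = sum(contains(m, mi) or contains(m, mj) for m in others)
    return both / either if either else 0.0

# Toy example: high consensus suggests mi and mj belong to one 3D instance.
m1, m2 = {1, 2, 3, 4}, {3, 4, 5, 6}
views = [{1, 2, 3, 4, 5, 6, 7}, {1, 2, 3}, {3, 4, 5, 6, 8}]
print(view_consensus_rate(m1, m2, views))   # 0.5 for this toy input
```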
+
+
+
+
+ + ♻ ☆ Visual Concept Connectome (VCC): Open World Concept Discovery and their + Interlayer Connections in Deep Models CVPR 2024 + + +
+ Understanding what deep network models capture in their learned representations is a fundamental challenge in computer vision. We present a new
+methodology for understanding such vision models, the Visual Concept Connectome (VCC), which discovers human-interpretable concepts and their
+interlayer connections in a fully unsupervised manner. Our approach simultaneously reveals fine-grained concepts at a layer, connection weightings
+across all layers and is amenable to global analysis of network structure (e.g., branching pattern of hierarchical concept assemblies). Previous work
+yielded ways to extract interpretable concepts from single layers and examine their impact on classification, but did not afford multilayer concept
+analysis across an entire network architecture. Quantitative and qualitative empirical results show the effectiveness of VCCs in the domain of image
+classification. Also, we leverage VCCs for the application of failure mode debugging to reveal where mistakes arise in deep networks.
+
+ comment: CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ Understanding Video Transformers via Universal Concept Discovery CVPR 2024 + + +
+ This paper studies the problem of concept-based interpretability of transformer representations for videos. Concretely, we seek to explain the
+decision-making process of video transformers based on high-level, spatiotemporal concepts that are automatically discovered. Prior research on
+concept-based interpretability has concentrated solely on image-level tasks. Comparatively, video models deal with the added temporal dimension,
+increasing complexity and posing challenges in identifying dynamic concepts over time. In this work, we systematically address these challenges by
+introducing the first Video Transformer Concept Discovery (VTCD) algorithm. To this end, we propose an efficient approach for unsupervised
+identification of units of video transformer representations - concepts - and for ranking their importance to the output of a model. The resulting
+concepts are highly interpretable, revealing spatio-temporal reasoning mechanisms and object-centric representations in unstructured video models.
+Performing this analysis jointly over a diverse set of supervised and self-supervised representations, we discover that some of these mechanisms are
+universal in video transformers. Finally, we show that VTCD can be used for fine-grained action recognition and video object segmentation.
+
+ comment: CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ Location-guided Head Pose Estimation for Fisheye Image + + +
+ Camera with a fisheye or ultra-wide lens covers a wide field of view that +cannot be modeled by the perspective projection. Serious fisheye lens +distortion in the peripheral region of the image leads to degraded performance +of the existing head pose estimation models trained on undistorted images. This +paper presents a new approach for head pose estimation that uses the knowledge +of head location in the image to reduce the negative effect of fisheye +distortion. We develop an end-to-end convolutional neural network to estimate +the head pose with the multi-task learning of head pose and head location. Our +proposed network estimates the head pose directly from the fisheye image +without the operation of rectification or calibration. We also created a +fisheye-distorted version of the three popular head pose estimation datasets, +BIWI, 300W-LP, and AFLW2000 for our experiments. Experiments results show that +our network remarkably improves the accuracy of head pose estimation compared +with other state-of-the-art one-stage and two-stage methods. + +
+
+ comment: Revised Introduction and Related Work; Submitted to lEEE Transactions + on Cognitive and Developmental Systems for review +
+
+
+
+
+ + ♻ ☆ VMamba: Visual State Space Model + + +
+ Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs) have long +been the predominant backbone networks for visual representation learning. +While ViTs have recently gained prominence over CNNs due to their superior +fitting capabilities, their scalability is largely constrained by the quadratic +complexity of attention computation. Inspired by the capability of Mamba in +efficiently modeling long sequences, we propose VMamba, a generic vision +backbone model aiming to reduce the computational complexity to linear while +retaining ViTs' advantageous features. To enhance VMamba's adaptability in +processing vision data, we introduce the Cross-Scan Module (CSM) to enable 1D +selective scanning in 2D image space with global receptive fields. +Additionally, we make further improvements in implementation details and +architectural designs to enhance VMamba's performance and boost its inference +speed. Extensive experimental results demonstrate VMamba's promising +performance across various visual perception tasks, highlighting its pronounced +advantages in input scaling efficiency compared to existing benchmark models. +Source code is available at https://github.com/MzeroMiko/VMamba. + +
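A schematic sketch of the cross-scan idea (unfolding a 2D feature map into four 1D traversal orders and merging them back), not the released VMamba implementation:

```python
import torch

def cross_scan(x: torch.Tensor) -> torch.Tensor:
    """Unfold a 2D feature map into four 1D scan sequences: row-major,
    column-major, and their reverses. x: (B, C, H, W) -> (B, 4, C, H*W)."""
    row = x.flatten(2)                                   # row-major scan
    col = x.transpose(2, 3).flatten(2)                   # column-major scan
    return torch.stack([row, col, row.flip(-1), col.flip(-1)], dim=1)

def cross_merge(scans: torch.Tensor, H: int, W: int) -> torch.Tensor:
    """Invert the four scans back to the 2D layout and sum them, so every
    position aggregates information from all four traversal directions."""
    B, _, C, L = scans.shape
    row = scans[:, 0] + scans[:, 2].flip(-1)
    col = (scans[:, 1] + scans[:, 3].flip(-1)).view(B, C, W, H).transpose(2, 3)
    return row.view(B, C, H, W) + col

feat = torch.randn(2, 16, 8, 8)
merged = cross_merge(cross_scan(feat), 8, 8)
print(merged.shape)  # (2, 16, 8, 8); equals 4*feat here since no SSM is applied
```

In the full model, a selective state-space block would process each of the four 1D sequences between the scan and merge steps.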
+
+ comment: 21 pages, 12 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Data-Efficient Multimodal Fusion on a Single GPU CVPR 2024 + + +
+ The goal of multimodal alignment is to learn a single latent space that is +shared between multimodal inputs. The most powerful models in this space have +been trained using massive datasets of paired inputs and large-scale +computational resources, making them prohibitively expensive to train in many +practical scenarios. We surmise that existing unimodal encoders pre-trained on +large amounts of unimodal data should provide an effective bootstrap to create +multimodal models from unimodal ones at much lower costs. We therefore propose +FuseMix, a multimodal augmentation scheme that operates on the latent spaces of +arbitrary pre-trained unimodal encoders. Using FuseMix for multimodal +alignment, we achieve competitive performance -- and in certain cases +outperform state-of-the art methods -- in both image-text and audio-text +retrieval, with orders of magnitude less compute and data: for example, we +outperform CLIP on the Flickr30K text-to-image retrieval task with $\sim \! +600\times$ fewer GPU days and $\sim \! 80\times$ fewer image-text pairs. +Additionally, we show how our method can be applied to convert pre-trained +text-to-image generative models into audio-to-image ones. Code is available at: +https://github.com/layer6ai-labs/fusemix. + +
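The latent-space mixup idea can be sketched in a few lines; sharing the mixing coefficient and pairing permutation across modalities keeps mixed image and text latents aligned (alpha and the Beta distribution follow standard mixup practice, as an assumption):

```python
import torch

def fusemix(z_img: torch.Tensor, z_txt: torch.Tensor, alpha: float = 1.0):
    """Mixup-style augmentation in the latent spaces of frozen unimodal
    encoders: the same lambda and the same pairing permutation are used for
    both modalities so mixed pairs stay semantically aligned."""
    lam = torch.distributions.Beta(alpha, alpha).sample()
    perm = torch.randperm(z_img.size(0))
    z_img_mix = lam * z_img + (1 - lam) * z_img[perm]
    z_txt_mix = lam * z_txt + (1 - lam) * z_txt[perm]
    return z_img_mix, z_txt_mix   # fed to the lightweight fusion adapters

# Toy latents standing in for outputs of pre-trained image and text encoders
z_i, z_t = torch.randn(32, 512), torch.randn(32, 768)
mi, mt = fusemix(z_i, z_t)
print(mi.shape, mt.shape)
```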
+
+ comment: CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ Building-road Collaborative Extraction from Remotely Sensed Images via + Cross-Interaction + + +
+ Buildings are the basic carrier of social production and human life; roads are the links that interconnect social networks. Building and road
+information has important application value in the frontier fields of regional coordinated development, disaster prevention, autonomous driving, etc.
+Mapping buildings and roads from very high-resolution (VHR) remote sensing images has become a hot research topic. However, the existing methods often
+ignore the strong spatial correlation between roads and buildings and extract them in isolation. To fully utilize the complementary advantages between
+buildings and roads, we propose a building-road collaborative extraction method based on multi-task and cross-scale feature interaction to improve the
+accuracy of both tasks in a complementary way. A multi-task interaction module is proposed to exchange information across tasks and preserve the
+unique information of each task, which tackles the seesaw phenomenon in multitask learning. By considering the variation in appearance and structure
+between buildings and roads, a cross-scale interaction module is designed to automatically learn the optimal receptive field for different tasks.
+Compared with many existing methods that train each task individually, the proposed collaborative extraction method can utilize the complementary
+advantages between buildings and roads by the proposed inter-task and inter-scale feature interactions, and automatically select the optimal receptive
+field for different tasks. Experiments on a wide range of urban and rural scenarios show that the proposed algorithm can achieve building-road
+extraction with outstanding performance and efficiency.
+
+ comment: IEEE Transactions on Geoscience and Remote Sensing +
+
+
+
+
+ + ♻ ☆ BOTH2Hands: Inferring 3D Hands from Both Text Prompts and Body Dynamics CVPR 2024 + + +
+ The recently emerging text-to-motion advances have inspired numerous attempts for convenient and interactive human motion generation. Yet, existing
+methods are largely limited to generating body motions only without considering the rich two-hand motions, let alone handling various conditions like
+body dynamics or texts. To break the data bottleneck, we propose BOTH57M, a novel multi-modal dataset for two-hand motion generation. Our dataset
+includes accurate motion tracking for the human body and hands and provides pair-wise finger-level hand annotations and body descriptions. We further
+provide a strong baseline method, BOTH2Hands, for the novel task: generating vivid two-hand motions from both implicit body dynamics and explicit text
+prompts. We first warm up two parallel body-to-hand and text-to-hand diffusion models and then utilize a cross-attention transformer for motion
+blending. Extensive experiments and cross-validations demonstrate the effectiveness of our approach and dataset for generating convincing two-hand
+motions from the hybrid body-and-textual conditions. Our dataset and code will be disseminated to the community for future research.
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A + Conceptual Architecture + + +
+ This work proposes a WebXR-based cross-platform conceptual architecture, +leveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate +the development of an open, accessible, and interoperable metaverse. By +introducing the concept of spatial web app, this research contributes to the +discourse on the metaverse, offering an architecture that democratizes access +to virtual environments and extended reality through the web, and aligns with +Tim Berners-Lee's original vision of the World Wide Web as an open platform in +the digital realm. + +
+
+ comment: minor fixes (typos, URLs etc.) +
+
+
+
+
+ + ♻ ☆ Implicit Neural Representation for MRI Parallel Imaging Reconstruction + + +
+ Magnetic resonance imaging (MRI) usually faces lengthy acquisition times, +prompting the exploration of strategies such as parallel imaging (PI) to +alleviate this problem by periodically skipping specific K-space lines and +subsequently reconstructing high-quality images from the undersampled K-space. +Implicit neural representation (INR) has recently emerged as a promising deep +learning technique, characterizing objects as continuous functions of spatial +coordinates typically parameterized by a multilayer perceptron (MLP). In this +study, we propose a novel MRI PI reconstruction method that uses INR. Our +approach represents reconstructed fully-sampled images as functions of voxel +coordinates and prior feature vectors from undersampled images, addressing the +generalization challenges of INR. Specifically, we introduce a scale-embedded +encoder to generate scale-independent, voxel-specific features from MR images +across various undersampling scales. These features are then concatenated with +coordinate vectors to reconstruct fully-sampled MR images, facilitating +multiple-scale reconstructions. To evaluate our method's performance, we +conducted experiments using publicly available MRI datasets, comparing it with +alternative reconstruction techniques. Our quantitative assessment demonstrates +the superiority of our proposed method. + +
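A minimal coordinate-MLP sketch in the spirit of the description above, predicting a voxel intensity from normalized coordinates concatenated with a prior feature vector; layer sizes and the 2D setting are illustrative assumptions:

```python
import torch
import torch.nn as nn

class CoordINR(nn.Module):
    """Intensity at a voxel is predicted from its normalized coordinates
    concatenated with a feature vector extracted from the undersampled image
    at that location (e.g. by a scale-embedded encoder)."""
    def __init__(self, coord_dim=2, feat_dim=64, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(coord_dim + feat_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, coords, feats):
        # coords: (N, 2) in [-1, 1]; feats: (N, feat_dim) voxel-specific features
        return self.net(torch.cat([coords, feats], dim=-1)).squeeze(-1)

model = CoordINR()
xy = torch.rand(4096, 2) * 2 - 1
f = torch.randn(4096, 64)           # placeholder prior features
pred = model(xy, f)                 # predicted fully-sampled intensities
print(pred.shape)
```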
+
+
+
+
+ + ♻ ☆ Expediting Building Footprint Extraction from High-resolution Remote + Sensing Images via progressive lenient supervision + + +
+ The efficacy of building footprint segmentation from remotely sensed images +has been hindered by model transfer effectiveness. Many existing building +segmentation methods were developed upon the encoder-decoder architecture of +U-Net, in which the encoder is finetuned from the newly developed backbone +networks that are pre-trained on ImageNet. However, the heavy computational +burden of the existing decoder designs hampers the successful transfer of these +modern encoder networks to remote sensing tasks. Even the widely-adopted deep +supervision strategy fails to mitigate these challenges due to its invalid loss +in hybrid regions where foreground and background pixels are intermixed. In +this paper, we conduct a comprehensive evaluation of existing decoder network +designs for building footprint segmentation and propose an efficient framework +denoted as BFSeg to enhance learning efficiency and effectiveness. +Specifically, a densely-connected coarse-to-fine feature fusion decoder network +that facilitates easy and fast feature fusion across scales is proposed. +Moreover, considering the invalidity of hybrid regions in the down-sampled +ground truth during the deep supervision process, we present a lenient deep +supervision and distillation strategy that enables the network to learn proper +knowledge from deep supervision. Building upon these advancements, we have +developed a new family of building segmentation networks, which consistently +surpass prior works with outstanding performance and efficiency across a wide +range of newly developed encoder networks. + +
+
+
+
+
+ + ♻ ☆ Two-Phase Multi-Dose-Level PET Image Reconstruction with Dose Level + Awareness + + +
+ To obtain high-quality positron emission tomography (PET) images while minimizing radiation exposure, a range of methods have been designed to
+reconstruct standard-dose PET (SPET) from corresponding low-dose PET (LPET) images. However, most current methods merely learn the mapping between
+single-dose-level LPET and SPET images, but overlook the dose disparity of LPET images in clinical scenarios. In this paper, to reconstruct
+high-quality SPET images from multi-dose-level LPET images, we design a novel two-phase multi-dose-level PET reconstruction algorithm with dose level
+awareness, containing a pre-training phase and an SPET prediction phase. Specifically, the pre-training phase is devised to explore both fine-grained
+discriminative features and effective semantic representation. The SPET prediction phase adopts a coarse prediction network utilizing the pre-learned
+dose-level prior to generate a preliminary result, and a refinement network to precisely preserve the details. Experiments on the MICCAI 2022
+Ultra-low Dose PET Imaging Challenge dataset have demonstrated the superiority of our method.
+
+ comment: Accepted by ISBI2024 +
+
+
+
+
+ + ♻ ☆ Little Strokes Fell Great Oaks: Boosting the Hierarchical Features for + Multi-exposure Image Fusion + + +
+ In recent years, deep learning networks have made remarkable strides in the domain of multi-exposure image fusion. Nonetheless, prevailing approaches
+often involve directly feeding over-exposed and under-exposed images into the network, which leads to the under-utilization of inherent information
+present in the source images. Additionally, unsupervised techniques predominantly employ rudimentary weighted summation for color channel processing,
+culminating in an overall desaturated final image tone. To partially mitigate these issues, this study proposes a gamma correction module specifically
+designed to fully leverage latent information embedded within source images. Furthermore, a modified transformer block, equipped with self-attention
+mechanisms, is introduced to optimize the fusion process. Ultimately, a novel color enhancement algorithm is presented to augment image saturation
+while preserving intricate details. The source code is available at https://github.com/ZhiyingDu/BHFMEF.
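A toy sketch of a gamma correction module; making gamma a single learnable scalar is an illustrative simplification rather than the paper's design:

```python
import torch
import torch.nn as nn

class LearnableGamma(nn.Module):
    """Gamma correction applied to over-/under-exposed inputs before fusion,
    so latent detail in dark or saturated regions becomes usable."""
    def __init__(self, init_gamma: float = 1.0):
        super().__init__()
        self.log_gamma = nn.Parameter(torch.log(torch.tensor(init_gamma)))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x is expected in [0, 1]; the clamp keeps the power stable near 0.
        return x.clamp(min=1e-6) ** self.log_gamma.exp()

under = torch.rand(1, 3, 128, 128) * 0.3            # dummy under-exposed image
brightened = LearnableGamma(init_gamma=0.5)(under)
print(brightened.mean().item() > under.mean().item())  # True: mid-tones lifted
```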
+
+
+
+
+ + ♻ ☆ DREAM: Visual Decoding from Reversing Human Visual System + + +
+ In this work we present DREAM, an fMRI-to-image method for reconstructing +viewed images from brain activities, grounded on fundamental knowledge of the +human visual system. We craft reverse pathways that emulate the hierarchical +and parallel nature of how humans perceive the visual world. These tailored +pathways are specialized to decipher semantics, color, and depth cues from fMRI +data, mirroring the forward pathways from visual stimuli to fMRI recordings. To +do so, two components mimic the inverse processes within the human visual +system: the Reverse Visual Association Cortex (R-VAC) which reverses pathways +of this brain region, extracting semantics from fMRI data; the Reverse Parallel +PKM (R-PKM) component simultaneously predicting color and depth from fMRI +signals. The experiments indicate that our method outperforms the current +state-of-the-art models in terms of the consistency of appearance, structure, +and semantics. Code will be made publicly available to facilitate further +research in this field. + +
+
+ comment: Project Page: https://weihaox.github.io/DREAM +
+
+
+
+
+ + ♻ ☆ Pre-trained Model Guided Fine-Tuning for Zero-Shot Adversarial + Robustness CVPR 2024 + + +
+ Large-scale pre-trained vision-language models like CLIP have demonstrated +impressive performance across various tasks, and exhibit remarkable zero-shot +generalization capability, while they are also vulnerable to imperceptible +adversarial examples. Existing works typically employ adversarial training +(fine-tuning) as a defense method against adversarial examples. However, direct +application to the CLIP model may result in overfitting, compromising the +model's capacity for generalization. In this paper, we propose Pre-trained +Model Guided Adversarial Fine-Tuning (PMG-AFT) method, which leverages +supervision from the original pre-trained model by carefully designing an +auxiliary branch, to enhance the model's zero-shot adversarial robustness. +Specifically, PMG-AFT minimizes the distance between the features of +adversarial examples in the target model and those in the pre-trained model, +aiming to preserve the generalization features already captured by the +pre-trained model. Extensive Experiments on 15 zero-shot datasets demonstrate +that PMG-AFT significantly outperforms the state-of-the-art method, improving +the top-1 robust accuracy by an average of 4.99%. Furthermore, our approach +consistently improves clean accuracy by an average of 8.72%. Our code is +available at +https://github.com/serendipity1122/Pre-trained-Model-Guided-Fine-Tuning-for-Zero-Shot-Adversarial-Robustness. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ DG-TTA: Out-of-domain medical image segmentation through Domain + Generalization and Test-Time Adaptation + + +
+ Applying pre-trained medical segmentation models on out-of-domain images +often yields predictions of insufficient quality. Several strategies have been +proposed to maintain model performance, such as finetuning or unsupervised- and +source-free domain adaptation. These strategies set restrictive requirements +for data availability. In this study, we propose to combine domain +generalization and test-time adaptation to create a highly effective approach +for reusing pre-trained models in unseen target domains. Domain-generalized +pre-training on source data is used to obtain the best initial performance in +the target domain. We introduce the MIND descriptor previously used in image +registration tasks as a further technique to achieve generalization and present +superior performance for small-scale datasets compared to existing approaches. +At test-time, high-quality segmentation for every single unseen scan is ensured +by optimizing the model weights for consistency given different image +augmentations. That way, our method enables separate use of source and target +data and thus removes current data availability barriers. Moreover, the +presented method is highly modular as it does not require specific model +architectures or prior knowledge of involved domains and labels. We demonstrate +this by integrating it into the nnUNet, which is currently the most popular and +accurate framework for medical image segmentation. We employ multiple datasets +covering abdominal, cardiac, and lumbar spine scans and compose several +out-of-domain scenarios in this study. We demonstrate that our method, combined +with pre-trained whole-body CT models, can effectively segment MR images with +high accuracy in all of the aforementioned scenarios. Open-source code can be +found here: https://github.com/multimodallearning/DG-TTA + +
+
+
+
+
+ + ♻ ☆ ExpPoint-MAE: Better interpretability and performance for + self-supervised point cloud transformers + + +
+ In this paper, we delve into the properties of transformers, attained through
+self-supervision, in the point cloud domain. Specifically, we evaluate the
+effectiveness of Masked Autoencoding as a pretraining scheme, and explore
+Momentum Contrast as an alternative. In our study we investigate the impact of
+data quantity on the learned features, and uncover similarities in the
+transformer's behavior across domains. Through comprehensive visualizations, we
+observe that the transformer learns to attend to semantically meaningful
+regions, indicating that pretraining leads to a better understanding of the
+underlying geometry. Moreover, we examine the finetuning process and its effect
+on the learned representations. Based on that, we devise an unfreezing strategy
+which consistently outperforms our baseline without introducing any other
+modifications to the model or the training pipeline, and achieves
+state-of-the-art results in the classification task among transformer models.
+
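+ The abstract does not spell out the unfreezing schedule, so the snippet
+below is only a generic sketch of staged unfreezing for a pretrained
+transformer encoder during finetuning: deeper blocks become trainable first,
+earlier blocks later. The milestone epochs and the equal-sized block groups
+are illustrative assumptions, not the authors' strategy.
+
+import torch.nn as nn
+
+def staged_unfreeze(encoder: nn.ModuleList, epoch: int, milestones=(0, 5, 10)):
+    """Unfreeze the last groups of transformer blocks as training progresses.
+
+    encoder:    ModuleList of pretrained transformer blocks, initially frozen.
+    milestones: epochs at which one more group of blocks becomes trainable
+                (assumed schedule).
+    """
+    n = len(encoder)
+    groups_unfrozen = sum(epoch >= m for m in milestones)
+    cutoff = n - groups_unfrozen * (n // len(milestones))
+    for i, block in enumerate(encoder):
+        trainable = i >= cutoff          # only the deepest blocks train early on
+        for p in block.parameters():
+            p.requires_grad = trainable
+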
+
+
+
+
+ + ♻ ☆ AGILE3D: Attention Guided Interactive Multi-object 3D Segmentation ICLR 2024 + + +
+ During interactive segmentation, a model and a user work together to +delineate objects of interest in a 3D point cloud. In an iterative process, the +model assigns each data point to an object (or the background), while the user +corrects errors in the resulting segmentation and feeds them back into the +model. The current best practice formulates the problem as binary +classification and segments objects one at a time. The model expects the user +to provide positive clicks to indicate regions wrongly assigned to the +background and negative clicks on regions wrongly assigned to the object. +Sequentially visiting objects is wasteful since it disregards synergies between +objects: a positive click for a given object can, by definition, serve as a +negative click for nearby objects. Moreover, a direct competition between +adjacent objects can speed up the identification of their common boundary. We +introduce AGILE3D, an efficient, attention-based model that (1) supports +simultaneous segmentation of multiple 3D objects, (2) yields more accurate +segmentation masks with fewer user clicks, and (3) offers faster inference. Our +core idea is to encode user clicks as spatial-temporal queries and enable +explicit interactions between click queries as well as between them and the 3D +scene through a click attention module. Every time new clicks are added, we +only need to run a lightweight decoder that produces updated segmentation +masks. In experiments with four different 3D point cloud datasets, AGILE3D sets +a new state-of-the-art. Moreover, we also verify its practicality in real-world +setups with real user studies. + +
+
+ comment: ICLR 2024 camera-ready. Project page: https://ywyue.github.io/AGILE3D +
+
+
+
+
+ + ♻ ☆ Physics-guided Shape-from-Template: Monocular Video Perception through + Neural Surrogate Models + + +
+ 3D reconstruction of dynamic scenes is a long-standing problem in computer
+graphics and becomes increasingly difficult as less information is available.
+Shape-from-Template (SfT) methods aim to reconstruct a template-based geometry
+from RGB images or video sequences, often leveraging just a single monocular
+camera without depth information, such as regular smartphone recordings.
+Unfortunately, existing reconstruction methods are either unphysical and noisy
+or slow in optimization. To solve this problem, we propose a novel SfT
+reconstruction algorithm for cloth using a pre-trained neural surrogate model
+that is fast to evaluate, stable, and produces smooth reconstructions due to a
+regularizing physics simulation. Differentiable rendering of the simulated mesh
+enables pixel-wise comparisons between the reconstruction and a target video
+sequence that can be used for a gradient-based optimization procedure to
+extract not only shape information but also physical parameters such as
+stretching, shearing, or bending stiffness of the cloth. This allows us to
+retain a precise, stable, and smooth reconstructed geometry while reducing the
+runtime by a factor of 400-500 compared to $\phi$-SfT, a state-of-the-art
+physics-based SfT approach.
+
+
+
+
+
+ + ♻ ☆ Unsupervised Denoising for Signal-Dependent and Row-Correlated Imaging + Noise + + +
+ Accurate analysis of microscopy images is hindered by the presence of noise. +This noise is usually signal-dependent and often additionally correlated along +rows or columns of pixels. Current self- and unsupervised denoisers can address +signal-dependent noise, but none can reliably remove noise that is also row- or +column-correlated. Here, we present the first fully unsupervised deep +learning-based denoiser capable of handling imaging noise that is +row-correlated as well as signal-dependent. Our approach uses a Variational +Autoencoder (VAE) with a specially designed autoregressive decoder. This +decoder is capable of modeling row-correlated and signal-dependent noise but is +incapable of independently modeling underlying clean signal. The VAE therefore +produces latent variables containing only clean signal information, and these +are mapped back into image space using a proposed second decoder network. Our +method does not require a pre-trained noise model and can be trained from +scratch using unpaired noisy data. We show that our approach achieves +competitive results when applied to a range of different sensor types and +imaging modalities. + +
+
+
+
+
+ + ♻ ☆ Triple-CFN: Restructuring Conceptual Spaces for Enhancing Abstract + Reasoning process + + +
+ Abstract reasoning problems pose significant challenges to artificial +intelligence algorithms, demanding cognitive capabilities beyond those required +for perception tasks. This study introduces the Triple-CFN approach to tackle +the Bongard-Logo problem, achieving notable reasoning accuracy by implicitly +reorganizing the concept space of conflicting instances. Additionally, the +Triple-CFN paradigm proves effective for the RPM problem with necessary +modifications, yielding competitive results. To further enhance performance on +the RPM issue, we develop the Meta Triple-CFN network, which explicitly +structures the problem space while maintaining interpretability on progressive +patterns. The success of Meta Triple-CFN is attributed to its paradigm of +modeling the conceptual space, equivalent to normalizing reasoning information. +Based on this ideology, we introduce the Re-space layer, enhancing the +performance of both Meta Triple-CFN and Triple-CFN. This paper aims to +contribute to advancements in machine intelligence by exploring innovative +network designs for addressing abstract reasoning problems, paving the way for +further breakthroughs in this domain. + +
+
+ comment: 14 pages, 14 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Large-scale Multi-Modal Pre-trained Models: A Comprehensive Survey + + +
+ With the urgent demand for generalized deep models, many pre-trained big
+models have been proposed, such as BERT, ViT, GPT, etc. Inspired by the success
+of these models in single domains (like computer vision and natural language
+processing), the multi-modal pre-trained big models have also drawn more and
+more attention in recent years. In this work, we give a comprehensive survey of
+these models and hope this paper can provide new insights and help new
+researchers track the most cutting-edge works. Specifically, we first
+introduce the background of multi-modal pre-training by reviewing conventional
+deep learning and pre-training works in natural language processing,
+computer vision, and speech. Then, we introduce the task definition, key
+challenges, and advantages of multi-modal pre-training models (MM-PTMs), and
+discuss the MM-PTMs with a focus on data, objectives, network architectures,
+and knowledge-enhanced pre-training. After that, we introduce the downstream
+tasks used for the validation of large-scale MM-PTMs, including generative,
+classification, and regression tasks. We also give visualization and analysis
+of the model parameters and results on representative downstream tasks.
+Finally, we point out possible research directions for this topic that may
+benefit future works. In addition, we maintain a continuously updated paper
+list for large-scale pre-trained multi-modal big models:
+https://github.com/wangxiao5791509/MultiModal_BigModels_Survey. This paper has
+been published by the journal Machine Intelligence Research (MIR),
+https://link.springer.com/article/10.1007/s11633-022-1410-8, DOI:
+10.1007/s11633-022-1410-8, vol. 20, no. 4, pp. 447-482, 2023.
+
+
+ comment: Accepted by Machine Intelligence Research (MIR) +
+
+
+
+
+ + ♻ ☆ MixedNUTS: Training-Free Accuracy-Robustness Balance via Nonlinearly + Mixed Classifiers + + +
+ Adversarial robustness often comes at the cost of degraded accuracy, impeding
+the real-life application of robust classification models. Training-based
+solutions for better trade-offs are limited by incompatibilities with
+already-trained high-performance large models, necessitating the exploration of
+training-free ensemble approaches. Observing that robust models are more
+confident in correct predictions than in incorrect ones on clean and
+adversarial data alike, we speculate that amplifying this "benign confidence
+property" can reconcile accuracy and robustness in an ensemble setting. To
+this end, we propose "MixedNUTS", a training-free method where the output
+logits of a robust classifier and a standard non-robust classifier are
+processed by nonlinear transformations with only three parameters, which are
+optimized through an efficient algorithm. MixedNUTS then converts the
+transformed logits into probabilities and mixes them as the overall output. On
+CIFAR-10, CIFAR-100, and ImageNet datasets, experimental results with custom
+strong adaptive attacks demonstrate MixedNUTS's vastly improved accuracy and
+near-SOTA robustness -- it boosts CIFAR-100 clean accuracy by 7.86 points,
+sacrificing merely 0.87 points in robust accuracy.
+
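+ To make the ensembling idea concrete, here is a small sketch of mixing a
+standard and a robust classifier at the probability level, with a
+three-parameter nonlinear transform applied to the robust logits before the
+softmax. The specific transform (scale, exponent, bias) and the fixed mixing
+weight are illustrative assumptions; they are not the transform that
+MixedNUTS actually optimizes.
+
+import torch
+
+def mixed_inference(std_logits, rob_logits, s=1.0, p=1.0, c=0.0, alpha=0.5):
+    """Training-free mix of a standard and a robust classifier.
+
+    std_logits, rob_logits: (B, num_classes) outputs of the two frozen models.
+    s, p, c: three parameters of an assumed nonlinear transform of the robust
+             logits (scale, exponent, bias).
+    alpha:   weight of the standard classifier in the probability mix.
+    """
+    z = rob_logits - rob_logits.amax(dim=1, keepdim=True)      # numerical stability
+    z = s * torch.sign(z) * z.abs().pow(p) + c                  # nonlinear transform
+    p_std = std_logits.softmax(dim=1)
+    p_rob = z.softmax(dim=1)
+    return alpha * p_std + (1.0 - alpha) * p_rob                # mixed output probabilities
+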
+
+
+
+
+ + ♻ ☆ RS-Mamba for Large Remote Sensing Image Dense Prediction + + +
+ Context modeling is critical for remote sensing image dense prediction tasks. +Nowadays, the growing size of very-high-resolution (VHR) remote sensing images +poses challenges in effectively modeling context. While transformer-based +models possess global modeling capabilities, they encounter computational +challenges when applied to large VHR images due to their quadratic complexity. +The conventional practice of cropping large images into smaller patches results +in a notable loss of contextual information. To address these issues, we +propose the Remote Sensing Mamba (RSM) for dense prediction tasks in large VHR +remote sensing images. RSM is specifically designed to capture the global +context of remote sensing images with linear complexity, facilitating the +effective processing of large VHR images. Considering that the land covers in +remote sensing images are distributed in arbitrary spatial directions due to +characteristics of remote sensing over-head imaging, the RSM incorporates an +omnidirectional selective scan module to globally model the context of images +in multiple directions, capturing large spatial features from various +directions. Extensive experiments on semantic segmentation and change detection +tasks across various land covers demonstrate the effectiveness of the proposed +RSM. We designed simple yet effective models based on RSM, achieving +state-of-the-art performance on dense prediction tasks in VHR remote sensing +images without fancy training strategies. Leveraging the linear complexity and +global modeling capabilities, RSM achieves better efficiency and accuracy than +transformer-based models on large remote sensing images. Interestingly, we also +demonstrated that our model generally performs better with a larger image size +on dense prediction tasks. Our code is available at +https://github.com/walking-shadow/Official_Remote_Sensing_Mamba. + +
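+ As a sketch of what scanning a 2D feature map "in multiple directions" can
+look like, the code below builds flattening orders for row-wise, column-wise,
+and anti-diagonal traversals plus their reverses; a selective-scan (Mamba)
+block would then process each 1D sequence and the results would be merged.
+The exact set of directions and the merging rule used by RSM are not given
+here and are assumptions.
+
+import numpy as np
+
+def scan_orders(h, w):
+    """Return index orders that flatten an (h, w) grid along several directions."""
+    idx = np.arange(h * w).reshape(h, w)
+    diag = np.concatenate([idx[::-1].diagonal(k) for k in range(-h + 1, w)])
+    orders = {
+        "row": idx.reshape(-1),      # left-to-right, top-to-bottom
+        "col": idx.T.reshape(-1),    # top-to-bottom, left-to-right
+        "diag": diag,                # anti-diagonal sweep
+    }
+    # Reversed counterparts give the opposite scanning directions.
+    orders.update({name + "_rev": o[::-1] for name, o in orders.items()})
+    return orders
+
+# Usage sketch: tokens of shape (h*w, c) are re-ordered per direction before a
+# selective-scan block, then scattered back to the grid and averaged.
+# seqs = {name: tokens[order] for name, order in scan_orders(h, w).items()}
+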
+
+ comment: 15 pages,8 figures +
+
+
+
+
+ + ♻ ☆ Improving the Generalization of Segmentation Foundation Model under + Distribution Shift via Weakly Supervised Adaptation + + +
+ The success of large language models has inspired the computer vision
+community to explore image segmentation foundation models that are able to
+generalize in a zero/few-shot manner through prompt engineering.
+Segment-Anything (SAM), among others, is the state-of-the-art image
+segmentation foundation model demonstrating strong zero/few-shot
+generalization. Despite the success, recent studies reveal the weakness of SAM
+under strong distribution shift. In particular, SAM performs awkwardly on
+corrupted natural images, camouflaged images, medical images, etc. Motivated by
+the observations, we aim to develop a self-training based strategy to adapt SAM
+to the target distribution. Given the unique challenges of a large source
+dataset, high computation cost, and incorrect pseudo labels, we propose a
+weakly supervised self-training architecture with anchor regularization and
+low-rank finetuning to improve the robustness and computation efficiency of
+adaptation. We validate the effectiveness on 5 types of downstream segmentation
+tasks including natural clean/corrupted images, medical images, camouflaged
+images and robotic images. Our proposed method is task-agnostic in nature and
+outperforms pre-trained SAM and state-of-the-art domain adaptation methods on
+almost all downstream tasks with the same testing prompt inputs.
+
+
+ comment: 20 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Ear-Keeper: Real-time Diagnosis of Ear Lesions Utilizing + Ultralight-Ultrafast ConvNet and Large-scale Ear Endoscopic Dataset + + +
+ Deep learning-based ear disease diagnosis technology has proven effective and
+affordable. However, due to the lack of diverse ear endoscope datasets, the
+practical potential of deep learning models has not been thoroughly studied.
+Moreover, existing research failed to achieve a good trade-off between model
+inference speed and parameter size, rendering models inapplicable in real-world
+settings. To address these challenges, we constructed the first large-scale ear
+endoscopic dataset comprising eight types of ear diseases and disease-free
+samples from two institutions. Inspired by ShuffleNetV2, we proposed
+Best-EarNet, an ultrafast and ultralight network enabling real-time ear disease
+diagnosis. Best-EarNet incorporates a novel Local-Global Spatial Feature Fusion
+Module and a multi-scale supervision strategy, which helps the model focus on
+global-local information within feature maps at various levels. Utilizing
+transfer learning, Best-EarNet with only 0.77M parameters achieves an accuracy
+of 95.23% on an internal set of 22,581 images and 92.14% on an external set of
+1,652 images. In particular, it achieves an average of 80 frames per second on
+a CPU. From the perspective of model practicality, the proposed Best-EarNet is
+superior to state-of-the-art backbone models in ear lesion detection tasks.
+Most importantly, Ear-Keeper, an intelligent diagnosis system based on
+Best-EarNet, was developed successfully and deployed on common electronic
+devices (smartphones, tablet computers and personal computers). In the future,
+Ear-Keeper has the potential to assist the public and healthcare providers in
+performing comprehensive scanning and diagnosis of the ear canal in real-time
+video, thereby promptly detecting ear lesions.
+
+
+ comment: 18 pages,8 figures +
+
+
+
+
+ + ♻ ☆ GPT as Psychologist? Preliminary Evaluations for GPT-4V on Visual + Affective Computing + + +
+ Multimodal large language models (MLLMs) are designed to process and
+integrate information from multiple sources, such as text, speech, images, and
+videos. Despite their success in language understanding, it is critical to
+evaluate their performance on downstream tasks for better human-centric
+applications. This paper assesses the application of MLLMs with 5 crucial
+abilities for affective computing, spanning visual affective tasks and
+reasoning tasks. The results show that GPT-4V has high accuracy in facial
+action unit recognition and micro-expression detection, while its general
+facial expression recognition performance is not accurate. We also highlight
+the challenges of achieving fine-grained micro-expression recognition and the
+potential for further study, and demonstrate the versatility and potential of
+GPT-4V for handling advanced tasks in emotion recognition and related fields by
+integrating it with task-related agents for more complex tasks, such as heart
+rate estimation through signal processing. In conclusion, this paper provides
+valuable insights into the potential applications and challenges of MLLMs in
+human-centric computing. Illustrative examples are available at
+https://github.com/EnVision-Research/GPT4Affectivity.
+
+
+
+
+
+ + ♻ ☆ GaussianImage: 1000 FPS Image Representation and Compression by 2D + Gaussian Splatting + + +
+ Implicit neural representations (INRs) recently achieved great success in +image representation and compression, offering high visual quality and fast +rendering speeds with 10-1000 FPS, assuming sufficient GPU resources are +available. However, this requirement often hinders their use on low-end devices +with limited memory. In response, we propose a groundbreaking paradigm of image +representation and compression by 2D Gaussian Splatting, named GaussianImage. +We first introduce 2D Gaussian to represent the image, where each Gaussian has +8 parameters including position, covariance and color. Subsequently, we unveil +a novel rendering algorithm based on accumulated summation. Remarkably, our +method with a minimum of 3$\times$ lower GPU memory usage and 5$\times$ faster +fitting time not only rivals INRs (e.g., WIRE, I-NGP) in representation +performance, but also delivers a faster rendering speed of 1500-2000 FPS +regardless of parameter size. Furthermore, we integrate existing vector +quantization technique to build an image codec. Experimental results +demonstrate that our codec attains rate-distortion performance comparable to +compression-based INRs such as COIN and COIN++, while facilitating decoding +speeds of approximately 1000 FPS. Additionally, preliminary proof of concept +shows that our codec surpasses COIN and COIN++ in performance when using +partial bits-back coding. Code will be available at +https://github.com/Xinjie-Q/GaussianImage. + +
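+ A minimal NumPy sketch of the representation described above: each 2D
+Gaussian carries 8 parameters (2 for position, 3 for covariance, 3 for color)
+and the image is rendered by simply accumulating the weighted colors of all
+Gaussians at every pixel, with no depth sorting or alpha compositing. The
+covariance parameterization via a lower-triangular factor is an assumption;
+GaussianImage's actual parameterization and optimized renderer are more
+involved.
+
+import numpy as np
+
+def render_2d_gaussians(params, height, width):
+    """Accumulated-summation rendering of 2D Gaussians.
+
+    params: (N, 8) array with rows [x, y, l11, l21, l22, r, g, b], where the
+            lower-triangular L = [[l11, 0], [l21, l22]] gives Sigma = L @ L.T
+            (an assumed way to keep the covariance positive semi-definite).
+    """
+    ys, xs = np.mgrid[0:height, 0:width]
+    pix = np.stack([xs, ys], axis=-1).reshape(-1, 2).astype(np.float64)
+    image = np.zeros((height * width, 3))
+    for x, y, l11, l21, l22, r, g, b in params:
+        L = np.array([[l11, 0.0], [l21, l22]])
+        sigma = L @ L.T + 1e-6 * np.eye(2)
+        d = pix - np.array([x, y])
+        m = np.einsum("ni,ij,nj->n", d, np.linalg.inv(sigma), d)  # Mahalanobis term
+        image += np.exp(-0.5 * m)[:, None] * np.array([r, g, b])  # accumulate, no sorting
+    return image.reshape(height, width, 3)
+
+# Fitting would optimize `params` against the target image with a plain L2 loss.
+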
+
+
+
+
+ + ♻ ☆ Re-DiffiNet: Modeling discrepancies in tumor segmentation using + diffusion models + + +
+ Identification of tumor margins is essential for surgical decision-making for +glioblastoma patients and provides reliable assistance for neurosurgeons. +Despite improvements in deep learning architectures for tumor segmentation over +the years, creating a fully autonomous system suitable for clinical floors +remains a formidable challenge because the model predictions have not yet +reached the desired level of accuracy and generalizability for clinical +applications. Generative modeling techniques have seen significant improvements +in recent times. Specifically, Generative Adversarial Networks (GANs) and +Denoising-diffusion-based models (DDPMs) have been used to generate +higher-quality images with fewer artifacts and finer attributes. In this work, +we introduce a framework called Re-Diffinet for modeling the discrepancy +between the outputs of a segmentation model like U-Net and the ground truth, +using DDPMs. By explicitly modeling the discrepancy, the results show an +average improvement of 0.55\% in the Dice score and 16.28\% in HD95 from +cross-validation over 5-folds, compared to the state-of-the-art U-Net +segmentation model. + +
+
+
+
+
+ + ♻ ☆ AUEditNet: Dual-Branch Facial Action Unit Intensity Manipulation with + Implicit Disentanglement + + +
+ Facial action unit (AU) intensity plays a pivotal role in quantifying +fine-grained expression behaviors, which is an effective condition for facial +expression manipulation. However, publicly available datasets containing +intensity annotations for multiple AUs remain severely limited, often featuring +a restricted number of subjects. This limitation places challenges to the AU +intensity manipulation in images due to disentanglement issues, leading +researchers to resort to other large datasets with pretrained AU intensity +estimators for pseudo labels. In addressing this constraint and fully +leveraging manual annotations of AU intensities for precise manipulation, we +introduce AUEditNet. Our proposed model achieves impressive intensity +manipulation across 12 AUs, trained effectively with only 18 subjects. +Utilizing a dual-branch architecture, our approach achieves comprehensive +disentanglement of facial attributes and identity without necessitating +additional loss functions or implementing with large batch sizes. This approach +offers a potential solution to achieve desired facial attribute editing despite +the dataset's limited subject count. Our experiments demonstrate AUEditNet's +superior accuracy in editing AU intensities, affirming its capability in +disentangling facial attributes and identity within a limited subject pool. +AUEditNet allows conditioning by either intensity values or target images, +eliminating the need for constructing AU combinations for specific facial +expression synthesis. Moreover, AU intensity estimation, as a downstream task, +validates the consistency between real and edited images, confirming the +effectiveness of our proposed AU intensity manipulation method. + +
+
+
+
+
+ + ♻ ☆ Ultra-Range Gesture Recognition using a Web-Camera in Human-Robot + Interaction + + +
+ Hand gestures play a significant role in human interactions where non-verbal
+intentions, thoughts and commands are conveyed. In Human-Robot Interaction
+(HRI), hand gestures offer a similar and efficient medium for conveying clear
+and rapid directives to a robotic agent. However, state-of-the-art vision-based
+methods for gesture recognition have been shown to be effective only up to a
+user-camera distance of seven meters. Such a short distance range limits
+practical HRI with, for example, service robots, search and rescue robots and
+drones. In this work, we address the Ultra-Range Gesture Recognition (URGR)
+problem by aiming for a recognition distance of up to 25 meters and in the
+context of HRI. We propose the URGR framework, a novel deep-learning framework
+that uses solely a simple RGB camera. Gesture inference is based on a single
+image. First, a novel super-resolution model termed High-Quality Network
+(HQ-Net) uses a set of self-attention and convolutional layers to enhance the
+low-resolution image of the user. Then, we propose a novel URGR classifier
+termed Graph Vision Transformer (GViT) which takes the enhanced image as input.
+GViT combines the benefits of a Graph Convolutional Network (GCN) and a
+modified Vision Transformer (ViT). Evaluation of the proposed framework over
+diverse test data yields a high recognition rate of 98.1%. The framework has
+also exhibited superior performance compared to human recognition at
+ultra-range distances. With the framework, we analyze and demonstrate the
+performance of an autonomous quadruped robot directed by human gestures in
+complex ultra-range indoor and outdoor environments, achieving a 96%
+recognition rate on average.
+
+
+ comment: Engineering Applications of Artificial Intelligence, In press +
+
+
+
+
+ + ♻ ☆ Concept-based Analysis of Neural Networks via Vision-Language Models + + +
+ The analysis of vision-based deep neural networks (DNNs) is highly desirable +but it is very challenging due to the difficulty of expressing formal +specifications for vision tasks and the lack of efficient verification +procedures. In this paper, we propose to leverage emerging multimodal, +vision-language, foundation models (VLMs) as a lens through which we can reason +about vision models. VLMs have been trained on a large body of images +accompanied by their textual description, and are thus implicitly aware of +high-level, human-understandable concepts describing the images. We describe a +logical specification language $\texttt{Con}_{\texttt{spec}}$ designed to +facilitate writing specifications in terms of these concepts. To define and +formally check $\texttt{Con}_{\texttt{spec}}$ specifications, we build a map +between the internal representations of a given vision model and a VLM, leading +to an efficient verification procedure of natural-language properties for +vision models. We demonstrate our techniques on a ResNet-based classifier +trained on the RIVAL-10 dataset using CLIP as the multimodal model. + +
+
+
+
+
+ + ♻ ☆ Learning to Predict 3D Rotational Dynamics from Images of a Rigid Body + with Unknown Mass Distribution + + +
+ In many real-world settings, image observations of freely rotating 3D rigid +bodies may be available when low-dimensional measurements are not. However, the +high-dimensionality of image data precludes the use of classical estimation +techniques to learn the dynamics. The usefulness of standard deep learning +methods is also limited, because an image of a rigid body reveals nothing about +the distribution of mass inside the body, which, together with initial angular +velocity, is what determines how the body will rotate. We present a +physics-based neural network model to estimate and predict 3D rotational +dynamics from image sequences. We achieve this using a multi-stage prediction +pipeline that maps individual images to a latent representation homeomorphic to +$\mathbf{SO}(3)$, computes angular velocities from latent pairs, and predicts +future latent states using the Hamiltonian equations of motion. We demonstrate +the efficacy of our approach on new rotating rigid-body datasets of sequences +of synthetic images of rotating objects, including cubes, prisms and +satellites, with unknown uniform and non-uniform mass distributions. Our model +outperforms competing baselines on our datasets, producing better qualitative +predictions and reducing the error observed for the state-of-the-art +Hamiltonian Generative Network by a factor of 2. + +
+
+ comment: Previously appeared as arXiv:2209.11355v2, which was submitted as a + replacement by accident. arXiv admin note: text overlap with arXiv:2209.11355 +
+
+
+
+
+ + ♻ ☆ Mask4Former: Mask Transformer for 4D Panoptic Segmentation ICRA 2024 + + +
+ Accurately perceiving and tracking instances over time is essential for the +decision-making processes of autonomous agents interacting safely in dynamic +environments. With this intention, we propose Mask4Former for the challenging +task of 4D panoptic segmentation of LiDAR point clouds. Mask4Former is the +first transformer-based approach unifying semantic instance segmentation and +tracking of sparse and irregular sequences of 3D point clouds into a single +joint model. Our model directly predicts semantic instances and their temporal +associations without relying on hand-crafted non-learned association strategies +such as probabilistic clustering or voting-based center prediction. Instead, +Mask4Former introduces spatio-temporal instance queries that encode the +semantic and geometric properties of each semantic tracklet in the sequence. In +an in-depth study, we find that promoting spatially compact instance +predictions is critical as spatio-temporal instance queries tend to merge +multiple semantically similar instances, even if they are spatially distant. To +this end, we regress 6-DOF bounding box parameters from spatio-temporal +instance queries, which are used as an auxiliary task to foster spatially +compact predictions. Mask4Former achieves a new state-of-the-art on the +SemanticKITTI test set with a score of 68.4 LSTQ. + +
+
+ comment: Renamed from MASK4D to Mask4Former. ICRA 2024. Project page: + https://vision.rwth-aachen.de/Mask4Former +
+
+
+
+
+ + ♻ ☆ Enhancing Hierarchical Transformers for Whole Brain Segmentation with + Intracranial Measurements Integration + + +
+ Whole brain segmentation with magnetic resonance imaging (MRI) enables the
+non-invasive measurement of brain regions, including total intracranial volume
+(TICV) and posterior fossa volume (PFV). Enhancing the existing whole brain
+segmentation methodology to incorporate intracranial measurements offers a
+heightened level of comprehensiveness in the analysis of brain structures.
+Despite its potential, the task of generalizing deep learning techniques for
+intracranial measurements faces data availability constraints due to limited
+manually annotated atlases encompassing whole brain and TICV/PFV labels. In
+this paper, we enhance the hierarchical transformer UNesT for whole brain
+segmentation to segment the whole brain into 133 classes and estimate TICV/PFV
+simultaneously. To address the problem of data scarcity, the model is first
+pretrained on 4859 T1-weighted (T1w) 3D volumes sourced from 8 different sites.
+These volumes are processed through a multi-atlas segmentation pipeline for
+label generation, while TICV/PFV labels are unavailable. Subsequently, the
+model is finetuned with 45 T1w 3D volumes from the Open Access Series of
+Imaging Studies (OASIS) where both 133 whole brain classes and TICV/PFV labels
+are available. We evaluate our method with Dice similarity coefficients (DSC).
+We show that our model is able to conduct precise TICV/PFV estimation while
+maintaining comparable performance on the 132 brain regions. Code and
+trained model are available at:
+https://github.com/MASILab/UNesT/tree/main/wholebrainSeg.
+
+
+
+
+
+ + ♻ ☆ Detecting Image Attribution for Text-to-Image Diffusion Models in RGB + and Beyond + + +
+ Modern text-to-image (T2I) diffusion models can generate images with +remarkable realism and creativity. These advancements have sparked research in +fake image detection and attribution, yet prior studies have not fully explored +the practical and scientific dimensions of this task. In addition to +attributing images to 12 state-of-the-art T2I generators, we provide extensive +analyses on what inference stage hyperparameters and image modifications are +discernible. Our experiments reveal that initialization seeds are highly +detectable, along with other subtle variations in the image generation process +to some extent. We further investigate what visual traces are leveraged in +image attribution by perturbing high-frequency details and employing mid-level +representations of image style and structure. Notably, altering high-frequency +information causes only slight reductions in accuracy, and training an +attributor on style representations outperforms training on RGB images. Our +analyses underscore that fake images are detectable and attributable at various +levels of visual granularity than previously explored. + +
+
+ comment: Code available at https://github.com/k8xu/ImageAttribution +
+
+
+
+
+ + ♻ ☆ Hierarchical Augmentation and Distillation for Class Incremental + Audio-Visual Video Recognition + + +
+ Audio-visual video recognition (AVVR) aims to integrate audio and visual +clues to categorize videos accurately. While existing methods train AVVR models +using provided datasets and achieve satisfactory results, they struggle to +retain historical class knowledge when confronted with new classes in +real-world situations. Currently, there are no dedicated methods for addressing +this problem, so this paper concentrates on exploring Class Incremental +Audio-Visual Video Recognition (CIAVVR). For CIAVVR, since both stored data and +learned model of past classes contain historical knowledge, the core challenge +is how to capture past data knowledge and past model knowledge to prevent +catastrophic forgetting. We introduce Hierarchical Augmentation and +Distillation (HAD), which comprises the Hierarchical Augmentation Module (HAM) +and Hierarchical Distillation Module (HDM) to efficiently utilize the +hierarchical structure of data and models, respectively. Specifically, HAM +implements a novel augmentation strategy, segmental feature augmentation, to +preserve hierarchical model knowledge. Meanwhile, HDM introduces newly designed +hierarchical (video-distribution) logical distillation and hierarchical +(snippet-video) correlative distillation to capture and maintain the +hierarchical intra-sample knowledge of each data and the hierarchical +inter-sample knowledge between data, respectively. Evaluations on four +benchmarks (AVE, AVK-100, AVK-200, and AVK-400) demonstrate that the proposed +HAD effectively captures hierarchical information in both data and models, +resulting in better preservation of historical class knowledge and improved +performance. Furthermore, we provide a theoretical analysis to support the +necessity of the segmental feature augmentation strategy. + +
+
+ comment: Submitted to TPAMI +
+
+
+
+
+ + ♻ ☆ Elucidating the Exposure Bias in Diffusion Models ICLR 2024 + + +
+ Diffusion models have demonstrated impressive generative capabilities, but +their \textit{exposure bias} problem, described as the input mismatch between +training and sampling, lacks in-depth exploration. In this paper, we +systematically investigate the exposure bias problem in diffusion models by +first analytically modelling the sampling distribution, based on which we then +attribute the prediction error at each sampling step as the root cause of the +exposure bias issue. Furthermore, we discuss potential solutions to this issue +and propose an intuitive metric for it. Along with the elucidation of exposure +bias, we propose a simple, yet effective, training-free method called Epsilon +Scaling to alleviate the exposure bias. We show that Epsilon Scaling explicitly +moves the sampling trajectory closer to the vector field learned in the +training phase by scaling down the network output, mitigating the input +mismatch between training and sampling. Experiments on various diffusion +frameworks (ADM, DDIM, EDM, LDM, DiT, PFGM++) verify the effectiveness of our +method. Remarkably, our ADM-ES, as a state-of-the-art stochastic sampler, +obtains 2.17 FID on CIFAR-10 under 100-step unconditional generation. The code +is available at \url{https://github.com/forever208/ADM-ES} and +\url{https://github.com/forever208/EDM-ES}. + +
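+ To illustrate how Epsilon Scaling plugs into sampling, the sketch below
+shows a deterministic DDIM-style update in which the network's predicted
+noise is divided by a factor slightly above one before it is used, i.e. the
+network output is scaled down as described above. The constant value of the
+scaling factor and the generic model interface are assumptions; the paper
+works out its own schedules per diffusion framework.
+
+import torch
+
+@torch.no_grad()
+def ddim_step_with_eps_scaling(model, x_t, t, t_prev, alphas_cumprod, lam=1.005):
+    """One deterministic DDIM step with Epsilon Scaling (eps divided by lam).
+
+    alphas_cumprod: 1D tensor of cumulative products of (1 - beta_t).
+    lam:            scaling factor slightly above 1 (assumed constant here).
+    """
+    a_t, a_prev = alphas_cumprod[t], alphas_cumprod[t_prev]
+    eps = model(x_t, t) / lam                          # scaled-down network output
+    x0_pred = (x_t - (1.0 - a_t).sqrt() * eps) / a_t.sqrt()
+    return a_prev.sqrt() * x0_pred + (1.0 - a_prev).sqrt() * eps
+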
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Discovering Closed-Loop Failures of Vision-Based Controllers via + Reachability Analysis + + +
+ Machine learning driven image-based controllers allow robotic systems to take +intelligent actions based on the visual feedback from their environment. +Understanding when these controllers might lead to system safety violations is +important for their integration in safety-critical applications and engineering +corrective safety measures for the system. Existing methods leverage +simulation-based testing (or falsification) to find the failures of +vision-based controllers, i.e., the visual inputs that lead to closed-loop +safety violations. However, these techniques do not scale well to the scenarios +involving high-dimensional and complex visual inputs, such as RGB images. In +this work, we cast the problem of finding closed-loop vision failures as a +Hamilton-Jacobi (HJ) reachability problem. Our approach blends simulation-based +analysis with HJ reachability methods to compute an approximation of the +backward reachable tube (BRT) of the system, i.e., the set of unsafe states for +the system under vision-based controllers. Utilizing the BRT, we can tractably +and systematically find the system states and corresponding visual inputs that +lead to closed-loop failures. These visual inputs can be subsequently analyzed +to find the input characteristics that might have caused the failure. Besides +its scalability to high-dimensional visual inputs, an explicit computation of +BRT allows the proposed approach to capture non-trivial system failures that +are difficult to expose via random simulations. We demonstrate our framework on +two case studies involving an RGB image-based neural network controller for (a) +autonomous indoor navigation, and (b) autonomous aircraft taxiing. + +
+
+
+
+
+
+ ♻ ☆ nnMobileNet: Rethinking CNN for Retinopathy Research CVPR
+
+
+
+ Over the past few decades, convolutional neural networks (CNNs) have been at +the forefront of the detection and tracking of various retinal diseases (RD). +Despite their success, the emergence of vision transformers (ViT) in the 2020s +has shifted the trajectory of RD model development. The leading-edge +performance of ViT-based models in RD can be largely credited to their +scalability-their ability to improve as more parameters are added. As a result, +ViT-based models tend to outshine traditional CNNs in RD applications, albeit +at the cost of increased data and computational demands. ViTs also differ from +CNNs in their approach to processing images, working with patches rather than +local regions, which can complicate the precise localization of small, variably +presented lesions in RD. In our study, we revisited and updated the +architecture of a CNN model, specifically MobileNet, to enhance its utility in +RD diagnostics. We found that an optimized MobileNet, through selective +modifications, can surpass ViT-based models in various RD benchmarks, including +diabetic retinopathy grading, detection of multiple fundus diseases, and +classification of diabetic macular edema. The code is available at +https://github.com/Retinal-Research/NN-MOBILENET + +
+
+ comment: Accepted as a conference paper to 2024 CVPRW +
+
+
+
+
+ + ♻ ☆ LongVLM: Efficient Long Video Understanding via Large Language Models + + +
+ Empowered by Large Language Models (LLMs), recent advancements in VideoLLMs +have driven progress in various video understanding tasks. These models encode +video representations through pooling or query aggregation over a vast number +of visual tokens, making computational and memory costs affordable. Despite +successfully providing an overall comprehension of video content, existing +VideoLLMs still face challenges in achieving detailed understanding in videos +due to overlooking local information in long-term videos. To tackle this +challenge, we introduce LongVLM, a straightforward yet powerful VideoLLM for +long video understanding, building upon the observation that long videos often +consist of sequential key events, complex actions, and camera movements. Our +approach proposes to decompose long videos into multiple short-term segments +and encode local features for each local segment via a hierarchical token +merging module. These features are concatenated in temporal order to maintain +the storyline across sequential short-term segments. Additionally, we propose +to integrate global semantics into each local feature to enhance context +understanding. In this way, we encode video representations that incorporate +both local and global information, enabling the LLM to generate comprehensive +responses for long-term videos. Experimental results on the VideoChatGPT +benchmark and zero-shot video question-answering datasets demonstrate the +superior capabilities of our model over the previous state-of-the-art methods. +Qualitative examples demonstrate that our model produces more precise responses +for long videos understanding. Code will be available at +https://github.com/ziplab/LongVLM. + +
+
+
+
+
+ + ♻ ☆ GraphBEV: Towards Robust BEV Feature Alignment for Multi-Modal 3D Object + Detection + + +
+ Integrating LiDAR and camera information into Bird's-Eye-View (BEV) +representation has emerged as a crucial aspect of 3D object detection in +autonomous driving. However, existing methods are susceptible to the inaccurate +calibration relationship between LiDAR and the camera sensor. Such inaccuracies +result in errors in depth estimation for the camera branch, ultimately causing +misalignment between LiDAR and camera BEV features. In this work, we propose a +robust fusion framework called Graph BEV. Addressing errors caused by +inaccurate point cloud projection, we introduce a Local Align module that +employs neighbor-aware depth features via Graph matching. Additionally, we +propose a Global Align module to rectify the misalignment between LiDAR and +camera BEV features. Our Graph BEV framework achieves state-of-the-art +performance, with an mAP of 70.1\%, surpassing BEV Fusion by 1.6\% on the +nuscenes validation set. Importantly, our Graph BEV outperforms BEV Fusion by +8.3\% under conditions with misalignment noise. + +
+
+
+
+
+ + ♻ ☆ Exploring the Potential of Large Foundation Models for Open-Vocabulary + HOI Detection + + +
+ Open-vocabulary human-object interaction (HOI) detection, which is concerned +with the problem of detecting novel HOIs guided by natural language, is crucial +for understanding human-centric scenes. However, prior zero-shot HOI detectors +often employ the same levels of feature maps to model HOIs with varying +distances, leading to suboptimal performance in scenes containing human-object +pairs with a wide range of distances. In addition, these detectors primarily +rely on category names and overlook the rich contextual information that +language can provide, which is essential for capturing open vocabulary concepts +that are typically rare and not well-represented by category names alone. In +this paper, we introduce a novel end-to-end open vocabulary HOI detection +framework with conditional multi-level decoding and fine-grained semantic +enhancement (CMD-SE), harnessing the potential of Visual-Language Models +(VLMs). Specifically, we propose to model human-object pairs with different +distances with different levels of feature maps by incorporating a soft +constraint during the bipartite matching process. Furthermore, by leveraging +large language models (LLMs) such as GPT models, we exploit their extensive +world knowledge to generate descriptions of human body part states for various +interactions. Then we integrate the generalizable and fine-grained semantics of +human body parts to improve interaction recognition. Experimental results on +two datasets, SWIG-HOI and HICO-DET, demonstrate that our proposed method +achieves state-of-the-art results in open vocabulary HOI detection. The code +and models are available at https://github.com/ltttpku/CMD-SE-release. + +
+
+
+
+
+ + ♻ ☆ Towards Enhanced Analysis of Lung Cancer Lesions in EBUS-TBNA -- A + Semi-Supervised Video Object Detection Method + + +
+ This study aims to establish a computer-aided diagnostic system for lung
+lesions using bronchoscope endobronchial ultrasound (EBUS) to assist physicians
+in identifying lesion areas. During EBUS-transbronchial needle aspiration
+(EBUS-TBNA) procedures, physicians rely on grayscale ultrasound images to
+determine the location of lesions. However, these images often contain
+significant noise and can be influenced by surrounding tissues or blood
+vessels, making interpretation challenging. Previous research has lacked the
+application of object detection models to EBUS-TBNA, and there has been no
+well-defined solution for annotating the EBUS-TBNA dataset. In related studies
+on ultrasound images, although models have been successful in capturing target
+regions for their respective tasks, their training and predictions have been
+based on two-dimensional images, limiting their ability to leverage temporal
+features for improved predictions. This study introduces a three-dimensional
+image-based object detection model. It utilizes an attention mechanism to
+capture temporal correlations and a filtering mechanism to select relevant
+information from previous frames. Subsequently, a teacher-student model
+training approach is employed to optimize the model further, leveraging
+unlabeled data. To mitigate the impact of poor-quality pseudo-labels on the
+student model, we add a Gaussian Mixture Model (GMM) to ensure the quality of
+pseudo-labels.
+
+
+
+
+
+ + ♻ ☆ Using Few-Shot Learning to Classify Primary Lung Cancer and Other + Malignancy with Lung Metastasis in Cytological Imaging via Endobronchial + Ultrasound Procedures + + +
+ This study aims to establish a computer-aided diagnosis system for +endobronchial ultrasound (EBUS) surgery to assist physicians in the preliminary +diagnosis of metastatic cancer. This involves arranging immediate examinations +for other sites of metastatic cancer after EBUS surgery, eliminating the need +to wait for reports, thereby shortening the waiting time by more than half and +enabling patients to detect other cancers earlier, allowing for early planning +and implementation of treatment plans. Unlike previous studies on cell image +classification, which have abundant datasets for training, this study must also +be able to make effective classifications despite the limited amount of case +data for lung metastatic cancer. In the realm of small data set classification +methods, Few-shot learning (FSL) has become mainstream in recent years. Through +its ability to train on small datasets and its strong generalization +capabilities, FSL shows potential in this task of lung metastatic cell image +classification. This study will adopt the approach of Few-shot learning, +referencing existing proposed models, and designing a model architecture for +classifying lung metastases cell images. Batch Spectral Regularization (BSR) +will be incorporated as a loss update parameter, and the Finetune method of PMF +will be modified. In terms of test results, the addition of BSR and the +modified Finetune method further increases the accuracy by 8.89% to 65.60%, +outperforming other FSL methods. This study confirms that FSL is superior to +supervised and transfer learning in classifying metastatic cancer and +demonstrates that using BSR as a loss function and modifying Finetune can +enhance the model's capabilities. + +
+
+
+
+
+ + ♻ ☆ Pyramid Deep Fusion Network for Two-Hand Reconstruction from RGB-D + Images + + +
+ Accurately recovering the dense 3D mesh of both hands from monocular images
+poses considerable challenges due to occlusions and projection ambiguity. Most
+of the existing methods extract features from color images to estimate the
+root-aligned hand meshes, which neglect the crucial depth and scale information
+in the real world. Given the noisy sensor measurements with limited resolution,
+depth-based methods predict 3D keypoints rather than a dense mesh. These
+limitations motivate us to take advantage of these two complementary inputs to
+acquire dense hand meshes on a real-world scale. In this work, we propose an
+end-to-end framework for recovering dense meshes for both hands, which employs
+single-view RGB-D image pairs as input. The primary challenge lies in
+effectively utilizing two different input modalities to mitigate the blurring
+effects in RGB images and noise in depth images. Instead of directly treating
+depth maps as additional channels for RGB images, we encode the depth
+information into the unordered point cloud to preserve more geometric details.
+Specifically, our framework employs ResNet50 and PointNet++ to derive features
+from RGB and point cloud, respectively. Additionally, we introduce a novel
+pyramid deep fusion network (PDFNet) to aggregate features at different scales,
+which demonstrates superior efficacy compared to previous fusion strategies.
+Furthermore, we employ a GCN-based decoder to process the fused features and
+recover the corresponding 3D pose and dense mesh. Through comprehensive
+ablation experiments, we have not only demonstrated the effectiveness of our
+proposed fusion algorithm but also outperformed the state-of-the-art approaches
+on publicly available datasets. To reproduce the results, we will make our
+source code and models publicly available at
+https://github.com/zijinxuxu/PDFNet.
+
+
+ comment: Accepted by TCSVT +
+
+
+
+
+ + ♻ ☆ CitDet: A Benchmark Dataset for Citrus Fruit Detection + + +
+ In this letter, we present a new dataset to advance the state of the art in
+detecting citrus fruit and accurately estimating yield on trees affected by the
+Huanglongbing (HLB) disease in orchard environments via imaging. Despite the
+fact that significant progress has been made in solving the fruit detection
+problem, the lack of publicly available datasets has complicated direct
+comparison of results. For instance, citrus detection has long been of interest
+to the agricultural research community, yet there is an absence of work,
+particularly involving public datasets of citrus affected by HLB. To address
+this issue, we enhance state-of-the-art object detection methods for use in
+typical orchard settings. Concretely, we provide high-resolution images of
+citrus trees located in an area known to be highly affected by HLB, along with
+high-quality bounding box annotations of citrus fruit. Fruit on both the trees
+and the ground are labeled to allow for identification of fruit location, which
+contributes to advancements in yield estimation and a potential measure of HLB
+impact via fruit drop. The dataset consists of over 32,000 bounding box
+annotations for fruit instances contained in 579 high-resolution images. In
+summary, our contributions are the following: (i) we introduce a novel dataset
+along with baseline performance benchmarks on multiple contemporary object
+detection algorithms, (ii) we show the ability to accurately capture fruit
+location on the tree or on the ground, and finally (iii) we present a
+correlation of our results with yield estimations.
+
+
+ comment: Submitted to IEEE Robotics and Automation Letters (RA-L) +
+
+
+
+
+ + ♻ ☆ A Generic Shared Attention Mechanism for Various Backbone Neural + Networks + + +
+ The self-attention mechanism has emerged as a critical component for +improving the performance of various backbone neural networks. However, current +mainstream approaches individually incorporate newly designed self-attention +modules (SAMs) into each layer of the network for granted without fully +exploiting their parameters' potential. This leads to suboptimal performance +and increased parameter consumption as the network depth increases. To improve +this paradigm, in this paper, we first present a counterintuitive but inherent +phenomenon: SAMs tend to produce strongly correlated attention maps across +different layers, with an average Pearson correlation coefficient of up to +0.85. Inspired by this inherent observation, we propose Dense-and-Implicit +Attention (DIA), which directly shares SAMs across layers and employs a long +short-term memory module to calibrate and bridge the highly correlated +attention maps of different layers, thus improving the parameter utilization +efficiency of SAMs. This design of DIA is also consistent with the neural +network's dynamical system perspective. Through extensive experiments, we +demonstrate that our simple yet effective DIA can consistently enhance various +network backbones, including ResNet, Transformer, and UNet, across tasks such +as image classification, object detection, and image generation using diffusion +models. + +
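+ A compact sketch of the layer-sharing idea: a single squeeze-and-excitation
+style attention module is reused by every residual block, and a shared LSTM
+cell carries a hidden state across layers so that each layer's channel
+attention is calibrated by the attention of preceding layers. Treat this as
+one plausible instantiation of Dense-and-Implicit Attention rather than the
+authors' exact architecture; channel counts and the SE-style form are
+assumptions.
+
+import torch
+import torch.nn as nn
+
+class SharedDIAttention(nn.Module):
+    """One attention module shared by all layers, bridged by an LSTM cell."""
+
+    def __init__(self, channels, reduction=4):
+        super().__init__()
+        hidden = channels // reduction
+        self.squeeze = nn.AdaptiveAvgPool2d(1)
+        self.proj = nn.Linear(channels, hidden)
+        self.cell = nn.LSTMCell(hidden, channels)  # hidden state links the layers
+        self.state = None                          # reset at the start of each forward pass
+
+    def reset(self):
+        self.state = None
+
+    def forward(self, feat):                       # feat: (B, C, H, W) from any layer
+        b, c, _, _ = feat.shape
+        desc = self.proj(self.squeeze(feat).view(b, c))
+        h, cstate = self.cell(desc, self.state) if self.state is not None else self.cell(desc)
+        self.state = (h, cstate)
+        gate = torch.sigmoid(h).view(b, c, 1, 1)   # calibrated channel attention
+        return feat * gate
+
+# Each residual block of the backbone would call the same SharedDIAttention
+# instance on its output features; reset() is called once per input batch.
+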
+
+ comment: Work in progress. arXiv admin note: text overlap with + arXiv:1905.10671 +
+
+
+
+
+ + ♻ ☆ Flying with Photons: Rendering Novel Views of Propagating Light + + +
+ We present an imaging and neural rendering technique that seeks to synthesize +videos of light propagating through a scene from novel, moving camera +viewpoints. Our approach relies on a new ultrafast imaging setup to capture a +first-of-its kind, multi-viewpoint video dataset with picosecond-level temporal +resolution. Combined with this dataset, we introduce an efficient neural volume +rendering framework based on the transient field. This field is defined as a +mapping from a 3D point and 2D direction to a high-dimensional, discrete-time +signal that represents time-varying radiance at ultrafast timescales. Rendering +with transient fields naturally accounts for effects due to the finite speed of +light, including viewpoint-dependent appearance changes caused by light +propagation delays to the camera. We render a range of complex effects, +including scattering, specular reflection, refraction, and diffraction. +Additionally, we demonstrate removing viewpoint-dependent propagation delays +using a time warping procedure, rendering of relativistic effects, and video +synthesis of direct and global components of light transport. + +
+
+ comment: Project page: https://anaghmalik.com/FlyingWithPhotons/ +
+
+
+
+
+ + ♻ ☆ Reconstructing Hand-Held Objects in 3D + + +
+ Objects manipulated by the hand (i.e., manipulanda) are particularly +challenging to reconstruct from in-the-wild RGB images or videos. Not only does +the hand occlude much of the object, but also the object is often only visible +in a small number of image pixels. At the same time, two strong anchors emerge +in this setting: (1) estimated 3D hands help disambiguate the location and +scale of the object, and (2) the set of manipulanda is small relative to all +possible objects. With these insights in mind, we present a scalable paradigm +for handheld object reconstruction that builds on recent breakthroughs in large +language/vision models and 3D object datasets. Our model, MCC-Hand-Object +(MCC-HO), jointly reconstructs hand and object geometry given a single RGB +image and inferred 3D hand as inputs. Subsequently, we use GPT-4(V) to retrieve +a 3D object model that matches the object in the image and rigidly align the +model to the network-inferred geometry; we call this alignment +Retrieval-Augmented Reconstruction (RAR). Experiments demonstrate that MCC-HO +achieves state-of-the-art performance on lab and Internet datasets, and we show +how RAR can be used to automatically obtain 3D labels for in-the-wild images of +hand-object interactions. + +
+
+ comment: Project page: https://janehwu.github.io/mcc-ho +
+
+
+
+
+ + ♻ ☆ Phase Guided Light Field for Spatial-Depth High Resolution 3D Imaging + + +
+ In 3D imaging, light field cameras typically capture a scene in a single shot; however, +they suffer heavily from low spatial resolution and depth accuracy. In this +paper, by employing an optical projector to project a single group of +high-frequency phase-shifted sinusoid patterns, we propose a phase guided light +field algorithm to significantly improve both the spatial and depth resolutions +of off-the-shelf light field cameras. First, to correct the axial +aberrations caused by the main lens of our light field camera, we propose a +deformed cone model to calibrate our structured light field system. Second, +over wrapped phases computed from the patterned images, we propose a stereo +matching algorithm, i.e. phase guided sum of absolute difference, to robustly +obtain the correspondence for each pair of neighboring lenslets. Finally, by +introducing a virtual camera according to the basic geometrical optics of light +field imaging, we propose a reorganization strategy to reconstruct 3D point +clouds with high spatial and depth resolution. Experimental results show that, +compared with state-of-the-art active light field methods, the proposed method +reconstructs 3D point clouds with a spatial resolution of 1280$\times$720, a roughly +10$\times$ increase, while maintaining the same high depth resolution +and requiring merely a single group of high-frequency patterns. + 
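For readers unfamiliar with phase-shifting profilometry, the wrapped phase that drives a "phase guided SAD" matching step can be recovered from N phase-shifted sinusoidal images with the standard N-step formula, and the matching cost is a per-pixel sum of absolute phase differences over candidate disparities. The NumPy sketch below is illustrative only (not the authors' code); the sign convention of the phase and the disparity range are assumptions.

```python
import numpy as np

def wrapped_phase(images: np.ndarray) -> np.ndarray:
    """Standard N-step phase shifting: `images` has shape (N, H, W), captured with
    phase shifts 2*pi*n/N. Returns the wrapped phase in (-pi, pi]; the sign
    convention depends on how the shift enters the projected pattern."""
    n = images.shape[0]
    shifts = 2 * np.pi * np.arange(n) / n
    num = (images * np.sin(shifts)[:, None, None]).sum(axis=0)
    den = (images * np.cos(shifts)[:, None, None]).sum(axis=0)
    return np.arctan2(-num, den)

def phase_sad_cost(phase_l: np.ndarray, phase_r: np.ndarray, max_disp: int) -> np.ndarray:
    """Per-pixel sum of absolute phase differences for integer disparities.
    Returns a cost volume of shape (max_disp, H, W); argmin over axis 0 gives disparity."""
    h, w = phase_l.shape
    cost = np.full((max_disp, h, w), np.inf)
    for d in range(max_disp):
        diff = np.abs(phase_l[:, d:] - phase_r[:, : w - d])
        diff = np.minimum(diff, 2 * np.pi - diff)   # respect phase wrapping
        cost[d, :, d:] = diff
    return cost

# toy example with a known 3-pixel shift between two views
phase_left = np.random.uniform(-np.pi, np.pi, (64, 64))
phase_right = np.roll(phase_left, -3, axis=1)
disparity = phase_sad_cost(phase_left, phase_right, max_disp=8).argmin(axis=0)
print(np.bincount(disparity.ravel()).argmax())      # dominant disparity is 3
```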
+
+
+
+
+ + ♻ ☆ Text-Based Reasoning About Vector Graphics + + +
+ While large multimodal models excel in broad vision-language benchmarks, they +often struggle with tasks requiring precise perception of low-level visual +details, such as comparing line lengths or solving simple mazes. In particular, +this failure mode persists in question-answering tasks about vector graphics -- +images composed purely of 2D objects and shapes. To address this challenge, we +propose the Visually Descriptive Language Model (VDLM), which performs +text-based reasoning about vector graphics. VDLM leverages Scalable Vector +Graphics (SVG) for a more precise visual description and first uses an +off-the-shelf raster-to-SVG algorithm for encoding. Since existing language +models cannot understand raw SVGs in a zero-shot setting, VDLM then bridges SVG +with pretrained language models through a newly introduced intermediate +symbolic representation, Primal Visual Description (PVD), comprising primitive +attributes (e.g., shape, position, measurement) with their corresponding +predicted values. PVD is task-agnostic and represents visual primitives that +are universal across all vector graphics. It can be learned with procedurally +generated (SVG, PVD) pairs and also enables the direct use of LLMs for +generalization to complex reasoning tasks. By casting an image to a text-based +representation, we can leverage the power of language models to learn alignment +from SVG to visual primitives and generalize to unseen question-answering +tasks. Empirical results show that VDLM achieves stronger zero-shot performance +compared to state-of-the-art LMMs, such as GPT-4V, in various low-level +multimodal perception and reasoning tasks on vector graphics. We additionally +present extensive analyses on VDLM's performance, demonstrating that our +framework offers better interpretability due to its disentangled perception and +reasoning processes. Project page: https://mikewangwzhl.github.io/VDLM/ + +
+
+ comment: Project page: https://mikewangwzhl.github.io/VDLM/ +
+
+
+
+
+ + ♻ ☆ Multi-Level Label Correction by Distilling Proximate Patterns for + Semi-supervised Semantic Segmentation + + +
+ Semi-supervised semantic segmentation relieves the reliance on large-scale +labeled data by leveraging unlabeled data. Recent semi-supervised semantic +segmentation approaches mainly resort to pseudo-labeling methods to exploit +unlabeled data. However, unreliable pseudo-labeling can undermine the +semi-supervision process. In this paper, we propose an algorithm called +Multi-Level Label Correction (MLLC), which aims to use graph neural networks to +capture structural relationships in Semantic-Level Graphs (SLGs) and +Class-Level Graphs (CLGs) to rectify erroneous pseudo-labels. Specifically, +SLGs represent semantic affinities between pairs of pixel features, and CLGs +describe classification consistencies between pairs of pixel labels. With the +support of proximate pattern information from graphs, MLLC can rectify +incorrectly predicted pseudo-labels and can facilitate discriminative feature +representations. We design an end-to-end network to train and perform this +effective label correction mechanism. Experiments demonstrate that MLLC can +significantly improve supervised baselines and outperform state-of-the-art +approaches in different scenarios on the Cityscapes and PASCAL VOC 2012 datasets. +Specifically, MLLC improves the supervised baseline by at least 5% and 2% with +DeepLabV2 and DeepLabV3+ respectively under different partition protocols. + 
+
+ comment: 12 pages, 8 figures. IEEE Transactions on Multimedia, 2024 +
+
+
+
+
+ + ♻ ☆ Leveraging Diffusion For Strong and High Quality Face Morphing Attacks + + +
+ Face morphing attacks seek to deceive a Face Recognition (FR) system by +presenting a morphed image consisting of the biometric qualities from two +different identities with the aim of triggering a false acceptance with one of +the two identities, thereby presenting a significant threat to biometric +systems. The success of a morphing attack is dependent on the ability of the +morphed image to represent the biometric characteristics of both identities +that were used to create the image. We present a novel morphing attack that +uses a Diffusion-based architecture to improve the visual fidelity of the image +and the ability of the morphing attack to represent characteristics from both +identities. We demonstrate the effectiveness of the proposed attack by +evaluating its visual fidelity via the Frechet Inception Distance (FID). Also, +extensive experiments are conducted to measure the vulnerability of FR systems +to the proposed attack. The ability of a morphing attack detector to detect the +proposed attack is measured and compared against two state-of-the-art GAN-based +morphing attacks along with two Landmark-based attacks. Additionally, a novel +metric to measure the relative strength between different morphing attacks is +introduced and evaluated. + +
+
+ comment: Diffusion Morphs (DiM) paper. Accepted in IEEE TBIOM +
+
+
+
+
+ + ♻ ☆ Spatio-Temporal Attention and Gaussian Processes for Personalized Video + Gaze Estimation CVPR 2024 + + +
+ Gaze is an essential prompt for analyzing human behavior and attention. +Recently, there has been an increasing interest in determining gaze direction +from facial videos. However, video gaze estimation faces significant +challenges, such as understanding the dynamic evolution of gaze in video +sequences, dealing with static backgrounds, and adapting to variations in +illumination. To address these challenges, we propose a simple and novel deep +learning model designed to estimate gaze from videos, incorporating a +specialized attention module. Our method employs a spatial attention mechanism +that tracks spatial dynamics within videos. This technique enables accurate +gaze direction prediction through a temporal sequence model, adeptly +transforming spatial observations into temporal insights, thereby significantly +improving gaze estimation accuracy. Additionally, our approach integrates +Gaussian processes to include individual-specific traits, facilitating the +personalization of our model with just a few labeled samples. Experimental +results confirm the efficacy of the proposed approach, demonstrating its +success in both within-dataset and cross-dataset settings. Specifically, our +proposed approach achieves state-of-the-art performance on the Gaze360 dataset, +improving by $2.5^\circ$ without personalization. Further, by personalizing the +model with just three samples, we achieved an additional improvement of +$0.8^\circ$. The code and pre-trained models are available at +\url{https://github.com/jswati31/stage}. + +
+
+ comment: Accepted at CVPR 2024 Gaze workshop +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 170 + +
+
+
+ + ☆ InternLM-XComposer2-4KHD: A Pioneering Large Vision-Language Model + Handling Resolutions from 336 Pixels to 4K HD + + +
+ The Large Vision-Language Model (LVLM) field has seen significant +advancements, yet its progression has been hindered by challenges in +comprehending fine-grained visual content due to limited resolution. Recent +efforts have aimed to enhance the high-resolution understanding capabilities of +LVLMs, yet they remain capped at approximately 1500 x 1500 pixels and +constrained to a relatively narrow resolution range. This paper represents +InternLM-XComposer2-4KHD, a groundbreaking exploration into elevating LVLM +resolution capabilities up to 4K HD (3840 x 1600) and beyond. Concurrently, +considering the ultra-high resolution may not be necessary in all scenarios, it +supports a wide range of diverse resolutions from 336 pixels to 4K standard, +significantly broadening its scope of applicability. Specifically, this +research advances the patch division paradigm by introducing a novel extension: +dynamic resolution with automatic patch configuration. It maintains the +training image aspect ratios while automatically varying patch counts and +configuring layouts based on a pre-trained Vision Transformer (ViT) (336 x +336), leading to dynamic training resolution from 336 pixels to 4K standard. +Our research demonstrates that scaling training resolution up to 4K HD leads to +consistent performance enhancements without hitting the ceiling of potential +improvements. InternLM-XComposer2-4KHD shows superb capability that matches or +even surpasses GPT-4V and Gemini Pro in 10 of the 16 benchmarks. The +InternLM-XComposer2-4KHD model series with 7B parameters are publicly available +at https://github.com/InternLM/InternLM-XComposer. + +
+
+ comment: Code and models are publicly available at + https://github.com/InternLM/InternLM-XComposer +
+
+
+
+
+ + ☆ MoReVQA: Exploring Modular Reasoning Models for Video Question Answering CVPR 2024 + + +
+ This paper addresses the task of video question answering (videoQA) via a +decomposed multi-stage, modular reasoning framework. Previous modular methods +have shown promise with a single planning stage ungrounded in visual content. +However, through a simple and effective baseline, we find that such systems can +lead to brittle behavior in practice for challenging videoQA settings. Thus, +unlike traditional single-stage planning methods, we propose a multi-stage +system consisting of an event parser, a grounding stage, and a final reasoning +stage in conjunction with an external memory. All stages are training-free, and +performed using few-shot prompting of large models, creating interpretable +intermediate outputs at each stage. By decomposing the underlying planning and +task complexity, our method, MoReVQA, improves over prior work on standard +videoQA benchmarks (NExT-QA, iVQA, EgoSchema, ActivityNet-QA) with +state-of-the-art results, and extensions to related tasks (grounded videoQA, +paragraph captioning). + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Can Feedback Enhance Semantic Grounding in Large Vision-Language Models? + + +
+ Enhancing semantic grounding abilities in Vision-Language Models (VLMs) often +involves collecting domain-specific training data, refining the network +architectures, or modifying the training recipes. In this work, we venture into +an orthogonal direction and explore whether VLMs can improve their semantic +grounding by "receiving" feedback, without requiring in-domain data, +fine-tuning, or modifications to the network architectures. We systematically +analyze this hypothesis using a feedback mechanism composed of a binary signal. +We find that if prompted appropriately, VLMs can utilize feedback both in a +single step and iteratively, showcasing the potential of feedback as an +alternative technique to improve grounding in internet-scale VLMs. Furthermore, +VLMs, like LLMs, struggle to self-correct errors out-of-the-box. However, we +find that this issue can be mitigated via a binary verification mechanism. +Finally, we explore the potential and limitations of amalgamating these +findings and applying them iteratively to automatically enhance VLMs' grounding +performance, showing grounding accuracy consistently improves using automated +feedback across all models in all settings investigated. Overall, our iterative +framework improves semantic grounding in VLMs by more than 15 accuracy points +under noise-free feedback and up to 5 accuracy points under a simple automated +binary verification mechanism. The project website is hosted at +https://andrewliao11.github.io/vlms_feedback + +
+
+ comment: 31 pages, 15 figures +
+
+
+
+
+ + ☆ Reconstructing Hand-Held Objects in 3D + + +
+ Objects manipulated by the hand (i.e., manipulanda) are particularly +challenging to reconstruct from in-the-wild RGB images or videos. Not only does +the hand occlude much of the object, but also the object is often only visible +in a small number of image pixels. At the same time, two strong anchors emerge +in this setting: (1) estimated 3D hands help disambiguate the location and +scale of the object, and (2) the set of manipulanda is small relative to all +possible objects. With these insights in mind, we present a scalable paradigm +for handheld object reconstruction that builds on recent breakthroughs in large +language/vision models and 3D object datasets. Our model, MCC-Hand-Object +(MCC-HO), jointly reconstructs hand and object geometry given a single RGB +image and inferred 3D hand as inputs. Subsequently, we use GPT-4(V) to retrieve +a 3D object model that matches the object in the image and rigidly align the +model to the network-inferred geometry; we call this alignment +Retrieval-Augmented Reconstruction (RAR). Experiments demonstrate that MCC-HO +achieves state-of-the-art performance on lab and Internet datasets, and we show +how RAR can be used to automatically obtain 3D labels for in-the-wild images of +hand-object interactions. + +
+
+
+
+
+ + ☆ Flying With Photons: Rendering Novel Views of Propagating Light + + +
+ We present an imaging and neural rendering technique that seeks to synthesize +videos of light propagating through a scene from novel, moving camera +viewpoints. Our approach relies on a new ultrafast imaging setup to capture a +first-of-its kind, multi-viewpoint video dataset with picosecond-level temporal +resolution. Combined with this dataset, we introduce an efficient neural volume +rendering framework based on the transient field. This field is defined as a +mapping from a 3D point and 2D direction to a high-dimensional, discrete-time +signal that represents time-varying radiance at ultrafast timescales. Rendering +with transient fields naturally accounts for effects due to the finite speed of +light, including viewpoint-dependent appearance changes caused by light +propagation delays to the camera. We render a range of complex effects, +including scattering, specular reflection, refraction, and diffraction. +Additionally, we demonstrate removing viewpoint-dependent propagation delays +using a time warping procedure, rendering of relativistic effects, and video +synthesis of direct and global components of light transport. + +
+
+ comment: Project page: https://anaghmalik.com/FlyingWithPhotons/ +
+
+
+
+
+ + ☆ RhythmMamba: Fast Remote Physiological Measurement with Arbitrary Length + Videos + + +
+ Remote photoplethysmography (rPPG) is a non-contact method for detecting +physiological signals from facial videos, holding great potential in various +applications such as healthcare, affective computing, and anti-spoofing. +Existing deep learning methods struggle to address two core issues of rPPG +simultaneously: extracting weak rPPG signals from video segments with large +spatiotemporal redundancy and understanding the periodic patterns of rPPG among +long contexts. This represents a trade-off between computational complexity and +the ability to capture long-range dependencies, posing a challenge for rPPG +that is suitable for deployment on mobile devices. Based on the in-depth +exploration of Mamba's comprehension of spatial and temporal information, this +paper introduces RhythmMamba, an end-to-end Mamba-based method that employs +multi-temporal Mamba to constrain both periodic patterns and short-term trends, +coupled with frequency domain feed-forward to enable Mamba to robustly +understand the quasi-periodic patterns of rPPG. Extensive experiments show that +RhythmMamba achieves state-of-the-art performance with reduced parameters and +lower computational complexity. The proposed RhythmMamba can be applied to +video segments of any length without performance degradation. The codes are +available at https://github.com/zizheng-guo/RhythmMamba. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2402.12788 +
+
+
+
+
+ + ☆ Text-Based Reasoning About Vector Graphics + + +
+ While large multimodal models excel in broad vision-language benchmarks, they +often struggle with tasks requiring precise perception of low-level visual +details, such as comparing line lengths or solving simple mazes. In particular, +this failure mode persists in question-answering tasks about vector graphics -- +images composed purely of 2D objects and shapes. To address this challenge, we +propose the Visually Descriptive Language Model (VDLM), which performs +text-based reasoning about vector graphics. VDLM leverages Scalable Vector +Graphics (SVG) for a more precise visual description and first uses an +off-the-shelf raster-to-SVG algorithm for encoding. Since existing language +models cannot understand raw SVGs in a zero-shot setting, VDLM then bridges SVG +with pretrained language models through a newly introduced intermediate +symbolic representation, Primal Visual Description (PVD), comprising primitive +attributes (e.g., shape, position, measurement) with their corresponding +predicted values. PVD is task-agnostic and represents visual primitives that +are universal across all vector graphics. It can be learned with procedurally +generated (SVG, PVD) pairs and also enables the direct use of LLMs for +generalization to complex reasoning tasks. By casting an image to a text-based +representation, we can leverage the power of language models to learn alignment +from SVG to visual primitives and generalize to unseen question-answering +tasks. Empirical results show that VDLM achieves stronger zero-shot performance +compared to state-of-the-art LMMs, such as GPT-4V, in various low-level +multimodal perception and reasoning tasks on vector graphics. We additionally +present extensive analyses on VDLM's performance, demonstrating that our +framework offers better interpretability due to its disentangled perception and +reasoning processes. Project page: https://mikewangwzhl.github.io/VDLM/ + +
+
+ comment: Project page: https://mikewangwzhl.github.io/VDLM/ +
+
+
+
+
+ + ☆ Learning State-Invariant Representations of Objects from Image + Collections with State, Pose, and Viewpoint Changes + + +
+ We add one more invariance - state invariance - to the more commonly used +other invariances for learning object representations for recognition and +retrieval. By state invariance, we mean robust with respect to changes in the +structural form of the object, such as when an umbrella is folded, or when an +item of clothing is tossed on the floor. Since humans generally have no +difficulty in recognizing objects despite such state changes, we are naturally +faced with the question of whether it is possible to devise a neural +architecture with similar abilities. To that end, we present a novel dataset, +ObjectsWithStateChange, that captures state and pose variations in the object +images recorded from arbitrary viewpoints. We believe that this dataset will +facilitate research in fine-grained object recognition and retrieval of objects +that are capable of state changes. The goal of such research would be to train +models capable of generating object embeddings that remain invariant to state +changes while also staying invariant to transformations induced by changes in +viewpoint, pose, illumination, etc. To demonstrate the usefulness of the +ObjectsWithStateChange dataset, we also propose a curriculum learning strategy +that uses the similarity relationships in the learned embedding space after +each epoch to guide the training process. The model learns discriminative +features by comparing visually similar objects within and across different +categories, encouraging it to differentiate between objects that may be +challenging to distinguish due to changes in their state. We believe that this +strategy enhances the model's ability to capture discriminative features for +fine-grained tasks that may involve objects with state changes, leading to +performance improvements on object-level tasks not only on our new dataset, but +also on two other challenging multi-view datasets such as ModelNet40 and +ObjectPI. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ A comparative analysis of deep learning models for lung segmentation on + X-ray images + + +
+ Robust and highly accurate lung segmentation in X-rays is crucial in medical +imaging. This study evaluates deep learning solutions for this task, ranking +existing methods and analyzing their performance under diverse image +modifications. Out of 61 analyzed papers, only nine offered implementation or +pre-trained models, enabling assessment of three prominent methods: Lung VAE, +TransResUNet, and CE-Net. The analysis revealed that CE-Net performs best, +demonstrating the highest values in dice similarity coefficient and +intersection over union metric. + +
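For reference, the two metrics used for the ranking above are straightforward to compute from binary masks; a minimal NumPy sketch:

```python
import numpy as np

def dice_and_iou(pred: np.ndarray, target: np.ndarray, eps: float = 1e-7):
    """Dice similarity coefficient and intersection-over-union for binary masks."""
    pred = pred.astype(bool)
    target = target.astype(bool)
    inter = np.logical_and(pred, target).sum()
    dice = (2.0 * inter + eps) / (pred.sum() + target.sum() + eps)
    iou = (inter + eps) / (np.logical_or(pred, target).sum() + eps)
    return float(dice), float(iou)

# toy 4x4 "lung" masks: Dice ~0.667, IoU ~0.5
pred = np.array([[0, 1, 1, 0]] * 4)
gt = np.array([[0, 1, 0, 0]] * 4)
print(dice_and_iou(pred, gt))
```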
+
+ comment: published at the Polish Conference on Artificial Intelligence + (PP-RAI), 2024 +
+
+
+
+
+ + ☆ PURE: Turning Polysemantic Neurons Into Pure Features by Identifying + Relevant Circuits + + +
+ The field of mechanistic interpretability aims to study the role of +individual neurons in Deep Neural Networks. Single neurons, however, have the +capability to act polysemantically and encode for multiple (unrelated) +features, which renders their interpretation difficult. We present a method for +disentangling polysemanticity of any Deep Neural Network by decomposing a +polysemantic neuron into multiple monosemantic "virtual" neurons. This is +achieved by identifying the relevant sub-graph ("circuit") for each "pure" +feature. We demonstrate how our approach allows us to find and disentangle +various polysemantic units of ResNet models trained on ImageNet. While +evaluating feature visualizations using CLIP, our method effectively +disentangles representations, improving upon methods based on neuron +activations. Our code is available at https://github.com/maxdreyer/PURE. + +
+
+ comment: 14 pages (4 pages manuscript, 2 pages references, 8 pages appendix) +
+
+
+
+
+ + ☆ SmartControl: Enhancing ControlNet for Handling Rough Visual Conditions + + +
+ Human visual imagination usually begins with analogies or rough sketches. For +example, given an image of a girl playing guitar in front of a building, one may +analogously imagine how it would look if Iron Man were playing guitar in front of a +pyramid in Egypt. Nonetheless, the visual condition may not be precisely aligned with the +imagined result indicated by the text prompt, and existing layout-controllable +text-to-image (T2I) generation models are prone to producing degraded +results with obvious artifacts. To address this issue, we present a novel T2I +generation method dubbed SmartControl, which is designed to modify the rough +visual conditions to adapt to the text prompt. The key idea of our SmartControl +is to relax the visual condition in the areas that conflict with the text +prompt. Specifically, a Control Scale Predictor (CSP) is designed to identify +the conflict regions and predict the local control scales, while a dataset with +text prompts and rough visual conditions is constructed for training the CSP. It is +worth noting that, even with a limited number (e.g., 1,000~2,000) of training +samples, our SmartControl can generalize well to unseen objects. Extensive +experiments on four typical visual condition types clearly show the efficacy of +our SmartControl against the state of the art. Source code, pre-trained models, +and datasets are available at https://github.com/liuxiaoyu1104/SmartControl. + 
+
+
+
+
+ + ☆ The Central Spanning Tree Problem + + +
+ Spanning trees are an important primitive in many data analysis tasks, when a +data set needs to be summarized in terms of its "skeleton", or when a +tree-shaped graph over all observations is required for downstream processing. +Popular definitions of spanning trees include the minimum spanning tree and the +optimum distance spanning tree, a.k.a. the minimum routing cost tree. When +searching for the shortest spanning tree but admitting additional branching +points, even shorter spanning trees can be realized: Steiner trees. +Unfortunately, both minimum spanning and Steiner trees are not robust with +respect to noise in the observations; that is, small perturbations of the +original data set often lead to drastic changes in the associated spanning +trees. In response, we make two contributions when the data lies in a Euclidean +space: on the theoretical side, we introduce a new optimization problem, the +"(branched) central spanning tree", which subsumes all previously mentioned +definitions as special cases. On the practical side, we show empirically that +the (branched) central spanning tree is more robust to noise in the data, and +as such is better suited to summarize a data set in terms of its skeleton. We +also propose a heuristic to address the NP-hard optimization problem, and +illustrate its use on single cell RNA expression data from biology and 3D point +clouds of plants. + +
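As a point of reference for the definitions discussed above, the classic minimum spanning tree that the (branched) central spanning tree generalizes can be computed in a few lines with SciPy. This is only the standard baseline on a Euclidean point cloud, not the authors' new objective or heuristic.

```python
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.sparse.csgraph import minimum_spanning_tree

points = np.random.rand(50, 3)            # e.g. a small 3D point cloud of a plant
dist = squareform(pdist(points))          # dense pairwise Euclidean distances
mst = minimum_spanning_tree(dist)         # sparse matrix holding the tree edges
edges = np.transpose(mst.nonzero())       # (i, j) index pairs of the 49 MST edges
print(len(edges), float(mst.sum()))       # edge count and total tree length
```

Small perturbations of `points` can change which `edges` are selected quite drastically, which is exactly the robustness issue the central spanning tree formulation targets.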
+
+
+
+
+ + ☆ Multi-scale Dynamic and Hierarchical Relationship Modeling for Facial + Action Units Recognition CVPR2024 + + +
+ Human facial action units (AUs) are mutually related in a hierarchical +manner: not only are they associated with each other in both the spatial and +temporal domains, but AUs located in the same or nearby facial regions also show +stronger relationships than those in different facial regions. Since no +existing approach thoroughly models such hierarchical inter-dependencies among +AUs, this paper proposes to comprehensively model the multi-scale, AU-related +dynamic and hierarchical spatio-temporal relationships among AUs for AU +occurrence recognition. Specifically, we first propose a novel multi-scale +temporal differencing network with an adaptive weighting block to explicitly +capture facial dynamics across frames at different spatial scales, which +specifically considers the heterogeneity of range and magnitude in different +AUs' activations. Then, a two-stage strategy is introduced to hierarchically +model the relationships among AUs based on their spatial distribution (i.e., +local and cross-region AU relationship modelling). Experimental results +on BP4D and DISFA show that our approach sets a new state of the art +in AU occurrence recognition. Our code is publicly available at +https://github.com/CVI-SZU/MDHR. + 
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ QueSTMaps: Queryable Semantic Topological Maps for 3D Scene + Understanding + + +
+ Understanding the structural organisation of 3D indoor scenes in terms of +rooms is often accomplished via floorplan extraction. Robotic tasks such as +planning and navigation require a semantic understanding of the scene as well. +This is typically achieved via object-level semantic segmentation. However, +such methods struggle to segment out topological regions like "kitchen" in the +scene. In this work, we introduce a two-step pipeline. First, we extract a +topological map, i.e., floorplan of the indoor scene using a novel +multi-channel occupancy representation. Then, we generate CLIP-aligned features +and semantic labels for every room instance based on the objects it contains +using a self-attention transformer. Our language-topology alignment supports +natural language querying, e.g., a "place to cook" locates the "kitchen". We +outperform the current state-of-the-art on room segmentation by ~20% and room +classification by ~12%. Our detailed qualitative analysis and ablation studies +provide insights into the problem of joint structural and semantic 3D scene +understanding. + +
+
+
+
+
+ + ☆ Seasonal Fire Prediction using Spatio-Temporal Deep Neural Networks + + +
+ With climate change expected to exacerbate fire weather conditions, the +accurate anticipation of wildfires on a global scale becomes increasingly +crucial for disaster mitigation. In this study, we utilize SeasFire, a +comprehensive global wildfire dataset with climate, vegetation, oceanic +indices, and human-related variables, to enable seasonal wildfire forecasting +with machine learning. For the predictive analysis, we train deep learning +models with different architectures that capture the spatio-temporal context +leading to wildfires. Our investigation focuses on assessing the effectiveness +of these models in predicting the presence of burned areas at varying +forecasting time horizons globally, extending up to six months into the future, +and on how different spatial or/and temporal context affects the performance of +the models. Our findings demonstrate the great potential of deep learning +models in seasonal fire forecasting; longer input time-series leads to more +robust predictions across varying forecasting horizons, while integrating +spatial information to capture wildfire spatio-temporal dynamics boosts +performance. Finally, our results hint that in order to enhance performance at +longer forecasting horizons, a larger receptive field spatially needs to be +considered. + +
+
+
+
+
+ + ☆ pfl-research: simulation framework for accelerating research in Private + Federated Learning + + +
+ Federated learning (FL) is an emerging machine learning (ML) training +paradigm where clients own their data and collaborate to train a global model, +without revealing any data to the server and other participants. Researchers +commonly perform experiments in a simulation environment to quickly iterate on +ideas. However, existing open-source tools do not offer the efficiency required +to simulate FL on larger and more realistic FL datasets. We introduce +pfl-research, a fast, modular, and easy-to-use Python framework for simulating +FL. It supports TensorFlow, PyTorch, and non-neural network models, and is +tightly integrated with state-of-the-art privacy algorithms. We study the speed +of open-source FL frameworks and show that pfl-research is 7-72$\times$ faster +than alternative open-source frameworks on common cross-device setups. Such +speedup will significantly boost the productivity of the FL research community +and enable testing hypotheses on realistic FL datasets that were previously too +resource intensive. We release a suite of benchmarks that evaluates an +algorithm's overall performance on a diverse set of realistic scenarios. The +code is available on GitHub at https://github.com/apple/pfl-research. + +
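To make the simulation setting concrete, the sketch below runs a generic federated-averaging round over synthetic linear-regression clients in plain NumPy. It is a hedged illustration of what an FL simulator iterates over, not pfl-research's actual API or algorithms.

```python
import numpy as np

rng = np.random.default_rng(0)
true_w = np.array([2.0, -1.0])

def make_client(n: int = 64):
    """Each client owns a private dataset drawn from the same linear model."""
    x = rng.normal(size=(n, 2))
    y = x @ true_w + 0.1 * rng.normal(size=n)
    return x, y

clients = [make_client() for _ in range(10)]
global_w = np.zeros(2)

for _ in range(20):                        # communication rounds
    updates, sizes = [], []
    for x, y in clients:                   # a real simulator parallelises this loop
        w = global_w.copy()
        for _ in range(5):                 # local gradient steps on private data
            grad = 2 * x.T @ (x @ w - y) / len(y)
            w -= 0.05 * grad
        updates.append(w)
        sizes.append(len(y))
    # FedAvg: aggregate client models weighted by their dataset sizes
    global_w = np.average(updates, axis=0, weights=sizes)

print(global_w)                            # approaches [2, -1]
```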
+
+
+
+
+ + ☆ Magic-Boost: Boost 3D Generation with Mutli-View Conditioned Diffusion + + +
+ Benefiting from the rapid development of 2D diffusion models, 3D content +creation has made significant progress recently. One promising solution +involves the fine-tuning of pre-trained 2D diffusion models to harness their +capacity for producing multi-view images, which are then lifted into accurate +3D models via methods like fast-NeRFs or large reconstruction models. However, +because multi-view inconsistency remains and the generated resolution is limited, the +results of such methods still lack intricate textures and complex geometries. +To solve this problem, we propose Magic-Boost, a multi-view conditioned +diffusion model that significantly refines coarse generative results through a +brief period of SDS optimization ($\sim15$min). Compared to previous text- +or single-image-based diffusion models, Magic-Boost exhibits a robust +capability to generate images with high consistency from pseudo synthesized +multi-view images. It provides precise SDS guidance that aligns well with the +identity of the input images, enriching the local detail in both geometry and +texture of the initial generative results. Extensive experiments show +Magic-Boost greatly enhances the coarse inputs and generates high-quality 3D +assets with rich geometric and textural details. (Project Page: +https://magic-research.github.io/magic-boost/) + 
+
+
+
+
+ + ☆ ZeST: Zero-Shot Material Transfer from a Single Image + + +
+ We propose ZeST, a method for zero-shot material transfer to an object in the +input image given a material exemplar image. ZeST leverages existing diffusion +adapters to extract an implicit material representation from the exemplar image. +This representation is used to transfer the material onto the object in the +input image with a pre-trained inpainting diffusion model, using depth +estimates as a geometry cue and grayscale object shading as an illumination cue. +The method works on real images without any training, resulting in a zero-shot +approach. Both qualitative and quantitative results on real and synthetic +datasets demonstrate that ZeST outputs photorealistic images with transferred +materials. We also show the application of ZeST to perform multiple edits and +robust material assignment under different illuminations. Project Page: +https://ttchengab.github.io/zest + 
+
+ comment: Project Page: https://ttchengab.github.io/zest +
+
+
+
+
+ + ☆ Emergent Dynamics in Neural Cellular Automata + + +
+ Neural Cellular Automata (NCA) models are trainable variations of traditional +Cellular Automata (CA). Emergent motion in the patterns created by NCA has been +successfully applied to synthesize dynamic textures. However, the conditions +required for an NCA to display dynamic patterns remain unexplored. Here, we +investigate the relationship between the NCA architecture and the emergent +dynamics of the trained models. Specifically, we vary the number of channels in +the cell state and the number of hidden neurons in the MultiLayer Perceptron +(MLP), and draw a relationship between the combination of these two variables +and the motion strength between successive frames. Our analysis reveals that +the disparity and proportionality between these two variables have a strong +correlation with the emergent dynamics in the NCA output. We thus propose a +design principle for creating dynamic NCA. + +
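The "motion strength between successive frames" used in this kind of analysis can be approximated very simply, for instance as the mean absolute difference (or mean optical-flow magnitude) between consecutive outputs. A minimal NumPy proxy, under the assumption that frames are arrays scaled to [0, 1]:

```python
import numpy as np

def motion_strength(frames: np.ndarray) -> float:
    """frames: (T, H, W, C) array of successive NCA outputs in [0, 1].
    Returns the average per-pixel change between consecutive frames."""
    diffs = np.abs(np.diff(frames, axis=0))
    return float(diffs.mean())

# a static pattern scores near 0, an animated pattern noticeably higher
static = np.tile(np.random.rand(1, 32, 32, 3), (10, 1, 1, 1))
moving = np.random.rand(10, 32, 32, 3)
print(motion_strength(static), motion_strength(moving))
```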
+
+ comment: 2 pages +
+
+
+
+
+ + ☆ Raster Forge: Interactive Raster Manipulation Library and GUI for Python + + +
+ Raster Forge is a Python library and graphical user interface for raster data +manipulation and analysis. The tool is focused on remote sensing applications, +particularly in wildfire management. It allows users to import, visualize, and +process raster layers for tasks such as image compositing or topographical +analysis. For wildfire management, it generates fuel maps using predefined +models. Its impact extends from disaster management to hydrological modeling, +agriculture, and environmental monitoring. Raster Forge can be a valuable asset +for geoscientists and researchers who rely on raster data analysis, enhancing +geospatial data processing and visualization across various disciplines. + +
+
+
+
+
+ + ☆ VISION2UI: A Real-World Dataset with Layout for Code Generation from UI + Designs + + +
+ Automatically generating UI code from webpage design visions can +significantly alleviate the burden of developers, enabling beginner developers +or designers to directly generate Web pages from design diagrams. Currently, +prior research has accomplished the objective of generating UI code from +rudimentary design visions or sketches through designing deep neural networks. +Inspired by the groundbreaking advancements achieved by Multimodal Large +Language Models (MLLMs), the automatic generation of UI code from high-fidelity +design images is now emerging as a viable possibility. Nevertheless, our +investigation reveals that existing MLLMs are hampered by the scarcity of +authentic, high-quality, and large-scale datasets, leading to unsatisfactory +performance in automated UI code generation. To mitigate this gap, we present a +novel dataset, termed VISION2UI, extracted from real-world scenarios, augmented +with comprehensive layout information, tailored specifically for finetuning +MLLMs in UI code generation. Specifically, this dataset is derived through a +series of operations, encompassing collecting, cleaning, and filtering of the +open-source Common Crawl dataset. In order to uphold its quality, a neural +scorer trained on labeled samples is utilized to refine the data, retaining +higher-quality instances. Ultimately, this process yields a dataset comprising +2,000 (Much more is coming soon) parallel samples encompassing design visions +and UI code. The dataset is available at +https://huggingface.co/datasets/xcodemind/vision2ui. + +
+
+
+
+
+ + ☆ Dynamic Resolution Guidance for Facial Expression Recognition + + +
+ Facial expression recognition (FER) is vital for human-computer interaction +and emotion analysis, yet recognizing expressions in low-resolution images +remains challenging. This paper introduces a practical method called Dynamic +Resolution Guidance for Facial Expression Recognition (DRGFER) to effectively +recognize facial expressions in images with varying resolutions without +compromising FER model accuracy. Our framework comprises two main components: +the Resolution Recognition Network (RRN) and the Multi-Resolution Adaptation +Facial Expression Recognition Network (MRAFER). The RRN determines image +resolution, outputs a binary vector, and the MRAFER assigns images to suitable +facial expression recognition networks based on resolution. We evaluated DRGFER +on widely-used datasets RAFDB and FERPlus, demonstrating that our method +retains optimal model performance at each resolution and outperforms +alternative resolution approaches. The proposed framework exhibits robustness +against resolution variations and facial expressions, offering a promising +solution for real-world applications. + +
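The routing idea can be summarized in a few lines: a resolution check (standing in here for the paper's Resolution Recognition Network) dispatches each face crop to the recognition network trained for that resolution band. This is a hedged sketch of the dispatch logic only; the expert modules and band thresholds are placeholders, not the paper's code.

```python
import torch
import torch.nn as nn

# one expert per resolution band; in the paper these are full FER networks
experts = nn.ModuleDict({
    "low": nn.Sequential(nn.Flatten(), nn.Linear(3 * 28 * 28, 7)),
    "mid": nn.Sequential(nn.Flatten(), nn.Linear(3 * 56 * 56, 7)),
    "high": nn.Sequential(nn.Flatten(), nn.Linear(3 * 112 * 112, 7)),
})

def route(image: torch.Tensor) -> torch.Tensor:
    """Pick the expert matching the input resolution and return expression logits."""
    side = image.shape[-1]
    if side <= 28:
        key, size = "low", 28
    elif side <= 56:
        key, size = "mid", 56
    else:
        key, size = "high", 112
    resized = nn.functional.interpolate(image, size=(size, size),
                                        mode="bilinear", align_corners=False)
    return experts[key](resized)

logits = route(torch.randn(1, 3, 40, 40))   # routed to the "mid" expert
print(logits.shape)                         # torch.Size([1, 7])
```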
+
+
+
+
+ + ☆ Test-Time Adaptation with SaLIP: A Cascade of SAM and CLIP for Zero shot + Medical Image Segmentation + + +
+ The Segment Anything Model (SAM) and CLIP are remarkable vision foundation +models (VFMs). SAM, a prompt driven segmentation model, excels in segmentation +tasks across diverse domains, while CLIP is renowned for its zero shot +recognition capabilities. However, their unified potential has not yet been +explored in medical image segmentation. To adapt SAM to medical imaging, +existing methods primarily rely on tuning strategies that require extensive +data or prior prompts tailored to the specific task, making it particularly +challenging when only a limited number of data samples are available. This work +presents an in depth exploration of integrating SAM and CLIP into a unified +framework for medical image segmentation. Specifically, we propose a simple +unified framework, SaLIP, for organ segmentation. Initially, SAM is used for +part based segmentation within the image, followed by CLIP to retrieve the mask +corresponding to the region of interest (ROI) from the pool of SAM generated +masks. Finally, SAM is prompted by the retrieved ROI to segment a specific +organ. Thus, SaLIP is training and fine tuning free and does not rely on domain +expertise or labeled data for prompt engineering. Our method shows substantial +enhancements in zero shot segmentation, showcasing notable improvements in DICE +scores across diverse segmentation tasks like brain (63.46%), lung (50.11%), +and fetal head (30.82%), when compared to un prompted SAM. Code and text +prompts will be available online. + +
+
+
+
+
+ + ☆ High Noise Scheduling is a Must + + +
+ Consistency models are highly capable image generators, reducing sampling +to a single step through their advanced techniques. Recent +advances take consistency training techniques a step further and +eliminate the limitation of distillation training. Even though the +curriculum and noise scheduling proposed in improved training techniques yield better +results than basic consistency models, they lack a well-balanced noise +distribution and consistency with the curriculum. In this study, we +investigate the balance between high and low noise levels in the noise +distribution and propose a polynomial noise distribution to maintain +stability. The proposed polynomial noise distribution is also supported by +predefined Karras noise levels to prevent the unique noise levels that arise from the Karras +noise generation algorithm. Furthermore, eliminating already-learned noisy steps +with a curriculum based on a sinusoidal function increases the denoising performance of the +model. To make a fair comparison with the latest released +consistency model training techniques, experiments are conducted with the same +hyper-parameters except for the curriculum and noise distribution. The models used +in the experiments are kept shallow to demonstrate the robustness of our +proposed technique. The results show that the polynomial noise distribution +outperforms a model trained with a log-normal noise distribution, yielding a +33.54 FID score after 100,000 training steps with constant discretization +steps. Additionally, the implementation of a sinusoidal-based curriculum +enhances denoising performance, resulting in an FID score of 30.48. + 
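For context, the Karras noise levels referenced above follow the widely used schedule $\sigma_i = \big(\sigma_{\max}^{1/\rho} + \tfrac{i}{N-1}(\sigma_{\min}^{1/\rho} - \sigma_{\max}^{1/\rho})\big)^{\rho}$. The snippet below computes such a schedule and a simple polynomial weighting over it; the particular polynomial is an illustrative assumption, not the exact distribution proposed in the paper.

```python
import numpy as np

def karras_sigmas(n: int, sigma_min: float = 0.002, sigma_max: float = 80.0,
                  rho: float = 7.0) -> np.ndarray:
    """Noise levels from Karras et al. (2022), from sigma_max down to sigma_min."""
    ramp = np.linspace(0.0, 1.0, n)
    inv_rho = 1.0 / rho
    return (sigma_max ** inv_rho + ramp * (sigma_min ** inv_rho - sigma_max ** inv_rho)) ** rho

def polynomial_weights(sigmas: np.ndarray, power: float = 2.0) -> np.ndarray:
    """Illustrative polynomial sampling distribution that favours higher noise levels."""
    w = (sigmas / sigmas.max()) ** power
    return w / w.sum()

sigmas = karras_sigmas(40)
probs = polynomial_weights(sigmas)
rng = np.random.default_rng(0)
sampled = rng.choice(sigmas, size=8, p=probs)   # noise levels drawn for a training batch
print(sigmas[:3], sampled[:3])
```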
+
+
+
+
+ + ☆ DaF-BEVSeg: Distortion-aware Fisheye Camera based Bird's Eye View + Segmentation with Occlusion Reasoning + + +
+ Semantic segmentation is an effective way to perform scene understanding. +Recently, segmentation in 3D Bird's Eye View (BEV) space has become popular as +it is directly used by the driving policy. However, there is limited work on BEV +segmentation for surround-view fisheye cameras, which are commonly used in commercial +vehicles. As this task has no real-world public dataset and existing synthetic +datasets do not handle amodal regions due to occlusion, we create a synthetic +dataset using the Cognata simulator comprising diverse road types, weather, and +lighting conditions. We generalize BEV segmentation to work with any camera +model; this is useful for mixing diverse cameras. We implement a baseline by +applying cylindrical rectification on the fisheye images and using a standard +LSS-based BEV segmentation model. We demonstrate that we can achieve better +performance without undistortion, which has the adverse effects of increased +runtime due to pre-processing, reduced field-of-view, and resampling artifacts. +Further, we introduce a distortion-aware learnable BEV pooling strategy that is +more effective for fisheye cameras. We extend the model with an occlusion +reasoning module, which is critical for estimation in BEV space. The qualitative +performance of DaF-BEVSeg is showcased in the video at +https://streamable.com/ge4v51. + 
+
+
+
+
+ + ☆ HPNet: Dynamic Trajectory Forecasting with Historical Prediction + Attention CVPR2024 + + +
+ Predicting the trajectories of road agents is essential for autonomous +driving systems. Recent mainstream methods follow a static paradigm, which +predicts the future trajectory using a fixed duration of historical frames. +These methods make predictions independently even at adjacent time steps, +which leads to potential instability and temporal inconsistency. As successive +time steps have largely overlapping historical frames, their forecasts should +be intrinsically correlated; for example, overlapping predicted trajectories should +be consistent, or differ but share the same motion goal depending on the +road situation. Motivated by this, in this work, we introduce HPNet, a novel +dynamic trajectory forecasting method. Aiming for stable and accurate +trajectory forecasting, our method leverages not only historical frames, +including maps and agent states, but also historical predictions. Specifically, +we design a Historical Prediction Attention module to automatically +encode the dynamic relationship between successive predictions. Besides, it +also extends the attention range beyond the currently visible window, +benefiting from the use of historical predictions. The proposed Historical +Prediction Attention, together with the Agent Attention and Mode Attention, is +further formulated as the Triple Factorized Attention module, serving as the +core design of HPNet. Experiments on the Argoverse and INTERACTION datasets show +that HPNet achieves state-of-the-art performance and generates accurate and +stable future trajectories. Our code is available at +https://github.com/XiaolongTang23/HPNet. + 
+
+ comment: accepted by CVPR2024 +
+
+
+
+
+ + ☆ Rolling Shutter Correction with Intermediate Distortion Flow Estimation CVPR2024 + + +
+ This paper proposes to correct the rolling shutter (RS) distorted images by +estimating the distortion flow from the global shutter (GS) to RS directly. +Existing methods usually perform correction using the undistortion flow from +the RS to GS. They initially predict the flow from consecutive RS frames, +subsequently rescaling it as the displacement fields from the RS frame to the +underlying GS image using time-dependent scaling factors. Following this, +RS-aware forward warping is employed to convert the RS image into its GS +counterpart. Nevertheless, this strategy is prone to two shortcomings. First, +the undistortion flow estimation is rendered inaccurate by merely linear +scaling the flow, due to the complex non-linear motion nature. Second, RS-aware +forward warping often results in unavoidable artifacts. To address these +limitations, we introduce a new framework that directly estimates the +distortion flow and rectifies the RS image with the backward warping operation. +More specifically, we first propose a global correlation-based flow attention +mechanism to estimate the initial distortion flow and GS feature jointly, which +are then refined by the following coarse-to-fine decoder layers. Additionally, +a multi-distortion flow prediction strategy is integrated to mitigate the issue +of inaccurate flow estimation further. Experimental results validate the +effectiveness of the proposed method, which outperforms state-of-the-art +approaches on various benchmarks while maintaining high efficiency. The project +is available at \url{https://github.com/ljzycmd/DFRSC}. + +
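The backward-warping step mentioned above (sampling the GS image at locations given by a per-pixel distortion flow) can be expressed compactly with `grid_sample`. This is a generic sketch of backward warping with a pixel-space flow, not the paper's full pipeline; the padding mode and flow convention are assumptions.

```python
import torch
import torch.nn.functional as F

def backward_warp(image: torch.Tensor, flow: torch.Tensor) -> torch.Tensor:
    """Sample `image` (B, C, H, W) at locations x + flow, with `flow` (B, 2, H, W) in pixels."""
    b, _, h, w = image.shape
    ys, xs = torch.meshgrid(torch.arange(h, dtype=image.dtype),
                            torch.arange(w, dtype=image.dtype), indexing="ij")
    base = torch.stack((xs, ys), dim=0).unsqueeze(0).to(image.device)  # (1, 2, H, W)
    coords = base + flow
    # normalize coordinates to [-1, 1] as required by grid_sample
    coords_x = 2.0 * coords[:, 0] / (w - 1) - 1.0
    coords_y = 2.0 * coords[:, 1] / (h - 1) - 1.0
    grid = torch.stack((coords_x, coords_y), dim=-1)                   # (B, H, W, 2)
    return F.grid_sample(image, grid, mode="bilinear",
                         padding_mode="border", align_corners=True)

img = torch.rand(1, 3, 64, 64)
flow = torch.zeros(1, 2, 64, 64)
flow[:, 0] = 2.0                      # each output pixel samples 2 px to its right in the source
warped = backward_warp(img, flow)
print(warped.shape)                   # torch.Size([1, 3, 64, 64])
```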
+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ Matching 2D Images in 3D: Metric Relative Pose from Metric + Correspondences + + +
+ Given two images, we can estimate the relative camera pose between them by +establishing image-to-image correspondences. Usually, correspondences are +2D-to-2D and the pose we estimate is defined only up to scale. Some +applications, aiming at instant augmented reality anywhere, require +scale-metric pose estimates, and hence, they rely on external depth estimators +to recover the scale. We present MicKey, a keypoint matching pipeline that is +able to predict metric correspondences in 3D camera space. By learning to match +3D coordinates across images, we are able to infer the metric relative pose +without depth measurements. Depth measurements are also not required for +training, nor are scene reconstructions or image overlap information. MicKey is +supervised only by pairs of images and their relative poses. MicKey achieves +state-of-the-art performance on the Map-Free Relocalisation benchmark while +requiring less supervision than competing approaches. + +
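Once metric 3D-3D correspondences are available, the relative pose (including a scale-metric translation) can be recovered in closed form with the classical Kabsch/Procrustes alignment. The sketch below shows that estimation step on synthetic correspondences; it is a textbook solver, not MicKey's network or its robust estimation stage.

```python
import numpy as np

def kabsch(src: np.ndarray, dst: np.ndarray):
    """Rigid transform (R, t) minimizing ||R @ src_i + t - dst_i|| over 3D correspondences."""
    c_src, c_dst = src.mean(axis=0), dst.mean(axis=0)
    h = (src - c_src).T @ (dst - c_dst)
    u, _, vt = np.linalg.svd(h)
    d = np.sign(np.linalg.det(vt.T @ u.T))        # guard against reflections
    r = vt.T @ np.diag([1.0, 1.0, d]) @ u.T
    t = c_dst - r @ c_src
    return r, t

rng = np.random.default_rng(1)
pts = rng.normal(size=(100, 3))
angle = np.deg2rad(30)
r_true = np.array([[np.cos(angle), -np.sin(angle), 0.0],
                   [np.sin(angle),  np.cos(angle), 0.0],
                   [0.0, 0.0, 1.0]])
t_true = np.array([0.3, -0.2, 1.5])               # metric translation
obs = pts @ r_true.T + t_true + 0.001 * rng.normal(size=pts.shape)

r_est, t_est = kabsch(pts, obs)
print(np.allclose(r_est, r_true, atol=1e-2), np.round(t_est, 3))
```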
+
+
+
+
+ + ☆ Audio-Visual Generalized Zero-Shot Learning using Pre-Trained Large + Multi-Modal Models CVPR + + +
+ Audio-visual zero-shot learning methods commonly build on features extracted +from pre-trained models, e.g. video or audio classification models. However, +existing benchmarks predate the popularization of large multi-modal models, +such as CLIP and CLAP. In this work, we explore such large pre-trained models +to obtain features, i.e. CLIP for visual features, and CLAP for audio features. +Furthermore, the CLIP and CLAP text encoders provide class label embeddings +which are combined to boost the performance of the system. We propose a simple +yet effective model that only relies on feed-forward neural networks, +exploiting the strong generalization capabilities of the new audio, visual and +textual features. Our framework achieves state-of-the-art performance on +VGGSound-GZSL, UCF-GZSL, and ActivityNet-GZSL with our new features. Code and +data available at: https://github.com/dkurzend/ClipClap-GZSL. + +
+
+ comment: CVPRw 2024 (L3D-IVU) +
+
+
+
+
+ + ☆ Fortifying Fully Convolutional Generative Adversarial Networks for Image + Super-Resolution Using Divergence Measures + + +
+ Super-Resolution (SR) is a time-hallowed image processing problem that aims +to improve the quality of a Low-Resolution (LR) sample up to the standard of +its High-Resolution (HR) counterpart. We aim to address this by introducing +Super-Resolution Generator (SuRGe), a fully-convolutional Generative +Adversarial Network (GAN)-based architecture for SR. We show that distinct +convolutional features obtained at increasing depths of a GAN generator can be +optimally combined by a set of learnable convex weights to improve the quality +of generated SR samples. In the process, we employ the Jensen-Shannon and the +Gromov-Wasserstein losses respectively between the SR-HR and LR-SR pairs of +distributions to further aid the generator of SuRGe to better exploit the +available information in an attempt to improve SR. Moreover, we train the +discriminator of SuRGe with the Wasserstein loss with gradient penalty, to +primarily prevent mode collapse. The proposed SuRGe, as an end-to-end GAN +workflow tailor-made for super-resolution, offers improved performance while +maintaining low inference time. The efficacy of SuRGe is substantiated by its +superior performance compared to 18 state-of-the-art contenders on 10 benchmark +datasets. + +
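The "learnable convex combination" of features from different generator depths can be realized with a softmax over unconstrained parameters, which keeps the weights positive and summing to one. Below is a minimal sketch with assumed feature shapes, not the SuRGe architecture itself.

```python
import torch
import torch.nn as nn

class ConvexFeatureFusion(nn.Module):
    """Combine same-shaped feature maps from several depths with learnable convex weights."""

    def __init__(self, num_branches: int):
        super().__init__()
        self.logits = nn.Parameter(torch.zeros(num_branches))  # softmax -> convex weights

    def forward(self, feats):                                   # list of (B, C, H, W) tensors
        w = torch.softmax(self.logits, dim=0)
        return sum(w[i] * f for i, f in enumerate(feats))

fusion = ConvexFeatureFusion(num_branches=3)
feats = [torch.randn(2, 64, 32, 32) for _ in range(3)]
out = fusion(feats)
print(out.shape, torch.softmax(fusion.logits, 0).detach())      # weights sum to 1
```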
+
+
+
+
+ + ☆ Counterfactual Reasoning for Multi-Label Image Classification via + Patching-Based Training + + +
+ The key to multi-label image classification (MLC) is to improve model +performance by leveraging label correlations. Unfortunately, it has been shown +that overemphasizing co-occurrence relationships can cause the overfitting +issue of the model, ultimately leading to performance degradation. In this +paper, we provide a causal inference framework to show that the correlative +features caused by the target object and its co-occurring objects can be +regarded as a mediator, which has both positive and negative impacts on model +predictions. On the positive side, the mediator enhances the recognition +performance of the model by capturing co-occurrence relationships; on the +negative side, it has the harmful causal effect that causes the model to make +an incorrect prediction for the target object, even when only co-occurring +objects are present in an image. To address this problem, we propose a +counterfactual reasoning method to measure the total direct effect, achieved by +enhancing the direct effect caused only by the target object. Due to the +unknown location of the target object, we propose patching-based training and +inference to accomplish this goal, which divides an image into multiple patches +and identifies the pivot patch that contains the target object. Experimental +results on multiple benchmark datasets with diverse configurations validate +that the proposed method can achieve state-of-the-art performance. + +
+
+
+
+
+ + ☆ NoiseNCA: Noisy Seed Improves Spatio-Temporal Continuity of Neural + Cellular Automata + + +
+ Neural Cellular Automata (NCA) is a class of Cellular Automata where the +update rule is parameterized by a neural network that can be trained using +gradient descent. In this paper, we focus on NCA models used for texture +synthesis, where the update rule is inspired by partial differential equations +(PDEs) describing reaction-diffusion systems. To train the NCA model, the +spatio-temporal domain is discretized, and Euler integration is used to +numerically simulate the PDE. However, whether a trained NCA truly learns the +continuous dynamics described by the corresponding PDE or merely overfits the +discretization used in training remains an open question. We study NCA models +at the limit where space-time discretization approaches continuity. We find +that existing NCA models tend to overfit the training discretization, +especially in the proximity of the initial condition, also called "seed". To +address this, we propose a solution that utilizes uniform noise as the initial +condition. We demonstrate the effectiveness of our approach in preserving the +consistency of NCA dynamics across a wide range of spatio-temporal +granularities. Our improved NCA model enables two new test-time interactions by +allowing continuous control over the speed of pattern formation and the scale +of the synthesized patterns. We demonstrate this new NCA feature in our +interactive online demo. Our work reveals that NCA models can learn continuous +dynamics and opens new avenues for NCA research from a dynamical systems +perspective. + 
+
+ comment: 9 pages, 12 figures +
+
+
+
+
+ + ☆ Learning Embeddings with Centroid Triplet Loss for Object Identification + in Robotic Grasping + + +
+ Foundation models are a strong trend in deep learning and computer vision. +These models serve as a base for applications as they require minor or no +further fine-tuning by developers to integrate into their applications. +Foundation models for zero-shot object segmentation such as Segment Anything +(SAM) output segmentation masks from images without any further object +information. When they are followed in a pipeline by an object identification +model, they can perform object detection without training. Here, we focus on +training such an object identification model. A crucial practical aspect for an +object identification model is to be flexible in input size. As object +identification is an image retrieval problem, a suitable method should handle +multi-query multi-gallery situations without constraining the number of input +images (e.g. by having fixed-size aggregation layers). The key solution to +train such a model is the centroid triplet loss (CTL), which aggregates image +features to their centroids. CTL yields high accuracy, avoids misleading +training signals and keeps the model input size flexible. In our experiments, +we establish a new state of the art on the ArmBench object identification task, +which shows general applicability of our model. We furthermore demonstrate an +integrated unseen object detection pipeline on the challenging HOPE dataset, +which requires fine-grained detection. There, our pipeline matches and +surpasses related methods which have been trained on dataset-specific data. + +
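A compact version of the centroid triplet loss described above: gallery embeddings are first aggregated into per-class centroids, and a standard triplet margin is then applied between each query, its own class centroid, and the hardest other-class centroid. This is a hedged re-implementation of the general idea, not the authors' training code; normalization and margin are assumptions.

```python
import torch
import torch.nn.functional as F

def centroid_triplet_loss(query: torch.Tensor, query_labels: torch.Tensor,
                          gallery: torch.Tensor, gallery_labels: torch.Tensor,
                          margin: float = 0.3) -> torch.Tensor:
    """query: (Q, D), gallery: (G, D); labels are integer class ids."""
    classes = gallery_labels.unique()
    centroids = torch.stack([gallery[gallery_labels == c].mean(dim=0) for c in classes])
    centroids = F.normalize(centroids, dim=1)
    query = F.normalize(query, dim=1)
    dists = torch.cdist(query, centroids)                     # (Q, num_classes)
    pos_idx = torch.stack([(classes == l).nonzero().squeeze() for l in query_labels])
    pos = dists.gather(1, pos_idx.view(-1, 1)).squeeze(1)     # distance to own centroid
    masked = dists.clone()
    masked.scatter_(1, pos_idx.view(-1, 1), float("inf"))
    neg = masked.min(dim=1).values                            # hardest negative centroid
    return F.relu(pos - neg + margin).mean()

emb = torch.randn(12, 128)
labels = torch.arange(4).repeat_interleave(3)                 # 4 objects, 3 views each
loss = centroid_triplet_loss(emb[:4], labels[:4], emb, labels)
print(loss.item())
```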
+
+
+
+
+ + ☆ Robust Confidence Intervals in Stereo Matching using Possibility Theory + + +
+ We propose a method for estimating disparity confidence intervals in stereo +matching problems. Confidence intervals provide complementary information to +usual confidence measures. To the best of our knowledge, this is the first +method creating disparity confidence intervals based on the cost volume. This +method relies on possibility distributions to interpret the epistemic +uncertainty of the cost volume. Our method has the benefit of having a +white-box nature, differing in this respect from current state-of-the-art deep +neural networks approaches. The accuracy and size of confidence intervals are +validated using the Middlebury stereo datasets as well as a dataset of +satellite images. This contribution is freely available on GitHub. + +
+
+
+
+
+ + ☆ 3D Geometry-aware Deformable Gaussian Splatting for Dynamic View + Synthesis CVPR 2024 + + +
+ In this paper, we propose a 3D geometry-aware deformable Gaussian Splatting +method for dynamic view synthesis. Existing neural radiance fields (NeRF) based +solutions learn the deformation in an implicit manner, which cannot incorporate +3D scene geometry. Therefore, the learned deformation is not necessarily +geometrically coherent, which results in unsatisfactory dynamic view synthesis +and 3D dynamic reconstruction. Recently, 3D Gaussian Splatting provides a new +representation of the 3D scene, building upon which the 3D geometry could be +exploited in learning the complex 3D deformation. Specifically, the scenes are +represented as a collection of 3D Gaussian, where each 3D Gaussian is optimized +to move and rotate over time to model the deformation. To enforce the 3D scene +geometry constraint during deformation, we explicitly extract 3D geometry +features and integrate them in learning the 3D deformation. In this way, our +solution achieves 3D geometry-aware deformation modeling, which enables +improved dynamic view synthesis and 3D dynamic reconstruction. Extensive +experimental results on both synthetic and real datasets prove the superiority +of our solution, which achieves new state-of-the-art performance. + The project is available at https://npucvr.github.io/GaGS/ + +
+
+ comment: Accepted by CVPR 2024. Project page: https://npucvr.github.io/GaGS/ +
+
+
+
+
+ + ☆ Spatial-Temporal Multi-level Association for Video Object Segmentation + + +
+ Existing semi-supervised video object segmentation methods either focus on +temporal feature matching or spatial-temporal feature modeling. However, they +do not address the issues of sufficient target interaction and efficient +parallel processing simultaneously, thereby constraining the learning of +dynamic, target-aware features. To tackle these limitations, this paper +proposes a spatial-temporal multi-level association framework, which jointly +associates reference frame, test frame, and object features to achieve +sufficient interaction and parallel target ID association with a +spatial-temporal memory bank for efficient video object segmentation. +Specifically, we construct a spatial-temporal multi-level feature association +module to learn better target-aware features, which formulates feature +extraction and interaction as the efficient operations of object +self-attention, reference object enhancement, and test reference correlation. +In addition, we propose a spatial-temporal memory to assist feature association +and temporal ID assignment and correlation. We evaluate the proposed method by +conducting extensive experiments on numerous video object segmentation +datasets, including DAVIS 2016/2017 val, DAVIS 2017 test-dev, and YouTube-VOS +2018/2019 val. The favorable performance against the state-of-the-art methods +demonstrates the effectiveness of our approach. All source code and trained +models will be made publicly available. + +
+
+
+
+
+ + ☆ Playing to Vision Foundation Model's Strengths in Stereo Matching + + +
+ Stereo matching has become a key technique for 3D environment perception in +intelligent vehicles. For a considerable time, convolutional neural networks +(CNNs) have remained the mainstream choice for feature extraction in this +domain. Nonetheless, there is a growing consensus that the existing paradigm +should evolve towards vision foundation models (VFM), particularly those +developed based on vision Transformers (ViTs) and pre-trained through +self-supervision on extensive, unlabeled datasets. While VFMs are adept at +extracting informative, general-purpose visual features, specifically for dense +prediction tasks, their performance often lacks in geometric vision tasks. This +study serves as the first exploration of a viable approach for adapting VFMs to +stereo matching. Our ViT adapter, referred to as ViTAS, is constructed upon +three types of modules: spatial differentiation, patch attention fusion, and +cross-attention. The first module initializes feature pyramids, while the +latter two aggregate stereo and multi-scale contextual information into +fine-grained features, respectively. ViTAStereo, which combines ViTAS with cost +volume-based stereo matching back-end processes, achieves the top rank on the +KITTI Stereo 2012 dataset and outperforms the second-best network StereoBase by +approximately 7.9% in terms of the percentage of error pixels, with a tolerance +of 3 pixels. Additional experiments across diverse scenarios further +demonstrate its superior generalizability compared to all other +state-of-the-art approaches. We believe this new paradigm will pave the way for +the next generation of stereo matching networks. + +
+
+
+
+
+ + ☆ Robust feature knowledge distillation for enhanced performance of + lightweight crack segmentation models + + +
+ Vision-based crack detection faces deployment challenges due to the size of
+robust models and edge device limitations. These can be addressed with
+lightweight models trained with knowledge distillation (KD). However,
+state-of-the-art (SOTA) KD methods compromise anti-noise robustness. This paper
+develops Robust Feature Knowledge Distillation (RFKD), a framework to improve
+robustness while retaining the precision of lightweight models for crack
+segmentation. RFKD distils knowledge from a teacher model's logit layers and
+intermediate feature maps while leveraging mixed clean and noisy images to
+transfer robust patterns to the student model, improving its precision,
+generalisation, and anti-noise performance. To validate the proposed RFKD, a
+lightweight crack segmentation model, PoolingCrack Tiny (PCT), with only 0.5 M
+parameters, is also designed and used as the student to run the framework. The
+results show a significant enhancement on noisy images, with RFKD achieving a
+62% improvement in mean Dice score (mDS) over SOTA KD methods.
+
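+ As a rough, hedged sketch of the kind of objective the abstract describes
+(logit-level and feature-level distillation on a mix of clean and
+noise-perturbed images), the PyTorch snippet below combines the two terms; the
+model interfaces, the Gaussian noise model and the weighting are assumptions,
+not the authors' implementation.
+
+import torch
+import torch.nn.functional as F
+
+def robust_kd_loss(student, teacher, clean_images,
+                   noise_std=0.1, temperature=4.0, alpha=0.5):
+    # Both models are assumed to return (logits, feature_map).
+    noisy = clean_images + noise_std * torch.randn_like(clean_images)
+    x = torch.cat([clean_images, noisy], dim=0)   # mixed clean/noisy batch
+    with torch.no_grad():
+        t_logits, t_feat = teacher(x)
+    s_logits, s_feat = student(x)
+    # Logit-level distillation with softened targets.
+    kd_logit = F.kl_div(F.log_softmax(s_logits / temperature, dim=1),
+                        F.softmax(t_logits / temperature, dim=1),
+                        reduction="batchmean") * temperature ** 2
+    # Intermediate feature-map distillation.
+    kd_feat = F.mse_loss(s_feat, t_feat)
+    return alpha * kd_logit + (1.0 - alpha) * kd_feat
+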
+
+ comment: 24 pages, 13 figures +
+
+
+
+
+ + ☆ Label-Efficient 3D Object Detection For Road-Side Units + + +
+ Occlusion presents a significant challenge for safety-critical applications +such as autonomous driving. Collaborative perception has recently attracted a +large research interest thanks to the ability to enhance the perception of +autonomous vehicles via deep information fusion with intelligent roadside units +(RSU), thus minimizing the impact of occlusion. While significant advancement +has been made, the data-hungry nature of these methods creates a major hurdle +for their real-world deployment, particularly due to the need for annotated RSU +data. Manually annotating the vast amount of RSU data required for training is +prohibitively expensive, given the sheer number of intersections and the effort +involved in annotating point clouds. We address this challenge by devising a +label-efficient object detection method for RSU based on unsupervised object +discovery. Our paper introduces two new modules: one for object discovery based +on a spatial-temporal aggregation of point clouds, and another for refinement. +Furthermore, we demonstrate that fine-tuning on a small portion of annotated +data allows our object discovery models to narrow the performance gap with, or +even surpass, fully supervised models. Extensive experiments are carried out in +simulated and real-world datasets to evaluate our method. + +
+
+ comment: IV 2024 +
+
+
+
+
+ + ☆ From Barlow Twins to Triplet Training: Differentiating Dementia with + Limited Data + + +
+ Differential diagnosis of dementia is challenging due to overlapping +symptoms, with structural magnetic resonance imaging (MRI) being the primary +method for diagnosis. Despite the clinical value of computer-aided differential +diagnosis, research has been limited, mainly due to the absence of public +datasets that contain diverse types of dementia. This leaves researchers with +small in-house datasets that are insufficient for training deep neural networks +(DNNs). Self-supervised learning shows promise for utilizing unlabeled MRI +scans in training, but small batch sizes for volumetric brain scans make its +application challenging. To address these issues, we propose Triplet Training +for differential diagnosis with limited target data. It consists of three key +stages: (i) self-supervised pre-training on unlabeled data with Barlow Twins, +(ii) self-distillation on task-related data, and (iii) fine-tuning on the +target dataset. Our approach significantly outperforms traditional training +strategies, achieving a balanced accuracy of 75.6%. We further provide insights +into the training process by visualizing changes in the latent space after each +step. Finally, we validate the robustness of Triplet Training in terms of its +individual components in a comprehensive ablation study. Our code is available +at https://github.com/ai-med/TripletTraining. + +
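+ Stage (i) of the pipeline relies on Barlow Twins; as a reminder of that
+self-supervised objective, below is a compact PyTorch version of the Barlow
+Twins loss, which drives the cross-correlation matrix of two augmented views
+towards the identity. The off-diagonal weight is a typical value, not
+necessarily the one used in this paper.
+
+import torch
+
+def barlow_twins_loss(z1, z2, lambd=5e-3):
+    # z1, z2: (N, D) embeddings of two augmented views of the same scans.
+    n, _ = z1.shape
+    z1 = (z1 - z1.mean(0)) / (z1.std(0) + 1e-6)   # standardize each dimension
+    z2 = (z2 - z2.mean(0)) / (z2.std(0) + 1e-6)
+    c = (z1.T @ z2) / n                           # D x D cross-correlation
+    diag = torch.diagonal(c)
+    on_diag = (diag - 1).pow(2).sum()             # push diagonal towards 1
+    off_diag = c.pow(2).sum() - diag.pow(2).sum() # push off-diagonal towards 0
+    return on_diag + lambd * off_diag
+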
+
+ comment: Accepted for presentation at MIDL 2024 +
+
+
+
+
+ + ☆ ColorMNet: A Memory-based Deep Spatial-Temporal Feature Propagation + Network for Video Colorization + + +
+ How to effectively explore spatial-temporal features is important for video +colorization. Instead of stacking multiple frames along the temporal dimension +or recurrently propagating estimated features that will accumulate errors or +cannot explore information from far-apart frames, we develop a memory-based +feature propagation module that can establish reliable connections with +features from far-apart frames and alleviate the influence of inaccurately +estimated features. To extract better features from each frame for the +above-mentioned feature propagation, we explore the features from +large-pretrained visual models to guide the feature estimation of each frame so +that the estimated features can model complex scenarios. In addition, we note +that adjacent frames usually contain similar contents. To explore this property +for better spatial and temporal feature utilization, we develop a local +attention module to aggregate the features from adjacent frames in a +spatial-temporal neighborhood. We formulate our memory-based feature +propagation module, large-pretrained visual model guided feature estimation +module, and local attention module into an end-to-end trainable network (named +ColorMNet) and show that it performs favorably against state-of-the-art methods +on both the benchmark datasets and real-world scenarios. The source code and +pre-trained models will be available at +\url{https://github.com/yyang181/colormnet}. + +
+
+ comment: Project website: \url{https://github.com/yyang181/colormnet} +
+
+
+
+
+ + ☆ LRR: Language-Driven Resamplable Continuous Representation against + Adversarial Tracking Attacks + + +
+ Visual object tracking plays a critical role in visual-based autonomous +systems, as it aims to estimate the position and size of the object of interest +within a live video. Despite significant progress made in this field, +state-of-the-art (SOTA) trackers often fail when faced with adversarial +perturbations in the incoming frames. This can lead to significant robustness +and security issues when these trackers are deployed in the real world. To +achieve high accuracy on both clean and adversarial data, we propose building a +spatial-temporal continuous representation using the semantic text guidance of +the object of interest. This novel continuous representation enables us to +reconstruct incoming frames to maintain semantic and appearance consistency +with the object of interest and its clean counterparts. As a result, our +proposed method successfully defends against different SOTA adversarial +tracking attacks while maintaining high accuracy on clean data. In particular, +our method significantly increases tracking accuracy under adversarial attacks +with around 90% relative improvement on UAV123, which is even higher than the +accuracy on clean data. + +
+
+
+
+
+ + ☆ GHNeRF: Learning Generalizable Human Features with Efficient Neural + Radiance Fields + + +
+ Recent advances in Neural Radiance Fields (NeRF) have demonstrated promising
+results in 3D scene representations, including 3D human representations.
+However, these representations often lack crucial information on the underlying
+human pose and structure, which is essential for AR/VR applications and games.
+In this paper, we introduce a novel approach, termed GHNeRF, designed to
+address these limitations by learning 2D/3D joint locations of human subjects
+with NeRF representation. GHNeRF uses a pre-trained 2D encoder streamlined to
+extract essential human features from 2D images, which are then incorporated
+into the NeRF framework in order to encode human biomechanic features. This
+allows our network to simultaneously learn biomechanic features, such as joint
+locations, along with human geometry and texture. To assess the effectiveness
+of our method, we conduct a comprehensive comparison with state-of-the-art
+human NeRF techniques and joint estimation algorithms. Our results show that
+GHNeRF can achieve state-of-the-art results in near real-time.
+
+
+
+
+
+ + ☆ Anchor-based Robust Finetuning of Vision-Language Models CVPR2024 + + +
+ We aim to finetune a vision-language model without hurting its
+out-of-distribution (OOD) generalization. We address two types of OOD
+generalization, i.e., i) domain shift such as natural to sketch images, and ii)
+zero-shot capability to recognize the category that was not contained in the
+finetune data. Arguably, the diminished OOD generalization after finetuning
+stems from the excessively simplified finetuning target, which only provides
+the class information, such as ``a photo of a [CLASS]''. This is distinct from
+the process by which CLIP was pretrained, where there is abundant text
+supervision with rich semantic information. Therefore, we propose to compensate
+for the finetune process using auxiliary supervision with rich semantic
+information, which acts as anchors to preserve the OOD generalization.
+Specifically, two types of anchors are elaborated in our method, including i) a
+text-compensated anchor, which uses the images from the finetune set but
+enriches the text supervision with a pretrained captioner, and ii) an
+image-text-pair anchor, retrieved from a dataset similar to the pretraining
+data of CLIP according to the downstream task and associated with the original
+CLIP text carrying rich semantics. Those anchors are utilized as auxiliary
+semantic information to maintain the original feature space of CLIP, thereby
+preserving the OOD generalization capabilities. Comprehensive experiments
+demonstrate that our method achieves in-distribution performance akin to
+conventional finetuning while attaining new state-of-the-art results on domain
+shift and zero-shot learning benchmarks.
+
+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ ActNetFormer: Transformer-ResNet Hybrid Method for Semi-Supervised + Action Recognition in Videos + + +
+ Human action or activity recognition in videos is a fundamental task in +computer vision with applications in surveillance and monitoring, self-driving +cars, sports analytics, human-robot interaction and many more. Traditional +supervised methods require large annotated datasets for training, which are +expensive and time-consuming to acquire. This work proposes a novel approach +using Cross-Architecture Pseudo-Labeling with contrastive learning for +semi-supervised action recognition. Our framework leverages both labeled and +unlabelled data to robustly learn action representations in videos, combining +pseudo-labeling with contrastive learning for effective learning from both +types of samples. We introduce a novel cross-architecture approach where 3D +Convolutional Neural Networks (3D CNNs) and video transformers (VIT) are +utilised to capture different aspects of action representations; hence we call +it ActNetFormer. The 3D CNNs excel at capturing spatial features and local +dependencies in the temporal domain, while VIT excels at capturing long-range +dependencies across frames. By integrating these complementary architectures +within the ActNetFormer framework, our approach can effectively capture both +local and global contextual information of an action. This comprehensive +representation learning enables the model to achieve better performance in +semi-supervised action recognition tasks by leveraging the strengths of each of +these architectures. Experimental results on standard action recognition +datasets demonstrate that our approach performs better than the existing +methods, achieving state-of-the-art performance with only a fraction of labeled +data. The official website of this work is available at: +https://github.com/rana2149/ActNetFormer. + +
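+ To make the cross-architecture pseudo-labeling idea concrete, here is a small,
+hedged PyTorch sketch in which a 3D CNN and a video transformer supervise each
+other on unlabeled clips whose predictions are sufficiently confident; the
+confidence threshold and the mutual-supervision form are illustrative
+assumptions rather than the exact ActNetFormer recipe.
+
+import torch
+import torch.nn.functional as F
+
+def cross_arch_pseudo_label_loss(cnn_logits, vit_logits, threshold=0.8):
+    # cnn_logits, vit_logits: (N, C) predictions of the two branches on the
+    # same unlabeled video clips.
+    def one_direction(teacher_logits, student_logits):
+        probs = F.softmax(teacher_logits, dim=1)
+        conf, pseudo = probs.max(dim=1)
+        mask = conf > threshold               # keep only confident pseudo-labels
+        if not mask.any():
+            return student_logits.new_tensor(0.0)
+        return F.cross_entropy(student_logits[mask], pseudo[mask])
+    # Each branch is supervised by the other's (detached) confident predictions.
+    return (one_direction(cnn_logits.detach(), vit_logits) +
+            one_direction(vit_logits.detach(), cnn_logits))
+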
+
+ comment: Submitted for peer review +
+
+
+
+
+ + ☆ Hyperparameter-Free Medical Image Synthesis for Sharing Data and + Improving Site-Specific Segmentation + + +
+ Sharing synthetic medical images is a promising alternative to sharing real +images that can improve patient privacy and data security. To get good results, +existing methods for medical image synthesis must be manually adjusted when +they are applied to unseen data. To remove this manual burden, we introduce a +Hyperparameter-Free distributed learning method for automatic medical image +Synthesis, Sharing, and Segmentation called HyFree-S3. For three diverse +segmentation settings (pelvic MRIs, lung X-rays, polyp photos), the use of +HyFree-S3 results in improved performance over training only with site-specific +data (in the majority of cases). The hyperparameter-free nature of the method +should make data synthesis and sharing easier, potentially leading to an +increase in the quantity of available data and consequently the quality of the +models trained that may ultimately be applied in the clinic. Our code is +available at https://github.com/AwesomeLemon/HyFree-S3 + +
+
+ comment: Accepted at MIDL 2024 +
+
+
+
+
+ + ☆ Automatic Defect Detection in Sewer Network Using Deep Learning Based + Object Detector + + +
+ Maintaining sewer systems in large cities is important, but also time- and
+effort-consuming, because visual inspections are currently done manually. To
+reduce the amount of aforementioned manual work, defects within sewer pipes
+should be located and classified automatically. In the past, multiple works
+have attempted solving this problem using classical image processing, machine
+learning, or a combination of those. However, each provided solution only
+focuses on detecting a limited set of defect/structure types, such as fissure,
+root, and/or connection. Furthermore, due to the use of hand-crafted features
+and small training datasets, generalization is also problematic. In order to
+overcome these deficits, a sizable dataset with 14.7 km of various sewer pipes
+was annotated by sewer maintenance experts in the scope of this work. On top
+of that, an object detector (EfficientDet-D0) was trained for automatic defect
+detection. From the results of several experiments, peculiar properties of
+defects in the context of object detection, which greatly affect the annotation
+and training process, are found and discussed. At the end, the final detector
+was able to detect 83% of defects in the test set; out of the missing 17%, only
+0.77% are very severe defects. This work provides an example of applying deep
+learning-based object detection to an important but quiet engineering field.
+It also gives some practical pointers on how to annotate peculiar "objects",
+such as defects.
+
+
+
+
+
+ + ☆ OmniFusion Technical Report + + +
+ Last year, multimodal architectures served up a revolution in AI-based +approaches and solutions, extending the capabilities of large language models +(LLM). We propose an \textit{OmniFusion} model based on a pretrained LLM and +adapters for visual modality. We evaluated and compared several architecture +design principles for better text and visual data coupling: MLP and transformer +adapters, various CLIP ViT-based encoders (SigLIP, InternVIT, etc.), and their +fusing approach, image encoding method (whole image or tiles encoding) and two +7B LLMs (the proprietary one and open-source Mistral). Experiments on 8 +visual-language benchmarks show the top score for the best OmniFusion setup in +terms of different VQA tasks in comparison with open-source LLaVA-like +solutions: VizWiz, Pope, MM-Vet, ScienceQA, MMBench, TextVQA, VQAv2, MMMU. We +also propose a variety of situations, where OmniFusion provides highly-detailed +answers in different domains: housekeeping, sightseeing, culture, medicine, +handwritten and scanned equations recognition, etc. Mistral-based OmniFusion +model is an open-source solution with weights, training and inference scripts +available at https://github.com/AIRI-Institute/OmniFusion. + +
+
+ comment: 17 pages, 4 figures, 9 tables, 2 appendices +
+
+
+
+
+ + ☆ Unified Physical-Digital Attack Detection Challenge + + +
+ Face Anti-Spoofing (FAS) is crucial to safeguard Face Recognition (FR) +Systems. In real-world scenarios, FRs are confronted with both physical and +digital attacks. However, existing algorithms often address only one type of +attack at a time, which poses significant limitations in real-world scenarios +where FR systems face hybrid physical-digital threats. To facilitate the +research of Unified Attack Detection (UAD) algorithms, a large-scale +UniAttackData dataset has been collected. UniAttackData is the largest public +dataset for Unified Attack Detection, with a total of 28,706 videos, where each +unique identity encompasses all advanced attack types. Based on this dataset, +we organized a Unified Physical-Digital Face Attack Detection Challenge to +boost the research in Unified Attack Detections. It attracted 136 teams for the +development phase, with 13 qualifying for the final round. The results +re-verified by the organizing team were used for the final ranking. This paper +comprehensively reviews the challenge, detailing the dataset introduction, +protocol definition, evaluation criteria, and a summary of published results. +Finally, we focus on the detailed analysis of the highest-performing algorithms +and offer potential directions for unified physical-digital attack detection +inspired by this competition. Challenge Website: +https://sites.google.com/view/face-anti-spoofing-challenge/welcome/challengecvpr2024. + +
+
+ comment: 11 pages, 10 figures +
+
+
+
+
+ + ☆ Leveraging edge detection and neural networks for better UAV + localization + + +
+ We propose a novel method for geolocalizing Unmanned Aerial Vehicles (UAVs) +in environments lacking Global Navigation Satellite Systems (GNSS). Current +state-of-the-art techniques employ an offline-trained encoder to generate a +vector representation (embedding) of the UAV's current view, which is then +compared with pre-computed embeddings of geo-referenced images to determine the +UAV's position. Here, we demonstrate that the performance of these methods can +be significantly enhanced by preprocessing the images to extract their edges, +which exhibit robustness to seasonal and illumination variations. Furthermore, +we establish that utilizing edges enhances resilience to orientation and +altitude inaccuracies. Additionally, we introduce a confidence criterion for +localization. Our findings are substantiated through synthetic experiments. + +
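+ The core recipe in the abstract (extract edges, embed, then retrieve the
+closest geo-referenced tile) can be sketched in a few lines; the snippet below
+uses OpenCV's Canny detector and cosine similarity as a plausible reading of
+that pipeline, with `encoder` standing in for any offline-trained embedding
+network (a hypothetical placeholder, not the authors' model).
+
+import cv2
+import numpy as np
+
+def edge_embedding(image_bgr, encoder):
+    # Edge maps are more stable than raw pixels under seasonal and
+    # illumination changes, which is the motivation given in the abstract.
+    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
+    edges = cv2.Canny(gray, threshold1=100, threshold2=200)
+    return encoder(edges)                     # 1-D embedding of the edge map
+
+def localize(uav_embedding, reference_embeddings):
+    # Return the index of the geo-referenced tile with the most similar
+    # edge embedding (cosine similarity).
+    ref = reference_embeddings / np.linalg.norm(reference_embeddings,
+                                                axis=1, keepdims=True)
+    q = uav_embedding / np.linalg.norm(uav_embedding)
+    return int(np.argmax(ref @ q))
+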
+
+ comment: Accepted for publication in IGARSS2024. 4 pages, 3 figures, 3 tables +
+
+
+
+
+ + ☆ Automated National Urban Map Extraction + + +
+ Developing countries usually lack the proper governance means to generate and
+regularly update a national rooftop map. Using traditional photogrammetry and
+surveying methods to produce a building map at the federal level is costly and
+time-consuming. Using earth observation and deep learning methods, we can
+bridge this gap and propose an automated pipeline to produce such national
+urban maps. This paper aims to exploit the power of fully convolutional neural
+networks for multi-class buildings' instance segmentation to achieve high
+object-wise accuracy. Buildings' instance segmentation from sub-meter
+high-resolution satellite images can be achieved with relatively high
+pixel-wise metric scores. We detail all engineering steps to replicate this
+work and ensure highly accurate results in dense and slum areas witnessed in
+regions that lack proper urban planning in the Global South. We applied the
+proposed pipeline to Lebanon as a case study and successfully produced the
+first comprehensive national building footprint map with approximately 1
+million units at 84% accuracy. The proposed architecture relies on advanced
+augmentation techniques to overcome dataset scarcity, which is often the case
+in developing countries.
+
+
+
+
+
+ + ☆ Exploring the Potential of Large Foundation Models for Open-Vocabulary + HOI Detection + + +
+ Open-vocabulary human-object interaction (HOI) detection, which is concerned +with the problem of detecting novel HOIs guided by natural language, is crucial +for understanding human-centric scenes. However, prior zero-shot HOI detectors +often employ the same levels of feature maps to model HOIs with varying +distances, leading to suboptimal performance in scenes containing human-object +pairs with a wide range of distances. In addition, these detectors primarily +rely on category names and overlook the rich contextual information that +language can provide, which is essential for capturing open vocabulary concepts +that are typically rare and not well-represented by category names alone. In +this paper, we introduce a novel end-to-end open vocabulary HOI detection +framework with conditional multi-level decoding and fine-grained semantic +enhancement (CMD-SE), harnessing the potential of Visual-Language Models +(VLMs). Specifically, we propose to model human-object pairs with different +distances with different levels of feature maps by incorporating a soft +constraint during the bipartite matching process. Furthermore, by leveraging +large language models (LLMs) such as GPT models, we exploit their extensive +world knowledge to generate descriptions of human body part states for various +interactions. Then we integrate the generalizable and fine-grained semantics of +human body parts to improve interaction recognition. Experimental results on +two datasets, SWIG-HOI and HICO-DET, demonstrate that our proposed method +achieves state-of-the-art results in open vocabulary HOI detection. The code +and models are available at https://github.com/ltttpku/CMD-SE-release. + +
+
+
+
+
+ + ☆ EPL: Evidential Prototype Learning for Semi-supervised Medical Image + Segmentation + + +
+ Although current semi-supervised medical segmentation methods can achieve
+decent performance, they are still affected by the uncertainty in unlabeled
+data and model predictions, and there is currently a lack of effective
+strategies that can explore the uncertain aspects of both simultaneously. To
+address the aforementioned issues, we propose Evidential Prototype Learning
+(EPL), which utilizes an extended probabilistic framework to effectively fuse
+voxel probability predictions from different sources and achieves prototype
+fusion utilization of labeled and unlabeled data under a generalized evidential
+framework, leveraging voxel-level dual uncertainty masking. The uncertainty not
+only enables the model to self-correct predictions but also improves the guided
+learning process with pseudo-labels and is able to feed back into the
+construction of hidden features. The proposed method has been evaluated on the
+LA, Pancreas-CT and TBAD datasets, achieving state-of-the-art performance at
+three different labeled ratios, which strongly demonstrates the effectiveness
+of our strategy.
+
+
+
+
+
+ + ☆ YOLC: You Only Look Clusters for Tiny Object Detection in Aerial Images + + +
+ Detecting objects from aerial images poses significant challenges due to the +following factors: 1) Aerial images typically have very large sizes, generally +with millions or even hundreds of millions of pixels, while computational +resources are limited. 2) Small object size leads to insufficient information +for effective detection. 3) Non-uniform object distribution leads to +computational resource wastage. To address these issues, we propose YOLC (You +Only Look Clusters), an efficient and effective framework that builds on an +anchor-free object detector, CenterNet. To overcome the challenges posed by +large-scale images and non-uniform object distribution, we introduce a Local +Scale Module (LSM) that adaptively searches cluster regions for zooming in for +accurate detection. Additionally, we modify the regression loss using Gaussian +Wasserstein distance (GWD) to obtain high-quality bounding boxes. Deformable +convolution and refinement methods are employed in the detection head to +enhance the detection of small objects. We perform extensive experiments on two +aerial image datasets, including Visdrone2019 and UAVDT, to demonstrate the +effectiveness and superiority of our proposed approach. + +
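+ The abstract mentions replacing the regression loss with a Gaussian
+Wasserstein distance (GWD). For axis-aligned boxes modeled as 2D Gaussians
+(mean at the box center, standard deviations of half the width and height),
+the squared 2-Wasserstein distance has a simple closed form; the sketch below
+uses it together with a bounded normalization that is a common choice in
+GWD-based losses, though not necessarily the exact variant used in YOLC.
+
+import torch
+
+def gwd_box_loss(pred, target, tau=1.0):
+    # pred, target: (..., 4) boxes in (cx, cy, w, h) format.
+    mu_p, mu_t = pred[..., :2], target[..., :2]
+    sd_p, sd_t = pred[..., 2:] / 2.0, target[..., 2:] / 2.0
+    # Closed-form squared Wasserstein distance for diagonal covariances.
+    w2 = ((mu_p - mu_t) ** 2).sum(-1) + ((sd_p - sd_t) ** 2).sum(-1)
+    # Map the unbounded distance to a bounded, scale-insensitive loss.
+    return 1.0 - 1.0 / (tau + torch.log1p(w2))
+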
+
+ comment: accepted to TITS +
+
+
+
+
+ + ☆ Uncertainty-aware Evidential Fusion-based Learning for Semi-supervised + Medical Image Segmentation + + +
+ Although the existing uncertainty-based semi-supervised medical segmentation
+methods have achieved excellent performance, they usually only consider a
+single uncertainty evaluation, which often fails to solve the problem related
+to credibility completely. Therefore, based on the framework of evidential deep
+learning, this paper integrates the evidential predictive results in the
+cross-region of mixed and original samples to reallocate the confidence degree
+and uncertainty measure of each voxel, which is realized by emphasizing
+uncertain information in the probability-assignment fusion rule of traditional
+evidence theory. Furthermore, we design a voxel-level asymptotic learning
+strategy that combines information entropy with the fused uncertainty measure
+to estimate voxel predictions more precisely. During learning, the model
+gradually attends to predictions with high uncertainty in order to learn the
+features that are difficult to master. The experimental results on the LA,
+Pancreas-CT, ACDC and TBAD datasets demonstrate the superior performance of our
+proposed method in comparison with the existing state of the art.
+
+
+
+
+
+ + ☆ Improving Interpretable Embeddings for Ad-hoc Video Search with + Generative Captions and Multi-word Concept Bank ICMR2024 + + +
+ Aligning a user query with video clips in a cross-modal latent space, and
+aligning both with semantic concepts, are the two mainstream approaches for
+ad-hoc video search (AVS). However, the effectiveness of existing approaches is
+bottlenecked by the small sizes of available video-text datasets and the low
+quality of concept banks, which results in failures on unseen queries and the
+out-of-vocabulary problem. This paper addresses these two problems by
+constructing a new dataset and developing a multi-word concept bank.
+Specifically, capitalizing on a generative model, we construct a new dataset
+consisting of 7 million generated text and video pairs for pre-training. To
+tackle the out-of-vocabulary problem, we develop a multi-word concept bank
+based on syntax analysis to enhance the capability of a state-of-the-art
+interpretable AVS method in modeling relationships between query words. We also
+study the impact of current advanced features on the method. Experimental
+results show that the integration of the above-proposed elements doubles the
+R@1 performance of the AVS method on the MSRVTT dataset and improves the xinfAP
+on the TRECVid AVS query sets for 2016-2023 (eight years) by margins ranging
+from 2% to 77%, with an average of about 20%.
+
+
+ comment: Accepted in ICMR2024 +
+
+
+
+
+ + ☆ Enhanced Radar Perception via Multi-Task Learning: Towards Refined Data + for Sensor Fusion Applications + + +
+ Radar and camera fusion yields robustness in perception tasks by leveraging +the strength of both sensors. The typical extracted radar point cloud is 2D +without height information due to insufficient antennas along the elevation +axis, which challenges the network performance. This work introduces a +learning-based approach to infer the height of radar points associated with 3D +objects. A novel robust regression loss is introduced to address the sparse +target challenge. In addition, a multi-task training strategy is employed, +emphasizing important features. The average radar absolute height error +decreases from 1.69 to 0.25 meters compared to the state-of-the-art height +extension method. The estimated target height values are used to preprocess and +enrich radar data for downstream perception tasks. Integrating this refined +radar information further enhances the performance of existing radar camera +fusion models for object detection and depth estimation tasks. + +
+
+ comment: Accepted by IEEE Intelligent Vehicles Symposium (IV 2024) +
+
+
+
+
+ + ☆ Efficient and Robust Point Cloud Registration via Heuristics-guided + Parameter Search + + +
+ Estimating the rigid transformation with 6 degrees of freedom based on a
+putative 3D correspondence set is a crucial procedure in point cloud
+registration. Existing correspondence identification methods usually lead to
+large outlier ratios ($>$ 95 $\%$ is common), underscoring the significance of
+robust registration methods. Many researchers turn to parameter search-based
+strategies (e.g., Branch-and-Bound) for robust registration. Although related
+methods show high robustness, their efficiency is limited by the
+high-dimensional search space. This paper proposes a heuristics-guided
+parameter search strategy to accelerate the search while maintaining high
+robustness. We first sample some correspondences (i.e., heuristics) and then
+only need to sequentially search the feasible regions that make each sample an
+inlier. Our strategy largely reduces the search space and can guarantee
+accuracy with only a few inlier samples, therefore enjoying an excellent
+trade-off between efficiency and robustness. Since directly parameterizing the
+6-dimensional nonlinear feasible region for efficient search is intractable, we
+construct a three-stage decomposition pipeline to reparameterize the feasible
+region, resulting in three lower-dimensional sub-problems that are easily
+solvable via our strategy. Besides reducing the search dimension, our
+decomposition enables the use of 1-dimensional interval stabbing at all three
+stages to accelerate the search. Moreover, we propose a valid sampling
+strategy to guarantee our sampling effectiveness, and a compatibility
+verification setup to further accelerate our search. Extensive experiments on
+both simulated and real-world datasets demonstrate that our approach exhibits
+comparable robustness with state-of-the-art methods while achieving a
+significant efficiency boost.
+
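+ Since 1-dimensional interval stabbing is the workhorse mentioned for all
+three search stages, the classic sweep-line routine is sketched below as
+general background: it returns a parameter value contained in the maximum
+number of candidate intervals (the toy intervals in the example are
+illustrative, not data from the paper).
+
+def interval_stabbing(intervals):
+    # intervals: list of (lo, hi) closed intervals on the real line.
+    events = []
+    for lo, hi in intervals:
+        events.append((lo, 0))   # interval opens (sorts before a close at ties)
+        events.append((hi, 1))   # interval closes
+    events.sort()
+    best_point, best_count, count = None, 0, 0
+    for x, kind in events:
+        if kind == 0:
+            count += 1
+            if count > best_count:
+                best_count, best_point = count, x
+        else:
+            count -= 1
+    return best_point, best_count
+
+# Example: 0.25 stabs three of the four intervals.
+print(interval_stabbing([(0.1, 0.4), (0.2, 0.3), (0.25, 0.9), (0.5, 0.7)]))
+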
+
+ comment: 21 pages, 16 figures. Accepted to IEEE Transactions on Pattern + Analysis and Machine Intelligence, 2024 +
+
+
+
+
+ + ☆ Concise Plane Arrangements for Low-Poly Surface and Volume Modelling + + +
+ Plane arrangements are a useful tool for surface and volume modelling. +However, their main drawback is poor scalability. We introduce two key +novelties that enable the construction of plane arrangements for complex +objects and entire scenes: an ordering scheme for the plane insertion and the +direct use of input points during arrangement construction. Both ingredients +reduce the number of unwanted splits, resulting in improved scalability of the +construction mechanism by up to two orders of magnitude compared to existing +algorithms. We further introduce a remeshing and simplification technique that +allows us to extract low-polygon surface meshes and lightweight convex +decompositions of volumes from the arrangement. We show that our approach leads +to state-of-the-art results for the aforementioned tasks by comparing it to +learning-based and traditional approaches on various different datasets. Our +implementation is available at https://github.com/raphaelsulzer/compod . + +
+
+
+
+
+ + ☆ HFNeRF: Learning Human Biomechanic Features with Neural Radiance Fields + + +
+ In recent advancements in novel view synthesis, generalizable Neural Radiance
+Fields (NeRF) based methods applied to human subjects have shown remarkable
+results in generating novel views from few images. However, this generalization
+ability cannot capture the underlying structural features of the skeleton
+shared across all instances. Building upon this, we introduce HFNeRF: a novel
+generalizable human feature NeRF aimed at generating human biomechanic features
+using a pre-trained image encoder. While previous human NeRF methods have shown
+promising results in the generation of photorealistic virtual avatars, such
+methods lack underlying human structure or biomechanic features such as
+skeleton or joint information that are crucial for downstream applications
+including Augmented Reality (AR)/Virtual Reality (VR). HFNeRF leverages 2D
+pre-trained foundation models to learn human features in 3D using neural
+rendering, and then uses volume rendering to generate 2D feature maps. We
+evaluate HFNeRF in the skeleton estimation task by predicting heatmaps as
+features. The proposed method is fully differentiable, allowing it to learn
+color, geometry, and the human skeleton simultaneously. This paper presents
+preliminary results of HFNeRF, illustrating its potential in generating
+realistic virtual avatars with biomechanic features using NeRF.
+
+
+
+
+
+ + ☆ DiffHarmony: Latent Diffusion Model Meets Image Harmonization ICMR 2024 + + +
+ Image harmonization, which involves adjusting the foreground of a composite
+image to attain a unified visual consistency with the background, can be
+conceptualized as an image-to-image translation task. Diffusion models have
+recently promoted the rapid development of image-to-image translation tasks.
+However, training diffusion models from scratch is computationally intensive.
+Fine-tuning pre-trained latent diffusion models entails dealing with the
+reconstruction error induced by the image compression autoencoder, making it
+unsuitable for image generation tasks that involve pixel-level evaluation
+metrics. To deal with these issues, in this paper, we first adapt a pre-trained
+latent diffusion model to the image harmonization task to generate the
+harmonious but potentially blurry initial images. Then we implement two
+strategies: utilizing higher-resolution images during inference and
+incorporating an additional refinement stage, to further enhance the clarity of
+the initially harmonized images. Extensive experiments on iHarmony4 datasets
+demonstrate the superiority of our proposed method. The code and model will be
+made publicly available at https://github.com/nicecv/DiffHarmony.
+
+
+ comment: Accepted by ICMR 2024 +
+
+
+
+
+ + ☆ Mansformer: Efficient Transformer of Mixed Attention for Image + Deblurring and Beyond + + +
+ The Transformer has achieved enormous success in natural language processing
+and high-level vision over the past few years. However, the complexity of
+self-attention is quadratic to the image size, which makes it infeasible for
+high-resolution vision tasks. In this paper, we propose the Mansformer, a
+Transformer of mixed attention that combines multiple self-attentions, gates,
+and multi-layer perceptrons (MLPs), to explore and employ more possibilities of
+self-attention. Taking efficiency into account, we design four kinds of
+self-attention, whose complexities are all linear. By elaborate adjustment of
+the tensor shapes and dimensions for the dot product, we split the typical
+self-attention of quadratic complexity into four operations of linear
+complexity. To adaptively merge these different kinds of self-attention, we
+take advantage of an architecture similar to Squeeze-and-Excitation Networks.
+Furthermore, we merge the two-stage Transformer design into a single stage via
+the proposed gated-dconv MLP. Image deblurring is our main target, while
+extensive quantitative and qualitative evaluations show that this method
+performs favorably against the state-of-the-art methods on far more tasks than
+deblurring alone. The source codes and trained models will be made available to
+the public.
+
+
+
+
+
+ + ☆ Gaussian Pancakes: Geometrically-Regularized 3D Gaussian Splatting for + Realistic Endoscopic Reconstruction + + +
+ Within colorectal cancer diagnostics, conventional colonoscopy techniques
+face critical limitations, including a limited field of view and a lack of
+depth information, which can impede the detection of precancerous lesions.
+Current methods struggle to provide comprehensive and accurate 3D
+reconstructions of the colonic surface, which could help minimize missed
+regions and the need for reinspection for pre-cancerous polyps. Addressing
+this, we introduce 'Gaussian Pancakes', a method that leverages 3D Gaussian
+Splatting (3D GS) combined with a Recurrent Neural Network-based Simultaneous
+Localization and Mapping (RNNSLAM) system. By introducing geometric and depth
+regularization into the 3D GS framework, our approach ensures more accurate
+alignment of Gaussians with the colon surface, resulting in smoother 3D
+reconstructions with novel viewing of detailed textures and structures.
+Evaluations across three diverse datasets show that Gaussian Pancakes enhances
+novel view synthesis quality, surpassing current leading methods with an 18%
+boost in PSNR and a 16% improvement in SSIM. It also delivers over 100X faster
+rendering and more than 10X shorter training times, making it a practical tool
+for real-time applications. Hence, this holds promise for achieving clinical
+translation for better detection and diagnosis of colorectal cancer.
+
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ Hierarchical Insights: Exploiting Structural Similarities for Reliable + 3D Semantic Segmentation IROS 2024 + + +
+ Safety-critical applications like autonomous driving call for robust 3D
+environment perception algorithms which can withstand highly diverse and
+ambiguous surroundings. The predictive performance of any classification model
+strongly depends on the underlying dataset and the prior knowledge conveyed by
+the annotated labels. While the labels provide a basis for the learning
+process, they usually fail to represent inherent relations between the classes
+- representations, which are a natural element of the human perception system.
+We propose a training strategy which enables a 3D LiDAR semantic segmentation
+model to learn structural relationships between the different classes through
+abstraction. We achieve this by implicitly modeling those relationships through
+a learning rule for hierarchical multi-label classification (HMC). With a
+detailed analysis, we show how this training strategy not only improves the
+model's confidence calibration, but also preserves additional information for
+downstream tasks like fusion, prediction and planning.
+
+
+ comment: submitted to IROS 2024 +
+
+
+
+
+ + ☆ DreamView: Injecting View-specific Text Guidance into Text-to-3D + Generation + + +
+ Text-to-3D generation, which synthesizes 3D assets according to an overall +text description, has significantly progressed. However, a challenge arises +when the specific appearances need customizing at designated viewpoints but +referring solely to the overall description for generating 3D objects. For +instance, ambiguity easily occurs when producing a T-shirt with distinct +patterns on its front and back using a single overall text guidance. In this +work, we propose DreamView, a text-to-image approach enabling multi-view +customization while maintaining overall consistency by adaptively injecting the +view-specific and overall text guidance through a collaborative text guidance +injection module, which can also be lifted to 3D generation via score +distillation sampling. DreamView is trained with large-scale rendered +multi-view images and their corresponding view-specific texts to learn to +balance the separate content manipulation in each view and the global +consistency of the overall object, resulting in a dual achievement of +customization and consistency. Consequently, DreamView empowers artists to +design 3D objects creatively, fostering the creation of more innovative and +diverse 3D assets. Code and model will be released at +https://github.com/iSEE-Laboratory/DreamView. + +
+
+
+
+
+ + ☆ Revising Densification in Gaussian Splatting + + +
+ In this paper, we address the limitations of Adaptive Density Control (ADC) +in 3D Gaussian Splatting (3DGS), a scene representation method achieving +high-quality, photorealistic results for novel view synthesis. ADC has been +introduced for automatic 3D point primitive management, controlling +densification and pruning, however, with certain limitations in the +densification logic. Our main contribution is a more principled, pixel-error +driven formulation for density control in 3DGS, leveraging an auxiliary, +per-pixel error function as the criterion for densification. We further +introduce a mechanism to control the total number of primitives generated per +scene and correct a bias in the current opacity handling strategy of ADC during +cloning operations. Our approach leads to consistent quality improvements +across a variety of benchmark scenes, without sacrificing the method's +efficiency. + +
+
+
+
+
+ + ☆ Hash3D: Training-free Acceleration for 3D Generation + + +
+ The evolution of 3D generative modeling has been notably propelled by the
+adoption of 2D diffusion models. Despite this progress, the cumbersome
+optimization process per se presents a critical hurdle to efficiency. In this
+paper, we introduce Hash3D, a universal acceleration for 3D generation without
+model training. Central to Hash3D is the insight that feature-map redundancy is
+prevalent in images rendered from camera positions and diffusion time-steps in
+close proximity. By effectively hashing and reusing these feature maps across
+neighboring timesteps and camera angles, Hash3D substantially prevents
+redundant calculations, thus accelerating the diffusion model's inference in 3D
+generation tasks. We achieve this through an adaptive grid-based hashing.
+Surprisingly, this feature-sharing mechanism not only speeds up the generation
+but also enhances the smoothness and view consistency of the synthesized 3D
+objects. Our experiments, covering 5 text-to-3D and 3 image-to-3D models,
+demonstrate Hash3D's versatility in speeding up optimization, enhancing
+efficiency by 1.3 to 4 times. Additionally, Hash3D's integration with 3D
+Gaussian splatting largely speeds up 3D model creation, reducing text-to-3D
+processing to about 10 minutes and image-to-3D conversion to roughly 30
+seconds. The project page is at https://adamdad.github.io/hash3D/.
+
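+ The grid-based hashing idea can be pictured as a small cache keyed by
+discretized camera angles and timesteps; the sketch below is an illustrative
+reading of that mechanism (bin sizes, key layout and the `compute_fn` callback
+are assumptions, not the released Hash3D code).
+
+import math
+
+class FeatureHash:
+    def __init__(self, angle_bin_deg=10.0, step_bin=25):
+        self.angle_bin = math.radians(angle_bin_deg)
+        self.step_bin = step_bin
+        self.cache = {}
+
+    def _key(self, azimuth, elevation, timestep):
+        # Nearby camera poses and timesteps collide into the same grid cell.
+        return (int(azimuth // self.angle_bin),
+                int(elevation // self.angle_bin),
+                int(timestep // self.step_bin))
+
+    def get_or_compute(self, azimuth, elevation, timestep, compute_fn):
+        key = self._key(azimuth, elevation, timestep)
+        if key not in self.cache:
+            self.cache[key] = compute_fn()   # cache miss: run the network block
+        return self.cache[key]               # cache hit: reuse the feature map
+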
+
+ comment: https://adamdad.github.io/hash3D/ +
+
+
+
+
+ + ☆ Using Few-Shot Learning to Classify Primary Lung Cancer and Other + Malignancy with Lung Metastasis in Cytological Imaging via Endobronchial + Ultrasound Procedures + + +
+ This study aims to establish a computer-aided diagnosis system for +endobronchial ultrasound (EBUS) surgery to assist physicians in the preliminary +diagnosis of metastatic cancer. This involves arranging immediate examinations +for other sites of metastatic cancer after EBUS surgery, eliminating the need +to wait for reports, thereby shortening the waiting time by more than half and +enabling patients to detect other cancers earlier, allowing for early planning +and implementation of treatment plans. Unlike previous studies on cell image +classification, which have abundant datasets for training, this study must also +be able to make effective classifications despite the limited amount of case +data for lung metastatic cancer. In the realm of small data set classification +methods, Few-shot learning (FSL) has become mainstream in recent years. Through +its ability to train on small datasets and its strong generalization +capabilities, FSL shows potential in this task of lung metastatic cell image +classification. This study will adopt the approach of Few-shot learning, +referencing existing proposed models, and designing a model architecture for +classifying lung metastases cell images. Batch Spectral Regularization (BSR) +will be incorporated as a loss update parameter, and the Finetune method of PMF +will be modified. In terms of test results, the addition of BSR and the +modified Finetune method further increases the accuracy by 8.89% to 65.60%, +outperforming other FSL methods. This study confirms that FSL is superior to +supervised and transfer learning in classifying metastatic cancer and +demonstrates that using BSR as a loss function and modifying Finetune can +enhance the model's capabilities. + +
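+ Batch Spectral Regularization, as usually formulated in the few-shot transfer
+literature, penalizes the singular values of the batch feature matrix; the
+short PyTorch sketch below shows that penalty added to a standard
+classification loss, with the weighting left as a tunable assumption since the
+abstract does not report it.
+
+import torch
+
+def batch_spectral_regularization(features):
+    # features: (N, D) batch of embeddings; penalize squared singular values.
+    sigma = torch.linalg.svdvals(features)
+    return (sigma ** 2).sum()
+
+# total_loss = cross_entropy_loss + bsr_weight * batch_spectral_regularization(feat)
+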
+
+
+
+
+ + ☆ LIPT: Latency-aware Image Processing Transformer + + +
+ Transformer is leading a trend in the field of image processing. Despite the
+great success that existing lightweight image processing transformers have
+achieved, they are tailored to FLOPs or parameter reduction, rather than
+practical inference acceleration. In this paper, we present a latency-aware
+image processing transformer, termed LIPT. We devise the low-latency proportion
+LIPT block that substitutes memory-intensive operators with the combination of
+self-attention and convolutions to achieve practical speedup. Specifically, we
+propose a novel non-volatile sparse masking self-attention (NVSM-SA) that
+utilizes a pre-computed sparse mask to capture contextual information from a
+larger window with no extra computation overhead. Besides, a high-frequency
+reparameterization module (HRM) is proposed to make the LIPT block
+reparameterization-friendly, which improves the model's detail reconstruction
+capability. Extensive experiments on multiple image processing tasks (e.g.,
+image super-resolution (SR), JPEG artifact reduction, and image denoising)
+demonstrate the superiority of LIPT on both latency and PSNR. LIPT achieves
+real-time GPU inference with state-of-the-art performance on multiple image SR
+benchmarks.
+
+
+
+
+
+ + ☆ Unified Entropy Optimization for Open-Set Test-Time Adaptation CVPR 2024 + + +
+ Test-time adaptation (TTA) aims at adapting a model pre-trained on the +labeled source domain to the unlabeled target domain. Existing methods usually +focus on improving TTA performance under covariate shifts, while neglecting +semantic shifts. In this paper, we delve into a realistic open-set TTA setting +where the target domain may contain samples from unknown classes. Many +state-of-the-art closed-set TTA methods perform poorly when applied to open-set +scenarios, which can be attributed to the inaccurate estimation of data +distribution and model confidence. To address these issues, we propose a simple +but effective framework called unified entropy optimization (UniEnt), which is +capable of simultaneously adapting to covariate-shifted in-distribution (csID) +data and detecting covariate-shifted out-of-distribution (csOOD) data. +Specifically, UniEnt first mines pseudo-csID and pseudo-csOOD samples from test +data, followed by entropy minimization on the pseudo-csID data and entropy +maximization on the pseudo-csOOD data. Furthermore, we introduce UniEnt+ to +alleviate the noise caused by hard data partition leveraging sample-level +confidence. Extensive experiments on CIFAR benchmarks and Tiny-ImageNet-C show +the superiority of our framework. The code is available at +https://github.com/gaozhengqing/UniEnt + +
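+ The core UniEnt objective, entropy minimization on pseudo in-distribution
+samples and entropy maximization on pseudo out-of-distribution samples, can be
+sketched as below; the score-threshold partition of the test batch is a
+simplification of the mining step described in the abstract.
+
+import torch
+import torch.nn.functional as F
+
+def unient_style_loss(logits, ood_scores, threshold):
+    # logits: (N, C) test-time predictions; ood_scores: (N,) higher = more OOD.
+    probs = F.softmax(logits, dim=1)
+    entropy = -(probs * torch.log(probs + 1e-8)).sum(dim=1)
+    is_id = ood_scores < threshold                         # pseudo-csID mask
+    loss_id = entropy[is_id].mean() if is_id.any() else logits.new_tensor(0.0)
+    loss_ood = -entropy[~is_id].mean() if (~is_id).any() else logits.new_tensor(0.0)
+    return loss_id + loss_ood   # minimize entropy on csID, maximize it on csOOD
+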
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Unified Multi-modal Diagnostic Framework with Reconstruction + Pre-training and Heterogeneity-combat Tuning + + +
+ Medical multi-modal pre-training has revealed promise in computer-aided +diagnosis by leveraging large-scale unlabeled datasets. However, existing +methods based on masked autoencoders mainly rely on data-level reconstruction +tasks, but lack high-level semantic information. Furthermore, two significant +heterogeneity challenges hinder the transfer of pre-trained knowledge to +downstream tasks, \textit{i.e.}, the distribution heterogeneity between +pre-training data and downstream data, and the modality heterogeneity within +downstream data. To address these challenges, we propose a Unified Medical +Multi-modal Diagnostic (UMD) framework with tailored pre-training and +downstream tuning strategies. Specifically, to enhance the representation +abilities of vision and language encoders, we propose the Multi-level +Reconstruction Pre-training (MR-Pretrain) strategy, including a feature-level +and data-level reconstruction, which guides models to capture the semantic +information from masked inputs of different modalities. Moreover, to tackle two +kinds of heterogeneities during the downstream tuning, we present the +heterogeneity-combat downstream tuning strategy, which consists of a +Task-oriented Distribution Calibration (TD-Calib) and a Gradient-guided +Modality Coordination (GM-Coord). In particular, TD-Calib fine-tunes the +pre-trained model regarding the distribution of downstream datasets, and +GM-Coord adjusts the gradient weights according to the dynamic optimization +status of different modalities. Extensive experiments on five public medical +datasets demonstrate the effectiveness of our UMD framework, which remarkably +outperforms existing approaches on three kinds of downstream tasks. + +
+
+ comment: to be published in IEEE JBHI; Code available at + https://github.com/helenypzhang/UMD +
+
+
+
+
+ + ☆ Incremental Joint Learning of Depth, Pose and Implicit Scene + Representation on Monocular Camera in Large-scale Scenes + + +
+ Dense scene reconstruction for photo-realistic view synthesis has various +applications, such as VR/AR, autonomous vehicles. However, most existing +methods have difficulties in large-scale scenes due to three core challenges: +\textit{(a) inaccurate depth input.} Accurate depth input is impossible to get +in real-world large-scale scenes. \textit{(b) inaccurate pose estimation.} Most +existing approaches rely on accurate pre-estimated camera poses. \textit{(c) +insufficient scene representation capability.} A single global radiance field +lacks the capacity to effectively scale to large-scale scenes. To this end, we +propose an incremental joint learning framework, which can achieve accurate +depth, pose estimation, and large-scale scene reconstruction. A vision +transformer-based network is adopted as the backbone to enhance performance in +scale information estimation. For pose estimation, a feature-metric bundle +adjustment (FBA) method is designed for accurate and robust camera tracking in +large-scale scenes. In terms of implicit scene representation, we propose an +incremental scene representation method to construct the entire large-scale +scene as multiple local radiance fields to enhance the scalability of 3D scene +representation. Extended experiments have been conducted to demonstrate the +effectiveness and accuracy of our method in depth estimation, pose estimation, +and large-scale scene reconstruction. + +
+
+
+
+
+ + ☆ Object Dynamics Modeling with Hierarchical Point Cloud-based + Representations CVPR 2024 + + +
+ Modeling object dynamics with a neural network is an important problem with +numerous applications. Most recent work has been based on graph neural +networks. However, physics happens in 3D space, where geometric information +potentially plays an important role in modeling physical phenomena. In this +work, we propose a novel U-net architecture based on continuous point +convolution which naturally embeds information from 3D coordinates and allows +for multi-scale feature representations with established downsampling and +upsampling procedures. Bottleneck layers in the downsampled point clouds lead +to better long-range interaction modeling. Besides, the flexibility of point +convolutions allows our approach to generalize to sparsely sampled points from +mesh vertices and dynamically generate features on important interaction points +on mesh faces. Experimental results demonstrate that our approach significantly +improves the state-of-the-art, especially in scenarios that require accurate +gravity or collision reasoning. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Space-Time Video Super-resolution with Neural Operator + + +
+ This paper addresses the task of space-time video super-resolution (ST-VSR). +Existing methods generally suffer from inaccurate motion estimation and motion +compensation (MEMC) problems for large motions. Inspired by recent progress in +physics-informed neural networks, we model the challenges of MEMC in ST-VSR as +a mapping between two continuous function spaces. Specifically, our approach +transforms independent low-resolution representations in the coarse-grained +continuous function space into refined representations with enriched +spatiotemporal details in the fine-grained continuous function space. To +achieve efficient and accurate MEMC, we design a Galerkin-type attention +function to perform frame alignment and temporal interpolation. Due to the +linear complexity of the Galerkin-type attention mechanism, our model avoids +patch partitioning and offers global receptive fields, enabling precise +estimation of large motions. The experimental results show that the proposed +method surpasses state-of-the-art techniques in both fixed-size and continuous +space-time video super-resolution tasks. + +
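+ For readers unfamiliar with Galerkin-type attention, the defining trick is
+that it is softmax-free: keys and values are normalized and contracted first,
+so the cost is linear in the number of tokens. The PyTorch module below is a
+minimal sketch of that pattern (single head, no extra projections), not the
+exact block used in this paper.
+
+import torch
+import torch.nn as nn
+
+class GalerkinAttention(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.q = nn.Linear(dim, dim)
+        self.k = nn.Linear(dim, dim)
+        self.v = nn.Linear(dim, dim)
+        self.norm_k = nn.LayerNorm(dim)
+        self.norm_v = nn.LayerNorm(dim)
+
+    def forward(self, x):                    # x: (B, N, D)
+        q = self.q(x)
+        k = self.norm_k(self.k(x))
+        v = self.norm_v(self.v(x))
+        context = k.transpose(1, 2) @ v      # (B, D, D): cost is linear in N
+        return q @ context / x.shape[1]      # (B, N, D)
+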
+
+
+
+
+ + ☆ Little Strokes Fell Great Oaks: Boosting the Hierarchical Features for + Multi-exposure Image Fusion + + +
+ In recent years, deep learning networks have made remarkable strides in the
+domain of multi-exposure image fusion. Nonetheless, prevailing approaches often
+involve directly feeding over-exposed and under-exposed images into the
+network, which leads to the under-utilization of inherent information present
+in the source images. Additionally, unsupervised techniques predominantly
+employ rudimentary weighted summation for color channel processing, culminating
+in an overall desaturated final image tone. To partially mitigate these issues,
+this study proposes a gamma correction module specifically designed to fully
+leverage latent information embedded within source images. Furthermore, a
+modified transformer block equipped with self-attention mechanisms is
+introduced to optimize the fusion process. Ultimately, a novel color
+enhancement algorithm is presented to augment image saturation while preserving
+intricate details. The source code is available at
+https://github.com/ZhiyingDu/BHFMEF.
+
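+ A gamma correction module of the kind mentioned above can be as small as a
+single learnable exponent per exposure; the module below is a minimal,
+assumption-laden sketch of that idea (one global gamma, inputs in [0, 1]), not
+the paper's actual design.
+
+import torch
+import torch.nn as nn
+
+class LearnableGamma(nn.Module):
+    def __init__(self, init_gamma=1.0):
+        super().__init__()
+        # Parameterize log(gamma) so that gamma stays positive during training.
+        self.log_gamma = nn.Parameter(torch.tensor(float(init_gamma)).log())
+
+    def forward(self, x):
+        # x: image tensor with values in [0, 1]; brightens or darkens the
+        # exposure so latent detail is pulled into a usable range before fusion.
+        gamma = self.log_gamma.exp()
+        return x.clamp(min=1e-6) ** gamma
+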
+
+
+
+
+ + ☆ Improving Facial Landmark Detection Accuracy and Efficiency with + Knowledge Distillation ICME 2024 + + +
+ The domain of computer vision has experienced significant advancements in +facial-landmark detection, becoming increasingly essential across various +applications such as augmented reality, facial recognition, and emotion +analysis. Unlike object detection or semantic segmentation, which focus on +identifying objects and outlining boundaries, facial-landmark detection aims to +precisely locate and track critical facial features. However, deploying deep +learning-based facial-landmark detection models on embedded systems with +limited computational resources poses challenges due to the complexity of +facial features, especially in dynamic settings. Additionally, ensuring +robustness across diverse ethnicities and expressions presents further +obstacles. Existing datasets often lack comprehensive representation of facial +nuances, particularly within populations like those in Taiwan. This paper +introduces a novel approach to address these challenges through the development +of a knowledge distillation method. By transferring knowledge from larger +models to smaller ones, we aim to create lightweight yet powerful deep learning +models tailored specifically for facial-landmark detection tasks. Our goal is +to design models capable of accurately locating facial landmarks under varying +conditions, including diverse expressions, orientations, and lighting +environments. The ultimate objective is to achieve high accuracy and real-time +performance suitable for deployment on embedded systems. This method was +successfully implemented and achieved a top 6th place finish out of 165 +participants in the IEEE ICME 2024 PAIR competition. + +
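+
+ The distillation objective for a landmark regressor is usually a blend of ground-truth supervision and imitation of the larger teacher's predictions. A hedged sketch (the weighting and the choice of L1/L2 terms are illustrative assumptions, not the paper's exact loss):
+
+import torch
+import torch.nn.functional as F
+
+def landmark_distillation_loss(student_pts, teacher_pts, gt_pts, alpha=0.5):
+    """student_pts, teacher_pts, gt_pts: (batch, n_landmarks, 2) coordinates."""
+    gt_loss = F.l1_loss(student_pts, gt_pts)                 # fit the annotated landmarks
+    kd_loss = F.mse_loss(student_pts, teacher_pts.detach())  # imitate the frozen teacher
+    return alpha * gt_loss + (1.0 - alpha) * kd_loss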
+
+ comment: technical report. 6th/165 in IEEE ICME 2024 PAIR competition +
+
+
+
+
+ + ☆ Greedy-DiM: Greedy Algorithms for Unreasonably Effective Face Morphs + + +
+ Morphing attacks, which aim to create a single image that contains the biometric +information of multiple identities, are an emerging threat to state-of-the-art Face Recognition +(FR) systems. Diffusion Morphs (DiM) are a recently +proposed morphing attack that has achieved state-of-the-art performance for +representation-based morphing attacks. However, none of the existing research +on DiMs has leveraged the iterative nature of DiMs, instead leaving the DiM model as a +black box and treating it no differently than one would a Generative Adversarial +Network (GAN) or Variational AutoEncoder (VAE). We propose a greedy strategy on +the iterative sampling process of DiM models which searches for an optimal step +guided by an identity-based heuristic function. We compare our proposed +algorithm against ten other state-of-the-art morphing algorithms using the +open-source SYN-MAD 2022 competition dataset. We find that our proposed +algorithm is unreasonably effective, fooling all of the tested FR systems with +an MMPMR of 100%, outperforming all other compared morphing algorithms. + +
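+
+ The greedy search described above can be pictured as scoring a handful of candidate updates at each sampling step with an identity heuristic and keeping the best one. A schematic sketch with hypothetical denoise_step and identity_score callables (the candidate-generation scheme is an assumption, not the paper's algorithm):
+
+import torch
+
+def greedy_identity_step(x_t, t, denoise_step, identity_score, n_candidates=8, jitter=0.1):
+    """denoise_step(x_t, t) -> proposed x_{t-1}; identity_score(x) -> scalar tensor,
+    e.g. summed FR-embedding similarity to both contributing identities."""
+    base = denoise_step(x_t, t)
+    candidates = [base] + [base + jitter * torch.randn_like(base)
+                           for _ in range(n_candidates - 1)]
+    scores = torch.stack([identity_score(c) for c in candidates])
+    return candidates[int(scores.argmax())]   # keep the step that best preserves both identities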
+
+ comment: Initial preprint. Under review +
+
+
+
+
+ + ☆ Band-Attention Modulated RetNet for Face Forgery Detection + + +
+ The transformer networks are extensively utilized in face forgery detection +due to their scalability across large datasets. Despite their success, +transformers face challenges in balancing the capture of global context, which +is crucial for unveiling forgery clues, with computational complexity. To +mitigate this issue, we introduce Band-Attention modulated RetNet (BAR-Net), a +lightweight network designed to efficiently process extensive visual contexts +while avoiding catastrophic forgetting. Our approach empowers the target token +to perceive global information by assigning differential attention levels to +tokens at varying distances. We implement self-attention along both spatial +axes, thereby maintaining spatial priors and easing the computational +burden. Moreover, we present the adaptive frequency Band-Attention Modulation +mechanism, which treats the entire Discrete Cosine Transform spectrogram as a +series of frequency bands with learnable weights. Together, BAR-Net achieves +favorable performance on several face forgery datasets, outperforming current +state-of-the-art methods. + +
+
+
+
+
+ + ☆ Diffusion-Based Point Cloud Super-Resolution for mmWave Radar Data + + +
+ The millimeter-wave radar sensor maintains stable performance under adverse +environmental conditions, making it a promising solution for all-weather +perception tasks, such as outdoor mobile robotics. However, the radar point +clouds are relatively sparse and contain massive ghost points, which greatly +limits the development of mmWave radar technology. In this paper, we propose a +novel point cloud super-resolution approach for 3D mmWave radar data, named +Radar-diffusion. Our approach employs the diffusion model defined by +mean-reverting stochastic differential equations (SDE). Using our proposed new +objective function with supervision from corresponding LiDAR point clouds, our +approach efficiently handles radar ghost points and enhances the sparse mmWave +radar point clouds to dense LiDAR-like point clouds. We evaluate our approach +on two different datasets, and the experimental results show that our method +outperforms the state-of-the-art baseline methods in 3D radar super-resolution +tasks. Furthermore, we demonstrate that our enhanced radar point cloud is +capable of downstream radar point-based registration tasks. + +
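+
+ A mean-reverting SDE of the kind mentioned above has the form dx = theta * (mu - x) dt + sigma dW, so the forward process drifts a clean state toward a degraded mean state instead of pure noise. A simple Euler-Maruyama simulation of that forward process (coefficients are illustrative, not the paper's schedule):
+
+import torch
+
+def mean_reverting_forward(x0, mu, theta=1.0, sigma=0.5, n_steps=100):
+    """x0: clean (dense) target tensor; mu: degraded mean state the process reverts to."""
+    dt = 1.0 / n_steps
+    x = x0.clone()
+    for _ in range(n_steps):
+        drift = theta * (mu - x) * dt                          # pull toward the mean state
+        diffusion = sigma * (dt ** 0.5) * torch.randn_like(x)
+        x = x + drift + diffusion
+    return x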
+
+
+
+
+ + ☆ Concept-Attention Whitening for Interpretable Skin Lesion Diagnosis + + +
+ The black-box nature of deep learning models has raised concerns about their +interpretability for successful deployment in real-world clinical applications. +To address the concerns, eXplainable Artificial Intelligence (XAI) aims to +provide clear and understandable explanations of the decision-making process. +In the medical domain, concepts such as attributes of lesions or abnormalities +serve as key evidence for deriving diagnostic results. However, existing +concept-based models mainly depend on concepts that appear independently and +require fine-grained concept annotations such as bounding boxes. A medical +image usually contains multiple concepts and the fine-grained concept +annotations are difficult to acquire. In this paper, we propose a novel +Concept-Attention Whitening (CAW) framework for interpretable skin lesion +diagnosis. CAW is comprised of a disease diagnosis branch and a concept +alignment branch. In the former branch, we train the CNN with a CAW layer +inserted to perform skin lesion diagnosis. The CAW layer decorrelates features +and aligns image features to conceptual meanings via an orthogonal matrix. In +the latter branch, we calculate the orthogonal matrix under the guidance of the +concept attention mask. We particularly introduce a weakly-supervised concept +mask generator that only leverages coarse concept labels for filtering local +regions that are relevant to certain concepts, improving the optimization of +the orthogonal matrix. Extensive experiments on two public skin lesion +diagnosis datasets demonstrated that CAW not only enhanced interpretability but +also maintained a state-of-the-art diagnostic performance. + +
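+
+ A whitening-plus-rotation layer of this kind can be approximated by decorrelating pooled features with ZCA whitening and then applying a learnable orthogonal matrix whose axes are meant to line up with concepts. The sketch below omits the concept-attention-guided estimation of that matrix and simply leaves it as a learnable orthogonal parameter, so it illustrates the layer's structure rather than the CAW method itself:
+
+import torch
+import torch.nn as nn
+
+class WhitenThenRotate(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-5):
+        super().__init__()
+        self.eps = eps
+        # Orthogonality is enforced by PyTorch's parametrization utility.
+        self.rotation = nn.utils.parametrizations.orthogonal(nn.Linear(dim, dim, bias=False))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # x: (batch, dim) pooled features.
+        x_c = x - x.mean(dim=0, keepdim=True)
+        cov = x_c.T @ x_c / max(x.shape[0] - 1, 1)
+        cov = cov + self.eps * torch.eye(x.shape[1], device=x.device)
+        evals, evecs = torch.linalg.eigh(cov)
+        whiten = evecs @ torch.diag(evals.clamp(min=self.eps).rsqrt()) @ evecs.T
+        return self.rotation(x_c @ whiten)   # decorrelated features, one axis per concept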
+
+
+
+
+ + ☆ A Lightweight Measure of Classification Difficulty from Application + Dataset Characteristics + + +
+ Despite accuracy and computation benchmarks being widely available to help +choose among neural network models, these are usually trained on datasets with +many classes, and do not give a precise idea of performance for applications of +few (< 10) classes. The conventional procedure to predict performance is to +train and test repeatedly on the different models and dataset variations of +interest. However, this is computationally expensive. We propose an efficient +classification difficulty measure that is calculated from the number of classes +and intra- and inter-class similarity metrics of the dataset. After a single +stage of training and testing per model family, relative performance for +different datasets and models of the same family can be predicted by comparing +difficulty measures - without further training and testing. We show how this +measure can help a practitioner select a computationally efficient model for a +small dataset 6 to 29x faster than through repeated training and testing. We +give an example of use of the measure for an industrial application in which +options are identified to select a model 42% smaller than the baseline +YOLOv5-nano model, and if class merging from 3 to 2 classes meets requirements, +85% smaller. + +
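+
+ A difficulty measure of this kind can be assembled from the class count plus intra-class and inter-class similarity statistics of a dataset's embeddings. The combination rule below is an illustrative assumption for how such a score might look, not the paper's formula:
+
+import numpy as np
+
+def classification_difficulty(embeddings, labels):
+    """embeddings: (n_samples, d); labels: (n_samples,) integer class ids."""
+    X = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
+    classes = np.unique(labels)
+    centroids = np.stack([X[labels == c].mean(axis=0) for c in classes])
+    centroids /= np.linalg.norm(centroids, axis=1, keepdims=True)
+
+    # Intra-class: how tightly samples cluster around their own class centroid.
+    intra = float(np.mean(np.concatenate(
+        [X[labels == c] @ centroids[i] for i, c in enumerate(classes)])))
+    # Inter-class: how similar the class centroids are to each other.
+    sims = centroids @ centroids.T
+    inter = float(sims[~np.eye(len(classes), dtype=bool)].mean())
+    # Harder when classes overlap (low intra, high inter) and when there are more classes.
+    return (1.0 - intra + inter) * np.log(len(classes) + 1)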
+
+ comment: 13 pages, 3 figures +
+
+
+
+
+ + ☆ Tackling Structural Hallucination in Image Translation with Local + Diffusion + + +
+ Recent developments in diffusion models have advanced conditioned image +generation, yet they struggle with reconstructing out-of-distribution (OOD) +images, such as unseen tumors in medical images, causing ``image +hallucination'' and risking misdiagnosis. We hypothesize such hallucinations +result from local OOD regions in the conditional images. We verify that +partitioning the OOD region and conducting separate image generations +alleviates hallucinations in several applications. From this, we propose a +training-free diffusion framework that reduces hallucination with multiple +Local Diffusion processes. Our approach involves OOD estimation followed by two +modules: a ``branching'' module generates locally both within and outside OOD +regions, and a ``fusion'' module integrates these predictions into one. Our +evaluation shows our method mitigates hallucination over baseline models +quantitatively and qualitatively, reducing misdiagnosis by 40% and 25% in the +real-world medical and natural image datasets, respectively. It also +demonstrates compatibility with various pre-trained diffusion models. + +
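+
+ The branching/fusion idea reduces to running the generator separately inside and outside the estimated OOD mask and compositing the two results. A schematic wrapper with a hypothetical generate(cond, mask) callable (the interface is an assumption, not the paper's API):
+
+def local_diffusion_fuse(generate, cond_image, ood_mask):
+    """ood_mask: binary tensor, 1 inside the estimated out-of-distribution region."""
+    in_branch = generate(cond_image, ood_mask)          # local generation inside the OOD region
+    out_branch = generate(cond_image, 1.0 - ood_mask)   # generation for the in-distribution rest
+    return ood_mask * in_branch + (1.0 - ood_mask) * out_branch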
+
+
+
+
+ + ☆ StoryImager: A Unified and Efficient Framework for Coherent Story + Visualization and Completion + + +
+ Story visualization aims to generate a series of realistic and coherent +images based on a storyline. Current models adopt a frame-by-frame architecture +by transforming the pre-trained text-to-image model into an auto-regressive +manner. Although these models have shown notable progress, there are still +three flaws. 1) The unidirectional generation of auto-regressive manner +restricts the usability in many scenarios. 2) The additional introduced story +history encoders bring an extremely high computational cost. 3) The story +visualization and continuation models are trained and inferred independently, +which is not user-friendly. To these ends, we propose a bidirectional, unified, +and efficient framework, namely StoryImager. The StoryImager enhances the +storyboard generative ability inherited from the pre-trained text-to-image +model for a bidirectional generation. Specifically, we introduce a Target Frame +Masking Strategy to extend and unify different story image generation tasks. +Furthermore, we propose a Frame-Story Cross Attention Module that decomposes +the cross attention for local fidelity and global coherence. Moreover, we +design a Contextual Feature Extractor to extract contextual information from +the whole storyline. The extensive experimental results demonstrate the +excellent performance of our StoryImager. The code is available at +https://github.com/tobran/StoryImager. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ JSTR: Judgment Improves Scene Text Recognition + + +
+ In this paper, we present a method for enhancing the accuracy of scene text +recognition tasks by judging whether the image and text match each other. While +previous studies focused on generating the recognition results from input +images, our approach also considers the model's misrecognition results to +understand its error tendencies, thus improving the text recognition pipeline. +This method boosts text recognition accuracy by providing explicit feedback on +the data that the model is likely to misrecognize, predicting whether the image +and text match. The experimental results on publicly +available datasets demonstrate that our proposed method outperforms the +baseline and state-of-the-art methods in scene text recognition. + +
+
+ comment: IntelliSys 2024 +
+
+
+
+
+ + ☆ EasyTrack: Efficient and Compact One-stream 3D Point Clouds Tracker + + +
+ Most 3D single object trackers (SOT) in point clouds follow the two-stream +multi-stage 3D Siamese or motion tracking paradigms, which process the template +and search area point clouds with two parallel branches, built on supervised +point cloud backbones. In this work, beyond typical 3D Siamese or motion +tracking, we propose a neat and compact one-stream transformer 3D SOT paradigm +from a novel perspective, termed \textbf{EasyTrack}, which consists of +three special designs: 1) A 3D point clouds tracking feature pre-training +module is developed to exploit masked autoencoding for learning 3D point +clouds tracking representations. 2) A unified 3D tracking feature learning and +fusion network is proposed to simultaneously learn target-aware 3D features +and extensively capture mutual correlation through the flexible self-attention +mechanism. 3) A target location network in the dense bird's eye view (BEV) +feature space is constructed for target classification and regression. +Moreover, we develop an enhanced version named EasyTrack++, which designs the +center points interaction (CPI) strategy to reduce the ambiguous targets caused +by noisy point cloud background information. The proposed EasyTrack and +EasyTrack++ set a new state-of-the-art performance ($\textbf{18\%}$, +$\textbf{40\%}$ and $\textbf{3\%}$ success gains) on KITTI, NuScenes, and Waymo +while running at \textbf{52.6fps} with few parameters (\textbf{1.3M}). The code +will be available at https://github.com/KnightApple427/Easytrack. + +
+
+
+
+
+ + ☆ Prompt-driven Universal Model for View-Agnostic Echocardiography + Analysis + + +
+ Echocardiography segmentation for cardiac analysis is time-consuming and +resource-intensive due to the variability in image quality and the necessity to +process scans from various standard views. While current automated segmentation +methods in echocardiography show promising performance, they are trained on +specific scan views to analyze corresponding data. However, this solution has a +limitation as the number of required models increases with the number of +standard views. To address this, in this paper, we present a prompt-driven +universal method for view-agnostic echocardiography analysis. Considering the +domain shift between standard views, we first introduce a method called prompt +matching, aimed at learning prompts specific to different views by matching +prompts and querying input embeddings using a pre-trained vision model. Then, +we utilize a pre-trained medical language model to align textual information +with pixel data for accurate segmentation. Extensive experiments on three +standard views show that our approach significantly outperforms the +state-of-the-art universal methods and achieves comparable or even better +performance than segmentation models trained and tested on the same views. + +
+
+
+
+
+ + ☆ LATUP-Net: A Lightweight 3D Attention U-Net with Parallel Convolutions + for Brain Tumor Segmentation + + +
+ Early-stage 3D brain tumor segmentation from magnetic resonance imaging (MRI) +scans is crucial for prompt and effective treatment. However, this process +faces the challenge of precise delineation due to the tumors' complex +heterogeneity. Moreover, energy sustainability targets and resource +limitations, especially in developing countries, require efficient and +accessible medical imaging solutions. The proposed architecture, a Lightweight +3D ATtention U-Net with Parallel convolutions, LATUP-Net, addresses these +issues. It is specifically designed to reduce computational requirements +significantly while maintaining high segmentation performance. By incorporating +parallel convolutions, it enhances feature representation by capturing +multi-scale information. It further integrates an attention mechanism to refine +segmentation through selective feature recalibration. LATUP-Net achieves +promising segmentation performance: the average Dice scores for the whole +tumor, tumor core, and enhancing tumor on the BraTS2020 dataset are 88.41%, +83.82%, and 73.67%, and on the BraTS2021 dataset, they are 90.29%, 89.54%, and +83.92%, respectively. Hausdorff distance metrics further indicate its improved +ability to delineate tumor boundaries. With its significantly reduced +computational demand using only 3.07 M parameters, about 59 times fewer than +other state-of-the-art models, and running on a single V100 GPU, LATUP-Net +stands out as a promising solution for real-world clinical applications, +particularly in settings with limited resources. Investigations into the +model's interpretability, utilizing gradient-weighted class activation mapping +and confusion matrices, reveal that while attention mechanisms enhance the +segmentation of small regions, their impact is nuanced. Achieving the most +accurate tumor delineation requires carefully balancing local and global +features. + +
+
+
+
+
+ + ☆ Res-U2Net: Untrained Deep Learning for Phase Retrieval and Image + Reconstruction + + +
+ Conventional deep learning-based image reconstruction methods require a large +amount of training data which can be hard to obtain in practice. Untrained deep +learning methods overcome this limitation by training a network to invert a +physical model of the image formation process. Here we present a novel +untrained Res-U2Net model for phase retrieval. We use the extracted phase +information to determine changes in an object's surface and generate a mesh +representation of its 3D structure. We compare the performance of Res-U2Net +phase retrieval against UNet and U2Net using images from the GDXRAY dataset. + +
+
+ comment: 16 pages, 8 figures, 4 Tables +
+
+
+
+
+ + ☆ FlameFinder: Illuminating Obscured Fire through Smoke with Attentive + Deep Metric Learning + + +
+ FlameFinder is a deep metric learning (DML) framework designed to accurately +detect flames, even when obscured by smoke, using thermal images from +firefighter drones during wildfire monitoring. Traditional RGB cameras struggle +in such conditions, but thermal cameras can capture smoke-obscured flame +features. However, they lack absolute thermal reference points, leading to +false positives. To address this issue, FlameFinder utilizes paired thermal-RGB +images for training. By learning latent flame features from smoke-free samples, +the model becomes less biased towards relative thermal gradients. In testing, +it identifies flames in smoky patches by analyzing their equivalent +thermal-domain distribution. This method improves performance using both +supervised and distance-based clustering metrics. The framework incorporates a +flame segmentation method and a DML-aided detection framework. This includes +utilizing center loss (CL), triplet center loss (TCL), and triplet cosine +center loss (TCCL) to identify optimal cluster representatives for +classification. However, the dominance of center loss over the other losses +leads to the model missing features sensitive to them. To address this +limitation, an attention mechanism is proposed. This mechanism allows for +non-uniform feature contribution, amplifying the critical role of cosine and +triplet loss in the DML framework. Additionally, it improves interpretability, +class discrimination, and decreases intra-class variance. As a result, the +proposed model surpasses the baseline by 4.4% in the FLAME2 dataset and 7% in +the FLAME3 dataset for unobscured flame detection accuracy. Moreover, it +demonstrates enhanced class separation in obscured scenarios compared to VGG19, +ResNet18, and three backbone models tailored for flame detection. + +
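+
+ The center-based losses named above have compact standard forms; a hedged PyTorch sketch of generic center loss and triplet center loss (margin and distance choices are illustrative, and the cosine variant is omitted):
+
+import torch
+import torch.nn.functional as F
+
+def center_loss(features, labels, centers):
+    """Pull each embedding toward its class center. centers: (n_classes, d), learnable."""
+    return F.mse_loss(features, centers[labels])
+
+def triplet_center_loss(features, labels, centers, margin=1.0):
+    """Keep each embedding closer to its own center than to the nearest other center."""
+    d_all = torch.cdist(features, centers)                        # (batch, n_classes)
+    d_own = d_all.gather(1, labels.view(-1, 1)).squeeze(1)
+    d_other = d_all.scatter(1, labels.view(-1, 1), float('inf')).min(dim=1).values
+    return F.relu(d_own - d_other + margin).mean()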
+
+ comment: Submitted as a Journal Paper to IEEE Transactions on Geoscience and + Remote Sensing +
+
+
+
+
+ + ☆ SAM-I-Am: Semantic Boosting for Zero-shot Atomic-Scale Electron + Micrograph Segmentation + + +
+ Image segmentation is a critical enabler for tasks ranging from medical +diagnostics to autonomous driving. However, the correct segmentation semantics +- where are boundaries located? what segments are logically similar? - change +depending on the domain, such that state-of-the-art foundation models can +generate meaningless and incorrect results. Moreover, in certain domains, +fine-tuning and retraining techniques are infeasible: obtaining labels is +costly and time-consuming; domain images (micrographs) can be exponentially +diverse; and data sharing (for third-party retraining) is restricted. To enable +rapid adaptation of the best segmentation technology, we propose the concept of +semantic boosting: given a zero-shot foundation model, guide its segmentation +and adjust results to match domain expectations. We apply semantic boosting to +the Segment Anything Model (SAM) to obtain microstructure segmentation for +transmission electron microscopy. Our booster, SAM-I-Am, extracts geometric and +textural features of various intermediate masks to perform mask removal and +mask merging operations. We demonstrate a zero-shot performance increase of +(absolute) +21.35%, +12.6%, +5.27% in mean IoU, and a -9.91%, -18.42%, -4.06% +drop in mean false positive masks across images of three difficulty classes +over vanilla SAM (ViT-L). + +
+
+
+
+
+ + ☆ GeoSynth: Contextually-Aware High-Resolution Satellite Image Synthesis + + +
+ We present GeoSynth, a model for synthesizing satellite images with global +style and image-driven layout control. The global style control is via textual +prompts or geographic location. These enable the specification of scene +semantics or regional appearance respectively, and can be used together. We +train our model on a large dataset of paired satellite imagery, with +automatically generated captions, and OpenStreetMap data. We evaluate various +combinations of control inputs, including different types of layout controls. +Results demonstrate that our model can generate diverse, high-quality images +and exhibits excellent zero-shot generalization. The code and model checkpoints +are available at https://github.com/mvrl/GeoSynth. + +
+
+
+
+
+ + ☆ Calibrating Higher-Order Statistics for Few-Shot Class-Incremental + Learning with Pre-trained Vision Transformers CVPR 2024 + + +
+ Few-shot class-incremental learning (FSCIL) aims to adapt the model to new +classes from very few data (5 samples) without forgetting the previously +learned classes. Recent works in many-shot CIL (MSCIL) (using all available +training data) exploited pre-trained models to reduce forgetting and achieve +better plasticity. In a similar fashion, we use ViT models pre-trained on +large-scale datasets for few-shot settings, which face the critical issue of +low plasticity. FSCIL methods start with a many-shot first task to learn a very +good feature extractor and then move to the few-shot setting from the second +task onwards. While the focus of most recent studies is on how to learn the +many-shot first task so that the model generalizes to all future few-shot +tasks, we explore in this work how to better model the few-shot data using +pre-trained models, irrespective of how the first task is trained. Inspired by +recent works in MSCIL, we explore how using higher-order feature statistics can +influence the classification of few-shot classes. We identify the main +challenge of obtaining a good covariance matrix from few-shot data and propose +to calibrate the covariance matrix for new classes based on semantic similarity +to the many-shot base classes. Using the calibrated feature statistics in +combination with existing methods significantly improves few-shot continual +classification on several FSCIL benchmarks. Code is available at +https://github.com/dipamgoswami/FSCIL-Calibration. + +
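+
+ Covariance calibration of this flavor usually blends the unreliable few-shot covariance with covariances borrowed from the most semantically similar base classes. A sketch under assumed shapes and an assumed top-k weighting, not the paper's exact calibration rule:
+
+import torch
+
+def calibrate_covariance(few_feats, base_means, base_covs, shrink=0.5, top_k=3):
+    """few_feats: (n_shot, d); base_means: (n_base, d); base_covs: (n_base, d, d)."""
+    mean = few_feats.mean(dim=0)
+    centered = few_feats - mean
+    naive_cov = centered.T @ centered / max(few_feats.shape[0] - 1, 1)
+
+    sims = torch.cosine_similarity(mean.unsqueeze(0), base_means, dim=1)
+    weights, idx = sims.topk(top_k)                      # most similar base classes
+    weights = torch.softmax(weights, dim=0)
+    borrowed = (weights.view(-1, 1, 1) * base_covs[idx]).sum(dim=0)
+    return shrink * naive_cov + (1.0 - shrink) * borrowed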
+
+ comment: Accepted at CLVision workshop (CVPR 2024) +
+
+
+
+
+ + ☆ RoadBEV: Road Surface Reconstruction in Bird's Eye View + + +
+ Road surface conditions, especially geometry profiles, enormously affect the +driving performance of autonomous vehicles. Vision-based online road +reconstruction promisingly captures road information in advance. Existing +solutions like monocular depth estimation and stereo matching suffer from +modest performance. The recent technique of Bird's-Eye-View (BEV) perception +provides immense potential for more reliable and accurate reconstruction. This +paper proposes two simple yet effective models for road elevation +reconstruction in BEV named RoadBEV-mono and RoadBEV-stereo, which estimate +road elevation with monocular and stereo images, respectively. The former +directly fits elevation values based on voxel features queried from image view, +while the latter efficiently recognizes road elevation patterns based on BEV +volume representing discrepancy between left and right voxel features. +Insightful analyses reveal their consistency and differences with the perspective +view. Experiments on a real-world dataset verify the models' effectiveness and +superiority. Elevation errors of RoadBEV-mono and RoadBEV-stereo achieve 1.83cm +and 0.56cm, respectively. The estimation performance improves by 50\% in BEV +based on monocular images. Our models are promising for practical applications, +providing valuable references for vision-based BEV perception in autonomous +driving. The code is released at https://github.com/ztsrxh/RoadBEV. + +
+
+ comment: Dataset page: https://thu-rsxd.com/rsrd Code: + https://github.com/ztsrxh/RoadBEV +
+
+
+
+
+ + ☆ Spatially Optimized Compact Deep Metric Learning Model for Similarity + Search + + +
+ Spatial optimization is often overlooked in many computer vision tasks. +Filters should be able to recognize the features of an object regardless of +where it is in the image. Similarity search is a crucial task where spatial +features decide an important output. The capacity of convolution to capture +visual patterns across various locations is limited. In contrast to +convolution, the involution kernel is dynamically created at each pixel based +on the pixel value and parameters that have been learned. This study +demonstrates that utilizing a single involution layer as a feature extractor +alongside a compact convolution model significantly enhances the performance of +similarity search. Additionally, we improve predictions by using the GELU +activation function rather than the ReLU. The negligible number of weight +parameters that involution adds to a compact model, together with the improved +performance, makes the model very useful in real-world implementations. Our proposed model is below 1 +megabyte in size. We have experimented with our proposed methodology and other +models on CIFAR-10, FashionMNIST, and MNIST datasets. Our proposed method +outperforms the other models across all three datasets. + +
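+
+ For reference, an involution layer generates a K x K kernel at every pixel from that pixel's own features and shares it across the channels of each group, the inverse of convolution's sharing pattern. A minimal PyTorch version of that generic operator (a simplification, not this paper's exact extractor):
+
+import torch
+import torch.nn as nn
+
+class Involution2d(nn.Module):
+    def __init__(self, channels, kernel_size=3, groups=1, reduction=4):
+        super().__init__()
+        self.k, self.groups = kernel_size, groups
+        self.kernel_gen = nn.Sequential(                      # per-pixel kernel generator
+            nn.Conv2d(channels, channels // reduction, 1),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(channels // reduction, kernel_size * kernel_size * groups, 1),
+        )
+        self.unfold = nn.Unfold(kernel_size, padding=kernel_size // 2)
+
+    def forward(self, x):
+        b, c, h, w = x.shape
+        kernels = self.kernel_gen(x).view(b, self.groups, 1, self.k * self.k, h, w)
+        patches = self.unfold(x).view(b, self.groups, c // self.groups, self.k * self.k, h, w)
+        return (kernels * patches).sum(dim=3).view(b, c, h, w)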
+
+ comment: 5 pages, 3 figures, +
+
+
+
+
+ + ☆ Leveraging Latents for Efficient Thermography Classification and + Segmentation + + +
+ Breast cancer is a prominent health concern worldwide, currently being the +second-most common and second-deadliest type of cancer in women. While current +breast cancer diagnosis mainly relies on mammography imaging, in recent years +the use of thermography for breast cancer imaging has been garnering growing +popularity. Thermographic imaging relies on infrared cameras to capture +body-emitted heat distributions. While these heat signatures have proven useful +for computer-vision systems for accurate breast cancer segmentation and +classification, prior work often relies on handcrafted feature engineering or +complex architectures, potentially limiting the comparability and applicability +of these methods. In this work, we present a novel algorithm for both breast +cancer classification and segmentation. Rather than focusing efforts on manual +feature and architecture engineering, our algorithm focuses on leveraging an +informative, learned feature space, thus making our solution simpler to use and +extend to other frameworks and downstream tasks, as well as more applicable to +data-scarce settings. Our classification produces SOTA results, while ours is +the first work to produce the segmentation regions studied in this paper. + +
+
+
+
+
+ + ☆ The Impact of Print-and-Scan in Heterogeneous Morph Evaluation Scenarios + + +
+ Face morphing attacks present an emerging threat to face recognition +systems. On top of that, printing and scanning the morphed images could obscure +the artifacts generated during the morphing process, which makes morphed image +detection even harder. In this work, we investigate the impact that printing +and scanning has on morphing attacks through a series of heterogeneous tests. +Our experiments show that we can increase the possibility of a false match by +up to 5.64% for DiM and 16.00% for StyleGAN2 when providing an image that has +been printed and scanned, regardless of whether it is morphed or bona fide, to a Face +Recognition (FR) system. Likewise, using the Frechet Inception Distance (FID) +metric, strictly print-scanned morph attacks performed on average 9.185% +stronger than non-print-scanned digital morphs. + +
+
+ comment: Initial preprint. Under review +
+
+
+
+
+ + ☆ Training-Free Open-Vocabulary Segmentation with Offline + Diffusion-Augmented Prototype Generation CVPR 2024 + + +
+ Open-vocabulary semantic segmentation aims at segmenting arbitrary categories +expressed in textual form. Previous works have trained over large amounts of +image-caption pairs to enforce pixel-level multimodal alignments. However, +captions provide global information about the semantics of a given image but +lack direct localization of individual concepts. Further, training on +large-scale datasets inevitably brings significant computational costs. In this +paper, we propose FreeDA, a training-free diffusion-augmented method for +open-vocabulary semantic segmentation, which leverages the ability of diffusion +models to visually localize generated concepts and local-global similarities to +match class-agnostic regions with semantic classes. Our approach involves an +offline stage in which textual-visual reference embeddings are collected, +starting from a large set of captions and leveraging visual and semantic +contexts. At test time, these are queried to support the visual matching +process, which is carried out by jointly considering class-agnostic regions and +global semantic similarities. Extensive analyses demonstrate that FreeDA +achieves state-of-the-art performance on five datasets, surpassing previous +methods by more than 7.0 average points in terms of mIoU and without requiring +any training. + +
+
+ comment: CVPR 2024. Project page: https://aimagelab.github.io/freeda/ +
+
+
+
+
+ + ☆ GO4Align: Group Optimization for Multi-Task Alignment + + +
+ This paper proposes \textit{GO4Align}, a multi-task optimization approach +that tackles task imbalance by explicitly aligning the optimization across +tasks. To achieve this, we design an adaptive group risk minimization strategy, +comprising two crucial techniques in its implementation: (i) dynamical group +assignment, which clusters similar tasks based on task interactions; (ii) +risk-guided group indicators, which exploit consistent task correlations with +risk information from previous iterations. Comprehensive experimental results +on diverse typical benchmarks demonstrate our method's performance superiority +with even lower computational costs. + +
+
+
+
+
+ + ♻ ☆ Zero-shot Referring Expression Comprehension via Structural Similarity + Between Images and Captions CVPR 2024 + + +
+ Zero-shot referring expression comprehension aims at localizing bounding +boxes in an image corresponding to provided textual prompts, which requires: +(i) a fine-grained disentanglement of complex visual scene and textual context, +and (ii) a capacity to understand relationships among disentangled entities. +Unfortunately, existing large vision-language alignment (VLA) models, e.g., +CLIP, struggle with both aspects and so cannot be directly used for this task. To +mitigate this gap, we leverage large foundation models to disentangle both +images and texts into triplets in the format of (subject, predicate, object). +After that, grounding is accomplished by calculating the structural similarity +matrix between visual and textual triplets with a VLA model, and subsequently +propagating it to an instance-level similarity matrix. Furthermore, to equip VLA +models with the ability of relationship understanding, we design a +triplet-matching objective to fine-tune the VLA models on a curated dataset +collection containing abundant entity relationships. Experiments +demonstrate a visual grounding performance increase of up to 19.5% over +the SOTA zero-shot model on RefCOCO/+/g. On the more challenging Who's Waldo +dataset, our zero-shot approach achieves comparable accuracy to the fully +supervised model. Code is available at +https://github.com/Show-han/Zeroshot_REC. + +
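+
+ The structural-similarity step amounts to scoring each candidate's (subject, predicate, object) embeddings against the query triplet in the shared vision-language space. A small sketch where averaging the three component similarities is an illustrative choice, not necessarily the paper's aggregation:
+
+import torch
+import torch.nn.functional as F
+
+def triplet_grounding_scores(visual_triplets, text_triplet):
+    """visual_triplets: (n, 3, d) per-candidate subject/predicate/object embeddings;
+    text_triplet: (3, d) embeddings of the parsed textual triplet (same VLA space)."""
+    v = F.normalize(visual_triplets, dim=-1)
+    t = F.normalize(text_triplet, dim=-1)
+    component_sims = (v * t.unsqueeze(0)).sum(dim=-1)   # (n, 3) cosine per component
+    return component_sims.mean(dim=1)                   # (n,) structural similarity per candidate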
+
+ comment: CVPR 2024, Code available at https://github.com/Show-han/Zeroshot_REC +
+
+
+
+
+ + ♻ ☆ Multi-person 3D pose estimation from unlabelled data + + +
+ Its numerous applications make multi-human 3D pose estimation a remarkably +impactful area of research. Nevertheless, assuming a multiple-view system +composed of several regular RGB cameras, 3D multi-pose estimation presents +several challenges. First of all, each person must be uniquely identified in +the different views to separate the 2D information provided by the cameras. +Secondly, the 3D pose estimation process from the multi-view 2D information of +each person must be robust against noise and potential occlusions in the +scenario. In this work, we address these two challenges with the help of deep +learning. Specifically, we present a model based on Graph Neural Networks +capable of predicting the cross-view correspondence of the people in the +scenario along with a Multilayer Perceptron that takes the 2D points to yield +the 3D poses of each person. These two models are trained in a self-supervised +manner, thus avoiding the need for large datasets with 3D annotations. + +
+
+
+
+
+ + ♻ ☆ Influencer Backdoor Attack on Semantic Segmentation + + +
+ When a small number of poisoned samples are injected into the training +dataset of a deep neural network, the network can be induced to exhibit +malicious behavior during inferences, which poses potential threats to +real-world applications. While they have been intensively studied in +classification, backdoor attacks on semantic segmentation have been largely +overlooked. Unlike classification, semantic segmentation aims to classify every +pixel within a given image. In this work, we explore backdoor attacks on +segmentation models to misclassify all pixels of a victim class by injecting a +specific trigger on non-victim pixels during inferences, which is dubbed +Influencer Backdoor Attack (IBA). IBA is expected to maintain the +classification accuracy of non-victim pixels and mislead classifications of all +victim pixels in every single inference and could be easily applied to +real-world scenes. Based on the context aggregation ability of segmentation +models, we propose a simple yet effective Nearest-Neighbor trigger injection +strategy. We also introduce an innovative Pixel Random Labeling strategy which +maintains optimal performance even when the trigger is placed far from the +victim pixels. Our extensive experiments reveal that current segmentation +models do suffer from backdoor attacks, demonstrate IBA's real-world +applicability, and show that our proposed techniques can further increase +attack performance. + +
+
+
+
+
+ + ♻ ☆ An Edit Friendly DDPM Noise Space: Inversion and Manipulations CVPR 2024 + + +
+ Denoising diffusion probabilistic models (DDPMs) employ a sequence of white +Gaussian noise samples to generate an image. In analogy with GANs, those noise +maps could be considered as the latent code associated with the generated +image. However, this native noise space does not possess a convenient +structure, and is thus challenging to work with in editing tasks. Here, we +propose an alternative latent noise space for DDPM that enables a wide range of +editing operations via simple means, and present an inversion method for +extracting these edit-friendly noise maps for any given image (real or +synthetically generated). As opposed to the native DDPM noise space, the +edit-friendly noise maps do not have a standard normal distribution and are not +statistically independent across timesteps. However, they allow perfect +reconstruction of any desired image, and simple transformations on them +translate into meaningful manipulations of the output image (e.g. shifting, +color edits). Moreover, in text-conditional models, fixing those noise maps +while changing the text prompt, modifies semantics while retaining structure. +We illustrate how this property enables text-based editing of real images via +the diverse DDPM sampling scheme (in contrast to the popular non-diverse DDIM +inversion). We also show how it can be used within existing diffusion-based +editing methods to improve their quality and diversity. Webpage: +https://inbarhub.github.io/DDPM_inversion + +
+
+ comment: CVPR 2024. Code and examples are available at + https://github.com/inbarhub/DDPM_inversion +
+
+
+
+
+ + ♻ ☆ Event Data Association via Robust Model Fitting for Event-based Object + Tracking + + +
+ Event-based approaches, which are based on bio-inspired asynchronous event +cameras, have achieved promising performance on various computer vision tasks. +However, the study of the fundamental event data association problem is still +in its infancy. In this paper, we propose a novel Event Data Association +(called EDA) approach to explicitly address the event association and fusion +problem. The proposed EDA seeks event trajectories that best fit the event +data, in order to perform unified data association and information fusion. In +EDA, we first asynchronously fuse the event data based on its information +entropy. Then, we introduce a deterministic model hypothesis generation +strategy, which effectively generates model hypotheses from the fused events, +to represent the corresponding event trajectories. After that, we present a +two-stage weighting algorithm, which robustly weighs and selects true models +from the generated model hypotheses, through multi-structural geometric model +fitting. Meanwhile, we also propose an adaptive model selection strategy to +automatically determine the number of true models. Finally, we use the +selected true models to associate and fuse the event data, without being +affected by sensor noise and irrelevant structures. We evaluate the performance +of the proposed EDA on the object tracking task. The experimental results show +the effectiveness of EDA under challenging scenarios, such as high speed, +motion blur, and high dynamic range conditions. + +
+
+ comment: 32 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ A Spatio-temporal Aligned SUNet Model for Low-light Video Enhancement + + +
+ Distortions caused by low-light conditions are not only visually unpleasant +but also degrade the performance of computer vision tasks. Restoration and +enhancement have proven to be highly beneficial. However, there are only a +limited number of enhancement methods explicitly designed for videos acquired +in low-light conditions. We propose a Spatio-Temporal Aligned SUNet (STA-SUNet) +model using a Swin Transformer as a backbone to capture low-light video +features and exploit their spatio-temporal correlations. The STA-SUNet model is +trained on a novel, fully registered dataset (BVI), which comprises dynamic +scenes captured under varying light conditions. It is further analysed +comparatively against various other models over three test datasets. The model +demonstrates superior adaptivity across all datasets, obtaining the highest +PSNR and SSIM values. It is particularly effective in extreme low-light +conditions, yielding fairly good visualisation results. + +
+
+
+
+
+ + ♻ ☆ DIAGNOSIS: Detecting Unauthorized Data Usages in Text-to-image Diffusion + Models ICLR 2024 + + +
+ Recent text-to-image diffusion models have shown surprising performance in +generating high-quality images. However, concerns have arisen regarding the +unauthorized data usage during the training or fine-tuning process. One example +is when a model trainer collects a set of images created by a particular artist +and attempts to train a model capable of generating similar images without +obtaining permission and giving credit to the artist. To address this issue, we +propose a method for detecting such unauthorized data usage by planting the +injected memorization into the text-to-image diffusion models trained on the +protected dataset. Specifically, we modify the protected images by adding +unique content to these images using stealthy image warping functions that are +nearly imperceptible to humans but can be captured and memorized by diffusion +models. By analyzing whether the model has memorized the injected content +(i.e., whether the generated images are processed by the injected +post-processing function), we can detect models that had illegally utilized the +unauthorized data. Experiments on Stable Diffusion and VQ Diffusion with +different model training or fine-tuning methods (i.e., LoRA, DreamBooth, and +standard training) demonstrate the effectiveness of our proposed method in +detecting unauthorized data usages. Code: +https://github.com/ZhentingWang/DIAGNOSIS. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ DiffusionLight: Light Probes for Free by Painting a Chrome Ball CVPR 2024 + + +
+ We present a simple yet effective technique to estimate lighting in a single +input image. Current techniques rely heavily on HDR panorama datasets to train +neural networks to regress an input with limited field-of-view to a full +environment map. However, these approaches often struggle with real-world, +uncontrolled settings due to the limited diversity and size of their datasets. +To address this problem, we leverage diffusion models trained on billions of +standard images to render a chrome ball into the input image. Despite its +simplicity, this task remains challenging: the diffusion models often insert +incorrect or inconsistent objects and cannot readily generate images in HDR +format. Our research uncovers a surprising relationship between the appearance +of chrome balls and the initial diffusion noise map, which we utilize to +consistently generate high-quality chrome balls. We further fine-tune an LDR +diffusion model (Stable Diffusion XL) with LoRA, enabling it to perform +exposure bracketing for HDR light estimation. Our method produces convincing +light estimates across diverse settings and demonstrates superior +generalization to in-the-wild scenarios. + +
+
+ comment: CVPR 2024 Oral. For more information and code, please visit our + website https://diffusionlight.github.io/ +
+
+
+
+
+ + ♻ ☆ Learning Local and Global Temporal Contexts for Video Semantic + Segmentation CVPR + 2022 + + +
+ Contextual information plays a core role for video semantic segmentation +(VSS). This paper summarizes contexts for VSS in two-fold: local temporal +contexts (LTC) which define the contexts from neighboring frames, and global +temporal contexts (GTC) which represent the contexts from the whole video. As +for LTC, it includes static and motional contexts, corresponding to static and +moving content in neighboring frames, respectively. Previously, both static and +motional contexts have been studied. However, there is no research about +simultaneously learning static and motional contexts (highly complementary). +Hence, we propose a Coarse-to-Fine Feature Mining (CFFM) technique to learn a +unified presentation of LTC. CFFM contains two parts: Coarse-to-Fine Feature +Assembling (CFFA) and Cross-frame Feature Mining (CFM). CFFA abstracts static +and motional contexts, and CFM mines useful information from nearby frames to +enhance target features. To further exploit more temporal contexts, we propose +CFFM++ by additionally learning GTC from the whole video. Specifically, we +uniformly sample certain frames from the video and extract global contextual +prototypes by k-means. The information within those prototypes is mined by CFM +to refine target features. Experimental results on popular benchmarks +demonstrate that CFFM and CFFM++ perform favorably against state-of-the-art +methods. Our code is available at https://github.com/GuoleiSun/VSS-CFFM + +
+
+ comment: Accepted to TPAMI, an extended version of a paper published in CVPR + 2022 +
+
+
+
+
+ + ♻ ☆ SGV3D:Towards Scenario Generalization for Vision-based Roadside 3D + Object Detection + + +
+ Roadside perception can greatly increase the safety of autonomous vehicles by +extending their perception ability beyond the visual range and addressing blind +spots. However, current state-of-the-art vision-based roadside detection +methods possess high accuracy on labeled scenes but have inferior performance +on new scenes. This is because roadside cameras remain stationary after +installation and can only collect data from a single scene, resulting in the +algorithm overfitting these roadside backgrounds and camera poses. To address +this issue, in this paper, we propose an innovative Scenario Generalization +Framework for Vision-based Roadside 3D Object Detection, dubbed SGV3D. +Specifically, we employ a Background-suppressed Module (BSM) to mitigate +background overfitting in vision-centric pipelines by attenuating background +features during the 2D to bird's-eye-view projection. Furthermore, by +introducing the Semi-supervised Data Generation Pipeline (SSDG) using unlabeled +images from new scenes, diverse instance foregrounds with varying camera poses +are generated, addressing the risk of overfitting specific camera poses. We +evaluate our method on two large-scale roadside benchmarks. Our method +surpasses all previous methods by a significant margin in new scenes, including ++42.57% for vehicle, +5.87% for pedestrian, and +14.89% for cyclist compared to +BEVHeight on the DAIR-V2X-I heterologous benchmark. On the larger-scale Rope3D +heterologous benchmark, we achieve notable gains of 14.48% for car and 12.41% +for large vehicle. We aspire to contribute insights on the exploration of +roadside perception techniques, emphasizing their capability for scenario +generalization. The code will be available at +https://github.com/yanglei18/SGV3D + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Are We on the Right Way for Evaluating Large Vision-Language Models? + + +
+ Large vision-language models (LVLMs) have recently achieved rapid progress, +sparking numerous studies to evaluate their multi-modal capabilities. However, +we dig into current evaluation works and identify two primary issues: 1) Visual +content is unnecessary for many samples. The answers can be directly inferred +from the questions and options, or the world knowledge embedded in LLMs. This +phenomenon is prevalent across current benchmarks. For instance, GeminiPro +achieves 42.9% on the MMMU benchmark without any visual input, and outperforms +the random choice baseline across six benchmarks by over 24% on average. 2) +Unintentional data leakage exists in LLM and LVLM training. LLM and LVLM could +still answer some visual-necessary questions without visual content, indicating +that these samples were memorized during large-scale training. For example, +Sphinx-X-MoE gets 43.6% on MMMU without accessing images, surpassing its LLM +backbone by 17.9%. Both problems lead to misjudgments of actual multi-modal +gains and potentially misguide the study of LVLM. To this end, we present +MMStar, an elite vision-indispensable multi-modal benchmark comprising 1,500 +samples meticulously selected by humans. MMStar benchmarks 6 core capabilities +and 18 detailed axes, aiming to evaluate LVLMs' multi-modal capacities with +carefully balanced and purified samples. These samples are first roughly +selected from current benchmarks with an automated pipeline; human review is +then involved to ensure each curated sample exhibits visual dependency, minimal +data leakage, and requires advanced multi-modal capabilities. Moreover, two +metrics are developed to measure data leakage and actual performance gain in +multi-modal training. We evaluate 16 leading LVLMs on MMStar to assess their +multi-modal capabilities, and on 7 benchmarks with the proposed metrics to +investigate their data leakage and actual multi-modal gain. + +
+
+ comment: Project page: https://mmstar-benchmark.github.io/ +
+
+
+
+
+ + ♻ ☆ CN-RMA: Combined Network with Ray Marching Aggregation for 3D Indoors + Object Detection from Multi-view Images CVPR2024 + + +
+ This paper introduces CN-RMA, a novel approach for 3D indoor object detection +from multi-view images. We observe the key challenge as the ambiguity of image +and 3D correspondence without explicit geometry to provide occlusion +information. To address this issue, CN-RMA leverages the synergy of 3D +reconstruction networks and 3D object detection networks, where the +reconstruction network provides a rough Truncated Signed Distance Function +(TSDF) and guides image features to vote to 3D space correctly in an end-to-end +manner. Specifically, we associate weights to sampled points of each ray +through ray marching, representing the contribution of a pixel in an image to +corresponding 3D locations. Such weights are determined by the predicted signed +distances so that image features vote only to regions near the reconstructed +surface. Our method achieves state-of-the-art performance in 3D object +detection from multi-view images, as measured by mAP@0.25 and mAP@0.5 on the +ScanNet and ARKitScenes datasets. The code and models are released at +https://github.com/SerCharles/CN-RMA. + +
+
+ comment: CVPR2024 poster paper, 8 pages of main part, and 4 pages of + supplementary material +
+
+
+
+
+ + ♻ ☆ MetaMix: Meta-state Precision Searcher for Mixed-precision Activation + Quantization AAAI + + +
+ Mixed-precision quantization of efficient networks often suffers from +activation instability encountered in the exploration of bit selections. To +address this problem, we propose a novel method called MetaMix which consists +of bit selection and weight training phases. The bit selection phase iterates +between two steps: (1) the mixed-precision-aware weight update, and (2) the bit-search +training with the fixed mixed-precision-aware weights, both of which combined +reduce activation instability in mixed-precision quantization and contribute to +fast and high-quality bit selection. The weight training phase exploits the +weights and step sizes trained in the bit selection phase and fine-tunes them, +thereby offering fast training. Our experiments with efficient and +hard-to-quantize networks, i.e., MobileNet v2 and v3, and ResNet-18 on ImageNet +show that our proposed method pushes the boundary of mixed-precision +quantization, in terms of accuracy vs. operations, by outperforming both mixed- +and single-precision SOTA methods. + +
+
+ comment: Proc. The 38th Annual AAAI Conference on Artificial Intelligence + (AAAI) +
+
+
+
+
+ + ♻ ☆ UltraLight VM-UNet: Parallel Vision Mamba Significantly Reduces + Parameters for Skin Lesion Segmentation + + +
+ Traditionally, to improve the segmentation performance of models, most +approaches add increasingly complex modules. However, this is not suitable +for the medical field, especially for mobile medical devices, where +computationally heavy models cannot be deployed in real clinical environments +due to resource constraints. Recently, state-space models (SSMs), +represented by Mamba, have become a strong competitor to traditional CNNs and +Transformers. In this paper, we deeply explore the key elements of parameter +influence in Mamba and propose an UltraLight Vision Mamba UNet (UltraLight +VM-UNet) based on this. Specifically, we propose a method for processing +features in parallel Vision Mamba, named PVM Layer, which achieves excellent +performance with the lowest computational load while keeping the overall number +of processing channels constant. We conducted comparisons and ablation +experiments with several state-of-the-art lightweight models on three skin +lesion public datasets and demonstrated that the UltraLight VM-UNet exhibits +equally strong performance competitiveness with only 0.049M parameters and +0.060 GFLOPs. In addition, this study deeply explores the key elements of +parameter influence in Mamba, which will lay a theoretical foundation for Mamba +to possibly become a new mainstream module for lightweighting in the future. +The code is available at https://github.com/wurenkai/UltraLight-VM-UNet. + +
+
+
+
+
+ + ♻ ☆ Cross-Silo Federated Learning Across Divergent Domains with Iterative + Parameter Alignment + + +
+ Learning from the collective knowledge of data dispersed across private +sources can provide neural networks with enhanced generalization capabilities. +Federated learning, a method for collaboratively training a machine learning +model across remote clients, achieves this by combining client models via the +orchestration of a central server. However, current approaches face two +critical limitations: i) they struggle to converge when client domains are +sufficiently different, and ii) current aggregation techniques produce an +identical global model for each client. In this work, we address these issues +by reformulating the typical federated learning setup: rather than learning a +single global model, we learn N models each optimized for a common objective. +To achieve this, we apply a weighted distance minimization to model parameters +shared in a peer-to-peer topology. The resulting framework, Iterative Parameter +Alignment, applies naturally to the cross-silo setting, and has the following +properties: (i) a unique solution for each participant, with the option to +globally converge each model in the federation, and (ii) an optional +early-stopping mechanism to elicit fairness among peers in collaborative +learning settings. These characteristics jointly provide a flexible new +framework for iteratively learning from peer models trained on disparate +datasets. We find that the technique achieves competitive results on a variety +of data partitions compared to state-of-the-art approaches. Further, we show +that the method is robust to divergent domains (i.e. disjoint classes across +peers) where existing approaches struggle. + +
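+
+ The weighted distance minimization at the core of this setup adds, to each client's task loss, a penalty pulling its parameters toward its peers' parameters. A hedged sketch of that penalty (the quadratic form and fixed peer weights are illustrative assumptions, not the paper's exact objective):
+
+import torch
+
+def alignment_penalty(own_params, peer_params_list, peer_weights):
+    """own_params: list of this client's parameter tensors; peer_params_list: list of
+    matching parameter lists from peers; peer_weights: list of floats."""
+    own_params = list(own_params)
+    penalty = torch.zeros(())
+    for weight, peer_params in zip(peer_weights, peer_params_list):
+        for p_own, p_peer in zip(own_params, peer_params):
+            penalty = penalty + weight * (p_own - p_peer.detach()).pow(2).sum()
+    return penalty   # added to the local task loss each communication round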
+
+ comment: Published at IEEE Big Data 2023 +
+
+
+
+
+ + ♻ ☆ Coarse-to-Fine Latent Diffusion for Pose-Guided Person Image Synthesis CVPR 2024 + + +
+ Diffusion model is a promising approach to image generation and has been +employed for Pose-Guided Person Image Synthesis (PGPIS) with competitive +performance. While existing methods simply align the person appearance to the +target pose, they are prone to overfitting due to the lack of a high-level +semantic understanding on the source person image. In this paper, we propose a +novel Coarse-to-Fine Latent Diffusion (CFLD) method for PGPIS. In the absence +of image-caption pairs and textual prompts, we develop a novel training +paradigm purely based on images to control the generation process of a +pre-trained text-to-image diffusion model. A perception-refined decoder is +designed to progressively refine a set of learnable queries and extract +semantic understanding of person images as a coarse-grained prompt. This allows +for the decoupling of fine-grained appearance and pose information controls at +different stages, and thus circumventing the potential overfitting problem. To +generate more realistic texture details, a hybrid-granularity attention module +is proposed to encode multi-scale fine-grained appearance features as bias +terms to augment the coarse-grained prompt. Both quantitative and qualitative +experimental results on the DeepFashion benchmark demonstrate the superiority +of our method over the state of the arts for PGPIS. Code is available at +https://github.com/YanzuoLu/CFLD. + +
+
+ comment: Accepted by CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ One-Step Late Fusion Multi-view Clustering with Compressed Subspace ICASSP2024 + + +
+ Late fusion multi-view clustering (LFMVC) has become a rapidly growing class +of methods in the multi-view clustering (MVC) field, owing to its excellent +computational speed and clustering performance. One bottleneck faced by +existing late fusion methods is that they are usually aligned to the average +kernel function, which makes the clustering performance highly dependent on the +quality of datasets. Another problem is that they require subsequent k-means +clustering after obtaining the consensus partition matrix to get the final +discrete labels, and the resulting separation of the label learning and cluster +structure optimization processes limits the integrity of these models. To +address the above issues, we propose an integrated framework named One-Step +Late Fusion Multi-view Clustering with Compressed Subspace (OS-LFMVC-CS). +Specifically, we use the consensus subspace to align the partition matrix while +optimizing the partition fusion, and utilize the fused partition matrix to +guide the learning of discrete labels. A six-step iterative optimization +approach with verified convergence is proposed. Sufficient experiments on +multiple datasets validate the effectiveness and efficiency of our proposed +method. + +
+
+ comment: Accepted by ICASSP2024 +
+
+
+
+
+ + ♻ ☆ Deepfake Generation and Detection: A Benchmark and Survey + + +
+ In addition to the advancements in deepfake generation, corresponding detection technologies need to continuously evolve to regulate the potential misuse of deepfakes, such as for privacy invasion and phishing attacks. This survey comprehensively reviews the latest developments in deepfake generation and detection, summarizing and analyzing the current state of the art in this rapidly evolving field. We first unify task definitions, comprehensively introduce datasets and metrics, and discuss the development of generation and detection technology frameworks. Then, we discuss the development of several related sub-fields and focus on four mainstream deepfake fields: face swap, face reenactment, talking face generation, and facial attribute editing, as well as forgery detection. Subsequently, we comprehensively benchmark representative methods on popular datasets for each field, fully evaluating the latest and influential works published in top conferences/journals. Finally, we analyze the challenges and future research directions of the discussed fields. We closely follow the latest developments in https://github.com/flyingby/Awesome-Deepfake-Generation-and-Detection.
+
+
+
+
+ + ♻ ☆ MultIOD: Rehearsal-free Multihead Incremental Object Detector CVPR 2024 + + +
+ Class-Incremental learning (CIL) refers to the ability of artificial agents +to integrate new classes as they appear in a stream. It is particularly +interesting in evolving environments where agents have limited access to memory +and computational resources. The main challenge of incremental learning is +catastrophic forgetting, the inability of neural networks to retain past +knowledge when learning a new one. Unfortunately, most existing +class-incremental methods for object detection are applied to two-stage +algorithms such as Faster-RCNN, and rely on rehearsal memory to retain past +knowledge. We argue that those are not suitable in resource-limited +environments, and more effort should be dedicated to anchor-free and +rehearsal-free object detection. In this paper, we propose MultIOD, a +class-incremental object detector based on CenterNet. Our contributions are: +(1) we propose a multihead feature pyramid and multihead detection architecture +to efficiently separate class representations, (2) we employ transfer learning +between classes learned initially and those learned incrementally to tackle +catastrophic forgetting, and (3) we use a class-wise non-max-suppression as a +post-processing technique to remove redundant boxes. Results show that our +method outperforms state-of-the-art methods on two Pascal VOC datasets, while +only saving the model in its current state, contrary to other +distillation-based counterparts. + +
+
+ comment: Accepted at the archival track of the Workshop on Continual Learning + in Computer Vision (CVPR 2024) +
+
+
+
+
+ + ♻ ☆ BlockFusion: Expandable 3D Scene Generation using Latent Tri-plane + Extrapolation + + +
+ We present BlockFusion, a diffusion-based model that generates 3D scenes as +unit blocks and seamlessly incorporates new blocks to extend the scene. +BlockFusion is trained using datasets of 3D blocks that are randomly cropped +from complete 3D scene meshes. Through per-block fitting, all training blocks +are converted into the hybrid neural fields: with a tri-plane containing the +geometry features, followed by a Multi-layer Perceptron (MLP) for decoding the +signed distance values. A variational auto-encoder is employed to compress the +tri-planes into the latent tri-plane space, on which the denoising diffusion +process is performed. Diffusion applied to the latent representations allows +for high-quality and diverse 3D scene generation. To expand a scene during +generation, one needs only to append empty blocks to overlap with the current +scene and extrapolate existing latent tri-planes to populate new blocks. The +extrapolation is done by conditioning the generation process with the feature +samples from the overlapping tri-planes during the denoising iterations. Latent +tri-plane extrapolation produces semantically and geometrically meaningful +transitions that harmoniously blend with the existing scene. A 2D layout +conditioning mechanism is used to control the placement and arrangement of +scene elements. Experimental results indicate that BlockFusion is capable of +generating diverse, geometrically consistent and unbounded large 3D scenes with +unprecedented high-quality shapes in both indoor and outdoor scenarios. + +
+
+ comment: Video: https://www.youtube.com/watch?v=PxIBtd6G0mA +
+
+
+
+
+ + ♻ ☆ Learning Zero-Shot Material States Segmentation, by Implanting Natural + Image Patterns in Synthetic Data + + +
+ Visual understanding and segmentation of materials and their states is fundamental to understanding the physical world. The myriad textures, shapes, and often blurry boundaries formed by materials make this task particularly hard to generalize. Whether it's identifying wet regions of a surface, minerals in rocks, infected regions in plants, or pollution in water, each material state has its own unique form. For neural nets to learn general class-agnostic material segmentation, it is necessary to first collect and annotate data that captures this complexity. Collecting and manually annotating real-world images is limited by the cost and precision of manual labor. In contrast, synthetic CGI data is highly accurate and almost cost-free, but fails to replicate the vast diversity of the material world. This work offers a method to bridge this crucial gap by implanting patterns extracted from real-world images in synthetic data. Hence, patterns automatically collected from natural images are used to map materials into synthetic scenes. This unsupervised approach allows the generated data to capture the vast complexity of the real world while maintaining the precision and scale of synthetic data. We also present the first general benchmark for zero-shot material state segmentation. The benchmark contains a wide range of real-world images of material states, like food, rocks, construction, plants, liquids, and many others, each in various states (wet/dry/stained/cooked/burned/worn/rusted/sediment/foam, etc.). The annotation includes both partial similarity between regions with similar but not identical materials, and hard segmentation of only points in the exact same material state. We show that networks trained on MatSeg significantly outperform existing state-of-the-art methods on this task. The dataset, code, and trained model are available
+
+
+
+
+ + ♻ ☆ Improved Probabilistic Image-Text Representations ICLR 2024 + + +
+ The Image-Text Matching (ITM) task, a fundamental vision-language (VL) task, suffers from the inherent ambiguity arising from multiplicity and imperfect annotations. Deterministic functions are not sufficiently powerful to capture ambiguity, prompting the exploration of probabilistic embeddings to tackle the challenge. However, the existing probabilistic ITM approach encounters two key shortcomings: the burden of heavy computations due to the Monte Carlo approximation, and the loss saturation issue in the face of abundant false negatives. To overcome the issues, this paper presents an improved Probabilistic Cross-Modal Embeddings (named PCME++) by introducing a new probabilistic distance with a closed-form solution. In addition, two optimization techniques are proposed to enhance PCME++ further: first, the incorporation of pseudo-positives to prevent the negative effect under massive false negatives; second, mixed sample data augmentation for probabilistic matching. Experimental results on MS-COCO Caption and two extended benchmarks, CxC and ECCV Caption, demonstrate the effectiveness of PCME++ compared to state-of-the-art ITM methods. The robustness of PCME++ is also evaluated under noisy image-text correspondences. In addition, the potential applicability of PCME++ in automatic prompt-filtering for zero-shot classification is shown. The code is available at https://github.com/naver-ai/pcmepp
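As an illustration of why a closed-form probabilistic distance removes the Monte Carlo burden mentioned above, the sketch below computes the squared 2-Wasserstein distance between diagonal Gaussian embeddings in PyTorch. This is a generic closed-form choice used only for illustration; it is not claimed to be the exact distance introduced by PCME++.

```python
import torch

def wasserstein2_diag(mu1, sigma1, mu2, sigma2):
    """Squared 2-Wasserstein distance between diagonal Gaussians N(mu1, diag(sigma1^2))
    and N(mu2, diag(sigma2^2)). Shows how a closed-form probabilistic distance avoids
    Monte Carlo sampling; not necessarily the distance actually used by PCME++."""
    return ((mu1 - mu2) ** 2).sum(-1) + ((sigma1 - sigma2) ** 2).sum(-1)

# Toy image/text embeddings with per-dimension means and standard deviations.
img_mu, img_sigma = torch.randn(4, 128), torch.rand(4, 128)
txt_mu, txt_sigma = torch.randn(4, 128), torch.rand(4, 128)
print(wasserstein2_diag(img_mu, img_sigma, txt_mu, txt_sigma))  # shape: (4,)
```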
+
+ comment: ICLR 2024 camera-ready; Code: https://github.com/naver-ai/pcmepp. + Project page: https://naver-ai.github.io/pcmepp/. 30 pages, 2.2 MB +
+
+
+
+
+ + ♻ ☆ Industrial Application of 6D Pose Estimation for Robotic Manipulation in + Automotive Internal Logistics + + +
+ Despite the advances in robotics, a large proportion of the parts-handling tasks in the automotive industry's internal logistics are not automated but still performed by humans. A key component to competitively automate these processes is a 6D pose estimation that can handle a large number of different parts, is adaptable to new parts with little manual effort, and is sufficiently accurate and robust with respect to industry requirements. In this context, the question arises as to the current status quo with respect to these measures. To address this we built a representative 6D pose estimation pipeline with state-of-the-art components from economically scalable real to synthetic data generation to pose estimators and evaluated it on automotive parts with regards to a realistic sequencing process. We found that, using these data generation approaches, the performance of the trained 6D pose estimators is promising but does not meet industry requirements. We reveal that the reason for this is the inability of the estimators to provide reliable uncertainties for their poses, rather than their ability to provide sufficiently accurate poses. In this context we further analyzed how RGB- and RGB-D-based approaches compare against this background and show that they are differently vulnerable to the domain gap induced by synthetic data.
+
+ comment: Accepted for publication at IEEE International Conference on + Automation Science and Engineering (CASE 2023) +
+
+
+
+
+ + ♻ ☆ Self-training via Metric Learning for Source-Free Domain Adaptation of + Semantic Segmentation + + +
+ Unsupervised source-free domain adaptation methods aim to train a model for +the target domain utilizing a pretrained source-domain model and unlabeled +target-domain data, particularly when accessibility to source data is +restricted due to intellectual property or privacy concerns. Traditional +methods usually use self-training with pseudo-labeling, which is often +subjected to thresholding based on prediction confidence. However, such +thresholding limits the effectiveness of self-training due to insufficient +supervision. This issue becomes more severe in a source-free setting, where +supervision comes solely from the predictions of the pre-trained source model. +In this study, we propose a novel approach by incorporating a mean-teacher +model, wherein the student network is trained using all predictions from the +teacher network. Instead of employing thresholding on predictions, we introduce +a method to weight the gradients calculated from pseudo-labels based on the +reliability of the teacher's predictions. To assess reliability, we introduce a +novel approach using proxy-based metric learning. Our method is evaluated in +synthetic-to-real and cross-city scenarios, demonstrating superior performance +compared to existing state-of-the-art methods. + +
+
+ comment: This paper is under consideration at Computer Vision and Image + Understanding +
+
+
+
+
+ + ♻ ☆ Fine-grained Action Analysis: A Multi-modality and Multi-task Dataset of + Figure Skating + + +
+ Fine-grained action analysis on existing action datasets is challenged by insufficient action categories, coarse granularity, and limited modalities and tasks. In this paper, we propose a Multi-modality and Multi-task dataset of Figure Skating (MMFS) which was collected from the World Figure Skating Championships. MMFS, which supports both action recognition and action quality assessment, captures RGB and skeleton modalities together with action scores for 11,671 clips spanning 256 categories with spatial and temporal labels. The key contributions of our dataset fall into three aspects as follows. (1) Independent spatial and temporal categories are proposed for the first time to further explore fine-grained action recognition and quality assessment. (2) MMFS is the first to introduce the skeleton modality for complex fine-grained action quality assessment. (3) Our multi-modality and multi-task dataset encourages the development of more action analysis models. To benchmark our dataset, we adopt RGB-based and skeleton-based baseline methods for action recognition and action quality assessment.
+
+
+
+
+ + ♻ ☆ Co-Occ: Coupling Explicit Feature Fusion with Volume Rendering + Regularization for Multi-Modal 3D Semantic Occupancy Prediction + + +
+ 3D semantic occupancy prediction is a pivotal task in the field of autonomous driving. Recent approaches have made great advances in 3D semantic occupancy predictions on a single modality. However, multi-modal semantic occupancy prediction approaches have encountered difficulties in dealing with the modality heterogeneity, modality misalignment, and insufficient modality interactions that arise when fusing data from different modalities, which may result in the loss of important geometric and semantic information. This letter presents a novel multi-modal, i.e., LiDAR-camera 3D semantic occupancy prediction framework, dubbed Co-Occ, which couples explicit LiDAR-camera feature fusion with implicit volume rendering regularization. The key insight is that volume rendering in the feature space can proficiently bridge the gap between 3D LiDAR sweeps and 2D images while serving as a physical regularization to enhance the LiDAR-camera fused volumetric representation. Specifically, we first propose a Geometric- and Semantic-aware Fusion (GSFusion) module to explicitly enhance LiDAR features by incorporating neighboring camera features through a K-nearest neighbors (KNN) search. Then, we employ volume rendering to project the fused feature back to the image planes for reconstructing color and depth maps. These maps are then supervised by input images from the camera and depth estimations derived from LiDAR, respectively. Extensive experiments on the popular nuScenes and SemanticKITTI benchmarks verify the effectiveness of our Co-Occ for 3D semantic occupancy prediction. The project page is available at https://rorisis.github.io/Co-Occ_project-page/.
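A minimal sketch of the KNN-based fusion idea described for GSFusion: each LiDAR point feature is augmented with the average of its K nearest camera features. The raw-coordinate KNN, mean pooling, and simple concatenation are simplifying assumptions for illustration; the actual module uses projected camera features and learned fusion layers.

```python
import torch

def knn_fuse(lidar_xyz, lidar_feat, cam_xyz, cam_feat, k=4):
    """Enhance each LiDAR feature with the mean of its K nearest camera features.
    A hedged sketch of KNN-based geometric/semantic fusion, not the paper's module."""
    dists = torch.cdist(lidar_xyz, cam_xyz)               # (N_lidar, N_cam)
    idx = dists.topk(k, largest=False).indices            # K nearest camera points
    neighbors = cam_feat[idx].mean(dim=1)                 # (N_lidar, C_cam)
    return torch.cat([lidar_feat, neighbors], dim=-1)     # fused per-point feature

lidar_xyz, lidar_feat = torch.randn(1024, 3), torch.randn(1024, 32)
cam_xyz, cam_feat = torch.randn(4096, 3), torch.randn(4096, 16)
print(knn_fuse(lidar_xyz, lidar_feat, cam_xyz, cam_feat).shape)  # (1024, 48)
```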
+
+
+
+
+ + ♻ ☆ Anchor-based Multi-view Subspace Clustering with Hierarchical Feature + Descent + + +
+ Multi-view clustering has attracted growing attention owing to its capabilities of aggregating information from various sources and its promising horizons in public affairs. Up till now, many advanced approaches have been proposed in recent literature. However, there are several ongoing difficulties to be tackled. One common dilemma occurs while attempting to align the features of different views. Moreover, because many existing multi-view clustering algorithms stem from spectral clustering, they incur cubic time complexity with respect to the size of the dataset. To address this, we propose Anchor-based Multi-view Subspace Clustering with Hierarchical Feature Descent (MVSC-HFD), which tackles the discrepancy among views through hierarchical feature descent and projection to a common subspace (Stage 1), revealing the dependency of different views. We further reduce the computational complexity to linear time cost through a unified sampling strategy in the common subspace (Stage 2), followed by anchor-based subspace clustering to learn the bipartite graph collectively (Stage 3). Extensive experimental results on public benchmark datasets demonstrate that our proposed model consistently outperforms the state-of-the-art techniques.
+
+
+
+
+ + ♻ ☆ Simple Semantic-Aided Few-Shot Learning CVPR 2024 + + +
+ Learning from a limited amount of data, namely Few-Shot Learning, stands out +as a challenging computer vision task. Several works exploit semantics and +design complicated semantic fusion mechanisms to compensate for rare +representative features within restricted data. However, relying on naive +semantics such as class names introduces biases due to their brevity, while +acquiring extensive semantics from external knowledge takes a huge time and +effort. This limitation severely constrains the potential of semantics in +Few-Shot Learning. In this paper, we design an automatic way called Semantic +Evolution to generate high-quality semantics. The incorporation of high-quality +semantics alleviates the need for complex network structures and learning +algorithms used in previous works. Hence, we employ a simple two-layer network +termed Semantic Alignment Network to transform semantics and visual features +into robust class prototypes with rich discriminative features for few-shot +classification. The experimental results show our framework outperforms all +previous methods on six benchmarks, demonstrating a simple network with +high-quality semantics can beat intricate multi-modal modules on few-shot +classification tasks. Code is available at +https://github.com/zhangdoudou123/SemFew. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Deep Multi-Threshold Spiking-UNet for Image Processing + + +
+ U-Net, known for its simple yet efficient architecture, is widely utilized for image processing tasks and is particularly suitable for deployment on neuromorphic chips. This paper introduces the novel concept of Spiking-UNet for image processing, which combines the power of Spiking Neural Networks (SNNs) with the U-Net architecture. To achieve an efficient Spiking-UNet, we face two primary challenges: ensuring high-fidelity information propagation through the network via spikes and formulating an effective training strategy. To address the issue of information loss, we introduce multi-threshold spiking neurons, which improve the efficiency of information transmission within the Spiking-UNet. For the training strategy, we adopt a conversion and fine-tuning pipeline that leverages pre-trained U-Net models. During the conversion process, significant variability in data distribution across different parts is observed when utilizing skip connections. Therefore, we propose a connection-wise normalization method to prevent inaccurate firing rates. Furthermore, we adopt a flow-based training method to fine-tune the converted models, reducing time steps while preserving performance. Experimental results show that, on image segmentation and denoising, our Spiking-UNet achieves comparable performance to its non-spiking counterpart, surpassing existing SNN methods. Compared with the converted Spiking-UNet without fine-tuning, our Spiking-UNet reduces inference time by approximately 90\%. This research broadens the application scope of SNNs in image processing and is expected to inspire further exploration in the field of neuromorphic engineering. The code for our Spiking-UNet implementation is available at https://github.com/SNNresearch/Spiking-UNet.
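One plausible reading of a multi-threshold spiking neuron is sketched below: a single time step emits a graded spike count equal to the number of thresholds crossed, so each step carries more information than a binary spike. The threshold values and the subtractive reset rule are illustrative assumptions, not the paper's exact neuron model.

```python
import torch

def multi_threshold_spike(membrane, thresholds=(0.5, 1.0, 1.5)):
    """Emit a graded spike equal to the number of thresholds the membrane potential
    crosses, then apply a subtractive reset. Minimal sketch of the multi-threshold
    spiking-neuron idea; thresholds and reset step (0.5) are assumptions."""
    spikes = torch.zeros_like(membrane)
    for th in thresholds:
        spikes = spikes + (membrane >= th).float()
    reset = membrane - spikes * 0.5      # assumed subtractive reset of 0.5 per spike
    return spikes, reset

v = torch.tensor([0.2, 0.7, 1.2, 2.0])
s, v_next = multi_threshold_spike(v)
print(s)       # tensor([0., 1., 2., 3.])
print(v_next)  # tensor([0.2000, 0.2000, 0.2000, 0.5000])
```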
+
+ comment: Accepted in NeuroComputing +
+
+
+
+
+ + ♻ ☆ PASTA: Towards Flexible and Efficient HDR Imaging Via Progressively + Aggregated Spatio-Temporal Alignment + + +
+ Leveraging Transformer attention has led to great advancements in HDR deghosting. However, the intricate nature of self-attention introduces practical challenges, as existing state-of-the-art methods often demand high-end GPUs or exhibit slow inference speeds, especially for high-resolution images like 2K. Striking an optimal balance between performance and latency remains a critical concern. In response, this work presents PASTA, a novel Progressively Aggregated Spatio-Temporal Alignment framework for HDR deghosting. Our approach achieves effectiveness and efficiency by harnessing hierarchical representation during feature disentanglement. Through the utilization of diverse granularities within the hierarchical structure, our method substantially boosts computational speed and optimizes the HDR imaging workflow. In addition, we explore within-scale feature modeling with local and global attention, gradually merging and refining them in a coarse-to-fine fashion. Experimental results showcase PASTA's superiority over current SOTA methods in both visual quality and performance metrics, accompanied by a substantial 3-fold (x3) increase in inference speed.
+
+
+
+
+ + ♻ ☆ PAT: Pixel-wise Adaptive Training for Long-tailed Segmentation + + +
+ Beyond class frequency, we recognize the impact of class-wise relationships among various class-specific predictions and the imbalance in label masks on long-tailed segmentation learning. To address these challenges, we propose an innovative Pixel-wise Adaptive Training (PAT) technique tailored for long-tailed segmentation. PAT has two key features: 1) class-wise gradient magnitude homogenization, and 2) pixel-wise class-specific loss adaptation (PCLA). First, the class-wise gradient magnitude homogenization helps alleviate the imbalance among label masks by ensuring equal consideration of the class-wise impact on model updates. Second, PCLA tackles the detrimental impact of both rare classes within the long-tailed distribution and inaccurate predictions from previous training stages by encouraging learning classes with low prediction confidence and guarding against forgetting classes with high confidence. This combined approach fosters robust learning while preventing the model from forgetting previously learned knowledge. PAT exhibits significant performance improvements, surpassing the current state-of-the-art by 2.2% on the NYU dataset. Moreover, it enhances overall pixel-wise accuracy by 2.85% and intersection over union value by 2.07%, with a particularly notable decline of 0.39% in detecting rare classes compared to Balance Logits Variation, as demonstrated on the three popular datasets, i.e., OxfordPetIII, CityScape, and NYU.
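A hedged sketch of pixel-wise loss adaptation in the spirit of PCLA follows: per-pixel cross-entropy is re-weighted by how unconfident the model is about the true class, so low-confidence (often rare) classes are emphasized. The specific weighting is an assumption made for illustration and may differ from PAT's formulation.

```python
import torch
import torch.nn.functional as F

def pixelwise_adaptive_ce(logits, target):
    """Per-pixel cross-entropy re-weighted by (1 - predicted confidence of the
    true class), so uncertain pixels contribute more. Illustrative sketch only."""
    ce = F.cross_entropy(logits, target, reduction="none")          # (B, H, W)
    probs = logits.softmax(dim=1)
    true_conf = probs.gather(1, target.unsqueeze(1)).squeeze(1)     # (B, H, W)
    weights = (1.0 - true_conf).detach()
    return (weights * ce).mean()

logits = torch.randn(2, 5, 8, 8)                 # 5 classes, 8x8 "images"
target = torch.randint(0, 5, (2, 8, 8))
print(pixelwise_adaptive_ce(logits, target))
```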
+
+
+
+
+ + ♻ ☆ Anomaly Score: Evaluating Generative Models and Individual Generated + Images based on Complexity and Vulnerability CVPR 2024 + + +
+ With the advancement of generative models, the assessment of generated images becomes more and more important. Previous methods measure distances between features of reference and generated images from trained vision models. In this paper, we conduct an extensive investigation into the relationship between the representation space and input space around generated images. We first propose two measures related to the presence of unnatural elements within images: complexity, which indicates how non-linear the representation space is, and vulnerability, which is related to how easily the extracted feature changes under adversarial input changes. Based on these, we introduce a new metric for evaluating image-generative models, called anomaly score (AS). Moreover, we propose AS-i (anomaly score for individual images) that can effectively evaluate generated images individually. Experimental results demonstrate the validity of the proposed approach.
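The snippet below illustrates one way to quantify the "vulnerability" measure described above: perturb the input with a single signed-gradient step and record how far the extracted feature moves. The attack form, step size, and toy backbone are illustrative assumptions rather than the paper's protocol.

```python
import torch

def feature_vulnerability(feature_extractor, x, eps=2.0 / 255):
    """How much the representation moves under a one-step adversarial input
    perturbation -- a hedged sketch of a 'vulnerability'-style measure."""
    x = x.clone().requires_grad_(True)
    feat = feature_extractor(x)
    # Perturb the input in the direction that most increases the feature norm.
    feat.norm(dim=1).sum().backward()
    x_adv = (x + eps * x.grad.sign()).detach()
    with torch.no_grad():
        shift = (feature_extractor(x_adv) - feat.detach()).norm(dim=1)
    return shift  # larger shift -> feature is more easily disturbed

backbone = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 64))
images = torch.rand(4, 3, 32, 32)
print(feature_vulnerability(backbone, images))
```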
+
+ comment: Accepted in CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Scalable 3D Registration via Truncated Entry-wise Absolute Residuals CVPR 2024 + + +
+ Given an input set of $3$D point pairs, the goal of outlier-robust $3$D +registration is to compute some rotation and translation that align as many +point pairs as possible. This is an important problem in computer vision, for +which many highly accurate approaches have been recently proposed. Despite +their impressive performance, these approaches lack scalability, often +overflowing the $16$GB of memory of a standard laptop to handle roughly +$30,000$ point pairs. In this paper, we propose a $3$D registration approach +that can process more than ten million ($10^7$) point pairs with over $99\%$ +random outliers. Moreover, our method is efficient, entails low memory costs, +and maintains high accuracy at the same time. We call our method TEAR, as it +involves minimizing an outlier-robust loss that computes Truncated Entry-wise +Absolute Residuals. To minimize this loss, we decompose the original +$6$-dimensional problem into two subproblems of dimensions $3$ and $2$, +respectively, solved in succession to global optimality via a customized +branch-and-bound method. While branch-and-bound is often slow and unscalable, +this does not apply to TEAR as we propose novel bounding functions that are +tight and computationally efficient. Experiments on various datasets are +conducted to validate the scalability and efficiency of our method. + +
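The loss named in the abstract is straightforward to write down. Below is a sketch of the truncated entry-wise absolute residual objective, with an assumed truncation level; the branch-and-bound solver that minimizes it to global optimality is not shown.

```python
import torch

def tear_loss(R, t, src, dst, truncation=0.1):
    """Truncated Entry-wise Absolute Residuals: apply the candidate rotation R and
    translation t to the source points, take the absolute residual of every
    coordinate, and truncate it so outliers contribute at most `truncation`."""
    residuals = (src @ R.T + t - dst).abs()               # (N, 3) entry-wise residuals
    return torch.minimum(residuals, torch.tensor(truncation)).sum()

src = torch.randn(1000, 3)
R = torch.eye(3)
t = torch.tensor([0.1, -0.2, 0.05])
dst = src @ R.T + t + 0.01 * torch.randn(1000, 3)        # noisy correspondences
print(tear_loss(R, t, src, dst))
```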
+
+ comment: 24 pages, 12 figures. Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ CoBra: Complementary Branch Fusing Class and Semantic Knowledge for + Robust Weakly Supervised Semantic Segmentation + + +
+ Leveraging semantically precise pseudo masks derived from image-level class knowledge for segmentation, namely image-level Weakly Supervised Semantic Segmentation (WSSS), remains challenging. While Class Activation Maps (CAMs) using CNNs have steadily been contributing to the success of WSSS, the resulting activation maps often narrowly focus on class-specific parts (e.g., only the face of a person). On the other hand, recent works based on vision transformers (ViT) have shown promising results based on their self-attention mechanism to capture the semantic parts but fail to capture complete class-specific details (e.g., the entire body of a person, but also a nearby dog). In this work, we propose Complementary Branch (CoBra), a novel dual branch framework consisting of two distinct architectures which provide valuable complementary knowledge of class (from CNN) and semantic (from ViT) to each branch. In particular, we learn Class-Aware Projection (CAP) for the CNN branch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly fuse their complementary knowledge and facilitate a new type of extra patch-level supervision. Our model, through CoBra, fuses CNN and ViT's complementary outputs to create robust pseudo masks that integrate both class and semantic information effectively. Extensive experiments qualitatively and quantitatively investigate how CNN and ViT complement each other on the PASCAL VOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not only the masks generated by our model, but also the segmentation results derived from utilizing these masks as pseudo labels.
+
+
+
+
+ + ♻ ☆ BIVDiff: A Training-Free Framework for General-Purpose Video Synthesis + via Bridging Image and Video Diffusion Models CVPR 2024 + + +
+ Diffusion models have made tremendous progress in text-driven image and video +generation. Now text-to-image foundation models are widely applied to various +downstream image synthesis tasks, such as controllable image generation and +image editing, while downstream video synthesis tasks are less explored for +several reasons. First, it requires huge memory and computation overhead to +train a video generation foundation model. Even with video foundation models, +additional costly training is still required for downstream video synthesis +tasks. Second, although some works extend image diffusion models into videos in +a training-free manner, temporal consistency cannot be well preserved. Finally, +these adaption methods are specifically designed for one task and fail to +generalize to different tasks. To mitigate these issues, we propose a +training-free general-purpose video synthesis framework, coined as {\bf +BIVDiff}, via bridging specific image diffusion models and general +text-to-video foundation diffusion models. Specifically, we first use a +specific image diffusion model (e.g., ControlNet and Instruct Pix2Pix) for +frame-wise video generation, then perform Mixed Inversion on the generated +video, and finally input the inverted latents into the video diffusion models +(e.g., VidRD and ZeroScope) for temporal smoothing. This decoupled framework +enables flexible image model selection for different purposes with strong task +generalization and high efficiency. To validate the effectiveness and general +use of BIVDiff, we perform a wide range of video synthesis tasks, including +controllable video generation, video editing, video inpainting, and +outpainting. + +
+
+ comment: Accepted by CVPR 2024. Project page: https://bivdiff.github.io; + GitHub repository: https://github.com/MCG-NJU/BIVDiff +
+
+
+
+
+ + ♻ ☆ Empowering Image Recovery_ A Multi-Attention Approach + + +
+ We propose Diverse Restormer (DART), a novel image restoration method that +effectively integrates information from various sources (long sequences, local +and global regions, feature dimensions, and positional dimensions) to address +restoration challenges. While Transformer models have demonstrated excellent +performance in image restoration due to their self-attention mechanism, they +face limitations in complex scenarios. Leveraging recent advancements in +Transformers and various attention mechanisms, our method utilizes customized +attention mechanisms to enhance overall performance. DART, our novel network +architecture, employs windowed attention to mimic the selective focusing +mechanism of human eyes. By dynamically adjusting receptive fields, it +optimally captures the fundamental features crucial for image resolution +reconstruction. Efficiency and performance balance are achieved through the +LongIR attention mechanism for long sequence image restoration. Integration of +attention mechanisms across feature and positional dimensions further enhances +the recovery of fine details. Evaluation across five restoration tasks +consistently positions DART at the forefront. Upon acceptance, we commit to +providing publicly accessible code and models to ensure reproducibility and +facilitate further research. + +
+
+ comment: 12 pages, 10 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ TriSAM: Tri-Plane SAM for zero-shot cortical blood vessel segmentation + in VEM images + + +
+ While imaging techniques at macro and mesoscales have garnered substantial +attention and resources, microscale VEM imaging, capable of revealing intricate +vascular details, has lacked the necessary benchmarking infrastructure. In this +paper, we address a significant gap in the field of neuroimaging by introducing +the largest-to-date public benchmark, \textbf{BvEM}, designed specifically for +cortical blood vessel segmentation in volume electron microscopy (VEM) images. +Our BvEM benchmark is based on VEM image volumes from three mammal species: +adult mouse, macaque, and human. We standardized the resolution, addressed +imaging variations, and meticulously annotated blood vessels through +semi-automatic, manual, and quality control processes, ensuring high-quality 3D +segmentation. Furthermore, we developed a zero-shot cortical blood vessel +segmentation method named TriSAM, which leverages the powerful segmentation +model SAM for 3D segmentation. To extend SAM from 2D to 3D volume segmentation, +TriSAM employs a multi-seed tracking framework, leveraging the reliability of +certain image planes for tracking while using others to identify potential +turning points. This approach effectively achieves long-term 3D blood vessel +segmentation without model training or fine-tuning. Experimental results show +that TriSAM achieved superior performances on the BvEM benchmark across three +species. + +
+
+ comment: BvEM-Mouse can be visualized at: https://tinyurl.com/yc2s38x9 +
+
+
+
+
+ + ♻ ☆ GeRM: A Generalist Robotic Model with Mixture-of-experts for Quadruped + Robot + + +
+ Multi-task robot learning holds significant importance in tackling diverse and complex scenarios. However, current approaches are hindered by performance issues and difficulties in collecting training datasets. In this paper, we propose GeRM (Generalist Robotic Model). We utilize offline reinforcement learning to optimize data utilization strategies to learn from both demonstrations and sub-optimal data, thus surpassing the limitations of human demonstrations. Thereafter, we employ a transformer-based VLA network to process multi-modal inputs and output actions. By introducing the Mixture-of-Experts structure, GeRM allows faster inference speed with higher whole-model capacity, and thus resolves the issue of limited RL parameters, enhancing model performance in multi-task learning while controlling computational costs. Through a series of experiments, we demonstrate that GeRM outperforms other methods across all tasks, while also validating its efficiency in both training and inference processes. Additionally, we uncover its potential to acquire emergent skills. We also contribute the QUARD-Auto dataset, collected automatically to support our training approach and foster advancements in multi-task quadruped robot learning. This work presents a new paradigm for reducing the cost of collecting robot data and driving progress in the multi-task learning community. You can reach our project and video through the link: https://songwxuan.github.io/GeRM/.
+
+
+
+
+ + ♻ ☆ Exploring Recurrent Long-term Temporal Fusion for Multi-view 3D + Perception + + +
+ Long-term temporal fusion is a crucial but often overlooked technique in camera-based Bird's-Eye-View (BEV) 3D perception. Most existing methods fuse frames in a parallel manner. While parallel fusion can benefit from long-term information, it suffers from increasing computational and memory overheads as the fusion window size grows. Alternatively, BEVFormer adopts a recurrent fusion pipeline so that history information can be efficiently integrated, yet it fails to benefit from longer temporal frames. In this paper, we explore an embarrassingly simple long-term recurrent fusion strategy built upon the LSS-based methods and find it already able to enjoy the merits from both sides, i.e., rich long-term information and an efficient fusion pipeline. A temporal embedding module is further proposed to improve the model's robustness against occasionally missed frames in practical scenarios. We name this simple but effective fusing pipeline VideoBEV. Experimental results on the nuScenes benchmark show that VideoBEV obtains strong performance on various camera-based 3D perception tasks, including object detection (55.4\% mAP and 62.9\% NDS), segmentation (48.6\% vehicle mIoU), tracking (54.8\% AMOTA), and motion prediction (0.80m minADE and 0.463 EPA).
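A minimal sketch of recurrent long-term temporal fusion on BEV features, in the spirit of the strategy described above: a running history feature is concatenated with each new frame's BEV feature and compressed back to the original channel width. Ego-motion warping of the history feature and the temporal embedding module are omitted; the channel sizes are assumptions.

```python
import torch
import torch.nn as nn

class RecurrentBEVFusion(nn.Module):
    """Recurrently fuse a stream of BEV feature maps: concatenate the running
    history with the current frame and compress back to `channels`. Sketch only."""
    def __init__(self, channels=64):
        super().__init__()
        self.fuse = nn.Conv2d(2 * channels, channels, kernel_size=3, padding=1)

    def forward(self, bev_frames):
        history = torch.zeros_like(bev_frames[0])
        for bev in bev_frames:                 # iterate over time steps
            history = self.fuse(torch.cat([history, bev], dim=1))
        return history                         # long-term fused BEV feature

frames = [torch.randn(1, 64, 32, 32) for _ in range(8)]
print(RecurrentBEVFusion()(frames).shape)      # torch.Size([1, 64, 32, 32])
```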
+
+
+
+
+ + ♻ ☆ Ranni: Taming Text-to-Image Diffusion for Accurate Instruction Following + + +
+ Existing text-to-image (T2I) diffusion models usually struggle in +interpreting complex prompts, especially those with quantity, object-attribute +binding, and multi-subject descriptions. In this work, we introduce a semantic +panel as the middleware in decoding texts to images, supporting the generator +to better follow instructions. The panel is obtained through arranging the +visual concepts parsed from the input text by the aid of large language models, +and then injected into the denoising network as a detailed control signal to +complement the text condition. To facilitate text-to-panel learning, we come up +with a carefully designed semantic formatting protocol, accompanied by a +fully-automatic data preparation pipeline. Thanks to such a design, our +approach, which we call Ranni, manages to enhance a pre-trained T2I generator +regarding its textual controllability. More importantly, the introduction of +the generative middleware brings a more convenient form of interaction (i.e., +directly adjusting the elements in the panel or using language instructions) +and further allows users to finely customize their generation, based on which +we develop a practical system and showcase its potential in continuous +generation and chatting-based editing. Our project page is at +https://ranni-t2i.github.io/Ranni. + +
+
+
+
+
+ + ♻ ☆ TIM: A Time Interval Machine for Audio-Visual Action Recognition CVPR 2024 + + +
+ Diverse actions give rise to rich audio-visual signals in long videos. Recent +works showcase that the two modalities of audio and video exhibit different +temporal extents of events and distinct labels. We address the interplay +between the two modalities in long videos by explicitly modelling the temporal +extents of audio and visual events. We propose the Time Interval Machine (TIM) +where a modality-specific time interval poses as a query to a transformer +encoder that ingests a long video input. The encoder then attends to the +specified interval, as well as the surrounding context in both modalities, in +order to recognise the ongoing action. + We test TIM on three long audio-visual video datasets: EPIC-KITCHENS, +Perception Test, and AVE, reporting state-of-the-art (SOTA) for recognition. On +EPIC-KITCHENS, we beat previous SOTA that utilises LLMs and significantly +larger pre-training by 2.9% top-1 action recognition accuracy. Additionally, we +show that TIM can be adapted for action detection, using dense multi-scale +interval queries, outperforming SOTA on EPIC-KITCHENS-100 for most metrics, and +showing strong performance on the Perception Test. Our ablations show the +critical role of integrating the two modalities and modelling their time +intervals in achieving this performance. Code and models at: +https://github.com/JacobChalk/TIM + +
+
+ comment: Accepted to CVPR 2024. Project Webpage: + https://jacobchalk.github.io/TIM-Project +
+
+
+
+
+ + ♻ ☆ BOTH2Hands: Inferring 3D Hands from Both Text Prompts and Body Dynamics + + +
+ The recently emerging text-to-motion advances have inspired numerous attempts for convenient and interactive human motion generation. Yet, existing methods are largely limited to generating body motions only without considering the rich two-hand motions, let alone handling various conditions like body dynamics or texts. To break the data bottleneck, we propose BOTH57M, a novel multi-modal dataset for two-hand motion generation. Our dataset includes accurate motion tracking for the human body and hands and provides pair-wise finger-level hand annotations and body descriptions. We further provide a strong baseline method, BOTH2Hands, for the novel task: generating vivid two-hand motions from both implicit body dynamics and explicit text prompts. We first warm up two parallel body-to-hand and text-to-hand diffusion models and then utilize the cross-attention transformer for motion blending. Extensive experiments and cross-validations demonstrate the effectiveness of our approach and dataset for generating convincing two-hand motions from the hybrid body-and-textual conditions. Our dataset and code will be disseminated to the community for future research.
+
+
+
+
+ + ♻ ☆ Enhancing Breast Cancer Diagnosis in Mammography: Evaluation and + Integration of Convolutional Neural Networks and Explainable AI + + +
+ The study introduces an integrated framework combining Convolutional Neural +Networks (CNNs) and Explainable Artificial Intelligence (XAI) for the enhanced +diagnosis of breast cancer using the CBIS-DDSM dataset. Utilizing a fine-tuned +ResNet50 architecture, our investigation not only provides effective +differentiation of mammographic images into benign and malignant categories but +also addresses the opaque "black-box" nature of deep learning models by +employing XAI methodologies, namely Grad-CAM, LIME, and SHAP, to interpret CNN +decision-making processes for healthcare professionals. Our methodology +encompasses an elaborate data preprocessing pipeline and advanced data +augmentation techniques to counteract dataset limitations, and transfer +learning using pre-trained networks, such as VGG-16, DenseNet and ResNet was +employed. A focal point of our study is the evaluation of XAI's effectiveness +in interpreting model predictions, highlighted by utilising the Hausdorff +measure to assess the alignment between AI-generated explanations and expert +annotations quantitatively. This approach plays a critical role for XAI in +promoting trustworthiness and ethical fairness in AI-assisted diagnostics. The +findings from our research illustrate the effective collaboration between CNNs +and XAI in advancing diagnostic methods for breast cancer, thereby facilitating +a more seamless integration of advanced AI technologies within clinical +settings. By enhancing the interpretability of AI-driven decisions, this work +lays the groundwork for improved collaboration between AI systems and medical +practitioners, ultimately enriching patient care. Furthermore, the implications +of our research extend well beyond the current methodologies, advocating for +subsequent inquiries into the integration of multimodal data and the refinement +of AI explanations to satisfy the needs of clinical practice. + +
+
+
+
+
+ + ♻ ☆ Learning Invariant Inter-pixel Correlations for Superpixel Generation AAAI24 + + +
+ Deep superpixel algorithms have made remarkable strides by substituting hand-crafted features with learnable ones. Nevertheless, we observe that existing deep superpixel methods, serving as mid-level representation operations, remain sensitive to the statistical properties (e.g., color distribution, high-level semantics) embedded within the training dataset. Consequently, learnable features exhibit constrained discriminative capability, resulting in unsatisfactory pixel grouping performance, particularly in untrainable application scenarios. To address this issue, we propose the Content Disentangle Superpixel (CDS) algorithm to selectively separate the invariant inter-pixel correlations and statistical properties, i.e., style noise. Specifically, we first construct auxiliary modalities that are homologous to the original RGB image but have substantial stylistic variations. Then, driven by mutual information, we propose the local-grid correlation alignment across modalities to reduce the distribution discrepancy of adaptively selected features and learn invariant inter-pixel correlations. Afterwards, we perform global-style mutual information minimization to enforce the separation of invariant content and training data styles. The experimental results on four benchmark datasets demonstrate the superiority of our approach to existing state-of-the-art methods, regarding boundary adherence, generalization, and efficiency. Code and pre-trained model are available at https://github.com/rookiie/CDSpixel.
+
+ comment: Accepted by AAAI24 +
+
+
+
+
+ + ♻ ☆ SDFR: Synthetic Data for Face Recognition Competition + + +
+ Large-scale face recognition datasets are collected by crawling the Internet and without individuals' consent, raising legal, ethical, and privacy concerns. With recent advances in generative models, several works have proposed generating synthetic face recognition datasets to mitigate the concerns raised by web-crawled face recognition datasets. This paper presents the summary of the Synthetic Data for Face Recognition (SDFR) Competition held in conjunction with the 18th IEEE International Conference on Automatic Face and Gesture Recognition (FG 2024) and established to investigate the use of synthetic data for training face recognition models. The SDFR competition was split into two tasks, allowing participants to train face recognition systems using new synthetic datasets and/or existing ones. In the first task, the face recognition backbone was fixed and the dataset size was limited, while the second task provided almost complete freedom on the model backbone, the dataset, and the training pipeline. The submitted models were trained on existing and also new synthetic datasets and used clever methods to improve training with synthetic data. The submissions were evaluated and ranked on a diverse set of seven benchmarking datasets. The paper gives an overview of the submitted face recognition models and reports achieved performance compared to baseline models trained on real and synthetic datasets. Furthermore, the evaluation of submissions is extended to bias assessment across different demography groups. Lastly, an outlook on the current state of the research in training face recognition models using synthetic data is presented, and existing problems as well as potential future directions are also discussed.
+
+ comment: The 18th IEEE International Conference on Automatic Face and Gesture + Recognition (FG 2024) +
+
+
+
+
+ + ♻ ☆ PhysAvatar: Learning the Physics of Dressed 3D Avatars from Visual + Observations + + +
+ Modeling and rendering photorealistic avatars is of crucial importance in +many applications. Existing methods that build a 3D avatar from visual +observations, however, struggle to reconstruct clothed humans. We introduce +PhysAvatar, a novel framework that combines inverse rendering with inverse +physics to automatically estimate the shape and appearance of a human from +multi-view video data along with the physical parameters of the fabric of their +clothes. For this purpose, we adopt a mesh-aligned 4D Gaussian technique for +spatio-temporal mesh tracking as well as a physically based inverse renderer to +estimate the intrinsic material properties. PhysAvatar integrates a physics +simulator to estimate the physical parameters of the garments using +gradient-based optimization in a principled manner. These novel capabilities +enable PhysAvatar to create high-quality novel-view renderings of avatars +dressed in loose-fitting clothes under motions and lighting conditions not seen +in the training data. This marks a significant advancement towards modeling +photorealistic digital humans using physically based inverse rendering with +physics in the loop. Our project website is at: +https://qingqing-zhao.github.io/PhysAvatar + +
+
+ comment: Project Page: https://qingqing-zhao.github.io/PhysAvatar +
+
+
+
+
+ + ♻ ☆ Dense Video Object Captioning from Disjoint Supervision + + +
+ We propose a new task and model for dense video object captioning -- +detecting, tracking and captioning trajectories of objects in a video. This +task unifies spatial and temporal localization in video, whilst also requiring +fine-grained visual understanding that is best described by natural language. +We propose a unified model, and demonstrate how our end-to-end approach is more +accurate and temporally coherent than a multi-stage pipeline combining +state-of-the-art detection, tracking, and captioning models. Moreover, we +propose a training strategy based on a mixture of disjoint tasks, which allows +us to leverage diverse, large-scale datasets which supervise different parts of +our model. Although each pretraining task only provides weak supervision, they +are complementary and, when combined, result in noteworthy zero-shot ability +and serve as strong initialization for additional finetuning to further improve +accuracy. We carefully design new metrics capturing all components of our task, +and show how we can repurpose existing video grounding datasets (e.g. VidSTG +and VLN) for our new task. We show that our model improves upon a number of +strong baselines for this new task. Furthermore, we can apply our model to the +task of spatial grounding, outperforming prior state-of-the-art on VidSTG and +VLN, without explicitly training for it. Code is available at +https://github.com/google-research/scenic/tree/main/scenic/projects/densevoc. + +
+
+ comment: Code is available at + https://github.com/google-research/scenic/tree/main/scenic/projects/densevoc +
+
+
+
+
+ + ♻ ☆ Oriented Object Detection in Optical Remote Sensing Images using Deep + Learning: A Survey + + +
+ Oriented object detection is one of the most fundamental and challenging +tasks in remote sensing, aiming to locate and classify objects with arbitrary +orientations. Recent years have witnessed remarkable progress in oriented +object detection using deep learning techniques. Given the rapid development of +this field, this paper aims to provide a comprehensive survey of recent +advances in oriented object detection. To be specific, we first review the +technical evolution from horizontal object detection to oriented object +detection and summarize the specific challenges, including feature +misalignment, spatial misalignment, and periodicity of angle. Subsequently, we +further categorize existing methods into detection framework, oriented bounding +box (OBB) regression, and feature representations, and discuss how these +methods address the above challenges in detail. In addition, we cover several +publicly available datasets and performance evaluation protocols. Furthermore, +we provide a comprehensive comparison and analysis of state-of-the-art oriented +object detection methods. Toward the end of this paper, we discuss several +future directions for oriented object detection. + +
+
+
+
+
+ + ♻ ☆ PeerAiD: Improving Adversarial Distillation from a Specialized Peer + Tutor CVPR 2024 + + +
+ Adversarial robustness of the neural network is a significant concern when it is applied to security-critical domains. In this situation, adversarial distillation is a promising option which aims to distill the robustness of the teacher network to improve the robustness of a small student network. Previous works pretrain the teacher network to make it robust to the adversarial examples aimed at itself. However, the adversarial examples are dependent on the parameters of the target network. The fixed teacher network inevitably degrades its robustness against the unseen transferred adversarial examples which target the parameters of the student network in the adversarial distillation process. We propose PeerAiD to make a peer network learn the adversarial examples of the student network instead of adversarial examples aimed at itself. PeerAiD is an adversarial distillation method that trains the peer network and the student network simultaneously in order to make the peer network specialized for defending the student network. We observe that such peer networks surpass the robustness of the pretrained robust teacher network against student-attacked adversarial samples. With this peer network and adversarial distillation, PeerAiD achieves significantly higher robustness of the student network with AutoAttack (AA) accuracy up to 1.66%p and improves the natural accuracy of the student network up to 4.72%p with ResNet-18 on the TinyImageNet dataset.
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Object Detectors in the Open Environment: Challenges, Solutions, and + Outlook + + +
+ With the emergence of foundation models, deep learning-based object detectors +have shown practical usability in closed set scenarios. However, for real-world +tasks, object detectors often operate in open environments, where crucial +factors (e.g., data distribution, objective) that influence model learning are +often changing. The dynamic and intricate nature of the open environment poses +novel and formidable challenges to object detectors. Unfortunately, current +research on object detectors in open environments lacks a comprehensive +analysis of their distinctive characteristics, challenges, and corresponding +solutions, which hinders their secure deployment in critical real-world +scenarios. This paper aims to bridge this gap by conducting a comprehensive +review and analysis of object detectors in open environments. We initially +identified limitations of key structural components within the existing +detection pipeline and propose the open environment object detector challenge +framework that includes four quadrants (i.e., out-of-domain, out-of-category, +robust learning, and incremental learning) based on the dimensions of the data +/ target changes. For each quadrant of challenges in the proposed framework, we +present a detailed description and systematic analysis of the overarching goals +and core difficulties, systematically review the corresponding solutions, and +benchmark their performance over multiple widely adopted datasets. In addition, +we engage in a discussion of open problems and potential avenues for future +research. This paper aims to provide a fresh, comprehensive, and systematic +understanding of the challenges and solutions associated with open-environment +object detectors, thus catalyzing the development of more solid applications in +real-world scenarios. A project related to this survey can be found at +https://github.com/LiangSiyuan21/OEOD_Survey. + +
+
+ comment: 37 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ Carve3D: Improving Multi-view Reconstruction Consistency for Diffusion + Models with RL Finetuning CVPR 2024 + + +
+ Multi-view diffusion models, obtained by applying Supervised Finetuning (SFT) to text-to-image diffusion models, have driven recent breakthroughs in text-to-3D research. However, due to the limited size and quality of existing 3D datasets, they still suffer from multi-view inconsistencies and Neural Radiance Field (NeRF) reconstruction artifacts. We argue that multi-view diffusion models can benefit from further Reinforcement Learning Finetuning (RLFT), which allows models to learn from the data generated by themselves and improve beyond their dataset limitations during SFT. To this end, we introduce Carve3D, an improved RLFT algorithm coupled with a novel Multi-view Reconstruction Consistency (MRC) metric, to enhance the consistency of multi-view diffusion models. To measure the MRC metric on a set of multi-view images, we compare them with their corresponding NeRF renderings at the same camera viewpoints. The resulting model, which we denote as Carve3DM, demonstrates superior multi-view consistency and NeRF reconstruction quality compared to existing models. Our results suggest that pairing SFT with Carve3D's RLFT is essential for developing multi-view-consistent diffusion models, mirroring the standard Large Language Model (LLM) alignment pipeline. Our code, training and testing data, and video results are available at: https://desaixie.github.io/carve-3d.
+
+ comment: 22 pages, 16 figures. Our code, training and testing data, and video + results are available at: https://desaixie.github.io/carve-3d. This paper has + been accepted to CVPR 2024. v2: incorporated changes from the CVPR 2024 + camera-ready version +
+
+
+
+
+ + ♻ ☆ Surface Reconstruction from Point Clouds via Grid-based Intersection + Prediction + + +
+ Surface reconstruction from point clouds is a crucial task in the fields of computer vision and computer graphics. SDF-based methods excel at reconstructing smooth meshes with minimal error and artefacts but struggle with representing open surfaces. On the other hand, UDF-based methods can effectively represent open surfaces but often introduce noise, leading to artefacts in the mesh. In this work, we propose a novel approach that directly predicts the intersection points between the line segments of point pairs and implicit surfaces. To achieve this, we propose two modules, the Relative Intersection Module and the Sign Module, which take the features of point pairs as input. To preserve the continuity of the surface, we also integrate symmetry into the two modules, which means the predicted intersection does not change even if the input order of the point pair is swapped. This method not only preserves the ability to represent open surfaces but also eliminates most artefacts on the mesh. Our approach demonstrates state-of-the-art performance on three datasets: ShapeNet, MGN, and ScanNet. The code will be made available upon acceptance.
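To illustrate the symmetry constraint described above, the sketch below builds an order-consistent predictor: swapping the point pair maps the predicted fraction t along the segment to 1 - t, so the predicted intersection point is unchanged. The tiny MLP on raw coordinates is a placeholder assumption standing in for the paper's learned point-pair features.

```python
import torch
import torch.nn as nn

class SymmetricIntersection(nn.Module):
    """Predict where the segment between a point pair crosses the surface as a
    fraction t in [0, 1] along p1 -> p2, enforced to be order-consistent by
    construction. Hedged sketch of the symmetry idea, not the paper's modules."""
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(6, 64), nn.ReLU(), nn.Linear(64, 1))

    def forward(self, p1, p2):
        t_forward = torch.sigmoid(self.net(torch.cat([p1, p2], dim=-1)))
        t_backward = torch.sigmoid(self.net(torch.cat([p2, p1], dim=-1)))
        t = 0.5 * (t_forward + (1.0 - t_backward))   # symmetric by construction
        return p1 + t * (p2 - p1)                    # predicted intersection point

p1, p2 = torch.randn(5, 3), torch.randn(5, 3)
model = SymmetricIntersection()
assert torch.allclose(model(p1, p2), model(p2, p1), atol=1e-6)
```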
+
+
+
+
+ + ♻ ☆ Background Noise Reduction of Attention Map for Weakly Supervised + Semantic Segmentation + + +
+ In weakly-supervised semantic segmentation (WSSS) using only image-level +class labels, a problem with CNN-based Class Activation Maps (CAM) is that they +tend to activate the most discriminative local regions of objects. On the other +hand, methods based on Transformers learn global features but suffer from the +issue of background noise contamination. This paper focuses on addressing the +issue of background noise in attention weights within the existing WSSS method +based on Conformer, known as TransCAM. The proposed method successfully reduces +background noise, leading to improved accuracy of pseudo labels. Experimental +results demonstrate that our model achieves segmentation performance of 70.5% +on the PASCAL VOC 2012 validation data, 71.1% on the test data, and 45.9% on MS +COCO 2014 data, outperforming TransCAM in terms of segmentation performance. + +
+
+
+
+
+ + ♻ ☆ Improving the Accuracy-Robustness Trade-Off of Classifiers via Adaptive + Smoothing + + +
+ While prior research has proposed a plethora of methods that build neural +classifiers robust against adversarial attacks, practitioners are still +reluctant to adopt them due to their unacceptably severe clean accuracy +penalties. This paper significantly alleviates this accuracy-robustness +trade-off by mixing the output probabilities of a standard classifier and a +robust classifier, where the standard network is optimized for clean accuracy +and is not robust in general. We show that the robust base classifier's +confidence difference for correct and incorrect examples is the key to this +improvement. In addition to providing intuitions and empirical evidence, we +theoretically certify the robustness of the mixed classifier under realistic +assumptions. Furthermore, we adapt an adversarial input detector into a mixing +network that adaptively adjusts the mixture of the two base models, further +reducing the accuracy penalty of achieving robustness. The proposed flexible +method, termed "adaptive smoothing", can work in conjunction with existing or +even future methods that improve clean accuracy, robustness, or adversary +detection. Our empirical evaluation considers strong attack methods, including +AutoAttack and adaptive attacks. On the CIFAR-100 dataset, our method achieves +an 85.21% clean accuracy while maintaining a 38.72% $\ell_\infty$-AutoAttacked +($\epsilon = 8/255$) accuracy, becoming the second most robust method on the +RobustBench CIFAR-100 benchmark as of submission, while improving the clean +accuracy by ten percentage points compared with all listed models. The code +that implements our method is available at +https://github.com/Bai-YT/AdaptiveSmoothing. + +
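The core mixing operation described above can be sketched in a few lines: the class probabilities of the clean-accuracy model and the robust model are blended with a per-example weight that is assumed to come from some detector/mixing network. The convex combination in probability space and the clamping are illustrative choices; consult the linked repository for the authors' exact formulation.

```python
# Minimal sketch of adaptively mixing a standard and a robust classifier.
import torch
import torch.nn.functional as F

def adaptive_mix(logits_std: torch.Tensor,
                 logits_rob: torch.Tensor,
                 alpha: torch.Tensor) -> torch.Tensor:
    """Convex combination of the two classifiers' class probabilities.

    alpha in [0, 1], one value per example: 0 -> trust the standard
    (clean-accurate) model, 1 -> trust the robust model.
    """
    p_std = F.softmax(logits_std, dim=-1)
    p_rob = F.softmax(logits_rob, dim=-1)
    alpha = alpha.clamp(0.0, 1.0).unsqueeze(-1)  # broadcast over classes
    return (1.0 - alpha) * p_std + alpha * p_rob
```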
+
+
+
+
+ + ♻ ☆ SIR: Multi-view Inverse Rendering with Decomposable Shadow for Indoor + Scenes + + +
+ We propose SIR, an efficient method to decompose differentiable shadows for +inverse rendering on indoor scenes using multi-view data, addressing the +challenges in accurately decomposing the materials and lighting conditions. +Unlike previous methods that struggle with shadow fidelity in complex lighting +environments, our approach explicitly learns shadows for enhanced realism in +material estimation under unknown light positions. Utilizing posed HDR images +as input, SIR employs an SDF-based neural radiance field for comprehensive +scene representation. Then, SIR integrates a shadow term with a three-stage +material estimation approach to improve SVBRDF quality. Specifically, SIR is +designed to learn a differentiable shadow, complemented by BRDF regularization, +to optimize inverse rendering accuracy. Extensive experiments on both synthetic +and real-world indoor scenes demonstrate the superior performance of SIR over +existing methods in both quantitative metrics and qualitative analysis. The +significant decomposing ability of SIR enables sophisticated editing +capabilities like free-view relighting, object insertion, and material +replacement. The code and data are available at +https://xiaokangwei.github.io/SIR/. + +
+
+
+
+
+ + ♻ ☆ Toward Tiny and High-quality Facial Makeup with Data Amplify Learning + + +
+ Contemporary makeup approaches primarily hinge on unpaired learning +paradigms, yet they grapple with the challenges of inaccurate supervision +(e.g., face misalignment) and sophisticated facial prompts (including face +parsing and landmark detection). These challenges prohibit low-cost deployment +of facial makeup models, especially on mobile devices. To solve the above problems, +we propose a brand-new learning paradigm, termed "Data Amplify Learning (DAL)," +alongside a compact makeup model named "TinyBeauty." The core idea of DAL lies +in employing a Diffusion-based Data Amplifier (DDA) to "amplify" limited images +for model training, thereby enabling accurate pixel-to-pixel supervision +with merely a handful of annotations. Two pivotal innovations in DDA facilitate +this training approach: (1) A Residual Diffusion Model (RDM) is designed +to generate high-fidelity detail and circumvent the detail-vanishing problem in +vanilla diffusion models; (2) A Fine-Grained Makeup Module (FGMM) is +proposed to achieve precise makeup control and combination while retaining face +identity. Coupled with DAL, TinyBeauty requires merely 80K parameters to +achieve state-of-the-art performance without intricate face prompts. +Meanwhile, TinyBeauty achieves a remarkable inference speed of up to 460 fps on +the iPhone 13. Extensive experiments show that DAL can produce highly +competitive makeup models using only 5 image pairs. + +
+
+
+
+
+ + ♻ ☆ Harnessing Meta-Learning for Improving Full-Frame Video Stabilization CVPR 2024 + + +
+ Video stabilization is a longstanding computer vision problem; pixel-level +synthesis solutions, which stabilize videos by synthesizing full frames, add to the +complexity of this task. The complexity is further intensified by the +distinct mix of unique motion profiles and visual content present in each video +sequence, making robust generalization with fixed parameters difficult. In our +study, we introduce a novel approach to enhance the performance of pixel-level +synthesis solutions for video stabilization by adapting these models to +individual input video sequences. The proposed adaptation exploits low-level +visual cues accessible during test-time to improve both the stability and +quality of resulting videos. We highlight the efficacy of our "test-time +adaptation" methodology through simple fine-tuning of one of these models, +followed by significant stability gains via the integration of meta-learning +techniques. Notably, significant improvement is achieved with only a single +adaptation step. The versatility of the proposed algorithm is demonstrated by +consistently improving the performance of various pixel-level synthesis models +for video stabilization in real-world scenarios. + +
+
+ comment: CVPR 2024, Code will be made available on: + http://github.com/MKashifAli/MetaVideoStab
+
+
+
+
+ + ♻ ☆ Detecting and Mitigating System-Level Anomalies of Vision-Based + Controllers + + +
+ Autonomous systems, such as self-driving cars and drones, have made +significant strides in recent years by leveraging visual inputs and machine +learning for decision-making and control. Despite their impressive performance, +these vision-based controllers can make erroneous predictions when faced with +novel or out-of-distribution inputs. Such errors can cascade to catastrophic +system failures and compromise system safety. In this work, we introduce a +run-time anomaly monitor to detect and mitigate such closed-loop, system-level +failures. Specifically, we leverage a reachability-based framework to +stress-test the vision-based controller offline and mine its system-level +failures. This data is then used to train a classifier that is leveraged online +to flag inputs that might cause system breakdowns. The anomaly detector +highlights issues that transcend individual modules and pertain to the safety +of the overall system. We also design a fallback controller that robustly +handles these detected anomalies to preserve system safety. We validate the +proposed approach on an autonomous aircraft taxiing system that uses a +vision-based controller for taxiing. Our results show the efficacy of the +proposed approach in identifying and handling system-level anomalies, +outperforming methods such as prediction error-based detection, and ensembling, +thereby enhancing the overall safety and robustness of autonomous systems. + +
+
+
+
+
+ + ♻ ☆ Rich Human Feedback for Text-to-Image Generation CVPR'24 + + +
+ Recent Text-to-Image (T2I) generation models such as Stable Diffusion and +Imagen have made significant progress in generating high-resolution images +based on text descriptions. However, many generated images still suffer from +issues such as artifacts/implausibility, misalignment with text descriptions, +and low aesthetic quality. Inspired by the success of Reinforcement Learning +with Human Feedback (RLHF) for large language models, prior works collected +human-provided scores as feedback on generated images and trained a reward +model to improve the T2I generation. In this paper, we enrich the feedback +signal by (i) marking image regions that are implausible or misaligned with the +text, and (ii) annotating which words in the text prompt are misrepresented or +missing on the image. We collect such rich human feedback on 18K generated +images (RichHF-18K) and train a multimodal transformer to predict the rich +feedback automatically. We show that the predicted rich human feedback can be +leveraged to improve image generation, for example, by selecting high-quality +training data to finetune and improve the generative models, or by creating +masks with predicted heatmaps to inpaint the problematic regions. Notably, the +improvements generalize to models (Muse) beyond those used to generate the +images on which human feedback data were collected (Stable Diffusion variants). +The RichHF-18K data set will be released in our GitHub repository: +https://github.com/google-research/google-research/tree/master/richhf_18k. + +
+
+ comment: CVPR'24 +
+
+
+
+
+ + ♻ ☆ Feature Re-Embedding: Towards Foundation Model-Level Performance in + Computational Pathology CVPR2024 + + +
+ Multiple instance learning (MIL) is the most widely used framework in +computational pathology, encompassing sub-typing, diagnosis, prognosis, and +more. However, the existing MIL paradigm typically requires an offline instance +feature extractor, such as a pre-trained ResNet or a foundation model. This +approach lacks the capability for feature fine-tuning within the specific +downstream tasks, limiting its adaptability and performance. To address this +issue, we propose a Re-embedded Regional Transformer (R$^2$T) for re-embedding +the instance features online, which captures fine-grained local features and +establishes connections across different regions. Unlike existing works that +focus on pre-training a powerful feature extractor or designing a sophisticated +instance aggregator, R$^2$T is tailored to re-embed instance features online. +It serves as a portable module that can seamlessly integrate into mainstream +MIL models. Extensive experimental results on common computational pathology +tasks validate that: 1) feature re-embedding improves the performance of MIL +models based on ResNet-50 features to the level of foundation model features, +and further enhances the performance of foundation model features; 2) R$^2$T +can introduce more significant performance improvements to various MIL +models; 3) R$^2$T-MIL, as an R$^2$T-enhanced AB-MIL, outperforms other recent +methods by a large margin. The code is available at: +https://github.com/DearCaat/RRT-MIL. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ Full-dose Whole-body PET Synthesis from Low-dose PET Using + High-efficiency Denoising Diffusion Probabilistic Model: PET Consistency + Model + + +
+ Objective: Positron Emission Tomography (PET) has been a commonly used +imaging modality in broad clinical applications. One of the most important +tradeoffs in PET imaging is between image quality and radiation dose: high +image quality comes with high radiation exposure. Improving image quality is +desirable for all clinical applications while minimizing radiation exposure is +needed to reduce risk to patients. Approach: We introduce PET Consistency Model +(PET-CM), an efficient diffusion-based method for generating high-quality +full-dose PET images from low-dose PET images. It employs a two-step process, +adding Gaussian noise to full-dose PET images in the forward diffusion, and +then denoising them using a PET Shifted-window Vision Transformer (PET-VIT) +network in the reverse diffusion. The PET-VIT network learns a consistency +function that enables direct denoising of Gaussian noise into clean full-dose +PET images. PET-CM achieves state-of-the-art image quality while requiring +significantly less computation time than other methods. Results: In experiments +comparing eighth-dose to full-dose images, PET-CM demonstrated impressive +performance with NMAE of 1.278+/-0.122%, PSNR of 33.783+/-0.824dB, SSIM of +0.964+/-0.009, NCC of 0.968+/-0.011, HRS of 4.543, and SUV Error of +0.255+/-0.318%, with an average generation time of 62 seconds per patient. This +is a significant improvement compared to the state-of-the-art diffusion-based +model, with PET-CM reaching this result 12x faster. Similarly, in the +quarter-dose to full-dose image experiments, PET-CM delivered competitive +outcomes, achieving an NMAE of 0.973+/-0.066%, PSNR of 36.172+/-0.801dB, SSIM +of 0.984+/-0.004, NCC of 0.990+/-0.005, HRS of 4.428, and SUV Error of +0.151+/-0.192% using the same generation process, underlining its high +quantitative and clinical precision in both denoising scenarios. + +
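A minimal sketch of consistency-style sampling in the spirit of the description above: starting from Gaussian noise conditioned on the low-dose image, a learned consistency function maps noisy inputs directly to a clean full-dose estimate, with a few optional noise-and-denoise refinement steps. The `consistency_fn` signature and the sigma schedule are placeholders, not the paper's settings.

```python
# Hedged sketch of few-step consistency sampling conditioned on a low-dose scan.
import torch

@torch.no_grad()
def few_step_sample(consistency_fn, low_dose: torch.Tensor,
                    sigmas=(80.0, 20.0, 5.0)) -> torch.Tensor:
    # Start from pure Gaussian noise at the largest noise level.
    x = torch.randn_like(low_dose) * sigmas[0]
    full_dose = consistency_fn(x, sigmas[0], low_dose)   # one-shot denoise
    for sigma in sigmas[1:]:                             # optional refinements
        x = full_dose + torch.randn_like(full_dose) * sigma
        full_dose = consistency_fn(x, sigma, low_dose)
    return full_dose
```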
+
+
+
+
+ + ♻ ☆ SocialCounterfactuals: Probing and Mitigating Intersectional Social + Biases in Vision-Language Models with Counterfactual Examples CVPR 2024 + + +
+ While vision-language models (VLMs) have achieved remarkable performance +improvements recently, there is growing evidence that these models also possess +harmful biases with respect to social attributes such as gender and race. Prior +studies have primarily focused on probing such bias attributes individually +while ignoring biases associated with intersections between social attributes. +This could be due to the difficulty of collecting an exhaustive set of +image-text pairs for various combinations of social attributes. To address this +challenge, we employ text-to-image diffusion models to produce counterfactual +examples for probing intersectional social biases at scale. Our approach +utilizes Stable Diffusion with cross attention control to produce sets of +counterfactual image-text pairs that are highly similar in their depiction of a +subject (e.g., a given occupation) while differing only in their depiction of +intersectional social attributes (e.g., race & gender). Through our +over-generate-then-filter methodology, we produce SocialCounterfactuals, a +high-quality dataset containing 171k image-text pairs for probing +intersectional biases related to gender, race, and physical characteristics. We +conduct extensive experiments to demonstrate the usefulness of our generated +dataset for probing and mitigating intersectional social biases in +state-of-the-art VLMs. + +
+
+ comment: Accepted to CVPR 2024. arXiv admin note: text overlap with + arXiv:2310.02988 +
+
+
+
+
+ + ♻ ☆ Better Monocular 3D Detectors with LiDAR from the Past ICRA 2024 + + +
+ Accurate 3D object detection is crucial to autonomous driving. Though +LiDAR-based detectors have achieved impressive performance, the high cost of +LiDAR sensors precludes their widespread adoption in affordable vehicles. +Camera-based detectors are cheaper alternatives but often suffer inferior +performance compared to their LiDAR-based counterparts due to inherent depth +ambiguities in images. In this work, we seek to improve monocular 3D detectors +by leveraging unlabeled historical LiDAR data. Specifically, at inference time, +we assume that the camera-based detectors have access to multiple unlabeled +LiDAR scans from past traversals at locations of interest (potentially from +other high-end vehicles equipped with LiDAR sensors). Under this setup, we +proposed a novel, simple, and end-to-end trainable framework, termed +AsyncDepth, to effectively extract relevant features from asynchronous LiDAR +traversals of the same location for monocular 3D detectors. We show consistent +and significant performance gain (up to 9 AP) across multiple state-of-the-art +models and datasets with a negligible additional latency of 9.66 ms and a small +storage cost. + +
+
+ comment: Accepted by ICRA 2024. The code can be found at + https://github.com/YurongYou/AsyncDepth +
+
+
+
+
+ + ♻ ☆ $λ$-ECLIPSE: Multi-Concept Personalized Text-to-Image Diffusion + Models by Leveraging CLIP Latent Space + + +
+ Despite the recent advances in personalized text-to-image (P-T2I) generative +models, it remains challenging to perform finetuning-free multi-subject-driven +T2I in a resource-efficient manner. Predominantly, contemporary approaches, +involving the training of Hypernetworks and Multimodal Large Language Models +(MLLMs), require heavy computing resources that range from 600 to 12300 GPU +hours of training. These subject-driven T2I methods hinge on Latent Diffusion +Models (LDMs), which facilitate T2I mapping through cross-attention layers. +While LDMs offer distinct advantages, P-T2I methods' reliance on the latent +space of these diffusion models significantly escalates resource demands, +leading to inconsistent results and necessitating numerous iterations for a +single desired image. In this paper, we present $\lambda$-ECLIPSE, an +alternative prior-training strategy that works in the latent space of a +pre-trained CLIP model without relying on the diffusion UNet models. +$\lambda$-ECLIPSE leverages the image-text interleaved pre-training for fast +and effective multi-subject-driven P-T2I. Through extensive experiments, we +establish that $\lambda$-ECLIPSE surpasses existing baselines in composition +alignment while preserving concept alignment performance, even with +significantly lower resource utilization. $\lambda$-ECLIPSE performs +multi-subject driven P-T2I with just 34M parameters and is trained on a mere 74 +GPU hours. Additionally, $\lambda$-ECLIPSE demonstrates the unique ability to +perform multi-concept interpolations. + +
+
+ comment: Project page: https://eclipse-t2i.github.io/Lambda-ECLIPSE/ +
+
+
+
+
+ + ♻ ☆ Quilt-LLaVA: Visual Instruction Tuning by Extracting Localized + Narratives from Open-Source Histopathology Videos + + +
+ Diagnosis in histopathology requires a global analysis of whole slide images +(WSIs), requiring pathologists to compound evidence from different WSI +patches. The gigapixel scale of WSIs poses a challenge for histopathology +multi-modal models. Training multi-modal models for histopathology requires +instruction tuning datasets, which currently contain information for individual +image patches, without a spatial grounding of the concepts within each patch +and without a wider view of the WSI. Therefore, they lack sufficient diagnostic +capacity for histopathology. To bridge this gap, we introduce Quilt-Instruct, a +large-scale dataset of 107,131 histopathology-specific instruction +question/answer pairs, grounded within diagnostically relevant image patches +that make up the WSI. Our dataset is collected by leveraging educational +histopathology videos from YouTube, which provides spatial localization of +narrations by automatically extracting the narrators' cursor positions. +Quilt-Instruct supports contextual reasoning by extracting diagnosis and +supporting facts from the entire WSI. Using Quilt-Instruct, we train +Quilt-LLaVA, which can reason beyond the given single image patch, enabling +diagnostic reasoning across patches. To evaluate Quilt-LLaVA, we propose a +comprehensive evaluation dataset created from 985 images and 1283 +human-generated question-answers. We also thoroughly evaluate Quilt-LLaVA using +public histopathology datasets, where Quilt-LLaVA significantly outperforms +SOTA by over 10% on relative GPT-4 score and 4% and 9% on open and closed set +VQA. Our code, data, and model are publicly accessible at +quilt-llava.github.io. + +
+
+
+
+
+ + ♻ ☆ Mitigating the Impact of Attribute Editing on Face Recognition + + +
+ Through a large-scale study over diverse face images, we show that facial +attribute editing using modern generative AI models can severely degrade +automated face recognition systems. This degradation persists even with +identity-preserving generative models. To mitigate this issue, we propose two +novel techniques for local and global attribute editing. We empirically ablate +twenty-six facial semantic, demographic and expression-based attributes that +have been edited using state-of-the-art generative models, and evaluate them +using ArcFace and AdaFace matchers on CelebA, CelebAMaskHQ and LFW datasets. +Finally, we use LLaVA, an emerging visual question-answering framework for +attribute prediction to validate our editing techniques. Our methods outperform +the current state-of-the-art at facial editing (BLIP, InstantID) while +improving identity retention by a significant extent. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Diffusion based Zero-shot Medical Image-to-Image Translation for Cross + Modality Segmentation + + +
+ Cross-modality image segmentation aims to segment the target modalities using +a method designed in the source modality. Deep generative models can translate +the target modality images into the source modality, thus enabling +cross-modality segmentation. However, a vast body of existing cross-modality +image translation methods relies on supervised learning. In this work, we aim +to address the challenge of zero-shot learning-based image translation tasks +(extreme scenarios in which the target modality is unseen in the training phase). To +leverage generative learning for zero-shot cross-modality image segmentation, +we propose a novel unsupervised image translation method. The framework learns +to translate the unseen source image to the target modality for image +segmentation by leveraging the inherent statistical consistency between +different modalities for diffusion guidance. Our framework captures identical +cross-modality features in the statistical domain, offering diffusion guidance +without relying on direct mappings between the source and target domains. This +advantage allows our method to adapt to changing source domains without the +need for retraining, making it highly practical when sufficient labeled source +domain data is not available. The proposed framework is validated in zero-shot +cross-modality image segmentation tasks through empirical comparisons with +influential generative models, including adversarial-based and diffusion-based +models. + +
+
+ comment: Neurips 2023 Diffusion Workshop +
+
+
+
+
+ + ♻ ☆ Local Neighborhood Features for 3D Classification + + +
+ With advances in deep learning model training strategies, the training of +Point cloud classification methods is significantly improving. For example, +PointNeXt, which adopts prominent training techniques and InvResNet layers into +PointNet++, achieves over 7% improvement on the real-world ScanObjectNN +dataset. However, most of these models use point coordinates features of +neighborhood points mapped to higher dimensional space while ignoring the +neighborhood point features computed before feeding to the network layers. In +this paper, we revisit the PointNeXt model to study the usage and benefit of +such neighborhood point features. We train and evaluate PointNeXt on ModelNet40 +(synthetic), ScanObjectNN (real-world), and a recent large-scale, real-world +grocery dataset, i.e., 3DGrocery100. In addition, we provide an additional +inference strategy of weight averaging the top two checkpoints of PointNeXt to +improve classification accuracy. Together with the abovementioned ideas, we +gain 0.5%, 1%, 4.8%, 3.4%, and 1.6% overall accuracy on the PointNeXt model +with real-world datasets, ScanObjectNN (hardest variant), 3DGrocery100's +Apple10, Fruits, Vegetables, and Packages subsets, respectively. We also +achieve a comparable 0.2% accuracy gain on ModelNet40. + +
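The weight-averaging inference strategy mentioned above is easy to sketch: load the state dictionaries of the two best checkpoints and average them parameter-wise before evaluation. The file names, the plain 50/50 average, and the assumption that the checkpoints are flat state dicts are all illustrative.

```python
# Minimal sketch of averaging the top-two checkpoints before inference.
import torch

def average_checkpoints(path_a: str, path_b: str) -> dict:
    # Assumes both files store plain state dicts with identical keys.
    sd_a = torch.load(path_a, map_location="cpu")
    sd_b = torch.load(path_b, map_location="cpu")
    return {k: 0.5 * (sd_a[k] + sd_b[k]) for k in sd_a}

# Hypothetical usage:
# model.load_state_dict(average_checkpoints("ckpt_best.pth", "ckpt_second.pth"))
```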
+
+
+
+
+ + ♻ ☆ Two-Person Interaction Augmentation with Skeleton Priors + + +
+ Close and continuous interaction with rich contacts is a crucial aspect of +human activities (e.g. hugging, dancing) and of interest in many domains like +activity recognition, motion prediction, character animation, etc. However, +acquiring such skeletal motion is challenging. While direct motion capture is +expensive and slow, motion editing/generation is also non-trivial, as complex +contact patterns with topological and geometric constraints have to be +retained. To this end, we propose a new deep learning method for two-body +skeletal interaction motion augmentation, which can generate variations of +contact-rich interactions with varying body sizes and proportions while +retaining the key geometric/topological relations between two bodies. Our +system can learn effectively from a relatively small amount of data and +generalize to drastically different skeleton sizes. Through exhaustive +evaluation and comparison, we show it can generate high-quality motions, has +strong generalizability and outperforms traditional optimization-based methods +and alternative deep learning solutions. + +
+
+
+
+
+ + ♻ ☆ A dataset of over one thousand computed tomography scans of battery + cells + + +
+ Battery technology is increasingly important for global electrification +efforts. However, batteries are highly sensitive to small manufacturing +variations that can induce reliability or safety issues. An important +technology for battery quality control is computed tomography (CT) scanning, +which is widely used for non-destructive 3D inspection across a variety of +clinical and industrial applications. Historically, however, the utility of CT +scanning for high-volume manufacturing has been limited by its low throughput +as well as the difficulty of handling its large file sizes. In this work, we +present a dataset of over one thousand CT scans of as-produced commercially +available batteries. The dataset spans various chemistries (lithium-ion and +sodium-ion) as well as various battery form factors (cylindrical, pouch, and +prismatic). We evaluate seven different battery types in total. The +manufacturing variability and the presence of battery defects can be observed +via this dataset. This dataset may be of interest to scientists and engineers +working on battery technology, computer vision, or both. + +
+
+
+
+
+ + ♻ ☆ TAM-VT: Transformation-Aware Multi-scale Video Transformer for + Segmentation and Tracking + + +
+ Video Object Segmentation (VOS) has emerged as an increasingly important +problem with availability of larger datasets and more complex and realistic +settings, which involve long videos with global motion (e.g, in egocentric +settings), depicting small objects undergoing both rigid and non-rigid +(including state) deformations. While a number of recent approaches have been +explored for this task, these data characteristics still present challenges. In +this work we propose a novel, clip-based DETR-style encoder-decoder +architecture, which focuses on systematically analyzing and addressing +aforementioned challenges. Specifically, we propose a novel +transformation-aware loss that focuses learning on portions of the video where +an object undergoes significant deformations -- a form of "soft" hard examples +mining. Further, we propose a multiplicative time-coded memory, beyond vanilla +additive positional encoding, which helps propagate context across long videos. +Finally, we incorporate these in our proposed holistic multi-scale video +transformer for tracking via multi-scale memory matching and decoding to ensure +sensitivity and accuracy for long videos and small objects. Our model enables +on-line inference with long videos in a windowed fashion, by breaking the video +into clips and propagating context among them. We illustrate that short clip +length and longer memory with learned time-coding are important design choices +for improved performance. Collectively, these technical contributions enable +our model to achieve new state-of-the-art (SoTA) performance on two complex +egocentric datasets -- VISOR and VOST, while achieving comparable to SoTA +results on the conventional VOS benchmark, DAVIS'17. A series of detailed +ablations validate our design choices as well as provide insights into the +importance of parameter choices and their impact on performance. + +
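The transformation-aware weighting described above can be approximated by up-weighting frames in which the object's mask changes most between consecutive time steps, a form of soft hard-example mining. The IoU-based change measure and the weighting rule below are illustrative assumptions, not the paper's exact loss.

```python
# Hedged sketch: per-frame loss weights that grow with mask deformation.
import torch

def transformation_weights(masks: torch.Tensor, gamma: float = 2.0) -> torch.Tensor:
    # masks: [T, H, W] binary ground-truth masks for one object track (T >= 2)
    prev, curr = masks[:-1].float(), masks[1:].float()
    inter = (prev * curr).sum(dim=(1, 2))
    union = ((prev + curr) > 0).float().sum(dim=(1, 2)).clamp(min=1.0)
    change = 1.0 - inter / union            # high when the mask deforms a lot
    w = 1.0 + gamma * change                # weights for frames 1..T-1
    return torch.cat([w[:1], w])            # reuse the first weight for frame 0
```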
+
+
+
+
+ + ♻ ☆ Lane Change Classification and Prediction with Action Recognition + Networks ECCV2022 + + +
+ Anticipating lane change intentions of surrounding vehicles is crucial for +efficient and safe driving decision making in an autonomous driving system. +Previous works often adopt physical variables such as driving speed, +acceleration and so forth for lane change classification. However, physical +variables do not contain semantic information. Although 3D CNNs have been +developing rapidly, the number of methods utilising action recognition models +and appearance feature for lane change recognition is low, and they all require +additional information to pre-process data. In this work, we propose an +end-to-end framework including two action recognition methods for lane change +recognition, using video data collected by cameras. Our method achieves the +best lane change classification results using only the RGB video data of the +PREVENTION dataset. Class activation maps demonstrate that action recognition +models can efficiently extract lane change motions. A method to better extract +motion clues is also proposed in this paper. + +
+
+ comment: Accepted to ECCV2022 AVVISION +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 167 + +
+
+
+ + ☆ Finding Visual Task Vectors + + +
+ Visual Prompting is a technique for teaching models to perform a visual task +via in-context examples, without any additional training. In this work, we +analyze the activations of MAE-VQGAN, a recent Visual Prompting model, and find +task vectors, activations that encode task-specific information. Equipped with +this insight, we demonstrate that it is possible to identify the task vectors +and use them to guide the network towards performing different tasks without +providing any input-output examples. To find task vectors, we compute the +average intermediate activations per task and use the REINFORCE algorithm to +search for the subset of task vectors. The resulting task vectors guide the +model towards performing a task better than the original model without the need +for input-output examples. + +
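A rough sketch of the first step described above: collecting per-task mean intermediate activations through a forward hook. The model call, the layer handle, and the per-task dataloaders are placeholders, and the REINFORCE search over which activations to patch is omitted.

```python
# Hedged sketch: compute mean activations per task from one intermediate layer.
import torch
from collections import defaultdict

@torch.no_grad()
def mean_activations_per_task(model, layer, loaders_by_task):
    sums, counts, captured = defaultdict(lambda: 0.0), defaultdict(int), {}
    handle = layer.register_forward_hook(lambda m, i, o: captured.update(out=o.detach()))
    for task, loader in loaders_by_task.items():
        for batch in loader:
            model(batch)                                  # triggers the hook
            sums[task] = sums[task] + captured["out"].mean(dim=0)
            counts[task] += 1
    handle.remove()
    return {t: sums[t] / counts[t] for t in sums}         # candidate task vectors
```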
+
+ comment: https://github.com/alhojel/visual_task_vectors +
+
+
+
+
+ + ☆ MA-LMM: Memory-Augmented Large Multimodal Model for Long-Term Video + Understanding CVPR 2024 + + +
+ With the success of large language models (LLMs), integrating the vision +model into LLMs to build vision-language foundation models has gained much more +interest recently. However, existing LLM-based large multimodal models (e.g., +Video-LLaMA, VideoChat) can only take in a limited number of frames for short +video understanding. In this study, we mainly focus on designing an efficient +and effective model for long-term video understanding. Instead of trying to +process more frames simultaneously like most existing work, we propose to +process videos in an online manner and store past video information in a memory +bank. This allows our model to reference historical video content for long-term +analysis without exceeding LLMs' context length constraints or GPU memory +limits. Our memory bank can be seamlessly integrated into current multimodal +LLMs in an off-the-shelf manner. We conduct extensive experiments on various +video understanding tasks, such as long-video understanding, video question +answering, and video captioning, and our model can achieve state-of-the-art +performances across multiple datasets. Code available at +https://boheumd.github.io/MA-LMM/. + +
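The memory-bank idea can be illustrated with a small container that appends frame features as the video streams in and merges entries once a fixed budget is exceeded, keeping the LLM context and GPU memory bounded. The merging rule used here (averaging the two oldest entries) is a simplification; the class name and budget are assumptions.

```python
# Hedged sketch of a bounded online memory bank for long-video features.
import torch

class MemoryBank:
    def __init__(self, max_len: int = 64):
        self.max_len = max_len
        self.store: list[torch.Tensor] = []

    def add(self, frame_feat: torch.Tensor) -> None:
        self.store.append(frame_feat)
        if len(self.store) > self.max_len:
            # Compress by merging the two oldest entries; the real model may
            # instead merge the most similar pair.
            merged = 0.5 * (self.store[0] + self.store[1])
            self.store = [merged] + self.store[2:]

    def as_context(self) -> torch.Tensor:
        # Stacked history features, to be fed to the multimodal LLM.
        return torch.stack(self.store, dim=0)
```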
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Ferret-UI: Grounded Mobile UI Understanding with Multimodal LLMs + + +
+ Recent advancements in multimodal large language models (MLLMs) have been +noteworthy, yet, these general-domain MLLMs often fall short in their ability +to comprehend and interact effectively with user interface (UI) screens. In +this paper, we present Ferret-UI, a new MLLM tailored for enhanced +understanding of mobile UI screens, equipped with referring, grounding, and +reasoning capabilities. Given that UI screens typically exhibit a more +elongated aspect ratio and contain smaller objects of interest (e.g., icons, +texts) than natural images, we incorporate "any resolution" on top of Ferret to +magnify details and leverage enhanced visual features. Specifically, each +screen is divided into 2 sub-images based on the original aspect ratio (i.e., +horizontal division for portrait screens and vertical division for landscape +screens). Both sub-images are encoded separately before being sent to LLMs. We +meticulously gather training samples from an extensive range of elementary UI +tasks, such as icon recognition, find text, and widget listing. These samples +are formatted for instruction-following with region annotations to facilitate +precise referring and grounding. To augment the model's reasoning ability, we +further compile a dataset for advanced tasks, including detailed description, +perception/interaction conversations, and function inference. After training on +the curated datasets, Ferret-UI exhibits outstanding comprehension of UI +screens and the capability to execute open-ended instructions. For model +evaluation, we establish a comprehensive benchmark encompassing all the +aforementioned tasks. Ferret-UI excels not only beyond most open-source UI +MLLMs, but also surpasses GPT-4V on all the elementary UI tasks. + +
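The aspect-ratio-based division described above can be sketched as follows: portrait screens are split horizontally into top and bottom halves, landscape screens vertically into left and right halves, before each sub-image is encoded. Any overlap or resizing details are assumptions not stated here.

```python
# Hedged sketch of splitting a UI screenshot into two sub-images by aspect ratio.
from PIL import Image

def split_ui_screen(img: Image.Image):
    w, h = img.size
    if h >= w:   # portrait: horizontal division into top/bottom
        subs = [img.crop((0, 0, w, h // 2)), img.crop((0, h // 2, w, h))]
    else:        # landscape: vertical division into left/right
        subs = [img.crop((0, 0, w // 2, h)), img.crop((w // 2, 0, w, h))]
    return subs  # each half is encoded separately before being sent to the LLM
```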
+
+
+
+
+ + ☆ SwapAnything: Enabling Arbitrary Object Swapping in Personalized Visual + Editing + + +
+ Effective editing of personal content holds a pivotal role in enabling +individuals to express their creativity, weaving captivating narratives within +their visual stories, and elevate the overall quality and impact of their +visual content. Therefore, in this work, we introduce SwapAnything, a novel +framework that can swap any objects in an image with personalized concepts +given by the reference, while keeping the context unchanged. Compared with +existing methods for personalized subject swapping, SwapAnything has three +unique advantages: (1) precise control of arbitrary objects and parts rather +than the main subject, (2) more faithful preservation of context pixels, (3) +better adaptation of the personalized concept to the image. First, we propose +targeted variable swapping to apply region control over latent feature maps and +swap masked variables for faithful context preservation and initial semantic +concept swapping. Then, we introduce appearance adaptation, to seamlessly adapt +the semantic concept into the original image in terms of target location, +shape, style, and content during the image generation process. Extensive +results on both human and automatic evaluation demonstrate significant +improvements of our approach over baseline methods on personalized swapping. +Furthermore, SwapAnything shows its precise and faithful swapping abilities +across single object, multiple objects, partial object, and cross-domain +swapping tasks. SwapAnything also achieves great performance on text-based +swapping and tasks beyond swapping such as object insertion. + +
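The targeted variable swapping step can be illustrated as a masked blend in latent space: latents outside the object mask are kept from the source image, while latents inside the mask come from the personalized concept. The shapes and the hard 0/1 blend are illustrative; the full method also performs appearance adaptation during denoising, which is not shown.

```python
# Hedged sketch of masked latent swapping for context-preserving object swaps.
import torch

def swap_latents(src_latent: torch.Tensor,
                 concept_latent: torch.Tensor,
                 mask: torch.Tensor) -> torch.Tensor:
    """mask == 1 marks the region replaced by the personalized concept."""
    mask = mask.to(src_latent.dtype)
    return mask * concept_latent + (1.0 - mask) * src_latent
```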
+
+ comment: 18 pages, 16 figures, 3 tables +
+
+
+
+
+ + ☆ Learning 3D-Aware GANs from Unposed Images with Template Feature Field + + +
+ Collecting accurate camera poses of training images has been shown to well +serve the learning of 3D-aware generative adversarial networks (GANs) yet can +be quite expensive in practice. This work targets learning 3D-aware GANs from +unposed images, for which we propose to perform on-the-fly pose estimation of +training images with a learned template feature field (TeFF). Concretely, in +addition to a generative radiance field as in previous approaches, we ask the +generator to also learn a field from 2D semantic features while sharing the +density from the radiance field. Such a framework allows us to acquire a +canonical 3D feature template leveraging the dataset mean discovered by the +generative model, and further efficiently estimate the pose parameters on real +data. Experimental results on various challenging datasets demonstrate the +superiority of our approach over state-of-the-art alternatives from both the +qualitative and the quantitative perspectives. + +
+
+ comment: https://XDimlab.github.io/TeFF +
+
+
+
+
+ + ☆ Evaluating the Efficacy of Cut-and-Paste Data Augmentation in Semantic + Segmentation for Satellite Imagery + + +
+ Satellite imagery is crucial for tasks like environmental monitoring and +urban planning. Typically, it relies on semantic segmentation or Land Use Land +Cover (LULC) classification to categorize each pixel. Despite the advancements +brought about by Deep Neural Networks (DNNs), their performance in segmentation +tasks is hindered by challenges such as limited availability of labeled data, +class imbalance and the inherent variability and complexity of satellite +images. In order to mitigate those issues, our study explores the effectiveness +of a Cut-and-Paste augmentation technique for semantic segmentation in +satellite images. We adapt this augmentation, which usually requires labeled +instances, to the case of semantic segmentation. By leveraging the connected +components in the semantic segmentation labels, we extract instances that are +then randomly pasted during training. Using the DynamicEarthNet dataset and a +U-Net model for evaluation, we found that this augmentation significantly +enhances the mIoU score on the test set from 37.9 to 44.1. This finding +highlights the potential of the Cut-and-Paste augmentation to improve the +generalization capabilities of semantic segmentation models in satellite +imagery. + +
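A minimal sketch of the adapted Cut-and-Paste augmentation: connected components of a chosen class are extracted from a source label map and pasted, together with their pixels, into a target image/label pair. The scipy-based component extraction is standard; pasting at the original location, equal image sizes, and the class-selection policy are simplifying assumptions.

```python
# Hedged sketch of Cut-and-Paste for semantic segmentation labels.
import numpy as np
from scipy import ndimage

def cut_and_paste(src_img, src_lbl, dst_img, dst_lbl, class_id, rng=np.random):
    # Label connected components of the chosen class in the source label map.
    components, n = ndimage.label(src_lbl == class_id)
    if n == 0:
        return dst_img, dst_lbl
    comp_mask = components == rng.randint(1, n + 1)     # pick one instance
    dst_img, dst_lbl = dst_img.copy(), dst_lbl.copy()   # assumes same H x W
    dst_img[comp_mask] = src_img[comp_mask]             # paste pixels in place
    dst_lbl[comp_mask] = class_id                       # and their labels
    return dst_img, dst_lbl
```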
+
+ comment: Accepted for publication in IEEE 2024 International Geoscience & + Remote Sensing Symposium (IGARSS 2024) +
+
+
+
+
+ + ☆ Retrieval-Augmented Open-Vocabulary Object Detection CVPR 2024 + + +
+ Open-vocabulary object detection (OVD) has been studied with Vision-Language +Models (VLMs) to detect novel objects beyond the pre-trained categories. +Previous approaches improve the generalization ability to expand the knowledge +of the detector, using 'positive' pseudo-labels with additional 'class' names, +e.g., sock, iPod, and alligator. To extend the previous methods in two aspects, +we propose Retrieval-Augmented Losses and visual Features (RALF). Our method +retrieves related 'negative' classes and augments loss functions. Also, visual +features are augmented with 'verbalized concepts' of classes, e.g., worn on the +feet, handheld music player, and sharp teeth. Specifically, RALF consists of +two modules: Retrieval Augmented Losses (RAL) and Retrieval-Augmented visual +Features (RAF). RAL constitutes two losses reflecting the semantic similarity +with negative vocabularies. In addition, RAF augments visual features with the +verbalized concepts from a large language model (LLM). Our experiments +demonstrate the effectiveness of RALF on COCO and LVIS benchmark datasets. We +achieve improvement up to 3.4 box AP$_{50}^{\text{N}}$ on novel categories of +the COCO dataset and 3.6 mask AP$_{\text{r}}$ gains on the LVIS dataset. Code +is available at https://github.com/mlvlab/RALF . + +
+
+ comment: Accepted paper at CVPR 2024 +
+
+
+
+
+ + ☆ SphereHead: Stable 3D Full-head Synthesis with Spherical Tri-plane + Representation + + +
+ While recent advances in 3D-aware Generative Adversarial Networks (GANs) have +aided the development of near-frontal view human face synthesis, the challenge +of comprehensively synthesizing a full 3D head viewable from all angles still +persists. Although PanoHead proves the possibilities of using a large-scale +dataset with images of both frontal and back views for full-head synthesis, it +often causes artifacts for back views. Based on our in-depth analysis, we found +the reasons are mainly twofold. First, from network architecture perspective, +we found each plane in the utilized tri-plane/tri-grid representation space +tends to confuse the features from both sides, causing "mirroring" artifacts +(e.g., the glasses appear in the back). Second, from data supervision aspect, +we found that existing discriminator training in 3D GANs mainly focuses on the +quality of the rendered image itself, and does not care much about its +plausibility with the perspective from which it was rendered. This makes it +possible to generate "face" in non-frontal views, due to its easiness to fool +the discriminator. In response, we propose SphereHead, a novel tri-plane +representation in the spherical coordinate system that fits the human head's +geometric characteristics and efficiently mitigates many of the generated +artifacts. We further introduce a view-image consistency loss for the +discriminator to emphasize the correspondence of the camera parameters and the +images. The combination of these efforts results in visually superior outcomes +with significantly fewer artifacts. Our code and dataset are publicly available +at https://lhyfst.github.io/spherehead. + +
+
+ comment: project page: https://lhyfst.github.io/spherehead +
+
+
+
+
+ + ☆ Normalizing Flows on the Product Space of SO(3) Manifolds for + Probabilistic Human Pose Modeling CVPR 2024 + + +
+ Normalizing flows have proven their efficacy for density estimation in +Euclidean space, but their application to rotational representations, crucial +in various domains such as robotics or human pose modeling, remains +underexplored. Probabilistic models of the human pose can benefit from +approaches that rigorously consider the rotational nature of human joints. For +this purpose, we introduce HuProSO3, a normalizing flow model that operates on +a high-dimensional product space of SO(3) manifolds, modeling the joint +distribution for human joints with three degrees of freedom. HuProSO3's +advantage over state-of-the-art approaches is demonstrated through its superior +modeling accuracy in three different applications and its capability to +evaluate the exact likelihood. This work not only addresses the technical +challenge of learning densities on SO(3) manifolds, but it also has broader +implications for domains where the probabilistic regression of correlated 3D +rotations is of importance. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ MoMA: Multimodal LLM Adapter for Fast Personalized Image Generation + + +
+ In this paper, we present MoMA: an open-vocabulary, training-free +personalized image model that boasts flexible zero-shot capabilities. As +foundational text-to-image models rapidly evolve, the demand for robust +image-to-image translation grows. Addressing this need, MoMA specializes in +subject-driven personalized image generation. Utilizing an open-source, +Multimodal Large Language Model (MLLM), we train MoMA to serve a dual role as +both a feature extractor and a generator. This approach effectively synergizes +reference image and text prompt information to produce valuable image features, +facilitating an image diffusion model. To better leverage the generated +features, we further introduce a novel self-attention shortcut method that +efficiently transfers image features to an image diffusion model, improving the +resemblance of the target object in generated images. Remarkably, as a +tuning-free plug-and-play module, our model requires only a single reference +image and outperforms existing methods in generating images with high detail +fidelity, enhanced identity-preservation and prompt faithfulness. Our work is +open-source, thereby providing universal access to these advancements. + +
+
+
+
+
+ + ☆ CoReS: Orchestrating the Dance of Reasoning and Segmentation + + +
+ The reasoning segmentation task, which demands a nuanced comprehension of +intricate queries to accurately pinpoint object regions, is attracting +increasing attention. However, Multi-modal Large Language Models (MLLM) often +find it difficult to accurately localize the objects described in complex +reasoning contexts. We believe that the act of reasoning segmentation should +mirror the cognitive stages of human visual search, where each step is a +progressive refinement of thought toward the final object. Thus we introduce +the Chains of Reasoning and Segmenting (CoReS) and find this top-down visual +hierarchy indeed enhances the visual search process. Specifically, we propose a +dual-chain structure that generates multi-modal, chain-like outputs to aid the +segmentation process. Furthermore, to steer the MLLM's outputs into this +intended hierarchy, we incorporate in-context inputs as guidance. Extensive +experiments demonstrate the superior performance of our CoReS, which surpasses +the state-of-the-art method by 7.1\% on the ReasonSeg dataset. The code will be +released at https://github.com/baoxiaoyi/CoReS. + +
+
+
+
+
+ + ☆ NAF-DPM: A Nonlinear Activation-Free Diffusion Probabilistic Model for + Document Enhancement + + +
+ Real-world documents may suffer various forms of degradation, often resulting +in lower accuracy in optical character recognition (OCR) systems. Therefore, a +crucial preprocessing step is essential to eliminate noise while preserving +text and key features of documents. In this paper, we propose NAF-DPM, a novel +generative framework based on a diffusion probabilistic model (DPM) designed to +restore the original quality of degraded documents. While DPMs are recognized +for their high-quality generated images, they are also known for their large +inference time. To mitigate this problem we provide the DPM with an efficient +nonlinear activation-free (NAF) network and we employ as a sampler a fast +solver of ordinary differential equations, which can converge in a few +iterations. To better preserve text characters, we introduce an additional +differentiable module based on convolutional recurrent neural networks, +simulating the behavior of an OCR system during training. Experiments conducted +on various datasets showcase the superiority of our approach, achieving +state-of-the-art performance in terms of pixel-level and perceptual similarity +metrics. Furthermore, the results demonstrate a notable character error +reduction made by OCR systems when transcribing real-world document images +enhanced by our framework. Code and pre-trained models are available at +https://github.com/ispamm/NAF-DPM. + +
+
+ comment: Under review at IEEE Transactions on Pattern Analysis and Machine + Intelligence +
+
+
+
+
+ + ☆ AlignZeg: Mitigating Objective Misalignment for Zero-shot Semantic + Segmentation + + +
+ A serious issue that harms the performance of zero-shot visual recognition is +named objective misalignment, i.e., the learning objective prioritizes +improving the recognition accuracy of seen classes rather than unseen classes, +while the latter is the true target to pursue. This issue becomes more +significant in zero-shot image segmentation because the stronger (i.e., +pixel-level) supervision brings a larger gap between seen and unseen classes. +To mitigate it, we propose a novel architecture named AlignZeg, which embodies +a comprehensive improvement of the segmentation pipeline, including proposal +extraction, classification, and correction, to better fit the goal of zero-shot +segmentation. (1) Mutually-Refined Proposal Extraction. AlignZeg harnesses a +mutual interaction between mask queries and visual features, facilitating +detailed class-agnostic mask proposal extraction. (2) Generalization-Enhanced +Proposal Classification. AlignZeg introduces synthetic data and incorporates +multiple background prototypes to allocate a more generalizable feature space. +(3) Predictive Bias Correction. During the inference stage, AlignZeg uses a +class indicator to find potential unseen class proposals followed by a +prediction postprocess to correct the prediction bias. Experiments demonstrate +that AlignZeg markedly enhances zero-shot semantic segmentation, as shown by an +average 3.8% increase in hIoU, primarily attributed to a 7.1% improvement in +identifying unseen classes, and we further validate that the improvement comes +from alleviating the objective misalignment issue. + +
+
+
+
+
+ + ☆ YaART: Yet Another ART Rendering Technology + + +
+ In the rapidly progressing field of generative models, the development of +efficient and high-fidelity text-to-image diffusion systems represents a +significant frontier. This study introduces YaART, a novel production-grade +text-to-image cascaded diffusion model aligned to human preferences using +Reinforcement Learning from Human Feedback (RLHF). During the development of +YaART, we especially focus on the choices of the model and training dataset +sizes, the aspects that were not systematically investigated for text-to-image +cascaded diffusion models before. In particular, we comprehensively analyze how +these choices affect both the efficiency of the training process and the +quality of the generated images, which are highly important in practice. +Furthermore, we demonstrate that models trained on smaller datasets of +higher-quality images can successfully compete with those trained on larger +datasets, establishing a more efficient scenario of diffusion models training. +From the quality perspective, YaART is consistently preferred by users over +many existing state-of-the-art models. + +
+
+ comment: Prompts and additional information are available on the project page, + see https://ya.ru/ai/art/paper-yaart-v1 +
+
+
+
+
+ + ☆ BinaryDM: Towards Accurate Binarization of Diffusion Model + + +
+ With the advancement of diffusion models (DMs) and the substantially +increased computational requirements, quantization emerges as a practical +solution to obtain compact and efficient low-bit DMs. However, the highly +discrete representation leads to severe accuracy degradation, hindering the +quantization of diffusion models to ultra-low bit-widths. In this paper, we +propose BinaryDM, a novel accurate quantization-aware training approach to push +the weights of diffusion models towards the limit of 1-bit. Firstly, we present +a Learnable Multi-basis Binarizer (LMB) to recover the representations +generated by the binarized DM, which improves the information in details of +representations crucial to the DM. Secondly, a Low-rank Representation +Mimicking (LRM) is applied to enhance the binarization-aware optimization of +the DM, alleviating the optimization direction ambiguity caused by fine-grained +alignment. Moreover, a progressive initialization strategy is applied to +training DMs to avoid convergence difficulties. Comprehensive experiments +demonstrate that BinaryDM achieves significant accuracy and efficiency gains +compared to SOTA quantization methods of DMs under ultra-low bit-widths. As the +first binarization method for diffusion models, BinaryDM achieves impressive +16.0 times FLOPs and 27.1 times storage savings with 1-bit weight and 4-bit +activation, showcasing its substantial advantages and potential for deploying +DMs on resource-limited scenarios. + +
+
+ comment: The code will soon be available at + https://github.com/Xingyu-Zheng/BinaryDM +
+
+
+
+
+ + ☆ Automatic Controllable Colorization via Imagination CVPR 2024 + + +
+ We propose a framework for automatic colorization that allows for iterative +editing and modifications. The core of our framework lies in an imagination +module: by understanding the content within a grayscale image, we utilize a +pre-trained image generation model to generate multiple images that contain the +same content. These images serve as references for coloring, mimicking the +process of human experts. As the synthesized images can be imperfect or +different from the original grayscale image, we propose a Reference Refinement +Module to select the optimal reference composition. Unlike most previous +end-to-end automatic colorization algorithms, our framework allows for +iterative and localized modifications of the colorization results because we +explicitly model the coloring samples. Extensive experiments demonstrate the +superiority of our framework over existing automatic colorization algorithms in +editability and flexibility. Project page: +https://xy-cong.github.io/imagine-colorization. + +
+
+ comment: CVPR 2024. Project page: + https://xy-cong.github.io/imagine-colorization +
+
+
+
+
+ + ☆ MLP Can Be A Good Transformer Learner + + +
+ Self-attention mechanism is the key of the Transformer but often criticized +for its computation demands. Previous token pruning works motivate their +methods from the view of computation redundancy but still need to load the full +network and require same memory costs. This paper introduces a novel strategy +that simplifies vision transformers and reduces computational load through the +selective removal of non-essential attention layers, guided by entropy +considerations. We identify that regarding the attention layer in bottom +blocks, their subsequent MLP layers, i.e. two feed-forward layers, can elicit +the same entropy quantity. Meanwhile, the accompanied MLPs are under-exploited +since they exhibit smaller feature entropy compared to those MLPs in the top +blocks. Therefore, we propose to integrate the uninformative attention layers +into their subsequent counterparts by degenerating them into identical mapping, +yielding only MLP in certain transformer blocks. Experimental results on +ImageNet-1k show that the proposed method can remove 40% attention layer of +DeiT-B, improving throughput and memory bound without performance compromise. +Code is available at https://github.com/sihaoevery/lambda_vit. + +
+
+ comment: efficient transformer +
+
+
+
+
+ + ☆ 3D-COCO: extension of MS-COCO dataset for image detection and 3D + reconstruction modules + + +
+ We introduce 3D-COCO, an extension of the original MS-COCO dataset providing +3D models and 2D-3D alignment annotations. 3D-COCO was designed to achieve +computer vision tasks such as 3D reconstruction or image detection configurable +with textual, 2D image, and 3D CAD model queries. We complete the existing +MS-COCO dataset with 28K 3D models collected on ShapeNet and Objaverse. By +using an IoU-based method, we match each MS-COCO annotation with the best 3D +models to provide a 2D-3D alignment. The open-source nature of 3D-COCO is a +first that should pave the way for new research on 3D-related topics. The +dataset and its source code are available at +https://kalisteo.cea.fr/index.php/coco3d-object-detection-and-reconstruction/ + +
+
+
+
+
+ + ☆ Learning a Category-level Object Pose Estimator without Pose Annotations + + +
+ 3D object pose estimation is a challenging task. Previous works always +require thousands of object images with annotated poses for learning the 3D +pose correspondence, which is laborious and time-consuming for labeling. In +this paper, we propose to learn a category-level 3D object pose estimator +without pose annotations. Instead of using manually annotated images, we +leverage diffusion models (e.g., Zero-1-to-3) to generate a set of images under +controlled pose differences and propose to learn our object pose estimator with +those images. Directly using the original diffusion model leads to images with +noisy poses and artifacts. To tackle this issue, firstly, we exploit an image +encoder, which is learned from a specially designed contrastive pose learning, +to filter the unreasonable details and extract image feature maps. +Additionally, we propose a novel learning strategy that allows the model to +learn object poses from those generated image sets without knowing the +alignment of their canonical poses. Experimental results show that our method +has the capability of category-level object pose estimation from a single shot +setting (as pose definition), while significantly outperforming other +state-of-the-art methods on the few-shot category-level object pose estimation +benchmarks. + +
+
+
+
+
+ + ☆ MULTIFLOW: Shifting Towards Task-Agnostic Vision-Language Pruning CVPR 2024 + + +
+ While excellent in transfer learning, Vision-Language models (VLMs) come with +high computational costs due to their large number of parameters. To address +this issue, removing parameters via model pruning is a viable solution. +However, existing techniques for VLMs are task-specific, and thus require +pruning the network from scratch for each new task of interest. In this work, +we explore a new direction: Task-Agnostic Vision-Language Pruning (TA-VLP). +Given a pretrained VLM, the goal is to find a unique pruned counterpart +transferable to multiple unknown downstream tasks. In this challenging setting, +the transferable representations already encoded in the pretrained model are a +key aspect to preserve. Thus, we propose Multimodal Flow Pruning (MULTIFLOW), a +first, gradient-free, pruning framework for TA-VLP where: (i) the importance of +a parameter is expressed in terms of its magnitude and its information flow, by +incorporating the saliency of the neurons it connects; and (ii) pruning is +driven by the emergent (multimodal) distribution of the VLM parameters after +pretraining. We benchmark eight state-of-the-art pruning algorithms in the +context of TA-VLP, experimenting with two VLMs, three vision-language tasks, +and three pruning ratios. Our experimental results show that MULTIFLOW +outperforms recent sophisticated, combinatorial competitors in the vast +majority of the cases, paving the way towards addressing TA-VLP. The code is +publicly available at https://github.com/FarinaMatteo/multiflow. + +
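The magnitude-and-flow importance score described above can be sketched for a single linear layer: each weight's score is its magnitude scaled by saliencies of the input and output neurons it connects, and the lowest-scoring fraction is pruned. The saliency definition and the global threshold below are assumptions rather than MULTIFLOW's exact criterion.

```python
# Hedged sketch of a magnitude-times-flow pruning score for one linear layer.
import torch

def flow_scores(weight: torch.Tensor,
                in_saliency: torch.Tensor,
                out_saliency: torch.Tensor) -> torch.Tensor:
    # weight: [out_dim, in_dim]; saliencies: [in_dim] and [out_dim],
    # e.g. mean absolute activations of the connected neurons.
    return weight.abs() * out_saliency.unsqueeze(1) * in_saliency.unsqueeze(0)

def prune_mask(scores: torch.Tensor, sparsity: float) -> torch.Tensor:
    # Keep the highest-scoring weights; 1 = keep, 0 = prune.
    k = int(scores.numel() * sparsity)
    threshold = scores.flatten().kthvalue(k).values if k > 0 else scores.min() - 1
    return (scores > threshold).float()
```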
+
+ comment: CVPR 2024 +
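+
+ A hedged sketch of one plausible reading of the MULTIFLOW importance score
+ (weight magnitude combined with the saliency of the two neurons a weight
+ connects); the paper's exact formulation is not reproduced here, and
+ `in_saliency` / `out_saliency` are assumed to be precomputed.
+
+```python
+import torch
+
+def flow_style_scores(weight, in_saliency, out_saliency):
+    """weight: (out_dim, in_dim); in_saliency: (in_dim,); out_saliency: (out_dim,).
+    Score each connection by its magnitude and the saliency of its endpoints."""
+    return weight.abs() * out_saliency.unsqueeze(1) * in_saliency.unsqueeze(0)
+
+def prune_by_score(weight, scores, prune_ratio=0.5):
+    # Zero out the lowest-scoring fraction of weights (gradient-free, one shot).
+    k = max(1, int(weight.numel() * (1.0 - prune_ratio)))
+    threshold = scores.flatten().topk(k).values.min()
+    return weight * (scores >= threshold).float()
+
+w = torch.randn(128, 256)
+scores = flow_style_scores(w, torch.rand(256), torch.rand(128))
+w_pruned = prune_by_score(w, scores, prune_ratio=0.75)
+```
+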
+
+
+
+
+ + ☆ A Training-Free Plug-and-Play Watermark Framework for Stable Diffusion + + +
+ Nowadays, the family of Stable Diffusion (SD) models has gained prominence +for its high quality outputs and scalability. This has also raised security +concerns on social media, as malicious users can create and disseminate harmful +content. Existing approaches involve training components or entire SDs to embed +a watermark in generated images for traceability and responsibility +attribution. However, in the era of AI-generated content (AIGC), the rapid +iteration of SDs renders retraining with watermark models costly. To address +this, we propose a training-free plug-and-play watermark framework for SDs. +Without modifying any components of SDs, we embed diverse watermarks in the +latent space, adapting to the denoising process. Our experimental findings +reveal that our method effectively harmonizes image quality and watermark +invisibility. Furthermore, it performs robustly under various attacks. We also +have validated that our method is generalized to multiple versions of SDs, even +without retraining the watermark model. + +
+
+
+
+
+ + ☆ Learning Topology Uniformed Face Mesh by Volume Rendering for Multi-view + Reconstruction + + +
+ Face meshes in consistent topology serve as the foundation for many +face-related applications, such as 3DMM constrained face reconstruction and +expression retargeting. Traditional methods commonly acquire topology uniformed +face meshes by two separate steps: multi-view stereo (MVS) to reconstruct +shapes followed by non-rigid registration to align topology, but struggle with +handling noise and non-lambertian surfaces. Recently, neural volume rendering +techniques have rapidly evolved and shown great advantages in 3D +reconstruction or novel view synthesis. Our goal is to bring the advantages of +neural volume rendering to multi-view reconstruction of face meshes with +consistent topology. We propose a mesh volume rendering method that enables +directly optimizing mesh geometry while preserving topology, and learning +implicit features to model complex facial appearance from multi-view images. +The key innovation lies in spreading sparse mesh features into the surrounding +space to simulate the radiance field required for volume rendering, which +facilitates backpropagation of gradients from images to mesh geometry and +implicit appearance features. Our proposed feature spreading module exhibits +deformation invariance, enabling photorealistic rendering seamlessly after mesh +editing. We conduct experiments on a multi-view face image dataset to evaluate +the reconstruction and implement an application for photorealistic rendering of +animated face meshes. + +
+
+
+
+
+ + ☆ Self-Explainable Affordance Learning with Embodied Caption + + +
+ In the field of visual affordance learning, previous methods mainly used +abundant images or videos that delineate human behavior patterns to identify +action possibility regions for object manipulation, with a variety of +applications in robotic tasks. However, they encounter a main challenge of +action ambiguity, illustrated by the vagueness like whether to beat or carry a +drum, and the complexities involved in processing intricate scenes. Moreover, +it is important for human intervention to rectify robot errors in time. To +address these issues, we introduce Self-Explainable Affordance learning (SEA) +with embodied caption. This innovation enables robots to articulate their +intentions and bridge the gap between explainable vision-language caption and +visual affordance learning. Due to a lack of appropriate dataset, we unveil a +pioneering dataset and metrics tailored for this task, which integrates images, +heatmaps, and embodied captions. Furthermore, we propose a novel model to +effectively combine affordance grounding with self-explanation in a simple but +efficient manner. Extensive quantitative and qualitative experiments +demonstrate our method's effectiveness. + +
+
+
+
+
+ + ☆ UniFL: Improve Stable Diffusion via Unified Feedback Learning + + +
+ Diffusion models have revolutionized the field of image generation, leading +to the proliferation of high-quality models and diverse downstream +applications. However, despite these significant advancements, the current +competitive solutions still suffer from several limitations, including inferior +visual quality, a lack of aesthetic appeal, and inefficient inference, without +a comprehensive solution in sight. To address these challenges, we present +UniFL, a unified framework that leverages feedback learning to enhance +diffusion models comprehensively. UniFL stands out as a universal, effective, +and generalizable solution applicable to various diffusion models, such as +SD1.5 and SDXL. Notably, UniFL incorporates three key components: perceptual +feedback learning, which enhances visual quality; decoupled feedback learning, +which improves aesthetic appeal; and adversarial feedback learning, which +optimizes inference speed. In-depth experiments and extensive user studies +validate the superior performance of our proposed method in enhancing both the +quality of generated models and their acceleration. For instance, UniFL +surpasses ImageReward by 17% user preference in terms of generation quality and +outperforms LCM and SDXL Turbo by 57% and 20% in 4-step inference. Moreover, we +have verified the efficacy of our approach in downstream tasks, including Lora, +ControlNet, and AnimateDiff. + +
+
+
+
+
+ + ☆ Neural Cellular Automata for Lightweight, Robust and Explainable + Classification of White Blood Cell Images + + +
+ Diagnosis of hematological malignancies depends on accurate identification of +white blood cells in peripheral blood smears. Deep learning techniques are +emerging as a viable solution to scale and optimize this process by automatic +identification of cells in laboratories. However, these techniques face several +challenges such as limited generalizability, sensitivity to domain shifts and +lack of explainability. Here, we are introducing a novel approach based on +neural cellular automata (NCA) for white blood cell classification. We test our +approach on three datasets of white blood cell images and show that we achieve +competitive performance compared to conventional methods. Our NCA-based method +is significantly smaller in terms of parameters and exhibits robustness to +domain shifts. Furthermore, the architecture is inherently explainable, +providing insights into the decision process for each classification, helping +experts understand and validate model predictions. Results demonstrate that NCA +not only can be used for image classification, but also address key challenges +of conventional methods, indicating a high potential for applicability in +clinical practice. + +
+
+
+
+
+ + ☆ Towards More General Video-based Deepfake Detection through Facial + Feature Guided Adaptation for Foundation Model + + +
+ With the rise of deep learning, generative models have enabled the creation +of highly realistic synthetic images, presenting challenges due to their +potential misuse. While research in Deepfake detection has grown rapidly in +response, many detection methods struggle with unseen Deepfakes generated by +new synthesis techniques. To address this generalisation challenge, we propose +a novel Deepfake detection approach that adapts the rich information encoded +inside Foundation Models, specifically using the image encoder from CLIP, which +has demonstrated strong zero-shot capability for downstream tasks. Inspired by +recent advances in parameter-efficient fine-tuning, we propose a novel +side-network-based decoder to extract spatial and temporal cues from the given +video clip, with Facial Component Guidance (FCG) to encourage the spatial +features to include features of key facial parts for more robust and general +Deepfake detection. Through extensive cross-dataset evaluations, our approach +exhibits superior effectiveness in identifying unseen Deepfake samples, +achieving notable success even with limited training samples and manipulation +types. Our model secures an average performance enhancement of 0.9% AUROC in +cross-dataset assessments compared with state-of-the-art methods, establishing +a significant lead with a 4.4% improvement on the challenging DFDC dataset. + +
+
+
+
+
+ + ☆ Responsible Visual Editing + + +
+ With recent advancements in visual synthesis, there is a growing risk of +encountering images with detrimental effects, such as hate, discrimination, or +privacy violations. The research on transforming harmful images into +responsible ones remains unexplored. In this paper, we formulate a new task, +responsible visual editing, which entails modifying specific concepts within an +image to render it more responsible while minimizing changes. However, the +concept that needs to be edited is often abstract, making it challenging to +locate what needs to be modified and plan how to modify it. To tackle these +challenges, we propose a Cognitive Editor (CoEditor) that harnesses the large +multimodal model through a two-stage cognitive process: (1) a perceptual +cognitive process to focus on what needs to be modified and (2) a behavioral +cognitive process to strategize how to modify. To mitigate the negative +implications of harmful images on research, we create a transparent and public +dataset, AltBear, which expresses harmful information using teddy bears instead +of humans. Experiments demonstrate that CoEditor can effectively comprehend +abstract concepts within complex scenes and significantly surpass the +performance of baseline models for responsible visual editing. We find that the +AltBear dataset corresponds well to the harmful content found in real images, +offering a consistent experimental evaluation, thereby providing a safer +benchmark for future research. Moreover, CoEditor also shows great results in +general editing. We release our code and dataset at +https://github.com/kodenii/Responsible-Visual-Editing. + +
+
+ comment: 24 pages, 12 figures +
+
+
+
+
+ + ☆ Robust Data Pruning: Uncovering and Overcoming Implicit Bias + + +
+ In the era of exceptionally data-hungry models, careful selection of the +training data is essential to mitigate the extensive costs of deep learning. +Data pruning offers a solution by removing redundant or uninformative samples +from the dataset, which yields faster convergence and improved neural scaling +laws. However, little is known about its impact on classification bias of the +trained models. We conduct the first systematic study of this effect and reveal +that existing data pruning algorithms can produce highly biased classifiers. At +the same time, we argue that random data pruning with appropriate class ratios +has potential to improve the worst-class performance. We propose a +"fairness-aware" approach to pruning and empirically demonstrate its +performance on standard computer vision benchmarks. In sharp contrast to +existing algorithms, our proposed method continues improving robustness at a +tolerable drop of average performance as we prune more from the datasets. We +present theoretical analysis of the classification risk in a mixture of +Gaussians to further motivate our algorithm and support our findings. + +
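+
+ The claim that random pruning with appropriate class ratios can help the
+ worst class is easy to prototype; below is a minimal sketch of
+ class-ratio-aware random pruning, where the per-class keep ratios are an
+ assumed input rather than something derived by the paper's method.
+
+```python
+import numpy as np
+
+def class_balanced_random_prune(labels, keep_ratio_per_class, seed=0):
+    """Return sorted indices of a subsample in which every class keeps its
+    own fraction of samples."""
+    rng = np.random.default_rng(seed)
+    labels = np.asarray(labels)
+    kept = []
+    for cls, ratio in keep_ratio_per_class.items():
+        idx = np.flatnonzero(labels == cls)
+        n_keep = max(1, int(round(ratio * len(idx))))
+        kept.append(rng.choice(idx, size=n_keep, replace=False))
+    return np.sort(np.concatenate(kept))
+
+# Example: prune a frequent class aggressively while keeping most of a rare one.
+subset = class_balanced_random_prune([0] * 1000 + [1] * 50, {0: 0.3, 1: 0.9})
+```
+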
+
+
+
+
+ + ☆ Social-MAE: Social Masked Autoencoder for Multi-person Motion + Representation Learning + + +
+ For a complete comprehension of multi-person scenes, it is essential to go +beyond basic tasks like detection and tracking. Higher-level tasks, such as +understanding the interactions and social activities among individuals, are +also crucial. Progress towards models that can fully understand scenes +involving multiple people is hindered by a lack of sufficient annotated data +for such high-level tasks. To address this challenge, we introduce Social-MAE, +a simple yet effective transformer-based masked autoencoder framework for +multi-person human motion data. The framework uses masked modeling to pre-train +the encoder to reconstruct masked human joint trajectories, enabling it to +learn generalizable and data efficient representations of motion in human +crowded scenes. Social-MAE comprises a transformer as the MAE encoder and a +lighter-weight transformer as the MAE decoder which operates on multi-person +joints' trajectory in the frequency domain. After the reconstruction task, the +MAE decoder is replaced with a task-specific decoder and the model is +fine-tuned end-to-end for a variety of high-level social tasks. Our proposed +model combined with our pre-training approach achieves the state-of-the-art +results on various high-level social tasks, including multi-person pose +forecasting, social grouping, and social action understanding. These +improvements are demonstrated across four popular multi-person datasets +encompassing both human 2D and 3D body pose. + +
+
+
+
+
+ + ☆ TIM: A Time Interval Machine for Audio-Visual Action Recognition CVPR 2024 + + +
+ Diverse actions give rise to rich audio-visual signals in long videos. Recent +works showcase that the two modalities of audio and video exhibit different +temporal extents of events and distinct labels. We address the interplay +between the two modalities in long videos by explicitly modelling the temporal +extents of audio and visual events. We propose the Time Interval Machine (TIM) +where a modality-specific time interval poses as a query to a transformer +encoder that ingests a long video input. The encoder then attends to the +specified interval, as well as the surrounding context in both modalities, in +order to recognise the ongoing action. + We test TIM on three long audio-visual video datasets: EPIC-KITCHENS, +Perception Test, and AVE, reporting state-of-the-art (SOTA) for recognition. On +EPIC-KITCHENS, we beat previous SOTA that utilises LLMs and significantly +larger pre-training by 2.9% top-1 action recognition accuracy. Additionally, we +show that TIM can be adapted for action detection, using dense multi-scale +interval queries, outperforming SOTA on EPIC-KITCHENS-100 for most metrics, and +showing strong performance on the Perception Test. Our ablations show the +critical role of integrating the two modalities and modelling their time +intervals in achieving this performance. Code and models at: +https://github.com/JacobChalk/TIM + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Investigating the Effectiveness of Cross-Attention to Unlock Zero-Shot + Editing of Text-to-Video Diffusion Models CVPR 2024 + + +
+ With recent advances in image and video diffusion models for content +creation, a plethora of techniques have been proposed for customizing their +generated content. In particular, manipulating the cross-attention layers of +Text-to-Image (T2I) diffusion models has shown great promise in controlling the +shape and location of objects in the scene. Transferring image-editing +techniques to the video domain, however, is extremely challenging as object +motion and temporal consistency are difficult to capture accurately. In this +work, we take a first look at the role of cross-attention in Text-to-Video +(T2V) diffusion models for zero-shot video editing. While one-shot models have +shown potential in controlling motion and camera movement, we demonstrate +zero-shot control over object shape, position and movement in T2V models. We +show that despite the limitations of current T2V models, cross-attention +guidance can be a promising approach for editing videos. + +
+
+ comment: Generative Models for Computer Vision CVPR 2024 Workshop +
+
+
+
+
+ + ☆ DepthMOT: Depth Cues Lead to a Strong Multi-Object Tracker + + +
+ Accurately distinguishing each object is a fundamental goal of Multi-object +tracking (MOT) algorithms. However, achieving this goal remains challenging, +primarily due to: (i) For crowded scenes with occluded objects, the high +overlap of object bounding boxes leads to confusion among closely located +objects. Nevertheless, humans naturally perceive the depth of elements in a +scene when observing 2D videos. Inspired by this, even though the bounding +boxes of objects are close on the camera plane, we can differentiate them in +the depth dimension, thereby establishing a 3D perception of the objects. (ii) +For videos with rapid, irregular camera motion, abrupt changes in object +positions can result in ID switches. However, if the camera pose is known, we +can compensate for the errors in linear motion models. In this paper, we +propose \textit{DepthMOT}, which achieves: (i) detecting objects and estimating +the scene depth map \textit{end-to-end}, and (ii) compensating for irregular +camera motion by camera pose estimation. Extensive experiments demonstrate the +superior performance of DepthMOT on the VisDrone-MOT and UAVDT datasets. The +code will be available at \url{https://github.com/JackWoo0831/DepthMOT}. + +
+
+
+
+
+ + ☆ Impact of LiDAR visualisations on semantic segmentation of + archaeological objects + + +
+ Deep learning methods in LiDAR-based archaeological research often leverage +visualisation techniques derived from Digital Elevation Models to enhance +characteristics of archaeological objects present in the images. This paper +investigates the impact of visualisations on deep learning performance through +a comprehensive testing framework. The study involves the use of eight semantic +segmentation models to evaluate seven diverse visualisations across two study +areas, encompassing five archaeological classes. Experimental results reveal +that the choice of appropriate visualisations can influence performance by up +to 8%. Yet, pinpointing one visualisation that outperforms the others in +segmenting all archaeological classes proves challenging. The observed +performance variation, reaching up to 25% across different model +configurations, underscores the importance of thoughtfully selecting model +configurations and LiDAR visualisations for successfully segmenting +archaeological objects. + +
+
+ comment: Accepted to IEEE International Geoscience and Remote Sensing + Symposium 2024 (IGARSS 2024) @IEEE copyright +
+
+
+
+
+ + ☆ Taming Transformers for Realistic Lidar Point Cloud Generation + + +
+ Diffusion Models (DMs) have achieved State-Of-The-Art (SOTA) results in the +Lidar point cloud generation task, benefiting from their stable training and +iterative refinement during sampling. However, DMs often fail to realistically +model Lidar raydrop noise due to their inherent denoising process. To retain +the strength of iterative sampling while enhancing the generation of raydrop +noise, we introduce LidarGRIT, a generative model that uses auto-regressive +transformers to iteratively sample the range images in the latent space rather +than image space. Furthermore, LidarGRIT utilises VQ-VAE to separately decode +range images and raydrop masks. Our results show that LidarGRIT achieves +superior performance compared to SOTA models on KITTI-360 and KITTI odometry +datasets. Code available at: https://github.com/hamedhaghighi/LidarGRIT. + +
+
+
+
+
+ + ☆ Two-Person Interaction Augmentation with Skeleton Priors + + +
+ Close and continuous interaction with rich contacts is a crucial aspect of +human activities (e.g. hugging, dancing) and of interest in many domains like +activity recognition, motion prediction, character animation, etc. However, +acquiring such skeletal motion is challenging. While direct motion capture is +expensive and slow, motion editing/generation is also non-trivial, as complex +contact patterns with topological and geometric constraints have to be +retained. To this end, we propose a new deep learning method for two-body +skeletal interaction motion augmentation, which can generate variations of +contact-rich interactions with varying body sizes and proportions while +retaining the key geometric/topological relations between two bodies. Our +system can learn effectively from a relatively small amount of data and +generalize to drastically different skeleton sizes. Through exhaustive +evaluation and comparison, we show it can generate high-quality motions, has +strong generalizability and outperforms traditional optimization-based methods +and alternative deep learning solutions. + +
+
+
+
+
+ + ☆ Mind-to-Image: Projecting Visual Mental Imagination of the Brain from + fMRI + + +
+ The reconstruction of images observed by subjects from fMRI data collected +during visual stimuli has made significant strides in the past decade, thanks +to the availability of extensive fMRI datasets and advancements in generative +models for image generation. However, the application of visual reconstruction +has remained limited. Reconstructing visual imagination presents a greater +challenge, with potentially revolutionary applications ranging from aiding +individuals with disabilities to verifying witness accounts in court. The +primary hurdles in this field are the absence of data collection protocols for +visual imagery and the lack of datasets on the subject. Traditionally, +fMRI-to-image relies on data collected from subjects exposed to visual stimuli, +which poses issues for generating visual imagery because brain activity differs +between visual stimulation and visual imagery. For the first time, we have +compiled a substantial dataset (around 6h of scans) on visual imagery along +with a proposed data collection protocol. We then train a modified version of +an fMRI-to-image model and demonstrate the feasibility of reconstructing images +from two modes of imagination: from memory and from pure imagination. This +marks an important step towards creating a technology that allows direct +reconstruction of visual imagery. + +
+
+ comment: Pre-print to be updated +
+
+
+
+
+ + ☆ Enhancing Lip Reading with Multi-Scale Video and Multi-Encoder ICME2024 + + +
+ Automatic lip-reading (ALR) aims to automatically transcribe spoken content +from a speaker's silent lip motion captured in video. Current mainstream +lip-reading approaches only use a single visual encoder to model input videos +of a single scale. In this paper, we propose to enhance lipreading by +incorporating multi-scale video data and multi-encoder. Specifically, we first +propose a novel multi-scale lip extraction algorithm based on the size of the +speaker's face and an enhanced ResNet3D visual front-end (VFE) to extract lip +features at different scales. For the multi-encoder, in addition to the +mainstream Transformer and Conformer, we also incorporate the recently proposed +Branchformer and EBranchformer as visual encoders. In the experiments, we +explore the influence of different video data scales and encoders on ALR system +performance and fuse the texts transcribed by all ALR systems using recognizer +output voting error reduction (ROVER). Finally, our proposed approach placed +second in the ICME 2024 ChatCLR Challenge Task 2, with a 21.52% reduction in +character error rate (CER) compared to the official baseline on the evaluation +set. + +
+
+ comment: 6 pages, 3 figures, submitted to ICME2024 GC-ChatCLR +
+
+
+
+
+ + ☆ HAMMR: HierArchical MultiModal React agents for generic VQA + + +
+ Combining Large Language Models (LLMs) with external specialized tools +(LLMs+tools) is a recent paradigm to solve multimodal tasks such as Visual +Question Answering (VQA). While this approach was demonstrated to work well +when optimized and evaluated for each individual benchmark, in practice it is +crucial for the next generation of real-world AI systems to handle a broad +range of multimodal problems. Therefore we pose the VQA problem from a unified +perspective and evaluate a single system on a varied suite of VQA tasks +including counting, spatial reasoning, OCR-based reasoning, visual pointing, +external knowledge, and more. In this setting, we demonstrate that naively +applying the LLM+tools approach using the combined set of all tools leads to +poor results. This motivates us to introduce HAMMR: HierArchical MultiModal +React. We start from a multimodal ReAct-based system and make it hierarchical +by enabling our HAMMR agents to call upon other specialized agents. This +enhances the compositionality of the LLM+tools approach, which we show to be +critical for obtaining high accuracy on generic VQA. Concretely, on our generic +VQA suite, HAMMR outperforms the naive LLM+tools approach by 19.5%. +Additionally, HAMMR achieves state-of-the-art results on this task, +outperforming the generic standalone PaLI-X VQA model by 5.0%. + +
+
+
+
+
+ + ☆ Pansharpening of PRISMA products for archaeological prospection + + +
+ Hyperspectral data recorded from satellite platforms are often ill-suited for +geo-archaeological prospection due to low spatial resolution. The established +potential of hyperspectral data from airborne sensors in identifying +archaeological features has, on the other hand, generated increased interest in +enhancing hyperspectral data to achieve higher spatial resolution. This +improvement is crucial for detecting traces linked to sub-surface +geo-archaeological features and can make satellite hyperspectral acquisitions +more suitable for archaeological research. This research assesses the usability +of pansharpened PRISMA satellite products in geo-archaeological prospections. +Three pan-sharpening methods (GSA, MTF-GLP and HySure) are compared +quantitatively and qualitatively and tested over the archaeological landscape +of Aquileia (Italy). The results suggest that the application of pansharpening +techniques makes hyperspectral satellite imagery highly suitable, under certain +conditions, for the identification of sub-surface archaeological features of +small and large size. + +
+
+ comment: Accepted to IEEE International Geoscience and Remote Sensing + Symposium 2024 (IGARSS 2024) @IEEE copyright +
+
+
+
+
+ + ☆ Action-conditioned video data improves predictability + + +
+ Long-term video generation and prediction remain challenging tasks in +computer vision, particularly in partially observable scenarios where cameras +are mounted on moving platforms. The interaction between observed image frames +and the motion of the recording agent introduces additional complexities. To +address these issues, we introduce the Action-Conditioned Video Generation +(ACVG) framework, a novel approach that investigates the relationship between +actions and generated image frames through a deep dual Generator-Actor +architecture. ACVG generates video sequences conditioned on the actions of +robots, enabling exploration and analysis of how vision and action mutually +influence one another in dynamic environments. We evaluate the framework's +effectiveness on an indoor robot motion dataset which consists of sequences of +image frames along with the sequences of actions taken by the robotic agent, +conducting a comprehensive empirical study comparing ACVG to other +state-of-the-art frameworks along with a detailed ablation study. + +
+
+
+
+
+ + ☆ Test-Time Zero-Shot Temporal Action Localization + + +
+ Zero-Shot Temporal Action Localization (ZS-TAL) seeks to identify and locate +actions in untrimmed videos unseen during training. Existing ZS-TAL methods +involve fine-tuning a model on a large amount of annotated training data. While +effective, training-based ZS-TAL approaches assume the availability of labeled +data for supervised learning, which can be impractical in some applications. +Furthermore, the training process naturally induces a domain bias into the +learned model, which may adversely affect the model's generalization ability to +arbitrary videos. These considerations prompt us to approach the ZS-TAL problem +from a radically novel perspective, relaxing the requirement for training data. +To this aim, we introduce a novel method that performs Test-Time adaptation for +Temporal Action Localization (T3AL). In a nutshell, T3AL adapts a pre-trained +Vision and Language Model (VLM). T3AL operates in three steps. First, a +video-level pseudo-label of the action category is computed by aggregating +information from the entire video. Then, action localization is performed +adopting a novel procedure inspired by self-supervised learning. Finally, +frame-level textual descriptions extracted with a state-of-the-art captioning +model are employed for refining the action region proposals. We validate the +effectiveness of T3AL by conducting experiments on the THUMOS14 and the +ActivityNet-v1.3 datasets. Our results demonstrate that T3AL significantly +outperforms zero-shot baselines based on state-of-the-art VLMs, confirming the +benefit of a test-time adaptation approach. + +
+
+
+
+
+ + ☆ Two Hands Are Better Than One: Resolving Hand to Hand Intersections via + Occupancy Networks + + +
+ 3D hand pose estimation from images has seen considerable interest from the +literature, with new methods improving overall 3D accuracy. One current +challenge is to address hand-to-hand interaction where self-occlusions and +finger articulation pose a significant problem to estimation. Little work has +applied physical constraints that minimize the hand intersections that occur as +a result of noisy estimation. This work addresses the intersection of hands by +exploiting an occupancy network that represents the hand's volume as a +continuous manifold. This allows us to model the probability distribution of +points being inside a hand. We designed an intersection loss function to +minimize the likelihood of hand-to-point intersections. Moreover, we propose a +new hand mesh parameterization that is superior to the commonly used MANO model +in many respects including lower mesh complexity, underlying 3D skeleton +extraction, watertightness, etc. On the benchmark InterHand2.6M dataset, the +models trained using our intersection loss achieve better results than the +state-of-the-art by significantly decreasing the number of hand intersections +while lowering the mean per-joint positional error. Additionally, we +demonstrate superior performance for 3D hand uplift on Re:InterHand and SMILE +datasets and show reduced hand-to-hand intersections for complex domains such +as sign-language pose estimation. + +
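+
+ A minimal sketch of the kind of intersection penalty an occupancy network
+ enables: sample points on one hand and penalise the probability that the
+ other hand's occupancy field assigns to them. The `occupancy_fn` interface
+ and the stand-in sphere field are assumptions for illustration, not the
+ paper's exact loss.
+
+```python
+import torch
+
+def intersection_loss(occupancy_fn, points_a):
+    """Mean predicted inside-probability of hand A's surface points under
+    hand B's occupancy field; pushing this towards zero discourages
+    hand-to-hand penetration."""
+    return occupancy_fn(points_a).mean()
+
+# Stand-in occupancy field for illustration: a soft sphere of radius 0.1.
+occ_b = lambda p: torch.sigmoid(50.0 * (0.1 - p.norm(dim=-1)))
+points_a = torch.randn(1024, 3) * 0.2          # sampled hand-A surface points
+loss = intersection_loss(occ_b, points_a)
+```
+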
+
+
+
+
+ + ☆ Anatomical Conditioning for Contrastive Unpaired Image-to-Image + Translation of Optical Coherence Tomography Images + + +
+ For a unified analysis of medical images from different modalities, data +harmonization using image-to-image (I2I) translation is desired. We study this +problem employing an optical coherence tomography (OCT) data set of +Spectralis-OCT and Home-OCT images. I2I translation is challenging because the +images are unpaired, and a bijective mapping does not exist due to the +information discrepancy between both domains. This problem has been addressed +by the Contrastive Learning for Unpaired I2I Translation (CUT) approach, but it +reduces semantic consistency. To restore the semantic consistency, we support +the style decoder using an additional segmentation decoder. Our approach +increases the similarity between the style-translated images and the target +distribution. Importantly, we improve the segmentation of biomarkers in +Home-OCT images in an unsupervised domain adaptation scenario. Our data +harmonization approach provides potential for the monitoring of diseases, e.g., +age related macular disease, using different OCT devices. + +
+
+ comment: Accepted at ISBI 2024 +
+
+
+
+
+ + ☆ PAT: Pixel-wise Adaptive Training for Long-tailed Segmentation + + +
+ Beyond class frequency, we recognize the impact of class-wise relationships +among various class-specific predictions and the imbalance in label masks on +long-tailed segmentation learning. To address these challenges, we propose an +innovative Pixel-wise Adaptive Training (PAT) technique tailored for +long-tailed segmentation. PAT has two key features: 1) class-wise gradient +magnitude homogenization, and 2) pixel-wise class-specific loss adaptation +(PCLA). First, the class-wise gradient magnitude homogenization helps alleviate +the imbalance among label masks by ensuring equal consideration of the +class-wise impact on model updates. Second, PCLA tackles the detrimental impact +of both rare classes within the long-tailed distribution and inaccurate +predictions from previous training stages by encouraging learning classes with +low prediction confidence and guarding against forgetting classes with high +confidence. This combined approach fosters robust learning while preventing the +model from forgetting previously learned knowledge. PAT exhibits significant +performance improvements, surpassing the current state-of-the-art by 2.2% on +the NYU dataset. Moreover, it enhances overall pixel-wise accuracy by 2.85% and +intersection over union value by 2.07%, with a particularly notable decline +of 0.39% in detecting rare classes compared to Balance Logits Variation, as +demonstrated on the three popular datasets, i.e., OxfordPetIII, CityScape, and +NYU. + +
+
+
+
+
+ + ☆ T-DEED: Temporal-Discriminability Enhancer Encoder-Decoder for Precise + Event Spotting in Sports Videos + + +
+ In this paper, we introduce T-DEED, a Temporal-Discriminability Enhancer +Encoder-Decoder for Precise Event Spotting in sports videos. T-DEED addresses +multiple challenges in the task, including the need for discriminability among +frame representations, high output temporal resolution to maintain prediction +precision, and the necessity to capture information at different temporal +scales to handle events with varying dynamics. It tackles these challenges +through its specifically designed architecture, featuring an encoder-decoder +for leveraging multiple temporal scales and achieving high output temporal +resolution, along with temporal modules designed to increase token +discriminability. Leveraging these characteristics, T-DEED achieves SOTA +performance on the FigureSkating and FineDiving datasets. + +
+
+
+
+
+ + ☆ Rethinking the Spatial Inconsistency in Classifier-Free Diffusion + Guidance CVPR-2024 + + +
+ Classifier-Free Guidance (CFG) has been widely used in text-to-image +diffusion models, where the CFG scale is introduced to control the strength of +text guidance on the whole image space. However, we argue that a global CFG +scale results in spatial inconsistency across regions of varying semantic +strength and suboptimal image quality. To address this problem, we present a +novel approach, Semantic-aware Classifier-Free Guidance (S-CFG), to customize +the guidance degrees for different semantic units in text-to-image diffusion +models. Specifically, we first design a training-free semantic segmentation +method to partition the latent image into relatively independent semantic +regions at each denoising step. In particular, the cross-attention map in the +denoising U-net backbone is renormalized for assigning each patch to the +corresponding token, while the self-attention map is used to complete the +semantic regions. Then, to balance the amplification of diverse semantic units, +we adaptively adjust the CFG scales across different semantic regions to +rescale the text guidance degrees into a uniform level. Finally, extensive +experiments demonstrate the superiority of S-CFG over the original CFG strategy +on various text-to-image diffusion models, without requiring any extra training +cost. Our code is available at https://github.com/SmilesDZgk/S-CFG. + +
+
+ comment: accepted by CVPR-2024 +
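+
+ For reference, standard classifier-free guidance combines conditional and
+ unconditional noise predictions with one global scale; the sketch below
+ replaces that scalar with a per-pixel scale map, which is the general
+ mechanism S-CFG builds on (how the map is derived from cross- and
+ self-attention is not reproduced here).
+
+```python
+import torch
+
+def spatial_cfg(eps_uncond, eps_cond, scale_map):
+    """Classifier-free guidance with a spatially varying scale.
+    eps_*: (B, C, H, W) noise predictions; scale_map: (B, 1, H, W)."""
+    return eps_uncond + scale_map * (eps_cond - eps_uncond)
+
+# With a constant map this reduces to ordinary CFG at scale 7.5.
+b, c, h, w = 1, 4, 64, 64
+eps_u, eps_c = torch.randn(b, c, h, w), torch.randn(b, c, h, w)
+guided = spatial_cfg(eps_u, eps_c, torch.full((b, 1, h, w), 7.5))
+```
+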
+
+
+
+
+ + ☆ CDAD-Net: Bridging Domain Gaps in Generalized Category Discovery CVPR + + +
+ In Generalized Category Discovery (GCD), we cluster unlabeled samples of +known and novel classes, leveraging a training dataset of known classes. A +salient challenge arises due to domain shifts between these datasets. To +address this, we present a novel setting: Across Domain Generalized Category +Discovery (AD-GCD) and bring forth CDAD-NET (Class Discoverer Across Domains) +as a remedy. CDAD-NET is architected to synchronize potential known class +samples across both the labeled (source) and unlabeled (target) datasets, while +emphasizing the distinct categorization of the target data. To facilitate this, +we propose an entropy-driven adversarial learning strategy that accounts for +the distance distributions of target samples relative to source-domain class +prototypes. In parallel, the discriminative nature of the shared space is +upheld through a fusion of three metric learning objectives. In the source +domain, our focus is on refining the proximity between samples and their +affiliated class prototypes, while in the target domain, we integrate a +neighborhood-centric contrastive learning mechanism, enriched with an adept +neighbor-mining approach. To further accentuate the nuanced feature +interrelation among semantically aligned images, we champion the concept of +conditional image inpainting, underscoring the premise that semantically +analogous images prove more efficacious to the task than their disjointed +counterparts. Experimentally, CDAD-NET eclipses existing literature with a +performance increment of 8-15% on three AD-GCD benchmarks we present. + +
+
+ comment: Accepted in L3D-IVU, CVPR Workshop, 2024 +
+
+
+
+
+ + ☆ Multi-head Attention-based Deep Multiple Instance Learning + + +
+ This paper introduces MAD-MIL, a Multi-head Attention-based Deep Multiple +Instance Learning model, designed for weakly supervised Whole Slide Images +(WSIs) classification in digital pathology. Inspired by the multi-head +attention mechanism of the Transformer, MAD-MIL simplifies model complexity +while achieving competitive results against advanced models like CLAM and +DS-MIL. Evaluated on the MNIST-BAGS and public datasets, including TUPAC16, +TCGA BRCA, TCGA LUNG, and TCGA KIDNEY, MAD-MIL consistently outperforms ABMIL. +This demonstrates enhanced information diversity, interpretability, and +efficiency in slide representation. The model's effectiveness, coupled with +fewer trainable parameters and lower computational complexity makes it a +promising solution for automated pathology workflows. Our code is available at +https://github.com/tueimage/MAD-MIL. + +
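+
+ A simplified sketch of multi-head attention-based MIL pooling in the spirit
+ of MAD-MIL; the dimensions, the ungated attention heads, and the single
+ linear classifier are assumptions, not the released model (see the linked
+ repository for that).
+
+```python
+import torch
+import torch.nn as nn
+
+class MultiHeadAttentionMIL(nn.Module):
+    def __init__(self, in_dim=1024, attn_dim=256, n_heads=4, n_classes=2):
+        super().__init__()
+        self.heads = nn.ModuleList(
+            nn.Sequential(nn.Linear(in_dim, attn_dim), nn.Tanh(),
+                          nn.Linear(attn_dim, 1))
+            for _ in range(n_heads)
+        )
+        self.classifier = nn.Linear(in_dim * n_heads, n_classes)
+
+    def forward(self, instances):            # (N, in_dim) patch features of one WSI
+        pooled = []
+        for head in self.heads:
+            a = torch.softmax(head(instances), dim=0)   # (N, 1) attention weights
+            pooled.append((a * instances).sum(dim=0))   # (in_dim,) slide embedding
+        return self.classifier(torch.cat(pooled))       # (n_classes,) slide logits
+
+logits = MultiHeadAttentionMIL()(torch.randn(500, 1024))
+```
+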
+
+
+
+
+ + ☆ CNN-based Game State Detection for a Foosball Table + + +
+ The automation of games using Deep Reinforcement Learning Strategies (DRL) is +a well-known challenge in AI research. While feature extraction in a video game +typically uses the whole image, this is hardly practical for many real-world +games. Instead, using a smaller game state that reduces the parameter space to +essential parameters only seems to be a promising approach. In the game of +Foosball, a compact and comprehensive game state description consists of the +positional shifts and rotations of the figures and the position of the ball +over time. In particular, velocities and accelerations can be derived from +consecutive time samples of the game state. In this paper, a figure detection +system to determine the game state in Foosball is presented. We capture a +dataset containing the rotations of the rods, which were measured using +accelerometers, and the positional shifts, which were derived using traditional +Computer Vision techniques (in a laboratory setting). This dataset is utilized +to train Convolutional Neural Network (CNN) based end-to-end regression models +to predict the rotations and shifts of each rod. We present an evaluation of +our system using different state-of-the-art CNNs as base architectures for the +regression model. We show that our system is able to predict the game state +with high accuracy. By providing data for both black and white teams, the +presented system is intended to provide the required data for future +developments of Imitation Learning techniques w.r.t. observing human players. + +
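+
+ A sketch of the kind of CNN-based end-to-end regression model described
+ above, using a ResNet-18 backbone with a small head that outputs one
+ (shift, rotation) pair per rod; the backbone choice and output layout are
+ assumptions, not the authors' exact configuration.
+
+```python
+import torch
+import torch.nn as nn
+from torchvision import models
+
+N_RODS = 8  # a standard foosball table has eight rods
+
+def rod_state_regressor():
+    backbone = models.resnet18(weights=None)              # pretrained init also possible
+    backbone.fc = nn.Linear(backbone.fc.in_features, 2 * N_RODS)
+    return backbone
+
+model = rod_state_regressor()
+frame = torch.randn(1, 3, 224, 224)                       # one RGB camera frame
+shift_and_rotation = model(frame).view(-1, N_RODS, 2)     # (batch, rod, [shift, rotation])
+```
+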
+
+
+
+
+ + ☆ Iterative Refinement Strategy for Automated Data Labeling: Facial + Landmark Diagnosis in Medical Imaging + + +
+ Automated data labeling techniques are crucial for accelerating the +development of deep learning models, particularly in complex medical imaging +applications. However, ensuring accuracy and efficiency remains challenging. +This paper presents iterative refinement strategies for automated data labeling +in facial landmark diagnosis to enhance accuracy and efficiency for deep +learning models in medical applications, including dermatology, plastic +surgery, and ophthalmology. Leveraging feedback mechanisms and advanced +algorithms, our approach iteratively refines initial labels, reducing reliance +on manual intervention while improving label quality. Through empirical +evaluation and case studies, we demonstrate the effectiveness of our proposed +strategies in deep learning tasks across medical imaging domains. Our results +highlight the importance of iterative refinement in automated data labeling to +enhance the capabilities of deep learning systems in medical imaging +applications. + +
+
+
+
+
+ + ☆ Comparative Analysis of Image Enhancement Techniques for Brain Tumor + Segmentation: Contrast, Histogram, and Hybrid Approaches CCS + + +
+ This study systematically investigates the impact of image enhancement +techniques on Convolutional Neural Network (CNN)-based Brain Tumor +Segmentation, focusing on Histogram Equalization (HE), Contrast Limited +Adaptive Histogram Equalization (CLAHE), and their hybrid variations. Employing +the U-Net architecture on a dataset of 3064 Brain MRI images, the research +delves into preprocessing steps, including resizing and enhancement, to +optimize segmentation accuracy. A detailed analysis of the CNN-based U-Net +architecture, training, and validation processes is provided. The comparative +analysis, utilizing metrics such as Accuracy, Loss, MSE, IoU, and DSC, reveals +that the hybrid approach CLAHE-HE consistently outperforms others. Results +highlight its superior accuracy (0.9982, 0.9939, 0.9936 for training, testing, +and validation, respectively) and robust segmentation overlap, with Jaccard +values of 0.9862, 0.9847, and 0.9864, and Dice values of 0.993, 0.9923, and +0.9932 for the same phases, emphasizing its potential in neuro-oncological +applications. The study concludes with a call for refinement in segmentation +methodologies to further enhance diagnostic precision and treatment planning in +neuro-oncology. + +
+
+ comment: 9 Pages, & Figures, 2 Tables, International Conference on Computer + Science Electronics and Information (ICCSEI 2023) +
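+
+ The enhancement variants compared in the study are standard OpenCV
+ operations; a minimal sketch follows, where the CLAHE parameters and the
+ ordering of the hybrid "CLAHE-HE" step are illustrative assumptions.
+
+```python
+import cv2
+import numpy as np
+
+def enhance(img_gray: np.ndarray, method: str = "clahe-he") -> np.ndarray:
+    """Histogram Equalization (HE), CLAHE, or the hybrid, applied to one
+    8-bit grayscale MRI slice."""
+    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+    if method == "he":
+        return cv2.equalizeHist(img_gray)
+    if method == "clahe":
+        return clahe.apply(img_gray)
+    if method == "clahe-he":                      # hybrid: CLAHE first, then HE
+        return cv2.equalizeHist(clahe.apply(img_gray))
+    raise ValueError(f"unknown method: {method}")
+
+slice_u8 = (np.random.rand(256, 256) * 255).astype(np.uint8)   # stand-in MRI slice
+enhanced = enhance(slice_u8, "clahe-he")
+```
+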
+
+
+
+
+ + ☆ Mask-ControlNet: Higher-Quality Image Generation with An Additional Mask + Prompt + + +
+ Text-to-image generation has witnessed great progress, especially with the +recent advancements in diffusion models. Since texts cannot provide detailed +conditions like object appearance, reference images are usually leveraged for +the control of objects in the generated images. However, existing methods still +suffer limited accuracy when the relationship between the foreground and +background is complicated. To address this issue, we develop a framework termed +Mask-ControlNet by introducing an additional mask prompt. Specifically, we +first employ large vision models to obtain masks to segment the objects of +interest in the reference image. Then, the object images are employed as +additional prompts to facilitate the diffusion model to better understand the +relationship between foreground and background regions during image generation. +Experiments show that the mask prompts enhance the controllability of the +diffusion model to maintain higher fidelity to the reference image while +achieving better image quality. Comparison with previous text-to-image +generation methods demonstrates our method's superior quantitative and +qualitative performance on the benchmark datasets. + +
+
+
+
+
+ + ☆ WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A + Conceptual Architecture + + +
+ This work proposes a WebXR-based cross-platform conceptual architecture, +leveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate +the development of an open, accessible, and interoperable metaverse. By +introducing the concept of spatial web app, this research contributes to the +discourse on the metaverse, offering an architecture that democratizes access +to virtual environments and extended reality through the web, and aligns with +Tim Berners-Lee's original vision of the World Wide Web as an open platform in +the digital realm. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ CLIPping the Limits: Finding the Sweet Spot for Relevant Images in + Automated Driving Systems Perception Testing + + +
+ Perception systems, especially cameras, are the eyes of automated driving +systems. Ensuring that they function reliably and robustly is therefore an +important building block in the automation of vehicles. There are various +approaches to test the perception of automated driving systems. Ultimately, +however, it always comes down to the investigation of the behavior of +perception systems under specific input data. Camera images are a crucial part +of the input data. Image data sets are therefore collected for the testing of +automated driving systems, but it is non-trivial to find specific images in +these data sets. Thanks to recent developments in neural networks, there are +now methods for sorting the images in a data set according to their similarity +to a prompt in natural language. In order to further automate the provision of +search results, we make a contribution by automating the threshold definition +in these sorted results and returning only the images relevant to the prompt as +a result. Our focus is on preventing false positives and false negatives +equally. It is also important that our method is robust and in the case that +our assumptions are not fulfilled, we provide a fallback solution. + +
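+
+ The core retrieval step (rank images by CLIP similarity to a natural-language
+ prompt, then keep only those above a threshold) can be sketched with the
+ Hugging Face CLIP implementation; the fixed threshold below is a placeholder
+ for the automated threshold selection the abstract describes.
+
+```python
+import torch
+from PIL import Image
+from transformers import CLIPModel, CLIPProcessor
+
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+def relevant_images(image_paths, prompt, threshold=0.25):
+    """Return (path, similarity) pairs for images whose CLIP cosine
+    similarity to the prompt exceeds the threshold, best first."""
+    images = [Image.open(p).convert("RGB") for p in image_paths]
+    inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True)
+    with torch.no_grad():
+        img_emb = model.get_image_features(pixel_values=inputs["pixel_values"])
+        txt_emb = model.get_text_features(input_ids=inputs["input_ids"],
+                                          attention_mask=inputs["attention_mask"])
+    sims = torch.nn.functional.cosine_similarity(img_emb, txt_emb)
+    order = sims.argsort(descending=True).tolist()
+    return [(image_paths[i], float(sims[i])) for i in order if sims[i] >= threshold]
+```
+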
+
+
+
+
+ + ☆ Human Detection from 4D Radar Data in Low-Visibility Field Conditions ICRA 2024 + + +
+ Autonomous driving technology is increasingly being used on public roads and +in industrial settings such as mines. While it is essential to detect +pedestrians, vehicles, or other obstacles, adverse field conditions negatively +affect the performance of classical sensors such as cameras or lidars. Radar, +on the other hand, is a promising modality that is less affected by, e.g., +dust, smoke, water mist or fog. In particular, modern 4D imaging radars provide +target responses across the range, vertical angle, horizontal angle and Doppler +velocity dimensions. We propose TMVA4D, a CNN architecture that leverages this +4D radar modality for semantic segmentation. The CNN is trained to distinguish +between the background and person classes based on a series of 2D projections +of the 4D radar data that include the elevation, azimuth, range, and Doppler +velocity dimensions. We also outline the process of compiling a novel dataset +consisting of data collected in industrial settings with a car-mounted 4D radar +and describe how the ground-truth labels were generated from reference thermal +images. Using TMVA4D on this dataset, we achieve an mIoU score of 78.2% and an +mDice score of 86.1%, evaluated on the two classes background and person + +
+
+ comment: Submitted to Radar in Robotics workshop at ICRA 2024 +
+
+
+
+
+ + ☆ Texture Classification Network Integrating Adaptive Wavelet Transform + + +
+ Graves' disease is a common condition that is diagnosed clinically by +determining the smoothness of the thyroid texture and its morphology in +ultrasound images. Currently, the most widely used approach for the automated +diagnosis of Graves' disease utilizes Convolutional Neural Networks (CNNs) for +both feature extraction and classification. However, these methods demonstrate +limited efficacy in capturing texture features. Given the high capacity of +wavelets in describing texture features, this research integrates learnable +wavelet modules utilizing the Lifting Scheme into CNNs and incorporates a +parallel wavelet branch into the ResNet18 model to enhance texture feature +extraction. Our model can analyze texture features in spatial and frequency +domains simultaneously, leading to optimized classification accuracy. We +conducted experiments on collected ultrasound datasets and publicly available +natural image texture datasets. Our proposed network achieved 97.27% accuracy +and 95.60% recall on the ultrasound datasets and 60.765% accuracy on the +natural image texture datasets, surpassing the accuracy of ResNet and +confirming the effectiveness of our approach. + +
+
+
+
+
+ + ☆ MindSet: Vision. A toolbox for testing DNNs on key psychological + experiments + + +
+ Multiple benchmarks have been developed to assess the alignment between deep +neural networks (DNNs) and human vision. In almost all cases these benchmarks +are observational in the sense they are composed of behavioural and brain +responses to naturalistic images that have not been manipulated to test +hypotheses regarding how DNNs or humans perceive and identify objects. Here we +introduce the toolbox MindSet: Vision, consisting of a collection of image +datasets and related scripts designed to test DNNs on 30 psychological +findings. In all experimental conditions, the stimuli are systematically +manipulated to test specific hypotheses regarding human visual perception and +object recognition. In addition to providing pre-generated datasets of images, +we provide code to regenerate these datasets, offering many configurable +parameters which greatly extend the dataset versatility for different research +contexts, and code to facilitate the testing of DNNs on these image datasets +using three different methods (similarity judgments, out-of-distribution +classification, and decoder method), accessible at +https://github.com/MindSetVision/mindset-vision. We test ResNet-152 on each of +these methods as an example of how the toolbox can be used. + +
+
+
+
+
+ + ☆ Detecting Every Object from Events + + +
+ Object detection is critical in autonomous driving, and it is more practical +yet challenging to localize objects of unknown categories: an endeavour known +as Class-Agnostic Object Detection (CAOD). Existing studies on CAOD +predominantly rely on ordinary cameras, but these frame-based sensors usually +have high latency and limited dynamic range, leading to safety risks in +real-world scenarios. In this study, we turn to a new modality enabled by the +so-called event camera, featured by its sub-millisecond latency and high +dynamic range, for robust CAOD. We propose Detecting Every Object in Events +(DEOE), an approach tailored for achieving high-speed, class-agnostic +open-world object detection in event-based vision. Built upon the fast +event-based backbone: recurrent vision transformer, we jointly consider the +spatial and temporal consistencies to identify potential objects. The +discovered potential objects are assimilated as soft positive samples to avoid +being suppressed as background. Moreover, we introduce a disentangled +objectness head to separate the foreground-background classification and novel +object discovery tasks, enhancing the model's generalization in localizing +novel objects while maintaining a strong ability to filter out the background. +Extensive experiments confirm the superiority of our proposed DEOE in +comparison with three strong baseline methods that integrate the +state-of-the-art event-based object detector with advancements in RGB-based +CAOD. Our code is available at https://github.com/Hatins/DEOE. + +
+
+
+
+
+ + ☆ MOSE: Boosting Vision-based Roadside 3D Object Detection with Scene Cues + + +
+ 3D object detection based on roadside cameras is an additional way for +autonomous driving to alleviate the challenges of occlusion and short +perception range from vehicle cameras. Previous methods for roadside 3D object +detection mainly focus on modeling the depth or height of objects, neglecting +the stationarity of cameras and the characteristic of inter-frame consistency. +In this work, we propose a novel framework, namely MOSE, for MOnocular 3D +object detection with Scene cuEs. The scene cues are the frame-invariant +scene-specific features, which are crucial for object localization and can be +intuitively regarded as the height between the surface of the real road and the +virtual ground plane. In the proposed framework, a scene cue bank is designed +to aggregate scene cues from multiple frames of the same scene with a carefully +designed extrinsic augmentation strategy. Then, a transformer-based decoder +lifts the aggregated scene cues as well as the 3D position embeddings for 3D +object location, which boosts generalization ability in heterologous scenes. +The extensive experiment results on two public benchmarks demonstrate the +state-of-the-art performance of the proposed method, which surpasses the +existing methods by a large margin. + +
+
+
+
+
+ + ☆ Deep Optics for Video Snapshot Compressive Imaging ICCV 2023 + + +
+ Video snapshot compressive imaging (SCI) aims to capture a sequence of video +frames with only a single shot of a 2D detector, whose backbones rest in +optical modulation patterns (also known as masks) and a computational +reconstruction algorithm. Advanced deep learning algorithms and mature hardware +are putting video SCI into practical applications. Yet, there are two clouds in +the sunshine of SCI: i) low dynamic range as a victim of high temporal +multiplexing, and ii) existing deep learning algorithms' degradation on real +system. To address these challenges, this paper presents a deep optics +framework to jointly optimize masks and a reconstruction network. Specifically, +we first propose a new type of structural mask to realize motion-aware and +full-dynamic-range measurement. Considering the motion awareness property in +measurement domain, we develop an efficient network for video SCI +reconstruction using Transformer to capture long-term temporal dependencies, +dubbed Res2former. Moreover, sensor response is introduced into the forward +model of video SCI to guarantee end-to-end model training close to real system. +Finally, we implement the learned structural masks on a digital micro-mirror +device. Experimental results on synthetic and real data validate the +effectiveness of the proposed framework. We believe this is a milestone for +real-world video SCI. The source code and data are available at +https://github.com/pwangcs/DeepOpticsSCI. + +
+
+ comment: Accepted at ICCV 2023 +
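+
+ For context, the standard video SCI forward model sums mask-modulated frames
+ into a single 2D snapshot; the few lines below sketch that measurement step
+ (the learned structural masks, sensor response, and noise discussed in the
+ paper are not modeled here).
+
+```python
+import numpy as np
+
+def sci_measurement(frames: np.ndarray, masks: np.ndarray) -> np.ndarray:
+    """Single-shot SCI measurement: each frame is modulated by its own mask
+    and the modulated frames are summed on the detector.
+    frames, masks: (T, H, W) arrays."""
+    return (frames * masks).sum(axis=0)
+
+T, H, W = 8, 64, 64
+frames = np.random.rand(T, H, W)
+masks = (np.random.rand(T, H, W) > 0.5).astype(np.float32)
+snapshot = sci_measurement(frames, masks)   # (H, W) compressed measurement
+```
+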
+
+
+
+
+ + ☆ MC$^2$: Multi-concept Guidance for Customized Multi-concept Generation + + +
+ Customized text-to-image generation aims to synthesize instantiations of +user-specified concepts and has achieved unprecedented progress in handling +individual concept. However, when extending to multiple customized concepts, +existing methods exhibit limitations in terms of flexibility and fidelity, only +accommodating the combination of limited types of models and potentially +resulting in a mix of characteristics from different concepts. In this paper, +we introduce the Multi-concept guidance for Multi-concept customization, termed +MC$^2$, for improved flexibility and fidelity. MC$^2$ decouples the +requirements for model architecture via inference time optimization, allowing +the integration of various heterogeneous single-concept customized models. It +adaptively refines the attention weights between visual and textual tokens, +directing image regions to focus on their associated words while diminishing +the impact of irrelevant ones. Extensive experiments demonstrate that MC$^2$ +even surpasses previous methods that require additional training in terms of +consistency with input prompt and reference images. Moreover, MC$^2$ can be +extended to elevate the compositional capabilities of text-to-image generation, +yielding appealing results. Code will be publicly available at +https://github.com/JIANGJiaXiu/MC-2. + +
+
+
+
+
+ + ☆ Unbridled Icarus: A Survey of the Potential Perils of Image Inputs in + Multimodal Large Language Model Security + + +
+ Multimodal Large Language Models (MLLMs) demonstrate remarkable capabilities +that increasingly influence various aspects of our daily lives, constantly +defining the new boundary of Artificial General Intelligence (AGI). Image +modalities, enriched with profound semantic information and a more continuous +mathematical nature compared to other modalities, greatly enhance the +functionalities of MLLMs when integrated. However, this integration serves as a +double-edged sword, providing attackers with expansive vulnerabilities to +exploit for highly covert and harmful attacks. The pursuit of reliable AI +systems like powerful MLLMs has emerged as a pivotal area of contemporary +research. In this paper, we endeavor to demonstrate the multifaceted risks +associated with the incorporation of image modalities into MLLMs. Initially, we +delineate the foundational components and training processes of MLLMs. +Subsequently, we construct a threat model, outlining the security +vulnerabilities intrinsic to MLLMs. Moreover, we analyze and summarize existing +scholarly discourses on MLLMs' attack and defense mechanisms, culminating in +suggestions for future research on MLLM security. Through this comprehensive +analysis, we aim to deepen the academic understanding of MLLM security +challenges and propel forward the development of trustworthy MLLM systems. + +
+
+ comment: 8 pages, 1 figure +
+
+
+
+
+ + ☆ Unsupervised Band Selection Using Fused HSI and LiDAR Attention + Integrating With Autoencoder + + +
+ Band selection in hyperspectral imaging (HSI) is critical for optimising data
+processing and enhancing analytical accuracy. Traditional approaches have
+predominantly concentrated on analysing spectral and pixel characteristics
+within individual bands independently. These approaches overlook the potential
+benefits of integrating multiple data sources, such as Light Detection and
+Ranging (LiDAR), and are further challenged by the limited availability of
+labeled data in HSI processing, which represents a significant obstacle. To
+address these challenges, this paper introduces a novel unsupervised band
+selection framework that incorporates attention mechanisms and an Autoencoder
+for reconstruction-based band selection. Our methodology distinctively
+integrates HSI with LiDAR data through an attention score, using a
+convolutional Autoencoder to process the combined feature mask. This fusion
+effectively captures essential spatial and spectral features and reduces
+redundancy in hyperspectral datasets. A comprehensive comparative analysis of
+our innovative fused band selection approach is performed against existing
+unsupervised band selection and fusion models. We used datasets such as
+Houston 2013, Trento, and MUUFLE for our experiments. The results demonstrate
+that our method achieves superior classification accuracy and significantly
+outperforms existing models. This enhancement in HSI band selection,
+facilitated by the incorporation of LiDAR features, underscores the
+considerable advantages of integrating features from different sources.
+
+
+
+ comment: 13 pages, 13 figures, 6 tables
+
+
+
+
+
+ + ☆ Text-to-Image Synthesis for Any Artistic Styles: Advancements in + Personalized Artistic Image Generation via Subdivision and Dual Binding + + +
+ Recent advancements in text-to-image models, such as Stable Diffusion, have
+demonstrated their ability to synthesize visual images through natural language
+prompts. One approach to personalizing text-to-image models, exemplified by
+DreamBooth, fine-tunes the pre-trained model by binding unique text identifiers
+with a few images of a specific subject. Although existing fine-tuning methods
+have demonstrated competence in rendering images according to the styles of
+famous painters, it is still challenging to learn to produce images
+encapsulating distinct art styles due to abstract and broad visual perceptions
+of stylistic attributes such as lines, shapes, textures, and colors. In this
+paper, we introduce a new method, Single-StyleForge, for personalization. It
+fine-tunes pre-trained text-to-image diffusion models to generate diverse
+images in specified styles from text prompts. By using around 15-20 images of
+the target style, the approach establishes a foundational binding of a unique
+token identifier with a broad range of the target style. It also utilizes
+auxiliary images to strengthen this binding, offering specific guidance on
+representing elements such as persons in a manner consistent with the target
+style. In addition, we present ways to improve the quality of style and
+text-image alignment through a method called Multi-StyleForge, which inherits
+the strategy used in StyleForge and learns multiple tokens. Experimental
+evaluation conducted on six distinct artistic styles demonstrates substantial
+improvements in both the quality of generated images and the perceptual
+fidelity metrics, such as FID, KID, and CLIP scores.
+
+
+
+ comment: 20 pages, 12 figures
+
+
+
+
+
+ + ☆ CodeEnhance: A Codebook-Driven Approach for Low-Light Image Enhancement + + +
+ Low-light image enhancement (LLIE) aims to improve low-illumination images.
+However, existing methods face two challenges: (1) uncertainty in restoration
+from diverse brightness degradations; (2) loss of texture and color information
+caused by noise suppression and light enhancement. In this paper, we propose a
+novel enhancement approach, CodeEnhance, by leveraging quantized priors and
+image refinement to address these challenges. In particular, we reframe LLIE as
+learning an image-to-code mapping from low-light images to a discrete codebook,
+which has been learned from high-quality images. To enhance this process, a
+Semantic Embedding Module (SEM) is introduced to integrate semantic information
+with low-level features, and a Codebook Shift (CS) mechanism is designed to
+adapt the pre-learned codebook to better suit the distinct characteristics of
+our low-light dataset. Additionally, we present an Interactive Feature
+Transformation (IFT) module to refine texture and color information during
+image reconstruction, allowing for interactive enhancement based on user
+preferences. Extensive experiments on both real-world and synthetic benchmarks
+demonstrate that the incorporation of prior knowledge and controllable
+information transfer significantly enhances LLIE performance in terms of
+quality and fidelity. The proposed CodeEnhance exhibits superior robustness to
+various degradations, including uneven illumination, noise, and color
+distortion.
+
+
+
+ comment: 10 pages, 13 figures +
+
+
+
+
+ + ☆ Allowing humans to interactively guide machines where to look does not + always improve a human-AI team's classification accuracy CVPR + 2024 + + +
+ Via thousands of papers in Explainable AI (XAI), attention maps
+\cite{vaswani2017attention} and feature attribution maps \cite{bansal2020sam}
+have been established as a common means for explaining the input features that
+are important to AI's decisions. It is an interesting but unexplored question
+whether allowing users to edit the importance scores of input features at test
+time would improve the human-AI team's accuracy on downstream tasks. In this
+paper, we address this question by taking CHM-Corr, a state-of-the-art,
+ante-hoc explanation method \cite{taesiri2022visual} that first predicts
+patch-wise correspondences between the input and the training-set images, and
+then uses them to make classification decisions. We build an interactive
+interface on top of CHM-Corr, enabling users to directly edit the initial
+feature attribution map provided by CHM-Corr. Via our CHM-Corr++ interface,
+users gain insights into if, when, and how the model changes its outputs,
+enhancing understanding beyond static explanations. Our user study with 18
+machine learning researchers who performed $\sim$1,400 decisions shows that our
+interactive approach does not improve user accuracy on CUB-200 bird image
+classification over static explanations. This challenges the belief that
+interactivity inherently boosts XAI
+effectiveness~\cite{sokol2020one,sun2022exploring,shen2024towards,singh2024rethinking,mindlin2024beyond,lakkaraju2022rethinking,cheng2019explaining,liu2021understanding}
+and raises the need for future research. Our work contributes to the field by
+open-sourcing an interactive tool for manipulating model attention, and it lays
+the groundwork for future research to enable effective human-AI interaction in
+computer vision. We release code and data on
+\href{https://anonymous.4open.science/r/CHMCorrPlusPlus/}{github}. Our
+interface is available \href{http://137.184.82.109:7080/}{here}.
+
+
+
+ comment: Accepted for presentation at the XAI4CV Workshop, part of the CVPR + 2024 proceedings +
+
+
+
+
+ + ☆ Stylizing Sparse-View 3D Scenes with Hierarchical Neural Representation + + +
+ Recently, a surge of 3D style transfer methods has been proposed that +leverage the scene reconstruction power of a pre-trained neural radiance field +(NeRF). To successfully stylize a scene this way, one must first reconstruct a +photo-realistic radiance field from collected images of the scene. However, +when only sparse input views are available, pre-trained few-shot NeRFs often +suffer from high-frequency artifacts, which are generated as a by-product of +high-frequency details for improving reconstruction quality. Is it possible to +generate more faithful stylized scenes from sparse inputs by directly +optimizing encoding-based scene representation with target style? In this +paper, we consider the stylization of sparse-view scenes in terms of +disentangling content semantics and style textures. We propose a coarse-to-fine +sparse-view scene stylization framework, where a novel hierarchical +encoding-based neural representation is designed to generate high-quality +stylized scenes directly from implicit scene representations. We also propose a +new optimization strategy with content strength annealing to achieve realistic +stylization and better content preservation. Extensive experiments demonstrate +that our method can achieve high-quality stylization of sparse-view scenes and +outperforms fine-tuning-based baselines in terms of stylization quality and +efficiency. + +
+
+
+
+
+ + ☆ PromptAD: Learning Prompts with only Normal Samples for Few-Shot Anomaly + Detection CVPR2024 + + +
+ The vision-language model has brought great improvement to few-shot
+industrial anomaly detection, which usually requires the design of hundreds of
+prompts through prompt engineering. For automated scenarios, we first use
+conventional prompt learning with the many-class paradigm as the baseline to
+automatically learn prompts, but find that it does not work well in one-class
+anomaly detection. To address the above problem, this paper proposes a
+one-class prompt learning method for few-shot anomaly detection, termed
+PromptAD. First, we propose semantic concatenation which can transpose normal
+prompts into anomaly prompts by concatenating normal prompts with anomaly
+suffixes, thus constructing a large number of negative samples used to guide
+prompt learning in the one-class setting. Furthermore, to mitigate the training
+challenge caused by the absence of anomaly images, we introduce the concept of
+explicit anomaly margin, which is used to explicitly control the margin between
+normal prompt features and anomaly prompt features through a hyper-parameter.
+For image-level/pixel-level anomaly detection, PromptAD achieves first place in
+11/12 few-shot settings on MVTec and VisA.
+
+
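+ As a rough illustration of the explicit anomaly margin described above, the
+following Python sketch penalizes cases where the closest anomaly prompt is not
+at least a fixed margin farther from a normal-image feature than the closest
+normal prompt. The function name, feature shapes, cosine-distance choice, and
+margin value are illustrative assumptions, not the authors' exact formulation.
+
+import torch
+import torch.nn.functional as F
+
+def explicit_anomaly_margin_loss(img_feat, normal_prompts, anomaly_prompts,
+                                 margin=0.2):
+    """img_feat: (B, D) features of normal images; prompts: (P, D) text features."""
+    img = F.normalize(img_feat, dim=-1)
+    d_norm = 1 - img @ F.normalize(normal_prompts, dim=-1).T   # (B, Pn) cosine distance
+    d_anom = 1 - img @ F.normalize(anomaly_prompts, dim=-1).T  # (B, Pa)
+    # Penalize whenever the nearest anomaly prompt is not at least `margin`
+    # farther from the image feature than the nearest normal prompt.
+    return F.relu(d_norm.min(dim=1).values - d_anom.min(dim=1).values + margin).mean()
+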
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ LayoutLLM: Layout Instruction Tuning with Large Language Models for + Document Understanding CVPR 2024 + + +
+ Recently, leveraging large language models (LLMs) or multimodal large +language models (MLLMs) for document understanding has been proven very +promising. However, previous works that employ LLMs/MLLMs for document +understanding have not fully explored and utilized the document layout +information, which is vital for precise document understanding. In this paper, +we propose LayoutLLM, an LLM/MLLM based method for document understanding. The +core of LayoutLLM is a layout instruction tuning strategy, which is specially +designed to enhance the comprehension and utilization of document layouts. The +proposed layout instruction tuning strategy consists of two components: +Layout-aware Pre-training and Layout-aware Supervised Fine-tuning. To capture +the characteristics of document layout in Layout-aware Pre-training, three +groups of pre-training tasks, corresponding to document-level, region-level and +segment-level information, are introduced. Furthermore, a novel module called +layout chain-of-thought (LayoutCoT) is devised to enable LayoutLLM to focus on +regions relevant to the question and generate accurate answers. LayoutCoT is +effective for boosting the performance of document understanding. Meanwhile, it +brings a certain degree of interpretability, which could facilitate manual +inspection and correction. Experiments on standard benchmarks show that the +proposed LayoutLLM significantly outperforms existing methods that adopt +open-source 7B LLMs/MLLMs for document understanding. The training data of the +LayoutLLM is publicly available at +https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/DocumentUnderstanding/LayoutLLM + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ StylizedGS: Controllable Stylization for 3D Gaussian Splatting + + +
+ With the rapid development of XR, 3D generation and editing are becoming more
+and more important, among which stylization is an important tool for 3D
+appearance editing. It can achieve consistent 3D artistic stylization given a
+single reference style image and is thus a user-friendly way of editing.
+However, recent NeRF-based 3D stylization methods face efficiency issues that
+affect the actual user experience, and their implicit nature limits their
+ability to transfer geometric pattern styles. Additionally, the ability for
+artists to exert flexible control over stylized scenes is considered highly
+desirable, fostering an environment conducive to creative exploration. In this
+paper, we introduce StylizedGS, a 3D neural style transfer framework with
+adaptable control over perceptual factors based on the 3D Gaussian Splatting
+(3DGS) representation. The 3DGS brings the benefits of high efficiency. We
+propose a GS filter to eliminate, before stylization, floaters in the
+reconstruction that would otherwise affect the stylization results. Then the
+nearest neighbor-based style loss is introduced to achieve stylization by
+fine-tuning the geometry and color parameters of 3DGS, while a depth
+preservation loss with other regularizations is proposed to prevent tampering
+with the geometric content. Moreover, facilitated by specially designed losses,
+StylizedGS enables users to control the color, stylization scale, and regions
+during stylization, providing customized capabilities. Our method can attain
+high-quality stylization results characterized by faithful brushstrokes and
+geometric consistency with flexible controls. Extensive experiments across
+various scenes and styles demonstrate the effectiveness and efficiency of our
+method concerning both stylization quality and inference FPS.
+
+
+
+
+
+
+ + ☆ Multi-agent Long-term 3D Human Pose Forecasting via Interaction-aware + Trajectory Conditioning CVPR + + +
+ Human pose forecasting garners attention for its diverse applications. +However, challenges in modeling the multi-modal nature of human motion and +intricate interactions among agents persist, particularly with longer +timescales and more agents. In this paper, we propose an interaction-aware +trajectory-conditioned long-term multi-agent human pose forecasting model, +utilizing a coarse-to-fine prediction approach: multi-modal global trajectories +are initially forecasted, followed by respective local pose forecasts +conditioned on each mode. In doing so, our Trajectory2Pose model introduces a +graph-based agent-wise interaction module for a reciprocal forecast of local +motion-conditioned global trajectory and trajectory-conditioned local pose. Our +model effectively handles the multi-modality of human motion and the complexity +of long-term multi-agent interactions, improving performance in complex +environments. Furthermore, we address the lack of long-term (6s+) multi-agent +(5+) datasets by constructing a new dataset from real-world images and 2D +annotations, enabling a comprehensive evaluation of our proposed model. +State-of-the-art prediction performance on both complex and simpler datasets +confirms the generalized effectiveness of our method. The code is available at +https://github.com/Jaewoo97/T2P. + +
+
+ comment: 2024 CVPR Highlight +
+
+
+
+
+ + ☆ Spatio-Temporal Attention and Gaussian Processes for Personalized Video + Gaze Estimation CVPR 2024 + + +
+ Gaze is an essential prompt for analyzing human behavior and attention. +Recently, there has been an increasing interest in determining gaze direction +from facial videos. However, video gaze estimation faces significant +challenges, such as understanding the dynamic evolution of gaze in video +sequences, dealing with static backgrounds, and adapting to variations in +illumination. To address these challenges, we propose a simple and novel deep +learning model designed to estimate gaze from videos, incorporating a +specialized attention module. Our method employs a spatial attention mechanism +that tracks spatial dynamics within videos. This technique enables accurate +gaze direction prediction through a temporal sequence model, adeptly +transforming spatial observations into temporal insights, thereby significantly +improving gaze estimation accuracy. Additionally, our approach integrates +Gaussian processes to include individual-specific traits, facilitating the +personalization of our model with just a few labeled samples. Experimental +results confirm the efficacy of the proposed approach, demonstrating its +success in both within-dataset and cross-dataset settings. Specifically, our +proposed approach achieves state-of-the-art performance on the Gaze360 dataset, +improving by $2.5^\circ$ without personalization. Further, by personalizing the +model with just three samples, we achieved an additional improvement of +$0.8^\circ$. The code and pre-trained models are available at +\url{https://github.com/jswati31/stage}. + +
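+ One way to realize the few-sample personalization described above is to fit a
+Gaussian process from per-frame appearance features to the base model's
+residual gaze error on the handful of labeled calibration samples, and add the
+predicted correction at test time. The sketch below follows that generic recipe
+with scikit-learn; the inputs, kernel, and residual formulation are
+illustrative assumptions rather than the paper's exact Gaussian-process setup.
+
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process.kernels import RBF, WhiteKernel
+
+def personalize(feats_calib, labels_calib, preds_calib, feats_test, preds_test):
+    """Fit a GP on the base model's (yaw, pitch) residuals for a few labeled
+    samples, then correct the base predictions on unseen frames of that person."""
+    gp = GaussianProcessRegressor(kernel=RBF() + WhiteKernel(), normalize_y=True)
+    gp.fit(feats_calib, labels_calib - preds_calib)   # residual targets, shape (N, 2)
+    return preds_test + gp.predict(feats_test)        # corrected gaze, shape (M, 2)
+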
+
+ comment: Accepted at CVPR 2024 Gaze workshop +
+
+
+
+
+ + ☆ DiffCJK: Conditional Diffusion Model for High-Quality and Wide-coverage + CJK Character Generation + + +
+ Chinese, Japanese, and Korean (CJK), with a vast number of native speakers,
+have a profound influence on society and culture. The typesetting of CJK
+languages carries a wide range of requirements due to the complexity of their
+scripts and unique literary traditions. A critical aspect of this typesetting
+process is that CJK fonts need to provide a set of consistent-looking glyphs
+for approximately one hundred thousand characters. However, creating such a
+font is inherently labor-intensive and expensive, which significantly hampers
+the development of new CJK fonts for typesetting, historical, aesthetic, or
+artistic purposes.
+ To bridge this gap, we are motivated by recent advancements in
+diffusion-based generative models and propose a novel diffusion method for
+generating glyphs in a targeted style from a \emph{single} conditioned,
+standard glyph form. Our experiments show that our method is capable of
+generating fonts of both printed and hand-written styles, the latter of which
+presents a greater challenge. Moreover, our approach shows remarkable zero-shot
+generalization capabilities for non-CJK but Chinese-inspired scripts. We also
+show our method facilitates smooth style interpolation and generates bitmap
+images suitable for vectorization, which is crucial in the font creation
+process. In summary, our proposed method opens the door to high-quality,
+generative model-assisted font creation for CJK characters, for both
+typesetting and artistic endeavors.
+
+
+
+
+
+
+ + ☆ Multi-level Graph Subspace Contrastive Learning for Hyperspectral Image + Clustering IJCNN 2024 + + +
+ Hyperspectral image (HSI) clustering is a challenging task due to its high
+complexity. Although subspace clustering shows impressive performance for HSI,
+traditional methods tend to ignore the global-local interaction in HSI data. In
+this study, we propose multi-level graph subspace contrastive learning
+(MLGSC) for HSI clustering. The model is divided into the following main parts.
+Graph convolution subspace construction: utilizing spectral and texture
+features to construct two graph convolution views. Local-global graph
+representation: local graph representations are obtained by step-by-step
+convolutions, and a more representative global graph representation is obtained
+using an attention-based pooling strategy. Multi-level graph subspace
+contrastive learning: multi-level contrastive learning is conducted to obtain
+local-global joint graph representations, to improve the consistency of the
+positive samples between views, and to obtain more robust graph embeddings.
+Specifically, graph-level contrastive learning is used to better learn global
+representations of HSI data. Node-level intra-view and inter-view contrastive
+learning is designed to learn joint representations of local regions of HSI.
+The proposed model is evaluated on four popular HSI datasets: Indian Pines,
+Pavia University, Houston, and Xu Zhou. The overall accuracies are 97.75%,
+99.96%, 92.28%, and 95.73%, significantly outperforming the current
+state-of-the-art clustering methods.
+
+
+
+ comment: IJCNN 2024 +
+
+
+
+
+ + ☆ Bidirectional Long-Range Parser for Sequential Data Understanding + + +
+ The transformer is a powerful data modelling framework responsible for
+remarkable performance on a wide range of tasks. However, transformers are
+limited in terms of scalability, as they are suboptimal and inefficient at
+processing long-sequence data. To this end, we introduce BLRP (Bidirectional
+Long-Range Parser), a novel and versatile attention mechanism designed to
+increase performance and efficiency on long-sequence tasks. It leverages short-
+and long-range heuristics in the form of a local sliding window approach
+combined with a global bidirectional latent space synthesis technique. We show
+the benefits and versatility of our approach in the vision and language domains
+by demonstrating competitive results against state-of-the-art methods on the
+Long-Range-Arena and CIFAR benchmarks together with ablations demonstrating the
+computational efficiency.
+
+
+
+
+
+
+ + ☆ iVPT: Improving Task-relevant Information Sharing in Visual Prompt + Tuning by Cross-layer Dynamic Connection + + +
+ Recent progress has shown great potential of visual prompt tuning (VPT) when +adapting pre-trained vision transformers to various downstream tasks. However, +most existing solutions independently optimize prompts at each layer, thereby +neglecting the usage of task-relevant information encoded in prompt tokens +across layers. Additionally, existing prompt structures are prone to +interference from task-irrelevant noise in input images, which can do harm to +the sharing of task-relevant information. In this paper, we propose a novel VPT +approach, \textbf{iVPT}. It innovatively incorporates a cross-layer dynamic +connection (CDC) for input prompt tokens from adjacent layers, enabling +effective sharing of task-relevant information. Furthermore, we design a +dynamic aggregation (DA) module that facilitates selective sharing of +information between layers. The combination of CDC and DA enhances the +flexibility of the attention process within the VPT framework. Building upon +these foundations, iVPT introduces an attentive reinforcement (AR) mechanism, +by automatically identifying salient image tokens, which are further enhanced +by prompt tokens in an additive manner. Extensive experiments on 24 image +classification and semantic segmentation benchmarks clearly demonstrate the +advantage of the proposed iVPT, compared to the state-of-the-art counterparts. + +
+
+
+
+
+ + ☆ SoundingActions: Learning How Actions Sound from Narrated Egocentric + Videos CVPR 2024 + + +
+ We propose a novel self-supervised embedding to learn how actions sound from +narrated in-the-wild egocentric videos. Whereas existing methods rely on +curated data with known audio-visual correspondence, our multimodal +contrastive-consensus coding (MC3) embedding reinforces the associations +between audio, language, and vision when all modality pairs agree, while +diminishing those associations when any one pair does not. We show our approach +can successfully discover how the long tail of human actions sound from +egocentric video, outperforming an array of recent multimodal embedding +techniques on two datasets (Ego4D and EPIC-Sounds) and multiple cross-modal +tasks. + +
+
+ comment: Accepted at CVPR 2024. Project page: + https://vision.cs.utexas.edu/projects/soundingactions +
+
+
+
+
+ + ☆ A secure and private ensemble matcher using multi-vault obfuscated + templates + + +
+ Given the irrevocability of biometric samples and mounting privacy concerns, +biometric template security and secure matching are among the essential +features of any well-designed modern biometric system. In this paper, we +propose an obfuscation method that hides the biometric template information +with just enough chaff. The main idea is to reduce the number of chaff points +to a practical level by creating n sub-templates from the original template and +hiding each sub-template with m chaff points. During verification, s closest +vectors to the biometric query are retrieved from each vault and then combined +to generate hash values that are compared with the stored hash value. We +demonstrate the effectiveness of synthetic facial images, generated by a +Generative Adversarial Network (GAN), as ``random chaff points'' within a +secure-vault authorization system. This approach safeguards user identities +during training and deployment. We tested our protocol using the AT&T, GT, and +LFW face datasets, with the ROC areas under the curve being 0.99, 0.99, and +0.90, respectively. These numbers were close to those of the unprotected +templates, showing that our method does not adversely affect accuracy. + +
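+ Below is a minimal Python sketch of the vault construction and matching flow
+described in this abstract. The split count n, chaff count m, retrieval of a
+single nearest vector per vault, and the use of random Gaussian vectors in
+place of GAN-generated chaff are illustrative assumptions, not the authors'
+exact configuration.
+
+import hashlib
+import numpy as np
+
+def enroll(template, n=4, m=100, seed=0):
+    """Split a template into n sub-templates and hide each among m chaff vectors."""
+    rng = np.random.default_rng(seed)
+    subs = np.array_split(np.asarray(template, dtype=np.float32), n)
+    vaults, secret = [], b""
+    for sub in subs:
+        chaff = rng.normal(size=(m, sub.size)).astype(np.float32)  # stand-in for GAN chaff
+        vault = np.vstack([sub, chaff])
+        vault = vault[rng.permutation(len(vault))]  # shuffle so the genuine row is hidden
+        vaults.append(vault)
+        secret += sub.tobytes()
+    return vaults, hashlib.sha256(secret).hexdigest()
+
+def verify(query, vaults, stored_hash):
+    """Retrieve the closest vector in each vault, recombine, and compare hashes."""
+    subs = np.array_split(np.asarray(query, dtype=np.float32), len(vaults))
+    candidate = b""
+    for sub, vault in zip(subs, vaults):
+        nearest = vault[np.linalg.norm(vault - sub, axis=1).argmin()]
+        candidate += nearest.tobytes()
+    return hashlib.sha256(candidate).hexdigest() == stored_hash
+
+ If the query is a noisy version of the enrolled template, the nearest row in
+each vault is the genuine sub-template, the secret bytes are reproduced
+exactly, and the hashes match; an impostor query lands on chaff and fails.
+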
+
+
+
+
+ + ☆ HSViT: Horizontally Scalable Vision Transformer + + +
+ While the Vision Transformer (ViT) architecture gains prominence in computer +vision and attracts significant attention from multimedia communities, its +deficiency in prior knowledge (inductive bias) regarding shift, scale, and +rotational invariance necessitates pre-training on large-scale datasets. +Furthermore, the growing layers and parameters in both ViT and convolutional +neural networks (CNNs) impede their applicability to mobile multimedia +services, primarily owing to the constrained computational resources on edge +devices. To mitigate the aforementioned challenges, this paper introduces a +novel horizontally scalable vision transformer (HSViT). Specifically, a novel +image-level feature embedding allows ViT to better leverage the inductive bias +inherent in the convolutional layers. Based on this, an innovative horizontally +scalable architecture is designed, which reduces the number of layers and +parameters of the models while facilitating collaborative training and +inference of ViT models across multiple nodes. The experimental results depict +that, without pre-training on large-scale datasets, HSViT achieves up to 10% +higher top-1 accuracy than state-of-the-art schemes, ascertaining its superior +preservation of inductive bias. The code is available at +https://github.com/xuchenhao001/HSViT. + +
+
+
+
+
+ + ☆ LGSDF: Continual Global Learning of Signed Distance Fields Aided by + Local Updating + + +
+ Implicit reconstruction of ESDF (Euclidean Signed Distance Field) involves +training a neural network to regress the signed distance from any point to the +nearest obstacle, which has the advantages of lightweight storage and +continuous querying. However, existing algorithms usually rely on conflicting +raw observations as training data, resulting in poor map performance. In this +paper, we propose LGSDF, an ESDF continual Global learning algorithm aided by +Local updating. At the front end, axis-aligned grids are dynamically updated by +pre-processed sensor observations, where incremental fusion alleviates +estimation error caused by limited viewing directions. At the back end, a +randomly initialized implicit ESDF neural network performs continual +self-supervised learning guided by these grids to generate smooth and +continuous maps. The results on multiple scenes show that LGSDF can construct +more accurate ESDF maps and meshes compared with SOTA (State Of The Art) +explicit and implicit mapping algorithms. The source code of LGSDF is publicly +available at https://github.com/BIT-DYN/LGSDF. + +
+
+
+
+
+ + ☆ Progressive Alignment with VLM-LLM Feature to Augment Defect + Classification for the ASE Dataset + + +
+ Traditional defect classification approaches face two barriers. (1)
+Insufficient training data and unstable data quality. Collecting sufficient
+defective samples is expensive and time-consuming, which leads to dataset
+variance and makes recognition and learning difficult. (2) Over-dependence on
+the visual modality. When the image pattern and texture are monotonic for all
+defect classes in a given dataset, the performance of a conventional AOI system
+cannot be guaranteed. In scenarios where image quality is compromised due to
+mechanical failures or when defect information is inherently difficult to
+discern, the performance of deep models cannot be guaranteed. A main question
+is, "how can these two problems be solved when they occur at the same time?" A
+feasible strategy is to explore additional features within the dataset and to
+combine an eminent vision-language model (VLM) and large language model (LLM)
+with their astonishing zero-shot capability. In this work, we propose the
+special ASE dataset for defect classification, which includes rich data
+descriptions recorded with each image, although its defect features are
+difficult to learn directly. Second, we present prompting for the VLM-LLM on
+defect classification with the proposed ASE dataset to activate extra-modality
+features from images and enhance performance. Then, we design the novel
+progressive feature alignment (PFA) block to refine image-text features and
+alleviate the difficulty of alignment under the few-shot scenario. Finally, the
+proposed Cross-modality attention fusion (CMAF) module can effectively fuse
+features from different modalities. Experimental results demonstrate our
+method's effectiveness over several defect classification methods on the ASE
+dataset.
+
+
+
+ comment: MULA 2024 +
+
+
+
+
+ + ☆ Adaptive Learning for Multi-view Stereo Reconstruction + + +
+ Deep learning has recently demonstrated its excellent performance on the task +of multi-view stereo (MVS). However, loss functions applied for deep MVS are +rarely studied. In this paper, we first analyze existing loss functions' +properties for deep depth based MVS approaches. Regression based loss leads to +inaccurate continuous results by computing mathematical expectation, while +classification based loss outputs discretized depth values. To this end, we +then propose a novel loss function, named adaptive Wasserstein loss, which is +able to narrow down the difference between the true and predicted probability +distributions of depth. Besides, a simple but effective offset module is +introduced to better achieve sub-pixel prediction accuracy. Extensive +experiments on different benchmarks, including DTU, Tanks and Temples and +BlendedMVS, show that the proposed method with the adaptive Wasserstein loss +and the offset module achieves state-of-the-art performance. + +
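+ For intuition, the Wasserstein-1 distance between two distributions over
+ordered depth bins has a simple closed form: the L1 difference of their
+cumulative distributions. The PyTorch sketch below implements that generic form
+with a one-hot target; the adaptive weighting and the offset module of the
+paper are not reproduced, and the names and shapes are illustrative
+assumptions.
+
+import torch
+import torch.nn.functional as F
+
+def wasserstein_depth_loss(pred_logits, gt_depth, depth_bins):
+    """pred_logits: (B, D) scores over D depth hypotheses; gt_depth: (B,) true
+    depths; depth_bins: (D,) bin centers in increasing order."""
+    prob = torch.softmax(pred_logits, dim=-1)
+    # One-hot target on the nearest bin (a soft target could be used instead).
+    target_idx = (depth_bins[None, :] - gt_depth[:, None]).abs().argmin(dim=-1)
+    target = F.one_hot(target_idx, depth_bins.numel()).float()
+    # W1 between 1D distributions equals the L1 distance between their CDFs.
+    cdf_diff = torch.cumsum(prob, dim=-1) - torch.cumsum(target, dim=-1)
+    return cdf_diff.abs().sum(dim=-1).mean()
+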
+
+
+
+
+ + ☆ GloSoFarID: Global multispectral dataset for Solar Farm IDentification + in satellite imagery + + +
+ Solar Photovoltaic (PV) technology is increasingly recognized as a pivotal +solution in the global pursuit of clean and renewable energy. This technology +addresses the urgent need for sustainable energy alternatives by converting +solar power into electricity without greenhouse gas emissions. It not only +curtails global carbon emissions but also reduces reliance on finite, +non-renewable energy sources. In this context, monitoring solar panel farms +becomes essential for understanding and facilitating the worldwide shift toward +clean energy. This study contributes to this effort by developing the first +comprehensive global dataset of multispectral satellite imagery of solar panel +farms. This dataset is intended to form the basis for training robust machine +learning models, which can accurately map and analyze the expansion and +distribution of solar panel farms globally. The insights gained from this +endeavor will be instrumental in guiding informed decision-making for a +sustainable energy future. https://github.com/yzyly1992/GloSoFarID + +
+
+
+
+
+ + ☆ QMix: Quality-aware Learning with Mixed Noise for Robust Retinal Disease + Diagnosis + + +
+ Due to the complexity of medical image acquisition and the difficulty of +annotation, medical image datasets inevitably contain noise. Noisy data with +wrong labels affects the robustness and generalization ability of deep neural +networks. Previous noise learning methods mainly considered noise arising from +images being mislabeled, i.e. label noise, assuming that all mislabeled images +are of high image quality. However, medical images are prone to suffering +extreme quality issues, i.e. data noise, where discriminative visual features +are missing for disease diagnosis. In this paper, we propose a noise learning +framework, termed as QMix, that learns a robust disease diagnosis model under +mixed noise. QMix alternates between sample separation and quality-aware +semisupervised training in each training epoch. In the sample separation phase, +we design a joint uncertainty-loss criterion to effectively separate (1) +correctly labeled images; (2) mislabeled images with high quality and (3) +mislabeled images with low quality. In the semi-supervised training phase, we +train a disease diagnosis model to learn robust feature representation from the +separated samples. Specifically, we devise a sample-reweighing loss to mitigate +the effect of mislabeled images with low quality during training. Meanwhile, a +contrastive enhancement loss is proposed to further distinguish mislabeled +images with low quality from correctly labeled images. QMix achieved +state-of-the-art disease diagnosis performance on five public retinal image +datasets and exhibited substantial improvement on robustness against mixed +noise. + +
+
+
+
+
+ + ☆ Semantic Flow: Learning Semantic Field of Dynamic Scenes from Monocular + Videos ICLR 2024 + + +
+ In this work, we pioneer Semantic Flow, a neural semantic representation of
+dynamic scenes from monocular videos. In contrast to previous NeRF methods that
+reconstruct dynamic scenes from the colors and volume densities of individual
+points, Semantic Flow learns semantics from continuous flows that contain rich
+3D motion information. As there is a 2D-to-3D ambiguity problem in the viewing
+direction when extracting 3D flow features from 2D video frames, we consider
+the volume densities as opacity priors that describe the contributions of flow
+features to the semantics on the frames. More specifically, we first learn a
+flow network to predict flows in the dynamic scene, and propose a flow feature
+aggregation module to extract flow features from video frames. Then, we propose
+a flow attention module to extract motion information from flow features, which
+is followed by a semantic network to output semantic logits of flows. We
+integrate the logits with volume densities in the viewing direction to
+supervise the flow features with semantic labels on video frames. Experimental
+results show that our model is able to learn from multiple dynamic scenes and
+supports a series of new tasks such as instance-level scene editing, semantic
+completions, dynamic scene tracking and semantic adaptation on novel scenes.
+Codes are available at https://github.com/tianfr/Semantic-Flow/.
+
+
+
+ comment: Accepted by ICLR 2024, Codes are available at + https://github.com/tianfr/Semantic-Flow/ +
+
+
+
+
+ + ☆ UniMix: Towards Domain Adaptive and Generalizable LiDAR Semantic + Segmentation in Adverse Weather CVPR 2024 + + +
+ LiDAR semantic segmentation (LSS) is a critical task in autonomous driving +and has achieved promising progress. However, prior LSS methods are +conventionally investigated and evaluated on datasets within the same domain in +clear weather. The robustness of LSS models in unseen scenes and all weather +conditions is crucial for ensuring safety and reliability in real applications. +To this end, we propose UniMix, a universal method that enhances the +adaptability and generalizability of LSS models. UniMix first leverages +physically valid adverse weather simulation to construct a Bridge Domain, which +serves to bridge the domain gap between the clear weather scenes and the +adverse weather scenes. Then, a Universal Mixing operator is defined regarding +spatial, intensity, and semantic distributions to create the intermediate +domain with mixed samples from given domains. Integrating the proposed two +techniques into a teacher-student framework, UniMix efficiently mitigates the +domain gap and enables LSS models to learn weather-robust and domain-invariant +representations. We devote UniMix to two main setups: 1) unsupervised domain +adaption, adapting the model from the clear weather source domain to the +adverse weather target domain; 2) domain generalization, learning a model that +generalizes well to unseen scenes in adverse weather. Extensive experiments +validate the effectiveness of UniMix across different tasks and datasets, all +achieving superior performance over state-of-the-art methods. The code will be +released. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Enhancing Clinical Efficiency through LLM: Discharge Note Generation for + Cardiac Patients + + +
+ Medical documentation, including discharge notes, is crucial for ensuring
+patient care quality, continuity, and effective medical communication. However,
+the manual creation of these documents is not only time-consuming but also
+prone to inconsistencies and potential errors. The automation of this
+documentation process using artificial intelligence (AI) represents a promising
+area of innovation in healthcare. This study directly addresses the
+inefficiencies and inaccuracies in creating discharge notes manually,
+particularly for cardiac patients, by employing AI techniques, specifically
+large language models (LLMs). Utilizing a substantial dataset from a cardiology
+center, encompassing wide-ranging medical records and physician assessments,
+our research evaluates the capability of LLMs to enhance the documentation
+process. Among the various models assessed, Mistral-7B distinguished itself by
+accurately generating discharge notes that significantly improve both
+documentation efficiency and the continuity of care for patients. These notes
+underwent rigorous qualitative evaluation by medical experts, receiving high
+marks for their clinical relevance, completeness, readability, and contribution
+to informed decision-making and care planning. Coupled with quantitative
+analyses, these results confirm Mistral-7B's efficacy in distilling complex
+medical information into concise, coherent summaries. Overall, our findings
+illuminate the considerable promise of specialized LLMs, such as Mistral-7B, in
+refining healthcare documentation workflows and advancing patient care. This
+study lays the groundwork for further integrating advanced AI technologies in
+healthcare, demonstrating their potential to revolutionize patient
+documentation and support better care outcomes.
+
+
+
+ comment: 10 pages, 1 figure, 3 tables, conference +
+
+
+
+
+ + ☆ Better Monocular 3D Detectors with LiDAR from the Past ICRA 2022 + + +
+ Accurate 3D object detection is crucial to autonomous driving. Though
+LiDAR-based detectors have achieved impressive performance, the high cost of
+LiDAR sensors precludes their widespread adoption in affordable vehicles.
+Camera-based detectors are cheaper alternatives but often suffer inferior
+performance compared to their LiDAR-based counterparts due to inherent depth
+ambiguities in images. In this work, we seek to improve monocular 3D detectors
+by leveraging unlabeled historical LiDAR data. Specifically, at inference time,
+we assume that the camera-based detectors have access to multiple unlabeled
+LiDAR scans from past traversals at locations of interest (potentially from
+other high-end vehicles equipped with LiDAR sensors). Under this setup, we
+propose a novel, simple, and end-to-end trainable framework, termed
+AsyncDepth, to effectively extract relevant features from asynchronous LiDAR
+traversals of the same location for monocular 3D detectors. We show consistent
+and significant performance gains (up to 9 AP) across multiple state-of-the-art
+models and datasets with a negligible additional latency of 9.66 ms and a small
+storage cost.
+
+
+
+ comment: Accepted by ICRA 2022. The code can be found at + https://github.com/YurongYou/AsyncDepth +
+
+
+
+
+ + ☆ Self-Supervised Multi-Object Tracking with Path Consistency CVPR 2024 + + +
+ In this paper, we propose a novel concept of path consistency to learn robust
+object matching without using manual object identity supervision. Our key idea
+is that, to track an object through frames, we can obtain multiple different
+association results from a model by varying the frames it can observe, i.e.,
+skipping frames in observation. As the differences in observations do not alter
+the identities of objects, the obtained association results should be
+consistent. Based on this rationale, we generate multiple observation paths,
+each specifying a different set of frames to be skipped, and formulate the Path
+Consistency Loss that enforces the association results to be consistent across
+different observation paths. We use the proposed loss to train our object
+matching model with only self-supervision. By extensive experiments on three
+tracking datasets (MOT17, PersonPath22, KITTI), we demonstrate that our method
+outperforms existing unsupervised methods with consistent margins on various
+evaluation metrics, and even achieves performance close to supervised methods.
+
+
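+ The sketch below illustrates the core idea in PyTorch: chain soft association
+matrices along several observation paths that skip different intermediate
+frames, then penalize disagreement between the resulting first-to-last-frame
+associations. The similarity-based association, the KL penalty, and the choice
+of one path as reference are illustrative assumptions, not the authors' exact
+loss.
+
+import torch
+import torch.nn.functional as F
+
+def association_probs(query_feats, target_feats, tau=0.1):
+    """Soft association of objects in one frame to objects in another frame."""
+    sim = F.normalize(query_feats, dim=-1) @ F.normalize(target_feats, dim=-1).T
+    return torch.softmax(sim / tau, dim=-1)            # row-stochastic (Nq, Nt)
+
+def path_consistency_loss(frame_feats, paths):
+    """frame_feats: list of (Ni, C) object features per frame; paths: lists of
+    frame indices that all start at the first frame and end at the last one."""
+    assocs = []
+    for path in paths:
+        assoc = None
+        for a, b in zip(path[:-1], path[1:]):          # chain associations along the path
+            step = association_probs(frame_feats[a], frame_feats[b])
+            assoc = step if assoc is None else assoc @ step
+        assocs.append(assoc)
+    ref = assocs[0].detach()                           # treat one path as the reference
+    return sum(F.kl_div(p.clamp_min(1e-8).log(), ref, reduction="batchmean")
+               for p in assocs[1:]) / max(len(assocs) - 1, 1)
+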
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Image-based Agarwood Resinous Area Segmentation using Deep Learning + + +
+ The manual extraction method of Agarwood resinous compound is laborious work, +requires skilled workers, and is subject to human errors. Commercial Agarwood +industries have been actively exploring using Computer Numerical Control (CNC) +machines to replace human effort for this particular task. The CNC machine +accepts a G-code script produced from a binary image in which the wood region +that needs to be chiselled off is marked with (0, 0, 0) as its RGB value. +Rather than requiring a human expert to perform the region marking, we propose +using a Deep learning image segmentation method instead. Our setup involves a +camera that captures the cross-section image and then passes the image file to +a computer. The computer performs the automated image segmentation and feeds +the CNC machine with a G-code script. In this article, we report the initial +segmentation results achieved using a state-of-the-art Deep learning +segmentation method and discuss potential improvements to refine the +segmentation accuracy. + +
+
+ comment: 15 pages, 6 figures, 3 tables +
+
+
+
+
+ + ☆ Improving Deep Learning Predictions with Simulated Images, and Vice + Versa + + +
+ Artificial neural networks are often used to identify features of crop +plants. However, training their models requires many annotated images, which +can be expensive and time-consuming to acquire. Procedural models of plants, +such as those developed with Lindenmayer-systems (L-systems) can be created to +produce visually realistic simulations, and hence images of plant simulations, +where annotations are implicitly known. These synthetic images can either +augment or completely replace real images in training neural networks for +phenotyping tasks. In this paper, we systematically vary amounts of real and +synthetic images used for training in both maize and canola to better +understand situations where synthetic images generated from L-systems can help +prediction on real images. This work also explores the degree to which realism +in the synthetic images improves prediction. Furthermore, we see how neural +network predictions can be used to help calibrate L-systems themselves, +creating a feedback loop. + +
+
+
+
+
+ + ☆ Class Similarity Transition: Decoupling Class Similarities and Imbalance + from Generalized Few-shot Segmentation + + +
+ In Generalized Few-shot Segmentation (GFSS), a model is trained with a large
+corpus of base class samples and then adapted to limited samples of novel
+classes. This paper focuses on the relevance between base and novel classes,
+and improves GFSS in two aspects: 1) mining the similarity between base and
+novel classes to promote the learning of novel classes, and 2) mitigating the
+class imbalance issue caused by the volume difference between the support set
+and the training set. Specifically, we first propose a similarity transition
+matrix to guide the learning of novel classes with base class knowledge. Then,
+we apply the Label-Distribution-Aware Margin (LDAM) loss and Transductive
+Inference to the GFSS task to address the problem of class imbalance as well as
+overfitting to the support set. In addition, by extending the probability
+transition matrix, the proposed method can mitigate the catastrophic forgetting
+of base classes when learning novel classes. With a simple training phase, our
+proposed method can be applied to any segmentation network trained on base
+classes. We validated our method on the adapted version of OpenEarthMap.
+Compared to existing GFSS baselines, our method surpasses them all by 3% to 7%
+and ranks second in the OpenEarthMap Land Cover Mapping Few-Shot Challenge at
+the completion of this paper. Code:
+https://github.com/earth-insights/ClassTrans
+
+
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ TabConv: Low-Computation CNN Inference via Table Lookups + + +
+ Convolutional Neural Networks (CNNs) have demonstrated remarkable ability +throughout the field of computer vision. However, CNN inference requires a +large number of arithmetic operations, making them expensive to deploy in +hardware. Current approaches alleviate this issue by developing +hardware-supported, algorithmic processes to simplify spatial convolution +functions. However, these methods still heavily rely on matrix multiplication, +leading to significant computational overhead. To bridge the gap between +hardware, algorithmic acceleration, and approximate matrix multiplication, we +propose TabConv, a novel, table-based approximation for convolution to +significantly reduce arithmetic operations during inference. Additionally, we +introduce a priority masking technique based on cosine similarity to select +layers for table-based approximation, thereby maintaining the model +performance. We evaluate our approach on popular CNNs: ResNet-18, ResNet-34, +and NetworkInNetwork (NIN). TabConv preserves over 93% of the original model's +performance while reducing arithmetic operations by 36.5%, 25.8%, and 99.4% for +ResNet-18 on CIFAR-10, CIFAR-100, and MNIST, respectively, 35.6% and 99.3% for +ResNet-34 on CIFAR-10 and MNIST, and 98.9% for NIN on MNIST, achieving +low-computation inference. + +
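+ As a rough illustration of table-based convolution, the following Python
+sketch follows a generic product-quantization-style recipe: im2col patches are
+snapped to a small codebook whose products with the flattened kernels are
+precomputed, so inference reduces to nearest-codeword search plus table
+lookups. The codebook construction, sizes, and distance metric are assumptions
+for illustration and do not reproduce TabConv's cosine-similarity priority
+masking.
+
+import numpy as np
+
+def build_table(weights, codebook):
+    """weights: (K, C_out) flattened conv kernels; codebook: (V, K) patch prototypes."""
+    return codebook @ weights                  # (V, C_out) precomputed products
+
+def tab_conv(patches, codebook, table):
+    """patches: (N, K) im2col patches. Returns an approximate (N, C_out) output."""
+    d = ((patches[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)   # (N, V)
+    idx = d.argmin(axis=1)                     # nearest codeword per patch
+    return table[idx]                          # pure lookup, no matmul at inference
+
+rng = np.random.default_rng(0)
+W = rng.normal(size=(27, 8))                   # e.g. 3x3x3 kernels, 8 output channels
+codebook = rng.normal(size=(16, 27))           # 16 prototypes (k-means in practice)
+table = build_table(W, codebook)
+print(tab_conv(rng.normal(size=(100, 27)), codebook, table).shape)    # (100, 8)
+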
+
+ comment: 8 pages, Accepted at CF '24 +
+
+
+
+
+ + ☆ Towards Improved Semiconductor Defect Inspection for high-NA EUVL based + on SEMI-SuperYOLO-NAS + + +
+ Due to potential pitch reduction, the semiconductor industry is adopting
+High-NA EUVL technology. However, its low depth of focus presents challenges
+for High Volume Manufacturing. To address this, suppliers are exploring thinner
+photoresists and new underlayers/hardmasks. These may suffer from poor SNR,
+complicating defect detection. Vision-based ML algorithms offer a promising
+solution for semiconductor defect inspection. However, developing a robust ML
+model across various image resolutions without explicit training remains a
+challenge for nano-scale defect inspection. The goal of this research is to
+propose a scale-invariant ADCD framework capable of upscaling images to address
+this issue. We propose an improved ADCD framework, SEMI-SuperYOLO-NAS, which
+builds upon the baseline YOLO-NAS architecture. This framework integrates an
+SR-assisted branch to aid the defect detection backbone in learning HR
+features, particularly for detecting nano-scale defect instances from LR
+images. Additionally, the SR-assisted branch can recursively generate upscaled
+images from their corresponding downscaled counterparts, enabling defect
+detection inference across various image resolutions without requiring explicit
+training. Moreover, we investigate an improved data augmentation strategy aimed
+at generating diverse and realistic training datasets to enhance model
+performance. We have evaluated our proposed approach using two original FAB
+datasets obtained from two distinct processes and captured using two different
+imaging tools. Finally, we demonstrate zero-shot inference for our model on a
+new dataset, originating from a process condition distinct from the training
+dataset and possessing different pitch characteristics. Experimental validation
+demonstrates that our proposed ADCD framework aids in increasing the throughput
+of imaging tools for defect inspection by reducing the required image pixel
+resolutions.
+
+
+
+
+
+
+ + ☆ Localizing Moments of Actions in Untrimmed Videos of Infants with Autism + Spectrum Disorder + + +
+ Autism Spectrum Disorder (ASD) presents significant challenges in early
+diagnosis and intervention, impacting children and their families. With
+prevalence rates rising, there is a critical need for accessible and efficient
+screening tools. Leveraging machine learning (ML) techniques, in particular
+Temporal Action Localization (TAL), holds promise for automating ASD screening.
+This paper introduces a self-attention-based TAL model designed to identify
+ASD-related behaviors in infant videos. Unlike existing methods, our approach
+simplifies complex modeling and emphasizes efficiency, which is essential for
+practical deployment in real-world scenarios. Importantly, this work
+underscores the importance of developing computer vision methods capable of
+operating in naturalistic environments with little equipment control,
+addressing key challenges in ASD screening. This study is the first to conduct
+end-to-end temporal action localization in untrimmed videos of infants with
+ASD, offering promising avenues for early intervention and support. We report
+baseline results of behavior detection using our TAL model. We achieve 70%
+accuracy for look face, 79% accuracy for look object, 72% for smile, and 65%
+for vocalization.
+
+
+
+ comment: 7 pages, 2 figures, 3 tables +
+
+
+
+
+ + ☆ Privacy-Preserving Deep Learning Using Deformable Operators for Secure + Task Learning + + +
+ In the era of cloud computing and data-driven applications, it is crucial to +protect sensitive information to maintain data privacy, ensuring truly reliable +systems. As a result, preserving privacy in deep learning systems has become a +critical concern. Existing methods for privacy preservation rely on image +encryption or perceptual transformation approaches. However, they often suffer +from reduced task performance and high computational costs. To address these +challenges, we propose a novel Privacy-Preserving framework that uses a set of +deformable operators for secure task learning. Our method involves shuffling +pixels during the analog-to-digital conversion process to generate visually +protected data. Those are then fed into a well-known network enhanced with +deformable operators. Using our approach, users can achieve equivalent +performance to original images without additional training using a secret key. +Moreover, our method enables access control against unauthorized users. +Experimental results demonstrate the efficacy of our approach, showcasing its +potential in cloud-based scenarios and privacy-sensitive applications. + +
+
+ comment: copyright 2024 IEEE. Personal use of this material is permitted. + Permission from IEEE must be obtained for all other uses, in any current or + future media, including reprinting/republishing this material for advertising + or promotional purposes, creating new collective works, for resale or + redistribution to servers or lists, or reuse of any copyrighted component of + this work in other works +
+
+
+
+
+ + ☆ Towards Explainable Automated Neuroanatomy + + +
+ We present a novel method for quantifying the microscopic structure of brain
+tissue. It is based on the automated recognition of interpretable features
+obtained by analyzing the shapes of cells. This contrasts with prevailing
+methods of brain anatomical analysis in two ways. First, contemporary methods
+use gray-scale values derived from a smoothed version of the anatomical images,
+which discards valuable information contained in the texture of the images.
+Second, contemporary analysis uses the output of black-box Convolutional Neural
+Networks, while our system makes decisions based on interpretable features
+obtained by analyzing the shapes of individual cells. An important benefit of
+this open-box approach is that the anatomist can understand and correct the
+decisions made by the computer. Our proposed system can accurately localize and
+identify existing brain structures. This can be used to align and co-register
+brains and will facilitate connectomic studies for reverse engineering of brain
+circuitry.
+
+
+
+
+
+
+ + ☆ BatSort: Enhanced Battery Classification with Transfer Learning for + Battery Sorting and Recycling + + +
+ Battery recycling is a critical process for minimizing environmental harm and
+resource waste for used batteries. However, it is challenging, largely because
+sorting batteries by type is costly and hardly automated. In this paper, we
+introduce a machine learning-based approach for battery-type classification and
+address the daunting problem of data scarcity for the application. We propose
+BatSort, which applies transfer learning to utilize existing knowledge
+optimized with large-scale datasets and customizes ResNet to be specialized for
+classifying battery types. We collected a small-scale in-house battery-type
+dataset as a case study to guide the knowledge transfer and evaluate the system
+performance. We conducted an experimental study, and the results show that
+BatSort achieves an outstanding accuracy of 92.1% on average (up to 96.2%) and
+that its performance is stable for battery-type classification. Our solution
+helps realize fast and automated battery sorting with minimized cost and can be
+transferred to related industry applications with insufficient data.
+
+
+
+
+
+
+ + ☆ Responsible Generative AI: What to Generate and What Not + + +
+ In recent years, generative AI (GenAI), like large language models and
+text-to-image models, has received significant attention across various
+domains. However, ensuring the responsible generation of content by these
+models is crucial for their real-world applicability. This raises an
+interesting question: \textit{What should responsible GenAI generate, and what
+should it not?} To answer the question, this paper investigates the practical
+responsible requirements of both textual and visual generative models,
+outlining five key considerations: generating truthful content, avoiding toxic
+content, refusing harmful instructions, leaking no training-data-related
+content, and ensuring that generated content is identifiable. Specifically, we
+review recent advancements and challenges in addressing these requirements. In
+addition, we discuss and emphasize the importance of responsible GenAI across
+healthcare, education, finance, and artificial general intelligence domains.
+Through a unified perspective on both textual and visual generative models,
+this paper aims to provide insights into practical safety-related issues and
+further benefit the community in building responsible GenAI.
+
+
+
+ comment: 74 pages, 10 figures +
+
+
+
+
+ + ☆ Forecasting Electric Vehicle Battery Output Voltage: A Predictive + Modeling Approach + + +
+ The battery management system plays a vital role in ensuring the safety and +dependability of electric and hybrid vehicles. It is responsible for various +functions, including state evaluation, monitoring, charge control, and cell +balancing, all integrated within the BMS. Nonetheless, due to the uncertainties +surrounding battery performance, implementing these functionalities poses +significant challenges. In this study, we explore the latest approaches for +assessing battery states, highlight notable advancements in battery management +systems (BMS), address existing issues with current BMS technology, and put +forth possible solutions for predicting battery charging voltage. + +
+
+
+
+
+ + ☆ Lightweight Deep Learning for Resource-Constrained Environments: A + Survey + + +
+ Over the past decade, the dominance of deep learning has prevailed across +various domains of artificial intelligence, including natural language +processing, computer vision, and biomedical signal processing. While there have +been remarkable improvements in model accuracy, deploying these models on +lightweight devices, such as mobile phones and microcontrollers, is constrained +by limited resources. In this survey, we provide comprehensive design guidance +tailored for these devices, detailing the meticulous design of lightweight +models, compression methods, and hardware acceleration strategies. The +principal goal of this work is to explore methods and concepts for getting +around hardware constraints without compromising the model's accuracy. +Additionally, we explore two notable paths for lightweight deep learning in the +future: deployment techniques for TinyML and Large Language Models. Although +these paths undoubtedly have potential, they also present significant +challenges, encouraging research into unexplored areas. + +
+
+ comment: 40 pages +
+
+
+
+
+ + ♻ ☆ Energy-Calibrated VAE with Test Time Free Lunch + + +
+ In this paper, we propose a novel generative model that utilizes a
+conditional Energy-Based Model (EBM) to enhance the Variational Autoencoder
+(VAE), termed Energy-Calibrated VAE (EC-VAE). Specifically, VAEs often
+suffer from blurry generated samples due to the lack of tailored training on
+the samples generated in the generative direction. On the other hand, EBMs
+can generate high-quality samples but require expensive Markov Chain Monte
+Carlo (MCMC) sampling. To address these issues, we introduce a conditional
+EBM for calibrating the generative direction of the VAE during training,
+without requiring it for generation at test time. In particular, we train
+EC-VAE on both the input data and the calibrated samples with adaptive
+weights to enhance efficacy while avoiding MCMC sampling at test time.
+Furthermore, we extend the calibration idea of EC-VAE to variational
+learning and normalizing flows, and apply EC-VAE to an additional
+application of zero-shot image restoration via a neural transport prior and
+range-null theory. We evaluate the proposed method on two applications,
+image generation and zero-shot image restoration, and the experimental
+results show that our method achieves competitive performance in single-step
+non-adversarial generation. Our code is available at
+https://github.com/DJ-LYH/EC-VAE.
+
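+ As a rough illustration only (not the paper's exact calibration procedure),
+the snippet below shows the generic idea of refining decoder samples with a
+few gradient steps on a learned energy function during training; `energy_fn`
+is an assumed conditional energy network.
+
+import torch
+
+def calibrate(x_gen, energy_fn, n_steps=5, step_size=0.01):
+    """Nudge generated samples toward lower energy via gradient descent."""
+    x = x_gen.detach().requires_grad_(True)
+    for _ in range(n_steps):
+        grad, = torch.autograd.grad(energy_fn(x).sum(), x)
+        x = (x - step_size * grad).detach().requires_grad_(True)
+    return x.detach()
+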
+
+ comment: Revision. Code is available at https://github.com/DJ-LYH/EC-VAE +
+
+
+
+
+ + ♻ ☆ Deep Internal Learning: Deep Learning from a Single Input + + +
+ Deep learning, in general, focuses on training a neural network from large +labeled datasets. Yet, in many cases there is value in training a network just +from the input at hand. This is particularly relevant in many signal and image +processing problems where training data is scarce and diversity is large on the +one hand, and on the other, there is a lot of structure in the data that can be +exploited. Using this information is the key to deep internal-learning +strategies, which may involve training a network from scratch using a single +input or adapting an already trained network to a provided input example at +inference time. This survey paper aims at covering deep internal-learning +techniques that have been proposed in the past few years for these two +important directions. While our main focus will be on image processing +problems, most of the approaches that we survey are derived for general signals +(vectors with recurring patterns that can be distinguished from noise) and are +therefore applicable to other modalities. + +
+
+ comment: Accepted to IEEE Signal Processing Magazine +
+
+
+
+
+ + ♻ ☆ FreGS: 3D Gaussian Splatting with Progressive Frequency Regularization CVPR 2024 + + +
+ 3D Gaussian splatting has achieved very impressive performance in real-time +novel view synthesis. However, it often suffers from over-reconstruction during +Gaussian densification where high-variance image regions are covered by a few +large Gaussians only, leading to blur and artifacts in the rendered images. We +design a progressive frequency regularization (FreGS) technique to tackle the +over-reconstruction issue within the frequency space. Specifically, FreGS +performs coarse-to-fine Gaussian densification by exploiting low-to-high +frequency components that can be easily extracted with low-pass and high-pass +filters in the Fourier space. By minimizing the discrepancy between the +frequency spectrum of the rendered image and the corresponding ground truth, it +achieves high-quality Gaussian densification and alleviates the +over-reconstruction of Gaussian splatting effectively. Experiments over +multiple widely adopted benchmarks (e.g., Mip-NeRF360, Tanks-and-Temples and +Deep Blending) show that FreGS achieves superior novel view synthesis and +outperforms the state-of-the-art consistently. + +
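+ A hedged sketch of the kind of frequency-space discrepancy described above:
+compare the Fourier amplitude spectra of the rendered image and the ground
+truth inside a low-frequency band that can be grown over training. The mask
+schedule and the loss form here are illustrative choices, not FreGS's exact
+formulation.
+
+import torch
+
+def frequency_loss(rendered, target, keep_ratio=0.25):
+    """rendered, target: (B, C, H, W) image tensors."""
+    fr = torch.fft.fftshift(torch.fft.fft2(rendered), dim=(-2, -1))
+    ft = torch.fft.fftshift(torch.fft.fft2(target), dim=(-2, -1))
+    _, _, H, W = rendered.shape
+    yy, xx = torch.meshgrid(
+        torch.linspace(-1, 1, H), torch.linspace(-1, 1, W), indexing="ij"
+    )
+    # Radial low-pass mask; enlarge keep_ratio as training progresses.
+    lowpass = ((yy ** 2 + xx ** 2).sqrt() <= keep_ratio).to(rendered.device)
+    return ((fr.abs() - ft.abs()).abs() * lowpass).mean()
+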
+
+ comment: Accepted by CVPR 2024. Project website: + https://rogeraigc.github.io/FreGS-Page/ +
+
+
+
+
+ + ♻ ☆ WEEP: A method for spatial interpretation of weakly supervised CNN + models in computational pathology + + +
+ Deep learning enables the modelling of high-resolution histopathology +whole-slide images (WSI). Weakly supervised learning of tile-level data is +typically applied for tasks where labels only exist on the patient or WSI level +(e.g. patient outcomes or histological grading). In this context, there is a +need for improved spatial interpretability of predictions from such models. We +propose a novel method, Wsi rEgion sElection aPproach (WEEP), for model +interpretation. It provides a principled yet straightforward way to establish +the spatial area of WSI required for assigning a particular prediction label. +We demonstrate WEEP on a binary classification task in the area of breast +cancer computational pathology. WEEP is easy to implement, is directly +connected to the model-based decision process, and offers information relevant +to both research and diagnostic applications. + +
+
+
+
+
+ + ♻ ☆ Deep Feature Statistics Mapping for Generalized Screen Content Image + Quality Assessment + + +
+ The statistical regularities of natural images, referred to as natural scene +statistics, play an important role in no-reference image quality assessment. +However, it has been widely acknowledged that screen content images (SCIs), +which are typically computer generated, do not hold such statistics. Here we +make the first attempt to learn the statistics of SCIs, based upon which the +quality of SCIs can be effectively determined. The underlying mechanism of the +proposed approach is based upon the mild assumption that the SCIs, which are +not physically acquired, still obey certain statistics that could be understood +in a learning fashion. We empirically show that the statistics deviation could +be effectively leveraged in quality assessment, and the proposed method is +superior when evaluated in different settings. Extensive experimental results +demonstrate the Deep Feature Statistics based SCI Quality Assessment (DFSS-IQA) +model delivers promising performance compared with existing NR-IQA models and +shows a high generalization capability in the cross-dataset settings. The +implementation of our method is publicly available at +https://github.com/Baoliang93/DFSS-IQA. + +
+
+
+
+
+ + ♻ ☆ Towards Domain-agnostic Depth Completion + + +
+ Existing depth completion methods are often targeted at a specific sparse +depth type and generalize poorly across task domains. We present a method to +complete sparse/semi-dense, noisy, and potentially low-resolution depth maps +obtained by various range sensors, including those in modern mobile phones, or +by multi-view reconstruction algorithms. Our method leverages a data-driven +prior in the form of a single image depth prediction network trained on +large-scale datasets, the output of which is used as an input to our model. We +propose an effective training scheme where we simulate various sparsity +patterns in typical task domains. In addition, we design two new benchmarks to +evaluate the generalizability and the robustness of depth completion methods. +Our simple method shows superior cross-domain generalization ability against +state-of-the-art depth completion methods, introducing a practical solution to +high-quality depth capture on a mobile device. The code is available at: +https://github.com/YvanYin/FillDepth. + +
+
+
+
+
+ + ♻ ☆ Intention-Conditioned Long-Term Human Egocentric Action Forecasting CVPR + + +
+ To anticipate how a human will act in the future, it is essential to
+understand the human intention, since it guides the human towards a certain
+goal. In this paper, we propose a hierarchical architecture which assumes
+that a sequence of human actions (low-level) can be derived from the human
+intention (high-level). Based on this, we address the Long-Term Action
+Anticipation task in egocentric videos. Our framework first extracts two
+levels of human information from the human actions in the N observed videos
+through a Hierarchical Multi-task MLP Mixer (H3M). Then, we model the
+uncertainty of the future through an Intention-Conditioned Variational
+Auto-Encoder (I-CVAE) that generates K stable predictions of the next Z=20
+actions that the observed human might perform. By leveraging human intention
+as high-level information, we claim that our model is able to anticipate
+more time-consistent actions in the long term, thus improving the results
+over baseline methods in the EGO4D Challenge. This work ranked first in both
+the CVPR@2022 and ECCV@2022 EGO4D LTA Challenges by providing more plausible
+anticipated sequences and improving the anticipation of nouns and overall
+actions. Webpage: https://evm7.github.io/icvae-page/
+
+
+ comment: Winner of CVPR@2022 and ECCV@2022 EGO4D LTA Challenge. Accepted in + WACV2023. Webpage: https://evm7.github.io/icvae-page/ +
+
+
+
+
+ + ♻ ☆ Robust Human Motion Forecasting using Transformer-based Model IROS2022 + + +
+ Comprehending human motion is a fundamental challenge for developing
+Human-Robot Collaborative applications. Computer vision researchers have
+addressed this field by focusing only on reducing prediction error, without
+taking into account the requirements that facilitate its implementation in
+robots. In this paper, we propose a new Transformer-based model that
+simultaneously handles real-time 3D human motion forecasting in the short
+and long term. Our 2-Channel Transformer (2CH-TR) is able to efficiently
+exploit the spatio-temporal information of a short observed sequence
+(400 ms) and achieves accuracy competitive with the current
+state-of-the-art. 2CH-TR stands out for its efficient use of the
+Transformer, being lighter and faster than its competitors. In addition, our
+model is tested in conditions where the human motion is severely occluded,
+demonstrating its robustness in reconstructing and predicting 3D human
+motion in a highly noisy environment. Our experimental results show that the
+proposed 2CH-TR outperforms the ST-Transformer, another state-of-the-art
+Transformer-based model, in terms of reconstruction and prediction under the
+same input-prefix conditions. Our model reduces the mean squared error of
+the ST-Transformer by 8.89% in short-term prediction and by 2.57% in
+long-term prediction on the Human3.6M dataset with a 400 ms input prefix.
+Webpage: https://evm7.github.io/2CHTR-page/
+
+
+ comment: Accepted to IROS2022. Webpage: https://evm7.github.io/2CHTR-page/ +
+
+
+
+
+ + ♻ ☆ A Unified Masked Autoencoder with Patchified Skeletons for Motion + Synthesis AAAI2024 + + +
+ The synthesis of human motion has traditionally been addressed through +task-dependent models that focus on specific challenges, such as predicting +future motions or filling in intermediate poses conditioned on known key-poses. +In this paper, we present a novel task-independent model called UNIMASK-M, +which can effectively address these challenges using a unified architecture. +Our model obtains comparable or better performance than the state-of-the-art in +each field. Inspired by Vision Transformers (ViTs), our UNIMASK-M model +decomposes a human pose into body parts to leverage the spatio-temporal +relationships existing in human motion. Moreover, we reformulate various +pose-conditioned motion synthesis tasks as a reconstruction problem with +different masking patterns given as input. By explicitly informing our model +about the masked joints, our UNIMASK-M becomes more robust to occlusions. +Experimental results show that our model successfully forecasts human motion on +the Human3.6M dataset. Moreover, it achieves state-of-the-art results in motion +inbetweening on the LaFAN1 dataset, particularly in long transition periods. +More information can be found on the project website +https://evm7.github.io/UNIMASKM-page/ + +
+
+ comment: Accepted to AAAI2024. Webpage: https://evm7.github.io/UNIMASKM-page/ +
+
+
+
+
+ + ♻ ☆ HOI4ABOT: Human-Object Interaction Anticipation for Human Intention + Reading Collaborative roBOTs + + +
+ Robots are becoming increasingly integrated into our lives, assisting us in +various tasks. To ensure effective collaboration between humans and robots, it +is essential that they understand our intentions and anticipate our actions. In +this paper, we propose a Human-Object Interaction (HOI) anticipation framework +for collaborative robots. We propose an efficient and robust transformer-based +model to detect and anticipate HOIs from videos. This enhanced anticipation +empowers robots to proactively assist humans, resulting in more efficient and +intuitive collaborations. Our model outperforms state-of-the-art results in HOI +detection and anticipation in VidHOI dataset with an increase of 1.76% and +1.04% in mAP respectively while being 15.4 times faster. We showcase the +effectiveness of our approach through experimental results in a real robot, +demonstrating that the robot's ability to anticipate HOIs is key for better +Human-Robot Interaction. More information can be found on our project webpage: +https://evm7.github.io/HOI4ABOT_page/ + +
+
+ comment: Proceedings in Conference on Robot Learning 2023. Webpage: + https://evm7.github.io/HOI4ABOT_page/ +
+
+
+
+
+ + ♻ ☆ Robot Interaction Behavior Generation based on Social Motion Forecasting + for Human-Robot Interaction ICRA 2024 + + +
+ Integrating robots into populated environments is a complex challenge that
+requires an understanding of human social dynamics. In this work, we propose
+to model social motion forecasting in a shared human-robot representation
+space, which allows us to synthesize robot motions that interact with humans
+in social scenarios despite no robot being observed during motion training.
+We develop a transformer-based architecture called ECHO, which operates in
+the aforementioned shared space to predict the future motions of the agents
+encountered in social scenarios. Contrary to prior works, we reformulate the
+social motion problem as the refinement of the predicted individual motions
+based on the surrounding agents, which facilitates training while allowing
+for single-motion forecasting when only one human is in the scene. We
+evaluate our model on multi-person and human-robot motion forecasting tasks
+and obtain state-of-the-art performance by a large margin while being
+efficient and running in real time. Additionally, our qualitative results
+showcase the effectiveness of our approach in generating human-robot
+interaction behaviors that can be controlled via text commands. Webpage:
+https://evm7.github.io/ECHO/
+
+
+ comment: Accepted at ICRA 2024. Webpage: https://evm7.github.io/ECHO/ +
+
+
+
+
+ + ♻ ☆ DRCT: Saving Image Super-resolution away from Information Bottleneck + + +
+ In recent years, Vision Transformer-based applications to low-level vision +tasks have achieved widespread success. Unlike CNN-based models, Transformers +are more adept at capturing long-range dependencies, enabling the +reconstruction of images utilizing information from non-local areas. In the +domain of super-resolution, Swin-transformer-based approaches have become +mainstream due to their capacity to capture global spatial information and +their shifting-window attention mechanism that facilitates the interchange of +information between different windows. Many researchers have enhanced image +quality and network efficiency by expanding the receptive field or designing +complex networks, yielding commendable results. However, we observed that +spatial information tends to diminish during the forward propagation process +due to increased depth, leading to a loss of spatial information and, +consequently, limiting the model's potential. To address this, we propose the +Dense-residual-connected Transformer (DRCT), aimed at mitigating the loss of +spatial information through dense-residual connections between layers, thereby +unleashing the model's potential and enhancing performance. Experiment results +indicate that our approach is not only straightforward but also achieves +remarkable efficiency, surpassing state-of-the-art methods and performing +commendably at NTIRE2024. + +
+
+ comment: NTIRE 2024 Image Super-resolution (x4) +
+
+
+
+
+ + ♻ ☆ MESA: Matching Everything by Segmenting Anything CVPR24 + + +
+ Feature matching is a crucial task in the field of computer vision, which +involves finding correspondences between images. Previous studies achieve +remarkable performance using learning-based feature comparison. However, the +pervasive presence of matching redundancy between images gives rise to +unnecessary and error-prone computations in these methods, imposing limitations +on their accuracy. To address this issue, we propose MESA, a novel approach to +establish precise area (or region) matches for efficient matching redundancy +reduction. MESA first leverages the advanced image understanding capability of +SAM, a state-of-the-art foundation model for image segmentation, to obtain +image areas with implicit semantic. Then, a multi-relational graph is proposed +to model the spatial structure of these areas and construct their scale +hierarchy. Based on graphical models derived from the graph, the area matching +is reformulated as an energy minimization task and effectively resolved. +Extensive experiments demonstrate that MESA yields substantial precision +improvement for multiple point matchers in indoor and outdoor downstream tasks, +e.g. +13.61% for DKM in indoor pose estimation. + +
+
+ comment: CVPR24 +
+
+
+
+
+ + ♻ ☆ DPHMs: Diffusion Parametric Head Models for Depth-based Tracking CVPR 2024 + + +
+ We introduce Diffusion Parametric Head Models (DPHMs), a generative model +that enables robust volumetric head reconstruction and tracking from monocular +depth sequences. While recent volumetric head models, such as NPHMs, can now +excel in representing high-fidelity head geometries, tracking and +reconstructing heads from real-world single-view depth sequences remains very +challenging, as the fitting to partial and noisy observations is +underconstrained. To tackle these challenges, we propose a latent +diffusion-based prior to regularize volumetric head reconstruction and +tracking. This prior-based regularizer effectively constrains the identity and +expression codes to lie on the underlying latent manifold which represents +plausible head shapes. To evaluate the effectiveness of the diffusion-based +prior, we collect a dataset of monocular Kinect sequences consisting of various +complex facial expression motions and rapid transitions. We compare our method +to state-of-the-art tracking methods and demonstrate improved head identity +reconstruction as well as robust expression tracking. + +
+
+ comment: CVPR 2024; homepage: https://tangjiapeng.github.io/projects/DPHMs/ +
+
+
+
+
+ + ♻ ☆ SepVAE: a contrastive VAE to separate pathological patterns from healthy + ones ICML + + +
+ Contrastive Analysis VAEs (CA-VAEs) are a family of variational
+autoencoders (VAEs) that aim to separate the factors of variation common to
+a background dataset (BG) (i.e., healthy subjects) and a target dataset (TG)
+(i.e., patients) from those that exist only in the target dataset. To do so,
+these methods separate the latent space into a set of salient features
+(i.e., specific to the target dataset) and a set of common features (i.e.,
+present in both datasets). Currently, all models fail to prevent the sharing
+of information between latent spaces effectively and to capture all salient
+factors of variation. To this end, we introduce two crucial regularization
+losses: a disentangling term between common and salient representations and
+a classification term between background and target samples in the salient
+space. We show better performance than previous CA-VAE methods on three
+medical applications and a natural-image dataset (CelebA). Code and datasets
+are available on GitHub
+https://github.com/neurospin-projects/2023_rlouiset_sepvae.
+
+
+ comment: Workshop on Interpretable ML in Healthcare at International + Conference on Machine Learning (ICML), Honolulu, Hawaii, USA. 2023 +
+
+
+
+
+ + ♻ ☆ SiT-MLP: A Simple MLP with Point-wise Topology Feature Learning for + Skeleton-based Action Recognition + + +
+ Graph convolution networks (GCNs) have achieved remarkable performance in +skeleton-based action recognition. However, previous GCN-based methods rely on +elaborate human priors excessively and construct complex feature aggregation +mechanisms, which limits the generalizability and effectiveness of networks. To +solve these problems, we propose a novel Spatial Topology Gating Unit (STGU), +an MLP-based variant without extra priors, to capture the co-occurrence +topology features that encode the spatial dependency across all joints. In +STGU, to learn the point-wise topology features, a new gate-based feature +interaction mechanism is introduced to activate the features point-to-point by +the attention map generated from the input sample. Based on the STGU, we +propose the first MLP-based model, SiT-MLP, for skeleton-based action +recognition in this work. Compared with previous methods on three large-scale +datasets, SiT-MLP achieves competitive performance. In addition, SiT-MLP +reduces the parameters significantly with favorable results. The code will be +available at https://github.com/BUPTSJZhang/SiT?MLP. + +
+
+ comment: Accepted by IEEE TCSVT 2024 +
+
+
+
+
+ + ♻ ☆ RTMO: Towards High-Performance One-Stage Real-Time Multi-Person Pose + Estimation CVPR 2024 + + +
+ Real-time multi-person pose estimation presents significant challenges in +balancing speed and precision. While two-stage top-down methods slow down as +the number of people in the image increases, existing one-stage methods often +fail to simultaneously deliver high accuracy and real-time performance. This +paper introduces RTMO, a one-stage pose estimation framework that seamlessly +integrates coordinate classification by representing keypoints using dual 1-D +heatmaps within the YOLO architecture, achieving accuracy comparable to +top-down methods while maintaining high speed. We propose a dynamic coordinate +classifier and a tailored loss function for heatmap learning, specifically +designed to address the incompatibilities between coordinate classification and +dense prediction models. RTMO outperforms state-of-the-art one-stage pose +estimators, achieving 1.1% higher AP on COCO while operating about 9 times +faster with the same backbone. Our largest model, RTMO-l, attains 74.8% AP on +COCO val2017 and 141 FPS on a single V100 GPU, demonstrating its efficiency and +accuracy. The code and models are available at +https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo. + +
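+ To make the coordinate-classification idea concrete, here is a minimal
+decoding sketch (an illustration under simplifying assumptions, not RTMO's
+code): each keypoint is represented by two 1-D distributions over x and y
+bins, and the coordinate is recovered as the soft expectation over bin
+centers.
+
+import torch
+
+def decode_keypoints(x_logits, y_logits, img_w, img_h):
+    """x_logits: (B, K, Wbins), y_logits: (B, K, Hbins) -> (B, K, 2)."""
+    xb = torch.linspace(0, img_w - 1, x_logits.shape[-1])
+    yb = torch.linspace(0, img_h - 1, y_logits.shape[-1])
+    x = (x_logits.softmax(-1) * xb).sum(-1)  # expected x per keypoint
+    y = (y_logits.softmax(-1) * yb).sum(-1)  # expected y per keypoint
+    return torch.stack([x, y], dim=-1)
+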
+
+ comment: Accepted at CVPR 2024. Project page: + https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo +
+
+
+
+
+ + ♻ ☆ Zero-Shot Segmentation of Eye Features Using the Segment Anything Model + (SAM) + + +
+ The advent of foundation models signals a new era in artificial intelligence. +The Segment Anything Model (SAM) is the first foundation model for image +segmentation. In this study, we evaluate SAM's ability to segment features from +eye images recorded in virtual reality setups. The increasing requirement for +annotated eye-image datasets presents a significant opportunity for SAM to +redefine the landscape of data annotation in gaze estimation. Our investigation +centers on SAM's zero-shot learning abilities and the effectiveness of prompts +like bounding boxes or point clicks. Our results are consistent with studies in +other domains, demonstrating that SAM's segmentation effectiveness can be +on-par with specialized models depending on the feature, with prompts improving +its performance, evidenced by an IoU of 93.34% for pupil segmentation in one +dataset. Foundation models like SAM could revolutionize gaze estimation by +enabling quick and easy image segmentation, reducing reliance on specialized +models and extensive manual annotation. + +
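+ For reference, the IoU figure quoted above is the intersection over union
+of the predicted and ground-truth masks; a minimal version with boolean
+NumPy masks:
+
+import numpy as np
+
+def iou(pred_mask: np.ndarray, gt_mask: np.ndarray) -> float:
+    """Both masks are boolean arrays of identical shape."""
+    inter = np.logical_and(pred_mask, gt_mask).sum()
+    union = np.logical_or(pred_mask, gt_mask).sum()
+    return float(inter) / float(union) if union > 0 else 1.0
+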
+
+ comment: 14 pages, 8 figures, 1 table, Accepted to ETRA 2024: ACM Symposium on + Eye Tracking Research & Applications +
+
+
+
+
+ + ♻ ☆ Photo-SLAM: Real-time Simultaneous Localization and Photorealistic + Mapping for Monocular, Stereo, and RGB-D Cameras CVPR 2024 + + +
+ The integration of neural rendering and the SLAM system recently showed +promising results in joint localization and photorealistic view reconstruction. +However, existing methods, fully relying on implicit representations, are so +resource-hungry that they cannot run on portable devices, which deviates from +the original intention of SLAM. In this paper, we present Photo-SLAM, a novel +SLAM framework with a hyper primitives map. Specifically, we simultaneously +exploit explicit geometric features for localization and learn implicit +photometric features to represent the texture information of the observed +environment. In addition to actively densifying hyper primitives based on +geometric features, we further introduce a Gaussian-Pyramid-based training +method to progressively learn multi-level features, enhancing photorealistic +mapping performance. The extensive experiments with monocular, stereo, and +RGB-D datasets prove that our proposed system Photo-SLAM significantly +outperforms current state-of-the-art SLAM systems for online photorealistic +mapping, e.g., PSNR is 30% higher and rendering speed is hundreds of times +faster in the Replica dataset. Moreover, the Photo-SLAM can run at real-time +speed using an embedded platform such as Jetson AGX Orin, showing the potential +of robotics applications. + +
+
+ comment: CVPR 2024. Code: https://github.com/HuajianUP/Photo-SLAM - Project + Page: https://huajianup.github.io/research/Photo-SLAM/ +
+
+
+
+
+ + ♻ ☆ 360Loc: A Dataset and Benchmark for Omnidirectional Visual Localization + with Cross-device Queries CVPR 2024 + + +
+ Portable 360$^\circ$ cameras are becoming a cheap and efficient tool to +establish large visual databases. By capturing omnidirectional views of a +scene, these cameras could expedite building environment models that are +essential for visual localization. However, such an advantage is often +overlooked due to the lack of valuable datasets. This paper introduces a new +benchmark dataset, 360Loc, composed of 360$^\circ$ images with ground truth +poses for visual localization. We present a practical implementation of +360$^\circ$ mapping combining 360$^\circ$ images with lidar data to generate +the ground truth 6DoF poses. 360Loc is the first dataset and benchmark that +explores the challenge of cross-device visual positioning, involving +360$^\circ$ reference frames, and query frames from pinhole, ultra-wide FoV +fisheye, and 360$^\circ$ cameras. We propose a virtual camera approach to +generate lower-FoV query frames from 360$^\circ$ images, which ensures a fair +comparison of performance among different query types in visual localization +tasks. We also extend this virtual camera approach to feature matching-based +and pose regression-based methods to alleviate the performance loss caused by +the cross-device domain gap, and evaluate its effectiveness against +state-of-the-art baselines. We demonstrate that omnidirectional visual +localization is more robust in challenging large-scale scenes with symmetries +and repetitive structures. These results provide new insights into 360-camera +mapping and omnidirectional visual localization with cross-device queries. + +
+
+ comment: CVPR 2024. Project Page: https://huajianup.github.io/research/360Loc/ +
+
+
+
+
+ + ♻ ☆ Design as Desired: Utilizing Visual Question Answering for Multimodal + Pre-training + + +
+ Multimodal pre-training demonstrates its potential in the medical domain, +which learns medical visual representations from paired medical reports. +However, many pre-training tasks require extra annotations from clinicians, and +most of them fail to explicitly guide the model to learn the desired features +of different pathologies. To the best of our knowledge, we are the first to +utilize Visual Question Answering (VQA) for multimodal pre-training to guide +the framework focusing on targeted pathological features. In this work, we +leverage descriptions in medical reports to design multi-granular +question-answer pairs associated with different diseases, which assist the +framework in pre-training without requiring extra annotations from experts. We +also propose a novel pre-training framework with a quasi-textual feature +transformer, a module designed to transform visual features into a +quasi-textual space closer to the textual domain via a contrastive learning +strategy. This narrows the vision-language gap and facilitates modality +alignment. Our framework is applied to four downstream tasks: report +generation, classification, segmentation, and detection across five datasets. +Extensive experiments demonstrate the superiority of our framework compared to +other state-of-the-art methods. Our code will be released upon acceptance. + +
+
+
+
+
+ + ♻ ☆ LPSNet: End-to-End Human Pose and Shape Estimation with Lensless Imaging CVPR 2024 + + +
+ Human pose and shape (HPS) estimation with lensless imaging is not only
+beneficial for privacy protection but can also be used in covert
+surveillance scenarios, owing to the small size and simple structure of the
+device. However, this task presents significant challenges due to the
+inherent ambiguity of the captured measurements, and effective methods for
+directly estimating human pose and shape from lensless data are lacking. In
+this paper, we propose, to our knowledge, the first end-to-end framework
+that recovers 3D human poses and shapes from lensless measurements. We
+specifically design a multi-scale lensless feature decoder to decode the
+lensless measurements through the optically encoded mask for efficient
+feature extraction. We also propose a double-head auxiliary supervision
+mechanism to improve the estimation accuracy of human limb ends. In
+addition, we build a lensless imaging system and verify the effectiveness of
+our method on various datasets acquired with it.
+
+
+ comment: Accepted to CVPR 2024. More results available at + https://cic.tju.edu.cn/faculty/likun/projects/LPSNet +
+
+
+
+
+ + ♻ ☆ A ground-based dataset and a diffusion model for on-orbit low-light + image enhancement + + +
+ On-orbit servicing is important for maintaining the sustainability of the
+space environment. Space-based visible cameras are economical and
+lightweight sensors for situational awareness during on-orbit servicing.
+However, they are easily affected by low-illumination environments.
+Recently, deep learning has achieved remarkable success in the enhancement
+of natural images, but it is seldom applied in space due to the data
+bottleneck. In this article, we first propose a dataset of the Beidou
+Navigation Satellite for on-orbit low-light image enhancement (LLIE). In the
+automatic data collection scheme, we focus on reducing the domain gap and
+improving the diversity of the dataset. We collect hardware-in-the-loop
+images based on a robotic simulation testbed imitating space lighting
+conditions. To evenly sample poses of different orientations and distances
+without collision, a collision-free workspace and pose-stratified sampling
+are proposed. Afterwards, a novel diffusion model is proposed. To enhance
+image contrast without over-exposure or blurred details, we design a fused
+attention module to highlight structures and dark regions. Finally, we
+compare our method with previous methods on our dataset; the results
+indicate that our method has a better capacity for on-orbit LLIE.
+
+
+
+
+
+ + ♻ ☆ Representing Noisy Image Without Denoising + + +
+ A long-standing topic in artificial intelligence is the effective
+recognition of patterns from noisy images. In this regard, the recent
+data-driven paradigm considers 1) improving representation robustness by
+adding noisy samples in the training phase (i.e., data augmentation) or 2)
+pre-processing the noisy image by learning to solve the inverse problem
+(i.e., image denoising). However, such methods generally exhibit inefficient
+processing and unstable results, limiting their practical applications. In
+this paper, we explore a non-learning paradigm that aims to derive robust
+representations directly from noisy images, without denoising as
+pre-processing. Here, the noise-robust representation is designed as
+Fractional-order Moments in Radon space (FMR), which also offers the
+beneficial properties of orthogonality and rotation invariance. Unlike
+earlier integer-order methods, our work is a more generic design that
+includes such classical methods as special cases, and the introduced
+fractional-order parameter offers a time-frequency analysis capability that
+is not available in classical methods. Formally, both implicit and explicit
+paths for constructing the FMR are discussed in detail. Extensive simulation
+experiments and an image security application are provided to demonstrate
+the uniqueness and usefulness of our FMR, especially for noise robustness,
+rotation invariance, and time-frequency discriminability.
+
+
+ comment: Accepted by IEEE Transactions on Pattern Analysis and Machine + Intelligence, 2024 +
+
+
+
+
+ + ♻ ☆ PEEB: Part-based Image Classifiers with an Explainable and Editable + Language Bottleneck NAACL 2024 + + +
+ CLIP-based classifiers rely on the prompt containing a {class name} that is +known to the text encoder. Therefore, they perform poorly on new classes or the +classes whose names rarely appear on the Internet (e.g., scientific names of +birds). For fine-grained classification, we propose PEEB - an explainable and +editable classifier to (1) express the class name into a set of text +descriptors that describe the visual parts of that class; and (2) match the +embeddings of the detected parts to their textual descriptors in each class to +compute a logit score for classification. In a zero-shot setting where the +class names are unknown, PEEB outperforms CLIP by a huge margin (~10x in top-1 +accuracy). Compared to part-based classifiers, PEEB is not only the +state-of-the-art (SOTA) on the supervised-learning setting (88.80% and 92.20% +accuracy on CUB-200 and Dogs-120, respectively) but also the first to enable +users to edit the text descriptors to form a new classifier without any +re-training. Compared to concept bottleneck models, PEEB is also the SOTA in +both zero-shot and supervised-learning settings. + +
+
+ comment: Findings of NAACL 2024 (long paper) +
+
+
+
+
+ + ♻ ☆ Swap Attention in Spatiotemporal Diffusions for Text-to-Video Generation + + +
+ With the explosive popularity of AI-generated content (AIGC), video
+generation has recently received a lot of attention. Generating videos
+guided by text instructions poses significant challenges, such as modeling
+the complex relationship between space and time, and the lack of large-scale
+text-video paired data. Existing text-video datasets suffer from limitations
+in both content quality and scale, or they are not open-source, rendering
+them inaccessible for study and use. For model design, previous approaches
+extend pretrained text-to-image generation models by adding temporal 1D
+convolution/attention modules for video generation. However, these
+approaches overlook the importance of jointly modeling space and time,
+inevitably leading to temporal distortions and misalignment between texts
+and videos. In this paper, we propose a novel approach that strengthens the
+interaction between spatial and temporal perceptions. In particular, we
+utilize a swapped cross-attention mechanism in 3D windows that alternates
+the ``query'' role between spatial and temporal blocks, enabling the two to
+mutually reinforce each other. Moreover, to fully unlock model capabilities
+for high-quality video generation and promote the development of the field,
+we curate a large-scale and open-source video dataset called HD-VG-130M.
+This dataset comprises 130 million text-video pairs from the open domain,
+ensuring high-definition, widescreen and watermark-free characteristics. A
+smaller-scale yet more meticulously cleaned subset further enhances the data
+quality, aiding models in achieving superior performance. Experimental
+quantitative and qualitative results demonstrate the superiority of our
+approach in terms of per-frame quality, temporal correlation, and text-video
+alignment, with clear margins.
+
+
+
+
+
+ + ♻ ☆ InstaGen: Enhancing Object Detection by Training on Synthetic Dataset CVPR2024 + + +
+ In this paper, we present a novel paradigm for enhancing the ability of
+object detectors, e.g., expanding categories or improving detection
+performance, by training on synthetic datasets generated by diffusion
+models. Specifically, we integrate an instance-level grounding head into a
+pre-trained, generative diffusion model to augment it with the ability to
+localise instances in the generated images. The grounding head is trained to
+align the text embedding of category names with the regional visual features
+of the diffusion model, using supervision from an off-the-shelf object
+detector and a novel self-training scheme on (novel) categories not covered
+by the detector. We conduct thorough experiments to show that this enhanced
+version of the diffusion model, termed InstaGen, can serve as a data
+synthesizer that enhances object detectors trained on its generated samples,
+demonstrating superior performance over existing state-of-the-art methods in
+open-vocabulary (+4.5 AP) and data-sparse (+1.2 to 5.2 AP) scenarios.
+Project page with code: https://fcjian.github.io/InstaGen.
+
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ SIFU: Side-view Conditioned Implicit Function for Real-world Usable + Clothed Human Reconstruction CVPR 2024 + + +
+ Creating high-quality 3D models of clothed humans from single images for
+real-world applications is crucial. Despite recent advancements, accurately
+reconstructing humans in complex poses or with loose clothing from
+in-the-wild images, along with predicting textures for unseen areas, remains
+a significant challenge. A key limitation of previous methods is their
+insufficient prior guidance in transitioning from 2D to 3D and in texture
+prediction. In response, we introduce SIFU (Side-view Conditioned Implicit
+Function for Real-world Usable Clothed Human Reconstruction), a novel
+approach combining a Side-view Decoupling Transformer with a 3D Consistent
+Texture Refinement pipeline. SIFU employs a cross-attention mechanism within
+the transformer, using SMPL-X normals as queries to effectively decouple
+side-view features in the process of mapping 2D features to 3D. This method
+not only improves the precision of the 3D models but also their robustness,
+especially when SMPL-X estimates are not perfect. Our texture refinement
+process leverages a text-to-image diffusion-based prior to generate
+realistic and consistent textures for invisible views. Through extensive
+experiments, SIFU surpasses SOTA methods in both geometry and texture
+reconstruction, showcasing enhanced robustness in complex scenarios and
+achieving unprecedented Chamfer and P2S measurements. Our approach extends
+to practical applications such as 3D printing and scene building,
+demonstrating its broad utility in real-world scenarios. Project page
+https://river-zhang.github.io/SIFU-projectpage/ .
+
+
+ comment: Accepted by CVPR 2024; Project page + https://river-zhang.github.io/SIFU-projectpage/ +
+
+
+
+
+ + ♻ ☆ SAOR: Single-View Articulated Object Reconstruction CVPR 2024 + + +
+ We introduce SAOR, a novel approach for estimating the 3D shape, texture, and +viewpoint of an articulated object from a single image captured in the wild. +Unlike prior approaches that rely on pre-defined category-specific 3D templates +or tailored 3D skeletons, SAOR learns to articulate shapes from single-view +image collections with a skeleton-free part-based model without requiring any +3D object shape priors. To prevent ill-posed solutions, we propose a +cross-instance consistency loss that exploits disentangled object shape +deformation and articulation. This is helped by a new silhouette-based sampling +mechanism to enhance viewpoint diversity during training. Our method only +requires estimated object silhouettes and relative depth maps from +off-the-shelf pre-trained networks during training. At inference time, given a +single-view image, it efficiently outputs an explicit mesh representation. We +obtain improved qualitative and quantitative results on challenging quadruped +animals compared to relevant existing work. + +
+
+ comment: Accepted to CVPR 2024, website: https://mehmetaygun.github.io/saor +
+
+
+
+
+ + ♻ ☆ CA-Jaccard: Camera-aware Jaccard Distance for Person Re-identification CVPR 2024 + + +
+ Person re-identification (re-ID) is a challenging task that aims to learn +discriminative features for person retrieval. In person re-ID, Jaccard distance +is a widely used distance metric, especially in re-ranking and clustering +scenarios. However, we discover that camera variation has a significant +negative impact on the reliability of Jaccard distance. In particular, Jaccard +distance calculates the distance based on the overlap of relevant neighbors. +Due to camera variation, intra-camera samples dominate the relevant neighbors, +which reduces the reliability of the neighbors by introducing intra-camera +negative samples and excluding inter-camera positive samples. To overcome this +problem, we propose a novel camera-aware Jaccard (CA-Jaccard) distance that +leverages camera information to enhance the reliability of Jaccard distance. +Specifically, we design camera-aware k-reciprocal nearest neighbors (CKRNNs) to +find k-reciprocal nearest neighbors on the intra-camera and inter-camera +ranking lists, which improves the reliability of relevant neighbors and +guarantees the contribution of inter-camera samples in the overlap. Moreover, +we propose a camera-aware local query expansion (CLQE) to mine reliable samples +in relevant neighbors by exploiting camera variation as a strong constraint and +assign these samples higher weights in overlap, further improving the +reliability. Our CA-Jaccard distance is simple yet effective and can serve as a +general distance metric for person re-ID methods with high reliability and low +computational cost. Extensive experiments demonstrate the effectiveness of our +method. + +
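+ For context, the sketch below computes the vanilla Jaccard distance from
+k-reciprocal nearest-neighbor sets, the quantity that CA-Jaccard makes
+camera-aware; the camera-aware variant would instead gather neighbors from
+separate intra-camera and inter-camera ranking lists, which is omitted here
+for brevity.
+
+import numpy as np
+
+def k_reciprocal(dist: np.ndarray, i: int, k: int) -> set:
+    """Indices j such that i and j appear in each other's top-(k+1) list."""
+    fwd = set(np.argsort(dist[i])[:k + 1])
+    return {j for j in fwd if i in np.argsort(dist[j])[:k + 1]}
+
+def jaccard_distance(dist: np.ndarray, i: int, j: int, k: int = 20) -> float:
+    ni, nj = k_reciprocal(dist, i, k), k_reciprocal(dist, j, k)
+    union = len(ni | nj)
+    return 1.0 - len(ni & nj) / union if union else 1.0
+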
+
+ comment: This paper is accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SegmentAnything helps microscopy images based automatic and quantitative + organoid detection and analysis + + +
+ Organoids are self-organized 3D cell clusters that closely mimic the +architecture and function of in vivo tissues and organs. Quantification of +organoid morphology helps in studying organ development, drug discovery, and +toxicity assessment. Recent microscopy techniques provide a potent tool to +acquire organoid morphology features, but manual image analysis remains a labor +and time-intensive process. Thus, this paper proposes a comprehensive pipeline +for microscopy analysis that leverages the SegmentAnything to precisely +demarcate individual organoids. Additionally, we introduce a set of +morphological properties, including perimeter, area, radius, non-smoothness, +and non-circularity, allowing researchers to analyze the organoid structures +quantitatively and automatically. To validate the effectiveness of our +approach, we conducted tests on bright-field images of human induced +pluripotent stem cells (iPSCs) derived neural-epithelial (NE) organoids. The +results obtained from our automatic pipeline closely align with manual organoid +detection and measurement, showcasing the capability of our proposed method in +accelerating organoids morphology analysis. + +
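+ As an illustration of the morphological properties listed above, the
+snippet below measures area, perimeter, and circularity per labelled
+organoid with scikit-image; it is a generic sketch rather than the paper's
+pipeline, which additionally relies on SegmentAnything for the masks.
+
+import numpy as np
+from skimage import measure
+
+def organoid_stats(label_mask: np.ndarray):
+    """label_mask: integer-labelled mask, one label per organoid."""
+    stats = []
+    for region in measure.regionprops(label_mask):
+        area, perim = region.area, region.perimeter
+        # Circularity = 4*pi*A / P^2; equals 1.0 for a perfect circle.
+        circ = 4.0 * np.pi * area / (perim ** 2) if perim > 0 else 0.0
+        stats.append({"label": region.label, "area": area,
+                      "perimeter": perim, "circularity": circ})
+    return stats
+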
+
+ comment: Replace Figure 4 with the correct version. The original version is + wrong due to a column name mismatch +
+
+
+
+
+ + ♻ ☆ Understanding normalization in contrastive representation learning and + out-of-distribution detection + + +
+ Contrastive representation learning has emerged as an outstanding approach +for anomaly detection. In this work, we explore the $\ell_2$-norm of +contrastive features and its applications in out-of-distribution detection. We +propose a simple method based on contrastive learning, which incorporates +out-of-distribution data by discriminating against normal samples in the +contrastive layer space. Our approach can be applied flexibly as an outlier +exposure (OE) approach, where the out-of-distribution data is a huge collective +of random images, or as a fully self-supervised learning approach, where the +out-of-distribution data is self-generated by applying distribution-shifting +transformations. The ability to incorporate additional out-of-distribution +samples enables a feasible solution for datasets where AD methods based on +contrastive learning generally underperform, such as aerial images or +microscopy images. Furthermore, the high-quality features learned through +contrastive learning consistently enhance performance in OE scenarios, even +when the available out-of-distribution dataset is not diverse enough. Our +extensive experiments demonstrate the superiority of our proposed method under +various scenarios, including unimodal and multimodal settings, with various +image datasets. + +
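+ A minimal sketch of the core quantity studied above, the $\ell_2$-norm of
+contrastive features used as an out-of-distribution score; the encoder is
+assumed to be any pretrained feature extractor, and which direction of the
+score indicates out-of-distribution data is left as an empirical choice.
+
+import torch
+
+@torch.no_grad()
+def ood_scores(encoder: torch.nn.Module, images: torch.Tensor) -> torch.Tensor:
+    feats = encoder(images)        # (B, D) features from the contrastive layer
+    return feats.norm(p=2, dim=1)  # per-sample L2 norm as the score
+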
+
+
+
+
+ + ♻ ☆ Confronting Ambiguity in 6D Object Pose Estimation via Score-Based + Diffusion on SE(3) CVPR2024 + + +
+ Addressing pose ambiguity in 6D object pose estimation from single RGB images +presents a significant challenge, particularly due to object symmetries or +occlusions. In response, we introduce a novel score-based diffusion method +applied to the $SE(3)$ group, marking the first application of diffusion models +to $SE(3)$ within the image domain, specifically tailored for pose estimation +tasks. Extensive evaluations demonstrate the method's efficacy in handling pose +ambiguity, mitigating perspective-induced ambiguity, and showcasing the +robustness of our surrogate Stein score formulation on $SE(3)$. This +formulation not only improves the convergence of denoising process but also +enhances computational efficiency. Thus, we pioneer a promising strategy for 6D +object pose estimation. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ Neural Implicit Morphing of Face Images CVPR 2024 + + +
+ Face morphing is a problem in computer graphics with numerous artistic and +forensic applications. It is challenging due to variations in pose, lighting, +gender, and ethnicity. This task consists of a warping for feature alignment +and a blending for a seamless transition between the warped images. We propose +to leverage coord-based neural networks to represent such warpings and +blendings of face images. During training, we exploit the smoothness and +flexibility of such networks by combining energy functionals employed in +classical approaches without discretizations. Additionally, our method is +time-dependent, allowing a continuous warping/blending of the images. During +morphing inference, we need both direct and inverse transformations of the +time-dependent warping. The first (second) is responsible for warping the +target (source) image into the source (target) image. Our neural warping stores +those maps in a single network dismissing the need for inverting them. The +results of our experiments indicate that our method is competitive with both +classical and generative models under the lens of image quality and +face-morphing detectors. Aesthetically, the resulting images present a seamless +blending of diverse faces not yet usual in the literature. + +
+
+ comment: 14 pages, 20 figures, accepted for CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SegForestNet: Spatial-Partitioning-Based Aerial Image Segmentation + + +
+ Aerial image segmentation is the basis for applications such as
+automatically creating maps or tracking deforestation. In true orthophotos,
+which are often used in these applications, many objects and regions can be
+approximated well by polygons. However, this fact is rarely exploited by
+state-of-the-art semantic segmentation models. Instead, most models allow
+unnecessary degrees of freedom in their predictions by allowing arbitrary
+region shapes. We therefore present a refinement of our deep learning model
+which predicts binary space partitioning trees, an efficient polygon
+representation. The refinements include a new feature decoder architecture
+and a new differentiable BSP tree renderer, both of which avoid vanishing
+gradients. Additionally, we designed a novel loss function specifically
+aimed at improving the spatial partitioning defined by the predicted trees.
+Furthermore, our expanded model can predict multiple trees at once and can
+thus predict class-specific segmentations. As an additional contribution, we
+investigate the impact of a non-optimal training process in comparison to an
+optimized training process. While model architectures optimized for aerial
+images, such as PFNet or our own model, show an advantage under non-optimal
+conditions, this advantage disappears under optimal training conditions.
+Despite this observation, our model still makes better predictions for small
+rectangular objects, e.g., cars.
+
+
+
+
+
+ + ♻ ☆ Synthetic data shuffling accelerates the convergence of federated + learning under data heterogeneity + + +
+ In federated learning, data heterogeneity is a critical challenge. A +straightforward solution is to shuffle the clients' data to homogenize the +distribution. However, this may violate data access rights, and how and when +shuffling can accelerate the convergence of a federated optimization algorithm +is not theoretically well understood. In this paper, we establish a precise and +quantifiable correspondence between data heterogeneity and parameters in the +convergence rate when a fraction of data is shuffled across clients. We prove +that shuffling can quadratically reduce the gradient dissimilarity with respect +to the shuffling percentage, accelerating convergence. Inspired by the theory, +we propose a practical approach that addresses the data access rights issue by +shuffling locally generated synthetic data. The experimental results show that +shuffling synthetic data improves the performance of multiple existing +federated learning algorithms by a large margin. + +
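+ To make the mechanism concrete, here is a toy sketch of shuffling a
+fraction p of (synthetic) samples across clients; clients are plain Python
+lists here and the round-robin redistribution is an illustrative choice.
+
+import random
+
+def shuffle_fraction(clients, p: float, seed: int = 0):
+    """clients: list of per-client sample lists; moves a fraction p of each."""
+    rng = random.Random(seed)
+    pool = []
+    for data in clients:
+        rng.shuffle(data)
+        cut = int(p * len(data))
+        pool.extend(data[:cut])
+        del data[:cut]
+    rng.shuffle(pool)
+    for i, sample in enumerate(pool):  # redistribute round-robin
+        clients[i % len(clients)].append(sample)
+    return clients
+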
+
+ comment: Accepted at TMLR +
+
+
+
+
+ + ♻ ☆ Learning Optical Flow and Scene Flow with Bidirectional Camera-LiDAR + Fusion + + +
+ In this paper, we study the problem of jointly estimating the optical flow +and scene flow from synchronized 2D and 3D data. Previous methods either employ +a complex pipeline that splits the joint task into independent stages, or fuse +2D and 3D information in an ``early-fusion'' or ``late-fusion'' manner. Such +one-size-fits-all approaches suffer from a dilemma of failing to fully utilize +the characteristic of each modality or to maximize the inter-modality +complementarity. To address the problem, we propose a novel end-to-end +framework, which consists of 2D and 3D branches with multiple bidirectional +fusion connections between them in specific layers. Different from previous +work, we apply a point-based 3D branch to extract the LiDAR features, as it +preserves the geometric structure of point clouds. To fuse dense image features +and sparse point features, we propose a learnable operator named bidirectional +camera-LiDAR fusion module (Bi-CLFM). We instantiate two types of the +bidirectional fusion pipeline, one based on the pyramidal coarse-to-fine +architecture (dubbed CamLiPWC), and the other one based on the recurrent +all-pairs field transforms (dubbed CamLiRAFT). On FlyingThings3D, both CamLiPWC +and CamLiRAFT surpass all existing methods and achieve up to a 47.9\% reduction +in 3D end-point-error from the best published result. Our best-performing +model, CamLiRAFT, achieves an error of 4.26\% on the KITTI Scene Flow +benchmark, ranking 1st among all submissions with much fewer parameters. +Besides, our methods have strong generalization performance and the ability to +handle non-rigid motion. Code is available at +https://github.com/MCG-NJU/CamLiFlow. + +
+
+ comment: Accepted to TPAMI 2023 +
+
+
+
+
+ + ♻ ☆ Burst Super-Resolution with Diffusion Models for Improving Perceptual + Quality IJCNN 2024 + + +
+ While burst LR images are useful for improving the SR image quality compared +with a single LR image, prior SR networks accepting the burst LR images are +trained in a deterministic manner, which is known to produce a blurry SR image. +In addition, it is difficult to perfectly align the burst LR images, making the +SR image more blurry. Since such blurry images are perceptually degraded, we +aim to reconstruct the sharp high-fidelity boundaries. Such high-fidelity +images can be reconstructed by diffusion models. However, prior SR methods +using the diffusion model are not properly optimized for the burst SR task. +Specifically, the reverse process starting from a random sample is not +optimized for image enhancement and restoration methods, including burst SR. In +our proposed method, on the other hand, burst LR features are used to +reconstruct the initial burst SR image that is fed into an intermediate step in +the diffusion model. This reverse process from the intermediate step 1) skips +diffusion steps for reconstructing the global structure of the image and 2) +focuses on steps for refining detailed textures. Our experimental results +demonstrate that our method can improve the scores of the perceptual quality +metrics. Code: https://github.com/placerkyo/BSRD + +
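+ The "start from an intermediate step" idea can be sketched generically as
+follows (standard DDPM notation; `model` is an assumed noise-prediction
+network and the schedule handling is simplified): the initial burst SR
+estimate is noised to timestep t0 and the reverse process runs only from
+there instead of from pure noise.
+
+import torch
+
+@torch.no_grad()
+def sample_from_intermediate(model, x_init, betas, t0):
+    """Reverse DDPM loop started from a noised initial SR estimate."""
+    alphas = 1.0 - betas
+    a_bar = torch.cumprod(alphas, dim=0)
+    # Noise the initial estimate to timestep t0.
+    x = a_bar[t0].sqrt() * x_init + (1 - a_bar[t0]).sqrt() * torch.randn_like(x_init)
+    for t in range(t0, -1, -1):
+        t_batch = torch.full((x.shape[0],), t, device=x.device)
+        eps = model(x, t_batch)  # predicted noise
+        mean = (x - betas[t] / (1 - a_bar[t]).sqrt() * eps) / alphas[t].sqrt()
+        x = mean + betas[t].sqrt() * torch.randn_like(x) if t > 0 else mean
+    return x
+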
+
+ comment: Accepted to IJCNN 2024 (International Joint Conference on Neural + Networks) +
+
+
+
+
+ + ♻ ☆ Gyro-based Neural Single Image Deblurring + + +
+ In this paper, we present GyroDeblurNet, a novel single image deblurring +method that utilizes a gyro sensor to effectively resolve the ill-posedness of +image deblurring. The gyro sensor provides valuable information about camera +motion during exposure time that can significantly improve deblurring quality. +However, effectively exploiting real-world gyro data is challenging due to +significant errors from various sources including sensor noise, the disparity +between the positions of a camera module and a gyro sensor, the absence of +translational motion information, and moving objects whose motions cannot be +captured by a gyro sensor. To handle gyro error, GyroDeblurNet is equipped with +two novel neural network blocks: a gyro refinement block and a gyro deblurring +block. The gyro refinement block refines the error-ridden gyro data using the +blur information from the input image. On the other hand, the gyro deblurring +block removes blur from the input image using the refined gyro data and further +compensates for gyro error by leveraging the blur information from the input +image. For training a neural network with erroneous gyro data, we propose a +training strategy based on the curriculum learning. We also introduce a novel +gyro data embedding scheme to represent real-world intricate camera shakes. +Finally, we present a synthetic dataset and a real dataset for the training and +evaluation of gyro-based single image deblurring. Our experiments demonstrate +that our approach achieves state-of-the-art deblurring quality by effectively +utilizing erroneous gyro data. + +
+
+ comment: 14 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ A Comprehensive Review of Knowledge Distillation in Computer Vision + + +
+ Deep learning techniques have been demonstrated to surpass preceding +cutting-edge machine learning techniques in recent years, with computer vision +being one of the most prominent examples. However, deep learning models suffer +from significant drawbacks when deployed in resource-constrained environments +due to their large model size and high complexity. Knowledge Distillation is +one of the prominent solutions to overcome this challenge. This review paper +examines the current state of research on knowledge distillation, a technique +for compressing complex models into smaller and simpler ones. The paper +provides an overview of the major principles and techniques associated with +knowledge distillation and reviews the applications of knowledge distillation +in the domain of computer vision. The review focuses on the benefits of +knowledge distillation, as well as the problems that must be overcome to +improve its effectiveness. + +
+
+ comment: 36 pages ,10 figures +
+
+
+
+
+ + ♻ ☆ Autoregressive Omni-Aware Outpainting for Open-Vocabulary 360-Degree + Image Generation AAAI 24 + + +
+ A 360-degree (omni-directional) image provides an all-encompassing spherical
+view of a scene. Recently, there has been an increasing interest in
+synthesizing 360-degree images from conventional narrow field of view (NFoV)
+images captured by digital cameras and smartphones, for providing immersive
+experiences in various scenarios such as virtual reality. Yet, existing methods
+typically fall short in synthesizing intricate visual details or ensuring that
+the generated images align consistently with user-provided prompts. In this
+study, an autoregressive omni-aware generative network (AOG-Net) is proposed
+for 360-degree image generation by out-painting an incomplete 360-degree image
+progressively with NFoV and text guidance, jointly or individually. This
+autoregressive scheme not only allows for deriving finer-grained and
+text-consistent patterns by dynamically generating and adjusting the process
+but also offers users greater flexibility to edit their conditions throughout
+the generation process. A global-local conditioning mechanism is devised to
+comprehensively formulate the outpainting guidance in each autoregressive step.
+Text guidance, omni-visual cues, NFoV inputs and omni-geometry are encoded and
+further formulated with cross-attention based transformers into a global stream
+and a local stream that condition a generative backbone model. As AOG-Net is
+compatible with leveraging large-scale models for the conditional encoder and
+the generative prior, it enables generation with extensive open-vocabulary text
+guidance. Comprehensive experiments on two commonly used 360-degree image
+datasets for both indoor and outdoor settings demonstrate the state-of-the-art
+performance of our proposed method. Our code will be made publicly available.
+
+
+
+ comment: Accepted by AAAI 24 +
+
+
+
+
+ + ♻ ☆ Feature 3DGS: Supercharging 3D Gaussian Splatting to Enable Distilled + Feature Fields + + +
+ 3D scene representations have gained immense popularity in recent years. +Methods that use Neural Radiance fields are versatile for traditional tasks +such as novel view synthesis. In recent times, some work has emerged that aims +to extend the functionality of NeRF beyond view synthesis, for semantically +aware tasks such as editing and segmentation using 3D feature field +distillation from 2D foundation models. However, these methods have two major +limitations: (a) they are limited by the rendering speed of NeRF pipelines, and +(b) implicitly represented feature fields suffer from continuity artifacts +reducing feature quality. Recently, 3D Gaussian Splatting has shown +state-of-the-art performance on real-time radiance field rendering. In this +work, we go one step further: in addition to radiance field rendering, we +enable 3D Gaussian splatting on arbitrary-dimension semantic features via 2D +foundation model distillation. This translation is not straightforward: naively +incorporating feature fields in the 3DGS framework encounters significant +challenges, notably the disparities in spatial resolution and channel +consistency between RGB images and feature maps. We propose architectural and +training changes to efficiently avert this problem. Our proposed method is +general, and our experiments showcase novel view semantic segmentation, +language-guided editing and segment anything through learning feature fields +from state-of-the-art 2D foundation models such as SAM and CLIP-LSeg. Across +experiments, our distillation method is able to provide comparable or better +results, while being significantly faster to both train and render. +Additionally, to the best of our knowledge, we are the first method to enable +point and bounding-box prompting for radiance field manipulation, by leveraging +the SAM model. Project website at: https://feature-3dgs.github.io/ + +
+
+
+
+
+ + ♻ ☆ Unifying Correspondence, Pose and NeRF for Pose-Free Novel View + Synthesis from Stereo Pairs CVPR2024 + + +
+ This work delves into the task of pose-free novel view synthesis from stereo +pairs, a challenging and pioneering task in 3D vision. Our innovative +framework, unlike any before, seamlessly integrates 2D correspondence matching, +camera pose estimation, and NeRF rendering, fostering a synergistic enhancement +of these tasks. We achieve this through designing an architecture that utilizes +a shared representation, which serves as a foundation for enhanced 3D geometry +understanding. Capitalizing on the inherent interplay between the tasks, our +unified framework is trained end-to-end with the proposed training strategy to +improve overall model accuracy. Through extensive evaluations across diverse +indoor and outdoor scenes from two real-world datasets, we demonstrate that our +approach achieves substantial improvement over previous methodologies, +especially in scenarios characterized by extreme viewpoint changes and the +absence of accurate camera poses. + +
+
+ comment: Project page: https://ku-cvlab.github.io/CoPoNeRF/ CVPR2024 camera + ready version (Highlight) +
+
+
+
+
+ + ♻ ☆ UAV-Rain1k: A Benchmark for Raindrop Removal from UAV Aerial Imagery CVPR + + +
+ Raindrops adhering to the lens of UAVs can obstruct the visibility of the
+background scene and degrade image quality. Despite recent progress in image
+deraining methods and datasets, there is a lack of focus on raindrop removal
+from UAV aerial imagery due to the unique challenges posed by varying angles
+and rapid movement during drone flight. To fill the gap in this research, we
+first construct a new benchmark dataset for removing raindrops from UAV images,
+called UAV-Rain1k. In this letter, we provide a dataset generation pipeline,
+which includes modeling raindrop shapes using Blender, collecting background
+images from various UAV angles, random sampling of rain masks, and so on. Based
+on the proposed benchmark, we further present a comprehensive evaluation of
+existing representative image deraining algorithms, and reveal future research
+opportunities worth exploring. The proposed dataset is publicly available at
+https://github.com/cschenxiang/UAV-Rain1k.
+
+
+
+ comment: Accepted by IEEE/CVF Conference on Computer Vision and Pattern + Recognition Workshops (CVPRW) 2024 +
+
+
+
+
+ + ♻ ☆ Fully Sparse 3D Occupancy Prediction + + +
+ Occupancy prediction plays a pivotal role in autonomous driving. Previous
+methods typically construct dense 3D volumes, neglecting the inherent sparsity
+of the scene and suffering from high computational costs. To bridge the gap, we
+introduce a novel fully sparse occupancy network, termed SparseOcc. SparseOcc
+initially reconstructs a sparse 3D representation from visual inputs and
+subsequently predicts semantic/instance occupancy from the 3D sparse
+representation by sparse queries. A mask-guided sparse sampling scheme is
+designed to enable sparse queries to interact with 2D features in a fully
+sparse manner, thereby circumventing costly dense features or global attention.
+Additionally, we design a thoughtful ray-based evaluation metric, namely
+RayIoU, to address the inconsistent penalization along depth that arises in
+traditional voxel-level mIoU criteria. SparseOcc demonstrates its effectiveness
+by achieving a RayIoU of 34.0, while maintaining a real-time inference speed of
+17.3 FPS, with 7 history frames as input. By increasing the number of preceding
+frames to 15, SparseOcc continuously improves its performance to 35.1 RayIoU
+without bells and whistles. Code is available at
+https://github.com/MCG-NJU/SparseOcc.
+
+
+
+ comment: Add new metric: RayIoU +
+
+
+
+
+ + ♻ ☆ Enhancing Ship Classification in Optical Satellite Imagery: Integrating + Convolutional Block Attention Module with ResNet for Improved Performance + + +
+ This study presents an advanced Convolutional Neural Network (CNN) +architecture for ship classification from optical satellite imagery, +significantly enhancing performance through the integration of the +Convolutional Block Attention Module (CBAM) and additional architectural +innovations. Building upon the foundational ResNet50 model, we first +incorporated a standard CBAM to direct the model's focus towards more +informative features, achieving an accuracy of 87% compared to the baseline +ResNet50's 85%. Further augmentations involved multi-scale feature integration, +depthwise separable convolutions, and dilated convolutions, culminating in the +Enhanced ResNet Model with Improved CBAM. This model demonstrated a remarkable +accuracy of 95%, with precision, recall, and f1-scores all witnessing +substantial improvements across various ship classes. The bulk carrier and oil +tanker classes, in particular, showcased nearly perfect precision and recall +rates, underscoring the model's enhanced capability in accurately identifying +and classifying ships. Attention heatmap analyses further validated the +improved model's efficacy, revealing a more focused attention on relevant ship +features, regardless of background complexities. These findings underscore the +potential of integrating attention mechanisms and architectural innovations in +CNNs for high-resolution satellite imagery classification. The study navigates +through the challenges of class imbalance and computational costs, proposing +future directions towards scalability and adaptability in new or rare ship type +recognition. This research lays a groundwork for the application of advanced +deep learning techniques in the domain of remote sensing, offering insights +into scalable and efficient satellite image classification. + +
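+
+ The Convolutional Block Attention Module mentioned above applies channel
+attention followed by spatial attention to a feature map. A generic PyTorch
+sketch of CBAM (not the authors' enhanced ResNet variant) looks roughly like
+this:
+
+  import torch
+  import torch.nn as nn
+
+  class CBAM(nn.Module):
+      """Channel attention then spatial attention on a (B, C, H, W) feature map."""
+      def __init__(self, channels, reduction=16, kernel_size=7):
+          super().__init__()
+          self.mlp = nn.Sequential(  # shared MLP for channel attention
+              nn.Conv2d(channels, channels // reduction, 1, bias=False),
+              nn.ReLU(inplace=True),
+              nn.Conv2d(channels // reduction, channels, 1, bias=False),
+          )
+          self.spatial = nn.Conv2d(2, 1, kernel_size, padding=kernel_size // 2,
+                                   bias=False)
+
+      def forward(self, x):
+          # Channel attention: squeeze spatial dims with avg- and max-pooling.
+          avg = self.mlp(x.mean(dim=(2, 3), keepdim=True))
+          mx = self.mlp(x.amax(dim=(2, 3), keepdim=True))
+          x = x * torch.sigmoid(avg + mx)
+          # Spatial attention: squeeze channels with avg- and max-pooling.
+          s = torch.cat([x.mean(dim=1, keepdim=True),
+                         x.amax(dim=1, keepdim=True)], dim=1)
+          return x * torch.sigmoid(self.spatial(s))
+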
+
+
+
+
+ + ♻ ☆ Holistic Inverse Rendering of Complex Facade via Aerial 3D Scanning + + +
+ In this work, we use multi-view aerial images to reconstruct the geometry,
+lighting, and material of facades using neural signed distance fields (SDFs).
+Without the requirement of complex equipment, our method only takes simple RGB
+images captured by a drone as inputs to enable physically based and
+photorealistic novel-view rendering, relighting, and editing. However, a
+real-world facade usually has complex appearances ranging from diffuse rocks
+with subtle details to large-area glass windows with specular reflections,
+making it hard to attend to everything. As a result, previous methods can
+preserve the geometry details but fail to reconstruct smooth glass windows, or
+vice versa. In order to address this challenge, we introduce three spatial- and
+semantic-adaptive optimization strategies, including a semantic regularization
+approach based on zero-shot segmentation techniques to improve material
+consistency, a frequency-aware geometry regularization to balance surface
+smoothness and details in different surfaces, and a visibility probe-based
+scheme to enable efficient modeling of the local lighting in large-scale
+outdoor environments. In addition, we capture a real-world facade aerial 3D
+scanning image set and corresponding point clouds for training and
+benchmarking. The experiments demonstrate the superior quality of our method on
+facade holistic inverse rendering, novel view synthesis, and scene editing
+compared to state-of-the-art baselines.
+
+
+
+
+
+
+ + ♻ ☆ Temporally Consistent Unbalanced Optimal Transport for Unsupervised + Action Segmentation CVPR 2024 + + +
+ We propose a novel approach to the action segmentation task for long, +untrimmed videos, based on solving an optimal transport problem. By encoding a +temporal consistency prior into a Gromov-Wasserstein problem, we are able to +decode a temporally consistent segmentation from a noisy affinity/matching cost +matrix between video frames and action classes. Unlike previous approaches, our +method does not require knowing the action order for a video to attain temporal +consistency. Furthermore, our resulting (fused) Gromov-Wasserstein problem can +be efficiently solved on GPUs using a few iterations of projected mirror +descent. We demonstrate the effectiveness of our method in an unsupervised +learning setting, where our method is used to generate pseudo-labels for +self-training. We evaluate our segmentation approach and unsupervised learning +pipeline on the Breakfast, 50-Salads, YouTube Instructions and Desktop Assembly +datasets, yielding state-of-the-art results for the unsupervised video action +segmentation task. + +
+
+ comment: Accepted to CVPR 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ MVSA-Net: Multi-View State-Action Recognition for Robust and Deployable + Trajectory Generation AAAI-2024 + + +
+ The learn-from-observation (LfO) paradigm is a human-inspired mode for a +robot to learn to perform a task simply by watching it being performed. LfO can +facilitate robot integration on factory floors by minimizing disruption and +reducing tedious programming. A key component of the LfO pipeline is a +transformation of the depth camera frames to the corresponding task state and +action pairs, which are then relayed to learning techniques such as imitation +or inverse reinforcement learning for understanding the task parameters. While +several existing computer vision models analyze videos for activity +recognition, SA-Net specifically targets robotic LfO from RGB-D data. However, +SA-Net and many other models analyze frame data captured from a single +viewpoint. Their analysis is therefore highly sensitive to occlusions of the +observed task, which are frequent in deployments. An obvious way of reducing +occlusions is to simultaneously observe the task from multiple viewpoints and +synchronously fuse the multiple streams in the model. Toward this, we present +multi-view SA-Net, which generalizes the SA-Net model to allow the perception +of multiple viewpoints of the task activity, integrate them, and better +recognize the state and action in each frame. Performance evaluations on two +distinct domains establish that MVSA-Net recognizes the state-action pairs +under occlusion more accurately compared to single-view MVSA-Net and other +baselines. Our ablation studies further evaluate its performance under +different ambient conditions and establish the contribution of the architecture +components. As such, MVSA-Net offers a significantly more robust and deployable +state-action trajectory generation compared to previous methods. + +
+
+ comment: Presented at Deployable AI Workshop at AAAI-2024 and 'Towards + Reliable and Deployable Learning-Based Robotic Systems' Workshop at CoRL2023 +
+
+
+
+
+ + ♻ ☆ And Then the Hammer Broke: Reflections on Machine Ethics from Feminist + Philosophy of Science + + +
+ Vision is an important metaphor in ethical and political questions of +knowledge. The feminist philosopher Donna Haraway points out the ``perverse'' +nature of an intrusive, alienating, all-seeing vision (to which we might cry +out ``stop looking at me!''), but also encourages us to embrace the embodied +nature of sight and its promises for genuinely situated knowledge. Current +technologies of machine vision -- surveillance cameras, drones (for war or +recreation), iPhone cameras -- are usually construed as instances of the former +rather than the latter, and for good reasons. However, although in no way +attempting to diminish the real suffering these technologies have brought about +in the world, I make the case for understanding technologies of computer vision +as material instances of embodied seeing and situated knowing. Furthermore, +borrowing from Iris Murdoch's concept of moral vision, I suggest that these +technologies direct our labor towards self-reflection in ethically significant +ways. My approach draws upon paradigms in computer vision research, +phenomenology, and feminist epistemology. Ultimately, this essay is an argument +for directing more philosophical attention from merely criticizing technologies +of vision as ethically deficient towards embracing them as complex, +methodologically and epistemologically important objects. + +
+
+ comment: Pacific University Philosophy Conference +
+
+
+
+
+ + ♻ ☆ 3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple + 3D Representations + + +
+ Imitation learning provides an efficient way to teach robots dexterous
+skills; however, learning complex skills robustly and generalizably usually
+requires large amounts of human demonstrations. To tackle this challenging
+problem, we present 3D Diffusion Policy (DP3), a novel visual imitation
+learning approach that incorporates the power of 3D visual representations into
+diffusion policies, a class of conditional action generative models. The core
+design of DP3 is the utilization of a compact 3D visual representation,
+extracted from sparse point clouds with an efficient point encoder. In our
+experiments involving 72 simulation tasks, DP3 successfully handles most tasks
+with just 10 demonstrations and surpasses baselines with a 24.2% relative
+improvement. In 4 real robot tasks, DP3 demonstrates precise control with a
+high success rate of 85%, given only 40 demonstrations of each task, and shows
+excellent generalization abilities in diverse aspects, including space,
+viewpoint, appearance, and instance. Interestingly, in real robot experiments,
+DP3 rarely violates safety requirements, in contrast to baseline methods which
+frequently do, necessitating human intervention. Our extensive evaluation
+highlights the critical importance of 3D representations in real-world robot
+learning. Videos, code, and data are available on
+https://3d-diffusion-policy.github.io .
+
+
+
+ comment: Videos, code, and data: https://3d-diffusion-policy.github.io +
+
+
+
+
+ + ♻ ☆ 360+x: A Panoptic Multi-modal Scene Understanding Dataset CVPR 2024 + + +
+ Human perception of the world is shaped by a multitude of viewpoints and +modalities. While many existing datasets focus on scene understanding from a +certain perspective (e.g. egocentric or third-person views), our dataset offers +a panoptic perspective (i.e. multiple viewpoints with multiple data +modalities). Specifically, we encapsulate third-person panoramic and front +views, as well as egocentric monocular/binocular views with rich modalities +including video, multi-channel audio, directional binaural delay, location data +and textual scene descriptions within each scene captured, presenting +comprehensive observation of the world. Figure 1 offers a glimpse of all 28 +scene categories of our 360+x dataset. To the best of our knowledge, this is +the first database that covers multiple viewpoints with multiple data +modalities to mimic how daily information is accessed in the real world. +Through our benchmark analysis, we presented 5 different scene understanding +tasks on the proposed 360+x dataset to evaluate the impact and benefit of each +data modality and perspective in panoptic scene understanding. We hope this +unique dataset could broaden the scope of comprehensive scene understanding and +encourage the community to approach these problems from more diverse +perspectives. + +
+
+ comment: CVPR 2024 (Oral Presentation), Project page: + https://x360dataset.github.io/ +
+
+
+
+
+ + ♻ ☆ A Benchmark Grocery Dataset of Realworld Point Clouds From Single View + + +
+ Fine-grained grocery object recognition is an important computer vision
+problem with broad applications in automatic checkout, in-store robotic
+navigation, and assistive technologies for the visually impaired. Existing
+datasets on groceries are mainly 2D images. Models trained on these datasets
+are limited to learning features from the regular 2D grids. While portable 3D
+sensors such as the Kinect have long been commonly available, sensors such as
+LiDAR and TrueDepth have only recently been integrated into mobile phones.
+Despite the availability of mobile 3D sensors, there are currently no dedicated
+real-world large-scale benchmark 3D datasets for groceries. In addition,
+existing 3D datasets lack fine-grained grocery categories and have limited
+training samples. Furthermore, collecting data by moving around an object, as
+opposed to traditional photo capture, makes data collection cumbersome. Thus,
+we introduce a large-scale grocery dataset called 3DGrocery100. It constitutes
+100 classes, with a total of 87,898 3D point clouds created from 10,755 RGB-D
+single-view images. We benchmark our dataset on six recent state-of-the-art 3D
+point cloud classification models. We also benchmark the dataset on few-shot
+and continual learning point cloud classification tasks. Project Page:
+https://bigdatavision.org/3DGrocery100/.
+
+
+
+
+
+
+ + ♻ ☆ Linear Combination of Saved Checkpoints Makes Consistency and Diffusion + Models Better + + +
+ Diffusion Models (DM) and Consistency Models (CM) are two types of popular
+generative models with good generation quality on various tasks. When training
+DM and CM, intermediate weight checkpoints are not fully utilized and only the
+last converged checkpoint is used. In this work, we find that high-quality
+model weights often lie in a basin which cannot be reached by SGD but can be
+obtained by proper checkpoint averaging. Based on these observations, we
+propose LCSC, a simple but effective and efficient method to enhance the
+performance of DM and CM, by combining checkpoints along the training
+trajectory with coefficients deduced from evolutionary search. We demonstrate
+the value of LCSC through two use cases: $\textbf{(a) Reducing training cost.}$
+With LCSC, we only need to train DM/CM with fewer iterations and/or smaller
+batch sizes to obtain sample quality comparable to the fully trained model. For
+example, LCSC achieves considerable training speedups for CM (23$\times$ on
+CIFAR-10 and 15$\times$ on ImageNet-64). $\textbf{(b) Enhancing pre-trained
+models.}$ Assuming full training is already done, LCSC can further improve the
+generation quality or speed of the final converged models. For example, LCSC
+achieves better performance using 1 function evaluation (NFE) than the base
+model with 2 NFE on consistency distillation, and decreases the NFE of DM from
+15 to 9 while maintaining the generation quality on CIFAR-10. Our code is
+available at https://github.com/imagination-research/LCSC.
+
+
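+
+ The core operation, forming one set of weights as a linear combination of
+saved checkpoints, is easy to sketch; the coefficient search itself
+(evolutionary in the paper) is omitted here and the helper name is illustrative
+rather than taken from the released code.
+
+  def combine_checkpoints(state_dicts, coeffs):
+      # Linearly combine checkpoint weights key by key; `coeffs` would come
+      # from a search that scores the sample quality of the merged model.
+      assert len(state_dicts) == len(coeffs)
+      merged = {}
+      for key in state_dicts[0]:
+          merged[key] = sum(c * sd[key].float()
+                            for c, sd in zip(coeffs, state_dicts))
+      return merged
+
+  # Usage sketch: model.load_state_dict(combine_checkpoints(ckpts, best_coeffs))
+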
+
+
+
+
+ + ♻ ☆ S$^{5}$Mars: Semi-Supervised Learning for Mars Semantic Segmentation + + +
+ Deep learning has become a powerful tool for Mars exploration. Mars terrain
+semantic segmentation is an important Martian vision task, which is the basis
+of rover autonomous planning and safe driving. However, there is a lack of
+sufficiently detailed and high-confidence data annotations, which most deep
+learning methods require to obtain a good model. To address this problem, we
+propose our solution from the perspective of joint data and method design. We
+first present a new dataset, S5Mars for Semi-SuperviSed learning on Mars
+Semantic Segmentation, which contains 6K high-resolution images and is sparsely
+annotated based on confidence, ensuring the high quality of labels. Then to
+learn from this sparse data, we propose a semi-supervised learning (SSL)
+framework for Mars image semantic segmentation, to learn representations from
+limited labeled data. Different from the existing SSL methods which are mostly
+targeted at Earth image data, our method takes into account Mars data
+characteristics. Specifically, we first investigate the impact of current
+widely used natural image augmentations on Mars images. Based on the analysis,
+we then propose two novel and effective augmentations for SSL of Mars
+segmentation, AugIN and SAM-Mix, which serve as strong augmentations to boost
+the model performance. Meanwhile, to fully leverage the unlabeled data, we
+introduce a soft-to-hard consistency learning strategy, learning from different
+targets based on prediction confidence. Experimental results show that our
+method can outperform state-of-the-art SSL approaches remarkably. Our proposed
+dataset is available at https://jhang2020.github.io/S5Mars.github.io/.
+
+
+
+ comment: IEEE TGRS 2024 +
+
+
+
+
+ + ♻ ☆ OmniGS: Omnidirectional Gaussian Splatting for Fast Radiance Field + Reconstruction using Omnidirectional Images + + +
+ Photorealistic reconstruction relying on 3D Gaussian Splatting has shown +promising potential in robotics. However, the current 3D Gaussian Splatting +system only supports radiance field reconstruction using undistorted +perspective images. In this paper, we present OmniGS, a novel omnidirectional +Gaussian splatting system, to take advantage of omnidirectional images for fast +radiance field reconstruction. Specifically, we conduct a theoretical analysis +of spherical camera model derivatives in 3D Gaussian Splatting. According to +the derivatives, we then implement a new GPU-accelerated omnidirectional +rasterizer that directly splats 3D Gaussians onto the equirectangular screen +space for omnidirectional image rendering. As a result, we realize +differentiable optimization of the radiance field without the requirement of +cube-map rectification or tangent-plane approximation. Extensive experiments +conducted in egocentric and roaming scenarios demonstrate that our method +achieves state-of-the-art reconstruction quality and high rendering speed using +omnidirectional images. To benefit the research community, the code will be +made publicly available once the paper is published. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Single Mesh Diffusion Models with Field Latents for Texture Generation CVPR 2024 + + +
+ We introduce a framework for intrinsic latent diffusion models operating
+directly on the surfaces of 3D shapes, with the goal of synthesizing
+high-quality textures. Our approach is underpinned by two contributions: field
+latents, a latent representation encoding textures as discrete vector fields on
+the mesh vertices, and field latent diffusion models, which learn to denoise a
+diffusion process in the learned latent space on the surface. We consider a
+single-textured-mesh paradigm, where our models are trained to generate
+variations of a given texture on a mesh. We show the synthesized textures are
+of superior fidelity compared to those from existing single-textured-mesh
+generative models. Our models can also be adapted for user-controlled editing
+tasks such as inpainting and label-guided generation. The efficacy of our
+approach is due in part to the equivariance of our proposed framework under
+isometries, allowing our models to seamlessly reproduce details across locally
+similar regions and opening the door to a notion of generative texture
+transfer.
+
+
+
+ comment: CVPR 2024. Code and additional visualizations available: + https://single-mesh-diffusion.github.io/ +
+
+
+
+
+ + ♻ ☆ GMISeg: General Medical Image Segmentation without Re-Training + + +
+ Although deep learning models have become the main method for medical image
+segmentation, they often cannot be extended to unknown segmentation tasks
+involving new anatomical structures, image shapes, or labels. For new
+segmentation tasks, researchers often have to retrain or fine-tune the model,
+which is time-consuming and poses a significant obstacle to clinical
+researchers, who often lack the resources and professional knowledge to train
+neural networks. Therefore, we propose a general method that can solve unknown
+medical image segmentation tasks without requiring additional training. Given
+an example set of images and prompts for defining new segmentation tasks,
+GMISeg applies a novel low-rank fine-tuning strategy to the SAM (Segment
+Anything Model) image encoder, and works with the prompt encoder and mask
+decoder to adapt to the labeled example set without the need for additional
+training. To achieve generalization to new tasks, we used medical image
+datasets with different imaging modes for different body parts. We trained
+GMISeg and evaluated its generalization to a different set of anatomical
+structures and imaging modes using cardiac images from other-site datasets. We
+have demonstrated that GMISeg outperforms the latest methods on unknown tasks
+and have conducted a comprehensive analysis and summary of the proposed
+method's performance.
+
+
+
+
+
+
+ + ♻ ☆ i-MAE: Are Latent Representations in Masked Autoencoders Linearly + Separable? + + +
+ Masked image modeling (MIM) has been recognized as a strong self-supervised +pre-training approach in the vision domain. However, the mechanism and +properties of the learned representations by such a scheme, as well as how to +further enhance the representations are so far not well-explored. In this +paper, we aim to explore an interactive Masked Autoencoders (i-MAE) framework +to enhance the representation capability from two aspects: (1) employing a +two-way image reconstruction and a latent feature reconstruction with +distillation loss to learn better features; (2) proposing a semantics-enhanced +sampling strategy to boost the learned semantics in MAE. Upon the proposed +i-MAE architecture, we can address two critical questions to explore the +behaviors of the learned representations in MAE: (1) Whether the separability +of latent representations in Masked Autoencoders is helpful for model +performance? We study it by forcing the input as a mixture of two images +instead of one. (2) Whether we can enhance the representations in the latent +feature space by controlling the degree of semantics during sampling on Masked +Autoencoders? To this end, we propose a sampling strategy within a mini-batch +based on the semantics of training samples to examine this aspect. Extensive +experiments are conducted on CIFAR-10/100, Tiny-ImageNet and ImageNet-1K to +verify the observations we discovered. Furthermore, in addition to +qualitatively analyzing the characteristics of the latent representations, we +examine the existence of linear separability and the degree of semantics in the +latent space by proposing two evaluation schemes. The surprising and consistent +results demonstrate that i-MAE is a superior framework design for understanding +MAE frameworks, as well as achieving better representational ability. Code is +available at https://github.com/vision-learning-acceleration-lab/i-mae. + +
+
+ comment: Project page: https://zhiqiangshen.com/projects/i-mae/ +
+
+
+
+
+ + ♻ ☆ Two Tricks to Improve Unsupervised Segmentation Learning + + +
+ We present two practical improvement techniques for unsupervised segmentation +learning. These techniques address limitations in the resolution and accuracy +of predicted segmentation maps of recent state-of-the-art methods. Firstly, we +leverage image post-processing techniques such as guided filtering to refine +the output masks, improving accuracy while avoiding substantial computational +costs. Secondly, we introduce a multi-scale consistency criterion, based on a +teacher-student training scheme. This criterion matches segmentation masks +predicted from regions of the input image extracted at different resolutions to +each other. Experimental results on several benchmarks used in unsupervised +segmentation learning demonstrate the effectiveness of our proposed techniques. + +
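+
+ A minimal sketch of the second trick, matching the prediction on an up-scaled
+crop against the corresponding region of a full-image (teacher) prediction, is
+given below; the paper's exact criterion and teacher-student details may
+differ, and both models are assumed to output per-pixel class logits.
+
+  import torch
+  import torch.nn.functional as F
+
+  def multiscale_consistency(student, teacher, image, crop_box):
+      y0, y1, x0, x1 = crop_box
+      with torch.no_grad():
+          full = teacher(image)                    # (B, K, H, W) logits
+          target = full[:, :, y0:y1, x0:x1]        # region of the full prediction
+      crop = image[:, :, y0:y1, x0:x1]
+      crop_up = F.interpolate(crop, size=image.shape[-2:], mode="bilinear",
+                              align_corners=False) # crop seen at higher resolution
+      pred = F.interpolate(student(crop_up), size=target.shape[-2:],
+                           mode="bilinear", align_corners=False)
+      return F.kl_div(F.log_softmax(pred, dim=1), F.softmax(target, dim=1),
+                      reduction="batchmean")
+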
+
+
+
+
+ + ♻ ☆ Divide and Conquer: High-Resolution Industrial Anomaly Detection via + Memory Efficient Tiled Ensemble CVPR 24 + + +
+ Industrial anomaly detection is an important task within computer vision with +a wide range of practical use cases. The small size of anomalous regions in +many real-world datasets necessitates processing the images at a high +resolution. This frequently poses significant challenges concerning memory +consumption during the model training and inference stages, leaving some +existing methods impractical for widespread adoption. To overcome this +challenge, we present the tiled ensemble approach, which reduces memory +consumption by dividing the input images into a grid of tiles and training a +dedicated model for each tile location. The tiled ensemble is compatible with +any existing anomaly detection model without the need for any modification of +the underlying architecture. By introducing overlapping tiles, we utilize the +benefits of traditional stacking ensembles, leading to further improvements in +anomaly detection capabilities beyond high resolution alone. We perform a +comprehensive analysis using diverse underlying architectures, including Padim, +PatchCore, FastFlow, and Reverse Distillation, on two standard anomaly +detection datasets: MVTec and VisA. Our method demonstrates a notable +improvement across setups while remaining within GPU memory constraints, +consuming only as much GPU memory as a single model needs to process a single +tile. + +
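+
+ The tiling step is straightforward to sketch: enumerate overlapping tile
+locations, train one detector per location, and merge the per-tile anomaly
+maps (e.g. by averaging on the overlaps) at inference. The helper below only
+illustrates the grid enumeration and is not the authors' implementation.
+
+  def tile_grid(height, width, tile, stride):
+      # Top-left (y, x) corners of overlapping tiles covering the image.
+      assert height >= tile and width >= tile
+      ys = list(range(0, height - tile + 1, stride))
+      xs = list(range(0, width - tile + 1, stride))
+      if ys[-1] != height - tile:      # make sure the bottom border is covered
+          ys.append(height - tile)
+      if xs[-1] != width - tile:       # make sure the right border is covered
+          xs.append(width - tile)
+      return [(y, x) for y in ys for x in xs]
+
+  # Example: tile_grid(1024, 1024, tile=256, stride=128) yields 49 tile positions.
+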
+
+ comment: To appear at CVPR 24 Visual Anomaly Detection Workshop. Research + conducted during Google Summer of Code 2023 at OpenVINO (Intel). GSoC 2023 + page: https://summerofcode.withgoogle.com/archive/2023/projects/WUSjdxGl +
+
+
+
+
+ + ♻ ☆ TrailBlazer: Trajectory Control for Diffusion-Based Video Generation + + +
+ Within recent approaches to text-to-video (T2V) generation, achieving +controllability in the synthesized video is often a challenge. Typically, this +issue is addressed by providing low-level per-frame guidance in the form of +edge maps, depth maps, or an existing video to be altered. However, the process +of obtaining such guidance can be labor-intensive. This paper focuses on +enhancing controllability in video synthesis by employing straightforward +bounding boxes to guide the subject in various ways, all without the need for +neural network training, finetuning, optimization at inference time, or the use +of pre-existing videos. Our algorithm, TrailBlazer, is constructed upon a +pre-trained (T2V) model, and easy to implement. The subject is directed by a +bounding box through the proposed spatial and temporal attention map editing. +Moreover, we introduce the concept of keyframing, allowing the subject +trajectory and overall appearance to be guided by both a moving bounding box +and corresponding prompts, without the need to provide a detailed mask. The +method is efficient, with negligible additional computation relative to the +underlying pre-trained model. Despite the simplicity of the bounding box +guidance, the resulting motion is surprisingly natural, with emergent effects +including perspective and movement toward the virtual camera as the box size +increases. + +
+
+ comment: 14 pages, 18 figures, Project Page: + https://hohonu-vicml.github.io/Trailblazer.Page/ +
+
+
+
+
+ + ♻ ☆ Knowledge Distillation via the Target-aware Transformer CVPR2022 + + +
+ Knowledge distillation has become a de facto standard to improve the
+performance of small neural networks. Most of the previous works propose to
+regress the representational features from the teacher to the student in a
+one-to-one spatial matching fashion. However, this overlooks the fact that, due
+to architectural differences, the semantic information at the same spatial
+location usually varies. This greatly undermines the underlying assumption of
+the one-to-one distillation approach. To this end, we propose a novel
+one-to-all spatial matching knowledge distillation approach. Specifically, we
+allow each pixel of the teacher feature to be distilled to all spatial
+locations of the student features given its similarity, which is generated from
+a target-aware transformer. Our approach surpasses the state-of-the-art methods
+by a significant margin on various computer vision benchmarks, such as
+ImageNet, Pascal VOC and COCOStuff10k. Code is available at
+https://github.com/sihaoevery/TaT.
+
+
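+
+ A simplified sketch of one-to-all spatial matching, where each teacher pixel
+is reconstructed from a similarity-weighted combination of all student
+locations, is shown below. The actual target-aware transformer adds learned
+projections, so treat this purely as an illustration; both feature maps are
+assumed to share the same channel dimension.
+
+  import torch
+  import torch.nn.functional as F
+
+  def one_to_all_distillation(f_student, f_teacher):
+      B, C, H, W = f_student.shape
+      s = f_student.flatten(2).transpose(1, 2)          # (B, HW, C)
+      t = f_teacher.flatten(2).transpose(1, 2)          # (B, HW, C)
+      # Similarity of every student location to every teacher pixel,
+      # normalized over the student locations.
+      attn = torch.softmax(s @ t.transpose(1, 2) / C ** 0.5, dim=1)
+      recon = attn.transpose(1, 2) @ s                  # teacher pixels rebuilt from student
+      return F.mse_loss(recon, t)
+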
+
+ comment: CVPR2022(Oral) +
+
+
+
+
+ + ♻ ☆ PAIR-Diffusion: A Comprehensive Multimodal Object-Level Image Editor CVPR 2024 + + +
+ Generative image editing has recently witnessed extremely fast-paced growth. +Some works use high-level conditioning such as text, while others use low-level +conditioning. Nevertheless, most of them lack fine-grained control over the +properties of the different objects present in the image, i.e. object-level +image editing. In this work, we tackle the task by perceiving the images as an +amalgamation of various objects and aim to control the properties of each +object in a fine-grained manner. Out of these properties, we identify structure +and appearance as the most intuitive to understand and useful for editing +purposes. We propose PAIR Diffusion, a generic framework that can enable a +diffusion model to control the structure and appearance properties of each +object in the image. We show that having control over the properties of each +object in an image leads to comprehensive editing capabilities. Our framework +allows for various object-level editing operations on real images such as +reference image-based appearance editing, free-form shape editing, adding +objects, and variations. Thanks to our design, we do not require any inversion +step. Additionally, we propose multimodal classifier-free guidance which +enables editing images using both reference images and text when using our +approach with foundational diffusion models. We validate the above claims by +extensively evaluating our framework on both unconditional and foundational +diffusion models. Please refer to +https://vidit98.github.io/publication/conference-paper/pair_diff.html for code +and model release. + +
+
+ comment: Accepted in CVPR 2024, Project page + https://vidit98.github.io/publication/conference-paper/pair_diff.html +
+
+
+
+
+ + ♻ ☆ DGInStyle: Domain-Generalizable Semantic Segmentation with Image + Diffusion Models and Stylized Semantic Control + + +
+ Large, pretrained latent diffusion models (LDMs) have demonstrated an +extraordinary ability to generate creative content, specialize to user data +through few-shot fine-tuning, and condition their output on other modalities, +such as semantic maps. However, are they usable as large-scale data generators, +e.g., to improve tasks in the perception stack, like semantic segmentation? We +investigate this question in the context of autonomous driving, and answer it +with a resounding "yes". We propose an efficient data generation pipeline +termed DGInStyle. First, we examine the problem of specializing a pretrained +LDM to semantically-controlled generation within a narrow domain. Second, we +propose a Style Swap technique to endow the rich generative prior with the +learned semantic control. Third, we design a Multi-resolution Latent Fusion +technique to overcome the bias of LDMs towards dominant objects. Using +DGInStyle, we generate a diverse dataset of street scenes, train a +domain-agnostic semantic segmentation model on it, and evaluate the model on +multiple popular autonomous driving datasets. Our approach consistently +increases the performance of several domain generalization methods compared to +the previous state-of-the-art methods. Source code and dataset are available at +https://dginstyle.github.io. + +
+
+
+
+
+ + ♻ ☆ No "Zero-Shot" Without Exponential Data: Pretraining Concept Frequency + Determines Multimodal Model Performance ICLR'24 + + +
+ Web-crawled pretraining datasets underlie the impressive "zero-shot"
+evaluation performance of multimodal models, such as CLIP for
+classification/retrieval and Stable-Diffusion for image generation. However, it
+is unclear how meaningful the notion of "zero-shot" generalization is for such
+multimodal models, as it is not known to what extent their pretraining datasets
+encompass the downstream concepts targeted during "zero-shot" evaluation. In
+this work, we ask: How is the performance of multimodal models on downstream
+concepts influenced by the frequency of these concepts in their pretraining
+datasets? We comprehensively investigate this question across 34 models and
+five standard pretraining datasets (CC-3M, CC-12M, YFCC-15M, LAION-400M,
+LAION-Aesthetics), generating over 300GB of data artifacts. We consistently
+find that, far from exhibiting "zero-shot" generalization, multimodal models
+require exponentially more data to achieve linear improvements in downstream
+"zero-shot" performance, following a sample-inefficient log-linear scaling
+trend. This trend persists even when controlling for sample-level similarity
+between pretraining and downstream datasets, and testing on purely synthetic
+data distributions. Furthermore, upon benchmarking models on long-tailed data
+sampled based on our analysis, we demonstrate that multimodal models across the
+board perform poorly. We contribute this long-tail test set as the "Let it
+Wag!" benchmark to further research in this direction. Taken together, our
+study reveals an exponential need for training data, which implies that the key
+to "zero-shot" generalization capabilities under large-scale training paradigms
+remains to be found.
+
+
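+
+ The reported trend can be checked on one's own measurements with an ordinary
+log-linear fit; the helper below is illustrative and takes user-supplied
+per-concept frequencies and downstream scores rather than the paper's data.
+
+  import numpy as np
+
+  def fit_log_linear(freq, acc):
+      # Fit acc ~ a * log(freq) + b: linear gains in downstream performance
+      # then require exponentially more pretraining examples of a concept.
+      a, b = np.polyfit(np.log(np.asarray(freq, dtype=float)),
+                        np.asarray(acc, dtype=float), deg=1)
+      return a, b
+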
+
+ comment: Extended version of the short paper accepted at DPFM, ICLR'24 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 121 + +
+
+
+ + ☆ Reconstructing Retinal Visual Images from 3T fMRI Data Enhanced by + Unsupervised Learning + + +
+ The reconstruction of human visual inputs from brain activity, particularly +through functional Magnetic Resonance Imaging (fMRI), holds promising avenues +for unraveling the mechanisms of the human visual system. Despite the +significant strides made by deep learning methods in improving the quality and +interpretability of visual reconstruction, there remains a substantial demand +for high-quality, long-duration, subject-specific 7-Tesla fMRI experiments. The +challenge arises in integrating diverse smaller 3-Tesla datasets or +accommodating new subjects with brief and low-quality fMRI scans. In response +to these constraints, we propose a novel framework that generates enhanced 3T +fMRI data through an unsupervised Generative Adversarial Network (GAN), +leveraging unpaired training across two distinct fMRI datasets in 7T and 3T, +respectively. This approach aims to overcome the limitations of the scarcity of +high-quality 7-Tesla data and the challenges associated with brief and +low-quality scans in 3-Tesla experiments. In this paper, we demonstrate the +reconstruction capabilities of the enhanced 3T fMRI data, highlighting its +proficiency in generating superior input visual images compared to +data-intensive methods trained and tested on a single subject. + +
+
+ comment: Accepted by ISBI 2024 +
+
+
+
+
+ + ☆ VMambaMorph: a Visual Mamba-based Framework with Cross-Scan Module for + Deformable 3D Image Registration + + +
+ Image registration, a critical process in medical imaging, involves aligning +different sets of medical imaging data into a single unified coordinate system. +Deep learning networks, such as the Convolutional Neural Network (CNN)-based +VoxelMorph, Vision Transformer (ViT)-based TransMorph, and State Space Model +(SSM)-based MambaMorph, have demonstrated effective performance in this domain. +The recent Visual State Space Model (VMamba), which incorporates a cross-scan +module with SSM, has exhibited promising improvements in modeling global-range +dependencies with efficient computational cost in computer vision tasks. This +paper hereby introduces an exploration of VMamba with image registration, named +VMambaMorph. This novel hybrid VMamba-CNN network is designed specifically for +3D image registration. Utilizing a U-shaped network architecture, VMambaMorph +computes the deformation field based on target and source volumes. The +VMamba-based block with 2D cross-scan module is redesigned for 3D volumetric +feature processing, and a fine-grained feature extraction module is proposed +for high-dimensional feature learning. We validate VMambaMorph using a public +benchmark brain MR-CT registration dataset, comparing its performance against +current state-of-the-art methods. The results indicate that VMambaMorph +achieves competitive registration quality. The code for VMambaMorph is +available on GitHub. + +
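+
+ Registration networks of this family predict a dense displacement field and
+warp the moving volume with it. A generic VoxelMorph-style warping step (not
+VMambaMorph's own code) can be sketched as follows, assuming displacements are
+given in voxels.
+
+  import torch
+  import torch.nn.functional as F
+
+  def warp_volume(moving, flow):
+      # moving: (B, C, D, H, W) volume; flow: (B, 3, D, H, W) displacements.
+      B, _, D, H, W = moving.shape
+      zz, yy, xx = torch.meshgrid(torch.arange(D), torch.arange(H),
+                                  torch.arange(W), indexing="ij")
+      grid = torch.stack((zz, yy, xx)).float().to(moving.device)  # identity grid
+      coords = grid.unsqueeze(0) + flow                           # displaced voxel coords
+      # Normalize each axis to [-1, 1] and reorder to (x, y, z) for grid_sample.
+      d = 2 * coords[:, 0] / (D - 1) - 1
+      h = 2 * coords[:, 1] / (H - 1) - 1
+      w = 2 * coords[:, 2] / (W - 1) - 1
+      sample_grid = torch.stack((w, h, d), dim=-1)                # (B, D, H, W, 3)
+      return F.grid_sample(moving, sample_grid, align_corners=True)
+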
+
+
+
+
+ + ☆ LHU-Net: A Light Hybrid U-Net for Cost-Efficient, High-Performance + Volumetric Medical Image Segmentation + + +
+ As a result of the rise of Transformer architectures in medical image +analysis, specifically in the domain of medical image segmentation, a multitude +of hybrid models have been created that merge the advantages of Convolutional +Neural Networks (CNNs) and Transformers. These hybrid models have achieved +notable success by significantly improving segmentation accuracy. Yet, this +progress often comes at the cost of increased model complexity, both in terms +of parameters and computational demand. Moreover, many of these models fail to +consider the crucial interplay between spatial and channel features, which +could further refine and improve segmentation outcomes. To address this, we +introduce LHU-Net, a Light Hybrid U-Net architecture optimized for volumetric +medical image segmentation. LHU-Net is meticulously designed to prioritize +spatial feature analysis in its initial layers before shifting focus to +channel-based features in its deeper layers, ensuring a comprehensive feature +extraction process. Rigorous evaluation across five benchmark datasets - +Synapse, LA, Pancreas, ACDC, and BRaTS 2018 - underscores LHU-Net's superior +performance, showcasing its dual capacity for efficiency and accuracy. Notably, +LHU-Net sets new performance benchmarks, such as attaining a Dice score of +92.66 on the ACDC dataset, while simultaneously reducing parameters by 85% and +quartering the computational load compared to existing state-of-the-art models. +Achieved without any reliance on pre-training, additional data, or model +ensemble, LHU-Net's effectiveness is further evidenced by its state-of-the-art +performance across all evaluated datasets, utilizing fewer than 11 million +parameters. This achievement highlights that balancing computational efficiency +with high accuracy in medical image segmentation is feasible. Our +implementation of LHU-Net is freely accessible to the research community on +GitHub. + +
+
+
+
+
+ + ☆ HaVTR: Improving Video-Text Retrieval Through Augmentation Using Large + Foundation Models + + +
+ While recent progress in video-text retrieval has been driven by the +exploration of powerful model architectures and training strategies, the +representation learning ability of video-text retrieval models is still limited +due to low-quality and scarce training data annotations. To address this issue, +we present a novel video-text learning paradigm, HaVTR, which augments video +and text data to learn more generalized features. Specifically, we first adopt +a simple augmentation method, which generates self-similar data by randomly +duplicating or dropping subwords and frames. In addition, inspired by the +recent advancement in visual and language generative models, we propose a more +powerful augmentation method through textual paraphrasing and video stylization +using large language models (LLMs) and visual generative models (VGMs). +Further, to bring richer information into video and text, we propose a +hallucination-based augmentation method, where we use LLMs and VGMs to generate +and add new relevant information to the original data. Benefiting from the +enriched data, extensive experiments on several video-text retrieval benchmarks +demonstrate the superiority of HaVTR over existing methods. + +
+
+
+
+
+ + ☆ Spatial Cognition from Egocentric Video: Out of Sight, Not Out of Mind + + +
+ As humans move around, performing their daily tasks, they are able to recall +where they have positioned objects in their environment, even if these objects +are currently out of sight. In this paper, we aim to mimic this spatial +cognition ability. We thus formulate the task of Out of Sight, Not Out of Mind +- 3D tracking active objects using observations captured through an egocentric +camera. We introduce Lift, Match and Keep (LMK), a method which lifts partial +2D observations to 3D world coordinates, matches them over time using visual +appearance, 3D location and interactions to form object tracks, and keeps these +object tracks even when they go out-of-view of the camera - hence keeping in +mind what is out of sight. We test LMK on 100 long videos from EPIC-KITCHENS. +Our results demonstrate that spatial cognition is critical for correctly +locating objects over short and long time scales. E.g., for one long egocentric +video, we estimate the 3D location of 50 active objects. Of these, 60% can be +correctly positioned in 3D after 2 minutes of leaving the camera view. + +
+
+ comment: 21 pages including references and appendix. Project Webpage: + http://dimadamen.github.io/OSNOM/ +
+
+
+
+
+ + ☆ AirShot: Efficient Few-Shot Detection for Autonomous Exploration + + +
+ Few-shot object detection has drawn increasing attention in the field of
+robotic exploration, where robots are required to find unseen objects with a
+few examples provided online. Although recent efforts have been made to yield
+online processing capabilities, the slow inference speeds of low-powered robots
+fail to meet the demands of real-time detection, making them impractical for
+autonomous exploration. Existing methods still face performance and efficiency
+challenges, mainly due to unreliable features and exhaustive class loops. In
+this work, we propose a new paradigm, AirShot, and discover that, by fully
+exploiting the valuable correlation map, AirShot can result in a more robust
+and faster few-shot object detection system, which is more applicable to the
+robotics community. The core module, Top Prediction Filter (TPF), can operate
+on multi-scale correlation maps in both the training and inference stages.
+During training, TPF supervises the generation of a more representative
+correlation map, while during inference, it reduces looping iterations by
+selecting top-ranked classes, thus cutting down on computational costs with
+better performance. Surprisingly, this dual functionality exhibits general
+effectiveness and efficiency on various off-the-shelf models. Exhaustive
+experiments on the COCO2017, VOC2014, and SubT datasets demonstrate that TPF
+can significantly boost the efficacy and efficiency of most off-the-shelf
+models, achieving up to 36.4% precision improvements along with 56.3% faster
+inference speed. Code and data are at:
+https://github.com/ImNotPrepared/AirShot.
+
+
+
+
+
+
+ + ☆ AUEditNet: Dual-Branch Facial Action Unit Intensity Manipulation with + Implicit Disentanglement + + +
+ Facial action unit (AU) intensity plays a pivotal role in quantifying +fine-grained expression behaviors, which is an effective condition for facial +expression manipulation. However, publicly available datasets containing +intensity annotations for multiple AUs remain severely limited, often featuring +a restricted number of subjects. This limitation places challenges to the AU +intensity manipulation in images due to disentanglement issues, leading +researchers to resort to other large datasets with pretrained AU intensity +estimators for pseudo labels. In addressing this constraint and fully +leveraging manual annotations of AU intensities for precise manipulation, we +introduce AUEditNet. Our proposed model achieves impressive intensity +manipulation across 12 AUs, trained effectively with only 18 subjects. +Utilizing a dual-branch architecture, our approach achieves comprehensive +disentanglement of facial attributes and identity without necessitating +additional loss functions or implementing with large batch sizes. This approach +offers a potential solution to achieve desired facial attribute editing despite +the dataset's limited subject count. Our experiments demonstrate AUEditNet's +superior accuracy in editing AU intensities, affirming its capability in +disentangling facial attributes and identity within a limited subject pool. +AUEditNet allows conditioning by either intensity values or target images, +eliminating the need for constructing AU combinations for specific facial +expression synthesis. Moreover, AU intensity estimation, as a downstream task, +validates the consistency between real and edited images, confirming the +effectiveness of our proposed AU intensity manipulation method. + +
+
+
+
+
+ + ☆ Automated Prediction of Breast Cancer Response to Neoadjuvant + Chemotherapy from DWI Data + + +
+ Effective surgical planning for breast cancer hinges on accurately predicting +pathological complete response (pCR) to neoadjuvant chemotherapy (NAC). +Diffusion-weighted MRI (DWI) and machine learning offer a non-invasive approach +for early pCR assessment. However, most machine-learning models require manual +tumor segmentation, a cumbersome and error-prone task. We propose a deep +learning model employing "Size-Adaptive Lesion Weighting" for automatic DWI +tumor segmentation to enhance pCR prediction accuracy. Despite +histopathological changes during NAC complicating DWI image segmentation, our +model demonstrates robust performance. Utilizing the BMMR2 challenge dataset, +it matches human experts in pCR prediction pre-NAC with an area under the curve +(AUC) of 0.76 vs. 0.796, and surpasses standard automated methods mid-NAC, with +an AUC of 0.729 vs. 0.654 and 0.576. Our approach represents a significant +advancement in automating breast cancer treatment planning, enabling more +reliable pCR predictions without manual segmentation. + +
+
+ comment: Accepted for presentation at the IEEE International Symposium on + Biomedical Imaging (ISBI) +
+
+
+
+
+ + ☆ Facial Affective Behavior Analysis with Instruction Tuning + + +
+ Facial affective behavior analysis (FABA) is crucial for understanding human +mental states from images. However, traditional approaches primarily deploy +models to discriminate among discrete emotion categories, and lack the fine +granularity and reasoning capability for complex facial behaviors. The advent +of Multi-modal Large Language Models (MLLMs) has been proven successful in +general visual understanding tasks. However, directly harnessing MLLMs for FABA +is challenging due to the scarcity of datasets and benchmarks, neglecting +facial prior knowledge, and low training efficiency. To address these +challenges, we introduce (i) an instruction-following dataset for two FABA +tasks, e.g., emotion and action unit recognition, (ii) a benchmark FABA-Bench +with a new metric considering both recognition and generation ability, and +(iii) a new MLLM "EmoLA" as a strong baseline to the community. Our initiative +on the dataset and benchmarks reveal the nature and rationale of facial +affective behaviors, i.e., fine-grained facial movement, interpretability, and +reasoning. Moreover, to build an effective and efficient FABA MLLM, we +introduce a facial prior expert module with face structure knowledge and a +low-rank adaptation module into pre-trained MLLM. We conduct extensive +experiments on FABA-Bench and four commonly-used FABA datasets. The results +demonstrate that the proposed facial prior expert can boost the performance and +EmoLA achieves the best results on our FABA-Bench. On commonly-used FABA +datasets, EmoLA is competitive rivaling task-specific state-of-the-art models. + +
+
+ comment: V1.0 +
+
+
+
+
+ + ☆ PlateSegFL: A Privacy-Preserving License Plate Detection Using Federated + Segmentation Learning + + +
+ Automatic License Plate Recognition (ALPR) is an integral component of an
+intelligent transport system with extensive applications in secure
+transportation, vehicle-to-vehicle communication, stolen vehicle detection,
+traffic violations, and traffic flow management. Existing license plate
+detection systems focus on one-shot learners or pre-trained models that operate
+with a geometric bounding box, limiting the model's performance. Furthermore,
+continuous video data streams uploaded to the central server result in network
+and complexity issues. To combat this, we introduce PlateSegFL, which
+implements U-Net-based segmentation along with Federated Learning (FL). U-Net
+is well-suited for multi-class image segmentation tasks because it can analyze
+a large number of classes and generate a pixel-level segmentation map for each
+class. Federated Learning is used to reduce the quantity of data required while
+safeguarding the user's privacy. Different computing platforms, such as mobile
+phones, are able to collaborate on the development of a common prediction model
+that makes efficient use of users' time, incorporates more diverse data,
+delivers predictions in real time, and requires no physical effort from the
+user, resulting in an F1 score of around 95%.
+
+
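+
+ The federated aggregation step can be sketched with a FedAvg-style weighted
+average of the clients' U-Net weights, so raw plate images never leave the
+device; this is a generic sketch of the idea, not the PlateSegFL code.
+
+  def federated_average(client_state_dicts, client_sizes):
+      # Average client weights proportionally to their local dataset sizes.
+      total = float(sum(client_sizes))
+      global_state = {}
+      for key in client_state_dicts[0]:
+          global_state[key] = sum((n / total) * sd[key].float()
+                                  for sd, n in zip(client_state_dicts, client_sizes))
+      return global_state
+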
+
+
+
+
+ + ☆ FGAIF: Aligning Large Vision-Language Models with Fine-grained AI + Feedback + + +
+ Large Vision-Language Models (LVLMs) have demonstrated proficiency in
+tackling a variety of visual-language tasks. However, current LVLMs suffer from
+misalignment between text and image modalities, which causes three kinds of
+hallucination problems, i.e., object existence, object attribute, and object
+relationship. To tackle this issue, existing methods mainly utilize
+Reinforcement Learning (RL) to align modalities in LVLMs. However, they still
+suffer from three main limitations: (1) general feedback cannot indicate the
+hallucination type contained in the response; (2) sparse rewards only give the
+sequence-level reward for the whole response; and (3) annotation is
+time-consuming and labor-intensive. To handle these limitations, we propose an
+innovative method to align modalities in LVLMs through Fine-Grained Artificial
+Intelligence Feedback (FGAIF), which mainly consists of three steps: AI-based
+Feedback Collection, Fine-grained Reward Model Training, and Reinforcement
+Learning with Fine-grained Reward. Specifically, we first utilize AI tools to
+predict the types of hallucination for each segment in the response and obtain
+a collection of fine-grained feedback. Then, based on the collected reward
+data, three specialized reward models are trained to produce dense rewards.
+Finally, a novel fine-grained feedback module is integrated into the Proximal
+Policy Optimization (PPO) algorithm. Extensive experiments are conducted on
+hallucination and general benchmarks, demonstrating the superior performance of
+our proposed method. Notably, compared with previous models trained with the
+RL-based aligning method, our proposed method is effective even with fewer
+parameters.
+
+
+
+
+
+
+ + ☆ LOGO: A Long-Form Video Dataset for Group Action Quality Assessment CVPR 2023 + + +
+ Action quality assessment (AQA) has become an emerging topic since it can be +extensively applied in numerous scenarios. However, most existing methods and +datasets focus on single-person short-sequence scenes, hindering the +application of AQA in more complex situations. To address this issue, we +construct a new multi-person long-form video dataset for action quality +assessment named LOGO. Distinguished in scenario complexity, our dataset +contains 200 videos from 26 artistic swimming events with 8 athletes in each +sample along with an average duration of 204.2 seconds. As for richness in +annotations, LOGO includes formation labels to depict group information of +multiple athletes and detailed annotations on action procedures. Furthermore, +we propose a simple yet effective method to model relations among athletes and +reason about the potential temporal logic in long-form videos. Specifically, we +design a group-aware attention module, which can be easily plugged into +existing AQA methods, to enrich the clip-wise representations based on +contextual group information. To benchmark LOGO, we systematically conduct +investigations on the performance of several popular methods in AQA and action +segmentation. The results reveal the challenges our dataset brings. Extensive +experiments also show that our approach achieves state-of-the-art on the LOGO +dataset. The dataset and code will be released at +\url{https://github.com/shiyi-zh0408/LOGO }. + +
+
+ comment: Accepted by CVPR 2023 +
+
+
+
+
+ + ☆ PathFinder: Attention-Driven Dynamic Non-Line-of-Sight Tracking with a + Mobile Robot + + +
+ The study of non-line-of-sight (NLOS) imaging is growing due to its many +potential applications, including rescue operations and pedestrian detection by +self-driving cars. However, implementing NLOS imaging on a moving camera +remains an open area of research. Existing NLOS imaging methods rely on +time-resolved detectors and laser configurations that require precise optical +alignment, making it difficult to deploy them in dynamic environments. This +work proposes a data-driven approach to NLOS imaging, PathFinder, that can be +used with a standard RGB camera mounted on a small, power-constrained mobile +robot, such as an aerial drone. Our experimental pipeline is designed to +accurately estimate the 2D trajectory of a person who moves in a +Manhattan-world environment while remaining hidden from the camera's +field-of-view. We introduce a novel approach to process a sequence of dynamic +successive frames in a line-of-sight (LOS) video using an attention-based +neural network that performs inference in real-time. The method also includes a +preprocessing selection metric that analyzes images from a moving camera which +contain multiple vertical planar surfaces, such as walls and building facades, +and extracts planes that return maximum NLOS information. We validate the +approach on in-the-wild scenes using a drone for video capture, thus +demonstrating low-cost NLOS imaging in dynamic capture environments. + +
+
+ comment: First two authors have equal contribution +
+
+
+
+
+ + ☆ Scalable and Efficient Hierarchical Visual Topological Mapping + + +
+ Hierarchical topological representations can significantly reduce search +times within mapping and localization algorithms. Although recent research has +shown the potential for such approaches, limited consideration has been given +to the suitability and comparative performance of different global feature +representations within this context. In this work, we evaluate state-of-the-art +hand-crafted and learned global descriptors using a hierarchical topological +mapping technique on benchmark datasets and present results of a comprehensive +evaluation of the impact of the global descriptor used. Although learned +descriptors have been incorporated into place recognition methods to improve +retrieval accuracy and enhance overall recall, the problem of scalability and +efficiency when applied to longer trajectories has not been adequately +addressed in a majority of research studies. Based on our empirical analysis of +multiple runs, we identify that continuity and distinctiveness are crucial +characteristics for an optimal global descriptor that enable efficient and +scalable hierarchical mapping, and present a methodology for quantifying and +contrasting these characteristics across different global descriptors. Our +study demonstrates that the use of global descriptors based on an unsupervised +learned Variational Autoencoder (VAE) excels in these characteristics and +achieves significantly lower runtime. It runs on a consumer grade desktop, up +to 2.3x faster than the second best global descriptor, NetVLAD, and up to 9.5x +faster than the hand-crafted descriptor, PHOG, on the longest track evaluated +(St Lucia, 17.6 km), without sacrificing overall recall performance. + +
+
+ comment: Published in the 21st International Conference on Advanced Robotics + (ICAR 2023) +
+
+
+
+
+ + ☆ DinoBloom: A Foundation Model for Generalizable Cell Embeddings in + Hematology + + +
+ In hematology, computational models offer significant potential to improve +diagnostic accuracy, streamline workflows, and reduce the tedious work of +analyzing single cells in peripheral blood or bone marrow smears. However, +clinical adoption of computational models has been hampered by the lack of +generalization due to large batch effects, small dataset sizes, and poor +performance in transfer learning from natural images. To address these +challenges, we introduce DinoBloom, the first foundation model for single cell +images in hematology, utilizing a tailored DINOv2 pipeline. Our model is built +upon an extensive collection of 13 diverse, publicly available datasets of +peripheral blood and bone marrow smears, the most substantial open-source +cohort in hematology so far, comprising over 380,000 white blood cell images. +To assess its generalization capability, we evaluate it on an external dataset +with a challenging domain shift. We show that our model outperforms existing +medical and non-medical vision models in (i) linear probing and k-nearest +neighbor evaluations for cell-type classification on blood and bone marrow +smears and (ii) weakly supervised multiple instance learning for acute myeloid +leukemia subtyping by a large margin. A family of four DinoBloom models (small, +base, large, and giant) can be adapted for a wide range of downstream +applications, be a strong baseline for classification problems, and facilitate +the assessment of batch effects in new datasets. All models are available at +github.com/marrlab/DinoBloom. + +
+
+
+
+
+ + ☆ Hyperbolic Learning with Synthetic Captions for Open-World Detection CVPR 2024 + + +
+ Open-world detection poses significant challenges, as it requires the +detection of any object using either object class labels or free-form texts. +Existing related works often use large-scale manually annotated caption datasets +for training, which are extremely expensive to collect. Instead, we propose to +transfer knowledge from vision-language models (VLMs) to enrich the +open-vocabulary descriptions automatically. Specifically, we bootstrap dense +synthetic captions using pre-trained VLMs to provide rich descriptions on +different regions in images, and incorporate these captions to train a novel +detector that generalizes to novel concepts. To mitigate the noise caused by +hallucination in synthetic captions, we also propose a novel hyperbolic +vision-language learning approach to impose a hierarchy between visual and +caption embeddings. We call our detector "HyperLearner". We conduct extensive +experiments on a wide variety of open-world detection benchmarks (COCO, LVIS, +Object Detection in the Wild, RefCOCO) and our results show that our model +consistently outperforms existing state-of-the-art methods, such as GLIP, +GLIPv2 and Grounding DINO, when using the same backbone. + +
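Hyperbolic vision-language learning typically places embeddings in a Poincaré ball, where distance grows rapidly toward the boundary and hierarchies are cheap to encode. A small sketch of that distance function, which may differ from the paper's exact formulation; the example vectors are arbitrary:

```python
import numpy as np

def poincare_distance(u, v, eps=1e-9):
    """Geodesic distance between two points inside the unit Poincare ball."""
    uu = np.sum(u * u)
    vv = np.sum(v * v)
    duv = np.sum((u - v) ** 2)
    x = 1.0 + 2.0 * duv / ((1.0 - uu) * (1.0 - vv) + eps)
    return np.arccosh(x)

# Points near the origin behave like "general" concepts and points near the
# boundary like "specific" ones, so parent/child pairs can be kept close here.
visual = np.array([0.10, 0.05])    # e.g. a region embedding (toy values)
caption = np.array([0.60, 0.30])   # e.g. a more specific caption embedding
print(poincare_distance(visual, caption))
```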
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ MagicTime: Time-lapse Video Generation Models as Metamorphic Simulators + + +
+ Recent advances in Text-to-Video generation (T2V) have achieved remarkable +success in synthesizing high-quality general videos from textual descriptions. +A largely overlooked problem in T2V is that existing models have not adequately +encoded physical knowledge of the real world, thus generated videos tend to +have limited motion and poor variations. In this paper, we propose +\textbf{MagicTime}, a metamorphic time-lapse video generation model, which +learns real-world physics knowledge from time-lapse videos and implements +metamorphic generation. First, we design a MagicAdapter scheme to decouple +spatial and temporal training, encode more physical knowledge from metamorphic +videos, and transform pre-trained T2V models to generate metamorphic videos. +Second, we introduce a Dynamic Frames Extraction strategy to adapt to +metamorphic time-lapse videos, which have a wider variation range and cover +dramatic object metamorphic processes, thus embodying more physical knowledge +than general videos. Finally, we introduce a Magic Text-Encoder to improve the +understanding of metamorphic video prompts. Furthermore, we create a time-lapse +video-text dataset called \textbf{ChronoMagic}, specifically curated to unlock +the metamorphic video generation ability. Extensive experiments demonstrate the +superiority and effectiveness of MagicTime for generating high-quality and +dynamic metamorphic videos, suggesting time-lapse video generation is a +promising path toward building metamorphic simulators of the physical world. + +
+
+
+
+
+ + ☆ Camera-Based Remote Physiology Sensing for Hundreds of Subjects Across + Skin Tones + + +
+ Remote photoplethysmography (rPPG) emerges as a promising method for +non-invasive, convenient measurement of vital signs, utilizing the widespread +presence of cameras. Despite advancements, existing datasets fall short in +terms of size and diversity, limiting comprehensive evaluation under diverse +conditions. This paper presents an in-depth analysis of the VitalVideo dataset, +the largest real-world rPPG dataset to date, encompassing 893 subjects and 6 +Fitzpatrick skin tones. Our experimentation with six unsupervised methods and +three supervised models demonstrates that datasets comprising a few hundred +subjects (i.e., 300 for UBFC-rPPG, 500 for PURE, and 700 for MMPD-Simple) are +sufficient for effective rPPG model training. Our findings highlight the +importance of diversity and consistency in skin tones for precise performance +evaluation across different datasets. + +
+
+ comment: 11 pages, 5 figures, CHI24 Workshop PhysioCHI +
+
+
+
+
+ + ☆ Dual-Scale Transformer for Large-Scale Single-Pixel Imaging CVPR 2024 + + +
+ Single-pixel imaging (SPI) is a promising computational imaging technique +that produces an image by solving an ill-posed reconstruction problem from a few +measurements captured by a single-pixel detector. Deep learning has achieved +impressive success on SPI reconstruction. However, the poor reconstruction +performance and impractical imaging models of previous methods limit its real-world applications. In +this paper, we propose a deep unfolding network with a hybrid-attention +Transformer on a Kronecker SPI model, dubbed HATNet, to improve the imaging +quality of real SPI cameras. Specifically, we unfold the computation graph of +the iterative shrinkage-thresholding algorithm (ISTA) into two alternating +modules: efficient tensor gradient descent and hybrid-attention multiscale +denoising. By virtue of Kronecker SPI, the gradient descent module can avoid +high computational overheads rooted in previous gradient descent modules based +on vectorized SPI. The denoising module is an encoder-decoder architecture +powered by dual-scale spatial attention for high- and low-frequency aggregation +and channel attention for global information recalibration. Moreover, we build +an SPI prototype to verify the effectiveness of the proposed method. Extensive +experiments on synthetic and real data demonstrate that our method achieves +state-of-the-art performance. The source code and pre-trained models are +available at https://github.com/Gang-Qu/HATNet-SPI. + +
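For reference, the algorithm being unfolded here is plain ISTA, which alternates a gradient step on the data term with soft-thresholding. A self-contained sketch on a synthetic sparse-recovery problem (not the Kronecker SPI model itself; the measurement setup and parameters are illustrative):

```python
import numpy as np

def soft_threshold(z, t):
    return np.sign(z) * np.maximum(np.abs(z) - t, 0.0)

def ista(A, y, lam=0.1, n_iter=200):
    """Iterative shrinkage-thresholding for min_x 0.5*||Ax - y||^2 + lam*||x||_1."""
    L = np.linalg.norm(A, 2) ** 2          # Lipschitz constant of the gradient
    x = np.zeros(A.shape[1])
    for _ in range(n_iter):
        grad = A.T @ (A @ x - y)
        x = soft_threshold(x - grad / L, lam / L)
    return x

# Synthetic single-pixel-style measurements of a sparse signal (m << n).
rng = np.random.default_rng(0)
A = rng.standard_normal((64, 256))
x_true = np.zeros(256)
x_true[rng.choice(256, 8, replace=False)] = rng.standard_normal(8)
y = A @ x_true
x_hat = ista(A, y)
print("largest recovered coefficients at:", np.argsort(-np.abs(x_hat))[:8])
```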
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Weakly Supervised Deep Hyperspherical Quantization for Image Retrieval AAAI 2021 + + +
+ Deep quantization methods have shown high efficiency on large-scale image +retrieval. However, current models heavily rely on ground-truth information, +hindering the application of quantization in label-hungry scenarios. A more +realistic demand is to learn from inexhaustible uploaded images that are +associated with informal tags provided by amateur users. Though such sketchy +tags do not obviously reveal the labels, they actually contain useful semantic +information for supervising deep quantization. To this end, we propose +Weakly-Supervised Deep Hyperspherical Quantization (WSDHQ), which is the first +work to learn deep quantization from weakly tagged images. Specifically, 1) we +use word embeddings to represent the tags and enhance their semantic +information based on a tag correlation graph. 2) To better preserve semantic +information in quantization codes and reduce quantization error, we jointly +learn semantics-preserving embeddings and a supervised quantizer on the hypersphere +by employing a well-designed fusion layer and tailor-made loss functions. +Extensive experiments show that WSDHQ can achieve state-of-the-art performance on +weakly-supervised compact coding. Code is available at +https://github.com/gimpong/AAAI21-WSDHQ. + +
+
+ comment: In proceedings of AAAI 2021. Code and data are available +
+
+
+
+
+ + ☆ Fantastic Animals and Where to Find Them: Segment Any Marine Animal with + Dual SAM CVPR2024 + + +
+ As an important pillar of underwater intelligence, Marine Animal Segmentation +(MAS) involves segmenting animals within marine environments. Previous methods +do not excel at extracting long-range contextual features and overlook the +connectivity between discrete pixels. Recently, the Segment Anything Model (SAM) +has offered a universal framework for general segmentation tasks. Unfortunately, +trained with natural images, SAM does not obtain the prior knowledge from +marine images. In addition, the single-position prompt of SAM is +insufficient for prior guidance. To address these issues, we propose a novel +feature learning framework, named Dual-SAM, for high-performance MAS. To this +end, we first introduce a dual structure with SAM's paradigm to enhance feature +learning of marine images. Then, we propose a Multi-level Coupled Prompt (MCP) +strategy to instruct comprehensive underwater prior information, and enhance +the multi-level features of SAM's encoder with adapters. Subsequently, we +design a Dilated Fusion Attention Module (DFAM) to progressively integrate +multi-level features from SAM's encoder. Finally, instead of directly +predicting the masks of marine animals, we propose a Criss-Cross Connectivity +Prediction (C$^3$P) paradigm to capture the inter-connectivity between discrete +pixels. With dual decoders, it generates pseudo-labels and achieves mutual +supervision for complementary feature representations, resulting in +considerable improvements over previous techniques. Extensive experiments +verify that our proposed method achieves state-of-the-art performance on five +widely-used MAS datasets. The code is available at +https://github.com/Drchip61/Dual_SAM. + +
+
+ comment: Accepted by CVPR2024 as Poster(Highlight) +
+
+
+
+
+ + ☆ Efficient Surgical Tool Recognition via HMM-Stabilized Deep Learning + + +
+ Recognizing various surgical tools, actions and phases from surgery videos is +an important problem in computer vision with exciting clinical applications. +Existing deep-learning-based methods for this problem either process each +surgical video as a series of independent images without considering their +dependence, or rely on complicated deep learning models to account for the dependence +between video frames. In this study, we revealed from exploratory data analysis that +surgical videos enjoy a relatively simple semantic structure, where the presence +of surgical phases and tools can be well modeled by a compact hidden Markov +model (HMM). Based on this observation, we propose an HMM-stabilized deep +learning method for tool presence detection. A wide range of experiments +confirm that the proposed approaches achieve better performance with lower +training and running costs, and support more flexible ways to construct and +utilize training data in scenarios where not all surgery videos of interest are +extensively labelled. These results suggest that popular deep learning +approaches with over-complicated model structures may suffer from inefficient +utilization of data, and integrating ingredients of deep learning and +statistical learning wisely may lead to more powerful algorithms that enjoy +competitive performance, transparent interpretation and convenient model +training simultaneously. + +
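The stabilization idea amounts to smoothing per-frame detector scores with HMM inference. A toy sketch of forward-backward smoothing for a single binary "tool present" state, with made-up transition and emission values rather than anything fitted to surgical data:

```python
import numpy as np

def forward_backward(frame_probs, stay=0.95):
    """Smooth per-frame P(tool present) with a 2-state HMM (absent/present)."""
    T = len(frame_probs)
    trans = np.array([[stay, 1 - stay], [1 - stay, stay]])
    # Treat the detector output as the emission likelihood of each state.
    emit = np.stack([1 - frame_probs, frame_probs], axis=1)   # (T, 2)
    alpha = np.zeros((T, 2))
    beta = np.ones((T, 2))
    alpha[0] = 0.5 * emit[0]
    alpha[0] /= alpha[0].sum()
    for t in range(1, T):                                     # forward pass
        alpha[t] = emit[t] * (alpha[t - 1] @ trans)
        alpha[t] /= alpha[t].sum()
    for t in range(T - 2, -1, -1):                            # backward pass
        beta[t] = trans @ (emit[t + 1] * beta[t + 1])
        beta[t] /= beta[t].sum()
    post = alpha * beta
    return post[:, 1] / post.sum(axis=1)

# Noisy per-frame detector scores: the isolated 0.9 spike gets damped, while
# the sustained high-score run stays high.
scores = np.array([0.1, 0.2, 0.9, 0.1, 0.8, 0.85, 0.9, 0.95, 0.2, 0.1])
print(np.round(forward_backward(scores), 2))
```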
+
+
+
+
+ + ☆ Dynamic Distinction Learning: Adaptive Pseudo Anomalies for Video + Anomaly Detection CVPR2024 + + +
+ We introduce Dynamic Distinction Learning (DDL) for Video Anomaly Detection, +a novel video anomaly detection methodology that combines pseudo-anomalies, +dynamic anomaly weighting, and a distinction loss function to improve detection +accuracy. By training on pseudo-anomalies, our approach adapts to the +variability of normal and anomalous behaviors without fixed anomaly thresholds. +Our model showcases superior performance on the Ped2, Avenue and ShanghaiTech +datasets, where individual models are tailored for each scene. These +achievements highlight DDL's effectiveness in advancing anomaly detection, +offering a scalable and adaptable solution for video surveillance challenges. + +
+
+ comment: To be published in the CVPR2024 Workshop +
+
+
+
+
+ + ☆ Primary liver cancer classification from routine tumour biopsy using + weakly supervised deep learning + + +
+ The diagnosis of primary liver cancers (PLCs) can be challenging, especially +on biopsies and for combined hepatocellular-cholangiocarcinoma (cHCC-CCA). We +automatically classified PLCs on routine-stained biopsies using a weakly +supervised learning method. Weak tumour/non-tumour annotations served as labels +for training a Resnet18 neural network, and the network's last convolutional +layer was used to extract new tumour tile features. Without knowledge of the +precise labels of the malignancies, we then applied an unsupervised clustering +algorithm. Our model identified specific features of hepatocellular carcinoma +(HCC) and intrahepatic cholangiocarcinoma (iCCA). Despite no specific features +of cHCC-CCA being recognized, the identification of HCC and iCCA tiles within a +slide could facilitate the diagnosis of primary liver cancers, particularly +cHCC-CCA. + Method and results: 166 PLC biopsies were divided into training, internal and +external validation sets: 90, 29 and 47 samples. Two liver pathologists +reviewed each whole-slide hematein eosin saffron (HES)-stained image (WSI). +After annotating the tumour/non-tumour areas, 256x256 pixel tiles were +extracted from the WSIs and used to train a ResNet18. The network was used to +extract new tile features. An unsupervised clustering algorithm was then +applied to the new tile features. In a two-cluster model, Clusters 0 and 1 +contained mainly HCC and iCCA histological features. The diagnostic agreement +between the pathological diagnosis and the model predictions in the internal +and external validation sets was 100% (11/11) and 96% (25/26) for HCC and 78% +(7/9) and 87% (13/15) for iCCA, respectively. For cHCC-CCA, we observed a +highly variable proportion of tiles from each cluster (Cluster 0: 5-97%; +Cluster 1: 2-94%). + +
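The pipeline described here is essentially feature extraction with a trained ResNet18 followed by unsupervised clustering. A minimal sketch of that recipe on random stand-in tiles; the weights, tile loading, and the two-cluster choice are placeholders rather than the study's actual setup:

```python
import torch
import torch.nn as nn
from torchvision.models import resnet18
from sklearn.cluster import KMeans

# Backbone: ResNet18 with the classification head removed, so the forward pass
# returns the 512-d feature after global pooling of the last convolutional stage.
backbone = resnet18(weights=None)      # in practice, load the trained weights here
backbone.fc = nn.Identity()
backbone.eval()

# Stand-in for a batch of 256x256 RGB tumour tiles (normally cut from WSIs).
tiles = torch.rand(32, 3, 256, 256)

with torch.no_grad():
    feats = backbone(tiles).numpy()    # (32, 512) tile features

# Two-cluster model over tile features, echoing the HCC/iCCA split described above.
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(feats)
print(labels)
```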
+
+ comment: https://www.sciencedirect.com/science/article/pii/S2589555924000090 +
+
+
+
+
+ + ☆ FPL+: Filtered Pseudo Label-based Unsupervised Cross-Modality Adaptation + for 3D Medical Image Segmentation + + +
+ Adapting a medical image segmentation model to a new domain is important for +improving its cross-domain transferability, and due to the expensive annotation +process, Unsupervised Domain Adaptation (UDA) is appealing where only unlabeled +images are needed for the adaptation. Existing UDA methods are mainly based on +image or feature alignment with adversarial training for regularization, and +they are limited by insufficient supervision in the target domain. In this +paper, we propose an enhanced Filtered Pseudo Label (FPL+)-based UDA method for +3D medical image segmentation. It first uses cross-domain data augmentation to +translate labeled images in the source domain to a dual-domain training set +consisting of a pseudo source-domain set and a pseudo target-domain set. To +leverage the dual-domain augmented images to train a pseudo label generator, +domain-specific batch normalization layers are used to deal with the domain +shift while learning the domain-invariant structure features, generating +high-quality pseudo labels for target-domain images. We then combine labeled +source-domain images and target-domain images with pseudo labels to train a +final segmentor, where image-level weighting based on uncertainty estimation +and pixel-level weighting based on dual-domain consensus are proposed to +mitigate the adverse effect of noisy pseudo labels. Experiments on three public +multi-modal datasets for Vestibular Schwannoma, brain tumor and whole heart +segmentation show that our method surpassed ten state-of-the-art UDA methods, +and it even achieved better results than fully supervised learning in the +target domain in some cases. + +
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ☆ PairAug: What Can Augmented Image-Text Pairs Do for Radiology? CVPR2024 + + +
+ Current vision-language pre-training (VLP) methodologies predominantly depend +on paired image-text datasets, a resource that is challenging to acquire in +radiology due to privacy considerations and labelling complexities. Data +augmentation provides a practical solution to overcome the issue of data +scarcity, however, most augmentation methods exhibit a limited focus, +prioritising either image or text augmentation exclusively. Acknowledging this +limitation, our objective is to devise a framework capable of concurrently +augmenting medical image and text data. We design a Pairwise Augmentation +(PairAug) approach that contains an Inter-patient Augmentation (InterAug) +branch and an Intra-patient Augmentation (IntraAug) branch. Specifically, the +InterAug branch of our approach generates radiology images using synthesised +yet plausible reports derived from a Large Language Model (LLM). The generated +pairs can be considered a collection of new patient cases since they are +artificially created and may not exist in the original dataset. In contrast, +the IntraAug branch uses newly generated reports to manipulate images. This +process allows us to create new paired data for each individual with diverse +medical conditions. Our extensive experiments on various downstream tasks +covering medical image classification zero-shot and fine-tuning analysis +demonstrate that our PairAug, concurrently expanding both image and text data, +substantially outperforms image-/text-only expansion baselines and advanced +medical VLP baselines. Our code is released at +\url{https://github.com/YtongXie/PairAug}. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ Gaussian Shading: Provable Performance-Lossless Image Watermarking for + Diffusion Models CVPR 2024 + + +
+ Ethical concerns surrounding copyright protection and inappropriate content +generation pose challenges for the practical implementation of diffusion +models. One effective solution involves watermarking the generated images. +However, existing methods often compromise the model performance or require +additional training, which is undesirable for operators and users. To address +this issue, we propose Gaussian Shading, a diffusion model watermarking +technique that is both performance-lossless and training-free, while serving +the dual purpose of copyright protection and tracing of offending content. Our +watermark embedding is free of model parameter modifications and thus is +plug-and-play. We map the watermark to latent representations following a +standard Gaussian distribution, which is indistinguishable from latent +representations obtained from the non-watermarked diffusion model. Therefore we +can achieve watermark embedding with lossless performance, for which we also +provide theoretical proof. Furthermore, since the watermark is intricately +linked with image semantics, it exhibits resilience to lossy processing and +erasure attempts. The watermark can be extracted by Denoising Diffusion +Implicit Models (DDIM) inversion and inverse sampling. We evaluate Gaussian +Shading on multiple versions of Stable Diffusion, and the results demonstrate +that Gaussian Shading not only is performance-lossless but also outperforms +existing methods in terms of robustness. + +
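The core trick is hiding watermark bits in latents that remain exactly standard Gaussian. A heavily simplified, illustrative sketch of one such bit-to-latent mapping via the inverse Gaussian CDF; the actual scheme (including encryption of the watermark and DDIM-inversion-based extraction) is more involved:

```python
import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(0)
bits = rng.integers(0, 2, size=4 * 64 * 64)   # one watermark bit per latent element

# Embed: for bit b, draw u uniformly from [b/2, (b+1)/2) and push it through the
# inverse Gaussian CDF. If the bits are uniform, the latent is still N(0, 1),
# which is the sense in which the embedding is distribution-preserving.
u = (bits + rng.random(bits.shape)) / 2.0
latent = norm.ppf(u)

# Extract: before any lossy processing, the sign of each element recovers the bit.
recovered = (latent > 0).astype(int)
print("bit accuracy:", (recovered == bits).mean())
```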
+
+ comment: 17 pages, 11 figures, accepted by CVPR 2024 +
+
+
+
+
+ + ☆ High-Discriminative Attribute Feature Learning for Generalized Zero-Shot + Learning + + +
+ Zero-shot learning (ZSL) aims to recognize new classes without prior exposure +to their samples, relying on semantic knowledge from observed classes. However, +current attention-based models may overlook the transferability of visual +features and the distinctiveness of attribute localization when learning +regional features in images. Additionally, they often neglect shared +attributes among different objects. Highly discriminative attribute features +are crucial for identifying and distinguishing unseen classes. To address these +issues, we propose an innovative approach called High-Discriminative Attribute +Feature Learning for Generalized Zero-Shot Learning (HDAFL). HDAFL optimizes +visual features by learning attribute features to obtain discriminative visual +embeddings. Specifically, HDAFL utilizes multiple convolutional kernels to +automatically learn discriminative regions highly correlated with attributes in +images, eliminating irrelevant interference in image features. Furthermore, we +introduce a Transformer-based attribute discrimination encoder to enhance the +discriminative capability among attributes. Simultaneously, the method employs +contrastive loss to alleviate dataset biases and enhance the transferability of +visual features, facilitating better semantic transfer between seen and unseen +classes. Experimental results demonstrate the effectiveness of HDAFL across +three widely used datasets. + +
+
+
+
+
+ + ☆ AnimateZoo: Zero-shot Video Generation of Cross-Species Animation via + Subject Alignment + + +
+ Recent video editing advancements rely on accurate pose sequences to animate +subjects. However, these efforts are not suitable for cross-species animation +due to pose misalignment between species (for example, the poses of a cat +differ greatly from those of a pig due to differences in body structure). In +this paper, we present AnimateZoo, a zero-shot diffusion-based video generator +to address this challenging cross-species animation issue, aiming to accurately +produce animal animations while preserving the background. The key technique +used in our AnimateZoo is subject alignment, which includes two steps. First, +we improve appearance feature extraction by integrating a Laplacian detail +booster and a prompt-tuning identity extractor. These components are +specifically designed to capture essential appearance information, including +identity and fine details. Second, we align shape features and address +conflicts from differing subjects by introducing a scale-information remover. +This ensures accurate cross-species animation. Moreover, we introduce two +high-quality animal video datasets featuring a wide variety of species. Trained +on these extensive datasets, our model is capable of generating videos +characterized by accurate movements, consistent appearance, and high-fidelity +frames, without the need for the pre-inference fine-tuning that prior works +required. Extensive experiments showcase the outstanding performance of our +method in cross-species action following tasks, demonstrating exceptional shape +adaptation capability. The project page is available at +https://justinxu0.github.io/AnimateZoo/. + +
+
+ comment: Technical report, 15 pages +
+
+
+
+
+ + ☆ Bootstrapping Chest CT Image Understanding by Distilling Knowledge from + X-ray Expert Models CVPR 2024 + + +
+ Radiologists highly desire fully automated versatile AI for medical imaging +interpretation. However, the lack of extensively annotated large-scale +multi-disease datasets has hindered the achievement of this goal. In this +paper, we explore the feasibility of leveraging language as a naturally +high-quality supervision for chest CT imaging. In light of the limited +availability of image-report pairs, we bootstrap the understanding of 3D chest +CT images by distilling chest-related diagnostic knowledge from an extensively +pre-trained 2D X-ray expert model. Specifically, we propose a language-guided +retrieval method to match each 3D CT image with its semantically closest 2D +X-ray image, and perform pair-wise and semantic relation knowledge +distillation. Subsequently, we use contrastive learning to align images and +reports within the same patient while distinguishing them from the other +patients. However, the challenge arises when patients have similar semantic +diagnoses, such as healthy patients, potentially confusing if treated as +negatives. We introduce a robust contrastive learning that identifies and +corrects these false negatives. We train our model with over 12,000 pairs of +chest CT images and radiology reports. Extensive experiments across multiple +scenarios, including zero-shot learning, report generation, and fine-tuning +processes, demonstrate the model's feasibility in interpreting chest CT images. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Anomaly Detection in Electrocardiograms: Advancing Clinical Diagnosis + Through Self-Supervised Learning + + +
+ The electrocardiogram (ECG) is an essential tool for diagnosing heart +disease, with computer-aided systems improving diagnostic accuracy and reducing +healthcare costs. Despite advancements, existing systems often miss rare +cardiac anomalies that could be precursors to serious, life-threatening issues +or alterations in the cardiac macro/microstructure. We address this gap by +focusing on self-supervised anomaly detection (AD), training exclusively on +normal ECGs to recognize deviations indicating anomalies. We introduce a novel +self-supervised learning framework for ECG AD, utilizing a vast dataset of +normal ECGs to autonomously detect and localize cardiac anomalies. It proposes +a novel masking and restoration technique alongside a multi-scale +cross-attention module, enhancing the model's ability to integrate global and +local signal features. The framework emphasizes accurate localization of +anomalies within ECG signals, ensuring the method's clinical relevance and +reliability. To reduce the impact of individual variability, the approach +further incorporates crucial patient-specific information from ECG reports, +such as age and gender, thus enabling accurate identification of a broad +spectrum of cardiac anomalies, including rare ones. Utilizing an extensive +dataset of 478,803 ECG graphic reports from real-world clinical practice, our +method has demonstrated exceptional effectiveness in AD across all tested +conditions, regardless of their frequency of occurrence, significantly +outperforming existing models. It achieved superior performance metrics, +including an AUROC of 91.2%, an F1 score of 83.7%, a sensitivity rate of 84.2%, +a specificity of 83.0%, and a precision of 75.6% with a fixed recall rate of +90%. It has also demonstrated robust localization capabilities, with an AUROC +of 76.5% and a Dice coefficient of 65.3% for anomaly localization. + +
+
+
+
+
+ + ☆ UniMD: Towards Unifying Moment Retrieval and Temporal Action Detection + + +
+ Temporal Action Detection (TAD) focuses on detecting pre-defined actions, +while Moment Retrieval (MR) aims to identify the events described by open-ended +natural language within untrimmed videos. Despite that they focus on different +events, we observe they have a significant connection. For instance, most +descriptions in MR involve multiple actions from TAD. In this paper, we aim to +investigate the potential synergy between TAD and MR. Firstly, we propose a +unified architecture, termed Unified Moment Detection (UniMD), for both TAD and +MR. It transforms the inputs of the two tasks, namely actions for TAD or events +for MR, into a common embedding space, and utilizes two novel query-dependent +decoders to generate a uniform output of classification score and temporal +segments. Secondly, we explore the efficacy of two task fusion learning +approaches, pre-training and co-training, in order to enhance the mutual +benefits between TAD and MR. Extensive experiments demonstrate that the +proposed task fusion learning scheme enables the two tasks to help each other +and outperform the separately trained counterparts. Impressively, UniMD +achieves state-of-the-art results on three paired datasets Ego4D, Charades-STA, +and ActivityNet. Our code will be released at +https://github.com/yingsen1/UniMD. + +
+
+ comment: Tech report +
+
+
+
+
+ + ☆ GvT: A Graph-based Vision Transformer with Talking-Heads Utilizing + Sparsity, Trained from Scratch on Small Datasets + + +
+ Vision Transformers (ViTs) have achieved impressive results in large-scale +image classification. However, when training from scratch on small datasets, +there is still a significant performance gap between ViTs and Convolutional +Neural Networks (CNNs), which is attributed to the lack of inductive bias. To +address this issue, we propose a Graph-based Vision Transformer (GvT) that +utilizes graph convolutional projection and graph-pooling. In each block, +queries and keys are calculated through graph convolutional projection based on +the spatial adjacency matrix, while dot-product attention is used in another +graph convolution to generate values. When using more attention heads, the +queries and keys become lower-dimensional, making their dot product an +uninformative matching function. To overcome this low-rank bottleneck in +attention heads, we employ talking-heads technology based on bilinear pooled +features and sparse selection of attention tensors. This allows interaction +among filtered attention scores and enables each attention mechanism to depend +on all queries and keys. Additionally, we apply graph-pooling between two +intermediate blocks to reduce the number of tokens and aggregate semantic +information more effectively. Our experimental results show that GvT produces +comparable or superior outcomes to deep convolutional networks and surpasses +vision transformers without pre-training on large datasets. The code for our +proposed model is publicly available on the website. + +
+
+
+
+
+ + ☆ Efficient Learnable Collaborative Attention for Single Image + Super-Resolution + + +
+ Non-Local Attention (NLA) is a powerful technique for capturing long-range +feature correlations in deep single image super-resolution (SR). However, NLA +suffers from high computational complexity and memory consumption, as it +requires aggregating all non-local feature information for each query response +and recalculating the similarity weight distribution for different abstraction +levels of features. To address these challenges, we propose a novel Learnable +Collaborative Attention (LCoA) that introduces inductive bias into non-local +modeling. Our LCoA consists of two components: Learnable Sparse Pattern (LSP) +and Collaborative Attention (CoA). LSP uses the k-means clustering algorithm to +dynamically adjust the sparse attention pattern of deep features, which reduces +the number of non-local modeling rounds compared with existing sparse +solutions. CoA leverages the sparse attention pattern and weights learned by +LSP, and co-optimizes the similarity matrix across different abstraction +levels, which avoids redundant similarity matrix calculations. The experimental +results show that our LCoA can reduce the non-local modeling time by about 83% +in the inference stage. In addition, we integrate our LCoA into a deep +Learnable Collaborative Attention Network (LCoAN), which achieves competitive +performance in terms of inference time, memory consumption, and reconstruction +quality compared with other state-of-the-art SR methods. + +
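A rough sketch of the clustering idea behind the learnable sparse pattern: restrict non-local attention to tokens that k-means assigns to the same group, so each query aggregates over one cluster rather than all positions. This mirrors the described approach only loosely and uses arbitrary sizes:

```python
import numpy as np
from sklearn.cluster import KMeans

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def clustered_attention(q, k, v, n_clusters=4):
    """Non-local attention restricted to tokens that fall in the same k-means cluster."""
    labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit_predict(k)
    out = np.zeros_like(v)
    for c in range(n_clusters):
        idx = np.where(labels == c)[0]
        scores = q[idx] @ k[idx].T / np.sqrt(q.shape[1])
        out[idx] = softmax(scores) @ v[idx]
    return out

# 1024 "pixel" tokens with 64-d features: each query now attends only within its
# own cluster instead of over all 1024 positions.
rng = np.random.default_rng(0)
q = k = v = rng.standard_normal((1024, 64))
print(clustered_attention(q, k, v).shape)
```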
+
+
+
+
+ + ☆ Correcting Diffusion-Based Perceptual Image Compression with Privileged + End-to-End Decoder + + +
+ The images produced by diffusion models can attain excellent perceptual +quality. However, it is challenging for diffusion models to guarantee +distortion, hence the integration of diffusion models and image compression +models still needs more comprehensive explorations. This paper presents a +diffusion-based image compression method that employs a privileged end-to-end +decoder model as correction, which achieves better perceptual quality while +guaranteeing the distortion to an extent. We build a diffusion model and design +a novel paradigm that combines the diffusion model and an end-to-end decoder, +and the latter is responsible for transmitting the privileged information +extracted at the encoder side. Specifically, we theoretically analyze the +reconstruction process of the diffusion models at the encoder side with the +original images being visible. Based on the analysis, we introduce an +end-to-end convolutional decoder to provide a better approximation of the score +function $\nabla_{\mathbf{x}_t}\log p(\mathbf{x}_t)$ at the encoder side and +effectively transmit the combination. Experiments demonstrate the superiority +of our method in both distortion and perception compared with previous +perceptual compression methods. + +
+
+
+
+
+ + ☆ CodecNeRF: Toward Fast Encoding and Decoding, Compact, and High-quality + Novel-view Synthesis + + +
+ Neural Radiance Fields (NeRF) have achieved huge success in effectively +capturing and representing 3D objects and scenes. However, several factors have +impeded its further proliferation as next-generation 3D media. To establish a +ubiquitous presence in everyday media formats, such as images and videos, it is +imperative to devise a solution that effectively fulfills three key objectives: +fast encoding and decoding time, compact model sizes, and high-quality +renderings. Despite significant advancements, a comprehensive algorithm that +adequately addresses all objectives has yet to be fully realized. In this work, +we present CodecNeRF, a neural codec for NeRF representations, consisting of a +novel encoder and decoder architecture that can generate a NeRF representation +in a single forward pass. Furthermore, inspired by the recent +parameter-efficient finetuning approaches, we develop a novel finetuning method +to efficiently adapt the generated NeRF representations to a new test instance, +leading to high-quality image renderings and compact code sizes. The proposed +CodecNeRF, a newly suggested encoding-decoding-finetuning pipeline for NeRF, +achieved unprecedented compression performance of more than 150x and 20x +reduction in encoding time while maintaining (or improving) the image quality +on widely used 3D object datasets, such as ShapeNet and Objaverse. + +
+
+ comment: 34 pages, 22 figures, Project page: + https://gynjn.github.io/Codec-NeRF/ +
+
+
+
+
+ + ☆ MonoTAKD: Teaching Assistant Knowledge Distillation for Monocular 3D + Object Detection + + +
+ Monocular 3D object detection (Mono3D) is an indispensable research topic in +autonomous driving, thanks to the cost-effective monocular camera sensors and +its wide range of applications. Since the image perspective has depth +ambiguity, the challenges of Mono3D lie in understanding 3D scene geometry and +reconstructing 3D object information from a single image. Previous methods +attempted to transfer 3D information directly from the LiDAR-based teacher to +the camera-based student. However, a considerable gap in feature representation +makes direct cross-modal distillation inefficient, resulting in a significant +performance deterioration between the LiDAR-based teacher and the camera-based +student. To address this issue, we propose the Teaching Assistant Knowledge +Distillation (MonoTAKD) to break down the learning objective by integrating +intra-modal distillation with cross-modal residual distillation. In particular, +we employ a strong camera-based teaching assistant model to distill powerful +visual knowledge effectively through intra-modal distillation. Subsequently, we +introduce the cross-modal residual distillation to transfer the 3D spatial +cues. By acquiring both visual knowledge and 3D spatial cues, the predictions +of our approach are rigorously evaluated on the KITTI 3D object detection +benchmark and achieve state-of-the-art performance in Mono3D. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Dual-Camera Smooth Zoom on Mobile Phones + + +
+ When zooming between dual cameras on a mobile phone, noticeable jumps in geometric +content and image color occur in the preview, inevitably affecting the user's +zoom experience. In this work, we introduce a new task, i.e., dual-camera smooth +zoom (DCSZ), to achieve a smooth zoom preview. The frame interpolation (FI) +technique is a potential solution but struggles with ground-truth collection. +To address the issue, we suggest a data factory solution where continuous +virtual cameras are assembled to generate DCSZ data by rendering reconstructed +3D models of the scene. In particular, we propose a novel dual-camera smooth +zoom Gaussian Splatting (ZoomGS), where a camera-specific encoding is +introduced to construct a specific 3D model for each virtual camera. With the +proposed data factory, we construct a synthetic dataset for DCSZ, and we +utilize it to fine-tune FI models. In addition, we collect real-world dual-zoom +images without ground-truth for evaluation. Extensive experiments are conducted +with multiple FI methods. The results show that the fine-tuned FI models +achieve a significant performance improvement over the original ones on the DCSZ +task. The datasets, codes, and pre-trained models will be publicly available. + +
+
+ comment: 24 +
+
+
+
+
+ + ☆ DL-EWF: Deep Learning Empowering Women's Fashion with + Grounded-Segment-Anything Segmentation for Body Shape Classification + + +
+ The global fashion industry plays a pivotal role in the global economy, and +addressing fundamental issues within the industry is crucial for developing +innovative solutions. One of the most pressing challenges in the fashion +industry is the mismatch between individuals' body shapes and the garments +they purchase. This issue is particularly prevalent among individuals with +non-ideal body shapes, exacerbating the challenges faced. Considering +inter-individual variability in body shapes is essential for designing and +producing garments that are widely accepted by consumers. Traditional methods +for determining human body shape are limited due to their low accuracy, high +costs, and time-consuming nature. New approaches, utilizing digital imaging and +deep neural networks (DNN), have been introduced to identify human body shape. +In this study, the Style4BodyShape dataset is used for classifying body shapes +into five categories: Rectangle, Triangle, Inverted Triangle, Hourglass, and +Apple. In this paper, the body shape segmentation of a person is extracted from +the image, disregarding the surroundings and background. Then, various +pre-trained models, such as ResNet18, ResNet34, ResNet50, VGG16, VGG19, and +Inception v3, are used to classify the segmentation results. Among these +pre-trained models, the Inception V3 model demonstrates superior performance +in terms of F1-score and accuracy compared to the other models. + +
+
+
+
+
+ + ☆ A Unified Diffusion Framework for Scene-aware Human Motion Estimation + from Sparse Signals + + +
+ Estimating full-body human motion via sparse tracking signals from +head-mounted displays and hand controllers in 3D scenes is crucial to +applications in AR/VR. One of the biggest challenges to this task is the +one-to-many mapping from sparse observations to dense full-body motions, which +entails inherent ambiguities. To help resolve this ambiguity, we +introduce a new framework to combine rich contextual information provided by +scenes to benefit full-body motion tracking from sparse observations. To +estimate plausible human motions given sparse tracking signals and 3D scenes, +we develop $\text{S}^2$Fusion, a unified framework fusing \underline{S}cene and +sparse \underline{S}ignals with a conditional dif\underline{Fusion} model. +$\text{S}^2$Fusion first extracts the spatial-temporal relations residing in +the sparse signals via a periodic autoencoder, and then produces time-alignment +feature embeddings as additional inputs. Subsequently, by drawing initial noisy +motion from a pre-trained prior, $\text{S}^2$Fusion utilizes conditional +diffusion to fuse scene geometry and sparse tracking signals to generate +full-body scene-aware motions. The sampling procedure of $\text{S}^2$Fusion is +further guided by a specially designed scene-penetration loss and +phase-matching loss, which effectively regularizes the motion of the lower body +even in the absence of any tracking signals, making the generated motion much +more plausible and coherent. Extensive experimental results have demonstrated +that our $\text{S}^2$Fusion outperforms the state-of-the-art in terms of +estimation quality and smoothness. + +
+
+
+
+
+ + ☆ A Clinical-oriented Multi-level Contrastive Learning Method for Disease + Diagnosis in Low-quality Medical Images + + +
+ Representation learning offers a conduit to elucidate distinctive features +within the latent space and interpret the deep models. However, the randomness +of lesion distribution and the complexity of low-quality factors in medical +images pose great challenges for models to extract key lesion features. Disease +diagnosis methods guided by contrastive learning (CL) have shown significant +advantages in lesion feature representation. Nevertheless, the effectiveness of +CL is highly dependent on the quality of the positive and negative sample +pairs. In this work, we propose a clinical-oriented multi-level CL framework +that aims to enhance the model's capacity to extract lesion features and +discriminate between lesion and low-quality factors, thereby enabling more +accurate disease diagnosis from low-quality medical images. Specifically, we +first construct multi-level positive and negative pairs to enhance the model's +comprehensive recognition capability of lesion features by integrating +information from different levels and qualities of medical images. Moreover, to +improve the quality of the learned lesion embeddings, we introduce a dynamic +hard sample mining method based on self-paced learning. The proposed CL +framework is validated on two public medical image datasets, EyeQ and Chest +X-ray, demonstrating superior performance compared to other state-of-the-art +disease diagnostic methods. + +
+
+
+
+
+ + ☆ LRNet: Change detection of high-resolution remote sensing imagery via + strategy of localization-then-refinement + + +
+ Change detection, as a research hotspot in the field of remote sensing, has +witnessed continuous development and progress. However, the discrimination of +boundary details remains a significant bottleneck due to the complexity of +surrounding elements between change areas and backgrounds. Discriminating the +boundaries of large change areas results in misalignment, while connecting +boundaries occurs for small change targets. To address the above issues, a +novel network based on the localization-then-refinement strategy is proposed in +this paper, namely LRNet. LRNet consists of two stages: localization and +refinement. In the localization stage, a three-branch encoder simultaneously +extracts original image features and their differential features for +interactive localization of the position of each change area. To minimize +information loss during feature extraction, learnable optimal pooling (LOP) is +proposed to replace the widely used max-pooling. Additionally, this process is +trainable and contributes to the overall optimization of the network. To +effectively interact features from different branches and accurately locate +change areas of various sizes, change alignment attention (C2A) and +hierarchical change alignment module (HCA) are proposed. In the refinement +stage, the localization results from the localization stage are corrected by +constraining the change areas and change edges through the edge-area alignment +module (E2A). Subsequently, the decoder, combined with the difference features +strengthened by C2A in the localization phase, refines change areas of +different sizes, ultimately achieving accurate boundary discrimination of +change areas. The proposed LRNet outperforms 13 other state-of-the-art methods +in terms of comprehensive evaluation metrics and provides the most precise +boundary discrimination results on the LEVIR-CD and WHU-CD datasets. + +
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ☆ Mixture of Low-rank Experts for Transferable AI-Generated Image + Detection + + +
+ Generative models have shown a giant leap in synthesizing photo-realistic +images with minimal expertise, sparking concerns about the authenticity of +online information. This study aims to develop a universal AI-generated image +detector capable of identifying images from diverse sources. Existing methods +struggle to generalize across unseen generative models when provided with +limited sample sources. Inspired by the zero-shot transferability of +pre-trained vision-language models, we seek to harness the nontrivial +visual-world knowledge and descriptive proficiency of CLIP-ViT to generalize +over unknown domains. This paper presents a novel parameter-efficient +fine-tuning approach, mixture of low-rank experts, to fully exploit CLIP-ViT's +potential while preserving knowledge and expanding capacity for transferable +detection. We adapt only the MLP layers of deeper ViT blocks via an integration +of shared and separate LoRAs within an MoE-based structure. Extensive +experiments on public benchmarks show that our method achieves superiority over +state-of-the-art approaches in cross-generator generalization and robustness to +perturbations. Remarkably, our best-performing ViT-L/14 variant requires +training only 0.08% of its parameters to surpass the leading baseline by +3.64% +mAP and +12.72% avg.Acc across unseen diffusion and autoregressive models. This +even outperforms the baseline with just 0.28% of the training data. Our code +and pre-trained models will be available at +https://github.com/zhliuworks/CLIPMoLE. + +
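A compact sketch of the shared-plus-routed LoRA idea on one frozen linear layer, standing in for an MLP projection of a ViT block. The number of experts, ranks, and soft routing below are illustrative choices, not the paper's configuration:

```python
import torch
import torch.nn as nn

class MoLoRALinear(nn.Module):
    """Frozen linear layer + one shared LoRA + a soft mixture of LoRA experts."""
    def __init__(self, base: nn.Linear, n_experts=4, rank=8, alpha=16.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():       # keep the pre-trained weights frozen
            p.requires_grad = False
        d_in, d_out = base.in_features, base.out_features
        self.shared_a = nn.Linear(d_in, rank, bias=False)
        self.shared_b = nn.Linear(rank, d_out, bias=False)
        self.expert_a = nn.ModuleList([nn.Linear(d_in, rank, bias=False) for _ in range(n_experts)])
        self.expert_b = nn.ModuleList([nn.Linear(rank, d_out, bias=False) for _ in range(n_experts)])
        self.router = nn.Linear(d_in, n_experts, bias=False)
        for b in [self.shared_b, *self.expert_b]:
            nn.init.zeros_(b.weight)           # every low-rank update starts as a no-op
        self.scale = alpha / rank

    def forward(self, x):                      # x: (batch, tokens, d_in)
        out = self.base(x) + self.scale * self.shared_b(self.shared_a(x))
        gate = torch.softmax(self.router(x), dim=-1)                 # (B, T, E)
        experts = torch.stack(
            [b(a(x)) for a, b in zip(self.expert_a, self.expert_b)], dim=-1
        )                                                             # (B, T, d_out, E)
        return out + self.scale * (experts * gate.unsqueeze(-2)).sum(-1)

layer = MoLoRALinear(nn.Linear(1024, 4096))    # e.g. one MLP projection of a ViT block
print(layer(torch.randn(2, 16, 1024)).shape)
```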
+
+
+
+
+ + ☆ GauU-Scene V2: Expanse Lidar Image Dataset Shows Unreliable Geometric + Reconstruction Using Gaussian Splatting and NeRF + + +
+ We introduce a novel large-scale scene reconstruction benchmark that utilizes +newly developed 3D representation approaches, Gaussian Splatting and Neural +Radiance Fields, on our expansive GauU-Scene V2 dataset. GauU-Scene V2 +encompasses over 6.5 square kilometers and features a comprehensive RGB dataset +coupled with LiDAR ground truth. This dataset offers a unique blend of urban +and academic environments for advanced spatial analysis, covering more than 6.5 +km^2. We also provide detailed supplementary information on data collection +protocols. Furthermore, we present an easy-to-follow pipeline to align the +COLMAP sparse point cloud with the detailed LiDAR dataset. Our evaluation of +U-Scene, which includes a detailed analysis across various novel viewpoints +using image-based metrics such as SSIM, LPIPS, and PSNR, shows contradictory +results when applying geometric-based metrics, such as Chamfer distance. This +leads to doubts about the reliability of current image-based measurement +metrics and geometric extraction methods on Gaussian Splatting. We also make +the dataset available on the following anonymous project page. + +
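The geometric metric referred to above is the Chamfer distance. A small reference implementation in its common point-to-nearest-point form; whether the benchmark uses squared distances or another variant is not specified here, and the point clouds below are synthetic:

```python
import numpy as np
from scipy.spatial import cKDTree

def chamfer_distance(p, q):
    """Symmetric mean nearest-neighbour distance between two point clouds."""
    d_pq, _ = cKDTree(q).query(p)   # for each point in p, distance to its closest point in q
    d_qp, _ = cKDTree(p).query(q)   # and vice versa
    return d_pq.mean() + d_qp.mean()

# Toy check: a reconstructed cloud versus a jittered stand-in for LiDAR ground truth.
rng = np.random.default_rng(0)
gt = rng.uniform(0, 100, size=(5000, 3))
recon = gt + rng.normal(scale=0.05, size=gt.shape)
print(chamfer_distance(recon, gt))
```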
+
+ comment: 8 pages (no references), 6 figures, 4 tables +
+
+
+
+
+ + ☆ CycleINR: Cycle Implicit Neural Representation for Arbitrary-Scale + Volumetric Super-Resolution of Medical Data CVPR + + +
+ In the realm of medical 3D data, such as CT and MRI images, prevalent +anisotropic resolution is characterized by high intra-slice but diminished +inter-slice resolution. The lowered resolution between adjacent slices poses +challenges, hindering optimal viewing experiences and impeding the development +of robust downstream analysis algorithms. Various volumetric super-resolution +algorithms aim to surmount these challenges, enhancing inter-slice resolution +and overall 3D medical imaging quality. However, existing approaches confront +inherent challenges: 1) often tailored to specific upsampling factors, lacking +flexibility for diverse clinical scenarios; 2) newly generated slices +frequently suffer from over-smoothing, degrading fine details, and leading to +inter-slice inconsistency. In response, this study presents CycleINR, a novel +enhanced Implicit Neural Representation model for 3D medical data volumetric +super-resolution. Leveraging the continuity of the learned implicit function, +the CycleINR model can achieve results with arbitrary up-sampling rates, +eliminating the need for separate training. Additionally, we enhance the grid +sampling in CycleINR with a local attention mechanism and mitigate +over-smoothing by integrating cycle-consistent loss. We introduce a new metric, +Slice-wise Noise Level Inconsistency (SNLI), to quantitatively assess +inter-slice noise level inconsistency. The effectiveness of our approach is +demonstrated through image quality evaluations on an in-house dataset and a +downstream task analysis on the Medical Segmentation Decathlon liver tumor +dataset. + +
+
+ comment: CVPR accepted paper +
+
+
+
+
+ + ☆ HiLo: Detailed and Robust 3D Clothed Human Reconstruction with High-and + Low-Frequency Information of Parametric Models CVPR 2024 + + +
+ Reconstructing 3D clothed human involves creating a detailed geometry of +individuals in clothing, with applications ranging from virtual try-on, movies, +to games. To enable practical and widespread applications, recent advances +propose to generate a clothed human from an RGB image. However, they struggle +to reconstruct detailed and robust avatars simultaneously. We empirically find +that the high-frequency (HF) and low-frequency (LF) information from a +parametric model has the potential to enhance geometry details and improve +robustness to noise, respectively. Based on this, we propose HiLo, namely +clothed human reconstruction with high- and low-frequency information, which +contains two components. 1) To recover detailed geometry using HF information, +we propose a progressive HF Signed Distance Function to enhance the detailed 3D +geometry of a clothed human. We analyze that our progressive learning manner +alleviates large gradients that hinder model convergence. 2) To achieve robust +reconstruction against inaccurate estimation of the parametric model by using +LF information, we propose a spatial interaction implicit function. This +function effectively exploits the complementary spatial information from a +low-resolution voxel grid of the parametric model. Experimental results +demonstrate that HiLo outperforms the state-of-the-art methods by 10.43% and +9.54% in terms of Chamfer distance on the Thuman2.0 and CAPE datasets, +respectively. Additionally, HiLo demonstrates robustness to noise from the +parametric model, challenging poses, and various clothing styles. + +
+
+ comment: CVPR 2024 Accepted Paper +
+
+
+
+
+ + ☆ NeRF2Points: Large-Scale Point Cloud Generation From Street Views' + Radiance Field Optimization + + +
+ Neural Radiance Fields (NeRF) have emerged as a paradigm-shifting methodology +for the photorealistic rendering of objects and environments, enabling the +synthesis of novel viewpoints with remarkable fidelity. This is accomplished +through the strategic utilization of object-centric camera poses characterized +by significant inter-frame overlap. This paper explores a compelling, +alternative utility of NeRF: the derivation of point clouds from aggregated +urban landscape imagery. The transmutation of street-view data into point +clouds is fraught with complexities, attributable to a nexus of interdependent +variables. First, high-quality point cloud generation hinges on precise camera +poses, yet many datasets suffer from inaccuracies in pose metadata. Also, the +standard approach of NeRF is ill-suited for the distinct characteristics of +street-view data from autonomous vehicles in vast, open settings. Autonomous +vehicle cameras often record with limited overlap, leading to blurring, +artifacts, and compromised pavement representation in NeRF-based point clouds. +In this paper, we present NeRF2Points, a tailored NeRF variant for urban point +cloud synthesis, notable for its high-quality output from RGB inputs alone. Our +paper is supported by a bespoke, high-resolution 20-kilometer urban street +dataset, designed for point cloud generation and evaluation. NeRF2Points +adeptly navigates the inherent challenges of NeRF-based point cloud synthesis +through the implementation of the following strategic innovations: (1) +Integration of Weighted Iterative Geometric Optimization (WIGO) and Structure +from Motion (SfM) for enhanced camera pose accuracy, elevating street-view data +precision. (2) Layered Perception and Integrated Modeling (LPiM) is designed +for distinct radiance field modeling in urban environments, resulting in +coherent point cloud representations. + +
+
+ comment: 18 pages +
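+
+ For context, the last step of any NeRF-to-point-cloud pipeline is unprojecting rendered depth into world space; the sketch below shows only that generic step (the intrinsics K and camera-to-world pose c2w are assumed inputs) and does not reproduce the paper's WIGO pose refinement or LPiM radiance field modeling:
+
+ import numpy as np
+
+ def depth_to_points(depth, K, c2w):
+     """depth: (H, W) metres; K: (3, 3) intrinsics; c2w: (4, 4) camera-to-world pose."""
+     h, w = depth.shape
+     u, v = np.meshgrid(np.arange(w), np.arange(h))
+     pix = np.stack([u, v, np.ones_like(u)], axis=-1).reshape(-1, 3)  # homogeneous pixels
+     rays_cam = pix @ np.linalg.inv(K).T              # camera-frame directions (z = 1)
+     pts_cam = rays_cam * depth.reshape(-1, 1)        # scale by rendered depth
+     return pts_cam @ c2w[:3, :3].T + c2w[:3, 3]      # rigid transform into world frame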
+
+
+
+
+ + ☆ Data Stream Sampling with Fuzzy Task Boundaries and Noisy Labels + + +
+ In the realm of continual learning, the presence of noisy labels within data streams represents a notable obstacle to model reliability and fairness. We focus on the data stream scenario outlined in pertinent literature, characterized by fuzzy task boundaries and noisy labels. To address this challenge, we introduce a novel and intuitive sampling method called Noisy Test Debiasing (NTD) to mitigate noisy labels in evolving data streams and establish a fair and robust continual learning algorithm. NTD is straightforward to implement, making it feasible across various scenarios. Our experiments benchmark NTD on four datasets, including two synthetic-noise datasets (CIFAR10 and CIFAR100) and two real-world-noise datasets (mini-WebVision and Food-101N). The results validate the efficacy of NTD for online continual learning in scenarios with noisy labels in data streams. Compared to the previous leading approach, NTD trains more than twice as fast while maintaining or surpassing its accuracy. Moreover, NTD utilizes less than one-fifth of the GPU memory resources of previous leading methods.
+
+
+
+
+ + ☆ On the Learnability of Out-of-distribution Detection NeurIPS 2022 + + +
+ Supervised learning aims to train a classifier under the assumption that +training and test data are from the same distribution. To ease the above +assumption, researchers have studied a more realistic setting: +out-of-distribution (OOD) detection, where test data may come from classes that +are unknown during training (i.e., OOD data). Due to the unavailability and +diversity of OOD data, good generalization ability is crucial for effective OOD +detection algorithms, and corresponding learning theory is still an open +problem. To study the generalization of OOD detection, this paper investigates +the probably approximately correct (PAC) learning theory of OOD detection that +fits the commonly used evaluation metrics in the literature. First, we find a +necessary condition for the learnability of OOD detection. Then, using this +condition, we prove several impossibility theorems for the learnability of OOD +detection under some scenarios. Although the impossibility theorems are +frustrating, we find that some conditions of these impossibility theorems may +not hold in some practical scenarios. Based on this observation, we next give +several necessary and sufficient conditions to characterize the learnability of +OOD detection in some practical scenarios. Lastly, we offer theoretical support +for representative OOD detection works based on our OOD theory. + +
+
+ comment: Accepted by JMLR on April 7, 2024. This is a journal extension of the previous NeurIPS 2022 Outstanding Paper "Is Out-of-distribution Detection Learnable?" [arXiv:2210.14707]
+
+
+
+
+ + ☆ ByteEdit: Boost, Comply and Accelerate Generative Image Editing + + +
+ Recent advancements in diffusion-based generative image editing have sparked a profound revolution, reshaping the landscape of image outpainting and inpainting tasks. Despite these strides, the field grapples with inherent challenges, including: i) inferior quality; ii) poor consistency; iii) insufficient instruction adherence; iv) suboptimal generation efficiency. To address these obstacles, we present ByteEdit, an innovative feedback learning framework meticulously designed to Boost, Comply, and Accelerate Generative Image Editing tasks. ByteEdit seamlessly integrates image reward models dedicated to enhancing aesthetics and image-text alignment, while also introducing a dense, pixel-level reward model tailored to foster coherence in the output. Furthermore, we propose a pioneering adversarial and progressive feedback learning strategy to expedite the model's inference speed. Through extensive large-scale user evaluations, we demonstrate that ByteEdit surpasses leading generative image editing products, including Adobe, Canva, and MeiTu, in both generation quality and consistency. ByteEdit-Outpainting exhibits a remarkable enhancement of 388% and 135% in quality and consistency, respectively, when compared to the baseline model. Experiments also verified that our accelerated models maintain excellent performance in terms of quality and consistency.
+
+
+
+
+ + ☆ Msmsfnet: a multi-stream and multi-scale fusion net for edge detection + + +
+ Edge detection is a long-standing problem in computer vision. Recent deep learning based algorithms achieve state-of-the-art performance on publicly available datasets. Despite their efficiency, the performance of these algorithms relies heavily on the pretrained weights of the backbone network on the ImageNet dataset, which heavily limits the design space of deep learning based edge detectors. Whenever we want to devise a new model, we have to train this new model on the ImageNet dataset first and then fine-tune the model using the edge detection datasets; the comparison would be unfair otherwise. However, it is usually not feasible for many researchers to train a model on the ImageNet dataset due to limited computation resources. In this work, we study the performance that can be achieved by state-of-the-art deep learning based edge detectors on publicly available datasets when they are trained from scratch, and devise a new network architecture, the multi-stream and multi-scale fusion net (msmsfnet), for edge detection. We show in our experiments that, by training all models from scratch to ensure fairness of comparison, our model outperforms state-of-the-art deep learning based edge detectors on three publicly available datasets.
+
+
+
+
+ + ☆ Task-Aware Encoder Control for Deep Video Compression CVPR 2024 + + +
+ Prior research on deep video compression (DVC) for machine tasks typically necessitates training a unique codec for each specific task, mandating a dedicated decoder per task. In contrast, traditional video codecs employ a flexible encoder controller, enabling the adaptation of a single codec to different tasks through mechanisms like mode prediction. Drawing inspiration from this, we introduce an innovative encoder controller for deep video compression for machines. This controller features a mode prediction and a Group of Pictures (GoP) selection module. Our approach centralizes control at the encoding stage, allowing for adaptable encoder adjustments across different tasks, such as detection and tracking, while maintaining compatibility with a standard pre-trained DVC decoder. Empirical evidence demonstrates that our method is applicable across multiple tasks with various existing pre-trained DVCs. Moreover, extensive experiments demonstrate that our method outperforms previous DVC approaches by about 25% in bitrate across different tasks, while using only one pre-trained decoder.
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ ShoeModel: Learning to Wear on the User-specified Shoes via Diffusion + Model + + +
+ With the development of large-scale diffusion models, Artificial Intelligence Generated Content (AIGC) techniques have become popular recently. However, how to truly make them serve our daily lives remains an open question. To this end, in this paper, we focus on employing AIGC techniques in one field of E-commerce marketing, i.e., generating hyper-realistic advertising images that display user-specified shoes worn by humans. Specifically, we propose a shoe-wearing system, called ShoeModel, to generate plausible images of human legs interacting with the given shoes. It consists of three modules: (1) a shoe wearable-area detection module (WD), (2) a leg-pose synthesis module (LpS), and (3) a final shoe-wearing image generation module (SW). The three modules are executed in sequential stages. Compared to baselines, our ShoeModel is shown to generalize better to different types of shoes, is able to keep the ID-consistency of the given shoes, and automatically produces reasonable interactions with humans. Extensive experiments show the effectiveness of our proposed shoe-wearing system. Figure 1 shows the input and output examples of our ShoeModel.
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Strictly-ID-Preserved and Controllable Accessory Advertising Image + Generation + + +
+ Customized generative text-to-image models have the ability to produce images +that closely resemble a given subject. However, in the context of generating +advertising images for e-commerce scenarios, it is crucial that the generated +subject's identity aligns perfectly with the product being advertised. In order +to address the need for strictly-ID preserved advertising image generation, we +have developed a Control-Net based customized image generation pipeline and +have taken earring model advertising as an example. Our approach facilitates a +seamless interaction between the earrings and the model's face, while ensuring +that the identity of the earrings remains intact. Furthermore, to achieve a +diverse and controllable display, we have proposed a multi-branch +cross-attention architecture, which allows for control over the scale, pose, +and appearance of the model, going beyond the limitations of text prompts. Our +method manages to achieve fine-grained control of the generated model's face, +resulting in controllable and captivating advertising effects. + +
+
+ comment: 22 pages +
+
+
+
+
+ + ☆ 3D Building Reconstruction from Monocular Remote Sensing Images with + Multi-level Supervisions CVPR 2024 + + +
+ 3D building reconstruction from monocular remote sensing images is an +important and challenging research problem that has received increasing +attention in recent years, owing to its low cost of data acquisition and +availability for large-scale applications. However, existing methods rely on +expensive 3D-annotated samples for fully-supervised training, restricting their +application to large-scale cross-city scenarios. In this work, we propose +MLS-BRN, a multi-level supervised building reconstruction network that can +flexibly utilize training samples with different annotation levels to achieve +better reconstruction results in an end-to-end manner. To alleviate the demand +on full 3D supervision, we design two new modules, Pseudo Building Bbox +Calculator and Roof-Offset guided Footprint Extractor, as well as new tasks and +training strategies for different types of samples. Experimental results on +several public and new datasets demonstrate that our proposed MLS-BRN achieves +competitive performance using much fewer 3D-annotated samples, and +significantly improves the footprint extraction and 3D reconstruction +performance compared with current state-of-the-art. The code and datasets of +this work will be released at https://github.com/opendatalab/MLS-BRN.git. + +
+
+ comment: accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Joint Reconstruction of 3D Human and Object via Contact-Based Refinement + Transformer CVPR 2024 + + +
+ Human-object contact serves as a strong cue to understand how humans +physically interact with objects. Nevertheless, it is not widely explored to +utilize human-object contact information for the joint reconstruction of 3D +human and object from a single image. In this work, we present a novel joint 3D +human-object reconstruction method (CONTHO) that effectively exploits contact +information between humans and objects. There are two core designs in our +system: 1) 3D-guided contact estimation and 2) contact-based 3D human and +object refinement. First, for accurate human-object contact estimation, CONTHO +initially reconstructs 3D humans and objects and utilizes them as explicit 3D +guidance for contact estimation. Second, to refine the initial reconstructions +of 3D human and object, we propose a novel contact-based refinement Transformer +that effectively aggregates human features and object features based on the +estimated human-object contact. The proposed contact-based refinement prevents +the learning of erroneous correlation between human and object, which enables +accurate 3D reconstruction. As a result, our CONTHO achieves state-of-the-art +performance in both human-object contact estimation and joint reconstruction of +3D human and object. The code is publicly available at +https://github.com/dqj5182/CONTHO_RELEASE. + +
+
+ comment: Published at CVPR 2024, 19 pages including the supplementary material +
+
+
+
+
+ + ☆ DWE+: Dual-Way Matching Enhanced Framework for Multimodal Entity Linking + + +
+ Multimodal entity linking (MEL) aims to utilize multimodal information (usually textual and visual information) to link ambiguous mentions to unambiguous entities in a knowledge base. Current methods face three main issues: (1) treating the entire image as input may introduce redundant information; (2) entity-related information, such as attributes in images, is insufficiently utilized; (3) there is semantic inconsistency between the entity in the knowledge base and its representation. To this end, we propose DWE+ for multimodal entity linking. DWE+ can capture finer semantics and dynamically maintain semantic consistency with entities. This is achieved through three aspects: (a) we introduce a method for extracting fine-grained image features by partitioning the image into multiple local objects, and then use hierarchical contrastive learning to further align semantics between coarse-grained information (text and image) and fine-grained information (mention and visual objects); (b) we explore ways to extract visual attributes from images, such as facial features and identity, to enhance the fused features; (c) we leverage Wikipedia and ChatGPT to capture the entity representation, achieving semantic enrichment from both static and dynamic perspectives, which better reflects real-world entity semantics. Experiments on the Wikimel, Richpedia, and Wikidiverse datasets demonstrate the effectiveness of DWE+ in improving MEL performance. Specifically, we optimize these datasets and achieve state-of-the-art performance on the enhanced datasets. The code and enhanced datasets are released at https://github.com/season1blue/DWET
+
+ comment: under review on TOIS +
+
+
+
+
+ + ☆ MemFlow: Optical Flow Estimation and Prediction with Memory CVPR 2024 + + +
+ Optical flow is a classical task that is important to the vision community. +Classical optical flow estimation uses two frames as input, whilst some recent +methods consider multiple frames to explicitly model long-range information. +The former ones limit their ability to fully leverage temporal coherence along +the video sequence; and the latter ones incur heavy computational overhead, +typically not possible for real-time flow estimation. Some multi-frame-based +approaches even necessitate unseen future frames for current estimation, +compromising real-time applicability in safety-critical scenarios. To this end, +we present MemFlow, a real-time method for optical flow estimation and +prediction with memory. Our method enables memory read-out and update modules +for aggregating historical motion information in real-time. Furthermore, we +integrate resolution-adaptive re-scaling to accommodate diverse video +resolutions. Besides, our approach seamlessly extends to the future prediction +of optical flow based on past observations. Leveraging effective historical +motion aggregation, our method outperforms VideoFlow with fewer parameters and +faster inference speed on Sintel and KITTI-15 datasets in terms of +generalization performance. At the time of submission, MemFlow also leads in +performance on the 1080p Spring dataset. Codes and models will be available at: +https://dqiaole.github.io/MemFlow/. + +
+
+ comment: CVPR 2024 +
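+
+ The memory read-out and update idea can be pictured with the minimal sketch below: keep a bounded buffer of past motion features and aggregate them into the current estimate with one cross-attention step. The buffer size, shapes, and residual read-out are assumptions for illustration; MemFlow's actual modules are more elaborate:
+
+ import torch
+ import torch.nn.functional as F
+
+ class MotionMemory:
+     def __init__(self, max_len=4):
+         self.max_len = max_len
+         self.buffer = []                               # list of (N, D) past motion features
+
+     def update(self, feat):
+         self.buffer.append(feat.detach())
+         self.buffer = self.buffer[-self.max_len:]      # drop the oldest entries
+
+     def read(self, query):                             # query: (N, D) current features
+         if not self.buffer:
+             return query
+         mem = torch.cat(self.buffer, dim=0)            # (T*N, D) historical features
+         attn = F.softmax(query @ mem.t() / query.shape[-1] ** 0.5, dim=-1)
+         return query + attn @ mem                      # residual read-out of history
+
+ memory = MotionMemory()
+ for _ in range(3):                                     # simulate a short video stream
+     cur = torch.randn(100, 64)
+     fused = memory.read(cur)
+     memory.update(cur)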
+
+
+
+
+ + ☆ D2SL: Decouple Defogging and Semantic Learning for Foggy Domain-Adaptive + Segmentation + + +
+ We investigated domain adaptive semantic segmentation in foggy weather +scenarios, which aims to enhance the utilization of unlabeled foggy data and +improve the model's adaptability to foggy conditions. Current methods rely on +clear images as references, jointly learning defogging and segmentation for +foggy images. Despite making some progress, there are still two main drawbacks: +(1) the coupling of segmentation and defogging feature representations, +resulting in a decrease in semantic representation capability, and (2) the +failure to leverage real fog priors in unlabeled foggy data, leading to +insufficient model generalization ability. To address these issues, we propose +a novel training framework, Decouple Defogging and Semantic learning, called +D2SL, aiming to alleviate the adverse impact of defogging tasks on the final +segmentation task. In this framework, we introduce a domain-consistent transfer +strategy to establish a connection between defogging and segmentation tasks. +Furthermore, we design a real fog transfer strategy to improve defogging +effects by fully leveraging the fog priors from real foggy images. Our approach +enhances the semantic representations required for segmentation during the +defogging learning process and maximizes the representation capability of fog +invariance by effectively utilizing real fog data. Comprehensive experiments +validate the effectiveness of the proposed method. + +
+
+
+
+
+ + ☆ Light the Night: A Multi-Condition Diffusion Framework for Unpaired + Low-Light Enhancement in Autonomous Driving CVPR 2024 + + +
+ Vision-centric perception systems for autonomous driving have gained +considerable attention recently due to their cost-effectiveness and +scalability, especially compared to LiDAR-based systems. However, these systems +often struggle in low-light conditions, potentially compromising their +performance and safety. To address this, our paper introduces LightDiff, a +domain-tailored framework designed to enhance the low-light image quality for +autonomous driving applications. Specifically, we employ a multi-condition +controlled diffusion model. LightDiff works without any human-collected paired +data, leveraging a dynamic data degradation process instead. It incorporates a +novel multi-condition adapter that adaptively controls the input weights from +different modalities, including depth maps, RGB images, and text captions, to +effectively illuminate dark scenes while maintaining context consistency. +Furthermore, to align the enhanced images with the detection model's knowledge, +LightDiff employs perception-specific scores as rewards to guide the diffusion +training process through reinforcement learning. Extensive experiments on the +nuScenes datasets demonstrate that LightDiff can significantly improve the +performance of several state-of-the-art 3D detectors in night-time conditions +while achieving high visual quality scores, highlighting its potential to +safeguard autonomous driving. + +
+
+ comment: This paper is accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Coordinated Sparse Recovery of Label Noise + + +
+ Label noise is a common issue in real-world datasets that inevitably impacts the generalization of models. This study focuses on robust classification tasks where the label noise is instance-dependent. Estimating the transition matrix accurately in this task is challenging, and methods based on sample selection often exhibit confirmation bias to varying degrees. Sparse over-parameterized training (SOP) has been theoretically effective in estimating and recovering label noise, offering a novel solution for noisy-label learning. However, this study empirically observes and verifies a technical flaw of SOP: the lack of coordination between model predictions and noise recovery leads to increased generalization error. To address this, we propose a method called Coordinated Sparse Recovery (CSR). CSR introduces a collaboration matrix and confidence weights to coordinate model predictions and noise recovery, reducing error leakage. Based on CSR, this study designs a joint sample selection strategy and constructs a comprehensive and powerful learning framework called CSR+. CSR+ significantly reduces confirmation bias, especially for datasets with more classes and a high proportion of instance-specific noise. Experimental results on simulated and real-world noisy datasets demonstrate that both CSR and CSR+ achieve outstanding performance compared to comparable methods.
+
+ comment: Pre-print prior to submission to journal +
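+
+ For intuition about the sparse over-parameterized training (SOP) that CSR coordinates, the toy sketch below fits each observed one-hot label as the model prediction plus a per-sample term u*u - v*v, whose implicit bias keeps the recovered noise sparse. The collaboration matrix and confidence weights that CSR adds on top are not shown; shapes and learning rates are illustrative assumptions:
+
+ import torch
+ import torch.nn.functional as F
+
+ n, num_classes, dim = 256, 10, 32
+ model = torch.nn.Linear(dim, num_classes)
+ u = (1e-3 * torch.randn(n, num_classes)).requires_grad_()   # per-sample noise parameters
+ v = (1e-3 * torch.randn(n, num_classes)).requires_grad_()
+ opt = torch.optim.SGD(
+     [{"params": model.parameters()}, {"params": [u, v], "lr": 1.0}], lr=1e-2)
+
+ x = torch.randn(n, dim)
+ noisy_y = F.one_hot(torch.randint(0, num_classes, (n,)), num_classes).float()
+
+ for _ in range(200):
+     pred = F.softmax(model(x), dim=1)
+     residual = noisy_y - (pred + u * u - v * v)   # recovered label noise lives in u*u - v*v
+     loss = residual.pow(2).mean()
+     opt.zero_grad()
+     loss.backward()
+     opt.step()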
+
+
+
+
+ + ☆ Few-Shot Object Detection: Research Advances and Challenges + + +
+ Object detection, a subfield of computer vision that aims to accurately identify and locate specific objects in images or videos, has achieved remarkable progress. Such methods rely on large-scale labeled training samples for each object category to ensure accurate detection, but obtaining extensive annotated data is a labor-intensive and expensive process in many real-world scenarios. To tackle this challenge, researchers have explored few-shot object detection (FSOD), which combines few-shot learning and object detection techniques to rapidly adapt to novel objects with limited annotated samples. This paper presents a comprehensive survey that reviews the significant advancements in the field of FSOD in recent years and summarizes the existing challenges and solutions. Specifically, we first introduce the background and definition of FSOD to emphasize its potential value in advancing the field of computer vision. We then propose a novel FSOD taxonomy and survey the notable FSOD algorithms under it to provide a comprehensive overview that facilitates a deeper understanding of the FSOD problem and the development of innovative solutions. Finally, we discuss the advantages and limitations of these algorithms to summarize the challenges, potential research directions, and development trends of object detection in data-scarce scenarios.
+
+
+
+
+ + ☆ Rethinking Diffusion Model for Multi-Contrast MRI Super-Resolution CVPR2024 + + +
+ Recently, diffusion models (DM) have been applied in magnetic resonance +imaging (MRI) super-resolution (SR) reconstruction, exhibiting impressive +performance, especially with regard to detailed reconstruction. However, the +current DM-based SR reconstruction methods still face the following issues: (1) +They require a large number of iterations to reconstruct the final image, which +is inefficient and consumes a significant amount of computational resources. +(2) The results reconstructed by these methods are often misaligned with the +real high-resolution images, leading to remarkable distortion in the +reconstructed MR images. To address the aforementioned issues, we propose an +efficient diffusion model for multi-contrast MRI SR, named as DiffMSR. +Specifically, we apply DM in a highly compact low-dimensional latent space to +generate prior knowledge with high-frequency detail information. The highly +compact latent space ensures that DM requires only a few simple iterations to +produce accurate prior knowledge. In addition, we design the Prior-Guide Large +Window Transformer (PLWformer) as the decoder for DM, which can extend the +receptive field while fully utilizing the prior knowledge generated by DM to +ensure that the reconstructed MR image remains undistorted. Extensive +experiments on public and clinical datasets demonstrate that our DiffMSR +outperforms state-of-the-art methods. + +
+
+ comment: 14 pages, 12 figures, Accepted by CVPR2024 +
+
+
+
+
+ + ☆ GenEARL: A Training-Free Generative Framework for Multimodal Event + Argument Role Labeling + + +
+ Multimodal event argument role labeling (EARL), a task that assigns a role to each event participant (object) in an image, is a complex challenge. It requires reasoning over the entire image, the depicted event, and the interactions between the various objects participating in the event. Existing models heavily rely on high-quality event-annotated training data to understand the event semantics and structures, and they fail to generalize to new event types and domains. In this paper, we propose GenEARL, a training-free generative framework that harnesses the power of modern generative models to understand event task descriptions given image contexts and perform the EARL task. Specifically, GenEARL comprises two stages of generative prompting with a frozen vision-language model (VLM) and a frozen large language model (LLM). First, a generative VLM learns the semantics of the event argument roles and generates event-centric object descriptions based on the image. Subsequently, an LLM is prompted with the generated object descriptions and a predefined template for EARL (i.e., assign an object an event argument role). We show that GenEARL outperforms the contrastive pretraining (CLIP) baseline by 9.4% and 14.2% accuracy for zero-shot EARL on the M2E2 and SwiG datasets, respectively. In addition, we outperform CLIP-Event by 22% precision on the M2E2 dataset. The framework also allows flexible adaptation and generalization to unseen domains.
+
+ comment: 20 pages, 15 Figures, 13 figures +
+
+
+
+
+ + ☆ X-VARS: Introducing Explainability in Football Refereeing with + Multi-Modal Large Language Model + + +
+ The rapid advancement of artificial intelligence has led to significant +improvements in automated decision-making. However, the increased performance +of models often comes at the cost of explainability and transparency of their +decision-making processes. In this paper, we investigate the capabilities of +large language models to explain decisions, using football refereeing as a +testing ground, given its decision complexity and subjectivity. We introduce +the Explainable Video Assistant Referee System, X-VARS, a multi-modal large +language model designed for understanding football videos from the point of +view of a referee. X-VARS can perform a multitude of tasks, including video +description, question answering, action recognition, and conducting meaningful +conversations based on video content and in accordance with the Laws of the +Game for football referees. We validate X-VARS on our novel dataset, +SoccerNet-XFoul, which consists of more than 22k video-question-answer triplets +annotated by over 70 experienced football referees. Our experiments and human +study illustrate the impressive capabilities of X-VARS in interpreting complex +football clips. Furthermore, we highlight the potential of X-VARS to reach +human performance and support football referees in the future. + +
+
+
+
+
+ + ♻ ☆ PIGEON: Predicting Image Geolocations + + +
+ Planet-scale image geolocalization remains a challenging problem due to the +diversity of images originating from anywhere in the world. Although approaches +based on vision transformers have made significant progress in geolocalization +accuracy, success in prior literature is constrained to narrow distributions of +images of landmarks, and performance has not generalized to unseen places. We +present a new geolocalization system that combines semantic geocell creation, +multi-task contrastive pretraining, and a novel loss function. Additionally, +our work is the first to perform retrieval over location clusters for guess +refinements. We train two models for evaluations on street-level data and +general-purpose image geolocalization; the first model, PIGEON, is trained on +data from the game of Geoguessr and is capable of placing over 40% of its +guesses within 25 kilometers of the target location globally. We also develop a +bot and deploy PIGEON in a blind experiment against humans, ranking in the top +0.01% of players. We further challenge one of the world's foremost professional +Geoguessr players to a series of six matches with millions of viewers, winning +all six games. Our second model, PIGEOTTO, differs in that it is trained on a +dataset of images from Flickr and Wikipedia, achieving state-of-the-art results +on a wide range of image geolocalization benchmarks, outperforming the previous +SOTA by up to 7.7 percentage points on the city accuracy level and up to 38.8 +percentage points on the country level. Our findings suggest that PIGEOTTO is +the first image geolocalization model that effectively generalizes to unseen +places and that our approach can pave the way for highly accurate, planet-scale +image geolocalization systems. Our code is available on GitHub. + +
+
+ comment: Preprint +
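+
+ As an aside on the "within 25 kilometers" criterion above, a guess can be scored against the target with the haversine great-circle distance; the snippet below is an illustrative check (the coordinates are made up), not the authors' evaluation code:
+
+ import math
+
+ def haversine_km(lat1, lon1, lat2, lon2, radius_km=6371.0):
+     """Great-circle distance in kilometres between two (lat, lon) points in degrees."""
+     phi1, phi2 = math.radians(lat1), math.radians(lat2)
+     dphi, dlam = math.radians(lat2 - lat1), math.radians(lon2 - lon1)
+     a = math.sin(dphi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlam / 2) ** 2
+     return 2 * radius_km * math.asin(math.sqrt(a))
+
+ guess, target = (48.8566, 2.3522), (48.7000, 2.2000)   # hypothetical coordinates
+ print(haversine_km(*guess, *target) <= 25.0)           # True if the guess counts as correct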
+
+
+
+
+ + ♻ ☆ MMSFormer: Multimodal Transformer for Material and Semantic Segmentation + + +
+ Leveraging information across diverse modalities is known to enhance +performance on multimodal segmentation tasks. However, effectively fusing +information from different modalities remains challenging due to the unique +characteristics of each modality. In this paper, we propose a novel fusion +strategy that can effectively fuse information from different modality +combinations. We also propose a new model named Multi-Modal Segmentation +TransFormer (MMSFormer) that incorporates the proposed fusion strategy to +perform multimodal material and semantic segmentation tasks. MMSFormer +outperforms current state-of-the-art models on three different datasets. As we +begin with only one input modality, performance improves progressively as +additional modalities are incorporated, showcasing the effectiveness of the +fusion block in combining useful information from diverse input modalities. +Ablation studies show that different modules in the fusion block are crucial +for overall model performance. Furthermore, our ablation studies also highlight +the capacity of different input modalities to improve performance in the +identification of different types of materials. The code and pretrained models +will be made available at https://github.com/csiplab/MMSFormer. + +
+
+ comment: Accepted by IEEE Open Journal of Signal Processing. 15 pages, 3 + figures, 9 tables +
+
+
+
+
+ + ♻ ☆ AG-ReID.v2: Bridging Aerial and Ground Views for Person + Re-identification + + +
+ Aerial-ground person re-identification (Re-ID) presents unique challenges in +computer vision, stemming from the distinct differences in viewpoints, poses, +and resolutions between high-altitude aerial and ground-based cameras. Existing +research predominantly focuses on ground-to-ground matching, with aerial +matching less explored due to a dearth of comprehensive datasets. To address +this, we introduce AG-ReID.v2, a dataset specifically designed for person Re-ID +in mixed aerial and ground scenarios. This dataset comprises 100,502 images of +1,615 unique individuals, each annotated with matching IDs and 15 soft +attribute labels. Data were collected from diverse perspectives using a UAV, +stationary CCTV, and smart glasses-integrated camera, providing a rich variety +of intra-identity variations. Additionally, we have developed an explainable +attention network tailored for this dataset. This network features a +three-stream architecture that efficiently processes pairwise image distances, +emphasizes key top-down features, and adapts to variations in appearance due to +altitude differences. Comparative evaluations demonstrate the superiority of +our approach over existing baselines. We plan to release the dataset and +algorithm source code publicly, aiming to advance research in this specialized +field of computer vision. For access, please visit +https://github.com/huynguyen792/AG-ReID.v2. + +
+
+ comment: 13 pages, Accepted by TIFS 2023 +
+
+
+
+
+ + ♻ ☆ Relightful Harmonization: Lighting-aware Portrait Background Replacement CVPR 2024 + + +
+ Portrait harmonization aims to composite a subject into a new background, +adjusting its lighting and color to ensure harmony with the background scene. +Existing harmonization techniques often only focus on adjusting the global +color and brightness of the foreground and ignore crucial illumination cues +from the background such as apparent lighting direction, leading to unrealistic +compositions. We introduce Relightful Harmonization, a lighting-aware diffusion +model designed to seamlessly harmonize sophisticated lighting effect for the +foreground portrait using any background image. Our approach unfolds in three +stages. First, we introduce a lighting representation module that allows our +diffusion model to encode lighting information from target image background. +Second, we introduce an alignment network that aligns lighting features learned +from image background with lighting features learned from panorama environment +maps, which is a complete representation for scene illumination. Last, to +further boost the photorealism of the proposed method, we introduce a novel +data simulation pipeline that generates synthetic training pairs from a diverse +range of natural images, which are used to refine the model. Our method +outperforms existing benchmarks in visual fidelity and lighting coherence, +showing superior generalization in real-world testing scenarios, highlighting +its versatility and practicality. + +
+
+ comment: CVPR 2024 camera ready +
+
+
+
+
+ + ♻ ☆ Zero-TPrune: Zero-Shot Token Pruning through Leveraging of the Attention + Graph in Pre-Trained Transformers CVPR + + +
+ Deployment of Transformer models on edge devices is becoming increasingly +challenging due to the exponentially growing inference cost that scales +quadratically with the number of tokens in the input sequence. Token pruning is +an emerging solution to address this challenge due to its ease of deployment on +various Transformer backbones. However, most token pruning methods require +computationally expensive fine-tuning, which is undesirable in many edge +deployment cases. In this work, we propose Zero-TPrune, the first zero-shot +method that considers both the importance and similarity of tokens in +performing token pruning. It leverages the attention graph of pre-trained +Transformer models to produce an importance distribution for tokens via our +proposed Weighted Page Rank (WPR) algorithm. This distribution further guides +token partitioning for efficient similarity-based pruning. Due to the +elimination of the fine-tuning overhead, Zero-TPrune can prune large models at +negligible computational cost, switch between different pruning configurations +at no computational cost, and perform hyperparameter tuning efficiently. We +evaluate the performance of Zero-TPrune on vision tasks by applying it to +various vision Transformer backbones and testing them on ImageNet. Without any +fine-tuning, Zero-TPrune reduces the FLOPs cost of DeiT-S by 34.7% and improves +its throughput by 45.3% with only 0.4% accuracy loss. Compared with +state-of-the-art pruning methods that require fine-tuning, Zero-TPrune not only +eliminates the need for fine-tuning after pruning but also does so with only +0.1% accuracy loss. Compared with state-of-the-art fine-tuning-free pruning +methods, Zero-TPrune reduces accuracy loss by up to 49% with similar FLOPs +budgets. Project webpage: https://jha-lab.github.io/zerotprune. + +
+
+ comment: IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) + 2024 +
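+
+ To make the attention-graph idea concrete, the sketch below runs a PageRank-style power iteration over a head-averaged attention matrix to score token importance and keeps the top-scoring tokens; the exact Weighted Page Rank (WPR) weighting and the similarity-based pruning stage of Zero-TPrune are not reproduced here:
+
+ import numpy as np
+
+ def token_importance(attn, damping=0.85, iters=50):
+     """attn: (heads, N, N) softmaxed attention, rows summing to 1."""
+     a = attn.mean(axis=0)                          # average the attention graph over heads
+     n = a.shape[0]
+     p = np.full(n, 1.0 / n)                        # uniform initial importance
+     for _ in range(iters):
+         p = (1 - damping) / n + damping * (p @ a)  # random-walk step along attention edges
+     return p / p.sum()
+
+ rng = np.random.default_rng(0)
+ logits = rng.normal(size=(8, 16, 16))
+ attn = np.exp(logits) / np.exp(logits).sum(-1, keepdims=True)
+ keep = np.argsort(token_importance(attn))[-8:]     # indices of the 8 most important tokens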
+
+
+
+
+ + ♻ ☆ Spatio-Temporal Turbulence Mitigation: A Translational Perspective CVPR 2024 + + +
+ Recovering images distorted by atmospheric turbulence is a challenging +inverse problem due to the stochastic nature of turbulence. Although numerous +turbulence mitigation (TM) algorithms have been proposed, their efficiency and +generalization to real-world dynamic scenarios remain severely limited. +Building upon the intuitions of classical TM algorithms, we present the Deep +Atmospheric TUrbulence Mitigation network (DATUM). DATUM aims to overcome major +challenges when transitioning from classical to deep learning approaches. By +carefully integrating the merits of classical multi-frame TM methods into a +deep network structure, we demonstrate that DATUM can efficiently perform +long-range temporal aggregation using a recurrent fashion, while deformable +attention and temporal-channel attention seamlessly facilitate pixel +registration and lucky imaging. With additional supervision, tilt and blur +degradation can be jointly mitigated. These inductive biases empower DATUM to +significantly outperform existing methods while delivering a tenfold increase +in processing speed. A large-scale training dataset, ATSyn, is presented as a +co-invention to enable generalization in real turbulence. Our code and datasets +are available at https://xg416.github.io/DATUM. + +
+
+ comment: Accepted by CVPR 2024, project page https://xg416.github.io/DATUM/ +
+
+
+
+
+ + ♻ ☆ Get a Grip: Reconstructing Hand-Object Stable Grasps in Egocentric + Videos + + +
+ We propose the task of Hand-Object Stable Grasp Reconstruction (HO-SGR), the +reconstruction of frames during which the hand is stably holding the object. We +first develop the stable grasp definition based on the intuition that the +in-contact area between the hand and object should remain stable. By analysing +the 3D ARCTIC dataset, we identify stable grasp durations and showcase that +objects in stable grasps move within a single degree of freedom (1-DoF). We +thereby propose a method to jointly optimise all frames within a stable grasp, +minimising object motions to a latent 1-DoF. Finally, we extend the knowledge +to in-the-wild videos by labelling 2.4K clips of stable grasps. Our proposed +EPIC-Grasps dataset includes 390 object instances of 9 categories, featuring +stable grasps from videos of daily interactions in 141 environments. Without 3D +ground truth, we use stable contact areas and 2D projection masks to assess the +HO-SGR task in the wild. We evaluate relevant methods and our approach +preserves significantly higher stable contact area, on both EPIC-Grasps and +stable grasp sub-sequences from the ARCTIC dataset. + +
+
+ comment: webpage: https://zhifanzhu.github.io/getagrip +
+
+
+
+
+ + ♻ ☆ DragDiffusion: Harnessing Diffusion Models for Interactive Point-based + Image Editing + + +
+ Accurate and controllable image editing is a challenging task that has +attracted significant attention recently. Notably, DragGAN is an interactive +point-based image editing framework that achieves impressive editing results +with pixel-level precision. However, due to its reliance on generative +adversarial networks (GANs), its generality is limited by the capacity of +pretrained GAN models. In this work, we extend this editing framework to +diffusion models and propose a novel approach DragDiffusion. By harnessing +large-scale pretrained diffusion models, we greatly enhance the applicability +of interactive point-based editing on both real and diffusion-generated images. +Our approach involves optimizing the diffusion latents to achieve precise +spatial control. The supervision signal of this optimization process is from +the diffusion model's UNet features, which are known to contain rich semantic +and geometric information. Moreover, we introduce two additional techniques, +namely LoRA fine-tuning and latent-MasaCtrl, to further preserve the identity +of the original image. Lastly, we present a challenging benchmark dataset +called DragBench -- the first benchmark to evaluate the performance of +interactive point-based image editing methods. Experiments across a wide range +of challenging cases (e.g., images with multiple objects, diverse object +categories, various styles, etc.) demonstrate the versatility and generality of +DragDiffusion. Code: https://github.com/Yujun-Shi/DragDiffusion. + +
+
+ comment: Code is released at https://github.com/Yujun-Shi/DragDiffusion +
+
+
+
+
+ + ♻ ☆ Demystifying CLIP Data + + +
+ Contrastive Language-Image Pre-training (CLIP) is an approach that has +advanced research and applications in computer vision, fueling modern +recognition systems and generative models. We believe that the main ingredient +to the success of CLIP is its data and not the model architecture or +pre-training objective. However, CLIP only provides very limited information +about its data and how it has been collected, leading to works that aim to +reproduce CLIP's data by filtering with its model parameters. In this work, we +intend to reveal CLIP's data curation approach and in our pursuit of making it +open to the community introduce Metadata-Curated Language-Image Pre-training +(MetaCLIP). MetaCLIP takes a raw data pool and metadata (derived from CLIP's +concepts) and yields a balanced subset over the metadata distribution. Our +experimental study rigorously isolates the model and training settings, +concentrating solely on data. MetaCLIP applied to CommonCrawl with 400M +image-text data pairs outperforms CLIP's data on multiple standard benchmarks. +In zero-shot ImageNet classification, MetaCLIP achieves 70.8% accuracy, +surpassing CLIP's 68.3% on ViT-B models. Scaling to 1B data, while maintaining +the same training budget, attains 72.4%. Our observations hold across various +model sizes, exemplified by ViT-H achieving 80.5%, without any +bells-and-whistles. Curation code and training data distribution on metadata is +made available at https://github.com/facebookresearch/MetaCLIP. + +
+
+ comment: 17 pages. arXiv admin note: text overlap with arXiv:2103.00020 by + other authors +
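+
+ The balancing idea can be pictured with the toy sketch below: cap how many image-text pairs any single metadata entry may contribute so that head entries stop dominating the curated subset. The cap value and the matching function are illustrative assumptions, not MetaCLIP's released curation code:
+
+ import random
+ from collections import defaultdict
+
+ def balance_pool(pairs, entry_of, cap=20, seed=0):
+     """pairs: list of (image_id, text); entry_of: maps a pair to its metadata entry."""
+     random.seed(seed)
+     buckets = defaultdict(list)
+     for pair in pairs:
+         buckets[entry_of(pair)].append(pair)
+     subset = []
+     for entry, items in buckets.items():
+         random.shuffle(items)
+         subset.extend(items[:cap])    # tail entries are kept fully, head entries are capped
+     return subset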
+
+
+
+
+ + ♻ ☆ Mimicking the Oracle: An Initial Phase Decorrelation Approach for Class + Incremental Learning CVPR 2022 + + +
+ Class Incremental Learning (CIL) aims at learning a multi-class classifier in a phase-by-phase manner, in which only data of a subset of the classes are provided at each phase. Previous works mainly focus on mitigating forgetting in phases after the initial one. However, we find that improving CIL at its initial phase is also a promising direction. Specifically, we experimentally show that directly encouraging the CIL learner at the initial phase to output similar representations as the model jointly trained on all classes can greatly boost the CIL performance. Motivated by this, we study the difference between a naïvely-trained initial-phase model and the oracle model. Specifically, since one major difference between these two models is the number of training classes, we investigate how such difference affects the model representations. We find that, with fewer training classes, the data representations of each class lie in a long and narrow region; with more training classes, the representations of each class scatter more uniformly. Inspired by this observation, we propose Class-wise Decorrelation (CwD) that effectively regularizes representations of each class to scatter more uniformly, thus mimicking the model jointly trained with all classes (i.e., the oracle model). Our CwD is simple to implement and easy to plug into existing methods. Extensive experiments on various benchmark datasets show that CwD consistently and significantly improves the performance of existing state-of-the-art methods by around 1% to 3%. Code will be released.
+
+ comment: CVPR 2022 Camera-Ready Version +
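+
+ One plausible way to realise a class-wise decorrelation penalty is sketched below: per class, standardise the representations and penalise the squared entries of their correlation matrix so features scatter more uniformly. The exact normalisation of the CwD loss in the paper may differ; treat this as an assumption-laden illustration:
+
+ import torch
+
+ def cwd_penalty(features, labels, eps=1e-5):
+     """features: (B, D) representations; labels: (B,) integer class ids."""
+     classes = labels.unique()
+     loss = features.new_zeros(())
+     for c in classes:
+         z = features[labels == c]
+         if z.shape[0] < 2:
+             continue                                  # need at least two samples per class
+         z = (z - z.mean(0)) / (z.std(0) + eps)        # standardise each feature dimension
+         corr = z.T @ z / z.shape[0]                   # (D, D) correlation matrix
+         loss = loss + (corr ** 2).mean()              # push correlations towards zero
+     return loss / max(len(classes), 1)
+
+ feats = torch.randn(64, 128, requires_grad=True)
+ y = torch.randint(0, 10, (64,))
+ reg = cwd_penalty(feats, y)    # added to the classification loss with a small coefficient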
+
+
+
+
+ + ♻ ☆ Hidden in Plain Sight: Undetectable Adversarial Bias Attacks on + Vulnerable Patient Populations + + +
+ The proliferation of artificial intelligence (AI) in radiology has shed light +on the risk of deep learning (DL) models exacerbating clinical biases towards +vulnerable patient populations. While prior literature has focused on +quantifying biases exhibited by trained DL models, demographically targeted +adversarial bias attacks on DL models and its implication in the clinical +environment remains an underexplored field of research in medical imaging. In +this work, we demonstrate that demographically targeted label poisoning attacks +can introduce undetectable underdiagnosis bias in DL models. Our results across +multiple performance metrics and demographic groups like sex, age, and their +intersectional subgroups show that adversarial bias attacks demonstrate +high-selectivity for bias in the targeted group by degrading group model +performance without impacting overall model performance. Furthermore, our +results indicate that adversarial bias attacks result in biased DL models that +propagate prediction bias even when evaluated with external datasets. + +
+
+ comment: 29 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ NiteDR: Nighttime Image De-Raining with Cross-View Sensor Cooperative + Learning for Dynamic Driving Scenes + + +
+ In real-world environments, outdoor imaging systems are often affected by disturbances such as rain degradation. In nighttime driving scenes especially, insufficient and uneven lighting shrouds the scenes in darkness, resulting in degradation of both image quality and visibility. In the field of autonomous driving in particular, the visual perception ability of RGB sensors experiences a sharp decline in such harsh scenarios. Additionally, driving assistance systems suffer from reduced capabilities in capturing and discerning the surrounding environment, posing a threat to driving safety. Single-view information captured by single-modal sensors cannot comprehensively depict the entire scene. To address these challenges, we developed an image de-raining framework tailored for rainy nighttime driving scenes. It aims to remove rain artifacts, enrich scene representation, and restore useful information. Specifically, we introduce cooperative learning between visible and infrared images captured by different sensors. By cross-view fusion of these multi-source data, the scene within the images gains richer texture details and enhanced contrast. We constructed an information cleaning module called CleanNet as the first stage of our framework. Moreover, we designed an information fusion module called FusionNet as the second stage to fuse the clean visible images with infrared images. Using this stage-by-stage learning strategy, we obtain de-rained fusion images with higher quality and better visual perception. Extensive experiments demonstrate the effectiveness of our proposed Cross-View Cooperative Learning (CVCL) in adverse driving scenarios in low-light rainy environments. The proposed approach addresses the gap in the utilization of existing rain removal algorithms in specific low-light conditions.
+
+
+
+
+ + ♻ ☆ HiPose: Hierarchical Binary Surface Encoding and Correspondence Pruning + for RGB-D 6DoF Object Pose Estimation CVPR 2024 + + +
+ In this work, we present a novel dense-correspondence method for 6DoF object +pose estimation from a single RGB-D image. While many existing data-driven +methods achieve impressive performance, they tend to be time-consuming due to +their reliance on rendering-based refinement approaches. To circumvent this +limitation, we present HiPose, which establishes 3D-3D correspondences in a +coarse-to-fine manner with a hierarchical binary surface encoding. Unlike +previous dense-correspondence methods, we estimate the correspondence surface +by employing point-to-surface matching and iteratively constricting the surface +until it becomes a correspondence point while gradually removing outliers. +Extensive experiments on public benchmarks LM-O, YCB-V, and T-Less demonstrate +that our method surpasses all refinement-free methods and is even on par with +expensive refinement-based approaches. Crucially, our approach is +computationally efficient and enables real-time critical applications with high +accuracy requirements. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ CHAIN: Enhancing Generalization in Data-Efficient GANs via lipsCHitz + continuity constrAIned Normalization CVPR2024 + + +
+ Generative Adversarial Networks (GANs) significantly advanced image generation but their performance heavily depends on abundant training data. In scenarios with limited data, GANs often struggle with discriminator overfitting and unstable training. Batch Normalization (BN), despite being known for enhancing generalization and training stability, has rarely been used in the discriminator of Data-Efficient GANs. Our work addresses this gap by identifying a critical flaw in BN: the tendency for gradient explosion during the centering and scaling steps. To tackle this issue, we present CHAIN (lipsCHitz continuity constrAIned Normalization), which replaces the conventional centering step with zero-mean regularization and integrates a Lipschitz continuity constraint in the scaling step. CHAIN further enhances GAN training by adaptively interpolating the normalized and unnormalized features, effectively avoiding discriminator overfitting. Our theoretical analyses firmly establish CHAIN's effectiveness in reducing gradients in latent features and weights, improving stability and generalization in GAN training. Empirical evidence supports our theory. CHAIN achieves state-of-the-art results in data-limited scenarios on CIFAR-10/100, ImageNet, five low-shot and seven high-resolution few-shot image datasets. Code: https://github.com/MaxwellYaoNi/CHAIN
+
+ comment: Accepted by CVPR2024. 26 pages full version. Code: + https://github.com/MaxwellYaoNi/CHAIN +
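+
+ A rough sketch of the two ingredients named above, dropping the explicit centering step in favour of a zero-mean regulariser on batch means and bounding the learnable gain in the scaling step, is given below. The real CHAIN layer (including its adaptive interpolation of normalised and unnormalised features) is more involved, so treat this as an illustration under stated assumptions only:
+
+ import torch
+ import torch.nn as nn
+
+ class ChainLikeNorm(nn.Module):
+     def __init__(self, channels, eps=1e-5, max_gain=1.0):
+         super().__init__()
+         self.gamma = nn.Parameter(torch.ones(channels))
+         self.eps = eps
+         self.max_gain = max_gain
+
+     def forward(self, x):                                       # x: (B, C) features
+         self.mean_penalty = x.mean(dim=0).pow(2).mean()         # zero-mean regulariser term
+         scale = x.std(dim=0, unbiased=False) + self.eps
+         gain = self.gamma.clamp(-self.max_gain, self.max_gain)  # bound the scaling gain
+         return gain * x / scale                                 # scaling without centering
+
+ norm = ChainLikeNorm(16)
+ out = norm(torch.randn(32, 16))
+ loss = out.pow(2).mean() + 0.1 * norm.mean_penalty   # penalty added to the discriminator loss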
+
+
+
+
+ + ♻ ☆ EVCap: Retrieval-Augmented Image Captioning with External Visual-Name + Memory for Open-World Comprehension CVPR 2024 + + +
+ Large language models (LLMs)-based image captioning has the capability of +describing objects not explicitly observed in training data; yet novel objects +occur frequently, necessitating the requirement of sustaining up-to-date object +knowledge for open-world comprehension. Instead of relying on large amounts of +data and/or scaling up network parameters, we introduce a highly effective +retrieval-augmented image captioning method that prompts LLMs with object names +retrieved from External Visual--name memory (EVCap). We build ever-changing +object knowledge memory using objects' visuals and names, enabling us to (i) +update the memory at a minimal cost and (ii) effortlessly augment LLMs with +retrieved object names by utilizing a lightweight and fast-to-train model. Our +model, which was trained only on the COCO dataset, can adapt to out-of-domain +without requiring additional fine-tuning or re-training. Our experiments +conducted on benchmarks and synthetic commonsense-violating data show that +EVCap, with only 3.97M trainable parameters, exhibits superior performance +compared to other methods based on frozen pre-trained LLMs. Its performance is +also competitive to specialist SOTAs that require extensive training. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Cooperation Does Matter: Exploring Multi-Order Bilateral Relations for + Audio-Visual Segmentation CVPR 2024 + + +
+ Recently, an audio-visual segmentation (AVS) task has been introduced, aiming to group pixels with sounding objects within a given video. This task necessitates a first-ever audio-driven pixel-level understanding of the scene, posing significant challenges. In this paper, we propose an innovative audio-visual transformer framework, termed COMBO, an acronym for COoperation of Multi-order Bilateral relatiOns. For the first time, our framework explores three types of bilateral entanglements within AVS: pixel entanglement, modality entanglement, and temporal entanglement. Regarding pixel entanglement, we employ a Siam-Encoder Module (SEM) that leverages prior knowledge to generate more precise visual features from the foundational model. For modality entanglement, we design a Bilateral-Fusion Module (BFM), enabling COMBO to align corresponding visual and auditory signals bi-directionally. As for temporal entanglement, we introduce an innovative adaptive inter-frame consistency loss according to the inherent rules of temporal coherence. Comprehensive experiments and ablation studies on the AVSBench-object (84.7 mIoU on S4, 59.2 mIoU on MS3) and AVSBench-semantic (42.1 mIoU on AVSS) datasets demonstrate that COMBO surpasses previous state-of-the-art methods. Code and more results will be publicly available at https://yannqi.github.io/AVS-COMBO/.
+
+ comment: CVPR 2024 Highlight. 13 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Video Anomaly Detection via Spatio-Temporal Pseudo-Anomaly Generation : + A Unified Approach CVPR + + +
+ Video Anomaly Detection (VAD) is an open-set recognition task, which is +usually formulated as a one-class classification (OCC) problem, where training +data is comprised of videos with normal instances while test data contains both +normal and anomalous instances. Recent works have investigated the creation of +pseudo-anomalies (PAs) using only the normal data and making strong assumptions +about real-world anomalies with regards to abnormality of objects and speed of +motion to inject prior information about anomalies in an autoencoder (AE) based +reconstruction model during training. This work proposes a novel method for +generating generic spatio-temporal PAs by inpainting a masked out region of an +image using a pre-trained Latent Diffusion Model and further perturbing the +optical flow using mixup to emulate spatio-temporal distortions in the data. In +addition, we present a simple unified framework to detect real-world anomalies +under the OCC setting by learning three types of anomaly indicators, namely +reconstruction quality, temporal irregularity and semantic inconsistency. +Extensive experiments on four VAD benchmark datasets namely Ped2, Avenue, +ShanghaiTech and UBnormal demonstrate that our method performs on par with +other existing state-of-the-art PAs generation and reconstruction based methods +under the OCC setting. Our analysis also examines the transferability and +generalisation of PAs across these datasets, offering valuable insights by +identifying real-world anomalies through PAs. + +
+
+ comment: Accepted in CVPRW 2024 - VAND Workshop +
+
+
+
+
+ + ♻ ☆ Reconstruction and Simulation of Elastic Objects with Spring-Mass 3D + Gaussians + + +
+ Reconstructing and simulating elastic objects from visual observations is +crucial for applications in computer vision and robotics. Existing methods, +such as 3D Gaussians, model 3D appearance and geometry, but lack the ability to +estimate physical properties for objects and simulate them. The core challenge +lies in integrating an expressive yet efficient physical dynamics model. We +propose Spring-Gaus, a 3D physical object representation for reconstructing and +simulating elastic objects from videos of the object from multiple viewpoints. +In particular, we develop and integrate a 3D Spring-Mass model into 3D Gaussian +kernels, enabling the reconstruction of the visual appearance, shape, and +physical dynamics of the object. Our approach enables future prediction and +simulation under various initial states and environmental properties. We +evaluate Spring-Gaus on both synthetic and real-world datasets, demonstrating +accurate reconstruction and simulation of elastic objects. Project page: +https://zlicheng.com/spring_gaus. + +
+
+
+
+
+ + ♻ ☆ A Survey on Transformer Compression + + +
+ Transformers play a vital role in the realms of natural language processing
+(NLP) and computer vision (CV), especially for constructing large language
+models (LLM) and large vision models (LVM). Model compression methods reduce
+the memory and computational cost of Transformers, which is a necessary step to
+implement large language/vision models on practical devices. Given the unique
+architecture of Transformers, featuring alternating attention and feedforward
+neural network (FFN) modules, specific compression techniques are usually
+required. The efficiency of these compression methods is also paramount, as
+retraining large models on the entire training dataset is usually impractical.
+This survey provides a comprehensive review of recent compression methods, with
+a specific focus on their application to Transformer-based models. The
+compression methods are primarily categorized into pruning, quantization,
+knowledge distillation, and efficient architecture design (Mamba, RetNet, RWKV,
+etc.). In each category, we discuss compression methods for both language and
+vision tasks, highlighting common underlying principles. Finally, we delve into
+the relation between various compression methods, and discuss further
+directions in this domain.
+
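+ As a concrete illustration of the pruning category surveyed above, the
+following minimal PyTorch sketch applies one-shot magnitude pruning to a single
+feed-forward projection. The layer sizes and the 50% sparsity level are
+arbitrary assumptions for illustration, not taken from any surveyed method.
+
+import torch
+import torch.nn as nn
+
+# Stand-in for one FFN projection of a Transformer block (sizes are assumptions).
+ffn = nn.Linear(512, 2048)
+sparsity = 0.5  # fraction of weights to zero out
+
+with torch.no_grad():
+    magnitudes = ffn.weight.abs().flatten()
+    k = int(sparsity * magnitudes.numel())
+    threshold = torch.kthvalue(magnitudes, k).values  # k-th smallest magnitude
+    mask = (ffn.weight.abs() > threshold).float()
+    ffn.weight.mul_(mask)  # zero out the smallest-magnitude weights
+
+print(f"achieved sparsity: {(ffn.weight == 0).float().mean():.2%}")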
+
+ comment: Model Compression, Transformer, Large Language Model, Large Vision + Model, LLM +
+
+
+
+
+ + ♻ ☆ Linear Anchored Gaussian Mixture Model for Location and Width + Computation of Objects in Thick Line Shape + + +
+ Accurate detection of the centerlines of linear objects is a challenging
+topic in many sensitive real-world applications such as X-ray imaging, remote
+sensing and lane marking detection in road traffic. Model-based approaches
+using Hough and Radon transforms are often used but are not recommended for
+thick line detection, whereas approaches based on image derivatives need
+further step-by-step processing, making their efficiency dependent on each
+step's outcome. In this paper, we aim to detect linear structures found in
+images by considering the 3D representation of the image gray levels as a
+finite mixture model of statistical distributions. The latter, which we name
+the linear anchored Gaussian distribution, is parametrized by a scale value
+${\sigma}$ describing the linear structure thickness and a line equation,
+parametrized, in turn, by a radius ${\rho}$ and an orientation angle
+${\theta}$, describing the linear structure centerline location. The
+Expectation-Maximization (EM) algorithm is used for the mixture model parameter
+estimation, where a new paradigm, using background subtraction for the
+likelihood function computation, is proposed. For the EM algorithm, two
+${\theta}$ parameter initialization schemes are used: the first one is based on
+a random choice of the first component of the ${\theta}$ vector, whereas the
+second is based on the image Hessian with simultaneous computation of the
+number of mixture model components. Experiments on real-world images and
+synthetic images corrupted by blur and additive noise show the good performance
+of the proposed methods, where the algorithm using background subtraction and
+Hessian-based ${\theta}$ initialization provides outstanding accuracy in linear
+structure detection despite irregular image backgrounds and the presence of
+blur and noise.
+
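+ For intuition, one plausible form of a single such component, reconstructed
+from the description above rather than taken from the paper, anchors a Gaussian
+on the signed distance to the line written in Hough normal form:
+
+\[
+  p(x, y \mid \rho, \theta, \sigma) \;\propto\;
+  \exp\!\left(-\frac{\left(x\cos\theta + y\sin\theta - \rho\right)^{2}}
+                     {2\sigma^{2}}\right),
+  \qquad
+  p(x, y) \;=\; \sum_{k=1}^{K} \pi_{k}\,
+  p\!\left(x, y \mid \rho_{k}, \theta_{k}, \sigma_{k}\right),
+  \quad \sum_{k}\pi_{k}=1,
+\]
+
+where each component's centerline is the line $x\cos\theta_{k} +
+y\sin\theta_{k} = \rho_{k}$ and $\sigma_{k}$ controls its thickness; EM then
+alternates between assigning pixels to components and re-estimating
+$(\pi_{k}, \rho_{k}, \theta_{k}, \sigma_{k})$.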
+
+ comment: 13 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ UPNet: Uncertainty-based Picking Deep Learning Network for Robust First + Break Picking + + +
+ In seismic exploration, first break (FB) picking is a crucial step in the
+determination of subsurface velocity models, significantly influencing the
+placement of wells. Many deep neural network (DNN)-based automatic picking
+methods have been proposed to accelerate this process. Notably, the
+segmentation-based DNN methods provide a segmentation map and then estimate FB
+from the map using a picking threshold. However, the uncertainty of the results
+picked by DNNs still needs to be analyzed. Thus, automatic picking methods
+applied to field datasets cannot ensure robustness, especially in the case of
+a low signal-to-noise ratio (SNR). In this paper, we introduce uncertainty
+quantification into the FB picking task and propose a novel uncertainty-based
+picking deep learning network called UPNet. UPNet not only estimates the
+uncertainty of the network output but can also filter out pickings with low
+confidence. Extensive experiments show that UPNet exhibits higher accuracy and
+robustness than deterministic DNN-based models, achieving state-of-the-art
+(SOTA) performance in field surveys. In addition, we verify that the measured
+uncertainty is meaningful and can provide a reference for human
+decision-making.
+
+
+
+
+
+ + ♻ ☆ UniEdit: A Unified Tuning-Free Framework for Video Motion and Appearance + Editing + + +
+ Recent advances in text-guided video editing have showcased promising results +in appearance editing (e.g., stylization). However, video motion editing in the +temporal dimension (e.g., from eating to waving), which distinguishes video +editing from image editing, is underexplored. In this work, we present UniEdit, +a tuning-free framework that supports both video motion and appearance editing +by harnessing the power of a pre-trained text-to-video generator within an +inversion-then-generation framework. To realize motion editing while preserving +source video content, based on the insights that temporal and spatial +self-attention layers encode inter-frame and intra-frame dependency +respectively, we introduce auxiliary motion-reference and reconstruction +branches to produce text-guided motion and source features respectively. The +obtained features are then injected into the main editing path via temporal and +spatial self-attention layers. Extensive experiments demonstrate that UniEdit +covers video motion editing and various appearance editing scenarios, and +surpasses the state-of-the-art methods. Our code will be publicly available. + +
+
+ comment: Project page: https://jianhongbai.github.io/UniEdit/ +
+
+
+
+
+ + ♻ ☆ SiCL: Silhouette-Driven Contrastive Learning for Unsupervised Person + Re-Identification with Clothes Change + + +
+ In this paper, we address a highly challenging yet critical task:
+unsupervised long-term person re-identification with clothes change. Existing
+unsupervised person re-id methods are mainly designed for short-term scenarios
+and usually rely on RGB cues, and thus fail to perceive feature patterns that
+are independent of the clothes. To address this bottleneck, we propose a
+silhouette-driven contrastive learning (SiCL) method, which is designed to
+learn cross-clothes invariance by integrating both the RGB cues and the
+silhouette information within a contrastive learning framework. To our
+knowledge, this is the first tailor-made framework for unsupervised long-term
+clothes-change re-id, with superior performance on six benchmark datasets. We
+conduct extensive experiments to evaluate our proposed SiCL against
+state-of-the-art unsupervised person re-id methods across all the
+representative datasets. Experimental results demonstrate that our proposed
+SiCL significantly outperforms other unsupervised re-id methods.
+
+
+
+
+
+ + ♻ ☆ DetToolChain: A New Prompting Paradigm to Unleash Detection Ability of + MLLM + + +
+ We present DetToolChain, a novel prompting paradigm, to unleash the zero-shot +object detection ability of multimodal large language models (MLLMs), such as +GPT-4V and Gemini. Our approach consists of a detection prompting toolkit +inspired by high-precision detection priors and a new Chain-of-Thought to +implement these prompts. Specifically, the prompts in the toolkit are designed +to guide the MLLM to focus on regional information (e.g., zooming in), read +coordinates according to measure standards (e.g., overlaying rulers and +compasses), and infer from the contextual information (e.g., overlaying scene +graphs). Building upon these tools, the new detection chain-of-thought can +automatically decompose the task into simple subtasks, diagnose the +predictions, and plan for progressive box refinements. The effectiveness of our +framework is demonstrated across a spectrum of detection tasks, especially hard +cases. Compared to existing state-of-the-art methods, GPT-4V with our +DetToolChain improves state-of-the-art object detectors by +21.5% AP50 on MS +COCO Novel class set for open-vocabulary detection, +24.23% Acc on RefCOCO val +set for zero-shot referring expression comprehension, +14.5% AP on D-cube +describe object detection FULL setting. + +
+
+
+
+
+ + ♻ ☆ Self-Supervised Learning for Medical Image Data with Anatomy-Oriented + Imaging Planes + + +
+ Self-supervised learning has emerged as a powerful tool for pretraining deep
+networks on unlabeled data, prior to transfer learning of target tasks with
+limited annotation. The relevance between the pretraining pretext and target
+tasks is crucial to the success of transfer learning. Various pretext tasks
+have been proposed to utilize properties of medical image data (e.g., three
+dimensionality), which are more relevant to medical image analysis than generic
+ones for natural images. However, previous work rarely paid attention to data
+with anatomy-oriented imaging planes, e.g., standard cardiac magnetic resonance
+imaging views. As these imaging planes are defined according to the anatomy of
+the imaged organ, pretext tasks effectively exploiting this information can
+pretrain the networks to gain knowledge on the organ of interest. In this work,
+we propose two complementary pretext tasks for this group of medical image data
+based on the spatial relationship of the imaging planes. The first is to learn
+the relative orientation between the imaging planes and is implemented as
+regressing their intersecting lines. The second exploits parallel imaging
+planes to regress their relative slice locations within a stack. Both pretext
+tasks are conceptually straightforward and easy to implement, and can be
+combined in multitask learning for better representation learning. Thorough
+experiments on two anatomical structures (heart and knee) and representative
+target tasks (semantic segmentation and classification) demonstrate that the
+proposed pretext tasks are effective in pretraining deep networks for
+remarkably boosted performance on the target tasks, and superior to other
+recent approaches.
+
+
+ comment: Medical Image Analysis +
+
+
+
+
+ + ♻ ☆ From Two-Stream to One-Stream: Efficient RGB-T Tracking via Mutual + Prompt Learning and Knowledge Distillation + + +
+ Due to the complementary nature of visible light and thermal infrared
+modalities, object tracking based on the fusion of visible light images and
+thermal images (referred to as RGB-T tracking) has received increasing
+attention from researchers in recent years. How to achieve more comprehensive
+fusion of information from the two modalities at a lower cost has been an issue
+that researchers have been exploring. Inspired by visual prompt learning, we
+designed a novel two-stream RGB-T tracking architecture based on cross-modal
+mutual prompt learning, and used this model as a teacher to guide a one-stream
+student model for rapid learning through knowledge distillation techniques.
+Extensive experiments have shown that, compared to similar RGB-T trackers, our
+designed teacher model achieves the highest precision rate, while the student
+model, with a precision rate comparable to that of the teacher, achieves an
+inference speed more than three times faster. (Code will be available if
+accepted.)
+
+
+
+
+
+ + ♻ ☆ GS-SLAM: Dense Visual SLAM with 3D Gaussian Splatting CVPR 2024 + + +
+ In this paper, we introduce GS-SLAM, which is the first to utilize a 3D
+Gaussian representation in a Simultaneous Localization and Mapping (SLAM)
+system. It facilitates a better balance between efficiency and accuracy.
+Compared to recent SLAM methods employing neural implicit representations, our
+method utilizes a real-time differentiable splatting rendering pipeline that
+offers significant speedup to map optimization and RGB-D rendering.
+Specifically, we propose an adaptive expansion strategy that adds new or
+deletes noisy 3D Gaussians in order to efficiently reconstruct newly observed
+scene geometry and improve the mapping of previously observed areas. This
+strategy is essential for extending the 3D Gaussian representation to
+reconstruct whole scenes rather than synthesizing a static object as in
+existing methods. Moreover, in the pose tracking process, an effective
+coarse-to-fine technique is designed to select reliable 3D Gaussian
+representations to optimize camera pose, resulting in runtime reduction and
+robust estimation. Our method achieves competitive performance compared with
+existing state-of-the-art real-time methods on the Replica and TUM-RGBD
+datasets. Project page: https://gs-slam.github.io/.
+
+
+ comment: Accepted to CVPR 2024(highlight). Project Page: + https://gs-slam.github.io/ +
+
+
+
+
+ + ♻ ☆ PV-SSD: A Multi-Modal Point Cloud Feature Fusion Method for Projection + Features and Variable Receptive Field Voxel Features + + +
+ LiDAR-based 3D object detection and classification is crucial for autonomous
+driving. However, real-time inference from extremely sparse 3D data is a
+formidable challenge. To address this problem, a typical class of approaches
+transforms the point cloud into a regular data representation (voxels or
+projection maps) and then performs feature extraction with convolutional
+neural networks. However, such methods often result in a certain degree of
+information loss due to down-sampling or over-compression of feature
+information. This paper proposes a multi-modal point cloud feature fusion
+method for projection features and variable receptive field voxel features
+(PV-SSD) based on projection and variable voxelization to solve the information
+loss problem. We design a two-branch feature extraction structure with a 2D
+convolutional neural network to extract the point cloud's projection features
+in bird's-eye view and focus on the correlation between local features. A voxel
+feature extraction branch is used to extract local fine-grained features.
+Meanwhile, we propose a voxel feature extraction method with variable receptive
+fields to reduce the information loss of the voxel branch due to downsampling.
+It avoids missing critical point information by selecting more useful feature
+points based on feature point weights for the detection task. In addition, we
+propose a multi-modal feature fusion module for point clouds. To validate the
+effectiveness of our method, we tested it on the KITTI and ONCE datasets.
+
+
+
+
+
+ + ♻ ☆ LAKE-RED: Camouflaged Images Generation by Latent Background Knowledge + Retrieval-Augmented Diffusion CVPR 2024 + + +
+ Camouflaged vision perception is an important vision task with numerous +practical applications. Due to the expensive collection and labeling costs, +this community struggles with a major bottleneck that the species category of +its datasets is limited to a small number of object species. However, the +existing camouflaged generation methods require specifying the background +manually, thus failing to extend the camouflaged sample diversity in a low-cost +manner. In this paper, we propose a Latent Background Knowledge +Retrieval-Augmented Diffusion (LAKE-RED) for camouflaged image generation. To +our knowledge, our contributions mainly include: (1) For the first time, we +propose a camouflaged generation paradigm that does not need to receive any +background inputs. (2) Our LAKE-RED is the first knowledge retrieval-augmented +method with interpretability for camouflaged generation, in which we propose an +idea that knowledge retrieval and reasoning enhancement are separated +explicitly, to alleviate the task-specific challenges. Moreover, our method is +not restricted to specific foreground targets or backgrounds, offering a +potential for extending camouflaged vision perception to more diverse domains. +(3) Experimental results demonstrate that our method outperforms the existing +approaches, generating more realistic camouflage images. + +
+
+ comment: Accepted by CVPR 2024, Fig.3 revised +
+
+
+
+
+ + ♻ ☆ Extending CLIP's Image-Text Alignment to Referring Image Segmentation NAACL 2024 + + +
+ Referring Image Segmentation (RIS) is a cross-modal task that aims to segment +an instance described by a natural language expression. Recent methods leverage +large-scale pretrained unimodal models as backbones along with fusion +techniques for joint reasoning across modalities. However, the inherent +cross-modal nature of RIS raises questions about the effectiveness of unimodal +backbones. We propose RISCLIP, a novel framework that effectively leverages the +cross-modal nature of CLIP for RIS. Observing CLIP's inherent alignment between +image and text features, we capitalize on this starting point and introduce +simple but strong modules that enhance unimodal feature extraction and leverage +rich alignment knowledge in CLIP's image-text shared-embedding space. RISCLIP +exhibits outstanding results on all three major RIS benchmarks and also +outperforms previous CLIP-based methods, demonstrating the efficacy of our +strategy in extending CLIP's image-text alignment to RIS. + +
+
+ comment: NAACL 2024 +
+
+
+
+
+ + ♻ ☆ Human Mesh Recovery from Arbitrary Multi-view Images + + +
+ Human mesh recovery from arbitrary multi-view images involves two
+characteristics: arbitrary camera poses and an arbitrary number of camera
+views. Because of this variability, designing a unified framework to tackle the
+task is challenging. The challenges can be summarized as the dilemma of being
+able to simultaneously estimate arbitrary camera poses and recover human mesh
+from arbitrary multi-view images while maintaining flexibility. To solve this
+dilemma, we propose a divide-and-conquer framework for Unified Human Mesh
+Recovery (U-HMR) from arbitrary multi-view images. In particular, U-HMR
+consists of a decoupled structure, camera and body decoupling (CBD), and two
+main components: camera pose estimation (CPE) and arbitrary view fusion (AVF).
+As camera poses and the human body mesh are independent of each other, CBD
+splits their estimation into two sub-tasks for two individual sub-networks
+(i.e., CPE and AVF) to handle respectively; thus, the two sub-tasks are
+disentangled. In CPE, since each camera pose is unrelated to the others, we
+adopt a shared MLP to process all views in a parallel way. In AVF, in order to
+fuse multi-view information and make the fusion operation independent of the
+number of views, we introduce a transformer decoder with an SMPL parameter
+query token to extract cross-view features for mesh recovery. To demonstrate
+the efficacy and flexibility of the proposed framework and the effect of each
+component, we conduct extensive experiments on three public datasets:
+Human3.6M, MPI-INF-3DHP, and TotalCapture.
+
+
+
+
+
+ + ♻ ☆ GP-NeRF: Generalized Perception NeRF for Context-Aware 3D Scene + Understanding CVPR 2024 + + +
+ Applying NeRF to downstream perception tasks for scene understanding and
+representation is becoming increasingly popular. Most existing methods treat
+semantic prediction as an additional rendering task, i.e., the "label
+rendering" task, to build semantic NeRFs. However, by rendering
+semantic/instance labels per pixel without considering the contextual
+information of the rendered image, these methods usually suffer from unclear
+boundary segmentation and abnormal segmentation of pixels within an object. To
+solve this problem, we propose Generalized Perception NeRF (GP-NeRF), a novel
+pipeline that makes the widely used segmentation model and NeRF work compatibly
+under a unified framework, for facilitating context-aware 3D scene perception.
+To accomplish this goal, we introduce transformers to aggregate radiance as
+well as semantic embedding fields jointly for novel views and facilitate the
+joint volumetric rendering of both fields. In addition, we propose two
+self-distillation mechanisms, i.e., the Semantic Distill Loss and the
+Depth-Guided Semantic Distill Loss, to enhance the discrimination and quality
+of the semantic field and the maintenance of geometric consistency. In
+evaluation, we conduct experimental comparisons under two perception tasks
+(i.e., semantic and instance segmentation) using both synthetic and real-world
+datasets. Notably, our method outperforms SOTA approaches by 6.94%, 11.76%, and
+8.47% on generalized semantic segmentation, finetuning semantic segmentation,
+and instance segmentation, respectively.
+
+
+ comment: CVPR 2024 (Highlight). Project Page: + https://lifuguan.github.io/gpnerf-pages/ +
+
+
+
+
+ + ♻ ☆ RaFE: Generative Radiance Fields Restoration + + +
+ NeRF (Neural Radiance Fields) has demonstrated tremendous potential in novel +view synthesis and 3D reconstruction, but its performance is sensitive to input +image quality, which struggles to achieve high-fidelity rendering when provided +with low-quality sparse input viewpoints. Previous methods for NeRF restoration +are tailored for specific degradation type, ignoring the generality of +restoration. To overcome this limitation, we propose a generic radiance fields +restoration pipeline, named RaFE, which applies to various types of +degradations, such as low resolution, blurriness, noise, compression artifacts, +or their combinations. Our approach leverages the success of off-the-shelf 2D +restoration methods to recover the multi-view images individually. Instead of +reconstructing a blurred NeRF by averaging inconsistencies, we introduce a +novel approach using Generative Adversarial Networks (GANs) for NeRF generation +to better accommodate the geometric and appearance inconsistencies present in +the multi-view images. Specifically, we adopt a two-level tri-plane +architecture, where the coarse level remains fixed to represent the low-quality +NeRF, and a fine-level residual tri-plane to be added to the coarse level is +modeled as a distribution with GAN to capture potential variations in +restoration. We validate RaFE on both synthetic and real cases for various +restoration tasks, demonstrating superior performance in both quantitative and +qualitative evaluations, surpassing other 3D restoration methods specific to +single task. Please see our project website +https://zkaiwu.github.io/RaFE-Project/. + +
+
+ comment: Project Page: https://zkaiwu.github.io/RaFE +
+
+
+
+
+ + ♻ ☆ Reduction of Class Activation Uncertainty with Background Information + + +
+ Multitask learning is a popular approach to training high-performing neural
+networks with improved generalization. In this paper, we propose a background
+class to achieve improved generalization at a lower computational cost than
+multitask learning, which can help researchers and organizations with limited
+computation power. We also present a methodology for selecting background
+images and discuss potential future improvements. We apply our approach to
+several datasets and achieve improved generalization with much lower
+computation. Through the class activation mappings (CAMs) of the trained
+models, we observe a tendency to look at a bigger picture with the proposed
+model training methodology. Applying the vision transformer with the proposed
+background class, we achieve state-of-the-art (SOTA) performance on STL-10,
+Caltech-101, and CINIC-10 datasets. Example scripts are available in the 'CAM'
+folder of the following GitHub repository: github.com/dipuk0506/UQ
+
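+ A minimal sketch of the background-class idea described above, written as our
+own illustration rather than the authors' code: the classifier head is widened
+by one extra logit and background images are labeled with that extra index. The
+class count, feature width, and batch handling are assumptions.
+
+import torch
+import torch.nn as nn
+
+num_fg_classes = 10                               # e.g., STL-10 (assumption)
+head = nn.Linear(768, num_fg_classes + 1)         # +1 logit for "background"
+background_label = num_fg_classes
+criterion = nn.CrossEntropyLoss()
+
+def training_step(features, labels, bg_features):
+    # Foreground samples keep their labels; background samples get the extra index.
+    bg_labels = torch.full((bg_features.size(0),), background_label)
+    logits = head(torch.cat([features, bg_features], dim=0))
+    targets = torch.cat([labels, bg_labels], dim=0)
+    return criterion(logits, targets)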
+
+
+
+
+ + ♻ ☆ Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data CVPR 2024 + + +
+ This work presents Depth Anything, a highly practical solution for robust +monocular depth estimation. Without pursuing novel technical modules, we aim to +build a simple yet powerful foundation model dealing with any images under any +circumstances. To this end, we scale up the dataset by designing a data engine +to collect and automatically annotate large-scale unlabeled data (~62M), which +significantly enlarges the data coverage and thus is able to reduce the +generalization error. We investigate two simple yet effective strategies that +make data scaling-up promising. First, a more challenging optimization target +is created by leveraging data augmentation tools. It compels the model to +actively seek extra visual knowledge and acquire robust representations. +Second, an auxiliary supervision is developed to enforce the model to inherit +rich semantic priors from pre-trained encoders. We evaluate its zero-shot +capabilities extensively, including six public datasets and randomly captured +photos. It demonstrates impressive generalization ability. Further, through +fine-tuning it with metric depth information from NYUv2 and KITTI, new SOTAs +are set. Our better depth model also results in a better depth-conditioned +ControlNet. Our models are released at +https://github.com/LiheYoung/Depth-Anything. + +
+
+ comment: Accepted by CVPR 2024. Project page: https://depth-anything.github.io +
+
+
+
+
+ + ♻ ☆ StepNet: Spatial-temporal Part-aware Network for Isolated Sign Language + Recognition + + +
+ The goal of sign language recognition (SLR) is to help those who are hard of
+hearing or deaf overcome the communication barrier. Most existing approaches
+can be typically divided into two lines, i.e., Skeleton-based and RGB-based
+methods, but both lines of methods have their limitations. Skeleton-based
+methods do not consider facial expressions, while RGB-based approaches usually
+ignore the fine-grained hand structure. To overcome both limitations, we
+propose a new framework called Spatial-temporal Part-aware network (StepNet),
+based on RGB parts. As its name suggests, it is made up of two modules:
+Part-level Spatial Modeling and Part-level Temporal Modeling. Part-level
+Spatial Modeling, in particular, automatically captures the appearance-based
+properties, such as hands and faces, in the feature space without the use of
+any keypoint-level annotations. On the other hand, Part-level Temporal Modeling
+implicitly mines the long-short term context to capture the relevant attributes
+over time. Extensive experiments demonstrate that our StepNet, thanks to its
+spatial-temporal modules, achieves competitive Top-1 Per-instance accuracy on
+three commonly-used SLR benchmarks, i.e., 56.89% on WLASL, 77.2% on NMFs-CSL,
+and 77.1% on BOBSL. Additionally, the proposed method is compatible with
+optical flow input and can produce superior performance if fused. For those who
+are hard of hearing, we hope that our work can act as a preliminary step.
+
+
+
+
+
+ + ♻ ☆ DeepAAT: Deep Automated Aerial Triangulation for Fast UAV-based Mapping + + +
+ Automated Aerial Triangulation (AAT), aiming to restore image pose and +reconstruct sparse points simultaneously, plays a pivotal role in earth +observation. With its rich research heritage spanning several decades in +photogrammetry, AAT has evolved into a fundamental process widely applied in +large-scale Unmanned Aerial Vehicle (UAV) based mapping. Despite its +advancements, classic AAT methods still face challenges like low efficiency and +limited robustness. This paper introduces DeepAAT, a deep learning network +designed specifically for AAT of UAV imagery. DeepAAT considers both spatial +and spectral characteristics of imagery, enhancing its capability to resolve +erroneous matching pairs and accurately predict image poses. DeepAAT marks a +significant leap in AAT's efficiency, ensuring thorough scene coverage and +precision. Its processing speed outpaces incremental AAT methods by hundreds of +times and global AAT methods by tens of times while maintaining a comparable +level of reconstruction accuracy. Additionally, DeepAAT's scene clustering and +merging strategy facilitate rapid localization and pose determination for +large-scale UAV images, even under constrained computing resources. The +experimental results demonstrate DeepAAT's substantial improvements over +conventional AAT methods, highlighting its potential in the efficiency and +accuracy of UAV-based 3D reconstruction tasks. To benefit the photogrammetry +society, the code of DeepAAT will be released at: +https://github.com/WHU-USI3DV/DeepAAT. + +
+
+
+
+
+ + ♻ ☆ UniPAD: A Universal Pre-training Paradigm for Autonomous Driving CVPR2024 + + +
+ In the context of autonomous driving, the significance of effective feature +learning is widely acknowledged. While conventional 3D self-supervised +pre-training methods have shown widespread success, most methods follow the +ideas originally designed for 2D images. In this paper, we present UniPAD, a +novel self-supervised learning paradigm applying 3D volumetric differentiable +rendering. UniPAD implicitly encodes 3D space, facilitating the reconstruction +of continuous 3D shape structures and the intricate appearance characteristics +of their 2D projections. The flexibility of our method enables seamless +integration into both 2D and 3D frameworks, enabling a more holistic +comprehension of the scenes. We manifest the feasibility and effectiveness of +UniPAD by conducting extensive experiments on various downstream 3D tasks. Our +method significantly improves lidar-, camera-, and lidar-camera-based baseline +by 9.1, 7.7, and 6.9 NDS, respectively. Notably, our pre-training pipeline +achieves 73.2 NDS for 3D object detection and 79.4 mIoU for 3D semantic +segmentation on the nuScenes validation set, achieving state-of-the-art results +in comparison with previous methods. The code will be available at +https://github.com/Nightmare-n/UniPAD. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ CityGaussian: Real-time High-quality Large-Scale Scene Rendering with + Gaussians + + +
+ The advancement of real-time 3D scene reconstruction and novel view synthesis
+has been significantly propelled by 3D Gaussian Splatting (3DGS). However,
+effectively training large-scale 3DGS and rendering it in real-time across
+various scales remains challenging. This paper introduces CityGaussian
+(CityGS), which employs a novel divide-and-conquer training approach and
+Level-of-Detail (LoD) strategy for efficient large-scale 3DGS training and
+rendering. Specifically, the global scene prior and adaptive training data
+selection enable efficient training and seamless fusion. Based on fused
+Gaussian primitives, we generate different detail levels through compression,
+and realize fast rendering across various scales through the proposed
+block-wise detail level selection and aggregation strategy. Extensive
+experimental results on large-scale scenes demonstrate that our approach
+attains state-of-the-art rendering quality, enabling consistent real-time
+rendering of large-scale scenes across vastly different scales. Our project
+page is available at https://dekuliutesla.github.io/citygs/.
+
+
+ comment: Project Page: https://dekuliutesla.github.io/citygs/ +
+
+
+
+
+ + ♻ ☆ Towards AI-Architecture Liberty: A Comprehensive Survey on Designing and + Collaborating Virtual Architecture by Deep Learning in the Metaverse + + +
+ 3D shape generation techniques leveraging deep learning have garnered
+significant interest from both the computer vision and architectural design
+communities, promising to enrich the content of the future metaverse. However,
+research on virtual architectural design remains limited, particularly
+regarding human-AI collaboration and deep learning-assisted design. We first
+illuminate the principles, generation techniques, and current literature of
+virtual architecture, focusing on challenges such as datasets, multimodality,
+design intuition, and generative frameworks. In our survey, we reviewed 187
+related articles (80.7% of which were published between 2018 and 2022) covering
+architectural research, virtual environments, and technical approaches. This
+survey investigates the latest approaches to 3D object generation with deep
+generative models (DGMs) and summarizes four characteristics of deep-learning
+generation approaches for virtual architecture. According to our analysis of
+the survey, we expound on four research agendas, including agency,
+communication, user consideration, and integrating tools, and highlight three
+important enablers of ubiquitous interaction with immersive systems in deep
+learning-assisted architectural generation. Our work contributes to fostering
+understanding between designers and deep learning techniques, broadening access
+to human-AI collaboration. We advocate for interdisciplinary efforts to address
+this timely research topic, facilitating content designing and generation in
+the metaverse.
+
+
+ comment: 37 pages, 9 figures, and 5 tables +
+
+
+
+
+ + ♻ ☆ ARS-DETR: Aspect Ratio-Sensitive Detection Transformer for Aerial + Oriented Object Detection + + +
+ Existing oriented object detection methods commonly use the metric AP$_{50}$
+to measure the performance of the model. We argue that AP$_{50}$ is inherently
+unsuitable for oriented object detection due to its large tolerance in angle
+deviation. Therefore, we advocate using a high-precision metric, e.g.
+AP$_{75}$, to measure the performance of models. In this paper, we propose an
+Aspect Ratio Sensitive Oriented Object Detector with Transformer, termed
+ARS-DETR, which exhibits a competitive performance in high-precision oriented
+object detection. Specifically, a new angle classification method, called
+Aspect Ratio aware Circle Smooth Label (AR-CSL), is proposed to smooth the
+angle label in a more reasonable way and discard the hyperparameter introduced
+by previous work (e.g. CSL). Then, a rotated deformable attention module is
+designed to rotate the sampling points with the corresponding angles and
+eliminate the misalignment between region features and sampling points.
+Moreover, a dynamic weight coefficient according to the aspect ratio is adopted
+to calculate the angle loss. Comprehensive experiments on several challenging
+datasets show that our method achieves competitive performance on the
+high-precision oriented object detection task.
+
+
+ comment: 15 pages, 13 figures, 13 tables, the source code is available at + https://github.com/httle/ARS-DETR +
+
+
+
+
+ + ♻ ☆ Bi-LORA: A Vision-Language Approach for Synthetic Image Detection + + +
+ Advancements in deep image synthesis techniques, such as generative +adversarial networks (GANs) and diffusion models (DMs), have ushered in an era +of generating highly realistic images. While this technological progress has +captured significant interest, it has also raised concerns about the potential +difficulty in distinguishing real images from their synthetic counterparts. +This paper takes inspiration from the potent convergence capabilities between +vision and language, coupled with the zero-shot nature of vision-language +models (VLMs). We introduce an innovative method called Bi-LORA that leverages +VLMs, combined with low-rank adaptation (LORA) tuning techniques, to enhance +the precision of synthetic image detection for unseen model-generated images. +The pivotal conceptual shift in our methodology revolves around reframing +binary classification as an image captioning task, leveraging the distinctive +capabilities of cutting-edge VLM, notably bootstrapping language image +pre-training (BLIP2). Rigorous and comprehensive experiments are conducted to +validate the effectiveness of our proposed approach, particularly in detecting +unseen diffusion-generated images from unknown diffusion-based generative +models during training, showcasing robustness to noise, and demonstrating +generalization capabilities to GANs. The obtained results showcase an +impressive average accuracy of 93.41% in synthetic image detection on unseen +generation models. The code and models associated with this research can be +publicly accessed at https://github.com/Mamadou-Keita/VLM-DETECT. + +
+
+
+
+
+ + ♻ ☆ Sketch3D: Style-Consistent Guidance for Sketch-to-3D Generation + + +
+ Recently, image-to-3D approaches have achieved significant results with a
+natural image as input. However, it is not always possible to access these
+enriched color input samples in practical applications, where only sketches are
+available. Existing sketch-to-3D research suffers from limitations in broad
+applications due to the challenges of lacking color information and multi-view
+content. To overcome these limitations, this paper proposes a novel generation
+paradigm, Sketch3D, to generate realistic 3D assets whose shape is aligned with
+the input sketch and whose color matches the textual description. Concretely,
+Sketch3D first instantiates the given sketch in the reference image through the
+shape-preserving generation process. Second, the reference image is leveraged
+to deduce a coarse 3D Gaussian prior, and multi-view style-consistent guidance
+images are generated based on the renderings of the 3D Gaussians. Finally,
+three strategies are designed to optimize the 3D Gaussians, i.e., structural
+optimization via a distribution transfer mechanism, color optimization with a
+straightforward MSE loss, and sketch similarity optimization with a CLIP-based
+geometric similarity loss. Extensive visual comparisons and quantitative
+analysis illustrate the advantage of our Sketch3D in generating realistic 3D
+assets while preserving consistency with the input.
+
+
+
+
+
+ + ♻ ☆ Training Like a Medical Resident: Context-Prior Learning Toward + Universal Medical Image Segmentation CVPR 2024 + + +
+ A major focus of clinical imaging workflow is disease diagnosis and
+management, leading to medical imaging datasets strongly tied to specific
+clinical objectives. This scenario has led to the prevailing practice of
+developing task-specific segmentation models, without gaining insights from
+widespread imaging cohorts. Inspired by the training program of medical
+radiology residents, we propose a shift towards universal medical image
+segmentation, a paradigm aiming to build medical image understanding foundation
+models by leveraging the diversity and commonality across clinical targets,
+body regions, and imaging modalities. Towards this goal, we develop Hermes, a
+novel context-prior learning approach to address the challenges of data
+heterogeneity and annotation differences in medical image segmentation. In a
+large collection of eleven diverse datasets (2,438 3D images) across five
+modalities (CT, PET, T1, T2 and cine MRI) and multiple body regions, we
+demonstrate the merit of the universal paradigm over the traditional paradigm
+on addressing multiple tasks within a single model. By exploiting the synergy
+across tasks, Hermes achieves state-of-the-art performance on all testing
+datasets and shows superior model scalability. Results on two additional
+datasets reveal Hermes' strong performance for transfer learning, incremental
+learning, and generalization to downstream tasks. Hermes' learned priors
+demonstrate an appealing trait of reflecting the intricate relations among
+tasks and modalities, which aligns with the established anatomical and imaging
+principles in radiology. The code is available at:
+https://github.com/yhygao/universal-medical-image-segmentation.
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Tailored Visions: Enhancing Text-to-Image Generation with Personalized + Prompt Rewriting CVPR 2024 + + +
+ Despite significant progress in the field, it is still challenging to create +personalized visual representations that align closely with the desires and +preferences of individual users. This process requires users to articulate +their ideas in words that are both comprehensible to the models and accurately +capture their vision, posing difficulties for many users. In this paper, we +tackle this challenge by leveraging historical user interactions with the +system to enhance user prompts. We propose a novel approach that involves +rewriting user prompts based on a newly collected large-scale text-to-image +dataset with over 300k prompts from 3115 users. Our rewriting model enhances +the expressiveness and alignment of user prompts with their intended visual +outputs. Experimental results demonstrate the superiority of our methods over +baseline approaches, as evidenced in our new offline evaluation method and +online tests. Our code and dataset are available at +https://github.com/zzjchen/Tailored-Visions. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ GraphAD: Interaction Scene Graph for End-to-end Autonomous Driving + + +
+ Modeling complicated interactions among the ego-vehicle, road agents, and map
+elements has been a crucial part of safety-critical autonomous driving.
+Previous works on end-to-end autonomous driving rely on the attention mechanism
+for handling heterogeneous interactions, which fails to capture geometric
+priors and is also computationally intensive. In this paper, we propose the
+Interaction Scene Graph (ISG) as a unified method to model the interactions
+among the ego-vehicle, road agents, and map elements. With the representation
+of the ISG, the driving agents aggregate essential information from the most
+influential elements, including the road agents with potential collisions and
+the map elements to follow. Since a large number of unnecessary interactions
+are omitted, the more efficient scene-graph-based framework is able to focus on
+indispensable connections and leads to better performance. We evaluate the
+proposed method for end-to-end autonomous driving on the nuScenes dataset.
+Compared with strong baselines, our method significantly outperforms them on
+full-stack driving tasks, including perception, prediction, and planning. Code
+will be released at https://github.com/zhangyp15/GraphAD.
+
+
+ comment: project page: https://github.com/zhangyp15/GraphAD +
+
+
+
+
+ + ♻ ☆ Towards a Simultaneous and Granular Identity-Expression Control in + Personalized Face Generation + + +
+ In human-centric content generation, pre-trained text-to-image models
+struggle to produce user-desired portrait images, which retain the identity of
+individuals while exhibiting diverse expressions. This paper introduces our
+efforts towards personalized face generation. To this end, we propose a novel
+multi-modal face generation framework, capable of simultaneous
+identity-expression control and more fine-grained expression synthesis. Our
+expression control is sophisticated enough to be specified by a fine-grained
+emotional vocabulary. We devise a novel diffusion model that can undertake the
+task of simultaneous face swapping and reenactment. Due to the entanglement of
+identity and expression, separately and precisely controlling them within one
+framework is nontrivial and has not been explored yet. To overcome this, we
+propose several innovative designs in the conditional diffusion model,
+including a balancing identity and expression encoder, improved midpoint
+sampling, and explicit background conditioning. Extensive experiments have
+demonstrated the controllability and scalability of the proposed framework, in
+comparison with state-of-the-art text-to-image, face swapping, and face
+reenactment methods.
+
+
+
+
+
+ + ♻ ☆ FineDiffusion: Scaling up Diffusion Models for Fine-grained Image + Generation with 10,000 Classes + + +
+ Class-conditional image generation based on diffusion models is renowned for
+generating high-quality and diverse images. However, most prior efforts focus
+on generating images for general categories, e.g., the 1000 classes in
+ImageNet-1k. A more challenging task, large-scale fine-grained image
+generation, remains a frontier yet to be explored. In this work, we present a
+parameter-efficient strategy, called FineDiffusion, to fine-tune large
+pre-trained diffusion models, scaling to large-scale fine-grained image
+generation with 10,000 categories. FineDiffusion significantly accelerates
+training and reduces storage overhead by fine-tuning only the tiered class
+embedder, bias terms, and normalization layers' parameters. To further improve
+the image generation quality of fine-grained categories, we propose a novel
+sampling method for fine-grained image generation, which utilizes
+superclass-conditioned guidance, specifically tailored for fine-grained
+categories, to replace the conventional classifier-free guidance sampling.
+Compared to full fine-tuning, FineDiffusion achieves a remarkable 1.56x
+training speed-up and requires storing merely 1.77% of the total model
+parameters, while achieving a state-of-the-art FID of 9.776 on image generation
+of 10,000 classes. Extensive qualitative and quantitative experiments
+demonstrate the superiority of our method compared to other parameter-efficient
+fine-tuning methods. The code and more generated results are available at our
+project website: https://finediffusion.github.io/.
+
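+ The parameter-efficient recipe above can be sketched in a few lines of
+PyTorch: freeze the pre-trained model and mark only class-embedding, bias, and
+normalization parameters as trainable. The name filters below are assumptions
+for illustration, not FineDiffusion's actual module names.
+
+import torch.nn as nn
+
+def mark_trainable(model: nn.Module) -> int:
+    """Freeze everything except class-embedding, bias, and norm parameters."""
+    trainable = 0
+    for name, param in model.named_parameters():
+        keep = (
+            "class_emb" in name          # assumed name of the (tiered) class embedder
+            or name.endswith(".bias")    # bias terms
+            or "norm" in name.lower()    # LayerNorm / GroupNorm parameters
+        )
+        param.requires_grad = keep
+        if keep:
+            trainable += param.numel()
+    return trainable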
+
+
+
+
+ + ♻ ☆ Feature Re-Embedding: Towards Foundation Model-Level Performance in + Computational Pathology CVPR2024 + + +
+ Multiple instance learning (MIL) is the most widely used framework in
+computational pathology, encompassing sub-typing, diagnosis, prognosis, and
+more. However, the existing MIL paradigm typically requires an offline instance
+feature extractor, such as a pre-trained ResNet or a foundation model. This
+approach lacks the capability for feature fine-tuning within specific
+downstream tasks, limiting its adaptability and performance. To address this
+issue, we propose a Re-embedded Regional Transformer (R$^2$T) for re-embedding
+the instance features online, which captures fine-grained local features and
+establishes connections across different regions. Unlike existing works that
+focus on pre-training a powerful feature extractor or designing a sophisticated
+instance aggregator, R$^2$T is tailored to re-embed instance features online.
+It serves as a portable module that can seamlessly integrate into mainstream
+MIL models. Extensive experimental results on common computational pathology
+tasks validate that: 1) feature re-embedding improves the performance of MIL
+models based on ResNet-50 features to the level of foundation model features,
+and further enhances the performance of foundation model features; 2) R$^2$T
+can introduce more significant performance improvements to various MIL models;
+3) R$^2$T-MIL, as an R$^2$T-enhanced AB-MIL, outperforms other latest methods
+by a large margin. The code is available at:
+https://github.com/DearCaat/RRT-MIL.
+
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ GenAD: Generative End-to-End Autonomous Driving + + +
+ Directly producing planning results from raw sensors has been a long-desired +solution for autonomous driving and has attracted increasing attention +recently. Most existing end-to-end autonomous driving methods factorize this +problem into perception, motion prediction, and planning. However, we argue +that the conventional progressive pipeline still cannot comprehensively model +the entire traffic evolution process, e.g., the future interaction between the +ego car and other traffic participants and the structural trajectory prior. In +this paper, we explore a new paradigm for end-to-end autonomous driving, where +the key is to predict how the ego car and the surroundings evolve given past +scenes. We propose GenAD, a generative framework that casts autonomous driving +into a generative modeling problem. We propose an instance-centric scene +tokenizer that first transforms the surrounding scenes into map-aware instance +tokens. We then employ a variational autoencoder to learn the future trajectory +distribution in a structural latent space for trajectory prior modeling. We +further adopt a temporal model to capture the agent and ego movements in the +latent space to generate more effective future trajectories. GenAD finally +simultaneously performs motion prediction and planning by sampling +distributions in the learned structural latent space conditioned on the +instance tokens and using the learned temporal model to generate futures. +Extensive experiments on the widely used nuScenes benchmark show that the +proposed GenAD achieves state-of-the-art performance on vision-centric +end-to-end autonomous driving with high efficiency. Code: +https://github.com/wzzheng/GenAD. + +
+
+ comment: Code is available at: https://github.com/wzzheng/GenAD +
+
+
+
+
+ + ♻ ☆ CCEdit: Creative and Controllable Video Editing via Diffusion Models + + +
+ In this paper, we present CCEdit, a versatile generative video editing +framework based on diffusion models. Our approach employs a novel trident +network structure that separates structure and appearance control, ensuring +precise and creative editing capabilities. Utilizing the foundational +ControlNet architecture, we maintain the structural integrity of the video +during editing. The incorporation of an additional appearance branch enables +users to exert fine-grained control over the edited key frame. These two side +branches seamlessly integrate into the main branch, which is constructed upon +existing text-to-image (T2I) generation models, through learnable temporal +layers. The versatility of our framework is demonstrated through a diverse +range of choices in both structure representations and personalized T2I models, +as well as the option to provide the edited key frame. To facilitate +comprehensive evaluation, we introduce the BalanceCC benchmark dataset, +comprising 100 videos and 4 target prompts for each video. Our extensive user +studies compare CCEdit with eight state-of-the-art video editing methods. The +outcomes demonstrate CCEdit's substantial superiority over all other methods. + +
+
+
+
+
+ + ♻ ☆ PrivImage: Differentially Private Synthetic Image Generation using + Diffusion Models with Semantic-Aware Pretraining USENIX Security 2024 + + +
+ Differential Privacy (DP) image data synthesis leverages the DP technique to
+generate synthetic data in place of sensitive data, allowing organizations to
+share and utilize synthetic images without privacy concerns. Previous methods
+incorporate the advanced techniques of generative models and pre-training on a
+public dataset to produce exceptional DP image data, but suffer from problems
+of unstable training and massive computational resource demands. This paper
+proposes a novel DP image synthesis method, termed PRIVIMAGE, which
+meticulously selects pre-training data, promoting the efficient creation of DP
+datasets with high fidelity and utility. PRIVIMAGE first establishes a semantic
+query function using a public dataset. Then, this function assists in querying
+the semantic distribution of the sensitive dataset, facilitating the selection
+of data from the public dataset with analogous semantics for pre-training.
+Finally, we pre-train an image generative model using the selected data and
+then fine-tune this model on the sensitive dataset using Differentially Private
+Stochastic Gradient Descent (DP-SGD). PRIVIMAGE allows us to train a lightly
+parameterized generative model, reducing the noise in the gradient during
+DP-SGD training and enhancing training stability. Extensive experiments
+demonstrate that PRIVIMAGE uses only 1% of the public dataset for pre-training
+and 7.6% of the parameters in the generative model compared to the
+state-of-the-art method, while achieving superior synthetic performance and
+conserving more computational resources. On average, PRIVIMAGE achieves 30.1%
+lower FID and 12.6% higher Classification Accuracy than the state-of-the-art
+method. The replication package and datasets can be accessed online.
+
+
+ comment: Accepted at USENIX Security 2024 +
+
+
+
+
+ + ♻ ☆ Knowledge NeRF: Few-shot Novel View Synthesis for Dynamic Articulated + Objects + + +
+ We present Knowledge NeRF to synthesize novel views for dynamic scenes.
+Reconstructing dynamic 3D scenes from few sparse views and rendering them from
+arbitrary perspectives is a challenging problem with applications in various
+domains. Previous dynamic NeRF methods learn the deformation of articulated
+objects from monocular videos. However, the quality of their reconstructed
+scenes is limited. To clearly reconstruct dynamic scenes, we propose a new
+framework that considers two frames at a time. We pretrain a NeRF model for an
+articulated object. When the articulated object moves, Knowledge NeRF learns to
+generate novel views at the new state by incorporating past knowledge from the
+pretrained NeRF model with minimal observations of the present state. We
+propose a projection module to adapt NeRF to dynamic scenes, learning the
+correspondence between the pretrained knowledge base and the current state.
+Experimental results demonstrate the effectiveness of our method in
+reconstructing dynamic 3D scenes with 5 input images in one state. Knowledge
+NeRF is a new pipeline and a promising solution for novel view synthesis of
+dynamic articulated objects. The data and implementation are publicly available
+at https://github.com/RussRobin/Knowledge_NeRF.
+
+
+
+
+
+ + ♻ ☆ Optimizing Illuminant Estimation in Dual-Exposure HDR Imaging + + +
+ High dynamic range (HDR) imaging involves capturing a series of frames of the +same scene, each with different exposure settings, to broaden the dynamic range +of light. This can be achieved through burst capturing or using staggered HDR +sensors that capture long and short exposures simultaneously in the camera +image signal processor (ISP). Within camera ISP pipeline, illuminant estimation +is a crucial step aiming to estimate the color of the global illuminant in the +scene. This estimation is used in camera ISP white-balance module to remove +undesirable color cast in the final image. Despite the multiple frames captured +in the HDR pipeline, conventional illuminant estimation methods often rely only +on a single frame of the scene. In this paper, we explore leveraging +information from frames captured with different exposure times. Specifically, +we introduce a simple feature extracted from dual-exposure images to guide +illuminant estimators, referred to as the dual-exposure feature (DEF). To +validate the efficiency of DEF, we employed two illuminant estimators using the +proposed DEF: 1) a multilayer perceptron network (MLP), referred to as +exposure-based MLP (EMLP), and 2) a modified version of the convolutional color +constancy (CCC) to integrate our DEF, that we call ECCC. Both EMLP and ECCC +achieve promising results, in some cases surpassing prior methods that require +hundreds of thousands or millions of parameters, with only a few hundred +parameters for EMLP and a few thousand parameters for ECCC. + +
+
+
+
+
+ + ♻ ☆ UWFormer: Underwater Image Enhancement via a Semi-Supervised Multi-Scale + Transformer IJCNN 2024 + + +
+ Underwater images often exhibit poor quality, distorted color balance and low +contrast due to the complex and intricate interplay of light, water, and +objects. Despite the significant contributions of previous underwater +enhancement techniques, there exist several problems that demand further +improvement: (i) The current deep learning methods rely on Convolutional Neural +Networks (CNNs) that lack the multi-scale enhancement, and global perception +field is also limited. (ii) The scarcity of paired real-world underwater +datasets poses a significant challenge, and the utilization of synthetic image +pairs could lead to overfitting. To address the aforementioned problems, this +paper introduces a Multi-scale Transformer-based Network called UWFormer for +enhancing images at multiple frequencies via semi-supervised learning, in which +we propose a Nonlinear Frequency-aware Attention mechanism and a Multi-Scale +Fusion Feed-forward Network for low-frequency enhancement. Besides, we +introduce a special underwater semi-supervised training strategy, where we +propose a Subaqueous Perceptual Loss function to generate reliable pseudo +labels. Experiments using full-reference and non-reference underwater +benchmarks demonstrate that our method outperforms state-of-the-art methods in +terms of both quantity and visual quality. + +
+
+ comment: Accepted by IJCNN 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 81 + +
+
+
+ + ☆ Collaborative Feedback Discriminative Propagation for Video + Super-Resolution + + +
+ The key success of existing video super-resolution (VSR) methods stems mainly
+from exploring spatial and temporal information, which is usually achieved by a
+recurrent propagation module with an alignment module. However, inaccurate
+alignment usually leads to aligned features with significant artifacts, which
+will be accumulated during propagation and thus affect video restoration.
+Moreover, propagation modules only propagate the same-timestep features forward
+or backward, which may fail in cases of complex motion or occlusion, limiting
+their performance for high-quality frame restoration. To address these issues,
+we propose a collaborative feedback discriminative (CFD) method to correct
+inaccurate aligned features and model long-range spatial and temporal
+information for better video reconstruction. In detail, we develop a
+discriminative alignment correction (DAC) method to adaptively explore
+information and reduce the influence of the artifacts caused by inaccurate
+alignment. Then, we propose a collaborative feedback propagation (CFP) module
+that employs feedback and gating mechanisms to better explore spatial and
+temporal information of different timestep features from forward and backward
+propagation simultaneously. Finally, we embed the proposed DAC and CFP into
+commonly used VSR networks to verify the effectiveness of our method.
+Quantitative and qualitative experiments on several benchmarks demonstrate that
+our method can improve the performance of existing VSR models while maintaining
+a lower model complexity. The source code and pre-trained models will be
+available at https://github.com/House-Leo/CFDVSR.
+
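+ The gating idea in the CFP module can be pictured with a generic gated fusion
+of forward- and backward-propagated features. This is a simplified sketch under
+an assumed channel size, not the authors' implementation.
+
+import torch
+import torch.nn as nn
+
+class GatedFusion(nn.Module):
+    """Fuse forward/backward features with a learned per-pixel gate."""
+    def __init__(self, channels: int = 64):
+        super().__init__()
+        self.gate = nn.Sequential(
+            nn.Conv2d(2 * channels, channels, kernel_size=3, padding=1),
+            nn.Sigmoid(),
+        )
+
+    def forward(self, feat_forward: torch.Tensor, feat_backward: torch.Tensor):
+        g = self.gate(torch.cat([feat_forward, feat_backward], dim=1))
+        return g * feat_forward + (1.0 - g) * feat_backward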
+
+ comment: Project website: https://github.com/House-Leo/CFDVSR +
+
+
+
+
+ + ☆ ProtoAL: Interpretable Deep Active Learning with prototypes for medical + imaging + + +
+ The adoption of Deep Learning algorithms in the medical imaging field is a prominent area of research, with high potential for advancing AI-based Computer-aided diagnosis (AI-CAD) solutions. However, current solutions face challenges due to a lack of interpretability features and high data demands, prompting recent efforts to address these issues. In this study, we propose the ProtoAL method, where we integrate an interpretable DL model into the Deep Active Learning (DAL) framework. This approach aims to address both challenges by focusing on the medical imaging context and utilizing an inherently interpretable model based on prototypes. We evaluated ProtoAL on the Messidor dataset, achieving an area under the precision-recall curve of 0.79 while utilizing only 76.54\% of the available labeled data. These capabilities can enhance the practical usability of DL models in the medical field, providing a means of trust calibration for domain experts and a suitable solution for learning under the data scarcity often found in this context.
+
+
+
+
+ + ☆ Towards Generalized Entropic Sparsification for Convolutional Neural + Networks + + +
+ Convolutional neural networks (CNNs) are reported to be overparametrized. The search for an optimal (minimal) and sufficient architecture is an NP-hard problem, as the hyperparameter space of possible network configurations is vast. Here, we introduce a layer-by-layer data-driven pruning method based on a computationally scalable entropic relaxation of the pruning problem. The sparse subnetwork is found from the pre-trained (full) CNN using network entropy minimization as a sparsity constraint. This allows deploying a numerically scalable algorithm with a sublinear scaling cost. The method is validated on several benchmarks (architectures): (i) MNIST (LeNet) with sparsity of 55%-84% and loss in accuracy of 0.1%-0.5%, and (ii) CIFAR-10 (VGG-16, ResNet18) with sparsity of 73%-89% and loss in accuracy of 0.1%-0.5%.
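As a rough, hedged illustration of layer-by-layer, data-driven pruning (not the paper's entropic relaxation), the sketch below scores the filters of one convolutional layer by the Shannon entropy of their activation histograms on a calibration batch and zeroes out the lowest-scoring ones. The histogram binning and keep ratio are arbitrary assumptions.

```python
# Illustrative only: entropy-based structured pruning of one conv layer.
import torch
import torch.nn as nn

def channel_entropy(activations, bins=32):
    # activations: (N, C, H, W) responses of one layer on a calibration batch
    activations = activations.detach()
    scores = []
    for ch in range(activations.shape[1]):
        hist = torch.histc(activations[:, ch].abs().flatten(), bins=bins)
        p = hist / hist.sum().clamp_min(1e-12)
        scores.append(-(p * (p + 1e-12).log()).sum())
    return torch.stack(scores)

def prune_layer(conv, activations, keep_ratio=0.25):
    scores = channel_entropy(activations)
    k = max(1, int(keep_ratio * conv.out_channels))
    keep = scores.topk(k).indices
    mask = torch.zeros(conv.out_channels)
    mask[keep] = 1.0
    with torch.no_grad():          # zero out pruned filters in place
        conv.weight *= mask.view(-1, 1, 1, 1)
        if conv.bias is not None:
            conv.bias *= mask
    return mask

conv = nn.Conv2d(3, 16, 3, padding=1)
x = torch.randn(8, 3, 32, 32)
with torch.no_grad():
    acts = conv(x)
mask = prune_layer(conv, acts)
print(f"kept {int(mask.sum())}/{conv.out_channels} filters")
```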
+
+
+
+
+ + ☆ On Exploring PDE Modeling for Point Cloud Video Representation Learning + + +
+ Point cloud video representation learning is challenging due to complex +structures and unordered spatial arrangement. Traditional methods struggle with +frame-to-frame correlations and point-wise correspondence tracking. Recently, +partial differential equations (PDE) have provided a new perspective in +uniformly solving spatial-temporal data information within certain constraints. +While tracking tangible point correspondence remains challenging, we propose to +formalize point cloud video representation learning as a PDE-solving problem. +Inspired by fluid analysis, where PDEs are used to solve the deformation of +spatial shape over time, we employ PDE to solve the variations of spatial +points affected by temporal information. By modeling spatial-temporal +correlations, we aim to regularize spatial variations with temporal features, +thereby enhancing representation learning in point cloud videos. We introduce +Motion PointNet composed of a PointNet-like encoder and a PDE-solving module. +Initially, we construct a lightweight yet effective encoder to model an initial +state of the spatial variations. Subsequently, we develop our PDE-solving +module in a parameterized latent space, tailored to address the spatio-temporal +correlations inherent in point cloud video. The process of solving PDE is +guided and refined by a contrastive learning structure, which is pivotal in +reshaping the feature distribution, thereby optimizing the feature +representation within point cloud video data. Remarkably, our Motion PointNet +achieves an impressive accuracy of 97.52% on the MSRAction-3D dataset, +surpassing the current state-of-the-art in all aspects while consuming minimal +resources (only 0.72M parameters and 0.82G FLOPs). + +
+
+
+
+
+ + ☆ Interpretable Multimodal Learning for Cardiovascular Hemodynamics + Assessment + + +
+ Pulmonary Arterial Wedge Pressure (PAWP) is an essential cardiovascular +hemodynamics marker to detect heart failure. In clinical practice, Right Heart +Catheterization is considered a gold standard for assessing cardiac +hemodynamics while non-invasive methods are often needed to screen high-risk +patients from a large population. In this paper, we propose a multimodal +learning pipeline to predict PAWP marker. We utilize complementary information +from Cardiac Magnetic Resonance Imaging (CMR) scans (short-axis and +four-chamber) and Electronic Health Records (EHRs). We extract spatio-temporal +features from CMR scans using tensor-based learning. We propose a graph +attention network to select important EHR features for prediction, where we +model subjects as graph nodes and feature relationships as graph edges using +the attention mechanism. We design four feature fusion strategies: early, +intermediate, late, and hybrid fusion. With a linear classifier and linear +fusion strategies, our pipeline is interpretable. We validate our pipeline on a +large dataset of $2,641$ subjects from our ASPIRE registry. The comparative +study against state-of-the-art methods confirms the superiority of our +pipeline. The decision curve analysis further validates that our pipeline can +be applied to screen a large population. The code is available at +https://github.com/prasunc/hemodynamics. + +
+
+
+
+
+ + ☆ OmniColor: A Global Camera Pose Optimization Approach of LiDAR-360Camera + Fusion for Colorizing Point Clouds + + +
+ A colored point cloud, as a simple and efficient 3D representation, has many advantages in various fields, including robotic navigation and scene reconstruction. This representation is now commonly used in 3D reconstruction tasks relying on cameras and LiDARs. However, many existing frameworks fuse data from these two types of sensors poorly, leading to unsatisfactory mapping results, mainly due to inaccurate camera poses. This paper presents OmniColor, a novel and efficient algorithm to colorize point clouds using an independent 360-degree camera. Given a LiDAR-based point cloud and a sequence of panorama images with initial coarse camera poses, our objective is to jointly optimize the poses of all frames for mapping images onto geometric reconstructions. Our pipeline works in an off-the-shelf manner that does not require any feature extraction or matching process. Instead, we find optimal poses by directly maximizing the photometric consistency of LiDAR maps. In experiments, we show that our method can overcome the severe visual distortion of omnidirectional images and greatly benefit from the wide field of view (FOV) of 360-degree cameras to reconstruct various scenarios with accuracy and stability. The code will be released at https://github.com/liubonan123/OmniColor/.
+
+ comment: 2024 IEEE International Conference on Robotics and Automation +
+
+
+
+
+ + ☆ Z-Splat: Z-Axis Gaussian Splatting for Camera-Sonar Fusion + + +
+ Differentiable 3D-Gaussian splatting (GS) is emerging as a prominent +technique in computer vision and graphics for reconstructing 3D scenes. GS +represents a scene as a set of 3D Gaussians with varying opacities and employs +a computationally efficient splatting operation along with analytical +derivatives to compute the 3D Gaussian parameters given scene images captured +from various viewpoints. Unfortunately, capturing surround view ($360^{\circ}$ +viewpoint) images is impossible or impractical in many real-world imaging +scenarios, including underwater imaging, rooms inside a building, and +autonomous navigation. In these restricted baseline imaging scenarios, the GS +algorithm suffers from a well-known 'missing cone' problem, which results in +poor reconstruction along the depth axis. In this manuscript, we demonstrate +that using transient data (from sonars) allows us to address the missing cone +problem by sampling high-frequency data along the depth axis. We extend the +Gaussian splatting algorithms for two commonly used sonars and propose fusion +algorithms that simultaneously utilize RGB camera data and sonar data. Through +simulations, emulations, and hardware experiments across various imaging +scenarios, we show that the proposed fusion algorithms lead to significantly +better novel view synthesis (5 dB improvement in PSNR) and 3D geometry +reconstruction (60% lower Chamfer distance). + +
+
+
+
+
+ + ☆ Predictive Modeling for Breast Cancer Classification in the Context of + Bangladeshi Patients: A Supervised Machine Learning Approach with Explainable + AI + + +
+ Breast cancer has rapidly increased in prevalence in recent years, making it one of the leading causes of mortality worldwide. Among all cancers, it is by far the most common. Diagnosing this illness manually requires significant time and expertise. Since detecting breast cancer is a time-consuming process, preventing its further spread can be aided by creating machine-based forecasts. Machine learning and Explainable AI are crucial in classification, as they not only provide accurate predictions but also offer insights into how the model arrives at its decisions, aiding the understanding and trustworthiness of the classification results. In this study, we evaluate and compare the classification accuracy, precision, recall, and F-1 scores of five different machine learning methods using a primary dataset (500 patients from Dhaka Medical College Hospital). Five supervised machine learning techniques, namely decision tree, random forest, logistic regression, naive Bayes, and XGBoost, have been used to achieve optimal results on our dataset. Additionally, this study applied SHAP analysis to the XGBoost model to interpret the model's predictions and understand the impact of each feature on the model's output. We compared how accurately the different algorithms classified the data and contrasted our results with other literature in this field. After the final evaluation, this study found that XGBoost achieved the best model accuracy, at 97%.
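A hedged sketch of the kind of pipeline described, training an XGBoost classifier and explaining it with SHAP. The data below is synthetic, not the 500-patient hospital dataset, and the hyperparameters are placeholders; it assumes the `xgboost`, `shap`, and `scikit-learn` packages are installed.

```python
# Train an XGBoost classifier and inspect per-feature SHAP contributions.
import numpy as np
import shap
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 10))                  # 500 patients, 10 features (synthetic)
y = (X[:, 0] + 0.5 * X[:, 3] > 0).astype(int)   # synthetic benign/malignant label

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
model = xgb.XGBClassifier(n_estimators=200, max_depth=4, eval_metric="logloss")
model.fit(X_tr, y_tr)
print("accuracy:", accuracy_score(y_te, model.predict(X_te)))

# SHAP values quantify each feature's contribution to each prediction.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_te)
print("mean |SHAP| per feature:", np.abs(shap_values).mean(axis=0))
```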
+
+ comment: Accepted for the Scientific Reports (Nature) journal. 32 pages, 12 + figures +
+
+
+
+
+ + ☆ Salient Sparse Visual Odometry With Pose-Only Supervision + + +
+ Visual Odometry (VO) is vital for the navigation of autonomous systems, +providing accurate position and orientation estimates at reasonable costs. +While traditional VO methods excel in some conditions, they struggle with +challenges like variable lighting and motion blur. Deep learning-based VO, +though more adaptable, can face generalization problems in new environments. +Addressing these drawbacks, this paper presents a novel hybrid visual odometry +(VO) framework that leverages pose-only supervision, offering a balanced +solution between robustness and the need for extensive labeling. We propose two +cost-effective and innovative designs: a self-supervised homographic +pre-training for enhancing optical flow learning from pose-only labels and a +random patch-based salient point detection strategy for more accurate optical +flow patch extraction. These designs eliminate the need for dense optical flow +labels for training and significantly improve the generalization capability of +the system in diverse and challenging environments. Our pose-only supervised +method achieves competitive performance on standard datasets and greater +robustness and generalization ability in extreme and unseen scenarios, even +compared to dense optical flow-supervised state-of-the-art methods. + +
+
+ comment: Accepted by IEEE Robotics and Automation Letters +
+
+
+
+
+ + ☆ Neural-ABC: Neural Parametric Models for Articulated Body with Clothes + + +
+ In this paper, we introduce Neural-ABC, a novel parametric model based on +neural implicit functions that can represent clothed human bodies with +disentangled latent spaces for identity, clothing, shape, and pose. Traditional +mesh-based representations struggle to represent articulated bodies with +clothes due to the diversity of human body shapes and clothing styles, as well +as the complexity of poses. Our proposed model provides a unified framework for +parametric modeling, which can represent the identity, clothing, shape and pose +of the clothed human body. Our proposed approach utilizes the power of neural +implicit functions as the underlying representation and integrates +well-designed structures to meet the necessary requirements. Specifically, we +represent the underlying body as a signed distance function and clothing as an +unsigned distance function, and they can be uniformly represented as unsigned +distance fields. Different types of clothing do not require predefined +topological structures or classifications, and can follow changes in the +underlying body to fit the body. Additionally, we construct poses using a +controllable articulated structure. The model is trained on both open and newly +constructed datasets, and our decoupling strategy is carefully designed to +ensure optimal performance. Our model excels at disentangling clothing and +identity in different shape and poses while preserving the style of the +clothing. We demonstrate that Neural-ABC fits new observations of different +types of clothing. Compared to other state-of-the-art parametric models, +Neural-ABC demonstrates powerful advantages in the reconstruction of clothed +human bodies, as evidenced by fitting raw scans, depth maps and images. We show +that the attributes of the fitted results can be further edited by adjusting +their identities, clothing, shape and pose codes. + +
+
+ comment: Accepted by IEEE Transactions on Visualization and Computer Graphics. + Project page: https://ustc3dv.github.io/NeuralABC/ +
+
+
+
+
+ + ☆ Adaptive Intra-Class Variation Contrastive Learning for Unsupervised + Person Re-Identification + + +
+ The memory dictionary-based contrastive learning method has achieved remarkable results in the field of unsupervised person Re-ID. However, the method of updating memory based on all samples does not fully utilize the hardest samples to improve the generalization ability of the model, while methods based on hardest-sample mining inevitably introduce false-positive samples that are incorrectly clustered in the early stages of training. Clustering-based methods also usually discard a significant number of outliers, leading to the loss of valuable information. To address these issues, we propose an adaptive intra-class variation contrastive learning algorithm for unsupervised Re-ID, called AdaInCV. The algorithm quantitatively evaluates the learning ability of the model for each class by considering the intra-class variation after clustering, which helps in selecting appropriate samples during training. More specifically, two new strategies are proposed: Adaptive Sample Mining (AdaSaM) and Adaptive Outlier Filter (AdaOF). The first gradually creates more reliable clusters to dynamically refine the memory, while the second can identify and filter out valuable outliers as negative samples.
+
+
+
+
+ + ☆ Focused Active Learning for Histopathological Image Classification + + +
+ Active Learning (AL) has the potential to solve a major problem of digital +pathology: the efficient acquisition of labeled data for machine learning +algorithms. However, existing AL methods often struggle in realistic settings +with artifacts, ambiguities, and class imbalances, as commonly seen in the +medical field. The lack of precise uncertainty estimations leads to the +acquisition of images with a low informative value. To address these +challenges, we propose Focused Active Learning (FocAL), which combines a +Bayesian Neural Network with Out-of-Distribution detection to estimate +different uncertainties for the acquisition function. Specifically, the +weighted epistemic uncertainty accounts for the class imbalance, aleatoric +uncertainty for ambiguous images, and an OoD score for artifacts. We perform +extensive experiments to validate our method on MNIST and the real-world Panda +dataset for the classification of prostate cancer. The results confirm that +other AL methods are 'distracted' by ambiguities and artifacts which harm the +performance. FocAL effectively focuses on the most informative images, avoiding +ambiguities and artifacts during acquisition. For both experiments, FocAL +outperforms existing AL approaches, reaching a Cohen's kappa of 0.764 with only +0.69% of the labeled Panda data. + +
+
+
+
+
+ + ☆ Music Recommendation Based on Facial Emotion Recognition + + +
+ Introduction: Music provides an incredible avenue for individuals to express their thoughts and emotions, while also serving as a delightful mode of entertainment for enthusiasts and music lovers. Objectives: This paper presents a comprehensive approach to enhancing the user experience through the integration of emotion recognition, music recommendation, and explainable AI using Grad-CAM. Methods: The proposed methodology utilizes a ResNet50 model trained on the Facial Expression Recognition (FER) dataset, consisting of real images of individuals expressing various emotions. Results: The system achieves an accuracy of 82% in emotion classification. By leveraging Grad-CAM, the model provides explanations for its predictions, allowing users to understand the reasoning behind the system's recommendations. The model is trained on both the FER dataset and a real user dataset, which include labelled facial expressions and real images of individuals expressing various emotions. The training process involves pre-processing the input images, extracting features through convolutional layers, reasoning with dense layers, and generating emotion predictions through the output layer. Conclusion: The proposed methodology, leveraging the ResNet50 model with ROI-based analysis and explainable AI techniques, offers a robust and interpretable solution for facial emotion detection.
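Since the explanations rest on Grad-CAM over a ResNet50, the snippet below gives a generic Grad-CAM sketch with a 7-class emotion head. The class count, random input, and hook placement on `layer4` are assumptions for illustration; this is not the authors' code.

```python
# Minimal Grad-CAM sketch for a ResNet50 emotion classifier (assumed 7 classes).
import torch
import torch.nn.functional as F
from torchvision.models import resnet50

model = resnet50(weights=None)
model.fc = torch.nn.Linear(model.fc.in_features, 7)  # 7 basic emotions (assumed)
model.eval()

feats, grads = {}, {}
def fwd_hook(_, __, output):
    feats["v"] = output
    output.register_hook(lambda g: grads.__setitem__("v", g))  # capture gradient
model.layer4.register_forward_hook(fwd_hook)

img = torch.randn(1, 3, 224, 224)          # stand-in for a face crop
logits = model(img)
logits[0, logits.argmax()].backward()       # backprop the top-class score

weights = grads["v"].mean(dim=(2, 3), keepdim=True)        # channel importance
cam = F.relu((weights * feats["v"]).sum(dim=1, keepdim=True))
cam = F.interpolate(cam, size=img.shape[-2:], mode="bilinear", align_corners=False)
cam = (cam - cam.min()) / (cam.max() - cam.min() + 1e-8)    # heatmap in [0, 1]
print(cam.shape)  # (1, 1, 224, 224) saliency over the face image
```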
+
+
+
+
+ + ☆ HawkDrive: A Transformer-driven Visual Perception System for Autonomous + Driving in Night Scene + + +
+ Many established vision perception systems for autonomous driving scenarios +ignore the influence of light conditions, one of the key elements for driving +safety. To address this problem, we present HawkDrive, a novel perception +system with hardware and software solutions. Hardware that utilizes stereo +vision perception, which has been demonstrated to be a more reliable way of +estimating depth information than monocular vision, is partnered with the edge +computing device Nvidia Jetson Xavier AGX. Our software for low light +enhancement, depth estimation, and semantic segmentation tasks, is a +transformer-based neural network. Our software stack, which enables fast +inference and noise reduction, is packaged into system modules in Robot +Operating System 2 (ROS2). Our experimental results have shown that the +proposed end-to-end system is effective in improving the depth estimation and +semantic segmentation performance. Our dataset and codes will be released at +https://github.com/ZionGo6/HawkDrive. + +
+
+ comment: Accepted by IEEE IV 2024 +
+
+
+
+
+ + ☆ InitNO: Boosting Text-to-Image Diffusion Models via Initial Noise + Optimization CVPR 2024 + + +
+ Recent strides in the development of diffusion models, exemplified by +advancements such as Stable Diffusion, have underscored their remarkable +prowess in generating visually compelling images. However, the imperative of +achieving a seamless alignment between the generated image and the provided +prompt persists as a formidable challenge. This paper traces the root of these +difficulties to invalid initial noise, and proposes a solution in the form of +Initial Noise Optimization (InitNO), a paradigm that refines this noise. +Considering text prompts, not all random noises are effective in synthesizing +semantically-faithful images. We design the cross-attention response score and +the self-attention conflict score to evaluate the initial noise, bifurcating +the initial latent space into valid and invalid sectors. A strategically +crafted noise optimization pipeline is developed to guide the initial noise +towards valid regions. Our method, validated through rigorous experimentation, +shows a commendable proficiency in generating images in strict accordance with +text prompts. Our code is available at https://github.com/xiefan-guo/initno. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Structured Gradient-based Interpretations via Norm-Regularized + Adversarial Training CVPR 2024 + + +
+ Gradient-based saliency maps have been widely used to explain the decisions +of deep neural network classifiers. However, standard gradient-based +interpretation maps, including the simple gradient and integrated gradient +algorithms, often lack desired structures such as sparsity and connectedness in +their application to real-world computer vision models. A frequently used +approach to inducing sparsity structures into gradient-based saliency maps is +to alter the simple gradient scheme using sparsification or norm-based +regularization. A drawback with such post-processing methods is their +frequently-observed significant loss in fidelity to the original simple +gradient map. In this work, we propose to apply adversarial training as an +in-processing scheme to train neural networks with structured simple gradient +maps. We show a duality relation between the regularized norms of the +adversarial perturbations and gradient-based maps, based on which we design +adversarial training loss functions promoting sparsity and group-sparsity +properties in simple gradient maps. We present several numerical results to +show the influence of our proposed norm-based adversarial training methods on +the standard gradient-based maps of standard neural network architectures on +benchmark image datasets. + +
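To make the two ingredients concrete, here is a minimal sketch that computes a simple-gradient saliency map and runs one FGSM-style adversarial training step with an added L1 penalty on the input gradient to encourage sparse maps. This is a simplified stand-in for the paper's duality-based losses; the toy model, `eps`, and `lam` values are assumptions.

```python
# Simple-gradient saliency plus one norm-regularized adversarial training step.
import torch
import torch.nn.functional as F

def simple_gradient_map(model, x, y):
    x = x.clone().requires_grad_(True)
    loss = F.cross_entropy(model(x), y)
    grad, = torch.autograd.grad(loss, x)
    return grad.abs().max(dim=1).values          # per-pixel saliency

def adv_train_step(model, opt, x, y, eps=4 / 255, lam=0.01):
    x_req = x.clone().requires_grad_(True)
    loss = F.cross_entropy(model(x_req), y)
    g, = torch.autograd.grad(loss, x_req, create_graph=True)
    x_adv = (x + eps * g.sign()).clamp(0, 1)     # one-step perturbation
    # L1 penalty on the input gradient promotes sparse saliency maps.
    total = F.cross_entropy(model(x_adv), y) + lam * g.abs().sum(dim=(1, 2, 3)).mean()
    opt.zero_grad(); total.backward(); opt.step()
    return total.item()

model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 10))
opt = torch.optim.SGD(model.parameters(), lr=0.01)
x, y = torch.rand(8, 3, 32, 32), torch.randint(0, 10, (8,))
adv_train_step(model, opt, x, y)
print(simple_gradient_map(model, x, y).shape)   # (8, 32, 32)
```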
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Constrained 6-DoF Grasp Generation on Complex Shapes for Improved + Dual-Arm Manipulation + + +
+ Efficiently generating grasp poses tailored to specific regions of an object +is vital for various robotic manipulation tasks, especially in a dual-arm +setup. This scenario presents a significant challenge due to the complex +geometries involved, requiring a deep understanding of the local geometry to +generate grasps efficiently on the specified constrained regions. Existing +methods only explore settings involving table-top/small objects and require +augmented datasets to train, limiting their performance on complex objects. We +propose CGDF: Constrained Grasp Diffusion Fields, a diffusion-based grasp +generative model that generalizes to objects with arbitrary geometries, as well +as generates dense grasps on the target regions. CGDF uses a part-guided +diffusion approach that enables it to get high sample efficiency in constrained +grasping without explicitly training on massive constraint-augmented datasets. +We provide qualitative and quantitative comparisons using analytical metrics +and in simulation, in both unconstrained and constrained settings to show that +our method can generalize to generate stable grasps on complex objects, +especially useful for dual-arm manipulation settings, while existing methods +struggle to do so. + +
+
+ comment: Project Page: https://constrained-grasp-diffusion.github.io/ +
+
+
+
+
+ + ☆ A Deep Look Into -- Automated Lung X-Ray Abnormality Detection System + + +
+ Introduction: The Automated Lung X-Ray Abnormality Detection System is an application that distinguishes normal X-ray images from infected ones and highlights the regions considered for the prediction. The recent pandemic created the need for non-conventional, faster methods of detecting diseases, a purpose that X-rays can serve. Objectives: In the current situation, any infectious viral disease is a potential pandemic, so there is a need for a cheap and early detection system. Methods: This research helps ease the work of experts in performing further analysis. The accuracy of three preexisting models, DenseNet, MobileNet, and VGG16, was high, but the models over-fitted, primarily due to the black-and-white images. Results: This led to building a new method, V-BreathNet, which achieved more than 96% accuracy. Conclusion: Not all state-of-the-art CNN models can be used on B/W images.
+
+
+
+
+ + ☆ DifFUSER: Diffusion Model for Robust Multi-Sensor Fusion in 3D Object + Detection and BEV Segmentation + + +
+ Diffusion models have recently gained prominence as powerful deep generative +models, demonstrating unmatched performance across various domains. However, +their potential in multi-sensor fusion remains largely unexplored. In this +work, we introduce DifFUSER, a novel approach that leverages diffusion models +for multi-modal fusion in 3D object detection and BEV map segmentation. +Benefiting from the inherent denoising property of diffusion, DifFUSER is able +to refine or even synthesize sensor features in case of sensor malfunction, +thereby improving the quality of the fused output. In terms of architecture, +our DifFUSER blocks are chained together in a hierarchical BiFPN fashion, +termed cMini-BiFPN, offering an alternative architecture for latent diffusion. +We further introduce a Gated Self-conditioned Modulated (GSM) latent diffusion +module together with a Progressive Sensor Dropout Training (PSDT) paradigm, +designed to add stronger conditioning to the diffusion process and robustness +to sensor failures. Our extensive evaluations on the Nuscenes dataset reveal +that DifFUSER not only achieves state-of-the-art performance with a 69.1% mIOU +in BEV map segmentation tasks but also competes effectively with leading +transformer-based fusion techniques in 3D object detection. + +
+
+ comment: 23 pages +
+
+
+
+
+ + ☆ Self-Training Large Language Models for Improved Visual Program + Synthesis With Visual Reinforcement CVPR 2024 + + +
+ Visual program synthesis is a promising approach to exploit the reasoning +abilities of large language models for compositional computer vision tasks. +Previous work has used few-shot prompting with frozen LLMs to synthesize visual +programs. Training an LLM to write better visual programs is an attractive +prospect, but it is unclear how to accomplish this. No dataset of visual +programs for training exists, and acquisition of a visual program dataset +cannot be easily crowdsourced due to the need for expert annotators. To get +around the lack of direct supervision, we explore improving the program +synthesis abilities of an LLM using feedback from interactive experience. We +propose a method where we exploit existing annotations for a vision-language +task to improvise a coarse reward signal for that task, treat the LLM as a +policy, and apply reinforced self-training to improve the visual program +synthesis ability of the LLM for that task. We describe a series of experiments +on object detection, compositional visual question answering, and image-text +retrieval, and show that in each case, the self-trained LLM outperforms or +performs on par with few-shot frozen LLMs that are an order of magnitude +larger. Website: https://zaidkhan.me/ViReP + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Bridging the Gap Between End-to-End and Two-Step Text Spotting CVPR2024 + + +
+ Modularity plays a crucial role in the development and maintenance of complex +systems. While end-to-end text spotting efficiently mitigates the issues of +error accumulation and sub-optimal performance seen in traditional two-step +methodologies, the two-step methods continue to be favored in many competitions +and practical settings due to their superior modularity. In this paper, we +introduce Bridging Text Spotting, a novel approach that resolves the error +accumulation and suboptimal performance issues in two-step methods while +retaining modularity. To achieve this, we adopt a well-trained detector and +recognizer that are developed and trained independently and then lock their +parameters to preserve their already acquired capabilities. Subsequently, we +introduce a Bridge that connects the locked detector and recognizer through a +zero-initialized neural network. This zero-initialized neural network, +initialized with weights set to zeros, ensures seamless integration of the +large receptive field features in detection into the locked recognizer. +Furthermore, since the fixed detector and recognizer cannot naturally acquire +end-to-end optimization features, we adopt the Adapter to facilitate their +efficient learning of these features. We demonstrate the effectiveness of the +proposed method through extensive experiments: Connecting the latest detector +and recognizer through Bridging Text Spotting, we achieved an accuracy of 83.3% +on Total-Text, 69.8% on CTW1500, and 89.5% on ICDAR 2015. The code is available +at https://github.com/mxin262/Bridging-Text-Spotting. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Do We Really Need a Complex Agent System? Distill Embodied Agent into a + Single Model + + +
+ With the power of large language models (LLMs), open-ended embodied agents can flexibly understand human instructions, generate interpretable guidance strategies, and output executable actions. Nowadays, Multi-modal Language Models~(MLMs) integrate multi-modal signals into LLMs, further bringing richer perception to embodied agents and allowing them to handle world-understanding tasks more delicately. However, existing works: 1) operate agents independently, each containing multiple LLMs from perception to action, resulting in gaps between complex tasks and execution; 2) train MLMs on static data, struggling with dynamics in open-ended scenarios; and 3) input prior knowledge directly as prompts, suppressing application flexibility. We propose STEVE-2, a hierarchical knowledge distillation framework for open-ended embodied tasks, characterized by 1) a hierarchical system for multi-granular task division, 2) a mirrored distillation method for parallel simulation data, and 3) an extra expert model for bringing additional knowledge into parallel simulation. After distillation, embodied agents can complete complex, open-ended tasks without additional expert guidance, utilizing the performance and knowledge of a versatile MLM. Extensive evaluations on navigation and creation tasks highlight the superior performance of STEVE-2 in open-ended tasks, with $1.4\times$ to $7.3\times$ gains in performance.
+
+ comment: arXiv admin note: text overlap with arXiv:2403.08282 +
+
+
+
+
+ + ☆ Empowering Image Recovery: A Multi-Attention Approach + + 
+ We propose Diverse Restormer (DART), a novel image restoration method that +effectively integrates information from various sources (long sequences, local +and global regions, feature dimensions, and positional dimensions) to address +restoration challenges. While Transformer models have demonstrated excellent +performance in image restoration due to their self-attention mechanism, they +face limitations in complex scenarios. Leveraging recent advancements in +Transformers and various attention mechanisms, our method utilizes customized +attention mechanisms to enhance overall performance. DART, our novel network +architecture, employs windowed attention to mimic the selective focusing +mechanism of human eyes. By dynamically adjusting receptive fields, it +optimally captures the fundamental features crucial for image resolution +reconstruction. Efficiency and performance balance are achieved through the +LongIR attention mechanism for long sequence image restoration. Integration of +attention mechanisms across feature and positional dimensions further enhances +the recovery of fine details. Evaluation across five restoration tasks +consistently positions DART at the forefront. Upon acceptance, we commit to +providing publicly accessible code and models to ensure reproducibility and +facilitate further research. + +
+
+ comment: 12 pages, 10 figures, 12 tables +
+
+
+
+
+ + ☆ Panoptic Perception: A Novel Task and Fine-grained Dataset for Universal + Remote Sensing Image Interpretation + + +
+ Current remote-sensing interpretation models often focus on a single task such as detection, segmentation, or captioning. However, such task-specific models cannot achieve comprehensive multi-level interpretation of images. The field also lacks support for multi-task joint interpretation datasets. In this paper, we propose Panoptic Perception, a novel task and a new fine-grained dataset (FineGrip) to achieve a more thorough and universal interpretation of RSIs. The new task 1) integrates pixel-level, instance-level, and image-level information for universal image perception, 2) captures image information from coarse to fine granularity, achieving deeper scene understanding and description, and 3) enables various independent tasks to complement and enhance each other through multi-task learning. By emphasizing multi-task interactions and the consistency of perception results, this task enables the simultaneous processing of fine-grained foreground instance segmentation, background semantic segmentation, and global fine-grained image captioning. Concretely, the FineGrip dataset includes 2,649 remote sensing images, 12,054 fine-grained instance segmentation masks belonging to 20 foreground thing categories, 7,599 background semantic masks for 5 stuff classes, and 13,245 captioning sentences. Furthermore, we propose a joint optimization-based panoptic perception model. Experimental results on FineGrip demonstrate the feasibility of the panoptic perception task and the beneficial effect of multi-task joint optimization on individual tasks. The dataset will be publicly available.
+
+ comment: Undergoing Review +
+
+
+
+
+ + ☆ PIE: Physics-inspired Low-light Enhancement + + +
+ In this paper, we propose a physics-inspired contrastive learning paradigm +for low-light enhancement, called PIE. PIE primarily addresses three issues: +(i) To resolve the problem of existing learning-based methods often training a +LLE model with strict pixel-correspondence image pairs, we eliminate the need +for pixel-correspondence paired training data and instead train with unpaired +images. (ii) To address the disregard for negative samples and the inadequacy +of their generation in existing methods, we incorporate physics-inspired +contrastive learning for LLE and design the Bag of Curves (BoC) method to +generate more reasonable negative samples that closely adhere to the underlying +physical imaging principle. (iii) To overcome the reliance on semantic ground +truths in existing methods, we propose an unsupervised regional segmentation +module, ensuring regional brightness consistency while eliminating the +dependency on semantic ground truths. Overall, the proposed PIE can effectively +learn from unpaired positive/negative samples and smoothly realize non-semantic +regional enhancement, which is clearly different from existing LLE efforts. +Besides the novel architecture of PIE, we explore the gain of PIE on downstream +tasks such as semantic segmentation and face detection. Training on readily +available open data and extensive experiments demonstrate that our method +surpasses the state-of-the-art LLE models over six independent cross-scenes +datasets. PIE runs fast with reasonable GFLOPs in test time, making it easy to +use on mobile devices. + +
+
+
+
+
+ + ☆ D$^3$: Scaling Up Deepfake Detection by Learning from Discrepancy + + +
+ The boom of Generative AI brings opportunities entangled with risks and +concerns. In this work, we seek a step toward a universal deepfake detection +system with better generalization and robustness, to accommodate the +responsible deployment of diverse image generative models. We do so by first +scaling up the existing detection task setup from the one-generator to +multiple-generators in training, during which we disclose two challenges +presented in prior methodological designs. Specifically, we reveal that the +current methods tailored for training on one specific generator either struggle +to learn comprehensive artifacts from multiple generators or tend to sacrifice +their ability to identify fake images from seen generators (i.e., In-Domain +performance) to exchange the generalization for unseen generators (i.e., +Out-Of-Domain performance). To tackle the above challenges, we propose our +Discrepancy Deepfake Detector (D$^3$) framework, whose core idea is to learn +the universal artifacts from multiple generators by introducing a parallel +network branch that takes a distorted image as extra discrepancy signal to +supplement its original counterpart. Extensive scaled-up experiments on the +merged UFD and GenImage datasets with six detection models demonstrate the +effectiveness of our framework, achieving a 5.3% accuracy improvement in the +OOD testing compared to the current SOTA methods while maintaining the ID +performance. + +
+
+ comment: 14 pages, 3 figures +
+
+
+
+
+ + ☆ SDFR: Synthetic Data for Face Recognition Competition + + +
+ Large-scale face recognition datasets are collected by crawling the Internet +and without individuals' consent, raising legal, ethical, and privacy concerns. +With the recent advances in generative models, recently several works proposed +generating synthetic face recognition datasets to mitigate concerns in +web-crawled face recognition datasets. This paper presents the summary of the +Synthetic Data for Face Recognition (SDFR) Competition held in conjunction with +the 18th IEEE International Conference on Automatic Face and Gesture +Recognition (FG 2024) and established to investigate the use of synthetic data +for training face recognition models. The SDFR competition was split into two +tasks, allowing participants to train face recognition systems using new +synthetic datasets and/or existing ones. In the first task, the face +recognition backbone was fixed and the dataset size was limited, while the +second task provided almost complete freedom on the model backbone, the +dataset, and the training pipeline. The submitted models were trained on +existing and also new synthetic datasets and used clever methods to improve +training with synthetic data. The submissions were evaluated and ranked on a +diverse set of seven benchmarking datasets. The paper gives an overview of the +submitted face recognition models and reports achieved performance compared to +baseline models trained on real and synthetic datasets. Furthermore, the +evaluation of submissions is extended to bias assessment across different +demography groups. Lastly, an outlook on the current state of the research in +training face recognition models using synthetic data is presented, and +existing problems as well as potential future directions are also discussed. + +
+
+ comment: The 18th IEEE International Conference on Automatic Face and Gesture + Recognition (FG 2024) +
+
+
+
+
+ + ☆ GLCM-Based Feature Combination for Extraction Model Optimization in + Object Detection Using Machine Learning + + +
+ In the era of modern technology, object detection using the Gray Level +Co-occurrence Matrix (GLCM) extraction method plays a crucial role in object +recognition processes. It finds applications in real-time scenarios such as +security surveillance and autonomous vehicle navigation, among others. +Computational efficiency becomes a critical factor in achieving real-time +object detection. Hence, there is a need for a detection model with low +complexity and satisfactory accuracy. This research aims to enhance +computational efficiency by selecting appropriate features within the GLCM +framework. Two classification models, namely K-Nearest Neighbours (K-NN) and +Support Vector Machine (SVM), were employed, with the results indicating that +K-Nearest Neighbours (K-NN) outperforms SVM in terms of computational +complexity. Specifically, K-NN, when utilizing a combination of Correlation, +Energy, and Homogeneity features, achieves a 100% accuracy rate with low +complexity. Moreover, when using a combination of Energy and Homogeneity +features, K-NN attains an almost perfect accuracy level of 99.9889%, while +maintaining low complexity. On the other hand, despite SVM achieving 100% +accuracy in certain feature combinations, its high or very high complexity can +pose challenges, particularly in real-time applications. Therefore, based on +the trade-off between accuracy and complexity, the K-NN model with a +combination of Correlation, Energy, and Homogeneity features emerges as a more +suitable choice for real-time applications that demand high accuracy and low +complexity. This research provides valuable insights for optimizing object +detection in various applications requiring both high accuracy and rapid +responsiveness. + +
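A hedged sketch of the described pipeline: GLCM Correlation, Energy, and Homogeneity features (averaged over four angles) feeding a K-NN classifier. The images and labels below are synthetic, and the distance/angle choices are assumptions; it assumes the `scikit-image` and `scikit-learn` packages.

```python
# GLCM texture features (Correlation, Energy, Homogeneity) + K-NN classifier.
import numpy as np
from skimage.feature import graycomatrix, graycoprops
from sklearn.neighbors import KNeighborsClassifier

def glcm_features(gray_img, props=("correlation", "energy", "homogeneity")):
    glcm = graycomatrix(gray_img, distances=[1],
                        angles=[0, np.pi / 4, np.pi / 2, 3 * np.pi / 4],
                        levels=256, symmetric=True, normed=True)
    # Average each property over the four angles.
    return np.hstack([graycoprops(glcm, p).mean() for p in props])

rng = np.random.default_rng(0)
images = rng.integers(0, 256, size=(40, 64, 64), dtype=np.uint8)  # synthetic grayscale patches
labels = rng.integers(0, 2, size=40)                              # two synthetic object classes

X = np.stack([glcm_features(im) for im in images])
knn = KNeighborsClassifier(n_neighbors=3).fit(X, labels)
print(knn.predict(X[:5]))
```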
+
+
+
+
+ + ☆ SportsHHI: A Dataset for Human-Human Interaction Detection in Sports + Videos CVPR 2024 + + +
+ Video-based visual relation detection tasks, such as video scene graph +generation, play important roles in fine-grained video understanding. However, +current video visual relation detection datasets have two main limitations that +hinder the progress of research in this area. First, they do not explore +complex human-human interactions in multi-person scenarios. Second, the +relation types of existing datasets have relatively low-level semantics and can +be often recognized by appearance or simple prior information, without the need +for detailed spatio-temporal context reasoning. Nevertheless, comprehending +high-level interactions between humans is crucial for understanding complex +multi-person videos, such as sports and surveillance videos. To address this +issue, we propose a new video visual relation detection task: video human-human +interaction detection, and build a dataset named SportsHHI for it. SportsHHI +contains 34 high-level interaction classes from basketball and volleyball +sports. 118,075 human bounding boxes and 50,649 interaction instances are +annotated on 11,398 keyframes. To benchmark this, we propose a two-stage +baseline method and conduct extensive experiments to reveal the key factors for +a successful human-human interaction detector. We hope that SportsHHI can +stimulate research on human interaction understanding in videos and promote the +development of spatio-temporal context modeling techniques in video visual +relation detection. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Enhancing Video Summarization with Context Awareness + + +
+ Video summarization is a crucial research area that aims to efficiently +browse and retrieve relevant information from the vast amount of video content +available today. With the exponential growth of multimedia data, the ability to +extract meaningful representations from videos has become essential. Video +summarization techniques automatically generate concise summaries by selecting +keyframes, shots, or segments that capture the video's essence. This process +improves the efficiency and accuracy of various applications, including video +surveillance, education, entertainment, and social media. Despite the +importance of video summarization, there is a lack of diverse and +representative datasets, hindering comprehensive evaluation and benchmarking of +algorithms. Existing evaluation metrics also fail to fully capture the +complexities of video summarization, limiting accurate algorithm assessment and +hindering the field's progress. To overcome data scarcity challenges and +improve evaluation, we propose an unsupervised approach that leverages video +data structure and information for generating informative summaries. By moving +away from fixed annotations, our framework can produce representative summaries +effectively. Moreover, we introduce an innovative evaluation pipeline tailored +specifically for video summarization. Human participants are involved in the +evaluation, comparing our generated summaries to ground truth summaries and +assessing their informativeness. This human-centric approach provides valuable +insights into the effectiveness of our proposed techniques. Experimental +results demonstrate that our training-free framework outperforms existing +unsupervised approaches and achieves competitive results compared to +state-of-the-art supervised methods. + +
+
+ comment: 115 pages, 1 supplementary paper, undergraduate thesis report at + US-VNUHCM +
+
+
+
+
+ + ☆ Diffusion Time-step Curriculum for One Image to 3D Generation + + +
+ Score distillation sampling~(SDS) has been widely adopted to overcome the +absence of unseen views in reconstructing 3D objects from a \textbf{single} +image. It leverages pre-trained 2D diffusion models as teacher to guide the +reconstruction of student 3D models. Despite their remarkable success, +SDS-based methods often encounter geometric artifacts and texture saturation. +We find out the crux is the overlooked indiscriminate treatment of diffusion +time-steps during optimization: it unreasonably treats the student-teacher +knowledge distillation to be equal at all time-steps and thus entangles +coarse-grained and fine-grained modeling. Therefore, we propose the Diffusion +Time-step Curriculum one-image-to-3D pipeline (DTC123), which involves both the +teacher and student models collaborating with the time-step curriculum in a +coarse-to-fine manner. Extensive experiments on NeRF4, RealFusion15, GSO and +Level50 benchmark demonstrate that DTC123 can produce multi-view consistent, +high-quality, and diverse 3D assets. Codes and more generation demos will be +released in https://github.com/yxymessi/DTC123. + +
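The abstract does not give the exact schedule, so the snippet below only illustrates the curriculum idea: sample large (coarse) diffusion time-steps early in optimization and progressively shrink the sampling window toward small (fine) time-steps. The linear schedule and bounds are assumptions, not DTC123's actual design.

```python
# Toy time-step curriculum: coarse (large t) early, fine (small t) late.
import random

def curriculum_timestep(iteration, max_iter, t_min=20, t_max=980):
    # Linearly shrink the upper bound of the sampling window over training.
    frac = 1.0 - iteration / max_iter
    upper = int(t_min + frac * (t_max - t_min))
    return random.randint(t_min, max(t_min + 1, upper))

for it in (0, 2500, 4999):
    print(it, curriculum_timestep(it, 5000))
```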
+
+
+
+
+ + ☆ Co-Occ: Coupling Explicit Feature Fusion with Volume Rendering + Regularization for Multi-Modal 3D Semantic Occupancy Prediction + + +
+ 3D semantic occupancy prediction is a pivotal task in the field of autonomous +driving. Recent approaches have made great advances in 3D semantic occupancy +predictions on a single modality. However, multi-modal semantic occupancy +prediction approaches have encountered difficulties in dealing with the +modality heterogeneity, modality misalignment, and insufficient modality +interactions that arise during the fusion of different modalities data, which +may result in the loss of important geometric and semantic information. This +letter presents a novel multi-modal, i.e., LiDAR-camera 3D semantic occupancy +prediction framework, dubbed Co-Occ, which couples explicit LiDAR-camera +feature fusion with implicit volume rendering regularization. The key insight +is that volume rendering in the feature space can proficiently bridge the gap +between 3D LiDAR sweeps and 2D images while serving as a physical +regularization to enhance LiDAR-camera fused volumetric representation. +Specifically, we first propose a Geometric- and Semantic-aware Fusion +(GSFusion) module to explicitly enhance LiDAR features by incorporating +neighboring camera features through a K-nearest neighbors (KNN) search. Then, +we employ volume rendering to project the fused feature back to the image +planes for reconstructing color and depth maps. These maps are then supervised +by input images from the camera and depth estimations derived from LiDAR, +respectively. Extensive experiments on the popular nuScenes and SemanticKITTI +benchmarks verify the effectiveness of our Co-Occ for 3D semantic occupancy +prediction. The project page is available at +https://rorisis.github.io/Co-Occ_project-page/. + +
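A minimal sketch of the KNN-based feature-gathering step described for GSFusion, under the simplifying assumption that camera features have already been lifted to 3D points. The feature dimensions and the concatenate-then-MLP fusion rule are assumptions, not the paper's module.

```python
# For each LiDAR point, gather the K nearest camera-feature points and fuse.
import torch
import torch.nn as nn

def knn_fuse(lidar_xyz, lidar_feat, cam_xyz, cam_feat, k=3):
    dists = torch.cdist(lidar_xyz, cam_xyz)          # (N_lidar, N_cam) distances
    idx = dists.topk(k, largest=False).indices       # K nearest camera points
    neigh = cam_feat[idx].mean(dim=1)                # average neighbor features
    return torch.cat([lidar_feat, neigh], dim=-1)    # enhanced LiDAR descriptor

n_l, n_c, d = 1024, 4096, 32
fused = knn_fuse(torch.randn(n_l, 3), torch.randn(n_l, d),
                 torch.randn(n_c, 3), torch.randn(n_c, d))
mlp = nn.Linear(2 * d, d)
print(mlp(fused).shape)   # (1024, 32) fused LiDAR-camera features
```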
+
+
+
+
+ + ☆ Learning Instance-Aware Correspondences for Robust Multi-Instance Point + Cloud Registration in Cluttered Scenes + + +
+ Multi-instance point cloud registration estimates the poses of multiple instances of a model point cloud in a scene point cloud. Extracting accurate point correspondences is at the center of the problem. Existing approaches usually treat the scene point cloud as a whole, overlooking the separation of instances. Therefore, point features can easily be polluted by other points from the background or from different instances, leading to inaccurate correspondences oblivious to separate instances, especially in cluttered scenes. In this work, we propose MIRETR, Multi-Instance REgistration TRansformer, a coarse-to-fine approach to the extraction of instance-aware correspondences. At the coarse level, it jointly learns instance-aware superpoint features and predicts per-instance masks. With instance masks, the influence from outside the instance of concern is minimized, such that highly reliable superpoint correspondences can be extracted. The superpoint correspondences are then extended to instance candidates at the fine level according to the instance masks. Finally, an efficient candidate selection and refinement algorithm is devised to obtain the final registrations. Extensive experiments on three public benchmarks demonstrate the efficacy of our approach. In particular, MIRETR outperforms the state of the art by 16.6 points in F1 score on the challenging ROBI benchmark. Code and models are available at https://github.com/zhiyuanYU134/MIRETR.
+
+
+
+
+ + ☆ Rethinking Self-training for Semi-supervised Landmark Detection: A + Selection-free Approach + + +
+ Self-training is a simple yet effective method for semi-supervised learning, +during which pseudo-label selection plays an important role for handling +confirmation bias. Despite its popularity, applying self-training to landmark +detection faces three problems: 1) The selected confident pseudo-labels often +contain data bias, which may hurt model performance; 2) It is not easy to +decide a proper threshold for sample selection as the localization task can be +sensitive to noisy pseudo-labels; 3) coordinate regression does not output +confidence, making selection-based self-training infeasible. To address the +above issues, we propose Self-Training for Landmark Detection (STLD), a method +that does not require explicit pseudo-label selection. Instead, STLD constructs +a task curriculum to deal with confirmation bias, which progressively +transitions from more confident to less confident tasks over the rounds of +self-training. Pseudo pretraining and shrink regression are two essential +components for such a curriculum, where the former is the first task of the +curriculum for providing a better model initialization and the latter is +further added in the later rounds to directly leverage the pseudo-labels in a +coarse-to-fine manner. Experiments on three facial and one medical landmark +detection benchmark show that STLD outperforms the existing methods +consistently in both semi- and omni-supervised settings. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ NPB-REC: A Non-parametric Bayesian Deep-learning Approach for + Undersampled MRI Reconstruction with Uncertainty Estimation + + +
+ The ability to reconstruct high-quality images from undersampled MRI data is +vital in improving MRI temporal resolution and reducing acquisition times. Deep +learning methods have been proposed for this task, but the lack of verified +methods to quantify the uncertainty in the reconstructed images hampered +clinical applicability. We introduce "NPB-REC", a non-parametric fully Bayesian +framework, for MRI reconstruction from undersampled data with uncertainty +estimation. We use Stochastic Gradient Langevin Dynamics during training to +characterize the posterior distribution of the network parameters. This enables +us to both improve the quality of the reconstructed images and quantify the +uncertainty in the reconstructed images. We demonstrate the efficacy of our +approach on a multi-coil MRI dataset from the fastMRI challenge and compare it +to the baseline End-to-End Variational Network (E2E-VarNet). Our approach +outperforms the baseline in terms of reconstruction accuracy by means of PSNR +and SSIM ($34.55$, $0.908$ vs. $33.08$, $0.897$, $p<0.01$, acceleration rate +$R=8$) and provides uncertainty measures that correlate better with the +reconstruction error (Pearson correlation, $R=0.94$ vs. $R=0.91$). +Additionally, our approach exhibits better generalization capabilities against +anatomical distribution shifts (PSNR and SSIM of $32.38$, $0.849$ vs. $31.63$, +$0.836$, $p<0.01$, training on brain data, inference on knee data, acceleration +rate $R=8$). NPB-REC has the potential to facilitate the safe utilization of +deep learning-based methods for MRI reconstruction from undersampled data. Code +and trained models are available at \url{https://github.com/samahkh/NPB-REC}. + +
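A minimal sketch of Stochastic Gradient Langevin Dynamics, the sampler the abstract says is used to characterize the posterior over network parameters, shown here on a toy regression model. The step size, burn-in, and thinning choices are assumptions, and prior terms and minibatch scaling are omitted for brevity.

```python
# SGLD: gradient step plus injected Gaussian noise yields posterior samples.
import torch

def sgld_step(params, loss, lr=1e-4):
    grads = torch.autograd.grad(loss, params)
    with torch.no_grad():
        for p, g in zip(params, grads):
            noise = torch.randn_like(p) * (2.0 * lr) ** 0.5
            p.add_(-lr * g + noise)

model = torch.nn.Linear(4, 1)
x, y = torch.randn(32, 4), torch.randn(32, 1)
posterior_samples = []
for step in range(200):
    loss = torch.nn.functional.mse_loss(model(x), y)
    sgld_step(list(model.parameters()), loss)
    if step >= 100 and step % 10 == 0:          # keep post burn-in samples
        posterior_samples.append([p.detach().clone() for p in model.parameters()])
print(len(posterior_samples))
# Averaging predictions over posterior_samples gives a mean reconstruction and
# their spread gives an uncertainty estimate, in the spirit of the paper.
```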
+
+ comment: Published in Artificial Intelligence in Medicine, DOI: + https://doi.org/10.1016/j.artmed.2024.102798 This is an extension + representing a more comprehensive work extending preliminary work presented + at arXiv:2208.03966 +
+
+
+
+
+ + ☆ A self-attention model for robust rigid slice-to-volume registration of + functional MRI + + +
+ Functional Magnetic Resonance Imaging (fMRI) is vital in neuroscience, +enabling investigations into brain disorders, treatment monitoring, and brain +function mapping. However, head motion during fMRI scans, occurring between +shots of slice acquisition, can result in distortion, biased analyses, and +increased costs due to the need for scan repetitions. Therefore, retrospective +slice-level motion correction through slice-to-volume registration (SVR) is +crucial. Previous studies have utilized deep learning (DL) based models to +address the SVR task; however, they overlooked the uncertainty stemming from +the input stack of slices and did not assign weighting or scoring to each +slice. In this work, we introduce an end-to-end SVR model for aligning 2D fMRI +slices with a 3D reference volume, incorporating a self-attention mechanism to +enhance robustness against input data variations and uncertainties. It utilizes +independent slice and volume encoders and a self-attention module to assign +pixel-wise scores for each slice. We conducted evaluation experiments on 200 +images involving synthetic rigid motion generated from 27 subjects belonging to +the test set, from the publicly available Healthy Brain Network (HBN) dataset. +Our experimental results demonstrate that our model achieves competitive +performance in terms of alignment accuracy compared to state-of-the-art deep +learning-based methods (Euclidean distance of $0.93$ [mm] vs. $1.86$ [mm]). +Furthermore, our approach exhibits significantly faster registration speed +compared to conventional iterative methods ($0.096$ sec. vs. $1.17$ sec.). Our +end-to-end SVR model facilitates real-time head motion tracking during fMRI +acquisition, ensuring reliability and robustness against uncertainties in +inputs. source code, which includes the training and evaluations, will be +available soon. + +
+
+ comment: Currently under review +
+
+
+
+
+ + ☆ BeyondScene: Higher-Resolution Human-Centric Scene Generation With + Pretrained Diffusion + + +
+ Generating higher-resolution human-centric scenes with details and controls +remains a challenge for existing text-to-image diffusion models. This challenge +stems from limited training image size, text encoder capacity (limited tokens), +and the inherent difficulty of generating complex scenes involving multiple +humans. While current methods attempted to address training size limit only, +they often yielded human-centric scenes with severe artifacts. We propose +BeyondScene, a novel framework that overcomes prior limitations, generating +exquisite higher-resolution (over 8K) human-centric scenes with exceptional +text-image correspondence and naturalness using existing pretrained diffusion +models. BeyondScene employs a staged and hierarchical approach to initially +generate a detailed base image focusing on crucial elements in instance +creation for multiple humans and detailed descriptions beyond token limit of +diffusion model, and then to seamlessly convert the base image to a +higher-resolution output, exceeding training image size and incorporating +details aware of text and instances via our novel instance-aware hierarchical +enlargement process that consists of our proposed high-frequency injected +forward diffusion and adaptive joint diffusion. BeyondScene surpasses existing +methods in terms of correspondence with detailed text descriptions and +naturalness, paving the way for advanced applications in higher-resolution +human-centric scene creation beyond the capacity of pretrained diffusion models +without costly retraining. Project page: +https://janeyeon.github.io/beyond-scene. + +
+
+ comment: Project page: https://janeyeon.github.io/beyond-scene +
+
+
+
+
+ + ☆ Frequency Decomposition-Driven Unsupervised Domain Adaptation for Remote + Sensing Image Semantic Segmentation + + +
+ Cross-domain semantic segmentation of remote sensing (RS) imagery based on +unsupervised domain adaptation (UDA) techniques has significantly advanced +deep-learning applications in the geosciences. Recently, with its ingenious and +versatile architecture, the Transformer model has been successfully applied in +RS-UDA tasks. However, existing UDA methods mainly focus on domain alignment in +the high-level feature space. It is still challenging to retain cross-domain +local spatial details and global contextual semantics simultaneously, which is +crucial for the RS image semantic segmentation task. To address these problems, +we propose novel high/low-frequency decomposition (HLFD) techniques to guide +representation alignment in cross-domain semantic segmentation. Specifically, +HLFD attempts to decompose the feature maps into high- and low-frequency +components before performing the domain alignment in the corresponding +subspaces. Secondly, to further facilitate the alignment of decomposed +features, we propose a fully global-local generative adversarial network, +namely GLGAN, to learn domain-invariant detailed and semantic features across +domains by leveraging global-local transformer blocks (GLTBs). By integrating +HLFD techniques and the GLGAN, a novel UDA framework called FD-GLGAN is +developed to improve the cross-domain transferability and generalization +capability of semantic segmentation models. Extensive experiments on two +fine-resolution benchmark datasets, namely ISPRS Potsdam and ISPRS Vaihingen, +highlight the effectiveness and superiority of the proposed approach as +compared to the state-of-the-art UDA methods. The source code for this work +will be accessible at https://github.com/sstary/SSRS. + +
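The HLFD idea of splitting feature maps into high- and low-frequency components before aligning each subspace can be realized in several ways; the abstract does not specify the operator. Below is a minimal sketch that uses a low-pass blur plus a high-pass residual, assumed here purely for illustration.

```python
import torch
import torch.nn.functional as F

def decompose_frequency(feat: torch.Tensor, kernel_size: int = 5):
    """Split a feature map (B, C, H, W) into low- and high-frequency parts.

    Low frequency: a local average (a box blur stands in for any low-pass filter);
    high frequency: the residual. This is an assumed realization, not necessarily
    the decomposition used in the paper.
    """
    pad = kernel_size // 2
    low = F.avg_pool2d(feat, kernel_size, stride=1, padding=pad)
    high = feat - low
    return low, high

feat = torch.randn(2, 64, 128, 128)
low, high = decompose_frequency(feat)
assert torch.allclose(low + high, feat)  # the two bands sum back to the input
```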
+
+ comment: 28 pages, 13 figures +
+
+
+
+
+ + ☆ VTR: An Optimized Vision Transformer for SAR ATR Acceleration on FPGA SP + + +
+ Synthetic Aperture Radar (SAR) Automatic Target Recognition (ATR) is a key technique used in military applications such as remote-sensing image recognition. Vision Transformers (ViTs) are the current state-of-the-art in various computer vision applications, outperforming their CNN counterparts. However, using ViTs for SAR ATR applications is challenging because (1) standard ViTs require extensive training data to generalize well due to their low locality, whereas standard SAR datasets contain only a limited number of labeled training samples, which reduces the learning capability of ViTs; and (2) ViTs have a high parameter count and are computation intensive, which makes their deployment on resource-constrained SAR platforms difficult. In this work, we develop a lightweight ViT model that can be trained directly on small datasets without any pre-training by utilizing the Shifted Patch Tokenization (SPT) and Locality Self-Attention (LSA) modules. We train this model directly on SAR datasets with limited training samples to evaluate its effectiveness for SAR ATR applications. We evaluate our proposed model, which we call VTR (ViT for SAR ATR), on three widely used SAR datasets: MSTAR, SynthWakeSAR, and GBSAR. Further, we propose a novel FPGA accelerator for VTR to enable deployment in real-time SAR ATR applications.
+
+ comment: SPIE DCS 2024 +
+
+
+
+
+ + ☆ DATENeRF: Depth-Aware Text-based Editing of NeRFs + + +
+ Recent advancements in diffusion models have shown remarkable proficiency in +editing 2D images based on text prompts. However, extending these techniques to +edit scenes in Neural Radiance Fields (NeRF) is complex, as editing individual +2D frames can result in inconsistencies across multiple views. Our crucial +insight is that a NeRF scene's geometry can serve as a bridge to integrate +these 2D edits. Utilizing this geometry, we employ a depth-conditioned +ControlNet to enhance the coherence of each 2D image modification. Moreover, we +introduce an inpainting approach that leverages the depth information of NeRF +scenes to distribute 2D edits across different images, ensuring robustness +against errors and resampling challenges. Our results reveal that this +methodology achieves more consistent, lifelike, and detailed edits than +existing leading methods for text-driven NeRF scene editing. + +
+
+ comment: 14 pages, Conference paper, 3D Scene Editing, Neural Rendering, + Diffusion Models +
+
+
+
+
+ + ☆ MedIAnomaly: A comparative study of anomaly detection in medical images + + +
+ Anomaly detection (AD) aims at detecting abnormal samples that deviate from +the expected normal patterns. Generally, it can be trained on merely normal +data without the requirement for abnormal samples, and thereby plays an +important role in the recognition of rare diseases and health screening in the +medical domain. Despite numerous related studies, we observe a lack of a fair +and comprehensive evaluation, which causes some ambiguous conclusions and +hinders the development of this field. This paper focuses on building a +benchmark with unified implementation and comparison to address this problem. +In particular, seven medical datasets with five image modalities, including +chest X-rays, brain MRIs, retinal fundus images, dermatoscopic images, and +histopathology whole slide images are organized for extensive evaluation. +Twenty-seven typical AD methods, including reconstruction and self-supervised +learning-based methods, are involved in comparison of image-level anomaly +classification and pixel-level anomaly segmentation. Furthermore, we for the +first time formally explore the effect of key components in existing methods, +clearly revealing unresolved challenges and potential future directions. The +datasets and code are available at +\url{https://github.com/caiyu6666/MedIAnomaly}. + +
+
+ comment: Under submission +
+
+
+
+
+ + ☆ Latent-based Diffusion Model for Long-tailed Recognition CVPR2024 + + +
+ Long-tailed imbalance distribution is a common issue in practical computer vision applications. Previous works proposed methods to address this problem, which can be categorized into several classes: re-sampling, re-weighting, transfer learning, and feature augmentation. In recent years, diffusion models have shown an impressive generation ability in many sub-problems of deep computer vision. However, this powerful generative ability has not yet been explored for long-tailed problems. We propose a new approach, the Latent-based Diffusion Model for Long-tailed Recognition (LDMLR), as a feature augmentation method to tackle the issue. First, we encode the imbalanced dataset into features using the baseline model. Then, we train a Denoising Diffusion Implicit Model (DDIM) on these encoded features to generate pseudo-features. Finally, we train the classifier using the encoded and pseudo-features from the previous two steps. The proposed method improves classification accuracy on the CIFAR-LT and ImageNet-LT datasets.
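The three-stage pipeline (encode the imbalanced data, generate pseudo-features with a trained generative model, train the classifier on both) can be summarized as a skeleton. In the sketch below the DDIM stage is reduced to a placeholder sampler, and every module and dimension is illustrative rather than the paper's.

```python
import torch
import torch.nn as nn

# Stage boundaries only; all dimensions and modules are illustrative.
feat_dim, num_classes = 128, 10

# 1) Encode the (imbalanced) dataset into features with a frozen baseline model.
baseline = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, feat_dim))
baseline.requires_grad_(False)
images = torch.randn(256, 3, 32, 32)
labels = torch.randint(0, num_classes, (256,))
with torch.no_grad():
    real_feats = baseline(images)

# 2) In the paper a DDIM is trained on these features and then sampled to obtain
#    pseudo-features; the sampler below is a placeholder stand-in, not a diffusion model.
def sample_pseudo_features(n: int) -> torch.Tensor:
    return real_feats.mean(0, keepdim=True) + 0.1 * torch.randn(n, feat_dim)

pseudo_feats = sample_pseudo_features(64)
pseudo_labels = torch.randint(0, num_classes, (64,))

# 3) Train the classifier on the union of encoded and pseudo-features.
classifier = nn.Linear(feat_dim, num_classes)
optim = torch.optim.Adam(classifier.parameters(), lr=1e-3)
feats = torch.cat([real_feats, pseudo_feats])
targets = torch.cat([labels, pseudo_labels])
for _ in range(10):
    optim.zero_grad()
    loss = nn.functional.cross_entropy(classifier(feats), targets)
    loss.backward()
    optim.step()
```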
+
+ comment: 8 pages, 3 figures, accepted by L3DIVU-CVPR2024 +
+
+
+
+
+ + ☆ Cluster-based Video Summarization with Temporal Context Awareness + + +
+ In this paper, we present TAC-SUM, a novel and efficient training-free +approach for video summarization that addresses the limitations of existing +cluster-based models by incorporating temporal context. Our method partitions +the input video into temporally consecutive segments with clustering +information, enabling the injection of temporal awareness into the clustering +process, setting it apart from prior cluster-based summarization methods. The +resulting temporal-aware clusters are then utilized to compute the final +summary, using simple rules for keyframe selection and frame importance +scoring. Experimental results on the SumMe dataset demonstrate the +effectiveness of our proposed approach, outperforming existing unsupervised +methods and achieving comparable performance to state-of-the-art supervised +summarization techniques. Our source code is available for reference at +\url{https://github.com/hcmus-thesis-gulu/TAC-SUM}. + +
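A training-free, temporally aware clustering summary of the kind described above might look like the following sketch; the smoothing and keyframe rules here are simplified stand-ins for those in TAC-SUM, and the per-frame features are assumed to come from any off-the-shelf extractor.

```python
import numpy as np
from sklearn.cluster import KMeans

def summarize(frame_feats: np.ndarray, n_clusters: int = 4):
    """Cluster frame features, enforce temporal contiguity by majority-smoothing
    the labels, then keep, per contiguous segment, the frame closest to its
    cluster centroid as a keyframe."""
    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(frame_feats)
    labels = km.labels_.copy()
    # Temporal smoothing: assign each frame the majority label of a local window.
    half = 2
    for i in range(len(labels)):
        window = km.labels_[max(0, i - half): i + half + 1]
        labels[i] = np.bincount(window).argmax()
    # Split into temporally consecutive segments and pick one keyframe per segment.
    keyframes, start = [], 0
    for i in range(1, len(labels) + 1):
        if i == len(labels) or labels[i] != labels[start]:
            seg = np.arange(start, i)
            center = km.cluster_centers_[labels[start]]
            dists = np.linalg.norm(frame_feats[seg] - center, axis=1)
            keyframes.append(int(seg[dists.argmin()]))
            start = i
    return keyframes

feats = np.random.rand(120, 512).astype(np.float32)   # toy per-frame features
print(summarize(feats))
```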
+
+ comment: 14 pages, 6 figures, accepted in PSIVT 2023 +
+
+
+
+
+ + ☆ Automated Lane Change Behavior Prediction and Environmental Perception + Based on SLAM Technology + + +
+ In addition to environmental perception sensors such as cameras, radars, etc. +in the automatic driving system, the external environment of the vehicle is +perceived, in fact, there is also a perception sensor that has been silently +dedicated in the system, that is, the positioning module. This paper explores +the application of SLAM (Simultaneous Localization and Mapping) technology in +the context of automatic lane change behavior prediction and environment +perception for autonomous vehicles. It discusses the limitations of traditional +positioning methods, introduces SLAM technology, and compares LIDAR SLAM with +visual SLAM. Real-world examples from companies like Tesla, Waymo, and Mobileye +showcase the integration of AI-driven technologies, sensor fusion, and SLAM in +autonomous driving systems. The paper then delves into the specifics of SLAM +algorithms, sensor technologies, and the importance of automatic lane changes +in driving safety and efficiency. It highlights Tesla's recent update to its +Autopilot system, which incorporates automatic lane change functionality using +SLAM technology. The paper concludes by emphasizing the crucial role of SLAM in +enabling accurate environment perception, positioning, and decision-making for +autonomous vehicles, ultimately enhancing safety and driving experience. + +
+
+
+
+
+ + ☆ FastHDRNet: A new efficient method for SDR-to-HDR Translation + + +
+ Modern displays possess the capability to render video content with a high dynamic range (HDR) and a wide color gamut (WCG). However, the majority of available resources are still in standard dynamic range (SDR). Therefore, we need to identify an effective methodology for converting such content. Existing deep neural network (DNN) based SDR-to-HDR conversion methods outperform conventional methods, but they are either too large to deploy or generate severe artifacts. We propose a neural network for SDRTV-to-HDRTV conversion, termed "FastHDRNet". This network includes two parts, Adaptive Universal Color Transformation and Local Enhancement. The architecture is designed as a lightweight network that utilizes global statistics and local information with very high efficiency. Our experiments show that the proposed method achieves state-of-the-art performance in both quantitative comparisons and visual quality, with a lightweight structure and enhanced inference speed.
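A two-branch design of this sort, one branch predicting a color transform from global statistics and another performing local enhancement, can be sketched as follows. The layer sizes and the 3x3 color-matrix formulation are assumptions for illustration, not FastHDRNet's actual design.

```python
import torch
import torch.nn as nn

class SDRtoHDRSketch(nn.Module):
    """Illustrative two-part design: a color transform predicted from global
    statistics, followed by a lightweight local enhancement CNN."""

    def __init__(self):
        super().__init__()
        self.global_mlp = nn.Sequential(nn.Linear(3, 32), nn.ReLU(), nn.Linear(32, 9))
        self.local_net = nn.Sequential(
            nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(), nn.Conv2d(16, 3, 3, padding=1)
        )

    def forward(self, sdr: torch.Tensor) -> torch.Tensor:   # (B, 3, H, W) in [0, 1]
        stats = sdr.mean(dim=(2, 3))                         # global color statistics
        m = self.global_mlp(stats).view(-1, 3, 3)            # per-image 3x3 color matrix
        recolored = torch.einsum("bij,bjhw->bihw", m, sdr)   # global color transformation
        return recolored + self.local_net(recolored)         # local enhancement residual

hdr = SDRtoHDRSketch()(torch.rand(2, 3, 64, 64))
print(hdr.shape)   # torch.Size([2, 3, 64, 64])
```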
+
+ comment: 16 pages, 4 figures +
+
+
+
+
+ + ☆ Diffusion-RWKV: Scaling RWKV-Like Architectures for Diffusion Models + + +
+ Transformers have catalyzed advancements in computer vision and natural language processing (NLP). However, their substantial computational complexity limits their application in long-context tasks, such as high-resolution image generation. This paper introduces a series of architectures adapted from the RWKV model used in NLP, with requisite modifications tailored for diffusion models applied to image generation tasks, referred to as Diffusion-RWKV. Similar to diffusion models built on Transformers, our model is designed to efficiently handle patchnified inputs in a sequence with extra conditions, while also scaling up effectively to accommodate both large-scale parameters and extensive datasets. Its distinctive advantage is its reduced spatial aggregation complexity, which makes it exceptionally adept at processing high-resolution images and eliminates the need for windowing or group-cached operations. Experimental results on both conditional and unconditional image generation tasks demonstrate that Diffusion-RWKV achieves performance on par with or surpassing existing CNN- or Transformer-based diffusion models in FID and IS metrics, while significantly reducing total FLOP usage.
+
+
+
+
+ + ☆ DELTA: Decoupling Long-Tailed Online Continual Learning CVPR + + +
+ A significant challenge in achieving ubiquitous Artificial Intelligence is the limited ability of models to rapidly learn new information in real-world scenarios where data follow long-tailed distributions, all while avoiding the forgetting of previously acquired knowledge. In this work, we study the under-explored problem of Long-Tailed Online Continual Learning (LTOCL), which aims to learn new tasks from sequentially arriving, class-imbalanced data streams. Each data sample is observed only once for training, without knowledge of the task data distribution. We present DELTA, a decoupled learning approach designed to enhance learned representations and address the substantial imbalance in LTOCL. We enhance the learning process by adapting supervised contrastive learning to attract similar samples and repel dissimilar (out-of-class) samples. Further, by balancing gradients during training using an equalization loss, DELTA significantly enhances learning outcomes and successfully mitigates catastrophic forgetting. Through extensive evaluation, we demonstrate that DELTA improves the capacity for incremental learning, surpassing existing OCL methods. Our results suggest considerable promise for applying OCL in real-world applications.
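The attract/repel component mentioned above corresponds to a supervised contrastive objective; a standard formulation (in the style of Khosla et al.) is sketched below. DELTA's full method additionally uses an equalization loss for gradient balancing, which is not reproduced here.

```python
import torch
import torch.nn.functional as F

def supervised_contrastive_loss(feats: torch.Tensor, labels: torch.Tensor,
                                temperature: float = 0.1) -> torch.Tensor:
    """Attract samples that share a label, repel the rest."""
    feats = F.normalize(feats, dim=1)
    sim = feats @ feats.t() / temperature                       # (N, N) similarities
    n = feats.size(0)
    self_mask = torch.eye(n, dtype=torch.bool, device=feats.device)
    pos_mask = (labels.unsqueeze(0) == labels.unsqueeze(1)) & ~self_mask
    sim = sim.masked_fill(self_mask, float("-inf"))             # exclude self-pairs
    log_prob = sim - torch.logsumexp(sim, dim=1, keepdim=True)  # row-wise log-softmax
    pos_counts = pos_mask.sum(1).clamp(min=1)
    per_anchor = -log_prob.masked_fill(~pos_mask, 0.0).sum(1) / pos_counts
    return per_anchor[pos_mask.any(1)].mean()                   # anchors with >= 1 positive

feats = torch.randn(16, 128, requires_grad=True)
labels = torch.randint(0, 4, (16,))
print(supervised_contrastive_loss(feats, labels).item())
```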
+
+ comment: CVPR Workshop acceptance archival track +
+
+
+
+
+ + ☆ RoNet: Rotation-oriented Continuous Image Translation + + +
+ The generation of smooth and continuous images between domains has recently drawn much attention in image-to-image (I2I) translation. A linear relationship serves as the basic assumption in most existing approaches, applied to different aspects including features, models, or labels. However, the linear assumption becomes harder to satisfy as the element dimension increases, and it suffers from the limitation of having to obtain both ends of the line. In this paper, we propose a novel rotation-oriented solution that models continuous generation as an in-plane rotation over the style representation of an image, yielding a network named RoNet. A rotation module is implanted in the generation network to automatically learn the proper plane while disentangling the content and the style of an image. To encourage realistic texture, we also design a patch-based semantic style loss that learns the different styles of similar objects in different domains. We conduct experiments on forest scenes (where the complex texture makes generation very challenging), faces, streetscapes, and the iphone2dslr task. The results validate the superiority of our method in terms of visual quality and continuity.
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Mixed-Query Transformer: A Unified Image Segmentation Architecture + + +
+ Existing unified image segmentation models either employ a unified +architecture across multiple tasks but use separate weights tailored to each +dataset, or apply a single set of weights to multiple datasets but are limited +to a single task. In this paper, we introduce the Mixed-Query Transformer +(MQ-Former), a unified architecture for multi-task and multi-dataset image +segmentation using a single set of weights. To enable this, we propose a mixed +query strategy, which can effectively and dynamically accommodate different +types of objects without heuristic designs. In addition, the unified +architecture allows us to use data augmentation with synthetic masks and +captions to further improve model generalization. Experiments demonstrate that +MQ-Former can not only effectively handle multiple segmentation datasets and +tasks compared to specialized state-of-the-art models with competitive +performance, but also generalize better to open-set segmentation tasks, +evidenced by over 7 points higher performance than the prior art on the +open-vocabulary SeginW benchmark. + +
+
+
+
+
+ + ☆ Aligning Diffusion Models by Optimizing Human Utility + + +
+ We present Diffusion-KTO, a novel approach for aligning text-to-image +diffusion models by formulating the alignment objective as the maximization of +expected human utility. Since this objective applies to each generation +independently, Diffusion-KTO does not require collecting costly pairwise +preference data nor training a complex reward model. Instead, our objective +requires simple per-image binary feedback signals, e.g. likes or dislikes, +which are abundantly available. After fine-tuning using Diffusion-KTO, +text-to-image diffusion models exhibit superior performance compared to +existing techniques, including supervised fine-tuning and Diffusion-DPO, both +in terms of human judgment and automatic evaluation metrics such as PickScore +and ImageReward. Overall, Diffusion-KTO unlocks the potential of leveraging +readily available per-image binary signals and broadens the applicability of +aligning text-to-image diffusion models with human preferences. + +
+
+ comment: 27 pages, 11 figures +
+
+
+
+
+ + ☆ Automated Polyp Segmentation in Colonoscopy Images + + +
+ Finding polyps during medical diagnosis is important for preventing colorectal cancer. This research discusses using a dilated convolution module along with a criss-cross attention-based network to segment polyps from endoscopic images of the colon. The criss-cross attention module plays a vital role in gathering context information from all pixels in an image more efficiently. To extract maximum information from the dataset, data augmentation techniques are employed: rotations, flips, scaling, and contrast changes, along with varying learning rates, were used to build a better model. Global average pooling was applied over ResNet50, which helped to retain the important details of the encoder. In our experiments, the proposed architecture's performance was compared with existing models such as U-Net, DeepLabV3, and PraNet. The architecture outperformed the other models on the subset of the dataset with irregular polyp shapes. The combination of the dilated convolution module, RCCA, and global average pooling was found to be effective for irregular shapes. Our architecture demonstrates an average improvement of 3.75% across all metrics when compared to existing models.
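A dilated convolution module placed on top of a ResNet50 encoder, as referenced above, is commonly built from parallel atrous branches. The sketch below uses assumed dilation rates and channel widths and omits the criss-cross attention (RCCA) part entirely.

```python
import torch
import torch.nn as nn

class DilatedConvModule(nn.Module):
    """Parallel dilated convolutions that enlarge the receptive field without
    losing resolution; dilation rates and channel widths are assumptions."""

    def __init__(self, in_ch: int = 2048, out_ch: int = 256, rates=(1, 6, 12, 18)):
        super().__init__()
        self.branches = nn.ModuleList([
            nn.Sequential(nn.Conv2d(in_ch, out_ch, 3, padding=r, dilation=r),
                          nn.BatchNorm2d(out_ch), nn.ReLU())
            for r in rates
        ])
        self.project = nn.Conv2d(out_ch * len(rates), out_ch, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.project(torch.cat([b(x) for b in self.branches], dim=1))

feats = torch.randn(1, 2048, 16, 16)           # e.g. a ResNet50 stage-5 output
print(DilatedConvModule()(feats).shape)        # torch.Size([1, 256, 16, 16])
```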
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ JRDB-Social: A Multifaceted Robotic Dataset for Understanding of Context + and Dynamics of Human Interactions Within Social Groups CVPR 2024 + + +
+ Understanding human social behaviour is crucial in computer vision and +robotics. Micro-level observations like individual actions fall short, +necessitating a comprehensive approach that considers individual behaviour, +intra-group dynamics, and social group levels for a thorough understanding. To +address dataset limitations, this paper introduces JRDB-Social, an extension of +JRDB. Designed to fill gaps in human understanding across diverse indoor and +outdoor social contexts, JRDB-Social provides annotations at three levels: +individual attributes, intra-group interactions, and social group context. This +dataset aims to enhance our grasp of human social dynamics for robotic +applications. Utilizing the recent cutting-edge multi-modal large language +models, we evaluated our benchmark to explore their capacity to decipher social +human behaviour. + +
+
+ comment: Accepted by CVPR 2024. Project page: + https://jrdb.erc.monash.edu/dataset/social +
+
+
+
+
+ + ☆ Beyond the Known: Adversarial Autoencoders in Novelty Detection + + +
+ In novelty detection, the goal is to decide if a new data point should be +categorized as an inlier or an outlier, given a training dataset that primarily +captures the inlier distribution. Recent approaches typically use deep encoder +and decoder network frameworks to derive a reconstruction error, and employ +this error either to determine a novelty score, or as the basis for a one-class +classifier. In this research, we use a similar framework but with a lightweight +deep network, and we adopt a probabilistic score with reconstruction error. Our +methodology calculates the probability of whether the sample comes from the +inlier distribution or not. This work makes two key contributions. The first is +that we compute the novelty probability by linearizing the manifold that holds +the structure of the inlier distribution. This allows us to interpret how the +probability is distributed and can be determined in relation to the local +coordinates of the manifold tangent space. The second contribution is that we +improve the training protocol for the network. Our results indicate that our +approach is effective at learning the target class, and it outperforms recent +state-of-the-art methods on several benchmark datasets. + +
+
+ comment: Accepted at VISAPP 2024 +
+
+
+
+
+ + ☆ Study of the effect of Sharpness on Blind Video Quality Assessment + + +
+ Introduction: Video Quality Assessment (VQA) is an important area of study in the modern era, where video is a crucial component of communication with applications in every field. Rapid developments in mobile technology have enabled anyone to create videos, resulting in a wide range of video quality scenarios. Objectives: Though VQA has existed for some time through classical metrics like SSIM and PSNR, the advent of machine learning has brought in new VQA techniques built upon Convolutional Neural Networks (CNNs) or Deep Neural Networks (DNNs). Methods: Over the past years, various research studies, such as BVQA, which performed video quality assessment of nature-based videos using DNNs, have exposed the powerful capabilities of machine learning algorithms. BVQA using DNNs explored human visual system effects such as content dependency and time-related factors, normally known as temporal effects. Results: This study explores the effect of sharpness on models like BVQA. Sharpness is the measure of the clarity and detail of the video image; it typically involves analyzing the edges and contrast of the image to determine the overall level of detail. Conclusion: This study uses existing video quality databases such as CVD2014. A comparative study of machine learning metrics such as SRCC and PLCC during training and testing is presented, along with the conclusion.
+
+
+
+
+ + ☆ Music Recommendation Based on Facial Emotion Recognition + + +
+ Introduction: Music provides an incredible avenue for individuals to express their thoughts and emotions, while also serving as a delightful mode of entertainment for enthusiasts and music lovers. Objectives: This paper presents a comprehensive approach to enhancing the user experience through the integration of emotion recognition, music recommendation, and explainable AI using Grad-CAM. Methods: The proposed methodology utilizes a ResNet50 model trained on the Facial Expression Recognition (FER) dataset, consisting of real images of individuals expressing various emotions. Results: The system achieves an accuracy of 82% in emotion classification. By leveraging Grad-CAM, the model provides explanations for its predictions, allowing users to understand the reasoning behind the system's recommendations. The model is trained on both FER and real user datasets, which include labelled facial expressions and real images of individuals expressing various emotions. The training process involves pre-processing the input images, extracting features through convolutional layers, reasoning with dense layers, and generating emotion predictions through the output layer. Conclusion: The proposed methodology, leveraging the ResNet50 model with ROI-based analysis and explainable AI techniques, offers a robust and interpretable solution for facial emotion detection.
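The emotion-to-playlist flow can be illustrated with a small sketch: a ResNet50 classifier over a hypothetical seven-emotion label set mapped to playlists. The Grad-CAM explanation step and the actual FER training are omitted, and all label names and playlist mappings below are invented for illustration.

```python
import torch
import torch.nn as nn
from torchvision import models

# Hypothetical FER label set and genre mapping, for illustration only.
EMOTIONS = ["angry", "disgust", "fear", "happy", "sad", "surprise", "neutral"]
PLAYLISTS = {"happy": "upbeat pop", "sad": "soft acoustic", "angry": "calming lo-fi",
             "fear": "ambient", "disgust": "classical", "surprise": "dance",
             "neutral": "your daily mix"}

model = models.resnet50(weights=None)               # weights would come from FER training
model.fc = nn.Linear(model.fc.in_features, len(EMOTIONS))
model.eval()

def recommend(face: torch.Tensor) -> str:
    """face: (1, 3, 224, 224) pre-processed crop of the detected face ROI."""
    with torch.no_grad():
        emotion = EMOTIONS[model(face).argmax(1).item()]
    return f"Detected '{emotion}' -> recommended playlist: {PLAYLISTS[emotion]}"

print(recommend(torch.randn(1, 3, 224, 224)))
```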
+
+
+
+
+ + ☆ Deep Learning-Based Brain Image Segmentation for Automated Tumour + Detection + + +
+ Introduction: The present study focuses on the development and evaluation of an automated brain tumour segmentation technique based on deep learning using the 3D U-Net model. Objectives: The objective is to leverage state-of-the-art convolutional neural networks (CNNs) on a large dataset of brain MRI scans for segmentation. Methods: The proposed methodology applies pre-processing techniques for enhanced performance and generalizability. Results: Extensive validation on an independent dataset confirms the model's robustness and potential for integration into clinical workflows. The study emphasizes the importance of data pre-processing and explores various hyperparameters to optimize the model's performance. The 3D U-Net achieved IoUs of 0.8181 and 0.66 on the training and validation datasets, respectively. Conclusion: Ultimately, this comprehensive framework showcases the efficacy of deep learning in automating brain tumour detection, offering valuable support in clinical practice.
+
+
+
+
+ + ☆ PIE: Physics-inspired Low-light Enhancement + + +
+ In this paper, we propose a physics-inspired contrastive learning paradigm for low-light enhancement, called PIE. PIE primarily addresses three issues: (i) To resolve the problem that existing learning-based methods often train an LLE model with strictly pixel-corresponding image pairs, we eliminate the need for pixel-correspondence paired training data and instead train with unpaired images. (ii) To address the disregard for negative samples and the inadequacy of their generation in existing methods, we incorporate physics-inspired contrastive learning for LLE and design the Bag of Curves (BoC) method to generate more reasonable negative samples that closely adhere to the underlying physical imaging principle. (iii) To overcome the reliance on semantic ground truths in existing methods, we propose an unsupervised regional segmentation module that ensures regional brightness consistency while eliminating the dependency on semantic ground truths. Overall, the proposed PIE can effectively learn from unpaired positive/negative samples and smoothly realize non-semantic regional enhancement, which clearly differs from existing LLE efforts. Beyond the novel architecture of PIE, we explore its gains on downstream tasks such as semantic segmentation and face detection. Training on readily available open data and extensive experiments demonstrate that our method surpasses the state-of-the-art LLE models on six independent cross-scene datasets. PIE runs fast with reasonable GFLOPs at test time, making it easy to use on mobile devices.
+
+ comment: arXiv admin note: text overlap with arXiv:2112.06451 +
+
+
+
+
+ + ♻ ☆ Which One? Leveraging Context Between Objects and Multiple Views for + Language Grounding + + +
+ When connecting objects and their language referents in an embodied 3D +environment, it is important to note that: (1) an object can be better +characterized by leveraging comparative information between itself and other +objects, and (2) an object's appearance can vary with camera position. As such, +we present the Multi-view Approach to Grounding in Context (MAGiC), which +selects an object referent based on language that distinguishes between two +similar objects. By pragmatically reasoning over both objects and across +multiple views of those objects, MAGiC improves over the state-of-the-art model +on the SNARE object reference task with a relative error reduction of 12.9\% +(representing an absolute improvement of 2.7\%). Ablation studies show that +reasoning jointly over object referent candidates and multiple views of each +object both contribute to improved accuracy. Code: +https://github.com/rcorona/magic_snare/ + +
+
+
+
+
+ + ♻ ☆ Tensor-based Multimodal Learning for Prediction of Pulmonary Arterial + Wedge Pressure from Cardiac MRI + + +
+ Heart failure is a serious and life-threatening condition that can lead to +elevated pressure in the left ventricle. Pulmonary Arterial Wedge Pressure +(PAWP) is an important surrogate marker indicating high pressure in the left +ventricle. PAWP is determined by Right Heart Catheterization (RHC) but it is an +invasive procedure. A non-invasive method is useful in quickly identifying +high-risk patients from a large population. In this work, we develop a tensor +learning-based pipeline for identifying PAWP from multimodal cardiac Magnetic +Resonance Imaging (MRI). This pipeline extracts spatial and temporal features +from high-dimensional scans. For quality control, we incorporate an epistemic +uncertainty-based binning strategy to identify poor-quality training samples. +To improve the performance, we learn complementary information by integrating +features from multimodal data: cardiac MRI with short-axis and four-chamber +views, and Electronic Health Records. The experimental analysis on a large +cohort of $1346$ subjects who underwent the RHC procedure for PAWP estimation +indicates that the proposed pipeline has a diagnostic value and can produce +promising performance with significant improvement over the baseline in +clinical practice (i.e., $\Delta$AUC $=0.10$, $\Delta$Accuracy $=0.06$, and +$\Delta$MCC $=0.39$). The decision curve analysis further confirms the clinical +utility of our method. + +
+
+
+
+
+ + ♻ ☆ Adaptively Placed Multi-Grid Scene Representation Networks for + Large-Scale Data Visualization IEEE VIS 2023 + + +
+ Scene representation networks (SRNs) have been recently proposed for compression and visualization of scientific data. However, state-of-the-art SRNs do not adapt the allocation of available network parameters to the complex features found in scientific data, leading to a loss in reconstruction quality. We address this shortcoming with an adaptively placed multi-grid SRN (APMGSRN) and propose a domain decomposition training and inference technique for accelerated parallel training on multi-GPU systems. We also release an open-source neural volume rendering application that allows plug-and-play rendering with any PyTorch-based SRN. Our proposed APMGSRN architecture uses multiple spatially adaptive feature grids that learn where to be placed within the domain to dynamically allocate more neural network resources where error is high in the volume, improving the state-of-the-art reconstruction accuracy of SRNs for scientific data without requiring the expensive octree refining, pruning, and traversal of previous adaptive models. In our domain decomposition approach for representing large-scale data, we train a set of APMGSRNs in parallel on separate bricks of the volume to reduce training time, while avoiding the overhead of an out-of-core solution for volumes too large to fit in GPU memory. After training, the lightweight SRNs are used for real-time neural volume rendering in our open-source renderer, where arbitrary view angles and transfer functions can be explored. A copy of this paper, all code, all models used in our experiments, and all supplemental materials and videos are available at https://github.com/skywolf829/APMGSRN.
+
+ comment: Accepted to IEEE VIS 2023. + https://www.computer.org/csdl/journal/tg/2024/01/10297599/1RyYguiNBLO +
+
+
+
+
+ + ♻ ☆ Joint2Human: High-quality 3D Human Generation via Compact Spherical + Embedding of 3D Joints + + +
+ 3D human generation is increasingly significant in various applications. +However, the direct use of 2D generative methods in 3D generation often results +in losing local details, while methods that reconstruct geometry from generated +images struggle with global view consistency. In this work, we introduce +Joint2Human, a novel method that leverages 2D diffusion models to generate +detailed 3D human geometry directly, ensuring both global structure and local +details. To achieve this, we employ the Fourier occupancy field (FOF) +representation, enabling the direct generation of 3D shapes as preliminary +results with 2D generative models. With the proposed high-frequency enhancer +and the multi-view recarving strategy, our method can seamlessly integrate the +details from different views into a uniform global shape. To better utilize the +3D human prior and enhance control over the generated geometry, we introduce a +compact spherical embedding of 3D joints. This allows for an effective guidance +of pose during the generation process. Additionally, our method can generate 3D +humans guided by textual inputs. Our experimental results demonstrate the +capability of our method to ensure global structure, local details, high +resolution, and low computational cost simultaneously. More results and the +code can be found on our project page at +http://cic.tju.edu.cn/faculty/likun/projects/Joint2Human. + +
+
+
+
+
+ + ♻ ☆ Detection Is Tracking: Point Cloud Multi-Sweep Deep Learning Models + Revisited + + +
+ Conventional tracking paradigm takes in instantaneous measurements such as +range and bearing, and produces object tracks across time. In applications such +as autonomous driving, lidar measurements in the form of point clouds are +usually passed through a "virtual sensor" realized by a deep learning model, to +produce "measurements" such as bounding boxes, which are in turn ingested by a +tracking module to produce object tracks. Very often multiple lidar sweeps are +accumulated in a buffer to merge and become the input to the virtual sensor. We +argue in this paper that such an input already contains temporal information, +and therefore the virtual sensor output should also contain temporal +information, not just instantaneous values for the time corresponding to the +end of the buffer. In particular, we present the deep learning model called +MULti-Sweep PAired Detector (MULSPAD) that produces, for each detected object, +a pair of bounding boxes at both the end time and the beginning time of the +input buffer. This is achieved with fairly straightforward changes in commonly +used lidar detection models, and with only marginal extra processing, but the +resulting symmetry is satisfying. Such paired detections make it possible not +only to construct rudimentary trackers fairly easily, but also to construct +more sophisticated trackers that can exploit the extra information conveyed by +the pair and be robust to choices of motion models and object birth/death +models. We have conducted preliminary training and experimentation using Waymo +Open Dataset, which shows the efficacy of our proposed method. + +
+
+
+
+
+ + ♻ ☆ Learning Trimaps via Clicks for Image Matting + + +
+ Despite significant advancements in image matting, existing models heavily +depend on manually-drawn trimaps for accurate results in natural image +scenarios. However, the process of obtaining trimaps is time-consuming, lacking +user-friendliness and device compatibility. This reliance greatly limits the +practical application of all trimap-based matting methods. To address this +issue, we introduce Click2Trimap, an interactive model capable of predicting +high-quality trimaps and alpha mattes with minimal user click inputs. Through +analyzing real users' behavioral logic and characteristics of trimaps, we +successfully propose a powerful iterative three-class training strategy and a +dedicated simulation function, making Click2Trimap exhibit versatility across +various scenarios. Quantitative and qualitative assessments on synthetic and +real-world matting datasets demonstrate Click2Trimap's superior performance +compared to all existing trimap-free matting methods. Especially, in the user +study, Click2Trimap achieves high-quality trimap and matting predictions in +just an average of 5 seconds per image, demonstrating its substantial practical +value in real-world applications. + +
+
+
+
+
+ + ♻ ☆ Image Inpainting via Conditional Texture and Structure Dual Generation ICCV 2021 + + +
+ Deep generative approaches have recently made considerable progress in image +inpainting by introducing structure priors. Due to the lack of proper +interaction with image texture during structure reconstruction, however, +current solutions are incompetent in handling the cases with large corruptions, +and they generally suffer from distorted results. In this paper, we propose a +novel two-stream network for image inpainting, which models the +structure-constrained texture synthesis and texture-guided structure +reconstruction in a coupled manner so that they better leverage each other for +more plausible generation. Furthermore, to enhance the global consistency, a +Bi-directional Gated Feature Fusion (Bi-GFF) module is designed to exchange and +combine the structure and texture information and a Contextual Feature +Aggregation (CFA) module is developed to refine the generated contents by +region affinity learning and multi-scale feature aggregation. Qualitative and +quantitative experiments on the CelebA, Paris StreetView and Places2 datasets +demonstrate the superiority of the proposed method. Our code is available at +https://github.com/Xiefan-Guo/CTSDG. + +
+
+ comment: Accepted by ICCV 2021 +
+
+
+
+
+ + ♻ ☆ Geometry Transfer for Stylizing Radiance Fields CVPR 2024 + + +
+ Shape and geometric patterns are essential in defining stylistic identity. +However, current 3D style transfer methods predominantly focus on transferring +colors and textures, often overlooking geometric aspects. In this paper, we +introduce Geometry Transfer, a novel method that leverages geometric +deformation for 3D style transfer. This technique employs depth maps to extract +a style guide, subsequently applied to stylize the geometry of radiance fields. +Moreover, we propose new techniques that utilize geometric cues from the 3D +scene, thereby enhancing aesthetic expressiveness and more accurately +reflecting intended styles. Our extensive experiments show that Geometry +Transfer enables a broader and more expressive range of stylizations, thereby +significantly expanding the scope of 3D style transfer. + +
+
+ comment: CVPR 2024. Project page: https://hyblue.github.io/geo-srf/ +
+
+
+
+
+ + ♻ ☆ DiffSHEG: A Diffusion-Based Approach for Real-Time Speech-driven + Holistic 3D Expression and Gesture Generation CVPR 2024 + + +
+ We propose DiffSHEG, a Diffusion-based approach for Speech-driven Holistic 3D +Expression and Gesture generation with arbitrary length. While previous works +focused on co-speech gesture or expression generation individually, the joint +generation of synchronized expressions and gestures remains barely explored. To +address this, our diffusion-based co-speech motion generation transformer +enables uni-directional information flow from expression to gesture, +facilitating improved matching of joint expression-gesture distributions. +Furthermore, we introduce an outpainting-based sampling strategy for arbitrary +long sequence generation in diffusion models, offering flexibility and +computational efficiency. Our method provides a practical solution that +produces high-quality synchronized expression and gesture generation driven by +speech. Evaluated on two public datasets, our approach achieves +state-of-the-art performance both quantitatively and qualitatively. +Additionally, a user study confirms the superiority of DiffSHEG over prior +approaches. By enabling the real-time generation of expressive and synchronized +motions, DiffSHEG showcases its potential for various applications in the +development of digital humans and embodied agents. + +
+
+ comment: Accepted by CVPR 2024. Project page: + https://jeremycjm.github.io/proj/DiffSHEG +
+
+
+
+
+ + ♻ ☆ Optimizing Sparse Convolution on GPUs with CUDA for 3D Point Cloud + Processing in Embedded Systems + + +
+ In recent years, there has been a significant increase in the utilization of +deep learning methods, particularly convolutional neural networks (CNNs), which +have emerged as the dominant approach in various domains that involve +structured grid data, such as picture analysis and processing. Nevertheless, +the exponential growth in the utilization of LiDAR and 3D sensors across many +domains has resulted in an increased need for the analysis of 3D point clouds. +The utilization of 3D point clouds is crucial in various applications, +including object recognition and segmentation, as they offer a spatial +depiction of things within a three-dimensional environment. In contrast to +photos, point clouds exhibit sparsity and lack a regular grid, hence posing +distinct processing and computational issues. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ YOLOv8-AM: YOLOv8 with Attention Mechanisms for Pediatric Wrist Fracture + Detection + + +
+ Wrist trauma and even fractures occur frequently in daily life, particularly among children, who account for a significant proportion of fracture cases. Before performing surgery, surgeons often ask patients to undergo X-ray imaging first and prepare for the operation based on the radiologist's analysis. With the development of neural networks, You Only Look Once (YOLO) series models have been widely used in fracture detection as computer-assisted diagnosis (CAD). In 2023, Ultralytics presented the latest version of the YOLO models, which has been employed for detecting fractures across various parts of the body. The attention mechanism is one of the most widely used methods for improving model performance. This work proposes YOLOv8-AM, which incorporates attention mechanisms into the original YOLOv8 architecture. Specifically, we respectively employ four attention modules, Convolutional Block Attention Module (CBAM), Global Attention Mechanism (GAM), Efficient Channel Attention (ECA), and Shuffle Attention (SA), to design the improved models and train them on the GRAZPEDWRI-DX dataset. Experimental results demonstrate that the mean Average Precision at IoU 50 (mAP 50) of the YOLOv8-AM model based on ResBlock + CBAM (ResCBAM) increased from 63.6% to 65.8%, achieving state-of-the-art (SOTA) performance. Conversely, the YOLOv8-AM model incorporating GAM obtains an mAP 50 of 64.2%, which is not a satisfactory enhancement. We therefore combine ResBlock and GAM, introducing ResGAM, to design another YOLOv8-AM model, whose mAP 50 is increased to 65.0%. The implementation code for this study is available on GitHub at https://github.com/RuiyangJu/Fracture_Detection_Improved_YOLOv8.
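Of the four modules listed, CBAM has a compact, well-known form (channel attention followed by spatial attention); a standalone PyTorch sketch is shown below. Where such a block is inserted in the YOLOv8 backbone or neck is a design choice of the YOLOv8-AM paper and is not reproduced here.

```python
import torch
import torch.nn as nn

class CBAM(nn.Module):
    """Convolutional Block Attention Module (Woo et al.): channel attention
    followed by spatial attention."""

    def __init__(self, channels: int, reduction: int = 16, kernel_size: int = 7):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(channels, channels // reduction), nn.ReLU(),
            nn.Linear(channels // reduction, channels),
        )
        self.spatial = nn.Conv2d(2, 1, kernel_size, padding=kernel_size // 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, c, _, _ = x.shape
        # Channel attention from average- and max-pooled descriptors.
        avg = self.mlp(x.mean(dim=(2, 3)))
        mx = self.mlp(x.amax(dim=(2, 3)))
        x = x * torch.sigmoid(avg + mx).view(b, c, 1, 1)
        # Spatial attention from channel-wise mean and max maps.
        maps = torch.cat([x.mean(1, keepdim=True), x.amax(1, keepdim=True)], dim=1)
        return x * torch.sigmoid(self.spatial(maps))

print(CBAM(64)(torch.randn(1, 64, 40, 40)).shape)   # torch.Size([1, 64, 40, 40])
```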
+
+
+
+
+ + ♻ ☆ Template Free Reconstruction of Human-object Interaction with Procedural + Interaction Generation CVPR'24 + + +
+ Reconstructing human-object interaction in 3D from a single RGB image is a challenging task, and existing data-driven methods do not generalize beyond the objects present in carefully curated 3D interaction datasets. Capturing large-scale real data to learn strong interaction and 3D shape priors is very expensive due to the combinatorial nature of human-object interactions. In this paper, we propose ProciGen (Procedural interaction Generation), a method to procedurally generate datasets with both plausible interaction and diverse object variation. We generate 1M+ human-object interaction pairs in 3D and leverage this large-scale data to train our HDM (Hierarchical Diffusion Model), a novel method to reconstruct interacting humans and unseen objects without any templates. Our HDM is an image-conditioned diffusion model that learns both realistic interaction and highly accurate human and object shapes. Experiments show that our HDM trained with ProciGen significantly outperforms prior methods that require template meshes, and that our dataset allows training methods with strong generalization ability to unseen object instances. Our code and data are released.
+
+ comment: CVPR'24 camera ready version. 25 pages, 20 figures. Project page: + https://virtualhumans.mpi-inf.mpg.de/procigen-hdm +
+
+
+
+
+ + ♻ ☆ Grounding and Enhancing Grid-based Models for Neural Fields CVPR24 + + +
+ Many contemporary studies utilize grid-based models for neural field +representation, but a systematic analysis of grid-based models is still +missing, hindering the improvement of those models. Therefore, this paper +introduces a theoretical framework for grid-based models. This framework points +out that these models' approximation and generalization behaviors are +determined by grid tangent kernels (GTK), which are intrinsic properties of +grid-based models. The proposed framework facilitates a consistent and +systematic analysis of diverse grid-based models. Furthermore, the introduced +framework motivates the development of a novel grid-based model named the +Multiplicative Fourier Adaptive Grid (MulFAGrid). The numerical analysis +demonstrates that MulFAGrid exhibits a lower generalization bound than its +predecessors, indicating its robust generalization performance. Empirical +studies reveal that MulFAGrid achieves state-of-the-art performance in various +tasks, including 2D image fitting, 3D signed distance field (SDF) +reconstruction, and novel view synthesis, demonstrating superior representation +ability. The project website is available at +https://sites.google.com/view/cvpr24-2034-submission/home. + +
+
+ comment: Accepted in CVPR24 as an oral presentation. Pre-rebuttal scores: 555. + Post-rebuttal scores: 555 +
+
+
+
+
+ + ♻ ☆ Diff-Plugin: Revitalizing Details for Diffusion-based Low-level Tasks CVPR2024 + + +
+ Diffusion models trained on large-scale datasets have achieved remarkable +progress in image synthesis. However, due to the randomness in the diffusion +process, they often struggle with handling diverse low-level tasks that require +details preservation. To overcome this limitation, we present a new Diff-Plugin +framework to enable a single pre-trained diffusion model to generate +high-fidelity results across a variety of low-level tasks. Specifically, we +first propose a lightweight Task-Plugin module with a dual branch design to +provide task-specific priors, guiding the diffusion process in preserving image +content. We then propose a Plugin-Selector that can automatically select +different Task-Plugins based on the text instruction, allowing users to edit +images by indicating multiple low-level tasks with natural language. We conduct +extensive experiments on 8 low-level vision tasks. The results demonstrate the +superiority of Diff-Plugin over existing methods, particularly in real-world +scenarios. Our ablations further validate that Diff-Plugin is stable, +schedulable, and supports robust training across different dataset sizes. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ Comparing the Decision-Making Mechanisms by Transformers and CNNs via + Explanation Methods CVPR24 + + +
+ In order to gain insights into the decision-making of different visual recognition backbones, we propose two methodologies, sub-explanation counting and cross-testing, that systematically apply deep explanation algorithms on a dataset-wide basis and compare the statistics generated from the amount and nature of the explanations. These methodologies reveal differences among networks in terms of two properties, compositionality and disjunctivism. Transformers and ConvNeXt are found to be more compositional, in the sense that they jointly consider multiple parts of the image in building their decisions, whereas traditional CNNs and distilled transformers are less compositional and more disjunctive, meaning that they use multiple diverse but smaller sets of parts to achieve a confident prediction. Through further experiments, we pinpoint the choice of normalization as especially important for the compositionality of a model: batch normalization leads to less compositionality, while group and layer normalization lead to more. Finally, we also analyze the features shared by different backbones and plot a landscape of different models based on their feature-use similarity.
+
+ comment: 25 pages with 37 figures, to be published in CVPR24 +
+
+
+
+
+ + ♻ ☆ From Pixels to Graphs: Open-Vocabulary Scene Graph Generation with + Vision-Language Models CVPR 2024 + + +
+ Scene graph generation (SGG) aims to parse a visual scene into an +intermediate graph representation for downstream reasoning tasks. Despite +recent advancements, existing methods struggle to generate scene graphs with +novel visual relation concepts. To address this challenge, we introduce a new +open-vocabulary SGG framework based on sequence generation. Our framework +leverages vision-language pre-trained models (VLM) by incorporating an +image-to-graph generation paradigm. Specifically, we generate scene graph +sequences via image-to-text generation with VLM and then construct scene graphs +from these sequences. By doing so, we harness the strong capabilities of VLM +for open-vocabulary SGG and seamlessly integrate explicit relational modeling +for enhancing the VL tasks. Experimental results demonstrate that our design +not only achieves superior performance with an open vocabulary but also +enhances downstream vision-language task performance through explicit relation +modeling knowledge. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ A Universal Knowledge Embedded Contrastive Learning Framework for + Hyperspectral Image Classification + + +
+ Hyperspectral image (HSI) classification techniques have been intensively studied and a variety of models have been developed. However, these HSI classification models are confined to pocket models and unrealistic ways of partitioning datasets. The former limits the generalization performance of the model, and the latter leads to inflated evaluation metrics, which results in plummeting model performance in the real world. Therefore, we propose a universal knowledge-embedded contrastive learning framework (KnowCL) for supervised, unsupervised, and semi-supervised HSI classification, which largely closes the gap between pocket models and standard vision backbones for HSI classification. We present a new HSI processing pipeline in conjunction with a range of data transformation and augmentation techniques that provide diverse data representations and realistic data partitioning. The proposed framework based on this pipeline is compatible with all kinds of backbones and can fully exploit labeled and unlabeled samples within the expected training time. Furthermore, we design a new loss function that can adaptively fuse the supervised and unsupervised losses, enhancing learning performance. This proposed new classification paradigm shows great potential for advancing HSI classification technology. The code can be accessed at https://github.com/quanweiliu/KnowCL.
+
+
+
+
+ + ♻ ☆ HAPNet: Toward Superior RGB-Thermal Scene Parsing via Hybrid, + Asymmetric, and Progressive Heterogeneous Feature Fusion + + +
+ Data-fusion networks have shown significant promise for RGB-thermal scene +parsing. However, the majority of existing studies have relied on symmetric +duplex encoders for heterogeneous feature extraction and fusion, paying +inadequate attention to the inherent differences between RGB and thermal +modalities. Recent progress in vision foundation models (VFMs) trained through +self-supervision on vast amounts of unlabeled data has proven their ability to +extract informative, general-purpose features. However, this potential has yet +to be fully leveraged in the domain. In this study, we take one step toward +this new research area by exploring a feasible strategy to fully exploit VFM +features for RGB-thermal scene parsing. Specifically, we delve deeper into the +unique characteristics of RGB and thermal modalities, thereby designing a +hybrid, asymmetric encoder that incorporates both a VFM and a convolutional +neural network. This design allows for more effective extraction of +complementary heterogeneous features, which are subsequently fused in a +dual-path, progressive manner. Moreover, we introduce an auxiliary task to +further enrich the local semantics of the fused features, thereby improving the +overall performance of RGB-thermal scene parsing. Our proposed HAPNet, equipped +with all these components, demonstrates superior performance compared to all +other state-of-the-art RGB-thermal scene parsing networks, achieving top ranks +across three widely used public RGB-thermal scene parsing datasets. We believe +this new paradigm has opened up new opportunities for future developments in +data-fusion scene parsing approaches. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ SANeRF-HQ: Segment Anything for NeRF in High Quality CVPR 2024 + + +
+ Recently, the Segment Anything Model (SAM) has showcased remarkable +capabilities of zero-shot segmentation, while NeRF (Neural Radiance Fields) has +gained popularity as a method for various 3D problems beyond novel view +synthesis. Though there exist initial attempts to incorporate these two methods +into 3D segmentation, they face the challenge of accurately and consistently +segmenting objects in complex scenarios. In this paper, we introduce the +Segment Anything for NeRF in High Quality (SANeRF-HQ) to achieve high-quality +3D segmentation of any target object in a given scene. SANeRF-HQ utilizes SAM +for open-world object segmentation guided by user-supplied prompts, while +leveraging NeRF to aggregate information from different viewpoints. To overcome +the aforementioned challenges, we employ density field and RGB similarity to +enhance the accuracy of segmentation boundary during the aggregation. +Emphasizing on segmentation accuracy, we evaluate our method on multiple NeRF +datasets where high-quality ground-truths are available or manually annotated. +SANeRF-HQ shows a significant quality improvement over state-of-the-art methods +in NeRF object segmentation, provides higher flexibility for object +localization, and enables more consistent object segmentation across multiple +views. Results and code are available at the project site: +https://lyclyc52.github.io/SANeRF-HQ/. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ L2SR: Learning to Sample and Reconstruct for Accelerated MRI via + Reinforcement Learning + + +
+ Magnetic Resonance Imaging (MRI) is a widely used medical imaging technique, +but its long acquisition time can be a limiting factor in clinical settings. To +address this issue, researchers have been exploring ways to reduce the +acquisition time while maintaining the reconstruction quality. Previous works +have focused on finding either sparse samplers with a fixed reconstructor or +finding reconstructors with a fixed sampler. However, these approaches do not +fully utilize the potential of joint learning of samplers and reconstructors. +In this paper, we propose an alternating training framework for jointly +learning a good pair of samplers and reconstructors via deep reinforcement +learning (RL). In particular, we consider the process of MRI sampling as a +sampling trajectory controlled by a sampler, and introduce a novel +sparse-reward Partially Observed Markov Decision Process (POMDP) to formulate +the MRI sampling trajectory. Compared to the dense-reward POMDP used in +existing works, the proposed sparse-reward POMDP is more computationally +efficient and has a provable advantage. Moreover, the proposed framework, +called L2SR (Learning to Sample and Reconstruct), overcomes the training +mismatch problem that arises in previous methods that use dense-reward POMDP. +By alternately updating samplers and reconstructors, L2SR learns a pair of +samplers and reconstructors that achieve state-of-the-art reconstruction +performances on the fastMRI dataset. Codes are available at +\url{https://github.com/yangpuPKU/L2SR-Learning-to-Sample-and-Reconstruct}. + +
+
+
+
+
+ + ♻ ☆ Image Super-resolution Reconstruction Network based on Enhanced Swin + Transformer via Alternating Aggregation of Local-Global Features + + +
+ The Swin Transformer image super-resolution reconstruction network only +relies on the long-range relationship of window attention and shifted window +attention to explore features. This mechanism has two limitations. On the one +hand, it only focuses on global features while ignoring local features. On the +other hand, it is only concerned with spatial feature interactions while +ignoring channel features and channel interactions, thus limiting its +non-linear mapping ability. To address the above limitations, this paper +proposes enhanced Swin Transformer modules via alternating aggregation of +local-global features. In the local feature aggregation stage, we introduce a +shift convolution to realize the interaction between local spatial information +and channel information. Then, a block sparse global perception module is +introduced in the global feature aggregation stage. In this module, we +reorganize the spatial information first, then send the recombination +information into a dense layer to implement the global perception. After that, +a multi-scale self-attention module and a low-parameter residual channel +attention module are introduced to realize information aggregation at different +scales. Finally, the proposed network is validated on five publicly available +datasets. The experimental results show that the proposed network outperforms +the other state-of-the-art super-resolution networks. + +
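The shift convolution used for the local spatial-channel interaction can be sketched as shifting channel groups by one pixel in four directions and mixing them with a 1x1 convolution. This is a common formulation and may differ in details (for example, zero padding rather than wrap-around at borders) from the paper's module.

```python
import torch
import torch.nn as nn

class ShiftConv(nn.Module):
    """Shift four channel groups in four directions (one group stays put) and
    mix them with a 1x1 convolution, coupling local spatial and channel
    information with almost no extra parameters."""

    def __init__(self, channels: int):
        super().__init__()
        self.mix = nn.Conv2d(channels, channels, kernel_size=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        g = x.size(1) // 5
        out = x.clone()
        # torch.roll wraps at borders; a zero-padded shift is also common.
        out[:, 0*g:1*g] = torch.roll(x[:, 0*g:1*g], shifts=1, dims=2)    # down
        out[:, 1*g:2*g] = torch.roll(x[:, 1*g:2*g], shifts=-1, dims=2)   # up
        out[:, 2*g:3*g] = torch.roll(x[:, 2*g:3*g], shifts=1, dims=3)    # right
        out[:, 3*g:4*g] = torch.roll(x[:, 3*g:4*g], shifts=-1, dims=3)   # left
        return self.mix(out)                                             # channel mixing

print(ShiftConv(60)(torch.randn(1, 60, 48, 48)).shape)   # torch.Size([1, 60, 48, 48])
```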
+
+
+
+
+ + ♻ ☆ You Only Train Once: A Unified Framework for Both Full-Reference and + No-Reference Image Quality Assessment + + +
+ Although recent efforts in image quality assessment (IQA) have achieved +promising performance, there still exists a considerable gap compared to the +human visual system (HVS). One significant disparity lies in humans' seamless +transition between full reference (FR) and no reference (NR) tasks, whereas +existing models are constrained to either FR or NR tasks. This disparity +implies the necessity of designing two distinct systems, thereby greatly +diminishing the model's versatility. Therefore, our focus lies in unifying FR +and NR IQA under a single framework. Specifically, we first employ an encoder +to extract multi-level features from input images. Then a Hierarchical +Attention (HA) module is proposed as a universal adapter for both FR and NR +inputs to model the spatial distortion at each encoder stage. Furthermore, +considering that different distortions contaminate encoder stages and damage +image semantic meaning differently, a Semantic Distortion Aware (SDA) module is +proposed to examine feature correlations between shallow and deep layers of the +encoder. By adopting HA and SDA, the proposed network can effectively perform +both FR and NR IQA. When our proposed model is independently trained on NR or +FR IQA tasks, it outperforms existing models and achieves state-of-the-art +performance. Moreover, when trained jointly on NR and FR IQA tasks, it further +enhances the performance of NR IQA while achieving on-par performance in the +state-of-the-art FR IQA. You only train once to perform both IQA tasks. Code +will be released at: https://github.com/BarCodeReader/YOTO. + +
+
+
+
+
+ + ♻ ☆ Filtering Pixel Latent Variables for Unmixing Noisy and Undersampled + Volumetric Images + + +
+ The development of robust signal unmixing algorithms is essential for +leveraging multimodal datasets acquired through a wide array of scientific +imaging technologies, including hyperspectral or time-resolved acquisitions. In +experimental physics, enhancing the spatio-temporal resolution or expanding the +number of detection channels often leads to diminished sampling rate and +signal-to-noise ratio, significantly affecting the efficacy of signal unmixing +algorithms. We propose applying band-pass filters to the latent space of a +multi-dimensional convolutional neural network to disentangle overlapping +signal components, enabling the isolation and quantification of their +individual contributions. Using multi-dimensional convolution kernels to +process all dimensions simultaneously enhances the network's ability to extract +information from adjacent pixels, time- or spectral-bins. This approach enables +more effective separation of components in cases where individual pixels do not +provide clear, well-resolved information. We showcase the method's practical +use in experimental physics through two test cases that highlight the +versatility of our approach: fluorescence lifetime microscopy and mode +decomposition in optical fibers. The latent unmixing method extracts valuable +information from complex signals that cannot be resolved by standard methods. +Application of latent unmixing to real FLIM experiments will increase the +number of distinguishable fluorescent markers. It will also open new +possibilities in optics and photonics for multichannel separations at increased +sampling rate. + +
+
+ comment: 16 pages, 8 figures (main paper) + 18 pages, 9 figures (supplementary + material) +
+
+
+
+
+ + ♻ ☆ Open3DIS: Open-Vocabulary 3D Instance Segmentation with 2D Mask Guidance CVPR 2024 + + +
+ We introduce Open3DIS, a novel solution designed to tackle the problem of +Open-Vocabulary Instance Segmentation within 3D scenes. Objects within 3D +environments exhibit diverse shapes, scales, and colors, making precise +instance-level identification a challenging task. Recent advancements in +Open-Vocabulary scene understanding have made significant strides in this area +by employing class-agnostic 3D instance proposal networks for object +localization and learning queryable features for each 3D mask. While these +methods produce high-quality instance proposals, they struggle with identifying +small-scale and geometrically ambiguous objects. The key idea of our method is +a new module that aggregates 2D instance masks across frames and maps them to +geometrically coherent point cloud regions as high-quality object proposals +addressing the above limitations. These are then combined with 3D +class-agnostic instance proposals to include a wide range of objects in the +real world. To validate our approach, we conducted experiments on three +prominent datasets, including ScanNet200, S3DIS, and Replica, demonstrating +significant performance gains in segmenting objects with diverse categories +over the state-of-the-art approaches. + +
+
+ comment: CVPR 2024. Project page: https://open3dis.github.io/ +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 116 + +
+
+
+ + ☆ Sigma: Siamese Mamba Network for Multi-Modal Semantic Segmentation + + +
+ Multi-modal semantic segmentation significantly enhances AI agents' +perception and scene understanding, especially under adverse conditions like +low-light or overexposed environments. Leveraging additional modalities +(X-modality) like thermal and depth alongside traditional RGB provides +complementary information, enabling more robust and reliable segmentation. In +this work, we introduce Sigma, a Siamese Mamba network for multi-modal semantic +segmentation, utilizing the Selective Structured State Space Model, Mamba. +Unlike conventional methods that rely on CNNs, with their limited local +receptive fields, or Vision Transformers (ViTs), which offer global receptive +fields at the cost of quadratic complexity, our model achieves global receptive +fields coverage with linear complexity. By employing a Siamese encoder and +innovating a Mamba fusion mechanism, we effectively select essential +information from different modalities. A decoder is then developed to enhance +the channel-wise modeling ability of the model. Our method, Sigma, is +rigorously evaluated on both RGB-Thermal and RGB-Depth segmentation tasks, +demonstrating its superiority and marking the first successful application of +State Space Models (SSMs) in multi-modal perception tasks. Code is available at +https://github.com/zifuwan/Sigma. + +
+
+
+
+
+ + ☆ Watermark-based Detection and Attribution of AI-Generated Content + + +
+ Several companies--such as Google, Microsoft, and OpenAI--have deployed +techniques to watermark AI-generated content to enable proactive detection. +However, existing literature mainly focuses on user-agnostic detection. +Attribution aims to further trace back the user of a generative-AI service who +generated a given content detected as AI-generated. Despite its growing +importance, attribution is largely unexplored. In this work, we aim to bridge +this gap by providing the first systematic study on watermark-based, user-aware +detection and attribution of AI-generated content. Specifically, we +theoretically study the detection and attribution performance via rigorous +probabilistic analysis. Moreover, we develop an efficient algorithm to select +watermarks for the users to enhance attribution performance. Both our +theoretical and empirical results show that watermark-based detection and +attribution inherit the accuracy and (non-)robustness properties of the +watermarking method. + +
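+ As a rough illustration of the user-aware setting described above (not the
+ paper's algorithm; the 64-bit watermark length, the bitwise-accuracy rule, and
+ the 0.9 threshold are assumptions), the sketch below matches a decoded
+ watermark against per-user watermarks and attributes the content to the
+ closest user when the match clears a detection threshold:
+
+ import numpy as np
+
+ def bitwise_accuracy(decoded: np.ndarray, reference: np.ndarray) -> float:
+     """Fraction of matching bits between two binary watermark vectors."""
+     return float(np.mean(decoded == reference))
+
+ def detect_and_attribute(decoded_bits, user_watermarks, threshold=0.9):
+     """Simple bitwise-accuracy rule (illustrative, not the paper's method).
+     user_watermarks maps user id -> bit vector assigned to that user."""
+     best_user, best_acc = None, 0.0
+     for user, wm in user_watermarks.items():
+         acc = bitwise_accuracy(decoded_bits, wm)
+         if acc > best_acc:
+             best_user, best_acc = user, acc
+     if best_acc >= threshold:          # detected as watermarked (AI-generated)
+         return True, best_user         # attribute to the closest user watermark
+     return False, None
+
+ # toy usage with random 64-bit watermarks
+ rng = np.random.default_rng(0)
+ users = {f"user_{i}": rng.integers(0, 2, 64) for i in range(5)}
+ noisy = users["user_3"].copy()
+ noisy[:4] ^= 1                         # flip a few bits to mimic post-processing
+ print(detect_and_attribute(noisy, users))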
+
+
+
+
+ + ☆ Who Evaluates the Evaluations? Objectively Scoring Text-to-Image Prompt + Coherence Metrics with T2IScoreScore (TS2) + + +
+ With advances in the quality of text-to-image (T2I) models has come interest
+in benchmarking their prompt faithfulness, that is, the semantic coherence of
+generated images to the prompts they were conditioned on. A variety of T2I
+faithfulness metrics have been proposed, leveraging advances in cross-modal
+embeddings and vision-language models (VLMs). However, these metrics are not
+rigorously compared and benchmarked, instead being presented against a few weak
+baselines via correlation with human Likert scores over a set of
+easy-to-discriminate images.
+ We introduce T2IScoreScore (TS2), a curated set of semantic error graphs
+containing a prompt and a set of increasingly erroneous images. These allow us
+to rigorously judge whether a given prompt faithfulness metric can correctly
+order images with respect to their objective error count and significantly
+discriminate between different error nodes, using meta-metric scores derived
+from established statistical tests. Surprisingly, we find that the
+state-of-the-art VLM-based metrics (e.g., TIFA, DSG, LLMScore, VIEScore) we
+tested fail to significantly outperform simple feature-based metrics like
+CLIPScore, particularly on a hard subset of naturally-occurring T2I model
+errors. TS2 will enable the development of better T2I prompt faithfulness
+metrics through more rigorous comparison of their conformity to expected
+orderings and separations under objective criteria.
+
+
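+ A hedged sketch of the kind of ordering check such a meta-metric formalizes:
+ here plain Spearman rank correlation between a faithfulness metric's scores
+ and the objective error counts (the error counts and scores below are made up,
+ and the actual TS2 benchmark uses additional statistical tests not reproduced
+ here):
+
+ from scipy.stats import spearmanr
+
+ # hypothetical faithfulness scores for images ordered by increasing error count
+ error_counts  = [0, 1, 2, 3, 4, 5]                     # objective errors per image
+ metric_scores = [0.92, 0.88, 0.90, 0.55, 0.40, 0.35]   # higher = more faithful
+
+ # a good metric should decrease as errors accumulate -> strong negative rank correlation
+ rho, pval = spearmanr(error_counts, metric_scores)
+ print(f"Spearman rho = {rho:.3f} (p = {pval:.3f})")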
+
+ comment: 15 pages main, 9 pages appendices, 16 figures, 3 tables +
+
+
+
+
+ + ☆ Evaluating Adversarial Robustness: A Comparison Of FGSM, Carlini-Wagner + Attacks, And The Role of Distillation as Defense Mechanism + + +
+ This technical report delves into an in-depth exploration of adversarial +attacks specifically targeted at Deep Neural Networks (DNNs) utilized for image +classification. The study also investigates defense mechanisms aimed at +bolstering the robustness of machine learning models. The research focuses on +comprehending the ramifications of two prominent attack methodologies: the Fast +Gradient Sign Method (FGSM) and the Carlini-Wagner (CW) approach. These attacks +are examined concerning three pre-trained image classifiers: Resnext50_32x4d, +DenseNet-201, and VGG-19, utilizing the Tiny-ImageNet dataset. Furthermore, the +study proposes the robustness of defensive distillation as a defense mechanism +to counter FGSM and CW attacks. This defense mechanism is evaluated using the +CIFAR-10 dataset, where CNN models, specifically resnet101 and Resnext50_32x4d, +serve as the teacher and student models, respectively. The proposed defensive +distillation model exhibits effectiveness in thwarting attacks such as FGSM. +However, it is noted to remain susceptible to more sophisticated techniques +like the CW attack. The document presents a meticulous validation of the +proposed scheme. It provides detailed and comprehensive results, elucidating +the efficacy and limitations of the defense mechanisms employed. Through +rigorous experimentation and analysis, the study offers insights into the +dynamics of adversarial attacks on DNNs, as well as the effectiveness of +defensive strategies in mitigating their impact. + +
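+ For reference, FGSM is the standard single-step attack x_adv = x + eps *
+ sign(grad_x L(x, y)); a minimal PyTorch sketch, with the epsilon value and the
+ [0, 1] pixel range as assumptions rather than the report's exact setup:
+
+ import torch
+ import torch.nn.functional as F
+
+ def fgsm_attack(model, images, labels, eps=8 / 255):
+     """Standard Fast Gradient Sign Method: one signed-gradient step."""
+     images = images.clone().detach().requires_grad_(True)
+     loss = F.cross_entropy(model(images), labels)
+     loss.backward()
+     adv = images + eps * images.grad.sign()
+     return adv.clamp(0, 1).detach()     # keep pixels in a valid range
+
+ # usage (any classifier over [0, 1] images):
+ # adv_images = fgsm_attack(model, images, labels, eps=8/255)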
+
+ comment: This report pertains to the Capstone Project done by Group 1 of the
+ Fall batch of 2023 students at Praxis Tech School, Kolkata, India. The
+ report consists of 35 pages and includes 15 figures and 10 tables. This
+ is the preprint which will be submitted to an IEEE international
+ conference for review
+
+
+
+
+ + ☆ DiffOp-net: A Differential Operator-based Fully Convolutional Network + for Unsupervised Deformable Image Registration + + +
+ Existing unsupervised deformable image registration methods usually rely on +metrics applied to the gradients of predicted displacement or velocity fields +as a regularization term to ensure transformation smoothness, which potentially +limits registration accuracy. In this study, we propose a novel approach to +enhance unsupervised deformable image registration by introducing a new +differential operator into the registration framework. This operator, acting on +the velocity field and mapping it to a dual space, ensures the smoothness of +the velocity field during optimization, facilitating accurate deformable +registration. In addition, to tackle the challenge of capturing large +deformations inside image pairs, we introduce a Cross-Coordinate Attention +module (CCA) and embed it into a proposed Fully Convolutional Networks +(FCNs)-based multi-resolution registration architecture. Evaluation experiments +are conducted on two magnetic resonance imaging (MRI) datasets. Compared to +various state-of-the-art registration approaches, including a traditional +algorithm and three representative unsupervised learning-based methods, our +method achieves superior accuracies, maintaining desirable diffeomorphic +properties, and exhibiting promising registration speed. + +
+
+
+
+
+ + ☆ Identity Decoupling for Multi-Subject Personalization of Text-to-Image + Models + + +
+ Text-to-image diffusion models have shown remarkable success in generating a +personalized subject based on a few reference images. However, current methods +struggle with handling multiple subjects simultaneously, often resulting in +mixed identities with combined attributes from different subjects. In this +work, we present MuDI, a novel framework that enables multi-subject +personalization by effectively decoupling identities from multiple subjects. +Our main idea is to utilize segmented subjects generated by the Segment +Anything Model for both training and inference, as a form of data augmentation +for training and initialization for the generation process. Our experiments +demonstrate that MuDI can produce high-quality personalized images without +identity mixing, even for highly similar subjects as shown in Figure 1. In +human evaluation, MuDI shows twice as many successes for personalizing multiple +subjects without identity mixing over existing baselines and is preferred over +70% compared to the strongest baseline. More results are available at +https://mudi-t2i.github.io/. + +
+
+ comment: Preprint. Project page: https://mudi-t2i.github.io/ +
+
+
+
+
+ + ☆ Physical Property Understanding from Language-Embedded Feature Fields CVPR 2024 + + +
+ Can computers perceive the physical properties of objects solely through +vision? Research in cognitive science and vision science has shown that humans +excel at identifying materials and estimating their physical properties based +purely on visual appearance. In this paper, we present a novel approach for +dense prediction of the physical properties of objects using a collection of +images. Inspired by how humans reason about physics through vision, we leverage +large language models to propose candidate materials for each object. We then +construct a language-embedded point cloud and estimate the physical properties +of each 3D point using a zero-shot kernel regression approach. Our method is +accurate, annotation-free, and applicable to any object in the open world. +Experiments demonstrate the effectiveness of the proposed approach in various +physical property reasoning tasks, such as estimating the mass of common +objects, as well as other properties like friction and hardness. + +
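+ The zero-shot kernel regression step can be illustrated with a generic
+ Nadaraya-Watson estimator over feature similarities (a sketch only; the
+ paper's language-embedded features, kernel, and temperature are not
+ reproduced, and the names below are placeholders):
+
+ import numpy as np
+
+ def kernel_regression(query_feats, support_feats, support_values, temperature=0.1):
+     """Nadaraya-Watson estimate: each query point gets a similarity-weighted
+     average of the candidate property values (e.g., density, friction)."""
+     sims = query_feats @ support_feats.T            # (Q, S) feature similarities
+     weights = np.exp(sims / temperature)            # kernel weights (illustrative)
+     weights /= weights.sum(axis=1, keepdims=True)
+     return weights @ support_values                 # (Q,) predicted property per point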
+
+ comment: CVPR 2024. Project page (with code): + https://ajzhai.github.io/NeRF2Physics/ +
+
+
+
+
+ + ☆ Image-Text Co-Decomposition for Text-Supervised Semantic Segmentation CVPR 2024 + + +
+ This paper addresses text-supervised semantic segmentation, aiming to learn a +model capable of segmenting arbitrary visual concepts within images by using +only image-text pairs without dense annotations. Existing methods have +demonstrated that contrastive learning on image-text pairs effectively aligns +visual segments with the meanings of texts. We notice that there is a +discrepancy between text alignment and semantic segmentation: A text often +consists of multiple semantic concepts, whereas semantic segmentation strives +to create semantically homogeneous segments. To address this issue, we propose +a novel framework, Image-Text Co-Decomposition (CoDe), where the paired image +and text are jointly decomposed into a set of image regions and a set of word +segments, respectively, and contrastive learning is developed to enforce +region-word alignment. To work with a vision-language model, we present a +prompt learning mechanism that derives an extra representation to highlight an +image segment or a word segment of interest, with which more effective features +can be extracted from that segment. Comprehensive experimental results +demonstrate that our method performs favorably against existing text-supervised +semantic segmentation methods on six benchmark datasets. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Robust Gaussian Splatting + + +
+ In this paper, we address common error sources for 3D Gaussian Splatting +(3DGS) including blur, imperfect camera poses, and color inconsistencies, with +the goal of improving its robustness for practical applications like +reconstructions from handheld phone captures. Our main contribution involves +modeling motion blur as a Gaussian distribution over camera poses, allowing us +to address both camera pose refinement and motion blur correction in a unified +way. Additionally, we propose mechanisms for defocus blur compensation and for +addressing color in-consistencies caused by ambient light, shadows, or due to +camera-related factors like varying white balancing settings. Our proposed +solutions integrate in a seamless way with the 3DGS formulation while +maintaining its benefits in terms of training efficiency and rendering speed. +We experimentally validate our contributions on relevant benchmark datasets +including Scannet++ and Deblur-NeRF, obtaining state-of-the-art results and +thus consistent improvements over relevant baselines. + +
+
+
+
+
+ + ☆ Deep-learning Segmentation of Small Volumes in CT images for + Radiotherapy Treatment Planning + + +
+ Our understanding of organs at risk is progressing to include physically small
+tissues such as coronary arteries, and the radiosensitivities of many small
+organs and tissues are high. Therefore, the accurate segmentation of small
+volumes in external radiotherapy is crucial to protect them from
+over-irradiation. Moreover, with the development of particle therapy and
+on-board imaging, the treatment becomes more accurate and precise. The purpose
+of this work is to optimize organ segmentation algorithms for small organs. We
+used 50 three-dimensional (3-D) computed tomography (CT) head and neck images
+from the StructSeg2019 challenge to develop a general-purpose V-Net model to
+segment 20 organs in the head and neck region. We applied specific strategies
+to improve the segmentation accuracy of the small volumes in this anatomical
+region, i.e., the lens of the eye. Then, we used 17 additional head images from
+OSF healthcare to validate the robustness of the V-Net model optimized for
+small-volume segmentation. In our study of the StructSeg2019 images, we found
+that the optimization of the image normalization range and classification
+threshold yielded a segmentation improvement of the lens of the eye of
+approximately 50%, compared to the use of the V-Net not optimized for small
+volumes. We used the optimized model to segment 17 images acquired using
+heterogeneous protocols. We obtained comparable Dice coefficient values for the
+clinical and StructSeg2019 images (0.61 plus/minus 0.07 and 0.58 plus/minus
+0.10 for the left and right lens of the eye, respectively).
+
+
+
+
+
+
+ + ☆ SCAResNet: A ResNet Variant Optimized for Tiny Object Detection in + Transmission and Distribution Towers + + +
+ Traditional deep learning-based object detection networks often resize images +during the data preprocessing stage to achieve a uniform size and scale in the +feature map. Resizing is done to facilitate model propagation and fully +connected classification. However, resizing inevitably leads to object +deformation and loss of valuable information in the images. This drawback +becomes particularly pronounced for tiny objects like distribution towers with +linear shapes and few pixels. To address this issue, we propose abandoning the +resizing operation. Instead, we introduce Positional-Encoding Multi-head +Criss-Cross Attention. This allows the model to capture contextual information +and learn from multiple representation subspaces, effectively enriching the +semantics of distribution towers. Additionally, we enhance Spatial Pyramid +Pooling by reshaping three pooled feature maps into a new unified one while +also reducing the computational burden. This approach allows images of +different sizes and scales to generate feature maps with uniform dimensions and +can be employed in feature map propagation. Our SCAResNet incorporates these +aforementioned improvements into the backbone network ResNet. We evaluated our +SCAResNet using the Electric Transmission and Distribution Infrastructure +Imagery dataset from Duke University. Without any additional tricks, we +employed various object detection models with Gaussian Receptive Field based +Label Assignment as the baseline. When incorporating the SCAResNet into the +baseline model, we achieved a 2.1% improvement in mAPs. This demonstrates the +advantages of our SCAResNet in detecting transmission and distribution towers +and its value in tiny object detection. The source code is available at +https://github.com/LisavilaLee/SCAResNet_mmdet. + +
+
+
+
+
+ + ☆ Noisy Label Processing for Classification: A Survey + + +
+ In recent years, deep neural networks (DNNs) have gained remarkable +achievement in computer vision tasks, and the success of DNNs often depends +greatly on the richness of data. However, the acquisition process of data and +high-quality ground truth requires a lot of manpower and money. In the long, +tedious process of data annotation, annotators are prone to make mistakes, +resulting in incorrect labels of images, i.e., noisy labels. The emergence of +noisy labels is inevitable. Moreover, since research shows that DNNs can easily +fit noisy labels, the existence of noisy labels will cause significant damage +to the model training process. Therefore, it is crucial to combat noisy labels +for computer vision tasks, especially for classification tasks. In this survey, +we first comprehensively review the evolution of different deep learning +approaches for noisy label combating in the image classification task. In +addition, we also review different noise patterns that have been proposed to +design robust algorithms. Furthermore, we explore the inner pattern of +real-world label noise and propose an algorithm to generate a synthetic label +noise pattern guided by real-world data. We test the algorithm on the +well-known real-world dataset CIFAR-10N to form a new real-world data-guided +synthetic benchmark and evaluate some typical noise-robust methods on the +benchmark. + +
+
+
+
+
+ + ☆ MarsSeg: Mars Surface Semantic Segmentation with Multi-level Extractor + and Connector + + +
+ The segmentation and interpretation of the Martian surface play a pivotal
+role in Mars exploration, providing essential data for the trajectory planning
+and obstacle avoidance of rovers. However, the complex topography, similar
+surface features, and the lack of extensive annotated data pose significant
+challenges to the high-precision semantic segmentation of the Martian surface.
+To address these challenges, we propose a novel encoder-decoder based Mars
+segmentation network, termed MarsSeg. Specifically, we employ an
+encoder-decoder structure with a minimized number of down-sampling layers to
+preserve local details. To facilitate a high-level semantic understanding
+across the shallow multi-level feature maps, we introduce a feature enhancement
+connection layer situated between the encoder and decoder. This layer
+incorporates Mini Atrous Spatial Pyramid Pooling (Mini-ASPP), Polarized
+Self-Attention (PSA), and a Strip Pyramid Pooling Module (SPPM). The Mini-ASPP
+and PSA are specifically designed for shallow feature enhancement, thereby
+enabling the expression of local details and small objects. Conversely, the
+SPPM is employed for deep feature enhancement, facilitating the extraction of
+high-level semantic category-related information. Experimental results derived
+from the Mars-Seg and AI4Mars datasets substantiate that the proposed MarsSeg
+outperforms other state-of-the-art methods in segmentation performance,
+validating the efficacy of each proposed component.
+
+
+
+
+
+
+ + ☆ Improving Detection in Aerial Images by Capturing Inter-Object + Relationships + + +
+ In many image domains, the spatial distribution of objects in a scene +exhibits meaningful patterns governed by their semantic relationships. In most +modern detection pipelines, however, the detection proposals are processed +independently, overlooking the underlying relationships between objects. In +this work, we introduce a transformer-based approach to capture these +inter-object relationships to refine classification and regression outcomes for +detected objects. Building on two-stage detectors, we tokenize the region of +interest (RoI) proposals to be processed by a transformer encoder. Specific +spatial and geometric relations are incorporated into the attention weights and +adaptively modulated and regularized. Experimental results demonstrate that the +proposed method achieves consistent performance improvement on three benchmarks +including DOTA-v1.0, DOTA-v1.5, and HRSC 2016, especially ranking first on both +DOTA-v1.5 and HRSC 2016. Specifically, our new method has an increase of 1.59 +mAP on DOTA-v1.0, 4.88 mAP on DOTA-v1.5, and 2.1 mAP on HRSC 2016, +respectively, compared to the baselines. + +
+
+
+
+
+ + ☆ 3D Facial Expressions through Analysis-by-Neural-Synthesis + + +
+ While existing methods for 3D face reconstruction from in-the-wild images +excel at recovering the overall face shape, they commonly miss subtle, extreme, +asymmetric, or rarely observed expressions. We improve upon these methods with +SMIRK (Spatial Modeling for Image-based Reconstruction of Kinesics), which +faithfully reconstructs expressive 3D faces from images. We identify two key +limitations in existing methods: shortcomings in their self-supervised training +formulation, and a lack of expression diversity in the training images. For +training, most methods employ differentiable rendering to compare a predicted +face mesh with the input image, along with a plethora of additional loss +functions. This differentiable rendering loss not only has to provide +supervision to optimize for 3D face geometry, camera, albedo, and lighting, +which is an ill-posed optimization problem, but the domain gap between +rendering and input image further hinders the learning process. Instead, SMIRK +replaces the differentiable rendering with a neural rendering module that, +given the rendered predicted mesh geometry, and sparsely sampled pixels of the +input image, generates a face image. As the neural rendering gets color +information from sampled image pixels, supervising with neural rendering-based +reconstruction loss can focus solely on the geometry. Further, it enables us to +generate images of the input identity with varying expressions while training. +These are then utilized as input to the reconstruction model and used as +supervision with ground truth geometry. This effectively augments the training +data and enhances the generalization for diverse expressions. Our qualitative, +quantitative and particularly our perceptual evaluations demonstrate that SMIRK +achieves the new state-of-the art performance on accurate expression +reconstruction. Project webpage: https://georgeretsi.github.io/smirk/. + +
+
+
+
+
+ + ☆ Dynamic Prompt Optimizing for Text-to-Image Generation CVPR 2024 + + +
+ Text-to-image generative models, specifically those based on diffusion models +like Imagen and Stable Diffusion, have made substantial advancements. Recently, +there has been a surge of interest in the delicate refinement of text prompts. +Users assign weights or alter the injection time steps of certain words in the +text prompts to improve the quality of generated images. However, the success +of fine-control prompts depends on the accuracy of the text prompts and the +careful selection of weights and time steps, which requires significant manual +intervention. To address this, we introduce the \textbf{P}rompt +\textbf{A}uto-\textbf{E}diting (PAE) method. Besides refining the original +prompts for image generation, we further employ an online reinforcement +learning strategy to explore the weights and injection time steps of each word, +leading to the dynamic fine-control prompts. The reward function during +training encourages the model to consider aesthetic score, semantic +consistency, and user preferences. Experimental results demonstrate that our +proposed method effectively improves the original prompts, generating visually +more appealing images while maintaining semantic alignment. Code is available +at https://github.com/Mowenyii/PAE. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Label Propagation for Zero-shot Classification with Vision-Language + Models CVPR 2024 + + +
+ Vision-Language Models (VLMs) have demonstrated impressive performance on +zero-shot classification, i.e. classification when provided merely with a list +of class names. In this paper, we tackle the case of zero-shot classification +in the presence of unlabeled data. We leverage the graph structure of the +unlabeled data and introduce ZLaP, a method based on label propagation (LP) +that utilizes geodesic distances for classification. We tailor LP to graphs +containing both text and image features and further propose an efficient method +for performing inductive inference based on a dual solution and a +sparsification step. We perform extensive experiments to evaluate the +effectiveness of our method on 14 common datasets and show that ZLaP +outperforms the latest related works. Code: +https://github.com/vladan-stojnic/ZLaP + +
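+ For intuition, the textbook label-propagation iteration that methods like
+ ZLaP build on looks as follows (the Zhou-style update on a generic affinity
+ graph, not the paper's geodesic-distance or dual-solution machinery; alpha,
+ the iteration count, and the graph construction are assumptions):
+
+ import numpy as np
+
+ def label_propagation(W, Y, alpha=0.99, iters=50):
+     """Zhou-style propagation: F <- alpha * S @ F + (1 - alpha) * Y,
+     where S is the symmetrically normalized affinity matrix."""
+     d = W.sum(axis=1)
+     d_inv_sqrt = np.where(d > 0, 1.0 / np.sqrt(d), 0.0)
+     S = d_inv_sqrt[:, None] * W * d_inv_sqrt[None, :]
+     F = Y.copy().astype(float)
+     for _ in range(iters):
+         F = alpha * S @ F + (1 - alpha) * Y
+     return F.argmax(axis=1)             # predicted class per node
+
+ # W: n x n affinity matrix (e.g., similarities between image/text features)
+ # Y: n x c one-hot matrix; rows for unlabeled images are all zeros, rows for
+ #    class-name text embeddings carry their class label.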
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Score identity Distillation: Exponentially Fast Distillation of + Pretrained Diffusion Models for One-Step Generation + + +
+ We introduce Score identity Distillation (SiD), an innovative data-free +method that distills the generative capabilities of pretrained diffusion models +into a single-step generator. SiD not only facilitates an exponentially fast +reduction in Fr\'echet inception distance (FID) during distillation but also +approaches or even exceeds the FID performance of the original teacher +diffusion models. By reformulating forward diffusion processes as semi-implicit +distributions, we leverage three score-related identities to create an +innovative loss mechanism. This mechanism achieves rapid FID reduction by +training the generator using its own synthesized images, eliminating the need +for real data or reverse-diffusion-based generation, all accomplished within +significantly shortened generation time. Upon evaluation across four benchmark +datasets, the SiD algorithm demonstrates high iteration efficiency during +distillation and surpasses competing distillation approaches, whether they are +one-step or few-step, data-free, or dependent on training data, in terms of +generation quality. This achievement not only redefines the benchmarks for +efficiency and effectiveness in diffusion distillation but also in the broader +field of diffusion-based generation. Our PyTorch implementation will be +publicly accessible on GitHub. + +
+
+
+
+
+ + ☆ No Time to Train: Empowering Non-Parametric Networks for Few-shot 3D + Scene Segmentation CVPR + + +
+ To reduce the reliance on large-scale datasets, recent works in 3D +segmentation resort to few-shot learning. Current 3D few-shot segmentation +methods first pre-train models on 'seen' classes, and then evaluate their +generalization performance on 'unseen' classes. However, the prior pre-training +stage not only introduces excessive time overhead but also incurs a significant +domain gap on 'unseen' classes. To tackle these issues, we propose a +Non-parametric Network for few-shot 3D Segmentation, Seg-NN, and its Parametric +variant, Seg-PN. Without training, Seg-NN extracts dense representations by +hand-crafted filters and achieves comparable performance to existing parametric +models. Due to the elimination of pre-training, Seg-NN can alleviate the domain +gap issue and save a substantial amount of time. Based on Seg-NN, Seg-PN only +requires training a lightweight QUEry-Support Transferring (QUEST) module, +which enhances the interaction between the support set and query set. +Experiments suggest that Seg-PN outperforms previous state-of-the-art method by ++4.19% and +7.71% mIoU on S3DIS and ScanNet datasets respectively, while +reducing training time by -90%, indicating its effectiveness and efficiency. + +
+
+ comment: CVPR Highlight. Code is available at + https://github.com/yangyangyang127/Seg-NN. arXiv admin note: text overlap + with arXiv:2308.12961 +
+
+
+
+
+ + ☆ Dynamic Risk Assessment Methodology with an LDM-based System for Parking + Scenarios + + +
+ This paper describes the methodology for building a dynamic risk assessment +for ADAS (Advanced Driving Assistance Systems) algorithms in parking scenarios, +fusing exterior and interior perception for a better understanding of the scene +and a more comprehensive risk estimation. This includes the definition of a +dynamic risk methodology that depends on the situation from inside and outside +the vehicle, the creation of a multi-sensor dataset of risk assessment for ADAS +benchmarking purposes, and a Local Dynamic Map (LDM) that fuses data from the +exterior and interior of the car to build an LDM-based Dynamic Risk Assessment +System (DRAS). + +
+
+
+
+
+ + ☆ InstructHumans: Editing Animated 3D Human Textures with Instructions + + +
+ We present InstructHumans, a novel framework for instruction-driven 3D human +texture editing. Existing text-based editing methods use Score Distillation +Sampling (SDS) to distill guidance from generative models. This work shows that +naively using such scores is harmful to editing as they destroy consistency +with the source avatar. Instead, we propose an alternate SDS for Editing +(SDS-E) that selectively incorporates subterms of SDS across diffusion +timesteps. We further enhance SDS-E with spatial smoothness regularization and +gradient-based viewpoint sampling to achieve high-quality edits with sharp and +high-fidelity detailing. InstructHumans significantly outperforms existing 3D +editing methods, consistent with the initial avatar while faithful to the +textual instructions. Project page: https://jyzhu.top/instruct-humans . + +
+
+ comment: Project Page: https://jyzhu.top/instruct-humans +
+
+
+
+
+ + ☆ MM-Gaussian: 3D Gaussian-based Multi-modal Fusion for Localization and + Reconstruction in Unbounded Scenes + + +
+ Localization and mapping are critical tasks for various applications such as +autonomous vehicles and robotics. The challenges posed by outdoor environments +present particular complexities due to their unbounded characteristics. In this +work, we present MM-Gaussian, a LiDAR-camera multi-modal fusion system for +localization and mapping in unbounded scenes. Our approach is inspired by the +recently developed 3D Gaussians, which demonstrate remarkable capabilities in +achieving high rendering quality and fast rendering speed. Specifically, our +system fully utilizes the geometric structure information provided by +solid-state LiDAR to address the problem of inaccurate depth encountered when +relying solely on visual solutions in unbounded, outdoor scenarios. +Additionally, we utilize 3D Gaussian point clouds, with the assistance of +pixel-level gradient descent, to fully exploit the color information in photos, +thereby achieving realistic rendering effects. To further bolster the +robustness of our system, we designed a relocalization module, which assists in +returning to the correct trajectory in the event of a localization failure. +Experiments conducted in multiple scenarios demonstrate the effectiveness of +our method. + +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ☆ Framework to generate perfusion map from CT and CTA images in patients + with acute ischemic stroke: A longitudinal and cross-sectional study MICCAI 2023 + + +
+ Stroke is a leading cause of disability and death. Effective treatment +decisions require early and informative vascular imaging. 4D perfusion imaging +is ideal but rarely available within the first hour after stroke, whereas plain +CT and CTA usually are. Hence, we propose a framework to extract a predicted +perfusion map (PPM) derived from CT and CTA images. In all eighteen patients, +we found significantly high spatial similarity (with average Spearman's +correlation = 0.7893) between our predicted perfusion map (PPM) and the T-max +map derived from 4D-CTP. Voxelwise correlations between the PPM and National +Institutes of Health Stroke Scale (NIHSS) subscores for L/R hand motor, gaze, +and language on a large cohort of 2,110 subjects reliably mapped symptoms to +expected infarct locations. Therefore our PPM could serve as an alternative for +4D perfusion imaging, if the latter is unavailable, to investigate blood +perfusion in the first hours after hospital admission. + +
+
+ comment: Accepted and presented in SWITCH2023: Stroke Workshop on Imaging and + Treatment CHallenges (MICCAI 2023, Vancouver Canada) +
+
+
+
+
+ + ☆ Neural-Symbolic VideoQA: Learning Compositional Spatio-Temporal + Reasoning for Real-world Video Question Answering + + +
+ Compositional spatio-temporal reasoning poses a significant challenge in the
+field of video question answering (VideoQA). Existing approaches struggle to
+establish effective symbolic reasoning structures, which are crucial for
+answering compositional spatio-temporal questions. To address this challenge,
+we propose a neural-symbolic framework called Neural-Symbolic VideoQA
+(NS-VideoQA), specifically designed for real-world VideoQA tasks. The
+uniqueness and superiority of NS-VideoQA are two-fold: 1) It proposes a Scene
+Parser Network (SPN) to transform static-dynamic video scenes into Symbolic
+Representation (SR), structuring persons, objects, relations, and action
+chronologies. 2) A Symbolic Reasoning Machine (SRM) is designed for top-down
+question decomposition and bottom-up compositional reasoning. Specifically, a
+polymorphic program executor is constructed for internally consistent reasoning
+from SR to the final answer. As a result, our NS-VideoQA not only improves
+compositional spatio-temporal reasoning in the real-world VideoQA task, but also
+enables step-by-step error analysis by tracing the intermediate results.
+Experimental evaluations on the AGQA Decomp benchmark demonstrate the
+effectiveness of the proposed NS-VideoQA framework. Empirical studies further
+confirm that NS-VideoQA exhibits internal consistency in answering
+compositional questions and significantly improves the capability of
+spatio-temporal and logical inference for VideoQA tasks.
+
+
+
+
+
+
+ + ☆ Finsler-Laplace-Beltrami Operators with Application to Shape Analysis + + +
+ The Laplace-Beltrami operator (LBO) emerges from studying manifolds equipped +with a Riemannian metric. It is often called the Swiss army knife of geometry +processing as it allows to capture intrinsic shape information and gives rise +to heat diffusion, geodesic distances, and a multitude of shape descriptors. It +also plays a central role in geometric deep learning. In this work, we explore +Finsler manifolds as a generalization of Riemannian manifolds. We revisit the +Finsler heat equation and derive a Finsler heat kernel and a +Finsler-Laplace-Beltrami Operator (FLBO): a novel theoretically justified +anisotropic Laplace-Beltrami operator (ALBO). In experimental evaluations we +demonstrate that the proposed FLBO is a valuable alternative to the traditional +Riemannian-based LBO and ALBOs for spatial filtering and shape correspondence +estimation. We hope that the proposed Finsler heat kernel and the FLBO will +inspire further exploration of Finsler geometry in the computer vision +community. + +
+
+
+
+
+ + ☆ Physics-Inspired Synthesized Underwater Image Dataset + + +
+ This paper introduces the physics-inspired synthesized underwater image +dataset (PHISWID), a dataset tailored for enhancing underwater image processing +through physics-inspired image synthesis. Deep learning approaches to +underwater image enhancement typically demand extensive datasets, yet acquiring +paired clean and degraded underwater ones poses significant challenges. While +several underwater image datasets have been proposed using physics-based +synthesis, a publicly accessible collection has been lacking. Additionally, +most underwater image synthesis approaches do not intend to reproduce +atmospheric scenes, resulting in incomplete enhancement. PHISWID addresses this +gap by offering a set of paired ground-truth (atmospheric) and synthetically +degraded underwater images, showcasing not only color degradation but also the +often-neglected effects of marine snow, a composite of organic matter and sand +particles that considerably impairs underwater image clarity. The dataset +applies these degradations to atmospheric RGB-D images, enhancing the dataset's +realism and applicability. PHISWID is particularly valuable for training deep +neural networks in a supervised learning setting and for objectively assessing +image quality in benchmark analyses. Our results reveal that even a basic U-Net +architecture, when trained with PHISWID, substantially outperforms existing +methods in underwater image enhancement. We intend to release PHISWID publicly, +contributing a significant resource to the advancement of underwater imaging +technology. + +
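+ The physics-inspired degradation underlying such synthesis is commonly modeled
+ per color channel c as I_c(x) = J_c(x) * exp(-beta_c * d(x)) + B_c * (1 -
+ exp(-beta_c * d(x))); a simplified sketch applying it to an RGB-D pair (the
+ attenuation coefficients and background light are illustrative values, and
+ marine-snow synthesis is omitted):
+
+ import numpy as np
+
+ def synthesize_underwater(rgb, depth, beta=(0.40, 0.20, 0.10),
+                           background=(0.05, 0.35, 0.45)):
+     """Apply a simplified underwater image-formation model to an RGB-D pair.
+     rgb: HxWx3 in [0, 1]; depth: HxW in meters. Coefficients are placeholders."""
+     beta = np.asarray(beta)[None, None, :]          # per-channel attenuation (R, G, B)
+     B = np.asarray(background)[None, None, :]       # veiling (background) light
+     t = np.exp(-beta * depth[..., None])            # transmission map
+     return rgb * t + B * (1.0 - t)                  # direct signal + backscatter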
+
+
+
+
+ + ☆ Rolling the dice for better deep learning performance: A study of + randomness techniques in deep neural networks + + +
+ This paper investigates how various randomization techniques impact Deep +Neural Networks (DNNs). Randomization, like weight noise and dropout, aids in +reducing overfitting and enhancing generalization, but their interactions are +poorly understood. The study categorizes randomness techniques into four types +and proposes new methods: adding noise to the loss function and random masking +of gradient updates. Using Particle Swarm Optimizer (PSO) for hyperparameter +optimization, it explores optimal configurations across MNIST, FASHION-MNIST, +CIFAR10, and CIFAR100 datasets. Over 30,000 configurations are evaluated, +revealing data augmentation and weight initialization randomness as main +performance contributors. Correlation analysis shows different optimizers +prefer distinct randomization types. The complete implementation and dataset +are available on GitHub. + +
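+ One of the two newly proposed techniques, random masking of gradient updates,
+ can be sketched as follows (a minimal PyTorch illustration; the masking
+ probability and its placement right before the optimizer step are assumptions,
+ not the paper's exact recipe):
+
+ import torch
+
+ def masked_gradient_step(model, optimizer, loss, mask_prob=0.1):
+     """Zero out a random subset of gradient entries before the optimizer step."""
+     optimizer.zero_grad()
+     loss.backward()
+     with torch.no_grad():
+         for p in model.parameters():
+             if p.grad is not None:
+                 keep = (torch.rand_like(p.grad) >= mask_prob).float()
+                 p.grad.mul_(keep)       # randomly masked gradient update
+     optimizer.step()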
+
+
+
+
+ + ☆ Towards Efficient and Accurate CT Segmentation via Edge-Preserving + Probabilistic Downsampling + + +
+ Downsampling images and labels, often necessitated by limited resources or to
+expedite network training, leads to the loss of small objects and thin
+boundaries. This undermines the segmentation network's capacity to interpret
+images accurately and predict detailed labels, resulting in diminished
+performance compared to processing at original resolutions. This situation
+exemplifies the trade-off between efficiency and accuracy, with higher
+downsampling factors further impairing segmentation outcomes. Preserving
+information during downsampling is especially critical for medical image
+segmentation tasks. To tackle this challenge, we introduce a novel method named
+Edge-preserving Probabilistic Downsampling (EPD). It utilizes class uncertainty
+within a local window to produce soft labels, with the window size dictating
+the downsampling factor. This enables a network to produce quality predictions
+at low resolutions. Beyond preserving edge details more effectively than
+conventional nearest-neighbor downsampling of labels, applying a similar
+algorithm to images surpasses bilinear interpolation in image downsampling,
+enhancing overall performance. Our method significantly improved the
+Intersection over Union (IoU) by 2.85%, 8.65%, and 11.89% when downsampling
+data to 1/2, 1/4, and 1/8 of the original resolution, respectively, compared to
+conventional interpolation methods.
+
+
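+ Reading the description literally, the soft labels can be derived from
+ per-class frequencies inside each non-overlapping window; a minimal sketch of
+ that idea (the actual EPD formulation may differ, and the window size and
+ class count below are placeholders):
+
+ import numpy as np
+
+ def probabilistic_downsample_labels(labels, window, num_classes):
+     """Downsample an HxW integer label map by `window`, producing soft labels:
+     each output cell holds the class-frequency distribution of its window."""
+     H, W = labels.shape
+     h, w = H // window, W // window
+     crop = labels[: h * window, : w * window]
+     blocks = crop.reshape(h, window, w, window).transpose(0, 2, 1, 3)
+     blocks = blocks.reshape(h, w, window * window)
+     onehot = np.eye(num_classes)[blocks]            # h x w x window^2 x C
+     return onehot.mean(axis=2)                      # h x w x C soft labels
+
+ # usage: soft = probabilistic_downsample_labels(seg_mask, window=4, num_classes=5)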
+
+ comment: 5 pages (4 figures, 1 table); This work has been submitted to the + IEEE Signal Processing Letters. Copyright may be transferred without notice, + after which this version may no longer be accessible +
+
+
+
+
+ + ☆ RaSim: A Range-aware High-fidelity RGB-D Data Simulation Pipeline for + Real-world Applications ICRA'24 + + +
+ In robotic vision, a de-facto paradigm is to learn in simulated environments +and then transfer to real-world applications, which poses an essential +challenge in bridging the sim-to-real domain gap. While mainstream works tackle +this problem in the RGB domain, we focus on depth data synthesis and develop a +range-aware RGB-D data simulation pipeline (RaSim). In particular, +high-fidelity depth data is generated by imitating the imaging principle of +real-world sensors. A range-aware rendering strategy is further introduced to +enrich data diversity. Extensive experiments show that models trained with +RaSim can be directly applied to real-world scenarios without any finetuning +and excel at downstream RGB-D perception tasks. + +
+
+ comment: accepted by ICRA'24 +
+
+
+
+
+ + ☆ Deep Learning for Satellite Image Time Series Analysis: A Review + + +
+ Earth observation (EO) satellite missions have been providing detailed images +about the state of the Earth and its land cover for over 50 years. Long term +missions, such as NASA's Landsat, Terra, and Aqua satellites, and more +recently, the ESA's Sentinel missions, record images of the entire world every +few days. Although single images provide point-in-time data, repeated images of +the same area, or satellite image time series (SITS) provide information about +the changing state of vegetation and land use. These SITS are useful for +modeling dynamic processes and seasonal changes such as plant phenology. They +have potential benefits for many aspects of land and natural resource +management, including applications in agricultural, forest, water, and disaster +management, urban planning, and mining. However, the resulting satellite image +time series (SITS) are complex, incorporating information from the temporal, +spatial, and spectral dimensions. Therefore, deep learning methods are often +deployed as they can analyze these complex relationships. This review presents +a summary of the state-of-the-art methods of modelling environmental, +agricultural, and other Earth observation variables from SITS data using deep +learning methods. We aim to provide a resource for remote sensing experts +interested in using deep learning techniques to enhance Earth observation +models with temporal information. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Real-GDSR: Real-World Guided DSM Super-Resolution via Edge-Enhancing + Residual Network SP + + +
+ A low-resolution digital surface model (DSM) features distinctive attributes
+impacted by noise, sensor limitations and data acquisition conditions, which
+cannot be replicated using simple interpolation methods such as bicubic. As a
+result, super-resolution models trained on synthetic data do not perform
+effectively on real ones. Training a model on pairs of real low- and
+high-resolution DSMs is also challenging because of the lack of information. On
+the other hand, other imaging modalities of the same scene can be used to
+enrich the information needed for large-scale super-resolution. In this work,
+we introduce a novel methodology to address the intricacies of real-world DSM
+super-resolution, named REAL-GDSR, breaking down this ill-posed problem into
+two steps. The first step involves the utilization of a residual local
+refinement network. This strategy departs from conventional methods, which are
+trained to directly predict height values rather than differences (residuals)
+and which rely on large receptive fields in their networks. The second step
+introduces a diffusion-based technique that enhances the results on a global
+scale, with a primary focus on smoothing and edge preservation. Our experiments
+underscore the effectiveness of the proposed method. We conduct a comprehensive
+evaluation, comparing it to recent state-of-the-art techniques in the domain of
+real-world DSM super-resolution (SR). Our approach consistently outperforms
+these existing methods, as evidenced through qualitative and quantitative
+assessments.
+
+
+
+ comment: Accepted for publication in the ISPRS Annals of Photogrammetry, + Remote Sensing, and Spatial Information Sciences +
+
+
+
+
+ + ☆ LightOctree: Lightweight 3D Spatially-Coherent Indoor Lighting + Estimation + + +
+ We present a lightweight solution for estimating spatially-coherent indoor +lighting from a single RGB image. Previous methods for estimating illumination +using volumetric representations have overlooked the sparse distribution of +light sources in space, necessitating substantial memory and computational +resources for achieving high-quality results. We introduce a unified, voxel +octree-based illumination estimation framework to produce 3D spatially-coherent +lighting. Additionally, a differentiable voxel octree cone tracing rendering +layer is proposed to eliminate regular volumetric representation throughout the +entire process and ensure the retention of features across different frequency +domains. This reduction significantly decreases spatial usage and required +floating-point operations without substantially compromising precision. +Experimental results demonstrate that our approach achieves high-quality +coherent estimation with minimal cost compared to previous methods. + +
+
+
+
+
+ + ☆ Learning Correlation Structures for Vision Transformers CVPR 2024 + + +
+ We introduce a new attention mechanism, dubbed structural self-attention +(StructSA), that leverages rich correlation patterns naturally emerging in +key-query interactions of attention. StructSA generates attention maps by +recognizing space-time structures of key-query correlations via convolution and +uses them to dynamically aggregate local contexts of value features. This +effectively leverages rich structural patterns in images and videos such as +scene layouts, object motion, and inter-object relations. Using StructSA as a +main building block, we develop the structural vision transformer (StructViT) +and evaluate its effectiveness on both image and video classification tasks, +achieving state-of-the-art results on ImageNet-1K, Kinetics-400, +Something-Something V1 & V2, Diving-48, and FineGym. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Concept Weaver: Enabling Multi-Concept Fusion in Text-to-Image Models CVPR 2024 + + +
+ While there has been significant progress in customizing text-to-image +generation models, generating images that combine multiple personalized +concepts remains challenging. In this work, we introduce Concept Weaver, a +method for composing customized text-to-image diffusion models at inference +time. Specifically, the method breaks the process into two steps: creating a +template image aligned with the semantics of input prompts, and then +personalizing the template using a concept fusion strategy. The fusion strategy +incorporates the appearance of the target concepts into the template image +while retaining its structural details. The results indicate that our method +can generate multiple custom concepts with higher identity fidelity compared to +alternative approaches. Furthermore, the method is shown to seamlessly handle +more than two concepts and closely follow the semantic meaning of the input +prompt without blending appearances across different subjects. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Deep Phase Coded Image Prior + + +
+ Phase-coded imaging is a computational imaging method designed to tackle +tasks such as passive depth estimation and extended depth of field (EDOF) using +depth cues inserted during image capture. Most of the current deep +learning-based methods for depth estimation or all-in-focus imaging require a +training dataset with high-quality depth maps and an optimal focus point at +infinity for all-in-focus images. Such datasets are difficult to create, +usually synthetic, and require external graphic programs. We propose a new +method named "Deep Phase Coded Image Prior" (DPCIP) for jointly recovering the +depth map and all-in-focus image from a coded-phase image using solely the +captured image and the optical information of the imaging system. Our approach +does not depend on any specific dataset and surpasses prior supervised +techniques utilizing the same imaging system. This improvement is achieved +through the utilization of a problem formulation based on implicit neural +representation (INR) and deep image prior (DIP). Due to our zero-shot method, +we overcome the barrier of acquiring accurate ground-truth data of depth maps +and all-in-focus images for each new phase-coded system introduced. This allows +focusing mainly on developing the imaging system, and not on ground-truth data +collection. + +
+
+
+
+
+ + ☆ VoltaVision: A Transfer Learning model for electronic component + classification ICLR 2024 + + +
+ In this paper, we analyze the effectiveness of transfer learning on +classifying electronic components. Transfer learning reuses pre-trained models +to save time and resources in building a robust classifier rather than learning +from scratch. Our work introduces a lightweight CNN, coined as VoltaVision, and +compares its performance against more complex models. We test the hypothesis +that transferring knowledge from a similar task to our target domain yields +better results than state-of-the-art models trained on general datasets. Our +dataset and code for this work are available at +https://github.com/AnasIshfaque/VoltaVision. + +
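+ A minimal sketch of the transfer-learning recipe being compared here: freeze a
+ pretrained backbone and train only a small classification head (the ResNet-18
+ backbone and the 10-class head are placeholders, not VoltaVision itself):
+
+ import torch.nn as nn
+ from torchvision import models
+
+ def build_transfer_classifier(num_classes=10):
+     """Frozen ImageNet backbone + trainable linear head."""
+     backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
+     for p in backbone.parameters():
+         p.requires_grad = False                     # reuse pretrained features
+     backbone.fc = nn.Linear(backbone.fc.in_features, num_classes)  # trainable head
+     return backbone
+
+ model = build_transfer_classifier(num_classes=10)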
+
+ comment: Tiny Paper at ICLR 2024 +
+
+
+
+
+ + ☆ Enhancing Breast Cancer Diagnosis in Mammography: Evaluation and + Integration of Convolutional Neural Networks and Explainable AI + + +
+ The study introduces an integrated framework combining Convolutional Neural +Networks (CNNs) and Explainable Artificial Intelligence (XAI) for the enhanced +diagnosis of breast cancer using the CBIS-DDSM dataset. Utilizing a fine-tuned +ResNet50 architecture, our investigation not only provides effective +differentiation of mammographic images into benign and malignant categories but +also addresses the opaque "black-box" nature of deep learning models by +employing XAI methodologies, namely Grad-CAM, LIME, and SHAP, to interpret CNN +decision-making processes for healthcare professionals. Our methodology +encompasses an elaborate data preprocessing pipeline and advanced data +augmentation techniques to counteract dataset limitations, and transfer +learning using pre-trained networks, such as VGG-16, DenseNet and ResNet was +employed. A focal point of our study is the evaluation of XAI's effectiveness +in interpreting model predictions, highlighted by utilising the Hausdorff +measure to assess the alignment between AI-generated explanations and expert +annotations quantitatively. This approach plays a critical role for XAI in +promoting trustworthiness and ethical fairness in AI-assisted diagnostics. The +findings from our research illustrate the effective collaboration between CNNs +and XAI in advancing diagnostic methods for breast cancer, thereby facilitating +a more seamless integration of advanced AI technologies within clinical +settings. By enhancing the interpretability of AI-driven decisions, this work +lays the groundwork for improved collaboration between AI systems and medical +practitioners, ultimately enriching patient care. Furthermore, the implications +of our research extend well beyond the current methodologies, advocating for +subsequent inquiries into the integration of multimodal data and the refinement +of AI explanations to satisfy the needs of clinical practice. + +
+
+
+
+
+ + ☆ LiDAR-Guided Cross-Attention Fusion for Hyperspectral Band Selection and + Image Classification + + +
+ The fusion of hyperspectral and LiDAR data has been an active research topic. +Existing fusion methods have ignored the high-dimensionality and redundancy +challenges in hyperspectral images, despite that band selection methods have +been intensively studied for hyperspectral image (HSI) processing. This paper +addresses this significant gap by introducing a cross-attention mechanism from +the transformer architecture for the selection of HSI bands guided by LiDAR +data. LiDAR provides high-resolution vertical structural information, which can +be useful in distinguishing different types of land cover that may have similar +spectral signatures but different structural profiles. In our approach, the +LiDAR data are used as the "query" to search and identify the "key" from the +HSI to choose the most pertinent bands for LiDAR. This method ensures that the +selected HSI bands drastically reduce redundancy and computational requirements +while working optimally with the LiDAR data. Extensive experiments have been +undertaken on three paired HSI and LiDAR data sets: Houston 2013, Trento and +MUUFL. The results highlight the superiority of the cross-attention mechanism, +underlining the enhanced classification accuracy of the identified HSI bands +when fused with the LiDAR features. The results also show that the use of fewer +bands combined with LiDAR surpasses the performance of state-of-the-art fusion +models. + +
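+ Schematically, scaled dot-product cross-attention with LiDAR features as the
+ query and per-band HSI features as the keys yields one weight per band, which
+ can then be used to rank and select bands (a sketch of the generic mechanism,
+ not the paper's architecture; feature shapes are assumptions):
+
+ import torch
+ import torch.nn.functional as F
+
+ def band_attention_scores(lidar_feat, band_feats):
+     """lidar_feat: (d,) query features; band_feats: (num_bands, d) per-band keys.
+     Returns one attention weight per hyperspectral band."""
+     q = lidar_feat.unsqueeze(0)                     # (1, d)   LiDAR as "query"
+     k = band_feats                                  # (B, d)   HSI bands as "keys"
+     scores = (q @ k.T) / (k.shape[-1] ** 0.5)       # scaled dot-product
+     return F.softmax(scores, dim=-1).squeeze(0)     # (B,) importance per band
+
+ # usage: pick the top-k bands with the highest attention weights
+ # weights = band_attention_scores(lidar_vec, band_matrix); top = weights.topk(20).indices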
+
+ comment: 15 pages, 13 figures +
+
+
+
+
+ + ☆ Increasing Fairness in Classification of Out of Distribution Data for + Facial Recognition + + +
+ Standard classification theory assumes that the distributions of images in the test and training sets are identical. Unfortunately, real-life scenarios typically feature unseen data ("out-of-distribution data") which is different from data in the training distribution ("in-distribution"). This issue is most prevalent in social justice problems where data from under-represented groups may appear in the test data without representing an equal proportion of the training data. This may result in a model returning confidently wrong decisions and predictions. We are interested in the following question: Can the performance of a neural network improve on facial images of out-of-distribution data when it is trained simultaneously on multiple datasets of in-distribution data? We approach this problem by incorporating the Outlier Exposure model and investigate how the model's performance changes when additional datasets of facial images are incorporated. We observe that the accuracy and other metrics of the model can be increased by applying Outlier Exposure, incorporating a trainable weight parameter to increase the machine's emphasis on outlier images, and by re-weighting the importance of different class labels. We also experimented with whether sorting the images and determining outliers via image features would have more of an effect on the metrics than sorting by average pixel value. Our goal was to make models not only more accurate but also more fair by covering a broader range of images. We also tested the datasets in reverse order to see whether a more fair dataset with balanced features has an effect on the model's accuracy. + +
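+ At a high level, Outlier Exposure augments the usual classification loss with a term that pushes predictions on outlier images toward the uniform distribution; the sketch below adds a learnable positive weight on that term to mirror the trainable-emphasis idea mentioned above, which is an illustrative formulation rather than the authors' exact one.
+ import torch
+ import torch.nn.functional as F
+
+ log_w = torch.zeros((), requires_grad=True)   # learnable emphasis on outliers (assumed form)
+
+ def oe_loss(logits_in, labels_in, logits_out):
+     ce_in = F.cross_entropy(logits_in, labels_in)
+     # push outlier predictions toward the uniform distribution (standard OE term)
+     ce_out = -F.log_softmax(logits_out, dim=1).mean()
+     return ce_in + F.softplus(log_w) * ce_out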
+
+ comment: 18 pages, 6 tables, 6 figures +
+
+
+
+
+ + ☆ Mitigating Heterogeneity in Federated Multimodal Learning with + Biomedical Vision-Language Pre-training + + +
+ Vision-language pre-training (VLP) has arisen as an efficient scheme for multimodal representation learning, but it requires large-scale multimodal data for pre-training, which poses an obstacle especially for biomedical applications. To overcome the data limitation, federated learning (FL) can be a promising strategy to scale up the dataset for biomedical VLP while protecting data privacy. However, client data are often heterogeneous in real-world scenarios, and we observe that local training on heterogeneous client data would distort the multimodal representation learning and lead to biased cross-modal alignment. To address this challenge, we propose the Federated distributional Robust Guidance-Based (FedRGB) learning framework for federated VLP with robustness to data heterogeneity. Specifically, we utilize a guidance-based local training scheme to reduce feature distortions, and employ a distribution-based min-max optimization to learn unbiased cross-modal alignment. The experiments on real-world datasets show our method successfully promotes efficient federated multimodal learning for biomedical VLP with data heterogeneity. + +
+
+
+
+
+ + ☆ Vision Transformers in Domain Adaptation and Generalization: A Study of + Robustness + + +
+ Deep learning models are often evaluated in scenarios where the data +distribution is different from those used in the training and validation +phases. The discrepancy presents a challenge for accurately predicting the +performance of models once deployed on the target distribution. Domain +adaptation and generalization are widely recognized as effective strategies for +addressing such shifts, thereby ensuring reliable performance. The recent +promising results in applying vision transformers in computer vision tasks, +coupled with advancements in self-attention mechanisms, have demonstrated their +significant potential for robustness and generalization in handling +distribution shifts. Motivated by the increased interest from the research +community, our paper investigates the deployment of vision transformers in +domain adaptation and domain generalization scenarios. For domain adaptation +methods, we categorize research into feature-level, instance-level, model-level +adaptations, and hybrid approaches, along with other categorizations with +respect to diverse strategies for enhancing domain adaptation. Similarly, for +domain generalization, we categorize research into multi-domain learning, +meta-learning, regularization techniques, and data augmentation strategies. We +further classify diverse strategies in research, underscoring the various +approaches researchers have taken to address distribution shifts by integrating +vision transformers. The inclusion of comprehensive tables summarizing these +categories is a distinct feature of our work, offering valuable insights for +researchers. These findings highlight the versatility of vision transformers in +managing distribution shifts, crucial for real-world applications, especially +in critical safety and decision-making scenarios. + +
+
+ comment: 28 pages, 5 figures, Preprint submitted to Elsevier +
+
+
+
+
+ + ☆ Robust Few-Shot Ensemble Learning with Focal Diversity-Based Pruning + + +
+ This paper presents FusionShot, a focal diversity optimized few-shot ensemble +learning approach for boosting the robustness and generalization performance of +pre-trained few-shot models. The paper makes three original contributions. +First, we explore the unique characteristics of few-shot learning to ensemble +multiple few-shot (FS) models by creating three alternative fusion channels. +Second, we introduce the concept of focal error diversity to learn the most +efficient ensemble teaming strategy, rather than assuming that an ensemble of a +larger number of base models will outperform those sub-ensembles of smaller +size. We develop a focal-diversity ensemble pruning method to effectively prune +out the candidate ensembles with low ensemble error diversity and recommend +top-$K$ FS ensembles with the highest focal error diversity. Finally, we +capture the complex non-linear patterns of ensemble few-shot predictions by +designing the learn-to-combine algorithm, which can learn the diverse weight +assignments for robust ensemble fusion over different member models. Extensive +experiments on representative few-shot benchmarks show that the top-K ensembles +recommended by FusionShot can outperform the representative SOTA few-shot +models on novel tasks (different distributions and unknown at training), and +can prevail over existing few-shot learners in both cross-domain settings and +adversarial settings. For reproducibility purposes, FusionShot trained models, +results, and code are made available at https://github.com/sftekin/fusionshot + +
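+ As an informal picture of focal-diversity pruning, the sketch below scores candidate teams by how often the other members correct the samples a focal member gets wrong, then keeps the top-K teams; the simplified statistic and brute-force enumeration are stand-ins, and the paper's exact focal error diversity definition and learn-to-combine step are not reproduced.
+ import itertools
+ import numpy as np
+
+ def focal_diversity(preds, labels, focal):
+     # preds: (n_models, n_samples) hard predictions of one candidate team
+     wrong = preds[focal] != labels                   # focal model's error set
+     if wrong.sum() == 0:
+         return 0.0
+     others = np.delete(preds, focal, axis=0)[:, wrong]
+     return float((others == labels[wrong]).mean())   # how often teammates fix focal errors
+
+ def top_k_ensembles(preds, labels, size, k):
+     scored = []
+     for team in itertools.combinations(range(preds.shape[0]), size):
+         div = np.mean([focal_diversity(preds[list(team)], labels, i) for i in range(size)])
+         scored.append((div, team))
+     return sorted(scored, reverse=True)[:k]          # top-k most diverse teams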
+
+
+
+
+ + ☆ PhysPT: Physics-aware Pretrained Transformer for Estimating Human + Dynamics from Monocular Videos + + +
+ While current methods have shown promising progress on estimating 3D human +motion from monocular videos, their motion estimates are often physically +unrealistic because they mainly consider kinematics. In this paper, we +introduce Physics-aware Pretrained Transformer (PhysPT), which improves +kinematics-based motion estimates and infers motion forces. PhysPT exploits a +Transformer encoder-decoder backbone to effectively learn human dynamics in a +self-supervised manner. Moreover, it incorporates physics principles governing +human motion. Specifically, we build a physics-based body representation and +contact force model. We leverage them to impose novel physics-inspired training +losses (i.e., force loss, contact loss, and Euler-Lagrange loss), enabling +PhysPT to capture physical properties of the human body and the forces it +experiences. Experiments demonstrate that, once trained, PhysPT can be directly +applied to kinematics-based estimates to significantly enhance their physical +plausibility and generate favourable motion forces. Furthermore, we show that +these physically meaningful quantities translate into improved accuracy of an +important downstream task: human action recognition. + +
+
+
+
+
+ + ☆ PhysAvatar: Learning the Physics of Dressed 3D Avatars from Visual + Observations + + +
+ Modeling and rendering photorealistic avatars is of crucial importance in +many applications. Existing methods that build a 3D avatar from visual +observations, however, struggle to reconstruct clothed humans. We introduce +PhysAvatar, a novel framework that combines inverse rendering with inverse +physics to automatically estimate the shape and appearance of a human from +multi-view video data along with the physical parameters of the fabric of their +clothes. For this purpose, we adopt a mesh-aligned 4D Gaussian technique for +spatio-temporal mesh tracking as well as a physically based inverse renderer to +estimate the intrinsic material properties. PhysAvatar integrates a physics +simulator to estimate the physical parameters of the garments using +gradient-based optimization in a principled manner. These novel capabilities +enable PhysAvatar to create high-quality novel-view renderings of avatars +dressed in loose-fitting clothes under motions and lighting conditions not seen +in the training data. This marks a significant advancement towards modeling +photorealistic digital humans using physically based inverse rendering with +physics in the loop. Our project website is at: +https://qingqing-zhao.github.io/PhysAvatar + +
+
+ comment: Yang Zheng and Qingqing Zhao are project co-leads +
+
+
+
+
+ + ☆ Analyzing Participants' Engagement during Online Meetings Using + Unsupervised Remote Photoplethysmography with Behavioral Features + + +
+ Engagement measurement finds application in healthcare, education, +advertisement, and services. The use of physiological and behavioral features +is viable, but the impracticality of traditional physiological measurement +arises due to the need for contact sensors. We demonstrate the feasibility of +unsupervised remote photoplethysmography (rPPG) as an alternative for contact +sensors in deriving heart rate variability (HRV) features, then fusing these +with behavioral features to measure engagement in online group meetings. +Firstly, a unique Engagement Dataset of online interactions among social +workers is collected with granular engagement labels, offering insight into +virtual meeting dynamics. Secondly, a pre-trained rPPG model is customized to +reconstruct accurate rPPG signals from video meetings in an unsupervised +manner, enabling the calculation of HRV features. Thirdly, the feasibility of +estimating engagement from HRV features using short observation windows, with a +notable enhancement when using longer observation windows of two to four +minutes, is demonstrated. Fourthly, the effectiveness of behavioral cues is +evaluated and fused with physiological data, which further enhances engagement +estimation performance. An accuracy of 94% is achieved when only HRV features +are used, eliminating the need for contact sensors or ground truth signals. The +incorporation of behavioral cues raises the accuracy to 96%. Facial video +analysis offers precise engagement measurement, beneficial for future +applications. + +
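+ Once an rPPG waveform is reconstructed, HRV features reduce to statistics of the inter-beat intervals; the sketch below computes two common ones (SDNN and RMSSD) from detected peaks, with the sampling rate and peak-distance heuristic as assumptions rather than the paper's settings.
+ import numpy as np
+ from scipy.signal import find_peaks
+
+ def hrv_features(rppg, fs=30.0):
+     # assume heart rate below ~180 bpm, i.e. peaks at least fs/3 samples apart
+     peaks, _ = find_peaks(rppg, distance=int(fs / 3))
+     if len(peaks) < 3:
+         return {"sdnn": float("nan"), "rmssd": float("nan")}
+     ibi_ms = np.diff(peaks) / fs * 1000.0                                  # inter-beat intervals (ms)
+     return {"sdnn": float(np.std(ibi_ms)),                                 # overall variability
+             "rmssd": float(np.sqrt(np.mean(np.diff(ibi_ms) ** 2)))}        # short-term variability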
+
+
+
+
+ + ☆ LOSS-SLAM: Lightweight Open-Set Semantic Simultaneous Localization and + Mapping + + +
+ Enabling robots to understand the world in terms of objects is a critical +building block towards higher level autonomy. The success of foundation models +in vision has created the ability to segment and identify nearly all objects in +the world. However, utilizing such objects to localize the robot and build an +open-set semantic map of the world remains an open research question. In this +work, a system of identifying, localizing, and encoding objects is tightly +coupled with probabilistic graphical models for performing open-set semantic +simultaneous localization and mapping (SLAM). Results are presented +demonstrating that the proposed lightweight object encoding can be used to +perform more accurate object-based SLAM than existing open-set methods, +closed-set methods, and geometric methods while incurring a lower computational +overhead than existing open-set mapping methods. + +
+
+
+
+
+ + ☆ ClickDiffusion: Harnessing LLMs for Interactive Precise Image Editing + + +
+ Recently, researchers have proposed powerful systems for generating and +manipulating images using natural language instructions. However, it is +difficult to precisely specify many common classes of image transformations +with text alone. For example, a user may wish to change the location and breed +of a particular dog in an image with several similar dogs. This task is quite +difficult with natural language alone, and would require a user to write a +laboriously complex prompt that both disambiguates the target dog and describes +the destination. We propose ClickDiffusion, a system for precise image +manipulation and generation that combines natural language instructions with +visual feedback provided by the user through a direct manipulation interface. +We demonstrate that by serializing both an image and a multi-modal instruction +into a textual representation it is possible to leverage LLMs to perform +precise transformations of the layout and appearance of an image. Code +available at https://github.com/poloclub/ClickDiffusion. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2402.07925 +
+
+
+
+
+ + ☆ Idea-2-3D: Collaborative LMM Agents Enable 3D Model Generation from + Interleaved Multimodal Inputs + + +
+ In this paper, we pursue a novel 3D AIGC setting: generating 3D content from IDEAs. The definition of an IDEA is the composition of multimodal inputs including text, image, and 3D models. To our knowledge, this challenging and appealing 3D AIGC setting has not been studied before. We propose a novel framework called Idea-2-3D to achieve this goal, which consists of three agents based upon large multimodal models (LMMs) and several existing algorithmic tools for them to invoke. Specifically, these three LMM-based agents are prompted to do the jobs of prompt generation, model selection and feedback reflection. They work in a cycle that involves both mutual collaboration and criticism. Note that this cycle is done in a fully automatic manner, without any human intervention. The framework then outputs a text prompt to generate 3D models that align well with the input IDEAs. We show impressive 3D AIGC results that are beyond what any previous method can achieve. For quantitative comparisons, we construct caption-based baselines using a broad set of state-of-the-art 3D AIGC models and demonstrate that Idea-2-3D outperforms them significantly. In 94.2% of cases, Idea-2-3D meets users' requirements, marking a degree of match between IDEA and 3D models that is 2.3 times higher than baselines. Moreover, in 93.5% of the cases, users agreed that Idea-2-3D was better than baselines. Code, data and models will be made publicly available. + +
+
+ comment: Project Page: https://air-discover.github.io/Idea-2-3D/ Code: + https://github.com/yisuanwang/Idea23D +
+
+
+
+
+ + ☆ Pixel-wise RL on Diffusion Models: Reinforcement Learning from Rich + Feedback + + +
+ Latent diffusion models are the state-of-the-art for synthetic image generation. To align these models with human preferences, training the models using reinforcement learning on human feedback is crucial. Black et al. (2024) introduced denoising diffusion policy optimisation (DDPO), which accounts for the iterative denoising nature of the generation by modelling it as a Markov chain with a final reward. As the reward is a single value that determines the model's performance on the entire image, the model has to navigate a very sparse reward landscape and so requires a large sample count. In this work, we extend DDPO by presenting the Pixel-wise Policy Optimisation (PXPO) algorithm, which can take feedback for each pixel, providing a more nuanced reward to the model. + +
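+ The pixel-wise extension can be pictured as replacing the single scalar reward in the REINFORCE-style objective with a per-pixel reward map that scales each pixel's log-probability of the sampled denoising step; the Gaussian step model and variable names below are assumptions used for illustration.
+ import torch
+
+ def pxpo_step_loss(pred_mean, pred_std, sampled_x, pixel_reward):
+     # all tensors: (B, C, H, W); pixel_reward may be (B, 1, H, W) and broadcast over channels
+     dist = torch.distributions.Normal(pred_mean, pred_std)
+     logp = dist.log_prob(sampled_x)                  # per-pixel log-probability of the step
+     return -(pixel_reward.detach() * logp).mean()    # REINFORCE-style objective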
+
+ comment: 6 pages, 7 figures +
+
+
+
+
+ + ☆ Koala: Key frame-conditioned long video-LLM CVPR 2024 + + +
+ Long video question answering is a challenging task that involves recognizing +short-term activities and reasoning about their fine-grained relationships. +State-of-the-art video Large Language Models (vLLMs) hold promise as a viable +solution due to their demonstrated emergent capabilities on new tasks. However, +despite being trained on millions of short seconds-long videos, vLLMs are +unable to understand minutes-long videos and accurately answer questions about +them. To address this limitation, we propose a lightweight and self-supervised +approach, Key frame-conditioned long video-LLM (Koala), that introduces +learnable spatiotemporal queries to adapt pretrained vLLMs for generalizing to +longer videos. Our approach introduces two new tokenizers that condition on +visual tokens computed from sparse video key frames for understanding short and +long video moments. We train our proposed approach on HowTo100M and demonstrate +its effectiveness on zero-shot long video understanding benchmarks, where it +outperforms state-of-the-art large models by 3 - 6% in absolute accuracy across +all tasks. Surprisingly, we also empirically show that our approach not only +helps a pretrained vLLM to understand long videos but also improves its +accuracy on short-term action recognition. + +
+
+ comment: Accepted at CVPR 2024 as a poster highlight +
+
+
+
+
+ + ☆ SpatialTracker: Tracking Any 2D Pixels in 3D Space CVPR 2024 + + +
+ Recovering dense and long-range pixel motion in videos is a challenging +problem. Part of the difficulty arises from the 3D-to-2D projection process, +leading to occlusions and discontinuities in the 2D motion domain. While 2D +motion can be intricate, we posit that the underlying 3D motion can often be +simple and low-dimensional. In this work, we propose to estimate point +trajectories in 3D space to mitigate the issues caused by image projection. Our +method, named SpatialTracker, lifts 2D pixels to 3D using monocular depth +estimators, represents the 3D content of each frame efficiently using a +triplane representation, and performs iterative updates using a transformer to +estimate 3D trajectories. Tracking in 3D allows us to leverage +as-rigid-as-possible (ARAP) constraints while simultaneously learning a +rigidity embedding that clusters pixels into different rigid parts. Extensive +evaluation shows that our approach achieves state-of-the-art tracking +performance both qualitatively and quantitatively, particularly in challenging +scenarios such as out-of-plane rotation. + +
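+ A simplified view of the ARAP constraint is a penalty that asks point pairs assigned to the same rigid part (via a soft rigidity embedding) to keep their pairwise 3D distances over time; the soft weighting and reference-frame choice below are illustrative stand-ins for the paper's exact loss.
+ import torch
+
+ def arap_loss(traj, rigid_w):
+     # traj: (T, N, 3) 3D trajectories; rigid_w: (N, N) soft same-part weights in [0, 1]
+     d = torch.cdist(traj, traj)                     # (T, N, N) pairwise distances per frame
+     drift = (d - d[:1]).abs()                       # change relative to the first frame
+     return (rigid_w.unsqueeze(0) * drift).mean()    # penalize drift within rigid parts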
+
+ comment: Accepted to CVPR 2024 (selected as highlight paper). Project page: + https://henry123-boy.github.io/SpaTracker/ +
+
+
+
+
+ + ☆ Robust Depth Enhancement via Polarization Prompt Fusion Tuning CVPR 2024 + + +
+ Existing depth sensors are imperfect and may provide inaccurate depth values in challenging scenarios, such as in the presence of transparent or reflective objects. In this work, we present a general framework that leverages polarization imaging to improve inaccurate depth measurements from various depth sensors. Previous polarization-based depth enhancement methods focus on utilizing pure physics-based formulas for a single sensor. In contrast, our method first adopts a learning-based strategy where a neural network is trained to estimate a dense and complete depth map from polarization data and a sensor depth map from different sensors. To further improve the performance, we propose a Polarization Prompt Fusion Tuning (PPFT) strategy to effectively utilize RGB-based models pre-trained on large-scale datasets, as the polarization dataset is too small to train a strong model from scratch. We conducted extensive experiments on a public dataset, and the results demonstrate that the proposed method performs favorably compared to existing depth enhancement baselines. Code and demos are available at https://lastbasket.github.io/PPFT/. + +
+
+ comment: CVPR 2024. Project page: https://lastbasket.github.io/PPFT/. The + first two authors contribute equally +
+
+
+
+
+ + ☆ Visual Knowledge in the Big Model Era: Retrospect and Prospect + + +
+ Visual knowledge is a new form of knowledge representation that can encapsulate visual concepts and their relations in a succinct, comprehensive, and interpretable manner, with a deep root in cognitive psychology. As the knowledge about the visual world has been identified as an indispensable component of human cognition and intelligence, visual knowledge is poised to have a pivotal role in establishing machine intelligence. With the recent advance of Artificial Intelligence (AI) techniques, large AI models (or foundation models) have emerged as a potent tool capable of extracting versatile patterns from broad data as implicit knowledge, and abstracting them into a vast number of numeric parameters. To pave the way for creating visual knowledge empowered AI machines in this coming wave, we present a timely review that investigates the origins and development of visual knowledge in the pre-big model era, and accentuates the opportunities and unique role of visual knowledge in the big model era. + +
+
+
+
+
+ + ☆ Implicit Assimilation of Sparse In Situ Data for Dense & Global Storm + Surge Forecasting CVPR + + +
+ Hurricanes and coastal floods are among the most disastrous natural hazards. +Both are intimately related to storm surges, as their causes and effects, +respectively. However, the short-term forecasting of storm surges has proven +challenging, especially when targeting previously unseen locations or sites +without tidal gauges. Furthermore, recent work improved short and medium-term +weather forecasting but the handling of raw unassimilated data remains +non-trivial. In this paper, we tackle both challenges and demonstrate that +neural networks can implicitly assimilate sparse in situ tide gauge data with +coarse ocean state reanalysis in order to forecast storm surges. We curate a +global dataset to learn and validate the dense prediction of storm surges, +building on preceding efforts. Other than prior work limited to known gauges, +our approach extends to ungauged sites, paving the way for global storm surge +forecasting. + +
+
+ comment: Accepted at CVPR EarthVision 2024 +
+
+
+
+
+ + ♻ ☆ DVIS-DAQ: Improving Video Segmentation via Dynamic Anchor Queries + + +
+ Modern video segmentation methods adopt object queries to perform inter-frame +association and demonstrate satisfactory performance in tracking continuously +appearing objects despite large-scale motion and transient occlusion. However, +they all underperform on newly emerging and disappearing objects that are +common in the real world because they attempt to model object emergence and +disappearance through feature transitions between background and foreground +queries that have significant feature gaps. We introduce Dynamic Anchor Queries +(DAQ) to shorten the transition gap between the anchor and target queries by +dynamically generating anchor queries based on the features of potential +candidates. Furthermore, we introduce a query-level object Emergence and +Disappearance Simulation (EDS) strategy, which unleashes DAQ's potential +without any additional cost. Finally, we combine our proposed DAQ and EDS with +DVIS to obtain DVIS-DAQ. Extensive experiments demonstrate that DVIS-DAQ +achieves a new state-of-the-art (SOTA) performance on five mainstream video +segmentation benchmarks. Code and models are available at +\url{https://github.com/SkyworkAI/DAQ-VS}. + +
+
+
+
+
+ + ♻ ☆ CenterGrasp: Object-Aware Implicit Representation Learning for + Simultaneous Shape Reconstruction and 6-DoF Grasp Estimation + + +
+ Reliable object grasping is a crucial capability for autonomous robots. However, many existing grasping approaches focus on general clutter removal without explicitly modeling objects, and thus rely only on the visible local geometry. We introduce CenterGrasp, a novel framework that combines object awareness and holistic grasping. CenterGrasp learns a general object prior by encoding shapes and valid grasps in a continuous latent space. It consists of an RGB-D image encoder that leverages recent advances to detect objects and infer their pose and latent code, and a decoder to predict shape and grasps for each object in the scene. We perform extensive experiments on simulated as well as real-world cluttered scenes and demonstrate strong scene reconstruction and 6-DoF grasp-pose estimation performance. Compared to the state of the art, CenterGrasp achieves an improvement of 38.5 mm in shape reconstruction and 33 percentage points on average in grasp success. We make the code and trained models publicly available at http://centergrasp.cs.uni-freiburg.de. + +
+
+ comment: Accepted at RA-L. Video, code and models available at + http://centergrasp.cs.uni-freiburg.de +
+
+
+
+
+ + ♻ ☆ Modeling 3D Surface Manifolds with a Locally Conditioned Atlas + + +
+ Recently proposed 3D object reconstruction methods represent a mesh with an +atlas - a set of planar patches approximating the surface. However, their +application in a real-world scenario is limited since the surfaces of +reconstructed objects contain discontinuities, which degrades the quality of +the final mesh. This is mainly caused by independent processing of individual +patches, and in this work, we postulate to mitigate this limitation by +preserving local consistency around patch vertices. To that end, we introduce a +Locally Conditioned Atlas (LoCondA), a framework for representing a 3D object +hierarchically in a generative model. Firstly, the model maps a point cloud of +an object into a sphere. Secondly, by leveraging a spherical prior, we enforce +the mapping to be locally consistent on the sphere and on the target object. +This way, we can sample a mesh quad on that sphere and project it back onto the +object's manifold. With LoCondA, we can produce topologically diverse objects +while maintaining quads to be stitched together. We show that the proposed +approach provides structurally coherent reconstructions while producing meshes +of quality comparable to the competitors. + +
+
+
+
+
+ + ♻ ☆ Finding AI-Generated Faces in the Wild CVPR + + +
+ AI-based image generation has continued to rapidly improve, producing +increasingly more realistic images with fewer obvious visual flaws. +AI-generated images are being used to create fake online profiles which in turn +are being used for spam, fraud, and disinformation campaigns. As the general +problem of detecting any type of manipulated or synthesized content is +receiving increasing attention, here we focus on a more narrow task of +distinguishing a real face from an AI-generated face. This is particularly +applicable when tackling inauthentic online accounts with a fake user profile +photo. We show that by focusing on only faces, a more resilient and +general-purpose artifact can be detected that allows for the detection of +AI-generated faces from a variety of GAN- and diffusion-based synthesis +engines, and across image resolutions (as low as 128 x 128 pixels) and +qualities. + +
+
+ comment: to be published as: G.J.A. Porcile, J. Gindi, S. Mundra, J.R. Verbus, + and H. Farid, Finding AI-Generated Faces in the Wild, Workshop on Media + Forensics at CVPR, 2024 +
+
+
+
+
+ + ♻ ☆ WorDepth: Variational Language Prior for Monocular Depth Estimation + + +
+ Three-dimensional (3D) reconstruction from a single image is an ill-posed +problem with inherent ambiguities, i.e. scale. Predicting a 3D scene from text +description(s) is similarly ill-posed, i.e. spatial arrangements of objects +described. We investigate the question of whether two inherently ambiguous +modalities can be used in conjunction to produce metric-scaled reconstructions. +To test this, we focus on monocular depth estimation, the problem of predicting +a dense depth map from a single image, but with an additional text caption +describing the scene. To this end, we begin by encoding the text caption as a +mean and standard deviation; using a variational framework, we learn the +distribution of the plausible metric reconstructions of 3D scenes corresponding +to the text captions as a prior. To "select" a specific reconstruction or depth +map, we encode the given image through a conditional sampler that samples from +the latent space of the variational text encoder, which is then decoded to the +output depth map. Our approach is trained alternatingly between the text and +image branches: in one optimization step, we predict the mean and standard +deviation from the text description and sample from a standard Gaussian, and in +the other, we sample using a (image) conditional sampler. Once trained, we +directly predict depth from the encoded text using the conditional sampler. We +demonstrate our approach on indoor (NYUv2) and outdoor (KITTI) scenarios, where +we show that language can consistently improve performance in both. + +
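+ The alternating scheme can be sketched as follows: the text branch predicts a mean and standard deviation over plausible reconstructions, and either a standard-Gaussian sample (text step) or an image-conditioned sampler (image step) selects the latent that is decoded into depth. The module names and call signatures below are placeholders, not the paper's architecture.
+ import torch
+
+ def forward_depth(text_emb, image, text_enc, cond_sampler, decoder, use_text_branch):
+     mu, log_sigma = text_enc(text_emb)           # distribution over plausible metric scenes
+     if use_text_branch:
+         eps = torch.randn_like(mu)               # sample from a standard Gaussian (text step)
+     else:
+         eps = cond_sampler(image)                # image selects a specific reconstruction
+     z = mu + log_sigma.exp() * eps
+     return decoder(z, image)                     # dense depth map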
+
+
+
+
+ + ♻ ☆ SnAG: Scalable and Accurate Video Grounding CVPR 2024 + + +
+ Temporal grounding of text descriptions in videos is a central problem in +vision-language learning and video understanding. Existing methods often +prioritize accuracy over scalability -- they have been optimized for grounding +only a few text queries within short videos, and fail to scale up to long +videos with hundreds of queries. In this paper, we study the effect of +cross-modal fusion on the scalability of video grounding models. Our analysis +establishes late fusion as a more cost-effective fusion scheme for long-form +videos with many text queries. Moreover, it leads us to a novel, video-centric +sampling scheme for efficient training. Based on these findings, we present +SnAG, a simple baseline for scalable and accurate video grounding. Without +bells and whistles, SnAG is 43% more accurate and 1.5x faster than CONE, a +state of the art for long-form video grounding on the challenging MAD dataset, +while achieving highly competitive results on short videos. + +
+
+ comment: Accepted to CVPR 2024. Code available at + https://github.com/fmu2/snag_release +
+
+
+
+
+ + ♻ ☆ State Space Models for Event Cameras CVPR 2024 + + +
+ Today, state-of-the-art deep neural networks that process event-camera data +first convert a temporal window of events into dense, grid-like input +representations. As such, they exhibit poor generalizability when deployed at +higher inference frequencies (i.e., smaller temporal windows) than the ones +they were trained on. We address this challenge by introducing state-space +models (SSMs) with learnable timescale parameters to event-based vision. This +design adapts to varying frequencies without the need to retrain the network at +different frequencies. Additionally, we investigate two strategies to +counteract aliasing effects when deploying the model at higher frequencies. We +comprehensively evaluate our approach against existing methods based on RNN and +Transformer architectures across various benchmarks, including Gen1 and 1 Mpx +event camera datasets. Our results demonstrate that SSM-based models train 33% +faster and also exhibit minimal performance degradation when tested at higher +frequencies than the training input. Traditional RNN and Transformer models +exhibit performance drops of more than 20 mAP, with SSMs having a drop of 3.31 +mAP, highlighting the effectiveness of SSMs in event-based vision tasks. + +
+
+ comment: 18 pages, 5 figures, 6 tables, CVPR 2024 Camera Ready paper +
+
+
+
+
+ + ♻ ☆ Opti-CAM: Optimizing saliency maps for interpretability + + +
+ Methods based on class activation maps (CAM) provide a simple mechanism to +interpret predictions of convolutional neural networks by using linear +combinations of feature maps as saliency maps. By contrast, masking-based +methods optimize a saliency map directly in the image space or learn it by +training another network on additional data. + In this work we introduce Opti-CAM, combining ideas from CAM-based and +masking-based approaches. Our saliency map is a linear combination of feature +maps, where weights are optimized per image such that the logit of the masked +image for a given class is maximized. We also fix a fundamental flaw in two of +the most common evaluation metrics of attribution methods. On several datasets, +Opti-CAM largely outperforms other CAM-based approaches according to the most +relevant classification metrics. We provide empirical evidence supporting that +localization and classifier interpretability are not necessarily aligned. + +
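+ In sketch form, Opti-CAM tunes a per-image weight vector over feature maps so that the logit of the masked image for the target class is maximized; the softmax weighting, normalization, and optimizer settings below are illustrative assumptions rather than the paper's exact procedure.
+ import torch
+ import torch.nn.functional as F
+
+ def opti_cam(model, feats, image, target_class, steps=50, lr=0.1):
+     # feats: (K, h, w) feature maps extracted for this image
+     w = torch.zeros(feats.shape[0], requires_grad=True)
+     opt = torch.optim.Adam([w], lr=lr)
+     for _ in range(steps):
+         sal = torch.einsum("k,khw->hw", torch.softmax(w, dim=0), feats)
+         sal = (sal - sal.min()) / (sal.max() - sal.min() + 1e-8)
+         sal = F.interpolate(sal[None, None], size=image.shape[-2:], mode="bilinear")[0, 0]
+         loss = -model(image * sal[None, None])[0, target_class]   # maximize masked-image logit
+         opt.zero_grad()
+         loss.backward()
+         opt.step()
+     return sal.detach()                                            # optimized saliency map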
+
+ comment: This work is under consideration at "Computer Vision and Image + Understanding" +
+
+
+
+
+ + ♻ ☆ EAGLE: Eigen Aggregation Learning for Object-Centric Unsupervised + Semantic Segmentation + + +
+ Semantic segmentation has innately relied on extensive pixel-level annotated +data, leading to the emergence of unsupervised methodologies. Among them, +leveraging self-supervised Vision Transformers for unsupervised semantic +segmentation (USS) has been making steady progress with expressive deep +features. Yet, for semantically segmenting images with complex objects, a +predominant challenge remains: the lack of explicit object-level semantic +encoding in patch-level features. This technical limitation often leads to +inadequate segmentation of complex objects with diverse structures. To address +this gap, we present a novel approach, EAGLE, which emphasizes object-centric +representation learning for unsupervised semantic segmentation. Specifically, +we introduce EiCue, a spectral technique providing semantic and structural cues +through an eigenbasis derived from the semantic similarity matrix of deep image +features and color affinity from an image. Further, by incorporating our +object-centric contrastive loss with EiCue, we guide our model to learn +object-level representations with intra- and inter-image object-feature +consistency, thereby enhancing semantic accuracy. Extensive experiments on +COCO-Stuff, Cityscapes, and Potsdam-3 datasets demonstrate the state-of-the-art +USS results of EAGLE with accurate and consistent semantic segmentation across +complex scenes. + +
+
+
+
+
+ + ♻ ☆ On Inherent Adversarial Robustness of Active Vision Systems + + +
+ Current Deep Neural Networks are vulnerable to adversarial examples, which +alter their predictions by adding carefully crafted noise. Since human eyes are +robust to such inputs, it is possible that the vulnerability stems from the +standard way of processing inputs in one shot by processing every pixel with +the same importance. In contrast, neuroscience suggests that the human vision +system can differentiate salient features by (1) switching between multiple +fixation points (saccades) and (2) processing the surrounding with a +non-uniform external resolution (foveation). In this work, we advocate that the +integration of such active vision mechanisms into current deep learning systems +can offer robustness benefits. Specifically, we empirically demonstrate the +inherent robustness of two active vision methods - GFNet and FALcon - under a +black box threat model. By learning and inferencing based on downsampled +glimpses obtained from multiple distinct fixation points within an input, we +show that these active methods achieve (2-3) times greater robustness compared +to a standard passive convolutional network under state-of-the-art adversarial +attacks. More importantly, we provide illustrative and interpretable +visualization analysis that demonstrates how performing inference from distinct +fixation points makes active vision methods less vulnerable to malicious +inputs. + +
+
+
+
+
+ + ♻ ☆ SWAG: Splatting in the Wild images with Appearance-conditioned Gaussians + + +
+ Implicit neural representation methods have shown impressive advancements in learning 3D scenes from unstructured in-the-wild photo collections but are still limited by the large computational cost of volumetric rendering. More recently, 3D Gaussian Splatting emerged as a much faster alternative with superior rendering quality and training efficiency, especially for small-scale and object-centric scenarios. Nevertheless, this technique suffers from poor performance on unstructured in-the-wild data. To tackle this, we extend 3D Gaussian Splatting to handle unstructured image collections. We achieve this by modeling appearance to capture photometric variations in the rendered images. Additionally, we introduce a new mechanism to train transient Gaussians to handle the presence of scene occluders in an unsupervised manner. Experiments on diverse photo collection scenes and multi-pass acquisition of outdoor landmarks show the effectiveness of our method over prior works, achieving state-of-the-art results with improved efficiency. + +
+
+
+
+
+ + ♻ ☆ Embedded Heterogeneous Attention Transformer for Cross-lingual Image + Captioning + + +
+ Cross-lingual image captioning is a challenging task that requires addressing both cross-lingual and cross-modal obstacles in multimedia analysis. The crucial issue in this task is to model the global and the local matching between the image and different languages. Existing cross-modal embedding methods based on the transformer architecture overlook the local matching between image regions and monolingual words, especially when dealing with diverse languages. To overcome these limitations, we propose an Embedded Heterogeneous Attention Transformer (EHAT) to establish cross-domain relationships and local correspondences between images and different languages by using a heterogeneous network. EHAT comprises Masked Heterogeneous Cross-attention (MHCA), Heterogeneous Attention Reasoning Network (HARN), and Heterogeneous Co-attention (HCA). The HARN serves as the core network and it captures cross-domain relationships by leveraging visual bounding box representation features to connect word features from two languages and to learn heterogeneous maps. MHCA and HCA facilitate cross-domain integration in the encoder through specialized heterogeneous attention mechanisms, enabling a single model to generate captions in two languages. We evaluate our approach on the MSCOCO dataset to generate captions in English and Chinese, two languages that exhibit significant differences in their language families. The experimental results demonstrate the superior performance of our method compared to existing advanced monolingual methods. Our proposed EHAT framework effectively addresses the challenges of cross-lingual image captioning, paving the way for improved multilingual image analysis and understanding. + +
+
+
+
+
+ + ♻ ☆ Self-Correcting Self-Consuming Loops for Generative Model Training + + +
+ As synthetic data becomes higher quality and proliferates on the internet, +machine learning models are increasingly trained on a mix of human- and +machine-generated data. Despite the successful stories of using synthetic data +for representation learning, using synthetic data for generative model training +creates "self-consuming loops" which may lead to training instability or even +collapse, unless certain conditions are met. Our paper aims to stabilize +self-consuming generative model training. Our theoretical results demonstrate +that by introducing an idealized correction function, which maps a data point +to be more likely under the true data distribution, self-consuming loops can be +made exponentially more stable. We then propose self-correction functions, +which rely on expert knowledge (e.g. the laws of physics programmed in a +simulator), and aim to approximate the idealized corrector automatically and at +scale. We empirically validate the effectiveness of self-correcting +self-consuming loops on the challenging human motion synthesis task, and +observe that it successfully avoids model collapse, even when the ratio of +synthetic data to real data is as high as 100%. + +
+
+ comment: This new version contains updated mathematical results (c.f. Remark + 4.4), as well as experiments for an additional generative modeling task. + Paper under submission; code is available at + https://nategillman.com/sc-sc.html +
+
+
+
+
+ + ♻ ☆ Chat-UniVi: Unified Visual Representation Empowers Large Language Models + with Image and Video Understanding CVPR 2024 + + +
+ Large language models have demonstrated impressive universal capabilities +across a wide range of open-ended tasks and have extended their utility to +encompass multimodal conversations. However, existing methods encounter +challenges in effectively handling both image and video understanding, +particularly with limited visual tokens. In this work, we introduce Chat-UniVi, +a Unified Vision-language model capable of comprehending and engaging in +conversations involving images and videos through a unified visual +representation. Specifically, we employ a set of dynamic visual tokens to +uniformly represent images and videos. This representation framework empowers +the model to efficiently utilize a limited number of visual tokens to +simultaneously capture the spatial details necessary for images and the +comprehensive temporal relationship required for videos. Moreover, we leverage +a multi-scale representation, enabling the model to perceive both high-level +semantic concepts and low-level visual details. Notably, Chat-UniVi is trained +on a mixed dataset containing both images and videos, allowing direct +application to tasks involving both mediums without requiring any +modifications. Extensive experimental results demonstrate that Chat-UniVi +consistently outperforms even existing methods exclusively designed for either +images or videos. Code is available at +https://github.com/PKU-YuanGroup/Chat-UniVi. + +
+
+ comment: Accepted by CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ PlatoNeRF: 3D Reconstruction in Plato's Cave via Single-View Two-Bounce + Lidar CVPR 2024 + + +
+ 3D reconstruction from a single-view is challenging because of the ambiguity +from monocular cues and lack of information about occluded regions. Neural +radiance fields (NeRF), while popular for view synthesis and 3D reconstruction, +are typically reliant on multi-view images. Existing methods for single-view 3D +reconstruction with NeRF rely on either data priors to hallucinate views of +occluded regions, which may not be physically accurate, or shadows observed by +RGB cameras, which are difficult to detect in ambient light and low albedo +backgrounds. We propose using time-of-flight data captured by a single-photon +avalanche diode to overcome these limitations. Our method models two-bounce +optical paths with NeRF, using lidar transient data for supervision. By +leveraging the advantages of both NeRF and two-bounce light measured by lidar, +we demonstrate that we can reconstruct visible and occluded geometry without +data priors or reliance on controlled ambient lighting or scene albedo. In +addition, we demonstrate improved generalization under practical constraints on +sensor spatial- and temporal-resolution. We believe our method is a promising +direction as single-photon lidars become ubiquitous on consumer devices, such +as phones, tablets, and headsets. + +
+
+ comment: CVPR 2024. Project Page: https://platonerf.github.io/ +
+
+
+
+
+ + ♻ ☆ Plug-and-Play image restoration with Stochastic deNOising REgularization + + +
+ Plug-and-Play (PnP) algorithms are a class of iterative algorithms that address image inverse problems by combining a physical model and a deep neural network for regularization. Even though they produce impressive image restoration results, these algorithms rely on a non-standard use of a denoiser on images that are less and less noisy along the iterations, which contrasts with recent algorithms based on Diffusion Models (DM), where the denoiser is applied only on re-noised images. We propose a new PnP framework, called Stochastic deNOising REgularization (SNORE), which applies the denoiser only on images with noise of the adequate level. It is based on an explicit stochastic regularization, which leads to a stochastic gradient descent algorithm to solve ill-posed inverse problems. A convergence analysis of this algorithm and its annealing extension is provided. Experimentally, we show that SNORE is competitive with respect to state-of-the-art methods on deblurring and inpainting tasks, both quantitatively and qualitatively. + +
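+ One iteration of such a stochastic gradient scheme can be sketched as follows: the denoiser sees a freshly re-noised copy of the current iterate, and its output drives the regularization gradient that is combined with the data-fidelity gradient. The constants and the exact form of the regularization gradient are assumptions for illustration, not the paper's derivation.
+ import torch
+
+ def snore_step(x, data_grad_fn, denoiser, sigma, lam, step_size):
+     noisy = x + sigma * torch.randn_like(x)                  # re-noise the current iterate
+     reg_grad = (x - denoiser(noisy, sigma)) / sigma ** 2     # assumed regularization gradient
+     return x - step_size * (data_grad_fn(x) + lam * reg_grad)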
+
+
+
+
+ + ♻ ☆ EGTR: Extracting Graph from Transformer for Scene Graph Generation CVPR 2024 + + +
+ Scene Graph Generation (SGG) is a challenging task of detecting objects and +predicting relationships between objects. After DETR was developed, one-stage +SGG models based on a one-stage object detector have been actively studied. +However, complex modeling is used to predict the relationship between objects, +and the inherent relationship between object queries learned in the multi-head +self-attention of the object detector has been neglected. We propose a +lightweight one-stage SGG model that extracts the relation graph from the +various relationships learned in the multi-head self-attention layers of the +DETR decoder. By fully utilizing the self-attention by-products, the relation +graph can be extracted effectively with a shallow relation extraction head. +Considering the dependency of the relation extraction task on the object +detection task, we propose a novel relation smoothing technique that adjusts +the relation label adaptively according to the quality of the detected objects. +By the relation smoothing, the model is trained according to the continuous +curriculum that focuses on object detection task at the beginning of training +and performs multi-task learning as the object detection performance gradually +improves. Furthermore, we propose a connectivity prediction task that predicts +whether a relation exists between object pairs as an auxiliary task of the +relation extraction. We demonstrate the effectiveness and efficiency of our +method for the Visual Genome and Open Image V6 datasets. Our code is publicly +available at https://github.com/naver-ai/egtr. + +
+
+ comment: CVPR 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ Open-vocabulary object 6D pose estimation CVPR 2024 + + +
+ We introduce the new setting of open-vocabulary object 6D pose estimation, in +which a textual prompt is used to specify the object of interest. In contrast +to existing approaches, in our setting (i) the object of interest is specified +solely through the textual prompt, (ii) no object model (e.g., CAD or video +sequence) is required at inference, and (iii) the object is imaged from two +RGBD viewpoints of different scenes. To operate in this setting, we introduce a +novel approach that leverages a Vision-Language Model to segment the object of +interest from the scenes and to estimate its relative 6D pose. The key of our +approach is a carefully devised strategy to fuse object-level information +provided by the prompt with local image features, resulting in a feature space +that can generalize to novel concepts. We validate our approach on a new +benchmark based on two popular datasets, REAL275 and Toyota-Light, which +collectively encompass 34 object instances appearing in four thousand image +pairs. The results demonstrate that our approach outperforms both a +well-established hand-crafted method and a recent deep learning-based baseline +in estimating the relative 6D pose of objects in different scenes. Code and +dataset are available at https://jcorsetti.github.io/oryon. + +
+
+ comment: Camera ready version (CVPR 2024, poster highlight). 21 pages, 15 + figures, 6 tables +
+
+
+
+
+ + ♻ ☆ The Missing U for Efficient Diffusion Models + + +
+ Diffusion Probabilistic Models stand as a critical tool in generative +modelling, enabling the generation of complex data distributions. This family +of generative models yields record-breaking performance in tasks such as image +synthesis, video generation, and molecule design. Despite their capabilities, +their efficiency, especially in the reverse process, remains a challenge due to +slow convergence rates and high computational costs. In this paper, we +introduce an approach that leverages continuous dynamical systems to design a +novel denoising network for diffusion models that is more parameter-efficient, +exhibits faster convergence, and demonstrates increased noise robustness. +Experimenting with Denoising Diffusion Probabilistic Models (DDPMs), our +framework operates with approximately a quarter of the parameters, and $\sim$ +30\% of the Floating Point Operations (FLOPs) compared to standard U-Nets in +DDPMs. Furthermore, our model is notably faster in inference than the baseline +when measured in fair and equal conditions. We also provide a mathematical +intuition as to why our proposed reverse process is faster as well as a +mathematical discussion of the empirical tradeoffs in the denoising downstream +task. Finally, we argue that our method is compatible with existing performance +enhancement techniques, enabling further improvements in efficiency, quality, +and speed. + +
+
+ comment: 23 pages, 14 figures, Accepted at Transactions of Machine Learning + Research (04/2024) +
+
+
+
+
+ + ♻ ☆ DualRefine: Self-Supervised Depth and Pose Estimation Through Iterative + Epipolar Sampling and Refinement Toward Equilibrium CVPR 2023 + + +
+ Self-supervised multi-frame depth estimation achieves high accuracy by +computing matching costs of pixel correspondences between adjacent frames, +injecting geometric information into the network. These pixel-correspondence +candidates are computed based on the relative pose estimates between the +frames. Accurate pose predictions are essential for precise matching cost +computation as they influence the epipolar geometry. Furthermore, improved +depth estimates can, in turn, be used to align pose estimates. + Inspired by traditional structure-from-motion (SfM) principles, we propose +the DualRefine model, which tightly couples depth and pose estimation through a +feedback loop. Our novel update pipeline uses a deep equilibrium model +framework to iteratively refine depth estimates and a hidden state of feature +maps by computing local matching costs based on epipolar geometry. Importantly, +we used the refined depth estimates and feature maps to compute pose updates at +each step. This update in the pose estimates slowly alters the epipolar +geometry during the refinement process. Experimental results on the KITTI +dataset demonstrate competitive depth prediction and odometry prediction +performance surpassing published self-supervised baselines. + +
+
+ comment: CVPR 2023. Project page: + https://antabangun.github.io/projects/DualRefine/ Code: + https://github.com/antabangun/DualRefine +
+
+
+
+
+ + ♻ ☆ Neural Sign Actors: A diffusion model for 3D sign language production + from text CVPR 2024 + + +
+ Sign Languages (SL) serve as the primary mode of communication for the Deaf +and Hard of Hearing communities. Deep learning methods for SL recognition and +translation have achieved promising results. However, Sign Language Production +(SLP) poses a challenge as the generated motions must be realistic and have +precise semantic meaning. Most SLP methods rely on 2D data, which hinders their +realism. In this work, a diffusion-based SLP model is trained on a curated +large-scale dataset of 4D signing avatars and their corresponding text +transcripts. The proposed method can generate dynamic sequences of 3D avatars +from an unconstrained domain of discourse using a diffusion process formed on a +novel and anatomically informed graph neural network defined on the SMPL-X body +skeleton. Through quantitative and qualitative experiments, we show that the +proposed method considerably outperforms previous methods of SLP. This work +makes an important step towards realistic neural sign avatars, bridging the +communication gap between Deaf and hearing communities. + +
+
+ comment: Accepted at CVPR 2024, Project page: + https://baltatzisv.github.io/neural-sign-actors/ +
+
+
+
+
+ + ♻ ☆ Localization Is All You Evaluate: Data Leakage in Online Mapping + Datasets and How to Fix It + + +
+ The task of online mapping is to predict a local map using current sensor +observations, e.g. from lidar and camera, without relying on a pre-built map. +State-of-the-art methods are based on supervised learning and are trained +predominantly using two datasets: nuScenes and Argoverse 2. However, these +datasets revisit the same geographic locations across training, validation, and +test sets. Specifically, over $80$% of nuScenes and $40$% of Argoverse 2 +validation and test samples are less than $5$ m from a training sample. At test +time, the methods are thus evaluated more on how well they localize within a +memorized implicit map built from the training data than on extrapolating to +unseen locations. Naturally, this data leakage causes inflated performance +numbers and we propose geographically disjoint data splits to reveal the true +performance in unseen environments. Experimental results show that methods +perform considerably worse, some dropping more than $45$ mAP, when trained and +evaluated on proper data splits. Additionally, a reassessment of prior design +choices reveals diverging conclusions from those based on the original split. +Notably, the impact of lifting methods and the support from auxiliary tasks +(e.g., depth supervision) on performance appears less substantial or follows a +different trajectory than previously perceived. Splits can be found at +https://github.com/LiljaAdam/geographical-splits + +
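+ The reported overlap statistic can be pictured as a nearest-neighbour query between evaluation and training ego positions; the sketch below is illustrative (planar coordinates in metres are assumed), and the actual geographically disjoint splits should be taken from the linked repository.
+ import numpy as np
+ from scipy.spatial import cKDTree
+
+ def leakage_fraction(train_xy, eval_xy, radius_m=5.0):
+     # train_xy, eval_xy: (N, 2) planar ego positions in metres
+     tree = cKDTree(train_xy)
+     dists, _ = tree.query(eval_xy, k=1)          # distance to the nearest training sample
+     return float((dists < radius_m).mean())      # share of eval samples within the radius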
+
+
+
+
+ + ♻ ☆ Contextual Encoder-Decoder Network for Visual Saliency Prediction + + +
+ Predicting salient regions in natural images requires the detection of +objects that are present in a scene. To develop robust representations for this +challenging task, high-level visual features at multiple spatial scales must be +extracted and augmented with contextual information. However, existing models +aimed at explaining human fixation maps do not incorporate such a mechanism +explicitly. Here we propose an approach based on a convolutional neural network +pre-trained on a large-scale image classification task. The architecture forms +an encoder-decoder structure and includes a module with multiple convolutional +layers at different dilation rates to capture multi-scale features in parallel. +Moreover, we combine the resulting representations with global scene +information for accurately predicting visual saliency. Our model achieves +competitive and consistent results across multiple evaluation metrics on two +public saliency benchmarks and we demonstrate the effectiveness of the +suggested approach on five datasets and selected examples. Compared to state of +the art approaches, the network is based on a lightweight image classification +backbone and hence presents a suitable choice for applications with limited +computational resources, such as (virtual) robotic systems, to estimate human +fixations across complex natural scenes. + +
+
+ comment: Updated contact information +
+
+
+
+
+ + ♻ ☆ Single Domain Generalization for Crowd Counting CVPR2024 + + +
+ Due to its promising results, density map regression has been widely employed +for image-based crowd counting. The approach, however, often suffers from +severe performance degradation when tested on data from unseen scenarios, the +so-called "domain shift" problem. To address the problem, we investigate in +this work single domain generalization (SDG) for crowd counting. The existing +SDG approaches are mainly for image classification and segmentation, and can +hardly be extended to our case due to its regression nature and label ambiguity +(i.e., ambiguous pixel-level ground truths). We propose MPCount, a novel +effective SDG approach even for narrow source distribution. MPCount stores +diverse density values for density map regression and reconstructs +domain-invariant features by means of only one memory bank, a content error +mask and attention consistency loss. By partitioning the image into grids, it +employs patch-wise classification as an auxiliary task to mitigate label +ambiguity. Through extensive experiments on different datasets, MPCount is +shown to significantly improve counting accuracy compared to the state of the +art under diverse scenarios unobserved in the training data characterized by +narrow source distribution. Code is available at +https://github.com/Shimmer93/MPCount. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ One model to use them all: Training a segmentation model with + complementary datasets + + +
+ Understanding a surgical scene is crucial for computer-assisted surgery +systems to provide any intelligent assistance functionality. One way of +achieving this scene understanding is via scene segmentation, where every pixel +of a frame is classified and therefore identifies the visible structures and +tissues. Progress on fully segmenting surgical scenes has been made using +machine learning. However, such models require large amounts of annotated +training data, containing examples of all relevant object classes. Such fully +annotated datasets are hard to create, as every pixel in a frame needs to be +annotated by medical experts; such datasets are therefore rarely available. In this +work, we propose a method to combine multiple partially annotated datasets, +which provide complementary annotations, into one model, enabling better scene +segmentation and the use of multiple readily available datasets. Our method +aims to combine available data with complementary labels by leveraging mutually +exclusive properties to maximize information. Specifically, we propose to use +positive annotations of other classes as negative samples and to exclude +background pixels of binary annotations, as we cannot tell if they contain a +class not annotated but predicted by the model. We evaluate our method by +training a DeepLabV3 on the publicly available Dresden Surgical Anatomy +Dataset, which provides multiple subsets of binary segmented anatomical +structures. Our approach successfully combines 6 classes into one model, +increasing the overall Dice Score by 4.4% compared to an ensemble of models +trained on the classes individually. By including information on multiple +classes, we were able to reduce confusion between stomach and colon by 24%. Our +results demonstrate the feasibility of training a model on multiple datasets. +This paves the way for future work further alleviating the need for a single large, +fully segmented dataset. + +
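+ A hedged sketch of the label-combination idea in this abstract: positives of other classes
+ serve as negatives, while unlabeled background pixels of binary masks are ignored. Tensor
+ shapes, names, and the BCE formulation are illustrative assumptions, not the paper's code:
+
+     import torch
+     import torch.nn.functional as F
+
+     def partial_bce_loss(logits, pos_masks):
+         # logits, pos_masks: (B, C, H, W); pos_masks[b, c] marks pixels annotated as class c.
+         any_pos = pos_masks.sum(dim=1, keepdim=True) > 0     # pixel is annotated for some class
+         negatives = any_pos & (pos_masks == 0)               # other classes' positives act as negatives
+         valid = pos_masks.bool() | negatives                 # unlabeled background is excluded
+         loss = F.binary_cross_entropy_with_logits(logits, pos_masks.float(), reduction="none")
+         return (loss * valid).sum() / valid.sum().clamp(min=1)
+
+     logits = torch.randn(2, 6, 64, 64)
+     masks = (torch.rand(2, 6, 64, 64) > 0.9).float()
+     print(partial_bce_loss(logits, masks))
+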
+
+ comment: Accepted at IPCAI 2024; submitted to IJCARS (under revision) +
+
+
+
+
+ + ♻ ☆ Part-Attention Based Model Make Occluded Person Re-Identification + Stronger + + +
+ The goal of occluded person re-identification (ReID) is to retrieve specific +pedestrians in occluded situations. However, occluded person ReID still suffers +from background clutter and low-quality local feature representations, which +limits model performance. In our research, we introduce a new framework called +PAB-ReID, which is a novel ReID model incorporating part-attention mechanisms +to tackle the aforementioned issues effectively. Firstly, we introduce +human parsing labels to guide the generation of more accurate human part +attention maps. In addition, we propose a fine-grained feature focuser for +generating fine-grained human local feature representations while suppressing +background interference. Moreover, we design a part triplet loss to +supervise the learning of human local features, which optimizes +intra/inter-class distances. We conducted extensive experiments on specialized +occlusion and regular ReID datasets, showcasing that our approach outperforms +the existing state-of-the-art methods. + +
+
+ comment: Accepted By International Joint Conference on Neural Networks 2024 +
+
+
+
+
+ + ♻ ☆ OMH: Structured Sparsity via Optimally Matched Hierarchy for + Unsupervised Semantic Segmentation + + +
+ Unsupervised Semantic Segmentation (USS) involves segmenting images without +relying on predefined labels, aiming to alleviate the burden of extensive human +labeling. Existing methods utilize features generated by self-supervised models +and specific priors for clustering. However, their clustering objectives are +not involved in the optimization of the features during training. Additionally, +due to the lack of clear class definitions in USS, the resulting segments may +not align well with the clustering objective. In this paper, we introduce a +novel approach called Optimally Matched Hierarchy (OMH) to simultaneously +address the above issues. The core of our method lies in imposing structured +sparsity on the feature space, which allows the features to encode information +with different levels of granularity. The structure of this sparsity stems from +our hierarchy (OMH). To achieve this, we learn a soft but sparse hierarchy +among parallel clusters through Optimal Transport. Our OMH yields better +unsupervised segmentation performance compared to existing USS methods. Our +extensive experiments demonstrate the benefits of OMH when utilizing our +differentiable paradigm. We will make our code publicly available. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ SADA: Semantic adversarial unsupervised domain adaptation for Temporal + Action Localization + + +
+ Temporal Action Localization (TAL) is a complex task that poses relevant +challenges, particularly when attempting to generalize on new -- unseen -- +domains in real-world applications. These scenarios, despite being realistic, are +often neglected in the literature, exposing existing solutions to severe +performance degradation. In this work, we tackle this issue by introducing, for +the first time, an approach for Unsupervised Domain Adaptation (UDA) in sparse +TAL, which we refer to as Semantic Adversarial unsupervised Domain Adaptation +(SADA). Our contributions are threefold: (1) we pioneer the development of a +domain adaptation model that operates on realistic sparse action detection +benchmarks; (2) we tackle the limitations of global-distribution alignment +techniques by introducing a novel adversarial loss that is sensitive to local +class distributions, ensuring finer-grained adaptation; and (3) we present a +novel set of benchmarks based on EpicKitchens100 and CharadesEgo that evaluate +multiple domain shifts in a comprehensive manner. Our experiments indicate that +SADA improves the adaptation across domains when compared to fully supervised +state-of-the-art and alternative UDA methods, attaining a performance boost of +up to 6.14% mAP. + +
+
+
+
+
+ + ♻ ☆ SCILLA: SurfaCe Implicit Learning for Large Urban Area, a volumetric + hybrid solution + + +
+ Neural implicit surface representation methods have recently shown impressive +3D reconstruction results. However, existing solutions struggle to reconstruct +urban outdoor scenes due to their large, unbounded, and highly detailed nature. +Hence, to achieve accurate reconstructions, additional supervision data such as +LiDAR, strong geometric priors, and long training times are required. To tackle +such issues, we present SCILLA, a new hybrid implicit surface learning method +to reconstruct large driving scenes from 2D images. SCILLA's hybrid +architecture models two separate implicit fields: one for the volumetric +density and another for the signed distance to the surface. To accurately +represent urban outdoor scenarios, we introduce a novel volume-rendering +strategy that relies on self-supervised probabilistic density estimation to +sample points near the surface and transition progressively from volumetric to +surface representation. Our solution permits a proper and fast initialization +of the signed distance field without relying on any geometric prior on the +scene, compared to concurrent methods. By conducting extensive experiments on +four outdoor driving datasets, we show that SCILLA can learn an accurate and +detailed 3D surface scene representation in various urban scenarios while being +two times faster to train compared to previous state-of-the-art solutions. + +
+
+
+
+
+ + ♻ ☆ QuickQuakeBuildings: Post-earthquake SAR-Optical Dataset for Quick + Damaged-building Detection + + +
+ Quick and automated earthquake-damaged building detection from post-event +satellite imagery is crucial, yet it is challenging due to the scarcity of +training data required to develop robust algorithms. This letter presents the +first dataset dedicated to detecting earthquake-damaged buildings from +post-event very high resolution (VHR) Synthetic Aperture Radar (SAR) and +optical imagery. Utilizing open satellite imagery and annotations acquired +after the 2023 Turkey-Syria earthquakes, we deliver a dataset of coregistered +building footprints and satellite image patches of both SAR and optical data, +encompassing more than four thousand buildings. The task of damaged building +detection is formulated as a binary image classification problem, which can also +be treated as an anomaly detection problem due to extreme class imbalance. We +provide baseline methods and results to serve as references for comparison. +Researchers can utilize this dataset to expedite algorithm development, +facilitating the rapid detection of damaged buildings in response to future +events. The dataset and code, together with detailed explanations and +visualizations, are made publicly available at +\url{https://github.com/ya0-sun/PostEQ-SARopt-BuildingDamage}. + +
+
+
+
+
+ + ♻ ☆ SPOT: Self-Training with Patch-Order Permutation for Object-Centric + Learning with Autoregressive Transformers CVPR 2024 + + +
+ Unsupervised object-centric learning aims to decompose scenes into +interpretable object entities, termed slots. Slot-based auto-encoders stand out +as a prominent method for this task. Within them, crucial aspects include +guiding the encoder to generate object-specific slots and ensuring the decoder +utilizes them during reconstruction. This work introduces two novel techniques, +(i) an attention-based self-training approach, which distills superior +slot-based attention masks from the decoder to the encoder, enhancing object +segmentation, and (ii) an innovative patch-order permutation strategy for +autoregressive transformers that strengthens the role of slot vectors in +reconstruction. The effectiveness of these strategies is showcased +experimentally. The combined approach significantly surpasses prior slot-based +autoencoder methods in unsupervised object segmentation, especially with +complex real-world images. We provide the implementation code at +https://github.com/gkakogeorgiou/spot . + +
+
+ comment: CVPR 2024 (Highlight). Code: https://github.com/gkakogeorgiou/spot +
+
+
+
+
+ + ♻ ☆ Learning Enriched Features via Selective State Spaces Model for + Efficient Image Deblurring + + +
+ Image deblurring aims to restore a high-quality image from its corresponding +blurred counterpart. The emergence of CNNs and Transformers has enabled significant +progress. However, these methods often face the dilemma between eliminating +long-range degradation perturbations and maintaining computational efficiency. +While the selective state space model (SSM) shows promise in modeling +long-range dependencies with linear complexity, it also encounters challenges +such as local pixel forgetting and channel redundancy. To address this issue, +we propose an efficient image deblurring network that leverages a selective state +space model to aggregate enriched and accurate features. Specifically, we +introduce an aggregate local and global information block (ALGBlock) designed +to effectively capture and integrate both local invariant properties and +non-local information. The ALGBlock comprises two primary modules: a module for +capturing local and global features (CLGF), and a feature aggregation module +(FA). The CLGF module is composed of two branches: the global branch captures +long-range dependency features via a selective state space model, while the +local branch employs simplified channel attention to model local connectivity, +thereby reducing local pixel forgetting and channel redundancy. In addition, we +design an FA module to accentuate the local part by recalibrating the weight +during the aggregation of the two branches for restoration. Experimental +results demonstrate that the proposed method outperforms state-of-the-art +approaches on widely used benchmarks. + +
+
+
+
+
+ + ♻ ☆ Sculpting Holistic 3D Representation in Contrastive Language-Image-3D + Pre-training CVPR 2024 + + +
+ Contrastive learning has emerged as a promising paradigm for 3D open-world +understanding, i.e., aligning point cloud representation to image and text +embedding space individually. In this paper, we introduce MixCon3D, a simple +yet effective method aiming to sculpt holistic 3D representation in contrastive +language-image-3D pre-training. In contrast to point cloud only, we develop the +3D object-level representation from complementary perspectives, e.g., +multi-view rendered images with the point cloud. Then, MixCon3D performs +language-3D contrastive learning, comprehensively depicting real-world 3D +objects and bolstering text alignment. Additionally, we pioneer the first +thorough investigation of various training recipes for the 3D contrastive +learning paradigm, building a solid baseline with improved performance. +Extensive experiments conducted on three representative benchmarks reveal that +our method significantly improves over the baseline, surpassing the previous +state-of-the-art performance on the challenging 1,156-category Objaverse-LVIS +dataset by 5.7%. The versatility of MixCon3D is showcased in applications such +as text-to-3D retrieval and point cloud captioning, further evidencing its +efficacy in diverse scenarios. The code is available at +https://github.com/UCSC-VLAA/MixCon3D. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ MO-YOLO: End-to-End Multiple-Object Tracking Method with YOLO and + Decoder + + +
+ In the field of multi-object tracking (MOT), recent Transformer-based +end-to-end models like MOTR have demonstrated exceptional performance on +datasets such as DanceTrack. However, the computational demands of these +models present challenges in training and deployment. Drawing inspiration from +successful models like GPT, we present MO-YOLO, an efficient and +computationally frugal end-to-end MOT model. MO-YOLO integrates principles from +You Only Look Once (YOLO) and RT-DETR, adopting a decoder-only approach. By +leveraging the decoder from RT-DETR and architectural components from YOLOv8, +MO-YOLO achieves high speed, shorter training times, and proficient MOT +performance. On DanceTrack, MO-YOLO not only matches MOTR's performance but +also surpasses it, achieving over twice the frames per second (MOTR 9.5 FPS, +MO-YOLO 19.6 FPS). Furthermore, MO-YOLO demonstrates significantly reduced +training times and lower hardware requirements compared to MOTR. This research +introduces a promising paradigm for efficient end-to-end MOT, emphasizing +enhanced performance and resource efficiency. + +
+
+
+
+
+ + ♻ ☆ GaussianCube: Structuring Gaussian Splatting using Optimal Transport for + 3D Generative Modeling + + +
+ 3D Gaussian Splatting (GS) has achieved considerable improvement over Neural +Radiance Fields in terms of 3D fitting fidelity and rendering speed. However, +this unstructured representation with scattered Gaussians poses a significant +challenge for generative modeling. To address the problem, we introduce +GaussianCube, a structured GS representation that is both powerful and +efficient for generative modeling. We achieve this by first proposing a +modified densification-constrained GS fitting algorithm which can yield +high-quality fitting results using a fixed number of free Gaussians, and then +re-arranging the Gaussians into a predefined voxel grid via Optimal Transport. +The structured grid representation allows us to use a standard 3D U-Net as our +backbone in diffusion generative modeling without elaborate designs. Extensive +experiments conducted on ShapeNet and OmniObject3D show that our model achieves +state-of-the-art generation results both qualitatively and quantitatively, +underscoring the potential of GaussianCube as a powerful and versatile 3D +representation. + +
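+ A toy sketch of the "re-arrange free Gaussians into a voxel grid" step, using a linear
+ assignment (equal-mass optimal transport) between Gaussian centers and voxel centers; the
+ tiny 8^3 grid and all names are illustrative, not the paper's implementation:
+
+     import numpy as np
+     from scipy.optimize import linear_sum_assignment
+
+     def assign_to_grid(centers: np.ndarray, grid_res: int = 8) -> np.ndarray:
+         lin = (np.arange(grid_res) + 0.5) / grid_res
+         grid = np.stack(np.meshgrid(lin, lin, lin, indexing="ij"), axis=-1).reshape(-1, 3)
+         cost = np.linalg.norm(centers[:, None] - grid[None], axis=-1)   # pairwise distances
+         rows, cols = linear_sum_assignment(cost)                        # one Gaussian per voxel
+         return cols[np.argsort(rows)]                                   # voxel index per Gaussian
+
+     gaussian_centers = np.random.rand(8 ** 3, 3)
+     print(assign_to_grid(gaussian_centers).shape)
+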
+
+ comment: Fix typo in Eq.2; Project Page: https://gaussiancube.github.io/ +
+
+
+
+
+ + ♻ ☆ Mind the Exit Pupil Gap: Revisiting the Intrinsics of a Standard + Plenoptic Camera + + +
+ Among the common applications of plenoptic cameras are depth reconstruction +and post-shot refocusing. These require a calibration relating the camera-side +light field to that of the scene. Numerous methods with this goal have been +developed based on thin lens models for the plenoptic camera's main lens and +microlenses. Our work addresses the often-overlooked role of the main lens exit +pupil in these models and specifically in the decoding process of standard +plenoptic camera (SPC) images. We formally deduce the connection between the +refocusing distance and the resampling parameter for the decoded light field +and provide an analysis of the errors that arise when the exit pupil is not +considered. In addition, previous work is revisited with respect to the exit +pupil's role and all theoretical results are validated through a +ray-tracing-based simulation. With the public release of the evaluated SPC +designs alongside our simulation and experimental data we aim to contribute to +a more accurate and nuanced understanding of plenoptic camera optics. + +
+
+ comment: 29 pages, 16 figures, Accepted for publication in MDPI Sensors, + Special Issue 'Short-Range Optical 3D Scanning and 3D Data Processing ' +
+
+
+
+
+ + ♻ ☆ Theoretical and Empirical Analysis of a Fast Algorithm for Extracting + Polygons from Signed Distance Bounds + + +
+ Recently there has been renewed interest in signed distance bound +representations due to their unique properties for 3D shape modelling. This is +especially the case for deep learning-based bounds. However, it is beneficial +to work with polygons in most computer-graphics applications. Thus, in this +paper we introduce and investigate an asymptotically fast method for +transforming signed distance bounds into polygon meshes. This is achieved by +combining the principles of sphere tracing (or ray marching) with traditional +polygonization techniques, such as Marching Cubes. We provide theoretical and +experimental evidence that this approach is of the $O(N^2\log N)$ computational +complexity for a polygonization grid with $N^3$ cells. The algorithm is tested +on both a set of primitive shapes as well as signed distance bounds generated +from point clouds by machine learning (and represented as neural networks). +Given its speed, implementation simplicity and portability, we argue that it +could prove useful during the modelling stage as well as in shape compression +for storage. + The code is available here: https://github.com/nenadmarkus/gridhopping + +
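+ A minimal sketch of the core "hop by the distance bound" idea behind such methods: while
+ marching along a grid axis, empty space is skipped in jumps equal to the signed distance
+ bound, so only cells near the surface are visited. This is a simplified illustration, not
+ the released code:
+
+     import numpy as np
+
+     def sphere_sdf(p, radius=0.4):
+         return float(np.linalg.norm(p)) - radius
+
+     def surface_cells_along_x(sdf, y, z, n=128, lo=-1.0, hi=1.0):
+         """Approximate indices of cells along x that lie within one cell width of the surface."""
+         h = (hi - lo) / n
+         cells, x = [], lo + 0.5 * h
+         while x < hi:
+             d = sdf(np.array([x, y, z]))
+             if abs(d) <= h:
+                 cells.append(int((x - lo) / h))
+                 x += h                    # near the surface: step cell by cell
+             else:
+                 x += max(abs(d), h)       # empty space: hop ahead by the distance bound
+         return cells
+
+     print(surface_cells_along_x(sphere_sdf, y=0.0, z=0.0))
+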
+
+
+
+
+ + ♻ ☆ InstantAvatar: Efficient 3D Head Reconstruction via Surface Rendering + + +
+ Recent advances in full-head reconstruction have been obtained by optimizing +a neural field through differentiable surface or volume rendering to represent +a single scene. While these techniques achieve an unprecedented accuracy, they +take several minutes, or even hours, due to the expensive optimization process +required. In this work, we introduce InstantAvatar, a method that recovers +full-head avatars from a few images (down to just one) in a few seconds on +commodity hardware. In order to speed up the reconstruction process, we propose +a system that combines, for the first time, a voxel-grid neural field +representation with a surface renderer. Notably, a naive combination of these +two techniques leads to unstable optimizations that do not converge to valid +solutions. In order to overcome this limitation, we present a novel statistical +model that learns a prior distribution over 3D head signed distance functions +using a voxel-grid based architecture. The use of this prior model, in +combination with other design choices, results in a system that achieves 3D +head reconstructions with accuracy comparable to the state-of-the-art with a +100x speed-up. + +
+
+
+
+
+ + ♻ ☆ EVREAL: Towards a Comprehensive Benchmark and Analysis Suite for + Event-based Video Reconstruction CVPR + + +
+ Event cameras are a new type of vision sensor that incorporates asynchronous +and independent pixels, offering advantages over traditional frame-based +cameras such as high dynamic range and minimal motion blur. However, their +output is not easily understandable by humans, making the reconstruction of +intensity images from event streams a fundamental task in event-based vision. +While recent deep learning-based methods have shown promise in video +reconstruction from events, this problem is not completely solved yet. To +facilitate comparison between different approaches, standardized evaluation +protocols and diverse test datasets are essential. This paper proposes a +unified evaluation methodology and introduces an open-source framework called +EVREAL to comprehensively benchmark and analyze various event-based video +reconstruction methods from the literature. Using EVREAL, we give a detailed +analysis of the state-of-the-art methods for event-based video reconstruction, +and provide valuable insights into the performance of these methods under +varying settings, challenging scenarios, and downstream tasks. + +
+
+ comment: 19 pages, 9 figures. Has been accepted for publication at the IEEE + Conference on Computer Vision and Pattern Recognition Workshops (CVPRW), + Vancouver, 2023. The project page can be found at + https://ercanburak.github.io/evreal.html +
+
+
+
+
+ + ♻ ☆ Causal Mode Multiplexer: A Novel Framework for Unbiased Multispectral + Pedestrian Detection CVPR2024 + + +
+ RGBT multispectral pedestrian detection has emerged as a promising solution +for safety-critical applications that require day/night operations. However, +the modality bias problem remains unsolved as multispectral pedestrian +detectors learn the statistical bias in datasets. Specifically, datasets in +multispectral pedestrian detection mainly distribute between ROTO (day) and +RXTO (night) data; the majority of the pedestrian labels statistically co-occur +with their thermal features. As a result, multispectral pedestrian detectors +show poor generalization ability on examples beyond this statistical +correlation, such as ROTX data. To address this problem, we propose a novel +Causal Mode Multiplexer (CMM) framework that effectively learns the causalities +between multispectral inputs and predictions. Moreover, we construct a new +dataset (ROTX-MP) to evaluate modality bias in multispectral pedestrian +detection. ROTX-MP mainly includes ROTX examples not presented in previous +datasets. Extensive experiments demonstrate that our proposed CMM framework +generalizes well on existing datasets (KAIST, CVC-14, FLIR) and the new +ROTX-MP. We will release our new dataset to the public for future research. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ Learning Prompt with Distribution-Based Feature Replay for Few-Shot + Class-Incremental Learning + + +
+ Few-shot Class-Incremental Learning (FSCIL) aims to continuously learn new +classes based on very limited training data without forgetting the old ones +encountered. Existing studies rely solely on pure visual networks, while in +this paper we solve FSCIL by leveraging a Vision-Language model (e.g., CLIP) +and propose a simple yet effective framework, named Learning Prompt with +Distribution-based Feature Replay (LP-DiF). We observe that simply using CLIP +for zero-shot evaluation can substantially outperform the most influential +methods. Then, a prompt tuning technique is employed to further improve its +adaptation ability, allowing the model to continually capture specific +knowledge from each session. To prevent the learnable prompt from forgetting +old knowledge in the new session, we propose a pseudo-feature replay approach. +Specifically, we preserve the old knowledge of each class by maintaining a +feature-level Gaussian distribution with a diagonal covariance matrix, which is +estimated by the image features of training images and synthesized features +generated from a VAE. When progressing to a new session, pseudo-features are +sampled from old-class distributions combined with training images of the +current session to optimize the prompt, thus enabling the model to learn new +knowledge while retaining old knowledge. Experiments on three prevalent +benchmarks, i.e., CIFAR100, mini-ImageNet, CUB-200, and two more challenging +benchmarks, i.e., SUN-397 and CUB-200$^*$ proposed in this paper showcase the +superiority of LP-DiF, achieving new state-of-the-art (SOTA) results in FSCIL. Code is +publicly available at https://github.com/1170300714/LP-DiF. + +
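+ A small illustrative sketch of diagonal-Gaussian pseudo-feature replay as described above:
+ keep a per-class feature mean and variance from earlier sessions and sample synthetic
+ features from them in later sessions. All names and shapes are assumptions, not the released code:
+
+     import torch
+
+     class FeatureReplay:
+         def __init__(self):
+             self.stats = {}                                  # class_id -> (mean, var)
+
+         def update(self, class_id: int, feats: torch.Tensor):
+             self.stats[class_id] = (feats.mean(0), feats.var(0) + 1e-6)
+
+         def sample(self, class_id: int, n: int) -> torch.Tensor:
+             mean, var = self.stats[class_id]
+             return mean + var.sqrt() * torch.randn(n, mean.numel())
+
+     replay = FeatureReplay()
+     replay.update(0, torch.randn(50, 512))                   # features of an old class
+     print(replay.sample(0, 8).shape)                         # pseudo-features for a new session
+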
+
+
+
+
+ + ♻ ☆ Your Student is Better Than Expected: Adaptive Teacher-Student + Collaboration for Text-Conditional Diffusion Models CVPR2024 + + +
+ Knowledge distillation methods have recently been shown to be a promising +direction to speed up the synthesis of large-scale diffusion models by requiring +only a few inference steps. While several powerful distillation methods were +recently proposed, the overall quality of student samples is typically lower +compared to the teacher ones, which hinders their practical usage. In this +work, we investigate the relative quality of samples produced by the teacher +text-to-image diffusion model and its distilled student version. As our main +empirical finding, we discover that a noticeable portion of student samples +exhibit superior fidelity compared to the teacher ones, despite the +"approximate" nature of the student. Based on this finding, we propose an +adaptive collaboration between student and teacher diffusion models for +effective text-to-image synthesis. Specifically, the distilled model produces +the initial sample, and then an oracle decides whether it needs further +improvements with a slow teacher model. Extensive experiments demonstrate that +the designed pipeline surpasses state-of-the-art text-to-image alternatives for +various inference budgets in terms of human preference. Furthermore, the +proposed approach can be naturally used in popular applications such as +text-guided image editing and controllable generation. + +
+
+ comment: CVPR2024 camera ready v2 +
+
+
+
+
+ + ♻ ☆ Predicting Traffic Flow with Federated Learning and Graph Neural with + Asynchronous Computations Network + + +
+ Real-time traffic flow prediction holds significant importance within the +domain of Intelligent Transportation Systems (ITS). The task of achieving a +balance between prediction precision and computational efficiency presents a +significant challenge. In this article, we present a novel deep-learning method +called Federated Learning and Asynchronous Graph Convolutional Network +(FLAGCN). Our framework incorporates the principles of asynchronous graph +convolutional networks with federated learning to enhance the accuracy and +efficiency of real-time traffic flow prediction. The FLAGCN model employs a +spatial-temporal graph convolution technique to asynchronously address +spatio-temporal dependencies within traffic data effectively. To efficiently +handle the computational requirements associated with this deep learning model, +this study used a graph federated learning technique known as GraphFL. This +approach is designed to facilitate the training process. The experimental +results obtained from conducting tests on two distinct traffic datasets +demonstrate that the utilization of FLAGCN leads to the optimization of both +training and inference durations while maintaining a high level of prediction +accuracy. FLAGCN outperforms existing models with significant improvements by +achieving up to an approximately 6.85% reduction in RMSE and a 20.45% reduction in +MAPE compared to the best-performing existing models. + +
+
+ comment: I request to withdraw my paper from arXiv due to significant updates + and improvements identified post-submission. These enhancements will + substantially elevate the work's quality and impact. I plan to resubmit the + revised paper upon completion of these updates. Thank you for accommodating + this request +
+
+
+
+
+ + ♻ ☆ Generalizable Whole Slide Image Classification with Fine-Grained + Visual-Semantic Interaction CVPR 2024 + + +
+ Whole Slide Image (WSI) classification is often formulated as a Multiple +Instance Learning (MIL) problem. Recently, Vision-Language Models (VLMs) have +demonstrated remarkable performance in WSI classification. However, existing +methods leverage coarse-grained pathogenetic descriptions for visual +representation supervision, which are insufficient to capture the complex +visual appearance of pathogenetic images, hindering the generalizability of +models on diverse downstream tasks. Additionally, processing high-resolution +WSIs can be computationally expensive. In this paper, we propose a novel +"Fine-grained Visual-Semantic Interaction" (FiVE) framework for WSI +classification. It is designed to enhance the model's generalizability by +leveraging the interaction between localized visual patterns and fine-grained +pathological semantics. Specifically, with meticulously designed queries, we +start by utilizing a large language model to extract fine-grained pathological +descriptions from various non-standardized raw reports. The output descriptions +are then reconstructed into fine-grained labels used for training. By +introducing a Task-specific Fine-grained Semantics (TFS) module, we enable +prompts to capture crucial visual information in WSIs, which enhances +representation learning and augments generalization capabilities significantly. +Furthermore, given that pathological visual patterns are redundantly +distributed across tissue slices, we sample a subset of visual instances during +training. Our method demonstrates robust generalizability and strong +transferability, dominantly outperforming the counterparts on the TCGA Lung +Cancer dataset with at least 9.19% higher accuracy in few-shot experiments. The +code is available at: https://github.com/ls1rius/WSI_FiVE. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Dynamic Adapter Meets Prompt Tuning: Parameter-Efficient Transfer + Learning for Point Cloud Analysis CVPR 2024 + + +
+ Point cloud analysis has achieved outstanding performance by transferring +point cloud pre-trained models. However, existing methods for model adaptation +usually update all model parameters, i.e., full fine-tuning paradigm, which is +inefficient as it relies on high computational costs (e.g., training GPU +memory) and massive storage space. In this paper, we aim to study +parameter-efficient transfer learning for point cloud analysis with an ideal +trade-off between task performance and parameter efficiency. To achieve this +goal, we freeze the parameters of the default pre-trained models and then +propose the Dynamic Adapter, which generates a dynamic scale for each token, +considering the token significance to the downstream task. We further +seamlessly integrate Dynamic Adapter with Prompt Tuning (DAPT) by constructing +Internal Prompts, capturing the instance-specific features for interaction. +Extensive experiments conducted on five challenging datasets demonstrate that +the proposed DAPT achieves superior performance compared to the full +fine-tuning counterparts while significantly reducing the trainable parameters +and training GPU memory by 95% and 35%, respectively. Code is available at +https://github.com/LMD0311/DAPT. + +
+
+ comment: Accepted to CVPR 2024. Code is available at + https://github.com/LMD0311/DAPT +
+
+
+
+
+ + ♻ ☆ CapsFusion: Rethinking Image-Text Data at Scale CVPR 2024 + + +
+ Large multimodal models demonstrate remarkable generalist ability to perform +diverse multimodal tasks in a zero-shot manner. Large-scale web-based +image-text pairs contribute fundamentally to this success, but suffer from +excessive noise. Recent studies use alternative captions synthesized by +captioning models and have achieved notable benchmark performance. However, our +experiments reveal significant Scalability Deficiency and World Knowledge Loss +issues in models trained with synthetic captions, which have been largely +obscured by their initial benchmark success. Upon closer examination, we +identify the root cause as the overly-simplified language structure and lack of +knowledge details in existing synthetic captions. To provide higher-quality and +more scalable multimodal pretraining data, we propose CapsFusion, an advanced +framework that leverages large language models to consolidate and refine +information from both web-based image-text pairs and synthetic captions. +Extensive experiments show that CapsFusion captions exhibit remarkable +all-round superiority over existing captions in terms of model performance +(e.g., 18.8 and 18.3 improvements in CIDEr score on COCO and NoCaps), sample +efficiency (requiring 11-16 times less computation than baselines), world +knowledge depth, and scalability. These effectiveness, efficiency and +scalability advantages position CapsFusion as a promising candidate for future +scaling of LMM training. + +
+
+ comment: CVPR 2024. Code & Dataset: https://github.com/baaivision/CapsFusion +
+
+
+
+
+ + ♻ ☆ Visual Program Distillation: Distilling Tools and Programmatic Reasoning + into Vision-Language Models CVPR 2024 + + +
+ Solving complex visual tasks such as "Who invented the musical instrument on +the right?" involves a composition of skills: understanding space, recognizing +instruments, and also retrieving prior knowledge. Recent work shows promise by +decomposing such tasks using a large language model (LLM) into an executable +program that invokes specialized vision models. However, generated programs are +error-prone: they omit necessary steps, include spurious ones, and are unable +to recover when the specialized models give incorrect outputs. Moreover, they +require loading multiple models, incurring high latency and computation costs. +We propose Visual Program Distillation (VPD), an instruction tuning framework +that produces a vision-language model (VLM) capable of solving complex visual +tasks with a single forward pass. VPD distills the reasoning ability of LLMs by +using them to sample multiple candidate programs, which are then executed and +verified to identify a correct one. It translates each correct program into a +language description of the reasoning steps, which are then distilled into a +VLM. Extensive experiments show that VPD improves the VLM's ability to count, +understand spatial relations, and reason compositionally. Our VPD-trained +PaLI-X outperforms all prior VLMs, achieving state-of-the-art performance +across complex vision tasks, including MMBench, OK-VQA, A-OKVQA, TallyQA, POPE, +and Hateful Memes. An evaluation with human annotators also confirms that VPD +improves model response factuality and consistency. Finally, experiments on +content moderation demonstrate that VPD is also helpful for adaptation to +real-world applications with limited data. + +
+
+ comment: CVPR 2024 Oral +
+
+
+
+
+ + ♻ ☆ Detecting Heart Disease from Multi-View Ultrasound Images via Supervised + Attention Multiple Instance Learning + + +
+ Aortic stenosis (AS) is a degenerative valve condition that causes +substantial morbidity and mortality. This condition is under-diagnosed and +under-treated. In clinical practice, AS is diagnosed with expert review of +transthoracic echocardiography, which produces dozens of ultrasound images of +the heart. Only some of these views show the aortic valve. To automate +screening for AS, deep networks must learn to mimic a human expert's ability to +identify views of the aortic valve and then aggregate across these relevant images +to produce a study-level diagnosis. We find previous approaches to AS detection +yield insufficient accuracy due to relying on inflexible averages across +images. We further find that off-the-shelf attention-based multiple instance +learning (MIL) performs poorly. We contribute a new end-to-end MIL approach +with two key methodological innovations. First, a supervised attention +technique guides the learned attention mechanism to favor relevant views. +Second, a novel self-supervised pretraining strategy applies contrastive +learning on the representation of the whole study instead of individual images +as commonly done in prior literature. Experiments on an open-access dataset and +an external validation set show that our approach yields higher accuracy while +reducing model size. + +
+
+ comment: Echocardiogram; multiple-instance learning; self-supervised learning; + semi-supervised learning; medical imaging +
+
+
+
+
+ + ♻ ☆ FashionEngine: Interactive Generation and Editing of 3D Clothed Humans + + +
+ We present FashionEngine, an interactive 3D human generation and editing +system that allows us to design 3D digital humans in a way that aligns with how +humans interact with the world, such as natural languages, visual perceptions, +and hand-drawing. FashionEngine automates the 3D human production with three +key components: 1) A pre-trained 3D human diffusion model that learns to model +3D humans in a semantic UV latent space from 2D image training data, which +provides strong priors for diverse generation and editing tasks. 2) +Multimodality-UV Space encoding the texture appearance, shape topology, and +textual semantics of human clothing in a canonical UV-aligned space, which +faithfully aligns the user multimodal inputs with the implicit UV latent space +for controllable 3D human editing. The multimodality-UV space is shared across +different user inputs, such as texts, images, and sketches, which enables +various joint multimodal editing tasks. 3) Multimodality-UV Aligned Sampler +learns to sample high-quality and diverse 3D humans from the diffusion prior +for multimodal user inputs. Extensive experiments validate FashionEngine's +state-of-the-art performance for conditional generation/editing tasks. In +addition, we present an interactive user interface for our FashionEngine that +enables both conditional and unconditional generation tasks, and editing tasks +including pose/view/shape control, text-, image-, and sketch-driven 3D human +editing and 3D virtual try-on, in a unified framework. Our project page is at: +https://taohuumd.github.io/projects/FashionEngine. + +
+
+ comment: Project Page: https://taohuumd.github.io/projects/FashionEngine +
+
+
+
+
+ + ♻ ☆ WaterVG: Waterway Visual Grounding based on Text-Guided Vision and + mmWave Radar + + +
+ The perception of waterways based on human intent is significant for +autonomous navigation and operations of Unmanned Surface Vehicles (USVs) in +water environments. Inspired by visual grounding, we introduce WaterVG, the +first visual grounding dataset designed for USV-based waterway perception based +on human prompts. WaterVG encompasses prompts describing multiple targets, with +annotations at the instance level including bounding boxes and masks. Notably, +WaterVG includes 11,568 samples with 34,987 referred targets, whose prompts +integrate both visual and radar characteristics. This text-guided, two-sensor +design pairs finer-grained text prompts with the visual and radar +features of the referred targets. Moreover, we propose a low-power visual grounding +model, Potamoi, which is a multi-task model with a well-designed Phased +Heterogeneous Modality Fusion (PHMF) mode, including Adaptive Radar Weighting +(ARW) and Multi-Head Slim Cross Attention (MHSCA). Specifically, ARW extracts +required radar features to fuse with vision for prompt alignment. MHSCA is an +efficient fusion module with a remarkably small parameter count and FLOPs, +elegantly fusing scenario context captured by the two sensors with linguistic +features, which performs impressively on visual grounding tasks. Comprehensive +experiments and evaluations have been conducted on WaterVG, where our Potamoi +achieves state-of-the-art performance compared with its counterparts. + +
+
+ comment: 10 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ RadarDistill: Boosting Radar-based Object Detection Performance via + Knowledge Distillation from LiDAR Features CVPR + + +
+ The inherent noisy and sparse characteristics of radar data pose challenges +in finding effective representations for 3D object detection. In this paper, we +propose RadarDistill, a novel knowledge distillation (KD) method, which can +improve the representation of radar data by leveraging LiDAR data. RadarDistill +successfully transfers desirable characteristics of LiDAR features into radar +features using three key components: Cross-Modality Alignment (CMA), +Activation-based Feature Distillation (AFD), and Proposal-based Feature +Distillation (PFD). CMA enhances the density of radar features by employing +multiple layers of dilation operations, effectively addressing the challenge of +inefficient knowledge transfer from LiDAR to radar. AFD selectively transfers +knowledge based on regions of the LiDAR features, with a specific focus on +areas where activation intensity exceeds a predefined threshold. PFD similarly +guides the radar network to selectively mimic features from the LiDAR network +within the object proposals. Our comparative analyses conducted on the nuScenes +datasets demonstrate that RadarDistill achieves state-of-the-art (SOTA) +performance for radar-only object detection task, recording 20.5% in mAP and +43.7% in NDS. Also, RadarDistill significantly improves the performance of the +camera-radar fusion model. + +
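+ A schematic of activation-thresholded feature distillation in the spirit of the AFD component
+ described above: only locations where the LiDAR (teacher) feature activation exceeds a threshold
+ contribute to the loss. The threshold, norm choice, and names are illustrative assumptions:
+
+     import torch
+     import torch.nn.functional as F
+
+     def activation_masked_distill(radar_feat, lidar_feat, thresh=0.5):
+         # radar_feat, lidar_feat: (B, C, H, W) BEV feature maps of matching shape.
+         activation = lidar_feat.abs().mean(dim=1, keepdim=True)        # per-location intensity
+         mask = (activation > thresh).float()
+         diff = F.mse_loss(radar_feat, lidar_feat, reduction="none").mean(dim=1, keepdim=True)
+         return (diff * mask).sum() / mask.sum().clamp(min=1)
+
+     print(activation_masked_distill(torch.randn(2, 64, 128, 128), torch.randn(2, 64, 128, 128)))
+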
+
+ comment: Accepted to IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR) 2024, 10 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ 94% on CIFAR-10 in 3.29 Seconds on a Single GPU + + +
+ CIFAR-10 is among the most widely used datasets in machine learning, +facilitating thousands of research projects per year. To accelerate research +and reduce the cost of experiments, we introduce training methods for CIFAR-10 +which reach 94% accuracy in 3.29 seconds, 95% in 10.4 seconds, and 96% in 46.3 +seconds, when run on a single NVIDIA A100 GPU. As one factor contributing to +these training speeds, we propose a derandomized variant of horizontal flipping +augmentation, which we show improves over the standard method in every case +where flipping is beneficial over no flipping at all. Our code is released at +https://github.com/KellerJordan/cifar10-airbench. + +
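+ One plausible way to derandomize horizontal flipping (an illustrative guess, not necessarily
+ the released implementation): flip a fixed half of the dataset and swap which half is flipped
+ every epoch, so each image is seen in both orientations equally often:
+
+     import torch
+
+     def deterministic_flip(batch: torch.Tensor, indices: torch.Tensor, epoch: int) -> torch.Tensor:
+         # batch: (B, C, H, W) images; indices: dataset indices of the batch items.
+         flip_mask = (indices % 2) == (epoch % 2)
+         out = batch.clone()
+         out[flip_mask] = torch.flip(out[flip_mask], dims=[-1])
+         return out
+
+     x, idx = torch.randn(8, 3, 32, 32), torch.arange(8)
+     print(deterministic_flip(x, idx, epoch=0).shape)
+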
+
+
+
+
+ + ♻ ☆ Transient Neural Radiance Fields for Lidar View Synthesis and 3D + Reconstruction NeurIPS 2023 + + +
+ Neural radiance fields (NeRFs) have become a ubiquitous tool for modeling +scene appearance and geometry from multiview imagery. Recent work has also +begun to explore how to use additional supervision from lidar or depth sensor +measurements in the NeRF framework. However, previous lidar-supervised NeRFs +focus on rendering conventional camera imagery and use lidar-derived point +cloud data as auxiliary supervision; thus, they fail to incorporate the +underlying image formation model of the lidar. Here, we propose a novel method +for rendering transient NeRFs that take as input the raw, time-resolved photon +count histograms measured by a single-photon lidar system, and we seek to +render such histograms from novel views. Different from conventional NeRFs, the +approach relies on a time-resolved version of the volume rendering equation to +render the lidar measurements and capture transient light transport phenomena +at picosecond timescales. We evaluate our method on a first-of-its-kind dataset +of simulated and captured transient multiview scans from a prototype +single-photon lidar. Overall, our work brings NeRFs to a new dimension of +imaging at transient timescales, newly enabling rendering of transient imagery +from novel views. Additionally, we show that our approach recovers improved +geometry and conventional appearance compared to point cloud-based supervision +when training on few input viewpoints. Transient NeRFs may be especially useful +for applications which seek to simulate raw lidar measurements for downstream +tasks in autonomous driving, robotics, and remote sensing. + +
+
+ comment: NeurIPS 2023, Project Page: https://anaghmalik.com/TransientNeRF/ +
+
+
+
+
+ + ♻ ☆ RF-ULM: Ultrasound Localization Microscopy Learned from Radio-Frequency + Wavefronts + + +
+ In Ultrasound Localization Microscopy (ULM), achieving high-resolution images +relies on the precise localization of contrast agent particles across a series +of beamformed frames. However, our study uncovers an enormous potential: The +process of delay-and-sum beamforming leads to an irreversible reduction of +Radio-Frequency (RF) channel data, while its implications for localization +remain largely unexplored. The rich contextual information embedded within RF +wavefronts, including their hyperbolic shape and phase, offers great promise +for guiding Deep Neural Networks (DNNs) in challenging localization scenarios. +To fully exploit this data, we propose to directly localize scatterers in RF +channel data. Our approach involves a custom super-resolution DNN using learned +feature channel shuffling, non-maximum suppression, and a semi-global +convolutional block for reliable and accurate wavefront localization. +Additionally, we introduce a geometric point transformation that facilitates +seamless mapping to the B-mode coordinate space. To understand the impact of +beamforming on ULM, we validate the effectiveness of our method by conducting +an extensive comparison with State-Of-The-Art (SOTA) techniques. We present the +inaugural in vivo results from a wavefront-localizing DNN, highlighting its +real-world practicality. Our findings show that RF-ULM bridges the domain shift +between synthetic and real datasets, offering a considerable advantage in terms +of precision and complexity. To enable the broader research community to +benefit from our findings, our code and the associated SOTA methods are made +available at https://github.com/hahnec/rf-ulm. + +
+
+
+
+
+ + ♻ ☆ How Can Large Language Models Enable Better Socially Assistive + Human-Robot Interaction: A Brief Survey AAAI + + +
+ Socially assistive robots (SARs) have shown great success in providing +personalized cognitive-affective support for user populations with special +needs such as older adults, children with autism spectrum disorder (ASD), and +individuals with mental health challenges. The large body of work on SAR +demonstrates its potential to provide at-home support that complements +clinic-based interventions delivered by mental health professionals, making +these interventions more effective and accessible. However, there are still +several major technical challenges that hinder SAR-mediated interactions and +interventions from reaching human-level social intelligence and efficacy. With +the recent advances in large language models (LLMs), there is an increased +potential for novel applications within the field of SAR that can significantly +expand the current capabilities of SARs. However, incorporating LLMs introduces +new risks and ethical concerns that have not yet been encountered, and must be +carefully addressed to safely deploy these more advanced systems. In this +work, we aim to conduct a brief survey on the use of LLMs in SAR technologies, +and discuss the potentials and risks of applying LLMs to the following three +major technical challenges of SAR: 1) natural language dialog; 2) multimodal +understanding; 3) LLMs as robot policies. + +
+
+ comment: 2 pages, accepted to the Proceedings of the AAAI Symposium Series, + 2024 +
+
+
+
+
+ + ♻ ☆ K-band: Self-supervised MRI Reconstruction via Stochastic Gradient + Descent over K-space Subsets + + +
+ Although deep learning (DL) methods are powerful for solving inverse +problems, their reliance on high-quality training data is a major hurdle. This +is significant in high-dimensional (dynamic/volumetric) magnetic resonance +imaging (MRI), where acquisition of high-resolution fully sampled k-space data +is impractical. We introduce a novel mathematical framework, dubbed k-band, +that enables training DL models using only partial, limited-resolution k-space +data. Specifically, we introduce training with stochastic gradient descent +(SGD) over k-space subsets. In each training iteration, rather than using the +fully sampled k-space for computing gradients, we use only a small k-space +portion. This concept is compatible with different sampling strategies; here we +demonstrate the method for k-space "bands", which have limited resolution in +one dimension and can hence be acquired rapidly. We prove analytically that our +method stochastically approximates the gradients computed in a fully-supervised +setup, when two simple conditions are met: (i) the limited-resolution axis is +chosen randomly-uniformly for every new scan, hence k-space is fully covered +across the entire training set, and (ii) the loss function is weighed with a +mask, derived here analytically, which facilitates accurate reconstruction of +high-resolution details. Numerical experiments with raw MRI data indicate that +k-band outperforms two other methods trained on limited-resolution data and +performs comparably to state-of-the-art (SoTA) methods trained on +high-resolution data. k-band hence obtains SoTA performance, with the advantage +of training using only limited-resolution data. This work hence introduces a +practical, easy-to-implement, self-supervised training framework, which +involves fast acquisition and self-supervised reconstruction and offers +theoretical guarantees. + +
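+ A much-simplified sketch of training on limited-resolution k-space bands: each iteration keeps
+ only a band of k-space (orientation chosen at random per scan) and computes the loss there. The
+ uniform band mask below stands in for the paper's analytically derived weighting mask; all names
+ and the band fraction are assumptions:
+
+     import torch
+
+     def kspace_band_loss(pred_img, target_img, band_frac=0.25, horizontal=True):
+         k_pred = torch.fft.fftshift(torch.fft.fft2(pred_img), dim=(-2, -1))
+         k_tgt = torch.fft.fftshift(torch.fft.fft2(target_img), dim=(-2, -1))
+         H, W = pred_img.shape[-2:]
+         mask = torch.zeros(H, W, dtype=torch.bool)
+         half = int(H * band_frac / 2) if horizontal else int(W * band_frac / 2)
+         if horizontal:
+             mask[H // 2 - half: H // 2 + half, :] = True     # limited-resolution band along one axis
+         else:
+             mask[:, W // 2 - half: W // 2 + half] = True
+         return (k_pred - k_tgt).abs()[..., mask].mean()
+
+     x, y = torch.randn(1, 256, 256), torch.randn(1, 256, 256)
+     print(kspace_band_loss(x, y, horizontal=bool(torch.rand(()) < 0.5)))
+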
+
+
+
+
+ + ♻ ☆ Cross-Silo Federated Learning Across Divergent Domains with Iterative + Parameter Alignment + + +
+ Learning from the collective knowledge of data dispersed across private +sources can provide neural networks with enhanced generalization capabilities. +Federated learning, a method for collaboratively training a machine learning +model across remote clients, achieves this by combining client models via the +orchestration of a central server. However, current approaches face two +critical limitations: i) they struggle to converge when client domains are +sufficiently different, and ii) current aggregation techniques produce an +identical global model for each client. In this work, we address these issues +by reformulating the typical federated learning setup: rather than learning a +single global model, we learn N models each optimized for a common objective. +To achieve this, we apply a weighted distance minimization to model parameters +shared in a peer-to-peer topology. The resulting framework, Iterative Parameter +Alignment, applies naturally to the cross-silo setting, and has the following +properties: (i) a unique solution for each participant, with the option to +globally converge each model in the federation, and (ii) an optional +early-stopping mechanism to elicit fairness among peers in collaborative +learning settings. These characteristics jointly provide a flexible new +framework for iteratively learning from peer models trained on disparate +datasets. We find that the technique achieves competitive results on a variety +of data partitions compared to state-of-the-art approaches. Further, we show +that the method is robust to divergent domains (i.e. disjoint classes across +peers) where existing approaches struggle. + +
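+ A schematic of the weighted parameter-distance idea described above: each peer minimizes its own
+ task loss plus a weighted L2 distance to the other peers' parameters. Weights, scaling, and the
+ toy model are placeholders, not the paper's exact formulation:
+
+     import torch
+
+     def alignment_penalty(model: torch.nn.Module, peer_state_dicts, weights):
+         penalty = 0.0
+         for sd, w in zip(peer_state_dicts, weights):
+             for name, p in model.named_parameters():
+                 penalty = penalty + w * (p - sd[name].detach()).pow(2).sum()
+         return penalty
+
+     model = torch.nn.Linear(10, 2)
+     peers = [torch.nn.Linear(10, 2).state_dict() for _ in range(3)]
+     task_loss = model(torch.randn(4, 10)).pow(2).mean()
+     (task_loss + 1e-3 * alignment_penalty(model, peers, [0.5, 0.3, 0.2])).backward()
+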
+
+ comment: Published at IEEE Big Data 2023 +
+
+
+
+
+ + ♻ ☆ FairRAG: Fair Human Generation via Fair Retrieval Augmentation CVPR 2024 + + +
+ Existing text-to-image generative models reflect or even amplify societal +biases ingrained in their training data. This is especially concerning for +human image generation where models are biased against certain demographic +groups. Existing attempts to rectify this issue are hindered by the inherent +limitations of the pre-trained models and fail to substantially improve +demographic diversity. In this work, we introduce Fair Retrieval Augmented +Generation (FairRAG), a novel framework that conditions pre-trained generative +models on reference images retrieved from an external image database to improve +fairness in human generation. FairRAG enables conditioning through a +lightweight linear module that projects reference images into the textual +space. To enhance fairness, FairRAG applies simple-yet-effective debiasing +strategies, providing images from diverse demographic groups during the +generative process. Extensive experiments demonstrate that FairRAG outperforms +existing methods in terms of demographic diversity, image-text alignment, and +image fidelity while incurring minimal computational overhead during inference. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ FairCLIP: Harnessing Fairness in Vision-Language Learning CVPR 2024 + + +
+ Fairness is a critical concern in deep learning, especially in healthcare, +where these models influence diagnoses and treatment decisions. Although +fairness has been investigated in the vision-only domain, the fairness of +medical vision-language (VL) models remains unexplored due to the scarcity of +medical VL datasets for studying fairness. To bridge this research gap, we +introduce the first fair vision-language medical dataset Harvard-FairVLMed that +provides detailed demographic attributes, ground-truth labels, and clinical +notes to facilitate an in-depth examination of fairness within VL foundation +models. Using Harvard-FairVLMed, we conduct a comprehensive fairness analysis +of two widely-used VL models (CLIP and BLIP2), pre-trained on both natural and +medical domains, across four different protected attributes. Our results +highlight significant biases in all VL models, with Asian, Male, Non-Hispanic, +and Spanish being the preferred subgroups across the protected attributes of +race, gender, ethnicity, and language, respectively. In order to alleviate +these biases, we propose FairCLIP, an optimal-transport-based approach that +achieves a favorable trade-off between performance and fairness by reducing the +Sinkhorn distance between the overall sample distribution and the distributions +corresponding to each demographic group. As the first VL dataset of its kind, +Harvard-FairVLMed holds the potential to catalyze advancements in the +development of machine learning models that are both ethically aware and +clinically effective. Our dataset and code are available at +https://ophai.hms.harvard.edu/datasets/harvard-fairvlmed10k. + +
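+ A minimal, self-contained Sinkhorn distance between two empirical feature distributions (the
+ whole batch vs. one demographic subgroup), illustrating the kind of regularizer the abstract
+ describes; this generic sketch is not FairCLIP's code and the toy data are random:
+
+     import torch
+
+     def sinkhorn_distance(x, y, eps=0.1, iters=200):
+         # x: (n, d), y: (m, d) samples with uniform weights; cost normalized for stability.
+         C = torch.cdist(x, y) ** 2
+         C = C / C.max()
+         K = torch.exp(-C / eps)
+         a = torch.full((x.shape[0],), 1.0 / x.shape[0])
+         b = torch.full((y.shape[0],), 1.0 / y.shape[0])
+         u = torch.ones_like(a)
+         for _ in range(iters):
+             v = b / (K.t() @ u)
+             u = a / (K @ v)
+         P = torch.diag(u) @ K @ torch.diag(v)                # entropic transport plan
+         return (P * C).sum()
+
+     feats = torch.randn(64, 128)                             # features of the whole batch
+     group = feats[:16]                                       # features of one subgroup
+     print(sinkhorn_distance(feats, group))
+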
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ The devil is in the fine-grained details: Evaluating open-vocabulary + object detectors for fine-grained understanding CVPR2024 + + +
+ Recent advancements in large vision-language models enabled visual object +detection in open-vocabulary scenarios, where object classes are defined in +free-text formats during inference. In this paper, we aim to probe the +state-of-the-art methods for open-vocabulary object detection to determine to +what extent they understand fine-grained properties of objects and their parts. +To this end, we introduce an evaluation protocol based on dynamic vocabulary +generation to test whether models detect, discern, and assign the correct +fine-grained description to objects in the presence of hard-negative classes. +We contribute with a benchmark suite of increasing difficulty and probing +different properties like color, pattern, and material. We further enhance our +investigation by evaluating several state-of-the-art open-vocabulary object +detectors using the proposed protocol and find that most existing solutions, +which shine in standard open-vocabulary benchmarks, struggle to accurately +capture and distinguish finer object details. We conclude the paper by +highlighting the limitations of current methodologies and exploring promising +research directions to overcome the discovered drawbacks. Data and code are +available at https://lorebianchi98.github.io/FG-OVD/. + +
+
+ comment: Accepted as Highlight at CVPR2024 +
+
+
+
+
+ + ♻ ☆ On Pretraining Data Diversity for Self-Supervised Learning + + +
+ We explore the impact of training with more diverse datasets, characterized +by the number of unique samples, on the performance of self-supervised learning +(SSL) under a fixed computational budget. Our findings consistently demonstrate +that increasing pretraining data diversity enhances SSL performance, albeit +only when the distribution distance to the downstream data is minimal. Notably, +even with an exceptionally large pretraining data diversity achieved through +methods like web crawling or diffusion-generated data, among other ways, the +distribution shift remains a challenge. Our experiments are comprehensive with +seven SSL methods using large-scale datasets such as ImageNet and YFCC100M +amounting to over 200 GPU days. Code and trained models will be available at +https://github.com/hammoudhasan/DiversitySSL . + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Object Detectors in the Open Environment: Challenges, Solutions, and + Outlook + + +
+ With the emergence of foundation models, deep learning-based object detectors +have shown practical usability in closed set scenarios. However, for real-world +tasks, object detectors often operate in open environments, where crucial +factors (e.g., data distribution, objective) that influence model learning are +often changing. The dynamic and intricate nature of the open environment poses +novel and formidable challenges to object detectors. Unfortunately, current +research on object detectors in open environments lacks a comprehensive +analysis of their distinctive characteristics, challenges, and corresponding +solutions, which hinders their secure deployment in critical real-world +scenarios. This paper aims to bridge this gap by conducting a comprehensive +review and analysis of object detectors in open environments. We initially +identified limitations of key structural components within the existing +detection pipeline and propose the open environment object detector challenge +framework that includes four quadrants (i.e., out-of-domain, out-of-category, +robust learning, and incremental learning) based on the dimensions of the data +/ target changes. For each quadrant of challenges in the proposed framework, we +present a detailed description and systematic analysis of the overarching goals +and core difficulties, systematically review the corresponding solutions, and +benchmark their performance over multiple widely adopted datasets. In addition, +we engage in a discussion of open problems and potential avenues for future +research. This paper aims to provide a fresh, comprehensive, and systematic +understanding of the challenges and solutions associated with open-environment +object detectors, thus catalyzing the development of more solid applications in +real-world scenarios. A project related to this survey can be found at +https://github.com/LiangSiyuan21/OEOD_Survey. + +
+
+ comment: 37 pages, 17 figures +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 142 + +
+
+
+ + ☆ Know Your Neighbors: Improving Single-View Reconstruction via Spatial + Vision-Language Reasoning CVPR 2024 + + +
+ Recovering the 3D scene geometry from a single view is a fundamental yet +ill-posed problem in computer vision. While classical depth estimation methods +infer only a 2.5D scene representation limited to the image plane, recent +approaches based on radiance fields reconstruct a full 3D representation. +However, these methods still struggle with occluded regions since inferring +geometry without visual observation requires (i) semantic knowledge of the +surroundings, and (ii) reasoning about spatial context. We propose KYN, a novel +method for single-view scene reconstruction that reasons about semantic and +spatial context to predict each point's density. We introduce a vision-language +modulation module to enrich point features with fine-grained semantic +information. We aggregate point representations across the scene through a +language-guided spatial attention mechanism to yield per-point density +predictions aware of the 3D semantic context. We show that KYN improves 3D +shape recovery compared to predicting density for each 3D point in isolation. +We achieve state-of-the-art results in scene and object reconstruction on +KITTI-360, and show improved zero-shot generalization compared to prior work. +Project page: https://ruili3.github.io/kyn. + +
+
+ comment: CVPR 2024. Project page: https://ruili3.github.io/kyn +
+
+
+
+
+ + ☆ OW-VISCap: Open-World Video Instance Segmentation and Captioning SC + + +
+ Open-world video instance segmentation is an important video understanding
+task. Yet most methods either operate in a closed-world setting, require
+additional user input, or use classic region-based proposals to identify never
+before seen objects. Further, these methods only assign a one-word label to
+detected objects, and don't generate rich object-centric descriptions. They
+also often suffer from highly overlapping predictions. To address these issues,
+we propose Open-World Video Instance Segmentation and Captioning (OW-VISCap),
+an approach to jointly segment, track, and caption previously seen or unseen
+objects in a video. For this, we introduce open-world object queries to
+discover never before seen objects without additional user input. We generate
+rich and descriptive object-centric captions for each detected object via a
+masked attention augmented LLM input. We introduce an inter-query contrastive
+loss to ensure that the object queries differ from one another. Our generalized
+approach matches or surpasses state-of-the-art on three tasks: open-world video
+instance segmentation on the BURST dataset, dense video object captioning on
+the VidSTG dataset, and closed-world video instance segmentation on the OVIS
+dataset.
+
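+
+ The inter-query contrastive loss above is only described at a high level; a
+common way to realize the underlying goal of keeping object queries distinct is
+to penalize pairwise similarity between query embeddings. The hedged sketch
+below shows that generic form and is not the exact OW-VISCap formulation.
+
+import torch
+import torch.nn.functional as F
+
+def inter_query_repulsion(queries: torch.Tensor) -> torch.Tensor:
+    """queries: (num_queries, dim); returns mean positive off-diagonal cosine similarity."""
+    q = F.normalize(queries, dim=-1)            # unit-normalize each query
+    sim = q @ q.t()                             # pairwise cosine similarities
+    off_diag = sim - torch.eye(sim.size(0), device=sim.device)
+    return off_diag.clamp(min=0).mean()         # penalize similar query pairs
+
+queries = torch.randn(8, 256, requires_grad=True)
+loss = inter_query_repulsion(queries)
+loss.backward()                                 # gradients push queries apart
+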
+
+ comment: Project page: https://anwesachoudhuri.github.io/OpenWorldVISCap/ +
+
+
+
+
+ + ☆ MVD-Fusion: Single-view 3D via Depth-consistent Multi-view Generation + + +
+ We present MVD-Fusion: a method for single-view 3D inference via generative
+modeling of multi-view-consistent RGB-D images. While recent methods pursuing
+3D inference advocate learning novel-view generative models, these generations
+are not 3D-consistent and require a distillation process to generate a 3D
+output. We instead cast the task of 3D inference as directly generating
+mutually-consistent multiple views and build on the insight that additionally
+inferring depth can provide a mechanism for enforcing this consistency.
+Specifically, we train a denoising diffusion model to generate multi-view RGB-D
+images given a single RGB input image and leverage the (intermediate noisy)
+depth estimates to obtain reprojection-based conditioning to maintain
+multi-view consistency. We train our model using the large-scale synthetic
+dataset Objaverse as well as the real-world CO3D dataset comprising generic
+camera viewpoints. We demonstrate that our approach can yield more accurate
+synthesis compared to recent state-of-the-art, including distillation-based 3D
+inference and prior multi-view generation methods. We also evaluate the
+geometry induced by our multi-view depth prediction and find that it yields a
+more accurate representation than other direct 3D inference approaches.
+
+
+ comment: Project page: https://mvd-fusion.github.io/ +
+
+
+
+
+ + ☆ RaFE: Generative Radiance Fields Restoration + + +
+ NeRF (Neural Radiance Fields) has demonstrated tremendous potential in novel
+view synthesis and 3D reconstruction, but its performance is sensitive to input
+image quality, and it struggles to achieve high-fidelity rendering when
+provided with low-quality, sparse input viewpoints. Previous methods for NeRF
+restoration are tailored to a specific degradation type, ignoring the
+generality of restoration. To overcome this limitation, we propose a generic
+radiance fields restoration pipeline, named RaFE, which applies to various
+types of degradations, such as low resolution, blurriness, noise, compression
+artifacts, or their combinations. Our approach leverages the success of
+off-the-shelf 2D restoration methods to recover the multi-view images
+individually. Instead of reconstructing a blurred NeRF by averaging
+inconsistencies, we introduce a novel approach using Generative Adversarial
+Networks (GANs) for NeRF generation to better accommodate the geometric and
+appearance inconsistencies present in the multi-view images. Specifically, we
+adopt a two-level tri-plane architecture, where the coarse level remains fixed
+to represent the low-quality NeRF, and a fine-level residual tri-plane to be
+added to the coarse level is modeled as a distribution with a GAN to capture
+potential variations in restoration. We validate RaFE on both synthetic and
+real cases for various restoration tasks, demonstrating superior performance in
+both quantitative and qualitative evaluations, surpassing other 3D restoration
+methods specific to a single task. Please see our project website
+https://zkaiwu.github.io/RaFE-Project/.
+
+
+ comment: Project Page: https://zkaiwu.github.io/RaFE-Project/ +
+
+
+
+
+ + ☆ CoMat: Aligning Text-to-Image Diffusion Model with Image-to-Text Concept + Matching + + +
+ Diffusion models have demonstrated great success in the field of
+text-to-image generation. However, alleviating the misalignment between the
+text prompts and images is still challenging. The root reason behind the
+misalignment has not been extensively investigated. We observe that the
+misalignment is caused by inadequate token attention activation. We further
+attribute this phenomenon to the diffusion model's insufficient condition
+utilization, which is caused by its training paradigm. To address the issue, we
+propose CoMat, an end-to-end diffusion model fine-tuning strategy with an
+image-to-text concept matching mechanism. We leverage an image captioning model
+to measure image-to-text alignment and guide the diffusion model to revisit
+ignored tokens. A novel attribute concentration module is also proposed to
+address the attribute binding problem. Without any image or human preference
+data, we use only 20K text prompts to fine-tune SDXL to obtain CoMat-SDXL.
+Extensive experiments show that CoMat-SDXL significantly outperforms the
+baseline model SDXL in two text-to-image alignment benchmarks and achieves
+state-of-the-art performance.
+
+
+ comment: Project Page: https://caraj7.github.io/comat +
+
+
+
+
+ + ☆ The More You See in 2D, the More You Perceive in 3D + + +
+ Humans can infer 3D structure from 2D images of an object based on past
+experience and improve their 3D understanding as they see more images. Inspired
+by this behavior, we introduce SAP3D, a system for 3D reconstruction and novel
+view synthesis from an arbitrary number of unposed images. Given a few unposed
+images of an object, we adapt a pre-trained view-conditioned diffusion model
+together with the camera poses of the images via test-time fine-tuning. The
+adapted diffusion model and the obtained camera poses are then utilized as
+instance-specific priors for 3D reconstruction and novel view synthesis. We
+show that as the number of input images increases, the performance of our
+approach improves, bridging the gap between optimization-based prior-less 3D
+reconstruction methods and single-image-to-3D diffusion-based methods. We
+demonstrate our system on real images as well as standard synthetic benchmarks.
+Our ablation studies confirm that this adaptation behavior is key for more
+accurate 3D understanding.
+
+
+ comment: Project page: https://sap3d.github.io/ +
+
+
+
+
+ + ☆ OpenNeRF: Open Set 3D Neural Scene Segmentation with Pixel-Wise Features + and Rendered Novel Views ICLR 2024 + + +
+ Large visual-language models (VLMs), like CLIP, enable open-set image +segmentation to segment arbitrary concepts from an image in a zero-shot manner. +This goes beyond the traditional closed-set assumption, i.e., where models can +only segment classes from a pre-defined training set. More recently, first +works on open-set segmentation in 3D scenes have appeared in the literature. +These methods are heavily influenced by closed-set 3D convolutional approaches +that process point clouds or polygon meshes. However, these 3D scene +representations do not align well with the image-based nature of the +visual-language models. Indeed, point cloud and 3D meshes typically have a +lower resolution than images and the reconstructed 3D scene geometry might not +project well to the underlying 2D image sequences used to compute pixel-aligned +CLIP features. To address these challenges, we propose OpenNeRF which naturally +operates on posed images and directly encodes the VLM features within the NeRF. +This is similar in spirit to LERF, however our work shows that using pixel-wise +VLM features (instead of global CLIP features) results in an overall less +complex architecture without the need for additional DINO regularization. Our +OpenNeRF further leverages NeRF's ability to render novel views and extract +open-set VLM features from areas that are not well observed in the initial +posed images. For 3D point cloud segmentation on the Replica dataset, OpenNeRF +outperforms recent open-vocabulary methods such as LERF and OpenScene by at +least +4.9 mIoU. + +
+
+ comment: ICLR 2024, Project page: https://opennerf.github.io +
+
+
+
+
+ + ☆ Decoupling Static and Hierarchical Motion Perception for Referring Video + Segmentation CVPR 2024 + + +
+ Referring video segmentation relies on natural language expressions to +identify and segment objects, often emphasizing motion clues. Previous works +treat a sentence as a whole and directly perform identification at the +video-level, mixing up static image-level cues with temporal motion cues. +However, image-level features cannot well comprehend motion cues in sentences, +and static cues are not crucial for temporal perception. In fact, static cues +can sometimes interfere with temporal perception by overshadowing motion cues. +In this work, we propose to decouple video-level referring expression +understanding into static and motion perception, with a specific emphasis on +enhancing temporal comprehension. Firstly, we introduce an +expression-decoupling module to make static cues and motion cues perform their +distinct role, alleviating the issue of sentence embeddings overlooking motion +cues. Secondly, we propose a hierarchical motion perception module to capture +temporal information effectively across varying timescales. Furthermore, we +employ contrastive learning to distinguish the motions of visually similar +objects. These contributions yield state-of-the-art performance across five +datasets, including a remarkable $\textbf{9.2%}$ $\mathcal{J\&F}$ improvement +on the challenging $\textbf{MeViS}$ dataset. Code is available at +https://github.com/heshuting555/DsHmp. + +
+
+ comment: CVPR 2024, code: https://github.com/heshuting555/DsHmp +
+
+
+
+
+ + ☆ DiffBody: Human Body Restoration by Imagining with Generative Diffusion + Prior + + +
+ Human body restoration plays a vital role in various applications related to +the human body. Despite recent advances in general image restoration using +generative models, their performance in human body restoration remains +mediocre, often resulting in foreground and background blending, over-smoothing +surface textures, missing accessories, and distorted limbs. Addressing these +challenges, we propose a novel approach by constructing a human body-aware +diffusion model that leverages domain-specific knowledge to enhance +performance. Specifically, we employ a pretrained body attention module to +guide the diffusion model's focus on the foreground, addressing issues caused +by blending between the subject and background. We also demonstrate the value +of revisiting the language modality of the diffusion model in restoration tasks +by seamlessly incorporating text prompt to improve the quality of surface +texture and additional clothing and accessories details. Additionally, we +introduce a diffusion sampler tailored for fine-grained human body parts, +utilizing local semantic information to rectify limb distortions. Lastly, we +collect a comprehensive dataset for benchmarking and advancing the field of +human body restoration. Extensive experimental validation showcases the +superiority of our approach, both quantitatively and qualitatively, over +existing methods. + +
+
+
+
+
+ + ☆ WorDepth: Variational Language Prior for Monocular Depth Estimation + + +
+ Three-dimensional (3D) reconstruction from a single image is an ill-posed +problem with inherent ambiguities, i.e. scale. Predicting a 3D scene from text +description(s) is similarly ill-posed, i.e. spatial arrangements of objects +described. We investigate the question of whether two inherently ambiguous +modalities can be used in conjunction to produce metric-scaled reconstructions. +To test this, we focus on monocular depth estimation, the problem of predicting +a dense depth map from a single image, but with an additional text caption +describing the scene. To this end, we begin by encoding the text caption as a +mean and standard deviation; using a variational framework, we learn the +distribution of the plausible metric reconstructions of 3D scenes corresponding +to the text captions as a prior. To "select" a specific reconstruction or depth +map, we encode the given image through a conditional sampler that samples from +the latent space of the variational text encoder, which is then decoded to the +output depth map. Our approach is trained alternatingly between the text and +image branches: in one optimization step, we predict the mean and standard +deviation from the text description and sample from a standard Gaussian, and in +the other, we sample using a (image) conditional sampler. Once trained, we +directly predict depth from the encoded text using the conditional sampler. We +demonstrate our approach on indoor (NYUv2) and outdoor (KITTI) scenarios, where +we show that language can consistently improve performance in both. + +
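+
+ A hedged sketch of the variational step described above: the caption
+embedding is mapped to a mean and standard deviation, and a latent is drawn
+with the reparameterization trick before being decoded into a depth map.
+Module names, dimensions, and the downstream decoder are placeholders, not the
+WorDepth architecture.
+
+import torch
+import torch.nn as nn
+
+class TextPrior(nn.Module):
+    def __init__(self, text_dim=512, latent_dim=128):
+        super().__init__()
+        self.mu = nn.Linear(text_dim, latent_dim)         # predicts the mean
+        self.log_sigma = nn.Linear(text_dim, latent_dim)  # predicts log std
+
+    def forward(self, text_feat):
+        mu = self.mu(text_feat)
+        sigma = self.log_sigma(text_feat).exp()
+        z = mu + sigma * torch.randn_like(sigma)          # reparameterization
+        return z, mu, sigma
+
+prior = TextPrior()
+caption_feat = torch.randn(4, 512)     # stand-in caption embeddings
+z, mu, sigma = prior(caption_feat)     # latent that a depth decoder would consume
+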
+
+
+
+
+ + ☆ PreAfford: Universal Affordance-Based Pre-Grasping for Diverse Objects + and Environments + + +
+ Robotic manipulation of ungraspable objects with two-finger grippers presents +significant challenges due to the paucity of graspable features, while +traditional pre-grasping techniques, which rely on repositioning objects and +leveraging external aids like table edges, lack the adaptability across object +categories and scenes. Addressing this, we introduce PreAfford, a novel +pre-grasping planning framework that utilizes a point-level affordance +representation and a relay training approach to enhance adaptability across a +broad range of environments and object types, including those previously +unseen. Demonstrated on the ShapeNet-v2 dataset, PreAfford significantly +improves grasping success rates by 69% and validates its practicality through +real-world experiments. This work offers a robust and adaptable solution for +manipulating ungraspable objects. + +
+
+ comment: Project Page: https://air-discover.github.io/PreAfford/ +
+
+
+
+
+ + ☆ Reference-Based 3D-Aware Image Editing with Triplane + + +
+ Generative Adversarial Networks (GANs) have emerged as powerful tools not +only for high-quality image generation but also for real image editing through +manipulation of their interpretable latent spaces. Recent advancements in GANs +include the development of 3D-aware models such as EG3D, characterized by +efficient triplane-based architectures enabling the reconstruction of 3D +geometry from single images. However, scant attention has been devoted to +providing an integrated framework for high-quality reference-based 3D-aware +image editing within this domain. This study addresses this gap by exploring +and demonstrating the effectiveness of EG3D's triplane space for achieving +advanced reference-based edits, presenting a unique perspective on 3D-aware +image editing through our novel pipeline. Our approach integrates the encoding +of triplane features, spatial disentanglement and automatic localization of +features in the triplane domain, and fusion learning for desired image editing. +Moreover, our framework demonstrates versatility across domains, extending its +effectiveness to animal face edits and partial stylization of cartoon +portraits. The method shows significant improvements over relevant 3D-aware +latent editing and 2D reference-based editing methods, both qualitatively and +quantitatively. Project page: https://three-bee.github.io/triplane_edit + +
+
+
+
+
+ + ☆ Robust Concept Erasure Using Task Vectors + + +
+ With the rapid growth of text-to-image models, a variety of techniques have +been suggested to prevent undesirable image generations. Yet, these methods +often only protect against specific user prompts and have been shown to allow +unsafe generations with other inputs. Here we focus on unconditionally erasing +a concept from a text-to-image model rather than conditioning the erasure on +the user's prompt. We first show that compared to input-dependent erasure +methods, concept erasure that uses Task Vectors (TV) is more robust to +unexpected user inputs, not seen during training. However, TV-based erasure can +also affect the core performance of the edited model, particularly when the +required edit strength is unknown. To this end, we propose a method called +Diverse Inversion, which we use to estimate the required strength of the TV +edit. Diverse Inversion finds within the model input space a large set of word +embeddings, each of which induces the generation of the target concept. We find +that encouraging diversity in the set makes our estimation more robust to +unexpected prompts. Finally, we show that Diverse Inversion enables us to apply +a TV edit only to a subset of the model weights, enhancing the erasure +capabilities while better maintaining the core functionality of the model. + +
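+
+ For intuition, the snippet below shows the standard task-vector arithmetic
+that this style of erasure builds on: subtracting a scaled difference between
+concept-fine-tuned weights and the original weights. The edit strength alpha,
+which the paper estimates with Diverse Inversion, is a fixed placeholder here,
+and the parameter names are hypothetical.
+
+import torch
+
+def erase_concept(base_state, finetuned_state, alpha=1.0, keys=None):
+    """Return theta_base - alpha * (theta_ft - theta_base); `keys` optionally
+    restricts the edit to a subset of parameters."""
+    edited = {}
+    for name, w in base_state.items():
+        if keys is not None and name not in keys:
+            edited[name] = w.clone()
+            continue
+        task_vec = finetuned_state[name] - w    # task vector for this tensor
+        edited[name] = w - alpha * task_vec     # negate the concept direction
+    return edited
+
+base = {"proj.weight": torch.randn(4, 4)}
+finetuned = {"proj.weight": base["proj.weight"] + 0.1 * torch.randn(4, 4)}
+edited = erase_concept(base, finetuned, alpha=0.8)
+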
+
+
+
+
+ + ☆ LCM-Lookahead for Encoder-based Text-to-Image Personalization + + +
+ Recent advancements in diffusion models have introduced fast sampling methods +that can effectively produce high-quality images in just one or a few denoising +steps. Interestingly, when these are distilled from existing diffusion models, +they often maintain alignment with the original model, retaining similar +outputs for similar prompts and seeds. These properties present opportunities +to leverage fast sampling methods as a shortcut-mechanism, using them to create +a preview of denoised outputs through which we can backpropagate image-space +losses. In this work, we explore the potential of using such +shortcut-mechanisms to guide the personalization of text-to-image models to +specific facial identities. We focus on encoder-based personalization +approaches, and demonstrate that by tuning them with a lookahead identity loss, +we can achieve higher identity fidelity, without sacrificing layout diversity +or prompt alignment. We further explore the use of attention sharing mechanisms +and consistent data generation for the task of personalization, and find that +encoder training can benefit from both. + +
+
+ comment: Project page at https://lcm-lookahead.github.io/ +
+
+
+
+
+ + ☆ DeViDe: Faceted medical knowledge for improved medical vision-language + pre-training + + +
+ Vision-language pre-training for chest X-rays has made significant strides, +primarily by utilizing paired radiographs and radiology reports. However, +existing approaches often face challenges in encoding medical knowledge +effectively. While radiology reports provide insights into the current disease +manifestation, medical definitions (as used by contemporary methods) tend to be +overly abstract, creating a gap in knowledge. To address this, we propose +DeViDe, a novel transformer-based method that leverages radiographic +descriptions from the open web. These descriptions outline general visual +characteristics of diseases in radiographs, and when combined with abstract +definitions and radiology reports, provide a holistic snapshot of knowledge. +DeViDe incorporates three key features for knowledge-augmented vision language +alignment: First, a large-language model-based augmentation is employed to +homogenise medical knowledge from diverse sources. Second, this knowledge is +aligned with image information at various levels of granularity. Third, a novel +projection layer is proposed to handle the complexity of aligning each image +with multiple descriptions arising in a multi-label setting. In zero-shot +settings, DeViDe performs comparably to fully supervised models on external +datasets and achieves state-of-the-art results on three large-scale datasets. +Additionally, fine-tuning DeViDe on four downstream tasks and six segmentation +tasks showcases its superior performance across data from diverse +distributions. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2208.04060 by other authors +
+
+
+
+
+ + ☆ On the Efficiency of Convolutional Neural Networks + + +
+ Since the breakthrough performance of AlexNet in 2012, convolutional neural +networks (convnets) have grown into extremely powerful vision models. Deep +learning researchers have used convnets to produce accurate results that were +unachievable a decade ago. Yet computer scientists make computational +efficiency their primary objective. Accuracy with exorbitant cost is not +acceptable; an algorithm must also minimize its computational requirements. +Confronted with the daunting computation that convnets use, deep learning +researchers also became interested in efficiency. Researchers applied +tremendous effort to find the convnet architectures that have the greatest +efficiency. However, skepticism grew among researchers and engineers alike +about the relevance of arithmetic complexity. Contrary to the prevailing view +that latency and arithmetic complexity are irreconcilable, a simple formula +relates both through computational efficiency. This insight enabled us to +co-optimize the separate factors that determine latency. We observed that the +degenerate conv2d layers that produce the best accuracy-complexity trade-off +also have low operational intensity. Therefore, kernels that implement these +layers use significant memory resources. We solved this optimization problem +with block-fusion kernels that implement all layers of a residual block, +thereby creating temporal locality, avoiding communication, and reducing +workspace size. Our ConvFirst model with block-fusion kernels ran approximately +four times as fast as the ConvNeXt baseline with PyTorch Inductor, at equal +accuracy on the ImageNet-1K classification task. Our unified approach to +convnet efficiency envisions a new era of models and kernels that achieve +greater accuracy at lower cost. + +
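+
+ The relation alluded to above can be made concrete with a roofline-style
+estimate: latency follows from arithmetic complexity divided by achieved
+throughput, and achieved throughput is capped by either peak compute or memory
+bandwidth times operational intensity. The hardware numbers below are purely
+illustrative and are not taken from the paper.
+
+def estimate_latency(flops, bytes_moved, peak_flops, peak_bw):
+    intensity = flops / bytes_moved                  # FLOPs per byte
+    achieved = min(peak_flops, peak_bw * intensity)  # roofline cap
+    efficiency = achieved / peak_flops               # computational efficiency
+    return flops / achieved, intensity, efficiency
+
+# a hypothetical low-intensity conv layer on a hypothetical accelerator
+latency, oi, eff = estimate_latency(
+    flops=2e9, bytes_moved=8e7, peak_flops=1e14, peak_bw=1e12)
+print(f"latency={latency * 1e3:.3f} ms, intensity={oi:.1f} FLOPs/byte, "
+      f"efficiency={eff:.0%}")
+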
+
+
+
+
+ + ☆ Per-Gaussian Embedding-Based Deformation for Deformable 3D Gaussian + Splatting + + +
+ As 3D Gaussian Splatting (3DGS) provides fast and high-quality novel view
+synthesis, it is a natural extension to deform a canonical 3DGS to multiple
+frames. However, previous works fail to accurately reconstruct dynamic scenes,
+especially when 1) static parts move along with nearby dynamic parts, and 2)
+some dynamic areas appear blurry. We attribute the failure to the wrong design
+of the deformation field, which is built as a coordinate-based function. This
+approach is problematic because 3DGS is a mixture of multiple fields centered
+at the Gaussians, not just a single coordinate-based framework. To resolve this
+problem, we define the deformation as a function of per-Gaussian embeddings and
+temporal embeddings. Moreover, we decompose deformations into coarse and fine
+deformations to model slow and fast movements, respectively. Also, we introduce
+an efficient training strategy for faster convergence and higher quality.
+Project page: https://jeongminb.github.io/e-d3dgs/
+
+
+ comment: Preprint +
+
+
+
+
+ + ☆ InsectMamba: Insect Pest Classification with State Space Model + + +
+ The classification of insect pests is a critical task in agricultural +technology, vital for ensuring food security and environmental sustainability. +However, the complexity of pest identification, due to factors like high +camouflage and species diversity, poses significant obstacles. Existing methods +struggle with the fine-grained feature extraction needed to distinguish between +closely related pest species. Although recent advancements have utilized +modified network structures and combined deep learning approaches to improve +accuracy, challenges persist due to the similarity between pests and their +surroundings. To address this problem, we introduce InsectMamba, a novel +approach that integrates State Space Models (SSMs), Convolutional Neural +Networks (CNNs), Multi-Head Self-Attention mechanism (MSA), and Multilayer +Perceptrons (MLPs) within Mix-SSM blocks. This integration facilitates the +extraction of comprehensive visual features by leveraging the strengths of each +encoding strategy. A selective module is also proposed to adaptively aggregate +these features, enhancing the model's ability to discern pest characteristics. +InsectMamba was evaluated against strong competitors across five insect pest +classification datasets. The results demonstrate its superior performance and +verify the significance of each model component by an ablation study. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ SemGrasp: Semantic Grasp Generation via Language Aligned Discretization + + +
+ Generating natural human grasps necessitates consideration of not just object +geometry but also semantic information. Solely depending on object shape for +grasp generation confines the applications of prior methods in downstream +tasks. This paper presents a novel semantic-based grasp generation method, +termed SemGrasp, which generates a static human grasp pose by incorporating +semantic information into the grasp representation. We introduce a discrete +representation that aligns the grasp space with semantic space, enabling the +generation of grasp postures in accordance with language instructions. A +Multimodal Large Language Model (MLLM) is subsequently fine-tuned, integrating +object, grasp, and language within a unified semantic space. To facilitate the +training of SemGrasp, we have compiled a large-scale, grasp-text-aligned +dataset named CapGrasp, featuring about 260k detailed captions and 50k diverse +grasps. Experimental findings demonstrate that SemGrasp efficiently generates +natural human grasps in alignment with linguistic intentions. Our code, models, +and dataset are available publicly at: https://kailinli.github.io/SemGrasp. + +
+
+
+
+
+ + ☆ Towards more realistic human motion prediction with attention to motion + coordination + + +
+ Joint relation modeling is a crucial component in human motion prediction.
+Most existing methods rely on skeletal-based graphs to build the joint
+relations, where local interactive relations between joint pairs are well
+learned. However, the motion coordination, a global joint relation reflecting
+the simultaneous cooperation of all joints, is usually weakened because it is
+learned from part to whole progressively and asynchronously. Thus, the final
+predicted motions usually appear unrealistic. To tackle this issue, we learn a
+medium, called coordination attractor (CA), from the spatiotemporal features of
+motion to characterize the global motion features, which is subsequently used
+to build new relative joint relations. Through the CA, all joints are related
+simultaneously, and thus the motion coordination of all joints can be better
+learned. Based on this, we further propose a novel joint relation modeling
+module, the Comprehensive Joint Relation Extractor (CJRE), to combine this
+motion coordination with the local interactions between joint pairs in a
+unified manner. Additionally, we also present a Multi-timescale Dynamics
+Extractor (MTDE) to extract enriched dynamics from the raw position information
+for effective prediction. Extensive experiments show that the proposed
+framework outperforms state-of-the-art methods in both short- and long-term
+predictions on H3.6M, CMU-Mocap, and 3DPW.
+
+
+ comment: Accepted by TCSVT +
+
+
+
+
+ + ☆ DreamScene: 3D Gaussian-based Text-to-3D Scene Generation via Formation + Pattern Sampling + + +
+ Text-to-3D scene generation holds immense potential for the gaming, film, and +architecture sectors. Despite significant progress, existing methods struggle +with maintaining high quality, consistency, and editing flexibility. In this +paper, we propose DreamScene, a 3D Gaussian-based novel text-to-3D scene +generation framework, to tackle the aforementioned three challenges mainly via +two strategies. First, DreamScene employs Formation Pattern Sampling (FPS), a +multi-timestep sampling strategy guided by the formation patterns of 3D +objects, to form fast, semantically rich, and high-quality representations. FPS +uses 3D Gaussian filtering for optimization stability, and leverages +reconstruction techniques to generate plausible textures. Second, DreamScene +employs a progressive three-stage camera sampling strategy, specifically +designed for both indoor and outdoor settings, to effectively ensure +object-environment integration and scene-wide 3D consistency. Last, DreamScene +enhances scene editing flexibility by integrating objects and environments, +enabling targeted adjustments. Extensive experiments validate DreamScene's +superiority over current state-of-the-art techniques, heralding its +wide-ranging potential for diverse applications. Code and demos will be +released at https://dreamscene-project.github.io . + +
+
+
+
+
+ + ☆ TinyVQA: Compact Multimodal Deep Neural Network for Visual Question + Answering on Resource-Constrained Devices + + +
+ Traditional machine learning models often require powerful hardware, making
+them unsuitable for deployment on resource-limited devices. Tiny Machine
+Learning (tinyML) has emerged as a promising approach for running machine
+learning models on these devices, but integrating multiple data modalities into
+tinyML models still remains a challenge due to increased complexity, latency,
+and power consumption. This paper proposes TinyVQA, a novel multimodal deep
+neural network for visual question answering tasks that can be deployed on
+resource-constrained tinyML hardware. TinyVQA leverages a supervised
+attention-based model to learn how to answer questions about images using both
+vision and language modalities. Knowledge distilled from the supervised
+attention-based VQA model is used to train the memory-aware, compact TinyVQA
+model, and a low bit-width quantization technique is employed to further
+compress the model for deployment on tinyML devices. The TinyVQA model was
+evaluated on the FloodNet dataset, which is used for post-disaster damage
+assessment. The compact model achieved an accuracy of 79.5%, demonstrating the
+effectiveness of TinyVQA for real-world applications. Additionally, the model
+was deployed on a Crazyflie 2.0 drone, equipped with an AI deck and GAP8
+microprocessor. The TinyVQA model achieved a low latency of 56 ms and consumed
+693 mW of power while deployed on the tiny drone, showcasing its suitability
+for resource-constrained embedded systems.
+
+
+ comment: Accepted as a full paper by the tinyML Research Symposium 2024 +
+
+
+
+
+ + ☆ Terrain Point Cloud Inpainting via Signal Decomposition + + +
+ The rapid development of 3D acquisition technology has made it possible to +obtain point clouds of real-world terrains. However, due to limitations in +sensor acquisition technology or specific requirements, point clouds often +contain defects such as holes with missing data. Inpainting algorithms are +widely used to patch these holes. However, existing traditional inpainting +algorithms rely on precise hole boundaries, which limits their ability to +handle cases where the boundaries are not well-defined. On the other hand, +learning-based completion methods often prioritize reconstructing the entire +point cloud instead of solely focusing on hole filling. Based on the fact that +real-world terrain exhibits both global smoothness and rich local detail, we +propose a novel representation for terrain point clouds. This representation +can help to repair the holes without clear boundaries. Specifically, it +decomposes terrains into low-frequency and high-frequency components, which are +represented by B-spline surfaces and relative height maps respectively. In this +way, the terrain point cloud inpainting problem is transformed into a B-spline +surface fitting and 2D image inpainting problem. By solving the two problems, +the highly complex and irregular holes on the terrain point clouds can be +well-filled, which not only satisfies the global terrain undulation but also +exhibits rich geometric details. The experimental results also demonstrate the +effectiveness of our method. + +
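+
+ A simplified sketch of the decomposition idea above: split a terrain height
+map into a smooth low-frequency trend and a high-frequency residual, then
+repair the hole so that it follows the trend. Gaussian smoothing and a
+nearest-neighbor fill stand in for the paper's B-spline surface fitting and 2D
+image inpainting, so this is an assumption-laden toy version.
+
+import numpy as np
+from scipy.ndimage import distance_transform_edt, gaussian_filter
+
+def decompose_and_fill(height, hole_mask, sigma=15.0):
+    # nearest-valid-pixel fill so the smoothing step is not biased by the hole
+    idx = distance_transform_edt(hole_mask, return_distances=False,
+                                 return_indices=True)
+    nn_fill = height[tuple(idx)]
+    low = gaussian_filter(nn_fill, sigma)   # smooth, B-spline-like trend
+    high = height - low                     # high-frequency detail layer
+    # the paper inpaints the detail layer as a 2D image; here the hole detail
+    # is simply zeroed, so the repaired hole follows the smooth trend
+    high = np.where(hole_mask, 0.0, high)
+    return np.where(hole_mask, low + high, height), low, high
+
+terrain = np.random.rand(128, 128).cumsum(0).cumsum(1)  # toy terrain surface
+mask = np.zeros_like(terrain, dtype=bool)
+mask[40:60, 40:60] = True                                # synthetic hole
+repaired, low, high = decompose_and_fill(terrain, mask)
+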
+
+
+
+
+ + ☆ PointInfinity: Resolution-Invariant Point Diffusion Models CVPR 2024 + + +
+ We present PointInfinity, an efficient family of point cloud diffusion +models. Our core idea is to use a transformer-based architecture with a +fixed-size, resolution-invariant latent representation. This enables efficient +training with low-resolution point clouds, while allowing high-resolution point +clouds to be generated during inference. More importantly, we show that scaling +the test-time resolution beyond the training resolution improves the fidelity +of generated point clouds and surfaces. We analyze this phenomenon and draw a +link to classifier-free guidance commonly used in diffusion models, +demonstrating that both allow trading off fidelity and variability during +inference. Experiments on CO3D show that PointInfinity can efficiently generate +high-resolution point clouds (up to 131k points, 31 times more than Point-E) +with state-of-the-art quality. + +
+
+ comment: Accepted to CVPR 2024, project website at + https://zixuanh.com/projects/pointinfinity +
+
+
+
+
+ + ☆ Segmentation-Guided Knee Radiograph Generation using Conditional + Diffusion Models + + +
+ Deep learning-based medical image processing algorithms require +representative data during development. In particular, surgical data might be +difficult to obtain, and high-quality public datasets are limited. To overcome +this limitation and augment datasets, a widely adopted solution is the +generation of synthetic images. In this work, we employ conditional diffusion +models to generate knee radiographs from contour and bone segmentations. +Remarkably, two distinct strategies are presented by incorporating the +segmentation as a condition into the sampling and training process, namely, +conditional sampling and conditional training. The results demonstrate that +both methods can generate realistic images while adhering to the conditioning +segmentation. The conditional training method outperforms the conditional +sampling method and the conventional U-Net. + +
+
+
+
+
+ + ☆ Is CLIP the main roadblock for fine-grained open-world perception? + + +
+ Modern applications increasingly demand flexible computer vision models that +adapt to novel concepts not encountered during training. This necessity is +pivotal in emerging domains like extended reality, robotics, and autonomous +driving, which require the ability to respond to open-world stimuli. A key +ingredient is the ability to identify objects based on free-form textual +queries defined at inference time - a task known as open-vocabulary object +detection. Multimodal backbones like CLIP are the main enabling technology for +current open-world perception solutions. Despite performing well on generic +queries, recent studies highlighted limitations on the fine-grained recognition +capabilities in open-vocabulary settings - i.e., for distinguishing subtle +object features like color, shape, and material. In this paper, we perform a +detailed examination of these open-vocabulary object recognition limitations to +find the root cause. We evaluate the performance of CLIP, the most commonly +used vision-language backbone, against a fine-grained object-matching +benchmark, revealing interesting analogies between the limitations of +open-vocabulary object detectors and their backbones. Experiments suggest that +the lack of fine-grained understanding is caused by the poor separability of +object characteristics in the CLIP latent space. Therefore, we try to +understand whether fine-grained knowledge is present in CLIP embeddings but not +exploited at inference time due, for example, to the unsuitability of the +cosine similarity matching function, which may discard important object +characteristics. Our preliminary experiments show that simple CLIP latent-space +re-projections help separate fine-grained concepts, paving the way towards the +development of backbones inherently able to process fine-grained details. The +code for reproducing these experiments is available at +https://github.com/lorebianchi98/FG-CLIP. + +
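+
+ The matching step questioned above is ordinary cosine similarity between an
+image embedding and candidate caption embeddings, and the latent-space
+re-projection amounts to applying a learned map before that comparison. In the
+sketch below the embeddings and the projection matrix are random placeholders
+rather than CLIP outputs or the paper's learned projection.
+
+import numpy as np
+
+def cosine_match(img_emb, text_embs, projection=None):
+    if projection is not None:                  # optional re-projection
+        img_emb = img_emb @ projection
+        text_embs = text_embs @ projection
+    img_emb = img_emb / np.linalg.norm(img_emb)
+    text_embs = text_embs / np.linalg.norm(text_embs, axis=1, keepdims=True)
+    scores = text_embs @ img_emb                # cosine similarities
+    return int(np.argmax(scores)), scores
+
+rng = np.random.default_rng(0)
+img = rng.normal(size=512)                      # stand-in image feature
+captions = rng.normal(size=(5, 512))            # caption + hard negatives
+W = rng.normal(size=(512, 128)) / np.sqrt(512)  # placeholder re-projection
+best, scores = cosine_match(img, captions, projection=W)
+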
+
+
+
+
+ + ☆ If It's Not Enough, Make It So: Reducing Authentic Data Demand in Face + Recognition through Synthetic Faces + + +
+ Recent advances in deep face recognition have spurred a growing demand for +large, diverse, and manually annotated face datasets. Acquiring authentic, +high-quality data for face recognition has proven to be a challenge, primarily +due to privacy concerns. Large face datasets are primarily sourced from +web-based images, lacking explicit user consent. In this paper, we examine +whether and how synthetic face data can be used to train effective face +recognition models with reduced reliance on authentic images, thereby +mitigating data collection concerns. First, we explored the performance gap +among recent state-of-the-art face recognition models, trained with synthetic +data only and authentic (scarce) data only. Then, we deepened our analysis by +training a state-of-the-art backbone with various combinations of synthetic and +authentic data, gaining insights into optimizing the limited use of the latter +for verification accuracy. Finally, we assessed the effectiveness of data +augmentation approaches on synthetic and authentic data, with the same goal in +mind. Our results highlighted the effectiveness of FR trained on combined +datasets, particularly when combined with appropriate augmentation techniques. + +
+
+ comment: Accepted as a full paper at FG 2024 main track +
+
+
+
+
+ + ☆ COMO: Compact Mapping and Odometry + + +
+ We present COMO, a real-time monocular mapping and odometry system that +encodes dense geometry via a compact set of 3D anchor points. Decoding anchor +point projections into dense geometry via per-keyframe depth covariance +functions guarantees that depth maps are joined together at visible anchor +points. The representation enables joint optimization of camera poses and dense +geometry, intrinsic 3D consistency, and efficient second-order inference. To +maintain a compact yet expressive map, we introduce a frontend that leverages +the covariance function for tracking and initializing potentially visually +indistinct 3D points across frames. Altogether, we introduce a real-time system +capable of estimating accurate poses and consistent geometry. + +
+
+
+
+
+ + ☆ HAPNet: Toward Superior RGB-Thermal Scene Parsing via Hybrid, + Asymmetric, and Progressive Heterogeneous Feature Fusion + + +
+ Data-fusion networks have shown significant promise for RGB-thermal scene +parsing. However, the majority of existing studies have relied on symmetric +duplex encoders for heterogeneous feature extraction and fusion, paying +inadequate attention to the inherent differences between RGB and thermal +modalities. Recent progress in vision foundation models (VFMs) trained through +self-supervision on vast amounts of unlabeled data has proven their ability to +extract informative, general-purpose features. However, this potential has yet +to be fully leveraged in the domain. In this study, we take one step toward +this new research area by exploring a feasible strategy to fully exploit VFM +features for RGB-thermal scene parsing. Specifically, we delve deeper into the +unique characteristics of RGB and thermal modalities, thereby designing a +hybrid, asymmetric encoder that incorporates both a VFM and a convolutional +neural network. This design allows for more effective extraction of +complementary heterogeneous features, which are subsequently fused in a +dual-path, progressive manner. Moreover, we introduce an auxiliary task to +further enrich the local semantics of the fused features, thereby improving the +overall performance of RGB-thermal scene parsing. Our proposed HAPNet, equipped +with all these components, demonstrates superior performance compared to all +other state-of-the-art RGB-thermal scene parsing networks, achieving top ranks +across three widely used public RGB-thermal scene parsing datasets. We believe +this new paradigm has opened up new opportunities for future developments in +data-fusion scene parsing approaches. + +
+
+ comment: 12 pages, 4 figures
+
+
+
+
+
+ + ☆ SDPose: Tokenized Pose Estimation via Circulation-Guide + Self-Distillation CVPR 2024 + + +
+ Recently, transformer-based methods have achieved state-of-the-art prediction
+quality on human pose estimation (HPE). Nonetheless, most of these
+top-performing transformer-based models are too computation-consuming and
+storage-demanding to deploy on edge computing platforms. Those
+transformer-based models that require fewer resources are prone to
+under-fitting due to their smaller scale and thus perform notably worse than
+their larger counterparts. Given this conundrum, we introduce SDPose, a new
+self-distillation method for improving the performance of small
+transformer-based models. To mitigate the problem of under-fitting, we design a
+transformer module named Multi-Cycled Transformer (MCT) based on multiple-cycled
+forwards to more fully exploit the potential of small model parameters.
+Further, in order to avoid the additional inference cost introduced by MCT, we
+introduce a self-distillation scheme, extracting the knowledge from the MCT
+module to a naive forward model. Specifically, on the MSCOCO validation
+dataset, SDPose-T obtains 69.7% mAP with 4.4M parameters and 1.8 GFLOPs.
+Furthermore, SDPose-S-V2 obtains 73.5% mAP on the MSCOCO validation dataset
+with 6.2M parameters and 4.7 GFLOPs, achieving a new state-of-the-art among
+predominant tiny neural network methods. Our code is available at
+https://github.com/MartyrPenink/SDPose.
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ DQ-DETR: DETR with Dynamic Query for Tiny Object Detection + + +
+ Despite previous DETR-like methods having performed successfully in generic +object detection, tiny object detection is still a challenging task for them +since the positional information of object queries is not customized for +detecting tiny objects, whose scale is extraordinarily smaller than general +objects. Also, DETR-like methods using a fixed number of queries make them +unsuitable for aerial datasets, which only contain tiny objects, and the +numbers of instances are imbalanced between different images. Thus, we present +a simple yet effective model, named DQ-DETR, which consists of three different +components: categorical counting module, counting-guided feature enhancement, +and dynamic query selection to solve the above-mentioned problems. DQ-DETR uses +the prediction and density maps from the categorical counting module to +dynamically adjust the number of object queries and improve the positional +information of queries. Our model DQ-DETR outperforms previous CNN-based and +DETR-like methods, achieving state-of-the-art mAP 30.2% on the AI-TOD-V2 +dataset, which mostly consists of tiny objects. + +
+
+
+
+
+ + ☆ AdaGlimpse: Active Visual Exploration with Arbitrary Glimpse Position + and Scale + + +
+ Active Visual Exploration (AVE) is a task that involves dynamically selecting +observations (glimpses), which is critical to facilitate comprehension and +navigation within an environment. While modern AVE methods have demonstrated +impressive performance, they are constrained to fixed-scale glimpses from rigid +grids. In contrast, existing mobile platforms equipped with optical zoom +capabilities can capture glimpses of arbitrary positions and scales. To address +this gap between software and hardware capabilities, we introduce AdaGlimpse. +It uses Soft Actor-Critic, a reinforcement learning algorithm tailored for +exploration tasks, to select glimpses of arbitrary position and scale. This +approach enables our model to rapidly establish a general awareness of the +environment before zooming in for detailed analysis. Experimental results +demonstrate that AdaGlimpse surpasses previous methods across various visual +tasks while maintaining greater applicability in realistic AVE scenarios. + +
+
+
+
+
+ + ☆ Towards Automated Movie Trailer Generation CVPR 2024 + + +
+ Movie trailers are an essential tool for promoting films and attracting +audiences. However, the process of creating trailers can be time-consuming and +expensive. To streamline this process, we propose an automatic trailer +generation framework that generates plausible trailers from a full movie by +automating shot selection and composition. Our approach draws inspiration from +machine translation techniques and models the movies and trailers as sequences +of shots, thus formulating the trailer generation problem as a +sequence-to-sequence task. We introduce Trailer Generation Transformer (TGT), a +deep-learning framework utilizing an encoder-decoder architecture. TGT movie +encoder is tasked with contextualizing each movie shot representation via +self-attention, while the autoregressive trailer decoder predicts the feature +representation of the next trailer shot, accounting for the relevance of shots' +temporal order in trailers. Our TGT significantly outperforms previous methods +on a comprehensive suite of metrics. + +
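+
+ Conceptually, the task above is next-shot feature prediction with an
+encoder-decoder transformer. The sketch below uses PyTorch's generic
+nn.Transformer with made-up dimensions as a stand-in; it is not the TGT
+architecture, its shot features, or its training objective.
+
+import torch
+import torch.nn as nn
+
+d_model = 256
+seq2seq = nn.Transformer(d_model=d_model, nhead=8,
+                         num_encoder_layers=2, num_decoder_layers=2,
+                         batch_first=True)
+head = nn.Linear(d_model, d_model)            # regresses the next shot feature
+
+movie_shots = torch.randn(1, 120, d_model)    # encoded movie shot sequence
+trailer_so_far = torch.randn(1, 10, d_model)  # trailer shots selected so far
+causal = nn.Transformer.generate_square_subsequent_mask(trailer_so_far.size(1))
+out = seq2seq(movie_shots, trailer_so_far, tgt_mask=causal)
+next_shot_feat = head(out[:, -1])             # predicted next trailer shot feature
+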
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Performance of computer vision algorithms for fine-grained + classification using crowdsourced insect images + + +
+ With fine-grained classification, we identify unique characteristics to
+distinguish among classes of the same super-class. We focus on species
+recognition in Insecta, as insects are critical for biodiversity monitoring and
+form the base of many ecosystems. With citizen science campaigns, billions of
+images are collected in the wild. Once these are labelled, experts can use them
+to create distribution maps. However, the labelling process is time-consuming,
+which is where computer vision comes in. The field of computer vision offers a
+wide range of algorithms, each with its strengths and weaknesses; how do we
+identify the algorithm that is in line with our application? To answer this
+question, we provide a full and detailed evaluation of nine algorithms among
+deep convolutional networks (CNN), vision transformers (ViT), and
+locality-based vision transformers (LBVT) on 4 different aspects:
+classification performance, embedding quality, computational cost, and gradient
+activity. We offer insights previously unavailable in this domain, showing to
+what extent these algorithms solve fine-grained tasks in Insecta. We found that
+the ViT performs the best on inference speed and computational cost while the
+LBVT outperforms the others on performance and embedding quality; the CNNs
+provide a trade-off among the metrics.
+
+
+
+
+
+ + ☆ You Only Scan Once: A Dynamic Scene Reconstruction Pipeline for 6-DoF + Robotic Grasping of Novel Objects ICRA 2024 + + +
+ In the realm of robotic grasping, achieving accurate and reliable
+interactions with the environment is a pivotal challenge. Traditional grasp
+planning methods that utilize partial point clouds derived from depth images
+often suffer from reduced scene understanding due to occlusion, ultimately
+impeding their grasping accuracy. Furthermore, scene reconstruction methods
+have primarily relied upon static techniques, which are susceptible to
+environment changes during the manipulation process, limiting their efficacy in
+real-time grasping tasks. To address these limitations, this paper introduces a
+novel two-stage pipeline for dynamic scene reconstruction. In the first stage,
+our approach takes scene scanning as input to register each target object with
+mesh reconstruction and novel object pose tracking. In the second stage, pose
+tracking is still performed to provide object poses in real-time, enabling our
+approach to transform the reconstructed object point clouds back into the
+scene. Unlike conventional methodologies, which rely on static scene snapshots,
+our method continuously captures the evolving scene geometry, resulting in a
+comprehensive and up-to-date point cloud representation. By circumventing the
+constraints posed by occlusion, our method enhances the overall grasp planning
+process and empowers state-of-the-art 6-DoF robotic grasping algorithms to
+exhibit markedly improved accuracy.
+
+
+ comment: ICRA 2024 +
+
+
+
+
+ + ☆ How Much Data are Enough? Investigating Dataset Requirements for + Patch-Based Brain MRI Segmentation Tasks + + +
+ Training deep neural networks reliably requires access to large-scale +datasets. However, obtaining such datasets can be challenging, especially in +the context of neuroimaging analysis tasks, where the cost associated with +image acquisition and annotation can be prohibitive. To mitigate both the time +and financial costs associated with model development, a clear understanding of +the amount of data required to train a satisfactory model is crucial. This +paper focuses on an early stage phase of deep learning research, prior to model +development, and proposes a strategic framework for estimating the amount of +annotated data required to train patch-based segmentation networks. This +framework includes the establishment of performance expectations using a novel +Minor Boundary Adjustment for Threshold (MinBAT) method, and standardizing +patch selection through the ROI-based Expanded Patch Selection (REPS) method. +Our experiments demonstrate that tasks involving regions of interest (ROIs) +with different sizes or shapes may yield variably acceptable Dice Similarity +Coefficient (DSC) scores. By setting an acceptable DSC as the target, the +required amount of training data can be estimated and even predicted as data +accumulates. This approach could assist researchers and engineers in estimating +the cost associated with data collection and annotation when defining a new +segmentation task based on deep neural networks, ultimately contributing to +their efficient translation to real-world applications. + +
+
+
+
+
+ + ☆ SP$^2$OT: Semantic-Regularized Progressive Partial Optimal Transport for + Imbalanced Clustering + + +
+ Deep clustering, which learns representations and semantic clustering without
+label information, poses a great challenge for deep learning-based approaches.
+Despite significant progress in recent years, most existing methods focus on
+uniformly distributed datasets, significantly limiting the practical
+applicability of their methods. In this paper, we propose a more practical
+problem setting named deep imbalanced clustering, where the underlying classes
+exhibit an imbalanced distribution. To address this challenge, we introduce a
+novel optimal transport-based pseudo-label learning framework. Our framework
+formulates pseudo-label generation as a Semantic-regularized Progressive
+Partial Optimal Transport (SP$^2$OT) problem, which progressively transports
+each sample to imbalanced clusters under several prior distribution and
+semantic relation constraints, thus generating high-quality and imbalance-aware
+pseudo-labels. To solve SP$^2$OT, we develop a Majorization-Minimization-based
+optimization algorithm. To be more precise, we employ the strategy of
+majorization to reformulate the SP$^2$OT problem into a Progressive Partial
+Optimal Transport problem, which can be transformed into an unbalanced optimal
+transport problem with augmented constraints and can be solved efficiently by a
+fast matrix scaling algorithm. Experiments on various datasets, including a
+human-curated long-tailed CIFAR100, challenging ImageNet-R, and large-scale
+subsets of fine-grained iNaturalist2018 datasets, demonstrate the superiority
+of our method.
+
+
+ comment: under review. arXiv admin note: substantial text overlap with + arXiv:2401.09266 +
+
+
+
+
+ + ☆ Part-Attention Based Model Make Occluded Person Re-Identification + Stronger + + +
+ The goal of occluded person re-identification (ReID) is to retrieve specific
+pedestrians in occluded situations. However, occluded person ReID still suffers
+from background clutter and low-quality local feature representations, which
+limits model performance. In our research, we introduce PAB-ReID, a novel ReID
+framework incorporating part-attention mechanisms to tackle the aforementioned
+issues effectively. Firstly, we introduce the human parsing label to guide the
+generation of more accurate human part attention maps. In addition, we propose
+a fine-grained feature focuser for generating fine-grained human local feature
+representations while suppressing background interference. Moreover, we design
+a part triplet loss to supervise the learning of human local features,
+optimizing intra- and inter-class distances. We conducted extensive experiments
+on specialized occlusion and regular ReID datasets, showcasing that our
+approach outperforms the existing state-of-the-art methods.
+
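+
+ One straightforward reading of a part triplet loss is a triplet margin loss
+applied per body part and averaged, as sketched below with PyTorch's built-in
+triplet loss. The margin and feature shapes are illustrative, and the exact
+PAB-ReID formulation may differ.
+
+import torch
+import torch.nn.functional as F
+
+def part_triplet_loss(anchor, positive, negative, margin=0.3):
+    """Each tensor has shape (batch, num_parts, dim)."""
+    losses = []
+    for p in range(anchor.size(1)):             # loop over body parts
+        losses.append(F.triplet_margin_loss(
+            anchor[:, p], positive[:, p], negative[:, p], margin=margin))
+    return torch.stack(losses).mean()
+
+a, pos, neg = (torch.randn(8, 4, 256) for _ in range(3))
+loss = part_triplet_loss(a, pos, neg)
+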
+
+ comment: Accepted By International Joint Conference on Neural Networks +
+
+
+
+
+ + ☆ ChangeMamba: Remote Sensing Change Detection with Spatio-Temporal State + Space Model + + +
+ Convolutional neural networks (CNNs) and Transformers have made impressive
+progress in the field of remote sensing change detection (CD). However, both
+architectures have their inherent shortcomings. Recently, the Mamba
+architecture, based on state space models, has shown remarkable performance in
+a series of natural language processing tasks, and it can effectively
+compensate for the shortcomings of the above two architectures. In this paper,
+we explore for the first time the potential of the Mamba architecture for
+remote sensing change detection tasks. We tailor the corresponding frameworks,
+called MambaBCD, MambaSCD, and MambaBDA, for binary change detection (BCD),
+semantic change detection (SCD), and building damage assessment (BDA),
+respectively. All three frameworks adopt the cutting-edge visual Mamba
+architecture as the encoder, which allows full learning of global spatial
+contextual information from the input images. For the change decoder, which is
+present in all three architectures, we propose three spatio-temporal
+relationship modeling mechanisms, which can be naturally combined with the
+Mamba architecture and fully utilize its attributes to achieve spatio-temporal
+interaction of multi-temporal features and obtain accurate change information.
+On five benchmark datasets, our proposed frameworks outperform current CNN- and
+Transformer-based approaches without using any complex strategies or tricks,
+fully demonstrating the potential of the Mamba architecture. Specifically, we
+obtained 83.11%, 88.39% and 94.19% F1 scores on the three BCD datasets SYSU,
+LEVIR-CD+, and WHU-CD; on the SCD dataset SECOND, we obtained 24.04% SeK; and
+on the xBD dataset, we obtained an 81.41% overall F1 score. The source code
+will be available at https://github.com/ChenHongruixuan/MambaCD
+
+
+
+
+
+
+ + ☆ Generalizable 3D Scene Reconstruction via Divide and Conquer from a + Single View + + +
+ Single-view 3D reconstruction is currently approached from two dominant
+perspectives: reconstruction of scenes with limited diversity using 3D data
+supervision or reconstruction of diverse singular objects using large image
+priors. However, real-world scenarios are far more complex and exceed the
+capabilities of these methods. We therefore propose a hybrid method following a
+divide-and-conquer strategy. We first process the scene holistically,
+extracting depth and semantic information, and then leverage a single-shot
+object-level method for the detailed reconstruction of individual components.
+By following a compositional processing approach, the overall framework
+achieves full reconstruction of complex 3D scenes from a single image. We
+purposely design our pipeline to be highly modular by carefully integrating
+specific procedures for each processing step, without requiring an end-to-end
+training of the whole system. This enables the pipeline to naturally improve as
+future methods can replace the individual modules. We demonstrate the
+reconstruction performance of our approach on both synthetic and real-world
+scenes, comparing favorably against prior works. Project page:
+https://andreeadogaru.github.io/Gen3DSR.
+
+
+
+
+
+
+ + ☆ NMF-Based Analysis of Mobile Eye-Tracking Data + + +
+ The depiction of scanpaths from mobile eye-tracking recordings by thumbnails +from the stimulus allows the application of visual computing to detect areas of +interest in an unsupervised way. We suggest using nonnegative matrix +factorization (NMF) to identify such areas in stimuli. For a user-defined +integer k, NMF produces an explainable decomposition into k components, each +consisting of a spatial representation associated with a temporal indicator. In +the context of multiple eye-tracking recordings, this leads to k spatial +representations, where the temporal indicator highlights the appearance within +recordings. The choice of k provides an opportunity to control the refinement +of the decomposition, i.e., the number of areas to detect. We combine our +NMF-based approach with visualization techniques to enable an exploratory +analysis of multiple recordings. Finally, we demonstrate the usefulness of our +approach with mobile eye-tracking data of an art gallery. + +
+
+
+
+
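+ The decomposition described above is standard nonnegative matrix
+factorization: a nonnegative data matrix X is approximated by W H with k
+components, where W holds the spatial representations and H the temporal
+indicators. A minimal scikit-learn sketch on synthetic gaze-density data
+(illustrative only; the shapes and names are ours, not from the paper):
+
+import numpy as np
+from sklearn.decomposition import NMF
+
+# Rows: flattened spatial bins of the stimulus thumbnail.
+# Columns: time windows (or recordings). Entries: fixation counts.
+rng = np.random.default_rng(0)
+X = rng.poisson(lam=1.0, size=(32 * 32, 40)).astype(float)
+
+k = 4  # user-defined number of areas of interest to extract
+model = NMF(n_components=k, init="nndsvda", max_iter=500, random_state=0)
+W = model.fit_transform(X)    # (1024, k): spatial representation per component
+H = model.components_         # (k, 40):  temporal indicator per component
+
+areas = W.reshape(32, 32, k)  # each slice is a candidate area of interest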
+ + ☆ Future Predictive Success-or-Failure Classification for Long-Horizon + Robotic Tasks IJCNN 2024 + + +
+ Automating long-horizon tasks with a robotic arm has been a central research +topic in robotics. Optimization-based action planning is an efficient approach +for creating an action plan to complete a given task. Construction of a +reliable planning method requires a design process of conditions, e.g., to +avoid collision between objects. The design process, however, has two critical +issues: 1) iterative trials--the design process is time-consuming due to the +trial-and-error process of modifying conditions, and 2) manual redesign--it is +difficult to cover all the necessary conditions manually. To tackle these +issues, this paper proposes a future-predictive +success-or-failure-classification method to obtain conditions automatically. +The key idea behind the proposed method is an end-to-end approach for +determining whether the action plan can complete a given task instead of +manually redesigning the conditions. The proposed method uses a long-horizon +future-prediction method to enable success-or-failure classification without +the execution of an action plan. This paper also proposes a regularization term +called transition consistency regularization to provide easy-to-predict feature +distribution. The regularization term improves future prediction and +classification performance. The effectiveness of our method is demonstrated +through classification and robotic-manipulation experiments. + +
+
+ comment: IJCNN 2024 +
+
+
+
+
+ + ☆ MiniGPT4-Video: Advancing Multimodal LLMs for Video Understanding with + Interleaved Visual-Textual Tokens + + +
+ This paper introduces MiniGPT4-Video, a multimodal Large Language Model (LLM)
+designed specifically for video understanding. The model is capable of
+processing both temporal visual and textual data, making it adept at
+understanding the complexities of videos. Building upon the success of
+MiniGPT-v2, which excelled in translating visual features into the LLM space
+for single images and achieved impressive results on various image-text
+benchmarks, this paper extends the model's capabilities to process a sequence
+of frames, enabling it to comprehend videos. MiniGPT4-Video not only considers
+visual content but also incorporates textual conversations, allowing the model
+to effectively answer queries involving both visual and text components. The
+proposed model outperforms existing state-of-the-art methods, registering gains
+of 4.22%, 1.13%, 20.82%, and 13.1% on the MSVD, MSRVTT, TGIF, and TVQA
+benchmarks respectively. Our models and code are publicly available at
+https://vision-cair.github.io/MiniGPT4-video/
+
+
+
+ comment: 6 pages,8 figures +
+
+
+
+
+ + ☆ AIGIQA-20K: A Large Database for AI-Generated Image Quality Assessment + + +
+ With the rapid advancements in AI-Generated Content (AIGC), AI-Generated
+Images (AIGIs) have been widely applied in entertainment, education, and social
+media. However, due to the significant variance in quality among different
+AIGIs, there is an urgent need for models that consistently match human
+subjective ratings. To address this issue, we organized a challenge on AIGC
+quality assessment at NTIRE 2024 that extensively covers 15 popular generative
+models with dynamic hyper-parameters (including classifier-free guidance,
+iteration epochs, and output image resolution), and gathered subjective scores
+from 21 subjects that comprehensively consider both perceptual quality and
+text-to-image alignment. This approach culminates in the creation of the
+largest fine-grained AIGI subjective quality database to date with 20,000 AIGIs
+and 420,000 subjective ratings, known as AIGIQA-20K. Furthermore, we conduct
+benchmark experiments on this database to assess the correspondence between 16
+mainstream AIGI quality models and human perception. We anticipate that this
+large-scale quality database will inspire robust quality indicators for AIGIs
+and propel the evolution of AIGC for vision. The database is released at
+https://www.modelscope.cn/datasets/lcysyzxdxc/AIGCQA-30K-Image.
+
+
+
+
+
+
+ + ☆ Scaling Up Video Summarization Pretraining with Large Language Models CVPR 2024 + + +
+ Long-form video content constitutes a significant portion of internet +traffic, making automated video summarization an essential research problem. +However, existing video summarization datasets are notably limited in their +size, constraining the effectiveness of state-of-the-art methods for +generalization. Our work aims to overcome this limitation by capitalizing on +the abundance of long-form videos with dense speech-to-video alignment and the +remarkable capabilities of recent large language models (LLMs) in summarizing +long text. We introduce an automated and scalable pipeline for generating a +large-scale video summarization dataset using LLMs as Oracle summarizers. By +leveraging the generated dataset, we analyze the limitations of existing +approaches and propose a new video summarization model that effectively +addresses them. To facilitate further research in the field, our work also +presents a new benchmark dataset that contains 1200 long videos each with +high-quality summaries annotated by professionals. Extensive experiments +clearly indicate that our proposed approach sets a new state-of-the-art in +video summarization across several benchmarks. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Background Noise Reduction of Attention Map for Weakly Supervised + Semantic Segmentation + + +
+ In weakly-supervised semantic segmentation (WSSS) using only image-level +class labels, a problem with CNN-based Class Activation Maps (CAM) is that they +tend to activate the most discriminative local regions of objects. On the other +hand, methods based on Transformers learn global features but suffer from the +issue of background noise contamination. This paper focuses on addressing the +issue of background noise in attention weights within the existing WSSS method +based on Conformer, known as TransCAM. The proposed method successfully reduces +background noise, leading to improved accuracy of pseudo labels. Experimental +results demonstrate that our model achieves segmentation performance of 70.5% +on the PASCAL VOC 2012 validation data, 71.1% on the test data, and 45.9% on MS +COCO 2014 data, outperforming TransCAM in terms of segmentation performance. + +
+
+
+
+
+ + ☆ Two Tricks to Improve Unsupervised Segmentation Learning + + +
+ We present two practical improvement techniques for unsupervised segmentation +learning. These techniques address limitations in the resolution and accuracy +of predicted segmentation maps of recent state-of-the-art methods. Firstly, we +leverage image post-processing techniques such as guided filtering to refine +the output masks, improving accuracy while avoiding substantial computational +costs. Secondly, we introduce a multi-scale consistency criterion, based on a +teacher-student training scheme. This criterion matches segmentation masks +predicted from regions of the input image extracted at different resolutions to +each other. Experimental results on several benchmarks used in unsupervised +segmentation learning demonstrate the effectiveness of our proposed techniques. + +
+
+
+
+
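+ A rough PyTorch sketch of the multi-scale consistency criterion described
+above: a student prediction on a low-resolution crop is matched to a teacher
+prediction on the same region at full resolution. This is our own hedged
+illustration, not the authors' implementation; `student` and `teacher` are
+assumed segmentation networks returning per-pixel logits.
+
+import torch
+import torch.nn.functional as F
+
+def multiscale_consistency_loss(student, teacher, image, crop_size=128):
+    """Match masks predicted from the same region at different resolutions."""
+    with torch.no_grad():
+        full_mask = teacher(image)                       # (B, C, H, W) logits
+    _, _, H, W = image.shape
+    top, left = (H - crop_size) // 2, (W - crop_size) // 2
+    crop = image[:, :, top:top + crop_size, left:left + crop_size]
+    small = F.interpolate(crop, scale_factor=0.5, mode="bilinear",
+                          align_corners=False)
+    student_mask = student(small)
+    # Bring both predictions to the crop resolution and compare them.
+    s = F.interpolate(student_mask, size=(crop_size, crop_size),
+                      mode="bilinear", align_corners=False)
+    t = full_mask[:, :, top:top + crop_size, left:left + crop_size]
+    return F.mse_loss(s.softmax(dim=1), t.softmax(dim=1))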
+ + ☆ LongVLM: Efficient Long Video Understanding via Large Language Models + + +
+ Empowered by Large Language Models (LLMs), recent advancements in VideoLLMs +have driven progress in various video understanding tasks. These models encode +video representations through pooling or query aggregation over a vast number +of visual tokens, making computational and memory costs affordable. Despite +successfully providing an overall comprehension of video content, existing +VideoLLMs still face challenges in achieving detailed understanding in videos +due to overlooking local information in long-term videos. To tackle this +challenge, we introduce LongVLM, a straightforward yet powerful VideoLLM for +long video understanding, building upon the observation that long videos often +consist of sequential key events, complex actions, and camera movements. Our +approach proposes to decompose long videos into multiple short-term segments +and encode local features for each local segment via a hierarchical token +merging module. These features are concatenated in temporal order to maintain +the storyline across sequential short-term segments. Additionally, we propose +to integrate global semantics into each local feature to enhance context +understanding. In this way, we encode video representations that incorporate +both local and global information, enabling the LLM to generate comprehensive +responses for long-term videos. Experimental results on the VideoChatGPT +benchmark and zero-shot video question-answering datasets demonstrate the +superior capabilities of our model over the previous state-of-the-art methods. +Qualitative examples demonstrate that our model produces more precise responses +for long videos understanding. Code is available at +\url{https://github.com/ziplab/LongVLM}. + +
+
+
+
+
+ + ☆ VF-NeRF: Viewshed Fields for Rigid NeRF Registration + + +
+ 3D scene registration is a fundamental problem in computer vision that seeks
+the best 6-DoF alignment between two scenes. This problem has been extensively
+investigated in the case of point clouds and meshes, but there has been
+relatively limited work regarding Neural Radiance Fields (NeRF). In this paper,
+we consider the problem of rigid registration between two NeRFs when the
+position of the original cameras is not given. Our key novelty is the
+introduction of Viewshed Fields (VF), an implicit function that determines, for
+each 3D point, how likely it is to be viewed by the original cameras. We
+demonstrate how VF can help in the various stages of NeRF registration, with an
+extensive evaluation showing that VF-NeRF achieves SOTA results on various
+datasets with different capturing approaches such as LLFF and Objaverse.
+
+
+
+
+
+
+ + ☆ Meta Invariance Defense Towards Generalizable Robustness to Unknown + Adversarial Attacks + + +
+ Despite providing high-performance solutions for computer vision tasks, deep
+neural network (DNN) models have been proven to be extremely vulnerable to
+adversarial attacks. Current defenses mainly focus on known attacks, while
+adversarial robustness to unknown attacks is seriously overlooked. Besides, the
+commonly used adaptive learning and fine-tuning techniques are unsuitable for
+adversarial defense, since defense is essentially a zero-shot problem when
+deployed. Thus, to tackle this challenge, we propose an attack-agnostic defense
+method named Meta Invariance Defense (MID). Specifically, various combinations
+of adversarial attacks are randomly sampled from a manually constructed
+Attacker Pool to constitute different defense tasks against unknown attacks, in
+which a student encoder is supervised by multi-consistency distillation to
+learn the attack-invariant features via a meta principle. The proposed MID has
+two merits: 1) Full distillation from pixel-, feature- and prediction-level
+between benign and adversarial samples facilitates the discovery of
+attack-invariance. 2) The model simultaneously achieves robustness to the
+imperceptible adversarial perturbations in high-level image classification and
+attack-suppression in low-level robust image regeneration. Theoretical and
+empirical studies on numerous benchmarks such as ImageNet verify the
+generalizable robustness and superiority of MID under various attacks.
+
+
+
+ comment: Accepted by IEEE TPAMI in 2024 +
+
+
+
+
+ + ☆ DI-Retinex: Digital-Imaging Retinex Theory for Low-Light Image + Enhancement + + +
+ Many existing methods for low-light image enhancement (LLIE) based on Retinex
+theory ignore important factors that affect the validity of this theory in
+digital imaging, such as noise, quantization error, non-linearity, and dynamic
+range overflow. In this paper, we propose a new expression called
+Digital-Imaging Retinex theory (DI-Retinex) through theoretical and
+experimental analysis of Retinex theory in digital imaging. Our new expression
+includes an offset term in the enhancement model, which allows for pixel-wise
+brightness contrast adjustment with a non-linear mapping function. In addition,
+to solve the low-light enhancement problem in an unsupervised manner, we
+propose an image-adaptive masked reverse degradation loss in Gamma space. We
+also design a variance suppression loss for regulating the additional offset
+term. Extensive experiments show that our proposed method outperforms all
+existing unsupervised methods in terms of visual quality, model size, and
+speed. Our algorithm can also assist downstream face detectors in low-light
+conditions, as it yields the largest performance gain after enhancement among
+the compared methods.
+
+
+
+
+
+
+ + ☆ Sparse Concept Bottleneck Models: Gumbel Tricks in Contrastive Learning + + +
+ We propose a novel architecture and method of explainable classification with
+Concept Bottleneck Models (CBMs). While SOTA approaches to the image
+classification task work as black boxes, there is a growing demand for models
+that would provide interpreted results. Such models often learn to predict the
+distribution over class labels using an additional description of the target
+instances, called concepts. However, existing Bottleneck methods have a number
+of limitations: their accuracy is lower than that of a standard model, and CBMs
+require an additional set of concepts to leverage. We provide a framework for
+creating a Concept Bottleneck Model from a pre-trained multi-modal encoder and
+new CLIP-like architectures. By introducing a new type of layer known as
+Concept Bottleneck Layers, we outline three methods for training them: with an
+$\ell_1$ loss, with a contrastive loss, and with a loss function based on the
+Gumbel-Softmax distribution (Sparse-CBM), while the final FC layer is still
+trained with cross-entropy. We show a significant increase in accuracy when
+using sparse hidden layers in CLIP-based bottleneck models, which means that a
+sparse representation of the concept activation vector is meaningful in Concept
+Bottleneck Models. Moreover, with our Concept Matrix Search algorithm we can
+improve CLIP predictions on complex datasets without any additional training or
+fine-tuning. The code is available at: https://github.com/Andron00e/SparseCBM.
+
+
+
+ comment: 23 pages, 1 algorithm, 36 figures +
+
+
+
+
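+ A minimal PyTorch sketch of the Sparse-CBM idea described above: a concept
+bottleneck layer whose activations are made sparse with a straight-through
+Gumbel-Softmax, followed by a linear head trained with cross-entropy. This is
+our own illustration under assumed shapes and names, not the released
+SparseCBM code.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class SparseConceptBottleneck(nn.Module):
+    """Image embedding -> sparse concept activations -> class logits."""
+    def __init__(self, embed_dim, n_concepts, n_classes, tau=1.0):
+        super().__init__()
+        self.concept_layer = nn.Linear(embed_dim, n_concepts)
+        self.classifier = nn.Linear(n_concepts, n_classes)
+        self.tau = tau
+
+    def forward(self, image_emb):
+        concept_logits = self.concept_layer(image_emb)
+        # Straight-through Gumbel-Softmax gives a (nearly) one-hot, hence
+        # sparse, concept activation vector while staying differentiable.
+        concepts = F.gumbel_softmax(concept_logits, tau=self.tau, hard=True)
+        return self.classifier(concepts), concepts
+
+model = SparseConceptBottleneck(embed_dim=512, n_concepts=64, n_classes=10)
+logits, concepts = model(torch.randn(4, 512))        # e.g. CLIP image embeddings
+loss = F.cross_entropy(logits, torch.randint(0, 10, (4,)))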
+ + ☆ AdaBM: On-the-Fly Adaptive Bit Mapping for Image Super-Resolution CVPR 2024 + + +
+ Although the image super-resolution (SR) problem has achieved unprecedented
+restoration accuracy with deep neural networks, its versatile application is
+still limited by substantial computational costs. Since different input images
+for SR face different restoration difficulties, adapting computational costs
+based on the input image, referred to as adaptive inference, has emerged as a
+promising solution to compress SR networks. Specifically, adapting the
+quantization bit-widths has successfully reduced the inference and memory cost
+without sacrificing the accuracy. However, despite the benefits of the
+resultant adaptive network, existing works rely on time-intensive
+quantization-aware training with full access to the original training pairs to
+learn the appropriate bit allocation policies, which limits their ubiquitous
+usage. To this end, we introduce the first on-the-fly adaptive quantization
+framework that accelerates the processing time from hours to seconds. We
+formulate the bit allocation problem with only two bit mapping modules: one to
+map the input image to the image-wise bit adaptation factor and one to obtain
+the layer-wise adaptation factors. These bit mappings are calibrated and
+fine-tuned using only a small number of calibration images. We achieve
+competitive performance with the previous adaptive quantization methods, while
+the processing time is accelerated by x2000. Codes are available at
+https://github.com/Cheeun/AdaBM.
+
+
+
+ comment: CVPR 2024 +
+
+
+
+
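+ To make the "bit mapping" idea above concrete, here is a deliberately
+simplified sketch: a uniform fake-quantizer plus an image-wise bit selection
+driven by a crude complexity measure, with thresholds that would be calibrated
+on a handful of images. This is our own toy illustration, not the AdaBM modules
+from the linked repository.
+
+import torch
+
+def quantize(x, bits):
+    """Uniform symmetric fake-quantization of a tensor to `bits` bits."""
+    qmax = 2 ** (bits - 1) - 1
+    scale = x.abs().max().clamp(min=1e-8) / qmax
+    return torch.clamp(torch.round(x / scale), -qmax - 1, qmax) * scale
+
+def image_to_bits(image, thresholds=(0.05, 0.15), bit_choices=(4, 6, 8)):
+    """Map an image to a bit-width from a simple complexity measure (mean
+    gradient magnitude); the thresholds stand in for calibrated mappings."""
+    dx = (image[..., :, 1:] - image[..., :, :-1]).abs().mean()
+    dy = (image[..., 1:, :] - image[..., :-1, :]).abs().mean()
+    complexity = (dx + dy) / 2
+    if complexity < thresholds[0]:
+        return bit_choices[0]
+    if complexity < thresholds[1]:
+        return bit_choices[1]
+    return bit_choices[2]
+
+img = torch.rand(3, 64, 64)
+bits = image_to_bits(img)                       # image-wise bit-width
+q_act = quantize(torch.randn(1, 16, 64, 64), bits)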
+ + ☆ Design and Development of a Framework For Stroke-Based Handwritten + Gujarati Font Generation + + +
+ Handwritten font generation is important for preserving cultural heritage and +creating personalized designs. It adds an authentic and expressive touch to +printed materials, making them visually appealing and establishing a stronger +connection with the audience. This paper aims to design a framework for +generating handwritten fonts in the Gujarati script, mimicking the variation of +human handwriting. The proposed font generation model consists of a learning +phase and a generation phase. In the learning phase, Gujarati scripts are +analyzed, and rules for designing each character are formulated. This ruleset +involves the concatenation of strokes in a stroke-based manner, ensuring visual +consistency in the resulting glyphs. The generation phase involves the user +providing a small subset of characters, and the system automatically generates +the remaining character glyphs based on extracted strokes and learned rules, +resulting in handwritten Gujarati fonts. The resulting character glyphs are +converted into an open-type font using the FontForge tool, making them +compatible with any Gujarati editor. Both subjective and objective evaluations +are conducted to assess the synthesized images and fonts. Subjective evaluation +through user studies provides feedback on quality and visual appeal, achieving +an overall accuracy of 84.84%. Notably, eleven characters demonstrated a +success ratio above 90%. Objective evaluation using an existing recognition +system achieves an overall accuracy of 84.28% in OCR evaluation. Notably, +fifteen characters had a success ratio of 80% or higher. + +
+
+ comment: 13 pages, 2 column, 12 figures +
+
+
+
+
+ + ☆ Multi Positive Contrastive Learning with Pose-Consistent Generated + Images + + +
+ Model pre-training has become essential in various recognition tasks.
+Meanwhile, with the remarkable advancements in image generation models,
+pre-training methods utilizing generated images have also emerged given their
+ability to produce unlimited training data. However, while existing methods
+utilizing generated images excel in classification, they fall short in more
+practical tasks, such as human pose estimation. In this paper, we demonstrate
+this gap experimentally and propose generating visually distinct images with
+identical human poses. We then propose a novel multi-positive contrastive
+learning scheme, which optimally utilizes the generated images to learn
+structural features of the human body. We term the entire learning pipeline
+GenPoCCL. Despite using less than 1% of the data required by the current
+state-of-the-art method, GenPoCCL captures structural features of the human
+body more effectively, surpassing existing methods in a variety of
+human-centric perception tasks.
+
+
+
+
+
+
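+ A generic multi-positive InfoNCE loss in PyTorch, where all images generated
+from the same pose are treated as mutual positives and the rest of the batch as
+negatives. This is a hedged sketch of the general technique, not the GenPoCCL
+loss itself; names and shapes are ours.
+
+import torch
+import torch.nn.functional as F
+
+def multi_positive_info_nce(features, pose_ids, temperature=0.1):
+    """InfoNCE with multiple positives: samples sharing a pose id are
+    mutual positives; everything else in the batch is a negative."""
+    z = F.normalize(features, dim=1)                  # (B, D)
+    sim = z @ z.t() / temperature                     # (B, B)
+    B = z.size(0)
+    eye = torch.eye(B, dtype=torch.bool, device=z.device)
+    pos_mask = (pose_ids[:, None] == pose_ids[None, :]) & ~eye
+    logits = sim.masked_fill(eye, float("-inf"))      # drop self-similarity
+    log_prob = logits - torch.logsumexp(logits, dim=1, keepdim=True)
+    loss = -(log_prob * pos_mask).sum(1) / pos_mask.sum(1).clamp(min=1)
+    return loss.mean()
+
+feats = torch.randn(8, 128)
+poses = torch.tensor([0, 0, 1, 1, 2, 2, 3, 3])        # shared-pose groups
+print(multi_positive_info_nce(feats, poses))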
+ + ☆ A dataset of primary nasopharyngeal carcinoma MRI with multi-modalities + segmentation + + +
+ Multi-modality magnetic resonance imaging data with various sequences +facilitate the early diagnosis, tumor segmentation, and disease staging in the +management of nasopharyngeal carcinoma (NPC). The lack of publicly available, +comprehensive datasets limits advancements in diagnosis, treatment planning, +and the development of machine learning algorithms for NPC. Addressing this +critical need, we introduce the first comprehensive NPC MRI dataset, +encompassing MR axial imaging of 277 primary NPC patients. This dataset +includes T1-weighted, T2-weighted, and contrast-enhanced T1-weighted sequences, +totaling 831 scans. In addition to the corresponding clinical data, manually +annotated and labeled segmentations by experienced radiologists offer +high-quality data resources from untreated primary NPC. + +
+
+
+
+
+ + ☆ Real-time Noise Source Estimation of a Camera System from an Image and + Metadata + + +
+ Autonomous machines must self-maintain proper functionality to ensure the
+safety of humans and themselves. This pertains particularly to their cameras,
+the predominant sensors used to perceive the environment and support actions. A
+fundamental camera problem addressed in this study is noise. Solutions often
+focus on denoising images a posteriori, that is, fighting symptoms rather than
+root causes. However, tackling root causes requires identifying the noise
+sources, considering the limitations of mobile platforms. This work
+investigates a real-time, memory-efficient and reliable noise source estimator
+that combines data- and physically-based models. To this end, a DNN that
+examines an image with camera metadata for major camera noise sources is built
+and trained. In addition, it quantifies unexpected factors that impact image
+noise or metadata. This study investigates seven different estimators on six
+datasets that include synthetic noise, real-world noise from two camera
+systems, and real field campaigns. Across these, only the model with the most
+metadata is capable of accurately and robustly quantifying all individual noise
+contributions. This method outperforms total image noise estimators and can be
+deployed plug-and-play. It also serves as a basis to include more advanced
+noise sources, or as part of an automatic countermeasure feedback-loop to
+approach fully reliable machines.
+
+
+
+ comment: 16 pages, 16 figures, 12 tables, Project page: + https://github.com/MaikWischow/Noise-Source-Estimation +
+
+
+
+
+ + ☆ Learning Transferable Negative Prompts for Out-of-Distribution Detection CVPR 2024 + + +
+ Existing prompt learning methods have shown certain capabilities in +Out-of-Distribution (OOD) detection, but the lack of OOD images in the target +dataset in their training can lead to mismatches between OOD images and +In-Distribution (ID) categories, resulting in a high false positive rate. To +address this issue, we introduce a novel OOD detection method, named +'NegPrompt', to learn a set of negative prompts, each representing a negative +connotation of a given class label, for delineating the boundaries between ID +and OOD images. It learns such negative prompts with ID data only, without any +reliance on external outlier data. Further, current methods assume the +availability of samples of all ID classes, rendering them ineffective in +open-vocabulary learning scenarios where the inference stage can contain novel +ID classes not present during training. In contrast, our learned negative +prompts are transferable to novel class labels. Experiments on various ImageNet +benchmarks show that NegPrompt surpasses state-of-the-art prompt-learning-based +OOD detection methods and maintains a consistent lead in hard OOD detection in +closed- and open-vocabulary classification scenarios. Code is available at +https://github.com/mala-lab/negprompt. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Would Deep Generative Models Amplify Bias in Future Models? CVPR 2024 + + +
+ We investigate the impact of deep generative models on potential social +biases in upcoming computer vision models. As the internet witnesses an +increasing influx of AI-generated images, concerns arise regarding inherent +biases that may accompany them, potentially leading to the dissemination of +harmful content. This paper explores whether a detrimental feedback loop, +resulting in bias amplification, would occur if generated images were used as +the training data for future models. We conduct simulations by progressively +substituting original images in COCO and CC3M datasets with images generated +through Stable Diffusion. The modified datasets are used to train OpenCLIP and +image captioning models, which we evaluate in terms of quality and bias. +Contrary to expectations, our findings indicate that introducing generated +images during training does not uniformly amplify bias. Instead, instances of +bias mitigation across specific tasks are observed. We further explore the +factors that may influence these phenomena, such as artifacts in image +generation (e.g., blurry faces) or pre-existing biases in the original +datasets. + +
+
+ comment: This paper has been accepted to CVPR 2024 +
+
+
+
+
+ + ☆ FACTUAL: A Novel Framework for Contrastive Learning Based Robust SAR + Image Classification + + +
+ Deep Learning (DL) Models for Synthetic Aperture Radar (SAR) Automatic Target +Recognition (ATR), while delivering improved performance, have been shown to be +quite vulnerable to adversarial attacks. Existing works improve robustness by +training models on adversarial samples. However, by focusing mostly on attacks +that manipulate images randomly, they neglect the real-world feasibility of +such attacks. In this paper, we propose FACTUAL, a novel Contrastive Learning +framework for Adversarial Training and robust SAR classification. FACTUAL +consists of two components: (1) Differing from existing works, a novel +perturbation scheme that incorporates realistic physical adversarial attacks +(such as OTSA) to build a supervised adversarial pre-training network. This +network utilizes class labels for clustering clean and perturbed images +together into a more informative feature space. (2) A linear classifier +cascaded after the encoder to use the computed representations to predict the +target labels. By pre-training and fine-tuning our model on both clean and +adversarial samples, we show that our model achieves high prediction accuracy +on both cases. Our model achieves 99.7% accuracy on clean samples, and 89.6% on +perturbed samples, both outperforming previous state-of-the-art methods. + +
+
+ comment: 2024 IEEE Radar Conference +
+
+
+
+
+ + ☆ iSeg: Interactive 3D Segmentation via Interactive Attention + + +
+ We present iSeg, a new interactive technique for segmenting 3D shapes. +Previous works have focused mainly on leveraging pre-trained 2D foundation +models for 3D segmentation based on text. However, text may be insufficient for +accurately describing fine-grained spatial segmentations. Moreover, achieving a +consistent 3D segmentation using a 2D model is challenging since occluded areas +of the same semantic region may not be visible together from any 2D view. Thus, +we design a segmentation method conditioned on fine user clicks, which operates +entirely in 3D. Our system accepts user clicks directly on the shape's surface, +indicating the inclusion or exclusion of regions from the desired shape +partition. To accommodate various click settings, we propose a novel +interactive attention module capable of processing different numbers and types +of clicks, enabling the training of a single unified interactive segmentation +model. We apply iSeg to a myriad of shapes from different domains, +demonstrating its versatility and faithfulness to the user's specifications. +Our project page is at https://threedle.github.io/iSeg/. + +
+
+ comment: Project page: https://threedle.github.io/iSeg/ +
+
+
+
+
+ + ☆ LeGrad: An Explainability Method for Vision Transformers via Feature + Formation Sensitivity + + +
+ Vision Transformers (ViTs), with their ability to model long-range
+dependencies through self-attention mechanisms, have become a standard
+architecture in computer vision. However, the interpretability of these models
+remains a challenge. To address this, we propose LeGrad, an explainability
+method specifically designed for ViTs. LeGrad computes the gradient with
+respect to the attention maps of ViT layers, considering the gradient itself as
+the explainability signal. We aggregate the signal over all layers, combining
+the activations of the last as well as intermediate tokens to produce the
+merged explainability map. This makes LeGrad a conceptually simple and
+easy-to-implement tool for enhancing the transparency of ViTs. We evaluate
+LeGrad in challenging segmentation, perturbation, and open-vocabulary settings,
+showcasing its versatility compared to other SotA explainability methods and
+demonstrating its superior spatial fidelity and robustness to perturbations. A
+demo and the code are available at https://github.com/WalBouss/LeGrad.
+
+
+
+ comment: Code available at https://github.com/WalBouss/LeGrad +
+
+
+
+
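+ A rough PyTorch sketch of the core idea described above: use the gradient of
+a class score with respect to retained attention tensors as the relevance
+signal, then aggregate over heads and layers. This is our own simplified
+illustration under assumed tensor shapes; the official implementation is at the
+repository linked in the abstract.
+
+import torch
+
+def attention_gradient_map(attn_maps, class_score):
+    """attn_maps: list of attention tensors (heads, tokens, tokens) that are
+    part of the graph of `class_score`. Returns one merged relevance map over
+    patch tokens built from the gradients w.r.t. the attention maps."""
+    grads = torch.autograd.grad(class_score, attn_maps, retain_graph=True)
+    layer_maps = []
+    for g in grads:
+        g = g.clamp(min=0)                     # keep positive influence only
+        cls_to_patch = g.mean(dim=0)[0, 1:]    # CLS row, patch columns
+        layer_maps.append(cls_to_patch)
+    return torch.stack(layer_maps).mean(dim=0) # aggregate over layers
+
+# Toy usage: fake attention tensors and a score that depends on them.
+attn = [torch.rand(4, 5, 5, requires_grad=True) for _ in range(2)]
+score = sum(a.sum() for a in attn) * 0.5
+heat = attention_gradient_map(attn, score)     # relevance per patch token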
+ + ☆ HDR Imaging for Dynamic Scenes with Events + + +
+ High dynamic range imaging (HDRI) for real-world dynamic scenes is +challenging because moving objects may lead to hybrid degradation of low +dynamic range and motion blur. Existing event-based approaches only focus on a +separate task, while cascading HDRI and motion deblurring would lead to +sub-optimal solutions, and unavailable ground-truth sharp HDR images aggravate +the predicament. To address these challenges, we propose an Event-based HDRI +framework within a Self-supervised learning paradigm, i.e., Self-EHDRI, which +generalizes HDRI performance in real-world dynamic scenarios. Specifically, a +self-supervised learning strategy is carried out by learning cross-domain +conversions from blurry LDR images to sharp LDR images, which enables sharp HDR +images to be accessible in the intermediate process even though ground-truth +sharp HDR images are missing. Then, we formulate the event-based HDRI and +motion deblurring model and conduct a unified network to recover the +intermediate sharp HDR results, where both the high dynamic range and high +temporal resolution of events are leveraged simultaneously for compensation. We +construct large-scale synthetic and real-world datasets to evaluate the +effectiveness of our method. Comprehensive experiments demonstrate that the +proposed Self-EHDRI outperforms state-of-the-art approaches by a large margin. +The codes, datasets, and results are available at +https://lxp-whu.github.io/Self-EHDRI. + +
+
+
+
+
+ + ☆ OmniGS: Omnidirectional Gaussian Splatting for Fast Radiance Field + Reconstruction using Omnidirectional Images IROS 2024 + + +
+ Photorealistic reconstruction relying on 3D Gaussian Splatting has shown +promising potential in robotics. However, the current 3D Gaussian Splatting +system only supports radiance field reconstruction using undistorted +perspective images. In this paper, we present OmniGS, a novel omnidirectional +Gaussian splatting system, to take advantage of omnidirectional images for fast +radiance field reconstruction. Specifically, we conduct a theoretical analysis +of spherical camera model derivatives in 3D Gaussian Splatting. According to +the derivatives, we then implement a new GPU-accelerated omnidirectional +rasterizer that directly splats 3D Gaussians onto the equirectangular screen +space for omnidirectional image rendering. As a result, we realize +differentiable optimization of the radiance field without the requirement of +cube-map rectification or tangent-plane approximation. Extensive experiments +conducted in egocentric and roaming scenarios demonstrate that our method +achieves state-of-the-art reconstruction quality and high rendering speed using +omnidirectional images. To benefit the research community, the code will be +made publicly available once the paper is published. + +
+
+ comment: IROS 2024 submission, 7 pages, 4 figures +
+
+
+
+
+ + ☆ Future-Proofing Class Incremental Learning + + +
+ Exemplar-Free Class Incremental Learning is a highly challenging setting
+where replay memory is unavailable. Methods relying on frozen feature
+extractors have drawn attention recently in this setting due to their
+impressive performances and lower computational costs. However, those methods
+are highly dependent on the data used to train the feature extractor and may
+struggle when an insufficient number of classes is available during the first
+incremental step. To overcome this limitation, we propose to use a pre-trained
+text-to-image diffusion model in order to generate synthetic images of future
+classes and use them to train the feature extractor. Experiments on the
+standard benchmarks CIFAR100 and ImageNet-Subset demonstrate that our proposed
+method can be used to improve state-of-the-art methods for exemplar-free class
+incremental learning, especially in the most difficult settings where the first
+incremental step contains only a few classes. Moreover, we show that using
+synthetic samples of future classes achieves higher performance than using real
+data from different classes, paving the way for better and less costly
+pre-training methods for incremental learning.
+
+
+
+
+
+
+ + ☆ CORP: A Multi-Modal Dataset for Campus-Oriented Roadside Perception + Tasks + + +
+ Numerous roadside perception datasets have been introduced to propel
+advancements in autonomous driving and intelligent transportation systems
+research and development. However, it has been observed that the majority of
+them concentrate on urban arterial roads, inadvertently overlooking residential
+areas such as parks and campuses, which exhibit entirely distinct
+characteristics. In light of this gap, we propose CORP, which stands as the
+first public benchmark dataset tailored for multi-modal roadside perception
+tasks under campus scenarios. Collected on a university campus, CORP consists
+of over 205k images plus 102k point clouds captured from 18 cameras and 9 LiDAR
+sensors. These sensors with different configurations are mounted on roadside
+utility poles to provide diverse viewpoints within the campus region. The
+annotations of CORP encompass multi-dimensional information beyond 2D and 3D
+bounding boxes, providing extra support for 3D seamless tracking and instance
+segmentation with unique IDs and pixel masks for identifying targets, to
+enhance the understanding of objects and their behaviors distributed across the
+campus premises. Unlike other roadside datasets about urban traffic, CORP
+extends the spectrum to highlight the challenges for multi-modal perception in
+campuses and other residential areas.
+
+
+
+
+
+
+ + ☆ Adaptive Discrete Disparity Volume for Self-supervised Monocular Depth + Estimation + + +
+ In self-supervised monocular depth estimation tasks, discrete disparity +prediction has been proven to attain higher quality depth maps than common +continuous methods. However, current discretization strategies often divide +depth ranges of scenes into bins in a handcrafted and rigid manner, limiting +model performance. In this paper, we propose a learnable module, Adaptive +Discrete Disparity Volume (ADDV), which is capable of dynamically sensing depth +distributions in different RGB images and generating adaptive bins for them. +Without any extra supervision, this module can be integrated into existing CNN +architectures, allowing networks to produce representative values for bins and +a probability volume over them. Furthermore, we introduce novel training +strategies - uniformizing and sharpening - through a loss term and temperature +parameter, respectively, to provide regularizations under self-supervised +conditions, preventing model degradation or collapse. Empirical results +demonstrate that ADDV effectively processes global information, generating +appropriate bins for various scenes and producing higher quality depth maps +compared to handcrafted methods. + +
+
+
+
+
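+ The usual way such a probability volume over (adaptive) bins becomes a
+disparity map is a softmax-weighted expectation over the predicted bin centers.
+A short PyTorch sketch of that step (our own illustration of the general
+mechanism, not the ADDV module; names and shapes are assumptions):
+
+import torch
+
+def disparity_from_bins(bin_logits, bin_centers):
+    """bin_logits: (B, K, H, W) per-pixel scores over K disparity bins;
+    bin_centers: (B, K) adaptive bin centers predicted per image.
+    Returns (B, 1, H, W) expected disparity."""
+    probs = torch.softmax(bin_logits, dim=1)          # probability volume
+    centers = bin_centers[:, :, None, None]           # (B, K, 1, 1)
+    return (probs * centers).sum(dim=1, keepdim=True)
+
+logits = torch.randn(2, 32, 48, 160)
+centers = torch.linspace(0.01, 1.0, 32).repeat(2, 1)  # stand-in for adaptive bins
+disparity = disparity_from_bins(logits, centers)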
+ + ☆ Classification of Nasopharyngeal Cases using DenseNet Deep Learning + Architecture + + +
+ Nasopharyngeal carcinoma (NPC) is one of the understudied yet deadliest +cancers in South East Asia. In Malaysia, the prevalence is identified mainly in +Sarawak, among the ethnic of Bidayuh. NPC is often late-diagnosed because it is +asymptomatic at the early stage. There are several tissue representations from +the nasopharynx biopsy, such as nasopharyngeal inflammation (NPI), lymphoid +hyperplasia (LHP), nasopharyngeal carcinoma (NPC) and normal tissue. This paper +is our first initiative to identify the difference between NPC, NPI and normal +cases. Seven whole slide images (WSIs) with gigapixel resolutions from seven +different patients and two hospitals were experimented with using two test +setups, consisting of a different set of images. The tissue regions are patched +into smaller blocks and classified using DenseNet architecture with 21 dense +layers. Two tests are carried out, each for proof of concept (Test 1) and +real-test scenario (Test 2). The accuracy achieved for NPC class is 94.8% for +Test 1 and 67.0% for Test 2. + +
+
+ comment: This article has been accepted in the Journal of Engineering Science + and Technology (JESTEC) and awaiting publication +
+
+
+
+
+ + ☆ AGL-NET: Aerial-Ground Cross-Modal Global Localization with Varying + Scales + + +
+ We present AGL-NET, a novel learning-based method for global localization +using LiDAR point clouds and satellite maps. AGL-NET tackles two critical +challenges: bridging the representation gap between image and points modalities +for robust feature matching, and handling inherent scale discrepancies between +global view and local view. To address these challenges, AGL-NET leverages a +unified network architecture with a novel two-stage matching design. The first +stage extracts informative neural features directly from raw sensor data and +performs initial feature matching. The second stage refines this matching +process by extracting informative skeleton features and incorporating a novel +scale alignment step to rectify scale variations between LiDAR and map data. +Furthermore, a novel scale and skeleton loss function guides the network toward +learning scale-invariant feature representations, eliminating the need for +pre-processing satellite maps. This significantly improves real-world +applicability in scenarios with unknown map scales. To facilitate rigorous +performance evaluation, we introduce a meticulously designed dataset within the +CARLA simulator specifically tailored for metric localization training and +assessment. The code and dataset will be made publicly available. + +
+
+
+
+
+ + ☆ BodyMAP -- Jointly Predicting Body Mesh and 3D Applied Pressure Map for + People in Bed CVPR 2024 + + +
+ Accurately predicting the 3D human posture and the pressure exerted on the +body for people resting in bed, visualized as a body mesh (3D pose & shape) +with a 3D pressure map, holds significant promise for healthcare applications, +particularly, in the prevention of pressure ulcers. Current methods focus on +singular facets of the problem -- predicting only 2D/3D poses, generating 2D +pressure images, predicting pressure only for certain body regions instead of +the full body, or forming indirect approximations to the 3D pressure map. In +contrast, we introduce BodyMAP, which jointly predicts the human body mesh and +3D applied pressure map across the entire human body. Our network leverages +multiple visual modalities, incorporating both a depth image of a person in bed +and its corresponding 2D pressure image acquired from a pressure-sensing +mattress. The 3D pressure map is represented as a pressure value at each mesh +vertex and thus allows for precise localization of high-pressure regions on the +body. Additionally, we present BodyMAP-WS, a new formulation of pressure +prediction in which we implicitly learn pressure in 3D by aligning sensed 2D +pressure images with a differentiable 2D projection of the predicted 3D +pressure maps. In evaluations with real-world human data, our method +outperforms the current state-of-the-art technique by 25% on both body mesh and +3D applied pressure map prediction tasks for people in bed. + +
+
+ comment: Accepted at CVPR 2024 Project Website: https://bodymap3d.github.io/ + Code: https://github.com/RCHI-Lab/BodyMAP +
+
+
+
+
+ + ☆ MonoCD: Monocular 3D Object Detection with Complementary Depths CVPR 2024 + + +
+ Monocular 3D object detection has attracted widespread attention due to its +potential to accurately obtain object 3D localization from a single image at a +low cost. Depth estimation is an essential but challenging subtask of monocular +3D object detection due to the ill-posedness of 2D to 3D mapping. Many methods +explore multiple local depth clues such as object heights and keypoints and +then formulate the object depth estimation as an ensemble of multiple depth +predictions to mitigate the insufficiency of single-depth information. However, +the errors of existing multiple depths tend to have the same sign, which +hinders them from neutralizing each other and limits the overall accuracy of +combined depth. To alleviate this problem, we propose to increase the +complementarity of depths with two novel designs. First, we add a new depth +prediction branch named complementary depth that utilizes global and efficient +depth clues from the entire image rather than the local clues to reduce the +correlation of depth predictions. Second, we propose to fully exploit the +geometric relations between multiple depth clues to achieve complementarity in +form. Benefiting from these designs, our method achieves higher +complementarity. Experiments on the KITTI benchmark demonstrate that our method +achieves state-of-the-art performance without introducing extra data. In +addition, complementary depth can also be a lightweight and plug-and-play +module to boost multiple existing monocular 3d object detectors. Code is +available at https://github.com/elvintanhust/MonoCD. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ UniAV: Unified Audio-Visual Perception for Multi-Task Video Localization + + +
+ Video localization tasks aim to temporally locate specific instances in +videos, including temporal action localization (TAL), sound event detection +(SED) and audio-visual event localization (AVEL). Existing methods +over-specialize on each task, overlooking the fact that these instances often +occur in the same video to form the complete video content. In this work, we +present UniAV, a Unified Audio-Visual perception network, to achieve joint +learning of TAL, SED and AVEL tasks for the first time. UniAV can leverage +diverse data available in task-specific datasets, allowing the model to learn +and share mutually beneficial knowledge across tasks and modalities. To tackle +the challenges posed by substantial variations in datasets +(size/domain/duration) and distinct task characteristics, we propose to +uniformly encode visual and audio modalities of all videos to derive generic +representations, while also designing task-specific experts to capture unique +knowledge for each task. Besides, we develop a unified language-aware +classifier by utilizing a pre-trained text encoder, enabling the model to +flexibly detect various types of instances and previously unseen ones by simply +changing prompts during inference. UniAV outperforms its single-task +counterparts by a large margin with fewer parameters, achieving on-par or +superior performances compared to state-of-the-art task-specific methods across +ActivityNet 1.3, DESED and UnAV-100 benchmarks. + +
+
+
+
+
+ + ☆ BioVL-QR: Egocentric Biochemical Video-and-Language Dataset Using Micro + QR Codes + + +
+ This paper introduces a biochemical vision-and-language dataset, which
+consists of 24 egocentric experiment videos, corresponding protocols, and
+video-and-language alignments. The key challenge in the wet-lab domain is that
+detecting equipment, reagents, and containers is difficult because the lab
+environment is cluttered with objects on the table and some objects are
+indistinguishable. Therefore, previous studies assume that objects are manually
+annotated and given for downstream tasks, but this is costly and
+time-consuming. To address this issue, this study focuses on Micro QR Codes to
+detect objects automatically. From our preliminary study, we found that
+detecting objects using only Micro QR Codes is still difficult because the
+researchers manipulate objects, frequently causing blur and occlusion. To
+address this, we also propose a novel object labeling method that combines a
+Micro QR Code detector and an off-the-shelf hand object detector. As one of the
+applications of our dataset, we conduct the task of generating protocols from
+experiment videos and find that our approach can generate accurate protocols.
+
+
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ HandDiff: 3D Hand Pose Estimation with Diffusion on Image-Point Cloud + + +
+ Extracting keypoint locations from input hand frames, known as 3D hand pose +estimation, is a critical task in various human-computer interaction +applications. Essentially, the 3D hand pose estimation can be regarded as a 3D +point subset generative problem conditioned on input frames. Thanks to the +recent significant progress on diffusion-based generative models, hand pose +estimation can also benefit from the diffusion model to estimate keypoint +locations with high quality. However, directly deploying the existing diffusion +models to solve hand pose estimation is non-trivial, since they cannot achieve +the complex permutation mapping and precise localization. Based on this +motivation, this paper proposes HandDiff, a diffusion-based hand pose +estimation model that iteratively denoises accurate hand pose conditioned on +hand-shaped image-point clouds. In order to recover keypoint permutation and +accurate location, we further introduce joint-wise condition and local detail +condition. Experimental results demonstrate that the proposed HandDiff +significantly outperforms the existing approaches on four challenging hand pose +benchmark datasets. Codes and pre-trained models are publicly available at +https://github.com/cwc1260/HandDiff. + +
+
+ comment: Accepted as a conference paper to the Conference on Computer Vision + and Pattern Recognition (2024) +
+
+
+
+
+ + ☆ DreamWalk: Style Space Exploration using Diffusion Guidance + + +
+ Text-conditioned diffusion models can generate impressive images, but fall +short when it comes to fine-grained control. Unlike direct-editing tools like +Photoshop, text conditioned models require the artist to perform "prompt +engineering," constructing special text sentences to control the style or +amount of a particular subject present in the output image. Our goal is to +provide fine-grained control over the style and substance specified by the +prompt, for example to adjust the intensity of styles in different regions of +the image (Figure 1). Our approach is to decompose the text prompt into +conceptual elements, and apply a separate guidance term for each element in a +single diffusion process. We introduce guidance scale functions to control when +in the diffusion process and \emph{where} in the image to intervene. Since the +method is based solely on adjusting diffusion guidance, it does not require +fine-tuning or manipulating the internal layers of the diffusion model's neural +network, and can be used in conjunction with LoRA- or DreamBooth-trained models +(Figure2). Project page: https://mshu1.github.io/dreamwalk.github.io/ + +
+
+
+
+
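+ The per-element guidance described above amounts to composing one noise
+prediction as eps = eps_uncond + sum_i s_i * (eps_i - eps_uncond), where each
+prompt element gets its own scale that may vary over diffusion steps and image
+regions. A hedged PyTorch sketch of that combination step (names and shapes
+are ours, not from the paper):
+
+import torch
+
+def composed_guidance(eps_uncond, eps_conds, scales):
+    """Combine noise predictions for several prompt elements with separate
+    guidance scales. Scales may be scalars or spatial maps (broadcastable),
+    so guidance strength can differ per region and per step."""
+    eps = eps_uncond.clone()
+    for eps_c, s in zip(eps_conds, scales):
+        eps = eps + s * (eps_c - eps_uncond)
+    return eps
+
+# Toy shapes for a (1, 4, 64, 64) latent and two prompt elements.
+e_u = torch.randn(1, 4, 64, 64)
+e_style, e_subject = torch.randn_like(e_u), torch.randn_like(e_u)
+left_half = torch.zeros(1, 1, 64, 64); left_half[..., :32] = 1.0
+eps = composed_guidance(e_u, [e_style, e_subject], [3.0 * left_half, 5.0])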
+ + ☆ Diverse and Tailored Image Generation for Zero-shot Multi-label + Classification + + +
+ Recently, zero-shot multi-label classification has garnered considerable +attention for its capacity to operate predictions on unseen labels without +human annotations. Nevertheless, prevailing approaches often use seen classes +as imperfect proxies for unseen ones, resulting in suboptimal performance. +Drawing inspiration from the success of text-to-image generation models in +producing realistic images, we propose an innovative solution: generating +synthetic data to construct a training set explicitly tailored for proxyless +training on unseen labels. Our approach introduces a novel image generation +framework that produces multi-label synthetic images of unseen classes for +classifier training. To enhance diversity in the generated images, we leverage +a pre-trained large language model to generate diverse prompts. Employing a +pre-trained multi-modal CLIP model as a discriminator, we assess whether the +generated images accurately represent the target classes. This enables +automatic filtering of inaccurately generated images, preserving classifier +accuracy. To refine text prompts for more precise and effective multi-label +object generation, we introduce a CLIP score-based discriminative loss to +fine-tune the text encoder in the diffusion model. Additionally, to enhance +visual features on the target task while maintaining the generalization of +original features and mitigating catastrophic forgetting resulting from +fine-tuning the entire visual encoder, we propose a feature fusion module +inspired by transformer attention mechanisms. This module aids in capturing +global dependencies between multiple objects more effectively. Extensive +experimental results validate the effectiveness of our approach, demonstrating +significant improvements over state-of-the-art methods. + +
+
+
+
+
+ + ☆ Discontinuity-preserving Normal Integration with Auxiliary Edges CVPR 2024 + + +
+ Many surface reconstruction methods incorporate normal integration, which is +a process to obtain a depth map from surface gradients. In this process, the +input may represent a surface with discontinuities, e.g., due to +self-occlusion. To reconstruct an accurate depth map from the input normal map, +hidden surface gradients occurring from the jumps must be handled. To model +these jumps correctly, we design a novel discretization scheme for the domain +of normal integration. Our key idea is to introduce auxiliary edges, which +bridge between piecewise-smooth patches in the domain so that the magnitude of +hidden jumps can be explicitly expressed. Using the auxiliary edges, we design +a novel algorithm to optimize the discontinuity and the depth map from the +input normal map. Our method optimizes discontinuities by using a combination +of iterative re-weighted least squares and iterative filtering of the jump +magnitudes on auxiliary edges to provide strong sparsity regularization. +Compared to previous discontinuity-preserving normal integration methods, which +model the magnitudes of jumps only implicitly, our method reconstructs subtle +discontinuities accurately thanks to our explicit representation of jumps +allowing for strong sparsity regularization. + +
+
+ comment: To appear at CVPR 2024. For supplementary video, see + https://youtu.be/MTTcW5kAOFE +
+
+
+
+
+ + ☆ GaSpCT: Gaussian Splatting for Novel CT Projection View Synthesis MICCAI 2024 + + +
+ We present GaSpCT, a novel view synthesis and 3D scene representation method +used to generate novel projection views for Computer Tomography (CT) scans. We +adapt the Gaussian Splatting framework to enable novel view synthesis in CT +based on limited sets of 2D image projections and without the need for +Structure from Motion (SfM) methodologies. Therefore, we reduce the total +scanning duration and the amount of radiation dose the patient receives during +the scan. We adapted the loss function to our use-case by encouraging a +stronger background and foreground distinction using two sparsity promoting +regularizers: a beta loss and a total variation (TV) loss. Finally, we +initialize the Gaussian locations across the 3D space using a uniform prior +distribution of where the brain's positioning would be expected to be within +the field of view. We evaluate the performance of our model using brain CT +scans from the Parkinson's Progression Markers Initiative (PPMI) dataset and +demonstrate that the rendered novel views closely match the original projection +views of the simulated scan, and have better performance than other implicit 3D +scene representations methodologies. Furthermore, we empirically observe +reduced training time compared to neural network based image synthesis for +sparse-view CT image reconstruction. Finally, the memory requirements of the +Gaussian Splatting representations are reduced by 17% compared to the +equivalent voxel grid image representations. + +
+
+ comment: Under Review Process for MICCAI 2024 +
+
+
+
+
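+ The two sparsity-promoting regularizers mentioned above are standard building
+blocks; minimal PyTorch versions are sketched below. This is our own
+illustration of a total-variation loss and a Beta-prior-style opacity loss, not
+the GaSpCT code, and the weighting in the usage line is made up.
+
+import torch
+
+def total_variation_loss(img):
+    """Anisotropic TV on a (B, C, H, W) rendering."""
+    dh = (img[..., 1:, :] - img[..., :-1, :]).abs().mean()
+    dw = (img[..., :, 1:] - img[..., :, :-1]).abs().mean()
+    return dh + dw
+
+def beta_sparsity_loss(opacity, eps=1e-6):
+    """Beta(0.5, 0.5)-style prior: minimizing log(o) + log(1 - o) pushes
+    opacities toward 0 or 1, sharpening background/foreground separation."""
+    o = opacity.clamp(eps, 1 - eps)
+    return (torch.log(o) + torch.log(1 - o)).mean()
+
+render = torch.rand(1, 1, 128, 128)   # simulated projection rendering
+opacity = torch.rand(1000)            # per-Gaussian opacities
+reg = 0.1 * total_variation_loss(render) + 0.01 * beta_sparsity_loss(opacity)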
+ + ☆ PARIS3D: Reasoning-based 3D Part Segmentation Using Large Multimodal + Model + + +
+ Recent advancements in 3D perception systems have significantly improved +their ability to perform visual recognition tasks such as segmentation. +However, these systems still heavily rely on explicit human instruction to +identify target objects or categories, lacking the capability to actively +reason and comprehend implicit user intentions. We introduce a novel +segmentation task known as reasoning part segmentation for 3D objects, aiming +to output a segmentation mask based on complex and implicit textual queries +about specific parts of a 3D object. To facilitate evaluation and benchmarking, +we present a large 3D dataset comprising over 60k instructions paired with +corresponding ground-truth part segmentation annotations specifically curated +for reasoning-based 3D part segmentation. We propose a model that is capable of +segmenting parts of 3D objects based on implicit textual queries and generating +natural language explanations corresponding to 3D object segmentation requests. +Experiments show that our method achieves competitive performance to models +that use explicit queries, with the additional abilities to identify part +concepts, reason about them, and complement them with world knowledge. Our +source code, dataset, and trained models are available at +https://github.com/AmrinKareem/PARIS3D. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ SleepVST: Sleep Staging from Near-Infrared Video Signals using + Pre-Trained Transformers CVPR 2024 + + +
+ Advances in camera-based physiological monitoring have enabled the robust, +non-contact measurement of respiration and the cardiac pulse, which are known +to be indicative of the sleep stage. This has led to research into camera-based +sleep monitoring as a promising alternative to "gold-standard" polysomnography, +which is cumbersome, expensive to administer, and hence unsuitable for +longer-term clinical studies. In this paper, we introduce SleepVST, a +transformer model which enables state-of-the-art performance in camera-based +sleep stage classification (sleep staging). After pre-training on contact +sensor data, SleepVST outperforms existing methods for cardio-respiratory sleep +staging on the SHHS and MESA datasets, achieving total Cohen's kappa scores of +0.75 and 0.77 respectively. We then show that SleepVST can be successfully +transferred to cardio-respiratory waveforms extracted from video, enabling +fully contact-free sleep staging. Using a video dataset of 50 nights, we +achieve a total accuracy of 78.8\% and a Cohen's $\kappa$ of 0.71 in four-class +video-based sleep staging, setting a new state-of-the-art in the domain. + +
+
+ comment: CVPR 2024 Highlight Paper +
+
+
+
+
+ + ☆ Effective Lymph Nodes Detection in CT Scans Using Location Debiased + Query Selection and Contrastive Query Representation in Transformer + + +
+ Lymph node (LN) assessment is a critical, indispensable, yet very challenging
+task in the routine clinical workflow of radiology and oncology. Accurate LN
+analysis is essential for cancer diagnosis, staging, and treatment planning.
+Finding scattered, low-contrast, clinically relevant LNs in 3D CT is difficult
+even for experienced physicians under high inter-observer variation. Previous
+automatic LN detection works typically yield limited recall and high false
+positives (FPs) due to adjacent anatomies with similar image intensities,
+shapes, or textures (vessels, muscles, esophagus, etc.). In this work, we
+propose a new LN DEtection TRansformer, named LN-DETR, to achieve more
+accurate performance. Besides enhancing the 2D backbone with multi-scale 2.5D
+feature fusion to incorporate 3D context explicitly, we make two main
+contributions to improve the representation quality of LN queries. 1)
+Considering that LN boundaries are often unclear, an IoU prediction head and a
+location-debiased query selection are proposed to select LN queries of higher
+localization accuracy as the initialization of the decoder queries. 2) To
+reduce FPs, query contrastive learning is employed to explicitly push LN
+queries towards their best-matched ground-truth queries and away from
+unmatched query predictions. Trained and tested on 3D CT scans of 1067
+patients (with 10,000+ labeled LNs) by combining seven LN datasets from
+different body parts (neck, chest, and abdomen) and pathologies/cancers, our
+method significantly improves over previous leading methods by >4-5% average
+recall at the same FP rates in both internal and external testing. We further
+evaluate on the universal lesion detection task using the NIH DeepLesion
+benchmark, where our method achieves the top reported performance of 88.46%
+average recall across 0.5 to 4 FPs per image.
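+ A minimal PyTorch sketch of what a location-debiased query selection could
+look like, ranking encoder proposals by a blend of classification score and
+predicted IoU; the function name, tensor shapes, and the blending exponent
+alpha are our assumptions rather than the paper's implementation.
+
+import torch
+
+def select_queries(cls_logits, iou_logits, encoder_feats, k=300, alpha=0.5):
+    """Pick top-k encoder proposals as decoder query initializations.
+
+    cls_logits:    (B, N, C) class logits per proposal
+    iou_logits:    (B, N, 1) predicted localization quality per proposal
+    encoder_feats: (B, N, D) proposal features
+    Ranking by cls^(1-alpha) * iou^alpha instead of class score alone biases
+    selection toward well-localized, possibly low-contrast candidates.
+    """
+    cls_score = cls_logits.sigmoid().max(dim=-1).values          # (B, N)
+    iou = iou_logits.sigmoid().squeeze(-1)                       # (B, N)
+    quality = (cls_score.clamp_min(1e-6) ** (1 - alpha)
+               * iou.clamp_min(1e-6) ** alpha)
+    topk = quality.topk(k, dim=1).indices                        # (B, k)
+    batch_idx = torch.arange(encoder_feats.size(0)).unsqueeze(1)
+    return encoder_feats[batch_idx, topk], topk                  # (B, k, D)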
+
+ comment: Technical report +
+
+
+
+
+ + ☆ Language-Guided Instance-Aware Domain-Adaptive Panoptic Segmentation + + +
+ The increasing relevance of panoptic segmentation is tied to the advancements +in autonomous driving and AR/VR applications. However, the deployment of such +models has been limited due to the expensive nature of dense data annotation, +giving rise to unsupervised domain adaptation (UDA). A key challenge in +panoptic UDA is reducing the domain gap between a labeled source and an +unlabeled target domain while harmonizing the subtasks of semantic and instance +segmentation to limit catastrophic interference. While considerable progress +has been achieved, existing approaches mainly focus on the adaptation of +semantic segmentation. In this work, we focus on incorporating instance-level +adaptation via a novel instance-aware cross-domain mixing strategy IMix. IMix +significantly enhances the panoptic quality by improving instance segmentation +performance. Specifically, we propose inserting high-confidence predicted +instances from the target domain onto source images, retaining the +exhaustiveness of the resulting pseudo-labels while reducing the injected +confirmation bias. Nevertheless, such an enhancement comes at the cost of +degraded semantic performance, attributed to catastrophic forgetting. To +mitigate this issue, we regularize our semantic branch by employing CLIP-based +domain alignment (CDA), exploiting the domain-robustness of natural language +prompts. Finally, we present an end-to-end model incorporating these two +mechanisms called LIDAPS, achieving state-of-the-art results on all popular +panoptic UDA benchmarks. + +
+
+
+
+
+ + ☆ Quantifying Uncertainty in Motion Prediction with Variational Bayesian + Mixture CVPR 2024 + + +
+ Safety and robustness are crucial factors in developing trustworthy +autonomous vehicles. One essential aspect of addressing these factors is to +equip vehicles with the capability to predict future trajectories for all +moving objects in the surroundings and quantify prediction uncertainties. In +this paper, we propose the Sequential Neural Variational Agent (SeNeVA), a +generative model that describes the distribution of future trajectories for a +single moving object. Our approach can distinguish Out-of-Distribution data +while quantifying uncertainty and achieving competitive performance compared to +state-of-the-art methods on the Argoverse 2 and INTERACTION datasets. +Specifically, a 0.446 meters minimum Final Displacement Error, a 0.203 meters +minimum Average Displacement Error, and a 5.35% Miss Rate are achieved on the +INTERACTION test set. Extensive qualitative and quantitative analysis is also +provided to evaluate the proposed model. Our open-source code is available at +https://github.com/PurdueDigitalTwin/seneva. + +
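+ The reported minimum displacement errors and miss rate are instances of the
+standard multi-hypothesis forecasting metrics, which can be computed as in the
+short sketch below (the 2 m endpoint threshold is illustrative, not
+necessarily the benchmark's exact setting).
+
+import numpy as np
+
+def min_ade_fde(pred, gt):
+    """pred: (K, T, 2) sampled future trajectories; gt: (T, 2) ground truth.
+
+    Returns (minADE, minFDE): the average / final displacement error of the
+    best of the K hypotheses.
+    """
+    err = np.linalg.norm(pred - gt[None], axis=-1)   # (K, T) per-step distance
+    ade = err.mean(axis=1)                           # average over time steps
+    fde = err[:, -1]                                 # final step only
+    return ade.min(), fde.min()
+
+def miss_rate(preds, gts, thresh=2.0):
+    # A sample counts as a miss if even the best hypothesis ends farther than
+    # `thresh` metres from the ground-truth endpoint.
+    misses = [min_ade_fde(p, g)[1] > thresh for p, g in zip(preds, gts)]
+    return float(np.mean(misses))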
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Layerwise Early Stopping for Test Time Adaptation + + +
+ Test Time Adaptation (TTA) addresses the problem of distribution shift by +enabling pretrained models to learn new features on an unseen domain at test +time. However, it poses a significant challenge to maintain a balance between +learning new features and retaining useful pretrained features. In this paper, +we propose Layerwise EArly STopping (LEAST) for TTA to address this problem. +The key idea is to stop adapting individual layers during TTA if the features +being learned do not appear beneficial for the new domain. For that purpose, we +propose using a novel gradient-based metric to measure the relevance of the +current learnt features to the new domain without the need for supervised +labels. More specifically, we propose to use this metric to determine +dynamically when to stop updating each layer during TTA. This enables a more +balanced adaptation, restricted to layers benefiting from it, and only for a +certain number of steps. Such an approach also has the added effect of limiting +the forgetting of pretrained features useful for dealing with new domains. +Through extensive experiments, we demonstrate that Layerwise Early Stopping +improves the performance of existing TTA approaches across multiple datasets, +domain shifts, model architectures, and TTA losses. + +
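+ A hedged sketch of how such a per-layer gate could be wired into a TTA loop:
+here the relevance metric is assumed to be the cosine similarity between the
+current gradient and a running average of past gradients, which is only a
+stand-in for the paper's gradient-based metric.
+
+import torch
+import torch.nn.functional as F
+
+@torch.no_grad()
+def update_layer_gates(model, gates, momentum=0.9, thresh=0.1):
+    """Call between loss.backward() and optimizer.step() during TTA.
+
+    Tracks an EMA of each layer's gradient direction; once the current
+    gradient stops agreeing with it (low cosine similarity), the layer's
+    gradients are dropped so it is no longer adapted.
+    """
+    for name, module in model.named_modules():
+        params = [p for p in module.parameters(recurse=False) if p.grad is not None]
+        if not params:
+            continue
+        g = torch.cat([p.grad.flatten() for p in params])
+        state = gates.setdefault(name, {"ema": None, "active": True})
+        if not state["active"]:
+            for p in params:
+                p.grad = None                     # layer is early-stopped
+            continue
+        if state["ema"] is None:
+            state["ema"] = g.clone()              # warm-up the running direction
+            continue
+        cos = F.cosine_similarity(g, state["ema"], dim=0)
+        state["ema"] = momentum * state["ema"] + (1 - momentum) * g
+        if cos < thresh:
+            state["active"] = False               # stop adapting this layer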
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ Flattening the Parent Bias: Hierarchical Semantic Segmentation in the + Poincaré Ball + + +
+ Hierarchy is a natural representation of semantic taxonomies, including the +ones routinely used in image segmentation. Indeed, recent work on semantic +segmentation reports improved accuracy from supervised training leveraging +hierarchical label structures. Encouraged by these results, we revisit the +fundamental assumptions behind that work. We postulate and then empirically +verify that the reasons for the observed improvement in segmentation accuracy +may be entirely unrelated to the use of the semantic hierarchy. To demonstrate +this, we design a range of cross-domain experiments with a representative +hierarchical approach. We find that on the new testing domains, a flat +(non-hierarchical) segmentation network, in which the parents are inferred from +the children, has superior segmentation accuracy to the hierarchical approach +across the board. Complementing these findings and inspired by the intrinsic +properties of hyperbolic spaces, we study a more principled approach to +hierarchical segmentation using the Poincar\'e ball model. The hyperbolic +representation largely outperforms the previous (Euclidean) hierarchical +approach as well and is on par with our flat Euclidean baseline in terms of +segmentation accuracy. However, it additionally exhibits surprisingly strong +calibration quality of the parent nodes in the semantic hierarchy, especially +on the more challenging domains. Our combined analysis suggests that the +established practice of hierarchical segmentation may be limited to in-domain +settings, whereas flat classifiers generalize substantially better, especially +if they are modeled in the hyperbolic space. + +
+
+
+
+
+ + ☆ Data Science for Geographic Information Systems + + +
+ The integration of data science into Geographic Information Systems (GIS) has +facilitated the evolution of these tools into complete spatial analysis +platforms. The adoption of machine learning and big data techniques has +equipped these platforms with the capacity to handle larger amounts of +increasingly complex data, transcending the limitations of more traditional +approaches. This work traces the historical and technical evolution of data +science and GIS as fields of study, highlighting the critical points of +convergence between domains, and underlining the many sectors that rely on this +integration. A GIS application is presented as a case study in the disaster +management sector where we utilize aerial data from Tr\'oia, Portugal, to +emphasize the process of insight extraction from raw data. We conclude by +outlining prospects for future research in integration of these fields in +general, and the developed application in particular. + +
+
+
+
+
+ + ☆ Test Time Training for Industrial Anomaly Segmentation CVPR + + +
+ Anomaly Detection and Segmentation (AD&S) is crucial for industrial quality +control. While existing methods excel in generating anomaly scores for each +pixel, practical applications require producing a binary segmentation to +identify anomalies. Due to the absence of labeled anomalies in many real +scenarios, standard practices binarize these maps based on some statistics +derived from a validation set containing only nominal samples, resulting in +poor segmentation performance. This paper addresses this problem by proposing a +test time training strategy to improve the segmentation performance. Indeed, at +test time, we can extract rich features directly from anomalous samples to +train a classifier that can discriminate defects effectively. Our general +approach can work downstream to any AD&S method that provides an anomaly score +map as output, even in multimodal settings. We demonstrate the effectiveness of +our approach over baselines through extensive experimentation and evaluation on +MVTec AD and MVTec 3D-AD. + +
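+ A minimal sketch of the test-time idea, assuming per-pixel features and an
+anomaly score map from any upstream AD&S method: the most and least anomalous
+pixels serve as pseudo-labels for a lightweight classifier fitted on the test
+sample itself (the quantile thresholds and the logistic-regression choice are
+illustrative).
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+def test_time_segment(features, score_map, lo_q=0.5, hi_q=0.995):
+    """features: (H, W, D) per-pixel features; score_map: (H, W) anomaly scores.
+
+    Pixels with very high scores act as pseudo-defect samples and pixels below
+    the median as pseudo-normal samples; a linear classifier trained on them at
+    test time then re-labels every pixel.
+    """
+    f = features.reshape(-1, features.shape[-1])
+    s = score_map.ravel()
+    lo, hi = np.quantile(s, [lo_q, hi_q])
+    pos, neg = np.where(s >= hi)[0], np.where(s <= lo)[0]
+    if len(pos) < 10:                       # not enough evidence of a defect
+        return np.zeros_like(score_map, dtype=bool)
+    X = np.concatenate([f[pos], f[neg]])
+    y = np.concatenate([np.ones(len(pos)), np.zeros(len(neg))])
+    clf = LogisticRegression(max_iter=1000).fit(X, y)
+    return clf.predict(f).reshape(score_map.shape).astype(bool)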
+
+ comment: Accepted at VAND 2.0, CVPRW 2024 +
+
+
+
+
+ + ☆ SC4D: Sparse-Controlled Video-to-4D Generation and Motion Transfer + + +
+ Recent advances in 2D/3D generative models enable the generation of dynamic
+3D objects from a single-view video. Existing approaches utilize score
+distillation sampling to form the dynamic scene as a dynamic NeRF or dense 3D
+Gaussians. However, these methods struggle to strike a balance among
+reference-view alignment, spatio-temporal consistency, and motion fidelity
+under single-view conditions due to the implicit nature of NeRF or the
+intricacy of dense Gaussian motion prediction. To address these issues, this
+paper proposes an efficient, sparse-controlled video-to-4D framework named
+SC4D that decouples motion and appearance to achieve superior video-to-4D
+generation. Moreover, we introduce Adaptive Gaussian (AG) initialization and a
+Gaussian Alignment (GA) loss to mitigate the shape degeneration issue,
+ensuring the fidelity of the learned motion and shape. Comprehensive
+experimental results demonstrate that our method surpasses existing methods in
+both quality and efficiency. In addition, facilitated by SC4D's disentangled
+modeling of motion and appearance, we devise a novel application that
+seamlessly transfers the learned motion onto a diverse array of 4D entities
+according to textual descriptions.
+
+ comment: Project Page: https://sc4d.github.io/ +
+
+
+
+
+ + ☆ No "Zero-Shot" Without Exponential Data: Pretraining Concept Frequency + Determines Multimodal Model Performance ICLR'24 + + +
+ Web-crawled pretraining datasets underlie the impressive "zero-shot" +evaluation performance of multimodal models, such as CLIP for +classification/retrieval and Stable-Diffusion for image generation. However, it +is unclear how meaningful the notion of "zero-shot" generalization is for such +multimodal models, as it is not known to what extent their pretraining datasets +encompass the downstream concepts targeted for during "zero-shot" evaluation. +In this work, we ask: How is the performance of multimodal models on downstream +concepts influenced by the frequency of these concepts in their pretraining +datasets? We comprehensively investigate this question across 34 models and +five standard pretraining datasets (CC-3M, CC-12M, YFCC-15M, LAION-400M, +LAION-Aesthetics), generating over 300GB of data artifacts. We consistently +find that, far from exhibiting "zero-shot" generalization, multimodal models +require exponentially more data to achieve linear improvements in downstream +"zero-shot" performance, following a sample inefficient log-linear scaling +trend. This trend persists even when controlling for sample-level similarity +between pretraining and downstream datasets, and testing on purely synthetic +data distributions. Furthermore, upon benchmarking models on long-tailed data +sampled based on our analysis, we demonstrate that multimodal models across the +board perform poorly. We contribute this long-tail test set as the "Let it +Wag!" benchmark to further research in this direction. Taken together, our +study reveals an exponential need for training data which implies that the key +to "zero-shot" generalization capabilities under large-scale training paradigms +remains to be found. + +
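+ The log-linear trend described above can be visualized by regressing
+zero-shot accuracy on the log of pretraining concept frequency; a small
+illustrative fit (with made-up example numbers in the comment), not the
+authors' analysis code.
+
+import numpy as np
+
+def fit_log_linear(concept_freq, zero_shot_acc):
+    """Fit acc ~ a * log10(freq) + b and report slope, intercept, and R^2.
+
+    A linear fit under a log-scaled x-axis means each additional unit of
+    accuracy requires multiplying, not adding to, the concept frequency.
+    """
+    x = np.log10(np.asarray(concept_freq, dtype=float))
+    y = np.asarray(zero_shot_acc, dtype=float)
+    a, b = np.polyfit(x, y, deg=1)
+    pred = a * x + b
+    r2 = 1 - ((y - pred) ** 2).sum() / ((y - y.mean()) ** 2).sum()
+    return a, b, r2
+
+# e.g. fit_log_linear([1e2, 1e3, 1e4, 1e5], [0.12, 0.21, 0.30, 0.41])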
+
+ comment: Extended version of the short paper accepted at DPFM, ICLR'24 +
+
+
+
+
+ + ☆ Explaining Explainability: Understanding Concept Activation Vectors + + +
+ Recent interpretability methods propose using concept-based explanations to +translate the internal representations of deep learning models into a language +that humans are familiar with: concepts. This requires understanding which +concepts are present in the representation space of a neural network. One +popular method for finding concepts is Concept Activation Vectors (CAVs), which +are learnt using a probe dataset of concept exemplars. In this work, we +investigate three properties of CAVs. CAVs may be: (1) inconsistent between +layers, (2) entangled with different concepts, and (3) spatially dependent. +Each property provides both challenges and opportunities in interpreting +models. We introduce tools designed to detect the presence of these properties, +provide insight into how they affect the derived explanations, and provide +recommendations to minimise their impact. Understanding these properties can be +used to our advantage. For example, we introduce spatially dependent CAVs to +test if a model is translation invariant with respect to a specific concept and +class. Our experiments are performed on ImageNet and a new synthetic dataset, +Elements. Elements is designed to capture a known ground truth relationship +between concepts and classes. We release this dataset to facilitate further +research in understanding and evaluating interpretability methods. + +
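+ For reference, a CAV is commonly obtained as the normal of a linear probe
+separating concept exemplars from random images in a layer's activation space;
+the sketch below assumes precomputed activations and is not the paper's
+tooling.
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+def compute_cav(concept_acts, random_acts):
+    """Learn a Concept Activation Vector from layer activations.
+
+    concept_acts / random_acts: (N, D) activations of concept exemplars and of
+    random probe images at the chosen layer. The CAV is the normalized normal
+    of the separating hyperplane.
+    """
+    X = np.concatenate([concept_acts, random_acts])
+    y = np.concatenate([np.ones(len(concept_acts)), np.zeros(len(random_acts))])
+    probe = LogisticRegression(max_iter=1000).fit(X, y)
+    cav = probe.coef_.ravel()
+    return cav / np.linalg.norm(cav)
+
+def conceptual_sensitivity(grad_of_logit_wrt_acts, cav):
+    # TCAV-style directional derivative: positive values mean that moving the
+    # activation toward the concept increases the class logit.
+    return grad_of_logit_wrt_acts @ cav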
+
+ comment: (54 pages, 39 figures) +
+
+
+
+
+ + ☆ Cross-Modality Gait Recognition: Bridging LiDAR and Camera Modalities + for Human Identification + + +
+ Current gait recognition research mainly focuses on identifying pedestrians +captured by the same type of sensor, neglecting the fact that individuals may +be captured by different sensors in order to adapt to various environments. A +more practical approach should involve cross-modality matching across different +sensors. Hence, this paper focuses on investigating the problem of +cross-modality gait recognition, with the objective of accurately identifying +pedestrians across diverse vision sensors. We present CrossGait inspired by the +feature alignment strategy, capable of cross retrieving diverse data +modalities. Specifically, we investigate the cross-modality recognition task by +initially extracting features within each modality and subsequently aligning +these features across modalities. To further enhance the cross-modality +performance, we propose a Prototypical Modality-shared Attention Module that +learns modality-shared features from two modality-specific features. +Additionally, we design a Cross-modality Feature Adapter that transforms the +learned modality-specific features into a unified feature space. Extensive +experiments conducted on the SUSTech1K dataset demonstrate the effectiveness of +CrossGait: (1) it exhibits promising cross-modality ability in retrieving +pedestrians across various modalities from different sensors in diverse scenes, +and (2) CrossGait not only learns modality-shared features for cross-modality +gait recognition but also maintains modality-specific features for +single-modality recognition. + +
+
+
+
+
+ + ☆ Mitigating analytical variability in fMRI results with style transfer + + +
+ We propose a novel approach to improve the reproducibility of neuroimaging +results by converting statistic maps across different functional MRI pipelines. +We make the assumption that pipelines can be considered as a style component of +data and propose to use different generative models, among which, Diffusion +Models (DM) to convert data between pipelines. We design a new DM-based +unsupervised multi-domain image-to-image transition framework and constrain the +generation of 3D fMRI statistic maps using the latent space of an auxiliary +classifier that distinguishes statistic maps from different pipelines. We +extend traditional sampling techniques used in DM to improve the transition +performance. Our experiments demonstrate that our proposed methods are +successful: pipelines can indeed be transferred, providing an important source +of data augmentation for future medical studies. + +
+
+
+
+
+ + ♻ ☆ $CrowdDiff$: Multi-hypothesis Crowd Density Estimation using Diffusion + Models CVPR'24 + + +
+ Crowd counting is a fundamental problem in crowd analysis, typically
+accomplished by estimating a crowd density map and summing over the density
+values. However, this approach suffers from background noise accumulation and
+loss of density due to the use of broad Gaussian kernels to create the ground
+truth density maps. This issue can be overcome by narrowing the Gaussian
+kernel; however, existing approaches perform poorly when trained with such
+narrow-kernel ground truth density maps. To deal with this limitation, we
+propose using conditional diffusion models to predict density maps, as
+diffusion models show high fidelity to training data during generation. With
+that, we present $CrowdDiff$, which generates the crowd density map as a
+reverse diffusion process. Furthermore, as the intermediate time steps of the
+diffusion process are noisy, we incorporate a regression branch for direct
+crowd estimation only during training to improve the feature learning. In
+addition, owing to the stochastic nature of the diffusion model, and in
+contrast to existing crowd counting pipelines, we produce multiple density
+maps to improve the counting performance. We conduct extensive experiments on
+publicly available datasets to validate the effectiveness of our method.
+$CrowdDiff$ outperforms existing state-of-the-art crowd counting methods on
+several public crowd analysis benchmarks with significant improvements.
+
+ comment: Accepted at CVPR'24. The project is available at + https://dylran.github.io/crowddiff.github.io +
+
+
+
+
+ + ♻ ☆ Expressive Forecasting of 3D Whole-body Human Motions AAAI24 + + +
+ Human motion forecasting, with the goal of estimating future human behavior +over a period of time, is a fundamental task in many real-world applications. +However, existing works typically concentrate on predicting the major joints of +the human body without considering the delicate movements of the human hands. +In practical applications, hand gesture plays an important role in human +communication with the real world, and expresses the primary intention of human +beings. In this work, we are the first to formulate a whole-body human pose +forecasting task, which jointly predicts the future body and hand activities. +Correspondingly, we propose a novel Encoding-Alignment-Interaction (EAI) +framework that aims to predict both coarse (body joints) and fine-grained +(gestures) activities collaboratively, enabling expressive and +cross-facilitated forecasting of 3D whole-body human motions. Specifically, our +model involves two key constituents: cross-context alignment (XCA) and +cross-context interaction (XCI). Considering the heterogeneous information +within the whole-body, XCA aims to align the latent features of various human +components, while XCI focuses on effectively capturing the context interaction +among the human components. We conduct extensive experiments on a +newly-introduced large-scale benchmark and achieve state-of-the-art +performance. The code is public for research purposes at +https://github.com/Dingpx/EAI. + +
+
+ comment: Accepted by AAAI24 +
+
+
+
+
+ + ♻ ☆ Cameras as Rays: Pose Estimation via Ray Diffusion ICLR 2024 + + +
+ Estimating camera poses is a fundamental task for 3D reconstruction and +remains challenging given sparsely sampled views (<10). In contrast to existing +approaches that pursue top-down prediction of global parametrizations of camera +extrinsics, we propose a distributed representation of camera pose that treats +a camera as a bundle of rays. This representation allows for a tight coupling +with spatial image features improving pose precision. We observe that this +representation is naturally suited for set-level transformers and develop a +regression-based approach that maps image patches to corresponding rays. To +capture the inherent uncertainties in sparse-view pose inference, we adapt this +approach to learn a denoising diffusion model which allows us to sample +plausible modes while improving performance. Our proposed methods, both +regression- and diffusion-based, demonstrate state-of-the-art performance on +camera pose estimation on CO3D while generalizing to unseen object categories +and in-the-wild captures. + +
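+ A sketch of the kind of distributed pose representation described: a pinhole
+camera expanded into one ray per image patch, written in Plücker coordinates.
+The patch grid and the principal-point-centred pixel range are our own
+conventions, not necessarily those used in the paper.
+
+import numpy as np
+
+def camera_to_rays(K, R, t, patch_grid=16):
+    """Convert a pinhole camera (intrinsics K, world-to-camera R, t) into a
+    bundle of rays, one per patch centre, in 6-D Plücker coordinates (d, m).
+
+    Returns an array of shape (patch_grid**2, 6): unit direction d and moment
+    m = c x d, where c is the camera centre in world coordinates.
+    """
+    c = -R.T @ t                                    # camera centre, world frame
+    # Patch centres on the image plane; assumes the principal point sits at the
+    # image centre, so width ~ 2*K[0,2] and height ~ 2*K[1,2].
+    u = (np.arange(patch_grid) + 0.5) / patch_grid
+    uu, vv = np.meshgrid(u, u)
+    pix = np.stack([uu.ravel() * 2 * K[0, 2], vv.ravel() * 2 * K[1, 2],
+                    np.ones(patch_grid ** 2)], axis=0)       # (3, P) homogeneous
+    dirs_cam = np.linalg.inv(K) @ pix                        # back-project
+    dirs_world = R.T @ dirs_cam                              # rotate to world
+    d = (dirs_world / np.linalg.norm(dirs_world, axis=0)).T  # (P, 3) unit dirs
+    m = np.cross(np.broadcast_to(c, d.shape), d)             # moments (P, 3)
+    return np.concatenate([d, m], axis=1)                    # (P, 6)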
+
+ comment: In ICLR 2024 (oral). v2-3: updated references. Project webpage: + https://jasonyzhang.com/RayDiffusion +
+
+
+
+
+ + ♻ ☆ APISR: Anime Production Inspired Real-World Anime Super-Resolution + + +
+ While real-world anime super-resolution (SR) has gained increasing attention
+in the SR community, existing methods still adopt techniques from the
+photorealistic domain. In this paper, we analyze the anime production workflow
+and rethink how to exploit its characteristics for real-world anime SR. First,
+we argue that video networks and datasets are not necessary for anime SR due
+to the repeated use of hand-drawn frames. Instead, we propose an anime image
+collection pipeline that chooses the least compressed and most informative
+frames from the video sources. Based on this pipeline, we introduce the Anime
+Production-oriented Image (API) dataset. In addition, we identify two
+anime-specific challenges: distorted and faint hand-drawn lines, and unwanted
+color artifacts. We address the first issue by introducing a
+prediction-oriented compression module in the image degradation model and a
+pseudo-ground-truth preparation with enhanced hand-drawn lines. For the
+second, we introduce a balanced twin perceptual loss combining both anime and
+photorealistic high-level features to mitigate unwanted color artifacts and
+increase visual clarity. We evaluate our method through extensive experiments
+on the public benchmark, showing that it outperforms state-of-the-art
+anime-dataset-trained approaches.
+
+
+
+
+ + ♻ ☆ NEMTO: Neural Environment Matting for Novel View and Relighting + Synthesis of Transparent Objects ICCV 2023 + + +
+ We propose NEMTO, the first end-to-end neural rendering pipeline to model 3D +transparent objects with complex geometry and unknown indices of refraction. +Commonly used appearance modeling such as the Disney BSDF model cannot +accurately address this challenging problem due to the complex light paths +bending through refractions and the strong dependency of surface appearance on +illumination. With 2D images of the transparent object as input, our method is +capable of high-quality novel view and relighting synthesis. We leverage +implicit Signed Distance Functions (SDF) to model the object geometry and +propose a refraction-aware ray bending network to model the effects of light +refraction within the object. Our ray bending network is more tolerant to +geometric inaccuracies than traditional physically-based methods for rendering +transparent objects. We provide extensive evaluations on both synthetic and +real-world datasets to demonstrate our high-quality synthesis and the +applicability of our method. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ 3DGS-Avatar: Animatable Avatars via Deformable 3D Gaussian Splatting + + +
+ We introduce an approach that creates animatable human avatars from monocular +videos using 3D Gaussian Splatting (3DGS). Existing methods based on neural +radiance fields (NeRFs) achieve high-quality novel-view/novel-pose image +synthesis but often require days of training, and are extremely slow at +inference time. Recently, the community has explored fast grid structures for +efficient training of clothed avatars. Albeit being extremely fast at training, +these methods can barely achieve an interactive rendering frame rate with +around 15 FPS. In this paper, we use 3D Gaussian Splatting and learn a +non-rigid deformation network to reconstruct animatable clothed human avatars +that can be trained within 30 minutes and rendered at real-time frame rates +(50+ FPS). Given the explicit nature of our representation, we further +introduce as-isometric-as-possible regularizations on both the Gaussian mean +vectors and the covariance matrices, enhancing the generalization of our model +on highly articulated unseen poses. Experimental results show that our method +achieves comparable and even better performance compared to state-of-the-art +approaches on animatable avatar creation from a monocular input, while being +400x and 250x faster in training and inference, respectively. + +
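+ A minimal sketch of an as-isometric-as-possible regularizer on the Gaussian
+means: distances between each Gaussian and its canonical-space neighbours
+should be preserved by the deformation network. The neighbour count and the L1
+penalty are assumptions; the paper also regularizes the covariances, which is
+omitted here.
+
+import torch
+
+def isometry_loss(means_canonical, means_deformed, k=5):
+    """Penalize changes in distances between each Gaussian and its k nearest
+    canonical-space neighbours after deformation (as-isometric-as-possible).
+    means_*: (N, 3) Gaussian centres before / after the deformation network.
+    """
+    d_canon = torch.cdist(means_canonical, means_canonical)      # (N, N)
+    knn = d_canon.topk(k + 1, largest=False).indices[:, 1:]      # drop self
+    nb_canon = means_canonical[knn]                              # (N, k, 3)
+    nb_def = means_deformed[knn]                                 # (N, k, 3)
+    len_canon = (means_canonical[:, None] - nb_canon).norm(dim=-1)
+    len_def = (means_deformed[:, None] - nb_def).norm(dim=-1)
+    return (len_def - len_canon).abs().mean()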
+
+ comment: Project page: https://neuralbodies.github.io/3DGS-Avatar +
+
+
+
+
+ + ♻ ☆ ILPO-NET: Network for the invariant recognition of arbitrary volumetric + patterns in 3D + + +
+ Effective recognition of spatial patterns and learning their hierarchy is
+crucial in modern spatial data analysis. Volumetric data applications seek
+techniques ensuring invariance not only to shifts but also to pattern
+rotations. While traditional methods can readily achieve translational
+invariance, rotational invariance poses multiple challenges and remains an
+active area of research. Here, we present ILPO-Net (Invariant to Local
+Patterns Orientation Network), a novel approach that handles arbitrarily
+shaped patterns with a convolutional operation that is inherently invariant to
+local spatial pattern orientations, using Wigner matrix expansions. Our
+architecture seamlessly integrates the new convolution operator and, when
+benchmarked on diverse volumetric datasets such as MedMNIST and CATH,
+demonstrates superior performance over the baselines with significantly
+reduced parameter counts - up to 1000 times fewer in the case of MedMNIST.
+Beyond these demonstrations, ILPO-Net's rotational invariance paves the way
+for other applications across multiple disciplines. Our code is publicly
+available at https://gricad-gitlab.univ-grenoble-alpes.fr/GruLab/ILPONet.
+
+
+
+
+ + ♻ ☆ Bootstrapping SparseFormers from Vision Foundation Models CVPR 2024 + + +
+ The recently proposed SparseFormer architecture provides an alternative +approach to visual understanding by utilizing a significantly lower number of +visual tokens via adjusting RoIs, greatly reducing computational costs while +still achieving promising performance. However, training SparseFormers from +scratch is still expensive, and scaling up the number of parameters can be +challenging. In this paper, we propose to bootstrap SparseFormers from +ViT-based vision foundation models in a simple and efficient way. Since the +majority of SparseFormer blocks are the standard transformer ones, we can +inherit weights from large-scale pre-trained vision transformers and freeze +them as much as possible. Therefore, we only need to train the +SparseFormer-specific lightweight focusing transformer to adjust token RoIs and +fine-tune a few early pre-trained blocks to align the final token +representation. In such a way, we can bootstrap SparseFormer architectures from +various large-scale pre-trained models (e.g., IN-21K pre-trained AugRegs or +CLIPs) using a rather smaller amount of training samples (e.g., IN-1K) and +without labels or captions within just a few hours. As a result, the +bootstrapped unimodal SparseFormer (from AugReg-ViT-L/16-384) can reach 84.9% +accuracy on IN-1K with only 49 tokens, and the multimodal SparseFormer from +CLIPs also demonstrates notable zero-shot performance with highly reduced +computational cost without seeing any caption during the bootstrapping +procedure. In addition, CLIP-bootstrapped SparseFormers, which align the output +space with language without seeing a word, can serve as efficient vision +encoders in multimodal large language models. Code and models are available at +https://github.com/showlab/sparseformer + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Unified Spatio-Temporal Tri-Perspective View Representation for 3D + Semantic Occupancy Prediction + + +
+ Holistic understanding and reasoning in 3D scenes play a vital role in the
+success of autonomous driving systems. 3D semantic occupancy prediction, which
+has evolved into a pretraining task for autonomous driving and robotic
+downstream tasks, captures finer 3D details than methods like 3D detection.
+Existing approaches predominantly focus on spatial cues such as
+tri-perspective view (TPV) embeddings, often overlooking temporal cues. This
+study introduces S2TPVFormer, a spatiotemporal transformer architecture for
+temporally coherent 3D semantic occupancy prediction. We enrich the prior TPV
+representation with temporal cues using a novel temporal cross-view hybrid
+attention mechanism (TCVHA) and generate spatiotemporal TPV embeddings (i.e.,
+S2TPV embeddings). Experimental evaluations on the nuScenes dataset
+demonstrate a substantial 4.1% improvement in mean Intersection over Union
+(mIoU) for 3D semantic occupancy compared to TPVFormer, confirming the
+effectiveness of the proposed S2TPVFormer in enhancing 3D scene perception.
+
+
+
+
+ + ♻ ☆ Learning Subject-Aware Cropping by Outpainting Professional Photos AAAI 24 + + +
+ How to frame (or crop) a photo often depends on the image subject and its +context; e.g., a human portrait. Recent works have defined the subject-aware +image cropping task as a nuanced and practical version of image cropping. We +propose a weakly-supervised approach (GenCrop) to learn what makes a +high-quality, subject-aware crop from professional stock images. Unlike +supervised prior work, GenCrop requires no new manual annotations beyond the +existing stock image collection. The key challenge in learning from this data, +however, is that the images are already cropped and we do not know what regions +were removed. Our insight is to combine a library of stock images with a +modern, pre-trained text-to-image diffusion model. The stock image collection +provides diversity and its images serve as pseudo-labels for a good crop, while +the text-image diffusion model is used to out-paint (i.e., outward inpainting) +realistic uncropped images. Using this procedure, we are able to automatically +generate a large dataset of cropped-uncropped training pairs to train a +cropping model. Despite being weakly-supervised, GenCrop is competitive with +state-of-the-art supervised methods and significantly better than comparable +weakly-supervised baselines on quantitative and qualitative evaluation metrics. + +
+
+ comment: AAAI 24. Extended version with supplemental materials +
+
+
+
+
+ + ♻ ☆ Non-negative Subspace Feature Representation for Few-shot Learning in + Medical Imaging + + +
+ Unlike typical visual scene recognition domains, in which massive datasets +are accessible to deep neural networks, medical image interpretations are often +obstructed by the paucity of data. In this paper, we investigate the +effectiveness of data-based few-shot learning in medical imaging by exploring +different data attribute representations in a low-dimensional space. We +introduce different types of non-negative matrix factorization (NMF) in +few-shot learning, addressing the data scarcity issue in medical image +classification. Extensive empirical studies are conducted in terms of +validating the effectiveness of NMF, especially its supervised variants (e.g., +discriminative NMF, and supervised and constrained NMF with sparseness), and +the comparison with principal component analysis (PCA), i.e., the collaborative +representation-based dimensionality reduction technique derived from +eigenvectors. With 14 different datasets covering 11 distinct illness +categories, thorough experimental results and comparison with related +techniques demonstrate that NMF is a competitive alternative to PCA for +few-shot learning in medical imaging, and the supervised NMF algorithms are +more discriminative in the subspace with greater effectiveness. Furthermore, we +show that the part-based representation of NMF, especially its supervised +variants, is dramatically impactful in detecting lesion areas in medical +imaging with limited samples. + +
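+ A minimal sketch of an NMF-based few-shot pipeline of the kind described:
+project non-negative features into a low-dimensional NMF subspace, then
+classify queries by nearest class prototype. Plain scikit-learn NMF stands in
+for the supervised and discriminative variants studied in the paper.
+
+import numpy as np
+from sklearn.decomposition import NMF
+
+def nmf_few_shot(support_x, support_y, query_x, n_components=32):
+    """support_x: (N, D) non-negative features (e.g. post-ReLU activations),
+    support_y: (N,) labels, query_x: (M, D). Returns predicted query labels.
+    n_components should stay well below min(N, D).
+    """
+    nmf = NMF(n_components=n_components, init="nndsvda", max_iter=500)
+    s_low = nmf.fit_transform(support_x)          # (N, n_components)
+    q_low = nmf.transform(query_x)                # (M, n_components)
+    classes = np.unique(support_y)
+    prototypes = np.stack([s_low[support_y == c].mean(axis=0) for c in classes])
+    dists = np.linalg.norm(q_low[:, None] - prototypes[None], axis=-1)
+    return classes[dists.argmin(axis=1)]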
+
+
+
+
+ + ♻ ☆ Data Upcycling Knowledge Distillation for Image Super-Resolution + + +
+ Knowledge distillation (KD) compresses deep neural networks by transferring
+task-related knowledge from cumbersome pre-trained teacher models to compact
+student models. However, current KD methods for super-resolution (SR) networks
+overlook a key property of the SR task: the outputs of the teacher model are
+only noisy approximations of the ground-truth (GT) high-quality images, which
+obscures the teacher model's knowledge and limits the effect of KD. To utilize
+the teacher model beyond the GT upper bound, we present Data Upcycling
+Knowledge Distillation (DUKD), which transfers the teacher model's knowledge
+to the student model through upcycled in-domain data derived from the training
+data. In addition, we impose label-consistency regularization on KD for SR via
+paired invertible augmentations to improve the student model's performance and
+robustness. Comprehensive experiments demonstrate that the DUKD method
+significantly outperforms prior art on several SR tasks.
+
+
+
+
+ + ♻ ☆ MorpheuS: Neural Dynamic 360° Surface Reconstruction from Monocular + RGB-D Video CVPR2024 + + +
+ Neural rendering has demonstrated remarkable success in dynamic scene +reconstruction. Thanks to the expressiveness of neural representations, prior +works can accurately capture the motion and achieve high-fidelity +reconstruction of the target object. Despite this, real-world video scenarios +often feature large unobserved regions where neural representations struggle to +achieve realistic completion. To tackle this challenge, we introduce MorpheuS, +a framework for dynamic 360{\deg} surface reconstruction from a casually +captured RGB-D video. Our approach models the target scene as a canonical field +that encodes its geometry and appearance, in conjunction with a deformation +field that warps points from the current frame to the canonical space. We +leverage a view-dependent diffusion prior and distill knowledge from it to +achieve realistic completion of unobserved regions. Experimental results on +various real-world and synthetic datasets show that our method can achieve +high-fidelity 360{\deg} surface reconstruction of a deformable object from a +monocular RGB-D video. + +
+
+ comment: CVPR2024. Project page: + https://hengyiwang.github.io/projects/morpheus +
+
+
+
+
+ + ♻ ☆ Calibrating Bayesian UNet++ for Sub-Seasonal Forecasting ICLR 2024 + + +
+ Seasonal forecasting is a crucial task when it comes to detecting the
+extremes of heat and cold that occur due to climate change. Confidence in the
+predictions should be reliable, since even a small yearly increase in
+temperatures has a big impact on the world. Calibration of neural networks
+provides a way to ensure our confidence in the predictions. However,
+calibrating regression models is an under-researched topic, especially for
+forecasting models. We calibrate a UNet++-based architecture, which was shown
+to outperform physics-based models in predicting temperature anomalies. We
+show that, with a slight trade-off between prediction error and calibration
+error, it is possible to obtain more reliable and sharper forecasts. We
+believe that calibration should be an important part of safety-critical
+machine learning applications such as weather forecasting.
+
+ comment: Accepted as a workshop paper at "ICLR 2024 Tackling Climate Change + with Machine Learning" +
+
+
+
+
+ + ♻ ☆ Roadside Monocular 3D Detection via 2D Detection Prompting + + +
+ The problem of roadside monocular 3D detection requires detecting objects of +interested classes in a 2D RGB frame and predicting their 3D information such +as locations in bird's-eye-view (BEV). It has broad applications in traffic +control, vehicle-vehicle communication, and vehicle-infrastructure cooperative +perception. To approach this problem, we present a novel and simple method by +prompting the 3D detector using 2D detections. Our method builds on a key +insight that, compared with 3D detectors, a 2D detector is much easier to train +and performs significantly better w.r.t detections on the 2D image plane. That +said, one can exploit 2D detections of a well-trained 2D detector as prompts to +a 3D detector, being trained in a way of inflating such 2D detections to 3D +towards 3D detection. To construct better prompts using the 2D detector, we +explore three techniques: (a) concatenating both 2D and 3D detectors' features, +(b) attentively fusing 2D and 3D detectors' features, and (c) encoding +predicted 2D boxes x, y, width, height, label and attentively fusing such with +the 3D detector's features. Surprisingly, the third performs the best. +Moreover, we present a yaw tuning tactic and a class-grouping strategy that +merges classes based on their functionality; these techniques improve 3D +detection performance further. Comprehensive ablation studies and extensive +experiments demonstrate that our method resoundingly outperforms prior works, +achieving the state-of-the-art on two large-scale roadside 3D detection +benchmarks. + +
+
+
+
+
+ + ♻ ☆ Scene-aware Human Motion Forecasting via Mutual Distance Prediction + + +
+ In this paper, we tackle the problem of scene-aware 3D human motion +forecasting. A key challenge of this task is to predict future human motions +that are consistent with the scene by modeling the human-scene interactions. +While recent works have demonstrated that explicit constraints on human-scene +interactions can prevent the occurrence of ghost motion, they only provide +constraints on partial human motion e.g., the global motion of the human or a +few joints contacting the scene, leaving the rest of the motion unconstrained. +To address this limitation, we propose to model the human-scene interaction +with the mutual distance between the human body and the scene. Such mutual +distances constrain both the local and global human motion, resulting in a +whole-body motion constrained prediction. In particular, mutual distance +constraints consist of two components, the signed distance of each vertex on +the human mesh to the scene surface and the distance of basis scene points to +the human mesh. We further introduce a global scene representation learned from +a signed distance function (SDF) volume to ensure coherence between the global +scene representation and the explicit constraint from the mutual distance. We +develop a pipeline with two sequential steps: predicting the future mutual +distances first, followed by forecasting future human motion. During training, +we explicitly encourage consistency between predicted poses and mutual +distances. Extensive evaluations on the existing synthetic and real datasets +demonstrate that our approach consistently outperforms the state-of-the-art +methods. + +
+
+
+
+
+ + ♻ ☆ ShapeFusion: A 3D diffusion model for localized shape editing + + +
+ In the realm of 3D computer vision, parametric models have emerged as a +ground-breaking methodology for the creation of realistic and expressive 3D +avatars. Traditionally, they rely on Principal Component Analysis (PCA), given +its ability to decompose data to an orthonormal space that maximally captures +shape variations. However, due to the orthogonality constraints and the global +nature of PCA's decomposition, these models struggle to perform localized and +disentangled editing of 3D shapes, which severely affects their use in +applications requiring fine control such as face sculpting. In this paper, we +leverage diffusion models to enable diverse and fully localized edits on 3D +meshes, while completely preserving the un-edited regions. We propose an +effective diffusion masking training strategy that, by design, facilitates +localized manipulation of any shape region, without being limited to predefined +regions or to sparse sets of predefined control vertices. Following our +framework, a user can explicitly set their manipulation region of choice and +define an arbitrary set of vertices as handles to edit a 3D mesh. Compared to +the current state-of-the-art our method leads to more interpretable shape +manipulations than methods relying on latent code state, greater localization +and generation diversity while offering faster inference than optimization +based approaches. Project page: https://rolpotamias.github.io/Shapefusion/ + +
+
+ comment: Project Page: https://rolpotamias.github.io/Shapefusion/ +
+
+
+
+
+ + ♻ ☆ Vestibular schwannoma growth prediction from longitudinal MRI by time + conditioned neural fields + + +
+ Vestibular schwannomas (VS) are benign tumors that are generally managed by
+active surveillance with MRI examination. To further assist clinical
+decision-making and avoid overtreatment, an accurate prediction of tumor
+growth based on longitudinal imaging is highly desirable. In this paper, we
+introduce DeepGrowth, a deep learning method that incorporates neural fields
+and recurrent neural networks for prospective tumor growth prediction. In the
+proposed method, each tumor is represented as a signed distance function (SDF)
+conditioned on a low-dimensional latent code. Unlike previous studies that
+perform tumor shape prediction directly in the image space, we predict the
+latent codes instead and then reconstruct future shapes from them. To deal
+with irregular time intervals, we introduce a time-conditioned recurrent
+module based on a ConvLSTM and a novel temporal encoding strategy, which
+enables the proposed model to output varying tumor shapes over time. The
+experiments on an in-house longitudinal VS dataset showed that the proposed
+model significantly improved the performance ($\ge 1.6\%$ Dice score and
+$\ge 0.20$ mm 95\% Hausdorff distance), in particular for the top 20\% of
+tumors that grow or shrink the most ($\ge 4.6\%$ Dice score and $\ge 0.73$ mm
+95\% Hausdorff distance). Our code is available at
+https://github.com/cyjdswx/DeepGrowth
+
+
+
+
+ + ♻ ☆ Smooth Deep Saliency + + +
+ In this work, we investigate methods to reduce the noise in deep saliency +maps coming from convolutional downsampling, with the purpose of explaining how +a deep learning model detects tumors in scanned histological tissue samples. +Those methods make the investigated models more interpretable for +gradient-based saliency maps, computed in hidden layers. We test our approach +on different models trained for image classification on ImageNet1K, and models +trained for tumor detection on Camelyon16 and in-house real-world digital +pathology scans of stained tissue samples. Our results show that the +checkerboard noise in the gradient gets reduced, resulting in smoother and +therefore easier to interpret saliency maps. + +
+
+
+
+
+ + ♻ ☆ Self-Aligning Depth-regularized Radiance Fields for Asynchronous RGB-D + Sequences + + +
+ It has been shown that learning radiance fields with depth rendering and +depth supervision can effectively promote the quality and convergence of view +synthesis. However, this paradigm requires input RGB-D sequences to be +synchronized, hindering its usage in the UAV city modeling scenario. As there +exists asynchrony between RGB images and depth images due to high-speed flight, +we propose a novel time-pose function, which is an implicit network that maps +timestamps to $\rm SE(3)$ elements. To simplify the training process, we also +design a joint optimization scheme to jointly learn the large-scale +depth-regularized radiance fields and the time-pose function. Our algorithm +consists of three steps: (1) time-pose function fitting, (2) radiance field +bootstrapping, (3) joint pose error compensation and radiance field refinement. +In addition, we propose a large synthetic dataset with diverse controlled +mismatches and ground truth to evaluate this new problem setting +systematically. Through extensive experiments, we demonstrate that our method +outperforms baselines without regularization. We also show qualitatively +improved results on a real-world asynchronous RGB-D sequence captured by drone. +Codes, data, and models will be made publicly available. + +
+
+
+
+
+ + ♻ ☆ Beyond Image Super-Resolution for Image Recognition with Task-Driven + Perceptual Loss CVPR 2024 + + +
+ In real-world scenarios, image recognition tasks, such as semantic +segmentation and object detection, often pose greater challenges due to the +lack of information available within low-resolution (LR) content. Image +super-resolution (SR) is one of the promising solutions for addressing the +challenges. However, due to the ill-posed property of SR, it is challenging for +typical SR methods to restore task-relevant high-frequency contents, which may +dilute the advantage of utilizing the SR method. Therefore, in this paper, we +propose Super-Resolution for Image Recognition (SR4IR) that effectively guides +the generation of SR images beneficial to achieving satisfactory image +recognition performance when processing LR images. The critical component of +our SR4IR is the task-driven perceptual (TDP) loss that enables the SR network +to acquire task-specific knowledge from a network tailored for a specific task. +Moreover, we propose a cross-quality patch mix and an alternate training +framework that significantly enhances the efficacy of the TDP loss by +addressing potential problems when employing the TDP loss. Through extensive +experiments, we demonstrate that our SR4IR achieves outstanding task +performance by generating SR images useful for a specific image recognition +task, including semantic segmentation, object detection, and image +classification. The implementation code is available at +https://github.com/JaehaKim97/SR4IR. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ CoDA: Instructive Chain-of-Domain Adaptation with Severity-Aware Visual + Prompt Tuning + + +
+ Unsupervised Domain Adaptation (UDA) aims to adapt models from labeled source +domains to unlabeled target domains. When adapting to adverse scenes, existing +UDA methods fail to perform well due to the lack of instructions, leading their +models to overlook discrepancies within all adverse scenes. To tackle this, we +propose CoDA which instructs models to distinguish, focus, and learn from these +discrepancies at scene and image levels. Specifically, CoDA consists of a +Chain-of-Domain (CoD) strategy and a Severity-Aware Visual Prompt Tuning +(SAVPT) mechanism. CoD focuses on scene-level instructions to divide all +adverse scenes into easy and hard scenes, guiding models to adapt from source +to easy domains with easy scene images, and then to hard domains with hard +scene images, thereby laying a solid foundation for whole adaptations. Building +upon this foundation, we employ SAVPT to dive into more detailed image-level +instructions to boost performance. SAVPT features a novel metric Severity that +divides all adverse scene images into low-severity and high-severity images. +Then Severity directs visual prompts and adapters, instructing models to +concentrate on unified severity features instead of scene-specific features, +without adding complexity to the model architecture. CoDA achieves SOTA +performances on widely-used benchmarks under all adverse scenes. Notably, CoDA +outperforms the existing ones by 4.6%, and 10.3% mIoU on the Foggy Driving, and +Foggy Zurich benchmarks, respectively. Our code is available at +https://github.com/Cuzyoung/CoDA + +
+
+
+
+
+ + ♻ ☆ GEARS: Local Geometry-aware Hand-object Interaction Synthesis + + +
+ Generating realistic hand motion sequences in interaction with objects has +gained increasing attention with the growing interest in digital humans. Prior +work has illustrated the effectiveness of employing occupancy-based or +distance-based virtual sensors to extract hand-object interaction features. +Nonetheless, these methods show limited generalizability across object +categories, shapes and sizes. We hypothesize that this is due to two reasons: +1) the limited expressiveness of employed virtual sensors, and 2) scarcity of +available training data. To tackle this challenge, we introduce a novel +joint-centered sensor designed to reason about local object geometry near +potential interaction regions. The sensor queries for object surface points in +the neighbourhood of each hand joint. As an important step towards mitigating +the learning complexity, we transform the points from global frame to hand +template frame and use a shared module to process sensor features of each +individual joint. This is followed by a spatio-temporal transformer network +aimed at capturing correlation among the joints in different dimensions. +Moreover, we devise simple heuristic rules to augment the limited training +sequences with vast static hand grasping samples. This leads to a broader +spectrum of grasping types observed during training, in turn enhancing our +model's generalization capability. We evaluate on two public datasets, GRAB and +InterCap, where our method shows superiority over baselines both quantitatively +and perceptually. + +
+
+
+
+
+ + ♻ ☆ Bias Behind the Wheel: Fairness Analysis of Autonomous Driving Systems + + +
+ This paper analyzes fairness in automated pedestrian detection, a crucial but
+under-explored issue in autonomous driving systems. We evaluate eight
+state-of-the-art deep learning-based pedestrian detectors across demographic
+groups on large-scale real-world datasets. To enable thorough fairness
+testing, we provide extensive annotations for the datasets, resulting in 8,311
+images with 16,070 gender labels, 20,115 age labels, and 3,513 skin tone
+labels. Our findings reveal significant fairness issues, particularly related
+to age: the undetected proportion for children is 20.14% higher than for
+adults. Furthermore, we explore how various driving scenarios affect the
+fairness of pedestrian detectors. We find that pedestrian detectors
+demonstrate significant gender biases at night time, potentially exacerbating
+the prevalent societal issue of female safety concerns during nighttime
+outings. Moreover, we observe that pedestrian detectors can demonstrate both
+enhanced fairness and superior performance under specific driving conditions,
+which challenges the fairness-performance trade-off theory widely acknowledged
+in the fairness literature. We publicly release the code, data, and results to
+support future research on fairness in autonomous driving.
+
+
+
+
+ + ♻ ☆ Weighted structure tensor total variation for image denoising + + +
+ For image denoising problems, the structure tensor total variation +(STV)-based models show good performances when compared with other competing +regularization approaches. However, the STV regularizer does not couple the +local information of the image and may not maintain the image details. +Therefore, we employ the anisotropic weighted matrix introduced in the +anisotropic total variation (ATV) model to improve the STV model. By applying +the weighted matrix to the discrete gradient of the patch-based Jacobian +operator in STV, our proposed weighted STV (WSTV) model can effectively capture +local information from images and maintain their details during the denoising +process. The optimization problem in the model is solved by a fast first-order +gradient projection algorithm with a complexity result of $O(1 / i^2)$. For +images with different Gaussian noise levels, the experimental results +demonstrate that the WSTV model can effectively improve the quality of restored +images compared to other TV and STV-based models. + +
+
+
+
+
+ + ♻ ☆ A Novel Garment Transfer Method Supervised by Distilled Knowledge of + Virtual Try-on Model + + +
+ This paper proposes a novel garment transfer method supervised with knowledge
+distillation from virtual try-on. Our method first infers the transfer parsing
+to provide a shape prior for downstream tasks. We employ a multi-phase
+teaching strategy to supervise the training of the transfer parsing reasoning
+model, learning the response and feature knowledge from the try-on parsing
+reasoning model. To correct teaching errors, the model transfers the garment
+back to its owner to absorb the hard knowledge in a self-study phase. Guided
+by the transfer parsing, we adjust the position of the transferred garment via
+an STN to prevent distortion. Afterward, we estimate a progressive flow to
+precisely warp the garment with shape and content correspondences. To ensure
+warping rationality, we supervise the training of the garment warping model
+using target shape and warping knowledge from virtual try-on. To better
+preserve body features in the transfer result, we propose a well-designed
+training strategy for the arm regrowth task to infer newly exposed skin.
+Experiments demonstrate that our method achieves state-of-the-art performance
+compared with other virtual try-on and garment transfer methods, especially in
+preserving garment texture and body features.
+
+
+
+
+ + ♻ ☆ ModaVerse: Efficiently Transforming Modalities with LLMs CVPR2024 + + +
+ Humans possess the capability to comprehend diverse modalities and seamlessly +transfer information between them. In this work, we introduce ModaVerse, a +Multi-modal Large Language Model (MLLM) capable of comprehending and +transforming content across various modalities including images, videos, and +audio. Predominant MLLM frameworks have largely relied on the alignment of +latent spaces of textual and non-textual features. This alignment process, +which synchronizes a language model trained on textual data with encoders and +decoders trained on multi-modal data, often necessitates extensive training of +several projection layers in multiple stages. Inspired by LLM-as-agent +methodologies, we propose a novel Input/Output (I/O) alignment mechanism that +operates directly at the level of natural language. It aligns the LLM's output +with the input of generative models, avoiding the complexities associated with +latent feature alignments, and simplifying the multiple training stages of +existing MLLMs into a single, efficient process. This conceptual advancement +leads to significant reductions in both data and computational costs. By +conducting experiments on several benchmarks, we demonstrate that our approach +attains comparable performance with the state of the art while achieving +considerable efficiencies in data usage and training duration. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ DeepIPC: Deeply Integrated Perception and Control for an Autonomous + Vehicle in Real Environments + + +
+ In this work, we introduce DeepIPC, a novel end-to-end model tailored for +autonomous driving, which seamlessly integrates perception and control tasks. +Unlike traditional models that handle these tasks separately, DeepIPC +innovatively combines a perception module, which processes RGBD images for +semantic segmentation and generates bird's eye view (BEV) mappings, with a +controller module that utilizes these insights along with GNSS and angular +speed measurements to accurately predict navigational waypoints. This +integration allows DeepIPC to efficiently translate complex environmental data +into actionable driving commands. Our comprehensive evaluation demonstrates +DeepIPC's superior performance in terms of drivability and multi-task +efficiency across diverse real-world scenarios, setting a new benchmark for +end-to-end autonomous driving systems with a leaner model architecture. The +experimental results underscore DeepIPC's potential to significantly enhance +autonomous vehicular navigation, promising a step forward in the development of +autonomous driving technologies. For further insights and replication, we will +make our code and datasets available at https://github.com/oskarnatan/DeepIPC. + +
+
+ comment: Accepted for Publication in IEEE Access +
+
+
+
+
+ + ♻ ☆ CAPE: CAM as a Probabilistic Ensemble for Enhanced DNN Interpretation + + +
+ Deep Neural Networks (DNNs) are widely used for visual classification tasks, +but their complex computation process and black-box nature hinder decision +transparency and interpretability. Class activation maps (CAMs) and recent +variants provide ways to visually explain the DNN decision-making process by +displaying 'attention' heatmaps of the DNNs. Nevertheless, the CAM explanation +only offers relative attention information, that is, on an attention heatmap, +we can interpret which image region is more or less important than the others. +However, these regions cannot be meaningfully compared across classes, and the +contribution of each region to the model's class prediction is not revealed. To +address these challenges, and to ultimately enable better DNN interpretation, in +this paper we propose CAPE, a novel reformulation of CAM that provides a +unified and probabilistically meaningful assessment of the contributions of +image regions. We quantitatively and qualitatively compare CAPE with +state-of-the-art CAM methods on CUB and ImageNet benchmark datasets to +demonstrate enhanced interpretability. We also test on a cytology imaging +dataset depicting a challenging Chronic Myelomonocytic Leukemia (CMML) +diagnosis problem. Code is available at: https://github.com/AIML-MED/CAPE. + +
+
+
+
+
+ + ♻ ☆ Image Outlier Detection Without Training using RANSAC + + +
+ Image outlier detection (OD) is an essential tool to ensure the quality of +images used in computer vision tasks. Existing algorithms often involve +training a model to represent the inlier distribution, and outliers are +determined by some deviation measure. Although existing methods have proved +effective when trained on strictly inlier samples, their performance remains +questionable when undesired outliers are included during training. As a result +of this limitation, it is necessary to carefully examine the data when +developing OD models for new domains. In this work, we present a novel image OD +algorithm called RANSAC-NN that eliminates the need for data examination and +model training altogether. Unlike existing approaches, RANSAC-NN can be +directly applied to datasets containing outliers by sampling and comparing +subsets of the data. Our algorithm maintains favorable performance compared to +existing methods on a range of benchmarks. Furthermore, we show that RANSAC-NN +can enhance the robustness of existing methods by incorporating our algorithm +as part of the data preparation process. + +
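A simplified, training-free scoring loop in the spirit of the "sample and compare subsets" idea from the abstract above. The feature extractor, subset size, and aggregation rule are placeholder assumptions; the actual RANSAC-NN procedure differs in its details.

```python
import numpy as np

def ransac_nn_scores(features, n_trials=20, subset_frac=0.3, seed=0):
    """Toy RANSAC-style outlier scoring without any training.

    Repeatedly sample a random subset of the dataset, then score every
    sample by its nearest-neighbour distance to that subset. Averaging over
    trials gives a robust outlier score: samples far from most random
    subsets are likely outliers. Features are assumed to be precomputed
    (e.g. by a pretrained backbone).
    """
    rng = np.random.default_rng(seed)
    n = len(features)
    k = max(2, int(subset_frac * n))
    scores = np.zeros(n)
    for _ in range(n_trials):
        idx = rng.choice(n, size=k, replace=False)
        ref = features[idx]                                         # (k, d)
        d = np.linalg.norm(features[:, None] - ref[None], axis=-1)  # (n, k)
        d[np.arange(n)[:, None] == idx[None]] = np.inf              # ignore self-matches
        scores += d.min(axis=1)
    return scores / n_trials

if __name__ == "__main__":
    rng = np.random.default_rng(1)
    inliers = rng.normal(0, 1, size=(95, 8))
    outliers = rng.normal(6, 1, size=(5, 8))
    feats = np.vstack([inliers, outliers])
    s = ransac_nn_scores(feats)
    print("top-5 suspected outliers:", np.argsort(-s)[:5])  # expect indices >= 95
```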
+
+
+
+
+ + ♻ ☆ DeepIPCv2: LiDAR-powered Robust Environmental Perception and + Navigational Control for Autonomous Vehicle + + +
+ We present DeepIPCv2, an autonomous driving model that perceives the +environment using a LiDAR sensor for more robust drivability, especially when +driving under poor illumination conditions where everything is not clearly +visible. DeepIPCv2 takes a set of LiDAR point clouds as the main perception +input. Since point clouds are not affected by illumination changes, they can +provide a clear observation of the surroundings no matter what the condition +is. This results in a better scene understanding and stable features provided +by the perception module to support the controller module in estimating +navigational control properly. To evaluate its performance, we conduct several +tests by deploying the model to predict a set of driving records and perform +real automated driving under three different conditions. We also conduct +ablation and comparative studies with some recent models to justify its +performance. Based on the experimental results, DeepIPCv2 shows a robust +performance by achieving the best drivability in all driving scenarios. +Furthermore, to support future research, we will upload the codes and data to +https://github.com/oskarnatan/DeepIPCv2. + +
+
+
+
+
+ + ♻ ☆ HumanNeRF-SE: A Simple yet Effective Approach to Animate HumanNeRF with + Diverse Poses + + +
+ We present HumanNeRF-SE, a simple yet effective method that synthesizes +diverse novel pose images with simple input. Previous HumanNeRF works require a +large number of optimizable parameters to fit the human images. Instead, we +reload these approaches by combining explicit and implicit human +representations to design both generalized rigid deformation and specific +non-rigid deformation. Our key insight is that the explicit shape can reduce the +sampling points used to fit the implicit representation, and that frozen blending +weights from SMPL, used to construct a generalized rigid deformation, can effectively +avoid overfitting and improve pose generalization performance. Our architecture +involving both explicit and implicit representation is simple yet effective. +Experiments demonstrate that our model can synthesize images under arbitrary poses +with few-shot input and increase the speed of synthesizing images by 15 times +through a reduction in computational complexity without using any existing +acceleration modules. Compared to the state-of-the-art HumanNeRF studies, +HumanNeRF-SE achieves better performance with fewer learnable parameters and +less training time. + +
+
+ comment: 16 pages, 17 figures, 10 tables +
+
+
+
+
+ + ♻ ☆ TE-TAD: Towards Full End-to-End Temporal Action Detection via + Time-Aligned Coordinate Expression + + +
+ In this paper, we show that the normalized coordinate expression is a +key factor behind the reliance on hand-crafted components in query-based detectors for +temporal action detection (TAD). Despite significant advancements towards an +end-to-end framework in object detection, query-based detectors have been +limited in achieving full end-to-end modeling in TAD. To address this issue, we +propose TE-TAD, a full end-to-end temporal action detection transformer +that integrates time-aligned coordinate expression. We reformulate coordinate +expression utilizing actual timeline values, ensuring length-invariant +representations from the extremely diverse video duration environment. +Furthermore, our proposed adaptive query selection dynamically adjusts the +number of queries based on video length, providing a suitable solution for +varying video durations compared to a fixed query set. Our approach not only +simplifies the TAD process by eliminating the need for hand-crafted components +but also significantly improves the performance of query-based detectors. Our +TE-TAD outperforms the previous query-based detectors and achieves competitive +performance compared to state-of-the-art methods on popular benchmark datasets. +Code is available at: https://github.com/Dotori-HJ/TE-TAD + +
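A small illustrative sketch of the two ideas named in the abstract above: expressing predicted segments on the actual timeline (in seconds) rather than as 0-1 normalized coordinates, and choosing the number of queries adaptively from the video length. Function names, shapes, and the queries-per-second heuristic are assumptions for illustration, not the paper's exact formulation.

```python
import torch

def to_timeline(center_norm, length_norm, video_duration_s):
    """Convert normalized (center, length) action queries to absolute seconds.

    Expressing segments on the real timeline keeps their meaning invariant
    to the (extremely variable) video duration.
    """
    center_s = center_norm * video_duration_s
    length_s = length_norm * video_duration_s
    start = (center_s - 0.5 * length_s).clamp(min=0.0)
    end = (center_s + 0.5 * length_s).clamp(max=video_duration_s)
    return torch.stack([start, end], dim=-1)

def adaptive_num_queries(video_duration_s, queries_per_second=0.5, min_queries=20):
    """Illustrative adaptive query selection: longer videos get more queries."""
    return max(min_queries, int(video_duration_s * queries_per_second))

if __name__ == "__main__":
    centers = torch.tensor([0.10, 0.55, 0.90])
    lengths = torch.tensor([0.05, 0.20, 0.30])
    print(to_timeline(centers, lengths, video_duration_s=600.0))  # 10-minute video
    print(adaptive_num_queries(600.0), adaptive_num_queries(30.0))
```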
+
+
+
+
+ + ♻ ☆ Improving the Reconstruction of Disentangled Representation Learners via + Multi-Stage Modeling + + +
+ Current autoencoder-based disentangled representation learning methods +achieve disentanglement by penalizing the (aggregate) posterior to encourage +statistical independence of the latent factors. This approach introduces a +trade-off between disentangled representation learning and reconstruction +quality since the model does not have enough capacity to learn correlated +latent variables that capture detail information present in most image data. To +overcome this trade-off, we present a novel multi-stage modeling approach where +the disentangled factors are first learned using a penalty-based disentangled +representation learning method; then, the low-quality reconstruction is +improved with another deep generative model that is trained to model the +missing correlated latent variables, adding detail information while +maintaining conditioning on the previously learned disentangled factors. Taken +together, our multi-stage modeling approach results in a single, coherent +probabilistic model that is theoretically justified by the principle of +d-separation and can be realized with a variety of model classes including +likelihood-based models such as variational autoencoders, implicit models such +as generative adversarial networks, and tractable models like normalizing flows +or mixtures of Gaussians. We demonstrate that our multi-stage model has higher +reconstruction quality than current state-of-the-art methods with equivalent +disentanglement performance across multiple standard benchmarks. In addition, +we apply the multi-stage model to generate synthetic tabular datasets, +showcasing an enhanced performance over benchmark models across a variety of +metrics. The interpretability analysis further indicates that the multi-stage +model can effectively uncover distinct and meaningful features of variations +from which the original distribution can be recovered. + +
+
+
+
+
+ + ♻ ☆ WM-MoE: Weather-aware Multi-scale Mixture-of-Experts for Blind Adverse + Weather Removal + + +
+ Adverse weather removal tasks like deraining, desnowing, and dehazing are +usually treated as separate tasks. However, in practical autonomous driving +scenarios, the type, intensity, and mixing degree of weather are unknown, so +handling each task separately cannot deal with the complex practical scenarios. +In this paper, we study the blind adverse weather removal problem. +Mixture-of-Experts (MoE) is a popular model that adopts a learnable gate to +route the input to different expert networks. The principle of MoE involves +using adaptive networks to process different types of unknown inputs. +Therefore, MoE has great potential for blind adverse weather removal. However, +the original MoE module is inadequate for coupled multiple weather types and +fails to utilize multi-scale features for better performance. To this end, we +propose a method called Weather-aware Multi-scale MoE (WM-MoE) based on +Transformer for blind weather removal. WM-MoE includes two key designs: +WEather-Aware Router (WEAR) and Multi-Scale Experts (MSE). WEAR assigns experts +to each image token based on decoupled content and weather features, which +enhances the model's capability to process multiple adverse weather types. To obtain +discriminative weather features from images, we propose Weather Guidance +Fine-grained Contrastive Learning (WGF-CL), which utilizes weather cluster +information to guide the assignment of positive and negative samples for each +image token. Since processing different weather types requires different +receptive fields, MSE leverages multi-scale features to enhance the spatial +relationship modeling capability, facilitating the high-quality restoration of +diverse weather types and intensities. Our method achieves state-of-the-art +performance in blind adverse weather removal on two public datasets and our +dataset. We also demonstrate the advantage of our method on downstream +segmentation tasks. + +
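A generic token-level Mixture-of-Experts layer with top-1 routing, shown only to illustrate the basic routing principle the abstract above builds on. The paper's WEAR router additionally conditions on decoupled weather features and its MSE experts are multi-scale; none of that is reproduced here, and all names and sizes are assumptions.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleTokenMoE(nn.Module):
    """Minimal token-level Mixture-of-Experts layer (top-1 routing).

    A learnable gate scores each token and routes it to a single expert MLP;
    the chosen expert's output is weighted by the gate probability.
    """
    def __init__(self, dim=64, num_experts=4, hidden=128):
        super().__init__()
        self.gate = nn.Linear(dim, num_experts)
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))
            for _ in range(num_experts)
        )

    def forward(self, tokens):                       # tokens: (B, N, dim)
        weights = F.softmax(self.gate(tokens), dim=-1)   # (B, N, E)
        top_w, top_idx = weights.max(dim=-1)             # top-1 expert per token
        out = torch.zeros_like(tokens)
        for e, expert in enumerate(self.experts):
            mask = top_idx == e                          # tokens routed to expert e
            if mask.any():
                out[mask] = expert(tokens[mask]) * top_w[mask].unsqueeze(-1)
        return out

if __name__ == "__main__":
    moe = SimpleTokenMoE()
    x = torch.randn(2, 16, 64)                       # 2 images, 16 tokens each
    print(moe(x).shape)                              # torch.Size([2, 16, 64])
```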
+
+
+
+
+ + ♻ ☆ Temporally Consistent Unbalanced Optimal Transport for Unsupervised + Action Segmentation CVPR 2024 + + +
+ We propose a novel approach to the action segmentation task for long, +untrimmed videos, based on solving an optimal transport problem. By encoding a +temporal consistency prior into a Gromov-Wasserstein problem, we are able to +decode a temporally consistent segmentation from a noisy affinity/matching cost +matrix between video frames and action classes. Unlike previous approaches, our +method does not require knowing the action order for a video to attain temporal +consistency. Furthermore, our resulting (fused) Gromov-Wasserstein problem can +be efficiently solved on GPUs using a few iterations of projected mirror +descent. We demonstrate the effectiveness of our method in an unsupervised +learning setting, where our method is used to generate pseudo-labels for +self-training. We evaluate our segmentation approach and unsupervised learning +pipeline on the Breakfast, 50-Salads, YouTube Instructions and Desktop Assembly +datasets, yielding state-of-the-art results for the unsupervised video action +segmentation task. + +
+
+ comment: Accepted to CVPR 2024 +
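The paper above solves a fused Gromov-Wasserstein problem with a temporal-consistency prior via projected mirror descent. As a smaller, well-known building block, the sketch below runs plain entropic optimal transport (Sinkhorn) on a frame-by-action cost matrix and decodes a labeling from the coupling; it is not the paper's formulation, and the toy cost matrix is made up.

```python
import numpy as np

def sinkhorn(cost, reg=0.1, n_iters=200):
    """Entropic optimal transport between uniform marginals (Sinkhorn)."""
    n, m = cost.shape
    a, b = np.full(n, 1.0 / n), np.full(m, 1.0 / m)
    K = np.exp(-cost / reg)
    u = np.ones(n)
    for _ in range(n_iters):
        v = b / (K.T @ u)   # scale columns to match action marginal
        u = a / (K @ v)     # scale rows to match frame marginal
    return u[:, None] * K * v[None, :]

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    # 12 frames, 3 actions: blocks of frames are cheap to match to one action
    cost = np.ones((12, 3)) + 0.1 * rng.random((12, 3))
    cost[:4, 0] = cost[4:8, 1] = cost[8:, 2] = 0.1
    coupling = sinkhorn(cost)
    print("decoded labels:", coupling.argmax(axis=1))  # roughly [0]*4 + [1]*4 + [2]*4
```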
+
+
+
+
+ + ♻ ☆ EGTR: Extracting Graph from Transformer for Scene Graph Generation CVPR 2024 + + +
+ Scene Graph Generation (SGG) is a challenging task of detecting objects and +predicting relationships between objects. After DETR was developed, one-stage +SGG models based on a one-stage object detector have been actively studied. +However, complex modeling is used to predict the relationship between objects, +and the inherent relationship between object queries learned in the multi-head +self-attention of the object detector has been neglected. We propose a +lightweight one-stage SGG model that extracts the relation graph from the +various relationships learned in the multi-head self-attention layers of the +DETR decoder. By fully utilizing the self-attention by-products, the relation +graph can be extracted effectively with a shallow relation extraction head. +Considering the dependency of the relation extraction task on the object +detection task, we propose a novel relation smoothing technique that adjusts +the relation label adaptively according to the quality of the detected objects. +With relation smoothing, the model is trained according to a continuous +curriculum that focuses on the object detection task at the beginning of training +and performs multi-task learning as the object detection performance gradually +improves. Furthermore, we propose a connectivity prediction task that predicts +whether a relation exists between object pairs as an auxiliary task of the +relation extraction. We demonstrate the effectiveness and efficiency of our +method on the Visual Genome and Open Image V6 datasets. Our code is publicly +available at https://github.com/naver-ai/egtr. + +
+
+ comment: CVPR 2024 +
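An illustrative take on the relation-smoothing idea from the abstract above: hard relation targets are scaled by how well the subject and object boxes are currently detected, so relation supervision is soft early in training and approaches the hard labels as detection improves. The exact adaptive rule used by EGTR may differ; the quality measure below (minimum IoU) is an assumption.

```python
import torch

def smooth_relation_labels(rel_labels, subj_iou, obj_iou):
    """Scale 0/1 relation targets by the detection quality of the pair.

    rel_labels: (N, num_predicates) hard targets for N object pairs.
    subj_iou, obj_iou: (N,) IoU of the matched subject/object boxes.
    """
    quality = torch.minimum(subj_iou, obj_iou).clamp(0.0, 1.0)   # (N,)
    return rel_labels * quality.unsqueeze(-1)

if __name__ == "__main__":
    rel_labels = torch.tensor([[0., 1., 0.], [1., 0., 0.]])      # two object pairs
    subj_iou = torch.tensor([0.9, 0.3])
    obj_iou = torch.tensor([0.8, 0.5])
    print(smooth_relation_labels(rel_labels, subj_iou, obj_iou))
```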
+
+
+
+
+ + ♻ ☆ Robust deep learning for eye fundus images: Bridging real and synthetic + data for enhancing generalization + + +
+ Deep learning applications for assessing medical images are limited because +the datasets are often small and imbalanced. The use of synthetic data has been +proposed in the literature, but neither a robust comparison of the different +methods nor generalizability has been reported. Our approach integrates a +retinal image quality assessment model and StyleGAN2 architecture to enhance +Age-related Macular Degeneration (AMD) detection capabilities and improve +generalizability. This work compares ten different Generative Adversarial +Network (GAN) architectures to generate synthetic eye-fundus images with and +without AMD. We combined subsets of three public databases (iChallenge-AMD, +ODIR-2019, and RIADD) to form a single training and test set. We employed the +STARE dataset for external validation, ensuring a comprehensive assessment of +the proposed approach. The results show that StyleGAN2 reached the lowest +Frechet Inception Distance (166.17), and clinicians could not accurately +differentiate between real and synthetic images. ResNet-18 architecture +obtained the best performance with 85% accuracy and outperformed the two human +experts (80% and 75%) in detecting AMD fundus images. The accuracy rates were +82.8% for the test set and 81.3% for the STARE dataset, demonstrating the +model's generalizability. The proposed methodology for synthetic medical image +generation has been validated for robustness and accuracy, with free access to +its code for further research and development in this field. + +
+
+ comment: Accepted by Biomedical Signal Processing and Control +
+
+
+
+
+ + ♻ ☆ Towards Fine-grained Large Object Segmentation 1st Place Solution to 3D + AI Challenge 2020 -- Instance Segmentation Track + + +
+ This technical report introduces the solutions of Team 'FineGrainedSeg' for +the Instance Segmentation track in the 3D AI Challenge 2020. In order to handle +extremely large objects in 3D-FUTURE, we adopt PointRend as our basic +framework, which outputs more fine-grained masks compared to HTC and SOLOv2. +Our final submission is an ensemble of 5 PointRend models, which achieved +1st place on both the validation and test leaderboards. The code is available at +https://github.com/zehuichen123/3DFuture_ins_seg. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ Deep Learning in Cardiology + + +
+ The medical field is creating large amounts of data that physicians are unable +to decipher and use efficiently. Moreover, rule-based expert systems are +inefficient in solving complicated medical tasks or in creating insights using +big data. Deep learning has emerged as a more accurate and effective technology +in a wide range of medical problems such as diagnosis, prediction and +intervention. Deep learning is a representation learning method that consists +of layers that transform the data non-linearly, thus revealing hierarchical +relationships and structures. In this review we survey deep learning +application papers that use structured data, signal and imaging modalities from +cardiology. We discuss the advantages and limitations of applying deep learning +in cardiology that also apply in medicine in general, while proposing certain +directions as the most viable for clinical use. + +
+
+ comment: 27 pages, 2 figures, 10 tables +
+
+
+
+
+ + ♻ ☆ Synthesis of Annotated Colorectal Cancer Tissue Images from Gland Layout + + +
+ Generating realistic tissue images with annotations is a challenging task +that is important in many computational histopathology applications. +Synthetically generated images and annotations are valuable for training and +evaluating algorithms in this domain. To address this, we propose an +interactive framework for generating pairs of realistic colorectal cancer histology +images with corresponding glandular masks from glandular structure layouts. The +framework accurately captures vital features like stroma, goblet cells, and +glandular lumen. Users can control gland appearance by adjusting parameters +such as the number of glands, their locations, and sizes. The generated images +exhibit good Frechet Inception Distance (FID) scores compared to the +state-of-the-art image-to-image translation model. Additionally, we demonstrate +the utility of our synthetic annotations for evaluating gland segmentation +algorithms. Furthermore, we present a methodology for constructing glandular +masks using advanced deep generative models, such as latent diffusion models. +These masks enable tissue image generation through a residual encoder-decoder +network. + +
+
+
+
+
+ + ♻ ☆ Spacetime Gaussian Feature Splatting for Real-Time Dynamic View + Synthesis CVPR 2024 + + +
+ Novel view synthesis of dynamic scenes has been an intriguing yet challenging +problem. Despite recent advancements, simultaneously achieving high-resolution +photorealistic results, real-time rendering, and compact storage remains a +formidable task. To address these challenges, we propose Spacetime Gaussian +Feature Splatting as a novel dynamic scene representation, composed of three +pivotal components. First, we formulate expressive Spacetime Gaussians by +enhancing 3D Gaussians with temporal opacity and parametric motion/rotation. +This enables Spacetime Gaussians to capture static, dynamic, as well as +transient content within a scene. Second, we introduce splatted feature +rendering, which replaces spherical harmonics with neural features. These +features facilitate the modeling of view- and time-dependent appearance while +maintaining small size. Third, we leverage the guidance of training error and +coarse depth to sample new Gaussians in areas that are challenging to converge +with existing pipelines. Experiments on several established real-world datasets +demonstrate that our method achieves state-of-the-art rendering quality and +speed, while retaining compact storage. At 8K resolution, our lite-version +model can render at 60 FPS on an Nvidia RTX 4090 GPU. Our code is available at +https://github.com/oppo-us-research/SpacetimeGaussians. + +
+
+ comment: Accepted to CVPR 2024. Project page: + https://oppo-us-research.github.io/SpacetimeGaussians-website/ +
+
+
+
+
+ + ♻ ☆ InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image + Generation + + +
+ Tuning-free diffusion-based models have demonstrated significant potential in +the realm of image personalization and customization. However, despite this +notable progress, current models continue to grapple with several complex +challenges in producing style-consistent image generation. Firstly, the concept +of style is inherently underdetermined, encompassing a multitude of elements +such as color, material, atmosphere, design, and structure, among others. +Secondly, inversion-based methods are prone to style degradation, often +resulting in the loss of fine-grained details. Lastly, adapter-based approaches +frequently require meticulous weight tuning for each reference image to achieve +a balance between style intensity and text controllability. In this paper, we +commence by examining several compelling yet frequently overlooked +observations. We then proceed to introduce InstantStyle, a framework designed +to address these issues through the implementation of two key strategies: 1) A +straightforward mechanism that decouples style and content from reference +images within the feature space, predicated on the assumption that features +within the same space can be either added to or subtracted from one another. 2) +The injection of reference image features exclusively into style-specific +blocks, thereby preventing style leaks and eschewing the need for cumbersome +weight tuning, which often characterizes more parameter-heavy designs. Our work +demonstrates superior visual stylization outcomes, striking an optimal balance +between the intensity of style and the controllability of textual elements. Our +codes will be available at https://github.com/InstantStyle/InstantStyle. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ DisCo: Disentangled Control for Realistic Human Dance Generation CVPR24 + + +
+ Generative AI has made significant strides in computer vision, particularly +in text-driven image/video synthesis (T2I/T2V). Despite the notable +advancements, it remains challenging in human-centric content synthesis such as +realistic dance generation. Current methodologies, primarily tailored for human +motion transfer, encounter difficulties when confronted with real-world dance +scenarios (e.g., social media dance), which require generalization across a wide +spectrum of poses and intricate human details. In this paper, we depart from +the traditional paradigm of human motion transfer and emphasize two additional +critical attributes for the synthesis of human dance content in social media +contexts: (i) Generalizability: the model should be able to generalize beyond +generic human viewpoints as well as unseen human subjects, backgrounds, and +poses; (ii) Compositionality: it should allow for the seamless composition of +seen/unseen subjects, backgrounds, and poses from different sources. To address +these challenges, we introduce DISCO, which includes a novel model architecture +with disentangled control to improve the compositionality of dance synthesis, +and an effective human attribute pre-training for better generalizability to +unseen humans. Extensive qualitative and quantitative results demonstrate that +DisCo can generate high-quality human dance images and videos with diverse +appearances and flexible motions. Code is available at +https://disco-dance.github.io/. + +
+
+ comment: Accepted by CVPR24 +
+
+
+
+
+ + ♻ ☆ pixelSplat: 3D Gaussian Splats from Image Pairs for Scalable + Generalizable 3D Reconstruction + + +
+ We introduce pixelSplat, a feed-forward model that learns to reconstruct 3D +radiance fields parameterized by 3D Gaussian primitives from pairs of images. +Our model features real-time and memory-efficient rendering for scalable +training as well as fast 3D reconstruction at inference time. To overcome local +minima inherent to sparse and locally supported representations, we predict a +dense probability distribution over 3D and sample Gaussian means from that +probability distribution. We make this sampling operation differentiable via a +reparameterization trick, allowing us to back-propagate gradients through the +Gaussian splatting representation. We benchmark our method on wide-baseline +novel view synthesis on the real-world RealEstate10k and ACID datasets, where +we outperform state-of-the-art light field transformers and accelerate +rendering by 2.5 orders of magnitude while reconstructing an interpretable and +editable 3D radiance field. + +
+
+ comment: Project page: https://dcharatan.github.io/pixelsplat +
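The pixelSplat abstract above mentions sampling Gaussian means from a predicted dense probability distribution while keeping the operation differentiable via a reparameterization trick. The sketch below shows one generic way to make discrete depth sampling differentiable, using Gumbel-softmax over depth bins; this is a stand-in for the idea, not the paper's actual reparameterization, and all names and shapes are assumptions.

```python
import torch
import torch.nn.functional as F

def sample_depths(depth_logits, depth_bins, tau=0.5):
    """Differentiably sample a per-ray depth from a discrete distribution.

    Each ray predicts logits over candidate depth bins; Gumbel-softmax gives
    a soft one-hot sample whose expectation over the bins yields a depth,
    and gradients flow back into the logits.
    """
    soft_onehot = F.gumbel_softmax(depth_logits, tau=tau, hard=False)  # (R, D)
    return (soft_onehot * depth_bins).sum(dim=-1)                      # (R,)

if __name__ == "__main__":
    rays, n_bins = 4, 32
    depth_bins = torch.linspace(1.0, 10.0, n_bins)
    depth_logits = torch.randn(rays, n_bins, requires_grad=True)
    depths = sample_depths(depth_logits, depth_bins)
    depths.sum().backward()                        # gradients reach the logits
    print(depths, depth_logits.grad.shape)
```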
+
+
+
+
+ + ♻ ☆ ArtGPT-4: Towards Artistic-understanding Large Vision-Language Models + with Enhanced Adapter + + +
+ The success of large language models (LLMs) has inspired an emerging research +field of multimodal learning. However, a grand challenge of exploiting LLMs for +multimodal learning is the size of pre-trained LLMs, which typically have +billions of parameters. To tackle this challenge, models such as MiniGPT-4 and +LLaVA have been developed to fine-tune the pre-trained models using fewer +parameters. Despite their promising performance, these models remain limited in +their understanding of artistic imagery. To facilitate better +artistic understanding, in this paper, we propose ArtGPT-4, a pioneering large +vision-language model tailored to address the limitations of existing models in +artistic comprehension. The key innovation of ArtGPT-4 lies in its design for +the sophisticated challenge of artistic image comprehension, setting it apart +from other models that overlook fine details for broader themes. Specifically, +it works by integrating some specialized adapter layers into the LLM, enabling +the model to more efficiently and effectively parse and interpret complex +visual tokens, instead of fine-tuning the whole LLM as in existing methods. +ArtGPT-4 demonstrates outstanding efficiency: +utilizing a Tesla A100 device, its training can be completed in a mere 2 hours +with an image-text pair dataset comprising approximately 0.52M entries. +Additionally, ArtGPT-4 has also achieved state-of-the-art performance on the +ArtEmis and ArtEmis-v2.0 datasets as well as the benchmarks established in this +work, lagging behind professional artists' descriptions by a negligible 0.15 +points on a 6-point scale. The outstanding performance of ArtGPT-4 shows that +it can interpret images with artistic understanding and convey the emotions +they inspire, mirroring human interpretation. The code and the pre-trained +model are accessible at https://github.com/DLYuanGod/ArtGPT-4. + +
+
+
+
+
+ + ♻ ☆ TinyGPT-V: Efficient Multimodal Large Language Model via Small Backbones + + +
+ In recent years, multimodal large language models (MLLMs) such as GPT-4V have +demonstrated remarkable advancements, excelling in a variety of vision-language +tasks. Despite their prowess, the closed-source nature and computational +demands of such models limit their accessibility and applicability. This study +introduces TinyGPT-V, a novel open-source MLLM, designed for efficient training +and inference across various vision-language tasks, including image captioning +(IC) and visual question answering (VQA). Leveraging a compact yet powerful +architecture, TinyGPT-V integrates the Phi-2 language model with pre-trained +vision encoders, utilizing a unique mapping module for visual and linguistic +information fusion. With a training regimen optimized for small backbones and +employing a diverse dataset amalgam, TinyGPT-V requires significantly lower +computational resources (24GB for training and as little as 8GB for inference) +without compromising on performance. Our experiments demonstrate that +TinyGPT-V, with its 2.8-billion-parameter language model, achieves comparable +results in VQA and image inference tasks to its larger counterparts while being +uniquely suited for deployment on resource-constrained devices through +innovative quantization techniques. This work not only paves the way for more +accessible and efficient MLLMs but also underscores the potential of smaller, +optimized models in bridging the gap between high performance and computational +efficiency in real-world applications. Additionally, this paper introduces a +new approach to multimodal large language models using smaller backbones. Our +code and training weights are available at +https://github.com/DLYuanGod/TinyGPT-V. + +
+
+
+
+
+ + ♻ ☆ Few-shot point cloud reconstruction and denoising via learned Gaussian + splats renderings and fine-tuned diffusion features + + +
+ Existing deep learning methods for the reconstruction and denoising of point +clouds rely on small datasets of 3D shapes. We circumvent the problem by +leveraging deep learning methods trained on billions of images. We propose a +method to reconstruct point clouds from a few images and to denoise point clouds +from their rendering by exploiting prior knowledge distilled from image-based +deep learning models. To improve reconstruction in constrained settings, we +regularize the training of a differentiable renderer with hybrid surface and +appearance by introducing semantic consistency supervision. In addition, we +propose a pipeline to finetune Stable Diffusion to denoise renderings of noisy +point clouds and we demonstrate how these learned filters can be used to remove +point cloud noise without 3D supervision. We compare our method with DSS +and PointRadiance and achieve higher-quality 3D reconstruction on the +Sketchfab Testset and SCUT Dataset. + +
+
+
+
+
+ + ♻ ☆ 3D scene generation from scene graphs and self-attention + + +
+ Synthesizing realistic and diverse indoor 3D scene layouts in a controllable +fashion opens up applications in simulated navigation and virtual reality. As +concise and robust representations of a scene, scene graphs have proven to be +well-suited as the semantic control on the generated layout. We present a +variant of the conditional variational autoencoder (cVAE) model to synthesize +3D scenes from scene graphs and floor plans. We exploit the properties of +self-attention layers to capture high-level relationships between objects in a +scene, and use these as the building blocks of our model. Our model leverages +graph transformers to estimate the size, dimension and orientation of the +objects in a room while satisfying relationships in the given scene graph. Our +experiments show that self-attention layers lead to sparser (7.9x compared to +Graphto3D) and more diverse (16%) scenes. + +
+
+
+
+
+ + ♻ ☆ Neural Field Convolutions by Repeated Differentiation + + +
+ Neural fields are evolving towards a general-purpose continuous +representation for visual computing. Yet, despite their numerous appealing +properties, they are hardly amenable to signal processing. As a remedy, we +present a method to perform general continuous convolutions with general +continuous signals such as neural fields. Observing that piecewise polynomial +kernels reduce to a sparse set of Dirac deltas after repeated differentiation, +we leverage convolution identities and train a repeated integral field to +efficiently execute large-scale convolutions. We demonstrate our approach on a +variety of data modalities and spatially-varying kernels. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 153 + +
+
+
+ + ☆ Visual Autoregressive Modeling: Scalable Image Generation via Next-Scale + Prediction + + +
+ We present Visual AutoRegressive modeling (VAR), a new generation paradigm +that redefines autoregressive learning on images as coarse-to-fine +"next-scale prediction" or "next-resolution prediction", diverging from the +standard raster-scan "next-token prediction". This simple, intuitive +methodology allows autoregressive (AR) transformers to learn visual +distributions fast and generalize well: VAR, for the first time, makes AR +models surpass diffusion transformers in image generation. On the ImageNet 256x256 +benchmark, VAR significantly improves the AR baseline, improving Frechet inception +distance (FID) from 18.65 to 1.80 and inception score (IS) from 80.4 to 356.4, +with around 20x faster inference speed. It is also empirically verified that +VAR outperforms the Diffusion Transformer (DiT) in multiple dimensions +including image quality, inference speed, data efficiency, and scalability. +Scaling up VAR models exhibits clear power-law scaling laws similar to those +observed in LLMs, with linear correlation coefficients near -0.998 as solid +evidence. VAR further showcases zero-shot generalization ability in downstream +tasks including image in-painting, out-painting, and editing. These results +suggest VAR has initially emulated the two important properties of LLMs: +Scaling Laws and zero-shot task generalization. We have released all models and +codes to promote the exploration of AR/VAR models for visual generation and +unified learning. + +
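A toy sketch of the coarse-to-fine "next-scale" target sequence idea from the abstract above: instead of a raster-scan token sequence, the targets form a series of whole token maps at increasing resolutions. Here the maps are produced by simply resizing a feature map; the real VAR pipeline uses a multi-scale VQ tokenizer, and the scale schedule below is an assumption.

```python
import torch
import torch.nn.functional as F

def build_scale_sequence(feat_map, scales=(1, 2, 4, 8, 16)):
    """Build a coarse-to-fine sequence of maps for next-scale prediction.

    The model is trained to predict the entire map at the next (higher)
    resolution, conditioned on all coarser maps, rather than the next token
    in a raster scan.
    """
    return [F.interpolate(feat_map, size=(s, s), mode="area") for s in scales]

if __name__ == "__main__":
    fmap = torch.randn(1, 16, 16, 16)      # a 16x16 latent feature map
    for level, m in enumerate(build_scale_sequence(fmap)):
        print(f"scale {level}: {tuple(m.shape)}")   # (1,16,1,1) ... (1,16,16,16)
```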
+
+
+
+
+ + ☆ ALOHa: A New Measure for Hallucination in Captioning Models NAACL 2024 + + +
+ Despite recent advances in multimodal pre-training for visual description, +state-of-the-art models still produce captions containing errors, such as +hallucinating objects not present in a scene. The existing prominent metric for +object hallucination, CHAIR, is limited to a fixed set of MS COCO objects and +synonyms. In this work, we propose a modernized open-vocabulary metric, ALOHa, +which leverages large language models (LLMs) to measure object hallucinations. +Specifically, we use an LLM to extract groundable objects from a candidate +caption, measure their semantic similarity to reference objects from captions +and object detections, and use Hungarian matching to produce a final +hallucination score. We show that ALOHa correctly identifies 13.6% more +hallucinated objects than CHAIR on HAT, a new gold-standard subset of MS COCO +Captions annotated for hallucinations, and 30.8% more on nocaps, where objects +extend beyond MS COCO categories. Our code is available at +https://davidmchan.github.io/aloha/. + +
+
+ comment: To appear at NAACL 2024 +
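The ALOHa abstract above describes matching candidate-caption objects to reference objects with Hungarian matching over semantic similarities. The sketch below shows that matching step with SciPy; the similarity model, the dummy similarity values, and the way the final score is aggregated are simplifications, not ALOHa's exact scoring.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

def hallucination_score(sim_matrix):
    """Match candidate objects (rows) to reference objects (columns).

    Hungarian matching maximizes total similarity; candidate objects whose
    matched similarity is low are likely hallucinations.
    """
    row, col = linear_sum_assignment(-sim_matrix)     # negate to maximize
    matched_sim = sim_matrix[row, col]
    return matched_sim.min(), list(zip(row, col, matched_sim))

if __name__ == "__main__":
    # candidate objects: ["dog", "frisbee", "unicorn"]; references: ["dog", "disc", "park"]
    sim = np.array([[0.95, 0.30, 0.10],
                    [0.25, 0.80, 0.15],
                    [0.10, 0.12, 0.08]])              # "unicorn" matches nothing well
    score, matches = hallucination_score(sim)
    print("worst match similarity:", score)           # low value -> likely hallucination
    print(matches)
```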
+
+
+
+
+ + ☆ LidarDM: Generative LiDAR Simulation in a Generated World + + +
+ We present LidarDM, a novel LiDAR generative model capable of producing +realistic, layout-aware, physically plausible, and temporally coherent LiDAR +videos. LidarDM stands out with two unprecedented capabilities in LiDAR +generative modeling: (i) LiDAR generation guided by driving scenarios, offering +significant potential for autonomous driving simulations, and (ii) 4D LiDAR +point cloud generation, enabling the creation of realistic and temporally +coherent sequences. At the heart of our model is a novel integrated 4D world +generation framework. Specifically, we employ latent diffusion models to +generate the 3D scene, combine it with dynamic actors to form the underlying 4D +world, and subsequently produce realistic sensory observations within this +virtual environment. Our experiments indicate that our approach outperforms +competing algorithms in realism, temporal coherency, and layout consistency. We +additionally show that LidarDM can be used as a generative world model +simulator for training and testing perception models. + +
+
+
+
+
+ + ☆ DeiT-LT Distillation Strikes Back for Vision Transformer Training on + Long-Tailed Datasets CVPR 2024 + + +
+ Vision Transformer (ViT) has emerged as a prominent architecture for various +computer vision tasks. In ViT, we divide the input image into patch tokens and +process them through a stack of self-attention blocks. However, unlike +Convolutional Neural Networks (CNN), ViT's simple architecture has no +informative inductive bias (e.g., locality). Due to this, ViT requires a +large amount of data for pre-training. Various data-efficient approaches (DeiT) +have been proposed to train ViT on balanced datasets effectively. However, +limited literature discusses the use of ViT for datasets with long-tailed +imbalances. In this work, we introduce DeiT-LT to tackle the problem of +training ViTs from scratch on long-tailed datasets. In DeiT-LT, we introduce an +efficient and effective way of distillation from CNN via a distillation DIST +token by using out-of-distribution images and re-weighting the distillation +loss to enhance focus on tail classes. This leads to the learning of local +CNN-like features in early ViT blocks, improving generalization for tail +classes. Further, to mitigate overfitting, we propose distilling from a flat +CNN teacher, which leads to learning low-rank generalizable features for DIST +tokens across all ViT blocks. With the proposed DeiT-LT scheme, the +distillation DIST token becomes an expert on the tail classes, and the +classifier CLS token becomes an expert on the head classes. The experts help to +effectively learn features corresponding to both the majority and minority +classes using a distinct set of tokens within the same ViT architecture. We +show the effectiveness of DeiT-LT for training ViT from scratch on datasets +ranging from small-scale CIFAR-10 LT to large-scale iNaturalist-2018. + +
+
+ comment: CVPR 2024. Project Page: https://rangwani-harsh.github.io/DeiT-LT +
+
+
+
+
+ + ☆ MatAtlas: Text-driven Consistent Geometry Texturing and Material + Assignment + + +
+ We present MatAtlas, a method for consistent text-guided 3D model texturing. +Following recent progress, we leverage a large-scale text-to-image generation +model (e.g., Stable Diffusion) as a prior to texture a 3D model. We carefully +design an RGB texturing pipeline that leverages a grid pattern diffusion, +driven by depth and edges. By proposing a multi-step texture refinement +process, we significantly improve the quality and 3D consistency of the +texturing output. To further address the problem of baked-in lighting, we move +beyond RGB colors and pursue assigning parametric materials to the assets. +Given the high-quality initial RGB texture, we propose a novel material +retrieval method capitalizing on Large Language Models (LLMs), enabling +editability and relightability. We evaluate our method on a wide variety of +geometries and show that our method significantly outperforms prior art. We +also analyze the role of each component through a detailed ablation study. + +
+
+
+
+
+ + ☆ Deep Image Composition Meets Image Forgery + + +
+ Image forgery is a topic that has been studied for many years. Before the +breakthrough of deep learning, forged images were detected using handcrafted +features that did not require training. These traditional methods failed to +perform satisfactorily even on datasets much worse in quality than real-life +image manipulations. Advances in deep learning have impacted image forgery +detection as much as they have impacted other areas of computer vision and have +improved the state of the art. Deep learning models require large amounts of +labeled data for training. In the case of image forgery, labeled data at the +pixel level is a very important factor for the models to learn. None of the +existing datasets have sufficient size, realism and pixel-level labeling at the +same time. This is due to the high cost of producing and labeling quality +images. It can take hours for an image editing expert to manipulate just one +image. To bridge this gap, we automate data generation using image composition +techniques that are closely related to image forgery. Unlike other automated data +generation frameworks, we use state-of-the-art image composition deep learning +models to generate spliced images close to the quality of real-life +manipulations. Finally, we test the generated dataset on the SOTA image +manipulation detection model and show that its prediction performance is lower +than on existing datasets, i.e., we produce realistic images that are more +difficult to detect. The dataset will be available at +https://github.com/99eren99/DIS25k . + +
+
+
+
+
+ + ☆ Steganographic Passport: An Owner and User Verifiable Credential for + Deep Model IP Protection Without Retraining + + +
+ Ensuring the legal usage of deep models is crucial to promoting trustable, +accountable, and responsible artificial intelligence innovation. Current +passport-based methods that obfuscate model functionality for license-to-use +and ownership verifications suffer from capacity and quality constraints, as +they require retraining the owner model for new users. They are also vulnerable +to advanced Expanded Residual Block ambiguity attacks. We propose +Steganographic Passport, which uses an invertible steganographic network to +decouple license-to-use from ownership verification by hiding the user's +identity images in the owner-side passport and recovering them from their +respective user-side passports. An irreversible and collision-resistant hash +function is used to avoid exposing the owner-side passport from the derived +user-side passports and increase the uniqueness of the model signature. To +safeguard both the passport and model's weights against advanced ambiguity +attacks, an activation-level obfuscation is proposed for the verification +branch of the owner's model. By jointly training the verification and +deployment branches, their weights become tightly coupled. The proposed method +supports agile licensing of deep models by providing a strong ownership proof +and license accountability without requiring a separate model retraining for +the admission of every new user. Experimental results show that our +Steganographic Passport outperforms other passport-based deep model protection +methods in robustness against various known attacks. + +
+
+
+
+
+ + ☆ PoCo: Point Context Cluster for RGBD Indoor Place Recognition + + +
+ We present a novel end-to-end algorithm (PoCo) for the indoor RGB-D place +recognition task, aimed at identifying the most likely match for a given query +frame within a reference database. The task presents inherent challenges +attributed to the constrained field of view and limited range of perception +sensors. We propose a new network architecture, which generalizes the recent +Context of Clusters (CoCs) to extract global descriptors directly from the +noisy point clouds through end-to-end learning. Moreover, we develop the +architecture by integrating both color and geometric modalities into the point +features to enhance the global descriptor representation. We conducted +evaluations on public datasets ScanNet-PR and ARKit with 807 and 5047 +scenarios, respectively. PoCo achieves SOTA performance: on ScanNet-PR, we +achieve R@1 of 64.63%, a 5.7% improvement from the best-published result CGis +(61.12%); on Arkit, we achieve R@1 of 45.12%, a 13.3% improvement from the +best-published result CGis (39.82%). In addition, PoCo shows higher efficiency +than CGis in inference time (1.75X-faster), and we demonstrate the +effectiveness of PoCo in recognizing places within a real-world laboratory +environment. + +
+
+
+
+
+ + ☆ On the Scalability of Diffusion-based Text-to-Image Generation CVPR2024 + + +
+ Scaling up model and data size has been quite successful for the evolution of +LLMs. However, the scaling laws for diffusion-based text-to-image (T2I) +models are not fully explored. It is also unclear how to efficiently scale the +model for better performance at reduced cost. The different training settings +and expensive training cost make a fair model comparison extremely difficult. +In this work, we empirically study the scaling properties of diffusion-based +T2I models by performing extensive and rigorous ablations on scaling both +denoising backbones and training set, including training scaled UNet and +Transformer variants ranging from 0.4B to 4B parameters on datasets of up to 600M +images. For model scaling, we find the location and amount of cross attention +distinguishes the performance of existing UNet designs. Increasing the number of +transformer blocks is more parameter-efficient for improving text-image +alignment than increasing channel numbers. We then identify an efficient UNet +variant, which is 45% smaller and 28% faster than SDXL's UNet. On the data +scaling side, we show the quality and diversity of the training set matter +more than simply dataset size. Increasing caption density and diversity +improves text-image alignment performance and the learning efficiency. Finally, +we provide scaling functions to predict the text-image alignment performance as +functions of the scale of model size, compute and dataset size. + +
+
+ comment: CVPR2024 +
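The abstract above ends with scaling functions that predict alignment performance from model, compute, and data scale. The sketch below shows the standard way such a function is fitted: a power law estimated by linear regression in log-log space. The numeric values are invented purely to demonstrate the fitting procedure, not taken from the paper.

```python
import numpy as np

def fit_power_law(x, y):
    """Fit y ~= a * x^b by linear regression in log-log space."""
    b, log_a = np.polyfit(np.log(x), np.log(y), deg=1)   # slope, intercept
    return np.exp(log_a), b

if __name__ == "__main__":
    params = np.array([0.4e9, 0.9e9, 2.0e9, 4.0e9])      # model sizes (illustrative)
    misalign = np.array([0.30, 0.24, 0.19, 0.155])       # 1 - alignment score (illustrative)
    a, b = fit_power_law(params, misalign)
    pred_8b = a * (8.0e9) ** b                            # extrapolate to an unmeasured scale
    print(f"fit: {a:.3g} * N^{b:.3f}; predicted at 8B params: {pred_8b:.3f}")
```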
+
+
+
+
+ + ☆ FlightScope: A Deep Comprehensive Assessment of Aircraft Detection + Algorithms in Satellite Imagery + + +
+ Object detection in remotely sensed satellite pictures is fundamental in many +fields such as biophysical and environmental monitoring. While deep learning +algorithms are constantly evolving, they have been mostly implemented and +tested on popular ground-based photos. This paper critically evaluates +and compares a suite of advanced object detection algorithms customized for the +task of identifying aircraft within satellite imagery. Using the large +HRPlanesV2 dataset, together with a rigorous validation with the GDIT dataset, +this research encompasses an array of methodologies including YOLO versions 5 +and 8, Faster RCNN, CenterNet, RetinaNet, RTMDet, and DETR, all trained from +scratch. This exhaustive training and validation study reveals YOLOv5 as the +preeminent model for the specific case of identifying airplanes from remote +sensing data, showcasing high precision and adaptability across diverse imaging +conditions. This research highlights the nuanced performance landscapes of these +algorithms, with YOLOv5 emerging as a robust solution for aerial object +detection, underlining its importance through superior mean average precision, +Recall, and Intersection over Union scores. The findings described here +underscore the fundamental role of algorithm selection aligned with the +specific demands of satellite imagery analysis and extend a comprehensive +framework to evaluate model efficacy. The benchmark toolkit and codes, +available via https://github.com/toelt-llc/FlightScope_Bench, aim to further +exploration and innovation in the realm of remote sensing object detection, +paving the way for improved analytical methodologies in satellite imagery +applications. + +
+
+ comment: 15 figures, 4 tables, comprehensive survey, comparative study +
+
+
+
+
+ + ☆ Cross-Modal Conditioned Reconstruction for Language-guided Medical Image + Segmentation + + +
+ Recent developments underscore the potential of textual information in +enhancing learning models for a deeper understanding of medical visual +semantics. However, language-guided medical image segmentation still faces a +challenging issue. Previous works employ implicit and ambiguous architectures +to embed textual information. This leads to segmentation results that are +inconsistent with the semantics represented by the language, sometimes even +diverging significantly. To this end, we propose a novel cross-modal +conditioned Reconstruction for Language-guided Medical Image Segmentation +(RecLMIS) to explicitly capture cross-modal interactions, which assumes that +well-aligned medical visual features and medical notes can effectively +reconstruct each other. We introduce conditioned interaction to adaptively +predict patches and words of interest. Subsequently, they are utilized as +conditioning factors for mutual reconstruction to align with regions described +in the medical notes. Extensive experiments demonstrate the superiority of our +RecLMIS, surpassing LViT by 3.74% mIoU on the publicly available MosMedData+ +dataset and achieving an average increase of 1.89% mIoU for cross-domain tests +on our QATA-CoV19 dataset. Simultaneously, we achieve a relative reduction of +20.2% in parameter count and a 55.5% decrease in computational load. The code +will be available at https://github.com/ShashankHuang/RecLMIS. + +
+
+
+
+
+ + ☆ Enhancing Interpretability of Vertebrae Fracture Grading using + Human-interpretable Prototypes + + +
+ Vertebral fracture grading classifies the severity of vertebral fractures, +which is a challenging task in medical imaging and has recently attracted Deep +Learning (DL) models. Only a few works attempted to make such models +human-interpretable despite the need for transparency and trustworthiness in +critical use cases like DL-assisted medical diagnosis. Moreover, such models +either rely on post-hoc methods or additional annotations. In this work, we +propose a novel interpretable-by-design method, ProtoVerse, to find relevant +sub-parts of vertebral fractures (prototypes) that reliably explain the model's +decision in a human-understandable way. Specifically, we introduce a novel +diversity-promoting loss to mitigate prototype repetitions in small datasets +with intricate semantics. We have experimented with the VerSe'19 dataset and +outperformed the existing prototype-based method. Further, our model provides +superior interpretability against the post-hoc method. Importantly, expert +radiologists validated the visual interpretability of our results, showing +clinical applicability. + +
+
+
+
+
+ + ☆ GPU-Accelerated RSF Level Set Evolution for Large-Scale Microvascular + Segmentation + + +
+ Microvascular networks are challenging to model because these structures are +currently near the diffraction limit for most advanced three-dimensional +imaging modalities, including confocal and light sheet microscopy. This makes +semantic segmentation difficult, because individual components of these +networks fluctuate within the confines of individual pixels. Level set methods +are ideally suited to solve this problem by providing surface and topological +constraints on the resulting model, however these active contour techniques are +extremely time intensive and impractical for terabyte-scale images. We propose +a reformulation and implementation of the region-scalable fitting (RSF) level +set model that makes it amenable to three-dimensional evaluation using both +single-instruction multiple data (SIMD) and single-program multiple-data (SPMD) +parallel processing. This enables evaluation of the level set equation on +independent regions of the data set using graphics processing units (GPUs), +making large-scale segmentation of high-resolution networks practical and +inexpensive. + We tested this 3D parallel RSF approach on multiple data sets acquired using +state-of-the-art imaging techniques to acquire microvascular data, including +micro-CT, light sheet fluorescence microscopy (LSFM) and milling microscopy. To +assess the performance and accuracy of the RSF model, we conducted a +Monte-Carlo-based validation technique to compare results to other segmentation +methods. We also provide a rigorous profiling to show the gains in processing +speed leveraging parallel hardware. This study showcases the practical +application of the RSF model, emphasizing its utility in the challenging domain +of segmenting large-scale high-topology network structures with a particular +focus on building microvascular models. + +
+
+
+
+
+ + ☆ MULAN: A Multi Layer Annotated Dataset for Controllable Text-to-Image + Generation CVPR 2024 + + +
+ Text-to-image generation has achieved astonishing results, yet precise +spatial controllability and prompt fidelity remain highly challenging. This +limitation is typically addressed through cumbersome prompt engineering, scene +layout conditioning, or image editing techniques which often require hand-drawn +masks. Nonetheless, pre-existing works struggle to take advantage of the +natural instance-level compositionality of scenes due to the typically flat +nature of rasterized RGB output images. Towards addressing this challenge, we +introduce MuLAn: a novel dataset comprising over 44K MUlti-Layer ANnotations of +RGB images as multilayer, instance-wise RGBA decompositions, and over 100K +instance images. To build MuLAn, we developed a training-free pipeline which +decomposes a monocular RGB image into a stack of RGBA layers comprising a +background and isolated instances. We achieve this through the use of +pretrained general-purpose models, and by developing three modules: image +decomposition for instance discovery and extraction, instance completion to +reconstruct occluded areas, and image re-assembly. We use our pipeline to +create the MuLAn-COCO and MuLAn-LAION datasets, which contain a variety of image +decompositions in terms of style, composition and complexity. With MuLAn, we +provide the first photorealistic resource providing instance decomposition and +occlusion information for high quality images, opening up new avenues for +text-to-image generative AI research. With this, we aim to encourage the +development of novel generation and editing technology, in particular +layer-wise solutions. MuLAn data resources are available at +https://MuLAn-dataset.github.io/. + +
+
+ comment: CVPR 2024 - Project page: https://MuLAn-dataset.github.io/ +
+
+
+
+
+ + ☆ GenN2N: Generative NeRF2NeRF Translation CVPR 2024 + + +
+ We present GenN2N, a unified NeRF-to-NeRF translation framework for various +NeRF translation tasks such as text-driven NeRF editing, colorization, +super-resolution, inpainting, etc. Unlike previous methods designed for +individual translation tasks with task-specific schemes, GenN2N achieves all +these NeRF editing tasks by employing a plug-and-play image-to-image translator +to perform editing in the 2D domain and lifting 2D edits into the 3D NeRF +space. Since the 3D consistency of 2D edits may not be assured, we propose to +model the distribution of the underlying 3D edits through a generative model +that can cover all possible edited NeRFs. To model the distribution of 3D +edited NeRFs from 2D edited images, we carefully design a VAE-GAN that encodes +images while decoding NeRFs. The latent space is trained to align with a +Gaussian distribution and the NeRFs are supervised through an adversarial loss +on its renderings. To ensure the latent code does not depend on 2D viewpoints +but truly reflects the 3D edits, we also regularize the latent code through a +contrastive learning scheme. Extensive experiments on various editing tasks +show GenN2N, as a universal framework, performs as well or better than +task-specific specialists while possessing flexible generative power. More +results on our project page: https://xiangyueliu.github.io/GenN2N/ + +
+
+ comment: Accepted to CVPR 2024. Project page: + https://xiangyueliu.github.io/GenN2N/ +
+
+
+
+
+ + ☆ Domain Generalization through Meta-Learning: A Survey + + +
+ Deep neural networks (DNNs) have revolutionized artificial intelligence but +often underperform when faced with out-of-distribution (OOD) data, a common +scenario due to the inevitable domain shifts in real-world applications. This +limitation stems from the common assumption that training and testing data +share the same distribution, an assumption frequently violated in practice. +Despite their effectiveness with large amounts of data and computational power, +DNNs struggle with distributional shifts and limited labeled data, leading to +overfitting and poor generalization across various tasks and domains. +Meta-learning presents a promising approach by employing algorithms that +acquire transferable knowledge across various tasks for fast adaptation, +eliminating the need to learn each task from scratch. This survey paper delves +into the realm of meta-learning with a focus on its contribution to domain +generalization. We first clarify the concept of meta-learning for domain +generalization and introduce a novel taxonomy based on the feature extraction +strategy and the classifier learning methodology, offering a granular view of +methodologies. Through an exhaustive review of existing methods and underlying +theories, we map out the fundamentals of the field. Our survey provides +practical insights and an informed discussion on promising research directions, +paving the way for future innovation in meta-learning for domain +generalization. + +
+
+
+
+
+ + ☆ Unsupervised Occupancy Learning from Sparse Point Cloud CVPR 2024 + + +
+ Implicit Neural Representations have gained prominence as a powerful +framework for capturing complex data modalities, encompassing a wide range from +3D shapes to images and audio. Within the realm of 3D shape representation, +Neural Signed Distance Functions (SDF) have demonstrated remarkable potential +in faithfully encoding intricate shape geometry. However, learning SDFs from 3D +point clouds in the absence of ground truth supervision remains a very +challenging task. In this paper, we propose a method to infer occupancy fields +instead of SDFs as they are easier to learn from sparse inputs. We leverage a +margin-based uncertainty measure to differentially sample from the decision +boundary of the occupancy function and supervise the sampled boundary points +using the input point cloud. We further stabilize the optimization process at +the early stages of the training by biasing the occupancy function towards +minimal entropy fields while maximizing its entropy at the input point cloud. +Through extensive experiments and evaluations, we illustrate the efficacy of +our proposed method, highlighting its capacity to improve implicit shape +inference with respect to baselines and the state-of-the-art using synthetic +and real data. + +
+
+ comment: CVPR 2024 +
+
+
+
+
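+ The entropy shaping described in the abstract above can be illustrated with a
+ minimal PyTorch sketch: an occupancy network is pushed towards confident
+ (low-entropy) predictions on random space samples while keeping maximal
+ uncertainty (occupancy near 0.5, i.e. the decision boundary) at the input
+ points. The tiny MLP, sample counts and learning rate below are illustrative
+ assumptions, not the authors' architecture or training recipe.
+
+ import torch
+ import torch.nn as nn
+
+ # toy occupancy network: 3D point -> occupancy logit
+ occ_net = nn.Sequential(nn.Linear(3, 128), nn.ReLU(),
+                         nn.Linear(128, 128), nn.ReLU(),
+                         nn.Linear(128, 1))
+
+ def binary_entropy(p, eps=1e-6):
+     p = p.clamp(eps, 1 - eps)
+     return -(p * p.log() + (1 - p) * (1 - p).log())
+
+ points = torch.rand(2048, 3) * 2 - 1    # sparse input point cloud (toy)
+ space = torch.rand(4096, 3) * 2 - 1     # random samples in the volume
+
+ opt = torch.optim.Adam(occ_net.parameters(), lr=1e-4)
+ p_space = torch.sigmoid(occ_net(space))
+ p_cloud = torch.sigmoid(occ_net(points))
+ # minimal entropy away from the surface, maximal entropy at the input cloud
+ loss = binary_entropy(p_space).mean() - binary_entropy(p_cloud).mean()
+ loss.backward(); opt.step(); opt.zero_grad()
+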
+ + ☆ DIBS: Enhancing Dense Video Captioning with Unlabeled Videos via Pseudo + Boundary Enrichment and Online Refinement CVPR 2024 + + +
+ We present Dive Into the BoundarieS (DIBS), a novel pretraining framework for +dense video captioning (DVC), that elaborates on improving the quality of the +generated event captions and their associated pseudo event boundaries from +unlabeled videos. By leveraging the capabilities of diverse large language +models (LLMs), we generate rich DVC-oriented caption candidates and optimize +the corresponding pseudo boundaries under several meticulously designed +objectives, considering diversity, event-centricity, temporal ordering, and +coherence. Moreover, we further introduce a novel online boundary refinement +strategy that iteratively improves the quality of pseudo boundaries during +training. Comprehensive experiments have been conducted to examine the +effectiveness of the proposed technique components. By leveraging a substantial +amount of unlabeled video data, such as HowTo100M, we achieve a remarkable +advancement on standard DVC datasets like YouCook2 and ActivityNet. We +outperform the previous state-of-the-art Vid2Seq across a majority of metrics, +achieving this with just 0.4% of the unlabeled video data used for pre-training +by Vid2Seq. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Cross-Attention Makes Inference Cumbersome in Text-to-Image Diffusion + Models + + +
+ This study explores the role of cross-attention during inference in
+ text-conditional diffusion models. We find that cross-attention outputs
+ converge to a fixed point after a few inference steps. Accordingly, the time
+ point of convergence naturally divides the entire inference process into two
+ stages: an initial semantics-planning stage, during which the model relies on
+ cross-attention to plan text-oriented visual semantics, and a subsequent
+ fidelity-improving stage, during which the model tries to generate images
+ from previously planned semantics. Surprisingly, ignoring text conditions in
+ the fidelity-improving stage not only reduces computational complexity, but
+ also maintains model performance. This yields a simple and training-free
+ method called TGATE for efficient generation, which caches the
+ cross-attention output once it converges and keeps it fixed during the
+ remaining inference steps. Our empirical study on the MS-COCO validation set
+ confirms its effectiveness. The source code of TGATE is available at
+ https://github.com/HaozheLiu-ST/T-GATE.
+
+
+
+
+
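+ The caching mechanism summarized in the TGATE abstract above is simple enough
+ to sketch: once the cross-attention output has converged (here, after a fixed
+ `gate_step`), it is stored and reused for the remaining denoising steps
+ instead of being recomputed from the text condition. `cross_attention` and
+ `denoise_step` below are toy stand-ins for the real diffusion components, not
+ the released T-GATE code.
+
+ import torch
+
+ def cross_attention(x, text_emb):
+     # placeholder attention conditioned on the prompt embedding
+     return torch.softmax(x @ text_emb.T, dim=-1) @ text_emb
+
+ def denoise_step(x, attn_out, t):
+     # placeholder reverse-diffusion update using the attention output
+     return x - 0.01 * (x - attn_out)
+
+ def generate(x, text_emb, num_steps=50, gate_step=10):
+     cached = None
+     for t in range(num_steps):
+         if t < gate_step or cached is None:
+             cached = cross_attention(x, text_emb)  # semantics-planning stage
+         x = denoise_step(x, cached, t)             # fidelity-improving stage
+     return x
+
+ latent = generate(torch.randn(64, 32), torch.randn(16, 32))
+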
+ + ☆ LiDAR4D: Dynamic Neural Fields for Novel Space-time View LiDAR Synthesis CVPR 2024 + + +
+ Although neural radiance fields (NeRFs) have achieved triumphs in image novel +view synthesis (NVS), LiDAR NVS remains largely unexplored. Previous LiDAR NVS +methods employ a simple shift from image NVS methods while ignoring the dynamic +nature and the large-scale reconstruction problem of LiDAR point clouds. In +light of this, we propose LiDAR4D, a differentiable LiDAR-only framework for +novel space-time LiDAR view synthesis. In consideration of the sparsity and +large-scale characteristics, we design a 4D hybrid representation combined with +multi-planar and grid features to achieve effective reconstruction in a +coarse-to-fine manner. Furthermore, we introduce geometric constraints derived +from point clouds to improve temporal consistency. For the realistic synthesis +of LiDAR point clouds, we incorporate the global optimization of ray-drop +probability to preserve cross-region patterns. Extensive experiments on +KITTI-360 and NuScenes datasets demonstrate the superiority of our method in +accomplishing geometry-aware and time-consistent dynamic reconstruction. Codes +are available at https://github.com/ispc-lab/LiDAR4D. + +
+
+ comment: Accepted by CVPR 2024. Project Page: + https://dyfcalid.github.io/LiDAR4D +
+
+
+
+
+ + ☆ Adaptive Affinity-Based Generalization For MRI Imaging Segmentation + Across Resource-Limited Settings + + +
+ The joint utilization of diverse data sources for medical imaging +segmentation has emerged as a crucial area of research, aiming to address +challenges such as data heterogeneity, domain shift, and data quality +discrepancies. Integrating information from multiple data domains has shown +promise in improving model generalizability and adaptability. However, this +approach often demands substantial computational resources, hindering its +practicality. In response, knowledge distillation (KD) has garnered attention +as a solution. KD involves training light-weight models to emulate the behavior +of more resource-intensive models, thereby mitigating the computational burden +while maintaining performance. This paper addresses the pressing need to +develop a lightweight and generalizable model for medical imaging segmentation +that can effectively handle data integration challenges. Our proposed approach +introduces a novel relation-based knowledge framework by seamlessly combining +adaptive affinity-based and kernel-based distillation through a gram matrix +that can capture the style representation across features. This methodology +empowers the student model to accurately replicate the feature representations +of the teacher model, facilitating robust performance even in the face of +domain shift and data heterogeneity. To validate our innovative approach, we +conducted experiments on publicly available multi-source prostate MRI data. The +results demonstrate a significant enhancement in segmentation performance using +lightweight networks. Notably, our method achieves this improvement while +reducing both inference time and storage usage, rendering it a practical and +efficient solution for real-time medical imaging segmentation. + +
+
+
+
+
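+ The Gram-matrix style representation mentioned in the abstract above is a
+ standard construction, sketched below as a relation-based distillation term:
+ teacher and student feature maps are reduced to channel-by-channel Gram
+ matrices and matched with an MSE loss. This is a generic illustration, not
+ the paper's full adaptive affinity- and kernel-based formulation.
+
+ import torch
+ import torch.nn.functional as F
+
+ def gram_matrix(feat):                        # feat: (B, C, H, W)
+     b, c, h, w = feat.shape
+     f = feat.reshape(b, c, h * w)
+     return f @ f.transpose(1, 2) / (h * w)    # (B, C, C) style representation
+
+ def style_kd_loss(student_feat, teacher_feat):
+     return F.mse_loss(gram_matrix(student_feat), gram_matrix(teacher_feat))
+
+ s = torch.randn(2, 64, 32, 32, requires_grad=True)   # toy student features
+ t = torch.randn(2, 64, 32, 32)                        # toy teacher features
+ style_kd_loss(s, t).backward()
+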
+ + ☆ InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image + Generation + + +
+ Tuning-free diffusion-based models have demonstrated significant potential
+ in the realm of image personalization and customization. However, despite
+ this notable progress, current models continue to grapple with several
+ complex challenges in producing style-consistent image generation. Firstly,
+ the concept of style is inherently underdetermined, encompassing a multitude
+ of elements such as color, material, atmosphere, design, and structure, among
+ others. Secondly, inversion-based methods are prone to style degradation,
+ often resulting in the loss of fine-grained details. Lastly, adapter-based
+ approaches frequently require meticulous weight tuning for each reference
+ image to achieve a balance between style intensity and text controllability.
+ In this paper, we commence by examining several compelling yet frequently
+ overlooked observations. We then proceed to introduce InstantStyle, a
+ framework designed to address these issues through the implementation of two
+ key strategies: 1) A straightforward mechanism that decouples style and
+ content from reference images within the feature space, predicated on the
+ assumption that features within the same space can be either added to or
+ subtracted from one another. 2) The injection of reference image features
+ exclusively into style-specific blocks, thereby preventing style leaks and
+ eschewing the need for cumbersome weight tuning, which often characterizes
+ more parameter-heavy designs. Our work demonstrates superior visual
+ stylization outcomes, striking an optimal balance between the intensity of
+ style and the controllability of textual elements. Our codes will be
+ available at https://github.com/InstantStyle/InstantStyle.
+
+
+ comment: Technical Report +
+
+
+
+
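+ The two InstantStyle strategies listed in the abstract above can be sketched
+ in a few lines: (1) subtract a content embedding from the reference-image
+ embedding in a shared feature space to isolate style, and (2) feed the result
+ only into designated style-specific blocks. The encoders and block names
+ below are toy assumptions, not the released implementation.
+
+ import torch
+
+ def encode_image(feat):       # stand-in for a CLIP-like image encoder
+     return feat.mean(dim=(1, 2))
+
+ def encode_text(_prompt):     # stand-in for the paired text encoder
+     return torch.randn(512)
+
+ reference = torch.randn(512, 8, 8)                  # toy reference-image features
+ content_emb = encode_text("a cat")                  # describes the content only
+ style_emb = encode_image(reference) - content_emb   # strategy 1: decouple style
+
+ STYLE_BLOCKS = {"up_blocks.0.attentions.1"}         # assumed block name
+
+ def inject(block_name, hidden, style):
+     # strategy 2: only style-specific blocks receive the style embedding
+     if block_name in STYLE_BLOCKS:
+         return hidden + 0.5 * style[: hidden.shape[-1]]
+     return hidden
+
+ h = torch.randn(2, 77, 320)
+ out = inject("up_blocks.0.attentions.1", h, style_emb)
+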
+ + ☆ Event Camera Demosaicing via Swin Transformer and Pixel-focus Loss CVPR 2024 + + +
+ Recent research has highlighted improvements in high-quality imaging guided
+ by event cameras, with most of these efforts concentrating on the RGB domain.
+ However, these advancements frequently neglect the unique challenges
+ introduced by the inherent flaws in the sensor design of event cameras in the
+ RAW domain. Specifically, this sensor design results in the partial loss of
+ pixel values, posing new challenges for RAW domain processes like
+ demosaicing. The challenge intensifies as most research in the RAW domain is
+ based on the premise that each pixel contains a value, making the
+ straightforward adaptation of these methods to event camera demosaicing
+ problematic. To this end, we present a Swin-Transformer-based backbone and a
+ pixel-focus loss function for demosaicing with missing pixel values in RAW
+ domain processing. Our core motivation is to refine a general and widely
+ applicable foundational model from the RGB domain for RAW domain processing,
+ thereby broadening the model's applicability within the entire imaging
+ process. Our method harnesses multi-scale processing and space-to-depth
+ techniques to ensure efficiency and reduce computing complexity. We also
+ propose the Pixel-focus Loss function for network fine-tuning to improve
+ network convergence based on our discovery of a long-tailed distribution in
+ the training loss. Our method has undergone validation on the MIPI Demosaic
+ Challenge dataset, with subsequent analytical experimentation confirming its
+ efficacy. All code and trained models are released here:
+ https://github.com/yunfanLu/ev-demosaic
+
+
+ comment: Accepted for the CVPR 2024 Workshop on Mobile Intelligent Photography + & Imaging +
+
+
+
+
+ + ☆ Harnessing the Power of Large Vision Language Models for Synthetic Image + Detection + + +
+ In recent years, the emergence of models capable of generating images from +text has attracted considerable interest, offering the possibility of creating +realistic images from text descriptions. Yet these advances have also raised +concerns about the potential misuse of these images, including the creation of +misleading content such as fake news and propaganda. This study investigates +the effectiveness of using advanced vision-language models (VLMs) for synthetic +image identification. Specifically, the focus is on tuning state-of-the-art +image captioning models for synthetic image detection. By harnessing the robust +understanding capabilities of large VLMs, the aim is to distinguish authentic +images from synthetic images produced by diffusion-based models. This study +contributes to the advancement of synthetic image detection by exploiting the +capabilities of visual language models such as BLIP-2 and ViTGPT2. By tailoring +image captioning models, we address the challenges associated with the +potential misuse of synthetic images in real-world applications. Results +described in this paper highlight the promising role of VLMs in the field of +synthetic image detection, outperforming conventional image-based detection +techniques. Code and models can be found at +https://github.com/Mamadou-Keita/VLM-DETECT. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2404.01959 +
+
+
+
+
+ + ☆ Model-agnostic Origin Attribution of Generated Images with Few-shot + Examples + + +
+ Recent progress in visual generative models enables the generation of +high-quality images. To prevent the misuse of generated images, it is important +to identify the origin model that generates them. In this work, we study the +origin attribution of generated images in a practical setting where only a few +images generated by a source model are available and the source model cannot be +accessed. The goal is to check if a given image is generated by the source +model. We first formulate this problem as a few-shot one-class classification +task. To solve the task, we propose OCC-CLIP, a CLIP-based framework for +few-shot one-class classification, enabling the identification of an image's +source model, even among multiple candidates. Extensive experiments +corresponding to various generative models verify the effectiveness of our +OCC-CLIP framework. Furthermore, an experiment based on the recently released +DALL-E 3 API verifies the real-world applicability of our solution. + +
+
+
+
+
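+ As a rough illustration of the few-shot one-class setup formulated above (not
+ the OCC-CLIP method itself), one can embed the handful of images generated by
+ the source model with a frozen encoder, fit a one-class classifier, and score
+ a query image; `embed` below is an assumed stand-in for a CLIP-like encoder.
+
+ import numpy as np
+ from sklearn.svm import OneClassSVM
+
+ def embed(images):            # assumed frozen image encoder (placeholder features)
+     rng = np.random.default_rng(0)
+     return rng.normal(size=(len(images), 512))
+
+ few_shot = ["gen_0.png", "gen_1.png", "gen_2.png", "gen_3.png"]
+ clf = OneClassSVM(kernel="rbf", nu=0.1).fit(embed(few_shot))
+
+ query = embed(["query.png"])
+ attributed_to_source = clf.predict(query)[0] == 1   # +1: likely from the source model
+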
+ + ☆ Design2Cloth: 3D Cloth Generation from 2D Masks CVPR 2024 + + +
+ In recent years, there has been a significant shift in the field of digital
+ avatar research, towards modeling, animating and reconstructing clothed human
+ representations, as a key step towards creating realistic avatars. However,
+ current 3D cloth generation methods are garment specific or trained
+ completely on synthetic data, hence lacking fine details and realism. In this
+ work, we make a step towards automatic realistic garment design and propose
+ Design2Cloth, a high fidelity 3D generative model trained on a real world
+ dataset from more than 2000 subject scans. To provide a vital contribution to
+ the fashion industry, we developed a user-friendly adversarial model capable
+ of generating diverse and detailed clothes simply by drawing a 2D cloth mask.
+ Under a series of both qualitative and quantitative experiments, we showcase
+ that Design2Cloth outperforms current state-of-the-art cloth generative
+ models by a large margin. In addition to the generative properties of our
+ network, we showcase that the proposed method can be used to achieve high
+ quality reconstructions from single in-the-wild images and 3D scans. Dataset,
+ code and pre-trained model will become publicly available.
+
+
+ comment: Accepted to CVPR 2024, Project page: + https://jiali-zheng.github.io/Design2Cloth/ +
+
+
+
+
+ + ☆ Independently Keypoint Learning for Small Object Semantic Correspondence + + +
+ Semantic correspondence remains a challenging task for establishing
+ correspondences between a pair of images with the same category or similar
+ scenes due to large intra-class appearance variations. In this paper, we
+ introduce a novel problem called 'Small Object Semantic Correspondence
+ (SOSC).' This problem is challenging due to the close proximity of keypoints
+ associated with small objects, which results in the fusion of their
+ respective features, making it difficult to identify and recognize the
+ corresponding keypoints within the fused features. To address this challenge,
+ we propose the Keypoint Bounding box-centered Cropping (KBC) method, which
+ aims to increase the spatial separation between keypoints of small objects,
+ thereby facilitating independent learning of these keypoints. The KBC method
+ is seamlessly integrated into our proposed inference pipeline and can be
+ easily incorporated into other methodologies, resulting in significant
+ performance enhancements. Additionally, we introduce a novel framework, named
+ KBCNet, which serves as our baseline model. KBCNet comprises a Cross-Scale
+ Feature Alignment (CSFA) module and an efficient 4D convolutional decoder.
+ The CSFA module is designed to align multi-scale features, enriching keypoint
+ representations by integrating fine-grained features and deep semantic
+ features. Meanwhile, the 4D convolutional decoder, based on efficient 4D
+ convolution, ensures efficiency and rapid convergence. To empirically
+ validate the effectiveness of our proposed methodology, extensive experiments
+ are conducted on three widely used benchmarks: PF-PASCAL, PF-WILLOW, and
+ SPair-71k. Our KBC method demonstrates a substantial performance improvement
+ of 7.5\% on the SPair-71K dataset, providing compelling evidence of its
+ efficacy.
+
+
+
+
+
+ + ☆ RS-Mamba for Large Remote Sensing Image Dense Prediction + + +
+ The spatial resolution of remote sensing images is becoming increasingly +higher, posing challenges in handling large very-high-resolution (VHR) remote +sensing images for dense prediction tasks. Models based on convolutional neural +networks are limited in their ability to model global features of remote +sensing images due to local convolution operations. Transformer based models, +despite their global modeling capabilities, face computational challenges with +large VHR images due to their quadratic complexity. The common practice of +cropping large images into smaller patches leads to a significant loss of +contextual information. To address these issues, we propose the Remote Sensing +Mamba (RSM) for dense prediction tasks in VHR remote sensing. RSM is designed +to model global features of remote sensing images with linear complexity, +enabling it to process large VHR images effectively. It employs an +omnidirectional selective scan module to globally model the images in multiple +directions, capturing large spatial features from various directions. +Experiments on semantic segmentation and change detection tasks across various +objects demonstrate the effectiveness of RSM. With simple model architecture +and training approach, RSM achieves state-of-the-art performance on the dense +prediction tasks of VHR remote sensing. The code for this work will be +available at https://github.com/walking-shadow/Official_Remote_Sensing_Mamba. + +
+
+ comment: 13 pages,6 figures +
+
+
+
+
+ + ☆ A Satellite Band Selection Framework for Amazon Forest Deforestation + Detection Task GECCO 2024 + + +
+ The conservation of tropical forests is a topic of significant social and +ecological relevance due to their crucial role in the global ecosystem. +Unfortunately, deforestation and degradation impact millions of hectares +annually, necessitating government or private initiatives for effective forest +monitoring. This study introduces a novel framework that employs the Univariate +Marginal Distribution Algorithm (UMDA) to select spectral bands from Landsat-8 +satellite, optimizing the representation of deforested areas. This selection +guides a semantic segmentation architecture, DeepLabv3+, enhancing its +performance. Experimental results revealed several band compositions that +achieved superior balanced accuracy compared to commonly adopted combinations +for deforestation detection, utilizing segment classification via a Support +Vector Machine (SVM). Moreover, the optimal band compositions identified by the +UMDA-based approach improved the performance of the DeepLabv3+ architecture, +surpassing state-of-the-art approaches compared in this study. The observation +that a few selected bands outperform the total contradicts the data-driven +paradigm prevalent in the deep learning field. Therefore, this suggests an +exception to the conventional wisdom that 'more is always better'. + +
+
+ comment: 9 pages, 4 figures, paper accepted for presentation at GECCO 2024 +
+
+
+
+
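+ The Univariate Marginal Distribution Algorithm used above is a simple
+ estimation-of-distribution method and can be sketched directly: candidate
+ solutions are binary masks over the Landsat-8 bands, the best candidates of
+ each generation re-estimate per-band selection probabilities, and new
+ candidates are sampled from those marginals. The fitness function below is a
+ random placeholder; in the paper it would be the balanced accuracy of the
+ downstream deforestation classifier.
+
+ import numpy as np
+
+ rng = np.random.default_rng(42)
+ N_BANDS, POP, ELITE, GENERATIONS = 11, 40, 10, 20
+
+ def fitness(mask):                    # placeholder for segmentation accuracy
+     return rng.random() + 0.01 * mask.sum()
+
+ probs = np.full(N_BANDS, 0.5)         # marginal selection probability per band
+ for _ in range(GENERATIONS):
+     pop = (rng.random((POP, N_BANDS)) < probs).astype(int)
+     scores = np.array([fitness(ind) for ind in pop])
+     elite = pop[np.argsort(scores)[-ELITE:]]
+     probs = elite.mean(axis=0).clip(0.05, 0.95)   # re-estimate the marginals
+
+ best_bands = np.flatnonzero(probs > 0.5)
+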
+ + ☆ Non-negative Subspace Feature Representation for Few-shot Learning in + Medical Imaging + + +
+ Unlike typical visual scene recognition domains, in which massive datasets +are accessible to deep neural networks, medical image interpretations are often +obstructed by the paucity of data. In this paper, we investigate the +effectiveness of data-based few-shot learning in medical imaging by exploring +different data attribute representations in a low-dimensional space. We +introduce different types of non-negative matrix factorization (NMF) in +few-shot learning, addressing the data scarcity issue in medical image +classification. Extensive empirical studies are conducted in terms of +validating the effectiveness of NMF, especially its supervised variants (e.g., +discriminative NMF, and supervised and constrained NMF with sparseness), and +the comparison with principal component analysis (PCA), i.e., the collaborative +representation-based dimensionality reduction technique derived from +eigenvectors. With 14 different datasets covering 11 distinct illness +categories, thorough experimental results and comparison with related +techniques demonstrate that NMF is a competitive alternative to PCA for +few-shot learning in medical imaging, and the supervised NMF algorithms are +more discriminative in the subspace with greater effectiveness. Furthermore, we +show that the part-based representation of NMF, especially its supervised +variants, is dramatically impactful in detecting lesion areas in medical +imaging with limited samples. + +
+
+
+
+
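+ A minimal sketch of the kind of pipeline discussed above: non-negative
+ features of the few labelled images are factorized into a low-dimensional
+ non-negative subspace with plain NMF, and queries are classified by the
+ nearest class centroid in that subspace. The supervised NMF variants in the
+ paper add label information to the factorization; the data below is synthetic.
+
+ import numpy as np
+ from sklearn.decomposition import NMF
+
+ rng = np.random.default_rng(0)
+ X_support = rng.random((20, 4096))        # 20 labelled images, non-negative features
+ y_support = np.repeat(np.arange(4), 5)    # 4 classes x 5 shots
+ X_query = rng.random((8, 4096))
+
+ nmf = NMF(n_components=16, init="nndsvda", max_iter=400, random_state=0)
+ Z_support = nmf.fit_transform(X_support)  # subspace coefficients
+ Z_query = nmf.transform(X_query)
+
+ centroids = np.stack([Z_support[y_support == c].mean(axis=0) for c in range(4)])
+ pred = np.argmin(np.linalg.norm(Z_query[:, None] - centroids[None], axis=-1), axis=1)
+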
+ + ☆ SG-BEV: Satellite-Guided BEV Fusion for Cross-View Semantic Segmentation CVPR 2024 + + +
+ This paper aims at achieving fine-grained building attribute segmentation in +a cross-view scenario, i.e., using satellite and street-view image pairs. The +main challenge lies in overcoming the significant perspective differences +between street views and satellite views. In this work, we introduce SG-BEV, a +novel approach for satellite-guided BEV fusion for cross-view semantic +segmentation. To overcome the limitations of existing cross-view projection +methods in capturing the complete building facade features, we innovatively +incorporate Bird's Eye View (BEV) method to establish a spatially explicit +mapping of street-view features. Moreover, we fully leverage the advantages of +multiple perspectives by introducing a novel satellite-guided reprojection +module, optimizing the uneven feature distribution issues associated with +traditional BEV methods. Our method demonstrates significant improvements on +four cross-view datasets collected from multiple cities, including New York, +San Francisco, and Boston. On average across these datasets, our method +achieves an increase in mIOU by 10.13% and 5.21% compared with the +state-of-the-art satellite-based and cross-view methods. The code and datasets +of this work will be released at https://github.com/yejy53/SG-BEV. + +
+
+ comment: accepted by CVPR 2024 +
+
+
+
+
+ + ☆ 3DStyleGLIP: Part-Tailored Text-Guided 3D Neural Stylization + + +
+ 3D stylization, which entails the application of specific styles to +three-dimensional objects, holds significant commercial potential as it enables +the creation of diverse 3D objects with distinct moods and styles, tailored to +specific demands of different scenes. With recent advancements in text-driven +methods and artificial intelligence, the stylization process is increasingly +intuitive and automated, thereby diminishing the reliance on manual labor and +expertise. However, existing methods have predominantly focused on holistic +stylization, thereby leaving the application of styles to individual components +of a 3D object unexplored. In response, we introduce 3DStyleGLIP, a novel +framework specifically designed for text-driven, part-tailored 3D stylization. +Given a 3D mesh and a text prompt, 3DStyleGLIP leverages the vision-language +embedding space of the Grounded Language-Image Pre-training (GLIP) model to +localize the individual parts of the 3D mesh and modify their colors and local +geometries to align them with the desired styles specified in the text prompt. +3DStyleGLIP is effectively trained for 3D stylization tasks through a +part-level style loss working in GLIP's embedding space, supplemented by two +complementary learning techniques. Extensive experimental validation confirms +that our method achieves significant part-wise stylization capabilities, +demonstrating promising potential in advancing the field of 3D stylization. + +
+
+
+
+
+ + ☆ Multi-Scale Spatial-Temporal Self-Attention Graph Convolutional Networks + for Skeleton-based Action Recognition + + +
+ Skeleton-based gesture recognition methods have achieved great success using
+ Graph Convolutional Networks (GCNs). In addition, context-dependent adaptive
+ topology as neighborhood vertex information, together with an attention
+ mechanism, allows a model to better represent actions. In this paper, we
+ propose a self-attention GCN hybrid model, Multi-Scale Spatial-Temporal
+ self-attention (MSST)-GCN, that effectively improves modeling ability and
+ achieves state-of-the-art results on several datasets. We utilize a spatial
+ self-attention module with adaptive topology to understand intra-frame
+ interactions within a frame among different body parts, and a temporal
+ self-attention module to examine correlations between frames of a node. These
+ two modules are followed by a multi-scale convolutional network with
+ dilations, which captures not only the long-range temporal dependencies of
+ joints but also the long-range spatial dependencies (i.e., long-distance
+ dependencies) of node temporal behaviors. They are combined into high-level
+ spatial-temporal representations, and the predicted action is output with a
+ softmax classifier.
+
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ☆ Diffexplainer: Towards Cross-modal Global Explanations with Diffusion + Models + + +
+ We present DiffExplainer, a novel framework that, leveraging language-vision +models, enables multimodal global explainability. DiffExplainer employs +diffusion models conditioned on optimized text prompts, synthesizing images +that maximize class outputs and hidden features of a classifier, thus providing +a visual tool for explaining decisions. Moreover, the analysis of generated +visual descriptions allows for automatic identification of biases and spurious +features, as opposed to traditional methods that often rely on manual +intervention. The cross-modal transferability of language-vision models also +enables the possibility to describe decisions in a more human-interpretable +way, i.e., through text. We conduct comprehensive experiments, which include an +extensive user study, demonstrating the effectiveness of DiffExplainer on 1) +the generation of high-quality images explaining model decisions, surpassing +existing activation maximization methods, and 2) the automated identification +of biases and spurious features. + +
+
+
+
+
+ + ☆ Neural Radiance Fields with Torch Units + + +
+ Neural Radiance Fields (NeRF) give rise to learning-based 3D reconstruction
+ methods widely used in industrial applications. Although prevalent methods
+ achieve considerable improvements in small-scale scenes, accomplishing
+ reconstruction in complex and large-scale scenes is still challenging. First,
+ the background in complex scenes shows a large variance among different
+ views. Second, the current inference pattern, i.e., a pixel only relies on an
+ individual camera ray, fails to capture contextual information. To solve
+ these problems, we propose to enlarge the ray perception field and build up
+ the sample point interactions. In this paper, we design a novel inference
+ pattern that encourages a single camera ray to possess more contextual
+ information, and models the relationship among sample points on each camera
+ ray. To hold contextual information, a camera ray in our proposed method can
+ render a patch of pixels simultaneously. Moreover, we replace the MLP in
+ neural radiance field models with distance-aware convolutions to enhance the
+ feature propagation among sample points from the same camera ray. To
+ summarize, like a torchlight, a ray in our proposed method renders a patch of
+ the image. Thus, we call the proposed method Torch-NeRF. Extensive
+ experiments on KITTI-360 and LLFF show that Torch-NeRF exhibits excellent
+ performance.
+
+
+
+
+
+
+ ☆ Vestibular schwannoma growth prediction from longitudinal MRI by time
+ conditioned neural fields
+
+
+ Vestibular schwannomas (VS) are benign tumors that are generally managed by
+ active surveillance with MRI examination. To further assist clinical
+ decision-making and avoid overtreatment, an accurate prediction of tumor
+ growth based on longitudinal imaging is highly desirable. In this paper, we
+ introduce DeepGrowth, a deep learning method that incorporates neural fields
+ and recurrent neural networks for prospective tumor growth prediction. In the
+ proposed method, each tumor is represented as a signed distance function
+ (SDF) conditioned on a low-dimensional latent code. Unlike previous studies
+ that perform tumor shape prediction directly in the image space, we predict
+ the latent codes instead and then reconstruct future shapes from them. To
+ deal with irregular time intervals, we introduce a time-conditioned recurrent
+ module based on a ConvLSTM and a novel temporal encoding strategy, which
+ enables the proposed model to output varying tumor shapes over time. The
+ experiments on an in-house longitudinal VS dataset showed that the proposed
+ model significantly improved the performance ($\ge 1.6\%$ Dice score and
+ $\ge0.20$ mm 95\% Hausdorff distance), in particular for the top 20\% of
+ tumors that grow or shrink the most ($\ge 4.6\%$ Dice score and $\ge 0.73$ mm
+ 95\% Hausdorff distance). Our code is available at
+ https://github.com/cyjdswx/DeepGrowth
+
+
+
+
+
+ + ☆ Unsegment Anything by Simulating Deformation CVPR 2024 + + +
+ Foundation segmentation models, while powerful, pose a significant risk: they +enable users to effortlessly extract any objects from any digital content with +a single click, potentially leading to copyright infringement or malicious +misuse. To mitigate this risk, we introduce a new task "Anything Unsegmentable" +to grant any image "the right to be unsegmented". The ambitious pursuit of the +task is to achieve highly transferable adversarial attacks against all +prompt-based segmentation models, regardless of model parameterizations and +prompts. We highlight the non-transferable and heterogeneous nature of +prompt-specific adversarial noises. Our approach focuses on disrupting image +encoder features to achieve prompt-agnostic attacks. Intriguingly, targeted +feature attacks exhibit better transferability compared to untargeted ones, +suggesting the optimal update direction aligns with the image manifold. Based +on the observations, we design a novel attack named Unsegment Anything by +Simulating Deformation (UAD). Our attack optimizes a differentiable deformation +function to create a target deformed image, which alters structural information +while preserving achievable feature distance by adversarial example. Extensive +experiments verify the effectiveness of our approach, compromising a variety of +promptable segmentation models with different architectures and prompt +interfaces. We release the code at +https://github.com/jiahaolu97/anything-unsegmentable. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Active learning for efficient annotation in precision agriculture: a + use-case on crop-weed semantic segmentation + + +
+ Optimizing deep learning models requires large amounts of annotated images,
+ a process that is both time-intensive and costly, especially for semantic
+ segmentation models in which every pixel must be annotated. A potential
+ strategy to mitigate annotation effort is active learning. Active learning
+ facilitates the identification and selection of the most informative images
+ from a large unlabelled pool. The underlying premise is that these selected
+ images can improve the model's performance faster than random selection,
+ thereby reducing annotation effort. While active learning has demonstrated
+ promising results on benchmark datasets like Cityscapes, its performance in
+ the agricultural domain remains largely unexplored. This study addresses this
+ research gap by conducting a comparative study of three active learning-based
+ acquisition functions: Bayesian Active Learning by Disagreement (BALD),
+ stochastic-based BALD (PowerBALD), and Random. The acquisition functions were
+ tested on two agricultural datasets: Sugarbeet and Corn-Weed, both containing
+ three semantic classes: background, crop and weed. Our results indicated that
+ active learning, especially PowerBALD, yields a higher performance than
+ random sampling on both datasets. However, due to the relatively large
+ standard deviations, the differences observed were minimal; this was partly
+ caused by high image redundancy and imbalanced classes. Specifically, more
+ than 89\% of the pixels belonged to the background class on both datasets.
+ The absence of significant results on both datasets indicates that further
+ research is required for applying active learning to agricultural datasets,
+ especially if they contain a high class imbalance and redundant images.
+ Recommendations and insights are provided in this paper to potentially
+ resolve such issues.
+
+
+
+
+
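+ The BALD score compared above is the mutual information between predictions
+ and model parameters, estimated from Monte-Carlo dropout passes as the
+ entropy of the mean prediction minus the mean entropy of the individual
+ predictions; PowerBALD then samples the batch stochastically instead of
+ taking a deterministic top-k. The sketch below uses synthetic probabilities
+ and an assumed Gumbel-top-k form for the stochastic variant.
+
+ import numpy as np
+
+ def entropy(p, axis=-1, eps=1e-12):
+     return -(p * np.log(p + eps)).sum(axis=axis)
+
+ rng = np.random.default_rng(0)
+ T, N, C = 8, 1000, 3                               # MC passes, pool size, classes
+ probs = rng.dirichlet(np.ones(C), size=(T, N))     # (T, N, C) dropout predictions
+
+ mean_p = probs.mean(axis=0)
+ bald = entropy(mean_p) - entropy(probs).mean(axis=0)   # (N,) acquisition scores
+
+ # PowerBALD-style selection (assumed form): perturb log-scores with Gumbel
+ # noise and take the top-k, rather than the deterministic argmax of BALD.
+ beta, k = 1.0, 16
+ gumbel = -np.log(-np.log(rng.random(N)))
+ selected = np.argsort(beta * np.log(bald + 1e-12) + gumbel)[-k:]
+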
+ + ☆ Knowledge Distillation with Multi-granularity Mixture of Priors for + Image Super-Resolution + + +
+ Knowledge distillation (KD) is a promising yet challenging model compression
+ technique that transfers rich learning representations from a well-performing
+ but cumbersome teacher model to a compact student model. Previous methods for
+ image super-resolution (SR) mostly compare the feature maps directly or after
+ standardizing the dimensions with basic algebraic operations (e.g. average,
+ dot-product). However, the intrinsic semantic differences among feature maps
+ are overlooked, which are caused by the disparate expressive capacity between
+ the networks. This work presents MiPKD, a multi-granularity mixture of priors
+ KD framework, to facilitate efficient SR models through feature mixture in a
+ unified latent space and stochastic network block mixture. Extensive
+ experiments demonstrate the effectiveness of the proposed MiPKD method.
+
+
+
+
+
+ + ☆ Representation Alignment Contrastive Regularization for Multi-Object + Tracking + + +
+ Achieving high-performance in multi-object tracking algorithms heavily relies +on modeling spatio-temporal relationships during the data association stage. +Mainstream approaches encompass rule-based and deep learning-based methods for +spatio-temporal relationship modeling. While the former relies on physical +motion laws, offering wider applicability but yielding suboptimal results for +complex object movements, the latter, though achieving high-performance, lacks +interpretability and involves complex module designs. This work aims to +simplify deep learning-based spatio-temporal relationship models and introduce +interpretability into features for data association. Specifically, a +lightweight single-layer transformer encoder is utilized to model +spatio-temporal relationships. To make features more interpretative, two +contrastive regularization losses based on representation alignment are +proposed, derived from spatio-temporal consistency rules. By applying weighted +summation to affinity matrices, the aligned features can seamlessly integrate +into the data association stage of the original tracking workflow. Experimental +results showcase that our model enhances the majority of existing tracking +networks' performance without excessive complexity, with minimal increase in +training overhead and nearly negligible computational and storage costs. + +
+
+
+
+
+ + ☆ Regional biases in image geolocation estimation: a case study with the + SenseCity Africa dataset + + +
+ Advances in Artificial Intelligence are challenged by the biases rooted in +the datasets used to train the models. In image geolocation estimation, models +are mostly trained using data from specific geographic regions, notably the +Western world, and as a result, they may struggle to comprehend the +complexities of underrepresented regions. To assess this issue, we apply a +state-of-the-art image geolocation estimation model (ISNs) to a crowd-sourced +dataset of geolocated images from the African continent (SCA100), and then +explore the regional and socioeconomic biases underlying the model's +predictions. Our findings show that the ISNs model tends to over-predict image +locations in high-income countries of the Western world, which is consistent +with the geographic distribution of its training data, i.e., the IM2GPS3k +dataset. Accordingly, when compared to the IM2GPS3k benchmark, the accuracy of +the ISNs model notably decreases at all scales. Additionally, we cluster images +of the SCA100 dataset based on how accurately they are predicted by the ISNs +model and show the model's difficulties in correctly predicting the locations +of images in low income regions, especially in Sub-Saharan Africa. Therefore, +our results suggest that using IM2GPS3k as a training set and benchmark for +image geolocation estimation and other computer vision models overlooks its +potential application in the African context. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ Semi-Supervised Unconstrained Head Pose Estimation in the Wild + + +
+ Existing head pose estimation datasets are either composed of numerous
+ samples by non-realistic synthesis or lab collection, or limited images by
+ labor-intensive annotating. This makes deep supervised learning based
+ solutions compromised due to the reliance on generous labeled data. To
+ alleviate it, we propose the first semi-supervised unconstrained head pose
+ estimation (SemiUHPE) method, which can leverage a large amount of unlabeled
+ wild head images. Specifically, we follow the recent semi-supervised rotation
+ regression, and focus on the diverse and complex head pose domain. Firstly,
+ we claim that the aspect-ratio invariant cropping of heads is superior to the
+ previous landmark-based affine alignment, which does not fit unlabeled
+ natural heads or practical applications where landmarks are often
+ unavailable. Then, instead of using an empirically fixed threshold to filter
+ out pseudo labels, we propose dynamic entropy-based filtering by updating
+ thresholds for adaptively removing unlabeled outliers. Moreover, we revisit
+ the design of weak-strong augmentations, and further exploit its superiority
+ by devising two novel head-oriented strong augmentations named
+ pose-irrelevant cut-occlusion and pose-altering rotation consistency.
+ Extensive experiments show that SemiUHPE can surpass SOTAs with remarkable
+ improvements on public benchmarks under both front-range and full-range
+ settings. Our code is released at https://github.com/hnuzhy/SemiUHPE.
+
+
+ comment: 14 pages. Semi-Supervised Unconstrained Head Pose Estimation +
+
+
+
+
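+ The dynamic entropy-based filtering described above can be sketched as a
+ small helper that, instead of a fixed confidence threshold, keeps updating
+ the entropy threshold (here as an exponential moving average of a batch
+ quantile) so that unlabeled outliers are dropped adaptively as the model
+ improves. The quantile and momentum values are illustrative assumptions.
+
+ import torch
+
+ class DynamicEntropyFilter:
+     def __init__(self, q=0.7, momentum=0.9):
+         self.q, self.momentum, self.threshold = q, momentum, None
+
+     def __call__(self, probs):                 # probs: (B, K) predicted pose bins
+         ent = -(probs * (probs + 1e-8).log()).sum(dim=-1)
+         batch_thr = torch.quantile(ent, self.q)
+         if self.threshold is None:
+             self.threshold = batch_thr
+         else:
+             self.threshold = self.momentum * self.threshold + (1 - self.momentum) * batch_thr
+         return ent <= self.threshold           # mask of retained pseudo-labels
+
+ filt = DynamicEntropyFilter()
+ probs = torch.softmax(torch.randn(32, 66), dim=-1)   # toy weak-augmentation outputs
+ keep = filt(probs)
+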
+ + ☆ Severity Controlled Text-to-Image Generative Model Bias Manipulation + + +
+ Text-to-image (T2I) generative models are gaining wide popularity, especially +in public domains. However, their intrinsic bias and potential malicious +manipulations remain under-explored. Charting the susceptibility of T2I models +to such manipulation, we first expose the new possibility of a dynamic and +computationally efficient exploitation of model bias by targeting the embedded +language models. By leveraging mathematical foundations of vector algebra, our +technique enables a scalable and convenient control over the severity of output +manipulation through model bias. As a by-product, this control also allows a +form of precise prompt engineering to generate images which are generally +implausible with regular text prompts. We also demonstrate a constructive +application of our manipulation for balancing the frequency of generated +classes - as in model debiasing. Our technique does not require training and is +also framed as a backdoor attack with severity control using semantically-null +text triggers in the prompts. With extensive analysis, we present interesting +qualitative and quantitative results to expose potential manipulation +possibilities for T2I models. + Key-words: Text-to-Image Models, Generative Models, Backdoor Attacks, Prompt +Engineering, Bias + +
+
+ comment: This research was supported by National Intelligence and Security + Discovery Research Grants (project# NS220100007), funded by the Department of + Defence Australia +
+
+
+
+
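+ The vector-algebra manipulation described above reduces to shifting the
+ prompt embedding along a bias direction with a scalar that sets the severity.
+ The sketch below illustrates that idea only; `encode_prompt` is an assumed
+ stand-in for the T2I model's frozen text encoder, and the concept pair is
+ arbitrary.
+
+ import torch
+
+ def encode_prompt(_text):                 # assumed frozen text encoder
+     return torch.randn(77, 768)
+
+ prompt_emb = encode_prompt("a photo of a doctor")
+ direction = encode_prompt("an old person") - encode_prompt("a young person")
+
+ severity = 0.6                            # 0: unchanged; larger: stronger shift
+ manipulated = prompt_emb + severity * direction
+ # `manipulated` would be fed to the diffusion model in place of `prompt_emb`.
+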
+ + ☆ Weakly-Supervised 3D Scene Graph Generation via Visual-Linguistic + Assisted Pseudo-labeling + + +
+ Learning to build 3D scene graphs is essential for real-world perception in a +structured and rich fashion. However, previous 3D scene graph generation +methods utilize a fully supervised learning manner and require a large amount +of entity-level annotation data of objects and relations, which is extremely +resource-consuming and tedious to obtain. To tackle this problem, we propose +3D-VLAP, a weakly-supervised 3D scene graph generation method via +Visual-Linguistic Assisted Pseudo-labeling. Specifically, our 3D-VLAP exploits +the superior ability of current large-scale visual-linguistic models to align +the semantics between texts and 2D images, as well as the naturally existing +correspondences between 2D images and 3D point clouds, and thus implicitly +constructs correspondences between texts and 3D point clouds. First, we +establish the positional correspondence from 3D point clouds to 2D images via +camera intrinsic and extrinsic parameters, thereby achieving alignment of 3D +point clouds and 2D images. Subsequently, a large-scale cross-modal +visual-linguistic model is employed to indirectly align 3D instances with the +textual category labels of objects by matching 2D images with object category +labels. The pseudo labels for objects and relations are then produced for +3D-VLAP model training by calculating the similarity between visual embeddings +and textual category embeddings of objects and relations encoded by the +visual-linguistic model, respectively. Ultimately, we design an edge +self-attention based graph neural network to generate scene graphs of 3D point +cloud scenes. Extensive experiments demonstrate that our 3D-VLAP achieves +comparable results with current advanced fully supervised methods, meanwhile +significantly alleviating the pressure of data annotation. + +
+
+ comment: 11 pages, 9 figures +
+
+
+
+
+ + ☆ Text-driven Affordance Learning from Egocentric Vision + + +
+ Visual affordance learning is a key component for robots to understand how
+ to interact with objects. Conventional approaches in this field rely on
+ pre-defined objects and actions, falling short of capturing diverse
+ interactions in real-world scenarios. The key idea of our approach is
+ employing textual instruction, targeting various affordances for a wide range
+ of objects. This approach covers both hand-object and tool-object
+ interactions. We introduce text-driven affordance learning, aiming to learn
+ contact points and manipulation trajectories from an egocentric view
+ following textual instruction. In our task, contact points are represented as
+ heatmaps, and the manipulation trajectory as sequences of coordinates that
+ incorporate both linear and rotational movements for various manipulations.
+ However, when we gather data for this task, manual annotations of these
+ diverse interactions are costly. To this end, we propose a pseudo dataset
+ creation pipeline and build a large pseudo-training dataset: TextAFF80K,
+ consisting of over 80K instances of the contact points, trajectories, images,
+ and text tuples. We extend existing referring expression comprehension models
+ for our task, and experimental results show that our approach robustly
+ handles multiple affordances, serving as a new standard for affordance
+ learning in real-world scenarios.
+
+
+
+
+
+ + ☆ CPAISD: Core-penumbra acute ischemic stroke dataset + + +
+ We introduce the CPAISD: Core-Penumbra Acute Ischemic Stroke Dataset, aimed +at enhancing the early detection and segmentation of ischemic stroke using +Non-Contrast Computed Tomography (NCCT) scans. Addressing the challenges in +diagnosing acute ischemic stroke during its early stages due to often +non-revealing native CT findings, the dataset provides a collection of +segmented NCCT images. These include annotations of ischemic core and penumbra +regions, critical for developing machine learning models for rapid stroke +identification and assessment. By offering a carefully collected and annotated +dataset, we aim to facilitate the development of advanced diagnostic tools, +contributing to improved patient care and outcomes in stroke management. Our +dataset's uniqueness lies in its focus on the acute phase of ischemic stroke, +with non-informative native CT scans, and includes a baseline model to +demonstrate the dataset's application, encouraging further research and +innovation in the field of medical imaging and stroke diagnosis. + +
+
+
+
+
+ + ☆ HENet: Hybrid Encoding for End-to-end Multi-task 3D Perception from + Multi-view Cameras + + +
+ Three-dimensional perception from multi-view cameras is a crucial component +in autonomous driving systems, which involves multiple tasks like 3D object +detection and bird's-eye-view (BEV) semantic segmentation. To improve +perception precision, large image encoders, high-resolution images, and +long-term temporal inputs have been adopted in recent 3D perception models, +bringing remarkable performance gains. However, these techniques are often +incompatible in training and inference scenarios due to computational resource +constraints. Besides, modern autonomous driving systems prefer to adopt an +end-to-end framework for multi-task 3D perception, which can simplify the +overall system architecture and reduce the implementation complexity. However, +conflict between tasks often arises when optimizing multiple tasks jointly +within an end-to-end 3D perception model. To alleviate these issues, we present +an end-to-end framework named HENet for multi-task 3D perception in this paper. +Specifically, we propose a hybrid image encoding network, using a large image +encoder for short-term frames and a small image encoder for long-term temporal +frames. Then, we introduce a temporal feature integration module based on the +attention mechanism to fuse the features of different frames extracted by the +two aforementioned hybrid image encoders. Finally, according to the +characteristics of each perception task, we utilize BEV features of different +grid sizes, independent BEV encoders, and task decoders for different tasks. +Experimental results show that HENet achieves state-of-the-art end-to-end +multi-task 3D perception results on the nuScenes benchmark, including 3D object +detection and BEV semantic segmentation. The source code and models will be +released at https://github.com/VDIGPKU/HENet. + +
+
+
+
+
+ + ☆ Freditor: High-Fidelity and Transferable NeRF Editing by Frequency + Decomposition + + +
+ This paper enables high-fidelity, transferable NeRF editing by frequency
+ decomposition. Recent NeRF editing pipelines lift 2D stylization results to
+ 3D scenes while suffering from blurry results, and fail to capture detailed
+ structures caused by the inconsistency between 2D edits. Our critical insight
+ is that low-frequency components of images are more multiview-consistent
+ after editing compared with their high-frequency parts. Moreover, the
+ appearance style is mainly exhibited in the low-frequency components, while
+ the content details especially reside in the high-frequency parts. This
+ motivates us to perform editing on low-frequency components, which results in
+ high-fidelity edited scenes. In addition, the editing is performed in the
+ low-frequency feature space, enabling stable intensity control and novel
+ scene transfer. Comprehensive experiments conducted on photorealistic
+ datasets demonstrate the superior performance of high-fidelity and
+ transferable NeRF editing. The project page is at
+ https://aigc3d.github.io/freditor.
+
+
+
+
+
+ + ☆ VIAssist: Adapting Multi-modal Large Language Models for Users with + Visual Impairments + + +
+ Individuals with visual impairments, encompassing both partial and total +difficulties in visual perception, are referred to as visually impaired (VI) +people. An estimated 2.2 billion individuals worldwide are affected by visual +impairments. Recent advancements in multi-modal large language models (MLLMs) +have showcased their extraordinary capabilities across various domains. It is +desirable to help VI individuals with MLLMs' great capabilities of visual +understanding and reasoning. However, it is challenging for VI people to use +MLLMs due to the difficulties in capturing the desirable images to fulfill +their daily requests. For example, the target object is not fully or partially +placed in the image. This paper explores how to leverage MLLMs for VI +individuals to provide visual-question answers. VIAssist can identify undesired +images and provide detailed actions. Finally, VIAssist can provide reliable +answers to users' queries based on the images. Our results show that VIAssist +provides +0.21 and +0.31 higher BERTScore and ROUGE scores than the baseline, +respectively. + +
+
+ comment: Accepted to IEEE International Workshop on Foundation Models for + Cyber-Physical Systems & Internet of Things (FMSys 2024) +
+
+
+
+
+ + ☆ A Unified Membership Inference Method for Visual Self-supervised Encoder + via Part-aware Capability + + +
+ Self-supervised learning shows promise in harnessing extensive unlabeled
+ data, but it also confronts significant privacy concerns, especially in
+ vision. In this paper, we aim to perform membership inference on visual
+ self-supervised models in a more realistic setting: the self-supervised
+ training method and details are unknown to an adversary when attacking, as
+ they usually face a black-box system in practice. In this setting,
+ considering that a self-supervised model could be trained with completely
+ different self-supervised paradigms, e.g., masked image modeling and
+ contrastive learning, with complex training details, we propose a unified
+ membership inference method called PartCrop. It is motivated by the shared
+ part-aware capability among models and the stronger part response on the
+ training data. Specifically, PartCrop crops parts of objects in an image to
+ query responses with the image in representation space. We conduct extensive
+ attacks on self-supervised models with different training protocols and
+ structures using three widely used image datasets. The results verify the
+ effectiveness and generalization of PartCrop. Moreover, to defend against
+ PartCrop, we evaluate two common approaches, i.e., early stopping and
+ differential privacy, and propose a tailored method called shrinking crop
+ scale range. The defense experiments indicate that all of them are effective.
+ Our code is available at https://github.com/JiePKU/PartCrop
+
+
+ comment: Membership Inference, Self-supervised learning +
+
+
+
+
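+ The PartCrop intuition above (training members respond more strongly to
+ queries made from cropped parts) can be sketched as follows: random part
+ crops are embedded by the frozen self-supervised encoder and compared with
+ the full-image representation, and the aggregated similarity serves as the
+ membership signal. The tiny encoder and the aggregation are stand-ins, not
+ the full attack pipeline.
+
+ import torch
+ import torch.nn.functional as F
+
+ encoder = torch.nn.Sequential(                      # toy frozen encoder
+     torch.nn.Conv2d(3, 16, 3, stride=2), torch.nn.ReLU(),
+     torch.nn.AdaptiveAvgPool2d(1), torch.nn.Flatten())
+
+ def part_crop_score(image, n_crops=16, crop=64):
+     _, _, h, w = image.shape
+     full = F.normalize(encoder(image), dim=-1)
+     sims = []
+     for _ in range(n_crops):
+         y = torch.randint(0, h - crop, (1,)).item()
+         x = torch.randint(0, w - crop, (1,)).item()
+         part = image[:, :, y:y + crop, x:x + crop]
+         sims.append(F.cosine_similarity(F.normalize(encoder(part), dim=-1), full))
+     return torch.stack(sims).mean()       # higher -> more likely a training member
+
+ score = part_crop_score(torch.randn(1, 3, 224, 224))
+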
+ + ☆ TSNet:A Two-stage Network for Image Dehazing with Multi-scale Fusion and + Adaptive Learning + + +
+ Image dehazing has been a popular topic of research for a long time. Previous
+ deep learning-based image dehazing methods have failed to achieve
+ satisfactory dehazing effects on both synthetic datasets and real-world
+ datasets, exhibiting poor generalization. Moreover, single-stage networks
+ often result in many regions with artifacts and color distortion in output
+ images. To address these issues, this paper proposes a two-stage image
+ dehazing network called TSNet, mainly consisting of the multi-scale fusion
+ module (MSFM) and the adaptive learning module (ALM). Specifically, MSFM and
+ ALM enhance the generalization of TSNet. The MSFM can obtain large receptive
+ fields at multiple scales and integrate features at different frequencies to
+ reduce the differences between inputs and learning objectives. The ALM can
+ actively learn regions of interest in images and restore texture details more
+ effectively. Additionally, TSNet is designed as a two-stage network, where
+ the first-stage network performs image dehazing, and the second-stage network
+ is employed to improve issues such as artifacts and color distortion present
+ in the results of the first-stage network. We also change the learning
+ objective from ground truth images to opposite fog maps, which improves the
+ learning efficiency of TSNet. Extensive experiments demonstrate that TSNet
+ exhibits superior dehazing performance on both synthetic and real-world
+ datasets compared to previous state-of-the-art methods.
+
+
+ comment: 12 pages, 10 figures, 7 tables +
+
+
+
+
+ + ☆ RS3Mamba: Visual State Space Model for Remote Sensing Images Semantic + Segmentation + + +
+ Semantic segmentation of remote sensing images is a fundamental task in +geoscience research. However, there are some significant shortcomings for the +widely used convolutional neural networks (CNNs) and Transformers. The former +is limited by its insufficient long-range modeling capabilities, while the +latter is hampered by its computational complexity. Recently, a novel visual +state space (VSS) model represented by Mamba has emerged, capable of modeling +long-range relationships with linear computability. In this work, we propose a +novel dual-branch network named remote sensing images semantic segmentation +Mamba (RS3Mamba) to incorporate this innovative technology into remote sensing +tasks. Specifically, RS3Mamba utilizes VSS blocks to construct an auxiliary +branch, providing additional global information to convolution-based main +branch. Moreover, considering the distinct characteristics of the two branches, +we introduce a collaborative completion module (CCM) to enhance and fuse +features from the dual-encoder. Experimental results on two widely used +datasets, ISPRS Vaihingen and LoveDA Urban, demonstrate the effectiveness and +potential of the proposed RS3Mamba. To the best of our knowledge, this is the +first vision Mamba specifically designed for remote sensing images semantic +segmentation. The source code will be made available at +https://github.com/sstary/SSRS. + +
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ A Novel Approach to Breast Cancer Histopathological Image Classification + Using Cross-Colour Space Feature Fusion and Quantum-Classical Stack Ensemble + Method + + +
+ Breast cancer classification stands as a pivotal pillar in ensuring timely
+ diagnosis and effective treatment. This study with histopathological images
+ underscores the profound significance of harnessing the synergistic
+ capabilities of colour space ensembling and quantum-classical stacking to
+ elevate the precision of breast cancer classification. By delving into the
+ distinct colour spaces of RGB, HSV and CIE L*u*v, the authors initiated a
+ comprehensive investigation guided by advanced methodologies. Employing the
+ DenseNet121 architecture for feature extraction, the authors have capitalized
+ on the robustness of Random Forest, SVM, QSVC, and VQC classifiers. This
+ research encompasses a unique feature fusion technique within the colour
+ space ensemble. This approach not only deepens our comprehension of breast
+ cancer classification but also marks a milestone in personalized medical
+ assessment. The amalgamation of quantum and classical classifiers through
+ stacking emerges as a potent catalyst, effectively mitigating the inherent
+ constraints of individual classifiers, paving a robust path towards more
+ dependable and refined breast cancer identification. Through rigorous
+ experimentation and meticulous analysis, the fusion of colour spaces like RGB
+ with HSV and RGB with CIE L*u*v yields a classification accuracy nearing the
+ value of unity. This underscores the transformative potential of our
+ approach, where the fusion of diverse colour spaces and the synergy of
+ quantum and classical realms converge to establish a new horizon in medical
+ diagnostics. Thus the implications of this research extend across medical
+ disciplines, offering promising avenues for advancing diagnostic accuracy and
+ treatment efficacy.
+
+
+
+
+
+ + ☆ RESSA: Repair Sparse Vision-Language Models via Sparse Cross-Modality + Adaptation + + +
+ Vision-Language Models (VLMs), integrating diverse information from multiple +modalities, have shown remarkable success across various tasks. However, +deploying VLMs, comprising large-scale vision and language models poses +challenges in resource-constrained scenarios. While pruning followed by +finetuning offers a potential solution to maintain performance with smaller +model sizes, its application to VLMs remains relatively unexplored, presenting +two main questions: how to distribute sparsity across different +modality-specific models, and how to repair the performance of pruned sparse +VLMs. To answer the first question, we conducted preliminary studies on VLM +pruning and found that pruning vision models and language models with the same +sparsity ratios contribute to nearly optimal performance. For the second +question, unlike finetuning unimodal sparse models, sparse VLMs involve +cross-modality interactions, requiring specialized techniques for post-pruning +performance repair. Moreover, while parameter-efficient LoRA finetuning has +been proposed to repair the performance of sparse models, a significant +challenge of weights merging arises due to the incompatibility of dense LoRA +modules with sparse models that destroy the sparsity of pruned models. To +tackle these challenges, we propose to Repair Sparse Vision-Language Models via +Sparse Cross-modality Adaptation (RESSA). RESSA utilizes cross-modality +finetuning to enhance task-specific performance and facilitate knowledge +distillation from original dense models. Additionally, we introduce SparseLoRA, +which applies sparsity directly to LoRA weights, enabling seamless integration +with sparse models. Our experimental results validate the effectiveness of +RESSA, showcasing significant enhancements, such as an 11.3\% improvement under +2:4 sparsity and a remarkable 47.6\% enhancement under unstructured 70\% +sparsity. + +
+
+
+
+
+ + ☆ What Are We Measuring When We Evaluate Large Vision-Language Models? An + Analysis of Latent Factors and Biases + + +
+ Vision-language (VL) models, pretrained on colossal image-text datasets, have +attained broad VL competence that is difficult to evaluate. A common belief is +that a small number of VL skills underlie the variety of VL tests. In this +paper, we perform a large-scale transfer learning experiment aimed at +discovering latent VL skills from data. We reveal interesting characteristics +that have important implications for test suite design. First, generation tasks +suffer from a length bias, suggesting benchmarks should balance tasks with +varying output lengths. Second, we demonstrate that factor analysis +successfully identifies reasonable yet surprising VL skill factors, suggesting +benchmarks could leverage similar analyses for task selection. Finally, we +present a new dataset, OLIVE (https://github.com/jq-zh/olive-dataset), which +simulates user instructions in the wild and presents challenges dissimilar to +all datasets we tested. Our findings contribute to the design of balanced and +broad-coverage vision-language evaluation methods. + +
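+
+ As a toy illustration of the factor-analysis step (random stand-in scores,
+ not the paper's data), one can factor a model-by-benchmark score matrix and
+ inspect which benchmarks load on each latent skill:
+
+ ```python
+ import numpy as np
+ from sklearn.decomposition import FactorAnalysis
+ from sklearn.preprocessing import StandardScaler
+
+ rng = np.random.default_rng(0)
+ scores = rng.random((30, 12))               # 30 models x 12 VL benchmarks (dummy)
+
+ X = StandardScaler().fit_transform(scores)  # z-score each benchmark
+ fa = FactorAnalysis(n_components=3, rotation="varimax", random_state=0)
+ fa.fit(X)
+
+ loadings = fa.components_                   # (factors, benchmarks)
+ for i, row in enumerate(loadings):
+     top = np.argsort(-np.abs(row))[:3]
+     print(f"factor {i}: most-loaded benchmarks -> {top.tolist()}")
+ ```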
+
+
+
+
+ + ☆ TCLC-GS: Tightly Coupled LiDAR-Camera Gaussian Splatting for Surrounding + Autonomous Driving Scenes + + +
+ Most 3D Gaussian Splatting (3D-GS) based methods for urban scenes initialize +3D Gaussians directly with 3D LiDAR points, which not only underutilizes LiDAR +data capabilities but also overlooks the potential advantages of fusing LiDAR +with camera data. In this paper, we design a novel tightly coupled LiDAR-Camera +Gaussian Splatting (TCLC-GS) to fully leverage the combined strengths of both +LiDAR and camera sensors, enabling rapid, high-quality 3D reconstruction and +novel view RGB/depth synthesis. TCLC-GS designs a hybrid explicit (colorized 3D +mesh) and implicit (hierarchical octree feature) 3D representation derived from +LiDAR-camera data, to enrich the properties of 3D Gaussians for splatting. 3D +Gaussian's properties are not only initialized in alignment with the 3D mesh +which provides more completed 3D shape and color information, but are also +endowed with broader contextual information through retrieved octree implicit +features. During the Gaussian Splatting optimization process, the 3D mesh +offers dense depth information as supervision, which enhances the training +process by learning of a robust geometry. Comprehensive evaluations conducted +on the Waymo Open Dataset and nuScenes Dataset validate our method's +state-of-the-art (SOTA) performance. Utilizing a single NVIDIA RTX 3090 Ti, our +method demonstrates fast training and achieves real-time RGB and depth +rendering at 90 FPS in resolution of 1920x1280 (Waymo), and 120 FPS in +resolution of 1600x900 (nuScenes) in urban scenarios. + +
+
+
+
+
+ + ☆ TE-TAD: Towards Full End-to-End Temporal Action Detection via + Time-Aligned Coordinate Expression + + +
+ In this paper, we show that the normalized coordinate expression is a key
+ factor behind the reliance on hand-crafted components in query-based
+ detectors for temporal action detection (TAD). Despite significant
+ advancements towards an end-to-end framework in object detection, query-based
+ detectors have been limited in achieving full end-to-end modeling in TAD. To
+ address this issue, we propose TE-TAD, a full end-to-end temporal action
+ detection transformer that integrates time-aligned coordinate expression. We
+ reformulate the coordinate expression using actual timeline values, ensuring
+ length-invariant representations across extremely diverse video durations.
+ Furthermore, our proposed adaptive query selection dynamically adjusts the
+ number of queries based on video length, providing a suitable solution for
+ varying video durations compared to a fixed query set. Our approach not only
+ simplifies the TAD process by eliminating the need for hand-crafted components
+ but also significantly improves the performance of query-based detectors. Our
+ TE-TAD outperforms previous query-based detectors and achieves competitive
+ performance compared to state-of-the-art methods on popular benchmark
+ datasets. Code is available at: https://github.com/Dotori-HJ/TE-TAD
+
+
+
+
+
+ + ☆ Enhancing Diffusion-based Point Cloud Generation with Smoothness + Constraint + + +
+ Diffusion models have been popular for point cloud generation tasks. Existing
+ works utilize the forward diffusion process to convert the original point
+ distribution into a noise distribution and then learn the reverse diffusion
+ process to recover the point distribution from the noise distribution.
+ However, the reverse diffusion process can produce samples with non-smooth
+ points on the surface because the geometric properties of the point cloud are
+ ignored. We propose to alleviate this problem by incorporating a local
+ smoothness constraint into the diffusion framework for point cloud generation.
+ Experiments demonstrate that the proposed model generates realistic shapes and
+ smoother point clouds, outperforming multiple state-of-the-art methods.
+
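+
+ One simple way to write down a local smoothness penalty of this kind (a
+ hedged sketch, not necessarily the authors' exact constraint) is to penalise
+ each point's distance from the centroid of its k nearest neighbours; being
+ differentiable, it can be added to the diffusion training loss:
+
+ ```python
+ import torch
+
+ def knn_smoothness_loss(points: torch.Tensor, k: int = 8) -> torch.Tensor:
+     """points: (N, 3). Returns a scalar Laplacian-style smoothness penalty."""
+     d = torch.cdist(points, points)                    # (N, N) pairwise distances
+     idx = d.topk(k + 1, largest=False).indices[:, 1:]  # k nearest, drop self-match
+     neighbours = points[idx]                           # (N, k, 3)
+     centroid = neighbours.mean(dim=1)                  # (N, 3)
+     return ((points - centroid) ** 2).sum(dim=1).mean()
+
+ pts = torch.randn(1024, 3, requires_grad=True)
+ loss = knn_smoothness_loss(pts)
+ loss.backward()                                        # differentiable
+ print(float(loss))
+ ```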
+
+
+
+
+ + ☆ Cohort-Individual Cooperative Learning for Multimodal Cancer Survival + Analysis + + +
+ Recently, we have witnessed impressive achievements in cancer survival
+ analysis by integrating multimodal data, e.g., pathology images and genomic
+ profiles. However, the heterogeneity and high dimensionality of these
+ modalities pose significant challenges for extracting discriminative
+ representations while maintaining good generalization. In this paper, we
+ propose a Cohort-individual Cooperative Learning (CCL) framework to advance
+ cancer survival analysis by combining knowledge decomposition with cohort
+ guidance. Specifically, first, we propose a Multimodal Knowledge
+ Decomposition (MKD) module to explicitly decompose multimodal knowledge into
+ four distinct components: redundancy, synergy, and the uniqueness of each of
+ the two modalities. Such a comprehensive decomposition helps the model
+ perceive easily overlooked yet important information, facilitating effective
+ multimodal fusion. Second, we propose Cohort Guidance Modeling (CGM) to
+ mitigate the risk of overfitting task-irrelevant information. It promotes a
+ more comprehensive and robust understanding of the underlying multimodal
+ data, while avoiding the pitfalls of overfitting and enhancing the
+ generalization ability of the model. By combining the knowledge decomposition
+ and cohort guidance methods, we develop a robust multimodal survival analysis
+ model with enhanced discrimination and generalization abilities. Extensive
+ experimental results on five cancer datasets demonstrate the effectiveness of
+ our model in integrating multimodal data for survival analysis.
+
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ☆ APC2Mesh: Bridging the gap from occluded building façades to full 3D + models + + +
+ The benefits of having digital twins of urban buildings are numerous. +However, a major difficulty encountered in their creation from airborne LiDAR +point clouds is the effective means of accurately reconstructing significant +occlusions amidst point density variations and noise. To bridge the +noise/sparsity/occlusion gap and generate high fidelity 3D building models, we +propose APC2Mesh which integrates point completion into a 3D reconstruction +pipeline, enabling the learning of dense geometrically accurate representation +of buildings. Specifically, we leveraged complete points generated from +occluded ones as input to a linearized skip attention-based deformation network +for 3D mesh reconstruction. In our experiments, conducted on 3 different +scenes, we demonstrate that: (1) APC2Mesh delivers comparatively superior +results, indicating its efficacy in handling the challenges of occluded +airborne building points of diverse styles and complexities. (2) The +combination of point completion with typical deep learning-based 3D point cloud +reconstruction methods offers a direct and effective solution for +reconstructing significantly occluded airborne building points. As such, this +neural integration holds promise for advancing the creation of digital twins +for urban buildings with greater accuracy and fidelity. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ CAPE: CAM as a Probabilistic Ensemble for Enhanced DNN Interpretation + + +
+ Deep Neural Networks (DNNs) are widely used for visual classification tasks, +but their complex computation process and black-box nature hinder decision +transparency and interpretability. Class activation maps (CAMs) and recent +variants provide ways to visually explain the DNN decision-making process by +displaying 'attention' heatmaps of the DNNs. Nevertheless, the CAM explanation +only offers relative attention information, that is, on an attention heatmap, +we can interpret which image region is more or less important than the others. +However, these regions cannot be meaningfully compared across classes, and the +contribution of each region to the model's class prediction is not revealed. To +address these challenges that ultimately lead to better DNN Interpretation, in +this paper, we propose CAPE, a novel reformulation of CAM that provides a +unified and probabilistically meaningful assessment of the contributions of +image regions. We quantitatively and qualitatively compare CAPE with +state-of-the-art CAM methods on CUB and ImageNet benchmark datasets to +demonstrate enhanced interpretability. We also test on a cytology imaging +dataset depicting a challenging Chronic Myelomonocytic Leukemia (CMML) +diagnosis problem. Code is available at: https://github.com/AIML-MED/CAPE. + +
+
+
+
+
+ + ☆ Enhancing Human-Computer Interaction in Chest X-ray Analysis using + Vision and Language Model with Eye Gaze Patterns + + +
+ Recent advancements in Computer Assisted Diagnosis have shown promising
+ performance in medical imaging tasks, particularly in chest X-ray analysis.
+ However, the interaction between these models and radiologists has been
+ primarily limited to input images. This work proposes a novel approach to
+ enhance human-computer interaction in chest X-ray analysis using
+ Vision-Language Models (VLMs) enhanced with radiologists' attention by
+ incorporating eye gaze data alongside textual prompts. Our approach leverages
+ heatmaps generated from eye gaze data, overlaying them onto medical images to
+ highlight areas of intense radiologist focus during chest X-ray evaluation. We
+ evaluate this methodology in tasks such as visual question answering, chest
+ X-ray report automation, error detection, and differential diagnosis. Our
+ results demonstrate that the inclusion of eye gaze information significantly
+ enhances the accuracy of chest X-ray analysis. Also, the impact of eye gaze on
+ fine-tuning was confirmed as it outperformed other medical VLMs in all tasks
+ except visual question answering. This work demonstrates the potential of
+ leveraging both the VLM's capabilities and the radiologist's domain knowledge
+ to improve the capabilities of AI models in medical imaging, paving a novel
+ way for Computer Assisted Diagnosis with a human-centred AI.
+
+
+ comment: Under review +
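+
+ To make the heatmap-overlay step concrete, here is a toy sketch (assumed data
+ and function names, not the paper's pipeline) that turns gaze fixations into
+ a Gaussian heatmap and blends it over an image array:
+
+ ```python
+ import numpy as np
+
+ def gaze_heatmap(fixations, shape, sigma=25.0):
+     """fixations: list of (row, col, duration); shape: (H, W)."""
+     yy, xx = np.mgrid[0:shape[0], 0:shape[1]]
+     heat = np.zeros(shape, dtype=np.float32)
+     for r, c, dur in fixations:
+         heat += dur * np.exp(-((yy - r) ** 2 + (xx - c) ** 2) / (2 * sigma ** 2))
+     return heat / (heat.max() + 1e-8)
+
+ H, W = 256, 256
+ xray = np.random.rand(H, W).astype(np.float32)       # stand-in for a real image
+ fix = [(100, 120, 0.8), (140, 150, 1.2)]             # (row, col, seconds)
+ heat = gaze_heatmap(fix, (H, W))
+ overlay = 0.6 * xray + 0.4 * heat                    # simple alpha blend
+ print(overlay.shape, float(overlay.min()), float(overlay.max()))
+ ```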
+
+
+
+
+ + ☆ Utilizing Computer Vision for Continuous Monitoring of Vaccine Side + Effects in Experimental Mice + + +
+ The demand for improved efficiency and accuracy in vaccine safety assessments
+ is increasing. Here, we explore the application of computer vision
+ technologies to automate the monitoring of experimental mice for potential
+ side effects after vaccine administration. Traditional observation methods
+ are labor-intensive and lack the capability for continuous monitoring. By
+ deploying a computer vision system, our research aims to improve the
+ efficiency and accuracy of vaccine safety assessments. The methodology
+ involves training machine learning models on annotated video data of mice
+ behaviors pre- and post-vaccination. Preliminary results indicate that
+ computer vision effectively identifies subtle behavioral changes that may
+ signal side effects. Therefore, our approach has the potential to
+ significantly enhance the monitoring process in vaccine trials in animals,
+ providing a practical solution to the limitations of human observation.
+
+
+ comment: 1 figure +
+
+
+
+
+ + ☆ LVLM-Intrepret: An Interpretability Tool for Large Vision-Language + Models + + +
+ In the rapidly evolving landscape of artificial intelligence, multi-modal +large language models are emerging as a significant area of interest. These +models, which combine various forms of data input, are becoming increasingly +popular. However, understanding their internal mechanisms remains a complex +task. Numerous advancements have been made in the field of explainability tools +and mechanisms, yet there is still much to explore. In this work, we present a +novel interactive application aimed towards understanding the internal +mechanisms of large vision-language models. Our interface is designed to +enhance the interpretability of the image patches, which are instrumental in +generating an answer, and assess the efficacy of the language model in +grounding its output in the image. With our application, a user can +systematically investigate the model and uncover system limitations, paving the +way for enhancements in system capabilities. Finally, we present a case study +of how our application can aid in understanding failure mechanisms in a popular +large multi-modal model: LLaVA. + +
+
+
+
+
+ + ☆ Ego-Motion Aware Target Prediction Module for Robust Multi-Object + Tracking IROS2024 + + +
+ Multi-object tracking (MOT) is a prominent task in computer vision with +application in autonomous driving, responsible for the simultaneous tracking of +multiple object trajectories. Detection-based multi-object tracking (DBT) +algorithms detect objects using an independent object detector and predict the +imminent location of each target. Conventional prediction methods in DBT +utilize Kalman Filter(KF) to extrapolate the target location in the upcoming +frames by supposing a constant velocity motion model. These methods are +especially hindered in autonomous driving applications due to dramatic camera +motion or unavailable detections. Such limitations lead to tracking failures +manifested by numerous identity switches and disrupted trajectories. In this +paper, we introduce a novel KF-based prediction module called the Ego-motion +Aware Target Prediction (EMAP) module by focusing on the integration of camera +motion and depth information with object motion models. Our proposed method +decouples the impact of camera rotational and translational velocity from the +object trajectories by reformulating the Kalman Filter. This reformulation +enables us to reject the disturbances caused by camera motion and maximizes the +reliability of the object motion model. We integrate our module with four +state-of-the-art base MOT algorithms, namely OC-SORT, Deep OC-SORT, ByteTrack, +and BoT-SORT. In particular, our evaluation on the KITTI MOT dataset +demonstrates that EMAP remarkably drops the number of identity switches (IDSW) +of OC-SORT and Deep OC-SORT by 73% and 21%, respectively. At the same time, it +elevates other performance metrics such as HOTA by more than 5%. Our source +code is available at https://github.com/noyzzz/EMAP. + +
+
+ comment: 7 pages, 4 figures, submitted to IROS2024 +
+
+
+
+
+ + ☆ Many-to-many Image Generation with Auto-regressive Diffusion Models + + +
+ Recent advancements in image generation have made significant progress, yet +existing models present limitations in perceiving and generating an arbitrary +number of interrelated images within a broad context. This limitation becomes +increasingly critical as the demand for multi-image scenarios, such as +multi-view images and visual narratives, grows with the expansion of multimedia +platforms. This paper introduces a domain-general framework for many-to-many +image generation, capable of producing interrelated image series from a given +set of images, offering a scalable solution that obviates the need for +task-specific solutions across different multi-image scenarios. To facilitate +this, we present MIS, a novel large-scale multi-image dataset, containing 12M +synthetic multi-image samples, each with 25 interconnected images. Utilizing +Stable Diffusion with varied latent noises, our method produces a set of +interconnected images from a single caption. Leveraging MIS, we learn M2M, an +autoregressive model for many-to-many generation, where each image is modeled +within a diffusion framework. Throughout training on the synthetic MIS, the +model excels in capturing style and content from preceding images - synthetic +or real - and generates novel images following the captured patterns. +Furthermore, through task-specific fine-tuning, our model demonstrates its +adaptability to various multi-image generation tasks, including Novel View +Synthesis and Visual Procedure Generation. + +
+
+
+
+
+ + ☆ SalFoM: Dynamic Saliency Prediction with Video Foundation Models + + +
+ Recent advancements in video saliency prediction (VSP) have shown promising
+ performance compared to the human visual system, whose emulation is the
+ primary goal of VSP. However, current state-of-the-art models employ
+ spatio-temporal transformers trained on limited amounts of data, hindering
+ their generalizability and adaptation to downstream tasks. Vision foundation
+ models offer a potential solution for improving the VSP process. However,
+ adapting image foundation models to the video domain presents significant
+ challenges in modeling scene dynamics and capturing temporal information. To
+ address these challenges, and as the first initiative to design a VSP model
+ based on video foundation models, we introduce SalFoM, a novel
+ encoder-decoder video transformer architecture. Our model employs UnMasked
+ Teacher (UMT) as the feature extractor and presents a heterogeneous decoder
+ which features a locality-aware spatio-temporal transformer and integrates
+ local and global spatio-temporal information from various perspectives to
+ produce the final saliency map. Our qualitative and quantitative experiments
+ on the challenging VSP benchmark datasets of DHF1K, Hollywood-2 and
+ UCF-Sports demonstrate the superiority of our proposed model in comparison
+ with the state-of-the-art methods.
+
+
+ comment: 15 pages, 4 figures +
+
+
+
+
+ + ☆ Behind the Veil: Enhanced Indoor 3D Scene Reconstruction with Occluded + Surfaces Completion + + +
+ In this paper, we present a novel indoor 3D reconstruction method with
+ occluded surface completion, given a sequence of depth readings. Prior
+ state-of-the-art (SOTA) methods only focus on the reconstruction of the
+ visible areas in a scene, neglecting the invisible areas due to occlusions,
+ e.g., the contact surfaces between furniture, occluded walls and floors. Our
+ method tackles the task of completing the occluded scene surfaces, resulting
+ in a complete 3D scene mesh. The core idea of our method is to learn a 3D
+ geometry prior from various complete scenes to infer the occluded geometry of
+ an unseen scene from depth measurements alone. We design a coarse-fine
+ hierarchical octree representation coupled with a dual-decoder architecture,
+ i.e., a Geo-decoder and a 3D Inpainter, which jointly reconstruct the
+ complete 3D scene geometry. The Geo-decoder with detailed representation at
+ fine levels is optimized online for each scene to reconstruct visible
+ surfaces. The 3D Inpainter with abstract representation at coarse levels is
+ trained offline using various scenes to complete occluded surfaces. As a
+ result, while the Geo-decoder is specialized for an individual scene, the 3D
+ Inpainter can be generally applied across different scenes. We evaluate the
+ proposed method on the 3D Completed Room Scene (3D-CRS) and iTHOR datasets,
+ significantly outperforming the SOTA methods by a gain of 16.8% and 24.2% in
+ terms of the completeness of 3D reconstruction. The 3D-CRS dataset, including
+ a complete 3D mesh of each scene, is provided on the project webpage.
+
+
+
+
+
+ + ☆ Self-supervised 6-DoF Robot Grasping by Demonstration via Augmented + Reality Teleoperation System + + +
+ Most existing 6-DoF robot grasping solutions depend on strong supervision on
+ grasp pose to ensure satisfactory performance, which can be laborious and
+ impractical when the robot works in restricted areas. To this end, we propose
+ a self-supervised 6-DoF grasp pose detection framework via an Augmented
+ Reality (AR) teleoperation system that can efficiently learn from human
+ demonstrations and provide 6-DoF grasp poses without grasp pose annotations.
+ Specifically, the system collects the human demonstration from the AR
+ environment and contrastively learns the grasping strategy from the
+ demonstration. In real-world experiments, the proposed system achieves
+ satisfactory grasping performance and learns to grasp unknown objects from
+ only three demonstrations.
+
+
+
+
+
+ + ☆ Linear Anchored Gaussian Mixture Model for Location and Width + Computation of Objects in Thick Line Shape + + +
+ Accurate detection of the centerlines of linear objects is a challenging
+ topic in many sensitive real-world applications such as X-ray imaging, remote
+ sensing and lane marking detection in road traffic. Model-based approaches
+ using Hough and Radon transforms are often used but are not recommended for
+ thick line detection, whereas approaches based on image derivatives need
+ further step-by-step processing, making their efficiency dependent on the
+ outcome of each step. In this paper, we aim to detect linear structures in
+ images by considering the 3D representation of the image gray levels as a
+ finite mixture of statistical distributions. The latter, which we name the
+ linear anchored Gaussian distribution, is parametrized by a scale value σ
+ describing the thickness of the linear structure, and by a line equation,
+ parametrized in turn by a radius ρ and an orientation angle θ, describing the
+ location of the structure's centerline. The Expectation-Maximization (EM)
+ algorithm is used for estimating the mixture model parameters, and a new
+ paradigm, using background subtraction for the likelihood computation, is
+ proposed. For the EM algorithm, two θ initialization schemes are used: the
+ first is based on a random choice of the first component of the θ vector,
+ whereas the second is based on the image Hessian, with a simultaneous
+ computation of the number of mixture components. Experiments on real-world
+ images and on synthetic images corrupted by blur and additive noise show the
+ good performance of the proposed methods; the algorithm using background
+ subtraction and Hessian-based θ initialization provides outstanding accuracy
+ of linear structure detection despite irregular image backgrounds and the
+ presence of blur and noise.
+
+
+ comment: 13 pages, 13 figures +
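+
+ For intuition, here is a heavily simplified EM-style sketch of fitting such a
+ line-anchored Gaussian model (our own toy formulation, without the
+ background-subtraction likelihood or Hessian initialization described above;
+ θ is kept fixed in the M-step):
+
+ ```python
+ import numpy as np
+
+ def em_line_mixture(pts, rho, theta, sigma, pi, n_iter=20):
+     """pts: (N, 2); rho, theta, sigma, pi: (K,) per-component parameters."""
+     for _ in range(n_iter):
+         # E-step: responsibilities from each point's distance to each line,
+         # d = x*cos(theta) + y*sin(theta) - rho
+         d = pts[:, :1] * np.cos(theta) + pts[:, 1:] * np.sin(theta) - rho  # (N, K)
+         like = pi * np.exp(-d**2 / (2 * sigma**2)) / (np.sqrt(2 * np.pi) * sigma)
+         resp = like / like.sum(axis=1, keepdims=True)                      # (N, K)
+         # M-step (theta kept fixed in this toy version)
+         nk = resp.sum(axis=0)
+         proj = pts[:, :1] * np.cos(theta) + pts[:, 1:] * np.sin(theta)
+         rho = (resp * proj).sum(axis=0) / nk
+         sigma = np.sqrt((resp * (proj - rho) ** 2).sum(axis=0) / nk)
+         pi = nk / len(pts)
+     return rho, theta, sigma, pi
+
+ # points scattered around a thick vertical-ish line near x = 2
+ pts = np.random.randn(500, 2) * [0.1, 3.0] + [2.0, 0.0]
+ print(em_line_mixture(pts, rho=np.array([1.0]), theta=np.array([0.0]),
+                       sigma=np.array([1.0]), pi=np.array([1.0])))
+ ```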
+
+
+
+
+ + ☆ AWOL: Analysis WithOut synthesis using Language + + +
+ Many classical parametric 3D shape models exist, but creating novel shapes +with such models requires expert knowledge of their parameters. For example, +imagine creating a specific type of tree using procedural graphics or a new +kind of animal from a statistical shape model. Our key idea is to leverage +language to control such existing models to produce novel shapes. This involves +learning a mapping between the latent space of a vision-language model and the +parameter space of the 3D model, which we do using a small set of shape and +text pairs. Our hypothesis is that mapping from language to parameters allows +us to generate parameters for objects that were never seen during training. If +the mapping between language and parameters is sufficiently smooth, then +interpolation or generalization in language should translate appropriately into +novel 3D shapes. We test our approach with two very different types of +parametric shape models (quadrupeds and arboreal trees). We use a learned +statistical shape model of quadrupeds and show that we can use text to generate +new animals not present during training. In particular, we demonstrate +state-of-the-art shape estimation of 3D dogs. This work also constitutes the +first language-driven method for generating 3D trees. Finally, embedding images +in the CLIP latent space enables us to generate animals and trees directly from +images. + +
+
+
+
+
+ + ☆ BCAmirs at SemEval-2024 Task 4: Beyond Words: A Multimodal and + Multilingual Exploration of Persuasion in Memes SemEval-2024 + + +
+ Memes, combining text and images, frequently use metaphors to convey +persuasive messages, shaping public opinion. Motivated by this, our team +engaged in SemEval-2024 Task 4, a hierarchical multi-label classification task +designed to identify rhetorical and psychological persuasion techniques +embedded within memes. To tackle this problem, we introduced a caption +generation step to assess the modality gap and the impact of additional +semantic information from images, which improved our result. Our best model +utilizes GPT-4 generated captions alongside meme text to fine-tune RoBERTa as +the text encoder and CLIP as the image encoder. It outperforms the baseline by +a large margin in all 12 subtasks. In particular, it ranked in top-3 across all +languages in Subtask 2a, and top-4 in Subtask 2b, demonstrating quantitatively +strong performance. The improvement achieved by the introduced intermediate +step is likely attributable to the metaphorical essence of images that +challenges visual encoders. This highlights the potential for improving +abstract visual semantics encoding. + +
+
+ comment: 11 pages, 5 tables, 2 figures, Proceedings of the 18th International + Workshop on Semantic Evaluation (SemEval-2024) @ NAACL 2024 +
+
+
+
+
+ + ☆ DPFT: Dual Perspective Fusion Transformer for Camera-Radar-based Object + Detection + + +
+ The perception of autonomous vehicles has to be efficient, robust, and +cost-effective. However, cameras are not robust against severe weather +conditions, lidar sensors are expensive, and the performance of radar-based +perception is still inferior to the others. Camera-radar fusion methods have +been proposed to address this issue, but these are constrained by the typical +sparsity of radar point clouds and often designed for radars without elevation +information. We propose a novel camera-radar fusion approach called Dual +Perspective Fusion Transformer (DPFT), designed to overcome these limitations. +Our method leverages lower-level radar data (the radar cube) instead of the +processed point clouds to preserve as much information as possible and employs +projections in both the camera and ground planes to effectively use radars with +elevation information and simplify the fusion with camera data. As a result, +DPFT has demonstrated state-of-the-art performance on the K-Radar dataset while +showing remarkable robustness against adverse weather conditions and +maintaining a low inference time. The code is made available as open-source +software under https://github.com/TUMFTM/DPFT. + +
+
+
+
+
+ + ☆ Skeleton Recall Loss for Connectivity Conserving and Resource Efficient + Segmentation of Thin Tubular Structures + + +
+ Accurately segmenting thin tubular structures, such as vessels, nerves, roads +or concrete cracks, is a crucial task in computer vision. Standard deep +learning-based segmentation loss functions, such as Dice or Cross-Entropy, +focus on volumetric overlap, often at the expense of preserving structural +connectivity or topology. This can lead to segmentation errors that adversely +affect downstream tasks, including flow calculation, navigation, and structural +inspection. Although current topology-focused losses mark an improvement, they +introduce significant computational and memory overheads. This is particularly +relevant for 3D data, rendering these losses infeasible for larger volumes as +well as increasingly important multi-class segmentation problems. To mitigate +this, we propose a novel Skeleton Recall Loss, which effectively addresses +these challenges by circumventing intensive GPU-based calculations with +inexpensive CPU operations. It demonstrates overall superior performance to +current state-of-the-art approaches on five public datasets for +topology-preserving segmentation, while substantially reducing computational +overheads by more than 90%. In doing so, we introduce the first multi-class +capable loss function for thin structure segmentation, excelling in both +efficiency and efficacy for topology-preservation. + +
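+
+ A rough sketch of the core idea as we read it (not the official
+ implementation; scikit-image supplies the skeleton here): compute a soft
+ recall of the predicted foreground probabilities over the skeleton of the
+ ground-truth mask, which runs cheaply on CPU and penalises breaks in thin
+ structures.
+
+ ```python
+ import numpy as np
+ from skimage.morphology import skeletonize
+
+ def skeleton_recall_loss(pred_prob: np.ndarray, gt_mask: np.ndarray) -> float:
+     """pred_prob: (H, W) in [0, 1]; gt_mask: (H, W) binary."""
+     skel = skeletonize(gt_mask.astype(bool)).astype(np.float32)
+     if skel.sum() == 0:
+         return 0.0
+     recall = float((pred_prob * skel).sum() / skel.sum())
+     return 1.0 - recall
+
+ gt = np.zeros((64, 64), dtype=np.uint8)
+ gt[30:34, 5:60] = 1                         # a thin horizontal "vessel"
+ pred = np.zeros_like(gt, dtype=np.float32)
+ pred[30:34, 5:40] = 0.9                     # prediction with a broken tail
+ print(skeleton_recall_loss(pred, gt))       # > 0: part of the skeleton is missed
+ ```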
+
+
+
+
+ + ☆ MeshBrush: Painting the Anatomical Mesh with Neural Stylization for + Endoscopy + + +
+ Style transfer is a promising approach to close the sim-to-real gap in +medical endoscopy. Rendering realistic endoscopic videos by traversing +pre-operative scans (such as MRI or CT) can generate realistic simulations as +well as ground truth camera poses and depth maps. Although image-to-image (I2I) +translation models such as CycleGAN perform well, they are unsuitable for +video-to-video synthesis due to the lack of temporal consistency, resulting in +artifacts between frames. We propose MeshBrush, a neural mesh stylization +method to synthesize temporally consistent videos with differentiable +rendering. MeshBrush uses the underlying geometry of patient imaging data while +leveraging existing I2I methods. With learned per-vertex textures, the stylized +mesh guarantees consistency while producing high-fidelity outputs. We +demonstrate that mesh stylization is a promising approach for creating +realistic simulations for downstream tasks such as training and preoperative +planning. Although our method is tested and designed for ureteroscopy, its +components are transferable to general endoscopic and laparoscopic procedures. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ ASAP: Interpretable Analysis and Summarization of AI-generated Image + Patterns at Scale + + +
+ Generative image models have emerged as a promising technology to produce
+ realistic images. Despite potential benefits, concerns grow about their
+ misuse, particularly in generating deceptive images that could raise
+ significant ethical, legal, and societal issues. Consequently, there is
+ growing demand to empower users to effectively discern and comprehend
+ patterns of AI-generated images. To this end, we developed ASAP, an
+ interactive visualization system that automatically extracts distinct
+ patterns of AI-generated images and allows users to interactively explore
+ them via various views. To uncover fake patterns, ASAP introduces a novel
+ image encoder, adapted from CLIP, which transforms images into compact
+ "distilled" representations, enriched with information for differentiating
+ authentic and fake images. These representations generate gradients that
+ propagate back to the attention maps of CLIP's transformer block. This
+ process quantifies the relative importance of each pixel to image
+ authenticity or fakeness, exposing key deceptive patterns. ASAP enables
+ at-scale interactive analysis of these patterns through multiple, coordinated
+ visualizations. This includes a representation overview with innovative cell
+ glyphs to aid in the exploration and qualitative evaluation of fake patterns
+ across a vast array of images, as well as a pattern view that displays
+ authenticity-indicating patterns in images and quantifies their impact. ASAP
+ supports the analysis of cutting-edge generative models with the latest
+ architectures, including GAN-based models like proGAN and diffusion models
+ like the latent diffusion model. We demonstrate ASAP's usefulness through two
+ usage scenarios using multiple fake image detection benchmark datasets,
+ revealing its ability to identify and understand hidden patterns in
+ AI-generated images, especially in detecting fake human faces produced by
+ diffusion-based techniques.
+
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ Scaling Laws for Galaxy Images + + +
+ We present the first systematic investigation of supervised scaling laws
+ outside of an ImageNet-like context - on images of galaxies. We use 840k
+ galaxy images and over 100M annotations by Galaxy Zoo volunteers, comparable
+ in scale to ImageNet-1K. We find that adding annotated galaxy images provides
+ a power-law improvement in performance across all architectures and all
+ tasks, while adding trainable parameters is effective only for some
+ (typically more subjectively challenging) tasks. We then compare the
+ downstream performance of finetuned models pretrained on either ImageNet-12k
+ alone vs. additionally pretrained on our galaxy images. We achieve an average
+ relative error rate reduction of 31% across 5 downstream tasks of scientific
+ interest. Our finetuned models are more label-efficient and, unlike their
+ ImageNet-12k-pretrained equivalents, often achieve linear transfer
+ performance equal to that of end-to-end finetuning. We find relatively modest
+ additional downstream benefits from scaling model size, implying that scaling
+ alone is not sufficient to address our domain gap, and suggest that
+ practitioners with qualitatively different images might benefit more from
+ in-domain adaptation followed by targeted downstream labelling.
+
+
+ comment: 10+6 pages, 12 figures. Appendix C2 based on arxiv:2206.11927. Code, + demos, documentation at https://github.com/mwalmsley/zoobot +
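+
+ For readers unfamiliar with fitting such scaling laws, this is the basic
+ recipe with made-up numbers (not results from the paper): fit a power law
+ error ~ a * N^(-b) to performance-vs-dataset-size points via a log-log
+ linear regression.
+
+ ```python
+ import numpy as np
+
+ n = np.array([10_000, 30_000, 100_000, 300_000, 840_000], dtype=float)
+ err = np.array([0.20, 0.15, 0.11, 0.085, 0.065])          # made-up error rates
+
+ b, log_a = np.polyfit(np.log(n), np.log(err), deg=1)       # log err = b*log n + log a
+ print(f"fitted exponent b = {b:.3f}, prefactor a = {np.exp(log_a):.3f}")
+ predicted = np.exp(log_a) * n ** b
+ print(np.round(predicted, 3))
+ ```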
+
+
+
+
+ + ☆ Translation-based Video-to-Video Synthesis + + +
+ Translation-based Video Synthesis (TVS) has emerged as a vital research area +in computer vision, aiming to facilitate the transformation of videos between +distinct domains while preserving both temporal continuity and underlying +content features. This technique has found wide-ranging applications, +encompassing video super-resolution, colorization, segmentation, and more, by +extending the capabilities of traditional image-to-image translation to the +temporal domain. One of the principal challenges faced in TVS is the inherent +risk of introducing flickering artifacts and inconsistencies between frames +during the synthesis process. This is particularly challenging due to the +necessity of ensuring smooth and coherent transitions between video frames. +Efforts to tackle this challenge have induced the creation of diverse +strategies and algorithms aimed at mitigating these unwanted consequences. This +comprehensive review extensively examines the latest progress in the realm of +TVS. It thoroughly investigates emerging methodologies, shedding light on the +fundamental concepts and mechanisms utilized for proficient video synthesis. +This survey also illuminates their inherent strengths, limitations, appropriate +applications, and potential avenues for future development. + +
+
+ comment: 25 pages, 9 figures +
+
+
+
+
+ + ☆ JDEC: JPEG Decoding via Enhanced Continuous Cosine Coefficients + + +
+ We propose a practical approach to JPEG image decoding, utilizing a local
+ implicit neural representation with a continuous cosine formulation. The JPEG
+ algorithm significantly quantizes discrete cosine transform (DCT) spectra to
+ achieve a high compression rate, inevitably resulting in quality degradation
+ while encoding an image. We have designed a continuous cosine spectrum
+ estimator that restores the distorted spectrum to address this quality
+ degradation. By leveraging local DCT formulations, our network is able to
+ perform dequantization and upsampling simultaneously. Our proposed model
+ enables decoding compressed images directly across different quality factors
+ using a single pre-trained model without relying on a conventional JPEG
+ decoder. As a result, our proposed network achieves state-of-the-art
+ performance in flexible color image JPEG artifact removal tasks. Our source
+ code is available at https://github.com/WooKyoungHan/JDEC.
+
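+
+ For background, this is roughly what the conventional per-block
+ dequantise-then-inverse-DCT stage looks like, which JDEC replaces with a
+ learned continuous cosine formulation (synthetic coefficients and the
+ standard quality-50 luminance table; not code from the paper):
+
+ ```python
+ import numpy as np
+ from scipy.fftpack import idct
+
+ # standard JPEG luminance quantisation table (quality 50)
+ Q50 = np.array([[16,11,10,16,24,40,51,61],[12,12,14,19,26,58,60,55],
+                 [14,13,16,24,40,57,69,56],[14,17,22,29,51,87,80,62],
+                 [18,22,37,56,68,109,103,77],[24,35,55,64,81,104,113,92],
+                 [49,64,78,87,103,121,120,101],[72,92,95,98,112,100,103,99]])
+
+ rng = np.random.default_rng(0)
+ coeffs = rng.integers(-10, 10, size=(8, 8))          # quantised DCT coefficients
+ spectrum = coeffs * Q50                              # dequantise
+
+ def idct2(block):
+     return idct(idct(block, axis=0, norm="ortho"), axis=1, norm="ortho")
+
+ pixels = idct2(spectrum) + 128                       # back to the pixel domain
+ print(np.round(pixels[:2, :4], 1))
+ ```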
+
+
+
+
+ + ♻ ☆ FreeZe: Training-free zero-shot 6D pose estimation with geometric and + vision foundation models + + +
+ Estimating the 6D pose of objects unseen during training is highly desirable +yet challenging. Zero-shot object 6D pose estimation methods address this +challenge by leveraging additional task-specific supervision provided by +large-scale, photo-realistic synthetic datasets. However, their performance +heavily depends on the quality and diversity of rendered data and they require +extensive training. In this work, we show how to tackle the same task but +without training on specific data. We propose FreeZe, a novel solution that +harnesses the capabilities of pre-trained geometric and vision foundation +models. FreeZe leverages 3D geometric descriptors learned from unrelated 3D +point clouds and 2D visual features learned from web-scale 2D images to +generate discriminative 3D point-level descriptors. We then estimate the 6D +pose of unseen objects by 3D registration based on RANSAC. We also introduce a +novel algorithm to solve ambiguous cases due to geometrically symmetric objects +that is based on visual features. We comprehensively evaluate FreeZe across the +seven core datasets of the BOP Benchmark, which include over a hundred 3D +objects and 20,000 images captured in various scenarios. FreeZe consistently +outperforms all state-of-the-art approaches, including competitors extensively +trained on synthetic 6D pose estimation data. Code will be publicly available +at https://andreacaraffa.github.io/freeze. + +
+
+
+
+
+ + ♻ ☆ Total Selfie: Generating Full-Body Selfies + + +
+ We present a method to generate full-body selfies from photographs originally
+ taken at arm's length. Because self-captured photos are typically taken close
+ up, they have limited field of view and exaggerated perspective that distorts
+ facial shapes. We instead seek to generate the photo someone else would take
+ of you from a few feet away. Our approach takes as input four selfies of your
+ face and body and a background image, and generates a full-body selfie in a
+ desired target pose. We introduce a novel diffusion-based approach to combine
+ all of this information into high-quality, well-composed photos of you with
+ the desired pose and background.
+
+
+ comment: Project page: + https://homes.cs.washington.edu/~boweiche/project_page/totalselfie/ +
+
+
+
+
+ + ♻ ☆ G3DR: Generative 3D Reconstruction in ImageNet CVPR 2024 + + +
+ We introduce a novel 3D generative method, Generative 3D Reconstruction
+ (G3DR) in ImageNet, capable of generating diverse and high-quality 3D objects
+ from single images, addressing the limitations of existing methods. At the
+ heart of our framework is a novel depth regularization technique that enables
+ the generation of scenes with high geometric fidelity. G3DR also leverages a
+ pretrained language-vision model, such as CLIP, to enable reconstruction in
+ novel views and improve the visual realism of generations. Additionally, G3DR
+ designs a simple but effective sampling procedure to further improve the
+ quality of generations. G3DR offers diverse and efficient 3D asset generation
+ based on class or text conditioning. Despite its simplicity, G3DR is able to
+ beat state-of-the-art methods, improving over them by up to 22% in perceptual
+ metrics and 90% in geometry scores, while needing only half of the training
+ time. Code is available at https://github.com/preddy5/G3DR
+
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Quantifying and Mitigating Unimodal Biases in Multimodal Large Language + Models: A Causal Perspective + + +
+ Recent advancements in Large Language Models (LLMs) have facilitated the +development of Multimodal LLMs (MLLMs). Despite their impressive capabilities, +MLLMs often suffer from an over-reliance on unimodal biases (e.g., language +bias and vision bias), leading to incorrect answers in complex multimodal +tasks. To investigate this issue, we propose a causal framework to interpret +the biases in Visual Question Answering (VQA) problems. Within our framework, +we devise a causal graph to elucidate the predictions of MLLMs on VQA problems, +and assess the causal effect of biases through an in-depth causal analysis. +Motivated by the causal graph, we introduce a novel MORE dataset, consisting of +12,000 VQA instances. This dataset is designed to challenge MLLMs' abilities, +necessitating multi-hop reasoning and the surmounting of unimodal biases. +Furthermore, we propose two strategies to mitigate unimodal biases and enhance +MLLMs' reasoning capabilities, including a Decompose-Verify-Answer (DeVA) +framework for limited-access MLLMs and the refinement of open-source MLLMs +through fine-tuning. Extensive quantitative and qualitative experiments offer +valuable insights for future research. Our project page is at +https://opencausalab.github.io/MORE. + +
+
+
+
+
+ + ♻ ☆ Learning Object State Changes in Videos: An Open-World Perspective CVPR 2024 + + +
+ Object State Changes (OSCs) are pivotal for video understanding. While humans +can effortlessly generalize OSC understanding from familiar to unknown objects, +current approaches are confined to a closed vocabulary. Addressing this gap, we +introduce a novel open-world formulation for the video OSC problem. The goal is +to temporally localize the three stages of an OSC -- the object's initial +state, its transitioning state, and its end state -- whether or not the object +has been observed during training. Towards this end, we develop VidOSC, a +holistic learning approach that: (1) leverages text and vision-language models +for supervisory signals to obviate manually labeling OSC training data, and (2) +abstracts fine-grained shared state representations from objects to enhance +generalization. Furthermore, we present HowToChange, the first open-world +benchmark for video OSC localization, which offers an order of magnitude +increase in the label space and annotation volume compared to the best existing +benchmark. Experimental results demonstrate the efficacy of our approach, in +both traditional closed-world and open-world scenarios. + +
+
+ comment: Accepted by CVPR 2024, Project website: + https://vision.cs.utexas.edu/projects/VidOSC/ +
+
+
+
+
+ + ♻ ☆ AddSR: Accelerating Diffusion-based Blind Super-Resolution with + Adversarial Diffusion Distillation + + +
+ Blind super-resolution methods based on stable diffusion showcase formidable
+ generative capabilities in reconstructing clear high-resolution images with
+ intricate details from low-resolution inputs. However, their practical
+ applicability is often hampered by poor efficiency, stemming from the
+ requirement of hundreds or thousands of sampling steps. Inspired by the
+ efficient text-to-image approach adversarial diffusion distillation (ADD), we
+ design AddSR to address this issue by incorporating the ideas of both
+ distillation and ControlNet. Specifically, we first propose a
+ prediction-based self-refinement strategy to provide high-frequency
+ information in the student model output at marginal additional time cost, and
+ we refine the training process by employing HR images, rather than LR images,
+ to regulate the teacher model, providing a more robust constraint for
+ distillation. Second, we introduce a timestep-adapting loss to address the
+ perception-distortion imbalance introduced by ADD. Extensive experiments
+ demonstrate that AddSR generates better restoration results while running
+ faster than previous SD-based state-of-the-art models (e.g., 7x faster than
+ SeeSR).
+
+
+
+
+
+ + ♻ ☆ Your Student is Better Than Expected: Adaptive Teacher-Student + Collaboration for Text-Conditional Diffusion Models CVPR2024 + + +
+ Knowledge distillation methods have recently shown to be a promising +direction to speedup the synthesis of large-scale diffusion models by requiring +only a few inference steps. While several powerful distillation methods were +recently proposed, the overall quality of student samples is typically lower +compared to the teacher ones, which hinders their practical usage. In this +work, we investigate the relative quality of samples produced by the teacher +text-to-image diffusion model and its distilled student version. As our main +empirical finding, we discover that a noticeable portion of student samples +exhibit superior fidelity compared to the teacher ones, despite the +"approximate" nature of the student. Based on this finding, we propose an +adaptive collaboration between student and teacher diffusion models for +effective text-to-image synthesis. Specifically, the distilled model produces +the initial sample, and then an oracle decides whether it needs further +improvements with a slow teacher model. Extensive experiments demonstrate that +the designed pipeline surpasses state-of-the-art text-to-image alternatives for +various inference budgets in terms of human preference. Furthermore, the +proposed approach can be naturally used in popular applications such as +text-guided image editing and controllable generation. + +
+
+ comment: CVPR2024 camera ready +
+
+
+
+
+ + ♻ ☆ ElasticLaneNet: An Efficient Geometry-Flexible Approach for Lane + Detection + + +
+ The task of lane detection involves identifying the boundaries of driving
+ areas in real time. Recognizing lanes with variable and complex geometric
+ structures remains a challenge. In this paper, we explore a novel and
+ flexible implicit lane representation named the Elastic Lane map (ELM), and
+ introduce an efficient physics-informed end-to-end lane detection framework,
+ namely, ElasticLaneNet (Elastic interaction energy-informed Lane detection
+ Network). The approach considers predicted lanes as moving zero-contours on
+ the flexibly shaped ELM that are attracted to the ground truth, guided by an
+ elastic interaction energy loss function (EIE loss). Our framework integrates
+ global information and low-level features well. The method performs well in
+ complex lane scenarios, including those with large curvature, weak geometry
+ features at intersections, complicated cross lanes, Y-shaped lanes, dense
+ lanes, etc. We apply our approach on three datasets: SDLane, CULane, and
+ TuSimple. The results demonstrate exceptional performance of our method, with
+ state-of-the-art results on the structurally diverse SDLane, achieving an
+ F1-score of 89.51, a recall of 87.50, and a precision of 91.61 with fast
+ inference speed.
+
+
+
+
+
+ + ♻ ☆ Dynamic LiDAR Re-simulation using Compositional Neural Fields + + +
+ We introduce DyNFL, a novel neural field-based approach for high-fidelity +re-simulation of LiDAR scans in dynamic driving scenes. DyNFL processes LiDAR +measurements from dynamic environments, accompanied by bounding boxes of moving +objects, to construct an editable neural field. This field, comprising +separately reconstructed static background and dynamic objects, allows users to +modify viewpoints, adjust object positions, and seamlessly add or remove +objects in the re-simulated scene. A key innovation of our method is the neural +field composition technique, which effectively integrates reconstructed neural +assets from various scenes through a ray drop test, accounting for occlusions +and transparent surfaces. Our evaluation with both synthetic and real-world +environments demonstrates that DyNFL substantially improves dynamic scene LiDAR +simulation, offering a combination of physical fidelity and flexible editing +capabilities. + +
+
+ comment: Project page: https://shengyuh.github.io/dynfl +
+
+
+
+
+ + ♻ ☆ Three Heads Are Better Than One: Complementary Experts for Long-Tailed + Semi-supervised Learning AAAI2024 + + +
+ We address the challenging problem of Long-Tailed Semi-Supervised Learning +(LTSSL) where labeled data exhibit imbalanced class distribution and unlabeled +data follow an unknown distribution. Unlike in balanced SSL, the generated +pseudo-labels are skewed towards head classes, intensifying the training bias. +Such a phenomenon is even amplified as more unlabeled data will be mislabeled +as head classes when the class distribution of labeled and unlabeled datasets +are mismatched. To solve this problem, we propose a novel method named +ComPlementary Experts (CPE). Specifically, we train multiple experts to model +various class distributions, each of them yielding high-quality pseudo-labels +within one form of class distribution. Besides, we introduce Classwise Batch +Normalization for CPE to avoid performance degradation caused by feature +distribution mismatch between head and non-head classes. CPE achieves +state-of-the-art performances on CIFAR-10-LT, CIFAR-100-LT, and STL-10-LT +dataset benchmarks. For instance, on CIFAR-10-LT, CPE improves test accuracy by +over 2.22% compared to baselines. Code is available at +https://github.com/machengcheng2016/CPE-LTSSL. + +
+
+ comment: Accepted by AAAI2024 +
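+
+ The abstract does not spell out the exact form of Classwise Batch
+ Normalization; one plausible minimal variant (entirely our guess, with
+ made-up names) keeps separate BatchNorm statistics per class group and routes
+ each sample through the branch matching its (pseudo-)label group:
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class GroupwiseBatchNorm1d(nn.Module):
+     def __init__(self, num_features: int, num_groups: int = 2):
+         super().__init__()
+         self.bns = nn.ModuleList(nn.BatchNorm1d(num_features)
+                                  for _ in range(num_groups))
+
+     def forward(self, x: torch.Tensor, group: torch.Tensor) -> torch.Tensor:
+         """x: (B, C) features; group: (B,) integer group id per sample."""
+         out = torch.empty_like(x)
+         for g, bn in enumerate(self.bns):
+             idx = (group == g).nonzero(as_tuple=True)[0]
+             if idx.numel() > 0:
+                 out[idx] = bn(x[idx])     # each group normalised with its own stats
+         return out
+
+ feats = torch.randn(16, 64)
+ groups = torch.tensor([0] * 10 + [1] * 6)   # 0 = head, 1 = non-head (toy split)
+ print(GroupwiseBatchNorm1d(64)(feats, groups).shape)
+ ```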
+
+
+
+
+ + ♻ ☆ Elastic Interaction Energy-Informed Real-Time Traffic Scene Perception + + +
+ Urban segmentation and lane detection are two important tasks for traffic
+ scene perception. Accuracy and fast inference speed of visual perception are
+ crucial for autonomous driving safety. Fine and complex geometric objects,
+ such as pedestrians, traffic signs and lanes, are the most challenging but
+ important recognition targets in traffic scenes. In this paper, a simple and
+ efficient topology-aware energy loss function-based network training strategy
+ named EIEGSeg is proposed. EIEGSeg is designed for multi-class segmentation
+ in real-time traffic scene perception. To be specific, the convolutional
+ neural network (CNN) extracts image features and produces multiple outputs,
+ and the elastic interaction energy loss function (EIEL) drives the
+ predictions toward the ground truth until they completely overlap. Our
+ strategy performs especially well on fine-scale structures, i.e., small or
+ irregularly shaped objects are identified more accurately, and discontinuity
+ issues on slender objects are alleviated. We quantitatively and qualitatively
+ analyze our method on three traffic datasets, including the urban scene
+ segmentation dataset Cityscapes and the lane detection datasets TuSimple and
+ CULane. Our results demonstrate that EIEGSeg consistently improves
+ performance, especially on real-time, lightweight networks that are better
+ suited for autonomous driving.
+
+
+
+
+
+ + ♻ ☆ Strengthening Multimodal Large Language Model with Bootstrapped + Preference Optimization + + +
+ Multimodal Large Language Models (MLLMs) excel in generating responses based +on visual inputs. However, they often suffer from a bias towards generating +responses similar to their pretraining corpus, overshadowing the importance of +visual information. We treat this bias as a "preference" for pretraining +statistics, which hinders the model's grounding in visual input. To mitigate +this issue, we propose Bootstrapped Preference Optimization (BPO), which +conducts preference learning with datasets containing negative responses +bootstrapped from the model itself. Specifically, we propose the following two +strategies: 1) using distorted image inputs to the MLLM for eliciting responses +that contain signified pretraining bias; 2) leveraging text-based LLM to +explicitly inject erroneous but common elements into the original response. +Those undesirable responses are paired with original annotated responses from +the datasets to construct the preference dataset, which is subsequently +utilized to perform preference learning. Our approach effectively suppresses +pretrained LLM bias, enabling enhanced grounding in visual inputs. Extensive +experimentation demonstrates significant performance improvements across +multiple benchmarks, advancing the state-of-the-art in multimodal +conversational systems. + +
+
+
+
+
+ + ♻ ☆ Isometric Multi-Shape Matching + + +
+ Finding correspondences between shapes is a fundamental problem in computer +vision and graphics, which is relevant for many applications, including 3D +reconstruction, object tracking, and style transfer. The vast majority of +correspondence methods aim to find a solution between pairs of shapes, even if +multiple instances of the same class are available. While isometries are often +studied in shape correspondence problems, they have not been considered +explicitly in the multi-matching setting. This paper closes this gap by +proposing a novel optimisation formulation for isometric multi-shape matching. +We present a suitable optimisation algorithm for solving our formulation and +provide a convergence and complexity analysis. Our algorithm obtains +multi-matchings that are by construction provably cycle-consistent. We +demonstrate the superior performance of our method on various datasets and set +the new state-of-the-art in isometric multi-shape matching. + +
+
+
+
+
+ + ♻ ☆ Semi-supervised Active Learning for Video Action Detection AAAI + + +
+ In this work, we focus on label-efficient learning for video action
+ detection. We develop a novel semi-supervised active learning approach which
+ utilizes both labeled and unlabeled data along with informative sample
+ selection for action detection. Video action detection requires
+ spatio-temporal localization along with classification, which poses several
+ challenges for both informative sample selection in active learning and
+ pseudo-label generation in semi-supervised learning. First, we propose
+ NoiseAug, a simple augmentation strategy which effectively selects
+ informative samples for video action detection. Next, we propose
+ fft-attention, a novel technique based on high-pass filtering which enables
+ effective utilization of pseudo labels for SSL in video action detection by
+ emphasizing the relevant activity region within a video. We evaluate the
+ proposed approach on three different benchmark datasets, UCF-101-24,
+ JHMDB-21, and Youtube-VOS. First, we demonstrate its effectiveness on video
+ action detection, where the proposed approach outperforms prior works in
+ semi-supervised and weakly-supervised learning along with several baseline
+ approaches on both UCF101-24 and JHMDB-21. Next, we also show its
+ effectiveness on Youtube-VOS for video object segmentation, demonstrating its
+ generalization capability to other dense prediction tasks in videos. The code
+ and models are publicly available at:
+ https://github.com/AKASH2907/semi-sup-active-learning.
+
+
+ comment: AAAI Conference on Artificial Intelligence, Main Technical Track + (AAAI), 2024, Code: https://github.com/AKASH2907/semi-sup-active-learning +
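+
+ fft-attention is only described at a high level above; as a loose
+ illustration of the high-pass filtering idea (our simplification, not the
+ paper's module), one can suppress low spatial frequencies of a feature map so
+ that responses concentrate on detailed, activity-relevant regions:
+
+ ```python
+ import torch
+
+ def highpass_attention(feat: torch.Tensor, cutoff: int = 4) -> torch.Tensor:
+     """feat: (C, H, W). Returns an (H, W) attention map from high frequencies."""
+     spec = torch.fft.fftshift(torch.fft.fft2(feat), dim=(-2, -1))
+     c, h, w = feat.shape
+     cy, cx = h // 2, w // 2
+     spec[:, cy - cutoff:cy + cutoff, cx - cutoff:cx + cutoff] = 0  # kill low freqs
+     high = torch.fft.ifft2(torch.fft.ifftshift(spec, dim=(-2, -1))).abs()
+     attn = high.mean(dim=0)
+     return attn / (attn.max() + 1e-8)
+
+ frame_feat = torch.randn(64, 56, 56)
+ print(highpass_attention(frame_feat).shape)          # torch.Size([56, 56])
+ ```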
+
+
+
+
+ + ♻ ☆ Text-Driven Image Editing via Learnable Regions CVPR 2024 + + +
+ Language has emerged as a natural interface for image editing. In this paper, +we introduce a method for region-based image editing driven by textual prompts, +without the need for user-provided masks or sketches. Specifically, our +approach leverages an existing pre-trained text-to-image model and introduces a +bounding box generator to identify the editing regions that are aligned with +the textual prompts. We show that this simple approach enables flexible editing +that is compatible with current image generation models, and is able to handle +complex prompts featuring multiple objects, complex sentences, or lengthy +paragraphs. We conduct an extensive user study to compare our method against +state-of-the-art methods. The experiments demonstrate the competitive +performance of our method in manipulating images with high fidelity and realism +that correspond to the provided language descriptions. Our project webpage can +be found at: https://yuanze-lin.me/LearnableRegions_page. + +
+
+ comment: Accepted to CVPR 2024 Project webpage: + https://yuanze-lin.me/LearnableRegions_page +
+
+
+
+
+ + ♻ ☆ SIGMA: Scale-Invariant Global Sparse Shape Matching + + +
+ We propose a novel mixed-integer programming (MIP) formulation for generating +precise sparse correspondences for highly non-rigid shapes. To this end, we +introduce a projected Laplace-Beltrami operator (PLBO) which combines intrinsic +and extrinsic geometric information to measure the deformation quality induced +by predicted correspondences. We integrate the PLBO, together with an +orientation-aware regulariser, into a novel MIP formulation that can be solved +to global optimality for many practical problems. In contrast to previous +methods, our approach is provably invariant to rigid transformations and global +scaling, initialisation-free, has optimality guarantees, and scales to high +resolution meshes with (empirically observed) linear time. We show +state-of-the-art results for sparse non-rigid matching on several challenging +3D datasets, including data with inconsistent meshing, as well as applications +in mesh-to-point-cloud matching. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ Conquering the Communication Constraints to Enable Large Pre-Trained + Models in Federated Learning + + +
+ Federated learning (FL) has emerged as a promising paradigm for enabling the
+collaborative training of models without centralized access to the raw data on
+local devices. In the typical FL paradigm (e.g., FedAvg), model weights are
+sent to and from the server each round to participating clients. Recently, the
+use of small pre-trained models has been shown to be effective in federated
+learning optimization and in improving convergence. However, recent
+state-of-the-art pre-trained models are getting more capable but also have
+more parameters. In conventional FL, sharing the enormous model weights can
+quickly put a massive communication burden on the system, especially if more
+capable models are employed. Can we find a solution to enable those strong and
+readily-available pre-trained models in FL to achieve excellent performance
+while simultaneously reducing the communication burden? To this end, we
+investigate the use of parameter-efficient fine-tuning in federated learning
+and thus introduce a new framework: FedPEFT. Specifically, we systematically
+evaluate the performance of FedPEFT across a variety of client stability, data
+distribution, and differential privacy settings. By only locally tuning and
+globally sharing a small portion of the model weights, significant reductions
+in the total communication overhead can be achieved while maintaining
+competitive or even better performance in a wide range of federated learning
+scenarios, providing insight into a new paradigm for practical and effective
+federated systems.
+
+
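As a rough illustration of the communication pattern described here (not the authors' FedPEFT code), the sketch below freezes a pre-trained backbone on each client and exchanges only a small trainable head in each federated round; the toy backbone, head size, and FedAvg aggregation are assumptions made for the example.

```python
import copy
import torch
import torch.nn as nn

class ClientModel(nn.Module):
    """Frozen pre-trained backbone plus a tiny trainable (and communicated) head."""
    def __init__(self, backbone: nn.Module, num_classes: int = 10):
        super().__init__()
        self.backbone = backbone
        for p in self.backbone.parameters():   # frozen, never sent over the network
            p.requires_grad = False
        self.peft_head = nn.Linear(64, num_classes)  # the only shared parameters

    def forward(self, x):
        return self.peft_head(self.backbone(x))

def peft_state(model: ClientModel) -> dict:
    return {k: v.detach().clone() for k, v in model.peft_head.state_dict().items()}

def federated_round(global_peft: dict, clients: list) -> dict:
    local_states = []
    for client in clients:
        client.peft_head.load_state_dict(global_peft)    # download small weights
        # ... local training of client.peft_head on private data goes here ...
        local_states.append(peft_state(client))          # upload small weights
    # FedAvg over the PEFT parameters only.
    return {k: torch.stack([s[k] for s in local_states]).mean(0) for k in global_peft}

backbone = nn.Sequential(nn.Flatten(), nn.Linear(32 * 32 * 3, 64), nn.ReLU())
clients = [ClientModel(copy.deepcopy(backbone)) for _ in range(4)]
global_peft = peft_state(clients[0])
global_peft = federated_round(global_peft, clients)
```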
+
+
+
+
+ + ♻ ☆ Towards Seamless Adaptation of Pre-trained Models for Visual Place + Recognition ICLR2024 + + +
+ Recent studies show that vision models pre-trained on generic visual learning
+tasks with large-scale data can provide useful feature representations for a
+wide range of visual perception problems. However, few attempts have been made
+to exploit pre-trained foundation models in visual place recognition (VPR).
+Due to the inherent difference in training objectives and data between the
+tasks of model pre-training and VPR, how to bridge the gap and fully unleash
+the capability of pre-trained models for VPR remains a key issue to address.
+To this end, we propose a novel method to realize seamless adaptation of
+pre-trained models for VPR. Specifically, to obtain both global and local
+features that focus on salient landmarks for discriminating places, we design
+a hybrid adaptation method to achieve both global and local adaptation
+efficiently, in which only lightweight adapters are tuned without adjusting
+the pre-trained model. Besides, to guide effective adaptation, we propose a
+mutual nearest neighbor local feature loss, which ensures proper dense local
+features are produced for local matching and avoids time-consuming spatial
+verification in re-ranking. Experimental results show that our method
+outperforms the state-of-the-art methods with less training data and training
+time, and uses only about 3% of the retrieval runtime of two-stage VPR methods
+with RANSAC-based spatial verification. It ranks 1st on the MSLS challenge
+leaderboard (at the time of submission). The code is released at
+https://github.com/Lu-Feng/SelaVPR.
+
+
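The mutual nearest neighbor idea behind the local feature loss can be illustrated with a few lines of matching code; the sketch below only shows mutual-nearest-neighbour pairing between two sets of (assumed L2-normalised) local descriptors, not SelaVPR's actual loss or adapters.

```python
import numpy as np

def mutual_nearest_neighbors(desc_a: np.ndarray, desc_b: np.ndarray):
    """Return index pairs (i, j) that are each other's nearest neighbour.

    desc_a: (N, D) and desc_b: (M, D) local descriptors, assumed L2-normalised.
    This is only the matching step implied by the loss, not the loss itself.
    """
    sim = desc_a @ desc_b.T                 # cosine similarity matrix
    nn_ab = sim.argmax(axis=1)              # best match in B for each row of A
    nn_ba = sim.argmax(axis=0)              # best match in A for each row of B
    return [(i, j) for i, j in enumerate(nn_ab) if nn_ba[j] == i]

a = np.random.randn(100, 128); a /= np.linalg.norm(a, axis=1, keepdims=True)
b = np.random.randn(120, 128); b /= np.linalg.norm(b, axis=1, keepdims=True)
matches = mutual_nearest_neighbors(a, b)    # dense matches without RANSAC re-ranking
```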
+
+ comment: ICLR2024 +
+
+
+
+
+ + ♻ ☆ NEAT: Distilling 3D Wireframes from Neural Attraction Fields CVPR 2024 + + +
+ This paper studies the problem of structured 3D reconstruction using
+wireframes that consist of line segments and junctions, focusing on the
+computation of structured boundary geometries of scenes. Instead of leveraging
+matching-based solutions from 2D wireframes (or line segments) for 3D
+wireframe reconstruction as done in prior art, we present NEAT, a
+rendering-distilling formulation using neural fields to represent 3D line
+segments with 2D observations, and bipartite matching for perceiving and
+distilling a sparse set of 3D global junctions. The proposed NEAT enjoys the
+joint optimization of the neural fields and the global junctions from scratch,
+using view-dependent 2D observations without precomputed cross-view feature
+matching. Comprehensive experiments on the DTU and BlendedMVS datasets
+demonstrate our NEAT's superiority over state-of-the-art alternatives for 3D
+wireframe reconstruction. Moreover, the 3D global junctions distilled by NEAT
+are a better initialization than SfM points for the recently-emerged 3D
+Gaussian Splatting for high-fidelity novel view synthesis, using about 20
+times fewer initial 3D points. Project page: \url{https://xuenan.net/neat}.
+
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ RadEdit: stress-testing biomedical vision models via diffusion image + editing + + +
+ Biomedical imaging datasets are often small and biased, meaning that +real-world performance of predictive models can be substantially lower than +expected from internal testing. This work proposes using generative image +editing to simulate dataset shifts and diagnose failure modes of biomedical +vision models; this can be used in advance of deployment to assess readiness, +potentially reducing cost and patient harm. Existing editing methods can +produce undesirable changes, with spurious correlations learned due to the +co-occurrence of disease and treatment interventions, limiting practical +applicability. To address this, we train a text-to-image diffusion model on +multiple chest X-ray datasets and introduce a new editing method RadEdit that +uses multiple masks, if present, to constrain changes and ensure consistency in +the edited images. We consider three types of dataset shifts: acquisition +shift, manifestation shift, and population shift, and demonstrate that our +approach can diagnose failures and quantify model robustness without additional +data collection, complementing more qualitative tools for explainable AI. + +
+
+
+
+
+ + ♻ ☆ DriftRec: Adapting diffusion models to blind JPEG restoration + + +
+ In this work, we utilize the high-fidelity generation abilities of diffusion +models to solve blind JPEG restoration at high compression levels. We propose +an elegant modification of the forward stochastic differential equation of +diffusion models to adapt them to this restoration task and name our method +DriftRec. Comparing DriftRec against an $L_2$ regression baseline with the same +network architecture and state-of-the-art techniques for JPEG restoration, we +show that our approach can escape the tendency of other methods to generate +blurry images, and recovers the distribution of clean images significantly more +faithfully. For this, only a dataset of clean/corrupted image pairs and no +knowledge about the corruption operation is required, enabling wider +applicability to other restoration tasks. In contrast to other conditional and +unconditional diffusion models, we utilize the idea that the distributions of +clean and corrupted images are much closer to each other than each is to the +usual Gaussian prior of the reverse process in diffusion models. Our approach +therefore requires only low levels of added noise and needs comparatively few +sampling steps even without further optimizations. We show that DriftRec +naturally generalizes to realistic and difficult scenarios such as unaligned +double JPEG compression and blind restoration of JPEGs found online, without +having encountered such examples during training. + +
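To make the modified forward process concrete, here is a toy Euler-Maruyama discretisation in which the mean drifts from the clean image toward its corrupted counterpart with only a small amount of added noise; the drift and noise coefficients are illustrative assumptions, not the paper's SDE.

```python
import numpy as np

def drift_forward_process(clean, corrupted, n_steps=50, gamma=1.5, sigma=0.05):
    """Toy Euler-Maruyama discretisation of a forward SDE whose mean drifts from
    the clean image toward the corrupted one, with only a little added noise.
    gamma and sigma are illustrative values, not the paper's parameterisation."""
    x = clean.copy()
    dt = 1.0 / n_steps
    trajectory = [x.copy()]
    for _ in range(n_steps):
        drift = gamma * (corrupted - x)                     # pull toward the corruption
        x = x + drift * dt + sigma * np.sqrt(dt) * np.random.randn(*x.shape)
        trajectory.append(x.copy())
    return trajectory                                       # ends near corrupted + small noise

clean = np.zeros((16, 16))
corrupted = 0.5 * np.ones((16, 16))   # stand-in for a JPEG-compressed image
states = drift_forward_process(clean, corrupted)
```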
+
+ comment: (C) 2024 IEEE. Personal use of this material is permitted. Permission + from IEEE must be obtained for all other uses, in any current or future + media, including reprinting/republishing this material for advertising or + promotional purposes, creating new collective works, for resale or + redistribution to servers or lists, or reuse of any copyrighted component of + this work in other works +
+
+
+
+
+ + ♻ ☆ Repurposing Diffusion-Based Image Generators for Monocular Depth + Estimation CVPR 2024 + + +
+ Monocular depth estimation is a fundamental computer vision task. Recovering +3D depth from a single image is geometrically ill-posed and requires scene +understanding, so it is not surprising that the rise of deep learning has led +to a breakthrough. The impressive progress of monocular depth estimators has +mirrored the growth in model capacity, from relatively modest CNNs to large +Transformer architectures. Still, monocular depth estimators tend to struggle +when presented with images with unfamiliar content and layout, since their +knowledge of the visual world is restricted by the data seen during training, +and challenged by zero-shot generalization to new domains. This motivates us to +explore whether the extensive priors captured in recent generative diffusion +models can enable better, more generalizable depth estimation. We introduce +Marigold, a method for affine-invariant monocular depth estimation that is +derived from Stable Diffusion and retains its rich prior knowledge. The +estimator can be fine-tuned in a couple of days on a single GPU using only +synthetic training data. It delivers state-of-the-art performance across a wide +range of datasets, including over 20% performance gains in specific cases. +Project page: https://marigoldmonodepth.github.io. + +
+
+ comment: CVPR 2024 camera ready +
+
+
+
+
+ + ♻ ☆ Learnable Weight Initialization for Volumetric Medical Image + Segmentation + + +
+ Hybrid volumetric medical image segmentation models, combining the advantages +of local convolution and global attention, have recently received considerable +attention. While mainly focusing on architectural modifications, most existing +hybrid approaches still use conventional data-independent weight initialization +schemes which restrict their performance due to ignoring the inherent +volumetric nature of the medical data. To address this issue, we propose a +learnable weight initialization approach that utilizes the available medical +training data to effectively learn the contextual and structural cues via the +proposed self-supervised objectives. Our approach is easy to integrate into any +hybrid model and requires no external training data. Experiments on multi-organ +and lung cancer segmentation tasks demonstrate the effectiveness of our +approach, leading to state-of-the-art segmentation performance. Our proposed +data-dependent initialization approach performs favorably as compared to the +Swin-UNETR model pretrained using large-scale datasets on multi-organ +segmentation task. Our source code and models are available at: +https://github.com/ShahinaKK/LWI-VMS. + +
+
+ comment: Accepted at Elsevier AI in Medicine Journal +
+
+
+
+
+ + ♻ ☆ Evaluating GPT-4 with Vision on Detection of Radiological Findings on + Chest Radiographs + + +
+ The study examines the application of GPT-4V, a multi-modal large language +model equipped with visual recognition, in detecting radiological findings from +a set of 100 chest radiographs and suggests that GPT-4V is currently not ready +for real-world diagnostic usage in interpreting chest radiographs. + +
+
+
+
+
+ + ♻ ☆ ResNet with Integrated Convolutional Block Attention Module for Ship + Classification Using Transfer Learning on Optical Satellite Imagery + + +
+ This study proposes a novel transfer learning framework for effective ship
+classification using high-resolution optical remote sensing satellite imagery.
+The framework is based on the deep convolutional neural network model ResNet50
+and incorporates the Convolutional Block Attention Module (CBAM) to enhance
+performance. CBAM enables the model to attend to salient features in the
+images, allowing it to better discriminate subtle differences between ships
+and backgrounds. Furthermore, this study adopts a transfer learning approach
+tailored for accurately classifying diverse types of ships by fine-tuning a
+pre-trained model for the specific task. Experimental results demonstrate the
+efficacy of the proposed framework in ship classification using optical remote
+sensing imagery, achieving a high classification accuracy of 94% across 5
+classes and outperforming existing methods. This research holds potential
+applications in maritime surveillance and management, illegal fishing
+detection, and maritime traffic monitoring.
+
+
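For readers unfamiliar with CBAM, the following is a compact, standard implementation of the channel-then-spatial attention block with common default hyperparameters; where the block is inserted inside ResNet50 in this work is not specified here.

```python
import torch
import torch.nn as nn

class CBAM(nn.Module):
    """Standard Convolutional Block Attention Module (channel then spatial attention).

    Reduction ratio and kernel size follow common defaults; the placement inside
    ResNet50 is a design choice not reproduced here.
    """
    def __init__(self, channels: int, reduction: int = 16, kernel_size: int = 7):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(channels, channels // reduction), nn.ReLU(),
            nn.Linear(channels // reduction, channels),
        )
        self.spatial = nn.Conv2d(2, 1, kernel_size, padding=kernel_size // 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, c, _, _ = x.shape
        # Channel attention from average- and max-pooled descriptors.
        avg = self.mlp(x.mean(dim=(2, 3)))
        mx = self.mlp(x.amax(dim=(2, 3)))
        x = x * torch.sigmoid(avg + mx).view(b, c, 1, 1)
        # Spatial attention from channel-wise average and max maps.
        s = torch.cat([x.mean(1, keepdim=True), x.amax(1, keepdim=True)], dim=1)
        return x * torch.sigmoid(self.spatial(s))

features = torch.randn(2, 256, 14, 14)      # e.g. an intermediate ResNet feature map
attended = CBAM(256)(features)              # same shape, re-weighted by attention
```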
+
+
+
+
+ + ♻ ☆ ReCoRe: Regularized Contrastive Representation Learning of World Model CVPR 2024 + + +
+ While recent model-free Reinforcement Learning (RL) methods have demonstrated +human-level effectiveness in gaming environments, their success in everyday +tasks like visual navigation has been limited, particularly under significant +appearance variations. This limitation arises from (i) poor sample efficiency +and (ii) over-fitting to training scenarios. To address these challenges, we +present a world model that learns invariant features using (i) contrastive +unsupervised learning and (ii) an intervention-invariant regularizer. Learning +an explicit representation of the world dynamics i.e. a world model, improves +sample efficiency while contrastive learning implicitly enforces learning of +invariant features, which improves generalization. However, the na\"ive +integration of contrastive loss to world models is not good enough, as +world-model-based RL methods independently optimize representation learning and +agent policy. To overcome this issue, we propose an intervention-invariant +regularizer in the form of an auxiliary task such as depth prediction, image +denoising, image segmentation, etc., that explicitly enforces invariance to +style interventions. Our method outperforms current state-of-the-art +model-based and model-free RL methods and significantly improves on +out-of-distribution point navigation tasks evaluated on the iGibson benchmark. +With only visual observations, we further demonstrate that our approach +outperforms recent language-guided foundation models for point navigation, +which is essential for deployment on robots with limited computation +capabilities. Finally, we demonstrate that our proposed model excels at the +sim-to-real transfer of its perception module on the Gibson benchmark. + +
+
+ comment: Accepted at CVPR 2024. arXiv admin note: text overlap with + arXiv:2209.14932 +
+
+
+
+
+ + ♻ ☆ AGFSync: Leveraging AI-Generated Feedback for Preference Optimization in + Text-to-Image Generation + + +
+ Text-to-Image (T2I) diffusion models have achieved remarkable success in
+image generation. Despite their progress, challenges remain in
+prompt-following ability, image quality, and the lack of high-quality
+datasets, all of which are essential for refining these models. As acquiring
+labeled data is costly, we introduce AGFSync, a framework that enhances T2I
+diffusion models through Direct Preference Optimization (DPO) in a fully
+AI-driven approach. AGFSync utilizes Vision-Language Models (VLM) to assess
+image quality across style, coherence, and aesthetics, generating feedback
+data within an AI-driven loop. By applying AGFSync to leading T2I models such
+as SD v1.4, v1.5, and SDXL, our extensive experiments on the TIFA dataset
+demonstrate notable improvements in VQA scores, aesthetic evaluations, and
+performance on the HPSv2 benchmark, consistently outperforming the base
+models. AGFSync's method of refining T2I diffusion models paves the way for
+scalable alignment techniques.
+
+
+
+
+
+
+ + ♻ ☆ FreeMan: Towards Benchmarking 3D Human Pose Estimation under Real-World + Conditions CVPR2024 + + +
+ Estimating the 3D structure of the human body from natural scenes is a
+fundamental aspect of visual perception. 3D human pose estimation is a vital
+step in advancing fields like AIGC and human-robot interaction, serving as a
+crucial technique for understanding and interacting with human actions in
+real-world settings. However, the current datasets, often collected under
+single laboratory conditions using complex motion capture equipment and
+unvarying backgrounds, are insufficient. The absence of datasets covering
+variable conditions is stalling the progress of this crucial task. To
+facilitate the development of 3D pose estimation, we present FreeMan, the
+first large-scale, multi-view dataset collected under real-world conditions.
+FreeMan was captured by synchronizing 8 smartphones across diverse scenarios.
+It comprises 11M frames from 8000 sequences, viewed from different
+perspectives. These sequences cover 40 subjects across 10 different scenarios,
+each with varying lighting conditions. We have also established a
+semi-automated pipeline with error detection to reduce the workload of manual
+checking and to ensure precise annotation. We provide comprehensive evaluation
+baselines for a range of tasks, underlining the significant challenges posed
+by FreeMan. Further evaluations of standard indoor/outdoor human sensing
+datasets reveal that FreeMan offers robust representation transferability in
+real and complex scenes. Code and data are available at
+https://wangjiongw.github.io/freeman.
+
+
+
+ comment: CVPR2024 camera ready version. 19 pages, 16 figures. Project page: + https://wangjiongw.github.io/freeman/ ; API: + https://github.com/wangjiongw/FreeMan_API +
+
+
+
+
+ + ♻ ☆ eWand: A calibration framework for wide baseline frame-based and + event-based camera systems ICRA 2024 + + +
+ Accurate calibration is crucial for using multiple cameras to triangulate the +position of objects precisely. However, it is also a time-consuming process +that needs to be repeated for every displacement of the cameras. The standard +approach is to use a printed pattern with known geometry to estimate the +intrinsic and extrinsic parameters of the cameras. The same idea can be applied +to event-based cameras, though it requires extra work. By using frame +reconstruction from events, a printed pattern can be detected. A blinking +pattern can also be displayed on a screen. Then, the pattern can be directly +detected from the events. Such calibration methods can provide accurate +intrinsic calibration for both frame- and event-based cameras. However, using +2D patterns has several limitations for multi-camera extrinsic calibration, +with cameras possessing highly different points of view and a wide baseline. +The 2D pattern can only be detected from one direction and needs to be of +significant size to compensate for its distance to the camera. This makes the +extrinsic calibration time-consuming and cumbersome. To overcome these +limitations, we propose eWand, a new method that uses blinking LEDs inside +opaque spheres instead of a printed or displayed pattern. Our method provides a +faster, easier-to-use extrinsic calibration approach that maintains high +accuracy for both event- and frame-based cameras. + +
+
+ comment: Accepted for 2024 IEEE International Conference on Robotics and + Automation (ICRA 2024). Project web page: + https://cogsys-tuebingen.github.io/ewand/ +
+
+
+
+
+ + ♻ ☆ Hallucination Benchmark in Medical Visual Question Answering ICLR 2024 + + +
+ The recent success of large language and vision models (LLVMs) on vision
+question answering (VQA), particularly their applications in medicine
+(Med-VQA), has shown great potential for realizing effective visual assistants
+for healthcare. However, these models have not been extensively tested on the
+hallucination phenomenon in clinical settings. Here, we created a
+hallucination benchmark of medical images paired with question-answer sets and
+conducted a comprehensive evaluation of the state-of-the-art models. The study
+provides an in-depth analysis of current models' limitations and reveals the
+effectiveness of various prompting strategies.
+
+
+
+ comment: Accepted to ICLR 2024 Tiny Papers(Notable) +
+
+
+
+
+ + ♻ ☆ LLaFS: When Large Language Models Meet Few-Shot Segmentation CVPR2024 + + +
+ This paper proposes LLaFS, the first attempt to leverage large language +models (LLMs) in few-shot segmentation. In contrast to the conventional +few-shot segmentation methods that only rely on the limited and biased +information from the annotated support images, LLaFS leverages the vast prior +knowledge gained by LLM as an effective supplement and directly uses the LLM to +segment images in a few-shot manner. To enable the text-based LLM to handle +image-related tasks, we carefully design an input instruction that allows the +LLM to produce segmentation results represented as polygons, and propose a +region-attribute table to simulate the human visual mechanism and provide +multi-modal guidance. We also synthesize pseudo samples and use curriculum +learning for pretraining to augment data and achieve better optimization. LLaFS +achieves state-of-the-art results on multiple datasets, showing the potential +of using LLMs for few-shot computer vision tasks. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ DETRs Beat YOLOs on Real-time Object Detection + + +
+ The YOLO series has become the most popular framework for real-time object +detection due to its reasonable trade-off between speed and accuracy. However, +we observe that the speed and accuracy of YOLOs are negatively affected by the +NMS. Recently, end-to-end Transformer-based detectors (DETRs) have provided an +alternative to eliminating NMS. Nevertheless, the high computational cost +limits their practicality and hinders them from fully exploiting the advantage +of excluding NMS. In this paper, we propose the Real-Time DEtection TRansformer +(RT-DETR), the first real-time end-to-end object detector to our best knowledge +that addresses the above dilemma. We build RT-DETR in two steps, drawing on the +advanced DETR: first we focus on maintaining accuracy while improving speed, +followed by maintaining speed while improving accuracy. Specifically, we design +an efficient hybrid encoder to expeditiously process multi-scale features by +decoupling intra-scale interaction and cross-scale fusion to improve speed. +Then, we propose the uncertainty-minimal query selection to provide +high-quality initial queries to the decoder, thereby improving accuracy. In +addition, RT-DETR supports flexible speed tuning by adjusting the number of +decoder layers to adapt to various scenarios without retraining. Our +RT-DETR-R50 / R101 achieves 53.1% / 54.3% AP on COCO and 108 / 74 FPS on T4 +GPU, outperforming previously advanced YOLOs in both speed and accuracy. We +also develop scaled RT-DETRs that outperform the lighter YOLO detectors (S and +M models). Furthermore, RT-DETR-R50 outperforms DINO-R50 by 2.2% AP in accuracy +and about 21 times in FPS. After pre-training with Objects365, RT-DETR-R50 / +R101 achieves 55.3% / 56.2% AP. The project page: +https://zhao-yian.github.io/RTDETR. + +
+
+
+
+
+ + ♻ ☆ Creating Ensembles of Classifiers through UMDA for Aerial Scene + Classification GECCO2024 + + +
+ Aerial scene classification, which aims to semantically label remote sensing
+images with a set of predefined classes (e.g., agricultural, beach, and
+harbor), is a very challenging task in remote sensing due to high intra-class
+variability and the different scales and orientations of the objects present
+in the dataset images. In the remote sensing area, the use of CNN
+architectures is also an established alternative for scene classification
+tasks. Generally, these CNNs are used to perform the traditional image
+classification task. However, another, less explored, way to classify remote
+sensing images is to use deep metric learning (DML) approaches. In this sense,
+this work proposes to employ six DML approaches for aerial scene
+classification tasks, analysing their behaviour with four different
+pre-trained CNNs as well as combining them through the use of an evolutionary
+computation algorithm (UMDA). In the performed experiments, it is possible to
+observe that DML approaches can achieve the best classification results when
+compared to traditional pre-trained CNNs on three well-known remote sensing
+aerial scene datasets. In addition, the UMDA algorithm proved to be a
+promising strategy for combining DML approaches when there is diversity among
+them, improving classification accuracy by at least 5.6% while using almost
+50% of the available classifiers to construct the final ensemble.
+
+
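UMDA itself is simple to sketch: it maintains independent per-classifier inclusion probabilities and re-estimates them from the elite individuals each generation. The toy example below selects a subset of classifiers that maximises majority-vote validation accuracy; the population sizes, probability clipping, and random fitness data are illustrative assumptions rather than the paper's setup.

```python
import numpy as np

rng = np.random.default_rng(0)

def umda_select(fitness_fn, n_classifiers, pop=50, elite=10, iters=30):
    """Univariate Marginal Distribution Algorithm over binary inclusion masks.

    Each bit decides whether a classifier joins the ensemble; per-bit
    probabilities are re-estimated from the elite individuals each generation.
    """
    p = np.full(n_classifiers, 0.5)                 # independent Bernoulli model
    best, best_fit = None, -np.inf
    for _ in range(iters):
        population = (rng.random((pop, n_classifiers)) < p).astype(int)
        scores = np.array([fitness_fn(ind) for ind in population])
        elite_idx = np.argsort(scores)[-elite:]
        p = population[elite_idx].mean(axis=0).clip(0.05, 0.95)  # keep some diversity
        if scores[elite_idx[-1]] > best_fit:
            best, best_fit = population[elite_idx[-1]], scores[elite_idx[-1]]
    return best

# Toy fitness: validation accuracy of a majority vote over the selected classifiers.
val_preds = rng.integers(0, 2, size=(12, 200))      # 12 classifiers x 200 samples
val_labels = rng.integers(0, 2, size=200)

def fitness(mask):
    if mask.sum() == 0:
        return 0.0
    vote = (val_preds[mask.astype(bool)].mean(axis=0) > 0.5).astype(int)
    return float((vote == val_labels).mean())

chosen = umda_select(fitness, n_classifiers=12)
```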
+
+ comment: 9 pages, 4 figures, accepted for presentation at the GECCO2024 +
+
+
+
+
+ + ♻ ☆ LYT-Net: Lightweight YUV Transformer-based Network for Low-Light Image + Enhancement ICIP + + +
+ In recent years, deep learning-based solutions have proven successful in the +domains of image enhancement. This paper introduces LYT-Net, or Lightweight YUV +Transformer-based Network, as a novel approach for low-light image enhancement. +The proposed architecture, distinct from conventional Retinex-based models, +leverages the YUV color space's natural separation of luminance (Y) and +chrominance (U and V) to simplify the intricate task of disentangling light and +color information in images. By utilizing the strengths of transformers, known +for their capability to capture long-range dependencies, LYT-Net ensures a +comprehensive contextual understanding of the image while maintaining reduced +model complexity. By employing a novel hybrid loss function, our proposed +method achieves state-of-the-art results on low-light image enhancement +datasets, all while being considerably more compact than its counterparts. The +source code and pre-trained models are available at +https://github.com/albrateanu/LYT-Net + +
+
+ comment: 10 pages, 6 figures, submitted to ICIP +
+
+
+
+
+ + ♻ ☆ Mind The Edge: Refining Depth Edges in Sparsely-Supervised Monocular + Depth Estimation CVPR24 + + +
+ Monocular Depth Estimation (MDE) is a fundamental problem in computer vision +with numerous applications. Recently, LIDAR-supervised methods have achieved +remarkable per-pixel depth accuracy in outdoor scenes. However, significant +errors are typically found in the proximity of depth discontinuities, i.e., +depth edges, which often hinder the performance of depth-dependent applications +that are sensitive to such inaccuracies, e.g., novel view synthesis and +augmented reality. Since direct supervision for the location of depth edges is +typically unavailable in sparse LIDAR-based scenes, encouraging the MDE model +to produce correct depth edges is not straightforward. To the best of our +knowledge this paper is the first attempt to address the depth edges issue for +LIDAR-supervised scenes. In this work we propose to learn to detect the +location of depth edges from densely-supervised synthetic data, and use it to +generate supervision for the depth edges in the MDE training. To quantitatively +evaluate our approach, and due to the lack of depth edges GT in LIDAR-based +scenes, we manually annotated subsets of the KITTI and the DDAD datasets with +depth edges ground truth. We demonstrate significant gains in the accuracy of +the depth edges with comparable per-pixel depth accuracy on several challenging +datasets. Code and datasets are available at +\url{https://github.com/liortalker/MindTheEdge}. + +
+
+ comment: Appears in CVPR'24
+
+
+
+
+
+ + ♻ ☆ CV-Attention UNet: Attention-based UNet for 3D Cerebrovascular + Segmentation of Enhanced TOF-MRA Images + + +
+ Due to the lack of automated methods to diagnose cerebrovascular disease,
+time-of-flight magnetic resonance angiography (TOF-MRA) is assessed visually,
+making it time-consuming. The commonly used encoder-decoder architectures for
+cerebrovascular segmentation utilize redundant features, eventually leading to
+the extraction of low-level features multiple times. Additionally,
+convolutional neural networks (CNNs) suffer from performance degradation when
+the batch size is small, and deeper networks experience the vanishing gradient
+problem. Methods: In this paper, we attempt to address these limitations and
+propose the 3D cerebrovascular attention UNet method, named CV-AttentionUNet,
+for the precise extraction of brain vessel images. We propose a sequence of
+preprocessing techniques followed by a deeply supervised UNet to improve the
+accuracy of segmentation of the brain vessels leading to a stroke. To combine
+the low and high semantics, we apply an attention mechanism that focuses on
+relevant associations and neglects irrelevant anatomical information.
+Furthermore, the inclusion of deep supervision incorporates different levels
+of features that prove to be beneficial for network convergence. Results: We
+demonstrate the efficiency of the proposed method by cross-validating with an
+unlabeled dataset, which was further labeled by us. We believe that the
+novelty of this algorithm lies in its ability to perform well on both labeled
+and unlabeled data with image processing-based enhancement. The results
+indicate that our method performed better than the existing state-of-the-art
+methods on the TubeTK dataset. Conclusion: The proposed method will help in
+the accurate segmentation of cerebrovascular structures leading to stroke.
+
+
+
+
+
+
+ + ♻ ☆ From Isolated Islands to Pangea: Unifying Semantic Space for Human + Action Understanding CVPR 2024 + + +
+ Action understanding has attracted long-term attention. It can be formed as +the mapping from the physical space to the semantic space. Typically, +researchers built datasets according to idiosyncratic choices to define classes +and push the envelope of benchmarks respectively. Datasets are incompatible +with each other like "Isolated Islands" due to semantic gaps and various class +granularities, e.g., do housework in dataset A and wash plate in dataset B. We +argue that we need a more principled semantic space to concentrate the +community efforts and use all datasets together to pursue generalizable action +learning. To this end, we design a structured action semantic space given verb +taxonomy hierarchy and covering massive actions. By aligning the classes of +previous datasets to our semantic space, we gather (image/video/skeleton/MoCap) +datasets into a unified database in a unified label system, i.e., bridging +"isolated islands" into a "Pangea". Accordingly, we propose a novel model +mapping from the physical space to semantic space to fully use Pangea. In +extensive experiments, our new system shows significant superiority, especially +in transfer learning. Our code and data will be made public at +https://mvig-rhos.com/pangea. + +
+
+ comment: CVPR 2024, Project Webpage: https://mvig-rhos.com/pangea +
+
+
+
+
+ + ♻ ☆ RDumb: A simple approach that questions our progress in continual + test-time adaptation + + +
+ Test-Time Adaptation (TTA) allows to update pre-trained models to changing +data distributions at deployment time. While early work tested these algorithms +for individual fixed distribution shifts, recent work proposed and applied +methods for continual adaptation over long timescales. To examine the reported +progress in the field, we propose the Continually Changing Corruptions (CCC) +benchmark to measure asymptotic performance of TTA techniques. We find that +eventually all but one state-of-the-art methods collapse and perform worse than +a non-adapting model, including models specifically proposed to be robust to +performance collapse. In addition, we introduce a simple baseline, "RDumb", +that periodically resets the model to its pretrained state. RDumb performs +better or on par with the previously proposed state-of-the-art in all +considered benchmarks. Our results show that previous TTA approaches are +neither effective at regularizing adaptation to avoid collapse nor able to +outperform a simplistic resetting strategy. + +
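The resetting baseline is easy to picture in code; the sketch below wraps an arbitrary test-time adaptation step and periodically restores the pretrained weights. The reset interval and the no-op adaptation step are placeholders, not the paper's exact configuration.

```python
import copy
import torch
import torch.nn as nn

def reset_tta_loop(model: nn.Module, batches, adapt_step, reset_every: int = 1000):
    """Adapt at test time, but periodically restore the pretrained weights so
    long-horizon adaptation cannot collapse. `adapt_step` stands in for any
    entropy-minimisation style TTA update; the interval is a placeholder."""
    pristine = copy.deepcopy(model.state_dict())
    for step, batch in enumerate(batches):
        if step > 0 and step % reset_every == 0:
            model.load_state_dict(pristine)          # the periodic reset
        adapt_step(model, batch)                     # e.g. a Tent-style update
        yield model(batch)

# Minimal usage with a toy model and a no-op adaptation step.
model = nn.Linear(8, 3)
stream = (torch.randn(4, 8) for _ in range(10))
outputs = list(reset_tta_loop(model, stream, adapt_step=lambda m, b: None, reset_every=5))
```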
+
+
+
+
+ + ♻ ☆ Task-conditioned adaptation of visual features in multi-task policy + learning + + +
+ Successfully addressing a wide variety of tasks is a core ability of +autonomous agents, requiring flexibly adapting the underlying decision-making +strategies and, as we argue in this work, also adapting the perception modules. +An analogical argument would be the human visual system, which uses top-down +signals to focus attention determined by the current task. Similarly, we adapt +pre-trained large vision models conditioned on specific downstream tasks in the +context of multi-task policy learning. We introduce task-conditioned adapters +that do not require finetuning any pre-trained weights, combined with a single +policy trained with behavior cloning and capable of addressing multiple tasks. +We condition the visual adapters on task embeddings, which can be selected at +inference if the task is known, or alternatively inferred from a set of example +demonstrations. To this end, we propose a new optimization-based estimator. We +evaluate the method on a wide variety of tasks from the CortexBench benchmark +and show that, compared to existing work, it can be addressed with a single +policy. In particular, we demonstrate that adapting visual features is a key +design choice and that the method generalizes to unseen tasks given a few +demonstrations. + +
+
+
+
+
+ + ♻ ☆ Robustness Assessment of a Runway Object Classifier for Safe Aircraft + Taxiing SC + + +
+ As deep neural networks (DNNs) are becoming the prominent solution for many +computational problems, the aviation industry seeks to explore their potential +in alleviating pilot workload and in improving operational safety. However, the +use of DNNs in this type of safety-critical applications requires a thorough +certification process. This need can be addressed through formal verification, +which provides rigorous assurances -- e.g.,~by proving the absence of certain +mispredictions. In this case-study paper, we demonstrate this process using an +image-classifier DNN currently under development at Airbus and intended for use +during the aircraft taxiing phase. We use formal methods to assess this DNN's +robustness to three common image perturbation types: noise, brightness and +contrast, and some of their combinations. This process entails multiple +invocations of the underlying verifier, which might be computationally +expensive; and we therefore propose a method that leverages the monotonicity of +these robustness properties, as well as the results of past verification +queries, in order to reduce the overall number of verification queries required +by nearly 60%. Our results provide an indication of the level of robustness +achieved by the DNN classifier under study, and indicate that it is +considerably more vulnerable to noise than to brightness or contrast +perturbations. + +
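One simple way to exploit the monotonicity of such robustness properties (a sketch of the general idea only, not the paper's full query-reuse scheme) is a binary search over perturbation magnitudes, so each expensive verifier call halves the remaining range:

```python
def max_robust_level(is_robust, levels):
    """Binary search over sorted perturbation levels. Under the monotonicity
    assumption (robust at a level implies robust at every smaller level), each
    expensive verifier call halves the remaining range instead of testing all."""
    lo, hi, best = 0, len(levels) - 1, None
    while lo <= hi:
        mid = (lo + hi) // 2
        if is_robust(levels[mid]):       # one call to the (expensive) verifier
            best, lo = levels[mid], mid + 1
        else:
            hi = mid - 1
    return best

# Toy verifier: the classifier is robust up to a noise level of roughly 0.03.
levels = [0.005 * k for k in range(1, 21)]
print(max_robust_level(lambda eps: eps <= 0.03, levels))   # largest verified-robust level
```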
+
+ comment: This is a preprint version of the paper in the proceedings of 43rd + Digital Avionics Systems Conference (DASC) +
+
+
+
+
+ + ♻ ☆ Long-term Frame-Event Visual Tracking: Benchmark Dataset and Baseline + + +
+ Current event-/frame-event based trackers undergo evaluation on short-term +tracking datasets, however, the tracking of real-world scenarios involves +long-term tracking, and the performance of existing tracking algorithms in +these scenarios remains unclear. In this paper, we first propose a new +long-term and large-scale frame-event single object tracking dataset, termed +FELT. It contains 742 videos and 1,594,474 RGB frames and event stream pairs +and has become the largest frame-event tracking dataset to date. We re-train +and evaluate 15 baseline trackers on our dataset for future works to compare. +More importantly, we find that the RGB frames and event streams are naturally +incomplete due to the influence of challenging factors and spatially sparse +event flow. In response to this, we propose a novel associative memory +Transformer network as a unified backbone by introducing modern Hopfield layers +into multi-head self-attention blocks to fuse both RGB and event data. +Extensive experiments on RGB-Event (FELT), RGB-Thermal (RGBT234, LasHeR), and +RGB-Depth (DepthTrack) datasets fully validated the effectiveness of our model. +The dataset and source code can be found at +\url{https://github.com/Event-AHU/FELT_SOT_Benchmark}. + +
+
+ comment: In Peer Review +
+
+
+
+
+ + ♻ ☆ Advancing Ante-Hoc Explainable Models through Generative Adversarial + Networks AAAI 2024 + + +
+ This paper presents a novel concept learning framework for enhancing model +interpretability and performance in visual classification tasks. Our approach +appends an unsupervised explanation generator to the primary classifier network +and makes use of adversarial training. During training, the explanation module +is optimized to extract visual concepts from the classifier's latent +representations, while the GAN-based module aims to discriminate images +generated from concepts, from true images. This joint training scheme enables +the model to implicitly align its internally learned concepts with +human-interpretable visual properties. Comprehensive experiments demonstrate +the robustness of our approach, while producing coherent concept activations. +We analyse the learned concepts, showing their semantic concordance with object +parts and visual attributes. We also study how perturbations in the adversarial +training protocol impact both classification and concept acquisition. In +summary, this work presents a significant step towards building inherently +interpretable deep vision models with task-aligned concept representations - a +key enabler for developing trustworthy AI for real-world perception tasks. + +
+
+ comment: Paper accepted in Human-Centric Representation Learning workshop at + AAAI 2024 (https://hcrl-workshop.github.io/2024/). Paper accepted and + presented at Deployable AI Workshop at AAAI-2024 + (https://sites.google.com/view/dai-2024/home) +
+
+
+
+
+ + ♻ ☆ RAVE: Residual Vector Embedding for CLIP-Guided Backlit Image + Enhancement + + +
+ In this paper we propose a novel modification of Contrastive Language-Image +Pre-Training (CLIP) guidance for the task of unsupervised backlit image +enhancement. Our work builds on the state-of-the-art CLIP-LIT approach, which +learns a prompt pair by constraining the text-image similarity between a prompt +(negative/positive sample) and a corresponding image (backlit image/well-lit +image) in the CLIP embedding space. Learned prompts then guide an image +enhancement network. Based on the CLIP-LIT framework, we propose two novel +methods for CLIP guidance. First, we show that instead of tuning prompts in the +space of text embeddings, it is possible to directly tune their embeddings in +the latent space without any loss in quality. This accelerates training and +potentially enables the use of additional encoders that do not have a text +encoder. Second, we propose a novel approach that does not require any prompt +tuning. Instead, based on CLIP embeddings of backlit and well-lit images from +training data, we compute the residual vector in the embedding space as a +simple difference between the mean embeddings of the well-lit and backlit +images. This vector then guides the enhancement network during training, +pushing a backlit image towards the space of well-lit images. This approach +further dramatically reduces training time, stabilizes training and produces +high quality enhanced images without artifacts, both in supervised and +unsupervised training regimes. Additionally, we show that residual vectors can +be interpreted, revealing biases in training data, and thereby enabling +potential bias correction. + +
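The prompt-free variant described here reduces to a few lines once CLIP embeddings are available; the sketch below computes the residual vector as the difference of mean embeddings and scores an enhanced image by its projection onto that direction. The embedding dimension, normalisation, and scoring function are assumptions for illustration.

```python
import numpy as np

# Pre-computed CLIP image embeddings of well-lit and backlit training images
# (random stand-ins here), assumed L2-normalised. The guidance direction is
# simply the difference of their means; no prompt tuning is involved.
well_lit = np.random.randn(500, 512)
well_lit /= np.linalg.norm(well_lit, axis=1, keepdims=True)
backlit = np.random.randn(500, 512)
backlit /= np.linalg.norm(backlit, axis=1, keepdims=True)

residual = well_lit.mean(axis=0) - backlit.mean(axis=0)
residual /= np.linalg.norm(residual)

def guidance_score(enhanced_embedding: np.ndarray) -> float:
    """Higher when an enhanced image moves along the backlit -> well-lit direction;
    a training loss could maximise this (the exact loss is an assumption here)."""
    return float(enhanced_embedding @ residual)
```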
+
+
+
+
+ + ♻ ☆ MeciFace: Mechanomyography and Inertial Fusion-based Glasses for Edge + Real-Time Recognition of Facial and Eating Activities + + +
+ The increasing prevalence of stress-related eating behaviors and their impact +on overall health highlights the importance of effective and ubiquitous +monitoring systems. In this paper, we present MeciFace, an innovative wearable +technology designed to monitor facial expressions and eating activities in +real-time on-the-edge (RTE). MeciFace aims to provide a low-power, +privacy-conscious, and highly accurate tool for promoting healthy eating +behaviors and stress management. We employ lightweight convolutional neural +networks as backbone models for facial expression and eating monitoring +scenarios. The MeciFace system ensures efficient data processing with a tiny +memory footprint, ranging from 11KB to 19 KB. During RTE evaluation, the system +achieves an F1-score of < 86% for facial expression recognition and 94% for +eating/drinking monitoring, for the RTE of unseen users (user-independent +case). + +
+
+ comment: Submitted to IEEE Transactions on Consumer Electronics +
+
+
+
+
+ + ♻ ☆ Language Guided Domain Generalized Medical Image Segmentation + + +
+ Single source domain generalization (SDG) holds promise for more reliable and +consistent image segmentation across real-world clinical settings particularly +in the medical domain, where data privacy and acquisition cost constraints +often limit the availability of diverse datasets. Depending solely on visual +features hampers the model's capacity to adapt effectively to various domains, +primarily because of the presence of spurious correlations and domain-specific +characteristics embedded within the image features. Incorporating text features +alongside visual features is a potential solution to enhance the model's +understanding of the data, as it goes beyond pixel-level information to provide +valuable context. Textual cues describing the anatomical structures, their +appearances, and variations across various imaging modalities can guide the +model in domain adaptation, ultimately contributing to more robust and +consistent segmentation. In this paper, we propose an approach that explicitly +leverages textual information by incorporating a contrastive learning mechanism +guided by the text encoder features to learn a more robust feature +representation. We assess the effectiveness of our text-guided contrastive +feature alignment technique in various scenarios, including cross-modality, +cross-sequence, and cross-site settings for different segmentation tasks. Our +approach achieves favorable performance against existing methods in literature. +Our code and model weights are available at +https://github.com/ShahinaKK/LG_SDG.git. + +
+
+ comment: Accepted at ISBI2024 +
+
+
+
+
+ + ♻ ☆ Scaling Up to Excellence: Practicing Model Scaling for Photo-Realistic + Image Restoration In the Wild CVPR 2024 + + +
+ We introduce SUPIR (Scaling-UP Image Restoration), a groundbreaking image +restoration method that harnesses generative prior and the power of model +scaling up. Leveraging multi-modal techniques and advanced generative prior, +SUPIR marks a significant advance in intelligent and realistic image +restoration. As a pivotal catalyst within SUPIR, model scaling dramatically +enhances its capabilities and demonstrates new potential for image restoration. +We collect a dataset comprising 20 million high-resolution, high-quality images +for model training, each enriched with descriptive text annotations. SUPIR +provides the capability to restore images guided by textual prompts, broadening +its application scope and potential. Moreover, we introduce negative-quality +prompts to further improve perceptual quality. We also develop a +restoration-guided sampling method to suppress the fidelity issue encountered +in generative-based restoration. Experiments demonstrate SUPIR's exceptional +restoration effects and its novel capacity to manipulate restoration through +textual prompts. + +
+
+ comment: This paper has been accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Analysis of Video Quality Datasets via Design of Minimalistic Video + Quality Models + + +
+ Blind video quality assessment (BVQA) plays an indispensable role in +monitoring and improving the end-users' viewing experience in various +real-world video-enabled media applications. As an experimental field, the +improvements of BVQA models have been measured primarily on a few human-rated +VQA datasets. Thus, it is crucial to gain a better understanding of existing +VQA datasets in order to properly evaluate the current progress in BVQA. +Towards this goal, we conduct a first-of-its-kind computational analysis of VQA +datasets via designing minimalistic BVQA models. By minimalistic, we restrict +our family of BVQA models to build only upon basic blocks: a video preprocessor +(for aggressive spatiotemporal downsampling), a spatial quality analyzer, an +optional temporal quality analyzer, and a quality regressor, all with the +simplest possible instantiations. By comparing the quality prediction +performance of different model variants on eight VQA datasets with realistic +distortions, we find that nearly all datasets suffer from the easy dataset +problem of varying severity, some of which even admit blind image quality +assessment (BIQA) solutions. We additionally justify our claims by contrasting +our model generalizability on these VQA datasets, and by ablating a dizzying +set of BVQA design choices related to the basic building blocks. Our results +cast doubt on the current progress in BVQA, and meanwhile shed light on good +practices of constructing next-generation VQA datasets and models. + +
+
+
+
+
+ + ♻ ☆ A Robust Ensemble Algorithm for Ischemic Stroke Lesion Segmentation: + Generalizability and Clinical Utility Beyond the ISLES Challenge + + +
+ Diffusion-weighted MRI (DWI) is essential for stroke diagnosis, treatment +decisions, and prognosis. However, image and disease variability hinder the +development of generalizable AI algorithms with clinical value. We address this +gap by presenting a novel ensemble algorithm derived from the 2022 Ischemic +Stroke Lesion Segmentation (ISLES) challenge. ISLES'22 provided 400 patient +scans with ischemic stroke from various medical centers, facilitating the +development of a wide range of cutting-edge segmentation algorithms by the +research community. Through collaboration with leading teams, we combined +top-performing algorithms into an ensemble model that overcomes the limitations +of individual solutions. Our ensemble model achieved superior ischemic lesion +detection and segmentation accuracy on our internal test set compared to +individual algorithms. This accuracy generalized well across diverse image and +disease variables. Furthermore, the model excelled in extracting clinical +biomarkers. Notably, in a Turing-like test, neuroradiologists consistently +preferred the algorithm's segmentations over manual expert efforts, +highlighting increased comprehensiveness and precision. Validation using a +real-world external dataset (N=1686) confirmed the model's generalizability. +The algorithm's outputs also demonstrated strong correlations with clinical +scores (admission NIHSS and 90-day mRS) on par with or exceeding expert-derived +results, underlining its clinical relevance. This study offers two key +findings. First, we present an ensemble algorithm +(https://github.com/Tabrisrei/ISLES22_Ensemble) that detects and segments +ischemic stroke lesions on DWI across diverse scenarios on par with expert +(neuro)radiologists. Second, we show the potential for biomedical challenge +outputs to extend beyond the challenge's initial objectives, demonstrating +their real-world clinical applicability. + +
+
+
+
+
+ + ♻ ☆ Object-level Copy-Move Forgery Image Detection based on Inconsistency + Mining WWW 2024 + + +
+ In copy-move tampering operations, perpetrators often employ techniques, such
+as blurring, to conceal tampering traces, posing significant challenges to the
+detection of object-level targets with intact structures. Focusing on these
+challenges, this paper proposes an Object-level Copy-Move Forgery Image
+Detection method based on Inconsistency Mining (IMNet). To obtain complete
+object-level targets, we customize prototypes for both the source and tampered
+regions and dynamically update them. Additionally, we extract inconsistent
+regions between the coarse similar regions obtained through self-correlation
+calculations and the regions composed of prototypes. The detected inconsistent
+regions are used as supplements to the coarse similar regions to refine
+pixel-level detection. We conduct experiments on three public datasets, which
+validate the effectiveness and robustness of the proposed IMNet.
+
+
+
+ comment: 4 pages, 2 figures, Accepted to WWW 2024 +
+
+
+
+
+ + ♻ ☆ InfLoRA: Interference-Free Low-Rank Adaptation for Continual Learning CVPR 2024 + + +
+ Continual learning requires the model to learn multiple tasks sequentially. +In continual learning, the model should possess the ability to maintain its +performance on old tasks (stability) and the ability to adapt to new tasks +continuously (plasticity). Recently, parameter-efficient fine-tuning (PEFT), +which involves freezing a pre-trained model and injecting a small number of +learnable parameters to adapt to downstream tasks, has gained increasing +popularity in continual learning. Although existing continual learning methods +based on PEFT have demonstrated superior performance compared to those not +based on PEFT, most of them do not consider how to eliminate the interference +of the new task on the old tasks, which inhibits the model from making a good +trade-off between stability and plasticity. In this work, we propose a new PEFT +method, called interference-free low-rank adaptation (InfLoRA), for continual +learning. InfLoRA injects a small number of parameters to reparameterize the +pre-trained weights and shows that fine-tuning these injected parameters is +equivalent to fine-tuning the pre-trained weights within a subspace. +Furthermore, InfLoRA designs this subspace to eliminate the interference of the +new task on the old tasks, making a good trade-off between stability and +plasticity. Experimental results show that InfLoRA outperforms existing +state-of-the-art continual learning methods on multiple datasets. + +
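For context, a plain LoRA-style layer, which InfLoRA builds on, looks roughly as follows: the pre-trained weights stay frozen and only a low-rank update is trained. InfLoRA's interference-free subspace construction for continual learning is additional machinery not shown in this sketch.

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Frozen pre-trained linear layer plus a trainable low-rank update (plain LoRA)."""
    def __init__(self, pretrained: nn.Linear, rank: int = 4):
        super().__init__()
        self.base = pretrained
        for p in self.base.parameters():
            p.requires_grad = False                       # keep pre-trained weights fixed
        self.A = nn.Parameter(torch.randn(rank, pretrained.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(pretrained.out_features, rank))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + x @ self.A.T @ self.B.T     # W x + B A x

layer = LoRALinear(nn.Linear(768, 768), rank=4)
out = layer(torch.randn(2, 768))
trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)  # only A and B
```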
+
+ comment: Accepted by the 2024 IEEE/CVF Conference on Computer Vision and + Pattern Recognition (CVPR 2024) +
+
+
+
+
+ + ♻ ☆ RRWNet: Recursive Refinement Network for Effective Retinal Artery/Vein + Segmentation and Classification + + +
+ The caliber and configuration of retinal blood vessels serve as important +biomarkers for various diseases and medical conditions. A thorough analysis of +the retinal vasculature requires the segmentation of the blood vessels and +their classification into arteries and veins, typically performed on color +fundus images obtained by retinography. However, manually performing these +tasks is labor-intensive and prone to human error. While several automated +methods have been proposed to address this task, the current state of art faces +challenges due to manifest classification errors affecting the topological +consistency of segmentation maps. In this work, we introduce RRWNet, a novel +end-to-end deep learning framework that addresses this limitation. The +framework consists of a fully convolutional neural network that recursively +refines semantic segmentation maps, correcting manifest classification errors +and thus improving topological consistency. In particular, RRWNet is composed +of two specialized subnetworks: a Base subnetwork that generates base +segmentation maps from the input images, and a Recursive Refinement subnetwork +that iteratively and recursively improves these maps. Evaluation on three +different public datasets demonstrates the state-of-the-art performance of the +proposed method, yielding more topologically consistent segmentation maps with +fewer manifest classification errors than existing approaches. In addition, the +Recursive Refinement module within RRWNet proves effective in post-processing +segmentation maps from other methods, further demonstrating its potential. The +model code, weights, and predictions will be publicly available at +https://github.com/j-morano/rrwnet. + +
+
+
+
+
+ + ♻ ☆ ARS-DETR: Aspect Ratio Sensitive Oriented Object Detection with + Transformer + + +
+ Existing oriented object detection methods commonly use the metric AP$_{50}$
+to measure the performance of the model. We argue that AP$_{50}$ is inherently
+unsuitable for oriented object detection due to its large tolerance in angle
+deviation. Therefore, we advocate using a high-precision metric, e.g.
+AP$_{75}$, to measure the performance of models. In this paper, we propose an
+Aspect Ratio Sensitive Oriented Object Detector with Transformer, termed
+ARS-DETR, which exhibits competitive performance in high-precision oriented
+object detection. Specifically, a new angle classification method, called
+Aspect Ratio aware Circle Smooth Label (AR-CSL), is proposed to smooth the
+angle label in a more reasonable way and discard the hyperparameter introduced
+by previous work (e.g. CSL). Then, a rotated deformable attention module is
+designed to rotate the sampling points with the corresponding angles and
+eliminate the misalignment between region features and sampling points.
+Moreover, a dynamic weight coefficient according to the aspect ratio is
+adopted to calculate the angle loss. Comprehensive experiments on several
+challenging datasets show that our method achieves competitive performance on
+the high-precision oriented object detection task.
+
+
+
+ comment: 10 pages, 8 figures, 8 tables, the source code is available at + https://github.com/httle/ARS-DETR +
+
+
+
+
+ + ♻ ☆ Discriminative Sample-Guided and Parameter-Efficient Feature Space + Adaptation for Cross-Domain Few-Shot Learning + + +
+ In this paper, we look at cross-domain few-shot classification which presents +the challenging task of learning new classes in previously unseen domains with +few labelled examples. Existing methods, though somewhat effective, encounter +several limitations, which we alleviate through two significant improvements. +First, we introduce a lightweight parameter-efficient adaptation strategy to +address overfitting associated with fine-tuning a large number of parameters on +small datasets. This strategy employs a linear transformation of pre-trained +features, significantly reducing the trainable parameter count. Second, we +replace the traditional nearest centroid classifier with a discriminative +sample-aware loss function, enhancing the model's sensitivity to the inter- and +intra-class variances within the training set for improved clustering in +feature space. Empirical evaluations on the Meta-Dataset benchmark showcase +that our approach not only improves accuracy up to 7.7\% and 5.3\% on +previously seen and unseen datasets, respectively, but also achieves the above +performance while being at least $\sim3\times$ more parameter-efficient than +existing methods, establishing a new state-of-the-art in cross-domain few-shot +learning. Our code is available at https://github.com/rashindrie/DIPA. + +
+
+ comment: Code is available at this link: https://github.com/rashindrie/DIPA +
+
+
+
+
+ + ♻ ☆ MotionChain: Conversational Motion Controllers via Multimodal Prompts + + +
+ Recent advancements in language models have demonstrated their adeptness in +conducting multi-turn dialogues and retaining conversational context. However, +this proficiency remains largely unexplored in other multimodal generative +models, particularly in human motion models. By integrating multi-turn +conversations in controlling continuous virtual human movements, generative +human motion models can achieve an intuitive and step-by-step process of human +task execution for humanoid robotics, game agents, or other embodied systems. +In this work, we present MotionChain, a conversational human motion controller +to generate continuous and long-term human motion through multimodal prompts. +Specifically, MotionChain consists of multi-modal tokenizers that transform +various data types such as text, image, and motion, into discrete tokens, +coupled with a Vision-Motion-aware Language model. By leveraging large-scale +language, vision-language, and vision-motion data to assist motion-related +generation tasks, MotionChain thus comprehends each instruction in multi-turn +conversation and generates human motions followed by these prompts. Extensive +experiments validate the efficacy of MotionChain, demonstrating +state-of-the-art performance in conversational motion generation, as well as +more intuitive manners of controlling and interacting with virtual humans. + +
+
+ comment: 14 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ The Audio-Visual Conversational Graph: From an Egocentric-Exocentric + Perspective + + +
+ In recent years, the thriving development of research related to egocentric +videos has provided a unique perspective for the study of conversational +interactions, where both visual and audio signals play a crucial role. While +most prior work focus on learning about behaviors that directly involve the +camera wearer, we introduce the Ego-Exocentric Conversational Graph Prediction +problem, marking the first attempt to infer exocentric conversational +interactions from egocentric videos. We propose a unified multi-modal framework +-- Audio-Visual Conversational Attention (AV-CONV), for the joint prediction of +conversation behaviors -- speaking and listening -- for both the camera wearer +as well as all other social partners present in the egocentric video. +Specifically, we adopt the self-attention mechanism to model the +representations across-time, across-subjects, and across-modalities. To +validate our method, we conduct experiments on a challenging egocentric video +dataset that includes multi-speaker and multi-conversation scenarios. Our +results demonstrate the superior performance of our method compared to a series +of baselines. We also present detailed ablation studies to assess the +contribution of each component in our model. Check our project page at +https://vjwq.github.io/AV-CONV/. + +
+
+
+
+
+ + ♻ ☆ LPSNet: End-to-End Human Pose and Shape Estimation with Lensless Imaging CVPR 2024 + + +
+ Human pose and shape (HPS) estimation with lensless imaging is not only
+beneficial to privacy protection but can also be used in covert surveillance
+scenarios due to the small size and simple structure of this device. However,
+this task presents significant challenges due to the inherent ambiguity of the
+captured measurements and the lack of effective methods for directly estimating
+human pose and shape from lensless data. In this paper, we propose, to our
+knowledge, the first end-to-end framework to recover 3D human poses and shapes
+from lensless measurements. We specifically design a multi-scale lensless
+feature decoder to decode the lensless measurements through the optically
+encoded mask for efficient feature extraction. We also propose a double-head
+auxiliary supervision mechanism to improve the estimation accuracy of human
+limb ends. Besides, we establish a lensless imaging system and verify the
+effectiveness of our method on various datasets acquired by our lensless
+imaging system.
+
+
+ comment: Accepted to CVPR 2024. More results available at + https://cic.tju.edu.cn/faculty/likun/projects/LPSNet +
+
+
+
+
+ + ♻ ☆ Semantic Human Mesh Reconstruction with Textures CVPR 2024 + + +
+ The field of 3D detailed human mesh reconstruction has made significant +progress in recent years. However, current methods still face challenges when +used in industrial applications due to unstable results, low-quality meshes, +and a lack of UV unwrapping and skinning weights. In this paper, we present +SHERT, a novel pipeline that can reconstruct semantic human meshes with +textures and high-precision details. SHERT applies semantic- and normal-based +sampling between the detailed surface (e.g. mesh and SDF) and the corresponding +SMPL-X model to obtain a partially sampled semantic mesh and then generates the +complete semantic mesh by our specifically designed self-supervised completion +and refinement networks. Using the complete semantic mesh as a basis, we employ +a texture diffusion model to create human textures that are driven by both +images and texts. Our reconstructed meshes have stable UV unwrapping, +high-quality triangle meshes, and consistent semantic information. The given +SMPL-X model provides semantic information and shape priors, allowing SHERT to +perform well even with incorrect and incomplete inputs. The semantic +information also makes it easy to substitute and animate different body parts +such as the face, body, and hands. Quantitative and qualitative experiments +demonstrate that SHERT is capable of producing high-fidelity and robust +semantic meshes that outperform state-of-the-art methods. + +
+
+ comment: Accepted to CVPR 2024. Project page: + https://zhanxy.xyz/projects/shert/ +
+
+
+
+
+ + ♻ ☆ Optimizing Diffusion Noise Can Serve As Universal Motion Priors CVPR 2024 + + +
+ We propose Diffusion Noise Optimization (DNO), a new method that effectively +leverages existing motion diffusion models as motion priors for a wide range of +motion-related tasks. Instead of training a task-specific diffusion model for +each new task, DNO operates by optimizing the diffusion latent noise of an +existing pre-trained text-to-motion model. Given the corresponding latent noise +of a human motion, it propagates the gradient from the target criteria defined +on the motion space through the whole denoising process to update the diffusion +latent noise. As a result, DNO supports any use cases where criteria can be +defined as a function of motion. In particular, we show that, for motion +editing and control, DNO outperforms existing methods in both achieving the +objective and preserving the motion content. DNO accommodates a diverse range +of editing modes, including changing trajectory, pose, joint locations, or +avoiding newly added obstacles. In addition, DNO is effective in motion +denoising and completion, producing smooth and realistic motion from noisy and +partial inputs. DNO achieves these results at inference time without the need +for model retraining, offering great versatility for any defined reward or loss +function on the motion representation. + +
+
+ comment: CVPR 2024. Project page: https://korrawe.github.io/dno-project/ +
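+<p>
+  A minimal sketch of the noise-optimization loop described above, assuming a pre-trained
+  text-to-motion denoiser exposed as a differentiable function; the names, optimizer, and
+  criterion below are illustrative stand-ins, not the authors' released API.
+</p>
+<pre><code class="language-python">
+import torch
+
+def dno_optimize(denoise, z_init, text, criterion, steps=300, lr=5e-2):
+    """Optimize diffusion latent noise so the decoded motion minimizes a
+    differentiable criterion defined directly on the motion."""
+    z = z_init.clone().requires_grad_(True)   # latent noise being optimized
+    opt = torch.optim.Adam([z], lr=lr)
+    for _ in range(steps):
+        motion = denoise(z, text)             # full denoising chain, kept differentiable
+        loss = criterion(motion)              # e.g. distance of a joint to a target location
+        opt.zero_grad()
+        loss.backward()                       # gradient flows through all denoising steps
+        opt.step()
+    return denoise(z, text).detach()
+</code></pre>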
+
+
+
+
+ + ♻ ☆ Analytic-Splatting: Anti-Aliased 3D Gaussian Splatting via Analytic + Integration + + +
+ 3D Gaussian Splatting (3DGS) has recently gained popularity by combining the
+advantages of both primitive-based and volumetric 3D representations, resulting
+in improved quality and efficiency for 3D scene rendering. However, 3DGS is not
+alias-free, and its rendering at varying resolutions could produce severe
+blurring or jaggies. This is because 3DGS treats each pixel as an isolated,
+single point rather than as an area, causing insensitivity to changes in the
+footprints of pixels. Consequently, this discrete sampling scheme inevitably
+results in aliasing, owing to the restricted sampling bandwidth. In this paper,
+we derive an analytical solution to address this issue. More specifically, we
+use a conditioned logistic function as the analytic approximation of the
+cumulative distribution function (CDF) of a one-dimensional Gaussian signal and
+calculate the Gaussian integral by subtracting the CDFs. We then introduce this
+approximation into the two-dimensional pixel shading and present
+Analytic-Splatting, which analytically approximates the Gaussian integral
+within the 2D pixel window area to better capture the intensity response of
+each pixel. Moreover, we use the approximated response of the pixel window
+integral area in the transmittance calculation of volume rendering, making
+Analytic-Splatting sensitive to changes in pixel footprint at different
+resolutions. Experiments on various datasets validate that our approach has
+better anti-aliasing capability, yielding more detail and better fidelity.
+
+
+ comment: 29 pages +
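+<p>
+  The one-dimensional core of the approximation is easy to write down: replace the Gaussian
+  CDF with a logistic sigmoid and integrate over a pixel window by subtracting CDF values.
+  The sketch below is a generic illustration; the scale constant 1.702 is a common logistic
+  approximation of the standard normal CDF, and the paper's conditioned 2D shading is omitted.
+</p>
+<pre><code class="language-python">
+import torch
+
+def approx_gauss_cdf(x, mu, sigma, k=1.702):
+    # Logistic approximation of the Gaussian CDF: Phi(z) is roughly sigmoid(k * z)
+    return torch.sigmoid(k * (x - mu) / sigma)
+
+def pixel_window_integral(mu, sigma, lo, hi):
+    # Integral of a 1D Gaussian over [lo, hi] via a difference of approximate CDFs
+    return approx_gauss_cdf(hi, mu, sigma) - approx_gauss_cdf(lo, mu, sigma)
+</code></pre>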
+
+
+
+
+ + ♻ ☆ MLLMReID: Multimodal Large Language Model-based Person Re-identification + + +
+ Multimodal large language models (MLLMs) have achieved satisfactory results in
+many tasks. However, their performance on the task of person re-identification
+(ReID) has not been explored to date. This paper investigates how to adapt them
+for ReID. An intuitive idea is to fine-tune an MLLM with ReID image-text
+datasets and then use its visual encoder as a backbone for ReID. However, two
+apparent issues remain: (1) When designing instructions for ReID, MLLMs may
+overfit specific instructions, while designing a wide variety of instructions
+leads to higher costs. (2) The latent image feature vectors produced by the LLM
+are not involved in the loss computation. Instruction learning only aligns
+image and text features, which results in indirect optimization and a learning
+objective that under-utilizes these features, limiting effectiveness in person
+feature learning. To address these problems, this paper proposes MLLMReID:
+Multimodal Large Language Model-based ReID. First, we propose Common
+Instruction, a simple approach that leverages the inherent continuation ability
+of LLMs, avoiding complex and diverse instruction design. Second, we propose
+DirectReID, which effectively employs the latent image feature vectors output
+by the LLM in ReID tasks. The experimental results demonstrate the superiority
+of our method. We will open-source the code on GitHub.
+
+
+
+
+
+ + ♻ ☆ 3D Open-Vocabulary Panoptic Segmentation with 2D-3D Vision-Language + Distillation + + +
+ 3D panoptic segmentation is a challenging perception task, especially in +autonomous driving. It aims to predict both semantic and instance annotations +for 3D points in a scene. Although prior 3D panoptic segmentation approaches +have achieved great performance on closed-set benchmarks, generalizing these +approaches to unseen things and unseen stuff categories remains an open +problem. For unseen object categories, 2D open-vocabulary segmentation has +achieved promising results that solely rely on frozen CLIP backbones and +ensembling multiple classification outputs. However, we find that simply +extending these 2D models to 3D does not guarantee good performance due to poor +per-mask classification quality, especially for novel stuff categories. In this +paper, we propose the first method to tackle 3D open-vocabulary panoptic +segmentation. Our model takes advantage of the fusion between learnable LiDAR +features and dense frozen vision CLIP features, using a single classification +head to make predictions for both base and novel classes. To further improve +the classification performance on novel classes and leverage the CLIP model, we +propose two novel loss functions: object-level distillation loss and +voxel-level distillation loss. Our experiments on the nuScenes and +SemanticKITTI datasets show that our method outperforms the strong baseline by +a large margin. + +
+
+
+
+
+ + ♻ ☆ On-Device Training Under 256KB Memory NeurIPS 2022 + + +
+ On-device training enables the model to adapt to new data collected from the +sensors by fine-tuning a pre-trained model. Users can benefit from customized +AI models without having to transfer the data to the cloud, protecting the +privacy. However, the training memory consumption is prohibitive for IoT +devices that have tiny memory resources. We propose an algorithm-system +co-design framework to make on-device training possible with only 256KB of +memory. On-device training faces two unique challenges: (1) the quantized +graphs of neural networks are hard to optimize due to low bit-precision and the +lack of normalization; (2) the limited hardware resource does not allow full +back-propagation. To cope with the optimization difficulty, we propose +Quantization-Aware Scaling to calibrate the gradient scales and stabilize 8-bit +quantized training. To reduce the memory footprint, we propose Sparse Update to +skip the gradient computation of less important layers and sub-tensors. The +algorithm innovation is implemented by a lightweight training system, Tiny +Training Engine, which prunes the backward computation graph to support sparse +updates and offload the runtime auto-differentiation to compile time. Our +framework is the first solution to enable tiny on-device training of +convolutional neural networks under 256KB SRAM and 1MB Flash without auxiliary +memory, using less than 1/1000 of the memory of PyTorch and TensorFlow while +matching the accuracy on tinyML application VWW. Our study enables IoT devices +not only to perform inference but also to continuously adapt to new data for +on-device lifelong learning. A video demo can be found here: +https://youtu.be/0pUFZYdoMY8. + +
+
+ comment: NeurIPS 2022 +
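+<p>
+  As a rough illustration of the sparse-update idea (skipping gradients for less important
+  layers), one can freeze everything and re-enable training only for a selected subset of
+  layers. The selection below is arbitrary; the paper derives it from an importance analysis
+  and realizes it inside the Tiny Training Engine rather than through requires_grad flags.
+</p>
+<pre><code class="language-python">
+import torch.nn as nn
+
+def apply_sparse_update(model: nn.Module, trainable_layer_names):
+    """Freeze all parameters, then re-enable gradients only for the chosen layers."""
+    for p in model.parameters():
+        p.requires_grad = False
+    for name, module in model.named_modules():
+        if name in trainable_layer_names:       # stand-in for the importance-based selection
+            for p in module.parameters():
+                p.requires_grad = True
+</code></pre>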
+
+
+
+
+ + ♻ ☆ Burst Super-Resolution with Diffusion Models for Improving Perceptual + Quality IJCNN 2024 + + +
+ While burst LR images are useful for improving the SR image quality compared +with a single LR image, prior SR networks accepting the burst LR images are +trained in a deterministic manner, which is known to produce a blurry SR image. +In addition, it is difficult to perfectly align the burst LR images, making the +SR image more blurry. Since such blurry images are perceptually degraded, we +aim to reconstruct the sharp high-fidelity boundaries. Such high-fidelity +images can be reconstructed by diffusion models. However, prior SR methods +using the diffusion model are not properly optimized for the burst SR task. +Specifically, the reverse process starting from a random sample is not +optimized for image enhancement and restoration methods, including burst SR. In +our proposed method, on the other hand, burst LR features are used to +reconstruct the initial burst SR image that is fed into an intermediate step in +the diffusion model. This reverse process from the intermediate step 1) skips +diffusion steps for reconstructing the global structure of the image and 2) +focuses on steps for refining detailed textures. Our experimental results +demonstrate that our method can improve the scores of the perceptual quality +metrics. Code: https://github.com/placerkyo/BSRD + +
+
+ comment: Accepted to IJCNN 2024 (International Joint Conference on Neural + Networks) +
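+<p>
+  The key step, starting the reverse diffusion from an intermediate timestep seeded by an
+  initial burst-SR estimate instead of pure noise, can be sketched in DDPM notation as
+  follows; the sampler interface, timestep, and variable names are assumptions for
+  illustration only.
+</p>
+<pre><code class="language-python">
+import torch
+
+def start_from_intermediate(x0_est, alphas_cumprod, t_start, sampler, cond):
+    """Noise an initial SR estimate to timestep t_start, then run only the
+    remaining reverse steps to refine fine textures."""
+    a_bar = alphas_cumprod[t_start]
+    noise = torch.randn_like(x0_est)
+    x_t = a_bar.sqrt() * x0_est + (1.0 - a_bar).sqrt() * noise  # q(x_t | x_0)
+    return sampler(x_t, t_start, cond)                          # reverse steps t_start..0 only
+</code></pre>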
+
+
+
+
+ + ♻ ☆ MI-NeRF: Learning a Single Face NeRF from Multiple Identities + + +
+ In this work, we introduce a method that learns a single dynamic neural +radiance field (NeRF) from monocular talking face videos of multiple +identities. NeRFs have shown remarkable results in modeling the 4D dynamics and +appearance of human faces. However, they require per-identity optimization. +Although recent approaches have proposed techniques to reduce the training and +rendering time, increasing the number of identities can be expensive. We +introduce MI-NeRF (multi-identity NeRF), a single unified network that models +complex non-rigid facial motion for multiple identities, using only monocular +videos of arbitrary length. The core premise in our method is to learn the +non-linear interactions between identity and non-identity specific information +with a multiplicative module. By training on multiple videos simultaneously, +MI-NeRF not only reduces the total training time compared to standard +single-identity NeRFs, but also demonstrates robustness in synthesizing novel +expressions for any input identity. We present results for both facial +expression transfer and talking face video synthesis. Our method can be further +personalized for a target identity given only a short video. + +
+
+ comment: Project page: https://aggelinacha.github.io/MI-NeRF/ +
+
+
+
+
+ + ♻ ☆ HAC: Hash-grid Assisted Context for 3D Gaussian Splatting Compression + + +
+ 3D Gaussian Splatting (3DGS) has emerged as a promising framework for novel
+view synthesis, boasting rapid rendering speed with high fidelity. However, the
+substantial Gaussians and their associated attributes necessitate effective
+compression techniques. Nevertheless, the sparse and unorganized nature of the
+point cloud of Gaussians (or anchors in our paper) presents challenges for
+compression. To address this, we make use of the relations between the
+unorganized anchors and the structured hash grid, leveraging their mutual
+information for context modeling, and propose a Hash-grid Assisted Context
+(HAC) framework for highly compact 3DGS representation. Our approach introduces
+a binary hash grid to establish continuous spatial consistencies, allowing us
+to unveil the inherent spatial relations of anchors through a carefully
+designed context model. To facilitate entropy coding, we utilize Gaussian
+distributions to accurately estimate the probability of each quantized
+attribute, where an adaptive quantization module is proposed to enable
+high-precision quantization of these attributes for improved fidelity
+restoration. Additionally, we incorporate an adaptive masking strategy to
+eliminate invalid Gaussians and anchors. Importantly, our work is the first to
+explore context-based compression for the 3DGS representation, achieving a
+remarkable size reduction of over $75\times$ compared to vanilla 3DGS while
+simultaneously improving fidelity, and achieving over $11\times$ size reduction
+over the SOTA 3DGS compression approach Scaffold-GS. Our code is available here:
+https://github.com/YihangChen-ee/HAC
+
+
+ comment: Project Page: https://yihangchen-ee.github.io/project_hac/ Code: + https://github.com/YihangChen-ee/HAC +
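+<p>
+  The Gaussian entropy model mentioned above follows the usual learned-compression recipe:
+  the probability mass of a quantized value is a difference of Gaussian CDFs over its
+  quantization bin, and its estimated bit cost is the negative log of that mass. A generic
+  sketch (the hash-grid context model that would predict mu and sigma is omitted):
+</p>
+<pre><code class="language-python">
+import torch
+
+def estimated_bits(q, mu, sigma, bin_size=1.0):
+    """Bit cost of quantized attributes under a Gaussian entropy model."""
+    normal = torch.distributions.Normal(mu, sigma)
+    p = normal.cdf(q + bin_size / 2) - normal.cdf(q - bin_size / 2)
+    return -torch.log2(p.clamp_min(1e-9))       # clamp avoids log of zero
+</code></pre>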
+
+
+
+
+ + ♻ ☆ Causal Intervention for Subject-Deconfounded Facial Action Unit + Recognition AAAI2022 + + +
+ Subject-invariant facial action unit (AU) recognition remains challenging
+because the data distribution varies among subjects. In this paper, we propose
+a causal inference framework for subject-invariant facial action unit
+recognition. To illustrate the causal effect existing in the AU recognition
+task, we formulate the causalities among facial images, subjects, latent AU
+semantic relations, and estimated AU occurrence probabilities via a structural
+causal model. By constructing such a causal diagram, we clarify the causal
+effect among variables and propose a plug-in causal intervention module, CIS,
+to deconfound the confounder \emph{Subject} in the causal diagram. Extensive
+experiments conducted on two commonly used AU benchmark datasets, BP4D and
+DISFA, show the effectiveness of our CIS, and the model with CIS inserted,
+CISNet, has achieved state-of-the-art performance.
+
+
+ comment: Accepted by AAAI2022 +
+
+
+
+
+ + ♻ ☆ Improved Zero-Shot Classification by Adapting VLMs with Text + Descriptions + + +
+ The zero-shot performance of existing vision-language models (VLMs) such as +CLIP is limited by the availability of large-scale, aligned image and text +datasets in specific domains. In this work, we leverage two complementary +sources of information -- descriptions of categories generated by large +language models (LLMs) and abundant, fine-grained image classification datasets +-- to improve the zero-shot classification performance of VLMs across +fine-grained domains. On the technical side, we develop methods to train VLMs +with this "bag-level" image-text supervision. We find that simply using these +attributes at test-time does not improve performance, but our training +strategy, for example, on the iNaturalist dataset, leads to an average +improvement of 4-5% in zero-shot classification accuracy for novel categories +of birds and flowers. Similar improvements are observed in domains where a +subset of the categories was used to fine-tune the model. By prompting LLMs in +various ways, we generate descriptions that capture visual appearance, habitat, +and geographic regions and pair them with existing attributes such as the +taxonomic structure of the categories. We systematically evaluate their ability +to improve zero-shot categorization in natural domains. Our findings suggest +that geographic priors can be just as effective and are complementary to visual +appearance. Our method also outperforms prior work on prompt-based tuning of +VLMs. We release the benchmark, consisting of 14 datasets at +https://github.com/cvl-umass/AdaptCLIPZS , which will contribute to future +research in zero-shot recognition. + +
+
+
+
+
+ + ♻ ☆ Cooperative Students: Navigating Unsupervised Domain Adaptation in + Nighttime Object Detection + + +
+ Unsupervised Domain Adaptation (UDA) has shown significant advancements in
+object detection under well-lit conditions; however, its performance degrades
+notably in low-visibility scenarios, especially at night, posing challenges not
+only for its adaptability in low signal-to-noise ratio (SNR) conditions but
+also for the reliability and efficiency of automated vehicles. To address this
+problem, we propose a \textbf{Co}operative \textbf{S}tudents (\textbf{CoS})
+framework that innovatively employs global-local transformations (GLT) and a
+proxy-based target consistency (PTC) mechanism to capture the spatial
+consistency in day- and night-time scenarios effectively, and thus bridge the
+significant domain shift across contexts. Building upon this, we further devise
+an adaptive IoU-informed thresholding (AIT) module to gradually avoid
+overlooking potential true positives and enrich the latent information in the
+target domain. Comprehensive experiments show that CoS substantially enhances
+UDA performance in low-visibility conditions and surpasses current
+state-of-the-art techniques, achieving an increase in mAP of 3.0\%, 1.9\%, and
+2.5\% on the BDD100K, SHIFT, and ACDC datasets, respectively. Code is available
+at https://github.com/jichengyuan/Cooperitive_Students.
+
+
+ comment: Code is available at + https://github.com/jichengyuan/Cooperitive_Students +
+
+
+
+
+ + ♻ ☆ Mirasol3B: A Multimodal Autoregressive model for time-aligned and + contextual modalities CVPR 2024 + + +
+ One of the main challenges of multimodal learning is the need to combine
+heterogeneous modalities (e.g., video, audio, text). For example, video and
+audio are obtained at much higher rates than text and are roughly aligned in
+time. They are often not synchronized with text, which comes as a global
+context, e.g., a title or a description. Furthermore, video and audio inputs
+are of much larger volumes and grow as the video length increases, which
+naturally requires more compute dedicated to these modalities and makes
+modeling of long-range dependencies harder.
+ We here decouple the multimodal modeling, dividing it into separate, focused
+autoregressive models, processing the inputs according to the characteristics
+of the modalities. We propose a multimodal model, called Mirasol3B, consisting
+of an autoregressive component for the time-synchronized modalities (audio and
+video), and an autoregressive component for the context modalities, which are
+not necessarily aligned in time but are still sequential. To address the long
+sequences of the video-audio inputs, we propose to further partition the video
+and audio sequences into consecutive snippets and autoregressively process
+their representations. To that end, we propose a Combiner mechanism, which
+models the audio-video information jointly within a timeframe. The Combiner
+learns to extract audio and video features from raw spatio-temporal signals,
+and then learns to fuse these features, producing compact but expressive
+representations per snippet.
+ Our approach achieves the state-of-the-art on well-established multimodal
+benchmarks, outperforming much larger models. It effectively addresses the high
+computational demand of media inputs by learning compact representations,
+controlling the sequence length of the audio-video feature representations, and
+modeling their dependencies in time.
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ ViTamin: Designing Scalable Vision Models in the Vision-Language Era CVPR 2024 + + +
+ Recent breakthroughs in vision-language models (VLMs) start a new page in the
+vision community. The VLMs provide stronger and more generalizable feature
+embeddings compared to those from ImageNet-pretrained models, thanks to the
+training on large-scale Internet image-text pairs. However, despite the amazing
+achievements of the VLMs, vanilla Vision Transformers (ViTs) remain the default
+choice for the image encoder. Although the pure transformer has proven
+effective for text encoding, it remains questionable whether it is also the
+case for image encoding, especially considering that various types of networks
+have been proposed on the ImageNet benchmark, which, unfortunately, are rarely
+studied in VLMs. Due to the small data/model scale, the original conclusions of
+model design on ImageNet can be limited and biased. In this paper, we aim at
+building an evaluation protocol of vision models in the vision-language era
+under the contrastive language-image pretraining (CLIP) framework. We provide a
+comprehensive way to benchmark different vision models, covering their
+zero-shot performance and scalability in both model and training data sizes. To
+this end, we introduce ViTamin, a new vision model tailored for VLMs. ViTamin-L
+significantly outperforms ViT-L by 2.0% ImageNet zero-shot accuracy, when using
+the same publicly available DataComp-1B dataset and the same OpenCLIP training
+scheme. ViTamin-L presents promising results on 60 diverse benchmarks,
+including classification, retrieval, open-vocabulary detection and
+segmentation, and large multi-modal models. When further scaling up the model
+size, our ViTamin-XL with only 436M parameters attains 82.9% ImageNet zero-shot
+accuracy, surpassing the 82.0% achieved by EVA-E, which has ten times more
+parameters (4.4B).
+
+
+ comment: CVPR 2024; https://github.com/Beckschen/ViTamin +
+
+
+
+
+ + ♻ ☆ Generating Images with 3D Annotations Using Diffusion Models ICLR 2024 + + +
+ Diffusion models have emerged as a powerful generative method, capable of +producing stunning photo-realistic images from natural language descriptions. +However, these models lack explicit control over the 3D structure in the +generated images. Consequently, this hinders our ability to obtain detailed 3D +annotations for the generated images or to craft instances with specific poses +and distances. In this paper, we propose 3D Diffusion Style Transfer (3D-DST), +which incorporates 3D geometry control into diffusion models. Our method +exploits ControlNet, which extends diffusion models by using visual prompts in +addition to text prompts. We generate images of the 3D objects taken from 3D +shape repositories (e.g., ShapeNet and Objaverse), render them from a variety +of poses and viewing directions, compute the edge maps of the rendered images, +and use these edge maps as visual prompts to generate realistic images. With +explicit 3D geometry control, we can easily change the 3D structures of the +objects in the generated images and obtain ground-truth 3D annotations +automatically. This allows us to improve a wide range of vision tasks, e.g., +classification and 3D pose estimation, in both in-distribution (ID) and +out-of-distribution (OOD) settings. We demonstrate the effectiveness of our +method through extensive experiments on ImageNet-100/200, ImageNet-R, +PASCAL3D+, ObjectNet3D, and OOD-CV. The results show that our method +significantly outperforms existing methods, e.g., 3.8 percentage points on +ImageNet-100 using DeiT-B. + +
+
+ comment: ICLR 2024 Spotlight. Code: https://ccvl.jhu.edu/3D-DST/ +
+
+
+
+
+ + ♻ ☆ Effective Adapter for Face Recognition in the Wild + + +
+ In this paper, we tackle the challenge of face recognition in the wild, where
+images often suffer from low quality and real-world distortions. Traditional
+heuristic approaches, either training models directly on these degraded images
+or on their enhanced counterparts using face restoration techniques, have
+proven ineffective, primarily due to the degradation of facial features and the
+discrepancy in image domains. To overcome these issues, we propose an effective
+adapter for augmenting existing face recognition models trained on high-quality
+facial datasets. The key idea of our adapter is to process both the unrefined
+and the enhanced images using two similar structures, one fixed and the other
+trainable. Such a design confers two benefits. First, the dual-input system
+minimizes the domain gap while providing varied perspectives for the face
+recognition model, where the enhanced image can be regarded as a complex
+non-linear transformation of the original one by the restoration model. Second,
+both similar structures can be initialized by pre-trained models without
+dropping past knowledge. Extensive experiments in zero-shot settings show the
+effectiveness of our method, which surpasses baselines by about 3%, 4%, and 7%
+on three datasets. Our code will be publicly available.
+
+
+
+
+
+ + ♻ ☆ Hybrid Video Diffusion Models with 2D Triplane and 3D Wavelet + Representation + + +
+ Generating high-quality videos that synthesize desired realistic content is a
+challenging task due to their intricate high-dimensionality and complexity.
+Several recent diffusion-based methods have shown comparable performance by
+compressing videos to a lower-dimensional latent space, using a traditional
+video autoencoder architecture. However, such methods, which employ standard
+frame-wise 2D or 3D convolutions, fail to fully exploit the spatio-temporal
+nature of videos. To address this issue, we propose a novel hybrid video
+diffusion model, called HVDM, which can capture spatio-temporal dependencies
+more effectively. HVDM is trained with a hybrid video autoencoder that extracts
+a disentangled representation of the video, including: (i) global context
+information captured by a 2D projected latent, (ii) local volume information
+captured by 3D convolutions with wavelet decomposition, and (iii) frequency
+information for improving video reconstruction. Based on this disentangled
+representation, our hybrid autoencoder provides a more comprehensive video
+latent, enriching the generated videos with fine structures and details.
+Experiments on video generation benchmarks (UCF101, SkyTimelapse, and TaiChi)
+demonstrate that the proposed approach achieves state-of-the-art video
+generation quality, supporting a wide range of video applications (e.g., long
+video generation, image-to-video, and video dynamics control).
+
+
+ comment: Project page is available at https://hxngiee.github.io/HVDM/ +
+
+
+
+
+ + ♻ ☆ MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning + + +
+ Tiny deep learning on microcontroller units (MCUs) is challenging due to the +limited memory size. We find that the memory bottleneck is due to the +imbalanced memory distribution in convolutional neural network (CNN) designs: +the first several blocks have an order of magnitude larger memory usage than +the rest of the network. To alleviate this issue, we propose a generic +patch-by-patch inference scheduling, which operates only on a small spatial +region of the feature map and significantly cuts down the peak memory. However, +naive implementation brings overlapping patches and computation overhead. We +further propose network redistribution to shift the receptive field and FLOPs +to the later stage and reduce the computation overhead. Manually redistributing +the receptive field is difficult. We automate the process with neural +architecture search to jointly optimize the neural architecture and inference +scheduling, leading to MCUNetV2. Patch-based inference effectively reduces the +peak memory usage of existing networks by 4-8x. Co-designed with neural +networks, MCUNetV2 sets a record ImageNet accuracy on MCU (71.8%), and achieves +>90% accuracy on the visual wake words dataset under only 32kB SRAM. MCUNetV2 +also unblocks object detection on tiny devices, achieving 16.9% higher mAP on +Pascal VOC compared to the state-of-the-art result. Our study largely addressed +the memory bottleneck in tinyML and paved the way for various vision +applications beyond image classification. + +
+
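+<p>
+  A toy sketch of the patch-by-patch inference idea: run the memory-heavy early blocks on
+  spatial tiles, stitch the partial feature maps, and run the remaining network on the full
+  (already much smaller) map. Overlap and halo handling, which the paper tackles via
+  receptive-field redistribution and architecture search, are ignored here for brevity.
+</p>
+<pre><code class="language-python">
+import torch
+
+def patch_based_forward(stage1, stage2, x, patches=2):
+    """stage1: early, memory-heavy blocks; stage2: the rest of the network."""
+    _, _, H, W = x.shape
+    ph, pw = H // patches, W // patches
+    rows = []
+    for i in range(patches):
+        cols = []
+        for j in range(patches):
+            tile = x[:, :, i * ph:(i + 1) * ph, j * pw:(j + 1) * pw]
+            cols.append(stage1(tile))           # peak memory scales with the tile, not the image
+        rows.append(torch.cat(cols, dim=3))
+    feat = torch.cat(rows, dim=2)
+    return stage2(feat)
+</code></pre>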
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 235 + +
+
+
+ + ☆ Segment Any 3D Object with Language + + +
+ In this paper, we investigate Open-Vocabulary 3D Instance Segmentation +(OV-3DIS) with free-form language instructions. Earlier works that rely on only +annotated base categories for training suffer from limited generalization to +unseen novel categories. Recent works mitigate poor generalizability to novel +categories by generating class-agnostic masks or projecting generalized masks +from 2D to 3D, but disregard semantic or geometry information, leading to +sub-optimal performance. Instead, generating generalizable but semantic-related +masks directly from 3D point clouds would result in superior outcomes. In this +paper, we introduce Segment any 3D Object with LanguagE (SOLE), which is a +semantic and geometric-aware visual-language learning framework with strong +generalizability by generating semantic-related masks directly from 3D point +clouds. Specifically, we propose a multimodal fusion network to incorporate +multimodal semantics in both backbone and decoder. In addition, to align the 3D +segmentation model with various language instructions and enhance the mask +quality, we introduce three types of multimodal associations as supervision. +Our SOLE outperforms previous methods by a large margin on ScanNetv2, +ScanNet200, and Replica benchmarks, and the results are even close to the +fully-supervised counterpart despite the absence of class annotations in the +training. Furthermore, extensive qualitative results demonstrate the +versatility of our SOLE to language instructions. + +
+
+ comment: Project Page: https://cvrp-sole.github.io +
+
+
+
+
+ + ☆ Alpha Invariance: On Inverse Scaling Between Distance and Volume Density + in Neural Radiance Fields CVPR 2024 + + +
+ Scale-ambiguity in 3D scene dimensions leads to magnitude-ambiguity of +volumetric densities in neural radiance fields, i.e., the densities double when +scene size is halved, and vice versa. We call this property alpha invariance. +For NeRFs to better maintain alpha invariance, we recommend 1) parameterizing +both distance and volume densities in log space, and 2) a +discretization-agnostic initialization strategy to guarantee high ray +transmittance. We revisit a few popular radiance field models and find that +these systems use various heuristics to deal with issues arising from scene +scaling. We test their behaviors and show our recipe to be more robust. + +
+
+ comment: CVPR 2024. project page https://pals.ttic.edu/p/alpha-invariance +
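+<p>
+  The log-space recommendation amounts to turning the density-distance product into a sum,
+  so a global rescaling of the scene shifts one term and can be absorbed by the other. A
+  minimal sketch of the opacity computation under that parameterization (names are
+  illustrative):
+</p>
+<pre><code class="language-python">
+import torch
+
+def alpha_from_logs(log_sigma, log_delta):
+    """alpha = 1 - exp(-sigma * delta), with both factors kept in log space so that
+    sigma * delta = exp(log_sigma + log_delta)."""
+    return 1.0 - torch.exp(-torch.exp(log_sigma + log_delta))
+</code></pre>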
+
+
+
+
+ + ☆ Dynamic Pre-training: Towards Efficient and Scalable All-in-One Image + Restoration + + +
+ All-in-one image restoration tackles different types of degradations with a
+unified model instead of having task-specific, non-generic models for each
+degradation. The requirement to tackle multiple degradations using the same
+model can lead to high-complexity designs with fixed configurations that lack
+the adaptability to more efficient alternatives. We propose DyNet, a dynamic
+family of networks designed in an encoder-decoder style for all-in-one image
+restoration tasks. Our DyNet can seamlessly switch between its bulkier and
+lightweight variants, thereby offering flexibility for efficient model
+deployment with a single round of training. This seamless switching is enabled
+by our weights-sharing mechanism, forming the core of our architecture and
+facilitating the reuse of initialized module weights. Further, to establish
+robust weights initialization, we introduce a dynamic pre-training strategy
+that trains variants of the proposed DyNet concurrently, thereby achieving a
+50% reduction in GPU hours. To address the lack of a large-scale dataset
+required for pre-training, we curate Million-IRD, a high-quality,
+high-resolution image dataset with 2M image samples. We validate our DyNet for
+image denoising, deraining, and dehazing in the all-in-one setting, achieving
+state-of-the-art results with a 31.34% reduction in GFlops and a 56.75%
+reduction in parameters compared to baseline models. The source codes and
+trained models are available at https://github.com/akshaydudhane16/DyNet.
+
+
+
+
+
+ + ☆ GeneAvatar: Generic Expression-Aware Volumetric Head Avatar Editing from + a Single Image CVPR 2024 + + +
+ Recently, we have witnessed the explosive growth of various volumetric
+representations for modeling animatable head avatars. However, due to the
+diversity of frameworks, there is no practical method to support high-level
+applications like 3D head avatar editing across different representations. In
+this paper, we propose a generic avatar editing approach that can be
+universally applied to various 3DMM-driven volumetric head avatars. To achieve
+this goal, we design a novel expression-aware modification generative model,
+which enables lifting 2D edits from a single image into a consistent 3D
+modification field. To ensure the effectiveness of the generative modification
+process, we develop several techniques, including an expression-dependent
+modification distillation scheme to draw knowledge from the large-scale head
+avatar model and 2D facial texture editing tools, implicit latent space
+guidance to enhance model convergence, and a segmentation-based loss
+reweighting strategy for fine-grained texture inversion. Extensive experiments
+demonstrate that our method delivers high-quality and consistent results across
+multiple expressions and viewpoints. Project page:
+https://zju3dv.github.io/geneavatar/
+
+
+ comment: Accepted to CVPR 2024. Project page: + https://zju3dv.github.io/geneavatar/ +
+
+
+
+
+ + ☆ Diffusion$^2$: Dynamic 3D Content Generation via Score Composition of + Orthogonal Diffusion Models + + +
+ Recent advancements in 3D generation are predominantly propelled by
+improvements in 3D-aware image diffusion models, which are pretrained on
+Internet-scale image data and fine-tuned on massive 3D data, offering the
+capability of producing highly consistent multi-view images. However, due to
+the scarcity of synchronized multi-view video data, it is impractical to adapt
+this paradigm to 4D generation directly. Despite that, the available video and
+3D data are adequate for training video and multi-view diffusion models that
+can provide satisfactory dynamic and geometric priors respectively. In this
+paper, we present Diffusion$^2$, a novel framework for dynamic 3D content
+creation that leverages the knowledge about geometric consistency and temporal
+smoothness from these models to directly sample dense multi-view and
+multi-frame images, which can be employed to optimize a continuous 4D
+representation. Specifically, we design a simple yet effective denoising
+strategy via score composition of video and multi-view diffusion models based
+on the probability structure of the images to be generated. Owing to the high
+parallelism of the image generation and the efficiency of the modern 4D
+reconstruction pipeline, our framework can generate 4D content within a few
+minutes. Furthermore, our method circumvents the reliance on 4D data, thereby
+having the potential to benefit from the scalability of the foundation video
+and multi-view diffusion models. Extensive experiments demonstrate the efficacy
+of our proposed framework and its capability to flexibly adapt to various types
+of prompts.
+
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ Iterated Learning Improves Compositionality in Large Vision-Language + Models CVPR 2024 + + +
+ A fundamental characteristic common to both human vision and natural language
+is their compositional nature. Yet, despite the performance gains contributed
+by large vision and language pretraining, recent investigations find that most,
+if not all, of our state-of-the-art vision-language models struggle at
+compositionality. They are unable to distinguish between images of "a girl in
+white facing a man in black" and "a girl in black facing a man in white".
+Moreover, prior work suggests that compositionality doesn't arise with scale:
+larger model sizes or training data don't help. This paper develops a new
+iterated training algorithm that incentivizes compositionality. We draw on
+decades of cognitive science research that identifies cultural transmission,
+the need to teach a new generation, as a necessary inductive prior that
+incentivizes humans to develop compositional languages. Specifically, we
+reframe vision-language contrastive learning as the Lewis Signaling Game
+between a vision agent and a language agent, and operationalize cultural
+transmission by iteratively resetting one of the agent's weights during
+training. After every iteration, this training paradigm induces representations
+that become "easier to learn", a property of compositional languages: e.g., our
+model trained on CC3M and CC12M improves standard CLIP by 4.7% and 4.0%,
+respectively, on the SugarCrepe benchmark.
+
+
+ comment: CVPR 2024 +
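+<p>
+  Operationally, the cultural-transmission prior comes down to periodically re-initializing
+  one of the two agents during contrastive training. A schematic loop under that reading
+  (the reset period, which agent is reset, and the training step are placeholders, not the
+  paper's exact schedule):
+</p>
+<pre><code class="language-python">
+def iterated_contrastive_training(vision_agent, language_agent, reinit_fn, loader,
+                                  contrastive_step, reset_every=5000):
+    """CLIP-style contrastive training where one agent is periodically reborn."""
+    for step, (images, texts) in enumerate(loader):
+        contrastive_step(vision_agent, language_agent, images, texts)
+        if step > 0 and step % reset_every == 0:
+            reinit_fn(language_agent)   # the new generation must relearn from its partner
+</code></pre>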
+
+
+
+
+ + ☆ ResNet with Integrated Convolutional Block Attention Module for Ship + Classification Using Transfer Learning on Optical Satellite Imagery + + +
+ This study proposes a novel transfer learning framework for effective ship +classification using high-resolution optical remote sensing satellite imagery. +The framework is based on the deep convolutional neural network model ResNet50 +and incorporates the Convolutional Block Attention Module (CBAM) to enhance +performance. CBAM enables the model to attend to salient features in the +images, allowing it to better discriminate between subtle differences between +ships and backgrounds. Furthermore, this study adopts a transfer learning +approach tailored for accurately classifying diverse types of ships by +fine-tuning a pre-trained model for the specific task. Experimental results +demonstrate the efficacy of the proposed framework in ship classification using +optical remote sensing imagery, achieving a high classification accuracy of 94% +across 5 classes, outperforming existing methods. This research holds potential +applications in maritime surveillance and management, illegal fishing +detection, and maritime traffic monitoring. + +
+
+
+
+
+ + ☆ ViTamin: Designing Scalable Vision Models in the Vision-Language Era CVPR 2024 + + +
+ Recent breakthroughs in vision-language models (VLMs) start a new page in the
+vision community. The VLMs provide stronger and more generalizable feature
+embeddings compared to those from ImageNet-pretrained models, thanks to the
+training on large-scale Internet image-text pairs. However, despite the amazing
+achievements of the VLMs, vanilla Vision Transformers (ViTs) remain the default
+choice for the image encoder. Although the pure transformer has proven
+effective for text encoding, it remains questionable whether it is also the
+case for image encoding, especially considering that various types of networks
+have been proposed on the ImageNet benchmark, which, unfortunately, are rarely
+studied in VLMs. Due to the small data/model scale, the original conclusions of
+model design on ImageNet can be limited and biased. In this paper, we aim at
+building an evaluation protocol of vision models in the vision-language era
+under the contrastive language-image pretraining (CLIP) framework. We provide a
+comprehensive way to benchmark different vision models, covering their
+zero-shot performance and scalability in both model and training data sizes. To
+this end, we introduce ViTamin, a new vision model tailored for VLMs. ViTamin-L
+significantly outperforms ViT-L by 2.0% ImageNet zero-shot accuracy, when using
+the same publicly available DataComp-1B dataset and the same OpenCLIP training
+scheme. ViTamin-L presents promising results on 60 diverse benchmarks,
+including classification, retrieval, open-vocabulary detection and
+segmentation, and large multi-modal models. When further scaling up the model
+size, our ViTamin-XL with only 436M parameters attains 82.9% ImageNet zero-shot
+accuracy, surpassing the 82.0% achieved by EVA-E, which has ten times more
+parameters (4.4B).
+
+
+ comment: CVPR 2024; https://github.com/Beckschen/ViTamin +
+
+
+
+
+ + ☆ 3D Congealing: 3D-Aware Image Alignment in the Wild + + +
+ We propose 3D Congealing, a novel problem of 3D-aware alignment for 2D images
+capturing semantically similar objects. Given a collection of unlabeled
+Internet images, our goal is to associate the shared semantic parts from the
+inputs and aggregate the knowledge from 2D images into a shared 3D canonical
+space. We introduce a general framework that tackles the task without assuming
+shape templates, poses, or any camera parameters. At its core is a canonical 3D
+representation that encapsulates geometric and semantic information. The
+framework optimizes for the canonical representation together with the pose for
+each input image, and a per-image coordinate map that warps 2D pixel
+coordinates to the 3D canonical frame to account for the shape matching. The
+optimization procedure fuses prior knowledge from a pre-trained image
+generative model and semantic information from input images. The former
+provides strong knowledge guidance for this under-constrained task, while the
+latter provides the necessary information to mitigate the training data bias
+from the pre-trained model. Our framework can be used for various tasks such as
+correspondence matching, pose estimation, and image editing, achieving strong
+results on real-world image datasets under challenging illumination conditions
+and on in-the-wild online image collections.
+
+
+ comment: Project page: + https://ai.stanford.edu/~yzzhang/projects/3d-congealing/ +
+
+
+
+
+ + ☆ Pre-trained Vision and Language Transformers Are Few-Shot Incremental + Learners CVPR 2024 + + +
+ Few-Shot Class Incremental Learning (FSCIL) is a task that requires a model +to learn new classes incrementally without forgetting when only a few samples +for each class are given. FSCIL encounters two significant challenges: +catastrophic forgetting and overfitting, and these challenges have driven prior +studies to primarily rely on shallow models, such as ResNet-18. Even though +their limited capacity can mitigate both forgetting and overfitting issues, it +leads to inadequate knowledge transfer during few-shot incremental sessions. In +this paper, we argue that large models such as vision and language transformers +pre-trained on large datasets can be excellent few-shot incremental learners. +To this end, we propose a novel FSCIL framework called PriViLege, Pre-trained +Vision and Language transformers with prompting functions and knowledge +distillation. Our framework effectively addresses the challenges of +catastrophic forgetting and overfitting in large models through new pre-trained +knowledge tuning (PKT) and two losses: entropy-based divergence loss and +semantic knowledge distillation loss. Experimental results show that the +proposed PriViLege significantly outperforms the existing state-of-the-art +methods with a large margin, e.g., +9.38% in CUB200, +20.58% in CIFAR-100, and ++13.36% in miniImageNet. Our implementation code is available at +https://github.com/KHU-AGI/PriViLege. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ ImageNot: A contrast with ImageNet preserves model rankings + + +
+ We introduce ImageNot, a dataset designed to match the scale of ImageNet +while differing drastically in other aspects. We show that key model +architectures developed for ImageNet over the years rank identically when +trained and evaluated on ImageNot to how they rank on ImageNet. This is true +when training models from scratch or fine-tuning them. Moreover, the relative +improvements of each model over earlier models strongly correlate in both +datasets. We further give evidence that ImageNot has a similar utility as +ImageNet for transfer learning purposes. Our work demonstrates a surprising +degree of external validity in the relative performance of image classification +models. This stands in contrast with absolute accuracy numbers that typically +drop sharply even under small changes to a dataset. + +
+
+
+
+
+ + ☆ Neural Ordinary Differential Equation based Sequential Image + Registration for Dynamic Characterization CVPR 2022 + + +
+ Deformable image registration (DIR) is crucial in medical image analysis, +enabling the exploration of biological dynamics such as organ motions and +longitudinal changes in imaging. Leveraging Neural Ordinary Differential +Equations (ODE) for registration, this extension work discusses how this +framework can aid in the characterization of sequential biological processes. +Utilizing the Neural ODE's ability to model state derivatives with neural +networks, our Neural Ordinary Differential Equation Optimization-based (NODEO) +framework considers voxels as particles within a dynamic system, defining +deformation fields through the integration of neural differential equations. +This method learns dynamics directly from data, bypassing the need for physical +priors, making it exceptionally suitable for medical scenarios where such +priors are unavailable or inapplicable. Consequently, the framework can discern +underlying dynamics and use sequence data to regularize the transformation +trajectory. We evaluated our framework on two clinical datasets: one for +cardiac motion tracking and another for longitudinal brain MRI analysis. +Demonstrating its efficacy in both 2D and 3D imaging scenarios, our framework +offers flexibility and model agnosticism, capable of managing image sequences +and facilitating label propagation throughout these sequences. This study +provides a comprehensive understanding of how the Neural ODE-based framework +uniquely benefits the image registration challenge. + +
+
+ comment: Journal extension of NODEO: A Neural Ordinary Differential Equation + Based Optimization Framework for Deformable Image Registration, CVPR 2022 +
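+<p>
+  Conceptually, the deformation is obtained by integrating a learned velocity field over
+  pseudo-time with an ODE solver. A compact sketch using torchdiffeq; the velocity network,
+  integration horizon, and particle representation are illustrative, not the released
+  implementation.
+</p>
+<pre><code class="language-python">
+import torch
+from torchdiffeq import odeint
+
+class VelocityField(torch.nn.Module):
+    def __init__(self, dim=3, hidden=64):
+        super().__init__()
+        self.net = torch.nn.Sequential(
+            torch.nn.Linear(dim, hidden), torch.nn.Tanh(), torch.nn.Linear(hidden, dim))
+
+    def forward(self, t, x):              # dx/dt = v(x); x holds particle (voxel) coordinates
+        return self.net(x)
+
+def deform(points, field, steps=8):
+    t = torch.linspace(0.0, 1.0, steps)
+    trajectory = odeint(field, points, t) # integrate particle trajectories through the field
+    return trajectory[-1]                 # final positions define the deformation
+</code></pre>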
+
+
+
+
+ + ☆ CameraCtrl: Enabling Camera Control for Text-to-Video Generation + + +
+ Controllability plays a crucial role in video generation since it allows users
+to create desired content. However, existing models largely overlook the
+precise control of camera pose, which serves as a cinematic language to express
+deeper narrative nuances. To alleviate this issue, we introduce CameraCtrl,
+enabling accurate camera pose control for text-to-video (T2V) models. After
+precisely parameterizing the camera trajectory, a plug-and-play camera module
+is trained on a T2V model, leaving the other components untouched.
+Additionally, a comprehensive study on the effect of various datasets is
+conducted, suggesting that videos with diverse camera distributions and similar
+appearances indeed enhance controllability and generalization. Experimental
+results demonstrate the effectiveness of CameraCtrl in achieving precise and
+domain-adaptive camera control, marking a step forward in the pursuit of
+dynamic and customized video storytelling from textual and camera pose inputs.
+Our project website is at: https://hehao13.github.io/projects-CameraCtrl/.
+
+
+ comment: Project page: https://hehao13.github.io/projects-CameraCtrl/ Code: + https://github.com/hehao13/CameraCtrl +
+
+
+
+
+ + ☆ BRAVEn: Improving Self-Supervised Pre-training for Visual and Auditory + Speech Recognition ICASSP 2024 + + +
+ Self-supervision has recently shown great promise for learning visual and +auditory speech representations from unlabelled data. In this work, we propose +BRAVEn, an extension to the recent RAVEn method, which learns speech +representations entirely from raw audio-visual data. Our modifications to RAVEn +enable BRAVEn to achieve state-of-the-art results among self-supervised methods +in various settings. Moreover, we observe favourable scaling behaviour by +increasing the amount of unlabelled data well beyond other self-supervised +works. In particular, we achieve 20.0% / 1.7% word error rate for VSR / ASR on +the LRS3 test set, with only 30 hours of labelled data and no external ASR +models. Our results suggest that readily available unlabelled audio-visual data +can largely replace costly transcribed data. + +
+
+ comment: ICASSP 2024. Code: https://github.com/ahaliassos/raven +
+
+
+
+
+ + ☆ Adaptive Feature Fusion Neural Network for Glaucoma Segmentation on + Unseen Fundus Images + + +
+ Fundus image segmentation on unseen domains is challenging, especially for
+over-parameterized deep models trained on small medical datasets. To address
+this challenge, we propose a method named Adaptive Feature-fusion Neural
+Network (AFNN) for glaucoma segmentation on unseen domains, which mainly
+consists of three modules: a domain adaptor, a feature-fusion network, and
+self-supervised multi-task learning. Specifically, the domain adaptor helps the
+pre-trained model quickly adapt from other image domains to the medical fundus
+image domain. The feature-fusion network and self-supervised multi-task
+learning for the encoder and decoder are introduced to improve the domain
+generalization ability. In addition, we also design a weighted Dice loss to
+improve model performance on the complex optic-cup segmentation task. Our
+proposed method achieves competitive performance over existing fundus
+segmentation methods on four public glaucoma datasets.
+
+
+ comment: 17 pages, 11 figures +
+
+
+
+
+ + ☆ WcDT: World-centric Diffusion Transformer for Traffic Scene Generation + + +
+ In this paper, we introduce a novel approach for autonomous driving +trajectory generation by harnessing the complementary strengths of diffusion +probabilistic models (a.k.a., diffusion models) and transformers. Our proposed +framework, termed the "World-Centric Diffusion Transformer" (WcDT), optimizes +the entire trajectory generation process, from feature extraction to model +inference. To enhance the scene diversity and stochasticity, the historical +trajectory data is first preprocessed and encoded into latent space using +Denoising Diffusion Probabilistic Models (DDPM) enhanced with Diffusion with +Transformer (DiT) blocks. Then, the latent features, historical trajectories, +HD map features, and historical traffic signal information are fused with +various transformer-based encoders. The encoded traffic scenes are then decoded +by a trajectory decoder to generate multimodal future trajectories. +Comprehensive experimental results show that the proposed approach exhibits +superior performance in generating both realistic and diverse trajectories, +showing its potential for integration into automatic driving simulation +systems. + +
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ EGTR: Extracting Graph from Transformer for Scene Graph Generation CVPR 2024 + + +
+ Scene Graph Generation (SGG) is a challenging task of detecting objects and +predicting relationships between objects. After DETR was developed, one-stage +SGG models based on a one-stage object detector have been actively studied. +However, complex modeling is used to predict the relationship between objects, +and the inherent relationship between object queries learned in the multi-head +self-attention of the object detector has been neglected. We propose a +lightweight one-stage SGG model that extracts the relation graph from the +various relationships learned in the multi-head self-attention layers of the +DETR decoder. By fully utilizing the self-attention by-products, the relation +graph can be extracted effectively with a shallow relation extraction head. +Considering the dependency of the relation extraction task on the object +detection task, we propose a novel relation smoothing technique that adjusts +the relation label adaptively according to the quality of the detected objects. +By the relation smoothing, the model is trained according to the continuous +curriculum that focuses on object detection task at the beginning of training +and performs multi-task learning as the object detection performance gradually +improves. Furthermore, we propose a connectivity prediction task that predicts +whether a relation exists between object pairs as an auxiliary task of the +relation extraction. We demonstrate the effectiveness and efficiency of our +method for the Visual Genome and Open Image V6 datasets. Our code is publicly +available at https://github.com/naver-ai/egtr . + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Red-Teaming Segment Anything Model CVPR 2024 + + +
+ Foundation models have emerged as pivotal tools, tackling many complex tasks +through pre-training on vast datasets and subsequent fine-tuning for specific +applications. The Segment Anything Model is one of the first and most +well-known foundation models for computer vision segmentation tasks. This work +presents a multi-faceted red-teaming analysis that tests the Segment Anything +Model against challenging tasks: (1) We analyze the impact of style transfer on +segmentation masks, demonstrating that applying adverse weather conditions and +raindrops to dashboard images of city roads significantly distorts generated +masks. (2) We focus on assessing whether the model can be used for attacks on +privacy, such as recognizing celebrities' faces, and show that the model +possesses some undesired knowledge in this task. (3) Finally, we check how +robust the model is to adversarial attacks on segmentation masks under text +prompts. We not only show the effectiveness of popular white-box attacks and +resistance to black-box attacks but also introduce a novel approach - Focused +Iterative Gradient Attack (FIGA) that combines white-box approaches to +construct an efficient attack resulting in a smaller number of modified pixels. +All of our testing methods and analyses indicate a need for enhanced safety +measures in foundation models for image segmentation. + +
+
+ comment: CVPR 2024 - The 4th Workshop of Adversarial Machine Learning on + Computer Vision: Robustness of Foundation Models +
+
+
+
+
+ + ☆ Multi-Level Label Correction by Distilling Proximate Patterns for + Semi-supervised Semantic Segmentation + + +
+ Semi-supervised semantic segmentation relieves the reliance on large-scale
+labeled data by leveraging unlabeled data. Recent semi-supervised semantic
+segmentation approaches mainly resort to pseudo-labeling methods to exploit
+unlabeled data. However, unreliable pseudo-labeling can undermine the
+semi-supervision process. In this paper, we propose an algorithm called
+Multi-Level Label Correction (MLLC), which aims to use graph neural networks to
+capture structural relationships in Semantic-Level Graphs (SLGs) and
+Class-Level Graphs (CLGs) to rectify erroneous pseudo-labels. Specifically,
+SLGs represent semantic affinities between pairs of pixel features, and CLGs
+describe classification consistencies between pairs of pixel labels. With the
+support of proximate pattern information from graphs, MLLC can rectify
+incorrectly predicted pseudo-labels and facilitate discriminative feature
+representations. We design an end-to-end network to train and perform this
+effective label correction mechanism. Experiments demonstrate that MLLC
+significantly improves supervised baselines and outperforms state-of-the-art
+approaches in different scenarios on the Cityscapes and PASCAL VOC 2012
+datasets. Specifically, MLLC improves the supervised baseline by at least 5%
+and 2% with DeepLabV2 and DeepLabV3+, respectively, under different partition
+protocols.
+
+
+ comment: 12 pages, 8 figures. IEEE Transactions on Multimedia, 2024 +
+
+
+
+
+ + ☆ IISAN: Efficiently Adapting Multimodal Representation for Sequential + Recommendation with Decoupled PEFT SIGIR2024 + + +
+ Multimodal foundation models are transformative in sequential recommender +systems, leveraging powerful representation learning capabilities. While +Parameter-efficient Fine-tuning (PEFT) is commonly used to adapt foundation +models for recommendation tasks, most research prioritizes parameter +efficiency, often overlooking critical factors like GPU memory efficiency and +training speed. Addressing this gap, our paper introduces IISAN (Intra- and +Inter-modal Side Adapted Network for Multimodal Representation), a simple +plug-and-play architecture using a Decoupled PEFT structure and exploiting both +intra- and inter-modal adaptation. + IISAN matches the performance of full fine-tuning (FFT) and state-of-the-art +PEFT. More importantly, it significantly reduces GPU memory usage - from 47GB +to just 3GB for multimodal sequential recommendation tasks. Additionally, it +accelerates training time per epoch from 443s to 22s compared to FFT. This is +also a notable improvement over the Adapter and LoRA, which require 37-39 GB +GPU memory and 350-380 seconds per epoch for training. + Furthermore, we propose a new composite efficiency metric, TPME +(Training-time, Parameter, and GPU Memory Efficiency) to alleviate the +prevalent misconception that "parameter efficiency represents overall +efficiency". TPME provides more comprehensive insights into practical +efficiency comparisons between different methods. Besides, we give an +accessible efficiency analysis of all PEFT and FFT approaches, which +demonstrate the superiority of IISAN. We release our codes and other materials +at https://github.com/jjGenAILab/IISAN. + +
+
+ comment: Accepted by SIGIR2024 +
+
+
+
+
+ + ☆ Causality-based Transfer of Driving Scenarios to Unseen Intersections + + +
+ Scenario-based testing of automated driving functions has become a promising +method to reduce time and cost compared to real-world testing. In +scenario-based testing automated functions are evaluated in a set of +pre-defined scenarios. These scenarios provide information about vehicle +behaviors, environmental conditions, or road characteristics using parameters. +To create realistic scenarios, parameters and parameter dependencies have to be +fitted utilizing real-world data. However, due to the large variety of +intersections and movement constellations found in reality, data may not be +available for certain scenarios. This paper proposes a methodology to +systematically analyze relations between parameters of scenarios. Bayesian +networks are utilized to analyze causal dependencies in order to decrease the +amount of required data and to transfer causal patterns creating unseen +scenarios. Thereby, infrastructural influences on movement patterns are +investigated to generate realistic scenarios on unobserved intersections. For +evaluation, scenarios and underlying parameters are extracted from the inD +dataset. Movement patterns are estimated, transferred and checked against +recorded data from those initially unseen intersections. + +
+
+ comment: 6 pages, 8 figures, 1 table, Accepted to be published as part of the + 35th IEEE Intelligent Vehicles Symposium, June 2 - 5, 2024, Korea +
+
+
+
+
+ + ☆ SelfPose3d: Self-Supervised Multi-Person Multi-View 3d Pose Estimation CVPR 2024 + + +
+ We present a new self-supervised approach, SelfPose3d, for estimating 3d +poses of multiple persons from multiple camera views. Unlike current +state-of-the-art fully-supervised methods, our approach does not require any 2d +or 3d ground-truth poses and uses only the multi-view input images from a +calibrated camera setup and 2d pseudo poses generated from an off-the-shelf 2d +human pose estimator. We propose two self-supervised learning objectives: +self-supervised person localization in 3d space and self-supervised 3d pose +estimation. We achieve self-supervised 3d person localization by training the +model on synthetically generated 3d points, serving as 3d person root +positions, and on the projected root-heatmaps in all the views. We then model +the 3d poses of all the localized persons with a bottleneck representation, map +them onto all views obtaining 2d joints, and render them using 2d Gaussian +heatmaps in an end-to-end differentiable manner. Afterwards, we use the +corresponding 2d joints and heatmaps from the pseudo 2d poses for learning. To +alleviate the intrinsic inaccuracy of the pseudo labels, we propose an adaptive +supervision attention mechanism to guide the self-supervision. Our experiments +and analysis on three public benchmark datasets, including Panoptic, Shelf, and +Campus, show the effectiveness of our approach, which is comparable to +fully-supervised methods. Code is available at +\url{https://github.com/CAMMA-public/SelfPose3D} + +
+
+ comment: Accepted for CVPR 2024 +
+
+
+
+
+ + ☆ Specularity Factorization for Low-Light Enhancement CVPR 2024 + + +
+ We present a new additive image factorization technique that treats images to +be composed of multiple latent specular components which can be simply +estimated recursively by modulating the sparsity during decomposition. Our +model-driven {\em RSFNet} estimates these factors by unrolling the optimization +into network layers requiring only a few scalars to be learned. The resultant +factors are interpretable by design and can be fused for different image +enhancement tasks via a network or combined directly by the user in a +controllable fashion. Based on RSFNet, we detail a zero-reference Low Light +Enhancement (LLE) application trained without paired or unpaired supervision. +Our system improves the state-of-the-art performance on standard benchmarks and +achieves better generalization on multiple other datasets. We also integrate +our factors with other task specific fusion networks for applications like +deraining, deblurring and dehazing with negligible overhead thereby +highlighting the multi-domain and multi-task generalizability of our proposed +RSFNet. The code and data is released for reproducibility on the project +homepage. + +
+
+ comment: CVPR 2024, Pages: 8(main)+4(references)+17(supp) = 29 +
+
+
+
+
+ + ☆ A discussion about violin reduction: geometric analysis of contour lines + and channel of minima + + +
+ Some early violins have been reduced during their history to fit imposed +morphological standards, while more recent ones have been built directly to +these standards. We can observe differences between reduced and unreduced +instruments, particularly in their contour lines and channel of minima. In a +recent preliminary work, we computed and highlighted those two features for two +instruments using triangular 3D meshes acquired by photogrammetry, whose +fidelity has been assessed and validated with sub-millimetre accuracy. We +propose here an extension to a corpus of 38 violins, violas and cellos, and +introduce improved procedures, leading to a stronger discussion of the +geometric analysis. We first recall the material we are working with. We then +discuss how to derive the best reference plane for the violin alignment, which +is crucial for the computation of contour lines and channel of minima. Finally, +we show how to compute efficiently both characteristics and we illustrate our +results with a few examples. + +
+
+ comment: Paper accepted (before reviewing) for the Florence Heri-Tech 2024 + Conference +
+
+
+
+
+ + ☆ DELAN: Dual-Level Alignment for Vision-and-Language Navigation by + Cross-Modal Contrastive Learning LREC + + +
+ Vision-and-Language navigation (VLN) requires an agent to navigate in unseen +environment by following natural language instruction. For task completion, the +agent needs to align and integrate various navigation modalities, including +instruction, observation and navigation history. Existing works primarily +concentrate on cross-modal attention at the fusion stage to achieve this +objective. Nevertheless, modality features generated by disparate uni-encoders +reside in their own spaces, leading to a decline in the quality of cross-modal +fusion and decision. To address this problem, we propose a Dual-levEL AligNment +(DELAN) framework by cross-modal contrastive learning. This framework is +designed to align various navigation-related modalities before fusion, thereby +enhancing cross-modal interaction and action decision-making. Specifically, we +divide the pre-fusion alignment into dual levels: instruction-history level and +landmark-observation level according to their semantic correlations. We also +reconstruct a dual-level instruction for adaptation to the dual-level +alignment. As the training signals for pre-fusion alignment are extremely +limited, self-supervised contrastive learning strategies are employed to +enforce the matching between different modalities. Our approach seamlessly +integrates with the majority of existing models, resulting in improved +navigation performance on various VLN benchmarks, including R2R, R4R, RxR and +CVDN. + +
+
+ comment: Accepted by LREC-COLING 2024 +
+
+
+
+
+ + ☆ Cooperative Students: Navigating Unsupervised Domain Adaptation in + Nighttime Object Detection + + +
+ Unsupervised Domain Adaptation (UDA) has shown significant advancements in +object detection under well-lit conditions; however, its performance degrades +notably in low-visibility scenarios, especially at night, posing challenges not +only for its adaptability in low signal-to-noise ratio (SNR) conditions but +also for the reliability and efficiency of automated vehicles. To address this +problem, we propose a \textbf{Co}operative \textbf{S}tudents (\textbf{CoS}) +framework that innovatively employs global-local transformations (GLT) and a +proxy-based target consistency (PTC) mechanism to capture the spatial +consistency in day- and night-time scenarios effectively, and thus bridge the +significant domain shift across contexts. Building upon this, we further devise +an adaptive IoU-informed thresholding (AIT) module to gradually avoid +overlooking potential true positives and enrich the latent information in the +target domain. Comprehensive experiments show that CoS essentially enhanced UDA +performance in low-visibility conditions and surpasses current state-of-the-art +techniques, achieving an increase in mAP of 3.0\%, 1.9\%, and 2.5\% on BDD100K, +SHIFT, and ACDC datasets, respectively. Code is available at +https://github.com/jichengyuan/Cooperitive_Students. + +
+
+ comment: Code is available at + https://github.com/jichengyuan/Cooperitive_Students +
+
+
+
+
+ + ☆ Fashion Style Editing with Generative Human Prior + + +
+ Image editing has been a long-standing challenge in the research community +with its far-reaching impact on numerous applications. Recently, text-driven +methods started to deliver promising results in domains like human faces, but +their applications to more complex domains have been relatively limited. In +this work, we explore the task of fashion style editing, where we aim to +manipulate the fashion style of human imagery using text descriptions. +Specifically, we leverage a generative human prior and achieve fashion style +editing by navigating its learned latent space. We first verify that the +existing text-driven editing methods fall short for our problem due to their +overly simplified guidance signal, and propose two directions to reinforce the +guidance: textual augmentation and visual referencing. Combined with our +empirical findings on the latent space structure, our Fashion Style Editing +framework (FaSE) successfully projects abstract fashion concepts onto human +images and introduces exciting new applications to the field. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Joint-Task Regularization for Partially Labeled Multi-Task Learning CVPR 2024 + + +
+ Multi-task learning has become increasingly popular in the machine learning +field, but its practicality is hindered by the need for large, labeled +datasets. Most multi-task learning methods depend on fully labeled datasets +wherein each input example is accompanied by ground-truth labels for all target +tasks. Unfortunately, curating such datasets can be prohibitively expensive and +impractical, especially for dense prediction tasks which require per-pixel +labels for each image. With this in mind, we propose Joint-Task Regularization +(JTR), an intuitive technique which leverages cross-task relations to +simultaneously regularize all tasks in a single joint-task latent space to +improve learning when data is not fully labeled for all tasks. JTR stands out +from existing approaches in that it regularizes all tasks jointly rather than +separately in pairs -- therefore, it achieves linear complexity relative to the +number of tasks while previous methods scale quadratically. To demonstrate the +validity of our approach, we extensively benchmark our method across a wide +variety of partially labeled scenarios based on NYU-v2, Cityscapes, and +Taskonomy. + +
+
+ comment: Accepted paper to CVPR 2024 (main conference) +
+
+
+
+
+ + ☆ CAM-Based Methods Can See through Walls + + +
+ CAM-based methods are widely used post-hoc interpretability methods that +produce a saliency map to explain the decision of an image classification +model. The saliency map highlights the important areas of the image relevant to +the prediction. In this paper, we show that most of these methods can +incorrectly attribute an importance score to parts of the image that the model +cannot see. We show that this phenomenon occurs both theoretically and +experimentally. On the theory side, we analyze the behavior of GradCAM on a +simple masked CNN model at initialization. Experimentally, we train a VGG-like +model constrained to not use the lower part of the image and nevertheless +observe positive scores in the unseen part of the image. This behavior is +evaluated quantitatively on two new datasets. We believe that this is +problematic, potentially leading to misinterpretation of the model's behavior. + +
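+ The check itself is easy to reproduce in spirit: compute a Grad-CAM map for an
+ input whose lower half is blanked out and measure how much saliency mass lands in
+ that region. A minimal sketch, assuming an off-the-shelf torchvision VGG as a
+ stand-in for the paper's constrained VGG-like model (the layer choice and the
+ 50% mask are illustrative, not the paper's setup):
+
+     import torch
+     import torch.nn.functional as F
+     from torchvision import models
+
+     # Stand-in model: the paper trains its own VGG-like net constrained to ignore
+     # the lower half of the image; an off-the-shelf VGG only illustrates the check.
+     model = models.vgg11(weights=None).eval()
+     target_layer = model.features[-1]
+
+     store = {}
+     target_layer.register_forward_hook(lambda m, i, o: store.update(act=o))
+     target_layer.register_full_backward_hook(lambda m, gi, go: store.update(grad=go[0]))
+
+     x = torch.randn(1, 3, 224, 224)
+     x[:, :, 112:, :] = 0.0                       # blank out the "unseen" lower half
+
+     logits = model(x)
+     cls = logits.argmax(dim=1).item()
+     logits[0, cls].backward()
+
+     weights = store["grad"].mean(dim=(2, 3), keepdim=True)           # pooled gradients
+     cam = F.relu((weights * store["act"]).sum(dim=1, keepdim=True))  # Grad-CAM map
+     cam = F.interpolate(cam, size=(224, 224), mode="bilinear", align_corners=False)
+
+     unseen_fraction = cam[:, :, 112:, :].sum() / (cam.sum() + 1e-8)
+     print(f"saliency mass attributed to the unseen half: {unseen_fraction.item():.3f}")
+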
+
+ comment: 25 pages, 9 figures +
+
+
+
+
+ + ☆ Bi-LORA: A Vision-Language Approach for Synthetic Image Detection + + +
+ Advancements in deep image synthesis techniques, such as generative +adversarial networks (GANs) and diffusion models (DMs), have ushered in an era +of generating highly realistic images. While this technological progress has +captured significant interest, it has also raised concerns about the potential +difficulty in distinguishing real images from their synthetic counterparts. +This paper takes inspiration from the potent convergence capabilities between +vision and language, coupled with the zero-shot nature of vision-language +models (VLMs). We introduce an innovative method called Bi-LORA that leverages +VLMs, combined with low-rank adaptation (LORA) tuning techniques, to enhance +the precision of synthetic image detection for unseen model-generated images. +The pivotal conceptual shift in our methodology revolves around reframing +binary classification as an image captioning task, leveraging the distinctive +capabilities of cutting-edge VLM, notably bootstrapping language image +pre-training (BLIP2). Rigorous and comprehensive experiments are conducted to +validate the effectiveness of our proposed approach, particularly in detecting +unseen diffusion-generated images from unknown diffusion-based generative +models during training, showcasing robustness to noise, and demonstrating +generalization capabilities to GANs. The obtained results showcase an +impressive average accuracy of 93.41% in synthetic image detection on unseen +generation models. The code and models associated with this research can be +publicly accessed at https://github.com/Mamadou-Keita/VLM-DETECT. + +
+
+
+
+
+ + ☆ Automatic Wood Pith Detector: Local Orientation Estimation and Robust + Accumulation ICPR 2024 + + +
+ A fully automated technique for wood pith detection (APD), relying on the +concentric shape of the structure of wood ring slices, is introduced. The +method estimates the ring's local orientations using the 2D structure tensor +and finds the pith position, optimizing a cost function designed for this +problem. We also present a variant (APD-PCL), using the parallel coordinates +space, that enhances the method's effectiveness when there are no clear tree +ring patterns. Furthermore, refining previous work by Kurdthongmee, a YoloV8 +net is trained for pith detection, producing a deep learning-based approach to +the same problem (APD-DL). All methods were tested on seven datasets, including +images captured under diverse conditions (controlled laboratory settings, +sawmill, and forest) and featuring various tree species (Pinus taeda, Douglas +fir, Abies alba, and Gleditsia triacanthos). All proposed approaches outperform +existing state-of-the-art methods and can be used in CPU-based real-time +applications. Additionally, we provide a novel dataset comprising images of +gymnosperm and angiosperm species. Dataset and source code are available at +http://github.com/hmarichal93/apd. + +
+
+ comment: 18 pages, presented to ICPR 2024 conference +
+
+
+
+
+ + ☆ Quantifying Noise of Dynamic Vision Sensor + + +
+ Dynamic vision sensors (DVS) are characterized by a large amount of +background activity (BA) noise, which is mixed with the original (cleaned) +sensor signal. The dynamic nature of the signal and the absence of ground truth +in practical applications make it difficult to distinguish between noise and +the cleaned sensor signals using standard image processing techniques. In this +letter, a new technique derived from Detrended Fluctuation Analysis (DFA) is +presented to characterise BA noise. The proposed technique can be used to +address existing DVS issues, namely how to quantitatively characterise noise +and signal without ground truth, and how to derive optimal denoising filter +parameters. The solution of the latter problem is demonstrated on the popular +real moving-car dataset. + +
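+ For reference, plain Detrended Fluctuation Analysis on an event-rate time series
+ can be sketched as below; this is generic DFA, not the paper's exact noise
+ characterisation, and the window sizes and Poisson toy series are illustrative.
+
+     import numpy as np
+
+     def dfa(signal, scales):
+         """Detrended Fluctuation Analysis: returns the scaling exponent alpha."""
+         x = np.asarray(signal, dtype=float)
+         y = np.cumsum(x - x.mean())                # integrated profile
+         fluct = []
+         for n in scales:
+             f2 = []
+             for k in range(len(y) // n):
+                 seg = y[k * n:(k + 1) * n]
+                 t = np.arange(n)
+                 trend = np.polyval(np.polyfit(t, seg, 1), t)   # local linear detrending
+                 f2.append(np.mean((seg - trend) ** 2))
+             fluct.append(np.sqrt(np.mean(f2)))
+         return np.polyfit(np.log(scales), np.log(fluct), 1)[0]
+
+     # Toy usage: a per-pixel (or per-region) event-rate series from a DVS recording.
+     # Uncorrelated noise gives alpha close to 0.5; correlated activity drifts away from it.
+     rate_series = np.random.poisson(lam=3.0, size=4096).astype(float)
+     print("DFA exponent:", dfa(rate_series, scales=[16, 32, 64, 128, 256]))
+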
+
+ comment: 5 pages, 4 figures, submitted to the IEEE Signal Processing Letters +
+
+
+
+
+ + ☆ Synthetic Data for Robust Stroke Segmentation + + +
+ Deep learning-based semantic segmentation in neuroimaging currently requires +high-resolution scans and extensive annotated datasets, posing significant +barriers to clinical applicability. We present a novel synthetic framework for +the task of lesion segmentation, extending the capabilities of the established +SynthSeg approach to accommodate large heterogeneous pathologies with +lesion-specific augmentation strategies. Our method trains deep learning +models, demonstrated here with the UNet architecture, using label maps derived +from healthy and stroke datasets, facilitating the segmentation of both healthy +tissue and pathological lesions without sequence-specific training data. +Evaluated against in-domain and out-of-domain (OOD) datasets, our framework +demonstrates robust performance, rivaling current methods within the training +domain and significantly outperforming them on OOD data. This contribution +holds promise for advancing medical imaging analysis in clinical settings, +especially for stroke pathology, by enabling reliable segmentation across +varied imaging sequences with reduced dependency on large annotated corpora. +Code and weights available at https://github.com/liamchalcroft/SynthStroke. + +
+
+
+
+
+ + ☆ Event-assisted Low-Light Video Object Segmentation CVPR 2024 + + +
+ In the realm of video object segmentation (VOS), the challenge of operating +under low-light conditions persists, resulting in notably degraded image +quality and compromised accuracy when comparing query and memory frames for +similarity computation. Event cameras, characterized by their high dynamic +range and ability to capture motion information of objects, offer promise in +enhancing object visibility and aiding VOS methods under such low-light +conditions. This paper introduces a pioneering framework tailored for low-light +VOS, leveraging event camera data to elevate segmentation accuracy. Our +approach hinges on two pivotal components: the Adaptive Cross-Modal Fusion +(ACMF) module, aimed at extracting pertinent features while fusing image and +event modalities to mitigate noise interference, and the Event-Guided Memory +Matching (EGMM) module, designed to rectify the issue of inaccurate matching +prevalent in low-light settings. Additionally, we present the creation of a +synthetic LLE-DAVIS dataset and the curation of a real-world LLE-VOS dataset, +encompassing frames and events. Experimental evaluations corroborate the +efficacy of our method across both datasets, affirming its effectiveness in +low-light scenarios. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Lookahead Exploration with Neural Radiance Representation for Continuous + Vision-Language Navigation CVPR 2024 + + +
+ Vision-and-language navigation (VLN) enables the agent to navigate to a +remote location following the natural language instruction in 3D environments. +At each navigation step, the agent selects from possible candidate locations +and then makes the move. For better navigation planning, the lookahead +exploration strategy aims to effectively evaluate the agent's next action by +accurately anticipating the future environment of candidate locations. To this +end, some existing works predict RGB images for future environments, while this +strategy suffers from image distortion and high computational cost. To address +these issues, we propose the pre-trained hierarchical neural radiance +representation model (HNR) to produce multi-level semantic features for future +environments, which are more robust and efficient than pixel-wise RGB +reconstruction. Furthermore, with the predicted future environmental +representations, our lookahead VLN model is able to construct the navigable +future path tree and select the optimal path via efficient parallel evaluation. +Extensive experiments on the VLN-CE datasets confirm the effectiveness of our +method. + +
+
+ comment: Accepted by CVPR 2024. The code is available at + https://github.com/MrZihan/HNR-VLN +
+
+
+
+
+ + ☆ LPSNet: End-to-End Human Pose and Shape Estimation with Lensless Imaging + + +
+ Human pose and shape (HPS) estimation with lensless imaging is not only +beneficial to privacy protection but also can be used in covert surveillance +scenarios due to the small size and simple structure of this device. However, +this task presents significant challenges due to the inherent ambiguity of the +captured measurements and lacks effective methods for directly estimating human +pose and shape from lensless data. In this paper, we propose the first +end-to-end framework to recover 3D human poses and shapes from lensless +measurements to our knowledge. We specifically design a multi-scale lensless +feature decoder to decode the lensless measurements through the optically +encoded mask for efficient feature extraction. We also propose a double-head +auxiliary supervision mechanism to improve the estimation accuracy of human +limb ends. Besides, we establish a lensless imaging system and verify the +effectiveness of our method on various datasets acquired by our lensless +imaging system. + +
+
+
+
+
+ + ☆ PREGO: online mistake detection in PRocedural EGOcentric videos CVPR 2024 + + +
+ Promptly identifying procedural errors from egocentric videos in an online +setting is highly challenging and valuable for detecting mistakes as soon as +they happen. This capability has a wide range of applications across various +fields, such as manufacturing and healthcare. The nature of procedural mistakes +is open-set since novel types of failures might occur, which calls for +one-class classifiers trained on correctly executed procedures. However, no +technique can currently detect open-set procedural mistakes online. We propose +PREGO, the first online one-class classification model for mistake detection in +PRocedural EGOcentric videos. PREGO is based on an online action recognition +component to model the current action, and a symbolic reasoning module to +predict the next actions. Mistake detection is performed by comparing the +recognized current action with the expected future one. We evaluate PREGO on +two procedural egocentric video datasets, Assembly101 and Epic-tent, which we +adapt for online benchmarking of procedural mistake detection to establish +suitable benchmarks, thus defining the Assembly101-O and Epic-tent-O datasets, +respectively. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Towards Enhanced Analysis of Lung Cancer Lesions in EBUS-TBNA -- A + Semi-Supervised Video Object Detection Method + + +
+ This study aims to establish a computer-aided diagnostic system for lung +lesions using bronchoscope endobronchial ultrasound (EBUS) to assist physicians +in identifying lesion areas. During EBUS-transbronchial needle aspiration +(EBUS-TBNA) procedures, physicians rely on grayscale ultrasound images to +determine the location of lesions. However, these images often contain +significant noise and can be influenced by surrounding tissues or blood +vessels, making interpretation challenging. Previous research has lacked the +application of object detection models to EBUS-TBNA, and there has been no +well-defined solution for annotating the EBUS-TBNA dataset. In related studies +on ultrasound images, although models have been successful in capturing target +regions for their respective tasks, their training and predictions have been +based on two-dimensional images, limiting their ability to leverage temporal +features for improved predictions. This study introduces a three-dimensional +image-based object detection model. It utilizes an attention mechanism to +capture temporal correlations and implements a filtering mechanism to +select relevant information from previous frames. Subsequently, a +teacher-student model training approach is employed to optimize the model +further, leveraging unlabeled data. To mitigate the impact of poor-quality +pseudo-labels on the student model, we add a Gaussian Mixture +Model (GMM) to ensure the quality of the pseudo-labels. + +
+
+
+
+
+ + ☆ Improving Bird's Eye View Semantic Segmentation by Task Decomposition CVPR 2024 + + +
+ Semantic segmentation in bird's eye view (BEV) plays a crucial role in +autonomous driving. Previous methods usually follow an end-to-end pipeline, +directly predicting the BEV segmentation map from monocular RGB inputs. +However, a challenge arises because the RGB inputs and BEV targets come from distinct +perspectives, making direct point-to-point prediction hard to optimize. In +this paper, we decompose the original BEV segmentation task into two stages, +namely BEV map reconstruction and RGB-BEV feature alignment. In the first +stage, we train a BEV autoencoder to reconstruct the BEV segmentation maps +given corrupted noisy latent representations, which urges the decoder to learn +fundamental knowledge of typical BEV patterns. The second stage involves +mapping RGB input images into the BEV latent space of the first stage, directly +optimizing the correlations between the two views at the feature level. Our +approach simplifies the complexity of combining perception and generation into +distinct steps, equipping the model to handle intricate and challenging scenes +effectively. Besides, we propose to transform the BEV segmentation map from the +Cartesian to the polar coordinate system to establish the column-wise +correspondence between RGB images and BEV maps. Moreover, our method requires +neither multi-scale features nor camera intrinsic parameters for depth +estimation and saves computational overhead. Extensive experiments on nuScenes +and Argoverse show the effectiveness and efficiency of our method. Code is +available at https://github.com/happytianhao/TaDe. + +
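+ The Cartesian-to-polar resampling of the BEV map is straightforward to sketch;
+ the grid sizes and nearest-neighbour lookup below are illustrative choices, not
+ the paper's implementation.
+
+     import numpy as np
+
+     def bev_cartesian_to_polar(bev, n_rho=128, n_theta=128):
+         """Resample a square Cartesian BEV map (ego at the center) onto a (rho, theta)
+         grid so that each column corresponds to one viewing direction."""
+         h, w = bev.shape[:2]
+         cy, cx = (h - 1) / 2.0, (w - 1) / 2.0
+         rho = np.linspace(0.0, min(cx, cy), n_rho)
+         theta = np.linspace(-np.pi, np.pi, n_theta, endpoint=False)
+         rr, tt = np.meshgrid(rho, theta, indexing="ij")
+         xs = np.clip(np.round(cx + rr * np.cos(tt)).astype(int), 0, w - 1)
+         ys = np.clip(np.round(cy + rr * np.sin(tt)).astype(int), 0, h - 1)
+         return bev[ys, xs]
+
+     polar = bev_cartesian_to_polar(np.random.rand(200, 200))
+     print(polar.shape)   # (128, 128)
+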
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Toward Efficient Visual Gyroscopes: Spherical Moments, Harmonics + Filtering, and Masking Techniques for Spherical Camera Applications IROS 2024 + + +
+ Unlike a traditional gyroscope, a visual gyroscope estimates camera rotation +through images. The integration of omnidirectional cameras, offering a larger +field of view compared to traditional RGB cameras, has proven to yield more +accurate and robust results. However, challenges arise in situations that lack +features, have substantial noise causing significant errors, and where certain +features in the images lack sufficient strength, leading to less precise +prediction results. + Here, we address these challenges by introducing a novel visual gyroscope, +which combines an analytical method with a neural network approach to provide a +more efficient and accurate rotation estimation from spherical images. The +presented method relies on three key contributions: an adapted analytical +approach to compute the spherical moments coefficients, introduction of masks +for better global feature representation, and the use of a multilayer +perceptron to adaptively choose the best combination of masks and filters. +Experimental results demonstrate superior performance of the proposed approach +in terms of accuracy. The paper emphasizes the advantages of integrating +machine learning to optimize analytical solutions, discusses limitations, and +suggests directions for future research. + +
+
+ comment: Submitted to 2024 IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS 2024) +
+
+
+
+
+ + ☆ VLRM: Vision-Language Models act as Reward Models for Image Captioning + + +
+ In this work, we present an unsupervised method for enhancing an image +captioning model (in our case, BLIP2) using reinforcement learning and +vision-language models like CLIP and BLIP2-ITM as reward models. The RL-tuned +model is able to generate longer and more comprehensive descriptions. Our model +reaches an impressive 0.90 R@1 CLIP Recall score on the MS-COCO Karpathy test split. + Weights are available at +https://huggingface.co/sashakunitsyn/vlrm-blip2-opt-2.7b. + +
+
+
+
+
+ + ☆ Minimize Quantization Output Error with Bias Compensation + + +
+ Quantization is a promising method that reduces memory usage and +computational intensity of Deep Neural Networks (DNNs), but it often leads to +significant output error that hinders model deployment. In this paper, we +propose Bias Compensation (BC) to minimize the output error, thus realizing +ultra-low-precision quantization without model fine-tuning. Instead of +optimizing the non-convex quantization process as in most previous methods, the +proposed BC bypasses this step and directly minimizes the quantization output error +by identifying a bias vector for compensation. We have established that the +minimization of output error through BC is a convex problem and provides an +efficient strategy to procure optimal solutions associated with minimal output +error, without the need for training or fine-tuning. We conduct extensive +experiments on Vision Transformer models and Large Language Models, and the +results show that our method notably reduces quantization output error, thereby +permitting ultra-low-precision post-training quantization and enhancing the +task performance of models. In particular, BC improves the accuracy of ViT-B with +4-bit PTQ4ViT by 36.89% on the ImageNet-1k task, and decreases the perplexity +of OPT-350M with 3-bit GPTQ by 5.97 on WikiText2. The code is available at +https://github.com/GongCheng1919/bias-compensation. + +
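+ At its core, choosing a bias vector to minimize the mean squared output error over
+ a calibration set is a convex problem with a closed-form answer: the per-channel
+ mean of the error. A toy sketch of that idea (tensor shapes and the synthetic
+ calibration data are hypothetical; the paper applies compensation per layer/operator):
+
+     import torch
+
+     def compensation_bias(fp_outputs, q_outputs):
+         """Bias minimizing ||(y_fp - y_q) - b||^2 over the calibration set:
+         the per-channel mean of the quantization error."""
+         return (fp_outputs - q_outputs).mean(dim=0)
+
+     fp = torch.randn(512, 768)                        # full-precision layer outputs
+     q = fp + 0.1 + 0.05 * torch.randn_like(fp)        # quantized outputs with systematic bias
+     b = compensation_bias(fp, q)
+
+     err_before = ((fp - q) ** 2).mean()
+     err_after = ((fp - (q + b)) ** 2).mean()
+     print(err_before.item(), err_after.item())        # the error drops after adding b
+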
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ ASTRA: An Action Spotting TRAnsformer for Soccer Videos + + +
+ In this paper, we introduce ASTRA, a Transformer-based model designed for the +task of Action Spotting in soccer matches. ASTRA addresses several challenges +inherent in the task and dataset, including the requirement for precise action +localization, the presence of a long-tail data distribution, non-visibility in +certain actions, and inherent label noise. To do so, ASTRA incorporates (a) a +Transformer encoder-decoder architecture to achieve the desired output temporal +resolution and to produce precise predictions, (b) a balanced mixup strategy to +handle the long-tail distribution of the data, (c) an uncertainty-aware +displacement head to capture the label variability, and (d) input audio signal +to enhance detection of non-visible actions. Results demonstrate the +effectiveness of ASTRA, achieving a tight Average-mAP of 66.82 on the test set. +Moreover, in the SoccerNet 2023 Action Spotting challenge, we secure the 3rd +position with an Average-mAP of 70.21 on the challenge set. + +
+
+
+
+
+ + ☆ RAVE: Residual Vector Embedding for CLIP-Guided Backlit Image + Enhancement + + +
+ In this paper we propose a novel modification of Contrastive Language-Image +Pre-Training (CLIP) guidance for the task of unsupervised backlit image +enhancement. Our work builds on the state-of-the-art CLIP-LIT approach, which +learns a prompt pair by constraining the text-image similarity between a prompt +(negative/positive sample) and a corresponding image (backlit image/well-lit +image) in the CLIP embedding space. Learned prompts then guide an image +enhancement network. Based on the CLIP-LIT framework, we propose two novel +methods for CLIP guidance. First, we show that instead of tuning prompts in the +space of text embeddings, it is possible to directly tune their embeddings in +the latent space without any loss in quality. This accelerates training and +potentially enables the use of additional encoders that do not have a text +encoder. Second, we propose a novel approach that does not require any prompt +tuning. Instead, based on CLIP embeddings of backlit and well-lit images from +training data, we compute the residual vector in the embedding space as a +simple difference between the mean embeddings of the well-lit and backlit +images. This vector then guides the enhancement network during training, +pushing a backlit image towards the space of well-lit images. This approach +further dramatically reduces training time, stabilizes training and produces +high quality enhanced images without artifacts, both in supervised and +unsupervised training regimes. Additionally, we show that residual vectors can +be interpreted, revealing biases in training data, and thereby enabling +potential bias correction. + +
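+ The residual vector itself reduces to a difference of mean embeddings, which can
+ then serve as a direction to align enhanced images with. A rough sketch, where
+ `encoder` stands for any CLIP-style image encoder and the cosine-alignment loss is
+ one plausible guidance term rather than the paper's exact formulation:
+
+     import torch
+     import torch.nn.functional as F
+
+     def residual_vector(encoder, backlit_batch, well_lit_batch):
+         """Mean well-lit embedding minus mean backlit embedding, unit-normalized."""
+         with torch.no_grad():
+             e_back = F.normalize(encoder(backlit_batch), dim=-1).mean(dim=0)
+             e_well = F.normalize(encoder(well_lit_batch), dim=-1).mean(dim=0)
+         return F.normalize(e_well - e_back, dim=-1)
+
+     def residual_guidance_loss(encoder, enhanced_batch, backlit_batch, v_residual):
+         """Push enhanced images along the residual direction relative to their inputs."""
+         e_enh = F.normalize(encoder(enhanced_batch), dim=-1)
+         e_back = F.normalize(encoder(backlit_batch), dim=-1)
+         delta = F.normalize(e_enh - e_back, dim=-1)
+         return (1.0 - (delta * v_residual).sum(dim=-1)).mean()
+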
+
+
+
+
+ + ☆ 3D Scene Generation from Scene Graphs and Self-Attention + + +
+ Synthesizing realistic and diverse indoor 3D scene layouts in a controllable +fashion opens up applications in simulated navigation and virtual reality. As +concise and robust representations of a scene, scene graphs have proven to be +well-suited for semantic control over the generated layout. We present a +variant of the conditional variational autoencoder (cVAE) model to synthesize +3D scenes from scene graphs and floor plans. We exploit the properties of +self-attention layers to capture high-level relationships between objects in a +scene, and use these as the building blocks of our model. Our model leverages +graph transformers to estimate the size, dimension and orientation of the +objects in a room while satisfying relationships in the given scene graph. Our +experiments show that self-attention layers lead to sparser (HOW MUCH) and more +diverse scenes (HOW MUCH). As part of this work, we publish the first +large-scale dataset for conditioned scene generation from scene graphs, +containing over XXX rooms (of floor plans and scene graphs). + +
+
+
+
+
+ + ☆ Scene Adaptive Sparse Transformer for Event-based Object Detection + + +
+ While recent Transformer-based approaches have shown impressive performances +on event-based object detection tasks, their high computational costs still +diminish the low power consumption advantage of event cameras. Image-based +works attempt to reduce these costs by introducing sparse Transformers. +However, they display inadequate sparsity and adaptability when applied to +event-based object detection, since these approaches cannot balance the fine +granularity of token-level sparsification and the efficiency of window-based +Transformers, leading to reduced performance and efficiency. Furthermore, they +lack scene-specific sparsity optimization, resulting in information loss and a +lower recall rate. To overcome these limitations, we propose the Scene Adaptive +Sparse Transformer (SAST). SAST enables window-token co-sparsification, +significantly enhancing fault tolerance and reducing computational overhead. +Leveraging the innovative scoring and selection modules, along with the Masked +Sparse Window Self-Attention, SAST showcases remarkable scene-aware +adaptability: It focuses only on important objects and dynamically optimizes +sparsity level according to scene complexity, maintaining a remarkable balance +between performance and computational cost. The evaluation results show that +SAST outperforms all other dense and sparse networks in both performance and +efficiency on two large-scale event-based object detection datasets (1Mpx and +Gen1). Code: https://github.com/Peterande/SAST + +
+
+
+
+
+ + ☆ Real, fake and synthetic faces - does the coin have three sides? + + +
+ With the ever-growing power of generative artificial intelligence, deepfake +and artificially generated (synthetic) media have continued to spread online, +which creates various ethical and moral concerns regarding their usage. To +tackle this, we present a novel exploration of the trends and patterns +observed in real, deepfake and synthetic facial images. The proposed analysis +is done in two parts: firstly, we incorporate eight deep learning models and +analyze their performances in distinguishing between the three classes of +images. Next, we delve further into the similarities and differences +between these three sets of images by investigating their image properties both +in the context of the entire image as well as in the context of specific +regions within the image. An ANOVA test was also performed and provided further +clarity on the patterns associated with the images of the three +classes. From our findings, we observe that the investigated deep learning +models found it easier to detect synthetic facial images, with the ViT Patch-16 +model performing best on this task with a class-averaged sensitivity, +specificity, precision, and accuracy of 97.37%, 98.69%, 97.48%, and 98.25%, +respectively. This observation was supported by further analysis of various +image properties. We saw noticeable differences across the three categories of +images. This analysis can help us build better algorithms for facial image +generation, and also shows that synthetic, deepfake and real face images are +indeed three different classes. + +
+
+
+
+
+ + ☆ Co-Speech Gesture Video Generation via Motion-Decoupled Diffusion Model CVPR 2024 + + +
+ Co-speech gestures, if presented in the lively form of videos, can achieve +superior visual effects in human-machine interaction. While previous works +mostly generate structural human skeletons, resulting in the omission of +appearance information, we focus on the direct generation of audio-driven +co-speech gesture videos in this work. There are two main challenges: 1) A +suitable motion feature is needed to describe complex human movements with +crucial appearance information. 2) Gestures and speech exhibit inherent +dependencies and should be temporally aligned even of arbitrary length. To +solve these problems, we present a novel motion-decoupled framework to generate +co-speech gesture videos. Specifically, we first introduce a well-designed +nonlinear TPS transformation to obtain latent motion features preserving +essential appearance information. Then a transformer-based diffusion model is +proposed to learn the temporal correlation between gestures and speech, and +performs generation in the latent motion space, followed by an optimal motion +selection module to produce long-term coherent and consistent gesture videos. +For better visual perception, we further design a refinement network focusing +on missing details of certain areas. Extensive experimental results show that +our proposed framework significantly outperforms existing approaches in both +motion and video-related evaluations. Our code, demos, and more resources are +available at https://github.com/thuhcsi/S2G-MDDiffusion. + +
+
+ comment: 22 pages, 8 figures, CVPR 2024 +
+
+
+
+
+ + ☆ Pairwise Similarity Distribution Clustering for Noisy Label Learning + + +
+ Noisy label learning aims to train deep neural networks using a large amount +of samples with noisy labels, whose main challenge comes from how to deal with +the inaccurate supervision caused by wrong labels. Existing works either take +the label correction or sample selection paradigm to involve more samples with +accurate labels into the training process. In this paper, we propose a simple +yet effective sample selection algorithm, termed as Pairwise Similarity +Distribution Clustering~(PSDC), to divide the training samples into one clean +set and another noisy set, which can power any of the off-the-shelf +semi-supervised learning regimes to further train networks for different +downstream tasks. Specifically, we take the pairwise similarity between sample +pairs to represent the sample structure, and the Gaussian Mixture Model~(GMM) +to model the similarity distribution between sample pairs belonging to the same +noisy cluster, therefore each sample can be confidently divided into the clean +set or noisy set. Even under severe label noise rate, the resulting data +partition mechanism has been proved to be more robust in judging the label +confidence in both theory and practice. Experimental results on various +benchmark datasets, such as CIFAR-10, CIFAR-100 and Clothing1M, demonstrate +significant improvements over state-of-the-art methods. + +
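+ A rough sketch of the selection step for one (possibly noisy) class: score each
+ sample by its similarity to its classmates, fit a two-component GMM on the scores,
+ and keep the high-similarity component as clean. Scoring by mean pairwise cosine
+ similarity is a simplification of the paper's pairwise-distribution modelling.
+
+     import numpy as np
+     from sklearn.mixture import GaussianMixture
+
+     def split_clean_noisy(features, eps=1e-8):
+         """Return a boolean mask marking samples judged clean within one class."""
+         f = features / (np.linalg.norm(features, axis=1, keepdims=True) + eps)
+         sim = f @ f.T
+         np.fill_diagonal(sim, 0.0)
+         scores = (sim.sum(axis=1) / (len(f) - 1)).reshape(-1, 1)   # mean similarity
+         gmm = GaussianMixture(n_components=2, random_state=0).fit(scores)
+         clean_comp = int(np.argmax(gmm.means_.ravel()))
+         return gmm.predict(scores) == clean_comp
+
+     feats = np.random.randn(200, 128)          # hypothetical per-class feature matrix
+     print(split_clean_noisy(feats).sum(), "samples kept as clean")
+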
+
+
+
+
+ + ☆ Sketch3D: Style-Consistent Guidance for Sketch-to-3D Generation + + +
+ Recently, image-to-3D approaches have achieved significant results with a +natural image as input. However, it is not always possible to access these +enriched color input samples in practical applications, where only sketches are +available. Existing sketch-to-3D researches suffer from limitations in broad +applications due to the challenges of lacking color information and multi-view +content. To overcome them, this paper proposes a novel generation paradigm +Sketch3D to generate realistic 3D assets with shape aligned with the input +sketch and color matching the textual description. Concretely, Sketch3D first +instantiates the given sketch in the reference image through the +shape-preserving generation process. Second, the reference image is leveraged +to deduce a coarse 3D Gaussian prior, and multi-view style-consistent guidance +images are generated based on the renderings of the 3D Gaussians. Finally, +three strategies are designed to optimize 3D Gaussians, i.e., structural +optimization via a distribution transfer mechanism, color optimization with a +straightforward MSE loss and sketch similarity optimization with a CLIP-based +geometric similarity loss. Extensive visual comparisons and quantitative +analysis illustrate the advantage of our Sketch3D in generating realistic 3D +assets while preserving consistency with the input. + +
+
+
+
+
+ + ☆ Semi-Supervised Domain Adaptation for Wildfire Detection + + +
+ Recently, both the frequency and intensity of wildfires have increased +worldwide, primarily due to climate change. In this paper, we propose a novel +protocol for wildfire detection, leveraging semi-supervised Domain Adaptation +for object detection, accompanied by a corresponding dataset designed for use +by both academics and industries. Our dataset encompasses 30 times more diverse +labeled scenes than the current largest benchmark wildfire dataset, HPWREN, and +introduces a new labeling policy for wildfire detection. Inspired by CoordConv, +we propose a robust baseline, Location-Aware Object Detection for +Semi-Supervised Domain Adaptation (LADA), utilizing a teacher-student based +framework capable of extracting translational variance features characteristic +of wildfires. Using only 1% of the target domain labeled data, our framework +significantly outperforms our source-only baseline by a notable margin of 3.8% +in mean Average Precision on the HPWREN wildfire dataset. Our dataset is +available at https://github.com/BloomBerry/LADA. + +
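+ For context, the CoordConv idea that the baseline builds on simply appends
+ normalized coordinate channels before a convolution, which is what lets a detector
+ pick up location-dependent (translational variance) cues. A generic sketch of such
+ a layer, not the paper's LADA module:
+
+     import torch
+     import torch.nn as nn
+
+     class CoordConv2d(nn.Module):
+         """Convolution with two extra channels holding normalized x/y coordinates."""
+         def __init__(self, in_ch, out_ch, **kwargs):
+             super().__init__()
+             self.conv = nn.Conv2d(in_ch + 2, out_ch, **kwargs)
+
+         def forward(self, x):
+             b, _, h, w = x.shape
+             ys = torch.linspace(-1.0, 1.0, h, device=x.device).view(1, 1, h, 1).expand(b, 1, h, w)
+             xs = torch.linspace(-1.0, 1.0, w, device=x.device).view(1, 1, 1, w).expand(b, 1, h, w)
+             return self.conv(torch.cat([x, xs, ys], dim=1))
+
+     layer = CoordConv2d(3, 16, kernel_size=3, padding=1)
+     print(layer(torch.randn(2, 3, 64, 64)).shape)   # torch.Size([2, 16, 64, 64])
+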
+
+ comment: 16 pages, 5 figures, 22 tables +
+
+
+
+
+ + ☆ Sparse Semi-DETR: Sparse Learnable Queries for Semi-Supervised Object + Detection CVPR2024 + + +
+ In this paper, we address the limitations of the DETR-based semi-supervised +object detection (SSOD) framework, particularly focusing on the challenges +posed by the quality of object queries. In DETR-based SSOD, the one-to-one +assignment strategy provides inaccurate pseudo-labels, while the one-to-many +assignment strategy leads to overlapping predictions. These issues compromise +training efficiency and degrade model performance, especially in detecting +small or occluded objects. We introduce Sparse Semi-DETR, a novel +transformer-based, end-to-end semi-supervised object detection solution to +overcome these challenges. Sparse Semi-DETR incorporates a Query Refinement +Module to enhance the quality of object queries, significantly improving +detection capabilities for small and partially obscured objects. Additionally, +we integrate a Reliable Pseudo-Label Filtering Module that selectively filters +high-quality pseudo-labels, thereby enhancing detection accuracy and +consistency. On the MS-COCO and Pascal VOC object detection benchmarks, Sparse +Semi-DETR achieves a significant improvement over current state-of-the-art +methods, highlighting its effectiveness in semi-supervised +object detection, particularly in challenging scenarios involving small or +partially obscured objects. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ Rethinking Annotator Simulation: Realistic Evaluation of Whole-Body PET + Lesion Interactive Segmentation Methods + + +
+ Interactive segmentation plays a crucial role in accelerating the annotation, +particularly in domains requiring specialized expertise such as nuclear +medicine. For example, annotating lesions in whole-body Positron Emission +Tomography (PET) images can require over an hour per volume. While previous +works evaluate interactive segmentation models through either real user studies +or simulated annotators, both approaches present challenges. Real user studies +are expensive and often limited in scale, while simulated annotators, also +known as robot users, tend to overestimate model performance due to their +idealized nature. To address these limitations, we introduce four evaluation +metrics that quantify the user shift between real and simulated annotators. In +an initial user study involving four annotators, we assess existing robot users +using our proposed metrics and find that robot users significantly deviate in +performance and annotation behavior compared to real annotators. Based on these +findings, we propose a more realistic robot user that reduces the user shift by +incorporating human factors such as click variation and inter-annotator +disagreement. We validate our robot user in a second user study, involving four +other annotators, and show it consistently reduces the simulated-to-real user +shift compared to traditional robot users. By employing our robot user, we can +conduct more large-scale and cost-efficient evaluations of interactive +segmentation models, while preserving the fidelity of real user studies. Our +implementation is based on MONAI Label and will be made publicly available. + +
+
+ comment: 10 pages, 5 figures, 1 table +
+
+
+
+
+ + ☆ Surface Reconstruction from Gaussian Splatting via Novel Stereo Views + + +
+ The Gaussian splatting for radiance field rendering method has recently +emerged as an efficient approach for accurate scene representation. It +optimizes the location, size, color, and shape of a cloud of 3D Gaussian +elements to visually match, after projection, or splatting, a set of given +images taken from various viewing directions. And yet, despite the proximity of +Gaussian elements to the shape boundaries, direct surface reconstruction of +objects in the scene is a challenge. + We propose a novel approach for surface reconstruction from Gaussian +splatting models. Rather than relying on the Gaussian elements' locations as a +prior for surface reconstruction, we leverage the superior novel-view synthesis +capabilities of 3DGS. To that end, we use the Gaussian splatting model to +render pairs of stereo-calibrated novel views from which we extract depth +profiles using a stereo matching method. We then combine the extracted RGB-D +images into a geometrically consistent surface. The resulting reconstruction is +more accurate and shows finer details when compared to other methods for +surface reconstruction from Gaussian splatting models, while requiring +significantly less compute time compared to other surface reconstruction +methods. + We performed extensive testing of the proposed method on in-the-wild scenes, +taken by a smartphone, showcasing its superior reconstruction abilities. +Additionally, we tested the proposed method on the Tanks and Temples benchmark, +and it has surpassed the current leading method for surface reconstruction from +Gaussian splatting models. Project page: https://gs2mesh.github.io/. + +
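+ The depth-extraction step can be sketched with any off-the-shelf stereo matcher
+ once two rectified novel views have been rendered with a known horizontal baseline;
+ the matcher settings and the pinhole relation z = f * B / d below are illustrative,
+ not the paper's exact pipeline.
+
+     import numpy as np
+     import cv2
+
+     def depth_from_stereo(left_gray, right_gray, focal_px, baseline_m, num_disp=128):
+         """Disparity -> depth for a rectified 8-bit grayscale stereo pair."""
+         matcher = cv2.StereoSGBM_create(minDisparity=0,
+                                         numDisparities=num_disp,   # multiple of 16
+                                         blockSize=5)
+         disp = matcher.compute(left_gray, right_gray).astype(np.float32) / 16.0
+         disp[disp <= 0] = np.nan                   # mark invalid matches
+         return focal_px * baseline_m / disp        # z = f * B / d
+
+     # Hypothetical usage with two rendered views:
+     # depth = depth_from_stereo(left, right, focal_px=1000.0, baseline_m=0.1)
+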
+
+ comment: Project Page: https://gs2mesh.github.io/ +
+
+
+
+
+ + ☆ EventSleep: Sleep Activity Recognition with Event Cameras + + +
+ Event cameras are a promising technology for activity recognition in dark +environments due to their unique properties. However, real event camera +datasets under low-lighting conditions are still scarce, which also limits the +number of approaches to solve these kinds of problems, hindering the potential +of this technology in many applications. We present EventSleep, a new dataset +and methodology to address this gap and study the suitability of event cameras +for a very relevant medical application: sleep monitoring for sleep disorders +analysis. The dataset contains synchronized event and infrared recordings +emulating common movements that happen during sleep, resulting in a new +challenging and unique dataset for activity recognition in dark environments. +Our novel pipeline is able to achieve high accuracy under these challenging +conditions and incorporates a Bayesian approach (Laplace ensembles) to increase +the robustness in the predictions, which is fundamental for medical +applications. Our work is the first application of Bayesian neural networks to +event cameras, the first use of Laplace ensembles in a realistic problem, and +also demonstrates for the first time the potential of event cameras in a new +application domain: to enhance current sleep evaluation procedures. Our +activity recognition results highlight the potential of event cameras under +dark conditions, their capacity and robustness for sleep activity +recognition, and open problems such as the adaptation of event data pre-processing +techniques to dark environments. + +
+
+
+
+
+ + ☆ Super-Resolution Analysis for Landfill Waste Classification + + +
+ Illegal landfills are a critical issue due to their environmental, economic, +and public health impacts. This study leverages aerial imagery for +environmental crime monitoring. While advances in artificial intelligence and +computer vision hold promise, the challenge lies in training models with +high-resolution literature datasets and adapting them to open-access +low-resolution images. Considering the substantial quality differences and +limited annotation, this research explores the adaptability of models across +these domains. Motivated by the necessity for a comprehensive evaluation of +waste detection algorithms, it advocates cross-domain classification and +super-resolution enhancement to analyze the impact of different image +resolutions on waste classification as an evaluation to combat the +proliferation of illegal landfills. We observed performance improvements by +enhancing image quality but noted an influence on model sensitivity, +necessitating careful threshold fine-tuning. + +
+
+ comment: This article has been accepted by the Symposium on Intelligent Data + Analysis (IDA 2024) +
+
+
+
+
+ + ☆ CSST Strong Lensing Preparation: a Framework for Detecting Strong Lenses + in the Multi-color Imaging Survey by the China Survey Space Telescope (CSST) + + +
+ Strong gravitational lensing is a powerful tool for investigating dark matter +and dark energy properties. With the advent of large-scale sky surveys, we can +discover strong lensing systems on an unprecedented scale, which requires +efficient tools to extract them from billions of astronomical objects. The +existing mainstream lens-finding tools are based on machine learning algorithms +and applied to cut-out-centered galaxies. However, according to the design and +survey strategy of optical surveys by CSST, preparing cutouts with multiple +bands requires considerable effort. To overcome these challenges, we have +developed a framework based on a hierarchical visual Transformer with a sliding +window technique to search for strong lensing systems within entire images. +Moreover, given that multi-color images of strong lensing systems can provide +insights into their physical characteristics, our framework is specifically +crafted to identify strong lensing systems in images with any number of +channels. As evaluated using CSST mock data based on a Semi-Analytic Model +named CosmoDC2, our framework achieves precision and recall rates of 0.98 and +0.90, respectively. To evaluate the effectiveness of our method in real +observations, we have applied it to a subset of images from the DESI Legacy +Imaging Surveys and media images from Euclid Early Release Observations. Our +method discovers 61 new strong lensing system candidates. However, we also +identified false positives arising primarily from the simplified galaxy +morphology assumptions within the simulation. This underscores the practical +limitations of our approach while simultaneously highlighting potential avenues +for future improvements. + +
+
+ comment: The paper is accepted by the AJ. The complete code could be + downloaded with DOI of: 10.12149/101393. Comments are welcome +
+
+
+
+
+ + ☆ A noisy elephant in the room: Is your out-of-distribution detector + robust to label noise? CVPR 2024 + + +
+ The ability to detect unfamiliar or unexpected images is essential for safe +deployment of computer vision systems. In the context of classification, the +task of detecting images outside of a model's training domain is known as +out-of-distribution (OOD) detection. While there has been a growing research +interest in developing post-hoc OOD detection methods, there has been +comparably little discussion around how these methods perform when the +underlying classifier is not trained on a clean, carefully curated dataset. In +this work, we take a closer look at 20 state-of-the-art OOD detection methods +in the (more realistic) scenario where the labels used to train the underlying +classifier are unreliable (e.g. crowd-sourced or web-scraped labels). Extensive +experiments across different datasets, noise types & levels, architectures and +checkpointing strategies provide insights into the effect of class label noise +on OOD detection, and show that poor separation between incorrectly classified +ID samples vs. OOD samples is an overlooked yet important limitation of +existing methods. Code: https://github.com/glhr/ood-labelnoise + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Guidelines for Cerebrovascular Segmentation: Managing Imperfect + Annotations in the context of Semi-Supervised Learning + + +
+ Segmentation in medical imaging is an essential and often preliminary task in +the image processing chain, driving numerous efforts towards the design of +robust segmentation algorithms. Supervised learning methods achieve excellent +performances when fed with a sufficient amount of labeled data. However, such +labels are typically highly time-consuming, error-prone and expensive to +produce. Alternatively, semi-supervised learning approaches leverage both +labeled and unlabeled data, and are very useful when only a small fraction of +the dataset is labeled. They are particularly useful for cerebrovascular +segmentation, given that labeling a single volume requires several hours for an +expert. In addition to the challenge posed by insufficient annotations, there +are concerns regarding annotation consistency. The task of annotating the +cerebrovascular tree is inherently ambiguous. Due to the discrete nature of +images, the borders and extremities of vessels are often unclear. Consequently, +annotations heavily rely on the expert subjectivity and on the underlying +clinical objective. These discrepancies significantly increase the complexity +of the segmentation task for the model and consequently impair the results. +Consequently, it becomes imperative to provide clinicians with precise +guidelines to improve the annotation process and construct more uniform +datasets. In this article, we investigate the data dependency of deep learning +methods within the context of imperfect data and semi-supervised learning, for +cerebrovascular segmentation. Specifically, this study compares various +state-of-the-art semi-supervised methods based on unsupervised regularization +and evaluates their performance in diverse quantity and quality data scenarios. +Based on these experiments, we provide guidelines for the annotation and +training of cerebrovascular segmentation models. + +
+
+
+
+
+ + ☆ GEARS: Local Geometry-aware Hand-object Interaction Synthesis + + +
+ Generating realistic hand motion sequences in interaction with objects has +gained increasing attention with the growing interest in digital humans. Prior +work has illustrated the effectiveness of employing occupancy-based or +distance-based virtual sensors to extract hand-object interaction features. +Nonetheless, these methods show limited generalizability across object +categories, shapes and sizes. We hypothesize that this is due to two reasons: +1) the limited expressiveness of employed virtual sensors, and 2) scarcity of +available training data. To tackle this challenge, we introduce a novel +joint-centered sensor designed to reason about local object geometry near +potential interaction regions. The sensor queries for object surface points in +the neighbourhood of each hand joint. As an important step towards mitigating +the learning complexity, we transform the points from global frame to hand +template frame and use a shared module to process sensor features of each +individual joint. This is followed by a spatio-temporal transformer network +aimed at capturing correlation among the joints in different dimensions. +Moreover, we devise simple heuristic rules to augment the limited training +sequences with vast static hand grasping samples. This leads to a broader +spectrum of grasping types observed during training, in turn enhancing our +model's generalization capability. We evaluate on two public datasets, GRAB and +InterCap, where our method shows superiority over baselines both quantitatively +and perceptually. + +
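+ A minimal sketch of the joint-centered querying idea: gather the object surface
+ points nearest each hand joint and express them in joint-relative coordinates.
+ The neighbourhood size is an assumption, and the hand-template-frame transform
+ and the spatio-temporal transformer are omitted.
+
+     import torch
+
+     def joint_centered_queries(obj_points, joints, k=16):
+         """For each hand joint (J, 3), gather its k nearest object surface
+         points (N, 3) and return them relative to that joint: (J, k, 3)."""
+         dists = torch.cdist(joints, obj_points)          # (J, N)
+         idx = dists.topk(k, largest=False).indices       # (J, k)
+         neighbours = obj_points[idx]                     # (J, k, 3)
+         return neighbours - joints.unsqueeze(1)          # joint-relative coords
+
+     feats = joint_centered_queries(torch.randn(2048, 3), torch.randn(21, 3))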
+
+
+
+
+ + ☆ T-VSL: Text-Guided Visual Sound Source Localization in Mixtures CVPR-2024 + + +
+ Visual sound source localization poses a significant challenge in identifying +the semantic region of each sounding source within a video. Existing +self-supervised and weakly supervised source localization methods struggle to +accurately distinguish the semantic regions of each sounding object, +particularly in multi-source mixtures. These methods often rely on audio-visual +correspondence as guidance, which can lead to substantial performance drops in +complex multi-source localization scenarios. The lack of access to individual +source sounds in multi-source mixtures during training exacerbates the +difficulty of learning effective audio-visual correspondence for localization. +To address this limitation, in this paper, we propose incorporating the text +modality as an intermediate feature guide using tri-modal joint embedding +models (e.g., AudioCLIP) to disentangle the semantic audio-visual source +correspondence in multi-source mixtures. Our framework, dubbed T-VSL, begins by +predicting the class of sounding entities in mixtures. Subsequently, the +textual representation of each sounding source is employed as guidance to +disentangle fine-grained audio-visual source correspondence from multi-source +mixtures, leveraging the tri-modal AudioCLIP embedding. This approach enables +our framework to handle a flexible number of sources and exhibits promising +zero-shot transferability to unseen classes during test time. Extensive +experiments conducted on the MUSIC, VGGSound, and VGGSound-Instruments datasets +demonstrate significant performance improvements over state-of-the-art methods. + +
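+ The disentangling step relies on similarities between class-text embeddings and
+ visual features in a shared tri-modal (AudioCLIP-like) space. A minimal sketch
+ of turning such similarities into a per-class localization heatmap is below,
+ with random tensors standing in for the real embeddings.
+
+     import torch
+     import torch.nn.functional as F
+
+     def localization_heatmap(text_emb, visual_patches):
+         """Cosine similarity between one class-text embedding (D,) and
+         patch-level visual embeddings (H, W, D), rescaled to [0, 1]."""
+         t = F.normalize(text_emb, dim=-1)
+         v = F.normalize(visual_patches, dim=-1)
+         sim = torch.einsum('d,hwd->hw', t, v)
+         return (sim - sim.min()) / (sim.max() - sim.min() + 1e-8)
+
+     heat = localization_heatmap(torch.randn(512), torch.randn(14, 14, 512))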
+
+ comment: Tech report. Accepted in CVPR-2024 +
+
+
+
+
+ + ☆ Exploring Latent Pathways: Enhancing the Interpretability of Autonomous + Driving with a Variational Autoencoder IROS 2024 + + +
+ Autonomous driving presents a complex challenge, which is usually addressed +with artificial intelligence models that are end-to-end or modular in nature. +Within the landscape of modular approaches, a bio-inspired neural circuit +policy model has emerged as an innovative control module, offering a compact +and inherently interpretable system to infer a steering wheel command from +abstract visual features. Here, we take a leap forward by integrating a +variational autoencoder with the neural circuit policy controller, forming a +solution that directly generates steering commands from input camera images. By +substituting the traditional convolutional neural network approach to feature +extraction with a variational autoencoder, we enhance the system's +interpretability, enabling a more transparent and understandable +decision-making process. + In addition to the architectural shift toward a variational autoencoder, this +study introduces the automatic latent perturbation tool, a novel contribution +designed to probe and elucidate the latent features within the variational +autoencoder. The automatic latent perturbation tool automates the +interpretability process, offering granular insights into how specific latent +variables influence the overall model's behavior. Through a series of numerical +experiments, we demonstrate the interpretative power of the variational +autoencoder-neural circuit policy model and the utility of the automatic latent +perturbation tool in making the inner workings of autonomous driving systems +more transparent. + +
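+ A rough sketch of what an automatic latent-perturbation probe can look like:
+ nudge each latent dimension in turn and record how much the predicted steering
+ command changes. The perturbation size and the linear stand-in for the neural
+ circuit policy head are assumptions, not the paper's tool.
+
+     import torch
+
+     @torch.no_grad()
+     def latent_sensitivity(controller, z, delta=0.5):
+         """Perturb each latent dimension of z (D,) by +/- delta and record the
+         resulting change in the predicted steering command."""
+         effects = []
+         for i in range(z.numel()):
+             z_plus = z.clone();  z_plus[i] += delta
+             z_minus = z.clone(); z_minus[i] -= delta
+             effects.append((controller(z_plus) - controller(z_minus)).abs().item())
+         return effects
+
+     head = torch.nn.Linear(32, 1)            # stand-in for the NCP controller
+     sens = latent_sensitivity(lambda z: head(z).squeeze(), torch.randn(32))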
+
+ comment: Submitted to 2024 IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS 2024) +
+
+
+
+
+ + ☆ Global Mapping of Exposure and Physical Vulnerability Dynamics in Least + Developed Countries using Remote Sensing and Machine Learning ICLR + + +
+ As the world marked the midterm of the Sendai Framework for Disaster Risk +Reduction 2015-2030, many countries are still struggling to monitor their +climate and disaster risk because of the expensive large-scale survey of the +distribution of exposure and physical vulnerability and, hence, are not on +track in reducing risks amidst the intensifying effects of climate change. We +present an ongoing effort in mapping this vital information using machine +learning and time-series remote sensing from publicly available Sentinel-1 SAR +GRD and Sentinel-2 Harmonized MSI. We introduce the development of +"OpenSendaiBench" consisting of 47 countries wherein most are least developed +(LDCs), trained ResNet-50 deep learning models, and demonstrated the region of +Dhaka, Bangladesh by mapping the distribution of its informal constructions. As +a pioneering effort in auditing global disaster risk over time, this paper aims +to advance the area of large-scale risk quantification in informing our +collective long-term efforts in reducing climate and disaster risk. + +
+
+ comment: This is the camera-ready paper for the accepted poster at the 2nd + Machine Learning for Remote Sensing Workshop, 12th International Conference + on Learning Representations (ICLR) in Vienna, Austria, on the 11th of May + 2024. Access the poster here: https://zenodo.org/doi/10.5281/zenodo.10903886 + Watch the video version of our poster here: https://youtu.be/N6ithJeCF4M +
+
+
+
+
+ + ☆ Unleash the Potential of CLIP for Video Highlight Detection + + +
+ Multimodal and large language models (LLMs) have revolutionized the +utilization of open-world knowledge, unlocking novel potentials across various +tasks and applications. Among these domains, the video domain has notably +benefited from their capabilities. In this paper, we present Highlight-CLIP +(HL-CLIP), a method designed to excel in the video highlight detection task by +leveraging the pre-trained knowledge embedded in multimodal models. By simply +fine-tuning the multimodal encoder in combination with our innovative saliency +pooling technique, we have achieved the state-of-the-art performance in the +highlight detection task, the QVHighlight Benchmark, to the best of our +knowledge. + +
+
+
+
+
+ + ☆ Atom-Level Optical Chemical Structure Recognition with Limited + Supervision + + +
+ Identifying the chemical structure from a graphical representation, or image, +of a molecule is a challenging pattern recognition task that would greatly +benefit drug development. Yet, existing methods for chemical structure +recognition do not typically generalize well, and show diminished effectiveness +when confronted with domains where data is sparse, or costly to generate, such +as hand-drawn molecule images. To address this limitation, we propose a new +chemical structure recognition tool that delivers state-of-the-art performance +and can adapt to new domains with a limited number of data samples and +supervision. Unlike previous approaches, our method provides atom-level +localization, and can therefore segment the image into the different atoms and +bonds. Our model is the first model to perform OCSR with atom-level entity +detection with only SMILES supervision. Through rigorous and extensive +benchmarking, we demonstrate the preeminence of our chemical structure +recognition approach in terms of data efficiency, accuracy, and atom-level +entity prediction. + +
+
+ comment: Accepted in IEEE/CVF Conference on Computer Vision and Pattern + Recognition 2024 +
+
+
+
+
+ + ☆ Generalizing 6-DoF Grasp Detection via Domain Prior Knowledge CVPR 2024 + + +
+ We focus on the generalization ability of the 6-DoF grasp detection method in +this paper. While learning-based grasp detection methods can predict grasp +poses for unseen objects using the grasp distribution learned from the training +set, they often exhibit a significant performance drop when encountering +objects with diverse shapes and structures. To enhance the grasp detection +methods' generalization ability, we incorporate domain prior knowledge of +robotic grasping, enabling better adaptation to objects with significant shape +and structure differences. More specifically, we employ the physical constraint +regularization during the training phase to guide the model towards predicting +grasps that comply with the physical rule on grasping. For the unstable grasp +poses predicted on novel objects, we design a contact-score joint optimization +using the projection contact map to refine these poses in cluttered scenarios. +Extensive experiments conducted on the GraspNet-1billion benchmark demonstrate +a substantial performance gain on the novel object set and the real-world +grasping experiments also demonstrate the effectiveness of our generalizing +6-DoF grasp detection method. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Disentangled Pre-training for Human-Object Interaction Detection CVPR2024 + + +
+ Detecting human-object interaction (HOI) has long been limited by the amount +of supervised data available. Recent approaches address this issue by +pre-training according to pseudo-labels, which align object regions with HOI +triplets parsed from image captions. However, pseudo-labeling is tricky and +noisy, making HOI pre-training a complex process. Therefore, we propose an +efficient disentangled pre-training method for HOI detection (DP-HOI) to +address this problem. First, DP-HOI utilizes object detection and action +recognition datasets to pre-train the detection and interaction decoder layers, +respectively. Then, we arrange these decoder layers so that the pre-training +architecture is consistent with the downstream HOI detection task. This +facilitates efficient knowledge transfer. Specifically, the detection decoder +identifies reliable human instances in each action recognition dataset image, +generates one corresponding query, and feeds it into the interaction decoder +for verb classification. Next, we combine the human instance verb predictions +in the same image and impose image-level supervision. The DP-HOI structure can +be easily adapted to the HOI detection task, enabling effective model parameter +initialization. Therefore, it significantly enhances the performance of +existing HOI detection models on a broad range of rare categories. The code and +pre-trained weight are available at https://github.com/xingaoli/DP-HOI. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Contextual Embedding Learning to Enhance 2D Networks for Volumetric + Image Segmentation + + +
+ The segmentation of organs in volumetric medical images plays an important +role in computer-aided diagnosis and treatment/surgery planning. Conventional +2D convolutional neural networks (CNNs) can hardly exploit the spatial +correlation of volumetric data. Current 3D CNNs have the advantage to extract +more powerful volumetric representations but they usually suffer from occupying +excessive memory and computation nevertheless. In this study we aim to enhance +the 2D networks with contextual information for better volumetric image +segmentation. Accordingly, we propose a contextual embedding learning approach +to facilitate 2D CNNs capturing spatial information properly. Our approach +leverages the learned embedding and the slice-wisely neighboring matching as a +soft cue to guide the network. In such a way, the contextual information can be +transferred slice-by-slice thus boosting the volumetric representation of the +network. Experiments on challenging prostate MRI dataset (PROMISE12) and +abdominal CT dataset (CHAOS) show that our contextual embedding learning can +effectively leverage the inter-slice context and improve segmentation +performance. The proposed approach is a plug-and-play, and memory-efficient +solution to enhance the 2D networks for volumetric segmentation. The code will +be publicly available. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ AddSR: Accelerating Diffusion-based Blind Super-Resolution with + Adversarial Diffusion Distillation + + +
+ Blind super-resolution methods based on stable diffusion showcase formidable +generative capabilities in reconstructing clear high-resolution images with +intricate details from low-resolution inputs. However, their practical +applicability is often hampered by poor efficiency, stemming from the +requirement of thousands or hundreds of sampling steps. Inspired by the +efficient text-to-image approach adversarial diffusion distillation (ADD), we +design AddSR to address this issue by incorporating the ideas of both +distillation and ControlNet. Specifically, we first propose a prediction-based +self-refinement strategy to provide high-frequency information in the student +model output with marginal additional time cost. Furthermore, we refine the +training process by employing HR images, rather than LR images, to regulate the +teacher model, providing a more robust constraint for distillation. Second, we +introduce a timestep-adapting loss to address the perception-distortion +imbalance problem introduced by ADD. Extensive experiments demonstrate our +AddSR generates better restoration results, while achieving faster speed than +previous SD-based state-of-the-art models (e.g., 7x faster than SeeSR). + +
+
+
+
+
+ + ☆ Conjugate-Gradient-like Based Adaptive Moment Estimation Optimization + Algorithm for Deep Learning + + +
+ Training deep neural networks is a challenging task. In order to speed up +training and enhance the performance of deep neural networks, we rectify the +vanilla conjugate gradient as conjugate-gradient-like and incorporate it into +the generic Adam, and thus propose a new optimization algorithm named +CG-like-Adam for deep learning. Specifically, both the first-order and the +second-order moment estimation of generic Adam are replaced by the +conjugate-gradient-like. Convergence analysis handles the cases where the +exponential moving average coefficient of the first-order moment estimation is +constant and the first-order moment estimation is unbiased. Numerical +experiments show the superiority of the proposed algorithm based on the +CIFAR10/100 dataset. + +
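+ The abstract describes replacing Adam's moment estimates with a
+ conjugate-gradient-like direction; the sketch below shows one plausible update
+ of that flavour for a single parameter tensor. The Fletcher-Reeves-style beta
+ and the bias-correction details are assumptions, not the paper's exact
+ CG-like-Adam algorithm.
+
+     import torch
+
+     @torch.no_grad()
+     def cg_like_adam_step(p, grad, state, lr=1e-3, beta2=0.999, eps=1e-8):
+         """One update where the first moment is a CG-like direction
+         d_t = g_t + beta_t * d_{t-1} instead of an exponential average."""
+         d_prev = state.get('d', torch.zeros_like(grad))
+         g_prev = state.get('g', None)
+         v = state.get('v', torch.zeros_like(grad))
+         t = state.get('t', 0) + 1
+         beta_t = 0.0 if g_prev is None else (
+             grad.pow(2).sum() / (g_prev.pow(2).sum() + eps)).item()
+         d = grad + beta_t * d_prev                   # conjugate-gradient-like
+         v = beta2 * v + (1 - beta2) * d.pow(2)       # second moment of d
+         v_hat = v / (1 - beta2 ** t)
+         p -= lr * d / (v_hat.sqrt() + eps)
+         state.update(d=d, g=grad.clone(), v=v, t=t)
+
+     w, st = torch.randn(5), {}
+     cg_like_adam_step(w, torch.randn(5), st)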
+
+ comment: 32 pages, 13 figures +
+
+
+
+
+ + ☆ Upsample Guidance: Scale Up Diffusion Models without Training + + +
+ Diffusion models have demonstrated superior performance across various +generative tasks including images, videos, and audio. However, they encounter +difficulties in directly generating high-resolution samples. Previously +proposed solutions to this issue involve modifying the architecture, further +training, or partitioning the sampling process into multiple stages. These +methods have the limitation of not being able to directly utilize pre-trained +models as-is, requiring additional work. In this paper, we introduce upsample +guidance, a technique that adapts pretrained diffusion model (e.g., $512^2$) to +generate higher-resolution images (e.g., $1536^2$) by adding only a single term +in the sampling process. Remarkably, this technique does not necessitate any +additional training or relying on external models. We demonstrate that upsample +guidance can be applied to various models, such as pixel-space, latent space, +and video diffusion models. We also observed that the proper selection of +guidance scale can improve image quality, fidelity, and prompt alignment. + +
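+ A schematic of what "adding only a single term in the sampling process" can
+ look like: combine the usual noise prediction with one extra term computed from
+ a downsampled view of the current sample. The functional form and the guidance
+ scale below are illustrative assumptions, not the paper's exact term.
+
+     import torch
+     import torch.nn.functional as F
+
+     def guided_eps(model, x_t, t, scale=0.3):
+         """Noise prediction plus one extra upsample-consistency term."""
+         eps = model(x_t, t)
+         x_small = F.interpolate(x_t, scale_factor=0.5, mode='bilinear')
+         eps_small = F.interpolate(model(x_small, t), scale_factor=2.0,
+                                   mode='bilinear')
+         return eps + scale * (eps_small - eps)
+
+     toy_denoiser = lambda x, t: torch.randn_like(x)   # stand-in diffusion model
+     out = guided_eps(toy_denoiser, torch.randn(1, 3, 64, 64), t=10)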
+
+ comment: 15 pages, 15 Figures +
+
+
+
+
+ + ☆ Samba: Semantic Segmentation of Remotely Sensed Images with State Space + Model + + +
+ High-resolution remotely sensed images pose a challenge for commonly used
+semantic segmentation methods such as Convolutional Neural Network (CNN) and
+Vision Transformer (ViT). CNN-based methods struggle with handling such
+high-resolution images due to their limited receptive field, while ViT faces
+challenges in handling long sequences. Inspired by Mamba, which adopts a State
+Space Model (SSM) to efficiently capture global semantic information, we
+propose a semantic segmentation framework for high-resolution remotely sensed
+images, named Samba. Samba utilizes an encoder-decoder architecture, with Samba
+blocks serving as the encoder for efficient multi-level semantic information
+extraction, and UperNet functioning as the decoder. We evaluate Samba on the
+LoveDA dataset, comparing its performance against top-performing CNN and ViT
+methods. The results reveal that Samba achieves unparalleled performance on
+LoveDA, demonstrating that the proposed Samba is an effective application of
+the SSM in semantic segmentation of remotely sensed images and setting a new
+performance benchmark for Mamba-based techniques in this specific application.
+The source code and baseline implementations are available at
+https://github.com/zhuqinfeng1999/Samba.
+
+
+
+
+
+ + ☆ Boosting Visual Recognition for Autonomous Driving in Real-world + Degradations with Deep Channel Prior + + +
+ The environmental perception of autonomous vehicles in normal conditions have +achieved considerable success in the past decade. However, various unfavourable +conditions such as fog, low-light, and motion blur will degrade image quality +and pose tremendous threats to the safety of autonomous driving. That is, when +applied to degraded images, state-of-the-art visual models often suffer +performance decline due to the feature content loss and artifact interference +caused by statistical and structural properties disruption of captured images. +To address this problem, this work proposes a novel Deep Channel Prior (DCP) +for degraded visual recognition. Specifically, we observe that, in the deep +representation space of pre-trained models, the channel correlations of +degraded features with the same degradation type have uniform distribution even +if they have different content and semantics, which can facilitate the mapping +relationship learning between degraded and clear representations in +high-sparsity feature space. Based on this, a novel plug-and-play Unsupervised +Feature Enhancement Module (UFEM) is proposed to achieve unsupervised feature +correction, where the multi-adversarial mechanism is introduced in the first +stage of UFEM to achieve the latent content restoration and artifact removal in +high-sparsity feature space. Then, the generated features are transferred to +the second stage for global correlation modulation under the guidance of DCP to +obtain high-quality and recognition-friendly features. Evaluations of three +tasks and eight benchmark datasets demonstrate that our proposed method can +comprehensively improve the performance of pre-trained models in real +degradation conditions. The source code is available at +https://github.com/liyuhang166/Deep_Channel_Prior + +
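+ The Deep Channel Prior observation is phrased in terms of channel correlations
+ of deep features; a minimal way to compute that statistic for one feature map
+ is sketched below (the unsupervised feature enhancement module itself is not
+ reproduced here).
+
+     import torch
+
+     def channel_correlation(feat):
+         """Channel-by-channel correlation matrix of a (C, H, W) feature map."""
+         c = feat.shape[0]
+         f = feat.reshape(c, -1)
+         f = f - f.mean(dim=1, keepdim=True)
+         f = f / (f.norm(dim=1, keepdim=True) + 1e-8)
+         return f @ f.t()                              # (C, C) correlations
+
+     corr = channel_correlation(torch.randn(256, 28, 28))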
+
+
+
+
+ + ☆ MotionChain: Conversational Motion Controllers via Multimodal Prompts + + +
+ Recent advancements in language models have demonstrated their adeptness in +conducting multi-turn dialogues and retaining conversational context. However, +this proficiency remains largely unexplored in other multimodal generative +models, particularly in human motion models. By integrating multi-turn +conversations in controlling continuous virtual human movements, generative +human motion models can achieve an intuitive and step-by-step process of human +task execution for humanoid robotics, game agents, or other embodied systems. +In this work, we present MotionChain, a conversational human motion controller +to generate continuous and long-term human motion through multimodal prompts. +Specifically, MotionChain consists of multi-modal tokenizers that transform +various data types such as text, image, and motion, into discrete tokens, +coupled with a Vision-Motion-aware Language model. By leveraging large-scale +language, vision-language, and vision-motion data to assist motion-related +generation tasks, MotionChain thus comprehends each instruction in multi-turn +conversation and generates human motions followed by these prompts. Extensive +experiments validate the efficacy of MotionChain, demonstrating +state-of-the-art performance in conversational motion generation, as well as +more intuitive manners of controlling and interacting with virtual humans. + +
+
+ comment: 14 pages, 4 figures +
+
+
+
+
+ + ☆ Task Integration Distillation for Object Detectors + + +
+ Knowledge distillation is a widely adopted technique for model lightening. +However, the performance of most knowledge distillation methods in the domain +of object detection is not satisfactory. Typically, knowledge distillation +approaches consider only the classification task among the two sub-tasks of an +object detector, largely overlooking the regression task. This oversight leads +to a partial understanding of the object detector's comprehensive task, +resulting in skewed estimations and potentially adverse effects. Therefore, we +propose a knowledge distillation method that addresses both the classification +and regression tasks, incorporating a task significance strategy. By evaluating +the importance of features based on the output of the detector's two sub-tasks, +our approach ensures a balanced consideration of both classification and +regression tasks in object detection. Drawing inspiration from real-world +teaching processes and the definition of learning condition, we introduce a +method that focuses on both key and weak areas. By assessing the value of +features for knowledge distillation based on their importance differences, we +accurately capture the current model's learning situation. This method +effectively prevents the issue of biased predictions about the model's learning +reality caused by an incomplete utilization of the detector's outputs. + +
+
+
+
+
+ + ☆ Beyond Image Super-Resolution for Image Recognition with Task-Driven + Perceptual Loss CVPR 2024 + + +
+ In real-world scenarios, image recognition tasks, such as semantic +segmentation and object detection, often pose greater challenges due to the +lack of information available within low-resolution (LR) content. Image +super-resolution (SR) is one of the promising solutions for addressing the +challenges. However, due to the ill-posed property of SR, it is challenging for +typical SR methods to restore task-relevant high-frequency contents, which may +dilute the advantage of utilizing the SR method. Therefore, in this paper, we +propose Super-Resolution for Image Recognition (SR4IR) that effectively guides +the generation of SR images beneficial to achieving satisfactory image +recognition performance when processing LR images. The critical component of +our SR4IR is the task-driven perceptual (TDP) loss that enables the SR network +to acquire task-specific knowledge from a network tailored for a specific task. +Moreover, we propose a cross-quality patch mix and an alternate training +framework that significantly enhances the efficacy of the TDP loss by +addressing potential problems when employing the TDP loss. Through extensive +experiments, we demonstrate that our SR4IR achieves outstanding task +performance by generating SR images useful for a specific image recognition +task, including semantic segmentation, object detection, and image +classification. The implementation code is available at +https://github.com/JaehaKim97/SR4IR. + +
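+ A minimal sketch of a task-driven perceptual loss of the kind described above:
+ compare the SR output and the HR target inside a frozen task network's feature
+ space. The tiny convolutional stand-in and the L1 distance are assumptions.
+
+     import torch
+     import torch.nn as nn
+
+     def task_driven_perceptual_loss(task_features, sr_img, hr_img):
+         """Match SR and HR features extracted by a frozen task network."""
+         with torch.no_grad():
+             target = task_features(hr_img)
+         return nn.functional.l1_loss(task_features(sr_img), target)
+
+     feat = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU())  # stand-in
+     loss = task_driven_perceptual_loss(feat, torch.randn(2, 3, 64, 64),
+                                        torch.randn(2, 3, 64, 64))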
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ RefQSR: Reference-based Quantization for Image Super-Resolution Networks + + +
+ Single image super-resolution (SISR) aims to reconstruct a high-resolution +image from its low-resolution observation. Recent deep learning-based SISR +models show high performance at the expense of increased computational costs, +limiting their use in resource-constrained environments. As a promising +solution for computationally efficient network design, network quantization has +been extensively studied. However, existing quantization methods developed for +SISR have yet to effectively exploit image self-similarity, which is a new +direction for exploration in this study. We introduce a novel method called +reference-based quantization for image super-resolution (RefQSR) that applies +high-bit quantization to several representative patches and uses them as +references for low-bit quantization of the rest of the patches in an image. To +this end, we design dedicated patch clustering and reference-based quantization +modules and integrate them into existing SISR network quantization methods. The +experimental results demonstrate the effectiveness of RefQSR on various SISR +networks and quantization methods. + +
+
+ comment: Accepted by IEEE Transactions on Image Processing (TIP) +
+
+
+
+
+ + ☆ JRDB-PanoTrack: An Open-world Panoptic Segmentation and Tracking Robotic + Dataset in Crowded Human Environments CVPR 2024 + + +
+ Autonomous robot systems have attracted increasing research attention in +recent years, where environment understanding is a crucial step for robot +navigation, human-robot interaction, and decision. Real-world robot systems +usually collect visual data from multiple sensors and are required to recognize +numerous objects and their movements in complex human-crowded settings. +Traditional benchmarks, with their reliance on single sensors and limited +object classes and scenarios, fail to provide the comprehensive environmental +understanding robots need for accurate navigation, interaction, and +decision-making. As an extension of JRDB dataset, we unveil JRDB-PanoTrack, a +novel open-world panoptic segmentation and tracking benchmark, towards more +comprehensive environmental perception. JRDB-PanoTrack includes (1) various +data involving indoor and outdoor crowded scenes, as well as comprehensive 2D +and 3D synchronized data modalities; (2) high-quality 2D spatial panoptic +segmentation and temporal tracking annotations, with additional 3D label +projections for further spatial understanding; (3) diverse object classes for +closed- and open-world recognition benchmarks, with OSPA-based metrics for +evaluation. Extensive evaluation of leading methods shows significant +challenges posed by our dataset. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ PRISM-TopoMap: Online Topological Mapping with Place Recognition and + Scan Matching IROS 2024 + + +
+ Mapping is one of the crucial tasks enabling autonomous navigation of a +mobile robot. Conventional mapping methods output dense geometric map +representation, e.g. an occupancy grid, which is not trivial to keep consistent +for the prolonged runs covering large environments. Meanwhile, capturing the +topological structure of the workspace enables fast path planning, is less +prone to odometry error accumulation and does not consume much memory. +Following this idea, this paper introduces PRISM-TopoMap -- a topological +mapping method that maintains a graph of locally aligned locations not relying +on global metric coordinates. The proposed method involves learnable multimodal +place recognition paired with the scan matching pipeline for localization and +loop closure in the graph of locations. The latter is updated online and the +robot is localized in a proper node at each time step. We conduct a broad +experimental evaluation of the suggested approach in a range of photo-realistic +environments and on a real robot (wheeled differential driven Husky robot), and +compare it to state of the art. The results of the empirical evaluation confirm +that PRISM-Topomap consistently outperforms competitors across several measures +of mapping and navigation efficiency and performs well on a real robot. The +code of PRISM-Topomap is open-sourced and available at +https://github.com/kirillMouraviev/prism-topomap. + +
+
+ comment: This is a pre-print of the paper submitted to an IROS 2024 conference +
+
+
+
+
+ + ☆ A Universal Knowledge Embedded Contrastive Learning Framework for + Hyperspectral Image Classification + + +
+ Hyperspectral image (HSI) classification techniques have been intensively
+studied and a variety of models have been developed. However, these HSI
+classification models are confined to pocket models and unrealistic ways of
+dataset partitioning. The former limits the generalization performance of the
+model, and the latter leads to inflated model evaluation metrics, which
+results in plummeting model performance in the real world. Therefore, we
+propose a universal knowledge embedded contrastive learning framework (KnowCL)
+for supervised, unsupervised, and semi-supervised HSI classification, which
+largely closes the gap of HSI classification models between pocket models and
+standard vision backbones. We present a new HSI processing pipeline in
+conjunction with a range of data transformation and augmentation techniques
+that provide diverse data representations and realistic data partitioning. The
+proposed framework based on this pipeline is compatible with all kinds of
+backbones and can fully exploit labeled and unlabeled samples with expected
+training time. Furthermore, we design a new loss function, which can adaptively
+fuse the supervised loss and unsupervised loss, enhancing the learning
+performance. This new classification paradigm shows great potential for HSI
+classification technology. The code can be accessed at
+https://github.com/quanweiliu/KnowCL.
+
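+ The adaptive fusion of supervised and unsupervised losses could, for instance,
+ use learnable homoscedastic-uncertainty weights; the sketch below shows that
+ generic scheme, which is an assumption and not necessarily KnowCL's exact loss.
+
+     import torch
+
+     class AdaptiveLossFusion(torch.nn.Module):
+         """Fuse two losses with learnable log-variance weights."""
+         def __init__(self):
+             super().__init__()
+             self.log_s = torch.nn.Parameter(torch.zeros(2))
+         def forward(self, sup_loss, unsup_loss):
+             losses = torch.stack([sup_loss, unsup_loss])
+             return (losses * torch.exp(-self.log_s) + self.log_s).sum()
+
+     fuse = AdaptiveLossFusion()
+     total = fuse(torch.tensor(0.7), torch.tensor(1.3))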
+
+
+
+
+ + ☆ Release of Pre-Trained Models for the Japanese Language LREC + + +
+ AI democratization aims to create a world in which the average person can +utilize AI techniques. To achieve this goal, numerous research institutes have +attempted to make their results accessible to the public. In particular, large +pre-trained models trained on large-scale data have shown unprecedented +potential, and their release has had a significant impact. However, most of the +released models specialize in the English language, and thus, AI +democratization in non-English-speaking communities is lagging significantly. +To reduce this gap in AI access, we released Generative Pre-trained Transformer +(GPT), Contrastive Language and Image Pre-training (CLIP), Stable Diffusion, +and Hidden-unit Bidirectional Encoder Representations from Transformers +(HuBERT) pre-trained in Japanese. By providing these models, users can freely +interface with AI that aligns with Japanese cultural values and ensures the +identity of Japanese culture, thus enhancing the democratization of AI. +Additionally, experiments showed that pre-trained models specialized for +Japanese can efficiently achieve high performance in Japanese tasks. + +
+
+ comment: 9 pages, 1 figure, 5 tables, accepted for LREC-COLING 2024. Models + are publicly available at https://huggingface.co/rinna +
+
+
+
+
+ + ☆ Supporting Mitosis Detection AI Training with Inter-Observer Eye-Gaze + Consistencies + + +
+ The expansion of artificial intelligence (AI) in pathology tasks has +intensified the demand for doctors' annotations in AI development. However, +collecting high-quality annotations from doctors is costly and time-consuming, +creating a bottleneck in AI progress. This study investigates eye-tracking as a +cost-effective technology to collect doctors' behavioral data for AI training +with a focus on the pathology task of mitosis detection. One major challenge in +using eye-gaze data is the low signal-to-noise ratio, which hinders the +extraction of meaningful information. We tackled this by levering the +properties of inter-observer eye-gaze consistencies and creating eye-gaze +labels from consistent eye-fixations shared by a group of observers. Our study +involved 14 non-medical participants, from whom we collected eye-gaze data and +generated eye-gaze labels based on varying group sizes. We assessed the +efficacy of such eye-gaze labels by training Convolutional Neural Networks +(CNNs) and comparing their performance to those trained with ground truth +annotations and a heuristic-based baseline. Results indicated that CNNs trained +with our eye-gaze labels closely followed the performance of ground-truth-based +CNNs, and significantly outperformed the baseline. Although primarily focused +on mitosis, we envision that insights from this study can be generalized to +other medical imaging tasks. + +
+
+ comment: Accepted by IEEE International Conference on Healthcare Informatics + 2024 +
+
+
+
+
+ + ☆ FashionEngine: Interactive Generation and Editing of 3D Clothed Humans + + +
+ We present FashionEngine, an interactive 3D human generation and editing +system that allows us to design 3D digital humans in a way that aligns with how +humans interact with the world, such as natural languages, visual perceptions, +and hand-drawing. FashionEngine automates the 3D human production with three +key components: 1) A pre-trained 3D human diffusion model that learns to model +3D humans in a semantic UV latent space from 2D image training data, which +provides strong priors for diverse generation and editing tasks. 2) +Multimodality-UV Space encoding the texture appearance, shape topology, and +textual semantics of human clothing in a canonical UV-aligned space, which +faithfully aligns the user multimodal inputs with the implicit UV latent space +for controllable 3D human editing. The multimodality-UV space is shared across +different user inputs, such as texts, images, and sketches, which enables +various joint multimodal editing tasks. 3) Multimodality-UV Aligned Sampler +learns to sample high-quality and diverse 3D humans from the diffusion prior +for multimodal user inputs. Extensive experiments validate FashionEngine's +state-of-the-art performance for conditional generation/editing tasks. In +addition, we present an interactive user interface for our FashionEngine that +enables both conditional and unconditional generation tasks, and editing tasks +including pose/view/shape control, text-, image-, and sketch-driven 3D human +editing and 3D virtual try-on, in a unified framework. Our project page is at: +https://taohuumd.github.io/projects/FashionEngine. + +
+
+ comment: Project Page: https://taohuumd.github.io/projects/FashionEngine +
+
+
+
+
+ + ☆ AI WALKUP: A Computer-Vision Approach to Quantifying MDS-UPDRS in + Parkinson's Disease + + +
+ Parkinson's Disease (PD) is the second most common neurodegenerative +disorder. The existing assessment method for PD is usually the Movement +Disorder Society - Unified Parkinson's Disease Rating Scale (MDS-UPDRS) to +assess the severity of various types of motor symptoms and disease progression. +However, manual assessment suffers from high subjectivity, lack of consistency, +and high cost and low efficiency of manual communication. We want to use a +computer vision based solution to capture human pose images based on a camera, +reconstruct and perform motion analysis using algorithms, and extract the +features of the amount of motion through feature engineering. The proposed +approach can be deployed on different smartphones, and the video recording and +artificial intelligence analysis can be done quickly and easily through our +APP. + +
+
+ comment: Technical report for AI WALKUP, an APP winning 3rd Prize of 2022 HUST + GS AI Innovation and Design Competition +
+
+
+
+
+ + ☆ EDTalk: Efficient Disentanglement for Emotional Talking Head Synthesis + + +
+ Achieving disentangled control over multiple facial motions and accommodating +diverse input modalities greatly enhances the application and entertainment of +the talking head generation. This necessitates a deep exploration of the +decoupling space for facial features, ensuring that they a) operate +independently without mutual interference and b) can be preserved to share with +different modal input, both aspects often neglected in existing methods. To +address this gap, this paper proposes a novel Efficient Disentanglement +framework for Talking head generation (EDTalk). Our framework enables +individual manipulation of mouth shape, head pose, and emotional expression, +conditioned on video or audio inputs. Specifically, we employ three lightweight +modules to decompose the facial dynamics into three distinct latent spaces +representing mouth, pose, and expression, respectively. Each space is +characterized by a set of learnable bases whose linear combinations define +specific motions. To ensure independence and accelerate training, we enforce +orthogonality among bases and devise an efficient training strategy to allocate +motion responsibilities to each space without relying on external knowledge. +The learned bases are then stored in corresponding banks, enabling shared +visual priors with audio input. Furthermore, considering the properties of each +space, we propose an Audio-to-Motion module for audio-driven talking head +synthesis. Experiments are conducted to demonstrate the effectiveness of +EDTalk. We recommend watching the project website: +https://tanshuai0219.github.io/EDTalk/ + +
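+ Orthogonality among the learnable bases of each latent space can be encouraged
+ with a simple Gram-matrix penalty; a minimal sketch is below (bank size, basis
+ dimension, and loss weighting are assumptions, not EDTalk's training recipe).
+
+     import torch
+
+     def orthogonality_penalty(bases):
+         """Encourage a bank of learnable bases (K, D) to be orthonormal:
+         the squared Frobenius norm of B B^T - I."""
+         gram = bases @ bases.t()
+         eye = torch.eye(bases.shape[0], device=bases.device)
+         return ((gram - eye) ** 2).sum()
+
+     mouth_bases = torch.nn.Parameter(torch.randn(20, 128) * 0.1)
+     loss = orthogonality_penalty(mouth_bases)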
+
+ comment: 22 pages, 15 figures +
+
+
+
+
+ + ☆ ContrastCAD: Contrastive Learning-based Representation Learning for + Computer-Aided Design Models + + +
+ The success of Transformer-based models has encouraged many researchers to +learn CAD models using sequence-based approaches. However, learning CAD models +is still a challenge, because they can be represented as complex shapes with +long construction sequences. Furthermore, the same CAD model can be expressed +using different CAD construction sequences. We propose a novel contrastive +learning-based approach, named ContrastCAD, that effectively captures semantic +information within the construction sequences of the CAD model. ContrastCAD +generates augmented views using dropout techniques without altering the shape +of the CAD model. We also propose a new CAD data augmentation method, called a +Random Replace and Extrude (RRE) method, to enhance the learning performance of +the model when training an imbalanced training CAD dataset. Experimental +results show that the proposed RRE augmentation method significantly enhances +the learning performance of Transformer-based autoencoders, even for complex +CAD models having very long construction sequences. The proposed ContrastCAD +model is shown to be robust to permutation changes of construction sequences +and performs better representation learning by generating representation spaces +where similar CAD models are more closely clustered. Our codes are available at +https://github.com/cm8908/ContrastCAD. + +
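+ The dropout-based augmented views can be realized with two stochastic forward
+ passes of the same sequence embedding trained with InfoNCE; the sketch below
+ shows that pattern on placeholder CAD-sequence embeddings (the encoder, its
+ dimensions, and the temperature are assumptions).
+
+     import torch
+     import torch.nn as nn
+     import torch.nn.functional as F
+
+     encoder = nn.Sequential(nn.Linear(64, 128), nn.ReLU(),
+                             nn.Dropout(0.1), nn.Linear(128, 64))
+
+     def dropout_contrastive_loss(batch, temperature=0.07):
+         """Two dropout-perturbed views of each sample act as positive pairs."""
+         z1 = F.normalize(encoder(batch), dim=-1)
+         z2 = F.normalize(encoder(batch), dim=-1)   # dropout gives a second view
+         logits = z1 @ z2.t() / temperature
+         labels = torch.arange(batch.shape[0])
+         return F.cross_entropy(logits, labels)
+
+     loss = dropout_contrastive_loss(torch.randn(32, 64))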
+
+
+
+
+ + ☆ A Closer Look at Spatial-Slice Features Learning for COVID-19 Detection + + +
+ Conventional Computed Tomography (CT) imaging recognition faces two
+significant challenges: (1) There is often considerable variability in the
+resolution and size of each CT scan, necessitating strict requirements for the
+input size and adaptability of models. (2) A CT scan contains a large number of
+out-of-distribution (OOD) slices. The crucial features may only be present in
+specific spatial regions and slices of the entire CT scan. How can we
+effectively figure out where these are located? To deal with this, we introduce
+an enhanced Spatial-Slice Feature Learning (SSFL++) framework specifically
+designed for CT scans. It aims to filter out OOD data within the whole CT scan,
+enabling us to select the crucial spatial slices for analysis and reducing
+redundancy by 70% in total. Meanwhile, we propose a Kernel-Density-based slice
+Sampling (KDS) method to improve stability during the training and inference
+stages, thereby speeding up convergence and boosting performance. As a
+result, the experiments demonstrate the promising performance of our model
+using a simple EfficientNet-2D (E2D) model, even with only 1% of the training
+data. The efficacy of our approach has been validated on the COVID-19-CT-DB
+datasets provided by the DEF-AI-MIA workshop, in conjunction with CVPR 2024.
+Our source code will be made available.
+
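+ A rough sketch of kernel-density-based slice sampling: fit a 1-D Gaussian KDE
+ over a per-slice score and keep slices where the density is high. The score
+ definition, bandwidth, and sampling-without-replacement choice are assumptions,
+ not the paper's KDS procedure.
+
+     import numpy as np
+
+     def kde_slice_sampling(slice_scores, n_keep, bandwidth=0.05, seed=0):
+         """Sample slice indices with probability proportional to a Gaussian
+         KDE fitted over per-slice scores (e.g., a lung-area ratio)."""
+         s = np.asarray(slice_scores, dtype=float)
+         diff = (s[:, None] - s[None, :]) / bandwidth
+         density = np.exp(-0.5 * diff ** 2).mean(axis=1)
+         p = density / density.sum()
+         rng = np.random.default_rng(seed)
+         return np.sort(rng.choice(len(s), size=n_keep, replace=False, p=p))
+
+     kept = kde_slice_sampling(np.random.rand(300), n_keep=64)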
+
+ comment: Submitted to DEF-AI-MIA workshop. arXiv admin note: text overlap with + arXiv:2403.11230 +
+
+
+
+
+ + ☆ Learning to Control Camera Exposure via Reinforcement Learning CVPR 2024 + + +
+ Adjusting camera exposure in arbitrary lighting conditions is the first step
+to ensure the functionality of computer vision applications. Poorly adjusted
+camera exposure often leads to critical failure and performance degradation.
+Traditional camera exposure control methods require multiple convergence steps
+and time-consuming processes, making them unsuitable for dynamic lighting
+conditions. In this paper, we propose a new camera exposure control framework
+that rapidly controls camera exposure while performing real-time processing by
+exploiting deep reinforcement learning. The proposed framework consists of four
+contributions: 1) a simplified training ground to simulate the real world's
+diverse and dynamic lighting changes, 2) flickering and image attribute-aware
+reward design, along with lightweight state design for real-time processing,
+3) a static-to-dynamic lighting curriculum to gradually improve the agent's
+exposure-adjusting capability, and 4) domain randomization techniques to
+alleviate the limitation of the training ground and achieve seamless
+generalization in the wild. As a result, our proposed method rapidly reaches a
+desired exposure level within five steps with real-time processing (1 ms).
+Also, the acquired images are well-exposed and show superiority in various
+computer vision tasks, such as feature extraction and object detection.
+
+
+ comment: Accepted at CVPR 2024, *First two authors contributed equally to this + work. Project page link: https://sites.google.com/view/drl-ae +
+
+
+
+
+ + ☆ Learning Equi-angular Representations for Online Continual Learning CVPR 2024 + + +
+ Online continual learning suffers from an underfitted solution due to +insufficient training for prompt model update (e.g., single-epoch training). To +address the challenge, we propose an efficient online continual learning method +using the neural collapse phenomenon. In particular, we induce neural collapse +to form a simplex equiangular tight frame (ETF) structure in the representation +space so that the continuously learned model with a single epoch can better fit +to the streamed data by proposing preparatory data training and residual +correction in the representation space. With an extensive set of empirical +validations using CIFAR-10/100, TinyImageNet, ImageNet-200, and ImageNet-1K, we +show that our proposed method outperforms state-of-the-art methods by a +noticeable margin in various online continual learning scenarios such as +disjoint and Gaussian scheduled continuous (i.e., boundary-free) data setups. + +
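+ The fixed simplex equiangular tight frame used in neural-collapse-based methods
+ can be constructed in closed form; a minimal sketch is below (the feature
+ dimension and class count are placeholders, and the preparatory-data and
+ residual-correction parts of the method are not shown).
+
+     import torch
+
+     def simplex_etf(num_classes: int, dim: int) -> torch.Tensor:
+         """Simplex ETF classifier (dim, C): unit-norm columns with pairwise
+         cosine similarity -1/(C-1)."""
+         assert dim >= num_classes
+         u, _ = torch.linalg.qr(torch.randn(dim, num_classes))   # orthonormal U
+         center = (torch.eye(num_classes)
+                   - torch.ones(num_classes, num_classes) / num_classes)
+         return (num_classes / (num_classes - 1)) ** 0.5 * u @ center
+
+     etf = simplex_etf(10, 128)
+     cols = torch.nn.functional.normalize(etf, dim=0)
+     cosines = cols.t() @ cols            # off-diagonal entries are ~ -1/9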
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ LR-FPN: Enhancing Remote Sensing Object Detection with Location Refined + Feature Pyramid Network + + +
+ Remote sensing target detection aims to identify and locate critical targets +within remote sensing images, finding extensive applications in agriculture and +urban planning. Feature pyramid networks (FPNs) are commonly used to extract +multi-scale features. However, existing FPNs often overlook extracting +low-level positional information and fine-grained context interaction. To +address this, we propose a novel location refined feature pyramid network +(LR-FPN) to enhance the extraction of shallow positional information and +facilitate fine-grained context interaction. The LR-FPN consists of two primary +modules: the shallow position information extraction module (SPIEM) and the +contextual interaction module (CIM). Specifically, SPIEM first maximizes the +retention of solid location information of the target by simultaneously +extracting positional and saliency information from the low-level feature map. +Subsequently, CIM injects this robust location information into different +layers of the original FPN through spatial and channel interaction, explicitly +enhancing the object area. Moreover, in spatial interaction, we introduce a +simple local and non-local interaction strategy to learn and retain the +saliency information of the object. Lastly, the LR-FPN can be readily +integrated into common object detection frameworks to improve performance +significantly. Extensive experiments on two large-scale remote sensing datasets +(i.e., DOTAV1.0 and HRSC2016) demonstrate that the proposed LR-FPN is superior +to state-of-the-art object detection approaches. Our code and models will be +publicly available. + +
+
+
+
+
+ + ☆ Spin-UP: Spin Light for Natural Light Uncalibrated Photometric Stereo CVPR2024 + + +
+ Natural Light Uncalibrated Photometric Stereo (NaUPS) relieves the strict +environment and light assumptions in classical Uncalibrated Photometric Stereo +(UPS) methods. However, due to the intrinsic ill-posedness and high-dimensional +ambiguities, addressing NaUPS is still an open question. Existing works impose +strong assumptions on the environment lights and objects' material, restricting +the effectiveness in more general scenarios. Alternatively, some methods +leverage supervised learning with intricate models while lacking +interpretability, resulting in a biased estimation. In this work, we proposed +Spin Light Uncalibrated Photometric Stereo (Spin-UP), an unsupervised method to +tackle NaUPS in various environment lights and objects. The proposed method +uses a novel setup that captures the object's images on a rotatable platform, +which mitigates NaUPS's ill-posedness by reducing unknowns and provides +reliable priors to alleviate NaUPS's ambiguities. Leveraging neural inverse +rendering and the proposed training strategies, Spin-UP recovers surface +normals, environment light, and isotropic reflectance under complex natural +light with low computational cost. Experiments have shown that Spin-UP +outperforms other supervised / unsupervised NaUPS methods and achieves +state-of-the-art performance on synthetic and real-world datasets. Codes and +data are available at https://github.com/LMozart/CVPR2024-SpinUP. + +
+
+ comment: Paper accepted by CVPR2024 +
+
+
+
+
+ + ☆ WaveDH: Wavelet Sub-bands Guided ConvNet for Efficient Image Dehazing + + +
+ The surge in interest regarding image dehazing has led to notable +advancements in deep learning-based single image dehazing approaches, +exhibiting impressive performance in recent studies. Despite these strides, +many existing methods fall short in meeting the efficiency demands of practical +applications. In this paper, we introduce WaveDH, a novel and compact ConvNet +designed to address this efficiency gap in image dehazing. Our WaveDH leverages +wavelet sub-bands for guided up-and-downsampling and frequency-aware feature +refinement. The key idea lies in utilizing wavelet decomposition to extract +low-and-high frequency components from feature levels, allowing for faster +processing while upholding high-quality reconstruction. The downsampling block +employs a novel squeeze-and-attention scheme to optimize the feature +downsampling process in a structurally compact manner through wavelet domain +learning, preserving discriminative features while discarding noise components. +In our upsampling block, we introduce a dual-upsample and fusion mechanism to +enhance high-frequency component awareness, aiding in the reconstruction of +high-frequency details. Departing from conventional dehazing methods that treat +low-and-high frequency components equally, our feature refinement block +strategically processes features with a frequency-aware approach. By employing +a coarse-to-fine methodology, it not only refines the details at frequency +levels but also significantly optimizes computational costs. The refinement is +performed in a maximum 8x downsampled feature space, striking a favorable +efficiency-vs-accuracy trade-off. Extensive experiments demonstrate that our +method, WaveDH, outperforms many state-of-the-art methods on several image +dehazing benchmarks with significantly reduced computational costs. Our code is +available at https://github.com/AwesomeHwang/WaveDH. + +
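+ Wavelet-guided downsampling starts from a sub-band decomposition; a single-level
+ 2-D Haar split into LL/LH/HL/HH half-resolution bands is sketched below (the
+ squeeze-and-attention and fusion blocks of WaveDH are not reproduced).
+
+     import torch
+
+     def haar_subbands(x):
+         """Single-level 2-D Haar transform of a (B, C, H, W) tensor into four
+         half-resolution sub-bands (LL, LH, HL, HH); H and W must be even."""
+         a = x[..., 0::2, 0::2]; b = x[..., 0::2, 1::2]
+         c = x[..., 1::2, 0::2]; d = x[..., 1::2, 1::2]
+         ll = (a + b + c + d) / 2
+         lh = (a - b + c - d) / 2
+         hl = (a + b - c - d) / 2
+         hh = (a - b - c + d) / 2
+         return ll, lh, hl, hh
+
+     ll, lh, hl, hh = haar_subbands(torch.randn(1, 16, 64, 64))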
+
+ comment: Submitted to TMM +
+
+
+
+
+ + ☆ Language Model Guided Interpretable Video Action Reasoning CVPR 2024 + + +
+ While neural networks have excelled in video action recognition tasks, their +black-box nature often obscures the understanding of their decision-making +processes. Recent approaches used inherently interpretable models to analyze +video actions in a manner akin to human reasoning. These models, however, +usually fall short in performance compared to their black-box counterparts. In +this work, we present a new framework named Language-guided Interpretable +Action Recognition framework (LaIAR). LaIAR leverages knowledge from language +models to enhance both the recognition capabilities and the interpretability of +video models. In essence, we redefine the problem of understanding video model +decisions as a task of aligning video and language models. Using the logical +reasoning captured by the language model, we steer the training of the video +model. This integrated approach not only improves the video model's +adaptability to different domains but also boosts its overall performance. +Extensive experiments on two complex video action datasets, Charades & CAD-120, +validates the improved performance and interpretability of our LaIAR framework. +The code of LaIAR is available at https://github.com/NingWang2049/LaIAR. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ TSCM: A Teacher-Student Model for Vision Place Recognition Using + Cross-Metric Knowledge Distillation + + +
+ Visual place recognition (VPR) plays a pivotal role in autonomous exploration +and navigation of mobile robots within complex outdoor environments. While +cost-effective and easily deployed, camera sensors are sensitive to lighting +and weather changes, and even slight image alterations can greatly affect VPR +efficiency and precision. Existing methods overcome this by exploiting powerful +yet large networks, leading to significant consumption of computational +resources. In this paper, we propose a high-performance teacher and lightweight +student distillation framework called TSCM. It exploits our devised +cross-metric knowledge distillation to narrow the performance gap between the +teacher and student models, maintaining superior performance while enabling +minimal computational load during deployment. We conduct comprehensive +evaluations on large-scale datasets, namely Pittsburgh30k and Pittsburgh250k. +Experimental results demonstrate the superiority of our method over baseline +models in terms of recognition accuracy and model parameter efficiency. +Moreover, our ablation studies show that the proposed knowledge distillation +technique surpasses other counterparts. The code of our method has been +released at https://github.com/nubot-nudt/TSCM. + +
+
+
+
+
+ + ☆ Learning Temporal Cues by Predicting Objects Move for Multi-camera 3D + Object Detection + + +
+ In autonomous driving and robotics, there is a growing interest in utilizing +short-term historical data to enhance multi-camera 3D object detection, +leveraging the continuous and correlated nature of input video streams. Recent +work has focused on spatially aligning BEV-based features over timesteps. +However, this is often limited as its gain does not scale well with long-term +past observations. To address this, we advocate for supervising a model to +predict objects' poses given past observations, thus explicitly guiding to +learn objects' temporal cues. To this end, we propose a model called DAP +(Detection After Prediction), consisting of a two-branch network: (i) a branch +responsible for forecasting the current objects' poses given past observations +and (ii) another branch that detects objects based on the current and past +observations. The features predicting the current objects from branch (i) is +fused into branch (ii) to transfer predictive knowledge. We conduct extensive +experiments with the large-scale nuScenes datasets, and we observe that +utilizing such predictive information significantly improves the overall +detection performance. Our model can be used plug-and-play, showing consistent +performance gain. + +
+
+
+
+
+ + ☆ Diffusion Deepfake + + +
+ Recent progress in generative AI, primarily through diffusion models,
+presents significant challenges for real-world deepfake detection. The
+increased realism in image details, diverse content, and widespread
+accessibility to the general public complicate the identification of these
+sophisticated deepfakes. Acknowledging the urgency to address the vulnerability
+of current deepfake detectors to this evolving threat, our paper introduces two
+extensive deepfake datasets generated by state-of-the-art diffusion models,
+since other datasets are less diverse and lower in quality. Our extensive
+experiments also show that our datasets are more challenging than other face
+deepfake datasets. Our strategic dataset creation not only challenges
+deepfake detectors but also sets a new benchmark for further evaluation. Our
+comprehensive evaluation reveals the struggle of existing detection methods,
+often optimized for specific image domains and manipulations, to effectively
+adapt to the intricate nature of diffusion deepfakes, limiting their practical
+utility. To address this critical issue, we investigate the impact of enhancing
+training data diversity on representative detection methods. This involves
+expanding the diversity of both manipulation techniques and image domains. Our
+findings underscore that increasing training data diversity results in improved
+generalizability. Moreover, we propose a novel momentum difficulty boosting
+strategy to tackle the additional challenge posed by training data
+heterogeneity. This strategy dynamically assigns appropriate sample weights
+based on learning difficulty, enhancing the model's adaptability to both easy
+and challenging samples. Extensive experiments on both existing and newly
+proposed benchmarks demonstrate that our model optimization approach surpasses
+prior alternatives significantly.
+
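+ One way to realize difficulty-based sample weighting with momentum is an
+ exponential moving average of per-sample loss mapped through a softmax; the
+ sketch below shows that generic pattern and is not the paper's exact boosting
+ rule.
+
+     import torch
+
+     class DifficultyWeights:
+         """Track an EMA of each sample's loss and turn it into a weight."""
+         def __init__(self, n_samples, momentum=0.9):
+             self.ema = torch.zeros(n_samples)
+             self.m = momentum
+         def update(self, idx, losses):
+             self.ema[idx] = self.m * self.ema[idx] + (1 - self.m) * losses.detach()
+         def weights(self, idx, temperature=1.0):
+             w = torch.softmax(self.ema[idx] / temperature, dim=0)
+             return w * len(idx)                 # keep the mean weight near 1
+
+     dw = DifficultyWeights(1000)
+     batch_idx = torch.randint(0, 1000, (32,))
+     dw.update(batch_idx, torch.rand(32))
+     sample_weights = dw.weights(batch_idx)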
+
+ comment: 28 pages including Supplementary material +
+
+
+
+
+ + ☆ Leveraging Digital Perceptual Technologies for Remote Perception and + Analysis of Human Biomechanical Processes: A Contactless Approach for + Workload and Joint Force Assessment + + +
+ This study presents an innovative computer vision framework designed to +analyze human movements in industrial settings, aiming to enhance biomechanical +analysis by integrating seamlessly with existing software. Through a +combination of advanced imaging and modeling techniques, the framework allows +for comprehensive scrutiny of human motion, providing valuable insights into +kinematic patterns and kinetic data. Utilizing Convolutional Neural Networks +(CNNs), Direct Linear Transform (DLT), and Long Short-Term Memory (LSTM) +networks, the methodology accurately detects key body points, reconstructs 3D +landmarks, and generates detailed 3D body meshes. Extensive evaluations across +various movements validate the framework's effectiveness, demonstrating +comparable results to traditional marker-based models with minor differences in +joint angle estimations and precise estimations of weight and height. +Statistical analyses consistently support the framework's reliability, with +joint angle estimations showing less than a 5-degree difference for hip +flexion, elbow flexion, and knee angle methods. Additionally, weight estimation +exhibits an average error of less than 6 % for weight and less than 2 % for +height when compared to ground-truth values from 10 subjects. The integration +of the Biomech-57 landmark skeleton template further enhances the robustness +and reinforces the framework's credibility. This framework shows significant +promise for meticulous biomechanical analysis in industrial contexts, +eliminating the need for cumbersome markers and extending its utility to +diverse research domains, including the study of specific exoskeleton devices' +impact on facilitating the prompt return of injured workers to their tasks. + +
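+ Of the components listed above, the Direct Linear Transform step is the most
+ self-contained: it triangulates a 3-D landmark from its 2-D detections in
+ calibrated views. A minimal sketch with two toy cameras is below (the camera
+ matrices are illustrative, not the study's setup).
+
+     import numpy as np
+
+     def triangulate_dlt(proj_mats, points_2d):
+         """Recover one 3-D point from 2-D observations and 3x4 camera matrices."""
+         rows = []
+         for P, (u, v) in zip(proj_mats, points_2d):
+             rows.append(u * P[2] - P[0])
+             rows.append(v * P[2] - P[1])
+         _, _, vt = np.linalg.svd(np.asarray(rows))
+         X = vt[-1]
+         return X[:3] / X[3]
+
+     P1 = np.hstack([np.eye(3), np.zeros((3, 1))])
+     P2 = np.hstack([np.eye(3), np.array([[-0.2], [0.0], [0.0]])])
+     X_true = np.array([0.1, -0.05, 2.0, 1.0])
+     uv = [(P @ X_true)[:2] / (P @ X_true)[2] for P in (P1, P2)]
+     print(triangulate_dlt([P1, P2], uv))        # approx. [0.1, -0.05, 2.0]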
+
+
+
+
+ + ☆ Leveraging YOLO-World and GPT-4V LMMs for Zero-Shot Person Detection and + Action Recognition in Drone Imagery + + +
+ In this article, we explore the potential of zero-shot Large Multimodal +Models (LMMs) in the domain of drone perception. We focus on person detection +and action recognition tasks and evaluate two prominent LMMs, namely YOLO-World +and GPT-4V(ision) using a publicly available dataset captured from aerial +views. Traditional deep learning approaches rely heavily on large and +high-quality training datasets. However, in certain robotic settings, acquiring +such datasets can be resource-intensive or impractical within a reasonable +timeframe. The flexibility of prompt-based Large Multimodal Models (LMMs) and +their exceptional generalization capabilities have the potential to +revolutionize robotics applications in these scenarios. Our findings suggest +that YOLO-World demonstrates good detection performance. GPT-4V struggles with +accurately classifying action classes but delivers promising results in +filtering out unwanted region proposals and in providing a general description +of the scenery. This research represents an initial step in leveraging LMMs for +drone perception and establishes a foundation for future investigations in this +area. + +
+
+ comment: 4 pages +
+
+
+
+
+ + ☆ A Linear Time and Space Local Point Cloud Geometry Encoder via + Vectorized Kernel Mixture (VecKM) + + +
+ We propose VecKM, a novel local point cloud geometry encoder that is
+descriptive, efficient, and robust to noise. VecKM leverages a unique approach
+by vectorizing a kernel mixture to represent the local point clouds. Such a
+representation is descriptive and robust to noise, which is supported by two
+theorems that confirm its ability to reconstruct and preserve the similarity of
+the local shape. Moreover, VecKM is the first successful attempt to reduce the
+computation and memory costs from $O(n^2+nKd)$ to $O(nd)$ by sacrificing a
+marginal constant factor, where $n$ is the size of the point cloud and $K$ is
+the neighborhood size. The efficiency is primarily due to VecKM's unique
+factorizable property, which eliminates the need to explicitly group points
+into neighborhoods. In the normal estimation task, VecKM demonstrates not only
+100x faster inference speed but also the strongest descriptiveness and
+robustness among existing popular encoders. In classification and segmentation
+tasks, integrating VecKM as a preprocessing module achieves consistently better
+performance than the PointNet, PointNet++, and point transformer baselines, and
+runs consistently faster by up to 10x.
+
+
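+ The factorizable property can be illustrated with a random-Fourier-feature
+view of a Gaussian kernel mixture: the kernel weight between two points factors
+into per-point complex features, so one global sum over the cloud yields every
+point's local code in O(nd) without materializing neighborhoods. The kernel
+choice, bandwidth, and normalization below are assumptions, not VecKM's exact
+construction.
+
+import numpy as np
+
+def kernel_mixture_encode(points, d=256, bandwidth=0.1, seed=0):
+    """points: (n, 3) array. Returns an (n, d) complex local-geometry code."""
+    rng = np.random.default_rng(seed)
+    omega = rng.normal(scale=1.0 / bandwidth, size=(3, d))  # random frequencies
+    phi = np.exp(1j * points @ omega)                       # (n, d), O(nd)
+    s = phi.sum(axis=0, keepdims=True)                      # single global sum, O(nd)
+    # Per-point code: kernel-weighted aggregate of all other points, obtained
+    # without n x n pairwise distances or explicit kNN grouping, since
+    # conj(phi_i) * phi_j averages to the Gaussian kernel k(p_i, p_j).
+    code = np.conj(phi) * s
+    return code / np.abs(code).max()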
+
+
+
+
+ + ☆ Two-Phase Multi-Dose-Level PET Image Reconstruction with Dose Level + Awareness + + +
+ To obtain high-quality positron emission tomography (PET) while minimizing
+radiation exposure, a range of methods have been designed to reconstruct
+standard-dose PET (SPET) from corresponding low-dose PET (LPET) images.
+However, most current methods merely learn the mapping between
+single-dose-level LPET and SPET images, but omit the dose disparity of LPET
+images in clinical scenarios. In this paper, to reconstruct high-quality SPET
+images from multi-dose-level LPET images, we design a novel two-phase
+multi-dose-level PET reconstruction algorithm with dose level awareness,
+containing a pre-training phase and a SPET prediction phase. Specifically, the
+pre-training phase is devised to explore both fine-grained discriminative
+features and effective semantic representation. The SPET prediction phase
+adopts a coarse prediction network that utilizes the pre-learned dose-level
+prior to generate a preliminary result, and a refinement network to precisely
+preserve the details. Experiments on the MICCAI 2022 Ultra-low Dose PET Imaging
+Challenge dataset have demonstrated the superiority of our method.
+
+
+
+ comment: Accepted by ISBI2024 +
+
+
+
+
+ + ☆ mChartQA: A universal benchmark for multimodal Chart Question Answer + based on Vision-Language Alignment and Reasoning + + +
+ In the fields of computer vision and natural language processing, multimodal +chart question-answering, especially involving color, structure, and textless +charts, poses significant challenges. Traditional methods, which typically +involve either direct multimodal processing or a table-to-text conversion +followed by language model analysis, have limitations in effectively handling +these complex scenarios. This paper introduces a novel multimodal chart +question-answering model, specifically designed to address these intricate +tasks. Our model integrates visual and linguistic processing, overcoming the +constraints of existing methods. We adopt a dual-phase training approach: the +initial phase focuses on aligning image and text representations, while the +subsequent phase concentrates on optimizing the model's interpretative and +analytical abilities in chart-related queries. This approach has demonstrated +superior performance on multiple public datasets, particularly in handling +color, structure, and textless chart questions, indicating its effectiveness in +complex multimodal tasks. + +
+
+
+
+
+ + ☆ Bidirectional Multi-Scale Implicit Neural Representations for Image + Deraining + + +
+ How to effectively explore multi-scale representations of rain streaks is +important for image deraining. In contrast to existing Transformer-based +methods that depend mostly on single-scale rain appearance, we develop an +end-to-end multi-scale Transformer that leverages the potentially useful +features in various scales to facilitate high-quality image reconstruction. To +better explore the common degradation representations from spatially-varying +rain streaks, we incorporate intra-scale implicit neural representations based +on pixel coordinates with the degraded inputs in a closed-loop design, enabling +the learned features to facilitate rain removal and improve the robustness of +the model in complex scenarios. To ensure richer collaborative representation +from different scales, we embed a simple yet effective inter-scale +bidirectional feedback operation into our multi-scale Transformer by performing +coarse-to-fine and fine-to-coarse information communication. Extensive +experiments demonstrate that our approach, named as NeRD-Rain, performs +favorably against the state-of-the-art ones on both synthetic and real-world +benchmark datasets. The source code and trained models are available at +https://github.com/cschenxiang/NeRD-Rain. + +
+
+ comment: Project website: https://github.com/cschenxiang/NeRD-Rain +
+
+
+
+
+ + ☆ Efficient 3D Implicit Head Avatar with Mesh-anchored Hash Table + Blendshapes CVPR2024 + + +
+ 3D head avatars built with neural implicit volumetric representations have +achieved unprecedented levels of photorealism. However, the computational cost +of these methods remains a significant barrier to their widespread adoption, +particularly in real-time applications such as virtual reality and +teleconferencing. While attempts have been made to develop fast neural +rendering approaches for static scenes, these methods cannot be simply employed +to support realistic facial expressions, such as in the case of a dynamic +facial performance. To address these challenges, we propose a novel fast 3D +neural implicit head avatar model that achieves real-time rendering while +maintaining fine-grained controllability and high rendering quality. Our key +idea lies in the introduction of local hash table blendshapes, which are +learned and attached to the vertices of an underlying face parametric model. +These per-vertex hash-tables are linearly merged with weights predicted via a +CNN, resulting in expression dependent embeddings. Our novel representation +enables efficient density and color predictions using a lightweight MLP, which +is further accelerated by a hierarchical nearest neighbor search method. +Extensive experiments show that our approach runs in real-time while achieving +comparable rendering quality to state-of-the-arts and decent results on +challenging expressions. + +
+
+ comment: In CVPR2024. Project page: + https://augmentedperception.github.io/monoavatar-plus +
+
+
+
+
+ + ☆ Semantic Augmentation in Images using Language + + +
+ Deep Learning models are incredibly data-hungry and require very large +labeled datasets for supervised learning. As a consequence, these models often +suffer from overfitting, limiting their ability to generalize to real-world +examples. Recent advancements in diffusion models have enabled the generation +of photorealistic images based on textual inputs. Leveraging the substantial +datasets used to train these diffusion models, we propose a technique to +utilize generated images to augment existing datasets. This paper explores +various strategies for effective data augmentation to improve the out-of-domain +generalization capabilities of deep learning models. + +
+
+
+
+
+ + ☆ COVID-19 Detection Based on Blood Test Parameters using Various + Artificial Intelligence Methods + + +
+ In 2019, the world faced a new challenge: COVID-19, a disease caused by the
+novel coronavirus SARS-CoV-2. The virus rapidly spread across the globe,
+leading to a high rate of mortality, which prompted health organizations to
+take measures to control its transmission. Early disease detection is crucial
+in the treatment process, and computer-based automatic detection systems have
+been developed to aid in this effort. These systems often rely on artificial
+intelligence (AI) approaches such as machine learning, neural networks, fuzzy
+systems, and deep learning to classify diseases. This study aimed to
+differentiate COVID-19 patients from others using self-categorizing classifiers
+and employing various AI methods. This study used two datasets: blood test
+samples and radiography images. The best results for the blood test samples
+obtained from San Raphael Hospital, which include two classes of individuals,
+those with COVID-19 and those with non-COVID diseases, were achieved through
+the use of an Ensemble method (a combination of a neural network and two
+machine learning methods). The results showed that this approach for COVID-19
+diagnosis is cost-effective and provides results in a shorter amount of time
+than other methods. The proposed model achieved an accuracy of 94.09% on the
+dataset used. Secondly, the radiographic images were divided into four classes:
+normal, viral pneumonia, ground glass opacity, and COVID-19 infection. These
+were used for segmentation and classification. The lung lobes were extracted
+from the images and then categorized into specific classes. We achieved an
+accuracy of 91.1% on the image dataset. Generally, this study highlights the
+potential of AI in detecting and managing COVID-19 and underscores the
+importance of continued research and development in this field.
+
+
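+ A minimal stand-in for the described Ensemble (a neural network combined with
+two machine learning methods) can be assembled with scikit-learn soft voting.
+The base models, hyperparameters, and synthetic tabular data below are
+placeholders, since the hospital blood-test data is not reproduced here.
+
+from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier, VotingClassifier
+from sklearn.model_selection import cross_val_score
+from sklearn.neural_network import MLPClassifier
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.svm import SVC
+
+# Synthetic stand-in for tabular blood-test features.
+X, y = make_classification(n_samples=600, n_features=20, random_state=0)
+
+ensemble = VotingClassifier(
+    estimators=[
+        ("mlp", make_pipeline(StandardScaler(), MLPClassifier(max_iter=1000, random_state=0))),
+        ("svm", make_pipeline(StandardScaler(), SVC(probability=True, random_state=0))),
+        ("rf", RandomForestClassifier(random_state=0)),
+    ],
+    voting="soft",
+)
+print(cross_val_score(ensemble, X, y, cv=5).mean())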
+
+
+
+
+ + ☆ GaitSTR: Gait Recognition with Sequential Two-stream Refinement + + +
+ Gait recognition aims to identify a person based on their walking sequences, +serving as a useful biometric modality as it can be observed from long +distances without requiring cooperation from the subject. In representing a +person's walking sequence, silhouettes and skeletons are the two primary +modalities used. Silhouette sequences lack detailed part information when +overlapping occurs between different body segments and are affected by carried +objects and clothing. Skeletons, comprising joints and bones connecting the +joints, provide more accurate part information for different segments; however, +they are sensitive to occlusions and low-quality images, causing +inconsistencies in frame-wise results within a sequence. In this paper, we +explore the use of a two-stream representation of skeletons for gait +recognition, alongside silhouettes. By fusing the combined data of silhouettes +and skeletons, we refine the two-stream skeletons, joints, and bones through +self-correction in graph convolution, along with cross-modal correction with +temporal consistency from silhouettes. We demonstrate that with refined +skeletons, the performance of the gait recognition model can achieve further +improvement on public gait recognition datasets compared with state-of-the-art +methods without extra annotations. + +
+
+
+
+
+ + ☆ Effective Malware Detection for Embedded Computing Systems with Limited + Exposure + + +
+ One of the pivotal security threats for embedded computing systems is
+malicious software, a.k.a. malware. Owing to its efficiency and efficacy,
+Machine Learning (ML) has been widely adopted for malware detection in recent
+times. Despite being efficient, the existing techniques require a tremendous
+number of benign and malware samples for training and modeling an efficient
+malware detector. Furthermore, such constraints limit the detection of emerging
+malware samples due to the lack of sufficient malware samples required for
+efficient training. To address such concerns, we introduce a code-aware data
+generation technique that generates multiple mutated samples of the malware
+seen only in limited quantities by the devices. Loss minimization ensures that
+the generated samples closely mimic the limitedly seen malware and mitigates
+impractical samples. The generated malware is further incorporated into the
+training set to formulate a model that can efficiently detect emerging malware
+despite having limited exposure. The experimental results demonstrate that the
+proposed technique achieves an accuracy of 90% in detecting limitedly seen
+malware, which is approximately 3x more than the accuracy attained by
+state-of-the-art techniques.
+
+
+
+
+
+
+ + ☆ One Noise to Rule Them All: Multi-View Adversarial Attacks with + Universal Perturbation + + +
+ This paper presents a novel universal perturbation method for generating +robust multi-view adversarial examples in 3D object recognition. Unlike +conventional attacks limited to single views, our approach operates on multiple +2D images, offering a practical and scalable solution for enhancing model +scalability and robustness. This generalizable method bridges the gap between +2D perturbations and 3D-like attack capabilities, making it suitable for +real-world applications. + Existing adversarial attacks may become ineffective when images undergo +transformations like changes in lighting, camera position, or natural +deformations. We address this challenge by crafting a single universal noise +perturbation applicable to various object views. Experiments on diverse +rendered 3D objects demonstrate the effectiveness of our approach. The +universal perturbation successfully identified a single adversarial noise for +each given set of 3D object renders from multiple poses and viewpoints. +Compared to single-view attacks, our universal attacks lower classification +confidence across multiple viewing angles, especially at low noise levels. A +sample implementation is made available at +https://github.com/memoatwit/UniversalPerturbation. + +
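+ The universal multi-view attack can be sketched as a PGD-style loop that
+optimizes one shared noise tensor against the classification loss summed over
+all renders of an object. The classifier, perturbation budget, and step size
+below are illustrative assumptions, not the paper's exact settings.
+
+import torch
+import torch.nn.functional as F
+
+def universal_perturbation(model, views, labels, eps=8 / 255, step=1 / 255, iters=100):
+    """views: (V, 3, H, W) renders of one object; labels: (V,) true class ids."""
+    delta = torch.zeros_like(views[:1], requires_grad=True)  # one noise shared by all views
+    for _ in range(iters):
+        logits = model((views + delta).clamp(0, 1))
+        loss = F.cross_entropy(logits, labels)  # raise the loss over every viewpoint at once
+        loss.backward()
+        with torch.no_grad():
+            delta += step * delta.grad.sign()
+            delta.clamp_(-eps, eps)              # keep the noise within the L-infinity budget
+        delta.grad.zero_()
+    return delta.detach()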
+
+ comment: 6 pages, 4 figures, presented at ICAIA, Springer to publish under + Algorithms for Intelligent Systems +
+
+
+
+
+ + ☆ LP++: A Surprisingly Strong Linear Probe for Few-Shot CLIP + + +
+ In a recent, strongly emergent literature on few-shot CLIP adaptation, Linear +Probe (LP) has been often reported as a weak baseline. This has motivated +intensive research building convoluted prompt learning or feature adaptation +strategies. In this work, we propose and examine from convex-optimization +perspectives a generalization of the standard LP baseline, in which the linear +classifier weights are learnable functions of the text embedding, with +class-wise multipliers blending image and text knowledge. As our objective +function depends on two types of variables, i.e., the class visual prototypes +and the learnable blending parameters, we propose a computationally efficient +block coordinate Majorize-Minimize (MM) descent algorithm. In our full-batch MM +optimizer, which we coin LP++, step sizes are implicit, unlike standard +gradient descent practices where learning rates are intensively searched over +validation sets. By examining the mathematical properties of our loss (e.g., +Lipschitz gradient continuity), we build majorizing functions yielding +data-driven learning rates and derive approximations of the loss's minima, +which provide data-informed initialization of the variables. Our image-language +objective function, along with these non-trivial optimization insights and +ingredients, yields, surprisingly, highly competitive few-shot CLIP +performances. Furthermore, LP++ operates in black-box, relaxes intensive +validation searches for the optimization hyper-parameters, and runs +orders-of-magnitudes faster than state-of-the-art few-shot CLIP adaptation +methods. Our code is available at: +\url{https://github.com/FereshteShakeri/FewShot-CLIP-Strong-Baseline.git}. + +
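+ The blended linear probe can be sketched as class weights formed from
+learnable visual prototypes plus class-wise multipliers on the text embeddings,
+optimized full-batch with a step size derived from a generic Lipschitz bound on
+the softmax cross-entropy gradient. The bound and the joint (rather than
+block-wise Majorize-Minimize) update below are simplifying assumptions, not
+LP++'s data-driven majorizer.
+
+import torch
+import torch.nn.functional as F
+
+def fit_blended_probe(feats, labels, text_emb, iters=300):
+    """feats: (N, D) image features; text_emb: (C, D) class text embeddings."""
+    N, D = feats.shape
+    C = text_emb.shape[0]
+    protos = torch.zeros(C, D, requires_grad=True)   # class visual prototypes
+    alpha = torch.ones(C, 1, requires_grad=True)     # class-wise blending multipliers
+    # Generic Lipschitz bound for softmax cross-entropy on fixed features
+    # (an assumption standing in for the paper's tighter majorizer).
+    L = 0.5 * torch.linalg.matrix_norm(feats, ord=2) ** 2 / N
+    lr = 1.0 / L
+    for _ in range(iters):
+        W = protos + alpha * text_emb                # blended classifier weights
+        loss = F.cross_entropy(feats @ W.t(), labels)
+        g_p, g_a = torch.autograd.grad(loss, (protos, alpha))
+        with torch.no_grad():
+            protos -= lr * g_p
+            alpha -= lr * g_a
+    return protos.detach(), alpha.detach()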
+
+
+
+
+ + ☆ Smooth Deep Saliency + + +
+ In this work, we investigate methods to reduce the noise in deep saliency +maps coming from convolutional downsampling, with the purpose of explaining how +a deep learning model detects tumors in scanned histological tissue samples. +Those methods make the investigated models more interpretable for +gradient-based saliency maps, computed in hidden layers. We test our approach +on different models trained for image classification on ImageNet1K, and models +trained for tumor detection on Camelyon16 and in-house real-world digital +pathology scans of stained tissue samples. Our results show that the +checkerboard noise in the gradient gets reduced, resulting in smoother and +therefore easier to interpret saliency maps. + +
+
+
+
+
+ + ☆ OFMPNet: Deep End-to-End Model for Occupancy and Flow Prediction in + Urban Environment + + +
+ The task of motion prediction is pivotal for autonomous driving systems,
+providing crucial data to choose a vehicle behavior strategy within its
+surroundings. Existing motion prediction techniques primarily focus on
+predicting the future trajectory of each agent in the scene individually,
+utilizing its past trajectory data. In this paper, we introduce an end-to-end
+neural network methodology designed to predict the future behaviors of all
+dynamic objects in the environment. This approach leverages the occupancy map
+and the scene's motion flow. We investigate various alternatives for
+constructing a deep encoder-decoder model called OFMPNet. This model uses a
+sequence of bird's-eye-view road images, occupancy grid, and prior motion flow
+as input data. The encoder of the model can incorporate transformer,
+attention-based, or convolutional units. The decoder considers the use of both
+convolutional modules and recurrent blocks. Additionally, we propose a novel
+time-weighted motion flow loss, whose application has shown a substantial
+decrease in end-point error. Our approach has achieved state-of-the-art results
+on the Waymo Occupancy and Flow Prediction benchmark, with a Soft IoU of 52.1%
+and an AUC of 76.75% on Flow-Grounded Occupancy.
+
+
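+ The time-weighted motion flow loss can be sketched as a per-step weighting of
+an L1 flow error, with weights decaying over the prediction horizon. The decay
+schedule and tensor layout are illustrative assumptions, not the exact
+weighting used by OFMPNet.
+
+import torch
+
+def time_weighted_flow_loss(pred_flow, gt_flow, valid_mask, gamma=0.9):
+    """pred_flow, gt_flow: (B, T, H, W, 2); valid_mask: (B, T, H, W, 1)."""
+    T = pred_flow.shape[1]
+    # Later waypoints are more uncertain, so they receive smaller weights.
+    weights = gamma ** torch.arange(T, dtype=pred_flow.dtype, device=pred_flow.device)
+    weights = weights / weights.sum()
+    err = ((pred_flow - gt_flow).abs() * valid_mask).mean(dim=(0, 2, 3, 4))  # per-step L1
+    return (weights * err).sum()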
+
+ comment: Accepted in Neurocomputing journal - 2024 +
+
+
+
+
+ + ☆ SnAG: Scalable and Accurate Video Grounding CVPR 2024 + + +
+ Temporal grounding of text descriptions in videos is a central problem in +vision-language learning and video understanding. Existing methods often +prioritize accuracy over scalability -- they have been optimized for grounding +only a few text queries within short videos, and fail to scale up to long +videos with hundreds of queries. In this paper, we study the effect of +cross-modal fusion on the scalability of video grounding models. Our analysis +establishes late fusion as a more cost-effective fusion scheme for long-form +videos with many text queries. Moreover, it leads us to a novel, video-centric +sampling scheme for efficient training. Based on these findings, we present +SnAG, a simple baseline for scalable and accurate video grounding. Without +bells and whistles, SnAG is 43% more accurate and 1.5x faster than CONE, a +state of the art for long-form video grounding on the challenging MAD dataset, +while achieving highly competitive results on short videos. + +
+
+ comment: Accepted to CVPR 2024. Code available at + https://github.com/fmu2/snag_release +
+
+
+
+
+ + ☆ Towards Robust 3D Pose Transfer with Adversarial Learning CVPR 2024 + + +
+ 3D pose transfer, which aims to transfer the desired pose to a target mesh,
+is one of the most challenging 3D generation tasks. Previous attempts rely on
+well-defined parametric human models or skeletal joints as driving pose
+sources. However, to obtain those clean pose sources, cumbersome but necessary
+pre-processing pipelines are inevitable, hindering real-time applications. This
+work is driven by the intuition that the robustness of the model can be
+enhanced by introducing adversarial samples into the training, leading to a
+model that is more robust to noisy inputs and can even be further extended to
+directly handle real-world data such as raw point clouds/scans without
+intermediate processing. Furthermore, we propose a novel 3D pose Masked
+Autoencoder (3D-PoseMAE), a customized MAE that effectively learns 3D extrinsic
+representations (i.e., pose). 3D-PoseMAE facilitates learning from the aspect
+of extrinsic attributes by simultaneously generating adversarial samples that
+perturb the model and learning arbitrary raw noisy poses via a multi-scale
+masking strategy. Both qualitative and quantitative studies show that the
+transferred meshes given by our network result in much better quality. Besides,
+we demonstrate the strong generalizability of our method on various poses,
+different domains, and even raw scans. Experimental results also show the
+meaningful insight that the intermediate adversarial samples generated during
+training can successfully attack existing pose transfer models.
+
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Linear Combination of Saved Checkpoints Makes Consistency and Diffusion + Models Better + + +
+ Diffusion Models (DM) and Consistency Models (CM) are two types of popular
+generative models with good generation quality on various tasks. When training
+DM and CM, intermediate weight checkpoints are not fully utilized and only the
+last converged checkpoint is used. In this work, we find that high-quality
+model weights often lie in a basin which cannot be reached by SGD but can be
+obtained by proper checkpoint averaging. Based on these observations, we
+propose LCSC, a simple but effective and efficient method to enhance the
+performance of DM and CM, by combining checkpoints along the training
+trajectory with coefficients deduced from evolutionary search. We demonstrate
+the value of LCSC through two use cases: $\textbf{(a) Reducing training cost.}$
+With LCSC, we only need to train DM/CM with fewer iterations and/or smaller
+batch sizes to obtain sample quality comparable to the fully trained model. For
+example, LCSC achieves considerable training speedups for CM (23$\times$ on
+CIFAR-10 and 15$\times$ on ImageNet-64). $\textbf{(b) Enhancing pre-trained
+models.}$ Assuming full training is already done, LCSC can further improve the
+generation quality or speed of the final converged models. For example, LCSC
+achieves better performance using one function evaluation (NFE) than the base
+model with two NFEs on consistency distillation, and decreases the NFE of DM
+from 15 to 9 while maintaining the generation quality on CIFAR-10. Our code is
+available at https://github.com/imagination-research/LCSC.
+
+
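+ The checkpoint combination itself is a weighted sum of saved state dicts; a
+simple random-mutation search over the coefficients stands in below for the
+paper's evolutionary search, and the fitness function (e.g., negative FID on a
+small probe set) is left as a user-supplied callable.
+
+import copy
+import random
+
+def combine_checkpoints(state_dicts, coeffs):
+    merged = copy.deepcopy(state_dicts[0])
+    for key in merged:
+        if merged[key].is_floating_point():  # skip integer buffers such as counters
+            merged[key] = sum(c * sd[key] for c, sd in zip(coeffs, state_dicts))
+    return merged
+
+def search_coefficients(state_dicts, fitness_fn, rounds=50, sigma=0.05):
+    """fitness_fn(state_dict) -> score to maximize (e.g., negative FID)."""
+    best = [1.0 / len(state_dicts)] * len(state_dicts)
+    best_score = fitness_fn(combine_checkpoints(state_dicts, best))
+    for _ in range(rounds):
+        cand = [c + random.gauss(0.0, sigma) for c in best]  # mutate coefficients
+        total = sum(cand)
+        cand = [c / total for c in cand]                     # keep them summing to one
+        score = fitness_fn(combine_checkpoints(state_dicts, cand))
+        if score > best_score:
+            best, best_score = cand, score
+    return best, best_score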
+
+
+
+
+ + ☆ Visual Concept Connectome (VCC): Open World Concept Discovery and their + Interlayer Connections in Deep Models CVPR 2024 + + +
+ Understanding what deep network models capture in their learned
+representations is a fundamental challenge in computer vision. We present a new
+methodology for understanding such vision models, the Visual Concept Connectome
+(VCC), which discovers human-interpretable concepts and their interlayer
+connections in a fully unsupervised manner. Our approach simultaneously reveals
+fine-grained concepts at a layer and connection weightings across all layers,
+and is amenable to global analysis of network structure (e.g., branching
+patterns of hierarchical concept assemblies). Previous work yielded ways to
+extract interpretable concepts from single layers and examine their impact on
+classification, but did not afford multilayer concept analysis across an entire
+network architecture. Quantitative and qualitative empirical results show the
+effectiveness of VCCs in the domain of image classification. Also, we leverage
+VCCs for the application of failure mode debugging to reveal where mistakes
+arise in deep networks.
+
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ OOSTraj: Out-of-Sight Trajectory Prediction With Vision-Positioning + Denoising CVPR + + +
+ Trajectory prediction is fundamental in computer vision and autonomous +driving, particularly for understanding pedestrian behavior and enabling +proactive decision-making. Existing approaches in this field often assume +precise and complete observational data, neglecting the challenges associated +with out-of-view objects and the noise inherent in sensor data due to limited +camera range, physical obstructions, and the absence of ground truth for +denoised sensor data. Such oversights are critical safety concerns, as they can +result in missing essential, non-visible objects. To bridge this gap, we +present a novel method for out-of-sight trajectory prediction that leverages a +vision-positioning technique. Our approach denoises noisy sensor observations +in an unsupervised manner and precisely maps sensor-based trajectories of +out-of-sight objects into visual trajectories. This method has demonstrated +state-of-the-art performance in out-of-sight noisy sensor trajectory denoising +and prediction on the Vi-Fi and JRDB datasets. By enhancing trajectory +prediction accuracy and addressing the challenges of out-of-sight objects, our +work significantly contributes to improving the safety and reliability of +autonomous driving in complex environments. Our work represents the first +initiative towards Out-Of-Sight Trajectory prediction (OOSTraj), setting a new +benchmark for future research. The code is available at +\url{https://github.com/Hai-chao-Zhang/OOSTraj}. + +
+
+ comment: In Proceedings of IEEE/CVF Conference on Computer Vision and Pattern + Recognition 2024 (CVPR) +
+
+
+
+
+ + ☆ CHOSEN: Contrastive Hypothesis Selection for Multi-View Depth Refinement + + +
+ We propose CHOSEN, a simple yet flexible, robust and effective multi-view +depth refinement framework. It can be employed in any existing multi-view +stereo pipeline, with straightforward generalization capability for different +multi-view capture systems such as camera relative positioning and lenses. +Given an initial depth estimation, CHOSEN iteratively re-samples and selects +the best hypotheses, and automatically adapts to different metric or intrinsic +scales determined by the capture system. The key to our approach is the +application of contrastive learning in an appropriate solution space and a +carefully designed hypothesis feature, based on which positive and negative +hypotheses can be effectively distinguished. Integrated in a simple baseline +multi-view stereo pipeline, CHOSEN delivers impressive quality in terms of +depth and normal accuracy compared to many current deep learning based +multi-view stereo pipelines. + +
+
+
+
+
+ + ☆ Insights from the Use of Previously Unseen Neural Architecture Search + Datasets + + +
+ The boundless possibility of neural networks which can be used to solve a +problem -- each with different performance -- leads to a situation where a Deep +Learning expert is required to identify the best neural network. This goes +against the hope of removing the need for experts. Neural Architecture Search +(NAS) offers a solution to this by automatically identifying the best +architecture. However, to date, NAS work has focused on a small set of datasets +which we argue are not representative of real-world problems. We introduce +eight new datasets created for a series of NAS Challenges: AddNIST, Language, +MultNIST, CIFARTile, Gutenberg, Isabella, GeoClassing, and Chesseract. These +datasets and challenges are developed to direct attention to issues in NAS +development and to encourage authors to consider how their models will perform +on datasets unknown to them at development time. We present experimentation +using standard Deep Learning methods as well as the best results from challenge +participants. + +
+
+
+
+
+ + ☆ NeRFCodec: Neural Feature Compression Meets Neural Radiance Fields for + Memory-Efficient Scene Representation CVPR2024 + + +
+ The emergence of Neural Radiance Fields (NeRF) has greatly impacted 3D scene +modeling and novel-view synthesis. As a kind of visual media for 3D scene +representation, compression with high rate-distortion performance is an eternal +target. Motivated by advances in neural compression and neural field +representation, we propose NeRFCodec, an end-to-end NeRF compression framework +that integrates non-linear transform, quantization, and entropy coding for +memory-efficient scene representation. Since training a non-linear transform +directly on a large scale of NeRF feature planes is impractical, we discover +that pre-trained neural 2D image codec can be utilized for compressing the +features when adding content-specific parameters. Specifically, we reuse neural +2D image codec but modify its encoder and decoder heads, while keeping the +other parts of the pre-trained decoder frozen. This allows us to train the full +pipeline via supervision of rendering loss and entropy loss, yielding the +rate-distortion balance by updating the content-specific parameters. At test +time, the bitstreams containing latent code, feature decoder head, and other +side information are transmitted for communication. Experimental results +demonstrate our method outperforms existing NeRF compression methods, enabling +high-quality novel view synthesis with a memory budget of 0.5 MB. + +
+
+ comment: Accepted at CVPR2024. The source code will be released +
+
+
+
+
+ + ☆ Real, fake and synthetic faces -- does the coin have three sides? + + +
+ With the ever-growing power of generative artificial intelligence, deepfake
+and artificially generated (synthetic) media have continued to spread online,
+which creates various ethical and moral concerns regarding their usage. To
+tackle this, we present a novel exploration of the trends and patterns observed
+in real, deepfake and synthetic facial images. The proposed analysis is done in
+two parts: firstly, we incorporate eight deep learning models and analyze their
+performance in distinguishing between the three classes of images. Next, we
+delve further into the similarities and differences between these three sets of
+images by investigating their image properties both in the context of the
+entire image and in the context of specific regions within the image. An ANOVA
+test was also performed and provided further clarity on the patterns
+distinguishing the images of the three classes. From our findings, we observe
+that the investigated deep learning models found it easier to detect synthetic
+facial images, with the ViT Patch-16 model performing best on this task with a
+class-averaged sensitivity, specificity, precision, and accuracy of 97.37%,
+98.69%, 97.48%, and 98.25%, respectively. This observation was supported by
+further analysis of various image properties, where we saw noticeable
+differences across the three categories of images. This analysis can help us
+build better algorithms for facial image generation, and also shows that
+synthetic, deepfake and real face images are indeed three different classes.
+
+
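+ The per-property ANOVA across the three classes can be reproduced with scipy;
+the property values below are synthetic placeholders standing in for measured
+image statistics such as sharpness or colorfulness.
+
+import numpy as np
+from scipy.stats import f_oneway
+
+rng = np.random.default_rng(0)
+# Placeholder property values, one measurement per image in each class.
+real = rng.normal(0.50, 0.05, 200)
+deepfake = rng.normal(0.52, 0.05, 200)
+synthetic = rng.normal(0.58, 0.05, 200)
+
+f_stat, p_value = f_oneway(real, deepfake, synthetic)
+print(f"F = {f_stat:.2f}, p = {p_value:.3g}")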
+
+
+
+
+ + ♻ ☆ Diffuse, Attend, and Segment: Unsupervised Zero-Shot Segmentation using + Stable Diffusion CVPR2024 + + +
+ Producing quality segmentation masks for images is a fundamental problem in +computer vision. Recent research has explored large-scale supervised training +to enable zero-shot segmentation on virtually any image style and unsupervised +training to enable segmentation without dense annotations. However, +constructing a model capable of segmenting anything in a zero-shot manner +without any annotations is still challenging. In this paper, we propose to +utilize the self-attention layers in stable diffusion models to achieve this +goal because the pre-trained stable diffusion model has learned inherent +concepts of objects within its attention layers. Specifically, we introduce a +simple yet effective iterative merging process based on measuring KL divergence +among attention maps to merge them into valid segmentation masks. The proposed +method does not require any training or language dependency to extract quality +segmentation for any images. On COCO-Stuff-27, our method surpasses the prior +unsupervised zero-shot SOTA method by an absolute 26% in pixel accuracy and 17% +in mean IoU. The project page is at +\url{https://sites.google.com/view/diffseg/home}. + +
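+ The KL-based merging step can be sketched as a greedy loop: attention maps
+are treated as spatial distributions, and any pair whose symmetric KL
+divergence falls below a threshold is averaged into one proposal until no such
+pair remains. The threshold and greedy order are illustrative assumptions, not
+the paper's exact schedule.
+
+import torch
+
+def sym_kl(p, q, eps=1e-8):
+    p, q = p + eps, q + eps
+    return (p * (p / q).log()).sum() + (q * (q / p).log()).sum()
+
+def merge_attention_maps(maps, threshold=1.0):
+    """maps: list of (H, W) tensors; each is normalized to a spatial distribution."""
+    maps = [m / m.sum() for m in maps]
+    merged = True
+    while merged and len(maps) > 1:
+        merged = False
+        for i in range(len(maps)):
+            for j in range(i + 1, len(maps)):
+                if sym_kl(maps[i], maps[j]) < threshold:
+                    fused = (maps[i] + maps[j]) / 2  # merge similar attention maps
+                    maps = [m for k, m in enumerate(maps) if k not in (i, j)] + [fused]
+                    merged = True
+                    break
+            if merged:
+                break
+    return maps  # surviving maps can be argmax-assigned to pixels as segmentation masks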
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ Rephrase, Augment, Reason: Visual Grounding of Questions for + Vision-Language Models ICLR 2024 + + +
+ An increasing number of vision-language tasks can be handled with little to +no training, i.e., in a zero and few-shot manner, by marrying large language +models (LLMs) to vision encoders, resulting in large vision-language models +(LVLMs). While this has huge upsides, such as not requiring training data or +custom architectures, how an input is presented to an LVLM can have a major +impact on zero-shot model performance. In particular, inputs phrased in an +underspecified way can result in incorrect answers due to factors like missing +visual information, complex implicit reasoning, or linguistic ambiguity. +Therefore, adding visually-grounded information to the input as a preemptive +clarification should improve model performance by reducing underspecification, +e.g., by localizing objects and disambiguating references. Similarly, in the +VQA setting, changing the way questions are framed can make them easier for +models to answer. To this end, we present Rephrase, Augment and Reason +(RepARe), a gradient-free framework that extracts salient details about the +image using the underlying LVLM as a captioner and reasoner, in order to +propose modifications to the original question. We then use the LVLM's +confidence over a generated answer as an unsupervised scoring function to +select the rephrased question most likely to improve zero-shot performance. +Focusing on three visual question answering tasks, we show that RepARe can +result in a 3.85% (absolute) increase in zero-shot accuracy on VQAv2, 6.41%, +and 7.94% points increase on A-OKVQA, and VizWiz respectively. Additionally, we +find that using gold answers for oracle question candidate selection achieves a +substantial gain in VQA accuracy by up to 14.41%. Through extensive analysis, +we demonstrate that outputs from RepARe increase syntactic complexity, and +effectively utilize vision-language interaction and the frozen LLM. + +
+
+ comment: ICLR 2024 camera-ready (23 pages), Code: + https://github.com/archiki/RepARe +
+
+
+
+
+ + ♻ ☆ pixelSplat: 3D Gaussian Splats from Image Pairs for Scalable + Generalizable 3D Reconstruction + + +
+ We introduce pixelSplat, a feed-forward model that learns to reconstruct 3D +radiance fields parameterized by 3D Gaussian primitives from pairs of images. +Our model features real-time and memory-efficient rendering for scalable +training as well as fast 3D reconstruction at inference time. To overcome local +minima inherent to sparse and locally supported representations, we predict a +dense probability distribution over 3D and sample Gaussian means from that +probability distribution. We make this sampling operation differentiable via a +reparameterization trick, allowing us to back-propagate gradients through the +Gaussian splatting representation. We benchmark our method on wide-baseline +novel view synthesis on the real-world RealEstate10k and ACID datasets, where +we outperform state-of-the-art light field transformers and accelerate +rendering by 2.5 orders of magnitude while reconstructing an interpretable and +editable 3D radiance field. + +
+
+ comment: Project page: https://dcharatan.github.io/pixelsplat +
+
+
+
+
+ + ♻ ☆ MedMamba: Vision Mamba for Medical Image Classification + + +
+ Medical image classification is a very fundamental and crucial task in the
+field of computer vision. In recent years, CNN-based and Transformer-based
+models have been widely used to classify various medical images. Unfortunately,
+the limited long-range modeling capability of CNNs prevents them from
+effectively extracting features in medical images, while Transformers are
+hampered by their quadratic computational complexity. Recent research has shown
+that the state space model (SSM) represented by Mamba can efficiently model
+long-range interactions while maintaining linear computational complexity.
+Inspired by this, we propose Vision Mamba for medical image classification
+(MedMamba). More specifically, we introduce a novel Conv-SSM module. Conv-SSM
+combines the local feature extraction ability of convolutional layers with the
+ability of SSM to capture long-range dependency, thereby modeling medical
+images of different modalities. To demonstrate the potential of MedMamba, we
+conducted extensive experiments using 14 publicly available medical datasets
+with different imaging techniques and two private datasets that we built.
+Extensive experimental results demonstrate that the proposed MedMamba performs
+well in detecting lesions in various medical images. To the best of our
+knowledge, this is the first Vision Mamba tailored for medical image
+classification. The purpose of this work is to establish a new baseline for
+medical image classification tasks and provide valuable insights for the future
+development of more efficient and effective SSM-based artificial intelligence
+algorithms and application systems in the medical field. Source code is
+available at https://github.com/YubiaoYue/MedMamba.
+
+
+
+
+
+
+ + ♻ ☆ GDA: Generalized Diffusion for Robust Test-time Adaptation + + +
+ Machine learning models struggle with generalization when encountering +out-of-distribution (OOD) samples with unexpected distribution shifts. For +vision tasks, recent studies have shown that test-time adaptation employing +diffusion models can achieve state-of-the-art accuracy improvements on OOD +samples by generating new samples that align with the model's domain without +the need to modify the model's weights. Unfortunately, those studies have +primarily focused on pixel-level corruptions, thereby lacking the +generalization to adapt to a broader range of OOD types. We introduce +Generalized Diffusion Adaptation (GDA), a novel diffusion-based test-time +adaptation method robust against diverse OOD types. Specifically, GDA +iteratively guides the diffusion by applying a marginal entropy loss derived +from the model, in conjunction with style and content preservation losses +during the reverse sampling process. In other words, GDA considers the model's +output behavior with the semantic information of the samples as a whole, which +can reduce ambiguity in downstream tasks during the generation process. +Evaluation across various popular model architectures and OOD benchmarks shows +that GDA consistently outperforms prior work on diffusion-driven adaptation. +Notably, it achieves the highest classification accuracy improvements, ranging +from 4.4\% to 5.02\% on ImageNet-C and 2.5\% to 7.4\% on Rendition, Sketch, and +Stylized benchmarks. This performance highlights GDA's generalization to a +broader range of OOD benchmarks. + +
+
+
+
+
+ + ♻ ☆ Learning CNN on ViT: A Hybrid Model to Explicitly Class-specific + Boundaries for Domain Adaptation + + +
+ Most domain adaptation (DA) methods are based on either convolutional neural
+networks (CNNs) or vision transformers (ViTs). They align the distribution
+differences between domains as encoders without considering their unique
+characteristics. For instance, ViT excels in accuracy due to its superior
+ability to capture global representations, while CNN has an advantage in
+capturing local representations. This fact has led us to design a hybrid
+method to fully take advantage of both ViT and CNN, called Explicitly
+Class-specific Boundaries (ECB). ECB learns CNN on ViT to combine their
+distinct strengths. In particular, we leverage ViT's properties to explicitly
+find class-specific decision boundaries by maximizing the discrepancy between
+the outputs of the two classifiers to detect target samples far from the source
+support. In contrast, the CNN encoder clusters target features based on the
+previously defined class-specific boundaries by minimizing the discrepancy
+between the probabilities of the two classifiers. Finally, ViT and CNN mutually
+exchange knowledge to improve the quality of pseudo labels and reduce the
+knowledge discrepancies of these models. Compared to conventional DA methods,
+our ECB achieves superior performance, which verifies its effectiveness in this
+hybrid model. The project website can be found at
+https://dotrannhattuong.github.io/ECB/website/.
+
+
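+ The two discrepancy objectives can be sketched in the style of
+classifier-discrepancy domain adaptation: the classifiers are pushed apart on
+target data to expose class-specific boundaries, and the CNN encoder is then
+updated to pull target features back inside them. The optimizers, feature
+routing, and training schedule below are assumptions, not the paper's exact
+recipe.
+
+import torch
+import torch.nn.functional as F
+
+def discrepancy(p1, p2):
+    return (p1 - p2).abs().mean()
+
+def step_maximize(clf1, clf2, vit_feats_tgt, opt_clf):
+    # Widen the disagreement of the two classifiers on target samples.
+    feats = vit_feats_tgt.detach()
+    p1 = F.softmax(clf1(feats), dim=1)
+    p2 = F.softmax(clf2(feats), dim=1)
+    loss = -discrepancy(p1, p2)
+    opt_clf.zero_grad()
+    loss.backward()
+    opt_clf.step()
+
+def step_minimize(cnn_encoder, clf1, clf2, x_tgt, opt_enc):
+    # Cluster target features inside the previously exposed boundaries.
+    f = cnn_encoder(x_tgt)
+    p1 = F.softmax(clf1(f), dim=1)
+    p2 = F.softmax(clf2(f), dim=1)
+    loss = discrepancy(p1, p2)
+    opt_enc.zero_grad()
+    loss.backward()
+    opt_enc.step()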
+
+
+
+
+ + ♻ ☆ MIPS at SemEval-2024 Task 3: Multimodal Emotion-Cause Pair Extraction in + Conversations with Multimodal Language Models SemEval '24 + + +
+ This paper presents our winning submission to Subtask 2 of SemEval 2024 Task +3 on multimodal emotion cause analysis in conversations. We propose a novel +Multimodal Emotion Recognition and Multimodal Emotion Cause Extraction +(MER-MCE) framework that integrates text, audio, and visual modalities using +specialized emotion encoders. Our approach sets itself apart from +top-performing teams by leveraging modality-specific features for enhanced +emotion understanding and causality inference. Experimental evaluation +demonstrates the advantages of our multimodal approach, with our submission +achieving a competitive weighted F1 score of 0.3435, ranking third with a +margin of only 0.0339 behind the 1st team and 0.0025 behind the 2nd team. +Project: https://github.com/MIPS-COLT/MER-MCE.git + +
+
+ comment: Ranked 3rd in SemEval '24 Task 3 with F1 of 0.3435, close to 1st & + 2nd by 0.0339 & 0.0025 +
+
+
+
+
+ + ♻ ☆ Immature Green Apple Detection and Sizing in Commercial Orchards using + YOLOv8 and Shape Fitting Techniques + + +
+ Detecting and estimating the size of apples during the early stages of growth
+is crucial for predicting yield, pest management, and making informed decisions
+related to crop-load management, harvest and post-harvest logistics, and
+marketing. Traditional fruit size measurement methods are laborious and
+time-consuming. This study employs the state-of-the-art YOLOv8 object detection
+and instance segmentation algorithm in conjunction with geometric shape fitting
+techniques on 3D point cloud data to accurately determine the size of immature
+green apples (or fruitlets) in a commercial orchard environment. The
+methodology utilized two RGB-D sensors: Intel RealSense D435i and Microsoft
+Azure Kinect DK. Notably, the YOLOv8 instance segmentation models exhibited
+proficiency in immature green apple detection, with the YOLOv8m-seg model
+achieving the highest AP@0.5 and AP@0.75 scores of 0.94 and 0.91, respectively.
+Using the ellipsoid fitting technique on images from the Azure Kinect, we
+achieved an RMSE of 2.35 mm, MAE of 1.66 mm, MAPE of 6.15 mm, and an R-squared
+value of 0.9 in estimating the size of apple fruitlets. Challenges such as
+partial occlusion caused some error in accurately delineating and sizing green
+apples using the YOLOv8-based segmentation technique, particularly in fruit
+clusters. In a comparison with 102 outdoor samples, the size estimation
+technique performed better on the images acquired with the Microsoft Azure
+Kinect than on those from the Intel RealSense D435i. This superiority is
+evident from the metrics: the RMSE values (2.35 mm for Azure Kinect vs. 9.65 mm
+for RealSense D435i), MAE values (1.66 mm for Azure Kinect vs. 7.8 mm for
+RealSense D435i), and the R-squared values (0.9 for Azure Kinect vs. 0.77 for
+RealSense D435i).
+
+
+
+
+
+
+ + ♻ ☆ Semantically-Prompted Language Models Improve Visual Descriptions NAACL 2024 + + +
+ Language-vision models like CLIP have made significant strides in vision +tasks, such as zero-shot image classification (ZSIC). However, generating +specific and expressive visual descriptions remains challenging; descriptions +produced by current methods are often ambiguous and lacking in granularity. To +tackle these issues, we propose V-GLOSS: Visual Glosses, a novel method built +upon two key ideas. The first is Semantic Prompting, which conditions a +language model on structured semantic knowledge. The second is a new +contrastive algorithm that elicits fine-grained distinctions between similar +concepts. With both ideas, we demonstrate that V-GLOSS improves visual +descriptions and achieves strong results in the zero-shot setting on general +and fine-grained image-classification datasets, including ImageNet, STL-10, +FGVC Aircraft, and Flowers 102. Moreover, these descriptive capabilities +contribute to enhancing image-generation performance. Finally, we introduce a +quality-tested silver dataset with descriptions generated with V-GLOSS for all +ImageNet classes. + +
+
+ comment: To appear at NAACL 2024 +
+
+
+
+
+ + ♻ ☆ FISTNet: FusIon of STyle-path generative Networks for Facial Style + Transfer + + +
+ With the surge in emerging technologies such as the Metaverse, spatial
+computing, and generative AI, the application of facial style transfer has
+gained a lot of interest from researchers and startup enthusiasts alike.
+StyleGAN methods have paved the way for transfer-learning strategies that could
+reduce the dependency on the huge volume of data required for the training
+process. However, StyleGAN methods tend to overfit, which introduces artifacts
+into the facial images. Studies such as DualStyleGAN proposed the use of
+multipath networks, but they require the networks to be trained for a specific
+style rather than generating a fusion of facial styles at once. In this paper,
+we propose a FusIon of STyles (FIST) network for facial images that leverages
+pre-trained multipath style transfer networks to eliminate the problem
+associated with the lack of a huge data volume in the training phase, along
+with the fusion of multiple styles at the output. We leverage pre-trained
+StyleGAN networks with an external style pass that uses a residual modulation
+block instead of a transform coding block. The method also preserves facial
+structure, identity, and details via the gated mapping unit introduced in this
+study. The aforementioned components enable us to train the network with a very
+limited amount of data while generating high-quality stylized images. Our
+training process adapts a curriculum learning strategy to perform efficient,
+flexible style and model fusion in the generative space. We perform extensive
+experiments to show the superiority of FISTNet in comparison to existing
+state-of-the-art methods.
+
+
+
+ comment: 21 pages, 6 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ MonoBox: Tightness-free Box-supervised Polyp Segmentation using + Monotonicity Constraint + + +
+ We propose MonoBox, an innovative box-supervised segmentation method +constrained by monotonicity to liberate its training from the user-unfriendly +box-tightness assumption. In contrast to conventional box-supervised +segmentation, where the box edges must precisely touch the target boundaries, +MonoBox leverages imprecisely-annotated boxes to achieve robust pixel-wise +segmentation. The 'linchpin' is that, within the noisy zones around box edges, +MonoBox discards the traditional misguiding multiple-instance learning loss, +and instead optimizes a carefully-designed objective, termed monotonicity +constraint. Along directions transitioning from the foreground to background, +this new constraint steers responses to adhere to a trend of monotonically +decreasing values. Consequently, the originally unreliable learning within the +noisy zones is transformed into a correct and effective monotonicity +optimization. Moreover, an adaptive label correction is introduced, enabling +MonoBox to enhance the tightness of box annotations using predicted masks from +the previous epoch and dynamically shrink the noisy zones as training +progresses. We verify MonoBox in the box-supervised segmentation task of +polyps, where satisfying box-tightness is challenging due to the vague +boundaries between the polyp and normal tissues. Experiments on both public +synthetic and in-house real noisy datasets demonstrate that MonoBox exceeds +other anti-noise state-of-the-arts by improving Dice by at least 5.5% and 3.3%, +respectively. Codes are at https://github.com/Huster-Hq/MonoBox. + +
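+ The monotonicity constraint can be sketched by sampling the predicted
+foreground probability along lines running from inside a box toward the
+background across the noisy border zone and penalizing any increase between
+consecutive samples. The line sampling and the margin-free hinge below are
+illustrative assumptions rather than the paper's exact objective.
+
+import torch
+import torch.nn.functional as F
+
+def monotonicity_loss(prob_map, start_pts, end_pts, n_samples=16):
+    """prob_map: (1, 1, H, W) foreground probabilities in [0, 1];
+    start_pts, end_pts: (R, 2) normalized (x, y) in [-1, 1], inside -> outside."""
+    t = torch.linspace(0, 1, n_samples, device=prob_map.device).view(1, -1, 1)
+    pts = start_pts.unsqueeze(1) * (1 - t) + end_pts.unsqueeze(1) * t      # (R, S, 2)
+    vals = F.grid_sample(prob_map, pts.unsqueeze(0), align_corners=True)   # (1, 1, R, S)
+    vals = vals[0, 0]                                                      # (R, S)
+    # Only increases along the foreground-to-background direction are violations.
+    increases = (vals[:, 1:] - vals[:, :-1]).clamp(min=0)
+    return increases.mean()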
+
+
+
+
+ + ♻ ☆ Joint Multimodal Transformer for Emotion Recognition in the Wild + + +
+ Systems for multimodal emotion recognition (MMER) can typically outperform +unimodal systems by leveraging the inter- and intra-modal relationships +between, e.g., visual, textual, physiological, and auditory modalities. In this +paper, an MMER method is proposed that relies on a joint multimodal transformer +for fusion with key-based cross-attention. This framework aims to exploit the +diverse and complementary nature of different modalities to improve predictive +accuracy. Separate backbones capture intra-modal spatiotemporal dependencies +within each modality over video sequences. Subsequently, a joint multimodal +transformer fusion architecture integrates the individual modality embeddings, +allowing the model to capture inter-modal and intra-modal relationships +effectively. Extensive experiments on two challenging expression recognition +tasks: (1) dimensional emotion recognition on the Affwild2 dataset (with face +and voice), and (2) pain estimation on the Biovid dataset (with face and +biosensors), indicate that the proposed method can work effectively with +different modalities. Empirical results show that MMER systems with our +proposed fusion method allow us to outperform relevant baseline and +state-of-the-art methods. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Enhancing wind field resolution in complex terrain through a + knowledge-driven machine learning approach + + +
+ Atmospheric flows are governed by a broad variety of spatio-temporal scales, +thus making real-time numerical modeling of such turbulent flows in complex +terrain at high resolution computationally intractable. In this study, we +demonstrate a neural network approach motivated by Enhanced Super-Resolution +Generative Adversarial Networks to upscale low-resolution wind fields to +generate high-resolution wind fields in an actual wind farm in Bessaker, +Norway. The neural network-based model is shown to successfully reconstruct +fully resolved 3D velocity fields from a coarser scale while respecting the +local terrain and that it easily outperforms trilinear interpolation. We also +demonstrate that by using appropriate cost function based on domain knowledge, +we can alleviate the use of adversarial training. + +
+
+
+
+
+ + ♻ ☆ A Simple Recipe for Language-guided Domain Generalized Segmentation CVPR 2024 + + +
+ Generalization to new domains not seen during training is one of the +long-standing challenges in deploying neural networks in real-world +applications. Existing generalization techniques either necessitate external +images for augmentation, and/or aim at learning invariant representations by +imposing various alignment constraints. Large-scale pretraining has recently +shown promising generalization capabilities, along with the potential of +binding different modalities. For instance, the advent of vision-language +models like CLIP has opened the doorway for vision models to exploit the +textual modality. In this paper, we introduce a simple framework for +generalizing semantic segmentation networks by employing language as the source +of randomization. Our recipe comprises three key ingredients: (i) the +preservation of the intrinsic CLIP robustness through minimal fine-tuning, (ii) +language-driven local style augmentation, and (iii) randomization by locally +mixing the source and augmented styles during training. Extensive experiments +report state-of-the-art results on various generalization benchmarks. Code is +accessible at https://github.com/astra-vision/FAMix . + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Open-Vocabulary Federated Learning with Multimodal Prototyping NAACL 2024 + + +
+ Existing federated learning (FL) studies usually assume the training label
+space and test label space are identical. However, in real-world applications,
+this assumption is too ideal to be true. A new user could come up with queries
+that involve data from unseen classes, and such open-vocabulary queries would
+directly defeat such FL systems. Therefore, in this work, we explicitly focus
+on the under-explored open-vocabulary challenge in FL. That is, for a new user,
+the global server shall understand her/his query that involves arbitrary
+unknown classes. To address this problem, we leverage the pre-trained
+vision-language models (VLMs). In particular, we present a novel adaptation
+framework tailored for VLMs in the context of FL, named Federated Multimodal
+Prototyping (Fed-MP). Fed-MP adaptively aggregates the local model weights
+based on light-weight client residuals, and makes predictions based on a novel
+multimodal prototyping mechanism. Fed-MP exploits the knowledge learned from
+the seen classes, and robustifies the adapted VLM to unseen categories. Our
+empirical evaluation on various datasets validates the effectiveness of Fed-MP.
+
+
+
+ comment: Accepted at NAACL 2024 +
+
+
+
+
+ + ♻ ☆ Cross-modality debiasing: using language to mitigate sub-population + shifts in imaging + + +
+ Sub-population shift is a specific type of domain shift that highlights +changes in data distribution within specific sub-groups or populations between +training and testing. Sub-population shift accounts for a significant source of +algorithmic bias and calls for distributional robustness. Recent studies found +inherent distributional robustness in multi-modality foundation models, such as +the vision-language model CLIP, yet this robustness is vulnerable through +parameter fine-tuning. In this paper, we propose leveraging the connection of +robustness among different modalities and reshaping the distributional +robustness of one modality with another. Specifically, in the context of the +distributional robustness of CLIP, we propose to leverage natural language +inputs to debias the image feature representations, to improve worst-case +performance on sub-populations. Our extensive empirical studies show that image +representations debiased by natural language can achieve significant +performance improvement and reduction of performance instability under +sub-population shifts. + +
+
+
+
+
+ + ♻ ☆ VA3: Virtually Assured Amplification Attack on Probabilistic Copyright + Protection for Text-to-Image Generative Models CVPR 2024 + + +
+ The booming use of text-to-image generative models has raised concerns about +their high risk of producing copyright-infringing content. While probabilistic +copyright protection methods provide a probabilistic guarantee against such +infringement, in this paper, we introduce Virtually Assured Amplification +Attack (VA3), a novel online attack framework that exposes the vulnerabilities +of these protection mechanisms. The proposed framework significantly amplifies +the probability of generating infringing content on the sustained interactions +with generative models and a non-trivial lower-bound on the success probability +of each engagement. Our theoretical and experimental results demonstrate the +effectiveness of our approach under various scenarios. These findings highlight +the potential risk of implementing probabilistic copyright protection in +practical applications of text-to-image generative models. Code is available at +https://github.com/South7X/VA3. + +
+
+ comment: 18 pages, 9 figures. Accept to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Dual-Activated Lightweight Attention ResNet50 for Automatic + Histopathology Breast Cancer Image Classification + + +
+ Automatic breast cancer classification in histopathology images is crucial for precise diagnosis and treatment planning. Recently, classification approaches based on the ResNet architecture have gained popularity for significantly improving accuracy by using skip connections to mitigate vanishing gradient problems, thereby integrating low-level and high-level feature information. Nevertheless, the conventional ResNet architecture faces challenges such as data imbalance and limited interpretability, necessitating cross-domain knowledge and collaboration among medical experts. This study effectively addresses these challenges by introducing a novel method for breast cancer classification, the Dual-Activated Lightweight Attention ResNet50 (DALAResNet50) model. It integrates a pre-trained ResNet50 model with a lightweight attention mechanism, embedding an attention module in the fourth layer of ResNet50 and incorporating two fully connected layers with LeakyReLU and ReLU activation functions to enhance feature learning capabilities. The DALAResNet50 method was tested on breast cancer histopathology images from the BreakHis Database across magnification factors of 40X, 100X, 200X, and 400X, achieving accuracies of 98.5%, 98.7%, 97.9%, and 94.3%, respectively. It was also compared with established deep learning models such as SEResNet50, DenseNet121, VGG16, VGG16Inception, ViT, Swin-Transformer, Dinov2_Vitb14, and ResNet50. The reported results show that DALAResNet50 outperforms the compared approaches in accuracy, F1 score, IBA, and GMean, demonstrating significant robustness and broad applicability when dealing with different magnifications and imbalanced breast cancer datasets.
+
+
+ comment: 13 pages, 7 figures, 7 tables
+
+
+
+
+ + ♻ ☆ PatchCURE: Improving Certifiable Robustness, Model Utility, and + Computation Efficiency of Adversarial Patch Defenses USENIX Security 2024 + + +
+ State-of-the-art defenses against adversarial patch attacks can now achieve +strong certifiable robustness with a marginal drop in model utility. However, +this impressive performance typically comes at the cost of 10-100x more +inference-time computation compared to undefended models -- the research +community has witnessed an intense three-way trade-off between certifiable +robustness, model utility, and computation efficiency. In this paper, we +propose a defense framework named PatchCURE to approach this trade-off problem. +PatchCURE provides sufficient "knobs" for tuning defense performance and allows +us to build a family of defenses: the most robust PatchCURE instance can match +the performance of any existing state-of-the-art defense (without efficiency +considerations); the most efficient PatchCURE instance has similar inference +efficiency as undefended models. Notably, PatchCURE achieves state-of-the-art +robustness and utility performance across all different efficiency levels, +e.g., 16-23% absolute clean accuracy and certified robust accuracy advantages +over prior defenses when requiring computation efficiency to be close to +undefended models. The family of PatchCURE defenses enables us to flexibly +choose appropriate defenses to satisfy given computation and/or utility +constraints in practice. + +
+
+ comment: USENIX Security 2024 (extended technical report)
+
+
+
+
+ + ♻ ☆ Deep Multi-Threshold Spiking-UNet for Image Processing + + +
+ U-Net, known for its simple yet efficient architecture, is widely utilized for image processing tasks and is particularly suitable for deployment on neuromorphic chips. This paper introduces the novel concept of Spiking-UNet for image processing, which combines the power of Spiking Neural Networks (SNNs) with the U-Net architecture. To achieve an efficient Spiking-UNet, we face two primary challenges: ensuring high-fidelity information propagation through the network via spikes and formulating an effective training strategy. To address the issue of information loss, we introduce multi-threshold spiking neurons, which improve the efficiency of information transmission within the Spiking-UNet. For the training strategy, we adopt a conversion and fine-tuning pipeline that leverages pre-trained U-Net models. During the conversion process, we observe significant variability in the data distribution across different parts of the network when skip connections are used. Therefore, we propose a connection-wise normalization method to prevent inaccurate firing rates. Furthermore, we adopt a flow-based training method to fine-tune the converted models, reducing time steps while preserving performance. Experimental results show that, on image segmentation and denoising, our Spiking-UNet achieves comparable performance to its non-spiking counterpart, surpassing existing SNN methods. Compared with the converted Spiking-UNet without fine-tuning, our Spiking-UNet reduces inference time by approximately 90%. This research broadens the application scope of SNNs in image processing and is expected to inspire further exploration in the field of neuromorphic engineering. The code for our Spiking-UNet implementation is available at https://github.com/SNNresearch/Spiking-UNet.
+
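+ A toy illustration of the multi-threshold spiking idea, assuming graded spike amplitudes per threshold level; the actual Spiking-UNet neuron model, thresholds, and training are in the linked repository, and the values below are arbitrary.
+
+```python
+# Toy multi-threshold integrate-and-fire step: a neuron emits a spike whose
+# amplitude equals the highest threshold its membrane potential crosses, so each
+# time step carries more information than a binary spike.
+import torch
+
+def multi_threshold_if_step(v_mem: torch.Tensor, inp: torch.Tensor,
+                            thresholds=(0.5, 1.0, 1.5)):
+    v_mem = v_mem + inp                                  # integrate input current
+    ths = torch.tensor(thresholds)
+    crossed = (v_mem.unsqueeze(-1) >= ths).sum(dim=-1)   # how many thresholds crossed
+    spike = torch.where(crossed > 0,
+                        ths[(crossed - 1).clamp(min=0)],
+                        torch.zeros_like(v_mem))
+    v_mem = v_mem - spike                                # soft reset by emitted amplitude
+    return spike, v_mem
+
+spikes, v = multi_threshold_if_step(torch.zeros(4), torch.tensor([0.2, 0.7, 1.2, 2.0]))
+print(spikes, v)
+```
+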
+
+ comment: Accepted in NeuroComputing +
+
+
+
+
+ + ♻ ☆ Diverse Representation Embedding for Lifelong Person Re-Identification + + +
+ Lifelong Person Re-Identification (LReID) aims to continuously learn from successive data streams, matching individuals across multiple cameras. The key challenge for LReID is how to effectively preserve old knowledge while incrementally learning new information, a difficulty caused by task-level domain gaps and limited old-task data. Existing methods based on CNN backbones are insufficient to explore the representation of each instance from different perspectives, limiting performance on both the limited old-task data and new tasks. Unlike these methods, we propose a Diverse Representations Embedding (DRE) framework that is the first to explore a pure transformer for LReID. The proposed DRE preserves old knowledge while adapting to new information at both the instance level and the task level. Concretely, an Adaptive Constraint Module (ACM) is proposed to perform integration and push-away operations between multiple overlapping representations generated by the transformer backbone, obtaining rich and discriminative representations for each instance and improving the adaptation ability of the LReID model. Based on these diverse representations, we propose Knowledge Update (KU) and Knowledge Preservation (KP) strategies at the task level by introducing an adjustment model and a learner model. The KU strategy enhances the learner model's ability to adapt to new information under the prior of the adjustment model, while the KP strategy preserves old knowledge through representation-level alignment and logit-level supervision on the limited old-task data, while maintaining the model's capacity to learn new information. Compared to state-of-the-art methods, our method achieves significantly improved performance on holistic, large-scale, and occluded datasets.
+
+
+ comment: 11 pages, 7 tables, 3 figures
+
+
+
+
+ + ♻ ☆ SVGDreamer: Text Guided SVG Generation with Diffusion Model CVPR 2024 + + +
+ Recently, text-guided scalable vector graphics (SVGs) synthesis has shown +promise in domains such as iconography and sketch. However, existing +text-to-SVG generation methods lack editability and struggle with visual +quality and result diversity. To address these limitations, we propose a novel +text-guided vector graphics synthesis method called SVGDreamer. SVGDreamer +incorporates a semantic-driven image vectorization (SIVE) process that enables +the decomposition of synthesis into foreground objects and background, thereby +enhancing editability. Specifically, the SIVE process introduces +attention-based primitive control and an attention-mask loss function for +effective control and manipulation of individual elements. Additionally, we +propose a Vectorized Particle-based Score Distillation (VPSD) approach to +address issues of shape over-smoothing, color over-saturation, limited +diversity, and slow convergence of the existing text-to-SVG generation methods +by modeling SVGs as distributions of control points and colors. Furthermore, +VPSD leverages a reward model to re-weight vector particles, which improves +aesthetic appeal and accelerates convergence. Extensive experiments are +conducted to validate the effectiveness of SVGDreamer, demonstrating its +superiority over baseline methods in terms of editability, visual quality, and +diversity. Project page: +\href{https://ximinng.github.io/SVGDreamer-project/}{https://ximinng.github.io/SVGDreamer-project/} + +
+
+ comment: Accepted by CVPR 2024. project link: + https://ximinng.github.io/SVGDreamer-project/ +
+
+
+
+
+ + ♻ ☆ Neural Implicit Representations for Physical Parameter Inference from a + Single Video WACV + + +
+ Neural networks have recently been used to analyze diverse physical systems and to identify the underlying dynamics. While existing methods achieve impressive results, they are limited by their strong demand for training data and their weak generalization abilities to out-of-distribution data. To overcome these limitations, in this work we propose to combine neural implicit representations for appearance modeling with neural ordinary differential equations (ODEs) for modeling physical phenomena, obtaining a dynamic scene representation that can be identified directly from visual observations. Our proposed model combines several unique advantages: (i) Contrary to existing approaches that require large training datasets, we are able to identify physical parameters from only a single video. (ii) The use of neural implicit representations enables the processing of high-resolution videos and the synthesis of photo-realistic images. (iii) The embedded neural ODE has a known parametric form that allows for the identification of interpretable physical parameters. (iv) It enables long-term prediction in state space. (v) Furthermore, it makes photo-realistic rendering of novel scenes with modified physical parameters possible.
+
+
+ comment: Published in IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) 2023 +
+
+
+
+
+ + ♻ ☆ FusionINN: Invertible Image Fusion for Brain Tumor Monitoring + + +
+ Image fusion typically employs non-invertible neural networks to merge +multiple source images into a single fused image. However, for clinical +experts, solely relying on fused images may be insufficient for making +diagnostic decisions, as the fusion mechanism blends features from source +images, thereby making it difficult to interpret the underlying tumor +pathology. We introduce FusionINN, a novel invertible image fusion framework, +capable of efficiently generating fused images and also decomposing them back +to the source images by solving the inverse of the fusion process. FusionINN +guarantees lossless one-to-one pixel mapping by integrating a normally +distributed latent image alongside the fused image to facilitate the generative +modeling of the decomposition process. To the best of our knowledge, we are the +first to investigate the decomposability of fused images, which is particularly +crucial for life-sensitive applications such as medical image fusion compared +to other tasks like multi-focus or multi-exposure image fusion. Our extensive +experimentation validates FusionINN over existing discriminative and generative +fusion methods, both subjectively and objectively. Moreover, compared to a +recent denoising diffusion-based fusion model, our approach offers faster and +qualitatively better fusion results. We also exhibit the clinical utility of +our results in aiding disease prognosis. + +
+
+ comment: Source code available at https://github.com/nish03/FusionINN +
+
+
+
+
+ + ♻ ☆ DRCT: Saving Image Super-resolution away from Information Bottleneck + + +
+ In recent years, Vision Transformer-based applications to low-level vision tasks have achieved widespread success. Unlike CNN-based models, Transformers are more adept at capturing long-range dependencies, enabling the reconstruction of images utilizing information from non-local areas. In the domain of super-resolution, Swin-transformer-based approaches have become mainstream due to their capacity to capture global spatial information and their shifting-window attention mechanism that facilitates the interchange of information between different windows. Many researchers have enhanced image quality and network efficiency by expanding the receptive field or designing complex networks, yielding commendable results. However, we observe that spatial information tends to diminish during forward propagation as network depth increases, which consequently limits the model's potential. To address this, we propose the Dense-residual-connected Transformer (DRCT), aimed at mitigating the loss of spatial information through dense-residual connections between layers, thereby unleashing the model's potential and enhancing performance. Experimental results indicate that our approach is not only straightforward but also achieves remarkable efficiency, surpassing state-of-the-art methods and performing commendably at NTIRE2024.
+
+
+ comment: Submitted to NTIRE 2024 +
+
+
+
+
+ + ♻ ☆ Direct Preference Optimization of Video Large Multimodal Models from + Language Model Reward + + +
+ Preference modeling techniques, such as direct preference optimization (DPO), have proven effective in enhancing the generalization abilities of large language models (LLMs). However, in tasks involving video instruction-following, providing informative feedback, especially for detecting hallucinations in generated responses, remains a significant challenge. Previous studies have explored using large multimodal models (LMMs) as reward models to guide preference modeling, but their ability to accurately assess the factuality of generated responses compared to corresponding videos has not been conclusively established. This paper introduces a novel framework that utilizes detailed video captions as a proxy for video content, enabling language models to incorporate this information as supporting evidence for scoring video Question Answering (QA) predictions. Our approach demonstrates robust alignment with the OpenAI GPT-4V model's reward mechanism, which directly takes video frames as input. Furthermore, we show that applying this tailored reward through DPO significantly improves the performance of video LMMs on video QA tasks.
+
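+ For reference, the standard DPO objective this line of work builds on can be written compactly; the sketch below assumes per-response log-probabilities are already summed over tokens and is a generic illustration, not the paper's training code.
+
+```python
+# Generic DPO loss: push the policy to prefer the "chosen" response over the
+# "rejected" one relative to a frozen reference model.
+import torch
+import torch.nn.functional as F
+
+def dpo_loss(policy_chosen_logp, policy_rejected_logp,
+             ref_chosen_logp, ref_rejected_logp, beta: float = 0.1):
+    """All inputs: (batch,) summed log-probs of the full responses."""
+    chosen_ratio = policy_chosen_logp - ref_chosen_logp
+    rejected_ratio = policy_rejected_logp - ref_rejected_logp
+    return -F.logsigmoid(beta * (chosen_ratio - rejected_ratio)).mean()
+
+loss = dpo_loss(torch.tensor([-5.0]), torch.tensor([-9.0]),
+                torch.tensor([-6.0]), torch.tensor([-8.0]))
+print(loss.item())
+```
+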
+
+
+
+
+ + ♻ ☆ CMRNext: Camera to LiDAR Matching in the Wild for Localization and + Extrinsic Calibration + + +
+ LiDARs are widely used for mapping and localization in dynamic environments. However, their high cost limits their widespread adoption. On the other hand, monocular localization in LiDAR maps using inexpensive cameras is a cost-effective alternative for large-scale deployment. Nevertheless, most existing approaches struggle to generalize to new sensor setups and environments, requiring retraining or fine-tuning. In this paper, we present CMRNext, a novel approach for camera-LiDAR matching that is independent of sensor-specific parameters, generalizable, and can be used in the wild for monocular localization in LiDAR maps and camera-LiDAR extrinsic calibration. CMRNext exploits recent advances in deep neural networks for matching cross-modal data and standard geometric techniques for robust pose estimation. We reformulate the point-pixel matching problem as an optical flow estimation problem and solve the Perspective-n-Point problem based on the resulting correspondences to find the relative pose between the camera and the LiDAR point cloud. We extensively evaluate CMRNext on six different robotic platforms, including three publicly available datasets and three in-house robots. Our experimental evaluations demonstrate that CMRNext outperforms existing approaches on both tasks and effectively generalizes to previously unseen environments and sensor setups in a zero-shot manner. We make the code and pre-trained models publicly available at http://cmrnext.cs.uni-freiburg.de .
+
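+ The final pose-recovery step described above, dense point-to-pixel correspondences fed to a robust Perspective-n-Point solver, can be reproduced with standard tooling; the sketch below uses OpenCV's RANSAC PnP with random stand-in correspondences and illustrative intrinsics, and is not the CMRNext pipeline itself.
+
+```python
+# Given predicted 2D pixel locations for known 3D LiDAR points (e.g. from an
+# optical-flow-style matching network), recover the camera pose with RANSAC PnP.
+import numpy as np
+import cv2
+
+pts_3d = np.random.uniform(-10, 10, (200, 3)).astype(np.float64)   # LiDAR points (map frame)
+pts_2d = np.random.uniform(0, 1000, (200, 2)).astype(np.float64)   # matched pixels (stand-ins)
+K = np.array([[700.0, 0, 640], [0, 700.0, 360], [0, 0, 1]])         # assumed pinhole intrinsics
+
+ok, rvec, tvec, inliers = cv2.solvePnPRansac(
+    pts_3d, pts_2d, K, distCoeffs=None,
+    reprojectionError=3.0, iterationsCount=500, flags=cv2.SOLVEPNP_ITERATIVE)
+if ok:
+    R, _ = cv2.Rodrigues(rvec)   # rotation of the map frame into the camera frame
+    print("inliers:", 0 if inliers is None else len(inliers), "t:", tvec.ravel())
+```
+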
+
+
+
+
+ + ♻ ☆ HOI-M3:Capture Multiple Humans and Objects Interaction within Contextual + Environment CVPR 2024 + + +
+ Humans naturally interact with both others and the surrounding multiple +objects, engaging in various social activities. However, recent advances in +modeling human-object interactions mostly focus on perceiving isolated +individuals and objects, due to fundamental data scarcity. In this paper, we +introduce HOI-M3, a novel large-scale dataset for modeling the interactions of +Multiple huMans and Multiple objects. Notably, it provides accurate 3D tracking +for both humans and objects from dense RGB and object-mounted IMU inputs, +covering 199 sequences and 181M frames of diverse humans and objects under rich +activities. With the unique HOI-M3 dataset, we introduce two novel data-driven +tasks with companion strong baselines: monocular capture and unstructured +generation of multiple human-object interactions. Extensive experiments +demonstrate that our dataset is challenging and worthy of further research +about multiple human-object interactions and behavior analysis. Our HOI-M3 +dataset, corresponding codes, and pre-trained models will be disseminated to +the community for future research. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+
+ ♻ ☆ Few-shot point cloud reconstruction and denoising via learned Gaussian
+ splats renderings and fine-tuned diffusion features
+
+
+ Existing deep learning methods for the reconstruction and denoising of point clouds rely on small datasets of 3D shapes. We circumvent this problem by leveraging deep learning methods trained on billions of images. We propose a method to reconstruct point clouds from a few images and to denoise point clouds from their renderings by exploiting prior knowledge distilled from image-based deep learning models. To improve reconstruction in constrained settings, we regularize the training of a differentiable renderer with hybrid surface and appearance representations by introducing semantic consistency supervision. In addition, we propose a pipeline to fine-tune Stable Diffusion to denoise renderings of noisy point clouds, and we demonstrate how these learned filters can be used to remove point cloud noise without 3D supervision. We compare our method with DSS and PointRadiance and achieve higher-quality 3D reconstruction on the Sketchfab Testset and SCUT Dataset.
+
+
+
+
+
+ + ♻ ☆ Recursive Joint Cross-Modal Attention for Multimodal Fusion in + Dimensional Emotion Recognition + + +
+ Though multimodal emotion recognition has achieved significant progress over recent years, the potential of rich synergic relationships across the modalities is not fully exploited. In this paper, we introduce Recursive Joint Cross-Modal Attention (RJCMA) to effectively capture both intra- and inter-modal relationships across audio, visual and text modalities for dimensional emotion recognition. In particular, we compute the attention weights based on cross-correlation between the joint audio-visual-text feature representations and the feature representations of individual modalities to simultaneously capture intra- and inter-modal relationships across the modalities. The attended features of the individual modalities are again fed as input to the fusion model in a recursive mechanism to obtain more refined feature representations. We have also explored Temporal Convolutional Networks (TCNs) to improve the temporal modeling of the feature representations of individual modalities. Extensive experiments are conducted to evaluate the performance of the proposed fusion model on the challenging Affwild2 dataset. By effectively capturing the synergic intra- and inter-modal relationships across audio, visual and text modalities, the proposed fusion model achieves a Concordance Correlation Coefficient (CCC) of 0.585 (0.542) and 0.659 (0.619) for valence and arousal respectively on the validation set (test set). This shows a significant improvement over the baseline of 0.24 (0.211) and 0.20 (0.191) for valence and arousal respectively on the validation set (test set) of the valence-arousal challenge of the 6th Affective Behavior Analysis in-the-Wild (ABAW) competition.
+
+
+
+
+
+ + ♻ ☆ Learning to Generate Conditional Tri-plane for 3D-aware Expression + Controllable Portrait Animation + + +
+ In this paper, we present Export3D, a one-shot 3D-aware portrait animation method that is able to control the facial expression and camera view of a given portrait image. To achieve this, we introduce a tri-plane generator that directly generates a tri-plane 3D prior by transferring the expression parameters of a 3DMM onto the source image. The tri-plane is then decoded into images from different views through differentiable volume rendering. Existing portrait animation methods rely heavily on image warping to transfer expressions in the motion space, which makes it challenging to disentangle appearance and expression. In contrast, we propose a contrastive pre-training framework for appearance-free expression parameters, eliminating undesirable appearance swaps when transferring expressions across identities. Extensive experiments show that our pre-training framework can learn the appearance-free expression representation hidden in 3DMM, and that our model can generate 3D-aware, expression-controllable portrait images without appearance swaps in a cross-identity manner.
+
+
+ comment: Project page: https://export3d.github.io +
+
+
+
+
+ + ♻ ☆ Corrupting Convolution-based Unlearnable Datasets with Pixel-based Image + Transformations + + +
+ Unlearnable datasets (UDs) lead to a drastic drop in the generalization performance of models trained on them by introducing elaborate and imperceptible perturbations into clean training sets. Many existing defenses, e.g., JPEG compression and adversarial training, effectively counter UDs based on norm-constrained additive noise. However, a new type of convolution-based UD has recently been proposed that renders all existing defenses ineffective, presenting a greater challenge to defenders. To address this, we express a convolution-based unlearnable sample as the result of multiplying a matrix by a clean sample in a simplified scenario, and formalize the intra-class matrix inconsistency as $\Theta_{imi}$ and the inter-class matrix consistency as $\Theta_{imc}$ to investigate the working mechanism of convolution-based UDs. We conjecture that increasing both of these metrics will mitigate the unlearnability effect. Through validation experiments that support this hypothesis, we further design a random matrix to boost both $\Theta_{imi}$ and $\Theta_{imc}$, achieving a notable defense effect. Building upon and extending these findings, we propose a new image COrruption that employs a random multiplicative transformation via an INterpolation operation to successfully defend against convolution-based UDs. Our approach leverages global random pixel interpolation, effectively suppressing the impact of multiplicative noise in convolution-based UDs. Additionally, we have designed two new forms of convolution-based UDs and find that our defense is the most effective against them.
+
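+ A hedged sketch of the kind of pixel-level random interpolation such a defense relies on; the exact corruption operator used in the paper is not reproduced here, and the blending scheme below is only illustrative.
+
+```python
+# Illustrative defense-style corruption: blend every pixel with a randomly
+# shifted neighbor using per-pixel random weights, injecting multiplicative
+# randomness that disrupts convolution-based perturbations.
+import numpy as np
+
+def random_interpolation(img: np.ndarray, max_shift: int = 2, rng=None) -> np.ndarray:
+    """img: (H, W, C) float array in [0, 1]."""
+    rng = np.random.default_rng() if rng is None else rng
+    dy, dx = rng.integers(-max_shift, max_shift + 1, size=2)
+    shifted = np.roll(img, shift=(dy, dx), axis=(0, 1))
+    w = rng.uniform(0.0, 1.0, size=img.shape[:2] + (1,))   # per-pixel mixing weight
+    return w * img + (1.0 - w) * shifted
+
+img = np.random.rand(32, 32, 3)
+print(random_interpolation(img).shape)
+```
+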
+
+
+
+
+ + ♻ ☆ VidEdit: Zero-Shot and Spatially Aware Text-Driven Video Editing + + +
+ Recently, diffusion-based generative models have achieved remarkable success for image generation and editing. However, existing diffusion-based video editing approaches lack the ability to offer precise control over generated content while maintaining temporal consistency in long-term videos. On the other hand, atlas-based methods provide strong temporal consistency but are costly for editing a video and lack spatial control. In this work, we introduce VidEdit, a novel method for zero-shot text-based video editing that guarantees robust temporal and spatial consistency. In particular, we combine an atlas-based video representation with a pre-trained text-to-image diffusion model to provide a training-free and efficient video editing method, which ensures temporal smoothness by design. To grant precise user control over generated content, we utilize conditional information extracted from off-the-shelf panoptic segmenters and edge detectors, which guides the diffusion sampling process. This method ensures fine spatial control over targeted regions while strictly preserving the structure of the original video. Our quantitative and qualitative experiments show that VidEdit outperforms state-of-the-art methods on the DAVIS dataset regarding semantic faithfulness, image preservation, and temporal consistency metrics. With this framework, processing a single video takes only approximately one minute, and it can generate multiple compatible edits based on a unique text prompt. Project web-page at https://videdit.github.io
+
+
+ comment: TMLR 2024. Project web-page at https://videdit.github.io +
+
+
+
+
+ + ♻ ☆ BRAIxDet: Learning to Detect Malignant Breast Lesion with Incomplete + Annotations + + +
+ Methods to detect malignant lesions from screening mammograms are usually +trained with fully annotated datasets, where images are labelled with the +localisation and classification of cancerous lesions. However, real-world +screening mammogram datasets commonly have a subset that is fully annotated and +another subset that is weakly annotated with just the global classification +(i.e., without lesion localisation). Given the large size of such datasets, +researchers usually face a dilemma with the weakly annotated subset: to not use +it or to fully annotate it. The first option will reduce detection accuracy +because it does not use the whole dataset, and the second option is too +expensive given that the annotation needs to be done by expert radiologists. In +this paper, we propose a middle-ground solution for the dilemma, which is to +formulate the training as a weakly- and semi-supervised learning problem that +we refer to as malignant breast lesion detection with incomplete annotations. +To address this problem, our new method comprises two stages, namely: 1) +pre-training a multi-view mammogram classifier with weak supervision from the +whole dataset, and 2) extending the trained classifier to become a multi-view +detector that is trained with semi-supervised student-teacher learning, where +the training set contains fully and weakly-annotated mammograms. We provide +extensive detection results on two real-world screening mammogram datasets +containing incomplete annotations, and show that our proposed approach achieves +state-of-the-art results in the detection of malignant breast lesions with +incomplete annotations. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ LLM meets Vision-Language Models for Zero-Shot One-Class Classification + + +
+ We consider the problem of zero-shot one-class visual classification. In this setting, only the label of the target class is available, and the goal is to discriminate between positive and negative query samples without requiring any validation example from the target task. We propose a two-step solution that first queries large language models for visually confusing objects and then relies on vision-language pre-trained models (e.g., CLIP) to perform classification. By adapting large-scale vision benchmarks, we demonstrate the ability of the proposed method to outperform adapted off-the-shelf alternatives in this setting. Namely, we propose a realistic benchmark where negative query samples are drawn from the same original dataset as positive ones, including a granularity-controlled version of iNaturalist, where negative samples are at a fixed distance in the taxonomy tree from the positive ones. Our work shows that it is possible to discriminate between a single category and other semantically related ones using only its label.
+
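+ A minimal sketch of the two-step recipe described above, with both the LLM call and the vision-language encoders replaced by stand-ins (in practice the confuser names would come from an LLM and the features from a model such as CLIP): a query is accepted as the target class only if its similarity to the target label beats its similarity to every confuser.
+
+```python
+# Hedged sketch: score a query image against the target label and a set of
+# LLM-suggested visually confusing labels; accept if the target wins.
+import numpy as np
+
+def l2(x):
+    return x / np.linalg.norm(x, axis=-1, keepdims=True)
+
+def one_class_decision(img_feat: np.ndarray, target_feat: np.ndarray,
+                       confuser_feats: np.ndarray) -> bool:
+    """All features assumed L2-normalized; confuser_feats: (K, D)."""
+    target_sim = float(img_feat @ target_feat)
+    confuser_sims = confuser_feats @ img_feat
+    return target_sim > confuser_sims.max()
+
+# Stand-ins for embeddings of the query image, the target label, and confusers
+# (e.g. hypothetically "labrador", "cocker spaniel" for target "golden retriever").
+img = l2(np.random.randn(512))
+target = l2(np.random.randn(512))
+confusers = l2(np.random.randn(3, 512))
+print(one_class_decision(img, target, confusers))
+```
+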
+
+
+
+
+ + ♻ ☆ Decoupled Diffusion Models: Simultaneous Image to Zero and Zero to Noise + + +
+ We propose decoupled diffusion models (DDMs) for high-quality (un)conditioned image generation in less than 10 function evaluations. In a nutshell, DDMs decouple the forward image-to-noise mapping into an \textit{image-to-zero} mapping and a \textit{zero-to-noise} mapping. Under this framework, we mathematically derive 1) the training objectives and 2) the reverse-time sampling formula, based on an analytic transition probability that models the image-to-zero transition. The former enables DDMs to learn noise and image components simultaneously, which simplifies learning. Importantly, because the \textit{zero-to-image} sampling function is analytic, DDMs can avoid ordinary-differential-equation-based accelerators and instead naturally perform sampling with an arbitrary step size. Under few-function-evaluation setups, DDMs experimentally yield very competitive performance compared with the state of the art in 1) unconditioned image generation, \textit{e.g.}, on CIFAR-10 and CelebA-HQ-256, and 2) image-conditioned downstream tasks such as super-resolution, saliency detection, edge detection, and image inpainting.
+
+
+
+
+
+ + ♻ ☆ GenHowTo: Learning to Generate Actions and State Transformations from + Instructional Videos CVPR 2024 + + +
+ We address the task of generating temporally consistent and physically +plausible images of actions and object state transformations. Given an input +image and a text prompt describing the targeted transformation, our generated +images preserve the environment and transform objects in the initial image. Our +contributions are threefold. First, we leverage a large body of instructional +videos and automatically mine a dataset of triplets of consecutive frames +corresponding to initial object states, actions, and resulting object +transformations. Second, equipped with this data, we develop and train a +conditioned diffusion model dubbed GenHowTo. Third, we evaluate GenHowTo on a +variety of objects and actions and show superior performance compared to +existing methods. In particular, we introduce a quantitative evaluation where +GenHowTo achieves 88% and 74% on seen and unseen interaction categories, +respectively, outperforming prior work by a large margin. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ PEGASUS: Personalized Generative 3D Avatars with Composable Attributes CVPR 2024 + + +
+ We present PEGASUS, a method for constructing a personalized generative 3D +face avatar from monocular video sources. Our generative 3D avatar enables +disentangled controls to selectively alter the facial attributes (e.g., hair or +nose) while preserving the identity. Our approach consists of two stages: +synthetic database generation and constructing a personalized generative +avatar. We generate a synthetic video collection of the target identity with +varying facial attributes, where the videos are synthesized by borrowing the +attributes from monocular videos of diverse identities. Then, we build a +person-specific generative 3D avatar that can modify its attributes +continuously while preserving its identity. Through extensive experiments, we +demonstrate that our method of generating a synthetic database and creating a +3D generative avatar is the most effective in preserving identity while +achieving high realism. Subsequently, we introduce a zero-shot approach to +achieve the same goal of generative modeling more efficiently by leveraging a +previously constructed personalized generative model. + +
+
+ comment: Accepted at CVPR 2024, Project Page: + https://snuvclab.github.io/pegasus/ +
+
+
+
+
+ + ♻ ☆ Distilling Semantic Priors from SAM to Efficient Image Restoration + Models + + +
+ In image restoration (IR), leveraging semantic priors from segmentation models has been a common approach to improve performance. The recent segment anything model (SAM) has emerged as a powerful tool for extracting advanced semantic priors to enhance IR tasks. However, the computational cost of SAM is prohibitive for IR compared to existing smaller IR models, and incorporating SAM to extract semantic priors considerably hampers inference efficiency. To address this issue, we propose a general framework to distill SAM's semantic knowledge to boost existing IR models without interfering with their inference process. Specifically, our proposed framework consists of a semantic priors fusion (SPF) scheme and a semantic priors distillation (SPD) scheme. SPF fuses two kinds of information, the restored image predicted by the original IR model and the semantic mask predicted by SAM, to produce a refined restored image. SPD then uses self-distillation to transfer the fused semantic priors and boost the performance of the original IR models. Additionally, we design a semantic-guided relation (SGR) module for SPD, which enforces consistency of the semantic feature representation space to fully distill the priors. We demonstrate the effectiveness of our framework across multiple IR models and tasks, including deraining, deblurring, and denoising.
+
+
+
+
+
+ + ♻ ☆ Rethinking Saliency-Guided Weakly-Supervised Semantic Segmentation + + +
+ This paper presents a fresh perspective on the role of saliency maps in +weakly-supervised semantic segmentation (WSSS) and offers new insights and +research directions based on our empirical findings. We conduct comprehensive +experiments and observe that the quality of the saliency map is a critical +factor in saliency-guided WSSS approaches. Nonetheless, we find that the +saliency maps used in previous works are often arbitrarily chosen, despite +their significant impact on WSSS. Additionally, we observe that the choice of +the threshold, which has received less attention before, is non-trivial in +WSSS. To facilitate more meaningful and rigorous research for saliency-guided +WSSS, we introduce \texttt{WSSS-BED}, a standardized framework for conducting +research under unified conditions. \texttt{WSSS-BED} provides various saliency +maps and activation maps for seven WSSS methods, as well as saliency maps from +unsupervised salient object detection models. + +
+
+ comment: Preprint, 17 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ PKU-DyMVHumans: A Multi-View Video Benchmark for High-Fidelity Dynamic + Human Modeling CVPR2024 + + +
+ High-quality human reconstruction and photo-realistic rendering of a dynamic scene is a long-standing problem in computer vision and graphics. Despite considerable efforts invested in developing various capture systems and reconstruction algorithms, recent advancements still struggle with loose or oversized clothing and overly complex poses. In part, this is due to the challenges of acquiring high-quality human datasets. To facilitate the development of these fields, in this paper, we present PKU-DyMVHumans, a versatile human-centric dataset for high-fidelity reconstruction and rendering of dynamic human scenarios from dense multi-view videos. It comprises 8.2 million frames captured by more than 56 synchronized cameras across diverse scenarios. These sequences comprise 32 human subjects across 45 different scenarios, each with a highly detailed appearance and realistic human motion. Inspired by recent advancements in neural radiance field (NeRF)-based scene representations, we carefully set up an off-the-shelf framework that makes it easy to apply state-of-the-art NeRF-based implementations and benchmark them on the PKU-DyMVHumans dataset. This paves the way for various applications such as fine-grained foreground/background decomposition, high-quality human reconstruction, and photo-realistic novel view synthesis of dynamic scenes. Extensive studies are performed on the benchmark, demonstrating new observations and challenges that emerge from using such high-fidelity dynamic data.
+
+
+ comment: CVPR 2024 (accepted). Project page: https://pku-dymvhumans.github.io
+
+
+
+
+ + ♻ ☆ Real-time 3D-aware Portrait Editing from a Single Image + + +
+ This work presents 3DPE, a practical method that can efficiently edit a face +image following given prompts, like reference images or text descriptions, in a +3D-aware manner. To this end, a lightweight module is distilled from a 3D +portrait generator and a text-to-image model, which provide prior knowledge of +face geometry and superior editing capability, respectively. Such a design +brings two compelling advantages over existing approaches. First, our system +achieves real-time editing with a feedforward network (i.e., ~0.04s per image), +over 100x faster than the second competitor. Second, thanks to the powerful +priors, our module could focus on the learning of editing-related variations, +such that it manages to handle various types of editing simultaneously in the +training phase and further supports fast adaptation to user-specified +customized types of editing during inference (e.g., with ~5min fine-tuning per +style). The code, the model, and the interface will be made publicly available +to facilitate future research. + +
+
+
+
+
+ + ♻ ☆ High-throughput Visual Nano-drone to Nano-drone Relative Localization + using Onboard Fully Convolutional Networks ICRA 2024 + + +
+ Relative drone-to-drone localization is a fundamental building block for any swarm operation. We address this task in the context of miniaturized nano-drones, i.e., 10cm in diameter, which attract ever-growing interest due to novel use cases enabled by their reduced form factor. The price of their versatility is limited onboard resources, i.e., sensors, processing units, and memory, which restricts the complexity of the onboard algorithms. A traditional way to overcome these limitations is to use lightweight deep learning models directly deployed aboard nano-drones. This work tackles the challenging relative pose estimation between nano-drones using only a gray-scale low-resolution camera and an ultra-low-power System-on-Chip (SoC) hosted onboard. We present a vertically integrated system based on a novel vision-based fully convolutional neural network (FCNN), which runs at 39Hz within 101mW onboard a Crazyflie nano-drone extended with the GWT GAP8 SoC. We compare our FCNN against three State-of-the-Art (SoA) systems. Considering the best-performing SoA approach, our model results in an R-squared improvement from 32 to 47% on the horizontal image coordinate and from 18 to 55% on the vertical image coordinate, on a real-world dataset of 30k images. Finally, our in-field tests show a reduction of the average tracking error of 37% compared to a previous SoA work and an endurance performance up to the entire battery lifetime of 4 minutes.
+
+
+ comment: ICRA 2024, IEEE Conference +
+
+
+
+
+ + ♻ ☆ SegICL: A Universal In-context Learning Framework for Enhanced + Segmentation in Medical Imaging + + +
+ Enabling medical image segmentation models to adapt to new tasks in a training-free manner through in-context learning is an exciting advancement. Universal segmentation models aim to generalize across the diverse modalities of medical images, yet their effectiveness often diminishes when applied to out-of-distribution (OOD) data modalities and tasks, requiring intricate fine-tuning of the model for optimal performance. To address this challenge, we introduce SegICL, a novel approach leveraging In-Context Learning (ICL) for image segmentation. Unlike existing methods, SegICL has the capability to employ text-guided segmentation and conduct in-context learning with a small set of image-mask pairs, eliminating the need for training the model from scratch or fine-tuning for OOD tasks (including OOD modalities and datasets). Extensive experimental validation of SegICL demonstrates a positive correlation between the number of prompt samples and segmentation performance on OOD modalities and tasks. This indicates that SegICL effectively addresses new segmentation tasks based on contextual information. SegICL also exhibits segmentation performance comparable to mainstream models on OOD and in-distribution tasks. Our code will be released soon.
+
+
+
+
+
+ + ♻ ☆ The Solution for the CVPR 2023 1st foundation model challenge-Track2 + + +
+ In this paper, we propose a solution for cross-modal transportation retrieval. Due to the cross-domain problem of traffic images, we divide the problem into two sub-tasks, pedestrian retrieval and vehicle retrieval, through a simple strategy. For the pedestrian retrieval task, we use IRRA as the base model and specifically design an attribute classification task to mine the knowledge implied by attribute labels. More importantly, we use an Inclusion Relation Matching strategy so that image-text pairs with an inclusion relation have similar representations in the feature space. For the vehicle retrieval task, we use BLIP as the base model. Since aligning the color attributes of vehicles is challenging, we introduce attribute-based object detection techniques to add color patch blocks to vehicle images for color data augmentation. This serves as strong prior information, helping the model perform image-text alignment. At the same time, we incorporate labeled attributes into the image-text alignment loss to learn fine-grained alignment and prevent similar images and texts from being incorrectly separated. Our approach ranked first in the final B-board test with a score of 70.9.
+
+
+
+
+
+ + ♻ ☆ Vision-Language Models in Remote Sensing: Current Progress and Future + Trends + + +
+ The remarkable achievements of ChatGPT and GPT-4 have sparked a wave of +interest and research in the field of large language models for Artificial +General Intelligence (AGI). These models provide intelligent solutions close to +human thinking, enabling us to use general artificial intelligence to solve +problems in various applications. However, in remote sensing (RS), the +scientific literature on the implementation of AGI remains relatively scant. +Existing AI-related research in remote sensing primarily focuses on visual +understanding tasks while neglecting the semantic understanding of the objects +and their relationships. This is where vision-language models excel, as they +enable reasoning about images and their associated textual descriptions, +allowing for a deeper understanding of the underlying semantics. +Vision-language models can go beyond visual recognition of RS images, model +semantic relationships, and generate natural language descriptions of the +image. This makes them better suited for tasks requiring visual and textual +understanding, such as image captioning, and visual question answering. This +paper provides a comprehensive review of the research on vision-language models +in remote sensing, summarizing the latest progress, highlighting challenges, +and identifying potential research opportunities. + +
+
+ comment: Accepted by IEEE Geoscience and Remote Sensing Magazine +
+
+
+
+
+ + ♻ ☆ Steerers: A framework for rotation equivariant keypoint descriptors CVPR 2024 + + +
+ Image keypoint descriptions that are discriminative and matchable over large +changes in viewpoint are vital for 3D reconstruction. However, descriptions +output by learned descriptors are typically not robust to camera rotation. +While they can be made more robust by, e.g., data augmentation, this degrades +performance on upright images. Another approach is test-time augmentation, +which incurs a significant increase in runtime. Instead, we learn a linear +transform in description space that encodes rotations of the input image. We +call this linear transform a steerer since it allows us to transform the +descriptions as if the image was rotated. From representation theory, we know +all possible steerers for the rotation group. Steerers can be optimized (A) +given a fixed descriptor, (B) jointly with a descriptor or (C) we can optimize +a descriptor given a fixed steerer. We perform experiments in these three +settings and obtain state-of-the-art results on the rotation invariant image +matching benchmarks AIMS and Roto-360. We publish code and model weights at +https://github.com/georg-bn/rotation-steerers. + +
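+ The central object described above, a "steerer", is just a linear map acting on descriptor space; the sketch below shows one block-diagonal form such a map could take (2x2 rotation blocks, a common choice from representation theory) and how it is applied. The block structure and dimensions are illustrative, not necessarily those used in the paper.
+
+```python
+# Sketch: a steerer S is a fixed linear map such that S @ desc(img) approximates
+# desc(rotate(img, angle)), so rotations can be handled in description space.
+import numpy as np
+
+def rotation_steerer(dim: int, angle_rad: float) -> np.ndarray:
+    """Block-diagonal steerer made of 2x2 rotation blocks (dim must be even)."""
+    c, s = np.cos(angle_rad), np.sin(angle_rad)
+    block = np.array([[c, -s], [s, c]])
+    S = np.zeros((dim, dim))
+    for i in range(0, dim, 2):
+        S[i:i + 2, i:i + 2] = block
+    return S
+
+desc = np.random.randn(128)                  # descriptor of the upright image
+S = rotation_steerer(128, np.pi / 2)         # steer by 90 degrees
+steered = S @ desc                           # ~ descriptor of the rotated image
+print(np.allclose(np.linalg.norm(steered), np.linalg.norm(desc)))  # norm preserved
+```
+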
+
+ comment: CVPR 2024 Camera ready +
+
+
+
+
+ + ♻ ☆ All in an Aggregated Image for In-Image Learning + + +
+ This paper introduces a new in-context learning (ICL) mechanism called +In-Image Learning (I$^2$L) that combines demonstration examples, visual cues, +and chain-of-thought reasoning into an aggregated image to enhance the +capabilities of Large Multimodal Models (e.g., GPT-4V) in multimodal reasoning +tasks. Unlike previous approaches that rely on converting images to text or +incorporating visual input into language models, I$^2$L consolidates all +information into an aggregated image and leverages image processing, +understanding, and reasoning abilities. This has several advantages: it reduces +inaccurate textual descriptions of complex images, provides flexibility in +positioning demonstration examples, and avoids multiple input images and +lengthy prompts. We also introduce I$^2$L-Hybrid, a method that combines the +strengths of I$^2$L with other ICL methods. Specifically, it uses an automatic +strategy to select the most suitable method (I$^2$L or another certain ICL +method) for a specific task instance. We conduct extensive experiments to +assess the effectiveness of I$^2$L and I$^2$L-Hybrid on MathVista, which covers +a variety of complex multimodal reasoning tasks. Additionally, we investigate +the influence of image resolution, the number of demonstration examples in a +single image, and the positions of these demonstrations in the aggregated image +on the effectiveness of I$^2$L. Our code is publicly available at +https://github.com/AGI-Edgerunners/IIL. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ MM-Interleaved: Interleaved Image-Text Generative Modeling via + Multi-modal Feature Synchronizer + + +
+ Developing generative models for interleaved image-text data has both +research and practical value. It requires models to understand the interleaved +sequences and subsequently generate images and text. However, existing attempts +are limited by the issue that the fixed number of visual tokens cannot +efficiently capture image details, which is particularly problematic in the +multi-image scenarios. To address this, this paper presents MM-Interleaved, an +end-to-end generative model for interleaved image-text data. It introduces a +multi-scale and multi-image feature synchronizer module, allowing direct access +to fine-grained image features in the previous context during the generation +process. MM-Interleaved is end-to-end pre-trained on both paired and +interleaved image-text corpora. It is further enhanced through a supervised +fine-tuning phase, wherein the model improves its ability to follow complex +multi-modal instructions. Experiments demonstrate the versatility of +MM-Interleaved in recognizing visual details following multi-modal instructions +and generating consistent images following both textual and visual conditions. +Code and models are available at +\url{https://github.com/OpenGVLab/MM-Interleaved}. + +
+
+ comment: 20 pages, 9 figures, 17 tables +
+
+
+
+
+ + ♻ ☆ EpiDiff: Enhancing Multi-View Synthesis via Localized + Epipolar-Constrained Diffusion + + +
+ Generating multiview images from a single view facilitates the rapid +generation of a 3D mesh conditioned on a single image. Recent methods that +introduce 3D global representation into diffusion models have shown the +potential to generate consistent multiviews, but they have reduced generation +speed and face challenges in maintaining generalizability and quality. To +address this issue, we propose EpiDiff, a localized interactive multiview +diffusion model. At the core of the proposed approach is to insert a +lightweight epipolar attention block into the frozen diffusion model, +leveraging epipolar constraints to enable cross-view interaction among feature +maps of neighboring views. The newly initialized 3D modeling module preserves +the original feature distribution of the diffusion model, exhibiting +compatibility with a variety of base diffusion models. Experiments show that +EpiDiff generates 16 multiview images in just 12 seconds, and it surpasses +previous methods in quality evaluation metrics, including PSNR, SSIM and LPIPS. +Additionally, EpiDiff can generate a more diverse distribution of views, +improving the reconstruction quality from generated multiviews. Please see our +project page at https://huanngzh.github.io/EpiDiff/. + +
+
+ comment: Project page: https://huanngzh.github.io/EpiDiff/ +
+
+
+
+
+ + ♻ ☆ ADDP: Learning General Representations for Image Recognition and + Generation with Alternating Denoising Diffusion Process ICLR2024 + + +
+ Image recognition and generation have long been developed independently of +each other. With the recent trend towards general-purpose representation +learning, the development of general representations for both recognition and +generation tasks is also promoted. However, preliminary attempts mainly focus +on generation performance, but are still inferior on recognition tasks. These +methods are modeled in the vector-quantized (VQ) space, whereas leading +recognition methods use pixels as inputs. Our key insights are twofold: (1) +pixels as inputs are crucial for recognition tasks; (2) VQ tokens as +reconstruction targets are beneficial for generation tasks. These observations +motivate us to propose an Alternating Denoising Diffusion Process (ADDP) that +integrates these two spaces within a single representation learning framework. +In each denoising step, our method first decodes pixels from previous VQ +tokens, then generates new VQ tokens from the decoded pixels. The diffusion +process gradually masks out a portion of VQ tokens to construct the training +samples. The learned representations can be used to generate diverse +high-fidelity images and also demonstrate excellent transfer performance on +recognition tasks. Extensive experiments show that our method achieves +competitive performance on unconditional generation, ImageNet classification, +COCO detection, and ADE20k segmentation. Importantly, our method represents the +first successful development of general representations applicable to both +generation and dense recognition tasks. Code is released at +\url{https://github.com/ChangyaoTian/ADDP}. + +
+
+ comment: Accepted by ICLR2024 +
+
+
+
+
+ + ♻ ☆ SDGE: Stereo Guided Depth Estimation for 360$^\circ$ Camera Sets + + +
+ Depth estimation is a critical technology in autonomous driving, and +multi-camera systems are often used to achieve a 360$^\circ$ perception. These +360$^\circ$ camera sets often have limited or low-quality overlap regions, +making multi-view stereo methods infeasible for the entire image. +Alternatively, monocular methods may not produce consistent cross-view +predictions. To address these issues, we propose the Stereo Guided Depth +Estimation (SGDE) method, which enhances depth estimation of the full image by +explicitly utilizing multi-view stereo results on the overlap. We suggest +building virtual pinhole cameras to resolve the distortion problem of fisheye +cameras and unify the processing for the two types of 360$^\circ$ cameras. For +handling the varying noise on camera poses caused by unstable movement, the +approach employs a self-calibration method to obtain highly accurate relative +poses of the adjacent cameras with minor overlap. These enable the use of +robust stereo methods to obtain high-quality depth prior in the overlap region. +This prior serves not only as an additional input but also as pseudo-labels +that enhance the accuracy of depth estimation methods and improve cross-view +prediction consistency. The effectiveness of SGDE is evaluated on one fisheye +camera dataset, Synthetic Urban, and two pinhole camera datasets, DDAD and +nuScenes. Our experiments demonstrate that SGDE is effective for both +supervised and self-supervised depth estimation, and highlight the potential of +our method for advancing downstream autonomous driving technologies, such as 3D +object detection and occupancy prediction. + +
+
+
+
+
+ + ♻ ☆ HIPTrack: Visual Tracking with Historical Prompts CVPR2024 + + +
+ Trackers that follow the Siamese paradigm utilize similarity matching between template and search-region features for tracking. Many methods have been explored to enhance tracking performance by incorporating tracking history to better handle scenarios involving target appearance variations such as deformation and occlusion. However, the utilization of historical information in existing methods is insufficient and not comprehensive, and it typically requires repetitive training and introduces a large amount of computation. In this paper, we show that by providing a tracker that follows the Siamese paradigm with precise and updated historical information, a significant performance improvement can be achieved with completely unchanged parameters. Based on this, we propose a historical prompt network that uses refined historical foreground masks and historical visual features of the target to provide comprehensive and precise prompts for the tracker. We build a novel tracker called HIPTrack based on the historical prompt network, which achieves considerable performance improvements without the need to retrain the entire model. We conduct experiments on seven datasets and the results demonstrate that our method surpasses the current state-of-the-art trackers on LaSOT, LaSOText, GOT-10k and NfS. Furthermore, the historical prompt network can seamlessly integrate as a plug-and-play module into existing trackers, providing performance enhancements. The source code is available at https://github.com/WenRuiCai/HIPTrack.
+
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ 3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple + 3D Representations + + +
+ Imitation learning provides an efficient way to teach robots dexterous skills; however, learning complex skills robustly and generalizably usually requires large amounts of human demonstrations. To tackle this challenging problem, we present 3D Diffusion Policy (DP3), a novel visual imitation learning approach that incorporates the power of 3D visual representations into diffusion policies, a class of conditional action generative models. The core design of DP3 is the utilization of a compact 3D visual representation, extracted from sparse point clouds with an efficient point encoder. In our experiments involving 72 simulation tasks, DP3 successfully handles most tasks with just 10 demonstrations and surpasses baselines with a 24.2% relative improvement. In 4 real robot tasks, DP3 demonstrates precise control with a high success rate of 85%, given only 40 demonstrations of each task, and shows excellent generalization abilities in diverse aspects, including space, viewpoint, appearance, and instance. Interestingly, in real robot experiments, DP3 rarely violates safety requirements, in contrast to baseline methods which frequently do, necessitating human intervention. Our extensive evaluation highlights the critical importance of 3D representations in real-world robot learning. Videos, code, and data are available on https://3d-diffusion-policy.github.io .
+
+
+ comment: Videos, code, and data: https://3d-diffusion-policy.github.io +
+
+
+
+
+ + ♻ ☆ Saliency strikes back: How filtering out high frequencies improves + white-box explanations + + +
+ Attribution methods correspond to a class of explainability methods (XAI) +that aim to assess how individual inputs contribute to a model's +decision-making process. We have identified a significant limitation in one +type of attribution methods, known as "white-box" methods. Although highly +efficient, these methods rely on a gradient signal that is often contaminated +by high-frequency noise. To overcome this limitation, we introduce a new +approach called "FORGrad". This simple method effectively filters out noise +artifacts by using optimal cut-off frequencies tailored to the unique +characteristics of each model architecture. Our findings show that FORGrad +consistently enhances the performance of already existing white-box methods, +enabling them to compete effectively with more accurate yet computationally +demanding "black-box" methods. We anticipate that our research will foster +broader adoption of simpler and more efficient white-box methods for +explainability, offering a better balance between faithfulness and +computational efficiency. + +
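+ A hedged sketch of the low-pass filtering idea behind this kind of gradient cleanup: attenuate frequencies in a gradient-based saliency map above a cut-off. The cut-off value below is arbitrary; in the paper it is tuned per model architecture, and the exact filter is not reproduced here.
+
+```python
+# Filter out high-frequency noise from a gradient saliency map with an FFT
+# low-pass mask; cutoff_ratio would in practice be chosen per architecture.
+import numpy as np
+
+def lowpass_saliency(grad_map: np.ndarray, cutoff_ratio: float = 0.1) -> np.ndarray:
+    """grad_map: (H, W) saliency; keep only frequencies below cutoff_ratio * Nyquist."""
+    H, W = grad_map.shape
+    fy = np.fft.fftfreq(H)[:, None]
+    fx = np.fft.fftfreq(W)[None, :]
+    mask = np.sqrt(fy ** 2 + fx ** 2) <= cutoff_ratio * 0.5   # 0.5 = Nyquist (cycles/pixel)
+    filtered = np.fft.ifft2(np.fft.fft2(grad_map) * mask)
+    return np.real(filtered)
+
+sal = np.random.randn(224, 224)   # stand-in for a raw input-gradient saliency map
+print(lowpass_saliency(sal).shape)
+```
+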
+
+
+
+
+ + ♻ ☆ FRDiff : Feature Reuse for Universal Training-free Acceleration of + Diffusion Models + + +
+ The substantial computational costs of diffusion models, especially due to +the repeated denoising steps necessary for high-quality image generation, +present a major obstacle to their widespread adoption. While several studies +have attempted to address this issue by reducing the number of score function +evaluations (NFE) using advanced ODE solvers without fine-tuning, the decreased +number of denoising iterations misses the opportunity to update fine details, +resulting in noticeable quality degradation. In our work, we introduce an +advanced acceleration technique that leverages the temporal redundancy inherent +in diffusion models. Reusing feature maps with high temporal similarity opens +up a new opportunity to save computation resources without compromising output +quality. To realize the practical benefits of this intuition, we conduct an +extensive analysis and propose a novel method, FRDiff. FRDiff is designed to +harness the advantages of both reduced NFE and feature reuse, achieving a +Pareto frontier that balances fidelity and latency trade-offs in various +generative tasks. + +
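A toy sketch of the feature-reuse idea: split a denoiser into an expensive feature stage and a cheap output head, and refresh the expensive features only every few denoising steps, reusing the cached ones in between. The two-stage split, the toy update rule, and the fixed reuse schedule are hypothetical simplifications for illustration, not the FRDiff method itself.

```python
import torch
import torch.nn as nn

class TinyDenoiser(nn.Module):
    """Stand-in for a denoiser split into an expensive feature stage and a
    cheap output head (hypothetical structure, not the FRDiff architecture)."""
    def __init__(self, dim: int = 32):
        super().__init__()
        self.features = nn.Sequential(nn.Linear(dim + 1, 128), nn.ReLU(),
                                      nn.Linear(128, 128), nn.ReLU())
        self.head = nn.Linear(128, dim)

    def forward(self, x, t, cached_feats=None):
        if cached_feats is None:                      # expensive path
            inp = torch.cat([x, t.expand(x.size(0), 1)], dim=-1)
            cached_feats = self.features(inp)
        return self.head(cached_feats), cached_feats  # cheap path reuses feats

@torch.no_grad()
def denoise(model, x, steps: int = 50, reuse_every: int = 2):
    feats = None
    for i in range(steps):
        t = torch.tensor([(steps - i) / steps])
        if i % reuse_every == 0:                      # periodically refresh features
            feats = None
        eps, feats = model(x, t, cached_feats=feats)
        x = x - 0.1 * eps                             # toy update rule
    return x

if __name__ == "__main__":
    out = denoise(TinyDenoiser(), torch.randn(4, 32))
    print(out.shape)                                  # torch.Size([4, 32])
```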
+
+ comment: Work in progress. Project page : + https://jungwon-lee.github.io/Project_FRDiff/ +
+
+
+
+
+ + ♻ ☆ Morphable Diffusion: 3D-Consistent Diffusion for Single-image Avatar + Creation CVPR 2024 + + +
+ Recent advances in generative diffusion models have enabled the previously +unfeasible capability of generating 3D assets from a single input image or a +text prompt. In this work, we aim to enhance the quality and functionality of +these models for the task of creating controllable, photorealistic human +avatars. We achieve this by integrating a 3D morphable model into the +state-of-the-art multi-view-consistent diffusion approach. We demonstrate that +accurate conditioning of a generative pipeline on the articulated 3D model +enhances the baseline model performance on the task of novel view synthesis +from a single image. More importantly, this integration facilitates a seamless +and accurate incorporation of facial expression and body pose control into the +generation process. To the best of our knowledge, our proposed framework is the +first diffusion model to enable the creation of fully 3D-consistent, +animatable, and photorealistic human avatars from a single image of an unseen +subject; extensive quantitative and qualitative evaluations demonstrate the +advantages of our approach over existing state-of-the-art avatar creation +models on both novel view and novel expression synthesis tasks. The code for +our project is publicly available. + +
+
+ comment: [CVPR 2024] Project page: + https://xiyichen.github.io/morphablediffusion/ +
+
+
+
+
+ + ♻ ☆ PointMamba: A Simple State Space Model for Point Cloud Analysis + + +
+ Transformers have become one of the foundational architectures in point cloud analysis tasks due to their excellent global modeling ability.
+ However, the attention mechanism has quadratic complexity and is difficult to extend to long-sequence modeling under limited computational resources.
+ Recently, state space models (SSM), a new family of deep sequence models, have shown great potential for sequence modeling in NLP tasks.
+ In this paper, taking inspiration from the success of SSM in NLP, we propose PointMamba, a framework with global modeling and linear complexity.
+ Specifically, taking embedded point patches as input, we propose a reordering strategy that enhances the SSM's global modeling ability by providing a more logical geometric scanning order.
+ The reordered point tokens are then sent to a series of Mamba blocks to causally capture the point cloud structure.
+ Experimental results show that our proposed PointMamba outperforms transformer-based counterparts on different point cloud analysis datasets, while saving about 44.3% of the parameters and 25% of the FLOPs, demonstrating a promising option for constructing foundational 3D vision models.
+ We hope our PointMamba can provide a new perspective for point cloud analysis.
+ The code is available at https://github.com/LMD0311/PointMamba.
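One simple way to impose a geometrically coherent scan order on embedded point patches, in the spirit of the reordering strategy described above, is to sort patch centers along a space-filling curve. The Morton (z-order) key below is an illustrative stand-in; the paper's actual reordering strategy may differ.

```python
import numpy as np

def morton_order(centers: np.ndarray, bits: int = 10) -> np.ndarray:
    """Return indices that sort patch centers (N, 3) along a Morton (z-order)
    curve, giving a geometrically coherent 1D scan order for a sequence model."""
    # Quantize each axis to `bits` bits in [0, 2^bits - 1].
    mins, maxs = centers.min(0), centers.max(0)
    q = ((centers - mins) / np.maximum(maxs - mins, 1e-9)
         * (2**bits - 1)).astype(np.uint64)

    codes = np.zeros(len(centers), dtype=np.uint64)
    for b in range(bits):                       # interleave bits of x, y, z
        for axis in range(3):
            codes |= ((q[:, axis] >> np.uint64(b)) & np.uint64(1)) \
                     << np.uint64(3 * b + axis)
    return np.argsort(codes)

if __name__ == "__main__":
    centers = np.random.rand(64, 3)             # e.g. 64 patch centers
    order = morton_order(centers)
    tokens_in_scan_order = centers[order]       # reordered before the SSM
    print(order[:8])
```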
+
+ comment: Work in progress. The code is available at + https://github.com/LMD0311/PointMamba +
+
+
+
+
+ + ♻ ☆ Advancements in Point Cloud Data Augmentation for Deep Learning: A + Survey + + +
+ Deep learning (DL) has become one of the mainstream and effective methods for point cloud analysis tasks such as detection, segmentation and classification.
+ To reduce overfitting when training DL models and to improve model performance, especially when the amount and/or diversity of training data is limited, data augmentation is often crucial.
+ Although various point cloud data augmentation methods have been widely used in different point cloud processing tasks, there are currently no published systematic surveys or reviews of these methods.
+ Therefore, this article surveys these methods, categorizing them into a taxonomy framework that comprises basic and specialized point cloud data augmentation methods.
+ Through a comprehensive evaluation of these augmentation methods, this article identifies their potential and limitations, serving as a useful reference for choosing appropriate augmentation methods.
+ In addition, potential directions for future research are recommended.
+ This survey provides a holistic overview of the current state of point cloud data augmentation, promoting its wider application and development.
+
+
+
+
+ + ♻ ☆ CHAIN: Enhancing Generalization in Data-Efficient GANs via lipsCHitz + continuity constrAIned Normalization CVPR2024 + + +
+ Generative Adversarial Networks (GANs) have significantly advanced image generation, but their performance heavily depends on abundant training data.
+ In scenarios with limited data, GANs often struggle with discriminator overfitting and unstable training.
+ Batch Normalization (BN), despite being known for enhancing generalization and training stability, has rarely been used in the discriminator of Data-Efficient GANs.
+ Our work addresses this gap by identifying a critical flaw in BN: the tendency for gradient explosion during the centering and scaling steps.
+ To tackle this issue, we present CHAIN (lipsCHitz continuity constrAIned Normalization), which replaces the conventional centering step with zero-mean regularization and integrates a Lipschitz continuity constraint into the scaling step.
+ CHAIN further enhances GAN training by adaptively interpolating the normalized and unnormalized features, effectively avoiding discriminator overfitting.
+ Our theoretical analyses firmly establish CHAIN's effectiveness in reducing gradients in latent features and weights, improving stability and generalization in GAN training.
+ Empirical evidence supports our theory.
+ CHAIN achieves state-of-the-art results in data-limited scenarios on CIFAR-10/100, ImageNet, five low-shot and seven high-resolution few-shot image datasets.
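A rough sketch of the ingredients named in the abstract: a discriminator normalization layer that skips explicit centering and instead returns a zero-mean penalty, clamps the scaling gain, and adaptively interpolates normalized and raw features. This is a loose illustration under those assumptions, not the paper's exact CHAIN formulation or its theoretical guarantees.

```python
import torch
import torch.nn as nn

class ZeroMeanLipschitzNorm(nn.Module):
    """Rough CHAIN-style sketch: no explicit centering (a zero-mean penalty is
    returned instead), a scaling step whose gain is clamped to stay bounded,
    and an adaptive mix of normalized and raw features."""
    def __init__(self, channels: int, max_gain: float = 1.0):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(channels))
        self.alpha = nn.Parameter(torch.tensor(0.5))      # interpolation weight
        self.max_gain = max_gain

    def forward(self, x: torch.Tensor):
        # x: (B, C, H, W); per-channel statistics over batch and spatial dims.
        mean = x.mean(dim=(0, 2, 3))
        std = x.std(dim=(0, 2, 3), unbiased=False) + 1e-5
        gain = self.gamma.clamp(-self.max_gain, self.max_gain)
        x_norm = x * (gain / std)[None, :, None, None]    # scale, no centering
        alpha = self.alpha.clamp(0.0, 1.0)
        out = alpha * x_norm + (1.0 - alpha) * x          # adaptive interpolation
        zero_mean_penalty = mean.pow(2).mean()            # add to the training loss
        return out, zero_mean_penalty

if __name__ == "__main__":
    norm = ZeroMeanLipschitzNorm(16)
    y, reg = norm(torch.randn(8, 16, 32, 32))
    print(y.shape, float(reg))
```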
+
+ comment: Accepted by CVPR2024, 26 pages full version +
+
+
+
+
+ + ♻ ☆ TIP-Editor: An Accurate 3D Editor Following Both Text-Prompts And + Image-Prompts + + +
+ Text-driven 3D scene editing has gained significant attention owing to its convenience and user-friendliness.
+ However, existing methods still lack accurate control of the specified appearance and location of the editing result due to the inherent limitations of the text description.
+ To this end, we propose a 3D scene editing framework, TIP-Editor, that accepts both text and image prompts and a 3D bounding box to specify the editing region.
+ With the image prompt, users can conveniently specify the detailed appearance/style of the target content as a complement to the text description, enabling accurate control of the appearance.
+ Specifically, TIP-Editor employs a stepwise 2D personalization strategy to better learn the representation of the existing scene and the reference image, in which a localization loss is proposed to encourage correct object placement as specified by the bounding box.
+ Additionally, TIP-Editor utilizes explicit and flexible 3D Gaussian splatting as the 3D representation to facilitate local editing while keeping the background unchanged.
+ Extensive experiments demonstrate that TIP-Editor conducts accurate editing following the text and image prompts in the specified bounding box region, consistently outperforming the baselines in editing quality and alignment to the prompts, both qualitatively and quantitatively.
+
+ comment: Accepted by SIGGRAPH 2024 & ACM Transactions on Graphics
+
+
+
+
+ + ♻ ☆ SSM Meets Video Diffusion Models: Efficient Video Generation with + Structured State Spaces ICLR 2024 + + +
+ Given the remarkable achievements in image generation through diffusion +models, the research community has shown increasing interest in extending these +models to video generation. Recent diffusion models for video generation have +predominantly utilized attention layers to extract temporal features. However, +attention layers are limited by their memory consumption, which increases +quadratically with the length of the sequence. This limitation presents +significant challenges when attempting to generate longer video sequences using +diffusion models. To overcome this challenge, we propose leveraging state-space +models (SSMs). SSMs have recently gained attention as viable alternatives due +to their linear memory consumption relative to sequence length. In the +experiments, we first evaluate our SSM-based model with UCF101, a standard +benchmark of video generation. In addition, to investigate the potential of +SSMs for longer video generation, we perform an experiment using the MineRL +Navigate dataset, varying the number of frames to 64, 200, and 400. In these +settings, our SSM-based model can considerably save memory consumption for +longer sequences, while maintaining competitive FVD scores to the +attention-based models. Our codes are available at +https://github.com/shim0114/SSM-Meets-Video-Diffusion-Models. + +
+
+ comment: Accepted as a workshop paper at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Endo-4DGS: Endoscopic Monocular Scene Reconstruction with 4D Gaussian + Splatting + + +
+ In the realm of robot-assisted minimally invasive surgery, dynamic scene reconstruction can significantly enhance downstream tasks and improve surgical outcomes.
+ Neural Radiance Fields (NeRF)-based methods have recently risen to prominence for their exceptional ability to reconstruct scenes but are hampered by slow inference speed, prolonged training, and inconsistent depth estimation.
+ Some previous works utilize ground-truth depth for optimization, but such depth is hard to acquire in the surgical domain.
+ To overcome these obstacles, we present Endo-4DGS, a real-time endoscopic dynamic reconstruction approach that utilizes 3D Gaussian Splatting (GS) for 3D representation.
+ Specifically, we propose lightweight MLPs to capture temporal dynamics with Gaussian deformation fields.
+ To obtain a satisfactory Gaussian initialization, we exploit a powerful depth estimation foundation model, Depth-Anything, to generate pseudo-depth maps as a geometry prior.
+ We additionally propose confidence-guided learning to tackle the ill-posed problem of monocular depth estimation, and we enhance the depth-guided reconstruction with surface normal constraints and depth regularization.
+ Our approach has been validated on two surgical datasets, where it renders in real time, computes efficiently, and reconstructs with remarkable accuracy.
+
+
+
+
+ + ♻ ☆ KTPFormer: Kinematics and Trajectory Prior Knowledge-Enhanced + Transformer for 3D Human Pose Estimation CVPR 2024 + + +
+ This paper presents a novel Kinematics and Trajectory Prior Knowledge-Enhanced Transformer (KTPFormer), which overcomes a weakness of existing transformer-based methods for 3D human pose estimation: the derivation of the Q, K and V vectors in their self-attention mechanisms is based solely on simple linear mappings.
+ We propose two prior attention modules, namely Kinematics Prior Attention (KPA) and Trajectory Prior Attention (TPA), which exploit the known anatomical structure of the human body and motion trajectory information to facilitate effective learning of global dependencies and features in the multi-head self-attention.
+ KPA models kinematic relationships in the human body by constructing a topology of kinematics, while TPA builds a trajectory topology to learn the information of joint motion trajectories across frames.
+ By yielding Q, K and V vectors endowed with prior knowledge, the two modules enable KTPFormer to model both spatial and temporal correlations simultaneously.
+ Extensive experiments on three benchmarks (Human3.6M, MPI-INF-3DHP and HumanEva) show that KTPFormer achieves superior performance in comparison to state-of-the-art methods.
+ More importantly, our KPA and TPA modules have lightweight plug-and-play designs and can be integrated into various transformer-based networks (e.g., diffusion-based ones) to improve performance with only a very small increase in computational overhead.
+ The code is available at: https://github.com/JihuaPeng/KTPFormer.
+
+ comment: Accepted by CVPR 2024, GitHub code: https://github.com/JihuaPeng/KTPFormer
+
+
+
+
+ + ♻ ☆ eTraM: Event-based Traffic Monitoring Dataset + + +
+ Event cameras, with their high temporal and dynamic range and minimal memory +usage, have found applications in various fields. However, their potential in +static traffic monitoring remains largely unexplored. To facilitate this +exploration, we present eTraM - a first-of-its-kind, fully event-based traffic +monitoring dataset. eTraM offers 10 hr of data from different traffic scenarios +in various lighting and weather conditions, providing a comprehensive overview +of real-world situations. Providing 2M bounding box annotations, it covers +eight distinct classes of traffic participants, ranging from vehicles to +pedestrians and micro-mobility. eTraM's utility has been assessed using +state-of-the-art methods for traffic participant detection, including RVT, RED, +and YOLOv8. We quantitatively evaluate the ability of event-based models to +generalize on nighttime and unseen scenes. Our findings substantiate the +compelling potential of leveraging event cameras for traffic monitoring, +opening new avenues for research and application. eTraM is available at +https://eventbasedvision.github.io/eTraM + +
+
+
+
+
+ + ♻ ☆ MaskINT: Video Editing via Interpolative Non-autoregressive Masked + Transformers CVPR 2024 + + +
+ Recent advances in generative AI have significantly enhanced image and video editing, particularly in the context of text prompt control.
+ State-of-the-art approaches predominantly rely on diffusion models to accomplish these tasks.
+ However, the computational demands of diffusion-based methods are substantial, often necessitating large-scale paired datasets for training, which challenges deployment in real applications.
+ To address these issues, this paper breaks down the text-based video editing task into two stages.
+ First, we leverage a pre-trained text-to-image diffusion model to simultaneously edit a few keyframes in a zero-shot manner.
+ Second, we introduce an efficient model called MaskINT, which is built on non-autoregressive masked generative transformers and specializes in frame interpolation between the edited keyframes, using structural guidance from intermediate frames.
+ Experimental results suggest that MaskINT achieves performance comparable to diffusion-based methodologies while significantly improving inference time.
+ This research offers a practical solution for text-based video editing and showcases the potential of non-autoregressive masked generative transformers in this domain.
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Kiki or Bouba? Sound Symbolism in Vision-and-Language Models NeurIPS 2023 + + +
+ Although the mapping between sound and meaning in human language is assumed +to be largely arbitrary, research in cognitive science has shown that there are +non-trivial correlations between particular sounds and meanings across +languages and demographic groups, a phenomenon known as sound symbolism. Among +the many dimensions of meaning, sound symbolism is particularly salient and +well-demonstrated with regards to cross-modal associations between language and +the visual domain. In this work, we address the question of whether sound +symbolism is reflected in vision-and-language models such as CLIP and Stable +Diffusion. Using zero-shot knowledge probing to investigate the inherent +knowledge of these models, we find strong evidence that they do show this +pattern, paralleling the well-known kiki-bouba effect in psycholinguistics. Our +work provides a novel method for demonstrating sound symbolism and +understanding its nature using computational tools. Our code will be made +publicly available. + +
+
+ comment: Accepted to NeurIPS 2023 (spotlight). Project webpage: + https://kiki-bouba.github.io/ +
+
+
+
+
+ + ♻ ☆ Rotated Multi-Scale Interaction Network for Referring Remote Sensing + Image Segmentation CVPR 2024 + + +
+ Referring Remote Sensing Image Segmentation (RRSIS) is a new challenge that +combines computer vision and natural language processing, delineating specific +regions in aerial images as described by textual queries. Traditional Referring +Image Segmentation (RIS) approaches have been impeded by the complex spatial +scales and orientations found in aerial imagery, leading to suboptimal +segmentation results. To address these challenges, we introduce the Rotated +Multi-Scale Interaction Network (RMSIN), an innovative approach designed for +the unique demands of RRSIS. RMSIN incorporates an Intra-scale Interaction +Module (IIM) to effectively address the fine-grained detail required at +multiple scales and a Cross-scale Interaction Module (CIM) for integrating +these details coherently across the network. Furthermore, RMSIN employs an +Adaptive Rotated Convolution (ARC) to account for the diverse orientations of +objects, a novel contribution that significantly enhances segmentation +accuracy. To assess the efficacy of RMSIN, we have curated an expansive dataset +comprising 17,402 image-caption-mask triplets, which is unparalleled in terms +of scale and variety. This dataset not only presents the model with a wide +range of spatial and rotational scenarios but also establishes a stringent +benchmark for the RRSIS task, ensuring a rigorous evaluation of performance. +Our experimental evaluations demonstrate the exceptional performance of RMSIN, +surpassing existing state-of-the-art models by a significant margin. All +datasets and code are made available at https://github.com/Lsan2401/RMSIN. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Bridging the Projection Gap: Overcoming Projection Bias Through + Parameterized Distance Learning + + +
+ Generalized zero-shot learning (GZSL) aims to recognize samples from both +seen and unseen classes using only seen class samples for training. However, +GZSL methods are prone to bias towards seen classes during inference due to the +projection function being learned from seen classes. Most methods focus on +learning an accurate projection, but bias in the projection is inevitable. We +address this projection bias by proposing to learn a parameterized Mahalanobis +distance metric for robust inference. Our key insight is that the distance +computation during inference is critical, even with a biased projection. We +make two main contributions - (1) We extend the VAEGAN (Variational Autoencoder +\& Generative Adversarial Networks) architecture with two branches to +separately output the projection of samples from seen and unseen classes, +enabling more robust distance learning. (2) We introduce a novel loss function +to optimize the Mahalanobis distance representation and reduce projection bias. +Extensive experiments on four datasets show that our approach outperforms +state-of-the-art GZSL techniques with improvements of up to 3.5 \% on the +harmonic mean metric. + +
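The parameterized distance idea above can be sketched as a learnable Mahalanobis metric M = L Lᵀ used for nearest-prototype inference. The module below is a generic illustration; the VAEGAN branches and the paper's specific loss are not reproduced, and the rank and dimensions are assumptions.

```python
import torch
import torch.nn as nn

class MahalanobisHead(nn.Module):
    """Learnable Mahalanobis distance d(x, c) = (x - c)^T M (x - c) with
    M = L L^T kept positive semi-definite by construction. Classification
    picks the class prototype with the smallest distance."""
    def __init__(self, dim: int, rank: int = 32):
        super().__init__()
        self.L = nn.Parameter(torch.randn(dim, rank) * 0.01)

    def forward(self, x: torch.Tensor, prototypes: torch.Tensor) -> torch.Tensor:
        # x: (B, D), prototypes: (C, D)  ->  distances: (B, C)
        diff = x[:, None, :] - prototypes[None, :, :]     # (B, C, D)
        proj = diff @ self.L                              # (B, C, rank)
        return proj.pow(2).sum(-1)                        # squared Mahalanobis distance

if __name__ == "__main__":
    head = MahalanobisHead(dim=128)
    feats, protos = torch.randn(4, 128), torch.randn(10, 128)
    pred = head(feats, protos).argmin(dim=1)              # nearest prototype
    print(pred)
```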
+
+ comment: 18 pages, 9 figures +
+
+
+
+
+
+ ♻ ☆ PosterLlama: Bridging Design Ability of Language Model to Contents-Aware
+ Layout Generation
+
+
+
+ Visual layout plays a critical role in graphic design fields such as advertising, posters, and web UI design.
+ The recent trend towards content-aware layout generation through generative models has shown promise, yet it often overlooks the semantic intricacies of layout design by treating it as a simple numerical optimization.
+ To bridge this gap, we introduce PosterLlama, a network designed for generating visually and textually coherent layouts by reformatting layout elements into HTML code and leveraging the rich design knowledge embedded within language models.
+ Furthermore, we enhance the robustness of our model with a unique depth-based poster augmentation strategy.
+ This ensures that our generated layouts remain not only semantically rich but also visually appealing, even with limited data.
+ Our extensive evaluations across several benchmarks demonstrate that PosterLlama outperforms existing methods in producing authentic and content-aware layouts.
+ It supports an unparalleled range of conditions, including unconditional layout generation, element-conditional layout generation, and layout completion, serving as a highly versatile user manipulation tool.
+
+
+
+
+ + ♻ ☆ Classification for everyone : Building geography agnostic models for + fairer recognition + + +
+ In this paper, we analyze different methods to mitigate inherent geographical biases present in state-of-the-art image classification models.
+ We first quantitatively present this bias in two datasets - the Dollar Street Dataset and ImageNet - using images with location information.
+ We then present different methods which can be employed to reduce this bias.
+ Finally, we analyze the effectiveness of the different techniques at making these models more robust to the geographical locations of the images.
+
+ comment: typos corrected, references added +
+
+
+
+
+ + ♻ ☆ Text-to-3D using Gaussian Splatting CVPR 2024 + + +
+ Automatic text-to-3D generation that combines Score Distillation Sampling (SDS) with the optimization of volume rendering has achieved remarkable progress in synthesizing realistic 3D objects.
+ Yet most existing text-to-3D methods based on SDS and volume rendering suffer from inaccurate geometry, e.g., the Janus issue, since it is hard to explicitly integrate 3D priors into implicit 3D representations.
+ Besides, it is usually time-consuming for them to generate elaborate 3D models with rich colors.
+ In response, this paper proposes GSGEN, a novel method that adopts Gaussian Splatting, a recent state-of-the-art representation, for text-to-3D generation.
+ GSGEN aims at generating high-quality 3D objects and addressing existing shortcomings by exploiting the explicit nature of Gaussian Splatting, which enables the incorporation of 3D priors.
+ Specifically, our method adopts a progressive optimization strategy, which includes a geometry optimization stage and an appearance refinement stage.
+ In geometry optimization, a coarse representation is established under a 3D point cloud diffusion prior along with the ordinary 2D SDS optimization, ensuring a sensible and 3D-consistent rough shape.
+ Subsequently, the obtained Gaussians undergo an iterative appearance refinement to enrich texture details.
+ In this stage, we increase the number of Gaussians by compactness-based densification to enhance continuity and improve fidelity.
+ With these designs, our approach can generate 3D assets with delicate details and accurate geometry.
+ Extensive evaluations demonstrate the effectiveness of our method, especially for capturing high-frequency components.
+ Our code is available at https://github.com/gsgen3d/gsgen
+
+ comment: To appear at CVPR 2024. Project page: https://gsgen3d.github.io. + Code: https://github.com/gsgen3d/gsgen +
+
+
+
+
+ + ♻ ☆ DPA-Net: Structured 3D Abstraction from Sparse Views via Differentiable + Primitive Assembly + + +
+ We present a differentiable rendering framework to learn structured 3D +abstractions in the form of primitive assemblies from sparse RGB images +capturing a 3D object. By leveraging differentiable volume rendering, our +method does not require 3D supervision. Architecturally, our network follows +the general pipeline of an image-conditioned neural radiance field (NeRF) +exemplified by pixelNeRF for color prediction. As our core contribution, we +introduce differential primitive assembly (DPA) into NeRF to output a 3D +occupancy field in place of density prediction, where the predicted occupancies +serve as opacity values for volume rendering. Our network, coined DPA-Net, +produces a union of convexes, each as an intersection of convex quadric +primitives, to approximate the target 3D object, subject to an abstraction loss +and a masking loss, both defined in the image space upon volume rendering. With +test-time adaptation and additional sampling and loss designs aimed at +improving the accuracy and compactness of the obtained assemblies, our method +demonstrates superior performance over state-of-the-art alternatives for 3D +primitive abstraction from sparse views. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ SurMo: Surface-based 4D Motion Modeling for Dynamic Human Rendering CVPR 2024 + + +
+ Dynamic human rendering from video sequences has achieved remarkable progress +by formulating the rendering as a mapping from static poses to human images. +However, existing methods focus on the human appearance reconstruction of every +single frame while the temporal motion relations are not fully explored. In +this paper, we propose a new 4D motion modeling paradigm, SurMo, that jointly +models the temporal dynamics and human appearances in a unified framework with +three key designs: 1) Surface-based motion encoding that models 4D human +motions with an efficient compact surface-based triplane. It encodes both +spatial and temporal motion relations on the dense surface manifold of a +statistical body template, which inherits body topology priors for +generalizable novel view synthesis with sparse training observations. 2) +Physical motion decoding that is designed to encourage physical motion learning +by decoding the motion triplane features at timestep t to predict both spatial +derivatives and temporal derivatives at the next timestep t+1 in the training +stage. 3) 4D appearance decoding that renders the motion triplanes into images +by an efficient volumetric surface-conditioned renderer that focuses on the +rendering of body surfaces with motion learning conditioning. Extensive +experiments validate the state-of-the-art performance of our new paradigm and +illustrate the expressiveness of surface-based motion triplanes for rendering +high-fidelity view-consistent humans with fast motions and even +motion-dependent shadows. Our project page is at: +https://taohuumd.github.io/projects/SurMo/ + +
+
+ comment: Accepted to CVPR 2024. Project Page: + https://taohuumd.github.io/projects/SurMo/ +
+
+
+
+
+ + ♻ ☆ StructLDM: Structured Latent Diffusion for 3D Human Generation + + +
+ Recent 3D human generative models have achieved remarkable progress by +learning 3D-aware GANs from 2D images. However, existing 3D human generative +methods model humans in a compact 1D latent space, ignoring the articulated +structure and semantics of human body topology. In this paper, we explore more +expressive and higher-dimensional latent space for 3D human modeling and +propose StructLDM, a diffusion-based unconditional 3D human generative model, +which is learned from 2D images. StructLDM solves the challenges imposed due to +the high-dimensional growth of latent space with three key designs: 1) A +semantic structured latent space defined on the dense surface manifold of a +statistical human body template. 2) A structured 3D-aware auto-decoder that +factorizes the global latent space into several semantic body parts +parameterized by a set of conditional structured local NeRFs anchored to the +body template, which embeds the properties learned from the 2D training data +and can be decoded to render view-consistent humans under different poses and +clothing styles. 3) A structured latent diffusion model for generative human +appearance sampling. Extensive experiments validate StructLDM's +state-of-the-art generation performance and illustrate the expressiveness of +the structured latent space over the well-adopted 1D latent space. Notably, +StructLDM enables different levels of controllable 3D human generation and +editing, including pose/view/shape control, and high-level tasks including +compositional generations, part-aware clothing editing, 3D virtual try-on, etc. +Our project page is at: https://taohuumd.github.io/projects/StructLDM/. + +
+
+ comment: Project page: https://taohuumd.github.io/projects/StructLDM/ +
+
+
+
+
+ + ♻ ☆ A Comprehensive Review of Knowledge Distillation in Computer Vision + + +
+ Deep learning techniques have been demonstrated to surpass preceding +cutting-edge machine learning techniques in recent years, with computer vision +being one of the most prominent examples. However, deep learning models suffer +from significant drawbacks when deployed in resource-constrained environments +due to their large model size and high complexity. Knowledge Distillation is +one of the prominent solutions to overcome this challenge. This review paper +examines the current state of research on knowledge distillation, a technique +for compressing complex models into smaller and simpler ones. The paper +provides an overview of the major principles and techniques associated with +knowledge distillation and reviews the applications of knowledge distillation +in the domain of computer vision. The review focuses on the benefits of +knowledge distillation, as well as the problems that must be overcome to +improve its effectiveness. + +
+
+ comment: 37 pages ,10 figures +
+
+
+
+
+ + ♻ ☆ Volcano: Mitigating Multimodal Hallucination through Self-Feedback + Guided Revision + + +
+ Large multimodal models suffer from multimodal hallucination, where they provide incorrect responses misaligned with the given visual information.
+ Recent works have conjectured that one of the reasons behind multimodal hallucination is that the vision encoder fails to ground on the image properly.
+ To mitigate this issue, we propose a novel approach that leverages self-feedback as visual cues.
+ Building on this approach, we introduce Volcano, a multimodal self-feedback guided revision model.
+ Volcano generates natural language feedback to its initial response based on the provided visual information and utilizes this feedback to self-revise its initial response.
+ Volcano effectively reduces multimodal hallucination and achieves state-of-the-art on MMHal-Bench, POPE, and GAVIE.
+ It also improves on general multimodal abilities and outperforms previous models on MM-Vet and MMBench.
+ Through qualitative analysis, we show that Volcano's feedback is more properly grounded in the image than the initial response.
+ This indicates that Volcano can provide itself with richer visual information through feedback generation, allowing it to self-correct hallucinations.
+ We publicly release our model, data, and code at https://github.com/kaistAI/Volcano.
+
+
+
+
+ + ♻ ☆ 3D Reconstruction of Interacting Multi-Person in Clothing from a Single + Image WACV 2024 + + +
+ This paper introduces a novel pipeline to reconstruct the geometry of interacting multi-person in clothing on a globally coherent scene space from a single image.
+ The main challenge arises from occlusion: a part of a human body is not visible from a single view due to occlusion by others or by the self, which introduces missing geometry and physical implausibility (e.g., penetration).
+ We overcome this challenge by utilizing two human priors for complete 3D geometry and surface contacts.
+ For the geometry prior, an encoder learns to regress the image of a person with missing body parts to latent vectors; a decoder decodes these vectors to produce 3D features of the associated geometry; and an implicit network combines these features with a surface normal map to reconstruct complete and detailed 3D humans.
+ For the contact prior, we develop an image-space contact detector that outputs a probability distribution of surface contacts between people in 3D.
+ We use these priors to globally refine the body poses, enabling penetration-free and accurate reconstruction of interacting multi-person in clothing in the scene space.
+ The results demonstrate that our method is complete, globally coherent, and physically plausible compared to existing methods.
+
+ comment: Accepted to WACV 2024 +
+
+
+
+
+ + ♻ ☆ TAMM: TriAdapter Multi-Modal Learning for 3D Shape Understanding CVPR 2024 + + +
+ The limited scale of current 3D shape datasets hinders the advancements in 3D +shape understanding, and motivates multi-modal learning approaches which +transfer learned knowledge from data-abundant 2D image and language modalities +to 3D shapes. However, even though the image and language representations have +been aligned by cross-modal models like CLIP, we find that the image modality +fails to contribute as much as the language in existing multi-modal 3D +representation learning methods. This is attributed to the domain shift in the +2D images and the distinct focus of each modality. To more effectively leverage +both modalities in the pre-training, we introduce TriAdapter Multi-Modal +Learning (TAMM) -- a novel two-stage learning approach based on three +synergistic adapters. First, our CLIP Image Adapter mitigates the domain gap +between 3D-rendered images and natural images, by adapting the visual +representations of CLIP for synthetic image-text pairs. Subsequently, our Dual +Adapters decouple the 3D shape representation space into two complementary +sub-spaces: one focusing on visual attributes and the other for semantic +understanding, which ensure a more comprehensive and effective multi-modal +pre-training. Extensive experiments demonstrate that TAMM consistently enhances +3D representations for a wide range of 3D encoder architectures, pre-training +datasets, and downstream tasks. Notably, we boost the zero-shot classification +accuracy on Objaverse-LVIS from 46.8\% to 50.7\%, and improve the 5-way 10-shot +linear probing classification accuracy on ModelNet40 from 96.1\% to 99.0\%. +Project page: https://alanzhangcs.github.io/tamm-page. + +
+
+ comment: This paper is accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Survey of Bias In Text-to-Image Generation: Definition, Evaluation, and + Mitigation + + +
+ The recent advancement of large and powerful models with Text-to-Image (T2I) +generation abilities -- such as OpenAI's DALLE-3 and Google's Gemini -- enables +users to generate high-quality images from textual prompts. However, it has +become increasingly evident that even simple prompts could cause T2I models to +exhibit conspicuous social bias in generated images. Such bias might lead to +both allocational and representational harms in society, further marginalizing +minority groups. Noting this problem, a large body of recent works has been +dedicated to investigating different dimensions of bias in T2I systems. +However, an extensive review of these studies is lacking, hindering a +systematic understanding of current progress and research gaps. We present the +first extensive survey on bias in T2I generative models. In this survey, we +review prior studies on dimensions of bias: Gender, Skintone, and Geo-Culture. +Specifically, we discuss how these works define, evaluate, and mitigate +different aspects of bias. We found that: (1) while gender and skintone biases +are widely studied, geo-cultural bias remains under-explored; (2) most works on +gender and skintone bias investigated occupational association, while other +aspects are less frequently studied; (3) almost all gender bias works overlook +non-binary identities in their studies; (4) evaluation datasets and metrics are +scattered, with no unified framework for measuring biases; and (5) current +mitigation methods fail to resolve biases comprehensively. Based on current +limitations, we point out future research directions that contribute to +human-centric definitions, evaluations, and mitigation of biases. We hope to +highlight the importance of studying biases in T2I systems, as well as +encourage future efforts to holistically understand and tackle biases, building +fair and trustworthy T2I technologies for everyone. + +
+
+
+
+
+ + ♻ ☆ One-Shot Structure-Aware Stylized Image Synthesis CVPR 2024 + + +
+ While GAN-based models have been successful in image stylization tasks, they often struggle with structure preservation while stylizing a wide range of input images.
+ Recently, diffusion models have been adopted for image stylization but still lack the capability to maintain the original quality of input images.
+ Building on this, we propose OSASIS: a novel one-shot stylization method that is robust in structure preservation.
+ We show that OSASIS is able to effectively disentangle the semantics from the structure of an image, allowing it to control the level of content and style applied to a given input.
+ We apply OSASIS to various experimental settings, including stylization with out-of-domain reference images and stylization with text-driven manipulation.
+ Results show that OSASIS outperforms other stylization methods, especially for input images that were rarely encountered during training, providing a promising solution to stylization via diffusion models.
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ LoSh: Long-Short Text Joint Prediction Network for Referring Video + Object Segmentation CVPR2024 + + +
+ Referring video object segmentation (RVOS) aims to segment the target instance referred by a given text expression in a video clip.
+ The text expression normally contains a sophisticated description of the instance's appearance, action, and relation with others.
+ It is therefore rather difficult for an RVOS model to capture all these attributes correspondingly in the video; in fact, the model often favours the action- and relation-related visual attributes of the instance.
+ This can end up with partial or even incorrect mask prediction of the target instance.
+ We tackle this problem by taking a subject-centric short text expression from the original long text expression.
+ The short one retains only the appearance-related information of the target instance, so that we can use it to focus the model's attention on the instance's appearance.
+ We let the model make joint predictions using both long and short text expressions, and we insert a long-short cross-attention module to interact the joint features and a long-short predictions intersection loss to regulate the joint predictions.
+ Besides the improvement on the linguistic part, we also introduce a forward-backward visual consistency loss, which utilizes optical flows to warp visual features between the annotated frames and their temporal neighbors for consistency.
+ We build our method on top of two state-of-the-art pipelines.
+ Extensive experiments on A2D-Sentences, Refer-YouTube-VOS, JHMDB-Sentences and Refer-DAVIS17 show impressive improvements of our method. Code is available at https://github.com/LinfengYuan1997/Losh.
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ Flexible filtrations for multiparameter persistent homology detect + digital images + + +
+ Two important problems in the field of Topological Data Analysis are defining practical multifiltrations on objects and showing the ability of TDA to detect geometry.
+ Motivated by these problems, we construct three multifiltrations named multi-GENEO, multi-DGENEO and mix-GENEO, and prove the stability of both the interleaving distance and the multiparameter persistence landscape of multi-GENEO with respect to the pseudometric of the subspace of bounded functions.
+ We also give upper-bound estimates for multi-DGENEO and mix-GENEO.
+ Finally, we provide experimental results on the MNIST dataset to demonstrate that our bifiltrations are able to detect geometric and topological differences between digital images.
+
+
+
+
+ + ♻ ☆ NEDS-SLAM: A Novel Neural Explicit Dense Semantic SLAM Framework using + 3D Gaussian Splatting + + +
+ We propose NEDS-SLAM, an Explicit Dense semantic SLAM system based on 3D Gaussian representation, that enables robust 3D semantic mapping, accurate camera tracking, and high-quality rendering in real time.
+ In the system, we propose a Spatially Consistent Feature Fusion model to reduce the effect of erroneous estimates from the pre-trained segmentation head on semantic reconstruction, achieving robust 3D semantic Gaussian mapping.
+ Additionally, we employ a lightweight encoder-decoder to compress the high-dimensional semantic features into a compact 3D Gaussian representation, mitigating the burden of excessive memory consumption.
+ Furthermore, we leverage the advantage of 3D Gaussian splatting, which enables efficient and differentiable novel view rendering, and propose a Virtual Camera View Pruning method to eliminate outlier GS points, thereby effectively enhancing the quality of scene representations.
+ Our NEDS-SLAM method demonstrates competitive performance compared with existing dense semantic SLAM methods in terms of mapping and tracking accuracy on the Replica and ScanNet datasets, while also showing excellent capabilities in 3D dense semantic mapping.
+
+
+
+
+ + ♻ ☆ FairRAG: Fair Human Generation via Fair Retrieval Augmentation CVPR 2024 + + +
+ Existing text-to-image generative models reflect or even amplify societal +biases ingrained in their training data. This is especially concerning for +human image generation where models are biased against certain demographic +groups. Existing attempts to rectify this issue are hindered by the inherent +limitations of the pre-trained models and fail to substantially improve +demographic diversity. In this work, we introduce Fair Retrieval Augmented +Generation (FairRAG), a novel framework that conditions pre-trained generative +models on reference images retrieved from an external image database to improve +fairness in human generation. FairRAG enables conditioning through a +lightweight linear module that projects reference images into the textual +space. To enhance fairness, FairRAG applies simple-yet-effective debiasing +strategies, providing images from diverse demographic groups during the +generative process. Extensive experiments demonstrate that FairRAG outperforms +existing methods in terms of demographic diversity, image-text alignment, and +image fidelity while incurring minimal computational overhead during inference. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Variational Dynamic for Self-Supervised Exploration in Deep + Reinforcement Learning + + +
+ Efficient exploration remains a challenging problem in reinforcement learning, especially for tasks where extrinsic rewards from environments are sparse or even totally disregarded.
+ Significant advances based on intrinsic motivation show promising results in simple environments but often get stuck in environments with multimodal and stochastic dynamics.
+ In this work, we propose a variational dynamic model based on conditional variational inference to model the multimodality and stochasticity.
+ We consider the environmental state-action transition as a conditional generative process that generates the next-state prediction under the condition of the current state, action, and latent variable, which provides a better understanding of the dynamics and leads to better performance in exploration.
+ We derive an upper bound on the negative log-likelihood of the environmental transition and use this upper bound as the intrinsic reward for exploration, which allows the agent to learn skills by self-supervised exploration without observing extrinsic rewards.
+ We evaluate the proposed method on several image-based simulation tasks and a real robotic manipulation task.
+ Our method outperforms several state-of-the-art environment model-based exploration approaches.
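A minimal sketch of the idea above: fit a conditional VAE over transitions and take the per-sample negative ELBO (an upper bound on the negative log-likelihood) as the intrinsic reward. The network sizes, the Gaussian-style reconstruction term, and the class interface are illustrative assumptions, not the paper's architecture.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class CVAEDynamics(nn.Module):
    """Conditional VAE over transitions p(s' | s, a, z). The per-sample
    negative ELBO is used as an intrinsic exploration reward: transitions the
    model predicts poorly are rewarded more."""
    def __init__(self, s_dim=8, a_dim=2, z_dim=4, hidden=64):
        super().__init__()
        self.enc = nn.Sequential(nn.Linear(s_dim * 2 + a_dim, hidden), nn.ReLU(),
                                 nn.Linear(hidden, 2 * z_dim))
        self.dec = nn.Sequential(nn.Linear(s_dim + a_dim + z_dim, hidden), nn.ReLU(),
                                 nn.Linear(hidden, s_dim))

    def intrinsic_reward(self, s, a, s_next):
        mu, logvar = self.enc(torch.cat([s, a, s_next], -1)).chunk(2, -1)
        z = mu + torch.randn_like(mu) * (0.5 * logvar).exp()     # reparameterize
        s_hat = self.dec(torch.cat([s, a, z], -1))
        recon = F.mse_loss(s_hat, s_next, reduction="none").sum(-1)
        kl = 0.5 * (mu.pow(2) + logvar.exp() - logvar - 1).sum(-1)
        return recon + kl                                        # negative ELBO

if __name__ == "__main__":
    m = CVAEDynamics()
    s, a, s2 = torch.randn(16, 8), torch.randn(16, 2), torch.randn(16, 8)
    print(m.intrinsic_reward(s, a, s2).shape)                    # torch.Size([16])
```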
+
+ comment: IEEE Transactions on Neural Networks and Learning Systems (TNNLS) + 2021 +
+
+
+
+
+ + ♻ ☆ Text2HOI: Text-guided 3D Motion Generation for Hand-Object Interaction CVPR 2024 + + +
+ This paper introduces the first text-guided work for generating the sequence +of hand-object interaction in 3D. The main challenge arises from the lack of +labeled data where existing ground-truth datasets are nowhere near +generalizable in interaction type and object category, which inhibits the +modeling of diverse 3D hand-object interaction with the correct physical +implication (e.g., contacts and semantics) from text prompts. To address this +challenge, we propose to decompose the interaction generation task into two +subtasks: hand-object contact generation; and hand-object motion generation. +For contact generation, a VAE-based network takes as input a text and an object +mesh, and generates the probability of contacts between the surfaces of hands +and the object during the interaction. The network learns a variety of local +geometry structure of diverse objects that is independent of the objects' +category, and thus, it is applicable to general objects. For motion generation, +a Transformer-based diffusion model utilizes this 3D contact map as a strong +prior for generating physically plausible hand-object motion as a function of +text prompts by learning from the augmented labeled dataset; where we annotate +text labels from many existing 3D hand and object motion data. Finally, we +further introduce a hand refiner module that minimizes the distance between the +object surface and hand joints to improve the temporal stability of the +object-hand contacts and to suppress the penetration artifacts. In the +experiments, we demonstrate that our method can generate more realistic and +diverse interactions compared to other baseline methods. We also show that our +method is applicable to unseen objects. We will release our model and newly +labeled data as a strong foundation for future research. Codes and data are +available in: https://github.com/JunukCha/Text2HOI. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ MMCert: Provable Defense against Adversarial Attacks to Multi-modal + Models CVPR'24 + + +
+ Different from a unimodal model whose input is from a single modality, the +input (called multi-modal input) of a multi-modal model is from multiple +modalities such as image, 3D points, audio, text, etc. Similar to unimodal +models, many existing studies show that a multi-modal model is also vulnerable +to adversarial perturbation, where an attacker could add small perturbation to +all modalities of a multi-modal input such that the multi-modal model makes +incorrect predictions for it. Existing certified defenses are mostly designed +for unimodal models, which achieve sub-optimal certified robustness guarantees +when extended to multi-modal models as shown in our experimental results. In +our work, we propose MMCert, the first certified defense against adversarial +attacks to a multi-modal model. We derive a lower bound on the performance of +our MMCert under arbitrary adversarial attacks with bounded perturbations to +both modalities (e.g., in the context of auto-driving, we bound the number of +changed pixels in both RGB image and depth image). We evaluate our MMCert using +two benchmark datasets: one for the multi-modal road segmentation task and the +other for the multi-modal emotion recognition task. Moreover, we compare our +MMCert with a state-of-the-art certified defense extended from unimodal models. +Our experimental results show that our MMCert outperforms the baseline. + +
+
+ comment: To appear in CVPR'24 +
+
+
+
+
+ + ♻ ☆ Image Captioning in news report scenario + + +
+ Image captioning strives to generate pertinent captions for specified images, +situating itself at the crossroads of Computer Vision (CV) and Natural Language +Processing (NLP). This endeavor is of paramount importance with far-reaching +applications in recommendation systems, news outlets, social media, and beyond. +Particularly within the realm of news reporting, captions are expected to +encompass detailed information, such as the identities of celebrities captured +in the images. However, much of the existing body of work primarily centers +around understanding scenes and actions. In this paper, we explore the realm of +image captioning specifically tailored for celebrity photographs, illustrating +its broad potential for enhancing news industry practices. This exploration +aims to augment automated news content generation, thereby facilitating a more +nuanced dissemination of information. Our endeavor shows a broader horizon, +enriching the narrative in news reporting through a more intuitive image +captioning framework. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ InfLoRA: Interference-Free Low-Rank Adaptation for Continual Learning CVPR 2024 + + +
+ Continual learning requires the model to learn multiple tasks sequentially. +In continual learning, the model should possess the ability to maintain its +performance on old tasks (stability) and the ability to adapt to new tasks +continuously (plasticity). Recently, parameter-efficient fine-tuning (PEFT), +which involves freezing a pre-trained model and injecting a small number of +learnable parameters to adapt to downstream tasks, has gained increasing +popularity in continual learning. Although existing continual learning methods +based on PEFT have demonstrated superior performance compared to those not +based on PEFT, most of them do not consider how to eliminate the interference +of the new task on the old tasks, which inhibits the model from making a good +trade-off between stability and plasticity. In this work, we propose a new PEFT +method, called interference-free low-rank adaptation (InfLoRA), for continual +learning. InfLoRA injects a small number of parameters to reparameterize the +pre-trained weights and shows that fine-tuning these injected parameters is +equivalent to fine-tuning the pre-trained weights within a subspace. +Furthermore, InfLoRA designs this subspace to eliminate the interference of the +new task on the old tasks, making a good trade-off between stability and +plasticity. Experimental results show that InfLoRA outperforms existing +state-of-the-art continual learning methods on multiple datasets. + +
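For reference, a generic low-rank adaptation wrapper of the kind the abstract builds on: the pre-trained linear layer is frozen and a trainable update B A is added, so fine-tuning happens within a rank-r subspace. InfLoRA's key contribution, designing that subspace so new-task updates do not interfere with old tasks, is not implemented in this sketch.

```python
import torch
import torch.nn as nn

class LowRankAdapter(nn.Module):
    """Freeze a pre-trained linear layer and inject a trainable low-rank update
    W + B A, so fine-tuning happens inside the rank-r subspace spanned by A."""
    def __init__(self, base: nn.Linear, rank: int = 8):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)                    # keep pre-trained weights fixed
        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + x @ self.A.t() @ self.B.t()

if __name__ == "__main__":
    layer = LowRankAdapter(nn.Linear(256, 128), rank=8)
    print(layer(torch.randn(4, 256)).shape)            # torch.Size([4, 128])
```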
+
+ comment: Accepted by the 2024 IEEE/CVF Conference on Computer Vision and + Pattern Recognition (CVPR 2024) +
+
+
+
+
+ + ♻ ☆ Interpretable Dimensionality Reduction by Feature Preserving Manifold + Approximation and Projection + + +
+ Nonlinear dimensionality reduction lacks interpretability due to the absence +of source features in low-dimensional embedding space. We propose an +interpretable method featMAP to preserve source features by tangent space +embedding. The core of our proposal is to utilize local singular value +decomposition (SVD) to approximate the tangent space which is embedded to +low-dimensional space by maintaining the alignment. Based on the embedding +tangent space, featMAP enables the interpretability by locally demonstrating +the source features and feature importance. Furthermore, featMAP embeds the +data points by anisotropic projection to preserve the local similarity and +original density. We apply featMAP to interpreting digit classification, object +detection and MNIST adversarial examples. FeatMAP uses source features to +explicitly distinguish the digits and objects and to explain the +misclassification of adversarial examples. We also compare featMAP with other +state-of-the-art methods on local and global metrics. + +
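The tangent-space approximation step described above can be sketched with a local SVD over a point's nearest neighbours; the returned basis directions are what can then be read back as locally important source features. The neighbourhood size and the function interface are illustrative, and the alignment, anisotropic projection, and embedding steps of featMAP are omitted.

```python
import numpy as np

def local_tangent_space(X: np.ndarray, i: int, k: int = 10, d: int = 2):
    """Approximate the d-dimensional tangent space at point X[i] via SVD of its
    k nearest neighbours (centred). Returns an orthonormal basis of shape (D, d)."""
    dists = np.linalg.norm(X - X[i], axis=1)
    nbrs = X[np.argsort(dists)[1:k + 1]]          # exclude the point itself
    centred = nbrs - nbrs.mean(axis=0)
    _, _, vt = np.linalg.svd(centred, full_matrices=False)
    return vt[:d].T                               # columns span the tangent plane

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 5))
    X[:, 3:] *= 0.01                              # data lies near a 3D subspace
    basis = local_tangent_space(X, i=0, k=15, d=2)
    print(basis.shape)                            # (5, 2)
```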
+
+
+
+
+ + ♻ ☆ MCAD: Multi-teacher Cross-modal Alignment Distillation for efficient + image-text retrieval NAACL 2024 + + +
+ Due to the success of large-scale visual-language pretraining (VLP) models and the widespread use of image-text retrieval in industry, it is now critically necessary to reduce model size and streamline mobile-device deployment.
+ Single- and dual-stream model structures are commonly used in image-text retrieval with the goal of closing the semantic gap between the textual and visual modalities.
+ While single-stream models use deep feature fusion to achieve more accurate cross-modal alignment, dual-stream models are better at offline indexing and fast inference.
+ We propose a Multi-teacher Cross-modality Alignment Distillation (MCAD) technique to integrate the advantages of single- and dual-stream models.
+ By incorporating the fused single-stream features into the image and text features of the dual-stream model, we formulate new modified teacher similarity distributions and features.
+ Then, we conduct both distribution and feature distillation to boost the capability of the student dual-stream model, achieving high retrieval performance without increasing inference complexity.
+ Extensive experiments demonstrate the remarkable performance and high efficiency of MCAD on image-text retrieval tasks.
+ Furthermore, we implement a lightweight CLIP model on Snapdragon/Dimensity chips with only $\sim$100M running memory and $\sim$8.0ms search latency, achieving the mobile-device application of VLP models.
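A generic sketch in the spirit of the distillation described above: the student's image-text similarity distribution is matched to a teacher's via KL divergence, and the teacher features are regressed directly. The teacher inputs stand in for the fused, single-stream-enhanced features; the temperature, weighting, and equal feature dimensions are simplifying assumptions rather than MCAD's exact objective.

```python
import torch
import torch.nn.functional as F

def distillation_loss(img_s, txt_s, img_t, txt_t, tau: float = 0.05, beta: float = 1.0):
    """Distill both the image-text similarity distribution (KL over softmax
    similarities) and the features themselves from teacher to student."""
    img_s, txt_s = F.normalize(img_s, dim=-1), F.normalize(txt_s, dim=-1)
    img_t, txt_t = F.normalize(img_t, dim=-1), F.normalize(txt_t, dim=-1)

    logits_s = img_s @ txt_s.t() / tau                 # (B, B) student similarities
    logits_t = img_t @ txt_t.t() / tau                 # (B, B) teacher similarities
    dist_loss = F.kl_div(F.log_softmax(logits_s, dim=-1),
                         F.softmax(logits_t, dim=-1), reduction="batchmean")
    feat_loss = F.mse_loss(img_s, img_t) + F.mse_loss(txt_s, txt_t)
    return dist_loss + beta * feat_loss

if __name__ == "__main__":
    B, D = 8, 64
    loss = distillation_loss(torch.randn(B, D), torch.randn(B, D),
                             torch.randn(B, D), torch.randn(B, D))
    print(float(loss))
```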
+
+ comment: Accepted by NAACL 2024 Findings +
+
+
+
+
+ + ♻ ☆ Efficient End-to-End Visual Document Understanding with Rationale + Distillation NAACL 2024 + + +
+ Understanding visually situated language requires interpreting complex +layouts of textual and visual elements. Pre-processing tools, such as optical +character recognition (OCR), can map document image inputs to textual tokens, +then large language models (LLMs) can reason over text. However, such methods +have high computational and engineering complexity. Can small pretrained +image-to-text models accurately understand visual documents through similar +recognition and reasoning steps instead? We propose Rationale Distillation +(RD), which incorporates the outputs of OCR tools, LLMs, and larger multimodal +models as intermediate "rationales", and trains a small student model to +predict both rationales and answers. On three visual document understanding +benchmarks representing infographics, scanned documents, and figures, our +Pix2Struct (282M parameters) student model finetuned with RD outperforms the +base model by 4-5% absolute accuracy with only 1% higher computational cost. + +
+
+ comment: Accepted by NAACL 2024 +
+
+
+
+
+ + ♻ ☆ OSCaR: Object State Captioning and State Change Representation NAACL 2024 + + +
+ The capability of intelligent models to extrapolate and comprehend changes in +object states is a crucial yet demanding aspect of AI research, particularly +through the lens of human interaction in real-world settings. This task +involves describing complex visual environments, identifying active objects, +and interpreting their changes as conveyed through language. Traditional +methods, which isolate object captioning and state change detection, offer a +limited view of dynamic environments. Moreover, relying on a small set of +symbolic words to represent changes has restricted the expressiveness of the +language. To address these challenges, in this paper, we introduce the Object +State Captioning and State Change Representation (OSCaR) dataset and benchmark. +OSCaR consists of 14,084 annotated video segments with nearly 1,000 unique +objects from various egocentric video collections. It sets a new testbed for +evaluating multimodal large language models (MLLMs). Our experiments +demonstrate that while MLLMs show some skill, they lack a full understanding of +object state changes. The benchmark includes a fine-tuned model that, despite +initial capabilities, requires significant improvements in accuracy and +generalization ability for effective understanding of these changes. Our code +and dataset are available at https://github.com/nguyennm1024/OSCaR. + +
+
+ comment: NAACL 2024 +
+
+
+
+
+ + ♻ ☆ SeiT++: Masked Token Modeling Improves Storage-efficient Training + + +
+ Recent advancements in Deep Neural Network (DNN) models have significantly +improved performance across computer vision tasks. However, achieving highly +generalizable and high-performing vision models requires expansive datasets, +resulting in significant storage requirements. This storage challenge is a +critical bottleneck for scaling up models. A recent breakthrough by SeiT +proposed the use of Vector-Quantized (VQ) feature vectors (i.e., tokens) as +network inputs for vision classification. This approach achieved 90% of the +performance of a model trained on full-pixel images with only 1% of the +storage. While SeiT needs labeled data, its potential in scenarios beyond fully +supervised learning remains largely untapped. In this paper, we extend SeiT by +integrating Masked Token Modeling (MTM) for self-supervised pre-training. +Recognizing that self-supervised approaches often demand more data due to the +lack of labels, we introduce TokenAdapt and ColorAdapt. These methods +facilitate comprehensive token-friendly data augmentation, effectively +addressing the increased data requirements of self-supervised learning. We +evaluate our approach across various scenarios, including storage-efficient +ImageNet-1k classification, fine-grained classification, ADE-20k semantic +segmentation, and robustness benchmarks. Experimental results demonstrate +consistent performance improvement in diverse experiments, validating the +effectiveness of our method. Code is available at +https://github.com/naver-ai/tokenadapt. + +
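A minimal sketch of masked token modeling on VQ token ids: a random subset of tokens is replaced with a [MASK] id and the model is trained to predict the originals at the masked positions. The toy model, mask ratio, and interface are illustrative assumptions, not the SeiT++ setup (TokenAdapt and ColorAdapt are omitted).

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def masked_token_modeling_step(model: nn.Module, tokens: torch.Tensor,
                               vocab_size: int, mask_ratio: float = 0.5):
    """One masked-token-modeling step on VQ token ids of shape (B, L): corrupt
    a random subset with a [MASK] id and compute cross-entropy against the
    original ids at the masked positions only. `model` is assumed to map
    (B, L) ids to (B, L, vocab_size + 1) logits."""
    mask_id = vocab_size                                   # extra id for [MASK]
    mask = torch.rand_like(tokens, dtype=torch.float) < mask_ratio
    corrupted = torch.where(mask, torch.full_like(tokens, mask_id), tokens)
    logits = model(corrupted)                              # (B, L, vocab_size + 1)
    return F.cross_entropy(logits[mask], tokens[mask])

if __name__ == "__main__":
    vocab = 1024
    toy = nn.Sequential(nn.Embedding(vocab + 1, 128), nn.Linear(128, vocab + 1))
    ids = torch.randint(0, vocab, (4, 196))                # 14x14 token grid
    print(float(masked_token_modeling_step(toy, ids, vocab)))
```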
+
+ comment: First two authors contributed equally +
+
+
+
+
+ + ♻ ☆ Is Synthetic Image Useful for Transfer Learning? An Investigation into + Data Generation, Volume, and Utilization ICLR24 + + +
+ Synthetic image data generation represents a promising avenue for training +deep learning models, particularly in the realm of transfer learning, where +obtaining real images within a specific domain can be prohibitively expensive +due to privacy and intellectual property considerations. This work delves into +the generation and utilization of synthetic images derived from text-to-image +generative models in facilitating transfer learning paradigms. Despite the high +visual fidelity of the generated images, we observe that their naive +incorporation into existing real-image datasets does not consistently enhance +model performance due to the inherent distribution gap between synthetic and +real images. To address this issue, we introduce a novel two-stage framework +called bridged transfer, which initially employs synthetic images for +fine-tuning a pre-trained model to improve its transferability and subsequently +uses real data for rapid adaptation. Alongside, we propose a dataset style +inversion strategy to improve the stylistic alignment between synthetic and +real images. Our proposed methods are evaluated across 10 different datasets +and 5 distinct models, demonstrating consistent improvements, with up to a 30% +accuracy increase on classification tasks. Intriguingly, we note that the +enhancements were not yet saturated, indicating that the benefits may further +increase with an expanded volume of synthetic data. +
+
+ comment: ICLR24 Score 6865 https://openreview.net/forum?id=CjPt1AC6w0 +
+
+
+
+
+ + ♻ ☆ Gemini: A Family of Highly Capable Multimodal Models + + +
+ This report introduces a new family of multimodal models, Gemini, that +exhibit remarkable capabilities across image, audio, video, and text +understanding. The Gemini family consists of Ultra, Pro, and Nano sizes, +suitable for applications ranging from complex reasoning tasks to on-device +memory-constrained use-cases. Evaluation on a broad range of benchmarks shows +that our most-capable Gemini Ultra model advances the state of the art in 30 of +32 of these benchmarks - notably being the first model to achieve human-expert +performance on the well-studied exam benchmark MMLU, and improving the state of +the art in every one of the 20 multimodal benchmarks we examined. We believe +that the new capabilities of the Gemini family in cross-modal reasoning and +language understanding will enable a wide variety of use cases. We discuss our +approach toward post-training and deploying Gemini models responsibly to users +through services including Gemini, Gemini Advanced, Google AI Studio, and Cloud +Vertex AI. + +
+
+
+
+
+ + ♻ ☆ Large Language Models are Good Prompt Learners for Low-Shot Image + Classification CVPR 2024 + + +
+ Low-shot image classification, where training images are limited or +inaccessible, has benefited from recent progress on pre-trained vision-language +(VL) models with strong generalizability, e.g. CLIP. Prompt learning methods +built with VL models generate text features from the class names that only have +confined class-specific information. Large Language Models (LLMs), with their +vast encyclopedic knowledge, emerge as the complement. Thus, in this paper, we +discuss the integration of LLMs to enhance pre-trained VL models, specifically +on low-shot classification. However, the domain gap between language and vision +blocks the direct application of LLMs. Thus, we propose LLaMP, Large Language +Models as Prompt learners, that produces adaptive prompts for the CLIP text +encoder, establishing it as the connecting bridge. Experiments show that, +compared with other state-of-the-art prompt learning methods, LLaMP yields +better performance on both zero-shot generalization and few-shot image +classification, over a spectrum of 11 datasets. Code will be made available at: +https://github.com/zhaohengz/LLaMP. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ EarthNets: Empowering AI in Earth Observation + + +
+ Earth observation (EO), aiming at monitoring the state of planet Earth using +remote sensing data, is critical for improving our daily lives and living +environment. With a growing number of satellites in orbit, an increasing number +of datasets with diverse sensors and research domains are being published to +facilitate the research of the remote sensing community. This paper presents a +comprehensive review of more than 500 publicly available datasets, including +research domains like agriculture, land use and land cover, disaster +monitoring, scene understanding, vision-language models, foundation models, +climate change, and weather forecasting. We systematically analyze these EO +datasets from four aspects: volume, resolution distributions, research domains, +and the correlation between datasets. Based on the dataset attributes, we +propose to measure, rank, and select datasets to build a new benchmark for +model evaluation. Furthermore, a new platform for EO, termed EarthNets, is +released to achieve a fair and consistent evaluation of deep learning methods +on remote sensing data. EarthNets supports standard dataset libraries and +cutting-edge deep learning models to bridge the gap between the remote sensing +and machine learning communities. Based on this platform, extensive +deep-learning methods are evaluated on the new benchmark. The insightful +results are beneficial to future research. The platform and dataset collections +are publicly available at https://earthnets.github.io. +
+
+ comment: 30 pages +
+
+
+
+
+ + ♻ ☆ Visual Anagrams: Generating Multi-View Optical Illusions with Diffusion + Models CVPR 2024 + + +
+ We address the problem of synthesizing multi-view optical illusions: images +that change appearance upon a transformation, such as a flip or rotation. We +propose a simple, zero-shot method for obtaining these illusions from +off-the-shelf text-to-image diffusion models. During the reverse diffusion +process, we estimate the noise from different views of a noisy image, and then +combine these noise estimates together and denoise the image. A theoretical +analysis suggests that this method works precisely for views that can be +written as orthogonal transformations, of which permutations are a subset. This +leads to the idea of a visual anagram--an image that changes appearance under +some rearrangement of pixels. This includes rotations and flips, but also more +exotic pixel permutations such as a jigsaw rearrangement. Our approach also +naturally extends to illusions with more than two views. We provide both +qualitative and quantitative results demonstrating the effectiveness and +flexibility of our method. Please see our project webpage for additional +visualizations and results: https://dangeng.github.io/visual_anagrams/ + +
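+ The core denoising step described above can be sketched as follows: the noise is estimated from each view of the noisy image, mapped back by the inverse transformation, and averaged. The model signature and the example views (identity and a 180-degree rotation) are assumptions for illustration.
+```python
+import torch
+
+def combined_noise_estimate(model, x_t, t, prompts, views, inverse_views):
+    """Average the per-view noise estimates in a shared pixel frame.
+
+    model(x, t, prompt) is assumed to return an epsilon prediction shaped like x;
+    views / inverse_views are orthogonal pixel transforms and their inverses.
+    """
+    eps = torch.zeros_like(x_t)
+    for view, inv_view, prompt in zip(views, inverse_views, prompts):
+        eps = eps + inv_view(model(view(x_t), t, prompt))   # estimate, map back, accumulate
+    return eps / len(views)
+
+# Two-view anagram example: identity and a 180-degree rotation.
+views = [lambda x: x, lambda x: torch.rot90(x, 2, dims=(-2, -1))]
+inverse_views = [lambda x: x, lambda x: torch.rot90(x, -2, dims=(-2, -1))]
+```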
+
+ comment: CVPR 2024 camera ready +
+
+
+
+
+ + ♻ ☆ SignAvatars: A Large-scale 3D Sign Language Holistic Motion Dataset and + Benchmark + + +
+ We present SignAvatars, the first large-scale, multi-prompt 3D sign language +(SL) motion dataset designed to bridge the communication gap for Deaf and +hard-of-hearing individuals. While there has been an exponentially growing +body of research on digital communication, the majority of existing +communication technologies primarily cater to spoken or written languages, +instead of SL, the essential communication method for Deaf and hard-of-hearing +communities. Existing SL datasets, dictionaries, and sign language production +(SLP) methods are typically limited to 2D as annotating 3D models and avatars +for SL is usually an entirely manual and labor-intensive process conducted by +SL experts, often resulting in unnatural avatars. In response to these +challenges, we compile and curate the SignAvatars dataset, which comprises +70,000 videos from 153 signers, totaling 8.34 million frames, covering both +isolated signs and continuous, co-articulated signs, with multiple prompts +including HamNoSys, spoken language, and words. To yield 3D holistic +annotations, including meshes and biomechanically-valid poses of body, hands, +and face, as well as 2D and 3D keypoints, we introduce an automated annotation +pipeline operating on our large corpus of SL videos. SignAvatars facilitates +various tasks such as 3D sign language recognition (SLR) and the novel 3D SL +production (SLP) from diverse inputs like text scripts, individual words, and +HamNoSys notation. Hence, to evaluate the potential of SignAvatars, we further +propose a unified benchmark of 3D SL holistic motion production. We believe +that this work is a significant step towards bringing the digital world +to the Deaf and hard-of-hearing communities as well as people interacting with +them. +
+
+ comment: 14 pages; Project page available at https://signavatars.github.io/ +
+
+
+
+
+ + ♻ ☆ CFIR: Fast and Effective Long-Text To Image Retrieval for Large Corpora + + +
+ Text-to-image retrieval aims to find the relevant images based on a text +query, which is important in various use-cases, such as digital libraries, +e-commerce, and multimedia databases. Although Multimodal Large Language Models +(MLLMs) demonstrate state-of-the-art performance, they exhibit limitations in +handling large-scale, diverse, and ambiguous real-world needs of retrieval, due +to the computation cost and the injective embeddings they produce. This paper +presents a two-stage Coarse-to-Fine Index-shared Retrieval (CFIR) framework, +designed for fast and effective large-scale long-text to image retrieval. The +first stage, Entity-based Ranking (ER), adapts to long-text query ambiguity by +employing a multiple-queries-to-multiple-targets paradigm, facilitating +candidate filtering for the next stage. The second stage, Summary-based +Re-ranking (SR), refines these rankings using summarized queries. We also +propose a specialized Decoupling-BEiT-3 encoder, optimized for handling +ambiguous user needs and both stages, which also enhances computational +efficiency through vector-based similarity inference. Evaluation on the AToMiC +dataset reveals that CFIR surpasses existing MLLMs by up to 11.06% in +Recall@1000, while reducing training and retrieval times by 68.75% and 99.79%, +respectively. We will release our code to facilitate future research at +https://github.com/longkukuhi/CFIR. + +
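+ A rough sketch of the two-stage coarse-to-fine retrieval over a shared image index: entity embeddings extracted from the long query produce a candidate pool, which a summarized-query embedding then re-ranks. Array shapes, the candidate pool size, and the scoring below are simplified assumptions, not the released pipeline.
+```python
+import numpy as np
+
+def coarse_to_fine_retrieve(entity_embs, summary_emb, image_index, k_coarse=1000, k_final=10):
+    """entity_embs: (E, D) embeddings of entities from the long query;
+    summary_emb: (D,) embedding of the summarized query;
+    image_index: (N, D) L2-normalized image embeddings shared by both stages."""
+    # Stage 1: Entity-based Ranking -- each entity votes for its closest images.
+    coarse_scores = entity_embs @ image_index.T                    # (E, N)
+    candidate_ids = np.unique(np.argsort(-coarse_scores, axis=1)[:, :k_coarse])
+    # Stage 2: Summary-based Re-ranking, restricted to the surviving candidates.
+    fine_scores = image_index[candidate_ids] @ summary_emb         # (C,)
+    order = np.argsort(-fine_scores)[:k_final]
+    return candidate_ids[order]
+```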
+
+
+
+
+ + ♻ ☆ MetaCloak: Preventing Unauthorized Subject-driven Text-to-image + Diffusion-based Synthesis via Meta-learning CVPR 2024 + + +
+ Text-to-image diffusion models allow seamless generation of personalized +images from scant reference photos. Yet, these tools, in the wrong hands, can +fabricate misleading or harmful content, endangering individuals. To address +this problem, existing poisoning-based approaches perturb user images in an +imperceptible way to render them "unlearnable" from malicious uses. We identify +two limitations of these defending approaches: i) sub-optimal due to the +hand-crafted heuristics for solving the intractable bilevel optimization and +ii) lack of robustness against simple data transformations like Gaussian +filtering. To solve these challenges, we propose MetaCloak, which solves the +bi-level poisoning problem with a meta-learning framework with an additional +transformation sampling process to craft transferable and robust perturbation. +Specifically, we employ a pool of surrogate diffusion models to craft +transferable and model-agnostic perturbation. Furthermore, by incorporating an +additional transformation process, we design a simple denoising-error +maximization loss that is sufficient for causing transformation-robust semantic +distortion and degradation in a personalized generation. Extensive experiments +on the VGGFace2 and CelebA-HQ datasets show that MetaCloak outperforms existing +approaches. Notably, MetaCloak can successfully fool online training services +like Replicate, in a black-box manner, demonstrating the effectiveness of +MetaCloak in real-world scenarios. Our code is available at +https://github.com/liuyixin-louis/MetaCloak. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ A Novel Benchmark for Few-Shot Semantic Segmentation in the Era of + Foundation Models + + +
+ In recent years, the rapid evolution of computer vision has seen the +emergence of various foundation models, each tailored to specific data types +and tasks. In this study, we explore the adaptation of these models for +few-shot semantic segmentation. Specifically, we conduct a comprehensive +comparative analysis of four prominent foundation models (DINO V2, Segment +Anything, CLIP, and Masked AutoEncoders), as well as a straightforward ResNet50 +pre-trained on the COCO dataset. We also include 5 adaptation methods, ranging +from linear probing to fine-tuning. Our findings show that DINO V2 outperforms +other models by a large margin, across various datasets and adaptation methods. +On the other hand, adaptation methods provide little discrepancy in the +obtained results, suggesting that a simple linear probing can compete with +advanced, more computationally intensive alternatives. +
+
+
+
+
+ + ♻ ☆ Readout Guidance: Learning Control from Diffusion Features CVPR 2024 + + +
+ We present Readout Guidance, a method for controlling text-to-image diffusion +models with learned signals. Readout Guidance uses readout heads, lightweight +networks trained to extract signals from the features of a pre-trained, frozen +diffusion model at every timestep. These readouts can encode single-image +properties, such as pose, depth, and edges; or higher-order properties that +relate multiple images, such as correspondence and appearance similarity. +Furthermore, by comparing the readout estimates to a user-defined target, and +back-propagating the gradient through the readout head, these estimates can be +used to guide the sampling process. Compared to prior methods for conditional +generation, Readout Guidance requires significantly fewer added parameters and +training samples, and offers a convenient and simple recipe for reproducing +different forms of conditional control under a single framework, with a single +architecture and sampling procedure. We showcase these benefits in the +applications of drag-based manipulation, identity-consistent generation, and +spatially aligned control. Project page: https://readout-guidance.github.io. + +
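+ One guidance step of the kind described above might look like the following sketch: a readout head maps frozen diffusion features to the controlled property, and the gradient of a loss against the user-defined target nudges the sample. The function signatures are illustrative assumptions, not the released API.
+```python
+import torch
+import torch.nn.functional as F
+
+def readout_guided_step(x_t, t, frozen_unet, readout_head, target, scale=1.0):
+    """Steer the sample so the readout matches a user-defined target.
+
+    frozen_unet(x, t) is assumed to return (eps_pred, features); readout_head
+    maps features to the controlled property (pose, depth, correspondence, ...).
+    """
+    x = x_t.detach().requires_grad_(True)
+    eps_pred, feats = frozen_unet(x, t)
+    loss = F.mse_loss(readout_head(feats), target)      # compare readout to target
+    grad = torch.autograd.grad(loss, x)[0]              # back-propagate through the head
+    return x_t - scale * grad, eps_pred                 # nudged sample + noise estimate
+```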
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Omni-SMoLA: Boosting Generalist Multimodal Models with Soft Mixture of + Low-rank Experts + + +
+ Large multi-modal models (LMMs) exhibit remarkable performance across +numerous tasks. However, generalist LMMs often suffer from performance +degradation when tuned over a large collection of tasks. Recent research +suggests that Mixture of Experts (MoE) architectures are useful for instruction +tuning, but for LMMs of parameter size around O(50-100B), the prohibitive cost +of replicating and storing the expert models severely limits the number of +experts we can use. We propose Omni-SMoLA, an architecture that uses the Soft +MoE approach to (softly) mix many multimodal low rank experts, and avoids +introducing a significant number of new parameters compared to conventional MoE +models. The core intuition here is that the large model provides a foundational +backbone, while different lightweight experts residually learn specialized +knowledge, either per-modality or multimodally. Extensive experiments +demonstrate that the SMoLA approach helps improve the generalist performance +across a broad range of generative vision-and-language tasks, achieving new +SoTA generalist performance that often matches or outperforms single +specialized LMM baselines, as well as new SoTA specialist performance. + +
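+ A compact sketch of one way to realize a soft mixture of low-rank experts on top of a frozen linear layer is given below; the router design, expert count, rank, and initialization are assumptions rather than the paper's exact architecture.
+```python
+import torch
+import torch.nn as nn
+
+class SoftMoLoRA(nn.Module):
+    """A frozen linear layer plus a softly mixed set of low-rank experts."""
+    def __init__(self, d_in, d_out, n_experts=4, rank=8):
+        super().__init__()
+        self.base = nn.Linear(d_in, d_out)
+        for p in self.base.parameters():
+            p.requires_grad_(False)                 # the backbone stays frozen
+        self.router = nn.Linear(d_in, n_experts)    # produces the soft mixing weights
+        self.down = nn.Parameter(torch.randn(n_experts, d_in, rank) * 0.02)
+        self.up = nn.Parameter(torch.zeros(n_experts, rank, d_out))
+    def forward(self, x):                           # x: (B, d_in)
+        w = torch.softmax(self.router(x), dim=-1)                   # (B, E)
+        delta = torch.einsum('bi,eir,ero->beo', x, self.down, self.up)
+        return self.base(x) + torch.einsum('be,beo->bo', w, delta)
+```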
+
+
+
+
+ + ♻ ☆ Computational limits to the legibility of the imaged human brain + + +
+ Our knowledge of the organisation of the human brain at the population-level +is yet to translate into power to predict functional differences at the +individual-level, limiting clinical applications, and casting doubt on the +generalisability of inferred mechanisms. It remains unknown whether the +difficulty arises from the absence of individuating biological patterns within +the brain, or from limited power to access them with the models and compute at +our disposal. Here we comprehensively investigate the resolvability of such +patterns with data and compute at unprecedented scale. Across 23 810 unique +participants from UK Biobank, we systematically evaluate the predictability of +25 individual biological characteristics, from all available combinations of +structural and functional neuroimaging data. Over 4526 GPU hours of +computation, we train, optimize, and evaluate out-of-sample 700 individual +predictive models, including fully-connected feed-forward neural networks of +demographic, psychological, serological, chronic disease, and functional +connectivity characteristics, and both uni- and multi-modal 3D convolutional +neural network models of macro- and micro-structural brain imaging. We find a +marked discrepancy between the high predictability of sex (balanced accuracy +99.7%), age (mean absolute error 2.048 years, R2 0.859), and weight (mean +absolute error 2.609Kg, R2 0.625), for which we set new state-of-the-art +performance, and the surprisingly low predictability of other characteristics. +Neither structural nor functional imaging predicted psychology better than the +coincidence of chronic disease (p<0.05). Serology predicted chronic disease +(p<0.05) and was best predicted by it (p<0.001), followed by structural +neuroimaging (p<0.05). Our findings suggest either more informative imaging or +more powerful models are needed to decipher individual level characteristics +from the human brain. + +
+
+ comment: 38 pages, 6 figures, 1 table, 2 supplementary figures, 1 + supplementary table +
+
+
+
+
+ + ♻ ☆ Diffusion 3D Features (Diff3F): Decorating Untextured Shapes with + Distilled Semantic Features CVPR'24 + + +
+ We present Diff3F as a simple, robust, and class-agnostic feature descriptor +that can be computed for untextured input shapes (meshes or point clouds). Our +method distills diffusion features from image foundational models onto input +shapes. Specifically, we use the input shapes to produce depth and normal maps +as guidance for conditional image synthesis. In the process, we produce +(diffusion) features in 2D that we subsequently lift and aggregate on the +original surface. Our key observation is that even if the conditional image +generations obtained from multi-view rendering of the input shapes are +inconsistent, the associated image features are robust and, hence, can be +directly aggregated across views. This produces semantic features on the input +shapes, without requiring additional data or training. We perform extensive +experiments on multiple benchmarks (SHREC'19, SHREC'20, FAUST, and TOSCA) and +demonstrate that our features, being semantic instead of geometric, produce +reliable correspondence across both isometric and non-isometrically related +shape families. Code is available via the project page at +https://diff3f.github.io/ + +
+
+ comment: Accepted at CVPR'24 +
+
+
+
+
+ + ♻ ☆ UniBEV: Multi-modal 3D Object Detection with Uniform BEV Encoders for + Robustness against Missing Sensor Modalities + + +
+ Multi-sensor object detection is an active research topic in automated +driving, but the robustness of such detection models against missing sensor +input (modality missing), e.g., due to a sudden sensor failure, is a critical +problem which remains under-studied. In this work, we propose UniBEV, an +end-to-end multi-modal 3D object detection framework designed for robustness +against missing modalities: UniBEV can operate on LiDAR plus camera input, but +also on LiDAR-only or camera-only input without retraining. To facilitate its +detector head to handle different input combinations, UniBEV aims to create +well-aligned Bird's Eye View (BEV) feature maps from each available modality. +Unlike prior BEV-based multi-modal detection methods, all sensor modalities +follow a uniform approach to resample features from the native sensor +coordinate systems to the BEV features. We furthermore investigate the +robustness of various fusion strategies w.r.t. missing modalities: the commonly +used feature concatenation, but also channel-wise averaging, and a +generalization to weighted averaging termed Channel Normalized Weights. To +validate its effectiveness, we compare UniBEV to state-of-the-art BEVFusion and +MetaBEV on nuScenes over all sensor input combinations. In this setting, UniBEV +achieves $52.5 \%$ mAP on average over all input combinations, significantly +improving over the baselines ($43.5 \%$ mAP on average for BEVFusion, $48.7 \%$ +mAP on average for MetaBEV). An ablation study shows the robustness benefits of +fusing by weighted averaging over regular concatenation, and of sharing queries +between the BEV encoders of each modality. Our code will be released upon paper +acceptance. + +
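+ The Channel Normalized Weights fusion discussed above can be sketched as learnable per-channel weights that are softmax-normalized over whichever modalities are actually present, as in the illustrative module below (shapes and interface are assumptions).
+```python
+import torch
+import torch.nn as nn
+
+class ChannelNormalizedWeights(nn.Module):
+    """Fuse per-modality BEV maps with channel-wise weights normalized over the
+    available modalities, so missing sensors simply drop out of the average."""
+    def __init__(self, n_modalities, channels):
+        super().__init__()
+        self.logits = nn.Parameter(torch.zeros(n_modalities, channels))
+    def forward(self, bev_maps, available):
+        # bev_maps: list of (B, C, H, W) tensors or None; available: list of bools.
+        idx = [i for i, ok in enumerate(available) if ok]
+        w = torch.softmax(self.logits[idx], dim=0)              # (M_avail, C)
+        return sum(w[j].view(1, -1, 1, 1) * bev_maps[i] for j, i in enumerate(idx))
+```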
+
+ comment: Accepted by IEEE Intelligent Vehicles Symposium (IV 2024) +
+
+
+
+
+ + ♻ ☆ Understanding Video Transformers via Universal Concept Discovery CVPR 2024 + + +
+ This paper studies the problem of concept-based interpretability of +transformer representations for videos. Concretely, we seek to explain the +decision-making process of video transformers based on high-level, +spatiotemporal concepts that are automatically discovered. Prior research on +concept-based interpretability has concentrated solely on image-level tasks. +Comparatively, video models deal with the added temporal dimension, increasing +complexity and posing challenges in identifying dynamic concepts over time. In +this work, we systematically address these challenges by introducing the first +Video Transformer Concept Discovery (VTCD) algorithm. To this end, we propose +an efficient approach for unsupervised identification of units of video +transformer representations - concepts, and ranking their importance to the +output of a model. The resulting concepts are highly interpretable, revealing +spatio-temporal reasoning mechanisms and object-centric representations in +unstructured video models. Performing this analysis jointly over a diverse set +of supervised and self-supervised representations, we discover that some of +these mechanisms are universal in video transformers. Finally, we show that VTCD +can be used for fine-grained action recognition and video object segmentation. +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Limitations of Data-Driven Spectral Reconstruction -- Optics-Aware + Analysis and Mitigation + + +
+ Hyperspectral imaging empowers machine vision systems with the distinct +capability of identifying materials through recording their spectral +signatures. Recent efforts in data-driven spectral reconstruction aim at +extracting spectral information from RGB images captured by cost-effective RGB +cameras, instead of dedicated hardware. + In this paper we systematically analyze the performance of such methods, +evaluating both the practical limitations with respect to current datasets and +overfitting, as well as fundamental limitations with respect to the nature of +the information encoded in the RGB images, and the dependency of this +information on the optical system of the camera. + We find that the current models are not robust under slight variations, +e.g., in noise level or compression of the RGB file. Without modeling +underrepresented spectral content, existing datasets and the models trained on +them are limited in their ability to cope with challenging metameric colors. To +mitigate this issue, we propose to exploit the combination of metameric data +augmentation and optical lens aberrations to improve the encoding of the +metameric information into the RGB image, which paves the road towards higher +performing spectral imaging and reconstruction approaches. +
+
+ comment: 12 pages, 7 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ Language-Informed Visual Concept Learning ICLR 2024 + + +
+ Our understanding of the visual world is centered around various concept +axes, characterizing different aspects of visual entities. While different +concept axes can be easily specified by language, e.g. color, the exact visual +nuances along each axis often exceed the limitations of linguistic +articulations, e.g. a particular style of painting. In this work, our goal is +to learn a language-informed visual concept representation, by simply +distilling large pre-trained vision-language models. Specifically, we train a +set of concept encoders to encode the information pertinent to a set of +language-informed concept axes, with an objective of reproducing the input +image through a pre-trained Text-to-Image (T2I) model. To encourage better +disentanglement of different concept encoders, we anchor the concept embeddings +to a set of text embeddings obtained from a pre-trained Visual Question +Answering (VQA) model. At inference time, the model extracts concept embeddings +along various axes from new test images, which can be remixed to generate +images with novel compositions of visual concepts. With a lightweight test-time +finetuning procedure, it can also generalize to novel concepts unseen at +training. + +
+
+ comment: ICLR 2024. The first two authors contributed equally and are + alphabetically ordered. Project page: + https://ai.stanford.edu/~yzzhang/projects/concept-axes/ +
+
+
+
+
+ + ♻ ☆ TAO-Amodal: A Benchmark for Tracking Any Object Amodally + + +
+ Amodal perception, the ability to comprehend complete object structures from +partial visibility, is a fundamental skill, even for infants. Its significance +extends to applications like autonomous driving, where a clear understanding of +heavily occluded objects is essential. However, modern detection and tracking +algorithms often overlook this critical capability, perhaps due to the +prevalence of \textit{modal} annotations in most benchmarks. To address the +scarcity of amodal benchmarks, we introduce TAO-Amodal, featuring 833 diverse +categories in thousands of video sequences. Our dataset includes +\textit{amodal} and modal bounding boxes for visible and partially or fully +occluded objects, including those that are partially out of the camera frame. +We investigate the current lay of the land in both amodal tracking and +detection by benchmarking state-of-the-art modal trackers and amodal +segmentation methods. We find that existing methods, even when adapted for +amodal tracking, struggle to detect and track objects under heavy occlusion. To +mitigate this, we explore simple finetuning schemes that can increase the +amodal tracking and detection metrics of occluded objects by 2.1\% and 3.3\%. + +
+
+ comment: Project Page: https://tao-amodal.github.io +
+
+
+
+
+ + ♻ ☆ Exploiting Diffusion Prior for Generalizable Dense Prediction CVPR 2024 + + +
+ Contents generated by recent advanced Text-to-Image (T2I) diffusion models +are sometimes too imaginative for existing off-the-shelf dense predictors to +estimate due to the immitigable domain gap. We introduce DMP, a pipeline +utilizing pre-trained T2I models as a prior for dense prediction tasks. To +address the misalignment between deterministic prediction tasks and stochastic +T2I models, we reformulate the diffusion process through a sequence of +interpolations, establishing a deterministic mapping between input RGB images +and output prediction distributions. To preserve generalizability, we use +low-rank adaptation to fine-tune pre-trained models. Extensive experiments +across five tasks, including 3D property estimation, semantic segmentation, and +intrinsic image decomposition, showcase the efficacy of the proposed method. +Despite limited-domain training data, the approach yields faithful estimations +for arbitrary images, surpassing existing state-of-the-art algorithms. + +
+
+ comment: To appear in CVPR 2024. Project page: https://shinying.github.io/dmp +
+
+
+
+
+ + ♻ ☆ High-performance real-world optical computing trained by in situ + model-free optimization + + +
+ Optical computing systems provide high-speed and low-energy data processing +but face deficiencies in computationally demanding training and +simulation-to-reality gaps. We propose a gradient-based model-free optimization +(G-MFO) method based on a Monte Carlo gradient estimation algorithm for +computationally efficient in situ training of optical computing systems. This +approach treats an optical computing system as a black box and back-propagates +the loss directly to the optical computing weights' probability distributions, +circumventing the need for a computationally heavy and biased system +simulation. Our experiments on diffractive optical computing systems show that +G-MFO outperforms hybrid training on the MNIST and FMNIST datasets. +Furthermore, we demonstrate image-free and high-speed classification of cells +from their marker-free phase maps. Our method's model-free and high-performance +nature, combined with its low demand for computational resources, paves the way +for accelerating the transition of optical computing from laboratory +demonstrations to practical, real-world applications. + +
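+ The model-free training idea can be sketched with a simple score-function (REINFORCE-style) estimator: optical weights are sampled from a Gaussian, the physical system returns a loss, and the distribution mean is updated without any simulated gradient. This is a generic Monte Carlo gradient sketch, not the authors' exact G-MFO estimator.
+```python
+import numpy as np
+
+def mfo_update(mu, sigma, black_box_loss, n_samples=32, lr=0.05):
+    """One model-free update of the weight distribution's mean.
+
+    black_box_loss(w) evaluates the real optical system with weights w and
+    returns a scalar loss; no simulation or analytic gradient is needed.
+    """
+    samples = mu + sigma * np.random.randn(n_samples, mu.size)
+    losses = np.array([black_box_loss(w) for w in samples])
+    baseline = losses.mean()                        # variance-reduction baseline
+    # Score-function estimate of d E[loss] / d mu for a Gaussian sampling distribution.
+    grad_mu = ((losses - baseline)[:, None] * (samples - mu) / sigma**2).mean(axis=0)
+    return mu - lr * grad_mu
+```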
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 207 + +
+
+
+ + ☆ On Train-Test Class Overlap and Detection for Image Retrieval CVPR2024 + + +
+ How important is it for training and evaluation sets to not have class +overlap in image retrieval? We revisit Google Landmarks v2 clean, the most +popular training set, by identifying and removing class overlap with Revisited +Oxford and Paris [34], the most popular evaluation set. By comparing the +original and the new RGLDv2-clean on a benchmark of reproduced state-of-the-art +methods, our findings are striking. Not only is there a dramatic drop in +performance, but it is inconsistent across methods, changing the ranking. What +does it take to focus on objects of interest and ignore background clutter when +indexing? Do we need to train an object detector and the representation +separately? Do we need location supervision? We introduce Single-stage +Detect-to-Retrieve (CiDeR), an end-to-end, single-stage pipeline to detect +objects of interest and extract a global image representation. We outperform +previous state-of-the-art on both existing training sets and the new +RGLDv2-clean. Our dataset is available at +https://github.com/dealicious-inc/RGLDv2-clean. +
+
+ comment: CVPR2024 Accepted +
+
+
+
+
+ + ☆ Temporally Consistent Unbalanced Optimal Transport for Unsupervised + Action Segmentation CVPR 2024 + + +
+ We propose a novel approach to the action segmentation task for long, +untrimmed videos, based on solving an optimal transport problem. By encoding a +temporal consistency prior into a Gromov-Wasserstein problem, we are able to +decode a temporally consistent segmentation from a noisy affinity/matching cost +matrix between video frames and action classes. Unlike previous approaches, our +method does not require knowing the action order for a video to attain temporal +consistency. Furthermore, our resulting (fused) Gromov-Wasserstein problem can +be efficiently solved on GPUs using a few iterations of projected mirror +descent. We demonstrate the effectiveness of our method in an unsupervised +learning setting, where our method is used to generate pseudo-labels for +self-training. We evaluate our segmentation approach and unsupervised learning +pipeline on the Breakfast, 50-Salads, YouTube Instructions and Desktop Assembly +datasets, yielding state-of-the-art results for the unsupervised video action +segmentation task. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Can Biases in ImageNet Models Explain Generalization? CVPR2024 + + +
+ The robust generalization of models to rare, in-distribution (ID) samples +drawn from the long tail of the training distribution and to +out-of-training-distribution (OOD) samples is one of the major challenges of +current deep learning methods. For image classification, this manifests in the +existence of adversarial attacks, the performance drops on distorted images, +and a lack of generalization to concepts such as sketches. The current +understanding of generalization in neural networks is very limited, but some +biases that differentiate models from human vision have been identified and +might be causing these limitations. Consequently, several attempts with varying +success have been made to reduce these biases during training to improve +generalization. We take a step back and sanity-check these attempts. Fixing the +architecture to the well-established ResNet-50, we perform a large-scale study +on 48 ImageNet models obtained via different training methods to understand how +and if these biases - including shape bias, spectral biases, and critical bands +- interact with generalization. Our extensive study results reveal that +contrary to previous findings, these biases are insufficient to accurately +predict the generalization of a model holistically. We provide access to all +checkpoints and evaluation code at +https://github.com/paulgavrikov/biases_vs_generalization + +
+
+ comment: Accepted at CVPR2024 +
+
+
+
+
+ + ☆ MosquitoFusion: A Multiclass Dataset for Real-Time Detection of + Mosquitoes, Swarms, and Breeding Sites Using Deep Learning + + +
+ In this paper, we present an integrated approach to real-time mosquito +detection using our multiclass dataset (MosquitoFusion) containing 1204 diverse +images and leverage cutting-edge technologies, specifically computer vision, to +automate the identification of Mosquitoes, Swarms, and Breeding Sites. The +pre-trained YOLOv8 model, trained on this dataset, achieved a mean Average +Precision (mAP@50) of 57.1%, with precision at 73.4% and recall at 50.5%. The +integration of Geographic Information Systems (GIS) further enriches the depth +of our analysis, providing valuable insights into spatial patterns. The dataset +and code are available at https://github.com/faiyazabdullah/MosquitoFusion. + +
+
+
+
+
+ + ☆ Modality Translation for Object Detection Adaptation Without Forgetting + Prior Knowledge + + +
+ A common practice in deep learning consists of training large neural networks +on massive datasets to perform accurately for different domains and tasks. +While this methodology may work well in numerous application areas, it hardly +applies across modalities due to the larger distribution shift in data captured +using different sensors. This paper focuses on the problem of adapting a large +object detection model to one or multiple modalities while being efficient. To +do so, we propose ModTr as an alternative to the common approach of fine-tuning +large models. ModTr consists of adapting the input with a small transformation +network trained to minimize the detection loss directly. The original model can +therefore work on the translated inputs without any further change or +fine-tuning to its parameters. Experimental results on translating from IR to +RGB images on two well-known datasets show that this simple ModTr approach +provides detectors that can perform comparably or better than the standard +fine-tuning without forgetting the original knowledge. This opens the doors to +a more flexible and efficient service-based detection pipeline in which, +instead of using a different detector for each modality, a unique and unaltered +server is constantly running, where multiple modalities with the corresponding +translations can query it. Code: https://github.com/heitorrapela/ModTr.
+
+
+
+
+ + ☆ SUGAR: Pre-training 3D Visual Representations for Robotics CVPR 2024 + + +
+ Learning generalizable visual representations from Internet data has yielded +promising results for robotics. Yet, prevailing approaches focus on +pre-training 2D representations, being sub-optimal to deal with occlusions and +accurately localize objects in complex 3D scenes. Meanwhile, 3D representation +learning has been limited to single-object understanding. To address these +limitations, we introduce a novel 3D pre-training framework for robotics named +SUGAR that captures semantic, geometric and affordance properties of objects +through 3D point clouds. We underscore the importance of cluttered scenes in 3D +representation learning, and automatically construct a multi-object dataset +benefiting from cost-free supervision in simulation. SUGAR employs a versatile +transformer-based model to jointly address five pre-training tasks, namely +cross-modal knowledge distillation for semantic learning, masked point modeling +to understand geometry structures, grasping pose synthesis for object +affordance, 3D instance segmentation and referring expression grounding to +analyze cluttered scenes. We evaluate our learned representation on three +robotic-related tasks, namely, zero-shot 3D object recognition, referring +expression grounding, and language-driven robotic manipulation. Experimental +results show that SUGAR's 3D representation outperforms state-of-the-art 2D and +3D representations. + +
+
+ comment: Accepted to CVPR 2024. Project webpage: + https://cshizhe.github.io/projects/robot_sugar.html +
+
+
+
+
+ + ☆ QuAD: Query-based Interpretable Neural Motion Planning for Autonomous + Driving + + +
+ A self-driving vehicle must understand its environment to determine the +appropriate action. Traditional autonomy systems rely on object detection to +find the agents in the scene. However, object detection assumes a discrete set +of objects and loses information about uncertainty, so any errors compound when +predicting the future behavior of those agents. Alternatively, dense occupancy +grid maps have been utilized to understand free-space. However, predicting a +grid for the entire scene is wasteful since only certain spatio-temporal +regions are reachable and relevant to the self-driving vehicle. We present a +unified, interpretable, and efficient autonomy framework that moves away from +cascading modules that first perceive, then predict, and finally plan. Instead, +we shift the paradigm to have the planner query occupancy at relevant +spatio-temporal points, restricting the computation to those regions of +interest. Exploiting this representation, we evaluate candidate trajectories +around key factors such as collision avoidance, comfort, and progress for +safety and interpretability. Our approach achieves better highway driving +quality than the state-of-the-art in high-fidelity closed-loop simulations. + +
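+ A toy version of the query-based evaluation described above: occupancy is queried only at the spatio-temporal points along each candidate trajectory, and collision, comfort, and progress terms are combined into a score. The weights, occupancy interface, and cost terms are illustrative assumptions.
+```python
+import numpy as np
+
+def best_trajectory(candidates, query_occupancy, w_col=10.0, w_comfort=1.0, w_prog=1.0):
+    """candidates: (K, T, 2) x-y waypoints; query_occupancy(point, t) -> occupancy prob.
+    Occupancy is queried only at the points the planner actually cares about."""
+    scores = []
+    for traj in candidates:
+        collision = sum(float(query_occupancy(p, t)) for t, p in enumerate(traj))
+        accel = np.diff(traj, n=2, axis=0)          # second difference ~ acceleration
+        comfort = float((accel ** 2).sum())
+        progress = float(np.linalg.norm(traj[-1] - traj[0]))
+        scores.append(-w_col * collision - w_comfort * comfort + w_prog * progress)
+    return int(np.argmax(scores))
+```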
+
+
+
+
+ + ☆ TraveLER: A Multi-LMM Agent Framework for Video Question-Answering + + +
+ Recently, Large Multimodal Models (LMMs) have made significant progress in +video question-answering using a frame-wise approach by leveraging large-scale, +image-based pretraining in a zero-shot manner. While image-based methods for +videos have shown impressive performance, a current limitation is that they +often overlook how key timestamps are selected and cannot adjust when incorrect +timestamps are identified. Moreover, they are unable to extract details +relevant to the question, instead providing general descriptions of the frame. +To overcome this, we design a multi-LMM agent framework that travels along the +video, iteratively collecting relevant information from keyframes through +interactive question-asking until there is sufficient information to answer the +question. Specifically, we propose TraveLER, a model that can create a plan to +"Traverse" through the video, ask questions about individual frames to "Locate" +and store key information, and then "Evaluate" if there is enough information +to answer the question. Finally, if there is not enough information, our method +is able to "Replan" based on its collected knowledge. Through extensive +experiments, we find that the proposed TraveLER approach improves performance +on several video question-answering benchmarks, such as NExT-QA, STAR, and +Perception Test, without the need to fine-tune on specific datasets. + +
+
+
+
+
+ + ☆ Data-Efficient Unsupervised Interpolation Without Any Intermediate Frame + for 4D Medical Images CVPR 2024 + + +
+ 4D medical images, which represent 3D images with temporal information, are +crucial in clinical practice for capturing dynamic changes and monitoring +long-term disease progression. However, acquiring 4D medical images poses +challenges due to factors such as radiation exposure and imaging duration, +necessitating a balance between achieving high temporal resolution and +minimizing adverse effects. Given these circumstances, not only is data +acquisition challenging, but increasing the frame rate for each dataset also +proves difficult. To address this challenge, this paper proposes a simple yet +effective Unsupervised Volumetric Interpolation framework, UVI-Net. This +framework facilitates temporal interpolation without the need for any +intermediate frames, distinguishing it from the majority of other existing +unsupervised methods. Experiments on benchmark datasets demonstrate significant +improvements across diverse evaluation metrics compared to unsupervised and +supervised baselines. Remarkably, our approach achieves this superior +performance even when trained with a dataset as small as one, highlighting its +exceptional robustness and efficiency in scenarios with sparse supervision. +This positions UVI-Net as a compelling alternative for 4D medical imaging, +particularly in settings where data availability is limited. The source code is +available at https://github.com/jungeun122333/UVI-Net. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Finding Regions of Interest in Whole Slide Images Using Multiple + Instance Learning + + +
+ Whole Slide Images (WSI), obtained by high-resolution digital scanning of +microscope slides at multiple scales, are the cornerstone of modern Digital +Pathology. However, they represent a particular challenge to +AI-based/AI-mediated analysis because pathology labeling is typically done at +slide-level, instead of tile-level. It is not just that medical diagnostics is +recorded at the specimen level, the detection of oncogene mutation is also +experimentally obtained, and recorded by initiatives like The Cancer Genome +Atlas (TCGA), at the slide level. This configures a dual challenge: a) +accurately predicting the overall cancer phenotype and b) finding out what +cellular morphologies are associated with it at the tile level. To address +these challenges, a weakly supervised Multiple Instance Learning (MIL) approach +was explored for two prevalent cancer types, Invasive Breast Carcinoma +(TCGA-BRCA) and Lung Squamous Cell Carcinoma (TCGA-LUSC). This approach was +explored for tumor detection at low magnification levels and TP53 mutations at +various levels. Our results show that a novel additive implementation of MIL +matched the performance of the reference implementation (AUC 0.96), and was only +slightly outperformed by Attention MIL (AUC 0.97). More interestingly from the +perspective of the molecular pathologist, these different AI architectures +identify distinct sensitivities to morphological features (through the +detection of Regions of Interest, RoI) at different amplification levels. +Tellingly, TP53 mutation was most sensitive to features at the higher +amplification levels where cellular morphology is resolved. +
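+ For reference, an attention-based MIL pooling head of the kind compared above can be sketched as follows; the feature dimensions and the two-layer attention network are illustrative assumptions, and the per-tile attention weights serve as Region-of-Interest scores.
+```python
+import torch
+import torch.nn as nn
+
+class AttentionMIL(nn.Module):
+    """Slide-level classifier over a bag of tile features; the attention weights
+    highlight which tiles (regions of interest) drive the prediction."""
+    def __init__(self, d_feat=1024, d_attn=256, n_classes=2):
+        super().__init__()
+        self.attn = nn.Sequential(nn.Linear(d_feat, d_attn), nn.Tanh(), nn.Linear(d_attn, 1))
+        self.head = nn.Linear(d_feat, n_classes)
+    def forward(self, tiles):                       # tiles: (N_tiles, d_feat) for one slide
+        a = torch.softmax(self.attn(tiles), dim=0)  # (N_tiles, 1) attention over tiles
+        slide_feat = (a * tiles).sum(dim=0)         # attention-weighted slide descriptor
+        return self.head(slide_feat), a.squeeze(-1) # slide logits + per-tile RoI scores
+```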
+
+
+
+
+ + ☆ Neural Implicit Representation for Building Digital Twins of Unknown + Articulated Objects CVPR 2024 + + +
+ We address the problem of building digital twins of unknown articulated +objects from two RGBD scans of the object at different articulation states. We +decompose the problem into two stages, each addressing distinct aspects. Our +method first reconstructs object-level shape at each state, then recovers the +underlying articulation model including part segmentation and joint +articulations that associate the two states. By explicitly modeling point-level +correspondences and exploiting cues from images, 3D reconstructions, and +kinematics, our method yields more accurate and stable results compared to +prior work. It also handles more than one movable part and does not rely on any +object shape or structure priors. Project page: +https://github.com/NVlabs/DigitalTwinArt + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Generation and Detection of Sign Language Deepfakes - A Linguistic and + Visual Analysis + + +
+ A question in the realm of deepfakes is slowly emerging pertaining to whether +we can go beyond facial deepfakes and whether it would be beneficial to +society. Therefore, this research presents a positive application of deepfake +technology in upper body generation, while performing sign-language for the +Deaf and Hard of Hearing (DHoH) community. The resulting videos are later +vetted with a sign language expert. This is particularly helpful, given the +intricate nature of sign language, a scarcity of sign language experts, and +potential benefits for health and education. The objectives of this work +encompass constructing a reliable deepfake dataset, evaluating its technical +and visual credibility through computer vision and natural language processing +models, and assessing the plausibility of the generated content. With over 1200 +videos, featuring both previously seen and unseen individuals for the +generation model, using the help of a sign language expert, we establish a +deepfake dataset in sign language that can further be utilized to detect fake +videos that may target certain people of determination. + +
+
+ comment: 13 pages, 13 figures, Computer Vision and Image Understanding Journal +
+
+
+
+
+ + ☆ The Radar Ghost Dataset -- An Evaluation of Ghost Objects in Automotive + Radar Data + + +
+ Radar sensors have a long tradition in advanced driver assistance systems +(ADAS) and also play a major role in current concepts for autonomous vehicles. +Their importance is reasoned by their high robustness against meteorological +effects, such as rain, snow, or fog, and the radar's ability to measure +relative radial velocity differences via the Doppler effect. The cause for +these advantages, namely the large wavelength, is also one of the drawbacks of +radar sensors. Compared to camera or lidar sensor, a lot more surfaces in a +typical traffic scenario appear flat relative to the radar's emitted signal. +This results in multi-path reflections or so called ghost detections in the +radar signal. Ghost objects pose a major source for potential false positive +detections in a vehicle's perception pipeline. Therefore, it is important to be +able to segregate multi-path reflections from direct ones. In this article, we +present a dataset with detailed manual annotations for different kinds of ghost +detections. Moreover, two different approaches for identifying these kinds of +objects are evaluated. We hope that our dataset encourages more researchers to +engage in the fields of multi-path object suppression or exploitation. + +
+
+
+
+
+ + ☆ DPMesh: Exploiting Diffusion Prior for Occluded Human Mesh Recovery CVPR + + +
+ The recovery of occluded human meshes presents challenges for current methods +due to the difficulty in extracting effective image features under severe +occlusion. In this paper, we introduce DPMesh, an innovative framework for +occluded human mesh recovery that capitalizes on the profound diffusion prior +about object structure and spatial relationships embedded in a pre-trained +text-to-image diffusion model. Unlike previous methods reliant on conventional +backbones for vanilla feature extraction, DPMesh seamlessly integrates the +pre-trained denoising U-Net with potent knowledge as its image backbone and +performs a single-step inference to provide occlusion-aware information. To +enhance the perception capability for occluded poses, DPMesh incorporates +well-designed guidance via condition injection, which produces effective +controls from 2D observations for the denoising U-Net. Furthermore, we explore +a dedicated noisy key-point reasoning approach to mitigate disturbances arising +from occlusion and crowded scenarios. This strategy fully unleashes the +perceptual capability of the diffusion prior, thereby enhancing accuracy. +Extensive experiments affirm the efficacy of our framework, as we outperform +state-of-the-art methods on both occlusion-specific and standard datasets. The +persuasive results underscore its ability to achieve precise and robust 3D +human mesh recovery, particularly in challenging scenarios involving occlusion +and crowded scenes. + +
+
+ comment: Accepted by IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR) 2024 +
+
+
+
+
+ + ☆ On the Faithfulness of Vision Transformer Explanations CVPR 2024 + + +
+ To interpret Vision Transformers, post-hoc explanations assign salience +scores to input pixels, providing human-understandable heatmaps. However, +whether these interpretations reflect true rationales behind the model's output +is still underexplored. To address this gap, we study the faithfulness +criterion of explanations: the assigned salience scores should represent the +influence of the corresponding input pixels on the model's predictions. To +evaluate faithfulness, we introduce Salience-guided Faithfulness Coefficient +(SaCo), a novel evaluation metric leveraging essential information of salience +distribution. Specifically, we conduct pair-wise comparisons among distinct +pixel groups and then aggregate the differences in their salience scores, +resulting in a coefficient that indicates the explanation's degree of +faithfulness. Our explorations reveal that current metrics struggle to +differentiate between advanced explanation methods and Random Attribution, +thereby failing to capture the faithfulness property. In contrast, our proposed +SaCo offers a reliable faithfulness measurement, establishing a robust metric +for interpretations. Furthermore, our SaCo demonstrates that the use of +gradient and multi-layer aggregation can markedly enhance the faithfulness of +attention-based explanation, shedding light on potential paths for advancing +Vision Transformer explainability. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ OVFoodSeg: Elevating Open-Vocabulary Food Image Segmentation via + Image-Informed Textual Representation CVPR 2024 + + +
+ In the realm of food computing, segmenting ingredients from images poses +substantial challenges due to the large intra-class variance among the same +ingredients, the emergence of new ingredients, and the high annotation costs +associated with large food segmentation datasets. Existing approaches primarily +utilize a closed-vocabulary and static text embeddings setting. These methods +often fall short in effectively handling the ingredients, particularly new and +diverse ones. In response to these limitations, we introduce OVFoodSeg, a +framework that adopts an open-vocabulary setting and enhances text embeddings +with visual context. By integrating vision-language models (VLMs), our approach +enriches text embeddings with image-specific information through two innovative +modules, namely an image-to-text learner FoodLearner and an Image-Informed Text +Encoder. The training process of OVFoodSeg is divided into two stages: the +pre-training of FoodLearner and the subsequent learning phase for segmentation. +The pre-training phase equips FoodLearner with the capability to align visual +information with corresponding textual representations that are specifically +related to food, while the second phase adapts both the FoodLearner and the +Image-Informed Text Encoder for the segmentation task. By addressing the +deficiencies of previous models, OVFoodSeg demonstrates a significant +improvement, achieving a 4.9\% increase in mean Intersection over Union (mIoU) +on the FoodSeg103 dataset, setting a new milestone for food image segmentation. +
+
+ comment: CVPR 2024; 12 pages +
+
+
+
+
+ + ☆ ContactHandover: Contact-Guided Robot-to-Human Object Handover + + +
+ Robot-to-human object handover is an important step in many human robot +collaboration tasks. A successful handover requires the robot to maintain a +stable grasp on the object while making sure the human receives the object in a +natural and easy-to-use manner. We propose ContactHandover, a robot to human +handover system that consists of two phases: a contact-guided grasping phase +and an object delivery phase. During the grasping phase, ContactHandover +predicts both 6-DoF robot grasp poses and a 3D affordance map of human contact +points on the object. The robot grasp poses are reranked by penalizing those +that block human contact points, and the robot executes the highest ranking +grasp. During the delivery phase, the robot end effector pose is computed by +maximizing human contact points close to the human while minimizing the human +arm joint torques and displacements. We evaluate our system on 27 diverse +household objects and show that our system achieves better visibility and +reachability of human contacts to the receiver compared to several baselines. +More results can be found on +https://clairezixiwang.github.io/ContactHandover.github.io + +
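+ The contact-guided reranking in the grasping phase can be sketched as penalizing grasps whose gripper points fall near the predicted human contact points; the distance threshold and penalty form below are illustrative assumptions.
+```python
+import numpy as np
+
+def rerank_grasps(grasp_scores, gripper_points, contact_points, radius=0.03):
+    """grasp_scores: (G,) initial grasp quality; gripper_points: (G, P, 3) points
+    occupied by the gripper for each grasp; contact_points: (C, 3) predicted
+    human contact points on the object (from the affordance map)."""
+    penalties = []
+    for pts in gripper_points:
+        # Distance from every gripper point to its nearest human contact point.
+        d = np.linalg.norm(pts[:, None, :] - contact_points[None, :, :], axis=-1)
+        penalties.append((d.min(axis=1) < radius).sum())   # contacts blocked by the gripper
+    reranked = grasp_scores - np.array(penalties, dtype=float)
+    return int(np.argmax(reranked))                         # index of the grasp to execute
+```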
+
+ comment: Project website: + https://clairezixiwang.github.io/ContactHandover.github.io/ +
+
+
+
+
+ + ☆ Object-conditioned Bag of Instances for Few-Shot Personalized Instance + Recognition ICASSP 2024 + + +
+ Nowadays, users demand increased personalization of vision systems to +localize and identify personal instances of objects (e.g., my dog rather than +dog) from a few-shot dataset only. Despite outstanding results of deep networks +on classical label-abundant benchmarks (e.g., those of the latest YOLOv8 model +for standard object detection), they struggle to maintain within-class +variability to represent different instances rather than object categories +only. We construct an Object-conditioned Bag of Instances (OBoI) based on +multi-order statistics of extracted features, where generic object detection +models are extended to search and identify personal instances from the OBoI's +metric space, without the need for backpropagation. By relying on multi-order +statistics, OBoI achieves consistently superior accuracy in distinguishing +different instances. In the results, we achieve 77.1% personal object +recognition accuracy in the case of 18 personal instances, showing about 12% +relative gain over the state of the art. +
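+ A minimal sketch of a bag of instances built from multi-order feature statistics, with identification by nearest neighbour in that descriptor space and no backpropagation; the choice of statistics (mean, variance, skewness) is an assumption about what "multi-order" covers.
+```python
+import numpy as np
+
+def multi_order_descriptor(features):
+    """features: (N, D) features extracted from the few shots of one instance."""
+    mu = features.mean(axis=0)
+    var = features.var(axis=0)
+    skew = (((features - mu) / (np.sqrt(var) + 1e-6)) ** 3).mean(axis=0)
+    return np.concatenate([mu, var, skew])          # concatenated multi-order statistics
+
+def identify_instance(query_features, bag):
+    """bag: dict name -> descriptor built with multi_order_descriptor."""
+    q = multi_order_descriptor(query_features)
+    names = list(bag)
+    dists = [np.linalg.norm(q - bag[n]) for n in names]
+    return names[int(np.argmin(dists))]             # nearest personal instance, no backprop
+```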
+
+ comment: ICASSP 2024. Copyright 2024 IEEE. Personal use of this material is + permitted. Permission from IEEE must be obtained for all other uses, in any + current or future media, including reprinting/republishing this material for + advertising or promotional purposes, creating new collective works, for + resale or redistribution to servers or lists, or reuse of any copyrighted + component of this work in other +
+
+
+
+
+ + ☆ NeRF-MAE : Masked AutoEncoders for Self Supervised 3D representation + Learning for Neural Radiance Fields + + +
+ Neural fields excel in computer vision and robotics due to their ability to +understand the 3D visual world such as inferring semantics, geometry, and +dynamics. Given the capabilities of neural fields in densely representing a 3D +scene from 2D images, we ask the question: Can we scale their self-supervised +pretraining, specifically using masked autoencoders, to generate effective 3D +representations from posed RGB images. Owing to the astounding success of +extending transformers to novel data modalities, we employ standard 3D Vision +Transformers to suit the unique formulation of NeRFs. We leverage NeRF's +volumetric grid as a dense input to the transformer, contrasting it with other +3D representations such as pointclouds where the information density can be +uneven, and the representation is irregular. Due to the difficulty of applying +masked autoencoders to an implicit representation, such as NeRF, we opt for +extracting an explicit representation that canonicalizes scenes across domains +by employing the camera trajectory for sampling. Our goal is made possible by +masking random patches from NeRF's radiance and density grid and employing a +standard 3D Swin Transformer to reconstruct the masked patches. In doing so, +the model can learn the semantic and spatial structure of complete scenes. We +pretrain this representation at scale on our proposed curated posed-RGB data, +totaling over 1.6 million images. Once pretrained, the encoder is used for +effective 3D transfer learning. Our novel self-supervised pretraining for +NeRFs, NeRF-MAE, scales remarkably well and improves performance on various +challenging 3D tasks. Utilizing unlabeled posed 2D data for pretraining, +NeRF-MAE significantly outperforms self-supervised 3D pretraining and NeRF +scene understanding baselines on Front3D and ScanNet datasets with an absolute +performance improvement of over 20% AP50 and 8% AP25 for 3D object detection. + +
+
+ comment: 29 pages, 13 figures. Project Page: https://nerf-mae.github.io/ +
+
+
+
+
+ + ☆ Noise2Image: Noise-Enabled Static Scene Recovery for Event Cameras + + +
+ Event cameras capture changes of intensity over time as a stream of 'events' +and generally cannot measure intensity itself; hence, they are only used for +imaging dynamic scenes. However, fluctuations due to random photon arrival +inevitably trigger noise events, even for static scenes. While previous efforts +have been focused on filtering out these undesirable noise events to improve +signal quality, we find that, in the photon-noise regime, these noise events +are correlated with the static scene intensity. We analyze the noise event +generation and model its relationship to illuminance. Based on this +understanding, we propose a method, called Noise2Image, to leverage the +illuminance-dependent noise characteristics to recover the static parts of a +scene, which are otherwise invisible to event cameras. We experimentally +collect a dataset of noise events on static scenes to train and validate +Noise2Image. Our results show that Noise2Image can robustly recover intensity +images solely from noise events, providing a novel approach for capturing +static scenes in event cameras, without additional hardware. + +
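The underlying intuition can be sketched very compactly, under the assumption that in the photon-noise regime the per-pixel noise-event rate grows with illuminance; the accumulation step and the power-law mapping below are placeholders for the learned mapping in the paper, not its actual model.

```python
import numpy as np

def event_count_image(events, height, width, t0, t1):
    """events: (N, 4) array of (x, y, t, polarity). Returns per-pixel event counts in [t0, t1)."""
    mask = (events[:, 2] >= t0) & (events[:, 2] < t1)
    img = np.zeros((height, width), dtype=np.float64)
    np.add.at(img, (events[mask, 1].astype(int), events[mask, 0].astype(int)), 1.0)
    return img

def proxy_intensity(counts, gamma=0.5):
    """Map counts to [0, 1] with an assumed monotonic (here power-law) response."""
    c = counts / (counts.max() + 1e-8)
    return c ** gamma
```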
+
+
+
+
+ + ☆ CausalChaos! Dataset for Comprehensive Causal Action Question Answering + Over Longer Causal Chains Grounded in Dynamic Visual Scenes + + +
+ Causal video question answering (QA) has garnered increasing interest, yet
+existing datasets often lack depth in causal reasoning analysis. To address
+this gap, we capitalize on the unique properties of cartoons and construct
+CausalChaos!, a novel, challenging causal Why-QA dataset built upon the iconic
+"Tom and Jerry" cartoon series. With thoughtful questions and multi-level
+answers, our dataset contains much longer causal chains embedded in dynamic
+interactions and visuals, while at the same time the principles of animation
+allow animators to create well-defined, unambiguous causal relationships.
+These factors allow models to solve more challenging, yet well-defined, causal
+relationships. We also introduce hard negative mining, including a
+CausalConfusion version. While models perform well, there is much room for
+improvement, especially on open-ended answers. We identify more
+advanced/explicit causal relationship modeling and joint modeling of vision
+and language as the immediate areas for future efforts to focus upon. Along
+with other complementary datasets, our new challenging dataset will pave the
+way for these developments in the field. We will release our dataset, code,
+and models to help future efforts in this domain.
+
+
+
+
+ + ☆ Bigger is not Always Better: Scaling Properties of Latent Diffusion + Models + + +
+ We study the scaling properties of latent diffusion models (LDMs), with an
+emphasis on their sampling efficiency. While improved network architectures
+and inference algorithms have been shown to effectively boost the sampling
+efficiency of diffusion models, the role of model size -- a critical
+determinant of sampling efficiency -- has not been thoroughly examined.
+Through empirical analysis of established text-to-image diffusion models, we
+conduct an in-depth investigation into how model size influences sampling
+efficiency across varying sampling steps. Our findings unveil a surprising
+trend: when operating under a given inference budget, smaller models
+frequently outperform their larger equivalents in generating high-quality
+results. Moreover, we extend our study to demonstrate the generalizability of
+these findings by applying various diffusion samplers, exploring diverse
+downstream tasks, evaluating post-distilled models, and comparing performance
+relative to training compute. These findings open up new pathways for the
+development of LDM scaling strategies that can be employed to enhance
+generative capabilities within limited inference budgets.
+
+
+
+
+ + ☆ Streaming Dense Video Captioning CVPR 2024 + + +
+ An ideal model for dense video captioning -- predicting captions localized +temporally in a video -- should be able to handle long input videos, predict +rich, detailed textual descriptions, and be able to produce outputs before +processing the entire video. Current state-of-the-art models, however, process +a fixed number of downsampled frames, and make a single full prediction after +seeing the whole video. We propose a streaming dense video captioning model +that consists of two novel components: First, we propose a new memory module, +based on clustering incoming tokens, which can handle arbitrarily long videos +as the memory is of a fixed size. Second, we develop a streaming decoding +algorithm that enables our model to make predictions before the entire video +has been processed. Our model achieves this streaming ability, and +significantly improves the state-of-the-art on three dense video captioning +benchmarks: ActivityNet, YouCook2 and ViTT. Our code is released at +https://github.com/google-research/scenic. + +
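A fixed-size, clustering-style token memory of the kind the abstract describes can be sketched as follows; the nearest-slot assignment and running-mean update are assumed simplifications standing in for the paper's actual memory module, and all names and shapes here are illustrative.

```python
import numpy as np

class ClusterMemory:
    """Fixed-size memory: incoming frame tokens are absorbed into their nearest
    cluster slot, so the memory footprint is constant regardless of video length."""
    def __init__(self, num_slots, dim, seed=0):
        rng = np.random.default_rng(seed)
        self.slots = rng.normal(size=(num_slots, dim))   # cluster centers
        self.counts = np.ones(num_slots)                 # tokens absorbed per slot

    def update(self, tokens):
        """tokens: (N, dim) features from the current frame."""
        for t in tokens:
            k = np.argmin(np.linalg.norm(self.slots - t, axis=1))
            self.counts[k] += 1
            self.slots[k] += (t - self.slots[k]) / self.counts[k]  # running mean

    def read(self):
        """Fixed-size summary handed to the captioning decoder."""
        return self.slots
```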
+
+ comment: CVPR 2024. Code is available at + https://github.com/google-research/scenic/tree/main/scenic/projects/streaming_dvc +
+
+
+
+
+ + ☆ MagicMirror: Fast and High-Quality Avatar Generation with a Constrained + Search Space + + +
+ We introduce a novel framework for 3D human avatar generation and +personalization, leveraging text prompts to enhance user engagement and +customization. Central to our approach are key innovations aimed at overcoming +the challenges in photo-realistic avatar synthesis. Firstly, we utilize a +conditional Neural Radiance Fields (NeRF) model, trained on a large-scale +unannotated multi-view dataset, to create a versatile initial solution space +that accelerates and diversifies avatar generation. Secondly, we develop a +geometric prior, leveraging the capabilities of Text-to-Image Diffusion Models, +to ensure superior view invariance and enable direct optimization of avatar +geometry. These foundational ideas are complemented by our optimization +pipeline built on Variational Score Distillation (VSD), which mitigates texture +loss and over-saturation issues. As supported by our extensive experiments, +these strategies collectively enable the creation of custom avatars with +unparalleled visual quality and better adherence to input text prompts. You can +find more results and videos in our website: +https://syntec-research.github.io/MagicMirror + +
+
+
+
+
+ + ☆ CosmicMan: A Text-to-Image Foundation Model for Humans CVPR 2024 + + +
+ We present CosmicMan, a text-to-image foundation model specialized for
+generating high-fidelity human images. Unlike current general-purpose
+foundation models that are stuck in the dilemma of inferior quality and
+text-image misalignment for humans, CosmicMan enables generating
+photo-realistic human images with meticulous appearance, reasonable structure,
+and precise text-image alignment with detailed dense descriptions. At the
+heart of CosmicMan's success are new reflections and perspectives on data and
+models: (1) We found that data quality and a scalable data production flow are
+essential for the final results from trained models. Hence, we propose a new
+data production paradigm, Annotate Anyone, which serves as a perpetual data
+flywheel to produce high-quality data with accurate yet cost-effective
+annotations over time. Based on this, we constructed a large-scale dataset,
+CosmicMan-HQ 1.0, with 6 million high-quality real-world human images at a
+mean resolution of 1488x1255, attached with precise text annotations derived
+from 115 million attributes of diverse granularities. (2) We argue that a
+text-to-image foundation model specialized for humans must be pragmatic --
+easy to integrate into downstream tasks while effective in producing
+high-quality human images. Hence, we propose to model the relationship between
+dense text descriptions and image pixels in a decomposed manner, and present
+the Decomposed-Attention-Refocusing (Daring) training framework. It seamlessly
+decomposes the cross-attention features in existing text-to-image diffusion
+models and enforces attention refocusing without adding extra modules. Through
+Daring, we show that explicitly discretizing the continuous text space into
+several basic groups that align with human body structure is the key to
+tackling the misalignment problem with ease.
+
+ comment: Accepted by CVPR 2024. The supplementary material is included. + Project Page: https://cosmicman-cvpr2024.github.io +
+
+
+
+
+ + ☆ Measuring Style Similarity in Diffusion Models + + +
+ Generative models are now widely used by graphic designers and artists. Prior +works have shown that these models remember and often replicate content from +their training data during generation. Hence as their proliferation increases, +it has become important to perform a database search to determine whether the +properties of the image are attributable to specific training data, every time +before a generated image is used for professional purposes. Existing tools for +this purpose focus on retrieving images of similar semantic content. Meanwhile, +many artists are concerned with style replication in text-to-image models. We +present a framework for understanding and extracting style descriptors from +images. Our framework comprises a new dataset curated using the insight that +style is a subjective property of an image that captures complex yet meaningful +interactions of factors including but not limited to colors, textures, shapes, +etc. We also propose a method to extract style descriptors that can be used to +attribute style of a generated image to the images used in the training dataset +of a text-to-image model. We showcase promising results in various style +retrieval tasks. We also quantitatively and qualitatively analyze style +attribution and matching in the Stable Diffusion model. Code and artifacts are +available at https://github.com/learn2phoenix/CSD. + +
+
+
+
+
+ + ☆ Evaluating Text-to-Visual Generation with Image-to-Text Generation + + +
+ Despite significant progress in generative AI, comprehensive evaluation +remains challenging because of the lack of effective metrics and standardized +benchmarks. For instance, the widely-used CLIPScore measures the alignment +between a (generated) image and text prompt, but it fails to produce reliable +scores for complex prompts involving compositions of objects, attributes, and +relations. One reason is that text encoders of CLIP can notoriously act as a +"bag of words", conflating prompts such as "the horse is eating the grass" with +"the grass is eating the horse". To address this, we introduce the VQAScore, +which uses a visual-question-answering (VQA) model to produce an alignment +score by computing the probability of a "Yes" answer to a simple "Does this +figure show '{text}'?" question. Though simpler than prior art, VQAScore +computed with off-the-shelf models produces state-of-the-art results across +many (8) image-text alignment benchmarks. We also compute VQAScore with an +in-house model that follows best practices in the literature. For example, we +use a bidirectional image-question encoder that allows image embeddings to +depend on the question being asked (and vice versa). Our in-house model, +CLIP-FlanT5, outperforms even the strongest baselines that make use of the +proprietary GPT-4V. Interestingly, although we train with only images, VQAScore +can also align text with video and 3D models. VQAScore allows researchers to +benchmark text-to-visual generation using complex texts that capture the +compositional structure of real-world prompts. We introduce GenAI-Bench, a more +challenging benchmark with 1,600 compositional text prompts that require +parsing scenes, objects, attributes, relationships, and high-order reasoning +like comparison and logic. GenAI-Bench also offers over 15,000 human ratings +for leading image and video generation models such as Stable Diffusion, DALL-E +3, and Gen2. + +
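The scoring rule itself is simple enough to sketch. In the snippet below, `yes_no_logits` is a hypothetical callable standing in for any VQA model that returns unnormalized scores for the answers "Yes" and "No"; only the question template comes from the abstract, everything else is an assumption.

```python
import math

def vqa_score(image, text, yes_no_logits):
    """Alignment score = P("Yes") for the templated question, via softmax over {Yes, No}."""
    question = f"Does this figure show '{text}'?"
    yes, no = yes_no_logits(image, question)
    return math.exp(yes) / (math.exp(yes) + math.exp(no))

# Dummy scorer standing in for a real VQA model's answer logits.
print(vqa_score(None, "the horse is eating the grass", lambda img, q: (2.0, -1.0)))  # ~0.95
```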
+
+ comment: We open-source our data, model, and code at: + https://github.com/linzhiqiu/t2v_metrics ; Project page: + https://linzhiqiu.github.io/papers/vqascore +
+
+
+
+
+ + ☆ Large Motion Model for Unified Multi-Modal Motion Generation + + +
+ Human motion generation, a cornerstone technique in animation and video
+production, has widespread applications in various tasks like text-to-motion
+and music-to-dance. Previous works focus on developing specialist models
+tailored for each task without scalability. In this work, we present the Large
+Motion Model (LMM), a motion-centric, multi-modal framework that unifies
+mainstream motion generation tasks into a generalist model. A unified motion
+model is appealing since it can leverage a wide range of motion data to
+achieve broad generalization beyond a single task. However, it is also
+challenging due to the heterogeneous nature of substantially different motion
+data and tasks. LMM tackles these challenges from three principled aspects: 1)
+Data: We consolidate datasets with different modalities, formats and tasks
+into a comprehensive yet unified motion generation dataset, MotionVerse,
+comprising 10 tasks, 16 datasets, a total of 320k sequences, and 100 million
+frames. 2) Architecture: We design an articulated attention mechanism,
+ArtAttention, that incorporates body part-aware modeling into a Diffusion
+Transformer backbone. 3) Pre-Training: We propose a novel pre-training
+strategy for LMM, which employs variable frame rates and masking forms, to
+better exploit knowledge from diverse training data. Extensive experiments
+demonstrate that our generalist LMM achieves performance competitive with
+state-of-the-art specialist models across various standard motion generation
+tasks. Notably, LMM exhibits strong generalization capabilities and emergent
+properties across many unseen tasks. Additionally, our ablation studies reveal
+valuable insights about training and scaling up large motion models for future
+research.
+
+ comment: Homepage: https://mingyuan-zhang.github.io/projects/LMM.html +
+
+
+
+
+ + ☆ LoSA: Long-Short-range Adapter for Scaling End-to-End Temporal Action + Localization + + +
+ Temporal Action Localization (TAL) involves localizing and classifying action +snippets in an untrimmed video. The emergence of large video foundation models +has led RGB-only video backbones to outperform previous methods needing both +RGB and optical flow modalities. Leveraging these large models is often limited +to training only the TAL head due to the prohibitively large GPU memory +required to adapt the video backbone for TAL. To overcome this limitation, we +introduce LoSA, the first memory-and-parameter-efficient backbone adapter +designed specifically for TAL to handle untrimmed videos. LoSA specializes for +TAL by introducing Long-Short-range Adapters that adapt the intermediate layers +of the video backbone over different temporal ranges. These adapters run +parallel to the video backbone to significantly reduce memory footprint. LoSA +also includes Long-Short-range Fusion that strategically combines the output of +these adapters from the video backbone layers to enhance the video features +provided to the TAL head. Experiments show that LoSA significantly outperforms +all existing methods on standard TAL benchmarks, THUMOS-14 and +ActivityNet-v1.3, by scaling end-to-end backbone adaptation to +billion-parameter-plus models like VideoMAEv2~(ViT-g) and leveraging them +beyond head-only transfer learning. + +
+
+
+
+
+ + ☆ BiPer: Binary Neural Networks using a Periodic Function + + +
+ Quantized neural networks employ reduced-precision representations for both
+weights and activations. This quantization process significantly reduces the
+memory requirements and computational complexity of the network. Binary Neural
+Networks (BNNs) are the extreme quantization case, representing values with
+just one bit. Since the sign function is typically used to map real values to
+binary values, smooth approximations are introduced to mimic the gradients
+during error backpropagation. Thus, the mismatch between the forward and
+backward models corrupts the direction of the gradient, causing training
+inconsistency problems and performance degradation. In contrast to current BNN
+approaches, we propose to employ a binary periodic (BiPer) function during
+binarization. Specifically, we use a square wave for the forward pass to
+obtain the binary values and employ the trigonometric sine function, with the
+same period as the square wave, as a differentiable surrogate during the
+backward pass. We demonstrate that this approach can control the quantization
+error via the frequency of the periodic function and improves network
+performance. Extensive experiments validate the effectiveness of BiPer on
+benchmark datasets and network architectures, with improvements of up to 1%
+and 0.69% over state-of-the-art methods on the CIFAR-10 and ImageNet
+classification tasks, respectively. Our code is publicly available at
+https://github.com/edmav4/BiPer.
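The stated forward/backward split can be sketched in a few lines of PyTorch; this is an interpretation of the abstract rather than the released code, assuming a square wave sign(sin(omega*x)) in the forward pass and the derivative of sin(omega*x) as the surrogate gradient in the backward pass.

```python
import math
import torch

class BiPerBinarize(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, omega):
        ctx.save_for_backward(x)
        ctx.omega = omega
        s = torch.sin(omega * x)
        return torch.where(s >= 0, torch.ones_like(s), -torch.ones_like(s))  # square wave in {-1, +1}

    @staticmethod
    def backward(ctx, grad_out):
        (x,) = ctx.saved_tensors
        omega = ctx.omega
        # d/dx sin(omega * x) = omega * cos(omega * x), used as a smooth surrogate.
        return grad_out * omega * torch.cos(omega * x), None

w = torch.randn(8, requires_grad=True)
BiPerBinarize.apply(w, math.pi).sum().backward()
print(w.grad)  # surrogate gradients flow despite the hard binarization
```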
+
+
+
+
+ + ☆ Language Guided Domain Generalized Medical Image Segmentation + + +
+ Single source domain generalization (SDG) holds promise for more reliable and
+consistent image segmentation across real-world clinical settings,
+particularly in the medical domain, where data privacy and acquisition cost
+constraints often limit the availability of diverse datasets. Depending solely
+on visual features hampers the model's capacity to adapt effectively to
+various domains, primarily because of the presence of spurious correlations
+and domain-specific characteristics embedded within the image features.
+Incorporating text features alongside visual features is a potential solution
+to enhance the model's understanding of the data, as it goes beyond
+pixel-level information to provide valuable context. Textual cues describing
+the anatomical structures, their appearances, and variations across various
+imaging modalities can guide the model in domain adaptation, ultimately
+contributing to more robust and consistent segmentation. In this paper, we
+propose an approach that explicitly leverages textual information by
+incorporating a contrastive learning mechanism, guided by the text encoder
+features, to learn a more robust feature representation. We assess the
+effectiveness of our text-guided contrastive feature alignment technique in
+various scenarios, including cross-modality, cross-sequence, and cross-site
+settings for different segmentation tasks. Our approach achieves favorable
+performance compared to existing methods in the literature. Our code and model
+weights are available at https://github.com/ShahinaKK/LG_SDG.git.
+
+ comment: Accepted at ISBI2024 +
+
+
+
+
+ + ☆ What is Point Supervision Worth in Video Instance Segmentation? + + +
+ Video instance segmentation (VIS) is a challenging vision task that aims to +detect, segment, and track objects in videos. Conventional VIS methods rely on +densely-annotated object masks which are expensive. We reduce the human +annotations to only one point for each object in a video frame during training, +and obtain high-quality mask predictions close to fully supervised models. Our +proposed training method consists of a class-agnostic proposal generation +module to provide rich negative samples and a spatio-temporal point-based +matcher to match the object queries with the provided point annotations. +Comprehensive experiments on three VIS benchmarks demonstrate competitive +performance of the proposed framework, nearly matching fully supervised +methods. + +
+
+
+
+
+ + ☆ Bridging Remote Sensors with Multisensor Geospatial Foundation Models CVPR + + +
+ In the realm of geospatial analysis, the diversity of remote sensors, +encompassing both optical and microwave technologies, offers a wealth of +distinct observational capabilities. Recognizing this, we present msGFM, a +multisensor geospatial foundation model that effectively unifies data from four +key sensor modalities. This integration spans an expansive dataset of two +million multisensor images. msGFM is uniquely adept at handling both paired and +unpaired sensor data. For data originating from identical geolocations, our +model employs an innovative cross-sensor pretraining approach in masked image +modeling, enabling the synthesis of joint representations from diverse sensors. +msGFM, incorporating four remote sensors, upholds strong performance, forming a +comprehensive model adaptable to various sensor types. msGFM has demonstrated +enhanced proficiency in a range of both single-sensor and multisensor +downstream tasks. These include scene classification, segmentation, cloud +removal, and pan-sharpening. A key discovery of our research is that +representations derived from natural images are not always compatible with the +distinct characteristics of geospatial remote sensors, underscoring the +limitations of existing representations in this field. Our work can serve as a +guide for developing multisensor geospatial pretraining models, paving the way +for more advanced geospatial capabilities. + +
+
+ comment: Accepted to CVPR +
+
+
+
+
+ + ☆ FireANTs: Adaptive Riemannian Optimization for Multi-Scale Diffeomorphic + Registration + + +
+ Diffeomorphic Image Registration is a critical part of the analysis in +various imaging modalities and downstream tasks like image translation, +segmentation, and atlas building. Registration algorithms based on optimization +have stood the test of time in terms of accuracy, reliability, and robustness +across a wide spectrum of modalities and acquisition settings. However, these +algorithms converge slowly, are prohibitively expensive to run, and their usage +requires a steep learning curve, limiting their scalability to larger clinical +and scientific studies. In this paper, we develop multi-scale Adaptive +Riemannian Optimization algorithms for diffeomorphic image registration. We +demonstrate compelling improvements on image registration across a spectrum of +modalities and anatomies by measuring structural and landmark overlap of the +registered image volumes. Our proposed framework leads to a consistent +improvement in performance, and from 300x up to 2000x speedup over existing +algorithms. Our modular library design makes it easy to use and allows +customization via user-defined cost functions. + +
+
+
+
+
+ + ☆ Scalable Scene Modeling from Perspective Imaging: Physics-based + Appearance and Geometry Inference + + +
+ 3D scene modeling techniques serve as the bedrock of geospatial engineering
+and computer science, driving many applications ranging from automated
+driving, terrain mapping, and navigation to virtual, augmented, mixed, and
+extended reality (for the gaming and movie industries, among others). This
+dissertation presents a set of contributions that advance 3D scene modeling
+toward the state of the art, in the aspects of both appearance and geometry
+modeling. In contrast to the prevailing deep learning methods, as a core
+contribution, this thesis aims to develop algorithms that follow first
+principles, where sophisticated physics-based models are introduced alongside
+simpler learning and inference tasks. The outcomes of these algorithms are
+processes that can consume much larger volumes of data for highly accurate
+reconstruction of 3D scenes at scale without losing methodological generality,
+which is not possible with contemporary complex-model-based deep learning
+methods. Specifically, the dissertation introduces three novel methodologies
+that address the challenges of inferring appearance and geometry through
+physics-based modeling.
+ Overall, the research encapsulated in this dissertation marks a series of
+methodological advances in the processing of complex datasets. By navigating
+the confluence of deep learning, computational geometry, and photogrammetry,
+this work lays down a robust framework for future exploration and practical
+application in the rapidly evolving field of 3D scene reconstruction. The
+outcomes of these studies are evidenced through rigorous experiments and
+comparisons with existing state-of-the-art methods, demonstrating the efficacy
+and scalability of the proposed approaches.
+
+ comment: Ph.D. Dissertation, Geospatial Data Analytics Lab, The Ohio State + University, 2024. arXiv admin note: text overlap with arXiv:2108.08378 +
+
+
+
+
+ + ☆ An image speaks a thousand words, but can everyone listen? On + translating images for cultural relevance + + +
+ Given the rise of multimedia content, human translators increasingly focus on
+culturally adapting not only words but also other modalities such as images to
+convey the same meaning. While several applications stand to benefit from
+this, machine translation systems remain confined to dealing with language in
+speech and text. In this work, we take a first step towards translating images
+to make them culturally relevant. First, we build three pipelines comprising
+state-of-the-art generative models to do the task. Next, we build a two-part
+evaluation dataset: i) concept: comprising 600 images that are cross-culturally
+coherent, focusing on a single concept per image, and ii) application:
+comprising 100 images curated from real-world applications. We conduct a
+multi-faceted human evaluation of translated images to assess cultural
+relevance and meaning preservation. We find that, as of today, image-editing
+models fail at this task but can be improved by leveraging LLMs and retrievers
+in the loop. The best pipelines can only translate 5% of images for some
+countries in the easier concept dataset, and no translation is successful for
+some countries in the application dataset, highlighting the challenging nature
+of the task. Our code and data are released here:
+https://github.com/simran-khanuja/image-transcreation.
+
+
+
+
+ + ☆ A Unified and Interpretable Emotion Representation and Expression + Generation CVPR 2024 + + +
+ Canonical emotions, such as happy, sad, and fearful, are easy to understand
+and annotate. However, emotions are often compound, e.g., happily surprised,
+and can be mapped to the action units (AUs) used for expressing emotions, and
+trivially to the canonical ones. Intuitively, emotions are continuous, as
+represented by the arousal-valence (AV) model. An interpretable unification of
+these four modalities - namely, Canonical, Compound, AUs, and AV - is highly
+desirable for a better representation and understanding of emotions. However,
+such a unification remains absent from the current literature. In this work,
+we propose an interpretable and unified emotion model, referred to as C2A2. We
+also develop a method that leverages labels of the non-unified models to
+annotate the novel unified one. Finally, we modify text-conditional diffusion
+models to understand continuous numbers, which are then used to generate
+continuous expressions using our unified emotion model. Through quantitative
+and qualitative experiments, we show that our generated images are rich and
+capture subtle expressions. Our work allows fine-grained generation of
+expressions in conjunction with other textual inputs and offers a new label
+space for emotions at the same time.
+
+ comment: 10 pages, 9 figures, 3 tables Accepted at CVPR 2024. Project page: + https://emotion-diffusion.github.io +
+
+
+
+
+ + ☆ AURORA: Navigating UI Tarpits via Automated Neural Screen Understanding + + +
+ Nearly a decade of research in software engineering has focused on automating +mobile app testing to help engineers in overcoming the unique challenges +associated with the software platform. Much of this work has come in the form +of Automated Input Generation tools (AIG tools) that dynamically explore app +screens. However, such tools have repeatedly been demonstrated to achieve +lower-than-expected code coverage - particularly on sophisticated proprietary +apps. Prior work has illustrated that a primary cause of these coverage +deficiencies is related to so-called tarpits, or complex screens that are +difficult to navigate. + In this paper, we take a critical step toward enabling AIG tools to +effectively navigate tarpits during app exploration through a new form of +automated semantic screen understanding. We introduce AURORA, a technique that +learns from the visual and textual patterns that exist in mobile app UIs to +automatically detect common screen designs and navigate them accordingly. The +key idea of AURORA is that there are a finite number of mobile app screen +designs, albeit with subtle variations, such that the general patterns of +different categories of UI designs can be learned. As such, AURORA employs a +multi-modal, neural screen classifier that is able to recognize the most common +types of UI screen designs. After recognizing a given screen, it then applies a +set of flexible and generalizable heuristics to properly navigate the screen. +We evaluated AURORA both on a set of 12 apps with known tarpits from prior +work, and on a new set of five of the most popular apps from the Google Play +store. Our results indicate that AURORA is able to effectively navigate tarpit +screens, outperforming prior approaches that avoid tarpits by 19.6% in terms of +method coverage. The improvements can be attributed to AURORA's UI design +classification and heuristic navigation techniques. + +
+
+ comment: Published at 17th IEEE International Conference on Software Testing, + Verification and Validation (ICST) 2024, 12 pages +
+
+
+
+
+ + ☆ Feature Splatting: Language-Driven Physics-Based Scene Synthesis and + Editing + + +
+ Scene representations using 3D Gaussian primitives have produced excellent +results in modeling the appearance of static and dynamic 3D scenes. Many +graphics applications, however, demand the ability to manipulate both the +appearance and the physical properties of objects. We introduce Feature +Splatting, an approach that unifies physics-based dynamic scene synthesis with +rich semantics from vision language foundation models that are grounded by +natural language. Our first contribution is a way to distill high-quality, +object-centric vision-language features into 3D Gaussians, that enables +semi-automatic scene decomposition using text queries. Our second contribution +is a way to synthesize physics-based dynamics from an otherwise static scene +using a particle-based simulator, in which material properties are assigned +automatically via text queries. We ablate key techniques used in this pipeline, +to illustrate the challenge and opportunities in using feature-carrying 3D +Gaussians as a unified format for appearance, geometry, material properties and +semantics grounded on natural language. Project website: +https://feature-splatting.github.io/ + +
+
+ comment: Project website: https://feature-splatting.github.io/ +
+
+
+
+
+ + ☆ Entity-Centric Reinforcement Learning for Object Manipulation from + Pixels ICLR 2024 + + +
+ Manipulating objects is a hallmark of human intelligence, and an important +task in domains such as robotics. In principle, Reinforcement Learning (RL) +offers a general approach to learn object manipulation. In practice, however, +domains with more than a few objects are difficult for RL agents due to the +curse of dimensionality, especially when learning from raw image observations. +In this work we propose a structured approach for visual RL that is suitable +for representing multiple objects and their interaction, and use it to learn +goal-conditioned manipulation of several objects. Key to our method is the +ability to handle goals with dependencies between the objects (e.g., moving +objects in a certain order). We further relate our architecture to the +generalization capability of the trained agent, based on a theoretical result +for compositional generalization, and demonstrate agents that learn with 3 +objects but generalize to similar tasks with over 10 objects. Videos and code +are available on the project website: +https://sites.google.com/view/entity-centric-rl + +
+
+ comment: ICLR 2024 Spotlight. Videos and code are available on the project + website: https://sites.google.com/view/entity-centric-rl +
+
+
+
+
+ + ☆ Vision-language models for decoding provider attention during neonatal + resuscitation + + +
+ Neonatal resuscitations demand an exceptional level of attentiveness from
+providers, who must process multiple streams of information simultaneously.
+Gaze strongly influences decision making; thus, understanding where a provider
+is looking during neonatal resuscitations could inform provider training,
+enhance real-time decision support, and improve the design of delivery rooms
+and neonatal intensive care units (NICUs). Current approaches to quantifying
+neonatal providers' gaze rely on manual coding or simulations, which limit
+scalability and utility. Here, we introduce an automated, real-time, deep
+learning approach capable of decoding provider gaze into semantic classes
+directly from first-person point-of-view videos recorded during live
+resuscitations. Combining state-of-the-art, real-time segmentation with
+vision-language models (CLIP), our low-shot pipeline attains 91%
+classification accuracy in identifying gaze targets without training. Upon
+fine-tuning, the performance of our gaze-guided vision transformer exceeds 98%
+accuracy in gaze classification, approaching human-level precision. This
+system, capable of real-time inference, enables objective quantification of
+provider attention dynamics during live neonatal resuscitation. Our approach
+offers a scalable solution that seamlessly integrates with existing
+infrastructure for data-scarce gaze analysis, thereby offering new
+opportunities for understanding and refining clinical decision making.
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ☆ Video Interpolation with Diffusion Models CVPR 2024 + + +
+ We present VIDIM, a generative model for video interpolation, which creates +short videos given a start and end frame. In order to achieve high fidelity and +generate motions unseen in the input data, VIDIM uses cascaded diffusion models +to first generate the target video at low resolution, and then generate the +high-resolution video conditioned on the low-resolution generated video. We +compare VIDIM to previous state-of-the-art methods on video interpolation, and +demonstrate how such works fail in most settings where the underlying motion is +complex, nonlinear, or ambiguous while VIDIM can easily handle such cases. We +additionally demonstrate how classifier-free guidance on the start and end +frame and conditioning the super-resolution model on the original +high-resolution frames without additional parameters unlocks high-fidelity +results. VIDIM is fast to sample from as it jointly denoises all the frames to +be generated, requires less than a billion parameters per diffusion model to +produce compelling results, and still enjoys scalability and improved quality +at larger parameter counts. + +
+
+ comment: CVPR 2024, Project page at https://vidim-interpolation.github.io/ +
+
+
+
+
+ + ☆ Getting it Right: Improving Spatial Consistency in Text-to-Image Models + + +
+ One of the key shortcomings in current text-to-image (T2I) models is their +inability to consistently generate images which faithfully follow the spatial +relationships specified in the text prompt. In this paper, we offer a +comprehensive investigation of this limitation, while also developing datasets +and methods that achieve state-of-the-art performance. First, we find that +current vision-language datasets do not represent spatial relationships well +enough; to alleviate this bottleneck, we create SPRIGHT, the first +spatially-focused, large scale dataset, by re-captioning 6 million images from +4 widely used vision datasets. Through a 3-fold evaluation and analysis +pipeline, we find that SPRIGHT largely improves upon existing datasets in +capturing spatial relationships. To demonstrate its efficacy, we leverage only +~0.25% of SPRIGHT and achieve a 22% improvement in generating spatially +accurate images while also improving the FID and CMMD scores. Secondly, we find +that training on images containing a large number of objects results in +substantial improvements in spatial consistency. Notably, we attain +state-of-the-art on T2I-CompBench with a spatial score of 0.2133, by +fine-tuning on <500 images. Finally, through a set of controlled experiments +and ablations, we document multiple findings that we believe will enhance the +understanding of factors that affect spatial consistency in text-to-image +models. We publicly release our dataset and model to foster further research in +this area. + +
+
+ comment: project webpage : https://spright-t2i.github.io/ +
+
+
+
+
+ + ☆ Adaptive Query Prompting for Multi-Domain Landmark Detection + + +
+ Medical landmark detection is crucial in various medical imaging modalities
+and procedures. Although deep learning-based methods have achieved promising
+performance, they are mostly designed for specific anatomical regions or
+tasks. In this work, we propose a universal model for multi-domain landmark
+detection by leveraging a transformer architecture and developing a prompting
+component named Adaptive Query Prompting (AQP). Instead of embedding
+additional modules in the backbone network, we design a separate module to
+generate prompts that can be effectively extended to any other transformer
+network. In our proposed AQP, prompts are learnable parameters maintained in a
+memory space called the prompt pool. The central idea is to keep the backbone
+frozen and then optimize prompts to instruct the model inference process.
+Furthermore, we employ a lightweight decoder, named Light-MLD, to decode
+landmarks from the extracted features. Thanks to the lightweight nature of the
+decoder and AQP, we can handle multiple datasets by sharing the backbone
+encoder and then only performing partial parameter tuning without incurring
+much additional cost. It has the potential to be extended to more landmark
+detection tasks. We conduct experiments on three widely used X-ray datasets
+for different medical landmark detection tasks. Our proposed Light-MLD coupled
+with AQP achieves SOTA performance on many metrics, even without the use of
+elaborate structural designs or complex frameworks.
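The prompt-pool idea (frozen backbone, only learnable prompt tokens tuned) can be sketched as follows. The pool size, the key-based prompt selection, and the toy transformer layer are all assumptions made for illustration, not the paper's exact design.

```python
import torch
import torch.nn as nn

class PromptPool(nn.Module):
    def __init__(self, pool_size, prompt_len, dim):
        super().__init__()
        self.pool = nn.Parameter(torch.randn(pool_size, prompt_len, dim) * 0.02)
        self.keys = nn.Parameter(torch.randn(pool_size, dim) * 0.02)

    def forward(self, tokens):
        """tokens: (B, N, dim) patch tokens; prepend the best-matching prompt per sample."""
        query = tokens.mean(dim=1)                   # (B, dim) summary of the input
        idx = (query @ self.keys.T).argmax(dim=1)    # choose one prompt per sample
        prompts = self.pool[idx]                     # (B, prompt_len, dim)
        return torch.cat([prompts, tokens], dim=1)

backbone = nn.TransformerEncoderLayer(d_model=64, nhead=4, batch_first=True)
for p in backbone.parameters():
    p.requires_grad_(False)                          # backbone frozen; only prompts learn
aqp = PromptPool(pool_size=10, prompt_len=4, dim=64)
out = backbone(aqp(torch.randn(2, 16, 64)))
print(out.shape)  # torch.Size([2, 20, 64])
```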
+
+
+
+
+ + ☆ iMD4GC: Incomplete Multimodal Data Integration to Advance Precise + Treatment Response Prediction and Survival Analysis for Gastric Cancer + + +
+ Gastric cancer (GC) is a prevalent malignancy worldwide, ranking as the fifth +most common cancer with over 1 million new cases and 700 thousand deaths in +2020. Locally advanced gastric cancer (LAGC) accounts for approximately +two-thirds of GC diagnoses, and neoadjuvant chemotherapy (NACT) has emerged as +the standard treatment for LAGC. However, the effectiveness of NACT varies +significantly among patients, with a considerable subset displaying treatment +resistance. Ineffective NACT not only leads to adverse effects but also misses +the optimal therapeutic window, resulting in lower survival rate. However, +existing multimodal learning methods assume the availability of all modalities +for each patient, which does not align with the reality of clinical practice. +The limited availability of modalities for each patient would cause information +loss, adversely affecting predictive accuracy. In this study, we propose an +incomplete multimodal data integration framework for GC (iMD4GC) to address the +challenges posed by incomplete multimodal data, enabling precise response +prediction and survival analysis. Specifically, iMD4GC incorporates unimodal +attention layers for each modality to capture intra-modal information. +Subsequently, the cross-modal interaction layers explore potential inter-modal +interactions and capture complementary information across modalities, thereby +enabling information compensation for missing modalities. To evaluate iMD4GC, +we collected three multimodal datasets for GC study: GastricRes (698 cases) for +response prediction, GastricSur (801 cases) for survival analysis, and +TCGA-STAD (400 cases) for survival analysis. The scale of our datasets is +significantly larger than previous studies. The iMD4GC achieved impressive +performance with an 80.2% AUC on GastricRes, 71.4% C-index on GastricSur, and +66.1% C-index on TCGA-STAD, significantly surpassing other compared methods. + +
+
+ comment: 27 pages, 9 figures, 3 tables (under review) +
+
+
+
+
+ + ☆ BEM: Balanced and Entropy-based Mix for Long-Tailed Semi-Supervised + Learning CVPR 2024 + + +
+ Data mixing methods play a crucial role in semi-supervised learning (SSL), +but their application is unexplored in long-tailed semi-supervised learning +(LTSSL). The primary reason is that the in-batch mixing manner fails to address +class imbalance. Furthermore, existing LTSSL methods mainly focus on +re-balancing data quantity but ignore class-wise uncertainty, which is also +vital for class balance. For instance, some classes with sufficient samples +might still exhibit high uncertainty due to indistinguishable features. To this +end, this paper introduces the Balanced and Entropy-based Mix (BEM), a +pioneering mixing approach to re-balance the class distribution of both data +quantity and uncertainty. Specifically, we first propose a class balanced mix +bank to store data of each class for mixing. This bank samples data based on +the estimated quantity distribution, thus re-balancing data quantity. Then, we +present an entropy-based learning approach to re-balance class-wise +uncertainty, including entropy-based sampling strategy, entropy-based selection +module, and entropy-based class balanced loss. Our BEM first leverages data +mixing for improving LTSSL, and it can also serve as a complement to the +existing re-balancing methods. Experimental results show that BEM significantly +enhances various LTSSL frameworks and achieves state-of-the-art performances +across multiple benchmarks. + +
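A minimal sketch of the class-balanced mix bank (the entropy-based components are omitted): samples are stored per class and mixing partners are drawn with probability inversely proportional to the current class count, then blended mixup-style. The sampling rule and the blend are assumed simplifications, not the paper's exact formulation.

```python
import numpy as np

class BalancedMixBank:
    """Stores a few samples per class; rarer classes are drawn more often."""
    def __init__(self, num_classes, capacity=64):
        self.bank = {c: [] for c in range(num_classes)}
        self.capacity = capacity

    def add(self, x, y):
        self.bank[y].append(x)
        self.bank[y] = self.bank[y][-self.capacity:]           # keep most recent samples

    def sample(self, rng):
        classes = [c for c, v in self.bank.items() if v]        # non-empty classes only
        inv = np.array([1.0 / len(self.bank[c]) for c in classes])
        c = classes[rng.choice(len(classes), p=inv / inv.sum())]
        return self.bank[c][rng.integers(len(self.bank[c]))], c

def balanced_mix(x_batch, bank, rng, alpha=0.5):
    """Mixup-style blend of the incoming batch with class-balanced bank samples."""
    lam = rng.beta(alpha, alpha)
    partners = np.stack([bank.sample(rng)[0] for _ in range(len(x_batch))])
    return lam * x_batch + (1.0 - lam) * partners
```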
+
+ comment: This paper is accepted to CVPR 2024. The supplementary material is + included +
+
+
+
+
+ + ☆ SpikeMba: Multi-Modal Spiking Saliency Mamba for Temporal Video + Grounding + + +
+ Temporal video grounding (TVG) is a critical task in video content
+understanding. Despite significant advancements, existing methods are often
+limited in capturing the fine-grained relationships between multimodal inputs
+and incur high computational costs when processing long video sequences. To
+address these limitations, we introduce SpikeMba, a novel multi-modal spiking
+saliency Mamba for temporal video grounding. In our work, we integrate Spiking
+Neural Networks (SNNs) and state space models (SSMs) to capture the
+fine-grained relationships of multimodal features effectively. Specifically,
+we introduce relevant slots to enhance the model's memory capabilities,
+enabling a deeper contextual understanding of video sequences. The contextual
+moment reasoner leverages these slots to maintain a balance between contextual
+information preservation and semantic relevance exploration. Simultaneously,
+the spiking saliency detector capitalizes on the unique properties of SNNs to
+accurately locate salient proposals. Our experiments demonstrate the
+effectiveness of SpikeMba, which consistently outperforms state-of-the-art
+methods across mainstream benchmarks.
+
+
+
+
+ + ☆ Mirror-3DGS: Incorporating Mirror Reflections into 3D Gaussian Splatting + + +
+ 3D Gaussian Splatting (3DGS) has marked a significant breakthrough in the +realm of 3D scene reconstruction and novel view synthesis. However, 3DGS, much +like its predecessor Neural Radiance Fields (NeRF), struggles to accurately +model physical reflections, particularly in mirrors that are ubiquitous in +real-world scenes. This oversight mistakenly perceives reflections as separate +entities that physically exist, resulting in inaccurate reconstructions and +inconsistent reflective properties across varied viewpoints. To address this +pivotal challenge, we introduce Mirror-3DGS, an innovative rendering framework +devised to master the intricacies of mirror geometries and reflections, paving +the way for the generation of realistically depicted mirror reflections. By +ingeniously incorporating mirror attributes into the 3DGS and leveraging the +principle of plane mirror imaging, Mirror-3DGS crafts a mirrored viewpoint to +observe from behind the mirror, enriching the realism of scene renderings. +Extensive assessments, spanning both synthetic and real-world scenes, showcase +our method's ability to render novel views with enhanced fidelity in real-time, +surpassing the state-of-the-art Mirror-NeRF specifically within the challenging +mirror regions. Our code will be made publicly available for reproducible +research. + +
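The plane mirror imaging step the abstract relies on reduces to reflecting the camera across the mirror plane. A small sketch, assuming the mirror plane (normal n, offset d) is already known, is shown below; this is only the geometric primitive, not the full Mirror-3DGS pipeline.

```python
import numpy as np

def mirror_camera(R, c, n, d):
    """Reflect a camera (rotation R, center c) across the plane {x : n.x = d}."""
    n = n / np.linalg.norm(n)
    H = np.eye(3) - 2.0 * np.outer(n, n)   # Householder reflection of directions
    c_m = H @ c + 2.0 * d * n              # reflected camera center
    R_m = H @ R                            # reflected axes (det = -1; flip one axis
                                           # afterwards to restore a right-handed frame)
    return R_m, c_m

# Toy check: a camera 2 units in front of the mirror plane z = 0 maps to z = -2.
R_m, c_m = mirror_camera(np.eye(3), np.array([0.0, 0.0, 2.0]), np.array([0.0, 0.0, 1.0]), 0.0)
print(c_m)  # [ 0.  0. -2.]
```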
+
+ comment: 22 pages, 7 figures +
+
+
+
+
+ + ☆ Diagnosis of Skin Cancer Using VGG16 and VGG19 Based Transfer Learning + Models + + +
+ Today, skin cancer is considered one of the most dangerous and most common
+cancers in the world, demanding special attention. Skin cancer develops in
+different types, including melanoma, actinic keratosis, basal cell carcinoma,
+squamous cell carcinoma, and Merkel cell carcinoma. Among them, melanoma is
+particularly unpredictable. Melanoma can be diagnosed at early stages,
+increasing the possibility of successful treatment. Automatic classification
+of skin lesions is a challenging task due to the diverse forms and grades of
+the disease, demanding the implementation of novel methods. Deep convolutional
+neural networks (CNNs) have shown excellent potential for data and image
+classification. In this article, we examine the skin lesion classification
+problem using CNN techniques. Remarkably, we show that high lesion
+classification accuracy can be obtained by properly designing and applying a
+transfer learning framework to pre-trained neural networks, without any
+requirement for data enlargement procedures: VGG16 and VGG19 architectures
+pre-trained on a generic dataset are merged with a modified AlexNet network
+and then fine-tuned on a subject-specific dataset containing dermatology
+images. The convolutional neural network was trained using 2,541 images, and
+dropout was used to prevent the network from overfitting. Finally, the
+validity of the model was checked by applying K-fold cross-validation. The
+proposed model increased classification accuracy by 3% (from 94.2% to 98.18%)
+in comparison with other methods.
+
+ comment: 15 pages, journal +
+
+
+
+
+ + ☆ SyncMask: Synchronized Attentional Masking for Fashion-centric + Vision-Language Pretraining CVPR2024 + + +
+ Vision-language models (VLMs) have made significant strides in cross-modal
+understanding through large-scale paired datasets. However, in the fashion
+domain, datasets often exhibit a disparity between the information conveyed in
+images and text. This issue stems from datasets containing multiple images of
+a single fashion item all paired with one text, leading to cases where some
+textual details are not visible in individual images. This mismatch,
+particularly when non-co-occurring elements are masked, undermines the
+training of conventional VLM objectives such as Masked Language Modeling and
+Masked Image Modeling, thereby hindering the model's ability to accurately
+align fine-grained visual and textual features. Addressing this problem, we
+propose Synchronized attentional Masking (SyncMask), which generates masks
+that pinpoint the image patches and word tokens where information co-occurs in
+both image and text. This synchronization is accomplished by harnessing
+cross-attentional features obtained from a momentum model, ensuring precise
+alignment between the two modalities. Additionally, we enhance grouped batch
+sampling with semi-hard negatives, effectively mitigating false negative
+issues in Image-Text Matching and Image-Text Contrastive learning objectives
+within fashion datasets. Our experiments demonstrate the effectiveness of the
+proposed approach, outperforming existing methods in three downstream tasks.
+
+ comment: CVPR2024 Accepted +
+
+
+
+
+ + ☆ Uncovering the Text Embedding in Text-to-Image Diffusion Models + + +
+ The correspondence between input text and the generated image is opaque:
+minor textual modifications can induce substantial deviations in the generated
+image. Meanwhile, the text embedding, the pivotal intermediary between text
+and images, remains relatively underexplored. In this paper, we address this
+research gap by delving into the text embedding space, unleashing its capacity
+for controllable image editing and explicable semantic direction attributes
+within a learning-free framework. Specifically, we identify two critical
+insights regarding the importance of per-word embeddings and their contextual
+correlations within the text embedding, providing instructive principles for
+learning-free image editing. Additionally, we find that the text embedding
+inherently possesses diverse semantic potentials, and we further reveal this
+property through the lens of singular value decomposition (SVD). These
+uncovered properties offer practical utility for image editing and semantic
+discovery. More importantly, we expect that these in-depth analyses and
+findings on the text embedding can enhance the understanding of text-to-image
+diffusion models.
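Probing semantic directions with SVD can be sketched as below. The mean-centering, top-k truncation, and additive edit are assumptions about how such directions might be used; the paper's exact procedure may differ.

```python
import numpy as np

def semantic_directions(token_embeddings, top_k=3):
    """token_embeddings: (num_tokens, dim) text embedding fed to the diffusion model's
    cross-attention; returns the leading singular values and direction vectors."""
    centered = token_embeddings - token_embeddings.mean(axis=0, keepdims=True)
    _, s, vt = np.linalg.svd(centered, full_matrices=False)
    return s[:top_k], vt[:top_k]

def edit_along_direction(token_embeddings, direction, scale):
    """Shift every token embedding along a semantic direction to steer generation."""
    return token_embeddings + scale * direction[None, :]
```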
+
+
+
+
+ + ☆ Detect2Interact: Localizing Object Key Field in Visual Question + Answering (VQA) with LLMs + + +
+ Localization plays a crucial role in enhancing the practicality and precision +of VQA systems. By enabling fine-grained identification and interaction with +specific parts of an object, it significantly improves the system's ability to +provide contextually relevant and spatially accurate responses, crucial for +applications in dynamic environments like robotics and augmented reality. +However, traditional systems face challenges in accurately mapping objects +within images to generate nuanced and spatially aware responses. In this work, +we introduce "Detect2Interact", which addresses these challenges by introducing +an advanced approach for fine-grained object visual key field detection. First, +we use the segment anything model (SAM) to generate detailed spatial maps of +objects in images. Next, we use Vision Studio to extract semantic object +descriptions. Third, we employ GPT-4's common sense knowledge, bridging the gap +between an object's semantics and its spatial map. As a result, Detect2Interact +achieves consistent qualitative results on object key field detection across +extensive test cases and outperforms the existing VQA system with object +detection by providing a more reasonable and finer visual representation. + +
+
+ comment: Accepted to IEEE Intelligent Systems +
+
+
+
+
+ + ☆ Condition-Aware Neural Network for Controlled Image Generation CVPR 2024 + + +
+ We present Condition-Aware Neural Network (CAN), a new method for adding +control to image generative models. In parallel to prior conditional control +methods, CAN controls the image generation process by dynamically manipulating +the weight of the neural network. This is achieved by introducing a +condition-aware weight generation module that generates conditional weight for +convolution/linear layers based on the input condition. We test CAN on +class-conditional image generation on ImageNet and text-to-image generation on +COCO. CAN consistently delivers significant improvements for diffusion +transformer models, including DiT and UViT. In particular, CAN combined with +EfficientViT (CaT) achieves 2.78 FID on ImageNet 512x512, surpassing DiT-XL/2 +while requiring 52x fewer MACs per sampling step. + +
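Condition-aware weight generation can be sketched for a single linear layer as below. Applying it per sample with a batched matrix multiply, and using a plain linear weight generator, are assumptions for illustration rather than the paper's exact design.

```python
import torch
import torch.nn as nn

class ConditionAwareLinear(nn.Module):
    """A small hyper-module maps the condition embedding to the weight of a linear layer."""
    def __init__(self, cond_dim, in_dim, out_dim):
        super().__init__()
        self.in_dim, self.out_dim = in_dim, out_dim
        self.weight_gen = nn.Linear(cond_dim, out_dim * in_dim)

    def forward(self, x, cond):
        # x: (B, in_dim), cond: (B, cond_dim) -> per-sample weight (B, out_dim, in_dim)
        w = self.weight_gen(cond).view(-1, self.out_dim, self.in_dim)
        return torch.bmm(w, x.unsqueeze(-1)).squeeze(-1)

layer = ConditionAwareLinear(cond_dim=8, in_dim=16, out_dim=32)
print(layer(torch.randn(4, 16), torch.randn(4, 8)).shape)  # torch.Size([4, 32])
```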
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Structured Initialization for Attention in Vision Transformers + + +
+ The training of vision transformer (ViT) networks on small-scale datasets +poses a significant challenge. By contrast, convolutional neural networks +(CNNs) have an architectural inductive bias enabling them to perform well on +such problems. In this paper, we argue that the architectural bias inherent to +CNNs can be reinterpreted as an initialization bias within ViT. This insight is +significant as it empowers ViTs to perform equally well on small-scale problems +while maintaining their flexibility for large-scale applications. Our +inspiration for this ``structured'' initialization stems from our empirical +observation that random impulse filters can achieve comparable performance to +learned filters within CNNs. Our approach achieves state-of-the-art performance +for data-efficient ViT learning across numerous benchmarks including CIFAR-10, +CIFAR-100, and SVHN. + +
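The "random impulse filter" observation the argument rests on can be sketched as follows: each convolution filter is zero except for a single randomly placed unit spike and is kept frozen. The kernel size and freezing policy are assumptions; the paper's actual initialization for attention is more involved.

```python
import torch
import torch.nn as nn

def random_impulse_conv(in_ch, out_ch, k=3):
    conv = nn.Conv2d(in_ch, out_ch, k, padding=k // 2, bias=False)
    w = torch.zeros(out_ch, in_ch, k, k)
    idx = torch.randint(0, k * k, (out_ch, in_ch))
    w.view(out_ch, in_ch, -1).scatter_(2, idx.unsqueeze(-1), 1.0)   # one unit impulse per filter
    with torch.no_grad():
        conv.weight.copy_(w)
    conv.weight.requires_grad_(False)                               # keep the impulse structure fixed
    return conv

layer = random_impulse_conv(3, 8)
print(layer(torch.randn(1, 3, 32, 32)).shape)  # torch.Size([1, 8, 32, 32])
```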
+
+ comment: 20 pages, 5 figures, 8 tables +
+
+
+
+
+ + ☆ CityGaussian: Real-time High-quality Large-Scale Scene Rendering with + Gaussians + + +
+ The advancement of real-time 3D scene reconstruction and novel view synthesis
+has been significantly propelled by 3D Gaussian Splatting (3DGS). However,
+effectively training large-scale 3DGS and rendering it in real-time across
+various scales remains challenging. This paper introduces CityGaussian
+(CityGS), which employs a novel divide-and-conquer training approach and
+Level-of-Detail (LoD) strategy for efficient large-scale 3DGS training and
+rendering. Specifically, the global scene prior and adaptive training data
+selection enable efficient training and seamless fusion. Based on fused
+Gaussian primitives, we generate different detail levels through compression,
+and realize fast rendering across various scales through the proposed
+block-wise detail level selection and aggregation strategy. Extensive
+experimental results on large-scale scenes demonstrate that our approach
+attains state-of-the-art rendering quality, enabling consistent real-time
+rendering of large-scale scenes across vastly different scales. Our project
+page is available at https://dekuliutesla.github.io/citygs/.
+
+ comment: Project Page: https://dekuliutesla.github.io/citygs/ +
+
+
+
+
+ + ☆ Medical Visual Prompting (MVP): A Unified Framework for Versatile and + High-Quality Medical Image Segmentation + + +
+ Accurate segmentation of lesion regions is crucial for clinical diagnosis and +treatment across various diseases. While deep convolutional networks have +achieved satisfactory results in medical image segmentation, they face +challenges such as loss of lesion shape information due to continuous +convolution and downsampling, as well as the high cost of manually labeling +lesions with varying shapes and sizes. To address these issues, we propose a +novel medical visual prompting (MVP) framework that leverages pre-training and +prompting concepts from natural language processing (NLP). The framework +utilizes three key components: Super-Pixel Guided Prompting (SPGP) for +superpixelating the input image, Image Embedding Guided Prompting (IEGP) for +freezing patch embedding and merging with superpixels to provide visual +prompts, and Adaptive Attention Mechanism Guided Prompting (AAGP) for +pinpointing prompt content and efficiently adapting all layers. By integrating +SPGP, IEGP, and AAGP, the MVP enables the segmentation network to better learn +shape prompting information and facilitates mutual learning across different +tasks. Extensive experiments conducted on five datasets demonstrate superior +performance of this method in various challenging medical image tasks, while +simplifying single-task medical segmentation models. This novel framework +offers improved performance with fewer parameters and holds significant +potential for accurate segmentation of lesion regions in various medical tasks, +making it clinically valuable. + +
+
+
+
+
+ + ☆ CLIPtone: Unsupervised Learning for Text-based Image Tone Adjustment + + +
+ Recent image tone adjustment (or enhancement) approaches have predominantly
+adopted supervised learning for learning human-centric perceptual assessment.
+However, these approaches are constrained by intrinsic challenges of
+supervised learning. Primarily, the requirement for expertly-curated or
+retouched images escalates the data acquisition expenses. Moreover, their
+coverage of target styles is confined to stylistic variants inferred from the
+training data. To surmount the above challenges, we propose CLIPtone, an
+unsupervised learning-based approach for text-based image tone adjustment that
+extends an existing image enhancement method to accommodate natural language
+descriptions. Specifically, we design a hyper-network to adaptively modulate
+the pretrained parameters of the backbone model based on the text description.
+To assess whether the adjusted image aligns with the text description without
+a ground-truth image, we utilize CLIP, which is trained on a vast set of
+language-image pairs and thus encompasses knowledge of human perception. The
+major advantages of our approach are threefold: (i) minimal data collection
+expenses, (ii) support for a range of adjustments, and (iii) the ability to
+handle novel text descriptions unseen in training. Our approach's efficacy is
+demonstrated through comprehensive experiments, including a user study.
+
+
+
+
+ + ☆ CMT: Cross Modulation Transformer with Hybrid Loss for Pansharpening + + +
+ Pansharpening aims to enhance remote sensing image (RSI) quality by merging +high-resolution panchromatic (PAN) with multispectral (MS) images. However, +prior techniques struggled to optimally fuse PAN and MS images for enhanced +spatial and spectral information, due to a lack of a systematic framework +capable of effectively coordinating their individual strengths. In response, we +present the Cross Modulation Transformer (CMT), a pioneering method that +modifies the attention mechanism. This approach utilizes a robust modulation +technique from signal processing, integrating it into the attention mechanism's +calculations. It dynamically tunes the weights of the carrier's value (V) +matrix according to the modulator's features, thus resolving historical +challenges and achieving a seamless integration of spatial and spectral +attributes. Furthermore, considering that RSI exhibits large-scale features and +edge details along with local textures, we crafted a hybrid loss function that +combines Fourier and wavelet transforms to effectively capture these +characteristics, thereby enhancing both spatial and spectral accuracy in +pansharpening. Extensive experiments demonstrate our framework's superior +performance over existing state-of-the-art methods. The code will be publicly +available to encourage further research. + +
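The modulation of the value matrix described above can be sketched in a few lines. The exact modulation operator is not given in the abstract, so the per-token, per-channel gain below is an interpretive assumption, not the paper's formulation.

```python
# Toy sketch of a "cross-modulated" attention step, assuming the modulator
# branch (e.g., PAN features) rescales the value (V) matrix of the carrier
# branch (e.g., MS features) before standard scaled dot-product attention.
import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def cross_modulated_attention(q, k, v, modulator):
    # Modulation: per-token, per-channel gain derived from the modulator features.
    gain = 1.0 + np.tanh(modulator)        # keeps the gain positive around 1
    v_mod = v * gain
    attn = softmax(q @ k.T / np.sqrt(q.shape[-1]))
    return attn @ v_mod

rng = np.random.default_rng(0)
n, d = 16, 32                               # tokens, channels (illustrative)
q, k, v, m = (rng.normal(size=(n, d)) for _ in range(4))
print(cross_modulated_attention(q, k, v, m).shape)   # (16, 32)
```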
+
+
+
+
+ + ☆ Motion Blur Decomposition with Cross-shutter Guidance CVPR 2024 + + +
+ Motion blur is a frequently observed image artifact, especially under +insufficient illumination where exposure time has to be prolonged so as to +collect more photons for a bright enough image. Rather than simply removing +such blurring effects, recent research has aimed at decomposing a blurry +image into multiple sharp images with spatial and temporal coherence. Since +motion blur decomposition itself is highly ambiguous, priors from neighbouring +frames or human annotation are usually needed for motion disambiguation. In +this paper, inspired by the complementary exposure characteristics of a global +shutter (GS) camera and a rolling shutter (RS) camera, we propose to utilize +the ordered scanline-wise delay in a rolling shutter image to robustify motion +decomposition of a single blurry image. To evaluate this novel dual imaging +setting, we construct a triaxial system to collect realistic data, as well as a +deep network architecture that explicitly addresses temporal and contextual +information through reciprocal branches for cross-shutter motion blur +decomposition. Experimental results have verified the effectiveness of our +proposed algorithm, as well as the validity of our dual imaging setting. + +

+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Diffusion based Zero-shot Medical Image-to-Image Translation for Cross + Modality Segmentation + + +
+ Cross-modality image segmentation aims to segment the target modalities using +a method designed in the source modality. Deep generative models can translate +the target modality images into the source modality, thus enabling +cross-modality segmentation. However, a vast body of existing cross-modality +image translation methods relies on supervised learning. In this work, we aim +to address the challenge of zero-shot learning-based image translation tasks +(an extreme scenario in which the target modality is unseen during the training phase). To +leverage generative learning for zero-shot cross-modality image segmentation, +we propose a novel unsupervised image translation method. The framework learns +to translate the unseen source image to the target modality for image +segmentation by leveraging the inherent statistical consistency between +different modalities for diffusion guidance. Our framework captures identical +cross-modality features in the statistical domain, offering diffusion guidance +without relying on direct mappings between the source and target domains. This +advantage allows our method to adapt to changing source domains without the +need for retraining, making it highly practical when sufficient labeled source +domain data is not available. The proposed framework is validated in zero-shot +cross-modality image segmentation tasks through empirical comparisons with +influential generative models, including adversarial-based and diffusion-based +models. + +

+
+ comment: NeurIPS 2023 Diffusion Workshop +

+
+
+
+
+ + ☆ UFID: A Unified Framework for Input-level Backdoor Detection on + Diffusion Models + + +
+ Diffusion Models are vulnerable to backdoor attacks, where malicious +attackers inject backdoors by poisoning some parts of the training samples +during the training stage. This poses a serious threat to the downstream users, +who query the diffusion models through the API or directly download them from +the internet. To mitigate the threat of backdoor attacks, there have been a +plethora of investigations on backdoor detections. However, none of them +designed a specialized backdoor detection method for diffusion models, +rendering the area much under-explored. Moreover, these prior methods mainly +focus on the traditional neural networks in the classification task, which +cannot be adapted to the backdoor detections on the generative task easily. +Additionally, most of the prior methods require white-box access to model +weights and architectures, or the probability logits as additional information, +which are not always practical. In this paper, we propose a Unified Framework +for Input-level backdoor Detection (UFID) on the diffusion models, which is +motivated by observations in the diffusion models and further validated with a +theoretical causality analysis. Extensive experiments across different datasets +on both conditional and unconditional diffusion models show that our method +achieves a superb performance on detection effectiveness and run-time +efficiency. The code is available at +https://github.com/GuanZihan/official_UFID. + +
+
+ comment: 20 pages, 18 figures +

+
+
+
+
+ + ☆ HairFastGAN: Realistic and Robust Hair Transfer with a Fast + Encoder-Based Approach + + +
+ Our paper addresses the complex task of transferring a hairstyle from a +reference image to an input photo for virtual hair try-on. This task is +challenging due to the need to adapt to various photo poses, the sensitivity of +hairstyles, and the lack of objective metrics. Current state-of-the-art +hairstyle transfer methods use an optimization process for different parts of +the approach, making them inexcusably slow. At the same time, faster +encoder-based models are of very low quality because they either operate in +StyleGAN's W+ space or use other low-dimensional image generators. +Additionally, both approaches have a problem with hairstyle transfer when the +source pose is very different from the target pose, because they either don't +consider the pose at all or deal with it inefficiently. In our paper, we +present the HairFast model, which uniquely solves these problems and achieves +high resolution, near real-time performance, and superior reconstruction +compared to optimization problem-based methods. Our solution includes a new +architecture operating in the FS latent space of StyleGAN, an enhanced +inpainting approach, and improved encoders for better alignment, color +transfer, and a new encoder for post-processing. The effectiveness of our +approach is demonstrated on realism metrics after random hairstyle transfer and +reconstruction when the original hairstyle is transferred. In the most +difficult scenario of transferring both shape and color of a hairstyle from +different images, our method performs in less than a second on the Nvidia V100. +Our code is available at https://github.com/AIRI-Institute/HairFastGAN. + +

+
+
+
+
+ + ☆ Texture-Preserving Diffusion Models for High-Fidelity Virtual Try-On CVPR 2024 + + +
+ Image-based virtual try-on is an increasingly important task for online +shopping. It aims to synthesize images of a specific person wearing a specified +garment. Diffusion model-based approaches have recently become popular, as they +are excellent at image synthesis tasks. However, these approaches usually +employ additional image encoders and rely on the cross-attention mechanism for +texture transfer from the garment to the person image, which affects the +try-on's efficiency and fidelity. To address these issues, we propose a +Texture-Preserving Diffusion (TPD) model for virtual try-on, which enhances the +fidelity of the results and introduces no additional image encoders. +Accordingly, we make contributions from two aspects. First, we propose to +concatenate the masked person and reference garment images along the spatial +dimension and utilize the resulting image as the input for the diffusion +model's denoising UNet. This enables the original self-attention layers +contained in the diffusion model to achieve efficient and accurate texture +transfer. Second, we propose a novel diffusion-based method that predicts a +precise inpainting mask based on the person and reference garment images, +further enhancing the reliability of the try-on results. In addition, we +integrate mask prediction and image synthesis into a single compact model. The +experimental results show that our approach can be applied to various try-on +tasks, e.g., garment-to-person and person-to-person try-ons, and significantly +outperforms state-of-the-art methods on the popular VITON and VITON-HD databases. + +

+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ PhysReaction: Physically Plausible Real-Time Humanoid Reaction Synthesis + via Forward Dynamics Guided 4D Imitation + + +
+ Humanoid Reaction Synthesis is pivotal for creating highly interactive and +empathetic robots that can seamlessly integrate into human environments, +enhancing the way we live, work, and communicate. However, it is difficult to +learn the diverse interaction patterns of multiple humans and generate +physically plausible reactions. The kinematics-based approaches face +challenges, including issues like floating feet, sliding, penetration, and +other problems that defy physical plausibility. The existing physics-based +method often relies on kinematics-based methods to generate reference states, +which struggle with the challenges posed by kinematic noise during action +execution. Constrained by their reliance on diffusion models, these methods are +unable to achieve real-time inference. In this work, we propose a Forward +Dynamics Guided 4D Imitation method to generate physically plausible human-like +reactions. The learned policy is capable of generating physically plausible and +human-like reactions in real-time, significantly improving the speed (x33) and +quality of reactions compared with the existing method. Our experiments on the +InterHuman and Chi3D datasets, along with ablation studies, demonstrate the +effectiveness of our approach. + +

+
+
+
+
+ + ☆ Stale Diffusion: Hyper-realistic 5D Movie Generation Using Old-school + Methods + + +
+ Two years ago, Stable Diffusion achieved super-human performance at +generating images with super-human numbers of fingers. Following the steady +decline of its technical novelty, we propose Stale Diffusion, a method that +solidifies and ossifies Stable Diffusion in a maximum-entropy state. Stable +Diffusion works analogously to a barn (the Stable) from which an infinite set +of horses have escaped (the Diffusion). As the horses have long left the barn, +our proposal may be seen as antiquated and irrelevant. Nevertheless, we +vigorously defend our claim of novelty by identifying as early adopters of the +Slow Science Movement, which will produce extremely important pearls of wisdom +in the future. Our speed of contributions can also be seen as a quasi-static +implementation of the recent call to pause AI experiments, which we +wholeheartedly support. As a result of a careful archaeological expedition to +18-months-old Git commit histories, we found that naturally-accumulating errors +have produced a novel entropy-maximising Stale Diffusion method, that can +produce sleep-inducing hyper-realistic 5D video that is as good as one's +imagination. + +
+
+ comment: SIGBOVIK 2024 +
+
+
+
+
+ + ☆ Prompt Learning for Oriented Power Transmission Tower Detection in + High-Resolution SAR Images + + +
+ Detecting transmission towers from synthetic aperture radar (SAR) images +remains a challenging task due to the comparatively small size and side-looking +geometry, with background clutter interference frequently hindering tower +identification. A large number of interfering signals are superimposed on the return +signal from the tower. We found that localizing or prompting positions of power +transmission towers is beneficial to address this obstacle. Based on this +revelation, this paper introduces prompt learning into the oriented object +detector (P2Det) for multimodal information learning. P2Det contains the sparse +prompt coding and cross-attention between the multimodal data. Specifically, +the sparse prompt encoder (SPE) is proposed to represent point locations, +converting prompts into sparse embeddings. The image embeddings are generated +through the Transformer layers. Then a two-way fusion module (TWFM) is proposed +to calculate the cross-attention of the two different embeddings. The +interaction of image-level and prompt-level features is utilized to address the +clutter interference. A shape-adaptive refinement module (SARM) is proposed to +reduce the effect of aspect ratio. Extensive experiments demonstrate the +effectiveness of the proposed model on high-resolution SAR images. P2Det +provides a novel insight for multimodal object detection due to its competitive +performance. + +

+
+ comment: 22 pages, 12 figures +

+
+
+
+
+ + ☆ T-Mamba: Frequency-Enhanced Gated Long-Range Dependency for Tooth 3D + CBCT Segmentation + + +
+ Efficient tooth segmentation in three-dimensional (3D) imaging, critical for +orthodontic diagnosis, remains challenging due to noise, low contrast, and +artifacts in CBCT images. Both convolutional neural networks (CNNs) and +transformers have emerged as popular architectures for image segmentation. +However, their efficacy in handling long-range dependencies is limited due to +inherent locality or computational complexity. To address this issue, we +propose T-Mamba, integrating shared positional encoding and frequency-based +features into vision mamba, to address limitations in spatial position +preservation and feature enhancement in the frequency domain. Besides, we also +design a gate selection unit to integrate two features in the spatial domain and +one feature in the frequency domain adaptively. T-Mamba is the first work to +introduce frequency-based features into vision mamba. Extensive experiments +demonstrate that T-Mamba achieves new SOTA results on the public Tooth CBCT +dataset and outperforms previous SOTA methods by a large margin, i.e., IoU +3.63%, SO +2.43%, DSC +2.30%, HD -4.39 mm, and ASSD -0.37 mm. The code and +models are publicly available at https://github.com/isbrycee/T-Mamba. + +

+
+
+
+
+ + ☆ Roadside Monocular 3D Detection via 2D Detection Prompting + + +
+ The problem of roadside monocular 3D detection requires detecting objects of +the classes of interest in a 2D RGB frame and predicting their 3D information such +as locations in bird's-eye-view (BEV). It has broad applications in traffic +control, vehicle-vehicle communication, and vehicle-infrastructure cooperative +perception. To approach this problem, we present a novel and simple method by +prompting the 3D detector using 2D detections. Our method builds on a key +insight that, compared with 3D detectors, a 2D detector is much easier to train +and performs significantly better w.r.t. detections on the 2D image plane. +Therefore, one can exploit the 2D detections of a well-trained 2D detector as prompts to +a 3D detector, which is trained to inflate such 2D detections into 3D for 3D +detection. To construct better prompts using the 2D detector, we +explore three techniques: (a) concatenating both 2D and 3D detectors' features, +(b) attentively fusing 2D and 3D detectors' features, and (c) encoding +predicted 2D boxes x, y, width, height, label and attentively fusing such with +the 3D detector's features. Surprisingly, the third performs the best. +Moreover, we present a yaw tuning tactic and a class-grouping strategy that +merges classes based on their functionality; these techniques improve 3D +detection performance further. Comprehensive ablation studies and extensive +experiments demonstrate that our method resoundingly outperforms prior works, +achieving the state-of-the-art on two large-scale roadside 3D detection +benchmarks. + +

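The best-performing prompt construction (encoding predicted 2D boxes and fusing them attentively) can be sketched as follows. The dimensions, the linear box encoder, and the single-head cross-attention are illustrative assumptions, not the paper's exact design.

```python
# Sketch: encode (x, y, w, h, label) box prompts and fuse them with 3D
# detector features via cross-attention. All shapes are hypothetical.
import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

rng = np.random.default_rng(0)
d, n_boxes, n_tokens, n_classes = 64, 8, 100, 10

W_box = rng.normal(scale=0.1, size=(d, 4 + n_classes))   # box-prompt encoder
feats_3d = rng.normal(size=(n_tokens, d))                # 3D detector features

def encode_boxes(boxes_xywh, labels):
    one_hot = np.eye(n_classes)[labels]
    return np.concatenate([boxes_xywh, one_hot], axis=1) @ W_box.T  # (n_boxes, d)

def fuse(feats, prompts):
    attn = softmax(feats @ prompts.T / np.sqrt(d))        # cross-attention weights
    return feats + attn @ prompts                         # residual fusion

boxes = rng.uniform(0, 1, size=(n_boxes, 4))
labels = rng.integers(0, n_classes, size=n_boxes)
print(fuse(feats_3d, encode_boxes(boxes, labels)).shape)  # (100, 64)
```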
+
+
+
+
+ + ☆ HAHA: Highly Articulated Gaussian Human Avatars with Textured Mesh Prior + + +
+ We present HAHA - a novel approach for animatable human avatar generation +from monocular input videos. The proposed method relies on learning the +trade-off between the use of Gaussian splatting and a textured mesh for +efficient and high fidelity rendering. We demonstrate its efficiency to animate +and render full-body human avatars controlled via the SMPL-X parametric model. +Our model learns to apply Gaussian splatting only in areas of the SMPL-X mesh +where it is necessary, like hair and out-of-mesh clothing. This results in a +minimal number of Gaussians being used to represent the full avatar, and +reduced rendering artifacts. This allows us to handle the animation of small +body parts such as fingers that are traditionally disregarded. We demonstrate +the effectiveness of our approach on two open datasets: SnapshotPeople and +X-Humans. Our method demonstrates on par reconstruction quality to the +state-of-the-art on SnapshotPeople, while using less than a third of Gaussians. +HAHA outperforms previous state-of-the-art on novel poses from X-Humans both +quantitatively and qualitatively. + +
+
+
+
+
+ + ☆ Action Detection via an Image Diffusion Process CVPR 2024 + + +
+ Action detection aims to localize the starting and ending points of action +instances in untrimmed videos, and predict the classes of those instances. In +this paper, we make the observation that the outputs of the action detection +task can be formulated as images. Thus, from a novel perspective, we tackle +action detection via a three-image generation process to generate starting +point, ending point and action-class predictions as images via our proposed +Action Detection Image Diffusion (ADI-Diff) framework. Furthermore, since our +images differ from natural images and exhibit special properties, we further +explore a Discrete Action-Detection Diffusion Process and a Row-Column +Transformer design to better handle their processing. Our ADI-Diff framework +achieves state-of-the-art results on two widely-used datasets. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Drag Your Noise: Interactive Point-based Editing via Diffusion Semantic + Propagation CVPR 2024 + + +
+ Point-based interactive editing serves as an essential tool to complement the +controllability of existing generative models. A concurrent work, +DragDiffusion, updates the diffusion latent map in response to user inputs, +causing global latent map alterations. This results in imprecise preservation +of the original content and unsuccessful editing due to gradient vanishing. In +contrast, we present DragNoise, offering robust and accelerated editing without +retracing the latent map. The core rationale of DragNoise lies in utilizing the +predicted noise output of each U-Net as a semantic editor. This approach is +grounded in two critical observations: firstly, the bottleneck features of +U-Net inherently possess semantically rich features ideal for interactive +editing; secondly, high-level semantics, established early in the denoising +process, show minimal variation in subsequent stages. Leveraging these +insights, DragNoise edits diffusion semantics in a single denoising step and +efficiently propagates these changes, ensuring stability and efficiency in +diffusion editing. Comparative experiments reveal that DragNoise achieves +superior control and semantic retention, reducing the optimization time by over +50% compared to DragDiffusion. Our codes are available at +https://github.com/haofengl/DragNoise. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Higher education assessment practice in the era of generative AI tools + + +
+ The higher education (HE) sector benefits every nation's economy and society +at large. However, its contributions are challenged by advanced technologies +like generative artificial intelligence (GenAI) tools. In this paper, we +provide a comprehensive assessment of GenAI tools towards assessment and +pedagogic practice and, subsequently, discuss the potential impacts. This study +experimented with three assessment instruments from data science, data +analytics, and construction management disciplines. Our findings are twofold: +first, GenAI tools exhibit subject knowledge, +problem-solving, analytical, critical thinking, and presentation skills and +thus can limit learning when used unethically. Second, the assessment design +of certain disciplines revealed the limitations of the GenAI tools. +Based on our findings, we made recommendations on how AI tools can be utilised +for teaching and learning in HE. + +

+
+ comment: 11 pages, 7 tables; published in the Journal of Applied Learning & + Teaching +

+
+
+
+
+ + ☆ AIGCOIQA2024: Perceptual Quality Assessment of AI Generated + Omnidirectional Images + + +
+ In recent years, the rapid advancement of Artificial Intelligence Generated +Content (AIGC) has attracted widespread attention. Among the AIGC, AI-generated +omnidirectional images hold significant potential for Virtual Reality (VR) and +Augmented Reality (AR) applications, hence omnidirectional AIGC techniques have +also been widely studied. AI-generated omnidirectional images exhibit unique +distortions compared to natural omnidirectional images; however, there are no +dedicated Image Quality Assessment (IQA) criteria for assessing them. This +study addresses this gap by establishing a large-scale AI-generated +omnidirectional image IQA database named AIGCOIQA2024 and constructing a +comprehensive benchmark. We first generate 300 omnidirectional images based on +5 AIGC models utilizing 25 text prompts. A subjective IQA experiment is +conducted subsequently to assess human visual preferences from three +perspectives including quality, comfortability, and correspondence. Finally, we +conduct a benchmark experiment to evaluate the performance of state-of-the-art +IQA models on our database. The database will be released to facilitate future +research. + +

+
+
+
+
+ + ☆ Harnessing Large Language Models for Training-free Video Anomaly + Detection CVPR 2024 + + +
+ Video anomaly detection (VAD) aims to temporally locate abnormal events in a +video. Existing works mostly rely on training deep models to learn the +distribution of normality with either video-level supervision, one-class +supervision, or in an unsupervised setting. Training-based methods are prone to +be domain-specific, thus being costly for practical deployment as any domain +change will involve data collection and model training. In this paper, we +radically depart from previous efforts and propose LAnguage-based VAD (LAVAD), +a method tackling VAD in a novel, training-free paradigm, exploiting the +capabilities of pre-trained large language models (LLMs) and existing +vision-language models (VLMs). We leverage VLM-based captioning models to +generate textual descriptions for each frame of any test video. With the +textual scene description, we then devise a prompting mechanism to unlock the +capability of LLMs in terms of temporal aggregation and anomaly score +estimation, turning LLMs into an effective video anomaly detector. We further +leverage modality-aligned VLMs and propose effective techniques based on +cross-modal similarity for cleaning noisy captions and refining the LLM-based +anomaly scores. We evaluate LAVAD on two large datasets featuring real-world +surveillance scenarios (UCF-Crime and XD-Violence), showing that it outperforms +both unsupervised and one-class methods without requiring any training or data +collection. + +
+
+ comment: CVPR 2024. Project website at https://lucazanella.github.io/lavad/ +
+
+
+
+
+ + ☆ Teeth-SEG: An Efficient Instance Segmentation Framework for Orthodontic + Treatment based on Anthropic Prior Knowledge CVPR 2024 + + +
+ Teeth localization, segmentation, and labeling in 2D images have great +potential in modern dentistry to enhance dental diagnostics, treatment +planning, and population-based studies on oral health. However, general +instance segmentation frameworks are incompetent due to 1) the subtle +differences between some teeth' shapes (e.g., maxillary first premolar and +second premolar), 2) the teeth's position and shape variation across subjects, +and 3) the presence of abnormalities in the dentition (e.g., caries and +edentulism). To address these problems, we propose a ViT-based framework named +TeethSEG, which consists of stacked Multi-Scale Aggregation (MSA) blocks and an +Anthropic Prior Knowledge (APK) layer. Specifically, to compose the two +modules, we design 1) a unique permutation-based upscaler to ensure high +efficiency while establishing clear segmentation boundaries with 2) multi-head +self/cross-gating layers to emphasize particular semantics meanwhile +maintaining the divergence between token embeddings. Besides, we collect 3) the +first open-sourced intraoral image dataset IO150K, which comprises over 150k +intraoral photos, and all photos are annotated by orthodontists using a +human-machine hybrid algorithm. Experiments on IO150K demonstrate that our +TeethSEG outperforms the state-of-the-art segmentation models on dental image +segmentation. + +
+
+ comment: This paper has been accepted by CVPR 2024 +
+
+
+
+
+ + ☆ AMOR: Ambiguous Authorship Order + + +
+ As we all know, writing scientific papers together with our beloved +colleagues is a truly remarkable experience (partially): endless discussions +about the same useless paragraph over and over again, followed by long days and +long nights -- both at the same time. What a wonderful ride it is! What a +beautiful life we have. But wait, there's one tiny little problem that utterly +shatters the peace, turning even renowned scientists into bloodthirsty +monsters: author order. The reason is that, contrary to widespread opinion, +it's not the font size that matters, but the way things are ordered. Of course, +this is a fairly well-known fact among scientists all across the planet (and +beyond) and explains clearly why we regularly have to read about yet another +escalated paper submission in local police reports. + In this paper, we take an important step backwards to tackle this issue by +solving the so-called author ordering problem (AOP) once and for all. +Specifically, we propose AMOR, a system that replaces silly constructs like +co-first or co-middle authorship with a simple yet easy probabilistic approach +based on random shuffling of the author list at viewing time. In addition to +AOP, we also solve the ambiguous author ordering citation problem (AAOCP) on +the fly. Stop author violence, be human. + +

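Since the proposed system amounts to shuffling the author list at viewing time, a minimal sketch is easy to give (author names are placeholders):

```python
# Minimal sketch of the AMOR viewing-time shuffle described above.
import random

def amor_view(authors, seed=None):
    """Return a freshly shuffled author order for this particular viewing."""
    order = list(authors)
    random.Random(seed).shuffle(order)
    return order

print(amor_view(["Author A", "Author B", "Author C"]))
```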
+
+ comment: SIGBOVIK '24 submission +
+
+
+
+
+ + ☆ SGCNeRF: Few-Shot Neural Rendering via Sparse Geometric Consistency + Guidance + + +
+ Neural Radiance Field (NeRF) technology has made significant strides in +creating novel viewpoints. However, its effectiveness is hampered when working +with sparsely available views, often leading to performance dips due to +overfitting. FreeNeRF attempts to overcome this limitation by integrating +implicit geometry regularization, which incrementally improves both geometry +and textures. Nonetheless, an initial low positional encoding bandwidth results +in the exclusion of high-frequency elements. The quest for a holistic approach +that simultaneously addresses overfitting and the preservation of +high-frequency details remains ongoing. This study introduces a novel feature +matching based sparse geometry regularization module. This module excels in +pinpointing high-frequency keypoints, thereby safeguarding the integrity of +fine details. Through progressive refinement of geometry and textures across +NeRF iterations, we unveil an effective few-shot neural rendering architecture, +designated as SGCNeRF, for enhanced novel view synthesis. Our experiments +demonstrate that SGCNeRF not only achieves superior geometry-consistent +outcomes but also surpasses FreeNeRF, with improvements of 0.7 dB and 0.6 dB in +PSNR on the LLFF and DTU datasets, respectively. + +
+
+
+
+
+ + ☆ 360+x: A Panoptic Multi-modal Scene Understanding Dataset + + +
+ Human perception of the world is shaped by a multitude of viewpoints and +modalities. While many existing datasets focus on scene understanding from a +certain perspective (e.g. egocentric or third-person views), our dataset offers +a panoptic perspective (i.e. multiple viewpoints with multiple data +modalities). Specifically, we encapsulate third-person panoramic and front +views, as well as egocentric monocular/binocular views with rich modalities +including video, multi-channel audio, directional binaural delay, location data +and textual scene descriptions within each scene captured, presenting +comprehensive observation of the world. Figure 1 offers a glimpse of all 28 +scene categories of our 360+x dataset. To the best of our knowledge, this is +the first database that covers multiple viewpoints with multiple data +modalities to mimic how daily information is accessed in the real world. +Through our benchmark analysis, we presented 5 different scene understanding +tasks on the proposed 360+x dataset to evaluate the impact and benefit of each +data modality and perspective in panoptic scene understanding. We hope this +unique dataset could broaden the scope of comprehensive scene understanding and +encourage the community to approach these problems from more diverse +perspectives. + +
+
+ comment: To access the public dataset, please visit + https://x360dataset.github.io +
+
+
+
+
+ + ☆ FlexiDreamer: Single Image-to-3D Generation with FlexiCubes + + +
+ 3D content generation from text prompts or single images has made remarkable +progress in quality and speed recently. One of its dominant paradigms involves +generating consistent multi-view images followed by a sparse-view +reconstruction. However, due to the challenge of directly deforming the mesh +representation to approach the target topology, most methodologies learn an +implicit representation (such as NeRF) during the sparse-view reconstruction +and acquire the target mesh by a post-processing extraction. Although the +implicit representation can effectively model rich 3D information, its training +typically entails a long convergence time. In addition, the post-extraction +operation from the implicit field also leads to undesirable visual artifacts. +In this paper, we propose FlexiDreamer, a novel single image-to-3d generation +framework that reconstructs the target mesh in an end-to-end manner. By +leveraging a flexible gradient-based extraction known as FlexiCubes, our method +circumvents the defects brought by the post-processing and facilitates a direct +acquisition of the target mesh. Furthermore, we incorporate a multi-resolution +hash grid encoding scheme that progressively activates the encoding levels into +the implicit field in FlexiCubes to help capture geometric details for per-step +optimization. Notably, FlexiDreamer recovers a dense 3D structure from a +single-view image in approximately 1 minute on a single NVIDIA A100 GPU, +outperforming previous methodologies by a large margin. + +
+
+ comment: project page:https://flexidreamer.github.io +
+
+
+
+
+ + ☆ Make Continual Learning Stronger via C-Flat + + +
+ The ability of a model to generalize while incrementally acquiring dynamically +updating knowledge from sequentially arriving tasks is crucial for tackling the +sensitivity-stability dilemma in Continual Learning (CL). Minimizing the sharpness +of the weight loss landscape, i.e., seeking flat minima that lie in neighborhoods +with uniformly low loss or smooth gradients, has proven to be a strong training +regime that improves model generalization compared with loss-minimization-based +optimizers such as SGD. Yet only a few works have discussed this training regime +for CL, showing that a dedicated zeroth-order sharpness optimizer can improve CL +performance. In this work, we propose a Continual Flatness (C-Flat) method +featuring a flatter loss landscape tailored for CL. C-Flat can be invoked with +only one line of code and is plug-and-play with any CL method. A +general framework of C-Flat applied to all CL categories and a thorough +comparison with loss-minima optimizers and flat-minima-based CL approaches is +presented in this paper, showing that our method can boost CL performance in +almost all cases. Code will be publicly available upon publication. + +

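The abstract does not spell out C-Flat's update rule, so the sketch below shows a generic sharpness-aware (SAM-style) step on a toy objective only to illustrate what "seeking flat minima" means: perturb the weights toward higher loss, take the gradient there, then update the original weights with that gradient. The toy function and hyperparameters are assumptions.

```python
# Generic sharpness-aware step on a toy objective (not C-Flat's actual rule).
import numpy as np

def loss(w):            # toy objective (stand-in for a CL training loss)
    return 0.5 * np.sum(w ** 2) + np.sin(5 * w).sum()

def grad(w):            # its analytic gradient
    return w + 5 * np.cos(5 * w)

def sharpness_aware_step(w, lr=0.05, rho=0.05):
    g = grad(w)
    eps = rho * g / (np.linalg.norm(g) + 1e-12)   # ascend to the worst nearby point
    g_flat = grad(w + eps)                        # gradient at the perturbed point
    return w - lr * g_flat                        # update the unperturbed weights

w = np.array([2.0, -1.5, 0.7])
for _ in range(200):
    w = sharpness_aware_step(w)
print(w, loss(w))
```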
+
+
+
+
+ + ☆ CAMO: Correlation-Aware Mask Optimization with Modulated Reinforcement + Learning + + +
+ Optical proximity correction (OPC) is a vital step to ensure printability in +modern VLSI manufacturing. Various OPC approaches based on machine learning +have been proposed to pursue performance and efficiency, which are typically +data-driven and hardly involve any particular considerations of the OPC +problem, leading to potential performance or efficiency bottlenecks. In this +paper, we propose CAMO, a reinforcement learning-based OPC system that +specifically integrates important principles of the OPC problem. CAMO +explicitly involves the spatial correlation among the movements of neighboring +segments and an OPC-inspired modulation for movement action selection. +Experiments are conducted on both via layer patterns and metal layer patterns. +The results demonstrate that CAMO outperforms state-of-the-art OPC engines from +both academia and industry. + +
+
+ comment: Accepted by DAC 2024 +
+
+
+
+
+ + ☆ PDF: A Probability-Driven Framework for Open World 3D Point Cloud + Semantic Segmentation + + +
+ Existing point cloud semantic segmentation networks cannot identify unknown +classes and update their knowledge, due to a closed-set and static perspective +of the real world, which would induce the intelligent agent to make bad +decisions. To address this problem, we propose a Probability-Driven Framework +(PDF) for open world semantic segmentation that includes (i) a lightweight +U-decoder branch to identify unknown classes by estimating the uncertainties, +(ii) a flexible pseudo-labeling scheme to supply geometry features along with +probability distribution features of unknown classes by generating pseudo +labels, and (iii) an incremental knowledge distillation strategy to incorporate +novel classes into the existing knowledge base gradually. Our framework enables +the model to behave like human beings, which could recognize unknown objects +and incrementally learn them with the corresponding knowledge. Experimental +results on the S3DIS and ScanNetv2 datasets demonstrate that the proposed PDF +outperforms other methods by a large margin in both important tasks of open +world semantic segmentation. + +
+
+
+
+
+ + ☆ Improving Visual Recognition with Hyperbolical Visual Hierarchy Mapping CVPR 2024 + + +
+ Visual scenes are naturally organized in a hierarchy, where a coarse semantic +is recursively comprised of several fine details. Exploring such a visual +hierarchy is crucial to recognize the complex relations of visual elements, +leading to a comprehensive scene understanding. In this paper, we propose a +Visual Hierarchy Mapper (Hi-Mapper), a novel approach for enhancing the +structured understanding of the pre-trained Deep Neural Networks (DNNs). +Hi-Mapper investigates the hierarchical organization of the visual scene by 1) +pre-defining a hierarchy tree through the encapsulation of probability +densities; and 2) learning the hierarchical relations in hyperbolic space with +a novel hierarchical contrastive loss. The pre-defined hierarchy tree +recursively interacts with the visual features of the pre-trained DNNs through +hierarchy decomposition and encoding procedures, thereby effectively +identifying the visual hierarchy and enhancing the recognition of an entire +scene. Extensive experiments demonstrate that Hi-Mapper significantly enhances +the representation capability of DNNs, leading to an improved performance on +various tasks, including image classification and dense prediction tasks. + +
+
+ comment: This paper is accepted to CVPR 2024. The supplementary material is + included. The code is available at + \url{https://github.com/kwonjunn01/Hi-Mapper} +
+
+
+
+
+ + ☆ VideoDistill: Language-aware Vision Distillation for Video Question + Answering CVPR2024 + + +
+ Significant advancements in video question answering (VideoQA) have been made +thanks to thriving large image-language pretraining frameworks. Although these +image-language models can efficiently represent both video and language +branches, they typically employ a goal-free vision perception process and do +not interact vision with language well during the answer generation, thus +omitting crucial visual cues. In this paper, we are inspired by the human +recognition and learning pattern and propose VideoDistill, a framework with +language-aware (i.e., goal-driven) behavior in both vision perception and +answer generation process. VideoDistill generates answers only from +question-related visual embeddings and follows a thinking-observing-answering +approach that closely resembles human behavior, distinguishing it from previous +research. Specifically, we develop a language-aware gating mechanism to replace +the standard cross-attention, avoiding language's direct fusion into visual +representations. We incorporate this mechanism into two key components of the +entire framework. The first component is a differentiable sparse sampling +module, which selects frames containing the necessary dynamics and semantics +relevant to the questions. The second component is a vision refinement module +that merges existing spatial-temporal attention layers to ensure the extraction +of multi-grained visual semantics associated with the questions. We conduct +experimental evaluations on various challenging video question-answering +benchmarks, and VideoDistill achieves state-of-the-art performance in both +general and long-form VideoQA datasets. In Addition, we verify that +VideoDistill can effectively alleviate the utilization of language shortcut +solutions in the EgoTaskQA dataset. + +
+
+ comment: This paper is accepted by CVPR2024 +
+
+
+
+
+ + ☆ S2RC-GCN: A Spatial-Spectral Reliable Contrastive Graph Convolutional + Network for Complex Land Cover Classification Using Hyperspectral Images IJCNN 2024 + + +
+ Spatial correlations between different ground objects are an important +feature of mining land cover research. Graph Convolutional Networks (GCNs) can +effectively capture such spatial feature representations and have demonstrated +promising results in performing hyperspectral imagery (HSI) classification +tasks of complex land. However, the existing GCN-based HSI classification +methods are prone to interference from redundant information when extracting +complex features. To classify complex scenes more effectively, this study +proposes a novel spatial-spectral reliable contrastive graph convolutional +classification framework named S2RC-GCN. Specifically, we fused the spectral +and spatial features extracted by the 1D- and 2D-encoder, and the 2D-encoder +includes an attention model to automatically extract important information. We +then leveraged the fused high-level features to construct graphs and fed the +resulting graphs into the GCNs to determine more effective graph +representations. Furthermore, a novel reliable contrastive graph convolution +was proposed for reliable contrastive learning to learn and fuse robust +features. Finally, to test the performance of the model on complex object +classification, we used imagery taken by Gaofen-5 in the Jiang Xia area to +construct complex land cover datasets. The test results show that compared with +other models, our model achieved the best results and effectively improved the +classification performance of complex remote sensing imagery. + +
+
+ comment: Accepted to IJCNN 2024 (International Joint Conference on Neural + Networks) +
+
+
+
+
+ + ☆ Equivariant Local Reference Frames for Unsupervised Non-rigid Point + Cloud Shape Correspondence + + +
+ Unsupervised non-rigid point cloud shape correspondence underpins a multitude +of 3D vision tasks, yet itself is non-trivial given the exponential complexity +stemming from inter-point degree-of-freedom, i.e., pose transformations. Based +on the assumption of local rigidity, one solution for reducing complexity is to +decompose the overall shape into independent local regions using Local +Reference Frames (LRFs) that are invariant to SE(3) transformations. However, +the focus solely on local structure neglects global geometric contexts, +resulting in less distinctive LRFs that lack crucial semantic information +necessary for effective matching. Furthermore, such complexity introduces +out-of-distribution geometric contexts during inference, thus complicating +generalization. To this end, we introduce 1) EquiShape, a novel structure +tailored to learn pair-wise LRFs with global structural cues for both spatial +and semantic consistency, and 2) LRF-Refine, an optimization strategy generally +applicable to LRF-based methods, aimed at addressing the generalization +challenges. Specifically, for EquiShape, we employ cross-talk within separate +equivariant graph neural networks (Cross-GVP) to build long-range dependencies +to compensate for the lack of semantic information in local structure modeling, +deducing pair-wise independent SE(3)-equivariant LRF vectors for each point. +For LRF-Refine, the optimization adjusts LRFs within specific contexts and +knowledge, enhancing the geometric and semantic generalizability of point +features. Our overall framework surpasses the state-of-the-art methods by a +large margin on three benchmarks. Code and models will be publicly available. + +
+
+
+
+
+ + ☆ Harnessing The Power of Attention For Patch-Based Biomedical Image + Classification + + +
+ Biomedical image analysis can be facilitated by an innovative architecture +rooted in self-attention mechanisms. The traditional convolutional neural +network (CNN), characterized by fixed-sized windows, struggles to capture +intricate spatial and temporal relations at the pixel level. The immutability +of CNN filter weights after training further restricts adaptation to input +fluctuations. Recognizing these limitations, we propose a new paradigm of +attention-based models instead of convolutions. As an alternative to traditional +CNNs, these models demonstrate robust modelling capabilities and the ability to grasp +comprehensive long-range contextual information efficiently. Providing a +solution to critical challenges faced by attention-based vision models such as +inductive bias, weight sharing, receptive field limitations, and data handling +in high resolution, our work combines non-overlapping (vanilla) patching with +novel overlapped Shifted Patching Techniques (S.P.T.s) to induce local context +that enhances model generalization. Moreover, we examine the novel Lanczos5 +interpolation technique, which adapts variable image sizes to higher +resolutions. Experimental evidence validates our model's generalization +effectiveness, comparing favourably with existing approaches. Attention-based +methods are particularly effective with ample data, especially when advanced +data augmentation methodologies are integrated to strengthen their robustness. + +

+
+
+
+
+ + ☆ Exploring the Efficacy of Group-Normalization in Deep Learning Models + for Alzheimer's Disease Classification + + +
+ Batch Normalization is an important approach to advancing deep learning since +it allows multiple networks to be trained simultaneously. A problem arises when +normalizing along the batch dimension: Batch Normalization's error increases +significantly as the batch size shrinks because the batch statistics estimates +become inaccurate. As a result, computer vision tasks like detection, segmentation, +and video, which require tiny batches due to memory constraints, are not well +suited to Batch Normalization for larger model training and feature +transfer. Here, we explore Group Normalization as an easy alternative to +Batch Normalization. Group Normalization is a channel normalization method in +which the channels are divided into groups, and the corresponding mean +and variance are computed within each group. Group Normalization computations +are accurate across a wide range of batch sizes and are independent of batch +size. When ResNet-50 is trained on the large ImageNet database, GN achieves a +very low error rate of 10.6% compared to Batch Normalization when a small +batch size of only 2 is used. For usual batch sizes, the performance of GN is +comparable to that of Batch Normalization, while at the same time it outperforms +other normalization techniques. We implement Group Normalization as a direct +alternative to Batch Normalization to combat the serious challenges faced by Batch +Normalization in deep learning models, with comparable or improved +classification accuracy. Additionally, Group Normalization can be naturally +transferred from the pre-training to the fine-tuning phase. + +

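The group-wise computation described above is simple enough to show directly; a minimal NumPy version is given below (the learnable affine parameters are optional, and the group count is an illustrative choice).

```python
# Group Normalization: channels are split into groups and each group is
# normalized by its own mean and variance, independently of the batch size.
import numpy as np

def group_norm(x, num_groups, gamma=None, beta=None, eps=1e-5):
    """x: (N, C, H, W) feature map; C must be divisible by num_groups."""
    n, c, h, w = x.shape
    x_g = x.reshape(n, num_groups, c // num_groups, h, w)
    mean = x_g.mean(axis=(2, 3, 4), keepdims=True)
    var = x_g.var(axis=(2, 3, 4), keepdims=True)
    x_g = (x_g - mean) / np.sqrt(var + eps)
    out = x_g.reshape(n, c, h, w)
    if gamma is not None:                      # optional learnable affine
        out = out * gamma.reshape(1, c, 1, 1) + beta.reshape(1, c, 1, 1)
    return out

x = np.random.default_rng(0).normal(size=(2, 32, 8, 8))
y = group_norm(x, num_groups=8)
print(y.mean(), y.std())   # roughly 0 and 1
```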
+
+ comment: 19 pages, 3 figures +
+
+
+
+
+ + ☆ How Can Large Language Models Enable Better Socially Assistive + Human-Robot Interaction: A Brief Survey AAAI + + +
+ Socially assistive robots (SARs) have shown great success in providing +personalized cognitive-affective support for user populations with special +needs such as older adults, children with autism spectrum disorder (ASD), and +individuals with mental health challenges. The large body of work on SAR +demonstrates its potential to provide at-home support that complements +clinic-based interventions delivered by mental health professionals, making +these interventions more effective and accessible. However, there are still +several major technical challenges that hinder SAR-mediated interactions and +interventions from reaching human-level social intelligence and efficacy. With +the recent advances in large language models (LLMs), there is an increased +potential for novel applications within the field of SAR that can significantly +expand the current capabilities of SARs. However, incorporating LLMs introduces +new risks and ethical concerns that have not yet been encountered, and must be +carefully addressed to safely deploy these more advanced systems. In this +work, we aim to conduct a brief survey on the use of LLMs in SAR technologies, +and discuss the potentials and risks of applying LLMs to the following three +major technical challenges of SAR: 1) natural language dialog; 2) multimodal +understanding; 3) LLMs as robot policies. + +

+
+ comment: 2 pages, to be submitted to 2024 AAAI Spring Symposium +
+
+
+
+
+ + ☆ GOV-NeSF: Generalizable Open-Vocabulary Neural Semantic Fields + + +
+ Recent advancements in vision-language foundation models have significantly +enhanced open-vocabulary 3D scene understanding. However, the generalizability +of existing methods is constrained due to their framework designs and their +reliance on 3D data. We address this limitation by introducing Generalizable +Open-Vocabulary Neural Semantic Fields (GOV-NeSF), a novel approach offering a +generalizable implicit representation of 3D scenes with open-vocabulary +semantics. We aggregate the geometry-aware features using a cost volume, and +propose a Multi-view Joint Fusion module to aggregate multi-view features +through a cross-view attention mechanism, which effectively predicts +view-specific blending weights for both colors and open-vocabulary features. +Remarkably, our GOV-NeSF exhibits state-of-the-art performance in both 2D and +3D open-vocabulary semantic segmentation, eliminating the need for ground truth +semantic labels or depth priors, and effectively generalizes across scenes and +datasets without fine-tuning. + +

+
+
+
+
+ + ☆ VortexViz: Finding Vortex Boundaries by Learning from Particle + Trajectories + + +
+ Vortices are studied in various scientific disciplines, offering insights +into fluid flow behavior. Visualizing the boundary of vortices is crucial for +understanding flow phenomena and detecting flow irregularities. This paper +addresses the challenge of accurately extracting vortex boundaries using deep +learning techniques. While existing methods primarily train on velocity +components, we propose a novel approach incorporating particle trajectories +(streamlines or pathlines) into the learning process. By leveraging the +regional/local characteristics of the flow field captured by streamlines or +pathlines, our methodology aims to enhance the accuracy of vortex boundary +extraction. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Instance-Aware Group Quantization for Vision Transformers CVPR 2024 + + +
+ Post-training quantization (PTQ) is an efficient model compression technique +that quantizes a pretrained full-precision model using only a small calibration +set of unlabeled samples without retraining. PTQ methods for convolutional +neural networks (CNNs) provide quantization results comparable to +full-precision counterparts. Directly applying them to vision transformers +(ViTs), however, incurs severe performance degradation, mainly due to the +differences in architectures between CNNs and ViTs. In particular, the +distribution of activations for each channel varies drastically according to +input instances, making PTQ methods for CNNs inappropriate for ViTs. To address +this, we introduce instance-aware group quantization for ViTs (IGQ-ViT). To +this end, we propose to split the channels of activation maps into multiple +groups dynamically for each input instance, such that activations within each +group share similar statistical properties. We also extend our scheme to +quantize softmax attentions across tokens. In addition, the number of groups +for each layer is adjusted to minimize the discrepancies between predictions +from quantized and full-precision models, under a bit-operation (BOP) +constraint. We show extensive experimental results on image classification, +object detection, and instance segmentation, with various transformer +architectures, demonstrating the effectiveness of our approach. + +

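The per-instance grouping idea can be sketched as follows: for each input, channels are grouped by a simple statistic so that channels within a group share quantization parameters. The grouping criterion (dynamic range), the uniform quantizer, and the sizes below are illustrative assumptions, not IGQ-ViT's exact procedure.

```python
# Sketch of instance-aware group quantization of a ViT activation map.
import numpy as np

def quantize_uniform(x, n_bits=4):
    lo, hi = x.min(), x.max()
    scale = (hi - lo) / (2 ** n_bits - 1) + 1e-12
    return np.round((x - lo) / scale) * scale + lo

def instance_aware_group_quant(act, num_groups=4, n_bits=4):
    """act: (tokens, channels) activations of a single input instance."""
    ranges = act.max(axis=0) - act.min(axis=0)          # per-channel statistic
    order = np.argsort(ranges)                          # group similar channels
    groups = np.array_split(order, num_groups)
    out = np.empty_like(act)
    for g in groups:                                    # shared params per group
        out[:, g] = quantize_uniform(act[:, g], n_bits)
    return out

act = np.random.default_rng(0).normal(size=(197, 768))  # ViT-like token map
q = instance_aware_group_quant(act)
print(np.abs(q - act).mean())
```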
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ LLMs are Good Sign Language Translators CVPR 2024 + + +
+ Sign Language Translation (SLT) is a challenging task that aims to translate +sign videos into spoken language. Inspired by the strong translation +capabilities of large language models (LLMs) that are trained on extensive +multilingual text corpora, we aim to harness off-the-shelf LLMs to handle SLT. +In this paper, we regularize the sign videos to embody linguistic +characteristics of spoken language, and propose a novel SignLLM framework to +transform sign videos into a language-like representation for improved +readability by off-the-shelf LLMs. SignLLM comprises two key modules: (1) The +Vector-Quantized Visual Sign module converts sign videos into a sequence of +discrete character-level sign tokens, and (2) the Codebook Reconstruction and +Alignment module converts these character-level tokens into word-level sign +representations using an optimal transport formulation. A sign-text alignment +loss further bridges the gap between sign and text tokens, enhancing semantic +compatibility. We achieve state-of-the-art gloss-free results on two +widely-used SLT benchmarks. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ BadPart: Unified Black-box Adversarial Patch Attacks against Pixel-wise + Regression Tasks + + +
+ Pixel-wise regression tasks (e.g., monocular depth estimation (MDE) and +optical flow estimation (OFE)) have been widely involved in our daily life in +applications like autonomous driving, augmented reality and video composition. +Although certain applications are security-critical or bear societal +significance, the adversarial robustness of such models is not sufficiently +studied, especially in the black-box scenario. In this work, we introduce the +first unified black-box adversarial patch attack framework against pixel-wise +regression tasks, aiming to identify the vulnerabilities of these models under +query-based black-box attacks. We propose a novel square-based adversarial +patch optimization framework and employ probabilistic square sampling and +score-based gradient estimation techniques to generate the patch effectively +and efficiently, overcoming the scalability problem of previous black-box patch +attacks. Our attack prototype, named BadPart, is evaluated on both MDE and OFE +tasks, utilizing a total of 7 models. BadPart surpasses 3 baseline methods in +terms of both attack performance and efficiency. We also apply BadPart on the +Google online service for portrait depth estimation, causing 43.5% relative +distance error with 50K queries. State-of-the-art (SOTA) countermeasures cannot +defend against our attack effectively. + +

+
+
+
+
+ + ☆ MM3DGS SLAM: Multi-modal 3D Gaussian Splatting for SLAM Using Vision, + Depth, and Inertial Measurements + + +
+ Simultaneous localization and mapping is essential for position tracking and +scene understanding. 3D Gaussian-based map representations enable +photorealistic reconstruction and real-time rendering of scenes using multiple +posed cameras. We show for the first time that using 3D Gaussians for map +representation with unposed camera images and inertial measurements can enable +accurate SLAM. Our method, MM3DGS, addresses the limitations of prior neural +radiance field-based representations by enabling faster rendering, scale +awareness, and improved trajectory tracking. Our framework enables +keyframe-based mapping and tracking utilizing loss functions that incorporate +relative pose transformations from pre-integrated inertial measurements, depth +estimates, and measures of photometric rendering quality. We also release a +multi-modal dataset, UT-MM, collected from a mobile robot equipped with a +camera and an inertial measurement unit. Experimental evaluation on several +scenes from the dataset shows that MM3DGS achieves 3x improvement in tracking +and 5% improvement in photometric rendering quality compared to the current +3DGS SLAM state-of-the-art, while allowing real-time rendering of a +high-resolution dense 3D map. Project Webpage: +https://vita-group.github.io/MM3DGS-SLAM + +
+
+ comment: Project Webpage: https://vita-group.github.io/MM3DGS-SLAM +
+
+
+
+
+ + ☆ Towards Memorization-Free Diffusion Models CVPR2024 + + +
+ Pretrained diffusion models and their outputs are widely accessible due to +their exceptional capacity for synthesizing high-quality images and their +open-source nature. The users, however, may face litigation risks owing to the +models' tendency to memorize and regurgitate training data during inference. To +address this, we introduce Anti-Memorization Guidance (AMG), a novel framework +employing three targeted guidance strategies for the main causes of +memorization: image and caption duplication, and highly specific user prompts. +Consequently, AMG ensures memorization-free outputs while maintaining high +image quality and text alignment, leveraging the synergy of its guidance +methods, each indispensable in its own right. AMG also features an innovative +automatic detection system for potential memorization during each step of the +inference process, allowing selective application of guidance strategies while +minimally interfering with the original sampling process to preserve output +utility. We applied AMG to pretrained Denoising Diffusion Probabilistic Models +(DDPM) and Stable Diffusion across various generation tasks. The results +demonstrate that AMG is the first approach to successfully eradicate all +instances of memorization with no or marginal impacts on image quality and +text alignment, as evidenced by FID and CLIP scores. + +

+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ Towards Label-Efficient Human Matting: A Simple Baseline for Weakly + Semi-Supervised Trimap-Free Human Matting + + +
+ This paper presents a new practical training method for human matting, which +demands delicate pixel-level human region identification and significantly +laborious annotations. To reduce the annotation cost, most existing matting +approaches often rely on image synthesis to augment the dataset. However, the +unnaturalness of synthesized training images brings in a new domain +generalization challenge for natural images. To address this challenge, we +introduce a new learning paradigm, weakly semi-supervised human matting +(WSSHM), which leverages a small amount of expensive matte labels and a large +amount of budget-friendly segmentation labels, to save the annotation cost and +resolve the domain generalization problem. To achieve the goal of WSSHM, we +propose a simple and effective training method, named Matte Label Blending +(MLB), that selectively guides only the beneficial knowledge of the +segmentation and matte data to the matting model. Extensive experiments with +our detailed analysis demonstrate our method can substantially improve the +robustness of the matting model using a few matte data and numerous +segmentation data. Our training method is also easily applicable to real-time +models, achieving competitive accuracy with breakneck inference speed (328 FPS +on NVIDIA V100 GPU). The implementation code is available at +\url{https://github.com/clovaai/WSSHM}. + +
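One plausible reading of the label-blending idea is sketched below: a coarse binary segmentation label and a fine alpha matte are combined into a single training target, trusting the matte only in its soft transition region. The blending rule and the uncertainty band are illustrative assumptions, not the exact MLB formulation.

```python
# Hypothetical blending of segmentation and matte labels into one target.
import numpy as np

def blend_labels(seg_mask, matte=None, band=0.1):
    """seg_mask: (H, W) in {0, 1}; matte: (H, W) in [0, 1] or None."""
    target = seg_mask.astype(np.float32)
    if matte is not None:
        # Trust the matte in the soft transition region, the segmentation elsewhere.
        soft = (matte > band) & (matte < 1.0 - band)
        target[soft] = matte[soft]
    return target

rng = np.random.default_rng(0)
seg = (rng.uniform(size=(64, 64)) > 0.5).astype(np.uint8)
matte = rng.uniform(size=(64, 64)).astype(np.float32)
print(blend_labels(seg, matte).dtype, blend_labels(seg).min())
```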
+
+ comment: Preprint, 15 pages, 13 figures +
+
+
+
+
+ + ☆ Gyro-based Neural Single Image Deblurring + + +
+ In this paper, we present GyroDeblurNet, a novel single image deblurring +method that utilizes a gyro sensor to effectively resolve the ill-posedness of +image deblurring. The gyro sensor provides valuable information about camera +motion during exposure time that can significantly improve deblurring quality. +However, effectively exploiting real-world gyro data is challenging due to +significant errors from various sources including sensor noise, the disparity +between the positions of a camera module and a gyro sensor, the absence of +translational motion information, and moving objects whose motions cannot be +captured by a gyro sensor. To handle gyro error, GyroDeblurNet is equipped with +two novel neural network blocks: a gyro refinement block and a gyro deblurring +block. The gyro refinement block refines the error-ridden gyro data using the +blur information from the input image. On the other hand, the gyro deblurring +block removes blur from the input image using the refined gyro data and further +compensates for gyro error by leveraging the blur information from the input +image. For training a neural network with erroneous gyro data, we propose a +training strategy based on the curriculum learning. We also introduce a novel +gyro data embedding scheme to represent real-world intricate camera shakes. +Finally, we present a synthetic dataset and a real dataset for the training and +evaluation of gyro-based single image deblurring. Our experiments demonstrate +that our approach achieves state-of-the-art deblurring quality by effectively +utilizing erroneous gyro data. + +
+
+ comment: 14 pages, 11 figures +
+
+
+
+
+ + ☆ Scalable 3D Registration via Truncated Entry-wise Absolute Residuals CVPR 2024 + + +
+ Given an input set of $3$D point pairs, the goal of outlier-robust $3$D +registration is to compute some rotation and translation that align as many +point pairs as possible. This is an important problem in computer vision, for +which many highly accurate approaches have been recently proposed. Despite +their impressive performance, these approaches lack scalability, often +overflowing the $16$GB of memory of a standard laptop to handle roughly +$30,000$ point pairs. In this paper, we propose a $3$D registration approach +that can process more than ten million ($10^7$) point pairs with over $99\%$ +random outliers. Moreover, our method is efficient, entails low memory costs, +and maintains high accuracy at the same time. We call our method TEAR, as it +involves minimizing an outlier-robust loss that computes Truncated Entry-wise +Absolute Residuals. To minimize this loss, we decompose the original +$6$-dimensional problem into two subproblems of dimensions $3$ and $2$, +respectively, solved in succession to global optimality via a customized +branch-and-bound method. While branch-and-bound is often slow and unscalable, +this does not apply to TEAR as we propose novel bounding functions that are +tight and computationally efficient. Experiments on various datasets are +conducted to validate the scalability and efficiency of our method. + +
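+ For concreteness, the truncated entry-wise absolute residual loss described in
+the abstract can be written in a few lines of NumPy; the truncation threshold
+value is an assumption, and the paper's branch-and-bound minimizer is not
+reproduced, so this is only an illustrative sketch of the objective.
+
+  import numpy as np
+
+  def tear_loss(R, t, P, Q, c=0.1):
+      # P, Q: (N, 3) corresponding 3D points; R: (3, 3) rotation; t: (3,) translation.
+      # Each coordinate of each residual is truncated at c, so a gross outlier
+      # contributes at most 3 * c to the objective.
+      residuals = P @ R.T + t - Q          # (N, 3) entry-wise residuals
+      return np.minimum(np.abs(residuals), c).sum()
+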
+
+ comment: 24 pages, 12 figures. Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ LLaMA-Excitor: General Instruction Tuning via Indirect Feature + Interaction CVPR 2024 + + +
+ Existing methods to fine-tune LLMs, like Adapter, Prefix-tuning, and LoRA,
+which introduce extra modules or additional input sequences to inject new
+skills or knowledge, may compromise the innate abilities of LLMs. In this
+paper, we propose LLaMA-Excitor, a lightweight method that stimulates the LLMs'
+potential to better follow instructions by gradually paying more attention to
+worthwhile information. Specifically, the LLaMA-Excitor does not directly
+change the intermediate hidden state during the self-attention calculation of
+the transformer structure. We designed the Excitor block as a bypass module for
+the similarity score computation in LLMs' self-attention to reconstruct keys
+and change the importance of values by learnable prompts. LLaMA-Excitor ensures
+a self-adaptive allocation of additional attention to input instructions, thus
+effectively preserving LLMs' pre-trained knowledge when fine-tuning LLMs on
+low-quality instruction-following datasets. Furthermore, we unify the modeling
+of multi-modal tuning and language-only tuning, extending LLaMA-Excitor to a
+powerful visual instruction follower without the need for complex multi-modal
+alignment. Our proposed approach is evaluated in language-only and multi-modal
+tuning experimental scenarios. Notably, LLaMA-Excitor is the only method that
+maintains basic capabilities while achieving a significant improvement (+6%) on
+the MMLU benchmark. In the visual instruction tuning, we achieve a new
+state-of-the-art image captioning performance of 157.5 CIDEr on MSCOCO, and a
+comparable performance (88.39%) on ScienceQA to cutting-edge models with more
+parameters and extensive vision-language pretraining.
+
+
+ comment: This paper is accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Learning by Correction: Efficient Tuning Task for Zero-Shot Generative + Vision-Language Reasoning CVPR2024 + + +
+ Generative vision-language models (VLMs) have shown impressive performance in +zero-shot vision-language tasks like image captioning and visual question +answering. However, improving their zero-shot reasoning typically requires +second-stage instruction tuning, which relies heavily on human-labeled or large +language model-generated annotation, incurring high labeling costs. To tackle +this challenge, we introduce Image-Conditioned Caption Correction (ICCC), a +novel pre-training task designed to enhance VLMs' zero-shot performance without +the need for labeled task-aware data. The ICCC task compels VLMs to rectify +mismatches between visual and language concepts, thereby enhancing instruction +following and text generation conditioned on visual inputs. Leveraging language +structure and a lightweight dependency parser, we construct data samples of +ICCC task from image-text datasets with low labeling and computation costs. +Experimental results on BLIP-2 and InstructBLIP demonstrate significant +improvements in zero-shot image-text generation-based VL tasks through ICCC +instruction tuning. + +
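+ A toy sketch of how such caption-correction samples could be constructed with a
+lightweight parser is given below; the concept-swapping rules and parser used in
+the paper are richer, and the spaCy model name is an assumption.
+
+  import random
+  import spacy
+
+  nlp = spacy.load("en_core_web_sm")  # assumes the small English model is installed
+
+  def make_correction_sample(caption, distractor_nouns):
+      # corrupt one noun so a VLM can be trained to restore the image-consistent caption
+      doc = nlp(caption)
+      nouns = [tok for tok in doc if tok.pos_ == "NOUN"]
+      if not nouns:
+          return None
+      target = random.choice(nouns)
+      corrupted = caption.replace(target.text, random.choice(distractor_nouns), 1)
+      return {"input": corrupted, "target": caption}
+
+  print(make_correction_sample("a dog sits on a red sofa", ["cat", "bench"]))
+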
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ AETTA: Label-Free Accuracy Estimation for Test-Time Adaptation CVPR 2024 + + +
+ Test-time adaptation (TTA) has emerged as a viable solution to adapt +pre-trained models to domain shifts using unlabeled test data. However, TTA +faces challenges of adaptation failures due to its reliance on blind adaptation +to unknown test samples in dynamic scenarios. Traditional methods for +out-of-distribution performance estimation are limited by unrealistic +assumptions in the TTA context, such as requiring labeled data or re-training +models. To address this issue, we propose AETTA, a label-free accuracy +estimation algorithm for TTA. We propose the prediction disagreement as the +accuracy estimate, calculated by comparing the target model prediction with +dropout inferences. We then improve the prediction disagreement to extend the +applicability of AETTA under adaptation failures. Our extensive evaluation with +four baselines and six TTA methods demonstrates that AETTA shows an average of +19.8%p more accurate estimation compared with the baselines. We further +demonstrate the effectiveness of accuracy estimation with a model recovery case +study, showcasing the practicality of our model recovery based on accuracy +estimation. The source code is available at https://github.com/taeckyung/AETTA. + +
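+ The core of the estimator, prediction disagreement against dropout inferences,
+can be sketched as follows in PyTorch; the correction applied under adaptation
+failures is omitted, so the snippet is only an illustrative approximation.
+
+  import torch
+
+  @torch.no_grad()
+  def estimate_accuracy(model, x, n_dropout=10):
+      model.eval()
+      preds = model(x).argmax(dim=1)            # target-model predictions
+      for m in model.modules():                 # enable dropout only, keep BN in eval
+          if isinstance(m, torch.nn.Dropout):
+              m.train()
+      agree = torch.zeros(preds.shape[0], device=preds.device)
+      for _ in range(n_dropout):
+          agree += (model(x).argmax(dim=1) == preds).float()
+      model.eval()
+      return (agree / n_dropout).mean().item()  # agreement ratio as an accuracy proxy
+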
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ From Pixels to Graphs: Open-Vocabulary Scene Graph Generation with + Vision-Language Models CVPR 2024 + + +
+ Scene graph generation (SGG) aims to parse a visual scene into an +intermediate graph representation for downstream reasoning tasks. Despite +recent advancements, existing methods struggle to generate scene graphs with +novel visual relation concepts. To address this challenge, we introduce a new +open-vocabulary SGG framework based on sequence generation. Our framework +leverages vision-language pre-trained models (VLM) by incorporating an +image-to-graph generation paradigm. Specifically, we generate scene graph +sequences via image-to-text generation with VLM and then construct scene graphs +from these sequences. By doing so, we harness the strong capabilities of VLM +for open-vocabulary SGG and seamlessly integrate explicit relational modeling +for enhancing the VL tasks. Experimental results demonstrate that our design +not only achieves superior performance with an open vocabulary but also +enhances downstream vision-language task performance through explicit relation +modeling knowledge. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Slightly Shift New Classes to Remember Old Classes for Video + Class-Incremental Learning + + +
+ Recent video class-incremental learning usually excessively pursues the
+accuracy of the newly seen classes and relies on memory sets to mitigate
+catastrophic forgetting of the old classes. However, limited storage only
+allows storing a few representative videos. So we propose SNRO, which slightly
+shifts the features of new classes to remember old classes. Specifically, SNRO
+contains Examples Sparse (ES) and Early Break (EB). ES decimates at a lower
+sample rate to build memory sets and uses interpolation to align those sparse
+frames at later stages. In this way, SNRO stores more examples under the same
+memory consumption and forces the model to focus on low-semantic features which
+are harder to forget. EB terminates the training at a small epoch, preventing
+the model from overstretching into the high-semantic space of the current task.
+Experiments on UCF101, HMDB51, and UESTC-MMEA-CL datasets show that SNRO
+performs better than other approaches while consuming the same amount of
+memory.
+
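+ The Examples Sparse idea, storing frames at a lower sample rate and
+interpolating them back when the memory set is replayed, can be illustrated with
+a short NumPy sketch; the frame counts and the linear interpolation scheme are
+assumptions for illustration only.
+
+  import numpy as np
+
+  def examples_sparse(video, keep_every=2):
+      # store only every k-th frame in the memory set
+      return video[::keep_every]
+
+  def interpolate_frames(sparse, target_len):
+      # linearly interpolate the sparse frames back to the original clip length
+      idx = np.linspace(0, len(sparse) - 1, target_len)
+      lo, hi = np.floor(idx).astype(int), np.ceil(idx).astype(int)
+      w = (idx - lo)[:, None, None, None]
+      return (1 - w) * sparse[lo] + w * sparse[hi]
+
+  clip = np.random.rand(16, 64, 64, 3)                 # T x H x W x C toy clip
+  dense = interpolate_frames(examples_sparse(clip), target_len=16)
+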
+
+
+
+
+ + ☆ Marrying NeRF with Feature Matching for One-step Pose Estimation ICRA + + +
+ Given the image collection of an object, we aim at building a real-time
+image-based pose estimation method, which requires neither its CAD model nor
+hours of object-specific training. Recent NeRF-based methods provide a
+promising solution by directly optimizing the pose from pixel loss between
+rendered and target images. However, during inference, they require a long
+convergence time and suffer from local minima, making them impractical for
+real-time robot applications. We aim at solving this problem by marrying image
+matching with NeRF. With 2D matches and depth rendered by NeRF, we directly
+solve the pose in one step by building 2D-3D correspondences between target and
+initial view, thus allowing for real-time prediction. Moreover, to improve the
+accuracy of 2D-3D correspondences, we propose a 3D consistent point mining
+strategy, which effectively discards unfaithful points reconstructed by NeRF.
+In addition, current NeRF-based methods that naively optimize pixel loss fail
+on occluded images. Thus, we further propose a 2D matches based sampling
+strategy to preclude the occluded area. Experimental results on representative
+datasets prove that our method outperforms state-of-the-art methods, and
+improves inference efficiency by 90x, achieving real-time prediction at 6 FPS.
+
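+ The one-step pose idea, lifting 2D matches to 3D with NeRF-rendered depth and
+then solving a standard PnP problem, is sketched below with OpenCV; the feature
+matcher, the NeRF renderer, and the paper's consistent point mining and
+occlusion handling are abstracted away.
+
+  import cv2
+  import numpy as np
+
+  def one_step_pose(pts_render, pts_target, depth, K):
+      # pts_render / pts_target: (N, 2) matched pixels; depth: rendered depth map; K: intrinsics
+      z = depth[pts_render[:, 1].astype(int), pts_render[:, 0].astype(int)]
+      pts_h = np.concatenate([pts_render, np.ones((len(pts_render), 1))], axis=1)
+      pts_3d = (np.linalg.inv(K) @ pts_h.T).T * z[:, None]      # back-project with depth
+      ok, rvec, tvec, inliers = cv2.solvePnPRansac(
+          pts_3d.astype(np.float32), pts_target.astype(np.float32),
+          K.astype(np.float32), None)
+      return rvec, tvec, inliers
+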
+
+ comment: ICRA, 2024. Video https://www.youtube.com/watch?v=70fgUobOFWo +
+
+
+
+
+ + ☆ Model-Agnostic Human Preference Inversion in Diffusion Models + + +
+ Efficient text-to-image generation remains a challenging task due to the high +computational costs associated with the multi-step sampling in diffusion +models. Although distillation of pre-trained diffusion models has been +successful in reducing sampling steps, low-step image generation often falls +short in terms of quality. In this study, we propose a novel sampling design to +achieve high-quality one-step image generation aligning with human preferences, +particularly focusing on exploring the impact of the prior noise distribution. +Our approach, Prompt Adaptive Human Preference Inversion (PAHI), optimizes the +noise distributions for each prompt based on human preferences without the need +for fine-tuning diffusion models. Our experiments showcase that the tailored +noise distributions significantly improve image quality with only a marginal +increase in computational cost. Our findings underscore the importance of noise +optimization and pave the way for efficient and high-quality text-to-image +synthesis. + +
+
+
+
+
+ + ☆ TryOn-Adapter: Efficient Fine-Grained Clothing Identity Adaptation for + High-Fidelity Virtual Try-On + + +
+ Virtual try-on focuses on adjusting the given clothes to fit a specific +person seamlessly while avoiding any distortion of the patterns and textures of +the garment. However, the clothing identity uncontrollability and training +inefficiency of existing diffusion-based methods, which struggle to maintain +the identity even with full parameter training, are significant limitations +that hinder the widespread applications. In this work, we propose an effective +and efficient framework, termed TryOn-Adapter. Specifically, we first decouple +clothing identity into fine-grained factors: style for color and category +information, texture for high-frequency details, and structure for smooth +spatial adaptive transformation. Our approach utilizes a pre-trained +exemplar-based diffusion model as the fundamental network, whose parameters are +frozen except for the attention layers. We then customize three lightweight +modules (Style Preserving, Texture Highlighting, and Structure Adapting) +incorporated with fine-tuning techniques to enable precise and efficient +identity control. Meanwhile, we introduce the training-free T-RePaint strategy +to further enhance clothing identity preservation while maintaining the +realistic try-on effect during the inference. Our experiments demonstrate that +our approach achieves state-of-the-art performance on two widely-used +benchmarks. Additionally, compared with recent full-tuning diffusion-based +methods, we only use about half of their tunable parameters during training. +The code will be made publicly available at +https://github.com/jiazheng-xing/TryOn-Adapter. + +
+
+
+
+
+ + ☆ MGMap: Mask-Guided Learning for Online Vectorized HD Map Construction CVPR 2024 + + +
+ Currently, high-definition (HD) map construction leans towards a lightweight +online generation tendency, which aims to preserve timely and reliable road +scene information. However, map elements contain strong shape priors. Subtle +and sparse annotations make current detection-based frameworks ambiguous in +locating relevant feature scopes and cause the loss of detailed structures in +prediction. To alleviate these problems, we propose MGMap, a mask-guided +approach that effectively highlights the informative regions and achieves +precise map element localization by introducing the learned masks. +Specifically, MGMap employs learned masks based on the enhanced multi-scale BEV +features from two perspectives. At the instance level, we propose the +Mask-activated instance (MAI) decoder, which incorporates global instance and +structural information into instance queries by the activation of instance +masks. At the point level, a novel position-guided mask patch refinement +(PG-MPR) module is designed to refine point locations from a finer-grained +perspective, enabling the extraction of point-specific patch information. +Compared to the baselines, our proposed MGMap achieves a notable improvement of +around 10 mAP for different input modalities. Extensive experiments also +demonstrate that our approach showcases strong robustness and generalization +capabilities. Our code can be found at https://github.com/xiaolul2/MGMap. + +
+
+ comment: 18 pages, 11 figures, accepted by CVPR 2024 +
+
+
+
+
+ + ☆ DiSR-NeRF: Diffusion-Guided View-Consistent Super-Resolution NeRF + + +
+ We present DiSR-NeRF, a diffusion-guided framework for view-consistent +super-resolution (SR) NeRF. Unlike prior works, we circumvent the requirement +for high-resolution (HR) reference images by leveraging existing powerful 2D +super-resolution models. Nonetheless, independent SR 2D images are often +inconsistent across different views. We thus propose Iterative 3D +Synchronization (I3DS) to mitigate the inconsistency problem via the inherent +multi-view consistency property of NeRF. Specifically, our I3DS alternates +between upscaling low-resolution (LR) rendered images with diffusion models, +and updating the underlying 3D representation with standard NeRF training. We +further introduce Renoised Score Distillation (RSD), a novel score-distillation +objective for 2D image resolution. Our RSD combines features from ancestral +sampling and Score Distillation Sampling (SDS) to generate sharp images that +are also LR-consistent. Qualitative and quantitative results on both synthetic +and real-world datasets demonstrate that our DiSR-NeRF can achieve better +results on NeRF super-resolution compared with existing works. Code and video +results available at the project website. + +
+
+
+
+
+ + ☆ Lipsum-FT: Robust Fine-Tuning of Zero-Shot Models Using Random Text + Guidance ICLR 2024 + + +
+ Large-scale contrastive vision-language pre-trained models provide the +zero-shot model achieving competitive performance across a range of image +classification tasks without requiring training on downstream data. Recent +works have confirmed that while additional fine-tuning of the zero-shot model +on the reference data results in enhanced downstream performance, it +compromises the model's robustness against distribution shifts. Our +investigation begins by examining the conditions required to achieve the goals +of robust fine-tuning, employing descriptions based on feature distortion +theory and joint energy-based models. Subsequently, we propose a novel robust +fine-tuning algorithm, Lipsum-FT, that effectively utilizes the language +modeling aspect of the vision-language pre-trained models. Extensive +experiments conducted on distribution shift scenarios in DomainNet and ImageNet +confirm the superiority of our proposed Lipsum-FT approach over existing robust +fine-tuning methods. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ Meta Episodic learning with Dynamic Task Sampling for CLIP-based Point + Cloud Classification + + +
+ Point cloud classification refers to the process of assigning semantic labels +or categories to individual points within a point cloud data structure. Recent +works have explored the extension of pre-trained CLIP to 3D recognition. In +this direction, CLIP-based point cloud models like PointCLIP, CLIP2Point have +become state-of-the-art methods in the few-shot setup. Although these methods +show promising performance for some classes like airplanes, desks, guitars, +etc, the performance for some classes like the cup, flower pot, sink, +nightstand, etc is still far from satisfactory. This is due to the fact that +the adapter of CLIP-based models is trained using randomly sampled N-way K-shot +data in the standard supervised learning setup. In this paper, we propose a +novel meta-episodic learning framework for CLIP-based point cloud +classification, addressing the challenges of limited training examples and +sampling unknown classes. Additionally, we introduce dynamic task sampling +within the episode based on performance memory. This sampling strategy +effectively addresses the challenge of sampling unknown classes, ensuring that +the model learns from a diverse range of classes and promotes the exploration +of underrepresented categories. By dynamically updating the performance memory, +we adaptively prioritize the sampling of classes based on their performance, +enhancing the model's ability to handle challenging and real-world scenarios. +Experiments show an average performance gain of 3-6\% on ModelNet40 and +ScanobjectNN datasets in a few-shot setup. + +
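+ The performance-memory-driven sampling can be pictured with a small sketch in
+which classes with lower recorded accuracy are drawn more often for the next
+episode; the exact weighting used in the paper may differ.
+
+  import numpy as np
+
+  def sample_episode_classes(perf_memory, n_way=5, eps=0.05):
+      # perf_memory maps class id -> running accuracy in [0, 1]
+      classes = np.array(list(perf_memory.keys()))
+      acc = np.array([perf_memory[c] for c in classes])
+      weights = (1.0 - acc) + eps                  # harder classes get larger weight
+      probs = weights / weights.sum()
+      return np.random.choice(classes, size=n_way, replace=False, p=probs)
+
+  memory = {0: 0.9, 1: 0.4, 2: 0.7, 3: 0.2, 4: 0.8, 5: 0.5, 6: 0.95, 7: 0.3}
+  print(sample_episode_classes(memory, n_way=5))
+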
+
+
+
+
+ + ☆ TSOM: Small Object Motion Detection Neural Network Inspired by Avian + Visual Circuit + + +
+ Detecting small moving objects in complex backgrounds from an overhead +perspective is a highly challenging task for machine vision systems. As an +inspiration from nature, the avian visual system is capable of processing +motion information in various complex aerial scenes, and its Retina-OT-Rt +visual circuit is highly sensitive to capturing the motion information of small +objects from high altitudes. However, more needs to be done on small object +motion detection algorithms based on the avian visual system. In this paper, we +conducted mathematical modeling based on extensive studies of the biological +mechanisms of the Retina-OT-Rt visual circuit. Based on this, we proposed a +novel tectum small object motion detection neural network (TSOM). The neural +network includes the retina, SGC dendritic, SGC Soma, and Rt layers, each layer +corresponding to neurons in the visual pathway. The Retina layer is responsible +for accurately projecting input content, the SGC dendritic layer perceives and +encodes spatial-temporal information, the SGC Soma layer computes complex +motion information and extracts small objects, and the Rt layer integrates and +decodes motion information from multiple directions to determine the position +of small objects. Extensive experiments on pigeon neurophysiological +experiments and image sequence data showed that the TSOM is biologically +interpretable and effective in extracting reliable small object motion features +from complex high-altitude backgrounds. + +
+
+
+
+
+ + ☆ Ensemble Learning for Vietnamese Scene Text Spotting in Urban + Environments + + +
+ This paper presents a simple yet efficient ensemble learning framework for
+Vietnamese scene text spotting. Leveraging the power of ensemble learning,
+which combines multiple models to yield more accurate predictions, our approach
+aims to significantly enhance the performance of scene text spotting in
+challenging urban settings. Through experimental evaluations on the VinText
+dataset, our proposed method achieves a significant improvement in accuracy
+over existing methods, with an impressive gain of 5%. These results
+unequivocally demonstrate the efficacy of ensemble learning in the context of
+Vietnamese scene text spotting in urban environments, highlighting its
+potential for real-world applications, such as text detection and recognition
+in urban signage, advertisements, and various text-rich urban scenes.
+
+
+ comment: RIVF 2023 +
+
+
+
+
+ + ☆ Prompt Learning via Meta-Regularization CVPR 2024 + + +
+ Pre-trained vision-language models have shown impressive success on various +computer vision tasks with their zero-shot generalizability. Recently, prompt +learning approaches have been explored to efficiently and effectively adapt the +vision-language models to a variety of downstream tasks. However, most existing +prompt learning methods suffer from task overfitting since the general +knowledge of the pre-trained vision language models is forgotten while the +prompts are finetuned on a small data set from a specific target task. To +address this issue, we propose a Prompt Meta-Regularization (ProMetaR) to +improve the generalizability of prompt learning for vision-language models. +Specifically, ProMetaR meta-learns both the regularizer and the soft prompts to +harness the task-specific knowledge from the downstream tasks and task-agnostic +general knowledge from the vision-language models. Further, ProMetaR augments +the task to generate multiple virtual tasks to alleviate the meta-overfitting. +In addition, we provide the analysis to comprehend how ProMetaR improves the +generalizability of prompt tuning in the perspective of the gradient alignment. +Our extensive experiments demonstrate that our ProMetaR improves the +generalizability of conventional prompt learning methods under +base-to-base/base-to-new and domain generalization settings. The code of +ProMetaR is available at https://github.com/mlvlab/ProMetaR. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Generating Content for HDR Deghosting from Frequency View CVPR2024 + + +
+ Recovering ghost-free High Dynamic Range (HDR) images from multiple Low +Dynamic Range (LDR) images becomes challenging when the LDR images exhibit +saturation and significant motion. Recent Diffusion Models (DMs) have been +introduced in HDR imaging field, demonstrating promising performance, +particularly in achieving visually perceptible results compared to previous +DNN-based methods. However, DMs require extensive iterations with large models +to estimate entire images, resulting in inefficiency that hinders their +practical application. To address this challenge, we propose the Low-Frequency +aware Diffusion (LF-Diff) model for ghost-free HDR imaging. The key idea of +LF-Diff is implementing the DMs in a highly compacted latent space and +integrating it into a regression-based model to enhance the details of +reconstructed images. Specifically, as low-frequency information is closely +related to human visual perception we propose to utilize DMs to create compact +low-frequency priors for the reconstruction process. In addition, to take full +advantage of the above low-frequency priors, the Dynamic HDR Reconstruction +Network (DHRNet) is carried out in a regression-based manner to obtain final +HDR images. Extensive experiments conducted on synthetic and real-world +benchmark datasets demonstrate that our LF-Diff performs favorably against +several state-of-the-art methods and is 10$\times$ faster than previous +DM-based methods. + +
+
+ comment: This paper is accepted by CVPR2024 +
+
+
+
+
+ + ☆ Collaborative Learning of Anomalies with Privacy (CLAP) for Unsupervised + Video Anomaly Detection: A New Baseline CVPR + + +
+ Unsupervised (US) video anomaly detection (VAD) in surveillance applications +is gaining more popularity recently due to its practical real-world +applications. As surveillance videos are privacy sensitive and the availability +of large-scale video data may enable better US-VAD systems, collaborative +learning can be highly rewarding in this setting. However, due to the extremely +challenging nature of the US-VAD task, where learning is carried out without +any annotations, privacy-preserving collaborative learning of US-VAD systems +has not been studied yet. In this paper, we propose a new baseline for anomaly +detection capable of localizing anomalous events in complex surveillance videos +in a fully unsupervised fashion without any labels on a privacy-preserving +participant-based distributed training configuration. Additionally, we propose +three new evaluation protocols to benchmark anomaly detection approaches on +various scenarios of collaborations and data availability. Based on these +protocols, we modify existing VAD datasets to extensively evaluate our approach +as well as existing US SOTA methods on two large-scale datasets including +UCF-Crime and XD-Violence. All proposed evaluation protocols, dataset splits, +and codes are available here: https://github.com/AnasEmad11/CLAP + +
+
+ comment: Accepted in IEEE/CVF Computer Vision and Pattern Recognition + Conference (CVPR), 2024 +
+
+
+
+
+ + ☆ Transfer Learning with Point Transformers + + +
+ Point Transformers are near state-of-the-art models for classification,
+segmentation, and detection tasks on Point Cloud data. They utilize a
+self-attention-based mechanism to model long-range spatial dependencies between
+multiple point sets. In this project, we explore two things: the classification
+performance of these attention-based networks on the ModelNet10 dataset, and
+the use of the trained model to classify the 3D MNIST dataset after finetuning.
+We also train the model from scratch on the 3D MNIST dataset to compare the
+performance of the finetuned and from-scratch models on the MNIST dataset. We
+observe that since the two datasets differ considerably in their distributions,
+transfer-learned models do not outperform the from-scratch models in this case,
+although we do expect transfer-learned models to converge faster since they
+already know lower-level features such as edges and corners from the ModelNet10
+dataset.
+
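+ The two training regimes compared here follow the usual transfer-learning
+recipe; the sketch below uses a hypothetical PointTransformer class and
+checkpoint path purely for illustration.
+
+  import torch
+  import torch.nn as nn
+
+  def build_finetune_model(PointTransformer, ckpt_path, num_classes=10):
+      model = PointTransformer(num_classes=10)                     # ModelNet10-pretrained shape
+      model.load_state_dict(torch.load(ckpt_path, map_location="cpu"))
+      model.head = nn.Linear(model.head.in_features, num_classes)  # new head for 3D MNIST
+      return model
+
+  # From-scratch baseline: the same architecture with random initialization,
+  # e.g. PointTransformer(num_classes=10), trained directly on 3D MNIST.
+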
+
+
+
+
+ + ☆ An N-Point Linear Solver for Line and Motion Estimation with Event + Cameras + + +
+ Event cameras respond primarily to edges--formed by strong gradients--and are +thus particularly well-suited for line-based motion estimation. Recent work has +shown that events generated by a single line each satisfy a polynomial +constraint which describes a manifold in the space-time volume. Multiple such +constraints can be solved simultaneously to recover the partial linear velocity +and line parameters. In this work, we show that, with a suitable line +parametrization, this system of constraints is actually linear in the unknowns, +which allows us to design a novel linear solver. Unlike existing solvers, our +linear solver (i) is fast and numerically stable since it does not rely on +expensive root finding, (ii) can solve both minimal and overdetermined systems +with more than 5 events, and (iii) admits the characterization of all +degenerate cases and multiple solutions. The found line parameters are +singularity-free and have a fixed scale, which eliminates the need for +auxiliary constraints typically encountered in previous work. To recover the +full linear camera velocity we fuse observations from multiple lines with a +novel velocity averaging scheme that relies on a geometrically-motivated +residual, and thus solves the problem more efficiently than previous schemes +which minimize an algebraic residual. Extensive experiments in synthetic and +real-world settings demonstrate that our method surpasses the previous work in +numerical stability, and operates over 600 times faster. + +
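+ While the abstract does not spell out the constraint construction, the benefit
+of a linear formulation is that minimal and overdetermined cases reduce to the
+same standard least-squares machinery, e.g. taking the null-space direction of
+the stacked constraint matrix via an SVD, as in the generic sketch below (not
+the paper's specific solver).
+
+  import numpy as np
+
+  def solve_homogeneous(A):
+      # least-squares solution of A x = 0 (minimal or overdetermined):
+      # the right singular vector associated with the smallest singular value
+      _, _, vt = np.linalg.svd(A)
+      return vt[-1]                      # unit-norm minimizer of ||A x||
+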
+
+
+
+
+ + ☆ 3MOS: Multi-sources, Multi-resolutions, and Multi-scenes dataset for + Optical-SAR image matching + + +
+ Optical-SAR image matching is a fundamental task for image fusion and visual
+navigation. However, all large-scale open SAR datasets for method development
+are collected from a single platform, resulting in limited satellite types and
+spatial resolutions. Since images captured by different sensors vary
+significantly in both geometric and radiometric appearance, existing methods
+may fail to match corresponding regions containing the same content. Besides,
+most of the existing datasets have not been categorized based on the
+characteristics of different scenes. To encourage the design of more general
+multi-modal image matching methods, we introduce a large-scale Multi-sources,
+Multi-resolutions, and Multi-scenes dataset for Optical-SAR image matching
+(3MOS). It consists of 155K optical-SAR image pairs, including SAR data from
+six commercial satellites, with resolutions ranging from 1.25m to 12.5m. The
+data has been classified into eight scenes including urban, rural, plains,
+hills, mountains, water, desert, and frozen earth. Extensive experiments show
+that none of the state-of-the-art methods achieves consistently superior
+performance across different sources, resolutions and scenes. In addition, the
+distribution of data has a substantial impact on the matching capability of
+deep learning models, which poses a domain adaptation challenge in optical-SAR
+image matching. Our data and code will be available at:
+https://github.com/3M-OS/3MOS.
+
+
+ comment: 20 pages, 17 figures
+
+
+
+
+
+ + ☆ Automated HER2 Scoring in Breast Cancer Images Using Deep Learning and + Pyramid Sampling + + +
+ Human epidermal growth factor receptor 2 (HER2) is a critical protein in +cancer cell growth that signifies the aggressiveness of breast cancer (BC) and +helps predict its prognosis. Accurate assessment of immunohistochemically (IHC) +stained tissue slides for HER2 expression levels is essential for both +treatment guidance and understanding of cancer mechanisms. Nevertheless, the +traditional workflow of manual examination by board-certified pathologists +encounters challenges, including inter- and intra-observer inconsistency and +extended turnaround times. Here, we introduce a deep learning-based approach +utilizing pyramid sampling for the automated classification of HER2 status in +IHC-stained BC tissue images. Our approach analyzes morphological features at +various spatial scales, efficiently managing the computational load and +facilitating a detailed examination of cellular and larger-scale tissue-level +details. This method addresses the tissue heterogeneity of HER2 expression by +providing a comprehensive view, leading to a blind testing classification +accuracy of 84.70%, on a dataset of 523 core images from tissue microarrays. +Our automated system, proving reliable as an adjunct pathology tool, has the +potential to enhance diagnostic precision and evaluation speed, and might +significantly impact cancer treatment planning. + +
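+ Pyramid sampling of this kind can be pictured as cropping concentric patches at
+several spatial scales around a location and resizing them to a common input
+resolution; the scale values and the nearest-neighbour resize below are
+assumptions for illustration only.
+
+  import numpy as np
+
+  def pyramid_patches(image, center, sizes=(128, 256, 512), out=128):
+      cy, cx = center
+      patches = []
+      for s in sizes:
+          half = s // 2
+          crop = image[max(cy - half, 0):cy + half, max(cx - half, 0):cx + half]
+          ys = np.linspace(0, crop.shape[0] - 1, out).astype(int)
+          xs = np.linspace(0, crop.shape[1] - 1, out).astype(int)
+          patches.append(crop[np.ix_(ys, xs)])     # nearest-neighbour resize
+      return np.stack(patches)                     # (n_scales, out, out, channels)
+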
+
+ comment: 21 Pages, 7 Figures +
+
+
+
+
+ + ☆ Towards Robust Event-guided Low-Light Image Enhancement: A Large-Scale + Real-World Event-Image Dataset and Novel Approach CVPR 2024 + + +
+ Event cameras have recently received much attention for low-light image
+enhancement (LIE) thanks to their distinct advantages, such as high dynamic
+range. However, current research is prohibitively restricted by the lack of
+large-scale, real-world, and spatio-temporally aligned event-image datasets.
+To this end, we propose a real-world (indoor and outdoor) dataset comprising
+over 30K pairs of images and events under both low and normal illumination
+conditions. To achieve this, we utilize a robotic arm that traces a consistent
+non-linear trajectory to curate the dataset with spatial alignment precision
+under 0.03mm. We then introduce a matching alignment strategy, rendering 90% of
+our dataset with errors less than 0.01s. Based on the dataset, we propose a
+novel event-guided LIE approach, called EvLight, towards robust performance in
+real-world low-light scenes. Specifically, we first design the multi-scale
+holistic fusion branch to extract holistic structural and textural information
+from both events and images. To ensure robustness against variations in the
+regional illumination and noise, we then introduce a Signal-to-Noise-Ratio
+(SNR)-guided regional feature selection to selectively fuse features of images
+from regions with high SNR and enhance those with low SNR by extracting
+regional structure information from events. Extensive experiments on our
+dataset and the synthetic SDSD dataset demonstrate our EvLight significantly
+surpasses the frame-based methods. Code and datasets are available at
+https://vlislab22.github.io/eg-lowlight/.
+
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Generation and Detection of Sign Language Deepfakes -- A Linguistic and + Visual Analysis + + +
+ A question slowly emerging in the realm of deepfakes is whether we can go
+beyond facial deepfakes and whether doing so would benefit society. Therefore,
+this research presents a positive application of deepfake technology in
+upper-body generation for performing sign language for the Deaf and Hard of
+Hearing (DHoH) community. The resulting videos are later vetted with a sign
+language expert. This is particularly helpful, given the intricate nature of
+sign language, a scarcity of sign language experts, and potential benefits for
+health and education. The objectives of this work encompass constructing a
+reliable deepfake dataset, evaluating its technical and visual credibility
+through computer vision and natural language processing models, and assessing
+the plausibility of the generated content. With over 1200 videos featuring both
+previously seen and unseen individuals for the generation model, and with the
+help of a sign language expert, we establish a deepfake dataset in sign
+language that can further be utilized to detect fake videos that may target
+certain people of determination.
+
+
+ comment: 13 pages, 13 figures, Computer Vision and Image Understanding Journal +
+
+
+
+
+ + ☆ DRIVE: Dual Gradient-Based Rapid Iterative Pruning + + +
+ Modern deep neural networks (DNNs) consist of millions of parameters,
+necessitating high-performance computing during training and inference. Pruning
+is one solution that significantly reduces the space and time complexities of
+DNNs. Traditional pruning methods that are applied post-training focus on
+streamlining inference, but there are recent efforts to leverage sparsity early
+on by pruning before training. Pruning methods such as iterative
+magnitude-based pruning (IMP) achieve up to a 90% parameter reduction while
+retaining accuracy comparable to the original model. However, this leads to
+impractical runtime as it relies on multiple train-prune-reset cycles to
+identify and eliminate redundant parameters. In contrast, training-agnostic
+early pruning methods such as SNIP and SynFlow offer fast pruning but fall
+short of the accuracy achieved by IMP at high sparsities. To bridge this gap,
+we present Dual Gradient-Based Rapid Iterative Pruning (DRIVE), which leverages
+dense training for initial epochs to counteract the randomness inherent at the
+initialization. Subsequently, it employs a unique dual gradient-based metric
+for parameter ranking. Experiments with VGG and ResNet architectures on
+CIFAR-10/100 and Tiny ImageNet, and with ResNet on ImageNet, demonstrate that
+DRIVE consistently outperforms other training-agnostic early pruning methods in
+accuracy. Notably, DRIVE is 43$\times$ to 869$\times$ faster than IMP for
+pruning.
+
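+ The overall recipe, a few dense warm-up epochs followed by a single
+gradient-based ranking and pruning pass, is sketched below; the |w * dL/dw|
+saliency is a common stand-in and not DRIVE's actual dual-gradient metric.
+
+  import torch
+
+  def prune_by_saliency(model, loss, sparsity=0.9):
+      loss.backward()                                     # gradients after dense warm-up
+      scores = [(p * p.grad).abs().flatten()
+                for p in model.parameters() if p.grad is not None]
+      all_scores = torch.cat(scores)
+      k = int(sparsity * all_scores.numel())
+      threshold = torch.kthvalue(all_scores, k).values    # k-th smallest saliency
+      for p in model.parameters():
+          if p.grad is not None:
+              p.data.mul_(((p * p.grad).abs() > threshold).float())
+      return model
+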
+
+
+
+
+ + ♻ ☆ Rotate to Scan: UNet-like Mamba with Triplet SSM Module for Medical + Image Segmentation + + +
+ Image segmentation holds a vital position in the realms of diagnosis and +treatment within the medical domain. Traditional convolutional neural networks +(CNNs) and Transformer models have made significant advancements in this realm, +but they still encounter challenges because of limited receptive field or high +computing complexity. Recently, State Space Models (SSMs), particularly Mamba +and its variants, have demonstrated notable performance in the field of vision. +However, their feature extraction methods may not be sufficiently effective and +retain some redundant structures, leaving room for parameter reduction. +Motivated by previous spatial and channel attention methods, we propose Triplet +Mamba-UNet. The method leverages residual VSS Blocks to extract intensive +contextual features, while Triplet SSM is employed to fuse features across +spatial and channel dimensions. We conducted experiments on ISIC17, ISIC18, +CVC-300, CVC-ClinicDB, Kvasir-SEG, CVC-ColonDB, and Kvasir-Instrument datasets, +demonstrating the superior segmentation performance of our proposed TM-UNet. +Additionally, compared to the previous VM-UNet, our model achieves a one-third +reduction in parameters. + +
+
+
+
+
+ + ♻ ☆ Modality-Agnostic Structural Image Representation Learning for + Deformable Multi-Modality Medical Image Registration CVPR2024 + + +
+ Establishing dense anatomical correspondence across distinct imaging
+modalities is a foundational yet challenging procedure for numerous medical
+image analysis studies and image-guided radiotherapy. Existing multi-modality
+image registration algorithms rely on statistical-based similarity measures or
+local structural image representations. However, the former is sensitive to
+locally varying noise, while the latter is not discriminative enough to cope
+with complex anatomical structures in multimodal scans, causing ambiguity in
+determining the anatomical correspondence across scans with different
+modalities. In this paper, we propose a modality-agnostic structural
+representation learning method, which leverages Deep Neighbourhood
+Self-similarity (DNS) and anatomy-aware contrastive learning to learn
+discriminative and contrast-invariant deep structural image representations
+(DSIR) without the need for anatomical delineations or pre-aligned training
+images. We evaluate our method on multiphase CT, abdomen MR-CT, and brain MR
+T1w-T2w registration. Comprehensive results demonstrate that our method is
+superior to the conventional local structural representation and
+statistical-based similarity measures in terms of discriminability and
+accuracy.
+
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ Structure Matters: Tackling the Semantic Discrepancy in Diffusion Models + for Image Inpainting CVPR 2024 + + +
+ Denoising diffusion probabilistic models for image inpainting aim to add the
+noise to the texture of image during the forward process and recover masked
+regions with unmasked ones of the texture via the reverse denoising process.
+Despite the meaningful semantics generation, the existing arts suffer from the
+semantic discrepancy between masked and unmasked regions, since the
+semantically dense unmasked texture fails to be completely degraded while the
+masked regions turn to the pure noise in diffusion process, leading to the
+large discrepancy between them. In this paper, we aim to answer how unmasked
+semantics guide the texture denoising process, together with how to tackle the
+semantic discrepancy, to facilitate the consistent and meaningful semantics
+generation. To this end, we propose a novel structure-guided diffusion model
+named StrDiffusion, to reformulate the conventional texture denoising process
+under structure guidance to derive a simplified denoising objective for image
+inpainting, while revealing: 1) the semantically sparse structure is beneficial
+to tackle the semantic discrepancy in the early stage, while dense texture
+generates reasonable semantics in the late stage; 2) the semantics from
+unmasked regions essentially offer the time-dependent structure guidance for
+the texture denoising process, benefiting from the time-dependent sparsity of
+the structure semantics. For the denoising process, a structure-guided neural
+network is trained to estimate the simplified denoising objective by exploiting
+the consistency of the denoised structure between masked and unmasked regions.
+Besides, we devise an adaptive resampling strategy as a formal criterion for
+whether the structure is competent to guide the texture denoising process,
+while regulating their semantic correlations. Extensive experiments validate
+the merits of StrDiffusion over the state-of-the-arts. Our code is available at
+https://github.com/htyjers/StrDiffusion.
+
+
+ comment: 15 pages, 10 figures, to appear CVPR 2024 +
+
+
+
+
+ + ♻ ☆ An Extensible Framework for Open Heterogeneous Collaborative Perception ICLR 2024 + + +
+ Collaborative perception aims to mitigate the limitations of single-agent
+perception, such as occlusions, by facilitating data exchange among multiple
+agents. However, most current works consider a homogeneous scenario where all
+agents use identical sensors and perception models. In reality, heterogeneous
+agent types may continually emerge and inevitably face a domain gap when
+collaborating with existing agents. In this paper, we introduce a new open
+heterogeneous problem: how to accommodate continually emerging new
+heterogeneous agent types into collaborative perception, while ensuring high
+perception performance and low integration cost? To address this problem, we
+propose HEterogeneous ALliance (HEAL), a novel extensible collaborative
+perception framework. HEAL first establishes a unified feature space with
+initial agents via a novel multi-scale foreground-aware Pyramid Fusion network.
+When heterogeneous new agents emerge with previously unseen modalities or
+models, we align them to the established unified space with an innovative
+backward alignment. This step only involves individual training on the new
+agent type, thus presenting extremely low training costs and high
+extensibility. To enrich agents' data heterogeneity, we bring OPV2V-H, a new
+large-scale dataset with more diverse sensor types. Extensive experiments on
+OPV2V-H and DAIR-V2X datasets show that HEAL surpasses SOTA methods in
+performance while reducing the training parameters by 91.5% when integrating 3
+new agent types. We further implement a comprehensive codebase at:
+https://github.com/yifanlu0227/HEAL
+
+
+ comment: Accepted by ICLR 2024. The code and data are open-sourced at + https://github.com/yifanlu0227/HEAL +
+
+
+
+
+ + ♻ ☆ WaterVG: Waterway Visual Grounding based on Text-Guided Vision and + mmWave Radar + + +
+ The perception of waterways based on human intent is significant for
+autonomous navigation and operations of Unmanned Surface Vehicles (USVs) in
+water environments. Inspired by visual grounding, we introduce WaterVG, the
+first visual grounding dataset designed for USV-based waterway perception based
+on human prompts. WaterVG encompasses prompts describing multiple targets, with
+annotations at the instance level including bounding boxes and masks. Notably,
+WaterVG includes 11,568 samples with 34,987 referred targets, whose prompts
+integrate both visual and radar characteristics. This text-guided two-sensor
+pattern equips text prompts with a finer granularity of visual and radar
+features of the referred targets. Moreover, we propose a low-power visual
+grounding model, Potamoi, which is a multi-task model with a well-designed
+Phased Heterogeneous Modality Fusion (PHMF) mode, including Adaptive Radar
+Weighting (ARW) and Multi-Head Slim Cross Attention (MHSCA). Specifically, ARW
+extracts required radar features to fuse with vision for prompt alignment.
+MHSCA is an efficient fusion module with a remarkably small parameter count and
+FLOPs, elegantly fusing scenario context captured by two sensors with
+linguistic features, which performs impressively on visual grounding tasks.
+Comprehensive experiments and evaluations have been conducted on WaterVG, where
+our Potamoi achieves state-of-the-art performance compared with its
+counterparts.
+
+
+ comment: 10 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ PACE: A Large-Scale Dataset with Pose Annotations in Cluttered + Environments + + +
+ Pose estimation is a crucial task in computer vision and robotics, enabling
+the tracking and manipulation of objects in images or videos. While several
+datasets exist for pose estimation, there is a lack of large-scale datasets
+specifically focusing on cluttered scenes with occlusions. We introduce PACE
+(Pose Annotations in Cluttered Environments), a large-scale benchmark designed
+to advance the development and evaluation of pose estimation methods in
+cluttered scenarios. PACE consists of 54,945 frames with 257,673 annotations
+across 300 videos, covering 576 objects from 44 categories and featuring a mix
+of rigid and articulated items in cluttered scenes. To annotate the real-world
+data efficiently, we developed an innovative annotation system utilizing a
+calibrated 3-camera setup. We test state-of-the-art algorithms in PACE along
+two tracks: pose estimation, and object pose tracking, revealing the
+benchmark's challenges and research opportunities. Our code and data are
+available at https://github.com/qq456cvb/PACE.
+
+
+
+
+
+ + ♻ ☆ Draw-and-Understand: Leveraging Visual Prompts to Enable MLLMs to + Comprehend What You Want + + +
+ The interaction between humans and artificial intelligence (AI) is a crucial +factor that reflects the effectiveness of multimodal large language models +(MLLMs). However, current MLLMs primarily focus on image-level comprehension +and limit interaction to textual instructions, thereby constraining their +flexibility in usage and depth of response. In this paper, we introduce the +Draw-and-Understand project: a new model, a multi-domain dataset, and a +challenging benchmark for visual prompting. Specifically, we propose SPHINX-V, +a new end-to-end trained Multimodal Large Language Model (MLLM) that connects a +vision encoder, a visual prompt encoder and an LLM for various visual prompts +(points, bounding boxes, and free-form shape) and language understanding. To +advance visual prompting research for MLLMs, we introduce MDVP-Data and +MDVP-Bench. MDVP-Data features a multi-domain dataset containing 1.6M unique +image-visual prompt-text instruction-following samples, including natural +images, document images, OCR images, mobile screenshots, web screenshots, and +multi-panel images. Furthermore, we present MDVP-Bench, a comprehensive and +challenging benchmark to assess a model's capability in understanding visual +prompting instructions. Our experiments demonstrate SPHINX-V's impressive +multimodal interaction capabilities through visual prompting, revealing +significant improvements in detailed pixel-level description and +question-answering abilities. + +
+
+ comment: 16 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ BAMM: Bidirectional Autoregressive Motion Model + + +
+ Generating human motion from text has been dominated by denoising motion
+models either through diffusion or generative masking process. However, these
+models face great limitations in usability by requiring prior knowledge of the
+motion length. Conversely, autoregressive motion models address this limitation
+by adaptively predicting motion endpoints, at the cost of degraded generation
+quality and editing capabilities. To address these challenges, we propose
+Bidirectional Autoregressive Motion Model (BAMM), a novel text-to-motion
+generation framework. BAMM consists of two key components: (1) a motion
+tokenizer that transforms 3D human motion into discrete tokens in latent space,
+and (2) a masked self-attention transformer that autoregressively predicts
+randomly masked tokens via a hybrid attention masking strategy. By unifying
+generative masked modeling and autoregressive modeling, BAMM captures rich and
+bidirectional dependencies among motion tokens, while learning the
+probabilistic mapping from textual inputs to motion outputs with
+dynamically-adjusted motion sequence length. This feature enables BAMM to
+simultaneously achieve high-quality motion generation with enhanced usability
+and built-in motion editability. Extensive experiments on HumanML3D and KIT-ML
+datasets demonstrate that BAMM surpasses current state-of-the-art methods in
+both qualitative and quantitative measures. Our project page is available at
+https://exitudio.github.io/BAMM-page
+
+
+
+
+
+ + ♻ ☆ Video-Based Human Pose Regression via Decoupled Space-Time Aggregation + + +
+ By leveraging temporal dependency in video sequences, multi-frame human pose +estimation algorithms have demonstrated remarkable results in complicated +situations, such as occlusion, motion blur, and video defocus. These algorithms +are predominantly based on heatmaps, resulting in high computation and storage +requirements per frame, which limits their flexibility and real-time +application in video scenarios, particularly on edge devices. In this paper, we +develop an efficient and effective video-based human pose regression method, +which bypasses intermediate representations such as heatmaps and instead +directly maps the input to the output joint coordinates. Despite the inherent +spatial correlation among adjacent joints of the human pose, the temporal +trajectory of each individual joint exhibits relative independence. In light of +this, we propose a novel Decoupled Space-Time Aggregation network (DSTA) to +separately capture the spatial contexts between adjacent joints and the +temporal cues of each individual joint, thereby avoiding the conflation of +spatiotemporal dimensions. Concretely, DSTA learns a dedicated feature token +for each joint to facilitate the modeling of their spatiotemporal dependencies. +With the proposed joint-wise local-awareness attention mechanism, our method is +capable of efficiently and flexibly utilizing the spatial dependency of +adjacent joints and the temporal dependency of each joint itself. Extensive +experiments demonstrate the superiority of our method. Compared to previous +regression-based single-frame human pose estimation methods, DSTA significantly +enhances performance, achieving an 8.9 mAP improvement on PoseTrack2017. +Furthermore, our approach either surpasses or is on par with the +state-of-the-art heatmap-based multi-frame human pose estimation methods. +Project page: https://github.com/zgspose/DSTA. + +
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Compositional Chain-of-Thought Prompting for Large Multimodal Models + + +
+ The combination of strong visual backbones and Large Language Model (LLM) +reasoning has led to Large Multimodal Models (LMMs) becoming the current +standard for a wide range of vision and language (VL) tasks. However, recent +research has shown that even the most advanced LMMs still struggle to capture +aspects of compositional visual reasoning, such as attributes and relationships +between objects. One solution is to utilize scene graphs (SGs)--a formalization +of objects and their relations and attributes that has been extensively used as +a bridge between the visual and textual domains. Yet, scene graph data requires +scene graph annotations, which are expensive to collect and thus not easily +scalable. Moreover, finetuning an LMM based on SG data can lead to catastrophic +forgetting of the pretraining objective. To overcome this, inspired by +chain-of-thought methods, we propose Compositional Chain-of-Thought (CCoT), a +novel zero-shot Chain-of-Thought prompting method that utilizes SG +representations in order to extract compositional knowledge from an LMM. +Specifically, we first generate an SG using the LMM, and then use that SG in +the prompt to produce a response. Through extensive experiments, we find that +the proposed CCoT approach not only improves LMM performance on several vision +and language VL compositional benchmarks but also improves the performance of +several popular LMMs on general multimodal benchmarks, without the need for +fine-tuning or annotated ground-truth SGs. Code: +https://github.com/chancharikmitra/CCoT + +
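+ Because the method is training-free, it reduces to two calls to the same model;
+the sketch below uses a hypothetical query_lmm(image, prompt) wrapper, and the
+prompt wording only approximates the prompts used in the paper.
+
+  def compositional_cot(query_lmm, image, question):
+      sg_prompt = ("For the provided image, generate a scene graph in JSON that "
+                   "includes the objects, their attributes, and the relationships "
+                   "between them.")
+      scene_graph = query_lmm(image, sg_prompt)       # step 1: the LMM writes the scene graph
+      answer_prompt = ("Scene graph: " + scene_graph + "\n"
+                       "Use the image and the scene graph to answer: " + question)
+      return query_lmm(image, answer_prompt)          # step 2: scene-graph-conditioned answer
+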
+
+
+
+
+ + ♻ ☆ Efficient 3D Instance Mapping and Localization with Neural Fields + + +
+ We tackle the problem of learning an implicit scene representation for 3D +instance segmentation from a sequence of posed RGB images. Towards this, we +introduce 3DIML, a novel framework that efficiently learns a label field that +may be rendered from novel viewpoints to produce view-consistent instance +segmentation masks. 3DIML significantly improves upon training and inference +runtimes of existing implicit scene representation based methods. Opposed to +prior art that optimizes a neural field in a self-supervised manner, requiring +complicated training procedures and loss function design, 3DIML leverages a +two-phase process. The first phase, InstanceMap, takes as input 2D segmentation +masks of the image sequence generated by a frontend instance segmentation +model, and associates corresponding masks across images to 3D labels. These +almost view-consistent pseudolabel masks are then used in the second phase, +InstanceLift, to supervise the training of a neural label field, which +interpolates regions missed by InstanceMap and resolves ambiguities. +Additionally, we introduce InstanceLoc, which enables near realtime +localization of instance masks given a trained label field and an off-the-shelf +image segmentation model by fusing outputs from both. We evaluate 3DIML on +sequences from the Replica and ScanNet datasets and demonstrate 3DIML's +effectiveness under mild assumptions for the image sequences. We achieve a +large practical speedup over existing implicit scene representation methods +with comparable quality, showcasing its potential to facilitate faster and more +effective 3D scene understanding. + +
+
+
+
+
+ + ♻ ☆ Change-Agent: Towards Interactive Comprehensive Remote Sensing Change + Interpretation and Analysis + + +
+ Monitoring changes in the Earth's surface is crucial for understanding
+natural processes and human impacts, necessitating precise and comprehensive
+interpretation methodologies. Remote sensing satellite imagery offers a unique
+perspective for monitoring these changes, leading to the emergence of remote
+sensing image change interpretation (RSICI) as a significant research focus.
+Current RSICI technology encompasses change detection and change captioning,
+each with its limitations in providing comprehensive interpretation. To address
+this, we propose an interactive Change-Agent, which can follow user
+instructions to achieve comprehensive change interpretation and insightful
+analysis, such as change detection and change captioning, change object
+counting, change cause analysis, etc. The Change-Agent integrates a multi-level
+change interpretation (MCI) model as the eyes and a large language model (LLM)
+as the brain. The MCI model contains two branches of pixel-level change
+detection and semantic-level change captioning, in which multiple BI-temporal
+Iterative Interaction (BI3) layers utilize Local Perception Enhancement (LPE)
+and the Global Difference Fusion Attention (GDFA) modules to enhance the
+model's discriminative feature representation capabilities. To support the
+training of the MCI model, we build the LEVIR-MCI dataset with a large number
+of change masks and captions of changes. Extensive experiments demonstrate the
+effectiveness of the proposed MCI model and highlight the promising potential
+of our Change-Agent in facilitating comprehensive and intelligent
+interpretation of surface changes. To facilitate future research, we will make
+our dataset and codebase of the MCI model and Change-Agent publicly available
+at https://github.com/Chen-Yang-Liu/Change-Agent
+
+
+
+
+
+ + ♻ ☆ Frequency-Adaptive Dilated Convolution for Semantic Segmentation + + +
+ Dilated convolution, which expands the receptive field by inserting gaps +between its consecutive elements, is widely employed in computer vision. In +this study, we propose three strategies to improve individual phases of dilated +convolution from the view of spectrum analysis. Departing from the conventional +practice of fixing a global dilation rate as a hyperparameter, we introduce +Frequency-Adaptive Dilated Convolution (FADC), which dynamically adjusts +dilation rates spatially based on local frequency components. Subsequently, we +design two plug-in modules to directly enhance effective bandwidth and +receptive field size. The Adaptive Kernel (AdaKern) module decomposes +convolution weights into low-frequency and high-frequency components, +dynamically adjusting the ratio between these components on a per-channel +basis. By increasing the high-frequency part of convolution weights, AdaKern +captures more high-frequency components, thereby improving effective bandwidth. +The Frequency Selection (FreqSelect) module optimally balances high- and +low-frequency components in feature representations through spatially variant +reweighting. It suppresses high frequencies in the background to encourage FADC +to learn a larger dilation, thereby increasing the receptive field for an +expanded scope. Extensive experiments on segmentation and object detection +consistently validate the efficacy of our approach. The code is publicly +available at https://github.com/Linwei-Chen/FADC. + +
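+ As a rough illustration of the adaptive-dilation idea (a hypothetical Python
+sketch, not the released FADC code): estimate local high-frequency energy with
+a fixed Laplacian filter, turn it into per-pixel mixing weights over a few
+candidate dilation rates, and blend the corresponding dilated convolutions.
+
+    import torch
+    import torch.nn as nn
+    import torch.nn.functional as F
+
+    class AdaptiveDilationConv(nn.Module):
+        """Toy frequency-adaptive dilated conv: per-pixel blending of a few
+        candidate dilation rates driven by local high-frequency energy."""
+
+        def __init__(self, channels, rates=(1, 2, 4)):
+            super().__init__()
+            self.rates = rates
+            # A single 3x3 weight shared across all candidate dilation rates.
+            self.weight = nn.Parameter(torch.randn(channels, channels, 3, 3) * 0.02)
+            # Maps a 1-channel frequency map to per-rate mixing logits.
+            self.gate = nn.Conv2d(1, len(rates), kernel_size=3, padding=1)
+            # Fixed Laplacian kernel as a crude high-frequency detector.
+            lap = torch.tensor([[0., 1., 0.], [1., -4., 1.], [0., 1., 0.]])
+            self.register_buffer("lap", lap.view(1, 1, 3, 3))
+
+        def forward(self, x):
+            # High-frequency energy of the channel-averaged feature map.
+            freq = F.conv2d(x.mean(1, keepdim=True), self.lap, padding=1).abs()
+            mix = self.gate(freq).softmax(dim=1)           # (B, R, H, W)
+            out = 0
+            for i, r in enumerate(self.rates):
+                y = F.conv2d(x, self.weight, padding=r, dilation=r)
+                out = out + mix[:, i:i + 1] * y            # per-pixel blend
+            return out
+
+    feat = torch.randn(2, 16, 32, 32)
+    print(AdaptiveDilationConv(16)(feat).shape)   # torch.Size([2, 16, 32, 32])
+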
+
+
+
+
+ + ♻ ☆ Multi-criteria Token Fusion with One-step-ahead Attention for Efficient + Vision Transformers CVPR + + +
+ Vision Transformer (ViT) has emerged as a prominent backbone for computer
+vision. For more efficient ViTs, recent works lessen the quadratic cost of the
+self-attention layer by pruning or fusing redundant tokens. However, these
+works face a speed-accuracy trade-off caused by the loss of information. Here,
+we argue that token fusion needs to consider diverse relations between tokens
+to minimize information loss. In this paper, we propose Multi-criteria Token
+Fusion (MCTF), which gradually fuses tokens based on multiple criteria (e.g.,
+similarity, informativeness, and size of fused tokens). Further, we utilize
+one-step-ahead attention, an improved approach to capturing the
+informativeness of the tokens. By training the model equipped with MCTF using
+a token reduction consistency, we achieve the best speed-accuracy trade-off in
+image classification (ImageNet1K). Experimental results prove that MCTF
+consistently surpasses previous reduction methods with and without training.
+Specifically, DeiT-T and DeiT-S with MCTF reduce FLOPs by about 44% while
+improving performance (+0.5% and +0.3%) over the base model, respectively. We
+also demonstrate the applicability of MCTF in various Vision Transformers
+(e.g., T2T-ViT, LV-ViT), achieving at least 31% speedup without performance
+degradation. Code is available at https://github.com/mlvlab/MCTF.
+
+
+ comment: Conference on Computer Vision and Pattern Recognition (CVPR), 2024 +
+
+
+
+
+ + ♻ ☆ High-Resolution Image Translation Model Based on Grayscale Redefinition + + +
+ Image-to-image translation is a technique that focuses on transferring images
+from one domain to another while maintaining the essential content
+representations. In recent years, image-to-image translation has gained
+significant attention and achieved remarkable advancements due to its diverse
+applications in computer vision and image processing tasks. In this work, we
+propose an innovative method for image translation between different domains.
+For high-resolution image translation tasks, we use a grayscale adjustment
+method to achieve pixel-level translation. For other tasks, we utilize the
+Pix2PixHD model with a coarse-to-fine generator, multi-scale discriminator, and
+improved loss to enhance the image translation performance. In addition, to
+tackle the issue of sparse training data, we adopt model weight initialization
+from other tasks to optimize the performance of the current task.
+
+
+
+
+
+ + ♻ ☆ LLaVA-PruMerge: Adaptive Token Reduction for Efficient Large Multimodal + Models + + +
+ Large Multimodal Models (LMMs) have shown significant reasoning capabilities +by connecting a visual encoder and a large language model. LMMs typically use a +fixed amount of visual tokens, such as the penultimate layer features in the +CLIP visual encoder, as the prefix content. Recent LMMs incorporate more +complex visual inputs, such as high-resolution images and videos, which +increase the number of visual tokens significantly. However, due to the design +of the Transformer architecture, computational costs associated with these +models tend to increase quadratically with the number of input tokens. To +tackle this problem, we explore a token reduction mechanism and find, similar +to prior work, that many visual tokens are spatially redundant. Based on this, +we propose PruMerge, a novel adaptive visual token reduction approach, which +largely reduces the number of visual tokens while maintaining comparable model +performance. We first select the unpruned visual tokens based on their +similarity to class tokens and spatial tokens. We then cluster the pruned +tokens based on key similarity and merge the clustered tokens with the unpruned +tokens to supplement their information. Empirically, when applied to LLaVA-1.5, +our approach can compress the visual tokens by 18 times on average, and achieve +comparable performance across diverse visual question-answering and reasoning +tasks. Code and checkpoints are at https://llava-prumerge.github.io/. + +
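+ The select-then-merge step can be pictured with a small hypothetical Python
+sketch (not the PruMerge release): keep the visual tokens that receive the
+most class-token attention, assign each pruned token to its nearest kept token
+by key similarity, and fold it in by averaging.
+
+    import torch
+    import torch.nn.functional as F
+
+    def reduce_tokens(tokens, cls_attn, keys, keep_ratio=0.25):
+        """tokens: (N, D) visual tokens; cls_attn: (N,) attention received from
+        the class token; keys: (N, D) key vectors. Returns ~N*keep_ratio tokens."""
+        n = tokens.shape[0]
+        k = max(1, int(n * keep_ratio))
+        keep = cls_attn.topk(k).indices                    # most-attended tokens
+        mask = torch.ones(n, dtype=torch.bool)
+        mask[keep] = False
+        pruned = mask.nonzero(as_tuple=True)[0]
+
+        merged = tokens[keep].clone()
+        counts = torch.ones(k)
+        if pruned.numel() > 0:
+            # Assign each pruned token to its most similar kept token (key space).
+            sim = F.normalize(keys[pruned], dim=-1) @ F.normalize(keys[keep], dim=-1).T
+            assign = sim.argmax(dim=-1)                    # (num_pruned,)
+            for p, a in zip(pruned.tolist(), assign.tolist()):
+                merged[a] += tokens[p]
+                counts[a] += 1
+        return merged / counts.unsqueeze(-1)
+
+    toks = torch.randn(576, 1024)               # e.g. 24x24 patch tokens
+    out = reduce_tokens(toks, torch.rand(576), torch.randn(576, 1024))
+    print(out.shape)                            # torch.Size([144, 1024])
+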
+
+ comment: Project page: https://llava-prumerge.github.io/ +
+
+
+
+
+ + ♻ ☆ Concept-based Analysis of Neural Networks via Vision-Language Models + + +
+ The analysis of vision-based deep neural networks (DNNs) is highly desirable +but it is very challenging due to the difficulty of expressing formal +specifications for vision tasks and the lack of efficient verification +procedures. In this paper, we propose to leverage emerging multimodal, +vision-language, foundation models (VLMs) as a lens through which we can reason +about vision models. VLMs have been trained on a large body of images +accompanied by their textual description, and are thus implicitly aware of +high-level, human-understandable concepts describing the images. We describe a +logical specification language $\texttt{Con}_{\texttt{spec}}$ designed to +facilitate writing specifications in terms of these concepts. To define and +formally check $\texttt{Con}_{\texttt{spec}}$ specifications, we build a map +between the internal representations of a given vision model and a VLM, leading +to an efficient verification procedure of natural-language properties for +vision models. We demonstrate our techniques on a ResNet-based classifier +trained on the RIVAL-10 dataset using CLIP as the multimodal model. + +
+
+
+
+
+ + ♻ ☆ 3D Open-Vocabulary Panoptic Segmentation with 2D-3D Vision-Language + Distillation + + +
+ 3D panoptic segmentation is a challenging perception task, especially in +autonomous driving. It aims to predict both semantic and instance annotations +for 3D points in a scene. Although prior 3D panoptic segmentation approaches +have achieved great performance on closed-set benchmarks, generalizing these +approaches to unseen things and unseen stuff categories remains an open +problem. For unseen object categories, 2D open-vocabulary segmentation has +achieved promising results that solely rely on frozen CLIP backbones and +ensembling multiple classification outputs. However, we find that simply +extending these 2D models to 3D does not guarantee good performance due to poor +per-mask classification quality, especially for novel stuff categories. In this +paper, we propose the first method to tackle 3D open-vocabulary panoptic +segmentation. Our model takes advantage of the fusion between learnable LiDAR +features and dense frozen vision CLIP features, using a single classification +head to make predictions for both base and novel classes. To further improve +the classification performance on novel classes and leverage the CLIP model, we +propose two novel loss functions: object-level distillation loss and +voxel-level distillation loss. Our experiments on the nuScenes and +SemanticKITTI datasets show that our method outperforms the strong baseline by +a large margin. + +
+
+
+
+
+ + ♻ ☆ FeatUp: A Model-Agnostic Framework for Features at Any Resolution ICLR + + +
+ Deep features are a cornerstone of computer vision research, capturing image +semantics and enabling the community to solve downstream tasks even in the +zero- or few-shot regime. However, these features often lack the spatial +resolution to directly perform dense prediction tasks like segmentation and +depth prediction because models aggressively pool information over large areas. +In this work, we introduce FeatUp, a task- and model-agnostic framework to +restore lost spatial information in deep features. We introduce two variants of +FeatUp: one that guides features with high-resolution signal in a single +forward pass, and one that fits an implicit model to a single image to +reconstruct features at any resolution. Both approaches use a multi-view +consistency loss with deep analogies to NeRFs. Our features retain their +original semantics and can be swapped into existing applications to yield +resolution and performance gains even without re-training. We show that FeatUp +significantly outperforms other feature upsampling and image super-resolution +approaches in class activation map generation, transfer learning for +segmentation and depth prediction, and end-to-end training for semantic +segmentation. + +
+
+ comment: Accepted to the International Conference on Learning Representations + (ICLR) 2024 +
+
+
+
+
+ + ♻ ☆ Towards long-tailed, multi-label disease classification from chest + X-ray: Overview of the CXR-LT challenge + + +
+ Many real-world image recognition problems, such as diagnostic medical
+imaging exams, are "long-tailed" -- there are a few common findings followed
+by many more relatively rare conditions. In chest radiography, diagnosis is
+both a long-tailed and multi-label problem, as patients often present with
+multiple findings simultaneously. While researchers have begun to study the
+problem of long-tailed learning in medical image recognition, few have studied
+the interaction of label imbalance and label co-occurrence posed by
+long-tailed, multi-label disease classification. To engage with the research
+community on this emerging topic, we conducted an open challenge, CXR-LT, on
+long-tailed, multi-label thorax disease classification from chest X-rays
+(CXRs). We publicly release a large-scale benchmark dataset of over 350,000
+CXRs, each labeled with at least one of 26 clinical findings following a
+long-tailed distribution. We synthesize common themes of top-performing
+solutions, providing practical recommendations for long-tailed, multi-label
+medical image classification. Finally, we use these insights to propose a path
+forward involving vision-language foundation models for few- and zero-shot
+disease classification.
+
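+ For long-tailed, multi-label training of this kind, a common starting point
+is a binary cross-entropy loss with per-class positive weights derived from
+label frequency; the snippet below is a generic Python sketch, not any
+particular challenge solution.
+
+    import torch
+    import torch.nn.functional as F
+
+    def balanced_bce(logits, targets, class_counts):
+        """logits, targets: (B, C); class_counts: (C,) positive counts per class.
+        Rare findings receive proportionally larger positive weights."""
+        pos_weight = class_counts.sum() / (class_counts.float() + 1.0)
+        return F.binary_cross_entropy_with_logits(logits, targets.float(),
+                                                  pos_weight=pos_weight)
+
+    logits = torch.randn(8, 26)              # 26 findings, as in CXR-LT
+    targets = torch.rand(8, 26) > 0.9        # sparse multi-label ground truth
+    counts = torch.randint(10, 10000, (26,))
+    print(float(balanced_bce(logits, targets, counts)))
+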
+
+ comment: Update after major revision +
+
+
+
+
+ + ♻ ☆ Modeling Multimodal Social Interactions: New Challenges and Baselines + with Densely Aligned Representations CVPR 2024 + + +
+ Understanding social interactions involving both verbal and non-verbal cues +is essential for effectively interpreting social situations. However, most +prior works on multimodal social cues focus predominantly on single-person +behaviors or rely on holistic visual representations that are not aligned to +utterances in multi-party environments. Consequently, they are limited in +modeling the intricate dynamics of multi-party interactions. In this paper, we +introduce three new challenging tasks to model the fine-grained dynamics +between multiple people: speaking target identification, pronoun coreference +resolution, and mentioned player prediction. We contribute extensive data +annotations to curate these new challenges in social deduction game settings. +Furthermore, we propose a novel multimodal baseline that leverages densely +aligned language-visual representations by synchronizing visual features with +their corresponding utterances. This facilitates concurrently capturing verbal +and non-verbal cues pertinent to social reasoning. Experiments demonstrate the +effectiveness of the proposed approach with densely aligned multimodal +representations in modeling fine-grained social interactions. Project website: +https://sangmin-git.github.io/projects/MMSI. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Diffusion Hyperfeatures: Searching Through Time and Space for Semantic + Correspondence NeurIPS 2023 + + +
+ Diffusion models have been shown to be capable of generating high-quality +images, suggesting that they could contain meaningful internal representations. +Unfortunately, the feature maps that encode a diffusion model's internal +information are spread not only over layers of the network, but also over +diffusion timesteps, making it challenging to extract useful descriptors. We +propose Diffusion Hyperfeatures, a framework for consolidating multi-scale and +multi-timestep feature maps into per-pixel feature descriptors that can be used +for downstream tasks. These descriptors can be extracted for both synthetic and +real images using the generation and inversion processes. We evaluate the +utility of our Diffusion Hyperfeatures on the task of semantic keypoint +correspondence: our method achieves superior performance on the SPair-71k real +image benchmark. We also demonstrate that our method is flexible and +transferable: our feature aggregation network trained on the inversion features +of real image pairs can be used on the generation features of synthetic image +pairs with unseen objects and compositions. Our code is available at +https://diffusion-hyperfeatures.github.io. + +
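+ The aggregation idea can be sketched as learning one mixing weight per
+(layer, timestep) source and summing the upsampled, projected maps into a
+single per-pixel descriptor; the code below is a hypothetical simplification,
+assuming the raw feature maps have already been extracted.
+
+    import torch
+    import torch.nn as nn
+    import torch.nn.functional as F
+
+    class FeatureAggregator(nn.Module):
+        """Fuse feature maps from several layers/timesteps into one per-pixel
+        descriptor via learned softmax mixing weights (illustrative only)."""
+
+        def __init__(self, in_dims, out_dim=384, out_size=64):
+            super().__init__()
+            self.out_size = out_size
+            self.mix = nn.Parameter(torch.zeros(len(in_dims)))  # one weight per source
+            self.proj = nn.ModuleList(nn.Conv2d(d, out_dim, 1) for d in in_dims)
+
+        def forward(self, feats):            # feats: list of (B, C_i, H_i, W_i)
+            w = self.mix.softmax(dim=0)
+            out = 0
+            for i, f in enumerate(feats):
+                f = F.interpolate(self.proj[i](f), size=self.out_size,
+                                  mode="bilinear", align_corners=False)
+                out = out + w[i] * f
+            return out                       # (B, out_dim, out_size, out_size)
+
+    feats = [torch.randn(1, 320, 16, 16), torch.randn(1, 640, 8, 8)]
+    print(FeatureAggregator(in_dims=[320, 640])(feats).shape)  # (1, 384, 64, 64)
+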
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ FasterViT: Fast Vision Transformers with Hierarchical Attention ICLR'24 + + +
+ We design a new family of hybrid CNN-ViT neural networks, named FasterViT, +with a focus on high image throughput for computer vision (CV) applications. +FasterViT combines the benefits of fast local representation learning in CNNs +and global modeling properties in ViT. Our newly introduced Hierarchical +Attention (HAT) approach decomposes global self-attention with quadratic +complexity into a multi-level attention with reduced computational costs. We +benefit from efficient window-based self-attention. Each window has access to +dedicated carrier tokens that participate in local and global representation +learning. At a high level, global self-attentions enable the efficient +cross-window communication at lower costs. FasterViT achieves a SOTA +Pareto-front in terms of accuracy and image throughput. We have extensively +validated its effectiveness on various CV tasks including classification, +object detection and segmentation. We also show that HAT can be used as a +plug-and-play module for existing networks and enhance them. We further +demonstrate significantly faster and more accurate performance than competitive +counterparts for images with high resolution. Code is available at +https://github.com/NVlabs/FasterViT. + +
+
+ comment: ICLR'24 Accepted Paper +
+
+
+
+
+ + ♻ ☆ DiffiT: Diffusion Vision Transformers for Image Generation + + +
+ Diffusion models with their powerful expressivity and high sample quality
+have achieved State-Of-The-Art (SOTA) performance in the generative domain. The
+pioneering Vision Transformer (ViT) has also demonstrated strong modeling
+capabilities and scalability, especially for recognition tasks. In this paper,
+we study the effectiveness of ViTs in diffusion-based generative learning and
+propose a new model denoted as Diffusion Vision Transformers (DiffiT).
+Specifically, we propose a methodology for fine-grained control of the
+denoising process and introduce the Time-dependent Multihead Self-Attention
+(TMSA) mechanism. DiffiT is surprisingly effective in generating high-fidelity
+images with significantly better parameter efficiency. We also propose latent
+and image space DiffiT models and show SOTA performance on a variety of
+class-conditional and unconditional synthesis tasks at different resolutions.
+The Latent DiffiT model achieves a new SOTA FID score of 1.73 on the
+ImageNet-256 dataset while having 19.85% and 16.88% fewer parameters than other
+Transformer-based diffusion models such as MDT and DiT, respectively. Code:
+https://github.com/NVlabs/DiffiT
+
+
+ comment: Revised Tech report +
+
+
+
+
+ + ♻ ☆ ECoDepth: Effective Conditioning of Diffusion Models for Monocular Depth + Estimation CVPR + + +
+ In the absence of parallax cues, a learning-based single image depth
+estimation (SIDE) model relies heavily on shading and contextual cues in the
+image. While this simplicity is attractive, it is necessary to train such
+models on large and varied datasets, which are difficult to capture. It has
+been shown that using embeddings from pre-trained foundational models, such as
+CLIP, improves zero-shot transfer in several applications. Taking inspiration
+from this, in our paper we explore the use of global image priors generated
+from a pre-trained ViT model to provide more detailed contextual information.
+We argue that the embedding vector from a ViT model, pre-trained on a large
+dataset, captures more relevant information for SIDE than the usual route of
+generating pseudo image captions, followed by CLIP-based text embeddings. Based
+on this idea, we propose a new SIDE model using a diffusion backbone which is
+conditioned on ViT embeddings. Our proposed design establishes a new
+state-of-the-art (SOTA) for SIDE on the NYUv2 dataset, achieving an Abs Rel
+error of 0.059 (14% improvement) compared to 0.069 by the current SOTA (VPD).
+On the KITTI dataset, it achieves a Sq Rel error of 0.139 (2% improvement)
+compared to 0.142 by the current SOTA (GEDepth). For zero-shot transfer with a
+model trained on NYUv2, we report a mean relative improvement of (20%, 23%,
+81%, 25%) over NeWCRFs on the (Sun-RGBD, iBims1, DIODE, HyperSim) datasets,
+compared to (16%, 18%, 45%, 9%) by ZoeDepth. The project page is available at
+https://ecodepth-iitd.github.io
+
+
+ comment: IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) + 2024 +
+
+
+
+
+ + ♻ ☆ NeRT: Implicit Neural Representations for General Unsupervised + Turbulence Mitigation + + +
+ The atmospheric and water turbulence mitigation problems have emerged as +challenging inverse problems in computer vision and optics communities over the +years. However, current methods either rely heavily on the quality of the +training dataset or fail to generalize over various scenarios, such as static +scenes, dynamic scenes, and text reconstructions. We propose a general implicit +neural representation for unsupervised atmospheric and water turbulence +mitigation (NeRT). NeRT leverages the implicit neural representations and the +physically correct tilt-then-blur turbulence model to reconstruct the clean, +undistorted image, given only dozens of distorted input images. Moreover, we +show that NeRT outperforms the state-of-the-art through various qualitative and +quantitative evaluations of atmospheric and water turbulence datasets. +Furthermore, we demonstrate the ability of NeRT to eliminate uncontrolled +turbulence from real-world environments. Lastly, we incorporate NeRT into +continuously captured video sequences and demonstrate $48 \times$ speedup. + +
+
+
+
+
+ + ♻ ☆ WALT3D: Generating Realistic Training Data from Time-Lapse Imagery for + Reconstructing Dynamic Objects under Occlusion CVPR 2024 + + +
+ Current methods for 2D and 3D object understanding struggle with severe +occlusions in busy urban environments, partly due to the lack of large-scale +labeled ground-truth annotations for learning occlusion. In this work, we +introduce a novel framework for automatically generating a large, realistic +dataset of dynamic objects under occlusions using freely available time-lapse +imagery. By leveraging off-the-shelf 2D (bounding box, segmentation, keypoint) +and 3D (pose, shape) predictions as pseudo-groundtruth, unoccluded 3D objects +are identified automatically and composited into the background in a clip-art +style, ensuring realistic appearances and physically accurate occlusion +configurations. The resulting clip-art image with pseudo-groundtruth enables +efficient training of object reconstruction methods that are robust to +occlusions. Our method demonstrates significant improvements in both 2D and 3D +reconstruction, particularly in scenarios with heavily occluded objects like +vehicles and people in urban scenes. + +
+
+ comment: To appear in CVPR 2024. Homepage: https://www.cs.cmu.edu/~walt3d +
+
+
+
+
+ + ♻ ☆ CityDreamer: Compositional Generative Model of Unbounded 3D Cities CVPR 2024 + + +
+ 3D city generation is a desirable yet challenging task, since humans are more +sensitive to structural distortions in urban environments. Additionally, +generating 3D cities is more complex than 3D natural scenes since buildings, as +objects of the same class, exhibit a wider range of appearances compared to the +relatively consistent appearance of objects like trees in natural scenes. To +address these challenges, we propose \textbf{CityDreamer}, a compositional +generative model designed specifically for unbounded 3D cities. Our key insight +is that 3D city generation should be a composition of different types of neural +fields: 1) various building instances, and 2) background stuff, such as roads +and green lands. Specifically, we adopt the bird's eye view scene +representation and employ a volumetric render for both instance-oriented and +stuff-oriented neural fields. The generative hash grid and periodic positional +embedding are tailored as scene parameterization to suit the distinct +characteristics of building instances and background stuff. Furthermore, we +contribute a suite of CityGen Datasets, including OSM and GoogleEarth, which +comprises a vast amount of real-world city imagery to enhance the realism of +the generated 3D cities both in their layouts and appearances. CityDreamer +achieves state-of-the-art performance not only in generating realistic 3D +cities but also in localized editing within the generated cities. + +
+
+ comment: CVPR 2024. Project page: https://haozhexie.com/project/city-dreamer +
+
+
+
+
+ + ♻ ☆ ZigMa: A DiT-style Zigzag Mamba Diffusion Model + + +
+ The diffusion model has long been plagued by scalability and quadratic +complexity issues, especially within transformer-based structures. In this +study, we aim to leverage the long sequence modeling capability of a +State-Space Model called Mamba to extend its applicability to visual data +generation. Firstly, we identify a critical oversight in most current +Mamba-based vision methods, namely the lack of consideration for spatial +continuity in the scan scheme of Mamba. Secondly, building upon this insight, +we introduce a simple, plug-and-play, zero-parameter method named Zigzag Mamba, +which outperforms Mamba-based baselines and demonstrates improved speed and +memory utilization compared to transformer-based baselines. Lastly, we +integrate Zigzag Mamba with the Stochastic Interpolant framework to investigate +the scalability of the model on large-resolution visual datasets, such as +FacesHQ $1024\times 1024$ and UCF101, MultiModal-CelebA-HQ, and MS COCO +$256\times 256$ . Code will be released at https://taohu.me/zigma/ + +
+
+ comment: Project Page: https://taohu.me/zigma/ +
+
+
+
+
+ + ♻ ☆ A Survey on Multimodal Large Language Models + + +
+ Recently, Multimodal Large Language Models (MLLMs), represented by GPT-4V,
+have become a new rising research hotspot, using powerful Large Language Models
+(LLMs) as a brain to perform multimodal tasks. The surprising emergent
+capabilities of MLLMs, such as writing stories based on images and OCR-free
+math reasoning, are rare in traditional multimodal methods, suggesting a
+potential path to artificial general intelligence. To this end, both academia
+and industry have endeavored to develop MLLMs that can compete with or even
+surpass GPT-4V, pushing the limit of research at a surprising speed. In this
+paper, we aim to trace and summarize the recent progress of MLLMs. First of
+all, we present the basic formulation of the MLLM and delineate its related
+concepts, including architecture, training strategy and data, as well as
+evaluation. Then, we introduce research topics about how MLLMs can be extended
+to support more granularity, modalities, languages, and scenarios. We continue
+with multimodal hallucination and extended techniques, including Multimodal ICL
+(M-ICL), Multimodal CoT (M-CoT), and LLM-Aided Visual Reasoning (LAVR). To
+conclude the paper, we discuss existing challenges and point out promising
+research directions. In light of the fact that the era of MLLMs has only just
+begun, we will keep updating this survey and hope it can inspire more research.
+An associated GitHub link collecting the latest papers is available at
+https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models.
+
+
+ comment: Project + page:https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models +
+
+
+
+
+ + ♻ ☆ DST-Det: Simple Dynamic Self-Training for Open-Vocabulary Object + Detection + + +
+ Open-vocabulary object detection (OVOD) aims to detect objects beyond the set
+of classes observed during training. This work introduces a straightforward
+and efficient strategy that utilizes pre-trained vision-language models
+(VLMs), like CLIP, to identify potential novel classes through zero-shot
+classification. Previous methods use a class-agnostic region proposal network
+to detect object proposals and consider the proposals that do not match the
+ground truth as background. Unlike these methods, our method selects a subset
+of proposals that would be considered background during training and instead
+treats them as novel classes. We refer to this approach as the self-training
+strategy, which enhances recall and accuracy for novel classes without
+requiring extra annotations, datasets, or re-training. Compared to previous
+pseudo-labeling methods, our approach does not require re-training or offline
+labeling, making it more efficient and effective in one-shot training.
+Empirical evaluations on three datasets, including LVIS, V3Det, and COCO,
+demonstrate significant improvements over the baseline performance without
+incurring additional parameters or computational costs during inference. In
+addition, we also apply our method to various baselines. In particular,
+compared with the previous method F-VLM, our method achieves a 1.7%
+improvement on the LVIS dataset. Combined with the recent method CLIPSelf, our
+method also achieves 46.7 novel class AP on COCO without introducing extra
+data for pretraining. We also achieve over 6.5% improvement over the F-VLM
+baseline on the recent challenging V3Det dataset. We release our code and
+models at https://github.com/xushilin1/dst-det.
+
+
+
+
+
+ + ♻ ☆ Efficient Benchmarking of Language Models NAACL + + +
+ The increasing versatility of language models (LMs) has given rise to a new
+class of benchmarks that comprehensively assess a broad range of capabilities.
+Such benchmarks are associated with massive computational costs, extending to
+thousands of GPU hours per model. However, the efficiency aspect of these
+evaluation efforts has received little discussion in the literature. In this
+work, we present the problem of Efficient Benchmarking, namely, intelligently
+reducing the computation costs of LM evaluation without compromising
+reliability. Using the HELM benchmark as a test case, we investigate how
+different benchmark design choices affect the computation-reliability
+trade-off. We propose to evaluate the reliability of such decisions by using a
+new measure -- Decision Impact on Reliability, DIoR for short. We find, for
+example, that a benchmark leader may change by merely removing a low-ranked
+model from the benchmark, and observe that a correct benchmark ranking can be
+obtained by considering only a fraction of the evaluation examples. Based on
+our findings, we outline a set of concrete recommendations for efficient
+benchmark design and utilization practices. To take a step further, we use our
+findings to propose an evaluation algorithm that, when applied to the HELM
+benchmark, leads to dramatic cost savings with minimal loss of benchmark
+reliability, often reducing computation by 100x or more.
+
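+ The kind of reliability check described here can be approximated in a few
+lines (a hypothetical sketch with synthetic scores): compare the model ranking
+on the full evaluation set against the ranking induced by a small random
+subset of examples.
+
+    import numpy as np
+
+    rng = np.random.default_rng(0)
+    scores = rng.random((20, 1000))    # 20 models x 1000 evaluation examples
+
+    full_rank = np.argsort(-scores.mean(axis=1))
+    subset = rng.choice(scores.shape[1], size=100, replace=False)
+    sub_rank = np.argsort(-scores[:, subset].mean(axis=1))
+
+    agreement = (full_rank == sub_rank).mean()
+    print("leader unchanged:", full_rank[0] == sub_rank[0],
+          "| rank agreement: %.2f" % agreement)
+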
+
+ comment: Accepted to NAACL main track +
+
+
+
+
+ + ♻ ☆ Text-image Alignment for Diffusion-based Perception + + +
+ Diffusion models are generative models with impressive text-to-image +synthesis capabilities and have spurred a new wave of creative methods for +classical machine learning tasks. However, the best way to harness the +perceptual knowledge of these generative models for visual tasks is still an +open question. Specifically, it is unclear how to use the prompting interface +when applying diffusion backbones to vision tasks. We find that automatically +generated captions can improve text-image alignment and significantly enhance a +model's cross-attention maps, leading to better perceptual performance. Our +approach improves upon the current state-of-the-art (SOTA) in diffusion-based +semantic segmentation on ADE20K and the current overall SOTA for depth +estimation on NYUv2. Furthermore, our method generalizes to the cross-domain +setting. We use model personalization and caption modifications to align our +model to the target domain and find improvements over unaligned baselines. Our +cross-domain object detection model, trained on Pascal VOC, achieves SOTA +results on Watercolor2K. Our cross-domain segmentation method, trained on +Cityscapes, achieves SOTA results on Dark Zurich-val and Nighttime Driving. +Project page: https://www.vision.caltech.edu/tadp/. Code: +https://github.com/damaggu/TADP. + +
+
+ comment: Project page: https://www.vision.caltech.edu/tadp/, Code page: + github.com/damaggu/TADP +
+
+
+
+
+ + ♻ ☆ Shape-Guided Diffusion with Inside-Outside Attention WACV 2024 + + +
+ We introduce precise object silhouette as a new form of user control in +text-to-image diffusion models, which we dub Shape-Guided Diffusion. Our +training-free method uses an Inside-Outside Attention mechanism during the +inversion and generation process to apply a shape constraint to the cross- and +self-attention maps. Our mechanism designates which spatial region is the +object (inside) vs. background (outside) then associates edits to the correct +region. We demonstrate the efficacy of our method on the shape-guided editing +task, where the model must replace an object according to a text prompt and +object mask. We curate a new ShapePrompts benchmark derived from MS-COCO and +achieve SOTA results in shape faithfulness without a degradation in text +alignment or image realism according to both automatic metrics and annotator +ratings. Our data and code will be made available at +https://shape-guided-diffusion.github.io. + +
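+ The inside/outside mechanism can be pictured with a tiny hypothetical Python
+sketch (not the authors' implementation): given an object mask flattened to
+the token grid, keys outside the chosen region are masked out before the
+softmax, so edits attend only to the intended region.
+
+    import torch
+
+    def inside_outside_attention(q, k, v, obj_mask, inside=True):
+        """q, k, v: (T, D) tokens on a flattened spatial grid; obj_mask: (T,)
+        bools marking object tokens. Keys outside the chosen region are masked
+        out before the softmax (the region must contain at least one token)."""
+        scores = q @ k.T / k.shape[-1] ** 0.5              # (T, T)
+        region = obj_mask if inside else ~obj_mask
+        scores = scores.masked_fill(~region.unsqueeze(0), float("-inf"))
+        return scores.softmax(dim=-1) @ v
+
+    tokens = torch.randn(64, 32)               # 8x8 grid of 32-dim tokens
+    mask = torch.zeros(64, dtype=torch.bool)
+    mask[:16] = True                           # first 16 tokens = object
+    print(inside_outside_attention(tokens, tokens, tokens, mask).shape)  # (64, 32)
+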
+
+ comment: WACV 2024 +
+
+
+
+
+ + ♻ ☆ HAL3D: Hierarchical Active Learning for Fine-Grained 3D Part Labeling ICCV 2023 + + +
+ We present the first active learning tool for fine-grained 3D part labeling, +a problem which challenges even the most advanced deep learning (DL) methods +due to the significant structural variations among the small and intricate +parts. For the same reason, the necessary data annotation effort is tremendous, +motivating approaches to minimize human involvement. Our labeling tool +iteratively verifies or modifies part labels predicted by a deep neural +network, with human feedback continually improving the network prediction. To +effectively reduce human efforts, we develop two novel features in our tool, +hierarchical and symmetry-aware active labeling. Our human-in-the-loop +approach, coined HAL3D, achieves 100% accuracy (barring human errors) on any +test set with pre-defined hierarchical part labels, with 80% time-saving over +manual effort. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ SymTC: A Symbiotic Transformer-CNN Net for Instance Segmentation of + Lumbar Spine MRI + + +
+ Intervertebral disc disease, a prevalent ailment, frequently leads to
+intermittent or persistent low back pain, and diagnosing and assessing this
+disease rely on accurate measurement of vertebral bone and intervertebral disc
+geometries from lumbar MR images. Deep neural network (DNN) models may assist
+clinicians with more efficient image segmentation of individual instances
+(disks and vertebrae) of the lumbar spine in an automated way, which is termed
+instance image segmentation. In this work, we proposed SymTC, an innovative
+lumbar spine MR image segmentation model that combines the strengths of
+Transformer and Convolutional Neural Network (CNN). Specifically, we designed a
+parallel dual-path architecture to merge CNN layers and Transformer layers, and
+we integrated a novel position embedding into the self-attention module of the
+Transformer, enhancing the utilization of positional information for more
+accurate segmentation. To further improve model performance, we introduced a
+new data augmentation technique to create a synthetic yet realistic MR image
+dataset, named SSMSpine, which is made publicly available. We evaluated our
+SymTC and 15 other existing image segmentation models on our private in-house
+dataset and the public SSMSpine dataset, using two metrics, Dice Similarity
+Coefficient and 95% Hausdorff Distance. The results show that our SymTC has the
+best performance for segmenting vertebral bones and intervertebral discs in
+lumbar spine MR images. The SymTC code and SSMSpine dataset are available at
+https://github.com/jiasongchen/SymTC.
+
+
+
+
+
+ + ♻ ☆ Multimodal Representation Learning by Alternating Unimodal Adaptation CVPR 2024 + + +
+ Multimodal learning, which integrates data from diverse sensory modes, plays +a pivotal role in artificial intelligence. However, existing multimodal +learning methods often struggle with challenges where some modalities appear +more dominant than others during multimodal learning, resulting in suboptimal +performance. To address this challenge, we propose MLA (Multimodal Learning +with Alternating Unimodal Adaptation). MLA reframes the conventional joint +multimodal learning process by transforming it into an alternating unimodal +learning process, thereby minimizing interference between modalities. +Simultaneously, it captures cross-modal interactions through a shared head, +which undergoes continuous optimization across different modalities. This +optimization process is controlled by a gradient modification mechanism to +prevent the shared head from losing previously acquired information. During the +inference phase, MLA utilizes a test-time uncertainty-based model fusion +mechanism to integrate multimodal information. Extensive experiments are +conducted on five diverse datasets, encompassing scenarios with complete +modalities and scenarios with missing modalities. These experiments demonstrate +the superiority of MLA over competing prior approaches. Our code is available +at +https://github.com/Cecile-hi/Multimodal-Learning-with-Alternating-Unimodal-Adaptation. + +
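+ A schematic training loop for the alternating idea (a hypothetical sketch,
+not the released MLA code): each step updates only one modality's encoder,
+while the shared head is optimized on every step; the paper's gradient
+modification for the shared head is omitted here.
+
+    import torch
+    import torch.nn as nn
+
+    encoders = nn.ModuleDict({"image": nn.Linear(512, 128),
+                              "audio": nn.Linear(128, 128)})
+    shared_head = nn.Linear(128, 10)
+    params = list(encoders.parameters()) + list(shared_head.parameters())
+    optim = torch.optim.Adam(params, lr=1e-4)
+    loss_fn = nn.CrossEntropyLoss()
+
+    def dummy_batch(modality):
+        dim = 512 if modality == "image" else 128
+        return torch.randn(8, dim), torch.randint(0, 10, (8,))
+
+    for step in range(4):
+        # Alternate: only one modality's encoder receives gradients this step,
+        # while the shared head is updated every step.
+        modality = ["image", "audio"][step % 2]
+        x, y = dummy_batch(modality)
+        loss = loss_fn(shared_head(encoders[modality](x)), y)
+        optim.zero_grad()
+        loss.backward()
+        # (The paper additionally modifies the shared-head gradient to avoid
+        # forgetting the other modality; omitted for brevity.)
+        optim.step()
+        print(step, modality, round(float(loss), 3))
+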
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Open3DSG: Open-Vocabulary 3D Scene Graphs from Point Clouds with + Queryable Objects and Open-Set Relationships CVPR 2024 + + +
+ Current approaches for 3D scene graph prediction rely on labeled datasets to +train models for a fixed set of known object classes and relationship +categories. We present Open3DSG, an alternative approach to learn 3D scene +graph prediction in an open world without requiring labeled scene graph data. +We co-embed the features from a 3D scene graph prediction backbone with the +feature space of powerful open world 2D vision language foundation models. This +enables us to predict 3D scene graphs from 3D point clouds in a zero-shot +manner by querying object classes from an open vocabulary and predicting the +inter-object relationships from a grounded LLM with scene graph features and +queried object classes as context. Open3DSG is the first 3D point cloud method +to predict not only explicit open-vocabulary object classes, but also open-set +relationships that are not limited to a predefined label set, making it +possible to express rare as well as specific objects and relationships in the +predicted 3D scene graph. Our experiments show that Open3DSG is effective at +predicting arbitrary object classes as well as their complex inter-object +relationships describing spatial, supportive, semantic and comparative +relationships. + +
+
+ comment: CVPR 2024. Project page: https://kochsebastian.com/open3dsg +
+
+
+
+
+ + ♻ ☆ Introducing an ensemble method for the early detection of Alzheimer's + disease through the analysis of PET scan images + + +
+ Alzheimer's disease is a progressive neurodegenerative disorder that
+primarily affects cognitive functions such as memory, thinking, and behavior.
+A critical phase of this disease, mild cognitive impairment (MCI), is
+important to diagnose early, since some patients with progressive MCI will go
+on to develop Alzheimer's disease. This study delves into the challenging task
+of classifying Alzheimer's disease into four distinct groups: control normal
+(CN), progressive mild cognitive impairment (pMCI), stable mild cognitive
+impairment (sMCI), and Alzheimer's disease (AD). This classification is based
+on a detailed examination of PET scan images obtained from the ADNI dataset,
+which provides a thorough understanding of the disease's progression. Several
+deep-learning and traditional machine-learning models have been used to detect
+Alzheimer's disease. In this paper, three deep-learning models, namely VGG16,
+AlexNet, and a custom convolutional neural network (CNN), have been used for
+classification with 8-fold cross-validation. Finally, an ensemble technique is
+used to improve the overall result of these models. The results show that
+using deep-learning models to distinguish between MCI patients yields an
+overall average accuracy of 93.13% and an AUC of 94.4%.
+
+
+
+
+
+ + ♻ ☆ Supplementing Missing Visions via Dialog for Scene Graph Generations ICASSP 2024 + + +
+ Most current AI systems rely on the premise that the input visual data are +sufficient to achieve competitive performance in various computer vision tasks. +However, the classic task setup rarely considers the challenging, yet common +practical situations where the complete visual data may be inaccessible due to +various reasons (e.g., restricted view range and occlusions). To this end, we +investigate a computer vision task setting with incomplete visual input data. +Specifically, we exploit the Scene Graph Generation (SGG) task with various +levels of visual data missingness as input. While insufficient visual input +intuitively leads to performance drop, we propose to supplement the missing +visions via the natural language dialog interactions to better accomplish the +task objective. We design a model-agnostic Supplementary Interactive Dialog +(SI-Dial) framework that can be jointly learned with most existing models, +endowing the current AI systems with the ability of question-answer +interactions in natural language. We demonstrate the feasibility of such a task +setting with missing visual input and the effectiveness of our proposed dialog +module as the supplementary information source through extensive experiments +and analysis, by achieving promising performance improvement over multiple +baselines. + +
+
+ comment: ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Segment Anything in Medical Images + + +
+ Medical image segmentation is a critical component in clinical practice, +facilitating accurate diagnosis, treatment planning, and disease monitoring. +However, existing methods, often tailored to specific modalities or disease +types, lack generalizability across the diverse spectrum of medical image +segmentation tasks. Here we present MedSAM, a foundation model designed for +bridging this gap by enabling universal medical image segmentation. The model +is developed on a large-scale medical image dataset with 1,570,263 image-mask +pairs, covering 10 imaging modalities and over 30 cancer types. We conduct a +comprehensive evaluation on 86 internal validation tasks and 60 external +validation tasks, demonstrating better accuracy and robustness than +modality-wise specialist models. By delivering accurate and efficient +segmentation across a wide spectrum of tasks, MedSAM holds significant +potential to expedite the evolution of diagnostic tools and the personalization +of treatment plans. + +
+
+
+
+
+ + ♻ ☆ The Multi-modality Cell Segmentation Challenge: Towards Universal + Solutions NeurIPS22 + + +
+ Cell segmentation is a critical step for quantitative single-cell analysis in +microscopy images. Existing cell segmentation methods are often tailored to +specific modalities or require manual interventions to specify hyper-parameters +in different experimental settings. Here, we present a multi-modality cell +segmentation benchmark, comprising over 1500 labeled images derived from more +than 50 diverse biological experiments. The top participants developed a +Transformer-based deep-learning algorithm that not only exceeds existing +methods but can also be applied to diverse microscopy images across imaging +platforms and tissue types without manual parameter adjustments. This benchmark +and the improved algorithm offer promising avenues for more accurate and +versatile cell analysis in microscopy imaging. + +
+
+ comment: NeurIPS22 Cell Segmentation Challenge: + https://neurips22-cellseg.grand-challenge.org/ . Nature Methods (2024) +
+
+
+
+
+ + ♻ ☆ Copilot4D: Learning Unsupervised World Models for Autonomous Driving via + Discrete Diffusion ICLR 2024 + + +
+ Learning world models can teach an agent how the world works in an +unsupervised manner. Even though it can be viewed as a special case of sequence +modeling, progress for scaling world models on robotic applications such as +autonomous driving has been somewhat less rapid than scaling language models +with Generative Pre-trained Transformers (GPT). We identify two reasons as +major bottlenecks: dealing with complex and unstructured observation space, and +having a scalable generative model. Consequently, we propose Copilot4D, a novel +world modeling approach that first tokenizes sensor observations with VQVAE, +then predicts the future via discrete diffusion. To efficiently decode and +denoise tokens in parallel, we recast Masked Generative Image Transformer as +discrete diffusion and enhance it with a few simple changes, resulting in +notable improvement. When applied to learning world models on point cloud +observations, Copilot4D reduces prior SOTA Chamfer distance by more than 65% +for 1s prediction, and more than 50% for 3s prediction, across NuScenes, KITTI +Odometry, and Argoverse2 datasets. Our results demonstrate that discrete +diffusion on tokenized agent experience can unlock the power of GPT-like +unsupervised learning for robotics. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Sat2Scene: 3D Urban Scene Generation from Satellite Images with + Diffusion + + +
+ Directly generating scenes from satellite imagery offers exciting
+possibilities for integration into applications like games and map services.
+However, challenges arise from significant view changes and scene scale.
+Previous efforts mainly focused on image or video generation, lacking
+exploration into the adaptability of scene generation for arbitrary views.
+Existing 3D generation works either operate at the object level or struggle to
+utilize the geometry obtained from satellite imagery. To overcome these
+limitations, we propose a novel architecture for direct 3D scene generation by
+introducing diffusion models into 3D sparse representations and combining them
+with neural rendering techniques. Specifically, our approach first generates
+texture colors at the point level for a given geometry using a 3D diffusion
+model, which is then transformed into a scene representation in a feed-forward
+manner. The representation can be utilized to render arbitrary views that
+excel in both single-frame quality and inter-frame consistency. Experiments on
+two city-scale datasets show that our model demonstrates proficiency in
+generating photo-realistic street-view image sequences and cross-view urban
+scenes from satellite imagery.
+
+
+
+
+
+ + ♻ ☆ Task-Oriented Communication for Edge Video Analytics + + +
+ With the development of artificial intelligence (AI) techniques and the +increasing popularity of camera-equipped devices, many edge video analytics +applications are emerging, calling for the deployment of computation-intensive +AI models at the network edge. Edge inference is a promising solution to move +the computation-intensive workloads from low-end devices to a powerful edge +server for video analytics, but the device-server communications will remain a +bottleneck due to the limited bandwidth. This paper proposes a task-oriented +communication framework for edge video analytics, where multiple devices +collect the visual sensory data and transmit the informative features to an +edge server for processing. To enable low-latency inference, this framework +removes video redundancy in spatial and temporal domains and transmits minimal +information that is essential for the downstream task, rather than +reconstructing the videos at the edge server. Specifically, it extracts compact +task-relevant features based on the deterministic information bottleneck (IB) +principle, which characterizes a tradeoff between the informativeness of the +features and the communication cost. As the features of consecutive frames are +temporally correlated, we propose a temporal entropy model (TEM) to reduce the +bitrate by taking the previous features as side information in feature +encoding. To further improve the inference performance, we build a +spatial-temporal fusion module at the server to integrate features of the +current and previous frames for joint inference. Extensive experiments on video +analytics tasks evidence that the proposed framework effectively encodes +task-relevant information of video data and achieves a better rate-performance +tradeoff than existing methods. + +
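+ The information-bottleneck flavor of such an objective can be written as a
+single rate-distortion style loss; the snippet below is a generic Python
+sketch (the feature likelihoods are assumed to come from some entropy model),
+not the paper's exact formulation.
+
+    import torch
+    import torch.nn.functional as F
+
+    def task_oriented_loss(logits, labels, feat_likelihoods, beta=0.01):
+        """Trade off downstream accuracy against communication cost.
+        feat_likelihoods: per-element probabilities in (0, 1] produced by an
+        entropy model of the transmitted features."""
+        distortion = F.cross_entropy(logits, labels)
+        dims = tuple(range(1, feat_likelihoods.dim()))
+        rate_bits = (-torch.log2(feat_likelihoods)).sum(dim=dims).mean()
+        return distortion + beta * rate_bits
+
+    logits = torch.randn(4, 10)
+    labels = torch.randint(0, 10, (4,))
+    likelihoods = torch.rand(4, 64).clamp(min=1e-6)
+    print(float(task_oriented_loss(logits, labels, likelihoods)))
+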
+
+ comment: This paper was accepted to IEEE Transactions on Wireless + Communications (TWC) +
+
+
+
+
+ + ♻ ☆ Grounded Question-Answering in Long Egocentric Videos CVPR 2024 + + +
+ Existing approaches to video understanding, mainly designed for short videos +from a third-person perspective, are limited in their applicability in certain +fields, such as robotics. In this paper, we delve into open-ended +question-answering (QA) in long, egocentric videos, which allows individuals or +robots to inquire about their own past visual experiences. This task presents +unique challenges, including the complexity of temporally grounding queries +within extensive video content, the high resource demands for precise data +annotation, and the inherent difficulty of evaluating open-ended answers due to +their ambiguous nature. Our proposed approach tackles these challenges by (i) +integrating query grounding and answering within a unified model to reduce +error propagation; (ii) employing large language models for efficient and +scalable data synthesis; and (iii) introducing a close-ended QA task for +evaluation, to manage answer ambiguity. Extensive experiments demonstrate the +effectiveness of our method, which also achieves state-of-the-art performance +on the QaEgo4D and Ego4D-NLQ benchmarks. Code, data, and models are available +at https://github.com/Becomebright/GroundVQA. + +
+
+ comment: Accepted to CVPR 2024. Project website at https://dszdsz.cn/GroundVQA +
+
+
+
+
+ + ♻ ☆ CricaVPR: Cross-image Correlation-aware Representation Learning for + Visual Place Recognition CVPR2024 + + +
+ Over the past decade, most methods in visual place recognition (VPR) have +used neural networks to produce feature representations. These networks +typically produce a global representation of a place image using only this +image itself and neglect the cross-image variations (e.g. viewpoint and +illumination), which limits their robustness in challenging scenes. In this +paper, we propose a robust global representation method with cross-image +correlation awareness for VPR, named CricaVPR. Our method uses the attention +mechanism to correlate multiple images within a batch. These images can be +taken in the same place with different conditions or viewpoints, or even +captured from different places. Therefore, our method can utilize the +cross-image variations as a cue to guide the representation learning, which +ensures more robust features are produced. To further facilitate the +robustness, we propose a multi-scale convolution-enhanced adaptation method to +adapt pre-trained visual foundation models to the VPR task, which introduces +the multi-scale local information to further enhance the cross-image +correlation-aware representation. Experimental results show that our method +outperforms state-of-the-art methods by a large margin with significantly less +training time. The code is released at https://github.com/Lu-Feng/CricaVPR. + +
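+ The cross-image correlation step amounts to letting the descriptors within a
+batch attend to each other; the following is a toy Python sketch (assuming
+pre-extracted per-image descriptors), not the released CricaVPR code.
+
+    import torch
+    import torch.nn as nn
+
+    class CrossImageAttention(nn.Module):
+        """Let the descriptors of all images in a batch attend to each other so
+        that cues shared across views of a place are emphasized (illustrative)."""
+
+        def __init__(self, dim=256, heads=4):
+            super().__init__()
+            self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+            self.norm = nn.LayerNorm(dim)
+
+        def forward(self, desc):             # desc: (B, D), one per image
+            x = desc.unsqueeze(0)            # treat the batch as a sequence
+            y, _ = self.attn(x, x, x)
+            return self.norm(desc + y.squeeze(0))
+
+    descs = torch.randn(16, 256)             # a batch of 16 image descriptors
+    print(CrossImageAttention()(descs).shape)   # torch.Size([16, 256])
+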
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ MobileCLIP: Fast Image-Text Models through Multi-Modal Reinforced + Training CVPR 2024 + + +
+ Contrastive pretraining of image-text foundation models, such as CLIP, +demonstrated excellent zero-shot performance and improved robustness on a wide +range of downstream tasks. However, these models utilize large +transformer-based encoders with significant memory and latency overhead which +pose challenges for deployment on mobile devices. In this work, we introduce +MobileCLIP -- a new family of efficient image-text models optimized for runtime +performance along with a novel and efficient training approach, namely +multi-modal reinforced training. The proposed training approach leverages +knowledge transfer from an image captioning model and an ensemble of strong +CLIP encoders to improve the accuracy of efficient models. Our approach avoids +train-time compute overhead by storing the additional knowledge in a reinforced +dataset. MobileCLIP sets a new state-of-the-art latency-accuracy tradeoff for +zero-shot classification and retrieval tasks on several datasets. Our +MobileCLIP-S2 variant is 2.3$\times$ faster while more accurate compared to +previous best CLIP model based on ViT-B/16. We further demonstrate the +effectiveness of our multi-modal reinforced training by training a CLIP model +based on ViT-B/16 image backbone and achieving +2.9% average performance +improvement on 38 evaluation benchmarks compared to the previous best. +Moreover, we show that the proposed approach achieves 10$\times$-1000$\times$ +improved learning efficiency when compared with non-reinforced CLIP training. +Code and models are available at https://github.com/apple/ml-mobileclip . + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ TextFormer: A Query-based End-to-End Text Spotter with Mixed Supervision + + +
+ End-to-end text spotting is a vital computer vision task that aims to +integrate scene text detection and recognition into a unified framework. +Typical methods heavily rely on Region-of-Interest (RoI) operations to extract +local features and complex post-processing steps to produce final predictions. +To address these limitations, we propose TextFormer, a query-based end-to-end +text spotter with Transformer architecture. Specifically, using query embedding +per text instance, TextFormer builds upon an image encoder and a text decoder +to learn a joint semantic understanding for multi-task modeling. It allows for +mutual training and optimization of classification, segmentation, and +recognition branches, resulting in deeper feature sharing without sacrificing +flexibility or simplicity. Additionally, we design an Adaptive Global +aGgregation (AGG) module to transfer global features into sequential features +for reading arbitrarily-shaped texts, which overcomes the sub-optimization +problem of RoI operations. Furthermore, potential corpus information is +utilized from weak annotations to full labels through mixed supervision, +further improving text detection and end-to-end text spotting results. +Extensive experiments on various bilingual (i.e., English and Chinese) +benchmarks demonstrate the superiority of our method. Especially on TDA-ReCTS +dataset, TextFormer surpasses the state-of-the-art method in terms of 1-NED by +13.2%. + +
+
+ comment: Machine Intelligence Research, MIR 2024 +
+
+
+
+
+ + ♻ ☆ NViST: In the Wild New View Synthesis from a Single Image with + Transformers CVPR 2024 + + +
+ We propose NViST, a transformer-based model for efficient and generalizable +novel-view synthesis from a single image for real-world scenes. In contrast to +many methods that are trained on synthetic data, object-centred scenarios, or +in a category-specific manner, NViST is trained on MVImgNet, a large-scale +dataset of casually-captured real-world videos of hundreds of object categories +with diverse backgrounds. NViST transforms image inputs directly into a +radiance field, conditioned on camera parameters via adaptive layer +normalisation. In practice, NViST exploits fine-tuned masked autoencoder (MAE) +features and translates them to 3D output tokens via cross-attention, while +addressing occlusions with self-attention. To move away from object-centred +datasets and enable full scene synthesis, NViST adopts a 6-DOF camera pose +model and only requires relative pose, dropping the need for canonicalization +of the training data, which removes a substantial barrier to it being used on +casually captured datasets. We show results on unseen objects and categories +from MVImgNet and even generalization to casual phone captures. We conduct +qualitative and quantitative evaluations on MVImgNet and ShapeNet to show that +our model represents a step forward towards enabling true in-the-wild +generalizable novel-view synthesis from a single image. Project webpage: +https://wbjang.github.io/nvist_webpage. + +
+
+ comment: CVPR 2024, Project page: https://wbjang.github.io/nvist_webpage +
+
+
+
+
+ + ♻ ☆ Contrastive Denoising Score for Text-guided Latent Diffusion Image + Editing CVPR 2024 + + +
+ With the remarkable advent of text-to-image diffusion models, image editing
+methods have become more diverse and continue to evolve. A promising recent
+approach in this realm is Delta Denoising Score (DDS) - an image editing
+technique based on the Score Distillation Sampling (SDS) framework that
+leverages the rich generative prior of text-to-image diffusion models. However,
+relying solely on the difference between scoring functions is insufficient for
+preserving specific structural elements from the original image, a crucial
+aspect of image editing. To address this, here we present an embarrassingly
+simple yet very powerful modification of DDS, called Contrastive Denoising
+Score (CDS), for latent diffusion models (LDM). Inspired by the similarities
+and differences between DDS and contrastive learning for unpaired
+image-to-image translation (CUT), we introduce a straightforward approach using
+the CUT loss within the DDS framework. Rather than employing auxiliary networks
+as in the original CUT approach, we leverage the intermediate features of the
+LDM, specifically those from the self-attention layers, which possess rich
+spatial information. Our approach enables zero-shot image-to-image translation
+and neural radiance field (NeRF) editing, achieving structural correspondence
+between the input and output while maintaining content controllability.
+Qualitative results and comparisons demonstrate the effectiveness of our
+proposed method. Project page: https://hyelinnam.github.io/CDS/
+
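+ The CUT-style term can be sketched as a patch-wise InfoNCE loss over
+spatially aligned self-attention features of the source and edited branches;
+this is a hypothetical simplification of the paper's loss, with names chosen
+for illustration.
+
+    import torch
+    import torch.nn.functional as F
+
+    def patch_nce_loss(feat_src, feat_tgt, num_patches=64, tau=0.07):
+        """feat_src, feat_tgt: (N, D) spatially aligned features (e.g. from the
+        self-attention layers) of the source and edited branches. Matching
+        locations are positives; other sampled locations act as negatives."""
+        idx = torch.randperm(feat_src.shape[0])[:num_patches]
+        q = F.normalize(feat_tgt[idx], dim=-1)
+        k = F.normalize(feat_src[idx], dim=-1)
+        logits = q @ k.T / tau                # (P, P); diagonal = positives
+        labels = torch.arange(idx.numel())
+        return F.cross_entropy(logits, labels)
+
+    f_src = torch.randn(1024, 320)
+    f_tgt = f_src + 0.1 * torch.randn(1024, 320)   # lightly perturbed "edit"
+    print(float(patch_nce_loss(f_src, f_tgt)))
+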
+
+ comment: CVPR 2024 (poster); Project page: https://hyelinnam.github.io/CDS/ +
+
+
+
+
+ + ♻ ☆ QUAR-VLA: Vision-Language-Action Model for Quadruped Robots + + +
+ An important manifestation of robot intelligence is the ability to interact
+naturally and make decisions autonomously. Traditional approaches to robot
+control often compartmentalize perception, planning, and decision-making,
+simplifying system design but limiting the synergy between different
+information streams. This compartmentalization poses challenges in achieving
+seamless autonomous reasoning, decision-making, and action execution. To
+address these limitations, a novel paradigm, named Vision-Language-Action tasks
+for QUAdruped Robots (QUAR-VLA), has been introduced in this paper. This
+approach tightly integrates visual information and instructions to generate
+executable actions, effectively merging perception, planning, and
+decision-making. The central idea is to elevate the overall intelligence of the
+robot. Within this framework, a notable challenge lies in aligning fine-grained
+instructions with visual perception information. This emphasizes the complexity
+involved in ensuring that the robot accurately interprets and acts upon
+detailed instructions in harmony with its visual observations. Consequently, we
+propose the QUAdruped Robotic Transformer (QUART), a family of VLA models that
+integrate visual information and instructions from diverse modalities as input
+and generate executable actions for real-world robots, and we present the
+QUAdruped Robot Dataset (QUARD), a large-scale multi-task dataset including
+navigation, complex terrain locomotion, and whole-body manipulation tasks for
+training QUART models. Our extensive evaluation (4000 evaluation trials) shows
+that our approach leads to performant robotic policies and enables QUART to
+obtain a range of emergent capabilities.
+
+
+
+
+
+ + ♻ ☆ OpenStereo: A Comprehensive Benchmark for Stereo Matching and Strong + Baseline + + +
+ Stereo matching aims to estimate the disparity between matching pixels in a stereo image pair, which is of great importance to robotics, autonomous driving, and other computer vision tasks. Despite the development of numerous impressive methods in recent years, replicating their results and determining the most suitable architecture for practical application remains challenging. Addressing this gap, our paper introduces a comprehensive benchmark focusing on practical applicability rather than solely on performance enhancement. Specifically, we develop a flexible and efficient stereo matching codebase, called OpenStereo. OpenStereo includes training and inference codes of more than 10 network models, making it, to our knowledge, the most complete stereo matching toolbox available. Based on OpenStereo, we conducted experiments and achieved or surpassed the performance metrics reported in the original papers. Additionally, we carry out an exhaustive analysis and deconstruction of recent developments in stereo matching through comprehensive ablative experiments. These investigations inspired the creation of StereoBase, a strong baseline model. Our StereoBase ranks 1st on SceneFlow, KITTI 2015, and KITTI 2012 (Reflective) among published methods and achieves the best performance across all metrics. In addition, StereoBase has strong cross-dataset generalization. Code is available at https://github.com/XiandaGuo/OpenStereo.
+
+ comment: Code is available at: https://github.com/XiandaGuo/OpenStereo +
+
+
+
+
+ + ♻ ☆ Multiscale and Multilayer Contrastive Learning for Domain Generalization + + +
+ During the past decade, deep neural networks have led to fast-paced progress and significant achievements in computer vision problems, for both academia and industry. Yet despite their success, state-of-the-art image classification approaches fail to generalize well in previously unseen visual contexts, as required by many real-world applications. In this paper, we focus on this domain generalization (DG) problem and argue that the generalization ability of deep convolutional neural networks can be improved by taking advantage of multi-layer and multi-scale representations of the network. We introduce a framework that aims at improving domain generalization of image classifiers by combining both low-level and high-level features at multiple scales, enabling the network to implicitly disentangle representations in its latent space and learn domain-invariant attributes of the depicted objects. Additionally, to further facilitate robust representation learning, we propose a novel objective function, inspired by contrastive learning, which aims at constraining the extracted representations to remain invariant under distribution shifts. We demonstrate the effectiveness of our method by evaluating on the domain generalization datasets of PACS, VLCS, Office-Home and NICO. Through extensive experimentation, we show that our model is able to surpass the performance of previous DG methods and consistently produce competitive and state-of-the-art results on all datasets.
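+ As a rough illustration of such an invariance objective (my own simplification, not the authors' exact formulation), an InfoNCE-style loss can pull together pooled features of two differently-augmented views of the same image at several layers or scales, with other images in the batch acting as negatives; the per-layer weighting below is an assumption.

```python
import torch
import torch.nn.functional as F

def info_nce(z1, z2, tau=0.1):
    z1, z2 = F.normalize(z1, dim=1), F.normalize(z2, dim=1)
    logits = z1 @ z2.t() / tau                       # (B, B); diagonal entries are positives
    labels = torch.arange(z1.size(0), device=z1.device)
    return F.cross_entropy(logits, labels)

def multilayer_invariance_loss(view1_feats, view2_feats, weights=None):
    """view*_feats: lists of (B, C_l) pooled features taken from several layers/scales."""
    weights = weights or [1.0] * len(view1_feats)
    return sum(w * info_nce(a, b) for w, a, b in zip(weights, view1_feats, view2_feats))

# toy usage: three layers, batch of 8
v1 = [torch.randn(8, c) for c in (64, 128, 256)]
v2 = [torch.randn(8, c) for c in (64, 128, 256)]
loss = multilayer_invariance_loss(v1, v2)
```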
+
+ comment: Manuscript accepted in: IEEE Transactions on Artificial Intelligence + (March 2024) +
+
+
+
+
+ + ♻ ☆ Solving Diffusion ODEs with Optimal Boundary Conditions for Better Image + Super-Resolution ICLR 2024 + + +
+ Diffusion models, as a kind of powerful generative model, have given impressive results on image super-resolution (SR) tasks. However, due to the randomness introduced in the reverse process of diffusion models, the performance of diffusion-based SR models fluctuates from one sampling run to the next, especially for samplers with few resampled steps. This inherent randomness of diffusion models results in ineffectiveness and instability, making it challenging for users to guarantee the quality of SR results. However, our work takes this randomness as an opportunity: fully analyzing and leveraging it leads to the construction of an effective plug-and-play sampling method that has the potential to benefit a series of diffusion-based SR methods. In more detail, we propose to steadily sample high-quality SR images from pre-trained diffusion-based SR models by solving diffusion ordinary differential equations (diffusion ODEs) with optimal boundary conditions (BCs), and we analyze the relationship between the choices of BCs and their corresponding SR results. Our analysis shows the route to obtaining an approximately optimal BC via efficient exploration of the whole space. The quality of SR results sampled by the proposed method with fewer steps outperforms the quality of results sampled by current methods with randomness from the same pre-trained diffusion-based SR model, which means that our sampling method "boosts" current diffusion-based SR models without any additional training.
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Bilateral Propagation Network for Depth Completion CVPR 2024 + + +
+ Depth completion aims to derive a dense depth map from sparse depth measurements with a synchronized color image. Current state-of-the-art (SOTA) methods are predominantly propagation-based, which work as an iterative refinement on the initial estimated dense depth. However, the initial depth estimations mostly result from direct applications of convolutional layers on the sparse depth map. In this paper, we present a Bilateral Propagation Network (BP-Net) that propagates depth at the earliest stage to avoid directly convolving on sparse data. Specifically, our approach propagates the target depth from nearby depth measurements via a non-linear model, whose coefficients are generated through a multi-layer perceptron conditioned on both radiometric difference and spatial distance. By integrating bilateral propagation with multi-modal fusion and depth refinement in a multi-scale framework, our BP-Net demonstrates outstanding performance on both indoor and outdoor scenes. It achieves SOTA on the NYUv2 dataset and ranks 1st on the KITTI depth completion benchmark at the time of submission. Experimental results not only show the effectiveness of bilateral propagation but also emphasize the significance of early-stage propagation in contrast to the refinement stage. Our code and trained models will be available on the project page.
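+ A toy version of the propagation step might look as follows (an illustrative sketch, not the released BP-Net code): for each target pixel, the depths of its K nearest sparse measurements are combined with weights produced by a small MLP fed with radiometric difference and spatial distance. K, the input features, and the MLP size are assumptions.

```python
import torch
import torch.nn as nn

class BilateralPropagation(nn.Module):
    def __init__(self, k=8, hidden=32):
        super().__init__()
        self.k = k
        self.mlp = nn.Sequential(nn.Linear(2, hidden), nn.ReLU(), nn.Linear(hidden, 1))

    def forward(self, target_xy, target_rgb, sparse_xy, sparse_rgb, sparse_depth):
        """target_xy: (N, 2), target_rgb: (N, 3); sparse_*: (M, 2) / (M, 3) / (M,)."""
        dist = torch.cdist(target_xy, sparse_xy)                       # (N, M) spatial distances
        knn_d, idx = dist.topk(self.k, largest=False)                  # K nearest measurements
        rad = (target_rgb.unsqueeze(1) - sparse_rgb[idx]).norm(dim=-1) # radiometric difference
        w = self.mlp(torch.stack([knn_d, rad], dim=-1)).squeeze(-1)    # (N, K) raw coefficients
        w = torch.softmax(w, dim=-1)
        return (w * sparse_depth[idx]).sum(dim=-1)                     # propagated dense depth

# toy usage with random coordinates, colors, and sparse depths
prop = BilateralPropagation()
dense = prop(torch.rand(100, 2), torch.rand(100, 3),
             torch.rand(20, 2), torch.rand(20, 3), torch.rand(20))
```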
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Emotional Speech-driven 3D Body Animation via Disentangled Latent + Diffusion CVPR + + +
+ Existing methods for synthesizing 3D human gestures from speech have shown +promising results, but they do not explicitly model the impact of emotions on +the generated gestures. Instead, these methods directly output animations from +speech without control over the expressed emotion. To address this limitation, +we present AMUSE, an emotional speech-driven body animation model based on +latent diffusion. Our observation is that content (i.e., gestures related to +speech rhythm and word utterances), emotion, and personal style are separable. +To account for this, AMUSE maps the driving audio to three disentangled latent +vectors: one for content, one for emotion, and one for personal style. A latent +diffusion model, trained to generate gesture motion sequences, is then +conditioned on these latent vectors. Once trained, AMUSE synthesizes 3D human +gestures directly from speech with control over the expressed emotions and +style by combining the content from the driving speech with the emotion and +style of another speech sequence. Randomly sampling the noise of the diffusion +model further generates variations of the gesture with the same emotional +expressivity. Qualitative, quantitative, and perceptual evaluations demonstrate +that AMUSE outputs realistic gesture sequences. Compared to the state of the +art, the generated gestures are better synchronized with the speech content, +and better represent the emotion expressed by the input speech. Our code is +available at amuse.is.tue.mpg.de. + +
+
+ comment: Conference on Computer Vision and Pattern Recognition (CVPR) 2024. + Webpage: https://amuse.is.tue.mpg.de/ +
+
+
+
+
+ + ♻ ☆ Dense Supervision Propagation for Weakly Supervised Semantic + Segmentation on 3D Point Clouds + + +
+ Semantic segmentation on 3D point clouds is an important task for 3D scene understanding. While dense labeling on 3D data is expensive and time-consuming, only a few works address weakly supervised semantic point cloud segmentation, which relieves the labeling cost by learning from simpler and cheaper labels. Meanwhile, there are still huge performance gaps between existing weakly supervised methods and state-of-the-art fully supervised methods. In this paper, we train a semantic point cloud segmentation network with only a small portion of points being labeled. We argue that we can better utilize the limited supervision information by densely propagating the supervision signal from the labeled points to other points within and across the input samples. Specifically, we propose a cross-sample feature reallocating module that transfers similar features and thereby re-routes gradients across two samples with common classes, and an intra-sample feature redistribution module, propagating supervision signals to unlabeled points both across and within point cloud samples. We conduct extensive experiments on the public datasets S3DIS and ScanNet. Our weakly supervised method with only 10% and 1% of labels can produce results comparable to the fully supervised counterpart.
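+ A rough sketch of the cross-sample reallocation idea (my reading of the abstract, not the released code): point features of one sample are re-expressed as attention-weighted mixtures of another sample's features, so that gradients from the other sample's sparse labels can also flow across the pair when they share classes; the temperature and normalization are assumptions.

```python
import torch
import torch.nn.functional as F

def reallocate_features(feat_a, feat_b, tau=0.05):
    """feat_a: (Na, C) point features of a labeled sample, feat_b: (Nb, C) of another sample."""
    a = F.normalize(feat_a, dim=1)
    b = F.normalize(feat_b, dim=1)
    attn = torch.softmax(b @ a.t() / tau, dim=1)   # (Nb, Na): each B point attends over A points
    return attn @ feat_a                           # (Nb, C) features for B rebuilt from A

feat_a, feat_b = torch.randn(4096, 64), torch.randn(4096, 64)
feat_b_realloc = reallocate_features(feat_a, feat_b)
```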
+
+
+
+
+ + ♻ ☆ Towards Learning a Generalist Model for Embodied Navigation CVPR 2024 + + +
+ Building a generalist agent that can interact with the world is an intriguing goal for AI systems, thus spurring research on embodied navigation, where an agent is required to navigate according to instructions or respond to queries. Despite the major progress attained, previous works primarily focus on task-specific agents and lack generalizability to unseen scenarios. Recently, LLMs have presented remarkable capabilities across various fields and provide a promising opportunity for embodied navigation. Drawing on this, we propose the first generalist model for embodied navigation, NaviLLM. It adapts LLMs to embodied navigation by introducing schema-based instruction. The schema-based instruction flexibly casts various tasks into generation problems, thereby unifying a wide range of tasks. This approach allows us to integrate diverse data sources from various datasets into the training, equipping NaviLLM with a wide range of capabilities required by embodied navigation. We conduct extensive experiments to evaluate the performance and generalizability of our model. The experimental results demonstrate that our unified model achieves state-of-the-art performance on CVDN, SOON, and ScanQA. Specifically, it surpasses the previous state-of-the-art method by a significant margin of 29% in goal progress on CVDN. Moreover, our model also demonstrates strong generalizability and presents impressive results on unseen tasks, e.g., embodied question answering and 3D captioning.
+
+ comment: Accepted by CVPR 2024 (14 pages, 3 figures) +
+
+
+
+
+ + ♻ ☆ HiCMAE: Hierarchical Contrastive Masked Autoencoder for Self-Supervised + Audio-Visual Emotion Recognition + + +
+ Audio-Visual Emotion Recognition (AVER) has garnered increasing attention in recent years for its critical role in creating emotion-aware intelligent machines. Previous efforts in this area are dominated by the supervised learning paradigm. Despite significant progress, supervised learning is reaching its bottleneck due to the longstanding data scarcity issue in AVER. Motivated by recent advances in self-supervised learning, we propose Hierarchical Contrastive Masked Autoencoder (HiCMAE), a novel self-supervised framework that leverages large-scale self-supervised pre-training on vast unlabeled audio-visual data to promote the advancement of AVER. Following prior art in self-supervised audio-visual representation learning, HiCMAE adopts two primary forms of self-supervision for pre-training, namely masked data modeling and contrastive learning. Unlike previous methods, which focus exclusively on top-layer representations while neglecting explicit guidance of intermediate layers, HiCMAE develops a three-pronged strategy to foster hierarchical audio-visual feature learning and improve the overall quality of learned representations. To verify the effectiveness of HiCMAE, we conduct extensive experiments on 9 datasets covering both categorical and dimensional AVER tasks. Experimental results show that our method significantly outperforms state-of-the-art supervised and self-supervised audio-visual methods, which indicates that HiCMAE is a powerful audio-visual emotion representation learner. Codes and models will be publicly available at https://github.com/sunlicai/HiCMAE.
+
+ comment: Accepted by Information Fusion. The code is available at + https://github.com/sunlicai/HiCMAE +
+
+
+
+
+ + ♻ ☆ Invariant Representation via Decoupling Style and Spurious Features from + Images + + +
+ This paper considers the out-of-distribution (OOD) generalization problem under the setting that both style distribution shift and spurious features exist and domain labels are missing. This setting frequently arises in real-world applications and is overlooked because previous approaches mainly handle only one of these two factors. The critical challenge is decoupling style and spurious features in the absence of domain labels. To address this challenge, we first propose a structural causal model (SCM) for the image generation process, which captures both style distribution shift and spurious features. The proposed SCM enables us to design a new framework called IRSS, which can gradually separate style distribution and spurious features from images by introducing adversarial neural networks and multi-environment optimization, thus achieving OOD generalization. Moreover, it does not require additional supervision (e.g., domain labels) other than the images and their corresponding labels. Experiments on benchmark datasets demonstrate that IRSS outperforms traditional OOD methods and solves the problem of invariant risk minimization (IRM) degradation, enabling the extraction of invariant features under distribution shift.
+
+ comment: 10 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ SemGauss-SLAM: Dense Semantic Gaussian Splatting SLAM + + +
+ We propose SemGauss-SLAM, the first semantic SLAM system utilizing 3D +Gaussian representation, that enables accurate 3D semantic mapping, robust +camera tracking, and high-quality rendering in real-time. In this system, we +incorporate semantic feature embedding into 3D Gaussian representation, which +effectively encodes semantic information within the spatial layout of the +environment for precise semantic scene representation. Furthermore, we propose +feature-level loss for updating 3D Gaussian representation, enabling +higher-level guidance for 3D Gaussian optimization. In addition, to reduce +cumulative drift and improve reconstruction accuracy, we introduce +semantic-informed bundle adjustment leveraging semantic associations for joint +optimization of 3D Gaussian representation and camera poses, leading to more +robust tracking and consistent mapping. Our SemGauss-SLAM method demonstrates +superior performance over existing dense semantic SLAM methods in terms of +mapping and tracking accuracy on Replica and ScanNet datasets, while also +showing excellent capabilities in novel-view semantic synthesis and 3D semantic +mapping. + +
+
+
+
+
+ + ♻ ☆ iMixer: hierarchical Hopfield network implies an invertible, implicit + and iterative MLP-Mixer + + +
+ In the last few years, the success of Transformers in computer vision has +stimulated the discovery of many alternative models that compete with +Transformers, such as the MLP-Mixer. Despite their weak inductive bias, these +models have achieved performance comparable to well-studied convolutional +neural networks. Recent studies on modern Hopfield networks suggest the +correspondence between certain energy-based associative memory models and +Transformers or MLP-Mixer, and shed some light on the theoretical background of +the Transformer-type architectures design. In this paper, we generalize the +correspondence to the recently introduced hierarchical Hopfield network, and +find iMixer, a novel generalization of MLP-Mixer model. Unlike ordinary +feedforward neural networks, iMixer involves MLP layers that propagate forward +from the output side to the input side. We characterize the module as an +example of invertible, implicit, and iterative mixing module. We evaluate the +model performance with various datasets on image classification tasks, and find +that iMixer, despite its unique architecture, exhibits stable learning +capabilities and achieves performance comparable to or better than the baseline +vanilla MLP-Mixer. The results imply that the correspondence between the +Hopfield networks and the Mixer models serves as a principle for understanding +a broader class of Transformer-like architecture designs. + +
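+ The "invertible, implicit and iterative" mixing can be pictured with a small fixed-point sketch: the mixed tokens z are defined implicitly through z = x - MLP(z) and recovered by iterating the update a few times. The exact update rule, iteration count, and dimensions here are assumptions for illustration, not the iMixer architecture itself.

```python
import torch
import torch.nn as nn

class ImplicitTokenMixer(nn.Module):
    def __init__(self, num_tokens, hidden, n_iters=8):
        super().__init__()
        self.n_iters = n_iters
        self.mlp = nn.Sequential(nn.Linear(num_tokens, hidden), nn.GELU(),
                                 nn.Linear(hidden, num_tokens))

    def forward(self, x):                      # x: (B, C, N), mixing along the token axis N
        z = x
        for _ in range(self.n_iters):          # fixed-point iteration for z = x - mlp(z)
            z = x - self.mlp(z)
        return z

mixer = ImplicitTokenMixer(num_tokens=196, hidden=384)
out = mixer(torch.randn(2, 64, 196))
```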
+
+ comment: 19 pages. v2: minor improvements +
+
+
+
+
+ + ♻ ☆ StreamMultiDiffusion: Real-Time Interactive Generation with Region-Based + Semantic Control + + +
+ The enormous success of diffusion models in text-to-image synthesis has made +them promising candidates for the next generation of end-user applications for +image generation and editing. Previous works have focused on improving the +usability of diffusion models by reducing the inference time or increasing user +interactivity by allowing new, fine-grained controls such as region-based text +prompts. However, we empirically find that integrating both branches of works +is nontrivial, limiting the potential of diffusion models. To solve this +incompatibility, we present StreamMultiDiffusion, the first real-time +region-based text-to-image generation framework. By stabilizing fast inference +techniques and restructuring the model into a newly proposed multi-prompt +stream batch architecture, we achieve $\times 10$ faster panorama generation +than existing solutions, and the generation speed of 1.57 FPS in region-based +text-to-image synthesis on a single RTX 2080 Ti GPU. Our solution opens up a +new paradigm for interactive image generation named semantic palette, where +high-quality images are generated in real-time from given multiple hand-drawn +regions, encoding prescribed semantic meanings (e.g., eagle, girl). Our code +and demo application are available at +https://github.com/ironjr/StreamMultiDiffusion. + +
+
+ comment: 29 pages, 16 figures. v2: typos corrected, references added. Project + page: https://jaerinlee.com/research/StreamMultiDiffusion +
+
+
+
+
+ + ♻ ☆ LogoStyleFool: Vitiating Video Recognition Systems via Logo Style + Transfer AAAI 2024 + + +
+ Video recognition systems are vulnerable to adversarial examples. Recent +studies show that style transfer-based and patch-based unrestricted +perturbations can effectively improve attack efficiency. These attacks, +however, face two main challenges: 1) Adding large stylized perturbations to +all pixels reduces the naturalness of the video and such perturbations can be +easily detected. 2) Patch-based video attacks are not extensible to targeted +attacks due to the limited search space of reinforcement learning that has been +widely used in video attacks recently. In this paper, we focus on the video +black-box setting and propose a novel attack framework named LogoStyleFool by +adding a stylized logo to the clean video. We separate the attack into three +stages: style reference selection, reinforcement-learning-based logo style +transfer, and perturbation optimization. We solve the first challenge by +scaling down the perturbation range to a regional logo, while the second +challenge is addressed by complementing an optimization stage after +reinforcement learning. Experimental results substantiate the overall +superiority of LogoStyleFool over three state-of-the-art patch-based attacks in +terms of attack performance and semantic preservation. Meanwhile, LogoStyleFool +still maintains its performance against two existing patch-based defense +methods. We believe that our research is beneficial in increasing the attention +of the security community to such subregional style transfer attacks. + +
+
+ comment: 14 pages, 3 figures. Accepted to AAAI 2024 +
+
+
+
+
+ + ♻ ☆ StyleFool: Fooling Video Classification Systems via Style Transfer + + +
+ Video classification systems are vulnerable to adversarial attacks, which can +create severe security problems in video verification. Current black-box +attacks need a large number of queries to succeed, resulting in high +computational overhead in the process of attack. On the other hand, attacks +with restricted perturbations are ineffective against defenses such as +denoising or adversarial training. In this paper, we focus on unrestricted +perturbations and propose StyleFool, a black-box video adversarial attack via +style transfer to fool the video classification system. StyleFool first +utilizes color theme proximity to select the best style image, which helps +avoid unnatural details in the stylized videos. Meanwhile, the target class +confidence is additionally considered in targeted attacks to influence the +output distribution of the classifier by moving the stylized video closer to or +even across the decision boundary. A gradient-free method is then employed to +further optimize the adversarial perturbations. We carry out extensive +experiments to evaluate StyleFool on two standard datasets, UCF-101 and +HMDB-51. The experimental results demonstrate that StyleFool outperforms the +state-of-the-art adversarial attacks in terms of both the number of queries and +the robustness against existing defenses. Moreover, 50% of the stylized videos +in untargeted attacks do not need any query since they can already fool the +video classification model. Furthermore, we evaluate the indistinguishability +through a user study to show that the adversarial samples of StyleFool look +imperceptible to human eyes, despite unrestricted perturbations. + +
+
+ comment: 18 pages, 9 figures. Accepted to S&P 2023 +
+
+
+
+
+ + ♻ ☆ Unsigned Orthogonal Distance Fields: An Accurate Neural Implicit + Representation for Diverse 3D Shapes CVPR 2024 + + +
+ Neural implicit representation of geometric shapes has witnessed considerable +advancements in recent years. However, common distance field based implicit +representations, specifically signed distance field (SDF) for watertight shapes +or unsigned distance field (UDF) for arbitrary shapes, routinely suffer from +degradation of reconstruction accuracy when converting to explicit surface +points and meshes. In this paper, we introduce a novel neural implicit +representation based on unsigned orthogonal distance fields (UODFs). In UODFs, +the minimal unsigned distance from any spatial point to the shape surface is +defined solely in one orthogonal direction, contrasting with the +multi-directional determination made by SDF and UDF. Consequently, every point +in the 3D UODFs can directly access its closest surface points along three +orthogonal directions. This distinctive feature leverages the accurate +reconstruction of surface points without interpolation errors. We verify the +effectiveness of UODFs through a range of reconstruction examples, extending +from simple watertight or non-watertight shapes to complex shapes that include +hollows, internal or assembling structures. + +
+
+ comment: accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Asymmetric Masked Distillation for Pre-Training Small Foundation Models CVPR 2024 + + +
+ Self-supervised foundation models have shown great potential in computer vision thanks to the pre-training paradigm of masked autoencoding. Scale is a primary factor influencing the performance of these foundation models. However, these large foundation models often result in high computational cost. This paper focuses on pre-training relatively small vision transformer models that could be efficiently adapted to downstream tasks. Specifically, taking inspiration from knowledge distillation in model compression, we propose a new asymmetric masked distillation (AMD) framework for pre-training relatively small models with autoencoding. The core of AMD is to devise an asymmetric masking strategy, where the teacher model is enabled to see more context information with a lower masking ratio, while the student model is still equipped with a high masking ratio. We design customized multi-layer feature alignment between the teacher encoder and student encoder to regularize the pre-training of the student MAE. To demonstrate the effectiveness and versatility of AMD, we apply it to both ImageMAE and VideoMAE for pre-training relatively small ViT models. AMD achieves 84.6% classification accuracy on IN1K using the ViT-B model, and 73.3% classification accuracy using the ViT-B model on the Something-Something V2 dataset, a 3.7% improvement over the original ViT-B model from VideoMAE. We also transfer AMD pre-trained models to downstream tasks and obtain consistent performance improvement over the original masked autoencoding. The code and models are available at https://github.com/MCG-NJU/AMD.
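+ A hedged sketch of the asymmetric masking step: the teacher keeps most tokens (low masking ratio) while the student keeps only a few, chosen here as a subset of the teacher's visible tokens so that features at shared positions can be aligned. The ratios and the MSE alignment are illustrative assumptions, not the paper's exact settings.

```python
import torch
import torch.nn.functional as F

def asymmetric_masks(num_tokens, teacher_ratio=0.5, student_ratio=0.9, device="cpu"):
    perm = torch.randperm(num_tokens, device=device)
    n_t = int(num_tokens * (1 - teacher_ratio))      # tokens visible to the teacher
    n_s = int(num_tokens * (1 - student_ratio))      # tokens visible to the student
    return perm[:n_t], perm[:n_s]                    # student tokens are a subset of the teacher's

def alignment_loss(student_feats, teacher_feats, student_visible, teacher_visible):
    """Align student features with teacher features at the student's visible positions."""
    pos = {int(p): i for i, p in enumerate(teacher_visible)}
    idx = torch.tensor([pos[int(p)] for p in student_visible])
    return F.mse_loss(student_feats, teacher_feats[:, idx])

t_vis, s_vis = asymmetric_masks(196)
teacher_feats = torch.randn(2, len(t_vis), 768)      # stand-ins for encoder outputs
student_feats = torch.randn(2, len(s_vis), 768)
loss = alignment_loss(student_feats, teacher_feats, s_vis, t_vis)
```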
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Learning the 3D Fauna of the Web + + +
+ Learning 3D models of all animals on the Earth requires massively scaling up +existing solutions. With this ultimate goal in mind, we develop 3D-Fauna, an +approach that learns a pan-category deformable 3D animal model for more than +100 animal species jointly. One crucial bottleneck of modeling animals is the +limited availability of training data, which we overcome by simply learning +from 2D Internet images. We show that prior category-specific attempts fail to +generalize to rare species with limited training images. We address this +challenge by introducing the Semantic Bank of Skinned Models (SBSM), which +automatically discovers a small set of base animal shapes by combining +geometric inductive priors with semantic knowledge implicitly captured by an +off-the-shelf self-supervised feature extractor. To train such a model, we also +contribute a new large-scale dataset of diverse animal species. At inference +time, given a single image of any quadruped animal, our model reconstructs an +articulated 3D mesh in a feed-forward fashion within seconds. + +
+
+ comment: The first two authors contributed equally to this work. The last + three authors contributed equally. Project page: + https://kyleleey.github.io/3DFauna/ +
+
+
+
+
+ + ♻ ☆ Exploring Phonetic Context-Aware Lip-Sync For Talking Face Generation ICASSP 2024 + + +
+ Talking face generation is the challenging task of synthesizing a natural and +realistic face that requires accurate synchronization with a given audio. Due +to co-articulation, where an isolated phone is influenced by the preceding or +following phones, the articulation of a phone varies upon the phonetic context. +Therefore, modeling lip motion with the phonetic context can generate more +spatio-temporally aligned lip movement. In this respect, we investigate the +phonetic context in generating lip motion for talking face generation. We +propose Context-Aware Lip-Sync framework (CALS), which explicitly leverages +phonetic context to generate lip movement of the target face. CALS is comprised +of an Audio-to-Lip module and a Lip-to-Face module. The former is pretrained +based on masked learning to map each phone to a contextualized lip motion unit. +The contextualized lip motion unit then guides the latter in synthesizing a +target identity with context-aware lip motion. From extensive experiments, we +verify that simply exploiting the phonetic context in the proposed CALS +framework effectively enhances spatio-temporal alignment. We also demonstrate +the extent to which the phonetic context assists in lip synchronization and +find the effective window size for lip generation to be approximately 1.2 +seconds. + +
+
+ comment: Accepted at ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ An Embarrassingly Simple Defense Against Backdoor Attacks On SSL + + +
+ Self Supervised Learning (SSL) has emerged as a powerful paradigm to tackle data landscapes with an absence of human supervision. The ability to learn meaningful tasks without the use of labeled data makes SSL a popular method to manage large chunks of data in the absence of labels. However, recent work indicates SSL to be vulnerable to backdoor attacks, wherein models can be controlled, possibly maliciously, to suit an adversary's motives. Li et al. (2022) introduce a novel frequency-based backdoor attack: CTRL. They show that CTRL can be used to efficiently and stealthily gain control over a victim's model trained using SSL. In this work, we devise two defense strategies against frequency-based attacks in SSL: one applicable before model training and the second to be applied during model inference. Our first contribution utilizes the invariance property of the downstream task to defend against backdoor attacks in a generalizable fashion. We observe the ASR (Attack Success Rate) to reduce by over 60% across experiments. Our inference-time defense relies on the evasiveness of the attack and uses the luminance channel to defend against attacks. Using object classification as the downstream task for SSL, we demonstrate successful defense strategies that do not require re-training of the model. Code is available at https://github.com/Aryan-Satpathy/Backdoor.
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Towards Universal Fake Image Detectors that Generalize Across Generative + Models + + +
+ With generative models proliferating at a rapid rate, there is a growing need +for general purpose fake image detectors. In this work, we first show that the +existing paradigm, which consists of training a deep network for real-vs-fake +classification, fails to detect fake images from newer breeds of generative +models when trained to detect GAN fake images. Upon analysis, we find that the +resulting classifier is asymmetrically tuned to detect patterns that make an +image fake. The real class becomes a sink class holding anything that is not +fake, including generated images from models not accessible during training. +Building upon this discovery, we propose to perform real-vs-fake classification +without learning; i.e., using a feature space not explicitly trained to +distinguish real from fake images. We use nearest neighbor and linear probing +as instantiations of this idea. When given access to the feature space of a +large pretrained vision-language model, the very simple baseline of nearest +neighbor classification has surprisingly good generalization ability in +detecting fake images from a wide variety of generative models; e.g., it +improves upon the SoTA by +15.07 mAP and +25.90% acc when tested on unseen +diffusion and autoregressive models. + +
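+ The training-free baseline described above can be sketched in a few lines: features of a query image are compared against a small bank of real and fake examples in a frozen feature space, and the label is decided by a k-nearest-neighbor vote. The encoder is left abstract here, and the choice of k and the 0.5 threshold are assumptions.

```python
import torch
import torch.nn.functional as F

@torch.no_grad()
def knn_predict(query_feats, bank_feats, bank_labels, k=5):
    """bank_labels: 0 = real, 1 = fake. All features are (N, D) from a frozen encoder."""
    q = F.normalize(query_feats, dim=1)
    b = F.normalize(bank_feats, dim=1)
    sims = q @ b.t()                                   # cosine similarity to the bank
    _, idx = sims.topk(k, dim=1)                       # k nearest neighbors per query
    votes = bank_labels[idx].float().mean(dim=1)       # fraction of "fake" neighbors
    return (votes > 0.5).long(), votes                 # hard label and a soft score

# toy usage with random stand-in features
bank = torch.randn(1000, 512)
labels = torch.randint(0, 2, (1000,))
pred, score = knn_predict(torch.randn(8, 512), bank, labels)
```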
+
+
+
+
+ + ♻ ☆ Each Test Image Deserves A Specific Prompt: Continual Test-Time + Adaptation for 2D Medical Image Segmentation + + +
+ Distribution shift widely exists in medical images acquired from different +medical centres and poses a significant obstacle to deploying the pre-trained +semantic segmentation model in real-world applications. Test-time adaptation +has proven its effectiveness in tackling the cross-domain distribution shift +during inference. However, most existing methods achieve adaptation by updating +the pre-trained models, rendering them susceptible to error accumulation and +catastrophic forgetting when encountering a series of distribution shifts +(i.e., under the continual test-time adaptation setup). To overcome these +challenges caused by updating the models, in this paper, we freeze the +pre-trained model and propose the Visual Prompt-based Test-Time Adaptation +(VPTTA) method to train a specific prompt for each test image to align the +statistics in the batch normalization layers. Specifically, we present the +low-frequency prompt, which is lightweight with only a few parameters and can +be effectively trained in a single iteration. To enhance prompt initialization, +we equip VPTTA with a memory bank to benefit the current prompt from previous +ones. Additionally, we design a warm-up mechanism, which mixes source and +target statistics to construct warm-up statistics, thereby facilitating the +training process. Extensive experiments demonstrate the superiority of our +VPTTA over other state-of-the-art methods on two medical image segmentation +benchmark tasks. The code and weights of pre-trained source models are +available at https://github.com/Chen-Ziyang/VPTTA. + +
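+ A simplified sketch of the idea (my reading of the abstract, not the released VPTTA code): a small low-frequency prompt multiplies the central band of the test image's amplitude spectrum and is optimized for a single iteration so that the frozen model's BatchNorm-layer inputs match the stored source statistics. The prompt size, loss form, optimizer, and single-step update are assumptions.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def bn_statistics_loss(model, image):
    """Forward the image and measure the drift of BN-layer inputs from the source statistics."""
    terms, hooks = [], []
    def make_hook(bn):
        def hook(module, inp, out):
            x = inp[0]
            mu = x.mean(dim=(0, 2, 3))
            var = x.var(dim=(0, 2, 3), unbiased=False)
            terms.append((mu - module.running_mean).abs().mean()
                         + (var - module.running_var).abs().mean())
        return hook
    for m in model.modules():
        if isinstance(m, nn.BatchNorm2d):
            hooks.append(m.register_forward_hook(make_hook(m)))
    model(image)
    for h in hooks:
        h.remove()
    return torch.stack(terms).sum()

def modulate_low_frequencies(image, prompt):
    """Multiply the central (low-frequency) amplitude band by the learnable prompt."""
    freq = torch.fft.fftshift(torch.fft.fft2(image), dim=(-2, -1))
    amp, pha = freq.abs(), freq.angle()
    h, w, ps = image.size(-2), image.size(-1), prompt.size(-1)
    top, left = h // 2 - ps // 2, w // 2 - ps // 2
    mod = F.pad(prompt - 1.0, (left, w - left - ps, top, h - top - ps)) + 1.0
    freq = (amp * mod) * torch.exp(1j * pha)
    return torch.fft.ifft2(torch.fft.ifftshift(freq, dim=(-2, -1))).real

def adapt_single_image(model, image, prompt_size=8, lr=0.05):
    model.eval()
    prompt = torch.ones(1, image.size(1), prompt_size, prompt_size, requires_grad=True)
    opt = torch.optim.Adam([prompt], lr=lr)
    loss = bn_statistics_loss(model, modulate_low_frequencies(image, prompt))
    loss.backward()
    opt.step()                                   # a single lightweight iteration
    with torch.no_grad():
        return model(modulate_low_frequencies(image, prompt))

# toy usage with a small stand-in network
net = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.BatchNorm2d(16), nn.ReLU(),
                    nn.Conv2d(16, 2, 1))
out = adapt_single_image(net, torch.rand(1, 3, 64, 64))
```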
+
+
+
+
+ + ♻ ☆ VSCode: General Visual Salient and Camouflaged Object Detection with 2D + Prompt Learning + + +
+ Salient object detection (SOD) and camouflaged object detection (COD) are related yet distinct binary mapping tasks. These tasks involve multiple modalities, sharing commonalities and unique cues. Existing research often employs intricate task-specific specialist models, potentially leading to redundancy and suboptimal results. We introduce VSCode, a generalist model with novel 2D prompt learning, to jointly address four SOD tasks and three COD tasks. We utilize VST as the foundation model and introduce 2D prompts within the encoder-decoder architecture to learn domain and task-specific knowledge on two separate dimensions. A prompt discrimination loss helps disentangle peculiarities to benefit model optimization. VSCode outperforms state-of-the-art methods across six tasks on 26 datasets and exhibits zero-shot generalization to unseen tasks by combining 2D prompts, such as RGB-D COD. Source code is available at https://github.com/Sssssuperior/VSCode.
+
+
+
+
+ + ♻ ☆ Continual Segmentation with Disentangled Objectness Learning and Class + Recognition CVPR 2024 + + +
+ Most continual segmentation methods tackle the problem as a per-pixel +classification task. However, such a paradigm is very challenging, and we find +query-based segmenters with built-in objectness have inherent advantages +compared with per-pixel ones, as objectness has strong transfer ability and +forgetting resistance. Based on these findings, we propose CoMasTRe by +disentangling continual segmentation into two stages: forgetting-resistant +continual objectness learning and well-researched continual classification. +CoMasTRe uses a two-stage segmenter learning class-agnostic mask proposals at +the first stage and leaving recognition to the second stage. During continual +learning, a simple but effective distillation is adopted to strengthen +objectness. To further mitigate the forgetting of old classes, we design a +multi-label class distillation strategy suited for segmentation. We assess the +effectiveness of CoMasTRe on PASCAL VOC and ADE20K. Extensive experiments show +that our method outperforms per-pixel and query-based methods on both datasets. +Code will be available at https://github.com/jordangong/CoMasTRe. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ TransPose: 6D Object Pose Estimation with Geometry-Aware Transformer + + +
+ Estimating the 6D object pose is an essential task in many applications. Due to the lack of depth information, existing RGB-based methods are sensitive to occlusion and illumination changes. How to extract and utilize the geometry features in depth information is crucial for achieving accurate predictions. To this end, we propose TransPose, a novel 6D pose framework that exploits a Transformer encoder with a geometry-aware module to develop better learning of point cloud feature representations. Specifically, we first uniformly sample the point cloud and extract local geometry features with a designed local feature extractor based on a graph convolutional network. To improve robustness to occlusion, we adopt a Transformer to perform the exchange of global information, making each local feature contain global information. Finally, we introduce a geometry-aware module into the Transformer encoder, which forms an effective constraint for point cloud feature learning and makes the global information exchange more tightly coupled with point cloud tasks. Extensive experiments indicate the effectiveness of TransPose; our pose estimation pipeline achieves competitive results on three benchmark datasets.
+
+ comment: accept by NEUROCOMPUTING +
+
+
+
+
+ + ♻ ☆ City-on-Web: Real-time Neural Rendering of Large-scale Scenes on the Web + + +
+ Existing neural radiance field-based methods can achieve real-time rendering +of small scenes on the web platform. However, extending these methods to +large-scale scenes still poses significant challenges due to limited resources +in computation, memory, and bandwidth. In this paper, we propose City-on-Web, +the first method for real-time rendering of large-scale scenes on the web. We +propose a block-based volume rendering method to guarantee 3D consistency and +correct occlusion between blocks, and introduce a Level-of-Detail strategy +combined with dynamic loading/unloading of resources to significantly reduce +memory demands. Our system achieves real-time rendering of large-scale scenes +at approximately 32FPS with RTX 3060 GPU on the web and maintains rendering +quality comparable to the current state-of-the-art novel view synthesis +methods. + +
+
+ comment: Project page: https://ustc3dv.github.io/City-on-Web/ +
+
+
+
+
+ + ♻ ☆ FaceChain-ImagineID: Freely Crafting High-Fidelity Diverse Talking Faces + from Disentangled Audio + + +
+ In this paper, we abstract the process of people hearing speech, extracting +meaningful cues, and creating various dynamically audio-consistent talking +faces, termed Listening and Imagining, into the task of high-fidelity diverse +talking faces generation from a single audio. Specifically, it involves two +critical challenges: one is to effectively decouple identity, content, and +emotion from entangled audio, and the other is to maintain intra-video +diversity and inter-video consistency. To tackle the issues, we first dig out +the intricate relationships among facial factors and simplify the decoupling +process, tailoring a Progressive Audio Disentanglement for accurate facial +geometry and semantics learning, where each stage incorporates a customized +training module responsible for a specific factor. Secondly, to achieve +visually diverse and audio-synchronized animation solely from input audio +within a single model, we introduce the Controllable Coherent Frame generation, +which involves the flexible integration of three trainable adapters with frozen +Latent Diffusion Models (LDMs) to focus on maintaining facial geometry and +semantics, as well as texture and temporal coherence between frames. In this +way, we inherit high-quality diverse generation from LDMs while significantly +improving their controllability at a low training cost. Extensive experiments +demonstrate the flexibility and effectiveness of our method in handling this +paradigm. The codes will be released at +https://github.com/modelscope/facechain. + +
+
+
+
+
+ + ♻ ☆ Honeybee: Locality-enhanced Projector for Multimodal LLM CVPR 2024 + + +
+ In Multimodal Large Language Models (MLLMs), a visual projector plays a +crucial role in bridging pre-trained vision encoders with LLMs, enabling +profound visual understanding while harnessing the LLMs' robust capabilities. +Despite the importance of the visual projector, it has been relatively less +explored. In this study, we first identify two essential projector properties: +(i) flexibility in managing the number of visual tokens, crucial for MLLMs' +overall efficiency, and (ii) preservation of local context from visual +features, vital for spatial understanding. Based on these findings, we propose +a novel projector design that is both flexible and locality-enhanced, +effectively satisfying the two desirable properties. Additionally, we present +comprehensive strategies to effectively utilize multiple and multifaceted +instruction datasets. Through extensive experiments, we examine the impact of +individual design choices. Finally, our proposed MLLM, Honeybee, remarkably +outperforms previous state-of-the-art methods across various benchmarks, +including MME, MMBench, SEED-Bench, and LLaVA-Bench, achieving significantly +higher efficiency. Code and models are available at +https://github.com/kakaobrain/honeybee. + +
+
+ comment: CVPR 2024 camera-ready +
+
+
+
+
+ + ♻ ☆ VDC: Versatile Data Cleanser based on Visual-Linguistic Inconsistency by + Multimodal Large Language Models ICLR 2024 + + +
+ The role of data in building AI systems has recently been emphasized by the emerging concept of data-centric AI. Unfortunately, in the real world, datasets may contain dirty samples, such as poisoned samples from backdoor attacks, noisy labels in crowdsourcing, and even hybrids of them. The presence of such dirty samples makes DNNs vulnerable and unreliable. Hence, it is critical to detect dirty samples to improve the quality and reliability of a dataset. Existing detectors only focus on detecting poisoned samples or noisy labels and are often prone to weak generalization when dealing with dirty samples from other domains. In this paper, we find that a commonality of various dirty samples is visual-linguistic inconsistency between images and associated labels. To capture the semantic inconsistency between modalities, we propose the versatile data cleanser (VDC), leveraging the superior capabilities of multimodal large language models (MLLMs) in cross-modal alignment and reasoning. It consists of three consecutive modules: the visual question generation module to generate insightful questions about the image; the visual question answering module to acquire the semantics of the visual content by answering the questions with the MLLM; followed by the visual answer evaluation module to evaluate the inconsistency. Extensive experiments demonstrate its superior performance and generalization to various categories and types of dirty samples. The code is available at https://github.com/zihao-ai/vdc.
+
+ comment: Accepted to ICLR 2024 +
+
+
+
+
+ + ♻ ☆ UniHuman: A Unified Model for Editing Human Images in the Wild CVPR 2024 + + +
+ Human image editing includes tasks like changing a person's pose, their +clothing, or editing the image according to a text prompt. However, prior work +often tackles these tasks separately, overlooking the benefit of mutual +reinforcement from learning them jointly. In this paper, we propose UniHuman, a +unified model that addresses multiple facets of human image editing in +real-world settings. To enhance the model's generation quality and +generalization capacity, we leverage guidance from human visual encoders and +introduce a lightweight pose-warping module that can exploit different pose +representations, accommodating unseen textures and patterns. Furthermore, to +bridge the disparity between existing human editing benchmarks with real-world +data, we curated 400K high-quality human image-text pairs for training and +collected 2K human images for out-of-domain testing, both encompassing diverse +clothing styles, backgrounds, and age groups. Experiments on both in-domain and +out-of-domain test sets demonstrate that UniHuman outperforms task-specific +models by a significant margin. In user studies, UniHuman is preferred by the +users in an average of 77% of cases. Our project is available at +https://github.com/NannanLi999/UniHuman. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ AntGPT: Can Large Language Models Help Long-term Action Anticipation + from Videos? ICLR 2024 + + +
+ Can we better anticipate an actor's future actions (e.g. mix eggs) by knowing +what commonly happens after his/her current action (e.g. crack eggs)? What if +we also know the longer-term goal of the actor (e.g. making egg fried rice)? +The long-term action anticipation (LTA) task aims to predict an actor's future +behavior from video observations in the form of verb and noun sequences, and it +is crucial for human-machine interaction. We propose to formulate the LTA task +from two perspectives: a bottom-up approach that predicts the next actions +autoregressively by modeling temporal dynamics; and a top-down approach that +infers the goal of the actor and plans the needed procedure to accomplish the +goal. We hypothesize that large language models (LLMs), which have been +pretrained on procedure text data (e.g. recipes, how-tos), have the potential +to help LTA from both perspectives. It can help provide the prior knowledge on +the possible next actions, and infer the goal given the observed part of a +procedure, respectively. To leverage the LLMs, we propose a two-stage +framework, AntGPT. It first recognizes the actions already performed in the +observed videos and then asks an LLM to predict the future actions via +conditioned generation, or to infer the goal and plan the whole procedure by +chain-of-thought prompting. Empirical results on the Ego4D LTA v1 and v2 +benchmarks, EPIC-Kitchens-55, as well as EGTEA GAZE+ demonstrate the +effectiveness of our proposed approach. AntGPT achieves state-of-the-art +performance on all above benchmarks, and can successfully infer the goal and +thus perform goal-conditioned "counterfactual" prediction via qualitative +analysis. Code and model will be released at +https://brown-palm.github.io/AntGPT + +
+
+ comment: ICLR 2024 Camera Ready +
+
+
+
+
+ + ♻ ☆ Structure Matters: Tackling the Semantic Discrepancy in Diffusion Models + for Image Inpainting CVPR 2024 + + +
+ Denoising diffusion probabilistic models for image inpainting aim to add noise to the texture of the image during the forward process and recover masked regions with unmasked ones of the texture via the reverse denoising process. Despite the meaningful semantics generation, the existing arts suffer from the semantic discrepancy between masked and unmasked regions, since the semantically dense unmasked texture fails to be completely degraded while the masked regions turn to pure noise in the diffusion process, leading to a large discrepancy between them. In this paper, we aim to answer how unmasked semantics guide the texture denoising process, and how to tackle the semantic discrepancy, so as to facilitate consistent and meaningful semantics generation. To this end, we propose a novel structure-guided diffusion model named StrDiffusion, which reformulates the conventional texture denoising process under structure guidance to derive a simplified denoising objective for image inpainting, while revealing: 1) the semantically sparse structure is beneficial for tackling the semantic discrepancy in the early stage, while the dense texture generates reasonable semantics in the late stage; 2) the semantics from unmasked regions essentially offer time-dependent structure guidance for the texture denoising process, benefiting from the time-dependent sparsity of the structure semantics. For the denoising process, a structure-guided neural network is trained to estimate the simplified denoising objective by exploiting the consistency of the denoised structure between masked and unmasked regions. Besides, we devise an adaptive resampling strategy as a formal criterion for whether the structure is competent to guide the texture denoising process, while regulating their semantic correlations. Extensive experiments validate the merits of StrDiffusion over the state of the art. Our code is available at https://github.com/htyjers/StrDiffusion.
+
+ comment: 15 pages, 10 figures, to appear CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Posterior Distillation Sampling + + +
+ We introduce Posterior Distillation Sampling (PDS), a novel optimization +method for parametric image editing based on diffusion models. Existing +optimization-based methods, which leverage the powerful 2D prior of diffusion +models to handle various parametric images, have mainly focused on generation. +Unlike generation, editing requires a balance between conforming to the target +attribute and preserving the identity of the source content. Recent 2D image +editing methods have achieved this balance by leveraging the stochastic latent +encoded in the generative process of diffusion models. To extend the editing +capabilities of diffusion models shown in pixel space to parameter space, we +reformulate the 2D image editing method into an optimization form named PDS. +PDS matches the stochastic latents of the source and the target, enabling the +sampling of targets in diverse parameter spaces that align with a desired +attribute while maintaining the source's identity. We demonstrate that this +optimization resembles running a generative process with the target attribute, +but aligning this process with the trajectory of the source's generative +process. Extensive editing results in Neural Radiance Fields and Scalable +Vector Graphics representations demonstrate that PDS is capable of sampling +targets to fulfill the aforementioned balance across various parameter spaces. + +
+
+ comment: Project page: https://posterior-distillation-sampling.github.io/ +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 98 + +
+
+
+ + ☆ Towards Realistic Scene Generation with LiDAR Diffusion Models CVPR 2024 + + +
+ Diffusion models (DMs) excel in photo-realistic image synthesis, but their +adaptation to LiDAR scene generation poses a substantial hurdle. This is +primarily because DMs operating in the point space struggle to preserve the +curve-like patterns and 3D geometry of LiDAR scenes, which consumes much of +their representation power. In this paper, we propose LiDAR Diffusion Models +(LiDMs) to generate LiDAR-realistic scenes from a latent space tailored to +capture the realism of LiDAR scenes by incorporating geometric priors into the +learning pipeline. Our method targets three major desiderata: pattern realism, +geometry realism, and object realism. Specifically, we introduce curve-wise +compression to simulate real-world LiDAR patterns, point-wise coordinate +supervision to learn scene geometry, and patch-wise encoding for a full 3D +object context. With these three core designs, our method achieves competitive +performance on unconditional LiDAR generation in 64-beam scenario and state of +the art on conditional LiDAR generation, while maintaining high efficiency +compared to point-based DMs (up to 107$\times$ faster). Furthermore, by +compressing LiDAR scenes into a latent space, we enable the controllability of +DMs with various conditions such as semantic maps, camera views, and text +prompts. Our code and pretrained weights are available at +https://github.com/hancyran/LiDAR-Diffusion. + +
+
+ comment: CVPR 2024. Code available at + https://github.com/hancyran/LiDAR-Diffusion +
+
+
+
+
+ + ☆ GAMA-IR: Global Additive Multidimensional Averaging for Fast Image + Restoration + + +
+ Deep learning-based methods have shown remarkable success for various image +restoration tasks such as denoising and deblurring. The current +state-of-the-art networks are relatively deep and utilize (variants of) self +attention mechanisms. Those networks are significantly slower than shallow +convolutional networks, which however perform worse. In this paper, we +introduce an image restoration network that is both fast and yields excellent +image quality. The network is designed to minimize the latency and memory +consumption when executed on a standard GPU, while maintaining state-of-the-art +performance. The network is a simple shallow network with an efficient block +that implements global additive multidimensional averaging operations. This +block can capture global information and enable a large receptive field even +when used in shallow networks with minimal computational overhead. Through +extensive experiments and evaluations on diverse tasks, we demonstrate that our +network achieves comparable or even superior results to existing +state-of-the-art image restoration networks with less latency. For instance, we +exceed the state-of-the-art result on real-world SIDD denoising by 0.11dB, +while being 2 to 10 times faster. + +
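+ The abstract does not spell out the block's internals, so the following is only a guessed sketch of a "global additive multidimensional averaging" operation: per-row, per-column, and global averages are lightly transformed and added back to the feature map, giving every position an image-wide receptive field at little cost. The 1x1 projections are an assumption.

```python
import torch
import torch.nn as nn

class GlobalAdditiveAveraging(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.proj_h = nn.Conv2d(channels, channels, 1)
        self.proj_w = nn.Conv2d(channels, channels, 1)
        self.proj_g = nn.Conv2d(channels, channels, 1)

    def forward(self, x):                                   # x: (B, C, H, W)
        row = x.mean(dim=3, keepdim=True)                   # (B, C, H, 1) average over width
        col = x.mean(dim=2, keepdim=True)                   # (B, C, 1, W) average over height
        glob = x.mean(dim=(2, 3), keepdim=True)             # (B, C, 1, 1) global average
        return x + self.proj_h(row) + self.proj_w(col) + self.proj_g(glob)

block = GlobalAdditiveAveraging(48)
y = block(torch.randn(1, 48, 128, 128))
```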
+
+
+
+
+ + ☆ $R^2$-Tuning: Efficient Image-to-Video Transfer Learning for Video + Temporal Grounding + + +
+ Video temporal grounding (VTG) is a fine-grained video understanding problem +that aims to ground relevant clips in untrimmed videos given natural language +queries. Most existing VTG models are built upon frame-wise final-layer CLIP +features, aided by additional temporal backbones (e.g., SlowFast) with +sophisticated temporal reasoning mechanisms. In this work, we claim that CLIP +itself already shows great potential for fine-grained spatial-temporal +modeling, as each layer offers distinct yet useful information under different +granularity levels. Motivated by this, we propose Reversed Recurrent Tuning +($R^2$-Tuning), a parameter- and memory-efficient transfer learning framework +for video temporal grounding. Our method learns a lightweight $R^2$ Block +containing only 1.5% of the total parameters to perform progressive +spatial-temporal modeling. Starting from the last layer of CLIP, $R^2$ Block +recurrently aggregates spatial features from earlier layers, then refines +temporal correlation conditioning on the given query, resulting in a +coarse-to-fine scheme. $R^2$-Tuning achieves state-of-the-art performance +across three VTG tasks (i.e., moment retrieval, highlight detection, and video +summarization) on six public benchmarks (i.e., QVHighlights, Charades-STA, +Ego4D-NLQ, TACoS, YouTube Highlights, and TVSum) even without the additional +backbone, demonstrating the significance and effectiveness of the proposed +scheme. Our code is available at https://github.com/yeliudev/R2-Tuning. + +
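+ A hedged sketch of the recurrent aggregation idea: starting from the last CLIP layer, one lightweight block repeatedly fuses a running summary with each earlier layer's frame features, and a query-conditioned attention step then refines temporal correlation. The concrete fusion form, dimensions, and head count below are assumptions, not the released $R^2$ Block.

```python
import torch
import torch.nn as nn

class R2Block(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.fuse = nn.Sequential(nn.LayerNorm(2 * dim), nn.Linear(2 * dim, dim), nn.GELU())
        self.temporal = nn.MultiheadAttention(dim, num_heads=4, batch_first=True)

    def forward(self, layer_feats, query_feat):
        """layer_feats: list of (B, T, D) frame features, ordered shallow -> deep.
        query_feat: (B, D) pooled text query embedding."""
        state = layer_feats[-1]
        for feats in reversed(layer_feats[:-1]):             # recur from deep to shallow layers
            state = self.fuse(torch.cat([state, feats], dim=-1))
        q = query_feat.unsqueeze(1).expand_as(state)
        refined, _ = self.temporal(state + q, state, state)  # query-conditioned temporal step
        return refined                                       # (B, T, D) refined clip features

block = R2Block(dim=512)
feats = [torch.randn(2, 64, 512) for _ in range(4)]
out = block(feats, torch.randn(2, 512))
```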
+
+
+
+
+ + ☆ Disentangling Hippocampal Shape Variations: A Study of Neurological + Disorders Using Graph Variational Autoencoder with Contrastive Learning + + +
+ This paper presents a comprehensive study focused on disentangling +hippocampal shape variations from diffusion tensor imaging (DTI) datasets +within the context of neurological disorders. Leveraging a Graph Variational +Autoencoder (VAE) enhanced with Supervised Contrastive Learning, our approach +aims to improve interpretability by disentangling two distinct latent variables +corresponding to age and the presence of diseases. In our ablation study, we +investigate a range of VAE architectures and contrastive loss functions, +showcasing the enhanced disentanglement capabilities of our approach. This +evaluation uses synthetic 3D torus mesh data and real 3D hippocampal mesh +datasets derived from the DTI hippocampal dataset. Our supervised +disentanglement model outperforms several state-of-the-art (SOTA) methods like +attribute and guided VAEs in terms of disentanglement scores. Our model +distinguishes between age groups and disease status in patients with Multiple +Sclerosis (MS) using the hippocampus data. Our Graph VAE with Supervised +Contrastive Learning shows the volume changes of the hippocampus of MS +populations at different ages, and the result is consistent with the current +neuroimaging literature. This research provides valuable insights into the +relationship between neurological disorder and hippocampal shape changes in +different age groups of MS populations using a Graph VAE with Supervised +Contrastive loss. + +
+
+ comment: Length: 23 pages and submitted to the journal: MELBA (Machine + Learning for Biomedical Imaging) +
+
+
+
+
+ + ☆ Privacy-preserving Optics for Enhancing Protection in Face + De-identification CVPR 2024 + + +
+ The modern surge in camera usage alongside widespread computer vision +technology applications poses significant privacy and security concerns. +Current artificial intelligence (AI) technologies aid in recognizing relevant +events and assisting in daily tasks in homes, offices, hospitals, etc. The need +to access or process personal information for these purposes raises privacy +concerns. While software-level solutions like face de-identification provide a +good privacy/utility trade-off, they present vulnerabilities to sniffing +attacks. In this paper, we propose a hardware-level face de-identification +method to solve this vulnerability. Specifically, our approach first learns an +optical encoder along with a regression model to obtain a face heatmap while +hiding the face identity from the source image. We also propose an +anonymization framework that generates a new face using the privacy-preserving +image, face heatmap, and a reference face image from a public dataset as input. +We validate our approach with extensive simulations and hardware experiments. + +
+
+ comment: Accepted to CVPR 2024. Project Website and Code coming soon +
+
+
+
+
+ + ☆ Intensity-based 3D motion correction for cardiac MR images + + +
+ Cardiac magnetic resonance (CMR) image acquisition requires subjects to hold
+their breath while 2D cine images are acquired. This process assumes that the
+heart remains in the same position across all slices. However, differences in
+breath-hold positions or patient motion introduce 3D slice misalignments. In
+this work, we propose an algorithm that simultaneously aligns all short-axis
+(SA) and long-axis (LA) slices by maximizing the pair-wise intensity agreement
+between their intersections. Unlike previous works, our approach is formulated
+as a subject-specific optimization problem and requires no prior knowledge of
+the underlying anatomy. We quantitatively demonstrate that the proposed method
+is robust against a large range of rotations and translations by synthetically
+misaligning 10 motion-free datasets and aligning them back using the proposed
+method.
+
+
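+ A minimal sketch of this optimization formulation, assuming a hypothetical
+helper sample_profiles(offsets) that resamples every SA/LA intersection line
+under the current per-slice offsets and returns matched 1D intensity arrays;
+maximizing agreement is expressed here as minimizing disagreement, and this is
+only an illustration, not the authors' implementation:
+
+ import numpy as np
+ from scipy.optimize import minimize
+
+ def misalignment_cost(offsets, sample_profiles):
+     # sum of squared intensity disagreements over all slice intersections
+     cost = 0.0
+     for profile_a, profile_b in sample_profiles(offsets):
+         cost += np.mean((profile_a - profile_b) ** 2)
+     return cost
+
+ # offsets0: initial [n_slices * 3] vector of (tx, ty, rotation) per slice
+ # result = minimize(misalignment_cost, offsets0, args=(sample_profiles,),
+ #                   method='Powell')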
+
+
+
+
+ + ☆ Adapting to Length Shift: FlexiLength Network for Trajectory Prediction CVPR 2024 + + +
+ Trajectory prediction plays an important role in various applications,
+including autonomous driving, robotics, and scene understanding. Existing
+approaches mainly focus on developing compact neural networks to increase
+prediction precision on public datasets, typically employing a standardized
+input duration. However, a notable issue arises when these models are evaluated
+with varying observation lengths, leading to a significant performance drop, a
+phenomenon we term the Observation Length Shift. To address this issue, we
+introduce a general and effective framework, the FlexiLength Network (FLN), to
+enhance the robustness of existing trajectory prediction techniques against
+varying observation periods. Specifically, FLN integrates trajectory data with
+diverse observation lengths, incorporates FlexiLength Calibration (FLC) to
+acquire temporally invariant representations, and employs FlexiLength
+Adaptation (FLA) to further refine these representations for more accurate
+future trajectory predictions. Comprehensive experiments on multiple datasets,
+i.e., ETH/UCY, nuScenes, and Argoverse 1, demonstrate the effectiveness and
+flexibility of our proposed FLN framework.
+
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Rethinking Interactive Image Segmentation with Low Latency, High + Quality, and Diverse Prompts CVPR 2024 + + +
+ The goal of interactive image segmentation is to delineate specific regions
+within an image via visual or language prompts. Low-latency and high-quality
+interactive segmentation with diverse prompts remains challenging for existing
+specialist and generalist models. Specialist models, with their limited prompts
+and task-specific designs, experience high latency because the image must be
+recomputed every time the prompt is updated, due to the joint encoding of image
+and visual prompts. Generalist models, exemplified by the Segment Anything
+Model (SAM), have recently excelled in prompt diversity and efficiency, lifting
+image segmentation to the foundation model era. However, for high-quality
+segmentations, SAM still lags behind state-of-the-art specialist models despite
+being trained with 100x more segmentation masks. In this work, we delve
+deep into the architectural differences between the two types of models. We
+observe that dense representation and fusion of visual prompts are the key
+design choices contributing to the high segmentation quality of specialist
+models. In light of this, we reintroduce this dense design into the generalist
+models, to facilitate the development of generalist models with high
+segmentation quality. To densely represent diverse visual prompts, we propose
+to use a dense map to capture five types: clicks, boxes, polygons, scribbles,
+and masks. Thus, we propose SegNext, a next-generation interactive segmentation
+approach offering low latency, high quality, and diverse prompt support. Our
+method outperforms current state-of-the-art methods on HQSeg-44K and DAVIS,
+both quantitatively and qualitatively.
+
+
+
+ comment: CVPR 2024 https://github.com/uncbiag/SegNext +
+
+
+
+
+ + ☆ MugenNet: A Novel Combined Convolution Neural Network and Transformer + Network with its Application for Colonic Polyp Image Segmentation + + +
+ Biomedical image segmentation is a very important part of disease diagnosis.
+The term "colonic polyps" refers to polypoid lesions that occur on the surface
+of the colonic mucosa within the intestinal lumen. In clinical practice, early
+detection of polyps is conducted through colonoscopy examinations and
+biomedical image processing. Therefore, accurate polyp image segmentation
+is of great significance in colonoscopy examinations. Convolutional Neural
+Network (CNN) is a common automatic segmentation method, but its main
+disadvantage is the long training time. Transformer utilizes a self-attention
+mechanism, which essentially assigns different importance weights to each piece
+of information, thus achieving high computational efficiency during
+segmentation. However, a potential drawback is the risk of information loss. In
+the study reported in this paper, based on the well-known hybridization
+principle, we proposed a method to combine CNN and Transformer to retain the
+strengths of both, and we applied this method to build a system called MugenNet
+for colonic polyp image segmentation. We conducted a comprehensive experiment
+to compare MugenNet with other CNN models on five publicly available datasets.
+An ablation experiment on MugenNet was conducted as well. The experimental
+results show that MugenNet achieves significantly higher processing speed and
+accuracy compared with CNN alone. The generalized implication of our work is
+a method to optimally combine two complementary methods of machine learning.
+
+
+
+
+
+
+ + ☆ Absolute-Unified Multi-Class Anomaly Detection via Class-Agnostic + Distribution Alignment + + +
+ Conventional unsupervised anomaly detection (UAD) methods build separate +models for each object category. Recent studies have proposed to train a +unified model for multiple classes, namely model-unified UAD. However, such +methods still implement the unified model separately on each class during +inference with respective anomaly decision thresholds, which hinders their +application when the image categories are entirely unavailable. In this work, +we present a simple yet powerful method to address multi-class anomaly +detection without any class information, namely \textit{absolute-unified} UAD. +We target the crux of prior works in this challenging setting: different +objects have mismatched anomaly score distributions. We propose Class-Agnostic +Distribution Alignment (CADA) to align the mismatched score distribution of +each implicit class without knowing class information, which enables unified +anomaly detection for all classes and samples. The essence of CADA is to +predict each class's score distribution of normal samples given any image, +normal or anomalous, of this class. As a general component, CADA can activate +the potential of nearly all UAD methods under absolute-unified setting. Our +approach is extensively evaluated under the proposed setting on two popular UAD +benchmark datasets, MVTec AD and VisA, where we exceed previous +state-of-the-art by a large margin. + +
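+ The core of the alignment step can be pictured as a per-sample score
+normalization; the sketch below assumes a hypothetical predictor that returns
+the mean and standard deviation of normal-sample scores for the image's
+(unknown) class, which is the role CADA plays in the paper:
+
+ import numpy as np
+
+ def aligned_score(raw_score, image, predict_normal_stats):
+     mu, sigma = predict_normal_stats(image)     # predicted normal-score distribution
+     return (raw_score - mu) / max(sigma, 1e-6)  # comparable across object categories
+
+ # With aligned scores a single threshold tau can be shared by all classes:
+ # is_anomalous = aligned_score(score, img, predictor) > tau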
+
+
+
+
+ + ☆ End-to-End Autonomous Driving through V2X Cooperation + + +
+ Cooperatively utilizing both ego-vehicle and infrastructure sensor data via
+V2X communication has emerged as a promising approach for advanced autonomous
+driving. However, current research mainly focuses on improving individual
+modules, rather than adopting end-to-end learning to optimize final planning
+performance, resulting in underutilized data potential. In this paper, we
+introduce UniV2X, a pioneering cooperative autonomous driving framework that
+seamlessly integrates all key driving modules across diverse views into a
+unified network. We propose a sparse-dense hybrid data transmission and fusion
+mechanism for effective vehicle-infrastructure cooperation, offering three
+advantages: 1) Effective for simultaneously enhancing agent perception, online
+mapping, and occupancy prediction, ultimately improving planning performance.
+2) Transmission-friendly for practical and limited communication conditions. 3)
+Reliable data fusion with interpretability of this hybrid data. We implement
+UniV2X, and reproduce several benchmark methods, on the challenging DAIR-V2X
+real-world cooperative driving dataset. Experimental results demonstrate the
+effectiveness of UniV2X in significantly enhancing planning performance, as
+well as all intermediate output performance. Code is at
+https://github.com/AIR-THU/UniV2X.
+
+
+
+
+
+
+ + ☆ Neural Radiance Field-based Visual Rendering: A Comprehensive Review + + +
+ In recent years, Neural Radiance Fields (NeRF) have made remarkable progress
+in the field of computer vision and graphics, providing strong technical
+support for key tasks such as 3D scene understanding, novel view synthesis,
+human body reconstruction, and robotics, and academic attention to this line
+of research keeps growing. As a revolutionary neural implicit field
+representation, NeRF has caused a continuous research boom in the academic
+community. Therefore, the purpose of this review is to provide an in-depth
+analysis of the research literature on NeRF within the past two years, to
+provide a comprehensive academic perspective for budding researchers. In this
+paper, the core architecture of NeRF is first elaborated in detail, followed by
+a discussion of various improvement strategies for NeRF, and case studies of
+NeRF in diverse application scenarios, demonstrating its practical utility in
+different domains. In terms of datasets and evaluation metrics, this paper
+details the key resources needed for NeRF model training. Finally, this paper
+provides a prospective discussion on the future development trends and
+potential challenges of NeRF, aiming to provide research inspiration for
+researchers in the field and to promote the further development of related
+technologies.
+
+
+
+ comment: 35 pages, 22 figures, 14 tables, 18 formulas +
+
+
+
+
+ + ☆ Unknown Prompt, the only Lacuna: Unveiling CLIP's Potential for Open + Domain Generalization CVPR 2024 + + +
+ We delve into Open Domain Generalization (ODG), marked by domain and category
+shifts between training's labeled source and testing's unlabeled target
+domains. Existing solutions to ODG face limitations due to constrained
+generalizations of traditional CNN backbones and errors in detecting target
+open samples in the absence of prior knowledge. Addressing these pitfalls, we
+introduce ODG-CLIP, harnessing the semantic prowess of the vision-language
+model, CLIP. Our framework brings forth three primary innovations: Firstly,
+distinct from prevailing paradigms, we conceptualize ODG as a multi-class
+classification challenge encompassing both known and novel categories. Central
+to our approach is modeling a unique prompt tailored for detecting unknown
+class samples, and to train this, we employ a readily accessible stable
+diffusion model, elegantly generating proxy images for the open class.
+Secondly, aiming for domain-tailored classification (prompt) weights while
+ensuring a balance of precision and simplicity, we devise a novel visual
+style-centric prompt learning mechanism. Finally, we infuse images with
+class-discriminative knowledge derived from the prompt space to augment the
+fidelity of CLIP's visual embeddings. We introduce a novel objective to
+safeguard the continuity of this infused semantic intel across domains,
+especially for the shared classes. Through rigorous testing on diverse
+datasets, covering closed and open-set DG contexts, ODG-CLIP demonstrates clear
+supremacy, consistently outpacing peers with performance boosts of 8%-16%.
+Code will be available at https://github.com/mainaksingha01/ODG-CLIP.
+
+
+
+ comment: Accepted in CVPR 2024 +
+
+
+
+
+ + ☆ Training-Free Semantic Segmentation via LLM-Supervision + + +
+ Recent advancements in open vocabulary models, like CLIP, have notably +advanced zero-shot classification and segmentation by utilizing natural +language for class-specific embeddings. However, most research has focused on +improving model accuracy through prompt engineering, prompt learning, or +fine-tuning with limited labeled data, thereby overlooking the importance of +refining the class descriptors. This paper introduces a new approach to +text-supervised semantic segmentation using supervision by a large language +model (LLM) that does not require extra training. Our method starts from an +LLM, like GPT-3, to generate a detailed set of subclasses for more accurate +class representation. We then employ an advanced text-supervised semantic +segmentation model to apply the generated subclasses as target labels, +resulting in diverse segmentation results tailored to each subclass's unique +characteristics. Additionally, we propose an assembly that merges the +segmentation maps from the various subclass descriptors to ensure a more +comprehensive representation of the different aspects in the test images. +Through comprehensive experiments on three standard benchmarks, our method +outperforms traditional text-supervised semantic segmentation methods by a +marked margin. + +
+
+ comment: 22 pages,10 figures, conference +
+
+
+
+
+ + ☆ DMSSN: Distilled Mixed Spectral-Spatial Network for Hyperspectral + Salient Object Detection + + +
+ Hyperspectral salient object detection (HSOD) has exhibited remarkable +promise across various applications, particularly in intricate scenarios where +conventional RGB-based approaches fall short. Despite the considerable progress +in HSOD method advancements, two critical challenges require immediate +attention. Firstly, existing hyperspectral data dimension reduction techniques +incur a loss of spectral information, which adversely affects detection +accuracy. Secondly, previous methods insufficiently harness the inherent +distinctive attributes of hyperspectral images (HSIs) during the feature +extraction process. To address these challenges, we propose a novel approach +termed the Distilled Mixed Spectral-Spatial Network (DMSSN), comprising a +Distilled Spectral Encoding process and a Mixed Spectral-Spatial Transformer +(MSST) feature extraction network. The encoding process utilizes knowledge +distillation to construct a lightweight autoencoder for dimension reduction, +striking a balance between robust encoding capabilities and low computational +costs. The MSST extracts spectral-spatial features through multiple attention +head groups, collaboratively enhancing its resistance to intricate scenarios. +Moreover, we have created a large-scale HSOD dataset, HSOD-BIT, to tackle the +issue of data scarcity in this field and meet the fundamental data requirements +of deep network training. Extensive experiments demonstrate that our proposed +DMSSN achieves state-of-the-art performance on multiple datasets. We will soon +make the code and dataset publicly available on +https://github.com/anonymous0519/HSOD-BIT. + +
+
+
+
+
+ + ☆ Learning to Rank Patches for Unbiased Image Redundancy Reduction + + +
+ Images suffer from heavy spatial redundancy because pixels in neighboring +regions are spatially correlated. Existing approaches strive to overcome this +limitation by reducing less meaningful image regions. However, current leading +methods rely on supervisory signals. They may compel models to preserve content +that aligns with labeled categories and discard content belonging to unlabeled +categories. This categorical inductive bias makes these methods less effective +in real-world scenarios. To address this issue, we propose a self-supervised +framework for image redundancy reduction called Learning to Rank Patches +(LTRP). We observe that image reconstruction of masked image modeling models is +sensitive to the removal of visible patches when the masking ratio is high +(e.g., 90\%). Building upon it, we implement LTRP via two steps: inferring the +semantic density score of each patch by quantifying variation between +reconstructions with and without this patch, and learning to rank the patches +with the pseudo score. The entire process is self-supervised, thus getting out +of the dilemma of categorical inductive bias. We design extensive experiments +on different datasets and tasks. The results demonstrate that LTRP outperforms +both supervised and other self-supervised methods due to the fair assessment of +image content. + +
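+ The patch-scoring step can be summarized as a leave-one-out comparison of
+reconstructions; the sketch below assumes a hypothetical reconstruct() wrapper
+around a masked image modeling model and is only an illustration of the idea,
+not the released implementation:
+
+ import torch
+
+ def semantic_density_scores(visible_idx, reconstruct):
+     """visible_idx: list of visible patch ids; reconstruct(ids) -> image tensor."""
+     baseline = reconstruct(visible_idx)
+     scores = []
+     for i in range(len(visible_idx)):
+         reduced = visible_idx[:i] + visible_idx[i + 1:]   # drop one visible patch
+         variation = (reconstruct(reduced) - baseline).abs().mean()
+         scores.append(variation.item())                   # pseudo score used for ranking
+     return torch.tensor(scores)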
+
+
+
+
+ + ☆ Weak-to-Strong 3D Object Detection with X-Ray Distillation + + +
+ This paper addresses the critical challenges of sparsity and occlusion in +LiDAR-based 3D object detection. Current methods often rely on supplementary +modules or specific architectural designs, potentially limiting their +applicability to new and evolving architectures. To our knowledge, we are the +first to propose a versatile technique that seamlessly integrates into any +existing framework for 3D Object Detection, marking the first instance of +Weak-to-Strong generalization in 3D computer vision. We introduce a novel +framework, X-Ray Distillation with Object-Complete Frames, suitable for both +supervised and semi-supervised settings, that leverages the temporal aspect of +point cloud sequences. This method extracts crucial information from both +previous and subsequent LiDAR frames, creating Object-Complete frames that +represent objects from multiple viewpoints, thus addressing occlusion and +sparsity. Given the limitation of not being able to generate Object-Complete +frames during online inference, we utilize Knowledge Distillation within a +Teacher-Student framework. This technique encourages the strong Student model +to emulate the behavior of the weaker Teacher, which processes simple and +informative Object-Complete frames, effectively offering a comprehensive view +of objects as if seen through X-ray vision. Our proposed methods surpass +state-of-the-art in semi-supervised learning by 1-1.5 mAP and enhance the +performance of five established supervised models by 1-2 mAP on standard +autonomous driving datasets, even with default hyperparameters. Code for +Object-Complete frames is available here: +https://github.com/sakharok13/X-Ray-Teacher-Patching-Tools. + +
+
+ comment: Computer Vision and Pattern Recognition 2024 +
+
+
+
+
+ + ☆ OmniSDF: Scene Reconstruction using Omnidirectional Signed Distance + Functions and Adaptive Binoctrees + + +
+ We present a method to reconstruct indoor and outdoor static scene geometry +and appearance from an omnidirectional video moving in a small circular sweep. +This setting is challenging because of the small baseline and large depth +ranges, making it difficult to find ray crossings. To better constrain the +optimization, we estimate geometry as a signed distance field within a +spherical binoctree data structure and use a complementary efficient tree +traversal strategy based on a breadth-first search for sampling. Unlike regular +grids or trees, the shape of this structure well-matches the camera setting, +creating a better memory-quality trade-off. From an initial depth estimate, the +binoctree is adaptively subdivided throughout the optimization; previous +methods use a fixed depth that leaves the scene undersampled. In comparison +with three neural optimization methods and two non-neural methods, ours shows +decreased geometry error on average, especially in a detailed scene, while +significantly reducing the required number of voxels to represent such details. + +
+
+
+
+
+ + ☆ OmniLocalRF: Omnidirectional Local Radiance Fields from Dynamic Videos + + +
+ Omnidirectional cameras are extensively used in various applications to +provide a wide field of vision. However, they face a challenge in synthesizing +novel views due to the inevitable presence of dynamic objects, including the +photographer, in their wide field of view. In this paper, we introduce a new +approach called Omnidirectional Local Radiance Fields (OmniLocalRF) that can +render static-only scene views, removing and inpainting dynamic objects +simultaneously. Our approach combines the principles of local radiance fields +with the bidirectional optimization of omnidirectional rays. Our input is an +omnidirectional video, and we evaluate the mutual observations of the entire +angle between the previous and current frames. To reduce ghosting artifacts of +dynamic objects and inpaint occlusions, we devise a multi-resolution motion +mask prediction module. Unlike existing methods that primarily separate dynamic +components through the temporal domain, our method uses multi-resolution neural +feature planes for precise segmentation, which is more suitable for long +360-degree videos. Our experiments validate that OmniLocalRF outperforms +existing methods in both qualitative and quantitative metrics, especially in +scenarios with complex real-world scenes. In particular, our approach +eliminates the need for manual interaction, such as drawing motion masks by +hand and additional pose estimation, making it a highly effective and efficient +solution. + +
+
+
+
+
+ + ☆ Knowledge NeRF: Few-shot Novel View Synthesis for Dynamic Articulated + Objects + + +
+ We present Knowledge NeRF to synthesize novel views for dynamic scenes.
+Reconstructing dynamic 3D scenes from few sparse views and rendering them from
+arbitrary perspectives is a challenging problem with applications in various
+domains. Previous dynamic NeRF methods learn the deformation of articulated
+objects from monocular videos. However, the quality of their reconstructed
+scenes is limited. To clearly reconstruct dynamic scenes, we propose a new
+framework by considering two frames at a time. We pretrain a NeRF model for an
+articulated object. When the articulated object moves, Knowledge NeRF learns to
+generate novel views at the new state by incorporating past knowledge in the
+pretrained NeRF model with minimal observations in the present state. We
+propose a projection module to adapt NeRF for dynamic scenes, learning the
+correspondence between the pretrained knowledge base and current states.
+Experimental results demonstrate the effectiveness of our method in
+reconstructing dynamic 3D scenes with 5 input images in one state. Knowledge
+NeRF is a new pipeline and promising solution for novel view synthesis of
+dynamic articulated objects. The data and implementation are publicly available
+at https://github.com/RussRobin/Knowledge_NeRF.
+
+
+
+
+
+
+ + ☆ A General and Efficient Training for Transformer via Token Expansion CVPR 2024 + + +
+ The remarkable performance of Vision Transformers (ViTs) typically requires
+an extremely large training cost. Existing methods have attempted to accelerate
+the training of ViTs, yet they typically disregard method universality and
+suffer accuracy drops. Meanwhile, they break the training consistency of the
+original transformers, including the consistency of hyper-parameters,
+architecture, and strategy, which prevents them from being widely applied to
+different Transformer networks. In this paper, we propose a novel token growth
+scheme, Token Expansion (termed ToE), to achieve consistent training
+acceleration for ViTs. We introduce an "initialization-expansion-merging"
+pipeline to maintain the integrity of the intermediate feature distribution of
+original transformers, preventing the loss of crucial learnable information in
+the training process. ToE can not only be seamlessly integrated into the
+training and fine-tuning process of transformers (e.g., DeiT and LV-ViT), but
+is also effective for efficient training frameworks (e.g., EfficientTrain),
+without altering the original training hyper-parameters or architecture, and
+without introducing additional training strategies. Extensive experiments
+demonstrate that ToE achieves about 1.3x faster training of ViTs in a lossless
+manner, or even with performance gains over the full-token training baselines.
+Code is available at https://github.com/Osilly/TokenExpansion .
+
+
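+ A rough sketch of a token-growth schedule in the spirit of the
+"initialization-expansion-merging" pipeline; the keep-ratio schedule and the
+even-spread selection below are illustrative choices, not the official ToE
+implementation:
+
+ import torch
+
+ def keep_tokens(tokens, epoch, total_epochs, start_ratio=0.25):
+     """tokens: [batch, num_tokens, dim]; returns the (possibly reduced) token set."""
+     ratio = min(1.0, start_ratio + (1.0 - start_ratio) * epoch / max(total_epochs - 1, 1))
+     num_keep = max(1, int(tokens.shape[1] * ratio))
+     idx = torch.linspace(0, tokens.shape[1] - 1, num_keep).long()   # evenly spread tokens
+     return tokens[:, idx, :]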
+
+ comment: Accepted to CVPR 2024. Code is available at + https://github.com/Osilly/TokenExpansion +
+
+
+
+
+ + ☆ Statistical Analysis by Semiparametric Additive Regression and LSTM-FCN + Based Hierarchical Classification for Computer Vision Quantification of + Parkinsonian Bradykinesia + + +
+ Bradykinesia, characterized by involuntary slowing or decrement of movement, +is a fundamental symptom of Parkinson's Disease (PD) and is vital for its +clinical diagnosis. Despite various methodologies explored to quantify +bradykinesia, computer vision-based approaches have shown promising results. +However, these methods often fall short in adequately addressing key +bradykinesia characteristics in repetitive limb movements: "occasional arrest" +and "decrement in amplitude." + This research advances vision-based quantification of bradykinesia by +introducing nuanced numerical analysis to capture decrement in amplitudes and +employing a simple deep learning technique, LSTM-FCN, for precise +classification of occasional arrests. Our approach structures the +classification process hierarchically, tailoring it to the unique dynamics of +bradykinesia in PD. + Statistical analysis of the extracted features, including those representing +arrest and fatigue, has demonstrated their statistical significance in most +cases. This finding underscores the importance of considering "occasional +arrest" and "decrement in amplitude" in bradykinesia quantification of limb +movement. Our enhanced diagnostic tool has been rigorously tested on an +extensive dataset comprising 1396 motion videos from 310 PD patients, achieving +an accuracy of 80.3%. The results confirm the robustness and reliability of our +method. + +
+
+
+
+
+ + ☆ Weakly-Supervised Cross-Domain Segmentation of Electron Microscopy with + Sparse Point Annotation + + +
+ Accurate segmentation of organelle instances from electron microscopy (EM)
+images plays an essential role in much neuroscience research. However,
+practical scenarios usually suffer from high annotation costs, label scarcity,
+and large domain diversity. While unsupervised domain adaptation (UDA) that
+assumes no annotation effort on the target data is promising to alleviate these
+challenges, its performance on complicated segmentation tasks is still far from
+practical usage. To address these issues, we investigate a highly
+annotation-efficient weak supervision, which assumes only sparse center-points
+on a small subset of object instances in the target training images. To achieve
+accurate segmentation with partial point annotations, we introduce instance
+counting and center detection as auxiliary tasks and design a multitask
+learning framework to leverage correlations among the counting, detection, and
+segmentation, which are all tasks with partial or no supervision. Building upon
+the different domain-invariances of the three tasks, we enforce counting
+estimation with a novel soft consistency loss as a global prior for center
+detection, which further guides the per-pixel segmentation. To further
+compensate for annotation sparsity, we develop a cross-position cut-and-paste
+for label augmentation and an entropy-based pseudo-label selection. The
+experimental results highlight that, by simply using extremely weak annotation,
+e.g., 15\% sparse points, for model training, the proposed model is capable of
+significantly outperforming UDA methods and produces performance comparable to
+the supervised counterpart. The high robustness of our model shown in the
+validations and the low requirement of expert knowledge for sparse point
+annotation further improve the potential application value of our model.
+
+
+
+
+
+
+ + ☆ DeeDSR: Towards Real-World Image Super-Resolution via Degradation-Aware + Stable Diffusion + + +
+ Diffusion models, known for their powerful generative capabilities, play a +crucial role in addressing real-world super-resolution challenges. However, +these models often focus on improving local textures while neglecting the +impacts of global degradation, which can significantly reduce semantic fidelity +and lead to inaccurate reconstructions and suboptimal super-resolution +performance. To address this issue, we introduce a novel two-stage, +degradation-aware framework that enhances the diffusion model's ability to +recognize content and degradation in low-resolution images. In the first stage, +we employ unsupervised contrastive learning to obtain representations of image +degradations. In the second stage, we integrate a degradation-aware module into +a simplified ControlNet, enabling flexible adaptation to various degradations +based on the learned representations. Furthermore, we decompose the +degradation-aware features into global semantics and local details branches, +which are then injected into the diffusion denoising module to modulate the +target generation. Our method effectively recovers semantically precise and +photorealistic details, particularly under significant degradation conditions, +demonstrating state-of-the-art performance across various benchmarks. Codes +will be released at https://github.com/bichunyang419/DeeDSR. + +
+
+
+
+
+ + ☆ Dual DETRs for Multi-Label Temporal Action Detection CVPR 2024 + + +
+ Temporal Action Detection (TAD) aims to identify the action boundaries and +the corresponding category within untrimmed videos. Inspired by the success of +DETR in object detection, several methods have adapted the query-based +framework to the TAD task. However, these approaches primarily followed DETR to +predict actions at the instance level (i.e., identify each action by its center +point), leading to sub-optimal boundary localization. To address this issue, we +propose a new Dual-level query-based TAD framework, namely DualDETR, to detect +actions from both instance-level and boundary-level. Decoding at different +levels requires semantics of different granularity, therefore we introduce a +two-branch decoding structure. This structure builds distinctive decoding +processes for different levels, facilitating explicit capture of temporal cues +and semantics at each level. On top of the two-branch design, we present a +joint query initialization strategy to align queries from both levels. +Specifically, we leverage encoder proposals to match queries from each level in +a one-to-one manner. Then, the matched queries are initialized using position +and content prior from the matched action proposal. The aligned dual-level +queries can refine the matched proposal with complementary cues during +subsequent decoding. We evaluate DualDETR on three challenging multi-label TAD +benchmarks. The experimental results demonstrate the superior performance of +DualDETR to the existing state-of-the-art methods, achieving a substantial +improvement under det-mAP and delivering impressive results under seg-mAP. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Deep Instruction Tuning for Segment Anything Model + + +
+ The Segment Anything Model (SAM) has recently exhibited powerful yet versatile
+capabilities on (un)conditional image segmentation tasks. Although SAM can
+support various segmentation prompts, we note that, compared to point- and
+box-guided segmentation, it performs much worse on text-instructed tasks. We
+argue that deep text instruction tuning is key to mitigating this shortcoming,
+which is caused by the shallow fusion scheme in its default light-weight mask
+decoder. In this paper, two \emph{deep instruction tuning} (DIT) methods are
+proposed, one end-to-end and the other layer-wise. With these tuning methods,
+we can regard the image encoder of SAM as a stand-alone vision-language
+learner, in contrast to building another deep fusion branch. Extensive
+experiments on three highly competitive benchmark datasets of referring image
+segmentation show that a simple end-to-end DIT improves SAM by a large margin,
+while layer-wise DIT further boosts the performance to state-of-the-art. Our
+code is anonymously released at: https://github.com/wysnzzzz/DIT.
+
+
+
+
+
+
+ + ☆ SpiralMLP: A Lightweight Vision MLP Architecture + + +
+ We present SpiralMLP, a novel architecture that introduces a Spiral FC layer +as a replacement for the conventional Token Mixing approach. Differing from +several existing MLP-based models that primarily emphasize axes, our Spiral FC +layer is designed as a deformable convolution layer with spiral-like offsets. +We further adapt Spiral FC into two variants: Self-Spiral FC and Cross-Spiral +FC, which enable both local and global feature integration seamlessly, +eliminating the need for additional processing steps. To thoroughly investigate +the effectiveness of the spiral-like offsets and validate our design, we +conduct ablation studies and explore optimal configurations. In empirical +tests, SpiralMLP reaches state-of-the-art performance, similar to Transformers, +CNNs, and other MLPs, benchmarking on ImageNet-1k, COCO and ADE20K. SpiralMLP +still maintains linear computational complexity O(HW) and is compatible with +varying input image resolutions. Our study reveals that targeting the full +receptive field is not essential for achieving high performance, instead, +adopting a refined approach offers better results. + +
+
+
+
+
+ + ☆ Attire-Based Anomaly Detection in Restricted Areas Using YOLOv8 for + Enhanced CCTV Security + + +
+ This research introduces an innovative security enhancement approach, +employing advanced image analysis and soft computing. The focus is on an +intelligent surveillance system that detects unauthorized individuals in +restricted areas by analyzing attire. Traditional security measures face +challenges in monitoring unauthorized access. Leveraging YOLOv8, an advanced +object detection algorithm, our system identifies authorized personnel based on +their attire in CCTV footage. The methodology involves training the YOLOv8 +model on a comprehensive dataset of uniform patterns, ensuring precise +recognition in specific regions. Soft computing techniques enhance adaptability +to dynamic environments and varying lighting conditions. This research +contributes to image analysis and soft computing, providing a sophisticated +security solution. Emphasizing uniform-based anomaly detection, it establishes +a foundation for robust security systems in restricted areas. The outcomes +highlight the potential of YOLOv8-based surveillance in ensuring safety in +sensitive locations. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ IPT-V2: Efficient Image Processing Transformer using Hierarchical + Attentions + + +
+ Recent advances have demonstrated the powerful capability of the transformer
+architecture in image restoration. However, our analysis indicates that
+existing transformer-based methods cannot establish both exact global and local
+dependencies simultaneously, which are critical for restoring the details and
+missing content of degraded images. To this end, we present an efficient image
+processing transformer architecture with hierarchical attentions, called
+IPT-V2, adopting a focal context self-attention (FCSA) and a global grid
+self-attention (GGSA) to obtain adequate token interactions in local and global
+receptive fields. Specifically, FCSA applies the shifted window mechanism to
+the channel self-attention, which helps capture the local context and mutual
+interaction across channels. GGSA constructs long-range dependencies in the
+cross-window grid, aggregating global information in the spatial dimension.
+Moreover, we introduce a structural re-parameterization technique into the
+feed-forward network to further improve the model capability. Extensive
+experiments demonstrate that our proposed IPT-V2 achieves state-of-the-art
+results on various image processing tasks, covering denoising, deblurring, and
+deraining, and obtains a much better trade-off between performance and
+computational complexity than previous methods. Besides, we extend our method
+to image generation as a latent diffusion backbone, and it significantly
+outperforms DiTs.
+
+
+
+
+
+
+ + ☆ Domain Generalizable Person Search Using Unreal Dataset AAAI2024 + + +
+ Collecting and labeling real datasets to train person search networks not
+only requires a lot of time and effort, but also raises privacy issues.
+Weakly-supervised and unsupervised domain adaptation methods have been proposed
+to alleviate the labeling burden for target datasets; however, their
+generalization capability is limited. We introduce a novel person search method
+based on the domain generalization framework that uses an automatically
+labeled unreal dataset only for training but is applicable to arbitrary unseen
+real datasets. To alleviate the domain gaps when transferring knowledge
+from the unreal source dataset to the real target datasets, we estimate the
+fidelity of person instances, which is then used to train the end-to-end
+network adaptively. Moreover, we devise a domain-invariant feature learning
+scheme to encourage the network to suppress domain-related features.
+Experimental results demonstrate that the proposed method provides performance
+competitive with existing person search methods even though it is applicable to
+arbitrary unseen datasets without any prior knowledge or re-training burden.
+
+
+
+ comment: AAAI2024 accepted +
+
+
+
+
+ + ☆ A Multi-Branched Radial Basis Network Approach to Predicting Complex + Chaotic Behaviours + + +
+ In this study, we propose a multi-branched network approach to predict the
+dynamics of a physics attractor characterized by intricate and chaotic
+behavior. We introduce a unique neural network architecture composed of Radial
+Basis Function (RBF) layers combined with an attention mechanism designed to
+effectively capture nonlinear inter-dependencies inherent in the attractor's
+temporal evolution. Our results demonstrate successful prediction of the
+attractor's trajectory across 100 predictions made using a real-world dataset
+of 36,700 time-series observations encompassing approximately 28 minutes of
+activity. To further illustrate the performance of our proposed technique, we
+provide comprehensive visualizations depicting the attractor's original and
+predicted behaviors alongside quantitative measures comparing observed versus
+estimated outcomes. Overall, this work showcases the potential of advanced
+machine learning algorithms in elucidating hidden structures in complex
+physical systems while offering practical applications in various domains
+requiring accurate short-term forecasting capabilities.
+
+
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ☆ Object-level Copy-Move Forgery Image Detection based on Inconsistency + Mining + + +
+ In copy-move tampering operations, perpetrators often employ techniques, such
+as blurring, to conceal tampering traces, posing significant challenges to the
+detection of object-level targets with intact structures. Focusing on these
+challenges, this paper proposes an object-level copy-move forgery image
+detection method based on inconsistency mining (IMNet). To obtain complete
+object-level targets, we customize prototypes for both the source and tampered
+regions and dynamically update them. Additionally, we extract inconsistent
+regions between coarse similar regions obtained through self-correlation
+calculations and regions composed of prototypes. The detected inconsistent
+regions are used as supplements to coarse similar regions to refine pixel-level
+detection. We conduct experiments on three public datasets, which validate the
+effectiveness and robustness of the proposed IMNet.
+
+
+
+ comment: 4 pages, 2 figures +
+
+
+
+
+ + ☆ Weak Distribution Detectors Lead to Stronger Generalizability of + Vision-Language Prompt Tuning AAAI2024 + + +
+ We propose a generalized method for boosting the generalization ability of
+pre-trained vision-language models (VLMs) while fine-tuning on downstream
+few-shot tasks. The idea is realized by exploiting out-of-distribution (OOD)
+detection to predict whether a sample belongs to a base distribution or a novel
+distribution and then using the score generated by a dedicated
+competition-based scoring function to fuse the zero-shot and few-shot
+classifiers. The fused classifier is dynamic: it biases towards the zero-shot
+classifier if a sample is more likely to come from the pre-training
+distribution, leading to improved base-to-novel generalization ability. Our
+method operates only at test time and can be applied to boost existing methods
+without time-consuming re-training. Extensive experiments show that even weak
+distribution detectors can still improve VLMs' generalization ability.
+Specifically, with the help of OOD detectors, the harmonic means of CoOp and
+ProGrad increase by 2.6 and 1.5 percentage points over 11 recognition datasets
+in the base-to-novel setting.
+
+
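+ The fusion rule can be pictured as a score-dependent convex combination of the
+two classifiers' logits; the weighting function below is an illustrative
+choice, not the paper's competition-based scoring function:
+
+ import torch
+
+ def fused_logits(zero_shot_logits, few_shot_logits, base_score):
+     """base_score in [0, 1]: higher means the sample looks like the base distribution."""
+     w = torch.sigmoid(torch.tensor(4.0 * (base_score - 0.5)))   # soft fusion weight
+     return w * zero_shot_logits + (1.0 - w) * few_shot_logits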
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ☆ Parameter and Data-Efficient Spectral StyleDCGAN ICLR + + +
+ We present a simple, highly parameter- and data-efficient adversarial network
+for unconditional face generation. Our method, Spectral Style-DCGAN (SSD),
+utilizes only 6.574 million parameters and 4739 dog faces from the Animal Faces
+HQ (AFHQ) dataset as training samples while preserving fidelity at low
+resolutions up to 64x64. Code available at
+https://github.com/Aryan-Garg/StyleDCGAN.
+
+
+
+ comment: Notable ICLR Tiny Paper 2024 +
+
+
+
+
+ + ☆ LAESI: Leaf Area Estimation with Synthetic Imagery + + +
+ We introduce LAESI, a Synthetic Leaf Dataset of 100,000 synthetic leaf images +on millimeter paper, each with semantic masks and surface area labels. This +dataset provides a resource for leaf morphology analysis primarily aimed at +beech and oak leaves. We evaluate the applicability of the dataset by training +machine learning models for leaf surface area prediction and semantic +segmentation, using real images for validation. Our validation shows that these +models can be trained to predict leaf surface area with a relative error not +greater than an average human annotator. LAESI also provides an efficient +framework based on 3D procedural models and generative AI for the large-scale, +controllable generation of data with potential further applications in +agriculture and biology. We evaluate the inclusion of generative AI in our +procedural data generation pipeline and show how data filtering based on +annotation consistency results in datasets which allow training the highest +performing vision models. + +
+
+ comment: 10 pages, 12 figures, 1 table +
+
+
+
+
+ + ☆ Memory-based Cross-modal Semantic Alignment Network for Radiology Report + Generation + + +
+ Generating radiology reports automatically reduces the workload of +radiologists and helps the diagnoses of specific diseases. Many existing +methods take this task as modality transfer process. However, since the key +information related to disease accounts for a small proportion in both image +and report, it is hard for the model to learn the latent relation between the +radiology image and its report, thus failing to generate fluent and accurate +radiology reports. To tackle this problem, we propose a memory-based +cross-modal semantic alignment model (MCSAM) following an encoder-decoder +paradigm. MCSAM includes a well initialized long-term clinical memory bank to +learn disease-related representations as well as prior knowledge for different +modalities to retrieve and use the retrieved memory to perform feature +consolidation. To ensure the semantic consistency of the retrieved cross modal +prior knowledge, a cross-modal semantic alignment module (SAM) is proposed. SAM +is also able to generate semantic visual feature embeddings which can be added +to the decoder and benefits report generation. More importantly, to memorize +the state and additional information while generating reports with the decoder, +we use learnable memory tokens which can be seen as prompts. Extensive +experiments demonstrate the promising performance of our proposed method which +generates state-of-the-art performance on the MIMIC-CXR dataset. + +
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ M3D: Advancing 3D Medical Image Analysis with Multi-Modal Large Language + Models + + +
+ Medical image analysis is essential to clinical diagnosis and treatment, +which is increasingly supported by multi-modal large language models (MLLMs). +However, previous research has primarily focused on 2D medical images, leaving +3D images under-explored, despite their richer spatial information. This paper +aims to advance 3D medical image analysis with MLLMs. To this end, we present a +large-scale 3D multi-modal medical dataset, M3D-Data, comprising 120K +image-text pairs and 662K instruction-response pairs specifically tailored for +various 3D medical tasks, such as image-text retrieval, report generation, +visual question answering, positioning, and segmentation. Additionally, we +propose M3D-LaMed, a versatile multi-modal large language model for 3D medical +image analysis. Furthermore, we introduce a new 3D multi-modal medical +benchmark, M3D-Bench, which facilitates automatic evaluation across eight +tasks. Through comprehensive evaluation, our method proves to be a robust model +for 3D medical image analysis, outperforming existing solutions. All code, +data, and models are publicly available at: https://github.com/BAAI-DCAI/M3D. + +
+
+ comment: MLLM, 3D medical image analysis +
+
+
+
+
+ + ☆ Automated Bi-Fold Weighted Ensemble Algorithms and its Application to + Brain Tumor Detection and Classification + + +
+ The uncontrolled and unstructured growth of brain cells is known as brain +tumor, which has one of the highest mortality rates among diseases from all +types of cancers. Due to limited diagnostic and treatment capabilities, they +pose significant challenges, especially in third-world countries. Early +diagnosis plays a vital role in effectively managing brain tumors and reducing +mortality rates. However, the availability of diagnostic methods is hindered by +various limitations, including high costs and lengthy result acquisition times, +impeding early detection of the disease. In this study, we present two +cutting-edge bi-fold weighted voting ensemble models that aim to boost the +effectiveness of weighted ensemble methods. These two proposed methods combine +the classification outcomes from multiple classifiers and determine the optimal +result by selecting the one with the highest probability in the first approach, +and the highest weighted prediction in the second technique. These approaches +significantly improve the overall performance of weighted ensemble techniques. +In the first proposed method, we improve the soft voting technique (SVT) by +introducing a novel unsupervised weight calculating schema (UWCS) to enhance +its weight assigning capability, known as the extended soft voting technique +(ESVT). Secondly, we propose a novel weighted method (NWM) by using the +proposed UWCS. Both of our approaches incorporate three distinct models: a +custom-built CNN, VGG-16, and InceptionResNetV2 which has been trained on +publicly available datasets. The effectiveness of our proposed systems is +evaluated through blind testing, where exceptional results are achieved. We +then establish a comparative analysis of the performance of our proposed +methods with that of SVT to show their superiority and effectiveness. + +
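+ The two fusion rules can be sketched as follows for per-model class
+probabilities probs of shape [n_models, n_classes] and model weights w of shape
+[n_models]; how the weights are derived (the paper's UWCS) is not shown here,
+so this is only an illustration of the voting step:
+
+ import numpy as np
+
+ def highest_probability_vote(probs):
+     # first approach: pick the class behind the single most confident prediction
+     _, class_idx = np.unravel_index(np.argmax(probs), probs.shape)
+     return int(class_idx)
+
+ def highest_weighted_vote(probs, w):
+     # second approach: weight each model's probabilities, then pick the best class
+     return int(np.argmax((w[:, None] * probs).sum(axis=0)))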
+
+
+
+
+ + ☆ Exploiting Inter-sample and Inter-feature Relations in Dataset + Distillation CVPR 2024 + + +
+ Dataset distillation has emerged as a promising approach in deep learning,
+enabling efficient training with small synthetic datasets derived from larger
+real ones. Particularly, distribution matching-based distillation methods
+attract attention thanks to their effectiveness and low computational cost.
+However, these methods face two primary limitations: the dispersed feature
+distribution within the same class in synthetic datasets, reducing class
+discrimination, and an exclusive focus on mean feature consistency, lacking
+precision and comprehensiveness. To address these challenges, we introduce two
+novel constraints: a class centralization constraint and a covariance matching
+constraint. The class centralization constraint aims to enhance class
+discrimination by more closely clustering samples within classes. The
+covariance matching constraint seeks to achieve more accurate feature
+distribution matching between real and synthetic datasets through local feature
+covariance matrices, particularly beneficial when sample sizes are much smaller
+than the number of features. Experiments demonstrate notable improvements with
+these constraints, yielding performance boosts of up to 6.6% on CIFAR10, 2.9%
+on SVHN, 2.5% on CIFAR100, and 2.5% on TinyImageNet, compared to relevant
+state-of-the-art methods. In addition, our method maintains robust
+performance in cross-architecture settings, with a maximum performance drop of
+1.7% on four architectures. Code is available at
+https://github.com/VincenDen/IID.
+
+
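+ A hedged sketch of the two constraints as loss terms on per-class features;
+the exact formulations in the paper may differ:
+
+ import torch
+
+ def class_centralization_loss(feat_syn):
+     """feat_syn: [n, d] synthetic features of one class; pull them to their centre."""
+     centre = feat_syn.mean(dim=0, keepdim=True)
+     return ((feat_syn - centre) ** 2).sum(dim=1).mean()
+
+ def covariance_matching_loss(feat_real, feat_syn):
+     def cov(x):
+         x = x - x.mean(dim=0, keepdim=True)
+         return x.t() @ x / max(x.shape[0] - 1, 1)
+     return (cov(feat_real) - cov(feat_syn)).pow(2).mean()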
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ GAN with Skip Patch Discriminator for Biological Electron Microscopy + Image Generation + + +
+ Generating realistic electron microscopy (EM) images has been a challenging +problem due to their complex global and local structures. Isola et al. proposed +pix2pix, a conditional Generative Adversarial Network (GAN), for the general +purpose of image-to-image translation; which fails to generate realistic EM +images. We propose a new architecture for the discriminator in the GAN +providing access to multiple patch sizes using skip patches and generating +realistic EM images. + +
+
+ comment: 4 pages, International Conference on Computational and Mathematical + Biomedical Engineering +
+
+
+
+
+ + ☆ Comparison of Methods in Human Skin Decomposition + + +
+ Decomposition of skin pigment plays an important role in medical fields.
+Human skin can be decomposed into two primitive components, hemoglobin and
+melanin. It is our goal to apply these results to the diagnosis of skin cancer.
+In this paper, various methods for skin pigment decomposition are reviewed
+comparatively and the performance of each method is evaluated both
+theoretically and experimentally. In addition, isometric feature mapping
+(Isomap) is introduced in order to improve the dimensionality reduction
+performance in the context of skin decomposition.
+
+
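+ A classic baseline for this decomposition is log-domain linear unmixing of the
+two pigments; the sketch below illustrates that baseline with placeholder (not
+calibrated) absorbance vectors for melanin and hemoglobin, and is not tied to
+any specific method reviewed in the paper:
+
+ import numpy as np
+
+ def decompose_skin(rgb, melanin_vec, hemoglobin_vec):
+     """rgb: [h, w, 3] image scaled to (0, 1]; returns per-pixel pigment densities."""
+     od = -np.log(np.clip(rgb, 1e-4, 1.0)).reshape(-1, 3)      # optical density
+     basis = np.stack([melanin_vec, hemoglobin_vec], axis=1)   # [3, 2] absorbance basis
+     densities, *_ = np.linalg.lstsq(basis, od.T, rcond=None)  # [2, h*w]
+     return densities.reshape(2, *rgb.shape[:2])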
+
+ comment: 4 pages, 7 figures +
+
+
+
+
+ + ☆ Pneumonia App: a mobile application for efficient pediatric pneumonia + diagnosis using explainable convolutional neural networks (CNN) + + +
+ Mycoplasma pneumoniae pneumonia (MPP) poses significant diagnostic challenges +in pediatric healthcare, especially in regions like China where it's prevalent. +We introduce PneumoniaAPP, a mobile application leveraging deep learning +techniques for rapid MPP detection. Our approach capitalizes on convolutional +neural networks (CNNs) trained on a comprehensive dataset comprising 3345 chest +X-ray (CXR) images, which includes 833 CXR images revealing MPP and +additionally augmented with samples from a public dataset. The CNN model +achieved an accuracy of 88.20% and an AUROC of 0.9218 across all classes, with +a specific accuracy of 97.64% for the mycoplasma class, as demonstrated on the +testing dataset. Furthermore, we integrated explainability techniques into +PneumoniaAPP to aid respiratory physicians in lung opacity localization. Our +contribution extends beyond existing research by targeting pediatric MPP, +emphasizing the age group of 0-12 years, and prioritizing deployment on mobile +devices. This work signifies a significant advancement in pediatric pneumonia +diagnosis, offering a reliable and accessible tool to alleviate diagnostic +burdens in healthcare settings. + +
+
+ comment: 27 Pages,7 figures +
+
+
+
+
+ + ☆ Denoising Distillation Makes Event-Frame Transformers as Accurate Gaze + Trackers + + +
+ This paper tackles the problem of passive gaze estimation using both event +and frame data. Considering inherently different physiological structures, it's +intractable to accurately estimate purely based on a given state. Thus, we +reformulate the gaze estimation as the quantification of state transitions from +the current state to several prior registered anchor states. Technically, we +propose a two-stage learning-based gaze estimation framework to divide the +whole gaze estimation process into a coarse-to-fine process of anchor state +selection and final gaze location. Moreover, to improve generalization ability, +we align a group of local experts with a student network, where a novel +denoising distillation algorithm is introduced to utilize denoising diffusion +technique to iteratively remove inherent noise of event data. Extensive +experiments demonstrate the effectiveness of the proposed method, which greatly +surpasses state-of-the-art methods by a large extent of 15$\%$. The code will +be publicly available at +https://github.com/jdjdli/Denoise_distill_EF_gazetracker. + +
+
+
+
+
+ + ☆ On the Estimation of Image-matching Uncertainty in Visual Place + Recognition CVPR + + +
+ In Visual Place Recognition (VPR) the pose of a query image is estimated by +comparing the image to a map of reference images with known reference poses. As +is typical for image retrieval problems, a feature extractor maps the query and +reference images to a feature space, where a nearest neighbor search is then +performed. However, till recently little attention has been given to +quantifying the confidence that a retrieved reference image is a correct match. +Highly certain but incorrect retrieval can lead to catastrophic failure of +VPR-based localization pipelines. This work compares for the first time the +main approaches for estimating the image-matching uncertainty, including the +traditional retrieval-based uncertainty estimation, more recent data-driven +aleatoric uncertainty estimation, and the compute-intensive geometric +verification. We further formulate a simple baseline method, ``SUE'', which +unlike the other methods considers the freely-available poses of the reference +images in the map. Our experiments reveal that a simple L2-distance between the +query and reference descriptors is already a better estimate of image-matching +uncertainty than current data-driven approaches. SUE outperforms the other +efficient uncertainty estimation methods, and its uncertainty estimates +complement the computationally expensive geometric verification approach. +Future works for uncertainty estimation in VPR should consider the baselines +discussed in this work. + +
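+ The simple baseline highlighted above fits in a few lines; SUE additionally
+exploits the freely-available reference poses, which is omitted in this sketch:
+
+ import numpy as np
+
+ def l2_match_with_uncertainty(query_desc, ref_descs):
+     """query_desc: [d]; ref_descs: [n, d]; returns (best match index, uncertainty)."""
+     dists = np.linalg.norm(ref_descs - query_desc, axis=1)
+     best = int(np.argmin(dists))
+     return best, float(dists[best])   # larger descriptor distance -> less confident match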
+
+ comment: To appear in the proceedings of the IEEE/CVF Conference on Computer + Vision and Pattern Recognition (CVPR) 2024 +
+
+
+
+
+ + ☆ Deep Extrinsic Manifold Representation for Vision Tasks + + +
+ Non-Euclidean data is frequently encountered across different fields, yet +there is limited literature that addresses the fundamental challenge of +training neural networks with manifold representations as outputs. We introduce +the trick named Deep Extrinsic Manifold Representation (DEMR) for visual tasks +in this context. DEMR incorporates extrinsic manifold embedding into deep +neural networks, which helps generate manifold representations. The DEMR +approach does not directly optimize the complex geodesic loss. Instead, it +focuses on optimizing the computation graph within the embedded Euclidean +space, allowing for adaptability to various architectural requirements. We +provide empirical evidence supporting the proposed concept on two types of +manifolds, $SE(3)$ and its associated quotient manifolds. This evidence offers +theoretical assurances regarding feasibility, asymptotic properties, and +generalization capability. The experimental results show that DEMR effectively +adapts to point cloud alignment, producing outputs in $ SE(3) $, as well as in +illumination subspace learning with outputs on the Grassmann manifold. + +
+
+
+
+
+ + ☆ Embodied Active Defense: Leveraging Recurrent Feedback to Counter + Adversarial Patches + + +
+ The vulnerability of deep neural networks to adversarial patches has +motivated numerous defense strategies for boosting model robustness. However, +the prevailing defenses depend on single observation or pre-established +adversary information to counter adversarial patches, often failing to be +confronted with unseen or adaptive adversarial attacks and easily exhibiting +unsatisfying performance in dynamic 3D environments. Inspired by active human +perception and recurrent feedback mechanisms, we develop Embodied Active +Defense (EAD), a proactive defensive strategy that actively contextualizes +environmental information to address misaligned adversarial patches in 3D +real-world settings. To achieve this, EAD develops two central recurrent +sub-modules, i.e., a perception module and a policy module, to implement two +critical functions of active vision. These models recurrently process a series +of beliefs and observations, facilitating progressive refinement of their +comprehension of the target object and enabling the development of strategic +actions to counter adversarial patches in 3D environments. To optimize learning +efficiency, we incorporate a differentiable approximation of environmental +dynamics and deploy patches that are agnostic to the adversary strategies. +Extensive experiments demonstrate that EAD substantially enhances robustness +against a variety of patches within just a few steps through its action policy +in safety-critical tasks (e.g., face recognition and object detection), without +compromising standard accuracy. Furthermore, due to the attack-agnostic +characteristic, EAD facilitates excellent generalization to unseen attacks, +diminishing the averaged attack success rate by 95 percent across a range of +unseen adversarial attacks. + +
+
+ comment: 27pages +
+
+
+
+
+ + ☆ LLMs are Good Action Recognizers CVPR 2024 + + +
+ Skeleton-based action recognition has attracted lots of research attention. +Recently, to build an accurate skeleton-based action recognizer, a variety of +works have been proposed. Among them, some works use large model architectures +as backbones of their recognizers to boost the skeleton data representation +capability, while some other works pre-train their recognizers on external data +to enrich the knowledge. In this work, we observe that large language models +which have been extensively used in various natural language processing tasks +generally hold both large model architectures and rich implicit knowledge. +Motivated by this, we propose a novel LLM-AR framework, in which we investigate +treating the Large Language Model as an Action Recognizer. In our framework, we +propose a linguistic projection process to project each input action signal +(i.e., each skeleton sequence) into its ``sentence format'' (i.e., an ``action +sentence''). Moreover, we also incorporate our framework with several designs +to further facilitate this linguistic projection process. Extensive experiments +demonstrate the efficacy of our proposed framework. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ TexVocab: Texture Vocabulary-conditioned Human Avatars + + +
+ To adequately utilize the available image evidence in multi-view video-based +avatar modeling, we propose TexVocab, a novel avatar representation that +constructs a texture vocabulary and associates body poses with texture maps for +animation. Given multi-view RGB videos, our method initially back-projects all +the available images in the training videos to the posed SMPL surface, +producing texture maps in the SMPL UV domain. Then we construct pairs of human +poses and texture maps to establish a texture vocabulary for encoding dynamic +human appearances under various poses. Unlike the commonly used joint-wise +manner, we further design a body-part-wise encoding strategy to learn the +structural effects of the kinematic chain. Given a driving pose, we query the +pose feature hierarchically by decomposing the pose vector into several body +parts and interpolating the texture features for synthesizing fine-grained +human dynamics. Overall, our method is able to create animatable human avatars +with detailed and dynamic appearances from RGB videos, and the experiments show +that our method outperforms state-of-the-art approaches. The project page can +be found at https://texvocab.github.io/. + +
+
+
+
+
+ + ☆ Transformer based Pluralistic Image Completion with Reduced Information + Loss + + +
+ Transformer based methods have achieved great success in image inpainting +recently. However, we find that these solutions regard each pixel as a token, +thus suffering from an information loss issue from two aspects: 1) They +downsample the input image into much lower resolutions for efficiency +consideration. 2) They quantize $256^3$ RGB values to a small number (such as +512) of quantized color values. The indices of quantized pixels are used as +tokens for the inputs and prediction targets of the transformer. To mitigate +these issues, we propose a new transformer based framework called "PUT". +Specifically, to avoid input downsampling while maintaining computation +efficiency, we design a patch-based auto-encoder P-VQVAE. The encoder converts +the masked image into non-overlapped patch tokens and the decoder recovers the +masked regions from the inpainted tokens while keeping the unmasked regions +unchanged. To eliminate the information loss caused by input quantization, an +Un-quantized Transformer is applied. It directly takes features from the +P-VQVAE encoder as input without any quantization and only regards the +quantized tokens as prediction targets. Furthermore, to make the inpainting +process more controllable, we introduce semantic and structural conditions as +extra guidance. Extensive experiments show that our method greatly outperforms +existing transformer based methods on image fidelity and achieves much higher +diversity and better fidelity than state-of-the-art pluralistic inpainting +methods on complex large-scale datasets (e.g., ImageNet). Codes are available +at https://github.com/liuqk3/PUT. + +
+
+ comment: Accepted by TPAMI (2024) +
+
+
+
+
+ + ☆ Denoising Low-dose Images Using Deep Learning of Time Series Images + + +
+ Digital imaging devices have been widely applied in many fields, including
+scientific imaging, recognition of individuals, and remote sensing. As these
+imaging technologies are applied to autonomous driving and measurement, image
+noise generated when observations cannot be performed with a sufficient dose
+has become a major problem. Machine learning denoising is expected to solve
+this problem, but it comes with its own issues. Here we report the artifacts
+generated by machine learning denoising in ultra-low-dose observation, using
+an in-situ observation video from an electron microscope as an example. As a
+method to solve this problem, we propose decomposing a time-series image into
+2D images of one spatial axis versus time and applying machine learning
+denoising to these slices. Our method opens new avenues for accurate and
+stable reconstruction of continuous high-resolution images from low-dose
+imaging in science, industry, and daily life.
+
+
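+ A small NumPy/SciPy sketch of the decomposition described above: a (T, H, W)
+image series is re-sliced into 2D images of one spatial axis versus time, a
+denoiser is applied to each slice, and the stack is reassembled. The Gaussian
+filter is only a stand-in for the learned denoiser.
+
+import numpy as np
+from scipy.ndimage import gaussian_filter
+
+def denoise_xt_slices(video, sigma=(1.5, 0.0)):
+    # video: (T, H, W). Denoise each x-t slice (time vs. width) independently.
+    T, H, W = video.shape
+    out = np.empty_like(video)
+    for y in range(H):
+        xt = video[:, y, :]                              # (T, W) time-space slice
+        out[:, y, :] = gaussian_filter(xt, sigma=sigma)  # placeholder denoiser
+    return out
+
+rng = np.random.default_rng(0)
+noisy = rng.poisson(1.0, size=(64, 32, 32)).astype(np.float32)
+print(noisy.std(), denoise_xt_slices(noisy).std())
+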
+
+
+
+
+ + ☆ DailyMAE: Towards Pretraining Masked Autoencoders in One Day + + +
+ Recently, masked image modeling (MIM), an important self-supervised learning +(SSL) method, has drawn attention for its effectiveness in learning data +representation from unlabeled data. Numerous studies underscore the advantages +of MIM, highlighting how models pretrained on extensive datasets can enhance +the performance of downstream tasks. However, the high computational demands of +pretraining pose significant challenges, particularly within academic +environments, thereby impeding the SSL research progress. In this study, we +propose efficient training recipes for MIM based SSL that focuses on mitigating +data loading bottlenecks and employing progressive training techniques and +other tricks to closely maintain pretraining performance. Our library enables +the training of a MAE-Base/16 model on the ImageNet 1K dataset for 800 epochs +within just 18 hours, using a single machine equipped with 8 A100 GPUs. By +achieving speed gains of up to 5.8 times, this work not only demonstrates the +feasibility of conducting high-efficiency SSL training but also paves the way +for broader accessibility and promotes advancement in SSL research particularly +for prototyping and initial testing of SSL ideas. The code is available in +https://github.com/erow/FastSSL. + +
+
+
+
+
+ + ☆ NYC-Indoor-VPR: A Long-Term Indoor Visual Place Recognition Dataset with + Semi-Automatic Annotation ICRA 2024 + + +
+ Visual Place Recognition (VPR) in indoor environments is beneficial to humans +and robots for better localization and navigation. It is challenging due to +appearance changes at various frequencies, and difficulties of obtaining ground +truth metric trajectories for training and evaluation. This paper introduces +the NYC-Indoor-VPR dataset, a unique and rich collection of over 36,000 images +compiled from 13 distinct crowded scenes in New York City taken under varying +lighting conditions with appearance changes. Each scene has multiple revisits +across a year. To establish the ground truth for VPR, we propose a +semiautomatic annotation approach that computes the positional information of +each image. Our method specifically takes pairs of videos as input and yields +matched pairs of images along with their estimated relative locations. The +accuracy of this matching is refined by human annotators, who utilize our +annotation software to correlate the selected keyframes. Finally, we present a +benchmark evaluation of several state-of-the-art VPR algorithms using our +annotated dataset, revealing its challenge and thus value for VPR research. + +
+
+ comment: 7 pages, 7 figures, published in 2024 IEEE International Conference + on Robotics and Automation (ICRA 2024) +
+
+
+
+
+ + ♻ ☆ Language-only Efficient Training of Zero-shot Composed Image Retrieval CVPR 2024 + + +
+ The composed image retrieval (CIR) task takes a composed query of an image
+and text, aiming to retrieve images relevant to both conditions. Conventional
+CIR approaches need a training dataset composed of triplets of query image,
+query text, and target image, which is very expensive to collect. Several
+recent works have explored the zero-shot (ZS) CIR paradigm to tackle the
+issue without using pre-collected triplets. However, the existing ZS-CIR
+methods show limited backbone scalability and generalizability due to the
+lack of diversity of the input texts during training. We propose a novel CIR
+framework that uses only language for its training. Our LinCIR
+(Language-only training for CIR) can be trained only with text datasets by a
+novel self-supervision named self-masking projection (SMP). We project the
+text latent embedding to the token embedding space and construct a new text
+by replacing the keyword tokens of the original text. Then, we let the new
+and original texts have the same latent embedding vector. With this simple
+strategy, LinCIR is surprisingly efficient and highly effective; LinCIR with
+a CLIP ViT-G backbone is trained in 48 minutes and shows the best ZS-CIR
+performance on four different CIR benchmarks, CIRCO, GeneCIS, FashionIQ, and
+CIRR, even outperforming supervised methods on FashionIQ. Code is available
+at https://github.com/navervision/lincir
+
+
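+ A heavily simplified PyTorch sketch of the self-masking projection idea
+described above, using a toy bag-of-embeddings text encoder; the real LinCIR
+operates on a CLIP text encoder, so every module, dimension, and the
+keyword-selection rule here is an illustrative assumption.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class ToyTextEncoder(nn.Module):
+    # Stand-in for a text tower that can also consume raw token embeddings.
+    def __init__(self, vocab=1000, dim=64):
+        super().__init__()
+        self.tok = nn.Embedding(vocab, dim)
+        self.enc = nn.Sequential(nn.Linear(dim, dim), nn.ReLU(), nn.Linear(dim, dim))
+
+    def embed(self, token_ids):
+        return self.tok(token_ids)                   # (B, L, D)
+
+    def encode(self, tok_embs):
+        return self.enc(tok_embs.mean(dim=1))        # (B, D) latent embedding
+
+encoder = ToyTextEncoder()
+proj = nn.Linear(64, 64)             # latent -> token-embedding space
+
+def smp_loss(token_ids, keyword_mask):
+    # token_ids: (B, L) ints; keyword_mask: (B, L) bool marking keyword tokens.
+    tok_embs = encoder.embed(token_ids)
+    z = encoder.encode(tok_embs)                     # latent of the original text
+    # New text: keyword token embeddings are replaced by the projected latent.
+    replacement = proj(z).unsqueeze(1).expand_as(tok_embs)
+    new_embs = torch.where(keyword_mask.unsqueeze(-1), replacement, tok_embs)
+    z_new = encoder.encode(new_embs)
+    # Self-supervision: new and original texts should share the same latent.
+    return (1 - F.cosine_similarity(z, z_new, dim=-1)).mean()
+
+ids = torch.randint(0, 1000, (8, 12))
+mask = torch.zeros(8, 12, dtype=torch.bool)
+mask[:, 3] = True                    # pretend position 3 holds the keyword
+smp_loss(ids, mask).backward()
+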
+
+ comment: CVPR 2024 camera-ready; First two authors contributed equally; 17 + pages, 3.1MB +
+
+
+
+
+ + ♻ ☆ Handling The Non-Smooth Challenge in Tensor SVD: A Multi-Objective + Tensor Recovery Framework + + +
+ Recently, numerous tensor singular value decomposition (t-SVD)-based tensor +recovery methods have shown promise in processing visual data, such as color +images and videos. However, these methods often suffer from severe performance +degradation when confronted with tensor data exhibiting non-smooth changes. It +has been commonly observed in real-world scenarios but ignored by the +traditional t-SVD-based methods. In this work, we introduce a novel tensor +recovery model with a learnable tensor nuclear norm to address such a +challenge. We develop a new optimization algorithm named the Alternating +Proximal Multiplier Method (APMM) to iteratively solve the proposed tensor +completion model. Theoretical analysis demonstrates the convergence of the +proposed APMM to the Karush-Kuhn-Tucker (KKT) point of the optimization +problem. In addition, we propose a multi-objective tensor recovery framework +based on APMM to efficiently explore the correlations of tensor data across its +various dimensions, providing a new perspective on extending the t-SVD-based +method to higher-order tensor cases. Numerical experiments demonstrated the +effectiveness of the proposed method in tensor completion. + +
+
+
+
+
+ + ♻ ☆ ElasticDiffusion: Training-free Arbitrary Size Image Generation through + Global-Local Content Separation CVPR 2024 + + +
+ Diffusion models have revolutionized image generation in recent years, yet +they are still limited to a few sizes and aspect ratios. We propose +ElasticDiffusion, a novel training-free decoding method that enables pretrained +text-to-image diffusion models to generate images with various sizes. +ElasticDiffusion attempts to decouple the generation trajectory of a pretrained +model into local and global signals. The local signal controls low-level pixel +information and can be estimated on local patches, while the global signal is +used to maintain overall structural consistency and is estimated with a +reference image. We test our method on CelebA-HQ (faces) and LAION-COCO +(objects/indoor/outdoor scenes). Our experiments and qualitative results show +superior image coherence quality across aspect ratios compared to +MultiDiffusion and the standard decoding strategy of Stable Diffusion. Project +page: https://elasticdiffusion.github.io/ + +
+
+ comment: Accepted at CVPR 2024. Project Page: + https://elasticdiffusion.github.io/ +
+
+
+
+
+ + ♻ ☆ CLRmatchNet: Enhancing Curved Lane Detection with Deep Matching Process + + +
+ Lane detection plays a crucial role in autonomous driving by providing vital
+data to ensure safe navigation. Modern algorithms rely on anchor-based
+detectors, which are then followed by a label-assignment process to
+categorize training detections as positive or negative instances based on
+learned geometric attributes. Accurate label assignment has a great impact on
+model performance, and it usually relies on a predefined classical cost
+function that evaluates GT-prediction alignment. However, classical label
+assignment methods face limitations due to their reliance on predefined cost
+functions derived from low-dimensional models, potentially impacting their
+optimality. Our research introduces MatchNet, a deep learning submodule-based
+approach aimed at improving the label assignment process. Integrated into a
+state-of-the-art lane detection network such as the Cross Layer Refinement
+Network for Lane Detection (CLRNet), MatchNet replaces the conventional label
+assignment process with a submodule network. The integrated model,
+CLRmatchNet, surpasses CLRNet, showing substantial improvements in scenarios
+involving curved lanes, with remarkable improvement across all backbones of
++2.8% for ResNet34, +2.3% for ResNet101, and +2.96% for DLA34. In addition,
+it maintains or even improves results in the other scenarios. Our method
+boosts the confidence level in lane detection, allowing an increase in the
+confidence threshold. Our code is available at:
+https://github.com/sapirkontente/CLRmatchNet.git
+
+
+
+
+
+
+ + ♻ ☆ DiverseNet: Decision Diversified Semi-supervised Semantic Segmentation + Networks for Remote Sensing Imagery + + +
+ Semi-supervised learning aims to help reduce the cost of the manual labelling +process by leveraging valuable features extracted from a substantial pool of +unlabeled data alongside a limited set of labelled data during the training +phase. Since pixel-level manual labelling in large-scale remote sensing imagery +is expensive, semi-supervised learning becomes an appropriate solution to this. +However, most of the existing consistency learning frameworks based on network +perturbation are very bulky. There is still a lack of lightweight and efficient +perturbation methods to promote the diversity of features and the precision of +pseudo labels during training. In order to fill this gap, we propose DiverseNet +which explores multi-head and multi-model semi-supervised learning algorithms +by simultaneously enhancing precision and diversity during training. The two +proposed methods in the DiverseNet family, namely DiverseHead and DiverseModel, +both achieve the better semantic segmentation performance in four widely +utilised remote sensing imagery data sets compared to state-of-the-art +semi-supervised learning methods. Meanwhile, the proposed DiverseHead +architecture is simple and relatively lightweight in terms of parameter space +compared to the state-of-the-art methods whilst reaching high-performance +results for all the tested data sets. + +
+
+
+
+
+ + ♻ ☆ Faster ISNet for Background Bias Mitigation on Deep Neural Networks + + +
+ Bias or spurious correlations in image backgrounds can impact neural +networks, causing shortcut learning (Clever Hans Effect) and hampering +generalization to real-world data. ISNet, a recently introduced architecture, +proposed the optimization of Layer-Wise Relevance Propagation (LRP, an +explanation technique) heatmaps, to mitigate the influence of backgrounds on +deep classifiers. However, ISNet's training time scales linearly with the +number of classes in an application. Here, we propose reformulated +architectures whose training time becomes independent from this number. +Additionally, we introduce a concise and model-agnostic LRP implementation. We +challenge the proposed architectures using synthetic background bias, and +COVID-19 detection in chest X-rays, an application that commonly presents +background bias. The networks hindered background attention and shortcut +learning, surpassing multiple state-of-the-art models on out-of-distribution +test datasets. Representing a potentially massive training speed improvement +over ISNet, the proposed architectures introduce LRP optimization into a gamut +of applications that the original model cannot feasibly handle. + +
+
+
+
+
+ + ♻ ☆ Open3DIS: Open-Vocabulary 3D Instance Segmentation with 2D Mask Guidance CVPR 2024 + + +
+ We introduce Open3DIS, a novel solution designed to tackle the problem of +Open-Vocabulary Instance Segmentation within 3D scenes. Objects within 3D +environments exhibit diverse shapes, scales, and colors, making precise +instance-level identification a challenging task. Recent advancements in +Open-Vocabulary scene understanding have made significant strides in this area +by employing class-agnostic 3D instance proposal networks for object +localization and learning queryable features for each 3D mask. While these +methods produce high-quality instance proposals, they struggle with identifying +small-scale and geometrically ambiguous objects. The key idea of our method is +a new module that aggregates 2D instance masks across frames and maps them to +geometrically coherent point cloud regions as high-quality object proposals +addressing the above limitations. These are then combined with 3D +class-agnostic instance proposals to include a wide range of objects in the +real world. To validate our approach, we conducted experiments on three +prominent datasets, including ScanNet200, S3DIS, and Replica, demonstrating +significant performance gains in segmenting objects with diverse categories +over the state-of-the-art approaches. + +
+
+ comment: CVPR 2024. Project page: https://open3dis.github.io/ +
+
+
+
+
+ + ♻ ☆ Object Recognition as Next Token Prediction CVPR 2024 + + +
+ We present an approach to pose object recognition as next token prediction. +The idea is to apply a language decoder that auto-regressively predicts the +text tokens from image embeddings to form labels. To ground this prediction +process in auto-regression, we customize a non-causal attention mask for the +decoder, incorporating two key features: modeling tokens from different labels +to be independent, and treating image tokens as a prefix. This masking +mechanism inspires an efficient method - one-shot sampling - to simultaneously +sample tokens of multiple labels in parallel and rank generated labels by their +probabilities during inference. To further enhance the efficiency, we propose a +simple strategy to construct a compact decoder by simply discarding the +intermediate blocks of a pretrained language model. This approach yields a +decoder that matches the full model's performance while being notably more +efficient. The code is available at https://github.com/kaiyuyue/nxtp + +
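+ A small PyTorch sketch of one way to build the kind of non-causal mask
+described above: image tokens act as a fully visible prefix, and label tokens
+attend causally within their own label but never across labels. Sizes and
+the True-means-attend convention are assumptions made for illustration.
+
+import torch
+
+def build_mask(num_image_tokens, label_lengths):
+    # Boolean (N, N) mask where entry (i, j) == True means i may attend to j.
+    N = num_image_tokens + sum(label_lengths)
+    mask = torch.zeros(N, N, dtype=torch.bool)
+    # Prefix: every token can see all image tokens.
+    mask[:, :num_image_tokens] = True
+    # Labels: causal attention inside each label, none across labels.
+    start = num_image_tokens
+    for L in label_lengths:
+        mask[start:start + L, start:start + L] = torch.tril(
+            torch.ones(L, L, dtype=torch.bool))
+        start += L
+    return mask
+
+print(build_mask(num_image_tokens=4, label_lengths=[3, 2]).int())
+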
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ DriveVLM: The Convergence of Autonomous Driving and Large + Vision-Language Models + + +
+ A primary hurdle of autonomous driving in urban environments is understanding +complex and long-tail scenarios, such as challenging road conditions and +delicate human behaviors. We introduce DriveVLM, an autonomous driving system +leveraging Vision-Language Models (VLMs) for enhanced scene understanding and +planning capabilities. DriveVLM integrates a unique combination of +chain-of-thought (CoT) modules for scene description, scene analysis, and +hierarchical planning. Furthermore, recognizing the limitations of VLMs in +spatial reasoning and heavy computational requirements, we propose +DriveVLM-Dual, a hybrid system that synergizes the strengths of DriveVLM with +the traditional autonomous driving pipeline. DriveVLM-Dual achieves robust +spatial understanding and real-time inference speed. Extensive experiments on +both the nuScenes dataset and our SUP-AD dataset demonstrate the effectiveness +of DriveVLM and the enhanced performance of DriveVLM-Dual, surpassing existing +methods in complex and unpredictable driving conditions. + +
+
+ comment: Project Page: https://tsinghua-mars-lab.github.io/DriveVLM/ +
+
+
+
+
+ + ♻ ☆ A New Benchmark and Model for Challenging Image Manipulation Detection AAAI-24 + + +
+ The ability to detect manipulation in multimedia data is vital in digital
+forensics. Existing Image Manipulation Detection (IMD) methods are mainly
+based on detecting anomalous features arising from image editing or double
+compression artifacts. All existing IMD techniques encounter challenges in
+detecting small tampered regions within a large image. Moreover,
+compression-based IMD approaches face difficulties in cases of double
+compression with identical quality factors. To investigate the
+State-of-The-Art (SoTA) IMD methods under those challenging conditions, we
+introduce a new Challenging Image Manipulation Detection (CIMD) benchmark
+dataset, which consists of two subsets, for evaluating editing-based and
+compression-based IMD methods, respectively. The dataset images were manually
+captured and tampered with, and come with high-quality annotations. In
+addition, we propose a new two-branch network model based on HRNet that can
+better detect both image-editing and compression artifacts under those
+challenging conditions. Extensive experiments on the CIMD benchmark show that
+our model significantly outperforms SoTA IMD methods on CIMD.
+
+
+
+ comment: 9 pages, 6 figures, 3 tables. AAAI-24
+
+
+
+
+
+ + ♻ ☆ Language-driven Object Fusion into Neural Radiance Fields with + Pose-Conditioned Dataset Updates CVPR 2024 + + +
+ Neural radiance field is an emerging rendering method that generates +high-quality multi-view consistent images from a neural scene representation +and volume rendering. Although neural radiance field-based techniques are +robust for scene reconstruction, their ability to add or remove objects remains +limited. This paper proposes a new language-driven approach for object +manipulation with neural radiance fields through dataset updates. Specifically, +to insert a new foreground object represented by a set of multi-view images +into a background radiance field, we use a text-to-image diffusion model to +learn and generate combined images that fuse the object of interest into the +given background across views. These combined images are then used for refining +the background radiance field so that we can render view-consistent images +containing both the object and the background. To ensure view consistency, we +propose a dataset updates strategy that prioritizes radiance field training +with camera views close to the already-trained views prior to propagating the +training to remaining views. We show that under the same dataset updates +strategy, we can easily adapt our method for object insertion using data from +text-to-3D models as well as object removal. Experimental results show that our +method generates photorealistic images of the edited scenes, and outperforms +state-of-the-art methods in 3D reconstruction and neural radiance field +blending. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Modular Blind Video Quality Assessment CVPR 2024 + + +
+ Blind video quality assessment (BVQA) plays a pivotal role in evaluating and +improving the viewing experience of end-users across a wide range of +video-based platforms and services. Contemporary deep learning-based models +primarily analyze video content in its aggressively subsampled format, while +being blind to the impact of the actual spatial resolution and frame rate on +video quality. In this paper, we propose a modular BVQA model and a method of +training it to improve its modularity. Our model comprises a base quality +predictor, a spatial rectifier, and a temporal rectifier, responding to the +visual content and distortion, spatial resolution, and frame rate changes on +video quality, respectively. During training, spatial and temporal rectifiers +are dropped out with some probabilities to render the base quality predictor a +standalone BVQA model, which should work better with the rectifiers. Extensive +experiments on both professionally-generated content and user-generated content +video databases show that our quality model achieves superior or comparable +performance to current methods. Additionally, the modularity of our model +offers an opportunity to analyze existing video quality databases in terms of +their spatial and temporal complexity. + +
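+ A compact PyTorch sketch of the modular design described above: a base
+quality predictor plus spatial and temporal rectifiers that are randomly
+dropped during training so the base predictor also works standalone. The
+multiplicative form of the rectifiers and the drop probability are
+assumptions of this sketch.
+
+import torch
+import torch.nn as nn
+
+class ModularBVQA(nn.Module):
+    def __init__(self, feat_dim=128, p_drop=0.5):
+        super().__init__()
+        self.base = nn.Linear(feat_dim, 1)        # base quality predictor
+        self.spatial_rect = nn.Linear(2, 1)       # e.g. takes (height, width)
+        self.temporal_rect = nn.Linear(1, 1)      # e.g. takes frame rate
+        self.p_drop = p_drop
+
+    def forward(self, content_feat, spatial_meta, temporal_meta):
+        q = self.base(content_feat).squeeze(-1)
+        # Rectifiers are multiplicative corrections; during training each one
+        # is dropped (replaced by a scale of 1) with probability p_drop.
+        s = torch.exp(self.spatial_rect(spatial_meta)).squeeze(-1)
+        t = torch.exp(self.temporal_rect(temporal_meta)).squeeze(-1)
+        if self.training:
+            if torch.rand(()) < self.p_drop:
+                s = torch.ones_like(s)
+            if torch.rand(()) < self.p_drop:
+                t = torch.ones_like(t)
+        return q * s * t
+
+model = ModularBVQA()
+score = model(torch.randn(4, 128),
+              torch.tensor([[1080., 1920.]] * 4),
+              torch.tensor([[30.]] * 4))
+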
+
+ comment: Accepted by CVPR 2024; Camera-ready version +
+
+
+
+
+ + ♻ ☆ Multi-Channel Orthogonal Transform-Based Perceptron Layers for Efficient + ResNets + + +
+ In this paper, we propose a set of transform-based neural network layers as +an alternative to the $3\times3$ Conv2D layers in Convolutional Neural Networks +(CNNs). The proposed layers can be implemented based on orthogonal transforms +such as the Discrete Cosine Transform (DCT), Hadamard transform (HT), and +biorthogonal Block Wavelet Transform (BWT). Furthermore, by taking advantage of +the convolution theorems, convolutional filtering operations are performed in +the transform domain using element-wise multiplications. Trainable +soft-thresholding layers, that remove noise in the transform domain, bring +nonlinearity to the transform domain layers. Compared to the Conv2D layer, +which is spatial-agnostic and channel-specific, the proposed layers are +location-specific and channel-specific. Moreover, these proposed layers reduce +the number of parameters and multiplications significantly while improving the +accuracy results of regular ResNets on the ImageNet-1K classification task. +Furthermore, they can be inserted with a batch normalization layer before the +global average pooling layer in the conventional ResNets as an additional layer +to improve classification accuracy. + +
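+ A rough PyTorch sketch of a transform-domain layer of the kind described
+above: a Hadamard transform over an NxN feature map, an element-wise
+trainable scaling per channel and location, and trainable soft-thresholding
+as the nonlinearity. The block size, normalization, and the absence of the
+paper's exact multi-channel structure are simplifying assumptions.
+
+import torch
+import torch.nn as nn
+
+def hadamard(n):
+    # Sylvester construction; n must be a power of two. Orthonormal scaling.
+    H = torch.ones(1, 1)
+    while H.shape[0] < n:
+        H = torch.cat([torch.cat([H, H], 1), torch.cat([H, -H], 1)], 0)
+    return H / (n ** 0.5)
+
+class HadamardPerceptron(nn.Module):
+    # Drop-in for a 3x3 conv on (B, C, N, N) maps, with N a power of two.
+    def __init__(self, channels, size):
+        super().__init__()
+        self.register_buffer("H", hadamard(size))
+        self.scale = nn.Parameter(torch.ones(channels, size, size))
+        self.thresh = nn.Parameter(0.1 * torch.ones(channels, 1, 1))
+
+    def forward(self, x):
+        X = self.H @ x @ self.H                  # 2D transform (H is symmetric)
+        X = X * self.scale                       # filtering as element-wise product
+        X = torch.sign(X) * torch.relu(X.abs() - self.thresh)   # soft-threshold
+        return self.H @ X @ self.H               # inverse transform
+
+layer = HadamardPerceptron(channels=16, size=8)
+y = layer(torch.randn(2, 16, 8, 8))
+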
+
+ comment: This work is accepted to IEEE Transactions on Neural Networks and + Learning Systems. The initial title is "Orthogonal Transform Domain + Approaches for the Convolutional Layer". We changed it to "Multi-Channel + Orthogonal Transform-Based Perceptron Layers for Efficient ResNets" based on + reviewer's comment. arXiv admin note: text overlap with arXiv:2211.08577 +
+
+
+
+
+ + ♻ ☆ C-TPT: Calibrated Test-Time Prompt Tuning for Vision-Language Models via + Text Feature Dispersion ICLR 2024 + + +
+ In deep learning, test-time adaptation has gained attention as a method for +model fine-tuning without the need for labeled data. A prime exemplification is +the recently proposed test-time prompt tuning for large-scale vision-language +models such as CLIP. Unfortunately, these prompts have been mainly developed to +improve accuracy, overlooking the importance of calibration, which is a crucial +aspect for quantifying prediction uncertainty. However, traditional calibration +methods rely on substantial amounts of labeled data, making them impractical +for test-time scenarios. To this end, this paper explores calibration during +test-time prompt tuning by leveraging the inherent properties of CLIP. Through +a series of observations, we find that the prompt choice significantly affects +the calibration in CLIP, where the prompts leading to higher text feature +dispersion result in better-calibrated predictions. Introducing the Average +Text Feature Dispersion (ATFD), we establish its relationship with calibration +error and present a novel method, Calibrated Test-time Prompt Tuning (C-TPT), +for optimizing prompts during test-time with enhanced calibration. Through +extensive experiments on different CLIP architectures and datasets, we show +that C-TPT can effectively improve the calibration of test-time prompt tuning +without needing labeled data. The code is publicly accessible at +https://github.com/hee-suk-yoon/C-TPT. + +
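+ A small PyTorch sketch of the Average Text Feature Dispersion as described
+above, computed over the class-wise text features produced with one prompt.
+The exact definition used here (mean L2 distance to the centroid) is an
+assumption based on the abstract's wording.
+
+import torch
+import torch.nn.functional as F
+
+def average_text_feature_dispersion(text_feats):
+    # text_feats: (num_classes, D) text embeddings obtained with one prompt.
+    centroid = text_feats.mean(dim=0, keepdim=True)
+    return (text_feats - centroid).norm(dim=-1).mean()
+
+# During test-time prompt tuning one could, e.g., encourage high ATFD
+# alongside the usual entropy objective so better-calibrated prompts win.
+feats = F.normalize(torch.randn(10, 512), dim=-1)
+print(average_text_feature_dispersion(feats))
+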
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Break-for-Make: Modular Low-Rank Adaptations for Composable + Content-Style Customization + + +
+ Personalized generation paradigms empower designers to customize visual +intellectual properties with the help of textual descriptions by tuning or +adapting pre-trained text-to-image models on a few images. Recent works explore +approaches for concurrently customizing both content and detailed visual style +appearance. However, these existing approaches often generate images where the +content and style are entangled. In this study, we reconsider the customization +of content and style concepts from the perspective of parameter space +construction. Unlike existing methods that utilize a shared parameter space for +content and style, we propose a learning framework that separates the parameter +space to facilitate individual learning of content and style, thereby enabling +disentangled content and style. To achieve this goal, we introduce "partly +learnable projection" (PLP) matrices to separate the original adapters into +divided sub-parameter spaces. We propose "break-for-make" customization +learning pipeline based on PLP, which is simple yet effective. We break the +original adapters into "up projection" and "down projection", train content and +style PLPs individually with the guidance of corresponding textual prompts in +the separate adapters, and maintain generalization by employing a +multi-correspondence projection learning strategy. Based on the adapters broken +apart for separate training content and style, we then make the entity +parameter space by reconstructing the content and style PLPs matrices, followed +by fine-tuning the combined adapter to generate the target object with the +desired appearance. Experiments on various styles, including textures, +materials, and artistic style, show that our method outperforms +state-of-the-art single/multiple concept learning pipelines in terms of +content-style-prompt alignment. + +
+
+
+
+
+ + ♻ ☆ Sketch Input Method Editor: A Comprehensive Dataset and Methodology for + Systematic Input Recognition + + +
+ With the recent surge in the use of touchscreen devices, free-hand sketching +has emerged as a promising modality for human-computer interaction. While +previous research has focused on tasks such as recognition, retrieval, and +generation of familiar everyday objects, this study aims to create a Sketch +Input Method Editor (SketchIME) specifically designed for a professional C4I +system. Within this system, sketches are utilized as low-fidelity prototypes +for recommending standardized symbols in the creation of comprehensive +situation maps. This paper also presents a systematic dataset comprising 374 +specialized sketch types, and proposes a simultaneous recognition and +segmentation architecture with multilevel supervision between recognition and +segmentation to improve performance and enhance interpretability. By +incorporating few-shot domain adaptation and class-incremental learning, the +network's ability to adapt to new users and extend to new task-specific classes +is significantly enhanced. Results from experiments conducted on both the +proposed dataset and the SPG dataset illustrate the superior performance of the +proposed architecture. Our dataset and code are publicly available at +https://github.com/GuangmingZhu/SketchIME. + +
+
+ comment: The paper has been accepted by ACM Multimedia 2023 +
+
+
+
+
+ + ♻ ☆ High-Fidelity Lake Extraction via Two-Stage Prompt Enhancement: + Establishing a Novel Baseline and Benchmark ICME 2024 + + +
+ Lake extraction from remote sensing imagery is a complex challenge due to the +varied lake shapes and data noise. Current methods rely on multispectral image +datasets, making it challenging to learn lake features accurately from pixel +arrangements. This, in turn, affects model learning and the creation of +accurate segmentation masks. This paper introduces a prompt-based dataset +construction approach that provides approximate lake locations using point, +box, and mask prompts. We also propose a two-stage prompt enhancement +framework, LEPrompter, with prompt-based and prompt-free stages during +training. The prompt-based stage employs a prompt encoder to extract prior +information, integrating prompt tokens and image embedding through self- and +cross-attention in the prompt decoder. Prompts are deactivated to ensure +independence during inference, enabling automated lake extraction without +introducing additional parameters and GFlops. Extensive experiments showcase +performance improvements of our proposed approach compared to the previous +state-of-the-art method. The source code is available at +https://github.com/BastianChen/LEPrompter. + +
+
+ comment: Accepted by ICME 2024 +
+
+
+
+
+ + ♻ ☆ Deep Neural Networks Fused with Textures for Image Classification + + +
+ Fine-grained image classification (FGIC) is a challenging task in computer
+vision due to small visual differences among sub-categories but large
+intra-class variations. Deep learning methods have achieved remarkable
+success in solving FGIC. In this paper, we propose a fusion approach to
+address FGIC by combining global texture with local patch-based information.
+The first pipeline extracts deep features from various fixed-size
+non-overlapping patches and encodes them by sequential modelling using a long
+short-term memory (LSTM) network. Another path computes image-level textures
+at multiple scales using local binary patterns (LBP). The advantages of both
+streams are integrated to represent an efficient feature vector for image
+classification. The method is tested on eight datasets covering human faces,
+skin lesions, food dishes, marine life, etc., using four standard backbone
+CNNs. Our method attains better classification accuracy than existing methods
+by notable margins.
+
+
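+ A NumPy sketch of the texture stream described above: a basic 8-neighbour
+LBP code map and its histogram, which would then be concatenated with the
+patch-wise deep features. The single-scale LBP and the plain concatenation
+are simplifications of the multi-scale fusion used in the paper.
+
+import numpy as np
+
+def lbp_histogram(gray, bins=256):
+    # gray: (H, W) image. Returns a normalized 256-bin LBP histogram.
+    g = gray.astype(np.float32)
+    c = g[1:-1, 1:-1]
+    offsets = [(-1, -1), (-1, 0), (-1, 1), (0, 1),
+               (1, 1), (1, 0), (1, -1), (0, -1)]
+    code = np.zeros(c.shape, dtype=np.int32)
+    for bit, (dy, dx) in enumerate(offsets):
+        nb = g[1 + dy:g.shape[0] - 1 + dy, 1 + dx:g.shape[1] - 1 + dx]
+        code |= (nb >= c).astype(np.int32) << bit
+    hist, _ = np.histogram(code, bins=bins, range=(0, bins))
+    return hist / max(hist.sum(), 1)
+
+rng = np.random.default_rng(0)
+image = rng.integers(0, 256, size=(224, 224)).astype(np.uint8)
+deep_feat = rng.normal(size=512)     # stand-in for the LSTM-encoded patch stream
+fused = np.concatenate([deep_feat, lbp_histogram(image)])
+print(fused.shape)                   # (768,)
+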
+
+ comment: 14 pages, 6 figures, 4 tables, conference +
+
+
+
+
+ + ♻ ☆ Object-level Geometric Structure Preserving for Natural Image Stitching + + +
+ The topic of stitching images with globally natural structures holds +paramount significance. Current methodologies exhibit the ability to preserve +local geometric structures, yet fall short in maintaining relationships between +these geometric structures. In this paper, we endeavor to safeguard the +overall, OBJect-level structures within images based on Global Similarity +Prior, while concurrently mitigating distortion and ghosting artifacts with +OBJ-GSP. Our approach leverages the Segment Anything Model to extract geometric +structures with semantic information, enhancing the algorithm's ability to +preserve objects in a manner that aligns more intuitively with human +perception. We seek to identify spatial constraints that govern the +relationships between various geometric boundaries. Recognizing that multiple +geometric boundaries collectively define complete objects, we employ triangular +meshes to safeguard not only individual geometric structures but also the +overall shapes of objects within the images. Empirical evaluations across +multiple image stitching datasets demonstrate that our method establishes a new +state-of-the-art benchmark in image stitching. Our implementation and dataset +is publicly available at https://github.com/RussRobin/OBJ-GSP . + +
+
+
+
+
+ + ♻ ☆ Self-Adaptive Sampling for Efficient Video Question-Answering on + Image--Text Models NAACL 2024 + + +
+ Video question-answering is a fundamental task in the field of video
+understanding. Although current vision--language models (VLMs) equipped with
+Video Transformers have enabled temporal modeling and yielded superior
+results, they come at the cost of huge computational power and are thus too
+expensive to deploy in real-time application scenarios. An economical
+workaround samples only a small portion of frames to represent the main
+content of the video and tunes an image--text model on these sampled frames.
+Recent video understanding models usually randomly sample a set of frames or
+clips, regardless of the internal correlations between their visual contents
+or their relevance to the question. We argue that such aimless sampling may
+omit the key frames from which the correct answer can be deduced, and the
+situation gets worse as the sampling sparsity increases, which always happens
+as video length grows. To mitigate this issue, we propose two frame sampling
+strategies, namely the most domain frames (MDF) and most implied frames
+(MIF), to maximally preserve those frames that are most likely vital to the
+given questions. MDF passively minimizes the risk of key frame omission in a
+bootstrap manner, while MIF actively searches for key frames customized for
+each video--question pair with the assistance of auxiliary models. The
+experimental results on three public datasets with three advanced VLMs (CLIP,
+GIT and All-in-one) demonstrate that our proposed strategies can boost the
+performance of image--text pretrained models. The source codes pertaining to
+the method proposed in this paper are publicly available at
+https://github.com/declare-lab/sas-vqa.
+
+
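+ A small NumPy sketch in the spirit of the MDF strategy described above:
+greedily pick frames whose (here: intensity-histogram) features are most
+dissimilar from the frames already selected, so redundant frames are avoided.
+The histogram features and the greedy max-min rule are illustrative
+assumptions, not the paper's exact procedure.
+
+import numpy as np
+
+def frame_features(video, bins=32):
+    # video: (T, H, W) grayscale. One intensity histogram per frame.
+    return np.stack([np.histogram(f, bins=bins, range=(0, 256), density=True)[0]
+                     for f in video])
+
+def sample_distinct_frames(video, k=8):
+    feats = frame_features(video)
+    chosen = [0]                                    # start from the first frame
+    while len(chosen) < k:
+        d = np.linalg.norm(feats[:, None, :] - feats[None, chosen, :], axis=-1)
+        min_d = d.min(axis=1)                       # distance to closest chosen frame
+        min_d[chosen] = -1                          # never re-pick a chosen frame
+        chosen.append(int(min_d.argmax()))          # farthest-point selection
+    return sorted(chosen)
+
+video = np.random.default_rng(0).integers(0, 256, size=(120, 64, 64))
+print(sample_distinct_frames(video, k=8))
+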
+
+ comment: 13 pages, 7 figures, accepted to Findings of NAACL 2024 +
+
+
+
+
+ + ♻ ☆ HiPose: Hierarchical Binary Surface Encoding and Correspondence Pruning + for RGB-D 6DoF Object Pose Estimation CVPR 2024 + + +
+ In this work, we present a novel dense-correspondence method for 6DoF object +pose estimation from a single RGB-D image. While many existing data-driven +methods achieve impressive performance, they tend to be time-consuming due to +their reliance on rendering-based refinement approaches. To circumvent this +limitation, we present HiPose, which establishes 3D-3D correspondences in a +coarse-to-fine manner with a hierarchical binary surface encoding. Unlike +previous dense-correspondence methods, we estimate the correspondence surface +by employing point-to-surface matching and iteratively constricting the surface +until it becomes a correspondence point while gradually removing outliers. +Extensive experiments on public benchmarks LM-O, YCB-V, and T-Less demonstrate +that our method surpasses all refinement-free methods and is even on par with +expensive refinement-based approaches. Crucially, our approach is +computationally efficient and enables real-time critical applications with high +accuracy requirements. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ CECT: Controllable Ensemble CNN and Transformer for COVID-19 Image + Classification + + +
+ The COVID-19 pandemic has resulted in hundreds of million cases and numerous +deaths worldwide. Here, we develop a novel classification network CECT by +controllable ensemble convolutional neural network and transformer to provide a +timely and accurate COVID-19 diagnosis. The CECT is composed of a parallel +convolutional encoder block, an aggregate transposed-convolutional decoder +block, and a windowed attention classification block. Each block captures +features at different scales from 28 $\times$ 28 to 224 $\times$ 224 from the +input, composing enriched and comprehensive information. Different from +existing methods, our CECT can capture features at both multi-local and global +scales without any sophisticated module design. Moreover, the contribution of +local features at different scales can be controlled with the proposed ensemble +coefficients. We evaluate CECT on two public COVID-19 datasets and it reaches +the highest accuracy of 98.1% in the intra-dataset evaluation, outperforming +existing state-of-the-art methods. Moreover, the developed CECT achieves an +accuracy of 90.9% on the unseen dataset in the inter-dataset evaluation, +showing extraordinary generalization ability. With remarkable feature capture +ability and generalization ability, we believe CECT can be extended to other +medical scenarios as a powerful diagnosis tool. Code is available at +https://github.com/NUS-Tim/CECT. + +
+
+ comment: Computers in Biology and Medicine Accepted +
+
+
+
+
+ + ♻ ☆ CAT-Seg: Cost Aggregation for Open-Vocabulary Semantic Segmentation CVPR 2024 + + +
+ Open-vocabulary semantic segmentation presents the challenge of labeling each +pixel within an image based on a wide range of text descriptions. In this work, +we introduce a novel cost-based approach to adapt vision-language foundation +models, notably CLIP, for the intricate task of semantic segmentation. Through +aggregating the cosine similarity score, i.e., the cost volume between image +and text embeddings, our method potently adapts CLIP for segmenting seen and +unseen classes by fine-tuning its encoders, addressing the challenges faced by +existing methods in handling unseen classes. Building upon this, we explore +methods to effectively aggregate the cost volume considering its multi-modal +nature of being established between image and text embeddings. Furthermore, we +examine various methods for efficiently fine-tuning CLIP. + +
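+ A minimal PyTorch sketch of the cost volume described above: cosine
+similarities between per-pixel image embeddings and per-class text
+embeddings, which the aggregation modules would then refine. The shapes and
+the absence of any aggregation are assumptions of this toy version.
+
+import torch
+import torch.nn.functional as F
+
+def cost_volume(image_feats, text_feats):
+    # image_feats: (B, D, H, W) dense image embeddings.
+    # text_feats: (K, D) class text embeddings. Returns (B, K, H, W) costs.
+    img = F.normalize(image_feats, dim=1)
+    txt = F.normalize(text_feats, dim=1)
+    return torch.einsum("bdhw,kd->bkhw", img, txt)
+
+cost = cost_volume(torch.randn(2, 512, 24, 24), torch.randn(171, 512))
+print(cost.shape)                     # torch.Size([2, 171, 24, 24])
+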
+
+ comment: Accepted to CVPR 2024. Project page: + https://ku-cvlab.github.io/CAT-Seg/ +
+
+
+
+
+ + ♻ ☆ Can Language Models Laugh at YouTube Short-form Videos? EMNLP 2023 + + +
+ As short-form funny videos on social networks are gaining popularity, it +becomes demanding for AI models to understand them for better communication +with humans. Unfortunately, previous video humor datasets target specific +domains, such as speeches or sitcoms, and mostly focus on verbal cues. We +curate a user-generated dataset of 10K multimodal funny videos from YouTube, +called ExFunTube. Using a video filtering pipeline with GPT-3.5, we verify both +verbal and visual elements contributing to humor. After filtering, we annotate +each video with timestamps and text explanations for funny moments. Our +ExFunTube is unique over existing datasets in that our videos cover a wide +range of domains with various types of humor that necessitate a multimodal +understanding of the content. Also, we develop a zero-shot video-to-text +prompting to maximize video humor understanding of large language models +(LLMs). With three different evaluation methods using automatic scores, +rationale quality experiments, and human evaluations, we show that our +prompting significantly improves LLMs' ability for humor explanation. + +
+
+ comment: EMNLP 2023; references added +
+
+
+
+
+ + ♻ ☆ SPIDeRS: Structured Polarization for Invisible Depth and Reflectance + Sensing CVPR 2024 + + +
+ Can we capture shape and reflectance in stealth? Such capability would be +valuable for many application domains in vision, xR, robotics, and HCI. We +introduce structured polarization for invisible depth and reflectance sensing +(SPIDeRS), the first depth and reflectance sensing method using patterns of +polarized light. The key idea is to modulate the angle of linear polarization +(AoLP) of projected light at each pixel. The use of polarization makes it +invisible and lets us recover not only depth but also directly surface normals +and even reflectance. We implement SPIDeRS with a liquid crystal spatial light +modulator (SLM) and a polarimetric camera. We derive a novel method for +robustly extracting the projected structured polarization pattern from the +polarimetric object appearance. We evaluate the effectiveness of SPIDeRS by +applying it to a number of real-world objects. The results show that our method +successfully reconstructs object shapes of various materials and is robust to +diffuse reflection and ambient light. We also demonstrate relighting using +recovered surface normals and reflectance. We believe SPIDeRS opens a new +avenue of polarization use in visual sensing. + +
+
+ comment: to be published in CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Neural Parametric Gaussians for Monocular Non-Rigid Object + Reconstruction CVPR 2024 + + +
+ Reconstructing dynamic objects from monocular videos is a severely +underconstrained and challenging problem, and recent work has approached it in +various directions. However, owing to the ill-posed nature of this problem, +there has been no solution that can provide consistent, high-quality novel +views from camera positions that are significantly different from the training +views. In this work, we introduce Neural Parametric Gaussians (NPGs) to take on +this challenge by imposing a two-stage approach: first, we fit a low-rank +neural deformation model, which then is used as regularization for non-rigid +reconstruction in the second stage. The first stage learns the object's +deformations such that it preserves consistency in novel views. The second +stage obtains high reconstruction quality by optimizing 3D Gaussians that are +driven by the coarse model. To this end, we introduce a local 3D Gaussian +representation, where temporally shared Gaussians are anchored in and deformed +by local oriented volumes. The resulting combined model can be rendered as +radiance fields, resulting in high-quality photo-realistic reconstructions of +the non-rigidly deforming objects. We demonstrate that NPGs achieve superior +results compared to previous works, especially in challenging scenarios with +few multi-view cues. + +
+
+ comment: Accepted at CVPR 2024 | Project Website: + https://geometric-rl.mpi-inf.mpg.de/npg +
+
+
+
+
+ + ♻ ☆ Deep Convolutional Framelet Denoising for Panoramic by Mixed Wavelet + Integration + + +
+ Enhancing quality and removing noise during preprocessing is one of the most
+critical steps in image processing. X-ray images are created by photons
+colliding with atoms and by variations in the absorption of scattered noise.
+This noise degrades the medical quality of the image and, at times, forces
+the scan to be repeated, thereby increasing the patient's effective dose.
+Lowering the image noise has therefore consistently been one of the most
+critical challenges in this area, and techniques such as BM3D, low-pass
+filters, and autoencoders have been applied to it. Owing to their structural
+design and high capacity for repetition, neural networks employing diverse
+architectures have, over the past decade, achieved noise reduction with
+satisfactory outcomes, surpassing the traditional BM3D and low-pass filters.
+The combination of the Hankel matrix with neural networks represents one of
+these configurations. The Hankel matrix aims to identify a local circle by
+separating individual values into local and non-local components, utilizing a
+non-local matrix that can be created using a wavelet or the DCT. This paper
+suggests combining this framework with the Daubechies (D4) wavelet, owing to
+its higher energy concentration, and employs the U-Net neural network
+architecture, which incorporates the wavelet exclusively at each stage. The
+outcomes were evaluated using the PSNR and SSIM criteria and verified with
+various wavelets. According to studies on other datasets, the effectiveness
+of a single-wavelet network improved from 0.5% to 1.2%.
+
+
+
+
+
+
+ + ♻ ☆ OCTDL: Optical Coherence Tomography Dataset for Image-Based Deep + Learning Methods + + +
+ Optical coherence tomography (OCT) is a non-invasive imaging technique with +extensive clinical applications in ophthalmology. OCT enables the visualization +of the retinal layers, playing a vital role in the early detection and +monitoring of retinal diseases. OCT uses the principle of light wave +interference to create detailed images of the retinal microstructures, making +it a valuable tool for diagnosing ocular conditions. This work presents an +open-access OCT dataset (OCTDL) comprising over 2000 OCT images labeled +according to disease group and retinal pathology. The dataset consists of OCT +records of patients with Age-related Macular Degeneration (AMD), Diabetic +Macular Edema (DME), Epiretinal Membrane (ERM), Retinal Artery Occlusion (RAO), +Retinal Vein Occlusion (RVO), and Vitreomacular Interface Disease (VID). The +images were acquired with an Optovue Avanti RTVue XR using raster scanning +protocols with dynamic scan length and image resolution. Each retinal b-scan +was acquired by centering on the fovea and interpreted and cataloged by an +experienced retinal specialist. In this work, we applied Deep Learning +classification techniques to this new open-access dataset. + +
+
+
+
+
+ + ♻ ☆ Adaptive Surface Normal Constraint for Geometric Estimation from + Monocular Images + + +
+ We introduce a novel approach to learn geometries such as depth and surface +normal from images while incorporating geometric context. The difficulty of +reliably capturing geometric context in existing methods impedes their ability +to accurately enforce the consistency between the different geometric +properties, thereby leading to a bottleneck of geometric estimation quality. We +therefore propose the Adaptive Surface Normal (ASN) constraint, a simple yet +efficient method. Our approach extracts geometric context that encodes the +geometric variations present in the input image and correlates depth estimation +with geometric constraints. By dynamically determining reliable local geometry +from randomly sampled candidates, we establish a surface normal constraint, +where the validity of these candidates is evaluated using the geometric +context. Furthermore, our normal estimation leverages the geometric context to +prioritize regions that exhibit significant geometric variations, which makes +the predicted normals accurately capture intricate and detailed geometric +information. Through the integration of geometric context, our method unifies +depth and surface normal estimations within a cohesive framework, which enables +the generation of high-quality 3D geometry from images. We validate the +superiority of our approach over state-of-the-art methods through extensive +evaluations and comparisons on diverse indoor and outdoor datasets, showcasing +its efficiency and robustness. + +
+
+ comment: Accepted by TPAMI. arXiv admin note: substantial text overlap with + arXiv:2103.15483 +
+
+
+
+
+ + ♻ ☆ HAVE-FUN: Human Avatar Reconstruction from Few-Shot Unconstrained Images + + +
+ As for human avatar reconstruction, contemporary techniques commonly +necessitate the acquisition of costly data and struggle to achieve satisfactory +results from a small number of casual images. In this paper, we investigate +this task from a few-shot unconstrained photo album. The reconstruction of +human avatars from such data sources is challenging because of limited data +amount and dynamic articulated poses. For handling dynamic data, we integrate a +skinning mechanism with deep marching tetrahedra (DMTet) to form a drivable +tetrahedral representation, which drives arbitrary mesh topologies generated by +the DMTet for the adaptation of unconstrained images. To effectively mine +instructive information from few-shot data, we devise a two-phase optimization +method with few-shot reference and few-shot guidance. The former focuses on +aligning avatar identity with reference images, while the latter aims to +generate plausible appearances for unseen regions. Overall, our framework, +called HaveFun, can undertake avatar reconstruction, rendering, and animation. +Extensive experiments on our developed benchmarks demonstrate that HaveFun +exhibits substantially superior performance in reconstructing the human body +and hand. Project website: https://seanchenxy.github.io/HaveFunWeb/. + +
+
+
+
+
+ + ♻ ☆ DiffBIR: Towards Blind Image Restoration with Generative Diffusion Prior + + +
+ We present DiffBIR, a general restoration pipeline that could handle +different blind image restoration tasks in a unified framework. DiffBIR +decouples blind image restoration problem into two stages: 1) degradation +removal: removing image-independent content; 2) information regeneration: +generating the lost image content. Each stage is developed independently but +they work seamlessly in a cascaded manner. In the first stage, we use +restoration modules to remove degradations and obtain high-fidelity restored +results. For the second stage, we propose IRControlNet that leverages the +generative ability of latent diffusion models to generate realistic details. +Specifically, IRControlNet is trained based on specially produced condition +images without distracting noisy content for stable generation performance. +Moreover, we design a region-adaptive restoration guidance that can modify the +denoising process during inference without model re-training, allowing users to +balance realness and fidelity through a tunable guidance scale. Extensive +experiments have demonstrated DiffBIR's superiority over state-of-the-art +approaches for blind image super-resolution, blind face restoration and blind +image denoising tasks on both synthetic and real-world datasets. The code is +available at https://github.com/XPixelGroup/DiffBIR. + +
+
+
+
+
+ + ♻ ☆ Prompt Tuning with Soft Context Sharing for Vision-Language Models + + +
+ Vision-language models have recently shown great potential on many tasks in
+computer vision. Meanwhile, prior work demonstrates that prompt tuning
+designed for vision-language models can achieve superior performance on
+few-shot image recognition compared to linear probing, a strong baseline. In
+practice, many few-shot tasks are inherently correlated, particularly within
+specialized domains. However, such information has been overlooked
+previously. Inspired by the fact that modeling task relationships by
+multi-task learning can usually boost performance, we propose a novel method,
+SoftCPT (Soft Context Sharing for Prompt Tuning), to tune pre-trained
+vision-language models on multiple target few-shot tasks jointly.
+Specifically, we design a task-shared meta network to generate the prompt
+context for each task, using the task name together with a learnable task
+context as input. The parameters of this meta network as well as the task
+context are tuned on the joint training set of all tasks. As such, the prompt
+context of all tasks will be shared in a soft manner. Extensive experiments
+across four multi-task few-shot datasets covering 44 tasks and 1593
+categories demonstrate that SoftCPT significantly outperforms single-task
+prompt tuning methods, highlighting the effectiveness of multi-task learning
+for vision-language prompt tuning. Code is available at
+https://github.com/kding1225/softcpt.
+
+
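+ A toy PyTorch sketch of the task-shared meta network described above: it
+maps a task-name embedding plus a learnable task context to a prompt context,
+so prompt contexts are shared softly across tasks. The encoder for task
+names, all dimensions, and the concatenation scheme are illustrative
+assumptions.
+
+import torch
+import torch.nn as nn
+
+class SoftContextMetaNet(nn.Module):
+    def __init__(self, num_tasks, name_dim=64, ctx_dim=32, prompt_len=4, embed_dim=512):
+        super().__init__()
+        self.task_context = nn.Parameter(torch.zeros(num_tasks, ctx_dim))
+        self.meta = nn.Sequential(
+            nn.Linear(name_dim + ctx_dim, 256), nn.ReLU(),
+            nn.Linear(256, prompt_len * embed_dim))
+        self.prompt_len, self.embed_dim = prompt_len, embed_dim
+
+    def forward(self, task_id, task_name_feat):
+        # task_name_feat: (name_dim,) embedding of the task name.
+        x = torch.cat([task_name_feat, self.task_context[task_id]], dim=-1)
+        return self.meta(x).view(self.prompt_len, self.embed_dim)
+
+meta = SoftContextMetaNet(num_tasks=44)
+prompt_ctx = meta(task_id=3, task_name_feat=torch.randn(64))
+print(prompt_ctx.shape)               # torch.Size([4, 512]), prepended to class tokens
+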
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ Animatable Gaussians: Learning Pose-dependent Gaussian Maps for + High-fidelity Human Avatar Modeling CVPR 2024 + + +
+ Modeling animatable human avatars from RGB videos is a long-standing and +challenging problem. Recent works usually adopt MLP-based neural radiance +fields (NeRF) to represent 3D humans, but it remains difficult for pure MLPs to +regress pose-dependent garment details. To this end, we introduce Animatable +Gaussians, a new avatar representation that leverages powerful 2D CNNs and 3D +Gaussian splatting to create high-fidelity avatars. To associate 3D Gaussians +with the animatable avatar, we learn a parametric template from the input +videos, and then parameterize the template on two front \& back canonical +Gaussian maps where each pixel represents a 3D Gaussian. The learned template +is adaptive to the wearing garments for modeling looser clothes like dresses. +Such template-guided 2D parameterization enables us to employ a powerful +StyleGAN-based CNN to learn the pose-dependent Gaussian maps for modeling +detailed dynamic appearances. Furthermore, we introduce a pose projection +strategy for better generalization given novel poses. Overall, our method can +create lifelike avatars with dynamic, realistic and generalized appearances. +Experiments show that our method outperforms other state-of-the-art approaches. +Code: https://github.com/lizhe00/AnimatableGaussians + +
+
+ comment: Accepted by CVPR 2024, Projectpage: + https://animatable-gaussians.github.io/, Code: + https://github.com/lizhe00/AnimatableGaussians +
+
+
+
+
+ + ♻ ☆ G-PECNet: Towards a Generalizable Pedestrian Trajectory Prediction + System ICLR + + +
+ Navigating dynamic physical environments without obstructing or damaging +human assets is of quintessential importance for social robots. In this work, +we solve autonomous drone navigation's sub-problem of predicting out-of-domain +human and agent trajectories using a deep generative model. Our method: +General-PECNet or G-PECNet observes an improvement of 9.5\% on the Final +Displacement Error (FDE) on 2020's benchmark: PECNet through a combination of +architectural improvements inspired by periodic activation functions and +synthetic trajectory (data) augmentations using Hidden Markov Models (HMMs) and +Reinforcement Learning (RL). Additionally, we propose a simple +geometry-inspired metric for trajectory non-linearity and outlier detection, +helpful for the task. Code available at +https://github.com/Aryan-Garg/PECNet-Pedestrian-Trajectory-Prediction.git + +
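+ A NumPy sketch of one simple geometry-inspired non-linearity score of the
+kind mentioned above: the maximum perpendicular deviation of a trajectory
+from the straight line joining its endpoints, normalized by the endpoint
+displacement. This particular formula is an assumption, not necessarily the
+metric used in the paper.
+
+import numpy as np
+
+def nonlinearity(traj, eps=1e-8):
+    # traj: (T, 2) positions. Close to 0 for a straight path, larger if curved.
+    start, end = traj[0], traj[-1]
+    chord = end - start
+    length = np.linalg.norm(chord) + eps
+    direction = chord / length
+    rel = traj - start
+    along = rel @ direction
+    perp = rel - np.outer(along, direction)   # component orthogonal to the chord
+    return float(np.linalg.norm(perp, axis=1).max() / length)
+
+t = np.linspace(0, 1, 20)
+straight = np.stack([t, t], axis=1)
+curved = np.stack([t, np.sin(np.pi * t)], axis=1)
+print(nonlinearity(straight), nonlinearity(curved))   # ~0.0 vs. ~1.0
+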
+
+ comment: Notable ICLR Tiny Paper 2024 +
+
+
+
+
+ + ♻ ☆ LLMs as Bridges: Reformulating Grounded Multimodal Named Entity + Recognition + + +
+ Grounded Multimodal Named Entity Recognition (GMNER) is a nascent multimodal +task that aims to identify named entities, entity types and their corresponding +visual regions. GMNER task exhibits two challenging properties: 1) The weak +correlation between image-text pairs in social media results in a significant +portion of named entities being ungroundable. 2) There exists a distinction +between coarse-grained referring expressions commonly used in similar tasks +(e.g., phrase localization, referring expression comprehension) and +fine-grained named entities. In this paper, we propose RiVEG, a unified +framework that reformulates GMNER into a joint MNER-VE-VG task by leveraging +large language models (LLMs) as a connecting bridge. This reformulation brings +two benefits: 1) It maintains the optimal MNER performance and eliminates the +need for employing object detection methods to pre-extract regional features, +thereby naturally addressing two major limitations of existing GMNER methods. +2) The introduction of entity expansion expression and Visual Entailment (VE) +Module unifies Visual Grounding (VG) and Entity Grounding (EG). It enables +RiVEG to effortlessly inherit the Visual Entailment and Visual Grounding +capabilities of any current or prospective multimodal pretraining models. +Extensive experiments demonstrate that RiVEG outperforms state-of-the-art +methods on the existing GMNER dataset and achieves absolute leads of 10.65%, +6.21%, and 8.83% in all three subtasks. + +
+
+
+
+
+ + ♻ ☆ Decomposing Disease Descriptions for Enhanced Pathology Detection: A + Multi-Aspect Vision-Language Pre-training Framework CVPR2024 + + +
+ Medical vision language pre-training (VLP) has emerged as a frontier of +research, enabling zero-shot pathological recognition by comparing the query +image with the textual descriptions for each disease. Due to the complex +semantics of biomedical texts, current methods struggle to align medical images +with key pathological findings in unstructured reports. This leads to the +misalignment with the target disease's textual representation. In this paper, +we introduce a novel VLP framework designed to dissect disease descriptions +into their fundamental aspects, leveraging prior knowledge about the visual +manifestations of pathologies. This is achieved by consulting a large language +model and medical experts. Integrating a Transformer module, our approach +aligns an input image with the diverse elements of a disease, generating +aspect-centric image representations. By consolidating the matches from each +aspect, we improve the compatibility between an image and its associated +disease. Additionally, capitalizing on the aspect-oriented representations, we +present a dual-head Transformer tailored to process known and unknown diseases, +optimizing the comprehensive detection efficacy. Conducting experiments on +seven downstream datasets, ours improves the accuracy of recent methods by up +to 8.56% and 17.26% for seen and unseen categories, respectively. Our code is +released at https://github.com/HieuPhan33/MAVL. + +
+
+ comment: Accepted at CVPR2024. Pre-print before final camera-ready version +
+
+
+
+
+ + ♻ ☆ SchurVINS: Schur Complement-Based Lightweight Visual Inertial Navigation + System + + +
+ Accuracy and computational efficiency are the most important metrics for a +Visual Inertial Navigation System (VINS). Existing VINS algorithms offer either +high accuracy or low computational complexity, but struggle to provide +high-precision localization on resource-constrained devices. To this end, we +propose a novel filter-based VINS framework named SchurVINS, which guarantees +both high accuracy, by building a complete residual model, and low +computational complexity, via the Schur complement. Technically, we first +formulate the full residual model, in which the gradient, Hessian, and +observation covariance are explicitly modeled. The Schur complement is then +employed to decompose the full model into an ego-motion residual model and a +landmark residual model. Finally, the Extended Kalman Filter (EKF) update is +applied to these two models with high efficiency. Experiments on the EuRoC and +TUM-VI datasets show that our method notably outperforms state-of-the-art +(SOTA) methods in both accuracy and computational complexity. The experimental +code of SchurVINS is available at +https://github.com/bytedance/SchurVINS. + +
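The Schur-complement step described above can be illustrated generically: marginalizing the landmark block out of the full Gauss-Newton system leaves a small ego-motion system, and landmarks are recovered afterwards by back-substitution. A minimal NumPy sketch of this standard reduction (not SchurVINS's actual code):

```python
import numpy as np

def schur_marginalize(H, b, n_x):
    """Reduce the full system [Hxx Hxl; Hlx Hll] d = b onto the first n_x
    (ego-motion) variables via the Schur complement of the landmark block."""
    Hxx, Hxl = H[:n_x, :n_x], H[:n_x, n_x:]
    Hlx, Hll = H[n_x:, :n_x], H[n_x:, n_x:]
    bx, bl = b[:n_x], b[n_x:]
    Hll_inv = np.linalg.inv(Hll)        # block-diagonal in practice, hence cheap
    H_red = Hxx - Hxl @ Hll_inv @ Hlx   # Schur complement of Hll
    b_red = bx - Hxl @ Hll_inv @ bl
    return H_red, b_red

# Toy example: 6 ego-motion variables and 9 landmark variables.
rng = np.random.default_rng(0)
A = rng.standard_normal((30, 15))
H = A.T @ A + 1e-3 * np.eye(15)         # SPD Hessian approximation
b = rng.standard_normal(15)
H_red, b_red = schur_marginalize(H, b, n_x=6)
dx = np.linalg.solve(H_red, b_red)      # ego-motion update
print(dx.shape)                          # (6,)
```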
+
+
+
+
+ + ♻ ☆ Segment Anything Model for Road Network Graph Extraction + + +
+ We propose SAM-Road, an adaptation of the Segment Anything Model (SAM) for +extracting large-scale, vectorized road network graphs from satellite imagery. +To predict graph geometry, we formulate it as a dense semantic segmentation +task, leveraging the inherent strengths of SAM. The image encoder of SAM is +fine-tuned to produce probability masks for roads and intersections, from which +the graph vertices are extracted via simple non-maximum suppression. To predict +graph topology, we designed a lightweight transformer-based graph neural +network, which leverages the SAM image embeddings to estimate the edge +existence probabilities between vertices. Our approach directly predicts the +graph vertices and edges for large regions without expensive and complex +post-processing heuristics, and is capable of building complete road network +graphs spanning multiple square kilometers in a matter of seconds. With its +simple, straightforward, and minimalist design, SAM-Road achieves comparable +accuracy with the state-of-the-art method RNGDet++, while being 40 times faster +on the City-scale dataset. We thus demonstrate the power of a foundational +vision model when applied to a graph learning task. The code is available at +https://github.com/htcr/sam_road. + +
+
+
+
+
+ + ♻ ☆ RCooper: A Real-world Large-scale Dataset for Roadside Cooperative + Perception CVPR2024 + + +
+ The value of roadside perception, which can extend the boundaries of +autonomous driving and traffic management, has gradually become more prominent +and acknowledged in recent years. However, existing roadside perception +approaches focus only on single-infrastructure sensor systems, which cannot +realize a comprehensive understanding of a traffic area because of their +limited sensing range and blind spots. To achieve high-quality roadside +perception, we need Roadside Cooperative Perception (RCooper) to achieve +practical area-coverage roadside perception for restricted traffic areas. +RCooper has its own domain-specific challenges, but further exploration is +hindered by the lack of datasets. We hence release the first real-world, +large-scale RCooper dataset to foster research on practical roadside +cooperative perception, including detection and tracking. The manually +annotated dataset comprises 50k images and 30k point clouds, covering two +representative traffic scenes (i.e., intersection and corridor). The +constructed benchmarks demonstrate the effectiveness of roadside cooperative +perception and point out directions for further research. Code and dataset can +be accessed at: +https://github.com/AIR-THU/DAIR-RCooper. + +
+
+ comment: Accepted by CVPR2024. 10 pages with 6 figures +
+
+
+
+
+ + ♻ ☆ 3D Reconstruction of Interacting Multi-Person in Clothing from a Single + Image WACV 2024 + + +
+ This paper introduces a novel pipeline to reconstruct the geometry of +interacting multi-person in clothing in a globally coherent scene space from a +single image. The main challenge arises from occlusion: a part of a human body +is not visible from a single view due to occlusion by others or by the self, +which introduces missing geometry and physical implausibility (e.g., +penetration). We overcome this challenge by utilizing two human priors for +complete 3D geometry and surface contacts. For the geometry prior, an encoder +learns to regress the image of a person with missing body parts to latent +vectors; a decoder decodes these vectors to produce 3D features of the +associated geometry; and an implicit network combines these features with a +surface normal map to reconstruct a complete and detailed 3D human. For the +contact prior, we develop an image-space contact detector that outputs a +probability distribution of surface contacts between people in 3D. We use these +priors to globally refine the body poses, enabling penetration-free and +accurate reconstruction of interacting multi-person in clothing in the scene +space. The results demonstrate that our method is complete, globally coherent, +and physically plausible compared to existing methods. + +
+
+ comment: Accepted to WACV 2024 +
+
+
+
+
+ + ♻ ☆ LangSplat: 3D Language Gaussian Splatting CVPR 2024 + + +
+ Humans live in a 3D world and commonly use natural language to interact with +a 3D scene. Modeling a 3D language field to support open-ended language queries +in 3D has gained increasing attention recently. This paper introduces +LangSplat, which constructs a 3D language field that enables precise and +efficient open-vocabulary querying within 3D spaces. Unlike existing methods +that ground CLIP language embeddings in a NeRF model, LangSplat advances the +field by utilizing a collection of 3D Gaussians, each encoding language +features distilled from CLIP, to represent the language field. By employing a +tile-based splatting technique for rendering language features, we circumvent +the costly rendering process inherent in NeRF. Instead of directly learning +CLIP embeddings, LangSplat first trains a scene-wise language autoencoder and +then learns language features on the scene-specific latent space, thereby +alleviating substantial memory demands imposed by explicit modeling. Existing +methods struggle with imprecise and vague 3D language fields, which fail to +discern clear boundaries between objects. We delve into this issue and propose +to learn hierarchical semantics using SAM, thereby eliminating the need for +extensively querying the language field across various scales and the +regularization of DINO features. Extensive experimental results show that +LangSplat significantly outperforms the previous state-of-the-art method LERF +by a large margin. Notably, LangSplat is extremely efficient, achieving a 199 +$\times$ speedup compared to LERF at the resolution of 1440 $\times$ 1080. We +strongly recommend readers to check out our video results at +https://langsplat.github.io/ + +
+
+ comment: CVPR 2024. Project Page: https://langsplat.github.io +
+
+
+
+
+ + ♻ ☆ Guided Slot Attention for Unsupervised Video Object Segmentation CVPR 2024 + + +
+ Unsupervised video object segmentation aims to segment the most prominent +object in a video sequence. However, the existence of complex backgrounds and +multiple foreground objects make this task challenging. To address this issue, +we propose a guided slot attention network to reinforce spatial structural +information and obtain better foreground--background separation. The foreground +and background slots, which are initialized with query guidance, are +iteratively refined based on interactions with template information. +Furthermore, to improve slot--template interaction and effectively fuse global +and local features in the target and reference frames, K-nearest neighbors +filtering and a feature aggregation transformer are introduced. The proposed +model achieves state-of-the-art performance on two popular datasets. +Additionally, we demonstrate the robustness of the proposed model in +challenging scenes through various comparative experiments. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ LEMON: Learning 3D Human-Object Interaction Relation from 2D Images CVPR2024 + + +
+ Learning 3D human-object interaction relation is pivotal to embodied AI and +interaction modeling. Most existing methods approach the goal by learning to +predict isolated interaction elements, e.g., human contact, object affordance, +and human-object spatial relation, primarily from the perspective of either the +human or the object. Such approaches underexploit certain correlations between +the interaction counterparts (human and object) and struggle to address the +uncertainty in interactions. In fact, an object's functionality potentially +affects the human's interaction intention, which reveals what the interaction +is. Meanwhile, the interacting humans and objects exhibit matching geometric +structures, which indicates how to interact. In light of this, we propose +harnessing these inherent correlations between interaction counterparts to +mitigate the uncertainty and jointly anticipate the above interaction elements +in 3D space. To achieve this, we present LEMON (LEarning 3D huMan-Object +iNteraction relation), a unified model that mines the interaction intentions of +the counterparts and employs curvatures to guide the extraction of geometric +correlations, combining them to anticipate the interaction elements. In +addition, the 3D Interaction Relation dataset (3DIR) is collected to serve as +the test bed for training and evaluation. Extensive experiments demonstrate the +superiority of LEMON over methods estimating each element in isolation. + +
+
+ comment: accept by CVPR2024 +
+
+
+
+
+ + ♻ ☆ Improved Probabilistic Image-Text Representations ICLR 2024 + + +
+ Image-Text Matching (ITM) task, a fundamental vision-language (VL) task, +suffers from the inherent ambiguity arising from multiplicity and imperfect +annotations. Deterministic functions are not sufficiently powerful to capture +ambiguity, prompting the exploration of probabilistic embeddings to tackle the +challenge. However, the existing probabilistic ITM approach encounters two key +shortcomings; the burden of heavy computations due to the Monte Carlo +approximation, and the loss saturation issue in the face of abundant false +negatives. To overcome the issues, this paper presents an improved +Probabilistic Cross-Modal Embeddings (named PCME++) by introducing a new +probabilistic distance with a closed-form solution. In addition, two +optimization techniques are proposed to enhance PCME++ further: first, the +incorporation of pseudo-positives to prevent the negative effect under massive +false negatives; second, mixed sample data augmentation for probabilistic +matching. Experimental results on MS-COCO Caption and two extended benchmarks, +CxC and ECCV Caption, demonstrate the effectiveness of PCME++ compared to +state-of-the-art ITM methods. The robustness of PCME++ is also evaluated under +noisy image-text correspondences. In addition, the potential applicability of +PCME++ in automatic prompt-filtering for zero-shot classification is shown. The +code is available at https://github.com/naver-ai/pcmepp + +
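The "closed-form probabilistic distance" mentioned above can be illustrated with the standard closed form of the expected squared Euclidean distance between two diagonal-Gaussian embeddings, which removes the need for Monte Carlo sampling; the exact distance and its mapping to a matching probability used by PCME++ are defined in the paper.

```python
import torch

def expected_sq_distance(mu1, logvar1, mu2, logvar2):
    """E||z1 - z2||^2 for independent z_i ~ N(mu_i, diag(exp(logvar_i))):
    ||mu1 - mu2||^2 + sum(var1) + sum(var2), computed in closed form."""
    return ((mu1 - mu2) ** 2).sum(-1) + logvar1.exp().sum(-1) + logvar2.exp().sum(-1)

# Toy probabilistic image/text embeddings with hypothetical dimensions.
mu_img, lv_img = torch.randn(8, 256), torch.full((8, 256), -2.0)
mu_txt, lv_txt = torch.randn(8, 256), torch.full((8, 256), -2.0)
dist = expected_sq_distance(mu_img, lv_img, mu_txt, lv_txt)   # (8,)
match_score = -dist        # smaller expected distance -> higher match score
print(match_score.shape)   # torch.Size([8])
```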
+
+ comment: ICLR 2024 camera-ready; Code: https://github.com/naver-ai/pcmepp. + Project page: https://naver-ai.github.io/pcmepp/. 30 pages, 2.2 MB +
+
+
+
+
+ + ♻ ☆ Deep Semantic Segmentation of Natural and Medical Images: A Review + + +
+ The semantic image segmentation task consists of classifying each pixel of an +image into an instance, where each instance corresponds to a class. This task +is a part of the concept of scene understanding or better explaining the global +context of an image. In the medical image analysis domain, image segmentation +can be used for image-guided interventions, radiotherapy, or improved +radiological diagnostics. In this review, we categorize the leading deep +learning-based medical and non-medical image segmentation solutions into six +main groups of deep architectural, data synthesis-based, loss function-based, +sequenced models, weakly supervised, and multi-task methods and provide a +comprehensive review of the contributions in each of these groups. Further, for +each group, we analyze each variant of these groups and discuss the limitations +of the current approaches and present potential future research directions for +semantic image segmentation. + +
+
+ comment: 45 pages, 16 figures. Accepted for publication in Springer Artificial + Intelligence Review +
+
+
+
+
+ + ♻ ☆ Resolution Limit of Single-Photon LiDAR + + +
+ Single-photon Light Detection and Ranging (LiDAR) systems are often equipped +with an array of detectors for improved spatial resolution and sensing speed. +However, given a fixed amount of flux produced by the laser transmitter across +the scene, the per-pixel Signal-to-Noise Ratio (SNR) will decrease when more +pixels are packed in a unit space. This presents a fundamental trade-off +between the spatial resolution of the sensor array and the SNR received at each +pixel. Theoretical characterization of this fundamental limit is explored. By +deriving the photon arrival statistics and introducing a series of new +approximation techniques, the Mean Squared Error (MSE) of the +maximum-likelihood estimator of the time delay is derived. The theoretical +predictions align well with simulations and real data. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 113 + +
+
+
+ + ☆ 94% on CIFAR-10 in 3.29 Seconds on a Single GPU + + +
+ CIFAR-10 is among the most widely used datasets in machine learning, +facilitating thousands of research projects per year. To accelerate research +and reduce the cost of experiments, we introduce training methods for CIFAR-10 +which reach 94% accuracy in 3.29 seconds, 95% in 10.4 seconds, and 96% in 46.3 +seconds, when run on a single NVIDIA A100 GPU. As one factor contributing to +these training speeds, we propose a derandomized variant of horizontal flipping +augmentation, which we show improves over the standard method in every case +where flipping is beneficial over no flipping at all. Our code is released at +https://github.com/KellerJordan/cifar10-airbench. + +
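One way a derandomized horizontal flip could be realized, as a sketch: flip each training image deterministically on alternating epochs, so over any two consecutive epochs every image is seen exactly once in each orientation instead of with an independent coin flip. The repository's exact scheme may differ.

```python
import numpy as np

def alternating_flip(images, indices, epoch):
    """images: (N, H, W, C) batch; indices: dataset indices of those samples.
    Deterministically flip each sample on every other epoch (illustrative sketch)."""
    flip_mask = (indices + epoch) % 2 == 0        # alternates per sample across epochs
    out = images.copy()
    out[flip_mask] = out[flip_mask, :, ::-1, :]   # horizontal flip along the width axis
    return out

batch = np.zeros((4, 32, 32, 3), dtype=np.uint8)
idx = np.arange(4)
aug_even = alternating_flip(batch, idx, epoch=0)  # samples 0 and 2 flipped
aug_odd = alternating_flip(batch, idx, epoch=1)   # samples 1 and 3 flipped
print(aug_even.shape, aug_odd.shape)
```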
+
+
+
+
+ + ☆ Denoising Monte Carlo Renders With Diffusion Models + + +
+ Physically-based renderings contain Monte-Carlo noise, with variance that +increases as the number of rays per pixel decreases. This noise, while +zero-mean for good modern renderers, can have heavy tails (most notably, for +scenes containing specular or refractive objects). Learned methods for +restoring low fidelity renders are highly developed, because suppressing render +noise means one can save compute and use fast renders with few rays per pixel. +We demonstrate that a diffusion model can denoise low fidelity renders +successfully. Furthermore, our method can be conditioned on a variety of +natural render information, and this conditioning helps performance. +Quantitative experiments show that our method is competitive with SOTA across a +range of sampling rates, but current metrics slightly favor competitor methods. +Qualitative examination of the reconstructions suggests that the metrics +themselves may not be reliable. The image prior applied by a diffusion method +strongly favors reconstructions that are "like" real images -- so have straight +shadow boundaries, curved specularities, no "fireflies" and the like -- and +metrics do not account for this. We show numerous examples where methods +preferred by current metrics produce qualitatively weaker reconstructions than +ours. + +
+
+ comment: 14 pages, 12 figures +
+
+
+
+
+ + ☆ DiffHuman: Probabilistic Photorealistic 3D Reconstruction of Humans CVPR 2024 + + +
+ We present DiffHuman, a probabilistic method for photorealistic 3D human +reconstruction from a single RGB image. Despite the ill-posed nature of this +problem, most methods are deterministic and output a single solution, often +resulting in a lack of geometric detail and blurriness in unseen or uncertain +regions. In contrast, DiffHuman predicts a probability distribution over 3D +reconstructions conditioned on an input 2D image, which allows us to sample +multiple detailed 3D avatars that are consistent with the image. DiffHuman is +implemented as a conditional diffusion model that denoises pixel-aligned 2D +observations of an underlying 3D shape representation. During inference, we may +sample 3D avatars by iteratively denoising 2D renders of the predicted 3D +representation. Furthermore, we introduce a generator neural network that +approximates rendering with considerably reduced runtime (55x speed up), +resulting in a novel dual-branch diffusion framework. Our experiments show that +DiffHuman can produce diverse and detailed reconstructions for the parts of the +person that are unseen or uncertain in the input image, while remaining +competitive with the state-of-the-art when reconstructing visible surfaces. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Score-Based Diffusion Models for Photoacoustic Tomography Image + Reconstruction + + +
+ Photoacoustic tomography (PAT) is a rapidly-evolving medical imaging modality +that combines optical absorption contrast with ultrasound imaging depth. One +challenge in PAT is image reconstruction with inadequate acoustic signals due +to limited sensor coverage or due to the density of the transducer array. Such +cases call for solving an ill-posed inverse reconstruction problem. In this +work, we use score-based diffusion models to solve the inverse problem of +reconstructing an image from limited PAT measurements. The proposed approach +allows us to incorporate an expressive prior learned by a diffusion model on +simulated vessel structures while still being robust to varying transducer +sparsity conditions. + +
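A compact sketch of how a learned score-based prior can be combined with a (here linear) PAT measurement model during sampling: each step mixes the prior's score with a data-consistency gradient. The score network, forward operator, noise schedule, and step sizes below are placeholders, not the paper's actual components.

```python
import torch

def posterior_sample(score_net, A, y, shape, sigmas, step=1e-3, lam=1.0):
    """Annealed Langevin-style sampling with measurement guidance (sketch).
    A: linear forward operator stand-in, y: limited PAT measurements."""
    x = torch.randn(shape)
    for sigma in sigmas:                          # noise levels, high -> low
        for _ in range(10):
            x = x.detach().requires_grad_(True)
            prior_grad = score_net(x, sigma)      # learned score of the vessel prior
            residual = A @ x.flatten() - y        # data-consistency residual
            data_grad = torch.autograd.grad((residual ** 2).sum(), x)[0]
            noise = torch.randn_like(x)
            x = x + step * (prior_grad - lam * data_grad) + (2 * step) ** 0.5 * noise
    return x.detach()

# Placeholder score network and toy sparse-view measurement setup (illustrative only).
score_net = lambda x, sigma: -x / (sigma ** 2 + 1.0)
A = torch.randn(64, 16 * 16)
y = torch.randn(64)
recon = posterior_sample(score_net, A, y, shape=(16, 16), sigmas=[1.0, 0.5, 0.1])
print(recon.shape)   # torch.Size([16, 16])
```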
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ SceneGraphLoc: Cross-Modal Coarse Visual Localization on 3D Scene Graphs + + +
+ We introduce a novel problem, i.e., the localization of an input image within +a multi-modal reference map represented by a database of 3D scene graphs. These +graphs comprise multiple modalities, including object-level point clouds, +images, attributes, and relationships between objects, offering a lightweight +and efficient alternative to conventional methods that rely on extensive image +databases. Given the available modalities, the proposed method SceneGraphLoc +learns a fixed-sized embedding for each node (i.e., representing an object +instance) in the scene graph, enabling effective matching with the objects +visible in the input query image. This strategy significantly outperforms other +cross-modal methods, even without incorporating images into the map embeddings. +When images are leveraged, SceneGraphLoc achieves performance close to that of +state-of-the-art techniques depending on large image databases, while requiring +three orders-of-magnitude less storage and operating orders-of-magnitude +faster. The code will be made public. + +
+
+
+
+
+ + ☆ Multiway Point Cloud Mosaicking with Diffusion and Global Optimization + + +
+ We introduce a novel framework for multiway point cloud mosaicking (named +Wednesday), designed to co-align sets of partially overlapping point clouds -- +typically obtained from 3D scanners or moving RGB-D cameras -- into a unified +coordinate system. At the core of our approach is ODIN, a learned pairwise +registration algorithm that iteratively identifies overlaps and refines +attention scores, employing a diffusion-based process for denoising pairwise +correlation matrices to enhance matching accuracy. Further steps include +constructing a pose graph from all point clouds, performing rotation averaging, +a novel robust algorithm for re-estimating translations optimally in terms of +consensus maximization and translation optimization. Finally, the point cloud +rotations and positions are optimized jointly by a diffusion-based approach. +Tested on four diverse, large-scale datasets, our method achieves +state-of-the-art pairwise and multiway registration results by a large margin +on all benchmarks. Our code and models are available at +https://github.com/jinsz/Multiway-Point-Cloud-Mosaicking-with-Diffusion-and-Global-Optimization. + +
+
+
+
+
+ + ☆ Extracting Manifold Information from Point Clouds + + +
+ A kernel based method is proposed for the construction of signature +(defining) functions of subsets of $\mathbb{R}^d$. The subsets can range from +full dimensional manifolds (open subsets) to point clouds (a finite number of +points) and include bounded smooth manifolds of any codimension. The +interpolation and analysis of point clouds are the main application. Two +extreme cases in terms of regularity are considered, where the data set is +interpolated by an analytic surface, at the one extreme, and by a H\"older +continuous surface, at the other. The signature function can be computed as a +linear combination of translated kernels, the coefficients of which are the +solution of a finite dimensional linear problem. Once it is obtained, it can be +used to estimate the dimension as well as the normal and the curvatures of the +interpolated surface. The method is global and does not require explicit +knowledge of local neighborhoods or any other structure present in the data +set. It admits a variational formulation with a natural ``regularized'' +counterpart, that proves to be useful in dealing with data sets corrupted by +numerical error or noise. The underlying analytical structure of the approach +is presented in general before it is applied to the case of point clouds. + +
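The construction described above (a signature function written as a linear combination of translated kernels, with coefficients obtained from a finite linear system) can be sketched with a Gaussian kernel on a toy point cloud; the paper's kernels, regularity classes, and regularized variational formulation are more general.

```python
import numpy as np

def fit_signature(points, values, eps=0.3, reg=1e-8):
    """Solve K c = v for kernel coefficients; the signature function is then
    s(x) = sum_i c_i k(x, x_i) (sketch with a Gaussian kernel)."""
    d2 = ((points[:, None, :] - points[None, :, :]) ** 2).sum(-1)
    K = np.exp(-d2 / (2 * eps ** 2)) + reg * np.eye(len(points))   # regularized Gram matrix
    c = np.linalg.solve(K, values)
    def signature(x):
        d2x = ((x[:, None, :] - points[None, :, :]) ** 2).sum(-1)
        return np.exp(-d2x / (2 * eps ** 2)) @ c
    return signature

# Toy point cloud sampled on the unit circle; target values encode membership.
t = np.linspace(0, 2 * np.pi, 30, endpoint=False)
cloud = np.stack([np.cos(t), np.sin(t)], axis=1)
sig = fit_signature(cloud, np.ones(len(cloud)))
print(sig(np.array([[1.0, 0.0], [3.0, 3.0]])))   # close to 1 on the cloud, near 0 far away
```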
+
+ comment: 27 pages, 16 figures, 5 tables +
+
+
+
+
+ + ☆ Do Vision-Language Models Understand Compound Nouns? NAACL 2024 + + +
+ Open-vocabulary vision-language models (VLMs) like CLIP, trained using +contrastive loss, have emerged as a promising new paradigm for text-to-image +retrieval. However, do VLMs understand compound nouns (CNs) (e.g., lab coat) as +well as they understand nouns (e.g., lab)? We curate Compun, a novel benchmark +with 400 unique and commonly used CNs, to evaluate the effectiveness of VLMs in +interpreting CNs. The Compun benchmark challenges a VLM for text-to-image +retrieval where, given a text prompt with a CN, the task is to select the +correct image that shows the CN among a pair of distractor images that show the +constituent nouns that make up the CN. Next, we perform an in-depth analysis to +highlight CLIPs' limited understanding of certain types of CNs. Finally, we +present an alternative framework that moves beyond hand-written templates for +text prompts widely used by CLIP-like models. We employ a Large Language Model +to generate multiple diverse captions that include the CN as an object in the +scene described by the caption. Our proposed method improves CN understanding +of CLIP by 8.25% on Compun. Code and benchmark are available at: +https://github.com/sonalkum/Compun + +
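The retrieval protocol and the multi-caption prompting idea can be sketched with pre-computed, CLIP-style embeddings (random placeholders here): average the text embeddings of several LLM-generated captions that mention the compound noun, then choose the candidate image with the highest cosine similarity. The benchmark's actual prompts and embeddings come from the released code.

```python
import numpy as np

def pick_image(caption_embs, image_embs):
    """caption_embs: (M, D) embeddings of LLM-generated captions for one compound noun.
    image_embs: (K, D) embeddings of the correct image and its distractors.
    Returns the index of the best-matching image (sketch)."""
    t = caption_embs.mean(axis=0)
    t = t / np.linalg.norm(t)
    v = image_embs / np.linalg.norm(image_embs, axis=1, keepdims=True)
    return int(np.argmax(v @ t))                 # cosine similarity against the averaged prompt

rng = np.random.default_rng(0)
captions = rng.standard_normal((5, 512))         # e.g., 5 diverse captions mentioning "lab coat"
images = rng.standard_normal((3, 512))           # correct image + 2 distractors ("lab", "coat")
print(pick_image(captions, images))
```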
+
+ comment: Accepted to NAACL 2024 Main Conference +
+
+
+
+
+ + ☆ Continual Learning for Autonomous Robots: A Prototype-based Approach IROS + + +
+ Humans and animals learn throughout their lives from limited amounts of +sensed data, both with and without supervision. Autonomous, intelligent robots +of the future are often expected to do the same. The existing continual +learning (CL) methods are usually not directly applicable to robotic settings: +they typically require buffering and a balanced replay of training data. A +few-shot online continual learning (FS-OCL) setting has been proposed to +address more realistic scenarios where robots must learn from a non-repeated +sparse data stream. To enable truly autonomous life-long learning, an +additional challenge of detecting novelties and learning new items without +supervision needs to be addressed. We address this challenge with our new +prototype-based approach called Continually Learning Prototypes (CLP). In +addition to being capable of FS-OCL learning, CLP also detects novel objects +and learns them without supervision. To mitigate forgetting, CLP utilizes a +novel metaplasticity mechanism that adapts the learning rate individually per +prototype. CLP is rehearsal-free, hence does not require a memory buffer, and +is compatible with neuromorphic hardware, characterized by ultra-low power +consumption, real-time processing abilities, and on-chip learning. Indeed, we +have open-sourced a simple version of CLP in the neuromorphic software +framework Lava, targeting Intel's neuromorphic chip Loihi 2. We evaluate CLP +on a robotic vision dataset, OpenLORIS. In a low-instance FS-OCL scenario, CLP +shows state-of-the-art results. In the open world, CLP detects novelties with +superior precision and recall and learns features of the detected novel classes +without supervision, achieving a strong baseline of 99% base class and 65%/76% +(5-shot/10-shot) novel class accuracy. + +
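A toy sketch of the prototype logic summarized above: nearest-prototype assignment, unsupervised novelty detection via a distance threshold, and a per-prototype learning rate that decays with the prototype's update count as a stand-in for the metaplasticity mechanism. CLP's actual rule and its Lava/Loihi implementation are defined in the paper and its open-source release.

```python
import numpy as np

class PrototypeMemory:
    def __init__(self, dim, novelty_threshold=2.0):
        self.protos = np.empty((0, dim))
        self.counts = np.empty((0,), dtype=int)
        self.threshold = novelty_threshold

    def observe(self, feature):
        """Assign the feature to its nearest prototype, or spawn a new one if novel."""
        if len(self.protos) > 0:
            d = np.linalg.norm(self.protos - feature, axis=1)
            j = int(np.argmin(d))
            if d[j] < self.threshold:
                lr = 1.0 / (1 + self.counts[j])          # per-prototype, count-based learning rate
                self.protos[j] += lr * (feature - self.protos[j])
                self.counts[j] += 1
                return j, False
        self.protos = np.vstack([self.protos, feature])  # novelty: allocate a new prototype
        self.counts = np.append(self.counts, 1)
        return len(self.protos) - 1, True

mem = PrototypeMemory(dim=8)
rng = np.random.default_rng(1)
for f in rng.standard_normal((20, 8)) * 0.1:             # a tight cluster -> mostly one prototype
    mem.observe(f)
print(len(mem.protos))
```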
+
+ comment: Submitted to IEEE/RSJ International Conference on Intelligent Robots + and Systems (IROS) +
+
+
+
+
+ + ☆ Orchestrate Latent Expertise: Advancing Online Continual Learning with + Multi-Level Supervision and Reverse Self-Distillation CVPR 2024 + + +
+ To accommodate real-world dynamics, artificial intelligence systems need to +cope with sequentially arriving content in an online manner. Beyond regular +Continual Learning (CL) attempting to address catastrophic forgetting with +offline training of each task, Online Continual Learning (OCL) is a more +challenging yet realistic setting that performs CL in a one-pass data stream. +Current OCL methods primarily rely on memory replay of old training samples. +However, a notable gap from CL to OCL stems from the additional +overfitting-underfitting dilemma associated with the use of rehearsal buffers: +the inadequate learning of new training samples (underfitting) and the repeated +learning of a few old training samples (overfitting). To this end, we introduce +a novel approach, Multi-level Online Sequential Experts (MOSE), which +cultivates the model as stacked sub-experts, integrating multi-level +supervision and reverse self-distillation. Supervision signals across multiple +stages facilitate appropriate convergence of the new task while gathering +various strengths from experts by knowledge distillation mitigates the +performance decline of old tasks. MOSE demonstrates remarkable efficacy in +learning new samples and preserving past knowledge through multi-level experts, +thereby significantly advancing OCL performance over state-of-the-art baselines +(e.g., up to 7.3% on Split CIFAR-100 and 6.1% on Split Tiny-ImageNet). + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ SVGCraft: Beyond Single Object Text-to-SVG Synthesis with Comprehensive + Canvas Layout + + +
+ Generating VectorArt from text prompts is a challenging vision task, +requiring diverse yet realistic depictions of the seen as well as unseen +entities. However, existing research has been mostly limited to the generation +of single objects, rather than comprehensive scenes comprising multiple +elements. In response, this work introduces SVGCraft, a novel end-to-end +framework for the creation of vector graphics depicting entire scenes from +textual descriptions. Utilizing a pre-trained LLM for layout generation from +text prompts, this framework introduces a technique for producing masked +latents in specified bounding boxes for accurate object placement. It +introduces a fusion mechanism for integrating attention maps and employs a +diffusion U-Net for coherent composition, speeding up the drawing process. The +resulting SVG is optimized using a pre-trained encoder and LPIPS loss with +opacity modulation to maximize similarity. Additionally, this work explores the +potential of primitive shapes in facilitating canvas completion in constrained +environments. Through both qualitative and quantitative assessments, SVGCraft +is demonstrated to surpass prior works in abstraction, recognizability, and +detail, as evidenced by its performance metrics (CLIP-T: 0.4563, Cosine +Similarity: 0.6342, Confusion: 0.66, Aesthetic: 6.7832). The code will be +available at https://github.com/ayanban011/SVGCraft. + +
+
+
+
+
+ + ☆ 3DGSR: Implicit Surface Reconstruction with 3D Gaussian Splatting + + +
+ In this paper, we present an implicit surface reconstruction method with 3D +Gaussian Splatting (3DGS), namely 3DGSR, that allows for accurate 3D +reconstruction with intricate details while inheriting the high efficiency and +rendering quality of 3DGS. The key insight is incorporating an implicit signed +distance field (SDF) within 3D Gaussians to enable them to be aligned and +jointly optimized. First, we introduce a differentiable SDF-to-opacity +transformation function that converts SDF values into corresponding Gaussians' +opacities. This function connects the SDF and 3D Gaussians, allowing for +unified optimization and enforcing surface constraints on the 3D Gaussians. +During learning, optimizing the 3D Gaussians provides supervisory signals for +SDF learning, enabling the reconstruction of intricate details. However, this +only provides sparse supervisory signals to the SDF at locations occupied by +Gaussians, which is insufficient for learning a continuous SDF. Then, to +address this limitation, we incorporate volumetric rendering and align the +rendered geometric attributes (depth, normal) with those derived from 3D +Gaussians. This consistency regularization introduces supervisory signals to +locations not covered by discrete 3D Gaussians, effectively eliminating +redundant surfaces outside the Gaussian sampling range. Our extensive +experimental results demonstrate that our 3DGSR method enables high-quality 3D +surface reconstruction while preserving the efficiency and rendering quality of +3DGS. Besides, our method competes favorably with leading surface +reconstruction techniques while offering a more efficient learning process and +much better rendering qualities. The code will be available at +https://github.com/CVMI-Lab/3DGSR. + +
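One simple differentiable SDF-to-opacity transformation of the kind described above, shown as a Laplace-style bell around the zero level set: opacity peaks where the SDF crosses zero and decays away from the surface, so rendering gradients can supervise the SDF. This particular form is only an illustration; the paper defines its own transformation.

```python
import torch

def sdf_to_opacity(sdf, beta=0.05):
    """Map signed distances to opacities in (0, 1]; differentiable, so gradients
    from rendering losses can flow back into the SDF (illustrative form)."""
    return torch.exp(-sdf.abs() / beta)

sdf = torch.tensor([-0.2, -0.01, 0.0, 0.01, 0.2], requires_grad=True)
alpha = sdf_to_opacity(sdf)
alpha.sum().backward()          # gradients reach the SDF values
print(alpha.detach(), sdf.grad)
```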
+
+
+
+
+ + ☆ Constrained Layout Generation with Factor Graphs CVPR 2024 + + +
+ This paper addresses the challenge of object-centric layout generation under +spatial constraints, seen in multiple domains including floorplan design +process. The design process typically involves specifying a set of spatial +constraints that include object attributes like size and inter-object relations +such as relative positioning. Existing works, which typically represent objects +as single nodes, lack the granularity to accurately model complex interactions +between objects. For instance, often only certain parts of an object, like a +room's right wall, interact with adjacent objects. To address this gap, we +introduce a factor graph based approach with four latent variable nodes for +each room, and a factor node for each constraint. The factor nodes represent +dependencies among the variables to which they are connected, effectively +capturing constraints that are potentially of a higher order. We then develop +message-passing on the bipartite graph, forming a factor graph neural network +that is trained to produce a floorplan that aligns with the desired +requirements. Our approach is simple and generates layouts faithful to the user +requirements, demonstrated by a large improvement in IOU scores over existing +methods. Additionally, our approach, being inferential and accurate, is +well-suited to the practical human-in-the-loop design process where +specifications evolve iteratively, offering a practical and powerful tool for +AI-guided design. + +
+
+ comment: To be published at IEEE/CVF CVPR 2024 +
+
+
+
+
+ + ☆ TTD: Text-Tag Self-Distillation Enhancing Image-Text Alignment in CLIP + to Alleviate Single Tag Bias + + +
+ We identify a critical bias in contemporary CLIP-based models, which we +denote as \textit{single tag bias}. This bias manifests as a disproportionate +focus on a singular tag (word) while neglecting other pertinent tags, stemming +from CLIP's text embeddings that prioritize one specific tag in image-text +relationships. When deconstructing text into individual tags, only one tag +tends to have high relevancy with CLIP's image embedding, leading to an +imbalanced tag relevancy. This results in an uneven alignment among multiple +tags present in the text. To tackle this challenge, we introduce a novel +two-step fine-tuning approach. First, our method leverages the similarity +between tags and their nearest pixels for scoring, enabling the extraction of +image-relevant tags from the text. Second, we present a self-distillation +strategy aimed at aligning the combined masks from extracted tags with the +text-derived mask. This approach mitigates the single tag bias, thereby +significantly improving the alignment of CLIP's model without necessitating +additional data or supervision. Our technique demonstrates model-agnostic +improvements in multi-tag classification and segmentation tasks, surpassing +competing methods that rely on external resources. Code is available at +https://github.com/shjo-april/TTD. + +
+
+
+
+
+ + ☆ DHR: Dual Features-Driven Hierarchical Rebalancing in Inter- and + Intra-Class Regions for Weakly-Supervised Semantic Segmentation + + +
+ Weakly-supervised semantic segmentation (WSS) ensures high-quality +segmentation with limited data and excels when employed as input seed masks for +large-scale vision models such as Segment Anything. However, WSS faces +challenges related to minor classes since those are overlooked in images with +adjacent multiple classes, a limitation originating from the overfitting of +traditional expansion methods like Random Walk. We first address this by +employing unsupervised and weakly-supervised feature maps instead of +conventional methodologies, allowing for hierarchical mask enhancement. This +method distinctly categorizes higher-level classes and subsequently separates +their associated lower-level classes, ensuring all classes are correctly +restored in the mask without losing minor ones. Our approach, validated through +extensive experimentation, significantly improves WSS across five benchmarks +(VOC: 79.8\%, COCO: 53.9\%, Context: 49.0\%, ADE: 32.9\%, Stuff: 37.4\%), +reducing the gap with fully supervised methods by over 84\% on the VOC +validation set. Code is available at https://github.com/shjo-april/DHR. + +
+
+
+
+
+ + ☆ The Devil is in the Edges: Monocular Depth Estimation with Edge-aware + Consistency Fusion + + +
+ This paper presents a novel monocular depth estimation method, named ECFNet, +for estimating high-quality monocular depth with clear edges and valid overall +structure from a single RGB image. We make a thorough inquiry about the key +factor that affects the edge depth estimation of the MDE networks, and come to +a ratiocination that the edge information itself plays a critical role in +predicting depth details. Driven by this analysis, we propose to explicitly +employ the image edges as input for ECFNet and fuse the initial depths from +different sources to produce the final depth. Specifically, ECFNet first uses a +hybrid edge detection strategy to get the edge map and edge-highlighted image +from the input image, and then leverages a pre-trained MDE network to infer the +initial depths of the aforementioned three images. After that, ECFNet utilizes +a layered fusion module (LFM) to fuse the initial depth, which will be further +updated by a depth consistency module (DCM) to form the final estimation. +Extensive experimental results on public datasets and ablation studies indicate +that our method achieves state-of-the-art performance. Project page: +https://zrealli.github.io/edgedepth. + +
+
+ comment: 17 pages, 19 figures +
+
+
+
+
+ + ☆ Towards Variable and Coordinated Holistic Co-Speech Motion Generation CVPR 2024 + + +
+ This paper addresses the problem of generating lifelike holistic co-speech +motions for 3D avatars, focusing on two key aspects: variability and +coordination. Variability allows the avatar to exhibit a wide range of motions +even with similar speech content, while coordination ensures a harmonious +alignment among facial expressions, hand gestures, and body poses. We aim to +achieve both with ProbTalk, a unified probabilistic framework designed to +jointly model facial, hand, and body movements in speech. ProbTalk builds on +the variational autoencoder (VAE) architecture and incorporates three core +designs. First, we introduce product quantization (PQ) to the VAE, which +enriches the representation of complex holistic motion. Second, we devise a +novel non-autoregressive model that embeds 2D positional encoding into the +product-quantized representation, thereby preserving essential structure +information of the PQ codes. Last, we employ a secondary stage to refine the +preliminary prediction, further sharpening the high-frequency details. Coupling +these three designs enables ProbTalk to generate natural and diverse holistic +co-speech motions, outperforming several state-of-the-art methods in +qualitative and quantitative evaluations, particularly in terms of realism. Our +code and model will be released for research purposes at +https://feifeifeiliu.github.io/probtalk/. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Efficient Multi-branch Segmentation Network for Situation Awareness in + Autonomous Navigation + + +
+ Real-time and high-precision situational awareness technology is critical for +autonomous navigation of unmanned surface vehicles (USVs). In particular, +robust and fast obstacle semantic segmentation methods are essential. However, +distinguishing between the sea and the sky is challenging due to the +differences between port and maritime environments. In this study, we built a +dataset that captured perspectives from USVs and unmanned aerial vehicles in a +maritime port environment and analysed the data features. Statistical analysis +revealed a high correlation between the distribution of the sea and sky and row +positional information. Based on this finding, a three-branch semantic +segmentation network with a row position encoding module (RPEM) was proposed to +improve the prediction accuracy between the sea and the sky. The proposed RPEM +highlights the effect of row coordinates on feature extraction. Compared to the +baseline, the three-branch network with RPEM significantly improved the ability +to distinguish between the sea and the sky without significantly reducing the +computational speed. + +
+
+
+
+
+ + ☆ STBA: Towards Evaluating the Robustness of DNNs for Query-Limited + Black-box Scenario + + +
+ Many attack techniques have been proposed to explore the vulnerability of +DNNs and further help to improve their robustness. Despite the significant +progress made recently, existing black-box attack methods still suffer from +unsatisfactory performance due to the vast number of queries needed to optimize +desired perturbations. Besides, the other critical challenge is that +adversarial examples built in a noise-adding manner are abnormal and struggle +to successfully attack robust models, whose robustness is enhanced by +adversarial training against small perturbations. There is no doubt that these +two issues mentioned above will significantly increase the risk of exposure and +result in a failure to dig deeply into the vulnerability of DNNs. Hence, it is +necessary to evaluate DNNs' fragility sufficiently under query-limited settings +in a non-additional way. In this paper, we propose the Spatial Transform +Black-box Attack (STBA), a novel framework to craft formidable adversarial +examples in the query-limited scenario. Specifically, STBA introduces a flow +field to the high-frequency part of clean images to generate adversarial +examples and adopts the following two processes to enhance their naturalness +and significantly improve the query efficiency: a) we apply an estimated flow +field to the high-frequency part of clean images to generate adversarial +examples instead of introducing external noise to the benign image, and b) we +leverage an efficient gradient estimation method based on a batch of samples to +optimize such an ideal flow field under query-limited settings. Compared to +existing score-based black-box baselines, extensive experiments indicated that +STBA could effectively improve the imperceptibility of the adversarial examples +and remarkably boost the attack success rate under query-limited settings. + +
+
+
+
+
+ + ☆ Reusable Architecture Growth for Continual Stereo Matching CVPR 2022 + + +
+ The remarkable performance of recent stereo depth estimation models benefits +from the successful use of convolutional neural networks to regress dense +disparity. Akin to most tasks, this needs gathering training data that covers a +number of heterogeneous scenes at deployment time. However, training samples +are typically acquired continuously in practical applications, making the +capability to learn new scenes continually even more crucial. For this purpose, +we propose to perform continual stereo matching where a model is tasked to 1) +continually learn new scenes, 2) overcome forgetting previously learned scenes, +and 3) continuously predict disparities at inference. We achieve this goal by +introducing a Reusable Architecture Growth (RAG) framework. RAG leverages +task-specific neural unit search and architecture growth to learn new scenes +continually in both supervised and self-supervised manners. It can maintain +high reusability during growth by reusing previous units while obtaining good +performance. Additionally, we present a Scene Router module to adaptively +select the scene-specific architecture path at inference. Comprehensive +experiments on numerous datasets show that our framework performs impressively +in various weather, road, and city circumstances and surpasses the +state-of-the-art methods in more challenging cross-dataset settings. Further +experiments also demonstrate the adaptability of our method to unseen scenes, +which can facilitate end-to-end stereo architecture learning and practical +deployment. + +
+
+ comment: Extended version of CVPR 2022 paper "Continual Stereo Matching of + Continuous Driving Scenes with Growing Architecture" - Accepted to TPAMI in + 2024 +
+
+
+
+
+ + ☆ Spread Your Wings: A Radial Strip Transformer for Image Deblurring + + +
+ Exploring motion information is important for the motion deblurring task. +Recently, window-based transformer approaches have achieved decent performance +in image deblurring. However, the motion causing blurry results is usually +composed of translation and rotation movements, while the window-shift +operation of window-based transformers in the Cartesian coordinate system only +directly explores translation motion in orthogonal directions. Thus, these +methods are limited in modeling the rotation component. To alleviate this +problem, we introduce a polar coordinate-based transformer, which uses angles +and distances to explore rotation and translation information together. In this +paper, we propose a Radial Strip Transformer (RST), a transformer-based +architecture that restores blurred images in a polar coordinate system instead +of a Cartesian one. RST contains a dynamic radial embedding module (DRE) to +extract shallow features with a radial deformable convolution. We design a +polar mask layer to generate the offsets for the deformable convolution, which +can reshape the convolution kernel along the radius to better capture rotation +motion information. Furthermore, we propose a radial strip attention solver +(RSAS) for deep feature extraction, in which the relationships between windows +are organized by azimuth and radius. This attention module contains radial +strip windows to reweight image features in polar coordinates, preserving more +useful rotation and translation information for better recovery of sharp +images. Experimental results on six synthetic and real-world datasets show that +our method performs favorably against other SOTA methods for the image +deblurring task. + +
+
+
+
+
+ + ☆ Rethinking Attention-Based Multiple Instance Learning for Whole-Slide + Pathological Image Classification: An Instance Attribute Viewpoint + + +
+ Multiple instance learning (MIL) is a robust paradigm for whole-slide +pathological image (WSI) analysis, processing gigapixel-resolution images with +slide-level labels. As pioneering efforts, attention-based MIL (ABMIL) and its +variants are increasingly becoming popular due to the characteristics of +simultaneously handling clinical diagnosis and tumor localization. However, the +attention mechanism exhibits limitations in discriminating between instances, +which often misclassifies tissues and potentially impairs MIL performance. This +paper proposes an Attribute-Driven MIL (AttriMIL) framework to address these +issues. Concretely, we dissect the calculation process of ABMIL and present an +attribute scoring mechanism that measures the contribution of each instance to +bag prediction effectively, quantifying instance attributes. Based on attribute +quantification, we develop a spatial attribute constraint and an attribute +ranking constraint to model instance correlations within and across slides, +respectively. These constraints encourage the network to capture the spatial +correlation and semantic similarity of instances, improving the ability of +AttriMIL to distinguish tissue types and identify challenging instances. +Additionally, AttriMIL employs a histopathology adaptive backbone that +maximizes the pre-trained model's feature extraction capability for collecting +pathological features. Extensive experiments on three public benchmarks +demonstrate that our AttriMIL outperforms existing state-of-the-art frameworks +across multiple evaluation metrics. The implementation code is available at +https://github.com/MedCAI/AttriMIL. + +
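For context, a minimal gated-attention MIL pooling block in the style of ABMIL, which the abstract builds on; the per-instance attention weights it produces are the quantities that AttriMIL revisits with attribute scoring and the spatial/ranking constraints (those additions are not sketched here).

```python
import torch
import torch.nn as nn

class GatedAttentionMIL(nn.Module):
    """ABMIL-style pooling: per-instance attention weights -> bag embedding -> slide logits."""
    def __init__(self, feat_dim=512, hidden=128, n_classes=2):
        super().__init__()
        self.V = nn.Linear(feat_dim, hidden)
        self.U = nn.Linear(feat_dim, hidden)
        self.w = nn.Linear(hidden, 1)
        self.classifier = nn.Linear(feat_dim, n_classes)

    def forward(self, instances):                      # (N, feat_dim), N patches of one slide
        a = self.w(torch.tanh(self.V(instances)) * torch.sigmoid(self.U(instances)))  # (N, 1)
        a = torch.softmax(a, dim=0)
        bag = (a * instances).sum(dim=0)               # attention-weighted bag representation
        return self.classifier(bag), a.squeeze(-1)     # slide logits, instance attention scores

mil = GatedAttentionMIL()
logits, attn = mil(torch.randn(1000, 512))             # e.g., 1000 patch features from one WSI
print(logits.shape, attn.shape)                        # torch.Size([2]) torch.Size([1000])
```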
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ SGDFormer: One-stage Transformer-based Architecture for Cross-Spectral + Stereo Image Guided Denoising + + +
+ Cross-spectral image guided denoising has shown its great potential in +recovering clean images with rich details, such as using the near-infrared +image to guide the denoising process of the visible one. To obtain such image +pairs, a feasible and economical way is to employ a stereo system, which is +widely used on mobile devices. Current works attempt to generate an aligned +guidance image to handle the disparity between two images. However, due to +occlusion, spectral differences and noise degradation, the aligned guidance +image generally exists ghosting and artifacts, leading to an unsatisfactory +denoised result. To address this issue, we propose a one-stage +transformer-based architecture, named SGDFormer, for cross-spectral Stereo +image Guided Denoising. The architecture integrates the correspondence modeling +and feature fusion of stereo images into a unified network. Our transformer +block contains a noise-robust cross-attention (NRCA) module and a spatially +variant feature fusion (SVFF) module. The NRCA module captures the long-range +correspondence of two images in a coarse-to-fine manner to alleviate the +interference of noise. The SVFF module further enhances salient structures and +suppresses harmful artifacts through dynamically selecting useful information. +Thanks to the above design, our SGDFormer can restore artifact-free images with +fine structures, and achieves state-of-the-art performance on various datasets. +Additionally, our SGDFormer can be extended to handle other unaligned +cross-model guided restoration tasks such as guided depth super-resolution. + +
+
+
+
+
+ + ☆ MaGRITTe: Manipulative and Generative 3D Realization from Image, Topview + and Text + + +
+ The generation of 3D scenes from user-specified conditions offers a promising +avenue for alleviating the production burden in 3D applications. Previous +studies required significant effort to realize the desired scene, owing to +limited control conditions. We propose a method for controlling and generating +3D scenes under multimodal conditions using partial images, layout information +represented in the top view, and text prompts. Combining these conditions to +generate a 3D scene involves the following significant difficulties: (1) the +creation of large datasets, (2) reflection on the interaction of multimodal +conditions, and (3) domain dependence of the layout conditions. We decompose +the process of 3D scene generation into 2D image generation from the given +conditions and 3D scene generation from 2D images. 2D image generation is +achieved by fine-tuning a pretrained text-to-image model with a small +artificial dataset of partial images and layouts, and 3D scene generation is +achieved by layout-conditioned depth estimation and neural radiance fields +(NeRF), thereby avoiding the creation of large datasets. The use of a common +representation of spatial information using 360-degree images allows for the +consideration of multimodal condition interactions and reduces the domain +dependence of the layout control. The experimental results qualitatively and +quantitatively demonstrated that the proposed method can generate 3D scenes in +diverse domains, from indoor to outdoor, according to multimodal conditions. + +
+
+ comment: Project Page: https://hara012.github.io/MaGRITTe-project +
+
+
+
+
+ + ☆ Learning Trimaps via Clicks for Image Matting + + +
+ Despite significant advancements in image matting, existing models heavily +depend on manually-drawn trimaps for accurate results in natural image +scenarios. However, the process of obtaining trimaps is time-consuming, lacking +user-friendliness and device compatibility. This reliance greatly limits the +practical application of all trimap-based matting methods. To address this +issue, we introduce Click2Trimap, an interactive model capable of predicting +high-quality trimaps and alpha mattes with minimal user click inputs. Through +analyzing real users' behavioral logic and characteristics of trimaps, we +successfully propose a powerful iterative three-class training strategy and a +dedicated simulation function, making Click2Trimap exhibit versatility across +various scenarios. Quantitative and qualitative assessments on synthetic and +real-world matting datasets demonstrate Click2Trimap's superior performance +compared to all existing trimap-free matting methods. Especially, in the user +study, Click2Trimap achieves high-quality trimap and matting predictions in +just an average of 5 seconds per image, demonstrating its substantial practical +value in real-world applications. + +
+
+
+
+
+ + ☆ Memory-Scalable and Simplified Functional Map Learning + + +
+ Deep functional maps have emerged in recent years as a prominent +learning-based framework for non-rigid shape matching problems. While early +methods in this domain only focused on learning in the functional domain, the +latest techniques have demonstrated that promoting consistency between +functional and pointwise maps leads to significant improvements in accuracy. +Unfortunately, existing approaches rely heavily on the computation of large +dense matrices arising from soft pointwise maps, which compromises their +efficiency and scalability. To address this limitation, we introduce a novel +memory-scalable and efficient functional map learning pipeline. By leveraging +the specific structure of functional maps, we make it possible to achieve +identical results without ever storing the pointwise map in memory. +Furthermore, based on the same approach, we present a differentiable map +refinement layer adapted from an existing axiomatic refinement algorithm. +Unlike many functional map learning methods, which use this algorithm as a +post-processing step, ours can easily be used at train time, enabling us to +enforce consistency between the refined and initial versions of the map. Our +resulting approach is simpler, more efficient, and more numerically stable, by +avoiding differentiation through a linear system, while achieving close to +state-of-the-art results in challenging scenarios. + +
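The memory trick described above can be illustrated generically: instead of materializing the full n2 x n1 soft pointwise map P and then forming the functional map from it, one can accumulate the product P @ Phi1 over row blocks so that only one block of P exists at a time. A NumPy sketch under simplified assumptions (softmax similarity map, identity mass matrices); the paper's pipeline and refinement layer are more involved.

```python
import numpy as np

def functional_map_blockwise(feat1, feat2, Phi1, Phi2, block=512, tau=0.07):
    """Compute C = pinv(Phi2) @ (P @ Phi1) without ever storing the full soft map P,
    where P[i, j] = softmax_j(<feat2[i], feat1[j]> / tau). Sketch, not the paper's code."""
    n2, k1 = feat2.shape[0], Phi1.shape[1]
    P_Phi1 = np.zeros((n2, k1))
    for s in range(0, n2, block):
        sim = feat2[s:s + block] @ feat1.T / tau        # (b, n1) block of similarities
        sim -= sim.max(axis=1, keepdims=True)           # numerically stable row-wise softmax
        P_block = np.exp(sim)
        P_block /= P_block.sum(axis=1, keepdims=True)
        P_Phi1[s:s + block] = P_block @ Phi1            # only one (b, n1) block in memory
    return np.linalg.pinv(Phi2) @ P_Phi1                # (k2, k1) functional map

rng = np.random.default_rng(0)
n1, n2, k, d = 3000, 2500, 30, 64
C = functional_map_blockwise(rng.standard_normal((n1, d)), rng.standard_normal((n2, d)),
                             rng.standard_normal((n1, k)), rng.standard_normal((n2, k)))
print(C.shape)   # (30, 30)
```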
+
+
+
+
+ + ☆ YNetr: Dual-Encoder architecture on Plain Scan Liver Tumors (PSLT) + + +
+ Background: Liver tumors are abnormal growths in the liver that can be either +benign or malignant, with liver cancer being a significant health concern +worldwide. However, there is no dataset for plain scan segmentation of liver +tumors, nor any related algorithms. To fill this gap, we propose Plain Scan +Liver Tumors (PSLT) and YNetr. Methods: A collection of 40 liver tumor plain +scan segmentation datasets was assembled and annotated. Concurrently, we +utilized the Dice coefficient as the metric for assessing the segmentation +outcomes produced by YNetr, which has the advantage of capturing different +frequency information. Results: The YNetr model achieved a Dice coefficient of +62.63% on the PSLT dataset, surpassing the other publicly available model by an +accuracy margin of 1.22%. Comparative evaluations were conducted against a +range of models including UNet 3+, XNet, UNetr, Swin UNetr, Trans-BTS, COTr, +nnUNetv2 (2D), nnUNetv2 (3D fullres), MedNext (2D) and MedNext (3D fullres). +Conclusions: We not only propose a dataset named PSLT (Plain Scan Liver +Tumors), but also explore an architecture called YNetr that utilizes the +wavelet transform to extract different frequency information and achieves +state-of-the-art results on PSLT in our experiments. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ CLIP-driven Outliers Synthesis for few-shot OOD detection + + +
+ Few-shot OOD detection focuses on recognizing out-of-distribution (OOD) images that belong to classes unseen during training, using only a small number of labeled in-distribution (ID) images. To date, the mainstream strategy has been based on large-scale vision-language models such as CLIP. However, these methods overlook a crucial issue: the lack of reliable OOD supervision information, which can lead to biased boundaries between ID and OOD. To tackle this problem, we propose CLIP-driven Outliers Synthesis (CLIP-OS). First, CLIP-OS enhances the perception of patch-level features with a newly proposed patch uniform convolution, and adaptively obtains the proportion of ID-relevant information by employing CLIP-surgery-discrepancy, thus separating ID-relevant from ID-irrelevant information. Next, CLIP-OS synthesizes reliable OOD data by mixing up ID-relevant features from different classes to provide OOD supervision information. Afterward, CLIP-OS leverages the synthetic OOD samples through unknown-aware prompt learning to enhance the separability of ID and OOD. Extensive experiments across multiple benchmarks demonstrate that CLIP-OS achieves superior few-shot OOD detection capability.
+
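+
+ The OOD-synthesis step (mixing ID-relevant features from different classes) could look roughly like the sketch below. This is a simplified interpretation with invented tensor names and a Beta mixing coefficient; it is not the CLIP-OS implementation.
+
+ import torch
+ import torch.nn.functional as F
+
+ def synthesize_ood(id_feats, labels, alpha=0.5):
+     """Mix ID-relevant features from *different* classes into surrogate OOD features."""
+     perm = torch.randperm(id_feats.size(0))
+     keep = labels != labels[perm]                    # only cross-class pairs
+     n = int(keep.sum())
+     lam = torch.distributions.Beta(alpha, alpha).sample((n,)).to(id_feats)
+     mixed = lam.unsqueeze(1) * id_feats[keep] + (1 - lam).unsqueeze(1) * id_feats[perm][keep]
+     return F.normalize(mixed, dim=-1)                # keep features on the unit sphere
+
+ feats = F.normalize(torch.randn(16, 512), dim=-1)    # stand-in for ID-relevant CLIP features
+ labels = torch.randint(0, 4, (16,))
+ print(synthesize_ood(feats, labels).shape)
+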
+
+ comment: 9 pages, 5 figures
+
+
+
+
+
+ + ☆ Instrument-tissue Interaction Detection Framework for Surgical Video + Understanding + + +
+ Instrument-tissue interaction detection, which helps in understanding surgical activities, is vital for building computer-assisted surgery systems, but it comes with many challenges. Firstly, most models represent instrument-tissue interaction in a coarse-grained way that only focuses on classification and lacks the ability to automatically detect instruments and tissues. Secondly, existing works do not fully consider intra- and inter-frame relations between instruments and tissues. In this paper, we propose to represent instrument-tissue interaction as a quintuple and present an Instrument-Tissue Interaction Detection Network (ITIDNet) to detect the quintuple for surgical video understanding. Specifically, we propose a Snippet Consecutive Feature (SCF) Layer to enhance features by modeling relationships of proposals in the current frame using global context information in the video snippet. We also propose a Spatial Corresponding Attention (SCA) Layer to incorporate features of proposals between adjacent frames through spatial encoding. To reason about relationships between instruments and tissues, a Temporal Graph (TG) Layer is proposed, with intra-frame connections to exploit relationships between instruments and tissues in the same frame and inter-frame connections to model the temporal information for the same instance. For evaluation, we build a cataract surgery video (PhacoQ) dataset and a cholecystectomy surgery video (CholecQ) dataset. Experimental results demonstrate the promising performance of our model, which outperforms other state-of-the-art models on both datasets.
+
+
+
+
+
+ + ☆ Exploring Unseen Environments with Robots using Large Language and + Vision Models through a Procedurally Generated 3D Scene Representation + + +
+ Recent advancements in Generative Artificial Intelligence, particularly in the realm of Large Language Models (LLMs) and Large Vision Language Models (LVLMs), have enabled the prospect of leveraging cognitive planners within robotic systems. This work focuses on solving the object goal navigation problem by mimicking human cognition to attend to, perceive, and store task-specific information, and to generate plans using that information. We introduce a comprehensive framework capable of exploring an unfamiliar environment in search of an object by leveraging the capabilities of Large Language Models (LLMs) and Large Vision Language Models (LVLMs) in understanding the underlying semantics of our world. A challenging task in using LLMs to generate high-level sub-goals is efficiently representing the environment around the robot. We propose to use a modular 3D scene representation, with semantically rich descriptions of the objects, to provide the LLM with task-relevant information. However, providing the LLM with a mass of contextual information (the rich 3D scene semantic representation) can lead to redundant and inefficient plans. We therefore propose an LLM-based pruner that leverages the capabilities of in-context learning to prune out goal-irrelevant information.
+
+
+
+
+
+ + ☆ Harmonizing Light and Darkness: A Symphony of Prior-guided Data + Synthesis and Adaptive Focus for Nighttime Flare Removal + + +
+ Intense light sources often produce flares in captured images at night, which +deteriorates the visual quality and negatively affects downstream applications. +In order to train an effective flare removal network, a reliable dataset is +essential. The mainstream flare removal datasets are semi-synthetic to reduce +human labour, but these datasets do not cover typical scenarios involving +multiple scattering flares. To tackle this issue, we synthesize a prior-guided +dataset named Flare7K*, which contains multi-flare images where the brightness +of flares adheres to the laws of illumination. Besides, flares tend to occupy +localized regions of the image but existing networks perform flare removal on +the entire image and sometimes modify clean areas incorrectly. Therefore, we +propose a plug-and-play Adaptive Focus Module (AFM) that can adaptively mask +the clean background areas and assist models in focusing on the regions +severely affected by flares. Extensive experiments demonstrate that our data +synthesis method can better simulate real-world scenes and several models +equipped with AFM achieve state-of-the-art performance on the real-world test +dataset. + +
+
+
+
+
+ + ☆ Bayesian Exploration of Pre-trained Models for Low-shot Image + Classification + + +
+ Low-shot image classification is a fundamental task in computer vision, and +the emergence of large-scale vision-language models such as CLIP has greatly +advanced the forefront of research in this field. However, most existing +CLIP-based methods lack the flexibility to effectively incorporate other +pre-trained models that encompass knowledge distinct from CLIP. To bridge the +gap, this work proposes a simple and effective probabilistic model ensemble +framework based on Gaussian processes, which have previously demonstrated +remarkable efficacy in processing small data. We achieve the integration of +prior knowledge by specifying the mean function with CLIP and the kernel +function with an ensemble of deep kernels built upon various pre-trained +models. By regressing the classification label directly, our framework enables +analytical inference, straightforward uncertainty quantification, and +principled hyper-parameter tuning. Through extensive experiments on standard +benchmarks, we demonstrate that our method consistently outperforms competitive +ensemble baselines regarding predictive performance. Additionally, we assess +the robustness of our method and the quality of the yielded uncertainty +estimates on out-of-distribution datasets. We also illustrate that our method, +despite relying on label regression, still enjoys superior model calibration +compared to most deterministic baselines. + +
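+
+ A toy sketch of the core modeling idea, GP regression on one-hot labels whose prior mean plays the role of a zero-shot CLIP classifier; the kernel here is a plain RBF on random features rather than the paper's ensemble of deep kernels, and every name below is an illustrative assumption.
+
+ import numpy as np
+
+ def gp_posterior_mean(X_tr, Y_tr, X_te, mean_fn, kernel_fn, noise=0.1):
+     """Closed-form GP regression with a non-zero prior mean m(x):
+     f* = m(X_te) + K_*^T (K + sigma^2 I)^{-1} (Y_tr - m(X_tr))."""
+     K = kernel_fn(X_tr, X_tr) + noise ** 2 * np.eye(len(X_tr))
+     K_star = kernel_fn(X_tr, X_te)
+     residual = Y_tr - mean_fn(X_tr)
+     return mean_fn(X_te) + K_star.T @ np.linalg.solve(K, residual)
+
+ rbf = lambda A, B: np.exp(-0.5 * ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1))
+ W = np.random.randn(16, 5)                        # stand-in for CLIP zero-shot weights
+ clip_mean = lambda X: X @ W                       # prior mean: zero-shot logits
+ X_tr, X_te = np.random.randn(20, 16), np.random.randn(4, 16)
+ Y_tr = np.eye(5)[np.random.randint(0, 5, 20)]     # one-hot labels regressed directly
+ print(gp_posterior_mean(X_tr, Y_tr, X_te, clip_mean, rbf).shape)   # (4, 5) class scores
+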
+
+
+
+
+ + ☆ ST-LLM: Large Language Models Are Effective Temporal Learners + + +
+ Large Language Models (LLMs) have showcased impressive capabilities in text comprehension and generation, prompting research efforts towards video LLMs to facilitate human-AI interaction at the video level. However, how to effectively encode and understand videos in video-based dialogue systems remains to be solved. In this paper, we investigate a straightforward yet unexplored question: Can we feed all spatial-temporal tokens into the LLM, thus delegating the task of video sequence modeling to the LLMs? Surprisingly, this simple approach yields significant improvements in video understanding. Based upon this, we propose ST-LLM, an effective video-LLM baseline with spatial-temporal sequence modeling inside the LLM. Furthermore, to address the overhead and stability issues introduced by uncompressed video tokens within LLMs, we develop a dynamic masking strategy with tailor-made training objectives. For particularly long videos, we have also designed a global-local input module to balance efficiency and effectiveness. Consequently, we harness the LLM for proficient spatial-temporal modeling while upholding efficiency and stability. Extensive experimental results attest to the effectiveness of our method. Through a more concise model and training pipeline, ST-LLM establishes a new state-of-the-art result on VideoChatGPT-Bench and MVBench. Code is available at https://github.com/TencentARC/ST-LLM.
+
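+
+ A rough sketch of feeding all spatial-temporal tokens into the LLM with a dynamic masking ratio. The tensor layout, ratio range, and function names are our own illustrative choices, not ST-LLM's released code.
+
+ import torch
+
+ def mask_video_tokens(video_tokens, mask_ratio_range=(0.3, 0.7)):
+     """video_tokens: (B, T*P, D) visual tokens. Randomly drop a sampled fraction
+     before the LLM to cut overhead and regularize spatial-temporal modeling."""
+     B, N, D = video_tokens.shape
+     ratio = torch.empty(1).uniform_(*mask_ratio_range).item()
+     keep = max(1, int(N * (1 - ratio)))
+     idx = torch.rand(B, N, device=video_tokens.device).argsort(dim=1)[:, :keep]
+     return torch.gather(video_tokens, 1, idx.unsqueeze(-1).expand(-1, -1, D))
+
+ tokens = torch.randn(2, 8 * 256, 768)      # 8 frames x 256 patches per frame
+ kept = mask_video_tokens(tokens)
+ print(kept.shape)                           # kept tokens would then be projected and
+                                             # concatenated with the text embeddings
+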
+
+
+
+
+ + ☆ Monocular Identity-Conditioned Facial Reflectance Reconstruction CVPR 2024 + + +
+ Recent 3D face reconstruction methods have made remarkable advancements, yet +there remain huge challenges in monocular high-quality facial reflectance +reconstruction. Existing methods rely on a large amount of light-stage captured +data to learn facial reflectance models. However, the lack of subject diversity +poses challenges in achieving good generalization and widespread applicability. +In this paper, we learn the reflectance prior in image space rather than UV +space and present a framework named ID2Reflectance. Our framework can directly +estimate the reflectance maps of a single image while using limited reflectance +data for training. Our key insight is that reflectance data shares facial +structures with RGB faces, which enables obtaining expressive facial prior from +inexpensive RGB data thus reducing the dependency on reflectance data. We first +learn a high-quality prior for facial reflectance. Specifically, we pretrain +multi-domain facial feature codebooks and design a codebook fusion method to +align the reflectance and RGB domains. Then, we propose an identity-conditioned +swapping module that injects facial identity from the target image into the +pre-trained autoencoder to modify the identity of the source reflectance image. +Finally, we stitch multi-view swapped reflectance images to obtain renderable +assets. Extensive experiments demonstrate that our method exhibits excellent +generalization capability and achieves state-of-the-art facial reflectance +reconstruction results for in-the-wild faces. Our project page is +https://xingyuren.github.io/id2reflectance/. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ LAKE-RED: Camouflaged Images Generation by Latent Background Knowledge + Retrieval-Augmented Diffusion CVPR 2024 + + +
+ Camouflaged vision perception is an important vision task with numerous practical applications. Due to expensive collection and labeling costs, this community faces a major bottleneck: its datasets are limited to a small number of object species. Moreover, existing camouflaged generation methods require the background to be specified manually and thus fail to extend camouflaged sample diversity in a low-cost manner. In this paper, we propose Latent Background Knowledge Retrieval-Augmented Diffusion (LAKE-RED) for camouflaged image generation. To our knowledge, our contributions mainly include: (1) For the first time, we propose a camouflaged generation paradigm that does not need any background input. (2) Our LAKE-RED is the first knowledge retrieval-augmented method with interpretability for camouflaged generation, in which we propose to explicitly separate knowledge retrieval from reasoning enhancement in order to alleviate task-specific challenges. Moreover, our method is not restricted to specific foreground targets or backgrounds, offering the potential to extend camouflaged vision perception to more diverse domains. (3) Experimental results demonstrate that our method outperforms existing approaches, generating more realistic camouflage images.
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Seeing the Unseen: A Frequency Prompt Guided Transformer for Image + Restoration + + +
+ Exploring useful features from images as prompts to guide deep image restoration models is an effective way to approach image restoration. In contrast to mining spatial relations within images as prompts, which neglects the characteristics of different frequencies and can leave subtle or undetectable artifacts in the restored image, we develop a Frequency Prompting image restoration method, dubbed FPro, which can effectively provide prompt components from a frequency perspective to guide the restoration model in addressing these differences. Specifically, we first decompose input features into separate frequency parts via dynamically learned filters, where we introduce a gating mechanism for suppressing the less informative elements within the kernels. To propagate useful frequency information as prompts, we then propose a dual prompt block, consisting of a low-frequency prompt modulator (LPM) and a high-frequency prompt modulator (HPM), to handle signals from different bands respectively. Each modulator contains a generation process to incorporate prompting components into the extracted frequency maps, and a modulation part that modifies the prompt feature with the guidance of the decoder features. Experimental results on commonly used benchmarks demonstrate the favorable performance of our pipeline against SOTA methods on 5 image restoration tasks, including deraining, deraindrop, demoiréing, deblurring, and dehazing. The source code and pre-trained models will be available at https://github.com/joshyZhou/FPro.
+
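+
+ To make the frequency-prompting idea concrete, here is a simplified FFT-based low/high split of a feature map. FPro itself uses dynamically learned, gated filters, so this hard radial mask is only an illustrative stand-in.
+
+ import torch
+
+ def split_frequencies(feat, cutoff=0.25):
+     """feat: (B, C, H, W). Return (low, high) parts via a radial mask in Fourier space."""
+     B, C, H, W = feat.shape
+     spec = torch.fft.fftshift(torch.fft.fft2(feat), dim=(-2, -1))
+     yy, xx = torch.meshgrid(torch.linspace(-1, 1, H), torch.linspace(-1, 1, W), indexing="ij")
+     mask = ((yy ** 2 + xx ** 2).sqrt() <= cutoff).float().to(spec.device)
+     low = torch.fft.ifft2(torch.fft.ifftshift(spec * mask, dim=(-2, -1))).real
+     return low, feat - low    # low band feeds an LPM-like branch, the residual an HPM-like one
+
+ low, high = split_frequencies(torch.randn(1, 32, 64, 64))
+ print(low.shape, high.shape)
+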
+
+ comment: 18 pages, 10 figures
+
+
+
+
+
+ + ☆ Long-Tailed Recognition on Binary Networks by Calibrating A Pre-trained + Model + + +
+ Deploying deep models in real-world scenarios entails a number of challenges, including computational efficiency and real-world (e.g., long-tailed) data distributions. We address the combined challenge of learning long-tailed distributions using highly resource-efficient binary neural networks as backbones. Specifically, we propose a calibrate-and-distill framework in which off-the-shelf pretrained full-precision models trained on balanced datasets serve as teachers for distillation when learning binary networks on long-tailed datasets. To better generalize to various datasets, we further propose a novel adversarial balancing among the terms in the objective function and an efficient multiresolution learning scheme. We conducted the largest empirical study in the literature using 15 datasets, including newly derived long-tailed datasets from existing balanced datasets, and show that our proposed method outperforms prior art by large margins (>14.33% on average).
+
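+
+ The distillation part of calibrate-and-distill can be pictured with a standard soft-target knowledge-distillation loss from a frozen full-precision teacher to the binary student. This is a generic sketch under our own naming; the paper's full objective additionally includes adversarial balancing and multiresolution learning.
+
+ import torch
+ import torch.nn.functional as F
+
+ def distill_loss(student_logits, teacher_logits, T=4.0):
+     """Soft-target KD: KL(teacher_T || student_T), scaled by T^2."""
+     p_t = F.softmax(teacher_logits / T, dim=-1)
+     log_p_s = F.log_softmax(student_logits / T, dim=-1)
+     return F.kl_div(log_p_s, p_t, reduction="batchmean") * T * T
+
+ teacher_logits = torch.randn(8, 100)                        # frozen FP32 teacher
+ student_logits = torch.randn(8, 100, requires_grad=True)    # binary-backbone head
+ loss = distill_loss(student_logits, teacher_logits)
+ loss.backward()
+ print(float(loss))
+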
+
+
+
+
+ + ☆ Look-Around Before You Leap: High-Frequency Injected Transformer for + Image Restoration + + +
+ Transformer-based approaches have achieved superior performance in image +restoration, since they can model long-term dependencies well. However, the +limitation in capturing local information restricts their capacity to remove +degradations. While existing approaches attempt to mitigate this issue by +incorporating convolutional operations, the core component in Transformer, +i.e., self-attention, which serves as a low-pass filter, could unintentionally +dilute or even eliminate the acquired local patterns. In this paper, we propose +HIT, a simple yet effective High-frequency Injected Transformer for image +restoration. Specifically, we design a window-wise injection module (WIM), +which incorporates abundant high-frequency details into the feature map, to +provide reliable references for restoring high-quality images. We also develop +a bidirectional interaction module (BIM) to aggregate features at different +scales using a mutually reinforced paradigm, resulting in spatially and +contextually improved representations. In addition, we introduce a spatial +enhancement unit (SEU) to preserve essential spatial relationships that may be +lost due to the computations carried out across channel dimensions in the BIM. +Extensive experiments on 9 tasks (real noise, real rain streak, raindrop, +motion blur, moir\'e, shadow, snow, haze, and low-light condition) demonstrate +that HIT with linear computational complexity performs favorably against the +state-of-the-art methods. The source code and pre-trained models will be +available at https://github.com/joshyZhou/HIT. + +
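+
+ One way to picture high-frequency injection is to take a high-frequency residual of the degraded input (image minus a Gaussian-blurred copy) and add a 1x1-projected version of it to the transformer features. The blur kernel and the projection below are simplified assumptions, not HIT's actual WIM module.
+
+ import torch
+ import torch.nn.functional as F
+
+ def high_freq_residual(img, k=7, sigma=2.0):
+     """High frequencies = image - Gaussian-blurred image (depthwise blur)."""
+     coords = torch.arange(k, dtype=torch.float32) - k // 2
+     g = torch.exp(-(coords ** 2) / (2 * sigma ** 2)); g = g / g.sum()
+     kernel = (g[:, None] * g[None, :]).expand(img.shape[1], 1, k, k)
+     blurred = F.conv2d(img, kernel, padding=k // 2, groups=img.shape[1])
+     return img - blurred
+
+ img = torch.rand(1, 3, 128, 128)
+ feat = torch.randn(1, 48, 128, 128)                 # transformer feature map
+ inject = torch.nn.Conv2d(3, 48, kernel_size=1)      # toy stand-in for the injection step
+ print((feat + inject(high_freq_residual(img))).shape)
+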
+
+ comment: 19 pages, 7 figures +
+
+
+
+
+
+ ☆ HSIMamba: Hyperspectral Imaging Efficient Feature Learning with Bidirectional State Space for Classification
+
+
+
+ Classifying hyperspectral images is a difficult task in remote sensing, due to their complex high-dimensional data. To address this challenge, we propose HSIMamba, a novel framework that uses bidirectional reversed convolutional neural network pathways to extract spectral features more efficiently. Additionally, it incorporates a specialized block for spatial analysis. Our approach combines the operational efficiency of CNNs with the dynamic feature extraction capability of the attention mechanisms found in Transformers, while avoiding their high computational demands. HSIMamba is designed to process data bidirectionally, significantly enhancing the extraction of spectral features and integrating them with spatial information for comprehensive analysis. This approach improves classification accuracy beyond current benchmarks and addresses the computational inefficiencies encountered with advanced models like Transformers. HSIMamba was tested on three widely recognized datasets (Houston 2013, Indian Pines, and Pavia University) and demonstrated exceptional performance, surpassing existing state-of-the-art models in HSI classification. This highlights the methodological innovation of HSIMamba and its practical implications, which are particularly valuable in contexts where computational resources are limited. HSIMamba redefines the standards of efficiency and accuracy in HSI classification, thereby enhancing the capabilities of remote sensing applications. Hyperspectral imaging has become a crucial tool for environmental surveillance, agriculture, and other critical areas that require detailed analysis of the Earth's surface. Please see our HSIMamba code repository for more details.
+
+
+ comment: 11 pages, 2 figures, 8 tables +
+
+
+
+
+ + ☆ IPoD: Implicit Field Learning with Point Diffusion for Generalizable 3D + Object Reconstruction from Single RGB-D Images CVPR 2024 + + +
+ Generalizable 3D object reconstruction from single-view RGB-D images remains a challenging task, particularly with real-world data. Current state-of-the-art methods develop Transformer-based implicit field learning, necessitating an intensive learning paradigm that requires dense query supervision uniformly sampled throughout the entire space. We propose a novel approach, IPoD, which harmonizes implicit field learning with point diffusion. This approach treats the query points for implicit field learning as a noisy point cloud for iterative denoising, allowing for their dynamic adaptation to the target object shape. Such adaptive query points harness diffusion learning's capability for coarse shape recovery and also enhance the implicit representation's ability to delineate finer details. Besides, an additional self-conditioning mechanism is designed to use implicit predictions as the guidance of diffusion learning, leading to a cooperative system. Experiments conducted on the CO3D-v2 dataset affirm the superiority of IPoD, achieving 7.8% improvement in F-score and 28.6% in Chamfer distance over existing methods. The generalizability of IPoD is also demonstrated on the MVImgNet dataset. Our project page is at https://yushuang-wu.github.io/IPoD.
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Image-to-Image Matching via Foundation Models: A New Perspective for + Open-Vocabulary Semantic Segmentation CVPR2024 + + +
+ Open-vocabulary semantic segmentation (OVS) aims to segment images of +arbitrary categories specified by class labels or captions. However, most +previous best-performing methods, whether pixel grouping methods or region +recognition methods, suffer from false matches between image features and +category labels. We attribute this to the natural gap between the textual +features and visual features. In this work, we rethink how to mitigate false +matches from the perspective of image-to-image matching and propose a novel +relation-aware intra-modal matching (RIM) framework for OVS based on visual +foundation models. RIM achieves robust region classification by firstly +constructing diverse image-modal reference features and then matching them with +region features based on relation-aware ranking distribution. The proposed RIM +enjoys several merits. First, the intra-modal reference features are better +aligned, circumventing potential ambiguities that may arise in cross-modal +matching. Second, the ranking-based matching process harnesses the structure +information implicit in the inter-class relationships, making it more robust +than comparing individually. Extensive experiments on three benchmarks +demonstrate that RIM outperforms previous state-of-the-art methods by large +margins, obtaining a lead of more than 10% in mIoU on PASCAL VOC benchmark. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ Exploiting Self-Supervised Constraints in Image Super-Resolution ICME 2024 + + +
+ Recent advances in self-supervised learning, predominantly studied in +high-level visual tasks, have been explored in low-level image processing. This +paper introduces a novel self-supervised constraint for single image +super-resolution, termed SSC-SR. SSC-SR uniquely addresses the divergence in +image complexity by employing a dual asymmetric paradigm and a target model +updated via exponential moving average to enhance stability. The proposed +SSC-SR framework works as a plug-and-play paradigm and can be easily applied to +existing SR models. Empirical evaluations reveal that our SSC-SR framework +delivers substantial enhancements on a variety of benchmark datasets, achieving +an average increase of 0.1 dB over EDSR and 0.06 dB over SwinIR. In addition, +extensive ablation studies corroborate the effectiveness of each constituent in +our SSC-SR framework. Codes are available at https://github.com/Aitical/SSCSR. + +
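+
+ The exponential-moving-average target model mentioned above is typically maintained as sketched below; the momentum value and the module used here are illustrative assumptions rather than SSC-SR's exact settings.
+
+ import copy
+ import torch
+
+ @torch.no_grad()
+ def ema_update(target, online, momentum=0.999):
+     """target <- m * target + (1 - m) * online, parameter by parameter."""
+     for p_t, p_o in zip(target.parameters(), online.parameters()):
+         p_t.mul_(momentum).add_(p_o, alpha=1 - momentum)
+
+ online = torch.nn.Conv2d(3, 64, 3, padding=1)   # stand-in for an SR model
+ target = copy.deepcopy(online)
+ for p in target.parameters():
+     p.requires_grad_(False)                      # target provides the self-supervised signal
+ ema_update(target, online)
+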
+
+ comment: ICME 2024 +
+
+
+
+
+ + ☆ YOLOOC: YOLO-based Open-Class Incremental Object Detection with Novel + Class Discovery ACCV 2022 + + +
+ Open-world object detection (OWOD) has received a lot of attention recently because of its practical relevance. The challenge is how a model can detect novel classes and then incrementally learn them without forgetting previously known classes. Previous approaches hinge on strongly- or weakly-supervised novel-class data for novel-class detection, which may not apply in real applications. We construct a new benchmark in which novel classes are encountered only at the inference stage, and we propose a new OWOD detector, YOLOOC, based on the YOLO architecture but designed for the open-class setup. We introduce label smoothing to prevent the detector from over-confidently mapping novel classes to known classes and to help it discover novel classes. Extensive experiments conducted on our more realistic setup demonstrate the effectiveness of our method for discovering novel classes in our new benchmark.
+
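+
+ Label smoothing, used above to keep the detector from over-confidently assigning novel objects to known classes, has the standard form sketched below; the epsilon value is illustrative.
+
+ import torch
+ import torch.nn.functional as F
+
+ def smoothed_ce(logits, target, eps=0.1):
+     """Cross-entropy against (1 - eps) one-hot plus eps/K uniform targets."""
+     K = logits.size(-1)
+     log_p = F.log_softmax(logits, dim=-1)
+     soft = torch.full_like(log_p, eps / K)
+     soft.scatter_(1, target.unsqueeze(1), 1 - eps + eps / K)
+     return -(soft * log_p).sum(dim=-1).mean()
+
+ logits, target = torch.randn(4, 20), torch.randint(0, 20, (4,))   # 20 known classes
+ print(float(smoothed_ce(logits, target)))
+ # equivalently: F.cross_entropy(logits, target, label_smoothing=0.1)
+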
+
+ comment: Initially submitted to ACCV 2022 +
+
+
+
+
+ + ☆ Learned Scanpaths Aid Blind Panoramic Video Quality Assessment + + +
+ Panoramic videos have the advantage of providing an immersive and interactive +viewing experience. Nevertheless, their spherical nature gives rise to various +and uncertain user viewing behaviors, which poses significant challenges for +panoramic video quality assessment (PVQA). In this work, we propose an +end-to-end optimized, blind PVQA method with explicit modeling of user viewing +patterns through visual scanpaths. Our method consists of two modules: a +scanpath generator and a quality assessor. The scanpath generator is initially +trained to predict future scanpaths by minimizing their expected code length +and then jointly optimized with the quality assessor for quality prediction. +Our blind PVQA method enables direct quality assessment of panoramic images by +treating them as videos composed of identical frames. Experiments on three +public panoramic image and video quality datasets, encompassing both synthetic +and authentic distortions, validate the superiority of our blind PVQA model +over existing methods. + +
+
+
+
+
+ + ☆ Grid Diffusion Models for Text-to-Video Generation CVPR 2024 + + +
+ Recent advances in diffusion models have significantly improved text-to-image generation. However, generating videos from text is a more challenging task than generating images from text, due to the much larger dataset and higher computational cost required. Most existing video generation methods use either a 3D U-Net architecture that considers the temporal dimension or autoregressive generation. These methods require large datasets and incur high computational costs compared to text-to-image generation. To tackle these challenges, we propose a simple but effective novel grid diffusion for text-to-video generation that requires neither a temporal dimension in the architecture nor a large text-video paired dataset. By representing the video as a grid image, we can generate a high-quality video using a fixed amount of GPU memory regardless of the number of frames. Additionally, since our method reduces the dimensions of the video to the dimensions of an image, various image-based methods can be applied to videos, such as text-guided video manipulation derived from image manipulation. Our proposed method outperforms existing methods in both quantitative and qualitative evaluations, demonstrating the suitability of our model for real-world video generation.
+
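+
+ The central trick, laying a fixed number of frames out as one grid image so an ordinary image diffusion model can be reused, can be sketched with simple reshapes; the 2x2 grid and tensor layout are our assumptions.
+
+ import torch
+
+ def frames_to_grid(frames, rows=2, cols=2):
+     """frames: (T, C, H, W) with T == rows*cols -> one (C, rows*H, cols*W) grid image."""
+     T, C, H, W = frames.shape
+     grid = frames.reshape(rows, cols, C, H, W).permute(2, 0, 3, 1, 4)
+     return grid.reshape(C, rows * H, cols * W)
+
+ def grid_to_frames(grid, rows=2, cols=2):
+     C, GH, GW = grid.shape
+     H, W = GH // rows, GW // cols
+     frames = grid.reshape(C, rows, H, cols, W).permute(1, 3, 0, 2, 4)
+     return frames.reshape(rows * cols, C, H, W)
+
+ video = torch.randn(4, 3, 64, 64)
+ assert torch.allclose(grid_to_frames(frames_to_grid(video)), video)  # lossless round trip
+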
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Attention-based Shape-Deformation Networks for Artifact-Free Geometry + Reconstruction of Lumbar Spine from MR Images + + +
+ Lumbar disc degeneration, a progressive structural wear and tear of the lumbar intervertebral discs, is regarded as playing an essential role in low back pain, a significant global health concern. Automated lumbar spine geometry reconstruction from MR images would enable fast measurement of medical parameters to evaluate the lumbar status and determine a suitable treatment. Existing image segmentation-based techniques often generate erroneous segments or unstructured point clouds that are unsuitable for medical parameter measurement. In this work, we present TransDeformer, a novel attention-based deep learning approach that reconstructs the contours of the lumbar spine with high spatial accuracy and mesh correspondence across patients, and we also present a variant of TransDeformer for error estimation. Specifically, we devise new attention modules with a new attention formula, which integrates image features and tokenized contour features to predict the displacements of the points on a shape template without the need for image segmentation. The deformed template reveals the lumbar spine geometry in the input image. We develop a multi-stage training strategy to enhance model robustness with respect to template initialization. Experiment results show that our TransDeformer generates artifact-free geometry outputs, and its variant predicts the error of a reconstructed geometry. Our code is available at https://github.com/linchenq/TransDeformer-Mesh.
+
+
+
+
+
+ + ☆ Latent Watermark: Inject and Detect Watermarks in Latent Diffusion Space + + +
+ Watermarking is a tool for actively identifying and attributing the images +generated by latent diffusion models. Existing methods face the dilemma of +watermark robustness and image quality. The reason for this dilemma is that +watermark detection is performed in pixel space, implying an intrinsic link +between image quality and watermark robustness. In this paper, we highlight +that an effective solution to the problem is to both inject and detect +watermarks in latent space, and propose Latent Watermark (LW) with a +progressive training strategy. Experiments show that compared to the recently +proposed methods such as StegaStamp, StableSignature, RoSteALS and TreeRing, LW +not only surpasses them in terms of robustness but also offers superior image +quality. When we inject 64-bit messages, LW can achieve an identification +performance close to 100% and an attribution performance above 97% under 9 +single-attack scenarios and one all-attack scenario. Our code will be available +on GitHub. + +
+
+
+
+
+ + ☆ Design as Desired: Utilizing Visual Question Answering for Multimodal + Pre-training + + +
+ Multimodal pre-training demonstrates its potential in the medical domain, +which learns medical visual representations from paired medical reports. +However, many pre-training tasks require extra annotations from clinicians, and +most of them fail to explicitly guide the model to learn the desired features +of different pathologies. To the best of our knowledge, we are the first to +utilize Visual Question Answering (VQA) for multimodal pre-training to guide +the framework focusing on targeted pathological features. In this work, we +leverage descriptions in medical reports to design multi-granular +question-answer pairs associated with different diseases, which assist the +framework in pre-training without requiring extra annotations from experts. We +also propose a novel pre-training framework with a quasi-textual feature +transformer, a module designed to transform visual features into a +quasi-textual space closer to the textual domain via a contrastive learning +strategy. This narrows the vision-language gap and facilitates modality +alignment. Our framework is applied to four downstream tasks: report +generation, classification, segmentation, and detection across five datasets. +Extensive experiments demonstrate the superiority of our framework compared to +other state-of-the-art methods. Our code will be released upon acceptance. + +
+
+
+
+
+ + ♻ ☆ AttackNet: Enhancing Biometric Security via Tailored Convolutional + Neural Network Architectures for Liveness Detection + + +
+ Biometric security is the cornerstone of modern identity verification and +authentication systems, where the integrity and reliability of biometric +samples is of paramount importance. This paper introduces AttackNet, a bespoke +Convolutional Neural Network architecture, meticulously designed to combat +spoofing threats in biometric systems. Rooted in deep learning methodologies, +this model offers a layered defense mechanism, seamlessly transitioning from +low-level feature extraction to high-level pattern discernment. Three +distinctive architectural phases form the crux of the model, each underpinned +by judiciously chosen activation functions, normalization techniques, and +dropout layers to ensure robustness and resilience against adversarial attacks. +Benchmarking our model across diverse datasets affirms its prowess, showcasing +superior performance metrics in comparison to contemporary models. Furthermore, +a detailed comparative analysis accentuates the model's efficacy, drawing +parallels with prevailing state-of-the-art methodologies. Through iterative +refinement and an informed architectural strategy, AttackNet underscores the +potential of deep learning in safeguarding the future of biometric security. + +
+
+
+
+
+ + ♻ ☆ MAPSeg: Unified Unsupervised Domain Adaptation for Heterogeneous Medical + Image Segmentation Based on 3D Masked Autoencoding and Pseudo-Labeling CVPR 2024 + + +
+ Robust segmentation is critical for deriving quantitative measures from +large-scale, multi-center, and longitudinal medical scans. Manually annotating +medical scans, however, is expensive and labor-intensive and may not always be +available in every domain. Unsupervised domain adaptation (UDA) is a +well-studied technique that alleviates this label-scarcity problem by +leveraging available labels from another domain. In this study, we introduce +Masked Autoencoding and Pseudo-Labeling Segmentation (MAPSeg), a +$\textbf{unified}$ UDA framework with great versatility and superior +performance for heterogeneous and volumetric medical image segmentation. To the +best of our knowledge, this is the first study that systematically reviews and +develops a framework to tackle four different domain shifts in medical image +segmentation. More importantly, MAPSeg is the first framework that can be +applied to $\textbf{centralized}$, $\textbf{federated}$, and +$\textbf{test-time}$ UDA while maintaining comparable performance. We compare +MAPSeg with previous state-of-the-art methods on a private infant brain MRI +dataset and a public cardiac CT-MRI dataset, and MAPSeg outperforms others by a +large margin (10.5 Dice improvement on the private MRI dataset and 5.7 on the +public CT-MRI dataset). MAPSeg poses great practical value and can be applied +to real-world problems. GitHub: https://github.com/XuzheZ/MAPSeg/. + +
+
+ comment: CVPR 2024 camera-ready (8 pages, 3 figures) with the supplemental + materials (5 pages, 4 figures). Xuzhe Zhang and Yuhao Wu are co-first + authors. Andrew F. Laine and Yun Wang are co-senior supervising authors +
+
+
+
+
+ + ♻ ☆ LangNav: Language as a Perceptual Representation for Navigation + + +
+ We explore the use of language as a perceptual representation for +vision-and-language navigation (VLN), with a focus on low-data settings. Our +approach uses off-the-shelf vision systems for image captioning and object +detection to convert an agent's egocentric panoramic view at each time step +into natural language descriptions. We then finetune a pretrained language +model to select an action, based on the current view and the trajectory +history, that would best fulfill the navigation instructions. In contrast to +the standard setup which adapts a pretrained language model to work directly +with continuous visual features from pretrained vision models, our approach +instead uses (discrete) language as the perceptual representation. We explore +several use cases of our language-based navigation (LangNav) approach on the +R2R VLN benchmark: generating synthetic trajectories from a prompted language +model (GPT-4) with which to finetune a smaller language model; domain transfer +where we transfer a policy learned on one simulated environment (ALFRED) to +another (more realistic) environment (R2R); and combining both vision- and +language-based representations for VLN. Our approach is found to improve upon +baselines that rely on visual features in settings where only a few expert +trajectories (10-100) are available, demonstrating the potential of language as +a perceptual representation for navigation. + +
+
+
+
+
+ + ♻ ☆ U-Net v2: Rethinking the Skip Connections of U-Net for Medical Image + Segmentation + + +
+ In this paper, we introduce U-Net v2, a new robust and efficient U-Net variant for medical image segmentation. It aims to augment the infusion of semantic information into low-level features while simultaneously refining high-level features with finer details. For an input image, we begin by extracting multi-level features with a deep neural network encoder. Next, we enhance the feature map of each level by infusing semantic information from higher-level features and integrating finer details from lower-level features through a Hadamard product. Our novel skip connections empower features at all levels with enriched semantic characteristics and intricate details. The improved features are subsequently transmitted to the decoder for further processing and segmentation. Our method can be seamlessly integrated into any encoder-decoder network. We evaluate our method on several public medical image segmentation datasets for skin lesion segmentation and polyp segmentation, and the experimental results demonstrate that our new method improves segmentation accuracy over state-of-the-art methods while preserving memory and computational efficiency. Code is available at: https://github.com/yaoppeng/U-Net_v2
+
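+
+ A rough sketch of the skip-connection idea: every encoder level is projected, resized to the target level's resolution, and combined by element-wise (Hadamard) products. The channel alignment and bilinear resizing below are our assumptions, not the paper's exact module.
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class HadamardSkipFusion(nn.Module):
+     """Fuse multi-level encoder features into level i via Hadamard products."""
+     def __init__(self, channels, out_ch=64):
+         super().__init__()
+         self.proj = nn.ModuleList([nn.Conv2d(c, out_ch, 1) for c in channels])
+
+     def forward(self, feats, i):
+         h, w = feats[i].shape[-2:]
+         fused = None
+         for proj, f in zip(self.proj, feats):
+             f = F.interpolate(proj(f), size=(h, w), mode="bilinear", align_corners=False)
+             fused = f if fused is None else fused * f   # element-wise product fusion
+         return fused
+
+ feats = [torch.randn(1, c, s, s) for c, s in [(32, 64), (64, 32), (128, 16)]]
+ print(HadamardSkipFusion([32, 64, 128])(feats, i=1).shape)   # (1, 64, 32, 32)
+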
+
+
+
+
+ + ♻ ☆ Recursive Joint Cross-Modal Attention for Multimodal Fusion in + Dimensional Emotion Recognition + + +
+ Though multimodal emotion recognition has achieved significant progress over recent years, the potential of rich synergic relationships across the modalities is not fully exploited. In this paper, we introduce Recursive Joint Cross-Modal Attention (RJCMA) to effectively capture both intra- and inter-modal relationships across audio, visual and text modalities for dimensional emotion recognition. In particular, we compute the attention weights based on cross-correlation between the joint audio-visual-text feature representations and the feature representations of individual modalities to simultaneously capture intra- and inter-modal relationships across the modalities. The attended features of the individual modalities are again fed as input to the fusion model in a recursive mechanism to obtain more refined feature representations. We have also explored Temporal Convolutional Networks (TCNs) to improve the temporal modeling of the feature representations of individual modalities. Extensive experiments are conducted to evaluate the performance of the proposed fusion model on the challenging Affwild2 dataset. By effectively capturing the synergic intra- and inter-modal relationships across audio, visual and text modalities, the proposed fusion model achieves a Concordance Correlation Coefficient (CCC) of 0.585 (0.542) and 0.659 (0.619) for valence and arousal respectively on the validation set (test set). This shows a significant improvement over the baselines of 0.24 (0.211) and 0.20 (0.191) for valence and arousal respectively on the validation set (test set) of the valence-arousal challenge of the 6th Affective Behavior Analysis in-the-Wild (ABAW) competition.
+
+
+
+
+
+ + ♻ ☆ Unifying Top-down and Bottom-up Scanpath Prediction Using Transformers CVPR 2024 + + +
+ Most models of visual attention aim at predicting either top-down or +bottom-up control, as studied using different visual search and free-viewing +tasks. In this paper we propose the Human Attention Transformer (HAT), a single +model that predicts both forms of attention control. HAT uses a novel +transformer-based architecture and a simplified foveated retina that +collectively create a spatio-temporal awareness akin to the dynamic visual +working memory of humans. HAT not only establishes a new state-of-the-art in +predicting the scanpath of fixations made during target-present and +target-absent visual search and ``taskless'' free viewing, but also makes human +gaze behavior interpretable. Unlike previous methods that rely on a coarse grid +of fixation cells and experience information loss due to fixation +discretization, HAT features a sequential dense prediction architecture and +outputs a dense heatmap for each fixation, thus avoiding discretizing +fixations. HAT sets a new standard in computational attention, which emphasizes +effectiveness, generality, and interpretability. HAT's demonstrated scope and +applicability will likely inspire the development of new attention models that +can better predict human behavior in various attention-demanding scenarios. +Code is available at https://github.com/cvlab-stonybrook/HAT. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Mamba-UNet: UNet-Like Pure Visual Mamba for Medical Image Segmentation + + +
+ In recent advancements in medical image analysis, Convolutional Neural +Networks (CNN) and Vision Transformers (ViT) have set significant benchmarks. +While the former excels in capturing local features through its convolution +operations, the latter achieves remarkable global context understanding by +leveraging self-attention mechanisms. However, both architectures exhibit +limitations in efficiently modeling long-range dependencies within medical +images, which is a critical aspect for precise segmentation. Inspired by the +Mamba architecture, known for its proficiency in handling long sequences and +global contextual information with enhanced computational efficiency as a State +Space Model (SSM), we propose Mamba-UNet, a novel architecture that synergizes +the U-Net in medical image segmentation with Mamba's capability. Mamba-UNet +adopts a pure Visual Mamba (VMamba)-based encoder-decoder structure, infused +with skip connections to preserve spatial information across different scales +of the network. This design facilitates a comprehensive feature learning +process, capturing intricate details and broader semantic contexts within +medical images. We introduce a novel integration mechanism within the VMamba +blocks to ensure seamless connectivity and information flow between the encoder +and decoder paths, enhancing the segmentation performance. We conducted +experiments on publicly available ACDC MRI Cardiac segmentation dataset, and +Synapse CT Abdomen segmentation dataset. The results show that Mamba-UNet +outperforms several types of UNet in medical image segmentation under the same +hyper-parameter setting. The source code and baseline implementations are +available. + +
+
+
+
+
+ + ♻ ☆ Towards minimizing efforts for Morphing Attacks -- Deep embeddings for + morphing pair selection and improved Morphing Attack Detection + + +
+ Face Morphing Attacks pose a threat to the security of identity documents, especially with respect to a subsequent access control process, because they enable both individuals involved to exploit the same document. In this study, face embeddings serve two purposes: pre-selecting images for large-scale Morphing Attack generation and detecting potential Morphing Attacks. We build upon previous embedding studies in both use cases using the MagFace model. For the first objective, we employ a pre-selection algorithm that pairs individuals based on face embedding similarity. We quantify the attack potential of differently morphed face images to compare the usability of pre-selection in automatically generating numerous successful Morphing Attacks. Regarding the second objective, we compare embeddings from two state-of-the-art face recognition systems in terms of their ability to detect Morphing Attacks. Our findings demonstrate that ArcFace and MagFace provide valuable face embeddings for image pre-selection. Both open-source and COTS face recognition systems are susceptible to the generated attacks, particularly when pre-selection is based on embeddings rather than random pairing constrained only by soft biometrics. More accurate face recognition systems exhibit greater vulnerability to attacks, with COTS systems being the most susceptible. Additionally, MagFace embeddings serve as a robust alternative for detecting morphed face images compared to the previously used ArcFace embeddings. The results endorse the advantages of face embeddings for more effective image pre-selection for face morphing and accurate detection of morphed face images. This is supported by extensive analysis of various designed attacks. The MagFace model proves to be a powerful alternative to the commonly used ArcFace model for both objectives, pre-selection and attack detection.
+
+
+
+
+
+ + ♻ ☆ Total-Decom: Decomposed 3D Scene Reconstruction with Minimal Interaction CVPR 2024 + + +
+ Scene reconstruction from multi-view images is a fundamental problem in +computer vision and graphics. Recent neural implicit surface reconstruction +methods have achieved high-quality results; however, editing and manipulating +the 3D geometry of reconstructed scenes remains challenging due to the absence +of naturally decomposed object entities and complex object/background +compositions. In this paper, we present Total-Decom, a novel method for +decomposed 3D reconstruction with minimal human interaction. Our approach +seamlessly integrates the Segment Anything Model (SAM) with hybrid +implicit-explicit neural surface representations and a mesh-based +region-growing technique for accurate 3D object decomposition. Total-Decom +requires minimal human annotations while providing users with real-time control +over the granularity and quality of decomposition. We extensively evaluate our +method on benchmark datasets and demonstrate its potential for downstream +applications, such as animation and scene editing. The code is available at +https://github.com/CVMI-Lab/Total-Decom.git. + +
+
+ comment: 8 pages, 7 figures, accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ RS-DPO: A Hybrid Rejection Sampling and Direct Preference Optimization + Method for Alignment of Large Language Models + + +
+ Reinforcement learning from human feedback (RLHF) has been extensively employed to align large language models with user intent. However, proximal policy optimization (PPO) based RLHF is occasionally unstable, requires significant hyperparameter finetuning, and is computationally expensive when maximizing the estimated reward during alignment. Recently, direct preference optimization (DPO) was proposed to address those challenges. However, DPO relies on contrastive responses generated by a human annotator and an alternative LLM rather than the policy model, limiting the effectiveness of RLHF. In this paper, we address both challenges by systematically combining rejection sampling (RS) and DPO. Our proposed method, RS-DPO, starts with the development of a supervised fine-tuned policy model (SFT). A varied set of k responses per prompt is sampled directly from the SFT model. RS-DPO then identifies pairs of contrastive samples based on their reward distribution. Finally, we apply DPO with the contrastive samples to align the model with human preferences. Our experiments indicate that the proposed method effectively fine-tunes LLMs in limited-resource environments, leading to improved alignment with user intent. Furthermore, it outperforms existing methods, including RS, PPO, and DPO.
+
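+
+ For readers unfamiliar with the DPO step applied to the rejection-sampled pairs, the standard DPO objective on per-sequence log-probabilities is sketched below; the names, numbers, and beta value are generic placeholders, not the paper's code.
+
+ import torch
+ import torch.nn.functional as F
+
+ def dpo_loss(logp_chosen, logp_rejected, ref_chosen, ref_rejected, beta=0.1):
+     """-log sigmoid(beta * [(policy - reference) margin of chosen vs. rejected])."""
+     margin = (logp_chosen - ref_chosen) - (logp_rejected - ref_rejected)
+     return -F.logsigmoid(beta * margin).mean()
+
+ # pretend these are summed token log-probs of two of the k sampled responses,
+ # where the chosen/rejected pair was selected by its reward gap (the RS step)
+ logp_c, logp_r = torch.tensor([-42.0]), torch.tensor([-55.0])
+ ref_c, ref_r = torch.tensor([-44.0]), torch.tensor([-50.0])
+ print(float(dpo_loss(logp_c, logp_r, ref_c, ref_r)))
+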
+
+ comment: 16 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ An Examination of the Compositionality of Large Generative + Vision-Language Models + + +
+ With the success of Large Language Models (LLMs), many Generative +Vision-Language Models (GVLMs) have been constructed via multimodal instruction +tuning. However, the performance of GVLMs in multimodal compositional reasoning +remains under-explored. In this paper, we examine both the evaluation metrics +(VisualGPTScore, etc.) and current benchmarks for evaluating the +compositionality of GVLMs. We identify the syntactical bias in current +benchmarks, which is exploited by the linguistic capability of GVLMs. The bias +renders VisualGPTScore an insufficient metric for assessing GVLMs. To combat +this, we first introduce a SyntaxBias Score, leveraging LLMs to quantify such +bias for mitigation. A challenging new task is subsequently added to evaluate +the robustness of GVLMs against inherent inclination toward syntactical +correctness. Using the bias-mitigated datasets and the new task, we propose a +novel benchmark, namely SyntActically DE-biased benchmark (SADE). Our study +provides an unbiased benchmark for the compositionality of GVLMs, facilitating +future research in this direction (Code and dataset are available at +https://github.com/TeleeMa/SADE). + +
+
+
+
+
+ + ♻ ☆ Scale Alone Does not Improve Mechanistic Interpretability in Vision + Models NeurIPS 2023 + + +
+ In light of the recent widespread adoption of AI systems, understanding the +internal information processing of neural networks has become increasingly +critical. Most recently, machine vision has seen remarkable progress by scaling +neural networks to unprecedented levels in dataset and model size. We here ask +whether this extraordinary increase in scale also positively impacts the field +of mechanistic interpretability. In other words, has our understanding of the +inner workings of scaled neural networks improved as well? We use a +psychophysical paradigm to quantify one form of mechanistic interpretability +for a diverse suite of nine models and find no scaling effect for +interpretability - neither for model nor dataset size. Specifically, none of +the investigated state-of-the-art models are easier to interpret than the +GoogLeNet model from almost a decade ago. Latest-generation vision models +appear even less interpretable than older architectures, hinting at a +regression rather than improvement, with modern models sacrificing +interpretability for accuracy. These results highlight the need for models +explicitly designed to be mechanistically interpretable and the need for more +helpful interpretability methods to increase our understanding of networks at +an atomic level. We release a dataset containing more than 130'000 human +responses from our psychophysical evaluation of 767 units across nine models. +This dataset facilitates research on automated instead of human-based +interpretability evaluations, which can ultimately be leveraged to directly +optimize the mechanistic interpretability of models. + +
+
+ comment: Spotlight at NeurIPS 2023. The first two authors contributed equally. + Code available at https://brendel-group.github.io/imi/ +
+
+
+
+
+ + ♻ ☆ ReGround: Improving Textual and Spatial Grounding at No Cost + + +
+ When an image generation process is guided by both a text prompt and spatial +cues, such as a set of bounding boxes, do these elements work in harmony, or +does one dominate the other? Our analysis of a pretrained image diffusion model +that integrates gated self-attention into the U-Net reveals that spatial +grounding often outweighs textual grounding due to the sequential flow from +gated self-attention to cross-attention. We demonstrate that such bias can be +significantly mitigated without sacrificing accuracy in either grounding by +simply rewiring the network architecture, changing from sequential to parallel +for gated self-attention and cross-attention. This surprisingly simple yet +effective solution does not require any fine-tuning of the network but +significantly reduces the trade-off between the two groundings. Our experiments +demonstrate significant improvements from the original GLIGEN to the rewired +version in the trade-off between textual grounding and spatial grounding. + +
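+
+ The rewiring described above (running gated self-attention and cross-attention as parallel residual branches over the same input, instead of sequentially) can be sketched schematically as follows; the attention modules are plain placeholders, not GLIGEN's actual layers.
+
+ import torch
+ import torch.nn as nn
+
+ class ParallelGrounding(nn.Module):
+     """Both branches read the same x; their residuals are summed (parallel rewiring)."""
+     def __init__(self, dim=320, heads=8):
+         super().__init__()
+         self.gated_self_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+         self.cross_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+         self.gate = nn.Parameter(torch.zeros(1))
+
+     def forward(self, x, box_tokens, text_tokens):
+         xs = torch.cat([x, box_tokens], dim=1)
+         g, _ = self.gated_self_attn(xs, xs, xs)
+         spatial = torch.tanh(self.gate) * g[:, : x.size(1)]           # grounding residual
+         textual, _ = self.cross_attn(x, text_tokens, text_tokens)     # reads x, not x + spatial
+         return x + spatial + textual
+
+ block = ParallelGrounding()
+ out = block(torch.randn(2, 64, 320), torch.randn(2, 4, 320), torch.randn(2, 77, 320))
+ print(out.shape)
+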
+
+ comment: Project page: https://re-ground.github.io/ +
+
+
+
+
+ + ♻ ☆ Rapid post-disaster infrastructure damage characterisation enabled by + remote sensing and deep learning technologies -- a tiered approach + + +
+ Critical infrastructure is systematically targeted during wars and extensive natural disasters because it is vital for enabling connectivity and the transportation of people and goods, and hence underpins national and international economic growth. Mass destruction of transport assets, in conjunction with minimal or no accessibility in the wake of natural and anthropogenic disasters, prevents us from delivering rapid recovery and adaptation. A solution to this challenge is to use technology that enables stand-off observations. Nevertheless, no methods exist for the integrated characterisation of damage at multiple scales, i.e. regional, asset, and structural scales, and there is no systematic correlation between infrastructure damage assessments across these scales. We propose a methodology based on an integrated multi-scale tiered approach to fill this capability gap. In doing so, we demonstrate how damage characterisation can be enabled by fit-for-purpose digital technologies. Next, the methodology is applied to and validated on a case study in Ukraine that includes 17 bridges, all damaged by targeted human interventions. From macro to micro, we deploy technology to integrate assessments at scale, drawing on sources ranging from Sentinel-1 SAR images, crowdsourced information, and high-resolution images to deep learning, in order to characterise infrastructure damage. For the first time, the interferometric coherence difference and semantic segmentation of images were deployed to improve the reliability of damage characterisation at different scales, i.e. regional, infrastructure asset, and component, with the aim of enhancing the accuracy of damage characterisation. This integrated approach accelerates decision-making and therefore facilitates more efficient restoration and adaptation efforts, ultimately building resilience into our infrastructure.
+
+
+ comment: Main text (33 pages,15 figures); Supplementary materials (19 pages) +
+
+
+
+
+ + ♻ ☆ Auto MC-Reward: Automated Dense Reward Design with Large Language Models + for Minecraft CVPR2024 + + +
+ Many reinforcement learning environments (e.g., Minecraft) provide only sparse rewards that indicate task completion or failure with binary values. The resulting difficulty of efficient exploration in such environments makes it hard for reinforcement-learning-based agents to learn complex tasks. To address this, this paper introduces an advanced learning system, named Auto MC-Reward, that leverages Large Language Models (LLMs) to automatically design dense reward functions, thereby enhancing learning efficiency. Auto MC-Reward consists of three important components: Reward Designer, Reward Critic, and Trajectory Analyzer. Given the environment information and task descriptions, the Reward Designer first designs the reward function by coding an executable Python function with predefined observation inputs. Then, our Reward Critic is responsible for verifying the code, checking whether it is self-consistent and free of syntax and semantic errors. Further, the Trajectory Analyzer summarizes possible failure causes and provides refinement suggestions based on collected trajectories. In the next round, the Reward Designer further refines and iterates the dense reward function based on this feedback. Experiments demonstrate a significant improvement in the success rate and learning efficiency of our agents on complex tasks in Minecraft, such as obtaining diamonds while efficiently avoiding lava, and efficiently exploring for trees and animals that are sparse in the plains biome.
+
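+
+ To give a flavor of what a Reward Designer output might look like, below is a hypothetical dense reward over a dictionary-style observation; every key and weight is invented for illustration and is not taken from Auto MC-Reward.
+
+ def dense_reward(obs: dict, prev_obs: dict) -> float:
+     """Hypothetical LLM-written reward for 'mine a diamond while avoiding lava'."""
+     reward = 0.0
+     # progress: newly mined diamonds
+     reward += 10.0 * (obs.get("diamond_count", 0) - prev_obs.get("diamond_count", 0))
+     # shaping: encourage digging down toward diamond-bearing depths
+     reward += 0.05 * max(0.0, prev_obs.get("y_pos", 64) - obs.get("y_pos", 64))
+     # safety: penalize getting close to lava, and dying
+     if obs.get("dist_to_lava", 99.0) < 3.0:
+         reward -= 1.0
+     if obs.get("is_dead", False):
+         reward -= 20.0
+     return reward
+
+ print(dense_reward({"diamond_count": 1, "y_pos": 12, "dist_to_lava": 2.0},
+                    {"diamond_count": 0, "y_pos": 14, "dist_to_lava": 8.0}))
+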
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ Video-Based Autism Detection with Deep Learning + + +
+ Individuals with Autism Spectrum Disorder (ASD) often experience challenges +in health, communication, and sensory processing; therefore, early diagnosis is +necessary for proper treatment and care. In this work, we consider the problem +of detecting or classifying ASD children to aid medical professionals in early +diagnosis. We develop a deep learning model that analyzes video clips of +children reacting to sensory stimuli, with the intent of capturing key +differences in reactions and behavior between ASD and non-ASD participants. +Unlike many recent studies in ASD classification with MRI data, which require +expensive specialized equipment, our method utilizes a powerful but relatively +affordable GPU, a standard computer setup, and a video camera for inference. +Results show that our model effectively generalizes and understands key +differences in the distinct movements of the children. It is noteworthy that +our model exhibits successful classification performance despite the limited +amount of data for a deep learning problem and limited temporal information +available for learning, even with the motion artifacts. + +
+
+ comment: Poster Abstract. Accepted into 2024 IEEE Green Technologies + Conference +
+
+
+
+
+ + ♻ ☆ SiTH: Single-view Textured Human Reconstruction with Image-Conditioned + Diffusion CVPR 2024 + + +
+ A long-standing goal of 3D human reconstruction is to create lifelike and +fully detailed 3D humans from single-view images. The main challenge lies in +inferring unknown body shapes, appearances, and clothing details in areas not +visible in the images. To address this, we propose SiTH, a novel pipeline that +uniquely integrates an image-conditioned diffusion model into a 3D mesh +reconstruction workflow. At the core of our method lies the decomposition of +the challenging single-view reconstruction problem into generative +hallucination and reconstruction subproblems. For the former, we employ a +powerful generative diffusion model to hallucinate unseen back-view appearance +based on the input images. For the latter, we leverage skinned body meshes as +guidance to recover full-body texture meshes from the input and back-view +images. SiTH requires as few as 500 3D human scans for training while +maintaining its generality and robustness to diverse images. Extensive +evaluations on two 3D human benchmarks, including our newly created one, +highlighted our method's superior accuracy and perceptual quality in 3D +textured human reconstruction. Our code and evaluation benchmark are available +at https://ait.ethz.ch/sith + +
+
+ comment: 23 pages, 23 figures, CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Gaussian Head Avatar: Ultra High-fidelity Head Avatar via Dynamic + Gaussians + + +
+ Creating high-fidelity 3D head avatars has always been a research hotspot,
+but there remains a great challenge under lightweight sparse view setups. In
+this paper, we propose Gaussian Head Avatar represented by controllable 3D
+Gaussians for high-fidelity head avatar modeling. We optimize the neutral 3D
+Gaussians and a fully learned MLP-based deformation field to capture complex
+expressions. The two parts benefit each other, allowing our method to model
+fine-grained dynamic details while ensuring expression accuracy. Furthermore,
+we devise a geometry-guided initialization strategy based on an implicit SDF
+and Deep Marching Tetrahedra to ensure the stability and convergence of the
+training procedure. Experiments show our approach outperforms other
+state-of-the-art sparse-view methods, achieving ultra high-fidelity rendering
+quality at 2K resolution even under exaggerated expressions.
+
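+ The combination of explicit Gaussians with a learned deformation field can be
+sketched as below; this is not the authors' implementation, and the network
+sizes, the expression-code dimension, and the restriction to Gaussian centers
+are assumptions made purely for illustration.
+
+```python
+import torch
+import torch.nn as nn
+
+class GaussianDeformer(nn.Module):
+    """Toy sketch: neutral Gaussian centers deformed by an expression-conditioned MLP."""
+    def __init__(self, num_gaussians: int, expr_dim: int = 64, hidden: int = 128):
+        super().__init__()
+        # Learnable neutral Gaussian centers (other Gaussian attributes omitted).
+        self.neutral_xyz = nn.Parameter(torch.randn(num_gaussians, 3) * 0.01)
+        self.mlp = nn.Sequential(
+            nn.Linear(3 + expr_dim, hidden), nn.ReLU(),
+            nn.Linear(hidden, hidden), nn.ReLU(),
+            nn.Linear(hidden, 3),  # per-Gaussian displacement
+        )
+
+    def forward(self, expr_code: torch.Tensor) -> torch.Tensor:
+        n = self.neutral_xyz.shape[0]
+        cond = expr_code.expand(n, -1)                       # (N, expr_dim)
+        offsets = self.mlp(torch.cat([self.neutral_xyz, cond], dim=-1))
+        return self.neutral_xyz + offsets                    # deformed centers
+
+deformer = GaussianDeformer(num_gaussians=1000)
+deformed = deformer(torch.randn(1, 64))                      # (1000, 3)
+```
+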
+
+ comment: Projectpage: https://yuelangx.github.io/gaussianheadavatar, Code: + https://github.com/YuelangX/Gaussian-Head-Avatar +
+
+
+
+
+ + ♻ ☆ LatentEditor: Text Driven Local Editing of 3D Scenes + + +
+ While neural fields have made significant strides in view synthesis and scene +reconstruction, editing them poses a formidable challenge due to their implicit +encoding of geometry and texture information from multi-view inputs. In this +paper, we introduce \textsc{LatentEditor}, an innovative framework designed to +empower users with the ability to perform precise and locally controlled +editing of neural fields using text prompts. Leveraging denoising diffusion +models, we successfully embed real-world scenes into the latent space, +resulting in a faster and more adaptable NeRF backbone for editing compared to +traditional methods. To enhance editing precision, we introduce a delta score +to calculate the 2D mask in the latent space that serves as a guide for local +modifications while preserving irrelevant regions. Our novel pixel-level +scoring approach harnesses the power of InstructPix2Pix (IP2P) to discern the +disparity between IP2P conditional and unconditional noise predictions in the +latent space. The edited latents conditioned on the 2D masks are then +iteratively updated in the training set to achieve 3D local editing. Our +approach achieves faster editing speeds and superior output quality compared to +existing 3D editing models, bridging the gap between textual instructions and +high-quality 3D scene editing in latent space. We show the superiority of our +approach on four benchmark 3D datasets, LLFF, IN2N, NeRFStudio and NeRF-Art. + +
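+ A minimal sketch of how such a latent-space editing mask could be derived
+from the gap between conditional and unconditional noise predictions is shown
+below; the normalization and threshold are illustrative assumptions, not the
+paper's exact delta score.
+
+```python
+import torch
+
+def delta_mask(eps_cond: torch.Tensor, eps_uncond: torch.Tensor,
+               threshold: float = 0.5) -> torch.Tensor:
+    # eps_*: (C, H, W) noise predictions from an InstructPix2Pix-style model.
+    delta = (eps_cond - eps_uncond).abs().mean(dim=0)        # (H, W)
+    delta = (delta - delta.min()) / (delta.max() - delta.min() + 1e-8)
+    return (delta > threshold).float()                       # 1 = region to edit
+
+mask = delta_mask(torch.randn(4, 64, 64), torch.randn(4, 64, 64))
+```
+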
+
+ comment: Project Page: https://latenteditor.github.io/ +
+
+
+
+
+ + ♻ ☆ Robust Active Speaker Detection in Noisy Environments + + +
+ This paper addresses the issue of active speaker detection (ASD) in noisy +environments and formulates a robust active speaker detection (rASD) problem. +Existing ASD approaches leverage both audio and visual modalities, but +non-speech sounds in the surrounding environment can negatively impact +performance. To overcome this, we propose a novel framework that utilizes +audio-visual speech separation as guidance to learn noise-free audio features. +These features are then utilized in an ASD model, and both tasks are jointly +optimized in an end-to-end framework. Our proposed framework mitigates residual +noise and audio quality reduction issues that can occur in a naive cascaded +two-stage framework that directly uses separated speech for ASD, and enables +the two tasks to be optimized simultaneously. To further enhance the robustness +of the audio features and handle inherent speech noises, we propose a dynamic +weighted loss approach to train the speech separator. We also collected a +real-world noise audio dataset to facilitate investigations. Experiments +demonstrate that non-speech audio noises significantly impact ASD models, and +our proposed approach improves ASD performance in noisy environments. The +framework is general and can be applied to different ASD approaches to improve +their robustness. Our code, models, and data will be released. + +
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ WaveMix: A Resource-efficient Neural Network for Image Analysis + + +
+ We propose a novel neural architecture for computer vision -- WaveMix -- that +is resource-efficient and yet generalizable and scalable. While using fewer +trainable parameters, GPU RAM, and computations, WaveMix networks achieve +comparable or better accuracy than the state-of-the-art convolutional neural +networks, vision transformers, and token mixers for several tasks. This +efficiency can translate to savings in time, cost, and energy. To achieve these +gains we used multi-level two-dimensional discrete wavelet transform (2D-DWT) +in WaveMix blocks, which has the following advantages: (1) It reorganizes +spatial information based on three strong image priors -- scale-invariance, +shift-invariance, and sparseness of edges -- (2) in a lossless manner without +adding parameters, (3) while also reducing the spatial sizes of feature maps, +which reduces the memory and time required for forward and backward passes, and +(4) expanding the receptive field faster than convolutions do. The whole +architecture is a stack of self-similar and resolution-preserving WaveMix +blocks, which allows architectural flexibility for various tasks and levels of +resource availability. WaveMix establishes new benchmarks for segmentation on +Cityscapes; and for classification on Galaxy 10 DECals, Places-365, five EMNIST +datasets, and iNAT-mini and performs competitively on other benchmarks. Our +code and trained models are publicly available. + +
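+ As an illustration of the parameter-free building block, the sketch below
+implements a single-level 2D Haar DWT in PyTorch: it halves the spatial size
+and stacks four subbands along channels, ready to be fed to the token-mixing
+layers of a WaveMix-style block (those subsequent layers are omitted here).
+
+```python
+import torch
+
+def haar_dwt2d(x: torch.Tensor) -> torch.Tensor:
+    # x: (B, C, H, W) with even H and W.
+    a = x[:, :, 0::2, 0::2]  # top-left pixels
+    b = x[:, :, 0::2, 1::2]  # top-right pixels
+    c = x[:, :, 1::2, 0::2]  # bottom-left pixels
+    d = x[:, :, 1::2, 1::2]  # bottom-right pixels
+    ll = (a + b + c + d) / 2   # approximation subband
+    lh = (a - b + c - d) / 2   # horizontal detail
+    hl = (a + b - c - d) / 2   # vertical detail
+    hh = (a - b - c + d) / 2   # diagonal detail
+    # Stack subbands along channels: (B, 4C, H/2, W/2), no trainable parameters.
+    return torch.cat([ll, lh, hl, hh], dim=1)
+
+out = haar_dwt2d(torch.randn(2, 3, 32, 32))  # (2, 12, 16, 16)
+```
+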
+
+ comment: 20 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Bidirectional Consistency Models + + +
+ Diffusion models (DMs) are capable of generating remarkably high-quality +samples by iteratively denoising a random vector, a process that corresponds to +moving along the probability flow ordinary differential equation (PF ODE). +Interestingly, DMs can also invert an input image to noise by moving backward +along the PF ODE, a key operation for downstream tasks such as interpolation +and image editing. However, the iterative nature of this process restricts its +speed, hindering its broader application. Recently, Consistency Models (CMs) +have emerged to address this challenge by approximating the integral of the PF +ODE, largely reducing the number of iterations. Yet, the absence of an explicit +ODE solver complicates the inversion process. To resolve this, we introduce the +Bidirectional Consistency Model (BCM), which learns a single neural network +that enables both forward and backward traversal along the PF ODE, efficiently +unifying generation and inversion tasks within one framework. Notably, our +proposed method enables one-step generation and inversion while also allowing +the use of additional steps to enhance generation quality or reduce +reconstruction error. Furthermore, by leveraging our model's bidirectional +consistency, we introduce a sampling strategy that can enhance FID while +preserving the generated image content. We further showcase our model's +capabilities in several downstream tasks, such as interpolation and inpainting, +and present demonstrations of potential applications, including blind +restoration of compressed images and defending black-box adversarial attacks. + +
+
+ comment: 40 pages, 25 figures +
+
+
+
+
+ + ♻ ☆ IllusionVQA: A Challenging Optical Illusion Dataset for Vision Language + Models + + +
+ The advent of Vision Language Models (VLM) has allowed researchers to +investigate the visual understanding of a neural network using natural +language. Beyond object classification and detection, VLMs are capable of +visual comprehension and common-sense reasoning. This naturally led to the +question: How do VLMs respond when the image itself is inherently unreasonable? +To this end, we present IllusionVQA: a diverse dataset of challenging optical +illusions and hard-to-interpret scenes to test the capability of VLMs in two +distinct multiple-choice VQA tasks - comprehension and soft localization. +GPT4V, the best-performing VLM, achieves 62.99% accuracy (4-shot) on the +comprehension task and 49.7% on the localization task (4-shot and +Chain-of-Thought). Human evaluation reveals that humans achieve 91.03% and 100% +accuracy in comprehension and localization. We discover that In-Context +Learning (ICL) and Chain-of-Thought reasoning substantially degrade the +performance of GeminiPro on the localization task. Tangentially, we discover a +potential weakness in the ICL capabilities of VLMs: they fail to locate optical +illusions even when the correct answer is in the context window as a few-shot +example. + +
+
+
+
+
+ + ♻ ☆ Synthesize, Diagnose, and Optimize: Towards Fine-Grained Vision-Language + Understanding CVPR 2024 + + +
+ Vision language models (VLM) have demonstrated remarkable performance across +various downstream tasks. However, understanding fine-grained visual-linguistic +concepts, such as attributes and inter-object relationships, remains a +significant challenge. While several benchmarks aim to evaluate VLMs in finer +granularity, their primary focus remains on the linguistic aspect, neglecting +the visual dimension. Here, we highlight the importance of evaluating VLMs from +both a textual and visual perspective. We introduce a progressive pipeline to +synthesize images that vary in a specific attribute while ensuring consistency +in all other aspects. Utilizing this data engine, we carefully design a +benchmark, SPEC, to diagnose the comprehension of object size, position, +existence, and count. Subsequently, we conduct a thorough evaluation of four +leading VLMs on SPEC. Surprisingly, their performance is close to random guess, +revealing significant limitations. With this in mind, we propose a simple yet +effective approach to optimize VLMs in fine-grained understanding, achieving +significant improvements on SPEC without compromising the zero-shot +performance. Results on two additional fine-grained benchmarks also show +consistent improvements, further validating the transferability of our +approach. Code and data are available at https://github.com/wjpoom/SPEC. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ NICP: Neural ICP for 3D Human Registration at Scale + + +
+ Aligning a template to 3D human point clouds is a long-standing problem +crucial for tasks like animation, reconstruction, and enabling supervised +learning pipelines. Recent data-driven methods leverage predicted surface +correspondences; however, they are not robust to varied poses, identities, or +noise. In contrast, industrial solutions often rely on expensive manual +annotations or multi-view capturing systems. Recently, neural fields have shown +promising results. Still, their purely data-driven and extrinsic nature does +not incorporate any guidance toward the target surface, often resulting in a +trivial misalignment of the template registration. Currently, no method can be +considered the standard for 3D Human registration, limiting the scalability of +downstream applications. In this work, we propose NSR, a pipeline that, for the +first time, generalizes and scales across thousands of shapes and more than ten +different data sources. Our essential contribution is NICP, an ICP-style +self-supervised task tailored to neural fields. NICP takes a few seconds, is +self-supervised, and works out of the box on pre-trained neural fields. We +combine it with a localized Neural Field trained on a large MoCap dataset. NSR +achieves the state of the art over public benchmarks, and the release of its +code and checkpoints will provide the community with a powerful tool useful for +many downstream tasks like dataset alignments, cleaning, or asset animation. + +
+
+
+
+
+ + ♻ ☆ As-Plausible-As-Possible: Plausibility-Aware Mesh Deformation Using 2D + Diffusion Priors + + +
+ We present As-Plausible-as-Possible (APAP) mesh deformation technique that +leverages 2D diffusion priors to preserve the plausibility of a mesh under +user-controlled deformation. Our framework uses per-face Jacobians to represent +mesh deformations, where mesh vertex coordinates are computed via a +differentiable Poisson Solve. The deformed mesh is rendered, and the resulting +2D image is used in the Score Distillation Sampling (SDS) process, which +enables extracting meaningful plausibility priors from a pretrained 2D +diffusion model. To better preserve the identity of the edited mesh, we +fine-tune our 2D diffusion model with LoRA. Gradients extracted by SDS and a +user-prescribed handle displacement are then backpropagated to the per-face +Jacobians, and we use iterative gradient descent to compute the final +deformation that balances between the user edit and the output plausibility. We +evaluate our method with 2D and 3D meshes and demonstrate qualitative and +quantitative improvements when using plausibility priors over +geometry-preservation or distortion-minimization priors used by previous +techniques. Our project page is at: https://as-plausible-aspossible.github.io/ + +
+
+ comment: Project page: https://as-plausible-as-possible.github.io/ +
+
+
+
+
+ + ♻ ☆ Fun with Flags: Robust Principal Directions via Flag Manifolds + + +
+ Principal component analysis (PCA), along with its extensions to manifolds
+and outlier-contaminated data, has been indispensable in computer vision and
+machine learning. In this work, we present a unifying formalism for PCA and its
+variants, and introduce a framework based on the flags of linear subspaces,
+i.e., a hierarchy of nested linear subspaces of increasing dimension, which not
+only allows for a common implementation but also yields novel variants not
+explored previously. We begin by generalizing traditional PCA methods that
+either maximize variance or minimize reconstruction error. We expand these
+interpretations to develop a wide array of new dimensionality reduction
+algorithms by accounting for outliers and the data manifold. To devise a common
+computational approach, we recast robust and dual forms of PCA as optimization
+problems on flag manifolds. We then integrate tangent space approximations of
+principal geodesic analysis (tangent-PCA) into this flag-based framework,
+creating novel robust and dual geodesic PCA variations. The remarkable
+flexibility offered by the 'flagification' introduced here enables even more
+algorithmic variants identified by specific flag types. Last but not least, we
+propose an effective convergent solver for these flag-formulations employing
+the Stiefel manifold. Our empirical results on both real-world and synthetic
+scenarios demonstrate the superiority of our novel algorithms, especially in
+terms of robustness to outliers on manifolds.
+
+
+
+
+
+ + ♻ ☆ Open Vocabulary Semantic Scene Sketch Understanding + + +
+ We study the underexplored but fundamental vision problem of machine
+understanding of abstract freehand scene sketches. We introduce a sketch
+encoder that results in a semantically aware feature space, which we evaluate
+by testing its performance on a semantic sketch segmentation task. To train our
+model we rely only on the availability of bitmap sketches with their brief
+captions and do not require any pixel-level annotations. To obtain
+generalization to a large set of sketches and categories, we build on a vision
+transformer encoder pretrained with the CLIP model. We freeze the text encoder
+and perform visual-prompt tuning of the visual encoder branch while introducing
+a set of critical modifications. First, we augment the classical key-query
+(k-q) self-attention blocks with value-value (v-v) self-attention blocks.
+Central to our model is a two-level hierarchical network design that enables
+efficient semantic disentanglement: the first level ensures holistic scene
+sketch encoding, and the second level focuses on individual categories. In the
+second level of the hierarchy, we then introduce cross-attention between the
+textual and visual branches. Our method outperforms the zero-shot CLIP pixel
+accuracy of segmentation results by 37 points, reaching an accuracy of $85.5\%$
+on the FS-COCO sketch dataset. Finally, we conduct a user study that allows us
+to identify further improvements needed over our method to reconcile machine
+and human understanding of scene sketches.
+
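+ The value-value (v-v) self-attention mentioned above can be sketched as
+follows; the single-head form and the projection initialization are
+illustrative assumptions, not the paper's exact implementation.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def vv_self_attention(x: torch.Tensor, w_v: torch.Tensor) -> torch.Tensor:
+    # x: (N, D) token features; w_v: (D, D) value projection.
+    v = x @ w_v                                              # (N, D) values
+    # Attention weights come from the values themselves, not separate keys/queries.
+    attn = F.softmax(v @ v.t() / v.shape[-1] ** 0.5, dim=-1)
+    return attn @ v
+
+tokens = torch.randn(197, 768)
+out = vv_self_attention(tokens, torch.randn(768, 768) / 768 ** 0.5)
+```
+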
+
+
+
+
+ + ♻ ☆ Correlation-guided Query-Dependency Calibration in Video Representation + Learning for Temporal Grounding + + +
+ Video temporal grounding aims to identify specific moments or highlights in
+a video that correspond to textual descriptions. Typical approaches to temporal
+grounding treat all video clips equally during the encoding process regardless
+of their semantic relevance to the text query. Therefore, we propose the
+Correlation-Guided DEtection TRansformer (CG-DETR), which provides clues for
+query-associated video clips within the cross-modal attention. First, we
+design an adaptive cross-attention with dummy tokens. Dummy tokens conditioned
+by the text query take portions of the attention weights, preventing irrelevant
+video clips from being represented by the text query. Yet, not all words
+equally inherit the text query's correlation to video clips. Thus, we further
+guide the cross-attention map by inferring the fine-grained correlation between
+video clips and words. We enable this by learning a joint embedding space for
+high-level concepts, i.e., moment and sentence level, and inferring the
+clip-word correlation. Lastly, we exploit the moment-specific characteristics
+and combine them with the context of each video to form a moment-adaptive
+saliency detector. By exploiting the degree of text engagement in each video
+clip, it precisely measures the highlightness of each clip. CG-DETR achieves
+state-of-the-art results on various benchmarks for temporal grounding.
+
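+ A simplified, single-head sketch of cross-attention with dummy tokens is
+shown below; the shapes, the number of dummy tokens, and the returned "dummy
+share" diagnostic are assumptions for illustration, not the paper's
+implementation.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def cross_attention_with_dummies(clip_q, word_kv, dummy_kv):
+    # clip_q: (Nc, D) video-clip queries; word_kv: (Nw, D); dummy_kv: (Nd, D).
+    kv = torch.cat([word_kv, dummy_kv], dim=0)               # (Nw + Nd, D)
+    attn = F.softmax(clip_q @ kv.t() / clip_q.shape[-1] ** 0.5, dim=-1)
+    out = attn @ kv
+    # Attention mass placed on dummy tokens: a proxy for "not text-relevant".
+    dummy_share = attn[:, word_kv.shape[0]:].sum(dim=-1)
+    return out, dummy_share
+
+clips, words = torch.randn(75, 256), torch.randn(12, 256)
+dummies = torch.nn.Parameter(torch.randn(3, 256))            # learnable dummy tokens
+out, share = cross_attention_with_dummies(clips, words, dummies)
+```
+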
+
+ comment: 34 pages, 16 figures, 13 tables, Code is available at + https://github.com/wjun0830/CGDETR +
+
+
+
+
+ + ♻ ☆ Geometrically-driven Aggregation for Zero-shot 3D Point Cloud + Understanding + + +
+ Zero-shot 3D point cloud understanding can be achieved via 2D Vision-Language +Models (VLMs). Existing strategies directly map Vision-Language Models from 2D +pixels of rendered or captured views to 3D points, overlooking the inherent and +expressible point cloud geometric structure. Geometrically similar or close +regions can be exploited for bolstering point cloud understanding as they are +likely to share semantic information. To this end, we introduce the first +training-free aggregation technique that leverages the point cloud's 3D +geometric structure to improve the quality of the transferred Vision-Language +Models. Our approach operates iteratively, performing local-to-global +aggregation based on geometric and semantic point-level reasoning. We benchmark +our approach on three downstream tasks, including classification, part +segmentation, and semantic segmentation, with a variety of datasets +representing both synthetic/real-world, and indoor/outdoor scenarios. Our +approach achieves new state-of-the-art results in all benchmarks. We will +release the source code publicly. + +
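+ One training-free local aggregation step could look like the sketch below,
+where each point's transferred 2D-VLM feature is re-estimated from its
+geometric neighbours weighted by feature similarity; the neighbourhood size
+and the weighting are illustrative choices rather than the paper's exact
+scheme.
+
+```python
+import torch
+
+def local_aggregate(xyz: torch.Tensor, feats: torch.Tensor, k: int = 16) -> torch.Tensor:
+    # xyz: (N, 3) point coordinates; feats: (N, D) per-point VLM features.
+    dists = torch.cdist(xyz, xyz)                            # (N, N) pairwise distances
+    knn = dists.topk(k, largest=False).indices               # (N, k) geometric neighbours
+    neigh = feats[knn]                                       # (N, k, D)
+    sim = torch.einsum("nd,nkd->nk", feats, neigh)           # semantic affinity
+    w = torch.softmax(sim, dim=-1).unsqueeze(-1)             # (N, k, 1)
+    return (w * neigh).sum(dim=1)                            # aggregated features
+
+agg = local_aggregate(torch.randn(1024, 3), torch.randn(1024, 64))
+```
+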
+
+ comment: Zero-shot, point cloud, 2D Vision-Language Models, geometric + structure, training-free +
+
+
+
+
+ + ♻ ☆ SAGE: Bridging Semantic and Actionable Parts for GEneralizable + Manipulation of Articulated Objects + + +
+ To interact with daily-life articulated objects of diverse structures and +functionalities, understanding the object parts plays a central role in both +user instruction comprehension and task execution. However, the possible +discordance between the semantic meaning and physics functionalities of the +parts poses a challenge for designing a general system. To address this +problem, we propose SAGE, a novel framework that bridges semantic and +actionable parts of articulated objects to achieve generalizable manipulation +under natural language instructions. More concretely, given an articulated +object, we first observe all the semantic parts on it, conditioned on which an +instruction interpreter proposes possible action programs that concretize the +natural language instruction. Then, a part-grounding module maps the semantic +parts into so-called Generalizable Actionable Parts (GAParts), which inherently +carry information about part motion. End-effector trajectories are predicted on +the GAParts, which, together with the action program, form an executable +policy. Additionally, an interactive feedback module is incorporated to respond +to failures, which closes the loop and increases the robustness of the overall +framework. Key to the success of our framework is the joint proposal and +knowledge fusion between a large vision-language model (VLM) and a small +domain-specific model for both context comprehension and part perception, with +the former providing general intuitions and the latter serving as expert facts. +Both simulation and real-robot experiments show our effectiveness in handling a +large variety of articulated objects with diverse language-instructed goals. + +
+
+
+
+
+ + ♻ ☆ vid-TLDR: Training Free Token merging for Light-weight Video Transformer CVPR + + +
+ Video Transformers have become the prevalent solution for various video +downstream tasks with superior expressive power and flexibility. However, these +video transformers suffer from heavy computational costs induced by the massive +number of tokens across the entire video frames, which has been the major +barrier to training the model. Further, the patches irrelevant to the main +contents, e.g., backgrounds, degrade the generalization performance of models. +To tackle these issues, we propose training free token merging for lightweight +video Transformer (vid-TLDR) that aims to enhance the efficiency of video +Transformers by merging the background tokens without additional training. For +vid-TLDR, we introduce a novel approach to capture the salient regions in +videos only with the attention map. Further, we introduce the saliency-aware +token merging strategy by dropping the background tokens and sharpening the +object scores. Our experiments show that vid-TLDR significantly mitigates the +computational complexity of video Transformers while achieving competitive +performance compared to the base model without vid-TLDR. Code is available at +https://github.com/mlvlab/vid-TLDR. + +
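+ A much-simplified sketch of attention-based saliency scoring is given below;
+it drops low-saliency tokens rather than merging them, so it only illustrates
+the scoring step, not vid-TLDR's actual merging strategy.
+
+```python
+import torch
+
+def drop_background_tokens(tokens: torch.Tensor, attn: torch.Tensor, keep: int):
+    # tokens: (N, D); attn: (N, N) attention map from a transformer block.
+    saliency = attn.mean(dim=0)                              # attention each token receives
+    idx = saliency.topk(keep).indices                        # most salient (foreground) tokens
+    return tokens[idx], idx
+
+tokens, attn = torch.randn(1568, 768), torch.rand(1568, 1568)
+kept, idx = drop_background_tokens(tokens, attn, keep=784)
+```
+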
+
+ comment: Conference on Computer Vision and Pattern Recognition (CVPR), 2024 +
+
+
+
+
+ + ♻ ☆ VRP-SAM: SAM with Visual Reference Prompt CVPR 2024 + + +
+ In this paper, we propose a novel Visual Reference Prompt (VRP) encoder that
+empowers the Segment Anything Model (SAM) to utilize annotated reference images
+as prompts for segmentation, creating the VRP-SAM model. In essence, VRP-SAM
+can utilize annotated reference images to comprehend specific objects and
+perform segmentation of specific objects in the target image. Note that the
+VRP encoder supports a variety of annotation formats for reference images,
+including \textbf{point}, \textbf{box}, \textbf{scribble}, and \textbf{mask}.
+VRP-SAM achieves a breakthrough within the SAM framework by extending its
+versatility and applicability while preserving SAM's inherent strengths, thus
+enhancing user-friendliness. To enhance the generalization ability of VRP-SAM,
+the VRP encoder adopts a meta-learning strategy. To validate the effectiveness
+of VRP-SAM, we conducted extensive empirical studies on the Pascal and COCO
+datasets. Remarkably, VRP-SAM achieved state-of-the-art performance in visual
+reference segmentation with minimal learnable parameters. Furthermore, VRP-SAM
+demonstrates strong generalization capabilities, allowing it to perform
+segmentation of unseen objects and enabling cross-domain segmentation. The
+source code and models will be available at
+\url{https://github.com/syp2ysy/VRP-SAM}
+
+
+ comment: Accepted by CVPR 2024; The camera-ready version +
+
+
+
+
+ + ♻ ☆ Video Self-Stitching Graph Network for Temporal Action Localization + + +
+ Temporal action localization (TAL) in videos is a challenging task,
+especially due to the large variation in action temporal scales. Short actions
+usually occupy a major proportion in the datasets, but tend to have the lowest
+performance. In this paper, we confront the challenge of short actions and
+propose a multi-level cross-scale solution dubbed video self-stitching graph
+network (VSGN). We have two key components in VSGN: video self-stitching (VSS)
+and cross-scale graph pyramid network (xGPN). In VSS, we focus on a short
+period of a video and magnify it along the temporal dimension to obtain a
+larger scale. We stitch the original clip and its magnified counterpart in one
+input sequence to take advantage of the complementary properties of both
+scales. The xGPN component further exploits the cross-scale correlations by a
+pyramid of cross-scale graph networks, each containing a hybrid module to
+aggregate features from across scales as well as within the same scale. Our
+VSGN not only enhances the feature representations, but also generates more
+positive anchors for short actions and more short training samples. Experiments
+demonstrate that VSGN clearly improves the localization performance of short
+actions while also achieving state-of-the-art overall performance on
+THUMOS-14 and ActivityNet-v1.3.
+
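+ The self-stitching step can be sketched on clip-level features as below; the
+interpolation mode, scale factor, and feature shapes are illustrative
+assumptions rather than the paper's exact construction.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def self_stitch(features: torch.Tensor, start: int, end: int, scale: int = 2):
+    # features: (C, T) clip-level feature sequence.
+    short = features[:, start:end]                           # focus on a short span
+    magnified = F.interpolate(short.unsqueeze(0), scale_factor=scale,
+                              mode="linear", align_corners=False).squeeze(0)
+    # Stitch original and temporally magnified versions into one input sequence.
+    return torch.cat([features, magnified], dim=1)
+
+stitched = self_stitch(torch.randn(256, 100), start=20, end=30)  # (256, 120)
+```
+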
+
+
+
+
+ + ♻ ☆ Optimizing Sparse Convolution on GPUs with CUDA for 3D Point Cloud + Processing in Embedded Systems + + +
+ In recent years, there has been a significant increase in the utilization of +deep learning methods, particularly convolutional neural networks (CNNs), which +have emerged as the dominant approach in various domains that involve +structured grid data, such as picture analysis and processing. Nevertheless, +the exponential growth in the utilization of LiDAR and 3D sensors across many +domains has resulted in an increased need for the analysis of 3D point clouds. +The utilization of 3D point clouds is crucial in various applications, +including object recognition and segmentation, as they offer a spatial +depiction of things within a three-dimensional environment. In contrast to +photos, point clouds exhibit sparsity and lack a regular grid, hence posing +distinct processing and computational issues. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ Aligning Logits Generatively for Principled Black-Box Knowledge + Distillation CVPR 2024 + + +
+ Black-Box Knowledge Distillation (B2KD) is a formulated problem for +cloud-to-edge model compression with invisible data and models hosted on the +server. B2KD faces challenges such as limited Internet exchange and edge-cloud +disparity of data distributions. In this paper, we formalize a two-step +workflow consisting of deprivatization and distillation, and theoretically +provide a new optimization direction from logits to cell boundary different +from direct logits alignment. With its guidance, we propose a new method +Mapping-Emulation KD (MEKD) that distills a black-box cumbersome model into a +lightweight one. Our method does not differentiate between treating soft or +hard responses, and consists of: 1) deprivatization: emulating the inverse +mapping of the teacher function with a generator, and 2) distillation: aligning +low-dimensional logits of the teacher and student models by reducing the +distance of high-dimensional image points. For different teacher-student pairs, +our method yields inspiring distillation performance on various benchmarks, and +outperforms the previous state-of-the-art approaches. + +
+
+ comment: To appear at CVPR 2024; significantly rewritten with extra + experiments since the preliminary report +
+
+
+
+
+ + ♻ ☆ MMA-Diffusion: MultiModal Attack on Diffusion Models CVPR 2024 + + +
+ In recent years, Text-to-Image (T2I) models have seen remarkable +advancements, gaining widespread adoption. However, this progress has +inadvertently opened avenues for potential misuse, particularly in generating +inappropriate or Not-Safe-For-Work (NSFW) content. Our work introduces +MMA-Diffusion, a framework that presents a significant and realistic threat to +the security of T2I models by effectively circumventing current defensive +measures in both open-source models and commercial online services. Unlike +previous approaches, MMA-Diffusion leverages both textual and visual modalities +to bypass safeguards like prompt filters and post-hoc safety checkers, thus +exposing and highlighting the vulnerabilities in existing defense mechanisms. + +
+
+ comment: CVPR 2024. Our codes and benchmarks are available at + https://github.com/cure-lab/MMA-Diffusion +
+
+
+
+
+ + ♻ ☆ Dr$^2$Net: Dynamic Reversible Dual-Residual Networks for + Memory-Efficient Finetuning + + +
+ Large pretrained models are increasingly crucial in modern computer vision +tasks. These models are typically used in downstream tasks by end-to-end +finetuning, which is highly memory-intensive for tasks with high-resolution +data, e.g., video understanding, small object detection, and point cloud +analysis. In this paper, we propose Dynamic Reversible Dual-Residual Networks, +or Dr$^2$Net, a novel family of network architectures that acts as a surrogate +network to finetune a pretrained model with substantially reduced memory +consumption. Dr$^2$Net contains two types of residual connections, one +maintaining the residual structure in the pretrained models, and the other +making the network reversible. Due to its reversibility, intermediate +activations, which can be reconstructed from output, are cleared from memory +during training. We use two coefficients on either type of residual connections +respectively, and introduce a dynamic training strategy that seamlessly +transitions the pretrained model to a reversible network with much higher +numerical precision. We evaluate Dr$^2$Net on various pretrained models and +various tasks, and show that it can reach comparable performance to +conventional finetuning but with significantly less memory usage. + +
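+ A hedged sketch of a reversible dual-residual block is given below: two
+coupled residual streams whose inputs can be recomputed exactly from the
+outputs, with two coefficients weighting the residual paths as the abstract
+describes; the layer choices and coefficient values are illustrative, not
+Dr$^2$Net's actual configuration.
+
+```python
+import torch
+import torch.nn as nn
+
+class ReversibleBlock(nn.Module):
+    def __init__(self, dim: int, alpha: float = 1.0, beta: float = 0.1):
+        super().__init__()
+        self.f = nn.Sequential(nn.Linear(dim, dim), nn.GELU(), nn.Linear(dim, dim))
+        self.g = nn.Sequential(nn.Linear(dim, dim), nn.GELU(), nn.Linear(dim, dim))
+        self.alpha, self.beta = alpha, beta
+
+    def forward(self, x1, x2):
+        y1 = self.alpha * x1 + self.beta * self.f(x2)
+        y2 = self.alpha * x2 + self.beta * self.g(y1)
+        return y1, y2
+
+    def inverse(self, y1, y2):
+        # Recover the inputs exactly from the outputs, so activations
+        # need not be stored during training.
+        x2 = (y2 - self.beta * self.g(y1)) / self.alpha
+        x1 = (y1 - self.beta * self.f(x2)) / self.alpha
+        return x1, x2
+
+blk = ReversibleBlock(64)
+x1, x2 = torch.randn(4, 64), torch.randn(4, 64)
+y1, y2 = blk(x1, x2)
+r1, r2 = blk.inverse(y1, y2)   # r1 ~= x1, r2 ~= x2 up to float error
+```
+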
+
+
+
+
+ + ♻ ☆ I'M HOI: Inertia-aware Monocular Capture of 3D Human-Object Interactions CVPR 2024 + + +
+ We are living in a world surrounded by diverse and "smart" devices with rich
+modalities of sensing ability. Conveniently capturing the interactions between
+humans and these objects, however, remains out of reach. In this paper, we
+present I'm-HOI, a monocular scheme to faithfully capture the 3D motions of
+both the human and the object in a novel setting: using a minimal sensor setup
+of a single RGB camera and an object-mounted Inertial Measurement Unit (IMU).
+It combines general motion inference and category-aware refinement. For the
+former, we introduce a holistic human-object tracking method to fuse the IMU
+signals and the RGB stream and progressively recover the human motions and
+subsequently the companion object motions. For the latter, we tailor a
+category-aware motion diffusion model, which is conditioned on both the raw IMU
+observations and the results from the previous stage under an
+over-parameterization representation. It significantly refines the initial
+results and generates vivid body, hand, and object motions. Moreover, we
+contribute a large dataset with ground truth human and object motions, dense
+RGB inputs, and rich object-mounted IMU measurements. Extensive experiments
+demonstrate the effectiveness of I'm-HOI under a hybrid capture setting. Our
+dataset and code will be released to the community.
+
+
+ comment: Accepted to CVPR 2024. Project page: + https://afterjourney00.github.io/IM-HOI.github.io/ +
+
+
+
+
+ + ♻ ☆ Initialization Matters for Adversarial Transfer Learning CVPR 2024 + + +
+ With the prevalence of the Pretraining-Finetuning paradigm in transfer +learning, the robustness of downstream tasks has become a critical concern. In +this work, we delve into adversarial robustness in transfer learning and reveal +the critical role of initialization, including both the pretrained model and +the linear head. First, we discover the necessity of an adversarially robust +pretrained model. Specifically, we reveal that with a standard pretrained +model, Parameter-Efficient Finetuning (PEFT) methods either fail to be +adversarially robust or continue to exhibit significantly degraded adversarial +robustness on downstream tasks, even with adversarial training during +finetuning. Leveraging a robust pretrained model, surprisingly, we observe that +a simple linear probing can outperform full finetuning and other PEFT methods +with random initialization on certain datasets. We further identify that linear +probing excels in preserving robustness from the robust pretraining. Based on +this, we propose Robust Linear Initialization (RoLI) for adversarial +finetuning, which initializes the linear head with the weights obtained by +adversarial linear probing to maximally inherit the robustness from +pretraining. Across five different image classification datasets, we +demonstrate the effectiveness of RoLI and achieve new state-of-the-art results. +Our code is available at \url{https://github.com/DongXzz/RoLI}. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Latent Code Augmentation Based on Stable Diffusion for Data-free + Substitute Attacks + + +
+ Since the training data of the target model is not available in the black-box +substitute attack, most recent schemes utilize GANs to generate data for +training the substitute model. However, these GANs-based schemes suffer from +low training efficiency as the generator needs to be retrained for each target +model during the substitute training process, as well as low generation +quality. To overcome these limitations, we consider utilizing the diffusion +model to generate data, and propose a novel data-free substitute attack scheme +based on the Stable Diffusion (SD) to improve the efficiency and accuracy of +substitute training. Despite the data generated by the SD exhibiting high +quality, it presents a different distribution of domains and a large variation +of positive and negative samples for the target model. For this problem, we +propose Latent Code Augmentation (LCA) to facilitate SD in generating data that +aligns with the data distribution of the target model. Specifically, we augment +the latent codes of the inferred member data with LCA and use them as guidance +for SD. With the guidance of LCA, the data generated by the SD not only meets +the discriminative criteria of the target model but also exhibits high +diversity. By utilizing this data, it is possible to train the substitute model +that closely resembles the target model more efficiently. Extensive experiments +demonstrate that our LCA achieves higher attack success rates and requires +fewer query budgets compared to GANs-based schemes for different target models. +Our codes are available at \url{https://github.com/LzhMeng/LCA}. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Can I Trust Your Answer? Visually Grounded Video Question Answering CVPR'24 + + +
+ We study visually grounded VideoQA in response to the emerging trends of +utilizing pretraining techniques for video-language understanding. +Specifically, by forcing vision-language models (VLMs) to answer questions and +simultaneously provide visual evidence, we seek to ascertain the extent to +which the predictions of such techniques are genuinely anchored in relevant +video content, versus spurious correlations from language or irrelevant visual +context. Towards this, we construct NExT-GQA -- an extension of NExT-QA with +10.5$K$ temporal grounding (or location) labels tied to the original QA pairs. +With NExT-GQA, we scrutinize a series of state-of-the-art VLMs. Through +post-hoc attention analysis, we find that these models are extremely weak in +substantiating the answers despite their strong QA performance. This exposes +the limitation of current VLMs in making reliable predictions. As a remedy, we +further explore and propose a grounded-QA method via Gaussian mask optimization +and cross-modal learning. Experiments with different backbones demonstrate that +this grounding mechanism improves both grounding and QA. With these efforts, we +aim to push towards trustworthy VLMs in VQA systems. Our dataset and code are +available at https://github.com/doc-doc/NExT-GQA. + +
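+ A minimal sketch of a learnable Gaussian temporal mask for grounding is shown
+below; the parameterization over normalized time and the clamping are
+illustrative assumptions, not the paper's exact formulation.
+
+```python
+import torch
+
+def gaussian_mask(num_frames: int, center: torch.Tensor, width: torch.Tensor):
+    # Soft weighting over frames, differentiable w.r.t. center and width.
+    t = torch.linspace(0, 1, num_frames)
+    return torch.exp(-0.5 * ((t - center) / width.clamp_min(1e-3)) ** 2)
+
+center = torch.tensor(0.5, requires_grad=True)   # normalized moment center
+width = torch.tensor(0.2, requires_grad=True)    # normalized moment width
+weights = gaussian_mask(32, center, width)       # (32,) frame weights in [0, 1]
+```
+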
+
+ comment: Accepted to CVPR'24. (Compared with preprint version, we mainly + improve the presentation, discuss more related works, and extend experiments + in Appendix.) +
+
+
+
+
+ + ♻ ☆ Tracing Hyperparameter Dependencies for Model Parsing via Learnable + Graph Pooling Network + + +
+ Model parsing defines the research task of predicting hyperparameters of a
+generative model (GM), given a generated image as input. Since a diverse set of
+hyperparameters is jointly employed by the generative model, and dependencies
+often exist among them, it is crucial to learn these hyperparameter
+dependencies for improved model parsing performance. To explore such
+important dependencies, we propose a novel model parsing method called
+Learnable Graph Pooling Network (LGPN). Specifically, we transform model
+parsing into a graph node classification task, using graph nodes and edges to
+represent hyperparameters and their dependencies, respectively. Furthermore,
+LGPN incorporates a learnable pooling-unpooling mechanism tailored to model
+parsing, which adaptively learns hyperparameter dependencies of GMs used to
+generate the input image. We also extend our proposed method to CNN-generated
+image detection and coordinated attack detection. Empirically, we achieve
+state-of-the-art results in model parsing and its extended applications,
+showing the effectiveness of our method. Our source code is available.
+
+
+ comment: 24 pages, 15 figures, 17 tables +
+
+
+
+
+ + ♻ ☆ Finding needles in a haystack: A Black-Box Approach to Invisible + Watermark Detection + + +
+ In this paper, we propose WaterMark Detection (WMD), the first invisible +watermark detection method under a black-box and annotation-free setting. WMD +is capable of detecting arbitrary watermarks within a given reference dataset +using a clean non-watermarked dataset as a reference, without relying on +specific decoding methods or prior knowledge of the watermarking techniques. We +develop WMD using foundations of offset learning, where a clean non-watermarked +dataset enables us to isolate the influence of only watermarked samples in the +reference dataset. Our comprehensive evaluations demonstrate the effectiveness +of WMD, significantly outperforming naive detection methods, which only yield +AUC scores around 0.5. In contrast, WMD consistently achieves impressive +detection AUC scores, surpassing 0.9 in most single-watermark datasets and +exceeding 0.7 in more challenging multi-watermark scenarios across diverse +datasets and watermarking methods. As invisible watermarks become increasingly +prevalent, while specific decoding techniques remain undisclosed, our approach +provides a versatile solution and establishes a path toward increasing +accountability, transparency, and trust in our digital visual content. + +
+
+
+
+
+ + ♻ ☆ NTO3D: Neural Target Object 3D Reconstruction with Segment Anything CVPR24 + + +
+ Neural 3D reconstruction from multi-view images has recently attracted
+increasing attention from the community. Existing methods normally learn a
+neural field for the whole scene, while it is still under-explored how to
+reconstruct a target object indicated by users. Considering that the Segment
+Anything Model (SAM) has shown effectiveness in segmenting any 2D images, in
+this paper, we propose NTO3D, a novel high-quality Neural Target Object 3D
+reconstruction method, which leverages the benefits of both the neural field
+and SAM. We first propose a novel strategy to lift the multi-view 2D
+segmentation masks of SAM into a unified 3D occupancy field. The 3D occupancy
+field is then projected into 2D space and generates new prompts for SAM. This
+process iterates until convergence, separating the target object from the
+scene. We then lift the 2D features of the SAM encoder into a 3D feature field
+in order to improve the reconstruction quality of the target object. NTO3D
+lifts the 2D masks and features of SAM into the 3D neural field for
+high-quality neural target object 3D reconstruction. We conduct detailed
+experiments on several benchmark datasets to demonstrate the advantages of our
+method. The code will be available at: https://github.com/ucwxb/NTO3D.
+
+
+ comment: accepted by CVPR24 +
+
+
+
+
+ + ♻ ☆ Consistency Trajectory Models: Learning Probability Flow ODE Trajectory + of Diffusion + + +
+ Consistency Models (CM) (Song et al., 2023) accelerate score-based diffusion +model sampling at the cost of sample quality but lack a natural way to +trade-off quality for speed. To address this limitation, we propose Consistency +Trajectory Model (CTM), a generalization encompassing CM and score-based models +as special cases. CTM trains a single neural network that can -- in a single +forward pass -- output scores (i.e., gradients of log-density) and enables +unrestricted traversal between any initial and final time along the Probability +Flow Ordinary Differential Equation (ODE) in a diffusion process. CTM enables +the efficient combination of adversarial training and denoising score matching +loss to enhance performance and achieves new state-of-the-art FIDs for +single-step diffusion model sampling on CIFAR-10 (FID 1.73) and ImageNet at +64x64 resolution (FID 1.92). CTM also enables a new family of sampling schemes, +both deterministic and stochastic, involving long jumps along the ODE solution +trajectories. It consistently improves sample quality as computational budgets +increase, avoiding the degradation seen in CM. Furthermore, unlike CM, CTM's +access to the score function can streamline the adoption of established +controllable/conditional generation methods from the diffusion community. This +access also enables the computation of likelihood. The code is available at +https://github.com/sony/ctm. + +
+
+ comment: International Conference on Learning Representations +
+
+
+
+
+ + ♻ ☆ Egocentric Scene-aware Human Trajectory Prediction + + +
+ Wearable collaborative robots stand to assist human wearers who need fall +prevention assistance or wear exoskeletons. Such a robot needs to be able to +predict the ego motion of the wearer based on egocentric vision and the +surrounding scene. In this work, we leveraged body-mounted cameras and sensors +to anticipate the trajectory of human wearers through complex surroundings. To +facilitate research in ego-motion prediction, we have collected a comprehensive +walking scene navigation dataset centered on the user's perspective. We present +a method to predict human motion conditioning on the surrounding static scene. +Our method leverages a diffusion model to produce a distribution of potential +future trajectories, taking into account the user's observation of the +environment. We introduce a compact representation to encode the user's visual +memory of the surroundings, as well as an efficient sample-generating technique +to speed up real-time inference of a diffusion model. We ablate our model and +compare it to baselines, and results show that our model outperforms existing +methods on key metrics of collision avoidance and trajectory mode coverage. + +
+
+ comment: 14 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Passive Snapshot Coded Aperture Dual-Pixel RGB-D Imaging + + +
+ Passive, compact, single-shot 3D sensing is useful in many application areas +such as microscopy, medical imaging, surgical navigation, and autonomous +driving where form factor, time, and power constraints can exist. Obtaining +RGB-D scene information over a short imaging distance, in an ultra-compact form +factor, and in a passive, snapshot manner is challenging. Dual-pixel (DP) +sensors are a potential solution to achieve the same. DP sensors collect light +rays from two different halves of the lens in two interleaved pixel arrays, +thus capturing two slightly different views of the scene, like a stereo camera +system. However, imaging with a DP sensor implies that the defocus blur size is +directly proportional to the disparity seen between the views. This creates a +trade-off between disparity estimation vs. deblurring accuracy. To improve this +trade-off effect, we propose CADS (Coded Aperture Dual-Pixel Sensing), in which +we use a coded aperture in the imaging lens along with a DP sensor. In our +approach, we jointly learn an optimal coded pattern and the reconstruction +algorithm in an end-to-end optimization setting. Our resulting CADS imaging +system demonstrates improvement of >1.5dB PSNR in all-in-focus (AIF) estimates +and 5-6% in depth estimation quality over naive DP sensing for a wide range of +aperture settings. Furthermore, we build the proposed CADS prototypes for DSLR +photography settings and in an endoscope and a dermoscope form factor. Our +novel coded dual-pixel sensing approach demonstrates accurate RGB-D +reconstruction results in simulations and real-world experiments in a passive, +snapshot, and compact manner. + +
+
+
+
+
+ + ♻ ☆ SKDF: A Simple Knowledge Distillation Framework for Distilling + Open-Vocabulary Knowledge to Open-world Object Detector + + +
+ In this paper, we attempt to specialize the VLM model for OWOD tasks by +distilling its open-world knowledge into a language-agnostic detector. +Surprisingly, we observe that the combination of a simple \textbf{knowledge +distillation} approach and the automatic pseudo-labeling mechanism in OWOD can +achieve better performance for unknown object detection, even with a small +amount of data. Unfortunately, knowledge distillation for unknown objects +severely affects the learning of detectors with conventional structures for +known objects, leading to catastrophic forgetting. To alleviate these problems, +we propose the \textbf{down-weight loss function} for knowledge distillation +from vision-language to single vision modality. Meanwhile, we propose the +\textbf{cascade decouple decoding structure} that decouples the learning of +localization and recognition to reduce the impact of category interactions of +known and unknown objects on the localization learning process. Ablation +experiments demonstrate that both of them are effective in mitigating the +impact of open-world knowledge distillation on the learning of known objects. +Additionally, to alleviate the current lack of comprehensive benchmarks for +evaluating the ability of the open-world detector to detect unknown objects in +the open world, we propose two benchmarks, which we name +"\textbf{StandardSet}$\heartsuit$" and "\textbf{IntensiveSet}$\spadesuit$" +respectively, based on the complexity of their testing scenarios. Comprehensive +experiments performed on OWOD, MS-COCO, and our proposed benchmarks demonstrate +the effectiveness of our methods. The code and proposed dataset are available +at \url{https://github.com/xiaomabufei/SKDF}. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2303.11623 +
+
+
+
+
+ + ♻ ☆ Free3D: Consistent Novel View Synthesis without 3D Representation + + +
+ We introduce Free3D, a simple accurate method for monocular open-set novel +view synthesis (NVS). Similar to Zero-1-to-3, we start from a pre-trained 2D +image generator for generalization, and fine-tune it for NVS. Compared to other +works that took a similar approach, we obtain significant improvements without +resorting to an explicit 3D representation, which is slow and memory-consuming, +and without training an additional network for 3D reconstruction. Our key +contribution is to improve the way the target camera pose is encoded in the +network, which we do by introducing a new ray conditioning normalization (RCN) +layer. The latter injects pose information in the underlying 2D image generator +by telling each pixel its viewing direction. We further improve multi-view +consistency by using light-weight multi-view attention layers and by sharing +generation noise between the different views. We train Free3D on the Objaverse +dataset and demonstrate excellent generalization to new categories in new +datasets, including OmniObject3D and GSO. The project page is available at +https://chuanxiaz.com/free3d/. + +
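+ The per-pixel viewing-direction signal that a ray conditioning normalization
+(RCN) layer could inject can be sketched as below, assuming a pinhole
+intrinsics matrix K and a camera-to-world rotation R; how Free3D actually uses
+this signal to normalize the 2D generator's features is not reproduced here.
+
+```python
+import torch
+
+def pixel_ray_directions(K: torch.Tensor, R: torch.Tensor, h: int, w: int):
+    # Build the pixel grid at pixel centers.
+    ys, xs = torch.meshgrid(torch.arange(h, dtype=torch.float32),
+                            torch.arange(w, dtype=torch.float32), indexing="ij")
+    pix = torch.stack([xs + 0.5, ys + 0.5, torch.ones_like(xs)], dim=-1)  # (H, W, 3)
+    cam_dirs = pix @ torch.linalg.inv(K).t()                 # camera-space rays
+    world_dirs = cam_dirs @ R.t()                            # rotate to world space
+    return world_dirs / world_dirs.norm(dim=-1, keepdim=True)
+
+K = torch.tensor([[128.0, 0.0, 64.0], [0.0, 128.0, 64.0], [0.0, 0.0, 1.0]])
+rays = pixel_ray_directions(K, torch.eye(3), h=128, w=128)   # (128, 128, 3)
+```
+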
+
+ comment: webpage: https://chuanxiaz.com/free3d/, code: + https://github.com/lyndonzheng/Free3D +
+
+
+
+
+ + ♻ ☆ Dual-View Visual Contextualization for Web Navigation CVPR 2024 + + +
+ Automatic web navigation aims to build a web agent that can follow language +instructions to execute complex and diverse tasks on real-world websites. +Existing work primarily takes HTML documents as input, which define the +contents and action spaces (i.e., actionable elements and operations) of +webpages. Nevertheless, HTML documents may not provide a clear task-related +context for each element, making it hard to select the right (sequence of) +actions. In this paper, we propose to contextualize HTML elements through their +"dual views" in webpage screenshots: each HTML element has its corresponding +bounding box and visual content in the screenshot. We build upon the insight -- +web developers tend to arrange task-related elements nearby on webpages to +enhance user experiences -- and propose to contextualize each element with its +neighbor elements, using both textual and visual features. The resulting +representations of HTML elements are more informative for the agent to take +action. We validate our method on the recently released Mind2Web dataset, which +features diverse navigation domains and tasks on real-world websites. Our +method consistently outperforms the baseline in all the scenarios, including +cross-task, cross-website, and cross-domain ones. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ MoEController: Instruction-based Arbitrary Image Manipulation with + Mixture-of-Expert Controllers + + +
+ Diffusion-model-based text-guided image generation has recently made
+astounding progress, producing fascinating results in open-domain image
+manipulation tasks. Few models, however, currently have complete zero-shot
+capabilities for both global and local image editing due to the complexity and
+diversity of image manipulation tasks. In this work, we propose a method with
+mixture-of-expert (MOE) controllers to align the text-guided capacity of
+diffusion models with different kinds of human instructions, enabling our model
+to handle various open-domain image manipulation tasks with natural language
+instructions. First, we use large language models (ChatGPT) and conditional
+image synthesis models (ControlNet) to generate a large global image transfer
+dataset in addition to an instruction-based local image editing dataset. Then,
+using an MOE technique and task-specific adaptation training on a large-scale
+dataset, our conditional diffusion model can edit images both globally and
+locally. Extensive experiments demonstrate that our approach performs
+surprisingly well on various image manipulation tasks when dealing with
+open-domain images and arbitrary human instructions. Please refer to our
+project page: [https://oppo-mente-lab.github.io/moe_controller/]
+
+
+ comment: 6 pages,6 figures +
+
+
+
+
+ + ♻ ☆ Boosting Flow-based Generative Super-Resolution Models via Learned Prior CVPR2024 + + +
+ Flow-based super-resolution (SR) models have demonstrated astonishing +capabilities in generating high-quality images. However, these methods +encounter several challenges during image generation, such as grid artifacts, +exploding inverses, and suboptimal results due to a fixed sampling temperature. +To overcome these issues, this work introduces a conditional learned prior to +the inference phase of a flow-based SR model. This prior is a latent code +predicted by our proposed latent module conditioned on the low-resolution +image, which is then transformed by the flow model into an SR image. Our +framework is designed to seamlessly integrate with any contemporary flow-based +SR model without modifying its architecture or pre-trained weights. We evaluate +the effectiveness of our proposed framework through extensive experiments and +ablation analyses. The proposed framework successfully addresses all the +inherent issues in flow-based SR models and enhances their performance in +various SR scenarios. Our code is available at: +https://github.com/liyuantsao/FlowSR-LP + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ GaussianAvatar: Towards Realistic Human Avatar Modeling from a Single + Video via Animatable 3D Gaussians + + +
+ We present GaussianAvatar, an efficient approach to creating realistic human +avatars with dynamic 3D appearances from a single video. We start by +introducing animatable 3D Gaussians to explicitly represent humans in various +poses and clothing styles. Such an explicit and animatable representation can +fuse 3D appearances more efficiently and consistently from 2D observations. Our +representation is further augmented with dynamic properties to support +pose-dependent appearance modeling, where a dynamic appearance network along +with an optimizable feature tensor is designed to learn the +motion-to-appearance mapping. Moreover, by leveraging the differentiable motion +condition, our method enables a joint optimization of motions and appearances +during avatar modeling, which helps to tackle the long-standing issue of +inaccurate motion estimation in monocular settings. The efficacy of +GaussianAvatar is validated on both the public dataset and our collected +dataset, demonstrating its superior performances in terms of appearance quality +and rendering efficiency. + +
+
+ comment: Project Page: https://huliangxiao.github.io/GaussianAvatar +
+
+
+
+
+ + ♻ ☆ EMAGE: Towards Unified Holistic Co-Speech Gesture Generation via + Expressive Masked Audio Gesture Modeling CVPR + + +
+ We propose EMAGE, a framework to generate full-body human gestures from audio +and masked gestures, encompassing facial, local body, hands, and global +movements. To achieve this, we first introduce BEAT2 (BEAT-SMPLX-FLAME), a new +mesh-level holistic co-speech dataset. BEAT2 combines a MoShed SMPL-X body with +FLAME head parameters and further refines the modeling of head, neck, and +finger movements, offering a community-standardized, high-quality 3D motion +captured dataset. EMAGE leverages masked body gesture priors during training to +boost inference performance. It involves a Masked Audio Gesture Transformer, +facilitating joint training on audio-to-gesture generation and masked gesture +reconstruction to effectively encode audio and body gesture hints. Encoded body +hints from masked gestures are then separately employed to generate facial and +body movements. Moreover, EMAGE adaptively merges speech features from the +audio's rhythm and content and utilizes four compositional VQ-VAEs to enhance +the results' fidelity and diversity. Experiments demonstrate that EMAGE +generates holistic gestures with state-of-the-art performance and is flexible +in accepting predefined spatial-temporal gesture inputs, generating complete, +audio-synchronized results. Our code and dataset are available +https://pantomatrix.github.io/EMAGE/ + +
+
+ comment: Fix typos; Conflict of Interest Disclosure; CVPR Camera Ready; + Project Page: https://pantomatrix.github.io/EMAGE/ +
+
+
+
+
+ + ♻ ☆ A Review of Predictive and Contrastive Self-supervised Learning for + Medical Images + + +
+ Over the last decade, supervised deep learning on manually annotated big data
+has been progressing significantly on computer vision tasks. But the
+application of deep learning in medical image analysis has been limited by the
+scarcity of high-quality annotated medical imaging data. An emerging solution
+is self-supervised learning (SSL), among which contrastive SSL is the most
+successful approach to rivalling or outperforming supervised learning. This
+review investigates several state-of-the-art contrastive SSL algorithms
+originally developed for natural images, as well as their adaptations for
+medical images, and concludes by discussing recent advances, current
+limitations, and future directions in applying contrastive SSL in the medical
+domain.
+
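+ For readers unfamiliar with the objective underlying most contrastive SSL
+methods discussed in such reviews, a minimal InfoNCE-style loss looks like the
+sketch below; the temperature, batch size, and the one-directional form are
+simplifications rather than any specific method's recipe.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def info_nce(z1: torch.Tensor, z2: torch.Tensor, tau: float = 0.1) -> torch.Tensor:
+    # z1, z2: (B, D) embeddings of two augmented views of the same B images.
+    z1, z2 = F.normalize(z1, dim=-1), F.normalize(z2, dim=-1)
+    logits = z1 @ z2.t() / tau                               # (B, B) similarity matrix
+    targets = torch.arange(z1.shape[0])                      # positives on the diagonal
+    return F.cross_entropy(logits, targets)
+
+loss = info_nce(torch.randn(128, 256), torch.randn(128, 256))
+```
+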
+
+ comment: Article links: + https://link.springer.com/article/10.1007/s11633-022-1406-4 +
+
+
+
+
+ + ♻ ☆ MMVP: A Multimodal MoCap Dataset with Vision and Pressure Sensors CVPR2024 + + +
+ Foot contact is an important cue for human motion capture, understanding, and +generation. Existing datasets tend to annotate dense foot contact using visual +matching with thresholding or incorporating pressure signals. However, these +approaches either suffer from low accuracy or are only designed for small-range +and slow motion. There is still a lack of a vision-pressure multimodal dataset +with large-range and fast human motion, as well as accurate and dense +foot-contact annotation. To fill this gap, we propose a Multimodal MoCap +Dataset with Vision and Pressure sensors, named MMVP. MMVP provides accurate +and dense plantar pressure signals synchronized with RGBD observations, which +is especially useful for plausible shape estimation, robust pose fitting +without foot drifting, and accurate global translation tracking. To validate +the dataset, we propose an RGBD-P SMPL fitting method and also a +monocular-video-based baseline framework, VP-MoCap, for human motion capture. +Experiments demonstrate that our RGBD-P SMPL Fitting results significantly +outperform pure visual motion capture. Moreover, VP-MoCap outperforms SOTA +methods in foot-contact and global translation estimation accuracy. We believe +the configuration of the dataset and the baseline frameworks will stimulate +research in this direction and also provide a good reference for MoCap +applications in various domains. Project page: +https://metaverse-ai-lab-thu.github.io/MMVP-Dataset/. +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ Brain Decodes Deep Nets + + +
+ We developed a tool for visualizing and analyzing large pre-trained vision +models by mapping them onto the brain, thus exposing their hidden internals. Our +innovation arises from a surprising usage of brain encoding: predicting brain +fMRI measurements in response to images. We report two findings. First, +explicit mapping between the brain and deep-network features across dimensions +of space, layers, scales, and channels is crucial. This mapping method, +FactorTopy, is plug-and-play for any deep network; with it, one can paint a +picture of the network onto the brain (literally!). Second, our visualization +shows how different training methods matter: they lead to remarkable +differences in hierarchical organization and scaling behavior, growing with +more data or network capacity. It also provides insight into fine-tuning: how +pre-trained models change when adapting to small datasets. We find that brain-like, +hierarchically organized networks suffer less from catastrophic forgetting after +fine-tuning. +
+
+ comment: Website: see https://huzeyann.github.io/brain-decodes-deep-nets . + Code: see https://github.com/huzeyann/BrainDecodesDeepNets +
+
+
+
+
+ + ♻ ☆ OccNeRF: Advancing 3D Occupancy Prediction in LiDAR-Free Environments + + +
+ As a fundamental task of vision-based perception, 3D occupancy prediction +reconstructs 3D structures of surrounding environments. It provides detailed +information for autonomous driving planning and navigation. However, most +existing methods rely heavily on LiDAR point clouds to generate occupancy +ground truth, which is not available in vision-based systems. In this paper, +we propose OccNeRF, a method for training occupancy networks without 3D +supervision. Different from previous works which consider a bounded scene, we +parameterize the reconstructed occupancy fields and reorganize the sampling +strategy to align with the cameras' infinite perceptive range. Neural +rendering is adopted to convert the occupancy fields to multi-camera depth maps, +supervised by multi-frame photometric consistency. Moreover, for semantic +occupancy prediction, we design several strategies to polish the prompts and +filter the outputs of a pretrained open-vocabulary 2D segmentation model. +Extensive experiments for both self-supervised depth estimation and 3D +occupancy prediction tasks on nuScenes and SemanticKITTI datasets demonstrate +the effectiveness of our method. +
+
+ comment: Code: https://github.com/LinShan-Bin/OccNeRF +
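+
+ As an illustration of the rendering step described above, the following is a
+ minimal, generic sketch of converting per-ray occupancy (density) samples into
+ an expected depth map via volume rendering. It is not code from the linked
+ OccNeRF repository; tensor names and shapes are assumptions.
+
+```python
+import torch
+
+def occupancy_to_depth(sigma, t_vals):
+    """Render expected depth from per-ray density samples.
+
+    sigma:  (num_rays, num_samples) non-negative density at each sample
+    t_vals: (num_rays, num_samples) distance of each sample along the ray
+    """
+    deltas = torch.diff(t_vals, dim=-1)                        # spacing between samples
+    deltas = torch.cat([deltas, torch.full_like(deltas[..., :1], 1e10)], dim=-1)
+    alpha = 1.0 - torch.exp(-sigma * deltas)                   # per-sample opacity
+    trans = torch.cumprod(
+        torch.cat([torch.ones_like(alpha[..., :1]), 1.0 - alpha + 1e-10], dim=-1),
+        dim=-1)[..., :-1]                                      # accumulated transmittance
+    weights = alpha * trans                                    # volume rendering weights
+    return (weights * t_vals).sum(dim=-1)                      # expected depth per ray
+```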
+
+
+
+
+ + ♻ ☆ Cross-domain Fiber Cluster Shape Analysis for Language Performance + Cognitive Score Prediction + + +
+ Shape plays an important role in computer graphics, offering informative +features to convey an object's morphology and functionality. Shape analysis in +brain imaging can help interpret structure-function correlations of +the human brain. In this work, we investigate the shape of the brain's 3D white +matter connections and its potential predictive relationship to human cognitive +function. We reconstruct brain connections as sequences of 3D points using +diffusion magnetic resonance imaging (dMRI) tractography. To describe each +connection, we extract 12 shape descriptors in addition to traditional dMRI +connectivity and tissue microstructure features. We introduce a novel +framework, Shape-fused Fiber Cluster Transformer (SFFormer), that leverages a +multi-head cross-attention feature fusion module to predict subject-specific +language performance based on dMRI tractography. We assess the performance of +the method on a large dataset including 1065 healthy young adults. The results +demonstrate that both the transformer-based SFFormer model and its inter/intra +feature fusion with shape, microstructure, and connectivity are informative, +and together, they improve the prediction of subject-specific language +performance scores. Overall, our results indicate that the shape of the brain's +connections is predictive of human language function. +
+
+ comment: 2 figures, 11 pages +
+
+
+
+
+ + ♻ ☆ Video-GroundingDINO: Towards Open-Vocabulary Spatio-Temporal Video + Grounding + + +
+ Video grounding aims to localize a spatio-temporal section in a video +corresponding to an input text query. This paper addresses a critical +limitation in current video grounding methodologies by introducing an +Open-Vocabulary Spatio-Temporal Video Grounding task. Unlike prevalent +closed-set approaches that struggle with open-vocabulary scenarios due to +limited training data and predefined vocabularies, our model leverages +pre-trained representations from foundational spatial grounding models. This +empowers it to effectively bridge the semantic gap between natural language and +diverse visual content, achieving strong performance in closed-set and +open-vocabulary settings. Our contributions include a novel spatio-temporal +video grounding model, surpassing state-of-the-art results in closed-set +evaluations on multiple datasets and demonstrating superior performance in +open-vocabulary scenarios. Notably, the proposed model outperforms +state-of-the-art methods in closed-set settings on VidSTG (Declarative and +Interrogative) and HC-STVG (V1 and V2) datasets. Furthermore, in +open-vocabulary evaluations on HC-STVG V1 and YouCook-Interactions, our model +surpasses the recent best-performing models by $4.88$ m_vIoU and $1.83\%$ +accuracy, demonstrating its efficacy in handling diverse linguistic and visual +concepts for improved video understanding. Our codes will be publicly released. + +
+
+
+
+
+ + ♻ ☆ Collaborating Foundation Models for Domain Generalized Semantic + Segmentation CVPR 2024 + + +
+ Domain Generalized Semantic Segmentation (DGSS) deals with training a model +on a labeled source domain with the aim of generalizing to unseen domains +during inference. Existing DGSS methods typically effectuate robust features by +means of Domain Randomization (DR). Such an approach is often limited as it can +only account for style diversification and not content. In this work, we take +an orthogonal approach to DGSS and propose to use an assembly of CoLlaborative +FOUndation models for Domain Generalized Semantic Segmentation (CLOUDS). In +detail, CLOUDS is a framework that integrates FMs of various kinds: (i) CLIP +backbone for its robust feature representation, (ii) generative models to +diversify the content, thereby covering various modes of the possible target +distribution, and (iii) Segment Anything Model (SAM) for iteratively refining +the predictions of the segmentation model. Extensive experiments show that our +CLOUDS excels in adapting from synthetic to real DGSS benchmarks and under +varying weather conditions, notably outperforming prior methods by 5.6% and +6.7% in averaged mIoU, respectively. The code is available at +https://github.com/yasserben/CLOUDS +
+
+ comment: https://github.com/yasserben/CLOUDS ; Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Traffic Scene Parsing through the TSP6K Dataset CVPR 2024 + + +
+ Traffic scene perception in computer vision is a critically important task for +achieving intelligent cities. To date, most existing datasets focus on autonomous +driving scenes. We observe that the models trained on those driving datasets +often yield unsatisfactory results on traffic monitoring scenes. However, +little effort has been put into improving the traffic monitoring scene +understanding, mainly due to the lack of specific datasets. To fill this gap, +we introduce a specialized traffic monitoring dataset, termed TSP6K, containing +images from the traffic monitoring scenario, with high-quality pixel-level and +instance-level annotations. The TSP6K dataset captures more crowded traffic +scenes with several times more traffic participants than the existing driving +scenes. We perform a detailed analysis of the dataset and comprehensively +evaluate previous popular scene parsing methods, instance segmentation methods +and unsupervised domain adaptation methods. Furthermore, considering the vast +difference in instance sizes, we propose a detail refining decoder for scene +parsing, which recovers the details of different semantic regions in traffic +scenes owing to the proposed TSP6K dataset. Experiments show its effectiveness +in parsing the traffic monitoring scenes. Code and dataset are available at +https://github.com/PengtaoJiang/TSP6K. +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SD-NAE: Generating Natural Adversarial Examples with Stable Diffusion ICLR 2024 + + +
+ Natural Adversarial Examples (NAEs), images arising naturally from the +environment and capable of deceiving classifiers, are instrumental in robustly +evaluating and identifying vulnerabilities in trained models. In this work, +unlike prior works that passively collect NAEs from real images, we propose to +actively synthesize NAEs using the state-of-the-art Stable Diffusion. +Specifically, our method formulates a controlled optimization process, where we +perturb the token embedding that corresponds to a specified class to generate +NAEs. This generation process is guided by the gradient of loss from the target +classifier, ensuring that the created image closely mimics the ground-truth +class yet fools the classifier. Named SD-NAE (Stable Diffusion for Natural +Adversarial Examples), our innovative method is effective in producing valid +and useful NAEs, which is demonstrated through a meticulously designed +experiment. Code is available at https://github.com/linyueqian/SD-NAE. + +
+
+ comment: Accepted by ICLR 2024 TinyPapers +
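+
+ The optimization loop described above can be sketched roughly as follows. The
+ generate_image callable is a hypothetical stand-in for a differentiable
+ text-to-image pipeline, and the objective (ascending the classifier's loss on
+ the true class) is one plausible reading of the abstract, not the authors'
+ exact formulation.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def synthesize_nae(generate_image, classifier, token_embedding, target_class,
+                   steps=50, lr=1e-3):
+    """Optimize a class token embedding so the generated image fools a frozen classifier."""
+    emb = token_embedding.clone().requires_grad_(True)
+    opt = torch.optim.Adam([emb], lr=lr)
+    for _ in range(steps):
+        image = generate_image(emb)               # hypothetical differentiable generation
+        logits = classifier(image)
+        target = torch.tensor([target_class], device=logits.device)
+        loss = -F.cross_entropy(logits, target)   # ascend the loss on the true class
+        opt.zero_grad()
+        loss.backward()
+        opt.step()
+    return emb.detach()
+```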
+
+
+
+
+ + ♻ ☆ GlORIE-SLAM: Globally Optimized RGB-only Implicit Encoding Point Cloud + SLAM + + +
+ Recent advancements in RGB-only dense Simultaneous Localization and Mapping +(SLAM) have predominantly utilized grid-based neural implicit encodings and/or +struggle to efficiently realize global map and pose consistency. To this end, +we propose an efficient RGB-only dense SLAM system using a flexible neural +point cloud scene representation that adapts to keyframe poses and depth +updates, without needing costly backpropagation. Another critical challenge of +RGB-only SLAM is the lack of geometric priors. To alleviate this issue, with +the aid of a monocular depth estimator, we introduce a novel DSPO layer for +bundle adjustment which optimizes the pose and depth of keyframes along with +the scale of the monocular depth. Finally, our system benefits from loop +closure and online global bundle adjustment and performs better than or +competitively with existing dense neural RGB SLAM methods in tracking, mapping and +rendering accuracy on the Replica, TUM-RGBD and ScanNet datasets. The source +code will be made available. +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 185 + +
+
+
+ + ☆ Unsolvable Problem Detection: Evaluating Trustworthiness of Vision + Language Models + + +
+ This paper introduces a novel and significant challenge for Vision Language +Models (VLMs), termed Unsolvable Problem Detection (UPD). UPD examines the +VLM's ability to withhold answers when faced with unsolvable problems in the +context of Visual Question Answering (VQA) tasks. UPD encompasses three +distinct settings: Absent Answer Detection (AAD), Incompatible Answer Set +Detection (IASD), and Incompatible Visual Question Detection (IVQD). To investigate +the UPD problem in depth, we conduct extensive experiments, which indicate that most VLMs, +including GPT-4V and LLaVA-Next-34B, struggle with our benchmarks to varying +extents, highlighting significant room for improvement. To address UPD, we +explore both training-free and training-based solutions, offering new insights +into their effectiveness and limitations. We hope our insights, together with +future efforts within the proposed UPD settings, will enhance the broader +understanding and development of more practical and reliable VLMs. +
+
+ comment: Code: https://github.com/AtsuMiyai/UPD +
+
+
+
+
+ + ☆ Are We on the Right Way for Evaluating Large Vision-Language Models? + + +
+ Large vision-language models (LVLMs) have recently achieved rapid progress, +sparking numerous studies to evaluate their multi-modal capabilities. However, +we dig into current evaluation works and identify two primary issues: 1) Visual +content is unnecessary for many samples. The answers can be directly inferred +from the questions and options, or the world knowledge embedded in LLMs. This +phenomenon is prevalent across current benchmarks. For instance, GeminiPro +achieves 42.9% on the MMMU benchmark without any visual input, and outperforms +the random choice baseline across six benchmarks by over 20% on average. 2) +Unintentional data leakage exists in LLM and LVLM training. LLMs and LVLMs could +still answer some visual-necessary questions without visual content, indicating +memorization of these samples within the large-scale training data. For example, +Sphinx-X-MoE gets 43.6% on MMMU without accessing images, surpassing its LLM +backbone by 17.9%. Both problems lead to misjudgments of actual multi-modal +gains and potentially misguide the study of LVLM. To this end, we present +MMStar, an elite vision-indispensable multi-modal benchmark comprising 1,500 +samples meticulously selected by humans. MMStar benchmarks 6 core capabilities +and 18 detailed axes, aiming to evaluate LVLMs' multi-modal capacities with +carefully balanced and purified samples. These samples are first roughly +selected from current benchmarks with an automated pipeline; human review is +then involved to ensure that each curated sample exhibits visual dependency, minimal +data leakage, and requires advanced multi-modal capabilities. Moreover, two +metrics are developed to measure data leakage and actual performance gain in +multi-modal training. We evaluate 16 leading LVLMs on MMStar to assess their +multi-modal capabilities, and on 7 benchmarks with the proposed metrics to +investigate their data leakage and actual multi-modal gain. +
+
+ comment: Project page: https://mmstar-benchmark.github.io/ +
+
+
+
+
+ + ☆ MTLoRA: A Low-Rank Adaptation Approach for Efficient Multi-Task Learning CVPR + + +
+ Adapting models pre-trained on large-scale datasets to a variety of +downstream tasks is a common strategy in deep learning. Consequently, +parameter-efficient fine-tuning methods have emerged as a promising way to +adapt pre-trained models to different tasks while training only a minimal +number of parameters. While most of these methods are designed for single-task +adaptation, parameter-efficient training in Multi-Task Learning (MTL) +architectures is still unexplored. In this paper, we introduce MTLoRA, a novel +framework for parameter-efficient training of MTL models. MTLoRA employs +Task-Agnostic and Task-Specific Low-Rank Adaptation modules, which effectively +disentangle the parameter space in MTL fine-tuning, thereby enabling the model +to adeptly handle both task specialization and interaction within MTL contexts. +We applied MTLoRA to hierarchical-transformer-based MTL architectures, adapting +them to multiple downstream dense prediction tasks. Our extensive experiments +on the PASCAL dataset show that MTLoRA achieves higher accuracy on downstream +tasks compared to fully fine-tuning the MTL model while reducing the number of +trainable parameters by 3.6x. Furthermore, MTLoRA establishes a Pareto-optimal +trade-off between the number of trainable parameters and the accuracy of the +downstream tasks, outperforming current state-of-the-art parameter-efficient +training methods in both accuracy and efficiency. Our code is publicly +available. + +
+
+ comment: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR), 2024 +
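+
+ To make the split between task-agnostic and task-specific low-rank adaptation
+ concrete, below is a minimal sketch of a frozen linear layer augmented with a
+ shared and a per-task low-rank update. Class and argument names are
+ illustrative and do not reflect MTLoRA's actual API.
+
+```python
+import torch
+import torch.nn as nn
+
+class MultiTaskLoRALinear(nn.Module):
+    """Frozen linear layer plus a shared and a per-task low-rank update."""
+
+    def __init__(self, base: nn.Linear, tasks, rank=4):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():
+            p.requires_grad_(False)                       # pretrained weights stay frozen
+        d_in, d_out = base.in_features, base.out_features
+        self.shared_A = nn.Parameter(torch.randn(rank, d_in) * 0.01)
+        self.shared_B = nn.Parameter(torch.zeros(d_out, rank))
+        self.task_A = nn.ParameterDict(
+            {t: nn.Parameter(torch.randn(rank, d_in) * 0.01) for t in tasks})
+        self.task_B = nn.ParameterDict(
+            {t: nn.Parameter(torch.zeros(d_out, rank)) for t in tasks})
+
+    def forward(self, x, task):
+        shared = x @ self.shared_A.T @ self.shared_B.T            # task-agnostic update
+        specific = x @ self.task_A[task].T @ self.task_B[task].T  # task-specific update
+        return self.base(x) + shared + specific
+```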
+
+
+
+
+ + ☆ SeaBird: Segmentation in Bird's View with Dice Loss Improves Monocular + 3D Detection of Large Objects CVPR 2024 + + +
+ Monocular 3D detectors achieve remarkable performance on cars and smaller +objects. However, their performance drops on larger objects, leading to fatal +accidents. Some attribute the failures to training data scarcity or the +receptive field requirements of large objects. In this paper, we highlight this +understudied problem of generalization to large objects. We find that modern +frontal detectors struggle to generalize to large objects even on nearly +balanced datasets. We argue that the cause of failure is the sensitivity of +depth regression losses to the noise of larger objects. To bridge this gap, we +comprehensively investigate regression and dice losses, examining their +robustness under varying error levels and object sizes. We mathematically prove +that the dice loss leads to superior noise-robustness and model convergence for +large objects compared to regression losses for a simplified case. Leveraging +our theoretical insights, we propose SeaBird (Segmentation in Bird's View) as +the first step towards generalizing to large objects. SeaBird effectively +integrates BEV segmentation on foreground objects for 3D detection, with the +segmentation head trained with the dice loss. SeaBird achieves SoTA results on +the KITTI-360 leaderboard and improves existing detectors on the nuScenes +leaderboard, particularly for large objects. Code and models at +https://github.com/abhi1kumar/SeaBird +
+
+ comment: CVPR 2024 +
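+
+ For reference, a standard soft dice loss of the kind the abstract argues is
+ more noise-robust than depth regression for large objects can be written as
+ below; tensor shapes are assumptions and this is not the authors'
+ implementation.
+
+```python
+import torch
+
+def dice_loss(pred_logits, target, eps=1.0):
+    """Soft dice loss for binary BEV foreground segmentation.
+
+    pred_logits: (B, H, W) raw logits; target: (B, H, W) binary mask.
+    """
+    prob = torch.sigmoid(pred_logits)
+    inter = (prob * target).sum(dim=(1, 2))
+    union = prob.sum(dim=(1, 2)) + target.sum(dim=(1, 2))
+    return (1.0 - (2.0 * inter + eps) / (union + eps)).mean()
+```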
+
+
+
+
+ + ☆ Convolutional Prompting meets Language Models for Continual Learning CVPR 2024 + + +
+ Continual Learning (CL) enables machine learning models to learn from +continuously shifting new training data in the absence of data from old tasks. +Recently, pretrained vision transformers combined with prompt tuning have shown +promise for overcoming catastrophic forgetting in CL. These approaches rely on +a pool of learnable prompts, which can be inefficient in sharing knowledge +across tasks, leading to inferior performance. In addition, the lack of +fine-grained, layer-specific prompts does not allow them to fully express the +strength of the prompts for CL. We address these limitations by proposing +ConvPrompt, a novel convolutional prompt creation mechanism that maintains +layer-wise shared embeddings, enabling both layer-specific learning and better +concept transfer across tasks. The intelligent use of convolution enables us to +maintain a low parameter overhead without compromising performance. We further +leverage Large Language Models to generate fine-grained text descriptions of +each category which are used to get task similarity and dynamically decide the +number of prompts to be learned. Extensive experiments demonstrate the +superiority of ConvPrompt, which improves SOTA by ~3% with significantly less +parameter overhead. We also perform strong ablation over various modules to +disentangle the importance of different components. +
+
+ comment: CVPR 2024 Camera Ready +
+
+
+
+
+ + ☆ Learn "No" to Say "Yes" Better: Improving Vision-Language Models via + Negations + + +
+ Existing vision-language models (VLMs) treat text descriptions as a unit, +confusing individual concepts in a prompt and impairing visual semantic +matching and reasoning. An important aspect of reasoning in logic and language +is negation. This paper highlights the limitations of popular VLMs such as +CLIP in understanding the implications of negations, i.e., the effect of the +word "not" in a given prompt. To enable evaluation of VLMs on fluent prompts +with negations, we present CC-Neg, a dataset containing 228,246 images, true +captions and their corresponding negated captions. Using CC-Neg along with +modifications to the contrastive loss of CLIP, our proposed CoN-CLIP framework +has an improved understanding of negations. This training paradigm improves +CoN-CLIP's ability to encode semantics reliably, resulting in a 3.85% average +gain in top-1 accuracy for zero-shot image classification across 8 datasets. +Further, CoN-CLIP outperforms CLIP on challenging compositionality benchmarks +such as SugarCREPE by 4.4%, showcasing emergent compositional understanding of +objects, relations, and attributes in text. Overall, our work addresses a +crucial limitation of VLMs by introducing a dataset and framework that +strengthens semantic associations between images and text, demonstrating +improved large-scale foundation models with significantly reduced computational +cost, promoting efficiency and accessibility.
+
+ comment: 14 pages + 6 figures in main manuscript (excluding references) +
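+
+ A minimal sketch of how negated captions can be folded into a CLIP-style
+ contrastive objective as extra hard negatives is shown below. It illustrates
+ the general idea only; the actual CoN-CLIP loss may differ, and all shapes and
+ names are assumptions.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def contrastive_loss_with_negations(img_emb, cap_emb, neg_cap_emb, temperature=0.07):
+    """Image-to-text loss with negated captions appended as hard negatives.
+
+    All embeddings are assumed L2-normalized (N, D) tensors.
+    """
+    candidates = torch.cat([cap_emb, neg_cap_emb], dim=0)            # (2N, D)
+    logits = img_emb @ candidates.T / temperature                    # (N, 2N)
+    targets = torch.arange(img_emb.size(0), device=img_emb.device)   # index of true caption
+    return F.cross_entropy(logits, targets)
+```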
+
+
+
+
+ + ☆ InstantSplat: Unbounded Sparse-view Pose-free Gaussian Splatting in 40 + Seconds + + +
+ While novel view synthesis (NVS) has made substantial progress in 3D computer +vision, it typically requires an initial estimation of camera intrinsics and +extrinsics from dense viewpoints. This pre-processing is usually conducted via +a Structure-from-Motion (SfM) pipeline, a procedure that can be slow and +unreliable, particularly in sparse-view scenarios with insufficient matched +features for accurate reconstruction. In this work, we integrate the strengths +of point-based representations (e.g., 3D Gaussian Splatting, 3D-GS) with +end-to-end dense stereo models (DUSt3R) to tackle the complex yet unresolved +issues in NVS under unconstrained settings, which encompasses pose-free and +sparse-view challenges. Our framework, InstantSplat, unifies dense stereo +priors with 3D-GS to build 3D Gaussians of large-scale scenes from sparse-view and +pose-free images in less than 1 minute. Specifically, InstantSplat comprises a +Coarse Geometric Initialization (CGI) module that swiftly establishes a +preliminary scene structure and camera parameters across all training views, +utilizing globally-aligned 3D point maps derived from a pre-trained dense +stereo pipeline. This is followed by the Fast 3D-Gaussian Optimization (F-3DGO) +module, which jointly optimizes the 3D Gaussian attributes and the initialized +poses with pose regularization. Experiments conducted on the large-scale +outdoor Tanks & Temples datasets demonstrate that InstantSplat significantly +improves SSIM (by 32%) while concurrently reducing Absolute Trajectory Error +(ATE) by 80%. These results establish InstantSplat as a viable solution for scenarios +involving pose-free and sparse-view conditions. Project page: +instantsplat.github.io. +
+
+
+
+
+ + ☆ Benchmarking Counterfactual Image Generation + + +
+ Counterfactual image generation is pivotal for understanding the causal +relations of variables, with applications in interpretability and generation of +unbiased synthetic data. However, evaluating image generation is a +long-standing challenge in itself. The need to evaluate counterfactual +generation compounds this challenge, precisely because counterfactuals, by +definition, are hypothetical scenarios without observable ground truths. In +this paper, we present a novel comprehensive framework aimed at benchmarking +counterfactual image generation methods. We incorporate metrics that focus on +evaluating diverse aspects of counterfactuals, such as composition, +effectiveness, minimality of interventions, and image realism. We assess the +performance of three distinct conditional image generation model types, based +on the Structural Causal Model paradigm. Our work is accompanied by a +user-friendly Python package which allows users to further evaluate and benchmark +existing and future counterfactual image generation methods. Our framework is +extendable to additional SCM and other causal methods, generative models, and +datasets. +
+
+
+
+
+ + ☆ Snap-it, Tap-it, Splat-it: Tactile-Informed 3D Gaussian Splatting for + Reconstructing Challenging Surfaces + + +
+ Touch and vision go hand in hand, mutually enhancing our ability to +understand the world. From a research perspective, the problem of mixing touch +and vision is underexplored and presents interesting challenges. To this end, +we propose Tactile-Informed 3DGS, a novel approach that incorporates touch data +(local depth maps) with multi-view vision data to achieve surface +reconstruction and novel view synthesis. Our method optimises 3D Gaussian +primitives to accurately model the object's geometry at points of contact. By +creating a framework that decreases the transmittance at touch locations, we +achieve a refined surface reconstruction, ensuring a uniformly smooth depth +map. Touch is particularly useful when considering non-Lambertian objects (e.g. +shiny or reflective surfaces) since contemporary methods tend to fail to +reconstruct specular highlights with fidelity. By combining vision and tactile +sensing, we achieve more accurate geometry reconstructions with fewer images +than prior methods. We conduct evaluations on objects with glossy and reflective +surfaces and demonstrate the effectiveness of our approach, offering +significant improvements in reconstruction quality. +
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ CATSNet: a context-aware network for Height Estimation in a Forested + Area based on Pol-TomoSAR data + + +
+ Tropical forests are a key component of the global carbon cycle. With plans +for upcoming space-borne missions like BIOMASS to monitor forestry, several +airborne missions, including the TropiSAR and AfriSAR campaigns, have been +successfully carried out. Typical Synthetic Aperture Radar +Tomography (TomoSAR) methods involve complex models with low accuracy and high +computation costs. In recent years, deep learning methods have also gained +attention in the TomoSAR framework, showing interesting performance. Recently, +a solution based on a fully connected Tomographic Neural Network (TSNN) has +demonstrated its effectiveness in accurately estimating forest and ground +heights by exploiting the pixel-wise elements of the covariance matrix derived +from TomoSAR data. This work instead goes beyond the pixel-wise approach to +define a context-aware deep learning-based solution named CATSNet. A +convolutional neural network is used to leverage patch-based information +and extract features from a neighborhood rather than focusing on a single pixel. +The training is conducted by considering TomoSAR data as the input and Light +Detection and Ranging (LiDAR) values as the ground truth. The experimental +results show striking advantages in both performance and generalization ability +by leveraging context information within Multiple Baselines (MB) TomoSAR data +across different polarimetric modalities, surpassing existing techniques. +
+
+ comment: Submitted to IEEE TGRS, under review +
+
+
+
+
+ + ☆ Draw-and-Understand: Leveraging Visual Prompts to Enable MLLMs to + Comprehend What You Want + + +
+ The interaction between humans and artificial intelligence (AI) is a crucial +factor that reflects the effectiveness of multimodal large language models +(MLLMs). However, current MLLMs primarily focus on image-level comprehension +and limit interaction to textual instructions, thereby constraining their +flexibility in usage and depth of response. In this paper, we introduce the +Draw-and-Understand project: a new model, a multi-domain dataset, and a +challenging benchmark for visual prompting. Specifically, we propose SPHINX-V, +a new end-to-end trained Multimodal Large Language Model (MLLM) that connects a +vision encoder, a visual prompt encoder and an LLM for various visual prompts +(points, bounding boxes, and free-form shape) and language understanding. To +advance visual prompting research for MLLMs, we introduce MDVP-Data and +MDVP-Bench. MDVP-Data features a multi-domain dataset containing 1.6M unique +image-visual prompt-text instruction-following samples, including natural +images, document images, OCR images, mobile screenshots, web screenshots, and +multi-panel images. Furthermore, we present MDVP-Bench, a comprehensive and +challenging benchmark to assess a model's capability in understanding visual +prompting instructions. Our experiments demonstrate SPHINX-V's impressive +multimodal interaction capabilities through visual prompting, revealing +significant improvements in detailed pixel-level description and +question-answering abilities. + +
+
+ comment: 16 pages, 7 figures +
+
+
+
+
+ + ☆ Prototype-based Interpretable Breast Cancer Prediction Models: Analysis + and Challenges + + +
+ Deep learning models have achieved high performance in medical applications; +however, their adoption in clinical practice is hindered by their black-box +nature. Self-explainable models, like prototype-based models, can be especially +beneficial as they are interpretable by design. However, if the learnt +prototypes are of low quality, then prototype-based models are no better than +black-box models. Having high-quality prototypes is a prerequisite for a truly +interpretable model. In this work, we propose a prototype evaluation framework +for coherence (PEF-C) for quantitatively evaluating the quality of the +prototypes based on domain knowledge. We show the use of PEF-C in the context +of breast cancer prediction using mammography. Existing works on +prototype-based models on breast cancer prediction using mammography have +focused on improving the classification performance of prototype-based models +compared to black-box models and have evaluated prototype quality through +anecdotal evidence. We are the first to go beyond anecdotal evidence and +evaluate the quality of the mammography prototypes systematically using our +PEF-C. Specifically, we apply three state-of-the-art prototype-based models, +ProtoPNet, BRAIxProtoPNet++ and PIP-Net on mammography images for breast cancer +prediction and evaluate these models w.r.t. i) classification performance, and +ii) quality of the prototypes, on three public datasets. Our results show that +prototype-based models are competitive with black-box models in terms of +classification performance, and achieve a higher score in detecting ROIs. +However, the quality of the prototypes is not yet sufficient and can be +improved in terms of relevance, purity and the variety of learned prototypes. +We call on the XAI community to systematically evaluate the quality of +prototypes to check their true usability in high-stakes decisions and to improve +such models further. +
+
+ comment: 21 pages, 5 figures, 3 tables +
+
+
+
+
+ + ☆ Benchmarking the Robustness of Temporal Action Detection Models Against + Temporal Corruptions CVPR2024 + + +
+ Temporal action detection (TAD) aims to locate action positions and recognize +action categories in long-term untrimmed videos. Although many methods have +achieved promising results, their robustness has not been thoroughly studied. +In practice, we observe that temporal information in videos can be occasionally +corrupted, such as missing or blurred frames. Interestingly, existing methods +often incur a significant performance drop even if only one frame is affected. +To formally evaluate the robustness, we establish two temporal corruption +robustness benchmarks, namely THUMOS14-C and ActivityNet-v1.3-C. In this paper, +we extensively analyze the robustness of seven leading TAD methods and obtain +some interesting findings: 1) Existing methods are particularly vulnerable to +temporal corruptions, and end-to-end methods are often more susceptible than +those with a pre-trained feature extractor; 2) Vulnerability mainly comes from +localization error rather than classification error; 3) When corruptions occur +in the middle of an action instance, TAD models tend to yield the largest +performance drop. Besides building a benchmark, we further develop a simple but +effective robust training method to defend against temporal corruptions, +through the FrameDrop augmentation and Temporal-Robust Consistency loss. +Remarkably, our approach not only improves robustness but also yields promising +improvements on clean data. We believe that this study will serve as a +benchmark for future research in robust video analysis. Source code and models +are available at https://github.com/Alvin-Zeng/temporal-robustness-benchmark. + +
+
+ comment: Accepted by CVPR2024 +
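+
+ As a simple illustration of the frame-level corruptions and the FrameDrop-style
+ augmentation discussed above, a dropped frame can be simulated by repeating the
+ previous frame. This is a generic sketch, not the paper's exact policy.
+
+```python
+import torch
+
+def frame_drop(video, drop_prob=0.1):
+    """Randomly replace frames with the previous frame to simulate missing frames.
+
+    video: (T, C, H, W) clip tensor.
+    """
+    out = video.clone()
+    for t in range(1, video.size(0)):
+        if torch.rand(()) < drop_prob:
+            out[t] = out[t - 1]            # duplicate the previous frame
+    return out
+```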
+
+
+
+
+ + ☆ MedCLIP-SAM: Bridging Text and Image Towards Universal Medical Image + Segmentation + + +
+ Medical image segmentation of anatomical structures and pathology is crucial +in modern clinical diagnosis, disease study, and treatment planning. To date, +great progress has been made in deep learning-based segmentation techniques, +but most methods still lack data efficiency, generalizability, and +interactivity. Consequently, the development of new, precise segmentation +methods that demand fewer labeled datasets is of utmost importance in medical +image analysis. Recently, the emergence of foundation models, such as CLIP and +Segment-Anything-Model (SAM), with comprehensive cross-domain representation +opened the door for interactive and universal image segmentation. However, +exploration of these models for data-efficient medical image segmentation is +still limited but highly necessary. In this paper, we propose a novel +framework, called MedCLIP-SAM, that combines CLIP and SAM models to generate +segmentation of clinical scans using text prompts in both zero-shot and weakly +supervised settings. To achieve this, we employed a new Decoupled Hard Negative +Noise Contrastive Estimation (DHN-NCE) loss to fine-tune the BiomedCLIP model +and the recent gScoreCAM to generate prompts to obtain segmentation masks from +SAM in a zero-shot setting. Additionally, we explored the use of zero-shot +segmentation labels in a weakly supervised paradigm to improve the segmentation +quality further. By extensively testing three diverse segmentation tasks and +medical image modalities (breast tumor ultrasound, brain tumor MRI, and lung +X-ray), our proposed framework has demonstrated excellent accuracy. +
+
+ comment: 10 pages, 2 figures +
+
+
+
+
+ + ☆ Latent Embedding Clustering for Occlusion Robust Head Pose Estimation + + +
+ Head pose estimation has become a crucial area of research in computer vision +given its usefulness in a wide range of applications, including robotics, +surveillance, and driver attention monitoring. One of the most difficult +challenges in this field is managing head occlusions that frequently take place +in real-world scenarios. In this paper, we propose a novel and efficient +framework that is robust in real-world head occlusion scenarios. In particular, +we propose an unsupervised latent embedding clustering with regression and +classification components for each pose angle. The model optimizes latent +feature representations for occluded and non-occluded images through a +clustering term while improving fine-grained angle predictions. Experimental +evaluation on in-the-wild head pose benchmark datasets reveals competitive +performance in comparison to state-of-the-art methodologies, with the advantage +of a significant reduction in data requirements. We observe a substantial improvement in +occluded head pose estimation. Also, an ablation study is conducted to +ascertain the impact of the clustering term within our proposed framework. +
+
+ comment: Accepted at 18th IEEE International Conference on Automatic Face and + Gesture Recognition (FG'24) +
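+
+ One way to read the combination of regression, classification, and a latent
+ clustering term described above is as a composite loss like the sketch below.
+ The signature, shapes, and weighting are illustrative assumptions, not the
+ authors' formulation.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def pose_loss(features, centers, angle_pred, angle_gt, bin_logits, bin_gt, lam=0.1):
+    """Regression + binned classification + clustering pull on the latent embeddings.
+
+    features: (B, D) embeddings; centers: (K, D) cluster centers;
+    angle_pred/angle_gt: (B,) continuous angles; bin_logits: (B, K); bin_gt: (B,).
+    """
+    reg = F.l1_loss(angle_pred, angle_gt)
+    cls = F.cross_entropy(bin_logits, bin_gt)
+    cluster = torch.cdist(features, centers).min(dim=1).values.mean()  # pull to nearest center
+    return reg + cls + lam * cluster
+```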
+
+
+
+
+ + ☆ Relation Rectification in Diffusion Model + + +
+ Despite their exceptional generative abilities, large text-to-image diffusion +models, much like skilled but careless artists, often struggle with accurately +depicting visual relationships between objects. This issue, as we uncover +through careful analysis, arises from a misaligned text encoder that struggles +to interpret specific relationships and differentiate the logical order of +associated objects. To resolve this, we introduce a novel task termed Relation +Rectification, aiming to refine the model to accurately represent a given +relationship it initially fails to generate. To address this, we propose an +innovative solution utilizing a Heterogeneous Graph Convolutional Network +(HGCN). It models the directional relationships between relation terms and +corresponding objects within the input prompts. Specifically, we optimize the +HGCN on a pair of prompts with identical relational words but reversed object +orders, supplemented by a few reference images. The lightweight HGCN adjusts +the text embeddings generated by the text encoder, ensuring the accurate +reflection of the textual relation in the embedding space. Crucially, our +method retains the parameters of the text encoder and diffusion model, +preserving the model's robust performance on unrelated descriptions. We +validated our approach on a newly curated dataset of diverse relational data, +demonstrating both quantitative and qualitative enhancements in generating +images with precise visual relations. Project page: +https://wuyinwei-hah.github.io/rrnet.github.io/. + +
+
+
+
+
+ + ☆ Long-Tailed Anomaly Detection with Learnable Class Names CVPR 2024 + + +
+ Anomaly detection (AD) aims to identify defective images and localize their +defects (if any). Ideally, AD models should be able to detect defects over many +image classes; without relying on hard-coded class names that can be +uninformative or inconsistent across datasets; learn without anomaly +supervision; and be robust to the long-tailed distributions of real-world +applications. To address these challenges, we formulate the problem of +long-tailed AD by introducing several datasets with different levels of class +imbalance and metrics for performance evaluation. We then propose a novel +method, LTAD, to detect defects from multiple and long-tailed classes, without +relying on dataset class names. LTAD combines AD by reconstruction and semantic +AD modules. AD by reconstruction is implemented with a transformer-based +reconstruction module. Semantic AD is implemented with a binary classifier, +which relies on learned pseudo class names and a pretrained foundation model. +These modules are learned over two phases. Phase 1 learns the pseudo-class +names and a variational autoencoder (VAE) for feature synthesis that augments +the training data to combat long-tails. Phase 2 then learns the parameters of +the reconstruction and classification modules of LTAD. Extensive experiments +using the proposed long-tailed datasets show that LTAD substantially +outperforms the state-of-the-art methods for most forms of dataset imbalance. +The long-tailed dataset split is available at +https://zenodo.org/records/10854201 . + +
+
+ comment: This paper is accepted to CVPR 2024. The supplementary material is + included. The long-tailed dataset split is available at + https://zenodo.org/records/10854201 +
+
+
+
+
+ + ☆ U-VAP: User-specified Visual Appearance Personalization via Decoupled + Self Augmentation + + +
+ Concept personalization methods enable large text-to-image models to learn +specific subjects (e.g., objects/poses/3D models) and synthesize renditions in +new contexts. Given that the image references are highly biased towards visual +attributes, state-of-the-art personalization models tend to overfit the whole +subject and cannot disentangle visual characteristics in pixel space. In this +study, we propose a more challenging setting, namely fine-grained visual +appearance personalization. Different from existing methods, we allow users to +provide a sentence describing the desired attributes. A novel decoupled +self-augmentation strategy is proposed to generate target-related and +non-target samples to learn user-specified visual attributes. These augmented +data allow for refining the model's understanding of the target attribute while +mitigating the impact of unrelated attributes. At the inference stage, +adjustments are conducted in the semantic space through the learned target and +non-target embeddings to further enhance the disentanglement of target +attributes. Extensive experiments on various kinds of visual attributes with +SOTA personalization methods show the ability of the proposed method to mimic +target visual appearance in novel contexts, thus improving the controllability +and flexibility of personalization. +
+
+ comment: 14 pages, 13 figures, 2 tables +
+
+
+
+
+ + ☆ MTMMC: A Large-Scale Real-World Multi-Modal Camera Tracking Benchmark CVPR 2024 + + +
+ Multi-target multi-camera tracking is a crucial task that involves +identifying and tracking individuals over time using video streams from +multiple cameras. This task has practical applications in various fields, such +as visual surveillance, crowd behavior analysis, and anomaly detection. +However, due to the difficulty and cost of collecting and labeling data, +existing datasets for this task are either synthetically generated or +artificially constructed within a controlled camera network setting, which +limits their ability to model real-world dynamics and generalize to diverse +camera configurations. To address this issue, we present MTMMC, a real-world, +large-scale dataset that includes long video sequences captured by 16 +multi-modal cameras in two different environments - campus and factory - across +various time, weather, and season conditions. This dataset provides a +challenging test-bed for studying multi-camera tracking under diverse +real-world complexities and includes an additional input modality of spatially +aligned and temporally synchronized RGB and thermal cameras, which enhances the +accuracy of multi-camera tracking. MTMMC is a super-set of existing datasets, +benefiting independent fields such as person detection, re-identification, and +multiple object tracking. We provide baselines and new learning setups on this +dataset and set the reference scores for future studies. The datasets, models, +and test server will be made publicly available. + +
+
+ comment: Accepted on CVPR 2024 +
+
+
+
+
+ + ☆ H2RSVLM: Towards Helpful and Honest Remote Sensing Large Vision Language + Model + + +
+ Generic large Vision-Language Models (VLMs) are rapidly developing but +still perform poorly in the Remote Sensing (RS) domain, due to the unique +and specialized nature of RS imagery and the comparatively limited spatial +perception of current VLMs. Existing Remote Sensing specific Vision Language +Models (RSVLMs) still have considerable potential for improvement, primarily +owing to the lack of large-scale, high-quality RS vision-language datasets. We +constructed HqDC-1.4M, a large-scale dataset of high-quality and detailed captions for +RS images, containing 1.4 million image-caption pairs, which not only enhances +the RSVLM's understanding of RS images but also significantly improves the +model's spatial perception abilities, such as localization and counting, +thereby increasing the helpfulness of the RSVLM. Moreover, to address the +inevitable "hallucination" problem in RSVLM, we developed RSSA, the first +dataset aimed at enhancing the Self-Awareness capability of RSVLMs. By +incorporating a variety of unanswerable questions into typical RS visual +question-answering tasks, RSSA effectively improves the truthfulness and +reduces the hallucinations of the model's outputs, thereby enhancing the +honesty of the RSVLM. Based on these datasets, we propose H2RSVLM, the +Helpful and Honest Remote Sensing Vision Language Model. H2RSVLM has achieved +outstanding performance on multiple RS public datasets and is capable of +recognizing and refusing to answer unanswerable questions, effectively +mitigating incorrect generations. We will release the code, data and model +weights at https://github.com/opendatalab/H2RSVLM . +
+
+ comment: Equal contribution: Chao Pang, Jiang Wu; Corresponding author: + Gui-Song Xia, Conghui He +
+
+
+
+
+ + ☆ Enhancing Lithological Mapping with Spatially Constrained Bayesian + Network (SCB-Net): An Approach for Field Data-Constrained Predictions with + Uncertainty Evaluation + + +
+ Geological maps are an extremely valuable source of information for the Earth +sciences. They provide insights into mineral exploration, vulnerability to +natural hazards, and many other applications. These maps are created using +numerical or conceptual models that use geological observations to extrapolate +data. Geostatistical techniques have traditionally been used to generate +reliable predictions that take into account the spatial patterns inherent in +the data. However, as the number of auxiliary variables increases, these +methods become more labor-intensive. Additionally, traditional machine learning +methods often struggle with spatially correlated data and extracting valuable +non-linear information from geoscientific datasets. To address these +limitations, a new architecture called the Spatially Constrained Bayesian +Network (SCB-Net) has been developed. The SCB-Net aims to effectively exploit +the information from auxiliary variables while producing spatially constrained +predictions. It is made up of two parts: the first focuses on learning +underlying patterns in the auxiliary variables, while the second integrates +ground-truth data with the learned embeddings from the first part. Moreover, to +assess model uncertainty, a technique called Monte Carlo dropout is used as a +Bayesian approximation. The SCB-Net has been applied to two selected areas in +northern Quebec, Canada, and has demonstrated its potential in generating +field-data-constrained lithological maps while allowing assessment of +prediction uncertainty for decision-making. This study highlights the promising +advancements of deep neural networks in geostatistics, particularly in handling +complex spatial feature learning tasks, leading to improved spatial information +techniques. +
+
+ comment: 17 pages, 3559 words, 14 figures +
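+
+ The Monte Carlo dropout approximation mentioned above amounts to keeping
+ dropout stochastic at inference time and aggregating several forward passes;
+ a generic sketch (not the SCB-Net code) follows.
+
+```python
+import torch
+import torch.nn as nn
+
+def enable_mc_dropout(model):
+    """Switch only dropout layers to train mode so they remain stochastic at inference."""
+    model.eval()
+    for m in model.modules():
+        if isinstance(m, nn.Dropout):
+            m.train()
+
+def mc_dropout_predict(model, x, num_samples=30):
+    """Average several stochastic passes; the spread serves as an uncertainty proxy."""
+    enable_mc_dropout(model)
+    with torch.no_grad():
+        probs = torch.stack([torch.softmax(model(x), dim=-1) for _ in range(num_samples)])
+    return probs.mean(dim=0), probs.std(dim=0)
+```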
+
+
+
+
+ + ☆ Motion Inversion for Video Customization + + +
+ In this research, we present a novel approach to motion customization in +video generation, addressing the widespread gap in the thorough exploration of +motion representation within video generative models. Recognizing the unique +challenges posed by video's spatiotemporal nature, our method introduces Motion +Embeddings, a set of explicit, temporally coherent one-dimensional embeddings +derived from a given video. These embeddings are designed to integrate +seamlessly with the temporal transformer modules of video diffusion models, +modulating self-attention computations across frames without compromising +spatial integrity. Our approach offers a compact and efficient solution to +motion representation and enables complex manipulations of motion +characteristics through vector arithmetic in the embedding space. Furthermore, +we identify the Temporal Discrepancy in video generative models, which refers +to variations in how different motion modules process temporal relationships +between frames. We leverage this understanding to optimize the integration of +our motion embeddings. Our contributions include the introduction of a tailored +motion embedding for customization tasks, insights into the temporal processing +differences in video models, and a demonstration of the practical advantages +and effectiveness of our method through extensive experiments. + +
+
+ comment: Project Page: + \href{https://wileewang.github.io/MotionInversion/}{https://wileewang.github.io/MotionInversion/} +
+
+
+
+
+ + ☆ Sketch-to-Architecture: Generative AI-aided Architectural Design + + +
+ Recently, the development of large-scale models has paved the way for various +interdisciplinary research, including architecture. By using generative AI, we +present a novel workflow that utilizes AI models to generate conceptual +floorplans and 3D models from simple sketches, enabling rapid ideation and +controlled generation of architectural renderings based on textual +descriptions. Our work demonstrates the potential of generative AI in the +architectural design process, pointing towards a new direction of +computer-aided architectural design. Our project website is available at: +https://zrealli.github.io/sketch2arc + +
+
+ comment: Pacific Graphics 2023, accepted as Poster +
+
+
+
+
+ + ☆ HARMamba: Efficient Wearable Sensor Human Activity Recognition Based on + Bidirectional Selective SSM + + +
+ Wearable sensor human activity recognition (HAR) is a crucial area of +research in activity sensing. While transformer-based temporal deep learning +models have been extensively studied and implemented, their large number of +parameters presents significant challenges in terms of system computing load and +memory usage, rendering them unsuitable for real-time mobile activity +recognition applications. Recently, an efficient hardware-aware state space +model (SSM) called Mamba has emerged as a promising alternative. Mamba +demonstrates strong potential in long sequence modeling, boasts a simpler +network architecture, and offers an efficient hardware-aware design. Leveraging +SSM for activity recognition represents an appealing avenue for exploration. In +this study, we introduce HARMamba, which employs a more lightweight selective +SSM as the foundational model architecture for activity recognition. The goal +is to address the computational resource constraints encountered in real-time +activity recognition scenarios. Our approach involves processing sensor data +flow by independently learning each channel and segmenting the data into +"patches". The marked sensor sequence's position embedding serves as the input +token for the bidirectional state space model, ultimately leading to activity +categorization through the classification head. Compared to established +activity recognition frameworks like Transformer-based models, HARMamba +achieves superior performance while also reducing computational and memory +overhead. Furthermore, our proposed method has been extensively tested on four +public activity datasets: PAMAP2, WISDM, UNIMIB, and UCI, demonstrating +impressive performance in activity recognition tasks. +
+
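+
+ The per-channel patching of the sensor stream described above can be sketched
+ as a simple reshape; the patch length and tensor layout are assumptions, not
+ HARMamba's exact preprocessing.
+
+```python
+import torch
+
+def patchify_sensor_stream(x, patch_len=16):
+    """Split a multichannel sensor stream into per-channel, non-overlapping patches.
+
+    x: (B, C, T) raw stream -> (B, C, T // patch_len, patch_len) patch tokens,
+    with each channel handled independently.
+    """
+    B, C, T = x.shape
+    T_trim = (T // patch_len) * patch_len        # drop the incomplete trailing patch
+    return x[:, :, :T_trim].reshape(B, C, T_trim // patch_len, patch_len)
+```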
+
+
+
+ ☆ MCNet: A crowd density estimation network based on integrating + multiscale attention module + + +
+ Since existing metro video surveillance systems have not been able to +effectively solve the metro crowd density estimation problem, a Metro Crowd +density estimation Network (called MCNet) is proposed to automatically classify +the crowd density level of passengers. Firstly, an Integrating Multi-scale +Attention (IMA) module is proposed to enhance the ability of plain +classifiers to extract semantic crowd texture features, accommodating the +characteristics of crowd texture features. The innovation of the IMA module +is to fuse dilated convolution, multiscale feature extraction and an +attention mechanism to obtain multi-scale crowd feature activation from a +larger receptive field with lower computational cost, and to strengthen the +crowd activation state of convolutional features in the top layers. Secondly, a +novel lightweight crowd texture feature extraction network is proposed, which +can directly process video frames and automatically extract texture features +for crowd density estimation, while its faster image processing speed and fewer +network parameters make it flexible to deploy on embedded platforms with +limited hardware resources. Finally, this paper integrates the IMA module and the +lightweight crowd texture feature extraction network to construct MCNet, +and validates its feasibility on an image classification dataset +(Cifar10) and four crowd density datasets (PETS2009, Mall, QUT and SH_METRO), assessing +whether MCNet can be a suitable solution for crowd density +estimation in metro video surveillance, where there are image processing +challenges such as high density, high occlusion, perspective distortion and +limited hardware resources. +
+
+
+
+
+ + ☆ Unsupervised Tumor-Aware Distillation for Multi-Modal Brain Image + Translation IJCNN 2024 + + +
+ Multi-modal brain images from MRI scans are widely used in clinical diagnosis +to provide complementary information from different modalities. However, +obtaining fully paired multi-modal images in practice is challenging due to +various factors, such as time, cost, and artifacts, resulting in +modality-missing brain images. To address this problem, unsupervised +multi-modal brain image translation has been extensively studied. Existing +methods suffer from the problem of brain tumor deformation during translation, +as they fail to focus on the tumor areas when translating the whole images. In +this paper, we propose an unsupervised tumor-aware distillation teacher-student +network called UTAD-Net, which is capable of perceiving and translating tumor +areas precisely. Specifically, our model consists of two parts: a teacher +network and a student network. The teacher network learns an end-to-end mapping +from source to target modality using unpaired images and corresponding tumor +masks first. Then, the translation knowledge is distilled into the student +network, enabling it to generate more realistic tumor areas and whole images +without masks. Experiments show that our model achieves competitive performance +on both quantitative and qualitative evaluations of image quality compared with +state-of-the-art methods. Furthermore, we demonstrate the effectiveness of the +generated images on downstream segmentation tasks. Our code is available at +https://github.com/scut-HC/UTAD-Net. + +
+
+ comment: 8 pages, 5 figures. It has been provisionally accepted for IJCNN 2024 +
+
+
+
+
+ + ☆ HGS-Mapping: Online Dense Mapping Using Hybrid Gaussian Representation + in Urban Scenes + + +
+ Online dense mapping of urban scenes forms a fundamental cornerstone for +scene understanding and navigation of autonomous vehicles. Recent advancements +in mapping methods are mainly based on NeRF, whose rendering speed is too slow +to meet online requirements. 3D Gaussian Splatting (3DGS), with its rendering +speed hundreds of times faster than NeRF, holds greater potential in online +dense mapping. However, integrating 3DGS into a street-view dense mapping +framework still faces two challenges, including incomplete reconstruction due +to the absence of geometric information beyond the LiDAR coverage area and +extensive computation for reconstruction in large urban scenes. To this end, we +propose HGS-Mapping, an online dense mapping framework in unbounded large-scale +scenes. To attain complete reconstruction, our framework introduces Hybrid +Gaussian Representation, which models different parts of the entire scene using +Gaussians with distinct properties. Furthermore, we employ a hybrid Gaussian +initialization mechanism and an adaptive update method to achieve high-fidelity +and rapid reconstruction. To the best of our knowledge, we are the first to +integrate Gaussian representation into online dense mapping of urban scenes. +Our approach achieves SOTA reconstruction accuracy while employing only 66% of the +number of Gaussians, leading to a 20% faster reconstruction speed. +
+
+
+
+
+ + ☆ Talk3D: High-Fidelity Talking Portrait Synthesis via Personalized 3D + Generative Prior + + +
+ Recent methods for audio-driven talking head synthesis often optimize neural +radiance fields (NeRF) on a monocular talking portrait video, leveraging its +capability to render high-fidelity and 3D-consistent novel-view frames. +However, they often struggle to reconstruct complete face geometry due to the +absence of comprehensive 3D information in the input monocular videos. In this +paper, we introduce a novel audio-driven talking head synthesis framework, +called Talk3D, that can faithfully reconstruct its plausible facial geometries +by effectively adopting the pre-trained 3D-aware generative prior. Given the +personalized 3D generative model, we present a novel audio-guided attention +U-Net architecture that predicts the dynamic face variations in the NeRF space +driven by audio. Furthermore, our model is further modulated by audio-unrelated +conditioning tokens which effectively disentangle variations unrelated to audio +features. Compared to existing methods, our method excels in generating +realistic facial geometries even under extreme head poses. We also conduct +extensive experiments showing our approach surpasses state-of-the-art +benchmarks in terms of both quantitative and qualitative evaluations. + +
+
+ comment: Project page: https://ku-cvlab.github.io/Talk3D/ +
+
+
+
+
+ + ☆ StegoGAN: Leveraging Steganography for Non-Bijective Image-to-Image + Translation + + +
+ Most image-to-image translation models postulate that a unique correspondence +exists between the semantic classes of the source and target domains. However, +this assumption does not always hold in real-world scenarios due to divergent +distributions, different class sets, and asymmetrical information +representation. As conventional GANs attempt to generate images that match the +distribution of the target domain, they may hallucinate spurious instances of +classes absent from the source domain, thereby diminishing the usefulness and +reliability of translated images. CycleGAN-based methods are also known to hide +the mismatched information in the generated images to bypass cycle consistency +objectives, a process known as steganography. In response to the challenge of +non-bijective image translation, we introduce StegoGAN, a novel model that +leverages steganography to prevent spurious features in generated images. Our +approach enhances the semantic consistency of the translated images without +requiring additional postprocessing or supervision. Our experimental +evaluations demonstrate that StegoGAN outperforms existing GAN-based models +across various non-bijective image-to-image translation tasks, both +qualitatively and quantitatively. Our code and pretrained models are accessible +at https://github.com/sian-wusidi/StegoGAN. + +
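StegoGAN builds on CycleGAN-style training; as background for the steganography discussion above, here is a minimal sketch of the generic cycle-consistency objective that the hiding behaviour exploits (toy generators, not StegoGAN's actual anti-steganography mechanism):

```python
# Generic CycleGAN-style cycle-consistency loss; generator definitions are
# placeholders for illustration only.
import torch
import torch.nn as nn

G_AB = nn.Sequential(nn.Conv2d(3, 3, 3, padding=1))  # toy generator A -> B
G_BA = nn.Sequential(nn.Conv2d(3, 3, 3, padding=1))  # toy generator B -> A

real_A = torch.randn(2, 3, 64, 64)
real_B = torch.randn(2, 3, 64, 64)

fake_B = G_AB(real_A)          # translate A -> B
rec_A  = G_BA(fake_B)          # reconstruct A from the fake B
fake_A = G_BA(real_B)
rec_B  = G_AB(fake_A)

l1 = nn.L1Loss()
# A generator can "hide" information inside fake_B so that rec_A stays easy
# even when classes in A have no counterpart in B -- the steganography effect.
cycle_loss = l1(rec_A, real_A) + l1(rec_B, real_B)
```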
+
+
+
+
+ + ☆ ECLIPSE: Efficient Continual Learning in Panoptic Segmentation with + Visual Prompt Tuning CVPR 2024 + + +
+ Panoptic segmentation, combining semantic and instance segmentation, stands +as a cutting-edge computer vision task. Despite recent progress with deep +learning models, the dynamic nature of real-world applications necessitates +continual learning, where models adapt to new classes (plasticity) over time +without forgetting old ones (catastrophic forgetting). Current continual +segmentation methods often rely on distillation strategies like knowledge +distillation and pseudo-labeling, which are effective but result in increased +training complexity and computational overhead. In this paper, we introduce a +novel and efficient method for continual panoptic segmentation based on Visual +Prompt Tuning, dubbed ECLIPSE. Our approach involves freezing the base model +parameters and fine-tuning only a small set of prompt embeddings, addressing +both catastrophic forgetting and plasticity and significantly reducing the +trainable parameters. To mitigate inherent challenges such as error propagation +and semantic drift in continual segmentation, we propose logit manipulation to +effectively leverage common knowledge across the classes. Experiments on ADE20K +continual panoptic segmentation benchmark demonstrate the superiority of +ECLIPSE, notably its robustness against catastrophic forgetting and its +reasonable plasticity, achieving a new state-of-the-art. The code is available +at https://github.com/clovaai/ECLIPSE. + +
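A minimal sketch of the visual-prompt-tuning recipe this abstract describes, i.e., freezing the base model and learning only a small set of prompt embeddings; the class and variable names are illustrative, not the ECLIPSE code:

```python
# Freeze a backbone and train only a handful of prompt embeddings.
import torch
import torch.nn as nn

class PromptedEncoder(nn.Module):
    def __init__(self, backbone, num_prompts=10, dim=256):
        super().__init__()
        self.backbone = backbone
        for p in self.backbone.parameters():
            p.requires_grad = False                          # base model frozen
        self.prompts = nn.Parameter(torch.zeros(num_prompts, dim))  # trainable

    def forward(self, tokens):                               # tokens: (B, N, dim)
        prompts = self.prompts.unsqueeze(0).expand(tokens.size(0), -1, -1)
        return self.backbone(torch.cat([prompts, tokens], dim=1))

backbone = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=256, nhead=8, batch_first=True), 2)
model = PromptedEncoder(backbone)
trainable = [p for p in model.parameters() if p.requires_grad]
print(sum(p.numel() for p in trainable))   # only the prompt embeddings are trained
```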
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Segmentation, Classification and Interpretation of Breast Cancer Medical + Images using Human-in-the-Loop Machine Learning + + +
+ This paper explores the application of Human-in-the-Loop (HITL) strategies in +training machine learning models in the medical domain. In this case, a +doctor-in-the-loop approach is proposed to leverage human expertise in dealing +with large and complex data. Specifically, the paper deals with the integration +of genomic data and Whole Slide Imaging (WSI) analysis of breast cancer. Three +different tasks were developed: segmentation of histopathological images, +classification of these images according to the genomic subtype of the cancer and, +finally, interpretation of the machine learning results. The involvement of a +pathologist helped us to develop a better segmentation model and to enhance the +explanatory capabilities of the models, but the classification results were +suboptimal, highlighting the limitations of this approach: despite involving +human experts, complex domains can still pose challenges, and a HITL approach +may not always be effective. + 
+
+
+
+
+ + ☆ Aggregating Local and Global Features via Selective State Spaces Model + for Efficient Image Deblurring + + +
+ Image deblurring is a process of restoring a high-quality image from the +corresponding blurred image. Significant progress in this field has been made +possible by the emergence of various effective deep learning models, including +CNNs and Transformers. However, these methods often face the dilemma between +eliminating long-range blur degradation perturbations and maintaining +computational efficiency, which hinders their practical application. To address +this issue, we propose an efficient image deblurring network that leverages a +selective structured state-space model to aggregate enriched and accurate +features. Specifically, we design an aggregate local and global block +(ALGBlock) to capture and fuse both local invariant properties and non-local +information. The ALGBlock consists of two blocks: (1) The local block models +local connectivity using simplified channel attention. (2) The global block +captures long-range dependency features with linear complexity through +selective structured state spaces. Since image details +are local features of images, we accentuate the local branch for restoration by +recalibrating its weight when aggregating the two branches for recovery. +Experimental results demonstrate that the proposed method outperforms +state-of-the-art approaches on widely used benchmarks, highlighting its +superior performance. + 
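In the spirit of the local/global aggregation described above, a small sketch of a two-branch block with a learnable recalibration weight that emphasizes the local (detail) branch. The global branch here is a stand-in (a large depthwise convolution) for the selective state-space model, which would require a dedicated Mamba/SSM kernel; all names are illustrative:

```python
# Local branch with simplified channel attention + placeholder global branch,
# fused with a learnable weight that accentuates local details.
import torch
import torch.nn as nn

class LocalGlobalBlock(nn.Module):
    def __init__(self, ch):
        super().__init__()
        # local branch: simplified channel attention on a depthwise conv
        self.sca = nn.Sequential(nn.AdaptiveAvgPool2d(1), nn.Conv2d(ch, ch, 1))
        self.local = nn.Conv2d(ch, ch, 3, padding=1, groups=ch)
        # global branch: stand-in for the selective SSM (large receptive field)
        self.global_ = nn.Conv2d(ch, ch, 7, padding=3, groups=ch)
        self.w = nn.Parameter(torch.tensor(1.0))  # recalibration weight

    def forward(self, x):
        local = self.local(x) * self.sca(x)
        global_ = self.global_(x)
        return x + self.w * local + global_       # local part gets its own weight

x = torch.randn(1, 32, 64, 64)
print(LocalGlobalBlock(32)(x).shape)
```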
+
+
+
+
+ + ☆ FreeSeg-Diff: Training-Free Open-Vocabulary Segmentation with Diffusion + Models + + +
+ Foundation models have exhibited unprecedented capabilities in tackling many +domains and tasks. Models such as CLIP are currently widely used to bridge +cross-modal representations, and text-to-image diffusion models are arguably +the leading models in terms of realistic image generation. Image generative +models are trained on massive datasets that provide them with powerful internal +spatial representations. In this work, we explore the potential benefits of +such representations, beyond image generation, in particular, for dense visual +prediction tasks. We focus on the task of image segmentation, which is +traditionally solved by training models on closed-vocabulary datasets, with +pixel-level annotations. To avoid the annotation cost or training large +diffusion models, we constrain our setup to be zero-shot and training-free. In +a nutshell, our pipeline leverages different and relatively small-sized, +open-source foundation models for zero-shot open-vocabulary segmentation. The +pipeline is as follows: the image is passed to both a captioner model (i.e., +BLIP) and a diffusion model (i.e., Stable Diffusion Model) to generate a text +description and visual representation, respectively. The features are clustered +and binarized to obtain class-agnostic masks for each object. These masks are +then mapped to a textual class, using the CLIP model to support +open-vocabulary labeling. Finally, we add a refinement step that allows us to obtain a more +precise segmentation mask. Our approach (dubbed FreeSeg-Diff), which does not +rely on any training, outperforms many training-based approaches on both Pascal +VOC and COCO datasets. In addition, we show very competitive results compared +to the recent weakly-supervised segmentation approaches. We provide +comprehensive experiments showing the superiority of diffusion model features +compared to other pretrained models. Project page: +https://bcorrad.github.io/freesegdiff/ + 
+
+
+
+
+ + ☆ RealKIE: Five Novel Datasets for Enterprise Key Information Extraction + + +
+ We introduce RealKIE, a benchmark of five challenging datasets aimed at +advancing key information extraction methods, with an emphasis on enterprise +applications. The datasets include a diverse range of documents including SEC +S1 Filings, US Non-disclosure Agreements, UK Charity Reports, FCC Invoices, and +Resource Contracts. Each presents unique challenges: poor text serialization, +sparse annotations in long documents, and complex tabular layouts. These +datasets provide a realistic testing ground for key information extraction +tasks like investment analysis and legal data processing. + In addition to presenting these datasets, we offer an in-depth description of +the annotation process, document processing techniques, and baseline modeling +approaches. This contribution facilitates the development of NLP models capable +of handling practical challenges and supports further research into information +extraction technologies applicable to industry-specific problems. + The annotated data and OCR outputs are available to download at +https://indicodatasolutions.github.io/RealKIE/ code to reproduce the baselines +will be available shortly. + +
+
+
+
+
+ + ☆ Modeling Weather Uncertainty for Multi-weather Co-Presence Estimation + + +
+ Images from outdoor scenes may be taken under various weather conditions. It +is well studied that weather impacts the performance of computer vision +algorithms and needs to be handled properly. However, existing algorithms model +weather conditions as discrete states and estimate them using multi-label +classification. The fact is that, physically, specifically in meteorology, +weather is modeled as a continuous and transitional state. Instead of +directly implementing hard classification as existing multi-weather +classification methods do, we consider the physical formulation of +multi-weather conditions and model the impact of physics-related parameters on +learning from the image appearance. In this paper, we start with a thorough revisit +of the physical definition of weather and how it can be described as a +continuous machine learning and computer vision task. Namely, we propose to +model the weather uncertainty, where the level of probability and co-existence +of multiple weather conditions are both considered. A Gaussian mixture model is +used to encapsulate the weather uncertainty and an uncertainty-aware +multi-weather learning scheme is proposed based on prior-posterior learning. A +novel multi-weather co-presence estimation transformer (MeFormer) is proposed. +In addition, a new multi-weather co-presence estimation (MePe) dataset, along +with 14 fine-grained weather categories and 16,078 samples, is proposed to +benchmark both the conventional multi-label weather classification task and the +multi-weather co-presence estimation task. Large-scale experiments show that +the proposed method achieves state-of-the-art performance and substantial +generalization capabilities on both the conventional multi-label weather +classification task and the proposed multi-weather co-presence estimation task. +Besides, modeling weather uncertainty also benefits adverse-weather semantic +segmentation. + 
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Selective Attention-based Modulation for Continual Learning + + +
+ We present SAM, a biologically-plausible selective attention-driven +modulation approach to enhance classification models in a continual learning +setting. Inspired by neurophysiological evidence that the primary visual cortex +does not contribute to object manifold untangling for categorization and that +primordial attention biases are still embedded in the modern brain, we propose +to employ auxiliary saliency prediction features as a modulation signal to +drive and stabilize the learning of a sequence of non-i.i.d. classification +tasks. Experimental results confirm that SAM effectively enhances the +performance (in some cases up to about twenty percent points) of +state-of-the-art continual learning methods, both in class-incremental and +task-incremental settings. Moreover, we show that attention-based modulation +successfully encourages the learning of features that are more robust to the +presence of spurious features and to adversarial attacks than baseline methods. +Code is available at: https://github.com/perceivelab/SAM. + +
+
+
+
+
+ + ☆ Mixed-precision Supernet Training from Vision Foundation Models using + Low Rank Adapter + + +
+ Compression of large and performant vision foundation models (VFMs) into +arbitrary bit-wise operations (BitOPs) allows their deployment on various +hardware. We propose to fine-tune a VFM to a mixed-precision quantized +supernet. The supernet-based neural architecture search (NAS) can be adopted +for this purpose, which trains a supernet, and then subnets within arbitrary +hardware budgets can be extracted. However, existing methods face difficulties +in optimizing the mixed-precision search space and incur large memory costs +during training. To tackle these challenges, first, we study the effective +search space design for fine-tuning a VFM by comparing different operators +(such as resolution, feature size, width, depth, and bit-widths) in terms of +performance and BitOPs reduction. Second, we propose memory-efficient supernet +training using a low-rank adapter (LoRA) and a progressive training strategy. +The proposed method is evaluated for the recently proposed VFM, Segment +Anything Model, fine-tuned on segmentation tasks. The searched model yields +about a 95% reduction in BitOPs without incurring performance degradation. + 
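For readers unfamiliar with the LoRA ingredient mentioned above, a minimal sketch of a low-rank adapter on a linear layer (generic LoRA, not the paper's supernet training code):

```python
# Minimal LoRA adapter: the frozen weight W is augmented with a low-rank
# update B @ A, so only A and B receive gradients.
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, rank=4, alpha=8.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False               # frozen pre-trained weight
        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))
        self.scale = alpha / rank

    def forward(self, x):
        return self.base(x) + self.scale * (x @ self.A.t() @ self.B.t())

layer = LoRALinear(nn.Linear(768, 768))
y = layer(torch.randn(2, 768))                    # same shape as the base layer
```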
+
+
+
+
+ + ☆ SGD: Street View Synthesis with Gaussian Splatting and Diffusion Prior + + +
+ Novel View Synthesis (NVS) for street scenes plays a critical role in +autonomous driving simulation. The current mainstream technique to achieve it +is neural rendering, such as Neural Radiance Fields (NeRF) and 3D Gaussian +Splatting (3DGS). Although thrilling progress has been made, when handling +street scenes, current methods struggle to maintain rendering quality at +viewpoints that deviate significantly from the training viewpoints. This issue +stems from the sparse training views captured by a fixed camera on a moving +vehicle. To tackle this problem, we propose a novel approach that enhances the +capacity of 3DGS by leveraging a prior from a Diffusion Model along with +complementary multi-modal data. Specifically, we first fine-tune a Diffusion +Model by adding images from adjacent frames as conditions, while exploiting +depth data from LiDAR point clouds to supply additional spatial information. +Then we apply the Diffusion Model to regularize the 3DGS at unseen views during +training. Experimental results validate the effectiveness of our method +compared with current state-of-the-art models, and demonstrate its advantage in +rendering images from broader views. + 
+
+
+
+
+ + ☆ Negative Label Guided OOD Detection with Pretrained Vision-Language + Models ICLR 2024 + + +
+ Out-of-distribution (OOD) detection aims at identifying samples from unknown +classes, playing a crucial role in trustworthy models against errors on +unexpected inputs. Extensive research has been dedicated to exploring OOD +detection in the vision modality. Vision-language models (VLMs) can leverage +both textual and visual information for various multi-modal applications, +whereas few OOD detection methods take into account information from the text +modality. In this paper, we propose a novel post hoc OOD detection method, +called NegLabel, which takes a vast number of negative labels from extensive +corpus databases. We design a novel scheme for the OOD score collaborated with +negative labels. Theoretical analysis helps to understand the mechanism of +negative labels. Extensive experiments demonstrate that our method NegLabel +achieves state-of-the-art performance on various OOD detection benchmarks and +generalizes well on multiple VLM architectures. Furthermore, our method +NegLabel exhibits remarkable robustness against diverse domain shifts. The +codes are available at https://github.com/tmlr-group/NegLabel. + +
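Schematically, scoring with negative labels can be sketched as below; this is a simplified illustration of the idea (the exact NegLabel scoring scheme and label-mining procedure differ), and all tensors are synthetic placeholders:

```python
# Simplified negative-label OOD score: compare an image embedding against
# in-distribution (ID) label embeddings plus a large pool of negative labels,
# then use the share of similarity mass assigned to ID labels as the score.
import torch
import torch.nn.functional as F

def neglabel_score(img_emb, id_txt_emb, neg_txt_emb, tau=0.01):
    img_emb = F.normalize(img_emb, dim=-1)
    txt = F.normalize(torch.cat([id_txt_emb, neg_txt_emb]), dim=-1)
    sims = img_emb @ txt.t() / tau                       # (B, n_id + n_neg)
    probs = sims.softmax(dim=-1)
    return probs[:, : id_txt_emb.size(0)].sum(dim=-1)    # high -> likely ID

score = neglabel_score(torch.randn(4, 512),     # image embeddings
                       torch.randn(10, 512),    # ID class-name embeddings
                       torch.randn(1000, 512))  # negative-label embeddings
print(score)
```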
+
+ comment: ICLR 2024 Spotlight +
+
+
+
+
+ + ☆ Revolutionizing Disease Diagnosis with simultaneous functional PET/MR + and Deeply Integrated Brain Metabolic, Hemodynamic, and Perfusion Networks + + +
+ Simultaneous functional PET/MR (sf-PET/MR) presents a cutting-edge multimodal +neuroimaging technique. It provides an unprecedented opportunity for +concurrently monitoring and integrating multifaceted brain networks built by +spatiotemporally covaried metabolic activity, neural activity, and cerebral +blood flow (perfusion). Despite its high scientific and clinical value, the limited +hardware accessibility of PET/MR hinders its applications, let alone modern +AI-based PET/MR fusion models. Our objective is to develop a clinically +feasible AI-based disease diagnosis model trained on comprehensive sf-PET/MR +data that, during inference, allows single-modality input +(e.g., PET only) while preserving multimodal-level accuracy. To this end, +we propose MX-ARM, a multimodal MiXture-of-experts Alignment and Reconstruction +Model. It is modality detachable and exchangeable, allocating different +multi-layer perceptrons dynamically ("mixture of experts") through learnable +weights to learn respective representations from different modalities. This +design does not sacrifice model performance in the uni-modal situation. To fully +exploit the inherent complex and nonlinear relation among modalities while +producing fine-grained representations for uni-modal inference, we subsequently +add a modal alignment module to line up a dominant modality (e.g., PET) with +representations of auxiliary modalities (MR). We further adopt multimodal +reconstruction to promote the quality of learned features. Experiments on +precious multimodal sf-PET/MR data for Mild Cognitive Impairment diagnosis +showcase the efficacy of our model toward clinically feasible precision +medicine. + 
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Embracing Unknown Step by Step: Towards Reliable Sparse Training in Real + World + + +
+ Sparse training has emerged as a promising method for resource-efficient deep +neural networks (DNNs) in real-world applications. However, the reliability of +sparse models remains a crucial concern, particularly in detecting unknown +out-of-distribution (OOD) data. This study addresses the knowledge gap by +investigating the reliability of sparse training from an OOD perspective and +reveals that sparse training exacerbates OOD unreliability. The lack of unknown +information and the sparse constraints hinder the effective exploration of +weight space and accurate differentiation between known and unknown knowledge. +To tackle these challenges, we propose a new unknown-aware sparse training +method, which incorporates a loss modification, auto-tuning strategy, and a +voting scheme to guide weight space exploration and mitigate confusion between +known and unknown information without incurring significant additional costs or +requiring access to additional OOD data. Theoretical insights demonstrate how +our method reduces model confidence when faced with OOD samples. Empirical +experiments across multiple datasets, model architectures, and sparsity levels +validate the effectiveness of our method, with improvements of up to +\textbf{8.4\%} in AUROC while maintaining comparable or higher accuracy and +calibration. This research enhances the understanding and readiness of sparse +DNNs for deployment in resource-limited applications. Our code is available on: +\url{https://github.com/StevenBoys/MOON}. + +
+
+
+
+
+ + ☆ UltraLight VM-UNet: Parallel Vision Mamba Significantly Reduces + Parameters for Skin Lesion Segmentation + + +
+ Traditionally, to improve the segmentation performance of models, most +approaches add increasingly complex modules. This is not suitable +for the medical field, especially for mobile medical devices, where +computationally heavy models are unsuitable for real clinical environments +due to resource constraints. Recently, state-space models (SSMs), +represented by Mamba, have become a strong competitor to traditional CNNs and +Transformers. In this paper, we deeply explore the key elements of parameter +influence in Mamba and propose an UltraLight Vision Mamba UNet (UltraLight +VM-UNet) based on this. Specifically, we propose a method for processing +features in parallel Vision Mamba, named PVM Layer, which achieves excellent +performance with the lowest computational load while keeping the overall number +of processing channels constant. We conducted comparisons and ablation +experiments with several state-of-the-art lightweight models on three skin +lesion public datasets and demonstrated that the UltraLight VM-UNet exhibits +equally strong performance competitiveness with only 0.049M parameters and +0.060 GFLOPs. In addition, this study deeply explores the key elements of +parameter influence in Mamba, laying a theoretical foundation for Mamba +to possibly become a new mainstream lightweight module in the future. +The code is available from https://github.com/wurenkai/UltraLight-VM-UNet. + 
+
+
+
+
+ + ☆ NeSLAM: Neural Implicit Mapping and Self-Supervised Feature Tracking + With Depth Completion and Denoising + + +
+ In recent years, there have been significant advancements in 3D +reconstruction and dense RGB-D SLAM systems. One notable development is the +application of Neural Radiance Fields (NeRF) in these systems, which utilizes +implicit neural representation to encode 3D scenes. This extension of NeRF to +SLAM has shown promising results. However, the depth images obtained from +consumer-grade RGB-D sensors are often sparse and noisy, which poses +significant challenges for 3D reconstruction and affects the accuracy of the +representation of the scene geometry. Moreover, the original hierarchical +feature grid with occupancy value is inaccurate for scene geometry +representation. Furthermore, the existing methods select random pixels for +camera tracking, which leads to inaccurate localization and is not robust in +real-world indoor environments. To this end, we present NeSLAM, an advanced +framework that achieves accurate and dense depth estimation, robust camera +tracking, and realistic synthesis of novel views. First, a depth completion and +denoising network is designed to provide dense geometry prior and guide the +neural implicit representation optimization. Second, the occupancy scene +representation is replaced with Signed Distance Field (SDF) hierarchical scene +representation for high-quality reconstruction and view synthesis. Furthermore, +we also propose a NeRF-based self-supervised feature tracking algorithm for +robust real-time tracking. Experiments on various indoor datasets demonstrate +the effectiveness and accuracy of the system in reconstruction, tracking +quality, and novel view synthesis. + +
+
+
+
+
+ + ☆ HO-Gaussian: Hybrid Optimization of 3D Gaussian Splatting for Urban + Scenes + + +
+ The rapid growth of 3D Gaussian Splatting (3DGS) has revolutionized neural +rendering, enabling real-time production of high-quality renderings. However, +the previous 3DGS-based methods have limitations in urban scenes due to +reliance on initial Structure-from-Motion(SfM) points and difficulties in +rendering distant, sky and low-texture areas. To overcome these challenges, we +propose a hybrid optimization method named HO-Gaussian, which combines a +grid-based volume with the 3DGS pipeline. HO-Gaussian eliminates the dependency +on SfM point initialization, allowing for rendering of urban scenes, and +incorporates the Point Densitification to enhance rendering quality in +problematic regions during training. Furthermore, we introduce Gaussian +Direction Encoding as an alternative for spherical harmonics in the rendering +pipeline, which enables view-dependent color representation. To account for +multi-camera systems, we introduce neural warping to enhance object consistency +across different cameras. Experimental results on widely used autonomous +driving datasets demonstrate that HO-Gaussian achieves photo-realistic +rendering in real-time on multi-camera urban datasets. + +
+
+
+
+
+ + ☆ A Unified Framework for Human-centric Point Cloud Video Understanding CVPR 2024 + + +
+ Human-centric Point Cloud Video Understanding (PVU) is an emerging field +focused on extracting and interpreting human-related features from sequences of +human point clouds, further advancing downstream human-centric tasks and +applications. Previous works usually focus on tackling one specific task and +rely on huge amounts of labeled data, which leads to poor generalization capability. +Considering that humans have specific characteristics, including the structural +semantics of the human body and the dynamics of human motion, we propose a unified +framework to make full use of the prior knowledge and explore the inherent +features in the data itself for generalized human-centric point cloud video +understanding. Extensive experiments demonstrate that our method achieves +state-of-the-art performance on various human-related tasks, including action +recognition and 3D pose estimation. All datasets and code will be released +soon. + 
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ FSMR: A Feature Swapping Multi-modal Reasoning Approach with Joint + Textual and Visual Clues + + +
+ Multi-modal reasoning plays a vital role in bridging the gap between textual +and visual information, enabling a deeper understanding of the context. This +paper presents the Feature Swapping Multi-modal Reasoning (FSMR) model, +designed to enhance multi-modal reasoning through feature swapping. FSMR +leverages a pre-trained visual-language model as an encoder, accommodating both +text and image inputs for effective feature representation from both +modalities. It introduces a unique feature swapping module, enabling the +exchange of features between identified objects in images and corresponding +vocabulary words in text, thereby enhancing the model's comprehension of the +interplay between images and text. To further bolster its multi-modal alignment +capabilities, FSMR incorporates a multi-modal cross-attention mechanism, +facilitating the joint modeling of textual and visual information. During +training, we employ image-text matching and cross-entropy losses to ensure +semantic consistency between visual and language elements. Extensive +experiments on the PMR dataset demonstrate FSMR's superiority over +state-of-the-art baseline models across various performance metrics. + +
+
+
+
+
+ + ☆ Psychometry: An Omnifit Model for Image Reconstruction from Human Brain + Activity CVPR 2024 + + +
+ Reconstructing the viewed images from human brain activity bridges human and +computer vision through the Brain-Computer Interface. The inherent variability +in brain function between individuals leads existing literature to focus on +acquiring separate models for each individual using their respective brain +signal data, ignoring commonalities between these data. In this article, we +devise Psychometry, an omnifit model for reconstructing images from functional +Magnetic Resonance Imaging (fMRI) obtained from different subjects. Psychometry +incorporates an omni mixture-of-experts (Omni MoE) module where all the experts +work together to capture the inter-subject commonalities, while each expert +associated with subject-specific parameters copes with the individual +differences. Moreover, Psychometry is equipped with a retrieval-enhanced +inference strategy, termed Ecphory, which aims to enhance the learned fMRI +representation via retrieving from prestored subject-specific memories. These +designs collectively render Psychometry omnifit and efficient, enabling it to +capture both inter-subject commonality and individual specificity across +subjects. As a result, the enhanced fMRI representations serve as conditional +signals to guide a generation model to reconstruct high-quality and realistic +images, establishing Psychometry as state-of-the-art in terms of both +high-level and low-level metrics. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ SCINeRF: Neural Radiance Fields from a Snapshot Compressive Image + + +
+ In this paper, we explore the potential of Snapshot Compressive Imaging (SCI) +technique for recovering the underlying 3D scene representation from a single +temporal compressed image. SCI is a cost-effective method that enables the +recording of high-dimensional data, such as hyperspectral or temporal +information, into a single image using low-cost 2D imaging sensors. To achieve +this, a series of specially designed 2D masks are usually employed, which not +only reduces storage requirements but also offers potential privacy protection. +Inspired by this, to take one step further, our approach builds upon the +powerful 3D scene representation capabilities of neural radiance fields (NeRF). +Specifically, we formulate the physical imaging process of SCI as part of the +training of NeRF, allowing us to exploit its impressive performance in +capturing complex scene structures. To assess the effectiveness of our method, +we conduct extensive evaluations using both synthetic data and real data +captured by our SCI system. Extensive experimental results demonstrate that our +proposed approach surpasses the state-of-the-art methods in terms of image +reconstruction and novel view image synthesis. Moreover, our method also +exhibits the ability to restore high frame-rate multi-view consistent images by +leveraging SCI and the rendering capabilities of NeRF. The code is available at +https://github.com/WU-CVGL/SCINeRF. + +
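The physical imaging process that the abstract folds into NeRF training can be sketched very compactly; the toy forward model below (synthetic frames and masks, not the authors' SCI system) shows how a stack of frames collapses into one coded snapshot:

```python
# Toy snapshot-compressive-imaging (SCI) forward model: a stack of frames is
# modulated by per-frame binary masks and summed into a single 2D measurement.
import torch

T, H, W = 8, 64, 64
frames = torch.rand(T, H, W)                  # latent high-speed frames
masks  = (torch.rand(T, H, W) > 0.5).float()  # coded 2D masks
measurement = (masks * frames).sum(dim=0)     # single compressed snapshot
print(measurement.shape)                      # torch.Size([64, 64])
```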
+
+
+
+
+ + ☆ DerainNeRF: 3D Scene Estimation with Adhesive Waterdrop Removal + + +
+ When capturing images through glass during rainy or snowy weather +conditions, the resulting images often contain waterdrops adhered to the glass +surface, and these waterdrops significantly degrade image quality and the +performance of many computer vision algorithms. To tackle these limitations, we +propose a method to reconstruct the clear 3D scene implicitly from multi-view +images degraded by waterdrops. Our method exploits an attention network to +predict the location of waterdrops and then trains a Neural Radiance Field to +recover the 3D scene implicitly. By leveraging the strong scene representation +capabilities of NeRF, our method can render high-quality novel-view images with +waterdrops removed. Extensive experimental results on both synthetic and real +datasets show that our method is able to generate clear 3D scenes and +outperforms existing state-of-the-art (SOTA) image adhesive waterdrop removal +methods. + 
+
+
+
+
+ + ☆ Colorful Cutout: Enhancing Image Data Augmentation with Curriculum + Learning ICLR 2024 + + +
+ Data augmentation is one of the regularization strategies for the training of +deep learning models, which enhances generalizability and prevents overfitting, +leading to performance improvement. Although researchers have proposed various +data augmentation techniques, they often lack consideration for the difficulty +of augmented data. Recently, another line of research suggests incorporating +the concept of curriculum learning with data augmentation in the field of +natural language processing. In this study, we adopt curriculum data +augmentation for image data augmentation and propose colorful cutout, which +gradually increases the noise and difficulty introduced in the augmented image. +Our experimental results highlight the possibility of curriculum data +augmentation for image data. We publicly released our source code to improve +the reproducibility of our study. + +
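A small sketch of the curriculum idea described above: as training progresses, erase more patches and fill them with random colors. The schedule and patch sizes here are illustrative assumptions, not the paper's exact recipe:

```python
# Curriculum "colorful cutout" sketch: difficulty grows with the epoch index.
import torch

def colorful_cutout(img, epoch, max_epochs, base=16):
    # img: (C, H, W) tensor in [0, 1]
    img = img.clone()
    c, h, w = img.shape
    n_patches = 1 + int(3 * epoch / max_epochs)          # more patches later
    size = max(4, base - int(8 * epoch / max_epochs))    # smaller patches later
    for _ in range(n_patches):
        y = torch.randint(0, h - size, (1,)).item()
        x = torch.randint(0, w - size, (1,)).item()
        img[:, y:y + size, x:x + size] = torch.rand(c, 1, 1)  # random color fill
    return img

aug = colorful_cutout(torch.rand(3, 32, 32), epoch=5, max_epochs=10)
print(aug.shape)
```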
+
+ comment: ICLR 2024 Tiny Papers +
+
+
+
+
+ + ☆ Grounding and Enhancing Grid-based Models for Neural Fields CVPR24 + + +
+ Many contemporary studies utilize grid-based models for neural field +representation, but a systematic analysis of grid-based models is still +missing, hindering the improvement of those models. Therefore, this paper +introduces a theoretical framework for grid-based models. This framework points +out that these models' approximation and generalization behaviors are +determined by grid tangent kernels (GTK), which are intrinsic properties of +grid-based models. The proposed framework facilitates a consistent and +systematic analysis of diverse grid-based models. Furthermore, the introduced +framework motivates the development of a novel grid-based model named the +Multiplicative Fourier Adaptive Grid (MulFAGrid). The numerical analysis +demonstrates that MulFAGrid exhibits a lower generalization bound than its +predecessors, indicating its robust generalization performance. Empirical +studies reveal that MulFAGrid achieves state-of-the-art performance in various +tasks, including 2D image fitting, 3D signed distance field (SDF) +reconstruction, and novel view synthesis, demonstrating superior representation +ability. The project website is available at +https://sites.google.com/view/cvpr24-2034-submission/home. + +
+
+ comment: Accepted in CVPR24 +
+
+
+
+
+ + ☆ Stable Surface Regularization for Fast Few-Shot NeRF 3DV 2024 + + +
+ This paper proposes an algorithm for synthesizing novel views under few-shot +setup. The main concept is to develop a stable surface regularization technique +called Annealing Signed Distance Function (ASDF), which anneals the surface in +a coarse-to-fine manner to accelerate convergence speed. We observe that the +Eikonal loss - which is a widely known geometric regularization - requires +dense training signal to shape different level-sets of SDF, leading to +low-fidelity results under few-shot training. In contrast, the proposed surface +regularization successfully reconstructs scenes and produce high-fidelity +geometry with stable training. Our method is further accelerated by utilizing +grid representation and monocular geometric priors. Finally, the proposed +approach is up to 45 times faster than existing few-shot novel view synthesis +methods, and it produces comparable results in the ScanNet dataset and +NeRF-Real dataset. + +
+
+ comment: 3DV 2024 +
+
+
+
+
+ + ☆ A multi-stage semi-supervised learning for ankle fracture classification + on CT images + + +
+ Because of the complicated mechanism of ankle injury, it is very difficult to +diagnose ankle fractures in the clinic. In order to simplify the process of fracture +diagnosis, an automatic diagnosis model of ankle fracture is proposed. +Firstly, a tibia-fibula segmentation network is proposed for the joint +tibiofibular region of the ankle joint, and the corresponding segmentation +dataset is established on the basis of fracture data. Secondly, the image +registration method is used to register the bone segmentation mask with the +normal bone mask. Finally, a semi-supervised classifier is constructed to make +full use of a large amount of unlabeled data to classify ankle fractures. +Experiments show that the proposed method can accurately segment fractures with fracture +lines and has better performance than the general method. At the +same time, this method is superior to a standard classification network on several +metrics. + 
+
+
+
+
+ + ☆ A Parallel Attention Network for Cattle Face Recognition ICME 2024 + + +
+ Cattle face recognition holds paramount significance in domains such as +animal husbandry and behavioral research. Despite significant progress in +confined environments, applying these accomplishments in wild settings remains +challenging. Thus, we create the first large-scale cattle face recognition +dataset, ICRWE, for wild environments. It encompasses 483 cattle and 9,816 +high-resolution image samples. Each sample undergoes annotation for face +features, light conditions, and face orientation. Furthermore, we introduce a +novel parallel attention network, PANet. Comprising several cascaded +Transformer modules, each module incorporates two parallel Position Attention +Modules (PAM) and Feature Mapping Modules (FMM). PAM focuses on local and +global features at each image position through parallel channel attention, and +FMM captures intricate feature patterns through non-linear mappings. +Experimental results indicate that PANet achieves a recognition accuracy of +88.03% on the ICRWE dataset, establishing itself as the current +state-of-the-art approach. The source code is available in the supplementary +materials. + +
+
+ comment: Accepted by ICME 2024 +
+
+
+
+
+ + ☆ Semantically-Shifted Incremental Adapter-Tuning is A Continual + ViTransformer CVPR 2024 + + +
+ Class-incremental learning (CIL) aims to enable models to continuously learn +new classes while overcoming catastrophic forgetting. The introduction of +pre-trained models has brought new tuning paradigms to CIL. In this paper, we +revisit different parameter-efficient tuning (PET) methods within the context +of continual learning. We observe that adapter tuning demonstrates superiority +over prompt-based methods, even without parameter expansion in each learning +session. Motivated by this, we propose incrementally tuning the shared adapter +without imposing parameter update constraints, enhancing the learning capacity +of the backbone. Additionally, we employ feature sampling from stored +prototypes to retrain a unified classifier, further improving its performance. +We estimate the semantic shift of old prototypes without access to past samples +and update stored prototypes session by session. Our proposed method eliminates +model expansion and avoids retaining any image samples. It surpasses previous +pre-trained model-based CIL methods and demonstrates remarkable continual +learning capabilities. Experimental results on five CIL benchmarks validate the +effectiveness of our approach, achieving state-of-the-art (SOTA) performance. + +
+
+ comment: To appear at CVPR 2024 +
+
+
+
+
+ + ☆ eTraM: Event-based Traffic Monitoring Dataset + + +
+ Event cameras, with their high temporal and dynamic range and minimal memory +usage, have found applications in various fields. However, their potential in +static traffic monitoring remains largely unexplored. To facilitate this +exploration, we present eTraM - a first-of-its-kind, fully event-based traffic +monitoring dataset. eTraM offers 10 hr of data from different traffic scenarios +in various lighting and weather conditions, providing a comprehensive overview +of real-world situations. Providing 2M bounding box annotations, it covers +eight distinct classes of traffic participants, ranging from vehicles to +pedestrians and micro-mobility. eTraM's utility has been assessed using +state-of-the-art methods for traffic participant detection, including RVT, RED, +and YOLOv8. We quantitatively evaluate the ability of event-based models to +generalize on nighttime and unseen scenes. Our findings substantiate the +compelling potential of leveraging event cameras for traffic monitoring, +opening new avenues for research and application. eTraM is available at +https://eventbasedvision.github.io/eTraM + +
+
+
+
+
+ + ☆ Context-Aware Integration of Language and Visual References for Natural + Language Tracking CVPR2024 + + +
+ Tracking by natural language specification (TNL) aims to consistently +localize a target in a video sequence given a linguistic description in the +initial frame. Existing methodologies perform language-based and template-based +matching for target reasoning separately and merge the matching results from +two sources, which suffers from tracking drift when the language and visual +templates misalign with the dynamic target state, and from ambiguity in the later +merging stage. To tackle the issues, we propose a joint multi-modal tracking +framework with 1) a prompt modulation module to leverage the complementarity +between temporal visual templates and language expressions, enabling precise +and context-aware appearance and linguistic cues, and 2) a unified target +decoding module to integrate the multi-modal reference cues and execute the +integrated queries on the search image to predict the target location directly in an +end-to-end manner. This design ensures spatio-temporal consistency by +leveraging historical visual information and introduces an integrated solution, +generating predictions in a single step. Extensive experiments conducted on +TNL2K, OTB-Lang, LaSOT, and RefCOCOg validate the efficacy of our proposed +approach. The results demonstrate competitive performance against +state-of-the-art methods for both tracking and grounding. + 
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Separate, Dynamic and Differentiable (SMART) Pruner for Block/Output + Channel Pruning on Computer Vision Tasks + + +
+ Deep Neural Network (DNN) pruning has emerged as a key strategy to reduce +model size, improve inference latency, and lower power consumption on DNN +accelerators. Among various pruning techniques, block and output channel +pruning have shown significant potential in accelerating hardware performance. +However, their accuracy often requires further improvement. In response to this +challenge, we introduce a separate, dynamic and differentiable (SMART) pruner. +This pruner stands out by utilizing a separate, learnable probability mask for +weight importance ranking, employing a differentiable Top k operator to achieve +target sparsity, and leveraging a dynamic temperature parameter trick to escape +from non-sparse local minima. In our experiments, the SMART pruner consistently +demonstrated its superiority over existing pruning methods across a wide range +of tasks and models on block and output channel pruning. Additionally, we +extend our testing to Transformer-based models in N:M pruning scenarios, where +SMART pruner also yields state-of-the-art results, demonstrating its +adaptability and robustness across various neural network architectures, and +pruning types. + +
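To make the learnable-mask and temperature ingredients above concrete, here is a rough sketch of a soft, temperature-controlled top-k pruning mask. It only illustrates the general idea; the SMART pruner's differentiable Top-k operator and schedule are different, and every name below is hypothetical:

```python
# Learnable importance scores -> soft mask that keeps roughly the top-k weights;
# annealing the temperature hardens the mask over training.
import torch
import torch.nn as nn

class SoftTopKMask(nn.Module):
    def __init__(self, num_weights, k):
        super().__init__()
        self.scores = nn.Parameter(torch.randn(num_weights) * 0.01)
        self.k = k

    def forward(self, temperature):
        thresh = torch.topk(self.scores, self.k).values[-1].detach()  # k-th largest
        return torch.sigmoid((self.scores - thresh) / temperature)

mask = SoftTopKMask(num_weights=1024, k=256)
weight = torch.randn(1024, requires_grad=True)
for temp in (1.0, 0.1, 0.01):            # temperature annealing
    pruned = weight * mask(temp)          # soft mask -> nearly binary mask
```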
+
+
+
+
+ + ☆ Rewrite the Stars CVPR 2024 + + +
+ Recent studies have drawn attention to the untapped potential of the "star +operation" (element-wise multiplication) in network design. While intuitive +explanations abound, the foundational rationale behind its application remains +largely unexplored. Our study attempts to reveal the star operation's ability +to map inputs into high-dimensional, non-linear feature spaces -- akin to +kernel tricks -- without widening the network. We further introduce StarNet, a +simple yet powerful prototype, demonstrating impressive performance and low +latency under compact network structure and efficient budget. Like stars in the +sky, the star operation appears unremarkable but holds a vast universe of +potential. Our work encourages further exploration across tasks, with codes +available at https://github.com/ma-xu/Rewrite-the-Stars. + +
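The "star operation" discussed above is simply an element-wise product of two linear branches; a minimal block in that spirit (not StarNet itself, whose code is linked in the comment below):

```python
# Minimal "star" block: two linear projections fused by element-wise product.
import torch
import torch.nn as nn

class StarBlock(nn.Module):
    def __init__(self, dim, expand=4):
        super().__init__()
        self.f1 = nn.Linear(dim, dim * expand)
        self.f2 = nn.Linear(dim, dim * expand)
        self.out = nn.Linear(dim * expand, dim)

    def forward(self, x):
        return x + self.out(self.f1(x) * self.f2(x))  # star: element-wise product

x = torch.randn(8, 64)
print(StarBlock(64)(x).shape)
```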
+
+ comment: Accepted by CVPR 2024. Codes are made publically available at + https://github.com/ma-xu/Rewrite-the-Stars +
+
+
+
+
+ + ☆ Multi-task Magnetic Resonance Imaging Reconstruction using Meta-learning + + +
+ Using single-task deep learning methods to reconstruct Magnetic Resonance +Imaging (MRI) data acquired with different imaging sequences is inherently +challenging. The trained deep learning model typically lacks generalizability, +and the dissimilarity among image datasets with different types of contrast +leads to suboptimal learning performance. This paper proposes a meta-learning +approach to efficiently learn image features from multiple MR image datasets. +Our algorithm can perform multi-task learning to simultaneously reconstruct MR +images acquired using different imaging sequences with different image +contrasts. The experiment results demonstrate the ability of our new +meta-learning reconstruction method to successfully reconstruct +highly-undersampled k-space data from multiple MRI datasets simultaneously, +outperforming other compelling reconstruction methods previously developed for +single-task learning. + +
+
+
+
+
+ + ☆ FairRAG: Fair Human Generation via Fair Retrieval Augmentation + + +
+ Existing text-to-image generative models reflect or even amplify societal +biases ingrained in their training data. This is especially concerning for +human image generation where models are biased against certain demographic +groups. Existing attempts to rectify this issue are hindered by the inherent +limitations of the pre-trained models and fail to substantially improve +demographic diversity. In this work, we introduce Fair Retrieval Augmented +Generation (FairRAG), a novel framework that conditions pre-trained generative +models on reference images retrieved from an external image database to improve +fairness in human generation. FairRAG enables conditioning through a +lightweight linear module that projects reference images into the textual +space. To enhance fairness, FairRAG applies simple-yet-effective debiasing +strategies, providing images from diverse demographic groups during the +generative process. Extensive experiments demonstrate that FairRAG outperforms +existing methods in terms of demographic diversity, image-text alignment, and +image fidelity while incurring minimal computational overhead during inference. + +
+
+
+
+
+ + ☆ Efficient Modulation for Vision Networks ICLR 2024 + + +
+ In this work, we present efficient modulation, a novel design for efficient +vision networks. We revisit the modulation mechanism, which operates input +through convolutional context modeling and feature projection layers, and fuses +features via element-wise multiplication and an MLP block. We demonstrate that +the modulation mechanism is particularly well suited for efficient networks and +further tailor the modulation design by proposing the efficient modulation +(EfficientMod) block, which is considered the essential building block for our +networks. Benefiting from the prominent representational ability of modulation +mechanism and the proposed efficient design, our network can accomplish better +trade-offs between accuracy and efficiency and set new state-of-the-art +performance in the zoo of efficient networks. When integrating EfficientMod +with the vanilla self-attention block, we obtain the hybrid architecture which +further improves the performance without loss of efficiency. We carry out +comprehensive experiments to verify EfficientMod's performance. With fewer +parameters, our EfficientMod-s performs 0.6 top-1 accuracy better than +EfficientFormerV2-s2 and is 25% faster on GPU, and 2.9 better than +MobileViTv2-1.0 at the same GPU latency. Additionally, our method presents a +notable improvement in downstream tasks, outperforming EfficientFormerV2-s by +3.6 mIoU on the ADE20K benchmark. Code and checkpoints are available at +https://github.com/ma-xu/EfficientMod. + +
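A sketch of a modulation block in the spirit described above, where a convolutional context branch element-wise multiplies a projected feature branch and an MLP follows; layer sizes are illustrative, and the released EfficientMod code should be consulted for the actual block:

```python
# Generic modulation block: context modeling * feature projection, then an MLP.
import torch
import torch.nn as nn

class ModulationBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.ctx = nn.Sequential(                      # context modeling branch
            nn.Conv2d(dim, dim, 7, padding=3, groups=dim), nn.GELU())
        self.proj = nn.Conv2d(dim, dim, 1)             # feature projection branch
        self.fuse = nn.Conv2d(dim, dim, 1)
        self.mlp = nn.Sequential(nn.Conv2d(dim, 4 * dim, 1), nn.GELU(),
                                 nn.Conv2d(4 * dim, dim, 1))

    def forward(self, x):
        x = x + self.fuse(self.ctx(x) * self.proj(x))  # element-wise modulation
        return x + self.mlp(x)

print(ModulationBlock(32)(torch.randn(1, 32, 56, 56)).shape)
```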
+
+ comment: Accepted by ICLR 2024. Codes are made publically available at + https://github.com/ma-xu/EfficientMod +
+
+
+
+
+ + ☆ FairCLIP: Harnessing Fairness in Vision-Language Learning CVPR 2024 + + +
+ Fairness is a critical concern in deep learning, especially in healthcare, +where these models influence diagnoses and treatment decisions. Although +fairness has been investigated in the vision-only domain, the fairness of +medical vision-language (VL) models remains unexplored due to the scarcity of +medical VL datasets for studying fairness. To bridge this research gap, we +introduce the first fair vision-language medical dataset FairVLMed that +provides detailed demographic attributes, ground-truth labels, and clinical +notes to facilitate an in-depth examination of fairness within VL foundation +models. Using FairVLMed, we conduct a comprehensive fairness analysis of two +widely-used VL models (CLIP and BLIP2), pre-trained on both natural and medical +domains, across four different protected attributes. Our results highlight +significant biases in all VL models, with Asian, Male, Non-Hispanic, and +Spanish being the preferred subgroups across the protected attributes of race, +gender, ethnicity, and language, respectively. In order to alleviate these +biases, we propose FairCLIP, an optimal-transport-based approach that achieves +a favorable trade-off between performance and fairness by reducing the Sinkhorn +distance between the overall sample distribution and the distributions +corresponding to each demographic group. As the first VL dataset of its kind, +FairVLMed holds the potential to catalyze advancements in the development of +machine learning models that are both ethically aware and clinically effective. +Our dataset and code are available at +https://ophai.hms.harvard.edu/datasets/fairvlmed10k. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Binarized Low-light Raw Video Enhancement CVPR 2024 + + +
+ Recently, deep neural networks have achieved excellent performance on +low-light raw video enhancement. However, they often come with high +computational complexity and large memory costs, which hinder their +applications on resource-limited devices. In this paper, we explore the +feasibility of applying the extremely compact binary neural network (BNN) to +low-light raw video enhancement. Nevertheless, there are two main issues with +binarizing video enhancement models. One is how to fuse the temporal +information to improve low-light denoising without complex modules. The other +is how to narrow the performance gap between binary convolutions with the full +precision ones. To address the first issue, we introduce a spatial-temporal +shift operation, which is easy-to-binarize and effective. The temporal shift +efficiently aggregates the features of neighbor frames and the spatial shift +handles the misalignment caused by the large motion in videos. For the second +issue, we present a distribution-aware binary convolution, which captures the +distribution characteristics of real-valued input and incorporates them into +plain binary convolutions to alleviate the degradation in performance. +Extensive quantitative and qualitative experiments have shown our +high-efficiency binarized low-light raw video enhancement method can attain a +promising performance. + +
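The spatial-temporal shift mentioned above is parameter-free and easy to illustrate; the fractions of shifted channels and the shift directions below are arbitrary choices for the sketch, not the paper's configuration:

```python
# Toy spatial-temporal shift: some channels are shifted along time (fusing
# neighbor frames) and some along a spatial axis, with no extra parameters.
import torch

def spatial_temporal_shift(x):
    # x: (B, T, C, H, W)
    out = x.clone()
    c = x.size(2) // 4
    out[:, 1:, :c] = x[:, :-1, :c]                    # shift forward in time
    out[:, :-1, c:2 * c] = x[:, 1:, c:2 * c]          # shift backward in time
    out[:, :, 2 * c:3 * c, 1:, :] = x[:, :, 2 * c:3 * c, :-1, :]  # spatial shift
    return out

print(spatial_temporal_shift(torch.randn(2, 5, 16, 32, 32)).shape)
```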
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ CP HDR: A feature point detection and description library for LDR and + HDR images + + +
+ In computer vision, characteristics refer to image regions with unique +properties, such as corners, edges, textures, or areas with high contrast. +These regions can be represented through feature points (FPs). FP detection and +description are fundamental steps to many computer vision tasks. Most FP +detection and description methods use low dynamic range (LDR) images, +sufficient for most applications involving digital images. However, LDR images +may have saturated pixels in scenes with extreme light conditions, which +degrade FP detection. On the other hand, high dynamic range (HDR) images +usually present a greater dynamic range but FP detection algorithms do not take +advantage of all the information in such images. In this study, we present a +systematic review of image detection and description algorithms that use HDR +images as input. We developed a library called CP_HDR that implements the +Harris corner detector, SIFT detector and descriptor, and two modifications of +those algorithms specialized in HDR images, called SIFT for HDR (SfHDR) and +Harris for HDR (HfHDR). Previous studies investigated the use of HDR images in +FP detection, but we did not find studies investigating the use of HDR images +in FP description. Using uniformity, repeatability rate, mean average +precision, and matching rate metrics, we compared the performance of the CP_HDR +algorithms using LDR and HDR images. We observed an increase in the uniformity +of the distribution of FPs among the high-light, mid-light, and low-light areas +of the images. The results show that using HDR images as input to detection +algorithms improves performance and that SfHDR and HfHDR enhance FP +description. + +
+
+
+
+
+ + ☆ SceneTracker: Long-term Scene Flow Estimation Network + + +
+ Considering the complementarity between the spatial focusing capability of scene flow estimation and the temporal coherence of 3D object tracking, this study aims to address a comprehensive new task that can +simultaneously capture fine-grained and long-term 3D motion in an online +manner: long-term scene flow estimation (LSFE). We introduce SceneTracker, a +novel learning-based LSFE network that adopts an iterative approach to +approximate the optimal trajectory. In addition, it dynamically indexes and +constructs appearance and depth correlation features simultaneously and employs +the Transformer to explore and utilize long-range connections within and +between trajectories. With detailed experiments, SceneTracker shows superior +capabilities in handling 3D spatial occlusion and depth noise interference, +highly tailored to the LSFE task's needs. The code for SceneTracker is +available at https://github.com/wwsource/SceneTracker. + 
+
+
+
+
+ + ☆ MI-NeRF: Learning a Single Face NeRF from Multiple Identities + + +
+ In this work, we introduce a method that learns a single dynamic neural +radiance field (NeRF) from monocular talking face videos of multiple +identities. NeRFs have shown remarkable results in modeling the 4D dynamics and +appearance of human faces. However, they require per-identity optimization. +Although recent approaches have proposed techniques to reduce the training and +rendering time, increasing the number of identities can be expensive. We +introduce MI-NeRF (multi-identity NeRF), a single unified network that models +complex non-rigid facial motion for multiple identities, using only monocular +videos of arbitrary length. The core premise in our method is to learn the +non-linear interactions between identity and non-identity specific information +with a multiplicative module. By training on multiple videos simultaneously, +MI-NeRF not only reduces the total training time compared to standard +single-identity NeRFs, but also demonstrates robustness in synthesizing novel +expressions for any input identity. We present results for both facial +expression transfer and talking face video synthesis. Our method can be further +personalized for a target identity given only a short video. + +
+
+ comment: Project page: https://aggelinacha.github.io/MI-NeRF/ +
+
+
+
+
+ + ☆ Diff-Reg v1: Diffusion Matching Model for Registration Problem + + +
+ Establishing reliable correspondences is essential for registration tasks +such as 3D and 2D3D registration. Existing methods commonly leverage geometric +or semantic point features to generate potential correspondences. However, +these features may face challenges such as large deformation, scale +inconsistency, and ambiguous matching problems (e.g., symmetry). Additionally, +many previous methods, which rely on single-pass prediction, may struggle with +local minima in complex scenarios. To mitigate these challenges, we introduce a +diffusion matching model for robust correspondence construction. Our approach +treats correspondence estimation as a denoising diffusion process within the +doubly stochastic matrix space, which gradually denoises (refines) a doubly +stochastic matching matrix to the ground-truth one for high-quality +correspondence estimation. It involves a forward diffusion process that +gradually introduces Gaussian noise into the ground truth matching matrix and a +reverse denoising process that iteratively refines the noisy matching matrix. +In particular, the feature extraction from the backbone occurs only once during +the inference phase. Our lightweight denoising module utilizes the same feature +at each reverse sampling step. Evaluation of our method on both 3D and 2D3D +registration tasks confirms its effectiveness. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2401.00436 +
+
+
+
+
+ + ☆ Using Images as Covariates: Measuring Curb Appeal with Deep Learning + + +
+ This paper details an innovative methodology to integrate image data into +traditional econometric models. Motivated by forecasting sales prices for +residential real estate, we harness the power of deep learning to add +"information" contained in images as covariates. Specifically, images of homes +were categorized and encoded using an ensemble of image classifiers (ResNet-50, +VGG16, MobileNet, and Inception V3). Unique features present within each +image were further encoded through panoptic segmentation. Forecasts from a +neural network trained on the encoded data result in improved out-of-sample +predictive power. We also combine these image-based forecasts with standard +hedonic real estate property and location characteristics, resulting in a +unified dataset. We show that image-based forecasts increase the accuracy of +hedonic forecasts when encoded features are regarded as additional covariates. +We also attempt to "explain" which covariates the image-based forecasts are +most highly correlated with. The study exemplifies the benefits of +interdisciplinary methodologies, merging machine learning and econometrics to +harness untapped data sources for more accurate forecasting. + 
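As a generic illustration of using image features as extra regression covariates (the paper's ensemble of classifiers and panoptic segmentation are omitted; the data below is synthetic and the feature choices are assumptions):

```python
# Encode photos with a CNN, concatenate with hedonic features, fit a linear model.
import torch
import torch.nn as nn
from torchvision import models
from sklearn.linear_model import LinearRegression
import numpy as np

cnn = models.resnet50(weights=None)          # load pretrained weights in practice
encoder = nn.Sequential(*list(cnn.children())[:-1])  # drop the classification head
encoder.eval()

images = torch.randn(16, 3, 224, 224)        # stand-in for house photos
with torch.no_grad():
    img_feats = encoder(images).flatten(1).numpy()    # (16, 2048) embeddings

hedonic = np.random.rand(16, 5)              # e.g. sqft, beds, baths, age, lot
X = np.concatenate([hedonic, img_feats], axis=1)
y = np.random.rand(16) * 1e6                 # synthetic sale prices
model = LinearRegression().fit(X, y)         # image features act as covariates
```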
+
+
+
+
+ + ☆ Automated Identification and Segmentation of Hi Sources in CRAFTS Using + Deep Learning Method + + +
+ We introduce a machine learning-based method for extracting HI sources from
+3D spectral data, and construct a dedicated dataset of HI sources from CRAFTS.
+Our custom dataset provides comprehensive resources for HI source detection.
+Utilizing the 3D-Unet segmentation architecture, our method reliably identifies
+and segments HI sources, achieving notable performance metrics with recall
+rates reaching 91.6% and accuracy levels at 95.7%. These outcomes substantiate
+the value of our custom dataset and the efficacy of our proposed network in
+identifying HI sources. Our code is publicly available at
+https://github.com/fishszh/HISF.
+
+
+ comment: 6 pages, 4 figures +
+
+
+
+
+ + ☆ Classification of Diabetic Retinopathy using Pre-Trained Deep Learning + Models + + +
+ Diabetic Retinopathy (DR) stands as the leading cause of blindness globally, +particularly affecting individuals between the ages of 20 and 70. This paper +presents a Computer-Aided Diagnosis (CAD) system designed for the automatic +classification of retinal images into five distinct classes: Normal, Mild, +Moderate, Severe, and Proliferative Diabetic Retinopathy (PDR). The proposed +system leverages Convolutional Neural Networks (CNNs) employing pre-trained +deep learning models. Through the application of fine-tuning techniques, our +model is trained on fundus images of diabetic retinopathy with resolutions of +350x350x3 and 224x224x3. Experimental results obtained on the Kaggle platform, +utilizing resources comprising 4 CPUs, 17 GB RAM, and 1 GB Disk, demonstrate +the efficacy of our approach. The achieved Area Under the Curve (AUC) values +for CNN, MobileNet, VGG-16, InceptionV3, and InceptionResNetV2 models are 0.50, +0.70, 0.53, 0.63, and 0.69, respectively. + +
+
+ comment: 3 pages, 1 figure, 1 table +
+
+
+
+
+ + ☆ Fully Geometric Panoramic Localization CVPR 2024 + + +
+ We introduce a lightweight and accurate localization method that only +utilizes the geometry of 2D-3D lines. Given a pre-captured 3D map, our approach +localizes a panorama image, taking advantage of the holistic 360 view. The +system mitigates potential privacy breaches or domain discrepancies by avoiding +trained or hand-crafted visual descriptors. However, as lines alone can be +ambiguous, we express distinctive yet compact spatial contexts from +relationships between lines, namely the dominant directions of parallel lines +and the intersection between non-parallel lines. The resulting representations +are efficient in processing time and memory compared to conventional visual +descriptor-based methods. Given the groups of dominant line directions and +their intersections, we accelerate the search process to test thousands of pose +candidates in less than a millisecond without sacrificing accuracy. We +empirically show that the proposed 2D-3D matching can localize panoramas for +challenging scenes with similar structures, dramatic domain shifts or +illumination changes. Our fully geometric approach does not involve extensive +parameter tuning or neural network training, making it a practical algorithm +that can be readily deployed in the real world. Project page including the code +is available through this link: https://82magnolia.github.io/fgpl/. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Heterogeneous Network Based Contrastive Learning Method for PolSAR Land + Cover Classification + + +
+ Polarimetric synthetic aperture radar (PolSAR) image interpretation is widely
+used in various fields, and deep learning has recently made significant
+progress in PolSAR image classification. Supervised learning (SL) requires a
+large amount of high-quality labeled PolSAR data to achieve good performance;
+however, manually labeled data is scarce. This causes SL to fall into
+overfitting and degrades its generalization performance. Furthermore, the
+scattering confusion problem is a significant challenge that has attracted
+increasing attention. To address these problems, this article proposes a
+Heterogeneous Network based Contrastive Learning method (HCLNet). It aims to
+learn high-level representations from unlabeled PolSAR data for few-shot
+classification based on multi-features and superpixels. Beyond conventional CL,
+HCLNet introduces a heterogeneous architecture for the first time to better
+utilize heterogeneous PolSAR features, and it develops two easy-to-use plugins
+to narrow the domain gap between optics and PolSAR: a feature filter, which
+enhances the complementarity of the multi-features, and superpixel-based
+instance discrimination, which increases the diversity of negative samples.
+Experiments demonstrate the superiority of HCLNet over state-of-the-art methods
+on three widely used PolSAR benchmark datasets, and ablation studies verify the
+importance of each component. Beyond these results, this work offers guidance
+on how to efficiently exploit the multi-features of PolSAR data to learn better
+high-level representations in CL and how to construct networks better suited to
+PolSAR data.
+
+
+
+
+
+ + ☆ Disentangling Racial Phenotypes: Fine-Grained Control of Race-related + Facial Phenotype Characteristics + + +
+ Achieving an effective fine-grained appearance variation over 2D facial
+images, whilst preserving facial identity, is a challenging task due to the
+high complexity and entanglement of common 2D facial feature encoding spaces.
+Despite these challenges, such fine-grained control, by way of disentanglement,
+is a crucial enabler for data-driven racial bias mitigation strategies across
+multiple automated facial analysis tasks, as it allows us to analyse,
+characterise and synthesise human facial diversity. In this paper, we propose a
+novel GAN framework to enable fine-grained control over individual race-related
+phenotype attributes of the facial images. Our framework factors the latent
+(feature) space into elements that correspond to race-related facial phenotype
+representations, thereby separating phenotype aspects (e.g. skin, hair colour,
+nose, eye, mouth shapes), which are notoriously difficult to annotate robustly
+in real-world facial data. Concurrently, we also introduce a high-quality,
+augmented, diverse 2D face image dataset drawn from CelebA-HQ for GAN training.
+Unlike prior work, our framework only relies upon 2D imagery and related
+parameters to achieve state-of-the-art individual control over race-related
+phenotype attributes with improved photo-realistic output.
+
+
+
+
+
+ + ☆ Nonlinearity Enhanced Adaptive Activation Function + + +
+ A simply implemented activation function with even cubic nonlinearity is
+introduced that increases the accuracy of neural networks without requiring
+substantial additional computational resources. This is partially enabled
+through an apparent tradeoff between convergence and accuracy. The activation
+function generalizes the standard ReLU function by introducing additional
+degrees of freedom through optimizable parameters that enable the degree of
+nonlinearity to be adjusted. The associated accuracy enhancement is quantified
+in the context of the MNIST digit data set through a comparison with standard
+techniques.
+
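+ The abstract does not spell out the exact functional form; one plausible
+reading (a ReLU augmented with trainable quadratic and cubic terms, recovering
+the standard ReLU when the extra coefficients are zero) is sketched below. The
+paper's actual parameterization may differ.
+
+import torch
+import torch.nn as nn
+
+class CubicReLU(nn.Module):
+    """ReLU generalized with optimizable quadratic and cubic coefficients,
+    so the degree of nonlinearity can be adjusted during training."""
+    def __init__(self):
+        super().__init__()
+        self.a = nn.Parameter(torch.zeros(1))  # quadratic coefficient
+        self.b = nn.Parameter(torch.zeros(1))  # cubic coefficient
+
+    def forward(self, x):
+        # a = b = 0 recovers the standard ReLU
+        return torch.relu(x) + self.a * x.pow(2) + self.b * x.pow(3)
+
+act = CubicReLU()
+y = act(torch.randn(8, 16))  # drop-in replacement for nn.ReLU in an MLP/CNN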
+
+
+
+
+ + ☆ PLoc: A New Evaluation Criterion Based on Physical Location for + Autonomous Driving Datasets + + +
+ Autonomous driving has garnered significant attention as a key research area +within artificial intelligence. In the context of autonomous driving scenarios, +the varying physical locations of objects correspond to different levels of +danger. However, conventional evaluation criteria for automatic driving object +detection often overlook the crucial aspect of an object's physical location, +leading to evaluation results that may not accurately reflect the genuine +threat posed by the object to the autonomous driving vehicle. To enhance the +safety of autonomous driving, this paper introduces a novel evaluation +criterion based on physical location information, termed PLoc. This criterion +transcends the limitations of traditional criteria by acknowledging that the +physical location of pedestrians in autonomous driving scenarios can provide +valuable safety-related information. Furthermore, this paper presents a newly +re-annotated dataset (ApolloScape-R) derived from ApolloScape. ApolloScape-R +involves the relabeling of pedestrians based on the significance of their +physical location. The dataset is utilized to assess the performance of various +object detection models under the proposed PLoc criterion. Experimental results +demonstrate that the average accuracy of all object detection models in +identifying a person situated in the travel lane of an autonomous vehicle is +lower than that for a person on a sidewalk. The dataset is publicly available +at https://github.com/lnyrlyed/ApolloScape-R.git + +
+
+
+
+
+ + ☆ MambaMixer: Efficient Selective State Space Models with Dual Token and + Channel Selection + + +
+ Recent advances in deep learning have mainly relied on Transformers due to
+their data dependency and ability to learn at scale. The attention module in
+these architectures, however, exhibits quadratic time and space complexity in
+the input size, limiting their scalability for long-sequence modeling. Despite
+recent attempts to design efficient and effective architecture backbones for
+multi-dimensional data, such as images and multivariate time series, existing
+models are either data independent, or fail to allow inter- and intra-dimension
+communication. Recently, State Space Models (SSMs), and more specifically
+Selective State Space Models, with efficient hardware-aware implementations,
+have shown promising potential for long sequence modeling. Motivated by the
+success of SSMs, we present MambaMixer, a new architecture with data-dependent
+weights that uses a dual selection mechanism across tokens and channels, called
+Selective Token and Channel Mixer. MambaMixer connects selective mixers using a
+weighted averaging mechanism, allowing layers to have direct access to early
+features. As a proof of concept, we design Vision MambaMixer (ViM2) and Time
+Series MambaMixer (TSM2) architectures based on the MambaMixer block and
+explore their performance in various vision and time series forecasting tasks.
+Our results underline the importance of selective mixing across both tokens and
+channels. In ImageNet classification, object detection, and semantic
+segmentation tasks, ViM2 achieves competitive performance with well-established
+vision models and outperforms SSM-based vision models. In time series
+forecasting, TSM2 achieves outstanding performance compared to state-of-the-art
+methods while significantly reducing computational cost. These results show
+that while Transformers, cross-channel attention, and MLPs are sufficient for
+good performance in time series forecasting, none of them is necessary.
+
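+ A drastically simplified sketch of data-dependent mixing across tokens and
+then channels, with sigmoid gates standing in for the selective state-space
+machinery; it only illustrates the dual token/channel selection idea and is
+not the MambaMixer block itself.
+
+import torch
+import torch.nn as nn
+
+class ToyDualMixer(nn.Module):
+    """Token mixing followed by channel mixing, each modulated by an
+    input-dependent gate (a crude stand-in for 'selection')."""
+    def __init__(self, n_tokens, dim):
+        super().__init__()
+        self.token_mix = nn.Linear(n_tokens, n_tokens)
+        self.chan_mix = nn.Linear(dim, dim)
+        self.token_gate = nn.Linear(dim, 1)    # per-token gate from content
+        self.chan_gate = nn.Linear(dim, dim)   # per-channel gate from content
+
+    def forward(self, x):                      # x: (batch, tokens, dim)
+        g_tok = torch.sigmoid(self.token_gate(x))                          # (B, T, 1)
+        x = x + g_tok * self.token_mix(x.transpose(1, 2)).transpose(1, 2)  # token mixing
+        g_ch = torch.sigmoid(self.chan_gate(x.mean(dim=1, keepdim=True)))  # (B, 1, D)
+        x = x + g_ch * self.chan_mix(x)                                    # channel mixing
+        return x
+
+block = ToyDualMixer(n_tokens=16, dim=64)
+out = block(torch.randn(2, 16, 64))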
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Optimal Blackjack Strategy Recommender: A Comprehensive Study on + Computer Vision Integration for Enhanced Gameplay + + +
+ This research project investigates the application of several computer vision +techniques for playing card detection and recognition in the context of the +popular casino game, blackjack. The primary objective is to develop a robust +system that is capable of detecting and accurately classifying playing cards in +real-time, and displaying the optimal move recommendation based on the given +image of the current game. The proposed methodology involves using K-Means for +image segmentation, card reprojection and feature extraction, training of the +KNN classifier using a labeled dataset, and integration of the detection system +into a Blackjack Basic Strategy recommendation algorithm. Further, the study +aims to observe the effectiveness of this approach in detecting various card +designs under different lighting conditions and occlusions. Overall, the +project examines the potential benefits of incorporating computer vision +techniques, with a specific focus on card detection, into commonly played games +aiming to enhance player decision-making and optimize strategic outcomes. The +results obtained from our experimental evaluations with models developed under +considerable time constraints, highlight the potential for practical +implementation in real-world casino environments and across other similarly +structured games. + +
+
+ comment: 24 pages, 13 figures +
+
+
+
+
+ + ☆ On Inherent Adversarial Robustness of Active Vision Systems + + +
+ Current Deep Neural Networks are vulnerable to adversarial examples, which
+alter their predictions by adding carefully crafted noise. Since human eyes are
+robust to such inputs, it is possible that the vulnerability stems from the
+standard way of processing inputs in one shot, treating every pixel with the
+same importance. In contrast, neuroscience suggests that the human vision
+system can differentiate salient features by (1) switching between multiple
+fixation points (saccades) and (2) processing the surrounding with a
+non-uniform external resolution (foveation). In this work, we advocate that the
+integration of such active vision mechanisms into current deep learning systems
+can offer robustness benefits. Specifically, we empirically demonstrate the
+inherent robustness of two active vision methods - GFNet and FALcon - under a
+black box threat model. By learning and performing inference based on
+downsampled glimpses obtained from multiple distinct fixation points within an
+input, we show that these active methods achieve 2-3 times greater robustness
+compared to a standard passive convolutional network under state-of-the-art
+adversarial attacks. More importantly, we provide illustrative and
+interpretable visualization analysis that demonstrates how performing inference
+from distinct fixation points makes active vision methods less vulnerable to
+malicious inputs.
+
+
+
+
+
+ + ☆ Multi-Region Transfer Learning for Segmentation of Crop Field Boundaries + in Satellite Images with Limited Labels AAAI + + +
+ The goal of field boundary delineation is to predict the polygonal boundaries +and interiors of individual crop fields in overhead remotely sensed images +(e.g., from satellites or drones). Automatic delineation of field boundaries is +a necessary task for many real-world use cases in agriculture, such as +estimating cultivated area in a region or predicting end-of-season yield in a +field. Field boundary delineation can be framed as an instance segmentation +problem, but presents unique research challenges compared to traditional +computer vision datasets used for instance segmentation. The practical +applicability of previous work is also limited by the assumption that a +sufficiently-large labeled dataset is available where field boundary +delineation models will be applied, which is not the reality for most regions +(especially under-resourced regions such as Sub-Saharan Africa). We present an +approach for segmentation of crop field boundaries in satellite images in +regions lacking labeled data that uses multi-region transfer learning to adapt +model weights for the target region. We show that our approach outperforms +existing methods and that multi-region transfer learning substantially boosts +performance for multiple model architectures. Our implementation and datasets +are publicly available to enable use of the approach by end-users and serve as +a benchmark for future work. + +
+
+ comment: Accepted for 2023 AAAI Workshop on AI to Accelerate Science and + Engineering +
+
+
+
+
+ + ☆ Universal Bovine Identification via Depth Data and Deep Metric Learning + + +
+ This paper proposes and evaluates, for the first time, a top-down (dorsal +view), depth-only deep learning system for accurately identifying individual +cattle and provides associated code, datasets, and training weights for +immediate reproducibility. An increase in herd size skews the cow-to-human +ratio at the farm and makes the manual monitoring of individuals more +challenging. Therefore, real-time cattle identification is essential for the +farms and a crucial step towards precision livestock farming. Underpinned by +our previous work, this paper introduces a deep-metric learning method for +cattle identification using depth data from an off-the-shelf 3D camera. The +method relies on CNN and MLP backbones that learn well-generalised embedding +spaces from the body shape to differentiate individuals -- requiring neither +species-specific coat patterns nor close-up muzzle prints for operation. The +network embeddings are clustered using a simple algorithm such as $k$-NN for +highly accurate identification, thus eliminating the need to retrain the +network for enrolling new individuals. We evaluate two backbone architectures, +ResNet, as previously used to identify Holstein Friesians using RGB images, and +PointNet, which is specialised to operate on 3D point clouds. We also present +CowDepth2023, a new dataset containing 21,490 synchronised colour-depth image +pairs of 99 cows, to evaluate the backbones. Both ResNet and PointNet +architectures, which consume depth maps and point clouds, respectively, led to +high accuracy that is on par with the coat pattern-based backbone. + +
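+ The enrolment-free identification step reduces to nearest-neighbour search
+over the learned embeddings; a minimal sketch with random vectors standing in
+for the depth-image embeddings produced by the trained network (gallery size
+and embedding dimension are assumptions).
+
+import numpy as np
+from sklearn.neighbors import KNeighborsClassifier
+
+rng = np.random.default_rng(1)
+# Stand-in gallery: 99 cows with a few enrolled embeddings each
+# (the real gallery would come from CowDepth2023 depth images).
+gallery = rng.normal(size=(99 * 5, 128))
+gallery_ids = np.repeat(np.arange(99), 5)
+
+knn = KNeighborsClassifier(n_neighbors=3, metric="euclidean")
+knn.fit(gallery, gallery_ids)
+
+query = rng.normal(size=(1, 128))       # embedding of a new depth image
+predicted_cow = knn.predict(query)[0]
+# New individuals are enrolled by appending their embeddings to the gallery
+# and refitting k-NN; the embedding network itself is not retrained.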
+
+ comment: LaTeX, 38 pages, 14 figures, 3 tables +
+
+
+
+
+ + ☆ Multi-Level Neural Scene Graphs for Dynamic Urban Environments CVPR 2024 + + +
+ We estimate the radiance field of large-scale dynamic areas from multiple +vehicle captures under varying environmental conditions. Previous works in this +domain are either restricted to static environments, do not scale to more than +a single short video, or struggle to separately represent dynamic object +instances. To this end, we present a novel, decomposable radiance field +approach for dynamic urban environments. We propose a multi-level neural scene +graph representation that scales to thousands of images from dozens of +sequences with hundreds of fast-moving objects. To enable efficient training +and rendering of our representation, we develop a fast composite ray sampling +and rendering scheme. To test our approach in urban driving scenarios, we +introduce a new, novel view synthesis benchmark. We show that our approach +outperforms prior art by a significant margin on both established and our +proposed benchmark while being faster in training and rendering. + +
+
+ comment: CVPR 2024. Project page is available at + https://tobiasfshr.github.io/pub/ml-nsg/ +
+
+
+
+
+ + ☆ Uncovering Bias in Large Vision-Language Models with Counterfactuals + + +
+ With the advent of Large Language Models (LLMs) possessing increasingly +impressive capabilities, a number of Large Vision-Language Models (LVLMs) have +been proposed to augment LLMs with visual inputs. Such models condition +generated text on both an input image and a text prompt, enabling a variety of +use cases such as visual question answering and multimodal chat. While prior +studies have examined the social biases contained in text generated by LLMs, +this topic has been relatively unexplored in LVLMs. Examining social biases in +LVLMs is particularly challenging due to the confounding contributions of bias +induced by information contained across the text and visual modalities. To +address this challenging problem, we conduct a large-scale study of text +generated by different LVLMs under counterfactual changes to input images. +Specifically, we present LVLMs with identical open-ended text prompts while +conditioning on images from different counterfactual sets, where each set +contains images which are largely identical in their depiction of a common +subject (e.g., a doctor), but vary only in terms of intersectional social +attributes (e.g., race and gender). We comprehensively evaluate the text +produced by different LVLMs under this counterfactual generation setting and +find that social attributes such as race, gender, and physical characteristics +depicted in input images can significantly influence toxicity and the +generation of competency-associated words. + +
+
+
+
+
+ + ☆ CT respiratory motion synthesis using joint supervised and adversarial + learning + + +
+ Objective: Four-dimensional computed tomography (4DCT) imaging consists of
+reconstructing a CT acquisition into multiple phases to track internal organ
+and tumor motion. It is commonly used in radiotherapy treatment planning to
+establish planning target volumes. However, 4DCT increases protocol complexity,
+may not align with patient breathing during treatment, and leads to higher
+radiation delivery. Approach: In this study, we propose a deep synthesis method
+to generate pseudo respiratory CT phases from static images for motion-aware
+treatment planning. The model produces patient-specific deformation vector
+fields (DVFs) by conditioning synthesis on external patient surface-based
+estimation, mimicking respiratory monitoring devices. A key methodological
+contribution is to encourage DVF realism through supervised DVF training while
+jointly applying an adversarial term not only to the warped image but also to
+the magnitude of the DVF itself. This way, we avoid the excessive smoothness
+typically obtained through deep unsupervised learning, and encourage
+correlations with the respiratory amplitude. Main results: Performance is
+evaluated using real 4DCT acquisitions with smaller tumor volumes than
+previously reported. Results demonstrate for the first time that the generated
+pseudo-respiratory CT phases can capture organ and tumor motion with similar
+accuracy to repeated 4DCT scans of the same patient. Mean inter-scan tumor
+center-of-mass distances and Dice similarity coefficients were 1.97 mm and
+0.63, respectively, for real 4DCT phases and 2.35 mm and 0.71 for synthetic
+phases, comparing favorably to a state-of-the-art technique (RMSim).
+
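+ The synthesis pipeline hinges on applying a predicted deformation vector field
+(DVF) to a static image; a minimal 2D sketch of such warping with bilinear
+resampling is shown below (the paper operates on 3D CT volumes, and the
+displacement convention used here is an assumption).
+
+import torch
+import torch.nn.functional as F
+
+def warp_with_dvf(image, dvf):
+    """Warp a 2D image (B, 1, H, W) with a dense displacement field
+    dvf (B, 2, H, W) given in pixels (channel 0 = x, channel 1 = y)."""
+    B, _, H, W = image.shape
+    ys, xs = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij")
+    base = torch.stack((xs, ys), dim=0).float().unsqueeze(0).expand(B, -1, -1, -1)
+    coords = base + dvf                              # displaced sampling coordinates
+    # normalise to [-1, 1]; grid_sample expects (B, H, W, 2) ordered as (x, y)
+    grid_x = 2.0 * coords[:, 0] / (W - 1) - 1.0
+    grid_y = 2.0 * coords[:, 1] / (H - 1) - 1.0
+    grid = torch.stack((grid_x, grid_y), dim=-1)
+    return F.grid_sample(image, grid, mode="bilinear", align_corners=True)
+
+img = torch.rand(1, 1, 64, 64)
+dvf = torch.zeros(1, 2, 64, 64)
+dvf[:, 1] += 3.0                                     # sample 3 pixels lower in y
+warped = warp_with_dvf(img, dvf)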
+
+ comment: to appear in Phys. Med. Biol +
+
+
+
+
+ + ☆ VSRD: Instance-Aware Volumetric Silhouette Rendering for Weakly + Supervised 3D Object Detection CVPR 2024 + + +
+ Monocular 3D object detection poses a significant challenge in 3D scene +understanding due to its inherently ill-posed nature in monocular depth +estimation. Existing methods heavily rely on supervised learning using abundant +3D labels, typically obtained through expensive and labor-intensive annotation +on LiDAR point clouds. To tackle this problem, we propose a novel weakly +supervised 3D object detection framework named VSRD (Volumetric Silhouette +Rendering for Detection) to train 3D object detectors without any 3D +supervision but only weak 2D supervision. VSRD consists of multi-view 3D +auto-labeling and subsequent training of monocular 3D object detectors using +the pseudo labels generated in the auto-labeling stage. In the auto-labeling +stage, we represent the surface of each instance as a signed distance field +(SDF) and render its silhouette as an instance mask through our proposed +instance-aware volumetric silhouette rendering. To directly optimize the 3D +bounding boxes through rendering, we decompose the SDF of each instance into +the SDF of a cuboid and the residual distance field (RDF) that represents the +residual from the cuboid. This mechanism enables us to optimize the 3D bounding +boxes in an end-to-end manner by comparing the rendered instance masks with the +ground truth instance masks. The optimized 3D bounding boxes serve as effective +training data for 3D object detection. We conduct extensive experiments on the +KITTI-360 dataset, demonstrating that our method outperforms the existing +weakly supervised 3D object detection methods. The code is available at +https://github.com/skmhrk1209/VSRD. + +
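+ The cuboid part of the SDF decomposition is just the signed distance to a box;
+a small sketch is given below (axis-aligned and in the object frame -- pose
+handling and the learned residual distance field are omitted).
+
+import numpy as np
+
+def cuboid_sdf(points, center, half_extents):
+    """Signed distance from 3D points (N, 3) to an axis-aligned cuboid:
+    negative inside, positive outside."""
+    q = np.abs(points - center) - half_extents
+    outside = np.linalg.norm(np.maximum(q, 0.0), axis=-1)
+    inside = np.minimum(np.max(q, axis=-1), 0.0)
+    return outside + inside
+
+pts = np.array([[0.0, 0.0, 0.0], [2.0, 0.0, 0.0]])
+print(cuboid_sdf(pts, center=np.zeros(3), half_extents=np.array([1.0, 0.5, 0.5])))
+# -> [-0.5, 1.0]: the first point lies inside the box, the second outside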
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Fast OMP for Exact Recovery and Sparse Approximation + + +
+ Orthogonal Matching Pursuit (OMP) has been a powerful method in sparse signal
+recovery and approximation. However, OMP suffers from computational issues when
+the signal has a large number of non-zeros. This paper advances OMP on two
+fronts: it offers a fast algorithm for the orthogonal projection of the input
+signal at each iteration, and a new selection criterion for making the greedy
+choice, which reduces the number of iterations needed to recover the signal.
+The proposed modifications to OMP directly reduce the computational complexity.
+Experimental results show significant improvement over the classical OMP in
+computation time. The paper also provides a sufficient condition for exact
+recovery under the new greedy choice criterion. For general signals that may
+not have sparse representations, the paper provides a bound on the
+approximation error. The approximation error is of the same order as that of
+OMP but is obtained in fewer iterations and less time.
+
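+ For reference, the classical OMP baseline that the paper accelerates proceeds
+as below: greedy atom selection by correlation with the residual, followed by a
+full least-squares re-fit on the current support. The paper's fast projection
+and new selection criterion are not shown here.
+
+import numpy as np
+
+def omp(A, y, k, tol=1e-10):
+    """Classical Orthogonal Matching Pursuit over a dictionary A (n x m,
+    unit-norm columns), selecting at most k atoms."""
+    n, m = A.shape
+    residual = y.copy()
+    support, x = [], np.zeros(m)
+    for _ in range(k):
+        # greedy choice: atom most correlated with the current residual
+        j = int(np.argmax(np.abs(A.T @ residual)))
+        if j not in support:
+            support.append(j)
+        # orthogonal projection of y onto the span of the selected atoms
+        coef, *_ = np.linalg.lstsq(A[:, support], y, rcond=None)
+        residual = y - A[:, support] @ coef
+        if np.linalg.norm(residual) < tol:
+            break
+    x[support] = coef
+    return x
+
+rng = np.random.default_rng(0)
+A = rng.normal(size=(64, 256))
+A /= np.linalg.norm(A, axis=0)
+x_true = np.zeros(256)
+x_true[[3, 70, 200]] = [1.0, -2.0, 0.5]
+print(np.allclose(omp(A, A @ x_true, k=3), x_true, atol=1e-6))  # True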
+
+
+
+
+ + ☆ An Interpretable Cross-Attentive Multi-modal MRI Fusion Framework for + Schizophrenia Diagnosis + + +
+ Both functional and structural magnetic resonance imaging (fMRI and sMRI) are
+widely used for the diagnosis of mental disorders. However, combining
+complementary information from these two modalities is challenging due to their
+heterogeneity. Many existing methods fall short of capturing the interaction
+between these modalities, frequently defaulting to a simple combination of
+latent features. In this paper, we propose a novel Cross-Attentive Multi-modal
+Fusion framework (CAMF), which aims to capture both intra-modal and inter-modal
+relationships between fMRI and sMRI, enhancing multi-modal data representation.
+Specifically, our CAMF framework employs self-attention modules to identify
+interactions within each modality while cross-attention modules identify
+interactions between modalities. Subsequently, our approach optimizes the
+integration of latent features from both modalities. This approach
+significantly improves classification accuracy, as demonstrated by our
+evaluations on two extensive multi-modal brain imaging datasets, where CAMF
+consistently outperforms existing methods. Furthermore, the gradient-guided
+Score-CAM is applied to interpret critical functional networks and brain
+regions involved in schizophrenia. The biomarkers identified by CAMF align
+with established research, potentially offering new insights into the diagnosis
+and pathological endophenotypes of schizophrenia.
+
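+ A minimal sketch of the self-attention-then-cross-attention pattern described
+above, with fMRI and sMRI features represented as token sequences; dimensions,
+pooling, and the classifier head are illustrative assumptions rather than the
+CAMF architecture.
+
+import torch
+import torch.nn as nn
+
+class CrossAttentiveFusion(nn.Module):
+    """Self-attention within each modality, then cross-attention in which each
+    modality queries the other, followed by a simple fused classifier."""
+    def __init__(self, dim=64, heads=4, n_classes=2):
+        super().__init__()
+        self.self_f = nn.MultiheadAttention(dim, heads, batch_first=True)
+        self.self_s = nn.MultiheadAttention(dim, heads, batch_first=True)
+        self.cross_fs = nn.MultiheadAttention(dim, heads, batch_first=True)
+        self.cross_sf = nn.MultiheadAttention(dim, heads, batch_first=True)
+        self.head = nn.Linear(2 * dim, n_classes)
+
+    def forward(self, fmri_tokens, smri_tokens):
+        f, _ = self.self_f(fmri_tokens, fmri_tokens, fmri_tokens)  # intra-modal
+        s, _ = self.self_s(smri_tokens, smri_tokens, smri_tokens)
+        f2s, _ = self.cross_fs(f, s, s)   # fMRI queries sMRI (inter-modal)
+        s2f, _ = self.cross_sf(s, f, f)   # sMRI queries fMRI
+        fused = torch.cat([f2s.mean(dim=1), s2f.mean(dim=1)], dim=-1)
+        return self.head(fused)
+
+model = CrossAttentiveFusion()
+logits = model(torch.randn(2, 100, 64), torch.randn(2, 90, 64))  # (2, 2)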
+
+
+
+
+ + ☆ FetalDiffusion: Pose-Controllable 3D Fetal MRI Synthesis with + Conditional Diffusion Model MICCAI 2024 + + +
+ The quality of fetal MRI is significantly affected by unpredictable and
+substantial fetal motion, leading to the introduction of artifacts even when
+fast acquisition sequences are employed. The development of 3D real-time fetal
+pose estimation approaches on volumetric EPI fetal MRI opens up a promising
+avenue for fetal motion monitoring and prediction. Challenges arise in fetal
+pose estimation due to the limited number of real scanned fetal MR training
+images, hindering model generalization when the acquired fetal MRI lacks
+adequate pose.
+ In this study, we introduce FetalDiffusion, a novel approach utilizing a
+conditional diffusion model to generate 3D synthetic fetal MRI with
+controllable pose. Additionally, an auxiliary pose-level loss is adopted to
+enhance model performance. Our work demonstrates the success of this proposed
+model by producing high-quality synthetic fetal MRI images with accurate and
+recognizable fetal poses, comparing favorably with in-vivo real fetal MRI.
+Furthermore, we show that the integration of synthetic fetal MR images enhances
+the fetal pose estimation model's performance, particularly when the amount of
+available real scanned data is limited, resulting in a 15.4% increase in PCK
+and a 50.2% reduction in mean error. All experiments are done on a single 32GB
+V100 GPU. Our method holds promise for improving real-time tracking models,
+thereby addressing fetal motion issues more effectively.
+
+
+ comment: 8 pages, 3 figures, 2 tables, submitted to MICCAI 2024, code + available if accepted +
+
+
+
+
+ + ☆ FISBe: A real-world benchmark dataset for instance segmentation of + long-range thin filamentous structures CVPR2024 + + +
+ Instance segmentation of neurons in volumetric light microscopy images of +nervous systems enables groundbreaking research in neuroscience by facilitating +joint functional and morphological analyses of neural circuits at cellular +resolution. Yet said multi-neuron light microscopy data exhibits extremely +challenging properties for the task of instance segmentation: Individual +neurons have long-ranging, thin filamentous and widely branching morphologies, +multiple neurons are tightly inter-weaved, and partial volume effects, uneven +illumination and noise inherent to light microscopy severely impede local +disentangling as well as long-range tracing of individual neurons. These +properties reflect a current key challenge in machine learning research, namely +to effectively capture long-range dependencies in the data. While respective +methodological research is buzzing, to date methods are typically benchmarked +on synthetic datasets. To address this gap, we release the FlyLight Instance +Segmentation Benchmark (FISBe) dataset, the first publicly available +multi-neuron light microscopy dataset with pixel-wise annotations. In addition, +we define a set of instance segmentation metrics for benchmarking that we +designed to be meaningful with regard to downstream analyses. Lastly, we +provide three baselines to kick off a competition that we envision to both +advance the field of machine learning regarding methodology for capturing +long-range data dependencies, and facilitate scientific discovery in basic +neuroscience. + +
+
+ comment: CVPR2024, Project page: https://kainmueller-lab.github.io/fisbe +
+
+
+
+
+ + ☆ AgileFormer: Spatially Agile Transformer UNet for Medical Image + Segmentation + + +
+ In the past decades, deep neural networks, particularly convolutional neural
+networks, have achieved state-of-the-art performance in a variety of medical
+image segmentation tasks. Recently, the introduction of the vision transformer
+(ViT) has significantly altered the landscape of deep segmentation models.
+There has been a growing focus on ViTs, driven by their excellent performance
+and scalability. However, we argue that the current design of the vision
+transformer-based UNet (ViT-UNet) segmentation models may not effectively
+handle the heterogeneous appearance (e.g., varying shapes and sizes) of objects
+of interest in medical image segmentation tasks. To tackle this challenge, we
+present a structured approach to introduce spatially dynamic components to the
+ViT-UNet. This adaptation enables the model to effectively capture features of
+target objects with diverse appearances. This is achieved by three main
+components: (i) deformable patch embedding; (ii) spatially dynamic multi-head
+attention; (iii) deformable positional encoding. These components were
+integrated into a novel architecture, termed AgileFormer. AgileFormer is a
+spatially agile ViT-UNet designed for medical image segmentation. Experiments
+in three segmentation tasks using publicly available datasets demonstrated the
+effectiveness of the proposed method. The code is available at
+https://github.com/sotiraslab/AgileFormer.
+
+
+
+
+
+ + ☆ Deepfake Sentry: Harnessing Ensemble Intelligence for Resilient + Detection and Generalisation + + +
+ Recent advancements in Generative Adversarial Networks (GANs) have enabled +photorealistic image generation with high quality. However, the malicious use +of such generated media has raised concerns regarding visual misinformation. +Although deepfake detection research has demonstrated high accuracy, it is +vulnerable to advances in generation techniques and adversarial iterations on +detection countermeasures. To address this, we propose a proactive and +sustainable deepfake training augmentation solution that introduces artificial +fingerprints into models. We achieve this by employing an ensemble learning +approach that incorporates a pool of autoencoders that mimic the effect of the +artefacts introduced by the deepfake generator models. Experiments on three +datasets reveal that our proposed ensemble autoencoder-based data augmentation +learning approach offers improvements in terms of generalisation, resistance +against basic data perturbations such as noise, blurring, sharpness +enhancement, and affine transforms, resilience to commonly used lossy +compression algorithms such as JPEG, and enhanced resistance against +adversarial attacks. + +
+
+ comment: 16 pages, 1 figure, U.P.B. Sci. Bull., Series C, Vol. 85, Iss. 4, + 2023 +
+
+
+
+
+ + ☆ Robust Ensemble Person Re-Identification via Orthogonal Fusion with + Occlusion Handling + + +
+ Occlusion remains one of the major challenges in person re-identification
+(ReID) as a result of the diversity of poses and the variation of appearances.
+Developing novel architectures to improve the robustness of occlusion-aware
+person Re-ID requires new insights, especially on low-resolution edge cameras.
+We propose a deep ensemble model that harnesses both CNN and Transformer
+architectures to generate robust feature representations. To achieve robust
+Re-ID without the need to manually label occluded regions, we propose to take
+an ensemble learning-based approach derived from the analogy between
+arbitrarily shaped occluded regions and robust feature representation. Using
+the orthogonality principle, our developed deep CNN model makes use of masked
+autoencoder (MAE) and global-local feature fusion for robust person
+identification. Furthermore, we present a part occlusion-aware transformer
+capable of learning a feature space that is robust to occluded regions.
+Experimental results are reported on several Re-ID datasets to show the
+effectiveness of our developed ensemble model named orthogonal fusion with
+occlusion handling (OFOH). Compared to competing methods, the proposed OFOH
+approach achieves competitive rank-1 and mAP performance.
+
+
+
+
+
+ + ☆ PikeLPN: Mitigating Overlooked Inefficiencies of Low-Precision Neural + Networks CVPR 2024 + + +
+ Low-precision quantization is recognized for its efficacy in neural network +optimization. Our analysis reveals that non-quantized elementwise operations +which are prevalent in layers such as parameterized activation functions, batch +normalization, and quantization scaling dominate the inference cost of +low-precision models. These non-quantized elementwise operations are commonly +overlooked in SOTA efficiency metrics such as Arithmetic Computation Effort +(ACE). In this paper, we propose ACEv2 - an extended version of ACE which +offers a better alignment with the inference cost of quantized models and their +energy consumption on ML hardware. Moreover, we introduce PikeLPN, a model that +addresses these efficiency issues by applying quantization to both elementwise +operations and multiply-accumulate operations. In particular, we present a +novel quantization technique for batch normalization layers named QuantNorm +which allows for quantizing the batch normalization parameters without +compromising the model performance. Additionally, we propose applying Double +Quantization where the quantization scaling parameters are quantized. +Furthermore, we recognize and resolve the issue of distribution mismatch in +Separable Convolution layers by introducing Distribution-Heterogeneous +Quantization which enables quantizing them to low-precision. PikeLPN achieves +Pareto-optimality in efficiency-accuracy trade-off with up to 3X efficiency +improvement compared to SOTA low-precision models. + +
+
+ comment: Accepted in CVPR 2024. 10 Figures, 9 Tables +
+
+
+
+
+ + ☆ Sparse Views, Near Light: A Practical Paradigm for Uncalibrated + Point-light Photometric Stereo CVPR 2024 + + +
+ Neural approaches have shown significant progress on camera-based
+reconstruction. However, they require either a fairly dense sampling of the
+viewing sphere, or pre-training on an existing dataset, thereby limiting their
+generalizability. In contrast, photometric stereo (PS) approaches have shown
+great potential for achieving high-quality reconstruction under sparse
+viewpoints. Yet, they are impractical because they typically require tedious
+laboratory conditions, are restricted to dark rooms, and are often
+multi-staged, making them subject to accumulated errors. To address these
+shortcomings, we propose an end-to-end uncalibrated multi-view PS framework for
+reconstructing high-resolution shapes acquired from sparse viewpoints in a
+real-world environment. We relax the dark room assumption, and allow a
+combination of static ambient lighting and dynamic near LED lighting, thereby
+enabling easy data capture outside the lab. Experimental validation confirms
+that it outperforms existing baseline approaches in the regime of sparse
+viewpoints by a large margin. This allows us to bring high-accuracy 3D
+reconstruction from the dark room to the real world, while maintaining a
+reasonable data capture complexity.
+
+
+ comment: Accepted in CVPR 2024 +
+
+
+
+
+ + ☆ DVIS-DAQ: Improving Video Segmentation via Dynamic Anchor Queries + + +
+ Modern video segmentation methods adopt object queries to perform inter-frame
+association and demonstrate satisfactory performance in tracking continuously
+appearing objects despite large-scale motion and transient occlusion.
+ However, they all underperform on newly emerging and disappearing objects
+that are common in the real world because they attempt to model object
+emergence and disappearance through feature transitions between background and
+foreground queries that have significant feature gaps. We introduce Dynamic
+Anchor Queries (DAQ) to shorten the transition gap between the anchor and
+target queries by dynamically generating anchor queries based on the features
+of potential candidates.
+ Furthermore, we introduce a query-level object Emergence and Disappearance
+Simulation (EDS) strategy, which unleashes DAQ's potential without any
+additional cost.
+ Finally, we combine our proposed DAQ and EDS with DVIS (Zhang et al., 2023)
+to obtain DVIS-DAQ.
+ Extensive experiments demonstrate that DVIS-DAQ achieves a new
+state-of-the-art (SOTA) performance on five mainstream video segmentation
+benchmarks. Code and models are available at
+https://github.com/SkyworkAI/DAQ-VS.
+
+
+
+
+
+ + ☆ Holo-VQVAE: VQ-VAE for phase-only holograms + + +
+ Holography stands at the forefront of visual technology innovation, offering +immersive, three-dimensional visualizations through the manipulation of light +wave amplitude and phase. Contemporary research in hologram generation has +predominantly focused on image-to-hologram conversion, producing holograms from +existing images. These approaches, while effective, inherently limit the scope +of innovation and creativity in hologram generation. In response to this +limitation, we present Holo-VQVAE, a novel generative framework tailored for +phase-only holograms (POHs). Holo-VQVAE leverages the architecture of Vector +Quantized Variational AutoEncoders, enabling it to learn the complex +distributions of POHs. Furthermore, it integrates the Angular Spectrum Method +into the training process, facilitating learning in the image domain. This +framework allows for the generation of unseen, diverse holographic content +directly from its intricately learned latent space without requiring +pre-existing images. This pioneering work paves the way for groundbreaking +applications and methodologies in holographic content creation, opening a new +era in the exploration of holographic content. + +
+
+
+
+
+ + ♻ ☆ LightGaussian: Unbounded 3D Gaussian Compression with 15x Reduction and + 200+ FPS + + +
+ Recent advancements in real-time neural rendering using point-based
+techniques have paved the way for the widespread adoption of 3D
+representations. However, foundational approaches like 3D Gaussian Splatting
+come with a substantial storage overhead caused by the SfM points growing into
+the millions, often demanding gigabyte-level disk space for a single unbounded
+scene, posing significant scalability challenges and hindering the splatting
+efficiency.
+ To address this challenge, we introduce LightGaussian, a novel method
+designed to transform 3D Gaussians into a more efficient and compact format.
+Drawing inspiration from the concept of Network Pruning, LightGaussian
+identifies Gaussians that contribute insignificantly to the scene
+reconstruction and adopts a pruning and recovery process, effectively reducing
+redundancy in Gaussian counts while preserving visual effects. Additionally,
+LightGaussian employs distillation and pseudo-view augmentation to distill
+spherical harmonics to a lower degree, allowing knowledge transfer to more
+compact representations while maintaining reflectance. Furthermore, we propose
+a hybrid scheme, VecTree Quantization, to quantize all attributes, resulting in
+lower bitwidth representations with minimal accuracy losses.
+ In summary, LightGaussian achieves an average compression rate of over 15x
+while boosting the FPS from 139 to 215, enabling an efficient representation of
+complex scenes on the Mip-NeRF 360 and Tanks and Temples datasets.
+ Project website: https://lightgaussian.github.io/
+
+
+ comment: 16pages, 8figures +
+
+
+
+
+
+ ♻ ☆ Gromov-Wasserstein-like Distances in the Gaussian Mixture Models Space
+
+
+
+ The Gromov-Wasserstein (GW) distance is frequently used in machine learning
+to compare distributions across distinct metric spaces. Despite its utility, it
+remains computationally intensive, especially for large-scale problems.
+Recently, a novel Wasserstein distance specifically tailored for Gaussian
+mixture models and known as MW (mixture Wasserstein) has been introduced by
+several authors. In scenarios where data exhibit clustering, this approach
+simplifies to a small-scale discrete optimal transport problem, whose
+complexity depends solely on the number of Gaussian components in the GMMs.
+This paper aims to extend MW by introducing new Gromov-type distances. These
+distances are designed to be isometry-invariant in Euclidean spaces and are
+applicable for comparing GMMs across different dimensional spaces. Our first
+contribution is the Mixture Gromov-Wasserstein distance (MGW), which can be
+viewed as a Gromovized version of MW. This new distance has a straightforward
+discrete formulation, making it highly efficient for estimating distances
+between GMMs in practical applications. To facilitate the derivation of a
+transport plan between GMMs, we present a second distance, the Embedded
+Wasserstein distance (EW). This distance turns out to be closely related to
+several recent alternatives to Gromov-Wasserstein. We show that EW can be
+adapted to derive a distance as well as optimal transportation plans between
+GMMs. We demonstrate the efficiency of these newly proposed distances on medium
+to large-scale problems, including shape matching and hyperspectral image color
+transfer.
+
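+ For context, the MW distance that the paper Gromovizes reduces to a small
+discrete optimal transport problem between mixture components, with the
+closed-form Gaussian 2-Wasserstein distance as ground cost. A sketch follows;
+it uses SciPy plus the POT package for the discrete OT step, and is not the
+paper's MGW or EW construction.
+
+import numpy as np
+from scipy.linalg import sqrtm
+
+def w2_gaussian_sq(m1, S1, m2, S2):
+    """Closed-form squared 2-Wasserstein distance between two Gaussians."""
+    root = sqrtm(S2)
+    cross = sqrtm(root @ S1 @ root)
+    return float(np.sum((m1 - m2) ** 2) + np.trace(S1 + S2 - 2.0 * np.real(cross)))
+
+def mw2(pi1, means1, covs1, pi2, means2, covs2):
+    """MW^2 between two GMMs: discrete OT over components with Gaussian W2^2 cost."""
+    import ot  # POT: pip install pot
+    C = np.array([[w2_gaussian_sq(m1, S1, m2, S2)
+                   for m2, S2 in zip(means2, covs2)]
+                  for m1, S1 in zip(means1, covs1)])
+    plan = ot.emd(pi1, pi2, C)
+    return float(np.sum(plan * C))
+
+pi1 = np.array([0.5, 0.5]); means1 = [np.zeros(2), np.ones(2)]; covs1 = [np.eye(2)] * 2
+pi2 = np.array([0.3, 0.7]); means2 = [np.full(2, 2.0), -np.ones(2)]; covs2 = [0.5 * np.eye(2)] * 2
+print(mw2(pi1, means1, covs1, pi2, means2, covs2))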
+
+ comment: preprint +
+
+
+
+
+ + ♻ ☆ Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation ICLR 2024 + + +
+ While Large Language Models (LLMs) are the dominant models for generative
+tasks in language, they do not perform as well as diffusion models on image and
+video generation. To effectively use LLMs for visual generation, one crucial
+component is the visual tokenizer that maps pixel-space inputs to discrete
+tokens appropriate for LLM learning. In this paper, we introduce MAGVIT-v2, a
+video tokenizer designed to generate concise and expressive tokens for both
+videos and images using a common token vocabulary. Equipped with this new
+tokenizer, we show that LLMs outperform diffusion models on standard image and
+video generation benchmarks including ImageNet and Kinetics. In addition, we
+demonstrate that our tokenizer surpasses the previously top-performing video
+tokenizer on two more tasks: (1) video compression comparable to the
+next-generation video codec (VVC) according to human evaluations, and (2)
+learning effective representations for action recognition tasks.
+
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ SERNet-Former: Semantic Segmentation by Efficient Residual Network with + Attention-Boosting Gates and Attention-Fusion Networks + + +
+ Improving the efficiency of state-of-the-art methods in semantic segmentation
+requires overcoming the increasing computational cost as well as issues such as
+fusing semantic information from global and local contexts. Based on the recent
+success and problems that convolutional neural networks (CNNs) encounter in
+semantic segmentation, this research proposes an encoder-decoder architecture
+with a unique efficient residual network, Efficient-ResNet. Attention-boosting
+gates (AbGs) and attention-boosting modules (AbMs) are deployed in the encoder,
+aiming to fuse the equivariant and feature-based semantic information with the
+equivalently sized global-context output of the efficient residual network.
+Correspondingly, the decoder network is developed with additional
+attention-fusion networks (AfNs) inspired by the AbMs. The AfNs are designed to
+improve the efficiency of the one-to-one conversion of semantic information by
+deploying additional convolution layers in the decoder part. Our network is
+tested on the challenging CamVid and Cityscapes datasets, and the proposed
+methods reveal significant improvements over residual network baselines. To the
+best of our knowledge, the developed network, SERNet-Former, achieves
+state-of-the-art results (84.62 % mean IoU) on the CamVid dataset and
+challenging results (87.35 % mean IoU) on the Cityscapes validation dataset.
+
+
+
+
+
+ + ♻ ☆ Learning to Count without Annotations CVPR'24 + + +
+ While recent supervised methods for reference-based object counting continue +to improve the performance on benchmark datasets, they have to rely on small +datasets due to the cost associated with manually annotating dozens of objects +in images. We propose UnCounTR, a model that can learn this task without +requiring any manual annotations. To this end, we construct "Self-Collages", +images with various pasted objects as training samples, that provide a rich +learning signal covering arbitrary object types and counts. Our method builds +on existing unsupervised representations and segmentation techniques to +successfully demonstrate for the first time the ability of reference-based +counting without manual supervision. Our experiments show that our method not +only outperforms simple baselines and generic models such as FasterRCNN and +DETR, but also matches the performance of supervised counting models in some +domains. + +
+
+ comment: Accepted at CVPR'24. Code available at + https://github.com/lukasknobel/SelfCollages +
+
+
+
+
+ + ♻ ☆ LipSim: A Provably Robust Perceptual Similarity Metric + + +
+ Recent years have seen growing interest in developing and applying perceptual +similarity metrics. Research has shown the superiority of perceptual metrics +over pixel-wise metrics in aligning with human perception and serving as a +proxy for the human visual system. On the other hand, as perceptual metrics +rely on neural networks, there is a growing concern regarding their resilience, +given the established vulnerability of neural networks to adversarial attacks. +It is indeed logical to infer that perceptual metrics may inherit both the +strengths and shortcomings of neural networks. In this work, we demonstrate the +vulnerability of state-of-the-art perceptual similarity metrics based on an +ensemble of ViT-based feature extractors to adversarial attacks. We then +propose a framework to train a robust perceptual similarity metric called +LipSim (Lipschitz Similarity Metric) with provable guarantees. By leveraging +1-Lipschitz neural networks as the backbone, LipSim provides guarded areas +around each data point and certificates for all perturbations within an +$\ell_2$ ball. Finally, a comprehensive set of experiments shows the +performance of LipSim in terms of natural and certified scores and on the image +retrieval application. The code is available at +https://github.com/SaraGhazanfari/LipSim. + +
+
+
+
+
+ + ♻ ☆ RNb-NeuS: Reflectance and Normal-based Multi-View 3D Reconstruction CVPR 2024 + + +
+ This paper introduces a versatile paradigm for integrating multi-view +reflectance (optional) and normal maps acquired through photometric stereo. Our +approach employs a pixel-wise joint re-parameterization of reflectance and +normal, considering them as a vector of radiances rendered under simulated, +varying illumination. This re-parameterization enables the seamless integration +of reflectance and normal maps as input data in neural volume rendering-based +3D reconstruction while preserving a single optimization objective. In +contrast, recent multi-view photometric stereo (MVPS) methods depend on +multiple, potentially conflicting objectives. Despite its apparent simplicity, +our proposed approach outperforms state-of-the-art approaches in MVPS +benchmarks across F-score, Chamfer distance, and mean angular error metrics. +Notably, it significantly improves the detailed 3D reconstruction of areas with +high curvature or low visibility. + +
+
+ comment: 14 pages, 13 figures, 7 tables. Accepted to CVPR 2024. The project + page can be accessed via + https://robinbruneau.github.io/publications/rnb_neus.html. The source code is + available at https://github.com/bbrument/RNb-NeuS +
+
+
+
+
+ + ♻ ☆ A Strong Baseline for Point Cloud Registration via Direct Superpoints + Matching + + +
+ Deep neural networks endow the downsampled superpoints with highly
+discriminative feature representations. Previous dominant point cloud
+registration approaches match these feature representations as the first step,
+e.g., using the Sinkhorn algorithm. A RANSAC-like method is then usually
+adopted as a post-processing refinement to filter the outliers. Another
+dominant approach is to directly predict the superpoint matchings using learned
+MLP layers. Both of them have drawbacks: RANSAC-based methods are
+computationally intensive and prediction-based methods suffer from outputting
+non-existing points in the point cloud. In this paper, we propose a
+straightforward and effective baseline to find correspondences of superpoints
+in a global matching manner. We employ the normalized matching scores as
+weights for each correspondence, allowing us to reject the outliers and weigh
+the remaining inliers when fitting the transformation matrix, without relying
+on the cumbersome RANSAC. Moreover, the entire model can be trained in an
+end-to-end fashion, leading to better accuracy. Our simple yet effective
+baseline shows comparable or even better results than state-of-the-art methods
+on three datasets including ModelNet, 3DMatch, and KITTI. We do not advocate
+our approach to be the solution for point cloud registration but use the
+results to emphasize the role of the matching strategy for point cloud
+registration. The code and models are available at
+https://github.com/neu-vi/Superpoints_Registration.
+
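+ The RANSAC-free fitting step described above boils down to a weighted
+least-squares rigid alignment; a minimal sketch of weighted Kabsch/SVD fitting
+is given below, where the normalized matching scores would serve as the weights
+(here uniform weights and synthetic points are used).
+
+import numpy as np
+
+def weighted_rigid_fit(src, dst, w):
+    """Estimate R, t minimising sum_i w_i * ||R @ src_i + t - dst_i||^2."""
+    w = w / w.sum()
+    mu_s = (w[:, None] * src).sum(axis=0)
+    mu_d = (w[:, None] * dst).sum(axis=0)
+    H = (src - mu_s).T @ (w[:, None] * (dst - mu_d))   # weighted cross-covariance
+    U, _, Vt = np.linalg.svd(H)
+    D = np.diag([1.0, 1.0, np.sign(np.linalg.det(Vt.T @ U.T))])  # avoid reflections
+    R = Vt.T @ D @ U.T
+    t = mu_d - R @ mu_s
+    return R, t
+
+rng = np.random.default_rng(0)
+src = rng.normal(size=(100, 3))
+theta = 0.3
+R_true = np.array([[np.cos(theta), -np.sin(theta), 0.0],
+                   [np.sin(theta),  np.cos(theta), 0.0],
+                   [0.0, 0.0, 1.0]])
+dst = src @ R_true.T + np.array([0.5, -1.0, 2.0])
+R, t = weighted_rigid_fit(src, dst, w=np.ones(100))
+print(np.allclose(R, R_true, atol=1e-6))  # True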
+
+
+
+
+ + ♻ ☆ VicTR: Video-conditioned Text Representations for Activity Recognition CVPR 2024 + + +
+ Vision-Language models (VLMs) have excelled in the image-domain -- especially +in zero-shot settings -- thanks to the availability of vast pretraining data +(i.e., paired image-text samples). However for videos, such paired data is not +as abundant. Therefore, video-VLMs are usually designed by adapting pretrained +image-VLMs to the video-domain, instead of training from scratch. All such +recipes rely on augmenting visual embeddings with temporal information (i.e., +image $\rightarrow$ video), often keeping text embeddings unchanged or even +being discarded. In this paper, we argue the contrary, that better video-VLMs +can be designed by focusing more on augmenting text, rather than visual +information. More specifically, we introduce Video-conditioned Text +Representations (VicTR): a form of text embeddings optimized w.r.t. visual +embeddings, creating a more-flexible contrastive latent space. Our model can +further make use of freely-available semantic information, in the form of +visually-grounded auxiliary text (e.g. object or scene information). We +evaluate our model on few-shot, zero-shot (HMDB-51, UCF-101), short-form +(Kinetics-400) and long-form (Charades) activity recognition benchmarks, +showing strong performance among video-VLMs. + +
+
+ comment: To appear at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Towards Low-Energy Adaptive Personalization for Resource-Constrained + Devices + + +
+ The personalization of machine learning (ML) models to address data drift is
+a significant challenge in the context of Internet of Things (IoT)
+applications. Presently, most approaches focus on fine-tuning either the full
+base model or its last few layers to adapt to new data, while often neglecting
+energy costs. However, various types of data drift exist, and fine-tuning the
+full base model or the last few layers may not result in optimal performance in
+certain scenarios. We propose Target Block Fine-Tuning (TBFT), a low-energy
+adaptive personalization framework designed for resource-constrained devices.
+We categorize data drift and personalization into three types: input-level,
+feature-level, and output-level. For each type, we fine-tune different blocks
+of the model to achieve optimal performance with reduced energy costs.
+Specifically, the input, feature, and output levels correspond to fine-tuning
+the front, middle, and rear blocks of the model. We evaluate TBFT on a ResNet
+model, three datasets, three different training sizes, and a Raspberry Pi.
+Compared with a Block Avg baseline, where each block is fine-tuned individually
+and the performance improvements are averaged, TBFT improves model accuracy by
+an average of 15.30%, whilst saving 41.57% energy consumption on average
+compared with full fine-tuning.
+
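+ A minimal sketch of what block-targeted fine-tuning looks like in practice:
+freeze everything, then unfreeze only the block matched to the drift type (a
+middle block for feature-level drift in this example). The choice of layer2 and
+of keeping the classifier head trainable are assumptions, not the paper's exact
+recipe.
+
+import torch
+from torchvision.models import resnet18
+
+model = resnet18(num_classes=10)
+
+# Freeze all parameters, then unfreeze only the target (middle) block
+# and the classification head.
+for p in model.parameters():
+    p.requires_grad = False
+for p in model.layer2.parameters():
+    p.requires_grad = True
+for p in model.fc.parameters():
+    p.requires_grad = True
+
+trainable = [p for p in model.parameters() if p.requires_grad]
+optimizer = torch.optim.SGD(trainable, lr=1e-3, momentum=0.9)
+print(sum(p.numel() for p in trainable), "trainable parameters")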
+
+ comment: Accepetd to The 4th Workshop on Machine Learning and Systems + (EuroMLSys '24) +
+
+
+
+
+ + ♻ ☆ GlitchBench: Can large multimodal models detect video game glitches? CVPR 2024 + + +
+ Large multimodal models (LMMs) have evolved from large language models (LLMs) +to integrate multiple input modalities, such as visual inputs. This integration +augments the capacity of LLMs for tasks requiring visual comprehension and +reasoning. However, the extent and limitations of their enhanced abilities are +not fully understood, especially when it comes to real-world tasks. To address +this gap, we introduce GlitchBench, a novel benchmark derived from video game +quality assurance tasks, to test and evaluate the reasoning capabilities of +LMMs. Our benchmark is curated from a variety of unusual and glitched scenarios +from video games and aims to challenge both the visual and linguistic reasoning +powers of LMMs in detecting and interpreting out-of-the-ordinary events. We +evaluate multiple state-of-the-art LMMs, and we show that GlitchBench presents +a new challenge for these models. Code and data are available at: +https://glitchbench.github.io/ + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Rapid Motor Adaptation for Robotic Manipulator Arms CVPR 2024 + + +
+ Developing generalizable manipulation skills is a core challenge in embodied +AI. This includes generalization across diverse task configurations, +encompassing variations in object shape, density, friction coefficient, and +external disturbances such as forces applied to the robot. Rapid Motor +Adaptation (RMA) offers a promising solution to this challenge. It posits that +essential hidden variables influencing an agent's task performance, such as +object mass and shape, can be effectively inferred from the agent's action and +proprioceptive history. Drawing inspiration from RMA in locomotion and in-hand +rotation, we use depth perception to develop agents tailored for rapid motor +adaptation in a variety of manipulation tasks. We evaluated our agents on four +challenging tasks from the Maniskill2 benchmark, namely pick-and-place +operations with hundreds of objects from the YCB and EGAD datasets, peg +insertion with precise position and orientation, and operating a variety of +faucets and handles, with customized environment variations. Empirical results +demonstrate that our agents surpass state-of-the-art methods like automatic +domain randomization and vision-based policies, obtaining better generalization +performance and sample efficiency. + +
+
+ comment: Accepted at CVPR 2024. 12 pages +
+
+
+
+
+ + ♻ ☆ FlashAvatar: High-fidelity Head Avatar with Efficient Gaussian Embedding + + +
+ We propose FlashAvatar, a novel and lightweight 3D animatable avatar +representation that could reconstruct a digital avatar from a short monocular +video sequence in minutes and render high-fidelity photo-realistic images at +300FPS on a consumer-grade GPU. To achieve this, we maintain a uniform 3D +Gaussian field embedded in the surface of a parametric face model and learn +extra spatial offset to model non-surface regions and subtle facial details. +While full use of geometric priors can capture high-frequency facial details +and preserve exaggerated expressions, proper initialization can help reduce the +number of Gaussians, thus enabling super-fast rendering speed. Extensive +experimental results demonstrate that FlashAvatar outperforms existing works +regarding visual quality and personalized details and is almost an order of +magnitude faster in rendering speed. Project page: +https://ustc3dv.github.io/FlashAvatar/ + +
+
+ comment: Project page: https://ustc3dv.github.io/FlashAvatar/ +
+
+
+
+
+ + ♻ ☆ Joint chest X-ray diagnosis and clinical visual attention prediction + with multi-stage cooperative learning: enhancing interpretability + + +
+ As deep learning has become the state-of-the-art for computer-assisted
+diagnosis, interpretability of the automatic decisions is crucial for clinical
+deployment. While various methods have been proposed in this domain, visual
+attention maps of clinicians during radiological screening offer a unique asset
+to provide important insights and can potentially enhance the quality of
+computer-assisted diagnosis. With this paper, we introduce a novel
+deep-learning framework for joint disease diagnosis and prediction of
+corresponding visual saliency maps for chest X-ray scans. Specifically, we
+designed a novel dual-encoder multi-task UNet, which leverages both a
+DenseNet201 backbone and a Residual and Squeeze-and-Excitation block-based
+encoder to extract diverse features for saliency map prediction, and a
+multi-scale feature-fusion classifier to perform disease classification. To
+tackle the issue of asynchronous training schedules of individual tasks in
+multi-task learning, we proposed a multi-stage cooperative learning strategy,
+with contrastive learning for feature encoder pretraining to boost performance.
+Experiments show that our proposed method outperformed existing techniques in
+chest X-ray diagnosis and in the quality of visual saliency map prediction.
+
+
+
+
+
+ + ♻ ☆ EAGLE: Eigen Aggregation Learning for Object-Centric Unsupervised + Semantic Segmentation + + +
+ Semantic segmentation has innately relied on extensive pixel-level annotated +data, leading to the emergence of unsupervised methodologies. Among them, +leveraging self-supervised Vision Transformers for unsupervised semantic +segmentation (USS) has been making steady progress with expressive deep +features. Yet, for semantically segmenting images with complex objects, a +predominant challenge remains: the lack of explicit object-level semantic +encoding in patch-level features. This technical limitation often leads to +inadequate segmentation of complex objects with diverse structures. To address +this gap, we present a novel approach, EAGLE, which emphasizes object-centric +representation learning for unsupervised semantic segmentation. Specifically, +we introduce EiCue, a spectral technique providing semantic and structural cues +through an eigenbasis derived from the semantic similarity matrix of deep image +features and color affinity from an image. Further, by incorporating our +object-centric contrastive loss with EiCue, we guide our model to learn +object-level representations with intra- and inter-image object-feature +consistency, thereby enhancing semantic accuracy. Extensive experiments on +COCO-Stuff, Cityscapes, and Potsdam-3 datasets demonstrate the state-of-the-art +USS results of EAGLE with accurate and consistent semantic segmentation across +complex scenes. + +
+
+
+
+
+ + ♻ ☆ Descriptor and Word Soups: Overcoming the Parameter Efficiency Accuracy + Tradeoff for Out-of-Distribution Few-shot Learning + + +
+ Over the past year, a large body of multimodal research has emerged around
+zero-shot evaluation using GPT descriptors. These studies boost the zero-shot
+accuracy of pretrained VL models with an ensemble of label-specific text
+generated by GPT. A recent study, WaffleCLIP, demonstrated that similar
+zero-shot accuracy can be achieved with an ensemble of random descriptors.
+However, both zero-shot methods are untrainable and consequently sub-optimal
+when some few-shot out-of-distribution (OOD) training data is available.
+Inspired by these prior works, we present two more flexible methods called
+descriptor and word soups, which do not require an LLM at test time and can
+leverage training data to increase OOD target accuracy. Descriptor soup
+greedily selects a small set of textual descriptors using generic few-shot
+training data, then calculates robust class embeddings using the selected
+descriptors. Word soup greedily assembles a chain of words in a similar manner.
+Compared to existing few-shot soft prompt tuning methods, word soup requires
+fewer parameters by construction and less GPU memory, since it does not require
+backpropagation. Both soups outperform current published few-shot methods, even
+when combined with SoTA zero-shot methods, on cross-dataset and domain
+generalization benchmarks. Compared with SoTA prompt and descriptor ensembling
+methods, such as ProDA and WaffleCLIP, word soup achieves higher OOD accuracy
+with fewer ensemble members. Please check out our code:
+github.com/Chris210634/word_soups
+
+
+
+
+
+ + ♻ ☆ LifelongMemory: Leveraging LLMs for Answering Queries in Long-form + Egocentric Videos + + +
+ In this paper, we introduce LifelongMemory, a new framework for accessing
+long-form egocentric videographic memory through natural language question
+answering and retrieval. LifelongMemory generates concise video activity
+descriptions of the camera wearer and leverages the zero-shot capabilities of
+pretrained large language models to perform reasoning over long-form video
+context. Furthermore, LifelongMemory uses a confidence and explanation module
+to produce confident, high-quality, and interpretable answers. Our approach
+achieves state-of-the-art performance on the EgoSchema benchmark for question
+answering and is highly competitive on the natural language query (NLQ)
+challenge of Ego4D. Code is available at
+https://github.com/Agentic-Learning-AI-Lab/lifelong-memory.
+
+
+
+
+
+ + ♻ ☆ DialogCC: An Automated Pipeline for Creating High-Quality Multi-Modal + Dialogue Dataset NAACL 2024 + + +
+ As sharing images in instant messaging plays a crucial role in communication,
+there has been active research on learning image-text multi-modal dialogue
+models. However, training a well-generalized multi-modal dialogue model remains
+challenging due to the low quality and limited diversity of images per dialogue
+in existing multi-modal dialogue datasets. In this paper, we propose an
+automated pipeline to construct a multi-modal dialogue dataset, ensuring both
+dialogue quality and image diversity with minimal human effort. In our
+pipeline, to guarantee the coherence between images and dialogue, we prompt
+GPT-4 to infer potential image-sharing moments - specifically, the utterance,
+speaker, rationale, and image description. Furthermore, we leverage CLIP
+similarity to maintain consistency between the multiple images aligned to each
+utterance. Through this pipeline, we introduce DialogCC, a high-quality and
+diverse multi-modal dialogue dataset that surpasses existing datasets in terms
+of quality and diversity in human evaluation. Our comprehensive experiments
+highlight that when multi-modal dialogue models are trained using our dataset,
+their generalization performance on unseen dialogue datasets is significantly
+enhanced. We make our source code and dataset publicly available.
+
+
+ comment: NAACL 2024 +
+
+
+
+
+ + ♻ ☆ 3DInAction: Understanding Human Actions in 3D Point Clouds + + +
+ We propose a novel method for 3D point cloud action recognition. +Understanding human actions in RGB videos has been widely studied in recent +years, however, its 3D point cloud counterpart remains under-explored. This is +mostly due to the inherent limitation of the point cloud data modality -- lack +of structure, permutation invariance, and varying number of points -- which +makes it difficult to learn a spatio-temporal representation. To address this +limitation, we propose the 3DinAction pipeline that first estimates patches +moving in time (t-patches) as a key building block, alongside a hierarchical +architecture that learns an informative spatio-temporal representation. We show +that our method achieves improved performance on existing datasets, including +DFAUST and IKEA ASM. Code is publicly available at +https://github.com/sitzikbs/3dincaction. + +
+
+
+
+
+ + ♻ ☆ DragVideo: Interactive Drag-style Video Editing + + +
+ Video generation models have shown their superior ability to generate
+photo-realistic video. However, how to accurately control (or edit) the video
+remains a formidable challenge. The main issues are: 1) how to perform direct
+and accurate user control in editing; 2) how to perform edits such as changing
+shape, expression, and layout without unsightly distortion and artifacts to the
+edited content; and 3) how to maintain spatio-temporal consistency of the video
+after editing. To address the above issues, we propose DragVideo, a general
+drag-style video editing framework. Inspired by DragGAN, DragVideo addresses
+issues 1) and 2) with a drag-style video latent optimization method, which
+provides the desired control by updating the noisy video latent according to
+drag instructions through a video-level drag objective function. We address
+issue 3) by integrating the video diffusion model with sample-specific LoRA and
+Mutual Self-Attention in DragVideo to ensure the edited result is
+spatio-temporally consistent. We also present a series of testing examples for
+drag-style video editing and conduct extensive experiments across a wide array
+of challenging editing tasks, such as motion and skeleton editing, underscoring
+that DragVideo can edit videos intuitively and faithfully to the user's
+intention, with nearly unnoticeable distortion and artifacts, while maintaining
+spatio-temporal consistency. Whereas traditional prompt-based video editing
+fails at the first two and directly applying image drag editing fails at the
+last, these results highlight DragVideo's versatility and generality. GitHub
+link: https://github.com/RickySkywalker/DragVideo-Official.
+
+
+
+
+
+ + ♻ ☆ Self-learning Canonical Space for Multi-view 3D Human Pose Estimation + + +
+ Multi-view 3D human pose estimation is naturally superior to single-view
+estimation, benefiting from more comprehensive information provided by images
+of multiple views. The information includes camera poses, 2D/3D human poses,
+and 3D geometry. However, accurate annotations of this information are hard to
+obtain, making it challenging to predict accurate 3D human pose from multi-view
+images. To deal with this issue, we propose a fully self-supervised framework,
+named cascaded multi-view aggregating network (CMANet), to construct a
+canonical parameter space to holistically integrate and exploit multi-view
+information. In our framework, the multi-view information is grouped into two
+categories: 1) intra-view information and 2) inter-view information.
+Accordingly, CMANet consists of two components: an intra-view module (IRV) and
+an inter-view module (IEV). IRV extracts the initial camera pose and 3D human
+pose of each view; IEV fuses complementary pose information and cross-view 3D
+geometry into a final 3D human pose. To facilitate the aggregation of intra-
+and inter-view information, we define a canonical parameter space, described by
+the per-view camera pose and the human pose and shape parameters ($\theta$ and
+$\beta$) of the SMPL model, and propose a two-stage learning procedure. In the
+first stage, IRV learns to estimate the camera pose and view-dependent 3D human
+pose supervised by the confident outputs of an off-the-shelf 2D keypoint
+detector. In the second stage, IRV is frozen and IEV further refines the camera
+pose and optimizes the 3D human pose by implicitly encoding the cross-view
+complement and 3D geometry constraint, achieved by jointly fitting predicted
+multi-view 2D keypoints. The proposed framework, modules, and learning strategy
+are demonstrated to be effective by comprehensive experiments, and CMANet is
+superior to state-of-the-art methods in extensive quantitative and qualitative
+analysis.
+
+
+
+
+
+ + ♻ ☆ NOPE: Novel Object Pose Estimation from a Single Image CVPR 2024 + + +
+ The practicality of 3D object pose estimation remains limited for many +applications due to the need for prior knowledge of a 3D model and a training +period for new objects. To address this limitation, we propose an approach that +takes a single image of a new object as input and predicts the relative pose of +this object in new images without prior knowledge of the object's 3D model and +without requiring training time for new objects and categories. We achieve this +by training a model to directly predict discriminative embeddings for +viewpoints surrounding the object. This prediction is done using a simple U-Net +architecture with attention and conditioned on the desired pose, which yields +extremely fast inference. We compare our approach to state-of-the-art methods +and show it outperforms them both in terms of accuracy and robustness. Our +source code is publicly available at https://github.com/nv-nguyen/nope + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Gradient Reweighting: Towards Imbalanced Class-Incremental Learning CVPR 2024 + + +
+ Class-Incremental Learning (CIL) trains a model to continually recognize new +classes from non-stationary data while retaining learned knowledge. A major +challenge of CIL arises when applying to real-world data characterized by +non-uniform distribution, which introduces a dual imbalance problem involving +(i) disparities between stored exemplars of old tasks and new class data +(inter-phase imbalance), and (ii) severe class imbalances within each +individual task (intra-phase imbalance). We show that this dual imbalance issue +causes skewed gradient updates with biased weights in FC layers, thus inducing +over/under-fitting and catastrophic forgetting in CIL. Our method addresses it +by reweighting the gradients towards balanced optimization and unbiased +classifier learning. Additionally, we observe imbalanced forgetting where +paradoxically the instance-rich classes suffer higher performance degradation +during CIL due to a larger amount of training data becoming unavailable in +subsequent learning phases. To tackle this, we further introduce a +distribution-aware knowledge distillation loss to mitigate forgetting by +aligning output logits proportionally with the distribution of lost training +data. We validate our method on CIFAR-100, ImageNetSubset, and Food101 across +various evaluation protocols and demonstrate consistent improvements compared +to existing works, showing great potential to apply CIL in real-world scenarios +with enhanced robustness and effectiveness. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SHINOBI: Shape and Illumination using Neural Object Decomposition via + BRDF Optimization In-the-wild CVPR 2024 + + +
+ We present SHINOBI, an end-to-end framework for the reconstruction of shape, +material, and illumination from object images captured with varying lighting, +pose, and background. Inverse rendering of an object based on unconstrained +image collections is a long-standing challenge in computer vision and graphics +and requires a joint optimization over shape, radiance, and pose. We show that +an implicit shape representation based on a multi-resolution hash encoding +enables faster and robust shape reconstruction with joint camera alignment +optimization that outperforms prior work. Further, to enable the editing of +illumination and object reflectance (i.e. material) we jointly optimize BRDF +and illumination together with the object's shape. Our method is class-agnostic +and works on in-the-wild image collections of objects to produce relightable 3D +assets for several use cases such as AR/VR, movies, games, etc. Project page: +https://shinobi.aengelhardt.com Video: +https://www.youtube.com/watch?v=iFENQ6AcYd8&feature=youtu.be + +
+
+ comment: Accepted by IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR 2024). Updated supplementary material and acknowledgements +
+
+
+
+
+ + ♻ ☆ Task2Box: Box Embeddings for Modeling Asymmetric Task Relationships + + +
+ Modeling and visualizing relationships between tasks or datasets is an
+important step towards solving various meta-tasks such as dataset discovery,
+multi-tasking, and transfer learning. However, many relationships, such as
+containment and transferability, are naturally asymmetric and current
+approaches for representation and visualization (e.g., t-SNE) do not readily
+support this. We propose Task2Box, an approach to represent tasks using box
+embeddings -- axis-aligned hyperrectangles in low dimensional spaces -- that
+can capture asymmetric relationships between them through volumetric overlaps.
+We show that Task2Box accurately predicts unseen hierarchical relationships
+between nodes in ImageNet and iNaturalist datasets, as well as transferability
+between tasks in the Taskonomy benchmark. We also show that box embeddings
+estimated from task representations (e.g., CLIP, Task2Vec, or attribute based)
+can be used to predict relationships between unseen tasks more accurately than
+classifiers trained on the same representations, as well as handcrafted
+asymmetric distances (e.g., KL divergence). This suggests that low-dimensional
+box embeddings can effectively capture these task relationships and have the
+added advantage of being interpretable. We use the approach to visualize
+relationships among publicly available image classification datasets on the
+popular dataset hosting platform Hugging Face.
+
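+
+ A toy illustration of how axis-aligned box embeddings can encode asymmetric
+relations: the fraction of one box's volume covered by another is large in one
+direction and small in the other. The containment ratio and function names
+below are a generic box-embedding score, not necessarily the exact formulation
+learned by Task2Box:
+<pre>
+import numpy as np
+
+def box_volume(lo, hi):
+    # Volume of an axis-aligned box; degenerate boxes get volume 0.
+    return float(np.prod(np.clip(hi - lo, 0.0, None)))
+
+def containment_score(lo_a, hi_a, lo_b, hi_b):
+    """Asymmetric score: fraction of box A's volume lying inside box B."""
+    inter_lo = np.maximum(lo_a, lo_b)
+    inter_hi = np.minimum(hi_a, hi_b)
+    vol_a = box_volume(lo_a, hi_a)
+    return box_volume(inter_lo, inter_hi) / vol_a if vol_a > 0 else 0.0
+
+# 2-D example: task A's box sits inside task B's box, but not vice versa.
+lo_a, hi_a = np.array([0.2, 0.2]), np.array([0.4, 0.4])
+lo_b, hi_b = np.array([0.0, 0.0]), np.array([1.0, 1.0])
+print(containment_score(lo_a, hi_a, lo_b, hi_b))  # 1.0  -> A contained in B
+print(containment_score(lo_b, hi_b, lo_a, hi_a))  # 0.04 -> B not contained in A
+</pre>
+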
+
+
+
+
+ + ♻ ☆ LoCoNet: Long-Short Context Network for Active Speaker Detection CVPR 2024 + + +
+ Active Speaker Detection (ASD) aims to identify who is speaking in each frame
+of a video. ASD reasons over audio and visual information from two contexts:
+long-term intra-speaker context and short-term inter-speaker context. Long-term
+intra-speaker context models the temporal dependencies of the same speaker,
+while short-term inter-speaker context models the interactions of speakers in
+the same scene. These two contexts are complementary to each other and can help
+infer the active speaker. Motivated by these observations, we propose LoCoNet,
+a simple yet effective Long-Short Context Network that models the long-term
+intra-speaker context and short-term inter-speaker context. We use
+self-attention to model long-term intra-speaker context due to its
+effectiveness in modeling long-range dependencies, and convolutional blocks
+that capture local patterns to model short-term inter-speaker context.
+Extensive experiments show that LoCoNet achieves state-of-the-art performance
+on multiple datasets, achieving an mAP of 95.2%(+1.1%) on AVA-ActiveSpeaker,
+68.1%(+22%) on the Columbia dataset, 97.2%(+2.8%) on the Talkies dataset and
+59.7%(+8.0%) on the Ego4D dataset. Moreover, in challenging cases where
+multiple speakers are present, or the face of the active speaker is much
+smaller than other faces in the same scene, LoCoNet outperforms previous
+state-of-the-art methods by 3.4% on the AVA-ActiveSpeaker dataset. The code
+will be released at https://github.com/SJTUwxz/LoCoNet_ASD.
+
+
+ comment: accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Tiny Machine Learning: Progress and Futures + + +
+ Tiny Machine Learning (TinyML) is a new frontier of machine learning. By +squeezing deep learning models into billions of IoT devices and +microcontrollers (MCUs), we expand the scope of AI applications and enable +ubiquitous intelligence. However, TinyML is challenging due to hardware +constraints: the tiny memory resource makes it difficult to hold deep learning +models designed for cloud and mobile platforms. There is also limited compiler +and inference engine support for bare-metal devices. Therefore, we need to +co-design the algorithm and system stack to enable TinyML. In this review, we +will first discuss the definition, challenges, and applications of TinyML. We +then survey the recent progress in TinyML and deep learning on MCUs. Next, we +will introduce MCUNet, showing how we can achieve ImageNet-scale AI +applications on IoT devices with system-algorithm co-design. We will further +extend the solution from inference to training and introduce tiny on-device +training techniques. Finally, we present future directions in this area. +Today's large model might be tomorrow's tiny model. The scope of TinyML should +evolve and adapt over time. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2206.15472 +
+
+
+
+
+ + ♻ ☆ FocusMAE: Gallbladder Cancer Detection from Ultrasound Videos with + Focused Masked Autoencoders CVPR 2024 + + +
+ In recent years, automated Gallbladder Cancer (GBC) detection has gained the
+attention of researchers. Current state-of-the-art (SOTA) methodologies relying
+on ultrasound sonography (US) images exhibit limited generalization,
+emphasizing the need for transformative approaches. We observe that individual
+US frames may lack sufficient information to capture disease manifestation.
+This study advocates for a paradigm shift towards video-based GBC detection,
+leveraging the inherent advantages of spatiotemporal representations. Employing
+the Masked Autoencoder (MAE) for representation learning, we address
+shortcomings in conventional image-based methods. We propose a novel design
+called FocusMAE to systematically bias the selection of masking tokens from
+high-information regions, fostering a more refined representation of
+malignancy. Additionally, we contribute the most extensive US video dataset for
+GBC detection. We also note that this is the first study on US video-based GBC
+detection. We validate the proposed methods on the curated dataset, and report
+a new state-of-the-art (SOTA) accuracy of 96.4% for the GBC detection problem,
+against an accuracy of 84% by the current image-based SOTA methods, GBCNet and
+RadFormer, and 94.7% by the video-based SOTA, AdaMAE. We further demonstrate
+the generality of the proposed FocusMAE on a public CT-based Covid detection
+dataset, reporting an improvement in accuracy by 3.3% over current baselines.
+The source code and pretrained models are available at:
+https://gbc-iitd.github.io/focusmae
+
+
+ comment: To Appear at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Explaining latent representations of generative models with large + multimodal models ICLR 2024 + + +
+ Learning interpretable representations of data-generative latent factors is
+an important topic for the development of artificial intelligence. Large
+multimodal models, which have recently risen to prominence, can align images
+with text to generate answers. In this work, we propose a framework to
+comprehensively explain each latent variable in generative models using a large
+multimodal model. We further measure the uncertainty of our generated
+explanations, quantitatively evaluate the performance of explanation generation
+among multiple large multimodal models, and qualitatively visualize the
+variations of each latent variable to learn the disentanglement effects of
+different generative models on explanations. Finally, we discuss the
+explanatory capabilities and limitations of state-of-the-art large multimodal
+models.
+
+
+ comment: ICLR 2024 Workshop Paper on Reliable and Responsible Foundation + Models +
+
+
+
+
+ + ♻ ☆ Versatile Medical Image Segmentation Learned from Multi-Source Datasets + via Model Self-Disambiguation + + +
+ A versatile medical image segmentation model applicable to images acquired +with diverse equipment and protocols can facilitate model deployment and +maintenance. However, building such a model typically demands a large, diverse, +and fully annotated dataset, which is challenging to obtain due to the +labor-intensive nature of data curation. To address this challenge, we propose +a cost-effective alternative that harnesses multi-source data with only partial +or sparse segmentation labels for training, substantially reducing the cost of +developing a versatile model. We devise strategies for model +self-disambiguation, prior knowledge incorporation, and imbalance mitigation to +tackle challenges associated with inconsistently labeled multi-source data, +including label ambiguity and modality, dataset, and class imbalances. +Experimental results on a multi-modal dataset compiled from eight different +sources for abdominal structure segmentation have demonstrated the +effectiveness and superior performance of our method compared to +state-of-the-art alternative approaches. We anticipate that its cost-saving +features, which optimize the utilization of existing annotated data and reduce +annotation efforts for new data, will have a significant impact in the field. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Euclidean and Affine Curve Reconstruction + + +
+ We consider practical aspects of reconstructing planar curves with prescribed +Euclidean or affine curvatures. These curvatures are invariant under the +special Euclidean group and the equi-affine groups, respectively, and play an +important role in computer vision and shape analysis. We discuss and implement +algorithms for such reconstruction, and give estimates on how close +reconstructed curves are relative to the closeness of their curvatures in +appropriate metrics. Several illustrative examples are provided. + +
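+
+ For the Euclidean case, the reconstruction follows the classical recipe:
+integrate the curvature to obtain the tangent angle, then integrate the unit
+tangent to obtain the curve, which determines it up to a rigid motion (the
+equi-affine case is more involved and not shown). A minimal numerical sketch,
+with function and variable names chosen here for illustration:
+<pre>
+import numpy as np
+
+def reconstruct_from_curvature(kappa, ds, theta0=0.0, p0=(0.0, 0.0)):
+    """Recover a planar curve from Euclidean curvature samples `kappa` taken
+    at uniform arc-length steps `ds`, by integrating theta' = kappa and
+    (x', y') = (cos theta, sin theta)."""
+    theta = theta0 + np.cumsum(kappa) * ds
+    x = p0[0] + np.cumsum(np.cos(theta)) * ds
+    y = p0[1] + np.cumsum(np.sin(theta)) * ds
+    return np.stack([x, y], axis=1)
+
+# Constant curvature 1/r should reproduce (approximately) a circle of radius r.
+r, n = 2.0, 2000
+s = np.linspace(0.0, 2.0 * np.pi * r, n)
+points = reconstruct_from_curvature(np.full(n, 1.0 / r), ds=s[1] - s[0])
+</pre>
+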
+
+ comment: This paper is a result of an REU project conducted at the North + Carolina State University in the Summer and Fall 2020. This version has + several minor corrections +
+
+
+
+
+ + ♻ ☆ AVID: Any-Length Video Inpainting with Diffusion Model + + +
+ Recent advances in diffusion models have successfully enabled text-guided +image inpainting. While it seems straightforward to extend such editing +capability into the video domain, there have been fewer works regarding +text-guided video inpainting. Given a video, a masked region at its initial +frame, and an editing prompt, it requires a model to do infilling at each frame +following the editing guidance while keeping the out-of-mask region intact. +There are three main challenges in text-guided video inpainting: ($i$) temporal +consistency of the edited video, ($ii$) supporting different inpainting types +at different structural fidelity levels, and ($iii$) dealing with variable +video length. To address these challenges, we introduce Any-Length Video +Inpainting with Diffusion Model, dubbed as AVID. At its core, our model is +equipped with effective motion modules and adjustable structure guidance, for +fixed-length video inpainting. Building on top of that, we propose a novel +Temporal MultiDiffusion sampling pipeline with a middle-frame attention +guidance mechanism, facilitating the generation of videos with any desired +duration. Our comprehensive experiments show our model can robustly deal with +various inpainting types at different video duration ranges, with high quality. +More visualization results are made publicly available at +https://zhang-zx.github.io/AVID/ . + +
+
+ comment: Project website: https://zhang-zx.github.io/AVID/ +
+
+
+
+
+ + ♻ ☆ Semi-Mamba-UNet: Pixel-Level Contrastive and Pixel-Level + Cross-Supervised Visual Mamba-based UNet for Semi-Supervised Medical Image + Segmentation + + +
+ Medical image segmentation is essential in diagnostics, treatment planning,
+and healthcare, with deep learning offering promising advancements. Notably,
+Convolutional Neural Networks (CNNs) excel at capturing local image features,
+whereas Vision Transformers (ViTs) adeptly model long-range dependencies
+through multi-head self-attention mechanisms. Despite their strengths, both
+CNNs and ViTs face challenges in efficiently processing long-range dependencies
+within medical images, often requiring substantial computational resources.
+This issue, combined with the high cost and limited availability of expert
+annotations, poses significant obstacles to achieving precise segmentation. To
+address these challenges, this paper introduces Semi-Mamba-UNet, which
+integrates a visual Mamba-based UNet architecture with a conventional UNet into
+a semi-supervised learning (SSL) framework. This innovative SSL approach
+leverages dual networks to jointly generate pseudo labels and cross-supervise
+each other, drawing inspiration from consistency regularization techniques.
+Furthermore, we introduce a self-supervised pixel-level contrastive learning
+strategy, employing a projector pair to further enhance feature learning
+capabilities. Our comprehensive evaluation on a publicly available MRI cardiac
+segmentation dataset, comparing against various SSL frameworks with different
+UNet-based segmentation networks, highlights the superior performance of
+Semi-Mamba-UNet. The source code has been made publicly accessible.
+
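+
+ The cross-supervision ingredient of such an SSL framework can be sketched
+generically: each of the two networks is trained on the other's hard pseudo
+labels, plus an ordinary supervised term on the labelled subset. This is a
+schematic of the general scheme, not the exact Semi-Mamba-UNet objective (which
+additionally uses pixel-level contrastive learning); the function name is ours:
+<pre>
+import torch
+import torch.nn.functional as F
+
+def cross_supervised_loss(logits_a, logits_b, labels=None):
+    """logits_*: (N, C, H, W) predictions from the two networks;
+    labels: (N, H, W) ground truth for the labelled subset, or None."""
+    # Each network learns from the other's detached hard pseudo labels.
+    pseudo_a = logits_a.argmax(dim=1).detach()
+    pseudo_b = logits_b.argmax(dim=1).detach()
+    loss = F.cross_entropy(logits_a, pseudo_b) + F.cross_entropy(logits_b, pseudo_a)
+    if labels is not None:
+        loss = loss + F.cross_entropy(logits_a, labels) + F.cross_entropy(logits_b, labels)
+    return loss
+</pre>
+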
+
+
+
+
+ + ♻ ☆ DiffAvatar: Simulation-Ready Garment Optimization with Differentiable + Simulation CVPR 2024 + + +
+ The realism of digital avatars is crucial in enabling telepresence
+applications with self-expression and customization. While physical simulations
+can produce realistic motions for clothed humans, they require high-quality
+garment assets with associated physical parameters for cloth simulations.
+However, manually creating these assets and calibrating their parameters is
+labor-intensive and requires specialized expertise. Current methods focus on
+reconstructing geometry, but don't generate complete assets for physics-based
+applications. To address this gap, we propose DiffAvatar, a novel approach that
+performs body and garment co-optimization using differentiable simulation. By
+integrating physical simulation into the optimization loop and accounting for
+the complex nonlinear behavior of cloth and its intricate interaction with the
+body, our framework recovers body and garment geometry and extracts important
+material parameters in a physically plausible way. Our experiments demonstrate
+that our approach generates realistic clothing and body shape suitable for
+downstream applications. We provide additional insights and results on our
+webpage: https://people.csail.mit.edu/liyifei/publication/diffavatar/
+
+
+ comment: CVPR 2024; Project page: + https://people.csail.mit.edu/liyifei/publication/diffavatar/ +
+
+
+
+
+ + ♻ ☆ Evaluating Text-to-Image Synthesis: Survey and Taxonomy of Image Quality + Metrics + + +
+ Recent advances in text-to-image synthesis enabled through a combination of +language and vision foundation models have led to a proliferation of the tools +available and an increased attention to the field. When conducting +text-to-image synthesis, a central goal is to ensure that the content between +text and image is aligned. As such, there exist numerous evaluation metrics +that aim to mimic human judgement. However, it is often unclear which metric to +use for evaluating text-to-image synthesis systems as their evaluation is +highly nuanced. In this work, we provide a comprehensive overview of existing +text-to-image evaluation metrics. Based on our findings, we propose a new +taxonomy for categorizing these metrics. Our taxonomy is grounded in the +assumption that there are two main quality criteria, namely compositionality +and generality, which ideally map to human preferences. Ultimately, we derive +guidelines for practitioners conducting text-to-image evaluation, discuss open +challenges of evaluation mechanisms, and surface limitations of current +metrics. + +
+
+ comment: preprint, 21 pages, 2 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Augmented Reality Warnings in Roadway Work Zones: Evaluating the Effect + of Modality on Worker Reaction Times + + +
+ Given the aging highway infrastructure requiring extensive rebuilding and
+enhancements, and the consequent rise in the number of work zones, there is an
+urgent need to develop advanced safety systems to protect workers. While
+Augmented Reality (AR) holds significant potential for delivering warnings to
+workers, its integration into roadway work zones remains relatively unexplored.
+The primary objective of this study is to improve safety measures within
+roadway work zones by conducting an extensive analysis of how different
+combinations of multimodal AR warnings influence the reaction times of workers.
+This paper addresses this gap through a series of experiments that aim to
+replicate the distinctive conditions of roadway work zones, both in real-world
+and virtual reality environments. Our approach comprises three key components:
+an advanced AR system prototype, a VR simulation of AR functionality within the
+work zone environment, and the Wizard of Oz technique to synchronize user
+experiences across experiments. To assess reaction times, we leverage both the
+simple reaction time (SRT) technique and an innovative vision-based metric that
+utilizes real-time pose estimation. By conducting five experiments in
+controlled outdoor work zones and indoor VR settings, our study provides
+valuable information on how various multimodal AR warnings impact workers'
+reaction times. Furthermore, our findings reveal the disparities in reaction
+times between VR simulations and real-world scenarios, thereby gauging VR's
+capability to mirror the dynamics of roadway work zones. Finally, our results
+substantiate the potential and reliability of vision-based reaction time
+measurements. These insights resonate well with those derived using the SRT
+technique, underscoring the viability of this approach for tangible real-world
+uses.
+
+
+
+
+
+ + ♻ ☆ Incorporating Geo-Diverse Knowledge into Prompting for Increased + Geographical Robustness in Object Recognition CVPR + + +
+ Existing object recognition models have been shown to lack robustness in +diverse geographical scenarios due to domain shifts in design and context. +Class representations need to be adapted to more accurately reflect an object +concept under these shifts. In the absence of training data from target +geographies, we hypothesize that geographically diverse descriptive knowledge +of categories can enhance robustness. For this purpose, we explore the +feasibility of probing a large language model for geography-based object +knowledge, and we examine the effects of integrating knowledge into zero-shot +and learnable soft prompting with CLIP. Within this exploration, we propose +geography knowledge regularization to ensure that soft prompts trained on a +source set of geographies generalize to an unseen target set. Accuracy gains +over prompting baselines on DollarStreet while training only on Europe data are +up to +2.8/1.2/1.6 on target data from Africa/Asia/Americas, and +4.6 overall +on the hardest classes. Competitive performance is shown vs. few-shot target +training, and analysis is provided to direct future study of geographical +robustness. + +
+
+ comment: To appear in IEEE/CVF Computer Vision and Pattern Recognition + Conference (CVPR), 2024 +
+
+
+
+
+ + ♻ ☆ Towards 3D Vision with Low-Cost Single-Photon Cameras + + +
+ We present a method for reconstructing 3D shape of arbitrary Lambertian +objects based on measurements by miniature, energy-efficient, low-cost +single-photon cameras. These cameras, operating as time resolved image sensors, +illuminate the scene with a very fast pulse of diffuse light and record the +shape of that pulse as it returns back from the scene at a high temporal +resolution. We propose to model this image formation process, account for its +non-idealities, and adapt neural rendering to reconstruct 3D geometry from a +set of spatially distributed sensors with known poses. We show that our +approach can successfully recover complex 3D shapes from simulated data. We +further demonstrate 3D object reconstruction from real-world captures, +utilizing measurements from a commodity proximity sensor. Our work draws a +connection between image-based modeling and active range scanning and is a step +towards 3D vision with single-photon cameras. + +
+
+
+
+
+ + ♻ ☆ SteinDreamer: Variance Reduction for Text-to-3D Score Distillation via + Stein Identity + + +
+ Score distillation has emerged as one of the most prevalent approaches for
+text-to-3D asset synthesis. Essentially, score distillation updates 3D
+parameters by lifting and back-propagating scores averaged over different
+views. In this paper, we reveal that the gradient estimation in score
+distillation inherently suffers from high variance. Through the lens of
+variance reduction, the effectiveness of SDS and VSD can be interpreted as
+applications of various control variates to the Monte Carlo estimator of the
+distilled score. Motivated by this rethinking and based on Stein's identity, we
+propose a more general solution to reduce variance for score distillation,
+termed Stein Score Distillation (SSD). SSD incorporates control variates
+constructed via Stein's identity, allowing for arbitrary baseline functions.
+This enables us to include flexible guidance priors and network architectures
+to explicitly optimize for variance reduction. In our experiments, the overall
+pipeline, dubbed SteinDreamer, is implemented by instantiating the control
+variate with a monocular depth estimator. The results suggest that SSD can
+effectively reduce the distillation variance and consistently improve visual
+quality for both object- and scene-level generation. Moreover, we demonstrate
+that SteinDreamer achieves faster convergence than existing methods due to more
+stable gradient updates.
+
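+
+ The variance-reduction view rests on the standard control-variate identity.
+Writing $f(x)$ for the per-view quantity being Monte Carlo averaged and $g(x)$
+for a baseline with known expectation (SSD constructs $g$ via Stein's identity
+so that its expectation vanishes by construction), a generic, paper-agnostic
+form is
+$$\hat{f}_{\mathrm{cv}}(x) = f(x) + c\,\bigl(g(x) - \mathbb{E}[g]\bigr), \qquad \mathbb{E}[\hat{f}_{\mathrm{cv}}] = \mathbb{E}[f],$$
+$$\operatorname{Var}[\hat{f}_{\mathrm{cv}}] = \operatorname{Var}[f] + c^{2}\operatorname{Var}[g] + 2c\,\operatorname{Cov}(f,g), \qquad c^{\star} = -\frac{\operatorname{Cov}(f,g)}{\operatorname{Var}[g]},$$
+so any baseline $g$ correlated with $f$ can shrink the variance of the
+distilled score without biasing it.
+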
+
+ comment: Project page: https://vita-group.github.io/SteinDreamer/ +
+
+
+
+
+ + ♻ ☆ Systematic comparison of semi-supervised and self-supervised learning + for medical image classification CVPR 2024 + + +
+ In typical medical image classification problems, labeled data is scarce
+while unlabeled data is more available. Semi-supervised learning and
+self-supervised learning are two different research directions that can improve
+accuracy by learning from extra unlabeled data. Recent methods from both
+directions have reported significant gains on traditional benchmarks. Yet past
+benchmarks do not focus on medical tasks and rarely compare self- and
+semi-supervised methods together on an equal footing. Furthermore, past
+benchmarks often handle hyperparameter tuning suboptimally. First, they may not
+tune hyperparameters at all, leading to underfitting. Second, when tuning does
+occur, it often unrealistically uses a labeled validation set that is much
+larger than the training set. Therefore, currently published rankings might not
+always reflect practical utility. This study contributes a systematic
+evaluation of self- and semi-supervised methods with a unified experimental
+protocol intended to guide a practitioner with scarce overall labeled data and
+a limited compute budget. We answer two key questions: Can hyperparameter
+tuning be effective with realistic-sized validation sets? If so, when all
+methods are tuned well, which self- or semi-supervised methods achieve the best
+accuracy? Our study compares 13 representative semi- and self-supervised
+methods to strong labeled-set-only baselines on 4 medical datasets. From 20000+
+GPU hours of computation, we provide valuable best practices to
+resource-constrained practitioners: hyperparameter tuning is effective, and the
+semi-supervised method known as MixMatch delivers the most reliable gains
+across 4 datasets.
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SplatFace: Gaussian Splat Face Reconstruction Leveraging an Optimizable + Surface + + +
+ We present SplatFace, a novel Gaussian splatting framework designed for 3D +human face reconstruction without reliance on accurate pre-determined geometry. +Our method is designed to simultaneously deliver both high-quality novel view +rendering and accurate 3D mesh reconstructions. We incorporate a generic 3D +Morphable Model (3DMM) to provide a surface geometric structure, making it +possible to reconstruct faces with a limited set of input images. We introduce +a joint optimization strategy that refines both the Gaussians and the morphable +surface through a synergistic non-rigid alignment process. A novel distance +metric, splat-to-surface, is proposed to improve alignment by considering both +the Gaussian position and covariance. The surface information is also utilized +to incorporate a world-space densification process, resulting in superior +reconstruction quality. Our experimental analysis demonstrates that the +proposed method is competitive with both other Gaussian splatting techniques in +novel view synthesis and other 3D reconstruction methods in producing 3D face +meshes with high geometric precision. + +
+
+
+
+
+ + ♻ ☆ Taming Mode Collapse in Score Distillation for Text-to-3D Generation + + +
+ Despite the remarkable performance of score distillation in text-to-3D +generation, such techniques notoriously suffer from view inconsistency issues, +also known as "Janus" artifact, where the generated objects fake each view with +multiple front faces. Although empirically effective methods have approached +this problem via score debiasing or prompt engineering, a more rigorous +perspective to explain and tackle this problem remains elusive. In this paper, +we reveal that the existing score distillation-based text-to-3D generation +frameworks degenerate to maximal likelihood seeking on each view independently +and thus suffer from the mode collapse problem, manifesting as the Janus +artifact in practice. To tame mode collapse, we improve score distillation by +re-establishing the entropy term in the corresponding variational objective, +which is applied to the distribution of rendered images. Maximizing the entropy +encourages diversity among different views in generated 3D assets, thereby +mitigating the Janus problem. Based on this new objective, we derive a new +update rule for 3D score distillation, dubbed Entropic Score Distillation +(ESD). We theoretically reveal that ESD can be simplified and implemented by +just adopting the classifier-free guidance trick upon variational score +distillation. Although embarrassingly straightforward, our extensive +experiments successfully demonstrate that ESD can be an effective treatment for +Janus artifacts in score distillation. + +
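+
+ The role of the entropy term can be seen from the generic decomposition of
+the variational objective, written here schematically for the distribution
+$q_\theta$ of rendered images and a target image distribution $p$ (the paper's
+precise objective and its classifier-free-guidance implementation are not
+reproduced):
+$$\mathrm{KL}\bigl(q_\theta \,\|\, p\bigr) = -\,\mathcal{H}(q_\theta) - \mathbb{E}_{x \sim q_\theta}\bigl[\log p(x)\bigr].$$
+Dropping the entropy term $-\mathcal{H}(q_\theta)$ leaves pure per-view
+likelihood maximization, i.e. mode seeking, whereas keeping it rewards
+diversity across rendered views, which is the mechanism ESD uses to mitigate
+the Janus artifact.
+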
+
+ comment: Project page: https://vita-group.github.io/3D-Mode-Collapse/ +
+
+
+
+
+ + ♻ ☆ UAV-Borne Mapping Algorithms for Low-Altitude and High-Speed Drone + Applications + + +
+ This article presents an analysis of current state-of-the-art sensors and how +these sensors work with several mapping algorithms for UAV (Unmanned Aerial +Vehicle) applications, focusing on low-altitude and high-speed scenarios. A new +experimental construct is created using highly realistic environments made +possible by integrating the AirSim simulator with Google 3D maps models using +the Cesium Tiles plugin. Experiments are conducted in this high-realism +simulated environment to evaluate the performance of three distinct mapping +algorithms: (1) Direct Sparse Odometry (DSO), (2) Stereo DSO (SDSO), and (3) +DSO Lite (DSOL). Experimental results evaluate algorithms based on their +measured geometric accuracy and computational speed. The results provide +valuable insights into the strengths and limitations of each algorithm. +Findings quantify compromises in UAV algorithm selection, allowing researchers +to find the mapping solution best suited to their application, which often +requires a compromise between computational performance and the density and +accuracy of geometric map estimates. Results indicate that for UAVs with +restrictive computing resources, DSOL is the best option. For systems with +payload capacity and modest compute resources, SDSO is the best option. If only +one camera is available, DSO is the option to choose for applications that +require dense mapping results. + +
+
+
+
+
+ + ♻ ☆ Point Cloud Mamba: Point Cloud Learning via State Space Model + + +
+ In this work, for the first time, we demonstrate that Mamba-based point cloud +methods can outperform point-based methods. Mamba exhibits strong global +modeling capabilities and linear computational complexity, making it highly +attractive for point cloud analysis. To enable more effective processing of 3-D +point cloud data by Mamba, we propose a novel Consistent Traverse Serialization +to convert point clouds into 1-D point sequences while ensuring that +neighboring points in the sequence are also spatially adjacent. Consistent +Traverse Serialization yields six variants by permuting the order of x, y, and +z coordinates, and the synergistic use of these variants aids Mamba in +comprehensively observing point cloud data. Furthermore, to assist Mamba in +handling point sequences with different orders more effectively, we introduce +point prompts to inform Mamba of the sequence's arrangement rules. Finally, we +propose positional encoding based on spatial coordinate mapping to inject +positional information into point cloud sequences better. Based on these +improvements, we construct a point cloud network named Point Cloud Mamba, which +combines local and global modeling. Point Cloud Mamba surpasses the SOTA +point-based method PointNeXt and achieves new SOTA performance on the +ScanObjectNN, ModelNet40, and ShapeNetPart datasets. + +
+
+
+
+
+ + ♻ ☆ Rethinking Multi-view Representation Learning via Distilled + Disentangling CVPR 2024 + + +
+ Multi-view representation learning aims to derive robust representations that +are both view-consistent and view-specific from diverse data sources. This +paper presents an in-depth analysis of existing approaches in this domain, +highlighting a commonly overlooked aspect: the redundancy between +view-consistent and view-specific representations. To this end, we propose an +innovative framework for multi-view representation learning, which incorporates +a technique we term 'distilled disentangling'. Our method introduces the +concept of masked cross-view prediction, enabling the extraction of compact, +high-quality view-consistent representations from various sources without +incurring extra computational overhead. Additionally, we develop a distilled +disentangling module that efficiently filters out consistency-related +information from multi-view representations, resulting in purer view-specific +representations. This approach significantly reduces redundancy between +view-consistent and view-specific representations, enhancing the overall +efficiency of the learning process. Our empirical evaluations reveal that +higher mask ratios substantially improve the quality of view-consistent +representations. Moreover, we find that reducing the dimensionality of +view-consistent representations relative to that of view-specific +representations further refines the quality of the combined representations. +Our code is accessible at: https://github.com/Guanzhou-Ke/MRDD. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ You Only Sample Once: Taming One-Step Text-To-Image Synthesis by + Self-Cooperative Diffusion GANs + + +
+ We introduce YOSO, a novel generative model designed for rapid, scalable, and
+high-fidelity one-step image synthesis. This is achieved by integrating the
+diffusion process with GANs. Specifically, we smooth the distribution by the
+denoising generator itself, performing self-cooperative learning. We show that
+our method can serve as a one-step generation model trained from scratch with
+competitive performance. Moreover, we show that our method can be extended to
+finetune pre-trained text-to-image diffusion models for high-quality one-step
+text-to-image synthesis, even with LoRA fine-tuning. In particular, we provide
+the first diffusion transformer that can generate images in one step, trained
+at 512 resolution, with the capability of adapting to 1024 resolution without
+explicit training. Our code is provided at https://github.com/Luo-Yihong/YOSO.
+
+
+ comment: Early version +
+
+
+
+
+ + ♻ ☆ V2X-DGW: Domain Generalization for Multi-agent Perception under Adverse + Weather Conditions + + +
+ Current LiDAR-based Vehicle-to-Everything (V2X) multi-agent perception
+systems have shown significant success in 3D object detection. While these
+models perform well in the clean weather they were trained on, they struggle in
+unseen adverse weather conditions due to the real-world domain gap. In this
+paper, we propose a domain generalization approach, named V2X-DGW, for
+LiDAR-based 3D object detection in multi-agent perception systems under adverse
+weather conditions. Our research aims to ensure favorable multi-agent
+performance not only in clean weather but also in unseen adverse weather
+conditions, by learning only from clean-weather data. To advance research in
+this area, we have simulated the impact of three prevalent adverse weather
+conditions on two widely-used multi-agent datasets, resulting in the creation
+of two novel benchmark datasets: OPV2V-w and V2XSet-w.
+ To this end, we first introduce the Adaptive Weather Augmentation (AWA) to
+mimic the unseen adverse weather conditions, and then propose two alignments
+for generalizable representation learning: Trust-region Weather-invariant
+Alignment (TWA) and Agent-aware Contrastive Alignment (ACA). Extensive
+experimental results demonstrate that our V2X-DGW achieved improvements in the
+unseen adverse weather conditions.
+
+
+
+
+
+ + ♻ ☆ Cross-modal tumor segmentation using generative blending augmentation + and self training + + +
+ Objectives: Data scarcity and domain shifts lead to biased training sets that
+do not accurately represent deployment conditions. A related practical problem
+is cross-modal image segmentation, where the objective is to segment unlabelled
+images using previously labelled datasets from other imaging modalities.
+Methods: We propose a cross-modal segmentation method based on conventional
+image synthesis boosted by a new data augmentation technique called Generative
+Blending Augmentation (GBA). GBA leverages a SinGAN model to learn
+representative generative features from a single training image to
+realistically diversify tumor appearances. This way, we compensate for image
+synthesis errors, subsequently improving the generalization power of a
+downstream segmentation model. The proposed augmentation is further combined
+with an iterative self-training procedure leveraging pseudo labels at each
+pass. Results: The proposed solution ranked first for vestibular schwannoma
+(VS) segmentation during the validation and test phases of the MICCAI CrossMoDA
+2022 challenge, with best mean Dice similarity and average symmetric surface
+distance measures. Conclusion and significance: Local contrast alteration of
+tumor appearances and iterative self-training with pseudo labels are likely to
+lead to performance improvements in a variety of segmentation contexts.
+
+
+
+
+
+ + ♻ ☆ VGTS: Visually Guided Text Spotting for Novel Categories in Historical + Manuscripts + + +
+ In the field of historical manuscript research, scholars frequently encounter +novel symbols in ancient texts, investing considerable effort in their +identification and documentation. Although existing object detection methods +achieve impressive performance on known categories, they struggle to recognize +novel symbols without retraining. To address this limitation, we propose a +Visually Guided Text Spotting (VGTS) approach that accurately spots novel +characters using just one annotated support sample. The core of VGTS is a +spatial alignment module consisting of a Dual Spatial Attention (DSA) block and +a Geometric Matching (GM) block. The DSA block aims to identify, focus on, and +learn discriminative spatial regions in the support and query images, mimicking +the human visual spotting process. It first refines the support image by +analyzing inter-channel relationships to identify critical areas, and then +refines the query image by focusing on informative key points. The GM block, on +the other hand, establishes the spatial correspondence between the two images, +enabling accurate localization of the target character in the query image. To +tackle the example imbalance problem in low-resource spotting tasks, we develop +a novel torus loss function that enhances the discriminative power of the +embedding space for distance metric learning. To further validate our approach, +we introduce a new dataset featuring ancient Dongba hieroglyphics (DBH) +associated with the Naxi minority of China. Extensive experiments on the DBH +dataset and other public datasets, including EGY, VML-HD, TKH, and NC, show +that VGTS consistently surpasses state-of-the-art methods. The proposed +framework exhibits great potential for application in historical manuscript +text spotting, enabling scholars to efficiently identify and document novel +symbols with minimal annotation effort. + +
+
+
+
+
+ + ♻ ☆ CustomListener: Text-guided Responsive Interaction for User-friendly + Listening Head Generation CVPR 2024 + + +
+ Listening head generation aims to synthesize a non-verbal responsive listener
+head by modeling the correlation between the speaker and the listener in
+dynamic conversation. The applications of listener agent generation in virtual
+interaction have promoted many works achieving diverse and fine-grained motion
+generation. However, they can only manipulate motions through simple emotional
+labels and cannot freely control the listener's motions. Since listener agents
+should have human-like attributes (e.g., identity, personality) that users can
+freely customize, this limitation reduces their realism. In this paper, we
+propose a user-friendly framework called CustomListener to realize listener
+generation guided by free-form text priors. To achieve speaker-listener
+coordination, we design a Static to Dynamic Portrait module (SDP), which
+interacts with speaker information to transform static text into a dynamic
+portrait token with completion rhythm and amplitude information. To achieve
+coherence between segments, we design a Past Guided Generation Module (PGG) to
+maintain the consistency of customized listener attributes through the motion
+prior, and utilize a diffusion-based structure conditioned on the portrait
+token and the motion prior to realize controllable generation. To train and
+evaluate our model, we have constructed two text-annotated listening head
+datasets based on ViCo and RealTalk, which provide text-video paired labels.
+Extensive experiments have verified the effectiveness of our model.
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Video Super-Resolution Transformer with Masked Inter&Intra-Frame + Attention CVPR 2024 + + +
+ Recently, Vision Transformer has achieved great success in recovering missing +details in low-resolution sequences, i.e., the video super-resolution (VSR) +task. Despite its superiority in VSR accuracy, the heavy computational burden +as well as the large memory footprint hinder the deployment of +Transformer-based VSR models on constrained devices. In this paper, we address +the above issue by proposing a novel feature-level masked processing framework: +VSR with Masked Intra and inter frame Attention (MIA-VSR). The core of MIA-VSR +is leveraging feature-level temporal continuity between adjacent frames to +reduce redundant computations and make more rational use of previously enhanced +SR features. Concretely, we propose an intra-frame and inter-frame attention +block which takes the respective roles of past features and input features into +consideration and only exploits previously enhanced features to provide +supplementary information. In addition, an adaptive block-wise mask prediction +module is developed to skip unimportant computations according to feature +similarity between adjacent frames. We conduct detailed ablation studies to +validate our contributions and compare the proposed method with recent +state-of-the-art VSR approaches. The experimental results demonstrate that +MIA-VSR improves the memory and computation efficiency over state-of-the-art +methods, without trading off PSNR accuracy. The code is available at +https://github.com/LabShuHangGU/MIA-VSR. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Gradient strikes back: How filtering out high frequencies improves + explanations + + +
+ Attribution methods are a class of explainability (XAI) methods that aim to
+assess how individual inputs contribute to a model's decision-making process.
+We have identified a significant limitation in one class of attribution
+methods, known as "white-box" methods. Although highly efficient, these methods
+rely on a gradient signal that is often contaminated by high-frequency noise.
+To overcome this limitation, we introduce a new approach called "FORGrad". This
+simple method effectively filters out noise artifacts by using optimal cut-off
+frequencies tailored to the unique characteristics of each model architecture.
+Our findings show that FORGrad consistently enhances the performance of
+existing white-box methods, enabling them to compete effectively with more
+accurate yet computationally demanding "black-box" methods. We anticipate that
+our research will foster broader adoption of simpler and more efficient
+white-box methods for explainability, offering a better balance between
+faithfulness and computational efficiency.
+
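+ For intuition, a minimal sketch of low-pass filtering a gradient saliency map
+in the Fourier domain is shown below; the relative cutoff and circular filter
+shape are assumptions, and FORGrad's model-specific optimal cutoffs are not
+reproduced here.
+```python
+import torch
+
+def lowpass_gradient(saliency, cutoff=0.1):
+    # saliency: (..., H, W) gradient-based attribution map.
+    # Zero out spatial frequencies above a relative cutoff and return the
+    # smoothed, noise-reduced explanation.
+    H, W = saliency.shape[-2:]
+    fy = torch.fft.fftfreq(H).abs().view(-1, 1)
+    fx = torch.fft.fftfreq(W).abs().view(1, -1)
+    keep = ((fy ** 2 + fx ** 2).sqrt() <= cutoff).to(saliency.dtype)
+    spectrum = torch.fft.fft2(saliency)
+    return torch.fft.ifft2(spectrum * keep).real
+```
+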
+
+
+
+
+ + ♻ ☆ MaxViT-UNet: Multi-Axis Attention for Medical Image Segmentation + + +
+ Since their emergence, Convolutional Neural Networks (CNNs) have made +significant strides in medical image analysis. However, the local nature of the +convolution operator may pose a limitation for capturing global and long-range +interactions in CNNs. Recently, Transformers have gained popularity in the +computer vision community and also in medical image segmentation due to their +ability to process global features effectively. The scalability issues of the +self-attention mechanism and lack of the CNN-like inductive bias may have +limited their adoption. Therefore, hybrid Vision transformers +(CNN-Transformer), exploiting the advantages of both Convolution and +Self-attention Mechanisms, have gained importance. In this work, we present +MaxViT-UNet, a new Encoder-Decoder based UNet type hybrid vision transformer +(CNN-Transformer) for medical image segmentation. The proposed Hybrid Decoder +is designed to harness the power of both the convolution and self-attention +mechanisms at each decoding stage with a nominal memory and computational +burden. The inclusion of multi-axis self-attention, within each decoder stage, +significantly enhances the discriminating capacity between the object and +background regions, thereby helping in improving the segmentation efficiency. +In the Hybrid Decoder, a new block is also proposed. The fusion process +commences by integrating the upsampled lower-level decoder features, obtained +through transpose convolution, with the skip-connection features derived from +the hybrid encoder. Subsequently, the fused features undergo refinement through +the utilization of a multi-axis attention mechanism. The proposed decoder block +is repeated multiple times to segment the nuclei regions progressively. +Experimental results on MoNuSeg18 and MoNuSAC20 datasets demonstrate the +effectiveness of the proposed technique. + +
+
+ comment: 19 pages, 6 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Audio-Visual Compound Expression Recognition Method based on Late + Modality Fusion and Rule-based Decision + + +
+ This paper presents the results of the SUN team for the Compound Expressions
+Recognition Challenge of the 6th ABAW Competition. We propose a novel
+audio-visual method for compound expression recognition. Our method relies on
+emotion recognition models that fuse modalities at the emotion probability
+level, while decisions regarding the prediction of compound expressions are
+based on predefined rules. Notably, our method does not use any training data
+specific to the target task, so the problem is treated as a zero-shot
+classification task. The method is evaluated in multi-corpus training and
+cross-corpus validation setups. Our proposed method achieves an F1-score of
+22.01% on the C-EXPR-DB test subset. Our findings from the challenge
+demonstrate that the proposed method can potentially form a basis for
+developing intelligent tools for annotating audio-visual data in the context of
+basic and compound human emotions.
+
+
+ comment: 7 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ MultiCorrupt: A Multi-Modal Robustness Dataset and Benchmark of + LiDAR-Camera Fusion for 3D Object Detection + + +
+ Multi-modal 3D object detection models for automated driving have +demonstrated exceptional performance on computer vision benchmarks like +nuScenes. However, their reliance on densely sampled LiDAR point clouds and +meticulously calibrated sensor arrays poses challenges for real-world +applications. Issues such as sensor misalignment, miscalibration, and disparate +sampling frequencies lead to spatial and temporal misalignment in data from +LiDAR and cameras. Additionally, the integrity of LiDAR and camera data is +often compromised by adverse environmental conditions such as inclement +weather, leading to occlusions and noise interference. To address this +challenge, we introduce MultiCorrupt, a comprehensive benchmark designed to +evaluate the robustness of multi-modal 3D object detectors against ten distinct +types of corruptions. We evaluate five state-of-the-art multi-modal detectors +on MultiCorrupt and analyze their performance in terms of their resistance +ability. Our results show that existing methods exhibit varying degrees of +robustness depending on the type of corruption and their fusion strategy. We +provide insights into which multi-modal design choices make such models robust +against certain perturbations. The dataset generation code and benchmark are +open-sourced at https://github.com/ika-rwth-aachen/MultiCorrupt. + +
+
+ comment: Code: https://github.com/ika-rwth-aachen/MultiCorrupt +
+
+
+
+
+ + ♻ ☆ Single-Model and Any-Modality for Video Object Tracking CVPR2024 + + +
+ In the realm of video object tracking, auxiliary modalities such as depth, +thermal, or event data have emerged as valuable assets to complement the RGB +trackers. In practice, most existing RGB trackers learn a single set of +parameters to use them across datasets and applications. However, a similar +single-model unification for multi-modality tracking presents several +challenges. These challenges stem from the inherent heterogeneity of inputs -- +each with modality-specific representations, the scarcity of multi-modal +datasets, and the absence of all the modalities at all times. In this work, we +introduce Un-Track, a Unified Tracker of a single set of parameters for any +modality. To handle any modality, our method learns their common latent space +through low-rank factorization and reconstruction techniques. More importantly, +we use only the RGB-X pairs to learn the common latent space. This unique +shared representation seamlessly binds all modalities together, enabling +effective unification and accommodating any missing modality, all within a +single transformer-based architecture. Our Un-Track achieves +8.1 absolute +F-score gain, on the DepthTrack dataset, by introducing only +2.14 (over 21.50) +GFLOPs with +6.6M (over 93M) parameters, through a simple yet efficient +prompting strategy. Extensive comparisons on five benchmark datasets with +different modalities show that Un-Track surpasses both SOTA unified trackers +and modality-specific counterparts, validating our effectiveness and +practicality. The source code is publicly available at +https://github.com/Zongwei97/UnTrack. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ DXAI: Explaining Classification by Image Decomposition + + +
+ We propose a new way to explain and visualize neural network classification
+through decomposition-based explainable AI (DXAI). Instead of providing an
+explanation heatmap, our method yields a decomposition of the image into
+class-agnostic and class-distinct parts, with respect to the data and chosen
+classifier. Following the fundamental signal processing paradigm of analysis
+and synthesis, the original image is the sum of the decomposed parts. We thus
+obtain a radically different way of explaining classification. The
+class-agnostic part is ideally composed of all image features that do not
+possess class information, while the class-distinct part is its complement.
+This new visualization can be more helpful and informative in certain
+scenarios, especially when the attributes are dense, global and additive in
+nature, for instance, when colors or textures are essential for class
+distinction. Code is available at https://github.com/dxai2024/dxai.
+
+
+
+
+
+ + ♻ ☆ SPOT: Self-Training with Patch-Order Permutation for Object-Centric + Learning with Autoregressive Transformers CVPR 2024 + + +
+ Unsupervised object-centric learning aims to decompose scenes into +interpretable object entities, termed slots. Slot-based auto-encoders stand out +as a prominent method for this task. Within them, crucial aspects include +guiding the encoder to generate object-specific slots and ensuring the decoder +utilizes them during reconstruction. This work introduces two novel techniques, +(i) an attention-based self-training approach, which distills superior +slot-based attention masks from the decoder to the encoder, enhancing object +segmentation, and (ii) an innovative patch-order permutation strategy for +autoregressive transformers that strengthens the role of slot vectors in +reconstruction. The effectiveness of these strategies is showcased +experimentally. The combined approach significantly surpasses prior slot-based +autoencoder methods in unsupervised object segmentation, especially with +complex real-world images. We provide the implementation code at +https://github.com/gkakogeorgiou/spot . + +
+
+ comment: CVPR 2024. Code: https://github.com/gkakogeorgiou/spot +
+
+
+
+
+ + ♻ ☆ GD^2-NeRF: Generative Detail Compensation via GAN and Diffusion for + One-shot Generalizable Neural Radiance Fields + + +
+ In this paper, we focus on the One-shot Novel View Synthesis (O-NVS) task,
+which targets synthesizing photo-realistic novel views given only one reference
+image per scene. Previous One-shot Generalizable Neural Radiance Fields
+(OG-NeRF) methods solve this task in an inference-time finetuning-free manner,
+yet suffer from blurry results due to an encoder-only architecture that relies
+heavily on the limited reference image. On the other hand, recent
+diffusion-based image-to-3D methods show vivid, plausible results by distilling
+pre-trained 2D diffusion models into a 3D representation, yet require tedious
+per-scene optimization. Targeting these issues, we propose GD$^2$-NeRF, a
+Generative Detail compensation framework via GAN and Diffusion that is
+inference-time finetuning-free and produces vivid, plausible details. In
+detail, following a coarse-to-fine strategy, GD$^2$-NeRF is mainly composed of
+a One-stage Parallel Pipeline (OPP) and a 3D-consistent Detail Enhancer
+(Diff3DE). At the coarse stage, OPP first efficiently inserts the GAN model
+into the existing OG-NeRF pipeline to primarily relieve the blurriness with
+in-distribution priors captured from the training dataset, achieving a good
+balance between sharpness (LPIPS, FID) and fidelity (PSNR, SSIM). Then, at the
+fine stage, Diff3DE further leverages pre-trained image diffusion models to
+complement rich out-of-distribution details while maintaining decent 3D
+consistency. Extensive experiments on both synthetic and real-world datasets
+show that GD$^2$-NeRF noticeably improves the details without any per-scene
+finetuning.
+
+
+ comment: Submitted to Journal +
+
+
+
+
+ + ♻ ☆ Deep Equilibrium Diffusion Restoration with Parallel Sampling CVPR'2024 + + +
+ Diffusion model-based image restoration (IR) aims to use diffusion models to
+recover high-quality (HQ) images from degraded images, achieving promising
+performance. Due to the inherent property of diffusion models, most existing
+methods need long serial sampling chains to restore HQ images step-by-step,
+resulting in expensive sampling time and high computation costs. Moreover, such
+long sampling chains hinder understanding of the relationship between inputs
+and restoration results, since it is hard to compute gradients through the
+whole chain. In this work, we rethink diffusion model-based IR through a
+different perspective, i.e., a deep equilibrium (DEQ) fixed point system,
+called DeqIR. Specifically, we derive an analytical solution by modeling the
+entire sampling chain in these IR models as a joint multivariate fixed point
+system. Based on the analytical solution, we can conduct parallel sampling and
+restore HQ images without training. Furthermore, we compute fast gradients via
+DEQ inversion and find that initialization optimization can boost image quality
+and control the generation direction. Extensive experiments on benchmarks
+demonstrate the effectiveness of our method on typical IR tasks and real-world
+settings.
+
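+ To make the fixed-point view concrete, here is a hedged sketch of updating all
+states of a restoration chain jointly instead of serially; step_fn, the update
+order, and the iteration count are assumptions used only to illustrate the idea
+of treating the whole chain as one multivariate fixed-point system.
+```python
+import torch
+
+def parallel_chain_fixed_point(step_fn, y, x_init, T, iters=50):
+    # step_fn(x_prev, y, t): one assumed restoration/denoising step at index t,
+    # conditioned on the degraded observation y.
+    xs = [x_init.clone() for _ in range(T)]           # states x_1..x_T, kept jointly
+    for _ in range(iters):
+        new_xs = [step_fn(x_init, y, 0)]              # first state depends on the init
+        new_xs += [step_fn(xs[t - 1], y, t) for t in range(1, T)]
+        xs = new_xs                                   # one Jacobi-style sweep over the chain
+    return xs[-1]                                     # final restored estimate
+```
+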
+
+ comment: CVPR'2024 +
+
+
+
+
+ + ♻ ☆ Integrating Language-Derived Appearance Elements with Visual Cues in + Pedestrian Detection + + +
+ Large language models (LLMs) have shown their capability to understand
+contextual and semantic information about instance appearances. In this paper,
+we introduce a novel approach that utilizes the strengths of LLMs in
+understanding contextual appearance variations and transfers this knowledge to
+a vision model (here, pedestrian detection). While pedestrian detection is
+considered one of the crucial tasks directly related to our safety (e.g.,
+intelligent driving systems), it is challenging because of varying appearances
+and poses in diverse scenes. Therefore, we propose to formulate
+language-derived appearance elements and incorporate them with visual cues in
+pedestrian detection. To this end, we establish a description corpus that
+includes numerous narratives describing various appearances of pedestrians and
+other instances. By feeding them through an LLM, we extract appearance
+knowledge sets that contain the representations of appearance variations.
+Subsequently, we perform a task-prompting process to obtain appearance
+elements, i.e., representative appearance knowledge guided toward the
+downstream pedestrian detection task. The obtained knowledge elements are
+adaptable to various detection frameworks, so we can provide plentiful
+appearance information by integrating the language-derived appearance elements
+with visual cues within a detector. Through comprehensive experiments with
+various pedestrian detectors, we verify the adaptability and effectiveness of
+our method, showing noticeable performance gains and achieving state-of-the-art
+detection performance on two public pedestrian detection benchmarks (i.e.,
+CrowdHuman and WiderPedestrian).
+
+
+ comment: 11 pages, 5 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Feature 3DGS: Supercharging 3D Gaussian Splatting to Enable Distilled + Feature Fields + + +
+ 3D scene representations have gained immense popularity in recent years. +Methods that use Neural Radiance fields are versatile for traditional tasks +such as novel view synthesis. In recent times, some work has emerged that aims +to extend the functionality of NeRF beyond view synthesis, for semantically +aware tasks such as editing and segmentation using 3D feature field +distillation from 2D foundation models. However, these methods have two major +limitations: (a) they are limited by the rendering speed of NeRF pipelines, and +(b) implicitly represented feature fields suffer from continuity artifacts +reducing feature quality. Recently, 3D Gaussian Splatting has shown +state-of-the-art performance on real-time radiance field rendering. In this +work, we go one step further: in addition to radiance field rendering, we +enable 3D Gaussian splatting on arbitrary-dimension semantic features via 2D +foundation model distillation. This translation is not straightforward: naively +incorporating feature fields in the 3DGS framework encounters significant +challenges, notably the disparities in spatial resolution and channel +consistency between RGB images and feature maps. We propose architectural and +training changes to efficiently avert this problem. Our proposed method is +general, and our experiments showcase novel view semantic segmentation, +language-guided editing and segment anything through learning feature fields +from state-of-the-art 2D foundation models such as SAM and CLIP-LSeg. Across +experiments, our distillation method is able to provide comparable or better +results, while being significantly faster to both train and render. +Additionally, to the best of our knowledge, we are the first method to enable +point and bounding-box prompting for radiance field manipulation, by leveraging +the SAM model. Project website at: https://feature-3dgs.github.io/ + +
+
+
+
+
+ + ♻ ☆ Distribution-Aware Continual Test-Time Adaptation for Semantic + Segmentation + + +
+ Since autonomous driving systems usually face dynamic and ever-changing +environments, continual test-time adaptation (CTTA) has been proposed as a +strategy for transferring deployed models to continually changing target +domains. However, the pursuit of long-term adaptation often introduces +catastrophic forgetting and error accumulation problems, which impede the +practical implementation of CTTA in the real world. Recently, existing CTTA +methods mainly focus on utilizing a majority of parameters to fit target domain +knowledge through self-training. Unfortunately, these approaches often amplify +the challenge of error accumulation due to noisy pseudo-labels, and pose +practical limitations stemming from the heavy computational costs associated +with entire model updates. In this paper, we propose a distribution-aware +tuning (DAT) method to make the semantic segmentation CTTA efficient and +practical in real-world applications. DAT adaptively selects and updates two +small groups of trainable parameters based on data distribution during the +continual adaptation process, including domain-specific parameters (DSP) and +task-relevant parameters (TRP). Specifically, DSP exhibits sensitivity to +outputs with substantial distribution shifts, effectively mitigating the +problem of error accumulation. In contrast, TRP are allocated to positions that +are responsive to outputs with minor distribution shifts, which are fine-tuned +to avoid the catastrophic forgetting problem. In addition, since CTTA is a +temporal task, we introduce the Parameter Accumulation Update (PAU) strategy to +collect the updated DSP and TRP in target domain sequences. We conduct +extensive experiments on two widely-used semantic segmentation CTTA benchmarks, +achieving promising performance compared to previous state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ Strong Transferable Adversarial Attacks via Ensembled Asymptotically + Normal Distribution Learning + + +
+ Strong adversarial examples are crucial for evaluating and enhancing the
+robustness of deep neural networks. However, the performance of popular attacks
+is usually sensitive, for instance, to minor image transformations, stemming
+from limited information -- typically only one input example, a handful of
+white-box source models, and undefined defense strategies. Hence, the crafted
+adversarial examples are prone to overfit the source model, which hampers their
+transferability to unknown architectures. In this paper, we propose an approach
+named Multiple Asymptotically Normal Distribution Attacks (MultiANDA), which
+explicitly characterizes adversarial perturbations from a learned distribution.
+Specifically, we approximate the posterior distribution over the perturbations
+by taking advantage of the asymptotic normality property of stochastic gradient
+ascent (SGA), and then employ the deep ensemble strategy as an effective proxy
+for Bayesian marginalization in this process, aiming to estimate a mixture of
+Gaussians that facilitates a more thorough exploration of the potential
+optimization space. The approximated posterior essentially describes the
+stationary distribution of SGA iterations, which captures the geometric
+information around the local optimum. Thus, MultiANDA allows drawing an
+unlimited number of adversarial perturbations for each input and reliably
+maintains their transferability. Through extensive experiments on seven
+normally trained and seven defense models, our proposed method outperforms ten
+state-of-the-art black-box attacks on deep learning models with or without
+defenses.
+
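+ As a simplified illustration of sampling perturbations from a learned
+distribution, the sketch below fits a single diagonal Gaussian to the SGA
+iterates of one attack run and draws fresh perturbations from it; the paper's
+mixture-of-Gaussians construction via deep ensembles is not reproduced, and the
+names and constants here are assumptions.
+```python
+import torch
+
+def sample_perturbations(perturb_iterates, n_samples=8, eps=8 / 255):
+    # perturb_iterates: list of perturbation tensors collected along late SGA steps.
+    stack = torch.stack(perturb_iterates)             # (K, *shape)
+    mean, std = stack.mean(0), stack.std(0) + 1e-12   # diagonal Gaussian estimate
+    noise = torch.randn(n_samples, *mean.shape)
+    return (mean + std * noise).clamp(-eps, eps)      # stay inside the attack budget
+```
+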
+
+
+
+
+ + ♻ ☆ DreamGaussian: Generative Gaussian Splatting for Efficient 3D Content + Creation + + +
+ Recent advances in 3D content creation mostly leverage optimization-based 3D +generation via score distillation sampling (SDS). Though promising results have +been exhibited, these methods often suffer from slow per-sample optimization, +limiting their practical usage. In this paper, we propose DreamGaussian, a +novel 3D content generation framework that achieves both efficiency and quality +simultaneously. Our key insight is to design a generative 3D Gaussian Splatting +model with companioned mesh extraction and texture refinement in UV space. In +contrast to the occupancy pruning used in Neural Radiance Fields, we +demonstrate that the progressive densification of 3D Gaussians converges +significantly faster for 3D generative tasks. To further enhance the texture +quality and facilitate downstream applications, we introduce an efficient +algorithm to convert 3D Gaussians into textured meshes and apply a fine-tuning +stage to refine the details. Extensive experiments demonstrate the superior +efficiency and competitive generation quality of our proposed approach. +Notably, DreamGaussian produces high-quality textured meshes in just 2 minutes +from a single-view image, achieving approximately 10 times acceleration +compared to existing methods. + +
+
+ comment: Camera-ready version. Project page: https://dreamgaussian.github.io/ +
+
+
+
+
+ + ♻ ☆ SEGIC: Unleashing the Emergent Correspondence for In-Context + Segmentation + + +
+ In-context segmentation aims at segmenting novel images using a few labeled +example images, termed as "in-context examples", exploring content similarities +between examples and the target. The resulting models can be generalized +seamlessly to novel segmentation tasks, significantly reducing the labeling and +training costs compared with conventional pipelines. However, in-context +segmentation is more challenging than classic ones requiring the model to learn +segmentation rules conditioned on a few samples. Unlike previous work with +ad-hoc or non-end-to-end designs, we propose SEGIC, an end-to-end +segment-in-context framework built upon a single vision foundation model (VFM). +In particular, SEGIC leverages the emergent correspondence within VFM to +capture dense relationships between target images and in-context samples. As +such, information from in-context samples is then extracted into three types of +instructions, i.e. geometric, visual, and meta instructions, serving as +explicit conditions for the final mask prediction. SEGIC is a straightforward +yet effective approach that yields state-of-the-art performance on one-shot +segmentation benchmarks. Notably, SEGIC can be easily generalized to diverse +tasks, including video object segmentation and open-vocabulary segmentation. +Code will be available at https://github.com/MengLcool/SEGIC. + +
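+ As a rough sketch of how a vision foundation model's emergent correspondence
+can propagate a mask from an in-context example to a target image, a minimal
+feature-matching step might look like the code below; it is an assumption-laden
+simplification, not SEGIC's full geometric/visual/meta instruction pipeline.
+```python
+import torch
+import torch.nn.functional as F
+
+def propagate_mask(feat_query, feat_support, mask_support, temperature=0.07):
+    # feat_query: (D, Hq, Wq), feat_support: (D, Hs, Ws), mask_support: (Hs, Ws).
+    q = F.normalize(feat_query.flatten(1).T, dim=-1)     # (Hq*Wq, D)
+    s = F.normalize(feat_support.flatten(1).T, dim=-1)   # (Hs*Ws, D)
+    attn = torch.softmax(q @ s.T / temperature, dim=-1)  # dense correspondence weights
+    coarse = attn @ mask_support.flatten().float()       # per-pixel foreground score
+    return coarse.view(feat_query.shape[1], feat_query.shape[2])
+```
+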
+
+
+
+
+
+ ♻ ☆ PLGSLAM: Progressive Neural Scene Representation with Local to Global
+ Bundle Adjustment CVPR 2024
+
+
+
+ Neural implicit scene representations have recently shown encouraging results +in dense visual SLAM. However, existing methods produce low-quality scene +reconstruction and low-accuracy localization performance when scaling up to +large indoor scenes and long sequences. These limitations are mainly due to +their single, global radiance field with finite capacity, which does not adapt +to large scenarios. Their end-to-end pose networks are also not robust enough +with the growth of cumulative errors in large scenes. To this end, we introduce +PLGSLAM, a neural visual SLAM system capable of high-fidelity surface +reconstruction and robust camera tracking in real-time. To handle large-scale +indoor scenes, PLGSLAM proposes a progressive scene representation method which +dynamically allocates new local scene representation trained with frames within +a local sliding window. This allows us to scale up to larger indoor scenes and +improves robustness (even under pose drifts). In local scene representation, +PLGSLAM utilizes tri-planes for local high-frequency features with multi-layer +perceptron (MLP) networks for the low-frequency feature, achieving smoothness +and scene completion in unobserved areas. Moreover, we propose local-to-global +bundle adjustment method with a global keyframe database to address the +increased pose drifts on long sequences. Experimental results demonstrate that +PLGSLAM achieves state-of-the-art scene reconstruction results and tracking +performance across various datasets and scenarios (both in small and +large-scale indoor environments). + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ mPLUG-Owl: Modularization Empowers Large Language Models with + Multimodality + + +
+ Large language models (LLMs) have demonstrated impressive zero-shot abilities +on a variety of open-ended tasks, while recent research has also explored the +use of LLMs for multi-modal generation. In this study, we introduce mPLUG-Owl, +a novel training paradigm that equips LLMs with multi-modal abilities through +modularized learning of foundation LLM, a visual knowledge module, and a visual +abstractor module. This approach can support multiple modalities and facilitate +diverse unimodal and multimodal abilities through modality collaboration. The +training paradigm of mPLUG-Owl involves a two-stage method for aligning image +and text, which learns visual knowledge with the assistance of LLM while +maintaining and even improving the generation abilities of LLM. In the first +stage, the visual knowledge module and abstractor module are trained with a +frozen LLM module to align the image and text. In the second stage, +language-only and multi-modal supervised datasets are used to jointly fine-tune +a low-rank adaption (LoRA) module on LLM and the abstractor module by freezing +the visual knowledge module. We carefully build a visually-related instruction +evaluation set OwlEval. Experimental results show that our model outperforms +existing multi-modal models, demonstrating mPLUG-Owl's impressive instruction +and visual understanding ability, multi-turn conversation ability, and +knowledge reasoning ability. Besides, we observe some unexpected and exciting +abilities such as multi-image correlation and scene text understanding, which +makes it possible to leverage it for harder real scenarios, such as vision-only +document comprehension. Our code, pre-trained model, instruction-tuned models, +and evaluation set are available at https://github.com/X-PLUG/mPLUG-Owl. The +online demo is available at https://www.modelscope.cn/studios/damo/mPLUG-Owl. + +
+
+ comment: Working in Process +
+
+
+
+
+ + ♻ ☆ SNE-RoadSegV2: Advancing Heterogeneous Feature Fusion and Fallibility + Awareness for Freespace Detection + + +
+ Feature-fusion networks with duplex encoders have proven to be an effective +technique to solve the freespace detection problem. However, despite the +compelling results achieved by previous research efforts, the exploration of +adequate and discriminative heterogeneous feature fusion, as well as the +development of fallibility-aware loss functions remains relatively scarce. This +paper makes several significant contributions to address these limitations: (1) +It presents a novel heterogeneous feature fusion block, comprising a holistic +attention module, a heterogeneous feature contrast descriptor, and an +affinity-weighted feature recalibrator, enabling a more in-depth exploitation +of the inherent characteristics of the extracted features, (2) it incorporates +both inter-scale and intra-scale skip connections into the decoder architecture +while eliminating redundant ones, leading to both improved accuracy and +computational efficiency, and (3) it introduces two fallibility-aware loss +functions that separately focus on semantic-transition and depth-inconsistent +regions, collectively contributing to greater supervision during model +training. Our proposed heterogeneous feature fusion network (SNE-RoadSegV2), +which incorporates all these innovative components, demonstrates superior +performance in comparison to all other freespace detection algorithms across +multiple public datasets. Notably, it ranks the 1st on the official KITTI Road +benchmark. + +
+
+
+
+
+ + ♻ ☆ Gamba: Marry Gaussian Splatting with Mamba for single view 3D + reconstruction + + +
+ We tackle the challenge of efficiently reconstructing a 3D asset from a +single image with growing demands for automated 3D content creation pipelines. +Previous methods primarily rely on Score Distillation Sampling (SDS) and Neural +Radiance Fields (NeRF). Despite their significant success, these approaches +encounter practical limitations due to lengthy optimization and considerable +memory usage. In this report, we introduce Gamba, an end-to-end amortized 3D +reconstruction model from single-view images, emphasizing two main insights: +(1) 3D representation: leveraging a large number of 3D Gaussians for an +efficient 3D Gaussian splatting process; (2) Backbone design: introducing a +Mamba-based sequential network that facilitates context-dependent reasoning and +linear scalability with the sequence (token) length, accommodating a +substantial number of Gaussians. Gamba incorporates significant advancements in +data preprocessing, regularization design, and training methodologies. We +assessed Gamba against existing optimization-based and feed-forward 3D +generation approaches using the real-world scanned OmniObject3D dataset. Here, +Gamba demonstrates competitive generation capabilities, both qualitatively and +quantitatively, while achieving remarkable speed, approximately 0.6 second on a +single NVIDIA A100 GPU. + +
+
+
+
+
+ + ♻ ☆ Dr.Hair: Reconstructing Scalp-Connected Hair Strands without + Pre-training via Differentiable Rendering of Line Segments CVPR 2024 + + +
+ In the film and gaming industries, achieving a realistic hair appearance +typically involves the use of strands originating from the scalp. However, +reconstructing these strands from observed surface images of hair presents +significant challenges. The difficulty in acquiring Ground Truth (GT) data has +led state-of-the-art learning-based methods to rely on pre-training with +manually prepared synthetic CG data. This process is not only labor-intensive +and costly but also introduces complications due to the domain gap when +compared to real-world data. In this study, we propose an optimization-based +approach that eliminates the need for pre-training. Our method represents hair +strands as line segments growing from the scalp and optimizes them using a +novel differentiable rendering algorithm. To robustly optimize a substantial +number of slender explicit geometries, we introduce 3D orientation estimation +utilizing global optimization, strand initialization based on Laplace's +equation, and reparameterization that leverages geometric connectivity and +spatial proximity. Unlike existing optimization-based methods, our method is +capable of reconstructing internal hair flow in an absolute direction. Our +method exhibits robust and accurate inverse rendering, surpassing the quality +of existing methods and significantly improving processing speed. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ CLOVA: A Closed-Loop Visual Assistant with Tool Usage and Update CVPR 2024 + + +
+ Utilizing large language models (LLMs) to compose off-the-shelf visual tools +represents a promising avenue of research for developing robust visual +assistants capable of addressing diverse visual tasks. However, these methods +often overlook the potential for continual learning, typically by freezing the +utilized tools, thus limiting their adaptation to environments requiring new +knowledge. To tackle this challenge, we propose CLOVA, a Closed-Loop Visual +Assistant, which operates within a framework encompassing inference, +reflection, and learning phases. During the inference phase, LLMs generate +programs and execute corresponding tools to complete assigned tasks. In the +reflection phase, a multimodal global-local reflection scheme analyzes human +feedback to determine which tools require updating. Lastly, the learning phase +employs three flexible approaches to automatically gather training data and +introduces a novel prompt tuning scheme to update the tools, allowing CLOVA to +efficiently acquire new knowledge. Experimental findings demonstrate that CLOVA +surpasses existing tool-usage methods by 5% in visual question answering and +multiple-image reasoning, by 10% in knowledge tagging, and by 20% in image +editing. These results underscore the significance of the continual learning +capability in general visual assistants. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Cell Variational Information Bottleneck Network + + +
+ In this work, we propose the Cell Variational Information Bottleneck Network
+(cellVIB), a convolutional neural network using the information bottleneck
+mechanism, which can be combined with the latest feedforward network
+architectures and trained end-to-end. Our Cell Variational Information
+Bottleneck Network is constructed by stacking VIB cells, which generate feature
+maps with uncertainty. As layers go deeper, the regularization effect gradually
+increases, instead of directly adding excessive regularization constraints to
+the output layer of the model as in Deep VIB. In each VIB cell, the feedforward
+process learns an independent mean term and a standard deviation term, and
+predicts a Gaussian distribution based on them. The feedback process relies on
+the reparameterization trick for effective training. This work performs an
+extensive analysis on the MNIST dataset to verify the effectiveness of each VIB
+cell, and provides an insightful analysis of how the VIB cells affect mutual
+information. Experiments conducted on CIFAR-10 also show that our cellVIB is
+robust against noisy labels during training and against corrupted images during
+testing. We then validate our method on the PACS dataset, whose results show
+that the VIB cells can significantly improve the generalization performance of
+the base model. Finally, on a more complex representation learning task, face
+recognition, our network structure also achieves very competitive results.
+
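+ The mean/standard-deviation prediction with the reparameterization trick
+described above can be sketched as follows; the layer sizes and KL weighting
+are illustrative choices, not the paper's exact cell.
+```python
+import torch
+import torch.nn as nn
+
+class VIBCell(nn.Module):
+    # A conv block predicts a per-location mean and log-variance, and the
+    # output feature map is sampled with the reparameterization trick.
+    def __init__(self, channels):
+        super().__init__()
+        self.mu = nn.Conv2d(channels, channels, 3, padding=1)
+        self.logvar = nn.Conv2d(channels, channels, 3, padding=1)
+
+    def forward(self, x):
+        mu, logvar = self.mu(x), self.logvar(x)
+        z = mu + torch.exp(0.5 * logvar) * torch.randn_like(mu)       # reparameterization
+        kl = 0.5 * (mu.pow(2) + logvar.exp() - 1.0 - logvar).mean()   # KL to N(0, I)
+        return z, kl   # add kl (scaled) to the task loss as the bottleneck penalty
+```
+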
+
+ comment: Found errors in the article, therefore postponing publication for now +
+
+
+
+
+ + ♻ ☆ Improving Generalization via Meta-Learning on Hard Samples CVPR 2024 + + +
+ Learned reweighting (LRW) approaches to supervised learning use an +optimization criterion to assign weights for training instances, in order to +maximize performance on a representative validation dataset. We pose and +formalize the problem of optimized selection of the validation set used in LRW +training, to improve classifier generalization. In particular, we show that +using hard-to-classify instances in the validation set has both a theoretical +connection to, and strong empirical evidence of generalization. We provide an +efficient algorithm for training this meta-optimized model, as well as a simple +train-twice heuristic for careful comparative study. We demonstrate that LRW +with easy validation data performs consistently worse than LRW with hard +validation data, establishing the validity of our meta-optimization problem. +Our proposed algorithm outperforms a wide range of baselines on a range of +datasets and domain shift challenges (Imagenet-1K, CIFAR-100, Clothing-1M, +CAMELYON, WILDS, etc.), with ~1% gains using VIT-B on Imagenet. We also show +that using naturally hard examples for validation (Imagenet-R / Imagenet-A) in +LRW training for Imagenet improves performance on both clean and naturally hard +test instances by 1-2%. Secondary analyses show that using hard validation data +in an LRW framework improves margins on test data, hinting at the mechanism +underlying our empirical gains. We believe this work opens up new research +directions for the meta-optimization of meta-learning in a supervised learning +context. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ DyBluRF: Dynamic Deblurring Neural Radiance Fields for Blurry Monocular + Video + + +
+ Neural Radiance Fields (NeRF), initially developed for static scenes, have +inspired many video novel view synthesis techniques. However, the challenge for +video view synthesis arises from motion blur, a consequence of object or camera +movement during exposure, which hinders the precise synthesis of sharp +spatio-temporal views. In response, we propose a novel dynamic deblurring NeRF +framework for blurry monocular video, called DyBluRF, consisting of a Base Ray +Initialization (BRI) stage and a Motion Decomposition-based Deblurring (MDD) +stage. Our DyBluRF is the first that handles the novel view synthesis for +blurry monocular video with a novel two-stage framework. In the BRI stage, we +coarsely reconstruct dynamic 3D scenes and jointly initialize the base ray, +which is further used to predict latent sharp rays, using the inaccurate camera +pose information from the given blurry frames. In the MDD stage, we introduce a +novel Incremental Latent Sharp-rays Prediction (ILSP) approach for the blurry +monocular video frames by decomposing the latent sharp rays into global camera +motion and local object motion components. We further propose two loss +functions for effective geometry regularization and decomposition of static and +dynamic scene components without any mask supervision. Experiments show that +DyBluRF outperforms qualitatively and quantitatively the SOTA methods. + +
+
+ comment: The first two authors contributed equally to this work (equal + contribution). The last two authors advised equally to this work. Please + visit our project page at https://kaist-viclab.github.io/dyblurf-site/ +
+
+
+
+
+ + ♻ ☆ Elysium: Exploring Object-level Perception in Videos via MLLM + + +
+ Multi-modal Large Language Models (MLLMs) have demonstrated their ability to
+perceive objects in still images, but their application in video-related
+tasks, such as object tracking, remains understudied. This lack of exploration
+is primarily due to two key challenges. Firstly, extensive pretraining on
+large-scale video datasets is required to equip MLLMs with the capability to
+perceive objects across multiple frames and understand inter-frame
+relationships. Secondly, processing a large number of frames within the context
+window of Large Language Models (LLMs) can impose a significant computational
+burden. To address the first challenge, we introduce ElysiumTrack-1M, a
+large-scale video dataset supporting three tasks: Single Object Tracking (SOT),
+Referring Single Object Tracking (RSOT), and Video Referring Expression
+Generation (Video-REG). ElysiumTrack-1M contains 1.27 million annotated video
+frames with corresponding object boxes and descriptions. Leveraging this
+dataset, we train MLLMs and propose a token-compression model, T-Selector, to
+tackle the second challenge. Our proposed approach, Elysium: Exploring
+Object-level Perception in Videos via MLLM, is an end-to-end trainable MLLM
+that attempts to conduct object-level tasks in videos without requiring any
+additional plug-in or expert models. All code and datasets are available at
+https://github.com/Hon-Wong/Elysium.
+
+
+
+
+
+ + ♻ ☆ TransNeXt: Robust Foveal Visual Perception for Vision Transformers CVPR 2024 + + +
+ Due to the depth degradation effect in residual connections, many efficient +Vision Transformers models that rely on stacking layers for information +exchange often fail to form sufficient information mixing, leading to unnatural +visual perception. To address this issue, in this paper, we propose Aggregated +Attention, a biomimetic design-based token mixer that simulates biological +foveal vision and continuous eye movement while enabling each token on the +feature map to have a global perception. Furthermore, we incorporate learnable +tokens that interact with conventional queries and keys, which further +diversifies the generation of affinity matrices beyond merely relying on the +similarity between queries and keys. Our approach does not rely on stacking for +information exchange, thus effectively avoiding depth degradation and achieving +natural visual perception. Additionally, we propose Convolutional GLU, a +channel mixer that bridges the gap between GLU and SE mechanism, which empowers +each token to have channel attention based on its nearest neighbor image +features, enhancing local modeling capability and model robustness. We combine +aggregated attention and convolutional GLU to create a new visual backbone +called TransNeXt. Extensive experiments demonstrate that our TransNeXt achieves +state-of-the-art performance across multiple model sizes. At a resolution of +$224^2$, TransNeXt-Tiny attains an ImageNet accuracy of 84.0%, surpassing +ConvNeXt-B with 69% fewer parameters. Our TransNeXt-Base achieves an ImageNet +accuracy of 86.2% and an ImageNet-A accuracy of 61.6% at a resolution of +$384^2$, a COCO object detection mAP of 57.1, and an ADE20K semantic +segmentation mIoU of 54.7. + +
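+ A minimal sketch of a Convolutional GLU-style channel mixer, in the spirit of
+the description above, is shown below: one branch is gated by a depthwise
+convolved (nearest-neighbor) view of the input, bridging GLU gating and SE-like
+channel attention. The layer sizes and exact arrangement are assumptions, not
+the published TransNeXt block.
+```python
+import torch
+import torch.nn as nn
+
+class ConvGLU(nn.Module):
+    def __init__(self, dim, hidden):
+        super().__init__()
+        self.fc_value = nn.Conv2d(dim, hidden, 1)
+        self.fc_gate = nn.Conv2d(dim, hidden, 1)
+        self.dwconv = nn.Conv2d(hidden, hidden, 3, padding=1, groups=hidden)
+        self.proj = nn.Conv2d(hidden, dim, 1)
+
+    def forward(self, x):                                   # x: (B, C, H, W)
+        gate = torch.sigmoid(self.dwconv(self.fc_gate(x)))  # gate from local neighborhood
+        return self.proj(self.fc_value(x) * gate)           # gated value branch
+```
+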
+
+ comment: CVPR 2024 Camera-ready Version. Project Page: + https://github.com/DaiShiResearch/TransNeXt +
+
+
+
+
+ + ♻ ☆ GAvatar: Animatable 3D Gaussian Avatars with Implicit Mesh Learning CVPR 2024 + + +
+ Gaussian splatting has emerged as a powerful 3D representation that harnesses +the advantages of both explicit (mesh) and implicit (NeRF) 3D representations. +In this paper, we seek to leverage Gaussian splatting to generate realistic +animatable avatars from textual descriptions, addressing the limitations (e.g., +flexibility and efficiency) imposed by mesh or NeRF-based representations. +However, a naive application of Gaussian splatting cannot generate high-quality +animatable avatars and suffers from learning instability; it also cannot +capture fine avatar geometries and often leads to degenerate body parts. To +tackle these problems, we first propose a primitive-based 3D Gaussian +representation where Gaussians are defined inside pose-driven primitives to +facilitate animation. Second, to stabilize and amortize the learning of +millions of Gaussians, we propose to use neural implicit fields to predict the +Gaussian attributes (e.g., colors). Finally, to capture fine avatar geometries +and extract detailed meshes, we propose a novel SDF-based implicit mesh +learning approach for 3D Gaussians that regularizes the underlying geometries +and extracts highly detailed textured meshes. Our proposed method, GAvatar, +enables the large-scale generation of diverse animatable avatars using only +text prompts. GAvatar significantly surpasses existing methods in terms of both +appearance and geometry quality, and achieves extremely fast rendering (100 +fps) at 1K resolution. + +
+
+ comment: CVPR 2024. Project website: https://nvlabs.github.io/GAvatar +
+
+
+
+
+ + ♻ ☆ P-MapNet: Far-seeing Map Generator Enhanced by both SDMap and HDMap + Priors + + +
+ Autonomous vehicles are gradually entering city roads today, with the help of +high-definition maps (HDMaps). However, the reliance on HDMaps prevents +autonomous vehicles from stepping into regions without this expensive digital +infrastructure. This fact drives many researchers to study online HDMap +generation algorithms, but the performance of these algorithms at far regions +is still unsatisfying. We present P-MapNet, in which the letter P highlights +the fact that we focus on incorporating map priors to improve model +performance. Specifically, we exploit priors in both SDMap and HDMap. On one +hand, we extract weakly aligned SDMap from OpenStreetMap, and encode it as an +additional conditioning branch. Despite the misalignment challenge, our +attention-based architecture adaptively attends to relevant SDMap skeletons and +significantly improves performance. On the other hand, we exploit a masked +autoencoder to capture the prior distribution of HDMap, which can serve as a +refinement module to mitigate occlusions and artifacts. We benchmark on the +nuScenes and Argoverse2 datasets. Through comprehensive experiments, we show +that: (1) our SDMap prior can improve online map generation performance, using +both rasterized (by up to $+18.73$ $\rm mIoU$) and vectorized (by up to $+8.50$ +$\rm mAP$) output representations. (2) our HDMap prior can improve map +perceptual metrics by up to $6.34\%$. (3) P-MapNet can be switched into +different inference modes that covers different regions of the +accuracy-efficiency trade-off landscape. (4) P-MapNet is a far-seeing solution +that brings larger improvements on longer ranges. Codes and models are publicly +available at https://jike5.github.io/P-MapNet. + +
+
+ comment: Code: https://jike5.github.io/P-MapNet +
+
+
+
+
+ + ♻ ☆ CPPF++: Uncertainty-Aware Sim2Real Object Pose Estimation by Vote + Aggregation + + +
+ Object pose estimation constitutes a critical area within the domain of 3D +vision. While contemporary state-of-the-art methods that leverage real-world +pose annotations have demonstrated commendable performance, the procurement of +such real training data incurs substantial costs. This paper focuses on a +specific setting wherein only 3D CAD models are utilized as a priori knowledge, +devoid of any background or clutter information. We introduce a novel method, +CPPF++, designed for sim-to-real pose estimation. This method builds upon the +foundational point-pair voting scheme of CPPF, reformulating it through a +probabilistic view. To address the challenge posed by vote collision, we +propose a novel approach that involves modeling the voting uncertainty by +estimating the probabilistic distribution of each point pair within the +canonical space. Furthermore, we augment the contextual information provided by +each voting unit through the introduction of N-point tuples. To enhance the +robustness and accuracy of the model, we incorporate several innovative +modules, including noisy pair filtering, online alignment optimization, and a +tuple feature ensemble. Alongside these methodological advancements, we +introduce a new category-level pose estimation dataset, named DiversePose 300. +Empirical evidence demonstrates that our method significantly surpasses +previous sim-to-real approaches and achieves comparable or superior performance +on novel datasets. Our code is available on https://github.com/qq456cvb/CPPF2. + +
+
+
+
+
+ + ♻ ☆ Embodied Multi-Modal Agent trained by an LLM from a Parallel TextWorld CVPR 2024 + + +
+ While large language models (LLMs) excel in a simulated world of texts, they +struggle to interact with the more realistic world without perceptions of other +modalities such as visual or audio signals. Although vision-language models +(VLMs) integrate LLM modules (1) aligned with static image features, and (2) +may possess prior knowledge of world dynamics (as demonstrated in the text +world), they have not been trained in an embodied visual world and thus cannot +align with its dynamics. On the other hand, training an embodied agent in a +noisy visual world without expert guidance is often challenging and +inefficient. In this paper, we train a VLM agent living in a visual world using +an LLM agent excelling in a parallel text world. Specifically, we distill LLM's +reflection outcomes (improved actions by analyzing mistakes) in a text world's +tasks to finetune the VLM on the same tasks of the visual world, resulting in +an Embodied Multi-Modal Agent (EMMA) quickly adapting to the visual world +dynamics. Such cross-modality imitation learning between the two parallel +worlds is achieved by a novel DAgger-DPO algorithm, enabling EMMA to generalize +to a broad scope of new tasks without any further guidance from the LLM expert. +Extensive evaluations on the ALFWorld benchmark's diverse tasks highlight +EMMA's superior performance to SOTA VLM-based agents, e.g., 20%-70% improvement +in the success rate. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Interpreting CLIP's Image Representation via Text-Based Decomposition + + +
+ We investigate the CLIP image encoder by analyzing how individual model +components affect the final representation. We decompose the image +representation as a sum across individual image patches, model layers, and +attention heads, and use CLIP's text representation to interpret the summands. +Interpreting the attention heads, we characterize each head's role by +automatically finding text representations that span its output space, which +reveals property-specific roles for many heads (e.g. location or shape). Next, +interpreting the image patches, we uncover an emergent spatial localization +within CLIP. Finally, we use this understanding to remove spurious features +from CLIP and to create a strong zero-shot image segmenter. Our results +indicate that a scalable understanding of transformer models is attainable and +can be used to repair and improve models. + +
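+ To illustrate the text-based interpretation of per-head contributions, a
+hedged sketch is given below; it assumes the per-head contributions to the
+final image embedding have already been extracted via hooks (not shown), and
+simply scores their alignment against a bank of text embeddings.
+```python
+import torch
+import torch.nn.functional as F
+
+def head_text_alignment(head_contribs, text_embeds):
+    # head_contribs: (L, H, D) contribution of each layer/head to the image embedding.
+    # text_embeds:   (T, D) CLIP text embeddings of candidate descriptions.
+    heads = F.normalize(head_contribs.flatten(0, 1), dim=-1)  # (L*H, D)
+    texts = F.normalize(text_embeds, dim=-1)                  # (T, D)
+    return heads @ texts.T   # (L*H, T): which descriptions each head aligns with
+```
+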
+
+ comment: Project page and code: + https://yossigandelsman.github.io/clip_decomposition/ +
+
+
+
+
+ + ♻ ☆ MMCert: Provable Defense against Adversarial Attacks to Multi-modal + Models CVPR'24 + + +
+ Different from a unimodal model whose input is from a single modality, the +input (called multi-modal input) of a multi-modal model is from multiple +modalities such as image, 3D points, audio, text, etc. Similar to unimodal +models, many existing studies show that a multi-modal model is also vulnerable +to adversarial perturbation, where an attacker could add small perturbation to +all modalities of a multi-modal input such that the multi-modal model makes +incorrect predictions for it. Existing certified defenses are mostly designed +for unimodal models, which achieve sub-optimal certified robustness guarantees +when extended to multi-modal models as shown in our experimental results. In +our work, we propose MMCert, the first certified defense against adversarial +attacks to a multi-modal model. We derive a lower bound on the performance of +our MMCert under arbitrary adversarial attacks with bounded perturbations to +both modalities (e.g., in the context of auto-driving, we bound the number of +changed pixels in both RGB image and depth image). We evaluate our MMCert using +two benchmark datasets: one for the multi-modal road segmentation task and the +other for the multi-modal emotion recognition task. Moreover, we compare our +MMCert with a state-of-the-art certified defense extended from unimodal models. +Our experimental results show that our MMCert outperforms the baseline. + +
+
+ comment: To appear in CVPR'24 +
+
+
+
+
+ + ♻ ☆ Emergent Open-Vocabulary Semantic Segmentation from Off-the-shelf + Vision-Language Models CVPR 2024 + + +
+ From image-text pairs, large-scale vision-language models (VLMs) learn to
+implicitly associate image regions with words, which proves effective for
+tasks like visual question answering. However, leveraging the learned
+association for open-vocabulary semantic segmentation remains a challenge. In
+this paper, we propose a simple, yet extremely effective, training-free
+technique, Plug-and-Play Open-Vocabulary Semantic Segmentation (PnP-OVSS), for
+this task. PnP-OVSS leverages a VLM with direct text-to-image cross-attention
+and an image-text matching loss. To balance between over-segmentation and
+under-segmentation, we introduce Salience Dropout; by iteratively dropping
+patches that the model is most attentive to, we are able to better resolve the
+entire extent of the segmentation mask. PnP-OVSS does not require any neural
+network training and performs hyperparameter tuning without the need for any
+segmentation annotations, even for a validation set. PnP-OVSS demonstrates
+substantial improvements over comparable baselines (+29.4% mIoU on Pascal VOC,
++13.2% mIoU on Pascal Context, +14.0% mIoU on MS COCO, and +11.4% mIoU on
+ADE-20K) and even outperforms most baselines that conduct additional network
+training on top of pretrained VLMs. Our codebase is at
+https://github.com/letitiabanana/PnP-OVSS.
+
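+ A hedged sketch of the iterative dropping idea behind Salience Dropout is
+shown below; get_attention is an assumed callable returning per-patch
+cross-attention scores for the currently kept patches, and the round count and
+drop fraction are illustrative, not the paper's settings.
+```python
+import torch
+
+def salience_dropout(get_attention, num_patches, rounds=3, drop_frac=0.2):
+    # Accumulate attention over several rounds, each time removing the
+    # most-attended patches so later rounds cover the rest of the object.
+    keep = torch.ones(num_patches, dtype=torch.bool)
+    accumulated = torch.zeros(num_patches)
+    for _ in range(rounds):
+        attn = get_attention(keep) * keep              # ignore already-dropped patches
+        accumulated = torch.maximum(accumulated, attn)
+        k = int(drop_frac * int(keep.sum()))
+        keep[torch.topk(attn, k).indices] = False      # drop the most salient patches
+    return accumulated                                 # thresholded later into a mask
+```
+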
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Hybrid Video Diffusion Models with 2D Triplane and 3D Wavelet + Representation + + +
+ Generating high-quality videos that synthesize desired realistic content is a
+challenging task due to the intricate high dimensionality and complexity of
+videos. Several recent diffusion-based methods have shown comparable
+performance by compressing videos to a lower-dimensional latent space using
+traditional video autoencoder architectures. However, such methods, which
+employ standard frame-wise 2D and 3D convolutions, fail to fully exploit the
+spatio-temporal nature of videos. To address this issue, we propose a novel
+hybrid video diffusion model, called HVDM, which can capture spatio-temporal
+dependencies more effectively. HVDM is trained with a hybrid video autoencoder
+that extracts a disentangled representation of the video, including: (i) global
+context information captured by a 2D projected latent, (ii) local volume
+information captured by 3D convolutions with wavelet decomposition, and (iii)
+frequency information for improving the video reconstruction. Based on this
+disentangled representation, our hybrid autoencoder provides a more
+comprehensive video latent, enriching the generated videos with fine structures
+and details. Experiments on video generation benchmarks (UCF101, SkyTimelapse,
+and TaiChi) demonstrate that the proposed approach achieves state-of-the-art
+video generation quality and supports a wide range of video applications (e.g.,
+long video generation, image-to-video, and video dynamics control).
+
+
+ comment: 17 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Predicting Gradient is Better: Exploring Self-Supervised Learning for + SAR ATR with a Joint-Embedding Predictive Architecture + + +
+ The growing Synthetic Aperture Radar (SAR) data has the potential to build a +foundation model through Self-Supervised Learning (SSL) methods, which can +achieve various SAR Automatic Target Recognition (ATR) tasks with pre-training +in large-scale unlabeled data and fine-tuning in small labeled samples. SSL +aims to construct supervision signals directly from the data, which minimizes +the need for expensive expert annotation and maximizes the use of the expanding +data pool for a foundational model. This study investigates an effective SSL +method for SAR ATR, which can pave the way for a foundation model in SAR ATR. +The primary obstacles faced in SSL for SAR ATR are the small targets in remote +sensing and speckle noise in SAR images, corresponding to the SSL approach and +signals. To overcome these challenges, we present a novel Joint-Embedding +Predictive Architecture for SAR ATR (SAR-JEPA), which leverages local masked +patches to predict the multi-scale SAR gradient representations of unseen +context. The key aspect of SAR-JEPA is integrating SAR domain features to +ensure high-quality self-supervised signals as target features. Besides, we +employ local masks and multi-scale features to accommodate the various small +targets in remote sensing. By fine-tuning and evaluating our framework on three +target recognition datasets (vehicle, ship, and aircraft) with four other +datasets as pre-training, we demonstrate its outperformance over other SSL +methods and its effectiveness with increasing SAR data. This study showcases +the potential of SSL for SAR target recognition across diverse targets, scenes, +and sensors. + +
+
+ comment: Our codes at https://github.com/waterdisappear/SAR-JEPA +
+
+
+
+
+ + ♻ ☆ GTA: A Geometry-Aware Attention Mechanism for Multi-View Transformers ICLR 2024 + + +
+ As transformers are equivariant to the permutation of input tokens, encoding +the positional information of tokens is necessary for many tasks. However, +since existing positional encoding schemes have been initially designed for NLP +tasks, their suitability for vision tasks, which typically exhibit different +structural properties in their data, is questionable. We argue that existing +positional encoding schemes are suboptimal for 3D vision tasks, as they do not +respect their underlying 3D geometric structure. Based on this hypothesis, we +propose a geometry-aware attention mechanism that encodes the geometric +structure of tokens as relative transformation determined by the geometric +relationship between queries and key-value pairs. By evaluating on multiple +novel view synthesis (NVS) datasets in the sparse wide-baseline multi-view +setting, we show that our attention, called Geometric Transform Attention +(GTA), improves learning efficiency and performance of state-of-the-art +transformer-based NVS models without any additional learned parameters and only +minor computational overhead. + +
+
+ comment: Published as a conference paper at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ MedPromptX: Grounded Multimodal Prompting for Chest X-ray Diagnosis + + +
+ Chest X-ray images are commonly used for predicting acute and chronic +cardiopulmonary conditions, but efforts to integrate them with structured +clinical data face challenges due to incomplete electronic health records +(EHR). This paper introduces MedPromptX, the first model to integrate +multimodal large language models (MLLMs), few-shot prompting (FP) and visual +grounding (VG) to combine imagery with EHR data for chest X-ray diagnosis. A +pre-trained MLLM is utilized to complement the missing EHR information, +providing a comprehensive understanding of patients' medical history. +Additionally, FP reduces the necessity for extensive training of MLLMs while +effectively tackling the issue of hallucination. Nevertheless, the process of +determining the optimal number of few-shot examples and selecting high-quality +candidates can be burdensome, yet it profoundly influences model performance. +Hence, we propose a new technique that dynamically refines few-shot data for +real-time adjustment to new patient scenarios. Moreover, VG aids in focusing +the model's attention on relevant regions of interest in X-ray images, +enhancing the identification of abnormalities. We release MedPromptX-VQA, a new +in-context visual question answering dataset encompassing interleaved image and +EHR data derived from MIMIC-IV and MIMIC-CXR databases. Results demonstrate the +SOTA performance of MedPromptX, achieving an 11% improvement in F1-score +compared to the baselines. Code and data are available at +https://github.com/BioMedIA-MBZUAI/MedPromptX + +
+
+
+
+
+ + ♻ ☆ Multi-Label Classification of Thoracic Diseases using Dense + Convolutional Network on Chest Radiographs + + +
+ Traditional methods of identifying pathologies in X-ray images rely heavily
+on skilled human interpretation and are often time-consuming. The advent of
+deep learning techniques has enabled the development of automated disease
+diagnosis systems. Still, such systems are often opaque to end users and
+limited to detecting a single pathology. In this paper, we propose a
+multi-label disease prediction model that allows the detection of more than one
+pathology at test time. We use a dense convolutional neural network (DenseNet)
+for disease diagnosis. Our proposed model achieved the highest AUC score of
+0.896 for the condition Cardiomegaly with an accuracy of 0.826, while the
+lowest AUC score was obtained for Nodule, at 0.655 with an accuracy of 0.66. To
+build trust in decision-making, we generated heatmaps on X-rays to visualize
+the regions the model attended to when making its predictions. Our proposed
+automated disease prediction model achieves high performance on the multi-label
+disease prediction task.
+
+
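+ As a rough, generic sketch of the kind of multi-label DenseNet classifier
+described above (not the authors' code; the 14-class setup and all names are
+assumptions), a minimal PyTorch version could look like this:
+
+    import torch
+    import torch.nn as nn
+    from torchvision import models
+
+    NUM_PATHOLOGIES = 14  # assumed label count (ChestX-ray14-style labels)
+
+    class MultiLabelDenseNet(nn.Module):
+        """DenseNet-121 backbone with one output logit per pathology."""
+        def __init__(self, num_labels: int = NUM_PATHOLOGIES):
+            super().__init__()
+            self.backbone = models.densenet121(weights=None)
+            in_features = self.backbone.classifier.in_features
+            # Replace the single-label head with a multi-label head.
+            self.backbone.classifier = nn.Linear(in_features, num_labels)
+
+        def forward(self, x: torch.Tensor) -> torch.Tensor:
+            return self.backbone(x)  # raw logits; apply sigmoid at inference
+
+    model = MultiLabelDenseNet()
+    criterion = nn.BCEWithLogitsLoss()       # independent binary task per label
+    images = torch.randn(4, 3, 224, 224)     # dummy batch of radiographs
+    targets = torch.randint(0, 2, (4, NUM_PATHOLOGIES)).float()
+    loss = criterion(model(images), targets)
+    probs = torch.sigmoid(model(images))     # per-pathology probabilities
+
+ Using a sigmoid per class rather than a softmax over classes is what lets
+several pathologies be flagged on the same radiograph.
+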
+
+ comment: 13 pages +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 214 + +
+
+
+ + ☆ GaussianCube: Structuring Gaussian Splatting using Optimal Transport for + 3D Generative Modeling + + +
+ 3D Gaussian Splatting (GS) has achieved considerable improvement over Neural
+Radiance Fields in terms of 3D fitting fidelity and rendering speed. However,
+this unstructured representation with scattered Gaussians poses a significant
+challenge for generative modeling. To address the problem, we introduce
+GaussianCube, a structured GS representation that is both powerful and
+efficient for generative modeling. We achieve this by first proposing a
+modified densification-constrained GS fitting algorithm which can yield
+high-quality fitting results using a fixed number of free Gaussians, and then
+re-arranging the Gaussians into a predefined voxel grid via Optimal Transport.
+The structured grid representation allows us to use a standard 3D U-Net as our
+backbone in diffusion generative modeling without elaborate designs. Extensive
+experiments conducted on ShapeNet and OmniObject3D show that our model achieves
+state-of-the-art generation results both qualitatively and quantitatively,
+underscoring the potential of GaussianCube as a powerful and versatile 3D
+representation.
+
+
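+ The step of re-arranging a fixed number of Gaussians onto a predefined voxel
+grid can be pictured as a discrete assignment problem. The sketch below is our
+own illustration of that idea under assumed shapes; it is not the paper's
+implementation, which may use a different transport formulation:
+
+    import numpy as np
+    from scipy.optimize import linear_sum_assignment
+
+    def assign_gaussians_to_voxels(centers: np.ndarray, grid_res: int = 8) -> np.ndarray:
+        """Map N Gaussian centers to the N cells of a grid_res^3 voxel grid by
+        minimizing total squared distance (Hungarian assignment)."""
+        axis = (np.arange(grid_res) + 0.5) / grid_res          # voxel centers in [0, 1]
+        gx, gy, gz = np.meshgrid(axis, axis, axis, indexing="ij")
+        voxel_centers = np.stack([gx, gy, gz], axis=-1).reshape(-1, 3)
+        assert centers.shape[0] == voxel_centers.shape[0], "fixed Gaussian count required"
+        cost = ((centers[:, None, :] - voxel_centers[None, :, :]) ** 2).sum(-1)
+        rows, cols = linear_sum_assignment(cost)               # optimal bijection
+        return cols[np.argsort(rows)]                          # voxel index per Gaussian
+
+    rng = np.random.default_rng(0)
+    toy_centers = rng.uniform(0.0, 1.0, size=(8 ** 3, 3))      # 512 toy Gaussian centers
+    voxel_of_gaussian = assign_gaussians_to_voxels(toy_centers)
+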
+
+ comment: Project Page: https://gaussiancube.github.io/ +
+
+
+
+
+ + ☆ RSMamba: Remote Sensing Image Classification with State Space Model + + +
+ Remote sensing image classification forms the foundation of various +understanding tasks, serving a crucial function in remote sensing image +interpretation. The recent advancements of Convolutional Neural Networks (CNNs) +and Transformers have markedly enhanced classification accuracy. Nonetheless, +remote sensing scene classification remains a significant challenge, especially +given the complexity and diversity of remote sensing scenarios and the +variability of spatiotemporal resolutions. The capacity for whole-image +understanding can provide more precise semantic cues for scene discrimination. +In this paper, we introduce RSMamba, a novel architecture for remote sensing +image classification. RSMamba is based on the State Space Model (SSM) and +incorporates an efficient, hardware-aware design known as the Mamba. It +integrates the advantages of both a global receptive field and linear modeling +complexity. To overcome the limitation of the vanilla Mamba, which can only +model causal sequences and is not adaptable to two-dimensional image data, we +propose a dynamic multi-path activation mechanism to augment Mamba's capacity +to model non-causal data. Notably, RSMamba maintains the inherent modeling +mechanism of the vanilla Mamba, yet exhibits superior performance across +multiple remote sensing image classification datasets. This indicates that +RSMamba holds significant potential to function as the backbone of future +visual foundation models. The code will be available at +\url{https://github.com/KyanChen/RSMamba}. + +
+
+
+
+
+ + ☆ Detecting Image Attribution for Text-to-Image Diffusion Models in RGB + and Beyond + + +
+ Modern text-to-image (T2I) diffusion models can generate images with
+remarkable realism and creativity. These advancements have sparked research in
+fake image detection and attribution, yet prior studies have not fully explored
+the practical and scientific dimensions of this task. In addition to
+attributing images to 12 state-of-the-art T2I generators, we provide extensive
+analyses on which inference-stage hyperparameters and image modifications are
+discernible. Our experiments reveal that initialization seeds are highly
+detectable, along with other subtle variations in the image generation process,
+to some extent. We further investigate what visual traces are leveraged in
+image attribution by perturbing high-frequency details and employing mid-level
+representations of image style and structure. Notably, altering high-frequency
+information causes only slight reductions in accuracy, and training an
+attributor on style representations outperforms training on RGB images. Our
+analyses underscore that fake images are detectable and attributable at more
+levels of visual granularity than previously explored.
+
+
+
+ comment: Code available at https://github.com/k8xu/ImageAttribution +
+
+
+
+
+ + ☆ InterDreamer: Zero-Shot Text to 3D Dynamic Human-Object Interaction + + +
+ Text-conditioned human motion generation has experienced significant +advancements with diffusion models trained on extensive motion capture data and +corresponding textual annotations. However, extending such success to 3D +dynamic human-object interaction (HOI) generation faces notable challenges, +primarily due to the lack of large-scale interaction data and comprehensive +descriptions that align with these interactions. This paper takes the +initiative and showcases the potential of generating human-object interactions +without direct training on text-interaction pair data. Our key insight in +achieving this is that interaction semantics and dynamics can be decoupled. +Being unable to learn interaction semantics through supervised training, we +instead leverage pre-trained large models, synergizing knowledge from a large +language model and a text-to-motion model. While such knowledge offers +high-level control over interaction semantics, it cannot grasp the intricacies +of low-level interaction dynamics. To overcome this issue, we further introduce +a world model designed to comprehend simple physics, modeling how human actions +influence object motion. By integrating these components, our novel framework, +InterDreamer, is able to generate text-aligned 3D HOI sequences in a zero-shot +manner. We apply InterDreamer to the BEHAVE and CHAIRS datasets, and our +comprehensive experimental analysis demonstrates its capability to generate +realistic and coherent interaction sequences that seamlessly align with the +text directives. + +
+
+ comment: Project Page: https://sirui-xu.github.io/InterDreamer/ +
+
+
+
+
+ + ☆ MagicLens: Self-Supervised Image Retrieval with Open-Ended Instructions + + +
+ Image retrieval, i.e., finding desired images given a reference image,
+inherently encompasses rich, multi-faceted search intents that are difficult to
+capture solely using image-based measures. Recent work leverages text
+instructions to allow users to more freely express their search intents.
+However, existing work primarily focuses on image pairs that are visually
+similar and/or can be characterized by a small set of pre-defined relations.
+The core thesis of this paper is that text instructions can enable retrieving
+images with richer relations beyond visual similarity. To show this, we
+introduce MagicLens, a series of self-supervised image retrieval models that
+support open-ended instructions. MagicLens is built on a key novel insight:
+image pairs that naturally occur on the same web pages contain a wide range of
+implicit relations (e.g., inside view of), and we can make those implicit
+relations explicit by synthesizing instructions via large multimodal models
+(LMMs) and large language models (LLMs). Trained on 36.7M (query image,
+instruction, target image) triplets with rich semantic relations mined from the
+web, MagicLens achieves comparable or better results on eight benchmarks of
+various image retrieval tasks than prior state-of-the-art (SOTA) methods.
+Remarkably, it outperforms the previous SOTA with a 50X smaller model size on
+multiple benchmarks. Additional human analyses on a 1.4M-image unseen corpus
+further demonstrate the diversity of search intents supported by MagicLens.
+
+
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ GraspXL: Generating Grasping Motions for Diverse Objects at Scale + + +
+ Human hands possess the dexterity to interact with diverse objects such as +grasping specific parts of the objects and/or approaching them from desired +directions. More importantly, humans can grasp objects of any shape without +object-specific skills. Recent works synthesize grasping motions following +single objectives such as a desired approach heading direction or a grasping +area. Moreover, they usually rely on expensive 3D hand-object data during +training and inference, which limits their capability to synthesize grasping +motions for unseen objects at scale. In this paper, we unify the generation of +hand-object grasping motions across multiple motion objectives, diverse object +shapes and dexterous hand morphologies in a policy learning framework GraspXL. +The objectives are composed of the graspable area, heading direction during +approach, wrist rotation, and hand position. Without requiring any 3D +hand-object interaction data, our policy trained with 58 objects can robustly +synthesize diverse grasping motions for more than 500k unseen objects with a +success rate of 82.2%. At the same time, the policy adheres to objectives, +which enables the generation of diverse grasps per object. Moreover, we show +that our framework can be deployed to different dexterous hands and work with +reconstructed or generated objects. We quantitatively and qualitatively +evaluate our method to show the efficacy of our approach. Our model and code +will be available. + +
+
+ comment: Project Page: https://eth-ait.github.io/graspxl/ +
+
+
+
+
+ + ☆ Change-Agent: Towards Interactive Comprehensive Change Interpretation + and Analysis from Change Detection and Change Captioning + + +
+ Monitoring changes in the Earth's surface is crucial for understanding
+natural processes and human impacts, necessitating precise and comprehensive
+interpretation methodologies. Remote sensing satellite imagery offers a unique
+perspective for monitoring these changes, leading to the emergence of remote
+sensing image change interpretation (RSICI) as a significant research focus.
+Current RSICI technology encompasses change detection and change captioning,
+each with its limitations in providing comprehensive interpretation. To address
+this, we propose an interactive Change-Agent which integrates a multi-level
+change interpretation (MCI) model as its eyes and a large language model (LLM)
+as its brain. Our Change-Agent can follow user instructions to achieve
+comprehensive change interpretation and insightful analysis, such as change
+detection, change captioning, change object counting, change cause analysis,
+etc. Our proposed MCI model contains two branches of pixel-level change
+detection and semantic-level change captioning, in which multiple Bi-temporal
+Iterative Interaction (BI3) layers utilize Local Perception Enhancement (LPE)
+and Global Difference Fusion Attention (GDFA) modules to enhance the model's
+discriminative feature representation capabilities. To train the MCI model, we
+build the LEVIR-MCI dataset with change masks and captions of bi-temporal
+images. Extensive experiments demonstrate the effectiveness of the proposed
+change interpretation model and highlight the promising potential of our
+Change-Agent in facilitating comprehensive and intelligent interpretation of
+surface changes. We will make our dataset and codebase of the change
+interpretation model and Change-Agent publicly available to facilitate future
+research at https://github.com/Chen-Yang-Liu/Change-Agent
+
+
+
+
+
+
+ + ☆ GANTASTIC: GAN-based Transfer of Interpretable Directions for + Disentangled Image Editing in Text-to-Image Diffusion Models + + +
+ The rapid advancement in image generation models has predominantly been
+driven by diffusion models, which have demonstrated unparalleled success in
+generating high-fidelity, diverse images from textual prompts. Despite their
+success, diffusion models encounter substantial challenges in the domain of
+image editing, particularly in executing disentangled edits, i.e., changes that
+target specific attributes of an image while leaving irrelevant parts
+untouched. In contrast, Generative Adversarial Networks (GANs) have been
+recognized for their success in disentangled edits through their interpretable
+latent spaces. We introduce GANTASTIC, a novel framework that takes existing
+directions from pre-trained GAN models (representative of specific,
+controllable attributes) and transfers these directions into diffusion-based
+models. This novel approach not only maintains the generative quality and
+diversity that diffusion models are known for but also significantly enhances
+their capability to perform precise, targeted image edits, thereby leveraging
+the best of both worlds.
+
+
+
+ comment: Project page: https://gantastic.github.io +
+
+
+
+
+ + ☆ Siamese Vision Transformers are Scalable Audio-visual Learners + + +
+ Traditional audio-visual methods rely on independent audio and visual +backbones, which is costly and not scalable. In this work, we investigate using +an audio-visual siamese network (AVSiam) for efficient and scalable +audio-visual pretraining. Our framework uses a single shared vision transformer +backbone to process audio and visual inputs, improving its parameter +efficiency, reducing the GPU memory footprint, and allowing us to scale our +method to larger datasets and model sizes. We pretrain our model using a +contrastive audio-visual matching objective with a multi-ratio random masking +scheme, which enables our model to process larger audio-visual instance +batches, helpful for contrastive learning. Unlike prior audio-visual methods, +our method can robustly handle audio, visual, and audio-visual inputs with a +single shared ViT backbone. Furthermore, despite using the shared backbone for +both modalities, AVSiam achieves competitive or even better results than prior +methods on AudioSet and VGGSound for audio-visual classification and retrieval. +Our code is available at https://github.com/GenjiB/AVSiam + +
+
+
+
+
+ + ☆ GauStudio: A Modular Framework for 3D Gaussian Splatting and Beyond + + +
+ We present GauStudio, a novel modular framework for modeling 3D Gaussian +Splatting (3DGS) to provide standardized, plug-and-play components for users to +easily customize and implement a 3DGS pipeline. Supported by our framework, we +propose a hybrid Gaussian representation with foreground and skyball background +models. Experiments demonstrate this representation reduces artifacts in +unbounded outdoor scenes and improves novel view synthesis. Finally, we propose +Gaussian Splatting Surface Reconstruction (GauS), a novel render-then-fuse +approach for high-fidelity mesh reconstruction from 3DGS inputs without +fine-tuning. Overall, our GauStudio framework, hybrid representation, and GauS +approach enhance 3DGS modeling and rendering capabilities, enabling +higher-quality novel view synthesis and surface reconstruction. + +
+
+ comment: Code: https://github.com/GAP-LAB-CUHK-SZ/gaustudio +
+
+
+
+
+ + ☆ RH20T-P: A Primitive-Level Robotic Dataset Towards Composable + Generalization Agents + + +
+ The ultimate goal of robotic learning is to acquire a comprehensive and
+generalizable robotic system capable of performing both seen skills within the
+training distribution and unseen skills in novel environments. Recent progress
+in utilizing language models as high-level planners has demonstrated that the
+complexity of tasks can be reduced by decomposing them into primitive-level
+plans, making it possible to generalize to novel robotic tasks in a composable
+manner. Despite the promising future, the community is not yet adequately
+prepared for composable generalization agents, particularly due to the lack of
+primitive-level real-world robotic datasets. In this paper, we propose a
+primitive-level robotic dataset, namely RH20T-P, which contains about 33000
+video clips covering 44 diverse and complicated robotic tasks. Each clip is
+manually annotated according to a set of meticulously designed primitive
+skills, facilitating the future development of composable generalization
+agents. To validate the effectiveness of RH20T-P, we also construct a potential
+and scalable agent based on RH20T-P, called RA-P. Equipped with two planners
+specialized in task decomposition and motion planning, RA-P can adapt to novel
+physical skills through composable generalization. Our website and videos can
+be found at https://sites.google.com/view/rh20t-primitive/main. Dataset and
+code will be made available soon.
+
+
+
+ comment: 24 pages, 12 figures, 6 tables +
+
+
+
+
+ + ☆ Collaborative Interactive Evolution of Art in the Latent Space of Deep + Generative Models + + +
+ Generative Adversarial Networks (GANs) have shown great success in generating +high quality images and are thus used as one of the main approaches to generate +art images. However, usually the image generation process involves sampling +from the latent space of the learned art representations, allowing little +control over the output. In this work, we first employ GANs that are trained to +produce creative images using an architecture known as Creative Adversarial +Networks (CANs), then, we employ an evolutionary approach to navigate within +the latent space of the models to discover images. We use automatic aesthetic +and collaborative interactive human evaluation metrics to assess the generated +images. In the human interactive evaluation case, we propose a collaborative +evaluation based on the assessments of several participants. Furthermore, we +also experiment with an intelligent mutation operator that aims to improve the +quality of the images through local search based on an aesthetic measure. We +evaluate the effectiveness of this approach by comparing the results produced +by the automatic and collaborative interactive evolution. The results show that +the proposed approach can generate highly attractive art images when the +evolution is guided by collaborative human feedback. + +
+
+ comment: Preprint. The Version of Record of this contribution is to be + published in the proceedings of the 13th International Conference on + Artificial Intelligence in Music, Sound, Art and Design (EvoMUSART) 2024 +
+
+
+
+
+ + ☆ SA-GS: Scale-Adaptive Gaussian Splatting for Training-Free Anti-Aliasing + + +
+ In this paper, we present a Scale-adaptive method for Anti-aliasing Gaussian
+Splatting (SA-GS). While the state-of-the-art method Mip-Splatting requires
+modifying the training procedure of Gaussian splatting, our method functions at
+test-time and is training-free. Specifically, SA-GS can be applied to any
+pretrained Gaussian splatting field as a plugin to significantly improve the
+field's anti-aliasing performance. The core technique is to apply 2D
+scale-adaptive filters to each Gaussian during test time. As pointed out by
+Mip-Splatting, observing Gaussians at different frequencies leads to mismatches
+between the Gaussian scales during training and testing. Mip-Splatting resolves
+this issue using 3D smoothing and 2D Mip filters, which are unfortunately not
+aware of testing frequency. In this work, we show that a 2D scale-adaptive
+filter that is informed of testing frequency can effectively match the Gaussian
+scale, thus making the Gaussian primitive distribution remain consistent across
+different testing frequencies. When scale inconsistency is eliminated, sampling
+rates smaller than the scene frequency result in conventional jaggedness, and
+we propose to integrate the projected 2D Gaussian within each pixel during
+testing. This integration is actually a limiting case of super-sampling, which
+significantly improves anti-aliasing performance over vanilla Gaussian
+Splatting. Through extensive experiments using various settings and both
+bounded and unbounded scenes, we show SA-GS performs comparably with or better
+than Mip-Splatting. Note that super-sampling and integration are only effective
+when our scale-adaptive filtering is activated. Our codes, data and models are
+available at https://github.com/zsy1987/SA-GS.
+
+
+
+ comment: Project page: https://kevinsong729.github.io/project-pages/SA-GS/ + Code: https://github.com/zsy1987/SA-GS +
+
+
+
+
+ + ☆ ILPO-NET: Network for the invariant recognition of arbitrary volumetric + patterns in 3D + + +
+ Effective recognition of spatial patterns and learning their hierarchy is
+crucial in modern spatial data analysis. Volumetric data applications seek
+techniques ensuring invariance not only to shifts but also to pattern
+rotations. While traditional methods can readily achieve translational
+invariance, rotational invariance poses multiple challenges and remains an
+active area of research. Here, we present ILPO-Net (Invariant to Local Patterns
+Orientation Network), a novel approach that handles arbitrarily shaped patterns
+with a convolutional operation that is inherently invariant to local spatial
+pattern orientations using Wigner matrix expansions. Our architecture
+seamlessly integrates the new convolution operator and, when benchmarked on
+diverse volumetric datasets such as MedMNIST and CATH, demonstrates superior
+performance over the baselines with significantly reduced parameter counts, up
+to 1000 times fewer in the case of MedMNIST. Beyond these demonstrations,
+ILPO-Net's rotational invariance paves the way for other applications across
+multiple disciplines. Our code is publicly available at
+https://gricad-gitlab.univ-grenoble-alpes.fr/GruLab/ILPONet.
+
+
+
+
+
+
+
+ ☆ Nearest Neighbor Classification for Classical Image Upsampling
+
+
+
+ Given a set of ordered pixel data in the form of an image, our goal is to
+perform upsampling on the data such that: (i) the resulting resolution is
+improved by some factor; (ii) the final result passes the human test, having
+added new, believable, and realistic information and detail to the image; and
+(iii) the time complexity for upscaling is relatively close to that of lossy
+upscaling implementations.
+
+
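+ For reference, the classical nearest-neighbor upsampling that this work
+builds on can be written in a few lines of NumPy (a generic sketch, not the
+paper's exact implementation):
+
+    import numpy as np
+
+    def nearest_neighbor_upsample(image: np.ndarray, factor: int) -> np.ndarray:
+        """Upscale an (H, W) or (H, W, C) image by an integer factor,
+        copying each source pixel into a factor-by-factor block."""
+        if factor < 1:
+            raise ValueError("factor must be a positive integer")
+        # Repeat rows, then columns; channels (if any) are untouched.
+        return np.repeat(np.repeat(image, factor, axis=0), factor, axis=1)
+
+    img = np.arange(12, dtype=np.uint8).reshape(3, 4)   # tiny 3x4 test image
+    big = nearest_neighbor_upsample(img, 2)             # becomes 6x8
+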
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ SAID-NeRF: Segmentation-AIDed NeRF for Depth Completion of Transparent + Objects + + +
+ Acquiring accurate depth information of transparent objects using +off-the-shelf RGB-D cameras is a well-known challenge in Computer Vision and +Robotics. Depth estimation/completion methods are typically employed and +trained on datasets with quality depth labels acquired from either simulation, +additional sensors or specialized data collection setups and known 3d models. +However, acquiring reliable depth information for datasets at scale is not +straightforward, limiting training scalability and generalization. Neural +Radiance Fields (NeRFs) are learning-free approaches and have demonstrated wide +success in novel view synthesis and shape recovery. However, heuristics and +controlled environments (lights, backgrounds, etc) are often required to +accurately capture specular surfaces. In this paper, we propose using Visual +Foundation Models (VFMs) for segmentation in a zero-shot, label-free way to +guide the NeRF reconstruction process for these objects via the simultaneous +reconstruction of semantic fields and extensions to increase robustness. Our +proposed method Segmentation-AIDed NeRF (SAID-NeRF) shows significant +performance on depth completion datasets for transparent objects and robotic +grasping. + +
+
+ comment: 8 pages. An accompanying video is available at + https://www.youtube.com/watch?v=S4NCoUq4bmE +
+
+
+
+
+ + ☆ Semantic Map-based Generation of Navigation Instructions LREC + + +
+ We are interested in the generation of navigation instructions, either in
+their own right or as training material for robotic navigation tasks. In this
+paper, we propose a new approach to navigation instruction generation by
+framing the problem as an image captioning task using semantic maps as visual
+input. Conventional approaches employ a sequence of panorama images to generate
+navigation instructions. Semantic maps abstract away from visual details and
+fuse the information in multiple panorama images into a single top-down
+representation, thereby reducing the computational complexity of processing the
+input. We present a benchmark dataset for instruction generation using semantic
+maps, propose an initial model, and ask human subjects to manually assess the
+quality of generated instructions. Our initial investigations show promise in
+using semantic maps for instruction generation instead of a sequence of
+panorama images, but there is vast scope for improvement. We release the code
+for data preparation and model training at https://github.com/chengzu-li/VLGen.
+
+
+
+ comment: 5 pages, 2 figures, 3 tables (13 pages, 3 figures, 5 tables including + references and appendices), accepted at LREC-COLING 2024 +
+
+
+
+
+ + ☆ Enhance Image Classification via Inter-Class Image Mixup with Diffusion + Model + + +
+ Text-to-image (T2I) generative models have recently emerged as a powerful +tool, enabling the creation of photo-realistic images and giving rise to a +multitude of applications. However, the effective integration of T2I models +into fundamental image classification tasks remains an open question. A +prevalent strategy to bolster image classification performance is through +augmenting the training set with synthetic images generated by T2I models. In +this study, we scrutinize the shortcomings of both current generative and +conventional data augmentation techniques. Our analysis reveals that these +methods struggle to produce images that are both faithful (in terms of +foreground objects) and diverse (in terms of background contexts) for +domain-specific concepts. To tackle this challenge, we introduce an innovative +inter-class data augmentation method known as Diff-Mix +(https://github.com/Zhicaiwww/Diff-Mix), which enriches the dataset by +performing image translations between classes. Our empirical results +demonstrate that Diff-Mix achieves a better balance between faithfulness and +diversity, leading to a marked improvement in performance across diverse image +classification scenarios, including few-shot, conventional, and long-tail +classifications for domain-specific datasets. + +
+
+
+
+
+ + ☆ LocCa: Visual Pretraining with Location-aware Captioners + + +
+ Image captioning has been shown as an effective pretraining method similar to +contrastive pretraining. However, the incorporation of location-aware +information into visual pretraining remains an area with limited research. In +this paper, we propose a simple visual pretraining method with location-aware +captioners (LocCa). LocCa uses a simple image captioner task interface, to +teach a model to read out rich information, i.e. bounding box coordinates, and +captions, conditioned on the image pixel input. Thanks to the multitask +capabilities of an encoder-decoder architecture, we show that an image +captioner can easily handle multiple tasks during pretraining. Our experiments +demonstrate that LocCa outperforms standard captioners significantly on +localization downstream tasks while maintaining comparable performance on +holistic tasks. + +
+
+
+
+
+ + ☆ Situation Awareness for Driver-Centric Driving Style Adaptation + + +
+ There is evidence that the driving style of an autonomous vehicle is
+important to increase the acceptance and trust of the passengers. The driving
+situation has been found to have a significant influence on human driving
+behavior. However, current driving style models only partially incorporate
+driving environment information, limiting the alignment between an agent and
+the given situation. Therefore, we propose a situation-aware driving style
+model based on different visual feature encoders pretrained on fleet data, as
+well as driving behavior predictors, which are adapted to the driving style of
+a specific driver. Our experiments show that the proposed method outperforms
+static driving styles significantly and forms plausible situation clusters.
+Furthermore, we found that feature encoders pretrained on our dataset lead to
+more precise driving behavior modeling. In contrast, feature encoders
+pretrained supervised and unsupervised on different data sources lead to more
+specific situation clusters, which can be utilized to constrain and control the
+driving style adaptation for specific situations. Moreover, in a real-world
+setting, where driving style adaptation happens iteratively, we found that
+MLP-based behavior predictors achieve good performance initially but suffer
+from catastrophic forgetting. In contrast, behavior predictors based on
+situation-dependent statistics can learn iteratively from continuous data
+streams by design. Overall, our experiments show that important information for
+driving behavior prediction is contained within the visual feature encoder. The
+dataset is publicly available at
+huggingface.co/datasets/jHaselberger/SADC-Situation-Awareness-for-Driver-Centric-Driving-Style-Adaptation.
+
+
+
+ comment: 14 pages, 6 figures. This work has been submitted to the IEEE for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ☆ Frame by Familiar Frame: Understanding Replication in Video Diffusion + Models + + +
+ Building on the momentum of image generation diffusion models, there is an +increasing interest in video-based diffusion models. However, video generation +poses greater challenges due to its higher-dimensional nature, the scarcity of +training data, and the complex spatiotemporal relationships involved. Image +generation models, due to their extensive data requirements, have already +strained computational resources to their limits. There have been instances of +these models reproducing elements from the training samples, leading to +concerns and even legal disputes over sample replication. Video diffusion +models, which operate with even more constrained datasets and are tasked with +generating both spatial and temporal content, may be more prone to replicating +samples from their training sets. Compounding the issue, these models are often +evaluated using metrics that inadvertently reward replication. In our paper, we +present a systematic investigation into the phenomenon of sample replication in +video diffusion models. We scrutinize various recent diffusion models for video +synthesis, assessing their tendency to replicate spatial and temporal content +in both unconditional and conditional generation scenarios. Our study +identifies strategies that are less likely to lead to replication. Furthermore, +we propose new evaluation strategies that take replication into account, +offering a more accurate measure of a model's ability to generate the original +content. + +
+
+
+
+
+ + ☆ TOD3Cap: Towards 3D Dense Captioning in Outdoor Scenes + + +
+ 3D dense captioning stands as a cornerstone in achieving a comprehensive +understanding of 3D scenes through natural language. It has recently witnessed +remarkable achievements, particularly in indoor settings. However, the +exploration of 3D dense captioning in outdoor scenes is hindered by two major +challenges: 1) the \textbf{domain gap} between indoor and outdoor scenes, such +as dynamics and sparse visual inputs, makes it difficult to directly adapt +existing indoor methods; 2) the \textbf{lack of data} with comprehensive +box-caption pair annotations specifically tailored for outdoor scenes. To this +end, we introduce the new task of outdoor 3D dense captioning. As input, we +assume a LiDAR point cloud and a set of RGB images captured by the panoramic +camera rig. The expected output is a set of object boxes with captions. To +tackle this task, we propose the TOD3Cap network, which leverages the BEV +representation to generate object box proposals and integrates Relation +Q-Former with LLaMA-Adapter to generate rich captions for these objects. We +also introduce the TOD3Cap dataset, the largest one to our knowledge for 3D +dense captioning in outdoor scenes, which contains 2.3M descriptions of 64.3K +outdoor objects from 850 scenes. Notably, our TOD3Cap network can effectively +localize and caption 3D objects in outdoor scenes, which outperforms baseline +methods by a significant margin (+9.6 CiDEr@0.5IoU). Code, data, and models are +publicly available at https://github.com/jxbbb/TOD3Cap. + +
+
+ comment: Code, data, and models are publicly available at + https://github.com/jxbbb/TOD3Cap +
+
+
+
+
+ + ☆ DenseNets Reloaded: Paradigm Shift Beyond ResNets and ViTs + + +
+ This paper revives Densely Connected Convolutional Networks (DenseNets) and
+reveals their underrated effectiveness compared to predominant ResNet-style
+architectures. We believe DenseNets' potential was overlooked due to untouched
+training methods and traditional design elements not fully revealing their
+capabilities. Our pilot study shows dense connections through concatenation are
+strong, demonstrating that DenseNets can be revitalized to compete with modern
+architectures. We methodically refine suboptimal components (architectural
+adjustments, block redesign, and improved training recipes) towards widening
+DenseNets and boosting memory efficiency while keeping concatenation shortcuts.
+Our models, employing simple architectural elements, ultimately surpass Swin
+Transformer, ConvNeXt, and DeiT-III, key architectures in the residual learning
+lineage. Furthermore, our models exhibit near state-of-the-art performance on
+ImageNet-1K, competing with the very recent models, and on downstream tasks
+such as ADE20k semantic segmentation and COCO object detection/instance
+segmentation. Finally, we provide empirical analyses that uncover the merits of
+concatenation over additive shortcuts, steering a renewed preference towards
+DenseNet-style designs. Our code is available at
+https://github.com/naver-ai/rdnet.
+
+
+
+ comment: Code at https://github.com/naver-ai/rdnet +
+
+
+
+
+ + ☆ TOGS: Gaussian Splatting with Temporal Opacity Offset for Real-Time 4D + DSA Rendering + + +
+ Four-dimensional Digital Subtraction Angiography (4D DSA) is a medical +imaging technique that provides a series of 2D images captured at different +stages and angles during the process of contrast agent filling blood vessels. +It plays a significant role in the diagnosis of cerebrovascular diseases. +Improving the rendering quality and speed under sparse sampling is important +for observing the status and location of lesions. The current methods exhibit +inadequate rendering quality in sparse views and suffer from slow rendering +speed. To overcome these limitations, we propose TOGS, a Gaussian splatting +method with opacity offset over time, which can effectively improve the +rendering quality and speed of 4D DSA. We introduce an opacity offset table for +each Gaussian to model the temporal variations in the radiance of the contrast +agent. By interpolating the opacity offset table, the opacity variation of the +Gaussian at different time points can be determined. This enables us to render +the 2D DSA image at that specific moment. Additionally, we introduced a Smooth +loss term in the loss function to mitigate overfitting issues that may arise in +the model when dealing with sparse view scenarios. During the training phase, +we randomly prune Gaussians, thereby reducing the storage overhead of the +model. The experimental results demonstrate that compared to previous methods, +this model achieves state-of-the-art reconstruction quality under the same +number of training views. Additionally, it enables real-time rendering while +maintaining low storage overhead. The code will be publicly available. + +
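+ The per-Gaussian opacity offset table described above can be pictured as a
+small lookup that is linearly interpolated at render time. The sketch below is
+only an illustration of that idea under assumed shapes and key counts, not the
+authors' implementation:
+
+    import torch
+
+    def opacity_at_time(base_opacity: torch.Tensor,
+                        offset_table: torch.Tensor,
+                        t: float) -> torch.Tensor:
+        """base_opacity: (N,) static opacities; offset_table: (N, K) offsets at
+        K evenly spaced time keys in [0, 1]; returns opacities at time t."""
+        num_keys = offset_table.shape[1]
+        pos = torch.clamp(torch.tensor(t), 0.0, 1.0) * (num_keys - 1)
+        lo = int(torch.floor(pos))
+        hi = min(lo + 1, num_keys - 1)
+        w = float(pos) - lo                           # linear interpolation weight
+        offset = (1.0 - w) * offset_table[:, lo] + w * offset_table[:, hi]
+        return torch.clamp(base_opacity + offset, 0.0, 1.0)
+
+    N, K = 1000, 16                                   # assumed Gaussian/key counts
+    base = torch.rand(N)
+    table = 0.1 * torch.randn(N, K)
+    alpha_t = opacity_at_time(base, table, t=0.37)    # opacities for one DSA frame
+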
+
+
+
+
+ + ☆ Img2Loc: Revisiting Image Geolocalization using Multi-modality + Foundation Models and Image-based Retrieval-Augmented Generation + + +
+ Geolocating precise locations from images presents a challenging problem in
+computer vision and information retrieval. Traditional methods typically employ
+either classification, which divides the Earth's surface into grid cells and
+classifies images accordingly, or retrieval, which identifies locations by
+matching images with a database of image-location pairs. However,
+classification-based approaches are limited by the cell size and cannot yield
+precise predictions, while retrieval-based systems usually suffer from poor
+search quality and inadequate coverage of the global landscape at varied scale
+and aggregation levels. To overcome these drawbacks, we present Img2Loc, a
+novel system that redefines image geolocalization as a text generation task.
+This is achieved using cutting-edge large multi-modality models like GPT4V or
+LLaVA with retrieval augmented generation. Img2Loc first employs CLIP-based
+representations to generate an image-based coordinate query database. It then
+uniquely combines the query results with the image itself, forming elaborate
+prompts customized for LMMs. When tested on benchmark datasets such as Im2GPS3k
+and YFCC4k, Img2Loc not only surpasses the performance of previous
+state-of-the-art models but does so without any model training.
+
+
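+ The retrieval-augmented prompting described above can be illustrated with a
+toy sketch: cosine-similarity retrieval over precomputed image embeddings,
+followed by prompt assembly for an LMM. Everything below (array shapes,
+function names, prompt wording) is an assumption, not the paper's code:
+
+    import numpy as np
+
+    def retrieve_coordinates(query_emb: np.ndarray,
+                             db_embs: np.ndarray,
+                             db_coords: np.ndarray,
+                             k: int = 5) -> np.ndarray:
+        """Return the (lat, lon) pairs of the k most similar database images,
+        using cosine similarity over precomputed image embeddings."""
+        q = query_emb / np.linalg.norm(query_emb)
+        d = db_embs / np.linalg.norm(db_embs, axis=1, keepdims=True)
+        top = np.argsort(d @ q)[::-1][:k]
+        return db_coords[top]
+
+    def build_prompt(neighbor_coords: np.ndarray) -> str:
+        hints = "; ".join(f"({lat:.4f}, {lon:.4f})" for lat, lon in neighbor_coords)
+        return ("Here are coordinates of visually similar images: " + hints +
+                ". Based on the attached image and these hints, predict its "
+                "most likely (latitude, longitude).")
+
+    rng = np.random.default_rng(0)
+    db_embs = rng.normal(size=(1000, 512))             # stand-in CLIP embeddings
+    db_coords = rng.uniform([-90, -180], [90, 180], size=(1000, 2))
+    prompt = build_prompt(retrieve_coordinates(rng.normal(size=512), db_embs, db_coords))
+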
+
+
+
+
+ + ☆ OV-Uni3DETR: Towards Unified Open-Vocabulary 3D Object Detection via + Cycle-Modality Propagation + + +
+ In the current state of 3D object detection research, the severe scarcity of +annotated 3D data, substantial disparities across different data modalities, +and the absence of a unified architecture, have impeded the progress towards +the goal of universality. In this paper, we propose \textbf{OV-Uni3DETR}, a +unified open-vocabulary 3D detector via cycle-modality propagation. Compared +with existing 3D detectors, OV-Uni3DETR offers distinct advantages: 1) +Open-vocabulary 3D detection: During training, it leverages various accessible +data, especially extensive 2D detection images, to boost training diversity. +During inference, it can detect both seen and unseen classes. 2) Modality +unifying: It seamlessly accommodates input data from any given modality, +effectively addressing scenarios involving disparate modalities or missing +sensor information, thereby supporting test-time modality switching. 3) Scene +unifying: It provides a unified multi-modal model architecture for diverse +scenes collected by distinct sensors. Specifically, we propose the +cycle-modality propagation, aimed at propagating knowledge bridging 2D and 3D +modalities, to support the aforementioned functionalities. 2D semantic +knowledge from large-vocabulary learning guides novel class discovery in the 3D +domain, and 3D geometric knowledge provides localization supervision for 2D +detection images. OV-Uni3DETR achieves the state-of-the-art performance on +various scenarios, surpassing existing methods by more than 6\% on average. Its +performance using only RGB images is on par with or even surpasses that of +previous point cloud based methods. Code and pre-trained models will be +released later. + +
+
+
+
+
+ + ☆ The Bad Batches: Enhancing Self-Supervised Learning in Image + Classification Through Representative Batch Curation + + +
+ The pursuit of learning robust representations without human supervision is a
+longstanding challenge. The recent advancements in self-supervised contrastive
+learning approaches have demonstrated high performance across various
+representation learning challenges. However, current methods depend on the
+random transformation of training examples, resulting in some cases of
+unrepresentative positive pairs that can have a large impact on learning. This
+limitation not only impedes the convergence of the learning process but also
+harms the robustness of the learnt representation, and it requires larger batch
+sizes to improve robustness to such bad batches. This paper attempts to
+alleviate the influence of false positive and false negative pairs by employing
+pairwise similarity calculations through the Fr\'echet ResNet Distance (FRD),
+thereby obtaining robust representations from unlabelled data. The
+effectiveness of the proposed method is substantiated by empirical results,
+where a linear classifier trained on self-supervised contrastive
+representations achieved an impressive 87.74\% top-1 accuracy on STL10 and
+99.31\% on the Flower102 dataset. These results emphasize the potential of the
+proposed approach in pushing the boundaries of the state-of-the-art in
+self-supervised contrastive learning, particularly for image classification
+tasks.
+
+
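+ The Fr\'echet ResNet Distance mentioned above is presumably the standard
+Fr\'echet distance between Gaussians fitted to two sets of ResNet features; a
+generic implementation of that distance (our reading, not the authors' code)
+looks like this:
+
+    import numpy as np
+    from scipy.linalg import sqrtm
+
+    def frechet_distance(feats_a: np.ndarray, feats_b: np.ndarray) -> float:
+        """Frechet distance between Gaussians fitted to two (N, D) feature sets:
+        ||mu_a - mu_b||^2 + Tr(C_a + C_b - 2 (C_a C_b)^(1/2))."""
+        mu_a, mu_b = feats_a.mean(0), feats_b.mean(0)
+        cov_a = np.cov(feats_a, rowvar=False)
+        cov_b = np.cov(feats_b, rowvar=False)
+        cov_sqrt = sqrtm(cov_a @ cov_b)
+        if np.iscomplexobj(cov_sqrt):      # numerical noise can add tiny imaginary parts
+            cov_sqrt = cov_sqrt.real
+        diff = mu_a - mu_b
+        return float(diff @ diff + np.trace(cov_a + cov_b - 2.0 * cov_sqrt))
+
+    rng = np.random.default_rng(0)
+    a = rng.normal(size=(256, 64))         # stand-in ResNet features of two views
+    b = rng.normal(loc=0.1, size=(256, 64))
+    print(frechet_distance(a, b))
+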
+
+ comment: 8 Pages, 4 figures, IEEE WCCI 2024 Conference +
+
+
+
+
+ + ☆ Cross-Attention is Not Always Needed: Dynamic Cross-Attention for + Audio-Visual Dimensional Emotion Recognition ICME2024 + + +
+ In video-based emotion recognition, audio and visual modalities are often +expected to have a complementary relationship, which is widely explored using +cross-attention. However, they may also exhibit weak complementary +relationships, resulting in poor representations of audio-visual features, thus +degrading the performance of the system. To address this issue, we propose +Dynamic Cross-Attention (DCA) that can dynamically select cross-attended or +unattended features on the fly based on their strong or weak complementary +relationship with each other, respectively. Specifically, a simple yet +efficient gating layer is designed to evaluate the contribution of the +cross-attention mechanism and choose cross-attended features only when they +exhibit a strong complementary relationship, otherwise unattended features. We +evaluate the performance of the proposed approach on the challenging RECOLA and +Aff-Wild2 datasets. We also compare the proposed approach with other variants +of cross-attention and show that the proposed model consistently improves the +performance on both datasets. + +
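+ A minimal sketch of the gating idea, blending cross-attended and unattended
+features with a learned gate (module names and shapes are our own assumptions,
+not the paper's):
+
+    import torch
+    import torch.nn as nn
+
+    class GatedCrossAttention(nn.Module):
+        """Cross-attend audio to video features, then let a learned gate decide
+        how much of the attended signal to keep versus the original features."""
+        def __init__(self, dim: int = 128, heads: int = 4):
+            super().__init__()
+            self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+            self.gate = nn.Sequential(nn.Linear(2 * dim, 1), nn.Sigmoid())
+
+        def forward(self, audio: torch.Tensor, video: torch.Tensor) -> torch.Tensor:
+            attended, _ = self.attn(query=audio, key=video, value=video)
+            g = self.gate(torch.cat([audio, attended], dim=-1))   # (B, T, 1) in [0, 1]
+            return g * attended + (1.0 - g) * audio               # fall back when weakly complementary
+
+    module = GatedCrossAttention()
+    audio = torch.randn(2, 10, 128)    # (batch, time, dim)
+    video = torch.randn(2, 10, 128)
+    fused = module(audio, video)       # same shape as the audio stream
+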
+
+ comment: Accepted at IEEE ICME2024 +
+
+
+
+
+ + ☆ GlORIE-SLAM: Globally Optimized RGB-only Implicit Encoding Point Cloud + SLAM + + +
+ Recent advancements in RGB-only dense Simultaneous Localization and Mapping
+(SLAM) have predominantly utilized grid-based neural implicit encodings and/or
+struggle to efficiently realize global map and pose consistency. To this end,
+we propose an efficient RGB-only dense SLAM system using a flexible neural
+point cloud scene representation that adapts to keyframe poses and depth
+updates, without needing costly backpropagation. Another critical challenge of
+RGB-only SLAM is the lack of geometric priors. To alleviate this issue, with
+the aid of a monocular depth estimator, we introduce a novel DSPO layer for
+bundle adjustment which optimizes the pose and depth of keyframes along with
+the scale of the monocular depth. Finally, our system benefits from loop
+closure and online global bundle adjustment and performs better than or
+competitively with existing dense neural RGB SLAM methods in tracking, mapping
+and rendering accuracy on the Replica, TUM-RGBD and ScanNet datasets. The
+source code will be made available.
+
+
+
+
+
+
+ + ☆ De-confounded Data-free Knowledge Distillation for Handling Distribution + Shifts CVPR24 + + +
+ Data-Free Knowledge Distillation (DFKD) is a promising task to train
+high-performance small models to enhance actual deployment without relying on
+the original training data. Existing methods commonly avoid relying on private
+data by utilizing synthetic or sampled data. However, a long-overlooked issue
+is the severe distribution shift between such substitute data and the original
+data, which manifests as large differences in image quality and class
+proportions. These harmful shifts essentially act as the confounder that
+significantly causes performance bottlenecks. To tackle the issue, this paper
+proposes a novel perspective with causal inference to disentangle the student
+models from the impact of such shifts. By designing a customized causal graph,
+we first reveal the causalities among the variables in the DFKD task.
+Subsequently, we propose a Knowledge Distillation Causal Intervention (KDCI)
+framework based on the backdoor adjustment to de-confound the confounder. KDCI
+can be flexibly combined with most existing state-of-the-art baselines.
+Experiments in combination with six representative DFKD methods demonstrate the
+effectiveness of our KDCI, which can obviously help existing methods under
+almost all settings, \textit{e.g.}, improving the baseline by up to 15.54\%
+accuracy on the CIFAR-100 dataset.
+
+
+
+ comment: Accepted by CVPR24 +
+
+
+
+
+ + ☆ Locate, Assign, Refine: Taming Customized Image Inpainting with + Text-Subject Guidance + + +
+ Prior studies have made significant progress in image inpainting guided by +either text or subject image. However, the research on editing with their +combined guidance is still in the early stages. To tackle this challenge, we +present LAR-Gen, a novel approach for image inpainting that enables seamless +inpainting of masked scene images, incorporating both the textual prompts and +specified subjects. Our approach adopts a coarse-to-fine manner to ensure +subject identity preservation and local semantic coherence. The process +involves (i) Locate: concatenating the noise with masked scene image to achieve +precise regional editing, (ii) Assign: employing decoupled cross-attention +mechanism to accommodate multi-modal guidance, and (iii) Refine: using a novel +RefineNet to supplement subject details. Additionally, to address the issue of +scarce training data, we introduce a novel data construction pipeline. This +pipeline extracts substantial pairs of data consisting of local text prompts +and corresponding visual instances from a vast image dataset, leveraging +publicly available large models. Extensive experiments and varied application +scenarios demonstrate the superiority of LAR-Gen in terms of both identity +preservation and text semantic consistency. Project page can be found at +\url{https://ali-vilab.github.io/largen-page/}. + +
+
+ comment: 22 pages, 14 figures +
+
+
+
+
+ + ☆ Instance-Adaptive and Geometric-Aware Keypoint Learning for + Category-Level 6D Object Pose Estimation CVPR2024 + + +
+ Category-level 6D object pose estimation aims to estimate the rotation,
+translation and size of unseen instances within specific categories. In this
+area, dense correspondence-based methods have achieved leading performance.
+However, they do not explicitly consider the local and global geometric
+information of different instances, resulting in poor generalization ability to
+unseen instances with significant shape variations. To deal with this problem,
+we propose a novel Instance-Adaptive and Geometric-Aware Keypoint Learning
+method for category-level 6D object pose estimation (AG-Pose), which includes
+two key designs: (1) The first design is an Instance-Adaptive Keypoint
+Detection module, which can adaptively detect a set of sparse keypoints for
+various instances to represent their geometric structures. (2) The second
+design is a Geometric-Aware Feature Aggregation module, which can efficiently
+integrate the local and global geometric information into keypoint features.
+These two modules can work together to establish robust keypoint-level
+correspondences for unseen instances, thus enhancing the generalization ability
+of the model. Experimental results on the CAMERA25 and REAL275 datasets show
+that the proposed AG-Pose outperforms state-of-the-art methods by a large
+margin without category-specific shape priors.
+
+
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ Model Stock: All we need is just a few fine-tuned models + + +
+ This paper introduces an efficient fine-tuning method for large pre-trained
+models, offering strong in-distribution (ID) and out-of-distribution (OOD)
+performance. Breaking away from traditional practices that need a multitude of
+fine-tuned models for averaging, our approach employs significantly fewer
+models to achieve final weights yet yields superior accuracy. Drawing from key
+insights in the weight space of fine-tuned weights, we uncover a strong link
+between the performance and proximity to the center of the weight space. Based
+on this, we introduce a method that approximates a center-close weight using
+only two fine-tuned models, applicable during or after training. Our innovative
+layer-wise weight averaging technique surpasses state-of-the-art model
+averaging methods such as Model Soup while utilizing only two fine-tuned
+models. This strategy can be aptly coined Model Stock, highlighting its
+reliance on selecting a minimal number of models to obtain a more optimized
+averaged model. We demonstrate the efficacy of Model Stock with fine-tuned
+models based upon pre-trained CLIP architectures, achieving remarkable
+performance on both ID and OOD tasks on the standard benchmarks, all while
+barely adding extra computational demands. Our code and pre-trained models are
+available at https://github.com/naver-ai/model-stock.
+
+
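+ A simplified illustration of layer-wise averaging of two fine-tuned
+checkpoints pulled toward the pre-trained anchor. The paper derives its
+interpolation ratio from the geometry of the weight space; the fixed ratio
+below is purely an assumption for the sketch:
+
+    from collections import OrderedDict
+    import torch
+
+    def layerwise_merge(pretrained: "OrderedDict[str, torch.Tensor]",
+                        finetuned_a: "OrderedDict[str, torch.Tensor]",
+                        finetuned_b: "OrderedDict[str, torch.Tensor]",
+                        toward_center: float = 0.5) -> "OrderedDict[str, torch.Tensor]":
+        """Average two fine-tuned state dicts per layer, then interpolate that
+        average with the pre-trained anchor by a fixed (assumed) ratio."""
+        merged = OrderedDict()
+        for name, w0 in pretrained.items():
+            w_avg = 0.5 * (finetuned_a[name] + finetuned_b[name])
+            merged[name] = toward_center * w_avg + (1.0 - toward_center) * w0
+        return merged
+
+    # Toy example with tiny "models" represented directly as state dicts.
+    w0 = OrderedDict(fc=torch.zeros(3, 3))
+    wa = OrderedDict(fc=torch.ones(3, 3))
+    wb = OrderedDict(fc=2 * torch.ones(3, 3))
+    print(layerwise_merge(w0, wa, wb)["fc"])
+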
+
+ comment: Code at https://github.com/naver-ai/model-stock +
+
+
+
+
+ + ☆ XScale-NVS: Cross-Scale Novel View Synthesis with Hash Featurized + Manifold CVPR 2024 + + +
+ We propose XScale-NVS for high-fidelity cross-scale novel view synthesis of +real-world large-scale scenes. Existing representations based on explicit +surface suffer from discretization resolution or UV distortion, while implicit +volumetric representations lack scalability for large scenes due to the +dispersed weight distribution and surface ambiguity. In light of the above +challenges, we introduce hash featurized manifold, a novel hash-based +featurization coupled with a deferred neural rendering framework. This approach +fully unlocks the expressivity of the representation by explicitly +concentrating the hash entries on the 2D manifold, thus effectively +representing highly detailed contents independent of the discretization +resolution. We also introduce a novel dataset, namely GigaNVS, to benchmark +cross-scale, high-resolution novel view synthesis of realworld large-scale +scenes. Our method significantly outperforms competing baselines on various +real-world scenes, yielding an average LPIPS that is 40% lower than prior +state-of-the-art on the challenging GigaNVS benchmark. Please see our project +page at: xscalenvs.github.io. + +
+
+ comment: Accepted to CVPR 2024. Project page: xscalenvs.github.io/ +
+
+
+
+
+ + ☆ CDIMC-net: Cognitive Deep Incomplete Multi-view Clustering Network IJCAI 2020 + + +
+ In recent years, incomplete multi-view clustering, which studies the
+challenging multi-view clustering problem on missing views, has received
+growing research interest. Although a series of methods have been proposed to
+address this issue, the following problems still exist: 1) Almost all of the
+existing methods are based on shallow models, which makes it difficult to
+obtain discriminative common representations. 2) These methods are generally
+sensitive to noise or outliers since negative samples are treated equally with
+the important samples. In this paper, we propose a novel incomplete multi-view
+clustering network, called Cognitive Deep Incomplete Multi-view Clustering
+Network (CDIMC-net), to address these issues. Specifically, it captures the
+high-level features and local structure of each view by incorporating
+view-specific deep encoders and a graph embedding strategy into one framework.
+Moreover, based on human cognition, i.e., learning from easy to hard, it
+introduces a self-paced strategy to select the most confident samples for model
+training, which can reduce the negative influence of outliers. Experimental
+results on several incomplete datasets show that CDIMC-net outperforms the
+state-of-the-art incomplete multi-view clustering methods.
+
+
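+ The "learning from easy to hard" self-paced strategy can be sketched as
+selecting, at each epoch, only the samples whose current loss falls below a
+growing quantile threshold (a generic illustration with assumed names and
+schedule, not CDIMC-net itself):
+
+    import torch
+
+    def self_paced_mask(per_sample_loss: torch.Tensor,
+                        epoch: int,
+                        start_quantile: float = 0.5,
+                        growth: float = 0.05) -> torch.Tensor:
+        """Keep the most confident (lowest-loss) samples first and gradually
+        admit harder ones as training progresses."""
+        q = min(1.0, start_quantile + growth * epoch)
+        threshold = torch.quantile(per_sample_loss, q)
+        return per_sample_loss <= threshold       # boolean mask over the batch
+
+    losses = torch.rand(16)                       # stand-in per-sample clustering losses
+    mask = self_paced_mask(losses, epoch=3)
+    selected_loss = losses[mask].mean()           # only confident samples drive the update
+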
+
+ comment: Accepted by IJCAI 2020 +
+
+
+
+
+ + ☆ Debiasing Cardiac Imaging with Controlled Latent Diffusion Models + + +
+ The progress in deep learning solutions for disease diagnosis and prognosis +based on cardiac magnetic resonance imaging is hindered by highly imbalanced +and biased training data. To address this issue, we propose a method to +alleviate imbalances inherent in datasets through the generation of synthetic +data based on sensitive attributes such as sex, age, body mass index, and +health condition. We adopt ControlNet based on a denoising diffusion +probabilistic model to condition on text assembled from patient metadata and +cardiac geometry derived from segmentation masks using a large-cohort study, +specifically, the UK Biobank. We assess our method by evaluating the realism of +the generated images using established quantitative metrics. Furthermore, we +conduct a downstream classification task aimed at debiasing a classifier by +rectifying imbalances within underrepresented groups through synthetically +generated samples. Our experiments demonstrate the effectiveness of the +proposed approach in mitigating dataset imbalances, such as the scarcity of +younger patients or individuals with normal BMI level suffering from heart +failure. This work represents a major step towards the adoption of synthetic +data for the development of fair and generalizable models for medical +classification tasks. Notably, we conduct all our experiments using a single, +consumer-level GPU to highlight the feasibility of our approach within +resource-constrained environments. Our code is available at +https://github.com/faildeny/debiasing-cardiac-mri. + +
+
+
+
+
+ + ☆ RELI11D: A Comprehensive Multimodal Human Motion Dataset and Method CVPR2024 + + +
+ Comprehensive capturing of human motions requires both accurate capture of
+complex poses and precise localization of the human within scenes. Most of the
+HPE datasets and methods primarily rely on RGB, LiDAR, or IMU data. However,
+solely using these modalities or a combination of them may not be adequate for
+HPE, particularly for complex and fast movements. For holistic human motion
+understanding, we present RELI11D, a high-quality multimodal human motion
+dataset that involves LiDAR, an IMU system, an RGB camera, and an event camera.
+It records the motions of 10 actors performing 5 sports in 7 scenes, including
+3.32 hours of synchronized LiDAR point clouds, IMU measurement data, RGB videos
+and Event streams. Through extensive experiments, we demonstrate that RELI11D
+presents considerable challenges and opportunities as it contains many rapid
+and complex motions that require precise localization. To address the challenge
+of integrating different modalities, we propose LEIR, a multimodal baseline
+that effectively utilizes LiDAR Point Cloud, Event stream, and RGB through our
+cross-attention fusion strategy. We show that LEIR exhibits promising results
+for rapid motions and daily motions and that utilizing the characteristics of
+multiple modalities can indeed improve HPE performance. Both the dataset and
+source code will be released publicly to the research community, fostering
+collaboration and enabling further exploration in this field.
+
+
+
+ comment: CVPR2024, Project website: http://www.lidarhumanmotion.net/reli11d/ +
+
+
+
+
+ + ☆ Surface-based parcellation and vertex-wise analysis of ultra + high-resolution ex vivo 7 tesla MRI in neurodegenerative diseases MICCAI 2024 + + +
+ Magnetic resonance imaging (MRI) is the standard modality for understanding
+human brain structure and function in vivo (antemortem). Decades of research
+in human neuroimaging have led to the widespread development of methods and
+tools providing automated volume-based segmentations and surface-based
+parcellations, which help localize brain functions to specialized anatomical
+regions. Recently, ex vivo (postmortem) imaging of the brain has opened up
+avenues to study brain structure at sub-millimeter, ultra-high resolution,
+revealing details not observable with in vivo MRI. Unfortunately,
+methodological development in ex vivo MRI has been limited, primarily due to
+a lack of datasets and the small number of centers with such imaging
+resources. Therefore, in this work, we present a one-of-its-kind dataset of
+82 ex vivo T2w whole-brain-hemisphere MRI scans at 0.3 mm isotropic
+resolution spanning Alzheimer's disease and related dementias. We adapted and
+developed a fast and easy-to-use automated surface-based pipeline to
+parcellate, for the first time, ultra-high-resolution ex vivo brain tissue at
+the native subject-space resolution using the Desikan-Killiany-Tourville
+(DKT) brain atlas. This allows us to perform vertex-wise analysis in the
+template space and thereby link morphometry measures with pathology
+measurements derived from histology. We will open-source our dataset, a
+Docker container, and Jupyter notebooks offering a ready-to-use,
+out-of-the-box set of tools and command-line options to advance ex vivo MRI
+clinical brain imaging research on the project webpage.
+
+
+ comment: Under review at MICCAI 2024 +
+
+
+
+
+ + ☆ CoherentGS: Sparse Novel View Synthesis with Coherent 3D Gaussians + + +
+ The field of 3D reconstruction from images has evolved rapidly in the past
+few years, first with the introduction of Neural Radiance Fields (NeRF) and
+more recently with 3D Gaussian Splatting (3DGS). The latter provides a
+significant edge over NeRF in training and inference speed as well as
+reconstruction quality. Although 3DGS works well for dense input images, its
+unstructured, point-cloud-like representation quickly overfits in the more
+challenging setup of extremely sparse input images (e.g., 3 images), creating
+a representation that appears as a jumble of needles from novel views. To
+address this issue, we propose regularized optimization and depth-based
+initialization. Our key idea is to introduce a structured Gaussian
+representation that can be controlled in 2D image space. We then constrain
+the Gaussians, in particular their positions, and prevent them from moving
+independently during optimization. Specifically, we introduce single-view and
+multi-view constraints through an implicit convolutional decoder and a total
+variation loss, respectively. With the coherency introduced to the Gaussians,
+we further constrain the optimization through a flow-based loss function. To
+support our regularized optimization, we propose an approach to initialize
+the Gaussians using monocular depth estimates at each input view. We
+demonstrate significant improvements compared to state-of-the-art sparse-view
+NeRF-based approaches on a variety of scenes.
+
+
+ comment: Project page: https://people.engr.tamu.edu/nimak/Papers/CoherentGS +
+
+
+
+
+ + ☆ Segmentation tool for images of cracks + + +
+ Safety-critical infrastructures, such as bridges, are periodically inspected
+to check for existing damage, such as fatigue cracks and corrosion, and to
+guarantee the safe use of the infrastructure. Visual inspection is the most
+frequent type of general inspection, despite the fact that its detection
+capability is rather limited, especially for fatigue cracks. Machine learning
+algorithms can be used to augment the capability of classical visual
+inspection of bridge structures; however, implementing such an algorithm
+requires a massive annotated training dataset, which is time-consuming to
+produce. This paper proposes a semi-automatic crack segmentation tool that
+eases the manual segmentation of cracks in images needed to create a training
+dataset for a machine learning algorithm. It can also be used to measure the
+geometry of the crack. The tool makes use of an image processing algorithm
+originally developed for the analysis of vascular systems in retinal images.
+The algorithm relies on a multi-orientation wavelet transform, which is
+applied to the image to construct so-called "orientation scores", i.e., a
+modified version of the image. The filtered orientation scores are then used
+to formulate an optimal path problem that identifies the crack. The globally
+optimal path between manually selected crack endpoints is computed using a
+state-of-the-art geometric tracking method, and the pixel-wise segmentation
+is subsequently derived from the obtained crack path. The proposed method
+outperforms fully automatic methods and shows potential as an adequate
+alternative to manual data annotation.
+
+
+
+
+
+ + ☆ Jointly Training and Pruning CNNs via Learnable Agent Guidance and + Alignment CVPR + 2024 + + +
+ Structural model pruning is a prominent approach for reducing the
+computational cost of Convolutional Neural Networks (CNNs) before their
+deployment on resource-constrained devices. Yet, the majority of proposed
+ideas require a pretrained model before pruning, which is costly to obtain.
+In this paper, we propose a novel structural pruning approach that jointly
+learns the weights and structurally prunes the architectures of CNN models.
+The core element of our method is a Reinforcement Learning (RL) agent whose
+actions determine the pruning ratios of the CNN model's layers, with the
+resulting model's accuracy serving as its reward. We conduct the joint
+training and pruning by iteratively training the model's weights and the
+agent's policy, and we regularize the model's weights to align with the
+structure selected by the agent. The evolving model weights result in a
+dynamic reward function for the agent, which prevents the use of prominent
+episodic RL methods that assume a stationary environment. We address this
+challenge by designing a mechanism that models the complex, changing dynamics
+of the reward function and provides a representation of it to the RL agent.
+To do so, we take a learnable embedding for each training epoch and employ a
+recurrent model to calculate a representation of the changing environment. We
+train the recurrent model and embeddings using a decoder model that
+reconstructs observed rewards. Such a design empowers our agent to
+effectively leverage episodic observations along with the environment
+representations to learn a proper policy for determining performant
+sub-networks of the CNN model. Our extensive experiments on CIFAR-10 and
+ImageNet using ResNets and MobileNets demonstrate the effectiveness of our
+method.
+
+
+ comment: IEEE/CVF Conference on Computer Vision and Pattern Recognition, CVPR + 2024 +
+
+
+
+
+ + ☆ SG-PGM: Partial Graph Matching Network with Semantic Geometric Fusion + for 3D Scene Graph Alignment and Its Downstream Tasks + + +
+ Scene graphs have recently been introduced into 3D spatial understanding as
+a comprehensive representation of the scene. The alignment between 3D scene
+graphs is the first step of many downstream tasks such as scene-graph-aided
+point cloud registration, mosaicking, overlap checking, and robot navigation.
+In this work, we treat 3D scene graph alignment as a partial graph-matching
+problem and propose to solve it with a graph neural network. We reuse the
+geometric features learned by a point cloud registration method and associate
+the clustered point-level geometric features with the node-level semantic
+features via our designed feature fusion module. Partial matching is enabled
+by using a learnable method to select the top-k most similar node pairs.
+Subsequent downstream tasks such as point cloud registration are achieved by
+running a pre-trained registration network within the matched regions. We
+further propose a point-matching rescoring method that uses the node-wise
+alignment of the 3D scene graph to reweight the matching candidates from a
+pre-trained point cloud registration method, reducing false point
+correspondences, especially in low-overlap cases. Experiments show that our
+method improves alignment accuracy by 10-20% in low-overlap and
+random-transformation scenarios and outperforms existing work in multiple
+downstream tasks.
+
+
+ comment: 16 pages, 10 figures +
+
+
+
+
+ + ☆ Benchmarking Implicit Neural Representation and Geometric Rendering in + Real-Time RGB-D SLAM CVPR 2024 + + +
+ Implicit neural representation (INR), in combination with geometric
+rendering, has recently been employed in real-time dense RGB-D SLAM. Despite
+active research endeavors, the field still lacks a unified protocol for fair
+evaluation, impeding its evolution. In this work, we establish, to our
+knowledge, the first open-source benchmark framework to evaluate the
+performance of a wide spectrum of commonly used INRs and rendering functions
+for mapping and localization. The goals of our benchmark are to 1) gain an
+intuition of how different INRs and rendering functions impact mapping and
+localization and 2) establish a unified evaluation protocol with respect to
+the design choices that may impact mapping and localization. With the
+framework, we conduct a large suite of experiments, offering various insights
+into choosing the INRs and geometric rendering functions: for example, the
+dense feature grid outperforms other INRs (e.g., tri-plane and hash grid),
+even when geometric and color features are jointly encoded for memory
+efficiency. To extend these findings to the practical scenario, we propose a
+hybrid encoding strategy that combines the accuracy and completion strengths
+of grid-based and decomposition-based INRs. We further propose explicit
+hybrid encoding for high-fidelity dense grid mapping to comply with RGB-D
+SLAM systems that prioritize robustness and computational efficiency.
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Beyond Talking -- Generating Holistic 3D Human Dyadic Motion for + Communication + + +
+ In this paper, we introduce an innovative task focused on human
+communication: generating 3D holistic human motions for both speakers and
+listeners. Central to our approach is the incorporation of factorization to
+decouple audio features and the combination of textual semantic information,
+thereby facilitating the creation of more realistic and coordinated
+movements. We separately train VQ-VAEs for the holistic motions of the
+speaker and the listener. We consider the real-time mutual influence between
+the speaker and the listener and propose a novel chain-like,
+transformer-based auto-regressive model specifically designed to characterize
+real-world communication scenarios effectively, generating the motions of
+both the speaker and the listener simultaneously. These designs ensure that
+the results we generate are both coordinated and diverse. Our approach
+demonstrates state-of-the-art performance on two benchmark datasets.
+Furthermore, we introduce the HoCo holistic communication dataset, a valuable
+resource for future research. Our HoCo dataset and code will be released for
+research purposes upon acceptance.
+
+
+
+
+
+ + ☆ Break-for-Make: Modular Low-Rank Adaptations for Composable + Content-Style Customization + + +
+ Personalized generation paradigms empower designers to customize visual
+intellectual properties with the help of textual descriptions by tuning or
+adapting pre-trained text-to-image models on a few images. Recent works
+explore approaches for concurrently customizing both content and detailed
+visual style appearance. However, these existing approaches often generate
+images in which content and style are entangled. In this study, we reconsider
+the customization of content and style concepts from the perspective of
+parameter space construction. Unlike existing methods that use a shared
+parameter space for content and style, we propose a learning framework that
+separates the parameter space to facilitate individual learning of content
+and style, thereby enabling disentangled content and style. To achieve this
+goal, we introduce "partly learnable projection" (PLP) matrices to separate
+the original adapters into divided sub-parameter spaces. We propose a
+"break-for-make" customization learning pipeline based on PLP, which is
+simple yet effective. We break the original adapters into "up projection" and
+"down projection" parts, train content and style PLPs individually with the
+guidance of the corresponding textual prompts in the separate adapters, and
+maintain generalization by employing a multi-correspondence projection
+learning strategy. Based on the adapters broken apart for separately training
+content and style, we then "make" the entire parameter space by
+reconstructing the content and style PLP matrices, followed by fine-tuning
+the combined adapter to generate the target object with the desired
+appearance. Experiments on various styles, including textures, materials, and
+artistic styles, show that our method outperforms state-of-the-art single-
+and multiple-concept learning pipelines in terms of content-style-prompt
+alignment.
+
+
+
+
+
+ + ☆ Transparent and Clinically Interpretable AI for Lung Cancer Detection in + Chest X-Rays + + +
+ The rapidly advancing field of Explainable Artificial Intelligence (XAI) aims +to tackle the issue of trust regarding the use of complex black-box deep +learning models in real-world applications. Existing post-hoc XAI techniques +have recently been shown to have poor performance on medical data, producing +unreliable explanations which are infeasible for clinical use. To address this, +we propose an ante-hoc approach based on concept bottleneck models which +introduces for the first time clinical concepts into the classification +pipeline, allowing the user valuable insight into the decision-making process. +On a large public dataset of chest X-rays and associated medical reports, we +focus on the binary classification task of lung cancer detection. Our approach +yields improved classification performance in lung cancer detection when +compared to baseline deep learning models (F1 > 0.9), while also generating +clinically relevant and more reliable explanations than existing techniques. We +evaluate our approach against post-hoc image XAI techniques LIME and SHAP, as +well as CXR-LLaVA, a recent textual XAI tool which operates in the context of +question answering on chest X-rays. + +
+
+ comment: 12 pages, 10 figures +
+
+
+
+
+ + ☆ SubjectDrive: Scaling Generative Data in Autonomous Driving via Subject + Control + + +
+ Autonomous driving progress relies on large-scale annotated datasets. In this +work, we explore the potential of generative models to produce vast quantities +of freely-labeled data for autonomous driving applications and present +SubjectDrive, the first model proven to scale generative data production in a +way that could continuously improve autonomous driving applications. We +investigate the impact of scaling up the quantity of generative data on the +performance of downstream perception models and find that enhancing data +diversity plays a crucial role in effectively scaling generative data +production. Therefore, we have developed a novel model equipped with a subject +control mechanism, which allows the generative model to leverage diverse +external data sources for producing varied and useful data. Extensive +evaluations confirm SubjectDrive's efficacy in generating scalable autonomous +driving training data, marking a significant step toward revolutionizing data +production methods in this field. + +
+
+ comment: Project page: https://subjectdrive.github.io/ +
+
+
+
+
+ + ☆ BAMM: Bidirectional Autoregressive Motion Model + + +
+ Generating human motion from text has been dominated by denoising motion
+models, either through diffusion or a generative masking process. However,
+these models face great limitations in usability by requiring prior knowledge
+of the motion length. Conversely, autoregressive motion models address this
+limitation by adaptively predicting motion endpoints, at the cost of degraded
+generation quality and editing capabilities. To address these challenges, we
+propose the Bidirectional Autoregressive Motion Model (BAMM), a novel
+text-to-motion generation framework. BAMM consists of two key components: (1)
+a motion tokenizer that transforms 3D human motion into discrete tokens in
+latent space, and (2) a masked self-attention transformer that
+autoregressively predicts randomly masked tokens via a hybrid attention
+masking strategy. By unifying generative masked modeling and autoregressive
+modeling, BAMM captures rich and bidirectional dependencies among motion
+tokens while learning the probabilistic mapping from textual inputs to motion
+outputs with dynamically adjusted motion sequence length. This enables BAMM
+to simultaneously achieve high-quality motion generation with enhanced
+usability and built-in motion editability. Extensive experiments on the
+HumanML3D and KIT-ML datasets demonstrate that BAMM surpasses current
+state-of-the-art methods in both qualitative and quantitative measures.
+
+
+
+
+
+ + ☆ Burst Super-Resolution with Diffusion Models for Improving Perceptual + Quality IJCNN 2024 + + +
+ While burst LR images are useful for improving the SR image quality compared +with a single LR image, prior SR networks accepting the burst LR images are +trained in a deterministic manner, which is known to produce a blurry SR image. +In addition, it is difficult to perfectly align the burst LR images, making the +SR image more blurry. Since such blurry images are perceptually degraded, we +aim to reconstruct the sharp high-fidelity boundaries. Such high-fidelity +images can be reconstructed by diffusion models. However, prior SR methods +using the diffusion model are not properly optimized for the burst SR task. +Specifically, the reverse process starting from a random sample is not +optimized for image enhancement and restoration methods, including burst SR. In +our proposed method, on the other hand, burst LR features are used to +reconstruct the initial burst SR image that is fed into an intermediate step in +the diffusion model. This reverse process from the intermediate step 1) skips +diffusion steps for reconstructing the global structure of the image and 2) +focuses on steps for refining detailed textures. Our experimental results +demonstrate that our method can improve the scores of the perceptual quality +metrics. Code: https://github.com/placerkyo/BSRD + +
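+ The central trick described above -- initializing the reverse diffusion at
+an intermediate step with the burst-SR estimate rather than at pure noise --
+can be sketched generically as follows. This is a DDIM/DDPM-style
+illustration under assumptions, not the paper's implementation; `denoiser`,
+the noise schedule `alphas_cumprod`, and the choice of `t0` are placeholders.
+
+```python
+import torch
+
+def refine_from_intermediate_step(x0_init, denoiser, alphas_cumprod, t0):
+    """Noise an initial SR estimate to timestep t0, then run the reverse
+    process from t0 down to 0, skipping the early global-structure steps.
+
+    x0_init:        initial burst-SR estimate, shape (B, C, H, W)
+    denoiser(x, t): predicts the noise added at timestep t (assumed interface)
+    alphas_cumprod: 1D tensor of cumulative alpha products, length T
+    """
+    a_t0 = alphas_cumprod[t0]
+    x = a_t0.sqrt() * x0_init + (1 - a_t0).sqrt() * torch.randn_like(x0_init)
+
+    for t in range(t0, 0, -1):
+        a_t, a_prev = alphas_cumprod[t], alphas_cumprod[t - 1]
+        eps = denoiser(x, t)
+        x0_pred = (x - (1 - a_t).sqrt() * eps) / a_t.sqrt()
+        # deterministic DDIM-style update (one simple choice among several)
+        x = a_prev.sqrt() * x0_pred + (1 - a_prev).sqrt() * eps
+    return x
+```
+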
+
+ comment: Accepted to IJCNN 2024 (International Joint Conference on Neural + Networks) +
+
+
+
+
+ + ☆ A Robust Ensemble Algorithm for Ischemic Stroke Lesion Segmentation: + Generalizability and Clinical Utility Beyond the ISLES Challenge + + +
+ Diffusion-weighted MRI (DWI) is essential for stroke diagnosis, treatment +decisions, and prognosis. However, image and disease variability hinder the +development of generalizable AI algorithms with clinical value. We address this +gap by presenting a novel ensemble algorithm derived from the 2022 Ischemic +Stroke Lesion Segmentation (ISLES) challenge. ISLES'22 provided 400 patient +scans with ischemic stroke from various medical centers, facilitating the +development of a wide range of cutting-edge segmentation algorithms by the +research community. Through collaboration with leading teams, we combined +top-performing algorithms into an ensemble model that overcomes the limitations +of individual solutions. Our ensemble model achieved superior ischemic lesion +detection and segmentation accuracy on our internal test set compared to +individual algorithms. This accuracy generalized well across diverse image and +disease variables. Furthermore, the model excelled in extracting clinical +biomarkers. Notably, in a Turing-like test, neuroradiologists consistently +preferred the algorithm's segmentations over manual expert efforts, +highlighting increased comprehensiveness and precision. Validation using a +real-world external dataset (N=1686) confirmed the model's generalizability. +The algorithm's outputs also demonstrated strong correlations with clinical +scores (admission NIHSS and 90-day mRS) on par with or exceeding expert-derived +results, underlining its clinical relevance. This study offers two key +findings. First, we present an ensemble algorithm +(https://github.com/Tabrisrei/ISLES22_Ensemble) that detects and segments +ischemic stroke lesions on DWI across diverse scenarios on par with expert +(neuro)radiologists. Second, we show the potential for biomedical challenge +outputs to extend beyond the challenge's initial objectives, demonstrating +their real-world clinical applicability. + +
+
+
+
+
+ + ☆ OAKINK2: A Dataset of Bimanual Hands-Object Manipulation in Complex Task + Completion CVPR 2024 + + +
+ We present OAKINK2, a dataset of bimanual object manipulation tasks for
+complex daily activities. To organize the complex tasks into a structured
+representation, OAKINK2 introduces three levels of abstraction: Affordance,
+Primitive Task, and Complex Task. OAKINK2 adopts an object-centric
+perspective for decoding complex tasks, treating them as a sequence of object
+affordance fulfillments. The first level, Affordance, outlines the
+functionalities that objects in the scene can afford; the second level,
+Primitive Task, describes the minimal interaction units through which humans
+engage with an object to achieve its affordance; and the third level, Complex
+Task, illustrates how Primitive Tasks are composed and interdependent. The
+OAKINK2 dataset provides multi-view image streams and precise pose
+annotations for the human body, hands, and various interacting objects. This
+extensive collection supports applications such as interaction reconstruction
+and motion synthesis. Based on the three-level abstraction of OAKINK2, we
+explore a task-oriented framework for Complex Task Completion (CTC). CTC aims
+to generate a sequence of bimanual manipulations to achieve task objectives.
+Within the CTC framework, we employ Large Language Models (LLMs) to decompose
+complex task objectives into sequences of Primitive Tasks and have developed
+a Motion Fulfillment Model that generates bimanual hand motion for each
+Primitive Task. OAKINK2 datasets and models are available at
+https://oakink.net/v2.
+
+
+ comment: To be appeared in CVPR 2024. 26 pages +
+
+
+
+
+ + ☆ Brain-Shift: Unsupervised Pseudo-Healthy Brain Synthesis for Novel + Biomarker Extraction in Chronic Subdural Hematoma + + +
+ Chronic subdural hematoma (cSDH) is a common neurological condition +characterized by the accumulation of blood between the brain and the dura +mater. This accumulation of blood can exert pressure on the brain, potentially +leading to fatal outcomes. Treatment options for cSDH are limited to invasive +surgery or non-invasive management. Traditionally, the midline shift, +hand-measured by experts from an ideal sagittal plane, and the hematoma volume +have been the primary metrics for quantifying and analyzing cSDH. However, +these approaches do not quantify the local 3D brain deformation caused by cSDH. +We propose a novel method using anatomy-aware unsupervised diffeomorphic +pseudo-healthy synthesis to generate brain deformation fields. The deformation +fields derived from this process are utilized to extract biomarkers that +quantify the shift in the brain due to cSDH. We use CT scans of 121 patients +for training and validation of our method and find that our metrics allow the +identification of patients who require surgery. Our results indicate that +automatically obtained brain deformation fields might contain prognostic value +for personalized cSDH treatment. Our implementation is available on: +github.com/Barisimre/brain-morphing + +
+
+
+
+
+ + ☆ A Simple and Effective Point-based Network for Event Camera 6-DOFs Pose + Relocalization CVPR 2024 + + +
+ Event cameras exhibit remarkable attributes such as high dynamic range, +asynchronicity, and low latency, making them highly suitable for vision tasks +that involve high-speed motion in challenging lighting conditions. These +cameras implicitly capture movement and depth information in events, making +them appealing sensors for Camera Pose Relocalization (CPR) tasks. +Nevertheless, existing CPR networks based on events neglect the pivotal +fine-grained temporal information in events, resulting in unsatisfactory +performance. Moreover, the energy-efficient features are further compromised by +the use of excessively complex models, hindering efficient deployment on edge +devices. In this paper, we introduce PEPNet, a simple and effective point-based +network designed to regress six degrees of freedom (6-DOFs) event camera poses. +We rethink the relationship between the event camera and CPR tasks, leveraging +the raw Point Cloud directly as network input to harness the high-temporal +resolution and inherent sparsity of events. PEPNet is adept at abstracting the +spatial and implicit temporal features through hierarchical structure and +explicit temporal features by Attentive Bi-directional Long Short-Term Memory +(A-Bi-LSTM). By employing a carefully crafted lightweight design, PEPNet +delivers state-of-the-art (SOTA) performance on both indoor and outdoor +datasets with meager computational resources. Specifically, PEPNet attains a +significant 38% and 33% performance improvement on the random split IJRR and +M3ED datasets, respectively. Moreover, the lightweight design version +PEPNet$_{tiny}$ accomplishes results comparable to the SOTA while employing a +mere 0.5% of the parameters. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Towards Temporally Consistent Referring Video Object Segmentation + + +
+ Referring Video Object Segmentation (R-VOS) methods face challenges in +maintaining consistent object segmentation due to temporal context variability +and the presence of other visually similar objects. We propose an end-to-end +R-VOS paradigm that explicitly models temporal instance consistency alongside +the referring segmentation. Specifically, we introduce a novel hybrid memory +that facilitates inter-frame collaboration for robust spatio-temporal matching +and propagation. Features of frames with automatically generated high-quality +reference masks are propagated to segment the remaining frames based on +multi-granularity association to achieve temporally consistent R-VOS. +Furthermore, we propose a new Mask Consistency Score (MCS) metric to evaluate +the temporal consistency of video segmentation. Extensive experiments +demonstrate that our approach enhances temporal consistency by a significant +margin, leading to top-ranked performance on popular R-VOS benchmarks, i.e., +Ref-YouTube-VOS (67.1%) and Ref-DAVIS17 (65.6%). + +
+
+
+
+
+ + ☆ PointCloud-Text Matching: Benchmark Datasets and a Baseline + + +
+ In this paper, we present and study a new instance-level retrieval task:
+PointCloud-Text Matching (PTM), which aims to find the exact cross-modal
+instance that matches a given point-cloud query or text query. PTM can be
+applied to various scenarios, such as indoor/urban-canyon localization and
+scene retrieval. However, no suitable and targeted dataset for PTM exists in
+practice. Therefore, we construct three new PTM benchmark datasets, namely
+3D2T-SR, 3D2T-NR, and 3D2T-QA. We observe that the data is challenging and
+exhibits noisy correspondence due to the sparsity, noise, or disorder of
+point clouds and the ambiguity, vagueness, or incompleteness of texts, which
+makes existing cross-modal matching methods ineffective for PTM. To tackle
+these challenges, we propose a PTM baseline, named the Robust PointCloud-Text
+Matching method (RoMa). RoMa consists of two modules: a Dual Attention
+Perception module (DAP) and a Robust Negative Contrastive Learning module
+(RNCL). Specifically, DAP leverages token-level and feature-level attention
+to adaptively focus on useful local and global features and aggregate them
+into common representations, thereby reducing the adverse impact of noise and
+ambiguity. To handle noisy correspondence, RNCL divides negative pairs, which
+are much less error-prone than positive pairs, into clean and noisy subsets
+and assigns them forward and reverse optimization directions, respectively,
+thus enhancing robustness against noisy correspondence. We conduct extensive
+experiments on our benchmarks and demonstrate the superiority of RoMa.
+
+
+
+
+
+ + ☆ NIGHT -- Non-Line-of-Sight Imaging from Indirect Time of Flight Data ECCV 24 + + +
+ Acquiring objects outside the line of sight of cameras is a very intriguing
+but also extremely challenging research topic. Recent works have shown the
+feasibility of this idea by exploiting transient imaging data produced by
+custom direct Time-of-Flight sensors. In this paper, for the first time, we
+tackle this problem using only data from an off-the-shelf indirect
+Time-of-Flight sensor, without any further hardware requirements. We
+introduce a deep learning model that re-frames the surfaces where light
+bounces occur as a virtual mirror. This modeling makes the task easier to
+handle and also facilitates the construction of annotated training data. From
+the obtained data, it is possible to retrieve the depth information of the
+hidden scene. We also provide a first-of-its-kind synthetic dataset for the
+task and demonstrate the feasibility of the proposed idea on it.
+
+
+ comment: Submitted to ECCV 24, 17 pages, 6 figures, 2 tables +
+
+
+
+
+ + ☆ Infrared Small Target Detection with Scale and Location Sensitivity CVPR 2024 + + +
+ Recently, infrared small target detection (IRSTD) has been dominated by
+deep-learning-based methods. However, these methods mainly focus on designing
+complex model structures to extract discriminative features, leaving the loss
+functions for IRSTD under-explored. For example, the widely used Intersection
+over Union (IoU) and Dice losses lack sensitivity to the scales and locations
+of targets, limiting the detection performance of detectors. In this paper,
+we focus on boosting detection performance with a more effective loss but a
+simpler model structure. Specifically, we first propose a novel Scale and
+Location Sensitive (SLS) loss to handle the limitations of existing losses:
+1) for scale sensitivity, we compute a weight for the IoU loss based on
+target scales to help the detector distinguish targets of different scales;
+2) for location sensitivity, we introduce a penalty term based on the center
+points of targets to help the detector localize targets more precisely. Then,
+we attach a simple Multi-Scale Head to a plain U-Net, forming MSHNet. By
+applying the SLS loss at each scale of the predictions, our MSHNet
+outperforms existing state-of-the-art methods by a large margin. In addition,
+the detection performance of existing detectors can be further improved when
+trained with our SLS loss, demonstrating its effectiveness and
+generalization. The code is available at https://github.com/ying-fu/MSHNet.
+
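+ As a rough sketch of the two ingredients named above -- a scale-dependent
+weight on the IoU term plus a center-distance penalty -- consider the
+following; the exact weighting and normalization used in the paper may
+differ, so treat every constant here as an assumption.
+
+```python
+import torch
+
+def sls_like_loss(pred_mask, gt_mask, eps=1e-6):
+    """Illustrative scale- and location-sensitive loss for small-target masks.
+
+    pred_mask, gt_mask: (B, 1, H, W) tensors with values in [0, 1].
+    """
+    B, _, H, W = gt_mask.shape
+    inter = (pred_mask * gt_mask).flatten(1).sum(-1)
+    union = (pred_mask + gt_mask - pred_mask * gt_mask).flatten(1).sum(-1)
+    iou = inter / (union + eps)
+
+    # 1) scale sensitivity: weight small targets more heavily (assumed scheme)
+    target_area = gt_mask.flatten(1).sum(-1)
+    scale_w = 1.0 + 1.0 / torch.sqrt(target_area + 1.0)
+
+    # 2) location sensitivity: penalty on normalized center-of-mass distance
+    ys = torch.arange(H, device=gt_mask.device).view(1, 1, H, 1).float()
+    xs = torch.arange(W, device=gt_mask.device).view(1, 1, 1, W).float()
+
+    def center(m):
+        mass = m.flatten(1).sum(-1) + eps
+        cy = (m * ys).flatten(1).sum(-1) / mass
+        cx = (m * xs).flatten(1).sum(-1) / mass
+        return cy, cx
+
+    cy_p, cx_p = center(pred_mask)
+    cy_g, cx_g = center(gt_mask)
+    diag = (H ** 2 + W ** 2) ** 0.5
+    center_pen = ((cy_p - cy_g) ** 2 + (cx_p - cx_g) ** 2).sqrt() / diag
+
+    return (scale_w * (1.0 - iou) + center_pen).mean()
+```
+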
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ IVLMap: Instance-Aware Visual Language Grounding for Consumer Robot + Navigation + + +
+ Vision-and-Language Navigation (VLN) is a challenging task that requires a
+robot to navigate photo-realistic environments following natural-language
+prompts. Recent studies handle this task by constructing a semantic spatial
+map representation of the environment and then leveraging the strong
+reasoning ability of large language models to generate code that guides robot
+navigation. However, these methods face limitations in instance-level and
+attribute-level navigation tasks because they cannot distinguish different
+instances of the same object. To address this challenge, we propose a new
+method, the Instance-aware Visual Language Map (IVLMap), which empowers the
+robot with instance-level and attribute-level semantic mapping. IVLMap is
+constructed autonomously by fusing RGB-D video data collected by the robot
+agent with a specially designed natural-language map index in the bird's-eye
+view; this indexing is instance-level and attribute-level. In particular,
+when integrated with a large language model, IVLMap demonstrates the
+capability to i) transform natural language into navigation targets with
+instance and attribute information, enabling precise localization, and ii)
+accomplish zero-shot end-to-end navigation tasks based on natural language
+commands. Extensive navigation experiments are conducted. Simulation results
+show that our method achieves an average improvement of 14.4% in navigation
+accuracy. Code and demo are released at https://ivlmap.github.io/.
+
+
+
+
+
+ + ☆ Test-Time Domain Generalization for Face Anti-Spoofing CVPR + + +
+ Face Anti-Spoofing (FAS) is pivotal in safeguarding facial recognition +systems against presentation attacks. While domain generalization (DG) methods +have been developed to enhance FAS performance, they predominantly focus on +learning domain-invariant features during training, which may not guarantee +generalizability to unseen data that differs largely from the source +distributions. Our insight is that testing data can serve as a valuable +resource to enhance the generalizability beyond mere evaluation for DG FAS. In +this paper, we introduce a novel Test-Time Domain Generalization (TTDG) +framework for FAS, which leverages the testing data to boost the model's +generalizability. Our method, consisting of Test-Time Style Projection (TTSP) +and Diverse Style Shifts Simulation (DSSS), effectively projects the unseen +data to the seen domain space. In particular, we first introduce the innovative +TTSP to project the styles of the arbitrarily unseen samples of the testing +distribution to the known source space of the training distributions. We then +design the efficient DSSS to synthesize diverse style shifts via learnable +style bases with two specifically designed losses in a hyperspherical feature +space. Our method eliminates the need for model updates at the test time and +can be seamlessly integrated into not only the CNN but also ViT backbones. +Comprehensive experiments on widely used cross-domain FAS benchmarks +demonstrate our method's state-of-the-art performance and effectiveness. + +
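+ To make the "style projection" idea concrete: the paper learns style bases
+and projects unseen test styles onto the known source space. The sketch below
+is only a generic AdaIN-style illustration of that concept; the
+similarity-based weighting and the use of stored source channel statistics
+(rather than learned bases) are assumptions, not the authors' TTSP design.
+
+```python
+import torch
+
+def project_style(feat, source_means, source_stds, eps=1e-5):
+    """Project the style (channel-wise mean/std) of a test feature map onto a
+    convex combination of stored source-domain styles.
+
+    feat:         (B, C, H, W) test features
+    source_means: (K, C) channel means of K source styles (assumed available)
+    source_stds:  (K, C) channel standard deviations of the same styles
+    """
+    mu = feat.mean(dim=(2, 3), keepdim=True)                      # (B, C, 1, 1)
+    sigma = feat.std(dim=(2, 3), keepdim=True) + eps
+
+    # weight each source style by similarity to the test style (assumption)
+    sim = -torch.cdist(mu.squeeze(-1).squeeze(-1), source_means)   # (B, K)
+    w = torch.softmax(sim, dim=1)
+
+    tgt_mu = (w @ source_means).view(*mu.shape)                    # (B, C, 1, 1)
+    tgt_sigma = (w @ source_stds).view(*sigma.shape) + eps
+
+    return (feat - mu) / sigma * tgt_sigma + tgt_mu                # AdaIN-style swap
+```
+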
+
+ comment: Accepted to IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR), 2024 +
+
+
+
+
+ + ☆ MedBN: Robust Test-Time Adaptation against Malicious Test Samples CVPR 2024 + + +
+ Test-time adaptation (TTA) has emerged as a promising solution to address +performance decay due to unforeseen distribution shifts between training and +test data. While recent TTA methods excel in adapting to test data variations, +such adaptability exposes a model to vulnerability against malicious examples, +an aspect that has received limited attention. Previous studies have uncovered +security vulnerabilities within TTA even when a small proportion of the test +batch is maliciously manipulated. In response to the emerging threat, we +propose median batch normalization (MedBN), leveraging the robustness of the +median for statistics estimation within the batch normalization layer during +test-time inference. Our method is algorithm-agnostic, thus allowing seamless +integration with existing TTA frameworks. Our experimental results on benchmark +datasets, including CIFAR10-C, CIFAR100-C and ImageNet-C, consistently +demonstrate that MedBN outperforms existing approaches in maintaining robust +performance across different attack scenarios, encompassing both instant and +cumulative attacks. Through extensive experiments, we show that our approach +sustains the performance even in the absence of attacks, achieving a practical +balance between robustness and performance. + +
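+ The core mechanism described above -- replacing the batch mean with a median
+when estimating normalization statistics at test time -- is simple to sketch.
+The scale estimate below (median absolute deviation) is an assumption for
+illustration; the paper may estimate the second-order statistic differently.
+
+```python
+import torch
+import torch.nn as nn
+
+class MedBN2d(nn.Module):
+    """BatchNorm-like layer using median-based statistics at inference time,
+    which are less sensitive to a few maliciously crafted samples."""
+
+    def __init__(self, num_features, eps=1e-5):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(num_features))
+        self.bias = nn.Parameter(torch.zeros(num_features))
+
+    def forward(self, x):                                  # x: (N, C, H, W)
+        flat = x.permute(1, 0, 2, 3).flatten(1)            # (C, N*H*W)
+        med = flat.median(dim=1).values                    # robust "mean"
+        mad = (flat - med[:, None]).abs().median(dim=1).values
+        scale = 1.4826 * mad + self.eps                    # MAD -> std (Gaussian)
+        x_hat = (x - med[None, :, None, None]) / scale[None, :, None, None]
+        return x_hat * self.weight[None, :, None, None] + self.bias[None, :, None, None]
+```
+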
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Plug-and-Play Grounding of Reasoning in Multimodal Large Language Models + + +
+ The surge of Multimodal Large Language Models (MLLMs), given their prominent
+emergent capabilities in instruction following and reasoning, has greatly
+advanced the field of visual reasoning. However, constrained by their
+non-lossless image tokenization, most MLLMs fall short of comprehensively
+capturing details of text and objects, especially in high-resolution images.
+To address this, we propose P2G, a novel framework for plug-and-play
+grounding of reasoning in MLLMs. Specifically, P2G exploits the tool-usage
+potential of MLLMs to employ expert agents for on-the-fly grounding to
+critical visual and textual objects in the image, thus achieving deliberate
+reasoning via multimodal prompting. We further create P2GB, a benchmark aimed
+at assessing MLLMs' ability to understand inter-object relationships and text
+in challenging high-resolution images. Comprehensive experiments on visual
+reasoning tasks demonstrate the superiority of P2G. Notably, P2G achieves
+performance comparable to GPT-4V on P2GB with a 7B backbone. Our work
+highlights the potential of plug-and-play grounding of reasoning and opens up
+a promising alternative beyond model scaling.
+
+
+ comment: 14 pages, 3 figures +
+
+
+
+
+ + ☆ Mesh2NeRF: Direct Mesh Supervision for Neural Radiance Field + Representation and Generation + + +
+ We present Mesh2NeRF, an approach for deriving ground-truth radiance fields
+from textured meshes for 3D generation tasks. Many 3D generative approaches
+represent 3D scenes as radiance fields for training. Their ground-truth
+radiance fields are usually fitted from multi-view renderings of a
+large-scale synthetic 3D dataset, which often results in artifacts due to
+occlusions or under-fitting issues. In Mesh2NeRF, we propose an analytic
+solution to directly obtain ground-truth radiance fields from 3D meshes,
+characterizing the density field with an occupancy function featuring a
+defined surface thickness and determining view-dependent color through a
+reflection function that considers both the mesh and environment lighting.
+Mesh2NeRF extracts accurate radiance fields, which provide direct supervision
+for training generative NeRFs and single-scene representations. We validate
+the effectiveness of Mesh2NeRF across various tasks, achieving a noteworthy
+3.12 dB improvement in PSNR for view synthesis in single-scene representation
+on the ABO dataset, a 0.69 PSNR enhancement in the single-view conditional
+generation of ShapeNet Cars, and notably improved mesh extraction from NeRF
+in the unconditional generation of Objaverse Mugs.
+
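+ The abstract's central construction, a density field defined by an occupancy
+function with a fixed surface thickness, can be sketched generically as
+below. The `signed_distance` helper is a placeholder (e.g., computed from the
+mesh with a geometry library), and the density magnitude, the thickness, and
+the "shell around the surface" reading are illustrative assumptions rather
+than the paper's exact formulation.
+
+```python
+import numpy as np
+
+def analytic_density(points, signed_distance, thickness=0.01, sigma_max=1e3):
+    """Density for a radiance field derived directly from a mesh: points within
+    a shell of the given thickness around the surface are treated as occupied;
+    everything else is empty space.
+
+    points:          (N, 3) query positions
+    signed_distance: callable mapping (N, 3) -> (N,) signed distances to the
+                     mesh surface (assumed to exist; not defined here)
+    """
+    d = signed_distance(points)
+    occupied = np.abs(d) < 0.5 * thickness   # one plausible occupancy reading
+    return np.where(occupied, sigma_max, 0.0)
+```
+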
+
+ comment: Project page: https://terencecyj.github.io/projects/Mesh2NeRF/ Video: + https://youtu.be/oufv1N3f7iY +
+
+
+
+
+ + ☆ Hypergraph-based Multi-View Action Recognition using Event Cameras + + +
+ Action recognition from video data forms a cornerstone with wide-ranging +applications. Single-view action recognition faces limitations due to its +reliance on a single viewpoint. In contrast, multi-view approaches capture +complementary information from various viewpoints for improved accuracy. +Recently, event cameras have emerged as innovative bio-inspired sensors, +leading to advancements in event-based action recognition. However, existing +works predominantly focus on single-view scenarios, leaving a gap in multi-view +event data exploitation, particularly in challenges like information deficit +and semantic misalignment. To bridge this gap, we introduce HyperMV, a +multi-view event-based action recognition framework. HyperMV converts discrete +event data into frame-like representations and extracts view-related features +using a shared convolutional network. By treating segments as vertices and +constructing hyperedges using rule-based and KNN-based strategies, a multi-view +hypergraph neural network that captures relationships across viewpoint and +temporal features is established. The vertex attention hypergraph propagation +is also introduced for enhanced feature fusion. To prompt research in this +area, we present the largest multi-view event-based action dataset +$\text{THU}^{\text{MV-EACT}}\text{-50}$, comprising 50 actions from 6 +viewpoints, which surpasses existing datasets by over tenfold. Experimental +results show that HyperMV significantly outperforms baselines in both +cross-subject and cross-view scenarios, and also exceeds the state-of-the-arts +in frame-based multi-view action recognition. + +
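+ The hypergraph construction mentioned above uses rule-based and KNN-based
+hyperedges over segment features. The snippet below is a generic KNN-based
+sketch of building a hypergraph incidence matrix; the value of k and the use
+of Euclidean feature distance are assumptions, not HyperMV's exact rules.
+
+```python
+import torch
+
+def knn_hyperedges(features, k=4):
+    """Build a hypergraph incidence matrix H (V x E), where each vertex spawns
+    one hyperedge connecting it to its k nearest neighbours in feature space.
+
+    features: (V, D) segment features (one row per vertex)
+    """
+    V = features.size(0)
+    dist = torch.cdist(features, features)             # (V, V) pairwise distances
+    knn = dist.topk(k + 1, largest=False).indices      # includes the vertex itself
+    H = torch.zeros(V, V)
+    for e in range(V):                                  # hyperedge e centred on vertex e
+        H[knn[e], e] = 1.0
+    return H
+
+# toy usage: 12 segments with 64-dimensional features
+H = knn_hyperedges(torch.randn(12, 64), k=3)
+```
+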
+
+ comment: Accepted by IEEE Transactions on Pattern Analysis and Machine + Intelligence (TPAMI 2024) +
+
+
+
+
+ + ☆ Total-Decom: Decomposed 3D Scene Reconstruction with Minimal Interaction CVPR 2024 + + +
+ Scene reconstruction from multi-view images is a fundamental problem in +computer vision and graphics. Recent neural implicit surface reconstruction +methods have achieved high-quality results; however, editing and manipulating +the 3D geometry of reconstructed scenes remains challenging due to the absence +of naturally decomposed object entities and complex object/background +compositions. In this paper, we present Total-Decom, a novel method for +decomposed 3D reconstruction with minimal human interaction. Our approach +seamlessly integrates the Segment Anything Model (SAM) with hybrid +implicit-explicit neural surface representations and a mesh-based +region-growing technique for accurate 3D object decomposition. Total-Decom +requires minimal human annotations while providing users with real-time control +over the granularity and quality of decomposition. We extensively evaluate our +method on benchmark datasets and demonstrate its potential for downstream +applications, such as animation and scene editing. The code is available at +\href{https://github.com/CVMI-Lab/Total-Decom.git}{https://github.com/CVMI-Lab/Total-Decom.git}. + +
+
+ comment: 8 pages, 7 figures, accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Sparse Generation: Making Pseudo Labels Sparse for weakly supervision + with points + + +
+ In recent years, research on point-based weakly supervised object detection
+(PWSOD) methods in computer vision has attracted increasing attention.
+However, existing pseudo-label generation methods perform poorly with small
+amounts of supervised annotation data and in dense object detection tasks. We
+regard the generation of weakly supervised pseudo labels as the result of the
+model's sparse output and propose a method called Sparse Generation to make
+pseudo labels sparse. It constructs dense tensors through the relationship
+between the data and the detector model, optimizes three of their parameters,
+and obtains a sparse tensor via coordinated calculation, thereby indirectly
+obtaining higher-quality pseudo labels and addressing the model's density
+problem in situations where only a small amount of supervised annotation data
+is available. On two widely used open-source datasets (RSOD, SIMD) and a
+self-built dataset (Bullet-Hole), the experimental results show that the
+proposed method has a significant advantage in overall performance metrics
+compared to state-of-the-art methods.
+
+
+
+
+
+ + ☆ FlowDepth: Decoupling Optical Flow for Self-Supervised Monocular Depth + Estimation + + +
+ Self-supervised multi-frame methods have achieved promising results in depth
+estimation. However, these methods often suffer from mismatch problems due to
+moving objects, which break the static-scene assumption. Additionally,
+unfairness can arise when computing photometric errors in high-frequency or
+low-texture regions of the images. To address these issues, existing
+approaches use additional semantic-prior black-box networks to separate
+moving objects and improve the model only at the loss level. We instead
+propose FlowDepth, in which a Dynamic Motion Flow Module (DMFM) decouples the
+optical flow with a mechanism-based approach and warps the dynamic regions,
+thus solving the mismatch problem. For the unfairness of photometric errors
+caused by high-frequency and low-texture regions, we use Depth-Cue-Aware Blur
+(DCABlur) and a Cost-Volume sparsity loss at the input and loss levels,
+respectively, to solve the problem. Experimental results on the KITTI and
+Cityscapes datasets show that our method outperforms state-of-the-art
+methods.
+
+
+
+
+
+ + ☆ CAT: Exploiting Inter-Class Dynamics for Domain Adaptive Object + Detection CVPR 2024 + + +
+ Domain adaptive object detection aims to adapt detection models to domains +where annotated data is unavailable. Existing methods have been proposed to +address the domain gap using the semi-supervised student-teacher framework. +However, a fundamental issue arises from the class imbalance in the labelled +training set, which can result in inaccurate pseudo-labels. The relationship +between classes, especially where one class is a majority and the other +minority, has a large impact on class bias. We propose Class-Aware Teacher +(CAT) to address the class bias issue in the domain adaptation setting. In our +work, we approximate the class relationships with our Inter-Class Relation +module (ICRm) and exploit it to reduce the bias within the model. In this way, +we are able to apply augmentations to highly related classes, both inter- and +intra-domain, to boost the performance of minority classes while having minimal +impact on majority classes. We further reduce the bias by implementing a +class-relation weight to our classification loss. Experiments conducted on +various datasets and ablation studies show that our method is able to address +the class bias in the domain adaptation setting. On the Cityscapes to Foggy +Cityscapes dataset, we attained a 52.5 mAP, a substantial improvement over the +51.2 mAP achieved by the state-of-the-art method. + +
+
+ comment: Accepted into CVPR 2024 +
+
+
+
+
+ + ☆ Neural Fields for 3D Tracking of Anatomy and Surgical Instruments in + Monocular Laparoscopic Video Clips + + +
+ Laparoscopic video tracking primarily focuses on two target types: surgical +instruments and anatomy. The former could be used for skill assessment, while +the latter is necessary for the projection of virtual overlays. Where +instrument and anatomy tracking have often been considered two separate +problems, in this paper, we propose a method for joint tracking of all +structures simultaneously. Based on a single 2D monocular video clip, we train +a neural field to represent a continuous spatiotemporal scene, used to create +3D tracks of all surfaces visible in at least one frame. Due to the small size +of instruments, they generally cover a small part of the image only, resulting +in decreased tracking accuracy. Therefore, we propose enhanced class weighting +to improve the instrument tracks. We evaluate tracking on video clips from +laparoscopic cholecystectomies, where we find mean tracking accuracies of 92.4% +for anatomical structures and 87.4% for instruments. Additionally, we assess +the quality of depth maps obtained from the method's scene reconstructions. We +show that these pseudo-depths have comparable quality to a state-of-the-art +pre-trained depth estimator. On laparoscopic videos in the SCARED dataset, the +method predicts depth with an MAE of 2.9 mm and a relative error of 9.2%. These +results show the feasibility of using neural fields for monocular 3D +reconstruction of laparoscopic scenes. + +
+
+
+
+
+ + ☆ Imperceptible Protection against Style Imitation from Diffusion Models + + +
+ Recent progress in diffusion models has profoundly enhanced the fidelity of +image generation. However, this has raised concerns about copyright +infringements. While prior methods have introduced adversarial perturbations to +prevent style imitation, most are accompanied by the degradation of artworks' +visual quality. Recognizing the importance of maintaining this, we develop a +visually improved protection method that preserves its protection capability. +To this end, we create a perceptual map to identify areas most sensitive to +human eyes. We then adjust the protection intensity guided by an instance-aware +refinement. We also integrate a perceptual constraints bank to further improve +the imperceptibility. Results show that our method substantially elevates the +quality of the protected image without compromising on protection efficacy. + +
+
+
+
+
+ + ☆ Sine Activated Low-Rank Matrices for Parameter Efficient Learning + + +
+ Low-rank decomposition has emerged as a vital tool for enhancing parameter +efficiency in neural network architectures, gaining traction across diverse +applications in machine learning. These techniques significantly lower the +number of parameters, striking a balance between compactness and performance. +However, a common challenge has been the compromise between parameter +efficiency and the accuracy of the model, where reduced parameters often lead +to diminished accuracy compared to their full-rank counterparts. In this work, +we propose a novel theoretical framework that integrates a sinusoidal function +within the low-rank decomposition process. This approach not only preserves the +benefits of the parameter efficiency characteristic of low-rank methods but +also increases the decomposition's rank, thereby enhancing model accuracy. Our +method proves to be an adaptable enhancement for existing low-rank models, as +evidenced by its successful application in Vision Transformers (ViT), Large +Language Models (LLMs), Neural Radiance Fields (NeRF), and 3D shape modeling. +This demonstrates the wide-ranging potential and efficiency of our proposed +technique. + +
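+ A minimal sketch of inserting a sinusoidal non-linearity into a low-rank
+factorization is shown below; the placement of the sine, the frequency
+`omega`, and the initialization scaling are assumptions chosen for
+illustration rather than the paper's exact parameterization.
+
+```python
+import torch
+import torch.nn as nn
+
+class SineLowRankLinear(nn.Module):
+    """Linear layer whose weight is an elementwise sine of a low-rank product,
+    W = sin(omega * U @ V): the sine raises the effective rank of the product
+    while keeping the parameter count of a rank-r factorization."""
+
+    def __init__(self, in_features, out_features, rank=8, omega=30.0):
+        super().__init__()
+        self.U = nn.Parameter(torch.randn(out_features, rank) / rank ** 0.5)
+        self.V = nn.Parameter(torch.randn(rank, in_features) / in_features ** 0.5)
+        self.bias = nn.Parameter(torch.zeros(out_features))
+        self.omega = omega
+
+    def forward(self, x):
+        W = torch.sin(self.omega * (self.U @ self.V))   # (out, in)
+        return x @ W.t() + self.bias
+
+# toy usage: drop-in replacement for nn.Linear(256, 128)
+layer = SineLowRankLinear(256, 128, rank=8)
+y = layer(torch.randn(4, 256))
+```
+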
+
+ comment: The first two authors contributed equally +
+
+
+
+
+ + ☆ RTracker: Recoverable Tracking via PN Tree Structured Memory CVPR 2024 + + +
+ Existing tracking methods mainly focus on learning better target +representation or developing more robust prediction models to improve tracking +performance. While tracking performance has significantly improved, the target +loss issue occurs frequently due to tracking failures, complete occlusion, or +out-of-view situations. However, considerably less attention is paid to the +self-recovery issue of tracking methods, which is crucial for practical +applications. To this end, we propose a recoverable tracking framework, +RTracker, that uses a tree-structured memory to dynamically associate a tracker +and a detector to enable self-recovery ability. Specifically, we propose a +Positive-Negative Tree-structured memory to chronologically store and maintain +positive and negative target samples. Upon the PN tree memory, we develop +corresponding walking rules for determining the state of the target and define +a set of control flows to unite the tracker and the detector in different +tracking scenarios. Our core idea is to use the support samples of positive and +negative target categories to establish a relative distance-based criterion for +a reliable assessment of target loss. The favorable performance in comparison +against the state-of-the-art methods on numerous challenging benchmarks +demonstrates the effectiveness of the proposed algorithm. + +
+
+ comment: accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Taming Lookup Tables for Efficient Image Retouching + + +
+ The widespread use of high-definition screens in edge devices, such as
+end-user cameras, smartphones, and televisions, is spurring significant
+demand for image enhancement. Existing enhancement models often optimize for
+high performance while falling short of reducing hardware inference time and
+power consumption, especially on edge devices with constrained computing and
+storage resources. To this end, we propose the Image Color Enhancement
+Lookup Table (ICELUT), which adopts LUTs for extremely efficient edge
+inference without any convolutional neural network (CNN). During training, we
+leverage pointwise (1x1) convolutions to extract color information, alongside
+a split fully connected layer to incorporate global information. Both
+components are then seamlessly converted into LUTs for hardware-agnostic
+deployment. ICELUT achieves near state-of-the-art performance with remarkably
+low power consumption. We observe that the pointwise network structure
+exhibits robust scalability, maintaining performance even with a heavily
+downsampled 32x32 input image. These properties enable ICELUT, the first
+purely LUT-based image enhancer, to reach an unprecedented speed of 0.4 ms on
+GPU and 7 ms on CPU, at least one order of magnitude faster than any CNN
+solution. Code is available at https://github.com/Stephen0808/ICELUT.
+
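+ The deployment trick described above -- converting a purely pointwise
+(per-pixel) color network into a lookup table -- can be illustrated as
+follows. The grid size, the nearest-neighbour lookup, and the tiny network
+are all placeholders; the actual ICELUT design also includes a split
+fully-connected branch for global information, which is omitted here.
+
+```python
+import torch
+import torch.nn as nn
+
+def build_color_lut(pointwise_net, grid=33):
+    """Precompute the output of a per-pixel color network for every point of a
+    grid x grid x grid RGB lattice, so inference becomes a table lookup."""
+    coords = torch.linspace(0, 1, grid)
+    r, g, b = torch.meshgrid(coords, coords, coords, indexing="ij")
+    rgb = torch.stack([r, g, b], dim=-1).reshape(-1, 3)           # (grid^3, 3)
+    with torch.no_grad():
+        out = pointwise_net(rgb.unsqueeze(-1).unsqueeze(-1))      # treat as 1x1 images
+    return out.reshape(grid, grid, grid, 3)
+
+def apply_lut(image, lut):
+    """image: (H, W, 3) in [0, 1]; nearest-neighbour lookup (real systems
+    typically use trilinear interpolation instead)."""
+    grid = lut.shape[0]
+    idx = (image * (grid - 1)).round().long().clamp(0, grid - 1)
+    return lut[idx[..., 0], idx[..., 1], idx[..., 2]]
+
+# toy pointwise network built from 1x1 convolutions only
+net = nn.Sequential(nn.Conv2d(3, 16, 1), nn.ReLU(), nn.Conv2d(16, 3, 1), nn.Sigmoid())
+lut = build_color_lut(net)                       # shape (33, 33, 33, 3)
+enhanced = apply_lut(torch.rand(64, 64, 3), lut)
+```
+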
+
+
+
+
+ + ☆ DreamSalon: A Staged Diffusion Framework for Preserving Identity-Context + in Editable Face Generation + + +
+ While large-scale pre-trained text-to-image models can synthesize diverse and +high-quality human-centered images, novel challenges arise with a nuanced task +of "identity fine editing": precisely modifying specific features of a subject +while maintaining its inherent identity and context. Existing personalization +methods either require time-consuming optimization or learning additional +encoders, adept in "identity re-contextualization". However, they often +struggle with detailed and sensitive tasks like human face editing. To address +these challenges, we introduce DreamSalon, a noise-guided, staged-editing +framework, uniquely focusing on detailed image manipulations and +identity-context preservation. By discerning editing and boosting stages via +the frequency and gradient of predicted noises, DreamSalon first performs +detailed manipulations on specific features in the editing stage, guided by +high-frequency information, and then employs stochastic denoising in the +boosting stage to improve image quality. For more precise editing, DreamSalon +semantically mixes source and target textual prompts, guided by differences in +their embedding covariances, to direct the model's focus on specific +manipulation areas. Our experiments demonstrate DreamSalon's ability to +efficiently and faithfully edit fine details on human faces, outperforming +existing methods both qualitatively and quantitatively. + +
+
+
+
+
+ + ☆ AZ-NAS: Assembling Zero-Cost Proxies for Network Architecture Search CVPR 2024 + + +
+ Training-free network architecture search (NAS) aims to discover +high-performing networks with zero-cost proxies, capturing network +characteristics related to the final performance. However, network rankings +estimated by previous training-free NAS methods have shown weak correlations +with the performance. To address this issue, we propose AZ-NAS, a novel +approach that leverages the ensemble of various zero-cost proxies to enhance +the correlation between a predicted ranking of networks and the ground truth +substantially in terms of the performance. To achieve this, we introduce four +novel zero-cost proxies that are complementary to each other, analyzing +distinct traits of architectures in the views of expressivity, progressivity, +trainability, and complexity. The proxy scores can be obtained simultaneously +within a single forward and backward pass, making an overall NAS process highly +efficient. In order to integrate the rankings predicted by our proxies +effectively, we introduce a non-linear ranking aggregation method that +highlights the networks highly-ranked consistently across all the proxies. +Experimental results conclusively demonstrate the efficacy and efficiency of +AZ-NAS, outperforming state-of-the-art methods on standard benchmarks, all +while maintaining a reasonable runtime cost. + +
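+ To make the "non-linear ranking aggregation" idea concrete, one simple
+scheme -- an illustrative assumption, not necessarily the paper's exact
+formula -- is to sum the logarithms of normalized per-proxy ranks, which
+rewards architectures ranked consistently high across all proxies:
+
+```python
+import numpy as np
+
+def aggregate_rankings(scores):
+    """scores: (num_proxies, num_networks) array of zero-cost proxy scores,
+    larger = better. Returns an aggregate score per network that favours
+    candidates ranked highly by *all* proxies (a log-based, non-linear mix)."""
+    P, N = scores.shape
+    agg = np.zeros(N)
+    for p in range(P):
+        order = np.argsort(-scores[p])          # rank 1 = best under proxy p
+        ranks = np.empty(N)
+        ranks[order] = np.arange(1, N + 1)
+        agg += np.log(1.0 - (ranks - 1) / N)    # heavily penalizes low ranks
+    return agg                                   # pick argmax as the final choice
+
+# toy usage: 4 proxies scoring 10 candidate networks
+best = int(np.argmax(aggregate_rankings(np.random.rand(4, 10))))
+```
+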
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Efficient and Effective Weakly-Supervised Action Segmentation via + Action-Transition-Aware Boundary Alignment CVPR 2024 + + +
+ Weakly-supervised action segmentation is the task of learning to partition a
+long video into several action segments when training videos are accompanied
+only by transcripts (ordered lists of actions). Most existing methods need to
+infer a pseudo segmentation for training via serial alignment between all
+frames and the transcript, which is time-consuming and hard to parallelize
+during training. In this work, we aim to escape from this inefficient
+alignment over massive but redundant frames and instead directly localize a
+few action transitions for pseudo-segmentation generation, where a transition
+refers to the change from an action segment to its next adjacent one in the
+transcript. As the true transitions are submerged in noisy boundaries due to
+intra-segment visual variation, we propose a novel Action-Transition-Aware
+Boundary Alignment (ATBA) framework to efficiently and effectively filter out
+noisy boundaries and detect transitions. In addition, to boost semantic
+learning in the case that noise is inevitably present in the pseudo
+segmentation, we also introduce video-level losses to utilize the trusted
+video-level supervision. Extensive experiments show the effectiveness of our
+approach in terms of both performance and training speed.
+
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Towards Multimodal Video Paragraph Captioning Models Robust to Missing + Modality + + +
+ Video paragraph captioning (VPC) involves generating detailed narratives for +long videos, utilizing supportive modalities such as speech and event +boundaries. However, the existing models are constrained by the assumption of +constant availability of a single auxiliary modality, which is impractical +given the diversity and unpredictable nature of real-world scenarios. To this +end, we propose a Missing-Resistant framework MR-VPC that effectively harnesses +all available auxiliary inputs and maintains resilience even in the absence of +certain modalities. Under this framework, we propose the Multimodal VPC (MVPC) +architecture integrating video, speech, and event boundary inputs in a unified +manner to process various auxiliary inputs. Moreover, to fortify the model +against incomplete data, we introduce DropAM, a data augmentation strategy that +randomly omits auxiliary inputs, paired with DistillAM, a regularization target +that distills knowledge from teacher models trained on modality-complete data, +enabling efficient learning in modality-deficient environments. Through +exhaustive experimentation on YouCook2 and ActivityNet Captions, MR-VPC has +proven to deliver superior performance on modality-complete and +modality-missing test data. This work highlights the significance of developing +resilient VPC models and paves the way for more adaptive, robust multimodal +video understanding. + +
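+ The DropAM idea, randomly omitting auxiliary modalities during training so the captioner learns to cope with missing inputs, can be sketched as a tiny augmentation step. The modality names, tensor shapes, and drop probability below are illustrative assumptions, not the paper's configuration.
+<pre>
+import torch
+
+def drop_auxiliary_modalities(batch: dict, p_drop: float = 0.3) -> dict:
+    """Randomly zero out auxiliary modalities (e.g. speech, event boundaries).
+
+    batch maps modality names to tensors of shape (B, ...); 'video' is always kept.
+    """
+    out = {}
+    for name, feats in batch.items():
+        if name == "video":
+            out[name] = feats
+            continue
+        # per-sample Bernoulli mask deciding whether this auxiliary modality is dropped
+        keep = (torch.rand(feats.shape[0], device=feats.device) > p_drop).float()
+        out[name] = feats * keep.view(-1, *([1] * (feats.dim() - 1)))
+    return out
+
+# toy usage with made-up feature shapes
+batch = {"video": torch.randn(4, 16, 512),
+         "speech": torch.randn(4, 20, 256),
+         "boundaries": torch.randn(4, 8, 64)}
+augmented = drop_auxiliary_modalities(batch)
+</pre>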
+
+ comment: Code available at https://github.com/lancopku/MR-VPC +
+
+
+
+
+ + ☆ GeoAuxNet: Towards Universal 3D Representation Learning for Multi-sensor + Point Clouds CVPR 2024 + + +
+ Point clouds captured by different sensors such as RGB-D cameras and LiDAR possess non-negligible domain gaps. Most existing methods design different network architectures and train separately on point clouds from various sensors. Typically, point-based methods achieve outstanding performance on evenly distributed dense point clouds from RGB-D cameras, while voxel-based methods are more efficient for large-range sparse LiDAR point clouds. In this paper, we propose geometry-to-voxel auxiliary learning to enable voxel representations to access point-level geometric information, which supports better generalisation of the voxel-based backbone with additional interpretations of multi-sensor point clouds. Specifically, we construct hierarchical geometry pools generated by a voxel-guided dynamic point network, which efficiently provide auxiliary fine-grained geometric information adapted to different stages of voxel features. We conduct experiments on joint multi-sensor datasets to demonstrate the effectiveness of GeoAuxNet. Enjoying elaborate geometric information, our method outperforms other models collectively trained on multi-sensor datasets and achieves competitive results with state-of-the-art experts on each single dataset.
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Learning Multiple Representations with Inconsistency-Guided Detail + Regularization for Mask-Guided Matting + + +
+ Mask-guided matting networks have achieved significant improvements and have shown great potential in practical applications in recent years. However, because they simply learn matting representations from synthetic matting data that lacks real-world diversity, these approaches tend to overfit low-level details in wrong regions, lack generalization to objects with complex structures and to real-world scenes such as shadows, and suffer from interference from background lines or textures. To address these challenges, in this paper we propose a novel auxiliary learning framework for mask-guided matting models, incorporating three auxiliary tasks besides matting: semantic segmentation, edge detection, and background line detection, to learn different and effective representations from different types of data and annotations. Our framework and model introduce the following key aspects: (1) to learn real-world adaptive semantic representations for objects with diverse and complex structures under real-world scenes, we introduce extra semantic segmentation and edge detection tasks on more diverse real-world data with segmentation annotations; (2) to avoid overfitting on low-level details, we propose a module that utilizes the inconsistency between the learned segmentation and matting representations to regularize detail refinement; (3) we introduce a novel background line detection task into our auxiliary learning framework to suppress interference from background lines or textures. In addition, we propose a high-quality matting benchmark, Plant-Mat, to evaluate matting methods on complex structures. Extensive quantitative and qualitative results show that our approach outperforms state-of-the-art mask-guided methods.
+
+
+
+
+ + ☆ From Activation to Initialization: Scaling Insights for Optimizing + Neural Fields CVPR 2024 + + +
+ In the realm of computer vision, Neural Fields have gained prominence as a +contemporary tool harnessing neural networks for signal representation. Despite +the remarkable progress in adapting these networks to solve a variety of +problems, the field still lacks a comprehensive theoretical framework. This +article aims to address this gap by delving into the intricate interplay +between initialization and activation, providing a foundational basis for the +robust optimization of Neural Fields. Our theoretical insights reveal a +deep-seated connection among network initialization, architectural choices, and +the optimization process, emphasizing the need for a holistic approach when +designing cutting-edge Neural Fields. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Single-Shared Network with Prior-Inspired Loss for Parameter-Efficient + Multi-Modal Imaging Skin Lesion Classification + + +
+ In this study, we introduce a multi-modal approach that efficiently integrates multi-scale clinical and dermoscopy features within a single network, thereby substantially reducing model parameters. The proposed method includes three novel fusion schemes.
+ Firstly, unlike current methods that usually employ two individual models for the clinical and dermoscopy modalities, we verify that multi-modal features can be learned by sharing the encoder parameters while keeping individual modality-specific classifiers.
+ Secondly, a shared cross-attention module can replace the individual ones to efficiently let the two modalities interact at multiple layers.
+ Thirdly, unlike current methods that optimize the dermoscopy and clinical branches equally, and inspired by the prior knowledge that dermoscopy images play a more significant role than clinical images, we propose a novel biased loss. This loss guides the single-shared network to prioritize dermoscopy information over clinical information, implicitly learning a better joint feature representation for the modality-specific task.
+ Extensive experiments on the well-recognized Seven-Point Checklist (SPC) dataset and a collected dataset demonstrate the effectiveness of our method on both CNN and Transformer structures. Furthermore, our method exhibits superiority in both accuracy and model parameters compared to currently advanced methods.
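+ The biased loss described above can be sketched as a weighted sum of the two branch losses, with the dermoscopy branch weighted more heavily. The 0.7/0.3 weights and the cross-entropy objectives below are illustrative assumptions, not the paper's exact formulation.
+<pre>
+import torch
+import torch.nn.functional as F
+
+def biased_loss(derm_logits, clin_logits, labels, w_derm=0.7, w_clin=0.3):
+    """Weight the dermoscopy branch more heavily than the clinical branch."""
+    loss_derm = F.cross_entropy(derm_logits, labels)
+    loss_clin = F.cross_entropy(clin_logits, labels)
+    return w_derm * loss_derm + w_clin * loss_clin
+
+# toy usage: 8 samples, 5 diagnostic classes
+derm = torch.randn(8, 5)
+clin = torch.randn(8, 5)
+labels = torch.randint(0, 5, (8,))
+print(biased_loss(derm, clin, labels))
+</pre>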
+
+ comment: This paper has been submitted to a journal for review
+
+
+
+
+ + ☆ Text Data-Centric Image Captioning with Interactive Prompts + + +
+ Supervised image captioning approaches have made great progress, but it is +challenging to collect high-quality human-annotated image-text data. Recently, +large-scale vision and language models (e.g., CLIP) and large-scale generative +language models (e.g., GPT-2) have shown strong performances in various tasks, +which also provide some new solutions for image captioning with web paired +data, unpaired data or even text-only data. Among them, the mainstream solution +is to project image embeddings into the text embedding space with the +assistance of consistent representations between image-text pairs from the CLIP +model. However, the current methods still face several challenges in adapting +to the diversity of data configurations in a unified solution, accurately +estimating image-text embedding bias, and correcting unsatisfactory prediction +results in the inference stage. This paper proposes a new Text data-centric +approach with Interactive Prompts for image Captioning, named TIPCap. 1) We +consider four different settings which gradually reduce the dependence on +paired data. 2) We construct a mapping module driven by multivariate Gaussian +distribution to mitigate the modality gap, which is applicable to the above +four different settings. 3) We propose a prompt interaction module that can +incorporate optional prompt information before generating captions. Extensive +experiments show that our TIPCap outperforms other weakly or unsupervised image +captioning methods and achieves a new state-of-the-art performance on two +widely used datasets, i.e., MS-COCO and Flickr30K. + +
+
+
+
+
+ + ☆ Rethinking Information Loss in Medical Image Segmentation with + Various-sized Targets + + +
+ Medical image segmentation presents the challenge of segmenting various-sized targets, demanding the model to effectively capture both local and global information. Despite recent efforts using CNNs and ViTs to predict annotations of different scales, these approaches often struggle to effectively balance the detection of targets across varying sizes. Simply utilizing local information from CNNs and global relationships from ViTs without considering potentially significant divergence in latent feature distributions may result in substantial information loss. To address this issue, in this paper we introduce a novel Stagger Network (SNet) and argue that a well-designed fusion structure can mitigate the divergence in latent feature distributions between CNNs and ViTs, thereby reducing information loss. Specifically, to emphasize both global dependencies and local focus, we design a Parallel Module to bridge the semantic gap. Meanwhile, we propose the Stagger Module to fuse the selected features that are more semantically similar. An Information Recovery Module is further adopted to recover complementary information back to the network. As a key contribution, we theoretically analyze that the proposed parallel and stagger strategies lead to less information loss, thus certifying the SNet's rationale. Experimental results clearly show that the proposed SNet outperforms recent SOTAs on the Synapse dataset, where targets vary in size. It also demonstrates superiority on the ACDC and MoNuSeg datasets, where target sizes are more consistent.
+
+
+
+
+ + ☆ Algorithmic Ways of Seeing: Using Object Detection to Facilitate Art + Exploration + + +
+ This Research through Design paper explores how object detection may be +applied to a large digital art museum collection to facilitate new ways of +encountering and experiencing art. We present the design and evaluation of an +interactive application called SMKExplore, which allows users to explore a +museum's digital collection of paintings by browsing through objects detected +in the images, as a novel form of open-ended exploration. We provide three +contributions. First, we show how an object detection pipeline can be +integrated into a design process for visual exploration. Second, we present the +design and development of an app that enables exploration of an art museum's +collection. Third, we offer reflections on future possibilities for museums and +HCI researchers to incorporate object detection techniques into the +digitalization of museums. + +
+
+
+
+
+ + ☆ RecDiffusion: Rectangling for Image Stitching with Diffusion Models + + +
+ Image stitching from different captures often results in non-rectangular boundaries, which are typically considered unappealing. To deal with non-rectangular boundaries, current solutions involve cropping, which discards image content; inpainting, which can introduce unrelated content; or warping, which can distort non-linear features and introduce artifacts. To overcome these issues, we introduce a novel diffusion-based learning framework, \textbf{RecDiffusion}, for image stitching rectangling. This framework combines Motion Diffusion Models (MDM), which generate motion fields that effectively transition from the stitched image's irregular borders to a geometrically corrected intermediary, with Content Diffusion Models (CDM) for image detail refinement. Notably, our sampling process utilizes a weighted map to identify regions needing correction during each iteration of CDM. Our RecDiffusion ensures geometric accuracy and overall visual appeal, surpassing all previous methods in both quantitative and qualitative measures when evaluated on public benchmarks. Code is released at https://github.com/lhaippp/RecDiffusion.
+
+
+
+
+ + ☆ D'OH: Decoder-Only random Hypernetworks for Implicit Neural + Representations + + +
+ Deep implicit functions have been found to be an effective tool for +efficiently encoding all manner of natural signals. Their attractiveness stems +from their ability to compactly represent signals with little to no off-line +training data. Instead, they leverage the implicit bias of deep networks to +decouple hidden redundancies within the signal. In this paper, we explore the +hypothesis that additional compression can be achieved by leveraging the +redundancies that exist between layers. We propose to use a novel run-time +decoder-only hypernetwork - that uses no offline training data - to better +model this cross-layer parameter redundancy. Previous applications of +hyper-networks with deep implicit functions have applied feed-forward +encoder/decoder frameworks that rely on large offline datasets that do not +generalize beyond the signals they were trained on. We instead present a +strategy for the initialization of run-time deep implicit functions for +single-instance signals through a Decoder-Only randomly projected Hypernetwork +(D'OH). By directly changing the dimension of a latent code to approximate a +target implicit neural architecture, we provide a natural way to vary the +memory footprint of neural representations without the costly need for neural +architecture search on a space of alternative low-rate structures. + +
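+ One way to picture a decoder-only hypernetwork with random projections is shown below: a single low-dimensional latent code is mapped to every layer's weights through fixed random matrices, so only the latent is optimized and its dimension directly controls the memory footprint. This is a hypothetical reading of the idea with arbitrary layer sizes, not the authors' implementation.
+<pre>
+import torch
+import torch.nn as nn
+
+class RandomProjectionHypernet(nn.Module):
+    """Map a single trainable latent code to the weights of a small sinusoidal MLP."""
+
+    def __init__(self, latent_dim=128, layer_shapes=((2, 64), (64, 64), (64, 3))):
+        super().__init__()
+        self.shapes = layer_shapes
+        self.latent = nn.Parameter(0.01 * torch.randn(latent_dim))  # the only trainable parameters
+        for k, (fan_in, fan_out) in enumerate(layer_shapes):
+            # fixed (non-trainable) random projection from the latent to this layer's weights
+            proj = torch.randn(latent_dim, fan_in * fan_out) / latent_dim ** 0.5
+            self.register_buffer(f"proj{k}", proj)
+
+    def forward(self, coords):
+        h = coords
+        for k, (fan_in, fan_out) in enumerate(self.shapes):
+            w = (self.latent @ getattr(self, f"proj{k}")).view(fan_in, fan_out)
+            h = torch.sin(h @ w)  # sinusoidal activations, common for implicit representations
+        return h
+
+net = RandomProjectionHypernet()
+rgb = net(torch.rand(1024, 2))  # 2-D pixel coordinates in, RGB-like values out
+print(rgb.shape)                # torch.Size([1024, 3])
+</pre>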
+
+ comment: 29 pages, 17 figures +
+
+
+
+
+ + ☆ Within the Dynamic Context: Inertia-aware 3D Human Modeling with Pose + Sequence + + +
+ Neural rendering techniques have significantly advanced 3D human body +modeling. However, previous approaches often overlook dynamics induced by +factors such as motion inertia, leading to challenges in scenarios like abrupt +stops after rotation, where the pose remains static while the appearance +changes. This limitation arises from reliance on a single pose as conditional +input, resulting in ambiguity in mapping one pose to multiple appearances. In +this study, we elucidate that variations in human appearance depend not only on +the current frame's pose condition but also on past pose states. Therefore, we +introduce Dyco, a novel method utilizing the delta pose sequence representation +for non-rigid deformations and canonical space to effectively model temporal +appearance variations. To prevent a decrease in the model's generalization +ability to novel poses, we further propose low-dimensional global context to +reduce unnecessary inter-body part dependencies and a quantization operation to +mitigate overfitting of the delta pose sequence by the model. To validate the +effectiveness of our approach, we collected a novel dataset named I3D-Human, +with a focus on capturing temporal changes in clothing appearance under +approximate poses. Through extensive experiments on both I3D-Human and existing +datasets, our approach demonstrates superior qualitative and quantitative +performance. In addition, our inertia-aware 3D human method can unprecedentedly +simulate appearance changes caused by inertia at different velocities. + +
+
+
+
+
+ + ☆ Uncertainty-Aware Deep Video Compression with Ensembles + + +
+ Deep learning-based video compression is a challenging task, and many +previous state-of-the-art learning-based video codecs use optical flows to +exploit the temporal correlation between successive frames and then compress +the residual error. Although these two-stage models are end-to-end optimized, +the epistemic uncertainty in the motion estimation and the aleatoric +uncertainty from the quantization operation lead to errors in the intermediate +representations and introduce artifacts in the reconstructed frames. This +inherent flaw limits the potential for higher bit rate savings. To address this +issue, we propose an uncertainty-aware video compression model that can +effectively capture the predictive uncertainty with deep ensembles. +Additionally, we introduce an ensemble-aware loss to encourage the diversity +among ensemble members and investigate the benefits of incorporating +adversarial training in the video compression task. Experimental results on +1080p sequences show that our model can effectively save bits by more than 20% +compared to DVC Pro. + +
+
+ comment: Published on IEEE Transactions on Multimedia +
+
+
+
+
+ + ☆ Towards Understanding Dual BN In Hybrid Adversarial Training + + +
+ There is a growing concern about applying batch normalization (BN) in adversarial training (AT), especially when the model is trained on both adversarial samples and clean samples (termed Hybrid-AT). With the assumption that adversarial and clean samples are from two different domains, a common practice in prior works is to adopt Dual BN, where separate BN branches are used for the adversarial and clean samples, respectively. A popular belief for motivating Dual BN is that estimating normalization statistics of this mixture distribution is challenging and thus disentangling it for normalization achieves stronger robustness. In contrast to this belief, we reveal that disentangling statistics plays a lesser role than disentangling affine parameters in model training. This finding aligns with prior work (Rebuffi et al., 2023), and we build upon their research for further investigations. We demonstrate that the domain gap between adversarial and clean samples is not very large, which is counter-intuitive considering the significant influence of adversarial perturbation on model accuracy. We further propose a two-task hypothesis which serves as the empirical foundation and a unified framework for Hybrid-AT improvement. We also investigate Dual BN at test time and reveal that affine parameters characterize the robustness during inference. Overall, our work sheds new light on understanding the mechanism of Dual BN in Hybrid-AT and its underlying justification.
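+ For readers unfamiliar with the construct, a Dual BN layer simply routes clean and adversarial batches through two separate BatchNorm branches, each with its own running statistics and affine parameters. The module below is a generic illustration of that design (feature sizes chosen arbitrarily), not the exact implementation studied in the paper.
+<pre>
+import torch
+import torch.nn as nn
+
+class DualBatchNorm2d(nn.Module):
+    """Route clean and adversarial batches through separate BatchNorm branches."""
+
+    def __init__(self, num_features):
+        super().__init__()
+        self.bn_clean = nn.BatchNorm2d(num_features)  # own statistics and affine parameters
+        self.bn_adv = nn.BatchNorm2d(num_features)    # own statistics and affine parameters
+
+    def forward(self, x, adversarial: bool = False):
+        return self.bn_adv(x) if adversarial else self.bn_clean(x)
+
+dbn = DualBatchNorm2d(16)
+clean_out = dbn(torch.randn(8, 16, 32, 32), adversarial=False)
+adv_out = dbn(torch.randn(8, 16, 32, 32), adversarial=True)
+</pre>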
+
+ comment: Accepted at TMLR +
+
+
+
+
+ + ☆ MoDiTalker: Motion-Disentangled Diffusion Model for High-Fidelity + Talking Head Generation + + +
+ Conventional GAN-based models for talking head generation often suffer from +limited quality and unstable training. Recent approaches based on diffusion +models aimed to address these limitations and improve fidelity. However, they +still face challenges, including extensive sampling times and difficulties in +maintaining temporal consistency due to the high stochasticity of diffusion +models. To overcome these challenges, we propose a novel motion-disentangled +diffusion model for high-quality talking head generation, dubbed MoDiTalker. We +introduce the two modules: audio-to-motion (AToM), designed to generate a +synchronized lip motion from audio, and motion-to-video (MToV), designed to +produce high-quality head video following the generated motion. AToM excels in +capturing subtle lip movements by leveraging an audio attention mechanism. In +addition, MToV enhances temporal consistency by leveraging an efficient +tri-plane representation. Our experiments conducted on standard benchmarks +demonstrate that our model achieves superior performance compared to existing +models. We also provide comprehensive ablation studies and user study results. + +
+
+
+
+
+ + ☆ QNCD: Quantization Noise Correction for Diffusion Models + + +
+ Diffusion models have revolutionized image synthesis, setting new benchmarks in quality and creativity. However, their widespread adoption is hindered by the intensive computation required during the iterative denoising process. Post-training quantization (PTQ) presents a solution to accelerate sampling, albeit at the expense of sample quality, especially in low-bit settings. Addressing this, our study introduces a unified Quantization Noise Correction Scheme (QNCD), aimed at minimizing quantization noise throughout the sampling process. We identify two primary quantization challenges: intra and inter quantization noise. Intra quantization noise, mainly exacerbated by embeddings in the resblock module, extends activation quantization ranges, increasing disturbances in each single denoising step. Besides, inter quantization noise stems from cumulative quantization deviations across the entire denoising process, altering data distributions step-by-step. QNCD combats these through embedding-derived feature smoothing to eliminate intra quantization noise and an effective runtime noise estimation module to dynamically filter inter quantization noise. Extensive experiments demonstrate that our method outperforms previous quantization methods for diffusion models, achieving lossless results in W4A8 and W8A8 quantization settings on ImageNet (LDM-4). Code is available at: https://github.com/huanpengchu/QNCD
+
+
+
+
+ + ☆ CLAP4CLIP: Continual Learning with Probabilistic Finetuning for + Vision-Language Models + + +
+ Continual learning (CL) aims to help deep neural networks to learn new +knowledge while retaining what has been learned. Recently, pre-trained +vision-language models such as CLIP, with powerful generalization ability, have +been gaining traction as practical CL candidates. However, the domain mismatch +between the pre-training and the downstream CL tasks calls for finetuning of +the CLIP on the latter. The deterministic nature of the existing finetuning +methods makes them overlook the many possible interactions across the +modalities and deems them unsafe for high-risk CL tasks requiring reliable +uncertainty estimation. To address these, our work proposes Continual LeArning +with Probabilistic finetuning (CLAP). CLAP develops probabilistic modeling over +task-specific modules with visual-guided text features, providing more reliable +fine-tuning in CL. It further alleviates forgetting by exploiting the rich +pre-trained knowledge of CLIP for weight initialization and distribution +regularization of task-specific modules. Cooperating with the diverse range of +existing prompting methods, CLAP can surpass the predominant deterministic +finetuning approaches for CL with CLIP. Lastly, we study the superior +uncertainty estimation abilities of CLAP for novel data detection and exemplar +selection within CL setups. Our code is available at +\url{https://github.com/srvCodes/clap4clip}. + +
+
+ comment: Work under review +
+
+
+
+
+ + ☆ OmniParser: A Unified Framework for Text Spotting, Key Information + Extraction and Table Recognition CVPR 2024 + + +
+ Recently, visually-situated text parsing (VsTP) has experienced notable +advancements, driven by the increasing demand for automated document +understanding and the emergence of Generative Large Language Models (LLMs) +capable of processing document-based questions. Various methods have been +proposed to address the challenging problem of VsTP. However, due to the +diversified targets and heterogeneous schemas, previous works usually design +task-specific architectures and objectives for individual tasks, which +inadvertently leads to modal isolation and complex workflow. In this paper, we +propose a unified paradigm for parsing visually-situated text across diverse +scenarios. Specifically, we devise a universal model, called OmniParser, which +can simultaneously handle three typical visually-situated text parsing tasks: +text spotting, key information extraction, and table recognition. In +OmniParser, all tasks share the unified encoder-decoder architecture, the +unified objective: point-conditioned text generation, and the unified input & +output representation: prompt & structured sequences. Extensive experiments +demonstrate that the proposed OmniParser achieves state-of-the-art (SOTA) or +highly competitive performances on 7 datasets for the three visually-situated +text parsing tasks, despite its unified, concise design. The code is available +at https://github.com/AlibabaResearch/AdvancedLiterateMachinery. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ PoCo: A Self-Supervised Approach via Polar Transformation Based + Progressive Contrastive Learning for Ophthalmic Disease Diagnosis + + +
+ Automatic ophthalmic disease diagnosis on fundus images is important in clinical practice. However, due to complex fundus textures and limited annotated data, developing an effective automatic method for this problem is still challenging. In this paper, we present a self-supervised method via polar transformation based progressive contrastive learning, called PoCo, for ophthalmic disease diagnosis. Specifically, we inject the polar transformation into contrastive learning to 1) make contrastive pre-training faster and more stable and 2) naturally capture task-free and rotation-related textures, which provides insights into disease recognition on fundus images. Beneficially, simple translation-invariant convolutions on the transformed images can equivalently replace the complex rotation-invariant and sector convolutions on the raw images. After that, we develop a progressive contrastive learning method to efficiently utilize large numbers of unannotated images and a novel progressive hard negative sampling scheme to gradually reduce the number of negative samples for efficient training and performance enhancement. Extensive experiments on three public ophthalmic disease datasets show that our PoCo achieves state-of-the-art performance with good generalization ability, validating that our method can reduce annotation efforts and provide reliable diagnosis. Codes are available at \url{https://github.com/wjh892521292/PoCo}.
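+ The key geometric fact exploited here is that a polar transform about the image center turns rotations into horizontal translations, so ordinary translation-invariant convolutions on the transformed image behave like rotation-aware ones on the original. Below is a plain NumPy sketch of such a transform (nearest-neighbor sampling, parameters chosen for illustration); it conveys the idea rather than the paper's exact preprocessing.
+<pre>
+import numpy as np
+
+def polar_transform(image: np.ndarray, n_radii: int = 128, n_angles: int = 256) -> np.ndarray:
+    """Resample an (H, W) or (H, W, C) image onto a (radius, angle) grid."""
+    h, w = image.shape[:2]
+    cy, cx = (h - 1) / 2.0, (w - 1) / 2.0
+    radii = np.linspace(0, min(cy, cx), n_radii)
+    angles = np.linspace(0, 2 * np.pi, n_angles, endpoint=False)
+    # sampling grid: rows index radius, columns index angle
+    ys = np.clip(np.rint(cy + radii[:, None] * np.sin(angles[None, :])).astype(int), 0, h - 1)
+    xs = np.clip(np.rint(cx + radii[:, None] * np.cos(angles[None, :])).astype(int), 0, w - 1)
+    return image[ys, xs]  # a rotation of the input becomes a horizontal shift here
+
+fundus = np.random.rand(512, 512, 3)  # stand-in for a fundus photograph
+print(polar_transform(fundus).shape)  # (128, 256, 3)
+</pre>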
+
+
+
+
+ + ☆ Patch Spatio-Temporal Relation Prediction for Video Anomaly Detection + + +
+ Video Anomaly Detection (VAD), aiming to identify abnormalities within a +specific context and timeframe, is crucial for intelligent Video Surveillance +Systems. While recent deep learning-based VAD models have shown promising +results by generating high-resolution frames, they often lack competence in +preserving detailed spatial and temporal coherence in video frames. To tackle +this issue, we propose a self-supervised learning approach for VAD through an +inter-patch relationship prediction task. Specifically, we introduce a +two-branch vision transformer network designed to capture deep visual features +of video frames, addressing spatial and temporal dimensions responsible for +modeling appearance and motion patterns, respectively. The inter-patch +relationship in each dimension is decoupled into inter-patch similarity and the +order information of each patch. To mitigate memory consumption, we convert the +order information prediction task into a multi-label learning problem, and the +inter-patch similarity prediction task into a distance matrix regression +problem. Comprehensive experiments demonstrate the effectiveness of our method, +surpassing pixel-generation-based methods by a significant margin across three +public benchmarks. Additionally, our approach outperforms other self-supervised +learning-based methods. + +
+
+
+
+
+ + ☆ Synthetic Medical Imaging Generation with Generative Adversarial + Networks For Plain Radiographs + + +
+ In medical imaging, access to data is commonly limited due to patient privacy +restrictions and the issue that it can be difficult to acquire enough data in +the case of rare diseases.[1] The purpose of this investigation was to develop +a reusable open-source synthetic image generation pipeline, the GAN Image +Synthesis Tool (GIST), that is easy to use as well as easy to deploy. The +pipeline helps to improve and standardize AI algorithms in the digital health +space by generating high quality synthetic image data that is not linked to +specific patients. Its image generation capabilities include the ability to +generate imaging of pathologies or injuries with low incidence rates. This +improvement of digital health AI algorithms could improve diagnostic accuracy, +aid in patient care, decrease medicolegal claims, and ultimately decrease the +overall cost of healthcare. The pipeline builds on existing Generative +Adversarial Networks (GANs) algorithms, and preprocessing and evaluation steps +were included for completeness. For this work, we focused on ensuring the +pipeline supports radiography, with a focus on synthetic knee and elbow x-ray +images. In designing the pipeline, we evaluated the performance of current GAN +architectures, studying the performance on available x-ray data. We show that +the pipeline is capable of generating high quality and clinically relevant +images based on a lay person's evaluation and the Fr\'echet Inception Distance +(FID) metric. + +
+
+
+
+
+ + ☆ CRKD: Enhanced Camera-Radar Object Detection with Cross-modality + Knowledge Distillation CVPR 2024 + + +
+ In the field of 3D object detection for autonomous driving, LiDAR-Camera (LC) fusion is the top-performing sensor configuration. Still, LiDAR is relatively expensive, which hinders adoption of this technology in consumer automobiles. Alternatively, camera and radar are commonly deployed on vehicles already on the road today, but the performance of Camera-Radar (CR) fusion falls behind LC fusion. In this work, we propose Camera-Radar Knowledge Distillation (CRKD) to bridge the performance gap between LC and CR detectors with a novel cross-modality KD framework. We use the Bird's-Eye-View (BEV) representation as the shared feature space to enable effective knowledge distillation. To accommodate the unique cross-modality KD path, we propose four distillation losses to help the student learn crucial features from the teacher model. We present extensive evaluations on the nuScenes dataset to demonstrate the effectiveness of the proposed CRKD framework. The project page for CRKD is https://song-jingyu.github.io/CRKD.
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Automated Black-box Prompt Engineering for Personalized Text-to-Image + Generation + + +
+ Prompt engineering is effective for controlling the output of text-to-image +(T2I) generative models, but it is also laborious due to the need for manually +crafted prompts. This challenge has spurred the development of algorithms for +automated prompt generation. However, these methods often struggle with +transferability across T2I models, require white-box access to the underlying +model, and produce non-intuitive prompts. In this work, we introduce PRISM, an +algorithm that automatically identifies human-interpretable and transferable +prompts that can effectively generate desired concepts given only black-box +access to T2I models. Inspired by large language model (LLM) jailbreaking, +PRISM leverages the in-context learning ability of LLMs to iteratively refine +the candidate prompts distribution for given reference images. Our experiments +demonstrate the versatility and effectiveness of PRISM in generating accurate +prompts for objects, styles and images across multiple T2I models, including +Stable Diffusion, DALL-E, and Midjourney. + +
+
+
+
+
+ + ☆ AAPMT: AGI Assessment Through Prompt and Metric Transformer + + +
+ The emergence of text-to-image models marks a significant milestone in the evolution of AI-generated images (AGIs), expanding their use in diverse domains like design, entertainment, and more. Despite these breakthroughs, the quality of AGIs often remains suboptimal, highlighting the need for effective evaluation methods. These methods are crucial for assessing the quality of images relative to their textual descriptions, and they must accurately mirror human perception. Substantial progress has been achieved in this domain, with innovative techniques such as BLIP and DBCNN contributing significantly. However, recent studies, including AGIQA-3K, reveal a notable discrepancy between current methods and state-of-the-art (SOTA) standards. This gap emphasizes the necessity for a more sophisticated and precise evaluation metric. In response, our objective is to develop a model that rates metrics such as perceptual quality, authenticity, and the correspondence between text and image in a way that aligns more closely with human perception. In our paper, we introduce a range of effective methods, including prompt designs and the Metric Transformer. The Metric Transformer is a novel structure inspired by the complex interrelationships among various AGI quality metrics. The code is available at https://github.com/huskydoge/CS3324-Digital-Image-Processing/tree/main/Assignment1
+
+
+
+
+ + ☆ GraphAD: Interaction Scene Graph for End-to-end Autonomous Driving + + +
+ Modeling complicated interactions among the ego-vehicle, road agents, and map elements is a crucial part of safety-critical autonomous driving. Previous works on end-to-end autonomous driving rely on the attention mechanism for handling heterogeneous interactions, which fails to capture geometric priors and is also computationally intensive. In this paper, we propose the Interaction Scene Graph (ISG) as a unified method to model the interactions among the ego-vehicle, road agents, and map elements. With the representation of the ISG, the driving agents aggregate essential information from the most influential elements, including the road agents with potential collisions and the map elements to follow. Since a mass of unnecessary interactions are omitted, the more efficient scene-graph-based framework is able to focus on indispensable connections and leads to better performance. We evaluate the proposed method for end-to-end autonomous driving on the nuScenes dataset. Compared with strong baselines, our method significantly outperforms them on full-stack driving tasks, including perception, prediction, and planning. Code will be released at https://github.com/zhangyp15/GraphAD.
+
+ comment: project page: https://github.com/zhangyp15/GraphAD +
+
+
+
+
+ + ☆ MMCert: Provable Defense against Adversarial Attacks to Multi-modal + Models + + +
+ Different from a unimodal model whose input is from a single modality, the +input (called multi-modal input) of a multi-modal model is from multiple +modalities such as image, 3D points, audio, text, etc. Similar to unimodal +models, many existing studies show that a multi-modal model is also vulnerable +to adversarial perturbation, where an attacker could add small perturbation to +all modalities of a multi-modal input such that the multi-modal model makes +incorrect predictions for it. Existing certified defenses are mostly designed +for unimodal models, which achieve sub-optimal certified robustness guarantees +when extended to multi-modal models as shown in our experimental results. In +our work, we propose MMCert, the first certified defense against adversarial +attacks to a multi-modal model. We derive a lower bound on the performance of +our MMCert under arbitrary adversarial attacks with bounded perturbations to +both modalities (e.g., in the context of auto-driving, we bound the number of +changed pixels in both RGB image and depth image). We evaluate our MMCert using +two benchmark datasets: one for the multi-modal road segmentation task and the +other for the multi-modal emotion recognition task. Moreover, we compare our +MMCert with a state-of-the-art certified defense extended from unimodal models. +Our experimental results show that our MMCert outperforms the baseline. + +
+
+
+
+
+ + ☆ A Real-Time Framework for Domain-Adaptive Underwater Object Detection + with Image Enhancement ICRA24 + + +
+ In recent years, significant progress has been made in the field of +underwater image enhancement (UIE). However, its practical utility for +high-level vision tasks, such as underwater object detection (UOD) in +Autonomous Underwater Vehicles (AUVs), remains relatively unexplored. It may be +attributed to several factors: (1) Existing methods typically employ UIE as a +pre-processing step, which inevitably introduces considerable computational +overhead and latency. (2) The process of enhancing images prior to training +object detectors may not necessarily yield performance improvements. (3) The +complex underwater environments can induce significant domain shifts across +different scenarios, seriously deteriorating the UOD performance. To address +these challenges, we introduce EnYOLO, an integrated real-time framework +designed for simultaneous UIE and UOD with domain-adaptation capability. +Specifically, both the UIE and UOD task heads share the same network backbone +and utilize a lightweight design. Furthermore, to ensure balanced training for +both tasks, we present a multi-stage training strategy aimed at consistently +enhancing their performance. Additionally, we propose a novel domain-adaptation +strategy to align feature embeddings originating from diverse underwater +environments. Comprehensive experiments demonstrate that our framework not only +achieves state-of-the-art (SOTA) performance in both UIE and UOD tasks, but +also shows superior adaptability when applied to different underwater +scenarios. Our efficiency analysis further highlights the substantial potential +of our framework for onboard deployment. + +
+
+ comment: accepted by ICRA24 +
+
+
+
+
+ + ☆ MVEB: Self-Supervised Learning with Multi-View Entropy Bottleneck + + +
+ Self-supervised learning aims to learn representation that can be effectively +generalized to downstream tasks. Many self-supervised approaches regard two +views of an image as both the input and the self-supervised signals, assuming +that either view contains the same task-relevant information and the shared +information is (approximately) sufficient for predicting downstream tasks. +Recent studies show that discarding superfluous information not shared between +the views can improve generalization. Hence, the ideal representation is +sufficient for downstream tasks and contains minimal superfluous information, +termed minimal sufficient representation. One can learn this representation by +maximizing the mutual information between the representation and the supervised +view while eliminating superfluous information. Nevertheless, the computation +of mutual information is notoriously intractable. In this work, we propose an +objective termed multi-view entropy bottleneck (MVEB) to learn minimal +sufficient representation effectively. MVEB simplifies the minimal sufficient +learning to maximizing both the agreement between the embeddings of two views +and the differential entropy of the embedding distribution. Our experiments +confirm that MVEB significantly improves performance. For example, it achieves +top-1 accuracy of 76.9\% on ImageNet with a vanilla ResNet-50 backbone on +linear evaluation. To the best of our knowledge, this is the new +state-of-the-art result with ResNet-50. + +
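+ The objective sketched below pairs an agreement term between the two view embeddings with a surrogate for the differential entropy of the embedding distribution, which is the general shape of the MVEB objective described above. The cosine agreement and the Gaussian log-determinant entropy estimate are assumptions made for illustration, not the paper's exact estimator.
+<pre>
+import torch
+import torch.nn.functional as F
+
+def mveb_style_loss(z1, z2, entropy_weight: float = 0.1):
+    """Encourage view agreement and high embedding entropy (returned as a loss to minimize)."""
+    z1, z2 = F.normalize(z1, dim=1), F.normalize(z2, dim=1)
+    agreement = (z1 * z2).sum(dim=1).mean()            # cosine similarity of paired views
+    z = torch.cat([z1, z2], dim=0)
+    z = z - z.mean(dim=0, keepdim=True)
+    cov = z.T @ z / (z.shape[0] - 1)
+    # Gaussian surrogate for differential entropy: 0.5 * logdet(covariance) up to a constant
+    entropy = 0.5 * torch.logdet(cov + 1e-4 * torch.eye(z.shape[1]))
+    return -(agreement + entropy_weight * entropy)
+
+loss = mveb_style_loss(torch.randn(256, 64), torch.randn(256, 64))
+print(loss.item())
+</pre>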
+
+ comment: Accepted by TPAMI +
+
+
+
+
+ + ☆ Tiny Machine Learning: Progress and Futures + + +
+ Tiny Machine Learning (TinyML) is a new frontier of machine learning. By +squeezing deep learning models into billions of IoT devices and +microcontrollers (MCUs), we expand the scope of AI applications and enable +ubiquitous intelligence. However, TinyML is challenging due to hardware +constraints: the tiny memory resource makes it difficult to hold deep learning +models designed for cloud and mobile platforms. There is also limited compiler +and inference engine support for bare-metal devices. Therefore, we need to +co-design the algorithm and system stack to enable TinyML. In this review, we +will first discuss the definition, challenges, and applications of TinyML. We +then survey the recent progress in TinyML and deep learning on MCUs. Next, we +will introduce MCUNet, showing how we can achieve ImageNet-scale AI +applications on IoT devices with system-algorithm co-design. We will further +extend the solution from inference to training and introduce tiny on-device +training techniques. Finally, we present future directions in this area. +Today's large model might be tomorrow's tiny model. The scope of TinyML should +evolve and adapt over time. + +
+
+ comment: IEEE Circuits and Systems Magazine (2023). arXiv admin note: text + overlap with arXiv:2206.15472 +
+
+
+
+
+ + ☆ Low-Rank Rescaled Vision Transformer Fine-Tuning: A Residual Design + Approach + + +
+ Parameter-efficient fine-tuning for pre-trained Vision Transformers aims to +adeptly tailor a model to downstream tasks by learning a minimal set of new +adaptation parameters while preserving the frozen majority of pre-trained +parameters. Striking a balance between retaining the generalizable +representation capacity of the pre-trained model and acquiring task-specific +features poses a key challenge. Currently, there is a lack of focus on guiding +this delicate trade-off. In this study, we approach the problem from the +perspective of Singular Value Decomposition (SVD) of pre-trained parameter +matrices, providing insights into the tuning dynamics of existing methods. +Building upon this understanding, we propose a Residual-based Low-Rank +Rescaling (RLRR) fine-tuning strategy. This strategy not only enhances +flexibility in parameter tuning but also ensures that new parameters do not +deviate excessively from the pre-trained model through a residual design. +Extensive experiments demonstrate that our method achieves competitive +performance across various downstream image classification tasks, all while +maintaining comparable new parameters. We believe this work takes a step +forward in offering a unified perspective for interpreting existing methods and +serves as motivation for the development of new approaches that move closer to +effectively considering the crucial trade-off mentioned above. Our code is +available at +\href{https://github.com/zstarN70/RLRR.git}{https://github.com/zstarN70/RLRR.git}. + +
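+ Residual-style low-rank adaptation of a frozen layer can be sketched as adding a rescaled low-rank update to the pre-trained weights, so the tuned layer never strays far from the pre-trained one. The module below is a generic illustration of this design pattern (a LoRA-like rank-r residual plus a learnable rescaling vector); it is not claimed to reproduce the exact RLRR parameterization.
+<pre>
+import torch
+import torch.nn as nn
+
+class LowRankResidualLinear(nn.Module):
+    """Frozen pre-trained linear layer plus a rescaled low-rank residual."""
+
+    def __init__(self, pretrained: nn.Linear, rank: int = 4):
+        super().__init__()
+        self.frozen = pretrained
+        for p in self.frozen.parameters():
+            p.requires_grad_(False)
+        out_features, in_features = pretrained.weight.shape
+        self.down = nn.Parameter(0.01 * torch.randn(in_features, rank))
+        self.up = nn.Parameter(torch.zeros(rank, out_features))  # residual starts at zero
+        self.scale = nn.Parameter(torch.ones(out_features))      # learnable rescaling
+
+    def forward(self, x):
+        residual = (x @ self.down) @ self.up
+        # the adapted layer stays close to the frozen pre-trained one by construction
+        return self.frozen(x) + self.scale * residual
+
+layer = LowRankResidualLinear(nn.Linear(768, 768), rank=4)
+print(layer(torch.randn(2, 197, 768)).shape)  # typical ViT token layout: (2, 197, 768)
+</pre>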
+
+
+
+
+ + ☆ Generative Quanta Color Imaging CVPR + + +
+ The astonishing development of single-photon cameras has created an unprecedented opportunity for scientific and industrial imaging. However, the high data throughput generated by these 1-bit sensors creates a significant bottleneck for low-power applications. In this paper, we explore the possibility of generating a color image from a single binary frame of a single-photon camera. We find this problem to be particularly difficult for standard colorization approaches due to the substantial degree of exposure variation. The core innovation of our paper is an exposure synthesis model framed under a neural ordinary differential equation (Neural ODE) that allows us to generate a continuum of exposures from a single observation. This innovation ensures consistent exposure in the binary images that colorizers take as input, resulting in notably enhanced colorization. We demonstrate applications of the method in single-image and burst colorization and show superior generative performance over baselines. Project website can be found at https://vishal-s-p.github.io/projects/2023/generative_quanta_color.html.
+
+ comment: Accepted at IEEE Conference on Computer Vision and Pattern + Recognition (CVPR), 2024 +
+
+
+
+
+ + ☆ Towards Long Term SLAM on Thermal Imagery IROS 2024 + + +
+ Visual SLAM on thermal imagery, and in other low-contrast, visually degraded environments such as underwater or areas dominated by snow and ice, remains a difficult problem for many state-of-the-art (SOTA) algorithms. In addition to challenging front-end data association, thermal imagery presents an additional difficulty for long-term relocalization and map reuse. The relative temperatures of objects in thermal imagery change dramatically from day to night. Feature descriptors typically used for relocalization in SLAM are unable to maintain consistency over these diurnal changes. We show that learned feature descriptors can be used within existing Bag-of-Words based localization schemes to dramatically improve place recognition across large temporal gaps in thermal imagery. In order to demonstrate the effectiveness of our trained vocabulary, we have developed a baseline SLAM system, integrating learned features and matching into a classical SLAM algorithm. Our system demonstrates good local tracking on challenging thermal imagery, and relocalization that overcomes dramatic day-to-night thermal appearance changes. Our code and datasets are available here: https://github.com/neufieldrobotics/IRSLAM_Baseline
+
+ comment: 8 pages, 7 figures, Submitted to IROS 2024 +
+
+
+
+
+ + ☆ Enhancing Efficiency in Vision Transformer Networks: Design Techniques + and Insights + + +
+ Intrigued by the inherent ability of the human visual system to identify +salient regions in complex scenes, attention mechanisms have been seamlessly +integrated into various Computer Vision (CV) tasks. Building upon this +paradigm, Vision Transformer (ViT) networks exploit attention mechanisms for +improved efficiency. This review navigates the landscape of redesigned +attention mechanisms within ViTs, aiming to enhance their performance. This +paper provides a comprehensive exploration of techniques and insights for +designing attention mechanisms, systematically reviewing recent literature in +the field of CV. This survey begins with an introduction to the theoretical +foundations and fundamental concepts underlying attention mechanisms. We then +present a systematic taxonomy of various attention mechanisms within ViTs, +employing redesigned approaches. A multi-perspective categorization is proposed +based on their application, objectives, and the type of attention applied. The +analysis includes an exploration of the novelty, strengths, weaknesses, and an +in-depth evaluation of the different proposed strategies. This culminates in +the development of taxonomies that highlight key properties and contributions. +Finally, we gather the reviewed studies along with their available open-source +implementations at our +\href{https://github.com/mindflow-institue/Awesome-Attention-Mechanism-in-Medical-Imaging}{GitHub}\footnote{\url{https://github.com/xmindflow/Awesome-Attention-Mechanism-in-Medical-Imaging}}. +We aim to regularly update it with the most recent relevant papers. + +
+
+ comment: Submitted to Computational Visual Media Journal +
+
+
+
+
+ + ☆ Vision-Language Synthetic Data Enhances Echocardiography Downstream + Tasks MICCAI 2024 + + +
+ High-quality, large-scale data is essential for robust deep learning models +in medical applications, particularly ultrasound image analysis. Diffusion +models facilitate high-fidelity medical image generation, reducing the costs +associated with acquiring and annotating new images. This paper utilizes recent +vision-language models to produce diverse and realistic synthetic +echocardiography image data, preserving key features of the original images +guided by textual and semantic label maps. Specifically, we investigate three +potential avenues: unconditional generation, generation guided by text, and a +hybrid approach incorporating both textual and semantic supervision. We show +that the rich contextual information present in the synthesized data +potentially enhances the accuracy and interpretability of downstream tasks, +such as echocardiography segmentation and classification with improved metrics +and faster convergence. Our implementation with checkpoints, prompts, and the +created synthetic dataset will be publicly available at +\href{https://github.com/Pooria90/DiffEcho}{GitHub}. + +
+
+ comment: Submitted as a conference paper to MICCAI 2024 +
+
+
+
+
+ + ☆ Is Synthetic Image Useful for Transfer Learning? An Investigation into + Data Generation, Volume, and Utilization ICLR24 + + +
+ Synthetic image data generation represents a promising avenue for training deep learning models, particularly in the realm of transfer learning, where obtaining real images within a specific domain can be prohibitively expensive due to privacy and intellectual property considerations. This work delves into the generation and utilization of synthetic images derived from text-to-image generative models in facilitating transfer learning paradigms. Despite the high visual fidelity of the generated images, we observe that their naive incorporation into existing real-image datasets does not consistently enhance model performance, due to the inherent distribution gap between synthetic and real images. To address this issue, we introduce a novel two-stage framework called bridged transfer, which initially employs synthetic images to fine-tune a pre-trained model and improve its transferability, and subsequently uses real data for rapid adaptation. Alongside, we propose a dataset style inversion strategy to improve the stylistic alignment between synthetic and real images. Our proposed methods are evaluated across 10 different datasets and 5 distinct models, demonstrating consistent improvements, with up to a 30% accuracy increase on classification tasks. Intriguingly, we note that the enhancements are not yet saturated, indicating that the benefits may further increase with an expanded volume of synthetic data.
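+ The two-stage recipe, first fine-tuning on synthetic images to improve transferability and then adapting on real data, can be expressed as a short training outline. The helper train_one_epoch, the epoch counts, and the learning rates are hypothetical placeholders rather than the paper's settings.
+<pre>
+import torch
+
+def train_one_epoch(model, loader, optimizer, device="cpu"):
+    """Hypothetical helper: one pass of standard cross-entropy fine-tuning."""
+    criterion = torch.nn.CrossEntropyLoss()
+    model.train()
+    for images, labels in loader:
+        images, labels = images.to(device), labels.to(device)
+        optimizer.zero_grad()
+        criterion(model(images), labels).backward()
+        optimizer.step()
+
+def bridged_transfer(model, synthetic_loader, real_loader, synth_epochs=5, real_epochs=3):
+    # Stage 1: fine-tune on synthetic text-to-image data to bridge the domain gap
+    opt = torch.optim.AdamW(model.parameters(), lr=1e-4)
+    for _ in range(synth_epochs):
+        train_one_epoch(model, synthetic_loader, opt)
+    # Stage 2: rapid adaptation on the (smaller) real dataset at a lower learning rate
+    opt = torch.optim.AdamW(model.parameters(), lr=1e-5)
+    for _ in range(real_epochs):
+        train_one_epoch(model, real_loader, opt)
+    return model
+</pre>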
+
+ comment: ICLR24 Score 6865 + https://openreview.net/forum?id=CjPt1AC6w0&referrer=%5Bthe%20profile%20of%20Chen%20Chen%5D(%2Fprofile%3Fid%3D~Chen_Chen20) +
+
+
+
+
+ + ☆ DeNetDM: Debiasing by Network Depth Modulation + + +
+ When neural networks are trained on biased datasets, they tend to +inadvertently learn spurious correlations, leading to challenges in achieving +strong generalization and robustness. Current approaches to address such biases +typically involve utilizing bias annotations, reweighting based on pseudo-bias +labels, or enhancing diversity within bias-conflicting data points through +augmentation techniques. We introduce DeNetDM, a novel debiasing method based +on the observation that shallow neural networks prioritize learning core +attributes, while deeper ones emphasize biases when tasked with acquiring +distinct information. Using a training paradigm derived from Product of +Experts, we create both biased and debiased branches with deep and shallow +architectures and then distill knowledge to produce the target debiased model. +Extensive experiments and analyses demonstrate that our approach outperforms +current debiasing techniques, achieving a notable improvement of around 5% in +three datasets, encompassing both synthetic and real-world data. Remarkably, +DeNetDM accomplishes this without requiring annotations pertaining to bias +labels or bias types, while still delivering performance on par with supervised +counterparts. Furthermore, our approach effectively harnesses the diversity of +bias-conflicting points within the data, surpassing previous methods and +obviating the need for explicit augmentation-based methods to enhance the +diversity of such bias-conflicting points. The source code will be available +upon acceptance. + +
+
+ comment: 23 pages including supplementary +
+
+
+
+
+ + ☆ Multi-Frame, Lightweight & Efficient Vision-Language Models for Question + Answering in Autonomous Driving + + +
+ Vision-Language Models (VLMs) and Multi-Modal Language Models (MMLMs) have become prominent in autonomous driving research, as these models can provide interpretable textual reasoning and responses for end-to-end autonomous driving safety tasks using traffic scene images and other data modalities. However, current approaches to these systems use expensive large language model (LLM) backbones and image encoders, making such systems unsuitable for real-time autonomous driving systems where tight memory constraints exist and fast inference time is necessary. To address these previous issues, we develop EM-VLM4AD, an efficient, lightweight, multi-frame vision-language model which performs Visual Question Answering for autonomous driving. In comparison to previous approaches, EM-VLM4AD requires at least 10 times less memory and fewer floating point operations, while also achieving higher BLEU-4, METEOR, CIDEr, and ROUGE scores than the existing baseline on the DriveLM dataset. EM-VLM4AD also exhibits the ability to extract relevant information from traffic views related to prompts and can answer questions for various autonomous driving subtasks. We release our code to train and evaluate our model at https://github.com/akshaygopalkr/EM-VLM4AD.
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ☆ Concept-based Analysis of Neural Networks via Vision-Language Models + + +
+ Formal analysis of vision-based deep neural networks (DNNs) is highly +desirable but it is very challenging due to the difficulty of expressing formal +specifications for vision tasks and the lack of efficient verification +procedures. In this paper, we propose to leverage emerging multimodal, +vision-language, foundation models (VLMs) as a lens through which we can reason +about vision models. VLMs have been trained on a large body of images +accompanied by their textual description, and are thus implicitly aware of +high-level, human-understandable concepts describing the images. We describe a +logical specification language $\texttt{Con}_{\texttt{spec}}$ designed to +facilitate writing specifications in terms of these concepts. To define and +formally check $\texttt{Con}_{\texttt{spec}}$ specifications, we leverage a +VLM, which provides a means to encode and efficiently check natural-language +properties of vision models. We demonstrate our techniques on a ResNet-based +classifier trained on the RIVAL-10 dataset leveraging CLIP as the multimodal +model. + +
+
+
+
+
+ + ☆ X-MIC: Cross-Modal Instance Conditioning for Egocentric Action + Generalization CVPR 2024 + + +
+ Lately, there has been growing interest in adapting vision-language models +(VLMs) to image and third-person video classification due to their success in +zero-shot recognition. However, the adaptation of these models to egocentric +videos has been largely unexplored. To address this gap, we propose a simple +yet effective cross-modal adaptation framework, which we call X-MIC. Using a +video adapter, our pipeline learns to align frozen text embeddings to each +egocentric video directly in the shared embedding space. Our novel adapter +architecture retains and improves generalization of the pre-trained VLMs by +disentangling learnable temporal modeling and frozen visual encoder. This +results in an enhanced alignment of text embeddings to each egocentric video, +leading to a significant improvement in cross-dataset generalization. We +evaluate our approach on the Epic-Kitchens, Ego4D, and EGTEA datasets for +fine-grained cross-dataset action generalization, demonstrating the +effectiveness of our method. Code is available at +https://github.com/annusha/xmic + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ JIST: Joint Image and Sequence Training for Sequential Visual Place + Recognition + + +
+ Visual Place Recognition aims at recognizing previously visited places by +relying on visual cues, and it is used in robotics applications for SLAM and +localization. Since typically a mobile robot has access to a continuous stream +of frames, this task is naturally cast as a sequence-to-sequence localization +problem. Nevertheless, obtaining sequences of labelled data is much more +expensive than collecting isolated images, which can be done in an automated +way with little supervision. To mitigate this problem, we propose a +novel Joint Image and Sequence Training protocol (JIST) that leverages large +uncurated sets of images through a multi-task learning framework. With JIST we +also introduce SeqGeM, an aggregation layer that revisits the popular GeM +pooling to produce a single robust and compact embedding from a sequence of +single-frame embeddings. We show that our model is able to outperform the previous +state of the art while being faster, using 8 times smaller descriptors, having +a lighter architecture and allowing it to process sequences of various lengths. +Code is available at https://github.com/ga1i13o/JIST + +
+
+
+
+
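The SeqGeM layer described above generalizes GeM pooling from spatial locations to a temporal sequence of frame descriptors. Below is a generic GeM-style aggregation over a sequence, written as a guess at the overall idea rather than the authors' exact layer; the exponent initialization and clamping epsilon are common conventions.

```python
# Generic GeM-style pooling over a sequence of frame descriptors (sketch).
import torch
import torch.nn as nn

class SeqGeMPool(nn.Module):
    def __init__(self, p: float = 3.0, eps: float = 1e-6):
        super().__init__()
        self.p = nn.Parameter(torch.tensor(p))  # learnable pooling exponent
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq_len, dim) single-frame embeddings, assumed non-negative
        x = x.clamp(min=self.eps).pow(self.p)
        return x.mean(dim=1).pow(1.0 / self.p)  # (batch, dim) sequence descriptor

frames = torch.rand(2, 5, 512)          # 2 sequences of 5 frame embeddings
print(SeqGeMPool()(frames).shape)       # torch.Size([2, 512])
```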
+ + ☆ Zero-shot Prompt-based Video Encoder for Surgical Gesture Recognition + + +
+ Purpose: Surgical video is an important data stream for gesture recognition. +Thus, robust visual encoders for such data streams are similarly important. +Methods: Leveraging the Bridge-Prompt framework, we fine-tune a pre-trained +vision-text model (CLIP) for gesture recognition in surgical videos. This can +utilize extensive outside video and text data, while also making use of label +meta-data and weakly supervised contrastive losses. Results: Our experiments +show that the prompt-based video encoder outperforms standard encoders in surgical +gesture recognition tasks. Notably, it displays strong performance in zero-shot +scenarios, where gestures/tasks that were not provided during the encoder +training phase are included in the prediction phase. Additionally, we measure +the benefit of including text descriptions in the feature extractor training +scheme. Conclusion: Bridge-Prompt and similar pre-trained+fine-tuned video +encoder models provide strong visual representations for surgical robotics, +especially in gesture recognition tasks. Given the diverse range of surgical +tasks (gestures), the ability of these models to transfer zero-shot, without the +need for any task (gesture) specific retraining, makes them invaluable. + +
+
+ comment: 17 pages, 4 figures, 7 tables, IPCAI 2024 +
+
+
+
+
+ + ☆ ENet-21: An Optimized light CNN Structure for Lane Detection + + +
+ Lane detection for autonomous vehicles is an important task, yet it remains a +challenging problem for driver assistance systems in modern vehicles. The +emergence of deep learning has led to significant progress in self-driving cars. +Conventional deep learning-based methods handle lane detection as a +binary segmentation task and determine whether a pixel belongs to a line. These +methods rely on the assumption of a fixed number of lanes, which does not +always hold. This study aims to develop an optimal structure for the lane +detection problem, offering a promising solution for driver assistance features +in modern vehicles by utilizing a machine learning method consisting of binary +segmentation and Affinity Fields that can manage varying numbers of lanes and +lane change scenarios. In this approach, a Convolutional Neural Network +(CNN) is selected as a feature extractor, and the final output is obtained +through clustering of the semantic segmentation and Affinity Field outputs. Our +method uses a less complex CNN architecture than existing approaches. + +
+
+ comment: The paper is under review by Soft Computing journal +
+
+
+
+
+ + ☆ Mitigating Motion Blur in Neural Radiance Fields with Events and Frames CVPR + + +
+ Neural Radiance Fields (NeRFs) have shown great potential in novel view +synthesis. However, they struggle to render sharp images when the data used for +training is affected by motion blur. On the other hand, event cameras excel in +dynamic scenes as they measure brightness changes with microsecond resolution +and are thus only marginally affected by blur. Recent methods attempt to +enhance NeRF reconstructions under camera motion by fusing frames and events. +However, they face challenges in recovering accurate color content or constrain +the NeRF to a set of predefined camera poses, harming reconstruction quality in +challenging conditions. This paper proposes a novel formulation addressing +these issues by leveraging both model- and learning-based modules. We +explicitly model the blur formation process, exploiting the event double +integral as an additional model-based prior. Additionally, we model the +event-pixel response using an end-to-end learnable response function, allowing +our method to adapt to non-idealities in the real event-camera sensor. We show, +on synthetic and real data, that the proposed approach outperforms existing +deblur NeRFs that use only frames as well as those that combine frames and +events by +6.13dB and +2.48dB, respectively. + +
+
+ comment: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), + 2024 +
+
+
+
+
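The "event double integral" prior mentioned above relates a blurred frame to a sharp latent frame modulated by the time-average of exponentiated event accumulations over the exposure. A toy NumPy illustration of that forward blur model follows; array shapes and the contrast threshold c are illustrative assumptions, not the paper's formulation in detail.

```python
# Toy illustration of an event-double-integral style blur model (sketch).
import numpy as np

def edi_blur(latent, event_slices, c=0.2):
    """latent: (H, W) sharp intensity at the reference time.
    event_slices: (T, H, W) per-timestep signed event counts within the exposure."""
    E = np.cumsum(event_slices, axis=0)          # accumulated polarity up to each t
    modulation = np.exp(c * E).mean(axis=0)      # time-averaged exponential factor
    return latent * modulation                   # synthesized blurred frame

H, W, T = 64, 64, 10
latent = np.random.rand(H, W)
events = np.random.randint(-1, 2, size=(T, H, W)).astype(np.float64)
print(edi_blur(latent, events).shape)
```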
+ + ☆ CLoRA: A Contrastive Approach to Compose Multiple LoRA Models + + +
+ Low-Rank Adaptations (LoRAs) have emerged as a powerful and popular technique +in the field of image generation, offering a highly effective way to adapt and +refine pre-trained deep learning models for specific tasks without the need for +comprehensive retraining. By employing pre-trained LoRA models, such as those +representing a specific cat and a particular dog, the objective is to generate +an image that faithfully embodies both animals as defined by the LoRAs. +However, the task of seamlessly blending multiple concept LoRAs to capture a +variety of concepts in one image proves to be a significant challenge. Common +approaches often fall short, primarily because the attention mechanisms within +different LoRA models overlap, leading to scenarios where one concept may be +completely ignored (e.g., omitting the dog) or where concepts are incorrectly +combined (e.g., producing an image of two cats instead of one cat and one dog). +To overcome these issues, CLoRA addresses them by updating the attention maps +of multiple LoRA models and leveraging them to create semantic masks that +facilitate the fusion of latent representations. Our method enables the +creation of composite images that truly reflect the characteristics of each +LoRA, successfully merging multiple concepts or styles. Our comprehensive +evaluations, both qualitative and quantitative, demonstrate that our approach +outperforms existing methodologies, marking a significant advancement in the +field of image generation with LoRAs. Furthermore, we share our source code, +benchmark dataset, and trained LoRA models to promote further research on this +topic. + +
+
+
+
+
+ + ☆ ShapeFusion: A 3D diffusion model for localized shape editing + + +
+ In the realm of 3D computer vision, parametric models have emerged as a +ground-breaking methodology for the creation of realistic and expressive 3D +avatars. Traditionally, they rely on Principal Component Analysis (PCA), given +its ability to decompose data to an orthonormal space that maximally captures +shape variations. However, due to the orthogonality constraints and the global +nature of PCA's decomposition, these models struggle to perform localized and +disentangled editing of 3D shapes, which severely affects their use in +applications requiring fine control such as face sculpting. In this paper, we +leverage diffusion models to enable diverse and fully localized edits on 3D +meshes, while completely preserving the un-edited regions. We propose an +effective diffusion masking training strategy that, by design, facilitates +localized manipulation of any shape region, without being limited to predefined +regions or to sparse sets of predefined control vertices. Following our +framework, a user can explicitly set their manipulation region of choice and +define an arbitrary set of vertices as handles to edit a 3D mesh. Compared to +the current state-of-the-art our method leads to more interpretable shape +manipulations than methods relying on latent code state, greater localization +and generation diversity while offering faster inference than optimization +based approaches. Project page: https://rolpotamias.github.io/Shapefusion/ + +
+
+ comment: Project Page: https://rolpotamias.github.io/Shapefusion/ +
+
+
+
+
+ + ☆ Using Deep Learning to Increase Eye-Tracking Robustness, Accuracy, and + Precision in Virtual Reality + + +
+ Algorithms for the estimation of gaze direction from mobile and video-based +eye trackers typically involve tracking a feature of the eye that moves through +the eye camera image in a way that covaries with the shifting gaze direction, +such as the center or boundaries of the pupil. Tracking these features using +traditional computer vision techniques can be difficult due to partial +occlusion and environmental reflections. Although recent efforts to use machine +learning (ML) for pupil tracking have demonstrated superior results when +evaluated using standard measures of segmentation performance, little is known +of how these networks may affect the quality of the final gaze estimate. This +work provides an objective assessment of the impact of several contemporary +ML-based methods for eye feature tracking when the subsequent gaze estimate is +produced using either feature-based or model-based methods. Metrics include the +accuracy and precision of the gaze estimate, as well as drop-out rate. + +
+
+ comment: 16 pages, 10 figures, accepted to ETRA 2024 Full Papers +
+
+
+
+
+ + ☆ MIST: Mitigating Intersectional Bias with Disentangled Cross-Attention + Editing in Text-to-Image Diffusion Models + + +
+ Diffusion-based text-to-image models have rapidly gained popularity for their +ability to generate detailed and realistic images from textual descriptions. +However, these models often reflect the biases present in their training data, +especially impacting marginalized groups. While prior efforts to debias +language models have focused on addressing specific biases, such as racial or +gender biases, efforts to tackle intersectional bias have been limited. +Intersectional bias refers to the unique form of bias experienced by +individuals at the intersection of multiple social identities. Addressing +intersectional bias is crucial because it amplifies the negative effects of +discrimination based on race, gender, and other identities. In this paper, we +introduce a method that addresses intersectional bias in diffusion-based +text-to-image models by modifying cross-attention maps in a disentangled +manner. Our approach utilizes a pre-trained Stable Diffusion model, eliminates +the need for an additional set of reference images, and preserves the original +quality for unaltered concepts. Comprehensive experiments demonstrate that our +method surpasses existing approaches in mitigating both single and +intersectional biases across various attributes. We make our source code and +debiased models for various attributes available to encourage fairness in +generative models and to support further research. + +
+
+
+
+
+ + ♻ ☆ ACT-Diffusion: Efficient Adversarial Consistency Training for One-step + Diffusion Models CVPR 2024 + + +
+ Though diffusion models excel in image generation, their step-by-step +denoising leads to slow generation speeds. Consistency training addresses this +issue with single-step sampling but often produces lower-quality generations +and requires high training costs. In this paper, we show that optimizing the +consistency training loss minimizes the Wasserstein distance between target and +generated distributions. As the timestep increases, the upper bound accumulates +previous consistency training losses. Therefore, larger batch sizes are needed +to reduce both current and accumulated losses. We propose Adversarial +Consistency Training (ACT), which directly minimizes the Jensen-Shannon (JS) +divergence between distributions at each timestep using a discriminator. +Theoretically, ACT enhances generation quality and convergence. By +incorporating a discriminator into the consistency training framework, our +method achieves improved FID scores on the CIFAR10, ImageNet 64$\times$64, and +LSUN Cat 256$\times$256 datasets, retains zero-shot image inpainting +capabilities, and uses less than $1/6$ of the original batch size and fewer +than $1/2$ of the model parameters and training steps compared to the baseline +method, leading to a substantial reduction in resource consumption. Our code +is available at https://github.com/kong13661/ACT + +
+
+ comment: To appear in CVPR 2024 +
+
+
+
+
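To make the combination concrete, the sketch below pairs a consistency-training loss with a non-saturating GAN term (which targets a JS-type divergence), in the spirit of ACT as summarized above. The networks, noising schedule, EMA handling, and loss weight are toy placeholders, not the paper's setup.

```python
# Consistency loss + adversarial term in one toy training iteration (sketch).
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyDenoiser(nn.Module):
    def __init__(self, dim=32):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim + 1, 128), nn.ReLU(), nn.Linear(128, dim))
    def forward(self, x, t):
        return self.net(torch.cat([x, t[:, None]], dim=-1))

student = TinyDenoiser()
teacher = copy.deepcopy(student)          # stands in for the EMA teacher
disc = nn.Sequential(nn.Linear(32, 128), nn.ReLU(), nn.Linear(128, 1))
opt_g = torch.optim.Adam(student.parameters(), lr=1e-4)
opt_d = torch.optim.Adam(disc.parameters(), lr=1e-4)

x0 = torch.randn(16, 32)                  # "real" data batch (toy)
t_n, t_np1 = torch.full((16,), 0.3), torch.full((16,), 0.4)
noise = torch.randn_like(x0)
x_tn, x_tnp1 = x0 + t_n[:, None] * noise, x0 + t_np1[:, None] * noise

# Discriminator step: real data vs. detached student outputs.
fake = student(x_tnp1, t_np1).detach()
d_loss = F.softplus(-disc(x0)).mean() + F.softplus(disc(fake)).mean()
opt_d.zero_grad(); d_loss.backward(); opt_d.step()

# Generator step: consistency term + non-saturating adversarial term.
opt_g.zero_grad()
pred = student(x_tnp1, t_np1)
with torch.no_grad():
    target = teacher(x_tn, t_n)
loss = F.mse_loss(pred, target) + 0.1 * F.softplus(-disc(pred)).mean()
loss.backward(); opt_g.step()
print(loss.item())
```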
+ + ♻ ☆ Direct Superpoints Matching for Robust Point Cloud Registration + + +
+ Deep neural networks endow the downsampled superpoints with highly +discriminative feature representations. Previous dominant point cloud +registration approaches match these feature representations as the first step, +e.g., using the Sinkhorn algorithm. A RANSAC-like method is then usually +adopted as a post-processing refinement to filter the outliers. Another dominant +approach is to directly predict the superpoint matchings using learned MLP +layers. Both of them have drawbacks: RANSAC-based methods are computationally +intensive and prediction-based methods suffer from outputting points that do not +exist in the point cloud. In this paper, we propose a straightforward and +effective baseline to find correspondences of superpoints in a global matching +manner. We employ the normalized matching scores as weights for each +correspondence, allowing us to reject the outliers and further weigh the remaining +inliers when fitting the transformation matrix without relying on the +cumbersome RANSAC. Moreover, the entire model can be trained in an end-to-end +fashion, leading to better accuracy. Our simple yet effective baseline shows +comparable or even better results than state-of-the-art methods on three +datasets including ModelNet, 3DMatch, and KITTI. We do not advocate our +approach to be \emph{the} solution for point cloud registration but use the +results to emphasize the role of the matching strategy for point cloud +registration. The code and models are available at +https://github.com/neu-vi/Superpoints_Registration. + +
+
+
+
+
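Using matching scores as per-correspondence weights when fitting the transformation, as the abstract above advocates, amounts to a weighted Procrustes (Kabsch) solve instead of RANSAC. A generic weighted Kabsch solver is sketched below; it is not the authors' full pipeline, just the closed-form fitting step.

```python
# Weighted rigid fit from scored superpoint correspondences (generic sketch).
import torch

def weighted_kabsch(src, tgt, w):
    """src, tgt: (N, 3) matched points; w: (N,) non-negative match scores."""
    w = w / (w.sum() + 1e-8)
    src_mean = (w[:, None] * src).sum(0)
    tgt_mean = (w[:, None] * tgt).sum(0)
    src_c, tgt_c = src - src_mean, tgt - tgt_mean
    H = src_c.T @ (w[:, None] * tgt_c)            # weighted cross-covariance
    U, S, Vt = torch.linalg.svd(H)
    d = float(torch.sign(torch.det(Vt.T @ U.T)))  # fix possible reflection
    D = torch.diag(torch.tensor([1.0, 1.0, d]))
    R = Vt.T @ D @ U.T
    t = tgt_mean - R @ src_mean
    return R, t

src = torch.randn(100, 3)
Q, _ = torch.linalg.qr(torch.randn(3, 3))
if torch.det(Q) < 0:                              # ensure a proper rotation
    Q[:, 0] = -Q[:, 0]
tgt = src @ Q.T + torch.tensor([0.5, -0.2, 1.0])
R, t = weighted_kabsch(src, tgt, torch.ones(100))
print(torch.allclose(R, Q, atol=1e-4))            # True on noise-free matches
```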
+ + ♻ ☆ Learnable Earth Parser: Discovering 3D Prototypes in Aerial Scans + + +
+ We propose an unsupervised method for parsing large 3D scans of real-world +scenes with easily-interpretable shapes. This work aims to provide a practical +tool for analyzing 3D scenes in the context of aerial surveying and mapping, +without the need for user annotations. Our approach is based on a probabilistic +reconstruction model that decomposes an input 3D point cloud into a small set +of learned prototypical 3D shapes. The resulting reconstruction is visually +interpretable and can be used to perform unsupervised instance and low-shot +semantic segmentation of complex scenes. We demonstrate the usefulness of our +model on a novel dataset of seven large aerial LiDAR scans from diverse +real-world scenarios. Our approach outperforms state-of-the-art unsupervised +methods in terms of decomposition accuracy while remaining visually +interpretable. Our code and dataset are available at +https://romainloiseau.fr/learnable-earth-parser/ + +
+
+
+
+
+ + ♻ ☆ Quantum machine learning for image classification + + +
+ Image classification, a pivotal task in multiple industries, faces +computational challenges due to the burgeoning volume of visual data. This +research addresses these challenges by introducing two quantum machine learning +models that leverage the principles of quantum mechanics for effective +computations. Our first model, a hybrid quantum neural network with parallel +quantum circuits, enables the execution of computations even in the noisy +intermediate-scale quantum era, where circuits with a large number of qubits +are currently infeasible. This model demonstrated a record-breaking +classification accuracy of 99.21% on the full MNIST dataset, surpassing the +performance of known quantum-classical models, while having eight times fewer +parameters than its classical counterpart. Also, the results of testing this +hybrid model on the Medical MNIST dataset (classification accuracy over 99%) and on +CIFAR-10 (classification accuracy over 82%) serve as evidence of the +generalizability of the model and highlight the efficiency of quantum layers in +distinguishing common features of input data. Our second model introduces a +hybrid quantum neural network with a Quanvolutional layer, reducing image +resolution via a convolution process. The model matches the performance of its +classical counterpart, having four times fewer trainable parameters, and +outperforms a classical model with equal weight parameters. These models +represent advancements in quantum machine learning research and illuminate the +path towards more accurate image classification systems. + +
+
+ comment: 13 pages, 10 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Boosting Latent Diffusion with Flow Matching + + +
+ Recently, there has been tremendous progress in visual synthesis and the +underlying generative models. Here, diffusion models (DMs) stand out +particularly, but lately, flow matching (FM) has also garnered considerable +interest. While DMs excel in providing diverse images, they suffer from long +training and slow generation. With latent diffusion, these issues are only +partially alleviated. Conversely, FM offers faster training and inference but +exhibits less diversity in synthesis. We demonstrate that introducing FM +between the Diffusion model and the convolutional decoder offers +high-resolution image synthesis with reduced computational cost and model size. +Diffusion can then efficiently provide the necessary generation diversity. FM +compensates for the lower resolution, mapping the small latent space to a +high-dimensional one. Subsequently, the convolutional decoder of the LDM maps +these latents to high-resolution images. By combining the diversity of DMs, the +efficiency of FMs, and the effectiveness of convolutional decoders, we achieve +state-of-the-art high-resolution image synthesis at $1024^2$ with minimal +computational cost. Importantly, our approach is orthogonal to recent +approximation and speed-up strategies for the underlying DMs, making it easily +integrable into various DM frameworks. + +
+
+
+
+
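The flow matching component described above is trained with a simple regression objective on a velocity field along an interpolation path between source and target samples. A minimal (rectified-flow style) training step is sketched below; the velocity network and data are toy stand-ins for the paper's latent-to-latent mapping.

```python
# Minimal flow matching training step (sketch).
import torch
import torch.nn as nn
import torch.nn.functional as F

vel = nn.Sequential(nn.Linear(16 + 1, 256), nn.SiLU(), nn.Linear(256, 16))
opt = torch.optim.Adam(vel.parameters(), lr=1e-4)

def fm_step(x0, x1):
    """x0: source samples (e.g. noise or low-res latents), x1: target latents."""
    t = torch.rand(x0.size(0), 1)
    xt = (1 - t) * x0 + t * x1                  # linear interpolation path
    target_v = x1 - x0                          # constant velocity along the path
    pred_v = vel(torch.cat([xt, t], dim=-1))
    loss = F.mse_loss(pred_v, target_v)
    opt.zero_grad(); loss.backward(); opt.step()
    return loss.item()

x0, x1 = torch.randn(64, 16), torch.randn(64, 16)
print(fm_step(x0, x1))
```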
+ + ♻ ☆ Transcending Forgery Specificity with Latent Space Augmentation for + Generalizable Deepfake Detection + + +
+ Deepfake detection faces a critical generalization hurdle, with performance +deteriorating when there is a mismatch between the distributions of training +and testing data. A broadly received explanation is the tendency of these +detectors to be overfitted to forgery-specific artifacts, rather than learning +features that are widely applicable across various forgeries. To address this +issue, we propose a simple yet effective detector called LSDA +(\underline{L}atent \underline{S}pace \underline{D}ata +\underline{A}ugmentation), which is based on a heuristic idea: representations +with a wider variety of forgeries should be able to learn a more generalizable +decision boundary, thereby mitigating the overfitting of method-specific +features (see Fig.~\ref{fig:toy}). Following this idea, we propose to enlarge +the forgery space by constructing and simulating variations within and across +forgery features in the latent space. This approach encompasses the acquisition +of enriched, domain-specific features and the facilitation of smoother +transitions between different forgery types, effectively bridging domain gaps. +Our approach culminates in refining a binary classifier that leverages the +distilled knowledge from the enhanced features, striving for a generalizable +deepfake detector. Comprehensive experiments show that our proposed method is +surprisingly effective and transcends state-of-the-art detectors across several +widely used benchmarks. + +
+
+
+
+
+ + ♻ ☆ Zero-shot Referring Expression Comprehension via Structural Similarity + Between Images and Captions CVPR 2024 + + +
+ Zero-shot referring expression comprehension aims at localizing bounding +boxes in an image corresponding to provided textual prompts, which requires: +(i) a fine-grained disentanglement of complex visual scene and textual context, +and (ii) a capacity to understand relationships among disentangled entities. +Unfortunately, existing large vision-language alignment (VLA) models, e.g., +CLIP, struggle with both aspects and so cannot be directly used for this task. To +mitigate this gap, we leverage large foundation models to disentangle both +images and texts into triplets in the format of (subject, predicate, object). +After that, grounding is accomplished by calculating the structural similarity +matrix between visual and textual triplets with a VLA model, and subsequently +propagating it to an instance-level similarity matrix. Furthermore, to equip VLA +models with the ability to understand relationships, we design a +triplet-matching objective to fine-tune the VLA models on a curated collection of +data containing abundant entity relationships. Experiments +demonstrate a visual grounding performance increase of up to 19.5% over +the SOTA zero-shot model on RefCOCO/+/g. On the more challenging Who's Waldo +dataset, our zero-shot approach achieves comparable accuracy to the fully +supervised model. Code is available at +https://github.com/Show-han/Zeroshot_REC. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Classifying Objects in 3D Point Clouds Using Recurrent Neural Network: A + GRU LSTM Hybrid Approach + + +
+ Accurate classification of objects in 3D point clouds is a significant +problem in several applications, such as autonomous navigation and +augmented/virtual reality scenarios, and has become a research hot spot. In +this paper, we present a deep learning strategy for 3D object classification +in augmented reality. The proposed approach is a combination of the GRU and +LSTM. LSTM networks learn long-range dependencies well, but due to their larger +number of gates they take longer to train; GRU networks, on the other hand, +perform worse than LSTMs but train much faster owing to their fewer gates. The +proposed approach combines the speed of GRUs with the accuracy of LSTMs. It +achieved an accuracy of 0.99 on the 4,499,0641-point dataset, which includes +eight classes plus an unlabeled category (man-made terrain, natural terrain, +high vegetation, low vegetation, buildings, hardscape, scanning artifacts, +cars). Meanwhile, traditional machine learning approaches achieve a maximum +accuracy of 0.9489 in the best case. Keywords: Point Cloud Classification, +Virtual Reality, Hybrid Model, GRULSTM, GRU, LSTM + +
+
+
+
+
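A toy PyTorch version of the GRU+LSTM hybrid described above is given below. Layer sizes and the way the two recurrent branches are stacked are assumptions for illustration, not the authors' exact network.

```python
# GRU front-end followed by an LSTM, classifying a point sequence (sketch).
import torch
import torch.nn as nn

class GRULSTMClassifier(nn.Module):
    def __init__(self, in_dim=3, hidden=128, num_classes=8):
        super().__init__()
        self.gru = nn.GRU(in_dim, hidden, batch_first=True)
        self.lstm = nn.LSTM(hidden, hidden, batch_first=True)
        self.head = nn.Linear(hidden, num_classes)

    def forward(self, pts):                  # pts: (batch, num_points, in_dim)
        h, _ = self.gru(pts)                 # fast GRU stage
        h, _ = self.lstm(h)                  # LSTM stage for longer dependencies
        return self.head(h[:, -1])           # classify from the last state

model = GRULSTMClassifier()
logits = model(torch.randn(4, 1024, 3))
print(logits.shape)                          # torch.Size([4, 8])
```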
+ + ♻ ☆ Quantifying and Mitigating Unimodal Biases in Multimodal Large Language + Models: A Causal Perspective + + +
+ Recent advancements in Large Language Models (LLMs) have facilitated the +development of Multimodal LLMs (MLLMs). Despite their impressive capabilities, +MLLMs often suffer from an over-reliance on unimodal biases (e.g., language +bias and vision bias), leading to incorrect answers in complex multimodal +tasks. To investigate this issue, we propose a causal framework to interpret +the biases in Visual Question Answering (VQA) problems. Within our framework, +we devise a causal graph to elucidate the predictions of MLLMs on VQA problems, +and assess the causal effect of biases through an in-depth causal analysis. +Motivated by the causal graph, we introduce a novel MORE dataset, consisting of +12,000 VQA instances. This dataset is designed to challenge MLLMs' abilities, +necessitating multi-hop reasoning and the surmounting of unimodal biases. +Furthermore, we propose two strategies to mitigate unimodal biases and enhance +MLLMs' reasoning capabilities, including a Decompose-Verify-Answer (DeVA) +framework for limited-access MLLMs and the refinement of open-source MLLMs +through fine-tuning. Extensive quantitative and qualitative experiments offer +valuable insights for future research. Our project page is at +https://opencausalab.github.io/MORE. + +
+
+
+
+
+ + ♻ ☆ Learned representation-guided diffusion models for large-image + generation + + +
+ To synthesize high-fidelity samples, diffusion models typically require +auxiliary data to guide the generation process. However, it is impractical to +procure the painstaking patch-level annotation effort required in specialized +domains like histopathology and satellite imagery; it is often performed by +domain experts and involves hundreds of millions of patches. Modern-day +self-supervised learning (SSL) representations encode rich semantic and visual +information. In this paper, we posit that such representations are expressive +enough to act as proxies to fine-grained human labels. We introduce a novel +approach that trains diffusion models conditioned on embeddings from SSL. Our +diffusion models successfully project these features back to high-quality +histopathology and remote sensing images. In addition, we construct larger +images by assembling spatially consistent patches inferred from SSL embeddings, +preserving long-range dependencies. Augmenting real data by generating +variations of real images improves downstream classifier accuracy for +patch-level and larger, image-scale classification tasks. Our models are +effective even on datasets not encountered during training, demonstrating their +robustness and generalizability. Generating images from learned embeddings is +agnostic to the source of the embeddings. The SSL embeddings used to generate a +large image can either be extracted from a reference image, or sampled from an +auxiliary model conditioned on any related modality (e.g. class labels, text, +genomic data). As proof of concept, we introduce the text-to-large image +synthesis paradigm where we successfully synthesize large pathology and +satellite images out of text descriptions. + +
+
+
+
+
+ + ♻ ☆ Human Gaussian Splatting: Real-time Rendering of Animatable Avatars CVPR 2024 + + +
+ This work addresses the problem of real-time rendering of photorealistic +human body avatars learned from multi-view videos. While the classical +approaches to model and render virtual humans generally use a textured mesh, +recent research has developed neural body representations that achieve +impressive visual quality. However, these models are difficult to render in +real-time and their quality degrades when the character is animated with body +poses different than the training observations. We propose an animatable human +model based on 3D Gaussian Splatting, that has recently emerged as a very +efficient alternative to neural radiance fields. The body is represented by a +set of gaussian primitives in a canonical space which is deformed with a coarse +to fine approach that combines forward skinning and local non-rigid refinement. +We describe how to learn our Human Gaussian Splatting (HuGS) model in an +end-to-end fashion from multi-view observations, and evaluate it against the +state-of-the-art approaches for novel pose synthesis of clothed body. Our +method achieves 1.5 dB PSNR improvement over the state-of-the-art on THuman4 +dataset while being able to render in real-time (80 fps for 512x512 +resolution). + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Predicting Species Occurrence Patterns from Partial Observations ICLR 2024 + + +
+ To address the interlinked biodiversity and climate crises, we need an +understanding of where species occur and how these patterns are changing. +However, observational data on most species remains very limited, and the +amount of data available varies greatly between taxonomic groups. We introduce +the problem of predicting species occurrence patterns given (a) satellite +imagery, and (b) known information on the occurrence of other species. To +evaluate algorithms on this task, we introduce SatButterfly, a dataset of +satellite images, environmental data and observational data for butterflies, +which is designed to pair with the existing SatBird dataset of bird +observational data. To address this task, we propose a general model, R-Tran, +for predicting species occurrence patterns that enables the use of partial +observational data wherever found. We find that R-Tran outperforms other +methods in predicting species encounter rates with partial information both +within a taxon (birds) and across taxa (birds and butterflies). Our approach +opens new perspectives to leveraging insights from species with abundant data +to other species with scarce data, by modelling the ecosystems in which they +co-occur. + +
+
+ comment: Tackling Climate Change with Machine Learning workshop at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Towards Generalizable Tumor Synthesis CVPR 2024 + + +
+ Tumor synthesis enables the creation of artificial tumors in medical images, +facilitating the training of AI models for tumor detection and segmentation. +However, success in tumor synthesis hinges on creating visually realistic +tumors that are generalizable across multiple organs and, furthermore, the +resulting AI models being capable of detecting real tumors in images sourced +from different domains (e.g., hospitals). This paper made a progressive stride +toward generalizable tumor synthesis by leveraging a critical observation: +early-stage tumors (< 2cm) tend to have similar imaging characteristics in +computed tomography (CT), whether they originate in the liver, pancreas, or +kidneys. We have ascertained that generative AI models, e.g., Diffusion Models, +can create realistic tumors generalized to a range of organs even when trained +on a limited number of tumor examples from only one organ. Moreover, we have +shown that AI models trained on these synthetic tumors can be generalized to +detect and segment real tumors from CT volumes, encompassing a broad spectrum +of patient demographics, imaging protocols, and healthcare facilities. + +
+
+ comment: The IEEE / CVF Computer Vision and Pattern Recognition Conference + (CVPR 2024) +
+
+
+
+
+ + ♻ ☆ Parameter Efficient Fine-tuning via Cross Block Orchestration for + Segment Anything Model CVPR2024 + + +
+ Parameter-efficient fine-tuning (PEFT) is an effective methodology to unleash +the potential of large foundation models in novel scenarios with limited +training data. In the computer vision community, PEFT has shown effectiveness +in image classification, but little research has studied its ability for image +segmentation. Fine-tuning segmentation models usually requires a heavier +adjustment of parameters to align the proper projection directions in the +parameter space for new scenarios. This raises a challenge to existing PEFT +algorithms, as they often inject a limited number of individual parameters into +each block, which prevents substantial adjustment of the projection direction +of the parameter space due to the limitation of Hidden Markov Chain along +blocks. In this paper, we equip PEFT with a cross-block orchestration mechanism +to enable the adaptation of the Segment Anything Model (SAM) to various +downstream scenarios. We introduce a novel inter-block communication module, +which integrates a learnable relation matrix to facilitate communication among +different coefficient sets of each PEFT block's parameter space. Moreover, we +propose an intra-block enhancement module, which introduces a linear projection +head whose weights are generated from a hyper-complex layer, further enhancing +the impact of the adjustment of projection directions on the entire parameter +space. Extensive experiments on diverse benchmarks demonstrate that our +proposed approach consistently improves the segmentation performance +significantly on novel scenarios with only around 1K additional parameters. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ MANUS: Markerless Grasp Capture using Articulated 3D Gaussians CVPR + + +
+ Understanding how we grasp objects with our hands has important applications +in areas like robotics and mixed reality. However, this challenging problem +requires accurate modeling of the contact between hands and objects. To capture +grasps, existing methods use skeletons, meshes, or parametric models that do +not represent hand shape accurately, resulting in inaccurate contacts. We +present MANUS, a method for Markerless Hand-Object Grasp Capture using +Articulated 3D Gaussians. We build a novel articulated 3D Gaussians +representation that extends 3D Gaussian splatting for high-fidelity +representation of articulating hands. Since our representation uses Gaussian +primitives, it enables us to efficiently and accurately estimate contacts +between the hand and the object. For the most accurate results, our method +requires tens of camera views that current datasets do not provide. We +therefore build MANUS-Grasps, a new dataset that contains hand-object grasps +viewed from 50+ cameras across 30+ scenes and 3 subjects, comprising over 7M +frames. In addition to extensive qualitative results, we also show that our +method outperforms others on a quantitative contact evaluation method that uses +paint transfer from the object to the hand. + +
+
+ comment: IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR) + 2024 +
+
+
+
+
+ + ♻ ☆ SkillDiffuser: Interpretable Hierarchical Planning via Skill + Abstractions in Diffusion-Based Task Execution CVPR 2024 + + +
+ Diffusion models have demonstrated strong potential for robotic trajectory +planning. However, generating coherent trajectories from high-level +instructions remains challenging, especially for long-range composition tasks +requiring multiple sequential skills. We propose SkillDiffuser, an end-to-end +hierarchical planning framework integrating interpretable skill learning with +conditional diffusion planning to address this problem. At the higher level, +the skill abstraction module learns discrete, human-understandable skill +representations from visual observations and language instructions. These +learned skill embeddings are then used to condition the diffusion model to +generate customized latent trajectories aligned with the skills. This allows +generating diverse state trajectories that adhere to the learnable skills. By +integrating skill learning with conditional trajectory generation, +SkillDiffuser produces coherent behavior following abstract instructions across +diverse tasks. Experiments on multi-task robotic manipulation benchmarks like +Meta-World and LOReL demonstrate state-of-the-art performance and +human-interpretable skill representations from SkillDiffuser. More +visualization results and information could be found on our website. + +
+
+ comment: Accepted by CVPR 2024. Camera ready version. Project page: + https://skilldiffuser.github.io/ +
+
+
+
+
+ + ♻ ☆ Synthesize Step-by-Step: Tools, Templates and LLMs as Data Generators + for Reasoning-Based Chart VQA CVPR 2024 + + +
+ Understanding data visualizations like charts and plots requires reasoning +about both visual elements and numerics. Although strong in extractive +questions, current chart visual question answering (chart VQA) models suffer on +complex reasoning questions. In this work, we address the lack of reasoning +ability by data augmentation. We leverage Large Language Models (LLMs), which +have shown to have strong reasoning ability, as an automatic data annotator +that generates question-answer annotations for chart images. The key innovation +in our method lies in the Synthesize Step-by-Step strategy: our LLM-based data +generator learns to decompose the complex question into step-by-step +sub-questions (rationales), which are then used to derive the final answer +using external tools, i.e. Python. This step-wise generation procedure is +trained on synthetic data generated using a template-based QA generation +pipeline. Experimental results highlight the significance of the proposed +step-by-step generation. By training with the LLM-augmented data (LAMENDA), we +significantly enhance the chart VQA models, achieving the state-of-the-art +accuracy on the ChartQA and PlotQA datasets. In particular, our approach +improves the accuracy of the previous state-of-the-art approach from 38% to 54% +on the human-written questions in the ChartQA dataset, which needs strong +reasoning. We hope our work underscores the potential of synthetic data and +encourages further exploration of data augmentation using LLMs for +reasoning-heavy tasks. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Exploring Accurate 3D Phenotyping in Greenhouse through Neural Radiance + Fields + + +
+ Accurate collection of plant phenotyping is critical to optimising +sustainable farming practices in precision agriculture. Traditional phenotyping +in controlled laboratory environments, while valuable, falls short in +understanding plant growth under real-world conditions. Emerging sensor and +digital technologies offer a promising approach for direct phenotyping of +plants in farm environments. This study investigates a learning-based +phenotyping method using the Neural Radiance Field to achieve accurate in-situ +phenotyping of pepper plants in greenhouse environments. To quantitatively +evaluate the performance of this method, traditional point cloud registration +on 3D scanning data is implemented for comparison. Experimental results show +that NeRF (Neural Radiance Fields) achieves competitive accuracy compared to the +3D scanning methods. The mean distance error between the scanner-based method +and the NeRF-based method is 0.865 mm. This study shows that the learning-based +NeRF method achieves similar accuracy to 3D scanning-based methods but with +improved scalability and robustness. + +
+
+
+
+
+ + ♻ ☆ Efficient Deep Learning-based Estimation of the Vital Signs on + Smartphones + + +
+ With the increasing use of smartphones in our daily lives, these devices have +become capable of performing many complex tasks. Concerning the need for +continuous monitoring of vital signs, especially for the elderly or those with +certain types of diseases, the development of algorithms that can estimate +vital signs using smartphones has attracted researchers worldwide. In +particular, researchers have been exploring ways to estimate vital signs, such +as heart rate, oxygen saturation levels, and respiratory rate, using algorithms +that can be run on smartphones. However, many of these algorithms require +multiple pre-processing steps that might introduce some implementation +overheads or require the design of a couple of hand-crafted stages to obtain an +optimal result. To address this issue, this research proposes a novel +end-to-end solution to mobile-based vital sign estimation using deep learning +that eliminates the need for pre-processing. By using a fully convolutional +architecture, the proposed model has much fewer parameters and less +computational complexity compared to the architectures that use fully-connected +layers as the prediction heads. This also reduces the risk of overfitting. +Additionally, a public dataset for vital sign estimation, which includes 62 +videos collected from 35 men and 27 women, is provided. Overall, the proposed +end-to-end approach promises significantly improved efficiency and performance +for on-device health monitoring on readily available consumer electronics. + +
+
+ comment: 10 pages, 8 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ A Comprehensive Study of Knowledge Editing for Large Language Models + + +
+ Large Language Models (LLMs) have shown extraordinary capabilities in +understanding and generating text that closely mirrors human communication. +However, a primary limitation lies in the significant computational demands +during training, arising from their extensive parameterization. This challenge +is further intensified by the dynamic nature of the world, necessitating +frequent updates to LLMs to correct outdated information or integrate new +knowledge, thereby ensuring their continued relevance. Note that many +applications demand continual model adjustments post-training to address +deficiencies or undesirable behaviors. There is an increasing interest in +efficient, lightweight methods for on-the-fly model modifications. To this end, +recent years have seen a burgeoning in the techniques of knowledge editing for +LLMs, which aim to efficiently modify LLMs' behaviors within specific domains +while preserving overall performance across various inputs. In this paper, we +first define the knowledge editing problem and then provide a comprehensive +review of cutting-edge approaches. Drawing inspiration from educational and +cognitive research theories, we propose a unified categorization criterion that +classifies knowledge editing methods into three groups: resorting to external +knowledge, merging knowledge into the model, and editing intrinsic knowledge. +Furthermore, we introduce a new benchmark, KnowEdit, for a comprehensive +empirical evaluation of representative knowledge editing approaches. +Additionally, we provide an in-depth analysis of knowledge location, which can +give a deeper understanding of the knowledge structures inherent within LLMs. +Finally, we discuss several potential applications of knowledge editing, +outlining its broad and impactful implications. + +
+
+ comment: Ongoing work; 52 pages, 282 citations; benchmark is available at + https://huggingface.co/datasets/zjunlp/KnowEdit code is available at + https://github.com/zjunlp/EasyEdit paper list is available at + https://github.com/zjunlp/KnowledgeEditingPapers +
+
+
+
+
+ + ♻ ☆ VisionKG: Unleashing the Power of Visual Datasets via Knowledge Graph ESWC 2024 + + +
+ The availability of vast amounts of visual data with heterogeneous features +is a key factor for developing, testing, and benchmarking of new computer +vision (CV) algorithms and architectures. Most visual datasets are created and +curated for specific tasks or with limited image data distribution for very +specific situations, and there is no unified approach to manage and access them +across diverse sources, tasks, and taxonomies. This not only creates +unnecessary overheads when building robust visual recognition systems, but also +introduces biases into learning systems and limits the capabilities of +data-centric AI. To address these problems, we propose the Vision Knowledge +Graph (VisionKG), a novel resource that interlinks, organizes and manages +visual datasets via knowledge graphs and Semantic Web technologies. It can +serve as a unified framework facilitating simple access and querying of +state-of-the-art visual datasets, regardless of their heterogeneous formats and +taxonomies. One of the key differences between our approach and existing +methods is that ours is knowledge-based rather than metadata-based. It enhances +the enrichment of the semantics at both image and instance levels and offers +various data retrieval and exploratory services via SPARQL. VisionKG currently +contains 519 million RDF triples that describe approximately 40 million +entities, and are accessible at https://vision.semkg.org and through APIs. With +the integration of 30 datasets and four popular CV tasks, we demonstrate its +usefulness across various scenarios when working with CV pipelines. + +
+
+ comment: Accepted at ESWC 2024 +
+
+
+
+
+ + ♻ ☆ GaussianAvatars: Photorealistic Head Avatars with Rigged 3D Gaussians + + +
+ We introduce GaussianAvatars, a new method to create photorealistic head +avatars that are fully controllable in terms of expression, pose, and +viewpoint. The core idea is a dynamic 3D representation based on 3D Gaussian +splats that are rigged to a parametric morphable face model. This combination +facilitates photorealistic rendering while allowing for precise animation +control via the underlying parametric model, e.g., through expression transfer +from a driving sequence or by manually changing the morphable model parameters. +We parameterize each splat by a local coordinate frame of a triangle and +optimize for explicit displacement offset to obtain a more accurate geometric +representation. During avatar reconstruction, we jointly optimize for the +morphable model parameters and Gaussian splat parameters in an end-to-end +fashion. We demonstrate the animation capabilities of our photorealistic avatar +in several challenging scenarios. For instance, we show reenactments from a +driving video, where our method outperforms existing works by a significant +margin. + +
+
+ comment: Project page: https://shenhanqian.github.io/gaussian-avatars +
+
+
+
+
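The core idea above, splats rigged to mesh triangles, can be illustrated by building an orthonormal frame per triangle and carrying a splat's local offset through mesh deformation. The frame construction below is a common convention (edge tangent, normal, and their cross product), not necessarily the paper's exact parameterization, and the example geometry is made up.

```python
# Carry a splat's local offset through a triangle's deformation (sketch).
import torch

def triangle_frame(v0, v1, v2):
    """Return (origin, 3x3 rotation) for a triangle given its vertices (3,)."""
    e1 = v1 - v0
    n = torch.linalg.cross(e1, v2 - v0)
    t = e1 / e1.norm()
    n = n / n.norm()
    b = torch.linalg.cross(n, t)
    return (v0 + v1 + v2) / 3.0, torch.stack([t, b, n], dim=1)  # columns = axes

def local_to_world(offset_local, v0, v1, v2):
    origin, R = triangle_frame(v0, v1, v2)
    return origin + R @ offset_local

# A splat sits 1 cm along the normal of a rest-pose triangle...
rest = [torch.tensor([0.0, 0.0, 0.0]), torch.tensor([1.0, 0.0, 0.0]), torch.tensor([0.0, 1.0, 0.0])]
_, R_rest = triangle_frame(*rest)
offset = R_rest.T @ (torch.tensor([1 / 3, 1 / 3, 0.01]) - sum(rest) / 3.0)
# ...and follows the triangle when the mesh (morphable model) deforms.
posed = [v + torch.tensor([0.0, 0.0, 0.5]) for v in rest]
print(local_to_world(offset, *posed))
```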
+ + ♻ ☆ DiffusionPoser: Real-time Human Motion Reconstruction From Arbitrary + Sparse Sensors Using Autoregressive Diffusion CVPR2024 + + +
+ Motion capture from a limited number of body-worn sensors, such as inertial +measurement units (IMUs) and pressure insoles, has important applications in +health, human performance, and entertainment. Recent work has focused on +accurately reconstructing whole-body motion from a specific sensor +configuration using six IMUs. While a common goal across applications is to use +the minimal number of sensors to achieve required accuracy, the optimal +arrangement of the sensors might differ from application to application. We +propose a single diffusion model, DiffusionPoser, which reconstructs human +motion in real-time from an arbitrary combination of sensors, including IMUs +placed at specified locations and pressure insoles. Unlike existing methods, +our model grants users the flexibility to determine the number and arrangement +of sensors tailored to the specific activity of interest, without the need for +retraining. A novel autoregressive inferencing scheme ensures real-time motion +reconstruction that closely aligns with measured sensor signals. The generative +nature of DiffusionPoser ensures realistic behavior, even for +degrees-of-freedom not directly measured. Qualitative results can be found on +our website: https://diffusionposer.github.io/. + +
+
+ comment: accepted at CVPR2024 +
+
+
+
+
+ + ♻ ☆ Learning to reconstruct the bubble distribution with conductivity maps + using Invertible Neural Networks and Error Diffusion + + +
+ Electrolysis is crucial for eco-friendly hydrogen production, but gas bubbles +generated during the process hinder reactions, reduce cell efficiency, and +increase energy consumption. Additionally, these gas bubbles cause changes in +the conductivity inside the cell, resulting in corresponding variations in the +induced magnetic field around the cell. Therefore, measuring these gas +bubble-induced magnetic field fluctuations using external magnetic sensors and +solving the inverse problem of Biot-Savart Law allows for estimating the +conductivity in the cell and, thus, bubble size and location. However, +determining high-resolution conductivity maps from only a few induced magnetic +field measurements is an ill-posed inverse problem. To overcome this, we +exploit Invertible Neural Networks (INNs) to reconstruct the conductivity +field. Our qualitative results and quantitative evaluation using random error +diffusion show that INN achieves far superior performance compared to Tikhonov +regularization. + +
+
+ comment: Accepted for Oral presentation at WCIPT11 (11th World Congress on + Industrial Process Tomography) +
+
+
+
+
+ + ♻ ☆ Detoxifying Large Language Models via Knowledge Editing + + +
+ This paper investigates using knowledge editing techniques to detoxify Large +Language Models (LLMs). We construct a benchmark, SafeEdit, which covers nine +unsafe categories with various powerful attack prompts and equips comprehensive +metrics for systematic evaluation. We conduct experiments with several +knowledge editing approaches, indicating that knowledge editing has the +potential to efficiently detoxify LLMs with limited impact on general +performance. Then, we propose a simple yet effective baseline, dubbed +Detoxifying with Intraoperative Neural Monitoring (DINM), to diminish the +toxicity of LLMs within a few tuning steps via only one instance. We further +provide an in-depth analysis of the internal mechanism for various detoxify +approaches, demonstrating that previous methods like SFT and DPO may merely +suppress the activations of toxic parameters, while DINM mitigates the toxicity +of the toxic parameters to a certain extent, making permanent adjustments. We +hope that these insights could shed light on future work of developing +detoxifying approaches and the underlying knowledge mechanisms of LLMs. Code +and benchmark are available at https://github.com/zjunlp/EasyEdit. + +
+
+ comment: Ongoing work. Project website: + https://zjunlp.github.io/project/SafeEdit Due to the specificity of the + knowledge editing setting, we revise Tables 1 and 3 to present a fair + comparison of experimental results. More experimental results will be updated + soon +
+
+
+
+
+ + ♻ ☆ Fake or JPEG? Revealing Common Biases in Generated Image Detection + Datasets + + +
+ The widespread adoption of generative image models has highlighted the urgent +need to detect artificial content, which is a crucial step in combating +widespread manipulation and misinformation. Consequently, numerous detectors +and associated datasets have emerged. However, many of these datasets +inadvertently introduce undesirable biases, thereby impacting the effectiveness +and evaluation of detectors. In this paper, we emphasize that many datasets for +AI-generated image detection contain biases related to JPEG compression and +image size. Using the GenImage dataset, we demonstrate that detectors indeed +learn from these undesired factors. Furthermore, we show that removing the +named biases substantially increases robustness to JPEG compression and +significantly alters the cross-generator performance of evaluated detectors. +Specifically, it leads to more than 11 percentage points increase in +cross-generator performance for ResNet50 and Swin-T detectors on the GenImage +dataset, achieving state-of-the-art results. + We provide the dataset and source codes of this paper on the anonymous +website: https://www.unbiased-genimage.org + +
+
+
+
+
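A practical consequence of the biases highlighted above is that training pipelines may want to randomize compression and size for all images so a detector cannot use them as shortcuts. The following is a simple augmentation sketch along those lines; the quality and scale ranges are illustrative choices, not the paper's settings.

```python
# Random re-compression and resizing to break JPEG/size shortcuts (sketch).
import io
import random
from PIL import Image

def debias_augment(img: Image.Image) -> Image.Image:
    # Random resize (both up and down) to break image-size cues.
    scale = random.uniform(0.5, 1.5)
    w, h = img.size
    img = img.resize((max(1, int(w * scale)), max(1, int(h * scale))))
    # Random JPEG re-compression to break compression-artifact cues.
    quality = random.randint(40, 95)
    buf = io.BytesIO()
    img.convert("RGB").save(buf, format="JPEG", quality=quality)
    buf.seek(0)
    return Image.open(buf).convert("RGB")

augmented = debias_augment(Image.new("RGB", (256, 256), color=(120, 80, 40)))
print(augmented.size)
```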
+ + ♻ ☆ Segment Every Out-of-Distribution Object + + +
+ Semantic segmentation models, while effective for in-distribution categories, +face challenges in real-world deployment due to encountering +out-of-distribution (OoD) objects. Detecting these OoD objects is crucial for +safety-critical applications. Existing methods rely on anomaly scores, but +choosing a suitable threshold for generating masks presents difficulties and +can lead to fragmentation and inaccuracy. This paper introduces a method to +convert anomaly \textbf{S}core \textbf{T}o segmentation \textbf{M}ask, called +S2M, a simple and effective framework for OoD detection in semantic +segmentation. Unlike assigning anomaly scores to pixels, S2M directly segments +the entire OoD object. By transforming anomaly scores into prompts for a +promptable segmentation model, S2M eliminates the need for threshold selection. +Extensive experiments demonstrate that S2M outperforms the state-of-the-art by +approximately 20% in IoU and 40% in mean F1 score, on average, across various +benchmarks including Fishyscapes, Segment-Me-If-You-Can, and RoadAnomaly +datasets. + +
+
+ comment: 20 pages, 14 figures +
+
+
+
+
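The score-to-prompt idea above can be pictured as follows: threshold the anomaly score map only to locate candidate regions, then emit prompts (here, boxes) for a promptable segmenter such as SAM, rather than using the threshold to produce the final mask. The percentile threshold, minimum region size, and the choice of box prompts are illustrative assumptions.

```python
# Turn an anomaly score map into box prompts for a promptable segmenter (sketch).
import numpy as np
from scipy import ndimage

def score_map_to_box_prompts(score: np.ndarray, percentile=95.0, min_area=50):
    """score: (H, W) per-pixel anomaly scores -> list of (x0, y0, x1, y1) boxes."""
    binary = score > np.percentile(score, percentile)
    labels, num = ndimage.label(binary)           # connected anomalous regions
    boxes = []
    for sl in ndimage.find_objects(labels):
        ys, xs = sl
        if (ys.stop - ys.start) * (xs.stop - xs.start) >= min_area:
            boxes.append((xs.start, ys.start, xs.stop, ys.stop))
    return boxes  # feed these as box prompts to the promptable segmentation model

score = np.zeros((128, 128)); score[40:60, 70:100] = 5.0
print(score_map_to_box_prompts(score))            # [(70, 40, 100, 60)]
```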
+ + ♻ ☆ Towards Low-Energy Adaptive Personalization for Resource-Constrained + Devices + + +
+ The personalization of machine learning (ML) models to address data drift is +a significant challenge in the context of Internet of Things (IoT) +applications. Presently, most approaches focus on fine-tuning either the full +base model or its last few layers to adapt to new data, while often neglecting +energy costs. However, various types of data drift exist, and fine-tuning the +full base model or the last few layers may not result in optimal performance in +certain scenarios. We propose Target Block Fine-Tuning (TBFT), a low-energy +adaptive personalization framework designed for resource-constrained devices. +We categorize data drift and personalization into three types: input-level, +feature-level, and output-level. For each type, we fine-tune different blocks +of the model to achieve optimal performance with reduced energy costs. +Specifically, input-, feature-, and output-level correspond to fine-tuning the +front, middle, and rear blocks of the model. We evaluate TBFT on a ResNet +model, three datasets, three different training sizes, and a Raspberry Pi. +Compared with the $Block Avg$, where each block is fine-tuned individually and +their performance improvements are averaged, TBFT exhibits an improvement in +model accuracy by an average of 15.30% whilst saving 41.57% energy consumption +on average compared with full fine-tuning. + +
+
+ comment: Accepted to The 4th Workshop on Machine Learning and Systems + (EuroMLSys '24) +
+
+
+
+
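Target-block fine-tuning in the spirit of TBFT can be sketched by freezing a backbone and unfreezing only the block that matches the expected drift type (front, middle, or rear). The mapping from drift type to ResNet stage below, and keeping the classifier head trainable, are illustrative readings of the abstract, not the authors' exact configuration.

```python
# Freeze a ResNet and unfreeze only the block targeted by the drift type (sketch).
import torch
from torchvision import models

def prepare_tbft(drift: str, num_classes: int = 10):
    model = models.resnet18(weights=None)
    model.fc = torch.nn.Linear(model.fc.in_features, num_classes)
    for p in model.parameters():
        p.requires_grad = False
    target = {"input": model.layer1,       # input-level drift -> front block
              "feature": model.layer3,     # feature-level drift -> middle block
              "output": model.layer4}[drift]  # output-level drift -> rear block
    for p in target.parameters():
        p.requires_grad = True
    for p in model.fc.parameters():        # keep the (new) classifier trainable
        p.requires_grad = True
    return model

model = prepare_tbft("feature")
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"trainable parameters: {trainable}")
```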
+ + ♻ ☆ ViTAR: Vision Transformer with Any Resolution + + +
+ This paper tackles a significant challenge faced by Vision Transformers +(ViTs): their constrained scalability across different image resolutions. +Typically, ViTs experience a performance decline when processing resolutions +different from those seen during training. Our work introduces two key +innovations to address this issue. Firstly, we propose a novel module for +dynamic resolution adjustment, designed with a single Transformer block, +specifically to achieve highly efficient incremental token integration. +Secondly, we introduce fuzzy positional encoding in the Vision Transformer to +provide consistent positional awareness across multiple resolutions, thereby +preventing overfitting to any single training resolution. Our resulting model, +ViTAR (Vision Transformer with Any Resolution), demonstrates impressive +adaptability, achieving 83.3\% top-1 accuracy at a 1120x1120 resolution and +80.4\% accuracy at a 4032x4032 resolution, all while reducing computational +costs. ViTAR also shows strong performance in downstream tasks such as instance +and semantic segmentation and can easily be combined with self-supervised learning +techniques like Masked AutoEncoder. Our work provides a cost-effective solution +for enhancing the resolution scalability of ViTs, paving the way for more +versatile and efficient high-resolution image processing. + +
+
+
+
+
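One way to picture the "fuzzy" positional encoding described above: sample a learnable 2-D positional embedding map at token coordinates jittered by uniform noise during training, so the model never overfits to one exact grid. The sketch below is an illustrative reading of the idea, not the ViTAR code; the table size, jitter range, and interpolation choice are assumptions.

```python
# Jittered, interpolated positional embeddings for any token grid (sketch).
import torch
import torch.nn as nn
import torch.nn.functional as F

class FuzzyPosEmbed(nn.Module):
    def __init__(self, dim=192, grid=16):
        super().__init__()
        self.table = nn.Parameter(torch.randn(1, dim, grid, grid) * 0.02)

    def forward(self, h, w, training=True):
        ys, xs = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
        coords = torch.stack([xs, ys], dim=-1).float()       # (h, w, 2)
        if training:
            coords = coords + torch.rand_like(coords) - 0.5  # jitter by U(-0.5, 0.5)
        # Normalize to [-1, 1] for grid_sample, then bilinearly interpolate.
        norm = torch.stack([coords[..., 0] / max(w - 1, 1),
                            coords[..., 1] / max(h - 1, 1)], dim=-1) * 2 - 1
        pe = F.grid_sample(self.table, norm[None], mode="bilinear", align_corners=True)
        return pe.flatten(2).transpose(1, 2)                 # (1, h*w, dim)

pe = FuzzyPosEmbed()(14, 14)
print(pe.shape)                                              # torch.Size([1, 196, 192])
```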
+ + ♻ ☆ Self-Discovering Interpretable Diffusion Latent Directions for + Responsible Text-to-Image Generation CVPR 2024 + + +
+ Diffusion-based models have gained significant popularity for text-to-image +generation due to their exceptional image-generation capabilities. A risk with +these models is the potential generation of inappropriate content, such as +biased or harmful images. However, the underlying reasons for generating such +undesired content from the perspective of the diffusion model's internal +representation remain unclear. Previous work interprets vectors in an +interpretable latent space of diffusion models as semantic concepts. However, +existing approaches cannot discover directions for arbitrary concepts, such as +those related to inappropriate concepts. In this work, we propose a novel +self-supervised approach to find interpretable latent directions for a given +concept. With the discovered vectors, we further propose a simple approach to +mitigate inappropriate generation. Extensive experiments have been conducted to +verify the effectiveness of our mitigation approach, namely, for fair +generation, safe generation, and responsible text-enhancing generation. Project +page: \url{https://interpretdiffusion.github.io}. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Sparse 3D Reconstruction via Object-Centric Ray Sampling + + +
+ We propose a novel method for 3D object reconstruction from a sparse set of
+views captured from a 360-degree calibrated camera rig. We represent the
+object surface through a hybrid model that uses both an MLP-based neural
+representation and a triangle mesh. A key contribution in our work is a novel
+object-centric sampling scheme of the neural representation, where rays are
+shared among all views. This efficiently concentrates and reduces the number
+of samples used to update the neural model at each iteration. This sampling
+scheme relies on the mesh representation to also ensure that samples are
+well-distributed along its normals. The rendering is then performed
+efficiently by a differentiable renderer. We demonstrate that this sampling
+scheme results in a more effective training of the neural representation, does
+not require the additional supervision of segmentation masks, yields
+state-of-the-art 3D reconstructions, and works with sparse views on Google's
+Scanned Objects, Tanks and Temples, and MVMC Car datasets. Code available at:
+https://github.com/llukmancerkezi/ROSTER
+
+
+
+
+
+ + ♻ ☆ OpenGraph: Open-Vocabulary Hierarchical 3D Graph Representation in + Large-Scale Outdoor Environments + + +
+ Environment representations endowed with sophisticated semantics are pivotal +for facilitating seamless interaction between robots and humans, enabling them +to effectively carry out various tasks. Open-vocabulary maps, powered by +Visual-Language models (VLMs), possess inherent advantages, including zero-shot +learning and support for open-set classes. However, existing open-vocabulary +maps are primarily designed for small-scale environments, such as desktops or +rooms, and are typically geared towards limited-area tasks involving robotic +indoor navigation or in-place manipulation. They face challenges in direct +generalization to outdoor environments characterized by numerous objects and +complex tasks, owing to limitations in both understanding level and map +structure. In this work, we propose OpenGraph, the first open-vocabulary +hierarchical graph representation designed for large-scale outdoor +environments. OpenGraph initially extracts instances and their captions from +visual images, enhancing textual reasoning by encoding them. Subsequently, it +achieves 3D incremental object-centric mapping with feature embedding by +projecting images onto LiDAR point clouds. Finally, the environment is +segmented based on lane graph connectivity to construct a hierarchical graph. +Validation results from public dataset SemanticKITTI demonstrate that OpenGraph +achieves the highest segmentation and query accuracy. The source code of +OpenGraph is publicly available at https://github.com/BIT-DYN/OpenGraph. + +
+
+
+
+
+ + ♻ ☆ Manifold Constraint Regularization for Remote Sensing Image Generation + + +
+ Generative Adversarial Networks (GANs) have shown notable accomplishments in
+the remote sensing domain. However, this paper reveals that their performance
+on remote sensing images falls short when compared to their impressive results
+with natural images. This study identifies a previously overlooked issue: GANs
+exhibit a heightened susceptibility to overfitting on remote sensing images.
+To address this challenge, this paper analyzes the characteristics of remote
+sensing images and proposes manifold constraint regularization, a novel
+approach that tackles overfitting of GANs on remote sensing images for the
+first time. Our method includes a new measure for evaluating the structure of
+the data manifold. Leveraging this measure, we propose the manifold constraint
+regularization term, which not only alleviates the overfitting problem, but
+also promotes alignment between the generated and real data manifolds, leading
+to enhanced quality in the generated images. The effectiveness and versatility
+of this method have been corroborated through extensive validation on various
+remote sensing datasets and GAN models. The proposed method not only enhances
+the quality of the generated images, reflected in a 3.13\% improvement in
+Frechet Inception Distance (FID) score, but also boosts the performance of the
+GANs on downstream tasks, evidenced by a 3.76\% increase in classification
+accuracy.
+
+
+
+
+
+ + ♻ ☆ WinSyn: A High Resolution Testbed for Synthetic Data + + +
+ We present WinSyn, a unique dataset and testbed for creating high-quality +synthetic data with procedural modeling techniques. The dataset contains +high-resolution photographs of windows, selected from locations around the +world, with 89,318 individual window crops showcasing diverse geometric and +material characteristics. We evaluate a procedural model by training semantic +segmentation networks on both synthetic and real images and then comparing +their performances on a shared test set of real images. Specifically, we +measure the difference in mean Intersection over Union (mIoU) and determine the +effective number of real images to match synthetic data's training performance. +We design a baseline procedural model as a benchmark and provide 21,290 +synthetically generated images. By tuning the procedural model, key factors are +identified which significantly influence the model's fidelity in replicating +real-world scenarios. Importantly, we highlight the challenge of procedural +modeling using current techniques, especially in their ability to replicate the +spatial semantics of real-world scenarios. This insight is critical because of +the potential of procedural models to bridge to hidden scene aspects such as +depth, reflectivity, material properties, and lighting conditions. + +
+
+ comment: CVPR version
+
+
+
+
+ + ♻ ☆ Frequency-Adaptive Dilated Convolution for Semantic Segmentation + + +
+ Dilated convolution, which expands the receptive field by inserting gaps +between its consecutive elements, is widely employed in computer vision. In +this study, we propose three strategies to improve individual phases of dilated +convolution from the view of spectrum analysis. Departing from the conventional +practice of fixing a global dilation rate as a hyperparameter, we introduce +Frequency-Adaptive Dilated Convolution (FADC), which dynamically adjusts +dilation rates spatially based on local frequency components. Subsequently, we +design two plug-in modules to directly enhance effective bandwidth and +receptive field size. The Adaptive Kernel (AdaKern) module decomposes +convolution weights into low-frequency and high-frequency components, +dynamically adjusting the ratio between these components on a per-channel +basis. By increasing the high-frequency part of convolution weights, AdaKern +captures more high-frequency components, thereby improving effective bandwidth. +The Frequency Selection (FreqSelect) module optimally balances high- and +low-frequency components in feature representations through spatially variant +reweighting. It suppresses high frequencies in the background to encourage FADC +to learn a larger dilation, thereby increasing the receptive field for an +expanded scope. Extensive experiments on segmentation and object detection +consistently validate the efficacy of our approach. The code is publicly +available at \url{https://github.com/Linwei-Chen/FADC}. + +
+
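The AdaKern idea of splitting kernels into low- and high-frequency parts can be illustrated with a small PyTorch module. The sketch below is my simplified reading (static per-channel gains instead of the paper's dynamic, per-input adjustment) and not the released FADC code.

```python
# Minimal sketch: split a conv kernel into its low-frequency part (per-kernel
# spatial mean) and high-frequency residual, then re-balance the two parts with
# learnable per-output-channel gains.
import torch
import torch.nn as nn
import torch.nn.functional as F

class AdaKernConv2d(nn.Module):
    def __init__(self, in_ch: int, out_ch: int, k: int = 3, dilation: int = 1):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(out_ch, in_ch, k, k))
        nn.init.kaiming_normal_(self.weight)
        self.bias = nn.Parameter(torch.zeros(out_ch))
        # per-output-channel gains for the low- and high-frequency components
        self.alpha_low = nn.Parameter(torch.ones(out_ch, 1, 1, 1))
        self.alpha_high = nn.Parameter(torch.ones(out_ch, 1, 1, 1))
        self.dilation = dilation
        self.padding = dilation * (k // 2)

    def forward(self, x):
        w_low = self.weight.mean(dim=(2, 3), keepdim=True)  # spatial mean = low freq
        w_high = self.weight - w_low                         # residual = high freq
        w = self.alpha_low * w_low + self.alpha_high * w_high
        return F.conv2d(x, w, self.bias, padding=self.padding, dilation=self.dilation)

y = AdaKernConv2d(16, 32)(torch.randn(2, 16, 64, 64))  # -> (2, 32, 64, 64)
```

Raising the high-frequency gain makes the effective kernel respond more strongly to fine detail, which is the bandwidth effect the abstract describes.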
+
+
+
+ + ♻ ☆ MRFP: Learning Generalizable Semantic Segmentation from Sim-2-Real with + Multi-Resolution Feature Perturbation CVPR 2024 + + +
+ Deep neural networks have shown exemplary performance on semantic scene
+understanding tasks on source domains, but due to the absence of style
+diversity during training, enhancing performance on unseen target domains
+using only single source domain data remains a challenging task. Generating
+simulated data is a feasible alternative to collecting large style-diverse
+real-world datasets, which is a cumbersome and budget-intensive process.
+However, the large domain-specific inconsistencies between simulated and
+real-world data pose a significant generalization challenge in semantic
+segmentation. In this work, to alleviate this problem, we propose a novel
+MultiResolution Feature Perturbation (MRFP) technique to randomize
+domain-specific fine-grained features and perturb the style of coarse
+features. Our experimental results on various urban-scene segmentation
+datasets clearly indicate that, along with the perturbation of
+style-information, perturbation of fine-feature components is paramount to
+learning domain-invariant robust feature maps for semantic segmentation
+models. MRFP is a simple, computationally efficient, and transferable module
+with no additional learnable parameters or objective functions that helps
+state-of-the-art deep neural networks to learn robust domain-invariant
+features for simulation-to-real semantic segmentation.
+
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Continual Learning: Applications and the Road Forward + + +
+ Continual learning is a subfield of machine learning, which aims to allow +machine learning models to continuously learn on new data, by accumulating +knowledge without forgetting what was learned in the past. In this work, we +take a step back, and ask: "Why should one care about continual learning in the +first place?". We set the stage by examining recent continual learning papers +published at four major machine learning conferences, and show that +memory-constrained settings dominate the field. Then, we discuss five open +problems in machine learning, and even though they might seem unrelated to +continual learning at first sight, we show that continual learning will +inevitably be part of their solution. These problems are model editing, +personalization and specialization, on-device learning, faster (re-)training +and reinforcement learning. Finally, by comparing the desiderata from these +unsolved problems and the current assumptions in continual learning, we +highlight and discuss four future directions for continual learning research. +We hope that this work offers an interesting perspective on the future of +continual learning, while displaying its potential value and the paths we have +to pursue in order to make it successful. This work is the result of the many +discussions the authors had at the Dagstuhl seminar on Deep Continual Learning, +in March 2023. + +
+
+
+
+
+ + ♻ ☆ TimeChat: A Time-sensitive Multimodal Large Language Model for Long + Video Understanding CVPR 2024 + + +
+ This work proposes TimeChat, a time-sensitive multimodal large language model +specifically designed for long video understanding. Our model incorporates two +key architectural contributions: (1) a timestamp-aware frame encoder that binds +visual content with the timestamp of each frame, and (2) a sliding video +Q-Former that produces a video token sequence of varying lengths to accommodate +videos of various durations. Additionally, we construct an instruction-tuning +dataset, encompassing 6 tasks and a total of 125K instances, to further enhance +TimeChat's instruction-following performance. Experiment results across various +video understanding tasks, such as dense captioning, temporal grounding, and +highlight detection, demonstrate TimeChat's strong zero-shot temporal +localization and reasoning capabilities. For example, it achieves +9.2 F1 score +and +2.8 CIDEr on YouCook2, +5.8 HIT@1 on QVHighlights, and +27.5 R@1 (IoU=0.5) +on Charades-STA, compared to state-of-the-art video large language models, +holding the potential to serve as a versatile video assistant for long-form +video comprehension tasks and satisfy realistic user requirements. + +
+
+ comment: CVPR 2024 camera-ready version, code is available at + https://github.com/RenShuhuai-Andy/TimeChat +
+
+
+
+
+ + ♻ ☆ RFAConv: Innovating Spatial Attention and Standard Convolutional + Operation + + +
+ Spatial attention has been widely used to improve the performance of
+convolutional neural networks. However, it has certain limitations. In this
+paper, we propose a new perspective on the effectiveness of spatial attention,
+which is that the spatial attention mechanism essentially solves the problem
+of convolutional kernel parameter sharing. However, the information contained
+in the attention map generated by spatial attention is not sufficient for
+large-size convolutional kernels. Therefore, we propose a novel attention
+mechanism called Receptive-Field Attention (RFA). Existing spatial attention
+mechanisms, such as the Convolutional Block Attention Module (CBAM) and
+Coordinate Attention (CA), focus only on spatial features, which does not
+fully address the problem of convolutional kernel parameter sharing. In
+contrast, RFA not only focuses on the receptive-field spatial feature but also
+provides effective attention weights for large-size convolutional kernels. The
+Receptive-Field Attention convolutional operation (RFAConv), developed from
+RFA, represents a new approach to replacing the standard convolution
+operation. It adds a nearly negligible increment in computational cost and
+parameters, while significantly improving network performance. We conducted a
+series of experiments on ImageNet-1k, COCO, and VOC datasets to demonstrate
+the superiority of our approach. Of particular importance, we believe that it
+is time to shift focus from spatial features to receptive-field spatial
+features for current spatial attention mechanisms. In this way, we can further
+improve network performance and achieve even better results. The code and
+pre-trained models for the relevant tasks can be found at
+https://github.com/Liuchen1997/RFAConv.
+
+
+ comment: 12 pages, 11 figures
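A rough sketch of a receptive-field attention convolution, in the spirit of RFAConv but not the official implementation: unfold each k x k receptive field, weight its positions with a softmax attention, then apply a non-overlapping stride-k convolution over the rearranged fields. The attention generator below is a deliberately simplified assumption.

```python
# Minimal sketch: attention over each k x k receptive field before a stride-k
# convolution, so the weighting is not shared across field positions.
import torch
import torch.nn as nn
import torch.nn.functional as F

class RFAConvSketch(nn.Module):
    def __init__(self, in_ch: int, out_ch: int, k: int = 3):
        super().__init__()
        self.k = k
        # one attention logit per receptive-field position and channel
        self.attn = nn.Conv2d(in_ch, in_ch * k * k, kernel_size=1)
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size=k, stride=k)

    def forward(self, x):
        b, c, h, w = x.shape
        k = self.k
        patches = F.unfold(x, k, padding=k // 2)          # (b, c*k*k, h*w)
        patches = patches.view(b, c, k * k, h, w)
        logits = self.attn(x).view(b, c, k * k, h, w)
        weighted = patches * logits.softmax(dim=2)        # attend within each field
        # lay the k*k field out spatially and apply a non-overlapping k x k conv
        weighted = weighted.view(b, c, k, k, h, w).permute(0, 1, 4, 2, 5, 3)
        weighted = weighted.reshape(b, c, h * k, w * k)
        return self.conv(weighted)                        # (b, out_ch, h, w)

y = RFAConvSketch(8, 16)(torch.randn(2, 8, 32, 32))       # -> (2, 16, 32, 32)
```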
+
+
+
+
+ + ♻ ☆ DoseDiff: Distance-aware Diffusion Model for Dose Prediction in + Radiotherapy + + +
+ Treatment planning, which is a critical component of the radiotherapy +workflow, is typically carried out by a medical physicist in a time-consuming +trial-and-error manner. Previous studies have proposed knowledge-based or +deep-learning-based methods for predicting dose distribution maps to assist +medical physicists in improving the efficiency of treatment planning. However, +these dose prediction methods usually fail to effectively utilize distance +information between surrounding tissues and targets or organs-at-risk (OARs). +Moreover, they are poor at maintaining the distribution characteristics of ray +paths in the predicted dose distribution maps, resulting in a loss of valuable +information. In this paper, we propose a distance-aware diffusion model +(DoseDiff) for precise prediction of dose distribution. We define dose +prediction as a sequence of denoising steps, wherein the predicted dose +distribution map is generated with the conditions of the computed tomography +(CT) image and signed distance maps (SDMs). The SDMs are obtained by distance +transformation from the masks of targets or OARs, which provide the distance +from each pixel in the image to the outline of the targets or OARs. We further +propose a multi-encoder and multi-scale fusion network (MMFNet) that +incorporates multi-scale and transformer-based fusion modules to enhance +information fusion between the CT image and SDMs at the feature level. We +evaluate our model on two in-house datasets and a public dataset, respectively. +The results demonstrate that our DoseDiff method outperforms state-of-the-art +dose prediction methods in terms of both quantitative performance and visual +quality. + +
+
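The signed distance maps (SDMs) used as conditioning can be computed from binary masks with a standard Euclidean distance transform; the sketch below shows one common convention (negative inside, positive outside) and is not taken from the DoseDiff code.

```python
# Minimal sketch: turn a binary target/OAR mask into a signed distance map.
import numpy as np
from scipy.ndimage import distance_transform_edt

def signed_distance_map(mask: np.ndarray) -> np.ndarray:
    """mask: boolean array, True inside the target or organ-at-risk."""
    mask = mask.astype(bool)
    outside = distance_transform_edt(~mask)  # distance to the structure, outside it
    inside = distance_transform_edt(mask)    # distance to the boundary, inside it
    return outside - inside                  # roughly zero along the outline

mask = np.zeros((128, 128), dtype=bool)
mask[40:90, 50:100] = True
sdm = signed_distance_map(mask)              # conditioning channel alongside the CT
```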
+
+
+
+ + ♻ ☆ H2ASeg: Hierarchical Adaptive Interaction and Weighting Network for + Tumor Segmentation in PET/CT Images + + +
+ Positron emission tomography (PET) combined with computed tomography (CT) +imaging is routinely used in cancer diagnosis and prognosis by providing +complementary information. Automatically segmenting tumors in PET/CT images can +significantly improve examination efficiency. Traditional multi-modal +segmentation solutions mainly rely on concatenation operations for modality +fusion, which fail to effectively model the non-linear dependencies between PET +and CT modalities. Recent studies have investigated various approaches to +optimize the fusion of modality-specific features for enhancing joint +representations. However, modality-specific encoders used in these methods +operate independently, inadequately leveraging the synergistic relationships +inherent in PET and CT modalities, for example, the complementarity between +semantics and structure. To address these issues, we propose a Hierarchical +Adaptive Interaction and Weighting Network termed H2ASeg to explore the +intrinsic cross-modal correlations and transfer potential complementary +information. Specifically, we design a Modality-Cooperative Spatial Attention +(MCSA) module that performs intra- and inter-modal interactions globally and +locally. Additionally, a Target-Aware Modality Weighting (TAMW) module is +developed to highlight tumor-related features within multi-modal features, +thereby refining tumor segmentation. By embedding these modules across +different layers, H2ASeg can hierarchically model cross-modal correlations, +enabling a nuanced understanding of both semantic and structural tumor +features. Extensive experiments demonstrate the superiority of H2ASeg, +outperforming state-of-the-art methods on AutoPet-II and Hecktor2022 +benchmarks. The code is released at https://github.com/JinPLu/H2ASeg. + +
+
+ comment: 10 pages, 4 figures
+
+
+
+
+ + ♻ ☆ EgoThink: Evaluating First-Person Perspective Thinking Capability of + Vision-Language Models + + +
+ Vision-language models (VLMs) have recently shown promising results in +traditional downstream tasks. Evaluation studies have emerged to assess their +abilities, with the majority focusing on the third-person perspective, and only +a few addressing specific tasks from the first-person perspective. However, the +capability of VLMs to "think" from a first-person perspective, a crucial +attribute for advancing autonomous agents and robotics, remains largely +unexplored. To bridge this research gap, we introduce EgoThink, a novel visual +question-answering benchmark that encompasses six core capabilities with twelve +detailed dimensions. The benchmark is constructed using selected clips from +egocentric videos, with manually annotated question-answer pairs containing +first-person information. To comprehensively assess VLMs, we evaluate eighteen +popular VLMs on EgoThink. Moreover, given the open-ended format of the answers, +we use GPT-4 as the automatic judge to compute single-answer grading. +Experimental results indicate that although GPT-4V leads in numerous +dimensions, all evaluated VLMs still possess considerable potential for +improvement in first-person perspective tasks. Meanwhile, enlarging the number +of trainable parameters has the most significant impact on model performance on +EgoThink. In conclusion, EgoThink serves as a valuable addition to existing +evaluation benchmarks for VLMs, providing an indispensable resource for future +research in the realm of embodied artificial intelligence and robotics. + +
+
+
+
+
+ + ♻ ☆ Data-free Defense of Black Box Models Against Adversarial Attacks CVPR + + +
+ Several companies often safeguard their trained deep models (i.e., details of
+architecture, learnt weights, training details, etc.) from third-party users
+by exposing them only as black boxes through APIs. Moreover, they may not even
+provide access to the training data due to proprietary reasons or sensitivity
+concerns. In this work, we propose a novel defense mechanism for black box
+models against adversarial attacks in a data-free setup. We construct
+synthetic data via a generative model and train a surrogate network using
+model stealing techniques. To minimize adversarial contamination on perturbed
+samples, we propose a 'wavelet noise remover' (WNR) that performs discrete
+wavelet decomposition on input images and carefully selects only a few
+important coefficients, determined by our 'wavelet coefficient selection
+module' (WCSM). To recover the high-frequency content of the image after noise
+removal via WNR, we further train a 'regenerator' network with an objective to
+retrieve the coefficients such that the reconstructed image yields predictions
+on the surrogate model similar to those of the original image. At test time,
+WNR combined with the trained regenerator network is prepended to the black
+box network, resulting in a high boost in adversarial accuracy. Our method
+improves the adversarial accuracy on CIFAR-10 by 38.98% and 32.01% on
+state-of-the-art AutoAttack compared to the baseline, even when the attacker
+uses a surrogate architecture (Alexnet-half or Alexnet) similar to the black
+box architecture (Alexnet) with the same model stealing strategy as the
+defender. The code is available at
+https://github.com/vcl-iisc/data-free-black-box-defense
+
+
+ comment: CVPR Workshop (Under Review) +
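A minimal sketch of the wavelet-filtering step described above, assuming a single-level 2D DWT and a simple magnitude threshold in place of the learned WCSM; it is an illustration of the idea, not the released defense.

```python
# Minimal sketch: keep only the largest-magnitude detail coefficients of a 2D
# DWT and reconstruct; small high-frequency coefficients, where much of the
# adversarial perturbation is assumed to live, are discarded.
import numpy as np
import pywt

def wavelet_denoise(img: np.ndarray, keep_ratio: float = 0.1, wavelet: str = "haar"):
    cA, (cH, cV, cD) = pywt.dwt2(img, wavelet)
    details = np.concatenate([c.ravel() for c in (cH, cV, cD)])
    thresh = np.quantile(np.abs(details), 1.0 - keep_ratio)
    cH, cV, cD = (np.where(np.abs(c) >= thresh, c, 0.0) for c in (cH, cV, cD))
    return pywt.idwt2((cA, (cH, cV, cD)), wavelet)

img = np.random.rand(32, 32).astype(np.float32)
clean = wavelet_denoise(img, keep_ratio=0.05)
```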
+
+
+
+
+ + ♻ ☆ Intrinsic Image Decomposition Using Point Cloud Representation + + +
+ The purpose of intrinsic decomposition is to separate an image into its +albedo (reflective properties) and shading components (illumination +properties). This is challenging because it's an ill-posed problem. +Conventional approaches primarily concentrate on 2D imagery and fail to fully +exploit the capabilities of 3D data representation. 3D point clouds offer a +more comprehensive format for representing scenes, as they combine geometric +and color information effectively. To this end, in this paper, we introduce +Point Intrinsic Net (PoInt-Net), which leverages 3D point cloud data to +concurrently estimate albedo and shading maps. The merits of PoInt-Net include +the following aspects. First, the model is efficient, achieving consistent +performance across point clouds of any size with training only required on +small-scale point clouds. Second, it exhibits remarkable robustness; even when +trained exclusively on datasets comprising individual objects, PoInt-Net +demonstrates strong generalization to unseen objects and scenes. Third, it +delivers superior accuracy over conventional 2D approaches, demonstrating +enhanced performance across various metrics on different datasets. (Code +Released) + +
+
+ comment: Code: https://github.com/xyxingx/PoInt-Net +
+
+
+
+
+ + ♻ ☆ UADA3D: Unsupervised Adversarial Domain Adaptation for 3D Object + Detection with Sparse LiDAR and Large Domain Gaps + + +
+ In this study, we address a gap in existing unsupervised domain adaptation +approaches on LiDAR-based 3D object detection, which have predominantly +concentrated on adapting between established, high-density autonomous driving +datasets. We focus on sparser point clouds, capturing scenarios from different +perspectives: not just from vehicles on the road but also from mobile robots on +sidewalks, which encounter significantly different environmental conditions and +sensor configurations. We introduce Unsupervised Adversarial Domain Adaptation +for 3D Object Detection (UADA3D). UADA3D does not depend on pre-trained source +models or teacher-student architectures. Instead, it uses an adversarial +approach to directly learn domain-invariant features. We demonstrate its +efficacy in various adaptation scenarios, showing significant improvements in +both self-driving car and mobile robot domains. Our code is open-source and +will be available soon. + +
+
+
+
+
+ + ♻ ☆ AnimatableDreamer: Text-Guided Non-rigid 3D Model Generation and + Reconstruction with Canonical Score Distillation + + +
+ Advances in 3D generation have facilitated sequential 3D model generation
+(a.k.a. 4D generation), yet its application for animatable objects with large
+motion remains scarce. Our work proposes AnimatableDreamer, a text-to-4D
+generation framework capable of generating diverse categories of non-rigid
+objects on skeletons extracted from a monocular video. At its core,
+AnimatableDreamer is equipped with our novel optimization design dubbed
+Canonical Score Distillation (CSD), which lifts 2D diffusion for temporally
+consistent 4D generation. CSD, designed from a score gradient perspective,
+generates a canonical model with warp-robustness across different
+articulations. Notably, it also enhances the authenticity of bones and
+skinning by integrating inductive priors from a diffusion model. Furthermore,
+with multi-view distillation, CSD infers invisible regions, thereby improving
+the fidelity of monocular non-rigid reconstruction. Extensive experiments
+demonstrate the capability of our method in generating high-flexibility
+text-guided 3D models from a monocular video, while also showing improved
+reconstruction performance over existing non-rigid reconstruction methods.
+
+
+ comment: Project page: https://animatabledreamer.github.io/ +
+
+
+
+
+ + ♻ ☆ Text2Loc: 3D Point Cloud Localization from Natural Language CVPR 2024 + + +
+ We tackle the problem of 3D point cloud localization based on a few natural +linguistic descriptions and introduce a novel neural network, Text2Loc, that +fully interprets the semantic relationship between points and text. Text2Loc +follows a coarse-to-fine localization pipeline: text-submap global place +recognition, followed by fine localization. In global place recognition, +relational dynamics among each textual hint are captured in a hierarchical +transformer with max-pooling (HTM), whereas a balance between positive and +negative pairs is maintained using text-submap contrastive learning. Moreover, +we propose a novel matching-free fine localization method to further refine the +location predictions, which completely removes the need for complicated +text-instance matching and is lighter, faster, and more accurate than previous +methods. Extensive experiments show that Text2Loc improves the localization +accuracy by up to $2\times$ over the state-of-the-art on the KITTI360Pose +dataset. Our project page is publicly available at +\url{https://yan-xia.github.io/projects/text2loc/}. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ RT-SRTS: Angle-Agnostic Real-Time Simultaneous 3D Reconstruction and + Tumor Segmentation from Single X-Ray Projection + + +
+ Radiotherapy is one of the primary treatment methods for tumors, but the
+organ movement caused by respiration limits its accuracy. Recently, 3D imaging
+from a single X-ray projection has received extensive attention as a promising
+approach to address this issue. However, current methods can only reconstruct
+3D images without directly locating the tumor and are only validated for
+fixed-angle imaging, which fails to fully meet the requirements of motion
+control in radiotherapy. In this study, a novel imaging method, RT-SRTS, is
+proposed that integrates 3D imaging and tumor segmentation into one network
+based on multi-task learning (MTL) and achieves real-time simultaneous 3D
+reconstruction and tumor segmentation from a single X-ray projection at any
+angle. Furthermore, the attention enhanced calibrator (AEC) and
+uncertain-region elaboration (URE) modules have been proposed to aid feature
+extraction and improve segmentation accuracy. The proposed method was
+evaluated on fifteen patient cases and compared with three state-of-the-art
+methods. It not only delivers superior 3D reconstruction but also demonstrates
+commendable tumor segmentation results. Simultaneous reconstruction and
+segmentation can be completed in approximately 70 ms, significantly faster
+than the required time threshold for real-time tumor tracking. The efficacies
+of both AEC and URE have also been validated in ablation studies. The code of
+this work is available at https://github.com/ZywooSimple/RT-SRTS.
+
+
+
+
+
+ + ♻ ☆ NaviNeRF: NeRF-based 3D Representation Disentanglement by Latent + Semantic Navigation + + +
+ 3D representation disentanglement aims to identify, decompose, and manipulate
+the underlying explanatory factors of 3D data, which helps AI fundamentally
+understand our 3D world. This task is currently under-explored and poses great
+challenges: (i) 3D representations are complex and in general contain much
+more information than 2D images; (ii) many 3D representations are not well
+suited for gradient-based optimization, let alone disentanglement. To address
+these challenges, we use NeRF as a differentiable 3D representation, and
+introduce a self-supervised Navigation to identify interpretable semantic
+directions in the latent space. To the best of our knowledge, this novel
+method, dubbed NaviNeRF, is the first work to achieve fine-grained 3D
+disentanglement without any priors or supervision. Specifically, NaviNeRF is
+built upon the generative NeRF pipeline, and equipped with an Outer Navigation
+Branch and an Inner Refinement Branch. They are complementary -- the outer
+navigation branch identifies global-view semantic directions, while the inner
+refinement branch is dedicated to fine-grained attributes. A synergistic loss
+is further devised to coordinate the two branches. Extensive experiments
+demonstrate that NaviNeRF has superior fine-grained 3D disentanglement ability
+compared to previous 3D-aware models. Its performance is also comparable to
+editing-oriented models relying on semantic or geometry priors.
+
+
+
+
+
+ + ♻ ☆ MCAD: Multi-teacher Cross-modal Alignment Distillation for efficient + image-text retrieval + + +
+ Due to the success of large-scale visual-language pretraining (VLP) models
+and the widespread use of image-text retrieval in industry areas, it is now
+critically necessary to reduce the model size and streamline their
+mobile-device deployment. Single- and dual-stream model structures are
+commonly used in image-text retrieval with the goal of closing the semantic
+gap between textual and visual modalities. While single-stream models use deep
+feature fusion to achieve more accurate cross-modal alignment, dual-stream
+models are better at offline indexing and fast inference. We propose a
+Multi-teacher Cross-modality Alignment Distillation (MCAD) technique to
+integrate the advantages of single- and dual-stream models. By incorporating
+the fused single-stream features into the image and text features of the
+dual-stream model, we formulate new modified teacher similarity distributions
+and features. Then, we conduct both distribution and feature distillation to
+boost the capability of the student dual-stream model, achieving high
+retrieval performance without increasing inference complexity. Extensive
+experiments demonstrate the remarkable performance and high efficiency of MCAD
+on image-text retrieval tasks. Furthermore, we implement a lightweight CLIP
+model on Snapdragon/Dimensity chips with only $\sim$100M running memory and
+$\sim$8.0ms search latency, achieving the mobile-device application of VLP
+models.
+
+
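One way to read the "distribution and feature distillation" step is sketched below: the student's image-text similarity distribution is matched to the teacher's with a KL term and the features with an MSE term. The exact fusion of single-stream features and the loss weighting in MCAD may differ; everything here is an illustrative assumption.

```python
# Minimal sketch of a cross-modal alignment distillation objective.
import torch
import torch.nn.functional as F

def mcad_style_loss(img_s, txt_s, img_t, txt_t, tau: float = 0.05, lam: float = 1.0):
    img_s, txt_s = F.normalize(img_s, dim=-1), F.normalize(txt_s, dim=-1)
    img_t, txt_t = F.normalize(img_t, dim=-1), F.normalize(txt_t, dim=-1)
    sim_s = img_s @ txt_s.t() / tau            # (B, B) student similarities
    sim_t = img_t @ txt_t.t() / tau            # (B, B) teacher similarities
    # distribution distillation: student softmax rows follow the teacher's
    dist_loss = F.kl_div(sim_s.log_softmax(dim=-1), sim_t.softmax(dim=-1),
                         reduction="batchmean")
    # feature distillation: student embeddings pulled toward teacher embeddings
    feat_loss = F.mse_loss(img_s, img_t) + F.mse_loss(txt_s, txt_t)
    return dist_loss + lam * feat_loss

B, D = 16, 256
loss = mcad_style_loss(torch.randn(B, D), torch.randn(B, D),
                       torch.randn(B, D).detach(), torch.randn(B, D).detach())
```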
+
+
+
+ + ♻ ☆ Detect-Order-Construct: A Tree Construction based Approach for + Hierarchical Document Structure Analysis + + +
+ Document structure analysis (aka document layout analysis) is crucial for +understanding the physical layout and logical structure of documents, with +applications in information retrieval, document summarization, knowledge +extraction, etc. In this paper, we concentrate on Hierarchical Document +Structure Analysis (HDSA) to explore hierarchical relationships within +structured documents created using authoring software employing hierarchical +schemas, such as LaTeX, Microsoft Word, and HTML. To comprehensively analyze +hierarchical document structures, we propose a tree construction based approach +that addresses multiple subtasks concurrently, including page object detection +(Detect), reading order prediction of identified objects (Order), and the +construction of intended hierarchical structure (Construct). We present an +effective end-to-end solution based on this framework to demonstrate its +performance. To assess our approach, we develop a comprehensive benchmark +called Comp-HRDoc, which evaluates the above subtasks simultaneously. Our +end-to-end system achieves state-of-the-art performance on two large-scale +document layout analysis datasets (PubLayNet and DocLayNet), a high-quality +hierarchical document structure reconstruction dataset (HRDoc), and our +Comp-HRDoc benchmark. The Comp-HRDoc benchmark will be released to facilitate +further research in this field. + +
+
+ comment: Submitted to Pattern Recognition +
+
+
+
+
+ + ♻ ☆ Multi-modal In-Context Learning Makes an Ego-evolving Scene Text + Recognizer CVPR2024 + + +
+ Scene text recognition (STR) in the wild frequently encounters challenges +when coping with domain variations, font diversity, shape deformations, etc. A +straightforward solution is performing model fine-tuning tailored to a specific +scenario, but it is computationally intensive and requires multiple model +copies for various scenarios. Recent studies indicate that large language +models (LLMs) can learn from a few demonstration examples in a training-free +manner, termed "In-Context Learning" (ICL). Nevertheless, applying LLMs as a +text recognizer is unacceptably resource-consuming. Moreover, our pilot +experiments on LLMs show that ICL fails in STR, mainly attributed to the +insufficient incorporation of contextual information from diverse samples in +the training stage. To this end, we introduce E$^2$STR, a STR model trained +with context-rich scene text sequences, where the sequences are generated via +our proposed in-context training strategy. E$^2$STR demonstrates that a +regular-sized model is sufficient to achieve effective ICL capabilities in STR. +Extensive experiments show that E$^2$STR exhibits remarkable training-free +adaptation in various scenarios and outperforms even the fine-tuned +state-of-the-art approaches on public benchmarks. The code is released at +https://github.com/bytedance/E2STR . + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ Scalable Diffusion Models with State Space Backbone + + +
+ This paper presents a new exploration into a category of diffusion models +built upon state space architecture. We endeavor to train diffusion models for +image data, wherein the traditional U-Net backbone is supplanted by a state +space backbone, functioning on raw patches or latent space. Given its notable +efficacy in accommodating long-range dependencies, Diffusion State Space Models +(DiS) are distinguished by treating all inputs including time, condition, and +noisy image patches as tokens. Our assessment of DiS encompasses both +unconditional and class-conditional image generation scenarios, revealing that +DiS exhibits comparable, if not superior, performance to CNN-based or +Transformer-based U-Net architectures of commensurate size. Furthermore, we +analyze the scalability of DiS, gauged by the forward pass complexity +quantified in Gflops. DiS models with higher Gflops, achieved through +augmentation of depth/width or augmentation of input tokens, consistently +demonstrate lower FID. In addition to demonstrating commendable scalability +characteristics, DiS-H/2 models in latent space achieve performance levels akin +to prior diffusion models on class-conditional ImageNet benchmarks at the +resolution of 256$\times$256 and 512$\times$512, while significantly reducing +the computational burden. The code and models are available at: +https://github.com/feizc/DiS. + +
+
+
+
+
+ + ♻ ☆ OST: Refining Text Knowledge with Optimal Spatio-Temporal Descriptor for + General Video Recognition + + +
+ Due to the resource-intensive nature of training vision-language models on
+expansive video data, a majority of studies have centered on adapting
+pre-trained image-language models to the video domain. Dominant pipelines
+propose to tackle the visual discrepancies with additional temporal learners
+while overlooking the substantial discrepancy between web-scale descriptive
+narratives and concise action category names, leading to a less distinct
+semantic space and potential performance limitations. In this work, we
+prioritize the refinement of text knowledge to facilitate generalizable video
+recognition. To address the limitations of the less distinct semantic space of
+category names, we prompt a large language model (LLM) to augment action class
+names into Spatio-Temporal Descriptors, thus bridging the textual discrepancy
+and serving as a knowledge base for general recognition. Moreover, to assign
+the best descriptors to different video instances, we propose Optimal
+Descriptor Solver, forming the video recognition problem as solving the
+optimal matching flow across frame-level representations and descriptors.
+Comprehensive evaluations in zero-shot, few-shot, and fully supervised video
+recognition highlight the effectiveness of our approach. Our best model
+achieves a state-of-the-art zero-shot accuracy of 75.1% on Kinetics-600.
+
+
+ comment: Technical report. Project Page: https://tomchen-ctj.github.io/OST/ +
+
+
+
+
+ + ♻ ☆ FedSOL: Stabilized Orthogonal Learning with Proximal Restrictions in + Federated Learning CVPR 2024 + + +
+ Federated Learning (FL) aggregates locally trained models from individual +clients to construct a global model. While FL enables learning a model with +data privacy, it often suffers from significant performance degradation when +clients have heterogeneous data distributions. This data heterogeneity causes +the model to forget the global knowledge acquired from previously sampled +clients after being trained on local datasets. Although the introduction of +proximal objectives in local updates helps to preserve global knowledge, it can +also hinder local learning by interfering with local objectives. To address +this problem, we propose a novel method, Federated Stabilized Orthogonal +Learning (FedSOL), which adopts an orthogonal learning strategy to balance the +two conflicting objectives. FedSOL is designed to identify gradients of local +objectives that are inherently orthogonal to directions affecting the proximal +objective. Specifically, FedSOL targets parameter regions where learning on the +local objective is minimally influenced by proximal weight perturbations. Our +experiments demonstrate that FedSOL consistently achieves state-of-the-art +performance across various scenarios. + +
+
+ comment: The IEEE/CVF Conference on Computer Vision and Pattern Recognition + 2024 (CVPR 2024) +
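A simplified reading of the orthogonal-learning idea is to strip from the local gradient its component along the proximal gradient before stepping. The sketch below shows only that projection; FedSOL itself works with weight perturbations and a more involved procedure, so this is an illustrative assumption rather than the algorithm.

```python
# Minimal sketch: remove from the local-objective gradient its component along
# the proximal gradient, so the local step does not move against the
# global-knowledge-preserving direction.
import torch

def orthogonalize(local_grad: torch.Tensor, proximal_grad: torch.Tensor) -> torch.Tensor:
    g, p = local_grad.flatten(), proximal_grad.flatten()
    coeff = torch.dot(g, p) / (torch.dot(p, p) + 1e-12)
    return (g - coeff * p).view_as(local_grad)

local = torch.tensor([1.0, 1.0])
proximal = torch.tensor([1.0, 0.0])
print(orthogonalize(local, proximal))  # tensor([0., 1.]) -- orthogonal to proximal
```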
+
+
+
+
+ + ♻ ☆ UFineBench: Towards Text-based Person Retrieval with Ultra-fine + Granularity + + +
+ Existing text-based person retrieval datasets often have relatively
+coarse-grained text annotations. This hinders models from comprehending the
+fine-grained semantics of query texts in real scenarios. To address this
+problem, we contribute a new benchmark named \textbf{UFineBench} for
+text-based person retrieval with ultra-fine granularity.
+ Firstly, we construct a new \textbf{dataset} named UFine6926. We collect a
+large number of person images and manually annotate each image with two
+detailed textual descriptions, averaging 80.8 words each. The average word
+count is three to four times that of the previous datasets. In addition to
+standard in-domain evaluation, we also propose a special \textbf{evaluation
+paradigm} more representative of real scenarios. It contains a new evaluation
+set spanning cross-domain, cross-textual-granularity and cross-textual-style
+settings, named UFine3C, and a new evaluation metric for accurately measuring
+retrieval ability, named mean Similarity Distribution (mSD). Moreover, we
+propose CFAM, a more efficient \textbf{algorithm} especially designed for
+text-based person retrieval with ultra fine-grained texts. It achieves fine
+granularity mining by adopting a shared cross-modal granularity decoder and a
+hard negative match mechanism.
+ With standard in-domain evaluation, CFAM establishes competitive performance
+across various datasets, especially on our ultra fine-grained UFine6926.
+Furthermore, by evaluating on UFine3C, we demonstrate that training on our
+UFine6926 significantly improves generalization to real scenarios compared
+with other coarse-grained datasets. The dataset and code will be made publicly
+available at \url{https://github.com/Zplusdragon/UFineBench}.
+
+
+
+
+
+ + ♻ ☆ Subjective-Aligned Dataset and Metric for Text-to-Video Quality + Assessment + + +
+ With the rapid development of generative models, Artificial
+Intelligence-Generated Content (AIGC) has increased exponentially in daily
+life. Among them, Text-to-Video (T2V) generation has received widespread
+attention. Though many T2V models have been released for generating high
+perceptual quality videos, there is still a lack of methods to quantitatively
+evaluate the quality of these videos. To solve this issue, we establish the
+largest-scale Text-to-Video Quality Assessment DataBase (T2VQA-DB) to date.
+The dataset is composed of 10,000 videos generated by 9 different T2V models.
+We also conduct a subjective study to obtain each video's corresponding mean
+opinion score. Based on T2VQA-DB, we propose a novel transformer-based model
+for subjective-aligned Text-to-Video Quality Assessment (T2VQA). The model
+extracts features from text-video alignment and video fidelity perspectives,
+and then leverages the ability of a large language model to give the
+prediction score. Experimental results show that T2VQA outperforms existing
+T2V metrics and SOTA video quality assessment models. Quantitative analysis
+indicates that T2VQA is capable of giving subjective-aligned predictions,
+validating its effectiveness. The dataset and code will be released at
+https://github.com/QMME/T2VQA.
+
+
+
+
+
+ + ♻ ☆ ECoDepth: Effective Conditioning of Diffusion Models for Monocular Depth + Estimation CVPR + + +
+ In the absence of parallax cues, a learning-based single image depth +estimation (SIDE) model relies heavily on shading and contextual cues in the +image. While this simplicity is attractive, it is necessary to train such +models on large and varied datasets, which are difficult to capture. It has +been shown that using embeddings from pre-trained foundational models, such as +CLIP, improves zero shot transfer in several applications. Taking inspiration +from this, in our paper we explore the use of global image priors generated +from a pre-trained ViT model to provide more detailed contextual information. +We argue that the embedding vector from a ViT model, pre-trained on a large +dataset, captures greater relevant information for SIDE than the usual route of +generating pseudo image captions, followed by CLIP based text embeddings. Based +on this idea, we propose a new SIDE model using a diffusion backbone which is +conditioned on ViT embeddings. Our proposed design establishes a new +state-of-the-art (SOTA) for SIDE on NYUv2 dataset, achieving Abs Rel error of +0.059(14% improvement) compared to 0.069 by the current SOTA (VPD). And on +KITTI dataset, achieving Sq Rel error of 0.139 (2% improvement) compared to +0.142 by the current SOTA (GEDepth). For zero-shot transfer with a model +trained on NYUv2, we report mean relative improvement of (20%, 23%, 81%, 25%) +over NeWCRFs on (Sun-RGBD, iBims1, DIODE, HyperSim) datasets, compared to (16%, +18%, 45%, 9%) by ZoeDepth. The code is available at +https://ecodepth-iitd.github.io + +
+
+ comment: IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) + 2024 +
+
+
+
+
+ + ♻ ☆ Can 3D Vision-Language Models Truly Understand Natural Language? + + +
+ Rapid advancements in 3D vision-language (3D-VL) tasks have opened up new +avenues for human interaction with embodied agents or robots using natural +language. Despite this progress, we find a notable limitation: existing 3D-VL +models exhibit sensitivity to the styles of language input, struggling to +understand sentences with the same semantic meaning but written in different +variants. This observation raises a critical question: Can 3D vision-language +models truly understand natural language? To test the language +understandability of 3D-VL models, we first propose a language robustness task +for systematically assessing 3D-VL models across various tasks, benchmarking +their performance when presented with different language style variants. +Importantly, these variants are commonly encountered in applications requiring +direct interaction with humans, such as embodied robotics, given the diversity +and unpredictability of human language. We propose a 3D Language Robustness +Dataset, designed based on the characteristics of human language, to facilitate +the systematic study of robustness. Our comprehensive evaluation uncovers a +significant drop in the performance of all existing models across various 3D-VL +tasks. Even the state-of-the-art 3D-LLM fails to understand some variants of +the same sentences. Further in-depth analysis suggests that the existing models +have a fragile and biased fusion module, which stems from the low diversity of +the existing dataset. Finally, we propose a training-free module driven by LLM, +which improves language robustness. Datasets and code will be available at +github. + +
+
+ comment: https://github.com/VincentDENGP/3D-LR +
+
+
+
+
+ + ♻ ☆ Finding needles in a haystack: A Black-Box Approach to Invisible + Watermark Detection + + +
+ In this paper, we propose WaterMark Detection (WMD), the first invisible +watermark detection method under a black-box and annotation-free setting. WMD +is capable of detecting arbitrary watermarks within a given reference dataset +using a clean non-watermarked dataset as a reference, without relying on +specific decoding methods or prior knowledge of the watermarking techniques. We +develop WMD using foundations of offset learning, where a clean non-watermarked +dataset enables us to isolate the influence of only watermarked samples in the +reference dataset. Our comprehensive evaluations demonstrate the effectiveness +of WMD, significantly outperforming naive detection methods, which only yield +AUC scores around 0.5. In contrast, WMD consistently achieves impressive +detection AUC scores, surpassing 0.9 in most single-watermark datasets and +exceeding 0.7 in more challenging multi-watermark scenarios across diverse +datasets and watermarking methods. As invisible watermarks become increasingly +prevalent, while specific decoding techniques remain undisclosed, our approach +provides a versatile solution and establishes a path toward increasing +accountability, transparency, and trust in our digital visual content. + +
+
+
+
+
+ + ♻ ☆ Noisy-Correspondence Learning for Text-to-Image Person Re-identification + + +
+ Text-to-image person re-identification (TIReID) is a compelling topic in the
+cross-modal community, which aims to retrieve the target person based on a
+textual query. Although numerous TIReID methods have been proposed and
+achieved promising performance, they implicitly assume the training image-text
+pairs are correctly aligned, which is not always the case in real-world
+scenarios. In practice, image-text pairs are inevitably under-correlated or
+even falsely correlated, a.k.a. noisy correspondence (NC), due to the low
+quality of the images and annotation errors. To address this problem, we
+propose a novel Robust Dual Embedding method (RDE) that can learn robust
+visual-semantic associations even with NC. Specifically, RDE consists of two
+main components: 1) A Confident Consensus Division (CCD) module that leverages
+the dual-grained decisions of dual embedding modules to obtain a consensus set
+of clean training data, which enables the model to learn correct and reliable
+visual-semantic associations. 2) A Triplet Alignment Loss (TAL) that relaxes
+the conventional triplet ranking loss with the hardest negative samples to a
+log-exponential upper bound over all negatives, thus preventing model collapse
+under NC while still focusing on hard-negative samples for promising
+performance. We conduct extensive experiments on three public benchmarks,
+namely CUHK-PEDES, ICFG-PEDES, and RSTPReID, to evaluate the performance and
+robustness of our RDE. Our method achieves state-of-the-art results both with
+and without synthetic noisy correspondences on all three datasets. Code is
+available at https://github.com/QinYang79/RDE.
+
+
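The TAL relaxation lends itself to a compact formula: replace the hinge over the hardest negative, max(0, m + max_n s_n - s_p), with a smooth log-exponential (logsumexp-style) upper bound over all negatives. The sketch below is one plausible form of such a loss, with margin and temperature values chosen arbitrarily, and is not necessarily the exact TAL definition.

```python
# Minimal sketch: smooth log-exponential upper bound on the hardest-negative
# triplet loss, summed over all negatives instead of only the hardest one.
import torch

def tal_like_loss(sim: torch.Tensor, margin: float = 0.2, tau: float = 0.02):
    """sim: (B, B) image-text similarity matrix; the diagonal holds positives."""
    B = sim.size(0)
    pos = sim.diag().unsqueeze(1)                   # (B, 1) positive similarities
    neg_mask = ~torch.eye(B, dtype=torch.bool, device=sim.device)
    z = (margin + sim - pos) / tau                  # (B, B) scaled violations
    z = z.masked_fill(~neg_mask, float("-inf"))     # drop positives from the sum
    # tau * log(1 + sum_neg exp(z)) >= max(0, margin + max_neg s_n - s_p)
    loss = tau * torch.logaddexp(torch.zeros(B, device=sim.device),
                                 torch.logsumexp(z, dim=1))
    return loss.mean()

sim = torch.randn(8, 8) * 0.1
print(tal_like_loss(sim))
```

Because every negative contributes, a single mislabeled "hardest" negative cannot dominate the gradient, which is the robustness-to-NC argument made above.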
+
+
+
+ + ♻ ☆ SpecNeRF: Gaussian Directional Encoding for Specular Reflections CVPR2024 + + +
+ Neural radiance fields have achieved remarkable performance in modeling the +appearance of 3D scenes. However, existing approaches still struggle with the +view-dependent appearance of glossy surfaces, especially under complex lighting +of indoor environments. Unlike existing methods, which typically assume distant +lighting like an environment map, we propose a learnable Gaussian directional +encoding to better model the view-dependent effects under near-field lighting +conditions. Importantly, our new directional encoding captures the +spatially-varying nature of near-field lighting and emulates the behavior of +prefiltered environment maps. As a result, it enables the efficient evaluation +of preconvolved specular color at any 3D location with varying roughness +coefficients. We further introduce a data-driven geometry prior that helps +alleviate the shape radiance ambiguity in reflection modeling. We show that our +Gaussian directional encoding and geometry prior significantly improve the +modeling of challenging specular reflections in neural radiance fields, which +helps decompose appearance into more physically meaningful components. + +
+
+ comment: Accepted to CVPR2024, Project page: + https://limacv.github.io/SpecNeRF_web/ +
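A learnable Gaussian directional encoding can be sketched with spherical Gaussian lobes over the view direction, as below. This is a simplified, position-independent variant for illustration only; the paper's encoding is additionally conditioned on 3D location to capture near-field lighting.

```python
# Minimal sketch: learnable spherical-Gaussian lobes as a directional encoding
# that is fed to the radiance MLP together with position features.
import torch
import torch.nn as nn
import torch.nn.functional as F

class GaussianDirEncoding(nn.Module):
    def __init__(self, n_lobes: int = 16):
        super().__init__()
        self.axes = nn.Parameter(F.normalize(torch.randn(n_lobes, 3), dim=-1))
        self.log_sharpness = nn.Parameter(torch.zeros(n_lobes))

    def forward(self, dirs: torch.Tensor) -> torch.Tensor:
        """dirs: (N, 3) view directions -> (N, n_lobes) features in (0, 1]."""
        d = F.normalize(dirs, dim=-1)
        axes = F.normalize(self.axes, dim=-1)
        cos = d @ axes.t()                               # (N, n_lobes)
        return torch.exp(self.log_sharpness.exp() * (cos - 1.0))

enc = GaussianDirEncoding(8)
feats = enc(torch.randn(4, 3))
```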
+
+
+
+
+ + ♻ ☆ Gaze-guided Hand-Object Interaction Synthesis: Benchmark and Method + + +
+ Gaze plays a crucial role in revealing human attention and intention, +shedding light on the cognitive processes behind human actions. The integration +of gaze guidance with the dynamics of hand-object interactions boosts the +accuracy of human motion prediction. However, the lack of datasets that capture +the intricate relationship and consistency among gaze, hand, and object +movements remains a substantial hurdle. In this paper, we introduce the first +Gaze-guided Hand-Object Interaction dataset, GazeHOI, and present a novel task +for synthesizing gaze-guided hand-object interactions. Our dataset, GazeHOI, +features simultaneous 3D modeling of gaze, hand, and object interactions, +comprising 479 sequences with an average duration of 19.1 seconds, 812 +sub-sequences, and 33 objects of various sizes. We propose a hierarchical +framework centered on a gaze-guided hand-object interaction diffusion model, +named GHO-Diffusion. In the pre-diffusion phase, we separate gaze conditions +into spatial-temporal features and goal pose conditions at different levels of +information granularity. During the diffusion phase, two gaze-conditioned +diffusion models are stacked to simplify the complex synthesis of hand-object +motions. Here, the object motion diffusion model generates sequences of object +motions based on gaze conditions, while the hand motion diffusion model +produces hand motions based on the generated object motion. To improve +fine-grained goal pose alignment, we introduce a Spherical Gaussian constraint +to guide the denoising step. In the subsequent post-diffusion phase, we +optimize the generated hand motions using contact consistency. Our extensive +experiments highlight the uniqueness of our dataset and the effectiveness of +our approach. + +
+
+
+
+
+ + ♻ ☆ HQ-VAE: Hierarchical Discrete Representation Learning with Variational + Bayes + + +
+ Vector quantization (VQ) is a technique to deterministically learn features +with discrete codebook representations. It is commonly performed with a +variational autoencoding model, VQ-VAE, which can be further extended to +hierarchical structures for making high-fidelity reconstructions. However, such +hierarchical extensions of VQ-VAE often suffer from the codebook/layer collapse +issue, where the codebook is not efficiently used to express the data, and +hence degrades reconstruction accuracy. To mitigate this problem, we propose a +novel unified framework to stochastically learn hierarchical discrete +representation on the basis of the variational Bayes framework, called +hierarchically quantized variational autoencoder (HQ-VAE). HQ-VAE naturally +generalizes the hierarchical variants of VQ-VAE, such as VQ-VAE-2 and +residual-quantized VAE (RQ-VAE), and provides them with a Bayesian training +scheme. Our comprehensive experiments on image datasets show that HQ-VAE +enhances codebook usage and improves reconstruction performance. We also +validated HQ-VAE in terms of its applicability to a different modality with an +audio dataset. + +
+
+ comment: 34 pages with 17 figures, accepted for TMLR +
+
+
+
+
+ + ♻ ☆ Enhancing Object Coherence in Layout-to-Image Synthesis + + +
+ Layout-to-image synthesis is an emerging technique in conditional image +generation. It aims to generate complex scenes, where users require fine +control over the layout of the objects in a scene. However, it remains +challenging to control the object coherence, including semantic coherence +(e.g., the cat looks at the flowers or not) and physical coherence (e.g., the +hand and the racket should not be misaligned). In this paper, we propose a +novel diffusion model with effective global semantic fusion (GSF) and +self-similarity feature enhancement modules to guide the object coherence for +this task. For semantic coherence, we argue that the image caption contains +rich information for defining the semantic relationship within the objects in +the images. Instead of simply employing cross-attention between captions and +generated images, which addresses the highly relevant layout restriction and +semantic coherence separately and thus leads to unsatisfying results shown in +our experiments, we develop GSF to fuse the supervision from the layout +restriction and semantic coherence requirement and exploit it to guide the +image synthesis process. Moreover, to improve the physical coherence, we +develop a Self-similarity Coherence Attention (SCA) module to explicitly +integrate local contextual physical coherence into each pixel's generation +process. Specifically, we adopt a self-similarity map to encode the coherence +restrictions and employ it to extract coherent features from text embedding. +Through visualization of our self-similarity map, we explore the essence of +SCA, revealing that its effectiveness is not only in capturing reliable +physical coherence patterns but also in enhancing complex texture generation. +Extensive experiments demonstrate the superiority of our proposed method in +both image generation quality and controllability. + +
+
+ comment: GitHub: https://github.com/CodeGoat24/EOCNet +
+
+
+
+
+ + ♻ ☆ GS-IR: 3D Gaussian Splatting for Inverse Rendering + + +
+ We propose GS-IR, a novel inverse rendering approach based on 3D Gaussian +Splatting (GS) that leverages forward mapping volume rendering to achieve +photorealistic novel view synthesis and relighting results. Unlike previous +works that use implicit neural representations and volume rendering (e.g. +NeRF), which suffer from low expressive power and high computational +complexity, we extend GS, a top-performance representation for novel view +synthesis, to estimate scene geometry, surface material, and environment +illumination from multi-view images captured under unknown lighting conditions. +There are two main problems when introducing GS to inverse rendering: 1) GS +does not support producing plausible normal natively; 2) forward mapping (e.g. +rasterization and splatting) cannot trace the occlusion like backward mapping +(e.g. ray tracing). To address these challenges, our GS-IR proposes an +efficient optimization scheme that incorporates a depth-derivation-based +regularization for normal estimation and a baking-based occlusion to model +indirect lighting. The flexible and expressive GS representation allows us to +achieve fast and compact geometry reconstruction, photorealistic novel view +synthesis, and effective physically-based rendering. We demonstrate the +superiority of our method over baseline methods through qualitative and +quantitative evaluations on various challenging scenes. + +
+
+
+
+
+ + ♻ ☆ ProTeCt: Prompt Tuning for Taxonomic Open Set Classification CVPR 2024 + + +
+ Visual-language foundation models, like CLIP, learn generalized +representations that enable zero-shot open-set classification. Few-shot +adaptation methods, based on prompt tuning, have been shown to further improve +performance on downstream datasets. However, these methods do not fare well in +the taxonomic open set (TOS) setting, where the classifier is asked to make +predictions from label sets across different levels of semantic granularity. +Frequently, they infer incorrect labels at coarser taxonomic class levels, even +when the inference at the leaf level (original class labels) is correct. To +address this problem, we propose a prompt tuning technique that calibrates the +hierarchical consistency of model predictions. A set of metrics of hierarchical +consistency, the Hierarchical Consistent Accuracy (HCA) and the Mean Treecut +Accuracy (MTA), are first proposed to evaluate TOS model performance. A new +Prompt Tuning for Hierarchical Consistency (ProTeCt) technique is then proposed +to calibrate classification across label set granularities. Results show that +ProTeCt can be combined with existing prompt tuning methods to significantly +improve TOS classification without degrading the leaf level classification +performance. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SpikingResformer: Bridging ResNet and Vision Transformer in Spiking + Neural Networks CVPR + + +
+ The remarkable success of Vision Transformers in Artificial Neural Networks +(ANNs) has led to a growing interest in incorporating the self-attention +mechanism and transformer-based architecture into Spiking Neural Networks +(SNNs). While existing methods propose spiking self-attention mechanisms that +are compatible with SNNs, they lack reasonable scaling methods, and the overall +architectures proposed by these methods suffer from a bottleneck in effectively +extracting local features. To address these challenges, we propose a novel +spiking self-attention mechanism named Dual Spike Self-Attention (DSSA) with a +reasonable scaling method. Based on DSSA, we propose a novel spiking Vision +Transformer architecture called SpikingResformer, which combines the +ResNet-based multi-stage architecture with our proposed DSSA to improve both +performance and energy efficiency while reducing parameters. Experimental +results show that SpikingResformer achieves higher accuracy with fewer +parameters and lower energy consumption than other spiking Vision Transformer +counterparts. Notably, our SpikingResformer-L achieves 79.40% top-1 accuracy on +ImageNet with 4 time-steps, which is the state-of-the-art result in the SNN +field. + +
+
+ comment: To be published in the 2024 IEEE/CVF Conference on Computer Vision + and Pattern Recognition (CVPR) +
+
+
+
+
+ + ♻ ☆ MemoNav: Working Memory Model for Visual Navigation CVPR 2024 + + +
+ Image-goal navigation is a challenging task that requires an agent to +navigate to a goal indicated by an image in unfamiliar environments. Existing +methods utilizing diverse scene memories suffer from inefficient exploration +since they use all historical observations for decision-making without +considering the goal-relevant fraction. To address this limitation, we present +MemoNav, a novel memory model for image-goal navigation, which utilizes a +working memory-inspired pipeline to improve navigation performance. +Specifically, we employ three types of navigation memory. The node features on +a map are stored in the short-term memory (STM), as these features are +dynamically updated. A forgetting module then retains the informative STM +fraction to increase efficiency. We also introduce long-term memory (LTM) to +learn global scene representations by progressively aggregating STM features. +Subsequently, a graph attention module encodes the retained STM and the LTM to +generate working memory (WM) which contains the scene features essential for +efficient navigation. The synergy among these three memory types boosts +navigation performance by enabling the agent to learn and leverage +goal-relevant scene features within a topological map. Our evaluation on +multi-goal tasks demonstrates that MemoNav significantly outperforms previous +methods across all difficulty levels in both Gibson and Matterport3D scenes. +Qualitative results further illustrate that MemoNav plans more efficient +routes. + +
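+
+ A minimal sketch of a working-memory-style forgetting step, assuming (as the abstract describes) that only a goal-relevant fraction of short-term-memory nodes is retained; the cosine scoring rule, keep ratio, and shapes below are illustrative rather than MemoNav's actual module.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def retain_goal_relevant_nodes(stm, goal, keep_ratio=0.5):
+     """stm: (N, D) node features in short-term memory; goal: (D,) goal embedding.
+     Keep the most goal-relevant fraction of nodes."""
+     scores = F.cosine_similarity(stm, goal.unsqueeze(0), dim=-1)   # (N,)
+     k = max(1, int(keep_ratio * stm.shape[0]))
+     idx = torch.topk(scores, k).indices
+     return stm[idx], idx
+
+ kept, idx = retain_goal_relevant_nodes(torch.randn(8, 16), torch.randn(16), keep_ratio=0.25)
+ print(kept.shape)   # torch.Size([2, 16])
+ ```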
+
+ comment: Accepted to CVPR 2024. Code: https://github.com/ZJULiHongxin/MemoNav +
+
+
+
+
+ + ♻ ☆ FlexEdit: Flexible and Controllable Diffusion-based Object-centric Image + Editing + + +
+ Our work addresses limitations seen in previous approaches for object-centric +editing problems, such as unrealistic results due to shape discrepancies and +limited control in object replacement or insertion. To this end, we introduce +FlexEdit, a flexible and controllable editing framework for objects where we +iteratively adjust latents at each denoising step using our FlexEdit block. +Initially, we optimize latents at test time to align with specified object +constraints. Then, our framework employs an adaptive mask, automatically +extracted during denoising, to protect the background while seamlessly blending +new content into the target image. We demonstrate the versatility of FlexEdit +in various object editing tasks and curate an evaluation test suite with +samples from both real and synthetic images, along with novel evaluation +metrics designed for object-centric editing. We conduct extensive experiments +on different editing scenarios, demonstrating the superiority of our editing +framework over recent advanced text-guided image editing methods. Our project +page is published at https://flex-edit.github.io/. + +
+
+ comment: Our project page: https://flex-edit.github.io/ +
+
+
+
+
+ + ♻ ☆ Feature Unlearning for Pre-trained GANs and VAEs + + +
+ We tackle the problem of feature unlearning from pre-trained image +generative models: GANs and VAEs. Unlike a common unlearning task where an +unlearning target is a subset of the training set, we aim to unlearn a specific +feature, such as hairstyle from facial images, from the pre-trained generative +models. As the target feature is only present in a local region of an image, +unlearning the entire image from the pre-trained model may result in losing +other details in the remaining region of the image. To specify which features +to unlearn, we collect randomly generated images that contain the target +features. We then identify a latent representation corresponding to the target +feature and use the representation to fine-tune the pre-trained model. +Through experiments on MNIST, CelebA, and FFHQ datasets, we show that target +features are successfully removed while keeping the fidelity of the original +models. Further experiments with an adversarial attack show that the unlearned +model is more robust in the presence of malicious parties. +
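+
+ A rough sketch of the latent-representation idea described above: estimate a direction for the target feature from generated samples that do and do not contain it, then push latent codes away from that direction. The paper's actual identification and fine-tuning procedure is more involved; everything below is illustrative.
+
+ ```python
+ import numpy as np
+
+ def target_feature_direction(latents_with, latents_without):
+     """Difference of mean latent codes between samples showing the target
+     feature and samples that do not (a crude stand-in for the paper's step)."""
+     return latents_with.mean(axis=0) - latents_without.mean(axis=0)
+
+ def suppress_feature(z, direction, strength=1.0):
+     """Project the feature direction out of a latent code before decoding."""
+     d = direction / (np.linalg.norm(direction) + 1e-8)
+     return z - strength * (z @ d) * d
+
+ rng = np.random.default_rng(0)
+ d = target_feature_direction(rng.normal(size=(100, 64)), rng.normal(size=(100, 64)))
+ z_edit = suppress_feature(rng.normal(size=64), d)
+ ```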
+
+
+
+
+ + ♻ ☆ MMM: Generative Masked Motion Model CVPR + + +
+ Recent advances in text-to-motion generation using diffusion and +autoregressive models have shown promising results. However, these models often +suffer from a trade-off between real-time performance, high fidelity, and +motion editability. To address this gap, we introduce MMM, a novel yet simple +motion generation paradigm based on Masked Motion Model. MMM consists of two +key components: (1) a motion tokenizer that transforms 3D human motion into a +sequence of discrete tokens in latent space, and (2) a conditional masked +motion transformer that learns to predict randomly masked motion tokens, +conditioned on the pre-computed text tokens. By attending to motion and text +tokens in all directions, MMM explicitly captures inherent dependency among +motion tokens and semantic mapping between motion and text tokens. During +inference, this allows parallel and iterative decoding of multiple motion +tokens that are highly consistent with fine-grained text descriptions, +therefore simultaneously achieving high-fidelity and high-speed motion +generation. In addition, MMM has innate motion editability. By simply placing +mask tokens in the place that needs editing, MMM automatically fills the gaps +while guaranteeing smooth transitions between editing and non-editing parts. +Extensive experiments on the HumanML3D and KIT-ML datasets demonstrate that MMM +surpasses current leading methods in generating high-quality motion (evidenced +by superior FID scores of 0.08 and 0.429), while offering advanced editing +features such as body-part modification, motion in-betweening, and the +synthesis of long motion sequences. In addition, MMM is two orders of magnitude +faster on a single mid-range GPU than editable motion diffusion models. Our +project page is available at \url{https://exitudio.github.io/MMM-page}. + +
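+
+ The masked-motion paradigm can be sketched as a BERT-style masked-token predictor over discrete motion tokens, conditioned on text tokens by simple concatenation. This generic PyTorch sketch only illustrates the training objective; it is not the MMM architecture, and all sizes are arbitrary.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class MaskedMotionSketch(nn.Module):
+     def __init__(self, vocab=512, dim=256, text_dim=256, layers=4):
+         super().__init__()
+         self.mask_id = vocab                          # extra id used as the [MASK] token
+         self.tok = nn.Embedding(vocab + 1, dim)
+         self.text_proj = nn.Linear(text_dim, dim)
+         enc = nn.TransformerEncoderLayer(dim, nhead=8, batch_first=True)
+         self.encoder = nn.TransformerEncoder(enc, num_layers=layers)
+         self.head = nn.Linear(dim, vocab)
+
+     def forward(self, motion_tokens, text_tokens, mask_prob=0.5):
+         B, T = motion_tokens.shape
+         mask = torch.rand(B, T, device=motion_tokens.device) < mask_prob
+         corrupted = motion_tokens.masked_fill(mask, self.mask_id)
+         x = torch.cat([self.text_proj(text_tokens), self.tok(corrupted)], dim=1)
+         h = self.encoder(x)[:, text_tokens.shape[1]:]          # keep motion positions only
+         return nn.functional.cross_entropy(self.head(h)[mask], motion_tokens[mask])
+
+ loss = MaskedMotionSketch()(torch.randint(0, 512, (2, 64)), torch.randn(2, 16, 256))
+ ```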
+
+ comment: accepted to CVPR +
+
+
+
+
+ + ♻ ☆ SDSTrack: Self-Distillation Symmetric Adapter Learning for Multi-Modal + Visual Object Tracking CVPR2024 + + +
+ Multimodal Visual Object Tracking (VOT) has recently gained significant +attention due to its robustness. Early research focused on fully fine-tuning +RGB-based trackers, which was inefficient and lacked generalized representation +due to the scarcity of multimodal data. Therefore, recent studies have utilized +prompt tuning to transfer pre-trained RGB-based trackers to multimodal data. +However, the modality gap limits pre-trained knowledge recall, and the +dominance of the RGB modality persists, preventing the full utilization of +information from other modalities. To address these issues, we propose a novel +symmetric multimodal tracking framework called SDSTrack. We introduce +lightweight adaptation for efficient fine-tuning, which directly transfers the +feature extraction ability from RGB to other domains with a small number of +trainable parameters and integrates multimodal features in a balanced, +symmetric manner. Furthermore, we design a complementary masked patch +distillation strategy to enhance the robustness of trackers in complex +environments, such as extreme weather, poor imaging, and sensor failure. +Extensive experiments demonstrate that SDSTrack outperforms state-of-the-art +methods in various multimodal tracking scenarios, including RGB+Depth, +RGB+Thermal, and RGB+Event tracking, and exhibits impressive results in extreme +conditions. Our source code is available at https://github.com/hoqolo/SDSTrack. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ InterGen: Diffusion-based Multi-human Motion Generation under Complex + Interactions + + +
+ We have recently seen tremendous progress in diffusion advances for +generating realistic human motions. Yet, they largely disregard the multi-human +interactions. In this paper, we present InterGen, an effective diffusion-based +approach that incorporates human-to-human interactions into the motion +diffusion process, which enables layman users to customize high-quality +two-person interaction motions, with only text guidance. We first contribute a +multimodal dataset, named InterHuman. It consists of about 107M frames for +diverse two-person interactions, with accurate skeletal motions and 23,337 +natural language descriptions. For the algorithm side, we carefully tailor the +motion diffusion model to our two-person interaction setting. To handle the +symmetry of human identities during interactions, we propose two cooperative +transformer-based denoisers that explicitly share weights, with a mutual +attention mechanism to further connect the two denoising processes. Then, we +propose a novel representation for motion input in our interaction diffusion +model, which explicitly formulates the global relations between the two +performers in the world frame. We further introduce two novel regularization +terms to encode spatial relations, equipped with a corresponding damping scheme +during the training of our interaction diffusion model. Extensive experiments +validate the effectiveness and generalizability of InterGen. Notably, it can +generate more diverse and compelling two-person motions than previous methods +and enables various downstream applications for human interactions. + +
+
+ comment: accepted by IJCV 2024 +
+
+
+
+
+ + ♻ ☆ Gaining the Sparse Rewards by Exploring Lottery Tickets in Spiking + Neural Network + + +
+ Deploying energy-efficient deep learning algorithms on computational-limited +devices, such as robots, is still a pressing issue for real-world applications. +Spiking Neural Networks (SNNs), a novel brain-inspired algorithm, offer a +promising solution due to their low-latency and low-energy properties over +traditional Artificial Neural Networks (ANNs). Despite their advantages, the +dense structure of deep SNNs can still result in extra energy consumption. The +Lottery Ticket Hypothesis (LTH) posits that within dense neural networks, there +exist winning Lottery Tickets (LTs), namely sub-networks, that can be obtained +without compromising performance. Inspired by this, this paper delves into the +spiking-based LTs (SLTs), examining their unique properties and potential for +extreme efficiency. Then, two significant sparse \textbf{\textit{Rewards}} are +gained through comprehensive explorations and meticulous experiments on SLTs +across various dense structures. Moreover, a sparse algorithm tailored for +spiking transformer structure, which incorporates convolution operations into +the Patch Embedding Projection (ConvPEP) module, has been proposed to achieve +Multi-level Sparsity (MultiSp). MultiSp refers to (1) Patch number sparsity; +(2) ConvPEP weights sparsity and binarization; and (3) ConvPEP activation layer +binarization. Extensive experiments demonstrate that our method achieves +extreme sparsity with only a slight performance decrease, paving the way for +deploying energy-efficient neural networks in robotics and beyond. + +
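+
+ For context, a plain magnitude-pruning step of the kind used to find lottery tickets looks roughly like the sketch below (this is the generic LTH recipe, not the spiking-specific SLT procedure from the paper):
+
+ ```python
+ import numpy as np
+
+ def magnitude_prune_mask(weights, sparsity):
+     """Keep the largest-magnitude weights globally and zero the rest, returning a
+     binary mask that can be re-applied after rewinding to the initial weights.
+     weights: dict of name -> ndarray; sparsity: fraction of weights to remove."""
+     all_w = np.concatenate([np.abs(w).ravel() for w in weights.values()])
+     threshold = np.quantile(all_w, sparsity)
+     return {name: (np.abs(w) >= threshold).astype(w.dtype) for name, w in weights.items()}
+
+ w = {"fc1": np.random.randn(64, 32), "fc2": np.random.randn(32, 10)}
+ mask = magnitude_prune_mask(w, sparsity=0.9)
+ pruned = {k: w[k] * mask[k] for k in w}          # the candidate "winning ticket"
+ ```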
+
+ comment: This paper is under submission +
+
+
+
+
+ + ♻ ☆ FluoroSAM: A Language-aligned Foundation Model for X-ray Image + Segmentation + + +
+ Automated X-ray image segmentation would accelerate research and development +in diagnostic and interventional precision medicine. Prior efforts have +contributed task-specific models capable of solving specific image analysis +problems, but the utility of these models is restricted to their particular +task domain, and expanding to broader use requires additional data, labels, and +retraining efforts. Recently, foundation models (FMs) -- machine learning +models trained on large amounts of highly variable data thus enabling broad +applicability -- have emerged as promising tools for automated image analysis. +Existing FMs for medical image analysis focus on scenarios and modalities where +objects are clearly defined by visually apparent boundaries, such as surgical +tool segmentation in endoscopy. X-ray imaging, by contrast, does not generally +offer such clearly delineated boundaries or structure priors. During X-ray +image formation, complex 3D structures are projected in transmission onto the +imaging plane, resulting in overlapping features of varying opacity and shape. +To pave the way toward an FM for comprehensive and automated analysis of +arbitrary medical X-ray images, we develop FluoroSAM, a language-aligned +variant of the Segment-Anything Model, trained from scratch on 1.6M synthetic +X-ray images. FluoroSAM is trained on data including masks for 128 organ types +and 464 non-anatomical objects, such as tools and implants. In real X-ray +images of cadaveric specimens, FluoroSAM is able to segment bony anatomical +structures based on text-only prompting with 0.51 and 0.79 DICE with +point-based refinement, outperforming competing SAM variants for all +structures. FluoroSAM is also capable of zero-shot generalization to segmenting +classes beyond the training set thanks to its language alignment, which we +demonstrate for full lung segmentation on real chest X-rays. + +
+
+
+
+
+ + ♻ ☆ FMA-Net: Flow-Guided Dynamic Filtering and Iterative Feature Refinement + with Multi-Attention for Joint Video Super-Resolution and Deblurring CVPR2024 + + +
+ We present a joint learning scheme of video super-resolution and deblurring, +called VSRDB, to restore clean high-resolution (HR) videos from blurry +low-resolution (LR) ones. This joint restoration problem has drawn much less +attention compared to single restoration problems. In this paper, we propose a +novel flow-guided dynamic filtering (FGDF) and iterative feature refinement +with multi-attention (FRMA), which constitute our VSRDB framework, denoted as +FMA-Net. Specifically, our proposed FGDF enables precise estimation of both +spatio-temporally-variant degradation and restoration kernels that are aware of +motion trajectories through sophisticated motion representation learning. +Compared to conventional dynamic filtering, the FGDF enables the FMA-Net to +effectively handle large motions in the VSRDB task. Additionally, the stacked FRMA +blocks trained with our novel temporal anchor (TA) loss, which temporally +anchors and sharpens features, refine features in a coarse-to-fine manner +through iterative updates. Extensive experiments demonstrate the superiority of +the proposed FMA-Net over state-of-the-art methods in terms of both +quantitative and qualitative quality. Codes and pre-trained models are +available at: https://kaist-viclab.github.io/fmanet-site +
+
+ comment: CVPR2024 (camera-ready version). The last two authors are + co-corresponding authors. Please visit our project page at + https://kaist-viclab.github.io/fmanet-site +
+
+
+
+
+ + ♻ ☆ HallE-Control: Controlling Object Hallucination in Large Multimodal + Models + + +
+ Current Large Multimodal Models (LMMs) achieve remarkable progress, yet there +remains significant uncertainty regarding their ability to accurately apprehend +visual details, that is, in performing detailed captioning. To address this, we +introduce $\textit{CCEval}$, a GPT-4 assisted evaluation method for detailed +captioning. Interestingly, while LMMs demonstrate minimal object existence +hallucination in existing VQA benchmarks, our proposed evaluation reveals +continued susceptibility to such hallucinations. In this paper, we make the +first attempt to investigate such hallucination from different aspects, +including image resolution, the language decoder size, and instruction data +amount, quality, granularity. Our findings underscore the unwarranted inference +when the language description includes details at a finer object granularity +than what the vision module can ground or verify, thus inducing hallucination. +To control such hallucinations, we further attribute the reliability of +captioning to contextual knowledge (involving only contextually grounded +objects) and parametric knowledge (containing inferred objects by the model). +Thus, we introduce $\textit{HallE-Control}$, a controllable LMM in terms of +$\textbf{Hall}$ucination in object $\textbf{E}$xistence. HallE-Control can +condition the captioning to shift between (i) exclusively depicting contextual +knowledge for grounded objects and (ii) blending it with parametric knowledge +to imagine inferred objects. Our method reduces hallucination by 44% compared +to LLaVA$_{7B}$ and maintains the object coverage. + +
+
+ comment: Our code is publicly available at + https://github.com/bronyayang/HallE_Control +
+
+
+
+
+ + ♻ ☆ Data-Efficient Multimodal Fusion on a Single GPU CVPR 2024 + + +
+ The goal of multimodal alignment is to learn a single latent space that is +shared between multimodal inputs. The most powerful models in this space have +been trained using massive datasets of paired inputs and large-scale +computational resources, making them prohibitively expensive to train in many +practical scenarios. We surmise that existing unimodal encoders pre-trained on +large amounts of unimodal data should provide an effective bootstrap to create +multimodal models from unimodal ones at much lower costs. We therefore propose +FuseMix, a multimodal augmentation scheme that operates on the latent spaces of +arbitrary pre-trained unimodal encoders. Using FuseMix for multimodal +alignment, we achieve competitive performance -- and in certain cases +outperform state-of-the-art methods -- in both image-text and audio-text +retrieval, with orders of magnitude less compute and data: for example, we +outperform CLIP on the Flickr30K text-to-image retrieval task with $\sim \! +600\times$ fewer GPU days and $\sim \! 80\times$ fewer image-text pairs. +Additionally, we show how our method can be applied to convert pre-trained +text-to-image generative models into audio-to-image ones. Code is available at: +https://github.com/layer6ai-labs/fusemix. +
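+
+ A sketch of mixup-style augmentation applied to pre-computed latents from frozen unimodal encoders, in the spirit of FuseMix; where the mixing happens, the Beta parameter, and the training loss in the paper may differ from this toy code.
+
+ ```python
+ import numpy as np
+
+ def fusemix_batch(img_emb, txt_emb, alpha=1.0, rng=np.random.default_rng(0)):
+     """img_emb, txt_emb: (N, D_img), (N, D_txt) paired embeddings from frozen
+     encoders. The same convex combination is applied to both modalities so the
+     mixed pairs stay aligned."""
+     n = img_emb.shape[0]
+     lam = rng.beta(alpha, alpha, size=(n, 1))
+     perm = rng.permutation(n)
+     return (lam * img_emb + (1 - lam) * img_emb[perm],
+             lam * txt_emb + (1 - lam) * txt_emb[perm])
+
+ mixed_img, mixed_txt = fusemix_batch(np.random.randn(256, 512), np.random.randn(256, 768))
+ ```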
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ QN-Mixer: A Quasi-Newton MLP-Mixer Model for Sparse-View CT + Reconstruction CVPR 2024 + + +
+ Inverse problems span across diverse fields. In medical contexts, computed +tomography (CT) plays a crucial role in reconstructing a patient's internal +structure, presenting challenges due to artifacts caused by inherently +ill-posed inverse problems. Previous research advanced image quality via +post-processing and deep unrolling algorithms but faces challenges, such as +extended convergence times with ultra-sparse data. Despite enhancements, +resulting images often show significant artifacts, limiting their effectiveness +for real-world diagnostic applications. We aim to explore deep second-order +unrolling algorithms for solving imaging inverse problems, emphasizing their +faster convergence and lower time complexity compared to common first-order +methods like gradient descent. In this paper, we introduce QN-Mixer, an +algorithm based on the quasi-Newton approach. We use learned parameters through +the BFGS algorithm and introduce Incept-Mixer, an efficient neural architecture +that serves as a non-local regularization term, capturing long-range +dependencies within images. To address the computational demands typically +associated with quasi-Newton algorithms that require full Hessian matrix +computations, we present a memory-efficient alternative. Our approach +intelligently downsamples gradient information, significantly reducing +computational requirements while maintaining performance. The approach is +validated through experiments on the sparse-view CT problem, involving various +datasets and scanning protocols, and is compared with post-processing and deep +unrolling state-of-the-art approaches. Our method outperforms existing +approaches and achieves state-of-the-art performance in terms of SSIM and PSNR, +all while reducing the number of unrolling iterations required. + +
+
+ comment: Accepted at CVPR 2024. Project page: + https://towzeur.github.io/QN-Mixer/ +
+
+
+
+
+ + ♻ ☆ Learning from One Continuous Video Stream CVPR + + +
+ We introduce a framework for online learning from a single continuous video +stream -- the way people and animals learn, without mini-batches, data +augmentation or shuffling. This poses great challenges given the high +correlation between consecutive video frames, and there is very little prior +work on this setting. Our framework allows us to do a first deep dive into the topic and +includes a collection of streams and tasks composed from two existing video +datasets, plus methodology for performance evaluation that considers both +adaptation and generalization. We employ pixel-to-pixel modelling as a +practical and flexible way to switch between pre-training and single-stream +evaluation as well as between arbitrary tasks, without ever requiring changes +to models and always using the same pixel loss. Equipped with this framework, we +obtained large single-stream learning gains from pre-training with a novel +family of future prediction tasks, found that momentum hurts, and that the pace +of weight updates matters. The combination of these insights leads to matching +the performance of IID learning with batch size 1, when using the same +architecture and without costly replay buffers. +
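+
+ A minimal sketch of single-stream learning with batch size 1 and a tunable update pace (gradient accumulation over consecutive frames, no replay buffer); it mirrors the general setup described above, not the paper's exact recipe, and the toy model and loss are placeholders.
+
+ ```python
+ import torch
+
+ def stream_learning_step(model, optimizer, frame_t, frame_tp1, loss_fn, state, update_every=8):
+     """One online step on a pixel-to-pixel future-prediction objective: gradients
+     are accumulated over `update_every` consecutive frames before each weight update."""
+     loss = loss_fn(model(frame_t.unsqueeze(0)), frame_tp1.unsqueeze(0))
+     loss.backward()
+     state["count"] += 1
+     if state["count"] % update_every == 0:
+         optimizer.step()
+         optimizer.zero_grad()
+     return loss.item()
+
+ model = torch.nn.Conv2d(3, 3, 3, padding=1)
+ opt = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.0)   # the abstract reports momentum hurts here
+ state = {"count": 0}
+ stream_learning_step(model, opt, torch.rand(3, 64, 64), torch.rand(3, 64, 64),
+                      torch.nn.functional.mse_loss, state)
+ ```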
+
+ comment: CVPR camera ready version +
+
+
+
+
+ + ♻ ☆ Residual-based Language Models are Free Boosters for Biomedical Imaging + + +
+ In this study, we uncover the unexpected efficacy of residual-based large +language models (LLMs) as part of encoders for biomedical imaging tasks, a +domain traditionally devoid of language or textual data. The approach diverges +from established methodologies by utilizing a frozen transformer block, +extracted from pre-trained LLMs, as an innovative encoder layer for the direct +processing of visual tokens. This strategy represents a significant departure +from the standard multi-modal vision-language frameworks, which typically hinge +on language-driven prompts and inputs. We found that these LLMs could boost +performance across a spectrum of biomedical imaging applications, including +both 2D and 3D visual classification tasks, serving as plug-and-play boosters. +More interestingly, as a byproduct, we found that the proposed framework +achieved superior performance, setting new state-of-the-art results on +extensive, standardized datasets in MedMNIST-2D and 3D. Through this work, we +aim to open new avenues for employing LLMs in biomedical imaging and enriching +the understanding of their potential in this specialized domain. + +
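+
+ The wiring described above can be pictured as a frozen transformer block inserted residually over visual tokens. In the sketch below a plain nn.TransformerEncoderLayer stands in for a block extracted from a pre-trained LLM, and all dimensions are arbitrary; it illustrates only the plug-and-play residual placement, not the paper's implementation.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class FrozenBlockBooster(nn.Module):
+     def __init__(self, vis_dim=192, llm_dim=768):
+         super().__init__()
+         self.up = nn.Linear(vis_dim, llm_dim)
+         self.block = nn.TransformerEncoderLayer(llm_dim, nhead=8, batch_first=True)
+         for p in self.block.parameters():          # keep the "LLM" block frozen
+             p.requires_grad = False
+         self.down = nn.Linear(llm_dim, vis_dim)
+
+     def forward(self, visual_tokens):              # (B, N, vis_dim)
+         return visual_tokens + self.down(self.block(self.up(visual_tokens)))
+
+ print(FrozenBlockBooster()(torch.randn(2, 49, 192)).shape)   # torch.Size([2, 49, 192])
+ ```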
+
+
+
+
+ + ♻ ☆ Advances in Kidney Biopsy Lesion Assessment through Dense Instance + Segmentation + + +
+ Renal biopsies are the gold standard for diagnosis of kidney diseases. Lesion +scores made by renal pathologists are semi-quantitative and exhibit high +inter-observer variability. Automating lesion classification within segmented +anatomical structures can provide decision support in quantification analysis +and reduce the inter-observer variability. Nevertheless, classifying lesions in +regions-of-interest (ROIs) is clinically challenging due to (a) a large number +of densely packed anatomical objects (up to 1000), (b) class imbalance across +different compartments (at least 3), (c) significant variation in object scales +(i.e. sizes and shapes), and (d) the presence of multi-label lesions per +anatomical structure. Existing models lack the capacity to address these +complexities efficiently and generically. This paper presents \textbf{a +generalized technical solution} for large-scale, multi-source datasets with +diverse lesions. Our approach utilizes two sub-networks: dense instance +segmentation and lesion classification. We introduce \textbf{DiffRegFormer}, an +end-to-end dense instance segmentation model designed for multi-class, +multi-scale objects within ROIs. Combining diffusion models, transformers, and +RCNNs, DiffRegFormer efficiently recognizes over 500 objects across three +anatomical classes (glomeruli, tubuli, arteries) within ROIs on a single NVIDIA +GeForce RTX 3090 GPU. On a dataset of 303 ROIs (from 148 Jones' silver-stained +renal WSIs), it outperforms state-of-the-art models, achieving AP of 52.1\% +(detection) and 46.8\% (segmentation). Our lesion classification sub-network +achieves 89.2\% precision and 64.6\% recall on 21889 object patches (from the +303 ROIs). Importantly, the model demonstrates direct domain transfer to +PAS-stained WSIs without fine-tuning. +
+
+ comment: 16 pages, 15 figures, 6 tables, Journal +
+
+
+
+
+ + ♻ ☆ DecentNeRFs: Decentralized Neural Radiance Fields from Crowdsourced + Images + + +
+ Neural radiance fields (NeRFs) show potential for transforming images +captured worldwide into immersive 3D visual experiences. However, most of this +captured visual data remains siloed in our camera rolls as these images contain +personal details. Even if made public, the problem of learning 3D +representations of billions of scenes captured daily in a centralized manner is +computationally intractable. Our approach, DecentNeRF, is the first attempt at +decentralized, crowd-sourced NeRFs that require $\sim 10^4\times$ less server +computing for a scene than a centralized approach. Instead of sending the raw +data, our approach requires users to send a 3D representation, distributing the +high computation cost of training centralized NeRFs between the users. It +learns photorealistic scene representations by decomposing users' 3D views into +personal and global NeRFs and a novel optimally weighted aggregation of only +the latter. We validate the advantage of our approach to learn NeRFs with +photorealism and minimal server computation cost on structured synthetic and +real-world photo tourism datasets. We further analyze how secure aggregation of +global NeRFs in DecentNeRF minimizes the undesired reconstruction of personal +content by the server. + +
+
+
+
+
+ + ♻ ☆ Hybrid quantum image classification and federated learning for hepatic + steatosis diagnosis + + +
+ In the realm of liver transplantation, accurately determining hepatic +steatosis levels is crucial. Recognizing the essential need for improved +diagnostic precision, particularly for optimizing diagnosis time by swiftly +handling easy-to-solve cases and allowing the expert time to focus on more +complex cases, this study aims to develop cutting-edge algorithms that enhance +the classification of liver biopsy images. Additionally, the challenge of +maintaining data privacy arises when creating automated algorithmic solutions, +as sharing patient data between hospitals is restricted, further complicating +the development and validation process. This research tackles diagnostic +accuracy by leveraging novel techniques from the rapidly evolving field of +quantum machine learning, known for their superior generalization abilities. +Concurrently, it addresses privacy concerns through the implementation of +privacy-conscious collaborative machine learning with federated learning. We +introduce a hybrid quantum neural network model that leverages real-world +clinical data to assess non-alcoholic liver steatosis accurately. This model +achieves an image classification accuracy of 97%, surpassing traditional +methods by 1.8%. Moreover, by employing a federated learning approach that +allows data from different clients to be shared while ensuring privacy, we +maintain an accuracy rate exceeding 90%. This initiative marks a significant +step towards a scalable, collaborative, efficient, and dependable computational +framework that aids clinical pathologists in their daily diagnostic tasks. + +
+
+ comment: 13 pages, 3 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Wasserstein Distortion: Unifying Fidelity and Realism + + +
+ We introduce a distortion measure for images, Wasserstein distortion, that +simultaneously generalizes pixel-level fidelity on the one hand and realism or +perceptual quality on the other. We show how Wasserstein distortion reduces to +a pure fidelity constraint or a pure realism constraint under different +parameter choices and discuss its metric properties. Pairs of images that are +close under Wasserstein distortion illustrate its utility. In particular, we +generate random textures that have high fidelity to a reference texture in one +location of the image and smoothly transition to an independent realization of +the texture as one moves away from this point. Wasserstein distortion attempts +to generalize and unify prior work on texture generation, image realism and +distortion, and models of the early human visual system, in the form of an +optimizable metric in the mathematical sense. + +
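+
+ A very rough toy of the fidelity/realism trade-off on raw pixels: comparing aligned pixels gives a pure fidelity term, while comparing local value distributions via the empirical 1-D Wasserstein distance gives a realism-like term. The paper's Wasserstein distortion is defined over feature statistics with spatially varying pooling, so the code below is only a loose illustration.
+
+ ```python
+ import numpy as np
+
+ def wasserstein_1d(a, b):
+     """Empirical 1-D Wasserstein-1 distance between two equal-size samples."""
+     return np.abs(np.sort(a) - np.sort(b)).mean()
+
+ def toy_distortion(x, y, patch=8, pooled=False):
+     """pooled=False: per-pixel fidelity (MSE). pooled=True: average 1-D Wasserstein
+     distance between the pixel-value distributions of corresponding patches."""
+     if not pooled:
+         return float(np.mean((x - y) ** 2))
+     vals = [wasserstein_1d(x[i:i+patch, j:j+patch].ravel(), y[i:i+patch, j:j+patch].ravel())
+             for i in range(0, x.shape[0] - patch + 1, patch)
+             for j in range(0, x.shape[1] - patch + 1, patch)]
+     return float(np.mean(vals))
+
+ rng = np.random.default_rng(0)
+ x, y = rng.random((32, 32)), rng.random((32, 32))
+ print(toy_distortion(x, y), toy_distortion(x, y, pooled=True))
+ ```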
+
+
+
+
+ + ♻ ☆ GOTCHA: Real-Time Video Deepfake Detection via Challenge-Response + + +
+ With the rise of AI-enabled Real-Time Deepfakes (RTDFs), the integrity of +online video interactions has become a growing concern. RTDFs have now made it +feasible to replace an imposter's face with their victim in live video +interactions. Such advancement in deepfakes also coaxes detection to rise to +the same standard. However, existing deepfake detection techniques are +asynchronous and hence ill-suited for RTDFs. To bridge this gap, we propose a +challenge-response approach that establishes authenticity in live settings. We +focus on talking-head style video interaction and present a taxonomy of +challenges that specifically target inherent limitations of RTDF generation +pipelines. We evaluate representative examples from the taxonomy by collecting +a unique dataset comprising eight challenges, which consistently and visibly +degrades the quality of state-of-the-art deepfake generators. These results are +corroborated both by humans and a new automated scoring function, leading to +88.6% and 80.1% AUC, respectively. The findings underscore the promising +potential of challenge-response systems for explainable and scalable real-time +deepfake detection in practical scenarios. We provide access to data and code +at https://github.com/mittalgovind/GOTCHA-Deepfakes + +
+
+ comment: 20 pages, 19 figures, Code and data released +
+
+
+
+
+ + ♻ ☆ Federated attention consistent learning models for prostate cancer + diagnosis and Gleason grading + + +
+ Artificial intelligence (AI) holds significant promise in transforming +medical imaging, enhancing diagnostics, and refining treatment strategies. +However, the reliance on extensive multicenter datasets for training AI models +poses challenges due to privacy concerns. Federated learning provides a +solution by facilitating collaborative model training across multiple centers +without sharing raw data. This study introduces a federated +attention-consistent learning (FACL) framework to address challenges associated +with large-scale pathological images and data heterogeneity. FACL enhances +model generalization by maximizing attention consistency between local clients +and the server model. To ensure privacy and validate robustness, we +incorporated differential privacy by introducing noise during parameter +transfer. We assessed the effectiveness of FACL in cancer diagnosis and Gleason +grading tasks using 19,461 whole-slide images of prostate cancer from multiple +centers. In the diagnosis task, FACL achieved an area under the curve (AUC) of +0.9718, outperforming seven centers with an average AUC of 0.9499 when +categories are relatively balanced. For the Gleason grading task, FACL attained +a Kappa score of 0.8463, surpassing the average Kappa score of 0.7379 from six +centers. In conclusion, FACL offers a robust, accurate, and cost-effective AI +training model for prostate cancer pathology while maintaining effective data +safeguards. + +
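+
+ The differential-privacy step mentioned above (noise during parameter transfer) can be sketched generically as clipping each client update and adding Gaussian noise before it is sent to the server; the constants and the exact mechanism used in FACL are not taken from the paper.
+
+ ```python
+ import numpy as np
+
+ def dp_noised_update(local_params, global_params, clip_norm=1.0, noise_std=0.01,
+                      rng=np.random.default_rng(0)):
+     """local_params / global_params: dicts of name -> ndarray. Returns a clipped,
+     noised model update suitable for server-side aggregation."""
+     update = {k: local_params[k] - global_params[k] for k in local_params}
+     total = np.sqrt(sum(np.sum(v ** 2) for v in update.values()))
+     scale = min(1.0, clip_norm / (total + 1e-12))
+     return {k: v * scale + rng.normal(0.0, noise_std, size=v.shape)
+             for k, v in update.items()}
+
+ global_w = {"w": np.zeros((4, 4))}
+ local_w = {"w": np.ones((4, 4))}
+ noised = dp_noised_update(local_w, global_w)
+ ```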
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ Toward a Surgeon-in-the-Loop Ophthalmic Robotic Apprentice using + Reinforcement and Imitation Learning + + +
+ Robotic-assisted surgical systems have demonstrated significant potential in +enhancing surgical precision and minimizing human errors. However, existing +systems lack the ability to accommodate the unique preferences and requirements +of individual surgeons. Additionally, they primarily focus on general surgeries +(e.g., laparoscopy) and are not suitable for highly precise microsurgeries, +such as ophthalmic procedures. Thus, we propose a simulation-based image-guided +approach for surgeon-centered autonomous agents that can adapt to the +individual surgeon's skill level and preferred surgical techniques during +ophthalmic cataract surgery. Our approach utilizes a simulated environment to +train reinforcement and imitation learning agents guided by image data to +perform all tasks of the incision phase of cataract surgery. By integrating the +surgeon's actions and preferences into the training process with the +surgeon-in-the-loop, our approach enables the robot to implicitly learn and +adapt to the individual surgeon's unique approach through demonstrations. This +results in a more intuitive and personalized surgical experience for the +surgeon. Simultaneously, it ensures consistent performance for the autonomous +robotic apprentice. We define and evaluate the effectiveness of our approach +using our proposed metrics; and highlight the trade-off between a generic agent +and a surgeon-centered adapted agent. Moreover, our approach has the potential +to extend to other ophthalmic surgical procedures, opening the door to a new +generation of surgeon-in-the-loop autonomous surgical robots. We provide an +open-source simulation framework for future development and reproducibility. + +
+
+
+
+
+ + ♻ ☆ TUNeS: A Temporal U-Net with Self-Attention for Video-based Surgical + Phase Recognition + + +
+ To enable context-aware computer assistance in the operating room of the +future, cognitive systems need to understand automatically which surgical phase +is being performed by the medical team. The primary source of information for +surgical phase recognition is typically video, which presents two challenges: +extracting meaningful features from the video stream and effectively modeling +temporal information in the sequence of visual features. For temporal modeling, +attention mechanisms have gained popularity due to their ability to capture +long-range dependencies. In this paper, we explore design choices for attention +in existing temporal models for surgical phase recognition and propose a novel +approach that uses attention more effectively and does not require hand-crafted +constraints: TUNeS, an efficient and simple temporal model that incorporates +self-attention at the core of a convolutional U-Net structure. In addition, we +propose to train the feature extractor, a standard CNN, together with an LSTM +on preferably long video segments, i.e., with long temporal context. In our +experiments, almost all temporal models performed better on top of feature +extractors that were trained with longer temporal context. On these +contextualized features, TUNeS achieves state-of-the-art results on the +Cholec80 and AutoLaparo datasets. + +
+
+ comment: Major revision: comparison to Temporal U-Transformer +
+
+
+
+
+ + ♻ ☆ STREAM: Spatio-TempoRal Evaluation and Analysis Metric for Video + Generative Models ICLR 2024 + + +
+ Image generative models have made significant progress in generating +realistic and diverse images, supported by comprehensive guidance from various +evaluation metrics. However, current video generative models struggle to +generate even short video clips, with limited tools that provide insights for +improvements. Current video evaluation metrics are simple adaptations of image +metrics by switching the embeddings with video embedding networks, which may +underestimate the unique characteristics of video. Our analysis reveals that +the widely used Frechet Video Distance (FVD) has a stronger emphasis on the +spatial aspect than the temporal naturalness of video and is inherently +constrained by the input size of the embedding networks used, limiting it to 16 +frames. Additionally, it demonstrates considerable instability and diverges +from human evaluations. To address the limitations, we propose STREAM, a new +video evaluation metric uniquely designed to independently evaluate spatial and +temporal aspects. This feature allows comprehensive analysis and evaluation of +video generative models from various perspectives, unconstrained by video +length. We provide analytical and experimental evidence demonstrating that +STREAM provides an effective evaluation tool for both visual and temporal +quality of videos, offering insights into areas for improvement for video +generative models. To the best of our knowledge, STREAM is the first evaluation +metric that can separately assess the temporal and spatial aspects of videos. +Our code is available at https://github.com/pro2nit/STREAM. +
+
+ comment: Our work is accepted to ICLR 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 222 + +
+
+
+ + ☆ Real Acoustic Fields: An Audio-Visual Room Acoustics Dataset and + Benchmark CVPR 2024 + + +
+ We present a new dataset called Real Acoustic Fields (RAF) that captures real +acoustic room data from multiple modalities. The dataset includes high-quality +and densely captured room impulse response data paired with multi-view images, +and precise 6DoF pose tracking data for sound emitters and listeners in the +rooms. We used this dataset to evaluate existing methods for novel-view +acoustic synthesis and impulse response generation which previously relied on +synthetic data. In our evaluation, we thoroughly assessed existing audio and +audio-visual models against multiple criteria and proposed settings to enhance +their performance on real-world data. We also conducted experiments to +investigate the impact of incorporating visual data (i.e., images and depth) +into neural acoustic field models. Additionally, we demonstrated the +effectiveness of a simple sim2real approach, where a model is pre-trained with +simulated data and fine-tuned with sparse real-world data, resulting in +significant improvements in the few-shot learning approach. RAF is the first +dataset to provide densely captured room acoustic data, making it an ideal +resource for researchers working on audio and audio-visual neural acoustic +field modeling techniques. Demos and datasets are available on our project +page: https://facebookresearch.github.io/real-acoustic-fields/ + +
+
+ comment: Accepted to CVPR 2024. Project site: + https://facebookresearch.github.io/real-acoustic-fields/ +
+
+
+
+
+ + ☆ MetaCap: Meta-learning Priors from Multi-View Imagery for Sparse-view + Human Performance Capture and Rendering + + +
+ Faithful human performance capture and free-view rendering from sparse RGB +observations are a long-standing problem in Vision and Graphics. The main +challenges are the lack of observations and the inherent ambiguities of the +setting, e.g. occlusions and depth ambiguity. As a result, radiance fields, +which have shown great promise in capturing high-frequency appearance and +geometry details in dense setups, perform poorly when naïvely supervising +them on sparse camera views, as the field simply overfits to the sparse-view +inputs. To address this, we propose MetaCap, a method for efficient and +high-quality geometry recovery and novel view synthesis given very sparse or +even a single view of the human. Our key idea is to meta-learn the radiance +field weights solely from potentially sparse multi-view videos, which can serve +as a prior when fine-tuning them on sparse imagery depicting the human. This +prior provides a good network weight initialization, thereby effectively +addressing ambiguities in sparse-view capture. Due to the articulated structure +of the human body and motion-induced surface deformations, learning such a +prior is non-trivial. Therefore, we propose to meta-learn the field weights in +a pose-canonicalized space, which reduces the spatial feature range and makes +feature learning more effective. Consequently, one can fine-tune our field +parameters to quickly generalize to unseen poses, novel illumination conditions +as well as novel and sparse (even monocular) camera views. For evaluating our +method under different scenarios, we collect a new dataset, WildDynaCap, which +contains subjects captured in both a dense camera dome and in-the-wild sparse +camera rigs, and demonstrate superior results compared to recent +state-of-the-art methods on both the public and WildDynaCap datasets. +
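+
+ Meta-learning network weights to serve as a prior can be sketched with a Reptile-style outer loop; the stand-in below uses a plain MLP and a made-up task loader purely to illustrate learning an initialization from many capture sequences, and is not MetaCap's pose-canonicalized procedure.
+
+ ```python
+ import torch
+
+ def reptile_meta_step(model, make_task_loader, inner_steps=5, inner_lr=1e-3,
+                       meta_lr=0.1, num_tasks=4):
+     """Adapt the model on each task, then move the meta-weights a fraction of the
+     way toward the adapted weights (Reptile). `make_task_loader()` must yield
+     (input, target) batches for one task."""
+     meta_weights = {k: v.detach().clone() for k, v in model.state_dict().items()}
+     for _ in range(num_tasks):
+         model.load_state_dict(meta_weights)
+         opt = torch.optim.SGD(model.parameters(), lr=inner_lr)
+         loader = make_task_loader()
+         for _ in range(inner_steps):
+             x, y = next(loader)
+             opt.zero_grad()
+             torch.nn.functional.mse_loss(model(x), y).backward()
+             opt.step()
+         with torch.no_grad():                      # outer (meta) update
+             for k, v in model.state_dict().items():
+                 meta_weights[k] += meta_lr * (v - meta_weights[k])
+     model.load_state_dict(meta_weights)
+
+ field = torch.nn.Sequential(torch.nn.Linear(3, 64), torch.nn.ReLU(), torch.nn.Linear(64, 4))
+
+ def make_task_loader():                            # one "task" = one hypothetical capture sequence
+     while True:
+         yield torch.rand(32, 3), torch.rand(32, 4)
+
+ reptile_meta_step(field, make_task_loader)
+ ```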
+
+ comment: Project page: https://vcai.mpi-inf.mpg.de/projects/MetaCap/ +
+
+
+
+
+ + ☆ Benchmarking Object Detectors with COCO: A New Path Forward + + +
+ The Common Objects in Context (COCO) dataset has been instrumental in +benchmarking object detectors over the past decade. Like every dataset, COCO +contains subtle errors and imperfections stemming from its annotation +procedure. With the advent of high-performing models, we ask whether these +errors of COCO are hindering its utility in reliably benchmarking further +progress. In search for an answer, we inspect thousands of masks from COCO +(2017 version) and uncover different types of errors such as imprecise mask +boundaries, non-exhaustively annotated instances, and mislabeled masks. Due to +the prevalence of COCO, we choose to correct these errors to maintain +continuity with prior research. We develop COCO-ReM (Refined Masks), a cleaner +set of annotations with visibly better mask quality than COCO-2017. We evaluate +fifty object detectors and find that models that predict visually sharper masks +score higher on COCO-ReM, affirming that they were being incorrectly penalized +due to errors in COCO-2017. Moreover, our models trained using COCO-ReM +converge faster and score higher than their larger variants trained using +COCO-2017, highlighting the importance of data quality in improving object +detectors. With these findings, we advocate using COCO-ReM for future object +detection research. Our dataset is available at https://cocorem.xyz + +
+
+ comment: Technical report. Dataset website: https://cocorem.xyz and code: + https://github.com/kdexd/coco-rem +
+
+
+
+
+ + ☆ ObjectDrop: Bootstrapping Counterfactuals for Photorealistic Object + Removal and Insertion + + +
+ Diffusion models have revolutionized image editing but often generate images +that violate physical laws, particularly the effects of objects on the scene, +e.g., occlusions, shadows, and reflections. By analyzing the limitations of +self-supervised approaches, we propose a practical solution centered on a +"counterfactual" dataset. Our method involves capturing a scene before and +after removing a single object, while minimizing other changes. By fine-tuning +a diffusion model on this dataset, we are able to not only remove objects but +also their effects on the scene. However, we find that applying this approach +for photorealistic object insertion requires an impractically large dataset. To +tackle this challenge, we propose bootstrap supervision; leveraging our object +removal model trained on a small counterfactual dataset, we synthetically +expand this dataset considerably. Our approach significantly outperforms prior +methods in photorealistic object removal and insertion, particularly at +modeling the effects of objects on the scene. +
+
+
+
+
+ + ☆ Garment3DGen: 3D Garment Stylization and Texture Generation + + +
+ We introduce Garment3DGen, a new method to synthesize 3D garment assets from a +base mesh given a single input image as guidance. Our proposed approach allows +users to generate 3D textured clothes based on both real and synthetic images, +such as those generated by text prompts. The generated assets can be directly +draped and simulated on human bodies. First, we leverage the recent progress of +image-to-3D diffusion methods to generate 3D garment geometries. However, since +these geometries cannot be utilized directly for downstream tasks, we propose +to use them as pseudo ground-truth and set up a mesh deformation optimization +procedure that deforms a base template mesh to match the generated 3D target. +Second, we introduce carefully designed losses that allow the input base mesh +to freely deform towards the desired target, yet preserve mesh quality and +topology such that they can be simulated. Finally, a texture estimation module +generates high-fidelity texture maps that are globally and locally consistent +and faithfully capture the input guidance, allowing us to render the generated +3D assets. With Garment3DGen, users can generate the textured 3D garment of +their choice without the need for artist intervention. One can provide a textual +prompt describing the garment they desire to generate a simulation-ready 3D +asset. We present a plethora of quantitative and qualitative comparisons on +various assets, both real and generated, and provide use-cases of how one can +generate simulation-ready 3D garments. +
+
+ comment: Project Page: https://nsarafianos.github.io/garment3dgen +
+
+
+
+
+ + ☆ Mini-Gemini: Mining the Potential of Multi-modality Vision Language + Models + + +
+ In this work, we introduce Mini-Gemini, a simple and effective framework +enhancing multi-modality Vision Language Models (VLMs). Despite the +advancements in VLMs facilitating basic visual dialog and reasoning, a +performance gap persists compared to advanced models like GPT-4 and Gemini. We +try to narrow the gap by mining the potential of VLMs for better performance +and any-to-any workflow from three aspects, i.e., high-resolution visual +tokens, high-quality data, and VLM-guided generation. To enhance visual tokens, +we propose to utilize an additional visual encoder for high-resolution +refinement without increasing the visual token count. We further construct a +high-quality dataset that promotes precise image comprehension and +reasoning-based generation, expanding the operational scope of current VLMs. In +general, Mini-Gemini further mines the potential of VLMs and empowers current +frameworks with image understanding, reasoning, and generation simultaneously. +Mini-Gemini supports a series of dense and MoE Large Language Models (LLMs) +from 2B to 34B. It is demonstrated to achieve leading performance in several +zero-shot benchmarks and even surpasses the developed private models. Code and +models are available at https://github.com/dvlab-research/MiniGemini. + +
+
+ comment: Code and models are available at + https://github.com/dvlab-research/MiniGemini +
+
+
+
+
+ + ☆ Duolando: Follower GPT with Off-Policy Reinforcement Learning for Dance + Accompaniment ICLR 2024 + + +
+ We introduce a novel task within the field of 3D dance generation, termed +dance accompaniment, which necessitates the generation of responsive movements +from a dance partner, the "follower", synchronized with the lead dancer's +movements and the underlying musical rhythm. Unlike existing solo or group +dance generation tasks, a duet dance scenario entails a heightened degree of +interaction between the two participants, requiring delicate coordination in +both pose and position. To support this task, we first build a large-scale and +diverse duet interactive dance dataset, DD100, by recording about 117 minutes +of professional dancers' performances. To address the challenges inherent in +this task, we propose a GPT-based model, Duolando, which autoregressively +predicts the subsequent tokenized motion conditioned on the coordinated +information of the music, the leader's and the follower's movements. To further +enhance the GPT's capabilities of generating stable results on unseen +conditions (music and leader motions), we devise an off-policy reinforcement +learning strategy that allows the model to explore viable trajectories from +out-of-distribution samplings, guided by human-defined rewards. Based on the +collected dataset and proposed method, we establish a benchmark with several +carefully designed metrics. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ ECoDepth: Effective Conditioning of Diffusion Models for Monocular Depth + Estimation CVPR + + +
+ In the absence of parallax cues, a learning-based single image depth +estimation (SIDE) model relies heavily on shading and contextual cues in the +image. While this simplicity is attractive, it is necessary to train such +models on large and varied datasets, which are difficult to capture. It has +been shown that using embeddings from pre-trained foundational models, such as +CLIP, improves zero-shot transfer in several applications. Taking inspiration +from this, in our paper we explore the use of global image priors generated +from a pre-trained ViT model to provide more detailed contextual information. +We argue that the embedding vector from a ViT model, pre-trained on a large +dataset, captures greater relevant information for SIDE than the usual route of +generating pseudo image captions, followed by CLIP-based text embeddings. Based +on this idea, we propose a new SIDE model using a diffusion backbone which is +conditioned on ViT embeddings. Our proposed design establishes a new +state-of-the-art (SOTA) for SIDE on the NYUv2 dataset, achieving an Abs Rel error of +0.059 (14% improvement) compared to 0.069 by the current SOTA (VPD). On the +KITTI dataset, it achieves a Sq Rel error of 0.139 (2% improvement) compared to +0.142 by the current SOTA (GEDepth). For zero-shot transfer with a model +trained on NYUv2, we report mean relative improvement of (20%, 23%, 81%, 25%) +over NeWCRFs on (Sun-RGBD, iBims1, DIODE, HyperSim) datasets, compared to (16%, +18%, 45%, 9%) by ZoeDepth. The code is available at +https://github.com/Aradhye2002/EcoDepth. +
+
+ comment: Accepted at IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR) 2024 +
+
+
+
+
+ + ☆ Gamba: Marry Gaussian Splatting with Mamba for single view 3D + reconstruction + + +
+ We tackle the challenge of efficiently reconstructing a 3D asset from a +single image with growing demands for automated 3D content creation pipelines. +Previous methods primarily rely on Score Distillation Sampling (SDS) and Neural +Radiance Fields (NeRF). Despite their significant success, these approaches +encounter practical limitations due to lengthy optimization and considerable +memory usage. In this report, we introduce Gamba, an end-to-end amortized 3D +reconstruction model from single-view images, emphasizing two main insights: +(1) 3D representation: leveraging a large number of 3D Gaussians for an +efficient 3D Gaussian splatting process; (2) Backbone design: introducing a +Mamba-based sequential network that facilitates context-dependent reasoning and +linear scalability with the sequence (token) length, accommodating a +substantial number of Gaussians. Gamba incorporates significant advancements in +data preprocessing, regularization design, and training methodologies. We +assessed Gamba against existing optimization-based and feed-forward 3D +generation approaches using the real-world scanned OmniObject3D dataset. Here, +Gamba demonstrates competitive generation capabilities, both qualitatively and +quantitatively, while achieving remarkable speed, approximately 0.6 second on a +single NVIDIA A100 GPU. + +
+
+
+
+
+ + ☆ Object Pose Estimation via the Aggregation of Diffusion Features CVPR2024 + + +
+ Estimating the pose of objects from images is a crucial task of 3D scene +understanding, and recent approaches have shown promising results on very large +benchmarks. However, these methods experience a significant performance drop +when dealing with unseen objects. We believe that it results from the limited +generalizability of image features. To address this problem, we have an +in-depth analysis on the features of diffusion models, e.g. Stable Diffusion, +which hold substantial potential for modeling unseen objects. Based on this +analysis, we then innovatively introduce these diffusion features for object +pose estimation. To achieve this, we propose three distinct architectures that +can effectively capture and aggregate diffusion features of different +granularity, greatly improving the generalizability of object pose estimation. +Our approach outperforms the state-of-the-art methods by a considerable margin +on three popular benchmark datasets, LM, O-LM, and T-LESS. In particular, our +method achieves higher accuracy than the previous best arts on unseen objects: +98.2% vs. 93.5% on Unseen LM, 85.9% vs. 76.3% on Unseen O-LM, showing the +strong generalizability of our method. Our code is released at +https://github.com/Tianfu18/diff-feats-pose. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ SplatFace: Gaussian Splat Face Reconstruction Leveraging an Optimizable + Surface + + +
+ We present SplatFace, a novel Gaussian splatting framework designed for 3D +human face reconstruction without reliance on accurate pre-determined geometry. +Our method is designed to simultaneously deliver both high-quality novel view +rendering and accurate 3D mesh reconstructions. We incorporate a generic 3D +Morphable Model (3DMM) to provide a surface geometric structure, making it +possible to reconstruct faces with a limited set of input images. We introduce +a joint optimization strategy that refines both the Gaussians and the morphable +surface through a synergistic non-rigid alignment process. A novel distance +metric, splat-to-surface, is proposed to improve alignment by considering both +the Gaussian position and covariance. The surface information is also utilized +to incorporate a world-space densification process, resulting in superior +reconstruction quality. Our experimental analysis demonstrates that the +proposed method is competitive with both other Gaussian splatting techniques in +novel view synthesis and other 3D reconstruction methods in producing 3D face +meshes with high geometric precision. + +
+
+
+
+
+ + ☆ ImageNet-D: Benchmarking Neural Network Robustness on Diffusion + Synthetic Object CVPR 2024 + + +
+ We establish rigorous benchmarks for visual perception robustness. Synthetic +images such as ImageNet-C, ImageNet-9, and Stylized ImageNet provide a specific +type of evaluation over synthetic corruptions, backgrounds, and textures, yet +those robustness benchmarks are restricted in specified variations and have low +synthetic quality. In this work, we introduce generative models as a data source +for synthesizing hard images that benchmark deep models' robustness. Leveraging +diffusion models, we are able to generate images with more diversified +backgrounds, textures, and materials than any prior work, and we term this +benchmark ImageNet-D. Experimental results show that ImageNet-D results in a +significant accuracy drop for a range of vision models, from the standard ResNet +visual classifier to the latest foundation models like CLIP and MiniGPT-4, +significantly reducing their accuracy by up to 60\%. Our work suggests that +diffusion models can be an effective source to test vision models. The code and +dataset are available at https://github.com/chenshuang-zhang/imagenet_d. +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ ModaLink: Unifying Modalities for Efficient Image-to-PointCloud Place + Recognition + + +
+ Place recognition is an important task for robots and autonomous cars to +localize themselves and close loops in pre-built maps. While single-modal +sensor-based methods have shown satisfactory performance, cross-modal place +recognition, which retrieves images from a point-cloud database, remains a +challenging problem. Current cross-modal methods transform images into 3D +points using depth estimation for modality conversion, which are usually +computationally intensive and need expensive labeled data for depth +supervision. In this work, we introduce a fast and lightweight framework to +encode images and point clouds into place-distinctive descriptors. We propose +an effective Field of View (FoV) transformation module to convert point clouds +into a modality analogous to images. This module eliminates the necessity for +depth estimation and helps subsequent modules achieve real-time performance. We +further design a non-negative factorization-based encoder to extract mutually +consistent semantic features between point clouds and images. This encoder +yields more distinctive global descriptors for retrieval. Experimental results +on the KITTI dataset show that our proposed methods achieve state-of-the-art +performance while running in real time. Additional evaluation on the HAOMO +dataset covering a 17 km trajectory further shows the practical generalization +capabilities. We have released the implementation of our methods as open source +at: https://github.com/haomo-ai/ModaLink.git. +
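+
+ The FoV transformation can be pictured as projecting the point cloud into a range image restricted to a camera-like horizontal field of view. The sketch below is a generic LiDAR-to-range-image projection with made-up image size and FoV limits, not the module from the paper.
+
+ ```python
+ import numpy as np
+
+ def points_to_range_image(points, h=64, w=512, fov_h=np.deg2rad(90.0),
+                           fov_up=np.deg2rad(3.0), fov_down=np.deg2rad(-25.0)):
+     """points: (N, 3) x, y, z in the sensor frame. Returns an (h, w) image whose
+     pixel values are ranges, covering only the given horizontal FoV."""
+     x, y, z = points[:, 0], points[:, 1], points[:, 2]
+     r = np.linalg.norm(points, axis=1) + 1e-8
+     yaw, pitch = np.arctan2(y, x), np.arcsin(z / r)
+
+     keep = np.abs(yaw) < fov_h / 2                             # crop to the camera-like FoV
+     u = ((yaw[keep] + fov_h / 2) / fov_h * (w - 1)).astype(int)
+     v = ((fov_up - pitch[keep]) / (fov_up - fov_down) * (h - 1)).astype(int)
+     inside = (v >= 0) & (v < h)
+
+     img = np.zeros((h, w), dtype=np.float32)
+     img[v[inside], u[inside]] = r[keep][inside]                # store range as the "pixel" value
+     return img
+
+ pts = np.random.randn(10000, 3) * np.array([20.0, 20.0, 2.0])
+ print(points_to_range_image(pts).shape)                        # (64, 512)
+ ```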
+
+ comment: 8 pages, 11 figures, conference +
+
+
+
+
+ + ☆ Detection of subclinical atherosclerosis by image-based deep learning on + chest x-ray + + +
+ Aims. To develop a deep-learning based system for recognition of subclinical
+atherosclerosis on a plain frontal chest x-ray. Methods and Results. A
+deep-learning algorithm to predict the coronary artery calcium (CAC) score (the
+AI-CAC model) was developed on 460 chest x-rays (80% training cohort, 20%
+internal validation cohort) of primary prevention patients (58.4% male, median
+age 63 [51-74] years) with available paired chest x-ray and chest computed
+tomography (CT) indicated for any clinical reason and performed within 3
+months. The CAC score calculated on chest CT was used as ground truth. The
+model was validated on a temporally-independent cohort of 90 patients from the
+same institution (external validation). The diagnostic accuracy of the AI-CAC
+model assessed by the area under the curve (AUC) was the primary outcome.
+Overall, the median AI-CAC score was 35 (0-388) and 28.9% of patients had no
+AI-CAC. The AUC of the AI-CAC model to identify a CAC>0 was 0.90 in the
+internal validation cohort and 0.77 in the external validation cohort.
+Sensitivity was consistently above 92% in both cohorts. In the overall cohort
+(n=540), among patients with AI-CAC=0, a single ASCVD event occurred after 4.3
+years. Patients with AI-CAC>0 had significantly higher Kaplan-Meier estimates
+for ASCVD events (13.5% vs. 3.4%, log-rank=0.013). Conclusion. The AI-CAC model
+seems to accurately detect subclinical atherosclerosis on chest x-ray with high
+sensitivity, and to predict ASCVD events with a high negative predictive value.
+Adoption of the AI-CAC model to refine CV risk stratification or as an
+opportunistic screening tool requires prospective evaluation.
+
+
+ comment: Submitted to European Heart Journal - Cardiovascular Imaging.
+ Additional material has been added. 44 pages (30 main paper, 14 additional
+ material), 14 figures (5 main manuscript, 9 additional material)
+
+
+
+
+ + ☆ A vascular synthetic model for improved aneurysm segmentation and + detection via Deep Neural Networks + + +
+ We hereby present a full synthetic model, able to mimic the various
+constituents of the cerebral vascular tree: the cerebral arteries, the
+bifurcations and the intracranial aneurysms. By building this model, our goal
+was to provide a substantial dataset of brain arteries which could be used by a
+3D Convolutional Neural Network (CNN) to either segment or detect/recognize
+various vascular diseases (such as artery dissection/thrombosis) or even some
+portions of the cerebral vasculature, such as the bifurcations or aneurysms. In
+this study, we particularly focus on Intra-Cranial Aneurysm (ICA) detection and
+segmentation. Cerebral aneurysms most often occur on a particular structure of
+the vascular tree named the Circle of Willis. Various studies have been
+conducted to detect and monitor ICAs, and those based on Deep Learning (DL)
+achieve the best performance. Specifically, in this work, we propose a full
+synthetic 3D model able to mimic the brain vasculature as acquired by Magnetic
+Resonance Angiography (MRA), and more particularly the Time Of Flight (TOF)
+principle. Among the various MRI modalities, MRA-TOF provides a relatively good
+rendering of the blood vessels and is non-invasive (no contrast agent
+injection). Our model has been designed to simultaneously mimic the artery
+geometry, the ICA shape and the background noise. The geometry of the vascular
+tree is modeled by interpolation with 3D spline functions, and the statistical
+properties of the background MRI noise are collected from MRA acquisitions and
+reproduced within the model. In this work, we thoroughly describe the synthetic
+vasculature model, we build up a neural network designed for ICA segmentation
+and detection, and finally, we carry out an in-depth evaluation of the
+performance gain obtained through data augmentation with the synthetic model.
+
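+ A small scipy sketch of the kind of 3D spline interpolation the abstract
+describes, fitting a smooth curve through sparse centerline control points;
+the control points here are synthetic and purely illustrative.
+
+    import numpy as np
+    from scipy.interpolate import splprep, splev
+
+    # Synthetic control points roughly tracing a curved vessel segment.
+    t = np.linspace(0, np.pi, 8)
+    ctrl = np.stack([np.cos(t), np.sin(t), 0.2 * t])          # shape (3, 8): x, y, z
+
+    # Fit a cubic B-spline through the control points and resample it densely.
+    tck, _ = splprep(ctrl, s=0.0, k=3)                        # s=0 -> interpolating spline
+    u_dense = np.linspace(0, 1, 200)
+    centerline = np.stack(splev(u_dense, tck), axis=1)        # (200, 3) smooth centerline
+
+    print(centerline.shape, centerline[0], centerline[-1])
+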
+
+
+
+
+ + ☆ Enhancing Manufacturing Quality Prediction Models through the + Integration of Explainability Methods + + +
+ This research presents a method that utilizes explainability techniques to +amplify the performance of machine learning (ML) models in forecasting the +quality of milling processes, as demonstrated in this paper through a +manufacturing use case. The methodology entails the initial training of ML +models, followed by a fine-tuning phase where irrelevant features identified +through explainability methods are eliminated. This procedural refinement +results in performance enhancements, paving the way for potential reductions in +manufacturing costs and a better understanding of the trained ML models. This +study highlights the usefulness of explainability techniques in both explaining +and optimizing predictive models in the manufacturing realm. + +
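+ A sketch of the train / explain / prune / retrain loop described above, using
+permutation importance as the explainability signal; sklearn, the synthetic
+data, and the importance threshold are stand-ins for the paper's actual setup.
+
+    import numpy as np
+    from sklearn.datasets import make_regression
+    from sklearn.ensemble import RandomForestRegressor
+    from sklearn.inspection import permutation_importance
+    from sklearn.model_selection import train_test_split
+
+    X, y = make_regression(n_samples=500, n_features=20, n_informative=6, random_state=0)
+    X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
+
+    # 1) Initial training.
+    model = RandomForestRegressor(n_estimators=200, random_state=0).fit(X_tr, y_tr)
+    print("baseline R2:", round(model.score(X_te, y_te), 3))
+
+    # 2) Explain: rank features by permutation importance on held-out data.
+    imp = permutation_importance(model, X_te, y_te, n_repeats=10, random_state=0)
+    keep = imp.importances_mean > 0.01          # the threshold is an arbitrary choice here
+
+    # 3) Prune irrelevant features and retrain on the reduced feature set.
+    pruned = RandomForestRegressor(n_estimators=200, random_state=0).fit(X_tr[:, keep], y_tr)
+    print("kept", int(keep.sum()), "features, pruned R2:", round(pruned.score(X_te[:, keep], y_te), 3))
+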
+
+
+
+
+ + ☆ Towards Image Ambient Lighting Normalization + + +
+ Lighting normalization is a crucial but underexplored restoration task with +broad applications. However, existing works often simplify this task within the +context of shadow removal, limiting the light sources to one and +oversimplifying the scene, thus excluding complex self-shadows and restricting +surface classes to smooth ones. Although promising, such simplifications hinder +generalizability to more realistic settings encountered in daily use. In this +paper, we propose a new challenging task termed Ambient Lighting Normalization +(ALN), which enables the study of interactions between shadows, unifying image +restoration and shadow removal in a broader context. To address the lack of +appropriate datasets for ALN, we introduce the large-scale high-resolution +dataset Ambient6K, comprising samples obtained from multiple light sources and +including self-shadows resulting from complex geometries, which is the first of +its kind. For benchmarking, we select various mainstream methods and rigorously +evaluate them on Ambient6K. Additionally, we propose IFBlend, a novel strong +baseline that maximizes Image-Frequency joint entropy to selectively restore +local areas under different lighting conditions, without relying on shadow +localization priors. Experiments show that IFBlend achieves SOTA scores on +Ambient6K and exhibits competitive performance on conventional shadow removal +benchmarks compared to shadow-specific models with mask priors. The dataset, +benchmark, and code are available at https://github.com/fvasluianu97/IFBlend. + +
+
+
+
+
+ + ☆ Semi-Supervised Learning for Deep Causal Generative Models + + +
+ Developing models that can answer questions of the form "How would $x$ change +if $y$ had been $z$?" is fundamental for advancing medical image analysis. +Training causal generative models that address such counterfactual questions, +though, currently requires that all relevant variables have been observed and +that corresponding labels are available in training data. However, clinical +data may not have complete records for all patients and state of the art causal +generative models are unable to take full advantage of this. We thus develop, +for the first time, a semi-supervised deep causal generative model that +exploits the causal relationships between variables to maximise the use of all +available data. We explore this in the setting where each sample is either +fully labelled or fully unlabelled, as well as the more clinically realistic +case of having different labels missing for each sample. We leverage techniques +from causal inference to infer missing values and subsequently generate +realistic counterfactuals, even for samples with incomplete labels. + +
+
+
+
+
+ + ☆ Mitigating Hallucinations in Large Vision-Language Models with + Instruction Contrastive Decoding + + +
+ Large Vision-Language Models (LVLMs) are increasingly adept at generating +contextually detailed and coherent responses from visual inputs. However, their +application in multimodal decision-making and open-ended generation is hindered +by a notable rate of hallucinations, where generated text inaccurately +represents the visual contents. To address this issue, this paper introduces +the Instruction Contrastive Decoding (ICD) method, a novel approach designed to +reduce hallucinations during LVLM inference. Our method is inspired by our +observation that what we call disturbance instructions significantly exacerbate +hallucinations in multimodal fusion modules. ICD contrasts distributions from +standard and instruction disturbance, thereby increasing alignment uncertainty +and effectively subtracting hallucinated concepts from the original +distribution. Through comprehensive experiments on discriminative benchmarks +(POPE and MME) and a generative benchmark (LLaVa-Bench), we demonstrate that +ICD significantly mitigates both object-level and attribute-level +hallucinations. Moreover, our method not only addresses hallucinations but also +significantly enhances the general perception and recognition capabilities of +LVLMs. + +
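+ However the disturbance instructions are built, the contrast step reduces to
+reweighting next-token logits from the standard prompt against logits from the
+disturbed prompt; this toy torch sketch shows only that arithmetic, in a common
+contrastive-decoding form, with alpha as a free hyperparameter.
+
+    import torch
+
+    def instruction_contrastive_logits(logits_standard, logits_disturbed, alpha=1.0):
+        """Amplify the standard distribution and subtract what the disturbed
+        instruction makes more likely (a common contrastive-decoding form)."""
+        return (1 + alpha) * logits_standard - alpha * logits_disturbed
+
+    # Toy vocabulary of 5 tokens.
+    std = torch.tensor([2.0, 1.0, 0.5, 0.1, -1.0])
+    dis = torch.tensor([2.0, 2.5, 0.5, 0.1, -1.0])   # disturbance boosts a hallucinated token
+    probs = torch.softmax(instruction_contrastive_logits(std, dis, alpha=1.0), dim=-1)
+    print(probs)   # the token favored only under disturbance is suppressed
+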
+
+
+
+
+ + ☆ Bringing Textual Prompt to AI-Generated Image Quality Assessment ICME2024 + + +
+ AI-Generated Images (AGIs) have an inherently multimodal nature. Unlike
+traditional image quality assessment (IQA) of natural scenes, AGI quality
+assessment (AGIQA) takes the correspondence between an image and its textual
+prompt into consideration. This correspondence is entangled in the ground-truth
+score, which confuses unimodal IQA methods. To solve this problem, we introduce
+IP-IQA (AGI Quality Assessment via Image and Prompt), a multimodal framework
+for AGIQA that incorporates the image and its corresponding prompt.
+Specifically, we propose a novel incremental pretraining task named
+Image2Prompt for a better understanding of AGIs and their corresponding textual
+prompts. An effective and efficient image-prompt fusion module, along with a
+novel special [QA] token, is also applied. Both are plug-and-play and
+beneficial for the cooperation between an image and its corresponding prompt.
+Experiments demonstrate that our IP-IQA achieves state-of-the-art results on
+the AGIQA-1k and AGIQA-3k datasets. Code will be made available.
+
+
+ comment: 6 pages, 3 figures, accepted by ICME2024 +
+
+
+
+
+ + ☆ SAT-NGP : Unleashing Neural Graphics Primitives for Fast Relightable + Transient-Free 3D reconstruction from Satellite Imagery + + +
+ Current stereo-vision pipelines produce high-accuracy 3D reconstructions when
+using multiple pairs or triplets of satellite images. However, these pipelines
+are sensitive to the changes between images that can occur as a result of
+multi-date acquisitions. Such variations are mainly due to variable shadows,
+reflections and transient objects (cars, vegetation). To take such changes into
+account, Neural Radiance Fields (NeRF) have recently been applied to multi-date
+satellite imagery. However, neural methods are very compute-intensive, taking
+dozens of hours to train, compared with minutes for standard stereo-vision
+pipelines. Following the ideas of Instant Neural Graphics Primitives, we
+propose to use an efficient sampling strategy and multi-resolution hash
+encoding to accelerate the learning. Our model, Satellite Neural Graphics
+Primitives (SAT-NGP), decreases the training time to 15 minutes while
+maintaining the quality of the 3D reconstruction.
+
+
+ comment: 5 pages, 3 figures, 1 table; Accepted to International Geoscience and + Remote Sensing Symposium (IGARSS) 2024; Code available at + https://github.com/Ellimac0/SAT-NGP +
+
+
+
+
+ + ☆ Dense Vision Transformer Compression with Few Samples CVPR 2024 + + +
+ Few-shot model compression aims to compress a large model into a more compact +one with only a tiny training set (even without labels). Block-level pruning +has recently emerged as a leading technique in achieving high accuracy and low +latency in few-shot CNN compression. But, few-shot compression for Vision +Transformers (ViT) remains largely unexplored, which presents a new challenge. +In particular, the issue of sparse compression exists in traditional CNN +few-shot methods, which can only produce very few compressed models of +different model sizes. This paper proposes a novel framework for few-shot ViT +compression named DC-ViT. Instead of dropping the entire block, DC-ViT +selectively eliminates the attention module while retaining and reusing +portions of the MLP module. DC-ViT enables dense compression, which outputs +numerous compressed models that densely populate the range of model complexity. +DC-ViT outperforms state-of-the-art few-shot compression methods by a +significant margin of 10 percentage points, along with lower latency in the +compression of ViT and its variants. + +
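+ A toy torch block illustrating the core idea of eliminating a block's attention
+module while retaining the MLP path; the real DC-ViT selection and partial
+reuse strategy is finer-grained, so this is only a sketch of the principle.
+
+    import torch
+    import torch.nn as nn
+
+    class TinyViTBlock(nn.Module):
+        def __init__(self, dim=192, heads=3, mlp_ratio=4):
+            super().__init__()
+            self.norm1 = nn.LayerNorm(dim)
+            self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+            self.norm2 = nn.LayerNorm(dim)
+            self.mlp = nn.Sequential(nn.Linear(dim, dim * mlp_ratio), nn.GELU(),
+                                     nn.Linear(dim * mlp_ratio, dim))
+            self.drop_attention = False          # toggled by the compression procedure
+
+        def forward(self, x):
+            if not self.drop_attention:          # full block: attention + MLP
+                h = self.norm1(x)
+                a, _ = self.attn(h, h, h)
+                x = x + a
+            return x + self.mlp(self.norm2(x))   # the MLP path is always kept
+
+    block = TinyViTBlock()
+    tokens = torch.randn(2, 197, 192)
+    block.drop_attention = True                  # compress: skip attention, reuse the MLP
+    print(block(tokens).shape)                   # torch.Size([2, 197, 192])
+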
+
+ comment: Accepted to CVPR 2024. Note: Jianxin Wu is a contributing author for + the arXiv version of this paper but is not listed as an author in the CVPR + version due to his role as Program Chair +
+
+
+
+
+ + ☆ Annolid: Annotate, Segment, and Track Anything You Need + + +
+ Annolid is a deep learning-based software package designed for the +segmentation, labeling, and tracking of research targets within video files, +focusing primarily on animal behavior analysis. Based on state-of-the-art +instance segmentation methods, Annolid now harnesses the Cutie video object +segmentation model to achieve resilient, markerless tracking of multiple +animals from single annotated frames, even in environments in which they may be +partially or entirely concealed by environmental features or by one another. +Our integration of Segment Anything and Grounding-DINO strategies additionally +enables the automatic masking and segmentation of recognizable animals and +objects by text command, removing the need for manual annotation. Annolid's +comprehensive approach to object segmentation flexibly accommodates a broad +spectrum of behavior analysis applications, enabling the classification of +diverse behavioral states such as freezing, digging, pup huddling, and social +interactions in addition to the tracking of animals and their body parts. + +
+
+
+
+
+ + ☆ Deep Learning for Robust and Explainable Models in Computer Vision + + +
+ Recent breakthroughs in machine and deep learning (ML and DL) research have +provided excellent tools for leveraging enormous amounts of data and optimizing +huge models with millions of parameters to obtain accurate networks for image +processing. These developments open up tremendous opportunities for using +artificial intelligence (AI) in the automation and human assisted AI industry. +However, as more and more models are deployed and used in practice, many +challenges have emerged. This thesis presents various approaches that address +robustness and explainability challenges for using ML and DL in practice. + Robustness and reliability are the critical components of any model before +certification and deployment in practice. Deep convolutional neural networks +(CNNs) exhibit vulnerability to transformations of their inputs, such as +rotation and scaling, or intentional manipulations as described in the +adversarial attack literature. In addition, building trust in AI-based models +requires a better understanding of current models and developing methods that +are more explainable and interpretable a priori. + This thesis presents developments in computer vision models' robustness and +explainability. Furthermore, this thesis offers an example of using vision +models' feature response visualization (models' interpretations) to improve +robustness despite interpretability and robustness being seemingly unrelated in +the related research. Besides methodological developments for robust and +explainable vision models, a key message of this thesis is introducing model +interpretation techniques as a tool for understanding vision models and +improving their design and robustness. In addition to the theoretical +developments, this thesis demonstrates several applications of ML and DL in +different contexts, such as medical imaging and affective computing. + +
+
+ comment: 150 pages, 37 figures, 12 tables +
+
+
+
+
+ + ☆ InstructBrush: Learning Attention-based Instruction Optimization for + Image Editing + + +
+ In recent years, instruction-based image editing methods have garnered
+significant attention in image editing. However, despite encompassing a wide
+range of editing priors, these methods struggle with editing tasks that are
+difficult to describe accurately through language. To bridge this gap, we
+propose InstructBrush, an inversion method for instruction-based image editing.
+It extracts editing effects from exemplar image pairs as editing instructions,
+which are further applied for image editing. Two key techniques are introduced
+into InstructBrush, Attention-based Instruction Optimization and
+Transformation-oriented Instruction Initialization, to address the limitations
+of previous methods in terms of inversion effects and instruction
+generalization. To explore the ability of instruction inversion methods to
+guide image editing in open scenarios, we establish a Transformation-Oriented
+Paired Benchmark (TOP-Bench), which contains a rich set of scenes and editing
+types. The creation of this benchmark paves the way for further exploration of
+instruction inversion. Quantitatively and qualitatively, our approach achieves
+superior performance in editing and is more semantically consistent with the
+target editing effects.
+
+
+ comment: Project Page: https://royzhao926.github.io/InstructBrush/ +
+
+
+
+
+ + ☆ Addressing Data Annotation Challenges in Multiple Sensors: A Solution + for Scania Collected Datasets + + +
+ Data annotation in autonomous vehicles is a critical step in the development +of Deep Neural Network (DNN) based models or the performance evaluation of the +perception system. This often takes the form of adding 3D bounding boxes on +time-sequential and registered series of point-sets captured from active +sensors like Light Detection and Ranging (LiDAR) and Radio Detection and +Ranging (RADAR). When annotating multiple active sensors, there is a need to +motion compensate and translate the points to a consistent coordinate frame and +timestamp respectively. However, highly dynamic objects pose a unique +challenge, as they can appear at different timestamps in each sensor's data. +Without knowing the speed of the objects, their position appears to be +different in different sensor outputs. Thus, even after motion compensation, +highly dynamic objects are not matched from multiple sensors in the same frame, +and human annotators struggle to add unique bounding boxes that capture all +objects. This article focuses on addressing this challenge, primarily within +the context of Scania collected datasets. The proposed solution takes a track +of an annotated object as input and uses the Moving Horizon Estimation (MHE) to +robustly estimate its speed. The estimated speed profile is utilized to correct +the position of the annotated box and add boxes to object clusters missed by +the original annotation. + +
+
+ comment: Accepted to European Control Conference 2024 +
+
+
+
+
+ + ☆ Transformers-based architectures for stroke segmentation: A review + + +
+ Stroke remains a significant global health concern, necessitating precise and +efficient diagnostic tools for timely intervention and improved patient +outcomes. The emergence of deep learning methodologies has transformed the +landscape of medical image analysis. Recently, Transformers, initially designed +for natural language processing, have exhibited remarkable capabilities in +various computer vision applications, including medical image analysis. This +comprehensive review aims to provide an in-depth exploration of the +cutting-edge Transformer-based architectures applied in the context of stroke +segmentation. It commences with an exploration of stroke pathology, imaging +modalities, and the challenges associated with accurate diagnosis and +segmentation. Subsequently, the review delves into the fundamental ideas of +Transformers, offering detailed insights into their architectural intricacies +and the underlying mechanisms that empower them to effectively capture complex +spatial information within medical images. The existing literature is +systematically categorized and analyzed, discussing various approaches that +leverage Transformers for stroke segmentation. A critical assessment is +provided, highlighting the strengths and limitations of these methods, +including considerations of performance and computational efficiency. +Additionally, this review explores potential avenues for future research and +development + +
+
+
+
+
+ + ☆ FlexEdit: Flexible and Controllable Diffusion-based Object-centric Image + Editing + + +
+ Our work addresses limitations seen in previous approaches for object-centric +editing problems, such as unrealistic results due to shape discrepancies and +limited control in object replacement or insertion. To this end, we introduce +FlexEdit, a flexible and controllable editing framework for objects where we +iteratively adjust latents at each denoising step using our FlexEdit block. +Initially, we optimize latents at test time to align with specified object +constraints. Then, our framework employs an adaptive mask, automatically +extracted during denoising, to protect the background while seamlessly blending +new content into the target image. We demonstrate the versatility of FlexEdit +in various object editing tasks and curate an evaluation test suite with +samples from both real and synthetic images, along with novel evaluation +metrics designed for object-centric editing. We conduct extensive experiments +on different editing scenarios, demonstrating the superiority of our editing +framework over recent advanced text-guided image editing methods. Our project +page is published at https://flex-edit.github.io/. + +
+
+ comment: Our project page: https://flex-edit.github.io/ +
+
+
+
+
+ + ☆ RAP: Retrieval-Augmented Planner for Adaptive Procedure Planning in + Instructional Videos + + +
+ Procedure Planning in instructional videos entails generating a sequence of
+action steps based on visual observations of the initial and target states.
+Despite the rapid progress in this task, there remain several critical
+challenges to be solved: (1) Adaptive procedures: Prior works hold an
+unrealistic assumption that the number of action steps is known and fixed,
+leading to non-generalizable models in real-world scenarios where the sequence
+length varies. (2) Temporal relation: Understanding the temporal relations
+between steps is essential for producing reasonable and executable plans. (3)
+Annotation cost: Annotating instructional videos with step-level labels (i.e.,
+timestamps) or sequence-level labels (i.e., action categories) is demanding and
+labor-intensive, limiting generalizability to large-scale datasets. In this
+work, we propose a new and practical setting, called adaptive procedure
+planning in instructional videos, where the procedure length is not fixed or
+pre-determined. To address these challenges, we introduce the
+Retrieval-Augmented Planner (RAP) model. Specifically, for adaptive procedures,
+RAP adaptively determines the end of the action sequence using an
+auto-regressive model architecture. For temporal relations, RAP establishes an
+external memory module to explicitly retrieve the most relevant state-action
+pairs from the training videos and revises the generated procedures. To tackle
+the high annotation cost, RAP adopts a weakly-supervised learning scheme to
+expand the training dataset to other task-relevant, unannotated videos by
+generating pseudo labels for action steps. Experiments on the CrossTask and
+COIN benchmarks show the superiority of RAP over traditional fixed-length
+models, establishing it as a strong baseline solution for adaptive procedure
+planning.
+
+
+ comment: 23 pages, 6 figures, 12 tables +
+
+
+
+
+ + ☆ Homogeneous Tokenizer Matters: Homogeneous Visual Tokenizer for Remote + Sensing Image Understanding + + +
+ The tokenizer, as one of the fundamental components of large models, has long +been overlooked or even misunderstood in visual tasks. One key factor of the +great comprehension power of the large language model is that natural language +tokenizers utilize meaningful words or subwords as the basic elements of +language. In contrast, mainstream visual tokenizers, represented by patch-based +methods such as Patch Embed, rely on meaningless rectangular patches as basic +elements of vision, which cannot serve as effectively as words or subwords in +language. Starting from the essence of the tokenizer, we defined semantically +independent regions (SIRs) for vision. We designed a simple HOmogeneous visual +tOKenizer: HOOK. HOOK mainly consists of two modules: the Object Perception +Module (OPM) and the Object Vectorization Module (OVM). To achieve homogeneity, +the OPM splits the image into 4*4 pixel seeds and then utilizes the attention +mechanism to perceive SIRs. The OVM employs cross-attention to merge seeds +within the same SIR. To achieve adaptability, the OVM defines a variable number +of learnable vectors as cross-attention queries, allowing for the adjustment of +token quantity. We conducted experiments on the NWPU-RESISC45, WHU-RS19 +classification dataset, and GID5 segmentation dataset for sparse and dense +tasks. The results demonstrate that the visual tokens obtained by HOOK +correspond to individual objects, which demonstrates homogeneity. HOOK +outperformed Patch Embed by 6\% and 10\% in the two tasks and achieved +state-of-the-art performance compared to the baselines used for comparison. +Compared to Patch Embed, which requires more than one hundred tokens for one +image, HOOK requires only 6 and 8 tokens for sparse and dense tasks, +respectively, resulting in efficiency improvements of 1.5 to 2.8 times. The +code is available at https://github.com/GeoX-Lab/Hook. + +
+
+ comment: 20 pages, 8 figures, 6 tables +
+
+
+
+
+ + ☆ Users prefer Jpegli over same-sized libjpeg-turbo or MozJPEG + + +
+ We performed pairwise comparisons by human raters of JPEG images from +MozJPEG, libjpeg-turbo and our new Jpegli encoder. When compressing images at a +quality similar to libjpeg-turbo quality 95, the Jpegli images were 54% likely +to be preferred over both libjpeg-turbo and MozJPEG images, but used only 2.8 +bits per pixel compared to libjpeg-turbo and MozJPEG that used 3.8 and 3.5 bits +per pixel respectively. The raw ratings and source images are publicly +available for further analysis and study. + +
+
+
+
+
+ + ☆ The Impact of Uniform Inputs on Activation Sparsity and Energy-Latency + Attacks in Computer Vision SP 2024 + + +
+ Resource efficiency plays an important role in machine learning nowadays.
+Energy consumption and decision latency are two critical aspects of ensuring a
+sustainable and practical application. Unfortunately, energy consumption and
+decision latency are not robust against adversaries. Researchers have recently
+demonstrated that attackers can compute and submit so-called sponge examples at
+inference time to increase the energy consumption and decision latency of
+neural networks. In computer vision, the proposed strategy crafts inputs with
+lower activation sparsity, which could otherwise be used to accelerate the
+computation. In this paper, we analyze the mechanism by which these
+energy-latency attacks reduce activation sparsity. In particular, we find that
+input uniformity is a key enabler. A uniform image, that is, an image with
+mostly flat, uniformly colored surfaces, triggers more activations due to a
+specific interplay of convolution, batch normalization, and ReLU activation.
+Based on these insights, we propose two new, simple yet effective strategies
+for crafting sponge examples: sampling images from a probability distribution
+and identifying dense, yet inconspicuous inputs in natural datasets. We
+empirically examine our findings in a comprehensive evaluation with multiple
+image classification models and show that our attack achieves the same sparsity
+effect as prior sponge-example methods, but at a fraction of the computational
+effort. We also show that our sponge examples transfer between different neural
+networks. Finally, we discuss how our findings can be applied for good,
+improving efficiency by increasing sparsity.
+
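+ A small hook-based sketch that measures post-ReLU activation sparsity and
+compares a high-variance random image against a flat, uniformly colored one;
+the model and inputs are placeholders (pretrained weights would be needed for a
+meaningful comparison).
+
+    import torch
+    from torchvision import models
+
+    def relu_sparsity(model, x):
+        """Fraction of zero activations across all ReLU outputs for input batch x."""
+        zeros, total, handles = [0], [0], []
+
+        def hook(_m, _inp, out):
+            zeros[0] += (out == 0).sum().item()
+            total[0] += out.numel()
+
+        for m in model.modules():
+            if isinstance(m, torch.nn.ReLU):
+                handles.append(m.register_forward_hook(hook))
+        with torch.no_grad():
+            model(x)
+        for h in handles:
+            h.remove()
+        return zeros[0] / max(total[0], 1)
+
+    model = models.resnet18(weights=None).eval()         # random weights; use pretrained in practice
+    noise_img = torch.rand(1, 3, 224, 224)               # high-variance input
+    uniform_img = torch.full((1, 3, 224, 224), 0.5)      # flat, uniformly colored input
+    print("noise sparsity:  ", relu_sparsity(model, noise_img))
+    print("uniform sparsity:", relu_sparsity(model, uniform_img))
+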
+
+ comment: Accepted at the DLSP 2024 +
+
+
+
+
+ + ☆ HandBooster: Boosting 3D Hand-Mesh Reconstruction by Conditional + Synthesis and Sampling of Hand-Object Interactions + + +
+ Reconstructing 3D hand mesh robustly from a single image is very challenging, +due to the lack of diversity in existing real-world datasets. While data +synthesis helps relieve the issue, the syn-to-real gap still hinders its usage. +In this work, we present HandBooster, a new approach to uplift the data +diversity and boost the 3D hand-mesh reconstruction performance by training a +conditional generative space on hand-object interactions and purposely sampling +the space to synthesize effective data samples. First, we construct versatile +content-aware conditions to guide a diffusion model to produce realistic images +with diverse hand appearances, poses, views, and backgrounds; favorably, +accurate 3D annotations are obtained for free. Then, we design a novel +condition creator based on our similarity-aware distribution sampling +strategies to deliberately find novel and realistic interaction poses that are +distinctive from the training set. Equipped with our method, several baselines +can be significantly improved beyond the SOTA on the HO3D and DexYCB +benchmarks. Our code will be released on +https://github.com/hxwork/HandBooster_Pytorch. + +
+
+
+
+
+ + ☆ Artifact Reduction in 3D and 4D Cone-beam Computed Tomography Images + with Deep Learning -- A Review + + +
+ Deep learning based approaches have been used to improve image quality in +cone-beam computed tomography (CBCT), a medical imaging technique often used in +applications such as image-guided radiation therapy, implant dentistry or +orthopaedics. In particular, while deep learning methods have been applied to +reduce various types of CBCT image artifacts arising from motion, metal +objects, or low-dose acquisition, a comprehensive review summarizing the +successes and shortcomings of these approaches, with a primary focus on the +type of artifacts rather than the architecture of neural networks, is lacking +in the literature. In this review, the data generation and simulation +pipelines, and artifact reduction techniques are specifically investigated for +each type of artifact. We provide an overview of deep learning techniques that +have successfully been shown to reduce artifacts in 3D, as well as in +time-resolved (4D) CBCT through the use of projection- and/or volume-domain +optimizations, or by introducing neural networks directly within the CBCT +reconstruction algorithms. Research gaps are identified to suggest avenues for +future exploration. One of the key findings of this work is an observed trend +towards the use of generative models including GANs and score-based or +diffusion models, accompanied with the need for more diverse and open training +datasets and simulations. + +
+
+ comment: 16 pages, 4 figures, 1 Table, published in IEEE Access Journal +
+
+
+
+
+ + ☆ CosalPure: Learning Concept from Group Images for Robust Co-Saliency + Detection + + +
+ Co-salient object detection (CoSOD) aims to identify the common and salient +(usually in the foreground) regions across a given group of images. Although +achieving significant progress, state-of-the-art CoSODs could be easily +affected by some adversarial perturbations, leading to substantial accuracy +reduction. The adversarial perturbations can mislead CoSODs but do not change +the high-level semantic information (e.g., concept) of the co-salient objects. +In this paper, we propose a novel robustness enhancement framework by first +learning the concept of the co-salient objects based on the input group images +and then leveraging this concept to purify adversarial perturbations, which are +subsequently fed to CoSODs for robustness enhancement. Specifically, we propose +CosalPure containing two modules, i.e., group-image concept learning and +concept-guided diffusion purification. For the first module, we adopt a +pre-trained text-to-image diffusion model to learn the concept of co-salient +objects within group images where the learned concept is robust to adversarial +examples. For the second module, we map the adversarial image to the latent +space and then perform diffusion generation by embedding the learned concept +into the noise prediction function as an extra condition. Our method can +effectively alleviate the influence of the SOTA adversarial attack containing +different adversarial patterns, including exposure and noise. The extensive +results demonstrate that our method could enhance the robustness of CoSODs +significantly. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Attention Calibration for Disentangled Text-to-Image Personalization CVPR 2024 + + +
+ Recent thrilling progress in large-scale text-to-image (T2I) models has +unlocked unprecedented synthesis quality of AI-generated content (AIGC) +including image generation, 3D and video composition. Further, personalized +techniques enable appealing customized production of a novel concept given only +several images as reference. However, an intriguing problem persists: Is it +possible to capture multiple, novel concepts from one single reference image? +In this paper, we identify that existing approaches fail to preserve visual +consistency with the reference image and eliminate cross-influence from +concepts. To alleviate this, we propose an attention calibration mechanism to +improve the concept-level understanding of the T2I model. Specifically, we +first introduce new learnable modifiers bound with classes to capture +attributes of multiple concepts. Then, the classes are separated and +strengthened following the activation of the cross-attention operation, +ensuring comprehensive and self-contained concepts. Additionally, we suppress +the attention activation of different classes to mitigate mutual influence +among concepts. Together, our proposed method, dubbed DisenDiff, can learn +disentangled multiple concepts from one single image and produce novel +customized images with learned concepts. We demonstrate that our method +outperforms the current state of the art in both qualitative and quantitative +evaluations. More importantly, our proposed techniques are compatible with LoRA +and inpainting pipelines, enabling more interactive experiences. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ OrCo: Towards Better Generalization via Orthogonality and Contrast for + Few-Shot Class-Incremental Learning + + +
+ Few-Shot Class-Incremental Learning (FSCIL) introduces a paradigm in which +the problem space expands with limited data. FSCIL methods inherently face the +challenge of catastrophic forgetting as data arrives incrementally, making +models susceptible to overwriting previously acquired knowledge. Moreover, +given the scarcity of labeled samples available at any given time, models may +be prone to overfitting and find it challenging to strike a balance between +extensive pretraining and the limited incremental data. To address these +challenges, we propose the OrCo framework built on two core principles: +features' orthogonality in the representation space, and contrastive learning. +In particular, we improve the generalization of the embedding space by +employing a combination of supervised and self-supervised contrastive losses +during the pretraining phase. Additionally, we introduce OrCo loss to address +challenges arising from data limitations during incremental sessions. Through +feature space perturbations and orthogonality between classes, the OrCo loss +maximizes margins and reserves space for the following incremental data. This, +in turn, ensures the accommodation of incoming classes in the feature space +without compromising previously acquired knowledge. Our experimental results +showcase state-of-the-art performance across three benchmark datasets, +including mini-ImageNet, CIFAR100, and CUB datasets. Code is available at +https://github.com/noorahmedds/OrCo + +
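+ A loose illustration of the two principles named above (orthogonality between
+class representations plus a contrastive objective), not the OrCo loss itself;
+all shapes, temperatures and weights below are arbitrary.
+
+    import torch
+    import torch.nn.functional as F
+
+    def orthogonality_penalty(feats, labels, num_classes):
+        """Push normalized class-mean embeddings toward mutual orthogonality."""
+        means = torch.stack([feats[labels == c].mean(0) for c in range(num_classes)])
+        m = F.normalize(means, dim=-1)
+        gram = m @ m.T
+        off_diag = gram - torch.diag(torch.diag(gram))
+        return (off_diag ** 2).mean()
+
+    def supervised_contrastive(feats, labels, temperature=0.1):
+        """Simplified SupCon-style loss: same-class pairs attract, others repel."""
+        n = len(labels)
+        z = F.normalize(feats, dim=-1)
+        sim = z @ z.T / temperature - 1e9 * torch.eye(n)     # mask out self-similarity
+        mask = (labels[:, None] == labels[None, :]).float() - torch.eye(n)
+        log_prob = sim - torch.logsumexp(sim, dim=1, keepdim=True)
+        return -(mask * log_prob).sum() / mask.sum().clamp(min=1)
+
+    feats = torch.randn(32, 64, requires_grad=True)
+    labels = torch.arange(32) % 4                            # four classes, all present
+    loss = orthogonality_penalty(feats, labels, 4) + supervised_contrastive(feats, labels)
+    loss.backward()
+    print(float(loss))
+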
+
+
+
+
+ + ☆ A Semi-supervised Nighttime Dehazing Baseline with Spatial-Frequency + Aware and Realistic Brightness Constraint CVPR2024 + + +
+ Existing research based on deep learning has extensively explored the problem
+of daytime image dehazing. However, few studies have considered the
+characteristics of nighttime hazy scenes. There are two distinctions between
+nighttime and daytime haze. First, there may be multiple active colored light
+sources with lower illumination intensity in nighttime scenes, which may cause
+haze, glow and noise with localized, coupled and frequency-inconsistent
+characteristics. Second, due to the domain discrepancy between simulated and
+real-world data, unrealistic brightness may occur when applying a dehazing
+model trained on simulated data to real-world data. To address the above two
+issues, we propose a semi-supervised model for real-world nighttime dehazing.
+First, spatial attention and frequency spectrum filtering are implemented as a
+spatial-frequency domain information interaction module to handle the first
+issue. Second, a pseudo-label-based retraining strategy and a local
+window-based brightness loss are designed for the semi-supervised training
+process to suppress haze and glow while achieving realistic brightness.
+Experiments on public benchmarks validate the effectiveness of the proposed
+method and its superiority over state-of-the-art methods. The source code and
+supplementary materials are available at
+https://github.com/Xiaofeng-life/SFSNiD.
+
+
+ comment: This paper is accepted by CVPR2024 +
+
+
+
+
+ + ☆ Efficient Heatmap-Guided 6-Dof Grasp Detection in Cluttered Scenes + + +
+ Fast and robust object grasping in clutter is a crucial component of +robotics. Most current works resort to the whole observed point cloud for 6-Dof +grasp generation, ignoring the guidance information excavated from global +semantics, thus limiting high-quality grasp generation and real-time +performance. In this work, we show that the widely used heatmaps are +underestimated in the efficiency of 6-Dof grasp generation. Therefore, we +propose an effective local grasp generator combined with grasp heatmaps as +guidance, which infers in a global-to-local semantic-to-point way. +Specifically, Gaussian encoding and the grid-based strategy are applied to +predict grasp heatmaps as guidance to aggregate local points into graspable +regions and provide global semantic information. Further, a novel non-uniform +anchor sampling mechanism is designed to improve grasp accuracy and diversity. +Benefiting from the high-efficiency encoding in the image space and focusing on +points in local graspable regions, our framework can perform high-quality grasp +detection in real-time and achieve state-of-the-art results. In addition, real +robot experiments demonstrate the effectiveness of our method with a success +rate of 94% and a clutter completion rate of 100%. Our code is available at +https://github.com/THU-VCLab/HGGD. + +
+
+ comment: Extensive results on GraspNet-1B dataset +
+
+
+
+
+ + ☆ Language Plays a Pivotal Role in the Object-Attribute Compositional + Generalization of CLIP + + +
+ Vision-language models, such as CLIP, have shown promising +Out-of-Distribution (OoD) generalization under various types of distribution +shifts. Recent studies attempted to investigate the leading cause of this +capability. In this work, we follow the same path, but focus on a specific type +of OoD data - images with novel compositions of attribute-object pairs - and +study whether such models can successfully classify those images into +composition classes. We carefully designed an authentic image test dataset +called ImageNet-AO, consisting of attributes for objects that are unlikely +encountered in the CLIP training sets. We found that CLIPs trained with large +datasets such as OpenAI CLIP, LAION-400M, and LAION-2B show orders-of-magnitude +improvement in effective compositional OoD generalization compared to both +supervised models and CLIPs trained with smaller datasets, such as CC-12M and +YFCC-15M. Our results provide evidence that the scale and diversity of training +data and language supervision play a key role in unlocking the compositional +generalization abilities of vision-language models. + +
+
+ comment: Oral accepted at OODCV 2023(http://www.ood-cv.org) +
+
+
+
+
+ + ☆ CT-3DFlow : Leveraging 3D Normalizing Flows for Unsupervised Detection + of Pathological Pulmonary CT scans + + +
+ Unsupervised pathology detection can be implemented by training a model on
+healthy data only and measuring the deviation from the training set upon
+inference, for example with CNN-based feature extraction and one-class
+classifiers, or reconstruction-score-based methods such as AEs, GANs and
+diffusion models. Normalizing Flows (NF) have the ability to directly learn the
+probability distribution of training examples through an invertible
+architecture. We leverage this property in a novel 3D NF-based model named
+CT-3DFlow, specifically tailored for patient-level pulmonary pathology
+detection in chest CT data. Our model is trained without supervision on healthy
+3D pulmonary CT patches, and detects deviations from its log-likelihood
+distribution as anomalies. We aggregate patch-level likelihood values from a
+patient's CT scan to provide a patient-level 'normal'/'abnormal' prediction.
+Out-of-distribution detection performance is evaluated using expert annotations
+on a separate chest CT test dataset, outperforming other state-of-the-art
+methods.
+
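+ However the flow itself is implemented, the patient-level decision reduces to
+aggregating patch log-likelihoods and thresholding; the numpy sketch below uses
+synthetic values, and the aggregation rule and threshold are assumptions, not
+the paper's.
+
+    import numpy as np
+
+    def patient_level_prediction(patch_log_likelihoods, threshold):
+        """Flag a scan as 'abnormal' when its aggregated patch log-likelihood is low.
+        Aggregation choice (mean of the lowest 10% of patches) is an assumption."""
+        k = max(1, len(patch_log_likelihoods) // 10)
+        score = np.sort(patch_log_likelihoods)[:k].mean()   # focus on the least likely patches
+        return ("abnormal" if score < threshold else "normal"), score
+
+    rng = np.random.default_rng(0)
+    healthy_scan = rng.normal(loc=-1200, scale=30, size=400)   # synthetic patch log-likelihoods
+    diseased_scan = healthy_scan.copy()
+    diseased_scan[:12] -= 800                                  # a few very unlikely patches
+    thr = -1400.0                                              # set on a validation set in practice
+    print(patient_level_prediction(healthy_scan, thr))
+    print(patient_level_prediction(diseased_scan, thr))
+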
+
+
+
+
+ + ☆ ParCo: Part-Coordinating Text-to-Motion Synthesis + + +
+ We study a challenging task: text-to-motion synthesis, aiming to generate
+motions that align with textual descriptions and exhibit coordinated movements.
+Currently, part-based methods introduce part partitioning into the motion
+synthesis process to achieve finer-grained generation. However, these methods
+encounter challenges such as the lack of coordination between different part
+motions and difficulties for networks in understanding part concepts. Moreover,
+introducing finer-grained part concepts poses computational complexity
+challenges. In this paper, we propose Part-Coordinating Text-to-Motion
+Synthesis (ParCo), endowed with enhanced capabilities for understanding part
+motions and communication among different part motion generators, ensuring
+coordinated and fine-grained motion synthesis. Specifically, we discretize
+whole-body motion into multiple part motions to establish the prior concept of
+different parts. Afterward, we employ multiple lightweight generators designed
+to synthesize different part motions and coordinate them through our part
+coordination module. Our approach demonstrates superior performance on common
+benchmarks, including HumanML3D and KIT-ML, with economical computation,
+providing substantial evidence of its effectiveness. Code is available at
+https://github.com/qrzou/ParCo .
+
+
+
+
+
+ + ☆ HEMIT: H&E to Multiplex-immunohistochemistry Image Translation with + Dual-Branch Pix2pix Generator + + +
+ Computational analysis of multiplexed immunofluorescence histology data is +emerging as an important method for understanding the tumour micro-environment +in cancer. This work presents HEMIT, a dataset designed for translating +Hematoxylin and Eosin (H&E) sections to multiplex-immunohistochemistry (mIHC) +images, featuring DAPI, CD3, and panCK markers. Distinctively, HEMIT's mIHC +images are multi-component and cellular-level aligned with H&E, enriching +supervised stain translation tasks. To our knowledge, HEMIT is the first +publicly available cellular-level aligned dataset that enables H&E to +multi-target mIHC image translation. This dataset provides the computer vision +community with a valuable resource to develop novel computational methods which +have the potential to gain new insights from H&E slide archives. + We also propose a new dual-branch generator architecture, using residual +Convolutional Neural Networks (CNNs) and Swin Transformers which achieves +better translation outcomes than other popular algorithms. When evaluated on +HEMIT, it outperforms pix2pixHD, pix2pix, U-Net, and ResNet, achieving the +highest overall score on key metrics including the Structural Similarity Index +Measure (SSIM), Pearson correlation score (R), and Peak signal-to-noise Ratio +(PSNR). Additionally, downstream analysis has been used to further validate the +quality of the generated mIHC images. These results set a new benchmark in the +field of stain translation tasks. + +
+
+
+
+
+ + ☆ Direct mineral content prediction from drill core images via transfer + learning + + +
+ Deep subsurface exploration is important for mining, oil and gas industries, +as well as in the assessment of geological units for the disposal of chemical +or nuclear waste, or the viability of geothermal energy systems. Typically, +detailed examinations of subsurface formations or units are performed on +cuttings or core materials extracted during drilling campaigns, as well as on +geophysical borehole data, which provide detailed information about the +petrophysical properties of the rocks. Depending on the volume of rock samples +and the analytical program, the laboratory analysis and diagnostics can be very +time-consuming. This study investigates the potential of utilizing machine +learning, specifically convolutional neural networks (CNN), to assess the +lithology and mineral content solely from analysis of drill core images, aiming +to support and expedite the subsurface geological exploration. The paper +outlines a comprehensive methodology, encompassing data preprocessing, machine +learning methods, and transfer learning techniques. The outcome reveals a +remarkable 96.7% accuracy in the classification of drill core segments into +distinct formation classes. Furthermore, a CNN model was trained for the +evaluation of mineral content using a learning data set from multidimensional +log analysis data (silicate, total clay, carbonate). When benchmarked against +laboratory XRD measurements on samples from the cores, both the advanced +multidimensional log analysis model and the neural network approach developed +here provide equally good performance. This work demonstrates that deep +learning and particularly transfer learning can support extracting +petrophysical properties, including mineral content and formation +classification, from drill core images, thus offering a road map for enhancing +model performance and data set quality in image-based analysis of drill cores. + +
+
+
+
+
+ + ☆ VersaT2I: Improving Text-to-Image Models with Versatile Reward + + +
+ Recent text-to-image (T2I) models have benefited from large-scale and
+high-quality data, demonstrating impressive performance. However, these T2I
+models still struggle to produce images that are aesthetically pleasing,
+geometrically accurate, faithful to text, and of good low-level quality. We
+present VersaT2I, a versatile training framework that can boost the performance
+of any T2I model with multiple rewards. We decompose image quality into several
+aspects, such as aesthetics, text-image alignment, geometry, and low-level
+quality. Then, for each quality aspect, we select images generated by the model
+that rank highly on that aspect as the training set and finetune the T2I model
+using Low-Rank Adaptation (LoRA). Furthermore, we introduce a gating function
+to combine multiple quality aspects, which can avoid conflicts between
+different quality aspects. Our method is easy to extend and does not require
+any manual annotation, reinforcement learning, or model architecture changes.
+Extensive experiments demonstrate that VersaT2I outperforms the baseline
+methods across various quality criteria.
+
+
+
+
+
+ + ☆ I2CKD : Intra- and Inter-Class Knowledge Distillation for Semantic + Segmentation + + +
+ This paper proposes a new knowledge distillation method tailored for image +semantic segmentation, termed Intra- and Inter-Class Knowledge Distillation +(I2CKD). The focus of this method is on capturing and transferring knowledge +between the intermediate layers of teacher (cumbersome model) and student +(compact model). For knowledge extraction, we exploit class prototypes derived +from feature maps. To facilitate knowledge transfer, we employ a triplet loss +in order to minimize intra-class variances and maximize inter-class variances +between teacher and student prototypes. Consequently, I2CKD enables the student +to better mimic the feature representation of the teacher for each class, +thereby enhancing the segmentation performance of the compact network. +Extensive experiments on three segmentation datasets, i.e., Cityscapes, Pascal +VOC and CamVid, using various teacher-student network pairs demonstrate the +effectiveness of the proposed method. + +
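+ One plausible realization of the idea above: class prototypes computed by
+mask-weighted average pooling of feature maps, with a triplet loss pulling each
+student prototype toward the matching teacher prototype and away from the
+hardest other class. This torch sketch is an illustration under those
+assumptions, not the paper's exact formulation.
+
+    import torch
+    import torch.nn.functional as F
+
+    def class_prototypes(feats, labels, num_classes):
+        """Masked average pooling: one prototype per class from a (B, C, H, W) feature map."""
+        B, C, H, W = feats.shape
+        labels = F.interpolate(labels[:, None].float(), size=(H, W), mode="nearest").long()
+        protos = []
+        for c in range(num_classes):
+            mask = (labels == c).float()                          # (B, 1, H, W)
+            protos.append((feats * mask).sum((0, 2, 3)) / mask.sum().clamp(min=1))
+        return torch.stack(protos)                                # (num_classes, C)
+
+    def prototype_triplet_loss(student, teacher, labels, num_classes, margin=0.5):
+        ps = F.normalize(class_prototypes(student, labels, num_classes), dim=-1)
+        pt = F.normalize(class_prototypes(teacher, labels, num_classes), dim=-1)
+        d = torch.cdist(ps, pt)                                   # pairwise prototype distances
+        pos = torch.diag(d)                                       # same-class teacher prototype
+        neg = (d + 1e9 * torch.eye(num_classes)).min(dim=1).values  # hardest other-class prototype
+        return F.relu(pos - neg + margin).mean()
+
+    student_feat = torch.randn(2, 64, 32, 32, requires_grad=True)
+    teacher_feat = torch.randn(2, 64, 32, 32)
+    labels = torch.randint(0, 5, (2, 128, 128))                   # ground-truth segmentation masks
+    loss = prototype_triplet_loss(student_feat, teacher_feat, labels, num_classes=5)
+    loss.backward()
+    print(float(loss))
+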
+
+
+
+
+ + ☆ Modeling uncertainty for Gaussian Splatting + + +
+ We present Stochastic Gaussian Splatting (SGS): the first framework for
+uncertainty estimation using Gaussian Splatting (GS). GS recently advanced the
+novel-view synthesis field by achieving impressive reconstruction quality at a
+fraction of the computational cost of Neural Radiance Fields (NeRF). However,
+contrary to the latter, it still lacks the ability to provide information about
+the confidence associated with its outputs. To address this limitation, in this
+paper, we introduce a Variational Inference-based approach that seamlessly
+integrates uncertainty prediction into the common rendering pipeline of GS.
+Additionally, we introduce the Area Under Sparsification Error (AUSE) as a new
+term in the loss function, enabling optimization of uncertainty estimation
+alongside image reconstruction. Experimental results on the LLFF dataset
+demonstrate that our method outperforms existing approaches in terms of both
+image rendering quality and uncertainty estimation accuracy. Overall, our
+framework equips practitioners with valuable insights into the reliability of
+synthesized views, facilitating safer decision-making in real-world
+applications.
+
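+ For reference, the sparsification-error idea behind AUSE compares the error
+remaining after removing pixels ranked by predicted uncertainty against the
+error remaining when ranking by the true error (oracle); a small numpy sketch
+of that metric on synthetic per-pixel errors, with all constants arbitrary.
+
+    import numpy as np
+
+    def sparsification_curve(errors, ranking, fractions):
+        """Mean remaining error after removing the top-`f` fraction ranked by `ranking`."""
+        order = np.argsort(-ranking)                  # most uncertain (or most wrong) first
+        sorted_err = errors[order]
+        n = len(errors)
+        return np.array([sorted_err[int(f * n):].mean() for f in fractions])
+
+    def ause(errors, uncertainties, steps=20):
+        fractions = np.linspace(0.0, 0.95, steps)
+        curve_pred = sparsification_curve(errors, uncertainties, fractions)
+        curve_oracle = sparsification_curve(errors, errors, fractions)
+        return np.trapz(curve_pred - curve_oracle, fractions)     # 0 = perfectly ranked uncertainty
+
+    rng = np.random.default_rng(0)
+    err = rng.exponential(scale=1.0, size=10_000)                 # synthetic per-pixel errors
+    good_unc = err + rng.normal(scale=0.1, size=err.shape)        # well-correlated uncertainty
+    bad_unc = rng.random(err.shape)                               # uninformative uncertainty
+    print("AUSE (good):", round(ause(err, good_unc), 4))
+    print("AUSE (bad): ", round(ause(err, bad_unc), 4))
+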
+
+
+
+
+ + ☆ DiffusionFace: Towards a Comprehensive Dataset for Diffusion-Based Face + Forgery Analysis + + +
+ The rapid progress in deep learning has given rise to hyper-realistic facial +forgery methods, leading to concerns related to misinformation and security +risks. Existing face forgery datasets have limitations in generating +high-quality facial images and addressing the challenges posed by evolving +generative techniques. To combat this, we present DiffusionFace, the first +diffusion-based face forgery dataset, covering various forgery categories, +including unconditional and Text Guide facial image generation, Img2Img, +Inpaint, and Diffusion-based facial exchange algorithms. Our DiffusionFace +dataset stands out with its extensive collection of 11 diffusion models and the +high-quality of the generated images, providing essential metadata and a +real-world internet-sourced forgery facial image dataset for evaluation. +Additionally, we provide an in-depth analysis of the data and introduce +practical evaluation protocols to rigorously assess discriminative models' +effectiveness in detecting counterfeit facial images, aiming to enhance +security in facial image authentication processes. The dataset is available for +download at \url{https://github.com/Rapisurazurite/DiffFace}. + +
+
+
+
+
+ + ☆ Density-guided Translator Boosts Synthetic-to-Real Unsupervised Domain + Adaptive Segmentation of 3D Point Clouds CVPR2024 + + +
+ 3D synthetic-to-real unsupervised domain adaptive segmentation is crucial to +annotating new domains. Self-training is a competitive approach for this task, +but its performance is limited by different sensor sampling patterns (i.e., +variations in point density) and incomplete training strategies. In this work, +we propose a density-guided translator (DGT), which translates point density +between domains, and integrates it into a two-stage self-training pipeline +named DGT-ST. First, in contrast to existing works that simultaneously conduct +data generation and feature/output alignment within unstable adversarial +training, we employ the non-learnable DGT to bridge the domain gap at the input +level. Second, to provide a well-initialized model for self-training, we +propose a category-level adversarial network in stage one that utilizes the +prototype to prevent negative transfer. Finally, by leveraging the designs +above, a domain-mixed self-training method with source-aware consistency loss +is proposed in stage two to narrow the domain gap further. Experiments on two +synthetic-to-real segmentation tasks (SynLiDAR $\rightarrow$ semanticKITTI and +SynLiDAR $\rightarrow$ semanticPOSS) demonstrate that DGT-ST outperforms +state-of-the-art methods, achieving 9.4$\%$ and 4.3$\%$ mIoU improvements, +respectively. Code is available at \url{https://github.com/yuan-zm/DGT-ST}. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ Deep Learning Segmentation and Classification of Red Blood Cells Using a + Large Multi-Scanner Dataset + + +
+ Digital pathology has recently been revolutionized by advancements in +artificial intelligence, deep learning, and high-performance computing. With +its advanced tools, digital pathology can help improve and speed up the +diagnostic process, reduce human errors, and streamline the reporting step. In +this paper, we report a new large red blood cell (RBC) image dataset and +propose a two-stage deep learning framework for RBC image segmentation and +classification. The dataset is a highly diverse dataset of more than 100K RBCs +containing eight different classes. The dataset, which is considerably larger +than any publicly available hematopathology dataset, was labeled independently +by two hematopathologists who also manually created masks for RBC cell +segmentation. Subsequently, in the proposed framework, first, a U-Net model was +trained to achieve automatic RBC image segmentation. Second, an EfficientNetB0 +model was trained to classify RBC images into one of the eight classes using a +transfer learning approach with a 5X2 cross-validation scheme. An IoU of 98.03% +and an average classification accuracy of 96.5% were attained on the test set. +Moreover, we have performed experimental comparisons against several prominent +CNN models. These comparisons show the superiority of the proposed model with a +good balance between performance and computational cost. + +
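+ A skeleton of the second stage of such a two-stage pipeline: an EfficientNet-B0
+with a replaced head fine-tuned via transfer learning to classify single-cell
+crops. The segmentation stage (a U-Net in the paper) is not shown; the crops
+are assumed to come from it, and the class count matches the eight classes
+described above.
+
+    import torch
+    import torch.nn as nn
+    from torchvision import models
+
+    NUM_CLASSES = 8   # the eight RBC morphology classes
+
+    # Stage 2: EfficientNet-B0 with a replaced classification head (transfer learning).
+    backbone = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)
+    for p in backbone.features.parameters():
+        p.requires_grad = False                      # freeze features; unfreeze later to fine-tune
+    backbone.classifier[1] = nn.Linear(backbone.classifier[1].in_features, NUM_CLASSES)
+
+    def classify_cells(crops):
+        """`crops` is a (N, 3, 224, 224) batch of single-cell crops produced by the
+        segmentation stage (any mask-to-crop step works for this sketch)."""
+        backbone.eval()
+        with torch.no_grad():
+            return backbone(crops).argmax(dim=1)
+
+    print(classify_cells(torch.randn(4, 3, 224, 224)))   # four random 'cells', labels in [0, 8)
+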
+
+ comment: 15 pages, 12 figures, 8 tables +
+
+
+
+
+ + ☆ DiffStyler: Diffusion-based Localized Image Style Transfer + + +
+ Image style transfer aims to imbue digital imagery with the distinctive
+attributes of style targets, such as colors, brushstrokes, and shapes, whilst
+concurrently preserving the semantic integrity of the content. Despite the
+advancements in arbitrary style transfer methods, a prevalent challenge remains
+the delicate equilibrium between content semantics and style attributes. Recent
+developments in large-scale text-to-image diffusion models have heralded
+unprecedented synthesis capabilities, albeit at the expense of relying on
+extensive and often imprecise textual descriptions to delineate artistic
+styles. Addressing these limitations, this paper introduces DiffStyler, a novel
+approach that facilitates efficient and precise arbitrary image style transfer.
+At the core of DiffStyler lies the utilization of a text-to-image Stable
+Diffusion model-based LoRA to encapsulate the essence of the style target. This
+approach, coupled with strategic cross-LoRA feature and attention injection,
+guides the style transfer process. The foundation of our methodology is rooted
+in the observation that LoRA maintains the spatial feature consistency of UNet,
+a discovery that further inspired the development of a mask-wise style transfer
+technique. This technique employs masks extracted through a pre-trained FastSAM
+model, utilizing mask prompts to facilitate feature fusion during the denoising
+process, thereby enabling localized style transfer that preserves the original
+image's unaffected regions. Moreover, our approach accommodates multiple style
+targets through the use of corresponding masks. Through extensive
+experimentation, we demonstrate that DiffStyler surpasses previous methods in
+achieving a more harmonious balance between content preservation and style
+integration.
+
+
+
+
+
+ + ☆ Scaling Vision-and-Language Navigation With Offline RL + + +
+ The study of vision-and-language navigation (VLN) has typically relied on +expert trajectories, which may not always be available in real-world situations +due to the significant effort required to collect them. On the other hand, +existing approaches to training VLN agents that go beyond available expert data +involve data augmentations or online exploration which can be tedious and +risky. In contrast, it is easy to access large repositories of suboptimal +offline trajectories. Inspired by research in offline reinforcement learning +(ORL), we introduce a new problem setup of VLN-ORL which studies VLN using +suboptimal demonstration data. We introduce a simple and effective +reward-conditioned approach that can account for dataset suboptimality for +training VLN agents, as well as benchmarks to evaluate progress and promote +research in this area. We empirically study various noise models for +characterizing dataset suboptimality among other unique challenges in VLN-ORL +and instantiate it for the VLN$\circlearrowright$BERT and MTVM architectures in +the R2R and RxR environments. Our experiments demonstrate that the proposed +reward-conditioned approach leads to significant performance improvements, even +in complex and intricate environments. + +
+
+ comment: Published in Transactions on Machine Learning Research (04/2024) +
+
+
+
+
+ + ☆ SingularTrajectory: Universal Trajectory Predictor Using Diffusion Model CVPR 2024 + + +
+ There are five types of trajectory prediction tasks: deterministic,
+stochastic, domain adaptation, momentary observation, and few-shot. These
+associated tasks are defined by various factors, such as the length of input
+paths, data split and pre-processing methods. Interestingly, even though they
+commonly take sequential coordinates of observations as input and infer future
+paths in the same coordinates as output, designing specialized architectures
+for each task is still necessary. Applying an architecture designed for one
+task to the others raises generality issues that can lead to sub-optimal
+performance. In this paper, we propose SingularTrajectory, a diffusion-based
+universal trajectory prediction framework to reduce the performance gap across
+the five tasks. The core of SingularTrajectory is to unify a variety of human
+dynamics representations on the associated tasks. To do this, we first build a
+Singular space to project all types of motion patterns from each task into one
+embedding space. We next propose an adaptive anchor working in the Singular
+space. Unlike traditional fixed anchor methods that sometimes yield
+unacceptable paths, our adaptive anchor corrects anchors that are placed in
+wrong locations, based on a traversability map. Finally, we adopt a
+diffusion-based predictor to further enhance the prototype paths using a
+cascaded denoising process. Our unified framework ensures generality across
+various benchmark settings such as input modality and trajectory length.
+Extensive experiments on five public benchmarks demonstrate that
+SingularTrajectory substantially outperforms existing models, highlighting its
+effectiveness in estimating general dynamics of human movements. Code is
+publicly available at https://github.com/inhwanbae/SingularTrajectory .
+
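+ A rough sense of the "Singular space" idea can be given by projecting flattened
+trajectories onto a low-rank SVD basis built from a bank of motion patterns. The
+snippet below is only a hedged sketch of that embedding step; the basis size and toy
+data are assumptions, and neither the adaptive anchors nor the diffusion-based
+predictor are reproduced.
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+T, K = 20, 4                                   # trajectory length, basis size
+bank = rng.standard_normal((500, T * 2))       # flattened (x, y) motion patterns
+_, _, Vt = np.linalg.svd(bank - bank.mean(0), full_matrices=False)
+basis = Vt[:K]                                 # K x (T*2) singular basis
+
+traj = rng.standard_normal((T, 2)).cumsum(0)   # one toy trajectory
+coeff = basis @ traj.reshape(-1)               # embed into the low-rank space
+recon = (basis.T @ coeff).reshape(T, 2)        # map back to coordinates
+print("embedding shape:", coeff.shape,
+      "reconstruction error:", float(np.linalg.norm(recon - traj)))
+```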
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Can Language Beat Numerical Regression? Language-Based Multimodal + Trajectory Prediction CVPR 2024 + + +
+ Language models have demonstrated impressive ability in context understanding
+and generative performance. Inspired by the recent success of language
+foundation models, in this paper, we propose LMTraj (Language-based Multimodal
+Trajectory predictor), which recasts the trajectory prediction task into a sort
+of question-answering problem. Departing from traditional numerical regression
+models, which treat the trajectory coordinate sequence as continuous signals,
+we consider them as discrete signals like text prompts. Specifically, we first
+transform the input space for the trajectory coordinates into the natural
+language space. Here, the entire time-series trajectories of pedestrians are
+converted into a text prompt, and scene images are described as text
+information through image captioning. The transformed numerical and image data
+are then wrapped into the question-answering template for use in a language
+model. Next, to guide the language model in understanding and reasoning about
+high-level knowledge, such as scene context and social relationships between
+pedestrians, we introduce auxiliary multi-task question answering. We then
+train a numerical tokenizer with the prompt data. We encourage the tokenizer to
+separate the integer and decimal parts well, and leverage it to capture
+correlations between the consecutive numbers in the language model. Lastly, we
+train the language model using the numerical tokenizer and all of the
+question-answer prompts. Here, we propose a beam-search-based most-likely
+prediction and a temperature-based multimodal prediction to implement both
+deterministic and stochastic inferences. Applying our LMTraj, we show that the
+language-based model can be a powerful pedestrian trajectory predictor, and
+outperforms existing numerical-based predictor methods. Code is publicly
+available at https://github.com/inhwanbae/LMTrajectory .
+
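+ The coordinate-to-prompt step can be pictured with a toy serializer like the one
+below; the prompt template, precision, and function name are made up for illustration
+and are not the exact wording or tokenizer used by LMTraj.
+```python
+def trajectory_to_prompt(history, horizon=12, precision=2):
+    """Serialize observed (x, y) coordinates into a question-answering prompt
+    (illustrative template only)."""
+    coords = ", ".join(f"({x:.{precision}f}, {y:.{precision}f})" for x, y in history)
+    return (f"A pedestrian has passed through the points {coords}. "
+            f"Question: what are the next {horizon} coordinates? Answer:")
+
+observed = [(1.0, 2.0), (1.2, 2.3), (1.5, 2.7), (1.9, 3.2)]
+print(trajectory_to_prompt(observed))
+```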
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ $\mathrm{F^2Depth}$: Self-supervised Indoor Monocular Depth Estimation + via Optical Flow Consistency and Feature Map Synthesis + + +
+ Self-supervised monocular depth estimation methods have received increasing
+attention due to the benefit of not requiring large, labelled datasets. Such
+self-supervised methods require high-quality salient features and consequently
+suffer from a severe performance drop in indoor scenes, where the low-textured
+regions that dominate the scenes are almost indiscriminative. To address the
+issue, we propose a self-supervised indoor monocular depth estimation framework
+called $\mathrm{F^2Depth}$. A self-supervised optical flow estimation network
+is introduced to supervise depth learning. To improve optical flow estimation
+performance in low-textured areas, only some patches of points with more
+discriminative features are adopted for finetuning based on our well-designed
+patch-based photometric loss. The finetuned optical flow estimation network
+generates high-accuracy optical flow as a supervisory signal for depth
+estimation. Correspondingly, an optical flow consistency loss is designed.
+Multi-scale feature maps produced by the finetuned optical flow estimation
+network are warped to compute a feature map synthesis loss as another
+supervisory signal for depth learning. Experimental results on the NYU Depth V2
+dataset demonstrate the effectiveness of the framework and our proposed losses.
+To evaluate the generalization ability of our $\mathrm{F^2Depth}$, we collect a
+Campus Indoor depth dataset composed of approximately 1500 points selected from
+99 images in 18 scenes. Zero-shot generalization experiments on the 7-Scenes
+dataset and Campus Indoor achieve $\delta_1$ accuracy of 75.8% and 76.0%
+respectively. The accuracy results show that our model can generalize well to
+monocular images captured in unknown indoor scenes.
+
+
+
+
+
+ + ☆ Backpropagation-free Network for 3D Test-time Adaptation CVPR 2024 + + +
+ Real-world systems often encounter new data over time, which leads to +experiencing target domain shifts. Existing Test-Time Adaptation (TTA) methods +tend to apply computationally heavy and memory-intensive backpropagation-based +approaches to handle this. Here, we propose a novel method that uses a +backpropagation-free approach for TTA for the specific case of 3D data. Our +model uses a two-stream architecture to maintain knowledge about the source +domain as well as complementary target-domain-specific information. The +backpropagation-free property of our model helps address the well-known +forgetting problem and mitigates the error accumulation issue. The proposed +method also eliminates the need for the usually noisy process of +pseudo-labeling and reliance on costly self-supervised training. Moreover, our +method leverages subspace learning, effectively reducing the distribution +variance between the two domains. Furthermore, the source-domain-specific and +the target-domain-specific streams are aligned using a novel entropy-based +adaptive fusion strategy. Extensive experiments on popular benchmarks +demonstrate the effectiveness of our method. The code will be available at +https://github.com/abie-e/BFTT3D. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ U-Sketch: An Efficient Approach for Sketch to Image Diffusion Models + + +
+ Diffusion models have demonstrated remarkable performance in text-to-image +synthesis, producing realistic and high resolution images that faithfully +adhere to the corresponding text-prompts. Despite their great success, they +still fall behind in sketch-to-image synthesis tasks, where in addition to +text-prompts, the spatial layout of the generated images has to closely follow +the outlines of certain reference sketches. Employing an MLP latent edge +predictor to guide the spatial layout of the synthesized image by predicting +edge maps at each denoising step has been recently proposed. Despite yielding +promising results, the pixel-wise operation of the MLP does not take into +account the spatial layout as a whole, and demands numerous denoising +iterations to produce satisfactory images, leading to time inefficiency. To +this end, we introduce U-Sketch, a framework featuring a U-Net type latent edge +predictor, which is capable of efficiently capturing both local and global +features, as well as spatial correlations between pixels. Moreover, we propose +the addition of a sketch simplification network that offers the user the choice +of preprocessing and simplifying input sketches for enhanced outputs. The +experimental results, corroborated by user feedback, demonstrate that our +proposed U-Net latent edge predictor leads to more realistic results, that are +better aligned with the spatial outlines of the reference sketches, while +drastically reducing the number of required denoising steps and, consequently, +the overall execution time. + +
+
+
+
+
+ + ☆ ECNet: Effective Controllable Text-to-Image Diffusion Models + + +
+ The conditional text-to-image diffusion models have garnered significant +attention in recent years. However, the precision of these models is often +compromised mainly for two reasons, ambiguous condition input and inadequate +condition guidance over single denoising loss. To address the challenges, we +introduce two innovative solutions. Firstly, we propose a Spatial Guidance +Injector (SGI) which enhances conditional detail by encoding text inputs with +precise annotation information. This method directly tackles the issue of +ambiguous control inputs by providing clear, annotated guidance to the model. +Secondly, to overcome the issue of limited conditional supervision, we +introduce Diffusion Consistency Loss (DCL), which applies supervision on the +denoised latent code at any given time step. This encourages consistency +between the latent code at each time step and the input signal, thereby +enhancing the robustness and accuracy of the output. The combination of SGI and +DCL results in our Effective Controllable Network (ECNet), which offers a more +accurate controllable end-to-end text-to-image generation framework with a more +precise conditioning input and stronger controllable supervision. We validate +our approach through extensive experiments on generation under various +conditions, such as human body skeletons, facial landmarks, and sketches of +general objects. The results consistently demonstrate that our method +significantly enhances the controllability and robustness of the generated +images, outperforming existing state-of-the-art controllable text-to-image +models. + +
+
+
+
+
+ + ☆ A Channel-ensemble Approach: Unbiased and Low-variance Pseudo-labels is + Critical for Semi-supervised Classification + + +
+ Semi-supervised learning (SSL) is a practical challenge in computer vision.
+Pseudo-label (PL) methods, e.g., FixMatch and FreeMatch, obtain the State Of
+The Art (SOTA) performances in SSL. These approaches employ a
+threshold-to-pseudo-label (T2L) process to generate PLs by truncating the
+confidence scores of unlabeled data predicted by the self-training method.
+However, self-trained models typically yield biased and high-variance
+predictions, especially when only a small amount of labeled data is supplied.
+To address this issue, we propose a lightweight channel-based ensemble method
+to effectively consolidate multiple inferior PLs into a theoretically
+guaranteed unbiased and low-variance one. Importantly, our approach can be
+readily extended to any SSL framework, such as FixMatch or FreeMatch.
+Experimental results demonstrate that our method significantly outperforms
+state-of-the-art techniques on CIFAR10/100 in terms of effectiveness and
+efficiency.
+
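+ The variance-reduction intuition behind ensembling pseudo-labels can be sketched as
+below: averaging the softmax outputs of several heads shrinks the prediction variance
+before thresholding. This is a generic ensembling sketch, not the paper's channel-based
+estimator; the helper name and threshold are assumptions.
+```python
+import numpy as np
+
+def ensemble_pseudo_labels(logits_per_head, threshold=0.95):
+    """Average several heads' softmax outputs and keep only confident labels
+    (generic ensembling sketch, not the paper's exact channel-based method)."""
+    def softmax(l):
+        e = np.exp(l - l.max(-1, keepdims=True))
+        return e / e.sum(-1, keepdims=True)
+    probs = np.stack([softmax(l) for l in logits_per_head])  # (heads, N, C)
+    mean_probs = probs.mean(0)                               # variance shrinks with #heads
+    conf, labels = mean_probs.max(-1), mean_probs.argmax(-1)
+    return labels, conf >= threshold                         # pseudo-labels + keep mask
+
+rng = np.random.default_rng(0)
+heads = [rng.normal(size=(8, 10)) for _ in range(4)]         # 4 heads, 8 samples, 10 classes
+labels, keep = ensemble_pseudo_labels(heads, threshold=0.3)
+print(labels, keep)
+```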
+
+
+
+
+ + ☆ An Image Grid Can Be Worth a Video: Zero-shot Video Question Answering + Using a VLM + + +
+ Stimulated by the sophisticated reasoning capabilities of recent Large +Language Models (LLMs), a variety of strategies for bridging video modality +have been devised. A prominent strategy involves Video Language Models +(VideoLMs), which train a learnable interface with video data to connect +advanced vision encoders with LLMs. Recently, an alternative strategy has +surfaced, employing readily available foundation models, such as VideoLMs and +LLMs, across multiple stages for modality bridging. In this study, we introduce +a simple yet novel strategy where only a single Vision Language Model (VLM) is +utilized. Our starting point is the plain insight that a video comprises a +series of images, or frames, interwoven with temporal information. The essence +of video comprehension lies in adeptly managing the temporal aspects along with +the spatial details of each frame. Initially, we transform a video into a +single composite image by arranging multiple frames in a grid layout. The +resulting single image is termed as an image grid. This format, while +maintaining the appearance of a solitary image, effectively retains temporal +information within the grid structure. Therefore, the image grid approach +enables direct application of a single high-performance VLM without +necessitating any video-data training. Our extensive experimental analysis +across ten zero-shot video question answering benchmarks, including five +open-ended and five multiple-choice benchmarks, reveals that the proposed Image +Grid Vision Language Model (IG-VLM) surpasses the existing methods in nine out +of ten benchmarks. + +
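+ The image-grid construction itself is simple to reproduce; a minimal NumPy sketch is
+shown below. Uniform frame sampling and a 2x3 layout are assumptions here, and the
+paper may use different grid sizes or sampling schemes.
+```python
+import numpy as np
+
+def frames_to_image_grid(frames, rows=2, cols=3):
+    """Arrange uniformly sampled video frames into one composite "image grid"
+    so a single VLM can be queried without any video training (sketch only)."""
+    idx = np.linspace(0, len(frames) - 1, rows * cols).astype(int)
+    picked = [frames[i] for i in idx]
+    grid_rows = [np.concatenate(picked[r * cols:(r + 1) * cols], axis=1)
+                 for r in range(rows)]
+    return np.concatenate(grid_rows, axis=0)           # (rows*H, cols*W, 3)
+
+video = [np.full((90, 160, 3), t, dtype=np.uint8) for t in range(32)]  # toy frames
+print(frames_to_image_grid(video, rows=2, cols=3).shape)               # (180, 480, 3)
+```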
+
+ comment: Our code is available at https://github.com/imagegridworth/IG-VLM +
+
+
+
+
+ + ☆ Colour and Brush Stroke Pattern Recognition in Abstract Art using + Modified Deep Convolutional Generative Adversarial Networks + + +
+ Abstract Art is an immensely popular and widely discussed form of art that
+often has the ability to depict the emotions of an artist. Many researchers
+have made attempts to study abstract art in the form of edge detection, brush
+stroke and emotion recognition algorithms using machine and deep learning. This
+paper describes the study of a wide distribution of abstract paintings using
+Generative Adversarial Networks (GANs). GANs have the ability to learn and
+reproduce a distribution, enabling researchers and scientists to effectively
+explore and study the generated image space. However, the challenge lies in
+developing an efficient GAN architecture that overcomes common training
+pitfalls. This paper addresses this challenge by introducing a modified DCGAN
+(mDCGAN) specifically designed for high-quality artwork generation. The
+approach involves a thorough exploration of the modifications made, delving
+into the intricate workings of DCGANs, optimisation techniques, and
+regularisation methods aimed at improving stability and realism in art
+generation, enabling effective study of generated patterns. The proposed mDCGAN
+incorporates meticulous adjustments in layer configurations and architectural
+choices, offering tailored solutions to the unique demands of art generation
+while effectively combating issues like mode collapse and gradient vanishing.
+Further, this paper explores the generated latent space by performing random
+walks to understand vector relationships between brush strokes and colours in
+the abstract art space, and presents a statistical analysis of unstable outputs
+after a certain period of GAN training to test for significant differences.
+These findings validate the effectiveness of the proposed approach, emphasising
+its potential to revolutionise the field of digital art generation and the
+digital art ecosystem.
+
+
+ comment: 28 pages, 5 tables, 7 figures +
+
+
+
+
+ + ☆ FTBC: Forward Temporal Bias Correction for Optimizing ANN-SNN Conversion + + +
+ Spiking Neural Networks (SNNs) offer a promising avenue for energy-efficient
+computing compared with Artificial Neural Networks (ANNs), closely mirroring
+biological neural processes. However, this potential comes with inherent
+challenges in directly training SNNs through spatio-temporal backpropagation --
+stemming from the temporal dynamics of spiking neurons and their discrete
+signal processing -- which necessitates alternative ways of training, most
+notably through ANN-SNN conversion. In this work, we introduce a lightweight
+Forward Temporal Bias Correction (FTBC) technique, aimed at enhancing
+conversion accuracy without the computational overhead. We ground our method in
+theoretical findings showing that, through proper temporal bias calibration,
+the expected error of ANN-SNN conversion can be reduced to zero after each time
+step. We further propose a heuristic algorithm for finding the temporal bias
+only in the forward pass, thus eliminating the computational burden of
+backpropagation. We evaluate our method on the CIFAR-10/100 and ImageNet
+datasets, achieving a notable increase in accuracy on all of them. Codes are
+released at a GitHub repository.
+
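+ A toy, forward-only picture of temporal bias calibration is sketched below: at each
+time step a scalar bias closes the average gap between the ANN activation and the SNN's
+input drive, so the integrate-and-fire rate tracks the ANN output. The calibration rule,
+threshold, and toy data are assumptions and do not reproduce the exact FTBC algorithm.
+```python
+import numpy as np
+
+def calibrate_temporal_bias(ann_act, weighted_input, T=8, v_thr=1.0):
+    """Forward-pass-only bias calibration for a rate-coded IF layer
+    (illustrative rule; the paper's FTBC calibration may differ)."""
+    v = np.zeros_like(ann_act)
+    spikes = np.zeros_like(ann_act)
+    biases = []
+    for _ in range(T):
+        bias = float(np.mean(ann_act) - np.mean(weighted_input))  # close the mean gap
+        biases.append(bias)
+        v += weighted_input + bias          # integrate the corrected drive
+        fired = v >= v_thr
+        spikes += fired
+        v[fired] -= v_thr                   # soft reset
+    return np.array(biases), spikes * v_thr / T
+
+rng = np.random.default_rng(0)
+ann = np.clip(rng.normal(0.5, 0.2, 1000), 0, 0.99)   # ReLU-like ANN activations
+snn_in = ann - 0.1                                   # systematically biased SNN drive
+_, rate = calibrate_temporal_bias(ann, snn_in, T=32)
+print("mean |ANN - SNN rate| after correction:", float(np.abs(ann - rate).mean()))
+```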
+
+
+
+
+ + ☆ Generative Multi-modal Models are Good Class-Incremental Learners CVPR 2024 + + +
+ In class-incremental learning (CIL) scenarios, the phenomenon of catastrophic
+forgetting caused by the classifier's bias towards the current task has long
+posed a significant challenge. It is mainly caused by the characteristics of
+discriminative models. With the growing popularity of generative multi-modal
+models, we explore replacing discriminative models with generative ones for
+CIL. However, transitioning from discriminative to generative models requires
+addressing two key challenges. The primary challenge lies in transferring the
+generated textual information into the classification of distinct categories.
+Additionally, it requires formulating the task of CIL within a generative
+framework. To this end, we propose a novel generative multi-modal model (GMM)
+framework for class-incremental learning. Our approach directly generates
+labels for images using an adapted generative model. After obtaining the
+detailed text, we use a text encoder to extract text features and employ
+feature matching to determine the most similar label as the classification
+prediction. In the conventional CIL settings, we achieve significantly better
+results in long-sequence task scenarios. Under the few-shot CIL setting, we
+improve accuracy by at least 14\% over all current state-of-the-art methods,
+with significantly less forgetting. Our code is available at
+\url{https://github.com/DoubleClass/GMM}.
+
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ BAM: Box Abstraction Monitors for Real-time OoD Detection in Object + Detection + + +
+ Out-of-distribution (OoD) detection techniques for deep neural networks +(DNNs) become crucial thanks to their filtering of abnormal inputs, especially +when DNNs are used in safety-critical applications and interact with an open +and dynamic environment. Nevertheless, integrating OoD detection into +state-of-the-art (SOTA) object detection DNNs poses significant challenges, +partly due to the complexity introduced by the SOTA OoD construction methods, +which require the modification of DNN architecture and the introduction of +complex loss functions. This paper proposes a simple, yet surprisingly +effective, method that requires neither retraining nor architectural change in +object detection DNN, called Box Abstraction-based Monitors (BAM). The novelty +of BAM stems from using a finite union of convex box abstractions to capture +the learned features of objects for in-distribution (ID) data, and an important +observation that features from OoD data are more likely to fall outside of +these boxes. The union of convex regions within the feature space allows the +formation of non-convex and interpretable decision boundaries, overcoming the +limitations of VOS-like detectors without sacrificing real-time performance. +Experiments integrating BAM into Faster R-CNN-based object detection DNNs +demonstrate a considerably improved performance against SOTA OoD detection +techniques. + +
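+ The monitoring idea itself can be prototyped in a few lines: fit axis-aligned boxes
+over in-distribution features and flag anything that falls outside all of them. The
+sketch below uses a single box per class for brevity, whereas the paper builds a finite
+union of boxes per class; the helper names and margin are assumptions.
+```python
+import numpy as np
+
+def fit_box_monitors(features, labels, margin=0.05):
+    """One axis-aligned box per ID class over its feature vectors
+    (minimal sketch of box abstraction, not the paper's exact construction)."""
+    return {c: (features[labels == c].min(0) - margin,
+                features[labels == c].max(0) + margin)
+            for c in np.unique(labels)}
+
+def is_ood(feature, boxes):
+    # OoD if the feature falls outside every class box.
+    return not any(np.all(feature >= lo) and np.all(feature <= hi)
+                   for lo, hi in boxes.values())
+
+rng = np.random.default_rng(0)
+feats = np.concatenate([rng.normal(0, 0.3, (200, 8)), rng.normal(3, 0.3, (200, 8))])
+labs = np.array([0] * 200 + [1] * 200)
+boxes = fit_box_monitors(feats, labs)
+print(is_ood(rng.normal(0, 0.3, 8), boxes), is_ood(rng.normal(10, 0.3, 8), boxes))
+```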
+
+
+
+
+ + ☆ Ship in Sight: Diffusion Models for Ship-Image Super Resolution IJCNN + + +
+ In recent years, remarkable advancements have been achieved in the field of
+image generation, primarily driven by the escalating demand for high-quality
+outcomes across various image generation subtasks, such as inpainting,
+denoising, and super resolution. A major effort is devoted to exploring the
+application of super-resolution techniques to enhance the quality of
+low-resolution images. In this context, our method explores in depth the
+problem of ship image super resolution, which is crucial for coastal and port
+surveillance. We investigate the opportunity given by the growing interest in
+text-to-image diffusion models, taking advantage of the prior knowledge that
+such foundation models have already learned. In particular, we present a
+diffusion-model-based architecture that leverages text conditioning during
+training while being class-aware, to best preserve the crucial details of the
+ships during the generation of the super-resolved image. Given the specificity
+of this task and the scarce availability of off-the-shelf data, we also
+introduce a large labeled ship dataset scraped from online ship images, mostly
+from the ShipSpotting\footnote{\url{www.shipspotting.com}} website. Our method
+achieves more robust results than other deep learning models previously
+employed for super resolution, as proven by the multiple experiments performed.
+Moreover, we investigate how this model can benefit downstream tasks, such as
+classification and object detection, thus emphasizing practical implementation
+in a real-world scenario. Experimental results show the flexibility,
+reliability, and impressive performance of the proposed framework over
+state-of-the-art methods for different tasks. The code is available at:
+https://github.com/LuigiSigillo/ShipinSight .
+
+
+ comment: Accepted at 2024 International Joint Conference on Neural Networks + (IJCNN) +
+
+
+
+
+ + ☆ ViTAR: Vision Transformer with Any Resolution + + +
+ This paper tackles a significant challenge faced by Vision Transformers
+(ViTs): their constrained scalability across different image resolutions.
+Typically, ViTs experience a performance decline when processing resolutions
+different from those seen during training. Our work introduces two key
+innovations to address this issue. Firstly, we propose a novel module for
+dynamic resolution adjustment, designed with a single Transformer block,
+specifically to achieve highly efficient incremental token integration.
+Secondly, we introduce fuzzy positional encoding in the Vision Transformer to
+provide consistent positional awareness across multiple resolutions, thereby
+preventing overfitting to any single training resolution. Our resulting model,
+ViTAR (Vision Transformer with Any Resolution), demonstrates impressive
+adaptability, achieving 83.3\% top-1 accuracy at a 1120x1120 resolution and
+80.4\% accuracy at a 4032x4032 resolution, all while reducing computational
+costs. ViTAR also shows strong performance in downstream tasks such as instance
+and semantic segmentation and can easily be combined with self-supervised
+learning techniques like Masked AutoEncoder. Our work provides a cost-effective
+solution for enhancing the resolution scalability of ViTs, paving the way for
+more versatile and efficient high-resolution image processing.
+
+
+
+
+
+ + ☆ Learning CNN on ViT: A Hybrid Model to Explicitly Class-specific + Boundaries for Domain Adaptation + + +
+ Most domain adaptation (DA) methods are based on either convolutional neural
+networks (CNNs) or vision transformers (ViTs). They align the distribution
+differences between domains using these architectures as encoders, without
+considering their unique characteristics. For instance, ViT excels in accuracy
+due to its superior ability to capture global representations, while CNN has an
+advantage in capturing local representations. This fact has led us to design a
+hybrid method to fully take advantage of both ViT and CNN, called Explicitly
+Class-specific Boundaries (ECB). ECB learns CNN on ViT to combine their
+distinct strengths. In particular, we leverage ViT's properties to explicitly
+find class-specific decision boundaries by maximizing the discrepancy between
+the outputs of the two classifiers to detect target samples far from the source
+support. In contrast, the CNN encoder clusters target features based on the
+previously defined class-specific boundaries by minimizing the discrepancy
+between the probabilities of the two classifiers. Finally, ViT and CNN mutually
+exchange knowledge to improve the quality of pseudo labels and reduce the
+knowledge discrepancies of these models. Compared to conventional DA methods,
+our ECB achieves superior performance, which verifies its effectiveness in this
+hybrid model. The project website can be found at
+https://dotrannhattuong.github.io/ECB/website/.
+
+
+
+
+
+ + ☆ MonoHair: High-Fidelity Hair Modeling from a Monocular Video CVPR 2024 + + +
+ Undoubtedly, high-fidelity 3D hair is crucial for achieving realism, artistic +expression, and immersion in computer graphics. While existing 3D hair modeling +methods have achieved impressive performance, the challenge of achieving +high-quality hair reconstruction persists: they either require strict capture +conditions, making practical applications difficult, or heavily rely on learned +prior data, obscuring fine-grained details in images. To address these +challenges, we propose MonoHair,a generic framework to achieve high-fidelity +hair reconstruction from a monocular video, without specific requirements for +environments. Our approach bifurcates the hair modeling process into two main +stages: precise exterior reconstruction and interior structure inference. The +exterior is meticulously crafted using our Patch-based Multi-View Optimization +(PMVO). This method strategically collects and integrates hair information from +multiple views, independent of prior data, to produce a high-fidelity exterior +3D line map. This map not only captures intricate details but also facilitates +the inference of the hair's inner structure. For the interior, we employ a +data-driven, multi-view 3D hair reconstruction method. This method utilizes 2D +structural renderings derived from the reconstructed exterior, mirroring the +synthetic 2D inputs used during training. This alignment effectively bridges +the domain gap between our training data and real-world data, thereby enhancing +the accuracy and reliability of our interior structure inference. Lastly, we +generate a strand model and resolve the directional ambiguity by our hair +growth algorithm. Our experiments demonstrate that our method exhibits +robustness across diverse hairstyles and achieves state-of-the-art performance. +For more results, please refer to our project page +https://keyuwu-cs.github.io/MonoHair/. + +
+
+ comment: Accepted by IEEE CVPR 2024 +
+
+
+
+
+ + ☆ Generating Diverse Agricultural Data for Vision-Based Farming + Applications + + +
+ We present a specialized procedural model for generating synthetic +agricultural scenes, focusing on soybean crops, along with various weeds. This +model is capable of simulating distinct growth stages of these plants, diverse +soil conditions, and randomized field arrangements under varying lighting +conditions. The integration of real-world textures and environmental factors +into the procedural generation process enhances the photorealism and +applicability of the synthetic data. Our dataset includes 12,000 images with +semantic labels, offering a comprehensive resource for computer vision tasks in +precision agriculture, such as semantic segmentation for autonomous weed +control. We validate our model's effectiveness by comparing the synthetic data +against real agricultural images, demonstrating its potential to significantly +augment training data for machine learning models in agriculture. This approach +not only provides a cost-effective solution for generating high-quality, +diverse data but also addresses specific needs in agricultural vision tasks +that are not fully covered by general-purpose models. + +
+
+ comment: 10 pages, 8 figures, 3 tables +
+
+
+
+
+ + ☆ A Quantum Fuzzy-based Approach for Real-Time Detection of Solar Coronal + Holes + + +
+ The detection and analysis of solar coronal holes (CHs) is an important field
+of study in the domain of solar physics. Mainly, it is required for the proper
+prediction of the geomagnetic storms which directly or indirectly affect
+various space and ground-based systems. To date, for the detection of CHs,
+solar scientists have depended on manual hand-drawn approaches. However, with
+the advancement of image processing technologies, some automated image
+segmentation methods have been used for the detection of CHs. In spite of this,
+fast and accurate detection of CHs is still a major issue. In this work, a
+novel quantum computing-based fast fuzzy c-mean technique has been developed
+for fast detection of the CH regions. The task is carried out in two stages: in
+the first stage, the solar image is segmented using a quantum computing-based
+fast fuzzy c-mean (QCFFCM) technique, and in the later stage, the CHs are
+extracted from the segmented image based on image morphological operations. In
+this work, quantum computing is used to optimize the cost function of the fast
+fuzzy c-mean (FFCM) algorithm, where the quantum approximate optimization
+algorithm (QAOA) is used to optimize the quadratic part of the cost function.
+The proposed method has been tested on 193 \AA{} SDO/AIA full-disk solar image
+datasets and compared with existing techniques. The outcome shows that the
+proposed method achieves performance comparable to the existing ones within a
+much shorter time.
+
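+ Leaving the quantum optimization aside, the classical backbone of such a pipeline
+(fuzzy c-means segmentation followed by morphological clean-up) can be sketched as
+follows. The cluster count, fuzzifier, and toy image are assumptions, and the QAOA
+acceleration of the cost function is not reproduced here.
+```python
+import numpy as np
+from scipy.ndimage import binary_opening
+
+def fuzzy_cmeans_1d(x, c=3, m=2.0, iters=50, seed=0):
+    """Plain fuzzy c-means on pixel intensities (classical FFCM-style step;
+    the paper's quantum-optimised variant is not reproduced)."""
+    rng = np.random.default_rng(seed)
+    centers = rng.choice(x, c)
+    for _ in range(iters):
+        d = np.abs(x[:, None] - centers[None, :]) + 1e-9            # (N, c)
+        u = 1.0 / (d ** (2 / (m - 1)) * np.sum(d ** (-2 / (m - 1)), 1, keepdims=True))
+        centers = (u ** m * x[:, None]).sum(0) / (u ** m).sum(0)
+    return u, centers
+
+# Toy "solar disk": dark coronal-hole-like blob on a brighter background.
+img = np.full((128, 128), 0.8) + np.random.default_rng(1).normal(0, 0.05, (128, 128))
+img[30:60, 40:80] = 0.15
+u, centers = fuzzy_cmeans_1d(img.ravel(), c=3)
+dark = int(np.argmin(centers))                                      # CHs are the darkest cluster
+mask = (u.argmax(1) == dark).reshape(img.shape)
+mask = binary_opening(mask, np.ones((3, 3)))                        # morphological clean-up
+print("candidate CH pixels:", int(mask.sum()))
+```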
+
+ comment: 14 pages, 5 figures, 3 tables +
+
+
+
+
+ + ☆ Quantifying and Mitigating Unimodal Biases in Multimodal Large Language + Models: A Causal Perspective + + +
+ Recent advancements in Large Language Models (LLMs) have facilitated the +development of Multimodal LLMs (MLLMs). Despite their impressive capabilities, +MLLMs often suffer from an over-reliance on unimodal biases (e.g., language +bias and vision bias), leading to incorrect answers in complex multimodal +tasks. To investigate this issue, we propose a causal framework to interpret +the biases in Visual Question Answering (VQA) problems. Within our framework, +we devise a causal graph to elucidate the predictions of MLLMs on VQA problems, +and assess the causal effect of biases through an in-depth causal analysis. +Motivated by the causal graph, we introduce a novel MORE dataset, consisting of +12,000 VQA instances. This dataset is designed to challenge MLLMs' abilities, +necessitating multi-hop reasoning and the surmounting of unimodal biases. +Furthermore, we propose two strategies to mitigate unimodal biases and enhance +MLLMs' reasoning capabilities, including a Decompose-Verify-Answer (DeVA) +framework for limited-access MLLMs and the refinement of open-source MLLMs +through fine-tuning. Extensive quantitative and qualitative experiments offer +valuable insights for future research. + +
+
+
+
+
+ + ☆ Learning Inclusion Matching for Animation Paint Bucket Colorization CVPR 2024 + + +
+ Colorizing line art is a pivotal task in the production of hand-drawn cel
+animation. This typically involves digital painters using a paint bucket tool
+to manually color each segment enclosed by lines, based on RGB values
+predetermined by a color designer. This frame-by-frame process is both arduous
+and time-intensive. Current automated methods mainly focus on segment matching.
+This technique migrates colors from a reference to the target frame by aligning
+features within line-enclosed segments across frames. However, issues like
+occlusion and wrinkles in animations often disrupt these direct
+correspondences, leading to mismatches. In this work, we introduce a new
+learning-based inclusion matching pipeline, which directs the network to
+comprehend the inclusion relationships between segments rather than relying
+solely on direct visual correspondences. Our method features a two-stage
+pipeline that integrates a coarse color warping module with an inclusion
+matching module, enabling more nuanced and accurate colorization. To facilitate
+the training of our network, we also develop a unique dataset, referred to as
+PaintBucket-Character. This dataset includes rendered line arts alongside their
+colorized counterparts, featuring various 3D characters. Extensive experiments
+demonstrate the effectiveness and superiority of our method over existing
+techniques.
+
+
+ comment: accepted to CVPR 2024. Project Page: + https://ykdai.github.io/projects/InclusionMatching +
+
+
+
+
+ + ☆ H2ASeg: Hierarchical Adaptive Interaction and Weighting Network for + Tumor Segmentation in PET/CT Images + + +
+ Positron emission tomography (PET) combined with computed tomography (CT) +imaging is routinely used in cancer diagnosis and prognosis by providing +complementary information. Automatically segmenting tumors in PET/CT images can +significantly improve examination efficiency. Traditional multi-modal +segmentation solutions mainly rely on concatenation operations for modality +fusion, which fail to effectively model the non-linear dependencies between PET +and CT modalities. Recent studies have investigated various approaches to +optimize the fusion of modality-specific features for enhancing joint +representations. However, modality-specific encoders used in these methods +operate independently, inadequately leveraging the synergistic relationships +inherent in PET and CT modalities, for example, the complementarity between +semantics and structure. To address these issues, we propose a Hierarchical +Adaptive Interaction and Weighting Network termed H2ASeg to explore the +intrinsic cross-modal correlations and transfer potential complementary +information. Specifically, we design a Modality-Cooperative Spatial Attention +(MCSA) module that performs intra- and inter-modal interactions globally and +locally. Additionally, a Target-Aware Modality Weighting (TAMW) module is +developed to highlight tumor-related features within multi-modal features, +thereby refining tumor segmentation. By embedding these modules across +different layers, H2ASeg can hierarchically model cross-modal correlations, +enabling a nuanced understanding of both semantic and structural tumor +features. Extensive experiments demonstrate the superiority of H2ASeg, +outperforming state-of-the-art methods on AutoPet-II and Hecktor2022 +benchmarks. The code is released at https://github.com/G14nTDo4/H2ASeg. + +
+
+ comment: 10 pages,4 figures +
+
+
+
+
+ + ☆ DODA: Diffusion for Object-detection Domain Adaptation in Agriculture + + +
+ The diverse and high-quality content generated by recent generative models
+demonstrates the great potential of using synthetic data to train downstream
+models. However, in vision, and especially in object detection, related areas
+are not fully explored: synthetic images are merely used to balance the long
+tails of existing datasets, the accuracy of the generated labels is low, and
+the full potential of generative models has not been exploited. In this paper,
+we propose DODA, a data synthesizer that can generate high-quality object
+detection data for new domains in agriculture. Specifically, we improve the
+controllability of layout-to-image generation by encoding the layout as an
+image, thereby improving the quality of labels, and we use a visual encoder to
+provide visual clues for the diffusion model, decoupling visual features from
+the diffusion model and empowering it to generate data in new domains. On the
+Global Wheat Head Detection (GWHD) Dataset, which is the largest dataset in
+agriculture and contains diverse domains, using the data synthesized by DODA
+improves the performance of the object detector by 12.74-17.76 AP$_{50}$ in the
+domain that was significantly shifted from the training data.
+
+
+
+
+
+ + ☆ Tracking-Assisted Object Detection with Event Cameras + + +
+ Event-based object detection has recently garnered attention in the computer +vision community due to the exceptional properties of event cameras, such as +high dynamic range and no motion blur. However, feature asynchronism and +sparsity cause invisible objects due to no relative motion to the camera, +posing a significant challenge in the task. Prior works have studied various +memory mechanisms to preserve as many features as possible at the current time, +guided by temporal clues. While these implicit-learned memories retain some +short-term information, they still struggle to preserve long-term features +effectively. In this paper, we consider those invisible objects as +pseudo-occluded objects and aim to reveal their features. Firstly, we introduce +visibility attribute of objects and contribute an auto-labeling algorithm to +append additional visibility labels on an existing event camera dataset. +Secondly, we exploit tracking strategies for pseudo-occluded objects to +maintain their permanence and retain their bounding boxes, even when features +have not been available for a very long time. These strategies can be treated +as an explicit-learned memory guided by the tracking objective to record the +displacements of objects across frames. Lastly, we propose a spatio-temporal +feature aggregation module to enrich the latent features and a consistency loss +to increase the robustness of the overall pipeline. We conduct comprehensive +experiments to verify our method's effectiveness where still objects are +retained but real occluded objects are discarded. The results demonstrate that +(1) the additional visibility labels can assist in supervised training, and (2) +our method outperforms state-of-the-art approaches with a significant +improvement of 7.9% absolute mAP. + +
+
+
+
+
+ + ☆ PIPNet3D: Interpretable Detection of Alzheimer in MRI Scans + + +
+ Information from neuroimaging examinations (CT, MRI) is increasingly used to
+support diagnoses of dementia, e.g., Alzheimer's disease. While current
+clinical practice is mainly based on visual inspection and feature engineering,
+Deep Learning approaches can be used to automate the analysis and to discover
+new image-biomarkers. Part-prototype neural networks (PP-NN) are an alternative
+to standard blackbox models, and have shown promising results in general
+computer vision. PP-NNs base their reasoning on prototypical image regions that
+are learned fully unsupervised and combined with a simple-to-understand
+decision layer. We present PIPNet3D, a PP-NN for volumetric images. We apply
+PIPNet3D to the clinical case study of Alzheimer's Disease diagnosis from
+structural Magnetic Resonance Imaging (sMRI). We assess the quality of
+prototypes under a systematic evaluation framework, propose new metrics to
+evaluate brain prototypes, and perform an evaluation with domain experts. Our
+results show that PIPNet3D is an interpretable, compact model for Alzheimer's
+diagnosis with its reasoning well aligned to medical domain knowledge. Notably,
+PIPNet3D achieves the same accuracy as its blackbox counterpart, and removing
+the remaining clinically irrelevant prototypes from its decision process does
+not decrease predictive performance.
+
+
+
+
+
+ + ☆ Implementation of the Principal Component Analysis onto High-Performance + Computer Facilities for Hyperspectral Dimensionality Reduction: Results and + Comparisons + + +
+ Dimensionality reduction represents a critical preprocessing step in order to +increase the efficiency and the performance of many hyperspectral imaging +algorithms. However, dimensionality reduction algorithms, such as the Principal +Component Analysis (PCA), suffer from their computationally demanding nature, +becoming advisable for their implementation onto high-performance computer +architectures for applications under strict latency constraints. This work +presents the implementation of the PCA algorithm onto two different +high-performance devices, namely, an NVIDIA Graphics Processing Unit (GPU) and +a Kalray manycore, uncovering a highly valuable set of tips and tricks in order +to take full advantage of the inherent parallelism of these high-performance +computing platforms, and hence, reducing the time that is required to process a +given hyperspectral image. Moreover, the achieved results obtained with +different hyperspectral images have been compared with the ones that were +obtained with a field programmable gate array (FPGA)-based implementation of +the PCA algorithm that has been recently published, providing, for the first +time in the literature, a comprehensive analysis in order to highlight the pros +and cons of each option. + +
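+ For reference, the algorithmic core that such implementations parallelize (band-wise
+centring, covariance, eigendecomposition, projection) looks as follows in plain NumPy;
+the GPU/manycore-specific tiling and data-movement optimizations discussed in the paper
+are of course not captured by this sketch.
+```python
+import numpy as np
+
+def pca_reduce(cube, n_components=10):
+    """Reduce a hyperspectral cube (H, W, Bands) to its top principal components.
+    The matrix products here are the parts typically offloaded to an accelerator."""
+    h, w, b = cube.shape
+    X = cube.reshape(-1, b).astype(np.float64)
+    X -= X.mean(axis=0)                                # band-wise centring
+    cov = X.T @ X / (X.shape[0] - 1)                   # (B, B) covariance
+    eigvals, eigvecs = np.linalg.eigh(cov)             # ascending eigenvalues
+    top = eigvecs[:, ::-1][:, :n_components]           # leading eigenvectors
+    return (X @ top).reshape(h, w, n_components)
+
+cube = np.random.rand(64, 64, 200)                     # toy 200-band image
+print(pca_reduce(cube, n_components=10).shape)         # (64, 64, 10)
+```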
+
+ comment: 30 pages, 10 figures +
+
+
+
+
+ + ☆ Uncertainty-Aware SAR ATR: Defending Against Adversarial Attacks via + Bayesian Neural Networks + + +
+ Adversarial attacks have demonstrated the vulnerability of Machine Learning
+(ML) image classifiers in Synthetic Aperture Radar (SAR) Automatic Target
+Recognition (ATR) systems. An adversarial attack can deceive the classifier
+into making incorrect predictions by perturbing the input SAR images, for
+example, with a few scatterers attached to the on-ground objects. Therefore, it
+is critical to develop robust SAR ATR systems that can detect potential
+adversarial attacks by leveraging the inherent uncertainty in ML classifiers,
+thereby effectively alerting human decision-makers. In this paper, we propose a
+novel uncertainty-aware SAR ATR method for detecting adversarial attacks.
+Specifically, we leverage the capability of Bayesian Neural Networks (BNNs) in
+performing image classification with quantified epistemic uncertainty to
+measure the confidence for each input SAR image. By evaluating the uncertainty,
+our method alerts when the input SAR image is likely to be adversarially
+generated. Simultaneously, we also generate visual explanations that reveal the
+specific regions in the SAR image where the adversarial scatterers are likely
+to be present, thus aiding human decision-making with hints of evidence of
+adversarial attacks. Experiments on the MSTAR dataset demonstrate that our
+approach can identify over 80% of adversarial SAR images with fewer than 20%
+false alarms, and our visual explanations can identify over 90% of the
+scatterers in an adversarial SAR image.
+
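+ The detection criterion can be illustrated with a small Monte Carlo sketch: multiple
+stochastic forward passes yield a predictive distribution whose epistemic part (mutual
+information) is thresholded to raise an alert. The simulated softmax samples and the
+threshold value are assumptions, not the paper's BNN or its calibration.
+```python
+import numpy as np
+
+def epistemic_uncertainty(prob_samples):
+    """Mutual information between prediction and model weights, estimated from
+    Monte Carlo forward passes; high values suggest a suspicious input
+    (generic sketch, not the paper's exact BNN formulation)."""
+    mean_p = prob_samples.mean(0)                               # (C,)
+    total = -(mean_p * np.log(mean_p + 1e-12)).sum()            # predictive entropy
+    aleatoric = -(prob_samples * np.log(prob_samples + 1e-12)).sum(1).mean()
+    return total - aleatoric                                    # epistemic part
+
+rng = np.random.default_rng(0)
+softmax = lambda z: np.exp(z - z.max()) / np.exp(z - z.max()).sum()
+clean = np.stack([softmax(np.array([4.0, 0.0, 0.0]) + rng.normal(0, 0.1, 3))
+                  for _ in range(30)])
+attacked = np.stack([softmax(rng.normal(0, 2.0, 3)) for _ in range(30)])
+tau = 0.05                                                      # alert threshold (tuned on validation data)
+for name, s in [("clean", clean), ("suspicious", attacked)]:
+    print(name, round(float(epistemic_uncertainty(s)), 3), epistemic_uncertainty(s) > tau)
+```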
+
+
+
+
+ + ☆ Selective Mixup Fine-Tuning for Optimizing Non-Decomposable Objectives ICLR 2024 + + +
+ The rise in internet usage has led to the generation of massive amounts of +data, resulting in the adoption of various supervised and semi-supervised +machine learning algorithms, which can effectively utilize the colossal amount +of data to train models. However, before deploying these models in the real +world, these must be strictly evaluated on performance measures like worst-case +recall and satisfy constraints such as fairness. We find that current +state-of-the-art empirical techniques offer sub-optimal performance on these +practical, non-decomposable performance objectives. On the other hand, the +theoretical techniques necessitate training a new model from scratch for each +performance objective. To bridge the gap, we propose SelMix, a selective +mixup-based inexpensive fine-tuning technique for pre-trained models, to +optimize for the desired objective. The core idea of our framework is to +determine a sampling distribution to perform a mixup of features between +samples from particular classes such that it optimizes the given objective. We +comprehensively evaluate our technique against the existing empirical and +theoretically principled methods on standard benchmark datasets for imbalanced +classification. We find that proposed SelMix fine-tuning significantly improves +the performance for various practical non-decomposable objectives across +benchmarks. + +
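+ The sampling-then-mixing step can be pictured with the toy function below, which
+draws class pairs from a given distribution and mixes features of samples from those
+classes. How that distribution is derived from the non-decomposable objective is the
+paper's contribution and is not reproduced; all names and values here are illustrative.
+```python
+import numpy as np
+
+def selmix_batch(feats, labels, pair_dist, lam=0.6, batch=32, seed=0):
+    """Draw class pairs (i, j) from `pair_dist` and mix features of samples
+    from those classes (hedged sketch of selective mixup, not the full method)."""
+    rng = np.random.default_rng(seed)
+    n_cls = pair_dist.shape[0]
+    pairs = rng.choice(n_cls * n_cls, size=batch, p=pair_dist.ravel())
+    mixed = []
+    for p in pairs:
+        i, j = divmod(int(p), n_cls)
+        a = rng.choice(np.where(labels == i)[0])
+        b = rng.choice(np.where(labels == j)[0])
+        mixed.append(lam * feats[a] + (1 - lam) * feats[b])     # feature mixup
+    return np.stack(mixed)
+
+rng = np.random.default_rng(1)
+feats = rng.normal(size=(300, 16))
+labels = rng.integers(0, 3, 300)
+dist = np.full((3, 3), 1 / 9)                                   # uniform pair distribution
+print(selmix_batch(feats, labels, dist).shape)                  # (32, 16)
+```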
+
+ comment: ICLR 2024 SpotLight +
+
+
+
+
+ + ☆ Multi-scale Unified Network for Image Classification + + +
+ Convolutional Neural Networks (CNNs) have advanced significantly in visual
+representation learning and recognition. However, they face notable challenges
+in performance and computational efficiency when dealing with real-world,
+multi-scale image inputs. Conventional methods rescale all input images into a
+fixed size, wherein a larger fixed size favors performance, but rescaling
+small-size images to a larger size incurs digitization noise and increased
+computation cost. In this work, we carry out a comprehensive, layer-wise
+investigation of CNN models in response to scale variation, based on Centered
+Kernel Alignment (CKA) analysis. The observations reveal that lower layers are
+more sensitive to input image scale variations than high-level layers. Inspired
+by this insight, we propose the Multi-scale Unified Network (MUSN), consisting
+of multi-scale subnets, a unified network, and a scale-invariant constraint.
+Our method divides the shallow layers into multi-scale subnets to enable
+feature extraction from multi-scale inputs, and the low-level features are
+unified in deep layers for extracting high-level semantic features. A
+scale-invariant constraint is posed to maintain feature consistency across
+different scales. Extensive experiments on ImageNet and other scale-diverse
+datasets demonstrate that MUSN achieves significant improvements in both model
+performance and computational efficiency. In particular, MUSN yields an
+accuracy increase of up to 44.53% and diminishes FLOPs by 7.01-16.13% in
+multi-scale scenarios.
+
+
+
+
+
+ + ☆ Efficient Test-Time Adaptation of Vision-Language Models CVPR 2024 + + +
+ Test-time adaptation with pre-trained vision-language models has attracted +increasing attention for tackling distribution shifts during the test time. +Though prior studies have achieved very promising performance, they involve +intensive computation which is severely unaligned with test-time adaptation. We +design TDA, a training-free dynamic adapter that enables effective and +efficient test-time adaptation with vision-language models. TDA works with a +lightweight key-value cache that maintains a dynamic queue with few-shot pseudo +labels as values and the corresponding test-sample features as keys. Leveraging +the key-value cache, TDA allows adapting to test data gradually via progressive +pseudo label refinement which is super-efficient without incurring any +backpropagation. In addition, we introduce negative pseudo labeling that +alleviates the adverse impact of pseudo label noises by assigning pseudo labels +to certain negative classes when the model is uncertain about its pseudo label +predictions. Extensive experiments over two benchmarks demonstrate TDA's +superior effectiveness and efficiency as compared with the state-of-the-art. +The code has been released in \url{https://kdiaaa.github.io/tda/}. + +
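+ A stripped-down version of such a training-free cache is sketched below: keys are
+normalized test features, values are the pseudo-labels implied by the zero-shot
+prediction, and cache affinities are added to the zero-shot logits. The entropy-based
+replacement and negative cache of the actual method are omitted, and the class count,
+capacity, and weighting factor are assumptions.
+```python
+import numpy as np
+from collections import defaultdict, deque
+
+class DynamicCache:
+    """Training-free key-value cache adapter (simplified sketch of the idea)."""
+    def __init__(self, n_classes, capacity=3, alpha=2.0):
+        self.n_classes, self.alpha = n_classes, alpha
+        self.queues = defaultdict(lambda: deque(maxlen=capacity))
+
+    def update(self, feat, zero_shot_logits):
+        pseudo = int(np.argmax(zero_shot_logits))          # pseudo-label as value
+        self.queues[pseudo].append(feat / np.linalg.norm(feat))
+
+    def logits(self, feat, zero_shot_logits):
+        feat = feat / np.linalg.norm(feat)
+        cache_logits = np.zeros(self.n_classes)
+        for c, q in self.queues.items():
+            if q:
+                cache_logits[c] = max(float(feat @ k) for k in q)
+        return zero_shot_logits + self.alpha * cache_logits   # no backpropagation
+
+rng = np.random.default_rng(0)
+cache = DynamicCache(n_classes=5)
+for _ in range(20):                                        # streaming test samples
+    f, zs = rng.normal(size=128), rng.normal(size=5)
+    cache.update(f, zs)
+    _ = cache.logits(f, zs)
+print("cached classes:", sorted(cache.queues))
+```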
+
+ comment: Accepted to CVPR 2024. The code has been released in + \url{https://kdiaaa.github.io/tda/} +
+
+
+
+
+ + ☆ Towards Non-Exemplar Semi-Supervised Class-Incremental Learning + + +
+ Deep neural networks perform remarkably well in close-world scenarios. +However, novel classes emerged continually in real applications, making it +necessary to learn incrementally. Class-incremental learning (CIL) aims to +gradually recognize new classes while maintaining the discriminability of old +ones. Existing CIL methods have two limitations: a heavy reliance on preserving +old data for forgetting mitigation and the need for vast labeled data for +knowledge adaptation. To overcome these issues, we propose a non-exemplar +semi-supervised CIL framework with contrastive learning and semi-supervised +incremental prototype classifier (Semi-IPC). On the one hand, contrastive +learning helps the model learn rich representations, easing the trade-off +between learning representations of new classes and forgetting that of old +classes. On the other hand, Semi-IPC learns a prototype for each class with +unsupervised regularization, enabling the model to incrementally learn from +partially labeled new data while maintaining the knowledge of old classes. +Experiments on benchmark datasets demonstrate the strong performance of our +method: without storing any old samples and only using less than 1% of labels, +Semi-IPC outperforms advanced exemplar-based methods. We hope our work offers +new insights for future CIL research. The code will be made publicly available. + +
+
+
+
+
+ + ☆ SGDM: Static-Guided Dynamic Module Make Stronger Visual Models + + +
+ The spatial attention mechanism has been widely used to improve object
+detection performance. However, its operation is currently limited to static
+convolutions lacking content-adaptive features. This paper innovatively
+approaches the problem from the perspective of dynamic convolution. We propose
+Razor Dynamic Convolution (RDConv) to address the two flaws of dynamic weight
+convolution that make it hard to use in a spatial mechanism: 1) it is
+computation-heavy; 2) when generating weights, spatial information is
+disregarded. Firstly, by using the Razor Operation to generate certain
+features, we vastly reduce the parameters of the entire dynamic convolution
+operation. Secondly, we add a spatial branch inside RDConv to generate
+convolutional kernel parameters with richer spatial information. Embedding
+dynamic convolution also brings the problem of sensitivity to high-frequency
+noise. We propose the Static-Guided Dynamic Module (SGDM) to address this
+limitation. By using SGDM, we utilize a set of asymmetric static convolution
+kernel parameters to guide the construction of dynamic convolution. We
+introduce the mechanism of shared weights in static convolution to solve the
+problem of dynamic convolution being sensitive to high-frequency noise.
+Extensive experiments illustrate that multiple different object detection
+backbones equipped with SGDM achieve a highly competitive boost in performance
+(e.g., +4% mAP with YOLOv5n on VOC and +1.7% mAP with YOLOv8n on COCO) with a
+negligible parameter increase (i.e., +0.33M on YOLOv5n and +0.19M on YOLOv8n).
+
+
+ comment: 16 pages, 4 figures +
+
+
+
+
+ + ☆ AIR-HLoc: Adaptive Image Retrieval for Efficient Visual Localisation + + +
+ State-of-the-art (SOTA) hierarchical localisation pipelines (HLoc) rely on +image retrieval (IR) techniques to establish 2D-3D correspondences by selecting +the $k$ most similar images from a reference image database for a given query +image. Although higher values of $k$ enhance localisation robustness, the +computational cost for feature matching increases linearly with $k$. In this +paper, we observe that queries that are the most similar to images in the +database result in a higher proportion of feature matches and, thus, more +accurate positioning. Thus, a small number of images is sufficient for queries +very similar to images in the reference database. We then propose a novel +approach, AIR-HLoc, which divides query images into different localisation +difficulty levels based on their similarity to the reference image database. We +consider an image with high similarity to the reference image as an easy query +and an image with low similarity as a hard query. Easy queries show a limited +improvement in accuracy when increasing $k$. Conversely, higher values of $k$ +significantly improve accuracy for hard queries. Given the limited improvement +in accuracy when increasing $k$ for easy queries and the significant +improvement for hard queries, we adapt the value of $k$ to the query's +difficulty level. Therefore, AIR-HLoc optimizes processing time by adaptively +assigning different values of $k$ based on the similarity between the query and +reference images without losing accuracy. Our extensive experiments on the +Cambridge Landmarks, 7Scenes, and Aachen Day-Night-v1.1 datasets demonstrate +our algorithm's efficacy, reducing 30\%, 26\%, and 11\% in computational +overhead while maintaining SOTA accuracy compared to HLoc with fixed image +retrieval. + +
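+ The adaptive-$k$ policy can be summarized in a few lines: retrieve fewer database
+images when the query's global descriptor is already very close to the database, and
+more when it is not. The thresholds and $k$ values in this sketch are illustrative
+placeholders, not the calibrated values from the paper.
+```python
+import numpy as np
+
+def adaptive_k(query_desc, db_descs, k_easy=3, k_medium=10, k_hard=20,
+               easy_thr=0.85, hard_thr=0.65):
+    """Pick how many reference images to match against, based on how similar
+    the query's global descriptor is to the database (illustrative thresholds)."""
+    q = query_desc / np.linalg.norm(query_desc)
+    db = db_descs / np.linalg.norm(db_descs, axis=1, keepdims=True)
+    sims = db @ q
+    order = np.argsort(-sims)
+    best = float(sims[order[0]])
+    k = k_easy if best >= easy_thr else k_medium if best >= hard_thr else k_hard
+    return order[:k], best
+
+rng = np.random.default_rng(0)
+db = rng.normal(size=(1000, 256))
+query = db[42] + rng.normal(0, 0.05, 256)             # an "easy" query near a DB image
+idx, sim = adaptive_k(query, db)
+print(len(idx), round(sim, 3))
+```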
+
+
+
+
+ + ☆ DVLO: Deep Visual-LiDAR Odometry with Local-to-Global Feature Fusion and + Bi-Directional Structure Alignment + + +
+ Information in visual and LiDAR data is highly complementary, derived from the
+fine-grained texture of images and the massive geometric information in point
+clouds. However, it remains challenging to explore effective visual-LiDAR
+fusion, mainly due to the intrinsic data structure inconsistency between the
+two modalities: images are regular and dense, but LiDAR points are unordered
+and sparse. To address the problem, we propose a local-to-global fusion network
+with bi-directional structure alignment. To obtain locally fused features, we
+project points onto the image plane as cluster centers and cluster image pixels
+around each center. Image pixels are pre-organized as pseudo points for
+image-to-point structure alignment. Then, we convert points to pseudo images by
+cylindrical projection (point-to-image structure alignment) and perform
+adaptive global feature fusion between point features and the locally fused
+features. Our method achieves state-of-the-art performance on the KITTI
+odometry and FlyingThings3D scene flow datasets compared to both single-modal
+and multi-modal methods. Codes will be released later.
+
+
+
+
+
+ + ☆ Unleashing the Potential of SAM for Medical Adaptation via Hierarchical + Decoding CVPR 2024 + + +
+ The Segment Anything Model (SAM) has garnered significant attention for its +versatile segmentation abilities and intuitive prompt-based interface. However, +its application in medical imaging presents challenges, requiring either +substantial training costs and extensive medical datasets for full model +fine-tuning or high-quality prompts for optimal performance. This paper +introduces H-SAM: a prompt-free adaptation of SAM tailored for efficient +fine-tuning of medical images via a two-stage hierarchical decoding procedure. +In the initial stage, H-SAM employs SAM's original decoder to generate a prior +probabilistic mask, guiding a more intricate decoding process in the second +stage. Specifically, we propose two key designs: 1) A class-balanced, +mask-guided self-attention mechanism addressing the unbalanced label +distribution, enhancing image embedding; 2) A learnable mask cross-attention +mechanism spatially modulating the interplay among different image regions +based on the prior mask. Moreover, the inclusion of a hierarchical pixel +decoder in H-SAM enhances its proficiency in capturing fine-grained and +localized details. This approach enables SAM to effectively integrate learned +medical priors, facilitating enhanced adaptation for medical image segmentation +with limited samples. Our H-SAM demonstrates a 4.78% improvement in average +Dice compared to existing prompt-free SAM variants for multi-organ segmentation +using only 10% of 2D slices. Notably, without using any unlabeled data, H-SAM +even outperforms state-of-the-art semi-supervised models relying on extensive +unlabeled training data across various medical datasets. Our code is available +at https://github.com/Cccccczh404/H-SAM. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Image Deraining via Self-supervised Reinforcement Learning + + +
+ The quality of images captured outdoors is often affected by the weather. One +factor that interferes with sight is rain, which can obstruct the view of +observers and computer vision applications that rely on those images. The work +aims to recover rain images by removing rain streaks via Self-supervised +Reinforcement Learning (RL) for image deraining (SRL-Derain). We locate rain +streak pixels from the input rain image via dictionary learning and use +pixel-wise RL agents to take multiple inpainting actions to remove rain +progressively. To our knowledge, this work is the first attempt where +self-supervised RL is applied to image deraining. Experimental results on +several benchmark image-deraining datasets show that the proposed SRL-Derain +performs favorably against state-of-the-art few-shot and self-supervised +deraining and denoising methods. + +
+
+
+
+
+ + ☆ Branch-Tuning: Balancing Stability and Plasticity for Continual + Self-Supervised Learning + + +
+ Self-supervised learning (SSL) has emerged as an effective paradigm for
+deriving general representations from vast amounts of unlabeled data. However,
+as real-world applications continually integrate new content, the high
+computational and resource demands of SSL necessitate continual learning
+rather than complete retraining. This poses a challenge in striking a balance
+between stability and plasticity when adapting to new information. In this
+paper, we employ Centered Kernel Alignment to quantitatively analyze model
+stability and plasticity, revealing the critical roles of batch normalization
+layers for stability and of convolutional layers for plasticity. Motivated by
+this, we propose Branch-tuning, an efficient and straightforward method that
+achieves a balance between stability and plasticity in continual SSL.
+Branch-tuning consists of branch expansion and compression, and can be easily
+applied to various SSL methods without modifying the original methods or
+retaining old data or models. We validate our method through incremental
+experiments on various benchmark datasets, demonstrating its effectiveness and
+practical value in real-world scenarios. We hope our work offers new insights
+for future continual self-supervised learning research. The code will be made
+publicly available.
+
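+ The analysis above uses Centered Kernel Alignment (CKA) to compare layer
+representations before and after adaptation. A minimal numpy sketch of linear
+CKA between two activation matrices follows; the abstract does not state which
+CKA variant is used, so the linear form here is an assumption.
+
+  import numpy as np
+
+  def linear_cka(X, Y):
+      # X, Y: (n_samples, dim) activations of the same inputs from two layers/models.
+      X = X - X.mean(axis=0, keepdims=True)   # center the features
+      Y = Y - Y.mean(axis=0, keepdims=True)
+      cross = np.linalg.norm(Y.T @ X, ord="fro") ** 2
+      return cross / (np.linalg.norm(X.T @ X, ord="fro") * np.linalg.norm(Y.T @ Y, ord="fro"))
+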
+
+
+
+
+ + ☆ Toward Interactive Regional Understanding in Vision-Large Language + Models NAACL 2024 + + +
+ Recent Vision-Language Pre-training (VLP) models have demonstrated
+significant advancements. Nevertheless, these models heavily rely on
+image-text pairs that capture only coarse and global information of an image,
+leading to a limitation in their regional understanding ability. In this work,
+we introduce RegionVLM, a model equipped with explicit regional modeling
+capabilities, allowing it to understand user-indicated image regions. To
+achieve this, we design a simple yet innovative architecture, requiring no
+modifications to the model architecture or objective function. Additionally,
+we leverage a dataset that contains a novel source of information, namely
+Localized Narratives, which has been overlooked in previous VLP research. Our
+experiments demonstrate that our single generalist model not only achieves an
+interactive dialogue system but also exhibits superior performance on various
+zero-shot region understanding tasks, without compromising its ability for
+global image understanding.
+
+
+ comment: NAACL 2024 Main Conference +
+
+
+
+
+ + ☆ Enhancing Generative Class Incremental Learning Performance with Model + Forgetting Approach + + +
+ This study presents a novel approach to Generative Class Incremental Learning
+(GCIL) by introducing a forgetting mechanism, aimed at dynamically managing
+class information for better adaptation to streaming data. GCIL, the continual
+learning of generative models, is an active topic in computer vision with
+clear practical importance. The ability to forget is a crucial brain function
+that facilitates continual learning in humans by selectively discarding less
+relevant information. In machine learning models, however, the concept of
+intentional forgetting has not been extensively investigated. In this study,
+we aim to bridge this gap by incorporating forgetting mechanisms into GCIL and
+examining their impact on the models' ability to learn continually. Through
+our experiments, we find that integrating forgetting mechanisms significantly
+enhances the models' performance in acquiring new knowledge, underscoring the
+positive role that strategic forgetting plays in continual learning.
+
+
+
+
+
+ + ☆ Beyond Embeddings: The Promise of Visual Table in Multi-Modal Models + + +
+ Visual representation learning has been a cornerstone in computer vision, +evolving from supervised learning with human-annotated labels to aligning +image-text pairs from the Internet. Despite recent advancements in multi-modal +large language models (MLLMs), the visual representations they rely on, such as +CLIP embeddings, often lack access to external world knowledge critical for +real-world visual reasoning. In this work, we propose Visual Table, a novel +visual representation tailored for MLLMs. It provides hierarchical text +descriptions of holistic visual scenes, consisting of a scene description and +multiple object-centric descriptions that encompass categories, attributes, and +knowledge at instance level. We further develop a scalable generator for visual +table generation and train it on small-scale annotations from GPT4V. Extensive +evaluations demonstrate that, with generated visual tables as additional visual +representations, our model can consistently outperform the state-of-the-art +(SOTA) MLLMs across diverse benchmarks. When visual tables serve as standalone +visual representations, our model can closely match or even beat the SOTA MLLMs +that are built on CLIP visual embeddings. Our code is available at +https://github.com/LaVi-Lab/Visual-Table. + +
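+ The paper does not publish its exact schema, but a visual table as described
+above could be sketched as a small hierarchical data structure that serializes
+to text for the MLLM; field names here are illustrative only.
+
+  from dataclasses import dataclass, field
+  from typing import List
+
+  @dataclass
+  class ObjectEntry:
+      category: str
+      attributes: List[str] = field(default_factory=list)
+      knowledge: List[str] = field(default_factory=list)  # instance-level world knowledge
+
+  @dataclass
+  class VisualTable:
+      scene_description: str
+      objects: List[ObjectEntry] = field(default_factory=list)
+
+      def to_prompt(self) -> str:
+          # Serialize the table into text fed to the MLLM alongside (or instead of) CLIP embeddings.
+          lines = [f"Scene: {self.scene_description}"]
+          for i, obj in enumerate(self.objects):
+              lines.append(f"Object {i}: {obj.category}; attributes: {', '.join(obj.attributes)}; "
+                           f"knowledge: {', '.join(obj.knowledge)}")
+          return "\n".join(lines)
+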
+
+ comment: Project page: https://github.com/LaVi-Lab/Visual-Table +
+
+
+
+
+ + ☆ NeuSDFusion: A Spatial-Aware Generative Model for 3D Shape Completion, + Reconstruction, and Generation + + +
+ 3D shape generation aims to produce innovative 3D content adhering to +specific conditions and constraints. Existing methods often decompose 3D shapes +into a sequence of localized components, treating each element in isolation +without considering spatial consistency. As a result, these approaches exhibit +limited versatility in 3D data representation and shape generation, hindering +their ability to generate highly diverse 3D shapes that comply with the +specified constraints. In this paper, we introduce a novel spatial-aware 3D +shape generation framework that leverages 2D plane representations for enhanced +3D shape modeling. To ensure spatial coherence and reduce memory usage, we +incorporate a hybrid shape representation technique that directly learns a +continuous signed distance field representation of the 3D shape using +orthogonal 2D planes. Additionally, we meticulously enforce spatial +correspondences across distinct planes using a transformer-based autoencoder +structure, promoting the preservation of spatial relationships in the generated +3D shapes. This yields an algorithm that consistently outperforms +state-of-the-art 3D shape generation methods on various tasks, including +unconditional shape generation, multi-modal shape completion, single-view +reconstruction, and text-to-shape synthesis. + +
+
+
+
+
+ + ☆ TAFormer: A Unified Target-Aware Transformer for Video and Motion Joint + Prediction in Aerial Scenes + + +
+ As drone technology advances, using unmanned aerial vehicles for aerial +surveys has become the dominant trend in modern low-altitude remote sensing. +The surge in aerial video data necessitates accurate prediction for future +scenarios and motion states of the interested target, particularly in +applications like traffic management and disaster response. Existing video +prediction methods focus solely on predicting future scenes (video frames), +suffering from the neglect of explicitly modeling target's motion states, which +is crucial for aerial video interpretation. To address this issue, we introduce +a novel task called Target-Aware Aerial Video Prediction, aiming to +simultaneously predict future scenes and motion states of the target. Further, +we design a model specifically for this task, named TAFormer, which provides a +unified modeling approach for both video and target motion states. +Specifically, we introduce Spatiotemporal Attention (STA), which decouples the +learning of video dynamics into spatial static attention and temporal dynamic +attention, effectively modeling the scene appearance and motion. Additionally, +we design an Information Sharing Mechanism (ISM), which elegantly unifies the +modeling of video and target motion by facilitating information interaction +through two sets of messenger tokens. Moreover, to alleviate the difficulty of +distinguishing targets in blurry predictions, we introduce Target-Sensitive +Gaussian Loss (TSGL), enhancing the model's sensitivity to both target's +position and content. Extensive experiments on UAV123VP and VisDroneVP (derived +from single-object tracking datasets) demonstrate the exceptional performance +of TAFormer in target-aware video prediction, showcasing its adaptability to +the additional requirements of aerial video interpretation for target +awareness. + +
+
+ comment: 17 pages, 9 figures +
+
+
+
+
+ + ☆ Benchmarking Image Transformers for Prostate Cancer Detection from + Ultrasound Data SP + + +
+ PURPOSE: Deep learning methods for classifying prostate cancer (PCa) in +ultrasound images typically employ convolutional networks (CNNs) to detect +cancer in small regions of interest (ROI) along a needle trace region. However, +this approach suffers from weak labelling, since the ground-truth +histopathology labels do not describe the properties of individual ROIs. +Recently, multi-scale approaches have sought to mitigate this issue by +combining the context awareness of transformers with a CNN feature extractor to +detect cancer from multiple ROIs using multiple-instance learning (MIL). In +this work, we present a detailed study of several image transformer +architectures for both ROI-scale and multi-scale classification, and a +comparison of the performance of CNNs and transformers for ultrasound-based +prostate cancer classification. We also design a novel multi-objective learning +strategy that combines both ROI and core predictions to further mitigate label +noise. METHODS: We evaluate 3 image transformers on ROI-scale cancer +classification, then use the strongest model to tune a multi-scale classifier +with MIL. We train our MIL models using our novel multi-objective learning +strategy and compare our results to existing baselines. RESULTS: We find that +for both ROI-scale and multi-scale PCa detection, image transformer backbones +lag behind their CNN counterparts. This deficit in performance is even more +noticeable for larger models. When using multi-objective learning, we can +improve performance of MIL, with a 77.9% AUROC, a sensitivity of 75.9%, and a +specificity of 66.3%. CONCLUSION: Convolutional networks are better suited for +modelling sparse datasets of prostate ultrasounds, producing more robust +features than transformers in PCa detection. Multi-scale methods remain the +best architecture for this task, with multi-objective learning presenting an +effective way to improve performance. + +
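+ The abstract does not spell out the multi-objective loss; one plausible form,
+with all names hypothetical, combines a bag-level (core) loss with an
+ROI-level loss that reuses the weak core label for every ROI.
+
+  import torch.nn.functional as F
+
+  def multi_objective_loss(roi_logits, core_logit, core_label, alpha=0.5):
+      # roi_logits: (num_rois,) per-ROI logits; core_logit, core_label: (1,) float tensors.
+      core_loss = F.binary_cross_entropy_with_logits(core_logit, core_label)
+      roi_targets = core_label.expand_as(roi_logits)   # weak histopathology label broadcast to ROIs
+      roi_loss = F.binary_cross_entropy_with_logits(roi_logits, roi_targets)
+      return alpha * core_loss + (1 - alpha) * roi_loss
+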
+
+ comment: early draft, 7 pages; Accepted to SPIE Medical Imaging 2024 +
+
+
+
+
+ + ☆ Fourier or Wavelet bases as counterpart self-attention in spikformer for + efficient visual classification + + +
+ The energy-efficient spikformer integrates the biologically plausible spiking
+neural network (SNN) with the artificial Transformer, using Spiking
+Self-Attention (SSA) to achieve both higher accuracy and lower computational
+cost. However, self-attention is not always necessary, especially under
+sparse, spike-form computation. In this paper, we replace vanilla SSA (which
+uses dynamic bases calculated from Query and Key) with spike-form Fourier
+Transform, Wavelet Transform, and their combinations (which use fixed
+triangular or wavelet bases), based on the key hypothesis that both rely on a
+set of basis functions for information transformation. Hence, the
+Fourier-or-Wavelet-based spikformer (FWformer) is proposed and verified on
+visual classification tasks, including both static image and event-based video
+datasets. The FWformer achieves comparable or even higher accuracies
+($0.4\%$-$1.5\%$), higher running speed ($9\%$-$51\%$ for training and
+$19\%$-$70\%$ for inference), reduced theoretical energy consumption
+($20\%$-$25\%$), and reduced GPU memory usage ($4\%$-$26\%$), compared to the
+standard spikformer. Our results indicate that the continued refinement of new
+Transformers, inspired either by biological discovery (spike-form computation)
+or by information theory (Fourier or Wavelet transforms), is promising.
+
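+ Setting the spiking details aside, the core substitution above (fixed Fourier
+bases in place of learned attention) can be illustrated with an FNet-style,
+dense token-mixing layer; this is an analogy, not the paper's spike-form
+operator.
+
+  import torch
+  import torch.nn as nn
+
+  class FourierMixing(nn.Module):
+      # Parameter-free token mixing: a 2D FFT over the (sequence, hidden) axes,
+      # keeping the real part, stands in for the Query/Key/Value self-attention block.
+      def forward(self, x):            # x: (batch, seq_len, hidden)
+          return torch.fft.fft2(x, dim=(-2, -1)).real
+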
+
+ comment: 18 pages, 2 figures. arXiv admin note: substantial text overlap with + arXiv:2308.02557 +
+
+
+
+
+ + ☆ NeuroPictor: Refining fMRI-to-Image Reconstruction via Multi-individual + Pretraining and Multi-level Modulation + + +
+ Recent fMRI-to-image approaches have mainly focused on associating fMRI
+signals with specific conditions of pre-trained diffusion models. These
+approaches, while producing high-quality images, capture only a limited aspect
+of the complex information in fMRI signals and offer little detailed control
+over image creation. In contrast, this paper proposes to directly modulate the
+generation process of diffusion models using fMRI signals. Our approach,
+NeuroPictor, divides the fMRI-to-image process into three steps: i) fMRI
+calibrated-encoding, to tackle multi-individual pre-training for a shared
+latent space that minimizes individual differences and enables the subsequent
+cross-subject training; ii) fMRI-to-image cross-subject pre-training,
+perceptually learning to guide the diffusion model with high- and low-level
+conditions across different individuals; iii) fMRI-to-image single-subject
+refining, similar to step ii but focused on adapting to a particular
+individual. NeuroPictor extracts high-level semantic features from fMRI
+signals that characterize the visual stimulus and incrementally fine-tunes the
+diffusion model with a low-level manipulation network to provide precise
+structural instructions. By training with over 60,000 fMRI-image pairs from
+various individuals, our model enjoys superior fMRI-to-image decoding
+capacity, particularly in the within-subject setting, as evidenced on
+benchmark datasets. Project page: https://jingyanghuo.github.io/neuropictor/.
+
+
+
+
+
+ + ☆ An Evolutionary Network Architecture Search Framework with Adaptive + Multimodal Fusion for Hand Gesture Recognition + + +
+ Hand gesture recognition (HGR) based on multimodal data has attracted
+considerable attention owing to its great potential in applications. Various
+manually designed multimodal deep networks have performed well in multimodal
+HGR (MHGR), but most existing algorithms require substantial expert experience
+and time-consuming manual trials. To address these issues, we propose an
+evolutionary network architecture search framework with adaptive multimodal
+fusion (AMF-ENAS). Specifically, we design an encoding space that
+simultaneously considers fusion positions and ratios of the multimodal data,
+allowing for the automatic construction of multimodal networks with different
+architectures through decoding. Additionally, we consider three input streams
+corresponding to intra-modal surface electromyography (sEMG), intra-modal
+accelerometer (ACC), and inter-modal sEMG-ACC. To automatically adapt to
+various datasets, the ENAS framework is designed to automatically search for
+an MHGR network with appropriate fusion positions and ratios. To the best of
+our knowledge, this is the first time that ENAS has been utilized in MHGR to
+tackle issues related to the fusion position and ratio of multimodal data.
+Experimental results demonstrate that AMF-ENAS achieves state-of-the-art
+performance on the Ninapro DB2, DB3, and DB7 datasets.
+
+
+
+
+
+ + ☆ Road Obstacle Detection based on Unknown Objectness Scores ICRA 2024 + + +
+ The detection of unknown traffic obstacles is vital to ensure safe autonomous
+driving. Standard object-detection methods cannot identify unknown objects
+that fall outside the predefined categories, because they are trained to
+assign a background label to pixels corresponding to unknown objects. To
+address this problem, the pixel-wise anomaly-detection approach has attracted
+increased research attention. Anomaly-detection techniques, such as
+uncertainty estimation and perceptual difference from reconstructed images,
+make it possible to identify pixels of unknown objects as out-of-distribution
+(OoD) samples. However, when applied to images with many unknowns and complex
+components, such as driving scenes, these methods often exhibit unstable
+performance. The purpose of this study is to achieve stable performance for
+detecting unknown objects by incorporating object-detection principles into
+pixel-wise anomaly-detection methods. To achieve this goal, we adopt a
+semantic-segmentation network with a sigmoid head that simultaneously provides
+pixel-wise anomaly scores and objectness scores. Our experimental results show
+that the objectness scores play an important role in improving detection
+performance. Based on these results, we propose a novel anomaly score that
+integrates the two, which we term the unknown objectness score. Quantitative
+evaluations show that the proposed method outperforms state-of-the-art methods
+on publicly available datasets.
+
+
+ comment: ICRA 2024 +
+
+
+
+
+ + ☆ Few-shot Online Anomaly Detection and Segmentation + + +
+ Detecting anomaly patterns from images is a crucial artificial intelligence +technique in industrial applications. Recent research in this domain has +emphasized the necessity of a large volume of training data, overlooking the +practical scenario where, post-deployment of the model, unlabeled data +containing both normal and abnormal samples can be utilized to enhance the +model's performance. Consequently, this paper focuses on addressing the +challenging yet practical few-shot online anomaly detection and segmentation +(FOADS) task. Under the FOADS framework, models are trained on a few-shot +normal dataset, followed by inspection and improvement of their capabilities by +leveraging unlabeled streaming data containing both normal and abnormal samples +simultaneously. + To tackle this issue, we propose modeling the feature distribution of normal +images using a Neural Gas network, which offers the flexibility to adapt the +topology structure to identify outliers in the data flow. In order to achieve +improved performance with limited training samples, we employ multi-scale +feature embedding extracted from a CNN pre-trained on ImageNet to obtain a +robust representation. Furthermore, we introduce an algorithm that can +incrementally update parameters without the need to store previous samples. +Comprehensive experimental results demonstrate that our method can achieve +substantial performance under the FOADS setting, while ensuring that the time +complexity remains within an acceptable range on MVTec AD and BTAD datasets. + +
+
+
+
+
+ + ☆ Generative Medical Segmentation + + +
+ Rapid advancements in medical image segmentation performance have been +significantly driven by the development of Convolutional Neural Networks (CNNs) +and Vision Transformers (ViTs). However, these models introduce high +computational demands and often have limited ability to generalize across +diverse medical imaging datasets. In this manuscript, we introduce Generative +Medical Segmentation (GMS), a novel approach leveraging a generative model for +image segmentation. Concretely, GMS employs a robust pre-trained Variational +Autoencoder (VAE) to derive latent representations of both images and masks, +followed by a mapping model that learns the transition from image to mask in +the latent space. This process culminates in generating a precise segmentation +mask within the image space using the pre-trained VAE decoder. The design of +GMS leads to fewer learnable parameters in the model, resulting in a reduced +computational burden and enhanced generalization capability. Our extensive +experimental analysis across five public datasets in different medical imaging +domains demonstrates GMS outperforms existing discriminative segmentation +models and has remarkable domain generalization. Our experiments suggest GMS +could set a new benchmark for medical image segmentation, offering a scalable +and effective solution. GMS implementation and model weights are available at +https://github.com/King-HAW/GMS. + +
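+ At inference time, the pipeline described above reduces to three calls: a
+frozen VAE encoder, a small trainable latent mapper, and a frozen VAE decoder.
+A schematic sketch follows; module names and the mapper architecture are
+assumptions, not the released implementation.
+
+  import torch
+  import torch.nn as nn
+
+  class LatentMapper(nn.Module):
+      # The only trainable component: maps image latents to mask latents.
+      def __init__(self, dim):
+          super().__init__()
+          self.net = nn.Sequential(nn.Conv2d(dim, dim, 3, padding=1), nn.GELU(),
+                                   nn.Conv2d(dim, dim, 3, padding=1))
+      def forward(self, z):
+          return self.net(z)
+
+  @torch.no_grad()
+  def segment(image, vae_encoder, mapper, vae_decoder):
+      z_img = vae_encoder(image)             # frozen pre-trained encoder
+      z_mask = mapper(z_img)                 # learned image-latent -> mask-latent mapping
+      return vae_decoder(z_mask).sigmoid()   # frozen pre-trained decoder -> mask probabilities
+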
+
+
+
+
+ + ☆ Looking Beyond What You See: An Empirical Analysis on Subgroup + Intersectional Fairness for Multi-label Chest X-ray Classification Using + Social Determinants of Racial Health Inequities ICCV + + +
+ There has been significant progress in implementing deep learning models in
+disease diagnosis using chest X-rays. Despite these advancements, inherent
+biases in these models can lead to disparities in prediction accuracy across
+protected groups. In this study, we propose a framework to achieve accurate
+diagnostic outcomes and ensure fairness across intersectional groups in
+high-dimensional chest X-ray multi-label classification. Transcending
+traditional protected attributes, we consider complex interactions within
+social determinants, enabling a more granular benchmark and evaluation of
+fairness. We present a simple and robust method that involves retraining the
+last classification layer of pre-trained models using a balanced dataset
+across groups. Additionally, we account for fairness constraints and integrate
+class-balanced fine-tuning for multi-label settings. The evaluation of our
+method on the MIMIC-CXR dataset demonstrates that our framework achieves an
+optimal tradeoff between accuracy and fairness compared to baseline methods.
+
+
+ comment: ICCV CVAMD 2023 +
+
+
+
+
+ + ☆ Middle Fusion and Multi-Stage, Multi-Form Prompts for Robust RGB-T + Tracking + + +
+ RGB-T tracking, a vital downstream task of object tracking, has made +remarkable progress in recent years. Yet, it remains hindered by two major +challenges: 1) the trade-off between performance and efficiency; 2) the +scarcity of training data. To address the latter challenge, some recent methods +employ prompts to fine-tune pre-trained RGB tracking models and leverage +upstream knowledge in a parameter-efficient manner. However, these methods +inadequately explore modality-independent patterns and disregard the dynamic +reliability of different modalities in open scenarios. We propose M3PT, a novel +RGB-T prompt tracking method that leverages middle fusion and multi-modal and +multi-stage visual prompts to overcome these challenges. We pioneer the use of +the middle fusion framework for RGB-T tracking, which achieves a balance +between performance and efficiency. Furthermore, we incorporate the pre-trained +RGB tracking model into the framework and utilize multiple flexible prompt +strategies to adapt the pre-trained model to the comprehensive exploration of +uni-modal patterns and the improved modeling of fusion-modal features, +harnessing the potential of prompt learning in RGB-T tracking. Our method +outperforms the state-of-the-art methods on four challenging benchmarks, while +attaining 46.1 fps inference speed. + +
+
+
+
+
+ + ☆ LayoutFlow: Flow Matching for Layout Generation + + +
+ Finding a suitable layout represents a crucial task for diverse applications +in graphic design. Motivated by simpler and smoother sampling trajectories, we +explore the use of Flow Matching as an alternative to current diffusion-based +layout generation models. Specifically, we propose LayoutFlow, an efficient +flow-based model capable of generating high-quality layouts. Instead of +progressively denoising the elements of a noisy layout, our method learns to +gradually move, or flow, the elements of an initial sample until it reaches its +final prediction. In addition, we employ a conditioning scheme that allows us +to handle various generation tasks with varying degrees of conditioning with a +single model. Empirically, LayoutFlow performs on par with state-of-the-art +models while being significantly faster. + +
+
+
+
+
+ + ☆ Don't Look into the Dark: Latent Codes for Pluralistic Image Inpainting + + +
+ We present a method for large-mask pluralistic image inpainting based on the
+generative framework of discrete latent codes. Our method learns latent
+priors, discretized as tokens, by only performing computations at the visible
+locations of the image. This is realized by a restrictive partial encoder that
+predicts the token label for each visible block, a bidirectional transformer
+that infers the missing labels by only looking at these tokens, and a
+dedicated synthesis network that couples the tokens with the partial image
+priors to generate coherent and pluralistic complete images even under extreme
+mask settings. Experiments on public benchmarks validate our design choices,
+as the proposed method outperforms strong baselines in both visual quality and
+diversity metrics.
+
+
+ comment: CVPR 2024
+
+
+
+
+
+ + ☆ Multi-Layer Dense Attention Decoder for Polyp Segmentation + + +
+ Detecting and segmenting polyps is crucial for expediting the diagnosis of +colon cancer. This is a challenging task due to the large variations of polyps +in color, texture, and lighting conditions, along with subtle differences +between the polyp and its surrounding area. Recently, vision Transformers have +shown robust abilities in modeling global context for polyp segmentation. +However, they face two major limitations: the inability to learn local +relations among multi-level layers and inadequate feature aggregation in the +decoder. To address these issues, we propose a novel decoder architecture aimed +at hierarchically aggregating locally enhanced multi-level dense features. +Specifically, we introduce a novel module named Dense Attention Gate (DAG), +which adaptively fuses all previous layers' features to establish local feature +relations among all layers. Furthermore, we propose a novel nested decoder +architecture that hierarchically aggregates decoder features, thereby enhancing +semantic features. We incorporate our novel dense decoder with the PVT backbone +network and conduct evaluations on five polyp segmentation datasets: Kvasir, +CVC-300, CVC-ColonDB, CVC-ClinicDB, and ETIS. Our experiments and comparisons +with nine competing segmentation models demonstrate that the proposed +architecture achieves state-of-the-art performance and outperforms the previous +models on four datasets. The source code is available at: +https://github.com/krushi1992/Dense-Decoder. + +
+
+
+
+
+ + ☆ Online Embedding Multi-Scale CLIP Features into 3D Maps + + +
+ This study introduces a novel approach to online embedding of multi-scale
+CLIP (Contrastive Language-Image Pre-Training) features into 3D maps. By
+harnessing CLIP, this methodology surpasses the constraints of conventional
+vocabulary-limited methods and enables the incorporation of semantic
+information into the resultant maps. While recent approaches have explored the
+embedding of multi-modal features in maps, they often impose significant
+computational costs, lacking practicality for exploring unfamiliar
+environments in real time. Our approach tackles these challenges by
+efficiently computing and embedding multi-scale CLIP features, thereby
+facilitating the exploration of unfamiliar environments through real-time map
+generation. Moreover, embedding CLIP features into the resultant maps makes
+offline retrieval via linguistic queries feasible. In essence, our approach
+simultaneously achieves real-time object search and mapping of unfamiliar
+environments. Additionally, we propose a zero-shot object-goal navigation
+system based on our mapping approach, and we validate its efficacy through
+object-goal navigation, offline object retrieval, and multi-object-goal
+navigation in both simulated environments and real robot experiments. The
+findings demonstrate that our method not only exhibits swifter performance
+than state-of-the-art mapping methods but also surpasses them in terms of the
+success rate of object-goal navigation tasks.
+
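+ A minimal sketch of the offline linguistic retrieval step mentioned above,
+assuming the map stores an L2-normalized CLIP image feature per location and
+the query is encoded with the CLIP text encoder (names are hypothetical):
+
+  import numpy as np
+
+  def query_map(map_features, map_positions, text_feature, top_k=5):
+      # map_features: (N, D) L2-normalized CLIP features embedded in the 3D map
+      # map_positions: (N, 3) corresponding 3D coordinates
+      # text_feature:  (D,)   L2-normalized CLIP text feature of the query
+      scores = map_features @ text_feature        # cosine similarity per location
+      best = np.argsort(-scores)[:top_k]
+      return map_positions[best], scores[best]
+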
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ LITA: Language Instructed Temporal-Localization Assistant + + +
+ There has been tremendous progress in multimodal Large Language Models +(LLMs). Recent works have extended these models to video input with promising +instruction following capabilities. However, an important missing piece is +temporal localization. These models cannot accurately answer the "When?" +questions. We identify three key aspects that limit their temporal localization +capabilities: (i) time representation, (ii) architecture, and (iii) data. We +address these shortcomings by proposing Language Instructed +Temporal-Localization Assistant (LITA) with the following features: (1) We +introduce time tokens that encode timestamps relative to the video length to +better represent time in videos. (2) We introduce SlowFast tokens in the +architecture to capture temporal information at fine temporal resolution. (3) +We emphasize temporal localization data for LITA. In addition to leveraging +existing video datasets with timestamps, we propose a new task, Reasoning +Temporal Localization (RTL), along with the dataset, ActivityNet-RTL, for +learning and evaluating this task. Reasoning temporal localization requires +both the reasoning and temporal localization of Video LLMs. LITA demonstrates +strong performance on this challenging task, nearly doubling the temporal mean +intersection-over-union (mIoU) of baselines. In addition, we show that our +emphasis on temporal localization also substantially improves video-based text +generation compared to existing Video LLMs, including a 36% relative +improvement of Temporal Understanding. Code is available at: +https://github.com/NVlabs/LITA + +
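+ The time-token idea above can be sketched in a few lines: timestamps are
+quantized relative to the video length into a small discrete vocabulary. The
+choice of 100 tokens here is an arbitrary assumption, not LITA's configuration.
+
+  def timestamp_to_time_token(t_seconds, video_length, num_time_tokens=100):
+      # Quantize an absolute timestamp into a token index relative to video length.
+      rel = min(max(t_seconds / video_length, 0.0), 1.0)
+      return min(int(rel * num_time_tokens), num_time_tokens - 1)
+
+  def time_token_to_timestamp(token, video_length, num_time_tokens=100):
+      # Map a time-token index back to the center of its bin, in seconds.
+      return (token + 0.5) / num_time_tokens * video_length
+
+  # e.g. timestamp_to_time_token(30.0, video_length=120.0) -> 25
+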
+
+
+
+
+ + ☆ Illicit object detection in X-ray images using Vision Transformers + + +
+ Illicit object detection is a critical task performed at various +high-security locations, including airports, train stations, subways, and +ports. The continuous and tedious work of examining thousands of X-ray images +per hour can be mentally taxing. Thus, Deep Neural Networks (DNNs) can be used +to automate the X-ray image analysis process, improve efficiency and alleviate +the security officers' inspection burden. The neural architectures typically +utilized in relevant literature are Convolutional Neural Networks (CNNs), with +Vision Transformers (ViTs) rarely employed. In order to address this gap, this +paper conducts a comprehensive evaluation of relevant ViT architectures on +illicit item detection in X-ray images. This study utilizes both Transformer +and hybrid backbones, such as SWIN and NextViT, and detectors, such as DINO and +RT-DETR. The results demonstrate the remarkable accuracy of the DINO +Transformer detector in the low-data regime, the impressive real-time +performance of YOLOv8, and the effectiveness of the hybrid NextViT backbone. + +
+
+
+
+
+ + ☆ Egocentric Scene-aware Human Trajectory Prediction + + +
+ Wearable collaborative robots stand to assist human wearers who need fall +prevention assistance or wear exoskeletons. Such a robot needs to be able to +predict the ego motion of the wearer based on egocentric vision and the +surrounding scene. In this work, we leveraged body-mounted cameras and sensors +to anticipate the trajectory of human wearers through complex surroundings. To +facilitate research in ego-motion prediction, we have collected a comprehensive +walking scene navigation dataset centered on the user's perspective. We present +a method to predict human motion conditioning on the surrounding static scene. +Our method leverages a diffusion model to produce a distribution of potential +future trajectories, taking into account the user's observation of the +environment. We introduce a compact representation to encode the user's visual +memory of the surroundings, as well as an efficient sample-generating technique +to speed up real-time inference of a diffusion model. We ablate our model and +compare it to baselines, and results show that our model outperforms existing +methods on key metrics of collision avoidance and trajectory mode coverage. + +
+
+ comment: 14 pages, 9 figures +
+
+
+
+
+ + ☆ WALT3D: Generating Realistic Training Data from Time-Lapse Imagery for + Reconstructing Dynamic Objects under Occlusion CVPR 2024 + + +
+ Current methods for 2D and 3D object understanding struggle with severe +occlusions in busy urban environments, partly due to the lack of large-scale +labeled ground-truth annotations for learning occlusion. In this work, we +introduce a novel framework for automatically generating a large, realistic +dataset of dynamic objects under occlusions using freely available time-lapse +imagery. By leveraging off-the-shelf 2D (bounding box, segmentation, keypoint) +and 3D (pose, shape) predictions as pseudo-groundtruth, unoccluded 3D objects +are identified automatically and composited into the background in a clip-art +style, ensuring realistic appearances and physically accurate occlusion +configurations. The resulting clip-art image with pseudo-groundtruth enables +efficient training of object reconstruction methods that are robust to +occlusions. Our method demonstrates significant improvements in both 2D and 3D +reconstruction, particularly in scenarios with heavily occluded objects like +vehicles and people in urban scenes. + +
+
+ comment: To appear in CVPR 2024 +
+
+
+
+
+ + ☆ Robust Active Speaker Detection in Noisy Environments + + +
+ This paper addresses the issue of active speaker detection (ASD) in noisy +environments and formulates a robust active speaker detection (rASD) problem. +Existing ASD approaches leverage both audio and visual modalities, but +non-speech sounds in the surrounding environment can negatively impact +performance. To overcome this, we propose a novel framework that utilizes +audio-visual speech separation as guidance to learn noise-free audio features. +These features are then utilized in an ASD model, and both tasks are jointly +optimized in an end-to-end framework. Our proposed framework mitigates residual +noise and audio quality reduction issues that can occur in a naive cascaded +two-stage framework that directly uses separated speech for ASD, and enables +the two tasks to be optimized simultaneously. To further enhance the robustness +of the audio features and handle inherent speech noises, we propose a dynamic +weighted loss approach to train the speech separator. We also collected a +real-world noise audio dataset to facilitate investigations. Experiments +demonstrate that non-speech audio noises significantly impact ASD models, and +our proposed approach improves ASD performance in noisy environments. The +framework is general and can be applied to different ASD approaches to improve +their robustness. Our code, models, and data will be released. + +
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+
+ ☆ Cross-domain Fiber Cluster Shape Analysis for Language Performance
+ Cognitive Score Prediction
+
+
+
+ Shape plays an important role in computer graphics, offering informative +features to convey an object's morphology and functionality. Shape analysis in +brain imaging can help interpret structural and functionality correlations of +the human brain. In this work, we investigate the shape of the brain's 3D white +matter connections and its potential predictive relationship to human cognitive +function. We reconstruct brain connections as sequences of 3D points using +diffusion magnetic resonance imaging (dMRI) tractography. To describe each +connection, we extract 12 shape descriptors in addition to traditional dMRI +connectivity and tissue microstructure features. We introduce a novel +framework, Shape--fused Fiber Cluster Transformer (SFFormer), that leverages a +multi-head cross-attention feature fusion module to predict subject-specific +language performance based on dMRI tractography. We assess the performance of +the method on a large dataset including 1065 healthy young adults. The results +demonstrate that both the transformer-based SFFormer model and its inter/intra +feature fusion with shape, microstructure, and connectivity are informative, +and together, they improve the prediction of subject-specific language +performance scores. Overall, our results indicate that the shape of the brain's +connections is predictive of human language function. + +
+
+ comment: 2 figures, 11 pages +
+
+
+
+
+ + ☆ Envisioning MedCLIP: A Deep Dive into Explainability for Medical + Vision-Language Models + + +
+ Explaining Deep Learning models is becoming increasingly important in the
+face of daily emerging multimodal models, particularly in safety-critical
+domains like medical imaging. However, the lack of detailed investigations
+into the performance of explainability methods on these models is widening the
+gap between their development and safe deployment. In this work, we analyze
+the performance of various explainable AI methods on a vision-language model,
+MedCLIP, to demystify its inner workings. We also provide a simple methodology
+to overcome the shortcomings of these methods. Our work offers a new
+perspective on the explainability of a recent well-known VLM in the medical
+domain, and our assessment method generalizes to other current and future
+VLMs.
+
+
+
+
+
+ + ☆ Robustness and Visual Explanation for Black Box Image, Video, and ECG + Signal Classification with Reinforcement Learning AAAI + + +
+ We present a generic Reinforcement Learning (RL) framework optimized for +crafting adversarial attacks on different model types spanning from ECG signal +analysis (1D), image classification (2D), and video classification (3D). The +framework focuses on identifying sensitive regions and inducing +misclassifications with minimal distortions and various distortion types. The +novel RL method outperforms state-of-the-art methods for all three +applications, proving its efficiency. Our RL approach produces superior +localization masks, enhancing interpretability for image classification and ECG +analysis models. For applications such as ECG analysis, our platform highlights +critical ECG segments for clinicians while ensuring resilience against +prevalent distortions. This comprehensive tool aims to bolster both resilience +with adversarial training and transparency across varied applications and data +types. + +
+
+ comment: AAAI Proceedings reference: + https://ojs.aaai.org/index.php/AAAI/article/view/30579 +
+
+
+
+
+ + ☆ TextCraftor: Your Text Encoder Can be Image Quality Controller + + +
+ Diffusion-based text-to-image generative models, e.g., Stable Diffusion, have +revolutionized the field of content generation, enabling significant +advancements in areas like image editing and video synthesis. Despite their +formidable capabilities, these models are not without their limitations. It is +still challenging to synthesize an image that aligns well with the input text, +and multiple runs with carefully crafted prompts are required to achieve +satisfactory results. To mitigate these limitations, numerous studies have +endeavored to fine-tune the pre-trained diffusion models, i.e., UNet, utilizing +various technologies. Yet, amidst these efforts, a pivotal question of +text-to-image diffusion model training has remained largely unexplored: Is it +possible and feasible to fine-tune the text encoder to improve the performance +of text-to-image diffusion models? Our findings reveal that, instead of +replacing the CLIP text encoder used in Stable Diffusion with other large +language models, we can enhance it through our proposed fine-tuning approach, +TextCraftor, leading to substantial improvements in quantitative benchmarks and +human assessments. Interestingly, our technique also empowers controllable +image generation through the interpolation of different text encoders +fine-tuned with various rewards. We also demonstrate that TextCraftor is +orthogonal to UNet finetuning, and can be combined to further improve +generative quality. + +
+
+
+
+
+ + ☆ Lift3D: Zero-Shot Lifting of Any 2D Vision Model to 3D CVPR + + +
+ In recent years, there has been an explosion of 2D vision models for numerous +tasks such as semantic segmentation, style transfer or scene editing, enabled +by large-scale 2D image datasets. At the same time, there has been renewed +interest in 3D scene representations such as neural radiance fields from +multi-view images. However, the availability of 3D or multiview data is still +substantially limited compared to 2D image datasets, making extending 2D vision +models to 3D data highly desirable but also very challenging. Indeed, extending +a single 2D vision operator like scene editing to 3D typically requires a +highly creative method specialized to that task and often requires per-scene +optimization. In this paper, we ask the question of whether any 2D vision model +can be lifted to make 3D consistent predictions. We answer this question in the +affirmative; our new Lift3D method trains to predict unseen views on feature +spaces generated by a few visual models (i.e. DINO and CLIP), but then +generalizes to novel vision operators and tasks, such as style transfer, +super-resolution, open vocabulary segmentation and image colorization; for some +of these tasks, there is no comparable previous 3D method. In many cases, we +even outperform state-of-the-art methods specialized for the task in question. +Moreover, Lift3D is a zero-shot method, in the sense that it requires no +task-specific training, nor scene-specific optimization. + +
+
+ comment: Computer Vision and Pattern Recognition Conference (CVPR), 2024 +
+
+
+
+
+ + ☆ SMOF: Streaming Modern CNNs on FPGAs with Smart Off-Chip Eviction + + +
+ Convolutional Neural Networks (CNNs) have demonstrated their effectiveness in
+numerous vision tasks. However, their high processing requirements necessitate
+efficient hardware acceleration to meet the application's performance targets.
+In the space of FPGAs, streaming-based dataflow architectures are often
+adopted by users, as significant performance gains can be achieved through
+layer-wise pipelining and reduced off-chip memory access by retaining data
+on-chip. However, modern topologies, such as the UNet, YOLO, and X3D models,
+utilise long skip connections, requiring significant on-chip storage and thus
+limiting the performance achieved by such system architectures. The paper
+addresses the above limitation by introducing weight and activation eviction
+mechanisms to off-chip memory along the computational pipeline, taking into
+account the available compute and memory resources. The proposed mechanism is
+incorporated into an existing toolflow, expanding the design space by
+utilising off-chip memory as a buffer. This enables the mapping of such modern
+CNNs to devices with limited on-chip memory, under the streaming architecture
+design approach. SMOF has demonstrated the capacity to deliver competitive
+and, in some cases, state-of-the-art performance across a spectrum of computer
+vision tasks, achieving up to 10.65x throughput improvement compared to
+previous works.
+
+
+ comment: 12 pages, 8 figures, 5 tables +
+
+
+
+
+ + ☆ CPR: Retrieval Augmented Generation for Copyright Protection CVPR 2024 + + +
+ Retrieval Augmented Generation (RAG) is emerging as a flexible and robust
+technique to adapt models to private user data without training, to handle
+credit attribution, and to allow efficient machine unlearning at scale.
+However, RAG techniques for image generation may lead to parts of the
+retrieved samples being copied in the model's output. To reduce risks of
+leaking private information contained in the retrieved set, we introduce
+Copy-Protected generation with Retrieval (CPR), a new method for RAG with
+strong copyright protection guarantees in a mixed-private setting for
+diffusion models. CPR allows conditioning the output of diffusion models on a
+set of retrieved images, while also guaranteeing that unique identifiable
+information about those examples is not exposed in the generated outputs. In
+particular, it does so by sampling from a mixture of a public (safe)
+distribution and a private (user) distribution by merging their diffusion
+scores at inference. We prove that CPR satisfies Near Access Freeness (NAF),
+which bounds the amount of information an attacker may be able to extract from
+the generated images. We provide two algorithms for copyright protection,
+CPR-KL and CPR-Choose. Unlike previously proposed rejection-sampling-based NAF
+methods, our methods enable efficient copyright-protected sampling with a
+single run of backward diffusion. We show that our method can be applied to
+any pre-trained conditional diffusion model, such as Stable Diffusion or
+unCLIP. In particular, we empirically show that applying CPR on top of unCLIP
+improves quality and text-to-image alignment of the generated results (81.4 to
+83.17 on the TIFA benchmark), while enabling credit attribution, copyright
+protection, and deterministic, constant-time unlearning.
+
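+ The phrase "merging their diffusion scores at inference" can be pictured as
+mixing the noise predictions of a public model and a retrieval-conditioned
+private model at every reverse step; the sketch below is schematic only, and
+the actual mixing rules are defined by CPR-KL and CPR-Choose in the paper.
+
+  def merged_noise_prediction(eps_public, eps_private, weight=0.5):
+      # eps_*: noise predictions (scores) of the two diffusion models at the
+      # current timestep; the mixed prediction drives the shared reverse step.
+      return weight * eps_public + (1.0 - weight) * eps_private
+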
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ PLOT-TAL -- Prompt Learning with Optimal Transport for Few-Shot Temporal + Action Localization + + +
+ This paper introduces a novel approach to temporal action localization (TAL) +in few-shot learning. Our work addresses the inherent limitations of +conventional single-prompt learning methods that often lead to overfitting due +to the inability to generalize across varying contexts in real-world videos. +Recognizing the diversity of camera views, backgrounds, and objects in videos, +we propose a multi-prompt learning framework enhanced with optimal transport. +This design allows the model to learn a set of diverse prompts for each action, +capturing general characteristics more effectively and distributing the +representation to mitigate the risk of overfitting. Furthermore, by employing +optimal transport theory, we efficiently align these prompts with action +features, optimizing for a comprehensive representation that adapts to the +multifaceted nature of video data. Our experiments demonstrate significant +improvements in action localization accuracy and robustness in few-shot +settings on the standard challenging datasets of THUMOS-14 and EpicKitchens100, +highlighting the efficacy of our multi-prompt optimal transport approach in +overcoming the challenges of conventional few-shot TAL methods. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ UniDepth: Universal Monocular Metric Depth Estimation + + +
+ Accurate monocular metric depth estimation (MMDE) is crucial to solving +downstream tasks in 3D perception and modeling. However, the remarkable +accuracy of recent MMDE methods is confined to their training domains. These +methods fail to generalize to unseen domains even in the presence of moderate +domain gaps, which hinders their practical applicability. We propose a new +model, UniDepth, capable of reconstructing metric 3D scenes from solely single +images across domains. Departing from the existing MMDE methods, UniDepth +directly predicts metric 3D points from the input image at inference time +without any additional information, striving for a universal and flexible MMDE +solution. In particular, UniDepth implements a self-promptable camera module +predicting dense camera representation to condition depth features. Our model +exploits a pseudo-spherical output representation, which disentangles camera +and depth representations. In addition, we propose a geometric invariance loss +that promotes the invariance of camera-prompted depth features. Thorough +evaluations on ten datasets in a zero-shot regime consistently demonstrate the +superior performance of UniDepth, even when compared with methods directly +trained on the testing domains. Code and models are available at: +https://github.com/lpiccinelli-eth/unidepth + +
+
+
+
+
+ + ☆ A Geometric Explanation of the Likelihood OOD Detection Paradox + + +
+ Likelihood-based deep generative models (DGMs) commonly exhibit a puzzling +behaviour: when trained on a relatively complex dataset, they assign higher +likelihood values to out-of-distribution (OOD) data from simpler sources. +Adding to the mystery, OOD samples are never generated by these DGMs despite +having higher likelihoods. This two-pronged paradox has yet to be conclusively +explained, making likelihood-based OOD detection unreliable. Our primary +observation is that high-likelihood regions will not be generated if they +contain minimal probability mass. We demonstrate how this seeming contradiction +of large densities yet low probability mass can occur around data confined to +low-dimensional manifolds. We also show that this scenario can be identified +through local intrinsic dimension (LID) estimation, and propose a method for +OOD detection which pairs the likelihoods and LID estimates obtained from a +pre-trained DGM. Our method can be applied to normalizing flows and score-based +diffusion models, and obtains results which match or surpass state-of-the-art +OOD detection benchmarks using the same DGM backbones. Our code is available at +https://github.com/layer6ai-labs/dgm_ood_detection. + +
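+ A toy version of pairing likelihoods with local intrinsic dimension (LID)
+estimates: a high likelihood is only trusted when the sample's LID is
+comparable to that of training data. Thresholds would be calibrated on
+in-distribution data, and the paper's exact decision rule is not reproduced
+here.
+
+  import numpy as np
+
+  def flag_ood(log_likelihoods, lid_estimates, ll_threshold, lid_threshold):
+      # A sample is flagged OOD if its likelihood is low OR its LID falls far
+      # below the typical LID of the training manifold (high density, low mass).
+      log_likelihoods = np.asarray(log_likelihoods)
+      lid_estimates = np.asarray(lid_estimates)
+      return (log_likelihoods < ll_threshold) | (lid_estimates < lid_threshold)
+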
+
+
+
+
+ + ☆ Enhancing Multiple Object Tracking Accuracy via Quantum Annealing + + +
+ Multiple object tracking (MOT), a key task in image recognition, presents a +persistent challenge in balancing processing speed and tracking accuracy. This +study introduces a novel approach that leverages quantum annealing (QA) to +expedite computation speed, while enhancing tracking accuracy through the +ensembling of object tracking processes. A method to improve the matching +integration process is also proposed. By utilizing the sequential nature of +MOT, this study further augments the tracking method via reverse annealing +(RA). Experimental validation confirms the maintenance of high accuracy with an +annealing time of a mere 3 $\mu$s per tracking process. The proposed method +holds significant potential for real-time MOT applications, including traffic +flow measurement for urban traffic light control, collision prediction for +autonomous robots and vehicles, and management of products mass-produced in +factories. + +
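+ One common way to hand a detection-to-track matching problem to a quantum
+annealer is to encode it as a QUBO; the construction below is a generic
+assignment QUBO (equal numbers of detections and tracks, one-to-one matching)
+and is not taken from the paper.
+
+  import numpy as np
+
+  def build_assignment_qubo(cost, penalty=10.0):
+      # cost: (n, n) matching costs; binary x[i, j] = 1 assigns detection i to track j.
+      # Penalty terms enforce exactly one assignment per detection and per track.
+      n_det, n_trk = cost.shape
+      Q = np.zeros((n_det * n_trk, n_det * n_trk))
+      idx = lambda i, j: i * n_trk + j
+      for i in range(n_det):
+          for j in range(n_trk):
+              Q[idx(i, j), idx(i, j)] += cost[i, j] - 2 * penalty   # linear cost; -P per constraint (x^2 = x)
+              for j2 in range(j + 1, n_trk):
+                  Q[idx(i, j), idx(i, j2)] += 2 * penalty           # same detection, two tracks
+              for i2 in range(i + 1, n_det):
+                  Q[idx(i, j), idx(i2, j)] += 2 * penalty           # same track, two detections
+      return Q
+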
+
+ comment: 19 pages, 15 figures
+
+
+
+
+
+ + ☆ Self-Expansion of Pre-trained Models with Mixture of Adapters for + Continual Learning + + +
+ Continual learning aims to learn from a stream of continuously arriving data +with minimum forgetting of previously learned knowledge. While previous works +have explored the effectiveness of leveraging the generalizable knowledge from +pre-trained models in continual learning, existing parameter-efficient +fine-tuning approaches focus on the use of a predetermined or task-wise set of +adapters or prompts. However, these approaches still suffer from forgetting due +to task interference on jointly used parameters or restricted flexibility. The +reliance on a static model architecture may lead to the allocation of excessive +parameters that are not essential or, conversely, inadequate adaptation for +downstream tasks, given that the scale and distribution of incoming data are +unpredictable in continual learning. We propose Self-Expansion of pre-trained +models with Modularized Adaptation (SEMA), a novel fine-tuning approach which +automatically decides to reuse or add adapter modules on demand in continual +learning, depending on whether drastic distribution shift that could not be +handled by existing modules is detected at different representation levels. We +design each adapter module to consist of an adapter and a representation +descriptor, specifically, implemented as an autoencoder. The representation +descriptor functions as a distributional shift indicator during training and +triggers adapter expansion. For better usage of the adapters, an expandable +weighting router is learned jointly for mixture of adapter outputs. By +comparing with vision-transformer-based continual learning adaptation methods, +we demonstrate that the proposed framework outperforms the state-of-the-art +without memory rehearsal. + +
+
+
+
+
+ + ☆ AIC-UNet: Anatomy-informed Cascaded UNet for Robust Multi-Organ + Segmentation + + +
+ Imposing key anatomical features, such as the number of organs, their shapes,
+sizes, and relative positions, is crucial for building a robust multi-organ
+segmentation model. Current attempts to incorporate anatomical features
+include broadening the effective receptive field (ERF) with resource- and
+data-intensive modules such as self-attention, or introducing organ-specific
+topology regularizers, which may not scale to multi-organ segmentation
+problems where inter-organ relations also play a significant role. We
+introduce a new approach to impose anatomical constraints on any existing
+encoder-decoder segmentation model by conditioning model prediction with a
+learnable anatomy prior. More specifically, given an abdominal scan, a part of
+the encoder spatially warps a learnable prior to align with the given input
+scan using thin plate spline (TPS) grid interpolation. The warped prior is
+then integrated during the decoding phase to guide the model toward more
+anatomy-informed predictions. Code is available at
+https://anonymous.4open.science/r/AIC-UNet-7048.
+
+
+
+
+
+ + ☆ Capability-aware Prompt Reformulation Learning for Text-to-Image + Generation SIGIR 2024 + + +
+ Text-to-image generation systems have emerged as revolutionary tools in the +realm of artistic creation, offering unprecedented ease in transforming textual +prompts into visual art. However, the efficacy of these systems is intricately +linked to the quality of user-provided prompts, which often poses a challenge +to users unfamiliar with prompt crafting. This paper addresses this challenge +by leveraging user reformulation data from interaction logs to develop an +automatic prompt reformulation model. Our in-depth analysis of these logs +reveals that user prompt reformulation is heavily dependent on the individual +user's capability, resulting in significant variance in the quality of +reformulation pairs. To effectively use this data for training, we introduce +the Capability-aware Prompt Reformulation (CAPR) framework. CAPR innovatively +integrates user capability into the reformulation process through two key +components: the Conditional Reformulation Model (CRM) and Configurable +Capability Features (CCF). CRM reformulates prompts according to a specified +user capability, as represented by CCF. The CCF, in turn, offers the +flexibility to tune and guide the CRM's behavior. This enables CAPR to +effectively learn diverse reformulation strategies across various user +capacities and to simulate high-capability user reformulation during inference. +Extensive experiments on standard text-to-image generation benchmarks showcase +CAPR's superior performance over existing baselines and its remarkable +robustness on unseen systems. Furthermore, comprehensive analyses validate the +effectiveness of different components. CAPR can facilitate user-friendly +interaction with text-to-image systems and make advanced artistic creation more +achievable for a broader range of users. + +
+
+ comment: Accepted at SIGIR 2024 +
+
+
+
+
+ + ♻ ☆ Shifting to Machine Supervision: Annotation-Efficient Semi and + Self-Supervised Learning for Automatic Medical Image Segmentation and + Classification + + +
+ Advancements in clinical treatment are increasingly constrained by the
+limitations of supervised learning techniques, which depend heavily on large
+volumes of annotated data. The annotation process is not only costly but also
+demands substantial time from clinical specialists. Addressing this issue, we
+introduce the S4MI (Self-Supervision and Semi-Supervision for Medical Imaging)
+pipeline, a novel approach that leverages advancements in self-supervised and
+semi-supervised learning. These techniques engage in auxiliary tasks that do
+not require labeling, thus simplifying the scaling of machine supervision
+compared to fully-supervised methods. Our study benchmarks these techniques on
+three distinct medical imaging datasets to evaluate their effectiveness in
+classification and segmentation tasks. Notably, we observed that
+self-supervised learning significantly surpassed the performance of supervised
+methods in classification on all evaluated datasets. Remarkably, the
+semi-supervised approach demonstrated superior outcomes in segmentation,
+outperforming fully-supervised methods while using 50% fewer labels across all
+datasets. In line with our commitment to contributing to the scientific
+community, we have made the S4MI code openly accessible, allowing for broader
+application and further development of these methods.
+
+
+ comment: Seventeen pages (incl. references), five figures, and one table. + (Under Review) +
+
+
+
+
+ + ♻ ☆ Boosting Object Detection with Zero-Shot Day-Night Domain Adaptation CVPR 2024 + + +
+ Detecting objects in low-light scenarios presents a persistent challenge, as
+detectors trained on well-lit data exhibit significant performance degradation
+on low-light data due to low visibility. Previous methods mitigate this issue
+by exploring image enhancement or object detection techniques with real
+low-light image datasets. However, progress is impeded by the inherent
+difficulty of collecting and annotating low-light images. To address this
+challenge, we propose to boost low-light object detection with zero-shot
+day-night domain adaptation, which aims to generalize a detector from well-lit
+scenarios to low-light ones without requiring real low-light data. Revisiting
+Retinex theory from low-level vision, we first design a reflectance
+representation learning module to learn Retinex-based illumination invariance
+in images with a carefully designed illumination invariance reinforcement
+strategy. Next, an interchange-redecomposition-coherence procedure is
+introduced to improve over the vanilla Retinex image decomposition process by
+performing two sequential image decompositions and introducing a
+redecomposition cohering loss. Extensive experiments on the ExDark, DARK FACE,
+and CODaN datasets show the strong low-light generalizability of our method.
+Our code is available at https://github.com/ZPDu/DAI-Net.
+
+
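+ To make the Retinex idea above concrete, the sketch below writes out the kind
+of decomposition losses such a module can be trained with (the loss terms and
+the decompose() interface are illustrative assumptions, not the DAI-Net code):
+
+# Retinex: image ~= reflectance * illumination. A well-lit image and its
+# darkened version should share the same (illumination-invariant) reflectance.
+import torch
+import torch.nn.functional as F
+
+def retinex_losses(decompose, img_light, img_dark):
+    """decompose(img) -> (reflectance, illumination), both in [0, 1]."""
+    r_l, i_l = decompose(img_light)
+    r_d, i_d = decompose(img_dark)
+    recon = F.l1_loss(r_l * i_l, img_light) + F.l1_loss(r_d * i_d, img_dark)
+    invariance = F.l1_loss(r_l, r_d)   # reflectances should match
+    # Interchange check: one image's reflectance with the other's illumination
+    # should still reconstruct the corresponding well-lit/low-light input.
+    interchange = F.l1_loss(r_d * i_l, img_light) + F.l1_loss(r_l * i_d, img_dark)
+    return recon + invariance + interchange
+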
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Decoupled Data Consistency with Diffusion Purification for Image + Restoration + + +
+ Diffusion models have recently gained traction as a powerful class of deep +generative priors, excelling in a wide range of image restoration tasks due to +their exceptional ability to model data distributions. To solve image +restoration problems, many existing techniques achieve data consistency by +incorporating additional likelihood gradient steps into the reverse sampling +process of diffusion models. However, the additional gradient steps pose a +challenge for real-world practical applications as they incur a large +computational overhead, thereby increasing inference time. They also present +additional difficulties when using accelerated diffusion model samplers, as the +number of data consistency steps is limited by the number of reverse sampling +steps. In this work, we propose a novel diffusion-based image restoration +solver that addresses these issues by decoupling the reverse process from the +data consistency steps. Our method involves alternating between a +reconstruction phase to maintain data consistency and a refinement phase that +enforces the prior via diffusion purification. Our approach demonstrates +versatility, making it highly adaptable for efficient problem-solving in latent +space. Additionally, it reduces the necessity for numerous sampling steps +through the integration of consistency models. The efficacy of our approach is +validated through comprehensive experiments across various image restoration +tasks, including image denoising, deblurring, inpainting, and super-resolution. + +
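+ The alternation described above can be written schematically as follows for a
+generic linear inverse problem y = Ax + noise; purify() is a placeholder for
+the diffusion purification step (perturb slightly, then denoise with the prior
+model), and the step size and iteration count are arbitrary choices here:
+
+# Schematic reconstruction loop: data-consistency gradient step, then prior
+# refinement. Not the paper's implementation.
+import numpy as np
+
+def restore(y, A, purify, steps=200, lr=0.01, x0=None):
+    x = np.zeros(A.shape[1]) if x0 is None else x0
+    for _ in range(steps):
+        # Reconstruction phase: gradient step on the data-fidelity term.
+        x = x - lr * A.T @ (A @ x - y)
+        # Refinement phase: pull the estimate back towards the image prior.
+        x = purify(x)
+    return x
+
+# Toy usage with an identity "purifier" (i.e. no prior) on a random problem.
+rng = np.random.default_rng(0)
+A, x_true = rng.normal(size=(30, 10)), rng.normal(size=10)
+x_hat = restore(A @ x_true, A, purify=lambda x: x)
+print(np.linalg.norm(x_hat - x_true))
+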
+
+
+
+
+ + ♻ ☆ Interpretable machine learning for time-to-event prediction in medicine + and healthcare + + +
+ Time-to-event prediction, e.g. cancer survival analysis or hospital length of +stay, is a highly prominent machine learning task in medical and healthcare +applications. However, only a few interpretable machine learning methods comply +with its challenges. To facilitate a comprehensive explanatory analysis of +survival models, we formally introduce time-dependent feature effects and +global feature importance explanations. We show how post-hoc interpretation +methods allow for finding biases in AI systems predicting length of stay using +a novel multi-modal dataset created from 1235 X-ray images with textual +radiology reports annotated by human experts. Moreover, we evaluate cancer +survival models beyond predictive performance to include the importance of +multi-omics feature groups based on a large-scale benchmark comprising 11 +datasets from The Cancer Genome Atlas (TCGA). Model developers can use the +proposed methods to debug and improve machine learning algorithms, while +physicians can discover disease biomarkers and assess their significance. We +hope the contributed open data and code resources facilitate future work in the +emerging research direction of explainable survival analysis. + +
+
+ comment: An extended version of an AIME 2023 paper submitted to Artificial + Intelligence in Medicine +
+
+
+
+
+ + ♻ ☆ Simplified Diffusion Schrödinger Bridge + + +
+ This paper introduces a novel theoretical simplification of the Diffusion +Schr\"odinger Bridge (DSB) that facilitates its unification with Score-based +Generative Models (SGMs), addressing the limitations of DSB in complex data +generation and enabling faster convergence and enhanced performance. By +employing SGMs as an initial solution for DSB, our approach capitalizes on the +strengths of both frameworks, ensuring a more efficient training process and +improving the performance of SGM. We also propose a reparameterization +technique that, despite theoretical approximations, practically improves the +network's fitting capabilities. Our extensive experimental evaluations confirm +the effectiveness of the simplified DSB, demonstrating its significant +improvements. We believe the contributions of this work pave the way for +advanced generative modeling. The code is available at +https://github.com/checkcrab/SDSB. + +
+
+
+
+
+ + ♻ ☆ Self-supervised co-salient object detection via feature correspondence + at multiple scales + + +
+ Our paper introduces a novel two-stage self-supervised approach for detecting +co-occurring salient objects (CoSOD) in image groups without requiring +segmentation annotations. Unlike existing unsupervised methods that rely solely +on patch-level information (e.g. clustering patch descriptors) or on +computation heavy off-the-shelf components for CoSOD, our lightweight model +leverages feature correspondences at both patch and region levels, +significantly improving prediction performance. In the first stage, we train a +self-supervised network that detects co-salient regions by computing local +patch-level feature correspondences across images. We obtain the segmentation +predictions using confidence-based adaptive thresholding. In the next stage, we +refine these intermediate segmentations by eliminating the detected regions +(within each image) whose averaged feature representations are dissimilar to +the foreground feature representation averaged across all the cross-attention +maps (from the previous stage). Extensive experiments on three CoSOD benchmark +datasets show that our self-supervised model outperforms the corresponding +state-of-the-art models by a huge margin (e.g. on the CoCA dataset, our model +has a 13.7% F-measure gain over the SOTA unsupervised CoSOD model). Notably, +our self-supervised model also outperforms several recent fully supervised +CoSOD models on the three test datasets (e.g., on the CoCA dataset, our model +has a 4.6% F-measure gain over a recent supervised CoSOD model). + +
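+ The confidence-based adaptive thresholding mentioned above amounts to picking
+a per-image cut-off from the saliency map's own statistics; the rule below is a
+simple illustrative variant, not necessarily the exact one used in the paper:
+
+import numpy as np
+
+def adaptive_threshold(saliency, alpha=0.5):
+    """saliency: (H, W) array of co-saliency scores in [0, 1]."""
+    t = saliency.mean() + alpha * (saliency.max() - saliency.mean())
+    return (saliency >= t).astype(np.uint8)
+
+mask = adaptive_threshold(np.random.rand(64, 64))
+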
+
+
+
+
+ + ♻ ☆ LION: Implicit Vision Prompt Tuning AAAI2024 + + +
+ Despite recent competitive performance across a range of vision tasks, vision
+Transformers still suffer from heavy computational costs. Recently, vision
+prompt learning has provided an economical solution to this problem without
+fine-tuning the whole large-scale model. However, the efficiency of existing
+models is still far from satisfactory due to the insertion of extensive prompt
+blocks and intricate prompt designs. In this paper, we propose an efficient
+vision model named impLicit vIsion prOmpt tuNing (LION), which is motivated by
+deep implicit models with stable memory costs for various complex tasks. In
+particular, we merely insert two equilibrium implicit layers at the two ends of
+the pre-trained main backbone, with the parameters in the backbone frozen.
+Moreover, we prune the parameters in these two layers according to the lottery
+ticket hypothesis. The performance obtained by our LION is promising on a wide
+range of datasets. In particular, our LION reduces the number of training
+parameters by up to 11.5% while obtaining higher performance than the
+state-of-the-art baseline VPT, especially under challenging scenes.
+Furthermore, we find that our proposed LION has good generalization
+performance, making it an easy way to boost transfer learning in the future.
+
+
+
+ comment: Accepted by AAAI2024; 9 pages, 3 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Incorporating simulated spatial context information improves the + effectiveness of contrastive learning models + + +
+ Visual learning often occurs in a specific context, where an agent acquires +skills through exploration and tracking of its location in a consistent +environment. The historical spatial context of the agent provides a similarity +signal for self-supervised contrastive learning. We present a unique approach, +termed Environmental Spatial Similarity (ESS), that complements existing +contrastive learning methods. Using images from simulated, photorealistic +environments as an experimental setting, we demonstrate that ESS outperforms +traditional instance discrimination approaches. Moreover, sampling additional +data from the same environment substantially improves accuracy and provides new +augmentations. ESS allows remarkable proficiency in room classification and +spatial prediction tasks, especially in unfamiliar environments. This learning +paradigm has the potential to enable rapid visual learning in agents operating +in new environments with unique visual characteristics. Potentially +transformative applications span from robotics to space exploration. Our proof +of concept demonstrates improved efficiency over methods that rely on +extensive, disconnected datasets. + +
+
+
+
+
+ + ♻ ☆ Adaptive Negative Evidential Deep Learning for Open-set Semi-supervised + Learning AAAI2024 + + +
+ Semi-supervised learning (SSL) methods assume that labeled data, unlabeled +data and test data are from the same distribution. Open-set semi-supervised +learning (Open-set SSL) considers a more practical scenario, where unlabeled +data and test data contain new categories (outliers) not observed in labeled +data (inliers). Most previous works focused on outlier detection via binary +classifiers, which suffer from insufficient scalability and inability to +distinguish different types of uncertainty. In this paper, we propose a novel +framework, Adaptive Negative Evidential Deep Learning (ANEDL) to tackle these +limitations. Concretely, we first introduce evidential deep learning (EDL) as +an outlier detector to quantify different types of uncertainty, and design +different uncertainty metrics for self-training and inference. Furthermore, we +propose a novel adaptive negative optimization strategy, making EDL more +tailored to the unlabeled dataset containing both inliers and outliers. As +demonstrated empirically, our proposed method outperforms existing +state-of-the-art methods across four datasets. + +
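+ For reference, the standard evidential-deep-learning quantities referred to
+above can be computed as follows (this is the common Dirichlet formulation, not
+necessarily the exact ANEDL variant):
+
+import torch
+import torch.nn.functional as F
+
+def edl_uncertainty(logits):
+    """logits: (B, K) raw network outputs for K classes."""
+    evidence = F.softplus(logits)             # non-negative evidence per class
+    alpha = evidence + 1.0                    # Dirichlet concentration
+    strength = alpha.sum(dim=1, keepdim=True)
+    belief = evidence / strength              # per-class belief mass
+    uncertainty = logits.shape[1] / strength  # K / sum(alpha): total doubt
+    prob = alpha / strength                   # expected class probabilities
+    return belief, uncertainty, prob
+
+belief, u, prob = edl_uncertainty(torch.randn(4, 10))
+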
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ Vision Transformer-Based Deep Learning for Histologic Classification of + Endometrial Cancer + + +
+ Endometrial cancer is the fourth most common cancer among females in the
+United States, with a lifetime risk of approximately 2.8% for women. Precise
+histologic evaluation and molecular classification of endometrial cancer are
+important for effective patient management and determining the best treatment
+modalities. This study introduces EndoNet, which uses convolutional neural
+networks for extracting histologic features and a vision transformer for
+aggregating these features and classifying slides based on their visual
+characteristics into high- and low-grade. The model was trained on 929
+digitized hematoxylin and eosin-stained whole-slide images of endometrial
+cancer from hysterectomy cases at Dartmouth-Health. It classifies these slides
+into low-grade (endometrioid grades 1 and 2) and high-grade (endometrioid
+carcinoma FIGO grade 3, uterine serous carcinoma, carcinosarcoma) categories.
+EndoNet was evaluated on an internal test set of 110 patients and an external
+test set of 100 patients from the public TCGA database. The model achieved a
+weighted average F1-score of 0.91 (95% CI: 0.86-0.95) and an AUC of 0.95 (95%
+CI: 0.89-0.99) on the internal test, and an F1-score of 0.86 (95% CI:
+0.80-0.94) and an AUC of 0.86 (95% CI: 0.75-0.93) on the external test. Pending
+further validation, EndoNet has the potential to support pathologists in
+classifying the grades of gynecologic pathology tumors without the need for
+manual annotations.
+
+
+
+ comment: 4 Tables and 3 Figures +
+
+
+
+
+ + ♻ ☆ Automated Construction of Time-Space Diagrams for Traffic Analysis Using + Street-View Video Sequence SC + + +
+ Time-space diagrams are essential tools for analyzing traffic patterns and +optimizing transportation infrastructure and traffic management strategies. +Traditional data collection methods for these diagrams have limitations in +terms of temporal and spatial coverage. Recent advancements in camera +technology have overcome these limitations and provided extensive urban data. +In this study, we propose an innovative approach to constructing time-space +diagrams by utilizing street-view video sequences captured by cameras mounted +on moving vehicles. Using the state-of-the-art YOLOv5, StrongSORT, and +photogrammetry techniques for distance calculation, we can infer vehicle +trajectories from the video data and generate time-space diagrams. To evaluate +the effectiveness of our proposed method, we utilized datasets from the KITTI +computer vision benchmark suite. The evaluation results demonstrate that our +approach can generate trajectories from video data, although there are some +errors that can be mitigated by improving the performance of the detector, +tracker, and distance calculation components. In conclusion, the utilization of +street-view video sequences captured by cameras mounted on moving vehicles, +combined with state-of-the-art computer vision techniques, has immense +potential for constructing comprehensive time-space diagrams. These diagrams +offer valuable insights into traffic patterns and contribute to the design of +transportation infrastructure and traffic management strategies. + +
+
+ comment: The paper is published in 2023 IEEE 26th International Conference on + Intelligent Transportation Systems (ITSC) +
+
+
+
+
+ + ♻ ☆ SOAC: Spatio-Temporal Overlap-Aware Multi-Sensor Calibration using + Neural Radiance Fields CVPR 2024 + + +
+ In rapidly-evolving domains such as autonomous driving, the use of multiple
+sensors with different modalities is crucial to ensure high operational
+precision and stability. To correctly exploit the information provided by each
+sensor in a single common frame, it is essential for these sensors to be
+accurately calibrated. In this paper, we leverage the ability of Neural
+Radiance Fields (NeRF) to represent different sensor modalities in a common
+volumetric representation to achieve robust and accurate spatio-temporal sensor
+calibration. By designing a partitioning approach based on the visible part of
+the scene for each sensor, we formulate the calibration problem using only the
+overlapping areas. This strategy results in a more robust and accurate
+calibration that is less prone to failure. We demonstrate that our approach
+works on outdoor urban scenes by validating it on multiple established driving
+datasets. Results show that our method achieves better accuracy and robustness
+than existing methods.
+
+
+
+ comment: Accepted at CVPR 2024. Project page: https://qherau.github.io/SOAC/ +
+
+
+
+
+ + ♻ ☆ Point, Segment and Count: A Generalized Framework for Object Counting CVPR 2024 + + +
+ Class-agnostic object counting aims to count all objects in an image with
+respect to example boxes or class names, \emph{a.k.a} few-shot and zero-shot
+counting. In this paper, we propose a generalized framework for both few-shot
+and zero-shot object counting based on detection. Our framework combines the
+superior advantages of two foundation models without compromising their
+zero-shot capability: (\textbf{i}) SAM to segment all possible objects as mask
+proposals, and (\textbf{ii}) CLIP to classify proposals to obtain accurate
+object counts. However, this strategy faces two obstacles: efficiency overhead
+and small, crowded objects that cannot be localized and distinguished. To
+address these issues, our framework, termed PseCo, follows three steps: point,
+segment, and count. Specifically, we first propose class-agnostic object
+localization to provide accurate yet minimal point prompts for SAM, which
+consequently not only reduces computation costs but also avoids missing small
+objects. Furthermore, we propose a generalized object classification that
+leverages CLIP image/text embeddings as the classifier, following a
+hierarchical knowledge distillation to obtain discriminative classifications
+among hierarchical mask proposals. Extensive experimental results on FSC-147,
+COCO, and LVIS demonstrate that PseCo achieves state-of-the-art performance in
+both few-shot/zero-shot object counting/detection. Code:
+https://github.com/Hzzone/PseCo
+
+
+
+ comment: Accepted by CVPR 2024. Camera ready +
+
+
+
+
+ + ♻ ☆ Weakly-Supervised Emotion Transition Learning for Diverse 3D Co-speech + Gesture Generation CVPR 2024 + + +
+ Generating vivid and emotional 3D co-speech gestures is crucial for virtual
+avatar animation in human-machine interaction applications. While existing
+methods can generate gestures that follow a single emotion label, they overlook
+that long gesture sequence modeling with emotion transitions is more practical
+in real scenes. In addition, the lack of large-scale available datasets with
+emotion transition speech and corresponding 3D human gestures also limits the
+addressing of this task. To fulfill this goal, we first incorporate ChatGPT-4
+and an audio inpainting approach to construct high-fidelity emotion transition
+human speech. Considering that obtaining realistic 3D pose annotations
+corresponding to the dynamically inpainted emotion transition audio is
+extremely difficult, we propose a novel weakly supervised training strategy to
+encourage authentic gesture transitions. Specifically, to enhance the
+coordination of transition gestures w.r.t. different emotional ones, we model
+the temporal association representation between two different emotional gesture
+sequences as style guidance and infuse it into the transition generation. We
+further devise an emotion mixture mechanism that provides weak supervision
+based on a learnable mixed emotion label for transition gestures. Last, we
+present a keyframe sampler to supply effective initial posture cues in long
+sequences, enabling us to generate diverse gestures. Extensive experiments
+demonstrate that our method outperforms the state-of-the-art models constructed
+by adapting single emotion-conditioned counterparts on our newly defined
+emotion transition task and datasets. Our code and dataset will be released on
+the project page:
+https://xingqunqi-lab.github.io/Emo-Transition-Gesture/.
+
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Learning by Erasing: Conditional Entropy based Transferable + Out-Of-Distribution Detection + + +
+ Out-of-distribution (OOD) detection is essential to handle the distribution
+shifts between training and test scenarios. For a new in-distribution (ID)
+dataset, existing methods require retraining to capture the dataset-specific
+feature representation or data distribution. In this paper, we propose a deep
+generative model (DGM) based transferable OOD detection method that does not
+require retraining on a new ID dataset. We design an image erasing strategy to
+equip each ID dataset with an exclusive conditional entropy distribution, which
+determines the discrepancy of the DGM's posterior uncertainty distribution on
+different ID datasets. Owing to the powerful representation capacity of
+convolutional neural networks, the proposed model trained on a complex dataset
+can capture the above discrepancy between ID datasets without retraining and
+thus achieve transferable OOD detection. We validate the proposed method on
+five datasets and verify that it achieves performance comparable to
+state-of-the-art group-based OOD detection methods that need to be retrained
+before deployment on new ID datasets. Our code is available at
+https://github.com/oOHCIOo/CETOOD.
+
+
+
+ comment: update new experimental results +
+
+
+
+
+ + ♻ ☆ Dual Structure-Aware Image Filterings for Semi-supervised Medical Image + Segmentation + + +
+ Semi-supervised image segmentation has attracted great attention recently. The
+key is how to leverage unlabeled images in the training process. Most methods
+maintain consistent predictions of the unlabeled images under variations (e.g.,
+adding noise/perturbations, or creating alternative versions) at the image
+and/or model level. In particular, medical images often carry prior structural
+information, which most image-level variations have not well explored. In this
+paper, we propose novel dual structure-aware image filterings (DSAIF) as the
+image-level variations for semi-supervised medical image segmentation.
+Motivated by connected filtering, which simplifies an image via filtering in a
+structure-aware tree-based image representation, we resort to the dual contrast
+invariant Max-tree and Min-tree representation. Specifically, we propose a
+novel connected filtering that removes topologically equivalent nodes (i.e.,
+connected components) having no siblings in the Max/Min-tree. This results in
+two filtered images preserving topologically critical structure. Applying the
+proposed DSAIF to mutually supervised networks decreases the consensus of their
+erroneous predictions on unlabeled images. This helps to alleviate the
+confirmation bias issue of overfitting to noisy pseudo labels of unlabeled
+images, and thus effectively improves the segmentation performance. Extensive
+experimental results on three benchmark datasets demonstrate that the proposed
+method significantly and consistently outperforms some state-of-the-art
+methods. The source code will be publicly available.
+
+
+
+
+
+
+ + ♻ ☆ Decomposing Disease Descriptions for Enhanced Pathology Detection: A + Multi-Aspect Vision-Language Pre-training Framework CVPR2024 + + +
+ Medical vision language pre-training (VLP) has emerged as a frontier of +research, enabling zero-shot pathological recognition by comparing the query +image with the textual descriptions for each disease. Due to the complex +semantics of biomedical texts, current methods struggle to align medical images +with key pathological findings in unstructured reports. This leads to the +misalignment with the target disease's textual representation. In this paper, +we introduce a novel VLP framework designed to dissect disease descriptions +into their fundamental aspects, leveraging prior knowledge about the visual +manifestations of pathologies. This is achieved by consulting a large language +model and medical experts. Integrating a Transformer module, our approach +aligns an input image with the diverse elements of a disease, generating +aspect-centric image representations. By consolidating the matches from each +aspect, we improve the compatibility between an image and its associated +disease. Additionally, capitalizing on the aspect-oriented representations, we +present a dual-head Transformer tailored to process known and unknown diseases, +optimizing the comprehensive detection efficacy. Conducting experiments on +seven downstream datasets, ours improves the accuracy of recent methods by up +to 8.56% and 17.0% for seen and unseen categories, respectively. Our code is +released at https://github.com/HieuPhan33/MAVL. + +
+
+ comment: Accepted at CVPR2024. Pre-print before final camera-ready version +
+
+
+
+
+ + ♻ ☆ Shapley Values-Powered Framework for Fair Reward Split in Content + Produced by GenAI + + +
+ It is evident that, currently, generative models are surpassed in quality by +human professionals. However, with the advancements in Artificial Intelligence, +this gap will narrow, leading to scenarios where individuals who have dedicated +years of their lives to mastering a skill become obsolete due to their high +costs, which are inherently linked to the time they require to complete a task +-- a task that AI could accomplish in minutes or seconds. To avoid future +social upheavals, we must, even now, contemplate how to fairly assess the +contributions of such individuals in training generative models and how to +compensate them for the reduction or complete loss of their incomes. In this +work, we propose a method to structure collaboration between model developers +and data providers. To achieve this, we employ Shapley Values to quantify the +contribution of artist(s) in an image generated by the Stable Diffusion-v1.5 +model and to equitably allocate the reward among them. + +
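+ A generic Monte Carlo Shapley estimator makes the proposed reward split
+concrete; value() is any user-supplied scoring of a subset of contributors and
+is a placeholder here, not the valuation used in the paper:
+
+import random
+
+def shapley_values(players, value, num_samples=1000, seed=0):
+    rng = random.Random(seed)
+    phi = {p: 0.0 for p in players}
+    for _ in range(num_samples):
+        order = players[:]
+        rng.shuffle(order)
+        coalition, prev = [], value([])
+        for p in order:
+            coalition.append(p)
+            cur = value(coalition)
+            phi[p] += (cur - prev) / num_samples  # marginal contribution
+            prev = cur
+    return phi
+
+# Toy usage: the "value" of a coalition is simply its size, so every artist
+# ends up with an equal share of 1.0.
+print(shapley_values(["a", "b", "c"], value=len, num_samples=200))
+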
+
+ comment: 36 pages, 32 figures +
+
+
+
+
+ + ♻ ☆ E4S: Fine-grained Face Swapping via Editing With Regional GAN Inversion + + +
+ This paper proposes a novel approach to face swapping from the perspective of
+fine-grained facial editing, dubbed "editing for swapping" (E4S). Traditional
+face swapping methods rely on global feature extraction and fail to preserve
+the detailed source identity. In contrast, we propose a Regional GAN Inversion
+(RGI) method, which allows the explicit disentanglement of shape and texture.
+Specifically, our E4S performs face swapping in the latent space of a
+pretrained StyleGAN, where a multi-scale mask-guided encoder projects the
+texture of each facial component into regional style codes and a mask-guided
+injection module manipulates feature maps with the style codes. Based on this
+disentanglement, face swapping can be simplified as style and mask swapping.
+Besides, due to the large lighting condition gap, transferring the source skin
+into the target image may lead to disharmonious lighting. We propose a
+re-coloring network to make the swapped face maintain the target lighting
+condition while preserving the source skin. Further, to deal with potential
+mismatched areas during mask exchange, we design a face inpainting module to
+refine the face shape. Extensive comparisons with state-of-the-art methods
+demonstrate that our E4S outperforms existing methods in preserving texture,
+shape, and lighting. Our implementation is available at
+https://github.com/e4s2024/E4S2024.
+
+
+
+ comment: Project Page: https://e4s2024.github.io/ ;. arXiv admin note: text + overlap with arXiv:2211.14068 +
+
+
+
+
+ + ♻ ☆ ViDA: Homeostatic Visual Domain Adapter for Continual Test Time + Adaptation ICLR2024 + + +
+ Since real-world machine systems are running in non-stationary environments, +Continual Test-Time Adaptation (CTTA) task is proposed to adapt the pre-trained +model to continually changing target domains. Recently, existing methods mainly +focus on model-based adaptation, which aims to leverage a self-training manner +to extract the target domain knowledge. However, pseudo labels can be noisy and +the updated model parameters are unreliable under dynamic data distributions, +leading to error accumulation and catastrophic forgetting in the continual +adaptation process. To tackle these challenges and maintain the model +plasticity, we design a Visual Domain Adapter (ViDA) for CTTA, explicitly +handling both domain-specific and domain-shared knowledge. Specifically, we +first comprehensively explore the different domain representations of the +adapters with trainable high-rank or low-rank embedding spaces. Then we inject +ViDAs into the pre-trained model, which leverages high-rank and low-rank +features to adapt the current domain distribution and maintain the continual +domain-shared knowledge, respectively. To exploit the low-rank and high-rank +ViDAs more effectively, we further propose a Homeostatic Knowledge Allotment +(HKA) strategy, which adaptively combines different knowledge from each ViDA. +Extensive experiments conducted on four widely used benchmarks demonstrate that +our proposed method achieves state-of-the-art performance in both +classification and segmentation CTTA tasks. Note that, our method can be +regarded as a novel transfer paradigm for large-scale models, delivering +promising results in adaptation to continually changing distributions. Project +page: https://sites.google.com/view/iclr2024-vida/home. + +
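+ The adapter idea can be sketched as a residual bottleneck branch whose rank is
+the knob separating domain-shared (low-rank) from domain-specific (high-rank)
+signal; dimensions, placement, and initialization below are assumptions for
+illustration rather than the ViDA design:
+
+import torch
+import torch.nn as nn
+
+class Adapter(nn.Module):
+    def __init__(self, dim, rank):
+        super().__init__()
+        self.down = nn.Linear(dim, rank, bias=False)
+        self.up = nn.Linear(rank, dim, bias=False)
+        nn.init.zeros_(self.up.weight)   # zero-init: the branch starts as a no-op
+
+    def forward(self, x):
+        return self.up(self.down(x))
+
+dim = 768
+frozen_layer = nn.Linear(dim, dim)       # stands in for a pre-trained block
+for p in frozen_layer.parameters():
+    p.requires_grad_(False)
+low_rank, high_rank = Adapter(dim, 4), Adapter(dim, 128)
+
+x = torch.randn(2, 16, dim)              # (batch, tokens, dim)
+y = frozen_layer(x) + low_rank(x) + high_rank(x)
+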
+
+ comment: Accepted by ICLR2024 +
+
+
+
+
+ + ♻ ☆ Visually Guided Generative Text-Layout Pre-training for Document + Intelligence NAACL 2024 + + +
+ Prior study shows that pre-training techniques can boost the performance of +visual document understanding (VDU), which typically requires models to gain +abilities to perceive and reason both document texts and layouts (e.g., +locations of texts and table-cells). To this end, we propose visually guided +generative text-layout pre-training, named ViTLP. Given a document image, the +model optimizes hierarchical language and layout modeling objectives to +generate the interleaved text and layout sequence. In addition, to address the +limitation of processing long documents by Transformers, we introduce a +straightforward yet effective multi-segment generative pre-training scheme, +facilitating ViTLP to process word-intensive documents of any length. ViTLP can +function as a native OCR model to localize and recognize texts of document +images. Besides, ViTLP can be effectively applied to various downstream VDU +tasks. Extensive experiments show that ViTLP achieves competitive performance +over existing baselines on benchmark VDU tasks, including information +extraction, document classification, and document question answering. + +
+
+ comment: Accepted to NAACL 2024 main conference. The first version of this + paper was submitted to OpenReview + (https://openreview.net/forum?id=ARtBIBAmNR) in June 2023 +
+
+
+
+
+ + ♻ ☆ Intraoperative 2D/3D Image Registration via Differentiable X-ray + Rendering CVPR 2024 + + +
+ Surgical decisions are informed by aligning rapid portable 2D intraoperative +images (e.g., X-rays) to a high-fidelity 3D preoperative reference scan (e.g., +CT). 2D/3D image registration often fails in practice: conventional +optimization methods are prohibitively slow and susceptible to local minima, +while neural networks trained on small datasets fail on new patients or require +impractical landmark supervision. We present DiffPose, a self-supervised +approach that leverages patient-specific simulation and differentiable +physics-based rendering to achieve accurate 2D/3D registration without relying +on manually labeled data. Preoperatively, a CNN is trained to regress the pose +of a randomly oriented synthetic X-ray rendered from the preoperative CT. The +CNN then initializes rapid intraoperative test-time optimization that uses the +differentiable X-ray renderer to refine the solution. Our work further proposes +several geometrically principled methods for sampling camera poses from +$\mathbf{SE}(3)$, for sparse differentiable rendering, and for driving +registration in the tangent space $\mathfrak{se}(3)$ with geodesic and +multiscale locality-sensitive losses. DiffPose achieves sub-millimeter accuracy +across surgical datasets at intraoperative speeds, improving upon existing +unsupervised methods by an order of magnitude and even outperforming supervised +baselines. Our code is available at https://github.com/eigenvivek/DiffPose. + +
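+ The geodesic loss mentioned above is built on the standard angular distance
+between rotations, reproduced here for reference (independent of the DiffPose
+code base):
+
+import numpy as np
+
+def geodesic_distance(R1, R2):
+    """Angle (radians) of the relative rotation between two 3x3 rotations."""
+    R = R1.T @ R2
+    cos = (np.trace(R) - 1.0) / 2.0
+    return np.arccos(np.clip(cos, -1.0, 1.0))
+
+Rz = lambda a: np.array([[np.cos(a), -np.sin(a), 0.0],
+                         [np.sin(a),  np.cos(a), 0.0],
+                         [0.0, 0.0, 1.0]])
+print(geodesic_distance(Rz(0.0), Rz(0.3)))   # ~0.3
+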
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Challenging Common Paradigms in Multi-Task Learning + + +
+ While multi-task learning (MTL) has gained significant attention in recent
+years, its underlying mechanisms remain poorly understood. Recent methods did
+not yield consistent performance improvements over single-task learning (STL)
+baselines, underscoring the importance of gaining more profound insights about
+challenges specific to MTL. In our study, we challenge common paradigms in MTL
+in the context of STL: First, the impact of the choice of optimizer has only
+been mildly investigated in MTL. We empirically show the pivotal role of common
+STL tools such as the Adam optimizer in MTL across various experiments. To
+further investigate Adam's effectiveness, we theoretically derive a partial
+loss-scale invariance under mild assumptions. Second, the notion of gradient
+conflicts has often been phrased as a specific problem in MTL. We delve into
+the role of gradient conflicts in MTL and compare it to STL. For angular
+gradient alignment we find no evidence that this is a unique problem in MTL. We
+emphasize differences in gradient magnitude as the main distinguishing factor.
+Lastly, we compare the transferability of features learned through MTL and STL
+on common image corruptions, and find modest evidence that MTL can lead to
+superior transferability. Overall, we find surprising similarities between STL
+and MTL, suggesting that methods from both fields be considered in a broader
+context.
+
+
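+ The gradient-conflict measurements discussed above boil down to comparing the
+angle and magnitude of per-task gradients on the shared parameters; the model
+and losses below are toy placeholders:
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+shared = nn.Linear(10, 10)
+heads = [nn.Linear(10, 1) for _ in range(2)]
+x, ys = torch.randn(32, 10), [torch.randn(32, 1) for _ in range(2)]
+
+grads = []
+for head, y in zip(heads, ys):
+    loss = F.mse_loss(head(shared(x)), y)
+    g = torch.autograd.grad(loss, tuple(shared.parameters()))
+    grads.append(torch.cat([t.flatten() for t in g]))
+
+cosine = F.cosine_similarity(grads[0], grads[1], dim=0)   # angular alignment
+ratio = grads[0].norm() / grads[1].norm()                 # magnitude gap
+print(f"alignment: {cosine.item():.3f}, magnitude ratio: {ratio.item():.3f}")
+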
+
+ comment: - +
+
+
+
+
+ + ♻ ☆ Neural Fields for Interactive Visualization of Statistical Dependencies + in 3D Simulation Ensembles + + +
+ We present the first neural network that has learned to compactly represent +and can efficiently reconstruct the statistical dependencies between the values +of physical variables at different spatial locations in large 3D simulation +ensembles. Going beyond linear dependencies, we consider mutual information as +a measure of non-linear dependence. We demonstrate learning and reconstruction +with a large weather forecast ensemble comprising 1000 members, each storing +multiple physical variables at a 250 x 352 x 20 simulation grid. By +circumventing compute-intensive statistical estimators at runtime, we +demonstrate significantly reduced memory and computation requirements for +reconstructing the major dependence structures. This enables embedding the +estimator into a GPU-accelerated direct volume renderer and interactively +visualizing all mutual dependencies for a selected domain point. + +
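+ For context, the plain histogram estimator of mutual information that such a
+network replaces at runtime looks like this (the bin count is an arbitrary
+choice):
+
+import numpy as np
+
+def mutual_information(x, y, bins=32):
+    joint, _, _ = np.histogram2d(x, y, bins=bins)
+    pxy = joint / joint.sum()
+    px = pxy.sum(axis=1, keepdims=True)
+    py = pxy.sum(axis=0, keepdims=True)
+    nz = pxy > 0
+    return float(np.sum(pxy[nz] * np.log(pxy[nz] / (px @ py)[nz])))
+
+rng = np.random.default_rng(0)
+a = rng.normal(size=5000)
+b = 0.8 * a + 0.6 * rng.normal(size=5000)
+print(mutual_information(a, b))   # positive for dependent variables
+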
+
+
+
+
+ + ♻ ☆ SAR-Net: Multi-scale Direction-aware SAR Network via Global Information + Fusion + + +
+ Deep learning has driven significant progress in object detection using +Synthetic Aperture Radar (SAR) imagery. Existing methods, while achieving +promising results, often struggle to effectively integrate local and global +information, particularly direction-aware features. This paper proposes +SAR-Net, a novel framework specifically designed for global fusion of +direction-aware information in SAR object detection. SAR-Net leverages two key +innovations: the Unity Compensation Mechanism (UCM) and the Direction-aware +Attention Module (DAM). UCM facilitates the establishment of complementary +relationships among features across different scales, enabling efficient global +information fusion. Among them, Multi-scale Alignment Module (MAM) and distinct +Multi-level Fusion Module (MFM) enhance feature integration by capturing both +texture detail and semantic information. Then, Multi-feature Embedding Module +(MEM) feeds back global features into the primary branches, further improving +information transmission. Additionally, DAM, through bidirectional attention +polymerization, captures direction-aware information, effectively eliminating +background interference. Extensive experiments demonstrate the effectiveness of +SAR-Net, achieving state-of-the-art results on aircraft (SAR-AIRcraft-1.0) and +ship datasets (SSDD, HRSID), confirming its generalization capability and +robustness. + +
+
+
+
+
+ + ♻ ☆ Hourglass Tokenizer for Efficient Transformer-Based 3D Human Pose + Estimation CVPR 2024 + + +
+ Transformers have been successfully applied in the field of video-based 3D +human pose estimation. However, the high computational costs of these video +pose transformers (VPTs) make them impractical on resource-constrained devices. +In this paper, we present a plug-and-play pruning-and-recovering framework, +called Hourglass Tokenizer (HoT), for efficient transformer-based 3D human pose +estimation from videos. Our HoT begins with pruning pose tokens of redundant +frames and ends with recovering full-length tokens, resulting in a few pose +tokens in the intermediate transformer blocks and thus improving the model +efficiency. To effectively achieve this, we propose a token pruning cluster +(TPC) that dynamically selects a few representative tokens with high semantic +diversity while eliminating the redundancy of video frames. In addition, we +develop a token recovering attention (TRA) to restore the detailed +spatio-temporal information based on the selected tokens, thereby expanding the +network output to the original full-length temporal resolution for fast +inference. Extensive experiments on two benchmark datasets (i.e., Human3.6M and +MPI-INF-3DHP) demonstrate that our method can achieve both high efficiency and +estimation accuracy compared to the original VPT models. For instance, applying +to MotionBERT and MixSTE on Human3.6M, our HoT can save nearly 50% FLOPs +without sacrificing accuracy and nearly 40% FLOPs with only 0.2% accuracy drop, +respectively. Code and models are available at +https://github.com/NationalGAILab/HoT. + +
+
+ comment: Accepted by CVPR 2024, Open Sourced +
+
+
+
+
+ + ♻ ☆ Enhancing Object Coherence in Layout-to-Image Synthesis + + +
+ Layout-to-image synthesis is an emerging technique in conditional image +generation. It aims to generate complex scenes, where users require fine +control over the layout of the objects in a scene. However, it remains +challenging to control the object coherence, including semantic coherence +(e.g., the cat looks at the flowers or not) and physical coherence (e.g., the +hand and the racket should not be misaligned). In this paper, we propose a +novel diffusion model with effective global semantic fusion (GSF) and +self-similarity feature enhancement modules to guide the object coherence for +this task. For semantic coherence, we argue that the image caption contains +rich information for defining the semantic relationship within the objects in +the images. Instead of simply employing cross-attention between captions and +generated images, which addresses the highly relevant layout restriction and +semantic coherence separately and thus leads to unsatisfying results shown in +our experiments, we develop GSF to fuse the supervision from the layout +restriction and semantic coherence requirement and exploit it to guide the +image synthesis process. Moreover, to improve the physical coherence, we +develop a Self-similarity Coherence Attention (SCA) module to explicitly +integrate local contextual physical coherence into each pixel's generation +process. Specifically, we adopt a self-similarity map to encode the coherence +restrictions and employ it to extract coherent features from text embedding. +Through visualization of our self-similarity map, we explore the essence of +SCA, revealing that its effectiveness is not only in capturing reliable +physical coherence patterns but also in enhancing complex texture generation. +Extensive experiments demonstrate the superiority of our proposed method in +both image generation quality and controllability. + +
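+ A self-similarity map of the kind used above can be computed directly from any
+feature map as pairwise cosine similarities between spatial positions; the
+feature source and resolution below are arbitrary stand-ins:
+
+import torch
+import torch.nn.functional as F
+
+feats = torch.randn(1, 256, 32, 32)                  # (B, C, H, W)
+B, C, H, W = feats.shape
+flat = F.normalize(feats.flatten(2), dim=1)          # (B, C, H*W), unit norm
+self_sim = torch.einsum("bcm,bcn->bmn", flat, flat)  # (B, H*W, H*W) in [-1, 1]
+# Similarity of the centre position to the whole map, viewed as an image:
+centre_map = self_sim[:, (H // 2) * W + W // 2].reshape(B, H, W)
+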
+
+
+
+
+ + ♻ ☆ BEVUDA: Multi-geometric Space Alignments for Domain Adaptive BEV 3D + Object Detection ICRA2024 + + +
+ Vision-centric bird-eye-view (BEV) perception has shown promising potential in
+autonomous driving. Recent works mainly focus on improving efficiency or
+accuracy but neglect the challenges posed by changing environments, resulting
+in severe degradation of transfer performance. For BEV perception, we identify
+the significant domain gaps existing in typical real-world cross-domain
+scenarios and comprehensively solve the Domain Adaptation (DA) problem for
+multi-view 3D object detection. Since BEV perception approaches are complicated
+and contain several components, the domain shift accumulation across multiple
+geometric spaces (i.e., 2D, 3D Voxel, BEV) makes BEV DA even more challenging.
+In this paper, we propose a Multi-space Alignment Teacher-Student (MATS)
+framework to ease the domain shift accumulation, which consists of a
+Depth-Aware Teacher (DAT) and a Geometric-space Aligned Student (GAS) model.
+DAT tactfully combines target lidar and reliable depth prediction to construct
+depth-aware information, extracting target domain-specific knowledge in the
+Voxel and BEV feature spaces. It then transfers the sufficient domain knowledge
+of multiple spaces to the student model. In order to jointly alleviate the
+domain shift, GAS projects multi-geometric space features to a shared geometric
+embedding space and decreases the data distribution distance between the two
+domains. To verify the effectiveness of our method, we conduct BEV 3D object
+detection experiments on three cross-domain scenarios and achieve
+state-of-the-art performance.
+
+
+
+ comment: Accepted by ICRA2024 +
+
+
+
+
+ + ♻ ☆ Back to 3D: Few-Shot 3D Keypoint Detection with Back-Projected 2D + Features CVPR 2024 + + +
+ With the immense growth of dataset sizes and computing resources in recent +years, so-called foundation models have become popular in NLP and vision tasks. +In this work, we propose to explore foundation models for the task of keypoint +detection on 3D shapes. A unique characteristic of keypoint detection is that +it requires semantic and geometric awareness while demanding high localization +accuracy. To address this problem, we propose, first, to back-project features +from large pre-trained 2D vision models onto 3D shapes and employ them for this +task. We show that we obtain robust 3D features that contain rich semantic +information and analyze multiple candidate features stemming from different 2D +foundation models. Second, we employ a keypoint candidate optimization module +which aims to match the average observed distribution of keypoints on the shape +and is guided by the back-projected features. The resulting approach achieves a +new state of the art for few-shot keypoint detection on the KeyPointNet +dataset, almost doubling the performance of the previous best methods. + +
+
+ comment: Accepted to CVPR 2024, Project page: + https://wimmerth.github.io/back-to-3d.html +
+
+
+
+
+ + ♻ ☆ Fast Dynamic 3D Object Generation from a Single-view Video + + +
+ Generating dynamic 3D object from a single-view video is challenging due to +the lack of 4D labeled data. Extending image-to-3D pipelines by transferring +off-the-shelf image generation models such as score distillation sampling, +existing methods tend to be slow and expensive to scale due to the need for +back-propagating the information-limited supervision signals through a large +pretrained model. To address this, we propose an efficient video-to-4D object +generation framework called Efficient4D. It generates high-quality +spacetime-consistent images under different camera views, and then uses them as +labeled data to directly train a novel 4D Gaussian splatting model with +explicit point cloud geometry, enabling real-time rendering under continuous +camera trajectories. Extensive experiments on synthetic and real videos show +that Efficient4D offers a remarkable 20-fold increase in speed when compared to +prior art alternatives while preserving the quality of novel view synthesis. +For example, Efficient4D takes only 6 mins to model a dynamic object, vs 120 +mins by Consistent4D. + +
+
+ comment: Technical report +
+
+
+
+
+ + ♻ ☆ UniTraj: A Unified Framework for Scalable Vehicle Trajectory Prediction + + +
+ Vehicle trajectory prediction has increasingly relied on data-driven +solutions, but their ability to scale to different data domains and the impact +of larger dataset sizes on their generalization remain under-explored. While +these questions can be studied by employing multiple datasets, it is +challenging due to several discrepancies, e.g., in data formats, map +resolution, and semantic annotation types. To address these challenges, we +introduce UniTraj, a comprehensive framework that unifies various datasets, +models, and evaluation criteria, presenting new opportunities for the vehicle +trajectory prediction field. In particular, using UniTraj, we conduct extensive +experiments and find that model performance significantly drops when +transferred to other datasets. However, enlarging data size and diversity can +substantially improve performance, leading to a new state-of-the-art result for +the nuScenes dataset. We provide insights into dataset characteristics to +explain these findings. The code can be found here: +https://github.com/vita-epfl/UniTraj + +
+
+
+
+
+ + ♻ ☆ CLIP-DINOiser: Teaching CLIP a few DINO tricks for open-vocabulary + semantic segmentation + + +
+ The popular CLIP model displays impressive zero-shot capabilities thanks to +its seamless interaction with arbitrary text prompts. However, its lack of +spatial awareness makes it unsuitable for dense computer vision tasks, e.g., +semantic segmentation, without an additional fine-tuning step that often uses +annotations and can potentially suppress its original open-vocabulary +properties. Meanwhile, self-supervised representation methods have demonstrated +good localization properties without human-made annotations nor explicit +supervision. In this work, we take the best of both worlds and propose an +open-vocabulary semantic segmentation method, which does not require any +annotations. We propose to locally improve dense MaskCLIP features, which are +computed with a simple modification of CLIP's last pooling layer, by +integrating localization priors extracted from self-supervised features. By +doing so, we greatly improve the performance of MaskCLIP and produce smooth +outputs. Moreover, we show that the used self-supervised feature properties can +directly be learnt from CLIP features. Our method CLIP-DINOiser needs only a +single forward pass of CLIP and two light convolutional layers at inference, no +extra supervision nor extra memory and reaches state-of-the-art results on +challenging and fine-grained benchmarks such as COCO, Pascal Context, +Cityscapes and ADE20k. The code to reproduce our results is available at +https://github.com/wysoczanska/clip_dinoiser. + +
+
+
+
+
+ + ♻ ☆ Continual-MAE: Adaptive Distribution Masked Autoencoders for Continual + Test-Time Adaptation CVPR2024 + + +
+ Continual Test-Time Adaptation (CTTA) is proposed to migrate a source +pre-trained model to continually changing target distributions, addressing +real-world dynamism. Existing CTTA methods mainly rely on entropy minimization +or teacher-student pseudo-labeling schemes for knowledge extraction in +unlabeled target domains. However, dynamic data distributions cause +miscalibrated predictions and noisy pseudo-labels in existing self-supervised +learning methods, hindering the effective mitigation of error accumulation and +catastrophic forgetting problems during the continual adaptation process. To +tackle these issues, we propose a continual self-supervised method, Adaptive +Distribution Masked Autoencoders (ADMA), which enhances the extraction of +target domain knowledge while mitigating the accumulation of distribution +shifts. Specifically, we propose a Distribution-aware Masking (DaM) mechanism +to adaptively sample masked positions, followed by establishing consistency +constraints between the masked target samples and the original target samples. +Additionally, for masked tokens, we utilize an efficient decoder to reconstruct +a hand-crafted feature descriptor (e.g., Histograms of Oriented Gradients), +leveraging its invariant properties to boost task-relevant representations. +Through conducting extensive experiments on four widely recognized benchmarks, +our proposed method attains state-of-the-art performance in both classification +and segmentation CTTA tasks. Our project page: +https://sites.google.com/view/continual-mae/home. + +
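+ The hand-crafted reconstruction target mentioned above can be produced with an
+off-the-shelf HOG descriptor; the mask below is sampled uniformly, whereas the
+paper's DaM is distribution-aware, and the patch/cell sizes are illustrative:
+
+import numpy as np
+from skimage.feature import hog
+
+rng = np.random.default_rng(0)
+image = rng.random((224, 224))                        # grayscale stand-in
+target = hog(image, orientations=9, pixels_per_cell=(16, 16),
+             cells_per_block=(1, 1), feature_vector=True)
+
+num_patches = 14 * 14                                 # 16x16 patches on 224x224
+mask = rng.random(num_patches) < 0.75                 # mask 75% of patches
+# Training would regress the entries of `target` that fall in masked patches.
+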
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ A2V: A Semi-Supervised Domain Adaptation Framework for Brain Vessel + Segmentation via Two-Phase Training Angiography-to-Venography Translation BMVC + + +
+ We present a semi-supervised domain adaptation framework for brain vessel +segmentation from different image modalities. Existing state-of-the-art methods +focus on a single modality, despite the wide range of available cerebrovascular +imaging techniques. This can lead to significant distribution shifts that +negatively impact the generalization across modalities. By relying on annotated +angiographies and a limited number of annotated venographies, our framework +accomplishes image-to-image translation and semantic segmentation, leveraging a +disentangled and semantically rich latent space to represent heterogeneous data +and perform image-level adaptation from source to target domains. Moreover, we +reduce the typical complexity of cycle-based architectures and minimize the use +of adversarial training, which allows us to build an efficient and intuitive +model with stable training. We evaluate our method on magnetic resonance +angiographies and venographies. While achieving state-of-the-art performance in +the source domain, our method attains a Dice score coefficient in the target +domain that is only 8.9% lower, highlighting its promising potential for robust +cerebrovascular image segmentation across different modalities. + +
+
+ comment: Accepted at the 34th British Machine Vision Conference (BMVC) +
+
+
+
+
+ + ♻ ☆ Debiasing Multimodal Large Language Models + + +
+ In the realms of computer vision and natural language processing, Large
+Vision-Language Models (LVLMs) have become indispensable tools, proficient in
+generating textual descriptions based on visual inputs. Despite their
+advancements, our investigation reveals a noteworthy bias in the generated
+content, where the output is primarily influenced by the prior of the
+underlying Large Language Model (LLM) rather than by the input image. Our
+empirical experiments underscore the persistence of this bias, as LVLMs often
+provide confident answers even in the absence of relevant images or given
+incongruent visual input. To rectify these biases and redirect the model's
+focus toward vision information, we introduce two simple, training-free
+strategies. Firstly, for tasks such as classification or multi-choice
+question-answering (QA), we propose a ``calibration'' step through an affine
+transformation to adjust the output distribution. This ``Post-Hoc debias''
+approach ensures uniform scores for each answer when the image is absent,
+serving as an effective regularization technique to alleviate the influence of
+LLM priors. For more intricate open-ended generation tasks, we extend this
+method to ``Debias sampling'', drawing inspiration from contrastive decoding
+methods. Furthermore, our investigation sheds light on the instability of LVLMs
+across various decoding configurations. Through systematic exploration of
+different settings, we significantly enhance performance, surpassing reported
+results and raising concerns about the fairness of existing evaluations.
+Comprehensive experiments substantiate the effectiveness of our proposed
+strategies in mitigating biases. These strategies not only prove beneficial in
+minimizing hallucinations but also contribute to the generation of more helpful
+and precise illustrations.
+
+
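+ The calibration step for multi-choice QA can be illustrated with a simple
+log-space correction: answer scores obtained without the image estimate the
+language prior and are subtracted from the image-conditioned scores (the
+paper's affine transformation may be parameterized differently):
+
+import torch
+
+def posthoc_debias(logprobs_with_image, logprobs_without_image, strength=1.0):
+    """Both inputs: (num_choices,) log-probabilities of the answer options."""
+    calibrated = logprobs_with_image - strength * logprobs_without_image
+    return torch.softmax(calibrated, dim=-1)
+
+with_img = torch.log(torch.tensor([0.50, 0.30, 0.20]))
+no_img = torch.log(torch.tensor([0.60, 0.25, 0.15]))   # strong text-only prior
+print(posthoc_debias(with_img, no_img))  # the prior-favoured option is damped
+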
+
+ comment: 38 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ SIGNeRF: Scene Integrated Generation for Neural Radiance Fields + + +
+ Advances in image diffusion models have recently led to notable improvements +in the generation of high-quality images. In combination with Neural Radiance +Fields (NeRFs), they enabled new opportunities in 3D generation. However, most +generative 3D approaches are object-centric and applying them to editing +existing photorealistic scenes is not trivial. We propose SIGNeRF, a novel +approach for fast and controllable NeRF scene editing and scene-integrated +object generation. A new generative update strategy ensures 3D consistency +across the edited images, without requiring iterative optimization. We find +that depth-conditioned diffusion models inherently possess the capability to +generate 3D consistent views by requesting a grid of images instead of single +views. Based on these insights, we introduce a multi-view reference sheet of +modified images. Our method updates an image collection consistently based on +the reference sheet and refines the original NeRF with the newly generated +image set in one go. By exploiting the depth conditioning mechanism of the +image diffusion model, we gain fine control over the spatial location of the +edit and enforce shape guidance by a selected region or an external mesh. + +
+
+ comment: Project Page: https://signerf.jdihlmann.com +
+
+
+
+
+ + ♻ ☆ LocalStyleFool: Regional Video Style Transfer Attack Using Segment + Anything Model SP + + +
+ Previous work has shown that well-crafted adversarial perturbations can
+threaten the security of video recognition systems. Attackers can invade such
+models with a low query budget when the perturbations are semantic-invariant,
+such as StyleFool. Despite the query efficiency, the naturalness of the
+minutiae areas still requires improvement, since StyleFool applies style
+transfer to all pixels in each frame. To close the gap, we propose
+LocalStyleFool, an improved black-box video adversarial attack that
+superimposes regional style-transfer-based perturbations on videos. Benefiting
+from the popularity and scalable usability of the Segment Anything Model (SAM),
+we first extract different regions according to semantic information and then
+track them through the video stream to maintain temporal consistency. Then, we
+add style-transfer-based perturbations to several regions selected based on the
+associative criterion of transfer-based gradient information and regional area.
+Fine adjustment of the perturbations follows to make the stylized videos
+adversarial. We demonstrate through a human-assessed survey that LocalStyleFool
+can improve both intra-frame and inter-frame naturalness while maintaining a
+competitive fooling rate and query efficiency. Successful experiments on a
+high-resolution dataset also showcase that the scrupulous segmentation of SAM
+helps to improve the scalability of adversarial attacks under high-resolution
+data.
+
+
+
+ comment: Accepted to 2024 IEEE Security and Privacy Workshops (SPW) +
+
+
+
+
+
+ ♻ ☆ TULIP: Transformer for Upsampling of LiDAR Point Cloud CVPR 2024
+
+
+
+ LiDAR Upsampling is a challenging task for the perception systems of robots +and autonomous vehicles, due to the sparse and irregular structure of +large-scale scene contexts. Recent works propose to solve this problem by +converting LiDAR data from 3D Euclidean space into an image super-resolution +problem in 2D image space. Although their methods can generate high-resolution +range images with fine-grained details, the resulting 3D point clouds often +blur out details and predict invalid points. In this paper, we propose TULIP, a +new method to reconstruct high-resolution LiDAR point clouds from +low-resolution LiDAR input. We also follow a range image-based approach but +specifically modify the patch and window geometries of a Swin-Transformer-based +network to better fit the characteristics of range images. We conducted several +experiments on three public real-world and simulated datasets. TULIP +outperforms state-of-the-art methods in all relevant metrics and generates +robust and more realistic point clouds than prior works. + +
+
+ comment: The paper was accepted by CVPR 2024
+
+
+
+
+
+ + ♻ ☆ 3D Face Reconstruction Using A Spectral-Based Graph Convolution Encoder WWW 2024 + + +
+ Monocular 3D face reconstruction plays a crucial role in avatar generation, +with significant demand in web-related applications such as generating virtual +financial advisors in FinTech. Current reconstruction methods predominantly +rely on deep learning techniques and employ 2D self-supervision as a means to +guide model learning. However, these methods encounter challenges in capturing +the comprehensive 3D structural information of the face due to the utilization +of 2D images for model training purposes. To overcome this limitation and +enhance the reconstruction of 3D structural features, we propose an innovative +approach that integrates existing 2D features with 3D features to guide the +model learning process. Specifically, we introduce the 3D-ID Loss, which +leverages the high-dimensional structure features extracted from a +Spectral-Based Graph Convolution Encoder applied to the facial mesh. This +approach surpasses the sole reliance on the 3D information provided by the +facial mesh vertices coordinates. Our model is trained using 2D-3D data pairs +from a combination of datasets and achieves state-of-the-art performance on the +NoW benchmark. + +
+
+ comment: 4 pages, 3 figures. Accepted to WWW 2024 +
+
+
+
+
+ + ♻ ☆ AEROBLADE: Training-Free Detection of Latent Diffusion Images Using + Autoencoder Reconstruction Error CVPR 2024 + + +
+ With recent text-to-image models, anyone can generate deceptively realistic +images with arbitrary contents, fueling the growing threat of visual +disinformation. A key enabler for generating high-resolution images with low +computational cost has been the development of latent diffusion models (LDMs). +In contrast to conventional diffusion models, LDMs perform the denoising +process in the low-dimensional latent space of a pre-trained autoencoder (AE) +instead of the high-dimensional image space. Despite their relevance, the +forensic analysis of LDMs is still in its infancy. In this work we propose +AEROBLADE, a novel detection method which exploits an inherent component of +LDMs: the AE used to transform images between image and latent space. We find +that generated images can be more accurately reconstructed by the AE than real +images, allowing for a simple detection approach based on the reconstruction +error. Most importantly, our method is easy to implement and does not require +any training, yet nearly matches the performance of detectors that rely on +extensive training. We empirically demonstrate that AEROBLADE is effective +against state-of-the-art LDMs, including Stable Diffusion and Midjourney. +Beyond detection, our approach allows for the qualitative analysis of images, +which can be leveraged for identifying inpainted regions. We release our code +and data at https://github.com/jonasricker/aeroblade . + +
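+ A minimal sketch of the detection rule above, assuming a generic pre-trained LDM autoencoder exposing encode/decode; plain per-image MSE stands in for the paper's distance measure, and the threshold is a placeholder that would have to be calibrated.
+import torch
+
+@torch.no_grad()
+def reconstruction_error(vae, images):
+    # images: (B, 3, H, W) in whatever value range the autoencoder expects.
+    latents = vae.encode(images)            # assumed interface: encode() -> latent tensor
+    recon = vae.decode(latents)             # assumed interface: decode() -> image tensor
+    return ((recon - images) ** 2).flatten(1).mean(dim=1)   # per-image reconstruction error
+
+def flag_as_generated(vae, images, threshold=0.01):
+    # Generated images tend to be reconstructed more accurately (lower error) than real ones.
+    return reconstruction_error(vae, images) < threshold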
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ A citizen science toolkit to collect human perceptions of urban + environments using open street view images + + +
+ Street View-level Imagery (SVI) is a valuable data source for studies (e.g., environmental assessments, green space identification or land cover classification). While commercial SVI is available, such providers commonly restrict copying or reuse in ways necessary for research. Open SVI datasets are readily available from less restrictive sources, such as Mapillary, but due to the heterogeneity of the images, these require substantial preprocessing, filtering, and careful quality checks. We present an efficient method for automated downloading, processing, cropping, and filtering open SVI, to be used in a survey of human perceptions of the streets portrayed in these images. We demonstrate our open-source reusable SVI preparation and smartphone-friendly perception-survey software with Amsterdam (Netherlands) as the case study. Using a citizen science approach, we collected 22,637 ratings from 331 people on their perceptions across various criteria. We have published our software in a public repository for future re-use and reproducibility. + +
+
+
+
+
+ + ♻ ☆ Scalable Non-Cartesian Magnetic Resonance Imaging with R2D2 + + +
+ We propose a new approach for non-Cartesian magnetic resonance image +reconstruction. While unrolled architectures provide robustness via +data-consistency layers, embedding measurement operators in Deep Neural Network +(DNN) can become impractical at large scale. Alternative Plug-and-Play (PnP) +approaches, where the denoising DNNs are blind to the measurement setting, are +not affected by this limitation and have also proven effective, but their +highly iterative nature also affects scalability. To address this scalability +challenge, we leverage the "Residual-to-Residual DNN series for high-Dynamic +range imaging (R2D2)" approach recently introduced in astronomical imaging. +R2D2's reconstruction is formed as a series of residual images, iteratively +estimated as outputs of DNNs taking the previous iteration's image estimate and +associated data residual as inputs. The method can be interpreted as a learned +version of the Matching Pursuit algorithm. We demonstrate R2D2 in simulation, +considering radial k-space sampling acquisition sequences. Our preliminary +results suggest that R2D2 achieves: (i) suboptimal performance compared to its +unrolled incarnation R2D2-Net, which is however non-scalable due to the +necessary embedding of NUFFT-based data-consistency layers; (ii) superior +reconstruction quality to a scalable version of R2D2-Net embedding an FFT-based +approximation for data consistency; (iii) superior reconstruction quality to +PnP, while only requiring few iterations. + +
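+ The residual-series idea described above can be sketched generically as follows, with hypothetical measurement and adjoint operators standing in for the NUFFT; it mirrors the text (each DNN takes the previous image estimate and its data residual) rather than the authors' code.
+import torch
+
+def r2d2_reconstruct(y, forward_op, adjoint_op, dnns):
+    # y: k-space measurements; forward_op / adjoint_op: measurement operator and its adjoint (assumed callables);
+    # dnns: list of trained networks, one per term of the residual series.
+    x = adjoint_op(y)                                        # initial (dirty) image estimate, shape (B, H, W)
+    for net in dnns:
+        data_residual = adjoint_op(y - forward_op(x))        # back-projected data residual
+        x = x + net(torch.stack([x, data_residual], dim=1))  # net: (B, 2, H, W) -> (B, H, W) residual image
+    return x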
+
+ comment: submitted to IEEE EUSIPCO 2024 +
+
+
+
+
+ + ♻ ☆ FoMo-Bench: a multi-modal, multi-scale and multi-task Forest Monitoring + Benchmark for remote sensing foundation models + + +
+ Forests are an essential part of Earth's ecosystems and natural systems, as +well as providing services on which humanity depends, yet they are rapidly +changing as a result of land use decisions and climate change. Understanding +and mitigating negative effects requires parsing data on forests at global +scale from a broad array of sensory modalities, and recently many such problems +have been approached using machine learning algorithms for remote sensing. To +date, forest-monitoring problems have largely been addressed in isolation. +Inspired by the rise of foundation models for computer vision and remote +sensing, we here present the first unified Forest Monitoring Benchmark +(FoMo-Bench). FoMo-Bench consists of 15 diverse datasets encompassing +satellite, aerial, and inventory data, covering a variety of geographical +regions, and including multispectral, red-green-blue, synthetic aperture radar +(SAR) and LiDAR data with various temporal, spatial and spectral resolutions. +FoMo-Bench includes multiple types of forest-monitoring tasks, spanning +classification, segmentation, and object detection. To further enhance the +diversity of tasks and geographies represented in FoMo-Bench, we introduce a +novel global dataset, TalloS, combining satellite imagery with ground-based +annotations for tree species classification, encompassing 1,000+ categories +across multiple hierarchical taxonomic levels (species, genus, family). +Finally, we propose FoMo-Net, a baseline foundation model with the capacity to +process any combination of commonly used spectral bands in remote sensing, +across diverse ground sampling distances and geographical locations worldwide. +This work aims to inspire research collaborations between machine learning and +forest biology researchers in exploring scalable multi-modal and multi-task +models for forest monitoring. All code and data will be made publicly +available. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ♻ ☆ Retrieval-Augmented Generation for AI-Generated Content: A Survey + + +
+ The development of Artificial Intelligence Generated Content (AIGC) has been facilitated by advancements in model algorithms, the increasing scale of foundation models, and the availability of ample high-quality datasets. While AIGC has achieved remarkable performance, it still faces several challenges, such as the difficulty of maintaining up-to-date and long-tail knowledge, the risk of data leakage, and the high costs associated with training and inference. Retrieval-Augmented Generation (RAG) has recently emerged as a paradigm to address such challenges. In particular, RAG introduces an information retrieval process, which enhances the generation process by retrieving relevant objects from available data stores, leading to higher accuracy and better robustness. In this paper, we comprehensively review existing efforts that integrate the RAG technique into AIGC scenarios. We first classify RAG foundations according to how the retriever augments the generator, distilling the fundamental abstractions of the augmentation methodologies for various retrievers and generators. This unified perspective encompasses all RAG scenarios, illuminating advancements and pivotal technologies that help with potential future progress. We also summarize additional enhancement methods for RAG, facilitating effective engineering and implementation of RAG systems. From another perspective, we then survey practical applications of RAG across different modalities and tasks, offering valuable references for researchers and practitioners. Furthermore, we introduce the benchmarks for RAG, discuss the limitations of current RAG systems, and suggest potential directions for future research. Project Repo: https://github.com/hymie122/RAG-Survey. + +
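+ To make the retrieval-augmentation pattern surveyed above concrete, here is a minimal, library-agnostic sketch: embed the query, retrieve the most similar stored objects, and condition the generator on them; the embed/generate callables and the cosine-similarity retriever are assumptions, not any specific system from the survey.
+import numpy as np
+
+def rag_answer(query, store_texts, store_embs, embed, generate, k=3):
+    # store_embs: (N, d) unit-normalized embeddings of store_texts; embed/generate are user-supplied callables.
+    q = embed(query)
+    q = q / (np.linalg.norm(q) + 1e-8)
+    sims = store_embs @ q                                    # cosine similarity against the data store
+    retrieved = [store_texts[i] for i in np.argsort(sims)[::-1][:k]]
+    prompt = "Context:\n" + "\n".join(retrieved) + "\n\nQuestion: " + query + "\nAnswer:"
+    return generate(prompt)                                  # generation conditioned on retrieved objects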
+
+ comment: Citing 380 papers, 36 pages, 16 figures. Project: + https://github.com/hymie122/RAG-Survey +
+
+
+
+
+ + ♻ ☆ Learning Concept-Based Causal Transition and Symbolic Reasoning for + Visual Planning + + +
+ Visual planning simulates how humans make decisions to achieve desired goals +in the form of searching for visual causal transitions between an initial +visual state and a final visual goal state. It has become increasingly +important in egocentric vision with its advantages in guiding agents to perform +daily tasks in complex environments. In this paper, we propose an interpretable +and generalizable visual planning framework consisting of i) a novel +Substitution-based Concept Learner (SCL) that abstracts visual inputs into +disentangled concept representations, ii) symbol abstraction and reasoning that +performs task planning via the self-learned symbols, and iii) a Visual Causal +Transition model (ViCT) that grounds visual causal transitions to semantically +similar real-world actions. Given an initial state, we perform goal-conditioned +visual planning with a symbolic reasoning method fueled by the learned +representations and causal transitions to reach the goal state. To verify the +effectiveness of the proposed model, we collect a large-scale visual planning +dataset based on AI2-THOR, dubbed as CCTP. Extensive experiments on this +challenging dataset demonstrate the superior performance of our method in +visual task planning. Empirically, we show that our framework can generalize to +unseen task trajectories, unseen object categories, and real-world data. +Further details of this work are provided at +https://fqyqc.github.io/ConTranPlan/. + +
+
+
+
+
+ + ♻ ☆ Centered Masking for Language-Image Pre-Training + + +
+ We introduce Gaussian masking for Language-Image Pre-Training (GLIP), a novel, straightforward, and effective technique for masking image patches during pre-training of a vision-language model. GLIP builds on Fast Language-Image Pre-Training (FLIP), which randomly masks image patches while training a CLIP model. GLIP replaces random masking with centered masking, which uses a Gaussian distribution and is motivated by the importance of image patches at the center of the image. GLIP retains the same computational savings as FLIP, while improving performance across a range of downstream datasets and tasks, as demonstrated by our experimental results. The benefits of GLIP are easy to obtain, requiring no delicate tuning of the Gaussian, and extend to datasets containing images without an obvious center focus. + +
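+ A small sketch of centered masking as described above: patches are kept with a probability that follows a 2D Gaussian over the patch grid, so outer patches are masked more often; the sampling scheme and sigma are assumptions, only the overall idea follows the abstract.
+import numpy as np
+
+def centered_keep_mask(grid_h, grid_w, mask_ratio=0.5, sigma=0.35, rng=None):
+    # Returns a boolean (grid_h, grid_w) array: True = keep the patch, False = mask it out.
+    rng = np.random.default_rng() if rng is None else rng
+    ys = (np.arange(grid_h) + 0.5) / grid_h - 0.5
+    xs = (np.arange(grid_w) + 0.5) / grid_w - 0.5
+    yy, xx = np.meshgrid(ys, xs, indexing="ij")
+    weight = np.exp(-(xx ** 2 + yy ** 2) / (2 * sigma ** 2))     # central patches weighted higher
+    probs = weight.ravel() / weight.sum()
+    n_keep = int(round(grid_h * grid_w * (1 - mask_ratio)))
+    keep_idx = rng.choice(grid_h * grid_w, size=n_keep, replace=False, p=probs)
+    keep = np.zeros(grid_h * grid_w, dtype=bool)
+    keep[keep_idx] = True
+    return keep.reshape(grid_h, grid_w)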
+
+
+
+
+ + ♻ ☆ Physical 3D Adversarial Attacks against Monocular Depth Estimation in + Autonomous Driving CVPR 2024 + + +
+ Deep learning-based monocular depth estimation (MDE), extensively applied in +autonomous driving, is known to be vulnerable to adversarial attacks. Previous +physical attacks against MDE models rely on 2D adversarial patches, so they +only affect a small, localized region in the MDE map but fail under various +viewpoints. To address these limitations, we propose 3D Depth Fool +(3D$^2$Fool), the first 3D texture-based adversarial attack against MDE models. +3D$^2$Fool is specifically optimized to generate 3D adversarial textures +agnostic to model types of vehicles and to have improved robustness in bad +weather conditions, such as rain and fog. Experimental results validate the +superior performance of our 3D$^2$Fool across various scenarios, including +vehicles, MDE models, weather conditions, and viewpoints. Real-world +experiments with printed 3D textures on physical vehicle models further +demonstrate that our 3D$^2$Fool can cause an MDE error of over 10 meters. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Weakly-Supervised Conditional Embedding for Referred Visual Search + + +
+ This paper introduces a new challenge for image similarity search in the +context of fashion, addressing the inherent ambiguity in this domain stemming +from complex images. We present Referred Visual Search (RVS), a task allowing +users to define more precisely the desired similarity, following recent +interest in the industry. We release a new large public dataset, +LAION-RVS-Fashion, consisting of 272k fashion products with 842k images +extracted from LAION, designed explicitly for this task. However, unlike +traditional visual search methods in the industry, we demonstrate that superior +performance can be achieved by bypassing explicit object detection and adopting +weakly-supervised conditional contrastive learning on image tuples. Our method +is lightweight and demonstrates robustness, reaching Recall at one superior to +strong detection-based baselines against 2M distractors. Code, data and models +are available at https://www.github.com/Simon-Lepage/CondViT-LRVSF . + +
+
+ comment: 28 pages, 13 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Multi-criteria Token Fusion with One-step-ahead Attention for Efficient + Vision Transformers CVPR + + +
+ Vision Transformer (ViT) has emerged as a prominent backbone for computer +vision. For more efficient ViTs, recent works lessen the quadratic cost of the +self-attention layer by pruning or fusing the redundant tokens. However, these +works faced the speed-accuracy trade-off caused by the loss of information. +Here, we argue that token fusion needs to consider diverse relations between +tokens to minimize information loss. In this paper, we propose a Multi-criteria +Token Fusion (MCTF), that gradually fuses the tokens based on multi-criteria +(e.g., similarity, informativeness, and size of fused tokens). Further, we +utilize the one-step-ahead attention, which is the improved approach to capture +the informativeness of the tokens. By training the model equipped with MCTF +using a token reduction consistency, we achieve the best speed-accuracy +trade-off in the image classification (ImageNet1K). Experimental results prove +that MCTF consistently surpasses the previous reduction methods with and +without training. Specifically, DeiT-T and DeiT-S with MCTF reduce FLOPs by +about 44% while improving the performance (+0.5%, and +0.3%) over the base +model, respectively. We also demonstrate the applicability of MCTF in various +Vision Transformers (e.g., T2T-ViT, LV-ViT), achieving at least 31% speedup +without performance degradation. Code is available at +https://github.com/mlvlab/MCTF. + +
+
+ comment: Conference on Computer Vision and Pattern Recognition (CVPR), 2024 +
+
+
+
+
+ + ♻ ☆ Task-Adaptive Saliency Guidance for Exemplar-free Class Incremental + Learning CVPR 2024 + + +
+ Exemplar-free Class Incremental Learning (EFCIL) aims to sequentially learn +tasks with access only to data from the current one. EFCIL is of interest +because it mitigates concerns about privacy and long-term storage of data, +while at the same time alleviating the problem of catastrophic forgetting in +incremental learning. In this work, we introduce task-adaptive saliency for +EFCIL and propose a new framework, which we call Task-Adaptive Saliency +Supervision (TASS), for mitigating the negative effects of saliency drift +between different tasks. We first apply boundary-guided saliency to maintain +task adaptivity and \textit{plasticity} on model attention. Besides, we +introduce task-agnostic low-level signals as auxiliary supervision to increase +the \textit{stability} of model attention. Finally, we introduce a module for +injecting and recovering saliency noise to increase the robustness of saliency +preservation. Our experiments demonstrate that our method can better preserve +saliency maps across tasks and achieve state-of-the-art results on the +CIFAR-100, Tiny-ImageNet, and ImageNet-Subset EFCIL benchmarks. Code is +available at \url{https://github.com/scok30/tass}. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ The Effects of Mixed Sample Data Augmentation are Class Dependent + + +
+ Mixed Sample Data Augmentation (MSDA) techniques, such as Mixup, CutMix, and +PuzzleMix, have been widely acknowledged for enhancing performance in a variety +of tasks. A previous study reported the class dependency of traditional data +augmentation (DA), where certain classes benefit disproportionately compared to +others. This paper reveals a class dependent effect of MSDA, where some classes +experience improved performance while others experience degraded performance. +This research addresses the issue of class dependency in MSDA and proposes an +algorithm to mitigate it. The approach involves training on a mixture of MSDA +and non-MSDA data, which not only mitigates the negative impact on the affected +classes, but also improves overall accuracy. Furthermore, we provide in-depth +analysis and discussion of why MSDA introduced class dependencies and which +classes are most likely to have them. + +
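+ A minimal sketch of the proposed mitigation, interleaving MSDA (Mixup here) and unmodified batches at a fixed probability; the 0.5 ratio and Beta(1, 1) mixing are illustrative assumptions.
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+def mixed_msda_loss(model, x, y, num_classes, p_msda=0.5, alpha=1.0):
+    # Train on a mixture of MSDA and non-MSDA batches to soften class-dependent effects.
+    targets = F.one_hot(y, num_classes).float()
+    if np.random.rand() < p_msda:
+        lam = np.random.beta(alpha, alpha)
+        perm = torch.randperm(x.size(0), device=x.device)
+        x = lam * x + (1 - lam) * x[perm]                        # Mixup on inputs
+        targets = lam * targets + (1 - lam) * targets[perm]      # and on soft labels
+    logits = model(x)
+    return -(targets * F.log_softmax(logits, dim=1)).sum(dim=1).mean()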
+
+ comment: 21 pages, 18 figures, Overall Revision +
+
+
+
+
+ + ♻ ☆ Spectral Meets Spatial: Harmonising 3D Shape Matching and Interpolation CVPR2024 + + +
+ Although 3D shape matching and interpolation are highly interrelated, they +are often studied separately and applied sequentially to relate different 3D +shapes, thus resulting in sub-optimal performance. In this work we present a +unified framework to predict both point-wise correspondences and shape +interpolation between 3D shapes. To this end, we combine the deep functional +map framework with classical surface deformation models to map shapes in both +spectral and spatial domains. On the one hand, by incorporating spatial maps, +our method obtains more accurate and smooth point-wise correspondences compared +to previous functional map methods for shape matching. On the other hand, by +introducing spectral maps, our method gets rid of commonly used but +computationally expensive geodesic distance constraints that are only valid for +near-isometric shape deformations. Furthermore, we propose a novel test-time +adaptation scheme to capture both pose-dominant and shape-dominant +deformations. Using different challenging datasets, we demonstrate that our +method outperforms previous state-of-the-art methods for both shape matching +and interpolation, even compared to supervised approaches. + +
+
+ comment: accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ CEIMVEN: An Approach of Cutting Edge Implementation of Modified Versions + of EfficientNet (V1-V2) Architecture for Breast Cancer Detection and + Classification from Ultrasound Images + + +
+ Breast cancer is one of the most widespread and deadly cancers worldwide, affecting millions of women each year and remaining a leading cause of female mortality. In recent research, medical image computing and processing, aided by deep neural networks, has played a significant role in detecting and classifying breast cancer from ultrasound images and mammograms. In this work, named CEIMVEN, we rigorously implement and iteratively analyze modified versions of the cutting-edge EfficientNet architectures, namely EfficientNet-V1 (b0-b7) and EfficientNet-V2 (b0-b3), on ultrasound images. We adopt a transfer learning approach using the pre-trained EfficientNet models, apply hyper-parameter tuning, add fully connected layers, discard outliers, and record the accuracy of our custom modified EfficientNet architectures. Our training approach covers both identifying cancer-affected areas with region-of-interest (ROI) techniques and multi-class classification (benign, malignant and normal). The approximate test accuracies obtained from the modified versions of EfficientNet-V1 (b0- 99.15%, b1- 98.58%, b2- 98.43%, b3- 98.01%, b4- 98.86%, b5- 97.72%, b6- 97.72%, b7- 98.72%) and EfficientNet-V2 (b0- 99.29%, b1- 99.01%, b2- 98.72%, b3- 99.43%) indicate the strong potential of deep learning for the early detection and classification of breast cancer from ultrasound images. The code for this research is available here: https://github.com/ac005sheekar/CEIMVEN-Breast. + +
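+ The transfer-learning setup described above can be sketched with torchvision's EfficientNet weights; the frozen backbone, dropout rate, hidden width, and three-class head below are illustrative assumptions rather than the authors' exact CEIMVEN configuration.
+import torch.nn as nn
+from torchvision import models
+
+def build_classifier(num_classes=3):
+    # Pre-trained EfficientNet-B0 backbone with a replaced fully connected head
+    # for benign / malignant / normal classification of ultrasound images.
+    model = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)
+    for p in model.features.parameters():
+        p.requires_grad = False                         # freeze convolutional features (transfer learning)
+    in_features = model.classifier[1].in_features       # 1280 for EfficientNet-B0
+    model.classifier = nn.Sequential(
+        nn.Dropout(0.3),
+        nn.Linear(in_features, 256),
+        nn.ReLU(),
+        nn.Linear(256, num_classes),
+    )
+    return model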
+
+
+
+
+ + ♻ ☆ ViT-CoMer: Vision Transformer with Convolutional Multi-scale Feature + Interaction for Dense Predictions CVPR2024 + + +
+ Although Vision Transformer (ViT) has achieved significant success in +computer vision, it does not perform well in dense prediction tasks due to the +lack of inner-patch information interaction and the limited diversity of +feature scale. Most existing studies are devoted to designing vision-specific +transformers to solve the above problems, which introduce additional +pre-training costs. Therefore, we present a plain, pre-training-free, and +feature-enhanced ViT backbone with Convolutional Multi-scale feature +interaction, named ViT-CoMer, which facilitates bidirectional interaction +between CNN and transformer. Compared to the state-of-the-art, ViT-CoMer has +the following advantages: (1) We inject spatial pyramid multi-receptive field +convolutional features into the ViT architecture, which effectively alleviates +the problems of limited local information interaction and single-feature +representation in ViT. (2) We propose a simple and efficient CNN-Transformer +bidirectional fusion interaction module that performs multi-scale fusion across +hierarchical features, which is beneficial for handling dense prediction tasks. +(3) We evaluate the performance of ViT-CoMer across various dense prediction +tasks, different frameworks, and multiple advanced pre-training. Notably, our +ViT-CoMer-L achieves 64.3% AP on COCO val2017 without extra training data, and +62.1% mIoU on ADE20K val, both of which are comparable to state-of-the-art +methods. We hope ViT-CoMer can serve as a new backbone for dense prediction +tasks to facilitate future research. The code will be released at +https://github.com/Traffic-X/ViT-CoMer. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ InterControl: Generate Human Motion Interactions by Controlling Every + Joint + + +
+ Text-conditioned human motion synthesis has made remarkable progress with the emergence of diffusion models in recent research. However, the majority of these motion diffusion models are primarily designed for a single character and overlook multi-human interactions. In our approach, we strive to explore this problem by synthesizing human motion with interactions for a group of characters of any size. The key aspect of our approach is the adaptation of human-wise interactions as pairs of human joints that can be either in contact or separated by a desired distance. In contrast to existing methods that necessitate training motion generation models on multi-human motion datasets with a fixed number of characters, our approach inherently possesses the flexibility to model human interactions involving an arbitrary number of individuals, thereby transcending the limitations imposed by the training data. We introduce a novel controllable motion generation method, InterControl, to encourage the synthesized motions to maintain the desired distance between joint pairs. It consists of a motion controller and an inverse kinematics guidance module that realistically and accurately aligns the joints of synthesized characters to the desired locations. Furthermore, we demonstrate that the distance between joint pairs for human-wise interactions can be generated using an off-the-shelf Large Language Model (LLM). Experimental results highlight the capability of our framework to generate interactions with multiple human characters and its potential to work with off-the-shelf physics-based character simulators. + +
+
+ comment: Generate human interactions with only single-person data via joint + contact pairs, code https://github.com/zhenzhiwang/intercontrol +
+
+
+
+
+ + ♻ ☆ SSM Meets Video Diffusion Models: Efficient Video Generation with + Structured State Spaces ICLR 2024 + + +
+ Given the remarkable achievements in image generation through diffusion +models, the research community has shown increasing interest in extending these +models to video generation. Recent diffusion models for video generation have +predominantly utilized attention layers to extract temporal features. However, +attention layers are limited by their memory consumption, which increases +quadratically with the length of the sequence. This limitation presents +significant challenges when attempting to generate longer video sequences using +diffusion models. To overcome this challenge, we propose leveraging state-space +models (SSMs). SSMs have recently gained attention as viable alternatives due +to their linear memory consumption relative to sequence length. In the +experiments, we first evaluate our SSM-based model with UCF101, a standard +benchmark of video generation. In addition, to investigate the potential of +SSMs for longer video generation, we perform an experiment using the MineRL +Navigate dataset, varying the number of frames to 64, 200, and 400. In these +settings, our SSM-based model can considerably save memory consumption for +longer sequences, while maintaining competitive FVD scores to the +attention-based models. Our codes are available at +https://github.com/shim0114/SSM-Meets-Video-Diffusion-Models. + +
+
+ comment: Accepted as workshop paper at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Rotation-Invariant Transformer for Point Cloud Matching CVPR 2023 + + +
+ The intrinsic rotation invariance lies at the core of matching point clouds +with handcrafted descriptors. However, it is widely despised by recent deep +matchers that obtain the rotation invariance extrinsically via data +augmentation. As the finite number of augmented rotations can never span the +continuous SO(3) space, these methods usually show instability when facing +rotations that are rarely seen. To this end, we introduce RoITr, a +Rotation-Invariant Transformer to cope with the pose variations in the point +cloud matching task. We contribute both on the local and global levels. +Starting from the local level, we introduce an attention mechanism embedded +with Point Pair Feature (PPF)-based coordinates to describe the pose-invariant +geometry, upon which a novel attention-based encoder-decoder architecture is +constructed. We further propose a global transformer with rotation-invariant +cross-frame spatial awareness learned by the self-attention mechanism, which +significantly improves the feature distinctiveness and makes the model robust +with respect to the low overlap. Experiments are conducted on both the rigid +and non-rigid public benchmarks, where RoITr outperforms all the +state-of-the-art models by a considerable margin in the low-overlapping +scenarios. Especially when the rotations are enlarged on the challenging +3DLoMatch benchmark, RoITr surpasses the existing methods by at least 13 and 5 +percentage points in terms of Inlier Ratio and Registration Recall, +respectively. + +
+
+ comment: Accepted to CVPR 2023 +
+
+
+
+
+ + ♻ ☆ Extend Your Own Correspondences: Unsupervised Distant Point Cloud + Registration by Progressive Distance Extension CVPR + + +
+ Registration of point clouds collected from a pair of distant vehicles +provides a comprehensive and accurate 3D view of the driving scenario, which is +vital for driving safety related applications, yet existing literature suffers +from the expensive pose label acquisition and the deficiency to generalize to +new data distributions. In this paper, we propose EYOC, an unsupervised distant +point cloud registration method that adapts to new point cloud distributions on +the fly, requiring no global pose labels. The core idea of EYOC is to train a +feature extractor in a progressive fashion, where in each round, the feature +extractor, trained with near point cloud pairs, can label slightly farther +point cloud pairs, enabling self-supervision on such far point cloud pairs. +This process continues until the derived extractor can be used to register +distant point clouds. Particularly, to enable high-fidelity correspondence +label generation, we devise an effective spatial filtering scheme to select the +most representative correspondences to register a point cloud pair, and then +utilize the aligned point clouds to discover more correct correspondences. +Experiments show that EYOC can achieve comparable performance with +state-of-the-art supervised methods at a lower training cost. Moreover, it +outwits supervised methods regarding generalization performance on new data +distributions. + +
+
+ comment: In Proceedings of the IEEE/CVF Conference on Computer Vision and + Pattern Recognition (CVPR), 2024 +
+
+
+
+
+ + ♻ ☆ Foundation Model Makes Clustering A Better Initialization For Cold-Start + Active Learning + + +
+ Active learning selects the most informative samples from the unlabelled +dataset to annotate in the context of a limited annotation budget. While +numerous methods have been proposed for subsequent sample selection based on an +initialized model, scant attention has been paid to the indispensable phase of +active learning: selecting samples for model cold-start initialization. Most of +the previous studies resort to random sampling or naive clustering. However, +random sampling is prone to fluctuation, and naive clustering suffers from +convergence speed, particularly when dealing with high-dimensional data such as +imaging data. In this work, we propose to integrate foundation models with +clustering methods to select samples for cold-start active learning +initialization. Foundation models refer to those trained on massive datasets by +the self-supervised paradigm and capable of generating informative and +compacted embeddings for various downstream tasks. Leveraging these embeddings +to replace raw features such as pixel values, clustering quickly converges and +identifies better initial samples. For a comprehensive comparison, we included +a classic ImageNet-supervised model to acquire embeddings. Experiments on two +clinical tasks of image classification and segmentation demonstrated that +foundation model-based clustering efficiently pinpointed informative initial +samples, leading to models showcasing enhanced performance than the baseline +methods. We envisage that this study provides an effective paradigm for future +cold-start active learning. + +
+
+
+
+
+ + ♻ ☆ DifFlow3D: Toward Robust Uncertainty-Aware Scene Flow Estimation with + Iterative Diffusion-Based Refinement CVPR 2024 + + +
+ Scene flow estimation, which aims to predict per-point 3D displacements of +dynamic scenes, is a fundamental task in the computer vision field. However, +previous works commonly suffer from unreliable correlation caused by locally +constrained searching ranges, and struggle with accumulated inaccuracy arising +from the coarse-to-fine structure. To alleviate these problems, we propose a +novel uncertainty-aware scene flow estimation network (DifFlow3D) with the +diffusion probabilistic model. Iterative diffusion-based refinement is designed +to enhance the correlation robustness and resilience to challenging cases, e.g. +dynamics, noisy inputs, repetitive patterns, etc. To restrain the generation +diversity, three key flow-related features are leveraged as conditions in our +diffusion model. Furthermore, we also develop an uncertainty estimation module +within diffusion to evaluate the reliability of estimated scene flow. Our +DifFlow3D achieves state-of-the-art performance, with 24.0% and 29.1% EPE3D +reduction respectively on FlyingThings3D and KITTI 2015 datasets. Notably, our +method achieves an unprecedented millimeter-level accuracy (0.0078m in EPE3D) +on the KITTI dataset. Additionally, our diffusion-based refinement paradigm can +be readily integrated as a plug-and-play module into existing scene flow +networks, significantly increasing their estimation accuracy. Codes are +released at https://github.com/IRMVLab/DifFlow3D. + +
+
+ comment: Camera-ready version of CVPR 2024. Codes are released at + https://github.com/IRMVLab/DifFlow3D +
+
+
+
+
+ + ♻ ☆ Task-wise Sampling Convolutions for Arbitrary-Oriented Object Detection + in Aerial Images + + +
+ Arbitrary-oriented object detection (AOOD) has been widely applied to locate +and classify objects with diverse orientations in remote sensing images. +However, the inconsistent features for the localization and classification +tasks in AOOD models may lead to ambiguity and low-quality object predictions, +which constrains the detection performance. In this article, an AOOD method +called task-wise sampling convolutions (TS-Conv) is proposed. TS-Conv +adaptively samples task-wise features from respective sensitive regions and +maps these features together in alignment to guide a dynamic label assignment +for better predictions. Specifically, sampling positions of the localization +convolution in TS-Conv are supervised by the oriented bounding box (OBB) +prediction associated with spatial coordinates, while sampling positions and +convolutional kernel of the classification convolution are designed to be +adaptively adjusted according to different orientations for improving the +orientation robustness of features. Furthermore, a dynamic +task-consistent-aware label assignment (DTLA) strategy is developed to select +optimal candidate positions and assign labels dynamically according to ranked +task-aware scores obtained from TS-Conv. Extensive experiments on several +public datasets covering multiple scenes, multimodal images, and multiple +categories of objects demonstrate the effectiveness, scalability, and superior +performance of the proposed TS-Conv. + +
+
+ comment: 15 pages, 13 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ FSC: Few-point Shape Completion CVPR 2024 + + +
+ While previous studies have demonstrated successful 3D object shape +completion with a sufficient number of points, they often fail in scenarios +when a few points, e.g. tens of points, are observed. Surprisingly, via entropy +analysis, we find that even a few points, e.g. 64 points, could retain +substantial information to help recover the 3D shape of the object. To address +the challenge of shape completion with very sparse point clouds, we then +propose Few-point Shape Completion (FSC) model, which contains a novel +dual-branch feature extractor for handling extremely sparse inputs, coupled +with an extensive branch for maximal point utilization with a saliency branch +for dynamic importance assignment. This model is further bolstered by a +two-stage revision network that refines both the extracted features and the +decoder output, enhancing the detail and authenticity of the completed point +cloud. Our experiments demonstrate the feasibility of recovering 3D shapes from +a few points. The proposed Few-point Shape Completion (FSC) model outperforms +previous methods on both few-point inputs and many-point inputs, and shows good +generalizability to different object categories. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ EMAGE: Towards Unified Holistic Co-Speech Gesture Generation via + Expressive Masked Audio Gesture Modeling CVPR + + +
+ We propose EMAGE, a framework to generate full-body human gestures from audio +and masked gestures, encompassing facial, local body, hands, and global +movements. To achieve this, we first introduce BEAT2 (BEAT-SMPLX-FLAME), a new +mesh-level holistic co-speech dataset. BEAT2 combines MoShed SMPLX body with +FLAME head parameters and further refines the modeling of head, neck, and +finger movements, offering a community-standardized, high-quality 3D motion +captured dataset. EMAGE leverages masked body gesture priors during training to +boost inference performance. It involves a Masked Audio Gesture Transformer, +facilitating joint training on audio-to-gesture generation and masked gesture +reconstruction to effectively encode audio and body gesture hints. Encoded body +hints from masked gestures are then separately employed to generate facial and +body movements. Moreover, EMAGE adaptively merges speech features from the +audio's rhythm and content and utilizes four compositional VQ-VAEs to enhance +the results' fidelity and diversity. Experiments demonstrate that EMAGE +generates holistic gestures with state-of-the-art performance and is flexible +in accepting predefined spatial-temporal gesture inputs, generating complete, +audio-synchronized results. Our code and dataset are available at +https://pantomatrix.github.io/EMAGE/ + +
+
+ comment: Conflict of Interest Disclosure; CVPR Camera Ready; Project Page: + https://pantomatrix.github.io/EMAGE/ +
+
+
+
+
+ + ♻ ☆ PPAD: Iterative Interactions of Prediction and Planning for End-to-end + Autonomous Driving + + +
+ We present a new interaction mechanism of prediction and planning for +end-to-end autonomous driving, called PPAD (Iterative Interaction of Prediction +and Planning Autonomous Driving), which considers the timestep-wise interaction +to better integrate prediction and planning. An ego vehicle performs motion +planning at each timestep based on the trajectory prediction of surrounding +agents (e.g., vehicles and pedestrians) and its local road conditions. Unlike +existing end-to-end autonomous driving frameworks, PPAD models the interactions +among ego, agents, and the dynamic environment in an autoregressive manner by +interleaving the Prediction and Planning processes at every timestep, instead +of a single sequential process of prediction followed by planning. +Specifically, we design ego-to-agent, ego-to-map, and ego-to-BEV interaction +mechanisms with hierarchical dynamic key objects attention to better model the +interactions. The experiments on the nuScenes benchmark show that our approach +outperforms state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ SchurVINS: Schur Complement-Based Lightweight Visual Inertial Navigation + System + + +
+ Accuracy and computational efficiency are the most important metrics to +Visual Inertial Navigation System (VINS). The existing VINS algorithms with +either high accuracy or low computational complexity, are difficult to provide +the high precision localization in resource-constrained devices. To this end, +we propose a novel filter-based VINS framework named SchurVINS, which could +guarantee both high accuracy by building a complete residual model and low +computational complexity with Schur complement. Technically, we first formulate +the full residual model where Gradient, Hessian and observation covariance are +explicitly modeled. Then Schur complement is employed to decompose the full +model into ego-motion residual model and landmark residual model. Finally, +Extended Kalman Filter (EKF) update is implemented in these two models with +high efficiency. Experiments on EuRoC and TUM-VI datasets show that our method +notably outperforms state-of-the-art (SOTA) methods in both accuracy and +computational complexity. The experimental code of SchurVINS is available at +https://github.com/bytedance/SchurVINS. + +
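+ The Schur complement step at the core of the framework above reduces the full linear system to an ego-motion block by marginalizing the landmark block; the dense numpy sketch below shows only the algebra, with an assumed state ordering, and is not the SchurVINS implementation.
+import numpy as np
+
+def schur_marginalize(H, b, n_pose):
+    # Full system H dx = b, state ordered as [ego-motion (n_pose dims), landmarks (rest)].
+    Hpp, Hpl = H[:n_pose, :n_pose], H[:n_pose, n_pose:]
+    Hlp, Hll = H[n_pose:, :n_pose], H[n_pose:, n_pose:]
+    bp, bl = b[:n_pose], b[n_pose:]
+    Hll_inv = np.linalg.inv(Hll)                 # landmark block is block-diagonal/sparse in practice
+    H_pose = Hpp - Hpl @ Hll_inv @ Hlp           # Schur complement: reduced ego-motion residual model
+    b_pose = bp - Hpl @ Hll_inv @ bl
+    return H_pose, b_pose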
+
+
+
+
+ + ♻ ☆ Generalized Logit Adjustment: Calibrating Fine-tuned Models by Removing + Label Bias in Foundation Models NeurIPS2023 + + +
+ Foundation models like CLIP allow zero-shot transfer on various tasks without additional training data. Yet, the zero-shot performance is less competitive than a fully supervised one. Thus, to enhance the performance, fine-tuning and ensembling are also commonly adopted to better fit the downstream tasks. However, we argue that such prior work has overlooked the inherent biases in foundation models. Due to the highly imbalanced Web-scale training set, these foundation models are inevitably skewed toward frequent semantics, and thus the subsequent fine-tuning or ensembling is still biased. In this study, we systematically examine the biases in foundation models and demonstrate the efficacy of our proposed Generalized Logit Adjustment (GLA) method. Note that bias estimation in foundation models is challenging, as most pre-train data cannot be explicitly accessed like in traditional long-tailed classification tasks. To this end, GLA has an optimization-based bias estimation approach for debiasing foundation models. As our work resolves a fundamental flaw in the pre-training, the proposed GLA demonstrates significant improvements across a diverse range of tasks: it achieves 1.5 pp accuracy gains on ImageNet, a large average improvement (1.4-4.6 pp) on 11 few-shot datasets, and 2.4 pp gains on long-tailed classification. Codes are in \url{https://github.com/BeierZhu/GLA}. + +
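+ For intuition, classic logit adjustment subtracts a scaled log class-prior from the logits; GLA differs in that the foundation model's label prior is not directly observable and is estimated by optimization, so the sketch below, which assumes the prior is given, is only a simplified illustration.
+import torch
+
+def adjust_logits(logits, class_prior, tau=1.0):
+    # logits: (B, C); class_prior: (C,) estimated label distribution of the (pre-)training data.
+    # Subtracting the log-prior counteracts the skew toward frequent classes.
+    return logits - tau * torch.log(class_prior + 1e-12)
+
+# usage: preds = adjust_logits(model(x), estimated_prior).argmax(dim=1)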
+
+ comment: V2 proposed a more effective method for label distribution + estimation. V1 fixed a typo in abstract; Accepted by NeurIPS2023 +
+
+
+
+
+ + ♻ ☆ BridgeTower: Building Bridges Between Encoders in Vision-Language + Representation Learning AAAI 2023 + + +
+ Vision-Language (VL) models with the Two-Tower architecture have dominated +visual-language representation learning in recent years. Current VL models +either use lightweight uni-modal encoders and learn to extract, align and fuse +both modalities simultaneously in a deep cross-modal encoder, or feed the +last-layer uni-modal representations from the deep pre-trained uni-modal +encoders into the top cross-modal encoder. Both approaches potentially restrict +vision-language representation learning and limit model performance. In this +paper, we propose BridgeTower, which introduces multiple bridge layers that +build a connection between the top layers of uni-modal encoders and each layer +of the cross-modal encoder. This enables effective bottom-up cross-modal +alignment and fusion between visual and textual representations of different +semantic levels of pre-trained uni-modal encoders in the cross-modal encoder. +Pre-trained with only 4M images, BridgeTower achieves state-of-the-art +performance on various downstream vision-language tasks. In particular, on the +VQAv2 test-std set, BridgeTower achieves an accuracy of 78.73%, outperforming +the previous state-of-the-art model METER by 1.09% with the same pre-training +data and almost negligible additional parameters and computational costs. +Notably, when further scaling the model, BridgeTower achieves an accuracy of +81.15%, surpassing models that are pre-trained on orders-of-magnitude larger +datasets. Code and checkpoints are available at +https://github.com/microsoft/BridgeTower. + +
+
+ comment: Accepted by AAAI 2023, Oral +
+
+
+
+
+ + ♻ ☆ Scalable and Robust Transformer Decoders for Interpretable Image + Classification with Foundation Models + + +
+ Interpretable computer vision models can produce transparent predictions, +where the features of an image are compared with prototypes from a training +dataset and the similarity between them forms a basis for classification. +Nevertheless these methods are computationally expensive to train, introduce +additional complexity and may require domain knowledge to adapt +hyper-parameters to a new dataset. Inspired by developments in object +detection, segmentation and large-scale self-supervised foundation vision +models, we introduce Component Features (ComFe), a novel explainable-by-design +image classification approach using a transformer-decoder head and hierarchical +mixture-modelling. With only global image labels and no segmentation or part +annotations, ComFe can identify consistent image components, such as the head, +body, wings and tail of a bird, and the image background, and determine which +of these features are informative in making a prediction. We demonstrate that +ComFe obtains higher accuracy compared to previous interpretable models across +a range of fine-grained vision benchmarks, without the need to individually +tune hyper-parameters for each dataset. We also show that ComFe outperforms a +non-interpretable linear head across a range of datasets, including ImageNet, +and improves performance on generalisation and robustness benchmarks. + +
+
+
+
+
+ + ♻ ☆ Discovering and Mitigating Visual Biases through Keyword Explanation CVPR 2024 + + +
+ Addressing biases in computer vision models is crucial for real-world AI +deployments. However, mitigating visual biases is challenging due to their +unexplainable nature, often identified indirectly through visualization or +sample statistics, which necessitates additional human supervision for +interpretation. To tackle this issue, we propose the Bias-to-Text (B2T) +framework, which interprets visual biases as keywords. Specifically, we extract +common keywords from the captions of mispredicted images to identify potential +biases in the model. We then validate these keywords by measuring their +similarity to the mispredicted images using a vision-language scoring model. +The keyword explanation form of visual bias offers several advantages, such as +a clear group naming for bias discovery and a natural extension for debiasing +using these group names. Our experiments demonstrate that B2T can identify +known biases, such as gender bias in CelebA, background bias in Waterbirds, and +distribution shifts in ImageNet-R/C. Additionally, B2T uncovers novel biases in +larger datasets, such as Dollar Street and ImageNet. For example, we discovered +a contextual bias between "bee" and "flower" in ImageNet. We also highlight +various applications of B2T keywords, including debiased training, CLIP +prompting, and model comparison. + +
+
+ comment: CVPR 2024. First two authors contributed equally +
+
+
+
+
+ + ♻ ☆ Few-shot Learner Parameterization by Diffusion Time-steps CVPR 2024 + + +
+ Even when using large multi-modal foundation models, few-shot learning is +still challenging -- if there is no proper inductive bias, it is nearly +impossible to keep the nuanced class attributes while removing the visually +prominent attributes that spuriously correlate with class labels. To this end, +we find an inductive bias that the time-steps of a Diffusion Model (DM) can +isolate the nuanced class attributes, i.e., as the forward diffusion adds noise +to an image at each time-step, nuanced attributes are usually lost at an +earlier time-step than the spurious attributes that are visually prominent. +Building on this, we propose Time-step Few-shot (TiF) learner. We train +class-specific low-rank adapters for a text-conditioned DM to make up for the +lost attributes, such that images can be accurately reconstructed from their +noisy ones given a prompt. Hence, at a small time-step, the adapter and prompt +are essentially a parameterization of only the nuanced class attributes. For a +test image, we can use the parameterization to only extract the nuanced class +attributes for classification. TiF learner significantly outperforms OpenCLIP +and its adapters on a variety of fine-grained and customized few-shot learning +tasks. Codes are in https://github.com/yue-zhongqi/tif. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Unified Sequence-to-Sequence Learning for Single- and Multi-Modal Visual + Object Tracking CVPR2023 + + +
+ In this paper, we introduce a new sequence-to-sequence learning framework for +RGB-based and multi-modal object tracking. First, we present SeqTrack for +RGB-based tracking. It casts visual tracking as a sequence generation task, +forecasting object bounding boxes in an autoregressive manner. This differs +from previous trackers, which depend on the design of intricate head networks, +such as classification and regression heads. SeqTrack employs a basic +encoder-decoder transformer architecture. The encoder utilizes a bidirectional +transformer for feature extraction, while the decoder generates bounding box +sequences autoregressively using a causal transformer. The loss function is a +plain cross-entropy. Second, we introduce SeqTrackv2, a unified +sequence-to-sequence framework for multi-modal tracking tasks. Expanding upon +SeqTrack, SeqTrackv2 integrates a unified interface for auxiliary modalities +and a set of task-prompt tokens to specify the task. This enables it to manage +multi-modal tracking tasks using a unified model and parameter set. This +sequence learning paradigm not only simplifies the tracking framework, but also +showcases superior performance across 14 challenging benchmarks spanning five +single- and multi-modal tracking tasks. The code and models are available at +https://github.com/chenxin-dlut/SeqTrackv2. + +
+
+ comment: This is a new expanded version of our previous CVPR2023 paper + "SeqTrack: Sequence to Sequence Learning for Visual Object Tracking." + SeqTrackv2 extends SeqTrack to four multi-modal tracking tasks with a unified + model and parameter set +
+
+
+
+
+ + ♻ ☆ Generative 3D Part Assembly via Part-Whole-Hierarchy Message Passing + + +
+ Generative 3D part assembly involves understanding part relationships and predicting their 6-DoF poses for assembling a realistic 3D shape. Prior work often focuses on the geometry of individual parts, neglecting part-whole hierarchies of objects. Leveraging two key observations: 1) super-part poses provide strong hints about part poses, and 2) predicting super-part poses is easier due to fewer super-parts, we propose a part-whole-hierarchy message passing network for efficient 3D part assembly. We first introduce super-parts by grouping geometrically similar parts without any semantic labels. Then we employ a part-whole hierarchical encoder, wherein a super-part encoder predicts latent super-part poses based on input parts. Subsequently, we transform the point cloud using the latent poses, feeding it to the part encoder for aggregating super-part information and reasoning about part relationships to predict all part poses. In training, only ground-truth part poses are required. During inference, the predicted latent poses of super-parts enhance interpretability. Experimental results on the PartNet dataset show that our method achieves state-of-the-art performance in part and connectivity accuracy and enables an interpretable hierarchical part assembly. Code is available at https://github.com/pkudba/3DHPA. + +
+
+
+
+
+ + ♻ ☆ Distilling ODE Solvers of Diffusion Models into Smaller Steps + + +
+ Diffusion models have recently gained prominence as a novel category of generative models. Despite their success, these models face a notable drawback in terms of slow sampling speeds, requiring a high number of function evaluations (NFE) in the order of hundreds or thousands. In response, both learning-free and learning-based sampling strategies have been explored to expedite the sampling process. Learning-free sampling employs various ordinary differential equation (ODE) solvers based on the formulation of diffusion ODEs. However, it encounters challenges in faithfully tracking the true sampling trajectory, particularly for small NFE. Conversely, learning-based sampling methods, such as knowledge distillation, demand extensive additional training, limiting their practical applicability. To overcome these limitations, we introduce Distilled-ODE solvers (D-ODE solvers), a straightforward distillation approach grounded in ODE solver formulations. Our method seamlessly integrates the strengths of both learning-free and learning-based sampling. D-ODE solvers are constructed by introducing a single parameter adjustment to existing ODE solvers. Furthermore, we optimize D-ODE solvers with smaller steps using knowledge distillation from ODE solvers with larger steps across a batch of samples. Comprehensive experiments demonstrate the superior performance of D-ODE solvers compared to existing ODE solvers, including DDIM, PNDM, DPM-Solver, DEIS, and EDM, particularly in scenarios with fewer NFE. Notably, our method incurs negligible computational overhead compared to previous distillation techniques, facilitating straightforward and rapid integration with existing samplers. Qualitative analysis reveals that D-ODE solvers not only enhance image quality but also faithfully follow the target ODE trajectory. + +
+
+
+
+
+ + ♻ ☆ DiffPrompter: Differentiable Implicit Visual Prompts for + Semantic-Segmentation in Adverse Conditions + + +
+ Semantic segmentation in adverse weather scenarios is a critical task for +autonomous driving systems. While foundation models have shown promise, the +need for specialized adaptors becomes evident for handling more challenging +scenarios. We introduce DiffPrompter, a novel differentiable visual and latent +prompting mechanism aimed at expanding the learning capabilities of existing +adaptors in foundation models. Our proposed $\nabla$HFC image processing block +excels particularly in adverse weather conditions, where conventional methods +often fall short. Furthermore, we investigate the advantages of jointly +training visual and latent prompts, demonstrating that this combined approach +significantly enhances performance in out-of-distribution scenarios. Our +differentiable visual prompts leverage parallel and series architectures to +generate prompts, effectively improving object segmentation tasks in adverse +conditions. Through a comprehensive series of experiments and evaluations, we +provide empirical evidence to support the efficacy of our approach. Project +page at https://diffprompter.github.io. + +
+
+
+
+
+ + ♻ ☆ Language Models are Free Boosters for Biomedical Imaging Tasks + + +
+ In this study, we uncover the unexpected efficacy of residual-based large +language models (LLMs) as part of encoders for biomedical imaging tasks, a +domain traditionally devoid of language or textual data. The approach diverges +from established methodologies by utilizing a frozen transformer block, +extracted from pre-trained LLMs, as an innovative encoder layer for the direct +processing of visual tokens. This strategy represents a significant departure +from the standard multi-modal vision-language frameworks, which typically hinge +on language-driven prompts and inputs. We found that these LLMs could boost +performance across a spectrum of biomedical imaging applications, including +both 2D and 3D visual classification tasks, serving as plug-and-play boosters. +More interestingly, as a byproduct, we found that the proposed framework +achieved superior performance, setting new state-of-the-art results on +extensive, standardized datasets in MedMNIST-2D and 3D. Through this work, we +aim to open new avenues for employing LLMs in biomedical imaging and enriching +the understanding of their potential in this specialized domain. + +
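+ A rough sketch of the idea above: project visual tokens to the LLM's hidden width, pass them through a frozen pre-trained transformer block, and add the result back as a residual; the GPT-2 block choice, projection sizes, and residual wiring are assumptions for illustration and may differ from the paper's architecture.
+import torch.nn as nn
+from transformers import GPT2Model
+
+class FrozenLLMBooster(nn.Module):
+    def __init__(self, vis_dim=384, llm_dim=768):
+        super().__init__()
+        self.proj_in = nn.Linear(vis_dim, llm_dim)
+        self.proj_out = nn.Linear(llm_dim, vis_dim)
+        self.block = GPT2Model.from_pretrained("gpt2").h[0]   # one frozen pre-trained transformer block
+        for p in self.block.parameters():
+            p.requires_grad = False
+
+    def forward(self, vis_tokens):                            # vis_tokens: (B, N, vis_dim)
+        h = self.block(self.proj_in(vis_tokens))[0]           # the block returns a tuple; take hidden states
+        return vis_tokens + self.proj_out(h)                  # residual connection back to the visual width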
+
+
+
+
+ + ♻ ☆ Contrastive Pre-Training with Multi-View Fusion for No-Reference Point + Cloud Quality Assessment + + +
+ No-reference point cloud quality assessment (NR-PCQA) aims to automatically +evaluate the perceptual quality of distorted point clouds without available +reference, which have achieved tremendous improvements due to the utilization +of deep neural networks. However, learning-based NR-PCQA methods suffer from +the scarcity of labeled data and usually perform suboptimally in terms of +generalization. To solve the problem, we propose a novel contrastive +pre-training framework tailored for PCQA (CoPA), which enables the pre-trained +model to learn quality-aware representations from unlabeled data. To obtain +anchors in the representation space, we project point clouds with different +distortions into images and randomly mix their local patches to form mixed +images with multiple distortions. Utilizing the generated anchors, we constrain +the pre-training process via a quality-aware contrastive loss following the +philosophy that perceptual quality is closely related to both content and +distortion. Furthermore, in the model fine-tuning stage, we propose a +semantic-guided multi-view fusion module to effectively integrate the features +of projected images from multiple perspectives. Extensive experiments show that +our method outperforms the state-of-the-art PCQA methods on popular benchmarks. +Further investigations demonstrate that CoPA can also benefit existing +learning-based PCQA models. + +
+
+
+
+
+ + ♻ ☆ LLMs in Political Science: Heralding a New Era of Visual Analysis + + +
+ Interest is increasing among political scientists in leveraging the extensive +information available in images. However, the challenge of interpreting these +images lies in the need for specialized knowledge in computer vision and access +to specialized hardware. As a result, image analysis has been limited to a +relatively small group within the political science community. This landscape +could potentially change thanks to the rise of large language models (LLMs). +This paper aims to raise awareness of the feasibility of using Gemini for image +content analysis. A retrospective analysis was conducted on a corpus of 688 +images. Content reports were elicited from Gemini for each image and then +manually evaluated by the authors. We find that Gemini is highly accurate in +performing object detection, which is arguably the most common and fundamental +task in image analysis for political scientists. Equally important, we show +that it is easy to implement as the entire command consists of a single prompt +in natural language; it is fast to run and should meet the time budget of most +researchers; and it is free to use and does not require any specialized +hardware. In addition, we illustrate how political scientists can leverage +Gemini for other image understanding tasks, including face identification, +sentiment analysis, and caption generation. Our findings suggest that Gemini +and other similar LLMs have the potential to drastically stimulate and +accelerate image research in political science and social sciences more +broadly. + +
+
+ comment: 7 pages, 3 tables +
+
+
+
+
+ + ♻ ☆ Trustworthy Self-Attention: Enabling the Network to Focus Only on the + Most Relevant References + + +
+ The prediction of optical flow for occluded points is still a difficult +problem that has not yet been solved. Recent methods use self-attention to find +relevant non-occluded points as references for estimating the optical flow of +occluded points based on the assumption of self-similarity. However, they rely +on visual features of a single image and weak constraints, which are not +sufficient to prevent the trained network from focusing on erroneous and weakly +relevant reference points. We make full use of online occlusion recognition +information to construct occlusion-extended visual features and two strong +constraints, allowing the network to learn to focus only on the most relevant +references without requiring occlusion ground truth to participate in the +training of the network. Our method adds very few network parameters to the +original framework, making it very lightweight. Extensive experiments show that +our model has the greatest cross-dataset generalization. Our method achieves +much greater error reductions of 18.6%, 16.2%, and 20.1% for all points, +non-occluded points, and occluded points, respectively, over the +state-of-the-art GMA-based method, MatchFlow(GMA), on the Sintel Albedo pass. +Furthermore, our model achieves state-of-the-art performance on the Sintel +benchmarks, ranking \#1 among all published methods on the Sintel clean pass. +The code will be open-sourced.
+
+ comment: Correct Figure 1 +
+
+
+
+
+ + ♻ ☆ Dyadic Interaction Modeling for Social Behavior Generation + + +
+ Human-human communication is like a delicate dance where listeners and +speakers concurrently interact to maintain conversational dynamics. Hence, an +effective model for generating listener nonverbal behaviors requires +understanding the dyadic context and interaction. In this paper, we present an +effective framework for creating 3D facial motions in dyadic interactions. +Existing work considers the listener as a reactive agent with reflexive +behaviors in response to the speaker's voice and facial motions. The heart of +our framework is Dyadic Interaction Modeling (DIM), a pre-training approach +that jointly models speakers' and listeners' motions through masking and +contrastive learning to learn representations that capture the dyadic context. +To enable the generation of non-deterministic behaviors, we encode both +listener and speaker motions into discrete latent representations through a +VQ-VAE. The pre-trained model is further fine-tuned for motion generation. +Extensive experiments demonstrate the superiority of our framework in +generating listener motions, establishing a new state of the art according to +the quantitative measures capturing the diversity and realism of generated +motions. Qualitative results demonstrate the superior capabilities of the +proposed approach in generating diverse and realistic expressions, eye blinks +and head gestures.
+
+
+
+
+ + ♻ ☆ Hybrid Video Diffusion Models with 2D Triplane and 3D Wavelet + Representation + + +
+ Generating high-quality videos that synthesize desired realistic content is a +challenging task due to the intricate high-dimensionality and complexity of +videos. Several recent diffusion-based methods have shown comparable +performance by compressing videos to a lower-dimensional latent space, using a +traditional video autoencoder architecture. However, such methods, which employ +standard frame-wise 2D and 3D convolutions, fail to fully exploit the +spatio-temporal nature of videos. To address this issue, we propose a novel +hybrid video diffusion model, called HVDM, which can capture spatio-temporal +dependencies more effectively. The HVDM is trained by a hybrid video +autoencoder which extracts a disentangled representation of the video +including: (i) global context information captured by a 2D projected latent, +(ii) local volume information captured by 3D convolutions with wavelet +decomposition, and (iii) frequency information for improving the video +reconstruction. Based on this disentangled representation, our hybrid +autoencoder provides a more comprehensive video latent, enriching the generated +videos with fine structures and details. Experiments on video generation +benchmarks (UCF101, SkyTimelapse, and TaiChi) demonstrate that the proposed +approach achieves state-of-the-art video generation quality, showing a wide +range of video applications (e.g., long video generation, image-to-video, and +video dynamics control).
+
+ comment: 17 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Deep Learning-Driven Approach for Handwritten Chinese Character + Classification + + +
+ Handwritten character recognition (HCR) is a challenging problem for machine +learning researchers. Unlike printed text data, handwritten character datasets +have more variation due to human-introduced bias. With numerous unique +character classes present, some data, such as logographic scripts or +Sino-Korean character sequences, bring new complications to the HCR problem. +The classification task on such datasets requires the model to learn +high-complexity details of images that share similar features. With recent +advances in computational resource availability and further computer vision +theory development, some research teams have effectively addressed the arising +challenges. Although known for achieving high accuracy while keeping the number +of parameters small, many common approaches are still not generalizable and use +dataset-specific solutions to achieve better results. Due to their complex +structure, such existing solutions frequently fail to gain popularity. This +paper proposes a highly scalable approach for detailed character image +classification by introducing the model architecture, data preprocessing steps, +and testing design instructions. We also perform experiments to compare the +performance of our method with that of existing ones to show the improvements +achieved.
+
+ comment: 30 pages, 9 figures, 2 tables, preprint v2 +
+
+
+
+
+ + ♻ ☆ X-Portrait: Expressive Portrait Animation with Hierarchical Motion + Attention + + +
+ We propose X-Portrait, an innovative conditional diffusion model tailored for +generating expressive and temporally coherent portrait animation. Specifically, +given a single portrait as appearance reference, we aim to animate it with +motion derived from a driving video, capturing both highly dynamic and subtle +facial expressions along with wide-range head movements. At its core, we +leverage the generative prior of a pre-trained diffusion model as the rendering +backbone, while achieving fine-grained head pose and expression control with +novel control signals within the framework of ControlNet. In contrast to +conventional coarse explicit controls such as facial landmarks, our motion +control module learns to interpret the dynamics directly from the original +driving RGB inputs. The motion accuracy is further enhanced with a patch-based +local control module that effectively enhances the motion attention to +small-scale nuances like eyeball positions. Notably, to mitigate the identity +leakage from the driving signals, we train our motion control modules with +scaling-augmented cross-identity images, ensuring maximized disentanglement +from the appearance reference modules. Experimental results demonstrate the +universal effectiveness of X-Portrait across a diverse range of facial +portraits and expressive driving sequences, and showcase its proficiency in +generating captivating portrait animations with consistently maintained +identity characteristics.
+
+
+
+
+ + ♻ ☆ Targeted collapse regularized autoencoder for anomaly detection: black + hole at the center + + +
+ Autoencoders have been extensively used in the development of recent anomaly +detection techniques. The premise of their application is based on the notion +that after training the autoencoder on normal training data, anomalous inputs +will exhibit a significant reconstruction error. Consequently, this enables a +clear differentiation between normal and anomalous samples. In practice, +however, it is observed that autoencoders can generalize beyond the normal +class and achieve a small reconstruction error on some of the anomalous +samples. To improve the performance, various techniques propose additional +components and more sophisticated training procedures. In this work, we propose +a remarkably straightforward alternative: instead of adding neural network +components, involved computations, and cumbersome training, we complement the +reconstruction loss with a computationally light term that regulates the norm +of representations in the latent space. The simplicity of our approach +minimizes the requirement for hyperparameter tuning and customization for new +applications which, paired with its permissive data modality constraint, +enhances the potential for successful adoption across a broad range of +applications. We test the method on various visual and tabular benchmarks and +demonstrate that the technique matches and frequently outperforms more complex +alternatives. We further demonstrate that implementing this idea in the context +of state-of-the-art methods can further improve their performance. We also +provide a theoretical analysis and numerical simulations that help demonstrate +the underlying process that unfolds during training and how it helps with +anomaly detection. This mitigates the black-box nature of autoencoder-based +anomaly detection algorithms and offers an avenue for further investigation of +advantages, fail cases, and potential new directions. + +
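Because the proposed regularizer is described as a computationally light term on the latent representations added to the reconstruction loss, a minimal sketch is easy to give. The exact form of the penalty (a mean L2 norm) and its weight are assumptions of this illustration, not the paper's implementation.

```python
import torch
import torch.nn as nn

class TinyAE(nn.Module):
    def __init__(self, in_dim=784, latent_dim=32):
        super().__init__()
        self.enc = nn.Sequential(nn.Linear(in_dim, 128), nn.ReLU(), nn.Linear(128, latent_dim))
        self.dec = nn.Sequential(nn.Linear(latent_dim, 128), nn.ReLU(), nn.Linear(128, in_dim))

    def forward(self, x):
        z = self.enc(x)
        return self.dec(z), z

def regularized_loss(x, x_hat, z, lam=0.1):
    # Reconstruction error plus a light term pulling latent norms toward zero
    # (the precise form of the norm penalty is an assumption of this sketch).
    recon = ((x - x_hat) ** 2).mean()
    norm_term = z.norm(dim=1).mean()
    return recon + lam * norm_term

model = TinyAE()
x = torch.rand(16, 784)            # batch of "normal" training samples
x_hat, z = model(x)
regularized_loss(x, x_hat, z).backward()

# At test time, the anomaly score is typically the per-sample reconstruction error.
score = ((x - model(x)[0]) ** 2).mean(dim=1)
```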
+
+ comment: 18 pages, 4 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ PIE-NeRF: Physics-based Interactive Elastodynamics with NeRF + + +
+ We show that physics-based simulations can be seamlessly integrated with NeRF +to generate high-quality elastodynamics of real-world objects. Unlike existing +methods, we discretize nonlinear hyperelasticity in a meshless way, obviating +the necessity for intermediate auxiliary shape proxies like a tetrahedral mesh +or voxel grid. A quadratic generalized moving least square (Q-GMLS) is employed +to capture nonlinear dynamics and large deformation on the implicit model. Such +meshless integration enables versatile simulations of complex and codimensional +shapes. We adaptively place the least-square kernels according to the NeRF +density field to significantly reduce the complexity of the nonlinear +simulation. As a result, physically realistic animations can be conveniently +synthesized using our method for a wide range of hyperelastic materials at an +interactive rate. For more information, please visit our project page at +https://fytalon.github.io/pienerf/. + +
+
+
+
+
+ + ♻ ☆ Towards Fairness-Aware Adversarial Learning CVPR 2024 + + +
+ Although adversarial training (AT) has proven effective in enhancing the +model's robustness, the recently revealed issue of fairness in robustness has +not been well addressed, i.e. the robust accuracy varies significantly among +different categories. In this paper, instead of uniformly evaluating the +model's average class performance, we delve into the issue of robust fairness, +by considering the worst-case distribution across various classes. We propose a +novel learning paradigm, named Fairness-Aware Adversarial Learning (FAAL). As a +generalization of conventional AT, we re-define the problem of adversarial +training as a min-max-max framework, to ensure both robustness and fairness of +the trained model. Specifically, by taking advantage of distributional robust +optimization, our method aims to find the worst distribution among different +categories, and the solution is guaranteed to obtain the upper bound +performance with high probability. In particular, FAAL can fine-tune an unfair +robust model to be fair within only two epochs, without compromising the +overall clean and robust accuracies. Extensive experiments on various image +datasets validate the superior performance and efficiency of the proposed FAAL +compared to other state-of-the-art methods. + +
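A rough sketch of the min-max-max idea follows: per-class adversarial losses are reweighted by a worst-case distribution over classes. The softmax surrogate used for that inner step is an assumption made for illustration; FAAL itself solves a constrained distributionally robust optimization problem.

```python
import torch
import torch.nn.functional as F

def worst_case_class_weights(per_class_loss, temperature=1.0):
    """Crude surrogate for the inner distributionally robust step: up-weight the
    classes with the highest adversarial loss (softmax surrogate, an assumption)."""
    return F.softmax(per_class_loss / temperature, dim=0)

def fairness_aware_loss(logits, labels, num_classes):
    # Per-sample cross-entropy on (already adversarially perturbed) inputs.
    ce = F.cross_entropy(logits, labels, reduction="none")
    per_class = torch.stack([
        ce[labels == c].mean() if (labels == c).any() else torch.zeros(())
        for c in range(num_classes)
    ])
    w = worst_case_class_weights(per_class.detach())
    return (w * per_class).sum()   # worst-case-weighted class losses

# Toy usage: a 10-class batch of adversarial logits.
logits, labels = torch.randn(64, 10, requires_grad=True), torch.randint(0, 10, (64,))
fairness_aware_loss(logits, labels, 10).backward()
```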
+
+ comment: This work will appear in the CVPR 2024 conference proceedings +
+
+
+
+
+ + ♻ ☆ Multi-modal Misinformation Detection: Approaches, Challenges and + Opportunities + + +
+ As social media platforms are evolving from text-based forums into +multi-modal environments, the nature of misinformation in social media is also +transforming accordingly. Taking advantage of the fact that visual modalities +such as images and videos are more favorable and attractive to the users and +textual contents are sometimes skimmed carelessly, misinformation spreaders +have recently targeted contextual connections between the modalities e.g., text +and image. Hence many researchers have developed automatic techniques for +detecting possible cross-modal discordance in web-based content. We analyze, +categorize and identify existing approaches in addition to challenges and +shortcomings they face in order to unearth new research opportunities in the +field of multi-modal misinformation detection. + +
+
+
+
+
+ + ♻ ☆ HybridNeRF: Efficient Neural Rendering via Adaptive Volumetric Surfaces CVPR 2024 + + +
+ Neural radiance fields provide state-of-the-art view synthesis quality but +tend to be slow to render. One reason is that they make use of volume +rendering, thus requiring many samples (and model queries) per ray at render +time. Although this representation is flexible and easy to optimize, most +real-world objects can be modeled more efficiently with surfaces instead of +volumes, requiring far fewer samples per ray. This observation has spurred +considerable progress in surface representations such as signed distance +functions, but these may struggle to model semi-opaque and thin structures. We +propose a method, HybridNeRF, that leverages the strengths of both +representations by rendering most objects as surfaces while modeling the +(typically) small fraction of challenging regions volumetrically. We evaluate +HybridNeRF against the challenging Eyeful Tower dataset along with other +commonly used view synthesis datasets. When comparing to state-of-the-art +baselines, including recent rasterization-based approaches, we improve error +rates by 15-30% while achieving real-time framerates (at least 36 FPS) for +virtual-reality resolutions (2Kx2K). + +
+
+ comment: CVPR 2024 Project page: https://haithemturki.com/hybrid-nerf/ +
+
+
+
+
+ + ♻ ☆ L2B: Learning to Bootstrap Robust Models for Combating Label Noise CVPR 2024 + + +
+ Deep neural networks have shown great success in representation learning. +However, when learning with noisy labels (LNL), they can easily overfit and +fail to generalize to new data. This paper introduces a simple and effective +method, named Learning to Bootstrap (L2B), which enables models to bootstrap +themselves using their own predictions without being adversely affected by +erroneous pseudo-labels. It achieves this by dynamically adjusting the +importance weight between real observed and generated labels, as well as +between different samples through meta-learning. Unlike existing instance +reweighting methods, the key to our method lies in a new, versatile objective +that enables implicit relabeling concurrently, leading to significant +improvements without incurring additional costs. + L2B offers several benefits over the baseline methods. It yields more robust +models that are less susceptible to the impact of noisy labels by guiding the +bootstrapping procedure more effectively. It better exploits the valuable +information contained in corrupted instances by adapting the weights of both +instances and labels. Furthermore, L2B is compatible with existing LNL methods +and delivers competitive results spanning natural and medical imaging tasks +including classification and segmentation under both synthetic and real-world +noise. Extensive experiments demonstrate that our method effectively mitigates +the challenges of noisy labels, often necessitating few to no validation +samples, and is well generalized to other tasks such as image segmentation. +This not only positions it as a robust complement to existing LNL techniques +but also underscores its practical applicability. The code and models are +available at https://github.com/yuyinzhou/l2b. + +
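The general shape of the objective described above (interpolating supervision between the observed label and the model's own pseudo-label with per-sample weights) can be sketched as below. The meta-learning loop that adapts the weights on a small clean set is omitted here, and the fixed weights shown are placeholders.

```python
import torch
import torch.nn.functional as F

def bootstrap_style_loss(logits, noisy_labels, alpha, beta):
    """Interpolate supervision between the observed (possibly noisy) label and
    the model's own pseudo-label. alpha/beta are per-sample weights; in L2B they
    are adapted by meta-learning on a small clean set (not shown here)."""
    pseudo = logits.argmax(dim=1).detach()
    loss_obs = F.cross_entropy(logits, noisy_labels, reduction="none")
    loss_pseudo = F.cross_entropy(logits, pseudo, reduction="none")
    return (alpha * loss_obs + beta * loss_pseudo).mean()

logits = torch.randn(32, 10, requires_grad=True)
noisy_labels = torch.randint(0, 10, (32,))
alpha = torch.full((32,), 0.7)   # placeholder for meta-learned weights
beta = torch.full((32,), 0.3)
bootstrap_style_loss(logits, noisy_labels, alpha, beta).backward()
```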
+
+ comment: CVPR 2024; code is available at https://github.com/yuyinzhou/l2b +
+
+
+
+
+ + ♻ ☆ Visual Acuity Prediction on Real-Life Patient Data Using a Machine + Learning Based Multistage System + + +
+ In ophthalmology, intravitreal operative medication therapy (IVOM) is a +widespread treatment for diseases related to age-related macular degeneration +(AMD), diabetic macular edema (DME), and retinal vein occlusion (RVO). However, +in real-world settings, patients often suffer from loss of vision on time +scales of years despite therapy, whereas the prediction of the visual acuity +(VA) and the earliest possible detection of deterioration under real-life +conditions are challenging due to heterogeneous and incomplete data. In this +contribution, we present a workflow for the development of a +research-compatible data corpus fusing different IT systems of the department +of ophthalmology of a German maximum care hospital. The extensive data corpus +allows predictive statements about the expected progression of a patient and +his or her VA in each of the three diseases. For the disease AMD, we found a +significant deterioration of the visual acuity over time. Within our proposed +multistage system, we subsequently classify the VA progression into the three +groups of therapy "winners", "stabilizers", and "losers" (WSL classification +scheme). Our OCT biomarker classification using an ensemble of deep neural +networks results in a classification accuracy (F1-score) of over 98 %, enabling +us to complete incomplete OCT documentations while allowing us to exploit them +for a more precise VA modelling process. Our VA prediction requires at least +four VA examinations and optionally OCT biomarkers from the same time period to +predict the VA progression within a forecasted time frame, whereas our +prediction is currently restricted to IVOM / no therapy. We achieve a final +prediction accuracy of 69 % in macro average F1-score, which is in the same +range as the ophthalmologists, who achieve F1-scores of 57.8 % and +50 ± 10.7 %.
+
+ comment: Preprint for journal Scientific Reports (Springer) +
+
+
+
+
+ + ♻ ☆ LUWA Dataset: Learning Lithic Use-Wear Analysis on Microscopic Images CVPR + + +
+ Lithic Use-Wear Analysis (LUWA) using microscopic images is an underexplored +vision-for-science research area. It seeks to distinguish the worked material, +which is critical for understanding archaeological artifacts, material +interactions, tool functionalities, and dental records. However, this +challenging task goes beyond the well-studied image classification problem for +common objects. It is affected by many confounders owing to the complex wear +mechanism and microscopic imaging, which makes it difficult even for human +experts to identify the worked material successfully. In this paper, we +investigate the following three questions on this unique vision task for the +first time:(i) How well can state-of-the-art pre-trained models (like DINOv2) +generalize to the rarely seen domain? (ii) How can few-shot learning be +exploited for scarce microscopic images? (iii) How do the ambiguous +magnification and sensing modality influence the classification accuracy? To +study these, we collaborated with archaeologists and built the first +open-source and the largest LUWA dataset containing 23,130 microscopic images +with different magnifications and sensing modalities. Extensive experiments +show that existing pre-trained models notably outperform human experts but +still leave a large gap for improvements. Most importantly, the LUWA dataset +provides an underexplored opportunity for vision and learning communities and +complements existing image classification problems on common objects. + +
+
+ comment: CVPR +
+
+
+
+
+ + ♻ ☆ SuPerPM: A Large Deformation-Robust Surgical Perception Framework Based + on Deep Point Matching Learned from Physical Constrained Simulation Data + + +
+ Manipulation of tissue with surgical tools often results in large +deformations that current tracking and reconstruction algorithms have not +effectively addressed. A major source of tracking errors during large +deformations stems from incorrect data association between observed sensor +measurements and the previously tracked scene. To mitigate this issue, we +present a surgical perception framework, SuPerPM, that leverages learning-based +non-rigid point cloud matching for data association, thus accommodating larger +deformations. Such learning models typically require training data with ground +truth point cloud correspondences, which is challenging or even impractical to +collect in surgical environments. Thus, for tuning the learning model, we +gather endoscopic data of soft tissue being manipulated by a surgical robot and +then establish correspondences between point clouds at different time points to +serve as ground truth. This was achieved by employing a position-based dynamics +(PBD) simulation to ensure that the correspondences adhered to physical +constraints. The proposed framework is demonstrated on several challenging +surgical datasets that are characterized by large deformations, achieving +superior performance over state-of-the-art surgical scene tracking algorithms.
+
+
+
+
+ + ♻ ☆ What's in a Prior? Learned Proximal Networks for Inverse Problems + + +
+ Proximal operators are ubiquitous in inverse problems, commonly appearing as +part of algorithmic strategies to regularize problems that are otherwise +ill-posed. Modern deep learning models have been brought to bear for these +tasks too, as in the framework of plug-and-play or deep unrolling, where they +loosely resemble proximal operators. Yet, something essential is lost in +employing these purely data-driven approaches: there is no guarantee that a +general deep network represents the proximal operator of any function, nor is +there any characterization of the function for which the network might provide +some approximate proximal. This not only makes guaranteeing convergence of +iterative schemes challenging but, more fundamentally, complicates the analysis +of what has been learned by these networks about their training data. Herein we +provide a framework to develop learned proximal networks (LPN), prove that they +provide exact proximal operators for a data-driven nonconvex regularizer, and +show how a new training strategy, dubbed proximal matching, provably promotes +the recovery of the log-prior of the true data distribution. Such LPN provide +general, unsupervised, expressive proximal operators that can be used for +general inverse problems with convergence guarantees. We illustrate our results +in a series of cases of increasing complexity, demonstrating that these models +not only result in state-of-the-art performance, but provide a window into the +resulting priors learned from data. + +
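For reference, the proximal operator that LPN is trained to represent exactly, for a data-driven and possibly nonconvex regularizer f, takes the standard form below; the notation may differ from the paper's.

```latex
% Standard proximal operator of a (possibly nonconvex) regularizer f:
\operatorname{prox}_{f}(y) \;=\; \arg\min_{x}\; \tfrac{1}{2}\,\lVert x - y \rVert_2^2 \;+\; f(x).
```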
+
+
+
+
+ + ♻ ☆ COVID-19 detection from pulmonary CT scans using a novel EfficientNet + with attention mechanism + + +
+ Manual analysis and diagnosis of COVID-19 through the examination of Computed +Tomography (CT) images of the lungs can be time-consuming and result in errors, +especially given the high volume of patients and numerous images per patient. +We therefore address the need to automate this task by developing a new deep +learning model-based pipeline. Our motivation was sparked by the CVPR Workshop +on "Domain Adaptation, Explainability and Fairness in AI for Medical Image +Analysis", more specifically, the "COVID-19 Diagnosis Competition (DEF-AI-MIA +COV19D)" under the same Workshop. This challenge provides an opportunity to +assess our proposed pipeline for COVID-19 detection from CT scan images. The +pipeline incorporates the original EfficientNet, but with an added Attention +Mechanism: EfficientNet-AM. Unlike traditional pipelines, which relied on a +pre-processing step, our pipeline takes the raw selected input images without +any such step, except for an image-selection step that simply reduces the +number of CT images required for training and/or testing. Moreover, our +pipeline is computationally efficient, as, for example, it does not incorporate +a decoder for segmenting the lungs. It also neither combines different +backbones nor combines an RNN with a backbone, as other pipelines did in the +past. Nevertheless, our pipeline still outperforms all approaches presented by +other teams in last year's instance of the same challenge, at least based on +the validation subset of the competition dataset.
+
+
+
+
+ + ♻ ☆ Multi-camera calibration with pattern rigs, including for + non-overlapping cameras: CALICO + + +
+ This paper describes CALICO, a method for multi-camera calibration suitable +for challenging contexts: stationary and mobile multi-camera systems, cameras +without overlapping fields of view, and non-synchronized cameras. Recent +approaches are roughly divided into infrastructure- and pattern-based. +Infrastructure-based approaches use the scene's features to calibrate, while +pattern-based approaches use calibration patterns. Infrastructure-based +approaches are not suitable for stationary camera systems, and pattern-based +approaches may constrain camera placement because shared fields of view or +extremely large patterns are required. + CALICO is a pattern-based approach, where the multi-calibration problem is +formulated using rigidity constraints between patterns and cameras. We use a +{\it pattern rig}: several patterns rigidly attached to each other or some +structure. We express the calibration problem as that of algebraic and +reprojection error minimization problems. Simulated and real experiments +demonstrate the method in a variety of settings. CALICO compared favorably to +Kalibr. Mean reconstruction accuracy error was $\le 0.71$ mm for real camera +rigs, and $\le 1.11$ for simulated camera rigs. Code and data releases are +available at \cite{tabb_amy_2019_3520866} and +\url{https://github.com/amy-tabb/calico}. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ From Correspondences to Pose: Non-minimal Certifiably Optimal Relative + Pose without Disambiguation CVPR 2024 + + +
+ Estimating the relative camera pose from $n \geq 5$ correspondences between +two calibrated views is a fundamental task in computer vision. This process +typically involves two stages: 1) estimating the essential matrix between the +views, and 2) disambiguating among the four candidate relative poses that +satisfy the epipolar geometry. In this paper, we demonstrate a novel approach +that, for the first time, bypasses the second stage. Specifically, we show that +it is possible to directly estimate the correct relative camera pose from +correspondences without needing a post-processing step to enforce the +cheirality constraint on the correspondences. Building on recent advances in +certifiable non-minimal optimization, we frame the relative pose estimation as +a Quadratically Constrained Quadratic Program (QCQP). By applying the +appropriate constraints, we ensure the estimation of a camera pose that +corresponds to a valid 3D geometry and that is globally optimal when certified. +We validate our method through exhaustive synthetic and real-world experiments, +confirming the efficacy, efficiency and accuracy of the proposed approach. Code +is available at https://github.com/javrtg/C2P. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Reasoning over the Behaviour of Objects in Video-Clips for Adverb-Type + Recognition + + +
+ In this work, following the intuition that adverbs describing scene-sequences +are best identified by reasoning over high-level concepts of object-behaviour, +we propose the design of a new framework that reasons over object-behaviours +extracted from raw video clips to recognize the clip's corresponding +adverb-types. Importantly, while previous works for general scene +adverb-recognition assume knowledge of the clip's underlying action-types, our +method is directly applicable in the more general problem setting where the +action-type of a video-clip is unknown. Specifically, we propose a novel +pipeline that extracts human-interpretable object-behaviour-facts from raw +video clips and propose novel symbolic and transformer-based reasoning methods +that operate over these extracted facts to identify adverb-types. Experimental +results demonstrate that our proposed methods perform favourably against the +previous state-of-the-art. Additionally, to support efforts in symbolic +video-processing, we release two new datasets of object-behaviour-facts +extracted from raw video clips - the MSR-VTT-ASP and ActivityNet-ASP datasets.
+
+
+
+
+ + ♻ ☆ SHViT: Single-Head Vision Transformer with Memory Efficient Macro Design CVPR 2024 + + +
+ Recently, efficient Vision Transformers have shown great performance with low +latency on resource-constrained devices. Conventionally, they use 4x4 patch +embeddings and a 4-stage structure at the macro level, while utilizing +sophisticated attention with multi-head configuration at the micro level. This +paper aims to address computational redundancy at all design levels in a +memory-efficient manner. We discover that using larger-stride patchify stem not +only reduces memory access costs but also achieves competitive performance by +leveraging token representations with reduced spatial redundancy from the early +stages. Furthermore, our preliminary analyses suggest that attention layers in +the early stages can be substituted with convolutions, and several attention +heads in the latter stages are computationally redundant. To handle this, we +introduce a single-head attention module that inherently prevents head +redundancy and simultaneously boosts accuracy by parallelly combining global +and local information. Building upon our solutions, we introduce SHViT, a +Single-Head Vision Transformer that obtains the state-of-the-art speed-accuracy +tradeoff. For example, on ImageNet-1k, our SHViT-S4 is 3.3x, 8.1x, and 2.4x +faster than MobileViTv2 x1.0 on GPU, CPU, and iPhone12 mobile device, +respectively, while being 1.3% more accurate. For object detection and instance +segmentation on MS COCO using Mask-RCNN head, our model achieves performance +comparable to FastViT-SA12 while exhibiting 3.8x and 2.0x lower backbone +latency on GPU and mobile device, respectively. + +
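One plausible reading of the single-head module described above (global attention on a channel subset, local depthwise convolution on the rest, fused by a projection) is sketched below. The channel-split ratio, key dimension, and fusion details are assumptions of this sketch, not the released SHViT design.

```python
import torch
import torch.nn as nn

class SingleHeadPartialAttention(nn.Module):
    """Sketch: single-head global attention on a channel subset, depthwise conv
    (local information) on the remaining channels, then a 1x1 fusion projection."""

    def __init__(self, dim=64, attn_ratio=0.25, qk_dim=16):
        super().__init__()
        self.attn_dim = int(dim * attn_ratio)
        self.qk_dim = qk_dim
        self.qkv = nn.Conv2d(self.attn_dim, qk_dim * 2 + self.attn_dim, 1)
        self.local = nn.Conv2d(dim - self.attn_dim, dim - self.attn_dim, 3,
                               padding=1, groups=dim - self.attn_dim)
        self.proj = nn.Conv2d(dim, dim, 1)

    def forward(self, x):                                   # x: (B, C, H, W)
        xa, xl = torch.split(x, [self.attn_dim, x.shape[1] - self.attn_dim], dim=1)
        B, _, H, W = xa.shape
        q, k, v = torch.split(self.qkv(xa), [self.qk_dim, self.qk_dim, self.attn_dim], dim=1)
        q, k, v = (t.flatten(2) for t in (q, k, v))          # (B, c, H*W)
        attn = torch.softmax(q.transpose(1, 2) @ k / self.qk_dim ** 0.5, dim=-1)
        xa = (v @ attn.transpose(1, 2)).reshape(B, self.attn_dim, H, W)
        return self.proj(torch.cat([xa, self.local(xl)], dim=1))

block = SingleHeadPartialAttention()
print(block(torch.randn(2, 64, 14, 14)).shape)   # torch.Size([2, 64, 14, 14])
```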
+
+ comment: CVPR 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 227 + +
+
+
+ + ☆ Efficient Video Object Segmentation via Modulated Cross-Attention Memory + + +
+ Recently, transformer-based approaches have shown promising results for +semi-supervised video object segmentation. However, these approaches typically +struggle on long videos due to increased GPU memory demands, as they frequently +expand the memory bank every few frames. We propose a transformer-based +approach, named MAVOS, that introduces an optimized and dynamic long-term +modulated cross-attention (MCA) memory to model temporal smoothness without +requiring frequent memory expansion. The proposed MCA effectively encodes both +local and global features at various levels of granularity while efficiently +maintaining consistent speed regardless of the video length. Extensive +experiments on multiple benchmarks, LVOS, Long-Time Video, and DAVIS 2017, +demonstrate the effectiveness of our proposed contributions leading to +real-time inference and markedly reduced memory demands without any degradation +in segmentation accuracy on long videos. Compared to the best existing +transformer-based approach, our MAVOS increases the speed by 7.6x, while +significantly reducing the GPU memory by 87% with comparable segmentation +performance on short and long video datasets. Notably on the LVOS dataset, our +MAVOS achieves a J&F score of 63.3% while operating at 37 frames per second +(FPS) on a single V100 GPU. Our code and models will be publicly available at: +https://github.com/Amshaker/MAVOS. + +
+
+
+
+
+ + ☆ ConvoFusion: Multi-Modal Conversational Diffusion for Co-Speech Gesture + Synthesis CVPR 2024 + + +
+ Gestures play a key role in human communication. Recent methods for co-speech +gesture generation, while managing to generate beat-aligned motions, struggle +to generate gestures that are semantically aligned with the utterance. Compared +to beat gestures that align naturally to the audio signal, semantically +coherent gestures require modeling the complex interactions between the +language and human motion, and can be controlled by focusing on certain words. +Therefore, we present ConvoFusion, a diffusion-based approach for multi-modal +gesture synthesis, which can not only generate gestures based on multi-modal +speech inputs, but can also facilitate controllability in gesture synthesis. +Our method proposes two guidance objectives that allow the users to modulate +the impact of different conditioning modalities (e.g. audio vs text) as well as +to choose certain words to be emphasized during gesturing. Our method is +versatile in that it can be trained for generating either monologue gestures or +conversational gestures. To further advance the research on multi-party +interactive gestures, the DnD Group Gesture dataset is released, which contains +6 hours of gesture data showing 5 people interacting with one another. We +compare our method with several recent works and demonstrate the effectiveness +of our method on a variety of tasks. We urge the reader to watch our +supplementary video at our website.
+
+ comment: CVPR 2024. Project Page: + https://vcai.mpi-inf.mpg.de/projects/ConvoFusion/ +
+
+
+
+
+ + ☆ OmniVid: A Generative Framework for Universal Video Understanding CVPR 2024 + + +
+ The core of video understanding tasks, such as recognition, captioning, and +tracking, is to automatically detect objects or actions in a video and analyze +their temporal evolution. Despite sharing a common goal, different tasks often +rely on distinct model architectures and annotation formats. In contrast, +natural language processing benefits from a unified output space, i.e., text +sequences, which simplifies the training of powerful foundational language +models, such as GPT-3, with extensive training corpora. Inspired by this, we +seek to unify the output space of video understanding tasks by using languages +as labels and additionally introducing time and box tokens. In this way, a +variety of video tasks could be formulated as video-grounded token generation. +This enables us to address various types of video tasks, including +classification (such as action recognition), captioning (covering clip +captioning, video question answering, and dense video captioning), and +localization tasks (such as visual object tracking) within a fully shared +encoder-decoder architecture, following a generative framework. Through +comprehensive experiments, we demonstrate such a simple and straightforward +idea is quite effective and can achieve state-of-the-art or competitive results +on seven video benchmarks, providing a novel perspective for more universal +video understanding. Code is available at https://github.com/wangjk666/OmniVid. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ AiOS: All-in-One-Stage Expressive Human Pose and Shape Estimation + + +
+ Expressive human pose and shape estimation (a.k.a. 3D whole-body mesh +recovery) involves human body, hand, and expression estimation. Most existing +methods have tackled this task in a two-stage manner, first detecting the human +body with an off-the-shelf detection model and then inferring the different +human body parts individually. Despite the impressive results achieved, these +methods suffer from 1) loss of valuable contextual information via cropping, +2) introducing distractions, and 3) lacking inter-association among different +persons and body parts, inevitably causing performance degradation, especially +for crowded scenes. To address these issues, we introduce a novel +all-in-one-stage framework, AiOS, for multiple expressive human pose and shape +recovery without an additional human detection step. Specifically, our method +is built upon DETR, which treats the multi-person whole-body mesh recovery task +as a progressive set prediction problem with various sequential detections. We +devise the decoder tokens and extend them to our task. Concretely, we first +employ a human token to probe a human location in the image and encode global +features for each instance, which provides a coarse location for the later +transformer block. Then, we introduce a joint-related token to probe the human +joint in the image and encode a fine-grained local feature, which collaborates +with the global feature to regress the whole-body mesh. This straightforward +but effective model outperforms previous state-of-the-art methods by a 9% +reduction in NMVE on AGORA, a 30% reduction in PVE on EHF, a 10% reduction in +PVE on ARCTIC, and a 3% reduction in PVE on EgoBody.
+
+ comment: Homepage: https://ttxskk.github.io/AiOS/ +
+
+
+
+
+ + ☆ SLEDGE: Synthesizing Simulation Environments for Driving Agents with + Generative Models + + +
+ SLEDGE is the first generative simulator for vehicle motion planning trained +on real-world driving logs. Its core component is a learned model that is able +to generate agent bounding boxes and lane graphs. The model's outputs serve as +an initial state for traffic simulation. The unique properties of the entities +to be generated for SLEDGE, such as their connectivity and variable count per +scene, render the naive application of most modern generative models to this +task non-trivial. Therefore, together with a systematic study of existing lane +graph representations, we introduce a novel raster-to-vector autoencoder +(RVAE). It encodes agents and the lane graph into distinct channels in a +rasterized latent map. This facilitates both lane-conditioned agent generation +and combined generation of lanes and agents with a Diffusion Transformer. Using +generated entities in SLEDGE enables greater control over the simulation, e.g. +upsampling turns or increasing traffic density. Further, SLEDGE can support +500m long routes, a capability not found in existing data-driven simulators +like nuPlan. It presents new challenges for planning algorithms, evidenced by +failure rates of over 40% for PDM, the winner of the 2023 nuPlan challenge, +when tested on hard routes and dense traffic generated by our model. Compared +to nuPlan, SLEDGE requires 500$\times$ less storage to set up (<4GB), making it +a more accessible option and helping with democratizing future research in this +field. + +
+
+
+
+
+ + ☆ Track Everything Everywhere Fast and Robustly + + +
+ We propose a novel test-time optimization approach for efficiently and +robustly tracking any pixel at any time in a video. The latest state-of-the-art +optimization-based tracking technique, OmniMotion, requires a prohibitively +long optimization time, rendering it impractical for downstream applications. +OmniMotion is sensitive to the choice of random seeds, leading to unstable +convergence. To improve efficiency and robustness, we introduce a novel +invertible deformation network, CaDeX++, which factorizes the function +representation into a local spatial-temporal feature grid and enhances the +expressivity of the coupling blocks with non-linear functions. While CaDeX++ +incorporates a stronger geometric bias within its architectural design, it also +takes advantage of the inductive bias provided by the vision foundation models. +Our system utilizes monocular depth estimation to represent scene geometry and +enhances the objective by incorporating DINOv2 long-term semantics to regulate +the optimization process. Our experiments demonstrate a substantial improvement +in training speed (more than \textbf{10 times} faster), robustness, and +accuracy in tracking over the SoTA optimization-based method OmniMotion. + +
+
+ comment: project page: https://timsong412.github.io/FastOmniTrack/ +
+
+
+
+
+ + ☆ Towards Explaining Hypercomplex Neural Networks + + +
+ Hypercomplex neural networks are gaining increasing interest in the deep +learning community. The attention directed towards hypercomplex models +originates from several aspects, spanning from purely theoretical and +mathematical characteristics to the practical advantage of lightweight models +over conventional networks, and their unique ability to capture both global +and local relations. In particular, a branch of these architectures, +parameterized hypercomplex neural networks (PHNNs), has also gained popularity +due to their versatility across a multitude of application domains. +Nonetheless, only a few attempts have been made to explain or interpret their +intricacies. In this paper, we propose inherently interpretable PHNNs and +quaternion-like networks, thus removing the need for any post-hoc method. To +achieve this, we define a type of cosine-similarity transform within the +parameterized hypercomplex domain. This PHB-cos transform induces weight +alignment with relevant input features and allows the model to be reduced to a +single linear transform, rendering it directly interpretable. In this work, we +start to draw insights into how this unique branch of neural models operates. +We observe that hypercomplex networks exhibit a tendency to concentrate on the +shape around the main object of interest, in addition to the shape of the +object itself. We provide a thorough analysis, studying single neurons of +different layers and comparing them against how real-valued networks learn. The +code of the paper is available at https://github.com/ispamm/HxAI.
+
+ comment: The paper has been accepted at IEEE WCCI 2024 +
+
+
+
+
+ + ☆ FastCAR: Fast Classification And Regression Multi-Task Learning via Task + Consolidation for Modelling a Continuous Property Variable of Object Classes + + +
+ FastCAR is a novel task consolidation approach in Multi-Task Learning (MTL) +for a classification and a regression task, despite task heterogeneity with +only subtle correlation. It addresses object classification and continuous +property variable regression, a crucial use case in science and engineering. +FastCAR involves a labeling transformation approach that can be used with a +single-task regression network architecture. FastCAR outperforms traditional +MTL model families, parametrized in the landscape of architecture and loss +weighting schemes, when the learning of both tasks is collectively considered +(classification accuracy of 99.54%, regression mean absolute percentage error +of 2.3%). The experiments used an Advanced Steel Property dataset contributed +by us. The dataset comprises 4536 images of 224x224 pixels, annotated with +object classes and hardness properties that take continuous values. With the +labeling transformation and single-task regression network architecture, +FastCAR achieves reduced latency and improved time efficiency.
+
+
+
+
+ + ☆ AID: Attention Interpolation of Text-to-Image Diffusion + + +
+ Conditional diffusion models can create unseen images in various settings, +aiding image interpolation. Interpolation in latent spaces is well-studied, but +interpolation with specific conditions like text or poses is less understood. +Simple approaches, such as linear interpolation in the space of conditions, +often result in images that lack consistency, smoothness, and fidelity. To that +end, we introduce a novel training-free technique named Attention Interpolation +via Diffusion (AID). Our key contributions include 1) proposing an inner/outer +interpolated attention layer; 2) fusing the interpolated attention with +self-attention to boost fidelity; and 3) applying beta distribution to +selection to increase smoothness. We also present a variant, Prompt-guided +Attention Interpolation via Diffusion (PAID), that considers interpolation as a +condition-dependent generative process. This method enables the creation of new +images with greater consistency, smoothness, and efficiency, and offers control +over the exact path of interpolation. Our approach demonstrates effectiveness +for conceptual and spatial interpolation. Code and demo are available at +https://github.com/QY-H00/attention-interpolation-diffusion. + +
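A toy sketch of what interpolating attention between two text conditions could look like is given below; the "inner"/"outer" naming, the shared key/value tensor, and the beta-distributed coefficients follow the abstract only loosely and are assumptions rather than the paper's exact layers.

```python
import torch

def cross_attn(q, kv):
    """Plain single-head cross-attention with a shared key/value tensor
    (q: (N, d), kv: (M, d)) -- a simplification for illustration."""
    attn = torch.softmax(q @ kv.T / q.shape[-1] ** 0.5, dim=-1)
    return attn @ kv

def outer_interpolated_attention(q, cond_a, cond_b, t):
    """'Outer' variant: attend to each condition separately, then blend outputs."""
    return (1 - t) * cross_attn(q, cond_a) + t * cross_attn(q, cond_b)

def inner_interpolated_attention(q, cond_a, cond_b, t):
    """'Inner' variant: attend jointly, so tokens of A and B compete in one softmax."""
    return cross_attn(q, torch.cat([(1 - t) * cond_a, t * cond_b], dim=0))

# Toy usage: 64 image tokens attending to two 77-token text conditions.
q = torch.randn(64, 320)
cond_a, cond_b = torch.randn(77, 320), torch.randn(77, 320)
ts = torch.distributions.Beta(2.0, 2.0).sample((5,))   # beta-distributed coefficients
outs = [outer_interpolated_attention(q, cond_a, cond_b, float(t)) for t in ts]
print(outs[0].shape)   # torch.Size([64, 320])
```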
+
+
+
+
+ + ☆ TC4D: Trajectory-Conditioned Text-to-4D Generation + + +
+ Recent techniques for text-to-4D generation synthesize dynamic 3D scenes +using supervision from pre-trained text-to-video models. However, existing +representations for motion, such as deformation models or time-dependent neural +representations, are limited in the amount of motion they can generate-they +cannot synthesize motion extending far beyond the bounding box used for volume +rendering. The lack of a more flexible motion model contributes to the gap in +realism between 4D generation methods and recent, near-photorealistic video +generation models. Here, we propose TC4D: trajectory-conditioned text-to-4D +generation, which factors motion into global and local components. We represent +the global motion of a scene's bounding box using rigid transformation along a +trajectory parameterized by a spline. We learn local deformations that conform +to the global trajectory using supervision from a text-to-video model. Our +approach enables the synthesis of scenes animated along arbitrary trajectories, +compositional scene generation, and significant improvements to the realism and +amount of generated motion, which we evaluate qualitatively and through a user +study. Video results can be viewed on our website: +https://sherwinbahmani.github.io/tc4d. + +
+
+ comment: Project Page: https://sherwinbahmani.github.io/tc4d +
+
+
+
+
+ + ☆ CMP: Cooperative Motion Prediction with Multi-Agent Communication + + +
+ The confluence of the advancement of Autonomous Vehicles (AVs) and the +maturity of Vehicle-to-Everything (V2X) communication has enabled the +capability of cooperative connected and automated vehicles (CAVs). Building on +top of cooperative perception, this paper explores the feasibility and +effectiveness of cooperative motion prediction. Our method, CMP, takes LiDAR +signals as input to enhance tracking and prediction capabilities. Unlike +previous work that focuses separately on either cooperative perception or +motion prediction, our framework, to the best of our knowledge, is the first to +address the unified problem where CAVs share information in both perception and +prediction modules. Incorporated into our design is the unique capability to +tolerate realistic V2X bandwidth limitations and transmission delays, while +dealing with bulky perception representations. We also propose a prediction +aggregation module, which unifies the predictions obtained by different CAVs +and generates the final prediction. Through extensive experiments and ablation +studies, we demonstrate the effectiveness of our method in cooperative +perception, tracking, and motion prediction tasks. In particular, CMP reduces +the average prediction error by 17.2\% with fewer missing detections compared +with the no cooperation setting. Our work marks a significant step forward in +the cooperative capabilities of CAVs, showcasing enhanced performance in +complex scenarios. + +
+
+
+
+
+ + ☆ Leveraging Near-Field Lighting for Monocular Depth Estimation from + Endoscopy Videos + + +
+ Monocular depth estimation in endoscopy videos can enable assistive and +robotic surgery to obtain better coverage of the organ and detection of various +health issues. Despite promising progress on mainstream, natural image depth +estimation, techniques perform poorly on endoscopy images due to a lack of +strong geometric features and challenging illumination effects. In this paper, +we utilize the photometric cues, i.e., the light emitted from an endoscope and +reflected by the surface, to improve monocular depth estimation. We first +create two novel loss functions with supervised and self-supervised variants +that utilize a per-pixel shading representation. We then propose a novel depth +refinement network (PPSNet) that leverages the same per-pixel shading +representation. Finally, we introduce teacher-student transfer learning to +produce better depth maps from both synthetic data with supervision and +clinical data with self-supervision. We achieve state-of-the-art results on the +C3VD dataset while estimating high-quality depth maps from clinical data. Our +code, pre-trained models, and supplementary materials can be found on our +project page: https://ppsnet.github.io/ + +
+
+ comment: 26 pages, 7 tables, 7 figures +
+
+
+
+
+ + ☆ ELGC-Net: Efficient Local-Global Context Aggregation for Remote Sensing + Change Detection + + +
+ Deep learning has shown remarkable success in remote sensing change detection +(CD), aiming to identify semantic change regions between co-registered +satellite image pairs acquired at distinct time stamps. However, existing +convolutional neural network and transformer-based frameworks often struggle to +accurately segment semantic change regions. Moreover, transformer-based methods +with standard self-attention suffer from quadratic computational complexity +with respect to the image resolution, making them less practical for CD tasks +with limited training data. To address these issues, we propose an efficient +change detection framework, ELGC-Net, which leverages rich contextual +information to precisely estimate change regions while reducing the model size. +Our ELGC-Net comprises a Siamese encoder, fusion modules, and a decoder. The +focus of our design is the introduction of an Efficient Local-Global Context +Aggregator module within the encoder, capturing enhanced global context and +local spatial information through a novel pooled-transpose (PT) attention and +depthwise convolution, respectively. The PT attention employs pooling +operations for robust feature extraction and minimizes computational cost with +transposed attention. Extensive experiments on three challenging CD datasets +demonstrate that ELGC-Net outperforms existing methods. Compared to the recent +transformer-based CD approach (ChangeFormer), ELGC-Net achieves a 1.4% gain in +the intersection-over-union metric on the LEVIR-CD dataset, while significantly +reducing trainable parameters. Our proposed ELGC-Net sets new state-of-the-art +performance on remote sensing change detection benchmarks. Finally, we also +introduce ELGC-Net-LW, a lighter variant with significantly reduced +computational complexity, suitable for resource-constrained settings, while +achieving comparable performance. Project URL: +https://github.com/techmn/elgcnet.
+
+ comment: accepted at IEEE TGRS +
+
+
+
+
+ + ☆ Scalable Non-Cartesian Magnetic Resonance Imaging with R2D2 + + +
+ We propose a new approach for non-Cartesian magnetic resonance image +reconstruction. While unrolled architectures provide robustness via +data-consistency layers, embedding measurement operators in a Deep Neural +Network (DNN) can become impractical at large scale. Alternative Plug-and-Play +(PnP) approaches, where the denoising DNNs are blind to the measurement +setting, are not affected by this limitation and have also proven effective, +but their highly iterative nature also affects scalability. To address this +scalability challenge, we leverage the "Residual-to-Residual DNN series for +high-Dynamic range imaging (R2D2)" approach recently introduced in astronomical +imaging. R2D2's reconstruction is formed as a series of residual images, +iteratively estimated as outputs of DNNs taking the previous iteration's image +estimate and associated data residual as inputs. The method can be interpreted +as a learned version of the Matching Pursuit algorithm. We demonstrate R2D2 in +simulation, considering radial k-space sampling acquisition sequences. Our +preliminary results suggest that R2D2 achieves: (i) suboptimal performance +compared to its unrolled incarnation R2D2-Net, which is however non-scalable +due to the necessary embedding of NUFFT-based data-consistency layers; (ii) +superior reconstruction quality to a scalable version of R2D2-Net embedding an +FFT-based approximation for data consistency; (iii) superior reconstruction +quality to PnP, while requiring only a few iterations.
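The series-of-residuals iteration described above has a simple generic form: each network receives the current image estimate together with the back-projected data residual and outputs a correction. The sketch below uses identity operators and tiny CNNs purely as placeholders for the NUFFT measurement operator and the trained R2D2 networks.

```python
import torch
import torch.nn as nn

def residual_series_reconstruction(y, forward_op, adjoint_op, dnns):
    """Generic 'series of residuals' reconstruction: each network maps the
    current image estimate and the back-projected data residual to a residual
    image that is added to the estimate. Operators and networks are placeholders."""
    x = adjoint_op(y)                                   # initial (dirty) image estimate
    for net in dnns:
        data_residual = adjoint_op(y - forward_op(x))   # back-projected data residual
        x = x + net(torch.cat([x, data_residual], dim=1))
    return x

# Toy setup: identity "measurement" operator and tiny CNNs, purely illustrative.
forward_op = adjoint_op = lambda z: z
dnns = [nn.Conv2d(2, 1, 3, padding=1) for _ in range(3)]
y = torch.randn(1, 1, 64, 64)                           # stand-in for gridded k-space data
print(residual_series_reconstruction(y, forward_op, adjoint_op, dnns).shape)
```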
+
+ comment: submitted to IEEE EUSIPCO 2024 +
+
+
+
+
+ + ☆ Serpent: Scalable and Efficient Image Restoration via Multi-scale + Structured State Space Models + + +
+ The landscape of computational building blocks of efficient image restoration +architectures is dominated by a combination of convolutional processing and +various attention mechanisms. However, convolutional filters are inherently +local and therefore struggle at modeling long-range dependencies in images. On +the other hand, attention excels at capturing global interactions between +arbitrary image regions, however at a quadratic cost in image dimension. In +this work, we propose Serpent, an architecture that leverages recent advances +in state space models (SSMs) in its core computational block. SSMs, originally +introduced for sequence modeling, can maintain a global receptive field with a +favorable linear scaling in input size. Our preliminary results demonstrate +that Serpent can achieve reconstruction quality on par with state-of-the-art +techniques, while requiring orders of magnitude less compute (up to $150$ fold +reduction in FLOPS) and a factor of up to $5\times$ less GPU memory while +maintaining a compact model size. + +
+
+ comment: 7 pages, 5 figures, preliminary workshop submission of a + comprehensive work to be released soon +
+
+
+
+
+ + ☆ Octree-GS: Towards Consistent Real-time Rendering with LOD-Structured 3D + Gaussians + + +
+ The recent 3D Gaussian splatting (3D-GS) has shown remarkable rendering +fidelity and efficiency compared to NeRF-based neural scene representations. +While demonstrating the potential for real-time rendering, 3D-GS encounters +rendering bottlenecks in large scenes with complex details due to an excessive +number of Gaussian primitives located within the viewing frustum. This +limitation is particularly noticeable in zoom-out views and can lead to +inconsistent rendering speeds in scenes with varying details. Moreover, it +often struggles to capture the corresponding level of details at different +scales with its heuristic density control operation. Inspired by the +Level-of-Detail (LOD) techniques, we introduce Octree-GS, featuring an +LOD-structured 3D Gaussian approach supporting level-of-detail decomposition +for scene representation that contributes to the final rendering results. Our +model dynamically selects the appropriate level from the set of +multi-resolution anchor points, ensuring consistent rendering performance with +adaptive LOD adjustments while maintaining high-fidelity rendering results. + +
+
+ comment: Project page: https://city-super.github.io/octree-gs/ +
+
+
+
+
+ + ☆ A Survey on 3D Egocentric Human Pose Estimation + + +
+ Egocentric human pose estimation aims to estimate human body poses and +develop body representations from a first-person camera perspective. It has +gained vast popularity in recent years because of its wide range of +applications in sectors like XR-technologies, human-computer interaction, and +fitness tracking. However, to the best of our knowledge, there is no systematic +literature review based on the proposed solutions regarding egocentric 3D human +pose estimation. To that end, the aim of this survey paper is to provide an +extensive overview of the current state of egocentric pose estimation research. +In this paper, we categorize and discuss the popular datasets and the different +pose estimation models, highlighting the strengths and weaknesses of different +methods by comparative analysis. This survey can be a valuable resource for +both researchers and practitioners in the field, offering insights into key +concepts and cutting-edge solutions in egocentric pose estimation, its +wide-ranging applications, as well as the open problems with future scope. + +
+
+
+
+
+ + ☆ 2D Gaussian Splatting for Geometrically Accurate Radiance Fields + + +
+ 3D Gaussian Splatting (3DGS) has recently revolutionized radiance field +reconstruction, achieving high quality novel view synthesis and fast rendering +speed without baking. However, 3DGS fails to accurately represent surfaces due +to the multi-view inconsistent nature of 3D Gaussians. We present 2D Gaussian +Splatting (2DGS), a novel approach to model and reconstruct geometrically +accurate radiance fields from multi-view images. Our key idea is to collapse +the 3D volume into a set of 2D oriented planar Gaussian disks. Unlike 3D +Gaussians, 2D Gaussians provide view-consistent geometry while modeling +surfaces intrinsically. To accurately recover thin surfaces and achieve stable +optimization, we introduce a perspective-accurate 2D splatting process +utilizing ray-splat intersection and rasterization. Additionally, we +incorporate depth distortion and normal consistency terms to further enhance +the quality of the reconstructions. We demonstrate that our differentiable +renderer allows for noise-free and detailed geometry reconstruction while +maintaining competitive appearance quality, fast training speed, and real-time +rendering. Our code will be made publicly available. + +
+
+ comment: 12 pages, 12 figures +
+
+
+
+
+ + ☆ Sen2Fire: A Challenging Benchmark Dataset for Wildfire Detection using + Sentinel Data + + +
+ Utilizing satellite imagery for wildfire detection presents substantial +potential for practical applications. To advance the development of machine +learning algorithms in this domain, our study introduces the \textit{Sen2Fire} +dataset--a challenging satellite remote sensing dataset tailored for wildfire +detection. This dataset is curated from Sentinel-2 multi-spectral data and +Sentinel-5P aerosol product, comprising a total of 2466 image patches. Each +patch has a size of 512$\times$512 pixels with 13 bands. Given the distinctive +sensitivities of various wavebands to wildfire responses, our research focuses +on optimizing wildfire detection by evaluating different wavebands and +employing a combination of spectral indices, such as normalized burn ratio +(NBR) and normalized difference vegetation index (NDVI). The results suggest +that, in contrast to using all bands for wildfire detection, selecting specific +band combinations yields superior performance. Additionally, our study +underscores the positive impact of integrating Sentinel-5 aerosol data for +wildfire detection. The code and dataset are available online +(https://zenodo.org/records/10881058). + +
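The spectral indices named above follow their standard definitions, NDVI = (NIR - Red) / (NIR + Red) and NBR = (NIR - SWIR) / (NIR + SWIR). The sketch below computes them for a Sentinel-2-style patch; the band indexing is an assumption for illustration only, so check the dataset's actual band ordering before use.

```python
import numpy as np

def ndvi(nir, red, eps=1e-6):
    """Normalized Difference Vegetation Index: (NIR - Red) / (NIR + Red)."""
    return (nir - red) / (nir + red + eps)

def nbr(nir, swir, eps=1e-6):
    """Normalized Burn Ratio: (NIR - SWIR) / (NIR + SWIR)."""
    return (nir - swir) / (nir + swir + eps)

# Toy usage on a fake 512x512 patch stored as (bands, H, W). The assumed
# band order here is a placeholder; for Sentinel-2, B8 is NIR, B4 is red,
# and B12 is SWIR2.
patch = np.random.rand(13, 512, 512).astype(np.float32)
red, nir, swir2 = patch[3], patch[7], patch[11]
features = np.stack([ndvi(nir, red), nbr(nir, swir2)], axis=0)
print(features.shape)  # (2, 512, 512) extra channels for the detector
```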
+
+
+
+
+ + ☆ Superior and Pragmatic Talking Face Generation with Teacher-Student + Framework + + +
+ Talking face generation technology creates talking videos from arbitrary +appearance and motion signal, with the "arbitrary" offering ease of use but +also introducing challenges in practical applications. Existing methods work +well with standard inputs but suffer serious performance degradation with +intricate real-world ones. Moreover, efficiency is also an important concern in +deployment. To comprehensively address these issues, we introduce SuperFace, a +teacher-student framework that balances quality, robustness, cost and +editability. We first propose a simple but effective teacher model capable of +handling inputs of varying qualities to generate high-quality results. Building +on this, we devise an efficient distillation strategy to acquire an +identity-specific student model that maintains quality with significantly +reduced computational load. Our experiments validate that SuperFace offers a +more comprehensive solution than existing methods for the four mentioned +objectives, especially in reducing FLOPs by 99\% with the student model. +SuperFace can be driven by both video and audio and allows for localized facial +attributes editing. + +
+
+
+
+
+ + ☆ Deepfake Generation and Detection: A Benchmark and Survey + + +
+ In addition to the advancements in deepfake generation, corresponding +detection technologies need to continuously evolve to regulate the potential +misuse of deepfakes, such as for privacy invasion and phishing attacks. This +survey comprehensively reviews the latest developments in deepfake generation +and detection, summarizing and analyzing the current state of the art in this +rapidly evolving field. We first unify task definitions, comprehensively +introduce datasets and metrics, and discuss the development of generation and +detection technology frameworks. Then, we discuss the development of several +related sub-fields and focus on researching four mainstream deepfake fields: +popular face swap, face reenactment, talking face generation, and facial +attribute editing, as well as forgery detection. Subsequently, we +comprehensively benchmark representative methods on popular datasets for each +field, fully evaluating the latest and influential works published in top +conferences/journals. Finally, we analyze the challenges and future research +directions of the discussed fields. We closely follow the latest developments +in https://github.com/flyingby/Awesome-Deepfake-Generation-and-Detection. +
+
+
+
+
+ + ☆ Low-Latency Neural Stereo Streaming CVPR2024 + + +
+ The rise of new video modalities like virtual reality or autonomous driving +has increased the demand for efficient multi-view video compression methods, +both in terms of rate-distortion (R-D) performance and in terms of delay and +runtime. While most recent stereo video compression approaches have shown +promising performance, they compress left and right views sequentially, leading +to poor parallelization and runtime performance. This work presents Low-Latency +neural codec for Stereo video Streaming (LLSS), a novel parallel stereo video +coding method designed for fast and efficient low-latency stereo video +streaming. Instead of using a sequential cross-view motion compensation like +existing methods, LLSS introduces a bidirectional feature shifting module to +directly exploit mutual information among views and encode them effectively +with a joint cross-view prior model for entropy coding. Thanks to this design, +LLSS processes left and right views in parallel, minimizing latency; all while +substantially improving R-D performance compared to both existing neural and +conventional codecs. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Boosting Diffusion Models with Moving Average Sampling in Frequency + Domain CVPR 2024 + + +
+ Diffusion models have recently brought a powerful revolution in image +generation. Despite showing impressive generative capabilities, most of these +models rely on the current sample to denoise the next one, possibly resulting +in denoising instability. In this paper, we reinterpret the iterative denoising +process as model optimization and leverage a moving average mechanism to +ensemble all the prior samples. Instead of simply applying moving average to +the denoised samples at different timesteps, we first map the denoised samples +to data space and then perform moving average to avoid distribution shift +across timesteps. In view that diffusion models evolve the recovery from +low-frequency components to high-frequency details, we further decompose the +samples into different frequency components and execute moving average +separately on each component. We name the complete approach "Moving Average +Sampling in Frequency domain (MASF)". MASF could be seamlessly integrated into +mainstream pre-trained diffusion models and sampling schedules. Extensive +experiments on both unconditional and conditional diffusion models demonstrate +that our MASF leads to superior performances compared to the baselines, with +almost negligible additional complexity cost. + +
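A minimal sketch of the idea described above, assuming a simple radial FFT mask for the low/high-frequency split and a fixed EMA decay; the actual MASF decomposition, per-band weighting, and integration with the sampling schedule are not reproduced here.

```python
import torch

def split_frequency(x0, cutoff=0.25):
    """Split an image batch into low- and high-frequency parts via an FFT mask."""
    f = torch.fft.fftshift(torch.fft.fft2(x0), dim=(-2, -1))
    h, w = x0.shape[-2:]
    yy, xx = torch.meshgrid(
        torch.linspace(-1, 1, h), torch.linspace(-1, 1, w), indexing="ij")
    low_mask = ((yy ** 2 + xx ** 2).sqrt() <= cutoff).float()
    low = torch.fft.ifft2(torch.fft.ifftshift(f * low_mask, dim=(-2, -1))).real
    return low, x0 - low

class FrequencyEMA:
    """Keeps separate running averages for the low- and high-frequency bands."""
    def __init__(self, decay=0.9):
        self.decay, self.low, self.high = decay, None, None

    def update(self, x0_pred):
        low, high = split_frequency(x0_pred)
        if self.low is None:
            self.low, self.high = low, high
        else:
            self.low = self.decay * self.low + (1 - self.decay) * low
            self.high = self.decay * self.high + (1 - self.decay) * high
        return self.low + self.high  # averaged estimate fed back to the sampler

# Toy usage across a few fake denoising steps.
ema = FrequencyEMA()
for _ in range(5):
    x0_pred = torch.randn(1, 3, 64, 64)   # stand-in for the model's x0 estimate
    x0_avg = ema.update(x0_pred)
print(x0_avg.shape)
```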
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ To Supervise or Not to Supervise: Understanding and Addressing the Key + Challenges of 3D Transfer Learning + + +
+ Transfer learning has long been a key factor in the advancement of many +fields including 2D image analysis. Unfortunately, its applicability in 3D data +processing has been relatively limited. While several approaches for 3D +transfer learning have been proposed in recent literature, with contrastive +learning gaining particular prominence, most existing methods in this domain +have only been studied and evaluated in limited scenarios. Most importantly, +there is currently a lack of principled understanding of both when and why 3D +transfer learning methods are applicable. Remarkably, even the applicability of +standard supervised pre-training is poorly understood. In this work, we conduct +the first in-depth quantitative and qualitative investigation of supervised and +contrastive pre-training strategies and their utility in downstream 3D tasks. +We demonstrate that layer-wise analysis of learned features provides +significant insight into the downstream utility of trained networks. Informed +by this analysis, we propose a simple geometric regularization strategy, which +improves the transferability of supervised pre-training. Our work thus sheds +light onto both the specific challenges of 3D transfer learning, as well as +strategies to overcome them. + +
+
+
+
+
+ + ☆ Hierarchical Open-Vocabulary 3D Scene Graphs for Language-Grounded Robot + Navigation + + +
+ Recent open-vocabulary robot mapping methods enrich dense geometric maps with +pre-trained visual-language features. While these maps allow for the prediction +of point-wise saliency maps when queried for a certain language concept, +large-scale environments and abstract queries beyond the object level still +pose a considerable hurdle, ultimately limiting language-grounded robotic +navigation. In this work, we present HOV-SG, a hierarchical open-vocabulary 3D +scene graph mapping approach for language-grounded robot navigation. Leveraging +open-vocabulary vision foundation models, we first obtain state-of-the-art +open-vocabulary segment-level maps in 3D and subsequently construct a 3D scene +graph hierarchy consisting of floor, room, and object concepts, each enriched +with open-vocabulary features. Our approach is able to represent multi-story +buildings and allows robotic traversal of them using a cross-floor Voronoi +graph. HOV-SG is evaluated on three distinct datasets and surpasses previous +baselines in open-vocabulary semantic accuracy on the object, room, and floor +level while producing a 75% reduction in representation size compared to dense +open-vocabulary maps. In order to prove the efficacy and generalization +capabilities of HOV-SG, we showcase successful long-horizon +language-conditioned robot navigation within real-world multi-story +environments. We provide code and trial video data at http://hovsg.github.io/. +
+
+ comment: Code and video are available at http://hovsg.github.io/ +
+
+
+
+
+ + ☆ ReMamber: Referring Image Segmentation with Mamba Twister + + +
+ Referring Image Segmentation (RIS) leveraging transformers has achieved great +success on the interpretation of complex visual-language tasks. However, the +quadratic computation cost makes it resource-consuming in capturing long-range +visual-language dependencies. Fortunately, Mamba addresses this with efficient +linear complexity in processing. However, directly applying Mamba to +multi-modal interactions presents challenges, primarily due to inadequate +channel interactions for the effective fusion of multi-modal data. In this +paper, we propose ReMamber, a novel RIS architecture that integrates the power +of Mamba with a multi-modal Mamba Twister block. The Mamba Twister explicitly +models image-text interaction, and fuses textual and visual features through +its unique channel and spatial twisting mechanism. We achieve the +state-of-the-art on three challenging benchmarks. Moreover, we conduct thorough +analyses of ReMamber and discuss other fusion designs using Mamba. These +provide valuable perspectives for future research. + +
+
+
+
+
+ + ☆ GTA-HDR: A Large-Scale Synthetic Dataset for HDR Image Reconstruction + + +
+ High Dynamic Range (HDR) content (i.e., images and videos) has a broad range +of applications. However, capturing HDR content from real-world scenes is +expensive and time-consuming. Therefore, the challenging task of +reconstructing visually accurate HDR images from their Low Dynamic Range (LDR) +counterparts is gaining attention in the vision research community. A major +challenge in this research problem is the lack of datasets that capture +diverse scene conditions (e.g., lighting, shadows, weather, locations, +landscapes, objects, humans, buildings) and various image features (e.g., +color, contrast, saturation, hue, luminance, brightness, radiance). To address +this gap, in this paper, we introduce GTA-HDR, a large-scale synthetic dataset +of photo-realistic HDR images sampled from the GTA-V video game. We perform +thorough evaluation of the proposed dataset, which demonstrates significant +qualitative and quantitative improvements of the state-of-the-art HDR image +reconstruction methods. Furthermore, we demonstrate the effectiveness of the +proposed dataset and its impact on additional computer vision tasks including +3D human pose estimation, human body part segmentation, and holistic scene +segmentation. The dataset, data collection pipeline, and evaluation code are +available at: https://github.com/HrishavBakulBarua/GTA-HDR. +
+
+ comment: Submitted to IEEE +
+
+
+
+
+ + ☆ A foundation model utilizing chest CT volumes and radiology reports for + supervised-level zero-shot detection of abnormalities + + +
+ A major challenge in computational research in 3D medical imaging is the lack +of comprehensive datasets. Addressing this issue, our study introduces CT-RATE, +the first 3D medical imaging dataset that pairs images with textual reports. +CT-RATE consists of 25,692 non-contrast chest CT volumes, expanded to 50,188 +through various reconstructions, from 21,304 unique patients, along with +corresponding radiology text reports. Leveraging CT-RATE, we developed CT-CLIP, +a CT-focused contrastive language-image pre-training framework. As a versatile, +self-supervised model, CT-CLIP is designed for broad application and does not +require task-specific training. Remarkably, CT-CLIP outperforms +state-of-the-art, fully supervised methods in multi-abnormality detection +across all key metrics, thus eliminating the need for manual annotation. We +also demonstrate its utility in case retrieval, whether using imagery or +textual queries, thereby advancing knowledge dissemination. The open-source +release of CT-RATE and CT-CLIP marks a significant advancement in medical AI, +enhancing 3D imaging analysis and fostering innovation in healthcare. + +
+
+
+
+
+ + ☆ Assessment of Multimodal Large Language Models in Alignment with Human + Values + + +
+ Large Language Models (LLMs) aim to serve as versatile assistants aligned +with human values, as defined by the principles of being helpful, honest, and +harmless (hhh). However, in terms of Multimodal Large Language Models (MLLMs), +despite their commendable performance in perception and reasoning tasks, their +alignment with human values remains largely unexplored, given the complexity of +defining hhh dimensions in the visual world and the difficulty in collecting +relevant data that accurately mirrors real-world situations. To address this +gap, we introduce Ch3Ef, a Compreh3ensive Evaluation dataset and strategy for +assessing alignment with human expectations. Ch3Ef dataset contains 1002 +human-annotated data samples, covering 12 domains and 46 tasks based on the hhh +principle. We also present a unified evaluation strategy supporting assessment +across various scenarios and different perspectives. Based on the evaluation +results, we summarize over 10 key findings that deepen the understanding of +MLLM capabilities, limitations, and the dynamic relationships between +evaluation levels, guiding future advancements in the field. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2311.02692 +
+
+
+
+
+ + ☆ DiffH2O: Diffusion-Based Synthesis of Hand-Object Interactions from + Textual Descriptions + + +
+ Generating natural hand-object interactions in 3D is challenging as the +resulting hand and object motions are expected to be physically plausible and +semantically meaningful. Furthermore, generalization to unseen objects is +hindered by the limited scale of available hand-object interaction datasets. We +propose DiffH2O, a novel method to synthesize realistic, one or two-handed +object interactions from provided text prompts and geometry of the object. The +method introduces three techniques that enable effective learning from limited +data. First, we decompose the task into a grasping stage and a text-based +interaction stage and use separate diffusion models for each. In the grasping +stage, the model only generates hand motions, whereas in the interaction phase +both hand and object poses are synthesized. Second, we propose a compact +representation that tightly couples hand and object poses. Third, we propose +two different guidance schemes to allow more control of the generated motions: +grasp guidance and detailed textual guidance. Grasp guidance takes a single +target grasping pose and guides the diffusion model to reach this grasp at the +end of the grasping stage, which provides control over the grasping pose. Given +a grasping motion from this stage, multiple different actions can be prompted +in the interaction phase. For textual guidance, we contribute comprehensive +text descriptions to the GRAB dataset and show that they enable our method to +have more fine-grained control over hand-object interactions. Our quantitative +and qualitative evaluation demonstrates that the proposed method outperforms +baseline methods and leads to natural hand-object motions. Moreover, we +demonstrate the practicality of our framework by utilizing a hand pose estimate +from an off-the-shelf pose estimator for guidance, and then sampling multiple +different actions in the interaction stage. + +
+
+ comment: Project Page: https://diffh2o.github.io/ +
+
+
+
+
+ + ☆ Efficient Image Pre-Training with Siamese Cropped Masked Autoencoders + + +
+ Self-supervised pre-training of image encoders is omnipresent in the +literature, particularly following the introduction of Masked autoencoders +(MAE). Current efforts attempt to learn object-centric representations from +motion in videos. In particular, SiamMAE recently introduced a Siamese network, +training a shared-weight encoder from two frames of a video with a high +asymmetric masking ratio (95%). In this work, we propose CropMAE, an +alternative approach to the Siamese pre-training introduced by SiamMAE. Our +method specifically differs by exclusively considering pairs of cropped images +sourced from the same image but cropped differently, deviating from the +conventional pairs of frames extracted from a video. CropMAE therefore +alleviates the need for video datasets, while maintaining competitive +performances and drastically reducing pre-training time. Furthermore, we +demonstrate that CropMAE learns similar object-centric representations without +explicit motion, showing that current self-supervised learning methods do not +learn objects from motion, but rather thanks to the Siamese architecture. +Finally, CropMAE achieves the highest masking ratio to date (98.5%), enabling +the reconstruction of images using only two visible patches. Our code is +available at https://github.com/alexandre-eymael/CropMAE. + +
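A small sketch of the data side of such a pipeline, assuming torchvision-style crops and a 14x14 ViT patch grid; the encoder, decoder, and reconstruction loss are omitted, and the cropping and masking parameters are placeholders rather than the authors' settings.

```python
import torch
from torchvision import transforms
from PIL import Image

crop = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.2, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
])

def make_pair(img):
    """Two differently-cropped views of the same still image (no video needed)."""
    return crop(img), crop(img)

def random_patch_mask(num_patches=196, keep_ratio=0.015, device="cpu"):
    """Boolean mask over ViT patches; True = masked (hidden from the encoder).

    A very high masking ratio leaves only a handful of visible patches.
    """
    num_keep = max(1, int(round(num_patches * keep_ratio)))
    perm = torch.randperm(num_patches, device=device)
    mask = torch.ones(num_patches, dtype=torch.bool, device=device)
    mask[perm[:num_keep]] = False
    return mask

# Toy usage with a synthetic image.
img = Image.new("RGB", (256, 256), color=(120, 80, 40))
view_a, view_b = make_pair(img)
mask = random_patch_mask()
print(view_a.shape, view_b.shape, int((~mask).sum()), "visible patches")
```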
+
+ comment: 19 pages, 6 figures, 3 tables, 1 page of supplementary material +
+
+
+
+
+ + ☆ DN-Splatter: Depth and Normal Priors for Gaussian Splatting and Meshing + + +
+ 3D Gaussian splatting, a novel differentiable rendering technique, has +achieved state-of-the-art novel view synthesis results with high rendering +speeds and relatively low training times. However, its performance on scenes +commonly seen in indoor datasets is poor due to the lack of geometric +constraints during optimization. We extend 3D Gaussian splatting with depth and +normal cues to tackle challenging indoor datasets and showcase techniques for +efficient mesh extraction, an important downstream application. Specifically, +we regularize the optimization procedure with depth information, enforce local +smoothness of nearby Gaussians, and use the geometry of the 3D Gaussians +supervised by normal cues to achieve better alignment with the true scene +geometry. We improve depth estimation and novel view synthesis results over +baselines and show how this simple yet effective regularization technique can +be used to directly extract meshes from the Gaussian representation yielding +more physically accurate reconstructions on indoor scenes. Our code will be +released in https://github.com/maturk/dn-splatter. + +
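The regularizers mentioned above can be illustrated with generic depth, normal-consistency, and local-smoothness terms, as in the sketch below; the exact loss forms and weights here are assumptions for illustration, not the paper's formulation.

```python
import torch
import torch.nn.functional as F

def depth_loss(rendered_depth, sensor_depth, valid):
    """L1 between rendered depth and a (possibly sparse) depth prior."""
    return (rendered_depth - sensor_depth).abs()[valid].mean()

def normal_consistency_loss(rendered_normal, prior_normal):
    """1 - cosine similarity between rendered normals and a normal prior."""
    return (1.0 - F.cosine_similarity(rendered_normal, prior_normal, dim=1)).mean()

def local_smoothness_loss(depth):
    """Total-variation style penalty encouraging nearby pixels to agree."""
    dx = (depth[..., :, 1:] - depth[..., :, :-1]).abs().mean()
    dy = (depth[..., 1:, :] - depth[..., :-1, :]).abs().mean()
    return dx + dy

# Toy usage on fake renders; the 0.1 and 0.05 weights are placeholders.
d_hat, d_gt = torch.rand(1, 1, 64, 64), torch.rand(1, 1, 64, 64)
n_hat = F.normalize(torch.randn(1, 3, 64, 64), dim=1)
n_gt = F.normalize(torch.randn(1, 3, 64, 64), dim=1)
valid = d_gt > 0.05
loss = (depth_loss(d_hat, d_gt, valid)
        + 0.1 * normal_consistency_loss(n_hat, n_gt)
        + 0.05 * local_smoothness_loss(d_hat))
print(float(loss))
```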
+
+
+
+
+ + ☆ Annotated Biomedical Video Generation using Denoising Diffusion + Probabilistic Models and Flow Fields + + +
+ The segmentation and tracking of living cells play a vital role within the +biomedical domain, particularly in cancer research, drug development, and +developmental biology. These are usually tedious and time-consuming tasks that +are traditionally done by biomedical experts. Recently, to automatize these +processes, deep learning based segmentation and tracking methods have been +proposed. These methods require large-scale datasets and their full potential +is constrained by the scarcity of annotated data in the biomedical imaging +domain. To address this limitation, we propose Biomedical Video Diffusion Model +(BVDM), capable of generating realistic-looking synthetic microscopy videos. +Trained only on a single real video, BVDM can generate videos of arbitrary +length with pixel-level annotations that can be used for training data-hungry +models. It is composed of a denoising diffusion probabilistic model (DDPM) +generating high-fidelity synthetic cell microscopy images and a flow prediction +model (FPM) predicting the non-rigid transformation between consecutive video +frames. During inference, initially, the DDPM imposes realistic cell textures +on synthetic cell masks which are generated based on real data statistics. The +flow prediction model predicts the flow field between consecutive masks and +applies that to the DDPM output from the previous time frame to create the next +one while keeping temporal consistency. BVDM outperforms state-of-the-art +synthetic live cell microscopy video generation models. Furthermore, we +demonstrate that a sufficiently large synthetic dataset enhances the +performance of cell segmentation and tracking models compared to using a +limited amount of available real data. + +
+
+
+
+
+ + ☆ Improving Text-to-Image Consistency via Automatic Prompt Optimization + + +
+ Impressive advances in text-to-image (T2I) generative models have yielded a +plethora of high performing models which are able to generate aesthetically +appealing, photorealistic images. Despite the progress, these models still +struggle to produce images that are consistent with the input prompt, +oftentimes failing to capture object quantities, relations and attributes +properly. Existing solutions to improve prompt-image consistency suffer from +the following challenges: (1) they oftentimes require model fine-tuning, (2) +they only focus on nearby prompt samples, and (3) they are affected by +unfavorable trade-offs among image quality, representation diversity, and +prompt-image consistency. In this paper, we address these challenges and +introduce a T2I optimization-by-prompting framework, OPT2I, which leverages a +large language model (LLM) to improve prompt-image consistency in T2I models. +Our framework starts from a user prompt and iteratively generates revised +prompts with the goal of maximizing a consistency score. Our extensive +validation on two datasets, MSCOCO and PartiPrompts, shows that OPT2I can boost +the initial consistency score by up to 24.9% in terms of DSG score while +preserving the FID and increasing the recall between generated and real data. +Our work paves the way toward building more reliable and robust T2I systems by +harnessing the power of LLMs. + +
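The optimization-by-prompting loop can be sketched generically as below. Here `generate_image`, `consistency_score`, and `revise_prompts` are placeholder callables standing in for the T2I model, a prompt-image consistency metric (e.g. a DSG-style scorer), and the LLM; the loop structure is illustrative rather than the paper's implementation.

```python
from typing import Callable, List, Tuple

def optimize_prompt(
    user_prompt: str,
    generate_image: Callable[[str], object],            # T2I model (placeholder)
    consistency_score: Callable[[object, str], float],  # consistency metric (placeholder)
    revise_prompts: Callable[[str, List[Tuple[str, float]]], List[str]],  # LLM (placeholder)
    iterations: int = 5,
    candidates_per_iter: int = 4,
) -> Tuple[str, float]:
    """Iteratively ask an LLM for revised prompts and keep the best-scoring one.

    Consistency is always measured against the original user prompt, so the
    revised prompts only change how the image is requested, not what is asked for.
    """
    history: List[Tuple[str, float]] = []
    best_prompt = user_prompt
    best_score = consistency_score(generate_image(user_prompt), user_prompt)
    history.append((best_prompt, best_score))

    for _ in range(iterations):
        for candidate in revise_prompts(user_prompt, history)[:candidates_per_iter]:
            score = consistency_score(generate_image(candidate), user_prompt)
            history.append((candidate, score))
            if score > best_score:
                best_prompt, best_score = candidate, score
    return best_prompt, best_score

# Toy usage with dummy components (word-overlap stands in for a real scorer).
best, score = optimize_prompt(
    "a red cube on a blue ball",
    generate_image=lambda p: p,
    consistency_score=lambda img, p: len(set(img.split()) & set(p.split())) / len(set(p.split())),
    revise_prompts=lambda p, hist: [p + ", photorealistic", p + ", centered"],
)
print(best, round(score, 2))
```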
+
+
+
+
+ + ☆ Towards 3D Vision with Low-Cost Single-Photon Cameras + + +
+ We present a method for reconstructing 3D shape of arbitrary Lambertian +objects based on measurements by miniature, energy-efficient, low-cost +single-photon cameras. These cameras, operating as time resolved image sensors, +illuminate the scene with a very fast pulse of diffuse light and record the +shape of that pulse as it returns back from the scene at a high temporal +resolution. We propose to model this image formation process, account for its +non-idealities, and adapt neural rendering to reconstruct 3D geometry from a +set of spatially distributed sensors with known poses. We show that our +approach can successfully recover complex 3D shapes from simulated data. We +further demonstrate 3D object reconstruction from real-world captures, +utilizing measurements from a commodity proximity sensor. Our work draws a +connection between image-based modeling and active range scanning and is a step +towards 3D vision with single-photon cameras. + +
+
+
+
+
+ + ☆ Evaluating the Efficacy of Prompt-Engineered Large Multimodal Models + Versus Fine-Tuned Vision Transformers in Image-Based Security Applications + + +
+ The success of Large Language Models (LLMs) has led to a parallel rise in the +development of Large Multimodal Models (LMMs), such as Gemini-pro, which have +begun to transform a variety of applications. These sophisticated multimodal +models are designed to interpret and analyze complex data, integrating both +textual and visual information on a scale previously unattainable, opening new +avenues for a range of applications. This paper investigates the applicability +and effectiveness of prompt-engineered Gemini-pro LMMs versus fine-tuned Vision +Transformer (ViT) models in addressing critical security challenges. We focus +on two distinct tasks: a visually evident task of detecting simple triggers, +such as small squares in images, indicative of potential backdoors, and a +non-visually evident task of malware classification through visual +representations. Our results highlight a significant divergence in performance, +with Gemini-pro falling short in accuracy and reliability when compared to +fine-tuned ViT models. The ViT models, on the other hand, demonstrate +exceptional accuracy, achieving near-perfect performance on both tasks. This +study not only showcases the strengths and limitations of prompt-engineered +LMMs in cybersecurity applications but also emphasizes the unmatched efficacy +of fine-tuned ViT models for precise and dependable tasks. + +
+
+
+
+
+ + ☆ GenesisTex: Adapting Image Denoising Diffusion to Texture Space + + +
+ We present GenesisTex, a novel method for synthesizing textures for 3D +geometries from text descriptions. GenesisTex adapts the pretrained image +diffusion model to texture space by texture space sampling. Specifically, we +maintain a latent texture map for each viewpoint, which is updated with +predicted noise on the rendering of the corresponding viewpoint. The sampled +latent texture maps are then decoded into a final texture map. During the +sampling process, we focus on both global and local consistency across multiple +viewpoints: global consistency is achieved through the integration of style +consistency mechanisms within the noise prediction network, and low-level +consistency is achieved by dynamically aligning latent textures. Finally, we +apply reference-based inpainting and img2img on denser views for texture +refinement. Our approach overcomes the limitations of slow optimization in +distillation-based methods and instability in inpainting-based methods. +Experiments on meshes from various sources demonstrate that our method +surpasses the baseline methods quantitatively and qualitatively. + +
+
+ comment: 12 pages, 10 figures +
+
+
+
+
+ + ☆ CT Synthesis with Conditional Diffusion Models for Abdominal Lymph Node + Segmentation + + +
+ Despite the significant success achieved by deep learning methods in medical +image segmentation, researchers still struggle in the computer-aided diagnosis +of abdominal lymph nodes due to the complex abdominal environment, small and +indistinguishable lesions, and limited annotated data. To address these +problems, we present a pipeline that integrates the conditional diffusion model +for lymph node generation and the nnU-Net model for lymph node segmentation to +improve the segmentation performance of abdominal lymph nodes through +synthesizing a diversity of realistic abdominal lymph node data. We propose +LN-DDPM, a conditional denoising diffusion probabilistic model (DDPM) for lymph +node (LN) generation. LN-DDPM utilizes lymph node masks and anatomical +structure masks as model conditions. These conditions work in two conditioning +mechanisms: global structure conditioning and local detail conditioning, to +distinguish between lymph nodes and their surroundings and better capture lymph +node characteristics. The obtained paired abdominal lymph node images and masks +are used for the downstream segmentation task. Experimental results on the +abdominal lymph node datasets demonstrate that LN-DDPM outperforms other +generative methods in the abdominal lymph node image synthesis and better +assists the downstream abdominal lymph node segmentation task. + +
+
+
+
+
+ + ☆ MUTE-SLAM: Real-Time Neural SLAM with Multiple Tri-Plane Hash + Representations + + +
+ We introduce MUTE-SLAM, a real-time neural RGB-D SLAM system employing +multiple tri-plane hash-encodings for efficient scene representation. MUTE-SLAM +effectively tracks camera positions and incrementally builds a scalable +multi-map representation for both small and large indoor environments. It +dynamically allocates sub-maps for newly observed local regions, enabling +constraint-free mapping without prior scene information. Unlike traditional +grid-based methods, we use three orthogonal axis-aligned planes for +hash-encoding scene properties, significantly reducing hash collisions and the +number of trainable parameters. This hybrid approach not only speeds up +convergence but also enhances the fidelity of surface reconstruction. +Furthermore, our optimization strategy concurrently optimizes all sub-maps +intersecting with the current camera frustum, ensuring global consistency. +Extensive testing on both real-world and synthetic datasets has shown that +MUTE-SLAM delivers state-of-the-art surface reconstruction quality and +competitive tracking performance across diverse indoor settings. The code will +be made public upon acceptance of the paper. + +
+
+
+
+
+ + ☆ Makeup Prior Models for 3D Facial Makeup Estimation and Applications CVPR2024 + + +
+ In this work, we introduce two types of makeup prior models to extend +existing 3D face prior models: PCA-based and StyleGAN2-based priors. The +PCA-based prior model is a linear model that is easy to construct and is +computationally efficient. However, it retains only low-frequency information. +Conversely, the StyleGAN2-based model can represent high-frequency information +with relatively higher computational cost than the PCA-based model. Although +there is a trade-off between the two models, both are applicable to 3D facial +makeup estimation and related applications. By leveraging makeup prior models +and designing a makeup consistency module, we effectively address the +challenges that previous methods faced in robustly estimating makeup, +particularly in the context of handling self-occluded faces. In experiments, we +demonstrate that our approach reduces computational costs by several orders of +magnitude, achieving speeds up to 180 times faster. In addition, by improving +the accuracy of the estimated makeup, we confirm that our methods are highly +advantageous for various 3D facial makeup applications such as 3D makeup face +reconstruction, user-friendly makeup editing, makeup transfer, and +interpolation. + +
+
+ comment: CVPR2024. Project: https://yangxingchao.github.io/makeup-priors-page +
+
+
+
+
+ + ☆ Noise2Noise Denoising of CRISM Hyperspectral Data ICLR 2024 + + +
+ Hyperspectral data acquired by the Compact Reconnaissance Imaging +Spectrometer for Mars (CRISM) have allowed for unparalleled mapping of the +surface mineralogy of Mars. Due to sensor degradation over time, a significant +portion of the recently acquired data is considered unusable. Here a new +data-driven model architecture, Noise2Noise4Mars (N2N4M), is introduced to +remove noise from CRISM images. Our model is self-supervised and does not +require zero-noise target data, making it well suited for use in Planetary +Science applications where high quality labelled data is scarce. We demonstrate +its strong performance on synthetic-noise data and CRISM images, and its impact +on downstream classification performance, outperforming benchmark methods on +most metrics. This allows for detailed analysis for critical sites of interest +on the Martian surface, including proposed lander sites. + +
+
+ comment: 5 pages, 3 figures. Accepted as a conference paper at the ICLR 2024 + ML4RS Workshop +
+
+
+
+
+ + ☆ DataCook: Crafting Anti-Adversarial Examples for Healthcare Data + Copyright Protection + + +
+ In the realm of healthcare, the challenges of copyright protection and +unauthorized third-party misuse are increasingly significant. Traditional +methods for data copyright protection are applied prior to data distribution, +implying that models trained on these data become uncontrollable. This paper +introduces a novel approach, named DataCook, designed to safeguard the +copyright of healthcare data during the deployment phase. DataCook operates by +"cooking" the raw data before distribution, enabling the development of models +that perform normally on this processed data. However, during the deployment +phase, the original test data must also be "cooked" through DataCook to ensure +normal model performance. This process grants copyright holders control over +authorization during the deployment phase. The mechanism behind DataCook is the +crafting of anti-adversarial examples (AntiAdv), which are designed to enhance +model confidence, as opposed to standard adversarial examples (Adv) that aim to +confuse models. Similar to Adv, AntiAdv introduces imperceptible perturbations, +ensuring that the data processed by DataCook remains easily understandable. We +conducted extensive experiments on MedMNIST datasets, encompassing both 2D/3D +data and the high-resolution variants. The outcomes indicate that DataCook +effectively meets its objectives, preventing models trained on AntiAdv from +analyzing unauthorized data effectively, without compromising the validity and +accuracy of the data in legitimate scenarios. Code and data are available at +https://github.com/MedMNIST/DataCook. +
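One way to read "anti-adversarial examples" is as a PGD-style loop that descends rather than ascends the loss, so the perturbation increases model confidence while staying imperceptible. The sketch below implements that reading with placeholder step sizes and a toy classifier; it is not the authors' released code.

```python
import torch
import torch.nn.functional as F

def anti_adversarial(model, x, y, epsilon=2 / 255, steps=5, alpha=0.5 / 255):
    """Craft an imperceptible perturbation that *increases* model confidence.

    Standard PGD with the update sign flipped: we descend the loss
    (anti-adversarial) instead of ascending it, keeping the perturbation
    inside an L-infinity ball of radius epsilon.
    """
    x_cook = x.clone().detach()
    for _ in range(steps):
        x_cook.requires_grad_(True)
        loss = F.cross_entropy(model(x_cook), y)
        grad = torch.autograd.grad(loss, x_cook)[0]
        with torch.no_grad():
            x_cook = x_cook - alpha * grad.sign()          # descend, not ascend
            x_cook = x + (x_cook - x).clamp(-epsilon, epsilon)
            x_cook = x_cook.clamp(0, 1)
    return x_cook.detach()

# Toy usage with a tiny linear classifier.
model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 28 * 28, 10))
x = torch.rand(4, 3, 28, 28)
y = torch.randint(0, 10, (4,))
x_cooked = anti_adversarial(model, x, y)
print((x_cooked - x).abs().max().item())  # stays within epsilon
```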
+
+
+
+
+ + ☆ Multi-Task Dense Prediction via Mixture of Low-Rank Experts CVPR 2024 + + +
+ Previous multi-task dense prediction methods based on the Mixture of Experts +(MoE) have achieved strong performance, but they neglect the importance of +explicitly modeling the global relations among all tasks. In this paper, we +present a novel decoder-focused method for multi-task dense prediction, called +Mixture-of-Low-Rank-Experts (MLoRE). To model the global task relationships, +MLoRE adds a generic convolution path to the original MoE structure, where each +task feature can go through this path for explicit parameter sharing. +Furthermore, to control the parameters and computational cost brought by the +increase in the number of experts, we take inspiration from LoRA and propose to +leverage the low-rank format of a vanilla convolution in the expert network. +Since the low-rank experts have fewer parameters and can be dynamically +parameterized into the generic convolution, the parameters and computational +cost do not change much with the increase of experts. Benefiting from this +design, we increase the number of experts and their receptive fields to enlarge +the representation capacity, facilitating the learning of multiple dense tasks in a +unified network. Extensive experiments on the PASCAL-Context and NYUD-v2 +benchmarks show that our MLoRE achieves superior performance compared to +previous state-of-the-art methods on all metrics. Our code is available at +https://github.com/YuqiYang213/MLoRE. +
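A rough sketch of a low-rank convolution expert next to an always-on generic path. The rank, the gating, and the channel sizes are illustrative assumptions, and the re-parameterization of experts into the generic convolution at inference is omitted; this is not the released MLoRE code. For reference, a full 3x3 conv on 64 channels has 64*64*9 = 36,864 weights, while the rank-8 factorization below has 64*8*9 + 8*64 = 5,120.

```python
import torch
import torch.nn as nn

class LowRankConvExpert(nn.Module):
    """A 3x3 convolution factorized through a small rank r to cut parameters."""
    def __init__(self, channels, rank=8):
        super().__init__()
        self.down = nn.Conv2d(channels, rank, kernel_size=3, padding=1, bias=False)
        self.up = nn.Conv2d(rank, channels, kernel_size=1, bias=False)

    def forward(self, x):
        return self.up(self.down(x))

class MixtureOfLowRankExperts(nn.Module):
    """Shared generic conv path plus a gated sum of low-rank experts."""
    def __init__(self, channels, num_experts=4, rank=8):
        super().__init__()
        self.generic = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.experts = nn.ModuleList(
            [LowRankConvExpert(channels, rank) for _ in range(num_experts)])
        self.gate = nn.Sequential(
            nn.AdaptiveAvgPool2d(1), nn.Flatten(),
            nn.Linear(channels, num_experts), nn.Softmax(dim=-1))

    def forward(self, x):
        weights = self.gate(x)                      # (B, num_experts)
        out = self.generic(x)                       # always-on shared path
        for i, expert in enumerate(self.experts):
            out = out + weights[:, i].view(-1, 1, 1, 1) * expert(x)
        return out

# Toy usage.
block = MixtureOfLowRankExperts(channels=64)
print(block(torch.randn(2, 64, 32, 32)).shape)
```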
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Paired Diffusion: Generation of related, synthetic PET-CT-Segmentation + scans using Linked Denoising Diffusion Probabilistic Models + + +
+ The rapid advancement of Artificial Intelligence (AI) in biomedical imaging +and radiotherapy is hindered by the limited availability of large imaging data +repositories. With recent research and improvements in denoising diffusion +probabilistic models (DDPM), high-quality synthetic medical scans are now +possible. Despite this, there is currently no way of generating multiple +related images, such as a corresponding ground truth that can be used to train +models, so synthetic scans are often manually annotated before use. This +research introduces a novel architecture that is able to generate multiple, +related PET-CT-tumour mask pairs using paired networks and conditional +encoders. Our approach includes innovative, time step-controlled mechanisms and +a `noise-seeding' strategy to improve DDPM sampling consistency. While our +model requires a modified perceptual loss function to ensure accurate feature +alignment, we show generation of clearly aligned synthetic images and +improved segmentation accuracy with the generated images. +
+
+ comment: to be published in IEEE International Symposium on Biomedical Imaging + 2024 +
+
+
+
+
+ + ☆ FastPerson: Enhancing Video Learning through Effective Video + Summarization that Preserves Linguistic and Visual Contexts + + +
+ Quickly understanding lengthy lecture videos is essential for learners with +limited time and interest in various topics to improve their learning +efficiency. To this end, video summarization has been actively researched to +enable users to view only important scenes from a video. However, these studies +focus on either the visual or audio information of a video and extract +important segments in the video. Therefore, there is a risk of missing +important information when both the teacher's speech and visual information on +the blackboard or slides are important, such as in a lecture video. To tackle +this issue, we propose FastPerson, a video summarization approach that +considers both the visual and auditory information in lecture videos. +FastPerson creates summary videos by utilizing audio transcriptions along with +on-screen images and text, minimizing the risk of overlooking crucial +information for learners. Further, it provides a feature that allows learners +to switch between the summary and original videos for each chapter of the +video, enabling them to adjust the pace of learning based on their interests +and level of understanding. We conducted an evaluation with 40 participants to +assess the effectiveness of our method and confirmed that it reduced viewing +time by 53\% at the same level of comprehension as that when using traditional +video playback methods. + +
+
+
+
+
+ + ☆ Deep Learning for Segmentation of Cracks in High-Resolution Images of + Steel Bridges + + +
+ Automating the current bridge visual inspection practices using drones and +image processing techniques is a prominent way to make these inspections more +effective, more robust, and less expensive. In this paper, we investigate the +development of a novel deep-learning method for the detection of fatigue cracks +in high-resolution images of steel bridges. First, we present a novel and +challenging dataset comprising images of cracks in steel bridges. Second, +we integrate the ConvNext neural network with a previous state-of-the-art +encoder-decoder network for crack segmentation. We study and report the +effects of the use of background patches on the network performance when +applied to high-resolution images of cracks in steel bridges. Finally, we +introduce a loss function that allows the use of more background patches for +the training process, which yields a significant reduction in false positive +rates. +
+
+
+
+
+ + ☆ Invisible Gas Detection: An RGB-Thermal Cross Attention Network and A + New Benchmark + + +
+ The widespread use of various chemical gases in industrial processes +necessitates effective measures to prevent their leakage during transportation +and storage, given their high toxicity. Thermal infrared-based computer vision +detection techniques provide a straightforward approach to identify gas leakage +areas. However, the development of high-quality algorithms has been challenging +due to the low texture in thermal images and the lack of open-source datasets. +In this paper, we present the RGB-Thermal Cross Attention Network (RT-CAN), +which employs an RGB-assisted two-stream network architecture to integrate +texture information from RGB images and gas area information from thermal +images. Additionally, to facilitate the research of invisible gas detection, we +introduce Gas-DB, an extensive open-source gas detection database including +about 1.3K well-annotated RGB-thermal images with eight variant collection +scenes. Experimental results demonstrate that our method successfully leverages +the advantages of both modalities, achieving state-of-the-art (SOTA) +performance among RGB-thermal methods, surpassing single-stream SOTA models in +terms of accuracy, Intersection of Union (IoU), and F2 metrics by 4.86%, 5.65%, +and 4.88%, respectively. The code and data will be made available soon. + +
+
+
+
+
+ + ☆ Groupwise Query Specialization and Quality-Aware Multi-Assignment for + Transformer-based Visual Relationship Detection CVPR 2024 + + +
+ Visual Relationship Detection (VRD) has seen significant advancements with +Transformer-based architectures recently. However, we identify two key +limitations in a conventional label assignment for training Transformer-based +VRD models, which is a process of mapping a ground-truth (GT) to a prediction. +Under the conventional assignment, an unspecialized query is trained since a +query is expected to detect every relation, which makes it difficult for a +query to specialize in specific relations. Furthermore, a query is also +insufficiently trained since a GT is assigned only to a single prediction, +therefore near-correct or even correct predictions are suppressed by being +assigned no relation as a GT. To address these issues, we propose Groupwise +Query Specialization and Quality-Aware Multi-Assignment (SpeaQ). Groupwise +Query Specialization trains a specialized query by dividing queries and +relations into disjoint groups and directing a query in a specific query group +solely toward relations in the corresponding relation group. Quality-Aware +Multi-Assignment further facilitates the training by assigning a GT to multiple +predictions that are significantly close to a GT in terms of a subject, an +object, and the relation in between. Experimental results and analyses show +that SpeaQ effectively trains specialized queries, which better utilize the +capacity of a model, resulting in consistent performance gains with zero +additional inference cost across multiple VRD models and benchmarks. Code is +available at https://github.com/mlvlab/SpeaQ. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Panonut360: A Head and Eye Tracking Dataset for Panoramic Video ACM MM + + +
+ With the rapid development and widespread application of VR/AR technology, +maximizing the quality of immersive panoramic video services that match users' +personal preferences and habits has become a long-standing challenge. +Understanding the saliency region where users focus, based on data collected +with HMDs, can promote multimedia encoding, transmission, and quality +assessment. At the same time, large-scale datasets are essential for +researchers and developers to explore short/long-term user behavior patterns +and train AI models related to panoramic videos. However, existing panoramic +video datasets often include low-frequency user head or eye movement data +through short-term videos only, lacking sufficient data for analyzing users' +Field of View (FoV) and generating video saliency regions. + Driven by these practical factors, in this paper, we present a head and eye +tracking dataset involving 50 users (25 males and 25 females) watching 15 +panoramic videos. The dataset provides details on the viewport and gaze +attention locations of users. Besides, we present some statistics samples +extracted from the dataset. For example, the deviation between head and eye +movements challenges the widely held assumption that gaze attention decreases +from the center of the FoV following a Gaussian distribution. Our analysis +reveals a consistent downward offset in gaze fixations relative to the FoV in +experimental settings involving multiple users and videos. That's why we name +the dataset Panonut, a saliency weighting shaped like a donut. Finally, we also +provide a script that generates saliency distributions based on given head or +eye coordinates and pre-generated saliency distribution map sets of each video +from the collected eye tracking data. + The dataset is available on website: https://dianvrlab.github.io/Panonut360/. + +
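The released script is not reproduced here; the sketch below only shows the generic idea of turning gaze fixations into a saliency map by accumulating an isotropic Gaussian per fixation, with an optional downward offset of the kind the dataset reports. The resolution, sigma, and offset values are placeholders.

```python
import numpy as np

def gaze_saliency_map(gaze_xy, width=1920, height=1080,
                      sigma_px=80.0, downward_offset_px=0.0):
    """Accumulate an isotropic Gaussian at each gaze fixation and normalize.

    gaze_xy: (N, 2) pixel coordinates of fixations.
    downward_offset_px: shift modeling a below-center gaze bias; fit it from
    your own data rather than treating this default as a dataset constant.
    """
    ys, xs = np.mgrid[0:height, 0:width]
    saliency = np.zeros((height, width), dtype=np.float64)
    for gx, gy in gaze_xy:
        gy = gy + downward_offset_px
        saliency += np.exp(-((xs - gx) ** 2 + (ys - gy) ** 2) / (2 * sigma_px ** 2))
    total = saliency.sum()
    return saliency / total if total > 0 else saliency

# Toy usage: three fixations near the frame center.
sal = gaze_saliency_map(np.array([[960, 540], [1000, 600], [900, 560]]),
                        downward_offset_px=30.0)
print(sal.shape, sal.max())
```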
+
+ comment: 7 pages, ACM MMSys'24 accepted +
+
+
+
+
+ + ☆ The Solution for the CVPR 2023 1st foundation model challenge-Track2 + + +
+ In this paper, we propose a solution for cross-modal transportation +retrieval. Due to the cross-domain problem of traffic images, we divide the +problem into two sub-tasks of pedestrian retrieval and vehicle retrieval +through a simple strategy. In pedestrian retrieval tasks, we use IRRA as the +base model and specifically design an Attribute Classification to mine the +knowledge implied by attribute labels. More importantly, we use the strategy of +Inclusion Relation Matching to make the image-text pairs with inclusion +relation have similar representations in the feature space. For the vehicle +retrieval task, we use BLIP as the base model. Since aligning the color +attributes of vehicles is challenging, we introduce attribute-based object +detection techniques to add color patch blocks to vehicle images for color data +augmentation. This serves as strong prior information, helping the model +perform the image-text alignment. At the same time, we incorporate labeled +attributes into the image-text alignment loss to learn fine-grained alignment +and prevent similar images and texts from being incorrectly separated. Our +approach ranked first in the final B-board test with a score of 70.9. +
+
+
+
+
+ + ☆ Rotate to Scan: UNet-like Mamba with Triplet SSM Module for Medical + Image Segmentation + + +
+ Image segmentation holds a vital position in the realms of diagnosis and +treatment within the medical domain. Traditional convolutional neural networks +(CNNs) and Transformer models have made significant advancements in this realm, +but they still encounter challenges because of limited receptive field or high +computing complexity. Recently, State Space Models (SSMs), particularly Mamba +and its variants, have demonstrated notable performance in the field of vision. +However, their feature extraction methods may not be sufficiently effective and +retain some redundant structures, leaving room for parameter reduction. +Motivated by previous spatial and channel attention methods, we propose Triplet +Mamba-UNet. The method leverages residual VSS Blocks to extract intensive +contextual features, while Triplet SSM is employed to fuse features across +spatial and channel dimensions. We conducted experiments on ISIC17, ISIC18, +CVC-300, CVC-ClinicDB, Kvasir-SEG, CVC-ColonDB, and Kvasir-Instrument datasets, +demonstrating the superior segmentation performance of our proposed TM-UNet. +Additionally, compared to the previous VM-UNet, our model achieves a one-third +reduction in parameters. + +
+
+
+
+
+ + ☆ PlainMamba: Improving Non-Hierarchical Mamba in Visual Recognition + + +
+ We present PlainMamba: a simple non-hierarchical state space model (SSM) +designed for general visual recognition. The recent Mamba model has shown how +SSMs can be highly competitive with other architectures on sequential data and +initial attempts have been made to apply it to images. In this paper, we +further adapt the selective scanning process of Mamba to the visual domain, +enhancing its ability to learn features from two-dimensional images by (i) a +continuous 2D scanning process that improves spatial continuity by ensuring +adjacency of tokens in the scanning sequence, and (ii) direction-aware updating +which enables the model to discern the spatial relations of tokens by encoding +directional information. Our architecture is designed to be easy to use and +easy to scale, formed by stacking identical PlainMamba blocks, resulting in a +model with constant width throughout all layers. The architecture is further +simplified by removing the need for special tokens. We evaluate PlainMamba on a +variety of visual recognition tasks including image classification, semantic +segmentation, object detection, and instance segmentation. Our method achieves +performance gains over previous non-hierarchical models and is competitive with +hierarchical alternatives. For tasks requiring high-resolution inputs, in +particular, PlainMamba requires much less computing while maintaining high +performance. Code and models are available at +https://github.com/ChenhongyiYang/PlainMamba + +
+
+
+
+
+ + ☆ AniPortrait: Audio-Driven Synthesis of Photorealistic Portrait Animation + + +
+ In this study, we propose AniPortrait, a novel framework for generating +high-quality animation driven by audio and a reference portrait image. Our +methodology is divided into two stages. Initially, we extract 3D intermediate +representations from audio and project them into a sequence of 2D facial +landmarks. Subsequently, we employ a robust diffusion model, coupled with a +motion module, to convert the landmark sequence into photorealistic and +temporally consistent portrait animation. Experimental results demonstrate the +superiority of AniPortrait in terms of facial naturalness, pose diversity, and +visual quality, thereby offering an enhanced perceptual experience. Moreover, +our methodology exhibits considerable potential in terms of flexibility and +controllability, which can be effectively applied in areas such as facial +motion editing or face reenactment. We release code and model weights at +https://github.com/scutzzj/AniPortrait + +
+
+
+
+
+ + ☆ Manifold-Guided Lyapunov Control with Diffusion Models + + +
+ This paper presents a novel approach to generating stabilizing controllers +for a large class of dynamical systems using diffusion models. The core +objective is to develop stabilizing control functions by identifying the +closest asymptotically stable vector field relative to a predetermined manifold +and adjusting the control function based on this finding. To achieve this, we +employ a diffusion model trained on pairs consisting of asymptotically stable +vector fields and their corresponding Lyapunov functions. Our numerical results +demonstrate that this pre-trained model can achieve stabilization over +previously unseen systems efficiently and rapidly, showcasing the potential of +our approach in fast zero-shot control and generalizability. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Not All Similarities Are Created Equal: Leveraging Data-Driven Biases to + Inform GenAI Copyright Disputes + + +
+ The advent of Generative Artificial Intelligence (GenAI) models, including +GitHub Copilot, OpenAI GPT, and Stable Diffusion, has revolutionized content +creation, enabling non-professionals to produce high-quality content across +various domains. This transformative technology has led to a surge of synthetic +content and sparked legal disputes over copyright infringement. To address +these challenges, this paper introduces a novel approach that leverages the +learning capacity of GenAI models for copyright legal analysis, demonstrated +with GPT2 and Stable Diffusion models. Copyright law distinguishes between +original expressions and generic ones (Sc\`enes \`a faire), protecting the +former and permitting reproduction of the latter. However, this distinction has +historically been challenging to make consistently, leading to over-protection +of copyrighted works. GenAI offers an unprecedented opportunity to enhance this +legal analysis by revealing shared patterns in preexisting works. We propose a +data-driven approach to identify the genericity of works created by GenAI, +employing "data-driven bias" to assess the genericity of expressive +compositions. This approach aids in copyright scope determination by utilizing +the capabilities of GenAI to identify and prioritize expressive elements and +rank them according to their frequency in the model's dataset. The potential +implications of measuring expressive genericity for copyright law are profound. +Such scoring could assist courts in determining copyright scope during +litigation, inform the registration practices of Copyright Offices, allowing +registration of only highly original synthetic works, and help copyright owners +signal the value of their works and facilitate fairer licensing deals. More +generally, this approach offers valuable insights to policymakers grappling +with adapting copyright law to the challenges posed by the era of GenAI. + +
+
+ comment: Presented at ACM CSLAW 2024 +
+
+
+
+
+ + ☆ Hierarchical Light Transformer Ensembles for Multimodal Trajectory + Forecasting + + +
+ Accurate trajectory forecasting is crucial for the performance of various +systems, such as advanced driver-assistance systems and self-driving vehicles. +These forecasts make it possible to anticipate events leading to collisions and, +therefore, to mitigate them. Deep Neural Networks have excelled in motion +forecasting, but issues like overconfidence and uncertainty quantification +persist. Deep Ensembles address these concerns, yet applying them to multimodal +distributions remains challenging. In this paper, we propose a novel approach +named Hierarchical Light Transformer Ensembles (HLT-Ens), aimed at efficiently +training an ensemble of Transformer architectures using a novel hierarchical +loss function. HLT-Ens leverages grouped fully connected layers, inspired by +grouped convolution techniques, to capture multimodal distributions +effectively. Through extensive experimentation, we demonstrate that HLT-Ens +achieves state-of-the-art performance levels, offering a promising avenue for +improving trajectory forecasting techniques. +
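The grouped fully connected trick mentioned above can be realized with a grouped 1x1 convolution, so all ensemble members are computed in one op, as in the sketch below. The layer sizes and ensemble head structure are illustrative assumptions, not the HLT-Ens architecture.

```python
import torch
import torch.nn as nn

class GroupedLinear(nn.Module):
    """G independent linear layers computed with a single grouped 1x1 conv.

    Input:  (B, G, in_features)  -- one feature vector per ensemble member
    Output: (B, G, out_features)
    """
    def __init__(self, in_features, out_features, groups):
        super().__init__()
        self.groups = groups
        self.conv = nn.Conv1d(groups * in_features, groups * out_features,
                              kernel_size=1, groups=groups)

    def forward(self, x):
        b, g, f = x.shape
        assert g == self.groups, "second dim must equal the number of groups"
        y = self.conv(x.reshape(b, g * f, 1))   # one grouped matmul for all members
        return y.reshape(b, g, -1)

# Toy usage: a 4-member ensemble head over 128-dim features.
head = GroupedLinear(in_features=128, out_features=6, groups=4)
print(head(torch.randn(32, 4, 128)).shape)  # torch.Size([32, 4, 6])
```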
+
+
+
+
+ + ☆ Predicting Perceived Gloss: Do Weak Labels Suffice? + + +
+ Estimating perceptual attributes of materials directly from images is a +challenging task due to their complex, not fully-understood interactions with +external factors, such as geometry and lighting. Supervised deep learning +models have recently been shown to outperform traditional approaches, but rely +on large datasets of human-annotated images for accurate perception +predictions. Obtaining reliable annotations is a costly endeavor, aggravated by +the limited ability of these models to generalise to different aspects of +appearance. In this work, we show how a much smaller set of human annotations +("strong labels") can be effectively augmented with automatically derived "weak +labels" in the context of learning a low-dimensional image-computable gloss +metric. We evaluate three alternative weak labels for predicting human gloss +perception from limited annotated data. Incorporating weak labels enhances our +gloss prediction beyond the current state of the art. Moreover, it enables a +substantial reduction in human annotation costs without sacrificing accuracy, +whether working with rendered images or real photographs. + +
+
+ comment: Computer Graphics Forum (Eurographics 2024) +
+
+
+
+
+ + ☆ DiffFAE: Advancing High-fidelity One-shot Facial Appearance Editing with + Space-sensitive Customization and Semantic Preservation + + +
+ Facial Appearance Editing (FAE) aims to modify physical attributes, such as +pose, expression and lighting, of human facial images while preserving +attributes like identity and background, showing great importance in +photography. In spite of the great progress in this area, current research +generally faces three challenges: low generation fidelity, poor attribute +preservation, and inefficient inference. To overcome the above challenges, this +paper presents DiffFAE, a one-stage and highly-efficient diffusion-based +framework tailored for high-fidelity FAE. For high-fidelity query attributes +transfer, we adopt Space-sensitive Physical Customization (SPC), which ensures +the fidelity and generalization ability by utilizing rendering texture derived +from a 3D Morphable Model (3DMM). In order to preserve source attributes, we +introduce the Region-responsive Semantic Composition (RSC). This module is +guided to learn decoupled source-regarding features, thereby better preserving +the identity and alleviating artifacts from non-facial attributes such as hair, +clothes, and background. We further introduce a consistency regularization for +our pipeline to enhance editing controllability by leveraging prior knowledge +in the attention matrices of the diffusion model. Extensive experiments demonstrate +the superiority of DiffFAE over existing methods, achieving state-of-the-art +performance in facial appearance editing. +
+
+
+
+
+ + ☆ Exploring Dynamic Transformer for Efficient Object Tracking + + +
+ The speed-precision trade-off is a critical problem for visual object +tracking which usually requires low latency and deployment on constrained +resources. Existing solutions for efficient tracking mainly focus on adopting +light-weight backbones or modules, which nevertheless come at the cost of a +sacrifice in precision. In this paper, inspired by dynamic network routing, we +propose DyTrack, a dynamic transformer framework for efficient tracking. +Real-world tracking scenarios exhibit diverse levels of complexity. We argue +that a simple network is sufficient for easy frames in video sequences, while +more computation could be assigned to difficult ones. DyTrack automatically +learns to configure proper reasoning routes for various inputs, gaining better +utilization of the available computational budget. Thus, it can achieve higher +performance with the same running speed. We formulate instance-specific +tracking as a sequential decision problem and attach terminating branches to +intermediate layers of the entire model. Especially, to fully utilize the +computations, we introduce the feature recycling mechanism to reuse the outputs +of predecessors. Furthermore, a target-aware self-distillation strategy is +designed to enhance the discriminating capabilities of early predictions by +effectively mimicking the representation pattern of the deep model. Extensive +experiments on multiple benchmarks demonstrate that DyTrack achieves promising +speed-precision trade-offs with only a single model. For instance, DyTrack +obtains 64.9% AUC on LaSOT with a speed of 256 fps. + +
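A minimal sketch of inference-time early exit with per-layer terminating branches, in the spirit of the dynamic routing described above. The confidence head, threshold, and stopping rule are placeholders, and training-time components such as feature recycling and target-aware self-distillation are not shown.

```python
import torch
import torch.nn as nn

class EarlyExitEncoder(nn.Module):
    """Transformer encoder with a lightweight exit head after every layer.

    At inference, computation stops at the first layer whose exit head is
    confident enough, so easy frames use fewer layers than hard ones.
    """
    def __init__(self, dim=256, depth=6, heads=8, exit_threshold=0.9):
        super().__init__()
        self.layers = nn.ModuleList(
            [nn.TransformerEncoderLayer(dim, heads, batch_first=True)
             for _ in range(depth)])
        self.exits = nn.ModuleList([nn.Linear(dim, 1) for _ in range(depth)])
        self.exit_threshold = exit_threshold

    @torch.no_grad()
    def forward(self, tokens):
        confidence = torch.zeros(tokens.shape[0], 1)
        for layer, exit_head in zip(self.layers, self.exits):
            tokens = layer(tokens)
            confidence = torch.sigmoid(exit_head(tokens.mean(dim=1)))  # (B, 1)
            if confidence.min() >= self.exit_threshold:
                break   # terminate early for easy inputs
        return tokens, confidence

# Toy usage on one sequence of 64 tokens.
enc = EarlyExitEncoder()
out, conf = enc(torch.randn(1, 64, 256))
print(out.shape, float(conf))
```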
+
+
+
+
+ + ☆ High-Resolution Image Translation Model Based on Grayscale Redefinition + + +
+ Image-to-image translation is a technique that focuses on transferring images from one domain to another while maintaining the essential content representations. In recent years, image-to-image translation has gained significant attention and achieved remarkable advancements due to its diverse applications in computer vision and image processing tasks. In this work, we propose an innovative method for image translation between different domains. For high-resolution image translation tasks, we use a grayscale adjustment method to achieve pixel-level translation. For other tasks, we utilize the Pix2PixHD model with a coarse-to-fine generator, a multi-scale discriminator, and an improved loss to enhance image translation performance. In addition, to tackle the issue of sparse training data, we adopt model weight initialization from other tasks to optimize the performance of the current task.
+
+
+
+
+ + ☆ Learning with Unreliability: Fast Few-shot Voxel Radiance Fields with + Relative Geometric Consistency CVPR 2024 + + +
+ We propose a voxel-based optimization framework, ReVoRF, for few-shot +radiance fields that strategically address the unreliability in pseudo novel +view synthesis. Our method pivots on the insight that relative depth +relationships within neighboring regions are more reliable than the absolute +color values in disoccluded areas. Consequently, we devise a bilateral +geometric consistency loss that carefully navigates the trade-off between color +fidelity and geometric accuracy in the context of depth consistency for +uncertain regions. Moreover, we present a reliability-guided learning strategy +to discern and utilize the variable quality across synthesized views, +complemented by a reliability-aware voxel smoothing algorithm that smoothens +the transition between reliable and unreliable data patches. Our approach +allows for a more nuanced use of all available data, promoting enhanced +learning from regions previously considered unsuitable for high-quality +reconstruction. Extensive experiments across diverse datasets reveal that our +approach attains significant gains in efficiency and accuracy, delivering +rendering speeds of 3 FPS, 7 mins to train a $360^\circ$ scene, and a 5\% +improvement in PSNR over existing few-shot methods. Code is available at +https://github.com/HKCLynn/ReVoRF. + +
+
+ comment: CVPR 2024 final version +
+
+
+
+
+ + ☆ UADA3D: Unsupervised Adversarial Domain Adaptation for 3D Object + Detection with Sparse LiDAR and Large Domain Gaps + + +
+ In this study, we address a gap in existing unsupervised domain adaptation +approaches on LiDAR-based 3D object detection, which have predominantly +concentrated on adapting between established, high-density autonomous driving +datasets. We focus on sparser point clouds, capturing scenarios from different +perspectives: not just from vehicles on the road but also from mobile robots on +sidewalks, which encounter significantly different environmental conditions and +sensor configurations. We introduce Unsupervised Adversarial Domain Adaptation +for 3D Object Detection (UADA3D). UADA3D does not depend on pre-trained source +models or teacher-student architectures. Instead, it uses an adversarial +approach to directly learn domain-invariant features. We demonstrate its +efficacy in various adaptation scenarios, showing significant improvements in +both self-driving car and mobile robot domains. Our code is open-source and +will be available soon. + +
+
+
+
+
+ + ☆ AniArtAvatar: Animatable 3D Art Avatar from a Single Image + + +
+ We present a novel approach for generating animatable 3D-aware art avatars +from a single image, with controllable facial expressions, head poses, and +shoulder movements. Unlike previous reenactment methods, our approach utilizes +a view-conditioned 2D diffusion model to synthesize multi-view images from a +single art portrait with a neutral expression. With the generated colors and +normals, we synthesize a static avatar using an SDF-based neural surface. For +avatar animation, we extract control points, transfer the motion with these +points, and deform the implicit canonical space. Firstly, we render the front +image of the avatar, extract the 2D landmarks, and project them to the 3D space +using a trained SDF network. We extract 3D driving landmarks using 3DMM and +transfer the motion to the avatar landmarks. To animate the avatar pose, we +manually set the body height and bound the head and torso of an avatar with two +cages. The head and torso can be animated by transforming the two cages. Our +approach is a one-shot pipeline that can be applied to various styles. +Experiments demonstrate that our method can generate high-quality 3D art +avatars with desired control over different motions. + +
+
+
+
+
+ + ☆ Grad-CAMO: Learning Interpretable Single-Cell Morphological Profiles + from 3D Cell Painting Images + + +
+ Despite their black-box nature, deep learning models are extensively used in image-based drug discovery to extract feature vectors from single cells in microscopy images. To better understand how these networks perform representation learning, we employ visual explainability techniques (e.g., Grad-CAM). Our analyses reveal several mechanisms by which supervised models cheat, exploiting biologically irrelevant pixels when extracting morphological features from images, such as noise in the background. This raises doubts regarding the fidelity of learned single-cell representations and their relevance when investigating downstream biological questions. To address this misalignment between researcher expectations and machine behavior, we introduce Grad-CAMO, a novel single-cell interpretability score for supervised feature extractors. Grad-CAMO measures the proportion of a model's attention that is concentrated on the cell of interest versus the background. This metric can be assessed per-cell or averaged across a validation set, offering a tool to audit individual feature vectors or guide the improved design of deep learning architectures. Importantly, Grad-CAMO seamlessly integrates into existing workflows, requiring no dataset or model modifications, and is compatible with both 2D and 3D Cell Painting data. Additional results are available at https://github.com/eigenvivek/Grad-CAMO.
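+ A minimal sketch of the Grad-CAMO score as described above, i.e. the fraction of a Grad-CAM-style attention map that falls on the cell of interest rather than the background; the heatmap and mask below are toy inputs, and computing the Grad-CAM map itself is out of scope:

import numpy as np

def grad_camo_score(attention_map: np.ndarray, cell_mask: np.ndarray) -> float:
    """Fraction of total attention that falls on the cell of interest.

    attention_map: non-negative Grad-CAM heatmap, same spatial size as the crop.
    cell_mask: boolean segmentation mask of the target cell (True = cell pixel).
    Returns a value in [0, 1]; low scores suggest the model relies on background."""
    attention_map = np.clip(attention_map, 0, None)
    total = attention_map.sum()
    if total == 0:
        return 0.0
    return float(attention_map[cell_mask].sum() / total)

# toy example: an 8x8 heatmap concentrated in the centre, and a centred cell mask
heat = np.zeros((8, 8)); heat[2:6, 2:6] = 1.0
mask = np.zeros((8, 8), dtype=bool); mask[3:5, 3:5] = True
print(grad_camo_score(heat, mask))  # 0.25 of the attention lies on the cell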
+
+
+
+
+ + ☆ MMVP: A Multimodal MoCap Dataset with Vision and Pressure Sensors CVPR2024 + + +
+ Foot contact is an important cue not only for human motion capture but also for motion understanding and physically plausible motion generation. However, most of the foot-contact annotations in existing datasets are estimated by purely visual matching and distance thresholding, which results in low accuracy and coarse granularity. Even though existing multimodal datasets synergistically capture plantar pressure (foot contact) and visual signals, they are specifically designed for small-range and slow motion such as Taiji Quan and Yoga. Therefore, there is still a lack of a vision-pressure multimodal dataset with large-range and fast human motion, as well as accurate and dense foot-contact annotation. To fill this gap, we propose a Multimodal MoCap Dataset with Vision and Pressure sensors, named MMVP. MMVP provides accurate and dense plantar pressure signals synchronized with RGBD observations, which is especially useful for plausible shape estimation, robust pose fitting without foot drifting, and accurate global translation tracking. To validate the dataset, we propose an RGBD-P SMPL fitting method and a monocular-video-based baseline framework, VP-MoCap, for human motion capture. Experiments demonstrate that our RGBD-P SMPL fitting results significantly outperform pure visual motion capture. Moreover, VP-MoCap outperforms SOTA methods in foot-contact and global translation estimation accuracy. We believe the configuration of the dataset and the baseline frameworks will stimulate research in this direction and also provide a good reference for MoCap applications in various domains. Project page: https://haolyuan.github.io/MMVP-Dataset/.
+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ Fake or JPEG? Revealing Common Biases in Generated Image Detection + Datasets + + +
+ The widespread adoption of generative image models has highlighted the urgent need to detect artificial content, which is a crucial step in combating widespread manipulation and misinformation. Consequently, numerous detectors and associated datasets have emerged. However, many of these datasets inadvertently introduce undesirable biases, thereby impacting the effectiveness and evaluation of detectors. In this paper, we emphasize that many datasets for AI-generated image detection contain biases related to JPEG compression and image size. Using the GenImage dataset, we demonstrate that detectors indeed learn from these undesired factors. Furthermore, we show that removing the named biases substantially increases robustness to JPEG compression and significantly alters the cross-generator performance of evaluated detectors. Specifically, it leads to an increase of more than 11 percentage points in cross-generator performance for ResNet50 and Swin-T detectors on the GenImage dataset, achieving state-of-the-art results.
 We provide the dataset and source code of this paper on the anonymous website: https://www.unbiased-genimage.org
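+ One simple way to neutralise such a compression confound is to re-encode every image, real and generated, at the same resolution and JPEG quality before training and evaluation. The sketch below only illustrates this general idea; the quality setting and preprocessing steps are assumptions, not the exact procedure used in the paper:

from io import BytesIO
from PIL import Image

def align_jpeg(image: Image.Image, quality: int = 96, size: int = 256) -> Image.Image:
    """Resize and re-encode an image so that real and generated samples share the
    same resolution and JPEG quality, removing these shortcuts from the data."""
    image = image.convert("RGB").resize((size, size))
    buffer = BytesIO()
    image.save(buffer, format="JPEG", quality=quality)
    buffer.seek(0)
    return Image.open(buffer).convert("RGB")

# usage: apply the same preprocessing to every sample, regardless of its label
# img = align_jpeg(Image.open("sample.png"))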
+
+
+
+
+ + ☆ Dual Memory Networks: A Versatile Adaptation Approach for + Vision-Language Models CVPR2024 + + +
+ With the emergence of pre-trained vision-language models like CLIP, how to +adapt them to various downstream classification tasks has garnered significant +attention in recent research. The adaptation strategies can be typically +categorized into three paradigms: zero-shot adaptation, few-shot adaptation, +and the recently-proposed training-free few-shot adaptation. Most existing +approaches are tailored for a specific setting and can only cater to one or two +of these paradigms. In this paper, we introduce a versatile adaptation approach +that can effectively work under all three settings. Specifically, we propose +the dual memory networks that comprise dynamic and static memory components. +The static memory caches training data knowledge, enabling training-free +few-shot adaptation, while the dynamic memory preserves historical test +features online during the testing process, allowing for the exploration of +additional data insights beyond the training set. This novel capability +enhances model performance in the few-shot setting and enables model usability +in the absence of training data. The two memory networks employ the same +flexible memory interactive strategy, which can operate in a training-free mode +and can be further enhanced by incorporating learnable projection layers. Our +approach is tested across 11 datasets under the three task settings. +Remarkably, in the zero-shot scenario, it outperforms existing methods by over +3\% and even shows superior results against methods utilizing external training +data. Additionally, our method exhibits robust performance against natural +distribution shifts. Codes are available at \url{https://github.com/YBZh/DMN}. + +
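+ A hedged sketch of the training-free "static memory" idea: a cache of few-shot support features keyed by one-hot labels, queried by cosine similarity and blended with zero-shot text logits. This mirrors prior cache-based adapters; the blending weights are illustrative and the dynamic test-time memory is omitted:

import torch
import torch.nn.functional as F

def cache_logits(query, cache_keys, cache_values, text_logits, alpha=1.0, beta=5.5):
    """Training-free few-shot logits from a feature cache plus zero-shot text logits.

    query:        (B, D) L2-normalised image features.
    cache_keys:   (N, D) L2-normalised features of the few-shot support images.
    cache_values: (N, C) one-hot labels of the support images.
    text_logits:  (B, C) zero-shot logits from the text encoder."""
    affinity = query @ cache_keys.t()                  # (B, N) cosine similarities
    weights = torch.exp(-beta * (1.0 - affinity))      # sharpen: near neighbours dominate
    return text_logits + alpha * weights @ cache_values

B, N, C, D = 4, 16, 10, 512
query = F.normalize(torch.randn(B, D), dim=-1)
keys = F.normalize(torch.randn(N, D), dim=-1)
values = F.one_hot(torch.randint(0, C, (N,)), C).float()
text_logits = torch.randn(B, C)
print(cache_logits(query, keys, values, text_logits).shape)  # torch.Size([4, 10])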
+
+ comment: CVPR2024; Codes are available at \url{https://github.com/YBZh/DMN} +
+
+
+
+
+ + ☆ DeepMIF: Deep Monotonic Implicit Fields for Large-Scale LiDAR 3D Mapping + + +
+ Recently, significant progress has been achieved in sensing real large-scale outdoor 3D environments, particularly by using modern acquisition equipment such as LiDAR sensors. Unfortunately, these sensors are fundamentally limited in their ability to produce dense, complete 3D scenes. To address this issue, recent learning-based methods integrate neural implicit representations and optimizable feature grids to approximate surfaces of 3D scenes. However, naively fitting samples along raw LiDAR rays leads to noisy 3D mapping results due to the nature of sparse, conflicting LiDAR measurements. In this work, we instead depart from fitting LiDAR data exactly and let the network optimize a non-metric monotonic implicit field defined in 3D space. To fit our field, we design a learning system integrating a monotonicity loss that enables optimizing neural monotonic fields and leverages recent progress in large-scale 3D mapping. Our algorithm achieves high-quality dense 3D mapping performance as captured by multiple quantitative and perceptual measures and visual results obtained for the Mai City, Newer College, and KITTI benchmarks. The code of our approach will be made publicly available.
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ Practical Applications of Advanced Cloud Services and Generative AI + Systems in Medical Image Analysis + + +
+ The medical field is one of the most important application areas of artificial intelligence technology. With the explosive growth and diversification of medical data, as well as the continuous increase in medical needs and challenges, artificial intelligence technology is playing an increasingly important role in the medical field. Artificial intelligence technologies represented by computer vision, natural language processing, and machine learning have penetrated diverse scenarios such as medical imaging, health management, medical information, and drug research and development, and have become an important driving force for improving the level and quality of medical services. The article explores the transformative potential of generative AI in medical imaging, emphasizing its ability to generate synthetic data, enhance images, aid in anomaly detection, and facilitate image-to-image translation. Despite challenges like model complexity, the applications of generative models in healthcare, including Med-PaLM 2 technology, show promising results. By addressing limitations in dataset size and diversity, these models contribute to more accurate diagnoses and improved patient outcomes. However, ethical considerations and collaboration among stakeholders are essential for responsible implementation. Through experiments leveraging GANs to augment brain tumor MRI datasets, the study demonstrates how generative AI can enhance image quality and diversity, ultimately advancing medical diagnostics and patient care.
+
+
+
+
+ + ☆ A Gaze-grounded Visual Question Answering Dataset for Clarifying + Ambiguous Japanese Questions LREC + + +
+ Situated conversations, which refer to visual information as visual question +answering (VQA), often contain ambiguities caused by reliance on directive +information. This problem is exacerbated because some languages, such as +Japanese, often omit subjective or objective terms. Such ambiguities in +questions are often clarified by the contexts in conversational situations, +such as joint attention with a user or user gaze information. In this study, we +propose the Gaze-grounded VQA dataset (GazeVQA) that clarifies ambiguous +questions using gaze information by focusing on a clarification process +complemented by gaze information. We also propose a method that utilizes gaze +target estimation results to improve the accuracy of GazeVQA tasks. Our +experimental results showed that the proposed method improved the performance +in some cases of a VQA system on GazeVQA and identified some typical problems +of GazeVQA tasks that need to be improved. + +
+
+ comment: LREC-COLING 2024 +
+
+
+
+
+ + ☆ WordRobe: Text-Guided Generation of Textured 3D Garments + + +
+ In this paper, we tackle a new and challenging problem of text-driven +generation of 3D garments with high-quality textures. We propose "WordRobe", a +novel framework for the generation of unposed & textured 3D garment meshes from +user-friendly text prompts. We achieve this by first learning a latent +representation of 3D garments using a novel coarse-to-fine training strategy +and a loss for latent disentanglement, promoting better latent interpolation. +Subsequently, we align the garment latent space to the CLIP embedding space in +a weakly supervised manner, enabling text-driven 3D garment generation and +editing. For appearance modeling, we leverage the zero-shot generation +capability of ControlNet to synthesize view-consistent texture maps in a single +feed-forward inference step, thereby drastically decreasing the generation time +as compared to existing methods. We demonstrate superior performance over +current SOTAs for learning 3D garment latent space, garment interpolation, and +text-driven texture synthesis, supported by quantitative evaluation and +qualitative user study. The unposed 3D garment meshes generated using WordRobe +can be directly fed to standard cloth simulation & animation pipelines without +any post-processing. + +
+
+
+
+
+ + ☆ NeRF-HuGS: Improved Neural Radiance Fields in Non-static Scenes Using + Heuristics-Guided Segmentation CVPR2024 + + +
+ Neural Radiance Field (NeRF) has been widely recognized for its excellence in +novel view synthesis and 3D scene reconstruction. However, their effectiveness +is inherently tied to the assumption of static scenes, rendering them +susceptible to undesirable artifacts when confronted with transient distractors +such as moving objects or shadows. In this work, we propose a novel paradigm, +namely "Heuristics-Guided Segmentation" (HuGS), which significantly enhances +the separation of static scenes from transient distractors by harmoniously +combining the strengths of hand-crafted heuristics and state-of-the-art +segmentation models, thus significantly transcending the limitations of +previous solutions. Furthermore, we delve into the meticulous design of +heuristics, introducing a seamless fusion of Structure-from-Motion (SfM)-based +heuristics and color residual heuristics, catering to a diverse range of +texture profiles. Extensive experiments demonstrate the superiority and +robustness of our method in mitigating transient distractors for NeRFs trained +in non-static scenes. Project page: https://cnhaox.github.io/NeRF-HuGS/. + +
+
+ comment: To appear in CVPR2024 +
+
+
+
+
+ + ☆ Boosting Few-Shot Learning with Disentangled Self-Supervised Learning + and Meta-Learning for Medical Image Classification + + +
+ Background and objective: Employing deep learning models in critical domains such as medical imaging poses challenges associated with the limited availability of training data. We present a strategy for improving the performance and generalization capabilities of models trained in low-data regimes. Methods: The proposed method starts with a pre-training phase, where features learned in a self-supervised learning setting are disentangled to improve the robustness of the representations for downstream tasks. We then introduce a meta-fine-tuning step, leveraging related classes between meta-training and meta-testing phases but varying the granularity level. This approach aims to enhance the model's generalization capabilities by exposing it to more challenging classification tasks during meta-training and evaluating it during meta-testing on easier tasks that hold greater clinical relevance. We demonstrate the effectiveness of the proposed approach through a series of experiments exploring several backbones, as well as diverse pre-training and fine-tuning schemes, on two distinct medical tasks, i.e., classification of prostate cancer aggressiveness from MRI data and classification of breast cancer malignancy from microscopic images. Results: Our results indicate that the proposed approach consistently yields superior performance w.r.t. ablation experiments, maintaining competitiveness even when a distribution shift between training and evaluation data occurs. Conclusion: Extensive experiments demonstrate the effectiveness and wide applicability of the proposed approach. We hope that this work will add another solution to the arsenal of methods for addressing learning issues in data-scarce imaging domains.
+
+ comment: 20 pages, 4 figures, 4 tables. Submitted to Elsevier on 25 March 2024 +
+
+
+
+
+ + ☆ Equipping Sketch Patches with Context-Aware Positional Encoding for + Graphic Sketch Representation + + +
+ The drawing order of a sketch records how it is created stroke by stroke by a human being. For graphic sketch representation learning, recent studies have injected sketch drawing orders into graph edge construction by linking each patch to another in accordance with a temporal-based nearest neighboring strategy. However, such constructed graph edges may be unreliable, since a sketch could have variants of drawings. In this paper, we propose a variant-drawing-protected method by equipping sketch patches with context-aware positional encoding (PE) to make better use of drawing orders for learning graphic sketch representation. Instead of injecting sketch drawings into graph edges, we embed this sequential information into graph nodes only. More specifically, each patch embedding is equipped with a sinusoidal absolute PE to highlight its sequential position in the drawing order. Its neighboring patches, ranked by the self-attention scores between patch embeddings, are equipped with learnable relative PEs to restore the contextual positions within a neighborhood. During message aggregation via graph convolutional networks, a node receives both semantic content from patch embeddings and contextual patterns from the PEs of its neighbors, arriving at drawing-order-enhanced sketch representations. Experimental results indicate that our method significantly improves sketch healing and controllable sketch synthesis.
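+ A minimal sketch of the absolute part of the proposed encoding: a standard sinusoidal positional encoding indexed by drawing order and added to patch embeddings (the learnable relative PEs and the graph aggregation are omitted; dimensions are illustrative):

import torch

def sinusoidal_pe(num_positions: int, dim: int) -> torch.Tensor:
    """Standard sinusoidal absolute positional encoding, indexed by drawing order."""
    position = torch.arange(num_positions, dtype=torch.float32).unsqueeze(1)    # (P, 1)
    div_term = torch.exp(torch.arange(0, dim, 2, dtype=torch.float32)
                         * (-torch.log(torch.tensor(10000.0)) / dim))           # (dim/2,)
    pe = torch.zeros(num_positions, dim)
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe

# 20 sketch patches ordered by stroke sequence, 128-d patch embeddings
patch_embeddings = torch.randn(20, 128)
encoded = patch_embeddings + sinusoidal_pe(20, 128)   # order-aware node features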
+
+
+
+
+ + ☆ Boosting Adversarial Training via Fisher-Rao Norm-based Regularization CVPR2024 + + +
+ Adversarial training is extensively utilized to improve the adversarial robustness of deep neural networks. Yet, mitigating the degradation of standard generalization performance in adversarially trained models remains an open problem. This paper attempts to resolve this issue through the lens of model complexity. First, we leverage the Fisher-Rao norm, a geometrically invariant metric for model complexity, to establish the non-trivial bounds of the Cross-Entropy-Loss-based Rademacher complexity for a ReLU-activated Multi-Layer Perceptron. Then we generalize a complexity-related variable, which is sensitive to the changes in model width and the trade-off factors in adversarial training. Moreover, intensive empirical evidence validates that this variable highly correlates with the generalization gap of Cross-Entropy loss between adversarially trained and standard-trained models, especially during the initial and final phases of the training process. Building upon this observation, we propose a novel regularization framework, called Logit-Oriented Adversarial Training (LOAT), which can mitigate the trade-off between robustness and accuracy while imposing only a negligible increase in computational overhead. Our extensive experiments demonstrate that the proposed regularization strategy can boost the performance of the prevalent adversarial training algorithms, including PGD-AT, TRADES, TRADES (LSE), MART, and DM-AT, across various network architectures. Our code will be available at https://github.com/TrustAI/LOAT.
+
+ comment: This paper has been accepted to CVPR2024 +
+
+
+
+
+ + ☆ Random-coupled Neural Network + + +
+ Improving the efficiency of current neural networks and modeling them after biological neural systems have become popular research directions in recent years. The pulse-coupled neural network (PCNN) is a widely applied model for imitating the computational characteristics of the human brain in computer vision and neural network fields. However, differences between the PCNN and biological neural systems remain: limited neural connections, high computational cost, and a lack of stochasticity. In this study, the random-coupled neural network (RCNN) is proposed. It overcomes these difficulties in the PCNN's neuromorphic computing via a random inactivation process. This process randomly closes some neural connections in the RCNN model, realized by the random inactivation weight matrix of the link input. This relieves the computational burden of the PCNN, making it affordable to achieve vast neural connections. Furthermore, the image and video processing mechanisms of the RCNN are investigated. It encodes constant stimuli as periodic spike trains and periodic stimuli as chaotic spike trains, mirroring the encoding characteristics of biological neural information. Finally, the RCNN is applied to image segmentation, fusion, and pulse shape discrimination subtasks. It is demonstrated to be robust, efficient, and highly noise-resistant, with outstanding performance in all of the applications mentioned above.
+
+
+
+
+ + ☆ DS-AL: A Dual-Stream Analytic Learning for Exemplar-Free + Class-Incremental Learning AAAI 2024 + + +
+ Class-incremental learning (CIL) under an exemplar-free constraint has +presented a significant challenge. Existing methods adhering to this constraint +are prone to catastrophic forgetting, far more so than replay-based techniques +that retain access to past samples. In this paper, to solve the exemplar-free +CIL problem, we propose a Dual-Stream Analytic Learning (DS-AL) approach. The +DS-AL contains a main stream offering an analytical (i.e., closed-form) linear +solution, and a compensation stream improving the inherent under-fitting +limitation due to adopting linear mapping. The main stream redefines the CIL +problem into a Concatenated Recursive Least Squares (C-RLS) task, allowing an +equivalence between the CIL and its joint-learning counterpart. The +compensation stream is governed by a Dual-Activation Compensation (DAC) module. +This module re-activates the embedding with a different activation function +from the main stream one, and seeks fitting compensation by projecting the +embedding to the null space of the main stream's linear mapping. Empirical +results demonstrate that the DS-AL, despite being an exemplar-free technique, +delivers performance comparable with or better than that of replay-based +methods across various datasets, including CIFAR-100, ImageNet-100 and +ImageNet-Full. Additionally, the C-RLS' equivalent property allows the DS-AL to +execute CIL in a phase-invariant manner. This is evidenced by a +never-before-seen 500-phase CIL ImageNet task, which performs on a level +identical to a 5-phase one. Our codes are available at +https://github.com/ZHUANGHP/Analytic-continual-learning. + +
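+ A hedged sketch of the closed-form recursive least-squares idea behind the main stream: a ridge-regression classifier whose weights are updated batch by batch yet remain equivalent to refitting on all data seen so far (the regularisation value, class handling, and the compensation stream are simplified assumptions, not the paper's exact formulation):

import torch

class RecursiveLinearClassifier:
    """Ridge-regression classifier updated in closed form, one batch at a time."""
    def __init__(self, feat_dim: int, num_classes: int, reg: float = 1.0):
        self.weight = torch.zeros(feat_dim, num_classes)
        self.r = torch.eye(feat_dim) / reg          # running (X^T X + reg*I)^{-1}

    def update(self, feats: torch.Tensor, onehot: torch.Tensor) -> None:
        """feats: (B, D), onehot: (B, C). Equivalent to refitting on all data seen so far."""
        x, y = feats, onehot
        gain = self.r @ x.t() @ torch.linalg.inv(torch.eye(x.shape[0]) + x @ self.r @ x.t())
        self.weight = self.weight + gain @ (y - x @ self.weight)
        self.r = self.r - gain @ x @ self.r

    def predict(self, feats: torch.Tensor) -> torch.Tensor:
        return feats @ self.weight

clf = RecursiveLinearClassifier(feat_dim=64, num_classes=10)
for _ in range(3):                                   # three incremental "phases"
    feats = torch.randn(32, 64)
    labels = torch.nn.functional.one_hot(torch.randint(0, 10, (32,)), 10).float()
    clf.update(feats, labels)
print(clf.predict(torch.randn(5, 64)).shape)         # torch.Size([5, 10])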
+
+ comment: Accepted in AAAI 2024 +
+
+
+
+
+ + ☆ SeNM-VAE: Semi-Supervised Noise Modeling with Hierarchical Variational + Autoencoder + + +
+ The data bottleneck has emerged as a fundamental challenge in learning based +image restoration methods. Researchers have attempted to generate synthesized +training data using paired or unpaired samples to address this challenge. This +study proposes SeNM-VAE, a semi-supervised noise modeling method that leverages +both paired and unpaired datasets to generate realistic degraded data. Our +approach is based on modeling the conditional distribution of degraded and +clean images with a specially designed graphical model. Under the variational +inference framework, we develop an objective function for handling both paired +and unpaired data. We employ our method to generate paired training samples for +real-world image denoising and super-resolution tasks. Our approach excels in +the quality of synthetic degraded images compared to other unpaired and paired +noise modeling methods. Furthermore, our approach demonstrates remarkable +performance in downstream image restoration tasks, even with limited paired +data. With more paired data, our method achieves the best performance on the +SIDD dataset. + +
+
+
+
+
+ + ☆ Sharing the Cost of Success: A Game for Evaluating and Learning + Collaborative Multi-Agent Instruction Giving and Following Policies LREC + + +
+ In collaborative goal-oriented settings, the participants are not only +interested in achieving a successful outcome, but do also implicitly negotiate +the effort they put into the interaction (by adapting to each other). In this +work, we propose a challenging interactive reference game that requires two +players to coordinate on vision and language observations. The learning signal +in this game is a score (given after playing) that takes into account the +achieved goal and the players' assumed efforts during the interaction. We show +that a standard Proximal Policy Optimization (PPO) setup achieves a high +success rate when bootstrapped with heuristic partner behaviors that implement +insights from the analysis of human-human interactions. And we find that a +pairing of neural partners indeed reduces the measured joint effort when +playing together repeatedly. However, we observe that in comparison to a +reasonable heuristic pairing there is still room for improvement -- which +invites further research in the direction of cost-sharing in collaborative +interactions. + +
+
+ comment: 9 pages, Accepted at LREC-COLING 2024 +
+
+
+
+
+ + ☆ Dr.Hair: Reconstructing Scalp-Connected Hair Strands without + Pre-training via Differentiable Rendering of Line Segments CVPR 2024 + + +
+ In the film and gaming industries, achieving a realistic hair appearance +typically involves the use of strands originating from the scalp. However, +reconstructing these strands from observed surface images of hair presents +significant challenges. The difficulty in acquiring Ground Truth (GT) data has +led state-of-the-art learning-based methods to rely on pre-training with +manually prepared synthetic CG data. This process is not only labor-intensive +and costly but also introduces complications due to the domain gap when +compared to real-world data. In this study, we propose an optimization-based +approach that eliminates the need for pre-training. Our method represents hair +strands as line segments growing from the scalp and optimizes them using a +novel differentiable rendering algorithm. To robustly optimize a substantial +number of slender explicit geometries, we introduce 3D orientation estimation +utilizing global optimization, strand initialization based on Laplace's +equation, and reparameterization that leverages geometric connectivity and +spatial proximity. Unlike existing optimization-based methods, our method is +capable of reconstructing internal hair flow in an absolute direction. Our +method exhibits robust and accurate inverse rendering, surpassing the quality +of existing methods and significantly improving processing speed. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ DiffGaze: A Diffusion Model for Continuous Gaze Sequence Generation on + 360° Images + + +
+ We present DiffGaze, a novel method for generating realistic and diverse continuous human gaze sequences on 360° images based on a conditional score-based denoising diffusion model. Generating human gaze on 360° images is important for various human-computer interaction and computer graphics applications, e.g. for creating large-scale eye tracking datasets or for realistic animation of virtual humans. However, existing methods are limited to predicting discrete fixation sequences or aggregated saliency maps, thereby neglecting crucial parts of natural gaze behaviour. Our method uses features extracted from 360° images as condition and uses two transformers to model the temporal and spatial dependencies of continuous human gaze. We evaluate DiffGaze on two 360° image benchmarks for gaze sequence generation as well as scanpath prediction and saliency prediction. Our evaluations show that DiffGaze outperforms state-of-the-art methods on all tasks on both benchmarks. We also report a 21-participant user study showing that our method generates gaze sequences that are indistinguishable from real human sequences.
+
+
+
+
+ + ☆ LaRE^2: Latent Reconstruction Error Based Method for Diffusion-Generated + Image Detection CVPR 2024 + + +
+ The evolution of Diffusion Models has dramatically improved image generation +quality, making it increasingly difficult to differentiate between real and +generated images. This development, while impressive, also raises significant +privacy and security concerns. In response to this, we propose a novel Latent +REconstruction error guided feature REfinement method (LaRE^2) for detecting +the diffusion-generated images. We come up with the Latent Reconstruction Error +(LaRE), the first reconstruction-error based feature in the latent space for +generated image detection. LaRE surpasses existing methods in terms of feature +extraction efficiency while preserving crucial cues required to differentiate +between the real and the fake. To exploit LaRE, we propose an Error-Guided +feature REfinement module (EGRE), which can refine the image feature guided by +LaRE to enhance the discriminativeness of the feature. Our EGRE utilizes an +align-then-refine mechanism, which effectively refines the image feature for +generated-image detection from both spatial and channel perspectives. Extensive +experiments on the large-scale GenImage benchmark demonstrate the superiority +of our LaRE^2, which surpasses the best SoTA method by up to 11.9%/12.1% +average ACC/AP across 8 different image generators. LaRE also surpasses +existing methods in terms of feature extraction cost, delivering an impressive +speed enhancement of 8 times. + +
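+ A heavily simplified sketch of the latent-reconstruction-error recipe suggested above: encode the image into a latent space, add noise, let a denoiser predict that noise, and use the prediction error as a detection feature. The toy convolutions below merely stand in for a pretrained VAE and diffusion denoiser; they are illustrative assumptions, not the paper's models:

import torch
import torch.nn as nn

# Stand-ins for a pretrained VAE encoder and a one-step denoiser; in practice these
# would come from a pretrained latent diffusion model.
vae_encode = nn.Conv2d(3, 4, kernel_size=8, stride=8)      # image -> latent
denoiser = nn.Conv2d(4, 4, kernel_size=3, padding=1)       # predicts the added noise

def latent_reconstruction_error(image: torch.Tensor, noise_level: float = 0.5) -> torch.Tensor:
    """Noise the latent, ask the denoiser to predict the noise, and score the mismatch.

    The intuition from the abstract: images produced by a diffusion model tend to be
    easier to reconstruct in its own latent space, so their error is lower."""
    with torch.no_grad():
        latent = vae_encode(image)
        noise = torch.randn_like(latent)
        noisy = latent + noise_level * noise
        predicted_noise = denoiser(noisy)
        # per-image mean squared error between true and predicted noise
        return ((predicted_noise - noise) ** 2).flatten(1).mean(dim=1)

scores = latent_reconstruction_error(torch.rand(2, 3, 64, 64))
print(scores.shape)   # torch.Size([2]); lower scores -> more likely diffusion-generated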
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Building Bridges across Spatial and Temporal Resolutions: + Reference-Based Super-Resolution via Change Priors and Conditional Diffusion + Model CVPR2024 + + +
+ Reference-based super-resolution (RefSR) has the potential to build bridges +across spatial and temporal resolutions of remote sensing images. However, +existing RefSR methods are limited by the faithfulness of content +reconstruction and the effectiveness of texture transfer in large scaling +factors. Conditional diffusion models have opened up new opportunities for +generating realistic high-resolution images, but effectively utilizing +reference images within these models remains an area for further exploration. +Furthermore, content fidelity is difficult to guarantee in areas without +relevant reference information. To solve these issues, we propose a +change-aware diffusion model named Ref-Diff for RefSR, using the land cover +change priors to guide the denoising process explicitly. Specifically, we +inject the priors into the denoising model to improve the utilization of +reference information in unchanged areas and regulate the reconstruction of +semantically relevant content in changed areas. With this powerful guidance, we +decouple the semantics-guided denoising and reference texture-guided denoising +processes to improve the model performance. Extensive experiments demonstrate +the superior effectiveness and robustness of the proposed method compared with +state-of-the-art RefSR methods in both quantitative and qualitative +evaluations. The code and data are available at +https://github.com/dongrunmin/RefDiff. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Chain of Compression: A Systematic Approach to Combinationally Compress + Convolutional Neural Networks + + +
+ Convolutional neural networks (CNNs) have achieved significant popularity, but their computational and memory intensity poses challenges for resource-constrained computing systems, particularly with the prerequisite of real-time performance. To relieve this burden, model compression has become an important research focus. Many approaches, such as quantization, pruning, early exit, and knowledge distillation, have demonstrated their effectiveness in reducing redundancy in neural networks. Upon closer examination, it becomes apparent that each approach capitalizes on its unique features to compress the neural network, and they can also exhibit complementary behavior when combined. To explore the interactions and reap the benefits from the complementary features, we propose the Chain of Compression, which works on the combinational sequence in which these common techniques are applied to compress the neural network. Validated on image-based regression and classification networks across different datasets, our proposed Chain of Compression can reduce the computation cost by 100-1000 times with negligible accuracy loss compared with the baseline model.
+
+ comment: 10 pages, 15 figures +
+
+
+
+
+ + ☆ Integrating Mamba Sequence Model and Hierarchical Upsampling Network for + Accurate Semantic Segmentation of Multiple Sclerosis Legion + + +
+ Integrating components from convolutional neural networks and state space models in medical image segmentation presents a compelling approach to enhance accuracy and efficiency. We introduce Mamba HUNet, a novel architecture tailored for robust and efficient segmentation tasks. Leveraging strengths from Mamba UNet and a lighter version of the Hierarchical Upsampling Network (HUNet), Mamba HUNet combines convolutional neural networks' local feature extraction power with state space models' long-range dependency modeling capabilities. We first convert HUNet into a lighter version, maintaining performance parity, and then integrate this lighter HUNet into Mamba HUNet, further enhancing its efficiency. The architecture partitions input grayscale images into patches, transforming them into 1D sequences for processing efficiency akin to Vision Transformers and Mamba models. Through Visual State Space blocks and patch merging layers, hierarchical features are extracted while preserving spatial information. Experimental results on publicly available Magnetic Resonance Imaging scans, notably in Multiple Sclerosis lesion segmentation, demonstrate Mamba HUNet's effectiveness across diverse segmentation tasks. The model's robustness and flexibility underscore its potential in handling complex anatomical structures. These findings establish Mamba HUNet as a promising solution for advancing medical image segmentation, with implications for improving clinical decision-making processes.
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Test-time Adaptation Meets Image Enhancement: Improving Accuracy via + Uncertainty-aware Logit Switching IJCNN2024 + + +
+ Deep neural networks have achieved remarkable success in a variety of computer vision applications. However, their accuracy degrades when the data distribution shifts between training and testing. As a solution to this problem, Test-time Adaptation (TTA) has been well studied because of its practicality. Although TTA methods increase accuracy under distribution shift by updating the model at test time, using high-uncertainty predictions is known to degrade accuracy. Since the input image is the root of the distribution shift, we incorporate a new perspective, enhancing the input image, into TTA methods to reduce prediction uncertainty. We hypothesize that enhancing the input image reduces prediction uncertainty and increases the accuracy of TTA methods. On the basis of our hypothesis, we propose a novel method: Test-time Enhancer and Classifier Adaptation (TECA). In TECA, the classification model is combined with an image enhancement model that transforms input images into recognition-friendly ones, and these models are updated by existing TTA methods. Furthermore, we found that the prediction from the enhanced image does not always have lower uncertainty than the prediction from the original image. Thus, we propose logit switching, which compares the uncertainty measures of these predictions and outputs the one with lower uncertainty. In our experiments, we evaluate TECA with various TTA methods and show that TECA reduces prediction uncertainty and increases the accuracy of TTA methods while introducing no hyperparameters and little parameter overhead.
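+ A minimal sketch of the logit-switching rule described above: compute predictions for both the original and the enhanced image, compare an uncertainty measure (predictive entropy here, as an assumption), and keep the less uncertain one; the enhancement model and the TTA update itself are omitted:

import torch
import torch.nn.functional as F

def entropy(logits: torch.Tensor) -> torch.Tensor:
    """Per-sample predictive entropy (higher = more uncertain)."""
    p = F.softmax(logits, dim=-1)
    return -(p * torch.log(p.clamp_min(1e-12))).sum(dim=-1)

def logit_switching(logits_original: torch.Tensor, logits_enhanced: torch.Tensor) -> torch.Tensor:
    """For each sample, output the logits whose prediction is less uncertain."""
    use_enhanced = entropy(logits_enhanced) < entropy(logits_original)   # (B,)
    return torch.where(use_enhanced.unsqueeze(-1), logits_enhanced, logits_original)

# toy batch of 4 samples, 10 classes
out = logit_switching(torch.randn(4, 10), torch.randn(4, 10))
print(out.shape)   # torch.Size([4, 10])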
+
+ comment: Accepted to IJCNN2024 +
+
+
+
+
+ + ☆ InterHandGen: Two-Hand Interaction Generation via Cascaded Reverse + Diffusion CVPR 2024 + + +
+ We present InterHandGen, a novel framework that learns the generative prior of two-hand interaction. Sampling from our model yields plausible and diverse two-hand shapes in close interaction with or without an object. Our prior can be incorporated into any optimization or learning method to reduce ambiguity in an ill-posed setup. Our key observation is that directly modeling the joint distribution of multiple instances imposes high learning complexity due to its combinatorial nature. Thus, we propose to decompose the modeling of the joint distribution into the modeling of factored unconditional and conditional single-instance distributions. In particular, we introduce a diffusion model that learns the single-hand distribution, both unconditionally and conditioned on the other hand, via conditioning dropout. For sampling, we combine anti-penetration and classifier-free guidance to enable plausible generation. Furthermore, we establish a rigorous evaluation protocol for two-hand synthesis, where our method significantly outperforms baseline generative models in terms of plausibility and diversity. We also demonstrate that our diffusion prior can boost the performance of two-hand reconstruction from monocular in-the-wild images, achieving new state-of-the-art accuracy.
+
+ comment: Accepted to CVPR 2024, project page: + https://jyunlee.github.io/projects/interhandgen/ +
+
+
+
+
+ + ☆ Learning to Visually Localize Sound Sources from Mixtures without Prior + Source Knowledge CVPR 2024 + + +
+ The goal of the multi-sound source localization task is to localize sound +sources from the mixture individually. While recent multi-sound source +localization methods have shown improved performance, they face challenges due +to their reliance on prior information about the number of objects to be +separated. In this paper, to overcome this limitation, we present a novel +multi-sound source localization method that can perform localization without +prior knowledge of the number of sound sources. To achieve this goal, we +propose an iterative object identification (IOI) module, which can recognize +sound-making objects in an iterative manner. After finding the regions of +sound-making objects, we devise object similarity-aware clustering (OSC) loss +to guide the IOI module to effectively combine regions of the same object but +also distinguish between different objects and backgrounds. It enables our +method to perform accurate localization of sound-making objects without any +prior knowledge. Extensive experimental results on the MUSIC and VGGSound +benchmarks show the significant performance improvements of the proposed method +over the existing methods for both single and multi-source. Our code is +available at: https://github.com/VisualAIKHU/NoPrior_MultiSSL + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Neural Clustering based Visual Representation Learning CVPR 2024 + + +
+ We investigate a fundamental aspect of machine vision: the measurement of +features, by revisiting clustering, one of the most classic approaches in +machine learning and data analysis. Existing visual feature extractors, +including ConvNets, ViTs, and MLPs, represent an image as rectangular regions. +Though prevalent, such a grid-style paradigm is built upon engineering practice +and lacks explicit modeling of data distribution. In this work, we propose +feature extraction with clustering (FEC), a conceptually elegant yet +surprisingly ad-hoc interpretable neural clustering framework, which views +feature extraction as a process of selecting representatives from data and thus +automatically captures the underlying data distribution. Given an image, FEC +alternates between grouping pixels into individual clusters to abstract +representatives and updating the deep features of pixels with current +representatives. Such an iterative working mechanism is implemented in the form +of several neural layers and the final representatives can be used for +downstream tasks. The cluster assignments across layers, which can be viewed +and inspected by humans, make the forward process of FEC fully transparent and +empower it with promising ad-hoc interpretability. Extensive experiments on +various visual recognition models and tasks verify the effectiveness, +generality, and interpretability of FEC. We expect this work will provoke a +rethink of the current de facto grid-style paradigm. + +
+
+ comment: CVPR 2024. Code: https://github.com/guikunchen/FEC/ +
+
+
+
+
+ + ☆ SSF3D: Strict Semi-Supervised 3D Object Detection with Switching Filter + + +
+ SSF3D modifies the semi-supervised 3D object detection (SS3DOD) framework, which is designed specifically for point cloud data. Leveraging the non-coincidence and weak correlation of target objects in point clouds, we adopt a strategy of retaining only the truth-determining pseudo labels and trimming away the other, fuzzy labels together with their points, instead of pursuing a balance between the quantity and quality of pseudo labels. Besides, we notice that changing the filter exposes the model to differently distributed targets, which helps break the training bottleneck. Two mechanisms are introduced to achieve the above ideas: a strict threshold and filter switching. Experiments are conducted to analyze the effectiveness of these approaches and their impact on the overall performance of the system. Evaluated on the KITTI dataset, SSF3D exhibits superior performance compared to current state-of-the-art methods. The code will be released here.
+
+
+
+
+ + ☆ Decoupled Pseudo-labeling for Semi-Supervised Monocular 3D Object + Detection CVPR2024 + + +
+ We delve into pseudo-labeling for semi-supervised monocular 3D object detection (SSM3OD) and discover two primary issues: a misalignment between the prediction quality of 3D and 2D attributes and the tendency of depth supervision derived from pseudo-labels to be noisy, leading to significant optimization conflicts with other reliable forms of supervision. We introduce a novel decoupled pseudo-labeling (DPL) approach for SSM3OD. Our approach features a Decoupled Pseudo-label Generation (DPG) module, designed to efficiently generate pseudo-labels by separately processing 2D and 3D attributes. This module incorporates a unique homography-based method for identifying dependable pseudo-labels in BEV space, specifically for 3D attributes. Additionally, we present a Depth Gradient Projection (DGP) module to mitigate optimization conflicts caused by noisy depth supervision of pseudo-labels, effectively decoupling the depth gradient and removing conflicting gradients. This dual decoupling strategy, at both the pseudo-label generation and gradient levels, significantly improves the utilization of pseudo-labels in SSM3OD. Our comprehensive experiments on the KITTI benchmark demonstrate the superiority of our method over existing approaches.
+
+ comment: To appear in CVPR2024 +
+
+
+
+
+ + ☆ Self-Rectifying Diffusion Sampling with Perturbed-Attention Guidance + + +
+ Recent studies have demonstrated that diffusion models are capable of +generating high-quality samples, but their quality heavily depends on sampling +guidance techniques, such as classifier guidance (CG) and classifier-free +guidance (CFG). These techniques are often not applicable in unconditional +generation or in various downstream tasks such as image restoration. In this +paper, we propose a novel sampling guidance, called Perturbed-Attention +Guidance (PAG), which improves diffusion sample quality across both +unconditional and conditional settings, achieving this without requiring +additional training or the integration of external modules. PAG is designed to +progressively enhance the structure of samples throughout the denoising +process. It involves generating intermediate samples with degraded structure by +substituting selected self-attention maps in diffusion U-Net with an identity +matrix, by considering the self-attention mechanisms' ability to capture +structural information, and guiding the denoising process away from these +degraded samples. In both ADM and Stable Diffusion, PAG surprisingly improves +sample quality in conditional and even unconditional scenarios. Moreover, PAG +significantly improves the baseline performance in various downstream tasks +where existing guidances such as CG or CFG cannot be fully utilized, including +ControlNet with empty prompts and image restoration such as inpainting and +deblurring. + +
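+ A hedged sketch of the two ingredients described above: replacing a self-attention map with the identity matrix (so each token attends only to itself) and steering the denoising direction away from the resulting degraded prediction. The toy tensors below stand in for the diffusion U-Net's noise predictions; the guidance scale and shapes are illustrative assumptions:

import torch
import torch.nn.functional as F

def self_attention(q, k, v, perturbed: bool = False):
    """Scaled dot-product self-attention; if perturbed, the attention map is
    replaced by the identity matrix, so every token only sees itself."""
    if perturbed:
        return v                                   # identity attention map: output = V
    scores = q @ k.transpose(-2, -1) / q.shape[-1] ** 0.5
    return F.softmax(scores, dim=-1) @ v

def pag_guided_noise(eps_normal: torch.Tensor, eps_perturbed: torch.Tensor, scale: float = 3.0):
    """Steer the denoising direction away from the structurally degraded prediction."""
    return eps_normal + scale * (eps_normal - eps_perturbed)

# toy token sequence; in a real model both predictions come from full U-Net passes,
# one with normal attention and one with the perturbed attention layers
q = k = v = torch.randn(1, 16, 64)
eps_normal = self_attention(q, k, v)
eps_perturbed = self_attention(q, k, v, perturbed=True)
guided = pag_guided_noise(eps_normal, eps_perturbed)
print(guided.shape)   # torch.Size([1, 16, 64])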
+
+ comment: Project page is available at + https://ku-cvlab.github.io/Perturbed-Attention-Guidance +
+
+
+
+
+ + ☆ AIDE: An Automatic Data Engine for Object Detection in Autonomous + Driving CVPR-2024 + + +
+ Autonomous vehicle (AV) systems rely on robust perception models as a +cornerstone of safety assurance. However, objects encountered on the road +exhibit a long-tailed distribution, with rare or unseen categories posing +challenges to a deployed perception model. This necessitates an expensive +process of continuously curating and annotating data with significant human +effort. We propose to leverage recent advances in vision-language and large +language models to design an Automatic Data Engine (AIDE) that automatically +identifies issues, efficiently curates data, improves the model through +auto-labeling, and verifies the model through generation of diverse scenarios. +This process operates iteratively, allowing for continuous self-improvement of +the model. We further establish a benchmark for open-world detection on AV +datasets to comprehensively evaluate various learning paradigms, demonstrating +our method's superior performance at a reduced cost. + +
+
+ comment: Accepted by CVPR-2024 +
+
+
+
+
+ + ☆ CoDA: Instructive Chain-of-Domain Adaptation with Severity-Aware Visual + Prompt Tuning + + +
+ Unsupervised Domain Adaptation (UDA) aims to adapt models from labeled source +domains to unlabeled target domains. When adapting to adverse scenes, existing +UDA methods fail to perform well due to the lack of instructions, leading their +models to overlook discrepancies within all adverse scenes. To tackle this, we +propose CoDA which instructs models to distinguish, focus, and learn from these +discrepancies at scene and image levels. Specifically, CoDA consists of a +Chain-of-Domain (CoD) strategy and a Severity-Aware Visual Prompt Tuning +(SAVPT) mechanism. CoD focuses on scene-level instructions to divide all +adverse scenes into easy and hard scenes, guiding models to adapt from source +to easy domains with easy scene images, and then to hard domains with hard +scene images, thereby laying a solid foundation for whole adaptations. Building +upon this foundation, we employ SAVPT to dive into more detailed image-level +instructions to boost performance. SAVPT features a novel metric Severity that +divides all adverse scene images into low-severity and high-severity images. +Then Severity directs visual prompts and adapters, instructing models to +concentrate on unified severity features instead of scene-specific features, +without adding complexity to the model architecture. CoDA achieves SOTA +performances on widely-used benchmarks under all adverse scenes. Notably, CoDA +outperforms the existing ones by 4.6%, and 10.3% mIoU on the Foggy Driving, and +Foggy Zurich benchmarks, respectively. Our code is available at +https://github.com/Cuzyoung/CoDA + +
+
+
+
+
+ + ☆ Activity-Biometrics: Person Identification from Daily Activities CVPR 2024 + + +
+ In this work, we study a novel problem which focuses on person identification +while performing daily activities. Learning biometric features from RGB videos +is challenging due to spatio-temporal complexity and presence of appearance +biases such as clothing color and background. We propose ABNet, a novel +framework which leverages disentanglement of biometric and non-biometric +features to perform effective person identification from daily activities. +ABNet relies on a bias-less teacher to learn biometric features from RGB videos +and explicitly disentangle non-biometric features with the help of biometric +distortion. In addition, ABNet also exploits activity prior for biometrics +which is enabled by joint biometric and activity learning. We perform +comprehensive evaluation of the proposed approach across five different +datasets which are derived from existing activity recognition benchmarks. +Furthermore, we extensively compare ABNet with existing works in person +identification and demonstrate its effectiveness for activity-based biometrics +across all five datasets. The code and dataset can be accessed at: +\url{https://github.com/sacrcv/Activity-Biometrics/} + +
+
+ comment: CVPR 2024 Main conference +
+
+
+
+
+ + ☆ TRAM: Global Trajectory and Motion of 3D Humans from in-the-wild Videos + + +
+ We propose TRAM, a two-stage method to reconstruct a human's global +trajectory and motion from in-the-wild videos. TRAM robustifies SLAM to recover +the camera motion in the presence of dynamic humans and uses the scene +background to derive the motion scale. Using the recovered camera as a +metric-scale reference frame, we introduce a video transformer model (VIMO) to +regress the kinematic body motion of a human. By composing the two motions, we +achieve accurate recovery of 3D humans in the world space, reducing global +motion errors by 60% from prior work. https://yufu-wang.github.io/tram4d/ + +
+
+ comment: The project website: https://yufu-wang.github.io/tram4d/ +
+
+
+
+
+ + ☆ Language Models are Free Boosters for Biomedical Imaging Tasks + + +
+ In this study, we uncover the unexpected efficacy of residual-based large +language models (LLMs) as part of encoders for biomedical imaging tasks, a +domain traditionally devoid of language or textual data. The approach diverges +from established methodologies by utilizing a frozen transformer block, +extracted from pre-trained LLMs, as an innovative encoder layer for the direct +processing of visual tokens. This strategy represents a significant departure +from the standard multi-modal vision-language frameworks, which typically hinge +on language-driven prompts and inputs. We found that these LLMs could boost +performance across a spectrum of biomedical imaging applications, including +both 2D and 3D visual classification tasks, serving as plug-and-play boosters. +More interestingly, as a byproduct, we found that the proposed framework +achieved superior performance, setting new state-of-the-art results on +extensive, standardized datasets in MedMNIST-2D and 3D. Through this work, we +aim to open new avenues for employing LLMs in biomedical imaging and enriching +the understanding of their potential in this specialized domain. + +
+
+
+
+
+ + ☆ The Solution for the ICCV 2023 1st Scientific Figure Captioning + Challenge + + +
+ In this paper, we propose a solution for improving the quality of captions +generated for figures in papers. We adopt the approach of summarizing the +textual content in the paper to generate image captions. Throughout our study, +we encounter discrepancies in the OCR information provided in the official +dataset. To rectify this, we employ the PaddleOCR toolkit to extract OCR +information from all images. Moreover, we observe that certain textual content +in the official paper pertains to images that are not relevant for captioning, +thereby introducing noise during caption generation. To mitigate this issue, we +leverage LLaMA to extract image-specific information by querying the textual +content based on image mentions, effectively filtering out extraneous +information. Additionally, we recognize a discrepancy between the primary use +of maximum likelihood estimation during text generation and the evaluation +metrics such as ROUGE employed to assess the quality of generated captions. To +bridge this gap, we integrate the BRIO model framework, enabling a more +coherent alignment between the generation and evaluation processes. Our +approach ranked first in the final test with a score of 4.49. + +
+
+
+
+
+ + ☆ OVER-NAV: Elevating Iterative Vision-and-Language Navigation with + Open-Vocabulary Detection and StructurEd Representation CVPR 2024 + + +
+ Recent advances in Iterative Vision-and-Language Navigation (IVLN) introduce +a more meaningful and practical paradigm of VLN by maintaining the agent's +memory across tours of scenes. Although the long-term memory aligns better with +the persistent nature of the VLN task, it poses more challenges on how to +utilize the highly unstructured navigation memory with extremely sparse +supervision. Towards this end, we propose OVER-NAV, which aims to go over and +beyond the current arts of IVLN techniques. In particular, we propose to +incorporate LLMs and open-vocabulary detectors to distill key information and +establish correspondence between multi-modal signals. Such a mechanism +introduces reliable cross-modal supervision and enables on-the-fly +generalization to unseen scenes without the need of extra annotation and +re-training. To fully exploit the interpreted navigation data, we further +introduce a structured representation, coded Omnigraph, to effectively +integrate multi-modal information along the tour. Accompanied with a novel +omnigraph fusion mechanism, OVER-NAV is able to extract the most relevant +knowledge from omnigraph for a more accurate navigating action. In addition, +OVER-NAV seamlessly supports both discrete and continuous environments under a +unified framework. We demonstrate the superiority of OVER-NAV in extensive +experiments. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Labeling subtypes in a Parkinson's Cohort using Multifeatures in MRI - + Integrating Grey and White Matter Information + + +
+ Thresholding of networks has long posed a challenge in brain connectivity
+analysis. Weighted networks are typically binarized using threshold measures to
+facilitate network analysis. Previous studies on MRI-based brain networks have
+predominantly utilized density or sparsity-based thresholding techniques,
+optimized within specific ranges derived from network metrics such as path
+length, clustering coefficient, and small-world index. Thus, determination of a
+single threshold value for facilitating comparative analysis of networks
+remains elusive. To address this, our study introduces Mutual K-Nearest
+Neighbor (MKNN)-based thresholding for brain network analysis. Here, nearest
+neighbor selection is based on the highest correlation between features of
+brain regions. Construction of brain networks was accomplished by computing
+Pearson correlations between grey matter volume and white matter volume for
+each pair of brain regions. Structural MRI data from 180 Parkinson's patients
+and 70 controls from NIMHANS, India were analyzed. Subtypes within Parkinson's
+disease were identified based on grey and white matter volume atrophy using
+source-based morphometric decomposition. The loading coefficients were
+correlated with clinical features to discern their clinical relationship with
+the deciphered subtypes. Our data-mining approach revealed: Subtype A (N = 51,
+intermediate type), Subtype B (N = 57, mild-severe type with mild motor
+symptoms), and Subtype AB (N = 36, most-severe type with predominance in motor
+impairment). Subtype-specific weighted matrices were binarized using MKNN-based
+thresholding for brain network analysis. Permutation tests on network metrics
+of resulting bipartite graphs demonstrated significant group differences in
+betweenness centrality and participation coefficient. The identified hubs were
+specific to each subtype, with some hubs conserved across different subtypes.
+
+
+ comment: 31 pages, 10 figures, 3 tables +
+
+
+
+
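+
+ A small NumPy sketch of Mutual K-Nearest Neighbor thresholding as described
+above (illustrative only; the choice of k is an assumption, not a value from
+the paper): an edge between two regions is kept only if each region is among
+the other's k most correlated neighbors.
+
+```python
+import numpy as np
+
+def mknn_adjacency(corr: np.ndarray, k: int = 10) -> np.ndarray:
+    """Binarize a region-by-region correlation matrix with MKNN thresholding."""
+    n = corr.shape[0]
+    c = corr.astype(float)
+    np.fill_diagonal(c, -np.inf)                  # exclude self-connections
+    knn = np.argsort(-c, axis=1)[:, :k]           # k most correlated neighbors per region
+    is_knn = np.zeros((n, n), dtype=bool)
+    rows = np.repeat(np.arange(n), k)
+    is_knn[rows, knn.ravel()] = True
+    return (is_knn & is_knn.T).astype(int)        # keep mutual neighbors only
+```
+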
+ + ☆ Staircase Localization for Autonomous Exploration in Urban Environments + + +
+ A staircase localization method is proposed for robots to explore urban
+environments autonomously. The proposed method employs a modular design in the
+form of a cascade pipeline consisting of three modules: stair detection, line
+segment detection, and stair localization. The stair detection module utilizes
+an object detection algorithm based on deep learning to generate a region of
+interest (ROI). From the ROI, line segment features are extracted using a deep
+line segment detection algorithm. The extracted line segments are used to
+localize a staircase in terms of position, orientation, and stair direction.
+The stair detection and localization are performed only with a single RGB-D
+camera. Each component of the proposed pipeline does not need to be designed
+particularly for staircases, which makes it easy to maintain the whole pipeline
+and replace each component with state-of-the-art deep learning detection
+techniques. The results of real-world experiments show that the proposed method
+can perform accurate stair detection and localization during autonomous
+exploration for various structured and unstructured staircases, both upstairs
+and downstairs, with shadows, dirt, and occlusions by artificial and natural
+objects.
+
+
+ comment: 9 pages, 10 figures +
+
+
+
+
+ + ☆ Accuracy enhancement method for speech emotion recognition from + spectrogram using temporal frequency correlation and positional information + learning through knowledge transfer + + +
+ In this paper, we propose a method to improve the accuracy of speech emotion
+recognition (SER) by using a vision transformer (ViT) to attend to the
+correlation of frequency (y-axis) with time (x-axis) in a spectrogram and by
+transferring positional information between ViTs through knowledge transfer.
+The proposed method has the following original contributions: i) We use
+vertically segmented patches of the log-Mel spectrogram to analyze the
+correlation of frequencies over time. This type of patch allows us to correlate
+the most relevant frequencies for a particular emotion with the time they were
+uttered. ii) We propose the use of image coordinate encoding, an absolute
+positional encoding suitable for ViT. By normalizing the x, y coordinates of
+the image to -1 to 1 and concatenating them to the image, we can effectively
+provide valid absolute positional information for ViT. iii) Through feature map
+matching, the locality and location information of the teacher network is
+effectively transmitted to the student network. The teacher network is a ViT
+that contains the locality of a convolutional stem and absolute position
+information through image coordinate encoding, while the student network is a
+basic ViT structure that lacks positional encoding. In the feature map matching
+stage, we train with the mean absolute error (L1 loss) to minimize the
+difference between the feature maps of the two networks. To validate the
+proposed method, three emotion datasets (SAVEE, EmoDB, and CREMA-D) consisting
+of speech were converted into log-Mel spectrograms for comparison experiments.
+The experimental results show that the proposed method significantly
+outperforms the state-of-the-art methods in terms of weighted accuracy while
+requiring significantly fewer floating point operations (FLOPs). Overall, the
+proposed method offers a promising solution for SER by providing improved
+efficiency and performance.
+
+
+
+
+
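+
+ The image coordinate encoding in point ii) is straightforward to sketch in
+PyTorch (a minimal illustration, not the authors' implementation): normalized
+x and y coordinate channels in [-1, 1] are concatenated to the spectrogram
+before patch embedding.
+
+```python
+import torch
+
+def add_image_coordinate_encoding(spec: torch.Tensor) -> torch.Tensor:
+    """spec: (batch, channels, height, width) log-Mel spectrogram 'images'."""
+    b, _, h, w = spec.shape
+    ys = torch.linspace(-1.0, 1.0, h, device=spec.device)
+    xs = torch.linspace(-1.0, 1.0, w, device=spec.device)
+    yy, xx = torch.meshgrid(ys, xs, indexing="ij")      # each (h, w)
+    coords = torch.stack([xx, yy]).expand(b, 2, h, w)   # broadcast over the batch
+    return torch.cat([spec, coords], dim=1)             # (b, channels + 2, h, w)
+```
+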
+ + ☆ Physical 3D Adversarial Attacks against Monocular Depth Estimation in + Autonomous Driving CVPR 2024 + + +
+ Deep learning-based monocular depth estimation (MDE), extensively applied in +autonomous driving, is known to be vulnerable to adversarial attacks. Previous +physical attacks against MDE models rely on 2D adversarial patches, so they +only affect a small, localized region in the MDE map but fail under various +viewpoints. To address these limitations, we propose 3D Depth Fool +(3D$^2$Fool), the first 3D texture-based adversarial attack against MDE models. +3D$^2$Fool is specifically optimized to generate 3D adversarial textures +agnostic to model types of vehicles and to have improved robustness in bad +weather conditions, such as rain and fog. Experimental results validate the +superior performance of our 3D$^2$Fool across various scenarios, including +vehicles, MDE models, weather conditions, and viewpoints. Real-world +experiments with printed 3D textures on physical vehicle models further +demonstrate that our 3D$^2$Fool can cause an MDE error of over 10 meters. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Tracing and segmentation of molecular patterns in 3-dimensional + cryo-et/em density maps through algorithmic image processing and deep + learning-based techniques + + +
+ Understanding the structures of biological macromolecules is highly important
+as they are closely associated with cellular functionalities. Comprehending the
+precise organization of actin filaments is crucial because they form the
+dynamic cytoskeleton, which offers structural support to cells and connects the
+cell's interior with its surroundings. However, determining the precise
+organization of actin filaments is challenging due to the poor quality of
+cryo-electron tomography (cryo-ET) images, which suffer from low
+signal-to-noise ratios (SNR) and the presence of a missing wedge, as well as
+the diverse shape characteristics of actin filaments. To address these
+formidable challenges, the primary component of this dissertation focuses on
+developing sophisticated computational techniques for tracing actin filaments.
+In particular, three novel methodologies have been developed: i) BundleTrac,
+for tracing bundle-like actin filaments found in stereocilia, ii) Spaghetti
+Tracer, for tracing filaments that move individually with loosely cohesive
+movements, and iii) Struwwel Tracer, for tracing randomly oriented actin
+filaments in the actin network. The second component of the dissertation
+introduces a convolutional neural network (CNN) based segmentation model to
+determine the location of protein secondary structures, such as helices and
+beta-sheets, in medium-resolution (5-10 Angstrom) 3-dimensional cryo-electron
+microscopy (cryo-EM) images. This methodology later evolved into a tool named
+DeepSSETracer. The final component of the dissertation presents a novel
+algorithm, cylindrical fit measure, to estimate image structure match at helix
+regions in medium-resolution cryo-EM images. Overall, my dissertation has made
+significant contributions to addressing critical research challenges in
+structural biology by introducing various computational methods and tools.
+
+
+
+
+
+ + ☆ The Effects of Short Video-Sharing Services on Video Copy Detection + + +
+ The short video-sharing services that allow users to post 10-30 second videos
+(e.g., YouTube Shorts and TikTok) have attracted a lot of attention in recent
+years. However, conventional video copy detection (VCD) methods mainly focus on
+general video-sharing services (e.g., YouTube and Bilibili), and the effects of
+short video-sharing services on video copy detection are still unclear.
+Considering that illegally copied videos in short video-sharing services have
+service-distinctive characteristics, especially in their time lengths, the pros
+and cons of VCD in these services need to be analyzed. In this paper, we
+examine the effects of short video-sharing services on VCD by constructing a
+dataset that has short video-sharing service characteristics. Our novel dataset
+is automatically constructed from a publicly available dataset to have
+reference videos and fixed short-time-length query videos, and such automation
+procedures assure the reproducibility and data privacy preservation of this
+paper. From the experimental results focusing on segment-level and video-level
+situations, we observe three effects: "Segment-level VCD in short video-sharing
+services is more difficult than in general video-sharing services",
+"Video-level VCD in short video-sharing services is easier than in general
+video-sharing services", and "The video alignment component mainly suppresses
+the detection performance in short video-sharing services".
+
+
+
+
+
+ + ☆ Automated Report Generation for Lung Cytological Images Using a CNN + Vision Classifier and Multiple-Transformer Text Decoders: Preliminary Study + + +
+ Cytology plays a crucial role in lung cancer diagnosis. Pulmonary cytology +involves cell morphological characterization in the specimen and reporting the +corresponding findings, which are extremely burdensome tasks. In this study, we +propose a report-generation technique for lung cytology images. In total, 71 +benign and 135 malignant pulmonary cytology specimens were collected. Patch +images were extracted from the captured specimen images, and the findings were +assigned to each image as a dataset for report generation. The proposed method +consists of a vision model and a text decoder. In the former, a convolutional +neural network (CNN) is used to classify a given image as benign or malignant, +and the features related to the image are extracted from the intermediate +layer. Independent text decoders for benign and malignant cells are prepared +for text generation, and the text decoder switches according to the CNN +classification results. The text decoder is configured using a Transformer that +uses the features obtained from the CNN for report generation. Based on the +evaluation results, the sensitivity and specificity were 100% and 96.4%, +respectively, for automated benign and malignant case classification, and the +saliency map indicated characteristic benign and malignant areas. The grammar +and style of the generated texts were confirmed as correct and in better +agreement with gold standard compared to existing LLM-based image-captioning +methods and single-text-decoder ablation model. These results indicate that the +proposed method is useful for pulmonary cytology classification and reporting. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Leak and Learn: An Attacker's Cookbook to Train Using Leaked Data from + Federated Learning CVPR 2024 + + +
+ Federated learning is a decentralized learning paradigm introduced to +preserve privacy of client data. Despite this, prior work has shown that an +attacker at the server can still reconstruct the private training data using +only the client updates. These attacks are known as data reconstruction attacks +and fall into two major categories: gradient inversion (GI) and linear layer +leakage attacks (LLL). However, despite demonstrating the effectiveness of +these attacks in breaching privacy, prior work has not investigated the +usefulness of the reconstructed data for downstream tasks. In this work, we +explore data reconstruction attacks through the lens of training and improving +models with leaked data. We demonstrate the effectiveness of both GI and LLL +attacks in maliciously training models using the leaked data more accurately +than a benign federated learning strategy. Counter-intuitively, this bump in +training quality can occur despite limited reconstruction quality or a small +total number of leaked images. Finally, we show the limitations of these +attacks for downstream training, individually for GI attacks and for LLL +attacks. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Pseudo-MRI-Guided PET Image Reconstruction Method Based on a Diffusion + Probabilistic Model + + +
+ Anatomically guided PET reconstruction using MRI information has been shown
+to have the potential to improve PET image quality. However, these improvements
+are limited to PET scans with paired MRI information. In this work we employed
+a diffusion probabilistic model (DPM) to infer T1-weighted-MRI (deep-MRI)
+images from FDG-PET brain images. We then use the DPM-generated T1w-MRI to
+guide the PET reconstruction. The model was trained with brain FDG scans, and
+tested on datasets containing multiple levels of counts. Deep-MRI images
+appeared somewhat degraded compared to the acquired MRI images. Regarding PET
+image quality, volume-of-interest analysis in different brain regions showed
+that PET images reconstructed using either the acquired or the deep-MRI images
+improved image quality compared to OSEM. The same conclusions were found when
+analysing the decimated datasets. A subjective evaluation performed by two
+physicians confirmed that OSEM scored consistently worse than the MRI-guided
+PET images and no significant differences were observed between the MRI-guided
+PET images. This proof of concept shows that it is possible to infer DPM-based
+MRI imagery to guide the PET reconstruction, enabling the possibility of
+changing reconstruction parameters such as the strength of the prior on
+anatomically guided PET reconstruction in the absence of MRI.
+
+
+
+
+
+ + ☆ Integrative Graph-Transformer Framework for Histopathology Whole Slide + Image Representation and Classification + + +
+ In digital pathology, the multiple instance learning (MIL) strategy is widely +used in the weakly supervised histopathology whole slide image (WSI) +classification task where giga-pixel WSIs are only labeled at the slide level. +However, existing attention-based MIL approaches often overlook contextual +information and intrinsic spatial relationships between neighboring tissue +tiles, while graph-based MIL frameworks have limited power to recognize the +long-range dependencies. In this paper, we introduce the integrative +graph-transformer framework that simultaneously captures the context-aware +relational features and global WSI representations through a novel Graph +Transformer Integration (GTI) block. Specifically, each GTI block consists of a +Graph Convolutional Network (GCN) layer modeling neighboring relations at the +local instance level and an efficient global attention model capturing +comprehensive global information from extensive feature embeddings. Extensive +experiments on three publicly available WSI datasets: TCGA-NSCLC, TCGA-RCC and +BRIGHT, demonstrate the superiority of our approach over current +state-of-the-art MIL methods, achieving an improvement of 1.0% to 2.6% in +accuracy and 0.7%-1.6% in AUROC. + +
+
+
+
+
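+
+ As a rough PyTorch sketch of the kind of block described above (not the
+paper's implementation; standard multi-head attention stands in for the
+efficient global attention model, and the layer sizes are illustrative):
+
+```python
+import torch
+import torch.nn as nn
+
+class GraphTransformerIntegrationBlock(nn.Module):
+    """Local graph convolution over neighboring tiles + global self-attention."""
+
+    def __init__(self, dim: int = 512, heads: int = 8):
+        super().__init__()
+        self.gcn = nn.Linear(dim, dim)
+        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+
+    def forward(self, x: torch.Tensor, adj: torch.Tensor) -> torch.Tensor:
+        # x: (batch, num_tiles, dim); adj: (batch, num_tiles, num_tiles),
+        # row-normalized adjacency between spatially neighboring tiles.
+        local = torch.relu(adj @ self.gcn(x))   # context from neighboring tiles
+        x = self.norm1(x + local)
+        global_ctx, _ = self.attn(x, x, x)      # global mixing across all tiles
+        return self.norm2(x + global_ctx)
+```
+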
+ + ☆ Recommendation of data-free class-incremental learning algorithms by + simulating future data + + +
+ Class-incremental learning deals with sequential data streams composed of
+batches of classes. Various algorithms have been proposed to address the
+challenging case where samples from past classes cannot be stored. However,
+selecting an appropriate algorithm for a user-defined setting is an open
+problem, as the relative performance of these algorithms depends on the
+incremental settings. To solve this problem, we introduce an algorithm
+recommendation method that simulates the future data stream. Given an initial
+set of classes, it leverages generative models to simulate future classes from
+the same visual domain. We evaluate recent algorithms on the simulated stream
+and recommend the one which performs best in the user-defined incremental
+setting. We illustrate the effectiveness of our method on three large datasets
+using six algorithms and six incremental settings. Our method outperforms
+competitive baselines, and performance is close to that of an oracle choosing
+the best algorithm in each setting. This work contributes to facilitating the
+practical deployment of incremental learning.
+
+
+
+
+
+ + ☆ EgoLifter: Open-world 3D Segmentation for Egocentric Perception + + +
+ In this paper we present EgoLifter, a novel system that can automatically +segment scenes captured from egocentric sensors into a complete decomposition +of individual 3D objects. The system is specifically designed for egocentric +data where scenes contain hundreds of objects captured from natural +(non-scanning) motion. EgoLifter adopts 3D Gaussians as the underlying +representation of 3D scenes and objects and uses segmentation masks from the +Segment Anything Model (SAM) as weak supervision to learn flexible and +promptable definitions of object instances free of any specific object +taxonomy. To handle the challenge of dynamic objects in ego-centric videos, we +design a transient prediction module that learns to filter out dynamic objects +in the 3D reconstruction. The result is a fully automatic pipeline that is able +to reconstruct 3D object instances as collections of 3D Gaussians that +collectively compose the entire scene. We created a new benchmark on the Aria +Digital Twin dataset that quantitatively demonstrates its state-of-the-art +performance in open-world 3D segmentation from natural egocentric input. We run +EgoLifter on various egocentric activity datasets which shows the promise of +the method for 3D egocentric perception at scale. + +
+
+ comment: Preprint. Project page: https://egolifter.github.io/ +
+
+
+
+
+ + ☆ TDIP: Tunable Deep Image Processing, a Real Time Melt Pool Monitoring + Solution + + +
+ In the era of Industry 4.0, Additive Manufacturing (AM), particularly metal
+AM, has emerged as a significant contributor due to its innovative and
+cost-effective approach to fabricate highly intricate geometries. Despite its
+potential, this industry still lacks real-time capable process monitoring
+algorithms. Recent advancements in this field suggest that Melt Pool (MP)
+signatures during the fabrication process contain crucial information about
+process dynamics and quality. To obtain this information, various sensory
+approaches, such as high-speed camera-based vision modules, are employed for
+online fabrication monitoring. However, many conventional in-depth analyses
+still cannot process all the recorded data simultaneously. Although
+conventional Image Processing (ImP) solutions provide a targeted tunable
+approach, they pose a trade-off between convergence certainty and convergence
+speed. As a result, conventional methods are not suitable for a dynamically
+changing application like MP monitoring. Therefore, this article proposes the
+implementation of a Tunable Deep Image Processing (TDIP) method to address the
+data-rich monitoring needs in real-time. The proposed model is first trained to
+replicate an ImP algorithm with tunable features and methodology. The TDIP
+model is then further improved to account for MP geometries and fabrication
+quality based on the vision input and process parameters. The TDIP model
+achieved over 94% estimation accuracy, with an R2 score of more than 96%, for
+quality, geometry, and MP signature estimation and isolation. The TDIP model
+can process 500 images per second, while conventional methods take a few
+minutes per image. This significant processing time reduction enables the
+integration of vision-based monitoring in real-time for processes and quality
+estimation.
+
+
+
+
+
+ + ☆ QuakeSet: A Dataset and Low-Resource Models to Monitor Earthquakes + through Sentinel-1 SC + + +
+ Earthquake monitoring is necessary to promptly identify the affected areas, +the severity of the events, and, finally, to estimate damages and plan the +actions needed for the restoration process. The use of seismic stations to +monitor the strength and origin of earthquakes is limited when dealing with +remote areas (we cannot have global capillary coverage). Identification and +analysis of all affected areas is mandatory to support areas not monitored by +traditional stations. Using social media images in crisis management has proven +effective in various situations. However, they are still limited by the +possibility of using communication infrastructures in case of an earthquake and +by the presence of people in the area. Moreover, social media images and +messages cannot be used to estimate the actual severity of earthquakes and +their characteristics effectively. The employment of satellites to monitor +changes around the globe grants the possibility of exploiting instrumentation +that is not limited by the visible spectrum, the presence of land +infrastructures, and people in the affected areas. In this work, we propose a +new dataset composed of images taken from Sentinel-1 and a new series of tasks +to help monitor earthquakes from a new detailed view. Coupled with the data, we +provide a series of traditional machine learning and deep learning models as +baselines to assess the effectiveness of ML-based models in earthquake +analysis. + +
+
+ comment: Accepted at ISCRAM 2024 +
+
+
+
+
+ + ☆ Segment Any Medical Model Extended SP + + +
+ The Segment Anything Model (SAM) has drawn significant attention from +researchers who work on medical image segmentation because of its +generalizability. However, researchers have found that SAM may have limited +performance on medical images compared to state-of-the-art non-foundation +models. Regardless, the community sees potential in extending, fine-tuning, +modifying, and evaluating SAM for analysis of medical imaging. An increasing +number of works have been published focusing on the mentioned four directions, +where variants of SAM are proposed. To this end, a unified platform helps push +the boundary of the foundation model for medical images, facilitating the use, +modification, and validation of SAM and its variants in medical image +segmentation. In this work, we introduce SAMM Extended (SAMME), a platform that +integrates new SAM variant models, adopts faster communication protocols, +accommodates new interactive modes, and allows for fine-tuning of subcomponents +of the models. These features can expand the potential of foundation models +like SAM, and the results can be translated to applications such as +image-guided therapy, mixed reality interaction, robotic navigation, and data +augmentation. + +
+
+ comment: The content of the manuscript has been presented in SPIE Medical + Imaging 2024, and had been accepted to appear in the proceedings of the + conference +
+
+
+
+
+ + ☆ Mathematical Foundation and Corrections for Full Range Head Pose + Estimation + + +
+ Numerous works concerning head pose estimation (HPE) offer algorithms or
+proposed neural network-based approaches for extracting Euler angles from
+either facial key points or directly from images of the head region. However,
+many works failed to provide clear definitions of the coordinate systems and
+Euler or Tait-Bryan angle orders in use. It is a well-known fact that rotation
+matrices depend on coordinate systems, and yaw, roll, and pitch angles are
+sensitive to their application order. Without precise definitions, it becomes
+challenging to validate the correctness of the output head pose and drawing
+routines employed in prior works. In this paper, we thoroughly examined the
+Euler angles defined in the 300W-LP dataset, head pose estimation methods such
+as 3DDFA-v2, 6D-RepNet, and WHENet, and the validity of their Euler angle
+drawing routines. When necessary, we infer their coordinate system and sequence
+of yaw, roll, and pitch from the provided code. This paper presents (1) code
+and algorithms for inferring the coordinate system and Euler angle application
+order from provided source code, and for extracting precise rotation matrices
+and Euler angles, (2) code and algorithms for converting poses from one
+rotation system to another, (3) novel formulae for 2D augmentations of the
+rotation matrices, and (4) derivations and code for the correct drawing
+routines for rotation matrices and poses. This paper also addresses the
+feasibility of defining rotations with a right-handed coordinate system, as in
+Wikipedia and SciPy, which makes the Euler angle extraction much easier for
+full-range head pose research.
+
+
+
+
+
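+
+ The core pitfall the paper analyzes - that the same (yaw, pitch, roll) triple
+produces different rotation matrices under different axis orders - can be
+reproduced in a few lines with SciPy (the angles and orders below are arbitrary
+examples, not values from the paper):
+
+```python
+import numpy as np
+from scipy.spatial.transform import Rotation
+
+yaw, pitch, roll = 30.0, 10.0, -5.0   # degrees
+
+# Intrinsic Z-Y-X (yaw, then pitch, then roll) ...
+R_zyx = Rotation.from_euler("ZYX", [yaw, pitch, roll], degrees=True).as_matrix()
+# ... versus intrinsic X-Y-Z applied to the same angles.
+R_xyz = Rotation.from_euler("XYZ", [roll, pitch, yaw], degrees=True).as_matrix()
+
+print(np.allclose(R_zyx, R_xyz))   # False: the application order matters
+```
+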
+ + ☆ Tutorial on Diffusion Models for Imaging and Vision + + +
+ The astonishing growth of generative tools in recent years has empowered many +exciting applications in text-to-image generation and text-to-video generation. +The underlying principle behind these generative tools is the concept of +diffusion, a particular sampling mechanism that has overcome some shortcomings +that were deemed difficult in the previous approaches. The goal of this +tutorial is to discuss the essential ideas underlying the diffusion models. The +target audience of this tutorial includes undergraduate and graduate students +who are interested in doing research on diffusion models or applying these +models to solve other problems. + +
+
+
+
+
+ + ☆ Efficient Multi-Band Temporal Video Filter for Reducing Human-Robot + Interaction + + +
+ Although mobile robots have on-board sensors to perform navigation, their +efficiency in completing paths can be enhanced by planning to avoid human +interaction. Infrastructure cameras can capture human activity continuously for +the purpose of compiling activity analytics to choose efficient times and +routes. We describe a cascade temporal filtering method to efficiently extract +short- and long-term activity in two time dimensions, isochronal and +chronological, for use in global path planning and local navigation +respectively. The temporal filter has application either independently, or, if +object recognition is also required, it can be used as a pre-filter to perform +activity-gating of the more computationally expensive neural network +processing. For a testbed 32-camera network, we show how this hybrid approach +can achieve over 8 times improvement in frames per second throughput and 6.5 +times reduction of system power use. We also show how the cost map of static +objects in the ROS robot software development framework is augmented with +dynamic regions determined from the temporal filter. + +
+
+ comment: 15 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ A Personalized Video-Based Hand Taxonomy: Application for Individuals + with Spinal Cord Injury + + +
+ Hand function is critical for our interactions and quality of life. Spinal
+cord injuries (SCI) can impair hand function, reducing independence. A
+comprehensive evaluation of function in home and community settings requires a
+hand grasp taxonomy for individuals with impaired hand function. Developing
+such a taxonomy is challenging due to unrepresented grasp types in standard
+taxonomies, uneven data distribution across injury levels, and limited data.
+This study aims to automatically identify the dominant distinct hand grasps in
+egocentric video using semantic clustering. Egocentric video recordings
+collected in the homes of 19 individuals with cervical SCI were used to cluster
+grasping actions with semantic significance. A deep learning model integrating
+posture and appearance data was employed to create a personalized hand
+taxonomy. Quantitative analysis reveals a cluster purity of 67.6% ± 24.2% with
+18.0% ± 21.8% redundancy. Qualitative assessment revealed meaningful clusters
+in video content. This methodology provides a flexible and effective strategy
+to analyze hand function in the wild. It offers researchers and clinicians an
+efficient tool for evaluating hand function, aiding sensitive assessments and
+tailored intervention plans.
+
+
+
+
+
+ + ☆ OCAI: Improving Optical Flow Estimation by Occlusion and Consistency + Aware Interpolation CVPR 2024 + + +
+ The scarcity of ground-truth labels poses one major challenge in developing +optical flow estimation models that are both generalizable and robust. While +current methods rely on data augmentation, they have yet to fully exploit the +rich information available in labeled video sequences. We propose OCAI, a +method that supports robust frame interpolation by generating intermediate +video frames alongside optical flows in between. Utilizing a forward warping +approach, OCAI employs occlusion awareness to resolve ambiguities in pixel +values and fills in missing values by leveraging the forward-backward +consistency of optical flows. Additionally, we introduce a teacher-student +style semi-supervised learning method on top of the interpolated frames. Using +a pair of unlabeled frames and the teacher model's predicted optical flow, we +generate interpolated frames and flows to train a student model. The teacher's +weights are maintained using Exponential Moving Averaging of the student. Our +evaluations demonstrate perceptually superior interpolation quality and +enhanced optical flow accuracy on established benchmarks such as Sintel and +KITTI. + +
+
+ comment: CVPR 2024 +
+
+
+
+
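+
+ The teacher-student weight maintenance mentioned above follows the usual
+exponential-moving-average pattern; a minimal PyTorch sketch (the decay value
+is illustrative, not taken from the paper):
+
+```python
+import torch
+
+@torch.no_grad()
+def ema_update(teacher: torch.nn.Module, student: torch.nn.Module,
+               decay: float = 0.999) -> None:
+    """Update teacher parameters as an exponential moving average of the student."""
+    for t_param, s_param in zip(teacher.parameters(), student.parameters()):
+        t_param.mul_(decay).add_(s_param, alpha=1.0 - decay)
+```
+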
+ + ☆ EgoPoseFormer: A Simple Baseline for Egocentric 3D Human Pose Estimation + + +
+ We present EgoPoseFormer, a simple yet effective transformer-based model for +stereo egocentric human pose estimation. The main challenge in egocentric pose +estimation is overcoming joint invisibility, which is caused by self-occlusion +or a limited field of view (FOV) of head-mounted cameras. Our approach +overcomes this challenge by incorporating a two-stage pose estimation paradigm: +in the first stage, our model leverages the global information to estimate each +joint's coarse location, then in the second stage, it employs a DETR style +transformer to refine the coarse locations by exploiting fine-grained stereo +visual features. In addition, we present a deformable stereo operation to +enable our transformer to effectively process multi-view features, which +enables it to accurately localize each joint in the 3D world. We evaluate our +method on the stereo UnrealEgo dataset and show it significantly outperforms +previous approaches while being computationally efficient: it improves MPJPE by +27.4mm (45% improvement) with only 7.9% model parameters and 13.1% FLOPs +compared to the state-of-the-art. Surprisingly, with proper training +techniques, we find that even our first-stage pose proposal network can achieve +superior performance compared to previous arts. We also show that our method +can be seamlessly extended to monocular settings, which achieves +state-of-the-art performance on the SceneEgo dataset, improving MPJPE by 25.5mm +(21% improvement) compared to the best existing method with only 60.7% model +parameters and 36.4% FLOPs. + +
+
+ comment: Tech Report +
+
+
+
+
+ + ☆ Every Shot Counts: Using Exemplars for Repetition Counting in Videos + + +
+ Video repetition counting infers the number of repetitions of recurring
+actions or motion within a video. We propose an exemplar-based approach that
+discovers visual correspondence of video exemplars across repetitions within
+target videos. Our proposed Every Shot Counts (ESCounts) model is an
+attention-based encoder-decoder that encodes videos of varying lengths
+alongside exemplars from the same and different videos. In training, ESCounts
+regresses locations of high correspondence to the exemplars within the video.
+In tandem, our method learns a latent that encodes representations of general
+repetitive motions, which we use for exemplar-free, zero-shot inference.
+Extensive experiments over commonly used datasets (RepCount, Countix, and
+UCFRep) showcase ESCounts obtaining state-of-the-art performance across all
+three datasets. On RepCount, ESCounts increases the off-by-one accuracy from
+0.39 to 0.56 and decreases the mean absolute error from 0.38 to 0.21. Detailed
+ablations further demonstrate the effectiveness of our method.
+
+
+ comment: Project website: https://sinhasaptarshi.github.io/escounts +
+
+
+
+
+ + ☆ State of the art applications of deep learning within tracking and + detecting marine debris: A survey + + +
+ Deep learning techniques have been explored within the marine litter problem
+for approximately 20 years, but the majority of the research has developed
+rapidly in the last five years. We provide an in-depth, up-to-date summary and
+analysis of 28 of the most recent and significant contributions of deep
+learning in marine debris. Cross-referencing the results of these papers shows
+that the YOLO family significantly outperforms all other object detection
+methods, but many contributions in this field agree that a comprehensive
+database of underwater debris is not currently available for machine learning.
+Using a small dataset curated and labelled by us, we tested YOLOv5 on a binary
+classification task and found the accuracy was low and the rate of false
+positives was high, highlighting the importance of a comprehensive database. We
+conclude this survey with over 40 future research recommendations and open
+challenges.
+
+
+ comment: Review paper, 60 pages including references, 1 figure, 3 tables, 1 + supplementary data +
+
+
+
+
+ + ☆ Spectral Convolutional Transformer: Harmonizing Real vs. Complex + Multi-View Spectral Operators for Vision Transformer + + +
+ Transformers used in vision have been investigated through diverse
+architectures - ViT, PVT, and Swin. These have worked to improve the attention
+mechanism and make it more efficient. Separately, the need to include local
+information led to incorporating convolutions in transformers such as CPVT and
+CvT. Global information is captured using a complex Fourier basis to achieve
+global token mixing through various methods, such as AFNO, GFNet, and
+Spectformer. We advocate combining three diverse views of data - local, global,
+and long-range dependence. We also investigate the simplest global
+representation using only the real domain spectral representation - obtained
+through the Hartley transform. We use a convolutional operator in the initial
+layers to capture local information. Through these two contributions, we are
+able to optimize and obtain a spectral convolution transformer (SCT) that
+provides improved performance over the state-of-the-art methods while reducing
+the number of parameters. Through extensive experiments, we show that
+SCT-C-small gives state-of-the-art performance on the ImageNet dataset and
+reaches 84.5% top-1 accuracy, while SCT-C-Large reaches 85.9% and SCT-C-Huge
+reaches 86.4%. We evaluate SCT on transfer learning on datasets such as
+CIFAR-10, CIFAR-100, Oxford Flower, and Stanford Car. We also evaluate SCT on
+downstream tasks, i.e., instance segmentation on the MSCOCO dataset. The
+project page is available at https://github.com/badripatro/sct.
+
+
+
+
+
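+
+ The real-domain spectral representation mentioned above can be illustrated
+with a few lines of PyTorch: the discrete Hartley transform is the real part
+minus the imaginary part of the FFT, so global token mixing stays entirely
+real-valued. This sketches the transform itself, not the full SCT layer.
+
+```python
+import torch
+
+def hartley_mix(tokens: torch.Tensor) -> torch.Tensor:
+    """Global, real-valued token mixing via the discrete Hartley transform.
+    tokens: (batch, num_tokens, dim); the transform is taken over tokens."""
+    f = torch.fft.fft(tokens, dim=1)
+    return f.real - f.imag          # H(x) = Re(FFT(x)) - Im(FFT(x))
+```
+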
+ + ☆ Global Point Cloud Registration Network for Large Transformations + + +
+ Three-dimensional data registration is an established yet challenging problem
+that is key in many different applications, such as mapping the environment for
+autonomous vehicles and modeling objects and people for avatar creation, among
+many others. Registration refers to the process of mapping multiple data into
+the same coordinate system by means of matching correspondences and
+transformation estimation. Novel proposals exploit the benefits of deep
+learning architectures for this purpose, as they learn the best features for
+the data, providing better matches and hence results. However, the state of the
+art is usually focused on cases of relatively small transformations, although
+in certain applications and in a real and practical environment, large
+transformations are very common. In this paper, we present ReLaTo (Registration
+for Large Transformations), an architecture that addresses the cases where
+large transformations occur while maintaining good performance for local
+transformations. This proposal uses a novel Softmax pooling layer to find
+correspondences in a bilateral consensus manner between two point sets,
+sampling the most confident matches. These matches are used to estimate a
+coarse and global registration using weighted Singular Value Decomposition
+(SVD). A target-guided denoising step is then applied to both the obtained
+matches and latent features, estimating the final fine registration considering
+the local geometry. All these steps are carried out following an end-to-end
+approach, which has been shown to outperform 10 state-of-the-art registration
+methods on two datasets commonly used for this task (ModelNet40 and KITTI),
+especially in the case of large transformations.
+
+
+
+
+
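+
+ The coarse alignment step above is a weighted version of the classic
+SVD-based (Kabsch) rigid registration; a compact NumPy sketch of that standard
+computation (not the authors' code):
+
+```python
+import numpy as np
+
+def weighted_svd_registration(src: np.ndarray, dst: np.ndarray, w: np.ndarray):
+    """Rigid (R, t) aligning weighted correspondences src[i] -> dst[i].
+    src, dst: (N, 3) matched points; w: (N,) non-negative match confidences."""
+    w = w / w.sum()
+    mu_src = (w[:, None] * src).sum(0)
+    mu_dst = (w[:, None] * dst).sum(0)
+    H = (src - mu_src).T @ (w[:, None] * (dst - mu_dst))   # weighted cross-covariance
+    U, _, Vt = np.linalg.svd(H)
+    d = np.sign(np.linalg.det(Vt.T @ U.T))                 # guard against reflections
+    R = Vt.T @ np.diag([1.0, 1.0, d]) @ U.T
+    t = mu_dst - R @ mu_src
+    return R, t
+```
+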
+ + ☆ TGGLinesPlus: A robust topological graph-guided computer vision + algorithm for line detection from images + + +
+ Line detection is a classic and essential problem in image processing, +computer vision and machine intelligence. Line detection has many important +applications, including image vectorization (e.g., document recognition and art +design), indoor mapping, and important societal challenges (e.g., sea ice +fracture line extraction from satellite imagery). Many line detection +algorithms and methods have been developed, but robust and intuitive methods +are still lacking. In this paper, we proposed and implemented a topological +graph-guided algorithm, named TGGLinesPlus, for line detection. Our experiments +on images from a wide range of domains have demonstrated the flexibility of our +TGGLinesPlus algorithm. We also benchmarked our algorithm with five classic and +state-of-the-art line detection methods and the results demonstrate the +robustness of TGGLinesPlus. We hope our open-source implementation of +TGGLinesPlus will inspire and pave the way for many applications where spatial +science matters. + +
+
+ comment: Our TGGLinesPlus Python implementation is open source. 27 pages, 8 + figures and 4 tables +
+
+
+
+
+ + ☆ Move as You Say, Interact as You Can: Language-guided Human Motion + Generation with Scene Affordance CVPR 2024 + + +
+ Despite significant advancements in text-to-motion synthesis, generating +language-guided human motion within 3D environments poses substantial +challenges. These challenges stem primarily from (i) the absence of powerful +generative models capable of jointly modeling natural language, 3D scenes, and +human motion, and (ii) the generative models' intensive data requirements +contrasted with the scarcity of comprehensive, high-quality, +language-scene-motion datasets. To tackle these issues, we introduce a novel +two-stage framework that employs scene affordance as an intermediate +representation, effectively linking 3D scene grounding and conditional motion +generation. Our framework comprises an Affordance Diffusion Model (ADM) for +predicting explicit affordance map and an Affordance-to-Motion Diffusion Model +(AMDM) for generating plausible human motions. By leveraging scene affordance +maps, our method overcomes the difficulty in generating human motion under +multimodal condition signals, especially when training with limited data +lacking extensive language-scene-motion pairs. Our extensive experiments +demonstrate that our approach consistently outperforms all baselines on +established benchmarks, including HumanML3D and HUMANISE. Additionally, we +validate our model's exceptional generalization capabilities on a specially +curated evaluation set featuring previously unseen descriptions and scenes. + +
+
+ comment: CVPR 2024; 16 pages +
+
+
+
+
+ + ☆ Bidirectional Consistency Models + + +
+ Diffusion models (DMs) are capable of generating remarkably high-quality +samples by iteratively denoising a random vector, a process that corresponds to +moving along the probability flow ordinary differential equation (PF ODE). +Interestingly, DMs can also invert an input image to noise by moving backward +along the PF ODE, a key operation for downstream tasks such as interpolation +and image editing. However, the iterative nature of this process restricts its +speed, hindering its broader application. Recently, Consistency Models (CMs) +have emerged to address this challenge by approximating the integral of the PF +ODE, thereby bypassing the need to iterate. Yet, the absence of an explicit ODE +solver complicates the inversion process. To resolve this, we introduce the +Bidirectional Consistency Model (BCM), which learns a single neural network +that enables both forward and backward traversal along the PF ODE, efficiently +unifying generation and inversion tasks within one framework. Notably, our +proposed method enables one-step generation and inversion while also allowing +the use of additional steps to enhance generation quality or reduce +reconstruction error. Furthermore, by leveraging our model's bidirectional +consistency, we introduce a sampling strategy that can enhance FID while +preserving the generated image content. We further showcase our model's +capabilities in several downstream tasks, such as interpolation and inpainting, +and present demonstrations of potential applications, including blind +restoration of compressed images and defending black-box adversarial attacks. + +
+
+ comment: 40 pages, 25 figures +
+
+
+
+
+ + ☆ SpectralWaste Dataset: Multimodal Data for Waste Sorting Automation + + +
+ The increase in non-biodegradable waste is a worldwide concern. Recycling +facilities play a crucial role, but their automation is hindered by the complex +characteristics of waste recycling lines like clutter or object deformation. In +addition, the lack of publicly available labeled data for these environments +makes developing robust perception systems challenging. Our work explores the +benefits of multimodal perception for object segmentation in real waste +management scenarios. First, we present SpectralWaste, the first dataset +collected from an operational plastic waste sorting facility that provides +synchronized hyperspectral and conventional RGB images. This dataset contains +labels for several categories of objects that commonly appear in sorting plants +and need to be detected and separated from the main trash flow for several +reasons, such as security in the management line or reuse. Additionally, we +propose a pipeline employing different object segmentation architectures and +evaluate the alternatives on our dataset, conducting an extensive analysis for +both multimodal and unimodal alternatives. Our evaluation pays special +attention to efficiency and suitability for real-time processing and +demonstrates how HSI can bring a boost to RGB-only perception in these +realistic industrial settings without much computational overhead. + +
+
+
+
+
+ + ☆ Predicting species occurrence patterns from partial observations ICLR 2024 + + +
+ To address the interlinked biodiversity and climate crises, we need an +understanding of where species occur and how these patterns are changing. +However, observational data on most species remains very limited, and the +amount of data available varies greatly between taxonomic groups. We introduce +the problem of predicting species occurrence patterns given (a) satellite +imagery, and (b) known information on the occurrence of other species. To +evaluate algorithms on this task, we introduce SatButterfly, a dataset of +satellite images, environmental data and observational data for butterflies, +which is designed to pair with the existing SatBird dataset of bird +observational data. To address this task, we propose a general model, R-Tran, +for predicting species occurrence patterns that enables the use of partial +observational data wherever found. We find that R-Tran outperforms other +methods in predicting species encounter rates with partial information both +within a taxon (birds) and across taxa (birds and butterflies). Our approach +opens new perspectives to leveraging insights from species with abundant data +to other species with scarce data, by modelling the ecosystems in which they +co-occur. + +
+
+ comment: Tackling Climate Change with Machine Learning workshop at ICLR 2024 +
+
+
+
+
+ + ☆ Text Is MASS: Modeling as Stochastic Embedding for Text-Video Retrieval CVPR 2024 + + +
+ The increasing prevalence of video clips has sparked growing interest in +text-video retrieval. Recent advances focus on establishing a joint embedding +space for text and video, relying on consistent embedding representations to +compute similarity. However, the text content in existing datasets is generally +short and concise, making it hard to fully describe the redundant semantics of +a video. Correspondingly, a single text embedding may be less expressive to +capture the video embedding and empower the retrieval. In this study, we +propose a new stochastic text modeling method T-MASS, i.e., text is modeled as +a stochastic embedding, to enrich text embedding with a flexible and resilient +semantic range, yielding a text mass. To be specific, we introduce a +similarity-aware radius module to adapt the scale of the text mass upon the +given text-video pairs. Plus, we design and develop a support text +regularization to further control the text mass during the training. The +inference pipeline is also tailored to fully exploit the text mass for accurate +retrieval. Empirical evidence suggests that T-MASS not only effectively +attracts relevant text-video pairs while distancing irrelevant ones, but also +enables the determination of precise text embeddings for relevant pairs. Our +experimental results show a substantial improvement of T-MASS over baseline (3% +to 6.3% by R@1). Also, T-MASS achieves state-of-the-art performance on five +benchmark datasets, including MSRVTT, LSMDC, DiDeMo, VATEX, and Charades. + +
+
+ comment: Accepted by CVPR 2024, code and model are available at + https://github.com/Jiamian-Wang/T-MASS-text-video-retrieval +
+
+
+
+
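+
+ The "text mass" idea above amounts to treating the text embedding as a
+distribution and sampling from it with the reparameterization trick; a minimal
+PyTorch sketch of that general idea (the radius would come from the
+similarity-aware module described in the abstract, which is not reproduced
+here):
+
+```python
+import torch
+
+def sample_text_mass(text_emb: torch.Tensor, radius: torch.Tensor) -> torch.Tensor:
+    """Draw one stochastic text embedding from the 'text mass'.
+    text_emb: (batch, dim); radius: (batch, 1) or (batch, dim) scale factors."""
+    eps = torch.randn_like(text_emb)
+    return text_emb + radius * eps      # reparameterized sample, keeps gradients
+```
+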
+ + ☆ GTA-HDR: A Large-Scale Synthetic Dataset for HDR Image Reconstruction + + +
+ High Dynamic Range (HDR) content (i.e., images and videos) has a broad range +of applications. However, capturing HDR content from real-world scenes is +expensive and time-consuming. Therefore, the challenging task of reconstructing +visually accurate HDR images from their Low Dynamic Range (LDR) counterparts is +gaining attention in the vision research community. A major challenge in this +research problem is the lack of datasets, which capture diverse scene +conditions (e.g., lighting, shadows, weather, locations, landscapes, objects, +humans, buildings) and various image features (e.g., color, contrast, +saturation, hue, luminance, brightness, radiance). To address this gap, in this +paper, we introduce GTA-HDR, a large-scale synthetic dataset of photo-realistic +HDR images sampled from the GTA-V video game. We perform thorough evaluation of +the proposed dataset, which demonstrates significant qualitative and +quantitative improvements of the state-of-the-art HDR image reconstruction +methods. Furthermore, we demonstrate the effectiveness of the proposed dataset +and its impact on additional computer vision tasks including 3D human pose +estimation, human body part segmentation, and holistic scene segmentation. The +dataset, data collection pipeline, and evaluation code are available at: +https://github.com/HrishavBakulBarua/GTA-HDR. + +
+
+ comment: Submitted to IEEE +
+
+
+
+
+ + ☆ Noise2Noise Denoising of CRISM Hyperspectral Data ICLR 2024 + + +
+ Hyperspectral data acquired by the Compact Reconnaissance Imaging +Spectrometer for Mars (CRISM) have allowed for unparalleled mapping of the +surface mineralogy of Mars. Due to sensor degradation over time, a significant +portion of the recently acquired data is considered unusable. Here a new +data-driven model architecture, Noise2Noise4Mars (N2N4M), is introduced to +remove noise from CRISM images. Our model is self-supervised and does not +require zero-noise target data, making it well suited for use in Planetary +Science applications where high quality labelled data is scarce. We demonstrate +its strong performance on synthetic-noise data and CRISM images, and its impact +on downstream classification performance, outperforming benchmark methods on +most metrics. This allows for detailed analysis for critical sites of interest +on the Martian surface, including proposed lander sites. + +
+
+ comment: 5 pages, 3 figures. Accepted as a conference paper at the ICLR 2024 + ML4RS Workshop +
+
+
+
+
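+
+ The self-supervised objective described above needs no clean targets: the
+network is trained to map one noisy realization of a spectrum to another. A
+minimal PyTorch training step illustrating that principle (not the N2N4M
+architecture itself):
+
+```python
+import torch
+import torch.nn.functional as F
+
+def noise2noise_step(model, optimizer, noisy_a, noisy_b):
+    """noisy_a, noisy_b: two independently-noisy views of the same signal."""
+    optimizer.zero_grad()
+    loss = F.mse_loss(model(noisy_a), noisy_b)   # predict one noisy view from the other
+    loss.backward()
+    optimizer.step()
+    return loss.item()
+```
+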
+ + ☆ Semi-Supervised Image Captioning Considering Wasserstein Graph Matching + + +
+ Image captioning can automatically generate captions for the given images,
+and the key challenge is to learn a mapping function from visual features to
+natural language features. Existing approaches are mostly supervised ones,
+i.e., each image has a corresponding sentence in the training set. However,
+considering that describing images always requires a huge amount of manpower,
+we usually have a limited amount of described images (i.e., image-text pairs)
+and a large number of undescribed images in real-world applications. This gives
+rise to the "Semi-Supervised Image Captioning" problem. To solve this problem,
+we propose a novel Semi-Supervised Image Captioning method considering
+Wasserstein Graph Matching (SSIC-WGM), which adopts the raw image inputs to
+supervise the generated sentences. Different from traditional single modal
+semi-supervised methods, the difficulty of semi-supervised cross-modal learning
+lies in constructing intermediately comparable information among heterogeneous
+modalities. In this paper, SSIC-WGM adopts the successful scene graphs as
+intermediate information, and constrains the generated sentences from two
+aspects: 1) inter-modal consistency. SSIC-WGM constructs the scene graphs of
+the raw image and the generated sentence respectively, then employs the
+Wasserstein distance to better measure the similarity between region embeddings
+of different graphs. 2) intra-modal consistency. SSIC-WGM applies data
+augmentation techniques to the raw images, then constrains the consistency
+between augmented images and generated sentences. Consequently, SSIC-WGM
+combines the cross-modal pseudo supervision and structure invariant measure for
+efficiently using the undescribed images, and learns a more reasonable mapping
+function.
+
+
+
+
+
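+
+ The Wasserstein comparison between scene-graph region embeddings can be
+approximated with entropy-regularized optimal transport (Sinkhorn iterations).
+The sketch below shows that standard computation in PyTorch; it is not the
+paper's implementation, and eps and the iteration count are illustrative.
+
+```python
+import torch
+
+def sinkhorn_wasserstein(x: torch.Tensor, y: torch.Tensor,
+                         eps: float = 0.1, iters: int = 50) -> torch.Tensor:
+    """Entropy-regularized Wasserstein distance between two embedding sets.
+    x: (n, d) region embeddings of one graph; y: (m, d) of the other."""
+    cost = torch.cdist(x, y)                                   # pairwise L2 costs
+    a = torch.full((x.size(0),), 1.0 / x.size(0), device=x.device)
+    b = torch.full((y.size(0),), 1.0 / y.size(0), device=y.device)
+    K = torch.exp(-cost / eps)
+    u = torch.ones_like(a)
+    for _ in range(iters):                                     # Sinkhorn scaling
+        u = a / (K @ (b / (K.t() @ u)))
+    v = b / (K.t() @ u)
+    plan = u[:, None] * K * v[None, :]                         # approximate transport plan
+    return (plan * cost).sum()
+```
+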
+ + ☆ Deep Learning for Segmentation of Cracks in High-Resolution Images of + Steel Bridges + + +
+ Automating the current bridge visual inspection practices using drones and
+image processing techniques is a prominent way to make these inspections more
+effective, robust, and less expensive. In this paper, we investigate the
+development of a novel deep-learning method for the detection of fatigue cracks
+in high-resolution images of steel bridges. First, we present a novel and
+challenging dataset comprising images of cracks in steel bridges. Secondly, we
+integrate the ConvNext neural network with a previous state-of-the-art
+encoder-decoder network for crack segmentation. We study and report the effects
+of using background patches on the network performance when applied to
+high-resolution images of cracks in steel bridges. Finally, we introduce a loss
+function that allows the use of more background patches for the training
+process, which yields a significant reduction in false positive rates.
+
+
+
+
+
+ + ☆ Solution for Point Tracking Task of ICCV 1st Perception Test Challenge + 2023 + + +
+ This report proposes an improved method for the Tracking Any Point (TAP) +task, which tracks any physical surface through a video. Several existing +approaches have explored the TAP by considering the temporal relationships to +obtain smooth point motion trajectories, however, they still suffer from the +cumulative error caused by temporal prediction. To address this issue, we +propose a simple yet effective approach called TAP with confident static points +(TAPIR+), which focuses on rectifying the tracking of the static point in the +videos shot by a static camera. To clarify, our approach contains two key +components: (1) Multi-granularity Camera Motion Detection, which could identify +the video sequence by the static camera shot. (2) CMR-based point trajectory +prediction with one moving object segmentation approach to isolate the static +point from the moving object. Our approach ranked first in the final test with +a score of 0.46. + +
+
+
+
+
+ + ☆ Predicting risk of cardiovascular disease using retinal OCT imaging + + +
+ We investigated the potential of optical coherence tomography (OCT) as an
+additional imaging technique to predict future cardiovascular disease (CVD). We
+utilised a self-supervised deep learning approach based on Variational
+Autoencoders (VAE) to learn low-dimensional representations of high-dimensional
+3D OCT images and to capture distinct characteristics of different retinal
+layers within the OCT image. A Random Forest (RF) classifier was subsequently
+trained using the learned latent features and participant demographic and
+clinical data, to differentiate between patients at risk of CVD events (MI or
+stroke) and non-CVD cases. Our predictive model, trained on multimodal data,
+was assessed based on its ability to correctly identify individuals likely to
+suffer from a CVD event (MI or stroke) within a 5-year interval after image
+acquisition. Our self-supervised VAE feature selection and multimodal Random
+Forest classifier differentiate between patients at risk of future CVD events
+and the control group with an AUC of 0.75, outperforming the clinically
+established QRISK3 score (AUC = 0.597). The choroidal layer visible in OCT
+images was identified as an important predictor of future CVD events using a
+novel approach to model explainability. Retinal OCT imaging provides a
+cost-effective and non-invasive alternative to predict the risk of
+cardiovascular disease and is readily accessible in optometry practices and
+hospitals.
+
+
+ comment: 18 pages for main manuscript, 7 figures, 2 pages for appendix and + preprint for a journal +
+
+
+
+
+ + ☆ Clinical Domain Knowledge-Derived Template Improves Post Hoc AI + Explanations in Pneumothorax Classification + + +
+ Background: Pneumothorax is an acute thoracic disease caused by abnormal air +collection between the lungs and chest wall. To address the opaqueness often +associated with deep learning (DL) models, explainable artificial intelligence +(XAI) methods have been introduced to outline regions related to pneumothorax +diagnoses made by DL models. However, these explanations sometimes diverge from +actual lesion areas, highlighting the need for further improvement. Method: We +propose a template-guided approach to incorporate the clinical knowledge of +pneumothorax into model explanations generated by XAI methods, thereby +enhancing the quality of these explanations. Utilizing one lesion delineation +created by radiologists, our approach first generates a template that +represents potential areas of pneumothorax occurrence. This template is then +superimposed on model explanations to filter out extraneous explanations that +fall outside the template's boundaries. To validate its efficacy, we carried +out a comparative analysis of three XAI methods with and without our template +guidance when explaining two DL models in two real-world datasets. Results: The +proposed approach consistently improved baseline XAI methods across twelve +benchmark scenarios built on three XAI methods, two DL models, and two +datasets. The average incremental percentages, calculated by the performance +improvements over the baseline performance, were 97.8% in Intersection over +Union (IoU) and 94.1% in Dice Similarity Coefficient (DSC) when comparing model +explanations and ground-truth lesion areas. Conclusions: In the context of +pneumothorax diagnoses, we proposed a template-guided approach for improving AI +explanations. We anticipate that our template guidance will forge a fresh +approach to elucidating AI models by integrating clinical domain expertise. + +
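+
+ The filtering step is simple enough to sketch: values of the explanation map
+ that fall outside the clinically derived template are zeroed out before IoU
+ and Dice are computed against the lesion mask. All arrays below are toy
+ stand-ins, not the paper's data:
+
+```python
+import numpy as np
+
+def template_filter(saliency, template):
+    """Zero out explanation values that fall outside the anatomical template."""
+    return saliency * (template > 0)
+
+def iou(a, b):
+    a, b = a.astype(bool), b.astype(bool)
+    union = np.logical_or(a, b).sum()
+    return np.logical_and(a, b).sum() / union if union else 0.0
+
+def dice(a, b):
+    a, b = a.astype(bool), b.astype(bool)
+    denom = a.sum() + b.sum()
+    return 2 * np.logical_and(a, b).sum() / denom if denom else 0.0
+
+rng = np.random.default_rng(0)
+saliency = rng.random((8, 8))                       # hypothetical XAI heatmap
+template = np.zeros((8, 8)); template[:, 4:] = 1    # plausible pneumothorax area
+lesion = np.zeros((8, 8)); lesion[2:5, 5:8] = 1     # ground-truth lesion mask
+
+raw = saliency > 0.5
+filtered = template_filter(saliency, template) > 0.5
+print("IoU :", iou(raw, lesion), "->", iou(filtered, lesion))
+print("Dice:", dice(raw, lesion), "->", dice(filtered, lesion))
+```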
+
+
+
+
+ + ☆ SugarcaneNet2024: An Optimized Weighted Average Ensemble Approach of + LASSO Regularized Pre-trained Models for Sugarcane Disease Classification + + +
+ Sugarcane, a key crop for the world's sugar industry, is prone to several
+diseases that have a substantial negative influence on both its yield and
+quality. To effectively manage and implement preventative initiatives, diseases
+must be detected promptly and accurately. In this study, we present a unique
+model called sugarcaneNet2024 that outperforms previous methods for
+automatically and quickly detecting sugarcane disease through leaf image
+processing. Our proposed model consolidates an optimized weighted average
+ensemble of seven customized and LASSO-regularized pre-trained models,
+particularly InceptionV3, InceptionResNetV2, DenseNet201, DenseNet169,
+Xception, and ResNet152V2. Initially, we added three more dense layers with
+0.0001 LASSO regularization, three 30% dropout layers, and three batch
+normalizations with renorm enabled at the bottom of these pre-trained models to
+improve the performance. The accuracy of sugarcane leaf disease classification
+was greatly increased by this addition. Following this, several comparative
+studies between the average ensemble and individual models were carried out,
+indicating that the ensemble technique performed better. The average ensemble
+of all modified pre-trained models produced outstanding outcomes: 100%, 99%,
+99%, and 99.45% for F1 score, precision, recall, and accuracy, respectively.
+Performance was further enhanced by the implementation of an optimized weighted
+average ensemble technique incorporated with grid search. This optimized
+sugarcaneNet2024 model performed the best for detecting sugarcane diseases,
+having achieved accuracy, precision, recall, and F1 score of 99.67%, 100%,
+100%, and 100%, respectively.
+
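+
+ A minimal sketch of a weighted-average ensemble with grid-searched weights,
+ assuming per-model class probabilities are already available; the step size,
+ model count, and toy labels are illustrative rather than the paper's setup:
+
+```python
+import itertools
+import numpy as np
+
+def grid_search_weights(prob_list, y_true, step=0.25):
+    """Search ensemble weights (summing to 1) that maximise accuracy."""
+    grid = np.arange(0.0, 1.0 + 1e-9, step)
+    best_w, best_acc = None, -1.0
+    for w in itertools.product(grid, repeat=len(prob_list)):
+        if not np.isclose(sum(w), 1.0):
+            continue                                  # only convex combinations
+        blended = sum(wi * p for wi, p in zip(w, prob_list))
+        acc = (blended.argmax(axis=1) == y_true).mean()
+        if acc > best_acc:
+            best_w, best_acc = w, acc
+    return best_w, best_acc
+
+rng = np.random.default_rng(0)
+y = rng.integers(0, 4, size=200)                      # 4 hypothetical disease classes
+probs = [rng.dirichlet(np.ones(4), size=200) for _ in range(3)]   # 3 stand-in models
+print(grid_search_weights(probs, y))
+```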
+
+ comment: 32 pages, 11 Figures, 13 Tables +
+
+
+
+
+ + ♻ ☆ DiVa-360: The Dynamic Visual Dataset for Immersive Neural Fields + + +
+ Advances in neural fields are enabling high-fidelity capture of the shape and +appearance of dynamic 3D scenes. However, their capabilities lag behind those +offered by conventional representations such as 2D videos because of +algorithmic challenges and the lack of large-scale multi-view real-world +datasets. We address the dataset limitation with DiVa-360, a real-world 360 +dynamic visual dataset that contains synchronized high-resolution and +long-duration multi-view video sequences of table-scale scenes captured using a +customized low-cost system with 53 cameras. It contains 21 object-centric +sequences categorized by different motion types, 25 intricate hand-object +interaction sequences, and 8 long-duration sequences for a total of 17.4 M +image frames. In addition, we provide foreground-background segmentation masks, +synchronized audio, and text descriptions. We benchmark the state-of-the-art +dynamic neural field methods on DiVa-360 and provide insights about existing +methods and future challenges on long-duration neural field capture. + +
+
+
+
+
+ + ♻ ☆ HoloVIC: Large-scale Dataset and Benchmark for Multi-Sensor Holographic + Intersection and Vehicle-Infrastructure Cooperative CVPR 2024 + + +
+ Vehicle-to-everything (V2X) has been a popular topic in the field of
+Autonomous Driving in recent years, and vehicle-infrastructure cooperation
+(VIC) has become one of the important research areas. The complexity of traffic
+conditions, such as blind spots and occlusion, greatly limits the perception
+capabilities of single-view roadside sensing systems. To further enhance the
+accuracy of roadside perception and provide better information to the vehicle
+side, in this paper, we constructed holographic intersections with various
+layouts to build a large-scale multi-sensor holographic vehicle-infrastructure
+cooperation dataset, called HoloVIC. Our dataset includes 3 different types of
+sensors (Camera, Lidar, Fisheye) and employs 4 sensor-layouts based on the
+different intersections. Each intersection is equipped with 6-18 sensors to
+capture synchronous data, while autonomous vehicles pass through these
+intersections to collect VIC data. HoloVIC contains in total 100k+ synchronous
+frames from different sensors. Additionally, we annotated 3D bounding boxes
+based on Camera, Fisheye, and Lidar. We also associate the IDs of the same
+objects across different devices and consecutive frames in sequence. Based on
+HoloVIC, we formulated four tasks to facilitate the development of related
+research. We also provide benchmarks for these tasks.
+
+
+ comment: Accept to CVPR 2024, Benchmark Website: https://holovic.net +
+
+
+
+
+ + ♻ ☆ TRIPS: Trilinear Point Splatting for Real-Time Radiance Field Rendering + + +
+ Point-based radiance field rendering has demonstrated impressive results for
+novel view synthesis, offering a compelling blend of rendering quality and
+computational efficiency. However, even the latest approaches in this domain
+are not without their shortcomings. 3D Gaussian Splatting [Kerbl and Kopanas et
+al. 2023] struggles when tasked with rendering highly detailed scenes, due to
+blurring and cloudy artifacts. On the other hand, ADOP [R\"uckert et al. 2022]
+can produce crisper images, but its neural reconstruction network decreases
+performance, grapples with temporal instability, and is unable to effectively
+address large gaps in the point cloud.
+ In this paper, we present TRIPS (Trilinear Point Splatting), an approach that
+combines ideas from both Gaussian Splatting and ADOP. The fundamental concept
+behind our novel technique involves rasterizing points into a screen-space
+image pyramid, with the selection of the pyramid layer determined by the
+projected point size. This approach allows rendering arbitrarily large points
+using a single trilinear write. A lightweight neural network is then used to
+reconstruct a hole-free image including detail beyond splat resolution.
+Importantly, our render pipeline is entirely differentiable, allowing for
+automatic optimization of both point sizes and positions.
+ Our evaluation demonstrates that TRIPS surpasses existing state-of-the-art
+methods in terms of rendering quality while maintaining a real-time frame rate
+of 60 frames per second on readily available hardware. This performance extends
+to challenging scenarios, such as scenes featuring intricate geometry,
+expansive landscapes, and auto-exposed footage.
+ The project page is located at: https://lfranke.github.io/trips/
+
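+
+ The layer-selection half of the trilinear write can be sketched as below,
+ under the assumption that pyramid layer l halves the resolution l times, so a
+ point of projected size s pixels maps to fractional layer log2(s); the actual
+ TRIPS rasterizer is a differentiable kernel that this sketch does not attempt
+ to reproduce:
+
+```python
+import numpy as np
+
+def pyramid_layer_weights(point_size_px, num_layers):
+    """Pick the two pyramid layers a splatted point writes to, plus blend weights.
+
+    The fractional part of log2(point size) becomes the linear blending weight
+    between the two neighbouring layers (the third axis of a "trilinear" write).
+    """
+    level = np.clip(np.log2(max(point_size_px, 1.0)), 0, num_layers - 1)
+    lo = int(np.floor(level))
+    hi = min(lo + 1, num_layers - 1)
+    w_hi = float(level - lo)
+    return (lo, 1.0 - w_hi), (hi, w_hi)
+
+for size in [0.7, 1.0, 3.0, 10.0, 64.0]:
+    print(size, pyramid_layer_weights(size, num_layers=5))
+```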
+
+
+
+
+ + ♻ ☆ Semi-Supervised Crowd Counting from Unlabeled Data + + +
+ Automatic crowd behavior analysis can effectively support daily
+transportation statistics and planning, which in turn supports smart city
+construction. As one of the most important keys, crowd counting has drawn
+increasing attention. Recent works achieved promising performance but relied on
+the supervised paradigm with expensive crowd annotations. To alleviate the
+annotation cost in real-world transportation scenarios, in this work we
+proposed a semi-supervised learning framework $S^{4}\textit{Crowd}$, which can
+leverage both unlabeled/labeled data for robust crowd counting. In the
+unsupervised pathway, two \textit{self-supervised losses} were proposed to
+simulate crowd variations such as scale and illumination, based on which pseudo
+labels carrying supervised information were generated and gradually refined. We
+also proposed a crowd-driven recurrent unit \textit{Gated-Crowd-Recurrent-Unit
+(GCRU)}, which can preserve discriminant crowd information by extracting
+second-order statistics, yielding pseudo labels with improved quality. A joint
+loss including both unsupervised/supervised information was proposed, and a
+dynamic weighting strategy was employed to balance the importance of the
+unsupervised loss and supervised loss at different training stages. We
+conducted extensive experiments on four popular crowd counting datasets in
+semi-supervised settings. Experimental results supported the effectiveness of
+each proposed component in our $S^{4}$Crowd framework. Our method achieved
+competitive performance among semi-supervised learning approaches on these
+crowd counting datasets.
+
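+
+ The abstract does not give the dynamic weighting schedule, so the sketch
+ below uses a common sigmoid-shaped ramp-up as a stand-in for how the
+ unsupervised term could be down-weighted while pseudo labels are still noisy:
+
+```python
+import numpy as np
+
+def joint_loss(sup_loss, unsup_loss, epoch, ramp_epochs=30, max_weight=1.0):
+    """Supervised loss plus a ramped-up unsupervised loss (stand-in schedule)."""
+    t = np.clip(epoch / ramp_epochs, 0.0, 1.0)
+    w = max_weight * np.exp(-5.0 * (1.0 - t) ** 2)   # ~0.007 at epoch 0, 1.0 after ramp
+    return sup_loss + w * unsup_loss, w
+
+for epoch in [0, 10, 30, 60]:
+    print(epoch, joint_loss(sup_loss=1.0, unsup_loss=0.5, epoch=epoch))
+```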
+
+
+
+
+ + ♻ ☆ Efficient Pre-training for Localized Instruction Generation of Videos + + +
+ Procedural videos show step-by-step demonstrations of tasks like recipe +preparation. Understanding such videos is challenging, involving the precise +localization of steps and the generation of textual instructions. Manually +annotating steps and writing instructions is costly, which limits the size of +current datasets and hinders effective learning. Leveraging large but noisy +video-transcript datasets for pre-training can boost performance, but demands +significant computational resources. Furthermore, transcripts contain +irrelevant content and exhibit style variation compared to instructions written +by human annotators. To mitigate both issues, we propose a technique, +Sieve-&-Swap, to automatically curate a smaller dataset: (i) Sieve filters +irrelevant transcripts and (ii) Swap enhances the quality of the text +instruction by automatically replacing the transcripts with human-written +instructions from a text-only recipe dataset. The curated dataset, three orders +of magnitude smaller than current web-scale datasets, enables efficient +training of large-scale models with competitive performance. We complement our +Sieve-\&-Swap approach with a Procedure Transformer (ProcX) for end-to-end step +localization and instruction generation for procedural videos. When this model +is pre-trained on our curated dataset, it achieves state-of-the-art performance +in zero-shot and finetuning settings on YouCook2 and Tasty, while using a +fraction of the computational resources. + +
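+
+ A minimal sketch of the curation idea, using a plain string-similarity score
+ as a stand-in for whatever matching model the real pipeline uses; the
+ transcript and recipe snippets are invented:
+
+```python
+from difflib import SequenceMatcher
+
+def similarity(a, b):
+    # stand-in for a learned text-similarity model
+    return SequenceMatcher(None, a.lower(), b.lower()).ratio()
+
+def sieve_and_swap(transcript_sents, recipe_steps, threshold=0.45):
+    """Sieve: drop transcript sentences unrelated to any recipe step.
+    Swap: replace each surviving sentence with the closest human-written step."""
+    curated = []
+    for sent in transcript_sents:
+        best_step, best_sim = max(((s, similarity(sent, s)) for s in recipe_steps),
+                                  key=lambda x: x[1])
+        if best_sim >= threshold:          # Sieve
+            curated.append(best_step)      # Swap
+    return curated
+
+transcript = ["so um now I just crack two eggs into the bowl",
+              "don't forget to like and subscribe",
+              "whisk it until it's smooth"]
+recipe = ["Crack two eggs into a bowl.", "Whisk until smooth.", "Bake for 20 minutes."]
+print(sieve_and_swap(transcript, recipe))
+```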
+
+ comment: This version has some missing experiments and elaborative technical + details +
+
+
+
+
+ + ♻ ☆ SimLVSeg: Simplifying Left Ventricular Segmentation in 2D+Time + Echocardiograms with Self- and Weakly-Supervised Learning + + +
+ Echocardiography has become an indispensable clinical imaging modality for
+general heart health assessment. From calculating biomarkers such as ejection
+fraction to the probability of a patient's heart failure, accurate segmentation
+of the heart structures allows doctors to assess the heart's condition and
+devise treatments with greater precision and accuracy. However, achieving
+accurate and reliable left ventricle segmentation is time-consuming and
+challenging for several reasons. Hence, clinicians often rely on
+segmenting the left ventricle (LV) in two specific echocardiogram frames to
+make a diagnosis. This limited coverage in manual LV segmentation poses a
+challenge for developing automatic LV segmentation with high temporal
+consistency, as the resulting dataset is typically annotated sparsely. In
+response to this challenge, this work introduces SimLVSeg, a novel paradigm
+that enables video-based networks for consistent LV segmentation from sparsely
+annotated echocardiogram videos. SimLVSeg consists of self-supervised
+pre-training with temporal masking, followed by weakly supervised learning
+tailored for LV segmentation from sparse annotations. We demonstrate how
+SimLVSeg outperforms the state-of-the-art solutions by achieving a 93.32%
+(95%CI 93.21-93.43%) dice score on the largest 2D+time echocardiography dataset
+(EchoNet-Dynamic) while being more efficient. SimLVSeg is compatible with two
+types of video segmentation networks: 2D super image and 3D segmentation. To
+show the effectiveness of our approach, we provide extensive ablation studies,
+including pre-training settings and various deep learning backbones. We further
+conduct an out-of-distribution test to showcase SimLVSeg's generalizability on
+an unseen distribution (CAMUS dataset). The code is publicly available at
+https://github.com/fadamsyah/SimLVSeg.
+
+
+
+
+
+ + ♻ ☆ HIMap: HybrId Representation Learning for End-to-end Vectorized HD Map + Construction CVPR 2024 + + +
+ Vectorized High-Definition (HD) map construction requires predictions of the +category and point coordinates of map elements (e.g. road boundary, lane +divider, pedestrian crossing, etc.). State-of-the-art methods are mainly based +on point-level representation learning for regressing accurate point +coordinates. However, this pipeline has limitations in obtaining element-level +information and handling element-level failures, e.g. erroneous element shape +or entanglement between elements. To tackle the above issues, we propose a +simple yet effective HybrId framework named HIMap to sufficiently learn and +interact both point-level and element-level information. Concretely, we +introduce a hybrid representation called HIQuery to represent all map elements, +and propose a point-element interactor to interactively extract and encode the +hybrid information of elements, e.g. point position and element shape, into the +HIQuery. Additionally, we present a point-element consistency constraint to +enhance the consistency between the point-level and element-level information. +Finally, the output point-element integrated HIQuery can be directly converted +into map elements' class, point coordinates, and mask. We conduct extensive +experiments and consistently outperform previous methods on both nuScenes and +Argoverse2 datasets. Notably, our method achieves $77.8$ mAP on the nuScenes +dataset, remarkably superior to previous SOTAs by $8.3$ mAP at least. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Exploiting Semantic Reconstruction to Mitigate Hallucinations in + Vision-Language Models + + +
+ Hallucinations in vision-language models pose a significant challenge to +their reliability, particularly in the generation of long captions. Current +methods fall short of accurately identifying and mitigating these +hallucinations. To address this issue, we introduce ESREAL, a novel +unsupervised learning framework designed to suppress the generation of +hallucinations through accurate localization and penalization of hallucinated +tokens. Initially, ESREAL creates a reconstructed image based on the generated +caption and aligns its corresponding regions with those of the original image. +This semantic reconstruction aids in identifying both the presence and type of +token-level hallucinations within the generated caption. Subsequently, ESREAL +computes token-level hallucination scores by assessing the semantic similarity +of aligned regions based on the type of hallucination. Finally, ESREAL employs +a proximal policy optimization algorithm, where it selectively penalizes +hallucinated tokens according to their token-level hallucination scores. Our +framework notably reduces hallucinations in LLaVA, InstructBLIP, and mPLUG-Owl2 +by 32.81%, 27.08%, and 7.46% on the CHAIR metric. This improvement is achieved +solely through signals derived from the image itself, without the need for any +image-text pairs. + +
+
+
+
+
+ + ♻ ☆ Pushing Auto-regressive Models for 3D Shape Generation at Capacity and + Scalability + + +
+ Auto-regressive models have achieved impressive results in 2D image +generation by modeling joint distributions in grid space. In this paper, we +extend auto-regressive models to 3D domains, and seek a stronger ability of 3D +shape generation by improving auto-regressive models at capacity and +scalability simultaneously. Firstly, we leverage an ensemble of publicly +available 3D datasets to facilitate the training of large-scale models. It +consists of a comprehensive collection of approximately 900,000 objects, with +multiple properties of meshes, points, voxels, rendered images, and text +captions. This diverse labeled dataset, termed Objaverse-Mix, empowers our +model to learn from a wide range of object variations. However, directly +applying 3D auto-regression encounters critical challenges of high +computational demands on volumetric grids and ambiguous auto-regressive order +along grid dimensions, resulting in inferior quality of 3D shapes. To this end, +we then present a novel framework Argus3D in terms of capacity. Concretely, our +approach introduces discrete representation learning based on a latent vector +instead of volumetric grids, which not only reduces computational costs but +also preserves essential geometric details by learning the joint distributions +in a more tractable order. The capacity of conditional generation can thus be +realized by simply concatenating various conditioning inputs to the latent +vector, such as point clouds, categories, images, and texts. In addition, +thanks to the simplicity of our model architecture, we naturally scale up our +approach to a larger model with an impressive 3.6 billion parameters, further +enhancing the quality of versatile 3D generation. Extensive experiments on four +generation tasks demonstrate that Argus3D can synthesize diverse and faithful +shapes across multiple categories, achieving remarkable performance. + +
+
+ comment: Project page: https://argus-3d.github.io/ . Datasets: + https://huggingface.co/datasets/BAAI/Objaverse-MIX. arXiv admin note: + substantial text overlap with arXiv:2303.14700 +
+
+
+
+
+ + ♻ ☆ ReMoS: 3D Motion-Conditioned Reaction Synthesis for Two-Person + Interactions + + +
+ Current approaches for 3D human motion synthesis generate high-quality +animations of digital humans performing a wide variety of actions and gestures. +However, a notable technological gap exists in addressing the complex dynamics +of multi-human interactions within this paradigm. In this work, we present +ReMoS, a denoising diffusion-based model that synthesizes full-body reactive +motion of a person in a two-person interaction scenario. Assuming the motion of +one person is given, we employ a combined spatio-temporal cross-attention +mechanism to synthesize the reactive body and hand motion of the second person, +thereby completing the interactions between the two. We demonstrate ReMoS +across challenging two-person scenarios such as pair-dancing, Ninjutsu, +kickboxing, and acrobatics, where one person's movements have complex and +diverse influences on the other. We also contribute the ReMoCap dataset for +two-person interactions containing full-body and finger motions. We evaluate +ReMoS through multiple quantitative metrics, qualitative visualizations, and a +user study, and also indicate usability in interactive motion editing +applications. + +
+
+ comment: 17 pages, 7 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ MedPromptX: Grounded Multimodal Prompting for Chest X-ray Diagnosis + + +
+ Chest X-ray images are commonly used for predicting acute and chronic +cardiopulmonary conditions, but efforts to integrate them with structured +clinical data face challenges due to incomplete electronic health records +(EHR). This paper introduces \textbf{MedPromptX}, the first model to integrate +multimodal large language models (MLLMs), few-shot prompting (FP) and visual +grounding (VG) to combine imagery with EHR data for chest X-ray diagnosis. A +pre-trained MLLM is utilized to complement the missing EHR information, +providing a comprehensive understanding of patients' medical history. +Additionally, FP reduces the necessity for extensive training of MLLMs while +effectively tackling the issue of hallucination. Nevertheless, the process of +determining the optimal number of few-shot examples and selecting high-quality +candidates can be burdensome, yet it profoundly influences model performance. +Hence, we propose a new technique that dynamically refines few-shot data for +real-time adjustment to new patient scenarios. Moreover, VG aids in focusing +the model's attention on relevant regions of interest in X-ray images, +enhancing the identification of abnormalities. We release MedPromptX-VQA, a new +in-context visual question answering dataset encompassing interleaved image and +EHR data derived from MIMIC-IV and MIMIC-CXR databases. Results demonstrate the +SOTA performance of MedPromptX, achieving an 11% improvement in F1-score +compared to the baselines. Code and data are available at +https://github.com/BioMedIA-MBZUAI/MedPromptX + +
+
+
+
+
+ + ♻ ☆ Text-Guided Variational Image Generation for Industrial Anomaly + Detection and Segmentation CVPR 2024 + + +
+ We propose a text-guided variational image generation method to address the +challenge of getting clean data for anomaly detection in industrial +manufacturing. Our method utilizes text information about the target object, +learned from extensive text library documents, to generate non-defective data +images resembling the input image. The proposed framework ensures that the +generated non-defective images align with anticipated distributions derived +from textual and image-based knowledge, ensuring stability and generality. +Experimental results demonstrate the effectiveness of our approach, surpassing +previous methods even with limited non-defective data. Our approach is +validated through generalization tests across four baseline models and three +distinct datasets. We present an additional analysis to enhance the +effectiveness of anomaly detection models by utilizing the generated images. + +
+
+ comment: 18 pages, Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Identity-aware Dual-constraint Network for Cloth-Changing Person + Re-identification + + +
+ Cloth-Changing Person Re-Identification (CC-ReID) aims to accurately identify +the target person in more realistic surveillance scenarios, where pedestrians +usually change their clothing. Despite great progress, limited cloth-changing +training samples in existing CC-ReID datasets still prevent the model from +adequately learning cloth-irrelevant features. In addition, due to the absence +of explicit supervision to keep the model constantly focused on +cloth-irrelevant areas, existing methods are still hampered by the disruption +of clothing variations. To solve the above issues, we propose an Identity-aware +Dual-constraint Network (IDNet) for the CC-ReID task. Specifically, to help the +model extract cloth-irrelevant clues, we propose a Clothes Diversity +Augmentation (CDA), which generates more realistic cloth-changing samples by +enriching the clothing color while preserving the texture. In addition, a +Multi-scale Constraint Block (MCB) is designed, which extracts fine-grained +identity-related features and effectively transfers cloth-irrelevant knowledge. +Moreover, a Counterfactual-guided Attention Module (CAM) is presented, which +learns cloth-irrelevant features from channel and space dimensions and utilizes +the counterfactual intervention for supervising the attention map to highlight +identity-related regions. Finally, a Semantic Alignment Constraint (SAC) is +designed to facilitate high-level semantic feature interaction. Comprehensive +experiments on four CC-ReID datasets indicate that our method outperforms prior +state-of-the-art approaches. + +
+
+
+
+
+ + ♻ ☆ Unveiling the Pitfalls of Knowledge Editing for Large Language Models ICLR 2024 + + +
+ As the cost associated with fine-tuning Large Language Models (LLMs)
+continues to rise, recent research efforts have pivoted towards developing
+methodologies to edit implicit knowledge embedded within LLMs. Yet, a dark
+cloud still lingers overhead -- will knowledge editing trigger a butterfly
+effect? It remains unclear whether knowledge editing might introduce side
+effects that pose potential risks. This paper pioneers
+the investigation into the potential pitfalls associated with knowledge editing
+for LLMs. To achieve this, we introduce new benchmark datasets and propose
+innovative evaluation metrics. Our results underline two pivotal concerns: (1)
+Knowledge Conflict: Editing groups of facts that logically clash can magnify
+the inherent inconsistencies in LLMs -- a facet neglected by previous methods.
+(2) Knowledge Distortion: Altering parameters with the aim of editing factual
+knowledge can irrevocably warp the innate knowledge structure of LLMs.
+Experimental results vividly demonstrate that knowledge editing might
+inadvertently cast a shadow of unintended consequences on LLMs, which warrant
+attention and effort in future work. Code and data are available at
+https://github.com/zjunlp/PitfallsKnowledgeEditing.
+
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Generative 3D Part Assembly via Part-Whole-Hierarchy Message Passing + + +
+ Generative 3D part assembly involves understanding part relationships and
+predicting their 6-DoF poses for assembling a realistic 3D shape. Prior work
+often focuses on the geometry of individual parts, neglecting part-whole
+hierarchies of objects. Leveraging two key observations: 1) super-part poses
+provide strong hints about part poses, and 2) predicting super-part poses is
+easier due to the smaller number of super-parts, we propose a
+part-whole-hierarchy message passing network for efficient 3D part assembly. We
+first introduce super-parts by grouping geometrically similar parts without any
+semantic labels. Then we employ a part-whole hierarchical encoder, wherein a
+super-part encoder predicts latent super-part poses based on input parts.
+Subsequently, we transform the point cloud using the latent poses, feeding it
+to the part encoder for aggregating super-part information and reasoning about
+part relationships to predict all part poses. In training, only ground-truth
+part poses are required. During inference, the predicted latent poses of
+super-parts enhance interpretability. Experimental results on the PartNet
+dataset show that our method achieves state-of-the-art performance in part and
+connectivity accuracy and enables an interpretable hierarchical part assembly.
+
+
+
+
+
+ + ♻ ☆ InNeRF360: Text-Guided 3D-Consistent Object Inpainting on 360-degree + Neural Radiance Fields CVPR 2024 + + +
+ We propose InNeRF360, an automatic system that accurately removes +text-specified objects from 360-degree Neural Radiance Fields (NeRF). The +challenge is to effectively remove objects while inpainting perceptually +consistent content for the missing regions, which is particularly demanding for +existing NeRF models due to their implicit volumetric representation. Moreover, +unbounded scenes are more prone to floater artifacts in the inpainted region +than frontal-facing scenes, as the change of object appearance and background +across views is more sensitive to inaccurate segmentations and inconsistent +inpainting. With a trained NeRF and a text description, our method efficiently +removes specified objects and inpaints visually consistent content without +artifacts. We apply depth-space warping to enforce consistency across multiview +text-encoded segmentations, and then refine the inpainted NeRF model using +perceptual priors and 3D diffusion-based geometric priors to ensure visual +plausibility. Through extensive experiments in segmentation and inpainting on +360-degree and frontal-facing NeRFs, we show that our approach is effective and +enhances NeRF's editability. Project page: https://ivrl.github.io/InNeRF360. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Passive Non-Line-of-Sight Imaging with Light Transport Modulation + + +
+ Passive non-line-of-sight (NLOS) imaging has witnessed rapid development in +recent years, due to its ability to image objects that are out of sight. The +light transport condition plays an important role in this task since changing +the conditions will lead to different imaging models. Existing learning-based +NLOS methods usually train independent models for different light transport +conditions, which is computationally inefficient and impairs the practicality +of the models. In this work, we propose NLOS-LTM, a novel passive NLOS imaging +method that effectively handles multiple light transport conditions with a +single network. We achieve this by inferring a latent light transport +representation from the projection image and using this representation to +modulate the network that reconstructs the hidden image from the projection +image. We train a light transport encoder together with a vector quantizer to +obtain the light transport representation. To further regulate this +representation, we jointly learn both the reconstruction network and the +reprojection network during training. A set of light transport modulation +blocks is used to modulate the two jointly trained networks in a multi-scale +way. Extensive experiments on a large-scale passive NLOS dataset demonstrate +the superiority of the proposed method. The code is available at +https://github.com/JerryOctopus/NLOS-LTM. + +
+
+
+
+
+ + ♻ ☆ ViT-Lens: Towards Omni-modal Representations CVPR2024 + + +
+ Aiming to advance AI agents, large foundation models significantly improve +reasoning and instruction execution, yet the current focus on vision and +language neglects the potential of perceiving diverse modalities in open-world +environments. However, the success of data-driven vision and language models is +costly or even infeasible to be reproduced for rare modalities. In this paper, +we present ViT-Lens-2 that facilitates efficient omni-modal representation +learning by perceiving novel modalities with a pretrained ViT and aligning them +to a pre-defined space. Specifically, the modality-specific lens is tuned to +project any-modal signals to an intermediate embedding space, which are then +processed by a strong ViT with pre-trained visual knowledge. The encoded +representations are optimized toward aligning with the modal-independent space, +pre-defined by off-the-shelf foundation models. ViT-Lens-2 provides a unified +solution for representation learning of increasing modalities with two +appealing advantages: (i) Unlocking the great potential of pretrained ViTs to +novel modalities effectively with efficient data regime; (ii) Enabling emergent +downstream capabilities through modality alignment and shared ViT parameters. +We tailor ViT-Lens-2 to learn representations for 3D point cloud, depth, audio, +tactile and EEG, and set new state-of-the-art results across various +understanding tasks, such as zero-shot classification. By seamlessly +integrating ViT-Lens-2 into Multimodal Foundation Models, we enable +Any-modality to Text and Image Generation in a zero-shot manner. Code and +models are available at https://github.com/TencentARC/ViT-Lens. + +
+
+ comment: This work is a follow-up of arXiv:2308.10185. Accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ Implicit Discriminative Knowledge Learning for Visible-Infrared Person + Re-Identification CVPR 2024 + + +
+ Visible-Infrared Person Re-identification (VI-ReID) is a challenging
+cross-modal pedestrian retrieval task, due to significant intra-class
+variations and cross-modal discrepancies among different cameras. Existing
+works mainly focus on embedding images of different modalities into a unified
+space to mine modality-shared features. They only seek distinctive information
+within these shared features, while ignoring the identity-aware useful
+information that is implicit in the modality-specific features. To address this
+issue, we propose a novel Implicit Discriminative Knowledge Learning (IDKL)
+network to uncover and leverage the implicit discriminative information
+contained within the modality-specific features. First, we extract
+modality-specific and modality-shared features using a novel dual-stream
+network. Then, the modality-specific features undergo purification to reduce
+their modality style discrepancies while preserving identity-aware
+discriminative knowledge. Subsequently, this kind of implicit knowledge is
+distilled into the modality-shared feature to enhance its distinctiveness.
+Finally, an alignment loss is proposed to minimize modality discrepancy on
+enhanced modality-shared features. Extensive experiments on multiple public
+datasets demonstrate the superiority of the IDKL network over the
+state-of-the-art methods. Code is available at https://github.com/1KK077/IDKL.
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ In Search of a Data Transformation That Accelerates Neural Field + Training CVPR 2024 + + +
+ The neural field is an emerging paradigm in data representation that trains a
+neural network to approximate the given signal. A key obstacle that prevents
+its widespread adoption is the encoding speed: generating a neural field
+requires overfitting a neural network, which can take a significant number of
+SGD steps to reach the desired fidelity level. In this paper, we delve into the
+impacts of data transformations on the speed of neural field training,
+specifically focusing on how permuting pixel locations affects the convergence
+speed of SGD. Counterintuitively, we find that randomly permuting the pixel
+locations can considerably accelerate the training. To explain this phenomenon,
+we examine the neural field training through the lens of PSNR curves, loss
+landscapes, and error patterns. Our analyses suggest that the random pixel
+permutations remove the easy-to-fit patterns, which facilitate easy
+optimization in the early stage but hinder capturing fine details of the
+signal.
+
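+
+ The data transformation itself is easy to reproduce in a sketch: keep the
+ regular coordinate grid but permute which pixel value each coordinate is
+ asked to fit. The neural-field model and SGD loop are omitted, and the image
+ below is a random stand-in:
+
+```python
+import numpy as np
+
+def permuted_field_dataset(image, seed=0):
+    """Build (coordinate, value) pairs with randomly permuted pixel locations."""
+    h, w = image.shape[:2]
+    ys, xs = np.meshgrid(np.arange(h), np.arange(w), indexing="ij")
+    coords = np.stack([ys.ravel() / (h - 1), xs.ravel() / (w - 1)], axis=1)  # in [0, 1]
+    values = image.reshape(h * w, -1).astype(np.float32)
+    perm = np.random.default_rng(seed).permutation(h * w)
+    return coords, values[perm]            # same grid, scrambled targets
+
+image = np.random.default_rng(1).random((32, 32, 3))   # stand-in for a real image
+coords, targets = permuted_field_dataset(image)
+print(coords.shape, targets.shape)
+```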
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ AV2AV: Direct Audio-Visual Speech to Audio-Visual Speech Translation + with Unified Audio-Visual Speech Representation CVPR 2024 + + +
+ This paper proposes a novel direct Audio-Visual Speech to Audio-Visual Speech +Translation (AV2AV) framework, where the input and output of the system are +multimodal (i.e., audio and visual speech). With the proposed AV2AV, two key +advantages can be brought: 1) We can perform real-like conversations with +individuals worldwide in a virtual meeting by utilizing our own primary +languages. In contrast to Speech-to-Speech Translation (A2A), which solely +translates between audio modalities, the proposed AV2AV directly translates +between audio-visual speech. This capability enhances the dialogue experience +by presenting synchronized lip movements along with the translated speech. 2) +We can improve the robustness of the spoken language translation system. By +employing the complementary information of audio-visual speech, the system can +effectively translate spoken language even in the presence of acoustic noise, +showcasing robust performance. To mitigate the problem of the absence of a +parallel AV2AV translation dataset, we propose to train our spoken language +translation system with the audio-only dataset of A2A. This is done by learning +unified audio-visual speech representations through self-supervised learning in +advance to train the translation system. Moreover, we propose an AV-Renderer +that can generate raw audio and video in parallel. It is designed with +zero-shot speaker modeling, thus the speaker in source audio-visual speech can +be maintained at the target translated audio-visual speech. The effectiveness +of AV2AV is evaluated with extensive experiments in a many-to-many language +translation setting. Demo page is available on +https://choijeongsoo.github.io/av2av. + +
+
+ comment: CVPR 2024. Code & Demo: https://choijeongsoo.github.io/av2av +
+
+
+
+
+ + ♻ ☆ SINC: Spatial Composition of 3D Human Motions for Simultaneous Action + Generation + + +
+ Our goal is to synthesize 3D human motions given textual inputs describing
+simultaneous actions, for example 'waving hand' while 'walking' at the same
+time. We refer to generating such simultaneous movements as performing 'spatial
+compositions'. In contrast to temporal compositions that seek to transition
+from one action to another, spatial compositing requires understanding which
+body parts are involved in which action, to be able to move them
+simultaneously. Motivated by the observation that the correspondence between
+actions and body parts is encoded in powerful language models, we extract this
+knowledge by prompting GPT-3 with text such as "what are the body parts
+involved in the action ?", while also providing the parts list and
+few-shot examples. Given this action-part mapping, we combine body parts from
+two motions together and establish the first automated method to spatially
+compose two actions. However, training data with compositional actions is
+always limited by the combinatorics. Hence, we further create synthetic data
+with this approach, and use it to train a new state-of-the-art text-to-motion
+generation model, called SINC ("SImultaneous actioN Compositions for 3D human
+motions"). In our experiments, we show that training with such GPT-guided
+synthetic data improves spatial composition generation over baselines. Our code
+is publicly available at https://sinc.is.tue.mpg.de/.
+
+
+ comment: Teaser Fixed +
+
+
+
+
+ + ♻ ☆ Powerful Lossy Compression for Noisy Images ICME 2024 + + +
+ Image compression and denoising represent fundamental challenges in image
+processing with many real-world applications. To address practical demands,
+current solutions can be categorized into two main strategies: 1) sequential
+method; and 2) joint method. However, sequential methods have the disadvantage
+of error accumulation as there is information loss between multiple individual
+models. Recently, the academic community began to make some attempts to tackle
+this problem through end-to-end joint methods. Most of them ignore that
+different regions of noisy images have different characteristics. To solve
+these problems, in this paper, our proposed signal-to-noise ratio (SNR) aware
+joint solution exploits local and non-local features for image compression and
+denoising simultaneously. We design an end-to-end trainable network, which
+includes the main encoder branch, the guidance branch, and the signal-to-noise
+ratio (SNR) aware branch. We conducted extensive experiments on both synthetic
+and real-world datasets, demonstrating that our joint solution outperforms
+existing state-of-the-art methods.
+
+
+ comment: Accepted by ICME 2024 +
+
+
+
+
+ + ♻ ☆ ViT-Lens: Initiating Omni-Modal Exploration through 3D Insights + + +
+ Despite the success of CLIP-based training recipes for vision-language
+models, their scalability to more modalities (e.g., 3D, audio, etc.) is limited
+by the need for large-scale data, which is expensive or even unobtainable for
+rare modalities. In this paper, we present ViT-Lens that facilitates efficient
+omni-modal representation learning by perceiving novel modalities with a
+pretrained ViT and aligning to a pre-defined space. Specifically, the
+modality-specific lens is tuned to project multimodal signals to the shared
+embedding space, which are then processed by a strong ViT that carries
+pre-trained image knowledge. The encoded multimodal representations are
+optimized toward aligning with the modal-independent space, pre-defined by
+off-the-shelf foundation models. A well-trained lens with a ViT backbone has
+the potential to serve as one of these foundation models, supervising the
+learning of subsequent modalities. ViT-Lens provides a unified solution for
+representation learning of increasing modalities with two appealing benefits:
+(i) Exploiting the pretrained ViT across tasks and domains effectively with
+efficient data regime; (ii) Emergent downstream capabilities of novel
+modalities are demonstrated due to the modality alignment space. We evaluate
+ViT-Lens in the context of 3D as an initial verification. In zero-shot 3D
+classification, ViT-Lens achieves substantial improvements over previous
+state-of-the-art, showing 52.0% accuracy on Objaverse-LVIS, 87.4% on
+ModelNet40, and 60.6% on ScanObjectNN. Furthermore, we enable zero-shot 3D
+question-answering by simply integrating the trained 3D lens into the
+InstructBLIP model without any adaptation. We will release the results of
+ViT-Lens on more modalities in the near future.
+
+
+ comment: 19 pages, 4 figures and 9 tables +
+
+
+
+
+ + ♻ ☆ TP2O: Creative Text Pair-to-Object Generation using Balance + Swap-Sampling + + +
+ Generating creative combinatorial objects from two seemingly unrelated object +texts is a challenging task in text-to-image synthesis, often hindered by a +focus on emulating existing data distributions. In this paper, we develop a +straightforward yet highly effective method, called \textbf{balance +swap-sampling}. First, we propose a swapping mechanism that generates a novel +combinatorial object image set by randomly exchanging intrinsic elements of two +text embeddings through a cutting-edge diffusion model. Second, we introduce a +balance swapping region to efficiently sample a small subset from the newly +generated image set by balancing CLIP distances between the new images and +their original generations, increasing the likelihood of accepting the +high-quality combinations. Last, we employ a segmentation method to compare +CLIP distances among the segmented components, ultimately selecting the most +promising object from the sampled subset. Extensive experiments demonstrate +that our approach outperforms recent SOTA T2I methods. Surprisingly, our +results even rival those of human artists, such as frog-broccoli. + +
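+
+ The balancing criterion can be sketched as follows, assuming the
+ swapped-embedding generations and their CLIP embeddings are produced
+ elsewhere; candidates whose CLIP distances to the two original concepts
+ differ the least are kept:
+
+```python
+import numpy as np
+
+def cosine_dist(a, b):
+    return 1.0 - a @ b / (np.linalg.norm(a) * np.linalg.norm(b))
+
+def balance_filter(candidate_embs, emb_a, emb_b, keep=4):
+    """Keep candidates most evenly placed between the two source concepts."""
+    imbalance = [abs(cosine_dist(e, emb_a) - cosine_dist(e, emb_b))
+                 for e in candidate_embs]
+    return np.argsort(imbalance)[:keep]
+
+rng = np.random.default_rng(0)
+cands = rng.normal(size=(16, 512))      # hypothetical CLIP embeddings of swapped images
+frog, broccoli = rng.normal(size=512), rng.normal(size=512)
+print(balance_filter(cands, frog, broccoli))
+```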
+
+ comment: Project page: https://tp2o.github.io/anon/ +
+
+
+
+
+ + ♻ ☆ Segment and Caption Anything CVPR 24 + + +
+ We propose a method to efficiently equip the Segment Anything Model (SAM)
+with the ability to generate regional captions. SAM presents strong
+generalizability in segmenting anything but falls short on semantic
+understanding. By introducing a lightweight query-based feature mixer, we align
+the region-specific features with the embedding space of language models for
+later caption generation. As the number of trainable parameters is small
+(typically in the order of tens of millions), it costs less computation, less
+memory usage, and less communication bandwidth, resulting in both fast and
+scalable training. To address the scarcity problem of regional caption data, we
+propose to first pre-train our model on object detection and segmentation
+tasks. We call this step weak supervision pretraining since the pre-training
+data only contains category names instead of full-sentence descriptions. The
+weak supervision pretraining allows us to leverage many publicly available
+object detection and segmentation datasets. We conduct extensive experiments to
+demonstrate the superiority of our method and validate each design choice. This
+work serves as a stepping stone towards scaling up regional captioning data and
+sheds light on exploring efficient ways to augment SAM with regional semantics.
+The project page, along with the associated code, can be accessed via
+https://xk-huang.github.io/segment-caption-anything/.
+
+
+ comment: The project page, along with the associated code, can be accessed via + https://xk-huang.github.io/segment-caption-anything/; Update author + information; Accepted by CVPR 24 +
+
+
+
+
+ + ♻ ☆ TagAlign: Improving Vision-Language Alignment with Multi-Tag + Classification + + +
+ The crux of learning vision-language models is to extract semantically +aligned information from visual and linguistic data. Existing attempts usually +face the problem of coarse alignment, e.g., the vision encoder struggles in +localizing an attribute-specified object. In this work, we propose an +embarrassingly simple approach to better align image and text features with no +need of additional data formats other than image-text pairs. Concretely, given +an image and its paired text, we manage to parse objects (e.g., cat) and +attributes (e.g., black) from the description, which are highly likely to exist +in the image. It is noteworthy that the parsing pipeline is fully automatic and +thus enjoys good scalability. With these parsed semantics as supervision +signals, we can complement the commonly used image-text contrastive loss with +the multi-tag classification loss. Extensive experimental results on a broad +suite of semantic segmentation datasets substantiate the average 5.2\% +improvement of our framework over existing alternatives. Furthermore, the +visualization results indicate that attribute supervision makes vision-language +models accurately localize attribute-specified objects. Project page can be +found at https://qinying-liu.github.io/Tag-Align. + +
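+
+ The added supervision reduces to a multi-label classification loss over the
+ parsed tags. A sketch with random stand-ins for the image features, tag
+ embeddings, and parsed-tag targets (the caption parser and the encoders are
+ assumed to exist elsewhere):
+
+```python
+import numpy as np
+
+def multi_tag_bce(image_emb, tag_emb, tag_targets, eps=1e-7):
+    """Multi-label tag loss; tag_targets[i, t] = 1 if tag t was parsed from caption i."""
+    logits = image_emb @ tag_emb.T                    # (batch, num_tags)
+    probs = 1.0 / (1.0 + np.exp(-logits))
+    bce = -(tag_targets * np.log(probs + eps)
+            + (1 - tag_targets) * np.log(1 - probs + eps))
+    return bce.mean()                                 # added to the contrastive loss
+
+rng = np.random.default_rng(0)
+image_emb = rng.normal(size=(8, 256))                 # hypothetical image features
+tag_emb = rng.normal(size=(100, 256)) * 0.05          # 100-tag vocabulary
+targets = (rng.random((8, 100)) < 0.05).astype(float) # sparse parsed-tag labels
+print(multi_tag_bce(image_emb, tag_emb, targets))
+```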
+
+
+
+
+ + ♻ ☆ SGS-SLAM: Semantic Gaussian Splatting For Neural Dense SLAM + + +
+ We present SGS-SLAM, the first semantic visual SLAM system based on Gaussian +Splatting. It incorporates appearance, geometry, and semantic features through +multi-channel optimization, addressing the oversmoothing limitations of neural +implicit SLAM systems in high-quality rendering, scene understanding, and +object-level geometry. We introduce a unique semantic feature loss that +effectively compensates for the shortcomings of traditional depth and color +losses in object optimization. Through a semantic-guided keyframe selection +strategy, we prevent erroneous reconstructions caused by cumulative errors. +Extensive experiments demonstrate that SGS-SLAM delivers state-of-the-art +performance in camera pose estimation, map reconstruction, precise semantic +segmentation, and object-level geometric accuracy, while ensuring real-time +rendering capabilities. + +
+
+
+
+
+ + ♻ ☆ ArtAdapter: Text-to-Image Style Transfer using Multi-Level Style Encoder + and Explicit Adaptation + + +
+ This work introduces ArtAdapter, a transformative text-to-image (T2I) style +transfer framework that transcends traditional limitations of color, +brushstrokes, and object shape, capturing high-level style elements such as +composition and distinctive artistic expression. The integration of a +multi-level style encoder with our proposed explicit adaptation mechanism +enables ArtAdapter to achieve unprecedented fidelity in style transfer, +ensuring close alignment with textual descriptions. Additionally, the +incorporation of an Auxiliary Content Adapter (ACA) effectively separates +content from style, alleviating the borrowing of content from style references. +Moreover, our novel fast finetuning approach could further enhance zero-shot +style representation while mitigating the risk of overfitting. Comprehensive +evaluations confirm that ArtAdapter surpasses current state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ Clean-image Backdoor Attacks + + +
+ To gather a significant quantity of annotated training data for
+high-performance image classification models, numerous companies opt to enlist
+third-party providers to label their unlabeled data. This practice is widely
+regarded as secure, even in cases where some annotation errors occur, as the
+impact of these minor inaccuracies on the final performance of the models is
+negligible and existing backdoor attacks require the attacker to be able to
+poison the training images. Nevertheless, in this paper, we propose clean-image
+backdoor attacks, which reveal that backdoors can still be injected via a
+fraction of incorrect labels without modifying the training images.
+Specifically, in our attacks, the attacker first seeks a trigger feature to
+divide the training images into two parts: those with the feature and those
+without it. Subsequently, the attacker falsifies the labels of the former part
+to a backdoor class. The backdoor is finally implanted into the target model
+after it is trained on the poisoned data. During the inference phase, the
+attacker can activate the backdoor in two ways: slightly modifying the input
+image to obtain the trigger feature, or taking an image that naturally has the
+trigger feature as input. We conduct extensive experiments to demonstrate the
+effectiveness and practicality of our attacks. According to the experimental
+results, we conclude that our attacks seriously jeopardize the fairness and
+robustness of image classification models, and it is necessary to be vigilant
+about the incorrect labels in outsourced labeling.
+
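+
+ The poisoning step can be sketched in a few lines, assuming the attacker has
+ already chosen a trigger feature and can detect which training images
+ naturally contain it; only labels are modified, never the images:
+
+```python
+import numpy as np
+
+def poison_labels(has_trigger, labels, backdoor_class, flip_rate=1.0, seed=0):
+    """Rewrite (a fraction of) the labels of trigger-feature images to the backdoor class."""
+    labels = labels.copy()
+    idx = np.flatnonzero(has_trigger)
+    rng = np.random.default_rng(seed)
+    chosen = rng.choice(idx, size=int(len(idx) * flip_rate), replace=False)
+    labels[chosen] = backdoor_class
+    return labels
+
+rng = np.random.default_rng(1)
+labels = rng.integers(0, 10, size=1000)
+has_feature = rng.random(1000) < 0.1       # e.g. "image naturally contains a red car"
+poisoned = poison_labels(has_feature, labels, backdoor_class=7, flip_rate=0.5)
+print(int((poisoned != labels).sum()), "labels changed")
+```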
+
+
+
+
+ + ♻ ☆ Transferring Relative Monocular Depth to Surgical Vision with Temporal + Consistency + + +
+ Relative monocular depth, inferring depth up to shift and scale from a single +image, is an active research topic. Recent deep learning models, trained on +large and varied meta-datasets, now provide excellent performance in the domain +of natural images. However, few datasets exist which provide ground truth depth +for endoscopic images, making training such models from scratch unfeasible. +This work investigates the transfer of these models into the surgical domain, +and presents an effective and simple way to improve on standard supervision +through the use of temporal consistency self-supervision. We show temporal +consistency significantly improves supervised training alone when transferring +to the low-data regime of endoscopy, and outperforms the prevalent +self-supervision technique for this task. In addition we show our method +drastically outperforms the state-of-the-art method from within the domain of +endoscopy. We also release our code, model and ensembled meta-dataset, +Meta-MED, establishing a strong benchmark for future work. + +
+
+
+
+
+ + ♻ ☆ Towards Source-free Domain Adaptive Semantic Segmentation via + Importance-aware and Prototype-contrast Learning + + +
+ Domain adaptive semantic segmentation enables robust pixel-wise understanding +in real-world driving scenes. Source-free domain adaptation, as a more +practical technique, addresses the concerns of data privacy and storage +limitations in typical unsupervised domain adaptation methods, making it +especially relevant in the context of intelligent vehicles. It utilizes a +well-trained source model and unlabeled target data to achieve adaptation in +the target domain. However, in the absence of source data and target labels, +current solutions cannot sufficiently reduce the impact of domain shift and +fully leverage the information from the target data. In this paper, we propose +an end-to-end source-free domain adaptation semantic segmentation method via +Importance-Aware and Prototype-Contrast (IAPC) learning. The proposed IAPC +framework effectively extracts domain-invariant knowledge from the well-trained +source model and learns domain-specific knowledge from the unlabeled target +domain. Specifically, considering the problem of domain shift in the prediction +of the target domain by the source model, we put forward an importance-aware +mechanism for the biased target prediction probability distribution to extract +domain-invariant knowledge from the source model. We further introduce a +prototype-contrast strategy, which includes a prototype-symmetric cross-entropy +loss and a prototype-enhanced cross-entropy loss, to learn target intra-domain +knowledge without relying on labels. A comprehensive variety of experiments on +two domain adaptive semantic segmentation benchmarks demonstrates that the +proposed end-to-end IAPC solution outperforms existing state-of-the-art +methods. The source code is publicly available at +https://github.com/yihong-97/Source-free-IAPC. + +
+
+ comment: Accepted to IEEE Transactions on Intelligent Vehicles (T-IV). The + source code is publicly available at + https://github.com/yihong-97/Source-free-IAPC +
+
+
+
+
+ + ♻ ☆ SD4Match: Learning to Prompt Stable Diffusion Model for Semantic + Matching CVPR 2024 + + +
+ In this paper, we address the challenge of matching semantically similar +keypoints across image pairs. Existing research indicates that the intermediate +output of the UNet within the Stable Diffusion (SD) can serve as robust image +feature maps for such a matching task. We demonstrate that by employing a basic +prompt tuning technique, the inherent potential of Stable Diffusion can be +harnessed, resulting in a significant enhancement in accuracy over previous +approaches. We further introduce a novel conditional prompting module that +conditions the prompt on the local details of the input image pairs, leading to +a further improvement in performance. We designate our approach as SD4Match, +short for Stable Diffusion for Semantic Matching. Comprehensive evaluations of +SD4Match on the PF-Pascal, PF-Willow, and SPair-71k datasets show that it sets +new benchmarks in accuracy across all these datasets. Particularly, SD4Match +outperforms the previous state-of-the-art by a margin of 12 percentage points +on the challenging SPair-71k dataset. + +
+
+ comment: Accepted to CVPR 2024. Project website: + https://sd4match.active.vision/ +
+
+
+
+
+ + ♻ ☆ ObjectCompose: Evaluating Resilience of Vision-Based Models on + Object-to-Background Compositional Changes + + +
+ Given the large-scale multi-modal training of recent vision-based models and
+their generalization capabilities, understanding the extent of their robustness
+is critical for their real-world deployment. In this work, we evaluate the
+resilience of current vision-based models against diverse object-to-background
+context variations. The majority of robustness evaluation methods have
+introduced synthetic datasets to induce changes to object characteristics
+(viewpoints, scale, color) or utilized image transformation techniques
+(adversarial changes, common corruptions) on real images to simulate shifts in
+distributions. Recent works have explored leveraging large language models and
+diffusion models to generate changes in the background. However, these methods
+either lack control over the changes to be made or distort the object
+semantics, making them unsuitable for the task. Our method, on the other hand,
+can induce diverse object-to-background changes while preserving the original
+semantics and appearance of the object. To achieve this goal, we harness the
+generative capabilities of text-to-image, image-to-text, and image-to-segment
+models to automatically generate a broad spectrum of object-to-background
+changes. We induce both natural and adversarial background changes by either
+modifying the textual prompts or optimizing the latents and textual embedding
+of text-to-image models. We produce various versions of standard vision
+datasets (ImageNet, COCO), incorporating either diverse and realistic
+backgrounds into the images or introducing color, texture, and adversarial
+changes in the background. We conduct extensive experiments to analyze the
+robustness of vision-based models against object-to-background context
+variations across diverse tasks. Code:
+https://github.com/Muhammad-Huzaifaa/ObjectCompose.git
+
+
+
+
+
+ + ♻ ☆ Motion Generation from Fine-grained Textual Descriptions + + +
+ The task of text2motion is to generate human motion sequences from given +textual descriptions, where the model explores diverse mappings from natural +language instructions to human body movements. While most existing works are +confined to coarse-grained motion descriptions, e.g., "A man squats.", +fine-grained descriptions specifying movements of relevant body parts are +barely explored. Models trained with coarse-grained texts may not be able to +learn mappings from fine-grained motion-related words to motion primitives, +resulting in the failure to generate motions from unseen descriptions. In this +paper, we build a large-scale language-motion dataset specializing in +fine-grained textual descriptions, FineHumanML3D, by feeding GPT-3.5-turbo with +step-by-step instructions with pseudo-code compulsory checks. Accordingly, we +design a new text2motion model, FineMotionDiffuse, making full use of +fine-grained textual information. Our quantitative evaluation shows that +FineMotionDiffuse trained on FineHumanML3D improves FID by a large margin of +0.38, compared with competitive baselines. According to the qualitative +evaluation and case study, our model outperforms MotionDiffuse in generating +spatially or chronologically composite motions, by learning the implicit +mappings from fine-grained descriptions to the corresponding basic motions. We +release our data at https://github.com/KunhangL/finemotiondiffuse. + +
+
+
+
+
+ + ♻ ☆ Towards Low-Energy Adaptive Personalization for Resource-Constrained + Devices + + +
+ The personalization of machine learning (ML) models to address data drift is
+a significant challenge in the context of Internet of Things (IoT)
+applications. Presently, most approaches focus on fine-tuning either the full
+base model or its last few layers to adapt to new data, while often neglecting
+energy costs. However, various types of data drift exist, and fine-tuning the
+full base model or the last few layers may not result in optimal performance
+in certain scenarios. We propose Target Block Fine-Tuning (TBFT), a low-energy
+adaptive personalization framework designed for resource-constrained devices.
+We categorize data drift and personalization into three types: input-level,
+feature-level, and output-level. For each type, we fine-tune different blocks
+of the model to achieve optimal performance with reduced energy costs.
+Specifically, input-, feature-, and output-level drift correspond to
+fine-tuning the front, middle, and rear blocks of the model. We evaluate TBFT
+on a ResNet model, three datasets, three different training sizes, and a
+Raspberry Pi. Compared with $Block Avg$, a baseline in which each block is
+fine-tuned individually and the resulting performance improvements are
+averaged, TBFT improves model accuracy by an average of 15.30%, while saving
+41.57% energy consumption on average compared with full fine-tuning.
+
+
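The block-selective fine-tuning idea summarized above lends itself to a short sketch. The snippet below is a minimal illustration, assuming a torchvision ResNet and a hypothetical mapping from drift type to layer groups; it is not the authors' implementation.

```python
# Minimal sketch of block-selective fine-tuning in the spirit of TBFT.
# The drift-type-to-block mapping and layer grouping are illustrative
# assumptions, not the paper's exact configuration.
import torch
from torchvision.models import resnet18

def select_trainable_block(model: torch.nn.Module, drift_type: str) -> None:
    """Freeze all parameters, then unfreeze the block matching the drift type."""
    block_map = {
        "input":   ["conv1", "bn1", "layer1"],   # front block
        "feature": ["layer2", "layer3"],          # middle block
        "output":  ["layer4", "fc"],              # rear block
    }
    for p in model.parameters():
        p.requires_grad = False
    for name in block_map[drift_type]:
        for p in getattr(model, name).parameters():
            p.requires_grad = True

model = resnet18(num_classes=10)
select_trainable_block(model, drift_type="feature")
trainable = [n for n, p in model.named_parameters() if p.requires_grad]
print(f"{len(trainable)} parameter tensors will be updated")
```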
+
+ comment: Accepted to the 4th Workshop on Machine Learning and Systems
+ (EuroMLSys '24)
+
+
+
+
+
+ + ♻ ☆ FPT: Fine-grained Prompt Tuning for Parameter and Memory Efficient Fine + Tuning in High-resolution Medical Image Classification + + +
+ Parameter-efficient fine-tuning (PEFT) is proposed as a cost-effective way to +transfer pre-trained models to downstream tasks, avoiding the high cost of +updating entire large-scale pre-trained models (LPMs). In this work, we present +Fine-grained Prompt Tuning (FPT), a novel PEFT method for medical image +classification. FPT significantly reduces memory consumption compared to other +PEFT methods, especially in high-resolution contexts. To achieve this, we first +freeze the weights of the LPM and construct a learnable lightweight side +network. The frozen LPM takes high-resolution images as input to extract +fine-grained features, while the side network is fed low-resolution images to +reduce memory usage. To allow the side network to access pre-trained knowledge, +we introduce fine-grained prompts that summarize information from the LPM +through a fusion module. Important tokens selection and preloading techniques +are employed to further reduce training cost and memory requirements. We +evaluate FPT on four medical datasets with varying sizes, modalities, and +complexities. Experimental results demonstrate that FPT achieves comparable +performance to fine-tuning the entire LPM while using only 1.8% of the +learnable parameters and 13% of the memory costs of an encoder ViT-B model with +a 512 x 512 input resolution. + +
+
+
+
+
+ + ♻ ☆ SegVol: Universal and Interactive Volumetric Medical Image Segmentation + + +
+ Precise image segmentation provides clinical study with instructive +information. Despite the remarkable progress achieved in medical image +segmentation, there is still an absence of 3D foundation segmentation model +that can segment a wide range of anatomical categories with easy user +interaction. In this paper, we propose a 3D foundation segmentation model, +named SegVol, supporting universal and interactive volumetric medical image +segmentation. By scaling up training data to 90K unlabeled Computed Tomography +(CT) volumes and 6K labeled CT volumes, this foundation model supports the +segmentation of over 200 anatomical categories using semantic and spatial +prompts. Extensive experiments on 10 internal validation tasks and 18 external +validation tasks verify that SegVol outperforms the state of the art by a large +margin. Through its capacity to provide precise volumetric segmentation across +various anatomical categories, SegVol has the potential to accelerate +advancements in medical imaging diagnosis and facilitate treatment +optimization. The model and code are publicly available at: +https://github.com/BAAI-DCAI/SegVol. + +
+
+
+
+
+ + ♻ ☆ DreamComposer: Controllable 3D Object Generation via Multi-View + Conditions + + +
+ Utilizing pre-trained 2D large-scale generative models, recent works are +capable of generating high-quality novel views from a single in-the-wild image. +However, due to the lack of information from multiple views, these works +encounter difficulties in generating controllable novel views. In this paper, +we present DreamComposer, a flexible and scalable framework that can enhance +existing view-aware diffusion models by injecting multi-view conditions. +Specifically, DreamComposer first uses a view-aware 3D lifting module to obtain +3D representations of an object from multiple views. Then, it renders the +latent features of the target view from 3D representations with the multi-view +feature fusion module. Finally the target view features extracted from +multi-view inputs are injected into a pre-trained diffusion model. Experiments +show that DreamComposer is compatible with state-of-the-art diffusion models +for zero-shot novel view synthesis, further enhancing them to generate +high-fidelity novel view images with multi-view conditions, ready for +controllable 3D object reconstruction and various other applications. + +
+
+ comment: Project Page: https://yhyang-myron.github.io/DreamComposer/ +
+
+
+
+
+ + ♻ ☆ Regularizing Self-supervised 3D Scene Flows with Surface Awareness and + Cyclic Consistency + + +
+ Learning to predict 3D scene flows from point clouds without supervision is
+essential to many perception systems. We propose a novel learning framework
+for this task which improves the necessary regularization. Relying on the
+assumption that scene elements are mostly rigid, current smoothness losses are
+built on the definition of ``rigid clusters'' in the input point clouds. The
+definition of these clusters is challenging and has a significant impact on the
+quality of predicted flows. We introduce two new consistency losses that
+enlarge clusters while preventing them from spreading over distinct objects. In
+particular, we enforce \emph{temporal} consistency with a forward-backward
+cyclic loss and \emph{spatial} consistency by considering surface orientation
+similarity in addition to spatial proximity. The proposed losses are
+model-independent and can thus be used in a plug-and-play fashion to
+significantly improve the performance of existing models, as demonstrated on
+the two most widely used architectures. We also showcase the effectiveness and
+generalization capability of our framework on four standard sensor-unique
+driving datasets, achieving state-of-the-art performance in 3D scene flow
+estimation. Our code is available at https://github.com/ctu-vras/sac-flow.
+
+
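A forward-backward cyclic loss of the kind mentioned above can be sketched in a few lines. The snippet below is a toy illustration under the assumption that a backward-flow predictor is available; it is not the paper's implementation and omits the surface-aware spatial term.

```python
# Minimal sketch of a forward-backward cyclic consistency loss for scene flow.
# The backward-flow predictor is a stand-in for an actual network.
import torch

def cyclic_consistency_loss(points_t: torch.Tensor,
                            flow_fwd: torch.Tensor,
                            flow_bwd_fn) -> torch.Tensor:
    """Warp points forward, predict the backward flow at the warped points,
    and penalise any residual that fails to return to the starting points."""
    warped = points_t + flow_fwd                  # t -> t+1
    flow_bwd = flow_bwd_fn(warped)                # predicted t+1 -> t flow
    cycle_residual = warped + flow_bwd - points_t
    return cycle_residual.norm(dim=-1).mean()

# Toy usage with a dummy backward-flow predictor that exactly inverts the flow.
pts = torch.randn(1024, 3)
fwd = 0.05 * torch.randn(1024, 3)
loss = cyclic_consistency_loss(pts, fwd, flow_bwd_fn=lambda p: -fwd)
print(float(loss))  # ~0, because the dummy backward flow closes the cycle
```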
+
+
+
+
+ + ♻ ☆ P2ANet: A Dataset and Benchmark for Dense Action Detection from Table + Tennis Match Broadcasting Videos + + +
+ While deep learning has been widely used for video analytics, such as video
+classification and action detection, dense action detection with fast-moving
+subjects from sports videos is still challenging. In this work, we release yet
+another sports video benchmark \TheName{} for \emph{\underline{P}}ing
+\emph{\underline{P}}ong-\emph{\underline{A}}ction detection, which consists of
+2,721 video clips collected from the broadcasting videos of professional table
+tennis matches in World Table Tennis Championships and Olympiads. We work with
+a crew of table tennis professionals and referees on a specially designed
+annotation toolbox to obtain fine-grained action labels (in 14 classes) for
+every ping-pong action that appears in the dataset, and formulate two sets of
+action detection problems -- \emph{action localization} and \emph{action
+recognition}. We evaluate a number of commonly-seen action recognition (e.g.,
+TSM, TSN, Video SwinTransformer, and Slowfast) and action localization models
+(e.g., BSN, BSN++, BMN, TCANet), using \TheName{} for both problems, under
+various settings. These models achieve only 48\% area under the AR-AN curve
+for localization and 82\% top-one accuracy for recognition, since ping-pong
+actions are dense, the subjects move fast, and the broadcast videos run at
+only 25 FPS. The results confirm that \TheName{} remains challenging and can
+serve as a dedicated benchmark for dense action detection from videos.
+
+
+
+
+
+
+ + ♻ ☆ Unsupervised Semantic Segmentation Through Depth-Guided Feature + Correlation and Sampling CVPR 2024 + + +
+ Traditionally, training neural networks to perform semantic segmentation
+required expensive human-made annotations. But more recently, advances in the
+field of unsupervised learning have made significant progress on this issue and
+towards closing the gap to supervised algorithms. To achieve this, semantic
+knowledge is distilled by learning to correlate randomly sampled features from
+images across an entire dataset. In this work, we build upon these advances by
+incorporating information about the structure of the scene into the training
+process through the use of depth information. We achieve this by (1) learning
+depth-feature correlation by spatially correlating the feature maps with the
+depth maps to induce knowledge about the structure of the scene and (2)
+implementing farthest-point sampling to more effectively select relevant
+features by utilizing 3D sampling techniques on the depth information of the
+scene. Finally, we demonstrate the effectiveness of our technical contributions
+through extensive experimentation and present significant improvements in
+performance across multiple benchmark datasets.
+
+
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Diffusion Reflectance Map: Single-Image Stochastic Inverse Rendering of + Illumination and Reflectance CVPR 2024 + + +
+ Reflectance bounds the frequency spectrum of illumination in the object +appearance. In this paper, we introduce the first stochastic inverse rendering +method, which recovers the attenuated frequency spectrum of an illumination +jointly with the reflectance of an object of known geometry from a single +image. Our key idea is to solve this blind inverse problem in the reflectance +map, an appearance representation invariant to the underlying geometry, by +learning to reverse the image formation with a novel diffusion model which we +refer to as the Diffusion Reflectance Map Network (DRMNet). Given an observed +reflectance map converted and completed from the single input image, DRMNet +generates a reflectance map corresponding to a perfect mirror sphere while +jointly estimating the reflectance. The forward process can be understood as +gradually filtering a natural illumination with lower and lower frequency +reflectance and additive Gaussian noise. DRMNet learns to invert this process +with two subnetworks, IllNet and RefNet, which work in concert towards this +joint estimation. The network is trained on an extensive synthetic dataset and +is demonstrated to generalize to real images, showing state-of-the-art accuracy +on established datasets. + +
+
+ comment: to be published in CVPR 2024 +
+
+
+
+
+ + ♻ ☆ ProMamba: Prompt-Mamba for polyp segmentation + + +
+ Detecting polyps through colonoscopy is an important task in medical image +segmentation, which provides significant assistance and reference value for +clinical surgery. However, accurate segmentation of polyps is a challenging +task due to two main reasons. Firstly, polyps exhibit various shapes and +colors. Secondly, the boundaries between polyps and their normal surroundings +are often unclear. Additionally, significant differences between different +datasets lead to limited generalization capabilities of existing methods. To +address these issues, we propose a segmentation model based on Prompt-Mamba, +which incorporates the latest Vision-Mamba and prompt technologies. Compared to +previous models trained on the same dataset, our model not only maintains high +segmentation accuracy on the validation part of the same dataset but also +demonstrates superior accuracy on unseen datasets, exhibiting excellent +generalization capabilities. Notably, we are the first to apply the +Vision-Mamba architecture to polyp segmentation and the first to utilize prompt +technology in a polyp segmentation model. Our model efficiently accomplishes +segmentation tasks, surpassing previous state-of-the-art methods by an average +of 5% across six datasets. Furthermore, we have developed multiple versions of +our model with scaled parameter counts, achieving better performance than +previous models even with fewer parameters. Our code and trained weights will +be released soon. + +
+
+ comment: 10 pages, 2 figures, 3 tables
+
+
+
+
+
+ + ♻ ☆ SocialCircle: Learning the Angle-based Social Interaction Representation + for Pedestrian Trajectory Prediction CVPR 2024 + + +
+ Analyzing and forecasting trajectories of agents like pedestrians and cars in
+complex scenes has become more and more significant in many intelligent systems
+and applications. The diversity and uncertainty in socially interactive
+behaviors among a rich variety of agents make this task more challenging than
+other deterministic computer vision tasks. Researchers have made considerable
+efforts to quantify the effects of these interactions on future trajectories
+through different mathematical models and network structures, but this problem
+has not been well solved. Inspired by marine animals that localize the
+positions of their companions underwater through echoes, we build a new
+angle-based trainable social interaction representation, named SocialCircle,
+for continuously reflecting the context of social interactions at different
+angular orientations relative to the target agent. We validate the effect of
+the proposed SocialCircle by training it along with several newly released
+trajectory prediction models, and experiments show that the SocialCircle not
+only quantitatively improves the prediction performance, but also qualitatively
+helps better simulate social interactions when forecasting pedestrian
+trajectories in a way that is consistent with human intuitions.
+
+
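The angle-based idea above can be illustrated with a short sketch that bins neighbouring agents into angular sectors around the target and pools a simple statistic per sector. Sector count and the pooled quantity are illustrative assumptions, not the paper's learned representation.

```python
# Minimal sketch of an angle-based social context around a target agent.
import torch

def angle_based_context(target_pos: torch.Tensor,
                        neighbour_pos: torch.Tensor,
                        num_sectors: int = 8) -> torch.Tensor:
    """Return per-sector mean distance to neighbours (zeros for empty sectors)."""
    rel = neighbour_pos - target_pos                     # (N, 2)
    angles = torch.atan2(rel[:, 1], rel[:, 0])           # [-pi, pi)
    sector = ((angles + torch.pi) / (2 * torch.pi) * num_sectors).long() % num_sectors
    dist = rel.norm(dim=-1)
    context = torch.zeros(num_sectors)
    for s in range(num_sectors):
        mask = sector == s
        if mask.any():
            context[s] = dist[mask].mean()
    return context

print(angle_based_context(torch.zeros(2), torch.randn(12, 2)))
```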
+
+ comment: CVPR 2024 accepted +
+
+
+
+
+ + ♻ ☆ Emotic Masked Autoencoder with Attention Fusion for Facial Expression + Recognition + + +
+ Facial Expression Recognition (FER) is a critical task within computer vision
+with diverse applications across various domains. Addressing the challenge of
+limited FER datasets, which hampers the generalization capability of expression
+recognition models, is imperative for enhancing performance. Our paper presents
+an innovative approach integrating the MAE-Face self-supervised learning (SSL)
+method and Fusion Attention mechanism for expression classification,
+particularly showcased in the 6th Affective Behavior Analysis in-the-wild
+(ABAW) competition. Additionally, we propose preprocessing techniques to
+emphasize essential facial features, thereby enhancing model performance on
+both training and validation sets, notably demonstrated on the Aff-wild2
+dataset.
+
+
+
+ comment: 6 pages; added references for section 1; corrected typo for email + author +
+
+
+
+
+ + ♻ ☆ Learning User Embeddings from Human Gaze for Personalised Saliency + Prediction + + +
+ Reusable embeddings of user behaviour have shown significant performance +improvements for the personalised saliency prediction task. However, prior +works require explicit user characteristics and preferences as input, which are +often difficult to obtain. We present a novel method to extract user embeddings +from pairs of natural images and corresponding saliency maps generated from a +small amount of user-specific eye tracking data. At the core of our method is a +Siamese convolutional neural encoder that learns the user embeddings by +contrasting the image and personal saliency map pairs of different users. +Evaluations on two public saliency datasets show that the generated embeddings +have high discriminative power, are effective at refining universal saliency +maps to the individual users, and generalise well across users and images. +Finally, based on our model's ability to encode individual user +characteristics, our work points towards other applications that can benefit +from reusable embeddings of gaze behaviour. + +
+
+
+
+
+ + ♻ ☆ VRP-SAM: SAM with Visual Reference Prompt CVPR 2024 + + +
+ In this paper, we propose a novel Visual Reference Prompt (VRP) encoder that
+empowers the Segment Anything Model (SAM) to utilize annotated reference images
+as prompts for segmentation, creating the VRP-SAM model. In essence, VRP-SAM
+can utilize annotated reference images to comprehend specific objects and
+perform segmentation of specific objects in the target image. Note that the
+VRP encoder supports a variety of annotation formats for reference images,
+including \textbf{point}, \textbf{box}, \textbf{scribble}, and \textbf{mask}.
+VRP-SAM achieves a breakthrough within the SAM framework by extending its
+versatility and applicability while preserving SAM's inherent strengths, thus
+enhancing user-friendliness. To enhance the generalization ability of VRP-SAM,
+the VRP encoder adopts a meta-learning strategy. To validate the effectiveness
+of VRP-SAM, we conducted extensive empirical studies on the Pascal and COCO
+datasets. Remarkably, VRP-SAM achieved state-of-the-art performance in visual
+reference segmentation with minimal learnable parameters. Furthermore, VRP-SAM
+demonstrates strong generalization capabilities, allowing it to perform
+segmentation of unseen objects and enabling cross-domain segmentation. The
+source code and models will be available at
+\url{https://github.com/syp2ysy/VRP-SAM}
+
+
+
+ comment: Accepted by CVPR 2024; The camera-ready version +
+
+
+
+
+ + ♻ ☆ SeFFeC: Semantic Facial Feature Control for Fine-grained Face Editing + + +
+ We propose Semantic Facial Feature Control (SeFFeC) - a novel method for +fine-grained face shape editing. Our method enables the manipulation of +human-understandable, semantic face features, such as nose length or mouth +width, which are defined by different groups of facial landmarks. In contrast +to existing methods, the use of facial landmarks enables precise measurement of +the facial features, which then enables training SeFFeC without any manually +annotated labels. SeFFeC consists of a transformer-based encoder network that +takes a latent vector of a pre-trained generative model and a facial feature +embedding as input, and learns to modify the latent vector to perform the +desired face edit operation. To ensure that the desired feature measurement is +changed towards the target value without altering uncorrelated features, we +introduced a novel semantic face feature loss. Qualitative and quantitative +results show that SeFFeC enables precise and fine-grained control of 23 facial +features, some of which could not previously be controlled by other methods, +without requiring manual annotations. Unlike existing methods, SeFFeC also +provides deterministic control over the exact values of the facial features and +more localised and disentangled face edits. + +
+
+
+
+
+ + ♻ ☆ Dual Prototype Attention for Unsupervised Video Object Segmentation CVPR 2024 + + +
+ Unsupervised video object segmentation (VOS) aims to detect and segment the +most salient object in videos. The primary techniques used in unsupervised VOS +are 1) the collaboration of appearance and motion information; and 2) temporal +fusion between different frames. This paper proposes two novel prototype-based +attention mechanisms, inter-modality attention (IMA) and inter-frame attention +(IFA), to incorporate these techniques via dense propagation across different +modalities and frames. IMA densely integrates context information from +different modalities based on a mutual refinement. IFA injects global context +of a video to the query frame, enabling a full utilization of useful properties +from multiple frames. Experimental results on public benchmark datasets +demonstrate that our proposed approach outperforms all existing methods by a +substantial margin. The proposed two components are also thoroughly validated +via ablative study. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ From Pretext to Purpose: Batch-Adaptive Self-Supervised Learning + + +
+ In recent years, self-supervised contrastive learning has emerged as a
+distinguished paradigm in the artificial intelligence landscape. It facilitates
+unsupervised feature learning through contrastive delineations at the instance
+level. However, crafting an effective self-supervised paradigm remains a
+pivotal challenge within this field. This paper delves into two crucial factors
+impacting self-supervised contrastive learning: batch size and pretext tasks.
+From a data processing standpoint, it proposes an adaptive batch fusion
+technique. The proposed method, via dimensionality reduction and reconstruction
+of batch data, enables formerly isolated individual data to partake in
+intra-batch communication through the Embedding Layer. Moreover, it adaptively
+amplifies the self-supervised feature encoding capability as the training
+progresses. We conducted a linear classification test of this method based on
+the classic contrastive learning framework on ImageNet-1k. The empirical
+findings illustrate that our approach achieves state-of-the-art performance
+under equitable comparisons. Benefiting from its "plug-and-play"
+characteristics, we further explored other contrastive learning methods. On
+ImageNet-100, top-1 accuracy increases by up to 1.25% over the original
+performance. We suggest that the proposed method may contribute to the
+advancement of data-driven self-supervised learning research, bringing a fresh
+perspective to this community.
+
+
+
+ comment: 14 pages, 2 figures, the code of this paper will be released soon +
+
+
+
+
+ + ♻ ☆ LLaFS: When Large Language Models Meet Few-Shot Segmentation CVPR2024 + + +
+ This paper proposes LLaFS, the first attempt to leverage large language +models (LLMs) in few-shot segmentation. In contrast to the conventional +few-shot segmentation methods that only rely on the limited and biased +information from the annotated support images, LLaFS leverages the vast prior +knowledge gained by LLM as an effective supplement and directly uses the LLM to +segment images in a few-shot manner. To enable the text-based LLM to handle +image-related tasks, we carefully design an input instruction that allows the +LLM to produce segmentation results represented as polygons, and propose a +region-attribute table to simulate the human visual mechanism and provide +multi-modal guidance. We also synthesize pseudo samples and use curriculum +learning for pretraining to augment data and achieve better optimization. LLaFS +achieves state-of-the-art results on multiple datasets, showing the potential +of using LLMs for few-shot computer vision tasks. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ EcoSense: Energy-Efficient Intelligent Sensing for In-Shore Ship + Detection through Edge-Cloud Collaboration + + +
+ Detecting marine objects inshore presents challenges owing to algorithmic
+intricacies and complexities in system deployment. We propose a
+difficulty-aware edge-cloud collaborative sensing system that splits the task
+into object localization and fine-grained classification. Objects are
+classified either at the edge or within the cloud, based on their estimated
+difficulty. The framework comprises a low-power device-tailored front-end model
+for object localization, classification, and difficulty estimation, along with
+a transformer-graph convolutional network-based back-end model for fine-grained
+classification. Our system demonstrates superior performance (mAP@0.5 +4.3%)
+on widely used marine object detection datasets, significantly reducing both
+data transmission volume (by 95.43%) and energy consumption (by 72.7%) at the
+system level. We validate the proposed system across various embedded system
+platforms and in real-world scenarios involving drone deployment.
+
+
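The difficulty-aware routing described above reduces to a simple dispatch rule: easy crops keep the edge model's label, hard crops are offloaded to the cloud model. The sketch below is a toy illustration; every component name and the threshold are stand-ins, not the paper's system.

```python
# Minimal sketch of difficulty-aware edge/cloud routing.
from dataclasses import dataclass
from typing import Callable, List, Tuple

@dataclass
class Detection:
    crop_id: int
    difficulty: float   # estimated by the edge front-end
    edge_label: str

def route(detections: List[Detection],
          cloud_classify: Callable[[int], str],
          threshold: float = 0.5) -> List[Tuple[int, str]]:
    results = []
    for det in detections:
        if det.difficulty < threshold:
            results.append((det.crop_id, det.edge_label))               # stay on edge
        else:
            results.append((det.crop_id, cloud_classify(det.crop_id)))  # offload
    return results

dets = [Detection(0, 0.1, "buoy"), Detection(1, 0.9, "unknown")]
print(route(dets, cloud_classify=lambda i: "fishing-boat"))
```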
+
+
+
+
+ + ♻ ☆ Vision Transformers with Hierarchical Attention + + +
+ This paper tackles the high computational/space complexity associated with +Multi-Head Self-Attention (MHSA) in vanilla vision transformers. To this end, +we propose Hierarchical MHSA (H-MHSA), a novel approach that computes +self-attention in a hierarchical fashion. Specifically, we first divide the +input image into patches as commonly done, and each patch is viewed as a token. +Then, the proposed H-MHSA learns token relationships within local patches, +serving as local relationship modeling. Then, the small patches are merged into +larger ones, and H-MHSA models the global dependencies for the small number of +the merged tokens. At last, the local and global attentive features are +aggregated to obtain features with powerful representation capacity. Since we +only calculate attention for a limited number of tokens at each step, the +computational load is reduced dramatically. Hence, H-MHSA can efficiently model +global relationships among tokens without sacrificing fine-grained information. +With the H-MHSA module incorporated, we build a family of +Hierarchical-Attention-based Transformer Networks, namely HAT-Net. To +demonstrate the superiority of HAT-Net in scene understanding, we conduct +extensive experiments on fundamental vision tasks, including image +classification, semantic segmentation, object detection, and instance +segmentation. Therefore, HAT-Net provides a new perspective for vision +transformers. Code and pretrained models are available at +https://github.com/yun-liu/HAT-Net. + +
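The local-then-global attention scheme described above can be sketched compactly: attention within fixed windows first, then attention over tokens obtained by merging each window. The window size, dimensions, and the mean-pooling merge below are illustrative assumptions, not the H-MHSA design.

```python
# Minimal sketch of hierarchical self-attention in the spirit of H-MHSA.
import torch
import torch.nn as nn

class TinyHierarchicalAttention(nn.Module):
    def __init__(self, dim: int = 64, window: int = 4, heads: int = 4):
        super().__init__()
        self.window = window
        self.local_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.global_attn = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:   # x: (B, N, C)
        B, N, C = x.shape
        w = self.window
        # 1) local attention within each window of w tokens
        local = x.reshape(B * N // w, w, C)
        local, _ = self.local_attn(local, local, local)
        local = local.reshape(B, N, C)
        # 2) merge each window into one token and attend globally
        merged = local.reshape(B, N // w, w, C).mean(dim=2)
        merged, _ = self.global_attn(merged, merged, merged)
        # 3) broadcast the global context back and aggregate
        return local + merged.repeat_interleave(w, dim=1)

x = torch.randn(2, 16, 64)
print(TinyHierarchicalAttention()(x).shape)   # torch.Size([2, 16, 64])
```

Because each attention call operates over only w or N/w tokens, the per-step cost stays far below full N-token attention, which is the point the abstract makes.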
+
+ comment: Machine Intelligence Research (MIR), DOI: 10.1007/s11633-024-1393-8 +
+
+
+
+
+ + ♻ ☆ Domain-Aware Fine-Tuning: Enhancing Neural Network Adaptability + + +
+ Fine-tuning pre-trained neural network models has become a widely adopted +approach across various domains. However, it can lead to the distortion of +pre-trained feature extractors that already possess strong generalization +capabilities. Mitigating feature distortion during adaptation to new target +domains is crucial. Recent studies have shown promising results in handling +feature distortion by aligning the head layer on in-distribution datasets +before performing fine-tuning. Nonetheless, a significant limitation arises +from the treatment of batch normalization layers during fine-tuning, leading to +suboptimal performance. In this paper, we propose Domain-Aware Fine-Tuning +(DAFT), a novel approach that incorporates batch normalization conversion and +the integration of linear probing and fine-tuning. Our batch normalization +conversion method effectively mitigates feature distortion by reducing +modifications to the neural network during fine-tuning. Additionally, we +introduce the integration of linear probing and fine-tuning to optimize the +head layer with gradual adaptation of the feature extractor. By leveraging +batch normalization layers and integrating linear probing and fine-tuning, our +DAFT significantly mitigates feature distortion and achieves improved model +performance on both in-distribution and out-of-distribution datasets. Extensive +experiments demonstrate that our method outperforms other baseline methods, +demonstrating its effectiveness in not only improving performance but also +mitigating feature distortion. + +
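One ingredient mentioned above, integrating linear probing with fine-tuning, can be sketched as a two-phase schedule: train only the head first, then unfreeze the backbone at a lower learning rate. The model choice and hyper-parameters below are illustrative assumptions, not the DAFT recipe (which additionally converts batch normalization layers).

```python
# Minimal sketch of a linear-probe-then-fine-tune schedule.
import torch
from torchvision.models import resnet18

model = resnet18(num_classes=10)

# Phase 1: linear probing -- backbone frozen, only the head learns.
for p in model.parameters():
    p.requires_grad = False
for p in model.fc.parameters():
    p.requires_grad = True
probe_opt = torch.optim.SGD(model.fc.parameters(), lr=1e-2, momentum=0.9)

# Phase 2: fine-tuning -- everything trainable, backbone at a lower LR.
for p in model.parameters():
    p.requires_grad = True
finetune_opt = torch.optim.SGD([
    {"params": model.fc.parameters(), "lr": 1e-2},
    {"params": [p for n, p in model.named_parameters() if not n.startswith("fc")],
     "lr": 1e-4},
], momentum=0.9)
```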
+
+
+
+
+ + ♻ ☆ NeuS-PIR: Learning Relightable Neural Surface using Pre-Integrated + Rendering + + +
+ This paper presents a method, namely NeuS-PIR, for recovering relightable +neural surfaces using pre-integrated rendering from multi-view images or video. +Unlike methods based on NeRF and discrete meshes, our method utilizes implicit +neural surface representation to reconstruct high-quality geometry, which +facilitates the factorization of the radiance field into two components: a +spatially varying material field and an all-frequency lighting representation. +This factorization, jointly optimized using an adapted differentiable +pre-integrated rendering framework with material encoding regularization, in +turn addresses the ambiguity of geometry reconstruction and leads to better +disentanglement and refinement of each scene property. Additionally, we +introduced a method to distil indirect illumination fields from the learned +representations, further recovering the complex illumination effect like +inter-reflection. Consequently, our method enables advanced applications such +as relighting, which can be seamlessly integrated with modern graphics engines. +Qualitative and quantitative experiments have shown that NeuS-PIR outperforms +existing methods across various tasks on both synthetic and real datasets. +Source code is available at https://github.com/Sheldonmao/NeuSPIR + +
+
+
+
+
+ + ♻ ☆ Gaze-guided Hand-Object Interaction Synthesis: Benchmark and Method + + +
+ Gaze plays a crucial role in revealing human attention and intention, +shedding light on the cognitive processes behind human actions. The integration +of gaze guidance with the dynamics of hand-object interactions boosts the +accuracy of human motion prediction. However, the lack of datasets that capture +the intricate relationship and consistency among gaze, hand, and object +movements remains a substantial hurdle. In this paper, we introduce the first +Gaze-guided Hand-Object Interaction dataset, GazeHOI, and present a novel task +for synthesizing gaze-guided hand-object interactions. Our dataset, GazeHOI, +features simultaneous 3D modeling of gaze, hand, and object interactions, +comprising 479 sequences with an average duration of 19.1 seconds, 812 +sub-sequences, and 33 objects of various sizes. We propose a hierarchical +framework centered on a gaze-guided hand-object interaction diffusion model, +named GHO-Diffusion. In the pre-diffusion phase, we separate gaze conditions +into spatial-temporal features and goal pose conditions at different levels of +information granularity. During the diffusion phase, two gaze-conditioned +diffusion models are stacked to simplify the complex synthesis of hand-object +motions. Here, the object motion diffusion model generates sequences of object +motions based on gaze conditions, while the hand motion diffusion model +produces hand motions based on the generated object motion. To improve +fine-grained goal pose alignment, we introduce a Spherical Gaussian constraint +to guide the denoising step. In the subsequent post-diffusion phase, we +optimize the generated hand motions using contact consistency. Our extensive +experiments highlight the uniqueness of our dataset and the effectiveness of +our approach. + +
+
+
+
+
+ + ♻ ☆ Learning-based Axial Video Motion Magnification + + +
+ Video motion magnification amplifies invisible small motions to be
+perceptible, which provides humans with a spatially dense and holistic
+understanding of small motions in the scene of interest. This is based on the
+premise that magnifying small motions enhances the legibility of motions. In
+the real world, however, vibrating objects often possess convoluted systems
+that have complex natural frequencies, modes, and directions. Existing motion
+magnification often fails to improve legibility since the intricate motions
+still retain complex characteristics even after being magnified, which may
+distract us from analyzing them. In this work, we focus on improving legibility
+by proposing a new concept, axial motion magnification, which magnifies
+decomposed motions along the user-specified direction. Axial motion
+magnification can be applied to various applications where motions of specific
+axes are critical, by providing simplified and easily readable motion
+information. To achieve this, we propose a novel Motion Separation Module that
+disentangles and magnifies the motion representation along axes of interest.
+Furthermore, we build a new synthetic training dataset for the axial motion
+magnification task. Our proposed method improves the legibility of resulting
+motions along certain axes by adding a new feature: user controllability.
+Axial motion magnification is a more generalized concept; thus, our method can
+be directly adapted to generic motion magnification and achieves favorable
+performance against competing methods.
+
+
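Geometrically, axial magnification amounts to projecting a displacement field onto a user-specified axis and amplifying only that component. The sketch below illustrates this projection; it is not the paper's learned Motion Separation Module.

```python
# Minimal sketch of axis-selective motion magnification on a 2D flow field.
import numpy as np

def axial_magnify(motion: np.ndarray, axis: np.ndarray, alpha: float) -> np.ndarray:
    """motion: (..., 2) displacement field; axis: 2-vector; alpha: magnification gain."""
    axis = axis / np.linalg.norm(axis)
    along = (motion @ axis)[..., None] * axis    # component along the chosen axis
    across = motion - along                      # orthogonal component, left untouched
    return alpha * along + across                # magnify only the axial part

flow = np.random.randn(4, 4, 2) * 0.1
print(axial_magnify(flow, axis=np.array([1.0, 0.0]), alpha=10.0).shape)
```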
+
+ comment: main paper: 12 pages, supplementary: 10 pages, 20 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Decomposing Disease Descriptions for Enhanced Pathology Detection: A + Multi-Aspect Vision-Language Pre-training Framework CVPR2024 + + +
+ Medical vision language pre-training (VLP) has emerged as a frontier of +research, enabling zero-shot pathological recognition by comparing the query +image with the textual descriptions for each disease. Due to the complex +semantics of biomedical texts, current methods struggle to align medical images +with key pathological findings in unstructured reports. This leads to the +misalignment with the target disease's textual representation. In this paper, +we introduce a novel VLP framework designed to dissect disease descriptions +into their fundamental aspects, leveraging prior knowledge about the visual +manifestations of pathologies. This is achieved by consulting a large language +model and medical experts. Integrating a Transformer module, our approach +aligns an input image with the diverse elements of a disease, generating +aspect-centric image representations. By consolidating the matches from each +aspect, we improve the compatibility between an image and its associated +disease. Additionally, capitalizing on the aspect-oriented representations, we +present a dual-head Transformer tailored to process known and unknown diseases, +optimizing the comprehensive detection efficacy. Conducting experiments on +seven downstream datasets, ours improves the accuracy of recent methods by up +to 8.56% and 17.0% for seen and unseen categories, respectively. Our code is +released at https://github.com/HieuPhan33/MAVL. + +
+
+ comment: Accepted at CVPR2024. Pre-print before final camera-ready version +
+
+
+
+
+ + ♻ ☆ Lodge: A Coarse to Fine Diffusion Network for Long Dance Generation + Guided by the Characteristic Dance Primitives CVPR2024 + + +
+ We propose Lodge, a network capable of generating extremely long dance
+sequences conditioned on given music. We design Lodge as a two-stage
+coarse-to-fine diffusion architecture, and propose the characteristic dance
+primitives that possess significant expressiveness as intermediate
+representations between two diffusion models. The first stage is global
+diffusion, which focuses on comprehending the coarse-level music-dance
+correlation and producing the characteristic dance primitives. In contrast,
+the second stage is local diffusion, which generates detailed motion sequences
+in parallel under the guidance of the dance primitives and choreographic
+rules. In addition, we propose a Foot Refine Block to optimize the contact
+between the feet and the ground, enhancing the physical realism of the motion.
+Our approach can generate extremely long dance sequences in parallel, striking
+a balance between global choreographic patterns and local motion quality and
+expressiveness. Extensive experiments validate the efficacy of our method.
+
+
+
+ comment: Accepted by CVPR2024, Project page: + https://li-ronghui.github.io/lodge +
+
+
+
+
+ + ♻ ☆ Image Captioning in news report scenario + + +
+ Image captioning strives to generate pertinent captions for specified images, +situating itself at the crossroads of Computer Vision (CV) and Natural Language +Processing (NLP). This endeavor is of paramount importance with far-reaching +applications in recommendation systems, news outlets, social media, and beyond. +Particularly within the realm of news reporting, captions are expected to +encompass detailed information, such as the identities of celebrities captured +in the images. However, much of the existing body of work primarily centers +around understanding scenes and actions. In this paper, we explore the realm of +image captioning specifically tailored for celebrity photographs, illustrating +its broad potential for enhancing news industry practices. This exploration +aims to augment automated news content generation, thereby facilitating a more +nuanced dissemination of information. Our endeavor shows a broader horizon, +enriching the narrative in news reporting through a more intuitive image +captioning framework. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ LoCo: Locally Constrained Training-Free Layout-to-Image Synthesis + + +
+ Recent text-to-image diffusion models have reached an unprecedented level in +generating high-quality images. However, their exclusive reliance on textual +prompts often falls short in precise control of image compositions. In this +paper, we propose LoCo, a training-free approach for layout-to-image Synthesis +that excels in producing high-quality images aligned with both textual prompts +and layout instructions. Specifically, we introduce a Localized Attention +Constraint (LAC), leveraging semantic affinity between pixels in self-attention +maps to create precise representations of desired objects and effectively +ensure the accurate placement of objects in designated regions. We further +propose a Padding Token Constraint (PTC) to leverage the semantic information +embedded in previously neglected padding tokens, improving the consistency +between object appearance and layout instructions. LoCo seamlessly integrates +into existing text-to-image and layout-to-image models, enhancing their +performance in spatial control and addressing semantic failures observed in +prior methods. Extensive experiments showcase the superiority of our approach, +surpassing existing state-of-the-art training-free layout-to-image methods both +qualitatively and quantitatively across multiple benchmarks. + +
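The localized constraint described above can be approximated by a simple loss that rewards attention mass falling inside an object's layout box. The attention map, box format, and loss form below are illustrative stand-ins, not the paper's exact Localized Attention Constraint.

```python
# Minimal sketch of a box-localized attention loss.
import torch

def localized_attention_loss(attn: torch.Tensor, box: tuple) -> torch.Tensor:
    """attn: (H, W) attention map for one object's token; box: (x0, y0, x1, y1)."""
    x0, y0, x1, y1 = box
    mask = torch.zeros_like(attn)
    mask[y0:y1, x0:x1] = 1.0
    inside = (attn * mask).sum()
    return 1.0 - inside / (attn.sum() + 1e-8)   # 0 when all mass lies in the box

attn = torch.rand(16, 16)
print(float(localized_attention_loss(attn, box=(4, 4, 12, 12))))
```

In a training-free setup such a loss would be used to update the latents at sampling time rather than any model weights.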
+
+ comment: Demo: https://huggingface.co/spaces/Pusheen/LoCo; Project page: + https://momopusheen.github.io/LoCo/ +
+
+
+
+
+ + ♻ ☆ $\texttt{NePhi}$: Neural Deformation Fields for Approximately + Diffeomorphic Medical Image Registration + + +
+ This work proposes NePhi, a generalizable neural deformation model which +results in approximately diffeomorphic transformations. In contrast to the +predominant voxel-based transformation fields used in learning-based +registration approaches, NePhi represents deformations functionally, leading to +great flexibility within the design space of memory consumption during training +and inference, inference time, registration accuracy, as well as transformation +regularity. Specifically, NePhi 1) requires less memory compared to voxel-based +learning approaches, 2) improves inference speed by predicting latent codes, +compared to current existing neural deformation based registration approaches +that \emph{only} rely on optimization, 3) improves accuracy via instance +optimization, and 4) shows excellent deformation regularity which is highly +desirable for medical image registration. We demonstrate the performance of +NePhi on a 2D synthetic dataset as well as for real 3D lung registration. Our +results show that NePhi can match the accuracy of voxel-based representations +in a single-resolution registration setting. For multi-resolution registration, +our method matches the accuracy of current SOTA learning-based registration +approaches with instance optimization while reducing memory requirements by a +factor of five. + +
+
+
+
+
+ + ♻ ☆ A Novel Approach to Industrial Defect Generation through Blended Latent + Diffusion Model with Online Adaptation + + +
+ Effectively addressing the challenge of industrial Anomaly Detection (AD) +necessitates an ample supply of defective samples, a constraint often hindered +by their scarcity in industrial contexts. This paper introduces a novel +algorithm designed to augment defective samples, thereby enhancing AD +performance. The proposed method tailors the blended latent diffusion model for +defect sample generation, employing a diffusion model to generate defective +samples in the latent space. A feature editing process, controlled by a +``trimap" mask and text prompts, refines the generated samples. The image +generation inference process is structured into three stages: a free diffusion +stage, an editing diffusion stage, and an online decoder adaptation stage. This +sophisticated inference strategy yields high-quality synthetic defective +samples with diverse pattern variations, leading to significantly improved AD +accuracies based on the augmented training set. Specifically, on the widely +recognized MVTec AD dataset, the proposed method elevates the state-of-the-art +(SOTA) performance of AD with augmented data by 1.5%, 1.9%, and 3.1% for AD +metrics AP, IAP, and IAP90, respectively. The implementation code of this work +can be found at the GitHub repository +https://github.com/GrandpaXun242/AdaBLDM.git + +
+
+ comment: 13 pages,7 figures +
+
+
+
+
+ + ♻ ☆ X-Portrait: Expressive Portrait Animation with Hierarchical Motion + Attention + + +
+ We propose X-Portrait, an innovative conditional diffusion model tailored for
+generating expressive and temporally coherent portrait animation. Specifically,
+given a single portrait as appearance reference, we aim to animate it with
+motion derived from a driving video, capturing both highly dynamic and subtle
+facial expressions along with wide-range head movements. At its core, we
+leverage the generative prior of a pre-trained diffusion model as the rendering
+backbone, while achieving fine-grained head pose and expression control with
+novel control signals within the framework of ControlNet. In contrast to
+conventional coarse explicit controls such as facial landmarks, our motion
+control module learns to interpret the dynamics directly from the original
+driving RGB inputs. The motion accuracy is further enhanced with a patch-based
+local control module that effectively enhances the motion attention to
+small-scale nuances such as eyeball positions. Notably, to mitigate the
+identity leakage from the driving signals, we train our motion control modules
+with scaling-augmented cross-identity images, ensuring maximized
+disentanglement from the appearance reference modules. Experimental results
+demonstrate the universal effectiveness of X-Portrait across a diverse range of
+facial portraits and expressive driving sequences, and showcase its proficiency
+in generating captivating portrait animations with consistently maintained
+identity characteristics.
+
+
+
+
+
+
+ + ♻ ☆ VMRNN: Integrating Vision Mamba and LSTM for Efficient and Accurate + Spatiotemporal Forecasting + + +
+ Combining CNNs or ViTs, with RNNs for spatiotemporal forecasting, has yielded +unparalleled results in predicting temporal and spatial dynamics. However, +modeling extensive global information remains a formidable challenge; CNNs are +limited by their narrow receptive fields, and ViTs struggle with the intensive +computational demands of their attention mechanisms. The emergence of recent +Mamba-based architectures has been met with enthusiasm for their exceptional +long-sequence modeling capabilities, surpassing established vision models in +efficiency and accuracy, which motivates us to develop an innovative +architecture tailored for spatiotemporal forecasting. In this paper, we propose +the VMRNN cell, a new recurrent unit that integrates the strengths of Vision +Mamba blocks with LSTM. We construct a network centered on VMRNN cells to +tackle spatiotemporal prediction tasks effectively. Our extensive evaluations +show that our proposed approach secures competitive results on a variety of +tasks while maintaining a smaller model size. Our code is available at +https://github.com/yyyujintang/VMRNN-PyTorch. + +
+
+ comment: 11 pages, 7 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ DiffCast: A Unified Framework via Residual Diffusion for Precipitation + Nowcasting CVPR 2024 + + +
+ Precipitation nowcasting is an important spatio-temporal prediction task that
+predicts radar echo sequences based on current observations, which can serve
+both meteorological science and smart city applications. Due to the chaotic
+evolution of precipitation systems, it is a very challenging problem. Previous
+studies address the problem either from the perspective of deterministic
+modeling or probabilistic modeling. However, their predictions suffer from
+blurriness, fading of high-value echoes, and inaccurate positions. The root
+cause of these issues is that the chaotic, evolving precipitation systems are
+not appropriately modeled. Inspired by the nature of the systems, we propose to
+decompose and model them from the perspective of global deterministic motion
+and local stochastic variations with a residual mechanism. A unified and
+flexible framework that can equip any type of spatio-temporal model is proposed
+based on residual diffusion, which effectively tackles the shortcomings of
+previous methods. Extensive experimental results on four publicly available
+radar datasets demonstrate the effectiveness and superiority of the proposed
+framework, compared to state-of-the-art techniques. Our code is publicly
+available at https://github.com/DeminYu98/DiffCast.
+
+
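The deterministic-plus-residual decomposition described above composes two modules: a backbone that predicts the global motion and a stochastic component that models only the residual. The sketch below shows the composition with placeholder modules; it is not the DiffCast framework, whose residual component is a diffusion model.

```python
# Minimal sketch of a residual decomposition for spatio-temporal forecasting.
import torch
import torch.nn as nn

class ResidualForecaster(nn.Module):
    def __init__(self, backbone: nn.Module, residual_sampler: nn.Module):
        super().__init__()
        self.backbone = backbone                  # deterministic global prediction
        self.residual_sampler = residual_sampler  # stochastic local variations

    def forward(self, past_frames: torch.Tensor) -> torch.Tensor:
        mu = self.backbone(past_frames)                # global deterministic motion
        residual = self.residual_sampler(past_frames)  # local stochastic part
        return mu + residual

# Toy usage: 4 past radar frames in, 4 future frames out, with conv placeholders.
toy = ResidualForecaster(nn.Conv2d(4, 4, 3, padding=1), nn.Conv2d(4, 4, 3, padding=1))
print(toy(torch.randn(2, 4, 32, 32)).shape)   # torch.Size([2, 4, 32, 32])
```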
+
+ comment: CVPR 2024; https://github.com/DeminYu98/DiffCast +
+
+
+
+
+ + ♻ ☆ Diffusion Models Generate Images Like Painters: an Analytical Theory of + Outline First, Details Later NeurIPS23 + + +
+ How do diffusion generative models convert pure noise into meaningful images? +In a variety of pretrained diffusion models (including conditional latent space +models like Stable Diffusion), we observe that the reverse diffusion process +that underlies image generation has the following properties: (i) individual +trajectories tend to be low-dimensional and resemble 2D `rotations'; (ii) +high-variance scene features like layout tend to emerge earlier, while +low-variance details tend to emerge later; and (iii) early perturbations tend +to have a greater impact on image content than later perturbations. To +understand these phenomena, we derive and study a closed-form solution to the +probability flow ODE for a Gaussian distribution, which shows that the reverse +diffusion state rotates towards a gradually-specified target on the image +manifold. It also shows that generation involves first committing to an +outline, and then to finer and finer details. We find that this solution +accurately describes the initial phase of image generation for pretrained +models, and can in principle be used to make image generation more efficient by +skipping reverse diffusion steps. Finally, we use our solution to characterize +the image manifold in Stable Diffusion. Our viewpoint reveals an unexpected +similarity between generation by GANs and diffusion and provides a conceptual +link between diffusion and image retrieval. + +
+
+ comment: 44 pages, 28 figures. A briefer version was presented at NeurIPS23 + Workshop on Diffusion Models [arXiv:2311.10892] +
+
+
+
+
+ + ♻ ☆ Confidence-Triggered Detection: Accelerating Real-time + Tracking-by-detection Systems + + +
+ Real-time object tracking necessitates a delicate balance between speed and +accuracy, a challenge exacerbated by the computational demands of deep learning +methods. In this paper, we propose Confidence-Triggered Detection (CTD), an +innovative approach that strategically bypasses object detection for frames +closely resembling intermediate states, leveraging tracker confidence scores. +CTD not only enhances tracking speed but also preserves accuracy, surpassing +existing tracking algorithms. Through extensive evaluation across various +tracker confidence thresholds, we identify an optimal trade-off between +tracking speed and accuracy, providing crucial insights for parameter +fine-tuning and enhancing CTD's practicality in real-world scenarios. Our +experiments across diverse detection models underscore the robustness and +versatility of the CTD framework, demonstrating its potential to enable +real-time tracking in resource-constrained environments. + +
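The triggering rule described above is simple to express: run the expensive detector only when the tracker's confidence drops below a threshold, otherwise keep the tracker's prediction. The sketch below is a toy illustration; the detector, tracker, and threshold are stand-ins, not the paper's components.

```python
# Minimal sketch of confidence-triggered detection in a tracking loop.
from typing import Callable, List, Tuple

def track_sequence(frames: List[object],
                   detect: Callable[[object], Tuple[list, float]],
                   track: Callable[[object], Tuple[list, float]],
                   threshold: float = 0.6) -> List[list]:
    outputs, detector_calls = [], 0
    for frame in frames:
        boxes, conf = track(frame)
        if conf < threshold:          # tracker unsure -> fall back to detection
            boxes, conf = detect(frame)
            detector_calls += 1
        outputs.append(boxes)
    print(f"detector invoked on {detector_calls}/{len(frames)} frames")
    return outputs

# Toy run: the tracker is confident on all but every fifth frame.
frames = list(range(20))
track_sequence(frames,
               detect=lambda f: (["det_box"], 0.99),
               track=lambda f: (["trk_box"], 0.3 if f % 5 == 0 else 0.9))
```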
+
+ comment: 9 pages, 5 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Troika: Multi-Path Cross-Modal Traction for Compositional Zero-Shot + Learning CVPR 2024 + + +
+ Recent compositional zero-shot learning (CZSL) methods adapt pre-trained +vision-language models (VLMs) by constructing trainable prompts only for +composed state-object pairs. Relying on learning the joint representation of +seen compositions, these methods ignore the explicit modeling of the state and +object, thus limiting the exploitation of pre-trained knowledge and +generalization to unseen compositions. With a particular focus on the +universality of the solution, in this work, we propose a novel paradigm for +CZSL models that establishes three identification branches (i.e., Multi-Path) +to jointly model the state, object, and composition. The presented Troika is +our implementation that aligns the branch-specific prompt representations with +decomposed visual features. To calibrate the bias between semantically similar +multi-modal representations, we further devise a Cross-Modal Traction module +into Troika that shifts the prompt representation towards the current visual +content. We conduct extensive experiments on three popular benchmarks, where +our method significantly outperforms existing methods in both closed-world and +open-world settings. The code will be available at +https://github.com/bighuang624/Troika. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Joint Learning Neuronal Skeleton and Brain Circuit Topology with + Permutation Invariant Encoders for Neuron Classification AAAI 2024 + + +
+ Determining the types of neurons within a nervous system plays a significant
+role in the analysis of brain connectomics and the investigation of
+neurological diseases. However, utilizing anatomical, physiological, or
+molecular characteristics of neurons is relatively inefficient and costly. With
+the advancements in electron microscopy imaging and analysis techniques for
+brain tissue, we are able to obtain whole-brain connectomes consisting of
+high-resolution neuronal morphology and connectivity information. However, few
+models are built based on such data for automated neuron classification. In
+this paper, we propose NeuNet, a framework that combines morphological
+information of neurons obtained from the skeleton and topological information
+between neurons obtained from the neural circuit. Specifically, NeuNet consists
+of three components, namely Skeleton Encoder, Connectome Encoder, and Readout
+Layer. Skeleton Encoder integrates the local information of neurons in a
+bottom-up manner, applying a one-dimensional convolution to the neural
+skeleton's point data; Connectome Encoder uses a graph neural network to
+capture the topological information of the neural circuit; finally, Readout
+Layer fuses these two sources of information and outputs classification
+results. We reprocess and release two new datasets for the neuron
+classification task from volume electron microscopy (VEM) images of the human
+brain cortex and Drosophila brain. Experiments on these two datasets
+demonstrate the effectiveness of our model with accuracies of 0.9169 and
+0.9363, respectively. Code and data are available at:
+https://github.com/WHUminghui/NeuNet.
+
+
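The two-branch fusion described above can be sketched with a 1D convolution over ordered skeleton points and a single round of neighbour aggregation on the circuit graph. Dimensions, the aggregation, and the fusion below are illustrative assumptions, not the NeuNet architecture.

```python
# Minimal sketch of fusing a skeleton encoder with a connectome encoder.
import torch
import torch.nn as nn

class TinyNeuronClassifier(nn.Module):
    def __init__(self, point_dim=3, hidden=32, num_classes=5):
        super().__init__()
        self.skeleton_enc = nn.Sequential(
            nn.Conv1d(point_dim, hidden, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
        )
        self.graph_proj = nn.Linear(hidden, hidden)
        self.readout = nn.Linear(2 * hidden, num_classes)

    def forward(self, points, node_feats, adj):
        # points: (N, 3, P) skeleton point sequences, one per neuron
        # node_feats: (N, hidden) per-neuron circuit features; adj: (N, N)
        skel = self.skeleton_enc(points).squeeze(-1)            # (N, hidden)
        neigh = adj @ node_feats / adj.sum(-1, keepdim=True).clamp(min=1)
        topo = torch.relu(self.graph_proj(neigh))               # (N, hidden)
        return self.readout(torch.cat([skel, topo], dim=-1))    # (N, classes)

m = TinyNeuronClassifier()
out = m(torch.randn(6, 3, 128), torch.randn(6, 32), torch.eye(6))
print(out.shape)   # torch.Size([6, 5])
```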
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Doubly Abductive Counterfactual Inference for Text-based Image Editing CVPR 2024 + + +
+ We study text-based image editing (TBIE) of a single image by counterfactual +inference because it is an elegant formulation to precisely address the +requirement: the edited image should retain the fidelity of the original one. +Through the lens of the formulation, we find that the crux of TBIE is that +existing techniques hardly achieve a good trade-off between editability and +fidelity, mainly due to the overfitting of the single-image fine-tuning. To +this end, we propose a Doubly Abductive Counterfactual inference framework +(DAC). We first parameterize an exogenous variable as a UNet LoRA, whose +abduction can encode all the image details. Second, we abduct another exogenous +variable parameterized by a text encoder LoRA, which recovers the lost +editability caused by the overfitted first abduction. Thanks to the second +abduction, which exclusively encodes the visual transition from post-edit to +pre-edit, its inversion -- subtracting the LoRA -- effectively reverts pre-edit +back to post-edit, thereby accomplishing the edit. Through extensive +experiments, our DAC achieves a good trade-off between editability and +fidelity. Thus, we can support a wide spectrum of user editing intents, +including addition, removal, manipulation, replacement, style transfer, and +facial change, which are extensively validated in both qualitative and +quantitative evaluations. Codes are in https://github.com/xuesong39/DAC. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ PKU-DyMVHumans: A Multi-View Video Benchmark for High-Fidelity Dynamic + Human Modeling + + +
+ High-quality human reconstruction and photo-realistic rendering of a dynamic +scene is a long-standing problem in computer vision and graphics. Despite +considerable efforts invested in developing various capture systems and +reconstruction algorithms, recent advancements still struggle with loose or +oversized clothing and overly complex poses. In part, this is due to the +challenges of acquiring high-quality human datasets. To facilitate the +development of these fields, in this paper, we present PKU-DyMVHumans, a +versatile human-centric dataset for high-fidelity reconstruction and rendering +of dynamic human scenarios from dense multi-view videos. It comprises 8.2 +million frames captured by more than 56 synchronized cameras across diverse +scenarios. These sequences comprise 32 human subjects across 45 different +scenarios, each with a high-detailed appearance and realistic human motion. +Inspired by recent advancements in neural radiance field (NeRF)-based scene +representations, we carefully set up an off-the-shelf framework that is easy to +provide those state-of-the-art NeRF-based implementations and benchmark on +PKU-DyMVHumans dataset. It is paving the way for various applications like +fine-grained foreground/background decomposition, high-quality human +reconstruction and photo-realistic novel view synthesis of a dynamic scene. +Extensive studies are performed on the benchmark, demonstrating new +observations and challenges that emerge from using such high-fidelity dynamic +data. The dataset is available at: https://pku-dymvhumans.github.io. + +
+
+
+
+
+ + ♻ ☆ Preserve Your Own Correlation: A Noise Prior for Video Diffusion Models ICCV 2023 + + +
+ Despite tremendous progress in generating high-quality images using diffusion +models, synthesizing a sequence of animated frames that are both photorealistic +and temporally coherent is still in its infancy. While off-the-shelf +billion-scale datasets for image generation are available, collecting similar +video data of the same scale is still challenging. Also, training a video +diffusion model is computationally much more expensive than its image +counterpart. In this work, we explore finetuning a pretrained image diffusion +model with video data as a practical solution for the video synthesis task. We +find that naively extending the image noise prior to video noise prior in video +diffusion leads to sub-optimal performance. Our carefully designed video noise +prior leads to substantially better performance. Extensive experimental +validation shows that our model, Preserve Your Own Correlation (PYoCo), attains +SOTA zero-shot text-to-video results on the UCF-101 and MSR-VTT benchmarks. It +also achieves SOTA video generation quality on the small-scale UCF-101 +benchmark with a $10\times$ smaller model using significantly less computation +than the prior art. + +
+
+ comment: ICCV 2023. Project webpage: + https://research.nvidia.com/labs/dir/pyoco +
+
+
+
+
+ + ♻ ☆ MEDDAP: Medical Dataset Enhancement via Diversified Augmentation + Pipeline MICCAI-2024 + + +
+ The effectiveness of Deep Neural Networks (DNNs) heavily relies on the +abundance and accuracy of available training data. However, collecting and +annotating data on a large scale is often both costly and time-intensive, +particularly in medical cases where practitioners are already occupied with +their duties. Moreover, ensuring that the model remains robust across various +scenarios of image capture is crucial in medical domains, especially when +dealing with ultrasound images that vary based on the settings of different +devices and the manual operation of the transducer. To address this challenge, +we introduce a novel pipeline called MEDDAP, which leverages Stable Diffusion +(SD) models to augment existing small datasets by automatically generating new +informative labeled samples. Pretrained checkpoints for SD are typically based +on natural images, and training them for medical images requires significant +GPU resources due to their heavy parameters. To overcome this challenge, we +introduce USLoRA (Ultrasound Low-Rank Adaptation), a novel fine-tuning method +tailored specifically for ultrasound applications. USLoRA allows for selective +fine-tuning of weights within SD, requiring fewer than 0.1\% of parameters +compared to fully fine-tuning only the UNet portion of SD. To enhance dataset +diversity, we incorporate different adjectives into the generation process +prompts, thereby desensitizing the classifiers to intensity changes across +different images. This approach is inspired by clinicians' decision-making +processes regarding breast tumors, where tumor shape often plays a more crucial +role than intensity. In conclusion, our pipeline not only outperforms +classifiers trained on the original dataset but also demonstrates superior +performance when encountering unseen datasets. The source code is available at +https://github.com/yasamin-med/MEDDAP. + +
+
+ comment: Submitted to MICCAI-2024
+
+
+
+
+ + ♻ ☆ HOOD: Real-Time Human Presence and Out-of-Distribution Detection Using + FMCW Radar + + +
+ Detecting human presence indoors with millimeter-wave frequency-modulated +continuous-wave (FMCW) radar faces challenges from both moving and stationary +clutter. This work proposes a robust and real-time capable human presence and +out-of-distribution (OOD) detection method using 60 GHz short-range FMCW radar. +HOOD solves the human presence and OOD detection problems simultaneously in a +single pipeline. Our solution relies on a reconstruction-based architecture and +works with radar macro and micro range-Doppler images (RDIs). HOOD aims to +accurately detect the presence of humans in the presence or absence of moving +and stationary disturbers. Since HOOD is also an OOD detector, it aims to +detect moving or stationary clutters as OOD in humans' absence and predicts the +current scene's output as "no presence." HOOD performs well in diverse +scenarios, demonstrating its effectiveness across different human activities +and situations. On our dataset collected with a 60 GHz short-range FMCW radar, +we achieve an average AUROC of 94.36%. Additionally, our extensive evaluations +and experiments demonstrate that HOOD outperforms state-of-the-art (SOTA) OOD +detection methods in terms of common OOD detection metrics. Importantly, HOOD +also perfectly fits on Raspberry Pi 3B+ with an ARM Cortex-A53 CPU, which +showcases its versatility across different hardware environments. Videos of our +human presence detection experiments are available at: +https://muskahya.github.io/HOOD + +
+
+ comment: 10 pages, 2 figures, project page: https://muskahya.github.io/HOOD +
+
+
+
+
+ + ♻ ☆ HIVE: Harnessing Human Feedback for Instructional Visual Editing CVPR + + +
+ Incorporating human feedback has been shown to be crucial to align text +generated by large language models to human preferences. We hypothesize that +state-of-the-art instructional image editing models, where outputs are +generated based on an input image and an editing instruction, could similarly +benefit from human feedback, as their outputs may not adhere to the correct +instructions and preferences of users. In this paper, we present a novel +framework to harness human feedback for instructional visual editing (HIVE). +Specifically, we collect human feedback on the edited images and learn a reward +function to capture the underlying user preferences. We then introduce scalable +diffusion model fine-tuning methods that can incorporate human preferences +based on the estimated reward. Besides, to mitigate the bias brought by the +limitation of data, we contribute a new 1M training dataset, a 3.6K reward +dataset for rewards learning, and a 1K evaluation dataset to boost the +performance of instructional image editing. We conduct extensive empirical +experiments quantitatively and qualitatively, showing that HIVE is favored over +previous state-of-the-art instructional image editing approaches by a large +margin. + +
+
+ comment: In CVPR, 2024 +
+
+
+
+
+ + ♻ ☆ ERM++: An Improved Baseline for Domain Generalization + + +
+ Domain Generalization (DG) measures a classifier's ability to generalize to
+new distributions of data it was not trained on. Recent work has shown that a
+hyperparameter-tuned Empirical Risk Minimization (ERM) training procedure, that
+is, simply minimizing the empirical risk on the source domains, can outperform
+most existing DG methods. ERM has achieved such strong results while only
+tuning hyper-parameters such as learning rate, weight decay, batch size, and
+dropout. However, there are additional hyperparameters which further limit
+overfitting and catastrophic forgetting. We therefore focus on tuning
+previously untuned hyper-parameters, including training amount, initialization,
+and additional regularizers. We call the resulting stronger baseline ERM++.
+ERM++ improves the performance of DG by over 5% compared to prior ERM baselines
+on a standard benchmark of 5 datasets with a ResNet-50 and over 15% with a
+ViT-B/16, and outperforms all SOTA methods on DomainBed with both
+architectures. We also explore the relationship between DG performance and
+similarity to pre-training data, and find that similarity to pre-training data
+distributions is an important driver of performance, but that ERM++ with
+stronger initializations can deliver strong performance even on dissimilar
+datasets. Code is released at https://github.com/piotr-teterwak/erm_plusplus.
+
+
+
+ comment: An improved baseline for Domain Generalization +
+
+
+
+
+ + ♻ ☆ Step-Calibrated Diffusion for Biomedical Optical Image Restoration + + +
+ High-quality, high-resolution medical imaging is essential for clinical care. +Raman-based biomedical optical imaging uses non-ionizing infrared radiation to +evaluate human tissues in real time and is used for early cancer detection, +brain tumor diagnosis, and intraoperative tissue analysis. Unfortunately, +optical imaging is vulnerable to image degradation due to laser scattering and +absorption, which can result in diagnostic errors and misguided treatment. +Restoration of optical images is a challenging computer vision task because the +sources of image degradation are multi-factorial, stochastic, and +tissue-dependent, preventing a straightforward method to obtain paired +low-quality/high-quality data. Here, we present Restorative Step-Calibrated +Diffusion (RSCD), an unpaired image restoration method that views the image +restoration problem as completing the finishing steps of a diffusion-based +image generation task. RSCD uses a step calibrator model to dynamically +determine the severity of image degradation and the number of steps required to +complete the reverse diffusion process for image restoration. RSCD outperforms +other widely used unpaired image restoration methods on both image quality and +perceptual evaluation metrics for restoring optical images. Medical imaging +experts consistently prefer images restored using RSCD in blinded comparison +experiments and report minimal to no hallucinations. Finally, we show that RSCD +improves performance on downstream clinical imaging tasks, including automated +brain tumor diagnosis and deep tissue imaging. Our code is available at +https://github.com/MLNeurosurg/restorative_step-calibrated_diffusion. + +
+
+
+
+
+ + ♻ ☆ Visual Whole-Body Control for Legged Loco-Manipulation + + +
+ We study the problem of mobile manipulation using legged robots equipped with
+an arm, namely legged loco-manipulation. The robot legs, while usually utilized
+for mobility, offer an opportunity to amplify the manipulation capabilities by
+conducting whole-body control. That is, the robot can control the legs and the
+arm at the same time to extend its workspace. We propose a framework that can
+conduct the whole-body control autonomously with visual observations. Our
+approach, namely Visual Whole-Body Control (VBC), is composed of a low-level
+policy using all degrees of freedom to track the end-effector manipulator
+position and a high-level policy proposing the end-effector position based on
+visual inputs. We train both levels of policies in simulation and perform
+Sim2Real transfer for real robot deployment. We perform extensive experiments
+and show significant improvements over baselines in picking up diverse objects
+in different configurations (heights, locations, orientations) and
+environments. Project page: https://wholebody-b1.github.io
+
+
+
+ comment: The first two authors contribute equally. Project page: + https://wholebody-b1.github.io +
+
+
+
+
+ + ♻ ☆ CLAMP: Contrastive LAnguage Model Prompt-tuning + + +
+ Large language models (LLMs) have emerged as powerful general-purpose +interfaces for many machine learning problems. Recent work has adapted LLMs to +generative visual tasks like image captioning, visual question answering, and +visual chat, using a relatively small amount of instruction-tuning data. In +this paper, we explore whether modern LLMs can also be adapted to classifying +an image into a set of categories. First, we evaluate multimodal LLMs that are +tuned for generative tasks on zero-shot image classification and find that +their performance is far below that of specialized models like CLIP. We then +propose an approach for light fine-tuning of LLMs using the same contrastive +image-caption matching objective as CLIP. Our results show that LLMs can, +indeed, achieve good image classification performance when adapted this way. +Our approach beats state-of-the-art mLLMs by 13% and slightly outperforms +contrastive learning with a custom text model, while also retaining the LLM's +generative abilities. LLM initialization appears to particularly help +classification in domains under-represented in the visual pre-training data. + +
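+ The abstract refers to the CLIP-style contrastive image-caption matching
+objective; the sketch below shows that symmetric loss in isolation. The
+encoders and the temperature value are placeholders, not the paper's models.
+
+ import torch
+ import torch.nn.functional as F
+
+ def contrastive_matching_loss(image_feats, text_feats, temperature=0.07):
+     # Normalize so the dot product is a cosine similarity.
+     image_feats = F.normalize(image_feats, dim=-1)
+     text_feats = F.normalize(text_feats, dim=-1)
+     logits = image_feats @ text_feats.t() / temperature   # (B, B) similarity matrix
+     targets = torch.arange(logits.size(0), device=logits.device)
+     # Matched image/caption pairs sit on the diagonal; penalize both directions.
+     return 0.5 * (F.cross_entropy(logits, targets) +
+                   F.cross_entropy(logits.t(), targets))
+
+ loss = contrastive_matching_loss(torch.randn(8, 512), torch.randn(8, 512))
+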
+
+
+
+
+ + ♻ ☆ Fast Point Cloud to Mesh Reconstruction for Deformable Object Tracking + + +
+ The world around us is full of soft objects we perceive and deform with +dexterous hand movements. For a robotic hand to control soft objects, it has to +acquire online state feedback of the deforming object. While RGB-D cameras can +collect occluded point clouds at a rate of 30Hz, this does not represent a +continuously trackable object surface. Hence, in this work, we developed a +method that takes as input a template mesh which is the mesh of an object in +its non-deformed state and a deformed point cloud of the same object, and then +shapes the template mesh such that it matches the deformed point cloud. The +reconstruction of meshes from point clouds has long been studied in the field +of Computer graphics under 3D reconstruction and 4D reconstruction, however, +both lack the speed and generalizability needed for robotics applications. Our +model is designed using a point cloud auto-encoder and a Real-NVP architecture. +Our trained model can perform mesh reconstruction and tracking at a rate of +58Hz on a template mesh of 3000 vertices and a deformed point cloud of 5000 +points and is generalizable to the deformations of six different object +categories which are assumed to be made of soft material in our experiments +(scissors, hammer, foam brick, cleanser bottle, orange, and dice). The object +meshes are taken from the YCB benchmark dataset. An instance of a downstream +application can be the control algorithm for a robotic hand that requires +online feedback from the state of the manipulated object which would allow +online grasp adaptation in a closed-loop manner. Furthermore, the tracking +capacity of our method can help in the system identification of deforming +objects in a marker-free approach. In future work, we will extend our trained +model to generalize beyond six object categories and additionally to real-world +deforming point clouds. + +
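+ The model above mentions a Real-NVP architecture; as background, here is a
+single affine coupling layer, the invertible building block that family of
+models is made of. The dimensions and how the layer would be conditioned on
+the point-cloud latent are placeholders, not the paper's design.
+
+ import torch
+ import torch.nn as nn
+
+ class AffineCoupling(nn.Module):
+     def __init__(self, dim, hidden=128):
+         super().__init__()
+         self.half = dim // 2
+         self.net = nn.Sequential(nn.Linear(self.half, hidden), nn.ReLU(),
+                                  nn.Linear(hidden, 2 * (dim - self.half)))
+
+     def forward(self, x):
+         x1, x2 = x[:, :self.half], x[:, self.half:]
+         scale, shift = self.net(x1).chunk(2, dim=-1)
+         y2 = x2 * torch.exp(torch.tanh(scale)) + shift    # invertible affine transform
+         return torch.cat([x1, y2], dim=-1)
+
+     def inverse(self, y):
+         y1, y2 = y[:, :self.half], y[:, self.half:]
+         scale, shift = self.net(y1).chunk(2, dim=-1)
+         x2 = (y2 - shift) * torch.exp(-torch.tanh(scale))
+         return torch.cat([y1, x2], dim=-1)
+
+ layer = AffineCoupling(dim=6)
+ pts = torch.randn(5, 6)   # e.g. vertex coordinates concatenated with a small feature
+ assert torch.allclose(layer.inverse(layer(pts)), pts, atol=1e-5)  # exact invertibility
+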
+
+ comment: 8 pages with appendix,16 figures +
+
+
+
+
+ + ♻ ☆ SplaTAM: Splat, Track & Map 3D Gaussians for Dense RGB-D SLAM CVPR 2024 + + +
+ Dense simultaneous localization and mapping (SLAM) is crucial for robotics +and augmented reality applications. However, current methods are often hampered +by the non-volumetric or implicit way they represent a scene. This work +introduces SplaTAM, an approach that, for the first time, leverages explicit +volumetric representations, i.e., 3D Gaussians, to enable high-fidelity +reconstruction from a single unposed RGB-D camera, surpassing the capabilities +of existing methods. SplaTAM employs a simple online tracking and mapping +system tailored to the underlying Gaussian representation. It utilizes a +silhouette mask to elegantly capture the presence of scene density. This +combination enables several benefits over prior representations, including fast +rendering and dense optimization, quickly determining if areas have been +previously mapped, and structured map expansion by adding more Gaussians. +Extensive experiments show that SplaTAM achieves up to 2x superior performance +in camera pose estimation, map construction, and novel-view synthesis over +existing methods, paving the way for more immersive high-fidelity SLAM +applications. + +
+
+ comment: CVPR 2024. Website: https://spla-tam.github.io/ +
+
+
+
+
+ + ♻ ☆ FoundationPose: Unified 6D Pose Estimation and Tracking of Novel Objects + + +
+ We present FoundationPose, a unified foundation model for 6D object pose +estimation and tracking, supporting both model-based and model-free setups. Our +approach can be instantly applied at test-time to a novel object without +fine-tuning, as long as its CAD model is given, or a small number of reference +images are captured. We bridge the gap between these two setups with a neural +implicit representation that allows for effective novel view synthesis, keeping +the downstream pose estimation modules invariant under the same unified +framework. Strong generalizability is achieved via large-scale synthetic +training, aided by a large language model (LLM), a novel transformer-based +architecture, and contrastive learning formulation. Extensive evaluation on +multiple public datasets involving challenging scenarios and objects indicate +our unified approach outperforms existing methods specialized for each task by +a large margin. In addition, it even achieves comparable results to +instance-level methods despite the reduced assumptions. Project page: +https://nvlabs.github.io/FoundationPose/ + +
+
+
+
+
+ + ♻ ☆ Living Scenes: Multi-object Relocalization and Reconstruction in + Changing 3D Environments CVPR 2024 + + +
+ Research into dynamic 3D scene understanding has primarily focused on +short-term change tracking from dense observations, while little attention has +been paid to long-term changes with sparse observations. We address this gap +with MoRE, a novel approach for multi-object relocalization and reconstruction +in evolving environments. We view these environments as "living scenes" and +consider the problem of transforming scans taken at different points in time +into a 3D reconstruction of the object instances, whose accuracy and +completeness increase over time. At the core of our method lies an +SE(3)-equivariant representation in a single encoder-decoder network, trained +on synthetic data. This representation enables us to seamlessly tackle instance +matching, registration, and reconstruction. We also introduce a joint +optimization algorithm that facilitates the accumulation of point clouds +originating from the same instance across multiple scans taken at different +points in time. We validate our method on synthetic and real-world data and +demonstrate state-of-the-art performance in both end-to-end performance and +individual subtasks. + +
+
+ comment: CVPR 2024 camera-ready +
+
+
+
+
+ + ♻ ☆ Object Detectors in the Open Environment: Challenges, Solutions, and + Outlook + + +
+ With the emergence of foundation models, deep learning-based object detectors +have shown practical usability in closed set scenarios. However, for real-world +tasks, object detectors often operate in open environments, where crucial +factors (e.g., data distribution, objective) that influence model learning are +often changing. The dynamic and intricate nature of the open environment poses +novel and formidable challenges to object detectors. Unfortunately, current +research on object detectors in open environments lacks a comprehensive +analysis of their distinctive characteristics, challenges, and corresponding +solutions, which hinders their secure deployment in critical real-world +scenarios. This paper aims to bridge this gap by conducting a comprehensive +review and analysis of object detectors in open environments. We initially +identified limitations of key structural components within the existing +detection pipeline and propose the open environment object detector challenge +framework that includes four quadrants (i.e., out-of-domain, out-of-category, +robust learning, and incremental learning) based on the dimensions of the data +/ target changes. For each quadrant of challenges in the proposed framework, we +present a detailed description and systematic analysis of the overarching goals +and core difficulties, systematically review the corresponding solutions, and +benchmark their performance over multiple widely adopted datasets. In addition, +we engage in a discussion of open problems and potential avenues for future +research. This paper aims to provide a fresh, comprehensive, and systematic +understanding of the challenges and solutions associated with open-environment +object detectors, thus catalyzing the development of more solid applications in +real-world scenarios. A project related to this survey can be found at +https://github.com/LiangSiyuan21/OEOD_Survey. + +
+
+ comment: 32 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ MP5: A Multi-modal Open-ended Embodied System in Minecraft via Active + Perception CVPR2024 + + +
+ It is a long-lasting goal to design an embodied system that can solve +long-horizon open-world tasks in human-like ways. However, existing approaches +usually struggle with compound difficulties caused by the logic-aware +decomposition and context-aware execution of these tasks. To this end, we +introduce MP5, an open-ended multimodal embodied system built upon the +challenging Minecraft simulator, which can decompose feasible sub-objectives, +design sophisticated situation-aware plans, and perform embodied action +control, with frequent communication with a goal-conditioned active perception +scheme. Specifically, MP5 is developed on top of recent advances in Multimodal +Large Language Models (MLLMs), and the system is modulated into functional +modules that can be scheduled and collaborated to ultimately solve pre-defined +context- and process-dependent tasks. Extensive experiments prove that MP5 can +achieve a 22% success rate on difficult process-dependent tasks and a 91% +success rate on tasks that heavily depend on the context. Moreover, MP5 +exhibits a remarkable ability to address many open-ended tasks that are +entirely novel. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ Learning from Synthetic Human Group Activities + + +
+ The study of complex human interactions and group activities has become a +focal point in human-centric computer vision. However, progress in related +tasks is often hindered by the challenges of obtaining large-scale labeled +datasets from real-world scenarios. To address the limitation, we introduce +M3Act, a synthetic data generator for multi-view multi-group multi-person human +atomic actions and group activities. Powered by Unity Engine, M3Act features +multiple semantic groups, highly diverse and photorealistic images, and a +comprehensive set of annotations, which facilitates the learning of +human-centered tasks across single-person, multi-person, and multi-group +conditions. We demonstrate the advantages of M3Act across three core +experiments. The results suggest our synthetic dataset can significantly +improve the performance of several downstream methods and replace real-world +datasets to reduce cost. Notably, M3Act improves the state-of-the-art MOTRv2 on +DanceTrack dataset, leading to a hop on the leaderboard from 10th to 2nd place. +Moreover, M3Act opens new research for controllable 3D group activity +generation. We define multiple metrics and propose a competitive baseline for +the novel task. Our code and data are available at our project page: +http://cjerry1243.github.io/M3Act. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 214 + +
+
+
+ + ☆ Exploiting Priors from 3D Diffusion Models for RGB-Based One-Shot View + Planning IROS + 2024 + + +
+ Object reconstruction is relevant for many autonomous robotic tasks that +require interaction with the environment. A key challenge in such scenarios is +planning view configurations to collect informative measurements for +reconstructing an initially unknown object. One-shot view planning enables +efficient data collection by predicting view configurations and planning the +globally shortest path connecting all views at once. However, geometric priors +about the object are required to conduct one-shot view planning. In this work, +we propose a novel one-shot view planning approach that utilizes the powerful +3D generation capabilities of diffusion models as priors. By incorporating such +geometric priors into our pipeline, we achieve effective one-shot view planning +starting with only a single RGB image of the object to be reconstructed. Our +planning experiments in simulation and real-world setups indicate that our +approach balances well between object reconstruction quality and movement cost. + +
+
+ comment: Sicong Pan and Liren Jin have equal contribution. Submitted to IROS + 2024 +
+
+
+
+
+ + ☆ CurbNet: Curb Detection Framework Based on LiDAR Point Cloud + Segmentation + + +
+ Curb detection is an important function in intelligent driving and can be +used to determine drivable areas of the road. However, curbs are difficult to +detect due to the complex road environment. This paper introduces CurbNet, a +novel framework for curb detection, leveraging point cloud segmentation. +Addressing the dearth of comprehensive curb datasets and the absence of 3D +annotations, we have developed the 3D-Curb dataset, encompassing 7,100 frames, +which represents the largest and most categorically diverse collection of curb +point clouds currently available. Recognizing that curbs are primarily +characterized by height variations, our approach harnesses spatially-rich 3D +point clouds for training. To tackle the challenges presented by the uneven +distribution of curb features on the xy-plane and their reliance on z-axis +high-frequency features, we introduce the multi-scale and channel attention +(MSCA) module, a bespoke solution designed to optimize detection performance. +Moreover, we propose an adaptive weighted loss function group, specifically +formulated to counteract the imbalance in the distribution of curb point clouds +relative to other categories. Our extensive experimentation on 2 major datasets +has yielded results that surpass existing benchmarks set by leading curb +detection and point cloud segmentation models. By integrating multi-clustering +and curve fitting techniques in our post-processing stage, we have +substantially reduced noise in curb detection, thereby enhancing precision to +0.8744. Notably, CurbNet has achieved an exceptional average metrics of over +0.95 at a tolerance of just 0.15m, thereby establishing a new benchmark. +Furthermore, corroborative real-world experiments and dataset analyzes mutually +validate each other, solidifying CurbNet's superior detection proficiency and +its robust generalizability. + +
+
+
+
+
+ + ☆ HPL-ESS: Hybrid Pseudo-Labeling for Unsupervised Event-based Semantic + Segmentation + + +
+ Event-based semantic segmentation has gained popularity due to its capability +to deal with scenarios under high-speed motion and extreme lighting conditions, +which cannot be addressed by conventional RGB cameras. Since it is hard to +annotate event data, previous approaches rely on event-to-image reconstruction +to obtain pseudo labels for training. However, this will inevitably introduce +noise, and learning from noisy pseudo labels, especially when generated from a +single source, may reinforce the errors. This drawback is also called +confirmation bias in pseudo-labeling. In this paper, we propose a novel hybrid +pseudo-labeling framework for unsupervised event-based semantic segmentation, +HPL-ESS, to alleviate the influence of noisy pseudo labels. In particular, we +first employ a plain unsupervised domain adaptation framework as our baseline, +which can generate a set of pseudo labels through self-training. Then, we +incorporate offline event-to-image reconstruction into the framework, and +obtain another set of pseudo labels by predicting segmentation maps on the +reconstructed images. A noisy label learning strategy is designed to mix the +two sets of pseudo labels and enhance the quality. Moreover, we propose a soft +prototypical alignment module to further improve the consistency of target +domain features. Extensive experiments show that our proposed method +outperforms existing state-of-the-art methods by a large margin on the +DSEC-Semantic dataset (+5.88% accuracy, +10.32% mIoU), which even surpasses +several supervised methods. + +
+
+
+
+
+ + ☆ The Anatomy of Adversarial Attacks: Concept-based XAI Dissection + + +
+ Adversarial attacks (AAs) pose a significant threat to the reliability and +robustness of deep neural networks. While the impact of these attacks on model +predictions has been extensively studied, their effect on the learned +representations and concepts within these models remains largely unexplored. In +this work, we perform an in-depth analysis of the influence of AAs on the +concepts learned by convolutional neural networks (CNNs) using eXplainable +artificial intelligence (XAI) techniques. Through an extensive set of +experiments across various network architectures and targeted AA techniques, we +unveil several key findings. First, AAs induce substantial alterations in the +concept composition within the feature space, introducing new concepts or +modifying existing ones. Second, the adversarial perturbation itself can be +linearly decomposed into a set of latent vector components, with a subset of +these being responsible for the attack's success. Notably, we discover that +these components are target-specific, i.e., are similar for a given target +class throughout different AA techniques and starting classes. Our findings +provide valuable insights into the nature of AAs and their impact on learned +representations, paving the way for the development of more robust and +interpretable deep learning models, as well as effective defenses against +adversarial threats. + +
+
+
+
+
+ + ☆ Diff-Def: Diffusion-Generated Deformation Fields for Conditional Atlases + + +
+ Anatomical atlases are widely used for population analysis. Conditional +atlases target a particular sub-population defined via certain conditions (e.g. +demographics or pathologies) and allow for the investigation of fine-grained +anatomical differences - such as morphological changes correlated with age. +Existing approaches use either registration-based methods that are unable to +handle large anatomical variations or generative models, which can suffer from +training instabilities and hallucinations. To overcome these limitations, we +use latent diffusion models to generate deformation fields, which transform a +general population atlas into one representing a specific sub-population. By +generating a deformation field and registering the conditional atlas to a +neighbourhood of images, we ensure structural plausibility and avoid +hallucinations, which can occur during direct image synthesis. We compare our +method to several state-of-the-art atlas generation methods in experiments +using 5000 brain as well as whole-body MR images from UK Biobank. Our method +generates highly realistic atlases with smooth transformations and high +anatomical fidelity, outperforming the baselines. + +
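+ The core operation implied above, warping an atlas with a dense deformation
+field, can be written in a few lines. This is a minimal 2D sketch with a
+random field standing in for the diffusion model's output; the paper works
+with 3D MR volumes and a learned field.
+
+ import torch
+ import torch.nn.functional as F
+
+ def warp(image, displacement):
+     """image: (B, C, H, W); displacement: (B, H, W, 2) in normalized [-1, 1] units."""
+     B, _, H, W = image.shape
+     ys, xs = torch.meshgrid(torch.linspace(-1, 1, H), torch.linspace(-1, 1, W),
+                             indexing="ij")
+     identity = torch.stack([xs, ys], dim=-1).unsqueeze(0).expand(B, -1, -1, -1)
+     return F.grid_sample(image, identity + displacement, align_corners=True)
+
+ atlas = torch.rand(1, 1, 64, 64)
+ field = 0.02 * torch.randn(1, 64, 64, 2)   # small random displacements for illustration
+ warped = warp(atlas, field)                # the conditional atlas would be a warped template
+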
+
+
+
+
+ + ☆ Creating a Digital Twin of Spinal Surgery: A Proof of Concept + + +
+ Surgery digitalization is the process of creating a virtual replica of +real-world surgery, also referred to as a surgical digital twin (SDT). It has +significant applications in various fields such as education and training, +surgical planning, and automation of surgical tasks. Given their detailed +representations of surgical procedures, SDTs are an ideal foundation for +machine learning methods, enabling automatic generation of training data. In +robotic surgery, SDTs can provide realistic virtual environments in which +robots may learn through trial and error. In this paper, we present a proof of +concept (PoC) for surgery digitalization that is applied to an ex-vivo spinal +surgery performed in realistic conditions. The proposed digitalization focuses +on the acquisition and modelling of the geometry and appearance of the entire +surgical scene. We employ five RGB-D cameras for dynamic 3D reconstruction of +the surgeon, a high-end camera for 3D reconstruction of the anatomy, an +infrared stereo camera for surgical instrument tracking, and a laser scanner +for 3D reconstruction of the operating room and data fusion. We justify the +proposed methodology, discuss the challenges faced and further extensions of +our prototype. While our PoC partially relies on manual data curation, its high +quality and great potential motivate the development of automated methods for +the creation of SDTs. The quality of our SDT can be assessed in a rendered +video available at https://youtu.be/LqVaWGgaTMY . + +
+
+
+
+
+ + ☆ DPStyler: Dynamic PromptStyler for Source-Free Domain Generalization + + +
+ Source-Free Domain Generalization (SFDG) aims to develop a model that works
+for unseen target domains without relying on any source domain. Recent work,
+PromptStyler, employs text prompts to simulate different distribution shifts in
+the joint vision-language space, allowing the model to generalize effectively
+to unseen domains without using any images. However, 1) PromptStyler's style
+generation strategy has limitations, as all style patterns are fixed after the
+first training phase. This leads to the training set in the second training
+phase being restricted to a limited set of styles. Additionally, 2) the frozen
+text encoder in PromptStyler results in the encoder's output varying with the
+style of the input text prompts, making it difficult for the model to learn
+domain-invariant features. In this paper, we introduce Dynamic PromptStyler
+(DPStyler), comprising Style Generation and Style Removal modules to address
+these issues. The Style Generation module refreshes all styles at every
+training epoch, while the Style Removal module eliminates variations in the
+encoder's output features caused by input styles. Moreover, since the Style
+Generation module, responsible for generating style word vectors using random
+sampling or style mixing, makes the model sensitive to input text prompts, we
+introduce a model ensemble method to mitigate this sensitivity. Extensive
+experiments demonstrate that our framework outperforms state-of-the-art methods
+on benchmark datasets.
+
+
+
+
+
+
+ + ☆ Assessing the Performance of Deep Learning for Automated Gleason Grading + in Prostate Cancer + + +
+ Prostate cancer is a dominant health concern calling for advanced diagnostic +tools. Utilizing digital pathology and artificial intelligence, this study +explores the potential of 11 deep neural network architectures for automated +Gleason grading in prostate carcinoma focusing on comparing traditional and +recent architectures. A standardized image classification pipeline, based on +the AUCMEDI framework, facilitated robust evaluation using an in-house dataset +consisting of 34,264 annotated tissue tiles. The results indicated varying +sensitivity across architectures, with ConvNeXt demonstrating the strongest +performance. Notably, newer architectures achieved superior performance, even +though with challenges in differentiating closely related Gleason grades. The +ConvNeXt model was capable of learning a balance between complexity and +generalizability. Overall, this study lays the groundwork for enhanced Gleason +grading systems, potentially improving diagnostic efficiency for prostate +cancer. + +
+
+
+
+
+ + ☆ Synapse: Learning Preferential Concepts from Visual Demonstrations + + +
+ This paper addresses the problem of preference learning, which aims to learn +user-specific preferences (e.g., "good parking spot", "convenient drop-off +location") from visual input. Despite its similarity to learning factual +concepts (e.g., "red cube"), preference learning is a fundamentally harder +problem due to its subjective nature and the paucity of person-specific +training data. We address this problem using a new framework called Synapse, +which is a neuro-symbolic approach designed to efficiently learn preferential +concepts from limited demonstrations. Synapse represents preferences as +neuro-symbolic programs in a domain-specific language (DSL) that operates over +images, and leverages a novel combination of visual parsing, large language +models, and program synthesis to learn programs representing individual +preferences. We evaluate Synapse through extensive experimentation including a +user case study focusing on mobility-related concepts in mobile robotics and +autonomous driving. Our evaluation demonstrates that Synapse significantly +outperforms existing baselines as well as its own ablations. The code and other +details can be found on the project website https://amrl.cs.utexas.edu/synapse . + +
+
+ comment: 23 pages, 7 figures; Preprint +
+
+
+
+
+ + ☆ DeepGleason: a System for Automated Gleason Grading of Prostate Cancer + using Deep Neural Networks + + +
+ Advances in digital pathology and artificial intelligence (AI) offer +promising opportunities for clinical decision support and enhancing diagnostic +workflows. Previous studies already demonstrated AI's potential for automated +Gleason grading, but lack state-of-the-art methodology and model reusability. +To address this issue, we propose DeepGleason: an open-source deep neural +network based image classification system for automated Gleason grading using +whole-slide histopathology images from prostate tissue sections. Implemented +with the standardized AUCMEDI framework, our tool employs a tile-wise +classification approach utilizing fine-tuned image preprocessing techniques in +combination with a ConvNeXt architecture which was compared to various +state-of-the-art architectures. The neural network model was trained and +validated on an in-house dataset of 34,264 annotated tiles from 369 prostate +carcinoma slides. We demonstrated that DeepGleason is capable of highly +accurate and reliable Gleason grading with a macro-averaged F1-score of 0.806, +AUC of 0.991, and Accuracy of 0.974. The internal architecture comparison +revealed that the ConvNeXt model was superior performance-wise on our dataset +to established and other modern architectures like transformers. Furthermore, +we were able to outperform the current state-of-the-art in tile-wise +fine-classification with a sensitivity and specificity of 0.94 and 0.98 for +benign vs malignant detection as well as of 0.91 and 0.75 for Gleason 3 vs +Gleason 4 & 5 classification, respectively. Our tool contributes to the wider +adoption of AI-based Gleason grading within the research community and paves +the way for broader clinical application of deep learning models in digital +pathology. DeepGleason is open-source and publicly available for research +application in the following Git repository: +https://github.com/frankkramer-lab/DeepGleason. + +
+
+
+
+
+ + ☆ FOOL: Addressing the Downlink Bottleneck in Satellite Computing with + Neural Feature Compression + + +
+ Nanosatellite constellations equipped with sensors capturing large geographic +regions provide unprecedented opportunities for Earth observation. As +constellation sizes increase, network contention poses a downlink bottleneck. +Orbital Edge Computing (OEC) leverages limited onboard compute resources to +reduce transfer costs by processing the raw captures at the source. However, +current solutions have limited practicability due to reliance on crude +filtering methods or over-prioritizing particular downstream tasks. + This work presents FOOL, an OEC-native and task-agnostic feature compression +method that preserves prediction performance. FOOL partitions high-resolution +satellite imagery to maximize throughput. Further, it embeds context and +leverages inter-tile dependencies to lower transfer costs with negligible +overhead. While FOOL is a feature compressor, it can recover images with +competitive scores on perceptual quality measures at lower bitrates. We +extensively evaluate transfer cost reduction by including the peculiarity of +intermittently available network connections in low earth orbit. Lastly, we +test the feasibility of our system for standardized nanosatellite form factors. +We demonstrate that FOOL permits downlinking over 100x the data volume without +relying on prior information on the downstream tasks. + +
+
+ comment: 18 pages, double column, 19 figures, 7 tables, Initial Submission to + IEEE Transactions on Mobile Computing +
+
+
+
+
+ + ☆ Domain Adaptive Detection of MAVs: A Benchmark and Noise Suppression + Network + + +
+ Visual detection of Micro Air Vehicles (MAVs) has attracted increasing +attention in recent years due to its important application in various tasks. +The existing methods for MAV detection assume that the training set and testing +set have the same distribution. As a result, when deployed in new domains, the +detectors would have a significant performance degradation due to domain +discrepancy. In this paper, we study the problem of cross-domain MAV detection. +The contributions of this paper are threefold. 1) We propose a +Multi-MAV-Multi-Domain (M3D) dataset consisting of both simulation and +realistic images. Compared to other existing datasets, the proposed one is more +comprehensive in the sense that it covers rich scenes, diverse MAV types, and +various viewing angles. A new benchmark for cross-domain MAV detection is +proposed based on the proposed dataset. 2) We propose a Noise Suppression +Network (NSN) based on the framework of pseudo-labeling and a large-to-small +training procedure. To reduce the challenging pseudo-label noises, two novel +modules are designed in this network. The first is a prior-based curriculum +learning module for allocating adaptive thresholds for pseudo labels with +different difficulties. The second is a masked copy-paste augmentation module +for pasting truly-labeled MAVs on unlabeled target images and thus decreasing +pseudo-label noises. 3) Extensive experimental results verify the superior +performance of the proposed method compared to the state-of-the-art ones. In +particular, it achieves mAP of 46.9%(+5.8%), 50.5%(+3.7%), and 61.5%(+11.3%) on +the tasks of simulation-to-real adaptation, cross-scene adaptation, and +cross-camera adaptation, respectively. + +
+
+ comment: 17 pages, 11 figures. Accepted by IEEE Transactions on Automation + Science and Engineering +
+
+
+
+
+ + ☆ Clustering Propagation for Universal Medical Image Segmentation CVPR2024 + + +
+ Prominent solutions for medical image segmentation are typically tailored for
+automatic or interactive setups, posing challenges in facilitating progress
+achieved in one task to another. This also necessitates separate models for
+each task, duplicating both training time and parameters. To address the above
+issues, we introduce S2VNet, a universal framework that leverages
+Slice-to-Volume propagation to unify automatic/interactive segmentation within
+a single model and one training session. Inspired by clustering-based
+segmentation techniques, S2VNet makes full use of the slice-wise structure of
+volumetric data by initializing cluster centers from the cluster results of the
+previous slice. This enables knowledge acquired from prior slices to assist in
+the segmentation of the current slice, further efficiently bridging the
+communication between remote slices using mere 2D networks. Moreover, such a
+framework readily accommodates interactive segmentation with no architectural
+change, simply by initializing centroids from user inputs. S2VNet distinguishes
+itself by swift inference speeds and reduced memory consumption compared to
+prevailing 3D solutions. It can also handle multi-class interactions with each
+of them serving to initialize different centroids. Experiments on three
+benchmarks demonstrate S2VNet surpasses task-specified solutions on both
+automatic/interactive setups.
+
+
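+ The slice-to-volume idea can be illustrated with a toy loop: cluster centroids
+found on one slice seed the clustering of the next, so information propagates
+through the volume with a purely 2D model. The feature extractor, number of
+clusters, and plain k-means update below are placeholders, not S2VNet's actual
+architecture.
+
+ import torch
+
+ def cluster_slice(features, centroids, iters=5):
+     """features: (N, D) pixel features of one slice; centroids: (K, D)."""
+     for _ in range(iters):
+         assign = torch.cdist(features, centroids).argmin(dim=1)   # nearest centroid
+         for k in range(centroids.size(0)):
+             mask = assign == k
+             if mask.any():
+                 centroids[k] = features[mask].mean(dim=0)         # update centre
+     return assign, centroids
+
+ volume_feats = torch.randn(16, 1024, 32)   # 16 slices, 1024 pixels, 32-dim features
+ centroids = torch.randn(4, 32)             # e.g. seeded from user clicks or the first slice
+ for s in range(volume_feats.size(0)):
+     labels, centroids = cluster_slice(volume_feats[s], centroids)  # carried to the next slice
+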
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Self-Adaptive Reality-Guided Diffusion for Artifact-Free + Super-Resolution + + +
+ Artifact-free super-resolution (SR) aims to translate low-resolution images +into their high-resolution counterparts with a strict integrity of the original +content, eliminating any distortions or synthetic details. While traditional +diffusion-based SR techniques have demonstrated remarkable abilities to enhance +image detail, they are prone to artifact introduction during iterative +procedures. Such artifacts, ranging from trivial noise to unauthentic textures, +deviate from the true structure of the source image, thus challenging the +integrity of the super-resolution process. In this work, we propose +Self-Adaptive Reality-Guided Diffusion (SARGD), a training-free method that +delves into the latent space to effectively identify and mitigate the +propagation of artifacts. Our SARGD begins by using an artifact detector to +identify implausible pixels, creating a binary mask that highlights artifacts. +Following this, the Reality Guidance Refinement (RGR) process refines artifacts +by integrating this mask with realistic latent representations, improving +alignment with the original image. Nonetheless, initial realistic-latent +representations from lower-quality images result in over-smoothing in the final +output. To address this, we introduce a Self-Adaptive Guidance (SAG) mechanism. +It dynamically computes a reality score, enhancing the sharpness of the +realistic latent. These alternating mechanisms collectively achieve +artifact-free super-resolution. Extensive experiments demonstrate the +superiority of our method, delivering detailed artifact-free high-resolution +images while reducing sampling steps by 2X. We release our code at +https://github.com/ProAirVerse/Self-Adaptive-Guidance-Diffusion.git. + +
+
+
+
+
+ + ☆ Multi-Scale Texture Loss for CT denoising with GANs + + +
+ Generative Adversarial Networks (GANs) have proved as a powerful framework +for denoising applications in medical imaging. However, GAN-based denoising +algorithms still suffer from limitations in capturing complex relationships +within the images. In this regard, the loss function plays a crucial role in +guiding the image generation process, encompassing how much a synthetic image +differs from a real image. To grasp highly complex and non-linear textural +relationships in the training process, this work presents a loss function that +leverages the intrinsic multi-scale nature of the Gray-Level-Co-occurrence +Matrix (GLCM). Although the recent advances in deep learning have demonstrated +superior performance in classification and detection tasks, we hypothesize that +its information content can be valuable when integrated into GANs' training. To +this end, we propose a differentiable implementation of the GLCM suited for +gradient-based optimization. Our approach also introduces a self-attention +layer that dynamically aggregates the multi-scale texture information extracted +from the images. We validate our approach by carrying out extensive experiments +in the context of low-dose CT denoising, a challenging application that aims to +enhance the quality of noisy CT scans. We utilize three publicly available +datasets, including one simulated and two real datasets. The results are +promising as compared to other well-established loss functions, being also +consistent across three different GAN architectures. The code is available at: +https://github.com/FrancescoDiFeola/DenoTextureLoss + +
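+ A differentiable co-occurrence statistic of the kind described above can be
+built with soft binning: intensities are softly assigned to a few gray levels,
+and the co-occurrence of a pixel with its right-hand neighbour is the sum of
+outer products of those assignments. The paper's multi-scale aggregation and
+attention layer are omitted here; bin count and bandwidth are assumptions.
+
+ import torch
+
+ def soft_glcm(img, levels=8, sigma=0.05):
+     """img: (B, H, W) with values in [0, 1]; returns (B, levels, levels)."""
+     centers = torch.linspace(0.0, 1.0, levels, device=img.device)
+     # Soft membership of every pixel to every gray level (Gaussian kernel).
+     memb = torch.exp(-((img.unsqueeze(-1) - centers) ** 2) / (2 * sigma ** 2))
+     memb = memb / memb.sum(dim=-1, keepdim=True)          # (B, H, W, levels)
+     left, right = memb[:, :, :-1, :], memb[:, :, 1:, :]   # horizontal neighbours
+     glcm = torch.einsum("bhwi,bhwj->bij", left, right)    # soft co-occurrence counts
+     return glcm / glcm.sum(dim=(1, 2), keepdim=True)      # normalize to a distribution
+
+ target_texture = soft_glcm(torch.rand(2, 64, 64))
+ loss = torch.mean((soft_glcm(torch.rand(2, 64, 64)) - target_texture) ** 2)  # toy texture loss
+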
+
+
+
+
+ + ☆ AI-Generated Video Detection via Spatio-Temporal Anomaly Learning + + +
+ The advancement of generation models has led to the emergence of highly
+realistic artificial intelligence (AI)-generated videos. Malicious users can
+easily create non-existent videos to spread false information. This letter
+proposes an effective AI-generated video detection (AIGVDet) scheme by
+capturing the forensic traces with a two-branch spatio-temporal convolutional
+neural network (CNN). Specifically, two ResNet sub-detectors are learned
+separately for identifying the anomalies in the spatial and optical flow
+domains, respectively. Results of such sub-detectors are fused to further
+enhance the discrimination ability. A large-scale generated video dataset (GVD)
+is constructed as a benchmark for model training and evaluation. Extensive
+experimental results verify the high generalization and robustness of our
+AIGVDet scheme. Code and dataset will be available at
+https://github.com/multimediaFor/AIGVDet.
+
+
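+ A two-branch detector of the kind sketched above can be assembled from stock
+parts: one ResNet scores RGB frames, another scores optical-flow images, and
+the two probabilities are averaged. Flow computation, training, and the exact
+fusion rule used in the paper are not shown; averaging is an assumption.
+
+ import torch
+ import torch.nn as nn
+ from torchvision.models import resnet18
+
+ def make_branch():
+     net = resnet18(weights=None)
+     net.fc = nn.Linear(net.fc.in_features, 2)   # real vs. generated
+     return net
+
+ spatial_branch, flow_branch = make_branch(), make_branch()
+
+ def predict(frames, flows):
+     """frames, flows: (B, 3, H, W) tensors for one video, already preprocessed."""
+     p_spatial = torch.softmax(spatial_branch(frames), dim=1)
+     p_flow = torch.softmax(flow_branch(flows), dim=1)
+     return 0.5 * (p_spatial + p_flow)            # simple score-level fusion
+
+ scores = predict(torch.rand(4, 3, 224, 224), torch.rand(4, 3, 224, 224))
+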
+
+
+
+
+ + ☆ V2X-PC: Vehicle-to-everything Collaborative Perception via Point Cluster + + +
+ The objective of the collaborative vehicle-to-everything perception task is +to enhance the individual vehicle's perception capability through message +communication among neighboring traffic agents. Previous methods focus on +achieving optimal performance within bandwidth limitations and typically adopt +BEV maps as the basic collaborative message units. However, we demonstrate that +collaboration with dense representations is plagued by object feature +destruction during message packing, inefficient message aggregation for +long-range collaboration, and implicit structure representation communication. +To tackle these issues, we introduce a brand new message unit, namely point +cluster, designed to represent the scene sparsely with a combination of +low-level structure information and high-level semantic information. The point +cluster inherently preserves object information while packing messages, with +weak relevance to the collaboration range, and supports explicit structure +modeling. Building upon this representation, we propose a novel framework +V2X-PC for collaborative perception. This framework includes a Point Cluster +Packing (PCP) module to keep object feature and manage bandwidth through the +manipulation of cluster point numbers. As for effective message aggregation, we +propose a Point Cluster Aggregation (PCA) module to match and merge point +clusters associated with the same object. To further handle time latency and +pose errors encountered in real-world scenarios, we propose parameter-free +solutions that can adapt to different noisy levels without finetuning. +Experiments on two widely recognized collaborative perception benchmarks +showcase the superior performance of our method compared to the previous +state-of-the-art approaches relying on BEV maps. + +
+
+
+
+
+ + ☆ SDXS: Real-Time One-Step Latent Diffusion Models with Image Conditions + + +
+ Recent advancements in diffusion models have positioned them at the forefront
+of image generation. Despite their superior performance, diffusion models are
+not without drawbacks; they are characterized by complex architectures and
+substantial computational demands, resulting in significant latency due to
+their iterative sampling process. To mitigate these limitations, we introduce a
+dual approach involving model miniaturization and a reduction in sampling
+steps, aimed at significantly decreasing model latency. Our methodology
+leverages knowledge distillation to streamline the U-Net and image decoder
+architectures, and introduces an innovative one-step DM training technique that
+utilizes feature matching and score distillation. We present two models,
+SDXS-512 and SDXS-1024, achieving inference speeds of approximately 100 FPS
+(30x faster than SD v1.5) and 30 FPS (60x faster than SDXL) on a single GPU,
+respectively. Moreover, our training approach offers promising applications in
+image-conditioned control, facilitating efficient image-to-image translation.
+
+
+
+
+
+
+ + ☆ Calibrating Bayesian UNet++ for Sub-Seasonal Forecasting ICLR 2024 + + +
+ Seasonal forecasting is a crucial task when it comes to detecting the extreme +heat and colds that occur due to climate change. Confidence in the predictions +should be reliable since a small increase in the temperatures in a year has a +big impact on the world. Calibration of the neural networks provides a way to +ensure our confidence in the predictions. However, calibrating regression +models is an under-researched topic, especially in forecasters. We calibrate a +UNet++ based architecture, which was shown to outperform physics-based models +in temperature anomalies. We show that with a slight trade-off between +prediction error and calibration error, it is possible to get more reliable and +sharper forecasts. We believe that calibration should be an important part of +safety-critical machine learning applications such as weather forecasters. + +
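+ One common way to quantify the calibration error referred to above for a
+probabilistic regressor is to compare the nominal coverage of Gaussian
+predictive intervals with the fraction of targets that actually fall inside
+them. The forecaster itself is not shown; `mean` and `std` stand in for its
+outputs, and the quantile set is an assumption.
+
+ import torch
+
+ def coverage_calibration_error(mean, std, target, quantiles=(0.5, 0.7, 0.9)):
+     errs = []
+     for q in quantiles:
+         # Half-width of the central q-interval of a Gaussian prediction.
+         half = std * torch.erfinv(torch.tensor(q)) * (2 ** 0.5)
+         inside = ((target > mean - half) & (target < mean + half)).float().mean()
+         errs.append(abs(inside - q))
+     return sum(errs) / len(errs)
+
+ mean, std = torch.zeros(1000), torch.ones(1000)
+ target = torch.randn(1000)
+ print(coverage_calibration_error(mean, std, target))  # small for a well-calibrated model
+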
+
+ comment: Accepted as a workshop paper at "ICLR 2024 Tackling Climate Change + with Machine Learning" +
+
+
+
+
+ + ☆ Enhancing Industrial Transfer Learning with Style Filter: Cost Reduction + and Defect-Focus + + +
+ Addressing the challenge of data scarcity in industrial domains, transfer +learning emerges as a pivotal paradigm. This work introduces Style Filter, a +tailored methodology for industrial contexts. By selectively filtering source +domain data before knowledge transfer, Style Filter reduces the quantity of +data while maintaining or even enhancing the performance of transfer learning +strategy. Offering label-free operation, minimal reliance on prior knowledge, +independence from specific models, and re-utilization, Style Filter is +evaluated on authentic industrial datasets, highlighting its effectiveness when +employed before conventional transfer strategies in the deep learning domain. +The results underscore the effectiveness of Style Filter in real-world +industrial applications. + +
+
+ comment: 17 pages, 11 figures,4 tables +
+
+
+
+
+ + ☆ SatSynth: Augmenting Image-Mask Pairs through Diffusion Models for + Aerial Semantic Segmentation CVPR2024 + + +
+ In recent years, semantic segmentation has become a pivotal tool in +processing and interpreting satellite imagery. Yet, a prevalent limitation of +supervised learning techniques remains the need for extensive manual +annotations by experts. In this work, we explore the potential of generative +image diffusion to address the scarcity of annotated data in earth observation +tasks. The main idea is to learn the joint data manifold of images and labels, +leveraging recent advancements in denoising diffusion probabilistic models. To +the best of our knowledge, we are the first to generate both images and +corresponding masks for satellite segmentation. We find that the obtained pairs +not only display high quality in fine-scale features but also ensure a wide +sampling diversity. Both aspects are crucial for earth observation data, where +semantic classes can vary severely in scale and occurrence frequency. We employ +the novel data instances for downstream segmentation, as a form of data +augmentation. In our experiments, we provide comparisons to prior works based +on discriminative diffusion models or GANs. We demonstrate that integrating +generated samples yields significant quantitative improvements for satellite +semantic segmentation -- both compared to baselines and when training only on +the original data. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ EDUE: Expert Disagreement-Guided One-Pass Uncertainty Estimation for + Medical Image Segmentation + + +
+ Deploying deep learning (DL) models in medical applications relies on +predictive performance and other critical factors, such as conveying +trustworthy predictive uncertainty. Uncertainty estimation (UE) methods provide +potential solutions for evaluating prediction reliability and improving the +model confidence calibration. Despite increasing interest in UE, challenges +persist, such as the need for explicit methods to capture aleatoric uncertainty +and align uncertainty estimates with real-life disagreements among domain +experts. This paper proposes an Expert Disagreement-Guided Uncertainty +Estimation (EDUE) for medical image segmentation. By leveraging variability in +ground-truth annotations from multiple raters, we guide the model during +training and incorporate random sampling-based strategies to enhance +calibration confidence. Our method achieves 55% and 23% improvement in +correlation on average with expert disagreements at the image and pixel levels, +respectively, better calibration, and competitive segmentation performance +compared to the state-of-the-art deep ensembles, requiring only a single +forward pass. + +
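+ The general idea of supervising uncertainty with rater disagreement can be
+written as a simple two-term loss: pixel-wise variance across several expert
+masks serves as a regression target for the predicted uncertainty map. The
+actual EDUE objective and its sampling strategy may differ; this is only a
+sketch.
+
+ import torch
+ import torch.nn.functional as F
+
+ def disagreement_guided_loss(pred_logits, pred_uncertainty, rater_masks):
+     """pred_logits, pred_uncertainty: (B, 1, H, W); rater_masks: (B, R, H, W) in {0, 1}."""
+     consensus = rater_masks.float().mean(dim=1, keepdim=True)     # soft ground truth
+     disagreement = rater_masks.float().var(dim=1, keepdim=True)   # high where experts differ
+     seg_loss = F.binary_cross_entropy_with_logits(pred_logits, consensus)
+     unc_loss = F.mse_loss(pred_uncertainty, disagreement)
+     return seg_loss + unc_loss
+
+ loss = disagreement_guided_loss(torch.randn(2, 1, 64, 64),
+                                 torch.rand(2, 1, 64, 64),
+                                 torch.randint(0, 2, (2, 4, 64, 64)))
+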
+
+
+
+
+ + ☆ In the Search for Optimal Multi-view Learning Models for Crop + Classification with Global Remote Sensing Data + + +
+ Crop classification is of critical importance due to its role in studying +crop pattern changes, resource management, and carbon sequestration. When +employing data-driven techniques for its prediction, utilizing various temporal +data sources is necessary. Deep learning models have proven to be effective for +this task by mapping time series data to high-level representation for +prediction. However, they face substantial challenges when dealing with +multiple input patterns. The literature offers limited guidance for Multi-View +Learning (MVL) scenarios, as it has primarily focused on exploring fusion +strategies with specific encoders and validating them in local regions. In +contrast, we investigate the impact of simultaneous selection of the fusion +strategy and the encoder architecture evaluated on a global-scale cropland and +crop-type classifications. We use a range of five fusion strategies (Input, +Feature, Decision, Ensemble, Hybrid) and five temporal encoder architectures +(LSTM, GRU, TempCNN, TAE, L-TAE) as possible MVL model configurations. The +validation is on the CropHarvest dataset that provides optical, radar, and +weather time series, and topographic information as input data. We found that +in scenarios with a limited number of labeled samples, a unique configuration +is insufficient for all the cases. Instead, a specialized combination, +including encoder and fusion strategy, should be meticulously sought. To +streamline this search process, we suggest initially identifying the optimal +encoder architecture tailored for a particular fusion strategy, and then +determining the most suitable fusion strategy for the classification task. We +provide a technical framework for researchers exploring crop classification or +related tasks through a MVL approach. + +
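+ To make two of the fusion strategies named above concrete, the sketch below
+contrasts input fusion and feature fusion for two views, e.g. optical and
+radar time series. Encoder sizes and the GRU choice are placeholders, not the
+configurations benchmarked in the paper.
+
+ import torch
+ import torch.nn as nn
+
+ class InputFusion(nn.Module):
+     def __init__(self, d_optical, d_radar, hidden=64, classes=2):
+         super().__init__()
+         self.encoder = nn.GRU(d_optical + d_radar, hidden, batch_first=True)
+         self.head = nn.Linear(hidden, classes)
+     def forward(self, optical, radar):                  # (B, T, D_*) each
+         _, h = self.encoder(torch.cat([optical, radar], dim=-1))
+         return self.head(h[-1])
+
+ class FeatureFusion(nn.Module):
+     def __init__(self, d_optical, d_radar, hidden=64, classes=2):
+         super().__init__()
+         self.enc_optical = nn.GRU(d_optical, hidden, batch_first=True)
+         self.enc_radar = nn.GRU(d_radar, hidden, batch_first=True)
+         self.head = nn.Linear(2 * hidden, classes)      # concatenate per-view features
+     def forward(self, optical, radar):
+         _, h1 = self.enc_optical(optical)
+         _, h2 = self.enc_radar(radar)
+         return self.head(torch.cat([h1[-1], h2[-1]], dim=-1))
+
+ optical, radar = torch.randn(4, 12, 11), torch.randn(4, 12, 2)
+ print(InputFusion(11, 2)(optical, radar).shape, FeatureFusion(11, 2)(optical, radar).shape)
+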
+
+ comment: submitted to journal +
+
+
+
+
+ + ☆ SegICL: A Universal In-context Learning Framework for Enhanced + Segmentation in Medical Imaging + + +
+ Medical image segmentation models adapting to new tasks in a training-free
+manner through in-context learning is an exciting advancement. Universal
+segmentation models aim to generalize across the diverse modalities of medical
+images, yet their effectiveness often diminishes when applied to
+out-of-distribution (OOD) data modalities and tasks, requiring intricate
+fine-tuning of the model for optimal performance. To address this challenge, we
+introduce SegICL, a novel approach leveraging In-Context Learning (ICL) for
+image segmentation. Unlike existing methods, SegICL has the capability to
+employ text-guided segmentation and conduct in-context learning with a small
+set of image-mask pairs, eliminating the need for training the model from
+scratch or fine-tuning for OOD tasks (including OOD modality and dataset).
+Extensive experimental validation of SegICL demonstrates a positive correlation
+between the number of prompt samples and segmentation performance on OOD
+modalities and tasks. This indicates that SegICL effectively addresses new
+segmentation tasks based on contextual information. Additionally, SegICL also
+exhibits comparable segmentation performance to mainstream models on OOD and
+in-distribution tasks. Our code will be released soon.
+
+
+
+
+
+
+ + ☆ Revealing Vulnerabilities of Neural Networks in Parameter Learning and + Defense Against Explanation-Aware Backdoors + + +
+ Explainable Artificial Intelligence (XAI) strategies play a crucial part in
+increasing the understanding and trustworthiness of neural networks.
+Nonetheless, these techniques could potentially generate misleading
+explanations. Blinding attacks can drastically alter a machine learning
+algorithm's prediction and explanation, providing misleading information by
+adding visually unnoticeable artifacts into the input, while maintaining the
+model's accuracy. This poses a serious challenge to ensuring the reliability of
+XAI methods. To address this challenge, we leverage statistical analysis to
+highlight the changes in CNN weights following blinding attacks. We introduce a
+method specifically designed to limit the effectiveness of such attacks during
+the evaluation phase, avoiding the need for extra training. The method we
+suggest defends against most modern explanation-aware adversarial attacks,
+achieving an approximate decrease of ~99\% in the Attack Success Rate (ASR) and
+a ~91\% reduction in the Mean Square Error (MSE) between the original
+explanation and the defended (post-attack) explanation across three unique
+types of attacks.
+
+
+
+
+
+
+ + ☆ Elysium: Exploring Object-level Perception in Videos via MLLM + + +
+ Multi-modal Large Language Models (MLLMs) have demonstrated their ability to +perceive objects in still images, but their application in video-related tasks, +such as object tracking, remains understudied. This lack of exploration is +primarily due to two key challenges. Firstly, extensive pretraining on +large-scale video datasets is required to equip MLLMs with the capability to +perceive objects across multiple frames and understand inter-frame +relationships. Secondly, processing a large number of frames within the context +window of Large Language Models (LLMs) can impose a significant computational +burden. To address the first challenge, we introduce ElysiumTrack-1M, a +large-scale video dataset paired with novel tasks: Referring Single Object +Tracking (RSOT) and Video Referring Expression Generation (Video-REG). +ElysiumTrack-1M contains 1.27 million annotated video frames with corresponding +object boxes and descriptions. Leveraging this dataset, we conduct training of +MLLMs and propose a token-compression model T-Selector to tackle the second +challenge. Our proposed approach, Elysium: Exploring Object-level Perception in +Videos via MLLM, is an end-to-end trainable MLLM that makes the first attempt +to conduct object-level tasks in videos without requiring any additional +plug-in or expert models. + +
+
+
+
+
+ + ☆ QKFormer: Hierarchical Spiking Transformer using Q-K Attention + + +
+ Spiking Transformers, which integrate Spiking Neural Networks (SNNs) with Transformer
+architectures, have attracted significant attention due to their potential for energy
+efficiency and high performance. However, existing models in this domain still suffer from
+suboptimal performance. We introduce several innovations to improve it: i) We propose a
+novel spike-form Q-K attention mechanism, tailored for SNNs, which efficiently models the
+importance of token or channel dimensions through binary vectors with linear complexity.
+ii) We incorporate the hierarchical structure, which significantly benefits the performance
+of both the brain and artificial neural networks, into spiking transformers to obtain
+multi-scale spiking representations. iii) We design a versatile and powerful patch
+embedding module with a deformed shortcut specifically for spiking transformers. Combining
+these components, we develop QKFormer, a hierarchical spiking transformer based on Q-K
+attention with direct training. QKFormer shows significantly superior performance over
+existing state-of-the-art SNN models on various mainstream datasets. Notably, with a size
+comparable to Spikformer (66.34 M, 74.81%), QKFormer (64.96 M) achieves a groundbreaking
+top-1 accuracy of 85.65% on ImageNet-1k, substantially outperforming Spikformer by 10.84%.
+To the best of our knowledge, this is the first time that directly trained SNNs have
+exceeded 85% accuracy on ImageNet-1K. The code and models are publicly available at
+https://github.com/zhouchenlin2096/QKFormer
+
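+
+ The abstract describes the Q-K attention only at a high level (binary vectors, token or
+channel importance, linear complexity). The toy PyTorch sketch below illustrates one
+plausible reading of that idea, not the released implementation: binary queries vote for
+per-token importance, and the resulting binary mask gates the keys, so no N x N attention
+matrix is ever formed.
+
+import torch
+import torch.nn as nn
+
+def heaviside(x):
+    """Hard spike function; a real SNN would use a surrogate gradient for training."""
+    return (x > 0).float()
+
+class BinaryQKTokenAttention(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.q_proj = nn.Linear(dim, dim, bias=False)
+        self.k_proj = nn.Linear(dim, dim, bias=False)
+
+    def forward(self, x):                       # x: (batch, tokens, dim), spikes in {0, 1}
+        q = heaviside(self.q_proj(x))           # binary queries
+        k = heaviside(self.k_proj(x))           # binary keys
+        # Per-token importance from the channel-wise sum of binary queries;
+        # the cost is O(tokens * dim), i.e. linear in the number of tokens.
+        token_mask = heaviside(q.sum(dim=-1, keepdim=True) - q.shape[-1] / 2)
+        return token_mask * k                   # keep only "important" tokens
+
+if __name__ == "__main__":
+    x = (torch.rand(2, 196, 64) > 0.8).float()  # fake spike feature map
+    print(BinaryQKTokenAttention(64)(x).shape)  # torch.Size([2, 196, 64])
+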
+
+ comment: 10 pages, code: https://github.com/zhouchenlin2096/QKFormer +
+
+
+
+
+ + ☆ DOrA: 3D Visual Grounding with Order-Aware Referring + + +
+ 3D visual grounding aims to identify the target object within a 3D point +cloud scene referred to by a natural language description. While previous works +attempt to exploit the verbo-visual relation with proposed cross-modal +transformers, unstructured natural utterances and scattered objects might lead +to undesirable performances. In this paper, we introduce DOrA, a novel 3D +visual grounding framework with Order-Aware referring. DOrA is designed to +leverage Large Language Models (LLMs) to parse language description, suggesting +a referential order of anchor objects. Such ordered anchor objects allow DOrA +to update visual features and locate the target object during the grounding +process. Experimental results on the NR3D and ScanRefer datasets demonstrate +our superiority in both low-resource and full-data scenarios. In particular, +DOrA surpasses current state-of-the-art frameworks by 9.3% and 7.8% grounding +accuracy under 1% data and 10% data settings, respectively. + +
+
+
+
+
+ + ☆ VMRNN: Integrating Vision Mamba and LSTM for Efficient and Accurate + Spatiotemporal Forecasting + + +
+ Combining CNNs or ViTs, with RNNs for spatiotemporal forecasting, has yielded +unparalleled results in predicting temporal and spatial dynamics. However, +modeling extensive global information remains a formidable challenge; CNNs are +limited by their narrow receptive fields, and ViTs struggle with the intensive +computational demands of their attention mechanisms. The emergence of recent +Mamba-based architectures has been met with enthusiasm for their exceptional +long-sequence modeling capabilities, surpassing established vision models in +efficiency and accuracy, which motivates us to develop an innovative +architecture tailored for spatiotemporal forecasting. In this paper, we propose +the VMRNN cell, a new recurrent unit that integrates the strengths of Vision +Mamba blocks with LSTM. We construct a network centered on VMRNN cells to +tackle spatiotemporal prediction tasks effectively. Our extensive evaluations +show that our proposed approach secures competitive results on a variety of +tasks while maintaining a smaller model size. Our code is available at +https://github.com/yyyujintang/VMRNN-PyTorch. + +
+
+ comment: 11 pages, 7 figures. arXiv admin note: text overlap with + arXiv:2308.09891 by other authors +
+
+
+
+
+ + ☆ An Intermediate Fusion ViT Enables Efficient Text-Image Alignment in + Diffusion Models + + +
+ Diffusion models have been widely used for conditional data cross-modal +generation tasks such as text-to-image and text-to-video. However, +state-of-the-art models still fail to align the generated visual concepts with +high-level semantics in a language such as object count, spatial relationship, +etc. We approach this problem from a multimodal data fusion perspective and +investigate how different fusion strategies can affect vision-language +alignment. We discover that compared to the widely used early fusion of +conditioning text in a pretrained image feature space, a specially designed +intermediate fusion can: (i) boost text-to-image alignment with improved +generation quality and (ii) improve training and inference efficiency by +reducing low-rank text-to-image attention calculations. We perform experiments +using a text-to-image generation task on the MS-COCO dataset. We compare our +intermediate fusion mechanism with the classic early fusion mechanism on two +common conditioning methods on a U-shaped ViT backbone. Our intermediate fusion +model achieves a higher CLIP Score and lower FID, with 20% reduced FLOPs, and +50% increased training speed compared to a strong U-ViT baseline with an early +fusion. + +
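+
+ To make the early-versus-intermediate distinction concrete, here is a small, hedged
+PyTorch sketch: early fusion concatenates text tokens with image tokens at the input so
+every block self-attends over both, while intermediate fusion injects text only at a middle
+block through cross-attention. Block counts, widths, and the injection point are
+illustrative assumptions and do not reproduce the paper's U-ViT design.
+
+import torch
+import torch.nn as nn
+
+def block(dim):
+    return nn.TransformerEncoderLayer(dim, nhead=4, dim_feedforward=2 * dim,
+                                      batch_first=True)
+
+class EarlyFusion(nn.Module):
+    def __init__(self, dim=64, depth=4):
+        super().__init__()
+        self.blocks = nn.ModuleList([block(dim) for _ in range(depth)])
+    def forward(self, img_tok, txt_tok):
+        x = torch.cat([txt_tok, img_tok], dim=1)    # joint sequence from the start
+        for blk in self.blocks:
+            x = blk(x)
+        return x[:, txt_tok.shape[1]:]              # return the image tokens
+
+class IntermediateFusion(nn.Module):
+    def __init__(self, dim=64, depth=4):
+        super().__init__()
+        self.blocks = nn.ModuleList([block(dim) for _ in range(depth)])
+        self.cross = nn.MultiheadAttention(dim, num_heads=4, batch_first=True)
+        self.inject_at = depth // 2                 # text enters mid-network only
+    def forward(self, img_tok, txt_tok):
+        x = img_tok
+        for i, blk in enumerate(self.blocks):
+            x = blk(x)
+            if i == self.inject_at:
+                attn_out, _ = self.cross(query=x, key=txt_tok, value=txt_tok)
+                x = x + attn_out
+        return x
+
+if __name__ == "__main__":
+    img, txt = torch.randn(2, 256, 64), torch.randn(2, 77, 64)
+    print(EarlyFusion()(img, txt).shape, IntermediateFusion()(img, txt).shape)
+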
+
+
+
+
+ + ☆ Open-Set Recognition in the Age of Vision-Language Models + + +
+ Are vision-language models (VLMs) open-set models because they are trained on +internet-scale datasets? We answer this question with a clear no - VLMs +introduce closed-set assumptions via their finite query set, making them +vulnerable to open-set conditions. We systematically evaluate VLMs for open-set +recognition and find they frequently misclassify objects not contained in their +query set, leading to alarmingly low precision when tuned for high recall and +vice versa. We show that naively increasing the size of the query set to +contain more and more classes does not mitigate this problem, but instead +causes diminishing task performance and open-set performance. We establish a +revised definition of the open-set problem for the age of VLMs, define a new +benchmark and evaluation protocol to facilitate standardised evaluation and +research in this important area, and evaluate promising baseline approaches +based on predictive uncertainty and dedicated negative embeddings on a range of +VLM classifiers and object detectors. + +
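+
+ As a minimal illustration of the closed-set assumption discussed above, the Python sketch
+below scores an image embedding against a finite text query set with cosine similarity;
+without a rejection rule, every input is forced onto a known class, and a max-softmax
+threshold is one simple predictive-uncertainty baseline. The embeddings are random
+stand-ins rather than the output of an actual VLM, and the temperature and threshold values
+are arbitrary.
+
+import torch
+import torch.nn.functional as F
+
+def classify(image_emb, text_emb, tau=100.0, reject_below=None):
+    """Cosine-similarity classifier over a fixed query set.
+    Returns -1 ("unknown") when max softmax confidence falls below the threshold."""
+    sims = F.normalize(image_emb, dim=-1) @ F.normalize(text_emb, dim=-1).T
+    probs = (tau * sims).softmax(dim=-1)
+    conf, pred = probs.max(dim=-1)
+    if reject_below is not None:
+        pred = torch.where(conf < reject_below, torch.full_like(pred, -1), pred)
+    return pred, conf
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    queries = torch.randn(10, 512)                   # 10 known classes in the query set
+    images = torch.randn(4, 512)                     # possibly out-of-set images
+    closed, conf = classify(images, queries)                    # always a known label
+    open_set, _ = classify(images, queries, reject_below=0.5)   # may abstain (-1)
+    print(closed.tolist(), open_set.tolist(), conf.tolist())
+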
+
+ comment: 31 pages, under review +
+
+
+
+
+ + ☆ ModeTv2: GPU-accelerated Motion Decomposition Transformer for Pairwise + Optimization in Medical Image Registration + + +
+ Deformable image registration plays a crucial role in medical imaging, aiding +in disease diagnosis and image-guided interventions. Traditional iterative +methods are slow, while deep learning (DL) accelerates solutions but faces +usability and precision challenges. This study introduces a pyramid network +with the enhanced motion decomposition Transformer (ModeTv2) operator, +showcasing superior pairwise optimization (PO) akin to traditional methods. We +re-implement ModeT operator with CUDA extensions to enhance its computational +efficiency. We further propose RegHead module which refines deformation fields, +improves the realism of deformation and reduces parameters. By adopting the PO, +the proposed network balances accuracy, efficiency, and generalizability. +Extensive experiments on two public brain MRI datasets and one abdominal CT +dataset demonstrate the network's suitability for PO, providing a DL model with +enhanced usability and interpretability. The code is publicly available. + +
+
+
+
+
+ + ☆ CMViM: Contrastive Masked Vim Autoencoder for 3D Multi-modal + Representation Learning for AD classification + + +
+ Alzheimer's disease (AD) is an incurable neurodegenerative condition leading to cognitive
+and functional deterioration. Given the lack of a cure, prompt and precise AD diagnosis is
+vital, a complex process dependent on multiple factors and multi-modal data. While
+successful efforts have been made to integrate multi-modal representation learning into
+medical datasets, scant attention has been given to 3D medical images. In this paper, we
+propose the Contrastive Masked Vim Autoencoder (CMViM), the first efficient representation
+learning method tailored for 3D multi-modal data. Our framework is built on a masked Vim
+autoencoder to learn a unified multi-modal representation and the long-range dependencies
+contained in 3D medical images. We also introduce an intra-modal contrastive learning
+module to enhance the capability of the multi-modal Vim encoder to model discriminative
+features within the same modality, and an inter-modal contrastive learning module to
+alleviate misaligned representations among modalities. Our framework consists of two main
+steps: 1) incorporate Vision Mamba (Vim) into the masked autoencoder to reconstruct 3D
+masked multi-modal data efficiently; 2) align the multi-modal representations with
+contrastive learning mechanisms from both intra-modal and inter-modal aspects. Our
+framework is pre-trained on the ADNI2 dataset and validated on the downstream AD
+classification task. The proposed CMViM yields a 2.7\% AUC improvement compared with other
+state-of-the-art methods.
+
+
+ comment: 11 pages, 1 figure +
+
+
+
+
+ + ☆ Visually Guided Generative Text-Layout Pre-training for Document + Intelligence NAACL 2024 + + +
+ Prior study shows that pre-training techniques can boost the performance of +visual document understanding (VDU), which typically requires models to gain +abilities to perceive and reason both document texts and layouts (e.g., +locations of texts and table-cells). To this end, we propose visually guided +generative text-layout pre-training, named ViTLP. Given a document image, the +model optimizes hierarchical language and layout modeling objectives to +generate the interleaved text and layout sequence. In addition, to address the +limitation of processing long documents by Transformers, we introduce a +straightforward yet effective multi-segment generative pre-training scheme, +facilitating ViTLP to process word-intensive documents of any length. ViTLP can +function as a native OCR model to localize and recognize texts of document +images. Besides, ViTLP can be effectively applied to various downstream VDU +tasks. Extensive experiments show that ViTLP achieves competitive performance +over existing baselines on benchmark VDU tasks, including information +extraction, document classification, and document question answering. + +
+
+ comment: Accepted to NAACL 2024 main conference. The first version of this + paper was submitted to OpenReview + (https://openreview.net/forum?id=ARtBIBAmNR) in June 2023 +
+
+
+
+
+ + ☆ Let Real Images be as a Judger, Spotting Fake Images Synthesized with + Generative Models + + +
+ In the last few years, generative models have shown powerful capabilities in synthesizing
+realistic images in both quality and diversity (e.g., facial images and natural subjects).
+Unfortunately, the artifact patterns in fake images synthesized by different generative
+models are inconsistent, leading to the failure of previous research that relied on
+spotting subtle differences between real and fake images. In our preliminary experiments,
+we find that the artifacts in fake images always change with the development of the
+generative model, while natural images exhibit stable statistical properties. In this
+paper, we employ natural traces shared only by real images as an additional predictive
+target in the detector. Specifically, the natural traces are learned from wild real images,
+and we introduce extended supervised contrastive learning to bring them closer to real
+images and further away from fake ones. This motivates the detector to make decisions based
+on the proximity of images to the natural traces. To conduct a comprehensive evaluation, we
+built a high-quality and diverse dataset covering 6 GAN and 6 diffusion models, assessing
+generalization to unknown forgery techniques and robustness to different transformations.
+Experimental results show that our proposed method achieves 96.1% mAP, significantly
+outperforming the baselines. Extensive experiments conducted on the widely recognized
+platform Midjourney reveal that our proposed method achieves an accuracy exceeding 78.4%,
+underscoring its practicality for real-world deployment. The source code and partial
+self-built dataset are available in the supplementary material.
+
+
+
+
+
+ + ☆ Make-Your-Anchor: A Diffusion-based 2D Avatar Generation Framework CVPR2024 + + +
+ Despite the remarkable progress of talking-head-based avatar-creation solutions, directly
+generating anchor-style videos with full-body motions remains challenging. In this study,
+we propose Make-Your-Anchor, a novel system requiring only a one-minute video clip of an
+individual for training, subsequently enabling the automatic generation of anchor-style
+videos with precise torso and hand movements. Specifically, we finetune a proposed
+structure-guided diffusion model on the input video to render 3D mesh conditions into human
+appearances. We adopt a two-stage training strategy for the diffusion model, effectively
+binding movements with specific appearances. To produce arbitrarily long videos, we extend
+the 2D U-Net in the frame-wise diffusion model to a 3D style without additional training
+cost, and a simple yet effective batch-overlapped temporal denoising module is proposed to
+bypass the constraints on video length during inference. Finally, a novel identity-specific
+face enhancement module is introduced to improve the visual quality of facial regions in
+the output videos. Comparative experiments demonstrate the effectiveness and superiority of
+the system in terms of visual quality, temporal coherence, and identity preservation,
+outperforming SOTA diffusion/non-diffusion methods. Project page:
+\url{https://github.com/ICTMCG/Make-Your-Anchor}.
+
+
+ comment: accepted at CVPR2024 +
+
+
+
+
+ + ☆ Medical Image Registration and Its Application in Retinal Images: A + Review + + +
+ Medical image registration is vital for disease diagnosis and treatment with +its ability to merge diverse information of images, which may be captured under +different times, angles, or modalities. Although several surveys have reviewed +the development of medical image registration, these surveys have not +systematically summarized methodologies of existing medical image registration +methods. To this end, we provide a comprehensive review of these methods from +traditional and deep learning-based directions, aiming to help audiences +understand the development of medical image registration quickly. In +particular, we review recent advances in retinal image registration at the end +of each section, which has not attracted much attention. Additionally, we also +discuss the current challenges of retinal image registration and provide +insights and prospects for future research. + +
+
+
+
+
+ + ☆ Self-Supervised Learning for Medical Image Data with Anatomy-Oriented + Imaging Planes + + +
+ Self-supervised learning has emerged as a powerful tool for pretraining deep networks on
+unlabeled data, prior to transfer learning of target tasks with limited annotation. The
+relevance between the pretraining pretext and target tasks is crucial to the success of
+transfer learning. Various pretext tasks have been proposed to utilize properties of
+medical image data (e.g., three dimensionality), which are more relevant to medical image
+analysis than generic ones for natural images. However, previous work rarely paid attention
+to data with anatomy-oriented imaging planes, e.g., standard cardiac magnetic resonance
+imaging views. As these imaging planes are defined according to the anatomy of the imaged
+organ, pretext tasks effectively exploiting this information can pretrain the networks to
+gain knowledge of the organ of interest. In this work, we propose two complementary pretext
+tasks for this group of medical image data based on the spatial relationship of the imaging
+planes. The first is to learn the relative orientation between the imaging planes and is
+implemented as regressing their intersecting lines. The second exploits parallel imaging
+planes to regress their relative slice locations within a stack. Both pretext tasks are
+conceptually straightforward and easy to implement, and can be combined in multitask
+learning for better representation learning. Thorough experiments on two anatomical
+structures (heart and knee) and representative target tasks (semantic segmentation and
+classification) demonstrate that the proposed pretext tasks are effective in pretraining
+deep networks, remarkably boosting performance on the target tasks and outperforming other
+recent approaches.
+
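+
+ A hedged sketch of how the two pretext tasks could be phrased as multitask regression on a
+shared encoder: one head regresses a parameterization of the intersecting line between two
+imaging planes, the other regresses the relative slice location within a stack. The
+backbone, the 3-parameter line encoding, and the equal loss weighting are illustrative
+assumptions rather than the paper's setup.
+
+import torch
+import torch.nn as nn
+
+class PretextModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.encoder = nn.Sequential(                 # stand-in for a 2D CNN backbone
+            nn.Conv2d(1, 16, 3, stride=2, padding=1), nn.ReLU(),
+            nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.ReLU(),
+            nn.AdaptiveAvgPool2d(1), nn.Flatten())
+        self.line_head = nn.Linear(64, 3)             # intersecting line per plane pair
+        self.slice_head = nn.Linear(32, 1)            # relative position in the stack
+
+    def forward(self, view_a, view_b, stack_slice):
+        fa, fb = self.encoder(view_a), self.encoder(view_b)
+        line = self.line_head(torch.cat([fa, fb], dim=1))
+        pos = self.slice_head(self.encoder(stack_slice))
+        return line, pos
+
+if __name__ == "__main__":
+    model = PretextModel()
+    a, b, s = (torch.randn(8, 1, 128, 128) for _ in range(3))
+    line, pos = model(a, b, s)
+    loss = nn.functional.mse_loss(line, torch.randn(8, 3)) \
+         + nn.functional.mse_loss(pos, torch.rand(8, 1))   # combined multitask objective
+    loss.backward()
+    print(float(loss))
+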
+
+ comment: Medical Image Analysis +
+
+
+
+
+ + ☆ PathoTune: Adapting Visual Foundation Model to Pathological Specialists MICCAI 2024 + + +
+ As natural image understanding moves towards the pretrain-finetune era, +research in pathology imaging is concurrently evolving. Despite the predominant +focus on pretraining pathological foundation models, how to adapt foundation +models to downstream tasks is little explored. For downstream adaptation, we +propose the existence of two domain gaps, i.e., the Foundation-Task Gap and the +Task-Instance Gap. To mitigate these gaps, we introduce PathoTune, a framework +designed to efficiently adapt pathological or even visual foundation models to +pathology-specific tasks via multi-modal prompt tuning. The proposed framework +leverages Task-specific Visual Prompts and Task-specific Textual Prompts to +identify task-relevant features, along with Instance-specific Visual Prompts +for encoding single pathological image features. Results across multiple +datasets at both patch-level and WSI-level demonstrate its superior performance +over single-modality prompt tuning approaches. Significantly, PathoTune +facilitates the direct adaptation of natural visual foundation models to +pathological tasks, drastically outperforming pathological foundation models +with simple linear probing. The code will be available upon acceptance. + +
+
+ comment: Submitted to MICCAI 2024 +
+
+
+
+
+ + ☆ CT-Bound: Fast Boundary Estimation From Noisy Images Via Hybrid + Convolution and Transformer Neural Networks + + +
+ We present CT-Bound, a fast boundary estimation method for noisy images using a hybrid
+Convolution and Transformer neural network. The proposed architecture decomposes boundary
+estimation into two tasks: local detection and global regularization of image boundaries.
+It first estimates a parametric representation of boundary structures using only the input
+image within a small receptive field and then refines the boundary structure in the
+parameter domain without accessing the input image. Because of this, part of the network
+can be easily trained using naive, synthetic images and still generalize to real images,
+and the entire architecture is computationally efficient as the boundary refinement is
+non-iterative and not in the image domain. Our experiments show that CT-Bound is 100 times
+faster than the previous most accurate methods, producing comparably accurate, high-quality
+boundary and color maps. We also demonstrate that CT-Bound can produce boundary and color
+maps on real captured images without extra fine-tuning, as well as real-time boundary and
+color map videos at ten frames per second.
+
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ REFRAME: Reflective Surface Real-Time Rendering for Mobile Devices + + +
+ This work tackles the challenging task of achieving real-time novel view +synthesis on various scenes, including highly reflective objects and unbounded +outdoor scenes. Existing real-time rendering methods, especially those based on +meshes, often have subpar performance in modeling surfaces with rich +view-dependent appearances. Our key idea lies in leveraging meshes for +rendering acceleration while incorporating a novel approach to parameterize +view-dependent information. We decompose the color into diffuse and specular, +and model the specular color in the reflected direction based on a neural +environment map. Our experiments demonstrate that our method achieves +comparable reconstruction quality for highly reflective surfaces compared to +state-of-the-art offline methods, while also efficiently enabling real-time +rendering on edge devices such as smartphones. + +
+
+ comment: Project Page:https://xdimlab.github.io/REFRAME/ +
+
+
+
+
+ + ☆ Camera-aware Label Refinement for Unsupervised Person Re-identification + + +
+ Unsupervised person re-identification aims to retrieve images of a specified +person without identity labels. Many recent unsupervised Re-ID approaches adopt +clustering-based methods to measure cross-camera feature similarity to roughly +divide images into clusters. They ignore the feature distribution discrepancy +induced by camera domain gap, resulting in the unavoidable performance +degradation. Camera information is usually available, and the feature +distribution in the single camera usually focuses more on the appearance of the +individual and has less intra-identity variance. Inspired by the observation, +we introduce a \textbf{C}amera-\textbf{A}ware \textbf{L}abel +\textbf{R}efinement~(CALR) framework that reduces camera discrepancy by +clustering intra-camera similarity. Specifically, we employ intra-camera +training to obtain reliable local pseudo labels within each camera, and then +refine global labels generated by inter-camera clustering and train the +discriminative model using more reliable global pseudo labels in a self-paced +manner. Meanwhile, we develop a camera-alignment module to align feature +distributions under different cameras, which could help deal with the camera +variance further. Extensive experiments validate the superiority of our +proposed method over state-of-the-art approaches. The code is accessible at +https://github.com/leeBooMla/CALR. + +
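+
+ The general recipe described above, reliable intra-camera pseudo labels first and refined
+global labels second, can be sketched with scikit-learn's DBSCAN as below. The consistency
+rule (a majority vote of global labels inside each intra-camera cluster) is an illustrative
+stand-in for the paper's self-paced refinement, and all eps values are arbitrary.
+
+import numpy as np
+from sklearn.cluster import DBSCAN
+
+def refine_labels(features, cam_ids, eps_local=0.5, eps_global=0.7):
+    features = np.asarray(features, dtype=np.float64)
+    cam_ids = np.asarray(cam_ids)
+    global_labels = DBSCAN(eps=eps_global, min_samples=2).fit_predict(features)
+    refined = global_labels.copy()
+    for cam in np.unique(cam_ids):
+        idx = np.where(cam_ids == cam)[0]
+        # Intra-camera clustering: less affected by cross-camera appearance shift.
+        local = DBSCAN(eps=eps_local, min_samples=2).fit_predict(features[idx])
+        for c in np.unique(local[local >= 0]):
+            members = idx[local == c]
+            votes = global_labels[members]
+            keep = votes[votes >= 0]
+            majority = np.bincount(keep).argmax() if keep.size else -1
+            refined[members] = majority        # enforce intra-camera agreement
+    return refined
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(0)
+    feats = np.concatenate([rng.normal(0, 0.1, (10, 8)), rng.normal(3, 0.1, (10, 8))])
+    cams = np.tile([0, 1], 10)                 # two cameras, interleaved samples
+    print(refine_labels(feats, cams))
+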
+
+ comment: submitted to IEEE TMM +
+
+
+
+
+ + ☆ If CLIP Could Talk: Understanding Vision-Language Model Representations + Through Their Preferred Concept Descriptions + + +
+ Recent works often assume that Vision-Language Model (VLM) representations +are based on visual attributes like shape. However, it is unclear to what +extent VLMs prioritize this information to represent concepts. We propose +Extract and Explore (EX2), a novel approach to characterize important textual +features for VLMs. EX2 uses reinforcement learning to align a large language +model with VLM preferences and generates descriptions that incorporate the +important features for the VLM. Then, we inspect the descriptions to identify +the features that contribute to VLM representations. We find that spurious +descriptions have a major role in VLM representations despite providing no +helpful information, e.g., Click to enlarge photo of CONCEPT. More importantly, +among informative descriptions, VLMs rely significantly on non-visual +attributes like habitat to represent visual concepts. Also, our analysis +reveals that different VLMs prioritize different attributes in their +representations. Overall, we show that VLMs do not simply match images to scene +descriptions and that non-visual or even spurious descriptions significantly +influence their representations. + +
+
+ comment: Code: https://github.com/BatsResearch/ex2 +
+
+
+
+
+ + ☆ RCBEVDet: Radar-camera Fusion in Bird's Eye View for 3D Object Detection CVPR2024 + + +
+ Three-dimensional object detection is one of the key tasks in autonomous driving. To
+reduce costs in practice, low-cost multi-view cameras for 3D object detection have been
+proposed to replace expensive LiDAR sensors. However, it is difficult to achieve highly
+accurate and robust 3D object detection by relying solely on cameras. An effective solution
+is to combine multi-view cameras with an economical millimeter-wave radar sensor to achieve
+more reliable multi-modal 3D object detection. In this paper, we introduce RCBEVDet, a
+radar-camera fusion 3D object detection method in the bird's eye view (BEV). Specifically,
+we first design RadarBEVNet for radar BEV feature extraction. RadarBEVNet consists of a
+dual-stream radar backbone and a Radar Cross-Section (RCS) aware BEV encoder. In the
+dual-stream radar backbone, a point-based encoder and a transformer-based encoder are
+proposed to extract radar features, with an injection and extraction module to facilitate
+communication between the two encoders. The RCS-aware BEV encoder uses the RCS as an
+object-size prior when scattering point features into BEV. Besides, we present a
+Cross-Attention Multi-layer Fusion module to automatically align the multi-modal BEV
+features from radar and camera with a deformable attention mechanism, and then fuse the
+features with channel and spatial fusion layers. Experimental results show that RCBEVDet
+achieves new state-of-the-art radar-camera fusion results on the nuScenes and
+view-of-delft (VoD) 3D object detection benchmarks. Furthermore, RCBEVDet achieves better
+3D detection results than all real-time camera-only and radar-camera 3D object detectors
+with a faster inference speed at 21~28 FPS. The source code will be released at
+https://github.com/VDIGPKU/RCBEVDet.
+
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Producing and Leveraging Online Map Uncertainty in Trajectory Prediction CVPR 2024 + + +
+ High-definition (HD) maps have played an integral role in the development of +modern autonomous vehicle (AV) stacks, albeit with high associated labeling and +maintenance costs. As a result, many recent works have proposed methods for +estimating HD maps online from sensor data, enabling AVs to operate outside of +previously-mapped regions. However, current online map estimation approaches +are developed in isolation of their downstream tasks, complicating their +integration in AV stacks. In particular, they do not produce uncertainty or +confidence estimates. In this work, we extend multiple state-of-the-art online +map estimation methods to additionally estimate uncertainty and show how this +enables more tightly integrating online mapping with trajectory forecasting. In +doing so, we find that incorporating uncertainty yields up to 50% faster +training convergence and up to 15% better prediction performance on the +real-world nuScenes driving dataset. + +
+
+ comment: 14 pages, 14 figures, 6 tables. CVPR 2024 +
+
+
+
+
+ + ☆ Real-time Neuron Segmentation for Voltage Imaging + + +
+ In voltage imaging, where the membrane potentials of individual neurons are recorded at
+hundreds to thousands of frames per second using fluorescence microscopy, data processing
+presents a challenge. Even a fraction of a minute of recording with a limited image size
+yields gigabytes of video data consisting of tens of thousands of frames, which can be
+time-consuming to process. Moreover, millisecond-level short exposures lead to noisy video
+frames, obscuring neuron footprints, especially in deep-brain samples where noisy signals
+are buried in background fluorescence. To address this challenge, we propose a fast neuron
+segmentation method able to detect multiple, potentially overlapping, spiking neurons from
+noisy video frames, and implement a data processing pipeline incorporating the proposed
+segmentation method along with GPU-accelerated motion correction. By testing on existing
+datasets as well as on new datasets we introduce, we show that our pipeline extracts neuron
+footprints that agree well with human annotation even from cluttered datasets, and
+demonstrate real-time processing of voltage imaging data on a single desktop computer for
+the first time.
+
+
+
+
+
+ + ☆ DOCTR: Disentangled Object-Centric Transformer for Point Scene + Understanding + + +
+ Point scene understanding is a challenging task that processes real-world scene point
+clouds, aiming to segment each object, estimate its pose, and reconstruct its mesh
+simultaneously. The recent state-of-the-art method first segments each object and then
+processes the objects independently with multiple stages for the different sub-tasks. This
+leads to a complex pipeline that is hard to optimize and makes it difficult to leverage the
+relationship constraints between multiple objects. In this work, we propose a novel
+Disentangled Object-Centric TRansformer (DOCTR) that explores object-centric
+representations to facilitate learning with multiple objects for the multiple sub-tasks in
+a unified manner. Each object is represented as a query, and a Transformer decoder is
+adapted to iteratively optimize all the queries involving their relationships. In
+particular, we introduce a semantic-geometry disentangled query (SGDQ) design that enables
+the query features to attend separately to semantic information and geometric information
+relevant to the corresponding sub-tasks. A hybrid bipartite matching module is employed to
+make full use of the supervision from all the sub-tasks during training. Qualitative and
+quantitative experimental results demonstrate that our method achieves state-of-the-art
+performance on the challenging ScanNet dataset. Code is available at
+https://github.com/SAITPublic/DOCTR.
+
+
+
+
+
+ + ☆ Benchmarks and Challenges in Pose Estimation for Egocentric Hand + Interactions with Objects + + +
+ We interact with the world with our hands and see it through our own +(egocentric) perspective. A holistic 3D understanding of such interactions from +egocentric views is important for tasks in robotics, AR/VR, action recognition +and motion generation. Accurately reconstructing such interactions in 3D is +challenging due to heavy occlusion, viewpoint bias, camera distortion, and +motion blur from the head movement. To this end, we designed the HANDS23 +challenge based on the AssemblyHands and ARCTIC datasets with carefully +designed training and testing splits. Based on the results of the top submitted +methods and more recent baselines on the leaderboards, we perform a thorough +analysis on 3D hand(-object) reconstruction tasks. Our analysis demonstrates +the effectiveness of addressing distortion specific to egocentric cameras, +adopting high-capacity transformers to learn complex hand-object interactions, +and fusing predictions from different views. Our study further reveals +challenging scenarios intractable with state-of-the-art methods, such as fast +hand motion, object reconstruction from narrow egocentric views, and close +contact between two hands and objects. Our efforts will enrich the community's +knowledge foundation and facilitate future hand studies on egocentric +hand-object interactions. + +
+
+
+
+
+ + ☆ Enhancing Visual Place Recognition via Fast and Slow Adaptive Biasing in + Event Cameras + + +
+ Event cameras are increasingly popular in robotics due to their beneficial +features, such as low latency, energy efficiency, and high dynamic range. +Nevertheless, their downstream task performance is greatly influenced by the +optimization of bias parameters. These parameters, for instance, regulate the +necessary change in light intensity to trigger an event, which in turn depends +on factors such as the environment lighting and camera motion. This paper +introduces feedback control algorithms that automatically tune the bias +parameters through two interacting methods: 1) An immediate, on-the-fly fast +adaptation of the refractory period, which sets the minimum interval between +consecutive events, and 2) if the event rate exceeds the specified bounds even +after changing the refractory period repeatedly, the controller adapts the +pixel bandwidth and event thresholds, which stabilizes after a short period of +noise events across all pixels (slow adaptation). Our evaluation focuses on the +visual place recognition task, where incoming query images are compared to a +given reference database. We conducted comprehensive evaluations of our +algorithms' adaptive feedback control in real-time. To do so, we collected the +QCR-Fast-and-Slow dataset that contains DAVIS346 event camera streams from 366 +repeated traversals of a Scout Mini robot navigating through a 100 meter long +indoor lab setting (totaling over 35km distance traveled) in varying brightness +conditions with ground truth location information. Our proposed feedback +controllers result in superior performance when compared to the standard bias +settings and prior feedback control methods. Our findings also detail the +impact of bias adjustments on task performance and feature ablation studies on +the fast and slow adaptation mechanisms. + +
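+
+ The two time scales described above can be summarized with a small, purely illustrative
+Python controller: a fast loop rescales the refractory period whenever the event rate
+leaves a target band, and a slow loop touches the threshold and bandwidth biases only after
+several failed fast corrections. Gains, bounds, and bias names are assumptions, not the
+DAVIS346 driver's actual parameters.
+
+from dataclasses import dataclass
+
+@dataclass
+class Biases:
+    refractory_us: float = 100.0   # minimum interval between events per pixel
+    threshold: float = 0.2         # contrast change needed to trigger an event
+    bandwidth: float = 1.0         # photoreceptor bandwidth scale
+
+def control_step(biases, event_rate, low=1e5, high=1e6, fast_fail_count=0):
+    """One controller update; returns (biases, fast_fail_count)."""
+    if low <= event_rate <= high:
+        return biases, 0                                  # in band: nothing to do
+    # Fast adaptation: lengthen or shorten the refractory period.
+    scale = 1.5 if event_rate > high else 1 / 1.5
+    biases.refractory_us = min(max(biases.refractory_us * scale, 10.0), 10000.0)
+    fast_fail_count += 1
+    # Slow adaptation: after several failed fast corrections, also move the
+    # event threshold and pixel bandwidth.
+    if fast_fail_count >= 5:
+        if event_rate > high:
+            biases.threshold *= 1.2
+            biases.bandwidth *= 0.8
+        else:
+            biases.threshold /= 1.2
+            biases.bandwidth /= 0.8
+        fast_fail_count = 0
+    return biases, fast_fail_count
+
+if __name__ == "__main__":
+    b, fails = Biases(), 0
+    for rate in [5e6, 4e6, 3e6, 2.5e6, 2e6, 1.5e6, 8e5]:  # simulated event rates
+        b, fails = control_step(b, rate, fast_fail_count=fails)
+        print(f"rate={rate:.0e}  refractory={b.refractory_us:.0f}us "
+              f"threshold={b.threshold:.2f}")
+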
+
+ comment: 8 pages, 9 figures, paper under review +
+
+
+
+
+ + ☆ Refining Text-to-Image Generation: Towards Accurate Training-Free + Glyph-Enhanced Image Generation + + +
+ Over the past few years, Text-to-Image (T2I) generation approaches based on +diffusion models have gained significant attention. However, vanilla diffusion +models often suffer from spelling inaccuracies in the text displayed within the +generated images. The capability to generate visual text is crucial, offering +both academic interest and a wide range of practical applications. To produce +accurate visual text images, state-of-the-art techniques adopt a +glyph-controlled image generation approach, consisting of a text layout +generator followed by an image generator that is conditioned on the generated +text layout. Nevertheless, our study reveals that these models still face three +primary challenges, prompting us to develop a testbed to facilitate future +research. We introduce a benchmark, LenCom-Eval, specifically designed for +testing models' capability in generating images with Lengthy and Complex visual +text. Subsequently, we introduce a training-free framework to enhance the +two-stage generation approaches. We examine the effectiveness of our approach +on both LenCom-Eval and MARIO-Eval benchmarks and demonstrate notable +improvements across a range of evaluation metrics, including CLIPScore, OCR +precision, recall, F1 score, accuracy, and edit distance scores. For instance, +our proposed framework improves the backbone model, TextDiffuser, by more than +23\% and 13.5\% in terms of OCR word F1 on LenCom-Eval and MARIO-Eval, +respectively. Our work makes a unique contribution to the field by focusing on +generating images with long and rare text sequences, a niche previously +unexplored by existing literature + +
+
+
+
+
+ + ☆ Unsupervised Template-assisted Point Cloud Shape Correspondence Network CVPR2024 + + +
+ Unsupervised point cloud shape correspondence aims to establish point-wise +correspondences between source and target point clouds. Existing methods obtain +correspondences directly by computing point-wise feature similarity between +point clouds. However, non-rigid objects possess strong deformability and +unusual shapes, making it a longstanding challenge to directly establish +correspondences between point clouds with unconventional shapes. To address +this challenge, we propose an unsupervised Template-Assisted point cloud shape +correspondence Network, termed TANet, including a template generation module +and a template assistance module. The proposed TANet enjoys several merits. +Firstly, the template generation module establishes a set of learnable +templates with explicit structures. Secondly, we introduce a template +assistance module that extensively leverages the generated templates to +establish more accurate shape correspondences from multiple perspectives. +Extensive experiments on four human and animal datasets demonstrate that TANet +achieves favorable performance against state-of-the-art methods. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ Spike-NeRF: Neural Radiance Field Based On Spike Camera ICME2024 + + +
+ As a neuromorphic sensor with high temporal resolution, the spike camera offers notable
+advantages over traditional cameras in high-speed vision applications such as high-speed
+optical estimation, depth estimation, and object tracking. Inspired by the success of the
+spike camera, we propose Spike-NeRF, the first Neural Radiance Field derived from spike
+data, to achieve 3D reconstruction and novel viewpoint synthesis of high-speed scenes.
+Instead of the simultaneously captured multi-view images used by NeRF, the inputs of
+Spike-NeRF are continuous spike streams captured by a moving spike camera within a very
+short time. To reconstruct a correct and stable 3D scene from high-frequency but unstable
+spike data, we devise spike masks along with a distinctive loss function. We evaluate our
+method qualitatively and numerically on several challenging synthetic scenes generated by
+Blender with the spike camera simulator. Our results demonstrate that Spike-NeRF produces
+more visually appealing results than the existing methods and our proposed baseline in
+high-speed scenes. Our code and data will be released soon.
+
+
+ comment: This paper is accepted by ICME2024 +
+
+
+
+
+ + ☆ A Survey on Long Video Generation: Challenges, Methods, and Prospects + + +
+ Video generation is a rapidly advancing research area, garnering significant attention due
+to its broad range of applications. One critical aspect of this field is the generation of
+long-duration videos, which presents unique challenges and opportunities. This paper
+presents the first survey of recent advancements in long video generation and summarises
+them into two key paradigms: divide-and-conquer and temporal autoregressive.
+ We delve into the common models employed in each paradigm, including aspects of network
+design and conditioning techniques. Furthermore, we offer a comprehensive overview and
+classification of the datasets and evaluation metrics, which are crucial for advancing long
+video generation research. Concluding with a summary of existing studies, we also discuss
+the emerging challenges and future directions in this dynamic field. We hope that this
+survey will serve as an essential reference for researchers and practitioners in the realm
+of long video generation.
+
+
+
+
+
+ + ☆ Ensemble Adversarial Defense via Integration of Multiple Dispersed Low + Curvature Models IJCNN + + +
+ The integration of an ensemble of deep learning models has been extensively +explored to enhance defense against adversarial attacks. The diversity among +sub-models increases the attack cost required to deceive the majority of the +ensemble, thereby improving the adversarial robustness. While existing +approaches mainly center on increasing diversity in feature representations or +dispersion of first-order gradients with respect to input, the limited +correlation between these diversity metrics and adversarial robustness +constrains the performance of ensemble adversarial defense. In this work, we +aim to enhance ensemble diversity by reducing attack transferability. We +identify second-order gradients, which depict the loss curvature, as a key +factor in adversarial robustness. Computing the Hessian matrix involved in +second-order gradients is computationally expensive. To address this, we +approximate the Hessian-vector product using differential approximation. Given +that low curvature provides better robustness, our ensemble model was designed +to consider the influence of curvature among different sub-models. We introduce +a novel regularizer to train multiple more-diverse low-curvature network +models. Extensive experiments across various datasets demonstrate that our +ensemble model exhibits superior robustness against a range of attacks, +underscoring the effectiveness of our approach. + +
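+
+ The differential (finite-difference) approximation of the Hessian-vector product mentioned
+above is standard and easy to sketch: Hv ~ (grad L(x + h v) - grad L(x)) / h. The PyTorch
+snippet below computes it with respect to the input and penalizes its norm as a curvature
+term on a toy classifier; the exact regularizer and the direction v used in the paper may
+differ.
+
+import torch
+import torch.nn as nn
+
+def hvp_finite_difference(model, loss_fn, x, y, v, h=1e-3):
+    """Approximate H @ v, with H the Hessian of the loss w.r.t. the input x:
+       Hv ~= (grad L(x + h*v) - grad L(x)) / h."""
+    def input_grad(inp):
+        inp = inp.clone().requires_grad_(True)
+        loss = loss_fn(model(inp), y)
+        # create_graph=True keeps the result differentiable w.r.t. parameters,
+        # so the curvature term can act as a training regularizer.
+        return torch.autograd.grad(loss, inp, create_graph=True)[0]
+    return (input_grad(x + h * v) - input_grad(x)) / h
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    model = nn.Sequential(nn.Linear(20, 32), nn.ReLU(), nn.Linear(32, 10))
+    x, y = torch.randn(16, 20), torch.randint(0, 10, (16,))
+    ce = nn.CrossEntropyLoss()
+
+    v = torch.randn_like(x)
+    v = v / v.norm()                                # random unit direction
+    hv = hvp_finite_difference(model, ce, x, y, v)
+    curvature_penalty = hv.norm()                   # small norm ~ flatter loss surface
+    loss = ce(model(x), y) + 0.1 * curvature_penalty
+    loss.backward()
+    print(float(curvature_penalty))
+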
+
+ comment: Accepted to The 2024 International Joint Conference on Neural + Networks (IJCNN) +
+
+
+
+
+ + ☆ ASDF: Assembly State Detection Utilizing Late Fusion by Integrating 6D + Pose Estimation + + +
+ In medical and industrial domains, providing guidance for assembly processes is critical
+to ensure efficiency and safety. Errors in assembly can lead to significant consequences
+such as extended surgery times and prolonged manufacturing or maintenance times in
+industry. Assembly scenarios can benefit from in-situ AR visualization to provide guidance,
+reduce assembly times, and minimize errors. To enable in-situ visualization, 6D pose
+estimation can be leveraged. Existing 6D pose estimation techniques primarily focus on
+individual objects and static captures. However, assembly scenarios involve various
+dynamics, including occlusion during assembly and changes in the appearance of the assembly
+objects. Existing work combining object detection/6D pose estimation and assembly state
+detection either focuses on pure deep learning-based approaches or limits assembly state
+detection to building blocks. To address the challenges of 6D pose estimation in
+combination with assembly state detection, our approach ASDF builds upon the strengths of
+YOLOv8, a real-time capable object detection framework. We extend this framework, refine
+the object pose, and fuse pose knowledge with network-detected pose information. Utilizing
+late fusion in our Pose2State module results in refined 6D pose estimation and assembly
+state detection. By combining both pose and state information, our Pose2State module
+predicts the final assembly state with precision. Our evaluation on our ASDF dataset shows
+that our Pose2State module leads to improved assembly state detection and that the improved
+assembly state further leads to more robust 6D pose estimation. Moreover, on the GBOT
+dataset, we outperform the pure deep learning-based network and even outperform the hybrid
+and pure tracking-based approaches.
+
+
+
+
+
+ + ☆ Multi-attention Associate Prediction Network for Visual Tracking + + +
+ Classification-regression prediction networks have achieved impressive success in several
+modern deep trackers. However, there is an inherent difference between classification and
+regression tasks, so they place different, even opposite, demands on feature matching.
+Existing models often ignore this key issue and employ a single unified matching block for
+both task branches, degrading decision quality. Besides, these models also struggle with
+the decision misalignment problem. In this paper, we propose a multi-attention associate
+prediction network (MAPNet) to tackle the above problems. Concretely, two novel matchers,
+i.e., a category-aware matcher and a spatial-aware matcher, are first designed for feature
+comparison by organically integrating self, cross, channel, and spatial attentions. They
+are capable of fully capturing the category-related semantics for classification and the
+local spatial contexts for regression, respectively. Then, we present a dual alignment
+module to enhance the correspondences between the two branches, which helps find the
+optimal tracking solution. Finally, we describe a Siamese tracker built upon the proposed
+prediction network, which achieves leading performance on five tracking benchmarks,
+consisting of LaSOT, TrackingNet, GOT-10k, TNL2k, and UAV123, and surpasses other
+state-of-the-art approaches.
+
+
+
+
+
+ + ☆ Text-IF: Leveraging Semantic Text Guidance for Degradation-Aware and + Interactive Image Fusion CVPR 2024 + + +
+ Image fusion aims to combine information from different source images to +create a comprehensively representative image. Existing fusion methods are +typically helpless in dealing with degradations in low-quality source images +and non-interactive to multiple subjective and objective needs. To solve them, +we introduce a novel approach that leverages semantic text guidance image +fusion model for degradation-aware and interactive image fusion task, termed as +Text-IF. It innovatively extends the classical image fusion to the text guided +image fusion along with the ability to harmoniously address the degradation and +interaction issues during fusion. Through the text semantic encoder and +semantic interaction fusion decoder, Text-IF is accessible to the all-in-one +infrared and visible image degradation-aware processing and the interactive +flexible fusion outcomes. In this way, Text-IF achieves not only multi-modal +image fusion, but also multi-modal information fusion. Extensive experiments +prove that our proposed text guided image fusion strategy has obvious +advantages over SOTA methods in the image fusion performance and degradation +treatment. The code is available at https://github.com/XunpengYi/Text-IF. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Dia-LLaMA: Towards Large Language Model-driven CT Report Generation + + +
+ Medical report generation has achieved remarkable advancements yet still faces several
+challenges. First, the inherent imbalance in the distribution of normal and abnormal cases
+may lead models to exhibit a biased focus on normal samples, resulting in unreliable
+diagnoses. Second, the frequent occurrence of common template sentences in the reports may
+overwhelm critical abnormal information. Moreover, existing works focus on 2D chest X-rays,
+leaving CT report generation underexplored due to the high-dimensional nature of CT images
+and the limited availability of CT-report pairs. Recently, LLMs have shown a great ability
+to generate reliable answers with appropriate prompts, which sheds light on addressing the
+aforementioned challenges. In this paper, we propose Dia-LLaMA, a framework that adapts
+LLaMA2-7B for CT report generation by incorporating diagnostic information as guidance
+prompts. Considering the high dimensionality of CT, we leverage a pre-trained ViT3D with a
+perceiver to extract the visual information. To tailor the LLM for report generation and
+emphasize abnormality, we extract additional diagnostic information by referring to a
+disease prototype memory bank, which is updated during training to capture common disease
+representations. Furthermore, we introduce disease-aware attention to enable the model to
+adjust attention for different diseases. Experiments on the chest CT dataset demonstrate
+that our proposed method outperforms previous methods and achieves state-of-the-art results
+on both clinical efficacy and natural language generation metrics. The code will be made
+publicly available.
+
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Synthesize Step-by-Step: Tools, Templates and LLMs as Data Generators + for Reasoning-Based Chart VQA CVPR 2024 + + +
+ Understanding data visualizations like charts and plots requires reasoning +about both visual elements and numerics. Although strong in extractive +questions, current chart visual question answering (chart VQA) models suffer on +complex reasoning questions. In this work, we address the lack of reasoning +ability by data augmentation. We leverage Large Language Models (LLMs), which +have shown to have strong reasoning ability, as an automatic data annotator +that generates question-answer annotations for chart images. The key innovation +in our method lies in the Synthesize Step-by-Step strategy: our LLM-based data +generator learns to decompose the complex question into step-by-step +sub-questions (rationales), which are then used to derive the final answer +using external tools, i.e. Python. This step-wise generation procedure is +trained on synthetic data generated using a template-based QA generation +pipeline. Experimental results highlight the significance of the proposed +step-by-step generation. By training with the LLM-augmented data (LAMENDA), we +significantly enhance the chart VQA models, achieving the state-of-the-art +accuracy on the ChartQA and PlotQA datasets. In particular, our approach +improves the accuracy of the previous state-of-the-art approach from 38% to 54% +on the human-written questions in the ChartQA dataset, which needs strong +reasoning. We hope our work underscores the potential of synthetic data and +encourages further exploration of data augmentation using LLMs for +reasoning-heavy tasks. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Residual Dense Swin Transformer for Continuous Depth-Independent + Ultrasound Imaging ICASSP2024 + + +
+ Ultrasound imaging is crucial for evaluating organ morphology and function, +yet depth adjustment can degrade image quality and field-of-view, presenting a +depth-dependent dilemma. Traditional interpolation-based zoom-in techniques +often sacrifice detail and introduce artifacts. Motivated by the potential of +arbitrary-scale super-resolution to naturally address these inherent +challenges, we present the Residual Dense Swin Transformer Network (RDSTN), +designed to capture the non-local characteristics and long-range dependencies +intrinsic to ultrasound images. It comprises a linear embedding module for +feature enhancement, an encoder with shifted-window attention for modeling +non-locality, and an MLP decoder for continuous detail reconstruction. This +strategy streamlines balancing image quality and field-of-view, which offers +superior textures over traditional methods. Experimentally, RDSTN outperforms +existing approaches while requiring fewer parameters. In conclusion, RDSTN +shows promising potential for ultrasound image enhancement by overcoming the +limitations of conventional interpolation-based methods and achieving +depth-independent imaging. + +
+
+ comment: Accepted by ICASSP2024, https://ieeexplore.ieee.org/document/10447712 +
+
+
+
+
+ + ☆ FlashEval: Towards Fast and Accurate Evaluation of Text-to-image + Diffusion Generative Models CVPR 2024 + + +
+ In recent years, there has been significant progress in the development of +text-to-image generative models. Evaluating the quality of the generative +models is one essential step in the development process. Unfortunately, the +evaluation process could consume a significant amount of computational +resources, making the required periodic evaluation of model performance (e.g., +monitoring training progress) impractical. Therefore, we seek to improve the +evaluation efficiency by selecting the representative subset of the text-image +dataset. We systematically investigate the design choices, including the +selection criteria (textural features or image-based metrics) and the selection +granularity (prompt-level or set-level). We find that the insights from prior +work on subset selection for training data do not generalize to this problem, +and we propose FlashEval, an iterative search algorithm tailored to evaluation +data selection. We demonstrate the effectiveness of FlashEval on ranking +diffusion models with various configurations, including architectures, +quantization levels, and sampler schedules on COCO and DiffusionDB datasets. +Our searched 50-item subset could achieve comparable evaluation quality to the +randomly sampled 500-item subset for COCO annotations on unseen models, +achieving a 10x evaluation speedup. We release the condensed subset of these +commonly used datasets to help facilitate diffusion algorithm design and +evaluation, and open-source FlashEval as a tool for condensing future datasets, +accessible at https://github.com/thu-nics/FlashEval. + +
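+
+ The objective behind evaluation-set condensation can be sketched independently of the
+specific search algorithm: pick a small subset of prompts whose induced model ranking
+agrees (Kendall tau) with the ranking obtained on the full set. The random search below,
+run on synthetic per-prompt scores, is a deliberately simple stand-in for FlashEval's
+iterative search.
+
+import numpy as np
+from scipy.stats import kendalltau
+
+def subset_quality(score_matrix, subset):
+    """Kendall tau between model rankings on the subset vs. on the full set."""
+    tau, _ = kendalltau(score_matrix.mean(axis=1), score_matrix[:, subset].mean(axis=1))
+    return tau
+
+def random_search(score_matrix, k=50, iters=2000, seed=0):
+    rng = np.random.default_rng(seed)
+    n_items = score_matrix.shape[1]
+    best_subset, best_tau = None, -1.0
+    for _ in range(iters):
+        subset = rng.choice(n_items, size=k, replace=False)
+        tau = subset_quality(score_matrix, subset)
+        if tau > best_tau:
+            best_subset, best_tau = subset, tau
+    return best_subset, best_tau
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(1)
+    # Fake per-prompt scores for 8 candidate models on 500 prompts.
+    scores = rng.normal(0, 1, (8, 1)) + rng.normal(0, 0.5, (8, 500))
+    subset, tau = random_search(scores, k=50)
+    print(f"best Kendall tau with 50/500 prompts: {tau:.3f}")
+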
+
+ comment: The paper is accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Elite360D: Towards Efficient 360 Depth Estimation via Semantic- and + Distance-Aware Bi-Projection Fusion CVPR2024 + + +
+ 360 depth estimation has recently received great attention for 3D +reconstruction owing to its omnidirectional field of view (FoV). Recent +approaches are predominantly focused on cross-projection fusion with +geometry-based re-projection: they fuse 360 images with equirectangular +projection (ERP) and another projection type, e.g., cubemap projection to +estimate depth with the ERP format. However, these methods suffer from 1) +limited local receptive fields, making it hardly possible to capture large FoV +scenes, and 2) prohibitive computational cost, caused by the complex +cross-projection fusion module design. In this paper, we propose Elite360D, a +novel framework that inputs the ERP image and icosahedron projection (ICOSAP) +point set, which is undistorted and spatially continuous. Elite360D is superior +in its capacity in learning a representation from a local-with-global +perspective. With a flexible ERP image encoder, it includes an ICOSAP point +encoder, and a Bi-projection Bi-attention Fusion (B2F) module (totally ~1M +parameters). Specifically, the ERP image encoder can take various perspective +image-trained backbones (e.g., ResNet, Transformer) to extract local features. +The point encoder extracts the global features from the ICOSAP. Then, the B2F +module captures the semantic- and distance-aware dependencies between each +pixel of the ERP feature and the entire ICOSAP feature set. Without specific +backbone design and obvious computational cost increase, Elite360D outperforms +the prior arts on several benchmark datasets. + +
+
+ comment: 8 pages, accepted by CVPR2024 +
+
+
+
+
+ + ☆ GoodSAM: Bridging Domain and Capacity Gaps via Segment Anything Model + for Distortion-aware Panoramic Semantic Segmentation CVPR 2024 + + +
+ This paper tackles a novel yet challenging problem: how to transfer knowledge +from the emerging Segment Anything Model (SAM) -- which reveals impressive +zero-shot instance segmentation capacity -- to learn a compact panoramic +semantic segmentation model, i.e., student, without requiring any labeled data. +This poses considerable challenges due to SAM's inability to provide semantic +labels and the large capacity gap between SAM and the student. To this end, we +propose a novel framework, called GoodSAM, that introduces a teacher assistant +(TA) to provide semantic information, integrated with SAM to generate ensemble +logits to achieve knowledge transfer. Specifically, we propose a +Distortion-Aware Rectification (DAR) module that first addresses the distortion +problem of panoramic images by imposing prediction-level consistency and +boundary enhancement. This subtly enhances TA's prediction capacity on +panoramic images. DAR then incorporates a cross-task complementary fusion block +to adaptively merge the predictions of SAM and TA to obtain more reliable +ensemble logits. Moreover, we introduce a Multi-level Knowledge Adaptation +(MKA) module to efficiently transfer the multi-level feature knowledge from TA +and ensemble logits to learn a compact student model. Extensive experiments on +two benchmarks show that our GoodSAM achieves a remarkable +3.75\% mIoU +improvement over the state-of-the-art (SOTA) domain adaptation methods. Also, +our most lightweight model achieves comparable performance to the SOTA methods +with only 3.7M parameters. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Distilling Semantic Priors from SAM to Efficient Image Restoration + Models + + +
+ In image restoration (IR), leveraging semantic priors from segmentation models has been a
+common approach to improve performance. The recent segment anything model (SAM) has emerged
+as a powerful tool for extracting advanced semantic priors to enhance IR tasks. However,
+the computational cost of SAM is prohibitive for IR compared to existing smaller IR models,
+and incorporating SAM to extract semantic priors considerably hampers inference efficiency.
+To address this issue, we propose a general framework to distill SAM's semantic knowledge
+to boost existing IR models without interfering with their inference process. Specifically,
+our proposed framework consists of a semantic priors fusion (SPF) scheme and a semantic
+priors distillation (SPD) scheme. SPF fuses two kinds of information, the restored image
+predicted by the original IR model and the semantic mask predicted by SAM, to produce a
+refined restored image. SPD adopts a self-distillation scheme to distill the fused semantic
+priors and boost the performance of the original IR models. Additionally, we design a
+semantic-guided relation (SGR) module for SPD, which ensures consistency of the semantic
+feature representation space to fully distill the priors. We demonstrate the effectiveness
+of our framework across multiple IR models and tasks, including deraining, deblurring, and
+denoising.
+
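+
+ A hedged sketch of the two schemes as described: an SPF-like block fuses the IR model's
+restored image with a SAM-style semantic mask into a refined target, and an SPD-like
+self-distillation loss pulls the IR model's output toward that target so SAM is not needed
+at inference time. The fusion block, losses, and weights are illustrative stand-ins, and
+the SGR module is omitted.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class SemanticPriorFusion(nn.Module):
+    """Fuse a restored image (3 ch) with a semantic mask (1 ch) into a refined image."""
+    def __init__(self):
+        super().__init__()
+        self.fuse = nn.Sequential(
+            nn.Conv2d(4, 16, 3, padding=1), nn.ReLU(),
+            nn.Conv2d(16, 3, 3, padding=1))
+    def forward(self, restored, sem_mask):
+        return restored + self.fuse(torch.cat([restored, sem_mask], dim=1))
+
+def distillation_step(ir_model, spf, degraded, sem_mask, clean):
+    restored = ir_model(degraded)
+    with torch.no_grad():
+        refined = spf(restored.detach(), sem_mask)    # teacher signal carrying SAM priors
+    task_loss = F.l1_loss(restored, clean)
+    distill_loss = F.l1_loss(restored, refined)       # pull output toward fused priors
+    return task_loss + 0.5 * distill_loss
+
+if __name__ == "__main__":
+    ir_model = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
+                             nn.Conv2d(16, 3, 3, padding=1))   # tiny stand-in IR network
+    spf = SemanticPriorFusion()
+    degraded, clean = torch.rand(2, 3, 64, 64), torch.rand(2, 3, 64, 64)
+    sem_mask = (torch.rand(2, 1, 64, 64) > 0.5).float()        # stand-in for a SAM mask
+    loss = distillation_step(ir_model, spf, degraded, sem_mask, clean)
+    loss.backward()
+    print(float(loss))
+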
+
+
+
+
+ + ☆ Generating Potent Poisons and Backdoors from Scratch with Guided + Diffusion + + +
+ Modern neural networks are often trained on massive datasets that are web +scraped with minimal human inspection. As a result of this insecure curation +pipeline, an adversary can poison or backdoor the resulting model by uploading +malicious data to the internet and waiting for a victim to scrape and train on +it. Existing approaches for creating poisons and backdoors start with randomly +sampled clean data, called base samples, and then modify those samples to craft +poisons. However, some base samples may be significantly more amenable to +poisoning than others. As a result, we may be able to craft more potent poisons +by carefully choosing the base samples. In this work, we use guided diffusion +to synthesize base samples from scratch that lead to significantly more potent +poisons and backdoors than previous state-of-the-art attacks. Our Guided +Diffusion Poisoning (GDP) base samples can be combined with any downstream +poisoning or backdoor attack to boost its effectiveness. Our implementation +code is publicly available at: https://github.com/hsouri/GDP . + +
+
+
+
+
+ + ☆ RSTAR: Rotational Streak Artifact Reduction in 4D CBCT using Separable + and Circular Convolutions + + +
+ Four-dimensional cone-beam computed tomography (4D CBCT) provides
+respiration-resolved images and can be used for image-guided radiation therapy.
+However, the ability to reveal respiratory motion comes at the cost of image
+artifacts. As raw projection data are sorted into multiple respiratory phases,
+there is a limited number of cone-beam projections available for image
+reconstruction. Consequently, the 4D CBCT images are covered by severe streak
+artifacts. Although several deep learning-based methods have been proposed to
+address this issue, most algorithms employ ordinary network models, neglecting
+the intrinsic structural prior within 4D CBCT images. In this paper, we first
+explore the origin and appearance of streak artifacts in 4D CBCT images.
+Specifically, we find that streak artifacts exhibit a periodic rotational
+motion along with the patient's respiration. This unique motion pattern
+inspires us to distinguish the artifacts from the desired anatomical
+structures in the spatiotemporal domain. Thereafter, we propose a
+spatiotemporal neural network named RSTAR-Net with separable and circular
+convolutions for Rotational Streak Artifact Reduction. The specially designed
+model effectively encodes dynamic image features, facilitating the recovery of
+4D CBCT images. Moreover, RSTAR-Net is also lightweight and computationally
+efficient. Extensive experiments substantiate the effectiveness of our proposed
+method, and RSTAR-Net shows superior performance to the comparison methods.
+
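+ The "separable and circular" ingredient above can be illustrated with a few
+lines of PyTorch; this is only a sketch under the assumption that circular
+padding is applied along the respiratory-phase axis, not the RSTAR-Net
+architecture itself:
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class SeparableCircularConv(nn.Module):
+    """Spatial 1x3x3 conv followed by a temporal 3x1x1 conv with circular padding."""
+
+    def __init__(self, channels: int = 16):
+        super().__init__()
+        self.spatial = nn.Conv3d(channels, channels, kernel_size=(1, 3, 3), padding=(0, 1, 1))
+        self.temporal = nn.Conv3d(channels, channels, kernel_size=(3, 1, 1), padding=0)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # x: (B, C, T, H, W), where T indexes respiratory phases arranged on a cycle
+        x = self.spatial(x)
+        # circular padding closes the respiratory cycle: the last phase neighbours the first
+        x = F.pad(x, (0, 0, 0, 0, 1, 1), mode="circular")
+        return self.temporal(x)
+
+x = torch.randn(1, 16, 10, 32, 32)
+print(SeparableCircularConv()(x).shape)  # torch.Size([1, 16, 10, 32, 32])
+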
+
+
+
+
+ + ☆ ChebMixer: Efficient Graph Representation Learning with MLP Mixer + + +
+ Graph neural networks have achieved remarkable success in learning graph
+representations, especially the graph Transformer, which has recently shown
+superior performance on various graph mining tasks. However, the graph
+Transformer generally treats nodes as tokens, which results in quadratic
+complexity regarding the number of nodes during self-attention computation.
+The graph MLP Mixer addresses this challenge by using the efficient MLP Mixer
+technique from computer vision. However, the time-consuming process of
+extracting graph tokens limits its performance. In this paper, we present a
+novel architecture named ChebMixer, a new graph MLP Mixer that uses fast
+Chebyshev polynomial-based spectral filtering to extract a sequence of tokens.
+Firstly, we produce multiscale representations of graph nodes via fast
+Chebyshev polynomial-based spectral filtering. Next, we consider each node's
+multiscale representations as a sequence of tokens and refine the node
+representation with an effective MLP Mixer. Finally, we aggregate the
+multiscale representations of nodes through Chebyshev interpolation. Owing to
+the powerful representation capabilities and fast computational properties of
+the MLP Mixer, we can quickly extract more informative node representations to
+improve the performance of downstream tasks. The experimental results
+demonstrate significant improvements in a variety of scenarios ranging from
+graph node classification to medical image segmentation.
+
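+ The token-extraction step described above rests on the standard Chebyshev
+recurrence T_k(x) = 2x T_{k-1}(x) - T_{k-2}(x) applied to a scaled graph
+Laplacian; a small self-contained sketch (the normalization and the
+lambda_max ~ 2 approximation are common assumptions, not necessarily the
+paper's exact choices) follows:
+
+import torch
+
+def chebyshev_features(adj: torch.Tensor, x: torch.Tensor, k: int = 3) -> list:
+    """Return [T_0(L~)X, ..., T_k(L~)X]: one multiscale node representation per order."""
+    n = adj.size(0)
+    deg = adj.sum(dim=1)
+    d_inv_sqrt = torch.where(deg > 0, deg.pow(-0.5), torch.zeros_like(deg))
+    lap = torch.eye(n) - d_inv_sqrt[:, None] * adj * d_inv_sqrt[None, :]  # normalized Laplacian
+    lap_scaled = lap - torch.eye(n)             # assumes lambda_max is roughly 2
+    t_prev, t_curr = x, lap_scaled @ x
+    feats = [t_prev, t_curr]
+    for _ in range(2, k + 1):
+        t_next = 2 * (lap_scaled @ t_curr) - t_prev   # three-term recurrence
+        feats.append(t_next)
+        t_prev, t_curr = t_curr, t_next
+    return feats                                # fed to the MLP Mixer as a token sequence
+
+adj = (torch.rand(6, 6) > 0.5).float()
+adj = ((adj + adj.t()) > 0).float().fill_diagonal_(0)        # toy symmetric adjacency
+print(len(chebyshev_features(adj, torch.randn(6, 8), k=3)))  # 4 scales
+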
+
+
+
+
+ + ☆ 3D-EffiViTCaps: 3D Efficient Vision Transformer with Capsule for Medical + Image Segmentation ICPR2024 + + +
+ Medical image segmentation (MIS) aims to finely segment various organs. It
+requires grasping information from both local parts and the entire image for
+better segmentation, and clinically there are often certain requirements for
+segmentation efficiency. Convolutional neural networks (CNNs) have made
+considerable achievements in MIS. However, they struggle to fully capture
+global context information, and their pooling layers may cause information
+loss. Capsule networks, which combine the benefits of CNNs while taking into
+account additional information such as relative location that CNNs do not,
+have lately demonstrated some advantages in MIS. The Vision Transformer (ViT)
+applies Transformers to visual tasks. Based on the attention mechanism, the
+Transformer has excellent global modeling capabilities and is expected to
+capture long-range information. Moreover, there have been recent studies on
+making ViT more lightweight to minimize model complexity and increase
+efficiency. In this paper, we propose a U-shaped 3D encoder-decoder network
+named 3D-EffiViTCaps, which combines 3D capsule blocks with 3D EfficientViT
+blocks for MIS. Our encoder uses capsule blocks and EfficientViT blocks to
+jointly capture local and global semantic information more effectively and
+efficiently with less information loss, while the decoder employs CNN blocks
+and EfficientViT blocks to capture finer details for segmentation. We conduct
+experiments on various datasets, including iSeg-2017, Hippocampus and Cardiac,
+to verify the performance and efficiency of 3D-EffiViTCaps, which performs
+better than previous 3D CNN-based, 3D Capsule-based and 3D Transformer-based
+models. We further implement a series of ablation experiments on the main
+blocks. Our code is available at: https://github.com/HidNeuron/3D-EffiViTCaps.
+
+
+ comment: 15 pages, 4 figures, submitted to ICPR2024 +
+
+
+
+
+ + ☆ Impact of Video Compression Artifacts on Fisheye Camera Visual + Perception Tasks + + +
+ Autonomous driving systems require extensive data collection schemes to cover +the diverse scenarios needed for building a robust and safe system. The data +volumes are in the order of Exabytes and have to be stored for a long period of +time (i.e., more than 10 years of the vehicle's life cycle). Lossless +compression doesn't provide sufficient compression ratios, hence, lossy video +compression has been explored. It is essential to prove that lossy video +compression artifacts do not impact the performance of the perception +algorithms. However, there is limited work in this area to provide a solid +conclusion. In particular, there is no such work for fisheye cameras, which +have high radial distortion and where compression may have higher artifacts. +Fisheye cameras are commonly used in automotive systems for 3D object detection +task. In this work, we provide the first analysis of the impact of standard +video compression codecs on wide FOV fisheye camera images. We demonstrate that +the achievable compression with negligible impact depends on the dataset and +temporal prediction of the video codec. We propose a radial distortion-aware +zonal metric to evaluate the performance of artifacts in fisheye images. In +addition, we present a novel method for estimating affine mode parameters of +the latest VVC codec, and suggest some areas for improvement in video codecs +for the application to fisheye imagery. + +
+
+
+
+
+ + ☆ MEDDAP: Medical Dataset Enhancement via Diversified Augmentation + Pipeline MICCAI-2024 + + +
+ The effectiveness of Deep Neural Networks (DNNs) heavily relies on the +abundance and accuracy of available training data. However, collecting and +annotating data on a large scale is often both costly and time-intensive, +particularly in medical cases where practitioners are already occupied with +their duties. Moreover, ensuring that the model remains robust across various +scenarios of image capture is crucial in medical domains, especially when +dealing with ultrasound images that vary based on the settings of different +devices and the manual operation of the transducer. To address this challenge, +we introduce a novel pipeline called MEDDAP, which leverages Stable Diffusion +(SD) models to augment existing small datasets by automatically generating new +informative labeled samples. Pretrained checkpoints for SD are typically based +on natural images, and training them for medical images requires significant +GPU resources due to their heavy parameters. To overcome this challenge, we +introduce USLoRA (Ultrasound Low-Rank Adaptation), a novel fine-tuning method +tailored specifically for ultrasound applications. USLoRA allows for selective +fine-tuning of weights within SD, requiring fewer than 0.1\% of parameters +compared to fully fine-tuning only the UNet portion of SD. To enhance dataset +diversity, we incorporate different adjectives into the generation process +prompts, thereby desensitizing the classifiers to intensity changes across +different images. This approach is inspired by clinicians' decision-making +processes regarding breast tumors, where tumor shape often plays a more crucial +role than intensity. In conclusion, our pipeline not only outperforms +classifiers trained on the original dataset but also demonstrates superior +performance when encountering unseen datasets. The source code is available at +https://github.com/yasamin-med/MEDDAP. + +
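+ The parameter-efficiency claim above (fewer than 0.1% of the UNet's weights)
+comes from low-rank adaptation; the snippet below is a generic LoRA-style
+wrapper around a single frozen linear layer, shown only to illustrate the
+mechanism (the rank and the choice of which Stable Diffusion weights to wrap
+are assumptions, not the USLoRA recipe):
+
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    """Frozen base linear layer plus a trainable low-rank update."""
+
+    def __init__(self, base: nn.Linear, rank: int = 4, alpha: float = 4.0):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():
+            p.requires_grad_(False)            # pretrained weight stays frozen
+        self.down = nn.Linear(base.in_features, rank, bias=False)
+        self.up = nn.Linear(rank, base.out_features, bias=False)
+        nn.init.zeros_(self.up.weight)         # adapter starts as a zero update
+        self.scale = alpha / rank
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.base(x) + self.scale * self.up(self.down(x))
+
+lora = LoRALinear(nn.Linear(768, 768), rank=4)
+trainable = sum(p.numel() for p in lora.parameters() if p.requires_grad)
+total = sum(p.numel() for p in lora.parameters())
+print(f"trainable fraction: {trainable / total:.4f}")  # ~0.01 for this toy layer
+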
+
+ comment: Submitted to MICCAI-2024
+
+
+
+
+
+ + ☆ Decoding the visual attention of pathologists to reveal their level of + expertise + + +
+ We present a method for classifying the expertise of a pathologist based on
+how they allocated their attention during a cancer reading. We engage this
+decoding task by developing a novel method for predicting the attention of
+pathologists as they read whole-slide images (WSIs) of prostate tissue and
+make cancer grade classifications. Our ground truth measure of a pathologist's
+attention is the x, y and z (magnification) movement of their viewport as they
+navigated through WSIs during readings, and to date we have the attention
+behavior of 43 pathologists reading 123 WSIs. These data revealed that
+specialists have higher agreement in both their attention and cancer grades
+compared to general pathologists and residents, suggesting that sufficient
+information may exist in their attention behavior to classify their expertise
+level. To attempt this, we trained a transformer-based model to predict the
+visual attention heatmaps of resident, general, and specialist (GU)
+pathologists during Gleason grading. Based solely on a pathologist's attention
+during a reading, our model was able to predict their level of expertise with
+75.3%, 56.1%, and 77.2% accuracy, respectively, better than chance and
+baseline models. Our model therefore enables a pathologist's expertise level
+to be easily and objectively evaluated, which is important for pathology
+training and competency assessment. Tools developed from our model could also
+be used to help pathology trainees learn how to read WSIs like an expert.
+
+
+
+
+
+ + ☆ DreamPolisher: Towards High-Quality Text-to-3D Generation via Geometric + Diffusion + + +
+ We present DreamPolisher, a novel Gaussian Splatting based method with
+geometric guidance, tailored to learn cross-view consistency and intricate
+detail from textual descriptions. While recent progress on text-to-3D
+generation methods has been promising, prevailing methods often fail to ensure
+view-consistency and textural richness. This problem becomes particularly
+noticeable for methods that work with text input alone. To address this, we
+propose a two-stage Gaussian Splatting based approach that enforces geometric
+consistency among views. Initially, a coarse 3D generation undergoes refinement
+via geometric optimization. Subsequently, we use a ControlNet driven refiner
+coupled with the geometric consistency term to improve both texture fidelity
+and overall consistency of the generated 3D asset. Empirical evaluations across
+diverse textual prompts spanning various object categories demonstrate the
+efficacy of DreamPolisher in generating consistent and realistic 3D objects,
+aligning closely with the semantics of the textual instructions.
+
+
+ comment: Project webpage: https://yuanze-lin.me/DreamPolisher_page/ +
+
+
+
+
+ + ☆ Co-Occurring of Object Detection and Identification towards unlabeled + object discovery + + +
+ In this paper, we propose a novel deep learning based approach for
+identifying co-occurring objects in conjunction with base objects in
+multilabel object categories. With the advancement of computer vision
+techniques, knowledge of the objects that co-occur with a base object is
+needed for various purposes. The pipeline of the proposed work is composed of
+two stages: in the first stage of the proposed model we detect all the bounding
+boxes present in the image and their corresponding labels, and in the second
+stage we perform co-occurrence matrix analysis. In co-occurrence matrix
+analysis, we set base classes based on the maximum occurrences of the labels,
+build association rules and generate frequent patterns. These frequent
+patterns show base classes and their corresponding co-occurring classes. We
+performed our experiments on two publicly available datasets: Pascal VOC and
+MS-COCO. The experimental results on these public benchmark datasets are
+reported in Sec. 4. We further extend this work by considering all frequently
+co-occurring objects as unlabeled and by examining the case where they are
+occluded as well.
+
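+ The second-stage co-occurrence analysis can be pictured with a short,
+self-contained example; the label sets and thresholds below are toy values,
+and the actual association-rule mining in the paper may differ:
+
+from collections import Counter
+from itertools import combinations
+
+detections = [                      # per-image label sets from the detection stage
+    {"person", "dog", "ball"},
+    {"person", "dog"},
+    {"person", "car"},
+    {"dog", "ball"},
+]
+
+label_counts, pair_counts = Counter(), Counter()
+for labels in detections:
+    label_counts.update(sorted(labels))
+    pair_counts.update(frozenset(p) for p in combinations(sorted(labels), 2))
+
+base_class = label_counts.most_common(1)[0][0]   # base class = most frequent label
+min_support = 2
+co_occurring = {
+    next(iter(pair - {base_class})): count
+    for pair, count in pair_counts.items()
+    if base_class in pair and count >= min_support
+}
+print(base_class, co_occurring)   # e.g. dog {'ball': 2, 'person': 2}
+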
+
+ comment: 6 pages, 2 figures, +
+
+
+
+
+ + ☆ DiffusionAct: Controllable Diffusion Autoencoder for One-shot Face + Reenactment + + +
+ Video-driven neural face reenactment aims to synthesize realistic facial +images that successfully preserve the identity and appearance of a source face, +while transferring the target head pose and facial expressions. Existing +GAN-based methods suffer from either distortions and visual artifacts or poor +reconstruction quality, i.e., the background and several important appearance +details, such as hair style/color, glasses and accessories, are not faithfully +reconstructed. Recent advances in Diffusion Probabilistic Models (DPMs) enable +the generation of high-quality realistic images. To this end, in this paper we +present DiffusionAct, a novel method that leverages the photo-realistic image +generation of diffusion models to perform neural face reenactment. +Specifically, we propose to control the semantic space of a Diffusion +Autoencoder (DiffAE), in order to edit the facial pose of the input images, +defined as the head pose orientation and the facial expressions. Our method +allows one-shot, self, and cross-subject reenactment, without requiring +subject-specific fine-tuning. We compare against state-of-the-art GAN-, +StyleGAN2-, and diffusion-based methods, showing better or on-par reenactment +performance. + +
+
+ comment: Project page: https://stelabou.github.io/diffusionact/ +
+
+
+
+
+ + ☆ AnimateMe: 4D Facial Expressions via Diffusion Models + + +
+ The field of photorealistic 3D avatar reconstruction and generation has +garnered significant attention in recent years; however, animating such avatars +remains challenging. Recent advances in diffusion models have notably enhanced +the capabilities of generative models in 2D animation. In this work, we +directly utilize these models within the 3D domain to achieve controllable and +high-fidelity 4D facial animation. By integrating the strengths of diffusion +processes and geometric deep learning, we employ Graph Neural Networks (GNNs) +as denoising diffusion models in a novel approach, formulating the diffusion +process directly on the mesh space and enabling the generation of 3D facial +expressions. This facilitates the generation of facial deformations through a +mesh-diffusion-based model. Additionally, to ensure temporal coherence in our +animations, we propose a consistent noise sampling method. Under a series of +both quantitative and qualitative experiments, we showcase that the proposed +method outperforms prior work in 4D expression synthesis by generating +high-fidelity extreme expressions. Furthermore, we applied our method to +textured 4D facial expression generation, implementing a straightforward +extension that involves training on a large-scale textured 4D facial expression +database. + +
+
+
+
+
+ + ☆ Strategies to Improve Real-World Applicability of Laparoscopic Anatomy + Segmentation Models + + +
+ Accurate identification and localization of anatomical structures of varying +size and appearance in laparoscopic imaging are necessary to leverage the +potential of computer vision techniques for surgical decision support. +Segmentation performance of such models is traditionally reported using metrics +of overlap such as IoU. However, imbalanced and unrealistic representation of +classes in the training data and suboptimal selection of reported metrics have +the potential to skew nominal segmentation performance and thereby ultimately +limit clinical translation. In this work, we systematically analyze the impact +of class characteristics (i.e., organ size differences), training and test data +composition (i.e., representation of positive and negative examples), and +modeling parameters (i.e., foreground-to-background class weight) on eight +segmentation metrics: accuracy, precision, recall, IoU, F1 score, specificity, +Hausdorff Distance, and Average Symmetric Surface Distance. Based on our +findings, we propose two simple yet effective strategies to improve real-world +applicability of image segmentation models in laparoscopic surgical data: (1) +inclusion of negative examples in the training process and (2) adaptation of +foreground-background weights in segmentation models to maximize model +performance with respect to specific metrics of interest, depending on the +clinical use case. + +
+
+ comment: 13 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ LOTUS: Evasive and Resilient Backdoor Attacks through Sub-Partitioning CVPR + 2024 + + +
+ Backdoor attack poses a significant security threat to Deep Learning +applications. Existing attacks are often not evasive to established backdoor +detection techniques. This susceptibility primarily stems from the fact that +these attacks typically leverage a universal trigger pattern or transformation +function, such that the trigger can cause misclassification for any input. In +response to this, recent papers have introduced attacks using sample-specific +invisible triggers crafted through special transformation functions. While +these approaches manage to evade detection to some extent, they reveal +vulnerability to existing backdoor mitigation techniques. To address and +enhance both evasiveness and resilience, we introduce a novel backdoor attack +LOTUS. Specifically, it leverages a secret function to separate samples in the +victim class into a set of partitions and applies unique triggers to different +partitions. Furthermore, LOTUS incorporates an effective trigger focusing +mechanism, ensuring only the trigger corresponding to the partition can induce +the backdoor behavior. Extensive experimental results show that LOTUS can +achieve high attack success rate across 4 datasets and 7 model structures, and +effectively evading 13 backdoor detection and mitigation techniques. The code +is available at https://github.com/Megum1/LOTUS. + +
+
+ comment: IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR + 2024) +
+
+
+
+
+ + ☆ Brain Stroke Segmentation Using Deep Learning Models: A Comparative + Study + + +
+ Stroke segmentation plays a crucial role in the diagnosis and treatment of
+stroke patients by providing spatial information about affected brain regions
+and the extent of damage. Segmenting stroke lesions accurately is a challenging
+task, given that conventional manual techniques are time-consuming and prone to
+errors. Recently, advanced deep models have been introduced for general medical
+image segmentation, demonstrating promising results that surpass many
+state-of-the-art networks when evaluated on specific datasets. With the advent
+of vision Transformers, several models have been introduced based on them,
+while others have aimed to design better modules based on traditional
+convolutional layers to extract long-range dependencies like Transformers. The
+question of whether such high-level designs are necessary for all segmentation
+cases to achieve the best results remains unanswered. In this study, we
+selected four types of deep models that were recently proposed and evaluated
+their performance for stroke segmentation: a pure Transformer-based
+architecture (DAE-Former), two advanced CNN-based models (LKA and DLKA) with
+attention mechanisms in their design, an advanced hybrid model that
+incorporates CNNs with Transformers (FCT), and the well-known self-adaptive
+nnUNet framework with its configuration based on given data. We examined their
+performance on two publicly available datasets, and found that the nnUNet
+achieved the best results with the simplest design among all. The limited
+robustness of Transformers to such variabilities is a potential reason for
+their weaker performance. Furthermore, nnUNet's success underscores the
+significant impact of preprocessing and postprocessing techniques in enhancing
+segmentation results, surpassing the focus solely on architectural designs.
+
+
+
+
+
+ + ☆ Histogram Layers for Neural Engineered Features + + +
+ In the computer vision literature, many effective histogram-based features +have been developed. These engineered features include local binary patterns +and edge histogram descriptors among others and they have been shown to be +informative features for a variety of computer vision tasks. In this paper, we +explore whether these features can be learned through histogram layers embedded +in a neural network and, therefore, be leveraged within deep learning +frameworks. By using histogram features, local statistics of the feature maps +from the convolution neural networks can be used to better represent the data. +We present neural versions of local binary pattern and edge histogram +descriptors that jointly improve the feature representation and perform image +classification. Experiments are presented on benchmark and real-world datasets. + +
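+ A differentiable histogram layer of the kind described above can be written
+as soft binning with Gaussian kernels; the version below (learnable bin
+centers and widths, per-channel pooling) is one possible design sketch, not
+the paper's implementation:
+
+import torch
+import torch.nn as nn
+
+class SoftHistogram(nn.Module):
+    """Maps a (B, C, H, W) feature map to (B, C, bins) soft histogram statistics."""
+
+    def __init__(self, bins: int = 8, low: float = -1.0, high: float = 1.0):
+        super().__init__()
+        self.centers = nn.Parameter(torch.linspace(low, high, bins))          # learnable bin centers
+        self.widths = nn.Parameter(torch.full((bins,), (high - low) / bins))  # learnable bin widths
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        b, c, h, w = x.shape
+        diff = x.reshape(b, c, h * w, 1) - self.centers         # distance of each value to each bin
+        weights = torch.exp(-0.5 * (diff / self.widths) ** 2)   # soft assignment per bin
+        return weights.mean(dim=2)                              # average over spatial positions
+
+feat = torch.randn(2, 16, 14, 14)
+print(SoftHistogram(bins=8)(feat).shape)  # torch.Size([2, 16, 8])
+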
+
+ comment: 11 pages, 7 figures, submitted for review +
+
+
+
+
+ + ☆ Engagement Measurement Based on Facial Landmarks and Spatial-Temporal + Graph Convolutional Networks + + +
+ Engagement in virtual learning is crucial for a variety of factors including
+learner satisfaction, performance, and compliance with learning programs, but
+measuring it is a challenging task. There is therefore considerable interest in
+utilizing artificial intelligence and affective computing to measure engagement
+in natural settings as well as on a large scale. This paper introduces a novel,
+privacy-preserving method for engagement measurement from videos. It uses
+facial landmarks, which carry no personally identifiable information, extracted
+from videos via the MediaPipe deep learning solution. The extracted facial
+landmarks are fed to a Spatial-Temporal Graph Convolutional Network (ST-GCN) to
+output the engagement level of the learner in the video. To integrate the
+ordinal nature of the engagement variable into the training process, ST-GCNs
+undergo training in a novel ordinal learning framework based on transfer
+learning. Experimental results on two video student engagement measurement
+datasets show the superiority of the proposed method compared to previous
+methods, with an improved state of the art on the EngageNet dataset with a 3.1%
+improvement in four-class engagement level classification accuracy and on the
+Online Student Engagement dataset with a 1.5% improvement in binary engagement
+classification accuracy. The relatively lightweight ST-GCN and its integration
+with the real-time MediaPipe deep learning solution make the proposed approach
+capable of being deployed on virtual learning platforms and measuring
+engagement in real time.
+
+
+
+
+
+ + ☆ Task2Box: Box Embeddings for Modeling Asymmetric Task Relationships + + +
+ Modeling and visualizing relationships between tasks or datasets is an
+important step towards solving various meta-tasks such as dataset discovery,
+multi-tasking, and transfer learning. However, many relationships, such as
+containment and transferability, are naturally asymmetric, and current
+approaches for representation and visualization (e.g., t-SNE) do not readily
+support this. We propose Task2Box, an approach to represent tasks using box
+embeddings -- axis-aligned hyperrectangles in low dimensional spaces -- that
+can capture asymmetric relationships between them through volumetric overlaps.
+We show that Task2Box accurately predicts unseen hierarchical relationships
+between nodes in ImageNet and iNaturalist datasets, as well as transferability
+between tasks in the Taskonomy benchmark. We also show that box embeddings
+estimated from task representations (e.g., CLIP, Task2Vec, or attribute based)
+can be used to predict relationships between unseen tasks more accurately than
+classifiers trained on the same representations, as well as handcrafted
+asymmetric distances (e.g., KL divergence). This suggests that low-dimensional
+box embeddings can effectively capture these task relationships and have the
+added advantage of being interpretable. We use the approach to visualize
+relationships among publicly available image classification datasets on the
+popular dataset hosting platform Hugging Face.
+
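+ The asymmetry argument above is easy to see with box volumes: the overlap of
+two axis-aligned boxes normalized by the volume of one of them is close to 1
+when that box is contained in the other, and small in the reverse direction.
+A toy calculation (illustrative only, not Task2Box's trained scoring) follows:
+
+import torch
+
+def box_volume(lo: torch.Tensor, hi: torch.Tensor) -> torch.Tensor:
+    return torch.clamp(hi - lo, min=0).prod(dim=-1)
+
+def containment_score(lo_a, hi_a, lo_b, hi_b) -> torch.Tensor:
+    # vol(A intersect B) / vol(A): asymmetric by construction
+    inter_lo = torch.maximum(lo_a, lo_b)
+    inter_hi = torch.minimum(hi_a, hi_b)
+    return box_volume(inter_lo, inter_hi) / box_volume(lo_a, hi_a).clamp(min=1e-8)
+
+cats = (torch.tensor([0.2, 0.2]), torch.tensor([0.4, 0.4]))      # small box, e.g. "cats"
+animals = (torch.tensor([0.0, 0.0]), torch.tensor([1.0, 1.0]))   # large box, e.g. "animals"
+print(containment_score(*cats, *animals))   # ~1.0: "cats" sits inside "animals"
+print(containment_score(*animals, *cats))   # ~0.04: the reverse relation is weak
+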
+
+
+
+
+ + ☆ Benchmarking Video Frame Interpolation + + +
+ Video frame interpolation, the task of synthesizing new frames in between two +or more given ones, is becoming an increasingly popular research target. +However, the current evaluation of frame interpolation techniques is not ideal. +Due to the plethora of test datasets available and inconsistent computation of +error metrics, a coherent and fair comparison across papers is very +challenging. Furthermore, new test sets have been proposed as part of method +papers so they are unable to provide the in-depth evaluation of a dedicated +benchmarking paper. Another severe downside is that these test sets violate the +assumption of linearity when given two input frames, making it impossible to +solve without an oracle. We hence strongly believe that the community would +greatly benefit from a benchmarking paper, which is what we propose. +Specifically, we present a benchmark which establishes consistent error metrics +by utilizing a submission website that computes them, provides insights by +analyzing the interpolation quality with respect to various per-pixel +attributes such as the motion magnitude, contains a carefully designed test set +adhering to the assumption of linearity by utilizing synthetic data, and +evaluates the computational efficiency in a coherent manner. + +
+
+ comment: http://sniklaus.com/vfibench +
+
+
+
+
+ + ☆ Animal Avatars: Reconstructing Animatable 3D Animals from Casual Videos + + +
+ We present a method to build animatable dog avatars from monocular videos.
+This is challenging as animals display a range of (unpredictable) non-rigid
+movements and have a variety of appearance details (e.g., fur, spots, tails).
+We develop an approach that links the video frames via a 4D solution that
+jointly solves for the animal's pose variation and its appearance (in a
+canonical pose). To this end, we significantly improve the quality of
+template-based shape fitting by endowing the SMAL parametric model with
+Continuous Surface Embeddings, which brings image-to-mesh reprojection
+constraints that are denser, and thus stronger, than the previously used
+sparse semantic keypoint correspondences. To model appearance, we propose an
+implicit duplex-mesh texture that is defined in the canonical pose, but can be
+deformed using SMAL pose coefficients and later rendered to enforce a
+photometric compatibility with the input video frames. On the challenging
+CoP3D and APTv2 datasets, we demonstrate superior results (both in terms of
+pose estimates and predicted appearance) to existing template-free (RAC) and
+template-based approaches (BARC, BITE).
+
+
+
+
+
+ + ☆ SynFog: A Photo-realistic Synthetic Fog Dataset based on End-to-end + Imaging Simulation for Advancing Real-World Defogging in Autonomous Driving + + +
+ To advance research in learning-based defogging algorithms, various synthetic +fog datasets have been developed. However, existing datasets created using the +Atmospheric Scattering Model (ASM) or real-time rendering engines often +struggle to produce photo-realistic foggy images that accurately mimic the +actual imaging process. This limitation hinders the effective generalization of +models from synthetic to real data. In this paper, we introduce an end-to-end +simulation pipeline designed to generate photo-realistic foggy images. This +pipeline comprehensively considers the entire physically-based foggy scene +imaging process, closely aligning with real-world image capture methods. Based +on this pipeline, we present a new synthetic fog dataset named SynFog, which +features both sky light and active lighting conditions, as well as three levels +of fog density. Experimental results demonstrate that models trained on SynFog +exhibit superior performance in visual perception and detection accuracy +compared to others when applied to real-world foggy images. + +
+
+
+
+
+ + ☆ A Comparative Analysis of Visual Odometry in Virtual and Real-World + Railways Environments + + +
+ Perception tasks play a crucial role in the development of automated
+operations and systems across multiple application fields. In the railway
+transportation domain, these tasks can improve the safety, reliability, and
+efficiency of various operations, including train localization, signal
+recognition, and track discrimination. However, collecting sizable and
+precisely labeled datasets for testing such novel algorithms poses extreme
+challenges in the railway environment due to the severe restrictions in
+accessing the infrastructures and the practical difficulties associated with
+properly equipping trains with the required sensors, such as cameras and
+LiDARs. The remarkable innovations of graphic engine tools offer new solutions
+to craft realistic synthetic datasets. To illustrate the advantages of
+employing graphic simulation for early-stage testing of perception tasks in the
+railway domain, this paper presents a comparative analysis of the performance
+of a SLAM algorithm applied both in a virtual synthetic environment and a
+real-world scenario. The analysis leverages virtual railway environments
+created with the latest version of Unreal Engine, facilitating data collection
+and allowing the examination of challenging scenarios, including
+low-visibility, dangerous operational modes, and complex environments. The
+results highlight the feasibility and potential of graphic simulation to
+advance perception tasks in the railway domain.
+
+
+
+
+
+ + ☆ A Study in Dataset Pruning for Image Super-Resolution + + +
+ In image Super-Resolution (SR), relying on large datasets for training is a +double-edged sword. While offering rich training material, they also demand +substantial computational and storage resources. In this work, we analyze +dataset pruning as a solution to these challenges. We introduce a novel +approach that reduces a dataset to a core-set of training samples, selected +based on their loss values as determined by a simple pre-trained SR model. By +focusing the training on just 50% of the original dataset, specifically on the +samples characterized by the highest loss values, we achieve results comparable +to or even surpassing those obtained from training on the entire dataset. +Interestingly, our analysis reveals that the top 5% of samples with the highest +loss values negatively affect the training process. Excluding these samples and +adjusting the selection to favor easier samples further enhances training +outcomes. Our work opens new perspectives to the untapped potential of dataset +pruning in image SR. It suggests that careful selection of training data based +on loss-value metrics can lead to better SR models, challenging the +conventional wisdom that more data inevitably leads to better performance. + +
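+ One way to combine the two observations above (train on the 50% hardest
+samples, but drop the top 5% highest-loss ones) is a simple ranking rule; the
+exact selection procedure in the paper may differ, and the scoring model is
+left abstract here:
+
+import numpy as np
+
+def select_core_set(losses: np.ndarray, drop_frac: float = 0.05, keep_frac: float = 0.5) -> np.ndarray:
+    """Return indices of the retained training samples, given per-sample losses."""
+    order = np.argsort(losses)[::-1]       # hardest (highest loss) first
+    n = len(losses)
+    start = int(n * drop_frac)             # skip the very hardest samples
+    return order[start:start + int(n * keep_frac)]
+
+rng = np.random.default_rng(0)
+losses = rng.gamma(shape=2.0, scale=1.0, size=1000)    # toy per-sample losses from a small SR model
+core = select_core_set(losses)
+print(core.size, bool(losses[core].mean() > losses.mean()))  # 500 True: the core-set skews harder
+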
+
+
+
+
+ + ☆ Continuous, Subject-Specific Attribute Control in T2I Models by + Identifying Semantic Directions + + +
+ In recent years, advances in text-to-image (T2I) diffusion models have +substantially elevated the quality of their generated images. However, +achieving fine-grained control over attributes remains a challenge due to the +limitations of natural language prompts (such as no continuous set of +intermediate descriptions existing between ``person'' and ``old person''). Even +though many methods were introduced that augment the model or generation +process to enable such control, methods that do not require a fixed reference +image are limited to either enabling global fine-grained attribute expression +control or coarse attribute expression control localized to specific subjects, +not both simultaneously. We show that there exist directions in the commonly +used token-level CLIP text embeddings that enable fine-grained subject-specific +control of high-level attributes in text-to-image models. Based on this +observation, we introduce one efficient optimization-free and one robust +optimization-based method to identify these directions for specific attributes +from contrastive text prompts. We demonstrate that these directions can be used +to augment the prompt text input with fine-grained control over attributes of +specific subjects in a compositional manner (control over multiple attributes +of a single subject) without having to adapt the diffusion model. Project page: +https://compvis.github.io/attribute-control. Code is available at +https://github.com/CompVis/attribute-control. + +
+
+ comment: Project page: https://compvis.github.io/attribute-control +
+
+
+
+
+ + ☆ Calib3D: Calibrating Model Preferences for Reliable 3D Scene + Understanding + + +
+ Safety-critical 3D scene understanding tasks necessitate not only accurate +but also confident predictions from 3D perception models. This study introduces +Calib3D, a pioneering effort to benchmark and scrutinize the reliability of 3D +scene understanding models from an uncertainty estimation viewpoint. We +comprehensively evaluate 28 state-of-the-art models across 10 diverse 3D +datasets, uncovering insightful phenomena that cope with both the aleatoric and +epistemic uncertainties in 3D scene understanding. We discover that despite +achieving impressive levels of accuracy, existing models frequently fail to +provide reliable uncertainty estimates -- a pitfall that critically undermines +their applicability in safety-sensitive contexts. Through extensive analysis of +key factors such as network capacity, LiDAR representations, rasterization +resolutions, and 3D data augmentation techniques, we correlate these aspects +directly with the model calibration efficacy. Furthermore, we introduce DeptS, +a novel depth-aware scaling approach aimed at enhancing 3D model calibration. +Extensive experiments across a wide range of configurations validate the +superiority of our method. We hope this work could serve as a cornerstone for +fostering reliable 3D scene understanding. Code and benchmark toolkits are +publicly available. + +
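+ Calibration of this kind is commonly summarized by the expected calibration
+error (ECE), which bins predictions by confidence and averages the gap between
+confidence and accuracy; a minimal NumPy version, independent of the paper's
+exact protocol, is:
+
+import numpy as np
+
+def expected_calibration_error(conf: np.ndarray, correct: np.ndarray, n_bins: int = 15) -> float:
+    edges = np.linspace(0.0, 1.0, n_bins + 1)
+    ece = 0.0
+    for lo, hi in zip(edges[:-1], edges[1:]):
+        mask = (conf > lo) & (conf <= hi)
+        if mask.any():
+            gap = abs(conf[mask].mean() - correct[mask].mean())
+            ece += mask.mean() * gap          # weight each bin by its sample fraction
+    return float(ece)
+
+rng = np.random.default_rng(0)
+conf = rng.uniform(0.5, 1.0, size=10_000)                        # over-confident toy model
+correct = (rng.uniform(size=10_000) < conf - 0.2).astype(float)  # accuracy lags confidence by 0.2
+print(round(expected_calibration_error(conf, correct), 3))       # roughly 0.2
+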
+
+ comment: Preprint; 37 pages, 8 figures, 11 tables; Code at + https://github.com/ldkong1205/Calib3D +
+
+
+
+
+ + ☆ Optimizing LiDAR Placements for Robust Driving Perception in Adverse + Conditions + + +
+ The robustness of driving perception systems under unprecedented conditions +is crucial for safety-critical usages. Latest advancements have prompted +increasing interests towards multi-LiDAR perception. However, prevailing +driving datasets predominantly utilize single-LiDAR systems and collect data +devoid of adverse conditions, failing to capture the complexities of real-world +environments accurately. Addressing these gaps, we proposed Place3D, a +full-cycle pipeline that encompasses LiDAR placement optimization, data +generation, and downstream evaluations. Our framework makes three appealing +contributions. 1) To identify the most effective configurations for multi-LiDAR +systems, we introduce a Surrogate Metric of the Semantic Occupancy Grids +(M-SOG) to evaluate LiDAR placement quality. 2) Leveraging the M-SOG metric, we +propose a novel optimization strategy to refine multi-LiDAR placements. 3) +Centered around the theme of multi-condition multi-LiDAR perception, we collect +a 364,000-frame dataset from both clean and adverse conditions. Extensive +experiments demonstrate that LiDAR placements optimized using our approach +outperform various baselines. We showcase exceptional robustness in both 3D +object detection and LiDAR semantic segmentation tasks, under diverse adverse +weather and sensor failure conditions. Code and benchmark toolkit are publicly +available. + +
+
+ comment: Preprint; 40 pages, 11 figures, 15 tables; Code at + https://github.com/ywyeli/Place3D +
+
+
+
+
+ + ☆ FlashFace: Human Image Personalization with High-fidelity Identity + Preservation + + +
+ This work presents FlashFace, a practical tool with which users can easily +personalize their own photos on the fly by providing one or a few reference +face images and a text prompt. Our approach is distinguishable from existing +human photo customization methods by higher-fidelity identity preservation and +better instruction following, benefiting from two subtle designs. First, we +encode the face identity into a series of feature maps instead of one image +token as in prior arts, allowing the model to retain more details of the +reference faces (e.g., scars, tattoos, and face shape ). Second, we introduce a +disentangled integration strategy to balance the text and image guidance during +the text-to-image generation process, alleviating the conflict between the +reference faces and the text prompts (e.g., personalizing an adult into a +"child" or an "elder"). Extensive experimental results demonstrate the +effectiveness of our method on various applications, including human image +personalization, face swapping under language prompts, making virtual +characters into real people, etc. Project Page: +https://jshilong.github.io/flashface-page. + +
+
+ comment: Project Page:https://jshilong.github.io/flashface-page +
+
+
+
+
+ + ☆ DreamLIP: Language-Image Pre-training with Long Captions + + +
+ Language-image pre-training largely relies on how precisely and thoroughly a
+text describes its paired image. In practice, however, the contents of an image
+can be so rich that well describing them requires lengthy captions (e.g., with
+10 sentences), which are usually missing in existing datasets. Consequently,
+there is currently no clear evidence on whether and how language-image
+pre-training could benefit from long captions. To figure this out, we first
+re-caption 30M images with detailed descriptions using a pre-trained
+Multi-modality Large Language Model (MLLM), and then study the usage of the
+resulting captions under a contrastive learning framework. We observe that
+each sentence within a long caption is very likely to describe the image
+partially (e.g., an object). Motivated by this, we propose to dynamically
+sample sub-captions from the text label to construct multiple positive pairs,
+and introduce a grouping loss to match the embeddings of each sub-caption with
+its corresponding local image patches in a self-supervised manner. Experimental
+results on a wide range of downstream tasks demonstrate the consistent
+superiority of our method, termed DreamLIP, over previous alternatives,
+highlighting its fine-grained representational capacity. It is noteworthy that,
+on the tasks of image-text retrieval and semantic segmentation, our model
+trained with 30M image-text pairs achieves on par or even better performance
+than CLIP trained with 400M pairs. Project page is available at
+https://zyf0619sjtu.github.io/dream-lip.
+
+
+
+
+
+ + ☆ Invertible Diffusion Models for Compressed Sensing + + +
+ While deep neural networks (NN) significantly advance image compressed +sensing (CS) by improving reconstruction quality, the necessity of training +current CS NNs from scratch constrains their effectiveness and hampers rapid +deployment. Although recent methods utilize pre-trained diffusion models for +image reconstruction, they struggle with slow inference and restricted +adaptability to CS. To tackle these challenges, this paper proposes Invertible +Diffusion Models (IDM), a novel efficient, end-to-end diffusion-based CS +method. IDM repurposes a large-scale diffusion sampling process as a +reconstruction model, and finetunes it end-to-end to recover original images +directly from CS measurements, moving beyond the traditional paradigm of +one-step noise estimation learning. To enable such memory-intensive end-to-end +finetuning, we propose a novel two-level invertible design to transform both +(1) the multi-step sampling process and (2) the noise estimation U-Net in each +step into invertible networks. As a result, most intermediate features are +cleared during training to reduce up to 93.8% GPU memory. In addition, we +develop a set of lightweight modules to inject measurements into noise +estimator to further facilitate reconstruction. Experiments demonstrate that +IDM outperforms existing state-of-the-art CS networks by up to 2.64dB in PSNR. +Compared to the recent diffusion model-based approach DDNM, our IDM achieves up +to 10.09dB PSNR gain and 14.54 times faster inference. + +
+
+
+
+
+ + ☆ TRIP: Temporal Residual Learning with Image Noise Prior for + Image-to-Video Diffusion Models CVPR 2024 + + +
+ Recent advances in text-to-video generation have demonstrated the utility of +powerful diffusion models. Nevertheless, the problem is not trivial when +shaping diffusion models to animate static image (i.e., image-to-video +generation). The difficulty originates from the aspect that the diffusion +process of subsequent animated frames should not only preserve the faithful +alignment with the given image but also pursue temporal coherence among +adjacent frames. To alleviate this, we present TRIP, a new recipe of +image-to-video diffusion paradigm that pivots on image noise prior derived from +static image to jointly trigger inter-frame relational reasoning and ease the +coherent temporal modeling via temporal residual learning. Technically, the +image noise prior is first attained through one-step backward diffusion process +based on both static image and noised video latent codes. Next, TRIP executes a +residual-like dual-path scheme for noise prediction: 1) a shortcut path that +directly takes image noise prior as the reference noise of each frame to +amplify the alignment between the first frame and subsequent frames; 2) a +residual path that employs 3D-UNet over noised video and static image latent +codes to enable inter-frame relational reasoning, thereby easing the learning +of the residual noise for each frame. Furthermore, both reference and residual +noise of each frame are dynamically merged via attention mechanism for final +video generation. Extensive experiments on WebVid-10M, DTDB and MSR-VTT +datasets demonstrate the effectiveness of our TRIP for image-to-video +generation. Please see our project page at https://trip-i2v.github.io/TRIP/. + +
+
+ comment: CVPR 2024; Project page: https://trip-i2v.github.io/TRIP/ +
+
+
+
+
+ + ☆ SD-DiT: Unleashing the Power of Self-supervised Discrimination in + Diffusion Transformer CVPR 2024 + + +
+ Diffusion Transformer (DiT) has emerged as the new trend of generative +diffusion models on image generation. In view of extremely slow convergence in +typical DiT, recent breakthroughs have been driven by mask strategy that +significantly improves the training efficiency of DiT with additional +intra-image contextual learning. Despite this progress, mask strategy still +suffers from two inherent limitations: (a) training-inference discrepancy and +(b) fuzzy relations between mask reconstruction & generative diffusion process, +resulting in sub-optimal training of DiT. In this work, we address these +limitations by novelly unleashing the self-supervised discrimination knowledge +to boost DiT training. Technically, we frame our DiT in a teacher-student +manner. The teacher-student discriminative pairs are built on the diffusion +noises along the same Probability Flow Ordinary Differential Equation (PF-ODE). +Instead of applying mask reconstruction loss over both DiT encoder and decoder, +we decouple DiT encoder and decoder to separately tackle discriminative and +generative objectives. In particular, by encoding discriminative pairs with +student and teacher DiT encoders, a new discriminative loss is designed to +encourage the inter-image alignment in the self-supervised embedding space. +After that, student samples are fed into student DiT decoder to perform the +typical generative diffusion task. Extensive experiments are conducted on +ImageNet dataset, and our method achieves a competitive balance between +training cost and generative capacity. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ VP3D: Unleashing 2D Visual Prompt for Text-to-3D Generation CVPR 2024 + + +
+ Recent innovations on text-to-3D generation have featured Score Distillation +Sampling (SDS), which enables the zero-shot learning of implicit 3D models +(NeRF) by directly distilling prior knowledge from 2D diffusion models. +However, current SDS-based models still struggle with intricate text prompts +and commonly result in distorted 3D models with unrealistic textures or +cross-view inconsistency issues. In this work, we introduce a novel Visual +Prompt-guided text-to-3D diffusion model (VP3D) that explicitly unleashes the +visual appearance knowledge in 2D visual prompt to boost text-to-3D generation. +Instead of solely supervising SDS with text prompt, VP3D first capitalizes on +2D diffusion model to generate a high-quality image from input text, which +subsequently acts as visual prompt to strengthen SDS optimization with explicit +visual appearance. Meanwhile, we couple the SDS optimization with additional +differentiable reward function that encourages rendering images of 3D models to +better visually align with 2D visual prompt and semantically match with text +prompt. Through extensive experiments, we show that the 2D Visual Prompt in our +VP3D significantly eases the learning of visual appearance of 3D models and +thus leads to higher visual fidelity with more detailed textures. It is also +appealing in view that when replacing the self-generating visual prompt with a +given reference image, VP3D is able to trigger a new task of stylized +text-to-3D generation. Our project page is available at +https://vp3d-cvpr24.github.io. + +
+
+ comment: CVPR 2024; Project page: https://vp3d-cvpr24.github.io +
+
+
+
+
+ + ☆ Learning Spatial Adaptation and Temporal Coherence in Diffusion Models + for Video Super-Resolution CVPR 2024 + + +
+ Diffusion models are just at a tipping point for image super-resolution task. +Nevertheless, it is not trivial to capitalize on diffusion models for video +super-resolution which necessitates not only the preservation of visual +appearance from low-resolution to high-resolution videos, but also the temporal +consistency across video frames. In this paper, we propose a novel approach, +pursuing Spatial Adaptation and Temporal Coherence (SATeCo), for video +super-resolution. SATeCo pivots on learning spatial-temporal guidance from +low-resolution videos to calibrate both latent-space high-resolution video +denoising and pixel-space video reconstruction. Technically, SATeCo freezes all +the parameters of the pre-trained UNet and VAE, and only optimizes two +deliberately-designed spatial feature adaptation (SFA) and temporal feature +alignment (TFA) modules, in the decoder of UNet and VAE. SFA modulates frame +features via adaptively estimating affine parameters for each pixel, +guaranteeing pixel-wise guidance for high-resolution frame synthesis. TFA +delves into feature interaction within a 3D local window (tubelet) through +self-attention, and executes cross-attention between tubelet and its +low-resolution counterpart to guide temporal feature alignment. Extensive +experiments conducted on the REDS4 and Vid4 datasets demonstrate the +effectiveness of our approach. + +
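+ The spatial feature adaptation step above amounts to predicting per-pixel
+affine parameters from the low-resolution guidance and modulating frozen
+decoder features with them; the stripped-down module below is an illustrative
+reading of that idea, with layer sizes and structure chosen as assumptions:
+
+import torch
+import torch.nn as nn
+
+class SpatialFeatureAdaptation(nn.Module):
+    """Predict per-pixel (gamma, beta) from guidance and apply an affine modulation."""
+
+    def __init__(self, guide_ch: int = 3, feat_ch: int = 64):
+        super().__init__()
+        self.to_affine = nn.Sequential(
+            nn.Conv2d(guide_ch, feat_ch, 3, padding=1),
+            nn.SiLU(),
+            nn.Conv2d(feat_ch, 2 * feat_ch, 3, padding=1),   # outputs gamma and beta
+        )
+
+    def forward(self, feat: torch.Tensor, guide: torch.Tensor) -> torch.Tensor:
+        # feat: (B, C, H, W) frozen decoder features; guide: (B, 3, H, W) upsampled LR frame
+        gamma, beta = self.to_affine(guide).chunk(2, dim=1)
+        return feat * (1 + gamma) + beta                     # pixel-wise affine modulation
+
+feat = torch.randn(1, 64, 32, 32)
+guide = torch.randn(1, 3, 32, 32)
+print(SpatialFeatureAdaptation()(feat, guide).shape)  # torch.Size([1, 64, 32, 32])
+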
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Visual CoT: Unleashing Chain-of-Thought Reasoning in Multi-Modal + Language Models + + +
+ This paper presents Visual CoT, a novel pipeline that leverages the reasoning +capabilities of multi-modal large language models (MLLMs) by incorporating +visual Chain-of-Thought (CoT) reasoning. While MLLMs have shown promise in +various visual tasks, they often lack interpretability and struggle with +complex visual inputs. To address these challenges, we propose a multi-turn +processing pipeline that dynamically focuses on visual inputs and provides +interpretable thoughts. We collect and introduce the Visual CoT dataset +comprising 373k question-answer pairs, annotated with intermediate bounding +boxes highlighting key regions essential for answering the questions. +Importantly, the introduced benchmark is capable of evaluating MLLMs in +scenarios requiring specific local region identification. Extensive experiments +demonstrate the effectiveness of our framework and shed light on better +inference strategies. The Visual CoT dataset, benchmark, and pre-trained models +are available to foster further research in this direction. + +
+
+ comment: Code: https://github.com/deepcs233/Visual-CoT +
+
+
+
+
+ + ☆ Understanding Long Videos in One Multimodal Language Model Pass + + +
+ Large Language Models (LLMs), known to contain a strong awareness of world +knowledge, have allowed recent approaches to achieve excellent performance on +Long-Video Understanding benchmarks, but at high inference costs. In this work, +we first propose Likelihood Selection, a simple technique that unlocks faster +inference in autoregressive LLMs for multiple-choice tasks common in long-video +benchmarks. In addition to faster inference, we discover the resulting models +to yield surprisingly good accuracy on long-video tasks, even with no video +specific information. Building on this, we inject video-specific object-centric +information extracted from off-the-shelf pre-trained models and utilize natural +language as a medium for information fusion. Our resulting Multimodal Video +Understanding (MVU) framework demonstrates state-of-the-art performance across +long-video and fine-grained action recognition benchmarks. Code available at: +https://github.com/kahnchana/mvu + +
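+ Likelihood Selection, as described, can be reduced to scoring each candidate
+answer by the log-likelihood the model assigns to its tokens and taking the
+argmax; the sketch below uses random logits as stand-ins for a real LLM
+forward pass and ignores length normalization, which a practical version
+might need:
+
+import torch
+import torch.nn.functional as F
+
+def score_option(logits: torch.Tensor, option_ids: torch.Tensor) -> float:
+    # logits: (T, V) next-token logits aligned with the option's token ids (T,)
+    log_probs = F.log_softmax(logits, dim=-1)
+    return log_probs.gather(1, option_ids[:, None]).sum().item()
+
+def likelihood_selection(option_logits, option_ids) -> int:
+    scores = [score_option(lg, ids) for lg, ids in zip(option_logits, option_ids)]
+    return int(torch.tensor(scores).argmax())
+
+vocab = 100
+options = [torch.randint(0, vocab, (5,)), torch.randint(0, vocab, (3,))]
+logits = [torch.randn(len(ids), vocab) for ids in options]   # stand-in for LLM outputs
+print(likelihood_selection(logits, options))   # index of the highest-likelihood option
+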
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ Composed Video Retrieval via Enriched Context and Discriminative + Embeddings CVPR-2024 + + +
+ Composed video retrieval (CoVR) is a challenging problem in computer vision +which has recently highlighted the integration of modification text with visual +queries for more sophisticated video search in large databases. Existing works +predominantly rely on visual queries combined with modification text to +distinguish relevant videos. However, such a strategy struggles to fully +preserve the rich query-specific context in retrieved target videos and only +represents the target video using visual embedding. We introduce a novel CoVR +framework that leverages detailed language descriptions to explicitly encode +query-specific contextual information and learns discriminative embeddings of +vision only, text only and vision-text for better alignment to accurately +retrieve matched target videos. Our proposed framework can be flexibly employed +for both composed video (CoVR) and image (CoIR) retrieval tasks. Experiments on +three datasets show that our approach obtains state-of-the-art performance for +both CovR and zero-shot CoIR tasks, achieving gains as high as around 7% in +terms of recall@K=1 score. Our code, models, detailed language descriptions for +WebViD-CoVR dataset are available at +\url{https://github.com/OmkarThawakar/composed-video-retrieval} + +
+
+ comment: CVPR-2024 +
+
+
+
+
+ + ☆ DriveCoT: Integrating Chain-of-Thought Reasoning with End-to-End Driving + + +
+ End-to-end driving has made significant progress in recent years, +demonstrating benefits such as system simplicity and competitive driving +performance under both open-loop and closed-loop settings. Nevertheless, the +lack of interpretability and controllability in its driving decisions hinders +real-world deployment for end-to-end driving systems. In this paper, we collect +a comprehensive end-to-end driving dataset named DriveCoT, leveraging the CARLA +simulator. It contains sensor data, control decisions, and chain-of-thought +labels to indicate the reasoning process. We utilize the challenging driving +scenarios from the CARLA leaderboard 2.0, which involve high-speed driving and +lane-changing, and propose a rule-based expert policy to control the vehicle +and generate ground truth labels for its reasoning process across different +driving aspects and the final decisions. This dataset can serve as an open-loop +end-to-end driving benchmark, enabling the evaluation of accuracy in various +chain-of-thought aspects and the final decision. In addition, we propose a +baseline model called DriveCoT-Agent, trained on our dataset, to generate +chain-of-thought predictions and final decisions. The trained model exhibits +strong performance in both open-loop and closed-loop evaluations, demonstrating +the effectiveness of our proposed dataset. + +
+
+
+
+
+ + ☆ Mapping Image Transformations Onto Pixel Processor Arrays + + +
+ Pixel Processor Arrays (PPA) present a new vision sensor/processor +architecture consisting of a SIMD array of processor elements, each capable of +light capture, storage, processing and local communication. Such a device +allows visual data to be efficiently stored and manipulated directly upon the +focal plane, but also demands the invention of new approaches and algorithms, +suitable for the massively-parallel fine-grain processor arrays. In this paper +we demonstrate how various image transformations, including shearing, rotation +and scaling, can be performed directly upon a PPA. The implementation details +are presented using the SCAMP-5 vision chip, that contains a 256x256 +pixel-parallel array. Our approaches for performing the image transformations +efficiently exploit the parallel computation in a cellular processor array, +minimizing the number of SIMD instructions required. These fundamental image +transformations are vital building blocks for many visual tasks. This paper +aims to serve as a reference for future PPA research while demonstrating the +flexibility of PPA architectures. + +
+
+
+
+
+ + ☆ Comp4D: LLM-Guided Compositional 4D Scene Generation + + +
+ Recent advancements in diffusion models for 2D and 3D content creation have +sparked a surge of interest in generating 4D content. However, the scarcity of +3D scene datasets constrains current methodologies to primarily object-centric +generation. To overcome this limitation, we present Comp4D, a novel framework +for Compositional 4D Generation. Unlike conventional methods that generate a +singular 4D representation of the entire scene, Comp4D innovatively constructs +each 4D object within the scene separately. Utilizing Large Language Models +(LLMs), the framework begins by decomposing an input text prompt into distinct +entities and maps out their trajectories. It then constructs the compositional +4D scene by accurately positioning these objects along their designated paths. +To refine the scene, our method employs a compositional score distillation +technique guided by the pre-defined trajectories, utilizing pre-trained +diffusion models across text-to-image, text-to-video, and text-to-3D domains. +Extensive experiments demonstrate our outstanding 4D content creation +capability compared to prior arts, showcasing superior visual quality, motion +fidelity, and enhanced object interactions. + +
+
+ comment: Project page: https://vita-group.github.io/Comp4D/ +
+
+
+
+
+ + ☆ Be Yourself: Bounded Attention for Multi-Subject Text-to-Image + Generation + + +
+ Text-to-image diffusion models have an unprecedented ability to generate +diverse and high-quality images. However, they often struggle to faithfully +capture the intended semantics of complex input prompts that include multiple +subjects. Recently, numerous layout-to-image extensions have been introduced to +improve user control, aiming to localize subjects represented by specific +tokens. Yet, these methods often produce semantically inaccurate images, +especially when dealing with multiple semantically or visually similar +subjects. In this work, we study and analyze the causes of these limitations. +Our exploration reveals that the primary issue stems from inadvertent semantic +leakage between subjects in the denoising process. This leakage is attributed +to the diffusion model's attention layers, which tend to blend the visual +features of different subjects. To address these issues, we introduce Bounded +Attention, a training-free method for bounding the information flow in the +sampling process. Bounded Attention prevents detrimental leakage among subjects +and enables guiding the generation to promote each subject's individuality, +even with complex multi-subject conditioning. Through extensive +experimentation, we demonstrate that our method empowers the generation of +multiple subjects that better align with given prompts and layouts. + +
+
+ comment: Project page: https://omer11a.github.io/bounded-attention/ +
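+ The core masking idea above (latents inside a subject's layout box may only attend to that subject's prompt tokens) can be sketched with a small PyTorch helper. The box/token bookkeeping below is an illustrative assumption rather than the authors' implementation; in use, the mask would be applied by setting disallowed attention logits to -inf before the softmax.
+
+ import torch
+
+ def bounded_cross_attention_mask(h, w, subject_boxes, subject_token_ids, n_text_tokens):
+     # Boolean mask of shape (h*w, n_text_tokens): True where a latent pixel may attend
+     # to a text token. Pixels inside a subject's box are restricted to that subject's
+     # tokens; all remaining pixels keep full attention.
+     mask = torch.ones(h * w, n_text_tokens, dtype=torch.bool)
+     for (x0, y0, x1, y1), tok_ids in zip(subject_boxes, subject_token_ids):
+         box = torch.zeros(h, w, dtype=torch.bool)
+         box[y0:y1, x0:x1] = True
+         allowed = torch.zeros(n_text_tokens, dtype=torch.bool)
+         allowed[torch.tensor(tok_ids)] = True
+         mask[box.flatten()] = allowed  # broadcast the allowed-token row over this box
+     return mask
+
+ # Example: a 64x64 latent with two subjects, each owning a box and two prompt tokens.
+ m = bounded_cross_attention_mask(64, 64, [(0, 0, 32, 64), (32, 0, 64, 64)],
+                                  [[2, 3], [5, 6]], n_text_tokens=16)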
+
+
+
+
+ + ☆ Self-STORM: Deep Unrolled Self-Supervised Learning for Super-Resolution + Microscopy + + +
+ The use of fluorescent molecules to create long sequences of low-density, +diffraction-limited images enables highly-precise molecule localization. +However, this methodology requires lengthy imaging times, which limits the +ability to view dynamic interactions of live cells on short time scales. Many +techniques have been developed to reduce the number of frames needed for +localization, from classic iterative optimization to deep neural networks. +Particularly, deep algorithm unrolling utilizes both the structure of iterative +sparse recovery algorithms and the performance gains of supervised deep +learning. However, the robustness of this approach is highly dependent on +having sufficient training data. In this paper we introduce deep unrolled +self-supervised learning, which alleviates the need for such data by training a +sequence-specific, model-based autoencoder that learns only from given +measurements. Our proposed method exceeds the performance of its supervised +counterparts, thus allowing for robust, dynamic imaging well below the +diffraction limit without any labeled training samples. Furthermore, the +suggested model-based autoencoder scheme can be utilized to enhance +generalization in any sparse recovery framework, without the need for external +training data. +
+
+
+
+
+ + ☆ Joint chest X-ray diagnosis and clinical visual attention prediction + with multi-stage cooperative learning: enhancing interpretability + + +
+ As deep learning has become the state-of-the-art for computer-assisted +diagnosis, interpretability of the automatic decisions is crucial for clinical +deployment. While various methods were proposed in this domain, visual +attention maps of clinicians during radiological screening offer a unique asset +to provide important insights and can potentially enhance the quality of +computer-assisted diagnosis. With this paper, we introduce a novel +deep-learning framework for joint disease diagnosis and prediction of +corresponding visual saliency maps for chest X-ray scans. Specifically, we +designed a novel dual-encoder multi-task UNet, which leverages both a +DenseNet201 backbone and a Residual and Squeeze-and-Excitation block-based +encoder to extract diverse features for saliency map prediction, and a +multi-scale feature-fusion classifier to perform disease classification. To +tackle the issue of asynchronous training schedules of individual tasks in +multi-task learning, we proposed a multi-stage cooperative learning strategy, +with contrastive learning for feature encoder pretraining to boost performance. +Experiments show that our proposed method outperformed existing techniques for +chest X-ray diagnosis and the quality of visual saliency map prediction. + +
+
+
+
+
+ + ☆ Visual Whole-Body Control for Legged Loco-Manipulation + + +
+ We study the problem of mobile manipulation using legged robots equipped with +an arm, namely legged loco-manipulation. The robot legs, while usually utilized +for mobility, offer an opportunity to amplify the manipulation capabilities by +conducting whole-body control. That is, the robot can control the legs and the +arm at the same time to extend its workspace. We propose a framework that can +conduct the whole-body control autonomously with visual observations. Our +approach, namely \ourFull~(\our), is composed of a low-level policy using all +degrees of freedom to track the end-effector manipulator position and a +high-level policy proposing the end-effector position based on visual inputs. +We train both levels of policies in simulation and perform Sim2Real transfer +for real robot deployment. We perform extensive experiments and show +significant improvements over baselines in picking up diverse objects in +different configurations (heights, locations, orientations) and environments. +Project page: https://wholebody-b1.github.io + +
+
+ comment: The first two authors contribute equally. Project page: + https://wholebody-b1.github.io +
+
+
+
+
+ + ☆ GSDF: 3DGS Meets SDF for Improved Rendering and Reconstruction + + +
+ Presenting a 3D scene from multiview images remains a core and long-standing +challenge in computer vision and computer graphics. Two main requirements lie +in rendering and reconstruction. Notably, SOTA rendering quality is usually +achieved with neural volumetric rendering techniques, which rely on aggregated +point/primitive-wise color and neglect the underlying scene geometry. Learning +of neural implicit surfaces was sparked by the success of neural rendering. +Current works either constrain the distribution of density fields or the shape +of primitives, resulting in degraded rendering quality and flaws on the learned +scene surfaces. The efficacy of such methods is limited by the inherent +constraints of the chosen neural representation, which struggles to capture +fine surface details, especially for larger, more intricate scenes. To address +these issues, we introduce GSDF, a novel dual-branch architecture that combines +the benefits of a flexible and efficient 3D Gaussian Splatting (3DGS) +representation with neural Signed Distance Fields (SDF). The core idea is to +leverage and enhance the strengths of each branch while alleviating their +limitations through mutual guidance and joint supervision. We show on diverse +scenes that our design unlocks the potential for more accurate and detailed +surface reconstructions, and at the same time benefits 3DGS rendering with +structures that are more aligned with the underlying geometry. +
+
+ comment: Project page: https://city-super.github.io/GSDF +
+
+
+
+
+ + ☆ TwinLiteNetPlus: A Stronger Model for Real-time Drivable Area and Lane + Segmentation + + +
+ Semantic segmentation is crucial for autonomous driving, particularly for +Drivable Area and Lane Segmentation, ensuring safety and navigation. To address +the high computational costs of current state-of-the-art (SOTA) models, this +paper introduces TwinLiteNetPlus (TwinLiteNet$^+$), a model adept at balancing +efficiency and accuracy. TwinLiteNet$^+$ incorporates standard and depth-wise +separable dilated convolutions, reducing complexity while maintaining high +accuracy. It is available in four configurations, from the robust 1.94 +million-parameter TwinLiteNet$^+_{\text{Large}}$ to the ultra-compact +34K-parameter TwinLiteNet$^+_{\text{Nano}}$. Notably, +TwinLiteNet$^+_{\text{Large}}$ attains a 92.9\% mIoU for Drivable Area +Segmentation and a 34.2\% IoU for Lane Segmentation. These results notably +outperform those of current SOTA models while requiring a computational cost +that is approximately 11 times lower in terms of Floating Point Operations +(FLOPs) compared to the existing SOTA model. Extensively tested on various +embedded devices, TwinLiteNet$^+$ demonstrates promising latency and power +efficiency, underscoring its suitability for real-world autonomous vehicle +applications. + +
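+ The depth-wise separable dilated convolution mentioned above is a standard low-cost building block; a minimal PyTorch version is sketched below (channel counts, dilation and the BN/ReLU placement are illustrative assumptions, not the paper's exact configuration).
+
+ import torch.nn as nn
+
+ class DWSeparableDilatedConv(nn.Module):
+     # Depthwise convolution (one filter per channel, dilated) followed by a 1x1
+     # pointwise convolution, which is far cheaper than a full dense convolution.
+     def __init__(self, in_ch, out_ch, kernel_size=3, dilation=2):
+         super().__init__()
+         pad = dilation * (kernel_size - 1) // 2        # keep spatial size unchanged
+         self.depthwise = nn.Conv2d(in_ch, in_ch, kernel_size, padding=pad,
+                                    dilation=dilation, groups=in_ch, bias=False)
+         self.pointwise = nn.Conv2d(in_ch, out_ch, kernel_size=1, bias=False)
+         self.bn = nn.BatchNorm2d(out_ch)
+         self.act = nn.ReLU(inplace=True)
+
+     def forward(self, x):
+         return self.act(self.bn(self.pointwise(self.depthwise(x))))
+
+ # Example: enlarge the receptive field at constant resolution and low FLOP cost.
+ block = DWSeparableDilatedConv(in_ch=32, out_ch=64, dilation=2)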
+
+
+
+
+ + ☆ Isolated Diffusion: Optimizing Multi-Concept Text-to-Image Generation + Training-Freely with Isolated Diffusion Guidance + + +
+ Large-scale text-to-image diffusion models have achieved great success in +synthesizing high-quality and diverse images given target text prompts. Despite +the revolutionary image generation ability, current state-of-the-art models +still struggle to deal with multi-concept generation accurately in many cases. +This phenomenon is known as ``concept bleeding" and manifests as the unexpected +overlapping or merging of various concepts. This paper presents a general +approach for text-to-image diffusion models to address the mutual interference +between different subjects and their attachments in complex scenes, pursuing +better text-image consistency. The core idea is to isolate the synthesizing +processes of different concepts. We propose to bind each attachment to +corresponding subjects separately with split text prompts. Besides, we +introduce a revision method to fix the concept bleeding problem in +multi-subject synthesis. We first rely on pre-trained object detection and +segmentation models to obtain the layouts of subjects. Then we isolate and +resynthesize each subject individually with corresponding text prompts to avoid +mutual interference. Overall, we achieve a training-free strategy, named +Isolated Diffusion, to optimize multi-concept text-to-image synthesis. It is +compatible with the latest Stable Diffusion XL (SDXL) and prior Stable +Diffusion (SD) models. We compare our approach with alternative methods using a +variety of multi-concept text prompts and demonstrate its effectiveness with +clear advantages in text-image consistency and in a user study. +
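+ A much-simplified NumPy sketch of the revision idea above (obtain per-subject masks, then put each individually re-synthesized subject back into the scene). The actual method re-runs diffusion per subject with its own prompt rather than performing a plain pixel composite, so the masks, images and compositing below are illustrative assumptions only.
+
+ import numpy as np
+
+ def composite_subjects(base_image, subject_images, subject_masks):
+     # base_image: (H, W, 3) scene; subject_images: list of (H, W, 3) re-synthesized
+     # subjects; subject_masks: list of (H, W) soft masks in [0, 1] from a segmenter.
+     out = base_image.astype(np.float32).copy()
+     for img, mask in zip(subject_images, subject_masks):
+         m = mask.astype(np.float32)[..., None]              # (H, W, 1) alpha
+         out = m * img.astype(np.float32) + (1.0 - m) * out   # paste subject by its mask
+     return out.astype(base_image.dtype)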
+
+
+
+
+ + ☆ Hyperspherical Classification with Dynamic Label-to-Prototype Assignment CVPR 2024 + + +
+ Aiming to enhance the utilization of metric space by the parametric softmax +classifier, recent studies suggest replacing it with a non-parametric +alternative. Although a non-parametric classifier may provide better metric +space utilization, it introduces the challenge of capturing inter-class +relationships. A shared characteristic among prior non-parametric classifiers +is the static assignment of labels to prototypes during training, i.e., each +prototype consistently represents a class throughout the training course. +Orthogonal to previous works, we present a simple yet effective method to +optimize the category assigned to each prototype (label-to-prototype +assignment) during the training. To this aim, we formalize the problem as a +two-step optimization objective over network parameters and label-to-prototype +assignment mapping. We solve this optimization using a sequential combination +of gradient descent and bipartite matching. We demonstrate the benefits of the +proposed approach by conducting experiments on balanced and long-tail +classification problems using different backbone network architectures. In +particular, our method outperforms its competitors by 1.22\% accuracy on +CIFAR-100, and 2.15\% on ImageNet-200 using a metric space dimension half of +the size of that of its competitors. Code: +https://github.com/msed-Ebrahimi/DL2PA_CVPR24 +
+
+ comment: Accepted to CVPR 2024 +
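+ The label-to-prototype re-assignment step can be posed as a linear assignment problem; a minimal sketch using SciPy's Hungarian solver is given below. Using class-mean features and a cosine cost is an assumption made for illustration; the linked repository contains the authors' exact formulation.
+
+ import numpy as np
+ from scipy.optimize import linear_sum_assignment
+
+ def reassign_labels_to_prototypes(class_means, prototypes):
+     # class_means: (C, d) current mean feature per class, L2-normalized
+     # prototypes:  (C, d) fixed hyperspherical prototypes, L2-normalized
+     # Returns assign, where class c is mapped to prototype assign[c].
+     cost = -class_means @ prototypes.T          # negative cosine similarity as cost
+     rows, cols = linear_sum_assignment(cost)    # Hungarian / bipartite matching
+     assign = np.empty(len(rows), dtype=int)
+     assign[rows] = cols
+     return assign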
+
+
+
+
+ + ☆ PropTest: Automatic Property Testing for Improved Visual Programming + + +
+ Visual Programming has emerged as an alternative to end-to-end black-box +visual reasoning models. This type of method leverages Large Language Models +(LLMs) to decompose a problem and generate the source code for an executable +computer program. This strategy has the advantage of offering an interpretable +reasoning path and does not require finetuning a model with task-specific data. +We propose PropTest, a general strategy that improves visual programming by +further using an LLM to generate code that tests for visual properties in an +initial round of proposed solutions. Particularly, our method tests for +data-type consistency, as well as syntactic and semantic properties in the +generated solutions. Our proposed solution outperforms baselines and achieves +comparable results to state-of-the-art methods while using smaller and publicly +available LLMs (CodeLlama-7B and WizardCoder-15B). This is demonstrated across +different benchmarks on visual question answering and referring expression +comprehension, showing the efficacy of our approach in enhancing the +performance and generalization of visual reasoning tasks. Specifically, +PropTest improves ViperGPT by obtaining 48.66% accuracy (+8.3%) on the A-OKVQA +benchmark and 52.8% (+3.3%) on the RefCOCO+ benchmark using CodeLlama-7B. +
+
+ comment: Project Page: https://jaywonkoo17.github.io/PropTest/ +
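+ The property-testing idea (run generated checks over a candidate program's output and reject solutions that violate them) can be sketched as a tiny harness. The example properties below are hypothetical; the real system generates such checks per question with an LLM.
+
+ def run_property_tests(result, tests):
+     # tests: list of (name, callable) pairs; each callable returns True if the
+     # property holds for `result`. A crashing check counts as a failed property.
+     failures = [name for name, check in tests if not _passes(check, result)]
+     return {"passed": len(failures) == 0, "failures": failures}
+
+ def _passes(check, result):
+     try:
+         return bool(check(result))
+     except Exception:
+         return False
+
+ # Hypothetical properties for a VQA answer expected to be a short color word.
+ tests = [
+     ("is_string", lambda r: isinstance(r, str)),
+     ("non_empty", lambda r: len(r.strip()) > 0),
+     ("short_answer", lambda r: len(r.split()) <= 3),
+ ]
+ print(run_property_tests("dark blue", tests))   # {'passed': True, 'failures': []}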
+
+
+
+
+ + ☆ Make-It-Vivid: Dressing Your Animatable Biped Cartoon Characters from + Text + + +
+ Creating and animating 3D biped cartoon characters is crucial and valuable in +various applications. Compared with geometry, the diverse texture design plays +an important role in making 3D biped cartoon characters vivid and charming. +Therefore, we focus on automatic texture design for cartoon characters based on +input instructions. This is challenging due to domain-specific requirements and a +lack of high-quality data. To address this challenge, we propose Make-It-Vivid, +the first attempt to enable high-quality texture generation from text in UV +space. We prepare a detailed text-texture paired dataset for 3D characters by +using vision-question-answering agents. Then we customize a pretrained +text-to-image model to generate texture maps with template structure while +preserving the natural 2D image knowledge. Furthermore, to enhance fine-grained +details, we propose a novel adversarial learning scheme to shorten the domain +gap between the original dataset and the realistic texture domain. Extensive +experiments show that our approach outperforms current texture generation +methods, resulting in efficient character texturing and faithful generation +with prompts. Besides, we showcase various applications such as out-of-domain +generation and texture stylization. We also provide an efficient generation +system for automatic text-guided textured character generation and animation. +
+
+ comment: Project page: https://make-it-vivid.github.io/ +
+
+
+
+
+ + ☆ Provably Robust Score-Based Diffusion Posterior Sampling for + Plug-and-Play Image Reconstruction + + +
+ In a great number of tasks in science and engineering, the goal is to infer +an unknown image from a small number of measurements collected from a known +forward model describing certain sensing or imaging modality. Due to resource +constraints, this task is often extremely ill-posed, which necessitates the +adoption of expressive prior information to regularize the solution space. +Score-based diffusion models, due to their impressive empirical success, have +emerged as an appealing candidate for an expressive prior in image +reconstruction. In order to accommodate diverse tasks at once, it is of great +interest to develop efficient, consistent and robust algorithms that +incorporate {\em unconditional} score functions of an image prior distribution +in conjunction with flexible choices of forward models. + This work develops an algorithmic framework for employing score-based +diffusion models as an expressive data prior in general nonlinear inverse +problems. Motivated by the plug-and-play framework in the imaging community, we +introduce a diffusion plug-and-play method (\textsf{DPnP}) that alternately +calls two samplers, a proximal consistency sampler based solely on the +likelihood function of the forward model, and a denoising diffusion sampler +based solely on the score functions of the image prior. The key insight is that +denoising under white Gaussian noise can be solved {\em rigorously} via both +stochastic (i.e., DDPM-type) and deterministic (i.e., DDIM-type) samplers using +the unconditional score functions. We establish both asymptotic and +non-asymptotic performance guarantees of \textsf{DPnP}, and provide numerical +experiments to illustrate its promise in solving both linear and nonlinear +image reconstruction tasks. To the best of our knowledge, \textsf{DPnP} is the +first provably-robust posterior sampling method for nonlinear inverse problems +using unconditional diffusion priors. +
+
+
+
+
+ + ☆ Towards Balanced RGB-TSDF Fusion for Consistent Semantic Scene + Completion by 3D RGB Feature Completion and a Classwise Entropy Loss Function + + +
+ Semantic Scene Completion (SSC) aims to jointly infer semantics and +occupancies of 3D scenes. Truncated Signed Distance Function (TSDF), a 3D +encoding of depth, has been a common input for SSC. Furthermore, RGB-TSDF +fusion seems promising since these two modalities provide color and geometry +information, respectively. Nevertheless, RGB-TSDF fusion has been considered +nontrivial, and the commonly used naive addition results in inconsistent +results. We argue that the inconsistency comes from the sparsity of RGB +features upon projecting into 3D space, while TSDF features are dense, leading +to imbalanced feature maps when summed up. To address this RGB-TSDF +distribution difference, we propose a two-stage network with a 3D RGB feature +completion module that completes RGB features with meaningful values for +occluded areas. Moreover, we propose an effective classwise entropy loss +function to penalize inconsistency. Extensive experiments on public datasets +verify that our method achieves state-of-the-art performance among methods that +do not adopt extra data. +
+
+
+
+
+ + ☆ CVT-xRF: Contrastive In-Voxel Transformer for 3D Consistent Radiance + Fields from Sparse Inputs CVPR 2024 + + +
+ Neural Radiance Fields (NeRF) have shown impressive capabilities for +photorealistic novel view synthesis when trained on dense inputs. However, when +trained on sparse inputs, NeRF typically encounters issues of incorrect density +or color predictions, mainly due to insufficient coverage of the scene causing +partial and sparse supervision, thus leading to significant performance +degradation. While existing works mainly consider ray-level consistency to +construct 2D learning regularization based on rendered color, depth, or +semantics on image planes, in this paper we propose a novel approach that +models 3D spatial field consistency to improve NeRF's performance with sparse +inputs. Specifically, we first adopt a voxel-based ray sampling strategy to +ensure that the sampled rays intersect with a certain voxel in 3D space. We +then randomly sample additional points within the voxel and apply a Transformer +to infer the properties of other points on each ray, which are then +incorporated into the volume rendering. By backpropagating through the +rendering loss, we enhance the consistency among neighboring points. +Additionally, we propose to use a contrastive loss on the encoder output of the +Transformer to further improve consistency within each voxel. Experiments +demonstrate that our method yields significant improvement over different +radiance fields in the sparse inputs setting, and achieves comparable +performance with current works. + +
+
+ comment: The paper is accepted by CVPR 2024. Project page is available at + https://zhongyingji.github.io/CVT-xRF +
+
+
+
+
+ + ☆ INPC: Implicit Neural Point Clouds for Radiance Field Rendering + + +
+ We introduce a new approach for reconstruction and novel-view synthesis of +unbounded real-world scenes. In contrast to previous methods using either +volumetric fields, grid-based models, or discrete point cloud proxies, we +propose a hybrid scene representation, which implicitly encodes a point cloud +in a continuous octree-based probability field and a multi-resolution hash +grid. In doing so, we combine the benefits of both worlds by retaining +favorable behavior during optimization: Our novel implicit point cloud +representation and differentiable bilinear rasterizer enable fast rendering +while preserving fine geometric detail without depending on initial priors like +structure-from-motion point clouds. Our method achieves state-of-the-art image +quality on several common benchmark datasets. Furthermore, we achieve fast +inference at interactive frame rates, and can extract explicit point clouds to +further enhance performance. + +
+
+ comment: Project page: https://fhahlbohm.github.io/inpc/ +
+
+
+
+
+ + ☆ Multiple Object Tracking as ID Prediction + + +
+ In Multiple Object Tracking (MOT), tracking-by-detection methods have stood +the test for a long time, which split the process into two parts according to +the definition: object detection and association. They leverage robust +single-frame detectors and treat object association as a post-processing step +through hand-crafted heuristic algorithms and surrogate tasks. However, the +nature of heuristic techniques prevents end-to-end exploitation of training +data, leading to increasingly cumbersome and challenging manual modification +while facing complicated or novel scenarios. In this paper, we regard this +object association task as an End-to-End in-context ID prediction problem and +propose a streamlined baseline called MOTIP. Specifically, we form the target +embeddings into historical trajectory information while considering the +corresponding IDs as in-context prompts, then directly predict the ID labels +for the objects in the current frame. Thanks to this end-to-end process, MOTIP +can learn tracking capabilities straight from training data, freeing itself +from burdensome hand-crafted algorithms. Without bells and whistles, our method +achieves impressive state-of-the-art performance in complex scenarios like +DanceTrack and SportsMOT, and it performs competitively with other +transformer-based methods on MOT17. We believe that MOTIP demonstrates +remarkable potential and can serve as a starting point for future research. The +code is available at https://github.com/MCG-NJU/MOTIP. + +
+
+ comment: 71.4 HOTA on DanceTrack (with CrowdHuman), 67.5/70.0 HOTA on + DanceTrack built upon Deformable DETR and DAB-Deformable DETR respectively + (without additional data). The code repository will be created within several + days +
+
+
+
+
+ + ☆ From Two Stream to One Stream: Efficient RGB-T Tracking via Mutual + Prompt Learning and Knowledge Distillation + + +
+ Due to the complementary nature of visible light and thermal infrared +modalities, object tracking based on the fusion of visible light images and +thermal images (referred to as RGB-T tracking) has received increasing +attention from researchers in recent years. How to achieve more comprehensive +fusion of information from the two modalities at a lower cost has been an issue +that researchers have been exploring. Inspired by visual prompt learning, we +designed a novel two-stream RGB-T tracking architecture based on cross-modal +mutual prompt learning, and used this model as a teacher to guide a one-stream +student model for rapid learning through knowledge distillation techniques. +Extensive experiments have shown that, compared to similar RGB-T trackers, our +designed teacher model achieved the highest precision rate, while the student +model, with comparable precision rate to the teacher model, realized an +inference speed more than three times faster than the teacher model. (Codes will +be available if accepted.) +
+
+
+
+
+ + ☆ UrbanVLP: A Multi-Granularity Vision-Language Pre-Trained Foundation + Model for Urban Indicator Prediction + + +
+ Urban indicator prediction aims to infer socio-economic metrics in diverse +urban landscapes using data-driven methods. However, prevalent pre-trained +models, particularly those reliant on satellite imagery, face dual challenges. +Firstly, concentrating solely on macro-level patterns from satellite data may +introduce bias, lacking nuanced details at micro levels, such as architectural +details at a place. Secondly, the lack of interpretability in pre-trained +models limits their utility in providing transparent evidence for urban +planning. In response to these issues, we devise a novel Vision-Language +Pre-Trained Model (UrbanVLP) in this paper. Our UrbanVLP seamlessly integrates +multi-granularity information from both macro (satellite) and micro +(street-view) levels, overcoming the limitations of prior pre-trained models. +Moreover, it introduces automatic text generation and calibration, elevating +interpretability in downstream applications by producing high-quality text +descriptions of urban imagery. Rigorous experiments conducted across six +socio-economic tasks underscore UrbanVLP's superior performance. We also deploy +a web platform to verify its practicality. + +
+
+
+
+
+ + ☆ One-Shot Domain Incremental Learning IJCNN + + +
+ Domain incremental learning (DIL) has been discussed in previous studies on +deep neural network models for classification. In DIL, we assume that samples +on new domains are observed over time. The models must classify inputs on all +domains. In practice, however, we may encounter a situation where we need to +perform DIL under the constraint that the samples on the new domain are +observed only infrequently. Therefore, in this study, we consider the extreme +case where we have only one sample from the new domain, which we call one-shot +DIL. We first empirically show that existing DIL methods do not work well in +one-shot DIL. We have analyzed the reason for this failure through various +investigations. According to our analysis, we clarify that the difficulty of +one-shot DIL is caused by the statistics in the batch normalization layers. +Therefore, we propose a technique regarding these statistics and demonstrate +the effectiveness of our technique through experiments on open datasets. + +
+
+ comment: accepted at IEEE International Joint Conference on Neural Networks + (IJCNN) 2024 +
+
+
+
+
+ + ☆ Learning from Reduced Labels for Long-Tailed Data + + +
+ Long-tailed data is prevalent in real-world classification tasks and heavily +relies on supervised information, which makes the annotation process +exceptionally labor-intensive and time-consuming. Unfortunately, despite being +a common approach to mitigate labeling costs, existing weakly supervised +learning methods struggle to adequately preserve supervised information for +tail samples, resulting in a decline in accuracy for the tail classes. To +alleviate this problem, we introduce a novel weakly supervised labeling setting +called Reduced Label. The proposed labeling setting not only avoids the decline +of supervised information for the tail samples, but also decreases the labeling +costs associated with long-tailed data. Additionally, we propose a +straightforward and highly efficient unbiased framework with strong theoretical +guarantees to learn from these Reduced Labels. Extensive experiments conducted +on benchmark datasets including ImageNet validate the effectiveness of our +approach, surpassing the performance of state-of-the-art weakly supervised +methods. +
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ + ☆ Resolution Limit of Single-Photon LiDAR + + +
+ Single-photon Light Detection and Ranging (LiDAR) systems are often equipped +with an array of detectors for improved spatial resolution and sensing speed. +However, given a fixed amount of flux produced by the laser transmitter across +the scene, the per-pixel Signal-to-Noise Ratio (SNR) will decrease when more +pixels are packed in a unit space. This presents a fundamental trade-off +between the spatial resolution of the sensor array and the SNR received at each +pixel. Theoretical characterization of this fundamental limit is explored. By +deriving the photon arrival statistics and introducing a series of new +approximation techniques, the Mean Squared Error (MSE) of the +maximum-likelihood estimator of the time delay is derived. The theoretical +predictions align well with simulations and real data. + +
+
+
+
+
+ + ☆ ProIn: Learning to Predict Trajectory Based on Progressive Interactions + for Autonomous Driving + + +
+ Accurate motion prediction of pedestrians, cyclists, and other surrounding +vehicles (all called agents) is very important for autonomous driving. Most +existing works capture map information through a one-stage interaction with the +map via vector-based attention, to provide map constraints for social +interaction and multi-modal differentiation. However, these methods have to +encode all required map rules into the focal agent's feature, so as to retain +the paths of all possible intentions while at the same time adapting to potential +social interaction. In this work, a progressive interaction network is proposed +to enable the agent's feature to progressively focus on relevant maps, in order +to better learn agents' feature representation capturing the relevant map +constraints. The network progressively encodes the complex influence of map +constraints into the agent's feature through graph convolutions at the +following three stages: after the historical trajectory encoder, after social +interaction, and after multi-modal differentiation. In addition, a weight +allocation mechanism is proposed for multi-modal training, so that each mode +can obtain learning opportunities from a single-mode ground truth. Experiments +have validated the superiority of progressive interactions over the existing +one-stage interaction, and demonstrate the effectiveness of each component. +Encouraging results were obtained on challenging benchmarks. +
+
+
+
+
+ + ☆ Brain Stroke Segmentation Using Deep Learning Models: A Comparative + Study + + +
+ Stroke segmentation plays a crucial role in the diagnosis and treatment of +stroke patients by providing spatial information about affected brain regions +and the extent of damage. Segmenting stroke lesions accurately is a challenging +task, given that conventional manual techniques are time-consuming and prone to +errors. Recently, advanced deep models have been introduced for general medical +image segmentation, demonstrating promising results that surpass many +state-of-the-art networks when evaluated on specific datasets. With the advent of +vision Transformers, several models have been introduced based on them, while +others have aimed to design better modules based on traditional convolutional +layers to extract long-range dependencies like Transformers. The question of +whether such high-level designs are necessary for all segmentation cases to +achieve the best results remains unanswered. In this study, we selected four +types of deep models that were recently proposed and evaluated their +performance for stroke segmentation: a pure Transformer-based architecture +(DAE-Former), two advanced CNN-based models (LKA and DLKA) with attention +mechanisms in their design, an advanced hybrid model that incorporates CNNs +with Transformers (FCT), and the well-known self-adaptive nnUNet framework with +its configuration based on given data. We examined their performance on two +publicly available datasets, and found that the nnUNet achieved the best +results with the simplest design among all. The revealed lack of robustness of +Transformers to such variabilities is a potential reason for their +weaker performance. Furthermore, nnUNet's success underscores the significant +impact of preprocessing and postprocessing techniques in enhancing segmentation +results, surpassing the focus solely on architectural designs. +
+
+
+
+
+ + ♻ ☆ HAIFIT: Human-Centered AI for Fashion Image Translation + + +
+ In the realm of fashion design, sketches serve as the canvas for expressing +an artist's distinctive drawing style and creative vision, capturing intricate +details like stroke variations and texture nuances. The advent of +sketch-to-image cross-modal translation technology has notably aided designers. +However, existing methods often compromise these sketch details during image +generation, resulting in images that deviate from the designer's intended +concept. This limitation hampers the ability to offer designers a precise +preview of the final output. To overcome this challenge, we introduce HAIFIT, a +novel approach that transforms sketches into high-fidelity, lifelike clothing +images by integrating multi-scale features and capturing extensive feature map +dependencies from diverse perspectives. Through extensive qualitative and +quantitative evaluations conducted on our self-collected dataset, our method +demonstrates superior performance compared to existing methods in generating +photorealistic clothing images. Our method excels in preserving the distinctive +style and intricate details essential for fashion design applications. + +
+
+ comment: 8 pages,8 figures +
+
+
+
+
+ + ♻ ☆ SeMoLi: What Moves Together Belongs Together CVPR 2024 + + +
+ We tackle semi-supervised object detection based on motion cues. Recent +results suggest that heuristic-based clustering methods in conjunction with +object trackers can be used to pseudo-label instances of moving objects and use +these as supervisory signals to train 3D object detectors in Lidar data without +manual supervision. We re-think this approach and suggest that both, object +detection, as well as motion-inspired pseudo-labeling, can be tackled in a +data-driven manner. We leverage recent advances in scene flow estimation to +obtain point trajectories from which we extract long-term, class-agnostic +motion patterns. Revisiting correlation clustering in the context of message +passing networks, we learn to group those motion patterns to cluster points to +object instances. By estimating the full extent of the objects, we obtain +per-scan 3D bounding boxes that we use to supervise a Lidar object detection +network. Our method not only outperforms prior heuristic-based approaches (57.5 +AP, +14 improvement over prior work), more importantly, we show we can +pseudo-label and train object detectors across datasets. + +
+
+ comment: Accepted to CVPR 2024! +
+
+
+
+
+ + ♻ ☆ Geometric Generative Models based on Morphological Equivariant PDEs and + GANs + + +
+ Content and image generation consists of creating or generating data from +noisy information by extracting specific features such as texture, edges, and +other thin image structures. We are interested here in generative models, and +two main problems are addressed. Firstly, improving specific feature +extraction while accounting for intrinsic geometric features at multiscale levels; +and secondly, enforcing equivariance in the network to reduce its complexity and +provide geometric interpretability. To proceed, we propose a geometric +generative model based on an equivariant partial differential equation (PDE) +for group convolution neural networks (G-CNNs), so-called PDE-G-CNNs, built on +morphology operators and generative adversarial networks (GANs). Equivariant +morphological PDE layers are composed of multiscale dilations and erosions +formulated in Riemannian manifolds, while group symmetries are defined on a Lie +group. We take advantage of the Lie group structure to properly integrate the +equivariance in layers, and are able to use the Riemannian metric to solve the +multiscale morphological operations. Each point of the Lie group is associated +with a unique point in the manifold, which helps us derive a metric on the +Riemannian manifold from a tensor field invariant under the Lie group so that +the induced metric has the same symmetries. The proposed geometric +morphological GAN (GM-GAN) is obtained by using the proposed morphological +equivariant convolutions in PDE-G-CNNs to bring nonlinearity into classical CNNs. +GM-GAN is evaluated on MNIST data and compared with GANs. Preliminary results +show that the GM-GAN model outperforms classical GANs. +
+
+
+
+
+ + ♻ ☆ Towards Precise 3D Human Pose Estimation with Multi-Perspective + Spatial-Temporal Relational Transformers IJCNN 2024 + + +
+ 3D human pose estimation captures the human joint points in three-dimensional +space while keeping the depth information and physical structure. That is +essential for applications that require precise pose information, such as +human-computer interaction, scene understanding, and rehabilitation training. +Due to the challenges in data collection, mainstream datasets of 3D human pose +estimation are primarily composed of multi-view video data collected in +laboratory environments, which contains rich spatial-temporal correlation +information besides the image frame content. Given the remarkable +self-attention mechanism of transformers, capable of capturing the +spatial-temporal correlation from multi-view video datasets, we propose a +multi-stage framework for 3D sequence-to-sequence (seq2seq) human pose +detection. Firstly, the spatial module represents the human pose feature by +intra-image content, while the frame-image relation module extracts temporal +relationships and 3D spatial positional relationship features between the +multi-perspective images. Secondly, the self-attention mechanism is adopted to +eliminate the interference from non-human body parts and reduce computing +resources. Our method is evaluated on Human3.6M, a popular 3D human pose +detection dataset. Experimental results demonstrate that our approach achieves +state-of-the-art performance on this dataset. The source code will be available +at https://github.com/WUJINHUAN/3D-human-pose. + +
+
+ comment: Accepted to IJCNN 2024. The source code will be available at + https://github.com/WUJINHUAN/3D-human-pose +
+
+
+
+
+ + ♻ ☆ Meet JEANIE: a Similarity Measure for 3D Skeleton Sequences via + Temporal-Viewpoint Alignment ACCV'22 + + +
+ Video sequences exhibit significant nuisance variations (undesired effects) +of speed of actions, temporal locations, and subjects' poses, leading to +temporal-viewpoint misalignment when comparing two sets of frames or evaluating +the similarity of two sequences. Thus, we propose Joint tEmporal and cAmera +viewpoiNt alIgnmEnt (JEANIE) for sequence pairs. In particular, we focus on 3D +skeleton sequences whose camera and subjects' poses can be easily manipulated +in 3D. We evaluate JEANIE on skeletal Few-shot Action Recognition (FSAR), where +matching the temporal blocks (temporal chunks that make up a sequence) of +support-query sequence pairs well (by factoring out nuisance variations) is +essential due to the limited samples of novel classes. Given a query sequence, we +create several views of it by simulating several camera locations. For a support +sequence, we match it with view-simulated query sequences, as in the popular +Dynamic Time Warping (DTW). Specifically, each support temporal block can be +matched to the query temporal block with the same or adjacent (next) temporal +index, and adjacent camera views to achieve joint local temporal-viewpoint +warping. JEANIE selects the smallest distance among matching paths with +different temporal-viewpoint warping patterns, an advantage over DTW which only +performs temporal alignment. We also propose an unsupervised FSAR akin to +clustering of sequences with JEANIE as a distance measure. JEANIE achieves +state-of-the-art results on NTU-60, NTU-120, Kinetics-skeleton and UWA3D +Multiview Activity II on supervised and unsupervised FSAR, and their +meta-learning inspired fusion. +
+
+ comment: Accepted by the International Journal of Computer Vision (IJCV). An + extension of our ACCV'22 paper [arXiv:2210.16820] which was + distinguished by the Sang Uk Lee Best Student Paper Award
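+ A compact sketch of the joint temporal-viewpoint alignment idea: a DTW-style dynamic program whose state also carries a viewpoint index that may stay put or step to an adjacent simulated view at each move. The squared-Euclidean block cost and the transition set below are illustrative assumptions, not the authors' implementation.
+
+ import numpy as np
+
+ def jeanie_distance(support, query_views):
+     # support:     (M, d) temporal block descriptors of the support sequence
+     # query_views: (V, N, d) query block descriptors under V simulated camera views
+     V, N, _ = query_views.shape
+     M = support.shape[0]
+     cost = np.stack([((support[:, None, :] - query_views[v][None, :, :]) ** 2).sum(-1)
+                      for v in range(V)])                  # (V, M, N) block-to-block costs
+     acc = np.full((V, M, N), np.inf)
+     acc[:, 0, 0] = cost[:, 0, 0]
+     for i in range(M):
+         for j in range(N):
+             if i == 0 and j == 0:
+                 continue
+             for v in range(V):
+                 best = np.inf
+                 for dv in (-1, 0, 1):                     # stay on this view or move to a neighbour
+                     u = v + dv
+                     if not 0 <= u < V:
+                         continue
+                     for di, dj in ((1, 0), (0, 1), (1, 1)):   # DTW-style predecessors
+                         pi, pj = i - di, j - dj
+                         if pi >= 0 and pj >= 0 and acc[u, pi, pj] < best:
+                             best = acc[u, pi, pj]
+                 acc[v, i, j] = cost[v, i, j] + best
+     return acc[:, -1, -1].min()                           # best path over all end viewpoints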
+
+
+
+
+ + ♻ ☆ An Image is Worth 1/2 Tokens After Layer 2: Plug-and-Play Inference + Acceleration for Large Vision-Language Models + + +
+ In this study, we identify the inefficient attention phenomenon in Large +Vision-Language Models (LVLMs), notably within prominent models like LLaVA-1.5, +QwenVL-Chat and Video-LLaVA. We find that the attention computation over +visual tokens is extremely inefficient in the deep layers of popular LVLMs, +suggesting a need for a sparser approach compared to textual data handling. To +this end, we introduce FastV, a versatile plug-and-play method designed to +optimize computational efficiency by learning adaptive attention patterns in +early layers and pruning visual tokens in subsequent ones. Our evaluations +demonstrate FastV's ability to dramatically reduce computational costs (e.g., a +45% reduction in FLOPs for LLaVA-1.5-13B) without sacrificing performance in a +wide range of image and video understanding tasks. The computational efficiency +and performance trade-off of FastV is highly customizable and +Pareto-efficient. It can compress the FLOPs of a 13B-parameter model to achieve +a lower budget than that of a 7B-parameter model, while still maintaining +superior performance. We believe FastV has practical value for the deployment of +LVLMs on edge devices and in commercial models. Code is released at +https://github.com/pkunlp-icler/FastV. +
+
+ comment: 21 pages, 8 figures, code is released at + https://github.com/pkunlp-icler/FastV
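+ A rough PyTorch sketch of attention-guided visual-token pruning of the kind described above: rank visual tokens by the attention they receive in an early layer and keep only the top fraction for the remaining layers. The tensor layout, head/query averaging and keep ratio are illustrative assumptions; the released FastV code linked above is the authoritative reference.
+
+ import torch
+
+ def prune_visual_tokens(hidden_states, attn_weights, visual_idx, keep_ratio=0.5):
+     # hidden_states: (B, L, D) activations entering the next decoder layer
+     # attn_weights:  (B, H, L, L) attention maps taken from an early layer
+     # visual_idx:    1-D LongTensor with the sequence positions of the visual tokens
+     received = attn_weights.mean(dim=1).mean(dim=1)        # (B, L) attention each token receives
+     scores = received[:, visual_idx]                       # (B, n_visual)
+     n_keep = max(1, int(keep_ratio * visual_idx.numel()))
+     top_local = scores.topk(n_keep, dim=-1).indices        # (B, n_keep) indices into visual_idx
+     keep_global = visual_idx[top_local]                    # (B, n_keep) sequence positions
+
+     B, L, _ = hidden_states.shape
+     dev = hidden_states.device
+     keep_mask = torch.ones(B, L, dtype=torch.bool, device=dev)
+     keep_mask[:, visual_idx] = False                       # drop all visual tokens first ...
+     keep_mask[torch.arange(B, device=dev).unsqueeze(1), keep_global] = True  # ... restore top-k
+     # Every row keeps the same number of tokens, so the rows can be re-stacked.
+     return torch.stack([hidden_states[b][keep_mask[b]] for b in range(B)])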
+
+
+
+
+ + ♻ ☆ MambaIR: A Simple Baseline for Image Restoration with State-Space Model + + +
+ Recent years have seen significant advancements in image restoration, largely +attributed to the development of modern deep neural networks, such as CNNs and +Transformers. However, existing restoration backbones often face the dilemma +between global receptive fields and efficient computation, hindering their +application in practice. Recently, the Selective Structured State Space Model, +especially the improved version Mamba, has shown great potential for long-range +dependency modeling with linear complexity, which offers a way to resolve the +above dilemma. However, the standard Mamba still faces certain challenges in +low-level vision such as local pixel forgetting and channel redundancy. In this +work, we introduce a simple but effective baseline, named MambaIR, which +introduces both local enhancement and channel attention to improve the vanilla +Mamba. In this way, our MambaIR takes advantage of the local pixel similarity +and reduces the channel redundancy. Extensive experiments demonstrate the +superiority of our method, for example, MambaIR outperforms SwinIR by up to +0.45dB on image SR, using similar computational cost but with a global +receptive field. Code is available at \url{https://github.com/csguoh/MambaIR}. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ Text-Conditioned Resampler For Long Form Video Understanding + + +
+ In this paper we present a text-conditioned video resampler (TCR) module that +uses a pre-trained and frozen visual encoder and large language model (LLM) to +process long video sequences for a task. TCR localises relevant visual features +from the video given a text condition and provides them to an LLM to generate a +text response. Due to its lightweight design and use of cross-attention, TCR +can process more than 100 frames at a time with plain attention and without +optimised implementations. We make the following contributions: (i) we design a +transformer-based sampling architecture that can process long videos +conditioned on a task, together with a training method that enables it to +bridge pre-trained visual and language models; (ii) we identify tasks that +could benefit from longer video perception; and (iii) we empirically validate +its efficacy on a wide variety of evaluation tasks including NextQA, EgoSchema, +and the EGO4D-LTA challenge. +
+
+
+
+
+ + ♻ ☆ EMAGE: Towards Unified Holistic Co-Speech Gesture Generation via + Expressive Masked Audio Gesture Modeling CVPR + + +
+ We propose EMAGE, a framework to generate full-body human gestures from audio +and masked gestures, encompassing facial, local body, hands, and global +movements. To achieve this, we first introduce BEAT2 (BEAT-SMPLX-FLAME), a new +mesh-level holistic co-speech dataset. BEAT2 combines MoShed SMPLX body with +FLAME head parameters and further refines the modeling of head, neck, and +finger movements, offering a community-standardized, high-quality 3D motion +captured dataset. EMAGE leverages masked body gesture priors during training to +boost inference performance. It involves a Masked Audio Gesture Transformer, +facilitating joint training on audio-to-gesture generation and masked gesture +reconstruction to effectively encode audio and body gesture hints. Encoded body +hints from masked gestures are then separately employed to generate facial and +body movements. Moreover, EMAGE adaptively merges speech features from the +audio's rhythm and content and utilizes four compositional VQ-VAEs to enhance +the results' fidelity and diversity. Experiments demonstrate that EMAGE +generates holistic gestures with state-of-the-art performance and is flexible +in accepting predefined spatial-temporal gesture inputs, generating complete, +audio-synchronized results. Our code and dataset are available at +https://pantomatrix.github.io/EMAGE/ + +
+
+ comment: CVPR Camera Ready; Project Page: https://pantomatrix.github.io/EMAGE/ +
+
+
+
+
+ + ♻ ☆ BioNeRF: Biologically Plausible Neural Radiance Fields for View + Synthesis + + +
+ This paper presents BioNeRF, a biologically plausible architecture that +models scenes in a 3D representation and synthesizes new views through radiance +fields. Since NeRF relies on the network weights to store the scene's +3-dimensional representation, BioNeRF implements a cognitive-inspired mechanism +that fuses inputs from multiple sources into a memory-like structure, improving +the storing capacity and extracting more intrinsic and correlated information. +BioNeRF also mimics a behavior observed in pyramidal cells concerning +contextual information, in which the memory is provided as the context and +combined with the inputs of two subsequent neural models, one responsible for +producing the volumetric densities and the other the colors used to render the +scene. Experimental results show that BioNeRF outperforms state-of-the-art +results concerning a quality measure that encodes human perception in two +datasets: real-world images and synthetic data. + +
+
+
+
+
+ + ♻ ☆ Multi-modal Instruction Tuned LLMs with Fine-grained Visual Perception CVPR 2024 + + +
+ Multimodal Large Language Models (MLLMs) leverage Large Language Models as a +cognitive framework for diverse visual-language tasks. Recent efforts have been +made to equip MLLMs with visual perception and grounding capabilities. However, +there still remains a gap in providing fine-grained pixel-level perceptions and +extending interactions beyond text-specific inputs. In this work, we propose +{\bf{AnyRef}}, a general MLLM model that can generate pixel-wise object +perceptions and natural language descriptions from multi-modality references, +such as texts, boxes, images, or audio. This innovation empowers users with +greater flexibility to engage with the model beyond textual and regional +prompts, without modality-specific designs. Through our proposed refocusing +mechanism, the generated grounding output is guided to better focus on the +referenced object, implicitly incorporating additional pixel-level supervision. +This simple modification utilizes attention scores generated during the +inference of the LLM, eliminating the need for extra computations while exhibiting +performance enhancements in both grounding masks and referring expressions. +With only publicly available training data, our model achieves state-of-the-art +results across multiple benchmarks, including diverse modality referring +segmentation and region-level referring expression generation. +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Boosting Adversarial Transferability by Block Shuffle and Rotation CVPR 2024 + + +
+ Adversarial examples mislead deep neural networks with imperceptible +perturbations and have brought significant threats to deep learning. An +important aspect is their transferability, which refers to their ability to +deceive other models, thus enabling attacks in the black-box setting. Though +various methods have been proposed to boost transferability, the performance +still falls short compared with white-box attacks. In this work, we observe +that existing input transformation based attacks, one of the mainstream +transfer-based attacks, result in different attention heatmaps on various +models, which might limit the transferability. We also find that breaking the +intrinsic relation of the image can disrupt the attention heatmap of the +original image. Based on this finding, we propose a novel input transformation +based attack called block shuffle and rotation (BSR). Specifically, BSR splits +the input image into several blocks, then randomly shuffles and rotates these +blocks to construct a set of new images for gradient calculation. Empirical +evaluations on the ImageNet dataset demonstrate that BSR could achieve +significantly better transferability than the existing input transformation +based methods under single-model and ensemble-model settings. Combining BSR +with the current input transformation method can further improve the +transferability, which significantly outperforms the state-of-the-art methods. +Code is available at https://github.com/Trustworthy-AI-Group/BSR + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Word4Per: Zero-shot Composed Person Retrieval + + +
+ Searching for a specific person has great social benefit and security value, +and it often involves a combination of visual and textual information. +Conventional person retrieval methods, whether image-based or text-based, +usually fall short in effectively harnessing both types of information, leading +to a loss of accuracy. In this paper, a whole new task called Composed Person +Retrieval (CPR) is proposed to jointly utilize both image and text information +for target person retrieval. However, the supervised CPR requires a very costly +manually annotated dataset, and no such resources are currently available. To +mitigate this issue, we first introduce Zero-shot Composed Person +Retrieval (ZS-CPR), which leverages existing domain-related data to resolve the +CPR problem without expensive annotations. Secondly, to learn the ZS-CPR model, we +propose a two-stage learning framework, Word4Per, where a lightweight Textual +Inversion Network (TINet) and a text-based person retrieval model based on a +fine-tuned Contrastive Language-Image Pre-training (CLIP) network are learned +without utilizing any CPR data. Thirdly, a finely annotated Image-Text Composed +Person Retrieval (ITCPR) dataset is built as the benchmark to assess the +performance of the proposed Word4Per framework. Extensive experiments under +both Rank-1 and mAP demonstrate the effectiveness of Word4Per for the ZS-CPR +task, surpassing the comparative methods by over 10\%. The code and ITCPR +dataset will be publicly available at +https://github.com/Delong-liu-bupt/Word4Per. +
+
+
+
+
+ + ♻ ☆ Knowledge Distillation for Road Detection based on cross-model + Semi-Supervised Learning + + +
+ The advancement of knowledge distillation has played a crucial role in +enabling the transfer of knowledge from larger teacher models to smaller and +more efficient student models, and is particularly beneficial for online and +resource-constrained applications. The effectiveness of the student model +heavily relies on the quality of the distilled knowledge received from the +teacher. Given the accessibility of unlabelled remote sensing data, +semi-supervised learning has become a prevalent strategy for enhancing model +performance. However, relying solely on semi-supervised learning with smaller +models may be insufficient due to their limited capacity for feature +extraction. This limitation restricts their ability to exploit training data. +To address this issue, we propose an integrated approach that combines +knowledge distillation and semi-supervised learning methods. This hybrid +approach leverages the robust capabilities of large models to effectively +utilise large unlabelled data whilst subsequently providing the small student +model with rich and informative features for enhancement. The proposed +semi-supervised learning-based knowledge distillation (SSLKD) approach +demonstrates a notable improvement in the performance of the student model, in +the application of road segmentation, surpassing the effectiveness of +traditional semi-supervised learning methods. + +
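+ The hybrid objective described above (a supervised loss on labelled imagery plus a distillation term that matches the student to the semi-supervised teacher on unlabelled imagery) can be written compactly. The temperature, weighting and loss choices below are generic knowledge-distillation assumptions rather than the paper's exact settings.
+
+ import torch.nn.functional as F
+
+ def sslkd_style_loss(student_logits_lab, labels,
+                      student_logits_unl, teacher_logits_unl,
+                      temperature=2.0, alpha=0.5):
+     # Supervised cross-entropy on labelled pixels/images.
+     ce = F.cross_entropy(student_logits_lab, labels)
+     # KL distillation on unlabelled data, softened by a temperature.
+     t = temperature
+     kd = F.kl_div(F.log_softmax(student_logits_unl / t, dim=1),
+                   F.softmax(teacher_logits_unl / t, dim=1),
+                   reduction="batchmean") * (t * t)
+     return (1.0 - alpha) * ce + alpha * kd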
+
+
+
+
+ + ♻ ☆ Unveiling the Blind Spots: A Critical Examination of Fairness in + Autonomous Driving Systems + + +
+ Autonomous driving systems have extended the spectrum of Web of Things for +intelligent vehicles and have become an important component of the Web +ecosystem. Similar to traditional Web-based applications, fairness is an +essential aspect for ensuring the high quality of autonomous driving systems, +particularly in the context of pedestrian detectors within them. However, there +is an absence in the literature of a comprehensive assessment of the fairness +of current Deep Learning (DL)-based pedestrian detectors. To fill the gap, we +evaluate eight widely-explored DL-based pedestrian detectors across demographic +groups on large-scale real-world datasets. To enable a thorough fairness +evaluation, we provide extensive annotations for the datasets, resulting in +8,311 images with 16,070 gender labels, 20,115 age labels, and 3,513 skin tone +labels. Our findings reveal significant fairness issues related to age. The +undetected proportions for adults are 20.14% lower than for children. +Furthermore, we explore how various driving scenarios affect the fairness of +pedestrian detectors. We find that the bias may be exacerbated for children and +females under low brightness and low contrast. +
+
+ comment: Update the models evaluated and the experimental results +
+
+
+
+
+ + ♻ ☆ HiFi-123: Towards High-fidelity One Image to 3D Content Generation + + +
+ Recent advances in diffusion models have enabled 3D generation from a single +image. However, current methods often produce suboptimal results for novel +views, with blurred textures and deviations from the reference image, limiting +their practical applications. In this paper, we introduce HiFi-123, a method +designed for high-fidelity and multi-view consistent 3D generation. Our +contributions are twofold: First, we propose a Reference-Guided Novel View +Enhancement (RGNV) technique that significantly improves the fidelity of +diffusion-based zero-shot novel view synthesis methods. Second, capitalizing on +the RGNV, we present a novel Reference-Guided State Distillation (RGSD) loss. +When incorporated into the optimization-based image-to-3D pipeline, our method +significantly improves 3D generation quality, achieving state-of-the-art +performance. Comprehensive evaluations demonstrate the effectiveness of our +approach over existing methods, both qualitatively and quantitatively. Video +results are available on the project page. + +
+
+ comment: Project Page: https://drexubery.github.io/HiFi-123/ +
+
+
+
+
+ + ♻ ☆ SVGDreamer: Text Guided SVG Generation with Diffusion Model CVPR 2024 + + +
+ Recently, text-guided scalable vector graphics (SVGs) synthesis has shown +promise in domains such as iconography and sketch. However, existing +text-to-SVG generation methods lack editability and struggle with visual +quality and result diversity. To address these limitations, we propose a novel +text-guided vector graphics synthesis method called SVGDreamer. SVGDreamer +incorporates a semantic-driven image vectorization (SIVE) process that enables +the decomposition of synthesis into foreground objects and background, thereby +enhancing editability. Specifically, the SIVE process introduces attention-based +primitive control and an attention-mask loss function for effective control and +manipulation of individual elements. Additionally, we propose a Vectorized +Particle-based Score Distillation (VPSD) approach to tackle the challenges of +shape over-smoothing, color over-saturation, limited diversity in results, and +slow convergence in existing text-to-SVG generation methods. VPSD models SVGs +as distributions of control points and colors to counteract over-smoothing and +over-saturation. Furthermore, VPSD leverages a reward model to reweight vector +particles, which improves aesthetic appeal and accelerates convergence. +Extensive experiments have been conducted to validate the effectiveness of +SVGDreamer, demonstrating its superiority over baseline methods in terms of +editability, visual quality, and diversity. The code and demo of SVGDreamer can +be found at https://ximinng.github.io/SVGDreamer-project/ +
+
+ comment: Accepted by CVPR 2024. project link: + https://ximinng.github.io/SVGDreamer-project/ +
+
+
+
+
+ + ♻ ☆ Variational Bayes image restoration with compressive autoencoders + + +
+ Regularization of inverse problems is of paramount importance in +computational imaging. The ability of neural networks to learn efficient image +representations has been recently exploited to design powerful data-driven +regularizers. While state-of-the-art plug-and-play methods rely on an implicit +regularization provided by neural denoisers, alternative Bayesian approaches +consider Maximum A Posteriori (MAP) estimation in the latent space of a +generative model, thus providing an explicit regularization. However, +state-of-the-art deep generative models require a huge amount of training data +compared to denoisers. Besides, their complexity hampers the optimization +involved in latent MAP derivation. In this work, we first propose to use +compressive autoencoders instead. These networks, which can be seen as +variational autoencoders with a flexible latent prior, are smaller and easier +to train than state-of-the-art generative models. As a second contribution, we +introduce the Variational Bayes Latent Estimation (VBLE) algorithm, which +performs latent estimation within the framework of variational inference. +Thanks to a simple yet efficient parameterization of the variational posterior, +VBLE allows for fast and easy (approximate) posterior sampling. Experimental +results on the image datasets BSD and FFHQ demonstrate that VBLE reaches performance +similar to that of state-of-the-art plug-and-play methods, while being able to +quantify uncertainties faster than other existing posterior sampling +techniques. + 
+
+
+
+
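+ A minimal sketch of variational latent estimation for a restoration problem
+ y = A x + noise, assuming a toy linear decoder and a standard normal latent prior
+ as stand-ins for the paper's compressive autoencoder; only the general recipe
+ (reparameterized Gaussian posterior, data fit plus KL) is illustrated.
+
+import torch
+
+torch.manual_seed(0)
+latent_dim, image_dim = 8, 32
+decoder = torch.nn.Linear(latent_dim, image_dim)        # stand-in generative model
+A = torch.eye(image_dim) * 0.5                          # toy degradation operator
+x_true = decoder(torch.randn(latent_dim)).detach()
+y = A @ x_true + 0.05 * torch.randn(image_dim)          # noisy observation
+
+mu = torch.zeros(latent_dim, requires_grad=True)        # variational mean
+log_sigma = torch.zeros(latent_dim, requires_grad=True) # variational log-std
+opt = torch.optim.Adam([mu, log_sigma], lr=1e-2)
+
+for _ in range(500):
+    opt.zero_grad()
+    z = mu + log_sigma.exp() * torch.randn(latent_dim)  # reparameterization trick
+    data_fit = ((A @ decoder(z) - y) ** 2).sum() / (2 * 0.05 ** 2)
+    # Closed-form KL(q(z) || N(0, I)) for a diagonal Gaussian posterior.
+    kl = 0.5 * (mu ** 2 + (2 * log_sigma).exp() - 2 * log_sigma - 1).sum()
+    (data_fit + kl).backward()
+    opt.step()
+
+# Approximate posterior samples of the restored image come from decoding z ~ q(z).
+x_hat = decoder(mu + log_sigma.exp() * torch.randn(latent_dim))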
+ + ♻ ☆ Mask Grounding for Referring Image Segmentation CVPR2024 + + +
+ Referring Image Segmentation (RIS) is a challenging task that requires an +algorithm to segment objects referred to by free-form language expressions. +Despite significant progress in recent years, most state-of-the-art (SOTA) +methods still suffer from a considerable language-image modality gap at the pixel +and word levels. These methods generally 1) rely on sentence-level language +features for language-image alignment and 2) lack explicit training supervision +for fine-grained visual grounding. Consequently, they exhibit weak object-level +correspondence between visual and language features. Without well-grounded +features, prior methods struggle to understand complex expressions that require +strong reasoning over relationships among multiple objects, especially when +dealing with rarely used or ambiguous clauses. To tackle this challenge, we +introduce a novel Mask Grounding auxiliary task that significantly improves +visual grounding within language features, by explicitly teaching the model to +learn fine-grained correspondence between masked textual tokens and their +matching visual objects. Mask Grounding can be directly used on prior RIS +methods and consistently brings improvements. Furthermore, to holistically +address the modality gap, we also design a cross-modal alignment loss and an +accompanying alignment module. These additions work synergistically with Mask +Grounding. With all these techniques, our comprehensive approach culminates in +MagNet (Mask-grounded Network), an architecture that significantly outperforms +prior art on three key benchmarks (RefCOCO, RefCOCO+ and G-Ref), demonstrating +our method's effectiveness in addressing current limitations of RIS algorithms. +Our code and pre-trained weights will be released. + 
+
+ comment: Accepted by CVPR2024; Project page: + https://yxchng.github.io/projects/mask-grounding +
+
+
+
+
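+ A minimal sketch of a mask-grounding-style auxiliary loss: randomly mask text tokens
+ and recover their identities from features fused with visual tokens. The fusion module
+ and all sizes below are placeholders, not MagNet's architecture.
+
+import torch
+import torch.nn.functional as F
+
+vocab_size, dim, n_text, n_vis = 1000, 64, 12, 49
+text_ids = torch.randint(0, vocab_size, (n_text,))
+text_emb = torch.nn.Embedding(vocab_size, dim)
+vis_feats = torch.randn(n_vis, dim)                     # e.g. flattened image features
+fuse = torch.nn.MultiheadAttention(dim, num_heads=4, batch_first=True)
+classifier = torch.nn.Linear(dim, vocab_size)
+
+mask = torch.rand(n_text) < 0.25                        # positions to predict
+tokens = text_emb(text_ids)
+tokens = torch.where(mask.unsqueeze(-1), torch.zeros_like(tokens), tokens)  # blank masked tokens
+
+# Let the (partially masked) text tokens attend to visual features, then classify
+# the masked positions back to their original word ids.
+fused, _ = fuse(tokens.unsqueeze(0), vis_feats.unsqueeze(0), vis_feats.unsqueeze(0))
+logits = classifier(fused.squeeze(0))                   # (n_text, vocab_size)
+aux_loss = F.cross_entropy(logits[mask], text_ids[mask]) if mask.any() else logits.sum() * 0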
+ + ♻ ☆ Multimodal-Conditioned Latent Diffusion Models for Fashion Image Editing + + +
+ Fashion illustration is a crucial medium for designers to convey their +creative vision and transform design concepts into tangible representations +that showcase the interplay between clothing and the human body. In the context +of fashion design, computer vision techniques have the potential to enhance and +streamline the design process. Departing from prior research primarily focused +on virtual try-on, this paper tackles the task of multimodal-conditioned +fashion image editing. Our approach aims to generate human-centric fashion +images guided by multimodal prompts, including text, human body poses, garment +sketches, and fabric textures. To address this problem, we propose extending +latent diffusion models to incorporate these multiple modalities and modifying +the structure of the denoising network, taking multimodal prompts as input. To +condition the proposed architecture on fabric textures, we employ textual +inversion techniques and let diverse cross-attention layers of the denoising +network attend to textual and texture information, thus incorporating different +granularity conditioning details. Given the lack of datasets for the task, we +extend two existing fashion datasets, Dress Code and VITON-HD, with multimodal +annotations. Experimental evaluations demonstrate the effectiveness of our +proposed approach in terms of realism and coherence concerning the provided +multimodal inputs. + +
+
+
+
+
+ + ♻ ☆ LightIt: Illumination Modeling and Control for Diffusion Models + + +
+ We introduce LightIt, a method for explicit illumination control for image +generation. Recent generative methods lack lighting control, which is crucial +to numerous artistic aspects of image generation such as setting the overall +mood or cinematic appearance. To overcome these limitations, we propose to +condition the generation on shading and normal maps. We model the lighting with +single bounce shading, which includes cast shadows. We first train a shading +estimation module to generate a dataset of real-world image and shading pairs. +Then, we train a control network using the estimated shading and normals as +input. Our method demonstrates high-quality image generation and lighting +control in numerous scenes. Additionally, we use our generated dataset to train +an identity-preserving relighting model, conditioned on an image and a target +shading. Our method is the first that enables the generation of images with +controllable, consistent lighting and performs on par with specialized +relighting state-of-the-art methods. + 
+
+ comment: Project page: https://peter-kocsis.github.io/LightIt/ Video: + https://youtu.be/cCfSBD5aPLI +
+
+
+
+
+ + ♻ ☆ Fully automated workflow for the design of patient-specific orthopaedic + implants: application to total knee arthroplasty + + +
+ Arthroplasty is commonly performed to treat joint osteoarthritis, reducing +pain and improving mobility. While arthroplasty has seen several technical +improvements, a significant share of patients are still unsatisfied with their +surgery. Personalised arthroplasty improves surgical outcomes; however, current +solutions involve delays, making them difficult to integrate into clinical routine. +We propose a fully automated workflow to design patient-specific implants, +presented for total knee arthroplasty, currently the most widely performed +arthroplasty in the world. + The proposed pipeline first uses artificial neural networks to segment the +proximal and distal extremities of the femur and tibia. Then the full bones are +reconstructed using augmented statistical shape models, combining shape and +landmark information. Finally, 77 morphological parameters are computed to +design patient-specific implants. The developed workflow has been trained using +91 CT scans of the lower limb and evaluated on 41 manually segmented CT scans, in +terms of accuracy and execution time. + The workflow accuracy was $0.4\pm0.2mm$ for the segmentation, $1.2\pm0.4mm$ +for the full bone reconstruction, and $2.8\pm2.2mm$ for the anatomical +landmark determination. The custom implants fitted the patients' anatomy with +$0.6\pm0.2mm$ accuracy. The whole process from segmentation to implant design +lasted about 5 minutes. + The proposed workflow allows for a fast and reliable personalisation of knee +implants, directly from the patient CT image without requiring any manual +intervention. It establishes patient-specific pre-operative planning for TKA +in a very short time, making it easily available for all patients. Combined with +efficient implant manufacturing techniques, this solution could help meet the +growing demand for arthroplasties while reducing complications and improving +patients' satisfaction. + 
+
+
+
+
+ + ♻ ☆ denoiSplit: a method for joint image splitting and unsupervised + denoising + + +
+ In this work, we present denoiSplit, a method to tackle a new analysis task, +i.e. the challenge of joint semantic image splitting and unsupervised +denoising. This dual approach is particularly relevant in fluorescence +microscopy, where semantic image splitting is widely needed but noise +generally hinders the downstream analysis of image content. Image splitting +involves dissecting an image into its distinguishable semantic structures. We +show that the current state-of-the-art method for this task struggles in the +presence of image noise, inadvertently also distributing the noise across the +predicted outputs. The method we present here can deal with image noise by +integrating an unsupervised denoising sub-task. This integration results in +improved semantic image unmixing, even in the presence of notable and realistic +levels of imaging noise. A key innovation in denoiSplit is the use of +specifically formulated noise models and the suitable adjustment of the +KL-divergence loss for the high-dimensional hierarchical latent space we are +training. We showcase the performance of denoiSplit across 4 tasks on +real-world microscopy images. Additionally, we perform qualitative and +quantitative evaluations and compare results to existing benchmarks, +demonstrating the effectiveness of using denoiSplit: a single Variational +Splitting Encoder-Decoder (VSE) Network using two suitable noise models to +jointly perform semantic splitting and denoising. + 
+
+
+
+
+ + ♻ ☆ Unraveling Instance Associations: A Closer Look for Audio-Visual + Segmentation + + +
+ Audio-visual segmentation (AVS) is a challenging task that involves +accurately segmenting sounding objects based on audio-visual cues. The +effectiveness of audio-visual learning critically depends on achieving accurate +cross-modal alignment between sound and visual objects. Successful audio-visual +learning requires two essential components: 1) a challenging dataset with +high-quality pixel-level multi-class annotated images associated with audio +files, and 2) a model that can establish strong links between audio information +and its corresponding visual object. However, these requirements are only +partially addressed by current methods, with training sets containing biased +audio-visual data, and models that generalise poorly beyond this biased +training set. In this work, we propose a new cost-effective strategy to build +challenging and relatively unbiased high-quality audio-visual segmentation +benchmarks. We also propose a new informative sample mining method for +audio-visual supervised contrastive learning to leverage discriminative +contrastive samples to enforce cross-modal understanding. We show empirical +results that demonstrate the effectiveness of our benchmark. Furthermore, +experiments conducted on existing AVS datasets and on our new benchmark show +that our method achieves state-of-the-art (SOTA) segmentation accuracy. + +
+
+ comment: Code is available at https://github.com/cyh-0/CAVP +
+
+
+
+
+ + ♻ ☆ FocusCLIP: Multimodal Subject-Level Guidance for Zero-Shot Transfer in + Human-Centric Tasks + + +
+ We propose FocusCLIP, integrating subject-level guidance--a specialized +mechanism for target-specific supervision--into the CLIP framework for improved +zero-shot transfer on human-centric tasks. Our novel contributions enhance CLIP +on both the vision and text sides. On the vision side, we incorporate ROI +heatmaps emulating human visual attention mechanisms to emphasize +subject-relevant image regions. On the text side, we introduce human pose +descriptions to provide rich contextual information. For human-centric tasks, +FocusCLIP is trained with images from the MPII Human Pose dataset. The proposed +approach surpassed CLIP by an average of 8.61% across five previously unseen +datasets covering three human-centric tasks. FocusCLIP achieved an average +accuracy of 33.65% compared to 25.04% by CLIP. We observed a 3.98% improvement +in activity recognition, a 14.78% improvement in age classification, and a +7.06% improvement in emotion recognition. Moreover, using our proposed +single-shot LLM prompting strategy, we release a high-quality MPII Pose +Descriptions dataset to encourage further research in multimodal learning for +human-centric tasks. Furthermore, we also demonstrate the effectiveness of our +subject-level supervision on non-human-centric tasks. FocusCLIP shows a 2.47% +improvement over CLIP in zero-shot bird classification using the CUB dataset. +Our findings emphasize the potential of integrating subject-level guidance with +general pretraining methods for enhanced downstream performance. + +
+
+
+
+
+ + ♻ ☆ Unleashing the Power of Self-Supervised Image Denoising: A Comprehensive + Review + + +
+ The advent of deep learning has brought a revolutionary transformation to +image denoising techniques. However, the persistent challenge of acquiring +noise-clean pairs for supervised methods in real-world scenarios remains +formidable, necessitating the exploration of more practical self-supervised +image denoising. This paper focuses on self-supervised image denoising methods +that offer effective solutions to address this challenge. Our comprehensive +review thoroughly analyzes the latest advancements in self-supervised image +denoising approaches, categorizing them into three distinct classes: General +methods, Blind Spot Network (BSN)-based methods, and Transformer-based methods. +For each class, we provide a concise theoretical analysis along with their +practical applications. To assess the effectiveness of these methods, we +present both quantitative and qualitative experimental results on various +datasets, utilizing classical algorithms as benchmarks. Additionally, we +critically discuss the current limitations of these methods and propose +promising directions for future research. By offering a detailed overview of +recent developments in self-supervised image denoising, this review serves as +an invaluable resource for researchers and practitioners in the field, +facilitating a deeper understanding of this emerging domain and inspiring +further advancements. + +
+
+ comment: 24 pages +
+
+
+
+
+ + ♻ ☆ BiTT: Bi-directional Texture Reconstruction of Interacting Two Hands + from a Single Image CVPR 2024 + + +
+ Creating personalized hand avatars is important to offer a realistic +experience to users on AR / VR platforms. While most prior studies focused on +reconstructing 3D hand shapes, some recent work has tackled the reconstruction +of hand textures on top of shapes. However, these methods are often limited to +capturing pixels on the visible side of a hand, requiring diverse views of the +hand in a video or multiple images as input. In this paper, we propose a novel +method, BiTT (Bi-directional Texture reconstruction of Two hands), which is the +first end-to-end trainable method for relightable, pose-free texture +reconstruction of two interacting hands from only a single RGB image, built on +three novel components: 1) bi-directional (left $\leftrightarrow$ right) +texture reconstruction using the texture symmetry of left / right hands, 2) +utilizing a texture parametric model for hand texture recovery, and 3) the +overall coarse-to-fine stage pipeline for reconstructing personalized texture +of two interacting hands. BiTT first estimates the scene light condition and +albedo image from an input image, then reconstructs the texture of both hands +through the texture parametric model and bi-directional texture reconstructor. +In experiments using the InterHand2.6M and RGB2Hands datasets, our method +significantly outperforms state-of-the-art hand texture reconstruction methods +quantitatively and qualitatively. The code is available at +https://github.com/yunminjin2/BiTT + 
+
+ comment: Accepted by CVPR 2024, Project Page: + https://yunminjin2.github.io/projects/bitt/ +
+
+
+
+
+ + ♻ ☆ Toulouse Hyperspectral Data Set: a benchmark data set to assess + semi-supervised spectral representation learning and pixel-wise + classification techniques + + +
+ Airborne hyperspectral images can be used to map the land cover in large +urban areas, thanks to their very high spatial and spectral resolutions on a +wide spectral domain. While the spectral dimension of hyperspectral images is +highly informative of the chemical composition of the land surface, the use of +state-of-the-art machine learning algorithms to map the land cover has been +dramatically limited by the availability of training data. To cope with the +scarcity of annotations, semi-supervised and self-supervised techniques have +lately raised a lot of interest in the community. Yet, the publicly available +hyperspectral data sets commonly used to benchmark machine learning models are +not totally suited to evaluate their generalization performances due to one or +several of the following properties: a limited geographical coverage (which +does not reflect the spectral diversity in metropolitan areas), a small number +of land cover classes and a lack of appropriate standard train / test splits +for semi-supervised and self-supervised learning. Therefore, we release in this +paper the Toulouse Hyperspectral Data Set that stands out from other data sets +in the above-mentioned respects in order to meet key issues in spectral +representation learning and classification over large-scale hyperspectral +images with very few labeled pixels. Besides, we discuss and experiment +self-supervised techniques for spectral representation learning, including the +Masked Autoencoder, and establish a baseline for pixel-wise classification +achieving 85% overall accuracy and 77% F1 score. The Toulouse Hyperspectral +Data Set and our code are publicly available at +https://www.toulouse-hyperspectral-data-set.com and +https://www.github.com/Romain3Ch216/tlse-experiments, respectively. + +
+
+ comment: 17 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Geometric Prior Based Deep Human Point Cloud Geometry Compression + + +
+ The emergence of digital avatars has driven an exponential increase in the +demand for human point clouds with realistic and intricate details. The +compression of such data becomes challenging with overwhelming data amounts +comprising millions of points. Herein, we leverage the human geometric prior in +geometry redundancy removal of point clouds, greatly promoting the compression +performance. More specifically, the prior provides topological constraints as +geometry initialization, allowing adaptive adjustments with a compact parameter +set that could be represented with only a few bits. Therefore, we can envisage +high-resolution human point clouds as a combination of geometric priors and +structural deviations. The priors could first be derived with an aligned point +cloud, and subsequently the difference of features is compressed into a compact +latent code. The proposed framework can operate in a plug-and-play fashion with +existing learning-based point cloud compression methods. Extensive experimental +results show that our approach significantly improves the compression +performance without deteriorating the quality, demonstrating its promise in a +variety of applications. + 
+
+ comment: Accepted by TCSVT 2024 +
+
+
+
+
+ + ♻ ☆ Explaining CLIP's performance disparities on data from blind/low vision + users CVPR + + +
+ Large multi-modal models (LMMs) hold the potential to usher in a new era of +automated visual assistance for people who are blind or low vision (BLV). Yet, +these models have not been systematically evaluated on data captured by BLV +users. We address this by empirically assessing CLIP, a widely-used LMM likely +to underpin many assistive technologies. Testing 25 CLIP variants in a +zero-shot classification task, we find that their accuracy is 15 percentage +points lower on average for images captured by BLV users than web-crawled +images. This disparity stems from CLIP's sensitivities to 1) image content +(e.g. not recognizing disability objects as well as other objects); 2) image +quality (e.g. not being robust to lighting variation); and 3) text content +(e.g. not recognizing objects described by tactile adjectives as well as visual +ones). We delve deeper with a textual analysis of three common pre-training +datasets: LAION-400M, LAION-2B and DataComp-1B, showing that disability content +is rarely mentioned. We then provide three examples that illustrate how the +performance disparities extend to three downstream models underpinned by CLIP: +OWL-ViT, CLIPSeg and DALL-E2. We find that few-shot learning with as few as 5 +images can mitigate CLIP's quality-of-service disparities for BLV users in some +scenarios, which we discuss alongside a set of other possible mitigations. + +
+
+ comment: Accepted at 2024 IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR) +
+
+
+
+
+ + ♻ ☆ Distributionally Generative Augmentation for Fair Facial Attribute + Classification CVPR 2024 + + +
+ Facial Attribute Classification (FAC) holds substantial promise in widespread +applications. However, FAC models trained by traditional methodologies can be +unfair by exhibiting accuracy inconsistencies across varied data +subpopulations. This unfairness is largely attributed to bias in data, where +some spurious attributes (e.g., Male) statistically correlate with the target +attribute (e.g., Smiling). Most existing fairness-aware methods rely on the +labels of spurious attributes, which may be unavailable in practice. This work +proposes a novel, generation-based two-stage framework to train a fair FAC +model on biased data without additional annotation. Initially, we identify the +potential spurious attributes based on generative models. Notably, this step enhances +interpretability by explicitly showing the spurious attributes in image space. +Following this, for each image, we first edit the spurious attributes with a +random degree sampled from a uniform distribution, while keeping the target +attribute unchanged. Then we train a fair FAC model by fostering model +invariance to these augmentations. Extensive experiments on three common +datasets demonstrate the effectiveness of our method in promoting fairness in +FAC without compromising accuracy. Code is available at +https://github.com/heqianpei/DiGA. + 
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Contrastive Pre-Training with Multi-View Fusion for No-Reference Point + Cloud Quality Assessment + + +
+ No-reference point cloud quality assessment (NR-PCQA) aims to automatically +evaluate the perceptual quality of distorted point clouds without an available +reference, a task that has seen tremendous improvements due to the utilization +of deep neural networks. However, learning-based NR-PCQA methods suffer from +the scarcity of labeled data and usually perform suboptimally in terms of +generalization. To address this problem, we propose a novel contrastive +pre-training framework tailored for PCQA (CoPA), which enables the pre-trained +model to learn quality-aware representations from unlabeled data. To obtain +anchors in the representation space, we project point clouds with different +distortions into images and randomly mix their local patches to form mixed +images with multiple distortions. Utilizing the generated anchors, we constrain +the pre-training process via a quality-aware contrastive loss following the +philosophy that perceptual quality is closely related to both content and +distortion. Furthermore, in the model fine-tuning stage, we propose a +semantic-guided multi-view fusion module to effectively integrate the features +of projected images from multiple perspectives. Extensive experiments show that +our method outperforms the state-of-the-art PCQA methods on popular benchmarks. +Further investigations demonstrate that CoPA can also benefit existing +learning-based PCQA models. + 
+
+
+
+
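+ A minimal sketch of the patch-mixing step described above: given two projections of
+ the same content under different distortions, local patches are swapped at random to
+ build an anchor carrying multiple distortions. Projection from point clouds is assumed
+ to have happened already; the images here are random stand-ins.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+H = W = 224
+patch = 32
+img_a = rng.random((H, W, 3))      # projection rendered with distortion A
+img_b = rng.random((H, W, 3))      # projection rendered with distortion B
+
+mixed = img_a.copy()
+for y in range(0, H, patch):
+    for x in range(0, W, patch):
+        if rng.random() < 0.5:     # take this patch from the other distortion
+            mixed[y:y + patch, x:x + patch] = img_b[y:y + patch, x:x + patch]
+# `mixed` then serves as an anchor for a quality-aware contrastive loss.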
+ + ♻ ☆ Differentiable Point-based Inverse Rendering + + +
+ We present differentiable point-based inverse rendering, DPIR, an +analysis-by-synthesis method that processes images captured under diverse +illuminations to estimate shape and spatially-varying BRDF. To this end, we +adopt point-based rendering, eliminating the need for multiple samplings per +ray, typical of volumetric rendering, thus significantly enhancing the speed of +inverse rendering. To realize this idea, we devise a hybrid point-volumetric +representation for geometry and a regularized basis-BRDF representation for +reflectance. The hybrid geometric representation enables fast rendering through +point-based splatting while retaining the geometric details and stability +inherent to SDF-based representations. The regularized basis-BRDF mitigates the +ill-posedness of inverse rendering stemming from limited light-view angular +samples. We also propose an efficient shadow detection method using point-based +shadow map rendering. Our extensive evaluations demonstrate that DPIR +outperforms prior works in terms of reconstruction accuracy, computational +efficiency, and memory footprint. Furthermore, our explicit point-based +representation and rendering enables intuitive geometry and reflectance +editing. + +
+
+
+
+
+ + ♻ ☆ HallusionBench: An Advanced Diagnostic Suite for Entangled Language + Hallucination and Visual Illusion in Large Vision-Language Models CVPR 2024 + + +
+ We introduce HallusionBench, a comprehensive benchmark designed for the +evaluation of image-context reasoning. This benchmark presents significant +challenges to advanced large visual-language models (LVLMs), such as +GPT-4V(Vision), Gemini Pro Vision, Claude 3, and LLaVA-1.5, by emphasizing +nuanced understanding and interpretation of visual data. The benchmark +comprises 346 images paired with 1129 questions, all meticulously crafted by +human experts. We introduce a novel structure for these visual questions +designed to establish control groups. This structure enables us to conduct a +quantitative analysis of the models' response tendencies, logical consistency, +and various failure modes. In our evaluation on HallusionBench, we benchmarked +15 different models, highlighting a 31.42% question-pair accuracy achieved by +the state-of-the-art GPT-4V. Notably, all other evaluated models achieve +accuracy below 16%. Moreover, our analysis not only highlights the observed +failure modes, including language hallucination and visual illusion, but also +deepens an understanding of these pitfalls. Our comprehensive case studies +within HallusionBench shed light on the challenges of hallucination and +illusion in LVLMs. Based on these insights, we suggest potential pathways for +their future improvement. The benchmark and codebase can be accessed at +https://github.com/tianyi-lab/HallusionBench. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
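+ A minimal sketch of a question-pair (group) accuracy as described above: a group counts
+ as correct only if every question in it is answered correctly. The record format and
+ grouping rule are assumptions; the official benchmark code defines the exact protocol.
+
+from collections import defaultdict
+
+def question_pair_accuracy(records):
+    """Fraction of question groups answered entirely correctly."""
+    groups = defaultdict(list)
+    for r in records:
+        groups[r["pair_id"]].append(r["correct"])
+    return sum(all(v) for v in groups.values()) / max(len(groups), 1)
+
+print(question_pair_accuracy([
+    {"pair_id": 0, "correct": True}, {"pair_id": 0, "correct": True},
+    {"pair_id": 1, "correct": True}, {"pair_id": 1, "correct": False},
+]))  # 0.5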
+ + ♻ ☆ Time-Efficient and Identity-Consistent Virtual Try-On Using A Variant of + Altered Diffusion Models + + +
+ This study discusses the critical issues of Virtual Try-On in contemporary +e-commerce and the prospective metaverse, emphasizing the challenges of +preserving intricate texture details and distinctive features of the target +person and the clothes in various scenarios, such as clothing texture and +identity characteristics like tattoos or accessories. In addition to the +fidelity of the synthesized images, the efficiency of the synthesis process +presents a significant hurdle. Various existing approaches are explored, +highlighting the limitations and unresolved aspects, e.g., identity information +omission, uncontrollable artifacts, and low synthesis speed. We then propose a +novel diffusion-based solution that addresses garment texture preservation and +user identity retention during virtual try-on. The proposed network comprises +two primary modules: a warping module that aligns clothing with individual +features, and a try-on module that refines the attire and generates missing parts, +integrated with a mask-aware post-processing technique that ensures the integrity +of the individual's identity. It demonstrates impressive results, surpassing +the state of the art in speed by nearly 20 times during inference, with +superior fidelity in qualitative assessments. Quantitative evaluations confirm +performance comparable to the recent SOTA method on the VITON-HD and +Dresscode datasets. + 
+
+
+
+
+ + ♻ ☆ Mipha: A Comprehensive Overhaul of Multimodal Assistant with Small + Language Models + + +
+ Multimodal Large Language Models (MLLMs) have showcased impressive skills in +tasks related to visual understanding and reasoning. Yet, their widespread +application faces obstacles due to the high computational demands during both +the training and inference phases, restricting their use to a limited audience +within the research and user communities. In this paper, we investigate the +design aspects of Multimodal Small Language Models (MSLMs) and propose an +efficient multimodal assistant named Mipha, which is designed to create synergy +among various aspects: visual representation, language models, and optimization +strategies. We show that without increasing the volume of training data, our +Mipha-3B outperforms the state-of-the-art large MLLMs, especially +LLaVA-1.5-13B, on multiple benchmarks. Through detailed discussion, we provide +insights and guidelines for developing strong MSLMs that rival the capabilities +of MLLMs. Our code is available at https://github.com/zhuyiche/llava-phi. + +
+
+
+
+
+ + ♻ ☆ Dispersed Structured Light for Hyperspectral 3D Imaging + + +
+ Hyperspectral 3D imaging aims to acquire both depth and spectral information +of a scene. However, existing methods are either prohibitively expensive and +bulky or compromise on spectral and depth accuracy. In this work, we present +Dispersed Structured Light (DSL), a cost-effective and compact method for +accurate hyperspectral 3D imaging. DSL modifies a traditional projector-camera +system by placing a sub-millimeter thick diffraction grating film in front of the +projector. The grating disperses structured light based on light wavelength. To +utilize the dispersed structured light, we devise a model for dispersive +projection image formation and a per-pixel hyperspectral 3D reconstruction +method. We validate DSL by instantiating a compact experimental prototype. DSL +achieves spectral accuracy of 18.8nm full-width half-maximum (FWHM) and depth +error of 1mm. We demonstrate that DSL outperforms prior work on practical +hyperspectral 3D imaging. DSL promises accurate and practical hyperspectral 3D +imaging for diverse application domains, including computer vision and +graphics, cultural heritage, geology, and biology. + 
+
+
+
+
+ + ♻ ☆ PIA: Your Personalized Image Animator via Plug-and-Play Modules in + Text-to-Image Models + + +
+ Recent advancements in personalized text-to-image (T2I) models have +revolutionized content creation, empowering non-experts to generate stunning +images with unique styles. While promising, adding realistic motions into these +personalized images by text poses significant challenges in preserving distinct +styles, high-fidelity details, and achieving motion controllability by text. In +this paper, we present PIA, a Personalized Image Animator that excels in +aligning with condition images, achieving motion controllability by text, and +compatibility with various personalized T2I models without specific tuning. +To achieve these goals, PIA builds upon a base T2I model with well-trained +temporal alignment layers, allowing for the seamless transformation of any +personalized T2I model into an image animation model. A key component of PIA is +the introduction of the condition module, which utilizes the condition frame +and inter-frame affinity as input to transfer appearance information guided by +the affinity hint for individual frame synthesis in the latent space. This +design mitigates the challenges of appearance-related image alignment and +allows for a stronger focus on aligning with motion-related guidance. + 
+
+ comment: Project page: https://pi-animator.github.io/ +
+
+
+
+
+ + ♻ ☆ I-PHYRE: Interactive Physical Reasoning ICLR 2024 + + +
+ Current evaluation protocols predominantly assess physical reasoning in +stationary scenes, creating a gap in evaluating agents' abilities to interact +with dynamic events. While contemporary methods allow agents to modify initial +scene configurations and observe consequences, they lack the capability to +interact with events in real time. To address this, we introduce I-PHYRE, a +framework that challenges agents to simultaneously exhibit intuitive physical +reasoning, multi-step planning, and in-situ intervention. Here, intuitive +physical reasoning refers to a quick, approximate understanding of physics to +address complex problems; multi-step denotes the need for extensive sequence +planning in I-PHYRE, considering each intervention can significantly alter +subsequent choices; and in-situ implies the necessity for timely object +manipulation within a scene, where minor timing deviations can result in task +failure. We formulate four game splits to scrutinize agents' learning and +generalization of essential principles of interactive physical reasoning, +fostering learning through interaction with representative scenarios. Our +exploration involves three planning strategies and examines several supervised +and reinforcement agents' zero-shot generalization proficiency on I-PHYRE. The +outcomes highlight a notable gap between existing learning algorithms and human +performance, emphasizing the imperative for more research in enhancing agents +with interactive physical reasoning capabilities. The environment and baselines +will be made publicly available. + +
+
+ comment: 21 pages, ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Solving the bongard-logo problem by modeling a probabilistic model + + +
+ Abstract reasoning problems challenge the perceptual and cognitive abilities +of AI algorithms, demanding deeper pattern discernment and inductive reasoning +beyond explicit image features. This study introduces PMoC, a tailored +probability model for the Bongard-Logo problem, achieving high reasoning +accuracy by constructing independent probability models. Additionally, we +present Pose-Transformer, an enhanced Transformer-Encoder designed for complex +abstract reasoning tasks, including Bongard-Logo, RAVEN, I-RAVEN, and PGM. +Pose-Transformer incorporates positional information learning, inspired by +capsule networks' pose matrices, enhancing its focus on local positional +relationships in image data processing. When integrated with PMoC, it further +improves reasoning accuracy. Our approach effectively addresses reasoning +difficulties associated with abstract entities' positional changes, +outperforming previous models on the OIG, D3$\times$3 subsets of RAVEN, and PGM +databases. This research contributes to advancing AI's capabilities in abstract +reasoning and cognitive pattern recognition. + +
+
+ comment: 14 pages, 11 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Triple-CFN: Restructuring Conceptual Spaces for Enhancing Abstract + Reasoning process + + +
+ Abstract reasoning problems pose significant challenges to artificial +intelligence algorithms, demanding cognitive capabilities beyond those required +for perception tasks. This study introduces the Triple-CFN approach to tackle +the Bongard-Logo problem, achieving notable reasoning accuracy by implicitly +reorganizing the concept space of conflicting instances. Additionally, the +Triple-CFN paradigm proves effective for the RPM problem with necessary +modifications, yielding competitive results. To further enhance performance on +the RPM issue, we develop the Meta Triple-CFN network, which explicitly +structures the problem space while maintaining interpretability on progressive +patterns. The success of Meta Triple-CFN is attributed to its paradigm of +modeling the conceptual space, equivalent to normalizing reasoning information. +Based on this ideology, we introduce the Re-space layer, enhancing the +performance of both Meta Triple-CFN and Triple-CFN. This paper aims to +contribute to advancements in machine intelligence by exploring innovative +network designs for addressing abstract reasoning problems, paving the way for +further breakthroughs in this domain. + +
+
+ comment: 14 pages, 14 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ D4C glove-train: solving the RPM and Bongard-logo problem by + distributing and Circumscribing concepts + + +
+ This paper achieves noteworthy progress in the realm of abstract reasoning, +particularly in addressing Raven's Progressive Matrices (RPM) and Bongard-Logo +challenges. Initially, we introduce Lico-Net, a novel baseline model that +resolves RPM problems with remarkable accuracy. Leveraging this foundation, we +advance with the D3C approach, which advocates representing the underlying +concepts in abstract reasoning problems through distributions. This perspective +enhances the performance of both Lico-Net and a baseline model excelling in +Bongard-Logo tasks. To bolster the computational efficiency of D3C, we present +the D3C-cos variant, offering a streamlined yet precise solution. Furthermore, +we propose the D2C method, redefining conceptual boundaries within these +domains and bridging the divide between high-level abstractions and their +lower-dimensional counterparts. Finally, we extend our methodology to D4C, +employing adversarial techniques to refine conceptual boundaries further and +demonstrate substantial improvements in both RPM and Bongard-Logo challenges. +Overall, our contributions present a fresh outlook and practical advancements +in the field of abstract reasoning. + +
+
+ comment: 18 pages, 19 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ CiPR: An Efficient Framework with Cross-instance Positive Relations for + Generalized Category Discovery + + +
+ We tackle the issue of generalized category discovery (GCD). GCD considers +the open-world problem of automatically clustering a partially labelled +dataset, in which the unlabelled data may contain instances from both novel +categories and labelled classes. In this paper, we address the GCD problem with +an unknown category number for the unlabelled data. We propose a framework, +named CiPR, to bootstrap the representation by exploiting Cross-instance +Positive Relations in the partially labelled data for contrastive learning, +which have been neglected in existing methods. To obtain reliable +cross-instance relations to facilitate representation learning, we introduce a +semi-supervised hierarchical clustering algorithm, named selective neighbor +clustering (SNC), which can produce a clustering hierarchy directly from the +connected components of a graph constructed from selective neighbors. We +further present a method to estimate the unknown class number using SNC with a +joint reference score that considers clustering indexes of both labelled and +unlabelled data, and extend SNC to allow label assignment for the unlabelled +instances with a given class number. We thoroughly evaluate our framework on +public generic image recognition datasets and challenging fine-grained +datasets, and establish a new state-of-the-art. Code: +https://github.com/haoosz/CiPR + +
+
+ comment: Accepted to TMLR. Code: https://github.com/haoosz/CiPR +
+
+
+
+
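+ A minimal sketch of clustering via connected components of a selective-neighbor graph;
+ the neighbor-selection rule below (mutual top-1 neighbors) is a simple placeholder and
+ does not reproduce the full SNC algorithm or its clustering hierarchy.
+
+import numpy as np
+from scipy.sparse import coo_matrix
+from scipy.sparse.csgraph import connected_components
+
+rng = np.random.default_rng(0)
+feats = rng.normal(size=(50, 16))
+feats /= np.linalg.norm(feats, axis=1, keepdims=True)
+sim = feats @ feats.T
+np.fill_diagonal(sim, -np.inf)
+nn = sim.argmax(axis=1)                                   # each sample's top neighbor
+
+rows, cols = [], []
+for i, j in enumerate(nn):
+    if nn[j] == i:                                        # keep only mutual neighbors
+        rows.append(i)
+        cols.append(j)
+adj = coo_matrix((np.ones(len(rows)), (rows, cols)), shape=(50, 50))
+n_clusters, labels = connected_components(adj, directed=False)
+print(n_clusters, labels[:10])                            # components act as clusters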
+ + ♻ ☆ HalluciDoctor: Mitigating Hallucinatory Toxicity in Visual Instruction + Data CVPR 2024 + + +
+ Multi-modal Large Language Models (MLLMs) tuned on machine-generated +instruction-following data have demonstrated remarkable performance in various +multi-modal understanding and generation tasks. However, the hallucinations +inherent in machine-generated data, which could lead to hallucinatory outputs +in MLLMs, remain under-explored. This work aims to investigate various +hallucinations (i.e., object, relation, attribute hallucinations) and mitigate +those hallucinatory toxicities in large-scale machine-generated visual +instruction datasets. Drawing on the human ability to identify factual errors, +we present a novel hallucination detection and elimination framework, +HalluciDoctor, based on the cross-checking paradigm. We use our framework to +identify and eliminate hallucinations in the training data automatically. +Interestingly, HalluciDoctor also indicates that spurious correlations arising +from long-tail object co-occurrences contribute to hallucinations. Based on +that, we execute counterfactual visual instruction expansion to balance the data +distribution, thereby enhancing MLLMs' resistance to hallucinations. +Comprehensive experiments on hallucination evaluation benchmarks show that our +method successfully reduces hallucinations by 44.6% in relative terms while maintaining +competitive performance compared to LLaVA. The data and code for this paper are +publicly available. \url{https://github.com/Yuqifan1117/HalluciDoctor}. + 
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ W-HMR: Human Mesh Recovery in World Space with Weak-supervised Camera + Calibration and Orientation Correction + + +
+ For a long time, in reconstructing 3D human bodies from monocular images, +most methods opted to simplify the task by minimizing the influence of the +camera. Using a coarse focal length setting results in the reconstructed bodies +not aligning well with distorted images. Ignoring camera rotation leads to an +unrealistic reconstructed body pose in world space. Consequently, the +application scenarios of existing methods are confined to controlled +environments. When confronted with complex and diverse in-the-wild images, they +struggle to achieve accurate and reasonable reconstruction in world space. To +address the above issues, we propose W-HMR, which decouples global body +recovery into camera calibration, local body recovery, and global body +orientation correction. We design the first weakly supervised camera calibration +method for body distortion, eliminating dependence on focal length labels and +achieving finer mesh-image alignment. We propose a novel orientation correction +module to allow the reconstructed human body to remain normal in world space. +Decoupling body orientation and body pose enables our model to consider +accuracy in camera coordinates and plausibility in world coordinates +simultaneously, expanding the range of applications. As a result, W-HMR +achieves high-quality reconstruction in dual coordinate systems, particularly +in challenging scenes. Code and demos have been released on the project page +https://yw0208.github.io/w-hmr/. + 
+
+ comment: Project Page: https://yw0208.github.io/w-hmr/ +
+
+
+
+
+ + ♻ ☆ When Semantic Segmentation Meets Frequency Aliasing ICLR 2024 + + +
+ Despite recent advancements in semantic segmentation, where and which pixels +are hard to segment remains largely unexplored. Existing research only +separates an image into easy and hard regions and empirically observes that the +latter are associated with object boundaries. In this paper, we conduct a +comprehensive analysis of hard pixel errors, categorizing them into three +types: false responses, merging mistakes, and displacements. Our findings +reveal a quantitative association between hard pixels and aliasing, which is +distortion caused by the overlapping of frequency components in the Fourier +domain during downsampling. To identify the frequencies responsible for +aliasing, we propose using the equivalent sampling rate to calculate the +Nyquist frequency, which marks the threshold for aliasing. Then, we introduce +the aliasing score as a metric to quantify the extent of aliasing. While all +positively correlated with the proposed aliasing score, the three types of hard +pixels exhibit different patterns. Here, we propose two novel modules, a de-aliasing +filter (DAF) and a frequency mixing (FreqMix) module, to alleviate aliasing +degradation by accurately removing or adjusting frequencies higher than the +Nyquist frequency. The DAF precisely removes the frequencies responsible for +aliasing before downsampling, while the FreqMix dynamically selects +high-frequency components within the encoder block. Experimental results +demonstrate consistent improvements in semantic segmentation and low-light +instance segmentation tasks. The code is available at: +https://github.com/Linwei-Chen/Seg-Aliasing. + 
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
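+ A minimal sketch of low-pass filtering before downsampling to suppress aliasing, in the
+ spirit of the DAF idea above; the paper removes frequencies above a Nyquist threshold
+ derived from the equivalent sampling rate, whereas this sketch uses a fixed Gaussian
+ blur as an illustrative anti-aliasing filter.
+
+import torch
+import torch.nn.functional as F
+
+def gaussian_kernel(size=5, sigma=1.0):
+    # Separable Gaussian built as an outer product, normalized to sum to one.
+    x = torch.arange(size) - size // 2
+    g = torch.exp(-x.float() ** 2 / (2 * sigma ** 2))
+    k = torch.outer(g, g)
+    return (k / k.sum()).view(1, 1, size, size)
+
+def downsample(x, stride=2, antialias=True):
+    if antialias:
+        k = gaussian_kernel().repeat(x.shape[1], 1, 1, 1)
+        x = F.conv2d(x, k, padding=2, groups=x.shape[1])  # per-channel low-pass
+    return x[..., ::stride, ::stride]                     # strided subsampling
+
+feat = torch.randn(1, 8, 64, 64)
+naive, filtered = downsample(feat, antialias=False), downsample(feat)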
+ + ♻ ☆ Cell Variational Information Bottleneck Network + + +
+ In this work, we propose the Cell Variational Information Bottleneck Network +(cellVIB), a convolutional neural network using an information bottleneck +mechanism, which can be combined with the latest feedforward network +architectures in an end-to-end training method. Our Cell Variational Information +Bottleneck Network is constructed by stacking VIB cells, which generate feature +maps with uncertainty. As the layers go deeper, the regularization effect +gradually increases, instead of directly adding excessive regularization constraints to +the output layer of the model as in Deep VIB. In each VIB cell, the +feedforward process learns an independent mean term and a standard deviation +term, and predicts the Gaussian distribution based on them. The feedback +process is based on the reparameterization trick for effective training. This work +performs an extensive analysis on the MNIST dataset to verify the effectiveness of +each VIB cell, and provides an insightful analysis on how the VIB cells affect +mutual information. Experiments conducted on CIFAR-10 also show that our +cellVIB is robust against noisy labels during training and against corrupted +images during testing. Then, we validate our method on the PACS dataset, whose +results show that the VIB cells can significantly improve the generalization +performance of the basic model. Finally, in a more complex representation +learning task, face recognition, our network structure has also achieved very +competitive results. + 
+
+
+
+
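+ A minimal sketch of a VIB-style cell as described above: a pair of convolutions predicts
+ per-location mean and log-variance, a feature map is drawn with the reparameterization
+ trick, and a KL term to a standard normal acts as the bottleneck regularizer. Layer sizes
+ and the KL weighting are illustrative, not the paper's architecture.
+
+import torch
+import torch.nn as nn
+
+class VIBCell(nn.Module):
+    def __init__(self, in_ch, out_ch):
+        super().__init__()
+        self.mu = nn.Conv2d(in_ch, out_ch, 3, padding=1)
+        self.logvar = nn.Conv2d(in_ch, out_ch, 3, padding=1)
+
+    def forward(self, x):
+        mu, logvar = self.mu(x), self.logvar(x)
+        z = mu + (0.5 * logvar).exp() * torch.randn_like(mu)   # reparameterization
+        # KL(N(mu, sigma^2) || N(0, 1)) averaged over elements.
+        kl = 0.5 * (mu ** 2 + logvar.exp() - logvar - 1).mean()
+        return z, kl
+
+cell = VIBCell(16, 32)
+feat, kl = cell(torch.randn(2, 16, 28, 28))
+loss = kl  # a task loss computed from `feat`, plus a KL weight, would be added here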
+ + ♻ ☆ Don't Judge by the Look: Towards Motion Coherent Video Representation ICLR2024 + + +
+ Current training pipelines in object recognition neglect Hue Jittering when +doing data augmentation as it not only brings appearance changes that are +detrimental to classification, but also the implementation is inefficient in +practice. In this study, we investigate the effect of hue variance in the +context of video understanding and find this variance to be beneficial since +static appearances are less important in videos that contain motion +information. Based on this observation, we propose a data augmentation method +for video understanding, named Motion Coherent Augmentation (MCA), that +introduces appearance variation in videos and implicitly encourages the model +to prioritize motion patterns, rather than static appearances. Concretely, we +propose an operation SwapMix to efficiently modify the appearance of video +samples, and introduce Variation Alignment (VA) to resolve the distribution +shift caused by SwapMix, enforcing the model to learn appearance invariant +representations. Comprehensive empirical evaluation across various +architectures and different datasets solidly validates the effectiveness and +generalization ability of MCA, and the application of VA in other augmentation +methods. Code is available at https://github.com/BeSpontaneous/MCA-pytorch. + +
+
+ comment: Accepted by ICLR2024 +
+
+
+
+
+ + ♻ ☆ Cartoon Hallucinations Detection: Pose-aware In Context Visual Learning + + +
+ Large-scale Text-to-Image (TTI) models have become a common approach for +generating training data in various generative fields. However, visual +hallucinations, which contain perceptually critical defects, remain a concern, +especially in non-photorealistic styles like cartoon characters. We propose a +novel visual hallucination detection system for cartoon character images +generated by TTI models. Our approach leverages pose-aware in-context visual +learning (PA-ICVL) with Vision-Language Models (VLMs), utilizing both RGB +images and pose information. By incorporating pose guidance from a fine-tuned +pose estimator, we enable VLMs to make more accurate decisions. Experimental +results demonstrate significant improvements in identifying visual +hallucinations compared to baseline methods relying solely on RGB images. This +research advances TTI models by mitigating visual hallucinations, expanding +their potential in non-photorealistic domains. + +
+
+ comment: 11 pages, 12 figures, 1 table, Project page: + https://gh-bumsookim.github.io/Cartoon-Hallucinations-Detection/ +
+
+
+
+
+ + ♻ ☆ MMA-Diffusion: MultiModal Attack on Diffusion Models CVPR 2024 + + +
+ In recent years, Text-to-Image (T2I) models have seen remarkable +advancements, gaining widespread adoption. However, this progress has +inadvertently opened avenues for potential misuse, particularly in generating +inappropriate or Not-Safe-For-Work (NSFW) content. Our work introduces +MMA-Diffusion, a framework that presents a significant and realistic threat to +the security of T2I models by effectively circumventing current defensive +measures in both open-source models and commercial online services. Unlike +previous approaches, MMA-Diffusion leverages both textual and visual modalities +to bypass safeguards like prompt filters and post-hoc safety checkers, thus +exposing and highlighting the vulnerabilities in existing defense mechanisms. + +
+
+ comment: CVPR 2024. Code is available at + https://github.com/yangyijune/MMA-Diffusion +
+
+
+
+
+ + ♻ ☆ Noisy-Correspondence Learning for Text-to-Image Person Re-identification + + +
+ Text-to-image person re-identification (TIReID) is a compelling topic in the +cross-modal community, which aims to retrieve the target person based on a +textual query. Although numerous TIReID methods have been proposed and achieved +promising performance, they implicitly assume the training image-text pairs are +correctly aligned, which is not always the case in real-world scenarios. In +practice, image-text pairs are inevitably under-correlated or even +falsely correlated, a.k.a. noisy correspondence (NC), due to the low quality of +the images and annotation errors. To address this problem, we propose a novel +Robust Dual Embedding method (RDE) that can learn robust visual-semantic +associations even with NC. Specifically, RDE consists of two main components: +1) A Confident Consensus Division (CCD) module that leverages the dual-grained +decisions of dual embedding modules to obtain a consensus set of clean training +data, which enables the model to learn correct and reliable visual-semantic +associations. 2) A Triplet Alignment Loss (TAL) that relaxes the conventional +Triplet Ranking loss with the hardest negative samples to a log-exponential +upper bound over all negative ones, thus preventing model collapse under NC +while still focusing on hard-negative samples for promising performance. We +conduct extensive experiments on three public benchmarks, namely CUHK-PEDES, +ICFG-PEDES, and RSTPReID, to evaluate the performance and robustness of our +RDE. Our method achieves state-of-the-art results both with and without +synthetic noisy correspondences on all three datasets. Code is available at +https://github.com/QinYang79/RDE. + 
+
+
+
+
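+ A minimal sketch of relaxing a hardest-negative triplet ranking loss with a log-sum-exp
+ over all negatives (logsumexp upper-bounds the max, so the relaxed loss upper-bounds the
+ hardest-negative one). The margin, temperature, and exact form used by RDE are assumptions.
+
+import torch
+import torch.nn.functional as F
+
+def hardest_triplet(sim, margin=0.2):
+    pos = sim.diag()                                        # matched image-text pairs
+    neg = sim - torch.eye(len(sim)) * 1e9                   # mask out positives
+    return F.relu(margin + neg.max(dim=1).values - pos).mean()
+
+def log_exp_triplet(sim, margin=0.2, tau=0.05):
+    pos = sim.diag()
+    neg = sim - torch.eye(len(sim)) * 1e9
+    soft_max_neg = tau * torch.logsumexp(neg / tau, dim=1)  # smooth upper bound of the max
+    # Smooth (softplus-based) upper bound of the hinge, spreading gradient over negatives.
+    return F.softplus((margin + soft_max_neg - pos) / tau).mul(tau).mean()
+
+sim = torch.randn(8, 8) * 0.1 + torch.eye(8)                # toy similarity matrix
+print(hardest_triplet(sim).item(), log_exp_triplet(sim).item())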
+ + ♻ ☆ CRS-Diff: Controllable Generative Remote Sensing Foundation Model + + +
+ The emergence of diffusion models has revolutionized the field of image +generation, providing new methods for creating high-quality, high-resolution +images across various applications. However, the potential of these models for +generating domain-specific images, particularly remote sensing (RS) images, +remains largely untapped. RS images, which are notable for their high resolution, +extensive coverage, and rich information content, bring new challenges that +general diffusion models may not adequately address. This paper proposes +CRS-Diff, a pioneering diffusion modeling framework specifically tailored for +generating remote sensing imagery, leveraging the inherent advantages of +diffusion models while integrating advanced control mechanisms to ensure that +the imagery is not only visually clear but also enriched with geographic and +temporal information. The model integrates global and local control inputs, +enabling precise combinations of generation conditions to refine the generation +process. A comprehensive evaluation of CRS-Diff has demonstrated its superior +capability to generate RS imagery under both single and multiple +conditions, compared with previous methods, in terms of image quality and +diversity. + 
+
+
+
+
+ + ♻ ☆ Telling Left from Right: Identifying Geometry-Aware Semantic + Correspondence CVPR 24 + + +
+ While pre-trained large-scale vision models have shown significant promise +for semantic correspondence, their features often struggle to grasp the +geometry and orientation of instances. This paper identifies the importance of +being geometry-aware for semantic correspondence and reveals a limitation of +the features of current foundation models under simple post-processing. We show +that incorporating this information can markedly enhance semantic +correspondence performance with simple but effective solutions in both +zero-shot and supervised settings. We also construct a new challenging +benchmark for semantic correspondence built from an existing animal pose +estimation dataset, for both pre-training and validating models. Our method +achieves a PCK@0.10 score of 65.4 (zero-shot) and 85.6 (supervised) on the +challenging SPair-71k dataset, outperforming the state of the art by absolute +gains of 5.5 and 11.0 points, respectively. Our code and datasets are publicly +available at: https://telling-left-from-right.github.io/. + 
+
+ comment: Accepted by CVPR 24, project page: + https://telling-left-from-right.github.io/ +
+
+
+
+
+ + ♻ ☆ VURF: A General-purpose Reasoning and Self-refinement Framework for + Video Understanding + + +
+ Recent studies have demonstrated the effectiveness of Large Language Models +(LLMs) as reasoning modules that can deconstruct complex tasks into more +manageable sub-tasks, particularly when applied to visual reasoning tasks for +images. In contrast, this paper introduces a Video Understanding and Reasoning +Framework (VURF) based on the reasoning power of LLMs. Ours is a novel approach +to extend the utility of LLMs in the context of video tasks, leveraging their +capacity to generalize from minimal input and output demonstrations within a +contextual framework. By presenting LLMs with pairs of instructions and their +corresponding high-level programs, we harness their contextual learning +capabilities to generate executable visual programs for video understanding. To +enhance the programs' accuracy and robustness, we implement two important +strategies. Firstly, we employ a feedback-generation approach, powered by +GPT-3.5, to rectify errors in programs utilizing unsupported functions. +Secondly, taking motivation from recent works on self-refinement of LLM +outputs, we introduce an iterative procedure for improving the quality of the +in-context examples by aligning the initial outputs to the outputs that would +have been generated had the LLM not been bound by the structure of the +in-context examples. Our results on several video-specific tasks, including +visual QA, video anticipation, pose estimation and multi-video QA illustrate +the efficacy of these enhancements in improving the performance of visual +programming approaches for video tasks. + 
+
+
+
+
+ + ♻ ☆ URS-NeRF: Unordered Rolling Shutter Bundle Adjustment for Neural + Radiance Fields + + +
+ We propose a novel rolling shutter bundle adjustment method for neural +radiance fields (NeRF), which utilizes unordered rolling shutter (RS) +images to obtain the implicit 3D representation. Existing NeRF methods suffer +from low-quality images and inaccurate initial camera poses due to the RS +effect in the image, whereas the previous method that incorporates the RS into +NeRF requires strict sequential data input, limiting its widespread +applicability. In contrast, our method recovers the physical formation of RS +images by estimating camera poses and velocities, thereby removing the input +constraints on sequential data. Moreover, we adopt a coarse-to-fine training +strategy, in which the RS epipolar constraints of the pairwise frames in the +scene graph are used to detect the camera poses that fall into local minima. +The poses detected as outliers are corrected by the interpolation method with +neighboring poses. The experimental results validate the effectiveness of our +method over state-of-the-art works and demonstrate that the reconstruction of +3D representations is not constrained by the requirement of video sequence +input. + 
+
+
+
+
+ + ♻ ☆ Improving White-box Robustness of Pre-processing Defenses via Joint + Adversarial Training + + +
+ Deep neural networks (DNNs) are vulnerable to adversarial noise. A range of
+adversarial defense techniques have been proposed to mitigate the interference
+of adversarial noise, among which the input pre-processing methods are scalable
+and show great potential to safeguard DNNs. However, pre-processing methods may
+suffer from the robustness degradation effect, in which the defense reduces
+rather than improves the adversarial robustness of a target model in a
+white-box setting. A potential cause of this negative effect is that
+adversarial training examples are static and independent of the pre-processing
+model. To solve this problem, we investigate the influence of full adversarial
+examples which are crafted against the full model, and find they indeed have a
+positive impact on the robustness of defenses. Furthermore, we find that simply
+changing the adversarial training examples in pre-processing methods does not
+completely alleviate the robustness degradation effect. This is due to the
+adversarial risk of the pre-processed model being neglected, which is another
+cause of the robustness degradation effect. Motivated by the above analyses, we
+propose a method called Joint Adversarial Training based Pre-processing (JATP)
+defense. Specifically, we formulate a feature similarity based adversarial risk
+for the pre-processing model by using full adversarial examples found in a
+feature space. Unlike standard adversarial training, we only update the
+pre-processing model, which prompts us to introduce a pixel-wise loss to
+improve its cross-model transferability. We then conduct a joint adversarial
+training on the pre-processing model to minimize this overall risk. Empirical
+results show that our method could effectively mitigate the robustness
+degradation effect across different target models in comparison to previous
+state-of-the-art approaches.
+
+
+
+
+
+
+ + ♻ ☆ Masked Vector Quantization + + +
+ Generative models with discrete latent representations have recently
+demonstrated an impressive ability to learn complex high-dimensional data
+distributions. However, their performance relies on a long sequence of tokens
+per instance and a large number of codebook entries, resulting in long sampling
+times and considerable computation to fit the categorical posterior. To address
+these issues, we propose the Masked Vector Quantization (MVQ) framework which
+increases the representational capacity of each code vector by learning mask
+configurations via a stochastic winner-takes-all training regime called
+Multiple Hypothesis Dropout (MH-Dropout). On ImageNet 64$\times$64, MVQ reduces
+FID in existing vector quantization architectures by up to $68\%$ at 2 tokens
+per instance and $57\%$ at 5 tokens. These improvements widen as the number of
+codebook entries is reduced and allow for a $7\textit{--}45\times$ speed-up in
+token sampling during inference. As an additional benefit, we find that smaller
+latent spaces lead to MVQ identifying transferable visual representations, of
+which multiple can be smoothly combined.
+
+
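As a rough, simplified reading of the masked-quantization idea above (not the paper's MH-Dropout training scheme, which learns the masks stochastically), the sketch below quantizes each encoder vector against every (code, mask) pair and keeps the closest one; all shapes and names are illustrative assumptions.

```python
import torch

def masked_vq(z, codebook, masks):
    """Quantize each row of z by the closest (code vector, mask) combination.

    z:        (N, D) encoder outputs
    codebook: (K, D) code vectors
    masks:    (M, D) binary mask configurations (assumed given here; the
              paper learns them via a winner-takes-all dropout scheme)
    """
    # Every masked version of every code: (K, M, D)
    masked_codes = codebook[:, None, :] * masks[None, :, :]
    # Squared distance from each z to each masked code: (N, K, M)
    dist = ((z[:, None, None, :] - masked_codes[None]) ** 2).sum(-1)
    flat_idx = dist.flatten(1).argmin(dim=1)                       # (N,)
    m = masks.shape[0]
    code_idx = torch.div(flat_idx, m, rounding_mode="floor")
    mask_idx = flat_idx % m
    z_q = masked_codes[code_idx, mask_idx]                         # (N, D)
    return z_q, code_idx, mask_idx

if __name__ == "__main__":
    torch.manual_seed(0)
    z = torch.randn(8, 16)
    codebook = torch.randn(32, 16)
    masks = (torch.rand(4, 16) > 0.5).float()
    z_q, ci, mi = masked_vq(z, codebook, masks)
    print(z_q.shape, ci.shape, mi.shape)
```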
+
+ comment: A newer version of this manuscript was archived under 2312.11735 +
+
+
+
+
+ + ♻ ☆ LLaVA-PruMerge: Adaptive Token Reduction for Efficient Large Multimodal + Models + + +
+ Large Multimodal Models (LMMs) have shown significant reasoning capabilities +by connecting a visual encoder and a large language model. LMMs typically use a +fixed amount of visual tokens, such as the penultimate layer features in the +CLIP visual encoder, as the prefix content. Recent LMMs incorporate more +complex visual inputs, such as high-resolution images and videos, which +increase the number of visual tokens significantly. However, due to the design +of the Transformer architecture, computational costs associated with these +models tend to increase quadratically with the number of input tokens. To +tackle this problem, we explore a token reduction mechanism and find, similar +to prior work, that many visual tokens are spatially redundant. Based on this, +we propose PruMerge, a novel adaptive visual token reduction approach, which +largely reduces the number of visual tokens while maintaining comparable model +performance. We first select the unpruned visual tokens based on their +similarity to class tokens and spatial tokens. We then cluster the pruned +tokens based on key similarity and merge the clustered tokens with the unpruned +tokens to supplement their information. Empirically, when applied to LLaVA-1.5, +our approach can compress the visual tokens by 18 times on average, and achieve +comparable performance across diverse visual question-answering and reasoning +tasks. Code and checkpoints are at https://llava-prumerge.github.io/. + +
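The following toy PyTorch sketch illustrates the general prune-then-merge recipe described above (score tokens by similarity to the class token, keep the top ones, fold the rest into their nearest kept token). It is an interpretation for illustration only, not the released PruMerge code; the keep ratio and the cosine-similarity scoring are assumptions.

```python
import torch
import torch.nn.functional as F

def prune_and_merge(tokens, cls_token, keep_ratio=0.25):
    """tokens: (N, D) visual tokens, cls_token: (D,). Returns (k, D) tokens."""
    scores = F.cosine_similarity(tokens, cls_token[None, :], dim=-1)   # (N,)
    k = max(1, int(keep_ratio * tokens.shape[0]))
    keep_idx = scores.topk(k).indices
    kept = tokens[keep_idx]                                            # (k, D)

    mask = torch.ones(tokens.shape[0], dtype=torch.bool)
    mask[keep_idx] = False
    pruned = tokens[mask]                                              # (N-k, D)
    if pruned.numel() == 0:
        return kept

    # Assign each pruned token to its most similar kept token, then average.
    sim = F.cosine_similarity(pruned[:, None, :], kept[None, :, :], dim=-1)
    assign = sim.argmax(dim=1)
    merged = kept.clone()
    for j in range(k):
        members = pruned[assign == j]
        if members.numel() > 0:
            merged[j] = torch.cat([kept[j:j + 1], members], dim=0).mean(dim=0)
    return merged

if __name__ == "__main__":
    out = prune_and_merge(torch.randn(576, 1024), torch.randn(1024))
    print(out.shape)   # roughly a 4x token reduction in this toy setting
```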
+
+ comment: Project page: https://llava-prumerge.github.io/ +
+
+
+
+
+ + ♻ ☆ Point-DETR3D: Leveraging Imagery Data with Spatial Point Prior for + Weakly Semi-supervised 3D Object Detection AAAI2024 + + +
+ Training high-accuracy 3D detectors necessitates massive labeled 3D
+annotations with 7 degrees of freedom, which is laborious and time-consuming.
+Therefore, the form of point annotations is proposed to offer significant
+prospects for practical applications in 3D detection, which is not only more
+accessible and less expensive but also provides strong spatial information for
+object localization. In this paper, we empirically discover that it is
+non-trivial to merely adapt Point-DETR to its 3D form, encountering two main
+bottlenecks: 1) it fails to encode strong 3D prior into the model, and 2) it
+generates low-quality pseudo labels in distant regions due to the extreme
+sparsity of LiDAR points. To overcome these challenges, we introduce
+Point-DETR3D, a teacher-student framework for weakly semi-supervised 3D
+detection, designed to fully capitalize on point-wise supervision within a
+constrained instance-wise annotation budget. Different from Point-DETR which
+encodes 3D positional information solely through a point encoder, we propose an
+explicit positional query initialization strategy to enhance the positional
+prior. Considering the low quality of pseudo labels at distant regions produced
+by the teacher model, we enhance the detector's perception by incorporating
+dense imagery data through a novel Cross-Modal Deformable RoI Fusion
+(D-RoI). Moreover, an innovative point-guided self-supervised learning technique
+is proposed to allow for fully exploiting point priors, even in student
+models. Extensive experiments on the representative nuScenes dataset demonstrate
+that Point-DETR3D obtains significant improvements compared to previous works.
+Notably, with only 5% of labeled data, Point-DETR3D achieves over 90%
+performance of its fully supervised counterpart.
+
+
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ Cell Tracking according to Biological Needs -- Strong Mitosis-aware + Random-finite Sets Tracker with Aleatoric Uncertainty + + +
+ Cell tracking and segmentation assist biologists in extracting insights from +large-scale microscopy time-lapse data. Driven by local accuracy metrics, +current tracking approaches often suffer from a lack of long-term consistency. +To address this issue, we introduce an uncertainty estimation technique for +neural tracking-by-regression frameworks and incorporate it into our novel +extended Poisson multi-Bernoulli mixture tracker. Our uncertainty estimation +identifies uncertain associations within high-performing tracking-by-regression +methods using problem-specific test-time augmentations. Leveraging this +uncertainty, along with a novel mitosis-aware assignment problem formulation, +our tracker resolves false associations and mitosis detections stemming from +long-term conflicts. We evaluate our approach on nine competitive datasets and +demonstrate that it outperforms the current state-of-the-art on biologically +relevant metrics substantially, achieving improvements by a factor of +approximately $5.75$. Furthermore, we uncover new insights into the behavior of +tracking-by-regression uncertainty. + +
+
+ comment: 23 pages, 10 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ MEDPSeg: Hierarchical polymorphic multitask learning for the + segmentation of ground-glass opacities, consolidation, and pulmonary + structures on computed tomography + + +
+ The COVID-19 pandemic response highlighted the potential of deep learning
+methods in facilitating the diagnosis, prognosis and understanding of lung
+diseases through automated segmentation of pulmonary structures and lesions in
+chest computed tomography (CT). Automated separation of lung lesions into
+ground-glass opacity (GGO) and consolidation is hindered due to the
+labor-intensive and subjective nature of this task, resulting in scarce
+availability of ground truth for supervised learning. To tackle this problem,
+we propose MEDPSeg. MEDPSeg learns from heterogeneous chest CT targets through
+hierarchical polymorphic multitask learning (HPML). HPML explores the
+hierarchical nature of GGO and consolidation, lung lesions, and the lungs, with
+further benefits achieved through multitasking airway and pulmonary artery
+segmentation. Over 6000 volumetric CT scans from different partially labeled
+sources were used for training and testing. Experiments show HPML enabling new
+state-of-the-art performance for GGO and consolidation segmentation tasks. In
+addition, MEDPSeg simultaneously performs segmentation of the lung parenchyma,
+airways, pulmonary artery, and lung lesions, all in a single forward
+prediction, with performance comparable to state-of-the-art methods specialized
+in each of those targets. Finally, we provide an open-source implementation
+with a graphical user interface at https://github.com/MICLab-Unicamp/medpseg.
+
+
+
+ comment: This manuscript is under review and might change in the future +
+
+
+
+
+ + ♻ ☆ SCHEME: Scalable Channel Mixer for Vision Transformers + + +
+ Vision Transformers have received significant attention due to their
+impressive performance in many vision tasks. While the token mixer or attention
+block has been studied in great detail, the channel mixer or feature mixing
+block (FFN or MLP) has not been explored in depth, even though it accounts for
+the bulk of the parameters and computation in a model. In this work, we study
+whether sparse feature mixing can replace the dense connections and confirm this
+with a block diagonal MLP structure that improves the accuracy by supporting
+larger expansion ratios. To improve the feature clusters formed by this
+structure and thereby further improve the accuracy, a lightweight,
+parameter-free, channel covariance attention (CCA) mechanism is introduced as a
+parallel branch during training. This design of CCA enables gradual feature
+mixing across channel groups during training whose contribution decays to zero
+as the training progresses to convergence. This allows the CCA block to be
+discarded during inference, thus enabling enhanced performance with no
+additional computational cost. The resulting $\textit{Scalable CHannEl MixEr}$
+(SCHEME) can be plugged into any ViT architecture to obtain a gamut of models
+with different trade-offs between complexity and performance by controlling the
+block diagonal structure size in the MLP. This is shown by the introduction of a
+new family of SCHEMEformer models that establishes new Pareto frontiers for
+accuracy vs FLOPS, accuracy vs model size, and accuracy vs throughput,
+especially for fast transformers of small model size. For example, the
+SCHEMEformer establishes a new SOTA of 79.7% accuracy for ViTs using pure
+attention mixers on ImageNet-1K at 1.77G FLOPs.
+
+
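A block-diagonal channel mixer of the kind described above can be sketched with grouped 1x1 convolutions, which realize a block-diagonal weight matrix. This is a minimal illustration of the general idea (without the CCA training branch), not the official SCHEME implementation; the expansion ratio and group count are assumed values.

```python
import torch
import torch.nn as nn

class BlockDiagonalFFN(nn.Module):
    """Sparse channel mixer: channels are split into `groups` independent
    blocks, so the expansion ratio can grow without a dense weight matrix."""
    def __init__(self, dim, expansion=8, groups=4):
        super().__init__()
        hidden = dim * expansion
        # Grouped 1x1 convolutions implement a block-diagonal linear layer.
        self.fc1 = nn.Conv1d(dim, hidden, kernel_size=1, groups=groups)
        self.act = nn.GELU()
        self.fc2 = nn.Conv1d(hidden, dim, kernel_size=1, groups=groups)

    def forward(self, x):                 # x: (B, N, D) token sequence
        x = x.transpose(1, 2)             # (B, D, N) for Conv1d
        x = self.fc2(self.act(self.fc1(x)))
        return x.transpose(1, 2)

if __name__ == "__main__":
    ffn = BlockDiagonalFFN(dim=64)
    print(ffn(torch.randn(2, 197, 64)).shape)   # torch.Size([2, 197, 64])
```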
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Context-Aware Meta-Learning ICLR 2024 + + +
+ Large Language Models like ChatGPT demonstrate a remarkable capacity to learn +new concepts during inference without any fine-tuning. However, visual models +trained to detect new objects during inference have been unable to replicate +this ability, and instead either perform poorly or require meta-training and/or +fine-tuning on similar objects. In this work, we propose a meta-learning +algorithm that emulates Large Language Models by learning new visual concepts +during inference without fine-tuning. Our approach leverages a frozen +pre-trained feature extractor, and analogous to in-context learning, recasts +visual meta-learning as sequence modeling over datapoints with known labels and +a test datapoint with an unknown label. On 8 out of 11 meta-learning +benchmarks, our approach -- without meta-training or fine-tuning -- exceeds or +matches the state-of-the-art algorithm, P>M>F, which is meta-trained on these +benchmarks. Our code is available at https://github.com/cfifty/CAML. + +
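The recipe above (frozen features plus label embeddings, treated as a sequence with the query at an "unknown label" position) can be illustrated with a small sequence model. This is a hedged sketch of the general idea, not the CAML codebase; dimensions, the unknown-label token, and the readout head are assumptions.

```python
import torch
import torch.nn as nn

class InContextClassifier(nn.Module):
    """Few-shot classification as sequence modeling over (feature, label) pairs."""
    def __init__(self, feat_dim=512, n_classes=5, d_model=256):
        super().__init__()
        self.proj = nn.Linear(feat_dim, d_model)
        self.label_emb = nn.Embedding(n_classes + 1, d_model)  # last id = unknown
        layer = nn.TransformerEncoderLayer(d_model, nhead=4, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=2)
        self.head = nn.Linear(d_model, n_classes)
        self.unknown = n_classes

    def forward(self, support_feats, support_labels, query_feat):
        # support_feats: (B, S, F), support_labels: (B, S), query_feat: (B, F)
        q_label = torch.full(query_feat.shape[:1], self.unknown,
                             dtype=torch.long, device=query_feat.device)
        feats = torch.cat([support_feats, query_feat[:, None, :]], dim=1)
        labels = torch.cat([support_labels, q_label[:, None]], dim=1)
        seq = self.proj(feats) + self.label_emb(labels)
        out = self.encoder(seq)
        return self.head(out[:, -1])       # logits for the query position

if __name__ == "__main__":
    model = InContextClassifier()
    logits = model(torch.randn(2, 10, 512), torch.randint(0, 5, (2, 10)),
                   torch.randn(2, 512))
    print(logits.shape)                    # torch.Size([2, 5])
```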
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ MetaSegNet: Metadata-collaborative Vision-Language Representation + Learning for Semantic Segmentation of Remote Sensing Images + + +
+ Semantic segmentation of remote sensing images plays a vital role in a wide
+range of Earth Observation (EO) applications, such as land use land cover
+mapping, environment monitoring, and sustainable development. Driven by rapid
+developments in Artificial Intelligence (AI), deep learning (DL) has emerged as
+the mainstream tool for semantic segmentation and has achieved many
+breakthroughs in the field of remote sensing. However, the existing DL-based
+methods mainly focus on unimodal visual data while ignoring the rich multimodal
+information involved in the real world, usually demonstrating weak reliability
+and generalization. Inspired by the success of Vision Transformers and large
+language models, we propose a novel metadata-collaborative multimodal
+segmentation network (MetaSegNet) that applies vision-language representation
+learning for semantic segmentation of remote sensing images. Unlike the common
+model structure that only uses unimodal visual data, we extract the key
+characteristic (e.g., the climate zone) from freely available remote sensing
+image metadata and transfer it into knowledge-based text prompts via the
+generic ChatGPT. Then, we construct an image encoder, a text encoder and a
+cross-modal attention fusion subnetwork to extract the image and text features
+and apply image-text interaction. Benefiting from such a design, the proposed
+MetaSegNet demonstrates superior generalization and achieves competitive
+accuracy with the state-of-the-art semantic segmentation methods on the
+large-scale OpenEarthMap dataset (68.6% mIoU) and Potsdam dataset (93.3% mean
+F1 score) as well as LoveDA dataset (52.2% mIoU).
+
+
+
+
+
+
+ + ♻ ☆ Fix-Con: Automatic Fault Localization and Repair of Deep Learning Model + Conversions between Frameworks + + +
+ Converting deep learning models between frameworks is a common step to
+maximize model compatibility across devices and leverage optimization features
+that may be exclusively provided in one deep learning framework. However, this
+conversion process may be riddled with bugs, making the converted models either
+undeployable or problematic, considerably degrading their prediction
+correctness.
+ In this paper we propose an automated approach for fault localization and
+repair, Fix-Con, during model conversion between deep learning frameworks.
+Fix-Con is capable of detecting and fixing faults introduced in model input,
+parameters, hyperparameters, and the model graph during conversion.
+ Fix-Con uses a set of fault types (mined from surveying conversion issues
+reported in code repositories and forums) to localize potential
+conversion faults in the converted target model and then repair them
+appropriately, e.g., replacing the parameters of the target model with those
+from the source model. This is done iteratively for every image in the dataset,
+comparing output label differences between the source model and the converted
+target model until all differences are resolved. We evaluate the effectiveness
+of Fix-Con in fixing model conversion bugs of three widely used image
+recognition models converted across four different deep learning frameworks.
+Overall, Fix-Con was able to fix $462$ out of $755$ detected conversion faults,
+either completely repairing or significantly improving the performance of $14$
+out of the $15$ erroneous conversion cases.
+
+
+
+ comment: 12 pages, 4 figures, 3 tables, 1 algorithm +
+
+
+
+
+ + ♻ ☆ DISN: Deep Implicit Surface Network for High-quality Single-view 3D + Reconstruction + + +
+ Reconstructing 3D shapes from single-view images has been a long-standing
+research problem. In this paper, we present DISN, a Deep Implicit Surface
+Network which can generate a high-quality detail-rich 3D mesh from a 2D image
+by predicting the underlying signed distance fields. In addition to utilizing
+global image features, DISN predicts the projected location for each 3D point
+on the 2D image, and extracts local features from the image feature maps.
+Combining global and local features significantly improves the accuracy of the
+signed distance field prediction, especially for the detail-rich areas. To the
+best of our knowledge, DISN is the first method that consistently captures
+details such as holes and thin structures present in 3D shapes from single-view
+images. DISN achieves the state-of-the-art single-view reconstruction
+performance on a variety of shape categories reconstructed from both synthetic
+and real images. Code is available at https://github.com/xharlie/DISN. The
+supplementary can be found at
+https://xharlie.github.io/images/neurips_2019_supp.pdf
+
+
+
+ comment: This project was in part supported by the gift funding to the + University of Southern California from Adobe Research +
+
+
+
+
+ + ♻ ☆ Fault Localization for Buggy Deep Learning Framework Conversions in + Image Recognition + + +
+ When deploying Deep Neural Networks (DNNs), developers often convert models +from one deep learning framework to another (e.g., TensorFlow to PyTorch). +However, this process is error-prone and can impact target model accuracy. To +identify the extent of such impact, we perform and briefly present a +differential analysis against three DNNs widely used for image recognition +(MobileNetV2, ResNet101, and InceptionV3) converted across four well-known deep +learning frameworks (PyTorch, Keras, TensorFlow (TF), and TFLite), which +revealed numerous model crashes and output label discrepancies of up to 100%. +To mitigate such errors, we present a novel approach towards fault localization +and repair of buggy deep learning framework conversions, focusing on +pre-trained image recognition models. Our technique consists of four stages of +analysis: 1) conversion tools, 2) model parameters, 3) model hyperparameters, +and 4) graph representation. In addition, we propose various strategies towards +fault repair of the faults detected. We implement our technique on top of the +Apache TVM deep learning compiler, and we test it by conducting a preliminary +fault localization analysis for the conversion of InceptionV3 from TF to +TFLite. Our approach detected a fault in a common DNN converter tool, which +introduced precision errors in weights, reducing model accuracy. After our +fault localization, we repaired the issue, reducing our conversion error to +zero. + +
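The model-parameter stage of the analysis above can be pictured with a toy fault-localization pass that diffs the weights of a source model against its converted counterpart and flags layers that drifted. This is an illustrative sketch under assumed name-matching and threshold conventions, not the paper's tooling or the Apache TVM integration.

```python
import numpy as np

def localize_weight_drift(source_params, target_params, atol=1e-4):
    """Compare {layer_name: np.ndarray} dicts from a source model and its
    converted counterpart; report layers whose weights drifted or vanished."""
    suspects = []
    for name, w_src in source_params.items():
        w_tgt = target_params.get(name)
        if w_tgt is None or w_tgt.shape != w_src.shape:
            suspects.append((name, "missing or reshaped"))
            continue
        max_err = np.abs(w_src - w_tgt.astype(w_src.dtype)).max()
        if max_err > atol:
            suspects.append((name, f"max abs diff {max_err:.2e}"))
    return suspects

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    src = {"conv1.weight": rng.standard_normal((8, 3, 3, 3)).astype(np.float32)}
    # Simulate a converter that silently truncates precision.
    tgt = {"conv1.weight": src["conv1.weight"].astype(np.float16)}
    print(localize_weight_drift(src, tgt))
```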
+
+ comment: 5 pages, 3 figures, 1 table +
+
+
+
+
+ + ♻ ☆ DeltaNN: Assessing the Impact of Computational Environment Parameters on + the Performance of Image Recognition Models + + +
+ Image recognition tasks typically use deep learning and require enormous +processing power, thus relying on hardware accelerators like GPUs and TPUs for +fast, timely processing. Failure in real-time image recognition tasks can occur +due to sub-optimal mapping on hardware accelerators during model deployment, +which may lead to timing uncertainty and erroneous behavior. Mapping on +hardware accelerators is done using multiple software components like deep +learning frameworks, compilers, and device libraries, that we refer to as the +computational environment. Owing to the increased use of image recognition +tasks in safety-critical applications like autonomous driving and medical +imaging, it is imperative to assess their robustness to changes in the +computational environment, as the impact of parameters like deep learning +frameworks, compiler optimizations, and hardware devices on model performance +and correctness is not yet well understood. + In this paper we present a differential testing framework, DeltaNN, that +allows us to assess the impact of different computational environment +parameters on the performance of image recognition models during deployment, +post training. DeltaNN generates different implementations of a given image +recognition model for variations in environment parameters, namely, deep +learning frameworks, compiler optimizations and hardware devices and analyzes +differences in model performance as a result. Using DeltaNN, we conduct an +empirical study of robustness analysis of three popular image recognition +models using the ImageNet dataset. We report the impact in terms of +misclassifications and inference time differences across different settings. In +total, we observed up to 100% output label differences across deep learning +frameworks, and up to 81% unexpected performance degradation in terms of +inference time, when applying compiler optimizations. + +
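As a minimal illustration of the differential-testing idea above, the harness below runs the same inputs through two deployments of one model and reports label mismatches and latency. It is a generic sketch, not DeltaNN's interface; the two variants are stand-ins for builds under different frameworks, compiler flags, or devices.

```python
import time
import numpy as np

def differential_test(variant_a, variant_b, images):
    """variant_a/variant_b: callables mapping an input to logits."""
    mismatches, times = 0, {"a": 0.0, "b": 0.0}
    for x in images:
        t0 = time.perf_counter(); ya = variant_a(x); times["a"] += time.perf_counter() - t0
        t0 = time.perf_counter(); yb = variant_b(x); times["b"] += time.perf_counter() - t0
        if int(np.argmax(ya)) != int(np.argmax(yb)):   # top-1 label disagreement
            mismatches += 1
    return {"label_mismatch_rate": mismatches / len(images), "latency_s": times}

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    w = rng.standard_normal((1000, 10))
    images = [rng.standard_normal(1000) for _ in range(50)]
    exact = lambda x: x @ w
    lossy = lambda x: x.astype(np.float16) @ w.astype(np.float16)  # "optimized" build
    print(differential_test(exact, lossy, images))
```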
+
+ comment: 11 pages, 10 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Few-Shot Class Incremental Learning with Attention-Aware Self-Adaptive + Prompt + + +
+ Few-Shot Class-Incremental Learning (FSCIL) models aim to incrementally learn
+new classes with scarce samples while preserving knowledge of old ones.
+Existing FSCIL methods usually fine-tune the entire backbone, leading to
+overfitting and hindering the potential to learn new classes. On the other
+hand, recent prompt-based CIL approaches alleviate forgetting by training
+prompts with sufficient data in each task. In this work, we propose a novel
+framework named Attention-aware Self-adaptive Prompt (ASP). ASP encourages
+task-invariant prompts to capture shared knowledge by reducing specific
+information from the attention aspect. Additionally, self-adaptive
+task-specific prompts in ASP provide specific information and transfer
+knowledge from old classes to new classes with an Information Bottleneck
+learning objective. In summary, ASP prevents overfitting on the base task and
+does not require enormous data in few-shot incremental tasks. Extensive
+experiments on three benchmark datasets validate that ASP consistently
+outperforms state-of-the-art FSCIL and prompt-based CIL methods in terms of
+both learning new classes and mitigating forgetting.
+
+
+
+
+
+
+ + ♻ ☆ Pre-Trained Masked Image Model for Mobile Robot Navigation ICRA 2024 + + +
+ 2D top-down maps are commonly used for the navigation and exploration of +mobile robots through unknown areas. Typically, the robot builds the navigation +maps incrementally from local observations using onboard sensors. Recent works +have shown that predicting the structural patterns in the environment through +learning-based approaches can greatly enhance task efficiency. While many such +works build task-specific networks using limited datasets, we show that the +existing foundational vision networks can accomplish the same without any +fine-tuning. Specifically, we use Masked Autoencoders, pre-trained on street +images, to present novel applications for field-of-view expansion, single-agent +topological exploration, and multi-agent exploration for indoor mapping, across +different input modalities. Our work motivates the use of foundational vision +models for generalized structure prediction-driven applications, especially in +the dearth of training data. For more qualitative results see +https://raaslab.org/projects/MIM4Robots. + +
+
+ comment: Accepted at ICRA 2024 +
+
+
+
+
+ + ♻ ☆ LEOD: Label-Efficient Object Detection for Event Cameras CVPR 2024 + + +
+ Object detection with event cameras benefits from the sensor's low latency +and high dynamic range. However, it is costly to fully label event streams for +supervised training due to their high temporal resolution. To reduce this cost, +we present LEOD, the first method for label-efficient event-based detection. +Our approach unifies weakly- and semi-supervised object detection with a +self-training mechanism. We first utilize a detector pre-trained on limited +labels to produce pseudo ground truth on unlabeled events. Then, the detector +is re-trained with both real and generated labels. Leveraging the temporal +consistency of events, we run bi-directional inference and apply tracking-based +post-processing to enhance the quality of pseudo labels. To stabilize training +against label noise, we further design a soft anchor assignment strategy. We +introduce new experimental protocols to evaluate the task of label-efficient +event-based detection on Gen1 and 1Mpx datasets. LEOD consistently outperforms +supervised baselines across various labeling ratios. For example, on Gen1, it +improves mAP by 8.6% and 7.8% for RVT-S trained with 1% and 2% labels. On 1Mpx, +RVT-S with 10% labels even surpasses its fully-supervised counterpart using +100% labels. LEOD maintains its effectiveness even when all labeled data are +available, reaching new state-of-the-art results. Finally, we show that our +method readily scales to improve larger detectors as well. Code is released at +https://github.com/Wuziyi616/LEOD + +
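The pseudo-label filtering step described above can be pictured with a small helper that keeps forward-pass detections only when they are confident and confirmed by the time-reversed (backward) pass. This is a rough stand-in for the bi-directional consistency idea, not LEOD's released pipeline; the thresholds and detection format are assumptions.

```python
def box_iou(a, b):
    """IoU between two [x1, y1, x2, y2] boxes."""
    x1, y1 = max(a[0], b[0]), max(a[1], b[1])
    x2, y2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
    area = lambda t: (t[2] - t[0]) * (t[3] - t[1])
    return inter / (area(a) + area(b) - inter + 1e-9)

def filter_pseudo_labels(fwd, bwd, score_thr=0.6, iou_thr=0.5):
    """fwd/bwd: lists of (box, score) detections from forward and backward
    inference over the event stream; keep confident, mutually confirmed boxes."""
    kept = []
    for box, score in fwd:
        if score < score_thr:
            continue
        if any(box_iou(box, b) >= iou_thr for b, _ in bwd):
            kept.append((box, score))
    return kept

if __name__ == "__main__":
    fwd = [([10, 10, 50, 50], 0.9), ([60, 60, 80, 80], 0.4)]
    bwd = [([12, 11, 49, 52], 0.8)]
    print(filter_pseudo_labels(fwd, bwd))   # keeps only the confirmed box
```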
+
+ comment: CVPR 2024. Code: https://github.com/Wuziyi616/LEOD +
+
+
+
+
+ + ♻ ☆ ContextSeg: Sketch Semantic Segmentation by Querying the Context with + Attention + + +
+ Sketch semantic segmentation is a well-explored and pivotal problem in +computer vision involving the assignment of pre-defined part labels to +individual strokes. This paper presents ContextSeg - a simple yet highly +effective approach to tackling this problem with two stages. In the first +stage, to better encode the shape and positional information of strokes, we +propose to predict an extra dense distance field in an autoencoder network to +reinforce structural information learning. In the second stage, we treat an +entire stroke as a single entity and label a group of strokes within the same +semantic part using an auto-regressive Transformer with the default attention +mechanism. By group-based labeling, our method can fully leverage the context +information when making decisions for the remaining groups of strokes. Our +method achieves the best segmentation accuracy compared with state-of-the-art +approaches on two representative datasets and has been extensively evaluated +demonstrating its superior performance. Additionally, we offer insights into +solving part imbalance in training data and the preliminary experiment on +cross-category training, which can inspire future research in this field. + +
+
+
+
+
+ + ♻ ☆ Fusing Domain-Specific Content from Large Language Models into Knowledge + Graphs for Enhanced Zero Shot Object State Classification AAAI + + +
+ Domain-specific knowledge can significantly contribute to addressing a wide +variety of vision tasks. However, the generation of such knowledge entails +considerable human labor and time costs. This study investigates the potential +of Large Language Models (LLMs) in generating and providing domain-specific +information through semantic embeddings. To achieve this, an LLM is integrated +into a pipeline that utilizes Knowledge Graphs and pre-trained semantic vectors +in the context of the Vision-based Zero-shot Object State Classification task. +We thoroughly examine the behavior of the LLM through an extensive ablation +study. Our findings reveal that the integration of LLM-based embeddings, in +combination with general-purpose pre-trained embeddings, leads to substantial +performance improvements. Drawing insights from this ablation study, we conduct +a comparative analysis against competing models, thereby highlighting the +state-of-the-art performance achieved by the proposed approach. + +
+
+ comment: Accepted at the AAAI-MAKE 24 +
+
+
+
+
+ + ♻ ☆ A Closer Look at the Few-Shot Adaptation of Large Vision-Language Models CVPR 2024 + + +
+ Efficient transfer learning (ETL) is receiving increasing attention to adapt
+large pre-trained language-vision models on downstream tasks with a few labeled
+samples. While significant progress has been made, we reveal that
+state-of-the-art ETL approaches exhibit strong performance only in
+narrowly-defined experimental setups, and with a careful adjustment of
+hyperparameters based on a large corpus of labeled samples. In particular, we
+make two interesting, and surprising empirical observations. First, to
+outperform a simple Linear Probing baseline, these methods require optimizing
+their hyper-parameters on each target task. And second, they typically
+underperform -- sometimes dramatically -- standard zero-shot predictions in the
+presence of distributional drifts. Motivated by the unrealistic assumptions
+made in the existing literature, i.e., access to a large validation set and
+case-specific grid-search for optimal hyperparameters, we propose a novel
+approach that meets the requirements of real-world scenarios. More concretely,
+we introduce a CLass-Adaptive linear Probe (CLAP) objective, whose balancing
+term is optimized via an adaptation of the general Augmented Lagrangian method
+tailored to this context. We comprehensively evaluate CLAP on a broad span of
+datasets and scenarios, demonstrating that it consistently outperforms SoTA
+approaches, while being a much more efficient alternative.
+
+
+
+ comment: CVPR 2024. Code: https://github.com/jusiro/CLAP +
+
+
+
+
+ + ♻ ☆ Testing MediaPipe Holistic for Linguistic Analysis of Nonmanual Markers + in Sign Languages + + +
+ Advances in Deep Learning have made possible reliable landmark tracking of +human bodies and faces that can be used for a variety of tasks. We test a +recent Computer Vision solution, MediaPipe Holistic (MPH), to find out if its +tracking of the facial features is reliable enough for a linguistic analysis of +data from sign languages, and compare it to an older solution (OpenFace, OF). +We use an existing data set of sentences in Kazakh-Russian Sign Language and a +newly created small data set of videos with head tilts and eyebrow movements. +We find that MPH does not perform well enough for linguistic analysis of +eyebrow movement - but in a different way from OF, which is also performing +poorly without correction. We reiterate a previous proposal to train additional +correction models to overcome these limitations. + +
+
+
+
+
+ + ♻ ☆ CADTalk: An Algorithm and Benchmark for Semantic Commenting of CAD + Programs + + +
+ CAD programs are a popular way to compactly encode shapes as a sequence of +operations that are easy to parametrically modify. However, without sufficient +semantic comments and structure, such programs can be challenging to +understand, let alone modify. We introduce the problem of semantic commenting +CAD programs, wherein the goal is to segment the input program into code blocks +corresponding to semantically meaningful shape parts and assign a semantic +label to each block. We solve the problem by combining program parsing with +visual-semantic analysis afforded by recent advances in foundational language +and vision models. Specifically, by executing the input programs, we create +shapes, which we use to generate conditional photorealistic images to make use +of semantic annotators for such images. We then distill the information across +the images and link back to the original programs to semantically comment on +them. Additionally, we collected and annotated a benchmark dataset, CADTalk, +consisting of 5,288 machine-made programs and 45 human-made programs with +ground truth semantic comments. We extensively evaluated our approach, compared +it to a GPT-based baseline, and an open-set shape segmentation baseline, and +reported an 83.24% accuracy on the new CADTalk dataset. Code and data: +https://enigma-li.github.io/CADTalk/. + +
+
+
+
+
+ + ♻ ☆ TetraSphere: A Neural Descriptor for O(3)-Invariant Point Cloud Analysis CVPR 2024 + + +
+ In many practical applications, 3D point cloud analysis requires rotation +invariance. In this paper, we present a learnable descriptor invariant under 3D +rotations and reflections, i.e., the O(3) actions, utilizing the recently +introduced steerable 3D spherical neurons and vector neurons. Specifically, we +propose an embedding of the 3D spherical neurons into 4D vector neurons, which +leverages end-to-end training of the model. In our approach, we perform +TetraTransform--an equivariant embedding of the 3D input into 4D, constructed +from the steerable neurons--and extract deeper O(3)-equivariant features using +vector neurons. This integration of the TetraTransform into the VN-DGCNN +framework, termed TetraSphere, negligibly increases the number of parameters by +less than 0.0002%. TetraSphere sets a new state-of-the-art performance +classifying randomly rotated real-world object scans of the challenging subsets +of ScanObjectNN. Additionally, TetraSphere outperforms all equivariant methods +on randomly rotated synthetic data: classifying objects from ModelNet40 and +segmenting parts of the ShapeNet shapes. Thus, our results reveal the practical +value of steerable 3D spherical neurons for learning in 3D Euclidean space. The +code is available at https://github.com/pavlo-melnyk/tetrasphere. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Check, Locate, Rectify: A Training-Free Layout Calibration System for + Text-to-Image Generation + + +
+ Diffusion models have recently achieved remarkable progress in generating +realistic images. However, challenges remain in accurately understanding and +synthesizing the layout requirements in the textual prompts. To align the +generated image with layout instructions, we present a training-free layout +calibration system SimM that intervenes in the generative process on the fly +during inference time. Specifically, following a "check-locate-rectify" +pipeline, the system first analyses the prompt to generate the target layout +and compares it with the intermediate outputs to automatically detect errors. +Then, by moving the located activations and making intra- and inter-map +adjustments, the rectification process can be performed with negligible +computational overhead. To evaluate SimM over a range of layout requirements, +we present a benchmark SimMBench that compensates for the lack of superlative +spatial relations in existing datasets. And both quantitative and qualitative +results demonstrate the effectiveness of the proposed SimM in calibrating the +layout inconsistencies. Our project page is at https://simm-t2i.github.io/SimM. + +
+
+
+
+
+ + ♻ ☆ From Pixels to Insights: A Survey on Automatic Chart Understanding in + the Era of Large Foundation Models + + +
+ Data visualization in the form of charts plays a pivotal role in data +analysis, offering critical insights and aiding in informed decision-making. +Automatic chart understanding has witnessed significant advancements with the +rise of large foundation models in recent years. Foundation models, such as +large language models, have revolutionized various natural language processing +tasks and are increasingly being applied to chart understanding tasks. This +survey paper provides a comprehensive overview of the recent developments, +challenges, and future directions in chart understanding within the context of +these foundation models. We review fundamental building blocks crucial for +studying chart understanding tasks. Additionally, we explore various tasks and +their evaluation metrics and sources of both charts and textual inputs. Various +modeling strategies are then examined, encompassing both classification-based +and generation-based approaches, along with tool augmentation techniques that +enhance chart understanding performance. Furthermore, we discuss the +state-of-the-art performance of each task and discuss how we can improve the +performance. Challenges and future directions are addressed, highlighting the +importance of several topics, such as domain-specific charts, lack of efforts +in developing evaluation metrics, and agent-oriented settings. This survey +paper serves as a comprehensive resource for researchers and practitioners in +the fields of natural language processing, computer vision, and data analysis, +providing valuable insights and directions for future research in chart +understanding leveraging large foundation models. The studies mentioned in this +paper, along with emerging new research, will be continually updated at: +https://github.com/khuangaf/Awesome-Chart-Understanding. + +
+
+
+
+
+ + ♻ ☆ Learning Disentangled Identifiers for Action-Customized Text-to-Image + Generation + + +
+ This study focuses on a novel task in text-to-image (T2I) generation, namely +action customization. The objective of this task is to learn the co-existing +action from limited data and generalize it to unseen humans or even animals. +Experimental results show that existing subject-driven customization methods +fail to learn the representative characteristics of actions and struggle in +decoupling actions from context features, including appearance. To overcome the +preference for low-level features and the entanglement of high-level features, +we propose an inversion-based method Action-Disentangled Identifier (ADI) to +learn action-specific identifiers from the exemplar images. ADI first expands +the semantic conditioning space by introducing layer-wise identifier tokens, +thereby increasing the representational richness while distributing the +inversion across different features. Then, to block the inversion of +action-agnostic features, ADI extracts the gradient invariance from the +constructed sample triples and masks the updates of irrelevant channels. To +comprehensively evaluate the task, we present an ActionBench that includes a +variety of actions, each accompanied by meticulously selected samples. Both +quantitative and qualitative results show that our ADI outperforms existing +baselines in action-customized T2I generation. Our project page is at +https://adi-t2i.github.io/ADI. + +
+
+
+
+
+ + ♻ ☆ TACO: Benchmarking Generalizable Bimanual Tool-ACtion-Object + Understanding + + +
+ Humans commonly work with multiple objects in daily life and can intuitively +transfer manipulation skills to novel objects by understanding object +functional regularities. However, existing technical approaches for analyzing +and synthesizing hand-object manipulation are mostly limited to handling a +single hand and object due to the lack of data support. To address this, we +construct TACO, an extensive bimanual hand-object-interaction dataset spanning +a large variety of tool-action-object compositions for daily human activities. +TACO contains 2.5K motion sequences paired with third-person and egocentric +views, precise hand-object 3D meshes, and action labels. To rapidly expand the +data scale, we present a fully automatic data acquisition pipeline combining +multi-view sensing with an optical motion capture system. With the vast +research fields provided by TACO, we benchmark three generalizable +hand-object-interaction tasks: compositional action recognition, generalizable +hand-object motion forecasting, and cooperative grasp synthesis. Extensive +experiments reveal new insights, challenges, and opportunities for advancing +the studies of generalizable hand-object motion analysis and synthesis. Our +data and code are available at https://taco2024.github.io. + +
+
+
+
+
+ + ♻ ☆ Estimating Uncertainty in Landslide Segmentation Models + + +
+ Landslides are a recurring, widespread hazard. Preparation and mitigation +efforts can be aided by a high-quality, large-scale dataset that covers global +at-risk areas. Such a dataset currently does not exist and is impossible to +construct manually. Recent automated efforts focus on deep learning models for +landslide segmentation (pixel labeling) from satellite imagery. However, it is +also important to characterize the uncertainty or confidence levels of such +segmentations. Accurate and robust uncertainty estimates can enable low-cost +(in terms of manual labor) oversight of auto-generated landslide databases to +resolve errors, identify hard negative examples, and increase the size of +labeled training data. In this paper, we evaluate several methods for assessing +pixel-level uncertainty of the segmentation. Three methods that do not require +architectural changes were compared, including Pre-Threshold activations, +Monte-Carlo Dropout and Test-Time Augmentation -- a method that measures the +robustness of predictions in the face of data augmentation. Experimentally, the +quality of the latter method was consistently higher than the others across a +variety of models and metrics in our dataset. + +
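Test-time augmentation, one of the uncertainty strategies compared above, can be sketched in a few lines: run the segmentation model on flipped copies of the input, undo the flips, and use the per-pixel spread of the predictions as an uncertainty map. The flip set and the sigmoid output convention below are illustrative assumptions, not the paper's exact protocol.

```python
import torch

def tta_uncertainty(model, image):
    """model: maps (1, C, H, W) -> (1, 1, H, W) logits. Returns (mean, std) maps."""
    flips = [
        (lambda x: x, lambda y: y),                                  # identity
        (lambda x: torch.flip(x, [-1]), lambda y: torch.flip(y, [-1])),  # horizontal
        (lambda x: torch.flip(x, [-2]), lambda y: torch.flip(y, [-2])),  # vertical
    ]
    probs = []
    with torch.no_grad():
        for fwd, inv in flips:
            probs.append(torch.sigmoid(inv(model(fwd(image)))))
    probs = torch.stack(probs)             # (T, 1, 1, H, W)
    return probs.mean(0), probs.std(0)     # prediction, per-pixel uncertainty

if __name__ == "__main__":
    model = torch.nn.Conv2d(3, 1, kernel_size=3, padding=1)   # stand-in network
    mean_p, std_p = tta_uncertainty(model, torch.randn(1, 3, 64, 64))
    print(mean_p.shape, std_p.shape)
```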
+
+
+
+
+ + ♻ ☆ Point Transformer V3: Simpler, Faster, Stronger CVPR 2024 + + +
+ This paper is not motivated to seek innovation within the attention +mechanism. Instead, it focuses on overcoming the existing trade-offs between +accuracy and efficiency within the context of point cloud processing, +leveraging the power of scale. Drawing inspiration from recent advances in 3D +large-scale representation learning, we recognize that model performance is +more influenced by scale than by intricate design. Therefore, we present Point +Transformer V3 (PTv3), which prioritizes simplicity and efficiency over the +accuracy of certain mechanisms that are minor to the overall performance after +scaling, such as replacing the precise neighbor search by KNN with an efficient +serialized neighbor mapping of point clouds organized with specific patterns. +This principle enables significant scaling, expanding the receptive field from +16 to 1024 points while remaining efficient (a 3x increase in processing speed +and a 10x improvement in memory efficiency compared with its predecessor, +PTv2). PTv3 attains state-of-the-art results on over 20 downstream tasks that +span both indoor and outdoor scenarios. Further enhanced with multi-dataset +joint training, PTv3 pushes these results to a higher level. + +
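The "serialized neighbor mapping" mentioned above can be pictured by ordering points along a space-filling curve and treating nearby indices in that order as neighbors. The Z-order (Morton) serialization below is one such pattern; the voxel size, window width, and the NumPy formulation are illustrative assumptions, not PTv3's implementation.

```python
import numpy as np

def morton_code(coords_int, bits=10):
    """Interleave the bits of integer x/y/z grid coordinates (Z-order curve)."""
    code = np.zeros(coords_int.shape[0], dtype=np.uint64)
    xyz = coords_int.astype(np.uint64)
    one = np.uint64(1)
    for b in range(bits):
        for axis in range(3):
            bit = (xyz[:, axis] >> np.uint64(b)) & one
            code |= bit << np.uint64(3 * b + axis)
    return code

def serialized_neighbors(points, voxel=0.05, window=8):
    """Sort points by Morton code and take the `window` surrounding indices
    in serialized order as each point's neighbors."""
    grid = np.floor((points - points.min(0)) / voxel).astype(np.int64)
    order = np.argsort(morton_code(grid))
    inv = np.empty_like(order)
    inv[order] = np.arange(len(order))
    neighbors = {}
    for i in range(len(points)):
        pos = inv[i]
        lo, hi = max(0, pos - window // 2), min(len(points), pos + window // 2 + 1)
        neighbors[i] = [int(order[j]) for j in range(lo, hi) if order[j] != i]
    return neighbors

if __name__ == "__main__":
    pts = np.random.rand(100, 3)
    print(len(serialized_neighbors(pts)[0]))
```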
+
+ comment: CVPR 2024, code available at Pointcept + (https://github.com/Pointcept/PointTransformerV3) +
+
+
+
+
+ + ♻ ☆ A Forward and Backward Compatible Framework for Few-shot + Class-incremental Pill Recognition + + +
+ Automatic Pill Recognition (APR) systems are crucial for enhancing hospital +efficiency, assisting visually impaired individuals, and preventing +cross-infection. However, most existing deep learning-based pill recognition +systems can only perform classification on classes with sufficient training +data. In practice, the high cost of data annotation and the continuous increase +in new pill classes necessitate the development of a few-shot class-incremental +pill recognition system. This paper introduces the first few-shot +class-incremental pill recognition framework, named Discriminative and +Bidirectional Compatible Few-Shot Class-Incremental Learning (DBC-FSCIL). It +encompasses forward-compatible and backward-compatible learning components. In +forward-compatible learning, we propose an innovative virtual class synthesis +strategy and a Center-Triplet (CT) loss to enhance discriminative feature +learning. These virtual classes serve as placeholders in the feature space for +future class updates, providing diverse semantic knowledge for model training. +For backward-compatible learning, we develop a strategy to synthesize reliable +pseudo-features of old classes using uncertainty quantification, facilitating +Data Replay (DR) and Knowledge Distillation (KD). This approach allows for the +flexible synthesis of features and effectively reduces additional storage +requirements for samples and models. Additionally, we construct a new pill +image dataset for FSCIL and assess various mainstream FSCIL methods, +establishing new benchmarks. Our experimental results demonstrate that our +framework surpasses existing State-of-the-art (SOTA) methods. The code is +available at https://github.com/zhang-jinghua/DBC-FSCIL. + +
+
+
+
+
+ + ♻ ☆ Efficient Dataset Distillation via Minimax Diffusion CVPR 2024 + + +
+ Dataset distillation reduces the storage and computational consumption of +training a network by generating a small surrogate dataset that encapsulates +rich information of the original large-scale one. However, previous +distillation methods heavily rely on the sample-wise iterative optimization +scheme. As the images-per-class (IPC) setting or image resolution grows larger, +the necessary computation will demand overwhelming time and resources. In this +work, we intend to incorporate generative diffusion techniques for computing +the surrogate dataset. Observing that key factors for constructing an effective +surrogate dataset are representativeness and diversity, we design additional +minimax criteria in the generative training to enhance these facets for the +generated images of diffusion models. We present a theoretical model of the +process as hierarchical diffusion control demonstrating the flexibility of the +diffusion process to target these criteria without jeopardizing the +faithfulness of the sample to the desired distribution. The proposed method +achieves state-of-the-art validation performance while demanding much less +computational resources. Under the 100-IPC setting on ImageWoof, our method +requires less than one-twentieth the distillation time of previous methods, yet +yields even better performance. Source code and generated data are available in +https://github.com/vimar-gu/MinimaxDiffusion. + +
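One plausible reading of the representativeness and diversity criteria above is: every real feature should have a close synthetic neighbor, while synthetic features should not collapse onto each other. The sketch below computes both quantities on feature embeddings; the exact losses in the paper may differ, and the normalization choice is an assumption.

```python
import torch
import torch.nn.functional as F

def minimax_criteria(synth_feats, real_feats):
    """synth_feats: (S, D), real_feats: (R, D). Returns two scalars that a
    training loop could respectively maximize and minimize."""
    s = F.normalize(synth_feats, dim=-1)
    r = F.normalize(real_feats, dim=-1)
    sim_rs = r @ s.t()                                    # (R, S)
    representativeness = sim_rs.max(dim=1).values.min()   # worst-covered real sample
    sim_ss = s @ s.t() - torch.eye(s.shape[0])            # ignore self-similarity
    diversity_penalty = sim_ss.max()                      # most redundant synthetic pair
    return representativeness, diversity_penalty

if __name__ == "__main__":
    rep, div = minimax_criteria(torch.randn(10, 64), torch.randn(100, 64))
    print(float(rep), float(div))
```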
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Zero-BEV: Zero-shot Projection of Any First-Person Modality to BEV Maps + + +
+ Bird's-eye view (BEV) maps are an important geometrically structured
+representation widely used in robotics, in particular self-driving vehicles and
+terrestrial robots. Existing algorithms either require depth information for
+the geometric projection, which is not always reliably available, or are
+trained end-to-end in a fully supervised way to map visual first-person
+observations to BEV representation, and are therefore restricted to the output
+modality they have been trained for. In contrast, we propose a new model
+capable of performing zero-shot projections of any modality available in a
+first person view to the corresponding BEV map. This is achieved by
+disentangling the geometric inverse perspective projection from the modality
+transformation, e.g., RGB to occupancy. The method is general and we showcase
+experiments projecting to BEV three different modalities: semantic
+segmentation, motion vectors and object bounding boxes detected in first
+person. We experimentally show that the model outperforms competing methods, in
+particular the widely used baseline resorting to monocular depth estimation.
+
+
+
+
+
+
+ + ♻ ☆ LAENeRF: Local Appearance Editing for Neural Radiance Fields CVPR 2024 + + +
+ Due to the omnipresence of Neural Radiance Fields (NeRFs), the interest +towards editable implicit 3D representations has surged over the last years. +However, editing implicit or hybrid representations as used for NeRFs is +difficult due to the entanglement of appearance and geometry encoded in the +model parameters. Despite these challenges, recent research has shown first +promising steps towards photorealistic and non-photorealistic appearance edits. +The main open issues of related work include limited interactivity, a lack of +support for local edits and large memory requirements, rendering them less +useful in practice. We address these limitations with LAENeRF, a unified +framework for photorealistic and non-photorealistic appearance editing of +NeRFs. To tackle local editing, we leverage a voxel grid as starting point for +region selection. We learn a mapping from expected ray terminations to final +output color, which can optionally be supervised by a style loss, resulting in +a framework which can perform photorealistic and non-photorealistic appearance +editing of selected regions. Relying on a single point per ray for our mapping, +we limit memory requirements and enable fast optimization. To guarantee +interactivity, we compose the output color using a set of learned, modifiable +base colors, composed with additive layer mixing. Compared to concurrent work, +LAENeRF enables recoloring and stylization while keeping processing time low. +Furthermore, we demonstrate that our approach surpasses baseline methods both +quantitatively and qualitatively. + +
+
+ comment: Accepted to CVPR 2024! Project website: + https://r4dl.github.io/LAENeRF/ +
+
+
+
+
+ + ♻ ☆ A Call to Reflect on Evaluation Practices for Age Estimation: + Comparative Analysis of the State-of-the-Art and a Unified Benchmark CVPR 2024 + + +
+ Comparing different age estimation methods poses a challenge due to the +unreliability of published results stemming from inconsistencies in the +benchmarking process. Previous studies have reported continuous performance +improvements over the past decade using specialized methods; however, our +findings challenge these claims. This paper identifies two trivial, yet +persistent issues with the currently used evaluation protocol and describes how +to resolve them. We offer an extensive comparative analysis for +state-of-the-art facial age estimation methods. Surprisingly, we find that the +performance differences between the methods are negligible compared to the +effect of other factors, such as facial alignment, facial coverage, image +resolution, model architecture, or the amount of data used for pretraining. We +use the gained insights to propose using FaRL as the backbone model and +demonstrate its effectiveness on all public datasets. We make the source code +and exact data splits public on GitHub. + +
+
+ comment: CVPR 2024 Camera-Ready +
+
+
+
+
+ + ♻ ☆ Investigating and Mitigating the Side Effects of Noisy Views for + Self-Supervised Clustering Algorithms in Practical Multi-View Scenarios + + +
+ Multi-view clustering (MVC) aims at exploring category structures among +multi-view data in self-supervised manners. Multiple views provide more +information than single views and thus existing MVC methods can achieve +satisfactory performance. However, their performance might seriously degenerate +when the views are noisy in practical multi-view scenarios. In this paper, we +formally investigate the drawback of noisy views and then propose a +theoretically grounded deep MVC method (namely MVCAN) to address this issue. +Specifically, we propose a novel MVC objective that enables un-shared +parameters and inconsistent clustering predictions across multiple views to +reduce the side effects of noisy views. Furthermore, a two-level multi-view +iterative optimization is designed to generate robust learning targets for +refining individual views' representation learning. Theoretical analysis +reveals that MVCAN works by achieving the multi-view consistency, +complementarity, and noise robustness. Finally, experiments on extensive public +datasets demonstrate that MVCAN outperforms state-of-the-art methods and is +robust against the existence of noisy views. + +
+
+
+
+
+ + ♻ ☆ V4D: Voxel for 4D Novel View Synthesis + + +
+ Neural radiance fields have made a remarkable breakthrough in the novel view +synthesis task at the 3D static scene. However, for the 4D circumstance (e.g., +dynamic scene), the performance of the existing method is still limited by the +capacity of the neural network, typically in a multilayer perceptron network +(MLP). In this paper, we utilize 3D Voxel to model the 4D neural radiance +field, short as V4D, where the 3D voxel has two formats. The first one is to +regularly model the 3D space and then use the sampled local 3D feature with the +time index to model the density field and the texture field by a tiny MLP. The +second one is in look-up tables (LUTs) format that is for the pixel-level +refinement, where the pseudo-surface produced by the volume rendering is +utilized as the guidance information to learn a 2D pixel-level refinement +mapping. The proposed LUTs-based refinement module achieves the performance +gain with little computational cost and could serve as the plug-and-play module +in the novel view synthesis task. Moreover, we propose a more effective +conditional positional encoding toward the 4D data that achieves performance +gain with negligible computational burdens. Extensive experiments demonstrate +that the proposed method achieves state-of-the-art performance at a low +computational cost. + +
+
+ comment: Code released. Accepted by IEEE TVCG 2023 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 111 + +
+
+
+ + ☆ AutoInst: Automatic Instance-Based Segmentation of LiDAR 3D Scans + + +
+ Recently, progress in acquisition equipment such as LiDAR sensors has enabled +sensing increasingly spacious outdoor 3D environments. Making sense of such 3D +acquisitions requires fine-grained scene understanding, such as constructing +instance-based 3D scene segmentations. Commonly, a neural network is trained +for this task; however, this requires access to a large, densely annotated +dataset, which is widely known to be challenging to obtain. To address this +issue, in this work we propose to predict instance segmentations for 3D scenes +in an unsupervised way, without relying on ground-truth annotations. To this +end, we construct a learning framework consisting of two components: (1) a +pseudo-annotation scheme for generating initial unsupervised pseudo-labels; and +(2) a self-training algorithm for instance segmentation to fit robust, accurate +instances from initial noisy proposals. To enable generating 3D instance mask +proposals, we construct a weighted proxy-graph by connecting 3D points with +edges integrating multi-modal image- and point-based self-supervised features, +and perform graph-cuts to isolate individual pseudo-instances. We then build on +a state-of-the-art point-based architecture and train a 3D instance +segmentation model, resulting in significant refinement of initial proposals. +To scale to arbitrary complexity 3D scenes, we design our algorithm to operate +on local 3D point chunks and construct a merging step to generate scene-level +instance segmentations. Experiments on the challenging SemanticKITTI benchmark +demonstrate the potential of our approach, where it attains 13.3% higher +Average Precision and 9.1% higher F1 score compared to the best-performing +baseline. The code will be made publicly available at +https://github.com/artonson/autoinst. + +
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ latentSplat: Autoencoding Variational Gaussians for Fast Generalizable + 3D Reconstruction + + +
+ We present latentSplat, a method to predict semantic Gaussians in a 3D latent +space that can be splatted and decoded by a light-weight generative 2D +architecture. Existing methods for generalizable 3D reconstruction either do +not enable fast inference of high resolution novel views due to slow volume +rendering, or are limited to interpolation of close input views, even in +simpler settings with a single central object, where 360-degree generalization +is possible. In this work, we combine a regression-based approach with a +generative model, moving towards both of these capabilities within the same +method, trained purely on readily available real video data. The core of our +method are variational 3D Gaussians, a representation that efficiently encodes +varying uncertainty within a latent space consisting of 3D feature Gaussians. +From these Gaussians, specific instances can be sampled and rendered via +efficient Gaussian splatting and a fast, generative decoder network. We show +that latentSplat outperforms previous works in reconstruction quality and +generalization, while being fast and scalable to high-resolution data. + +
+
+ comment: Project website: https://geometric-rl.mpi-inf.mpg.de/latentsplat/ +
+
+
+
+
+ + ☆ HemoSet: The First Blood Segmentation Dataset for Automation of + Hemostasis Management + + +
+ Hemorrhaging occurs in surgeries of all types, forcing surgeons to quickly +adapt to the visual interference that results from blood rapidly filling the +surgical field. Introducing automation into the crucial surgical task of +hemostasis management would offload mental and physical tasks from the surgeon +and surgical assistants while simultaneously increasing the efficiency and +safety of the operation. The first step in automation of hemostasis management +is detection of blood in the surgical field. To propel the development of blood +detection algorithms in surgeries, we present HemoSet, the first blood +segmentation dataset based on bleeding during a live animal robotic surgery. +Our dataset features vessel hemorrhage scenarios where turbulent flow leads to +abnormal pooling geometries in surgical fields. These pools are formed in +conditions endemic to surgical procedures -- uneven heterogeneous tissue, under +glossy lighting conditions and rapid tool movement. We benchmark several +state-of-the-art segmentation models and provide insight into the difficulties +specific to blood detection. We intend for HemoSet to spur development of +autonomous blood suction tools by providing a platform for training and +refining blood segmentation models, addressing the precision needed for such +robotics. + +
+
+
+
+
+ + ☆ AVicuna: Audio-Visual LLM with Interleaver and Context-Boundary + Alignment for Temporal Referential Dialogue + + +
+ In everyday communication, humans frequently use speech and gestures to refer +to specific areas or objects, a process known as Referential Dialogue (RD). +While prior studies have investigated RD through Large Language Models (LLMs) +or Large Multimodal Models (LMMs) in static contexts, the exploration of +Temporal Referential Dialogue (TRD) within audio-visual media remains limited. +Two primary challenges hinder progress in this field: (1) the absence of +comprehensive, untrimmed audio-visual video datasets with precise temporal +annotations, and (2) the need for methods to integrate complex temporal +auditory and visual cues effectively. To address these challenges, we introduce +a novel framework to generate PU-VALOR, an extensive audio-visual dataset +comprising over 114,000 untrimmed videos with accurate temporal demarcations. +We also present AVicuna, featuring an Audio-Visual Tokens Interleaver (AVTI) +that ensures the temporal alignment of audio-visual information. Additionally, +we develop the A5-222K dataset, encompassing more than 200,000 audio-text +pairings, to facilitate the audio and text alignments. Our experiments +demonstrate that AVicuna can effectively handle TRD in audio-visual videos and +achieve state-of-the-art performance on various audio-visual video +understanding tasks, particularly in untrimmed videos. We further investigate +the optimal audio-interleaving rate for interleaved audio-visual inputs, which +maximizes performance on the Audio-Visual Event Dense Localization task. + +
+
+
+
+
+ + ☆ L-MAE: Longitudinal masked auto-encoder with time and severity-aware + encoding for diabetic retinopathy progression prediction + + +
+ Pre-training strategies based on self-supervised learning (SSL) have proven +to be effective pretext tasks for many downstream tasks in computer vision. Due +to the significant disparity between medical and natural images, the +application of typical SSL is not straightforward in medical imaging. +Additionally, those pretext tasks often lack context, which is critical for +computer-aided clinical decision support. In this paper, we developed a +longitudinal masked auto-encoder (MAE) based on the well-known +Transformer-based MAE. In particular, we explored the importance of time-aware +position embedding as well as disease progression-aware masking. Taking into +account the time between examinations instead of just scheduling them offers +the benefit of capturing temporal changes and trends. The masking strategy, for +its part, evolves during follow-up to better capture pathological changes, +ensuring a more accurate assessment of disease progression. Using OPHDIAT, a +large follow-up screening dataset targeting diabetic retinopathy (DR), we +evaluated the pre-trained weights on a longitudinal task, which is to predict +the severity label of the next visit within 3 years based on the past time +series examinations. Our results demonstrated the relevancy of both time-aware +position embedding and masking strategies based on disease progression +knowledge. Compared to popular baseline models and standard longitudinal +Transformers, these simple yet effective extensions significantly enhance the +predictive ability of deep classification models. + +
+
+
+
+
+
+ ☆ Object Detectors in the Open Environment: Challenges, Solutions, and
+ Outlook
+
+
+
+ With the emergence of foundation models, deep learning-based object detectors +have shown practical usability in closed set scenarios. However, for real-world +tasks, object detectors often operate in open environments, where crucial +factors (\eg, data distribution, objective) that influence model learning are +often changing. The dynamic and intricate nature of the open environment poses +novel and formidable challenges to object detectors. Unfortunately, current +research on object detectors in open environments lacks a comprehensive +analysis of their distinctive characteristics, challenges, and corresponding +solutions, which hinders their secure deployment in critical real-world +scenarios. This paper aims to bridge this gap by conducting a comprehensive +review and analysis of object detectors in open environments. We initially +identified limitations of key structural components within the existing +detection pipeline and propose the open environment object detector challenge +framework that includes four quadrants (\ie, out-of-domain, out-of-category, +robust learning, and incremental learning) based on the dimensions of the data +/ target changes. For each quadrant of challenges in the proposed framework, we +present a detailed description and systematic analysis of the overarching goals +and core difficulties, systematically review the corresponding solutions, and +benchmark their performance over multiple widely adopted datasets. In addition, +we engage in a discussion of open problems and potential avenues for future +research. This paper aims to provide a fresh, comprehensive, and systematic +understanding of the challenges and solutions associated with open-environment +object detectors, thus catalyzing the development of more solid applications in +real-world scenarios. + +
+
+ comment: 32 pages, 17 figures +
+
+
+
+
+ + ☆ Constricting Normal Latent Space for Anomaly Detection with Normal-only + Training Data ICLR + + +
+ In order to devise an anomaly detection model using only normal training
+data, an autoencoder (AE) is typically trained to reconstruct the data. As a
+result, the AE can extract normal representations in its latent space. During
+test time, since the AE is not trained using real anomalies, it is expected to
+poorly reconstruct the anomalous data. However, several researchers have
+observed that this is not the case. In this work, we propose to limit the
+reconstruction capability of the AE by introducing a novel latent constriction
+loss, which is added to the existing reconstruction loss. By using our method,
+no extra computational cost is added to the AE during test time. Evaluations
+using three video anomaly detection benchmark datasets, i.e., Ped2, Avenue, and
+ShanghaiTech, demonstrate the effectiveness of our method in limiting the
+reconstruction capability of the AE, which leads to a better anomaly detection
+model.
+
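+
+ The abstract does not give the exact form of the latent constriction loss; a
+plausible toy variant, sketched below, simply pulls latent codes toward a
+learned center on top of the usual reconstruction loss. The weighting and the
+center are assumptions for illustration only.
+
+```python
+import torch
+
+def constricted_ae_loss(x, x_hat, z, center, lam=0.1):
+    """Reconstruction loss plus a constriction term shrinking the normal
+    latent region toward `center` (one possible reading of the idea)."""
+    recon = torch.nn.functional.mse_loss(x_hat, x)
+    constriction = ((z - center) ** 2).sum(dim=1).mean()
+    return recon + lam * constriction
+
+# toy usage with random tensors standing in for an AE forward pass
+x = torch.rand(8, 3, 64, 64)
+x_hat = torch.rand(8, 3, 64, 64)
+z = torch.randn(8, 128)
+center = torch.zeros(128, requires_grad=True)   # can be optimized jointly
+loss = constricted_ae_loss(x, x_hat, z, center)
+loss.backward()
+```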
+
+ comment: ICLR Workshop 2024 (PML4LRS) +
+
+
+
+
+ + ☆ Emotion Recognition from the perspective of Activity Recognition + + +
+ Applications of an efficient emotion recognition system can be found in +several domains such as medicine, driver fatigue surveillance, social robotics, +and human-computer interaction. Appraising human emotional states, behaviors, +and reactions displayed in real-world settings can be accomplished using latent +continuous dimensions. Continuous dimensional models of human affect, such as +those based on valence and arousal are more accurate in describing a broad +range of spontaneous everyday emotions than more traditional models of discrete +stereotypical emotion categories (e.g. happiness, surprise). Most of the prior +work on estimating valence and arousal considers laboratory settings and acted +data. But, for emotion recognition systems to be deployed and integrated into +real-world mobile and computing devices, we need to consider data collected in +the world. Action recognition is a domain of Computer Vision that involves +capturing complementary information on appearance from still frames and motion +between frames. In this paper, we treat emotion recognition from the +perspective of action recognition by exploring the application of deep learning +architectures specifically designed for action recognition, for continuous +affect recognition. We propose a novel three-stream end-to-end deep learning +regression pipeline with an attention mechanism, which is an ensemble design +based on sub-modules of multiple state-of-the-art action recognition systems. +The pipeline constitutes a novel data pre-processing approach with a spatial +self-attention mechanism to extract keyframes. The optical flow of +high-attention regions of the face is extracted to capture temporal context. +AFEW-VA in-the-wild dataset has been used to conduct comparative experiments. +Quantitative analysis shows that the proposed model outperforms multiple +standard baselines of both emotion recognition and action recognition models. + +
+
+
+
+
+ + ☆ Out-of-Distribution Detection via Deep Multi-Comprehension Ensemble + + +
+ Recent research underscores the pivotal role of the Out-of-Distribution (OOD) +feature representation field scale in determining the efficacy of models in OOD +detection. Consequently, the adoption of model ensembles has emerged as a +prominent strategy to augment this feature representation field, capitalizing +on anticipated model diversity. + However, our introduction of novel qualitative and quantitative model +ensemble evaluation methods, specifically Loss Basin/Barrier Visualization and +the Self-Coupling Index, reveals a critical drawback in existing ensemble +methods. We find that these methods incorporate weights that are +affine-transformable, exhibiting limited variability and thus failing to +achieve the desired diversity in feature representation. + To address this limitation, we elevate the dimensions of traditional model +ensembles, incorporating various factors such as different weight +initializations, data holdout, etc., into distinct supervision tasks. This +innovative approach, termed Multi-Comprehension (MC) Ensemble, leverages +diverse training tasks to generate distinct comprehensions of the data and +labels, thereby extending the feature representation field. + Our experimental results demonstrate the superior performance of the MC +Ensemble strategy in OOD detection compared to both the naive Deep Ensemble +method and a standalone model of comparable size. This underscores the +effectiveness of our proposed approach in enhancing the model's capability to +detect instances outside its training distribution. + +
+
+
+
+
+ + ☆ Laplacian-guided Entropy Model in Neural Codec with Blur-dissipated + Synthesis CVPR2024 + + +
+ While replacing Gaussian decoders with a conditional diffusion model enhances +the perceptual quality of reconstructions in neural image compression, their +lack of inductive bias for image data restricts their ability to achieve +state-of-the-art perceptual levels. To address this limitation, we adopt a +non-isotropic diffusion model at the decoder side. This model imposes an +inductive bias aimed at distinguishing between frequency contents, thereby +facilitating the generation of high-quality images. Moreover, our framework is +equipped with a novel entropy model that accurately models the probability +distribution of latent representation by exploiting spatio-channel correlations +in latent space, while accelerating the entropy decoding step. This +channel-wise entropy model leverages both local and global spatial contexts +within each channel chunk. The global spatial context is built upon the +Transformer, which is specifically designed for image compression tasks. The +designed Transformer employs a Laplacian-shaped positional encoding, the +learnable parameters of which are adaptively adjusted for each channel cluster. +Our experiments demonstrate that our proposed framework yields better +perceptual quality compared to cutting-edge generative-based codecs, and the +proposed entropy model contributes to notable bitrate savings. + +
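+
+ As a rough illustration of a "Laplacian-shaped positional encoding" with
+learnable parameters, the sketch below biases attention logits by -|i - j| / b
+with a learnable width b per head. The paper's actual parameterization (per
+channel cluster, inside its compression Transformer) is not specified here, so
+treat this purely as an assumed toy version.
+
+```python
+import math
+import torch
+
+class LaplacianPositionalBias(torch.nn.Module):
+    """Toy Laplacian-shaped relative positional bias: logits are biased by
+    -|i - j| / b with a learnable width b per attention head."""
+    def __init__(self, num_heads, init_width=8.0):
+        super().__init__()
+        self.log_width = torch.nn.Parameter(
+            torch.full((num_heads,), math.log(init_width)))
+
+    def forward(self, seq_len):
+        pos = torch.arange(seq_len)
+        dist = (pos[None, :] - pos[:, None]).abs().float()   # (L, L)
+        width = self.log_width.exp()[:, None, None]          # (H, 1, 1)
+        return -dist[None] / width               # add to attention logits
+
+bias = LaplacianPositionalBias(num_heads=4)
+attn_logits = torch.randn(4, 16, 16) + bias(16)   # (heads, L, L)
+```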
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Unlearning Backdoor Threats: Enhancing Backdoor Defense in Multimodal + Contrastive Learning via Local Token Unlearning + + +
+ Multimodal contrastive learning has emerged as a powerful paradigm for +building high-quality features using the complementary strengths of various +data modalities. However, the open nature of such systems inadvertently +increases the possibility of backdoor attacks. These attacks subtly embed +malicious behaviors within the model during training, which can be activated by +specific triggers in the inference phase, posing significant security risks. +Despite existing countermeasures through fine-tuning that reduce the adverse +impacts of such attacks, these defenses often degrade the clean accuracy and +necessitate the construction of extensive clean training pairs. In this paper, +we explore the possibility of a less-cost defense from the perspective of model +unlearning, that is, whether the model can be made to quickly \textbf{u}nlearn +\textbf{b}ackdoor \textbf{t}hreats (UBT) by constructing a small set of +poisoned samples. Specifically, we strengthen the backdoor shortcuts to +discover suspicious samples through overfitting training prioritized by weak +similarity samples. Building on the initial identification of suspicious +samples, we introduce an innovative token-based localized forgetting training +regime. This technique specifically targets the poisoned aspects of the model, +applying a focused effort to unlearn the backdoor associations and trying not +to damage the integrity of the overall model. Experimental results show that +our method not only ensures a minimal success rate for attacks, but also +preserves the model's high clean accuracy. + +
+
+ comment: 6 pages, 2 figures +
+
+
+
+
+ + ☆ Partially Blinded Unlearning: Class Unlearning for Deep Networks a + Bayesian Perspective + + +
+ In order to adhere to regulatory standards governing individual data privacy
+and safety, machine learning models must systematically eliminate information
+derived from specific subsets of a user's training data that can no longer be
+utilized. The emerging discipline of Machine Unlearning has arisen as a pivotal
+area of research, facilitating the process of selectively discarding
+information designated to specific sets or classes of data from a pre-trained
+model, thereby eliminating the necessity for extensive retraining from scratch.
+The principal aim of this study is to formulate a methodology tailored for the
+purposeful elimination of information linked to a specific class of data from a
+pre-trained classification network. This intentional removal is crafted to
+degrade the model's performance specifically concerning the unlearned data
+class while concurrently minimizing any detrimental impacts on the model's
+performance in other classes. To achieve this goal, we frame the class
+unlearning problem from a Bayesian perspective, which yields a loss function
+that minimizes the log-likelihood associated with the unlearned data with a
+stability regularization in parameter space. This stability regularization
+incorporates the Mahalanobis distance with respect to the Fisher Information
+matrix and the $l_2$ distance from the pre-trained model parameters. Our novel
+approach, termed \textbf{Partially-Blinded Unlearning (PBU)}, surpasses
+existing state-of-the-art class unlearning methods, demonstrating superior
+effectiveness. Notably, PBU achieves this efficacy without requiring awareness
+of the entire training dataset but only of the unlearned data points, marking a
+distinctive feature of its performance.
+
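+
+ The loss described above can be sketched directly: push down the
+log-likelihood of the forget-class samples while staying close to the
+pre-trained weights under a Fisher-weighted (Mahalanobis) term and an l2 term.
+A diagonal Fisher approximation and the weighting coefficients are assumptions
+of this illustration, not the paper's exact formulation.
+
+```python
+import torch
+
+def pbu_style_loss(model, pretrained_params, fisher_diag, x_forget, y_forget,
+                   alpha=1.0, beta=1.0):
+    """Class-unlearning objective in the spirit of the abstract: minimize the
+    log-likelihood of the unlearned class with parameter-space stability
+    regularization (diagonal-Fisher Mahalanobis + l2 to pretrained weights)."""
+    log_probs = torch.log_softmax(model(x_forget), dim=1)
+    forget_ll = log_probs.gather(1, y_forget[:, None]).mean()
+
+    stab_fisher = 0.0
+    stab_l2 = 0.0
+    for name, p in model.named_parameters():
+        p0 = pretrained_params[name]
+        stab_fisher = stab_fisher + (fisher_diag[name] * (p - p0) ** 2).sum()
+        stab_l2 = stab_l2 + ((p - p0) ** 2).sum()
+    return forget_ll + alpha * stab_fisher + beta * stab_l2
+```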
+
+
+
+
+ + ☆ On the Equivalency, Substitutability, and Flexibility of Synthetic Data + + +
+ We study, from an empirical standpoint, the efficacy of synthetic data in
+real-world scenarios. Leveraging synthetic data for training perception models
+has become a key strategy embraced by the community due to its efficiency,
+scalability, perfect annotations, and low costs. Despite proven advantages, few
+studies have focused on how to efficiently generate synthetic datasets to
+solve real-world problems and to what extent synthetic data can reduce the
+effort for real-world data collection. To answer these questions, we
+systematically investigate several interesting properties of synthetic data --
+the equivalency of synthetic data to real-world data, the substitutability of
+synthetic data for real data, and the flexibility of synthetic data generators
+to close up domain gaps. Leveraging the M3Act synthetic data generator, we
+conduct experiments on DanceTrack and MOT17. Our results suggest that synthetic
+data not only enhances model performance but also demonstrates substitutability
+for real data, with 60% to 80% replacement without performance loss. In
+addition, our study of the impact of synthetic data distributions on downstream
+performance reveals the importance of flexible data generators in narrowing
+domain gaps for improved model adaptability.
+
+
+
+
+
+ + ☆ Adversarially Masked Video Consistency for Unsupervised Domain + Adaptation + + +
+ We study the problem of unsupervised domain adaptation for egocentric videos. +We propose a transformer-based model to learn class-discriminative and +domain-invariant feature representations. It consists of two novel designs. The +first module is called Generative Adversarial Domain Alignment Network with the +aim of learning domain-invariant representations. It simultaneously learns a +mask generator and a domain-invariant encoder in an adversarial way. The +domain-invariant encoder is trained to minimize the distance between the source +and target domain. The masking generator, conversely, aims at producing +challenging masks by maximizing the domain distance. The second is a Masked +Consistency Learning module to learn class-discriminative representations. It +enforces the prediction consistency between the masked target videos and their +full forms. To better evaluate the effectiveness of domain adaptation methods, +we construct a more challenging benchmark for egocentric videos, U-Ego4D. Our +method achieves state-of-the-art performance on the Epic-Kitchen and the +proposed U-Ego4D benchmark. + +
+
+
+
+
+ + ☆ Low Rank Groupwise Deformations for Motion Tracking in Cardiac Cine MRI + + +
+ Diffeomorphic image registration is a commonly used method to deform one +image to resemble another. While warping a single image to another is useful, +it can be advantageous to warp multiple images simultaneously, such as in +tracking the motion of the heart across a sequence of images. In this paper, +our objective is to propose a novel method capable of registering a group or +sequence of images to a target image, resulting in registered images that +appear identical and therefore have a low rank. Moreover, we aim for these +registered images to closely resemble the target image. Through experimental +evidence, we will demonstrate our method's superior efficacy in producing +low-rank groupwise deformations compared to other state-of-the-art approaches. + +
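+
+ The low-rank objective above can be made concrete with a small sketch: stack
+the registered frames as rows of a matrix and penalize its nuclear norm (sum
+of singular values), which is smallest when all warped frames look identical.
+This is one plausible way to encode the idea, not the paper's exact
+formulation.
+
+```python
+import torch
+
+def low_rank_penalty(warped_frames):
+    """warped_frames: (T, H, W) frames after groupwise registration.
+    Flatten each frame to a row and penalize the nuclear norm of the stack;
+    identical registered frames give a (near) rank-1 matrix."""
+    t = warped_frames.shape[0]
+    mat = warped_frames.reshape(t, -1)          # (T, H*W)
+    return torch.linalg.svdvals(mat).sum()      # nuclear norm, differentiable
+
+frames = torch.rand(10, 64, 64, requires_grad=True)
+loss = low_rank_penalty(frames)   # combine with a similarity-to-target term
+loss.backward()
+```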
+
+ comment: A thesis submitted to the University of Birmingham for MSc Degree +
+
+
+
+
+ + ☆ Dual-modal Prior Semantic Guided Infrared and Visible Image Fusion for + Intelligent Transportation System + + +
+ Infrared and visible image fusion (IVF) plays an important role in
+intelligent transportation systems (ITS). Early works predominantly focus on
+boosting the visual appeal of the fused result, and only a few recent
+approaches have tried to combine high-level vision tasks with IVF. However,
+they prioritize the design of cascaded structures to seek unified suitable
+features and fit different tasks. Thus, they typically tend to be biased toward
+reconstructing raw pixels without considering the significance of semantic
+features. Therefore, we propose a novel prior semantic guided image fusion
+method based on the dual-modality strategy, improving the performance of IVF in
+ITS. Specifically, to explore the independent, significant semantics of each
+modality, we first design two parallel semantic segmentation branches with a
+refined feature adaptive-modulation (RFaM) mechanism. RFaM can perceive the
+features that are semantically distinct enough in each semantic segmentation
+branch. Then, two pilot experiments based on the two branches are conducted to
+capture the significant prior semantics of the two images, which are then
+applied to guide the fusion task in the integration of the semantic
+segmentation branches and the fusion branches. In addition, to aggregate both
+high-level semantics and impressive visual effects, we further investigate the
+frequency response of the prior semantics, and propose a multi-level
+representation-adaptive fusion (MRaF) module to explicitly integrate the
+low-frequency prior semantics with the high-frequency details. Extensive
+experiments on two public datasets demonstrate the superiority of our method
+over the state-of-the-art image fusion approaches, in terms of either the
+visual appeal or the high-level semantics.
+
+
+
+
+
+ + ☆ Inverse Rendering of Glossy Objects via the Neural Plenoptic Function + and Radiance Fields CVPR 2024 + + +
+ Inverse rendering aims at recovering both geometry and materials of objects. +It provides a more compatible reconstruction for conventional rendering +engines, compared with the neural radiance fields (NeRFs). On the other hand, +existing NeRF-based inverse rendering methods cannot handle glossy objects with +local light interactions well, as they typically oversimplify the illumination +as a 2D environmental map, which assumes infinite lights only. Observing the +superiority of NeRFs in recovering radiance fields, we propose a novel 5D +Neural Plenoptic Function (NeP) based on NeRFs and ray tracing, such that more +accurate lighting-object interactions can be formulated via the rendering +equation. We also design a material-aware cone sampling strategy to efficiently +integrate lights inside the BRDF lobes with the help of pre-filtered radiance +fields. Our method has two stages: the geometry of the target object and the +pre-filtered environmental radiance fields are reconstructed in the first +stage, and materials of the target object are estimated in the second stage +with the proposed NeP and material-aware cone sampling strategy. Extensive +experiments on the proposed real-world and synthetic datasets demonstrate that +our method can reconstruct high-fidelity geometry/materials of challenging +glossy objects with complex lighting interactions from nearby objects. Project +webpage: https://whyy.site/paper/nep + +
+
+ comment: CVPR 2024 paper. Project webpage https://whyy.site/paper/nep +
+
+
+
+
+ + ☆ Exemplar-Free Class Incremental Learning via Incremental Representation + + +
+ Exemplar-Free Class Incremental Learning (efCIL) aims to continuously +incorporate the knowledge from new classes while retaining previously learned +information, without storing any old-class exemplars (i.e., samples). For this +purpose, various efCIL methods have been proposed over the past few years, +generally with elaborately constructed old pseudo-features, increasing the +difficulty of model development and interpretation. In contrast, we propose a +\textbf{simple Incremental Representation (IR) framework} for efCIL without +constructing old pseudo-features. IR utilizes dataset augmentation to cover a +suitable feature space and prevents the model from forgetting by using a single +L2 space maintenance loss. We discard the transient classifier trained on each +one of the sequence tasks and instead replace it with a 1-near-neighbor +classifier for inference, ensuring the representation is incrementally updated +during CIL. Extensive experiments demonstrate that our proposed IR achieves +comparable performance while significantly preventing the model from forgetting +on CIFAR100, TinyImageNet, and ImageNetSubset datasets. + +
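+
+ The inference rule above (drop the per-task classifiers and use a
+1-nearest-neighbour decision in representation space) and one reading of the
+"L2 space maintenance loss" can be sketched as follows; how class prototypes
+are maintained without exemplars is simplified, and the paper's exact loss may
+differ.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def one_nn_predict(features, prototypes, proto_labels):
+    """1-NN inference over class prototypes in the learned feature space,
+    replacing the discarded transient classifiers."""
+    features = F.normalize(features, dim=1)
+    prototypes = F.normalize(prototypes, dim=1)
+    nearest = (features @ prototypes.T).argmax(dim=1)   # cosine similarity
+    return proto_labels[nearest]
+
+def l2_maintenance_loss(feats_new, feats_old):
+    """Keep current representations of old inputs close to those produced by
+    the previous model (an assumed form of the maintenance loss)."""
+    return ((feats_new - feats_old) ** 2).sum(dim=1).mean()
+
+# toy usage
+protos = torch.randn(10, 64)
+labels = torch.arange(10)
+pred = one_nn_predict(torch.randn(4, 64), protos, labels)
+```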
+
+
+
+
+ + ☆ Leveraging Deep Learning and Xception Architecture for High-Accuracy MRI + Classification in Alzheimer Diagnosis + + +
+ Exploring the application of deep learning technologies in the field of +medical diagnostics, Magnetic Resonance Imaging (MRI) provides a unique +perspective for observing and diagnosing complex neurodegenerative diseases +such as Alzheimer Disease (AD). With advancements in deep learning, +particularly in Convolutional Neural Networks (CNNs) and the Xception network +architecture, we are now able to analyze and classify vast amounts of MRI data +with unprecedented accuracy. The progress of this technology not only enhances +our understanding of brain structural changes but also opens up new avenues for +monitoring disease progression through non-invasive means and potentially +allows for precise diagnosis in the early stages of the disease. + This study aims to classify MRI images using deep learning models to identify +different stages of Alzheimer Disease through a series of innovative data +processing and model construction steps. Our experimental results show that the +deep learning framework based on the Xception model achieved a 99.6% accuracy +rate in the multi-class MRI image classification task, demonstrating its +potential application value in assistive diagnosis. Future research will focus +on expanding the dataset, improving model interpretability, and clinical +validation to further promote the application of deep learning technology in +the medical field, with the hope of bringing earlier diagnosis and more +personalized treatment plans to Alzheimer Disease patients. + +
+
+
+
+
+ + ☆ Frankenstein: Generating Semantic-Compositional 3D Scenes in One + Tri-Plane + + +
+ We present Frankenstein, a diffusion-based framework that can generate
+semantic-compositional 3D scenes in a single pass. Unlike existing methods that
+output a single, unified 3D shape, Frankenstein simultaneously generates
+multiple separated shapes, each corresponding to a semantically meaningful
+part. The 3D scene information is encoded in one single tri-plane tensor, from
+which multiple Signed Distance Function (SDF) fields can be decoded to
+represent the compositional shapes. During training, an auto-encoder compresses
+tri-planes into a latent space, and then the denoising diffusion process is
+employed to approximate the distribution of the compositional scenes.
+Frankenstein demonstrates promising results in generating room interiors as
+well as human avatars with automatically separated parts. The generated scenes
+facilitate many downstream applications, such as part-wise re-texturing, object
+rearrangement in the room or avatar cloth re-targeting.
+
+
+ comment: Video: https://youtu.be/lRn-HqyCrLI +
+
+
+
+
+ + ☆ Image Captioning in news report scenario + + +
+ Image captioning strives to generate pertinent captions for specified images, +situating itself at the crossroads of Computer Vision (CV) and Natural Language +Processing (NLP). This endeavor is of paramount importance with far-reaching +applications in recommendation systems, news outlets, social media, and beyond. +Particularly within the realm of news reporting, captions are expected to +encompass detailed information, such as the identities of celebrities captured +in the images. However, much of the existing body of work primarily centers +around understanding scenes and actions. In this paper, we explore the realm of +image captioning specifically tailored for celebrity photographs, illustrating +its broad potential for enhancing news industry practices. This exploration +aims to augment automated news content generation, thereby facilitating a more +nuanced dissemination of information. Our endeavor shows a broader horizon, +enriching the narrative in news reporting through a more intuitive image +captioning framework. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ Skull-to-Face: Anatomy-Guided 3D Facial Reconstruction and Editing + + +
+ Deducing the 3D face from a skull is an essential but challenging task in +forensic science and archaeology. Existing methods for automated facial +reconstruction yield inaccurate results, suffering from the non-determinative +nature of the problem that a skull with a sparse set of tissue depth cannot +fully determine the skinned face. Additionally, their texture-less results +require further post-processing stages to achieve a photo-realistic appearance. +This paper proposes an end-to-end 3D face reconstruction and exploration tool, +providing textured 3D faces for reference. With the help of state-of-the-art +text-to-image diffusion models and image-based facial reconstruction +techniques, we generate an initial reference 3D face, whose biological profile +aligns with the given skull. We then adapt these initial faces to meet the +statistical expectations of extruded anatomical landmarks on the skull through +an optimization process. The joint statistical distribution of tissue depths is +learned on a small set of anatomical landmarks on the skull. To support further +adjustment, we propose an efficient face adaptation tool to assist users in +tuning tissue depths, either globally or at local regions, while observing +plausible visual feedback. Experiments conducted on a real skull-face dataset +demonstrated the effectiveness of our proposed pipeline in terms of +reconstruction accuracy, diversity, and stability. + +
+
+
+
+
+ + ☆ Blur2Blur: Blur Conversion for Unsupervised Image Deblurring on Unknown + Domains CVPR 2024 + + +
+ This paper presents an innovative framework designed to train an image +deblurring algorithm tailored to a specific camera device. This algorithm works +by transforming a blurry input image, which is challenging to deblur, into +another blurry image that is more amenable to deblurring. The transformation +process, from one blurry state to another, leverages unpaired data consisting +of sharp and blurry images captured by the target camera device. Learning this +blur-to-blur transformation is inherently simpler than direct blur-to-sharp +conversion, as it primarily involves modifying blur patterns rather than the +intricate task of reconstructing fine image details. The efficacy of the +proposed approach has been demonstrated through comprehensive experiments on +various benchmarks, where it significantly outperforms state-of-the-art methods +both quantitatively and qualitatively. Our code and data are available at +https://zero1778.github.io/blur2blur/ + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ FH-SSTNet: Forehead Creases based User Verification using Spatio-Spatial + Temporal Network + + +
+ Biometric authentication, which utilizes contactless features such as
+forehead patterns, has become increasingly important for identity verification
+and access management. The proposed method is based on learning a 3D
+spatio-spatial temporal convolution to create detailed representations of
+forehead patterns. We introduce a new CNN model called the Forehead
+Spatio-Spatial Temporal Network (FH-SSTNet), which utilizes a 3D CNN
+architecture with triplet loss to capture distinguishing features. We enhance
+the model's discrimination capability using Arcloss in the network's head.
+Experimentation on the Forehead Creases version 1 (FH-V1) dataset, containing
+247 unique subjects, demonstrates the superior performance of FH-SSTNet
+compared to existing methods and pre-trained CNNs like ResNet50, especially for
+forehead-based user verification, confirming its effectiveness in identity
+authentication.
+
+
+ comment: 6 pages, 5 Figure, IWBF conference +
+
+
+
+
+ + ☆ From Discrete to Continuous: Deep Fair Clustering With Transferable + Representations + + +
+ We consider the problem of deep fair clustering, which partitions data into +clusters via the representations extracted by deep neural networks while hiding +sensitive data attributes. To achieve fairness, existing methods present a +variety of fairness-related objective functions based on the group fairness +criterion. However, these works typically assume that the sensitive attributes +are discrete and do not work for continuous sensitive variables, such as the +proportion of the female population in an area. Besides, the potential of the +representations learned from clustering tasks to improve performance on other +tasks is ignored by existing works. In light of these limitations, we propose a +flexible deep fair clustering method that can handle discrete and continuous +sensitive attributes simultaneously. Specifically, we design an information +bottleneck style objective function to learn fair and clustering-friendly +representations. Furthermore, we explore for the first time the transferability +of the extracted representations to other downstream tasks. Unlike existing +works, we impose fairness at the representation level, which could guarantee +fairness for the transferred task regardless of clustering results. To verify +the effectiveness of the proposed method, we perform extensive experiments on +datasets with discrete and continuous sensitive attributes, demonstrating the +advantage of our method in comparison with state-of-the-art methods. + +
+
+
+
+
+ + ☆ Diffusion Model is a Good Pose Estimator from 3D RF-Vision + + +
+ Human pose estimation (HPE) from Radio Frequency vision (RF-vision) performs +human sensing using RF signals that penetrate obstacles without revealing +privacy (e.g., facial information). Recently, mmWave radar has emerged as a +promising RF-vision sensor, providing radar point clouds by processing RF +signals. However, the mmWave radar has a limited resolution with severe noise, +leading to inaccurate and inconsistent human pose estimation. This work +proposes mmDiff, a novel diffusion-based pose estimator tailored for noisy +radar data. Our approach aims to provide reliable guidance as conditions to +diffusion models. Two key challenges are addressed by mmDiff: (1) +miss-detection of parts of human bodies, which is addressed by a module that +isolates feature extraction from different body parts, and (2) signal +inconsistency due to environmental interference, which is tackled by +incorporating prior knowledge of body structure and motion. Several modules are +designed to achieve these goals, whose features work as the conditions for the +subsequent diffusion model, eliminating the miss-detection and instability of +HPE based on RF-vision. Extensive experiments demonstrate that mmDiff +outperforms existing methods significantly, achieving state-of-the-art +performances on public datasets. + +
+
+
+
+
+ + ☆ Pose-Guided Self-Training with Two-Stage Clustering for Unsupervised + Landmark Discovery CVPR 2024 + + +
+ Unsupervised landmarks discovery (ULD) for an object category is a +challenging computer vision problem. In pursuit of developing a robust ULD +framework, we explore the potential of a recent paradigm of self-supervised +learning algorithms, known as diffusion models. Some recent works have shown +that these models implicitly contain important correspondence cues. Towards +harnessing the potential of diffusion models for the ULD task, we make the +following core contributions. First, we propose a ZeroShot ULD baseline based +on simple clustering of random pixel locations with nearest neighbour matching. +It delivers better results than existing ULD methods. Second, motivated by the +ZeroShot performance, we develop a ULD algorithm based on diffusion features +using self-training and clustering which also outperforms prior methods by +notable margins. Third, we introduce a new proxy task based on generating +latent pose codes and also propose a two-stage clustering mechanism to +facilitate effective pseudo-labeling, resulting in a significant performance +improvement. Overall, our approach consistently outperforms state-of-the-art +methods on four challenging benchmarks AFLW, MAFL, CatHeads and LS3D by +significant margins. + +
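+
+ The ZeroShot baseline above ("simple clustering of random pixel locations
+with nearest neighbour matching") lends itself to a short sketch, assuming a
+dense per-pixel feature map is already extracted (e.g., from a diffusion
+model). The sampling, clustering, and matching details below are simplified
+guesses, not the authors' implementation.
+
+```python
+import numpy as np
+from sklearn.cluster import KMeans
+
+def zeroshot_landmarks(ref_feat_map, feat_maps, num_landmarks=8,
+                       num_samples=500, seed=0):
+    """ref_feat_map, feat_maps[i]: (H, W, C) dense features. Cluster features
+    at random pixels of the reference image into landmark prototypes, then
+    locate each landmark in other images by nearest-neighbour matching."""
+    rng = np.random.default_rng(seed)
+    h, w, c = ref_feat_map.shape
+    ys, xs = rng.integers(0, h, num_samples), rng.integers(0, w, num_samples)
+    samples = ref_feat_map[ys, xs]                                # (N, C)
+    protos = KMeans(n_clusters=num_landmarks,
+                    n_init=10).fit(samples).cluster_centers_
+
+    coords_per_image = []
+    for fm in feat_maps:
+        flat = fm.reshape(-1, c)
+        d = ((flat[None] - protos[:, None]) ** 2).sum(-1)   # (K, H*W)
+        idx = d.argmin(axis=1)
+        coords_per_image.append(
+            np.stack([idx // fm.shape[1], idx % fm.shape[1]], axis=1))
+    return coords_per_image          # list of (num_landmarks, 2) (row, col)
+```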
+
+ comment: Accepted in CVPR 2024 +
+
+
+
+
+ + ☆ Cross-domain Multi-modal Few-shot Object Detection via Rich Text + + +
+ Cross-modal feature extraction and integration have led to steady performance
+improvements in few-shot learning tasks due to generating richer features.
+However, existing multi-modal object detection (MM-OD) methods degrade when
+facing significant domain shift and insufficient samples. We hypothesize
+that rich text information could more effectively help the model to build a
+knowledge relationship between the vision instance and its language description
+and can help mitigate domain shift. Specifically, we study the Cross-Domain
+few-shot generalization of MM-OD (CDMM-FSOD) and propose a meta-learning based
+multi-modal few-shot object detection method that utilizes rich text semantic
+information as an auxiliary modality to achieve domain adaptation in the
+context of FSOD. Our proposed network contains (i) a multi-modal feature
+aggregation module that aligns the vision and language support feature
+embeddings and (ii) a rich text semantic rectify module that utilizes
+bidirectional text feature generation to reinforce multi-modal feature
+alignment and thus to enhance the model's language understanding capability. We
+evaluate our model on common standard cross-domain object detection datasets
+and demonstrate that our approach considerably outperforms existing FSOD
+methods.
+
+
+
+
+
+ + ☆ Improving Scene Graph Generation with Relation Words' Debiasing in + Vision-Language Models + + +
+ Scene Graph Generation (SGG) provides a basic language representation of
+visual scenes, requiring models to grasp complex and diverse semantics between
+various objects. However, this complexity and diversity in SGG also leads to
+underrepresentation, where some test triplets are rare or even unseen during
+training, resulting in imprecise predictions. To tackle this, we propose using
+SGG models with pretrained vision-language models (VLMs) to enhance
+representation. However, due to the gap between the pretraining and SGG,
+directly ensembling the pretrained VLMs leads to severe biases across relation
+words. Thus, we introduce LM Estimation to approximate the word distribution
+underlying the pretraining language sets, and then use this distribution for
+debiasing. After that, we ensemble the VLMs with SGG models to enhance
+representation. Considering that each model may perform better on different
+samples, we use a certainty-aware indicator to score each sample and
+dynamically adjust the ensemble weights. Our method effectively addresses the
+relation-word biases, enhances SGG's representation, and achieves notable
+performance gains. It is training-free and integrates well with existing SGG
+models.
+
+
+
+
+
+ + ☆ EgoExoLearn: A Dataset for Bridging Asynchronous Ego- and Exo-centric + View of Procedural Activities in Real World CVPR 2024 + + +
+ Being able to map the activities of others into one's own point of view is +one fundamental human skill even from a very early age. Taking a step toward +understanding this human ability, we introduce EgoExoLearn, a large-scale +dataset that emulates the human demonstration following process, in which +individuals record egocentric videos as they execute tasks guided by +demonstration videos. Focusing on the potential applications in daily +assistance and professional support, EgoExoLearn contains egocentric and +demonstration video data spanning 120 hours captured in daily life scenarios +and specialized laboratories. Along with the videos we record high-quality gaze +data and provide detailed multimodal annotations, formulating a playground for +modeling the human ability to bridge asynchronous procedural actions from +different viewpoints. To this end, we present benchmarks such as cross-view +association, cross-view action planning, and cross-view referenced skill +assessment, along with detailed analysis. We expect EgoExoLearn can serve as an +important resource for bridging the actions across views, thus paving the way +for creating AI agents capable of seamlessly learning by observing humans in +the real world. Code and data can be found at: +https://github.com/OpenGVLab/EgoExoLearn + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Enhancing MRI-Based Classification of Alzheimer's Disease with + Explainable 3D Hybrid Compact Convolutional Transformers + + +
+ Alzheimer's disease (AD), characterized by progressive cognitive decline and +memory loss, presents a formidable global health challenge, underscoring the +critical importance of early and precise diagnosis for timely interventions and +enhanced patient outcomes. While MRI scans provide valuable insights into brain +structures, traditional analysis methods often struggle to discern intricate 3D +patterns crucial for AD identification. Addressing this challenge, we introduce +an alternative end-to-end deep learning model, the 3D Hybrid Compact +Convolutional Transformers 3D (HCCT). By synergistically combining +convolutional neural networks (CNNs) and vision transformers (ViTs), the 3D +HCCT adeptly captures both local features and long-range relationships within +3D MRI scans. Extensive evaluations on prominent AD benchmark dataset, ADNI, +demonstrate the 3D HCCT's superior performance, surpassing state of the art CNN +and transformer-based methods in classification accuracy. Its robust +generalization capability and interpretability marks a significant stride in AD +classification from 3D MRI scans, promising more accurate and reliable +diagnoses for improved patient care and superior clinical outcomes. + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ Fusion of Minutia Cylinder Codes and Minutia Patch Embeddings for Latent + Fingerprint Recognition + + +
+ Latent fingerprints are one of the most widely used forms of forensic
+evidence by law enforcement agencies. However, latent recognition performance
+is far from the exemplary performance of sensor fingerprint recognition due to
+deformations and artifacts within these images. In this study, we propose a
+fusion-based local matching approach towards latent fingerprint recognition.
+Recent latent recognition studies have typically relied on local descriptor
+generation methods, in which either handcrafted minutiae features or deep
+neural network features are extracted around a minutia of interest in the
+latent recognition process. The proposed approach integrates these handcrafted
+features with recently proposed deep neural network embedding features in a
+multi-stage fusion approach to significantly improve latent recognition
+results. The effectiveness of the proposed approach has been shown on several
+public and private data sets. As demonstrated in our experimental results, the
+proposed method considerably improves rank-1 identification accuracy on
+real-world datasets when compared to either the single usage of these features
+or existing state-of-the-art methods in the literature.
+
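+
+ The fusion idea above can be illustrated at the score level: normalize the
+handcrafted (minutia cylinder code) and deep-embedding similarity scores and
+combine them with a weight. The paper's fusion is multi-stage and operates on
+local matches, so the weighted sum below is only a minimal stand-in.
+
+```python
+import numpy as np
+
+def minmax(scores):
+    s = np.asarray(scores, dtype=float)
+    return (s - s.min()) / (s.max() - s.min() + 1e-8)
+
+def fused_scores(mcc_scores, embedding_scores, w=0.5):
+    """Score-level fusion of handcrafted (MCC) and deep-embedding similarity
+    scores of one latent print against a gallery."""
+    return w * minmax(mcc_scores) + (1 - w) * minmax(embedding_scores)
+
+# toy usage: one latent print scored against a gallery of five candidates
+mcc = [0.31, 0.48, 0.12, 0.40, 0.09]
+emb = [0.55, 0.61, 0.20, 0.75, 0.18]
+print(fused_scores(mcc, emb).argmax())   # index of the rank-1 candidate
+```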
+
+ comment: 9 pages,7 figures, 4 tables +
+
+
+
+
+ + ☆ Gaze-guided Hand-Object Interaction Synthesis: Benchmark and Method + + +
+ Gaze plays a crucial role in revealing human attention and intention, +shedding light on the cognitive processes behind human actions. The integration +of gaze guidance with the dynamics of hand-object interactions boosts the +accuracy of human motion prediction. However, the lack of datasets that capture +the intricate relationship and consistency among gaze, hand, and object +movements remains a substantial hurdle. In this paper, we introduce the first +Gaze-guided Hand-Object Interaction dataset, GazeHOI, and present a novel task +for synthesizing gaze-guided hand-object interactions. Our dataset, GazeHOI, +features simultaneous 3D modeling of gaze, hand, and object interactions, +comprising 479 sequences with an average duration of 19.1 seconds, 812 +sub-sequences, and 33 objects of various sizes. We propose a hierarchical +framework centered on a gaze-guided hand-object interaction diffusion model, +named GHO-Diffusion. In the pre-diffusion phase, we separate gaze conditions +into spatial-temporal features and goal pose conditions at different levels of +information granularity. During the diffusion phase, two gaze-conditioned +diffusion models are stacked to simplify the complex synthesis of hand-object +motions. Here, the object motion diffusion model generates sequences of object +motions based on gaze conditions, while the hand motion diffusion model +produces hand motions based on the generated object motion. To improve +fine-grained goal pose alignment, we introduce a Spherical Gaussian constraint +to guide the denoising step. In the subsequent post-diffusion phase, we +optimize the generated hand motions using contact consistency. Our extensive +experiments highlight the uniqueness of our dataset and the effectiveness of +our approach. + +
+
+
+
+
+ + ☆ Exploiting Semantic Reconstruction to Mitigate Hallucinations in + Vision-Language Models + + +
+ Hallucinations in vision-language models pose a significant challenge to +their reliability, particularly in the generation of long captions. Current +methods fall short of accurately identifying and mitigating these +hallucinations. To address this issue, we introduce ESREAL, a novel +unsupervised learning framework designed to suppress the generation of +hallucinations through accurate localization and penalization of hallucinated +tokens. Initially, ESREAL creates a reconstructed image based on the generated +caption and aligns its corresponding regions with those of the original image. +This semantic reconstruction aids in identifying both the presence and type of +token-level hallucinations within the generated caption. Subsequently, ESREAL +computes token-level hallucination scores by assessing the semantic similarity +of aligned regions based on the type of hallucination. Finally, ESREAL employs +a proximal policy optimization algorithm, where it selectively penalizes +hallucinated tokens according to their token-level hallucination scores. Our +framework notably reduces hallucinations in LLaVA, InstructBLIP, and mPLUG-Owl2 +by 32.81%, 27.08%, and 7.46% on the CHAIR metric. This improvement is achieved +solely through signals derived from the image itself, without the need for any +image-text pairs. + +
+
+
+
+
+ + ☆ Towards Online Real-Time Memory-based Video Inpainting Transformers + + +
+ Video inpainting tasks have seen significant improvements in recent years +with the rise of deep neural networks and, in particular, vision transformers. +Although these models show promising reconstruction quality and temporal +consistency, they are still unsuitable for live videos, one of the last steps +to make them completely convincing and usable. The main limitations are that +these state-of-the-art models inpaint using the whole video (offline +processing) and show an insufficient frame rate. In our approach, we propose a +framework to adapt existing inpainting transformers to these constraints by +memorizing and refining redundant computations while maintaining a decent +inpainting quality. Using this framework with some of the most recent +inpainting models, we show great online results with a consistent throughput +above 20 frames per second. The code and pretrained models will be made +available upon acceptance. + +
+
+
+
+
+ + ☆ Realtime Robust Shape Estimation of Deformable Linear Object ICRA 2024 + + +
+ Realtime shape estimation of continuum objects and manipulators is essential +for developing accurate planning and control paradigms. The existing methods +that create dense point clouds from camera images, and/or use distinguishable +markers on a deformable body have limitations in realtime tracking of large +continuum objects/manipulators. The physical occlusion of markers can often +compromise accurate shape estimation. We propose a robust method to estimate +the shape of linear deformable objects in realtime using scattered and +unordered key points. By utilizing a robust probability-based labeling +algorithm, our approach identifies the true order of the detected key points +and then reconstructs the shape using piecewise spline interpolation. The +approach only relies on knowing the number of the key points and the interval +between two neighboring points. We demonstrate the robustness of the method +when key points are partially occluded. The proposed method is also integrated +into a simulation in Unity for tracking the shape of a cable with a length of +1m and a radius of 5mm. The simulation results show that our proposed approach +achieves an average length error of 1.07% over the continuum's centerline and +an average cross-section error of 2.11mm. The real-world experiments of +tracking and estimating a heavy-load cable prove that the proposed approach is +robust under occlusion and complex entanglement scenarios. + +
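+
+ The reconstruction step above can be sketched in a few lines: put the
+detected key points in order (a greedy nearest-neighbour chain is used here as
+a crude stand-in for the paper's probability-based labeling) and fit a
+piecewise spline through them with SciPy. The helper names and parameters are
+illustrative.
+
+```python
+import numpy as np
+from scipy.interpolate import splprep, splev
+
+def order_keypoints(points, start_idx=0):
+    """Greedy nearest-neighbour ordering of unordered key points; the paper
+    instead infers the true order with a robust probabilistic labeling."""
+    remaining = list(range(len(points)))
+    order = [remaining.pop(start_idx)]
+    while remaining:
+        last = points[order[-1]]
+        nxt = min(remaining, key=lambda i: np.linalg.norm(points[i] - last))
+        remaining.remove(nxt)
+        order.append(nxt)
+    return points[order]
+
+def spline_centerline(ordered_points, num_samples=200, smooth=0.0):
+    """Piecewise spline interpolation through ordered 3D key points."""
+    tck, _ = splprep(ordered_points.T, s=smooth)
+    u = np.linspace(0.0, 1.0, num_samples)
+    return np.stack(splev(u, tck), axis=1)   # (num_samples, 3) centerline
+
+# toy usage: a shuffled set of 12 points along a curve
+pts = np.c_[np.linspace(0, 1, 12), np.sin(np.linspace(0, 3, 12)), np.zeros(12)]
+centerline = spline_centerline(order_keypoints(np.random.permutation(pts)))
+```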
+
+ comment: This paper has been accepted to IEEE ICRA 2024 as a contributed paper +
+
+
+
+
+ + ☆ CFAT: Unleashing TriangularWindows for Image Super-resolution CVPR 2024 + + +
+ Transformer-based models have revolutionized the field of image
+super-resolution (SR) by harnessing their inherent ability to capture complex
+contextual features. The overlapping rectangular shifted window technique used
+in transformer architectures nowadays is a common practice in super-resolution
+models to improve the quality and robustness of image upscaling. However, it
+suffers from distortion at the boundaries and has limited unique shifting
+modes. To overcome these weaknesses, we propose a non-overlapping triangular
+window technique that synchronously works with the rectangular one to mitigate
+boundary-level distortion and allows the model to access more unique shifting
+modes. In this paper, we propose a Composite Fusion Attention Transformer
+(CFAT) that incorporates triangular-rectangular window-based local attention
+with a channel-based global attention technique in image super-resolution. As a
+result, CFAT enables attention mechanisms to be activated on more image pixels
+and captures long-range, multi-scale features to improve SR performance. The
+extensive experimental results and ablation study demonstrate the effectiveness
+of CFAT in the SR domain. Our proposed model shows a significant 0.7 dB
+performance improvement over other state-of-the-art SR architectures.
+
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Entity-NeRF: Detecting and Removing Moving Entities in Urban Scenes CVPR 2024 + + +
+ Recent advancements in the study of Neural Radiance Fields (NeRF) for dynamic +scenes often involve explicit modeling of scene dynamics. However, this +approach faces challenges in modeling scene dynamics in urban environments, +where moving objects of various categories and scales are present. In such +settings, it becomes crucial to effectively eliminate moving objects to +accurately reconstruct static backgrounds. Our research introduces an +innovative method, termed here as Entity-NeRF, which combines the strengths of +knowledge-based and statistical strategies. This approach utilizes entity-wise +statistics, leveraging entity segmentation and stationary entity classification +through thing/stuff segmentation. To assess our methodology, we created an +urban scene dataset masked with moving objects. Our comprehensive experiments +demonstrate that Entity-NeRF notably outperforms existing techniques in +removing moving objects and reconstructing static urban backgrounds, both +quantitatively and qualitatively. + +
+
+ comment: Accepted by IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR 2024), Project website: + https://otonari726.github.io/entitynerf/ +
+
+
+
+
+ + ☆ Salience DETR: Enhancing Detection Transformer with Hierarchical + Salience Filtering Refinement CVPR 2024 + + +
+ DETR-like methods have significantly increased detection performance in an +end-to-end manner. The mainstream two-stage frameworks of them perform dense +self-attention and select a fraction of queries for sparse cross-attention, +which is proven effective for improving performance but also introduces a heavy +computational burden and high dependence on stable query selection. This paper +demonstrates that suboptimal two-stage selection strategies result in scale +bias and redundancy due to the mismatch between selected queries and objects in +two-stage initialization. To address these issues, we propose hierarchical +salience filtering refinement, which performs transformer encoding only on +filtered discriminative queries, for a better trade-off between computational +efficiency and precision. The filtering process overcomes scale bias through a +novel scale-independent salience supervision. To compensate for the semantic +misalignment among queries, we introduce elaborate query refinement modules for +stable two-stage initialization. Based on above improvements, the proposed +Salience DETR achieves significant improvements of +4.0% AP, +0.2% AP, +4.4% AP +on three challenging task-specific detection datasets, as well as 49.2% AP on +COCO 2017 with less FLOPs. The code is available at +https://github.com/xiuqhou/Salience-DETR. + +
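+
+ The core trade-off above (encode only filtered, discriminative queries) can
+be pictured with a small sketch: score tokens with a salience head, keep the
+top-k, and run the expensive transformer encoder only on those. The two-stage
+refinement modules and the scale-independent supervision are omitted; names
+below are placeholders.
+
+```python
+import torch
+
+def encode_salient_queries(tokens, salience_head, encoder, keep_ratio=0.3):
+    """tokens: (B, N, C) flattened multi-scale features. Encode only the
+    top-k most salient tokens and scatter the refined ones back."""
+    b, n, c = tokens.shape
+    scores = salience_head(tokens).squeeze(-1)            # (B, N)
+    k = max(1, int(n * keep_ratio))
+    topk = scores.topk(k, dim=1).indices                  # (B, k)
+    gathered = torch.gather(tokens, 1, topk[..., None].expand(-1, -1, c))
+    refined = encoder(gathered)                           # (B, k, C)
+    out = tokens.clone()
+    out.scatter_(1, topk[..., None].expand(-1, -1, c), refined)
+    return out, topk
+
+# toy modules standing in for the real salience head and encoder
+salience_head = torch.nn.Linear(256, 1)
+encoder = torch.nn.TransformerEncoder(
+    torch.nn.TransformerEncoderLayer(d_model=256, nhead=8, batch_first=True),
+    num_layers=2)
+out, kept = encode_salient_queries(torch.randn(2, 1000, 256),
+                                   salience_head, encoder)
+```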
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Enhancing Video Transformers for Action Understanding with VLM-aided + Training + + +
+ Owing to their ability to extract relevant spatio-temporal video embeddings, +Vision Transformers (ViTs) are currently the best performing models in video +action understanding. However, their generalization over domains or datasets is +somewhat limited. In contrast, Visual Language Models (VLMs) have demonstrated +exceptional generalization performance, but are currently unable to process +videos. Consequently, they cannot extract spatio-temporal patterns that are +crucial for action understanding. In this paper, we propose the Four-tiered +Prompts (FTP) framework that takes advantage of the complementary strengths of +ViTs and VLMs. We retain ViTs' strong spatio-temporal representation ability +but improve the visual encodings to be more comprehensive and general by +aligning them with VLM outputs. The FTP framework adds four feature processors +that focus on specific aspects of human action in videos: action category, +action components, action description, and context information. The VLMs are +only employed during training, and inference incurs a minimal computation cost. +Our approach consistently yields state-of-the-art performance. For instance, we +achieve remarkable top-1 accuracy of 93.8% on Kinetics-400 and 83.4% on +Something-Something V2, surpassing VideoMAEv2 by 2.8% and 2.6%, respectively. + +
+
+
+
+
+ + ☆ Enhancing Visual Continual Learning with Language-Guided Supervision CVPR 2024 + + +
+ Continual learning (CL) aims to empower models to learn new tasks without +forgetting previously acquired knowledge. Most prior works concentrate on the +techniques of architectures, replay data, regularization, \etc. However, the +category name of each class is largely neglected. Existing methods commonly +utilize the one-hot labels and randomly initialize the classifier head. We +argue that the scarce semantic information conveyed by the one-hot labels +hampers the effective knowledge transfer across tasks. In this paper, we +revisit the role of the classifier head within the CL paradigm and replace the +classifier with semantic knowledge from pretrained language models (PLMs). +Specifically, we use PLMs to generate semantic targets for each class, which +are frozen and serve as supervision signals during training. Such targets fully +consider the semantic correlation between all classes across tasks. Empirical +studies show that our approach mitigates forgetting by alleviating +representation drifting and facilitating knowledge transfer across tasks. The +proposed method is simple to implement and can seamlessly be plugged into +existing methods with negligible adjustments. Extensive experiments based on +eleven mainstream baselines demonstrate the effectiveness and generalizability +of our approach to various protocols. For example, under the class-incremental +learning setting on ImageNet-100, our method significantly improves the Top-1 +accuracy by 3.2\% to 6.1\% while reducing the forgetting rate by 2.6\% to +13.1\%. + +
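+
+ The classifier replacement described above can be sketched as follows: class
+names are embedded once with a pretrained language model, frozen, and used as
+targets; logits are scaled cosine similarities between visual features and
+those targets. Random placeholders stand in for the PLM embeddings, and the
+temperature is an assumed detail.
+
+```python
+import torch
+import torch.nn.functional as F
+
+class SemanticTargetHead(torch.nn.Module):
+    """Frozen class-name embeddings from a PLM replace the learnable
+    classifier; logits are scaled cosine similarities."""
+    def __init__(self, class_name_embeddings, temperature=0.07):
+        super().__init__()
+        self.register_buffer("targets",
+                             F.normalize(class_name_embeddings, dim=1))
+        self.temperature = temperature
+
+    def forward(self, visual_feats):
+        visual_feats = F.normalize(visual_feats, dim=1)
+        return visual_feats @ self.targets.T / self.temperature
+
+# placeholder embeddings; in practice these would come from a PLM (e.g. BERT)
+head = SemanticTargetHead(torch.randn(100, 512))
+logits = head(torch.randn(32, 512))
+loss = F.cross_entropy(logits, torch.randint(0, 100, (32,)))
+```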
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Self-Supervised Multi-Frame Neural Scene Flow + + +
+ Neural Scene Flow Prior (NSFP) and Fast Neural Scene Flow (FNSF) have shown +remarkable adaptability in the context of large out-of-distribution autonomous +driving. Despite their success, the underlying reasons for their astonishing +generalization capabilities remain unclear. Our research addresses this gap by +examining the generalization capabilities of NSFP through the lens of uniform +stability, revealing that its performance is inversely proportional to the +number of input point clouds. This finding sheds light on NSFP's effectiveness +in handling large-scale point cloud scene flow estimation tasks. Motivated by +such theoretical insights, we further explore the improvement of scene flow +estimation by leveraging historical point clouds across multiple frames, which +inherently increases the number of point clouds. Consequently, we propose a +simple and effective method for multi-frame point cloud scene flow estimation, +along with a theoretical evaluation of its generalization abilities. Our +analysis confirms that the proposed method maintains a limited generalization +error, suggesting that adding multiple frames to the scene flow optimization +process does not detract from its generalizability. Extensive experimental +results on large-scale autonomous driving Waymo Open and Argoverse lidar +datasets demonstrate that the proposed method achieves state-of-the-art +performance. + +
+
+
+
+
+ + ☆ Opportunities and challenges in the application of large artificial + intelligence models in radiology + + +
+ Spurred by ChatGPT, large artificial intelligence (AI) models have seen a +global upsurge in research and development. As these models become more +convenient to use, an increasing number of large models for specialized fields +are being proposed, especially in the field of radiology imaging. This article +first introduces the development history of large models, their technical +details and workflow, and the working principles of multimodal large models and +video generation large models. Secondly, we summarize the latest research +progress of large AI models in radiology education, radiology report +generation, and unimodal and multimodal radiology applications. Finally, the +paper summarizes some of the challenges of large AI models in radiology, with +the aim of better promoting rapid advances in the field of radiography. + +
+
+
+
+
+ + ☆ EVA: Zero-shot Accurate Attributes and Multi-Object Video Editing + + +
+ Current diffusion-based video editing primarily focuses on local editing +(e.g., object/background editing) or global style editing by utilizing various +dense correspondences. However, these methods often fail to accurately edit the +foreground and background simultaneously while preserving the original layout. +We find that the crux of the issue stems from the imprecise distribution of +attention weights across designated regions, including inaccurate +text-to-attribute control and attention leakage. To tackle this issue, we +introduce EVA, a zero-shot and multi-attribute video editing framework tailored +for human-centric videos with complex motions. We incorporate a +Spatial-Temporal Layout-Guided Attention mechanism that leverages the intrinsic +positive and negative correspondences of cross-frame diffusion features. To +avoid attention leakage, we utilize these correspondences to boost the +attention scores of tokens within the same attribute across all video frames +while limiting interactions between tokens of different attributes in the +self-attention layer. For precise text-to-attribute manipulation, we use +discrete text embeddings focused on specific layout areas within the +cross-attention layer. Benefiting from the precise attention weight +distribution, EVA can be easily generalized to multi-object editing scenarios +and achieves accurate identity mapping. Extensive experiments demonstrate EVA +achieves state-of-the-art results in real-world scenarios. Full results are +provided at https://knightyxp.github.io/EVA/ + +
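+ The attention-leakage countermeasure described above can be illustrated with a minimal self-attention mask that only lets tokens attend to tokens carrying the same attribute/layout label; the token-to-attribute assignment is assumed to be given, and this is a simplification rather than the authors' implementation.

```python
# Simplified attribute-masked self-attention; not the authors' code.
import torch

def attribute_masked_attention(q, k, v, attr_ids):
    """q, k, v: [tokens, dim]; attr_ids: [tokens] integer attribute labels."""
    scores = (q @ k.t()) / q.shape[-1] ** 0.5
    same_attr = attr_ids[:, None] == attr_ids[None, :]
    # Block interactions between tokens of different attributes (leakage).
    scores = scores.masked_fill(~same_attr, float("-inf"))
    return torch.softmax(scores, dim=-1) @ v

t, d = 8, 32
out = attribute_masked_attention(torch.randn(t, d), torch.randn(t, d),
                                 torch.randn(t, d),
                                 torch.tensor([0, 0, 1, 1, 1, 2, 2, 2]))
print(out.shape)  # torch.Size([8, 32])
```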
+
+ comment: Project page: https://knightyxp.github.io/EVA +
+
+
+
+
+ + ☆ CG-SLAM: Efficient Dense RGB-D SLAM in a Consistent Uncertainty-aware 3D + Gaussian Field + + +
+ Recently neural radiance fields (NeRF) have been widely exploited as 3D +representations for dense simultaneous localization and mapping (SLAM). Despite +their notable successes in surface modeling and novel view synthesis, existing +NeRF-based methods are hindered by their computationally intensive and +time-consuming volume rendering pipeline. This paper presents an efficient +dense RGB-D SLAM system, i.e., CG-SLAM, based on a novel uncertainty-aware 3D +Gaussian field with high consistency and geometric stability. Through an +in-depth analysis of Gaussian Splatting, we propose several techniques to +construct a consistent and stable 3D Gaussian field suitable for tracking and +mapping. Additionally, a novel depth uncertainty model is proposed to ensure +the selection of valuable Gaussian primitives during optimization, thereby +improving tracking efficiency and accuracy. Experiments on various datasets +demonstrate that CG-SLAM achieves superior tracking and mapping performance +with a notable tracking speed of up to 15 Hz. We will make our source code +publicly available. Project page: https://zju3dv.github.io/cg-slam. + +
+
+ comment: Project Page: https://zju3dv.github.io/cg-slam +
+
+
+
+
+ + ☆ Are NeRFs ready for autonomous driving? Towards closing the + real-to-simulation gap + + +
+ Neural Radiance Fields (NeRFs) have emerged as promising tools for advancing +autonomous driving (AD) research, offering scalable closed-loop simulation and +data augmentation capabilities. However, to trust the results achieved in +simulation, one needs to ensure that AD systems perceive real and rendered data +in the same way. Although the performance of rendering methods is increasing, +many scenarios will remain inherently challenging to reconstruct faithfully. To +this end, we propose a novel perspective for addressing the real-to-simulated +data gap. Rather than solely focusing on improving rendering fidelity, we +explore simple yet effective methods to enhance perception model robustness to +NeRF artifacts without compromising performance on real data. Moreover, we +conduct the first large-scale investigation into the real-to-simulated data gap +in an AD setting using a state-of-the-art neural rendering technique. +Specifically, we evaluate object detectors and an online mapping model on real +and simulated data, and study the effects of different pre-training strategies. +Our results show notable improvements in model robustness to simulated data, +even improving real-world performance in some cases. Last, we delve into the +correlation between the real-to-simulated gap and image reconstruction metrics, +identifying FID and LPIPS as strong indicators. + +
+
+
+
+
+ + ☆ PKU-DyMVHumans: A Multi-View Video Benchmark for High-Fidelity Dynamic + Human Modeling + + +
+ High-quality human reconstruction and photo-realistic rendering of dynamic +scenes are long-standing problems in computer vision and graphics. Despite +considerable efforts invested in developing various capture systems and +reconstruction algorithms, recent advancements still struggle with loose or +oversized clothing and overly complex poses. In part, this is due to the +challenges of acquiring high-quality human datasets. To facilitate the +development of these fields, in this paper, we present PKU-DyMVHumans, a +versatile human-centric dataset for high-fidelity reconstruction and rendering +of dynamic human scenarios from dense multi-view videos. It comprises 8.2 +million frames captured by more than 56 synchronized cameras across diverse +scenarios. These sequences comprise 32 human subjects across 45 different +scenarios, each with highly detailed appearance and realistic human motion. +Inspired by recent advancements in neural radiance field (NeRF)-based scene +representations, we carefully set up an off-the-shelf framework into which +state-of-the-art NeRF-based implementations can easily be plugged and +benchmarked on the PKU-DyMVHumans dataset. This paves the way for various +applications such as fine-grained foreground/background decomposition, +high-quality human reconstruction, and photo-realistic novel view synthesis of +dynamic scenes. Extensive studies are performed on the benchmark, demonstrating +new observations and challenges that emerge from using such high-fidelity +dynamic data. The dataset is available at: https://pku-dymvhumans.github.io. + +
+
+
+
+
+ + ☆ Landmark-Guided Cross-Speaker Lip Reading with Mutual Information + Regularization LREC + + +
+ Lip reading, the process of interpreting silent speech from visual lip +movements, has gained increasing attention for its wide range of practical +applications. Deep learning approaches have greatly improved current lip +reading systems. However, lip reading in cross-speaker scenarios, where the +speaker identity changes, poses a challenging problem due to inter-speaker +variability. A well-trained lip reading system may perform poorly when handling +a brand new speaker. To learn a speaker-robust lip reading model, a key insight +is to reduce visual variations across speakers, preventing the model from +overfitting to specific speakers. In this work, considering both the input +visual clues and the latent representations of a hybrid CTC/attention +architecture, we propose to exploit lip landmark-guided fine-grained visual +clues instead of the frequently used mouth-cropped images as input features, +diminishing speaker-specific appearance characteristics. Furthermore, a max-min +mutual information regularization approach is proposed to capture +speaker-insensitive latent representations. Experimental evaluations on public +lip reading datasets demonstrate the effectiveness of the proposed approach +under both intra-speaker and inter-speaker conditions. + +
+
+ comment: To appear in LREC-COLING 2024 +
+
+
+
+
+ + ☆ Robust Diffusion Models for Adversarial Purification + + +
+ Adversarial purification (AP) based on diffusion models (DMs) has been shown +to be the most powerful alternative to adversarial training (AT). However, +these methods neglect the fact that pre-trained diffusion models are themselves +not robust to adversarial attacks. Additionally, the diffusion process can +easily destroy semantic information, so the reverse process may generate a +high-quality image that is nonetheless very different from the original input, +leading to degraded standard accuracy. A natural remedy for these issues is to +retrain or fine-tune the pre-trained diffusion model with an adversarial +training strategy, which is computationally prohibitive. We instead propose a +novel robust reverse process with adversarial guidance, which is independent of +the given pre-trained DMs and avoids retraining or fine-tuning them. This +robust guidance not only ensures that purified examples retain more semantic +content but also mitigates, for the first time, the accuracy-robustness +trade-off of DMs, and it gives DM-based AP an efficient ability to adapt to new +attacks. Extensive experiments demonstrate that our method achieves +state-of-the-art results and generalizes across different attacks. + +
+
+
+
+
+ + ☆ Segment Anything Model for Road Network Graph Extraction + + +
+ We propose SAM-Road, an adaptation of the Segment Anything Model (SAM) for +extracting large-scale, vectorized road network graphs from satellite imagery. +To predict graph geometry, we formulate it as a dense semantic segmentation +task, leveraging the inherent strengths of SAM. The image encoder of SAM is +fine-tuned to produce probability masks for roads and intersections, from which +the graph vertices are extracted via simple non-maximum suppression. To predict +graph topology, we designed a lightweight transformer-based graph neural +network, which leverages the SAM image embeddings to estimate the edge +existence probabilities between vertices. Our approach directly predicts the +graph vertices and edges for large regions without expensive and complex +post-processing heuristics, and is capable of building complete road network +graphs spanning multiple square kilometers in a matter of seconds. With its +simple, straightforward, and minimalist design, SAM-Road achieves comparable +accuracy with the state-of-the-art method RNGDet++, while being 40 times faster +on the City-scale dataset. We thus demonstrate the power of a foundational +vision model when applied to a graph learning task. The code is available at +https://github.com/htcr/sam_road. + +
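+ The vertex-extraction step mentioned above ("simple non-maximum suppression" over probability masks) can be sketched as follows; the window size and threshold are assumed values, and the random mask stands in for the fine-tuned SAM output.

```python
# Minimal NMS-style vertex extraction from a probability mask (assumed params).
import numpy as np
from scipy.ndimage import maximum_filter

def extract_vertices(prob_mask: np.ndarray, thresh: float = 0.5, window: int = 7):
    """Return (row, col) vertex coordinates: local maxima above `thresh`."""
    local_max = maximum_filter(prob_mask, size=window) == prob_mask
    return np.argwhere(local_max & (prob_mask > thresh))

mask = np.random.rand(256, 256)   # stand-in for a road/intersection mask
print(extract_vertices(mask)[:5])
```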
+
+
+
+
+ + ☆ A General and Efficient Federated Split Learning with Pre-trained Image + Transformers for Heterogeneous Data + + +
+ Federated Split Learning (FSL) is a promising distributed learning paradigm +in practice, which gathers the strengths of both Federated Learning (FL) and +Split Learning (SL) to ensure model privacy while diminishing the resource +overhead of each client, especially for large transformer models in +resource-constrained environments, e.g., the Internet of Things (IoT). However, +almost all existing works merely investigate the performance of FSL with simple +neural network models. The few efforts that incorporate Vision Transformers +(ViTs) as the model architecture train ViTs from scratch, thereby incurring +enormous training overhead on each resource-limited device. Therefore, in this +paper, we harness Pre-trained Image Transformers (PITs) as the initial model, +coined FES-PIT, to accelerate the training process and improve model +robustness. Furthermore, we propose FES-PTZO to hinder the gradient inversion +attack; it is also compatible with black-box scenarios, where gradient +information is unavailable. Concretely, FES-PTZO approximates the server +gradient by utilizing zeroth-order (ZO) optimization, which replaces backward +propagation with just one forward process. Empirically, we are the first to +provide a systematic evaluation of FSL methods with PITs on real-world +datasets, under different partial device participation, and with heterogeneous +data splits. Our experiments verify the effectiveness of our algorithms. + +
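+ To make the zeroth-order step concrete, here is a generic finite-difference gradient estimate that trades backpropagation for a perturbed forward pass; the smoothing constant and estimator form are standard ZO choices used for illustration, not necessarily the exact scheme of FES-PTZO.

```python
# Generic zeroth-order gradient estimate (illustrative, not the paper's code).
import torch

def zo_gradient(loss_fn, x: torch.Tensor, mu: float = 1e-3) -> torch.Tensor:
    """Estimate d loss / d x without backpropagation."""
    u = torch.randn_like(x)          # random probe direction
    f0 = loss_fn(x)                  # loss from the regular forward pass
    f1 = loss_fn(x + mu * u)         # one perturbed forward pass
    return (f1 - f0) / mu * u        # directional finite-difference estimate

x = torch.randn(4, 16)
loss_fn = lambda a: (a ** 2).sum()
print(zo_gradient(loss_fn, x).shape)  # torch.Size([4, 16])
```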
+
+
+
+
+ + ☆ Edit3K: Universal Representation Learning for Video Editing Components + + +
+ This paper focuses on understanding the predominant video creation pipeline, +i.e., compositional video editing with six main types of editing components, +including video effects, animation, transition, filter, sticker, and text. In +contrast to existing visual representation learning of visual materials (i.e., +images/videos), we aim to learn visual representations of editing +actions/components that are generally applied to raw materials. We start by +proposing the first large-scale dataset for editing components of video +creation, which covers about 3,094 editing components with 618,800 videos. Each +video in our dataset is rendered by various image/video materials with a single +editing component, which supports atomic visual understanding of different +editing components. It can also benefit several downstream tasks, e.g., editing +component recommendation, editing component recognition/retrieval, etc. +Existing visual representation methods perform poorly because it is difficult +to disentangle the visual appearance of editing components from raw materials. +To this end, we benchmark popular alternative solutions and propose a novel +method that learns to attend to the appearance of editing components regardless +of raw materials. Our method achieves favorable results on editing component +retrieval/recognition compared to the alternative solutions. A user study is +also conducted to show that our representations cluster visually similar +editing components better than other alternatives. Furthermore, when applied to +the transition recommendation task, our learned representations achieve +state-of-the-art results on the AutoTransition dataset. The code and dataset +will be released for academic use. + +
+
+
+
+
+ + ☆ Semantic Is Enough: Only Semantic Information For NeRF Reconstruction + + +
+ Recent research that combines implicit 3D representations with semantic +information, like Semantic-NeRF, has shown that NeRF models can perform +excellently at rendering 3D structures with semantic labels. This research aims +to extend the Semantic Neural Radiance Fields (Semantic-NeRF) model by focusing +solely on semantic output and removing the RGB output component. We reformulate +the model and its training procedure to leverage only the cross-entropy loss +between the model's semantic output and the ground-truth semantic images, +removing the colour data traditionally used in the original Semantic-NeRF +approach. We then conduct a series of identical experiments using the original +and the modified Semantic-NeRF model. Our primary objective is to observe the +impact of this modification on the performance of Semantic-NeRF, focusing on +tasks such as scene understanding, object detection, and segmentation. The +results offer valuable insights into this new way of rendering scenes and +provide an avenue for further research and development in semantic-focused 3D +scene understanding. + +
+
+
+
+
+ + ☆ V2X-Real: a Large-Scale Dataset for Vehicle-to-Everything Cooperative + Perception + + +
+ Recent advancements in Vehicle-to-Everything (V2X) technologies have enabled +autonomous vehicles to share sensing information to see through occlusions, +greatly boosting the perception capability. However, there are no real-world +datasets to facilitate the real V2X cooperative perception research -- existing +datasets either only support Vehicle-to-Infrastructure cooperation or +Vehicle-to-Vehicle cooperation. In this paper, we propose a dataset that has a +mixture of multiple vehicles and smart infrastructure simultaneously to +facilitate the V2X cooperative perception development with multi-modality +sensing data. Our V2X-Real is collected using two connected automated vehicles +and two smart infrastructures, which are all equipped with multi-modal sensors +including LiDAR sensors and multi-view cameras. The whole dataset contains 33K +LiDAR frames and 171K camera data with over 1.2M annotated bounding boxes of 10 +categories in very challenging urban scenarios. According to the collaboration +mode and ego perspective, we derive four types of datasets for Vehicle-Centric, +Infrastructure-Centric, Vehicle-to-Vehicle, and +Infrastructure-to-Infrastructure cooperative perception. Comprehensive +multi-class multi-agent benchmarks of SOTA cooperative perception methods are +provided. The V2X-Real dataset and benchmark codes will be released. + +
+
+
+
+
+ + ☆ Exploring the Impact of Dataset Bias on Dataset Distillation + + +
+ Dataset Distillation (DD) is a promising technique to synthesize a smaller +dataset that preserves essential information from the original dataset. This +synthetic dataset can serve as a substitute for the original large-scale one, +and help alleviate the training workload. However, current DD methods typically +operate under the assumption that the dataset is unbiased, overlooking +potential bias issues within the dataset itself. To fill this gap, we +systematically investigate the influence of dataset bias on DD. To the best of +our knowledge, this is the first exploration in the DD domain. Given that there +are no suitable biased datasets for DD, we first construct two biased datasets, +CMNIST-DD and CCIFAR10-DD, to establish a foundation for subsequent analysis. +Then we utilize existing DD methods to generate synthetic datasets on CMNIST-DD +and CCIFAR10-DD, and evaluate their performance following the standard process. +Experiments demonstrate that biases present in the original dataset +significantly impact the performance of the synthetic dataset in most cases, +which highlights the necessity of identifying and mitigating biases in the +original datasets during DD. Finally, we reformulate DD within the context of a +biased dataset. Our code, along with the biased datasets, is available at +https://github.com/yaolu-zjut/Biased-DD. + +
+
+
+
+
+ + ☆ A Unified Module for Accelerating STABLE-DIFFUSION: LCM-LORA + + +
+ This paper presents a comprehensive study on the unified module for +accelerating stable-diffusion processes, specifically focusing on the LCM-LoRA +module. Stable-diffusion processes play a crucial role in various scientific +and engineering domains, and their acceleration is of paramount importance for +efficient computational performance. The standard iterative procedures for +solving fixed-source discrete ordinates problems often exhibit slow +convergence, particularly in optically thick scenarios. To address this +challenge, unconditionally stable diffusion-acceleration methods have been +developed, aiming to enhance the computational efficiency of transport +equations and discrete ordinates problems. This study delves into the +theoretical foundations and numerical results of unconditionally stable +diffusion synthetic acceleration methods, providing insights into their +stability and performance for model discrete ordinates problems. Furthermore, +the paper explores recent advancements in diffusion model acceleration, +including on-device acceleration of large diffusion models via GPU-aware +optimizations, highlighting the potential for significantly improved inference +latency. The results and analyses in this study provide important insights into +stable diffusion processes and have important ramifications for the creation +and application of acceleration methods, specifically the LCM-LoRA module, in a +variety of computing environments. + +
+
+
+
+
+ + ☆ RPMArt: Towards Robust Perception and Manipulation for Articulated + Objects IROS 2024 + + +
+ Articulated objects are commonly found in daily life. It is essential that +robots can exhibit robust perception and manipulation skills for articulated +objects in real-world robotic applications. However, existing methods for +articulated objects insufficiently address noise in point clouds and struggle +to bridge the gap between simulation and reality, thus limiting the practical +deployment in real-world scenarios. To tackle these challenges, we propose a +framework towards Robust Perception and Manipulation for Articulated Objects +(RPMArt), which learns to estimate the articulation parameters and manipulate +the articulation part from the noisy point cloud. Our primary contribution is a +Robust Articulation Network (RoArtNet) that is able to predict both joint +parameters and affordable points robustly by local feature learning and point +tuple voting. Moreover, we introduce an articulation-aware classification +scheme to enhance its ability for sim-to-real transfer. Finally, with the +estimated affordable point and articulation joint constraint, the robot can +generate robust actions to manipulate articulated objects. After learning only +from synthetic data, RPMArt is able to transfer zero-shot to real-world +articulated objects. Experimental results confirm our approach's effectiveness, +with our framework achieving state-of-the-art performance in both noise-added +simulation and real-world environments. The code and data will be open-sourced +for reproduction. More results are published on the project website at +https://r-pmart.github.io . + +
+
+ comment: 8 pages, 7 figures, submitted to 2024 IEEE/RSJ International + Conference on Intelligent Robots and Systems (IROS 2024), project website at + https://r-pmart.github.io +
+
+
+
+
+ + ☆ PaPr: Training-Free One-Step Patch Pruning with Lightweight ConvNets for + Faster Inference + + +
+ As deep neural networks evolve from convolutional neural networks (ConvNets) +to advanced vision transformers (ViTs), there is an increased need to eliminate +redundant data for faster processing without compromising accuracy. Previous +methods are often architecture-specific or necessitate re-training, restricting +their applicability with frequent model updates. To solve this, we first +introduce a novel property of lightweight ConvNets: their ability to identify +key discriminative patch regions in images, irrespective of a model's final +accuracy or size. We demonstrate that fully-connected layers are the primary +bottleneck for ConvNet performance, and their suppression with simple weight +recalibration markedly enhances discriminative patch localization performance. +Using this insight, we introduce PaPr, a method for substantially pruning +redundant patches with minimal accuracy loss using lightweight ConvNets across +a variety of deep learning architectures, including ViTs, ConvNets, and hybrid +transformers, without any re-training. Moreover, the simple early-stage +one-step patch pruning with PaPr enhances existing patch reduction methods. +Through extensive testing on diverse architectures, PaPr achieves significantly +higher accuracy over state-of-the-art patch reduction methods with similar FLOP +count reduction. More specifically, PaPr removes about 70% of redundant patches +in videos with less than a 0.8% drop in accuracy and achieves up to a 3.7x +FLOPs reduction, which is 15% more reduction with 2.5% higher accuracy than +these methods. + +
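+ A rough sketch of the kind of patch scoring the abstract describes is shown below: a lightweight ConvNet's feature map is pooled into a saliency grid aligned with the patch layout, and only the top-scoring patches are kept. The pooling scheme, keep ratio, and backbone are assumptions (random weights are used here so the snippet runs offline; a pretrained ConvNet would normally be used).

```python
# Illustrative patch scoring/pruning with a lightweight ConvNet (assumed design).
import torch
import torch.nn.functional as F
import torchvision

convnet = torchvision.models.mobilenet_v3_small(weights=None).features.eval()

def prune_patches(image: torch.Tensor, patch: int = 16, keep_ratio: float = 0.3):
    """image: [1, 3, H, W]. Returns indices of patches to keep for the backbone."""
    with torch.no_grad():
        fmap = convnet(image)                              # [1, C, h, w]
        saliency = fmap.abs().mean(dim=1, keepdim=True)    # [1, 1, h, w]
        grid = image.shape[-1] // patch                    # patches per side
        saliency = F.interpolate(saliency, size=(grid, grid),
                                 mode="bilinear", align_corners=False)
        scores = saliency.flatten(1)                       # [1, grid * grid]
        k = max(1, int(keep_ratio * scores.shape[1]))
        return scores.topk(k, dim=1).indices               # kept patch indices

print(prune_patches(torch.randn(1, 3, 224, 224)).shape)    # torch.Size([1, 58])
```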
+
+
+
+
+ + ☆ Fill in the ____ (a Diffusion-based Image Inpainting Pipeline) + + +
+ Image inpainting is the process of taking an image and generating lost or +intentionally occluded portions. Inpainting has countless applications, +including restoring previously damaged pictures, restoring the quality of +images that have been degraded due to compression, and removing unwanted +objects/text. Modern inpainting techniques have shown remarkable ability in +generating sensible completions for images with mask occlusions. In our paper, +we provide an overview of the progress of inpainting techniques, identify the +current leading approaches, and examine their strengths and weaknesses. We then +address a critical gap in these existing models: the ability to prompt and +control what exactly is generated. We additionally justify why we think this is +the natural next step that inpainting models must take, and provide multiple +approaches to implementing this functionality. Finally, we evaluate the results +of our approaches by qualitatively checking whether they generate high-quality +images that correctly inpaint regions with the objects they are instructed to +produce. + +
+
+
+
+
+ + ☆ SM2C: Boost the Semi-supervised Segmentation for Medical Image by using + Meta Pseudo Labels and Mixed Images + + +
+ Recently, machine learning-based semantic segmentation algorithms have +demonstrated their potential to accurately segment regions and contours in +medical images, allowing the precise location of anatomical structures and +abnormalities. Although medical images are difficult to acquire and annotate, +semi-supervised learning methods are efficient in dealing with the scarcity of +labeled data. However, overfitting is almost inevitable due to the limited +images for training. Furthermore, the intricate shapes of organs and lesions in +medical images introduce additional complexity in different cases, preventing +networks from acquiring a strong ability to generalize. To this end, we +introduce a novel method called Scaling-up Mix with Multi-Class (SM2C). This +method uses three strategies - scaling-up image size, multi-class mixing, and +object shape jittering - to improve the ability to learn semantic features +within medical images. By diversifying the shape of the segmentation objects +and enriching the semantic information within each sample, the SM2C +demonstrates its potential, especially in the training of unlabelled data. +Extensive experiments demonstrate the effectiveness of the SM2C on three +benchmark medical image segmentation datasets. The proposed framework shows +significant improvements over state-of-the-art counterparts. + +
+
+
+
+
+ + ☆ Knowledge-Enhanced Dual-stream Zero-shot Composed Image Retrieval CVPR 2024 + + +
+ We study the zero-shot Composed Image Retrieval (ZS-CIR) task, which is to +retrieve the target image given a reference image and a description without +training on the triplet datasets. Previous works generate pseudo-word tokens by +projecting the reference image features to the text embedding space. However, +they focus on the global visual representation, ignoring the representation of +detailed attributes, e.g., color, object number and layout. To address this +challenge, we propose a Knowledge-Enhanced Dual-stream zero-shot composed image +retrieval framework (KEDs). KEDs implicitly models the attributes of the +reference images by incorporating a database. The database enriches the +pseudo-word tokens by providing relevant images and captions, emphasizing +shared attribute information in various aspects. In this way, KEDs recognizes +the reference image from diverse perspectives. Moreover, KEDs adopts an extra +stream that aligns pseudo-word tokens with textual concepts, leveraging +pseudo-triplets mined from image-text pairs. The pseudo-word tokens generated +in this stream are explicitly aligned with fine-grained semantics in the text +embedding space. Extensive experiments on widely used benchmarks, i.e. +ImageNet-R, COCO object, Fashion-IQ and CIRR, show that KEDs outperforms +previous zero-shot composed image retrieval methods. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Diverse Representation Embedding for Lifelong Person Re-Identification + + +
+ Lifelong Person Re-Identification (LReID) aims to continuously learn from +successive data streams, matching individuals across multiple cameras. The key +challenge for LReID is how to effectively preserve old knowledge while learning +new information incrementally. Task-level domain gaps and limited old task +datasets are key factors leading to catastrophic forgetting in LReID, but they +are overlooked in existing methods. To alleviate this problem, we propose a +novel Diverse Representation Embedding (DRE) framework for LReID. The proposed +DRE preserves old knowledge while adapting to new information at both the +instance level and the task level. Concretely, an Adaptive Constraint Module +(ACM) is proposed to implement integration and push-away operations between +multiple representations, obtaining a dense embedding subspace for each +instance to improve matching ability on limited old task datasets. Based on the +processed diverse representations, we transfer knowledge between the adjustment +model and the learner model through Knowledge Update (KU) and Knowledge +Preservation (KP) strategies at the task level, which reduce the task-wise +domain gap on both old and new tasks and exploit the diverse representation of +each instance in limited datasets from old tasks, improving model performance +over extended periods. Extensive experiments were conducted on eleven Re-ID +datasets, including five seen datasets used for training under order-1 and +order-2 settings and six unseen datasets used for inference. Compared to +state-of-the-art methods, our method achieves significantly improved +performance on holistic, large-scale, and occluded datasets. + +
+
+ comment: 11 pages,7 Tables,3 Figures +
+
+
+
+
+ + ☆ SDSTrack: Self-Distillation Symmetric Adapter Learning for Multi-Modal + Visual Object Tracking CVPR2024 + + +
+ Multimodal Visual Object Tracking (VOT) has recently gained significant +attention due to its robustness. Early research focused on fully fine-tuning +RGB-based trackers, which was inefficient and lacked generalized representation +due to the scarcity of multimodal data. Therefore, recent studies have utilized +prompt tuning to transfer pre-trained RGB-based trackers to multimodal data. +However, the modality gap limits pre-trained knowledge recall, and the +dominance of the RGB modality persists, preventing the full utilization of +information from other modalities. To address these issues, we propose a novel +symmetric multimodal tracking framework called SDSTrack. We introduce +lightweight adaptation for efficient fine-tuning, which directly transfers the +feature extraction ability from RGB to other domains with a small number of +trainable parameters and integrates multimodal features in a balanced, +symmetric manner. Furthermore, we design a complementary masked patch +distillation strategy to enhance the robustness of trackers in complex +environments, such as extreme weather, poor imaging, and sensor failure. +Extensive experiments demonstrate that SDSTrack outperforms state-of-the-art +methods in various multimodal tracking scenarios, including RGB+Depth, +RGB+Thermal, and RGB+Event tracking, and exhibits impressive results in extreme +conditions. Our source code is available at https://github.com/hoqolo/SDSTrack. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Multi-Scale Spatio-Temporal Graph Convolutional Network for Facial + Expression Spotting + + +
+ Facial expression spotting is a significant but challenging task in facial +expression analysis. The accuracy of expression spotting is affected not only +by irrelevant facial movements but also by the difficulty of perceiving subtle +motions in micro-expressions. In this paper, we propose a Multi-Scale +Spatio-Temporal Graph Convolutional Network (SpoT-GCN) for facial expression +spotting. To extract more robust motion features, we track both short- and +long-term motion of facial muscles in compact sliding windows whose window +length adapts to the temporal receptive field of the network. This strategy, +termed the receptive field adaptive sliding window strategy, effectively +magnifies the motion features while alleviating the problem of severe head +movement. The subtle motion features are then converted to a facial graph +representation, whose spatio-temporal graph patterns are learned by a graph +convolutional network. This network learns both local and global features from +multiple scales of facial graph structures using our proposed facial local +graph pooling (FLGP). Furthermore, we introduce supervised contrastive learning +to enhance the discriminative capability of our model for difficult-to-classify +frames. The experimental results on the SAMM-LV and CAS(ME)^2 datasets +demonstrate that our method achieves state-of-the-art performance, particularly +in micro-expression spotting. Ablation studies further verify the effectiveness +of our proposed modules. + +
+
+ comment: Accepted by FG2024 +
+
+
+
+
+ + ☆ BIMCV-R: A Landmark Dataset for 3D CT Text-Image Retrieval + + +
+ The burgeoning integration of 3D medical imaging into healthcare has led to a +substantial increase in the workload of medical professionals. To assist +clinicians in their diagnostic processes and alleviate their workload, the +development of a robust system for retrieving similar case studies presents a +viable solution. While the concept holds great promise, the field of 3D medical +text-image retrieval is currently limited by the absence of robust evaluation +benchmarks and curated datasets. To remedy this, our study presents a +groundbreaking dataset, BIMCV-R (This dataset will be released upon +acceptance.), which includes an extensive collection of 8,069 3D CT volumes, +encompassing over 2 million slices, paired with their respective radiological +reports. Expanding upon the foundational work of our dataset, we craft a +retrieval strategy, MedFinder. This approach employs a dual-stream network +architecture, harnessing the potential of large language models to advance the +field of medical image retrieval beyond existing text-image retrieval +solutions. It marks our preliminary step towards developing a system capable of +facilitating text-to-image, image-to-text, and keyword-based retrieval tasks. + +
+
+
+
+
+ + ☆ Mars Spectrometry 2: Gas Chromatography -- Second place solution + + +
+ The Mars Spectrometry 2: Gas Chromatography challenge was sponsored by NASA +and run on the DrivenData competition platform in 2022. This report describes +the solution which achieved the second-best score on the competition's test +dataset. The solution utilized two-dimensional, image-like representations of +the competition's chromatography data samples. A number of different +Convolutional Neural Network models were trained and ensembled for the final +submission. + +
+
+
+
+
+ + ☆ Exploring Accurate 3D Phenotyping in Greenhouse through Neural Radiance + Fields + + +
+ Accurate collection of plant phenotyping data is critical to optimising +sustainable farming practices in precision agriculture. Traditional phenotyping +in controlled laboratory environments, while valuable, falls short in +understanding plant growth under real-world conditions. Emerging sensor and +digital technologies offer a promising approach for direct phenotyping of +plants in farm environments. This study investigates a learning-based +phenotyping method using Neural Radiance Fields (NeRF) to achieve accurate +in-situ phenotyping of pepper plants in greenhouse environments. To +quantitatively evaluate the performance of this method, traditional point cloud +registration on 3D scanning data is implemented for comparison. Experimental +results show that NeRF achieves competitive accuracy compared to the 3D +scanning methods. The mean distance error between the scanner-based method and +the NeRF-based method is 0.865 mm. This study shows that the learning-based +NeRF method achieves accuracy similar to 3D scanning-based methods but with +improved scalability and robustness. + +
+
+
+
+
+ + ☆ Towards Two-Stream Foveation-based Active Vision Learning + + +
+ Deep neural network (DNN) based machine perception frameworks process the +entire input in a one-shot manner to provide answers to both "what object is +being observed" and "where it is located". In contrast, the "two-stream +hypothesis" from neuroscience explains the neural processing in the human +visual cortex as an active vision system that utilizes two separate regions of +the brain to answer the what and the where questions. In this work, we propose +a machine learning framework inspired by the "two-stream hypothesis" and +explore the potential benefits that it offers. Specifically, the proposed +framework models the following mechanisms: 1) ventral (what) stream focusing on +the input regions perceived by the fovea part of an eye (foveation), 2) dorsal +(where) stream providing visual guidance, and 3) iterative processing of the +two streams to calibrate visual focus and process the sequence of focused image +patches. The training of the proposed framework is accomplished by label-based +DNN training for the ventral stream model and reinforcement learning for the +dorsal stream model. We show that the two-stream foveation-based learning is +applicable to the challenging task of weakly-supervised object localization +(WSOL), where the training data is limited to the object class or its +attributes. The framework is capable of both predicting the properties of an +object and successfully localizing it by predicting its bounding box. We also +show that, due to the independent nature of the two streams, the dorsal model +can be applied on its own to unseen images to localize objects from different +datasets. + +
+
+ comment: 18 pages, 14 figures, Under consideration at IEEE Transactions on + Cognitive and Developmental Systems +
+
+
+
+
+ + ☆ CBGT-Net: A Neuromimetic Architecture for Robust Classification of + Streaming Data + + +
+ This paper describes CBGT-Net, a neural network model inspired by the +cortico-basal ganglia-thalamic (CBGT) circuits found in mammalian brains. +Unlike traditional neural network models, which either generate an output for +each provided input, or an output after a fixed sequence of inputs, the +CBGT-Net learns to produce an output after a sufficient criteria for evidence +is achieved from a stream of observed data. For each observation, the CBGT-Net +generates a vector that explicitly represents the amount of evidence the +observation provides for each potential decision, accumulates the evidence over +time, and generates a decision when the accumulated evidence exceeds a +pre-defined threshold. We evaluate the proposed model on two image +classification tasks, where models need to predict image categories based on a +stream of small patches extracted from the image. We show that the CBGT-Net +provides improved accuracy and robustness compared to models trained to +classify from a single patch, and models leveraging an LSTM layer to classify +from a fixed sequence length of patches. + +
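+ The evidence-accumulation behaviour described above can be pictured with a minimal loop that sums per-observation evidence vectors and commits to a decision once any class crosses a threshold; the encoder and threshold below are placeholders rather than the CBGT-Net components.

```python
# Minimal evidence-accumulation sketch (placeholder encoder and threshold).
import torch
import torch.nn as nn

class EvidenceAccumulator(nn.Module):
    def __init__(self, feat_dim=64, num_classes=10, threshold=3.0):
        super().__init__()
        self.encoder = nn.Sequential(nn.Flatten(), nn.LazyLinear(feat_dim),
                                     nn.ReLU(), nn.Linear(feat_dim, num_classes))
        self.threshold = threshold

    def forward(self, patch_stream):
        evidence = None
        for t, patch in enumerate(patch_stream):
            e = self.encoder(patch)                   # evidence for each class
            evidence = e if evidence is None else evidence + e
            if evidence.max() >= self.threshold:      # enough evidence: decide
                return evidence.argmax(dim=-1), t + 1
        return evidence.argmax(dim=-1), len(patch_stream)   # forced decision

stream = [torch.randn(1, 3, 8, 8) for _ in range(20)]       # toy patch stream
print(EvidenceAccumulator()(stream))
```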
+
+
+
+
+ + ♻ ☆ Ghost on the Shell: An Expressive Representation of General 3D Shapes ICLR 2024 + + +
+ The creation of photorealistic virtual worlds requires the accurate modeling +of 3D surface geometry for a wide range of objects. For this, meshes are +appealing since they 1) enable fast physics-based rendering with realistic +material and lighting, 2) support physical simulation, and 3) are +memory-efficient for modern graphics pipelines. Recent work on reconstructing +and statistically modeling 3D shape, however, has critiqued meshes as being +topologically inflexible. To capture a wide range of object shapes, any 3D +representation must be able to model solid, watertight, shapes as well as thin, +open, surfaces. Recent work has focused on the former, and methods for +reconstructing open surfaces do not support fast reconstruction with material +and lighting or unconditional generative modelling. Inspired by the observation +that open surfaces can be seen as islands floating on watertight surfaces, we +parameterize open surfaces by defining a manifold signed distance field on +watertight templates. With this parameterization, we further develop a +grid-based and differentiable representation that parameterizes both watertight +and non-watertight meshes of arbitrary topology. Our new representation, called +Ghost-on-the-Shell (G-Shell), enables two important applications: +differentiable rasterization-based reconstruction from multiview images and +generative modelling of non-watertight meshes. We empirically demonstrate that +G-Shell achieves state-of-the-art performance on non-watertight mesh +reconstruction and generation tasks, while also performing effectively for +watertight meshes. + +
+
+ comment: ICLR 2024 Oral (v3: 30 pages, 19 figures, Project Page: + https://gshell3d.github.io/) +
+
+
+
+
+ + ♻ ☆ VQPy: An Object-Oriented Approach to Modern Video Analytics + + +
+ Video analytics is widely used in contemporary systems and services. At the +forefront of video analytics are video queries that users develop to find +objects of particular interest. Building upon the insight that video objects +(e.g., human, animals, cars, etc.), the center of video analytics, are similar +in spirit to objects modeled by traditional object-oriented languages, we +propose to develop an object-oriented approach to video analytics. This +approach, named VQPy, consists of a frontend (a Python variant with constructs +that make it easy for users to express video objects and their interactions) as +well as an extensible backend that can automatically construct and optimize +pipelines based on video objects. We have implemented and open-sourced VQPy, +which has been productized in Cisco as part of its DeepVision framework. + +
+
+ comment: MLSys'24 +
+
+
+
+
+ + ♻ ☆ Latent Dataset Distillation with Diffusion Models + + +
+ The efficacy of machine learning has traditionally relied on the availability +of increasingly larger datasets. However, large datasets pose storage +challenges and contain non-influential samples, which could be ignored during +training without impacting the final accuracy of the model. In response to +these limitations, the concept of distilling the information of a dataset into +a condensed set of (synthetic) samples, namely a distilled dataset, emerged. +One crucial aspect is the selected architecture (usually ConvNet) for linking +the original and synthetic datasets. However, the final accuracy is lower if +the employed model architecture differs from the model used during +distillation. Another challenge is the generation of high-resolution images, +e.g., 128x128 and higher. In this paper, we propose Latent Dataset Distillation +with Diffusion Models (LD3M), which combines diffusion in latent space with +dataset distillation to tackle both challenges. LD3M incorporates a novel +diffusion process tailored for dataset distillation, which improves the +gradient norms for learning synthetic images. By adjusting the number of +diffusion steps, LD3M also offers a straightforward way of controlling the +trade-off between speed and accuracy. We evaluate our approach on several +ImageNet subsets and for high-resolution images (128x128 and 256x256). As a +result, LD3M consistently outperforms state-of-the-art distillation techniques +by up to 4.8 p.p. and 4.2 p.p. for 1 and 10 images per class, respectively. + +
+
+
+
+
+ + ♻ ☆ BAGS: Blur Agnostic Gaussian Splatting through Multi-Scale Kernel + Modeling + + +
+ Recent efforts in using 3D Gaussians for scene reconstruction and novel view +synthesis can achieve impressive results on curated benchmarks; however, images +captured in real life are often blurry. In this work, we analyze the robustness +of Gaussian-Splatting-based methods against various types of image blur, such +as motion blur, defocus blur, and downscaling blur. Under these degradations, +Gaussian-Splatting-based methods tend to overfit and produce worse results than +Neural-Radiance-Field-based methods. To address this issue, we propose Blur +Agnostic Gaussian Splatting (BAGS). BAGS introduces additional 2D modeling +capacities such that a 3D-consistent and high-quality scene can be +reconstructed despite image-wise blur. Specifically, we model blur by +estimating per-pixel convolution kernels from a Blur Proposal Network (BPN). +BPN is designed to consider spatial, color, and depth variations of the scene +to maximize modeling capacity. Additionally, BPN also produces a +quality-assessing mask, which indicates regions where blur occurs. Finally, we +introduce a coarse-to-fine kernel optimization scheme; this optimization scheme +is fast and avoids sub-optimal solutions caused by a sparse point cloud +initialization, which often occurs when Structure-from-Motion is applied to +blurry images. We demonstrate that BAGS achieves photorealistic renderings +under various challenging blur conditions and imaging geometry, while +significantly improving upon existing approaches. + +
+
+
+
+
+ + ♻ ☆ Detection of diabetic retinopathy using longitudinal self-supervised + learning MICCAI + + +
+ Longitudinal imaging is able to capture both static anatomical structures and +dynamic changes in disease progression, supporting earlier and better +patient-specific pathology management. However, conventional approaches for +detecting diabetic retinopathy (DR) rarely take advantage of longitudinal +information to improve DR analysis. In this work, we investigate the benefit of +exploiting self-supervised learning with a longitudinal nature for DR diagnosis +purposes. We compare different longitudinal self-supervised learning (LSSL) +methods for modeling disease progression from longitudinal retinal color fundus +photographs (CFP), in order to detect early DR severity changes using a pair of +consecutive exams. The experiments were conducted on a longitudinal DR +screening dataset, with or without encoders trained on the longitudinal pretext +task (LSSL). Results show an AUC of 0.875 for the baseline (model trained from +scratch) and an AUC of 0.96 (95% CI: 0.9593-0.9655, DeLong test, p-value < +2.2e-16) for early fusion using a simple ResNet-like architecture with frozen +LSSL weights, suggesting that the LSSL latent space encodes the dynamics of DR +progression. + +
+
+ comment: Accepted preprint for presentation at MICCAI-OMIA +
+
+
+
+
+ + ♻ ☆ Influencer Backdoor Attack on Semantic Segmentation + + +
+ When a small number of poisoned samples are injected into the training +dataset of a deep neural network, the network can be induced to exhibit +malicious behavior during inference, which poses potential threats to +real-world applications. While backdoor attacks have been intensively studied +in classification, they have been largely overlooked for semantic segmentation. +Unlike classification, semantic segmentation aims to classify every pixel +within a given image. In this work, we explore backdoor attacks on segmentation +models to misclassify all pixels of a victim class by injecting a specific +trigger on non-victim pixels during inference, an attack dubbed the Influencer +Backdoor Attack (IBA). IBA is expected to maintain the classification accuracy +of non-victim pixels and mislead the classification of all victim pixels in +every single inference, and it can easily be applied to real-world scenes. +Based on the context aggregation ability of segmentation models, we propose a +simple yet effective Nearest-Neighbor trigger injection strategy. We also +introduce an innovative Pixel Random Labeling strategy which maintains optimal +performance even when the trigger is placed far from the victim pixels. Our +extensive experiments reveal that current segmentation models do suffer from +backdoor attacks, demonstrate IBA's real-world applicability, and show that our +proposed techniques can further increase attack performance. + +
+
+
+
+
+ + ♻ ☆ DNGaussian: Optimizing Sparse-View 3D Gaussian Radiance Fields with + Global-Local Depth Normalization CVPR 2024 + + +
+ Radiance fields have demonstrated impressive performance in synthesizing +novel views from sparse input views, yet prevailing methods suffer from high +training costs and slow inference speed. This paper introduces DNGaussian, a +depth-regularized framework based on 3D Gaussian radiance fields, offering +real-time and high-quality few-shot novel view synthesis at low costs. Our +motivation stems from the highly efficient representation and surprising +quality of the recent 3D Gaussian Splatting, even though it suffers from +geometry degradation when the number of input views decreases. In Gaussian +radiance fields, we find that this degradation in scene geometry is primarily +linked to the positioning of Gaussian primitives and can be mitigated by depth +constraints. Consequently, we propose a Hard and Soft Depth Regularization to +restore accurate scene geometry under coarse monocular depth supervision while +maintaining a fine-grained color appearance. To further refine detailed +geometry reshaping, we introduce Global-Local Depth Normalization, enhancing +the focus on small local depth changes. Extensive experiments on the LLFF, DTU, +and Blender datasets demonstrate that DNGaussian outperforms state-of-the-art +methods, achieving comparable or better results with significantly reduced +memory cost, a 25x reduction in training time, and over 3000x faster rendering +speed. + +
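+ As a hedged illustration of the depth-regularization idea above, the snippet below normalizes rendered and monocular depth within local patches before comparing them, so that small local depth changes are emphasized; the patch size and L1 form are assumptions, not the paper's exact Hard/Soft regularization or normalization.

```python
# Illustrative local depth normalization + depth regularization (assumed form).
import torch
import torch.nn.functional as F

def local_normalize(depth: torch.Tensor, patch: int = 8) -> torch.Tensor:
    """depth: [1, 1, H, W] -> per-patch zero-mean, unit-std depth."""
    mean = F.avg_pool2d(depth, patch, stride=patch)
    sq_mean = F.avg_pool2d(depth ** 2, patch, stride=patch)
    std = (sq_mean - mean ** 2).clamp_min(1e-6).sqrt()
    up = lambda x: F.interpolate(x, size=depth.shape[-2:], mode="nearest")
    return (depth - up(mean)) / up(std)

def depth_reg_loss(rendered_depth, mono_depth):
    return F.l1_loss(local_normalize(rendered_depth), local_normalize(mono_depth))

print(depth_reg_loss(torch.rand(1, 1, 64, 64), torch.rand(1, 1, 64, 64)))
```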
+
+ comment: Accepted at CVPR 2024. Project page: + https://fictionarry.github.io/DNGaussian/ +
+
+
+
+
+ + ♻ ☆ DGC-GNN: Leveraging Geometry and Color Cues for Visual Descriptor-Free + 2D-3D Matching CVPR 2024 + + +
+ Matching 2D keypoints in an image to a sparse 3D point cloud of the scene +without requiring visual descriptors has garnered increased interest due to its +low memory requirements, inherent privacy preservation, and reduced need for +expensive 3D model maintenance compared to visual descriptor-based methods. +However, existing algorithms often compromise on performance, resulting in a +significant deterioration compared to their descriptor-based counterparts. In +this paper, we introduce DGC-GNN, a novel algorithm that employs a +global-to-local Graph Neural Network (GNN) that progressively exploits +geometric and color cues to represent keypoints, thereby improving matching +accuracy. Our procedure encodes both Euclidean and angular relations at a +coarse level, forming the geometric embedding to guide the point matching. We +evaluate DGC-GNN on both indoor and outdoor datasets, demonstrating that it not +only doubles the accuracy of the state-of-the-art visual descriptor-free +algorithm but also substantially narrows the performance gap between +descriptor-based and descriptor-free methods. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ DemoCaricature: Democratising Caricature Generation with a Rough Sketch + + +
+ In this paper, we democratise caricature generation, empowering individuals +to effortlessly craft personalised caricatures with just a photo and a +conceptual sketch. Our objective is to strike a delicate balance between +abstraction and identity, while preserving the creativity and subjectivity +inherent in a sketch. To achieve this, we present Explicit Rank-1 Model Editing +alongside single-image personalisation, selectively applying nuanced edits to +cross-attention layers for a seamless merge of identity and style. +Additionally, we propose Random Mask Reconstruction to enhance robustness, +directing the model to focus on distinctive identity and style features. +Crucially, our aim is not to replace artists but to eliminate accessibility +barriers, allowing enthusiasts to engage in the artistry. + +
+
+
+
+
+ + ♻ ☆ SG-Bot: Object Rearrangement via Coarse-to-Fine Robotic Imagination on + Scene Graphs ICRA 2024 + + +
+ Object rearrangement is pivotal in robotic-environment interactions, +representing a significant capability in embodied AI. In this paper, we present +SG-Bot, a novel rearrangement framework that utilizes a coarse-to-fine scheme +with a scene graph as the scene representation. Unlike previous methods that +rely on either known goal priors or zero-shot large models, SG-Bot exemplifies +lightweight, real-time, and user-controllable characteristics, seamlessly +blending the consideration of commonsense knowledge with automatic generation +capabilities. SG-Bot employs a three-fold procedure--observation, imagination, +and execution--to adeptly address the task. Initially, objects are discerned +and extracted from a cluttered scene during the observation. These objects are +first coarsely organized and depicted within a scene graph, guided by either +commonsense or user-defined criteria. Then, this scene graph subsequently +informs a generative model, which forms a fine-grained goal scene considering +the shape information from the initial scene and object semantics. Finally, for +execution, the initial and envisioned goal scenes are matched to formulate +robotic action policies. Experimental results demonstrate that SG-Bot +outperforms competitors by a large margin. + +
+
+ comment: ICRA 2024 accepted. Project website: + https://sites.google.com/view/sg-bot +
+
+
+
+
+ + ♻ ☆ C-TPT: Calibrated Test-Time Prompt Tuning for Vision-Language Models via + Text Feature Dispersion ICLR 2024 + + +
+ In deep learning, test-time adaptation has gained attention as a method for +model fine-tuning without the need for labeled data. A prime exemplification is +the recently proposed test-time prompt tuning for large-scale vision-language +models such as CLIP. Unfortunately, these prompts have been mainly developed to +improve accuracy, overlooking the importance of calibration, which is a crucial +aspect for quantifying prediction uncertainty. However, traditional calibration +methods rely on substantial amounts of labeled data, making them impractical +for test-time scenarios. To this end, this paper explores calibration during +test-time prompt tuning by leveraging the inherent properties of CLIP. Through +a series of observations, we find that the prompt choice significantly affects +the calibration in CLIP, where the prompts leading to higher text feature +dispersion result in better-calibrated predictions. Introducing the Average +Text Feature Dispersion (ATFD), we establish its relationship with calibration +error and present a novel method, Calibrated Test-time Prompt Tuning (C-TPT), +for optimizing prompts during test-time with enhanced calibration. Through +extensive experiments on different CLIP architectures and datasets, we show +that C-TPT can effectively improve the calibration of test-time prompt tuning +without needing labeled data. The code is publicly accessible at +https://github.com/hee-suk-yoon/C-TPT. + +
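+ To make the dispersion quantity concrete, one plausible reading of Average Text Feature Dispersion is the mean distance of per-class text features from their centroid, as sketched below; how it is weighted against the usual entropy objective during test-time prompt tuning is simplified here and should not be taken as the paper's exact formulation.

```python
# Sketch of an ATFD-style dispersion term (assumed definition and usage).
import torch
import torch.nn.functional as F

def atfd(text_features: torch.Tensor) -> torch.Tensor:
    """text_features: [num_classes, dim] L2-normalized CLIP text embeddings."""
    centroid = text_features.mean(dim=0, keepdim=True)
    return (text_features - centroid).norm(dim=-1).mean()

# During test-time prompt tuning one might combine it with the usual
# entropy objective, e.g.: loss = entropy - lambda_disp * atfd(text_features)
print(atfd(F.normalize(torch.randn(10, 512), dim=-1)))
```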
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ UCM-Net: A Lightweight and Efficient Solution for Skin Lesion + Segmentation using MLP and CNN + + +
+ Skin cancer is a significant public health problem, and computer-aided +diagnosis can help to prevent and treat it. A crucial step for computer-aided +diagnosis is accurately segmenting skin lesions in images, which allows for +lesion detection, classification, and analysis. However, this task is +challenging due to the diverse characteristics of lesions, such as appearance, +shape, size, color, texture, and location, as well as image quality issues like +noise, artifacts, and occlusions. Deep learning models have recently been +applied to skin lesion segmentation, but they have high parameter counts and +computational demands, making them unsuitable for mobile health applications. +To address this challenge, we propose UCM-Net, a novel, efficient, and +lightweight solution that integrates Multi-Layer Perceptions (MLP) and +Convolutional Neural Networks (CNN). Unlike conventional UNet architectures, +our UCMNet-Block reduces parameter overhead and enhances UCM-Net's learning +capabilities, leading to robust segmentation performance. We validate UCM-Net's +competitiveness through extensive experiments on PH2, isic2017 and isic2018 +datasets. Remarkably, UCM-Net has less than 50KB parameters and less than 0.05 +Giga-Operations Per Second (GLOPs), setting a new possible standard for +efficiency in skin lesion segmentation. The source code will be publicly +available. + +
+
+ comment: 17 pages, under review +
+
+
+
+
+ + ♻ ☆ CEIMVEN: An Approach of Cutting Edge Implementation of Modified Versions + of EfficientNet (V1-V2) Architecture for Breast Cancer Detection and + Classification from Ultrasound Images + + +
+ Breast cancer is undoubtedly one of the most widespread and deadly cancers
+across the globe, affecting millions of women each year and remaining a leading
+cause of cancer-related deaths among women. In recent research, medical image
+computing and processing, together with deep neural networks, has played a
+significant role in detecting and classifying breast cancer from ultrasound
+images and mammograms. In this research, we focus on rigorous implementations
+and iterative result analysis of different cutting-edge modified versions of
+EfficientNet architectures, namely EfficientNet-V1 (b0-b7) and EfficientNet-V2
+(b0-b3), on ultrasound images; we name this approach CEIMVEN. We utilize a
+transfer learning approach with the pre-trained models of the EfficientNet
+versions. We applied hyper-parameter tuning procedures, added fully connected
+layers, discarded outliers, and recorded the accuracy results from our custom
+modified EfficientNet architectures. Our deep learning training approach
+covered both identifying cancer-affected areas with region-of-interest (ROI)
+techniques and multi-class classification (benign, malignant, and normal).
+The approximate testing accuracies we obtained from the modified versions of
+EfficientNet-V1 (b0: 99.15%, b1: 98.58%, b2: 98.43%, b3: 98.01%, b4: 98.86%,
+b5: 97.72%, b6: 97.72%, b7: 98.72%) and EfficientNet-V2 (b0: 99.29%, b1:
+99.01%, b2: 98.72%, b3: 99.43%) indicate the strong potential of deep learning
+approaches for the successful detection and classification of breast cancer
+from ultrasound images at a very early stage. The code for this research is
+available here:
+https://github.com/ac005sheekar/CEIMVEN-Cutting-Edge-Implementation-of-Modified-EfficientNet-V1-V2-for-BreastCancer-Detection.
+
+
+
+
+
+ + ♻ ☆ CARZero: Cross-Attention Alignment for Radiology Zero-Shot + Classification + + +
+ The advancement of Zero-Shot Learning in the medical domain has been driven +forward by using pre-trained models on large-scale image-text pairs, focusing +on image-text alignment. However, existing methods primarily rely on cosine +similarity for alignment, which may not fully capture the complex relationship +between medical images and reports. To address this gap, we introduce a novel +approach called Cross-Attention Alignment for Radiology Zero-Shot +Classification (CARZero). Our approach innovatively leverages cross-attention +mechanisms to process image and report features, creating a Similarity +Representation that more accurately reflects the intricate relationships in +medical semantics. This representation is then linearly projected to form an +image-text similarity matrix for cross-modality alignment. Additionally, +recognizing the pivotal role of prompt selection in zero-shot learning, CARZero +incorporates a Large Language Model-based prompt alignment strategy. This +strategy standardizes diverse diagnostic expressions into a unified format for +both training and inference phases, overcoming the challenges of manual prompt +design. Our approach is simple yet effective, demonstrating state-of-the-art +performance in zero-shot classification on five official chest radiograph +diagnostic test sets, including remarkable results on datasets with long-tail +distributions of rare diseases. This achievement is attributed to our new +image-text alignment strategy, which effectively addresses the complex +relationship between medical images and reports. Code and models are available +at https://github.com/laihaoran/CARZero. + +
+
+
+
+
+ + ♻ ☆ DGL-GAN: Discriminator Guided Learning for GAN Compression + + +
+ Generative Adversarial Networks (GANs) with high computation costs, e.g.,
+BigGAN and StyleGAN2, have achieved remarkable results in synthesizing
+high-resolution images from random noise. Reducing the computation cost of GANs
+while still generating photo-realistic images is a challenging field. In this
+work, we propose a novel yet simple Discriminator Guided Learning approach for
+compressing vanilla GANs, dubbed DGL-GAN. Motivated by the phenomenon that the
+teacher discriminator may contain some meaningful information about both real
+images and fake images, we merely transfer the knowledge from the teacher
+discriminator via the adversarial interaction between the teacher discriminator
+and the student generator. We apply DGL-GAN to compress the two most
+representative large-scale vanilla GANs, i.e., StyleGAN2 and BigGAN. Experiments
+show that DGL-GAN achieves state-of-the-art (SOTA) results on both StyleGAN2
+and BigGAN. Moreover, DGL-GAN is also effective in boosting the performance of
+original uncompressed GANs. The original uncompressed StyleGAN2 boosted with
+DGL-GAN achieves an FID of 2.65 on FFHQ, a new state-of-the-art performance.
+Code and models are available at https://github.com/yuesongtian/DGL-GAN
+
+
+
+
+
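The sentence above about transferring knowledge only through the adversarial interaction between the teacher discriminator and the student generator can be read as a training step in which a frozen teacher discriminator scores the student's samples. The sketch below shows that reading with stand-in networks and a non-saturating generator loss; the model classes and the loss choice are assumptions, not the authors' code.

```python
# Hedged sketch: the compressed student generator is updated with a
# non-saturating GAN loss computed by a *frozen* teacher discriminator, so the
# only distillation signal is adversarial. Networks here are toy stand-ins.
import torch
import torch.nn.functional as F


def student_generator_step(g_student, d_teacher, opt_g, z_dim=128, batch=8):
    d_teacher.eval()
    for p in d_teacher.parameters():
        p.requires_grad_(False)                 # teacher stays frozen

    z = torch.randn(batch, z_dim)
    fake = g_student(z)
    logits = d_teacher(fake)                    # teacher judges student samples
    loss = F.softplus(-logits).mean()           # non-saturating generator loss

    opt_g.zero_grad()
    loss.backward()
    opt_g.step()
    return loss.item()


# Toy usage; a real setup would plug in StyleGAN2/BigGAN modules instead.
g = torch.nn.Sequential(torch.nn.Linear(128, 64), torch.nn.ReLU(), torch.nn.Linear(64, 32))
d = torch.nn.Linear(32, 1)
print(student_generator_step(g, d, torch.optim.Adam(g.parameters(), lr=2e-4)))
```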
+ + ♻ ☆ MAS: Multi-view Ancestral Sampling for 3D motion generation using 2D + diffusion + + +
+ We introduce Multi-view Ancestral Sampling (MAS), a method for 3D motion +generation, using 2D diffusion models that were trained on motions obtained +from in-the-wild videos. As such, MAS opens opportunities to exciting and +diverse fields of motion previously under-explored as 3D data is scarce and +hard to collect. MAS works by simultaneously denoising multiple 2D motion +sequences representing different views of the same 3D motion. It ensures +consistency across all views at each diffusion step by combining the individual +generations into a unified 3D sequence, and projecting it back to the original +views. We demonstrate MAS on 2D pose data acquired from videos depicting +professional basketball maneuvers, rhythmic gymnastic performances featuring a +ball apparatus, and horse races. In each of these domains, 3D motion capture is +arduous, and yet, MAS generates diverse and realistic 3D sequences. Unlike the +Score Distillation approach, which optimizes each sample by repeatedly applying +small fixes, our method uses a sampling process that was constructed for the +diffusion framework. As we demonstrate, MAS avoids common issues such as +out-of-domain sampling and mode-collapse. https://guytevet.github.io/mas-page/ + +
+
+
+
+
+ + ♻ ☆ BEVNeXt: Reviving Dense BEV Frameworks for 3D Object Detection + + +
+ Recently, the rise of query-based Transformer decoders is reshaping
+camera-based 3D object detection. These query-based decoders are surpassing the
+traditional dense BEV (Bird's Eye View)-based methods. However, we argue that
+dense BEV frameworks remain important due to their outstanding abilities in
+depth estimation and object localization, depicting 3D scenes accurately and
+comprehensively. This paper aims to address the drawbacks of the existing dense
+BEV-based 3D object detectors by introducing our proposed enhanced components,
+including a CRF-modulated depth estimation module enforcing object-level
+consistencies, a long-term temporal aggregation module with extended receptive
+fields, and a two-stage object decoder combining perspective techniques with
+CRF-modulated depth embedding. These enhancements lead to a "modernized" dense
+BEV framework dubbed BEVNeXt. On the nuScenes benchmark, BEVNeXt outperforms
+both BEV-based and query-based frameworks under various settings, achieving a
+state-of-the-art result of 64.2 NDS on the nuScenes test set. Code will be
+available at https://github.com/woxihuanjiangguo/BEVNeXt.
+
+
+
+
+
+ + ♻ ☆ Training-free Zero-shot Composed Image Retrieval with Local Concept + Reranking + + +
+ Composed image retrieval attempts to retrieve an image of interest from
+gallery images through a composed query of a reference image and its
+corresponding modified text. It has recently attracted attention due to the
+collaboration of information-rich images and concise language to precisely
+express the requirements of target images. Most current composed image
+retrieval methods follow a supervised learning approach to training on a costly
+triplet dataset composed of a reference image, modified text, and a
+corresponding target image. To avoid difficult-to-obtain labeled triplet
+training data, zero-shot composed image retrieval (ZS-CIR) has been introduced,
+which aims to retrieve the target image by learning from image-text pairs
+(self-supervised triplets), without the need for human-labeled triplets.
+However, this self-supervised triplet learning approach is computationally less
+effective and less understandable, as it assumes the interaction between image
+and text is conducted with an implicit query embedding without explicit
+semantic interpretation. In this work, we present a new training-free zero-shot
+composed image retrieval method which translates the query into explicit
+human-understandable text. This helps improve model learning efficiency to
+enhance the generalization capacity of foundation models. Further, we introduce
+a Local Concept Re-ranking (LCR) mechanism to focus on discriminative local
+information extracted from the modified instructions. Extensive experiments on
+four ZS-CIR benchmarks show that our method achieves performance comparable to
+that of state-of-the-art triplet-training-based methods, but significantly
+outperforms other training-free methods on the open-domain datasets (CIRR,
+CIRCO and COCO), as well as the fashion-domain dataset (FashionIQ).
+
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ Reality's Canvas, Language's Brush: Crafting 3D Avatars from Monocular + Video + + +
+ Recent advancements in 3D avatar generation excel with multi-view supervision
+for photorealistic models. However, monocular counterparts lag in quality
+despite broader applicability. We propose ReCaLaB to close this gap. ReCaLaB is
+a fully-differentiable pipeline that learns high-fidelity 3D human avatars from
+just a single RGB video. A pose-conditioned deformable NeRF is optimized to
+volumetrically represent a human subject in canonical T-pose. The canonical
+representation is then leveraged to efficiently associate neural textures using
+2D-3D correspondences. This enables the separation of diffused color generation
+and lighting correction branches that jointly compose an RGB prediction. The
+design allows intermediate results for human pose, body shape, texture, and
+lighting to be controlled with text prompts. An image-conditioned diffusion
+model thereby helps to animate the appearance and pose of the 3D avatar to
+create video sequences with previously unseen human motion. Extensive
+experiments show that ReCaLaB outperforms previous monocular approaches in
+terms of image quality for image synthesis tasks. Moreover, natural language
+offers an intuitive user interface for creative manipulation of 3D human
+avatars.
+
+
+ comment: Video link: https://youtu.be/Oz83z1es2J4 +
+
+
+
+
+ + ♻ ☆ AGFSync: Leveraging AI-Generated Feedback for Preference Optimization in + Text-to-Image Generation + + +
+ Text-to-Image (T2I) diffusion models have achieved remarkable success in
+image generation. Despite this progress, challenges remain in prompt-following
+ability, image quality, and the lack of high-quality datasets, all of which are
+essential for refining these models. As acquiring labeled data is costly, we
+introduce AGFSync, a framework that enhances T2I diffusion models through
+Direct Preference Optimization (DPO) in a fully AI-driven approach. AGFSync
+utilizes Vision-Language Models (VLM) to assess image quality across style,
+coherence, and aesthetics, generating feedback data within an AI-driven loop.
+By applying AGFSync to leading T2I models such as SD v1.4, v1.5, and SDXL, our
+extensive experiments on the TIFA dataset demonstrate notable improvements in
+VQA scores, aesthetic evaluations, and performance on the HPSv2 benchmark,
+consistently outperforming the base models. AGFSync's method of refining T2I
+diffusion models paves the way for scalable alignment techniques.
+
+
+
+
+
+ + ♻ ☆ Video Editing via Factorized Diffusion Distillation + + +
+ We introduce Emu Video Edit (EVE), a model that establishes a new
+state-of-the-art in video editing without relying on any supervised video
+editing data. To develop EVE, we separately train an image editing adapter and
+a video generation adapter, and attach both to the same text-to-image model.
+Then, to align the adapters towards video editing we introduce a new
+unsupervised distillation procedure, Factorized Diffusion Distillation. This
+procedure distills knowledge from one or more teachers simultaneously, without
+any supervised data. We utilize this procedure to teach EVE to edit videos by
+jointly distilling knowledge to (i) precisely edit each individual frame from
+the image editing adapter, and (ii) ensure temporal consistency among the
+edited frames using the video generation adapter. Finally, to demonstrate the
+potential of our approach in unlocking other capabilities, we align additional
+combinations of adapters.
+
+
+
+
+
+ + ♻ ☆ HyMNet: a Multimodal Deep Learning System for Hypertension + Classification using Fundus Photographs and Cardiometabolic Risk Factors + + +
+ In recent years, deep learning has shown promise in predicting hypertension
+(HTN) from fundus images. However, most prior research has primarily focused on
+analyzing a single type of data, which may not capture the full complexity of
+HTN risk. To address this limitation, this study introduces a multimodal deep
+learning (MMDL) system, dubbed HyMNet, which combines fundus images and
+cardiometabolic risk factors, specifically age and gender, to improve
+hypertension detection capabilities. Our MMDL system uses RETFound, a
+foundation model pre-trained on 1.6 million retinal images, for the fundus path
+and a fully connected neural network for the age and gender path. The two paths
+are jointly trained by concatenating the feature vectors from each path and
+feeding them into a fusion network. The system was trained on 5,016 retinal
+images from 1,243 individuals collected from the Saudi Ministry of National
+Guard Health Affairs. The results show that the multimodal model that
+integrates fundus images along with age and gender outperforms the unimodal
+system trained solely on fundus photographs, with F1 scores of 0.771 [0.747,
+0.796] and 0.745 [0.719, 0.772] for hypertension detection, respectively.
+Additionally, we studied the effect that underlying diabetes mellitus has on
+the model's predictive ability, concluding that diabetes acts as a confounding
+variable when distinguishing hypertensive cases. Our code and model weights are
+publicly available at https://github.com/MohammedSB/HyMNet.
+
+
+
+
+
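A minimal sketch of the two-path fusion described above: image features from a placeholder fundus encoder are concatenated with an embedding of age and gender and passed through a small fusion head. Dimensions, layer choices, and names are illustrative assumptions, not the HyMNet release.

```python
# Toy two-path multimodal model: fundus image features (assumed precomputed by a
# frozen encoder) are fused with demographic inputs for hypertension detection.
import torch
import torch.nn as nn


class ToyHypertensionFusion(nn.Module):
    def __init__(self, img_dim=768, demo_dim=2, hidden=64):
        super().__init__()
        self.demo_path = nn.Sequential(nn.Linear(demo_dim, hidden), nn.ReLU())
        self.fusion = nn.Sequential(nn.Linear(img_dim + hidden, hidden),
                                    nn.ReLU(),
                                    nn.Linear(hidden, 1))   # logit for HTN vs. no HTN

    def forward(self, img_feat, demographics):
        demo_feat = self.demo_path(demographics)             # [B, hidden]
        return self.fusion(torch.cat([img_feat, demo_feat], dim=-1))


model = ToyHypertensionFusion()
logit = model(torch.randn(4, 768), torch.tensor([[63.0, 1.0]] * 4))  # age, gender
print(torch.sigmoid(logit).shape)   # [4, 1] hypertension probabilities
```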
+ + ♻ ☆ Few-shot Object Localization + + +
+ Existing object localization methods are tailored to locate a specific class
+of objects, relying on abundant labeled data for model optimization. However,
+in numerous real-world scenarios, acquiring large labeled data can be arduous,
+significantly constraining the broader application of localization models. To
+bridge this research gap, this paper proposes the novel task of Few-Shot Object
+Localization (FSOL), which seeks to achieve precise localization with limited
+samples available. This task achieves generalized object localization by
+leveraging a small number of labeled support samples to query the positional
+information of objects within corresponding images. To advance this research
+field, we propose an innovative high-performance baseline model. Our model
+integrates a dual-path feature augmentation module to enhance shape association
+and gradient differences between support and query images, alongside a
+self-query module designed to explore the association between feature maps and
+query images. Experimental results demonstrate a significant performance
+improvement of our approach in the FSOL task, establishing an efficient
+benchmark for further research. All codes and data are available at
+https://github.com/Ryh1218/FSOL.
+
+
+
+
+
+ + ♻ ☆ Less is More: A Closer Look at Semantic-based Few-Shot Learning + + +
+ Few-shot Learning aims to learn and distinguish new categories with a very
+limited number of available images, presenting a significant challenge in the
+realm of deep learning. Recent researchers have sought to leverage the
+additional textual or linguistic information of these rare categories with a
+pre-trained language model to facilitate learning, thus partially alleviating
+the problem of insufficient supervision signals. However, the full potential of
+the textual information and pre-trained language models has been underestimated
+in few-shot learning so far, resulting in limited performance enhancements. To
+address this, we propose a simple but effective framework for few-shot learning
+tasks, specifically designed to exploit the textual information and language
+model. In more detail, we explicitly exploit the zero-shot capability of the
+pre-trained language model with a learnable prompt. We then simply add the
+visual feature to the textual feature for inference, without the intricately
+designed fusion modules of previous works. Additionally, we apply self-ensemble
+and distillation to further enhance these components. Our extensive experiments
+conducted across four widely used few-shot datasets demonstrate that our simple
+framework achieves impressive results. Particularly noteworthy is its
+outstanding performance in the 1-shot learning task, surpassing
+state-of-the-art methods by an average of 3.0% in classification accuracy. We
+will make the source code of the proposed framework publicly available upon
+acceptance.
+
+
+
+
+
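One plausible reading of the fusion-free inference described above is that each class prototype becomes the sum of its normalized visual prototype and its normalized text embedding, with nearest-prototype classification on top. The sketch below implements that reading on precomputed tensors; the encoders, shapes, and names are assumptions, not the paper's method in detail.

```python
# Hedged sketch: add normalized visual prototypes and text features per class,
# then classify a query by cosine similarity, with no learned fusion module.
import torch
import torch.nn.functional as F


def fuse_prototypes_and_classify(query_feat, support_protos, text_feats):
    """query_feat: [dim]; support_protos, text_feats: [num_classes, dim]."""
    protos = F.normalize(F.normalize(support_protos, dim=-1)
                         + F.normalize(text_feats, dim=-1), dim=-1)
    logits = protos @ F.normalize(query_feat, dim=-1)    # cosine similarities
    return logits


logits = fuse_prototypes_and_classify(torch.randn(512), torch.randn(5, 512), torch.randn(5, 512))
print(logits.argmax().item())   # predicted class index
```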
+ + ♻ ☆ Unsupervised Cross-Domain Image Retrieval via Prototypical Optimal + Transport AAAI2024 + + +
+ Unsupervised cross-domain image retrieval (UCIR) aims to retrieve images +sharing the same category across diverse domains without relying on labeled +data. Prior approaches have typically decomposed the UCIR problem into two +distinct tasks: intra-domain representation learning and cross-domain feature +alignment. However, these segregated strategies overlook the potential +synergies between these tasks. This paper introduces ProtoOT, a novel Optimal +Transport formulation explicitly tailored for UCIR, which integrates +intra-domain feature representation learning and cross-domain alignment into a +unified framework. ProtoOT leverages the strengths of the K-means clustering +method to effectively manage distribution imbalances inherent in UCIR. By +utilizing K-means for generating initial prototypes and approximating class +marginal distributions, we modify the constraints in Optimal Transport +accordingly, significantly enhancing its performance in UCIR scenarios. +Furthermore, we incorporate contrastive learning into the ProtoOT framework to +further improve representation learning. This encourages local semantic +consistency among features with similar semantics, while also explicitly +enforcing separation between features and unmatched prototypes, thereby +enhancing global discriminativeness. ProtoOT surpasses existing +state-of-the-art methods by a notable margin across benchmark datasets. +Notably, on DomainNet, ProtoOT achieves an average P@200 enhancement of 24.44%, +and on Office-Home, it demonstrates a P@15 improvement of 12.12%. Code is +available at https://github.com/HCVLAB/ProtoOT. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
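As background for the modified-marginal idea in the ProtoOT abstract above, the sketch below runs standard entropy-regularized optimal transport (Sinkhorn iterations) between features and prototypes, with the prototype-side marginal taken from K-means cluster sizes instead of a uniform distribution. This is a generic OT illustration under those assumptions, not the ProtoOT formulation itself.

```python
# Generic Sinkhorn solver with an imbalanced, cluster-size-derived column marginal.
import torch


def sinkhorn(cost, row_marginal, col_marginal, eps=0.05, iters=100):
    """cost: [n, k]; marginals sum to 1; returns a transport plan of shape [n, k]."""
    K = torch.exp(-cost / eps)
    u = torch.ones_like(row_marginal)
    for _ in range(iters):
        v = col_marginal / (K.t() @ u)
        u = row_marginal / (K @ v)
    return u.unsqueeze(1) * K * v.unsqueeze(0)


feats = torch.nn.functional.normalize(torch.randn(32, 64), dim=-1)
protos = torch.nn.functional.normalize(torch.randn(8, 64), dim=-1)
cost = 1.0 - feats @ protos.t()                        # cosine distance
row = torch.full((32,), 1.0 / 32)
cluster_sizes = torch.tensor([7, 1, 6, 2, 5, 3, 4, 4], dtype=torch.float)
col = cluster_sizes / cluster_sizes.sum()              # imbalanced, K-means-derived marginal
plan = sinkhorn(cost, row, col)
print(plan.sum().item())                               # ~1.0, a valid transport plan
```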
+ + ♻ ☆ Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models + + +
+ The popularity of pre-trained large models has revolutionized downstream
+tasks across diverse fields, such as language, vision, and multi-modality. To
+minimize the adaptation cost for downstream tasks, many Parameter-Efficient
+Fine-Tuning (PEFT) techniques are proposed for language and 2D image
+pre-trained models. However, the specialized PEFT method for 3D pre-trained
+models is still under-explored. To this end, we introduce Point-PEFT, a novel
+framework for adapting point cloud pre-trained models with minimal learnable
+parameters. Specifically, for a pre-trained 3D model, we freeze most of its
+parameters, and only tune the newly added PEFT modules on downstream tasks,
+which consist of a Point-prior Prompt and a Geometry-aware Adapter. The
+Point-prior Prompt adopts a set of learnable prompt tokens, for which we
+propose to construct a memory bank with domain-specific knowledge, and utilize
+a parameter-free attention mechanism to enhance the prompt tokens. The
+Geometry-aware Adapter aims to aggregate point cloud features within spatial
+neighborhoods to capture fine-grained geometric information through local
+interactions. Extensive experiments indicate that our Point-PEFT can achieve
+better performance than full fine-tuning on various downstream tasks, while
+using only 5% of the trainable parameters, demonstrating the efficiency and
+effectiveness of our approach. Code is released at
+https://github.com/Ivan-Tang-3D/Point-PEFT.
+
+
+ comment: The specialized PEFT framework for 3D pre-trained models, which + achieves competitive performance to full fine-tuning, and significantly + reduces the computational resources. Project page: + https://github.com/Ivan-Tang-3D/Point-PEFT +
+
+
+
+
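The recipe above, freezing the pre-trained 3D backbone and tuning only newly added modules, looks roughly like the following in PyTorch. The backbone stand-in, module sizes, and optimizer settings are placeholders; consult the linked repository for the actual Point-prior Prompt and Geometry-aware Adapter.

```python
# Sketch of parameter-efficient fine-tuning: freeze the backbone, train only a
# learnable prompt tensor and a small bottleneck adapter.
import torch
import torch.nn as nn

backbone = nn.Sequential(nn.Linear(384, 384), nn.GELU(), nn.Linear(384, 384))  # stand-in for a pre-trained point model
for p in backbone.parameters():
    p.requires_grad_(False)                      # freeze the pre-trained weights

prompt_tokens = nn.Parameter(torch.zeros(10, 384))                            # learnable prompt tokens
adapter = nn.Sequential(nn.Linear(384, 48), nn.GELU(), nn.Linear(48, 384))    # bottleneck adapter

trainable = [prompt_tokens] + list(adapter.parameters())
optimizer = torch.optim.AdamW(trainable, lr=1e-3)

total = sum(p.numel() for p in backbone.parameters()) + sum(p.numel() for p in trainable)
tuned = sum(p.numel() for p in trainable)
print(f"tuned fraction: {tuned / total:.1%}")    # only a small fraction of all weights is trained
```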
+ + ♻ ☆ A Literature Review of Literature Reviews in Pattern Analysis and + Machine Intelligence + + +
+ By consolidating scattered knowledge, the literature review provides a +comprehensive understanding of the investigated topic. However, reading, +conducting, or peer-reviewing review papers generally demands a significant +investment of time and effort from researchers. To improve efficiency, this +paper aims to provide a thorough review of reviews in the PAMI field from +diverse perspectives. First, this paper proposes several article-level, +field-normalized, and large language model-empowered bibliometric indicators to +evaluate reviews. To facilitate this, a meta-data database dubbed RiPAMI, and a +topic dataset are constructed. Second, based on these indicators, the study +presents comparative analyses of representative reviews, unveiling the +characteristics of publications across various fields, periods, and journals. +The newly emerging AI-generated literature reviews are also appraised, and the +observed differences suggest that most AI-generated reviews still lag behind +human-authored reviews in multiple aspects. Third, we briefly provide a +subjective evaluation of representative PAMI reviews and introduce a paper +structure-based typology of literature reviews. This typology may improve the +clarity and effectiveness for scholars in reading and writing reviews, while +also serving as a guide for AI systems in generating well-organized reviews. +Finally, this work offers insights into the current challenges of literature +reviews and envisions future directions for their development. + +
+
+ comment: IEEE version v1. [February 19, 2024] IEEE version v2 with typos
+ fixed. [February 23, 2024] IEEE version v3 with errors fixed. [February 29,
+ 2024] IEEE version v4 with improved quality. [February 29, 2024]
+
+
+
+
+ + ♻ ☆ See, Imagine, Plan: Discovering and Hallucinating Tasks from a Single + Image + + +
+ Humans can not only recognize and understand the world in its current state +but also envision future scenarios that extend beyond immediate perception. To +resemble this profound human capacity, we introduce zero-shot task +hallucination -- given a single RGB image of any scene comprising unknown +environments and objects, our model can identify potential tasks and imagine +their execution in a vivid narrative, realized as a video. We develop a modular +pipeline that progressively enhances scene decomposition, comprehension, and +reconstruction, incorporating VLM for dynamic interaction and 3D motion +planning for object trajectories. Our model can discover diverse tasks, with +the generated task videos demonstrating realistic and compelling visual +outcomes that are understandable by both machines and humans. Project Page: +https://dannymcy.github.io/zeroshot_task_hallucination/ + +
+
+ comment: Project Page: https://dannymcy.github.io/zeroshot_task_hallucination/ +
+
+
+
+
+ + ♻ ☆ Improving Online Source-free Domain Adaptation for Object Detection by + Unsupervised Data Acquisition + + +
+ Effective object detection in mobile robots is challenged by deployment in
+diverse and unfamiliar environments. Online Source-Free Domain Adaptation
+(O-SFDA) offers model adaptation using a stream of unlabeled data from a target
+domain in an online manner. However, not all captured frames contain
+information that is beneficial for adaptation, particularly when there is a
+strong class imbalance. This paper introduces a novel approach to enhance
+O-SFDA for adaptive object detection in mobile robots via unsupervised data
+acquisition. Our methodology prioritizes the most informative unlabeled frames
+for inclusion in the online training process. Empirical evaluation on a
+real-world dataset reveals that our method outperforms existing
+state-of-the-art O-SFDA techniques, demonstrating the viability of unsupervised
+data acquisition for improving adaptive object detection in mobile robots.
+
+
+
+
+
+ + ♻ ☆ Frequency Decoupling for Motion Magnification via Multi-Level Isomorphic + Architecture CVPR2024 + + +
+ Video Motion Magnification (VMM) aims to reveal subtle and imperceptible
+motion information of objects in the macroscopic world. Prior methods directly
+model the motion field from the Eulerian perspective by Representation Learning
+that separates shape and texture or Multi-domain Learning from phase
+fluctuations. Inspired by the frequency spectrum, we observe that the
+low-frequency components with stable energy always possess spatial structure
+and less noise, making them suitable for modeling the subtle motion field. To
+this end, we present FD4MM, a new paradigm of Frequency Decoupling for Motion
+Magnification with a Multi-level Isomorphic Architecture to capture multi-level
+high-frequency details and a stable low-frequency structure (motion field) in
+video space. Since high-frequency details and subtle motions are susceptible to
+information degradation due to their inherent subtlety and unavoidable external
+interference from noise, we carefully design Sparse High/Low-pass Filters to
+enhance the integrity of details and motion structures, and a Sparse Frequency
+Mixer to promote seamless recoupling. Besides, we innovatively design a
+contrastive regularization for this task to strengthen the model's ability to
+discriminate irrelevant features, reducing undesired motion magnification.
+Extensive experiments on both Real-world and Synthetic Datasets show that our
+FD4MM outperforms SOTA methods. Meanwhile, FD4MM reduces FLOPs by 1.63× and
+boosts inference speed by 1.68× compared with the latest method. Our code is
+available at https://github.com/Jiafei127/FD4MM.
+
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
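To ground the frequency-decoupling vocabulary above, here is a generic low-/high-frequency split of video frames using a Gaussian low-pass filter and its residual. FD4MM's sparse high/low-pass filters and isomorphic architecture are more involved; this sketch only illustrates the decomposition the abstract builds on, and all names and kernel settings are illustrative.

```python
# Generic frequency decoupling: a depthwise Gaussian blur is the low-pass branch
# (stable structure) and the residual is the high-pass branch (fine details).
import torch
import torch.nn.functional as F


def decouple_frequencies(frames: torch.Tensor, kernel_size: int = 9, sigma: float = 2.0):
    """frames: [B, C, H, W] -> (low_freq, high_freq) of the same shape."""
    coords = torch.arange(kernel_size, dtype=torch.float) - kernel_size // 2
    g = torch.exp(-coords ** 2 / (2 * sigma ** 2))
    g = g / g.sum()
    kernel = (g[:, None] * g[None, :]).repeat(frames.shape[1], 1, 1, 1)  # [C, 1, k, k]
    low = F.conv2d(frames, kernel, padding=kernel_size // 2, groups=frames.shape[1])
    return low, frames - low


frames = torch.rand(2, 3, 64, 64)
low, high = decouple_frequencies(frames)
print(low.shape, high.abs().mean().item())   # high-frequency residual is small but nonzero
```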
+ + ♻ ☆ SynerMix: Synergistic Mixup Solution for Enhanced Intra-Class Cohesion + and Inter-Class Separability in Image Classification + + +
+ To address the issues of MixUp and its variants (e.g., Manifold MixUp) in
+image classification tasks, namely their neglect of mixing within the same
+class (intra-class mixup) and their inadequacy in enhancing intra-class
+cohesion through their mixing operations, we propose a novel mixup method named
+SynerMix-Intra and, building upon this, introduce a synergistic mixup solution
+named SynerMix. SynerMix-Intra specifically targets intra-class mixup to
+bolster intra-class cohesion, a feature not addressed by current mixup methods.
+For each mini-batch, it leverages feature representations of unaugmented
+original images from each class to generate a synthesized feature
+representation through random linear interpolation. All synthesized
+representations are then fed into the classification and loss layers to
+calculate an average classification loss that significantly enhances
+intra-class cohesion. Furthermore, SynerMix combines SynerMix-Intra with an
+existing mixup approach (e.g., MixUp, Manifold MixUp), which primarily focuses
+on inter-class mixup and has the benefit of enhancing inter-class separability.
+In doing so, it integrates both inter- and intra-class mixup in a balanced way
+while concurrently improving intra-class cohesion and inter-class separability.
+Experimental results on six datasets show that SynerMix achieves a 0.1% to
+3.43% higher accuracy than the best of either MixUp or SynerMix-Intra alone,
+averaging a 1.16% gain. It also surpasses the top-performer of either Manifold
+MixUp or SynerMix-Intra by 0.12% to 5.16%, with an average gain of 1.11%. Given
+that SynerMix is model-agnostic, it holds significant potential for application
+in other domains where mixup methods have shown promise, such as speech and
+text classification. Our code is publicly available at:
+https://github.com/wxitxy/synermix.git.
+
+
+ comment: 25 pages,12 figures +
+
+
+
+
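A hedged sketch of the intra-class mixing step as described above: within each class of a mini-batch, features of unaugmented images are combined by a random convex combination, and the classification loss on the synthesized features is averaged. The exact interpolation scheme and loss weighting in SynerMix-Intra may differ; everything below is illustrative.

```python
# Illustrative intra-class mixup loss on feature vectors.
import torch
import torch.nn.functional as F


def intra_class_mixup_loss(features, labels, classifier):
    """features: [B, D] from unaugmented images; labels: [B]; classifier: D -> num_classes."""
    losses = []
    for c in labels.unique():
        feats_c = features[labels == c]
        if feats_c.shape[0] < 2:
            continue                                      # nothing to mix for singleton classes
        w = torch.rand(feats_c.shape[0])
        w = w / w.sum()                                   # random convex combination
        synthesized = (w.unsqueeze(1) * feats_c).sum(dim=0, keepdim=True)
        losses.append(F.cross_entropy(classifier(synthesized), c.view(1)))
    return torch.stack(losses).mean() if losses else features.sum() * 0.0


feats, labels = torch.randn(12, 64), torch.randint(0, 3, (12,))
clf = torch.nn.Linear(64, 3)
print(intra_class_mixup_loss(feats, labels, clf).item())
```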
+ + ♻ ☆ SAM-DA: UAV Tracks Anything at Night with SAM-Powered Domain Adaptation + + +
+ Domain adaptation (DA) has demonstrated significant promise for real-time
+nighttime unmanned aerial vehicle (UAV) tracking. However, state-of-the-art
+(SOTA) DA methods still lack potential objects with accurate pixel-level
+locations and boundaries from which to generate high-quality target domain
+training samples. This key issue constrains the transfer learning of real-time
+daytime SOTA trackers for challenging nighttime UAV tracking. Recently, the
+notable Segment Anything Model (SAM) has achieved a remarkable zero-shot
+generalization ability to discover abundant potential objects due to its huge
+data-driven training approach. To solve the aforementioned issue, this work
+proposes a novel SAM-powered DA framework for real-time nighttime UAV tracking,
+i.e., SAM-DA. Specifically, an innovative SAM-powered target domain training
+sample swelling is designed to determine enormous high-quality target domain
+training samples from every single raw nighttime image. This novel one-to-many
+generation significantly expands the high-quality target domain training
+samples for DA. Comprehensive experiments on extensive nighttime UAV videos
+prove the robustness and domain adaptability of SAM-DA for nighttime UAV
+tracking. In particular, compared with SOTA DA, SAM-DA achieves better
+performance with fewer raw nighttime images, i.e., fewer-better training. This
+economized training approach facilitates the quick validation and deployment of
+algorithms for UAVs. The code is available at
+https://github.com/vision4robotics/SAM-DA.
+
+
+
+
+
+ + ♻ ☆ SAI3D: Segment Any Instance in 3D Scenes CVPR 2024 + + +
+ Advancements in 3D instance segmentation have traditionally been tethered to
+the availability of annotated datasets, limiting their application to a narrow
+spectrum of object categories. Recent efforts have sought to harness
+vision-language models like CLIP for open-set semantic reasoning, yet these
+methods struggle to distinguish between objects of the same categories and rely
+on specific prompts that are not universally applicable. In this paper, we
+introduce SAI3D, a novel zero-shot 3D instance segmentation approach that
+synergistically leverages geometric priors and semantic cues derived from the
+Segment Anything Model (SAM). Our method partitions a 3D scene into geometric
+primitives, which are then progressively merged into 3D instance segmentations
+that are consistent with the multi-view SAM masks. Moreover, we design a
+hierarchical region-growing algorithm with a dynamic thresholding mechanism,
+which largely improves the robustness of fine-grained 3D scene parsing.
+Empirical evaluations on ScanNet, Matterport3D and the more challenging
+ScanNet++ datasets demonstrate the superiority of our approach. Notably, SAI3D
+outperforms existing open-vocabulary baselines and even surpasses
+fully-supervised methods in class-agnostic segmentation on ScanNet++. Our
+project page is at https://yd-yin.github.io/SAI3D.
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Exploiting Auxiliary Caption for Video Grounding + + +
+ Video grounding aims to locate a moment of interest matching the given query
+sentence from an untrimmed video. Previous works ignore the sparsity dilemma in
+video annotations, which fails to provide the context information between
+potential events and query sentences in the dataset. In this paper, we contend
+that exploiting easily available captions which describe general actions, i.e.,
+auxiliary captions defined in our paper, will significantly boost the
+performance. To this end, we propose an Auxiliary Caption Network (ACNet) for
+video grounding. Specifically, we first introduce dense video captioning to
+generate dense captions and then obtain auxiliary captions by Non-Auxiliary
+Caption Suppression (NACS). To capture the potential information in auxiliary
+captions, we propose Caption Guided Attention (CGA) to project the semantic
+relations between auxiliary captions and query sentences into temporal space
+and fuse them into visual representations. Considering the gap between
+auxiliary captions and ground truth, we propose Asymmetric Cross-modal
+Contrastive Learning (ACCL) for constructing more negative pairs to maximize
+cross-modal mutual information. Extensive experiments on three public datasets
+(i.e., ActivityNet Captions, TACoS and ActivityNet-CG) demonstrate that our
+method significantly outperforms state-of-the-art methods.
+
+
+
+
+
+ + ♻ ☆ InsertNeRF: Instilling Generalizability into NeRF with HyperNet Modules ICLR 2024 + + +
+ Generalizing Neural Radiance Fields (NeRF) to new scenes is a significant
+challenge that existing approaches struggle to address without extensive
+modifications to the vanilla NeRF framework. We introduce InsertNeRF, a method
+for INStilling gEneRalizabiliTy into NeRF. By utilizing multiple plug-and-play
+HyperNet modules, InsertNeRF dynamically tailors NeRF's weights to specific
+reference scenes, transforming multi-scale sampling-aware features into
+scene-specific representations. This novel design allows for more accurate and
+efficient representations of complex appearances and geometries. Experiments
+show that this method not only achieves superior generalization performance but
+also provides a flexible pathway for integration with other NeRF-like systems,
+even in sparse input settings. Code will be available at
+https://github.com/bbbbby-99/InsertNeRF.
+
+
+ comment: This work was accepted at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Task-Customized Mixture of Adapters for General Image Fusion CVPR 2024 + + +
+ General image fusion aims at integrating important information from
+multi-source images. However, due to the significant cross-task gap, the
+respective fusion mechanism varies considerably in practice, resulting in
+limited performance across subtasks. To handle this problem, we propose a novel
+task-customized mixture of adapters (TC-MoA) for general image fusion,
+adaptively prompting various fusion tasks in a unified model. We borrow the
+insight from the mixture of experts (MoE), taking the experts as efficient
+tuning adapters to prompt a pre-trained foundation model. These adapters are
+shared across different tasks and constrained by mutual information
+regularization, ensuring compatibility with different tasks while preserving
+complementarity for multi-source images. The task-specific routing networks
+customize these adapters to extract task-specific information from different
+sources with dynamic dominant intensity, performing adaptive visual feature
+prompt fusion. Notably, our TC-MoA controls the dominant intensity bias for
+different fusion tasks, successfully unifying multiple fusion tasks in a single
+model. Extensive experiments show that TC-MoA outperforms the competing
+approaches in learning commonalities while retaining compatibility for general
+image fusion (multi-modal, multi-exposure, and multi-focus), and also
+demonstrates striking controllability in further generalization experiments.
+The code is available at https://github.com/YangSun22/TC-MoA.
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ GPS-Gaussian: Generalizable Pixel-wise 3D Gaussian Splatting for + Real-time Human Novel View Synthesis CVPR 2024 + + +
+ We present a new approach, termed GPS-Gaussian, for synthesizing novel views
+of a character in a real-time manner. The proposed method enables 2K-resolution
+rendering under a sparse-view camera setting. Unlike the original Gaussian
+Splatting or neural implicit rendering methods that necessitate per-subject
+optimizations, we introduce Gaussian parameter maps defined on the source views
+and directly regress Gaussian Splatting properties for instant novel view
+synthesis without any fine-tuning or optimization. To this end, we train our
+Gaussian parameter regression module on a large amount of human scan data,
+jointly with a depth estimation module to lift 2D parameter maps to 3D space.
+The proposed framework is fully differentiable, and experiments on several
+datasets demonstrate that our method outperforms state-of-the-art methods while
+achieving a far higher rendering speed.
+
+
+ comment: Accepted by CVPR 2024. Project page: + https://shunyuanzheng.github.io/GPS-Gaussian +
+
+
+
+
+ + ♻ ☆ EHRDiff: Exploring Realistic EHR Synthesis with Diffusion Models + + +
+ Electronic health records (EHR) contain a wealth of biomedical information,
+serving as valuable resources for the development of precision medicine
+systems. However, privacy concerns have resulted in limited access to
+high-quality and large-scale EHR data for researchers, impeding progress in
+methodological development. Recent research has delved into synthesizing
+realistic EHR data through generative modeling techniques, where a majority of
+proposed methods relied on generative adversarial networks (GAN) and their
+variants for EHR synthesis. Despite GAN-based methods attaining
+state-of-the-art performance in generating EHR data, these approaches are
+difficult to train and prone to mode collapse. Recently introduced in
+generative modeling, diffusion models have established cutting-edge performance
+in image generation, but their efficacy in EHR data synthesis remains largely
+unexplored. In this study, we investigate the potential of diffusion models for
+EHR data synthesis and introduce a novel method, EHRDiff. Through extensive
+experiments, EHRDiff establishes new state-of-the-art quality for synthetic EHR
+data while protecting private information.
+
+
+ comment: Accepted by TMLR, preprint of camera-ready version +
+
+
+
+
+ + ♻ ☆ A Large-Scale Empirical Study on Improving the Fairness of Image + Classification Models ISSTA 2024 + + +
+ Fairness has been a critical issue that affects the adoption of deep learning
+models in real practice. To improve model fairness, many existing methods have
+been proposed and evaluated to be effective in their own contexts. However,
+there is still no systematic evaluation among them for a comprehensive
+comparison under the same context, which makes it hard to understand the
+performance distinction among them, hindering their research progress and
+practical adoption. To fill this gap, this paper endeavours to conduct the
+first large-scale empirical study to comprehensively compare the performance of
+existing state-of-the-art fairness-improving techniques. Specifically, we
+target the widely-used application scenario of image classification, and
+utilize three different datasets and five commonly-used performance metrics to
+assess a total of 13 methods from diverse categories. Our findings reveal
+substantial variations in the performance of each method across different
+datasets and sensitive attributes, indicating over-fitting on specific datasets
+by many existing methods. Furthermore, different fairness evaluation metrics,
+due to their distinct focuses, yield significantly different assessment
+results. Overall, we observe that pre-processing methods and in-processing
+methods outperform post-processing methods, with pre-processing methods
+exhibiting the best performance. Our empirical study offers comprehensive
+recommendations for enhancing fairness in deep learning models. We approach the
+problem from multiple dimensions, aiming to provide a uniform evaluation
+platform and inspire researchers to explore more effective fairness solutions
+via a set of implications.
+
+
+ comment: Accepted by the 33rd ACM SIGSOFT International Symposium on Software + Testing and Analysis (ISSTA 2024). Please include ISSTA in any citations +
+
+
+
+
+ + ♻ ☆ A Number Sense as an Emergent Property of the Manipulating Brain + + +
+ The ability to understand and manipulate numbers and quantities emerges +during childhood, but the mechanism through which humans acquire and develop +this ability is still poorly understood. We explore this question through a +model, assuming that the learner is able to pick up and place small objects +from, and to, locations of its choosing, and will spontaneously engage in such +undirected manipulation. We further assume that the learner's visual system +will monitor the changing arrangements of objects in the scene and will learn +to predict the effects of each action by comparing perception with a +supervisory signal from the motor system. We model perception using standard +deep networks for feature extraction and classification, and gradient descent +learning. Our main finding is that, from learning the task of action +prediction, an unexpected image representation emerges exhibiting regularities +that foreshadow the perception and representation of numbers and quantity. +These include distinct categories for zero and the first few natural numbers, a +strict ordering of the numbers, and a one-dimensional signal that correlates +with numerical quantity. As a result, our model acquires the ability to +estimate numerosity, i.e. the number of objects in the scene, as well as +subitization, i.e. the ability to recognize at a glance the exact number of +objects in small scenes. Remarkably, subitization and numerosity estimation +extrapolate to scenes containing many objects, far beyond the three objects +used during training. We conclude that important aspects of a facility with +numbers and quantities may be learned with supervision from a simple +pre-training task. Our observations suggest that cross-modal learning is a +powerful learning mechanism that may be harnessed in artificial intelligence. + +
+
+ comment: 16 pages, 5 figures, 15 supplemental figures +
+
+
+
+
+ + ♻ ☆ To Generate or Not? Safety-Driven Unlearned Diffusion Models Are Still + Easy To Generate Unsafe Images ... For Now + + +
+ The recent advances in diffusion models (DMs) have revolutionized the
+generation of realistic and complex images. However, these models also
+introduce potential safety hazards, such as producing harmful content and
+infringing data copyrights. Despite the development of safety-driven unlearning
+techniques to counteract these challenges, doubts about their efficacy persist.
+To tackle this issue, we introduce an evaluation framework that leverages
+adversarial prompts to discern the trustworthiness of these safety-driven DMs
+after they have undergone the process of unlearning harmful concepts.
+Specifically, we investigated the adversarial robustness of DMs, assessed by
+adversarial prompts, when eliminating unwanted concepts, styles, and objects.
+We develop an effective and efficient adversarial prompt generation approach
+for DMs, termed UnlearnDiffAtk. This method capitalizes on the intrinsic
+classification abilities of DMs to simplify the creation of adversarial
+prompts, thereby eliminating the need for auxiliary classification or diffusion
+models. Through extensive benchmarking, we evaluate the robustness of five
+widely-used safety-driven unlearned DMs (i.e., DMs after unlearning undesirable
+concepts, styles, or objects) across a variety of tasks. Our results
+demonstrate the effectiveness and efficiency merits of UnlearnDiffAtk over the
+state-of-the-art adversarial prompt generation method and reveal the lack of
+robustness of current safety-driven unlearning techniques when applied to DMs.
+Codes are available at https://github.com/OPTML-Group/Diffusion-MU-Attack.
+WARNING: This paper contains model outputs that may be offensive in nature.
+
+
+ comment: Codes are available at + https://github.com/OPTML-Group/Diffusion-MU-Attack +
+
+
+
+
+ + ♻ ☆ Multi-view Deep Subspace Clustering Networks + + +
+ Multi-view subspace clustering aims to discover the inherent structure of
+data by fusing multiple views of complementary information. Most existing
+methods first extract multiple types of handcrafted features and then learn a
+joint affinity matrix for clustering. The disadvantage of this approach lies in
+two aspects: 1) multi-view relations are not embedded into feature learning,
+and 2) the end-to-end learning manner of deep learning is not suitable for
+multi-view clustering. Even when deep features have been extracted, it is a
+nontrivial problem to choose a proper backbone for clustering on different
+datasets. To address these issues, we propose the Multi-view Deep Subspace
+Clustering Networks (MvDSCN), which learns a multi-view self-representation
+matrix in an end-to-end manner. The MvDSCN consists of two sub-networks, i.e.,
+a diversity network (Dnet) and a universality network (Unet). A latent space is
+built using deep convolutional autoencoders, and a self-representation matrix
+is learned in the latent space using a fully connected layer. Dnet learns
+view-specific self-representation matrices, whereas Unet learns a common
+self-representation matrix for all views. To exploit the complementarity of
+multi-view representations, the Hilbert-Schmidt independence criterion (HSIC)
+is introduced as a diversity regularizer that captures the nonlinear,
+high-order inter-view relations. Because different views share the same label
+space, the self-representation matrices of each view are aligned to the common
+one by universality regularization. The MvDSCN also unifies multiple backbones
+to boost clustering performance and avoid the need for model selection.
+Experiments demonstrate the superiority of the MvDSCN.
+
+
+ comment: Accepted by T-CYB +
+
+
+
+
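The HSIC diversity regularizer mentioned in the MvDSCN abstract above has a standard empirical estimator, shown below with Gaussian kernels on two views' representations. This is the textbook biased estimator under my choice of kernel and bandwidth, not necessarily the exact form used in MvDSCN.

```python
# Biased empirical HSIC estimate between two views' feature matrices.
import torch


def gaussian_kernel(x, sigma=1.0):
    dist = torch.cdist(x, x) ** 2
    return torch.exp(-dist / (2 * sigma ** 2))


def hsic(x, y, sigma=1.0):
    """x: [n, d_x], y: [n, d_y]; returns the biased empirical HSIC estimate."""
    n = x.shape[0]
    K, L = gaussian_kernel(x, sigma), gaussian_kernel(y, sigma)
    H = torch.eye(n) - torch.full((n, n), 1.0 / n)        # centering matrix
    return torch.trace(K @ H @ L @ H) / (n - 1) ** 2


view_a, view_b = torch.randn(16, 32), torch.randn(16, 32)
print(hsic(view_a, view_b).item())   # larger values indicate stronger dependence
```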
+
+
+ + + +
+
+ +
+
+
+
diff --git a/index.js b/index.js
new file mode 100644
index 0000000..69f5da7
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/Collapse with TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.key === "Tab") {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false; // prevent the default focus move
+    }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the chosen theme
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the chosen theme
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`